Rebase on mainupstream/users/ilovepi/spr/llvmmisexpect-update-misexpect-to-use-provenance-tracking-metadata

Created using spr 1.3.4
author: Paul Kirth <paulkirth@google.com> 2024-05-13 20:52:36 +0000
committer: Paul Kirth <paulkirth@google.com> 2024-05-13 20:52:36 +0000
commit: 93b0bf6ac0173125725325c778e66c7ba93755c3 (patch)
tree: 09324f9214b3a4184beba7c76b0acb3f3ab0317a
parent: 964058caecc40acc79b80f3111113bd089a07130 (diff)
parent: 0dd2b7cbe5750f5a0ca8285ea8faf42afe3c2484 (diff)
4711 files changed, 182509 insertions, 95637 deletions
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 0f178df1d18f..e25b2f50b1b4 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -23,6 +23,7 @@
 /llvm/lib/Analysis/ScalarEvolution.cpp @nikic
 /llvm/lib/Analysis/ValueTracking.cpp @nikic
 /llvm/lib/IR/ConstantRange.cpp @nikic
+/llvm/lib/IR/Core.cpp @nikic
 /llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @nikic
 /llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @nikic
 /llvm/lib/Transforms/InstCombine/ @nikic
@@ -63,8 +64,8 @@ clang/test/AST/Interp/ @tbaederr
 /mlir/Dialect/*/Transforms/Bufferize.cpp @matthias-springer
 
 # Linalg Dialect in MLIR.
-/mlir/include/mlir/Dialect/Linalg/* @dcaballe @nicolasvasilache
-/mlir/lib/Dialect/Linalg/* @dcaballe @nicolasvasilache
+/mlir/include/mlir/Dialect/Linalg/* @dcaballe @nicolasvasilache @rengolin
+/mlir/lib/Dialect/Linalg/* @dcaballe @nicolasvasilache @rengolin
 /mlir/lib/Dialect/Linalg/Transforms/DecomposeLinalgOps.cpp @MaheshRavishankar @nicolasvasilache
 /mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @MaheshRavishankar @nicolasvasilache
 /mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @MaheshRavishankar @nicolasvasilache
diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml
index d608ea449f1d..a57ba28faf16 100644
--- a/.github/new-prs-labeler.yml
+++ b/.github/new-prs-labeler.yml
@@ -239,7 +239,7 @@ mlir:dlti:
   - mlir/**/DLTI/**
 
 mlir:emitc:
-  - mlir/**/EmitC/**
+  - mlir/**/*EmitC*/**
   - mlir/lib/Target/Cpp/**
 
 mlir:func:
@@ -306,7 +306,7 @@ mlir:tensor:
   - mlir/**/Tensor/**
 
 mlir:tosa:
-  - mlir/**/Tosa/**
+  - mlir/**/*Tosa*/**
 
 mlir:ub:
   - mlir/**/UB/**
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 131ad3004f45..02082a84d8c1 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -38,9 +38,6 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     outputs:
       release-version: ${{ steps.vars.outputs.release-version }}
-      flags: ${{ steps.vars.outputs.flags }}
-      build-dir: ${{ steps.vars.outputs.build-dir }}
-      rc-flags: ${{ steps.vars.outputs.rc-flags }}
       ref: ${{ steps.vars.outputs.ref }}
       upload: ${{ steps.vars.outputs.upload }}
 
@@ -85,17 +82,11 @@ jobs:
         fi
         bash .github/workflows/set-release-binary-outputs.sh "$tag" "$upload"
 
-  # Try to get around the 6 hour timeout by first running a job to fill
-  # the build cache.
-  fill-cache:
-    name: "Fill Cache ${{ matrix.os }}"
+  build-stage1-linux:
+    name: "Build Stage 1 Linux"
     needs: prepare
-    runs-on: ${{ matrix.os }}
+    runs-on: ubuntu-22.04
     if: github.repository == 'llvm/llvm-project'
-    strategy:
-      matrix:
-        os:
-          - ubuntu-22.04
     steps:
     - name: Checkout LLVM
       uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -109,81 +100,207 @@ jobs:
       uses: hendrikmuhs/ccache-action@ca3acd2731eef11f1572ccb126356c2f9298d35e # v1.2.9
       with:
         max-size: 250M
-        key: sccache-${{ matrix.os }}-release
+        key: sccache-${{ runner.os }}-release
         variant: sccache
 
-    - name: Build Clang
+    - name: Build Stage 1 Clang
       run: |
-        cmake -G Ninja -C clang/cmake/caches/Release.cmake -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_POSITION_INDEPENDENT_CODE=ON -S llvm -B build
-        ninja -v -C build clang
+        sudo chown $USER:$USER /mnt/
+        cmake -G Ninja -C clang/cmake/caches/Release.cmake -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -S llvm -B /mnt/build
+        ninja -v -C /mnt/build
 
+    # We need to create an archive of the build directory, because it has too
+    # many files to upload.
+    - name: Package Build and Source Directories
+      run: |
+        tar -c . | zstd -T0 -c > llvm-project.tar.zst
+        tar -C /mnt/ -c build/ | zstd -T0 -c > build.tar.zst
 
-  build-binaries:
-    name: ${{ matrix.target.triple }}
-    permissions:
-      contents: write # To upload assets to release.
+    - name: Upload Stage 1 Source
+      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+      with:
+        name: stage1-source
+        path: llvm-project.tar.zst
+        retention-days: 2
+
+    - name: Upload Stage 1 Build Dir
+      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+      with:
+        name: stage1-build
+        path: build.tar.zst
+        retention-days: 2
+
+  build-stage2-linux:
+    name: "Build Stage 2 Linux"
     needs:
       - prepare
-      - fill-cache
-    runs-on: ${{ matrix.target.runs-on }}
+      - build-stage1-linux
+    runs-on: ubuntu-22.04
     if: github.repository == 'llvm/llvm-project'
-    strategy:
-      fail-fast: false
-      matrix:
-        target:
-          - triple: x86_64-linux-gnu-ubuntu-22.04
-            os: ubuntu-22.04
-            runs-on: ubuntu-22.04-16x64
-            debian-build-deps: >
-              chrpath
-              gcc-multilib
-              ninja-build
-
     steps:
-    - name: Checkout LLVM
-      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+    - name: Install Ninja
+      uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
+
+    - name: Download Stage 1 Artifacts
+      uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
       with:
-        ref: ${{ needs.prepare.outputs.ref }}
-        path: ${{ needs.prepare.outputs.build-dir }}/llvm-project
+        pattern: stage1-*
+        merge-multiple: true
 
-    - name: Setup sccache
-      uses: hendrikmuhs/ccache-action@ca3acd2731eef11f1572ccb126356c2f9298d35e # v1.2.9
+    - name: Unpack Artifacts
+      run: |
+        tar --zstd -xf llvm-project.tar.zst
+        rm llvm-project.tar.zst
+        sudo chown $USER:$USER /mnt/
+        tar --zstd -C /mnt -xf build.tar.zst
+        rm build.tar.zst
+
+    - name: Build Stage 2
+      run: |
+        ninja -C /mnt/build stage2-instrumented
+
+    # We need to create an archive of the build directory, because it has too
+    # many files to upload.
+    - name: Save Build and Source Directories
+      run: |
+        tar -c . | zstd -T0 -c > llvm-project.tar.zst
+        tar -C /mnt/ -c build/ | zstd -T0 -c > build.tar.zst
+
+    - name: Upload Stage 2 Source
+      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
       with:
-        max-size: 250M
-        key: sccache-${{ matrix.target.os }}-release
-        save: false
-        variant: sccache
+        name: stage2-source
+        path: ${{ github.workspace }}/llvm-project.tar.zst
+        retention-days: 2
+
+    - name: Upload Stage 2 Build Dir
+      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+      with:
+        name: stage2-build
+        path: ${{ github.workspace }}/build.tar.zst
+        retention-days: 2
 
-    - name: Install Brew build dependencies
-      if: matrix.target.brew-build-deps != ''
-      run: brew install ${{ matrix.target.brew-build-deps }}
 
-    - name: Install Debian build dependencies
-      if: matrix.target.debian-build-deps != ''
-      run: sudo apt install ${{ matrix.target.debian-build-deps }}
+  build-stage3-linux:
+    name: "Build Stage 3 Linux"
+    needs:
+      - prepare
+      - build-stage2-linux
+    outputs:
+      filename: ${{ steps.package-info.outputs.release-filename }}
+    runs-on: ubuntu-22.04-16x64
+    if: github.repository == 'llvm/llvm-project'
+    steps:
+    - name: Install Ninja
+      uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
+
+    - name: 'Download artifact'
+      uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
+      with:
+        pattern: stage2-*
+        merge-multiple: true
 
-    - name: Set macOS build env variables
-      if: runner.os == 'macOS'
+    - name: Unpack Artifact
       run: |
-        echo "MACOSX_DEPLOYMENT_TARGET=10.9" >> "$GITHUB_ENV"
+        tar --zstd -xf llvm-project.tar.zst
+        rm llvm-project.tar.zst
+        sudo chown $USER:$USER /mnt/
+        tar --zstd -C /mnt -xf build.tar.zst
+        rm build.tar.zst
 
-    - name: Build and test release
+    - name: Build Release Package
       run: |
-        ${{ needs.prepare.outputs.build-dir }}/llvm-project/llvm/utils/release/test-release.sh \
-        ${{ needs.prepare.outputs.flags }} \
-        -triple ${{ matrix.target.triple }} \
-        -use-ninja \
-        -no-checkout \
-        -use-cmake-cache \
-        -no-test-suite \
-        -configure-flags "-DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache"
+        ninja -C /mnt/build stage2-package
 
-    - name: Upload binaries
-      if: ${{ always() && needs.prepare.outputs.upload == 'true' }}
+    - id: package-info
+      run: |
+        filename="LLVM-${{ needs.prepare.outputs.release-version }}-Linux.tar.gz"
+        echo "filename=$filename" >> $GITHUB_OUTPUT
+        echo "path=/mnt/build/tools/clang/stage2-bins/$filename" >> $GITHUB_OUTPUT
+
+    - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+      if: always()
+      with:
+        name: release-binary
+        path: ${{ steps.package-info.outputs.path }}
+
+    # Clean up some build files to reduce size of artifact.
+    - name: Clean Up Build Directory
+      run: |
+        find /mnt/build -iname ${{ steps.package-info.outputs.filename }} -delete
+
+    # We need to create an archive of the build directory, because it has too
+    # many files to upload.
+    - name: Save Build and Source Directories
+      run: |
+        tar -c . | zstd -T0 -c > llvm-project.tar.zst
+        tar -C /mnt/ -c build/ | zstd -T0 -c > build.tar.zst
+
+    - name: Upload Stage 3 Source
+      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+      with:
+        name: stage3-source
+        path: llvm-project.tar.zst
+        retention-days: 2
+
+    - name: Upload Stage 3 Build Dir
+      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+      with:
+        name: stage3-build
+        path: build.tar.zst
+        retention-days: 2
+
+  upload-release-binaries-linux:
+    name: "Upload Linux Release Binaries"
+    needs:
+      - prepare
+      - build-stage3-linux
+    if : ${{ needs.prepare.outputs.upload == 'true' }}
+    runs-on: ubuntu-22.04
+    permissions:
+      contents: write # For release uploads
+
+    steps:
+    - name: 'Download artifact'
+      uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
+      with:
+        name: release-binary
+
+    - name: Upload Release
       run: |
         sudo apt install python3-github
-        ${{ needs.prepare.outputs.build-dir }}/llvm-project/llvm/utils/release/github-upload-release.py \
+        ./llvm-project/llvm/utils/release/github-upload-release.py \
         --token ${{ github.token }} \
         --release ${{ needs.prepare.outputs.release-version }} \
         upload \
-        --files ${{ needs.prepare.outputs.build-dir }}/clang+llvm-${{ needs.prepare.outputs.release-version }}-${{ matrix.target.triple }}.tar.xz
+        --files ${{ needs.build-stage3-linux.outputs.release-filename }}
+
+
+  test-stage3-linux:
+    name: "Test Stage 3 Linux"
+    needs:
+      - prepare
+      - build-stage3-linux
+    runs-on: ubuntu-22.04
+    if: github.repository == 'llvm/llvm-project'
+    steps:
+    - name: Install Ninja
+      uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
+
+    - name: 'Download artifact'
+      uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
+      with:
+        pattern: stage3-*
+        merge-multiple: true
+
+    - name: Unpack Artifact
+      run: |
+        tar --zstd -xf llvm-project.tar.zst
+        rm llvm-project.tar.zst
+        sudo chown $USER:$USER /mnt/
+        tar --zstd -C /mnt -xf build.tar.zst
+        rm build.tar.zst
+
+    - name: Run Tests
+      run: |
+        ninja -C /mnt/build stage2-check-all
diff --git a/.github/workflows/release-doxygen.yml b/.github/workflows/release-doxygen.yml
index 5e322849a1d0..12c14bea52f6 100644
--- a/.github/workflows/release-doxygen.yml
+++ b/.github/workflows/release-doxygen.yml
@@ -56,12 +56,12 @@ jobs:
           pip3 install --user -r ./llvm/docs/requirements.txt
 
       - name: Build Doxygen
-        env:
-          GITHUB_TOKEN: ${{ github.token }}
         run: |
           ./llvm/utils/release/build-docs.sh -release "${{ inputs.release-version }}" -no-sphinx
 
       - name: Upload Doxygen
         if: env.upload
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
         run: |
           ./llvm/utils/release/github-upload-release.py --token "$GITHUB_TOKEN" --release "${{ inputs.release-version }}" --user "${{ github.actor }}" upload --files ./*doxygen*.tar.xz
diff --git a/.github/workflows/release-tasks.yml b/.github/workflows/release-tasks.yml
index 53da8662b020..29049ff01428 100644
--- a/.github/workflows/release-tasks.yml
+++ b/.github/workflows/release-tasks.yml
@@ -1,7 +1,7 @@
 name: Release Task
 
 permissions:
-  contents: write
+  contents: read
 
 on:
   push:
@@ -27,6 +27,8 @@ jobs:
   release-create:
     name: Create a New Release
     runs-on: ubuntu-latest
+    permissions:
+      contents: write # For creating the release.
     needs: validate-tag
 
     steps:
@@ -55,6 +57,8 @@ jobs:
 
   release-doxygen:
     name: Build and Upload Release Doxygen
+    permissions:
+      contents: write
     needs:
       - validate-tag
       - release-create
@@ -72,6 +76,8 @@ jobs:
 
   release-binaries:
     name: Build Release Binaries
+    permissions:
+      contents: write
     needs:
       - validate-tag
       - release-create
diff --git a/.github/workflows/set-release-binary-outputs.sh b/.github/workflows/set-release-binary-outputs.sh
index 59470cf83ba7..14d0798364e9 100644
--- a/.github/workflows/set-release-binary-outputs.sh
+++ b/.github/workflows/set-release-binary-outputs.sh
@@ -15,10 +15,8 @@ if echo $tag | grep -e '^[0-9a-f]\+$'; then
   # This is a plain commit.
   # TODO: Don't hardcode this.
   release_version="18"
-  build_dir="$tag"
   upload='false'
   ref="$tag"
-  flags="-git-ref $tag -test-asserts"
 
 else
 
@@ -30,12 +28,7 @@ else
   fi
   release_version=`echo "$tag" | sed 's/llvmorg-//g'`
   release=`echo "$release_version" | sed 's/-.*//g'`
-  build_dir=`echo "$release_version" | sed 's,^[^-]\+,final,' | sed 's,[^-]\+-rc\(.\+\),rc\1,'`
-  rc_flags=`echo "$release_version" | sed 's,^[^-]\+,-final,' | sed 's,[^-]\+-rc\(.\+\),-rc \1 -test-asserts,' | sed 's,--,-,'`
-  flags="-release $release $rc_flags"
 fi
 echo "release-version=$release_version" >> $GITHUB_OUTPUT
-echo "build-dir=$build_dir" >> $GITHUB_OUTPUT
-echo "flags=$flags" >> $GITHUB_OUTPUT
 echo "upload=$upload" >> $GITHUB_OUTPUT
 echo "ref=$tag" >> $GITHUB_OUTPUT
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 8b1af9e81539..75765819ac46 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -20,6 +20,7 @@
 #include "bolt/Core/JumpTable.h"
 #include "bolt/Core/MCPlusBuilder.h"
 #include "bolt/RuntimeLibs/RuntimeLibrary.h"
+#include "llvm/ADT/AddressRanges.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/iterator.h"
@@ -726,6 +727,9 @@ public:
   uint64_t OldTextSectionOffset{0};
   uint64_t OldTextSectionSize{0};
 
+  /// Area in the input binary reserved for BOLT.
+  AddressRange BOLTReserved;
+
   /// Address of the code/function that is executed before any other code in
   /// the binary.
   std::optional<uint64_t> StartFunctionAddress;
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index 26d2d01f8626..3c641581e247 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -930,6 +930,8 @@ public:
     return const_cast<BinaryFunction *>(this)->getInstructionAtOffset(Offset);
   }
 
+  std::optional<MCInst> disassembleInstructionAtOffset(uint64_t Offset) const;
+
   /// Return offset for the first instruction. If there is data at the
   /// beginning of a function then offset of the first instruction could
   /// be different from 0
diff --git a/bolt/include/bolt/Passes/FrameAnalysis.h b/bolt/include/bolt/Passes/FrameAnalysis.h
index 66246bd6647b..44b54d4ed45d 100644
--- a/bolt/include/bolt/Passes/FrameAnalysis.h
+++ b/bolt/include/bolt/Passes/FrameAnalysis.h
@@ -170,10 +170,6 @@ class FrameAnalysis {
                      std::unique_ptr<StackPointerTracking>>
       SPTMap;
 
-  /// A vector that stores ids of the allocators that are used in SPT
-  /// computation
-  std::vector<MCPlusBuilder::AllocatorIdTy> SPTAllocatorsId;
-
 public:
   explicit FrameAnalysis(BinaryContext &BC, BinaryFunctionCallGraph &CG);
 
diff --git a/bolt/include/bolt/Passes/IndirectCallPromotion.h b/bolt/include/bolt/Passes/IndirectCallPromotion.h
index adc58d70ec0f..8ec160b867cf 100644
--- a/bolt/include/bolt/Passes/IndirectCallPromotion.h
+++ b/bolt/include/bolt/Passes/IndirectCallPromotion.h
@@ -104,7 +104,7 @@ class IndirectCallPromotion : public BinaryFunctionPass {
   struct Location {
     MCSymbol *Sym{nullptr};
     uint64_t Addr{0};
-    bool isValid() const { return Sym || (!Sym && Addr != 0); }
+    bool isValid() const { return Sym || Addr != 0; }
     Location() {}
     explicit Location(MCSymbol *Sym) : Sym(Sym) {}
     explicit Location(uint64_t Addr) : Addr(Addr) {}
diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h
index 2c482bd2b9ea..12e0813d089d 100644
--- a/bolt/include/bolt/Rewrite/DWARFRewriter.h
+++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h
@@ -177,13 +177,6 @@ private:
       DIEValue &HighPCAttrInfo,
       std::optional<uint64_t> RangesBase = std::nullopt);
 
-  /// Adds a \p Str to .debug_str section.
-  /// Uses \p AttrInfoVal to either update entry in a DIE for legacy DWARF using
-  /// \p DebugInfoPatcher, or for DWARF5 update an index in .debug_str_offsets
-  /// for this contribution of \p Unit.
-  void addStringHelper(DIEBuilder &DIEBldr, DIE &Die, const DWARFUnit &Unit,
-                       DIEValue &DIEAttrInfo, StringRef Str);
-
 public:
   DWARFRewriter(BinaryContext &BC) : BC(BC) {}
 
diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h
index 41a92e7ba01e..64113bd02601 100644
--- a/bolt/include/bolt/Rewrite/RewriteInstance.h
+++ b/bolt/include/bolt/Rewrite/RewriteInstance.h
@@ -97,6 +97,10 @@ private:
   /// from meta data in the file.
   void discoverFileObjects();
 
+  /// Check if the input binary has a space reserved for BOLT and use it for new
+  /// section allocations if found.
+  void discoverBOLTReserved();
+
   /// Check whether we should use DT_FINI or DT_FINI_ARRAY for instrumentation.
   /// DT_FINI is preferred; DT_FINI_ARRAY is only used when no DT_FINI entry was
   /// found.
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 1fa96dfaabde..de34421ebeb0 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -1167,6 +1167,21 @@ void BinaryFunction::handleAArch64IndirectCall(MCInst &Instruction,
   }
 }
 
+std::optional<MCInst>
+BinaryFunction::disassembleInstructionAtOffset(uint64_t Offset) const {
+  assert(CurrentState == State::Empty && "Function should not be disassembled");
+  assert(Offset < MaxSize && "Invalid offset");
+  ErrorOr<ArrayRef<unsigned char>> FunctionData = getData();
+  assert(FunctionData && "Cannot get function as data");
+  MCInst Instr;
+  uint64_t InstrSize = 0;
+  const uint64_t InstrAddress = getAddress() + Offset;
+  if (BC.DisAsm->getInstruction(Instr, InstrSize, FunctionData->slice(Offset),
+                                InstrAddress, nulls()))
+    return Instr;
+  return std::nullopt;
+}
+
 Error BinaryFunction::disassemble() {
   NamedRegionTimer T("disassemble", "Disassemble function", "buildfuncs",
                      "Build Binary Functions", opts::TimeBuild);
diff --git a/bolt/lib/Core/ParallelUtilities.cpp b/bolt/lib/Core/ParallelUtilities.cpp
index 5f5e96e0e788..a24c37c06f1a 100644
--- a/bolt/lib/Core/ParallelUtilities.cpp
+++ b/bolt/lib/Core/ParallelUtilities.cpp
@@ -188,8 +188,20 @@ void runOnEachFunctionWithUniqueAllocId(
     LLVM_DEBUG(T.stopTimer());
   };
 
+  unsigned AllocId = 1;
+  auto EnsureAllocatorExists = [&BC](unsigned AllocId) {
+    if (!BC.MIB->checkAllocatorExists(AllocId)) {
+      MCPlusBuilder::AllocatorIdTy Id =
+          BC.MIB->initializeNewAnnotationAllocator();
+      (void)Id;
+      assert(AllocId == Id && "unexpected allocator id created");
+    }
+  };
+
   if (opts::NoThreads || ForceSequential) {
-    runBlock(BC.getBinaryFunctions().begin(), BC.getBinaryFunctions().end(), 0);
+    EnsureAllocatorExists(AllocId);
+    runBlock(BC.getBinaryFunctions().begin(), BC.getBinaryFunctions().end(),
+             AllocId);
     return;
   }
   // This lock is used to postpone task execution
@@ -205,19 +217,13 @@ void runOnEachFunctionWithUniqueAllocId(
   ThreadPoolInterface &Pool = getThreadPool();
   auto BlockBegin = BC.getBinaryFunctions().begin();
   unsigned CurrentCost = 0;
-  unsigned AllocId = 1;
   for (auto It = BC.getBinaryFunctions().begin();
        It != BC.getBinaryFunctions().end(); ++It) {
     BinaryFunction &BF = It->second;
     CurrentCost += computeCostFor(BF, SkipPredicate, SchedPolicy);
 
     if (CurrentCost >= BlockCost) {
-      if (!BC.MIB->checkAllocatorExists(AllocId)) {
-        MCPlusBuilder::AllocatorIdTy Id =
-            BC.MIB->initializeNewAnnotationAllocator();
-        (void)Id;
-        assert(AllocId == Id && "unexpected allocator id created");
-      }
+      EnsureAllocatorExists(AllocId);
       Pool.async(runBlock, BlockBegin, std::next(It), AllocId);
       AllocId++;
       BlockBegin = std::next(It);
@@ -225,12 +231,7 @@ void runOnEachFunctionWithUniqueAllocId(
     }
   }
 
-  if (!BC.MIB->checkAllocatorExists(AllocId)) {
-    MCPlusBuilder::AllocatorIdTy Id =
-        BC.MIB->initializeNewAnnotationAllocator();
-    (void)Id;
-    assert(AllocId == Id && "unexpected allocator id created");
-  }
+  EnsureAllocatorExists(AllocId);
 
   Pool.async(runBlock, BlockBegin, BC.getBinaryFunctions().end(), AllocId);
   Lock.unlock();
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index c0ba73108f57..df6dbcddeed5 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -910,6 +910,11 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
       auto &CTCAnnotation =
           MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
       CTCAnnotation = CTCTakenFreq;
+      // Preserve Offset annotation, used in BAT.
+      // Instr is a direct tail call instruction that was created when CTCs are
+      // first expanded, and has the original CTC offset set.
+      if (std::optional<uint32_t> Offset = MIB->getOffset(*Instr))
+        MIB->setOffset(*CondBranch, *Offset);
 
       // Remove the unused successor which may be eliminated later
       // if there are no other users.
diff --git a/bolt/lib/Passes/FrameAnalysis.cpp b/bolt/lib/Passes/FrameAnalysis.cpp
index 7f1245e39f56..4ebfd8f158f7 100644
--- a/bolt/lib/Passes/FrameAnalysis.cpp
+++ b/bolt/lib/Passes/FrameAnalysis.cpp
@@ -561,11 +561,6 @@ FrameAnalysis::FrameAnalysis(BinaryContext &BC, BinaryFunctionCallGraph &CG)
     NamedRegionTimer T1("clearspt", "clear spt", "FA", "FA breakdown",
                         opts::TimeFA);
     clearSPTMap();
-
-    // Clean up memory allocated for annotation values
-    if (!opts::NoThreads)
-      for (MCPlusBuilder::AllocatorIdTy Id : SPTAllocatorsId)
-        BC.MIB->freeValuesAllocator(Id);
   }
 }
 
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
index f9e634d15a97..bd0b6dea0e06 100644
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -715,6 +715,12 @@ Error SplitFunctions::runOnFunctions(BinaryContext &BC) {
   if (!opts::SplitFunctions)
     return Error::success();
 
+  if (BC.IsLinuxKernel && BC.BOLTReserved.empty()) {
+    BC.errs() << "BOLT-ERROR: split functions require reserved space in the "
+                 "Linux kernel binary\n";
+    exit(1);
+  }
+
   // If split strategy is not CDSplit, then a second run of the pass is not
   // needed after function reordering.
   if (BC.HasFinalizedFunctionOrder &&
@@ -829,6 +835,13 @@ void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
         }
       }
     }
+
+    // Outlining blocks with dynamic branches is not supported yet.
+    if (BC.IsLinuxKernel) {
+      if (llvm::any_of(
+              *BB, [&](MCInst &Inst) { return BC.MIB->isDynamicBranch(Inst); }))
+        BB->setCanOutline(false);
+    }
   }
 
   BF.getLayout().updateLayoutIndices();
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 5108392c824c..302bcf1f2d87 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/FileSystem.h"
@@ -773,9 +774,19 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
 
 bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
                               uint64_t Mispreds) {
+  bool IsReturn = false;
   auto handleAddress = [&](uint64_t &Addr, bool IsFrom) -> BinaryFunction * {
     if (BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr)) {
       Addr -= Func->getAddress();
+      if (IsFrom) {
+        auto checkReturn = [&](auto MaybeInst) {
+          IsReturn = MaybeInst && BC->MIB->isReturn(*MaybeInst);
+        };
+        if (Func->hasInstructions())
+          checkReturn(Func->getInstructionAtOffset(Addr));
+        else
+          checkReturn(Func->disassembleInstructionAtOffset(Addr));
+      }
 
       if (BAT)
         Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
@@ -792,6 +803,9 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
   };
 
   BinaryFunction *FromFunc = handleAddress(From, /*IsFrom=*/true);
+  // Ignore returns.
+  if (IsReturn)
+    return true;
   BinaryFunction *ToFunc = handleAddress(To, /*IsFrom=*/false);
   if (!FromFunc && !ToFunc)
     return false;
@@ -1986,7 +2000,7 @@ std::error_code DataAggregator::parseMMapEvents() {
     std::pair<StringRef, MMapInfo> FileMMapInfo = FileMMapInfoRes.get();
     if (FileMMapInfo.second.PID == -1)
       continue;
-    if (FileMMapInfo.first.equals("(deleted)"))
+    if (FileMMapInfo.first == "(deleted)")
       continue;
 
     // Consider only the first mapping of the file for any given PID
@@ -2326,7 +2340,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
         continue;
       BinaryFunction *BF = BC.getBinaryFunctionAtAddress(FuncAddress);
       assert(BF);
-      YamlBF.Name = FuncName.str();
+      YamlBF.Name = getLocationName(*BF);
       YamlBF.Id = BF->getFunctionNumber();
       YamlBF.Hash = BAT->getBFHash(FuncAddress);
       YamlBF.ExecCount = BF->getKnownExecutionCount();
@@ -2365,10 +2379,19 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
         return CSI;
       };
 
+      // Lookup containing basic block offset and index
+      auto getBlock = [&BlockMap](uint32_t Offset) {
+        auto BlockIt = BlockMap.upper_bound(Offset);
+        if (LLVM_UNLIKELY(BlockIt == BlockMap.begin())) {
+          errs() << "BOLT-ERROR: invalid BAT section\n";
+          exit(1);
+        }
+        --BlockIt;
+        return std::pair(BlockIt->first, BlockIt->second.getBBIndex());
+      };
+
       for (const auto &[FromOffset, SuccKV] : Branches.IntraIndex) {
-        if (!BlockMap.isInputBlock(FromOffset))
-          continue;
-        const unsigned Index = BlockMap.getBBIndex(FromOffset);
+        const auto &[_, Index] = getBlock(FromOffset);
         yaml::bolt::BinaryBasicBlockProfile &YamlBB = YamlBF.Blocks[Index];
         for (const auto &[SuccOffset, SuccDataIdx] : SuccKV)
           if (BlockMap.isInputBlock(SuccOffset))
@@ -2376,10 +2399,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
                 getSuccessorInfo(SuccOffset, SuccDataIdx));
       }
       for (const auto &[FromOffset, CallTo] : Branches.InterIndex) {
-        auto BlockIt = BlockMap.upper_bound(FromOffset);
-        --BlockIt;
-        const unsigned BlockOffset = BlockIt->first;
-        const unsigned BlockIndex = BlockIt->second.getBBIndex();
+        const auto &[BlockOffset, BlockIndex] = getBlock(FromOffset);
         yaml::bolt::BinaryBasicBlockProfile &YamlBB = YamlBF.Blocks[BlockIndex];
         const uint32_t Offset = FromOffset - BlockOffset;
         for (const auto &[CallToLoc, CallToIdx] : CallTo)
@@ -2390,6 +2410,17 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
           return A.Offset < B.Offset;
         });
       }
+      // Set entry counts, similar to DataReader::readProfile.
+      for (const llvm::bolt::BranchInfo &BI : Branches.EntryData) {
+        if (!BlockMap.isInputBlock(BI.To.Offset)) {
+          if (opts::Verbosity >= 1)
+            errs() << "BOLT-WARNING: Unexpected EntryData in " << FuncName
+                   << " at 0x" << Twine::utohexstr(BI.To.Offset) << '\n';
+          continue;
+        }
+        const unsigned BlockIndex = BlockMap.getBBIndex(BI.To.Offset);
+        YamlBF.Blocks[BlockIndex].ExecCount += BI.Branches;
+      }
       // Drop blocks without a hash, won't be useful for stale matching.
       llvm::erase_if(YamlBF.Blocks,
                      [](const yaml::bolt::BinaryBasicBlockProfile &YamlBB) {
diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp
index 67f357fe4d3f..b2511ba10399 100644
--- a/bolt/lib/Profile/DataReader.cpp
+++ b/bolt/lib/Profile/DataReader.cpp
@@ -1205,8 +1205,7 @@ std::error_code DataReader::parse() {
 
     // Add entry data for branches to another function or branches
     // to entry points (including recursive calls)
-    if (BI.To.IsSymbol &&
-        (!BI.From.Name.equals(BI.To.Name) || BI.To.Offset == 0)) {
+    if (BI.To.IsSymbol && (BI.From.Name != BI.To.Name || BI.To.Offset == 0)) {
       I = GetOrCreateFuncEntry(BI.To.Name);
       I->second.EntryData.emplace_back(std::move(BI));
     }
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index feeba89a40dc..9d4297f913f3 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -582,19 +582,51 @@ static void emitDWOBuilder(const std::string &DWOName,
     Rewriter.writeDWOFiles(CU, OverriddenSections, DWOName, LocWriter);
 }
 
-void DWARFRewriter::addStringHelper(DIEBuilder &DIEBldr, DIE &Die,
-                                    const DWARFUnit &Unit,
-                                    DIEValue &DIEAttrInfo, StringRef Str) {
-  uint32_t NewOffset = StrWriter->addString(Str);
+/// Adds a \p Str to .debug_str section.
+/// Uses \p AttrInfoVal to either update entry in a DIE for legacy DWARF using
+/// \p DebugInfoPatcher, or for DWARF5 update an index in .debug_str_offsets
+/// for this contribution of \p Unit.
+static void addStringHelper(DebugStrOffsetsWriter &StrOffstsWriter,
+                            DebugStrWriter &StrWriter, DIEBuilder &DIEBldr,
+                            DIE &Die, const DWARFUnit &Unit,
+                            DIEValue &DIEAttrInfo, StringRef Str) {
+  uint32_t NewOffset = StrWriter.addString(Str);
   if (Unit.getVersion() >= 5) {
-    StrOffstsWriter->updateAddressMap(DIEAttrInfo.getDIEInteger().getValue(),
-                                      NewOffset);
+    StrOffstsWriter.updateAddressMap(DIEAttrInfo.getDIEInteger().getValue(),
+                                     NewOffset);
     return;
   }
   DIEBldr.replaceValue(&Die, DIEAttrInfo.getAttribute(), DIEAttrInfo.getForm(),
                        DIEInteger(NewOffset));
 }
 
+static std::string
+updateDWONameCompDir(DebugStrOffsetsWriter &StrOffstsWriter,
+                     DebugStrWriter &StrWriter,
+                     std::unordered_map<std::string, uint32_t> &NameToIndexMap,
+                     DWARFUnit &Unit, DIEBuilder &DIEBldr, DIE &UnitDIE) {
+  DIEValue DWONameAttrInfo = UnitDIE.findAttribute(dwarf::DW_AT_dwo_name);
+  if (!DWONameAttrInfo)
+    DWONameAttrInfo = UnitDIE.findAttribute(dwarf::DW_AT_GNU_dwo_name);
+  assert(DWONameAttrInfo && "DW_AT_dwo_name is not in Skeleton CU.");
+  std::string ObjectName;
+
+  ObjectName = getDWOName(Unit, NameToIndexMap);
+  addStringHelper(StrOffstsWriter, StrWriter, DIEBldr, UnitDIE, Unit,
+                  DWONameAttrInfo, ObjectName.c_str());
+
+  DIEValue CompDirAttrInfo = UnitDIE.findAttribute(dwarf::DW_AT_comp_dir);
+  assert(CompDirAttrInfo && "DW_AT_comp_dir is not in Skeleton CU.");
+
+  if (!opts::DwarfOutputPath.empty()) {
+    if (!sys::fs::exists(opts::DwarfOutputPath))
+      sys::fs::create_directory(opts::DwarfOutputPath);
+    addStringHelper(StrOffstsWriter, StrWriter, DIEBldr, UnitDIE, Unit,
+                    CompDirAttrInfo, opts::DwarfOutputPath.c_str());
+  }
+  return ObjectName;
+}
+
 using DWARFUnitVec = std::vector<DWARFUnit *>;
 using CUPartitionVector = std::vector<DWARFUnitVec>;
 /// Partitions CUs in to buckets. Bucket size is controlled by
@@ -692,33 +724,6 @@ void DWARFRewriter::updateDebugInfo() {
   // specified.
   std::unordered_map<std::string, uint32_t> NameToIndexMap;
 
-  auto updateDWONameCompDir = [&](DWARFUnit &Unit, DIEBuilder &DIEBldr,
-                                  DIE &UnitDIE) -> std::string {
-    DIEValue DWONameAttrInfo = UnitDIE.findAttribute(dwarf::DW_AT_dwo_name);
-    if (!DWONameAttrInfo)
-      DWONameAttrInfo = UnitDIE.findAttribute(dwarf::DW_AT_GNU_dwo_name);
-    assert(DWONameAttrInfo && "DW_AT_dwo_name is not in Skeleton CU.");
-    std::string ObjectName;
-
-    {
-      std::lock_guard<std::mutex> Lock(AccessMutex);
-      ObjectName = getDWOName(Unit, NameToIndexMap);
-    }
-    addStringHelper(DIEBldr, UnitDIE, Unit, DWONameAttrInfo,
-                    ObjectName.c_str());
-
-    DIEValue CompDirAttrInfo = UnitDIE.findAttribute(dwarf::DW_AT_comp_dir);
-    assert(CompDirAttrInfo && "DW_AT_comp_dir is not in Skeleton CU.");
-
-    if (!opts::DwarfOutputPath.empty()) {
-      if (!sys::fs::exists(opts::DwarfOutputPath))
-        sys::fs::create_directory(opts::DwarfOutputPath);
-      addStringHelper(DIEBldr, UnitDIE, Unit, CompDirAttrInfo,
-                      opts::DwarfOutputPath.c_str());
-    }
-    return ObjectName;
-  };
-
   DWARF5AcceleratorTable DebugNamesTable(opts::CreateDebugNames, BC,
                                          *StrWriter);
   DWPState State;
@@ -741,8 +746,13 @@ void DWARFRewriter::updateDebugInfo() {
       DIEBuilder DWODIEBuilder(BC, &(*SplitCU)->getContext(), DebugNamesTable,
                                Unit);
       DWODIEBuilder.buildDWOUnit(**SplitCU);
-      std::string DWOName = updateDWONameCompDir(
-          *Unit, *DIEBlder, *DIEBlder->getUnitDIEbyUnit(*Unit));
+      std::string DWOName = "";
+      {
+        std::lock_guard<std::mutex> Lock(AccessMutex);
+        DWOName = updateDWONameCompDir(*StrOffstsWriter, *StrWriter,
+                                       NameToIndexMap, *Unit, *DIEBlder,
+                                       *DIEBlder->getUnitDIEbyUnit(*Unit));
+      }
 
       DebugLoclistWriter DebugLocDWoWriter(*Unit, Unit->getVersion(), true);
       DebugRangesSectionWriter *TempRangesSectionWriter = RangesSectionWriter;
@@ -1540,7 +1550,7 @@ CUOffsetMap DWARFRewriter::finalizeTypeSections(DIEBuilder &DIEBlder,
   for (const SectionRef &Section : Obj->sections()) {
     StringRef Contents = cantFail(Section.getContents());
     StringRef Name = cantFail(Section.getName());
-    if (Name.equals(".debug_types"))
+    if (Name == ".debug_types")
       BC.registerOrUpdateNoteSection(".debug_types", copyByteArray(Contents),
                                      Contents.size());
   }
@@ -1623,10 +1633,10 @@ void DWARFRewriter::finalizeDebugSections(
   for (const SectionRef &Secs : Obj->sections()) {
     StringRef Contents = cantFail(Secs.getContents());
     StringRef Name = cantFail(Secs.getName());
-    if (Name.equals(".debug_abbrev")) {
+    if (Name == ".debug_abbrev") {
       BC.registerOrUpdateNoteSection(".debug_abbrev", copyByteArray(Contents),
                                      Contents.size());
-    } else if (Name.equals(".debug_info")) {
+    } else if (Name == ".debug_info") {
       BC.registerOrUpdateNoteSection(".debug_info", copyByteArray(Contents),
                                      Contents.size());
     }
@@ -1761,7 +1771,7 @@ std::optional<StringRef> updateDebugData(
   };
   switch (SectionIter->second.second) {
   default: {
-    if (!SectionName.equals("debug_str.dwo"))
+    if (SectionName != "debug_str.dwo")
       errs() << "BOLT-WARNING: unsupported debug section: " << SectionName
              << "\n";
     return SectionContents;
@@ -1949,7 +1959,7 @@ void DWARFRewriter::updateDWP(DWARFUnit &CU,
       continue;
     }
 
-    if (SectionName.equals("debug_str.dwo")) {
+    if (SectionName == "debug_str.dwo") {
       CurStrSection = OutData;
     } else {
       // Since handleDebugDataPatching returned true, we already know this is
diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
index 17077b4fa248..99775ccfe38d 100644
--- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
+++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
@@ -62,6 +62,11 @@ static cl::opt<bool>
                   cl::desc("dump Linux kernel PCI fixup table"),
                   cl::init(false), cl::Hidden, cl::cat(BoltCategory));
 
+static cl::opt<bool> DumpSMPLocks("dump-smp-locks",
+                                  cl::desc("dump Linux kernel SMP locks"),
+                                  cl::init(false), cl::Hidden,
+                                  cl::cat(BoltCategory));
+
 static cl::opt<bool> DumpStaticCalls("dump-static-calls",
                                      cl::desc("dump Linux kernel static calls"),
                                      cl::init(false), cl::Hidden,
@@ -119,19 +124,18 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ORCState &E) {
 namespace {
 
 class LinuxKernelRewriter final : public MetadataRewriter {
-  /// Linux Kernel special sections point to a specific instruction in many
-  /// cases. Unlike SDTMarkerInfo, these markers can come from different
-  /// sections.
-  struct LKInstructionMarkerInfo {
-    uint64_t SectionOffset;
-    int32_t PCRelativeOffset;
-    bool IsPCRelative;
-    StringRef SectionName;
+  /// Information required for updating metadata referencing an instruction.
+  struct InstructionFixup {
+    BinarySection &Section; // Section referencing the instruction.
+    uint64_t Offset;        // Offset in the section above.
+    BinaryFunction &BF;     // Function containing the instruction.
+    MCSymbol &Label;        // Label marking the instruction.
+    bool IsPCRelative;      // If the reference type is relative.
   };
+  std::vector<InstructionFixup> Fixups;
 
-  /// Map linux kernel program locations/instructions to their pointers in
-  /// special linux kernel sections
-  std::unordered_map<uint64_t, std::vector<LKInstructionMarkerInfo>> LKMarkers;
+  /// Size of an entry in .smp_locks section.
+  static constexpr size_t SMP_LOCKS_ENTRY_SIZE = 4;
 
   /// Linux ORC sections.
   ErrorOr<BinarySection &> ORCUnwindSection = std::errc::bad_address;
@@ -221,23 +225,20 @@ class LinuxKernelRewriter final : public MetadataRewriter {
   ErrorOr<BinarySection &> PCIFixupSection = std::errc::bad_address;
   static constexpr size_t PCI_FIXUP_ENTRY_SIZE = 16;
 
-  /// Insert an LKMarker for a given code pointer \p PC from a non-code section
-  /// \p SectionName.
-  void insertLKMarker(uint64_t PC, uint64_t SectionOffset,
-                      int32_t PCRelativeOffset, bool IsPCRelative,
-                      StringRef SectionName);
-
   /// Process linux kernel special sections and their relocations.
   void processLKSections();
 
   /// Process __ksymtab and __ksymtab_gpl.
   void processLKKSymtab(bool IsGPL = false);
 
-  /// Process special linux kernel section, .smp_locks.
-  void processLKSMPLocks();
+  // Create relocations in sections requiring fixups.
+  //
+  // Make sure functions that will not be emitted are marked as such before this
+  // function is executed.
+  void processInstructionFixups();
 
-  /// Update LKMarkers' locations for the output binary.
-  void updateLKMarkers();
+  /// Process .smp_locks section.
+  Error processSMPLocks();
 
   /// Read ORC unwind information and annotate instructions.
   Error readORCTables();
@@ -282,16 +283,14 @@ class LinuxKernelRewriter final : public MetadataRewriter {
   Error rewriteStaticKeysJumpTable();
   Error updateStaticKeysJumpTablePostEmit();
 
-  /// Mark instructions referenced by kernel metadata.
-  Error markInstructions();
-
 public:
   LinuxKernelRewriter(BinaryContext &BC)
       : MetadataRewriter("linux-kernel-rewriter", BC) {}
 
   Error preCFGInitializer() override {
     processLKSections();
-    if (Error E = markInstructions())
+
+    if (Error E = processSMPLocks())
       return E;
 
     if (Error E = readORCTables())
@@ -352,12 +351,12 @@ public:
     if (Error E = rewriteBugTable())
       return E;
 
+    processInstructionFixups();
+
     return Error::success();
   }
 
   Error postEmitFinalizer() override {
-    updateLKMarkers();
-
     if (Error E = updateStaticKeysJumpTablePostEmit())
       return E;
 
@@ -368,39 +367,9 @@ public:
   }
 };
 
-Error LinuxKernelRewriter::markInstructions() {
-  for (const uint64_t PC : llvm::make_first_range(LKMarkers)) {
-    BinaryFunction *BF = BC.getBinaryFunctionContainingAddress(PC);
-
-    if (!BF || !BC.shouldEmit(*BF))
-      continue;
-
-    const uint64_t Offset = PC - BF->getAddress();
-    MCInst *Inst = BF->getInstructionAtOffset(Offset);
-    if (!Inst)
-      return createStringError(errc::executable_format_error,
-                               "no instruction matches kernel marker offset");
-
-    BC.MIB->setOffset(*Inst, static_cast<uint32_t>(Offset));
-
-    BF->setHasSDTMarker(true);
-  }
-
-  return Error::success();
-}
-
-void LinuxKernelRewriter::insertLKMarker(uint64_t PC, uint64_t SectionOffset,
-                                         int32_t PCRelativeOffset,
-                                         bool IsPCRelative,
-                                         StringRef SectionName) {
-  LKMarkers[PC].emplace_back(LKInstructionMarkerInfo{
-      SectionOffset, PCRelativeOffset, IsPCRelative, SectionName});
-}
-
 void LinuxKernelRewriter::processLKSections() {
   processLKKSymtab();
   processLKKSymtab(true);
-  processLKSMPLocks();
 }
 
 /// Process __ksymtab[_gpl] sections of Linux Kernel.
@@ -439,79 +408,73 @@ void LinuxKernelRewriter::processLKKSymtab(bool IsGPL) {
 
 /// .smp_locks section contains PC-relative references to instructions with LOCK
 /// prefix. The prefix can be converted to NOP at boot time on non-SMP systems.
-void LinuxKernelRewriter::processLKSMPLocks() {
-  ErrorOr<BinarySection &> SectionOrError =
+Error LinuxKernelRewriter::processSMPLocks() {
+  ErrorOr<BinarySection &> SMPLocksSection =
       BC.getUniqueSectionByName(".smp_locks");
-  if (!SectionOrError)
-    return;
+  if (!SMPLocksSection)
+    return Error::success();
 
-  uint64_t SectionSize = SectionOrError->getSize();
-  const uint64_t SectionAddress = SectionOrError->getAddress();
-  assert((SectionSize % 4) == 0 &&
-         "The size of the .smp_locks section should be a multiple of 4");
+  const uint64_t SectionSize = SMPLocksSection->getSize();
+  const uint64_t SectionAddress = SMPLocksSection->getAddress();
+  if (SectionSize % SMP_LOCKS_ENTRY_SIZE)
+    return createStringError(errc::executable_format_error,
+                             "bad size of .smp_locks section");
 
-  for (uint64_t I = 0; I < SectionSize; I += 4) {
-    const uint64_t EntryAddress = SectionAddress + I;
-    ErrorOr<uint64_t> Offset = BC.getSignedValueAtAddress(EntryAddress, 4);
-    assert(Offset && "Reading valid PC-relative offset for a .smp_locks entry");
-    int32_t SignedOffset = *Offset;
-    uint64_t RefAddress = EntryAddress + SignedOffset;
+  DataExtractor DE = DataExtractor(SMPLocksSection->getContents(),
+                                   BC.AsmInfo->isLittleEndian(),
+                                   BC.AsmInfo->getCodePointerSize());
+  DataExtractor::Cursor Cursor(0);
+  while (Cursor && Cursor.tell() < SectionSize) {
+    const uint64_t Offset = Cursor.tell();
+    const uint64_t IP = SectionAddress + Offset + (int32_t)DE.getU32(Cursor);
+
+    // Consume the status of the cursor.
+    if (!Cursor)
+      return createStringError(errc::executable_format_error,
+                               "error while reading .smp_locks: %s",
+                               toString(Cursor.takeError()).c_str());
+
+    if (opts::DumpSMPLocks)
+      BC.outs() << "SMP lock at 0x: " << Twine::utohexstr(IP) << '\n';
 
-    BinaryFunction *ContainingBF =
-        BC.getBinaryFunctionContainingAddress(RefAddress);
-    if (!ContainingBF)
+    BinaryFunction *BF = BC.getBinaryFunctionContainingAddress(IP);
+    if (!BF || !BC.shouldEmit(*BF))
       continue;
 
-    insertLKMarker(RefAddress, I, SignedOffset, true, ".smp_locks");
-  }
-}
+    MCInst *Inst = BF->getInstructionAtOffset(IP - BF->getAddress());
+    if (!Inst)
+      return createStringError(errc::executable_format_error,
+                               "no instruction matches lock at 0x%" PRIx64, IP);
 
-void LinuxKernelRewriter::updateLKMarkers() {
-  if (LKMarkers.size() == 0)
-    return;
+    // Check for duplicate entries.
+    if (BC.MIB->hasAnnotation(*Inst, "SMPLock"))
+      return createStringError(errc::executable_format_error,
+                               "duplicate SMP lock at 0x%" PRIx64, IP);
 
-  std::unordered_map<std::string, uint64_t> PatchCounts;
-  for (std::pair<const uint64_t, std::vector<LKInstructionMarkerInfo>>
-           &LKMarkerInfoKV : LKMarkers) {
-    const uint64_t OriginalAddress = LKMarkerInfoKV.first;
-    const BinaryFunction *BF =
-        BC.getBinaryFunctionContainingAddress(OriginalAddress, false, true);
-    if (!BF)
-      continue;
+    BC.MIB->addAnnotation(*Inst, "SMPLock", true);
+    MCSymbol *Label =
+        BC.MIB->getOrCreateInstLabel(*Inst, "__SMPLock_", BC.Ctx.get());
 
-    uint64_t NewAddress = BF->translateInputToOutputAddress(OriginalAddress);
-    if (NewAddress == 0)
-      continue;
+    Fixups.push_back({*SMPLocksSection, Offset, *BF, *Label,
+                      /*IsPCRelative*/ true});
+  }
 
-    // Apply base address.
-    if (OriginalAddress >= 0xffffffff00000000 && NewAddress < 0xffffffff)
-      NewAddress = NewAddress + 0xffffffff00000000;
+  const uint64_t NumEntries = SectionSize / SMP_LOCKS_ENTRY_SIZE;
+  BC.outs() << "BOLT-INFO: parsed " << NumEntries << " SMP lock entries\n";
 
-    if (OriginalAddress == NewAddress)
+  return Error::success();
+}
+
+void LinuxKernelRewriter::processInstructionFixups() {
+  for (InstructionFixup &Fixup : Fixups) {
+    if (!BC.shouldEmit(Fixup.BF))
       continue;
 
-    for (LKInstructionMarkerInfo &LKMarkerInfo : LKMarkerInfoKV.second) {
-      StringRef SectionName = LKMarkerInfo.SectionName;
-      SimpleBinaryPatcher *LKPatcher;
-      ErrorOr<BinarySection &> BSec = BC.getUniqueSectionByName(SectionName);
-      assert(BSec && "missing section info for kernel section");
-      if (!BSec->getPatcher())
-        BSec->registerPatcher(std::make_unique<SimpleBinaryPatcher>());
-      LKPatcher = static_cast<SimpleBinaryPatcher *>(BSec->getPatcher());
-      PatchCounts[std::string(SectionName)]++;
-      if (LKMarkerInfo.IsPCRelative)
-        LKPatcher->addLE32Patch(LKMarkerInfo.SectionOffset,
-                                NewAddress - OriginalAddress +
-                                    LKMarkerInfo.PCRelativeOffset);
-      else
-        LKPatcher->addLE64Patch(LKMarkerInfo.SectionOffset, NewAddress);
-    }
+    Fixup.Section.addRelocation(Fixup.Offset, &Fixup.Label,
+                                Fixup.IsPCRelative ? ELF::R_X86_64_PC32
+                                                   : ELF::R_X86_64_64,
+                                /*Addend*/ 0);
   }
-  BC.outs() << "BOLT-INFO: patching linux kernel sections. Total patches per "
-               "section are as follows:\n";
-  for (const std::pair<const std::string, uint64_t> &KV : PatchCounts)
-    BC.outs() << "  Section: " << KV.first << ", patch-counts: " << KV.second
-              << '\n';
 }
 
 Error LinuxKernelRewriter::readORCTables() {
@@ -783,11 +746,9 @@ Error LinuxKernelRewriter::rewriteORCTables() {
   };
 
   // Emit new ORC entries for the emitted function.
-  auto emitORC = [&](const BinaryFunction &BF) -> Error {
-    assert(!BF.isSplit() && "Split functions not supported by ORC writer yet.");
-
+  auto emitORC = [&](const FunctionFragment &FF) -> Error {
     ORCState CurrentState = NullORC;
-    for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
+    for (BinaryBasicBlock *BB : FF) {
       for (MCInst &Inst : *BB) {
         ErrorOr<ORCState> ErrorOrState =
             BC.MIB->tryGetAnnotationAs<ORCState>(Inst, "ORC");
@@ -808,7 +769,36 @@ Error LinuxKernelRewriter::rewriteORCTables() {
     return Error::success();
   };
 
+  // Emit ORC entries for cold fragments. We assume that these fragments are
+  // emitted contiguously in memory using reserved space in the kernel. This
+  // assumption is validated in post-emit pass validateORCTables() where we
+  // check that ORC entries are sorted by their addresses.
+  auto emitColdORC = [&]() -> Error {
+    for (BinaryFunction &BF :
+         llvm::make_second_range(BC.getBinaryFunctions())) {
+      if (!BC.shouldEmit(BF))
+        continue;
+      for (FunctionFragment &FF : BF.getLayout().getSplitFragments())
+        if (Error E = emitORC(FF))
+          return E;
+    }
+
+    return Error::success();
+  };
+
+  bool ShouldEmitCold = !BC.BOLTReserved.empty();
   for (ORCListEntry &Entry : ORCEntries) {
+    if (ShouldEmitCold && Entry.IP > BC.BOLTReserved.start()) {
+      if (Error E = emitColdORC())
+        return E;
+
+      // Emit terminator entry at the end of the reserved region.
+      if (Error E = emitORCEntry(BC.BOLTReserved.end(), NullORC))
+        return E;
+
+      ShouldEmitCold = false;
+    }
+
     // Emit original entries for functions that we haven't modified.
     if (!Entry.BF || !BC.shouldEmit(*Entry.BF)) {
       // Emit terminator only if it marks the start of a function.
@@ -822,7 +812,7 @@ Error LinuxKernelRewriter::rewriteORCTables() {
     // Emit all ORC entries for a function referenced by an entry and skip over
     // the rest of entries for this function by resetting its ORC attribute.
     if (Entry.BF->hasORC()) {
-      if (Error E = emitORC(*Entry.BF))
+      if (Error E = emitORC(Entry.BF->getLayout().getMainFragment()))
         return E;
       Entry.BF->setHasORC(false);
     }
@@ -831,10 +821,9 @@ Error LinuxKernelRewriter::rewriteORCTables() {
   LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitted " << NumEmitted
                     << " ORC entries\n");
 
-  // Replicate terminator entry at the end of sections to match the original
-  // table sizes.
-  const BinaryFunction &LastBF = BC.getBinaryFunctions().rbegin()->second;
-  const uint64_t LastIP = LastBF.getAddress() + LastBF.getMaxSize();
+  // Populate ORC tables with a terminator entry with max address to match the
+  // original table sizes.
+  const uint64_t LastIP = std::numeric_limits<uint64_t>::max();
   while (UnwindWriter.bytesRemaining()) {
     if (Error E = emitORCEntry(LastIP, NullORC, nullptr, /*Force*/ true))
       return E;
@@ -1696,6 +1685,9 @@ Error LinuxKernelRewriter::readStaticKeysJumpTable() {
     if (!BC.MIB->getSize(*Inst))
       BC.MIB->setSize(*Inst, Size);
 
+    if (!BC.MIB->getOffset(*Inst))
+      BC.MIB->setOffset(*Inst, JumpAddress - BF->getAddress());
+
     if (opts::LongJumpLabels)
       BC.MIB->setSize(*Inst, 5);
   }
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 23f79e3c135a..85b39176754b 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -1347,6 +1347,35 @@ void RewriteInstance::discoverFileObjects() {
 
   registerFragments();
   FileSymbols.clear();
+
+  discoverBOLTReserved();
+}
+
+void RewriteInstance::discoverBOLTReserved() {
+  BinaryData *StartBD = BC->getBinaryDataByName(getBOLTReservedStart());
+  BinaryData *EndBD = BC->getBinaryDataByName(getBOLTReservedEnd());
+  if (!StartBD != !EndBD) {
+    BC->errs() << "BOLT-ERROR: one of the symbols is missing from the binary: "
+               << getBOLTReservedStart() << ", " << getBOLTReservedEnd()
+               << '\n';
+    exit(1);
+  }
+
+  if (!StartBD)
+    return;
+
+  if (StartBD->getAddress() >= EndBD->getAddress()) {
+    BC->errs() << "BOLT-ERROR: invalid reserved space boundaries\n";
+    exit(1);
+  }
+  BC->BOLTReserved = AddressRange(StartBD->getAddress(), EndBD->getAddress());
+  BC->outs() << "BOLT-INFO: using reserved space for allocating new sections\n";
+
+  PHDRTableOffset = 0;
+  PHDRTableAddress = 0;
+  NewTextSegmentAddress = 0;
+  NewTextSegmentOffset = 0;
+  NextAvailableAddress = BC->BOLTReserved.start();
 }
 
 Error RewriteInstance::discoverRtFiniAddress() {
@@ -3617,26 +3646,6 @@ void RewriteInstance::updateMetadata() {
 void RewriteInstance::mapFileSections(BOLTLinker::SectionMapper MapSection) {
   BC->deregisterUnusedSections();
 
-  // Check if the input has a space reserved for BOLT.
-  BinaryData *StartBD = BC->getBinaryDataByName(getBOLTReservedStart());
-  BinaryData *EndBD = BC->getBinaryDataByName(getBOLTReservedEnd());
-  if (!StartBD != !EndBD) {
-    BC->errs() << "BOLT-ERROR: one of the symbols is missing from the binary: "
-               << getBOLTReservedStart() << ", " << getBOLTReservedEnd()
-               << '\n';
-    exit(1);
-  }
-
-  if (StartBD) {
-    PHDRTableOffset = 0;
-    PHDRTableAddress = 0;
-    NewTextSegmentAddress = 0;
-    NewTextSegmentOffset = 0;
-    NextAvailableAddress = StartBD->getAddress();
-    BC->outs()
-        << "BOLT-INFO: using reserved space for allocating new sections\n";
-  }
-
   // If no new .eh_frame was written, remove relocated original .eh_frame.
   BinarySection *RelocatedEHFrameSection =
       getSection(".relocated" + getEHFrameSectionName());
@@ -3657,12 +3666,12 @@ void RewriteInstance::mapFileSections(BOLTLinker::SectionMapper MapSection) {
   // Map the rest of the sections.
   mapAllocatableSections(MapSection);
 
-  if (StartBD) {
-    const uint64_t ReservedSpace = EndBD->getAddress() - StartBD->getAddress();
-    const uint64_t AllocatedSize = NextAvailableAddress - StartBD->getAddress();
-    if (ReservedSpace < AllocatedSize) {
-      BC->errs() << "BOLT-ERROR: reserved space (" << ReservedSpace << " byte"
-                 << (ReservedSpace == 1 ? "" : "s")
+  if (!BC->BOLTReserved.empty()) {
+    const uint64_t AllocatedSize =
+        NextAvailableAddress - BC->BOLTReserved.start();
+    if (BC->BOLTReserved.size() < AllocatedSize) {
+      BC->errs() << "BOLT-ERROR: reserved space (" << BC->BOLTReserved.size()
+                 << " byte" << (BC->BOLTReserved.size() == 1 ? "" : "s")
                  << ") is smaller than required for new allocations ("
                  << AllocatedSize << " bytes)\n";
       exit(1);
@@ -5852,13 +5861,11 @@ void RewriteInstance::writeEHFrameHeader() {
 
   NextAvailableAddress += EHFrameHdrSec.getOutputSize();
 
-  if (const BinaryData *ReservedEnd =
-          BC->getBinaryDataByName(getBOLTReservedEnd())) {
-    if (NextAvailableAddress > ReservedEnd->getAddress()) {
-      BC->errs() << "BOLT-ERROR: unable to fit " << getEHFrameHdrSectionName()
-                 << " into reserved space\n";
-      exit(1);
-    }
+  if (!BC->BOLTReserved.empty() &&
+      (NextAvailableAddress > BC->BOLTReserved.end())) {
+    BC->errs() << "BOLT-ERROR: unable to fit " << getEHFrameHdrSectionName()
+               << " into reserved space\n";
+    exit(1);
   }
 
   // Merge new .eh_frame with the relocated original so that gdb can locate all
@@ -5892,7 +5899,7 @@ uint64_t RewriteInstance::getNewValueForSymbol(const StringRef Name) {
 
 uint64_t RewriteInstance::getFileOffsetForAddress(uint64_t Address) const {
   // Check if it's possibly part of the new segment.
-  if (Address >= NewTextSegmentAddress)
+  if (NewTextSegmentAddress && Address >= NewTextSegmentAddress)
     return Address - NewTextSegmentAddress + NewTextSegmentOffset;
 
   // Find an existing segment that matches the address.
diff --git a/bolt/lib/Rewrite/SDTRewriter.cpp b/bolt/lib/Rewrite/SDTRewriter.cpp
index cc663b28990f..a3928c554ad6 100644
--- a/bolt/lib/Rewrite/SDTRewriter.cpp
+++ b/bolt/lib/Rewrite/SDTRewriter.cpp
@@ -87,7 +87,7 @@ void SDTRewriter::readSection() {
 
     StringRef Name = DE.getCStr(&Offset);
 
-    if (!Name.equals("stapsdt"))
+    if (Name != "stapsdt")
       errs() << "BOLT-WARNING: SDT note name \"" << Name
              << "\" is not expected\n";
 
diff --git a/bolt/test/X86/Inputs/blarge_new_bat_branchentry.preagg.txt b/bolt/test/X86/Inputs/blarge_new_bat_branchentry.preagg.txt
new file mode 100644
index 000000000000..546da92f94db
--- /dev/null
+++ b/bolt/test/X86/Inputs/blarge_new_bat_branchentry.preagg.txt
@@ -0,0 +1 @@
+B 80010c 800194 1 0
diff --git a/bolt/test/X86/Inputs/jump-table-fixed-ref-pic.s b/bolt/test/X86/Inputs/jump-table-fixed-ref-pic.s
new file mode 100644
index 000000000000..66629a4880e6
--- /dev/null
+++ b/bolt/test/X86/Inputs/jump-table-fixed-ref-pic.s
@@ -0,0 +1,35 @@
+  .globl main
+  .type main, %function
+main:
+  .cfi_startproc
+  cmpq $0x3, %rdi
+  jae .L4
+  cmpq $0x1, %rdi
+  jne .L4
+  mov .Ljt_pic+8(%rip), %rax
+  lea .Ljt_pic(%rip), %rdx
+  add %rdx, %rax
+  jmpq *%rax
+.L1:
+  movq $0x1, %rax
+  jmp .L5
+.L2:
+  movq $0x0, %rax
+  jmp .L5
+.L3:
+  movq $0x2, %rax
+  jmp .L5
+.L4:
+  mov $0x3, %rax
+.L5:
+  retq
+  .cfi_endproc
+
+  .section .rodata
+  .align 16
+.Ljt_pic:
+  .long .L1 - .Ljt_pic
+  .long .L2 - .Ljt_pic
+  .long .L3 - .Ljt_pic
+  .long .L4 - .Ljt_pic
+
diff --git a/bolt/test/X86/bolt-address-translation-yaml.test b/bolt/test/X86/bolt-address-translation-yaml.test
index af24c3d84a0f..c15d6ce15ed0 100644
--- a/bolt/test/X86/bolt-address-translation-yaml.test
+++ b/bolt/test/X86/bolt-address-translation-yaml.test
@@ -5,6 +5,17 @@ RUN: llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt \
 RUN:   --reorder-blocks=ext-tsp --split-functions --split-strategy=cdsplit \
 RUN:   --reorder-functions=cdsort --enable-bat --dyno-stats --skip-funcs=main \
 RUN:   2>&1 | FileCheck --check-prefix WRITE-BAT-CHECK %s
+# Check that branch with entry in BAT is accounted for.
+RUN: perf2bolt %t.out --pa -p %p/Inputs/blarge_new_bat_branchentry.preagg.txt \
+RUN:   -w %t.yaml -o %t.fdata
+RUN: llvm-bolt %t.exe -data %t.fdata -w %t.yaml-fdata -o %t.null
+RUN: FileCheck --input-file %t.yaml --check-prefix BRANCHENTRY-YAML-CHECK %s
+RUN: FileCheck --input-file %t.yaml-fdata --check-prefix BRANCHENTRY-YAML-CHECK %s
+BRANCHENTRY-YAML-CHECK:    - name: SolveCubic
+BRANCHENTRY-YAML-CHECK:      bid: 0
+BRANCHENTRY-YAML-CHECK:      hash: 0x700F19D24600000
+BRANCHENTRY-YAML-CHECK-NEXT: succ: [ { bid: 7, cnt: 1 }
+# Large profile test
 RUN: perf2bolt %t.out --pa -p %p/Inputs/blarge_new_bat.preagg.txt -w %t.yaml -o %t.fdata \
 RUN:   2>&1 | FileCheck --check-prefix READ-BAT-CHECK %s
 RUN: FileCheck --input-file %t.yaml --check-prefix YAML-BAT-CHECK %s
@@ -13,7 +24,7 @@ RUN: llvm-bolt %t.exe -data %t.fdata -w %t.yaml-fdata -o /dev/null
 RUN: FileCheck --input-file %t.yaml-fdata --check-prefix YAML-BAT-CHECK %s
 
 # Test resulting YAML profile with the original binary (no-stale mode)
-RUN: llvm-bolt %t.exe -data %t.yaml -o %t.null -dyno-stats \
+RUN: llvm-bolt %t.exe -data %t.yaml -o %t.null -dyno-stats 2>&1 \
 RUN:   | FileCheck --check-prefix CHECK-BOLT-YAML %s
 
 WRITE-BAT-CHECK: BOLT-INFO: Wrote 5 BAT maps
@@ -48,6 +59,10 @@ YAML-BAT-CHECK-NEXT:   hash:    0x6AF7E61EA3966722
 YAML-BAT-CHECK-NEXT:   exec:    25
 YAML-BAT-CHECK-NEXT:   nblocks: 15
 YAML-BAT-CHECK-NEXT:   blocks:
+YAML-BAT-CHECK-NEXT:   - bid:   0
+YAML-BAT-CHECK-NEXT:     insns: [[#]]
+YAML-BAT-CHECK-NEXT:     hash:  0x700F19D24600000
+YAML-BAT-CHECK-NEXT:     exec:  25
 YAML-BAT-CHECK:        - bid:   3
 YAML-BAT-CHECK-NEXT:     insns: [[#]]
 YAML-BAT-CHECK-NEXT:     hash:  0xDDA1DC5F69F900AC
@@ -63,7 +78,8 @@ YAML-BAT-CHECK-NEXT:   blocks:
 YAML-BAT-CHECK:        - bid:   1
 YAML-BAT-CHECK-NEXT:       insns: [[#]]
 YAML-BAT-CHECK-NEXT:       hash:  0xD70DC695320E0010
-YAML-BAT-CHECK-NEXT:       succ:  {{.*}} { bid: 2, cnt: [[#]] }
+YAML-BAT-CHECK-NEXT:       succ:  {{.*}} { bid: 2, cnt: [[#]]
 
 CHECK-BOLT-YAML:      pre-processing profile using YAML profile reader
 CHECK-BOLT-YAML-NEXT: 5 out of 16 functions in the binary (31.2%) have non-empty execution profile
+CHECK-BOLT-YAML-NOT: invalid (possibly stale) profile
diff --git a/bolt/test/X86/jump-table-fixed-ref-pic.test b/bolt/test/X86/jump-table-fixed-ref-pic.test
new file mode 100644
index 000000000000..4195b97aac50
--- /dev/null
+++ b/bolt/test/X86/jump-table-fixed-ref-pic.test
@@ -0,0 +1,9 @@
+# Verify that BOLT detects fixed destination of indirect jump for PIC
+# case.
+
+XFAIL: *
+
+RUN: %clang %cflags -no-pie %S/Inputs/jump-table-fixed-ref-pic.s -Wl,-q -o %t
+RUN: llvm-bolt %t --relocs -o %t.null 2>&1 | FileCheck %s
+
+CHECK: BOLT-INFO: fixed indirect branch detected in main
diff --git a/bolt/test/X86/linux-smp-locks.s b/bolt/test/X86/linux-smp-locks.s
new file mode 100644
index 000000000000..5f4410d14fc6
--- /dev/null
+++ b/bolt/test/X86/linux-smp-locks.s
@@ -0,0 +1,40 @@
+# REQUIRES: system-linux
+
+## Check that BOLT correctly parses and updates the Linux kernel .smp_locks
+## section.
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags -nostdlib %t.o -o %t.exe \
+# RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr,--no-pie
+# RUN: llvm-bolt %t.exe --print-normalized --keep-nops=0 --bolt-info=0 -o %t.out \
+# RUN:   |& FileCheck %s
+
+## Check the output of BOLT with NOPs removed.
+
+# RUN: llvm-bolt %t.out -o %t.out.1 --print-normalized |& FileCheck %s
+
+# CHECK:      BOLT-INFO: Linux kernel binary detected
+# CHECK:      BOLT-INFO: parsed 2 SMP lock entries
+
+  .text
+  .globl _start
+  .type _start, %function
+_start:
+  nop
+  nop
+.L0:
+  lock incl (%rdi)
+# CHECK: lock {{.*}} SMPLock
+.L1:
+  lock orb $0x40, 0x4(%rsi)
+# CHECK: lock {{.*}} SMPLock
+  ret
+  .size _start, .-_start
+
+  .section .smp_locks,"a",@progbits
+  .long .L0 - .
+  .long .L1 - .
+
+## Fake Linux Kernel sections.
+  .section __ksymtab,"a",@progbits
+  .section __ksymtab_gpl,"a",@progbits
diff --git a/bolt/test/X86/linux-static-keys.s b/bolt/test/X86/linux-static-keys.s
index 08454bf97631..fb419e0f7627 100644
--- a/bolt/test/X86/linux-static-keys.s
+++ b/bolt/test/X86/linux-static-keys.s
@@ -3,6 +3,8 @@
 ## Check that BOLT correctly updates the Linux kernel static keys jump table.
 
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
 # RUN: %clang %cflags -nostdlib %t.o -o %t.exe \
 # RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr
 
@@ -11,6 +13,12 @@
 # RUN: llvm-bolt %t.exe --print-normalized -o %t.out --keep-nops=0 \
 # RUN:   --bolt-info=0 |& FileCheck %s
 
+## Verify that profile is matched correctly.
+
+# RUN: llvm-bolt %t.exe --print-normalized -o %t.out --keep-nops=0 \
+# RUN:   --bolt-info=0 --data %t.fdata |& \
+# RUN:   FileCheck --check-prefix=CHECK-FDATA %s
+
 ## Verify the bindings again on the rewritten binary with nops removed.
 
 # RUN: llvm-bolt %t.out -o %t.out.1 --print-normalized |& FileCheck %s
@@ -25,15 +33,24 @@ _start:
 # CHECK: Binary Function "_start"
   nop
 .L0:
-  jmp .L1
+  jmp L1
 # CHECK:      jit
 # CHECK-SAME: # ID: 1 {{.*}} # Likely: 0 # InitValue: 1
   nop
-.L1:
+L1:
   .nops 5
+  jmp .L0
 # CHECK:      jit
 # CHECK-SAME: # ID: 2 {{.*}} # Likely: 1 # InitValue: 1
-.L2:
+
+## Check that a branch profile associated with a NOP is handled properly when
+## dynamic branch is created.
+
+# FDATA: 1 _start #L1# 1 _start #L2# 3 42
+# CHECK-FDATA: jit {{.*}} # ID: 2
+# CHECK-FDATA-NEXT: jmp
+# CHECK-FDATA-NEXT: Successors: {{.*}}  (mispreds: 3, count: 42)
+L2:
   nop
   .size _start, .-_start
 
@@ -51,11 +68,11 @@ foo:
 __start___jump_table:
 
   .long .L0 - . # Jump address
-  .long .L1 - . # Target address
+  .long L1 - . # Target address
   .quad 1       # Key address
 
-  .long .L1 - . # Jump address
-  .long .L2 - . # Target address
+  .long L1 - . # Jump address
+  .long L2 - . # Target address
   .quad 0       # Key address
 
   .globl __stop___jump_table
diff --git a/bolt/test/X86/register-fragments-bolt-symbols.s b/bolt/test/X86/register-fragments-bolt-symbols.s
index fa9b70e0b2d8..6478adf19372 100644
--- a/bolt/test/X86/register-fragments-bolt-symbols.s
+++ b/bolt/test/X86/register-fragments-bolt-symbols.s
@@ -15,6 +15,8 @@
 # PREAGG: B X:0 #chain.cold.0# 1 0
 # RUN: perf2bolt %t.bolt -p %t.preagg --pa -o %t.bat.fdata -w %t.bat.yaml -v=1 \
 # RUN:   | FileCheck %s --check-prefix=CHECK-REGISTER
+# RUN: FileCheck --input-file %t.bat.fdata --check-prefix=CHECK-FDATA %s
+# RUN: FileCheck --input-file %t.bat.yaml --check-prefix=CHECK-YAML %s
 
 # CHECK-SYMS: l df *ABS*          [[#]] chain.s
 # CHECK-SYMS: l  F .bolt.org.text [[#]] chain
@@ -24,6 +26,9 @@
 
 # CHECK-REGISTER: BOLT-INFO: marking chain.cold.0/1(*2) as a fragment of chain/2(*2)
 
+# CHECK-FDATA: 0 [unknown] 0 1 chain/chain.s/2 10 0 1
+# CHECK-YAML: - name: 'chain/chain.s/2'
+
 .file "chain.s"
         .text
         .type   chain, @function
diff --git a/bolt/test/X86/sctc-bug4.test b/bolt/test/X86/sctc-bug4.test
index 00f5ee429b63..92aca5110059 100644
--- a/bolt/test/X86/sctc-bug4.test
+++ b/bolt/test/X86/sctc-bug4.test
@@ -1,20 +1,23 @@
-# Check that fallthrough blocks are handled properly.
+# Check that fallthrough blocks are handled properly and Offset annotation is
+# set for conditional tail calls.
 
 RUN: %clang %cflags %S/Inputs/sctc_bug4.s -o %t
-RUN: llvm-bolt %t -o %t.null \
+RUN: llvm-bolt %t -o %t.null --enable-bat \
 RUN:   -funcs=test_func -print-sctc -sequential-disassembly 2>&1 | FileCheck %s
 
 CHECK:      .Ltmp2 (3 instructions, align : 1)
 CHECK-NEXT:   CFI State : 0
+CHECK-NEXT:   Input offset: 0x24
 CHECK-NEXT:   Predecessors: .LFT1
 CHECK-NEXT:     00000024: 	cmpq	$0x20, %rsi
-CHECK-NEXT:     00000028: 	ja	dummy # TAILCALL {{.*}}# CTCTakenCount: 0
+CHECK-NEXT:     00000028: 	ja	dummy # TAILCALL # Offset: 53 # CTCTakenCount: 0
 CHECK-NEXT:     0000002a: 	jmp .Ltmp4
 CHECK-NEXT:   Successors: .Ltmp4
 CHECK-NEXT:   CFI State: 0
 
 CHECK:      .Ltmp1 (2 instructions, align : 1)
 CHECK-NEXT:   CFI State : 0
+CHECK-NEXT:   Input offset: 0x2c
 CHECK-NEXT:   Predecessors: .LFT0
 CHECK-NEXT:     0000002c: 	xorq	%r11, %rax
 CHECK-NEXT:     0000002f: 	retq
@@ -22,4 +25,5 @@ CHECK-NEXT:   CFI State: 0
 
 CHECK:      .Ltmp4 (4 instructions, align : 1)
 CHECK-NEXT:  CFI State : 0
+CHECK-NEXT:  Input offset: 0x3a
 CHECK-NEXT:  Predecessors: .Ltmp2
diff --git a/bolt/test/runtime/bolt-reserved.cpp b/bolt/test/runtime/bolt-reserved.cpp
new file mode 100644
index 000000000000..c88b1e284d07
--- /dev/null
+++ b/bolt/test/runtime/bolt-reserved.cpp
@@ -0,0 +1,40 @@
+// REQUIRES: system-linux
+
+/*
+ * Check that llvm-bolt uses reserved space in a binary for allocating
+ * new sections.
+ */
+
+// RUN: %clang %s -o %t.exe -Wl,-q
+// RUN: llvm-bolt %t.exe -o %t.bolt.exe 2>&1 | FileCheck %s
+// RUN: %t.bolt.exe
+
+// CHECK: BOLT-INFO: using reserved space
+
+/*
+ * Check that llvm-bolt detects a condition when the reserved space is
+ * not enough for allocating new sections.
+ */
+
+// RUN: %clang %s -o %t.tiny.exe -Wl,--no-eh-frame-hdr -Wl,-q -DTINY
+// RUN: not llvm-bolt %t.tiny.exe -o %t.tiny.bolt.exe 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CHECK-TINY
+
+// CHECK-TINY: BOLT-ERROR: reserved space (1 byte) is smaller than required
+
+#ifdef TINY
+#define RSIZE "1"
+#else
+#define RSIZE "8192 * 1024"
+#endif
+
+asm(".pushsection .text \n\
+       .globl __bolt_reserved_start \n\
+       .type __bolt_reserved_start, @object \n\
+       __bolt_reserved_start: \n\
+       .space " RSIZE " \n\
+       .globl __bolt_reserved_end \n\
+       __bolt_reserved_end: \n\
+     .popsection");
+
+int main() { return 0; }
diff --git a/clang-tools-extra/clang-include-fixer/find-all-symbols/STLPostfixHeaderMap.cpp b/clang-tools-extra/clang-include-fixer/find-all-symbols/STLPostfixHeaderMap.cpp
index df77bf7ea46d..469323f0ee9d 100644
--- a/clang-tools-extra/clang-include-fixer/find-all-symbols/STLPostfixHeaderMap.cpp
+++ b/clang-tools-extra/clang-include-fixer/find-all-symbols/STLPostfixHeaderMap.cpp
@@ -15,9 +15,11 @@ const HeaderMapCollector::RegexHeaderMap *getSTLPostfixHeaderMap() {
   static const HeaderMapCollector::RegexHeaderMap STLPostfixHeaderMap = {
       {"include/__stdarg___gnuc_va_list.h$", "<cstdarg>"},
       {"include/__stdarg___va_copy.h$", "<cstdarg>"},
+      {"include/__stdarg_header_macro.h$", "<cstdarg>"},
       {"include/__stdarg_va_arg.h$", "<cstdarg>"},
       {"include/__stdarg_va_copy.h$", "<cstdarg>"},
       {"include/__stdarg_va_list.h$", "<cstdarg>"},
+      {"include/__stddef_header_macro.h$", "<cstddef>"},
       {"include/__stddef_max_align_t.h$", "<cstddef>"},
       {"include/__stddef_null.h$", "<cstddef>"},
       {"include/__stddef_nullptr_t.h$", "<cstddef>"},
diff --git a/clang-tools-extra/clang-query/Query.cpp b/clang-tools-extra/clang-query/Query.cpp
index c436d6fa9498..9d5807a52fa8 100644
--- a/clang-tools-extra/clang-query/Query.cpp
+++ b/clang-tools-extra/clang-query/Query.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Query.h"
+#include "QueryParser.h"
 #include "QuerySession.h"
 #include "clang/AST/ASTDumper.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
@@ -281,5 +282,26 @@ const QueryKind SetQueryKind<bool>::value;
 const QueryKind SetQueryKind<OutputKind>::value;
 #endif
 
+bool FileQuery::run(llvm::raw_ostream &OS, QuerySession &QS) const {
+  auto Buffer = llvm::MemoryBuffer::getFile(StringRef{File}.trim());
+  if (!Buffer) {
+    if (Prefix.has_value())
+      llvm::errs() << *Prefix << ": ";
+    llvm::errs() << "cannot open " << File << ": "
+                 << Buffer.getError().message() << "\n";
+    return false;
+  }
+
+  StringRef FileContentRef(Buffer.get()->getBuffer());
+
+  while (!FileContentRef.empty()) {
+    QueryRef Q = QueryParser::parse(FileContentRef, QS);
+    if (!Q->run(llvm::outs(), QS))
+      return false;
+    FileContentRef = Q->RemainingContent;
+  }
+  return true;
+}
+
 } // namespace query
 } // namespace clang
diff --git a/clang-tools-extra/clang-query/Query.h b/clang-tools-extra/clang-query/Query.h
index 7aefa6bb5ee0..7242479633c2 100644
--- a/clang-tools-extra/clang-query/Query.h
+++ b/clang-tools-extra/clang-query/Query.h
@@ -30,7 +30,8 @@ enum QueryKind {
   QK_SetTraversalKind,
   QK_EnableOutputKind,
   QK_DisableOutputKind,
-  QK_Quit
+  QK_Quit,
+  QK_File
 };
 
 class QuerySession;
@@ -188,6 +189,21 @@ struct DisableOutputQuery : SetNonExclusiveOutputQuery {
   }
 };
 
+struct FileQuery : Query {
+  FileQuery(StringRef File, StringRef Prefix = StringRef())
+      : Query(QK_File), File(File),
+        Prefix(!Prefix.empty() ? std::optional<std::string>(Prefix)
+                               : std::nullopt) {}
+
+  bool run(llvm::raw_ostream &OS, QuerySession &QS) const override;
+
+  static bool classof(const Query *Q) { return Q->Kind == QK_File; }
+
+private:
+  std::string File;
+  std::optional<std::string> Prefix;
+};
+
 } // namespace query
 } // namespace clang
 
diff --git a/clang-tools-extra/clang-query/QueryParser.cpp b/clang-tools-extra/clang-query/QueryParser.cpp
index 162acc1a598d..85a442bdd7de 100644
--- a/clang-tools-extra/clang-query/QueryParser.cpp
+++ b/clang-tools-extra/clang-query/QueryParser.cpp
@@ -183,7 +183,8 @@ enum ParsedQueryKind {
   PQK_Unlet,
   PQK_Quit,
   PQK_Enable,
-  PQK_Disable
+  PQK_Disable,
+  PQK_File
 };
 
 enum ParsedQueryVariable {
@@ -222,12 +223,14 @@ QueryRef QueryParser::doParse() {
                               .Case("let", PQK_Let)
                               .Case("m", PQK_Match, /*IsCompletion=*/false)
                               .Case("match", PQK_Match)
-                              .Case("q", PQK_Quit,  /*IsCompletion=*/false)
+                              .Case("q", PQK_Quit, /*IsCompletion=*/false)
                               .Case("quit", PQK_Quit)
                               .Case("set", PQK_Set)
                               .Case("enable", PQK_Enable)
                               .Case("disable", PQK_Disable)
                               .Case("unlet", PQK_Unlet)
+                              .Case("f", PQK_File, /*IsCompletion=*/false)
+                              .Case("file", PQK_File)
                               .Default(PQK_Invalid);
 
   switch (QKind) {
@@ -351,6 +354,9 @@ QueryRef QueryParser::doParse() {
     return endQuery(new LetQuery(Name, VariantValue()));
   }
 
+  case PQK_File:
+    return new FileQuery(Line);
+
   case PQK_Invalid:
     return new InvalidQuery("unknown command: " + CommandStr);
   }
diff --git a/clang-tools-extra/clang-query/tool/ClangQuery.cpp b/clang-tools-extra/clang-query/tool/ClangQuery.cpp
index da7ac2701448..a2de7a2dced8 100644
--- a/clang-tools-extra/clang-query/tool/ClangQuery.cpp
+++ b/clang-tools-extra/clang-query/tool/ClangQuery.cpp
@@ -74,22 +74,8 @@ static cl::opt<std::string> PreloadFile(
 
 bool runCommandsInFile(const char *ExeName, std::string const &FileName,
                        QuerySession &QS) {
-  auto Buffer = llvm::MemoryBuffer::getFile(FileName);
-  if (!Buffer) {
-    llvm::errs() << ExeName << ": cannot open " << FileName << ": "
-                 << Buffer.getError().message() << "\n";
-    return true;
-  }
-
-  StringRef FileContentRef(Buffer.get()->getBuffer());
-
-  while (!FileContentRef.empty()) {
-    QueryRef Q = QueryParser::parse(FileContentRef, QS);
-    if (!Q->run(llvm::outs(), QS))
-      return true;
-    FileContentRef = Q->RemainingContent;
-  }
-  return false;
+  FileQuery Query(FileName, ExeName);
+  return !Query.run(llvm::errs(), QS);
 }
 
 int main(int argc, const char **argv) {
diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp
index b877ea06dc05..1cd7cdd10bc2 100644
--- a/clang-tools-extra/clang-tidy/ClangTidy.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp
@@ -373,11 +373,11 @@ static CheckersList getAnalyzerCheckersAndPackages(ClangTidyContext &Context,
 
   const auto &RegisteredCheckers =
       AnalyzerOptions::getRegisteredCheckers(IncludeExperimental);
-  bool AnalyzerChecksEnabled = false;
-  for (StringRef CheckName : RegisteredCheckers) {
-    std::string ClangTidyCheckName((AnalyzerCheckNamePrefix + CheckName).str());
-    AnalyzerChecksEnabled |= Context.isCheckEnabled(ClangTidyCheckName);
-  }
+  const bool AnalyzerChecksEnabled =
+      llvm::any_of(RegisteredCheckers, [&](StringRef CheckName) -> bool {
+        return Context.isCheckEnabled(
+            (AnalyzerCheckNamePrefix + CheckName).str());
+      });
 
   if (!AnalyzerChecksEnabled)
     return List;
diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp
index 710b361e16c0..6028bb225813 100644
--- a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp
@@ -171,7 +171,7 @@ std::optional<int64_t> ClangTidyCheck::OptionsView::getEnumInt(
     if (IgnoreCase) {
       if (Value.equals_insensitive(NameAndEnum.second))
         return NameAndEnum.first;
-    } else if (Value.equals(NameAndEnum.second)) {
+    } else if (Value == NameAndEnum.second) {
       return NameAndEnum.first;
     } else if (Value.equals_insensitive(NameAndEnum.second)) {
       Closest = NameAndEnum.second;
diff --git a/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp
index 4c2416a89aef..9e714b4be4df 100644
--- a/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.cpp
@@ -7,12 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "CastingThroughVoidCheck.h"
-#include "clang/AST/ASTContext.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/Type.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
-#include "llvm/ADT/StringSet.h"
 
 using namespace clang::ast_matchers;
 
@@ -27,7 +25,8 @@ void CastingThroughVoidCheck::registerMatchers(MatchFinder *Finder) {
           hasSourceExpression(
               explicitCastExpr(
                   hasSourceExpression(
-                      expr(hasType(qualType().bind("source_type")))),
+                      expr(hasType(qualType(unless(pointsTo(voidType())))
+                                       .bind("source_type")))),
                   hasDestinationType(
                       qualType(pointsTo(voidType())).bind("void_type")))
                   .bind("cast"))),
diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp
index e7be8134781e..36687a8e761e 100644
--- a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp
@@ -25,8 +25,8 @@ AST_MATCHER(QualType, isEnableIf) {
     const NamedDecl *TypeDecl =
         Spec->getTemplateName().getAsTemplateDecl()->getTemplatedDecl();
     return TypeDecl->isInStdNamespace() &&
-           (TypeDecl->getName().equals("enable_if") ||
-            TypeDecl->getName().equals("enable_if_t"));
+           (TypeDecl->getName() == "enable_if" ||
+            TypeDecl->getName() == "enable_if_t");
   };
   const Type *BaseType = Node.getTypePtr();
   // Case: pointer or reference to enable_if.
diff --git a/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.cpp
index f6714d056518..53956661d57d 100644
--- a/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.cpp
@@ -178,8 +178,11 @@ std::optional<RenamerClangTidyCheck::FailureInfo>
 ReservedIdentifierCheck::getDeclFailureInfo(const NamedDecl *Decl,
                                             const SourceManager &) const {
   assert(Decl && Decl->getIdentifier() && !Decl->getName().empty() &&
-         !Decl->isImplicit() &&
          "Decl must be an explicit identifier with a name.");
+  // Implicit identifiers cannot fail.
+  if (Decl->isImplicit())
+    return std::nullopt;
+
   return getFailureInfoImpl(
       Decl->getName(), isa<TranslationUnitDecl>(Decl->getDeclContext()),
       /*IsMacro = */ false, getLangOpts(), Invert, AllowedIdentifiers);
diff --git a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp
index 8ae37d4f774d..cacba38b4a5a 100644
--- a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp
@@ -17,8 +17,11 @@ namespace clang::tidy::bugprone {
 
 void ReturnConstRefFromParameterCheck::registerMatchers(MatchFinder *Finder) {
   Finder->addMatcher(
-      returnStmt(hasReturnValue(declRefExpr(to(parmVarDecl(hasType(
-                     hasCanonicalType(matchers::isReferenceToConst())))))))
+      returnStmt(
+          hasReturnValue(declRefExpr(to(parmVarDecl(hasType(hasCanonicalType(
+              qualType(matchers::isReferenceToConst()).bind("type"))))))),
+          hasAncestor(functionDecl(hasReturnTypeLoc(
+              loc(qualType(hasCanonicalType(equalsBoundNode("type"))))))))
           .bind("ret"),
       this);
 }
@@ -26,9 +29,13 @@ void ReturnConstRefFromParameterCheck::registerMatchers(MatchFinder *Finder) {
 void ReturnConstRefFromParameterCheck::check(
     const MatchFinder::MatchResult &Result) {
   const auto *R = Result.Nodes.getNodeAs<ReturnStmt>("ret");
-  diag(R->getRetValue()->getBeginLoc(),
-       "returning a constant reference parameter may cause a use-after-free "
-       "when the parameter is constructed from a temporary");
+  const SourceRange Range = R->getRetValue()->getSourceRange();
+  if (Range.isInvalid())
+    return;
+  diag(Range.getBegin(),
+       "returning a constant reference parameter may cause use-after-free "
+       "when the parameter is constructed from a temporary")
+      << Range;
 }
 
 } // namespace clang::tidy::bugprone
diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
index b06a903f92b3..00370ee9b300 100644
--- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
@@ -25,6 +25,7 @@
 #include "../misc/StaticAssertCheck.h"
 #include "../misc/ThrowByValueCatchByReferenceCheck.h"
 #include "../performance/MoveConstructorInitCheck.h"
+#include "../readability/EnumInitialValueCheck.h"
 #include "../readability/UppercaseLiteralSuffixCheck.h"
 #include "CommandProcessorCheck.h"
 #include "DefaultOperatorNewAlignmentCheck.h"
@@ -299,6 +300,9 @@ public:
         "cert-flp37-c");
     // FIO
     CheckFactories.registerCheck<misc::NonCopyableObjectsCheck>("cert-fio38-c");
+    // INT
+    CheckFactories.registerCheck<readability::EnumInitialValueCheck>(
+        "cert-int09-c");
     // MSC
     CheckFactories.registerCheck<bugprone::UnsafeFunctionsCheck>(
         "cert-msc24-c");
diff --git a/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.cpp b/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.cpp
index 51cc26400f7f..bf09a6662d95 100644
--- a/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.cpp
+++ b/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.cpp
@@ -9,6 +9,7 @@
 #include "SignedBitwiseCheck.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
 
 using namespace clang::ast_matchers;
 using namespace clang::ast_matchers::internal;
@@ -29,8 +30,8 @@ void SignedBitwiseCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
 void SignedBitwiseCheck::registerMatchers(MatchFinder *Finder) {
   const auto SignedIntegerOperand =
       (IgnorePositiveIntegerLiterals
-           ? expr(ignoringImpCasts(hasType(isSignedInteger())),
-                  unless(integerLiteral()))
+           ? expr(ignoringImpCasts(
+                 allOf(hasType(isSignedInteger()), unless(integerLiteral()))))
            : expr(ignoringImpCasts(hasType(isSignedInteger()))))
           .bind("signed-operand");
 
diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp
index 3229e302eb43..a1786ba5acfd 100644
--- a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp
@@ -421,7 +421,7 @@ getContainerFromBeginEndCall(const Expr *Init, bool IsBegin, bool *IsArrow,
     return {};
   if (IsReverse && !Call->Name.consume_back("r"))
     return {};
-  if (!Call->Name.empty() && !Call->Name.equals("c"))
+  if (!Call->Name.empty() && Call->Name != "c")
     return {};
   return std::make_pair(Call->Container, Call->CallKind);
 }
diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp
index 660996aba7b7..aa60c904a363 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp
@@ -138,7 +138,8 @@ void UseStdPrintCheck::check(const MatchFinder::MatchResult &Result) {
   if (!Converter.canApply()) {
     diag(PrintfCall->getBeginLoc(),
          "unable to use '%0' instead of %1 because %2")
-        << ReplacementFunction << OldFunction->getIdentifier()
+        << PrintfCall->getSourceRange() << ReplacementFunction
+        << OldFunction->getIdentifier()
         << Converter.conversionNotPossibleReason();
     return;
   }
diff --git a/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp b/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp
index e92350632b55..c13a8010c222 100644
--- a/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp
@@ -55,14 +55,6 @@ AST_MATCHER(QualType, isLocalConstQualified) {
   return Node.isLocalConstQualified();
 }
 
-AST_MATCHER(QualType, isTypeOfType) {
-  return isa<TypeOfType>(Node.getTypePtr());
-}
-
-AST_MATCHER(QualType, isTypeOfExprType) {
-  return isa<TypeOfExprType>(Node.getTypePtr());
-}
-
 struct CheckResult {
   // Source range of the relevant `const` token in the definition being checked.
   CharSourceRange ConstRange;
@@ -110,16 +102,11 @@ void ConstReturnTypeCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
 void ConstReturnTypeCheck::registerMatchers(MatchFinder *Finder) {
   // Find all function definitions for which the return types are `const`
   // qualified, ignoring decltype types.
-  auto NonLocalConstType =
-      qualType(unless(isLocalConstQualified()),
-               anyOf(decltypeType(), autoType(), isTypeOfType(),
-                     isTypeOfExprType(), substTemplateTypeParmType()));
   Finder->addMatcher(
-      functionDecl(
-          returns(allOf(isConstQualified(), unless(NonLocalConstType))),
-          anyOf(isDefinition(), cxxMethodDecl(isPure())),
-          // Overridden functions are not actionable.
-          unless(cxxMethodDecl(isOverride())))
+      functionDecl(returns(isLocalConstQualified()),
+                   anyOf(isDefinition(), cxxMethodDecl(isPure())),
+                   // Overridden functions are not actionable.
+                   unless(cxxMethodDecl(isOverride())))
           .bind("func"),
       this);
 }
diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
index dc30531ebda0..c3208392df15 100644
--- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
@@ -1358,7 +1358,7 @@ IdentifierNamingCheck::getFailureInfo(
   std::replace(KindName.begin(), KindName.end(), '_', ' ');
 
   std::string Fixup = fixupWithStyle(Type, Name, Style, HNOption, ND);
-  if (StringRef(Fixup).equals(Name)) {
+  if (StringRef(Fixup) == Name) {
     if (!IgnoreFailedSplit) {
       LLVM_DEBUG(Location.print(llvm::dbgs(), SM);
                  llvm::dbgs()
@@ -1374,6 +1374,10 @@ IdentifierNamingCheck::getFailureInfo(
 std::optional<RenamerClangTidyCheck::FailureInfo>
 IdentifierNamingCheck::getDeclFailureInfo(const NamedDecl *Decl,
                                           const SourceManager &SM) const {
+  // Implicit identifiers cannot be renamed.
+  if (Decl->isImplicit())
+    return std::nullopt;
+
   SourceLocation Loc = Decl->getLocation();
   const FileStyle &FileStyle = getStyleForFile(SM.getFilename(Loc));
   if (!FileStyle.isActive())
diff --git a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp
index edb67614bd55..fd4730d9c8b9 100644
--- a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SimplifyBooleanExprCheck.h"
+#include "clang/AST/Expr.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Lex/Lexer.h"
 #include "llvm/Support/SaveAndRestore.h"
@@ -280,9 +281,8 @@ public:
     if (!S) {
       return true;
     }
-    if (Check->IgnoreMacros && S->getBeginLoc().isMacroID()) {
+    if (Check->canBeBypassed(S))
       return false;
-    }
     if (!shouldIgnore(S))
       StmtStack.push_back(S);
     return true;
@@ -513,17 +513,23 @@ public:
     return true;
   }
 
-  static bool isUnaryLNot(const Expr *E) {
-    return isa<UnaryOperator>(E) &&
+  bool isExpectedUnaryLNot(const Expr *E) {
+    return !Check->canBeBypassed(E) && isa<UnaryOperator>(E) &&
            cast<UnaryOperator>(E)->getOpcode() == UO_LNot;
   }
 
+  bool isExpectedBinaryOp(const Expr *E) {
+    const auto *BinaryOp = dyn_cast<BinaryOperator>(E);
+    return !Check->canBeBypassed(E) && BinaryOp && BinaryOp->isLogicalOp() &&
+           BinaryOp->getType()->isBooleanType();
+  }
+
   template <typename Functor>
   static bool checkEitherSide(const BinaryOperator *BO, Functor Func) {
     return Func(BO->getLHS()) || Func(BO->getRHS());
   }
 
-  static bool nestedDemorgan(const Expr *E, unsigned NestingLevel) {
+  bool nestedDemorgan(const Expr *E, unsigned NestingLevel) {
     const auto *BO = dyn_cast<BinaryOperator>(E->IgnoreUnlessSpelledInSource());
     if (!BO)
       return false;
@@ -539,15 +545,13 @@ public:
       return true;
     case BO_LAnd:
     case BO_LOr:
-      if (checkEitherSide(BO, isUnaryLNot))
-        return true;
-      if (NestingLevel) {
-        if (checkEitherSide(BO, [NestingLevel](const Expr *E) {
-              return nestedDemorgan(E, NestingLevel - 1);
-            }))
-          return true;
-      }
-      return false;
+      return checkEitherSide(
+                 BO,
+                 [this](const Expr *E) { return isExpectedUnaryLNot(E); }) ||
+             (NestingLevel &&
+              checkEitherSide(BO, [this, NestingLevel](const Expr *E) {
+                return nestedDemorgan(E, NestingLevel - 1);
+              }));
     default:
       return false;
     }
@@ -556,19 +560,19 @@ public:
   bool TraverseUnaryOperator(UnaryOperator *Op) {
     if (!Check->SimplifyDeMorgan || Op->getOpcode() != UO_LNot)
       return Base::TraverseUnaryOperator(Op);
-    Expr *SubImp = Op->getSubExpr()->IgnoreImplicit();
-    auto *Parens = dyn_cast<ParenExpr>(SubImp);
-    auto *BinaryOp =
-        Parens
-            ? dyn_cast<BinaryOperator>(Parens->getSubExpr()->IgnoreImplicit())
-            : dyn_cast<BinaryOperator>(SubImp);
-    if (!BinaryOp || !BinaryOp->isLogicalOp() ||
-        !BinaryOp->getType()->isBooleanType())
+    const Expr *SubImp = Op->getSubExpr()->IgnoreImplicit();
+    const auto *Parens = dyn_cast<ParenExpr>(SubImp);
+    const Expr *SubExpr =
+        Parens ? Parens->getSubExpr()->IgnoreImplicit() : SubImp;
+    if (!isExpectedBinaryOp(SubExpr))
       return Base::TraverseUnaryOperator(Op);
+    const auto *BinaryOp = cast<BinaryOperator>(SubExpr);
     if (Check->SimplifyDeMorganRelaxed ||
-        checkEitherSide(BinaryOp, isUnaryLNot) ||
-        checkEitherSide(BinaryOp,
-                        [](const Expr *E) { return nestedDemorgan(E, 1); })) {
+        checkEitherSide(
+            BinaryOp,
+            [this](const Expr *E) { return isExpectedUnaryLNot(E); }) ||
+        checkEitherSide(
+            BinaryOp, [this](const Expr *E) { return nestedDemorgan(E, 1); })) {
       if (Check->reportDeMorgan(Context, Op, BinaryOp, !IsProcessing, parent(),
                                 Parens) &&
           !Check->areDiagsSelfContained()) {
@@ -694,6 +698,10 @@ void SimplifyBooleanExprCheck::check(const MatchFinder::MatchResult &Result) {
   Visitor(this, *Result.Context).traverse();
 }
 
+bool SimplifyBooleanExprCheck::canBeBypassed(const Stmt *S) const {
+  return IgnoreMacros && S->getBeginLoc().isMacroID();
+}
+
 void SimplifyBooleanExprCheck::issueDiag(const ASTContext &Context,
                                          SourceLocation Loc,
                                          StringRef Description,
diff --git a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h
index ccc6f3d879fc..63c3caa01e01 100644
--- a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h
@@ -64,6 +64,8 @@ private:
                  StringRef Description, SourceRange ReplacementRange,
                  StringRef Replacement);
 
+  bool canBeBypassed(const Stmt *S) const;
+
   const bool IgnoreMacros;
   const bool ChainedConditionalReturn;
   const bool ChainedConditionalAssignment;
diff --git a/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.cpp b/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.cpp
index 65356cc3929c..08adc7134cfe 100644
--- a/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.cpp
@@ -59,10 +59,6 @@ void StaticAccessedThroughInstanceCheck::check(
 
   const Expr *BaseExpr = MemberExpression->getBase();
 
-  // Do not warn for overloaded -> operators.
-  if (isa<CXXOperatorCallExpr>(BaseExpr))
-    return;
-
   const QualType BaseType =
       BaseExpr->getType()->isPointerType()
           ? BaseExpr->getType()->getPointeeType().getUnqualifiedType()
@@ -89,17 +85,30 @@ void StaticAccessedThroughInstanceCheck::check(
     return;
 
   SourceLocation MemberExprStartLoc = MemberExpression->getBeginLoc();
-  auto Diag =
-      diag(MemberExprStartLoc, "static member accessed through instance");
-
-  if (BaseExpr->HasSideEffects(*AstContext) ||
-      getNameSpecifierNestingLevel(BaseType) > NameSpecifierNestingThreshold)
-    return;
+  auto CreateFix = [&] {
+    return FixItHint::CreateReplacement(
+        CharSourceRange::getCharRange(MemberExprStartLoc,
+                                      MemberExpression->getMemberLoc()),
+        BaseTypeName + "::");
+  };
+
+  {
+    auto Diag =
+        diag(MemberExprStartLoc, "static member accessed through instance");
+
+    if (getNameSpecifierNestingLevel(BaseType) > NameSpecifierNestingThreshold)
+      return;
+
+    if (!BaseExpr->HasSideEffects(*AstContext,
+                                  /* IncludePossibleEffects =*/true)) {
+      Diag << CreateFix();
+      return;
+    }
+  }
 
-  Diag << FixItHint::CreateReplacement(
-      CharSourceRange::getCharRange(MemberExprStartLoc,
-                                    MemberExpression->getMemberLoc()),
-      BaseTypeName + "::");
+  diag(MemberExprStartLoc, "member base expression may carry some side effects",
+       DiagnosticIDs::Level::Note)
+      << BaseExpr->getSourceRange() << CreateFix();
 }
 
 } // namespace clang::tidy::readability
diff --git a/clang-tools-extra/clang-tidy/readability/StringCompareCheck.cpp b/clang-tools-extra/clang-tidy/readability/StringCompareCheck.cpp
index 3b5d89c8c647..7c0bbef3ca08 100644
--- a/clang-tools-extra/clang-tidy/readability/StringCompareCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/StringCompareCheck.cpp
@@ -7,12 +7,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "StringCompareCheck.h"
-#include "../utils/FixItHintUtils.h"
+#include "../utils/OptionsUtils.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
 #include "clang/Tooling/FixIt.h"
+#include "llvm/ADT/StringRef.h"
 
 using namespace clang::ast_matchers;
+namespace optutils = clang::tidy::utils::options;
 
 namespace clang::tidy::readability {
 
@@ -20,11 +23,27 @@ static const StringRef CompareMessage = "do not use 'compare' to test equality "
                                         "of strings; use the string equality "
                                         "operator instead";
 
+static const StringRef DefaultStringLikeClasses = "::std::basic_string;"
+                                                  "::std::basic_string_view";
+
+StringCompareCheck::StringCompareCheck(StringRef Name,
+                                       ClangTidyContext *Context)
+    : ClangTidyCheck(Name, Context),
+      StringLikeClasses(optutils::parseStringList(
+          Options.get("StringLikeClasses", DefaultStringLikeClasses))) {}
+
+void StringCompareCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
+  Options.store(Opts, "StringLikeClasses",
+                optutils::serializeStringList(StringLikeClasses));
+}
+
 void StringCompareCheck::registerMatchers(MatchFinder *Finder) {
+  if (StringLikeClasses.empty()) {
+    return;
+  }
   const auto StrCompare = cxxMemberCallExpr(
-      callee(cxxMethodDecl(hasName("compare"),
-                           ofClass(classTemplateSpecializationDecl(
-                               hasName("::std::basic_string"))))),
+      callee(cxxMethodDecl(hasName("compare"), ofClass(cxxRecordDecl(hasAnyName(
+                                                   StringLikeClasses))))),
       hasArgument(0, expr().bind("str2")), argumentCountIs(1),
       callee(memberExpr().bind("str1")));
 
diff --git a/clang-tools-extra/clang-tidy/readability/StringCompareCheck.h b/clang-tools-extra/clang-tidy/readability/StringCompareCheck.h
index 812736d806b7..150090901a6e 100644
--- a/clang-tools-extra/clang-tidy/readability/StringCompareCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/StringCompareCheck.h
@@ -10,6 +10,7 @@
 #define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STRINGCOMPARECHECK_H
 
 #include "../ClangTidyCheck.h"
+#include <vector>
 
 namespace clang::tidy::readability {
 
@@ -20,13 +21,18 @@ namespace clang::tidy::readability {
 /// http://clang.llvm.org/extra/clang-tidy/checks/readability/string-compare.html
 class StringCompareCheck : public ClangTidyCheck {
 public:
-  StringCompareCheck(StringRef Name, ClangTidyContext *Context)
-      : ClangTidyCheck(Name, Context) {}
+  StringCompareCheck(StringRef Name, ClangTidyContext *Context);
+
   bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
     return LangOpts.CPlusPlus;
   }
+
   void registerMatchers(ast_matchers::MatchFinder *Finder) override;
   void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+
+private:
+  const std::vector<StringRef> StringLikeClasses;
 };
 
 } // namespace clang::tidy::readability
diff --git a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp
index 3eb80019ae75..18420d0c8488 100644
--- a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp
@@ -138,11 +138,11 @@ static bool applyAbbreviationHeuristic(
     const llvm::StringMap<std::string> &AbbreviationDictionary, StringRef Arg,
     StringRef Param) {
   if (AbbreviationDictionary.contains(Arg) &&
-      Param.equals(AbbreviationDictionary.lookup(Arg)))
+      Param == AbbreviationDictionary.lookup(Arg))
     return true;
 
   if (AbbreviationDictionary.contains(Param) &&
-      Arg.equals(AbbreviationDictionary.lookup(Param)))
+      Arg == AbbreviationDictionary.lookup(Param))
     return true;
 
   return false;
diff --git a/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp b/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp
index a44720c47eca..0fa54b3847eb 100644
--- a/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp
+++ b/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp
@@ -88,8 +88,7 @@ determineIncludeKind(StringRef CanonicalFile, StringRef IncludeFile,
     if (FileCopy.consume_front(Parts.first) &&
         FileCopy.consume_back(Parts.second)) {
       // Determine the kind of this inclusion.
-      if (FileCopy.equals("/internal/") ||
-          FileCopy.equals("/proto/")) {
+      if (FileCopy == "/internal/" || FileCopy == "/proto/") {
         return IncludeSorter::IK_MainTUInclude;
       }
     }
diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
index 962a243ce94d..e811f5519de2 100644
--- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
+++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
@@ -61,6 +61,7 @@ struct DenseMapInfo<clang::tidy::RenamerClangTidyCheck::NamingCheckId> {
 namespace clang::tidy {
 
 namespace {
+
 class NameLookup {
   llvm::PointerIntPair<const NamedDecl *, 1, bool> Data;
 
@@ -78,19 +79,58 @@ public:
   operator bool() const { return !hasMultipleResolutions(); }
   const NamedDecl *operator*() const { return getDecl(); }
 };
+
 } // namespace
 
 static const NamedDecl *findDecl(const RecordDecl &RecDecl,
                                  StringRef DeclName) {
   for (const Decl *D : RecDecl.decls()) {
     if (const auto *ND = dyn_cast<NamedDecl>(D)) {
-      if (ND->getDeclName().isIdentifier() && ND->getName().equals(DeclName))
+      if (ND->getDeclName().isIdentifier() && ND->getName() == DeclName)
         return ND;
     }
   }
   return nullptr;
 }
 
+/// Returns the function that \p Method is overridding. If There are none or
+/// multiple overrides it returns nullptr. If the overridden function itself is
+/// overridding then it will recurse up to find the first decl of the function.
+static const CXXMethodDecl *getOverrideMethod(const CXXMethodDecl *Method) {
+  if (Method->size_overridden_methods() != 1)
+    return nullptr;
+
+  while (true) {
+    Method = *Method->begin_overridden_methods();
+    assert(Method && "Overridden method shouldn't be null");
+    unsigned NumOverrides = Method->size_overridden_methods();
+    if (NumOverrides == 0)
+      return Method;
+    if (NumOverrides > 1)
+      return nullptr;
+  }
+}
+
+static bool hasNoName(const NamedDecl *Decl) {
+  return !Decl->getIdentifier() || Decl->getName().empty();
+}
+
+static const NamedDecl *getFailureForNamedDecl(const NamedDecl *ND) {
+  const auto *Canonical = cast<NamedDecl>(ND->getCanonicalDecl());
+  if (Canonical != ND)
+    return Canonical;
+
+  if (const auto *Method = dyn_cast<CXXMethodDecl>(ND)) {
+    if (const CXXMethodDecl *Overridden = getOverrideMethod(Method))
+      Canonical = cast<NamedDecl>(Overridden->getCanonicalDecl());
+
+    if (Canonical != ND)
+      return Canonical;
+  }
+
+  return ND;
+}
+
 /// Returns a decl matching the \p DeclName in \p Parent or one of its base
 /// classes. If \p AggressiveTemplateLookup is `true` then it will check
 /// template dependent base classes as well.
@@ -132,24 +172,6 @@ static NameLookup findDeclInBases(const CXXRecordDecl &Parent,
   return NameLookup(Found); // If nullptr, decl wasn't found.
 }
 
-/// Returns the function that \p Method is overridding. If There are none or
-/// multiple overrides it returns nullptr. If the overridden function itself is
-/// overridding then it will recurse up to find the first decl of the function.
-static const CXXMethodDecl *getOverrideMethod(const CXXMethodDecl *Method) {
-  if (Method->size_overridden_methods() != 1)
-    return nullptr;
-
-  while (true) {
-    Method = *Method->begin_overridden_methods();
-    assert(Method && "Overridden method shouldn't be null");
-    unsigned NumOverrides = Method->size_overridden_methods();
-    if (NumOverrides == 0)
-      return Method;
-    if (NumOverrides > 1)
-      return nullptr;
-  }
-}
-
 namespace {
 
 /// Callback supplies macros to RenamerClangTidyCheck::checkMacro
@@ -192,10 +214,6 @@ public:
       : Check(Check), SM(SM),
         AggressiveDependentMemberLookup(AggressiveDependentMemberLookup) {}
 
-  static bool hasNoName(const NamedDecl *Decl) {
-    return !Decl->getIdentifier() || Decl->getName().empty();
-  }
-
   bool shouldVisitTemplateInstantiations() const { return true; }
 
   bool shouldVisitImplicitCode() const { return false; }
@@ -246,29 +264,10 @@ public:
   }
 
   bool VisitNamedDecl(NamedDecl *Decl) {
-    if (hasNoName(Decl))
-      return true;
-
-    const auto *Canonical = cast<NamedDecl>(Decl->getCanonicalDecl());
-    if (Canonical != Decl) {
-      Check->addUsage(Canonical, Decl->getLocation(), SM);
-      return true;
-    }
-
-    // Fix overridden methods
-    if (const auto *Method = dyn_cast<CXXMethodDecl>(Decl)) {
-      if (const CXXMethodDecl *Overridden = getOverrideMethod(Method)) {
-        Check->addUsage(Overridden, Method->getLocation(), SM);
-        return true; // Don't try to add the actual decl as a Failure.
-      }
-    }
-
-    // Ignore ClassTemplateSpecializationDecl which are creating duplicate
-    // replacements with CXXRecordDecl.
-    if (isa<ClassTemplateSpecializationDecl>(Decl))
-      return true;
-
-    Check->checkNamedDecl(Decl, SM);
+    SourceRange UsageRange =
+        DeclarationNameInfo(Decl->getDeclName(), Decl->getLocation())
+            .getSourceRange();
+    Check->addUsage(Decl, UsageRange, SM);
     return true;
   }
 
@@ -413,82 +412,97 @@ void RenamerClangTidyCheck::registerPPCallbacks(
       std::make_unique<RenamerClangTidyCheckPPCallbacks>(SM, this));
 }
 
-void RenamerClangTidyCheck::addUsage(
-    const RenamerClangTidyCheck::NamingCheckId &Decl, SourceRange Range,
-    const SourceManager &SourceMgr) {
+std::pair<RenamerClangTidyCheck::NamingCheckFailureMap::iterator, bool>
+RenamerClangTidyCheck::addUsage(
+    const RenamerClangTidyCheck::NamingCheckId &FailureId,
+    SourceRange UsageRange, const SourceManager &SourceMgr) {
   // Do nothing if the provided range is invalid.
-  if (Range.isInvalid())
-    return;
+  if (UsageRange.isInvalid())
+    return {NamingCheckFailures.end(), false};
 
-  // If we have a source manager, use it to convert to the spelling location for
-  // performing the fix. This is necessary because macros can map the same
-  // spelling location to different source locations, and we only want to fix
-  // the token once, before it is expanded by the macro.
-  SourceLocation FixLocation = Range.getBegin();
+  // Get the spelling location for performing the fix. This is necessary because
+  // macros can map the same spelling location to different source locations,
+  // and we only want to fix the token once, before it is expanded by the macro.
+  SourceLocation FixLocation = UsageRange.getBegin();
   FixLocation = SourceMgr.getSpellingLoc(FixLocation);
   if (FixLocation.isInvalid())
-    return;
+    return {NamingCheckFailures.end(), false};
+
+  auto EmplaceResult = NamingCheckFailures.try_emplace(FailureId);
+  NamingCheckFailure &Failure = EmplaceResult.first->second;
 
   // Try to insert the identifier location in the Usages map, and bail out if it
   // is already in there
-  RenamerClangTidyCheck::NamingCheckFailure &Failure =
-      NamingCheckFailures[Decl];
   if (!Failure.RawUsageLocs.insert(FixLocation).second)
-    return;
+    return EmplaceResult;
 
-  if (!Failure.shouldFix())
-    return;
+  if (Failure.FixStatus != RenamerClangTidyCheck::ShouldFixStatus::ShouldFix)
+    return EmplaceResult;
 
   if (SourceMgr.isWrittenInScratchSpace(FixLocation))
     Failure.FixStatus = RenamerClangTidyCheck::ShouldFixStatus::InsideMacro;
 
-  if (!utils::rangeCanBeFixed(Range, &SourceMgr))
+  if (!utils::rangeCanBeFixed(UsageRange, &SourceMgr))
     Failure.FixStatus = RenamerClangTidyCheck::ShouldFixStatus::InsideMacro;
+
+  return EmplaceResult;
 }
 
-void RenamerClangTidyCheck::addUsage(const NamedDecl *Decl, SourceRange Range,
+void RenamerClangTidyCheck::addUsage(const NamedDecl *Decl,
+                                     SourceRange UsageRange,
                                      const SourceManager &SourceMgr) {
-  // Don't keep track for non-identifier names.
-  auto *II = Decl->getIdentifier();
-  if (!II)
+  if (hasNoName(Decl))
+    return;
+
+  // Ignore ClassTemplateSpecializationDecl which are creating duplicate
+  // replacements with CXXRecordDecl.
+  if (isa<ClassTemplateSpecializationDecl>(Decl))
     return;
-  if (const auto *Method = dyn_cast<CXXMethodDecl>(Decl)) {
-    if (const CXXMethodDecl *Overridden = getOverrideMethod(Method))
-      Decl = Overridden;
-  }
-  Decl = cast<NamedDecl>(Decl->getCanonicalDecl());
-  return addUsage(
-      RenamerClangTidyCheck::NamingCheckId(Decl->getLocation(), II->getName()),
-      Range, SourceMgr);
-}
 
-void RenamerClangTidyCheck::checkNamedDecl(const NamedDecl *Decl,
-                                           const SourceManager &SourceMgr) {
-  std::optional<FailureInfo> MaybeFailure = getDeclFailureInfo(Decl, SourceMgr);
+  // We don't want to create a failure for every NamedDecl we find. Ideally
+  // there is just one NamedDecl in every group of "related" NamedDecls that
+  // becomes the failure. This NamedDecl and all of its related NamedDecls
+  // become usages. E.g. Since NamedDecls are Redeclarable, only the canonical
+  // NamedDecl becomes the failure and all redeclarations become usages.
+  const NamedDecl *FailureDecl = getFailureForNamedDecl(Decl);
+
+  std::optional<FailureInfo> MaybeFailure =
+      getDeclFailureInfo(FailureDecl, SourceMgr);
   if (!MaybeFailure)
     return;
 
-  FailureInfo &Info = *MaybeFailure;
-  NamingCheckFailure &Failure =
-      NamingCheckFailures[NamingCheckId(Decl->getLocation(), Decl->getName())];
-  SourceRange Range =
-      DeclarationNameInfo(Decl->getDeclName(), Decl->getLocation())
-          .getSourceRange();
-
-  const IdentifierTable &Idents = Decl->getASTContext().Idents;
-  auto CheckNewIdentifier = Idents.find(Info.Fixup);
+  NamingCheckId FailureId(FailureDecl->getLocation(), FailureDecl->getName());
+
+  auto [FailureIter, NewFailure] = addUsage(FailureId, UsageRange, SourceMgr);
+
+  if (FailureIter == NamingCheckFailures.end()) {
+    // Nothing to do if the usage wasn't accepted.
+    return;
+  }
+  if (!NewFailure) {
+    // FailureInfo has already been provided.
+    return;
+  }
+
+  // Update the stored failure with info regarding the FailureDecl.
+  NamingCheckFailure &Failure = FailureIter->second;
+  Failure.Info = std::move(*MaybeFailure);
+
+  // Don't overwritte the failure status if it was already set.
+  if (!Failure.shouldFix()) {
+    return;
+  }
+  const IdentifierTable &Idents = FailureDecl->getASTContext().Idents;
+  auto CheckNewIdentifier = Idents.find(Failure.Info.Fixup);
   if (CheckNewIdentifier != Idents.end()) {
     const IdentifierInfo *Ident = CheckNewIdentifier->second;
     if (Ident->isKeyword(getLangOpts()))
       Failure.FixStatus = ShouldFixStatus::ConflictsWithKeyword;
     else if (Ident->hasMacroDefinition())
       Failure.FixStatus = ShouldFixStatus::ConflictsWithMacroDefinition;
-  } else if (!isValidAsciiIdentifier(Info.Fixup)) {
+  } else if (!isValidAsciiIdentifier(Failure.Info.Fixup)) {
     Failure.FixStatus = ShouldFixStatus::FixInvalidIdentifier;
   }
-
-  Failure.Info = std::move(Info);
-  addUsage(Decl, Range, SourceMgr);
 }
 
 void RenamerClangTidyCheck::check(const MatchFinder::MatchResult &Result) {
diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h
index be5b6f0c7f76..3d5721b789ac 100644
--- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h
+++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h
@@ -115,15 +115,9 @@ public:
   void expandMacro(const Token &MacroNameTok, const MacroInfo *MI,
                    const SourceManager &SourceMgr);
 
-  void addUsage(const RenamerClangTidyCheck::NamingCheckId &Decl,
-                SourceRange Range, const SourceManager &SourceMgr);
-
-  /// Convenience method when the usage to be added is a NamedDecl.
   void addUsage(const NamedDecl *Decl, SourceRange Range,
                 const SourceManager &SourceMgr);
 
-  void checkNamedDecl(const NamedDecl *Decl, const SourceManager &SourceMgr);
-
 protected:
   /// Overridden by derived classes, returns information about if and how a Decl
   /// failed the check. A 'std::nullopt' result means the Decl did not fail the
@@ -158,6 +152,14 @@ protected:
                                const NamingCheckFailure &Failure) const = 0;
 
 private:
+  // Manage additions to the Failure/usage map
+  //
+  // return the result of NamingCheckFailures::try_emplace() if the usage was
+  // accepted.
+  std::pair<NamingCheckFailureMap::iterator, bool>
+  addUsage(const RenamerClangTidyCheck::NamingCheckId &FailureId,
+           SourceRange UsageRange, const SourceManager &SourceMgr);
+
   NamingCheckFailureMap NamingCheckFailures;
   const bool AggressiveDependentMemberLookup;
 };
diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp
index d5818e0ca309..ecd490145dd3 100644
--- a/clang-tools-extra/clangd/Preamble.cpp
+++ b/clang-tools-extra/clangd/Preamble.cpp
@@ -918,7 +918,9 @@ void PreamblePatch::apply(CompilerInvocation &CI) const {
   // no guarantees around using arbitrary options when reusing PCHs, and
   // different target opts can result in crashes, see
   // ParsedASTTest.PreambleWithDifferentTarget.
-  CI.TargetOpts = Baseline->TargetOpts;
+  // Make sure this is a deep copy, as the same Baseline might be used
+  // concurrently.
+  *CI.TargetOpts = *Baseline->TargetOpts;
 
   // No need to map an empty file.
   if (PatchContents.empty())
diff --git a/clang-tools-extra/clangd/index/CanonicalIncludes.cpp b/clang-tools-extra/clangd/index/CanonicalIncludes.cpp
index 42eeba36a80e..785ec4086ea7 100644
--- a/clang-tools-extra/clangd/index/CanonicalIncludes.cpp
+++ b/clang-tools-extra/clangd/index/CanonicalIncludes.cpp
@@ -18,9 +18,11 @@ namespace {
 const std::pair<llvm::StringRef, llvm::StringRef> IncludeMappings[] = {
     {"include/__stdarg___gnuc_va_list.h", "<cstdarg>"},
     {"include/__stdarg___va_copy.h", "<cstdarg>"},
+    {"include/__stdarg_header_macro.h", "<cstdarg>"},
     {"include/__stdarg_va_arg.h", "<cstdarg>"},
     {"include/__stdarg_va_copy.h", "<cstdarg>"},
     {"include/__stdarg_va_list.h", "<cstdarg>"},
+    {"include/__stddef_header_macro.h", "<cstddef>"},
     {"include/__stddef_max_align_t.h", "<cstddef>"},
     {"include/__stddef_null.h", "<cstddef>"},
     {"include/__stddef_nullptr_t.h", "<cstddef>"},
diff --git a/clang-tools-extra/clangd/refactor/tweaks/ScopifyEnum.cpp b/clang-tools-extra/clangd/refactor/tweaks/ScopifyEnum.cpp
index e36b3249bc7b..44080802a289 100644
--- a/clang-tools-extra/clangd/refactor/tweaks/ScopifyEnum.cpp
+++ b/clang-tools-extra/clangd/refactor/tweaks/ScopifyEnum.cpp
@@ -40,15 +40,12 @@ namespace {
 ///   void f() { E e1 = EV1; }
 ///
 /// After:
-///   enum class E { EV1, EV2 };
-///   void f() { E e1 = E::EV1; }
+///   enum class E { V1, V2 };
+///   void f() { E e1 = E::V1; }
 ///
 /// Note that the respective project code might not compile anymore
 /// if it made use of the now-gone implicit conversion to int.
 /// This is out of scope for this tweak.
-///
-/// TODO: In the above example, we could detect that the values
-///       start with the enum name, and remove that prefix.
 
 class ScopifyEnum : public Tweak {
   const char *id() const final;
@@ -63,14 +60,13 @@ class ScopifyEnum : public Tweak {
       std::function<tooling::Replacement(StringRef, StringRef, unsigned)>;
   llvm::Error addClassKeywordToDeclarations();
   llvm::Error scopifyEnumValues();
-  llvm::Error scopifyEnumValue(const EnumConstantDecl &CD, StringRef Prefix);
+  llvm::Error scopifyEnumValue(const EnumConstantDecl &CD, StringRef EnumName,
+                               bool StripPrefix);
   llvm::Expected<StringRef> getContentForFile(StringRef FilePath);
-  unsigned getOffsetFromPosition(const Position &Pos, StringRef Content) const;
   llvm::Error addReplacementForReference(const ReferencesResult::Reference &Ref,
                                          const MakeReplacement &GetReplacement);
   llvm::Error addReplacement(StringRef FilePath, StringRef Content,
                              const tooling::Replacement &Replacement);
-  Position getPosition(const Decl &D) const;
 
   const EnumDecl *D = nullptr;
   const Selection *S = nullptr;
@@ -109,7 +105,8 @@ Expected<Tweak::Effect> ScopifyEnum::apply(const Selection &Inputs) {
 
 llvm::Error ScopifyEnum::addClassKeywordToDeclarations() {
   for (const auto &Ref :
-       findReferences(*S->AST, getPosition(*D), 0, S->Index, false)
+       findReferences(*S->AST, sourceLocToPosition(*SM, D->getBeginLoc()), 0,
+                      S->Index, false)
            .References) {
     if (!(Ref.Attributes & ReferencesResult::Declaration))
       continue;
@@ -125,25 +122,46 @@ llvm::Error ScopifyEnum::addClassKeywordToDeclarations() {
 }
 
 llvm::Error ScopifyEnum::scopifyEnumValues() {
-  std::string PrefixToInsert(D->getName());
-  PrefixToInsert += "::";
-  for (auto E : D->enumerators()) {
-    if (auto Err = scopifyEnumValue(*E, PrefixToInsert))
+  StringRef EnumName(D->getName());
+  bool StripPrefix = true;
+  for (const EnumConstantDecl *E : D->enumerators()) {
+    if (!E->getName().starts_with(EnumName)) {
+      StripPrefix = false;
+      break;
+    }
+  }
+  for (const EnumConstantDecl *E : D->enumerators()) {
+    if (auto Err = scopifyEnumValue(*E, EnumName, StripPrefix))
       return Err;
   }
   return llvm::Error::success();
 }
 
 llvm::Error ScopifyEnum::scopifyEnumValue(const EnumConstantDecl &CD,
-                                          StringRef Prefix) {
+                                          StringRef EnumName,
+                                          bool StripPrefix) {
   for (const auto &Ref :
-       findReferences(*S->AST, getPosition(CD), 0, S->Index, false)
+       findReferences(*S->AST, sourceLocToPosition(*SM, CD.getBeginLoc()), 0,
+                      S->Index, false)
            .References) {
-    if (Ref.Attributes & ReferencesResult::Declaration)
+    if (Ref.Attributes & ReferencesResult::Declaration) {
+      if (StripPrefix) {
+        const auto MakeReplacement = [&EnumName](StringRef FilePath,
+                                                 StringRef Content,
+                                                 unsigned Offset) {
+          unsigned Length = EnumName.size();
+          if (Content[Offset + Length] == '_')
+            ++Length;
+          return tooling::Replacement(FilePath, Offset, Length, {});
+        };
+        if (auto Err = addReplacementForReference(Ref, MakeReplacement))
+          return Err;
+      }
       continue;
+    }
 
-    const auto MakeReplacement = [&Prefix](StringRef FilePath,
-                                           StringRef Content, unsigned Offset) {
+    const auto MakeReplacement = [&](StringRef FilePath, StringRef Content,
+                                     unsigned Offset) {
       const auto IsAlreadyScoped = [Content, Offset] {
         if (Offset < 2)
           return false;
@@ -164,9 +182,18 @@ llvm::Error ScopifyEnum::scopifyEnumValue(const EnumConstantDecl &CD,
         }
         return false;
       };
-      return IsAlreadyScoped()
-                 ? tooling::Replacement()
-                 : tooling::Replacement(FilePath, Offset, 0, Prefix);
+      if (StripPrefix) {
+        const int ExtraLength =
+            Content[Offset + EnumName.size()] == '_' ? 1 : 0;
+        if (IsAlreadyScoped())
+          return tooling::Replacement(FilePath, Offset,
+                                      EnumName.size() + ExtraLength, {});
+        return tooling::Replacement(FilePath, Offset + EnumName.size(),
+                                    ExtraLength, "::");
+      }
+      return IsAlreadyScoped() ? tooling::Replacement()
+                               : tooling::Replacement(FilePath, Offset, 0,
+                                                      EnumName.str() + "::");
     };
     if (auto Err = addReplacementForReference(Ref, MakeReplacement))
       return Err;
@@ -187,27 +214,19 @@ llvm::Expected<StringRef> ScopifyEnum::getContentForFile(StringRef FilePath) {
   return Content;
 }
 
-unsigned int ScopifyEnum::getOffsetFromPosition(const Position &Pos,
-                                                StringRef Content) const {
-  unsigned int Offset = 0;
-
-  for (std::size_t LinesRemaining = Pos.line;
-       Offset < Content.size() && LinesRemaining;) {
-    if (Content[Offset++] == '\n')
-      --LinesRemaining;
-  }
-  return Offset + Pos.character;
-}
-
 llvm::Error
 ScopifyEnum::addReplacementForReference(const ReferencesResult::Reference &Ref,
                                         const MakeReplacement &GetReplacement) {
   StringRef FilePath = Ref.Loc.uri.file();
-  auto Content = getContentForFile(FilePath);
+  llvm::Expected<StringRef> Content = getContentForFile(FilePath);
   if (!Content)
     return Content.takeError();
-  unsigned Offset = getOffsetFromPosition(Ref.Loc.range.start, *Content);
-  tooling::Replacement Replacement = GetReplacement(FilePath, *Content, Offset);
+  llvm::Expected<size_t> Offset =
+      positionToOffset(*Content, Ref.Loc.range.start);
+  if (!Offset)
+    return Offset.takeError();
+  tooling::Replacement Replacement =
+      GetReplacement(FilePath, *Content, *Offset);
   if (Replacement.isApplicable())
     return addReplacement(FilePath, *Content, Replacement);
   return llvm::Error::success();
@@ -223,13 +242,5 @@ ScopifyEnum::addReplacement(StringRef FilePath, StringRef Content,
   return llvm::Error::success();
 }
 
-Position ScopifyEnum::getPosition(const Decl &D) const {
-  const SourceLocation Loc = D.getLocation();
-  Position Pos;
-  Pos.line = SM->getSpellingLineNumber(Loc) - 1;
-  Pos.character = SM->getSpellingColumnNumber(Loc) - 1;
-  return Pos;
-}
-
 } // namespace
 } // namespace clang::clangd
diff --git a/clang-tools-extra/clangd/test/delimited-input-comment-at-the-end.test b/clang-tools-extra/clangd/test/delimited-input-comment-at-the-end.test
index bbbd72f8c59f..85a1f2199fad 100644
--- a/clang-tools-extra/clangd/test/delimited-input-comment-at-the-end.test
+++ b/clang-tools-extra/clangd/test/delimited-input-comment-at-the-end.test
@@ -1,11 +1,11 @@
-# RUN: clangd -input-style=delimited -sync -input-mirror-file %t < %s
-# RUN: grep '{"jsonrpc":"2.0","id":3,"method":"exit"}' %t
-#
-# RUN: clangd -lit-test -input-mirror-file %t < %s
-# RUN: grep '{"jsonrpc":"2.0","id":3,"method":"exit"}' %t
-#
-{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}}
----
-{"jsonrpc":"2.0","id":3,"method":"shutdown"}
----
-{"jsonrpc":"2.0","method":"exit"}
+# RUN: clangd -input-style=delimited -sync -input-mirror-file %t < %s
+# RUN: grep '{"jsonrpc":"2.0","id":3,"method":"exit"}' %t
+#
+# RUN: clangd -lit-test -input-mirror-file %t < %s
+# RUN: grep '{"jsonrpc":"2.0","id":3,"method":"exit"}' %t
+#
+{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}}
+---
+{"jsonrpc":"2.0","id":3,"method":"shutdown"}
+---
+{"jsonrpc":"2.0","method":"exit"}
diff --git a/clang-tools-extra/clangd/test/hover.test b/clang-tools-extra/clangd/test/hover.test
index ec8d0488fa5e..dc76ae85fa41 100644
--- a/clang-tools-extra/clangd/test/hover.test
+++ b/clang-tools-extra/clangd/test/hover.test
@@ -1,57 +1,57 @@
-# RUN: clangd -lit-test < %s | FileCheck %s
-{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}}
----
-{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"void foo(); int main() { foo(); }\n"}}}
----
-{"jsonrpc":"2.0","id":1,"method":"textDocument/hover","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":0,"character":27}}}
-#      CHECK:  "id": 1,
-# CHECK-NEXT:  "jsonrpc": "2.0",
-# CHECK-NEXT:  "result": {
-# CHECK-NEXT:    "contents": {
-# CHECK-NEXT:      "kind": "plaintext",
-# CHECK-NEXT:      "value": "function foo\n\n→ void\n\nvoid foo()"
-# CHECK-NEXT:    },
-# CHECK-NEXT:    "range": {
-# CHECK-NEXT:      "end": {
-# CHECK-NEXT:        "character": 28,
-# CHECK-NEXT:        "line": 0
-# CHECK-NEXT:      },
-# CHECK-NEXT:      "start": {
-# CHECK-NEXT:        "character": 25,
-# CHECK-NEXT:        "line": 0
-# CHECK-NEXT:      }
-# CHECK-NEXT:    }
-# CHECK-NEXT:  }
-# CHECK-NEXT:}
----
-{"jsonrpc":"2.0","id":1,"method":"textDocument/hover","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":0,"character":10}}}
-#      CHECK:  "id": 1,
-# CHECK-NEXT:  "jsonrpc": "2.0",
-# CHECK-NEXT:  "result": null
----
-{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main2.cpp","languageId":"cpp","version":1,"text":"enum foo{}; int main() { foo f; }\n"}}}
----
-{"jsonrpc":"2.0","id":1,"method":"textDocument/hover","params":{"textDocument":{"uri":"test:///main2.cpp"},"position":{"line":0,"character":27}}}
-#      CHECK:  "id": 1,
-# CHECK-NEXT:  "jsonrpc": "2.0",
-# CHECK-NEXT:  "result": {
-# CHECK-NEXT:    "contents": {
-# CHECK-NEXT:      "kind": "plaintext",
-# CHECK-NEXT:      "value": "enum foo\n\nenum foo {}"
-# CHECK-NEXT:    },
-# CHECK-NEXT:    "range": {
-# CHECK-NEXT:      "end": {
-# CHECK-NEXT:        "character": 28,
-# CHECK-NEXT:        "line": 0
-# CHECK-NEXT:      },
-# CHECK-NEXT:      "start": {
-# CHECK-NEXT:        "character": 25,
-# CHECK-NEXT:        "line": 0
-# CHECK-NEXT:      }
-# CHECK-NEXT:    }
-# CHECK-NEXT:  }
-# CHECK-NEXT:}
----
-{"jsonrpc":"2.0","id":3,"method":"shutdown"}
----
-{"jsonrpc":"2.0","method":"exit"}
+# RUN: clangd -lit-test < %s | FileCheck %s
+{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}}
+---
+{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"void foo(); int main() { foo(); }\n"}}}
+---
+{"jsonrpc":"2.0","id":1,"method":"textDocument/hover","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":0,"character":27}}}
+#      CHECK:  "id": 1,
+# CHECK-NEXT:  "jsonrpc": "2.0",
+# CHECK-NEXT:  "result": {
+# CHECK-NEXT:    "contents": {
+# CHECK-NEXT:      "kind": "plaintext",
+# CHECK-NEXT:      "value": "function foo\n\n→ void\n\nvoid foo()"
+# CHECK-NEXT:    },
+# CHECK-NEXT:    "range": {
+# CHECK-NEXT:      "end": {
+# CHECK-NEXT:        "character": 28,
+# CHECK-NEXT:        "line": 0
+# CHECK-NEXT:      },
+# CHECK-NEXT:      "start": {
+# CHECK-NEXT:        "character": 25,
+# CHECK-NEXT:        "line": 0
+# CHECK-NEXT:      }
+# CHECK-NEXT:    }
+# CHECK-NEXT:  }
+# CHECK-NEXT:}
+---
+{"jsonrpc":"2.0","id":1,"method":"textDocument/hover","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":0,"character":10}}}
+#      CHECK:  "id": 1,
+# CHECK-NEXT:  "jsonrpc": "2.0",
+# CHECK-NEXT:  "result": null
+---
+{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main2.cpp","languageId":"cpp","version":1,"text":"enum foo{}; int main() { foo f; }\n"}}}
+---
+{"jsonrpc":"2.0","id":1,"method":"textDocument/hover","params":{"textDocument":{"uri":"test:///main2.cpp"},"position":{"line":0,"character":27}}}
+#      CHECK:  "id": 1,
+# CHECK-NEXT:  "jsonrpc": "2.0",
+# CHECK-NEXT:  "result": {
+# CHECK-NEXT:    "contents": {
+# CHECK-NEXT:      "kind": "plaintext",
+# CHECK-NEXT:      "value": "enum foo\n\nenum foo {}"
+# CHECK-NEXT:    },
+# CHECK-NEXT:    "range": {
+# CHECK-NEXT:      "end": {
+# CHECK-NEXT:        "character": 28,
+# CHECK-NEXT:        "line": 0
+# CHECK-NEXT:      },
+# CHECK-NEXT:      "start": {
+# CHECK-NEXT:        "character": 25,
+# CHECK-NEXT:        "line": 0
+# CHECK-NEXT:      }
+# CHECK-NEXT:    }
+# CHECK-NEXT:  }
+# CHECK-NEXT:}
+---
+{"jsonrpc":"2.0","id":3,"method":"shutdown"}
+---
+{"jsonrpc":"2.0","method":"exit"}
diff --git a/clang-tools-extra/clangd/test/spaces-in-delimited-input.test b/clang-tools-extra/clangd/test/spaces-in-delimited-input.test
index dc2e2f5ea0f6..aa191b6f2097 100644
--- a/clang-tools-extra/clangd/test/spaces-in-delimited-input.test
+++ b/clang-tools-extra/clangd/test/spaces-in-delimited-input.test
@@ -1,13 +1,13 @@
-# RUN: clangd -input-style=delimited -sync < %s 2>&1 | FileCheck %s
-# RUN: clangd -lit-test -sync < %s 2>&1 | FileCheck %s
-#
-{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}}
-
----
-
-{"jsonrpc":"2.0","id":3,"method":"shutdown"}
-
----
-
-{"jsonrpc":"2.0","method":"exit"}
-# CHECK-NOT: JSON parse error
+# RUN: clangd -input-style=delimited -sync < %s 2>&1 | FileCheck %s
+# RUN: clangd -lit-test -sync < %s 2>&1 | FileCheck %s
+#
+{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}}
+
+---
+
+{"jsonrpc":"2.0","id":3,"method":"shutdown"}
+
+---
+
+{"jsonrpc":"2.0","method":"exit"}
+# CHECK-NOT: JSON parse error
diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
index 94437857cecc..0b2273f0a9a6 100644
--- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
@@ -642,10 +642,7 @@ TEST_F(TargetDeclTest, RewrittenBinaryOperator) {
     bool x = (Foo(1) [[!=]] Foo(2));
   )cpp";
   EXPECT_DECLS("CXXRewrittenBinaryOperator",
-               {"std::strong_ordering operator<=>(const Foo &) const = default",
-                Rel::TemplatePattern},
-               {"bool operator==(const Foo &) const noexcept = default",
-                Rel::TemplateInstantiation});
+               {"bool operator==(const Foo &) const noexcept = default"});
 }
 
 TEST_F(TargetDeclTest, FunctionTemplate) {
diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp
index 5ead74748f55..d9e97e5215a2 100644
--- a/clang-tools-extra/clangd/unittests/HoverTests.cpp
+++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp
@@ -965,6 +965,19 @@ class Foo final {})cpp";
          // Bindings are in theory public members of an anonymous struct.
          HI.AccessSpecifier = "public";
        }},
+      {// Don't crash on invalid decl with invalid init expr.
+       R"cpp(
+          Unknown [[^abc]] = invalid;
+          // error-ok
+          )cpp",
+       [](HoverInfo &HI) {
+         HI.Name = "abc";
+         HI.Kind = index::SymbolKind::Variable;
+         HI.NamespaceScope = "";
+         HI.Definition = "int abc = <recovery - expr>()";
+         HI.Type = "int";
+         HI.AccessSpecifier = "public";
+       }},
       {// Extra info for function call.
        R"cpp(
           void fun(int arg_a, int &arg_b) {};
@@ -3078,7 +3091,7 @@ TEST(Hover, All) {
             HI.NamespaceScope = "";
             HI.Definition =
                 "bool operator==(const Foo &) const noexcept = default";
-            HI.Documentation = "Foo spaceship";
+            HI.Documentation = "";
           }},
   };
 
@@ -3881,7 +3894,7 @@ TEST(Hover, SpaceshipTemplateNoCrash) {
   TU.ExtraArgs.push_back("-std=c++20");
   auto AST = TU.build();
   auto HI = getHover(AST, T.point(), format::getLLVMStyle(), nullptr);
-  EXPECT_EQ(HI->Documentation, "Foo bar baz");
+  EXPECT_EQ(HI->Documentation, "");
 }
 
 TEST(Hover, ForwardStructNoCrash) {
diff --git a/clang-tools-extra/clangd/unittests/tweaks/ScopifyEnumTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/ScopifyEnumTests.cpp
index b5a964a5a26d..5da059faaf5e 100644
--- a/clang-tools-extra/clangd/unittests/tweaks/ScopifyEnumTests.cpp
+++ b/clang-tools-extra/clangd/unittests/tweaks/ScopifyEnumTests.cpp
@@ -26,7 +26,7 @@ enum ^E;
 )cpp");
 }
 
-TEST_F(ScopifyEnumTest, ApplyTest) {
+TEST_F(ScopifyEnumTest, ApplyTestWithPrefix) {
   std::string Original = R"cpp(
 enum ^E { EV1, EV2, EV3 };
 enum E;
@@ -39,13 +39,69 @@ E func(E in)
 }
 )cpp";
   std::string Expected = R"cpp(
-enum class E { EV1, EV2, EV3 };
+enum class E { V1, V2, V3 };
 enum class E;
 E func(E in)
 {
-  E out = E::EV1;
-  if (in == E::EV2)
-    out = E::EV3;
+  E out = E::V1;
+  if (in == E::V2)
+    out = E::V3;
+  return out;
+}
+)cpp";
+  FileName = "Test.cpp";
+  SCOPED_TRACE(Original);
+  EXPECT_EQ(apply(Original), Expected);
+}
+
+TEST_F(ScopifyEnumTest, ApplyTestWithPrefixAndUnderscore) {
+  std::string Original = R"cpp(
+enum ^E { E_V1, E_V2, E_V3 };
+enum E;
+E func(E in)
+{
+  E out = E_V1;
+  if (in == E_V2)
+    out = E::E_V3;
+  return out;
+}
+)cpp";
+  std::string Expected = R"cpp(
+enum class E { V1, V2, V3 };
+enum class E;
+E func(E in)
+{
+  E out = E::V1;
+  if (in == E::V2)
+    out = E::V3;
+  return out;
+}
+)cpp";
+  FileName = "Test.cpp";
+  SCOPED_TRACE(Original);
+  EXPECT_EQ(apply(Original), Expected);
+}
+
+TEST_F(ScopifyEnumTest, ApplyTestWithoutPrefix) {
+  std::string Original = R"cpp(
+enum ^E { V1, V2, V3 };
+enum E;
+E func(E in)
+{
+  E out = V1;
+  if (in == V2)
+    out = E::V3;
+  return out;
+}
+)cpp";
+  std::string Expected = R"cpp(
+enum class E { V1, V2, V3 };
+enum class E;
+E func(E in)
+{
+  E out = E::V1;
+  if (in == E::V2)
+    out = E::V3;
   return out;
 }
 )cpp";
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 5956ccb92548..fc976ce3a33d 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -69,6 +69,9 @@ Code completion
 Code actions
 ^^^^^^^^^^^^
 
+- The tweak for turning unscoped into scoped enums now removes redundant prefixes
+  from the enum values.
+
 Signature help
 ^^^^^^^^^^^^^^
 
@@ -87,7 +90,9 @@ Improvements to clang-doc
 Improvements to clang-query
 ---------------------------
 
-The improvements are...
+- Added the `file` command to dynamically load a list of commands and matchers
+  from an external file, allowing the cost of reading the compilation database
+  and building the AST to be imposed just once for faster prototyping.
 
 Improvements to clang-rename
 ----------------------------
@@ -166,6 +171,10 @@ New checks
 New check aliases
 ^^^^^^^^^^^^^^^^^
 
+- New alias :doc:`cert-int09-c <clang-tidy/checks/cert/int09-c>` to
+  :doc:`readability-enum-initial-value <clang-tidy/checks/readability/enum-initial-value>`
+  was added.
+
 Changes in existing checks
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -173,6 +182,11 @@ Changes in existing checks
   <clang-tidy/checks/bugprone/assert-side-effect>` check by detecting side
   effect from calling a method with non-const reference parameters.
 
+- Improved :doc:`bugprone-casting-through-void
+  <clang-tidy/checks/bugprone/casting-through-void>` check by ignoring casts
+  where source is already a ``void``` pointer, making middle ``void`` pointer
+  casts bug-free.
+
 - Improved :doc:`bugprone-forwarding-reference-overload
   <clang-tidy/checks/bugprone/forwarding-reference-overload>`
   check to ignore deleted constructors which won't hide other overloads.
@@ -247,6 +261,10 @@ Changes in existing checks
 - Improved :doc:`google-runtime-int <clang-tidy/checks/google/runtime-int>`
   check performance through optimizations.
 
+- Improved :doc:`hicpp-signed-bitwise <clang-tidy/checks/hicpp/signed-bitwise>`
+  check by ignoring false positives involving positive integer literals behind
+  implicit casts when `IgnorePositiveIntegerLiterals` is enabled.
+
 - Improved :doc:`hicpp-ignored-remove-result <clang-tidy/checks/hicpp/ignored-remove-result>`
   check by ignoring other functions with same prefixes as the target specific
   functions.
@@ -313,6 +331,10 @@ Changes in existing checks
   <clang-tidy/checks/readability/avoid-return-with-void-value>` check by adding
   fix-its.
 
+- Improved :doc:`readability-const-return-type
+  <clang-tidy/checks/readability/const-return-type>` check to eliminate false
+  positives when returning types with const not at the top level.
+
 - Improved :doc:`readability-duplicate-include
   <clang-tidy/checks/readability/duplicate-include>` check by excluding include
   directives that form the filename using macro.
@@ -332,11 +354,25 @@ Changes in existing checks
   <clang-tidy/checks/readability/redundant-inline-specifier>` check to properly
   emit warnings for static data member with an in-class initializer.
 
+- Improved :doc:`readability-static-accessed-through-instance
+  <clang-tidy/checks/readability/static-accessed-through-instance>` check to
+  support calls to overloaded operators as base expression and provide fixes to
+  expressions with side-effects.
+
+- Improved :doc:`readability-simplify-boolean-expr
+  <clang-tidy/checks/readability/simplify-boolean-expr>` check to avoid to emit
+  warning for macro when IgnoreMacro option is enabled.
+
 - Improved :doc:`readability-static-definition-in-anonymous-namespace
   <clang-tidy/checks/readability/static-definition-in-anonymous-namespace>`
   check by resolving fix-it overlaps in template code by disregarding implicit
   instances.
 
+- Improved :doc:`readability-string-compare
+  <clang-tidy/checks/readability/string-compare>` check to also detect
+  usages of ``std::string_view::compare``. Added a `StringLikeClasses` option
+  to detect usages of ``compare`` method in custom string-like classes.
+
 Removed checks
 ^^^^^^^^^^^^^^
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/bad-signal-to-kill-thread.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/bad-signal-to-kill-thread.rst
index 24b08da6c5c3..365624a8b1a0 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/bad-signal-to-kill-thread.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/bad-signal-to-kill-thread.rst
@@ -14,3 +14,5 @@ just the individual thread. Use any signal except ``SIGTERM``.
 This check corresponds to the CERT C Coding Standard rule
 `POS44-C. Do not use signals to terminate threads
 <https://wiki.sei.cmu.edu/confluence/display/c/POS44-C.+Do+not+use+signals+to+terminate+threads>`_.
+
+`cert-pos44-c` redirects here as an alias of this check.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/macro-parentheses.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/macro-parentheses.rst
index b6bafcec1644..80cea089564e 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/macro-parentheses.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/macro-parentheses.rst
@@ -17,3 +17,7 @@ completely before it is used.
 It is also recommended to surround macro arguments in the replacement list
 with parentheses. This ensures that the argument value is calculated
 properly.
+
+This check corresponds to the CERT C Coding Standard rule
+`PRE20-C. Macro replacement lists should be parenthesized.
+<https://wiki.sei.cmu.edu/confluence/display/c/PRE02-C.+Macro+replacement+lists+should+be+parenthesized>`_
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-memory-comparison.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-memory-comparison.rst
index 549e214b241b..f82863f7c2f1 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-memory-comparison.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-memory-comparison.rst
@@ -29,3 +29,5 @@ This check is also related to and partially overlaps the CERT C++ Coding Standar
 and
 `EXP62-CPP. Do not access the bits of an object representation that are not part of the object's value representation
 <https://wiki.sei.cmu.edu/confluence/display/cplusplus/EXP62-CPP.+Do+not+access+the+bits+of+an+object+representation+that+are+not+part+of+the+object%27s+value+representation>`_
+
+`cert-exp42-c` redirects here as an alias of this check.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/int09-c.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/int09-c.rst
new file mode 100644
index 000000000000..74c606929547
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/cert/int09-c.rst
@@ -0,0 +1,10 @@
+.. title:: clang-tidy - cert-int09-c
+.. meta::
+   :http-equiv=refresh: 5;URL=../readability/enum-initial-value.html
+
+cert-int09-c
+============
+
+The `cert-int09-c` check is an alias, please see
+:doc:`readability-enum-initial-value <../readability/enum-initial-value>` for
+more information.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/concurrency/thread-canceltype-asynchronous.rst b/clang-tools-extra/docs/clang-tidy/checks/concurrency/thread-canceltype-asynchronous.rst
index 11edd001365d..5e4d980077d5 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/concurrency/thread-canceltype-asynchronous.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/concurrency/thread-canceltype-asynchronous.rst
@@ -17,3 +17,5 @@ be acted upon and the effect is as if it was an asynchronous cancellation.
 This check corresponds to the CERT C Coding Standard rule
 `POS47-C. Do not use threads that can be canceled asynchronously
 <https://wiki.sei.cmu.edu/confluence/display/c/POS47-C.+Do+not+use+threads+that+can+be+canceled+asynchronously>`_.
+
+`cert-pos47-c` redirects here as an alias of this check.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index 49747ff896ba..046a5ff57ad1 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -120,7 +120,7 @@ Clang-Tidy Checks
    :doc:`bugprone-posix-return <bugprone/posix-return>`, "Yes"
    :doc:`bugprone-redundant-branch-condition <bugprone/redundant-branch-condition>`, "Yes"
    :doc:`bugprone-reserved-identifier <bugprone/reserved-identifier>`, "Yes"
-   :doc:`bugprone-return-const-ref-from-parameter <bugprone/return-const-ref-from-parameter>`
+   :doc:`bugprone-return-const-ref-from-parameter <bugprone/return-const-ref-from-parameter>`,
    :doc:`bugprone-shared-ptr-array-mismatch <bugprone/shared-ptr-array-mismatch>`, "Yes"
    :doc:`bugprone-signal-handler <bugprone/signal-handler>`,
    :doc:`bugprone-signed-char-misuse <bugprone/signed-char-misuse>`,
@@ -395,8 +395,10 @@ Clang-Tidy Checks
    :doc:`readability-use-std-min-max <readability/use-std-min-max>`, "Yes"
    :doc:`zircon-temporary-objects <zircon/temporary-objects>`,
 
+Check aliases
+-------------
 
-.. csv-table:: Aliases..
+.. csv-table::
    :header: "Name", "Redirect", "Offers fixes"
 
    :doc:`bugprone-narrowing-conversions <bugprone/narrowing-conversions>`, :doc:`cppcoreguidelines-narrowing-conversions <cppcoreguidelines/narrowing-conversions>`,
@@ -413,6 +415,7 @@ Clang-Tidy Checks
    :doc:`cert-exp42-c <cert/exp42-c>`, :doc:`bugprone-suspicious-memory-comparison <bugprone/suspicious-memory-comparison>`,
    :doc:`cert-fio38-c <cert/fio38-c>`, :doc:`misc-non-copyable-objects <misc/non-copyable-objects>`,
    :doc:`cert-flp37-c <cert/flp37-c>`, :doc:`bugprone-suspicious-memory-comparison <bugprone/suspicious-memory-comparison>`,
+   :doc:`cert-int09-c <cert/int09-c>`, :doc:`readability-enum-initial-value <readability/enum-initial-value>`, "Yes"
    :doc:`cert-msc24-c <cert/msc24-c>`, :doc:`bugprone-unsafe-functions <bugprone/unsafe-functions>`,
    :doc:`cert-msc30-c <cert/msc30-c>`, :doc:`cert-msc50-cpp <cert/msc50-cpp>`,
    :doc:`cert-msc32-c <cert/msc32-c>`, :doc:`cert-msc51-cpp <cert/msc51-cpp>`,
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/throw-by-value-catch-by-reference.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/throw-by-value-catch-by-reference.rst
index af6ec1416e5e..b89fbe8b4466 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/misc/throw-by-value-catch-by-reference.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/misc/throw-by-value-catch-by-reference.rst
@@ -3,8 +3,7 @@
 misc-throw-by-value-catch-by-reference
 ======================================
 
-`cert-err09-cpp` redirects here as an alias for this check.
-`cert-err61-cpp` redirects here as an alias for this check.
+`cert-err09-cpp` and `cert-err61-cpp` redirect here as aliases of this check.
 
 Finds violations of the rule "Throw by value, catch by reference" presented for
 example in "C++ Coding Standards" by H. Sutter and A. Alexandrescu, as well as
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst
index 9bb691e9d951..79648a1104bc 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst
@@ -118,7 +118,7 @@ Options
 
 .. option:: PrintfLikeFunctions
 
-   A semicolon-separated list of (fully qualified) extra function names to
+   A semicolon-separated list of (fully qualified) function names to
    replace, with the requirement that the first parameter contains the
    printf-style format string and the arguments to be formatted follow
    immediately afterwards. If neither this option nor
@@ -128,7 +128,7 @@ Options
 
 .. option:: FprintfLikeFunctions
 
-   A semicolon-separated list of (fully qualified) extra function names to
+   A semicolon-separated list of (fully qualified) function names to
    replace, with the requirement that the first parameter is retained, the
    second parameter contains the printf-style format string and the
    arguments to be formatted follow immediately afterwards. If neither this
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/enum-initial-value.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/enum-initial-value.rst
index 660efc1eaff3..b27e10d5c133 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/enum-initial-value.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/enum-initial-value.rst
@@ -6,70 +6,83 @@ readability-enum-initial-value
 Enforces consistent style for enumerators' initialization, covering three
 styles: none, first only, or all initialized explicitly.
 
-When adding new enumerations, inconsistent initial value will cause potential
-enumeration value conflicts.
+An inconsistent style and strictness to defining the initializing value of
+enumerators may cause issues if the enumeration is extended with new
+enumerators that obtain their integer representation implicitly.
 
-In an enumeration, the following three cases are accepted. 
-1. none of enumerators are explicit initialized.
-2. the first enumerator is explicit initialized.
-3. all of enumerators are explicit initialized.
+The following three cases are accepted:
+
+#. **No** enumerators are explicit initialized.
+#. Exactly **the first** enumerator is explicit initialized.
+#. **All** enumerators are explicit initialized.
 
 .. code-block:: c++
 
-  // valid, none of enumerators are initialized.
-  enum A {
-    e0,
-    e1,
-    e2,
+  enum A {    // (1) Valid, none of enumerators are initialized.
+    a0,
+    a1,
+    a2,
   };
 
-  // valid, the first enumerator is initialized.
-  enum A {
-    e0 = 0,
-    e1,
-    e2,
+  enum B {    // (2) Valid, the first enumerator is initialized.
+    b0 = 0,
+    b1,
+    b2,
   };
 
-  // valid, all of enumerators are initialized.
-  enum A {
-    e0 = 0,
-    e1 = 1,
-    e2 = 2,
+  enum C {    // (3) Valid, all of enumerators are initialized.
+    c0 = 0,
+    c1 = 1,
+    c2 = 2,
   };
 
-  // invalid, e1 is not explicit initialized.
-  enum A {
+  enum D {    // Invalid, d1 is not explicitly initialized!
+    d0 = 0,
+    d1,
+    d2 = 2,
+  };
+
+  enum E {    // Invalid, e1, e3, and e5 are not explicitly initialized.
     e0 = 0,
     e1,
     e2 = 2,
+    e3,       // Dangerous, as the numeric values of e3 and e5 are both 3, and this is not explicitly visible in the code!
+    e4 = 2,
+    e5,
   };
 
+This check corresponds to the CERT C Coding Standard recommendation `INT09-C. Ensure enumeration constants map to unique values
+<https://wiki.sei.cmu.edu/confluence/display/c/INT09-C.+Ensure+enumeration+constants+map+to+unique+values>`_.
+
+`cert-int09-c` redirects here as an alias of this check.
+
 Options
 -------
 
 .. option:: AllowExplicitZeroFirstInitialValue
 
-  If set to `false`, the first enumerator must not be explicitly initialized.
-  See examples below. Default is `true`.
+  If set to `false`, the first enumerator must not be explicitly initialized to
+  a literal ``0``.
+  Default is `true`.
 
   .. code-block:: c++
 
-    enum A {
-      e0 = 0, // not allowed if AllowExplicitZeroFirstInitialValue is false
-      e1,
-      e2,
+    enum F {
+      f0 = 0, // Not allowed if AllowExplicitZeroFirstInitialValue is false.
+      f1,
+      f2,
     };
 
 
 .. option:: AllowExplicitSequentialInitialValues
 
-  If set to `false`, sequential initializations are not allowed.
-  See examples below. Default is `true`.
+  If set to `false`, explicit initialization to sequential values are not
+  allowed.
+  Default is `true`.
 
   .. code-block:: c++
 
-    enum A {
-      e0 = 1, // not allowed if AllowExplicitSequentialInitialValues is false
-      e1 = 2,
-      e2 = 3,
-    };
+    enum G {
+      g0 = 1, // Not allowed if AllowExplicitSequentialInitialValues is false.
+      g1 = 2,
+      g2 = 3,
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/static-accessed-through-instance.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/static-accessed-through-instance.rst
index 23d12f418366..ffb3738bf72c 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/static-accessed-through-instance.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/static-accessed-through-instance.rst
@@ -35,3 +35,6 @@ is changed to:
   C::E1;
   C::E2;
 
+The `--fix` commandline option provides default support for safe fixes, whereas
+`--fix-notes` enables fixes that may replace expressions with side effects,
+potentially altering the program's behavior.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/string-compare.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/string-compare.rst
index 268632eee61a..4be2473bed2d 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/string-compare.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/string-compare.rst
@@ -14,10 +14,12 @@ recommended to avoid the risk of incorrect interpretation of the return value
 and to simplify the code. The string equality and inequality operators can
 also be faster than the ``compare`` method due to early termination.
 
-Examples:
+Example
+-------
 
 .. code-block:: c++
 
+  // The same rules apply to std::string_view.
   std::string str1{"a"};
   std::string str2{"b"};
 
@@ -50,5 +52,36 @@ Examples:
   }
 
 The above code examples show the list of if-statements that this check will
-give a warning for. All of them uses ``compare`` to check if equality or
+give a warning for. All of them use ``compare`` to check equality or
 inequality of two strings instead of using the correct operators.
+
+Options
+-------
+
+.. option:: StringLikeClasses
+
+   A string containing semicolon-separated names of string-like classes.
+   By default contains only ``::std::basic_string``
+   and ``::std::basic_string_view``. If a class from this list has
+   a ``compare`` method similar to that of ``std::string``, it will be checked
+   in the same way.
+
+Example
+^^^^^^^
+
+.. code-block:: c++
+
+  struct CustomString {
+  public:
+    int compare (const CustomString& other) const;
+  }
+
+  CustomString str1;
+  CustomString str2;
+
+  // use str1 != str2 instead.
+  if (str1.compare(str2)) {
+  }
+
+If `StringLikeClasses` contains ``CustomString``, the check will suggest
+replacing ``compare`` with equality operator.
diff --git a/clang-tools-extra/test/clang-query/Inputs/empty.script b/clang-tools-extra/test/clang-query/Inputs/empty.script
new file mode 100644
index 000000000000..3c30abd1ae5d
--- /dev/null
+++ b/clang-tools-extra/test/clang-query/Inputs/empty.script
@@ -0,0 +1 @@
+# This file intentionally has no queries
diff --git a/clang-tools-extra/test/clang-query/Inputs/file.script b/clang-tools-extra/test/clang-query/Inputs/file.script
new file mode 100644
index 000000000000..b58e7bbc24bf
--- /dev/null
+++ b/clang-tools-extra/test/clang-query/Inputs/file.script
@@ -0,0 +1 @@
+f DIRECTORY/runtime_file.script
diff --git a/clang-tools-extra/test/clang-query/Inputs/runtime_file.script b/clang-tools-extra/test/clang-query/Inputs/runtime_file.script
new file mode 100644
index 000000000000..714d7f03b1bf
--- /dev/null
+++ b/clang-tools-extra/test/clang-query/Inputs/runtime_file.script
@@ -0,0 +1,5 @@
+set bind-root false
+
+l func functionDecl(hasName("bar"))
+m func.bind("f")
+m varDecl().bind("v")
+\ No newline at end of file
diff --git a/clang-tools-extra/test/clang-query/errors.c b/clang-tools-extra/test/clang-query/errors.c
index bbb742125744..3b9059ab0257 100644
--- a/clang-tools-extra/test/clang-query/errors.c
+++ b/clang-tools-extra/test/clang-query/errors.c
@@ -1,10 +1,12 @@
 // RUN: not clang-query -c foo -c bar %s -- | FileCheck %s
 // RUN: not clang-query -f %S/Inputs/foo.script %s -- | FileCheck %s
 // RUN: not clang-query -f %S/Inputs/nonexistent.script %s -- 2>&1 | FileCheck --check-prefix=CHECK-NONEXISTENT %s
+// RUN: not clang-query -c 'file %S/Inputs/nonexistent.script' %s -- 2>&1 | FileCheck --check-prefix=CHECK-NONEXISTENT-FILEQUERY %s
 // RUN: not clang-query -c foo -f foo %s -- 2>&1 | FileCheck --check-prefix=CHECK-BOTH %s
 
 // CHECK: unknown command: foo
 // CHECK-NOT: unknown command: bar
 
 // CHECK-NONEXISTENT: cannot open {{.*}}nonexistent.script
+// CHECK-NONEXISTENT-FILEQUERY: cannot open {{.*}}nonexistent.script
 // CHECK-BOTH: cannot specify both -c and -f
diff --git a/clang-tools-extra/test/clang-query/file-empty.c b/clang-tools-extra/test/clang-query/file-empty.c
new file mode 100644
index 000000000000..15137c57e915
--- /dev/null
+++ b/clang-tools-extra/test/clang-query/file-empty.c
@@ -0,0 +1,2 @@
+// RUN: clang-query -c 'file %S/Inputs/empty.script' %s --
+// COM: no output expected; nothing to CHECK
diff --git a/clang-tools-extra/test/clang-query/file-query.c b/clang-tools-extra/test/clang-query/file-query.c
new file mode 100644
index 000000000000..10a44e7aaccf
--- /dev/null
+++ b/clang-tools-extra/test/clang-query/file-query.c
@@ -0,0 +1,14 @@
+// RUN: rm -rf %/t
+// RUN: mkdir %/t
+// RUN: cp %/S/Inputs/file.script %/t/file.script
+// RUN: cp %/S/Inputs/runtime_file.script %/t/runtime_file.script
+// Need to embed the correct temp path in the actual JSON-RPC requests.
+// RUN: sed -e "s|DIRECTORY|%/t|" %/t/file.script > %/t/file.script.temp
+
+// RUN: clang-query -c 'file %/t/file.script.temp' %s -- | FileCheck %s
+
+// CHECK: file-query.c:11:1: note: "f" binds here
+void bar(void) {}
+
+// CHECK: file-query.c:14:1: note: "v" binds here
+int baz{1};
diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/string b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/string
index d031f27beb9d..0c160bc182b6 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/string
+++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/string
@@ -108,6 +108,8 @@ struct basic_string_view {
   constexpr bool starts_with(C ch) const noexcept;
   constexpr bool starts_with(const C* s) const;
 
+  constexpr int compare(basic_string_view sv) const noexcept;
+
   static constexpr size_t npos = -1;
 };
 
@@ -132,6 +134,14 @@ bool operator==(const std::wstring&, const std::wstring&);
 bool operator==(const std::wstring&, const wchar_t*);
 bool operator==(const wchar_t*, const std::wstring&);
 
+bool operator==(const std::string_view&, const std::string_view&);
+bool operator==(const std::string_view&, const char*);
+bool operator==(const char*, const std::string_view&);
+
+bool operator!=(const std::string_view&, const std::string_view&);
+bool operator!=(const std::string_view&, const char*);
+bool operator!=(const char*, const std::string_view&);
+
 size_t strlen(const char* str);
 }
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/casting-through-void.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/casting-through-void.cpp
index 3913d2d8a295..a784e4988587 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/casting-through-void.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/casting-through-void.cpp
@@ -89,3 +89,10 @@ void bit_cast() {
   __builtin_bit_cast(int *, static_cast<void *>(&d));
   // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: do not cast 'double *' to 'int *' through 'void *' [bugprone-casting-through-void]
 }
+
+namespace PR87069 {
+  void castconstVoidToVoid() {
+    const void* ptr = nullptr;
+    int* numberPtr = static_cast<int*>(const_cast<void*>(ptr));
+  }
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/return-const-ref-from-parameter.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/return-const-ref-from-parameter.cpp
index a83a019ec743..ca41bdf74a10 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/return-const-ref-from-parameter.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/return-const-ref-from-parameter.cpp
@@ -1,9 +1,18 @@
-// RUN: %check_clang_tidy %s bugprone-return-const-ref-from-parameter %t
+// RUN: %check_clang_tidy %s bugprone-return-const-ref-from-parameter %t -- -- -fno-delayed-template-parsing
 
 using T = int;
 using TConst = int const;
 using TConstRef = int const&;
 
+template <typename T>
+struct Wrapper { Wrapper(T); };
+
+template <typename T>
+struct Identity { using type = T; };
+
+template <typename T>
+struct ConstRef { using type = const T&; };
+
 namespace invalid {
 
 int const &f1(int const &a) { return a; }
@@ -18,8 +27,59 @@ int const &f3(TConstRef a) { return a; }
 int const &f4(TConst &a) { return a; }
 // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: returning a constant reference parameter
 
+template <typename T>
+const T& tf1(const T &a) { return a; }
+// CHECK-MESSAGES: :[[@LINE-1]]:35: warning: returning a constant reference parameter
+
+template <typename T>
+const T& itf1(const T &a) { return a; }
+// CHECK-MESSAGES: :[[@LINE-1]]:36: warning: returning a constant reference parameter
+
+template <typename T>
+typename ConstRef<T>::type itf2(const T &a) { return a; }
+// CHECK-MESSAGES: :[[@LINE-1]]:54: warning: returning a constant reference parameter
+
+template <typename T>
+typename ConstRef<T>::type itf3(typename ConstRef<T>::type a) { return a; }
+// CHECK-MESSAGES: :[[@LINE-1]]:72: warning: returning a constant reference parameter
+
+template <typename T>
+const T& itf4(typename ConstRef<T>::type a) { return a; }
+// CHECK-MESSAGES: :[[@LINE-1]]:54: warning: returning a constant reference parameter
+
+void instantiate(const int &param, const float &paramf, int &mut_param, float &mut_paramf) {
+        itf1(0);
+        itf1(param);
+        itf1(paramf);
+        itf2(0);
+        itf2(param);
+        itf2(paramf);
+        itf3<int>(0);
+        itf3<int>(param);
+        itf3<float>(paramf);
+        itf4<int>(0);
+        itf4<int>(param);
+        itf4<float>(paramf);
+}
+
+struct C {
+    const C& foo(const C&c) { return c; }
+// CHECK-MESSAGES: :[[@LINE-1]]:38: warning: returning a constant reference parameter
+};
+
 } // namespace invalid
 
+namespace false_negative_because_dependent_and_not_instantiated {
+template <typename T>
+typename ConstRef<T>::type tf2(const T &a) { return a; }
+
+template <typename T>
+typename ConstRef<T>::type tf3(typename ConstRef<T>::type a) { return a; }
+
+template <typename T>
+const T& tf4(typename ConstRef<T>::type a) { return a; }
+} // false_negative_because_dependent_and_not_instantiated
+
 namespace valid {
 
 int const &f1(int &a) { return a; }
@@ -28,4 +88,58 @@ int const &f2(int &&a) { return a; }
 
 int f1(int const &a) { return a; }
 
+template <typename T>
+T tf1(T a) { return a; }
+
+template <typename T>
+T tf2(const T a) { return a; }
+
+template <typename T>
+T tf3(const T &a) { return a; }
+
+template <typename T>
+Identity<T>::type tf4(const T &a) { return a; }
+
+template <typename T>
+T itf1(T a) { return a; }
+
+template <typename T>
+T itf2(const T a) { return a; }
+
+template <typename T>
+T itf3(const T &a) { return a; }
+
+template <typename T>
+Wrapper<T> itf4(const T& a) { return a; }
+
+template <typename T>
+const T& itf5(T& a) { return a; }
+
+template <typename T>
+T itf6(T& a) { return a; }
+
+void instantiate(const int &param, const float &paramf, int &mut_param, float &mut_paramf) {
+        itf1(0);
+        itf1(param);
+        itf1(paramf);
+        itf2(0);
+        itf2(param);
+        itf2(paramf);
+        itf3(0);
+        itf3(param);
+        itf3(paramf);
+        itf2(0);
+        itf2(param);
+        itf2(paramf);
+        itf3(0);
+        itf3(param);
+        itf3(paramf);
+        itf4(param);
+        itf4(paramf);
+        itf5(mut_param);
+        itf5(mut_paramf);
+        itf6(mut_param);
+        itf6(mut_paramf);
+}
+
 } // namespace valid
diff --git a/clang-tools-extra/test/clang-tidy/checkers/hicpp/signed-bitwise-integer-literals.cpp b/clang-tools-extra/test/clang-tidy/checkers/hicpp/signed-bitwise-integer-literals.cpp
index edbb56f90cb0..aca7ae1fd76f 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/hicpp/signed-bitwise-integer-literals.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/hicpp/signed-bitwise-integer-literals.cpp
@@ -11,6 +11,7 @@ void examples() {
   // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: use of a signed integer operand with a binary bitwise operator
 
   unsigned URes2 = URes << 1; //Ok
+  unsigned URes3 = URes & 1; //Ok
 
   int IResult;
   IResult = 10 & 2; //Ok
@@ -21,6 +22,8 @@ void examples() {
   IResult = Int << 1;
   // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: use of a signed integer operand with a binary bitwise operator
   IResult = ~0; //Ok
+  IResult = -1 & 1;
+  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: use of a signed integer operand with a binary bitwise operator [hicpp-signed-bitwise]
 }
 
 enum EnumConstruction {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/duplicate-include/duplicate-include.h b/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/duplicate-include/duplicate-include.h
index bf288023274b..22d3a3acbc91 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/duplicate-include/duplicate-include.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/duplicate-include/duplicate-include.h
@@ -1,15 +1,15 @@
-#ifndef READABILITY_DUPLICATE_INCLUDE_H
-#define READABILITY_DUPLICATE_INCLUDE_H
-
-extern int g;
-#include "duplicate-include2.h"
-extern int h;
-#include "duplicate-include2.h"
-extern int i;
-// CHECK-MESSAGES: :[[@LINE-2]]:1: warning: duplicate include
-// CHECK-FIXES:      {{^extern int g;$}}
-// CHECK-FIXES-NEXT: {{^#include "duplicate-include2.h"$}}
-// CHECK-FIXES-NEXT: {{^extern int h;$}}
-// CHECK-FIXES-NEXT: {{^extern int i;$}}
-
-#endif
+#ifndef READABILITY_DUPLICATE_INCLUDE_H
+#define READABILITY_DUPLICATE_INCLUDE_H
+
+extern int g;
+#include "duplicate-include2.h"
+extern int h;
+#include "duplicate-include2.h"
+extern int i;
+// CHECK-MESSAGES: :[[@LINE-2]]:1: warning: duplicate include
+// CHECK-FIXES:      {{^extern int g;$}}
+// CHECK-FIXES-NEXT: {{^#include "duplicate-include2.h"$}}
+// CHECK-FIXES-NEXT: {{^extern int h;$}}
+// CHECK-FIXES-NEXT: {{^extern int i;$}}
+
+#endif
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/duplicate-include/duplicate-include2.h b/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/duplicate-include/duplicate-include2.h
index 58dfa757ee7a..fcbabe12fc37 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/duplicate-include/duplicate-include2.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/duplicate-include/duplicate-include2.h
@@ -1 +1 @@
-// This file is intentionally empty.
+// This file is intentionally empty.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/duplicate-include/system/sys/types.h b/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/duplicate-include/system/sys/types.h
index 58dfa757ee7a..fcbabe12fc37 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/duplicate-include/system/sys/types.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/duplicate-include/system/sys/types.h
@@ -1 +1 @@
-// This file is intentionally empty.
+// This file is intentionally empty.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp
index 10b2858c9caa..76a3555663b1 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp
@@ -215,11 +215,9 @@ CREATE_FUNCTION();
 
 using ty = const int;
 ty p21() {}
-// CHECK-MESSAGES: [[@LINE-1]]:1: warning: return type 'ty' (aka 'const int') is
 
 typedef const int ty2;
 ty2 p22() {}
-// CHECK-MESSAGES: [[@LINE-1]]:1: warning: return type 'ty2' (aka 'const int') i
 
 // Declaration uses a macro, while definition doesn't.  In this case, we won't
 // fix the declaration, and will instead issue a warning.
@@ -249,7 +247,6 @@ auto p27() -> int const { return 3; }
 // CHECK-MESSAGES: [[@LINE-1]]:1: warning: return type 'const int' is 'const'-qu
 
 std::add_const<int>::type p28() { return 3; }
-// CHECK-MESSAGES: [[@LINE-1]]:1: warning: return type 'std::add_const<int>::typ
 
 // p29, p30 are based on
 // llvm/projects/test-suite/SingleSource/Benchmarks/Misc-C++-EH/spirit.cpp:
@@ -355,3 +352,20 @@ struct p41 {
   // CHECK-FIXES: T foo() const { return 2; }
 };
 template struct p41<int>;
+
+namespace PR73270 {
+  template<typename K, typename V>
+  struct Pair {
+    using first_type = const K;
+    using second_type = V;
+  };
+
+  template<typename PairType>
+  typename PairType::first_type getFirst() {
+    return {};
+  }
+
+  void test() {
+    getFirst<Pair<int, int>>();
+  }
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return-if-constexpr.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return-if-constexpr.cpp
index 6532940eaf23..1edb3237eaf4 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return-if-constexpr.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return-if-constexpr.cpp
@@ -1,22 +1,22 @@
-// RUN: %check_clang_tidy %s readability-else-after-return %t -- -- -std=c++17
-
-// Constexpr if is an exception to the rule, we cannot remove the else.
-void f() {
-  if (sizeof(int) > 4)
-    return;
-  else
-    return;
-  // CHECK-MESSAGES: [[@LINE-2]]:3: warning: do not use 'else' after 'return'
-
-  if constexpr (sizeof(int) > 4)
-    return;
-  else
-    return;
-
-  if constexpr (sizeof(int) > 4)
-    return;
-  else if constexpr (sizeof(long) > 4)
-    return;
-  else
-    return;
-}
+// RUN: %check_clang_tidy %s readability-else-after-return %t -- -- -std=c++17
+
+// Constexpr if is an exception to the rule, we cannot remove the else.
+void f() {
+  if (sizeof(int) > 4)
+    return;
+  else
+    return;
+  // CHECK-MESSAGES: [[@LINE-2]]:3: warning: do not use 'else' after 'return'
+
+  if constexpr (sizeof(int) > 4)
+    return;
+  else
+    return;
+
+  if constexpr (sizeof(int) > 4)
+    return;
+  else if constexpr (sizeof(long) > 4)
+    return;
+  else
+    return;
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/magic-numbers-todo.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/magic-numbers-todo.cpp
deleted file mode 100644
index 99d9be262a89..000000000000
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/magic-numbers-todo.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: %check_clang_tidy %s readability-magic-numbers %t --
-// XFAIL: *
-
-int ProcessSomething(int input);
-
-int DoWork()
-{
-  if (((int)4) > ProcessSomething(10))
-  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: 4 is a magic number; consider replacing it with a named constant [readability-magic-numbers]
-    return 0;
-
-   return 0;
-}
-
-
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr-macros.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr-macros.cpp
index 7d0cfe7e27dc..d1df79e23a1e 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr-macros.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr-macros.cpp
@@ -6,6 +6,7 @@
 // RUN:     --
 
 #define NEGATE(expr) !(expr)
+#define NOT_AND_NOT(a, b) (!a && !b)
 
 bool without_macro(bool a, bool b) {
     return !(!a && b);
@@ -13,8 +14,17 @@ bool without_macro(bool a, bool b) {
     // CHECK-FIXES: return a || !b;
 }
 
-bool macro(bool a, bool b) {
-    return NEGATE(!a && b);
-    // CHECK-MESSAGES-MACROS: :[[@LINE-1]]:12: warning: boolean expression can be simplified by DeMorgan's theorem
-    // CHECK-FIXES: return NEGATE(!a && b);
+void macro(bool a, bool b) {
+    NEGATE(!a && b);
+    // CHECK-MESSAGES-MACROS: :[[@LINE-1]]:5: warning: boolean expression can be simplified by DeMorgan's theorem
+    // CHECK-FIXES: NEGATE(!a && b);
+    !NOT_AND_NOT(a, b);
+    // CHECK-MESSAGES-MACROS: :[[@LINE-1]]:5: warning: boolean expression can be simplified by DeMorgan's theorem
+    // CHECK-FIXES: !NOT_AND_NOT(a, b);
+    !(NEGATE(a) && b);
+    // CHECK-MESSAGES-MACROS: :[[@LINE-1]]:5: warning: boolean expression can be simplified by DeMorgan's theorem
+    // CHECK-FIXES: !(NEGATE(a) && b);
+    !(a && NEGATE(b));
+    // CHECK-MESSAGES-MACROS: :[[@LINE-1]]:5: warning: boolean expression can be simplified by DeMorgan's theorem
+    // CHECK-FIXES: !(a && NEGATE(b));
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/static-accessed-through-instance.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/static-accessed-through-instance.cpp
index 81c1cecf607f..202fe9be6d00 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/static-accessed-through-instance.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/static-accessed-through-instance.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy %s readability-static-accessed-through-instance %t -- -- -isystem %S/Inputs/static-accessed-through-instance
+// RUN: %check_clang_tidy %s readability-static-accessed-through-instance %t -- --fix-notes -- -isystem %S/Inputs/static-accessed-through-instance
 #include <__clang_cuda_builtin_vars.h>
 
 enum OutEnum {
@@ -47,7 +47,8 @@ C &f(int, int, int, int);
 void g() {
   f(1, 2, 3, 4).x;
   // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: static member accessed through instance  [readability-static-accessed-through-instance]
-  // CHECK-FIXES: {{^}}  f(1, 2, 3, 4).x;{{$}}
+  // CHECK-MESSAGES: :[[@LINE-2]]:3: note: member base expression may carry some side effects
+  // CHECK-FIXES: {{^}}  C::x;{{$}}
 }
 
 int i(int &);
@@ -59,12 +60,14 @@ int k(bool);
 void f(C c) {
   j(i(h().x));
   // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: static member
-  // CHECK-FIXES: {{^}}  j(i(h().x));{{$}}
+  // CHECK-MESSAGES: :[[@LINE-2]]:7: note: member base expression may carry some side effects
+  // CHECK-FIXES: {{^}}  j(i(C::x));{{$}}
 
   // The execution of h() depends on the return value of a().
   j(k(a() && h().x));
   // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: static member
-  // CHECK-FIXES: {{^}}  j(k(a() && h().x));{{$}}
+  // CHECK-MESSAGES: :[[@LINE-2]]:14: note: member base expression may carry some side effects
+  // CHECK-FIXES: {{^}}  j(k(a() && C::x));{{$}}
 
   if ([c]() {
         c.ns();
@@ -72,7 +75,8 @@ void f(C c) {
       }().x == 15)
     ;
   // CHECK-MESSAGES: :[[@LINE-5]]:7: warning: static member
-  // CHECK-FIXES: {{^}}  if ([c]() {{{$}}
+  // CHECK-MESSAGES: :[[@LINE-6]]:7: note: member base expression may carry some side effects
+  // CHECK-FIXES: {{^}}  if (C::x == 15){{$}}
 }
 
 // Nested specifiers
@@ -261,8 +265,11 @@ struct Qptr {
 };
 
 int func(Qptr qp) {
-  qp->y = 10; // OK, the overloaded operator might have side-effects.
-  qp->K = 10; //
+  qp->y = 10;
+  qp->K = 10;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: static member accessed through instance [readability-static-accessed-through-instance]
+  // CHECK-MESSAGES: :[[@LINE-2]]:3: note: member base expression may carry some side effects
+  // CHECK-FIXES: {{^}}  Q::K = 10;
 }
 
 namespace {
@@ -380,3 +387,20 @@ namespace PR51861 {
     // CHECK-FIXES: {{^}}    PR51861::Foo::getBar();{{$}}
   }
 }
+
+namespace PR75163 {
+  struct Static {
+    static void call();
+  };
+
+  struct Ptr {
+    Static* operator->();
+  };
+
+  void test(Ptr& ptr) {
+    ptr->call();
+    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: static member accessed through instance [readability-static-accessed-through-instance]
+    // CHECK-MESSAGES: :[[@LINE-2]]:5: note: member base expression may carry some side effects
+    // CHECK-FIXES: {{^}}    PR75163::Static::call();{{$}}
+  }
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/string-compare-custom-string-classes.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/string-compare-custom-string-classes.cpp
new file mode 100644
index 000000000000..faf135833ee1
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/string-compare-custom-string-classes.cpp
@@ -0,0 +1,35 @@
+// RUN: %check_clang_tidy %s readability-string-compare %t -- -config='{CheckOptions: {readability-string-compare.StringLikeClasses: "CustomStringTemplateBase;CustomStringNonTemplateBase"}}' -- -isystem %clang_tidy_headers
+#include <string>
+
+struct CustomStringNonTemplateBase {
+  int compare(const CustomStringNonTemplateBase& Other) const {
+    return 123;  // value is not important for check
+  }
+};
+
+template <typename T>
+struct CustomStringTemplateBase {
+  int compare(const CustomStringTemplateBase& Other) const {
+    return 123;
+  }
+};
+
+struct CustomString1 : CustomStringNonTemplateBase {};
+struct CustomString2 : CustomStringTemplateBase<char> {};
+
+void CustomStringClasses() {
+  std::string_view sv1("a");
+  std::string_view sv2("b");
+  if (sv1.compare(sv2)) {  // No warning - if a std class is not listed in StringLikeClasses, it won't be checked.
+  }
+
+  CustomString1 custom1;
+  if (custom1.compare(custom1)) {
+  }
+  // CHECK-MESSAGES: [[@LINE-2]]:7: warning: do not use 'compare' to test equality of strings; use the string equality operator instead [readability-string-compare]
+
+  CustomString2 custom2;
+  if (custom2.compare(custom2)) {
+  }
+  // CHECK-MESSAGES: [[@LINE-2]]:7: warning: do not use 'compare' to test equality of strings; use the string equality operator instead [readability-string-compare]
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/string-compare.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/string-compare.cpp
index 2c08b86cf72f..c4fea4341617 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/string-compare.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/string-compare.cpp
@@ -67,11 +67,27 @@ void Test() {
   if (str1.compare(comp())) {
   }
   // CHECK-MESSAGES: [[@LINE-2]]:7: warning: do not use 'compare' to test equality of strings;
+
+  std::string_view sv1("a");
+  std::string_view sv2("b");
+  if (sv1.compare(sv2)) {
+  }
+  // CHECK-MESSAGES: [[@LINE-2]]:7: warning: do not use 'compare' to test equality of strings; use the string equality operator instead [readability-string-compare]
+}
+
+struct DerivedFromStdString : std::string {};
+
+void TestDerivedClass() {
+  DerivedFromStdString derived;
+  if (derived.compare(derived)) {
+  }
+  // CHECK-MESSAGES: [[@LINE-2]]:7: warning: do not use 'compare' to test equality of strings; use the string equality operator instead [readability-string-compare]
 }
 
 void Valid() {
   std::string str1("a", 1);
   std::string str2("b", 1);
+
   if (str1 == str2) {
   }
   if (str1 != str2) {
@@ -96,4 +112,11 @@ void Valid() {
   }
   if (str1.compare(str2) == -1) {
   }
+
+  std::string_view sv1("a");
+  std::string_view sv2("b");
+  if (sv1 == sv2) {
+  }
+  if (sv1.compare(sv2) > 0) {
+  }
 }
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/config-files.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/config-files.cpp
index cb0f0bc4d133..d287412454ca 100644
--- a/clang-tools-extra/test/clang-tidy/infrastructure/config-files.cpp
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/config-files.cpp
@@ -66,9 +66,5 @@
 // RUN: clang-tidy --checks="-*,readability-identifier-naming" --dump-config %S/Inputs/config-files/- -- | grep "readability-identifier-naming\." | sort --check
 
 // Dumped config does not overflow for unsigned options
-// RUN: clang-tidy --dump-config \
-// RUN: --checks="-*,misc-throw-by-value-catch-by-reference" \
-// RUN: -- | grep -v -q "misc-throw-by-value-catch-by-reference.MaxSize: '-1'"
-
-// RUN: clang-tidy --dump-config %S/Inputs/config-files/5/- \
-// RUN: -- | grep -q "misc-throw-by-value-catch-by-reference.MaxSize: '1152921504606846976'"
+// RUN: clang-tidy --dump-config %S/Inputs/config-files/5/- -- | FileCheck %s -check-prefix=CHECK-OVERFLOW
+// CHECK-OVERFLOW: misc-throw-by-value-catch-by-reference.MaxSize: '1152921504606846976'
diff --git a/clang-tools-extra/test/modularize/Inputs/CompileError/module.modulemap b/clang-tools-extra/test/modularize/Inputs/CompileError/module.modulemap
index 64180adf5beb..f71b66f148ed 100644
--- a/clang-tools-extra/test/modularize/Inputs/CompileError/module.modulemap
+++ b/clang-tools-extra/test/modularize/Inputs/CompileError/module.modulemap
@@ -1,10 +1,10 @@
-// module.modulemap
-
-module Level1A {
-  header "Level1A.h"
-  export *
-}
-module HasError {
-  header "HasError.h"
-  export *
-}
+// module.modulemap
+
+module Level1A {
+  header "Level1A.h"
+  export *
+}
+module HasError {
+  header "HasError.h"
+  export *
+}
diff --git a/clang-tools-extra/test/modularize/Inputs/MissingHeader/module.modulemap b/clang-tools-extra/test/modularize/Inputs/MissingHeader/module.modulemap
index 9acb4923f9ac..330e13f5ee15 100644
--- a/clang-tools-extra/test/modularize/Inputs/MissingHeader/module.modulemap
+++ b/clang-tools-extra/test/modularize/Inputs/MissingHeader/module.modulemap
@@ -1,10 +1,10 @@
-// module.modulemap
-
-module Level1A {
-  header "Level1A.h"
-  export *
-}
-module Missing {
-  header "Missing.h"
-  export *
-}
+// module.modulemap
+
+module Level1A {
+  header "Level1A.h"
+  export *
+}
+module Missing {
+  header "Missing.h"
+  export *
+}
diff --git a/clang-tools-extra/test/pp-trace/Inputs/module.modulemap b/clang-tools-extra/test/pp-trace/Inputs/module.modulemap
index f16bbc6e2e05..415c874f09d3 100644
--- a/clang-tools-extra/test/pp-trace/Inputs/module.modulemap
+++ b/clang-tools-extra/test/pp-trace/Inputs/module.modulemap
@@ -1,18 +1,18 @@
-// module.modulemap
-
-module Level1A {
-  header "Level1A.h"
-  export *
-}
-module Level1B {
-  header "Level1B.h"
-  export *
-  module Level2B {
-    header "Level2B.h"
-    export *
-  }
-}
-module Level2A {
-  header "Level2A.h"
-  export *
-}
+// module.modulemap
+
+module Level1A {
+  header "Level1A.h"
+  export *
+}
+module Level1B {
+  header "Level1B.h"
+  export *
+  module Level2B {
+    header "Level2B.h"
+    export *
+  }
+}
+module Level2A {
+  header "Level2A.h"
+  export *
+}
diff --git a/clang-tools-extra/unittests/clang-query/QueryParserTest.cpp b/clang-tools-extra/unittests/clang-query/QueryParserTest.cpp
index 06b0d7b36590..b561e2bb9833 100644
--- a/clang-tools-extra/unittests/clang-query/QueryParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-query/QueryParserTest.cpp
@@ -197,7 +197,7 @@ TEST_F(QueryParserTest, Comment) {
 TEST_F(QueryParserTest, Complete) {
   std::vector<llvm::LineEditor::Completion> Comps =
       QueryParser::complete("", 0, QS);
-  ASSERT_EQ(8u, Comps.size());
+  ASSERT_EQ(9u, Comps.size());
   EXPECT_EQ("help ", Comps[0].TypedText);
   EXPECT_EQ("help", Comps[0].DisplayText);
   EXPECT_EQ("let ", Comps[1].TypedText);
@@ -214,6 +214,8 @@ TEST_F(QueryParserTest, Complete) {
   EXPECT_EQ("disable", Comps[6].DisplayText);
   EXPECT_EQ("unlet ", Comps[7].TypedText);
   EXPECT_EQ("unlet", Comps[7].DisplayText);
+  EXPECT_EQ("file ", Comps[8].TypedText);
+  EXPECT_EQ("file", Comps[8].DisplayText);
 
   Comps = QueryParser::complete("set o", 5, QS);
   ASSERT_EQ(1u, Comps.size());
diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index cf97e3c6e851..c20ce47a12ab 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -523,6 +523,8 @@ endif()
 
 
 if( CLANG_INCLUDE_TESTS )
+  find_package(Perl)
+
   add_subdirectory(unittests)
   list(APPEND CLANG_TEST_DEPS ClangUnitTests)
   list(APPEND CLANG_TEST_PARAMS
diff --git a/clang/docs/Block-ABI-Apple.rst b/clang/docs/Block-ABI-Apple.rst
index 68f7a3819ca2..f46f2f991ad7 100644
--- a/clang/docs/Block-ABI-Apple.rst
+++ b/clang/docs/Block-ABI-Apple.rst
@@ -80,7 +80,7 @@ The following flags bits are in use thusly for a possible ABI.2010.3.16:
 In 10.6.ABI the (1<<29) was usually set and was always ignored by the runtime -
 it had been a transitional marker that did not get deleted after the
 transition. This bit is now paired with (1<<30), and represented as the pair
-(3<<30), for the following combinations of valid bit settings, and their
+(3<<29), for the following combinations of valid bit settings, and their
 meanings:
 
 .. code-block:: c
diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 39f7cded36ed..6d092219877f 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -861,7 +861,8 @@ the configuration (without a prefix: ``Auto``).
 
 **AlignConsecutiveShortCaseStatements** (``ShortCaseStatementsAlignmentStyle``) :versionbadge:`clang-format 17` :ref:`¶ <AlignConsecutiveShortCaseStatements>`
   Style of aligning consecutive short case labels.
-  Only applies if ``AllowShortCaseLabelsOnASingleLine`` is ``true``.
+  Only applies if ``AllowShortCaseExpressionOnASingleLine`` or
+  ``AllowShortCaseLabelsOnASingleLine`` is ``true``.
 
 
   .. code-block:: yaml
@@ -935,8 +936,26 @@ the configuration (without a prefix: ``Auto``).
       default: return "";
       }
 
-  * ``bool AlignCaseColons`` Whether aligned case labels are aligned on the colon, or on the
-    , or on the tokens after the colon.
+  * ``bool AlignCaseArrows`` Whether to align the case arrows when aligning short case expressions.
+
+    .. code-block:: java
+
+      true:
+      i = switch (day) {
+        case THURSDAY, SATURDAY -> 8;
+        case WEDNESDAY          -> 9;
+        default                 -> 0;
+      };
+
+      false:
+      i = switch (day) {
+        case THURSDAY, SATURDAY -> 8;
+        case WEDNESDAY ->          9;
+        default ->                 0;
+      };
+
+  * ``bool AlignCaseColons`` Whether aligned case labels are aligned on the colon, or on the tokens
+    after the colon.
 
     .. code-block:: c++
 
@@ -1692,6 +1711,21 @@ the configuration (without a prefix: ``Auto``).
 
 
 
+.. _AllowShortCaseExpressionOnASingleLine:
+
+**AllowShortCaseExpressionOnASingleLine** (``Boolean``) :versionbadge:`clang-format 19` :ref:`¶ <AllowShortCaseExpressionOnASingleLine>`
+  Whether to merge a short switch labeled rule into a single line.
+
+  .. code-block:: java
+
+    true:                               false:
+    switch (a) {           vs.          switch (a) {
+    case 1 -> 1;                        case 1 ->
+    default -> 0;                         1;
+    };                                  default ->
+                                          0;
+                                        };
+
 .. _AllowShortCaseLabelsOnASingleLine:
 
 **AllowShortCaseLabelsOnASingleLine** (``Boolean``) :versionbadge:`clang-format 3.6` :ref:`¶ <AllowShortCaseLabelsOnASingleLine>`
diff --git a/clang/docs/ClangOffloadBundler.rst b/clang/docs/ClangOffloadBundler.rst
index 515e6c00a3b8..3c241027d405 100644
--- a/clang/docs/ClangOffloadBundler.rst
+++ b/clang/docs/ClangOffloadBundler.rst
@@ -245,7 +245,7 @@ Where:
                     object as a data section with the name ``.hip_fatbin``.
 
       hipv4         Offload code object for the HIP language. Used for AMD GPU
-                    code objects with at least ABI version V4 when the
+                    code objects with at least ABI version V4 and above when the
                     ``clang-offload-bundler`` is used to create a *fat binary*
                     to be loaded by the HIP runtime. The fat binary can be
                     loaded directly from a file, or be embedded in the host code
@@ -254,6 +254,14 @@ Where:
       openmp        Offload code object for the OpenMP language extension.
       ============= ==============================================================
 
+Note: The distinction between the `hip` and `hipv4` offload kinds is historically based.
+Originally, these designations might have indicated different versions of the
+code object ABI. However, as the system has evolved, the ABI version is now embedded
+directly within the code object itself, making these historical distinctions irrelevant
+during the unbundling process. Consequently, `hip` and `hipv4` are treated as compatible
+in current implementations, facilitating interchangeable handling of code objects
+without differentiation based on offload kind.
+
 **target-triple**
     The target triple of the code object. See `Target Triple
     <https://clang.llvm.org/docs/CrossCompilation.html#target-triple>`_.
@@ -295,7 +303,7 @@ Compatibility Rules for Bundle Entry ID
   A code object, specified using its Bundle Entry ID, can be loaded and
   executed on a target processor, if:
 
-  * Their offload kinds are the same.
+  * Their offload kinds are the same or comptible.
   * Their target triples are compatible.
   * Their Target IDs are compatible as defined in :ref:`compatibility-target-id`.
 
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index c2e90f4e7d58..a09c409f8f91 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -656,6 +656,7 @@ Unless specified otherwise operation(±0) = ±0 and operation(±infinity) = ±in
  T __builtin_elementwise_ceil(T x)           return the smallest integral value greater than or equal to x    floating point types
  T __builtin_elementwise_sin(T x)            return the sine of x interpreted as an angle in radians          floating point types
  T __builtin_elementwise_cos(T x)            return the cosine of x interpreted as an angle in radians        floating point types
+ T __builtin_elementwise_tan(T x)            return the tangent of x interpreted as an angle in radians       floating point types
  T __builtin_elementwise_floor(T x)          return the largest integral value less than or equal to x        floating point types
  T __builtin_elementwise_log(T x)            return the natural logarithm of x                                floating point types
  T __builtin_elementwise_log2(T x)           return the base 2 logarithm of x                                 floating point types
@@ -1661,8 +1662,11 @@ The following type trait primitives are supported by Clang. Those traits marked
   ``T`` from ``U`` is ill-formed.
   Deprecated, use ``__reference_constructs_from_temporary``.
 * ``__reference_constructs_from_temporary(T, U)`` (C++)
-  Returns true if a reference ``T`` can be constructed from a temporary of type
+  Returns true if a reference ``T`` can be direct-initialized from a temporary of type
   a non-cv-qualified ``U``.
+* ``__reference_converts_from_temporary(T, U)`` (C++)
+    Returns true if a reference ``T`` can be copy-initialized from a temporary of type
+    a non-cv-qualified ``U``.
 * ``__underlying_type`` (C++, GNU, Microsoft)
 
 In addition, the following expression traits are supported:
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 0e3f7cf89ca8..4702b8c10cdb 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -48,6 +48,13 @@ C++ Specific Potentially Breaking Changes
 - Clang now diagnoses function/variable templates that shadow their own template parameters, e.g. ``template<class T> void T();``.
   This error can be disabled via `-Wno-strict-primary-template-shadow` for compatibility with previous versions of clang.
 
+- The behavior controlled by the `-frelaxed-template-template-args` flag is now
+  on by default, and the flag is deprecated. Until the flag is finally removed,
+  it's negative spelling can be used to obtain compatibility with previous
+  versions of clang.
+
+- Clang now rejects pointer to member from parenthesized expression in unevaluated context such as ``decltype(&(foo::bar))``. (#GH40906).
+
 ABI Changes in This Version
 ---------------------------
 - Fixed Microsoft name mangling of implicitly defined variables used for thread
@@ -69,6 +76,9 @@ ABI Changes in This Version
   returning a class in a register. This affects some uses of std::pair.
   (#GH86384).
 
+- Fixed Microsoft calling convention when returning classes that have a deleted
+  copy assignment operator. Such a class should be returned indirectly.
+
 AST Dumping Potentially Breaking Changes
 ----------------------------------------
 
@@ -85,6 +95,25 @@ Clang Frontend Potentially Breaking Changes
   of ``-Wno-gnu-binary-literal`` will no longer silence this pedantic warning,
   which may break existing uses with ``-Werror``.
 
+- The normalization of 3 element target triples where ``-none-`` is the middle
+  element has changed. For example, ``armv7m-none-eabi`` previously normalized
+  to ``armv7m-none-unknown-eabi``, with ``none`` for the vendor and ``unknown``
+  for the operating system. It now normalizes to ``armv7m-unknown-none-eabi``,
+  which has ``unknown`` vendor and ``none`` operating system.
+
+  The affected triples are primarily for bare metal Arm where it is intended
+  that ``none`` means that there is no operating system. As opposed to an unknown
+  type of operating system.
+
+  This change my cause clang to not find libraries, or libraries to be built at
+  different file system locations. This can be fixed by changing your builds to
+  use the new normalized triple. However, we recommend instead getting the
+  normalized triple from clang itself, as this will make your builds more
+  robust in case of future changes::
+
+    $ clang --target=<your target triple> -print-target-triple
+    <the normalized target triple>
+
 What's New in Clang |release|?
 ==============================
 Some of the major new features and improvements to Clang are listed
@@ -94,6 +123,17 @@ sections with improvements to Clang's support for those languages.
 
 C++ Language Changes
 --------------------
+- C++17 support is now completed, with the enablement of the
+  relaxed temlate template argument matching rules introduced in P0522,
+  which was retroactively applied as a defect report.
+  While the implementation already existed since Clang 4, it was turned off by
+  default, and was controlled with the `-frelaxed-template-template-args` flag.
+  In this release, we implement provisional wording for a core defect on
+  P0522 (CWG2398), which avoids the most serious compatibility issues caused
+  by it, allowing us to enable it by default in this release.
+  The flag is now deprecated, and will be removed in the next release, but can
+  still be used to turn it off and regain compatibility with previous versions
+  (#GH36505).
 - Implemented ``_BitInt`` literal suffixes ``__wb`` or ``__WB`` as a Clang extension with ``unsigned`` modifiers also allowed. (#GH85223).
 
 C++17 Feature Support
@@ -142,6 +182,9 @@ C++23 Feature Support
 
 - Implemented `P2448R2: Relaxing some constexpr restrictions <https://wg21.link/P2448R2>`_.
 
+- Added a ``__reference_converts_from_temporary`` builtin, completing the necessary compiler support for
+  `P2255R2: Type trait to determine if a reference binds to a temporary <https://wg21.link/P2255R2>`_.
+
 C++2c Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
 
@@ -153,6 +196,9 @@ C++2c Feature Support
 
 - Implemented `P2748R5 Disallow Binding a Returned Glvalue to a Temporary <https://wg21.link/P2748R5>`_.
 
+- Implemented `P2809R3: Trivial infinite loops are not Undefined Behavior <https://wg21.link/P2809R3>`_.
+
+
 Resolutions to C++ Defect Reports
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 - Substitute template parameter pack, when it is not explicitly specified
@@ -173,6 +219,9 @@ Resolutions to C++ Defect Reports
 - Clang now diagnoses declarative nested-name-specifiers with pack-index-specifiers.
   (`CWG2858: Declarative nested-name-specifiers and pack-index-specifiers <https://cplusplus.github.io/CWG/issues/2858.html>`_).
 
+- P0522 implementation is enabled by default in all language versions, and
+  provisional wording for CWG2398 is implemented.
+
 C Language Changes
 ------------------
 
@@ -260,6 +309,11 @@ New Compiler Flags
   allow late parsing certain attributes in specific contexts where they would
   not normally be late parsed.
 
+- ``-fseparate-named-sections`` uses separate unique sections for global
+  symbols in named special sections (i.e. symbols annotated with
+  ``__attribute__((section(...)))``. This enables linker GC to collect unused
+  symbols without having to use a per-symbol section.
+
 Deprecated Compiler Flags
 -------------------------
 
@@ -294,6 +348,10 @@ Modified Compiler Flags
 - Carved out ``-Wformat`` warning about scoped enums into a subwarning and
   make it controlled by ``-Wformat-pedantic``. Fixes #GH88595.
 
+- Trivial infinite loops (i.e loops with a constant controlling expresion
+  evaluating to ``true`` and an empty body such as ``while(1);``)
+  are considered infinite, even when the ``-ffinite-loop`` flag is set.
+
 Removed Compiler Flags
 -------------------------
 
@@ -502,6 +560,9 @@ Bug Fixes in This Version
   The values of 0 and 1 block any unrolling of the loop. This keeps the same behavior with GCC.
   Fixes (`#88624 <https://github.com/llvm/llvm-project/issues/88624>`_).
 
+- Clang will no longer emit a duplicate -Wunused-value warning for an expression
+  `(A, B)` which evaluates to glvalue `B` that can be converted to non ODR-use. (#GH45783)
+
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -630,6 +691,23 @@ Bug Fixes to C++ Support
 - Fix a bug on template partial specialization with issue on deduction of nontype template parameter
   whose type is `decltype(auto)`. Fixes (#GH68885).
 - Clang now correctly treats the noexcept-specifier of a friend function to be a complete-class context.
+- Fix an assertion failure when parsing an invalid members of an anonymous class. (#GH85447)
+- Fixed a misuse of ``UnresolvedLookupExpr`` for ill-formed templated expressions. Fixes (#GH48673), (#GH63243)
+  and (#GH88832).
+- Clang now defers all substitution into the exception specification of a function template specialization
+  until the noexcept-specifier is instantiated.
+- Fix a crash when an implicitly declared ``operator==`` function with a trailing requires-clause has its
+  constraints compared to that of another declaration.
+- Fix a bug where explicit specializations of member functions/function templates would have substitution
+  performed incorrectly when checking constraints. Fixes (#GH90349).
+- Clang now allows constrained member functions to be explicitly specialized for an implicit instantiation
+  of a class template.
+- Fix a C++23 bug in implementation of P2564R3 which evaluates immediate invocations in place
+  within initializers for variables that are usable in constant expressions or are constant
+  initialized, rather than evaluating them as a part of the larger manifestly constant evaluated
+  expression.
+- Fix a bug in access control checking due to dealyed checking of friend declaration. Fixes (#GH12361).
+- Correctly treat the compound statement of an ``if consteval`` as an immediate context. Fixes (#GH91509).
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -729,6 +807,7 @@ CUDA/HIP Language Changes
 
 CUDA Support
 ^^^^^^^^^^^^
+- Clang now supports CUDA SDK up to 12.4
 
 AIX Support
 ^^^^^^^^^^^
@@ -781,6 +860,9 @@ clang-format
   ``BreakTemplateDeclarations``.
 - ``AlwaysBreakAfterReturnType`` is deprecated and renamed to
   ``BreakAfterReturnType``.
+- Handles Java ``switch`` expressions.
+- Adds ``AllowShortCaseExpressionOnASingleLine`` option.
+- Adds ``AlignCaseArrows`` suboption to ``AlignConsecutiveShortCaseStatements``.
 
 libclang
 --------
diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst
index ee57fb5da648..1c3c4d319c0e 100644
--- a/clang/docs/StandardCPlusPlusModules.rst
+++ b/clang/docs/StandardCPlusPlusModules.rst
@@ -8,109 +8,92 @@ Standard C++ Modules
 Introduction
 ============
 
-The term ``modules`` has a lot of meanings. For the users of Clang, modules may
-refer to ``Objective-C Modules``, ``Clang C++ Modules`` (or ``Clang Header Modules``,
-etc.) or ``Standard C++ Modules``. The implementation of all these kinds of modules in Clang
-has a lot of shared code, but from the perspective of users, their semantics and
-command line interfaces are very different. This document focuses on
-an introduction of how to use standard C++ modules in Clang.
-
-There is already a detailed document about `Clang modules <Modules.html>`_, it
-should be helpful to read `Clang modules <Modules.html>`_ if you want to know
-more about the general idea of modules. Since standard C++ modules have different semantics
-(and work flows) from `Clang modules`, this page describes the background and use of
-Clang with standard C++ modules.
-
-Modules exist in two forms in the C++ Language Specification. They can refer to
-either "Named Modules" or to "Header Units". This document covers both forms.
+The term ``module`` is ambiguous, as it is used to mean multiple things in
+Clang. For Clang users, a module may refer to an ``Objective-C Module``,
+`Clang Module <Modules.html>`_ (also called a ``Clang Header Module``) or a
+``C++20 Module`` (or a ``Standard C++ Module``). The implementation of all
+these kinds of modules in Clang shares a lot of code, but from the perspective
+of users their semantics and command line interfaces are very different. This
+document is an introduction to the use of C++20 modules in Clang. In the
+remainder of this document, the term ``module`` will refer to Standard C++20
+modules and the term ``Clang module`` will refer to the Clang Modules
+extension.
+
+In terms of the C++ Standard, modules consist of two components: "Named
+Modules" or "Header Units". This document covers both.
 
 Standard C++ Named modules
 ==========================
 
-This document was intended to be a manual first and foremost, however, we consider it helpful to
-introduce some language background here for readers who are not familiar with
-the new language feature. This document is not intended to be a language
-tutorial; it will only introduce necessary concepts about the
-structure and building of the project.
+In order to better understand the compiler's behavior, it is helpful to
+understand some terms and definitions for readers who are not familiar with the
+C++ feature. This document is not a tutorial on C++; it only introduces
+necessary concepts to better understand use of modules in a project.
 
 Background and terminology
 --------------------------
 
-Modules
-~~~~~~~
-
-In this document, the term ``Modules``/``modules`` refers to standard C++ modules
-feature if it is not decorated by ``Clang``.
-
-Clang Modules
-~~~~~~~~~~~~~
-
-In this document, the term ``Clang Modules``/``Clang modules`` refer to Clang
-c++ modules extension. These are also known as ``Clang header modules``,
-``Clang module map modules`` or ``Clang c++ modules``.
-
 Module and module unit
 ~~~~~~~~~~~~~~~~~~~~~~
 
-A module consists of one or more module units. A module unit is a special
-translation unit. Every module unit must have a module declaration. The syntax
-of the module declaration is:
+A module consists of one or more module units. A module unit is a special kind
+of translation unit. A module unit should almost always start with a module
+declaration. The syntax of the module declaration is:
 
 .. code-block:: c++
 
   [export] module module_name[:partition_name];
 
-Terms enclosed in ``[]`` are optional. The syntax of ``module_name`` and ``partition_name``
-in regex form corresponds to ``[a-zA-Z_][a-zA-Z_0-9\.]*``. In particular, a literal dot ``.``
-in the name has no semantic meaning (e.g. implying a hierarchy).
+Terms enclosed in ``[]`` are optional. ``module_name`` and ``partition_name``
+follow the rules for a C++ identifier, except that they may contain one or more
+period (``.``) characters. Note that a ``.`` in the name has no semantic
+meaning and does not imply any hierarchy.
 
-In this document, module units are classified into:
+In this document, module units are classified as:
 
-* Primary module interface unit.
-
-* Module implementation unit.
-
-* Module interface partition unit.
-
-* Internal module partition unit.
+* Primary module interface unit
+* Module implementation unit
+* Module partition interface unit
+* Internal module partition unit
 
 A primary module interface unit is a module unit whose module declaration is
-``export module module_name;``. The ``module_name`` here denotes the name of the
+``export module module_name;`` where ``module_name`` denotes the name of the
 module. A module should have one and only one primary module interface unit.
 
 A module implementation unit is a module unit whose module declaration is
-``module module_name;``. A module could have multiple module implementation
-units with the same declaration.
+``module module_name;``. Multiple module implementation units can be declared
+in the same module.
 
-A module interface partition unit is a module unit whose module declaration is
+A module partition interface unit is a module unit whose module declaration is
 ``export module module_name:partition_name;``. The ``partition_name`` should be
 unique within any given module.
 
-An internal module partition unit is a module unit whose module declaration
-is ``module module_name:partition_name;``. The ``partition_name`` should be
-unique within any given module.
+An internal module partition unit is a module unit whose module
+declaration is ``module module_name:partition_name;``. The ``partition_name``
+should be unique within any given module.
 
-In this document, we use the following umbrella terms:
+In this document, we use the following terms:
 
 * A ``module interface unit`` refers to either a ``primary module interface unit``
-  or a ``module interface partition unit``.
+  or a ``module partition interface unit``.
 
-* An ``importable module unit`` refers to either a ``module interface unit``
-  or a ``internal module partition unit``.
+* An ``importable module unit`` refers to either a ``module interface unit`` or
+  an ``internal module partition unit``.
 
-* A ``module partition unit`` refers to either a ``module interface partition unit``
-  or a ``internal module partition unit``.
+* A ``module partition unit`` refers to either a ``module partition interface unit``
+  or an ``internal module partition unit``.
 
-Built Module Interface file
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Built Module Interface
+~~~~~~~~~~~~~~~~~~~~~~
 
-A ``Built Module Interface file`` stands for the precompiled result of an importable module unit.
-It is also called the acronym ``BMI`` generally.
+A ``Built Module Interface`` (or ``BMI``) is the precompiled result of an
+importable module unit.
 
 Global module fragment
 ~~~~~~~~~~~~~~~~~~~~~~
 
-In a module unit, the section from ``module;`` to the module declaration is called the global module fragment.
+The ``global module fragment`` (or ``GMF``) is the code between the ``module;``
+and the module declaration within a module unit.
 
 
 How to build projects using modules
@@ -138,7 +121,7 @@ Let's see a "hello world" example that uses modules.
     return 0;
   }
 
-Then we type:
+Then, on the command line, invoke Clang like:
 
 .. code-block:: console
 
@@ -148,9 +131,9 @@ Then we type:
   Hello World!
 
 In this example, we make and use a simple module ``Hello`` which contains only a
-primary module interface unit ``Hello.cppm``.
+primary module interface unit named ``Hello.cppm``.
 
-Then let's see a little bit more complex "hello world" example which uses the 4 kinds of module units.
+A more complex "hello world" example which uses the 4 kinds of module units is:
 
 .. code-block:: c++
 
@@ -192,7 +175,7 @@ Then let's see a little bit more complex "hello world" example which uses the 4
     return 0;
   }
 
-Then we are able to compile the example by the following command:
+Then, back on the command line, invoke Clang with:
 
 .. code-block:: console
 
@@ -216,51 +199,57 @@ We explain the options in the following sections.
 How to enable standard C++ modules
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Currently, standard C++ modules are enabled automatically
-if the language standard is ``-std=c++20`` or newer.
+Standard C++ modules are enabled automatically when the language standard mode
+is ``-std=c++20`` or newer.
 
 How to produce a BMI
 ~~~~~~~~~~~~~~~~~~~~
 
-We can generate a BMI for an importable module unit by either ``--precompile``
-or ``-fmodule-output`` flags.
+To generate a BMI for an importable module unit, use either the ``--precompile``
+or ``-fmodule-output`` command line options.
 
-The ``--precompile`` option generates the BMI as the output of the compilation and the output path
-can be specified using the ``-o`` option.
+The ``--precompile`` option generates the BMI as the output of the compilation
+with the output path specified using the ``-o`` option.
 
-The ``-fmodule-output`` option generates the BMI as a by-product of the compilation.
-If ``-fmodule-output=`` is specified, the BMI will be emitted the specified location. Then if
-``-fmodule-output`` and ``-c`` are specified, the BMI will be emitted in the directory of the
-output file with the name of the input file with the new extension ``.pcm``. Otherwise, the BMI
-will be emitted in the working directory with the name of the input file with the new extension
+The ``-fmodule-output`` option generates the BMI as a by-product of the
+compilation. If ``-fmodule-output=`` is specified, the BMI will be emitted to
+the specified location. If ``-fmodule-output`` and ``-c`` are specified, the
+BMI will be emitted in the directory of the output file with the name of the
+input file with the extension ``.pcm``. Otherwise, the BMI will be emitted in
+the working directory with the name of the input file with the extension
 ``.pcm``.
 
-The style to generate BMIs by ``--precompile`` is called two-phase compilation since it takes
-2 steps to compile a source file to an object file. The style to generate BMIs by ``-fmodule-output``
-is called one-phase compilation respectively. The one-phase compilation model is simpler
-for build systems to implement and the two-phase compilation has the potential to compile faster due
-to higher parallelism. As an example, if there are two module units A and B, and B depends on A, the
-one-phase compilation model would need to compile them serially, whereas the two-phase compilation
-model may be able to compile them simultaneously if the compilation from A.pcm to A.o takes a long
-time.
-
-File name requirement
-~~~~~~~~~~~~~~~~~~~~~
-
-The file name of an ``importable module unit`` should end with ``.cppm``
-(or ``.ccm``, ``.cxxm``, ``.c++m``). The file name of a ``module implementation unit``
-should end with ``.cpp`` (or ``.cc``, ``.cxx``, ``.c++``).
-
-The file name of BMIs should end with ``.pcm``.
-The file name of the BMI of a ``primary module interface unit`` should be ``module_name.pcm``.
-The file name of BMIs of ``module partition unit`` should be ``module_name-partition_name.pcm``.
-
-If the file names use different extensions, Clang may fail to build the module.
-For example, if the filename of an ``importable module unit`` ends with ``.cpp`` instead of ``.cppm``,
-then we can't generate a BMI for the ``importable module unit`` by ``--precompile`` option
-since ``--precompile`` option now would only run preprocessor, which is equal to `-E` now.
-If we want the filename of an ``importable module unit`` ends with other suffixes instead of ``.cppm``,
-we could put ``-x c++-module`` in front of the file. For example,
+Generating BMIs with ``--precompile`` is referred to as two-phase compilation
+because it takes two steps to compile a source file to an object file.
+Generating BMIs with ``-fmodule-output`` is called one-phase compilation. The
+one-phase compilation model is simpler for build systems to implement while the
+two-phase compilation has the potential to compile faster due to higher
+parallelism. As an example, if there are two module units ``A`` and ``B``, and
+``B`` depends on ``A``, the one-phase compilation model needs to compile them
+serially, whereas the two-phase compilation model is able to be compiled as
+soon as ``A.pcm`` is available, and thus can be compiled simultaneously as the
+``A.pcm`` to ``A.o`` compilation step.
+
+File name requirements
+~~~~~~~~~~~~~~~~~~~~~~
+
+By convention, ``importable module unit`` files should use ``.cppm`` (or
+``.ccm``, ``.cxxm``, or ``.c++m``) as a file extension.
+``Module implementation unit`` files should use ``.cpp`` (or ``.cc``, ``.cxx``,
+or ``.c++``) as a file extension.
+
+A BMI should use ``.pcm`` as a file extension. The file name of the BMI for a
+``primary module interface unit`` should be ``module_name.pcm``. The file name
+of a BMI for a ``module partition unit`` should be
+``module_name-partition_name.pcm``.
+
+Clang may fail to build the module if different extensions are used. For
+example, if the filename of an ``importable module unit`` ends with ``.cpp``
+instead of ``.cppm``, then Clang cannot generate a BMI for the
+``importable module unit`` with the ``--precompile`` option because the
+``--precompile`` option would only run the preprocessor (``-E``). If using a
+different extension than the conventional one for an ``importable module unit``
+you can specify ``-x c++-module`` before the file. For example,
 
 .. code-block:: c++
 
@@ -279,8 +268,9 @@ we could put ``-x c++-module`` in front of the file. For example,
     return 0;
   }
 
-Now the filename of the ``module interface`` ends with ``.cpp`` instead of ``.cppm``,
-we can't compile them by the original command lines. But we are still able to do it by:
+In this example, the extension used by the ``module interface`` is ``.cpp``
+instead of ``.cppm``, so it cannot be compiled like the previous example, but
+it can be compiled with:
 
 .. code-block:: console
 
@@ -289,12 +279,12 @@ we can't compile them by the original command lines. But we are still able to do
   $ ./Hello.out
   Hello World!
 
-Module name requirement
-~~~~~~~~~~~~~~~~~~~~~~~
+Module name requirements
+~~~~~~~~~~~~~~~~~~~~~~~~
 
-[module.unit]p1 says:
+..
 
-.. code-block:: text
+  [module.unit]p1:
 
   All module-names either beginning with an identifier consisting of std followed by zero
   or more digits or containing a reserved identifier ([lex.name]) are reserved and shall not
@@ -302,7 +292,7 @@ Module name requirement
   module-name is a reserved identifier, the module name is reserved for use by C++ implementations;
   otherwise it is reserved for future standardization.
 
-So all of the following name is not valid by default:
+Therefore, none of the following names are valid by default:
 
 .. code-block:: text
 
@@ -312,75 +302,74 @@ So all of the following name is not valid by default:
     __test
     // and so on ...
 
-If you still want to use the reserved module names for any reason, use
-``-Wno-reserved-module-identifier`` to suppress the warning.
+Using a reserved module name is strongly discouraged, but
+``-Wno-reserved-module-identifier`` can be used to suppress the warning.
 
-How to specify the dependent BMIs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Specifying dependent BMIs
+~~~~~~~~~~~~~~~~~~~~~~~~~
 
-There are 3 methods to specify the dependent BMIs:
+There are 3 ways to specify a dependent BMI:
 
-* (1) ``-fprebuilt-module-path=<path/to/directory>``.
-* (2) ``-fmodule-file=<path/to/BMI>`` (Deprecated).
-* (3) ``-fmodule-file=<module-name>=<path/to/BMI>``.
+1. ``-fprebuilt-module-path=<path/to/directory>``.
+2. ``-fmodule-file=<path/to/BMI>`` (Deprecated).
+3. ``-fmodule-file=<module-name>=<path/to/BMI>``.
 
-The option ``-fprebuilt-module-path`` tells the compiler the path where to search for dependent BMIs.
-It may be used multiple times just like ``-I`` for specifying paths for header files. The look up rule here is:
+The ``-fprebuilt-module-path`` option specifies the path to search for
+dependent BMIs. Multiple paths may be specified, similar to using ``-I`` to
+specify a search path for header files. When importing a module ``M``, the
+compiler looks for ``M.pcm`` in the directories specified by
+``-fprebuilt-module-path``. Similarly, when importing a partition module unit
+``M:P``, the compiler looks for ``M-P.pcm`` in the directories specified by
+``-fprebuilt-module-path``.
 
-* (1) When we import module M. The compiler would look up M.pcm in the directories specified
-  by ``-fprebuilt-module-path``.
-* (2) When we import partition module unit M:P. The compiler would look up M-P.pcm in the
-  directories specified by ``-fprebuilt-module-path``.
-
-The option ``-fmodule-file=<path/to/BMI>`` tells the compiler to load the specified BMI directly.
-The option ``-fmodule-file=<module-name>=<path/to/BMI>`` tells the compiler to load the specified BMI
-for the module specified by ``<module-name>`` when necessary. The main difference is that
+The ``-fmodule-file=<path/to/BMI>`` option causes the compiler to load the
+specified BMI directly. The ``-fmodule-file=<module-name>=<path/to/BMI>``
+option causes the compiler to load the specified BMI for the module specified
+by ``<module-name>`` when necessary. The main difference is that
 ``-fmodule-file=<path/to/BMI>`` will load the BMI eagerly, whereas
-``-fmodule-file=<module-name>=<path/to/BMI>`` will only load the BMI lazily, which is similar
-with ``-fprebuilt-module-path``. The option ``-fmodule-file=<path/to/BMI>`` for named modules is deprecated
-and is planning to be removed in future versions.
+``-fmodule-file=<module-name>=<path/to/BMI>`` will only load the BMI lazily,
+as will ``-fprebuilt-module-path``. The ``-fmodule-file=<path/to/BMI>`` option
+for named modules is deprecated and will be removed in a future version of
+Clang.
 
-In case all ``-fprebuilt-module-path=<path/to/directory>``, ``-fmodule-file=<path/to/BMI>`` and
-``-fmodule-file=<module-name>=<path/to/BMI>`` exist, the ``-fmodule-file=<path/to/BMI>`` option
-takes highest precedence and ``-fmodule-file=<module-name>=<path/to/BMI>`` will take the second
-highest precedence.
+When these options are specified in the same invocation of the compiler, the
+``-fmodule-file=<path/to/BMI>`` option takes precedence over
+``-fmodule-file=<module-name>=<path/to/BMI>``, which takes precedence over
+``-fprebuilt-module-path=<path/to/directory>``.
 
-We need to specify all the dependent (directly and indirectly) BMIs.
-See https://github.com/llvm/llvm-project/issues/62707 for detail.
+Note: all dependant BMIs must be specified explicitly, either directly or
+indirectly dependent BMIs explicitly. See
+https://github.com/llvm/llvm-project/issues/62707 for details.
 
-When we compile a ``module implementation unit``, we must specify the BMI of the corresponding
-``primary module interface unit``.
-Since the language specification says a module implementation unit implicitly imports
-the primary module interface unit.
+When compiling a ``module implementation unit``, the BMI of the corresponding
+``primary module interface unit`` must be specified because a module
+implementation unit implicitly imports the primary module interface unit.
 
   [module.unit]p8
 
   A module-declaration that contains neither an export-keyword nor a module-partition implicitly
   imports the primary module interface unit of the module as if by a module-import-declaration.
 
-All of the 3 options ``-fprebuilt-module-path=<path/to/directory>``, ``-fmodule-file=<path/to/BMI>``
-and ``-fmodule-file=<module-name>=<path/to/BMI>`` may occur multiple times.
-For example, the command line to compile ``M.cppm`` in
-the above example could be rewritten into:
+The ``-fprebuilt-module-path=<path/to/directory>``, ``-fmodule-file=<path/to/BMI>``,
+and ``-fmodule-file=<module-name>=<path/to/BMI>`` options may be specified
+multiple times. For example, the command line to compile ``M.cppm`` in
+the previous example could be rewritten as:
 
 .. code-block:: console
 
   $ clang++ -std=c++20 M.cppm --precompile -fmodule-file=M:interface_part=M-interface_part.pcm -fmodule-file=M:impl_part=M-impl_part.pcm -o M.pcm
 
 When there are multiple ``-fmodule-file=<module-name>=`` options for the same
-``<module-name>``, the last ``-fmodule-file=<module-name>=`` will override the previous
-``-fmodule-file=<module-name>=`` options.
-
-``-fprebuilt-module-path`` is more convenient and ``-fmodule-file`` is faster since
-it saves time for file lookup.
+``<module-name>``, the last ``-fmodule-file=<module-name>=`` overrides the
+previous ``-fmodule-file=<module-name>=`` option.
 
 Remember that module units still have an object counterpart to the BMI
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-It is easy to forget to compile BMIs at first since we may envision module interfaces like headers.
-However, this is not true.
-Module units are translation units. We need to compile them to object files
-and link the object files like the example shows.
+While module interfaces resemble traditional header files, they still require
+compilation. Module units are translation units, and need to be compiled to
+object files, which then need to be linked together as the following examples
+show.
 
 For example, the traditional compilation processes for headers are like:
 
@@ -400,24 +389,27 @@ And the compilation process for module units are like:
                 mod1.cppm -> clang++ mod1.cppm ... -> mod1.pcm --,--> clang++ mod1.pcm ... -> mod1.o -+
                 src2.cpp ----------------------------------------+> clang++ src2.cpp -------> src2.o -'
 
-As the diagrams show, we need to compile the BMI from module units to object files and link the object files.
-(But we can't do this for the BMI from header units. See the later section for the definition of header units)
+As the diagrams show, we need to compile the BMI from module units to object
+files and then link the object files. (However, this cannot be done for the BMI
+from header units. See the section on :ref:`header units <header-units>` for
+more details.
 
-If we want to create a module library, we can't just ship the BMIs in an archive.
-We must compile these BMIs(``*.pcm``) into object files(``*.o``) and add those object files to the archive instead.
+BMIs cannot be shipped in an archive to create a module library. Instead, the
+BMIs(``*.pcm``) are compiled into object files(``*.o``) and those object files
+are added to the archive instead.
 
-Consistency Requirement
-~~~~~~~~~~~~~~~~~~~~~~~
+Consistency Requirements
+~~~~~~~~~~~~~~~~~~~~~~~~
 
-If we envision modules as a cache to speed up compilation, then - as with other caching techniques -
-it is important to keep cache consistency.
-So **currently** Clang will do very strict check for consistency.
+Modules can be viewed as a kind of cache to speed up compilation. Thus, like
+other caching techniques, it is important to maintain cache consistency which
+is why Clang does very strict checking for consistency.
 
 Options consistency
 ^^^^^^^^^^^^^^^^^^^
 
-The language option of module units and their non-module-unit users should be consistent.
-The following example is not allowed:
+Compiler options related to the language dialect for a module unit and its
+non-module-unit uses need to be consistent. Consider the following example:
 
 .. code-block:: c++
 
@@ -432,9 +424,8 @@ The following example is not allowed:
   $ clang++ -std=c++20 M.cppm --precompile -o M.pcm
   $ clang++ -std=c++23 Use.cpp -fprebuilt-module-path=.
 
-The compiler would reject the example due to the inconsistent language options.
-Not all options are language options.
-For example, the following example is allowed:
+Clang rejects the example due to the inconsistent language standard modes. Not
+all compiler options are language dialect options, though. For example:
 
 .. code-block:: console
 
@@ -444,9 +435,12 @@ For example, the following example is allowed:
   # Inconsistent debugging level.
   $ clang++ -std=c++20 -g Use.cpp -fprebuilt-module-path=.
 
-Although the two examples have inconsistent optimization and debugging level, both of them are accepted.
+Although the optimization and debugging levels are inconsistent, these
+compilations are accepted because the compiler options do not impact the
+language dialect.
 
-Note that **currently** the compiler doesn't consider inconsistent macro definition a problem. For example:
+Note that the compiler **currently** doesn't reject inconsistent macro
+definitions (this may change in the future). For example:
 
 .. code-block:: console
 
@@ -454,43 +448,43 @@ Note that **currently** the compiler doesn't consider inconsistent macro definit
   # Inconsistent optimization level.
   $ clang++ -std=c++20 -O3 -DNDEBUG Use.cpp -fprebuilt-module-path=.
 
-Currently Clang would accept the above example. But it may produce surprising results if the
-debugging code depends on consistent use of ``NDEBUG`` also in other translation units.
+Currently, Clang accepts the above example, though it may produce surprising
+results if the debugging code depends on consistent use of ``NDEBUG`` in other
+translation units.
 
-Definitions consistency
-^^^^^^^^^^^^^^^^^^^^^^^
+Object definition consistency
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The C++ language requires that declarations of the same entity in different
+translation units have the same definition, which is known as the One
+Definition Rule (ODR). Without modules, the compiler cannot perform strong ODR
+violation checking because it only sees one translation unit at a time. With
+the use of modules, the compiler can perform checks for ODR violations across
+translation units.
 
-The C++ language defines that same declarations in different translation units should have
-the same definition, as known as ODR (One Definition Rule). Prior to modules, the translation
-units don't dependent on each other and the compiler itself can't perform a strong
-ODR violation check. With the introduction of modules, now the compiler have
-the chance to perform ODR violations with language semantics across translation units.
-
-However, in the practice, we found the existing ODR checking mechanism is not stable
-enough. Many people suffers from the false positive ODR violation diagnostics, AKA,
-the compiler are complaining two identical declarations have different definitions
-incorrectly. Also the true positive ODR violations are rarely reported.
-Also we learned that MSVC don't perform ODR check for declarations in the global module
-fragment.
-
-So in order to get better user experience, save the time checking ODR and keep consistent
-behavior with MSVC, we disabled the ODR check for the declarations in the global module
-fragment by default. Users who want more strict check can still use the
-``-Xclang -fno-skip-odr-check-in-gmf`` flag to get the ODR check enabled. It is also
-encouraged to report issues if users find false positive ODR violations or false negative ODR
-violations with the flag enabled.
+However, the current ODR checking mechanisms are not perfect. There are a
+significant number of false positive ODR violation diagnostics, where the
+compiler incorrectly diagnoses two identical declarations as having different
+definitions. Further, true positive ODR violations are not always reported.
+
+To give a better user experience, improve compilation performance, and for
+consistency with MSVC, ODR checking of declarations in the global module
+fragment is disabled by default. These checks can be enabled by specifying
+``-Xclang -fno-skip-odr-check-in-gmf`` when compiling. If the check is enabled
+and you encounter incorrect or missing diagnostics, please report them via the
+`community issue tracker <https://github.com/llvm/llvm-project/issues/>`_.
 
 ABI Impacts
 -----------
 
-This section describes the new ABI changes brought by modules.
-
-Only Itanium C++ ABI related change are mentioned
+This section describes the new ABI changes brought by modules. Only changes to
+the Itanium C++ ABI are covered.
 
-Mangling Names
-~~~~~~~~~~~~~~
+Name Mangling
+~~~~~~~~~~~~~
 
-The declarations in a module unit which are not in the global module fragment have new linkage names.
+The declarations in a module unit which are not in the global module fragment
+have new linkage names.
 
 For example,
 
@@ -501,22 +495,24 @@ For example,
     export int foo();
   }
 
-The linkage name of ``NS::foo()`` would be ``_ZN2NSW1M3fooEv``.
-This couldn't be demangled by previous versions of the debugger or demangler.
-As of LLVM 15.x, users can utilize ``llvm-cxxfilt`` to demangle this:
+The linkage name of ``NS::foo()`` is ``_ZN2NSW1M3fooEv``. This couldn't be
+demangled by previous versions of the debugger or demangler. As of LLVM 15.x,
+``llvm-cxxfilt`` can be used to demangle this:
 
 .. code-block:: console
 
   $ llvm-cxxfilt _ZN2NSW1M3fooEv
+    NS::foo@M()
 
-The result would be ``NS::foo@M()``, which reads as ``NS::foo()`` in module ``M``.
+The result should be read as ``NS::foo()`` in module ``M``.
 
-The ABI implies that we can't declare something in a module unit and define it in a non-module unit (or vice-versa),
-as this would result in linking errors.
+The ABI implies that something cannot be declared in a module unit and defined
+in a non-module unit (or vice-versa), as this would result in linking errors.
 
-If we still want to implement declarations within the compatible ABI in module unit,
-we can use the language-linkage specifier. Since the declarations in the language-linkage specifier
-is attached to the global module fragments. For example:
+Despite this, it is possible to implement declarations with a compatible ABI in
+a module unit by using a language linkage specifier because the declarations in
+the language linkage specifier are attached to the global module fragment. For
+example:
 
 .. code-block:: c++
 
@@ -530,43 +526,47 @@ Now the linkage name of ``NS::foo()`` will be ``_ZN2NS3fooEv``.
 Module Initializers
 ~~~~~~~~~~~~~~~~~~~
 
-All the importable module units are required to emit an initializer function.
-The initializer function should contain calls to importing modules first and
-all the dynamic-initializers in the current module unit then.
+All importable module units are required to emit an initializer function to
+handle the dynamic initialization of non-inline variables in the module unit.
+The importable module unit has to emit the initializer even if there is no
+dynamic initialization; otherwise, the importer may call a nonexistent
+function. The initializer function emits calls to imported modules first
+followed by calls to all to of the dynamic initializers in the current module
+unit.
 
-Translation units explicitly or implicitly importing named modules must call
-the initializer functions of the imported named modules within the sequence of
-the dynamic-initializers in the TU. Initializations of entities at namespace
-scope are appearance-ordered. This (recursively) extends into imported modules
-at the point of appearance of the import declaration.
+Translation units that explicitly or implicitly import a named module must call
+the initializer functions of the imported named module within the sequence of
+the dynamic initializers in the translation unit. Initializations of entities
+at namespace scope are appearance-ordered. This (recursively) extends to
+imported modules at the point of appearance of the import declaration.
 
-It is allowed to omit calls to importing modules if it is known empty.
-
-It is allowed to omit calls to importing modules for which is known to be called.
+If the imported module is known to be empty, the call to its initializer may be
+omitted. Additionally, if the imported module is known to have already been
+imported, the call to its initializer may be omitted.
 
 Reduced BMI
 -----------
 
-To support the 2 phase compilation model, Clang chose to put everything needed to
-produce an object into the BMI. But every consumer of the BMI, except itself, doesn't
-need such informations. It makes the BMI to larger and so may introduce unnecessary
-dependencies into the BMI. To mitigate the problem, we decided to reduce the information
-contained in the BMI.
-
-To be clear, we call the default BMI as Full BMI and the new introduced BMI as Reduced
-BMI.
+To support the two-phase compilation model, Clang puts everything needed to
+produce an object into the BMI. However, other consumers of the BMI generally
+don't need that information. This makes the BMI larger and may introduce
+unnecessary dependencies for the BMI. To mitigate the problem, Clang has a
+compiler option to reduce the information contained in the BMI. These two
+formats are known as Full BMI and Reduced BMI, respectively.
 
-Users can use ``-fexperimental-modules-reduced-bmi`` flag to enable the Reduced BMI.
+Users can use the ``-fexperimental-modules-reduced-bmi`` option to produce a
+Reduced BMI.
 
-For one phase compilation model (CMake implements this model), with
-``-fexperimental-modules-reduced-bmi``, the generated BMI will be Reduced BMI automatically.
-(The output path of the BMI is specified by ``-fmodule-output=`` as usual one phase
-compilation model).
+For the one-phase compilation model (CMake implements this model), with
+``-fexperimental-modules-reduced-bmi``, the generated BMI will be a Reduced
+BMI automatically. (The output path of the BMI is specified by
+``-fmodule-output=`` as usual with the one-phase compilation model).
 
-It is still possible to support Reduced BMI in two phase compilation model. With
-``-fexperimental-modules-reduced-bmi``, ``--precompile`` and ``-fmodule-output=`` specified,
-the generated BMI specified by ``-o`` will be full BMI and the BMI specified by
-``-fmodule-output=`` will be Reduced BMI. The dependency graph may be:
+It is also possible to produce a Reduced BMI with the two-phase compilation
+model. When ``-fexperimental-modules-reduced-bmi``, ``--precompile``, and
+``-fmodule-output=`` are specified, the generated BMI specified by ``-o`` will
+be a full BMI and the BMI specified by ``-fmodule-output=`` will be a Reduced
+BMI. The dependency graph in this case would look like:
 
 .. code-block:: none
 
@@ -577,15 +577,16 @@ the generated BMI specified by ``-o`` will be full BMI and the BMI specified by
                                                -> ...
                                                -> consumer_n.cpp
 
-We don't emit diagnostics if ``-fexperimental-modules-reduced-bmi`` is used with a non-module
-unit. This design helps the end users of one phase compilation model to perform experiments
-early without asking for the help of build systems. The users of build systems which supports
-two phase compilation model still need helps from build systems.
+Clang does not emit diagnostics when ``-fexperimental-modules-reduced-bmi`` is
+used with a non-module unit. This design permits users of the one-phase
+compilation model to try using reduced BMIs without needing to modify the build
+system. The two-phase compilation module requires build system support.
 
-Within Reduced BMI, we won't write unreachable entities from GMF, definitions of non-inline
-functions and non-inline variables. This may not be a transparent change.
-`[module.global.frag]ex2 <https://eel.is/c++draft/module.global.frag#example-2>`_ may be a good
-example:
+In a Reduced BMI, Clang does not emit unreachable entities from the global
+module fragment, or definitions of non-inline functions and non-inline
+variables. This may not be a transparent change.
+
+Consider the following example:
 
 .. code-block:: c++
 
@@ -633,22 +634,23 @@ example:
                                   // module M's interface, so is discarded
   int c = use_h<int>();           // OK
 
-In the above example, the function definition of ``N::g`` is elided from the Reduced
-BMI of ``M.cppm``. Then the use of ``use_g<int>`` in ``M-impl.cpp`` fails
-to instantiate. For such issues, users can add references to ``N::g`` in the module purview
-of ``M.cppm`` to make sure it is reachable, e.g., ``using N::g;``.
+In the above example, the function definition of ``N::g`` is elided from the
+Reduced BMI of ``M.cppm``. Then the use of ``use_g<int>`` in ``M-impl.cpp``
+fails to instantiate. For such issues, users can add references to ``N::g`` in
+the `module purview <https://eel.is/c++draft/module.unit#5>`_ of ``M.cppm`` to
+ensure it is reachable, e.g. ``using N::g;``.
 
-We think the Reduced BMI is the correct direction. But given it is a drastic change,
-we'd like to make it experimental first to avoid breaking existing users. The roadmap
-of Reduced BMI may be:
+Support for Reduced BMIs is still experimental, but it may become the default
+in the future. The expected roadmap for Reduced BMIs as of Clang 19.x is:
 
-1. ``-fexperimental-modules-reduced-bmi`` is opt in for 1~2 releases. The period depends
-on testing feedbacks.
-2. We would announce Reduced BMI is not experimental and introduce ``-fmodules-reduced-bmi``.
-and suggest users to enable this mode. This may takes 1~2 releases too.
-3. Finally we will enable this by default. When that time comes, the term BMI will refer to
-the reduced BMI today and the Full BMI will only be meaningful to build systems which
-loves to support two phase compilations.
+1. ``-fexperimental-modules-reduced-bmi`` is opt-in for 1~2 releases. The period depends
+   on user feedback and may be extended.
+2. Announce that Reduced BMIs are no longer experimental and introduce
+   ``-fmodules-reduced-bmi`` as a new option, and recommend use of the new
+   option. This transition is expected to take 1~2 additional releases as well.
+3. Finally, ``-fmodules-reduced-bmi`` will be the default. When that time
+   comes, the term BMI will refer to the Reduced BMI and the Full BMI will only
+   be meaningful to build systems which elect to support two-phase compilation.
 
 Performance Tips
 ----------------
@@ -656,13 +658,11 @@ Performance Tips
 Reduce duplications
 ~~~~~~~~~~~~~~~~~~~
 
-While it is legal to have duplicated declarations in the global module fragments
-of different module units, it is not free for clang to deal with the duplicated
-declarations. In other word, for a translation unit, it will compile slower if the
-translation unit itself and its importing module units contains a lot duplicated
-declarations.
-
-For example,
+While it is valid to have duplicated declarations in the global module fragments
+of different module units, it is not free for Clang to deal with the duplicated
+declarations. A translation unit will compile more slowly if there is a lot of
+duplicated declarations between the translation unit and modules it imports.
+For example:
 
 .. code-block:: c++
 
@@ -698,9 +698,9 @@ For example,
   import M;
   ... // use declarations from module M.
 
-When ``big.header.h`` is big enough and there are a lot of partitions,
-the compilation of ``use.cpp`` may be slower than
-the following style significantly:
+When ``big.header.h`` is big enough and there are a lot of partitions, the
+compilation of ``use.cpp`` may be significantly slower than the following
+approach:
 
 .. code-block:: c++
 
@@ -738,22 +738,21 @@ the following style significantly:
   import M;
   ... // use declarations from module M.
 
-The key part of the tip is to reduce the duplications from the text includes.
-
-Ideas for converting to modules
--------------------------------
+Reducing the duplication from textual includes is what improves compile-time
+performance.
 
-For new libraries, we encourage them to use modules completely from day one if possible.
-This will be pretty helpful to make the whole ecosystems to get ready.
+Transitioning to modules
+------------------------
 
-For many existing libraries, it may be a breaking change to refactor themselves
-into modules completely. So that many existing libraries need to provide headers and module
-interfaces for a while to not break existing users.
-Here we provide some ideas to ease the transition process for existing libraries.
-**Note that the this section is only about helping ideas instead of requirement from clang**.
+It is best for new code and libraries to use modules from the start if
+possible. However, it may be a breaking change for existing code or libraries
+to switch to modules. As a result, many existing libraries need to provide
+both headers and module interfaces for a while to not break existing users.
 
-Let's start with the case that there is no dependency or no dependent libraries providing
-modules for your library.
+This section suggests some suggestions on how to ease the transition process
+for existing libraries. **Note that this information is only intended as
+guidance, rather than as requirements to use modules in Clang.** It presumes
+the project is starting with no module-based dependencies.
 
 ABI non-breaking styles
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -776,9 +775,9 @@ export-using style
     using decl_n;
   }
 
-As the example shows, you need to include all the headers containing declarations needs
-to be exported and `using` such declarations in an `export` block. Then, basically,
-we're done.
+This example shows how to include all the headers containing declarations which
+need to be exported, and uses `using` declarations in an `export` block to
+produce the module interface.
 
 export extern-C++ style
 ^^^^^^^^^^^^^^^^^^^^^^^
@@ -799,7 +798,7 @@ export extern-C++ style
     #include "header_n.h"
   }
 
-Then in your headers (from ``header_1.h`` to ``header_n.h``), you need to define the macro:
+Headers (from ``header_1.h`` to ``header_n.h``) need to define the macro:
 
 .. code-block:: c++
 
@@ -809,9 +808,10 @@ Then in your headers (from ``header_1.h`` to ``header_n.h``), you need to define
   #define EXPORT
   #endif
 
-And you should put ``EXPORT`` to the beginning of the declarations you want to export.
+and put ``EXPORT`` on the declarations you want to export.
 
-Also it is suggested to refactor your headers to include thirdparty headers conditionally:
+Also, it is recommended to refactor headers to include third-party headers
+conditionally:
 
 .. code-block:: c++
 
@@ -823,26 +823,25 @@ Also it is suggested to refactor your headers to include thirdparty headers cond
 
   ...
 
-This may be helpful to get better diagnostic messages if you forgot to update your module
-interface unit file during maintaining.
+This can be helpful because it gives better diagnostic messages if the module
+interface unit is not properly updated when modifying code.
 
-The reasoning for the practice is that the declarations in the language linkage are considered
-to be attached to the global module. So the ABI of your library in the modular version
-wouldn't change.
+This approach works because the declarations with language linkage are attached
+to the global module. Thus, the ABI of the modular form of the library does not
+change.
 
-While this style looks not as convenient as the export-using style, it is easier to convert
-to other styles.
+While this style is more involved than the export-using style, it makes it
+easier to further refactor the library to other styles.
 
 ABI breaking style
 ~~~~~~~~~~~~~~~~~~
 
-The term ``ABI breaking`` sounds terrifying generally. But you may want it here if you want
-to force your users to introduce your library in a consistent way. E.g., they either include
-your headers all the way or import your modules all the way.
-The style prevents the users to include your headers and import your modules at the same time
-in the same repo.
+The term ``ABI breaking`` may sound like a bad approach. However, this style
+forces consumers of the library use it in a consistent way. e.g., either always
+include headers for the library or always import modules. The style prevents
+the ability to mix includes and imports for the library.
 
-The pattern for ABI breaking style is similar with export extern-C++ style.
+The pattern for ABI breaking style is similar to the export extern-C++ style.
 
 .. code-block:: c++
 
@@ -865,7 +864,7 @@ The pattern for ABI breaking style is similar with export extern-C++ style.
   ...
   #include "source_n.cpp"
   #else // the number of .cpp files in your project are a lot
-  // Using all the declarations from thirdparty libraries which are
+  // Using all the declarations from third-party libraries which are
   // used in the .cpp files.
   namespace third_party_namespace {
     using third_party_decl_used_in_cpp_1;
@@ -875,11 +874,11 @@ The pattern for ABI breaking style is similar with export extern-C++ style.
   }
   #endif
 
-(And add `EXPORT` and conditional include to the headers as suggested in the export
-extern-C++ style section)
+(And add `EXPORT` and conditional include to the headers as suggested in the
+export extern-C++ style section.)
 
-Remember that the ABI get changed and we need to compile our source files into the
-new ABI format. This is the job of the additional part of the interface unit:
+The ABI with modules is different and thus we need to compile the source files
+into the new ABI. This is done by an additional part of the interface unit:
 
 .. code-block:: c++
 
@@ -890,7 +889,7 @@ new ABI format. This is the job of the additional part of the interface unit:
   ...
   #include "source_n.cpp"
   #else // the number of .cpp files in your project are a lot
-  // Using all the declarations from thirdparty libraries which are
+  // Using all the declarations from third-party libraries which are
   // used in the .cpp files.
   namespace third_party_namespace {
     using third_party_decl_used_in_cpp_1;
@@ -900,16 +899,17 @@ new ABI format. This is the job of the additional part of the interface unit:
   }
   #endif
 
-In case the number of your source files are small, we may put everything in the private
-module fragment directly. (it is suggested to add conditional include to the source
-files too). But it will make the compilation of the module interface unit to be slow
-when the number of the source files are not small enough.
+If the number of source files is small, everything can be put in the private
+module fragment directly (it is recommended to add conditional includes to the
+source files as well). However, compile time performance will be bad if there
+are a lot of source files to compile.
 
-**Note that the private module fragment can only be in the primary module interface unit
-and the primary module interface unit containing private module fragment should be the only
-module unit of the corresponding module.**
+**Note that the private module fragment can only be in the primary module
+interface unit and the primary module interface unit containing the private
+module fragment should be the only module unit of the corresponding module.**
 
-In that case, you need to convert your source files (.cpp files) to module implementation units:
+In this case, source files (.cpp files) must be converted to module
+implementation units:
 
 .. code-block:: c++
 
@@ -925,45 +925,40 @@ In that case, you need to convert your source files (.cpp files) to module imple
   // Following off should be unchanged.
   ...
 
-The module implementation unit will import the primary module implicitly.
-We don't include any headers in the module implementation units
-here since we want to avoid duplicated declarations between translation units.
-This is the reason why we add non-exported using declarations from the third
-party libraries in the primary module interface unit.
-
-And if you provide your library as ``libyour_library.so``, you probably need to
-provide a modular one ``libyour_library_modules.so`` since you changed the ABI.
-
-What if there are headers only inclued by the source files
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+The module implementation unit will import the primary module implicitly. Do
+not include any headers in the module implementation units as it avoids
+duplicated declarations between translation units. This is why non-exported
+using declarations should be added from third-party libraries in the primary
+module interface unit.
 
-The above practice may be problematic if there are headers only included by the source
-files. If you're using private module fragment, you may solve the issue by including them
-in the private module fragment. While it is OK to solve it by including the implementation
-headers in the module purview if you're using implementation module units, it may be
-suboptimal since the primary module interface units now containing entities not belongs
-to the interface.
+If the library is provided as ``libyour_library.so``, a modular library (e.g.,
+``libyour_library_modules.so``) may also need to be provided for ABI
+compatibility.
 
-If you're a perfectionist, maybe you can improve it by introducing internal module partition unit.
+What if there are headers only included by the source files
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-The internal module partition unit is an importable module unit which is internal
-to the module itself. The concept just meets the headers only included by the source files.
+The above practice may be problematic if there are headers only included by the
+source files. When using a private module fragment, this issue may be solved by
+including those headers in the private module fragment. While it is OK to solve
+it by including the implementation headers in the module purview when using
+implementation module units, it may be suboptimal because the primary module
+interface units now contain entities that do not belong to the interface.
 
-We don't show code snippet since it may be too verbose or not good or not general.
-But it may not be too hard if you can understand the points of the section.
+This can potentially be improved by introducing a module partition
+implementation unit. An internal module partition unit is an importable
+module unit which is internal to the module itself.
 
 Providing a header to skip parsing redundant headers
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-It is a problem for clang to handle redeclarations between translation units.
-Also there is a long standing issue in clang (`problematic include after import <https://github.com/llvm/llvm-project/issues/61465>`_).
-But even if the issue get fixed in clang someday, the users may still get slower compilation speed
-and larger BMI size. So it is suggested to not include headers after importing the corresponding
-library.
-
-However, it is not easy for users if your library are included by other dependencies.
-
-So the users may have to write codes like:
+Many redeclarations shared between translation units causes Clang to have
+slower compile-time performance. Further, there are known issues with
+`include after import <https://github.com/llvm/llvm-project/issues/61465>`_.
+Even when that issue is resolved, users may still get slower compilation speed
+and larger BMIs. For these reasons, it is recommended to not include headers
+after importing the corresponding module. However, it is not always easy if the
+library is included by other dependencies, as in:
 
 .. code-block:: c++
 
@@ -977,9 +972,9 @@ or
   import your_library;
   #include "third_party/A.h" // #include "your_library/a_header.h"
 
-For such cases, we suggest the libraries providing modules and the headers at the same time
-to provide a header to skip parsing all the headers in your libraries. So the users can
-import your library as the following style to skip redundant handling:
+For such cases, it is best if the library providing both module and header
+interfaces also provides a header which skips parsing so that the library can
+be imported with the following approach that skips redundant redeclarations:
 
 .. code-block:: c++
 
@@ -987,9 +982,9 @@ import your library as the following style to skip redundant handling:
   #include "your_library_imported.h"
   #include "third_party/A.h" // #include "your_library/a_header.h" but got skipped
 
-The implementation of ``your_library_imported.h`` can be a set of controlling macros or
-an overall controlling macro if you're using `#pragma once`. So you can convert your
-headers to:
+The implementation of ``your_library_imported.h`` can be a set of controlling
+macros or an overall controlling macro if using `#pragma once`. Then headers
+can be refactored to:
 
 .. code-block:: c++
 
@@ -998,25 +993,24 @@ headers to:
   ...
   #endif
 
-If the modules imported by your library provides such headers too, remember to add them to
-your ``your_library_imported.h`` too.
+If the modules imported by the library provide such headers, remember to add
+them to ``your_library_imported.h`` too.
 
 Importing modules
 ~~~~~~~~~~~~~~~~~
 
-When there are dependent libraries providing modules, we suggest you to import that in
-your module.
-
-Most of the existing libraries would fall into this catagory once the std module gets available.
+When there are dependent libraries providing modules, they should be imported
+in your module as well. Many existing libraries will fall into this category
+once the ``std`` module is more widely available.
 
 All dependent libraries providing modules
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Life gets easier if all the dependent libraries providing modules.
+Of course, most of the complexity disappears if all the dependent libraries
+provide modules.
 
-You need to convert your headers to include thirdparty headers conditionally.
-
-Then for export-using style:
+Headers need to be converted to include third-party headers conditionally. Then,
+for the export-using style:
 
 .. code-block:: c++
 
@@ -1035,7 +1029,7 @@ Then for export-using style:
     using decl_n;
   }
 
-For export extern-C++ style:
+or, for the export extern-C++ style:
 
 .. code-block:: c++
 
@@ -1049,7 +1043,7 @@ For export extern-C++ style:
     #include "header_n.h"
   }
 
-For ABI breaking style,
+or, for the ABI-breaking style,
 
 .. code-block:: c++
 
@@ -1069,35 +1063,39 @@ For ABI breaking style,
   #include "source_n.cpp"
   #endif
 
-We don't need the non-exported using declarations if we're using implementation module
-units now. We can import thirdparty modules directly in the implementation module
-units.
+Non-exported ``using`` declarations are unnecessary if using implementation
+module units. Instead, third-party modules can be imported directly in
+implementation module units.
 
 Partial dependent libraries providing modules
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-In this case, we have to mix the use of ``include`` and ``import`` in the module of our
-library. The key point here is still to remove duplicated declarations in translation
-units as much as possible. If the imported modules provide headers to skip parsing their
-headers, we should include that after the including. If the imported modules don't provide
-the headers, we can make it ourselves if we still want to optimize it.
-
-Known Problems
---------------
-
-The following describes issues in the current implementation of modules.
-Please see https://github.com/llvm/llvm-project/labels/clang%3Amodules for more issues
-or file a new issue if you don't find an existing one.
-If you're going to create a new issue for standard C++ modules,
-please start the title with ``[C++20] [Modules]`` (or ``[C++23] [Modules]``, etc)
-and add the label ``clang:modules`` (if you have permissions for that).
-
-For higher level support for proposals, you could visit https://clang.llvm.org/cxx_status.html.
-
-Including headers after import is problematic
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+If the library has to mix the use of ``include`` and ``import`` in its module,
+the primary goal is still the removal of duplicated declarations in translation
+units as much as possible. If the imported modules provide headers to skip
+parsing their headers, those should be included after the import. If the
+imported modules don't provide such a header, one can be made manually for
+improved compile time performance.
+
+Known Issues
+------------
+
+The following describes issues in the current implementation of modules. Please
+see
+`the issues list for modules <https://github.com/llvm/llvm-project/labels/clang%3Amodules>`_
+for a list of issues or to file a new issue if you don't find an existing one.
+When creating a new issue for standard C++ modules, please start the title with
+``[C++20] [Modules]`` (or ``[C++23] [Modules]``, etc) and add the label
+``clang:modules`` if possible.
+
+A high-level overview of support for standards features, including modules, can
+be found on the `C++ Feature Status <https://clang.llvm.org/cxx_status.html>`_
+page.
+
+Including headers after import is not well-supported
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-For example, the following example can be accept:
+The following example is accepted:
 
 .. code-block:: c++
 
@@ -1110,8 +1108,8 @@ For example, the following example can be accept:
       return 0;
   }
 
-but it will get rejected if we reverse the order of ``#include <iostream>`` and
-``import foo;``:
+but if the order of ``#include <iostream>`` and ``import foo;`` is reversed,
+then the code is currently rejected:
 
 .. code-block:: c++
 
@@ -1126,33 +1124,31 @@ but it will get rejected if we reverse the order of ``#include <iostream>`` and
 
 Both of the above examples should be accepted.
 
-This is a limitation in the implementation. In the first example,
-the compiler will see and parse <iostream> first then the compiler will see the import.
-So the ODR Checking and declarations merging will happen in the deserializer.
-In the second example, the compiler will see the import first and the include second.
-As a result, the ODR Checking and declarations merging will happen in the semantic analyzer.
+This is a limitation of the implementation. In the first example, the compiler
+will see and parse ``<iostream>`` first then it will see the ``import``. In
+this case, ODR checking and declaration merging will happen in the
+deserializer. In the second example, the compiler will see the ``import`` first
+and the ``#include`` second which results in ODR checking and declarations
+merging happening in the semantic analyzer. This is due to a divergence in the
+implementation path. This is tracked by
+`#61465 <https://github.com/llvm/llvm-project/issues/61465>`_.
 
-So there is divergence in the implementation path. It might be understandable that why
-the orders matter here in the case.
-(Note that "understandable" is different from "makes sense").
-
-This is tracked in: https://github.com/llvm/llvm-project/issues/61465
-
-Ignored PreferredName Attribute
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Due to a tricky problem, when Clang writes BMIs, Clang will ignore the ``preferred_name`` attribute, if any.
-This implies that the ``preferred_name`` wouldn't show in debugger or dumping.
+Ignored ``preferred_name`` Attribute
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-This is tracked in: https://github.com/llvm/llvm-project/issues/56490
+When Clang writes BMIs, it will ignore the ``preferred_name`` attribute on
+declarations which use it. Thus, the preferred name will not be displayed in
+the debugger as expected. This is tracked by
+`#56490 <https://github.com/llvm/llvm-project/issues/56490>`_.
 
 Don't emit macros about module declaration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-This is covered by P1857R3. We mention it again here since users may abuse it before we implement it.
+This is covered by `P1857R3 <https://wg21.link/P1857R3>`_. It is mentioned here
+because we want users to be aware that we don't yet implement it.
 
-Someone may want to write code which could be compiled both by modules or non-modules.
-A direct idea would be use macros like:
+A direct approach to write code that can be compiled by both modules and
+non-module builds may look like:
 
 .. code-block:: c++
 
@@ -1162,39 +1158,37 @@ A direct idea would be use macros like:
   IMPORT header_name
   EXPORT ...
 
-So this file could be triggered like a module unit or a non-module unit depending on the definition
-of some macros.
-However, this kind of usage is forbidden by P1857R3 but we haven't implemented P1857R3 yet.
-This means that is possible to write illegal modules code now, and obviously this will stop working
-once P1857R3 is implemented.
-A simple suggestion would be "Don't play macro tricks with module declarations".
+The intent of this is that this file can be compiled like a module unit or a
+non-module unit depending on the definition of some macros. However, this usage
+is forbidden by P1857R3 which is not yet implemented in Clang. This means that
+is possible to write invalid modules which will no longer be accepted once
+P1857R3 is implemented. This is tracked by
+`#56917 <https://github.com/llvm/llvm-project/issues/56917>`_.
+
+Until then, it is recommended not to mix macros with module declarations.
 
-This is tracked in: https://github.com/llvm/llvm-project/issues/56917
 
 In consistent filename suffix requirement for importable module units
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Currently, clang requires the file name of an ``importable module unit`` should end with ``.cppm``
-(or ``.ccm``, ``.cxxm``, ``.c++m``). However, the behavior is inconsistent with other compilers.
-
-This is tracked in: https://github.com/llvm/llvm-project/issues/57416
-
-clang-cl is not compatible with the standard C++ modules
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Now we can't use the `/clang:-fmodule-file` or `/clang:-fprebuilt-module-path` to specify
-the BMI within ``clang-cl.exe``.
+Currently, Clang requires the file name of an ``importable module unit`` to
+have ``.cppm`` (or ``.ccm``, ``.cxxm``, ``.c++m``) as the file extension.
+However, the behavior is inconsistent with other compilers. This is tracked by
+`#57416 <https://github.com/llvm/llvm-project/issues/57416>`_.
 
-This is tracked in: https://github.com/llvm/llvm-project/issues/64118
+clang-cl is not compatible with standard C++ modules
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-false positive ODR violation diagnostic due to using inconsistent qualified but the same type
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+``/clang:-fmodule-file`` and ``/clang:-fprebuilt-module-path`` cannot be used
+to specify the BMI with ``clang-cl.exe``. This is tracked by
+`#64118 <https://github.com/llvm/llvm-project/issues/64118>`_.
 
-ODR violation is a pretty common issue when using modules.
-Sometimes the program violated the One Definition Rule actually.
-But sometimes it shows the compiler gives false positive diagnostics.
+Incorrect ODR violation diagnostics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-One often reported example is:
+ODR violations are a common issue when using modules. Clang sometimes produces
+false-positive diagnostics or fails to produce true-positive diagnostics of the
+One Definition Rule. One often-reported example is:
 
 .. code-block:: c++
 
@@ -1222,51 +1216,49 @@ One often reported example is:
   export module repro;
   export import :part;
 
-Currently the compiler complains about the inconsistent definition of `fun()` in
-2 module units. This is incorrect. Since both definitions of `fun()` has the same
-spelling and `T` refers to the same type entity finally. So the program should be
-fine.
-
-This is tracked in https://github.com/llvm/llvm-project/issues/78850.
+Currently the compiler incorrectly diagnoses the inconsistent definition of
+``fun()`` in two module units. Because both definitions of ``fun()`` have the
+same spelling and ``T`` refers to the same type entity, there is no ODR
+violation. This is tracked by
+`#78850 <https://github.com/llvm/llvm-project/issues/78850>`_.
 
 Using TU-local entity in other units
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Module units are translation units. So the entities which should only be local to the
-module unit itself shouldn't be used by other units in any means.
+Module units are translation units, so the entities which should be local to
+the module unit itself should never be used by other units.
 
-In the language side, to address the idea formally, the language specification defines
-the concept of ``TU-local`` and ``exposure`` in
+The C++ standard defines the concept of ``TU-local`` and ``exposure`` in
 `basic.link/p14 <https://eel.is/c++draft/basic.link#14>`_,
 `basic.link/p15 <https://eel.is/c++draft/basic.link#15>`_,
 `basic.link/p16 <https://eel.is/c++draft/basic.link#16>`_,
-`basic.link/p17 <https://eel.is/c++draft/basic.link#17>`_ and
+`basic.link/p17 <https://eel.is/c++draft/basic.link#17>`_, and
 `basic.link/p18 <https://eel.is/c++draft/basic.link#18>`_.
 
-However, the compiler doesn't support these 2 ideas formally.
-This results in unclear and confusing diagnostic messages.
-And it is worse that the compiler may import TU-local entities to other units without any
-diagnostics.
+However, Clang doesn't formally support these two concepts. This results in
+unclear or confusing diagnostic messages. Further, Clang may import
+``TU-local`` entities to other units without any diagnostics. This is tracked
+by `#78173 <https://github.com/llvm/llvm-project/issues/78173>`_.
 
-This is tracked in https://github.com/llvm/llvm-project/issues/78173.
+.. _header-units:
 
 Header Units
 ============
 
-How to build projects using header unit
----------------------------------------
+How to build projects using header units
+----------------------------------------
 
 .. warning::
 
-   The user interfaces of header units is highly experimental. There are still
-   many unanswered question about how tools should interact with header units.
-   The user interfaces described here may change after we have progress on how
-   tools should support for header units.
+   The support for header units, including related command line options, is
+   experimental. There are still many unanswered question about how tools
+   should interact with header units. The details described here may change in
+   the future.
 
 Quick Start
 ~~~~~~~~~~~
 
-For the following example,
+The following example:
 
 .. code-block:: c++
 
@@ -1275,7 +1267,7 @@ For the following example,
     std::cout << "Hello World.\n";
   }
 
-we could compile it as
+could be compiled with:
 
 .. code-block:: console
 
@@ -1285,14 +1277,14 @@ we could compile it as
 How to produce BMIs
 ~~~~~~~~~~~~~~~~~~~
 
-Similar to named modules, we could use ``--precompile`` to produce the BMI.
-But we need to specify that the input file is a header by ``-xc++-system-header`` or ``-xc++-user-header``.
+Similar to named modules, ``--precompile`` can be used to produce a BMI.
+However, that requires specifying that the input file is a header by using
+``-xc++-system-header`` or ``-xc++-user-header``.
 
-Also we could use `-fmodule-header={user,system}` option to produce the BMI for header units
-which has suffix like `.h` or `.hh`.
-The value of `-fmodule-header` means the user search path or the system search path.
-The default value for `-fmodule-header` is `user`.
-For example,
+The ``-fmodule-header={user,system}`` option can also be used to produce a BMI
+for header units which have a file extension like `.h` or `.hh`. The argument to
+``-fmodule-header`` specifies either the user search path or the system search
+path. The default value for ``-fmodule-header`` is ``user``. For example:
 
 .. code-block:: c++
 
@@ -1308,16 +1300,16 @@ For example,
     Hello();
   }
 
-We could compile it as:
+could be compiled with:
 
 .. code-block:: console
 
   $ clang++ -std=c++20 -fmodule-header foo.h -o foo.pcm
   $ clang++ -std=c++20 -fmodule-file=foo.pcm use.cpp
 
-For headers which don't have a suffix, we need to pass ``-xc++-header``
-(or ``-xc++-system-header`` or ``-xc++-user-header``) to mark it as a header.
-For example,
+For headers which do not have a file extension, ``-xc++-header`` (or
+``-xc++-system-header``, ``-xc++-user-header``) must be used to specify the
+file as a header. For example:
 
 .. code-block:: c++
 
@@ -1332,23 +1324,25 @@ For example,
   $ clang++ -std=c++20 -fmodule-header=system -xc++-header iostream -o iostream.pcm
   $ clang++ -std=c++20 -fmodule-file=iostream.pcm use.cpp
 
-How to specify the dependent BMIs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+How to specify dependent BMIs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-We could use ``-fmodule-file`` to specify the BMIs, and this option may occur multiple times as well.
+``-fmodule-file`` can be used to specify a dependent BMI (or multiple times for
+more than one dependent BMI).
 
-With the existing implementation ``-fprebuilt-module-path`` cannot be used for header units
-(since they are nominally anonymous).
-For header units, use  ``-fmodule-file`` to include the relevant PCM file for each header unit.
+With the existing implementation, ``-fprebuilt-module-path`` cannot be used for
+header units (because they are nominally anonymous). For header units, use
+``-fmodule-file`` to include the relevant PCM file for each header unit.
 
-This is expect to be solved in future editions of the compiler either by the tooling finding and specifying
-the -fmodule-file or by the use of a module-mapper that understands how to map the header name to their PCMs.
+This is expect to be solved in a future version of Clang either by the compiler
+finding and specifying ``-fmodule-file`` automatically, or by the use of a
+module-mapper that understands how to map the header name to their PCMs.
 
-Don't compile the BMI
-~~~~~~~~~~~~~~~~~~~~~
+Compiling a header unit to an object file
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Another difference with modules is that we can't compile the BMI from a header unit.
-For example:
+A header unit cannot be compiled to an object file due to the semantics of
+header units. For example:
 
 .. code-block:: console
 
@@ -1356,15 +1350,13 @@ For example:
   # This is not allowed!
   $ clang++ iostream.pcm -c -o iostream.o
 
-It makes sense due to the semantics of header units, which are just like headers.
-
 Include translation
 ~~~~~~~~~~~~~~~~~~~
 
-The C++ spec allows the vendors to convert ``#include header-name`` to ``import header-name;`` when possible.
-Currently, Clang would do this translation for the ``#include`` in the global module fragment.
-
-For example, the following two examples are the same:
+The C++ standard allows vendors to convert ``#include header-name`` to
+``import header-name;`` when possible. Currently, Clang does this translation
+for the ``#include`` in the global module fragment. For example, the following
+example:
 
 .. code-block:: c++
 
@@ -1375,7 +1367,7 @@ For example, the following two examples are the same:
     std::cout << "Hello.\n";
   }
 
-with the following one:
+is the same as this example:
 
 .. code-block:: c++
 
@@ -1391,17 +1383,17 @@ with the following one:
   $ clang++ -std=c++20 -xc++-system-header --precompile iostream -o iostream.pcm
   $ clang++ -std=c++20 -fmodule-file=iostream.pcm --precompile M.cppm -o M.cpp
 
-In the latter example, the Clang could find the BMI for the ``<iostream>``
-so it would try to replace the ``#include <iostream>`` to ``import <iostream>;`` automatically.
+In the latter example, Clang can find the BMI for ``<iostream>`` and so it
+tries to replace the ``#include <iostream>`` with ``import <iostream>;``
+automatically.
 
 
-Relationships between Clang modules
------------------------------------
+Differences between Clang modules and header units
+--------------------------------------------------
 
-Header units have pretty similar semantics with Clang modules.
-The semantics of both of them are like headers.
-
-In fact, we could even "mimic" the sytle of header units by Clang modules:
+Header units have similar semantics to Clang modules. The semantics of both are
+like headers. Therefore, header units can be mimicked by Clang modules as in
+the following example:
 
 .. code-block:: c++
 
@@ -1414,46 +1406,45 @@ In fact, we could even "mimic" the sytle of header units by Clang modules:
 
   $ clang++ -std=c++20 -fimplicit-modules -fmodule-map-file=.modulemap main.cpp
 
-It would be simpler if we are using libcxx:
+This example is simplified when using libc++:
 
 .. code-block:: console
 
   $ clang++ -std=c++20 main.cpp -fimplicit-modules -fimplicit-module-maps
 
-Since there is already one
-`module map <https://github.com/llvm/llvm-project/blob/main/libcxx/include/module.modulemap.in>`_
-in the source of libcxx.
-
-Then immediately leads to the question: why don't we implement header units through Clang header modules?
+because libc++ already supplies a
+`module map <https://github.com/llvm/llvm-project/blob/main/libcxx/include/module.modulemap.in>`_.
 
-The main reason for this is that Clang modules have more semantics like hierarchy or
-wrapping multiple headers together as a big module.
-However, these things are not part of Standard C++ Header units,
-and we want to avoid the impression that these additional semantics get interpreted as Standard C++ behavior.
+This raises the question: why are header units not implemented through Clang
+modules?
 
-Another reason is that there are proposals to introduce module mappers to the C++ standard
-(for example, https://wg21.link/p1184r2).
-If we decide to reuse Clang's modulemap, we may get in trouble once we need to introduce another module mapper.
+This is primarily because Clang modules have more hierarchical semantics when
+wrapping multiple headers together as one module, which is not supported by
+Standard C++ Header units. We want to avoid the impression that these
+additional semantics get interpreted as Standard C++ behavior.
 
-So the final answer for why we don't reuse the interface of Clang modules for header units is that
-there are some differences between header units and Clang modules and that ignoring those
-differences now would likely become a problem in the future.
+Another reason is that there are proposals to introduce module mappers to the
+C++ standard (for example, https://wg21.link/p1184r2). Reusing Clang's
+``modulemap`` may be more difficult if we need to introduce another module
+mapper.
 
-Discover Dependencies
-=====================
+Discovering Dependencies
+========================
 
-Prior to modules, all the translation units can be compiled parallelly.
-But it is not true for the module units. The presence of module units requires
-us to compile the translation units in a (topological) order.
+Without use of modules, all the translation units in a project can be compiled
+in parallel. However, the presence of module units requires compiling the
+translation units in a topological order.
 
-The clang-scan-deps scanner implemented
-`P1689 paper <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p1689r5.html>`_
-to describe the order. Only named modules are supported now.
+The ``clang-scan-deps`` tool can extract dependency information and produce a
+JSON file conforming to the specification described in
+`P1689 <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p1689r5.html>`_.
+Only named modules are supported currently.
 
-We need a compilation database to use clang-scan-deps. See
+A compilation database is needed when using ``clang-scan-deps``. See
 `JSON Compilation Database Format Specification <JSONCompilationDatabase.html>`_
-for example. Note that the ``output`` entry is necessary for clang-scan-deps
-to scan P1689 format. Here is an example:
+for more information about compilation databases. Note that the ``output``
+JSON attribute is necessary for ``clang-scan-deps`` to scan using the P1689
+format. For example:
 
 .. code-block:: c++
 
@@ -1533,13 +1524,13 @@ And here is the compilation database:
   }
   ]
 
-And we can get the dependency information in P1689 format by:
+To get the dependency information in P1689 format, use:
 
 .. code-block:: console
 
   $ clang-scan-deps -format=p1689 -compilation-database P1689.json
 
-And we will get:
+to get:
 
 .. code-block:: text
 
@@ -1619,14 +1610,14 @@ And we will get:
 
 See the P1689 paper for the meaning of the fields.
 
-And if the user want a finer-grained control for any reason, e.g., to scan the generated source files,
-the user can choose to get the dependency information per file. For example:
+Getting dependency information per file with finer-grained control (such as
+scanning generated source files) is possible. For example:
 
 .. code-block:: console
 
   $ clang-scan-deps -format=p1689 -- <path-to-compiler-executable>/clang++ -std=c++20 impl_part.cppm -c -o impl_part.o
 
-And we'll get:
+will produce:
 
 .. code-block:: text
 
@@ -1652,22 +1643,23 @@ And we'll get:
     "version": 1
   }
 
-In this way, we can pass the single command line options after the ``--``.
-Then clang-scan-deps will extract the necessary information from the options.
-Note that we need to specify the path to the compiler executable instead of saying
-``clang++`` simply.
+Individual command line options can be specified after ``--``.
+``clang-scan-deps`` will extract the necessary information from the specified
+options. Note that the path to the compiler executable needs to be specified
+explicitly instead of using ``clang++`` directly.
 
-The users may want the scanner to get the transitional dependency information for headers.
-Otherwise, the users have to scan twice for the project, once for headers and once for modules.
-To address the requirement, clang-scan-deps will recognize the specified preprocessor options
-in the given command line and generate the corresponding dependency information. For example,
+Users may want the scanner to get the transitional dependency information for
+headers. Otherwise, the project has to be scanned twice, once for headers and
+once for modules. To address this, ``clang-scan-deps`` will recognize the
+specified preprocessor options in the given command line and generate the
+corresponding dependency information. For example:
 
 .. code-block:: console
 
   $ clang-scan-deps -format=p1689 -- ../bin/clang++ -std=c++20 impl_part.cppm -c -o impl_part.o -MD -MT impl_part.ddi -MF impl_part.dep
   $ cat impl_part.dep
 
-We will get:
+will produce:
 
 .. code-block:: text
 
@@ -1679,41 +1671,41 @@ We will get:
     /usr/include/bits/types/__locale_t.h \
     ...
 
-When clang-scan-deps detects ``-MF`` option, clang-scan-deps will try to write the
+When ``clang-scan-deps`` detects the ``-MF`` option, it will try to write the
 dependency information for headers to the file specified by ``-MF``.
 
 Possible Issues: Failed to find system headers
 ----------------------------------------------
 
-In case the users encounter errors like ``fatal error: 'stddef.h' file not found``,
-probably the specified ``<path-to-compiler-executable>/clang++`` refers to a symlink
-instead a real binary. There are 4 potential solutions to the problem:
-
-* (1) End users can resolve the issue by pointing the specified compiler executable to
-  the real binary instead of the symlink.
-* (2) End users can invoke ``<path-to-compiler-executable>/clang++ -print-resource-dir``
-  to get the corresponding resource directory for your compiler and add that directory
-  to the include search paths manually in the build scripts.
-* (3) Build systems that use a compilation database as the input for clang-scan-deps
-  scanner, the build system can add the flag ``--resource-dir-recipe invoke-compiler`` to
-  the clang-scan-deps scanner to calculate the resources directory dynamically.
-  The calculation happens only once for a unique ``<path-to-compiler-executable>/clang++``.
-* (4) For build systems that invokes the clang-scan-deps scanner per file, repeatedly
-  calculating the resource directory may be inefficient. In such cases, the build
-  system can cache the resource directory by itself and pass ``-resource-dir <resource-dir>``
-  explicitly in the command line options:
+If encountering an error like ``fatal error: 'stddef.h' file not found``,
+the specified ``<path-to-compiler-executable>/clang++`` probably refers to a
+symlink instead a real binary. There are four potential solutions to the
+problem:
 
-.. code-block:: console
+1. Point the specified compiler executable to the real binary instead of the
+   symlink.
+2. Invoke ``<path-to-compiler-executable>/clang++ -print-resource-dir`` to get
+   the corresponding resource directory for your compiler and add that
+   directory to the include search paths manually in the build scripts.
+3. For build systems that use a compilation database as the input for
+   ``clang-scan-deps``, the build system can add the
+   ``--resource-dir-recipe invoke-compiler`` option when executing
+   ``clang-scan-deps`` to calculate the resource directory dynamically.
+   The calculation happens only once for a unique ``<path-to-compiler-executable>/clang++``.
+4. For build systems that invoke ``clang-scan-deps`` per file, repeatedly
+   calculating the resource directory may be inefficient. In such cases, the
+   build system can cache the resource directory and specify
+   ``-resource-dir <resource-dir>`` explicitly, as in:
+
+   .. code-block:: console
 
-  $ clang-scan-deps -format=p1689 -- <path-to-compiler-executable>/clang++ -std=c++20 -resource-dir <resource-dir> mod.cppm -c -o mod.o
+     $ clang-scan-deps -format=p1689 -- <path-to-compiler-executable>/clang++ -std=c++20 -resource-dir <resource-dir> mod.cppm -c -o mod.o
 
 
 Import modules with clang-repl
 ==============================
 
-We're able to import C++20 named modules with clang-repl.
-
-Let's start with a simple example:
+``clang-repl`` supports importing C++20 named modules. For example:
 
 .. code-block:: c++
 
@@ -1723,7 +1715,7 @@ Let's start with a simple example:
       return "Hello Interpreter for Modules!";
   }
 
-We still need to compile the named module in ahead.
+The named module still needs to be compiled ahead of time.
 
 .. code-block:: console
 
@@ -1731,10 +1723,9 @@ We still need to compile the named module in ahead.
   $ clang++ M.pcm -c -o M.o
   $ clang++ -shared M.o -o libM.so
 
-Note that we need to compile the module unit into a dynamic library so that the clang-repl
-can load the object files of the module units.
-
-Then we are able to import module ``M`` in clang-repl.
+Note that the module unit needs to be compiled as a dynamic library so that
+``clang-repl`` can load the object files of the module units. Then it is
+possible to import module ``M`` in clang-repl.
 
 .. code-block:: console
 
@@ -1753,17 +1744,18 @@ Possible Questions
 How modules speed up compilation
 --------------------------------
 
-A classic theory for the reason why modules speed up the compilation is:
-if there are ``n`` headers and ``m`` source files and each header is included by each source file,
-then the complexity of the compilation is ``O(n*m)``;
-But if there are ``n`` module interfaces and ``m`` source files, the complexity of the compilation is
-``O(n+m)``. So, using modules would be a big win when scaling.
-In a simpler word, we could get rid of many redundant compilations by using modules.
+A classic theory for the reason why modules speed up the compilation is: if
+there are ``n`` headers and ``m`` source files and each header is included by
+each source file, then the complexity of the compilation is ``O(n*m)``.
+However, if there are ``n`` module interfaces and ``m`` source files, the
+complexity of the compilation is ``O(n+m)``. Therefore, using modules would be
+a significant improvement at scale. More simply, use of modules causes many of
+the redundant compilations to no longer be necessary.
 
-Roughly, this theory is correct. But the problem is that it is too rough.
-The behavior depends on the optimization level, as we will illustrate below.
+While this is accurate at a high level, this depends greatly on the
+optimization level, as illustrated below.
 
-First is ``O0``. The compilation process is described in the following graph.
+First is ``-O0``. The compilation process is described in the following graph.
 
 .. code-block:: none
 
@@ -1771,13 +1763,13 @@ First is ``O0``. The compilation process is described in the following graph.
   │                               │                                       │               │
   └---parsing----sema----codegen--┴----- transformations ---- codegen ----┴---- codegen --┘
 
-  ┌---------------------------------------------------------------------------------------┐
+  ├---------------------------------------------------------------------------------------┐
   |                                                                                       │
   |                                     source file                                       │
   |                                                                                       │
   └---------------------------------------------------------------------------------------┘
 
-              ┌--------┐
+              ├--------┐
               │        │
               │imported│
               │        │
@@ -1785,18 +1777,17 @@ First is ``O0``. The compilation process is described in the following graph.
               │        │
               └--------┘
 
-Here we can see that the source file (could be a non-module unit or a module unit) would get processed by the
-whole pipeline.
-But the imported code would only get involved in semantic analysis, which is mainly about name lookup,
-overload resolution and template instantiation.
-All of these processes are fast relative to the whole compilation process.
-More importantly, the imported code only needs to be processed once in frontend code generation,
-as well as the whole middle end and backend.
-So we could get a big win for the compilation time in O0.
+In this case, the source file (which could be a non-module unit or a module
+unit) would get processed by the entire pipeline. However, the imported code
+would only get involved in semantic analysis, which, for the most part, is name
+lookup, overload resolution, and template instantiation. All of these processes
+are fast relative to the whole compilation process. More importantly, the
+imported code only needs to be processed once during frontend code generation,
+as well as the whole middle end and backend. So we could get a big win for the
+compilation time in ``-O0``.
 
-But with optimizations, things are different:
-
-(we omit ``code generation`` part for each end due to the limited space)
+But with optimizations, things are different (the ``code generation`` part for
+each end is omitted due to limited space):
 
 .. code-block:: none
 
@@ -1804,12 +1795,12 @@ But with optimizations, things are different:
   │                           │                                               │                   │
   └--- parsing ---- sema -----┴--- optimizations --- IPO ---- optimizations---┴--- optimizations -┘
 
-  ┌-----------------------------------------------------------------------------------------------┐
+  ├-----------------------------------------------------------------------------------------------┐
   │                                                                                               │
   │                                         source file                                           │
   │                                                                                               │
   └-----------------------------------------------------------------------------------------------┘
-                ┌---------------------------------------┐
+                ├---------------------------------------┐
                 │                                       │
                 │                                       │
                 │            imported code              │
@@ -1817,27 +1808,29 @@ But with optimizations, things are different:
                 │                                       │
                 └---------------------------------------┘
 
-It would be very unfortunate if we end up with worse performance after using modules.
-The main concern is that when we compile a source file, the compiler needs to see the function body
-of imported module units so that it can perform IPO (InterProcedural Optimization, primarily inlining
-in practice) to optimize functions in current source file with the help of the information provided by
-the imported module units.
-In other words, the imported code would be processed again and again in importee units
-by optimizations (including IPO itself).
-The optimizations before IPO and the IPO itself are the most time-consuming part in whole compilation process.
-So from this perspective, we might not be able to get the improvements described in the theory.
-But we could still save the time for optimizations after IPO and the whole backend.
-
-Overall, at ``O0`` the implementations of functions defined in a module will not impact module users,
-but at higher optimization levels the definitions of such functions are provided to user compilations for the
-purposes of optimization (but definitions of these functions are still not included in the use's object file)-
-this means the build speedup at higher optimization levels may be lower than expected given ``O0`` experience,
-but does provide by more optimization opportunities.
+It would be very unfortunate if we end up with worse performance when using
+modules. The main concern is that when a source file is compiled, the compiler
+needs to see the body of imported module units so that it can perform IPO
+(InterProcedural Optimization, primarily inlining in practice) to optimize
+functions in the current source file with the help of the information provided
+by the imported module units. In other words, the imported code would be
+processed again and again in importee units by optimizations (including IPO
+itself). The optimizations before IPO and IPO itself are the most time-consuming
+part in whole compilation process. So from this perspective, it might not be
+possible to get the compile time improvements described, but there could be
+time savings for optimizations after IPO and the whole backend.
+
+Overall, at ``-O0`` the implementations of functions defined in a module will
+not impact module users, but at higher optimization levels the definitions of
+such functions are provided to user compilations for the purposes of
+optimization (but definitions of these functions are still not included in the
+use's object file). This means the build speedup at higher optimization levels
+may be lower than expected given ``-O0`` experience, but does provide more
+optimization opportunities.
 
 Interoperability with Clang Modules
 -----------------------------------
 
-We **wish** to support clang modules and standard c++ modules at the same time,
-but the mixed using form is not well used/tested yet.
-
-Please file new github issues as you find interoperability problems.
+We **wish** to support Clang modules and standard C++ modules at the same time,
+but the mixing them together is not well used/tested yet. Please file new
+GitHub issues as you find interoperability problems.
diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index 370de7d9c769..80ba70f67126 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -3504,7 +3504,7 @@ Differences between ``*17`` and ``*23`` modes:
 - ``nullptr`` and ``nullptr_t`` are supported, only in ``*23`` mode.
 - ``ATOMIC_VAR_INIT`` is removed from ``*23`` mode.
 - ``bool``, ``true``, ``false``, ``alignas``, ``alignof``, ``static_assert``,
-  and ``thread_local` are now first-class keywords, only in ``*23`` mode.
+  and ``thread_local`` are now first-class keywords, only in ``*23`` mode.
 - ``typeof`` and ``typeof_unqual`` are supported, only ``*23`` mode.
 - Bit-precise integers (``_BitInt(N)``) are supported by default in ``*23``
   mode, and as an extension in ``*17`` and earlier modes.
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index 0d87df36ced0..eb8b58323da4 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -3033,13 +3033,6 @@ Further examples of injection vulnerabilities this checker can find.
     sprintf(buf, s); // warn: untrusted data used as a format string
   }
 
-  void test() {
-    size_t ts;
-    scanf("%zd", &ts); // 'ts' marked as tainted
-    int *p = (int *)malloc(ts * sizeof(int));
-      // warn: untrusted data used as buffer size
-  }
-
 There are built-in sources, propagations and sinks even if no external taint
 configuration is provided.
 
@@ -3067,9 +3060,7 @@ Default propagations rules:
 
 Default sinks:
  ``printf``, ``setproctitle``, ``system``, ``popen``, ``execl``, ``execle``,
- ``execlp``, ``execv``, ``execvp``, ``execvP``, ``execve``, ``dlopen``,
- ``memcpy``, ``memmove``, ``strncpy``, ``strndup``, ``malloc``, ``calloc``,
- ``alloca``, ``memccpy``, ``realloc``, ``bcopy``
+ ``execlp``, ``execv``, ``execvp``, ``execvP``, ``execve``, ``dlopen``
 
 Please note that there are no built-in filter functions.
 
diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt
index 2252d0ccde96..eaeadf2656b0 100644
--- a/clang/docs/tools/clang-formatted-files.txt
+++ b/clang/docs/tools/clang-formatted-files.txt
@@ -632,11 +632,12 @@ clang/unittests/Analysis/FlowSensitive/MapLatticeTest.cpp
 clang/unittests/Analysis/FlowSensitive/MatchSwitchTest.cpp
 clang/unittests/Analysis/FlowSensitive/MultiVarConstantPropagationTest.cpp
 clang/unittests/Analysis/FlowSensitive/SingleVarConstantPropagationTest.cpp
-clang/unittests/Analysis/FlowSensitive/SolverTest.cpp
+clang/unittests/Analysis/FlowSensitive/SolverTest.h
 clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp
 clang/unittests/Analysis/FlowSensitive/TestingSupport.h
 clang/unittests/Analysis/FlowSensitive/TestingSupportTest.cpp
 clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
+clang/unittests/Analysis/FlowSensitive/WatchedLiteralsSolverTest.cpp
 clang/unittests/AST/ASTImporterFixtures.cpp
 clang/unittests/AST/ASTImporterFixtures.h
 clang/unittests/AST/ASTImporterObjCTest.cpp
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 6dbd06251dda..e03b11219478 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -1116,7 +1116,8 @@ public:
   CanQualType BFloat16Ty;
   CanQualType Float16Ty; // C11 extension ISO/IEC TS 18661-3
   CanQualType VoidPtrTy, NullPtrTy;
-  CanQualType DependentTy, OverloadTy, BoundMemberTy, UnknownAnyTy;
+  CanQualType DependentTy, OverloadTy, BoundMemberTy, UnresolvedTemplateTy,
+      UnknownAnyTy;
   CanQualType BuiltinFnTy;
   CanQualType PseudoObjectTy, ARCUnbridgedCastTy;
   CanQualType ObjCBuiltinIdTy, ObjCBuiltinClassTy, ObjCBuiltinSelTy;
diff --git a/clang/include/clang/AST/BuiltinTypes.def b/clang/include/clang/AST/BuiltinTypes.def
index 0a36fdc5d9c0..444be4311a74 100644
--- a/clang/include/clang/AST/BuiltinTypes.def
+++ b/clang/include/clang/AST/BuiltinTypes.def
@@ -285,6 +285,9 @@ PLACEHOLDER_TYPE(Overload, OverloadTy)
 //   x->foo       # if only contains non-static members
 PLACEHOLDER_TYPE(BoundMember, BoundMemberTy)
 
+// The type of an unresolved template. Used in UnresolvedLookupExpr.
+PLACEHOLDER_TYPE(UnresolvedTemplate, UnresolvedTemplateTy)
+
 // The type of an expression which refers to a pseudo-object,
 // such as those introduced by Objective C's @property or
 // VS.NET's __property declarations.  A placeholder type.  The
diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index a53c27a99a8c..de8b923645f8 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -5049,6 +5049,11 @@ static constexpr StringRef getOpenMPVariantManglingSeparatorStr() {
   return "$ompvariant";
 }
 
+/// Returns whether the given FunctionDecl has an __arm[_locally]_streaming
+/// attribute.
+bool IsArmStreamingFunction(const FunctionDecl *FD,
+                            bool IncludeLocallyStreaming);
+
 } // namespace clang
 
 #endif // LLVM_CLANG_AST_DECL_H
diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h
index ab3f810b4519..fac65628ffed 100644
--- a/clang/include/clang/AST/ExprCXX.h
+++ b/clang/include/clang/AST/ExprCXX.h
@@ -3163,8 +3163,30 @@ public:
 /// This arises in several ways:
 ///   * we might be waiting for argument-dependent lookup;
 ///   * the name might resolve to an overloaded function;
+///   * the name might resolve to a non-function template; for example, in the
+///   following snippet, the return expression of the member function
+///   'foo()' might remain unresolved until instantiation:
+///
+/// \code
+/// struct P {
+///   template <class T> using I = T;
+/// };
+///
+/// struct Q {
+///   template <class T> int foo() {
+///     return T::template I<int>;
+///   }
+/// };
+/// \endcode
+///
+/// ...which is distinct from modeling function overloads, and therefore we use
+/// a different builtin type 'UnresolvedTemplate' to avoid confusion. This is
+/// done in Sema::BuildTemplateIdExpr.
+///
 /// and eventually:
 ///   * the lookup might have included a function template.
+///   * the unresolved template gets transformed in an instantiation or gets
+///   diagnosed for its direct use.
 ///
 /// These never include UnresolvedUsingValueDecls, which are always class
 /// members and therefore appear only in UnresolvedMemberLookupExprs.
diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h
index bb4cb1f5d508..3d0b1ab9d31e 100644
--- a/clang/include/clang/AST/OpenACCClause.h
+++ b/clang/include/clang/AST/OpenACCClause.h
@@ -26,14 +26,17 @@ class OpenACCClause {
 protected:
   OpenACCClause(OpenACCClauseKind K, SourceLocation BeginLoc,
                 SourceLocation EndLoc)
-      : Kind(K), Location(BeginLoc, EndLoc) {}
+      : Kind(K), Location(BeginLoc, EndLoc) {
+    assert(!BeginLoc.isInvalid() && !EndLoc.isInvalid() &&
+           "Begin and end location must be valid for OpenACCClause");
+      }
 
 public:
   OpenACCClauseKind getClauseKind() const { return Kind; }
   SourceLocation getBeginLoc() const { return Location.getBegin(); }
   SourceLocation getEndLoc() const { return Location.getEnd(); }
 
-  static bool classof(const OpenACCClause *) { return true; }
+  static bool classof(const OpenACCClause *) { return false; }
 
   using child_iterator = StmtIterator;
   using const_child_iterator = ConstStmtIterator;
@@ -60,6 +63,8 @@ protected:
       : OpenACCClause(K, BeginLoc, EndLoc), LParenLoc(LParenLoc) {}
 
 public:
+  static bool classof(const OpenACCClause *C);
+
   SourceLocation getLParenLoc() const { return LParenLoc; }
 
   child_range children() {
@@ -89,6 +94,9 @@ protected:
   }
 
 public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::Default;
+  }
   OpenACCDefaultClauseKind getDefaultClauseKind() const {
     return DefaultClauseKind;
   }
@@ -113,6 +121,8 @@ protected:
         ConditionExpr(ConditionExpr) {}
 
 public:
+  static bool classof(const OpenACCClause *C);
+
   bool hasConditionExpr() const { return ConditionExpr; }
   const Expr *getConditionExpr() const { return ConditionExpr; }
   Expr *getConditionExpr() { return ConditionExpr; }
@@ -140,6 +150,9 @@ protected:
                   Expr *ConditionExpr, SourceLocation EndLoc);
 
 public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::If;
+  }
   static OpenACCIfClause *Create(const ASTContext &C, SourceLocation BeginLoc,
                                  SourceLocation LParenLoc, Expr *ConditionExpr,
                                  SourceLocation EndLoc);
@@ -151,6 +164,9 @@ class OpenACCSelfClause : public OpenACCClauseWithCondition {
                     Expr *ConditionExpr, SourceLocation EndLoc);
 
 public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::Self;
+  }
   static OpenACCSelfClause *Create(const ASTContext &C, SourceLocation BeginLoc,
                                    SourceLocation LParenLoc,
                                    Expr *ConditionExpr, SourceLocation EndLoc);
@@ -177,6 +193,7 @@ protected:
   llvm::ArrayRef<Expr *> getExprs() const { return Exprs; }
 
 public:
+  static bool classof(const OpenACCClause *C);
   child_range children() {
     return child_range(reinterpret_cast<Stmt **>(Exprs.begin()),
                        reinterpret_cast<Stmt **>(Exprs.end()));
@@ -189,6 +206,49 @@ public:
   }
 };
 
+// Represents the 'devnum' and expressions lists for the 'wait' clause.
+class OpenACCWaitClause final
+    : public OpenACCClauseWithExprs,
+      public llvm::TrailingObjects<OpenACCWaitClause, Expr *> {
+  SourceLocation QueuesLoc;
+  OpenACCWaitClause(SourceLocation BeginLoc, SourceLocation LParenLoc,
+                    Expr *DevNumExpr, SourceLocation QueuesLoc,
+                    ArrayRef<Expr *> QueueIdExprs, SourceLocation EndLoc)
+      : OpenACCClauseWithExprs(OpenACCClauseKind::Wait, BeginLoc, LParenLoc,
+                               EndLoc),
+        QueuesLoc(QueuesLoc) {
+    // The first element of the trailing storage is always the devnum expr,
+    // whether it is used or not.
+    std::uninitialized_copy(&DevNumExpr, &DevNumExpr + 1,
+                            getTrailingObjects<Expr *>());
+    std::uninitialized_copy(QueueIdExprs.begin(), QueueIdExprs.end(),
+                            getTrailingObjects<Expr *>() + 1);
+    setExprs(
+        MutableArrayRef(getTrailingObjects<Expr *>(), QueueIdExprs.size() + 1));
+  }
+
+public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::Wait;
+  }
+  static OpenACCWaitClause *Create(const ASTContext &C, SourceLocation BeginLoc,
+                                   SourceLocation LParenLoc, Expr *DevNumExpr,
+                                   SourceLocation QueuesLoc,
+                                   ArrayRef<Expr *> QueueIdExprs,
+                                   SourceLocation EndLoc);
+
+  bool hasQueuesTag() const { return !QueuesLoc.isInvalid(); }
+  SourceLocation getQueuesLoc() const { return QueuesLoc; }
+  bool hasDevNumExpr() const { return getExprs()[0]; }
+  Expr *getDevNumExpr() const { return getExprs()[0]; }
+  llvm::ArrayRef<Expr *> getQueueIdExprs() {
+    return OpenACCClauseWithExprs::getExprs().drop_front();
+  }
+  llvm::ArrayRef<Expr *> getQueueIdExprs() const {
+    return OpenACCClauseWithExprs::getExprs().drop_front();
+  }
+};
+
 class OpenACCNumGangsClause final
     : public OpenACCClauseWithExprs,
       public llvm::TrailingObjects<OpenACCNumGangsClause, Expr *> {
@@ -203,6 +263,9 @@ class OpenACCNumGangsClause final
   }
 
 public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::NumGangs;
+  }
   static OpenACCNumGangsClause *
   Create(const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc,
          ArrayRef<Expr *> IntExprs, SourceLocation EndLoc);
@@ -227,10 +290,12 @@ protected:
                                  SourceLocation EndLoc)
       : OpenACCClauseWithExprs(K, BeginLoc, LParenLoc, EndLoc),
         IntExpr(IntExpr) {
-    setExprs(MutableArrayRef<Expr *>{&this->IntExpr, 1});
+    if (IntExpr)
+      setExprs(MutableArrayRef<Expr *>{&this->IntExpr, 1});
   }
 
 public:
+  static bool classof(const OpenACCClause *C);
   bool hasIntExpr() const { return !getExprs().empty(); }
   const Expr *getIntExpr() const {
     return hasIntExpr() ? getExprs()[0] : nullptr;
@@ -244,6 +309,9 @@ class OpenACCNumWorkersClause : public OpenACCClauseWithSingleIntExpr {
                           Expr *IntExpr, SourceLocation EndLoc);
 
 public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::NumWorkers;
+  }
   static OpenACCNumWorkersClause *Create(const ASTContext &C,
                                          SourceLocation BeginLoc,
                                          SourceLocation LParenLoc,
@@ -255,11 +323,28 @@ class OpenACCVectorLengthClause : public OpenACCClauseWithSingleIntExpr {
                             Expr *IntExpr, SourceLocation EndLoc);
 
 public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::VectorLength;
+  }
   static OpenACCVectorLengthClause *
   Create(const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc,
          Expr *IntExpr, SourceLocation EndLoc);
 };
 
+class OpenACCAsyncClause : public OpenACCClauseWithSingleIntExpr {
+  OpenACCAsyncClause(SourceLocation BeginLoc, SourceLocation LParenLoc,
+                     Expr *IntExpr, SourceLocation EndLoc);
+
+public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::Async;
+  }
+  static OpenACCAsyncClause *Create(const ASTContext &C,
+                                    SourceLocation BeginLoc,
+                                    SourceLocation LParenLoc, Expr *IntExpr,
+                                    SourceLocation EndLoc);
+};
+
 /// Represents a clause with one or more 'var' objects, represented as an expr,
 /// as its arguments. Var-list is expected to be stored in trailing storage.
 /// For now, we're just storing the original expression in its entirety, unlike
@@ -271,6 +356,7 @@ protected:
       : OpenACCClauseWithExprs(K, BeginLoc, LParenLoc, EndLoc) {}
 
 public:
+  static bool classof(const OpenACCClause *C);
   ArrayRef<Expr *> getVarList() { return getExprs(); }
   ArrayRef<Expr *> getVarList() const { return getExprs(); }
 };
@@ -289,11 +375,249 @@ class OpenACCPrivateClause final
   }
 
 public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::Private;
+  }
   static OpenACCPrivateClause *
   Create(const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc,
          ArrayRef<Expr *> VarList, SourceLocation EndLoc);
 };
 
+class OpenACCFirstPrivateClause final
+    : public OpenACCClauseWithVarList,
+      public llvm::TrailingObjects<OpenACCFirstPrivateClause, Expr *> {
+
+  OpenACCFirstPrivateClause(SourceLocation BeginLoc, SourceLocation LParenLoc,
+                            ArrayRef<Expr *> VarList, SourceLocation EndLoc)
+      : OpenACCClauseWithVarList(OpenACCClauseKind::FirstPrivate, BeginLoc,
+                                 LParenLoc, EndLoc) {
+    std::uninitialized_copy(VarList.begin(), VarList.end(),
+                            getTrailingObjects<Expr *>());
+    setExprs(MutableArrayRef(getTrailingObjects<Expr *>(), VarList.size()));
+  }
+
+public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::FirstPrivate;
+  }
+  static OpenACCFirstPrivateClause *
+  Create(const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc,
+         ArrayRef<Expr *> VarList, SourceLocation EndLoc);
+};
+
+class OpenACCDevicePtrClause final
+    : public OpenACCClauseWithVarList,
+      public llvm::TrailingObjects<OpenACCDevicePtrClause, Expr *> {
+
+  OpenACCDevicePtrClause(SourceLocation BeginLoc, SourceLocation LParenLoc,
+                         ArrayRef<Expr *> VarList, SourceLocation EndLoc)
+      : OpenACCClauseWithVarList(OpenACCClauseKind::DevicePtr, BeginLoc,
+                                 LParenLoc, EndLoc) {
+    std::uninitialized_copy(VarList.begin(), VarList.end(),
+                            getTrailingObjects<Expr *>());
+    setExprs(MutableArrayRef(getTrailingObjects<Expr *>(), VarList.size()));
+  }
+
+public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::DevicePtr;
+  }
+  static OpenACCDevicePtrClause *
+  Create(const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc,
+         ArrayRef<Expr *> VarList, SourceLocation EndLoc);
+};
+
+class OpenACCAttachClause final
+    : public OpenACCClauseWithVarList,
+      public llvm::TrailingObjects<OpenACCAttachClause, Expr *> {
+
+  OpenACCAttachClause(SourceLocation BeginLoc, SourceLocation LParenLoc,
+                      ArrayRef<Expr *> VarList, SourceLocation EndLoc)
+      : OpenACCClauseWithVarList(OpenACCClauseKind::Attach, BeginLoc, LParenLoc,
+                                 EndLoc) {
+    std::uninitialized_copy(VarList.begin(), VarList.end(),
+                            getTrailingObjects<Expr *>());
+    setExprs(MutableArrayRef(getTrailingObjects<Expr *>(), VarList.size()));
+  }
+
+public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::Attach;
+  }
+  static OpenACCAttachClause *
+  Create(const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc,
+         ArrayRef<Expr *> VarList, SourceLocation EndLoc);
+};
+
+class OpenACCNoCreateClause final
+    : public OpenACCClauseWithVarList,
+      public llvm::TrailingObjects<OpenACCNoCreateClause, Expr *> {
+
+  OpenACCNoCreateClause(SourceLocation BeginLoc, SourceLocation LParenLoc,
+                        ArrayRef<Expr *> VarList, SourceLocation EndLoc)
+      : OpenACCClauseWithVarList(OpenACCClauseKind::NoCreate, BeginLoc,
+                                 LParenLoc, EndLoc) {
+    std::uninitialized_copy(VarList.begin(), VarList.end(),
+                            getTrailingObjects<Expr *>());
+    setExprs(MutableArrayRef(getTrailingObjects<Expr *>(), VarList.size()));
+  }
+
+public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::NoCreate;
+  }
+  static OpenACCNoCreateClause *
+  Create(const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc,
+         ArrayRef<Expr *> VarList, SourceLocation EndLoc);
+};
+
+class OpenACCPresentClause final
+    : public OpenACCClauseWithVarList,
+      public llvm::TrailingObjects<OpenACCPresentClause, Expr *> {
+
+  OpenACCPresentClause(SourceLocation BeginLoc, SourceLocation LParenLoc,
+                       ArrayRef<Expr *> VarList, SourceLocation EndLoc)
+      : OpenACCClauseWithVarList(OpenACCClauseKind::Present, BeginLoc,
+                                 LParenLoc, EndLoc) {
+    std::uninitialized_copy(VarList.begin(), VarList.end(),
+                            getTrailingObjects<Expr *>());
+    setExprs(MutableArrayRef(getTrailingObjects<Expr *>(), VarList.size()));
+  }
+
+public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::Present;
+  }
+  static OpenACCPresentClause *
+  Create(const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc,
+         ArrayRef<Expr *> VarList, SourceLocation EndLoc);
+};
+
+class OpenACCCopyClause final
+    : public OpenACCClauseWithVarList,
+      public llvm::TrailingObjects<OpenACCCopyClause, Expr *> {
+
+  OpenACCCopyClause(OpenACCClauseKind Spelling, SourceLocation BeginLoc,
+                    SourceLocation LParenLoc, ArrayRef<Expr *> VarList,
+                    SourceLocation EndLoc)
+      : OpenACCClauseWithVarList(Spelling, BeginLoc, LParenLoc, EndLoc) {
+    assert((Spelling == OpenACCClauseKind::Copy ||
+            Spelling == OpenACCClauseKind::PCopy ||
+            Spelling == OpenACCClauseKind::PresentOrCopy) &&
+           "Invalid clause kind for copy-clause");
+    std::uninitialized_copy(VarList.begin(), VarList.end(),
+                            getTrailingObjects<Expr *>());
+    setExprs(MutableArrayRef(getTrailingObjects<Expr *>(), VarList.size()));
+  }
+
+public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::Copy ||
+           C->getClauseKind() == OpenACCClauseKind::PCopy ||
+           C->getClauseKind() == OpenACCClauseKind::PresentOrCopy;
+  }
+  static OpenACCCopyClause *
+  Create(const ASTContext &C, OpenACCClauseKind Spelling,
+         SourceLocation BeginLoc, SourceLocation LParenLoc,
+         ArrayRef<Expr *> VarList, SourceLocation EndLoc);
+};
+
+class OpenACCCopyInClause final
+    : public OpenACCClauseWithVarList,
+      public llvm::TrailingObjects<OpenACCCopyInClause, Expr *> {
+  bool IsReadOnly;
+
+  OpenACCCopyInClause(OpenACCClauseKind Spelling, SourceLocation BeginLoc,
+                      SourceLocation LParenLoc, bool IsReadOnly,
+                      ArrayRef<Expr *> VarList, SourceLocation EndLoc)
+      : OpenACCClauseWithVarList(Spelling, BeginLoc, LParenLoc, EndLoc),
+        IsReadOnly(IsReadOnly) {
+    assert((Spelling == OpenACCClauseKind::CopyIn ||
+            Spelling == OpenACCClauseKind::PCopyIn ||
+            Spelling == OpenACCClauseKind::PresentOrCopyIn) &&
+           "Invalid clause kind for copyin-clause");
+    std::uninitialized_copy(VarList.begin(), VarList.end(),
+                            getTrailingObjects<Expr *>());
+    setExprs(MutableArrayRef(getTrailingObjects<Expr *>(), VarList.size()));
+  }
+
+public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::CopyIn ||
+           C->getClauseKind() == OpenACCClauseKind::PCopyIn ||
+           C->getClauseKind() == OpenACCClauseKind::PresentOrCopyIn;
+  }
+  bool isReadOnly() const { return IsReadOnly; }
+  static OpenACCCopyInClause *
+  Create(const ASTContext &C, OpenACCClauseKind Spelling,
+         SourceLocation BeginLoc, SourceLocation LParenLoc, bool IsReadOnly,
+         ArrayRef<Expr *> VarList, SourceLocation EndLoc);
+};
+
+class OpenACCCopyOutClause final
+    : public OpenACCClauseWithVarList,
+      public llvm::TrailingObjects<OpenACCCopyOutClause, Expr *> {
+  bool IsZero;
+
+  OpenACCCopyOutClause(OpenACCClauseKind Spelling, SourceLocation BeginLoc,
+                       SourceLocation LParenLoc, bool IsZero,
+                       ArrayRef<Expr *> VarList, SourceLocation EndLoc)
+      : OpenACCClauseWithVarList(Spelling, BeginLoc, LParenLoc, EndLoc),
+        IsZero(IsZero) {
+    assert((Spelling == OpenACCClauseKind::CopyOut ||
+            Spelling == OpenACCClauseKind::PCopyOut ||
+            Spelling == OpenACCClauseKind::PresentOrCopyOut) &&
+           "Invalid clause kind for copyout-clause");
+    std::uninitialized_copy(VarList.begin(), VarList.end(),
+                            getTrailingObjects<Expr *>());
+    setExprs(MutableArrayRef(getTrailingObjects<Expr *>(), VarList.size()));
+  }
+
+public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::CopyOut ||
+           C->getClauseKind() == OpenACCClauseKind::PCopyOut ||
+           C->getClauseKind() == OpenACCClauseKind::PresentOrCopyOut;
+  }
+  bool isZero() const { return IsZero; }
+  static OpenACCCopyOutClause *
+  Create(const ASTContext &C, OpenACCClauseKind Spelling,
+         SourceLocation BeginLoc, SourceLocation LParenLoc, bool IsZero,
+         ArrayRef<Expr *> VarList, SourceLocation EndLoc);
+};
+
+class OpenACCCreateClause final
+    : public OpenACCClauseWithVarList,
+      public llvm::TrailingObjects<OpenACCCreateClause, Expr *> {
+  bool IsZero;
+
+  OpenACCCreateClause(OpenACCClauseKind Spelling, SourceLocation BeginLoc,
+                      SourceLocation LParenLoc, bool IsZero,
+                      ArrayRef<Expr *> VarList, SourceLocation EndLoc)
+      : OpenACCClauseWithVarList(Spelling, BeginLoc, LParenLoc, EndLoc),
+        IsZero(IsZero) {
+    assert((Spelling == OpenACCClauseKind::Create ||
+            Spelling == OpenACCClauseKind::PCreate ||
+            Spelling == OpenACCClauseKind::PresentOrCreate) &&
+           "Invalid clause kind for create-clause");
+    std::uninitialized_copy(VarList.begin(), VarList.end(),
+                            getTrailingObjects<Expr *>());
+    setExprs(MutableArrayRef(getTrailingObjects<Expr *>(), VarList.size()));
+  }
+
+public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::Create ||
+           C->getClauseKind() == OpenACCClauseKind::PCreate ||
+           C->getClauseKind() == OpenACCClauseKind::PresentOrCreate;
+  }
+  bool isZero() const { return IsZero; }
+  static OpenACCCreateClause *
+  Create(const ASTContext &C, OpenACCClauseKind Spelling,
+         SourceLocation BeginLoc, SourceLocation LParenLoc, bool IsZero,
+         ArrayRef<Expr *> VarList, SourceLocation EndLoc);
+};
+
 template <class Impl> class OpenACCClauseVisitor {
   Impl &getDerived() { return static_cast<Impl &>(*this); }
 
@@ -312,6 +636,10 @@ public:
   case OpenACCClauseKind::CLAUSE_NAME:                                         \
     Visit##CLAUSE_NAME##Clause(*cast<OpenACC##CLAUSE_NAME##Clause>(C));        \
     return;
+#define CLAUSE_ALIAS(ALIAS_NAME, CLAUSE_NAME)                                  \
+  case OpenACCClauseKind::ALIAS_NAME:                                          \
+    Visit##CLAUSE_NAME##Clause(*cast<OpenACC##CLAUSE_NAME##Clause>(C));        \
+    return;
 #include "clang/Basic/OpenACCClauses.def"
 
     default:
diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h
index 66f8f844e0b2..b706864798ba 100644
--- a/clang/include/clang/AST/StmtOpenACC.h
+++ b/clang/include/clang/AST/StmtOpenACC.h
@@ -93,6 +93,10 @@ protected:
   }
 
 public:
+  static bool classof(const Stmt *T) {
+    return false;
+  }
+
   child_range children() {
     if (getAssociatedStmt())
       return child_range(&AssociatedStmt, &AssociatedStmt + 1);
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
index 67eccdd030dc..763af2445476 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
@@ -283,9 +283,8 @@ llvm::Expected<llvm::SmallVector<Diagnostic>> diagnoseFunction(
   if (!Context)
     return Context.takeError();
 
-  auto OwnedSolver = std::make_unique<WatchedLiteralsSolver>(MaxSATIterations);
-  const WatchedLiteralsSolver *Solver = OwnedSolver.get();
-  DataflowAnalysisContext AnalysisContext(std::move(OwnedSolver));
+  auto Solver = std::make_unique<WatchedLiteralsSolver>(MaxSATIterations);
+  DataflowAnalysisContext AnalysisContext(*Solver);
   Environment Env(AnalysisContext, FuncDecl);
   AnalysisT Analysis = createAnalysis<AnalysisT>(ASTCtx, Env);
   llvm::SmallVector<Diagnostic> Diagnostics;
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
index aa2c366cb164..5be4a1145f40 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
@@ -67,7 +67,19 @@ public:
   DataflowAnalysisContext(std::unique_ptr<Solver> S,
                           Options Opts = Options{
                               /*ContextSensitiveOpts=*/std::nullopt,
-                              /*Logger=*/nullptr});
+                              /*Logger=*/nullptr})
+      : DataflowAnalysisContext(*S, std::move(S), Opts) {}
+
+  /// Constructs a dataflow analysis context.
+  ///
+  /// Requirements:
+  ///
+  ///  `S` must outlive the `DataflowAnalysisContext`.
+  DataflowAnalysisContext(Solver &S, Options Opts = Options{
+                                         /*ContextSensitiveOpts=*/std::nullopt,
+                                         /*Logger=*/nullptr})
+      : DataflowAnalysisContext(S, nullptr, Opts) {}
+
   ~DataflowAnalysisContext();
 
   /// Sets a callback that returns the names and types of the synthetic fields
@@ -209,6 +221,13 @@ private:
     using DenseMapInfo::isEqual;
   };
 
+  /// `S` is the solver to use. `OwnedSolver` may be:
+  /// *  Null (in which case `S` is non-onwed and must outlive this object), or
+  /// *  Non-null (in which case it must refer to `S`, and the
+  ///    `DataflowAnalysisContext will take ownership of `OwnedSolver`).
+  DataflowAnalysisContext(Solver &S, std::unique_ptr<Solver> &&OwnedSolver,
+                          Options Opts);
+
   // Extends the set of modeled field declarations.
   void addModeledFields(const FieldSet &Fields);
 
@@ -232,7 +251,8 @@ private:
            Solver::Result::Status::Unsatisfiable;
   }
 
-  std::unique_ptr<Solver> S;
+  Solver &S;
+  std::unique_ptr<Solver> OwnedSolver;
   std::unique_ptr<Arena> A;
 
   // Maps from program declarations and statements to storage locations that are
diff --git a/clang/include/clang/Analysis/FlowSensitive/Solver.h b/clang/include/clang/Analysis/FlowSensitive/Solver.h
index 079f6802f241..6166a503ab41 100644
--- a/clang/include/clang/Analysis/FlowSensitive/Solver.h
+++ b/clang/include/clang/Analysis/FlowSensitive/Solver.h
@@ -87,6 +87,9 @@ public:
   ///
   ///  All elements in `Vals` must not be null.
   virtual Result solve(llvm::ArrayRef<const Formula *> Vals) = 0;
+
+  // Did the solver reach its resource limit?
+  virtual bool reachedLimit() const = 0;
 };
 
 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Solver::Result &);
diff --git a/clang/include/clang/Analysis/FlowSensitive/WatchedLiteralsSolver.h b/clang/include/clang/Analysis/FlowSensitive/WatchedLiteralsSolver.h
index 5448eecf6d41..b5cd7aa10fd7 100644
--- a/clang/include/clang/Analysis/FlowSensitive/WatchedLiteralsSolver.h
+++ b/clang/include/clang/Analysis/FlowSensitive/WatchedLiteralsSolver.h
@@ -48,8 +48,7 @@ public:
 
   Result solve(llvm::ArrayRef<const Formula *> Vals) override;
 
-  // The solver reached its maximum number of iterations.
-  bool reachedLimit() const { return MaxIterations == 0; }
+  bool reachedLimit() const override { return MaxIterations == 0; }
 };
 
 } // namespace dataflow
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 0225598cbbe8..52552ba48856 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -4415,6 +4415,18 @@ def HLSLResourceBinding: InheritableAttr {
   let Documentation = [HLSLResourceBindingDocs];
 }
 
+def HLSLPackOffset: HLSLAnnotationAttr {
+  let Spellings = [HLSLAnnotation<"packoffset">];
+  let LangOpts = [HLSL];
+  let Args = [IntArgument<"Subcomponent">, IntArgument<"Component">];
+  let Documentation = [HLSLPackOffsetDocs];
+  let AdditionalMembers = [{
+      unsigned getOffset() {
+        return subcomponent * 4 + component;
+      }
+  }];
+}
+
 def HLSLSV_DispatchThreadID: HLSLAnnotationAttr {
   let Spellings = [HLSLAnnotation<"SV_DispatchThreadID">];
   let Subjects = SubjectList<[ParmVar, Field]>;
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index a0bbe5861c57..f351822ac74b 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2527,6 +2527,9 @@ Example "subtarget features" from the x86 backend include: "mmx", "sse", "sse4.2
 "avx", "xop" and largely correspond to the machine specific options handled by
 the front end.
 
+Note that this attribute does not apply transitively to nested functions such
+as blocks or C++ lambdas.
+
 Additionally, this attribute supports function multiversioning for ELF based
 x86/x86-64 targets, which can be used to create multiple implementations of the
 same function that will be resolved at runtime based on the priority of their
@@ -3782,6 +3785,12 @@ for that function.
 
 This attribute is incompatible with the ``always_inline`` and ``minsize``
 attributes.
+
+Note that this attribute does not apply recursively to nested functions such as
+lambdas or blocks when using declaration-specific attribute syntaxes such as double
+square brackets (``[[]]``) or ``__attribute__``. The ``#pragma`` syntax can be
+used to apply the attribute to all functions, including nested functions, in a
+range of source code.
   }];
 }
 
@@ -5654,11 +5663,12 @@ The ``preserve_none`` calling convention tries to preserve as few general
 registers as possible. So all general registers are caller saved registers. It
 also uses more general registers to pass arguments. This attribute doesn't
 impact floating-point registers (XMMs/YMMs). Floating-point registers still
-follow the c calling convention.
+follow the c calling convention. ``preserve_none``'s ABI is still unstable, and
+may be changed in the future.
 
 - Only RSP and RBP are preserved by callee.
 
-- Register RDI, RSI, RDX, RCX, R8, R9, R11, R12, R13, R14, R15 and RAX now can
+- Register R12, R13, R14, R15, RDI, RSI, RDX, RCX, R8, R9, R11, and RAX now can
   be used to pass function arguments.
   }];
 }
@@ -7398,6 +7408,26 @@ The full documentation is available here: https://docs.microsoft.com/en-us/windo
   }];
 }
 
+def HLSLPackOffsetDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+The packoffset attribute is used to change the layout of a cbuffer.
+Attribute spelling in HLSL is: ``packoffset( c[Subcomponent][.component] )``.
+A subcomponent is a register number, which is an integer. A component is in the form of [.xyzw].
+
+Examples:
+
+.. code-block:: c++
+
+  cbuffer A {
+    float3 a : packoffset(c0.y);
+    float4 b : packoffset(c4);
+  }
+
+The full documentation is available here: https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-variable-packoffset
+  }];
+}
+
 def HLSLSV_DispatchThreadIDDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index de721a87b334..11982af3fa60 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1326,6 +1326,12 @@ def ElementwiseSqrt : Builtin {
   let Prototype = "void(...)";
 }
 
+def ElementwiseTan : Builtin {
+  let Spellings = ["__builtin_elementwise_tan"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Prototype = "void(...)";
+}
+
 def ElementwiseTrunc : Builtin {
   let Spellings = ["__builtin_elementwise_trunc"];
   let Attributes = [NoThrow, Const, CustomTypeChecking];
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 8d3c5e69d55c..9e243d740ed7 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -61,7 +61,9 @@
 #pragma push_macro("PTX81")
 #pragma push_macro("PTX82")
 #pragma push_macro("PTX83")
-#define PTX83 "ptx83"
+#pragma push_macro("PTX84")
+#define PTX84 "ptx84"
+#define PTX83 "ptx83|" PTX84
 #define PTX82 "ptx82|" PTX83
 #define PTX81 "ptx81|" PTX82
 #define PTX80 "ptx80|" PTX81
@@ -1091,3 +1093,4 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78))
 #pragma pop_macro("PTX81")
 #pragma pop_macro("PTX82")
 #pragma pop_macro("PTX83")
+#pragma pop_macro("PTX84")
diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index 7e950914ad94..8645cff1e867 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -190,6 +190,10 @@ TARGET_BUILTIN(__builtin_wasm_relaxed_dot_i8x16_i7x16_s_i16x8, "V8sV16ScV16Sc",
 TARGET_BUILTIN(__builtin_wasm_relaxed_dot_i8x16_i7x16_add_s_i32x4, "V4iV16ScV16ScV4i", "nc", "relaxed-simd")
 TARGET_BUILTIN(__builtin_wasm_relaxed_dot_bf16x8_add_f32_f32x4, "V4fV8UsV8UsV4f", "nc", "relaxed-simd")
 
+// Half-Precision (fp16)
+TARGET_BUILTIN(__builtin_wasm_loadf16_f32, "fh*", "nU", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_storef16_f32, "vfh*", "n", "half-precision")
+
 // Reference Types builtins
 // Some builtins are custom type-checked - see 't' as part of the third argument,
 // in which case the argument spec (second argument) is unused.
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 340b08dd7e2a..07b0ca1691a6 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -57,6 +57,7 @@ CODEGENOPT(UniqueSectionNames, 1, 1) ///< Set for -funique-section-names.
 CODEGENOPT(UniqueBasicBlockSectionNames, 1, 1) ///< Set for -funique-basic-block-section-names,
                                                ///< Produce unique section names with
                                                ///< basic block sections.
+CODEGENOPT(SeparateNamedSections, 1, 0) ///< Set for -fseparate-named-sections.
 CODEGENOPT(EnableAIXExtendedAltivecABI, 1, 0) ///< Set for -mabi=vec-extabi. Enables the extended Altivec ABI on AIX.
 CODEGENOPT(XCOFFReadOnlyPointers, 1, 0) ///< Set for -mxcoff-roptr.
 CODEGENOPT(AllTocData, 1, 0) ///< AIX -mtocdata
@@ -308,6 +309,7 @@ CODEGENOPT(UnrollLoops       , 1, 0) ///< Control whether loops are unrolled.
 CODEGENOPT(RerollLoops       , 1, 0) ///< Control whether loops are rerolled.
 CODEGENOPT(NoUseJumpTables   , 1, 0) ///< Set when -fno-jump-tables is enabled.
 VALUE_CODEGENOPT(UnwindTables, 2, 0) ///< Unwind tables (1) or asynchronous unwind tables (2)
+CODEGENOPT(LinkBitcodePostopt, 1, 0) ///< Link builtin bitcodes after optimization pipeline.
 CODEGENOPT(VectorizeLoop     , 1, 0) ///< Run loop vectorizer.
 CODEGENOPT(VectorizeSLP      , 1, 0) ///< Run SLP vectorizer.
 CODEGENOPT(ProfileSampleAccurate, 1, 0) ///< Sample profile is accurate.
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index ba0e4465a0f5..2d67c4181d12 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -41,9 +41,10 @@ enum class CudaVersion {
   CUDA_121,
   CUDA_122,
   CUDA_123,
+  CUDA_124,
   FULLY_SUPPORTED = CUDA_123,
   PARTIALLY_SUPPORTED =
-      CUDA_123, // Partially supported. Proceed with a warning.
+      CUDA_124, // Partially supported. Proceed with a warning.
   NEW = 10000,  // Too new. Issue a warning, but allow using it.
 };
 const char *CudaVersionToString(CudaVersion V);
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index ed3fd9b1c4a5..9781fcaa4ff5 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -435,7 +435,7 @@ def warn_drv_diagnostics_misexpect_requires_pgo : Warning<
 def warn_drv_clang_unsupported : Warning<
   "the clang compiler does not support '%0'">;
 def warn_drv_deprecated_arg : Warning<
-  "argument '%0' is deprecated, use '%1' instead">, InGroup<Deprecated>;
+  "argument '%0' is deprecated%select{|, use '%2' instead}1">, InGroup<Deprecated>;
 def warn_drv_deprecated_custom : Warning<
   "argument '%0' is deprecated, %1">, InGroup<Deprecated>;
 def warn_drv_assuming_mfloat_abi_is : Warning<
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index fcffadacc8e6..e456ec2cac46 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -134,6 +134,8 @@ def err_fe_no_pch_in_dir : Error<
     "no suitable precompiled header file found in directory '%0'">;
 def err_fe_action_not_available : Error<
     "action %0 not compiled in">;
+def err_fe_invalid_multiple_actions : Error<
+    "'%0' action ignored; '%1' action specified previously">;
 def err_fe_invalid_alignment : Error<
     "invalid value '%1' in '%0'; alignment must be a power of 2">;
 def err_fe_invalid_exception_model
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 60f87da2a738..2beb1d45124b 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -1507,6 +1507,9 @@ def BranchProtection : DiagGroup<"branch-protection">;
 // Warnings for HLSL Clang extensions
 def HLSLExtension : DiagGroup<"hlsl-extensions">;
 
+// Warning for mix packoffset and non-packoffset.
+def HLSLMixPackOffset : DiagGroup<"mix-packoffset">;
+
 // Warnings for DXIL validation
 def DXILValidation : DiagGroup<"dxil-validation">;
 
diff --git a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
index 6896e0f5aa59..674742431dcb 100644
--- a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
+++ b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
@@ -25,6 +25,7 @@ def err_unsupported_vendor : Error<"vendor '%0' is not supported: '%1'">;
 def err_unsupported_environment : Error<"environment '%0' is not supported: '%1'">;
 def err_unsupported_os : Error<"os '%0' is not supported: '%1'">;
 def err_cannot_read_input_list : Error<"could not read %select{alias list|filelist}0 '%1': %2">;
+def err_invalid_label: Error<"label '%0' is reserved: use a different label name for -X<label>">;
 } // end of command line category.
 
 let CategoryName = "Verification" in {
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 44bc4e0e130d..8316845844cb 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1759,5 +1759,7 @@ def err_hlsl_separate_attr_arg_and_number : Error<"wrong argument format for hls
 def ext_hlsl_access_specifiers : ExtWarn<
   "access specifiers are a clang HLSL extension">,
   InGroup<HLSLExtension>;
+def err_hlsl_unsupported_component : Error<"invalid component '%0' used; expected 'x', 'y', 'z', or 'w'">;
+def err_hlsl_packoffset_invalid_reg : Error<"invalid resource class specifier '%0' for packoffset, expected 'c'">;
 
 } // end of Parser diagnostics
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 4b074b853bfe..9e82130c9360 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -5437,6 +5437,11 @@ def note_function_template_spec_matched : Note<
 def err_function_template_partial_spec : Error<
     "function template partial specialization is not allowed">;
 
+def err_function_member_spec_ambiguous : Error<
+    "ambiguous member function specialization %q0 of %q1">;
+def note_function_member_spec_matched : Note<
+    "member function specialization matches %0">;
+
 // C++ Template Instantiation
 def err_template_recursion_depth_exceeded : Error<
   "recursive template instantiation exceeded maximum depth of %0">,
@@ -7511,6 +7516,9 @@ def err_nested_non_static_member_use : Error<
 def warn_cxx98_compat_non_static_member_use : Warning<
   "use of non-static data member %0 in an unevaluated context is "
   "incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore;
+def err_form_ptr_to_member_from_parenthesized_expr : Error<
+  "cannot form pointer to member from a parenthesized expression; "
+  "did you mean to remove the parentheses?">;
 def err_invalid_incomplete_type_use : Error<
   "invalid use of incomplete type %0">;
 def err_builtin_func_cast_more_than_one_arg : Error<
@@ -10027,12 +10035,6 @@ def warn_new_dangling_initializer_list : Warning<
   "the allocated initializer list}0 "
   "will be destroyed at the end of the full-expression">,
   InGroup<DanglingInitializerList>;
-def warn_unsupported_lifetime_extension : Warning<
-  "lifetime extension of "
-  "%select{temporary|backing array of initializer list}0 created "
-  "by aggregate initialization using a default member initializer "
-  "is not yet supported; lifetime of %select{temporary|backing array}0 "
-  "will end at the end of the full-expression">, InGroup<Dangling>;
 
 // For non-floating point, expressions of the form x == x or x != x
 // should result in a warning, since these always evaluate to a constant.
@@ -12176,6 +12178,11 @@ def err_hlsl_init_priority_unsupported : Error<
 def err_hlsl_unsupported_register_type : Error<"invalid resource class specifier '%0' used; expected 'b', 's', 't', or 'u'">;
 def err_hlsl_unsupported_register_number : Error<"register number should be an integer">;
 def err_hlsl_expected_space : Error<"invalid space specifier '%0' used; expected 'space' followed by an integer, like space1">;
+def warn_hlsl_packoffset_mix : Warning<"cannot mix packoffset elements with nonpackoffset elements in a cbuffer">,
+    InGroup<HLSLMixPackOffset>;
+def err_hlsl_packoffset_overlap : Error<"packoffset overlap between %0, %1">;
+def err_hlsl_packoffset_cross_reg_boundary : Error<"packoffset cannot cross register boundary">;
+def err_hlsl_packoffset_alignment_mismatch : Error<"packoffset at 'y' not match alignment %0 required by %1">;
 def err_hlsl_pointers_unsupported : Error<
   "%select{pointers|references}0 are unsupported in HLSL">;
 
@@ -12291,8 +12298,8 @@ def warn_acc_if_self_conflict
               "evaluates to true">,
       InGroup<DiagGroup<"openacc-self-if-potential-conflict">>;
 def err_acc_int_expr_requires_integer
-    : Error<"OpenACC %select{clause|directive}0 '%1' requires expression of "
-            "integer type (%2 invalid)">;
+    : Error<"OpenACC %select{clause '%1'|directive '%2'|sub-array bound}0 "
+            "requires expression of integer type (%3 invalid)">;
 def err_acc_int_expr_incomplete_class_type
     : Error<"OpenACC integer expression has incomplete class type %0">;
 def err_acc_int_expr_explicit_conversion
@@ -12310,4 +12317,31 @@ def err_acc_num_gangs_num_args
 def err_acc_not_a_var_ref
     : Error<"OpenACC variable is not a valid variable name, sub-array, array "
             "element, or composite variable member">;
+def err_acc_typecheck_subarray_value
+    : Error<"OpenACC sub-array subscripted value is not an array or pointer">;
+def err_acc_subarray_function_type
+    : Error<"OpenACC sub-array cannot be of function type %0">;
+def err_acc_subarray_incomplete_type
+    : Error<"OpenACC sub-array base is of incomplete type %0">;
+def err_acc_subarray_no_length
+    : Error<"OpenACC sub-array length is unspecified and cannot be inferred "
+            "because the subscripted value is %select{not an array|an array of "
+            "unknown bound}0">;
+def err_acc_subarray_negative
+    : Error<"OpenACC sub-array %select{lower bound|length}0 evaluated to "
+            "negative value %1">;
+def err_acc_subarray_out_of_range
+    : Error<"OpenACC sub-array %select{lower bound|length}0 evaluated to a "
+            "value (%1) that would be out of the range of the subscripted "
+            "array size of %2">;
+def err_acc_subarray_base_plus_length_out_of_range
+    : Error<"OpenACC sub-array specified range [%0:%1] would be out of the "
+            "range of the subscripted array size of %2">;
+def warn_acc_deprecated_alias_name
+    : Warning<"OpenACC clause name '%0' is a deprecated clause name and is "
+              "now an alias for '%1'">,
+      InGroup<DiagGroup<"openacc-deprecated-clause-alias">>;
+def err_acc_var_not_pointer_type
+    : Error<"expected pointer in '%0' clause, type is %1">;
+def note_acc_expected_pointer_var : Note<"expected variable of pointer type">;
 } // end of sema component.
diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
index fe4d1c4afcca..b762e44e755e 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -103,6 +103,12 @@ FEATURE(thread_sanitizer, LangOpts.Sanitize.has(SanitizerKind::Thread))
 FEATURE(dataflow_sanitizer, LangOpts.Sanitize.has(SanitizerKind::DataFlow))
 FEATURE(scudo, LangOpts.Sanitize.hasOneOf(SanitizerKind::Scudo))
 FEATURE(ptrauth_intrinsics, LangOpts.PointerAuthIntrinsics)
+FEATURE(ptrauth_calls, LangOpts.PointerAuthCalls)
+FEATURE(ptrauth_returns, LangOpts.PointerAuthReturns)
+FEATURE(ptrauth_vtable_pointer_address_discrimination, LangOpts.PointerAuthVTPtrAddressDiscrimination)
+FEATURE(ptrauth_vtable_pointer_type_discrimination, LangOpts.PointerAuthVTPtrTypeDiscrimination)
+FEATURE(ptrauth_member_function_pointer_type_discrimination, LangOpts.PointerAuthCalls)
+FEATURE(ptrauth_init_fini, LangOpts.PointerAuthInitFini)
 EXTENSION(swiftcc,
   PP.getTargetInfo().checkCallingConvention(CC_Swift) ==
   clang::TargetInfo::CCCR_OK)
diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h
index a893e6f4d3d3..ae9ebd9f5915 100644
--- a/clang/include/clang/Basic/IdentifierTable.h
+++ b/clang/include/clang/Basic/IdentifierTable.h
@@ -738,7 +738,7 @@ public:
     II->Entry = &Entry;
 
     // If this is the 'import' contextual keyword, mark it as such.
-    if (Name.equals("import"))
+    if (Name == "import")
       II->setModulesImport(true);
 
     return *II;
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 55c81eab1ec1..09eb92d6f10d 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -158,10 +158,16 @@ LANGOPT(GNUAsm            , 1, 1, "GNU-style inline assembly")
 LANGOPT(Coroutines        , 1, 0, "C++20 coroutines")
 LANGOPT(CoroAlignedAllocation, 1, 0, "prefer Aligned Allocation according to P2014 Option 2")
 LANGOPT(DllExportInlines  , 1, 1, "dllexported classes dllexport inline methods")
-LANGOPT(RelaxedTemplateTemplateArgs, 1, 0, "C++17 relaxed matching of template template arguments")
+LANGOPT(RelaxedTemplateTemplateArgs, 1, 1, "C++17 relaxed matching of template template arguments")
 LANGOPT(ExperimentalLibrary, 1, 0, "enable unstable and experimental library features")
 
 LANGOPT(PointerAuthIntrinsics, 1, 0, "pointer authentication intrinsics")
+LANGOPT(PointerAuthCalls  , 1, 0, "function pointer authentication")
+LANGOPT(PointerAuthReturns, 1, 0, "return pointer authentication")
+LANGOPT(PointerAuthAuthTraps, 1, 0, "pointer authentication failure traps")
+LANGOPT(PointerAuthVTPtrAddressDiscrimination, 1, 0, "incorporate address discrimination in authenticated vtable pointers")
+LANGOPT(PointerAuthVTPtrTypeDiscrimination, 1, 0, "incorporate type discrimination in authenticated vtable pointers")
+LANGOPT(PointerAuthInitFini, 1, 0, "sign function pointers in init/fini arrays")
 
 LANGOPT(DoubleSquareBracketAttributes, 1, 0, "'[[]]' attributes extension for all language standard modes")
 LANGOPT(ExperimentalLateParseAttributes, 1, 0, "experimental late parsing of attributes")
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index e2a2aa71b880..75e88afbd970 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -968,10 +968,7 @@ public:
       setAllowFPContractAcrossStatement();
   }
 
-  void setDisallowOptimizations() {
-    setFPPreciseEnabled(true);
-    setDisallowFPContract();
-  }
+  void setDisallowOptimizations() { setFPPreciseEnabled(true); }
 
   storage_type getAsOpaqueInt() const {
     return (static_cast<storage_type>(Options.getAsOpaqueInt())
diff --git a/clang/include/clang/Basic/MSP430Target.def b/clang/include/clang/Basic/MSP430Target.def
index 7a10be1d54c8..8fd44e3ba8e7 100644
--- a/clang/include/clang/Basic/MSP430Target.def
+++ b/clang/include/clang/Basic/MSP430Target.def
@@ -8,6 +8,10 @@
 //
 // This file defines the MSP430 devices and their features.
 //
+// Generated from TI's devices.csv in version 1.212 using the script in
+// Target/MSP430/gen-msp430-def.py - use this tool rather than adding
+// new MCUs by hand.
+//
 //===----------------------------------------------------------------------===//
 
 #ifndef MSP430_MCU_FEAT
@@ -24,7 +28,19 @@ MSP430_MCU("msp430c112")
 MSP430_MCU("msp430c1121")
 MSP430_MCU("msp430c1331")
 MSP430_MCU("msp430c1351")
+MSP430_MCU("msp430c311s")
+MSP430_MCU("msp430c312")
+MSP430_MCU("msp430c313")
+MSP430_MCU("msp430c314")
+MSP430_MCU("msp430c315")
+MSP430_MCU("msp430c323")
+MSP430_MCU("msp430c325")
+MSP430_MCU("msp430c412")
+MSP430_MCU("msp430c413")
 MSP430_MCU("msp430e112")
+MSP430_MCU("msp430e313")
+MSP430_MCU("msp430e315")
+MSP430_MCU("msp430e325")
 MSP430_MCU("msp430f110")
 MSP430_MCU("msp430f1101")
 MSP430_MCU("msp430f1101a")
@@ -44,7 +60,6 @@ MSP430_MCU("msp430f135")
 MSP430_MCU("msp430f155")
 MSP430_MCU("msp430f156")
 MSP430_MCU("msp430f157")
-MSP430_MCU("msp430p112")
 MSP430_MCU("msp430f2001")
 MSP430_MCU("msp430f2011")
 MSP430_MCU("msp430f2002")
@@ -64,6 +79,58 @@ MSP430_MCU("msp430f2272")
 MSP430_MCU("msp430f2234")
 MSP430_MCU("msp430f2254")
 MSP430_MCU("msp430f2274")
+MSP430_MCU("msp430f412")
+MSP430_MCU("msp430f413")
+MSP430_MCU("msp430f415")
+MSP430_MCU("msp430f417")
+MSP430_MCU("msp430f4132")
+MSP430_MCU("msp430f4152")
+MSP430_MCU("msp430f435")
+MSP430_MCU("msp430f436")
+MSP430_MCU("msp430f437")
+MSP430_MCU("msp430f4351")
+MSP430_MCU("msp430f4361")
+MSP430_MCU("msp430f4371")
+MSP430_MCU("msp430fe423")
+MSP430_MCU("msp430fe425")
+MSP430_MCU("msp430fe427")
+MSP430_MCU("msp430fe423a")
+MSP430_MCU("msp430fe425a")
+MSP430_MCU("msp430fe427a")
+MSP430_MCU("msp430fe4232")
+MSP430_MCU("msp430fe4242")
+MSP430_MCU("msp430fe4252")
+MSP430_MCU("msp430fe4272")
+MSP430_MCU("msp430f4250")
+MSP430_MCU("msp430f4260")
+MSP430_MCU("msp430f4270")
+MSP430_MCU("msp430fg4250")
+MSP430_MCU("msp430fg4260")
+MSP430_MCU("msp430fg4270")
+MSP430_MCU("msp430fw423")
+MSP430_MCU("msp430fw425")
+MSP430_MCU("msp430fw427")
+MSP430_MCU("msp430fw428")
+MSP430_MCU("msp430fw429")
+MSP430_MCU("msp430fg437")
+MSP430_MCU("msp430fg438")
+MSP430_MCU("msp430fg439")
+MSP430_MCU("msp430f438")
+MSP430_MCU("msp430f439")
+MSP430_MCU("msp430f477")
+MSP430_MCU("msp430f478")
+MSP430_MCU("msp430f479")
+MSP430_MCU("msp430fg477")
+MSP430_MCU("msp430fg478")
+MSP430_MCU("msp430fg479")
+MSP430_MCU("msp430p112")
+MSP430_MCU("msp430p313")
+MSP430_MCU("msp430p315")
+MSP430_MCU("msp430p315s")
+MSP430_MCU("msp430p325")
+MSP430_MCU("msp430l092")
+MSP430_MCU("msp430c091")
+MSP430_MCU("msp430c092")
 MSP430_MCU("msp430g2211")
 MSP430_MCU("msp430g2201")
 MSP430_MCU("msp430g2111")
@@ -115,68 +182,32 @@ MSP430_MCU("msp430g2855")
 MSP430_MCU("msp430g2955")
 MSP430_MCU("msp430g2230")
 MSP430_MCU("msp430g2210")
-MSP430_MCU("msp430c311s")
-MSP430_MCU("msp430c312")
-MSP430_MCU("msp430c313")
-MSP430_MCU("msp430c314")
-MSP430_MCU("msp430c315")
-MSP430_MCU("msp430c323")
-MSP430_MCU("msp430c325")
-MSP430_MCU("msp430c412")
-MSP430_MCU("msp430c413")
-MSP430_MCU("msp430e313")
-MSP430_MCU("msp430e315")
-MSP430_MCU("msp430e325")
-MSP430_MCU("msp430p313")
-MSP430_MCU("msp430p315")
-MSP430_MCU("msp430p315s")
-MSP430_MCU("msp430p325")
-MSP430_MCU("msp430f412")
-MSP430_MCU("msp430f413")
-MSP430_MCU("msp430f415")
-MSP430_MCU("msp430f417")
-MSP430_MCU("msp430f4132")
-MSP430_MCU("msp430f4152")
-MSP430_MCU("msp430f435")
-MSP430_MCU("msp430f436")
-MSP430_MCU("msp430f437")
-MSP430_MCU("msp430f4351")
-MSP430_MCU("msp430f4361")
-MSP430_MCU("msp430f4371")
-MSP430_MCU("msp430fe423")
-MSP430_MCU("msp430fe425")
-MSP430_MCU("msp430fe427")
-MSP430_MCU("msp430fe423a")
-MSP430_MCU("msp430fe425a")
-MSP430_MCU("msp430fe427a")
-MSP430_MCU("msp430fe4232")
-MSP430_MCU("msp430fe4242")
-MSP430_MCU("msp430fe4252")
-MSP430_MCU("msp430fe4272")
-MSP430_MCU("msp430f4250")
-MSP430_MCU("msp430f4260")
-MSP430_MCU("msp430f4270")
-MSP430_MCU("msp430fg4250")
-MSP430_MCU("msp430fg4260")
-MSP430_MCU("msp430fg4270")
-MSP430_MCU("msp430fw423")
-MSP430_MCU("msp430fw425")
-MSP430_MCU("msp430fw427")
-MSP430_MCU("msp430fw428")
-MSP430_MCU("msp430fw429")
-MSP430_MCU("msp430fg437")
-MSP430_MCU("msp430fg438")
-MSP430_MCU("msp430fg439")
-MSP430_MCU("msp430f438")
-MSP430_MCU("msp430f439")
-MSP430_MCU("msp430f477")
-MSP430_MCU("msp430f478")
-MSP430_MCU("msp430f479")
-MSP430_MCU("msp430fg477")
-MSP430_MCU("msp430fg478")
-MSP430_MCU("msp430fg479")
+MSP430_MCU("rf430frl152h")
+MSP430_MCU("rf430frl153h")
+MSP430_MCU("rf430frl154h")
+MSP430_MCU("rf430frl152h_rom")
+MSP430_MCU("rf430frl153h_rom")
+MSP430_MCU("rf430frl154h_rom")
+MSP430_MCU("msp430fr4131")
+MSP430_MCU("msp430fr4132")
+MSP430_MCU("msp430fr4133")
+MSP430_MCU("msp430fr2032")
+MSP430_MCU("msp430fr2033")
+MSP430_MCU("msp430fr2110")
+MSP430_MCU("msp430fr2111")
+MSP430_MCU("msp430fr2310")
+MSP430_MCU("msp430fr2311")
+MSP430_MCU("msp430fr2100")
+MSP430_MCU("msp430fr2000")
 
 // With 16-bit hardware multiplier
+MSP430_MCU_FEAT("msp430c336", "16bit")
+MSP430_MCU_FEAT("msp430c337", "16bit")
+MSP430_MCU_FEAT("msp430cg4616", "16bit")
+MSP430_MCU_FEAT("msp430cg4617", "16bit")
+MSP430_MCU_FEAT("msp430cg4618", "16bit")
+MSP430_MCU_FEAT("msp430cg4619", "16bit")
+MSP430_MCU_FEAT("msp430e337", "16bit")
 MSP430_MCU_FEAT("msp430f147", "16bit")
 MSP430_MCU_FEAT("msp430f148", "16bit")
 MSP430_MCU_FEAT("msp430f149", "16bit")
@@ -189,21 +220,6 @@ MSP430_MCU_FEAT("msp430f169", "16bit")
 MSP430_MCU_FEAT("msp430f1610", "16bit")
 MSP430_MCU_FEAT("msp430f1611", "16bit")
 MSP430_MCU_FEAT("msp430f1612", "16bit")
-MSP430_MCU_FEAT("msp430c336", "16bit")
-MSP430_MCU_FEAT("msp430c337", "16bit")
-MSP430_MCU_FEAT("msp430e337", "16bit")
-MSP430_MCU_FEAT("msp430p337", "16bit")
-MSP430_MCU_FEAT("msp430f423", "16bit")
-MSP430_MCU_FEAT("msp430f425", "16bit")
-MSP430_MCU_FEAT("msp430f427", "16bit")
-MSP430_MCU_FEAT("msp430f423a", "16bit")
-MSP430_MCU_FEAT("msp430f425a", "16bit")
-MSP430_MCU_FEAT("msp430f427a", "16bit")
-MSP430_MCU_FEAT("msp430f4481", "16bit")
-MSP430_MCU_FEAT("msp430f4491", "16bit")
-MSP430_MCU_FEAT("msp430f447", "16bit")
-MSP430_MCU_FEAT("msp430f448", "16bit")
-MSP430_MCU_FEAT("msp430f449", "16bit")
 MSP430_MCU_FEAT("msp430f2330", "16bit")
 MSP430_MCU_FEAT("msp430f2350", "16bit")
 MSP430_MCU_FEAT("msp430f2370", "16bit")
@@ -216,12 +232,38 @@ MSP430_MCU_FEAT("msp430f2410", "16bit")
 MSP430_MCU_FEAT("msp430f2471", "16bit")
 MSP430_MCU_FEAT("msp430f2481", "16bit")
 MSP430_MCU_FEAT("msp430f2491", "16bit")
-MSP430_MCU_FEAT("msp430i2020", "16bit")
-MSP430_MCU_FEAT("msp430i2021", "16bit")
-MSP430_MCU_FEAT("msp430i2030", "16bit")
-MSP430_MCU_FEAT("msp430i2031", "16bit")
-MSP430_MCU_FEAT("msp430i2040", "16bit")
-MSP430_MCU_FEAT("msp430i2041", "16bit")
+MSP430_MCU_FEAT("msp430f2416", "16bit")
+MSP430_MCU_FEAT("msp430f2417", "16bit")
+MSP430_MCU_FEAT("msp430f2418", "16bit")
+MSP430_MCU_FEAT("msp430f2419", "16bit")
+MSP430_MCU_FEAT("msp430f2616", "16bit")
+MSP430_MCU_FEAT("msp430f2617", "16bit")
+MSP430_MCU_FEAT("msp430f2618", "16bit")
+MSP430_MCU_FEAT("msp430f2619", "16bit")
+MSP430_MCU_FEAT("msp430f423", "16bit")
+MSP430_MCU_FEAT("msp430f425", "16bit")
+MSP430_MCU_FEAT("msp430f427", "16bit")
+MSP430_MCU_FEAT("msp430f423a", "16bit")
+MSP430_MCU_FEAT("msp430f425a", "16bit")
+MSP430_MCU_FEAT("msp430f427a", "16bit")
+MSP430_MCU_FEAT("msp430f4481", "16bit")
+MSP430_MCU_FEAT("msp430f4491", "16bit")
+MSP430_MCU_FEAT("msp430f447", "16bit")
+MSP430_MCU_FEAT("msp430f448", "16bit")
+MSP430_MCU_FEAT("msp430f449", "16bit")
+MSP430_MCU_FEAT("msp430f46161", "16bit")
+MSP430_MCU_FEAT("msp430f46171", "16bit")
+MSP430_MCU_FEAT("msp430f46181", "16bit")
+MSP430_MCU_FEAT("msp430f46191", "16bit")
+MSP430_MCU_FEAT("msp430f4616", "16bit")
+MSP430_MCU_FEAT("msp430f4617", "16bit")
+MSP430_MCU_FEAT("msp430f4618", "16bit")
+MSP430_MCU_FEAT("msp430f4619", "16bit")
+MSP430_MCU_FEAT("msp430fg4616", "16bit")
+MSP430_MCU_FEAT("msp430fg4617", "16bit")
+MSP430_MCU_FEAT("msp430fg4618", "16bit")
+MSP430_MCU_FEAT("msp430fg4619", "16bit")
+MSP430_MCU_FEAT("msp430p337", "16bit")
 MSP430_MCU_FEAT("msp430afe221", "16bit")
 MSP430_MCU_FEAT("msp430afe231", "16bit")
 MSP430_MCU_FEAT("msp430afe251", "16bit")
@@ -231,12 +273,387 @@ MSP430_MCU_FEAT("msp430afe252", "16bit")
 MSP430_MCU_FEAT("msp430afe223", "16bit")
 MSP430_MCU_FEAT("msp430afe233", "16bit")
 MSP430_MCU_FEAT("msp430afe253", "16bit")
+MSP430_MCU_FEAT("msp430i2020", "16bit")
+MSP430_MCU_FEAT("msp430i2021", "16bit")
+MSP430_MCU_FEAT("msp430i2030", "16bit")
+MSP430_MCU_FEAT("msp430i2031", "16bit")
+MSP430_MCU_FEAT("msp430i2040", "16bit")
+MSP430_MCU_FEAT("msp430i2041", "16bit")
 
-// With 32 Bit Hardware Multiplier
+// With 32-bit hardware multiplier
 MSP430_MCU_FEAT("msp430f4783", "32bit")
 MSP430_MCU_FEAT("msp430f4793", "32bit")
 MSP430_MCU_FEAT("msp430f4784", "32bit")
 MSP430_MCU_FEAT("msp430f4794", "32bit")
+MSP430_MCU_FEAT("msp430f47126", "32bit")
+MSP430_MCU_FEAT("msp430f47127", "32bit")
+MSP430_MCU_FEAT("msp430f47163", "32bit")
+MSP430_MCU_FEAT("msp430f47173", "32bit")
+MSP430_MCU_FEAT("msp430f47183", "32bit")
+MSP430_MCU_FEAT("msp430f47193", "32bit")
+MSP430_MCU_FEAT("msp430f47166", "32bit")
+MSP430_MCU_FEAT("msp430f47176", "32bit")
+MSP430_MCU_FEAT("msp430f47186", "32bit")
+MSP430_MCU_FEAT("msp430f47196", "32bit")
+MSP430_MCU_FEAT("msp430f47167", "32bit")
+MSP430_MCU_FEAT("msp430f47177", "32bit")
+MSP430_MCU_FEAT("msp430f47187", "32bit")
+MSP430_MCU_FEAT("msp430f47197", "32bit")
+MSP430_MCU_FEAT("msp430f5418", "32bit")
+MSP430_MCU_FEAT("msp430f5419", "32bit")
+MSP430_MCU_FEAT("msp430f5435", "32bit")
+MSP430_MCU_FEAT("msp430f5436", "32bit")
+MSP430_MCU_FEAT("msp430f5437", "32bit")
+MSP430_MCU_FEAT("msp430f5438", "32bit")
+MSP430_MCU_FEAT("msp430f5418a", "32bit")
+MSP430_MCU_FEAT("msp430f5419a", "32bit")
+MSP430_MCU_FEAT("msp430f5435a", "32bit")
+MSP430_MCU_FEAT("msp430f5436a", "32bit")
+MSP430_MCU_FEAT("msp430f5437a", "32bit")
+MSP430_MCU_FEAT("msp430f5438a", "32bit")
+MSP430_MCU_FEAT("msp430f5212", "32bit")
+MSP430_MCU_FEAT("msp430f5213", "32bit")
+MSP430_MCU_FEAT("msp430f5214", "32bit")
+MSP430_MCU_FEAT("msp430f5217", "32bit")
+MSP430_MCU_FEAT("msp430f5218", "32bit")
+MSP430_MCU_FEAT("msp430f5219", "32bit")
+MSP430_MCU_FEAT("msp430f5222", "32bit")
+MSP430_MCU_FEAT("msp430f5223", "32bit")
+MSP430_MCU_FEAT("msp430f5224", "32bit")
+MSP430_MCU_FEAT("msp430f5227", "32bit")
+MSP430_MCU_FEAT("msp430f5228", "32bit")
+MSP430_MCU_FEAT("msp430f5229", "32bit")
+MSP430_MCU_FEAT("msp430f5232", "32bit")
+MSP430_MCU_FEAT("msp430f5234", "32bit")
+MSP430_MCU_FEAT("msp430f5237", "32bit")
+MSP430_MCU_FEAT("msp430f5239", "32bit")
+MSP430_MCU_FEAT("msp430f5242", "32bit")
+MSP430_MCU_FEAT("msp430f5244", "32bit")
+MSP430_MCU_FEAT("msp430f5247", "32bit")
+MSP430_MCU_FEAT("msp430f5249", "32bit")
+MSP430_MCU_FEAT("msp430f5304", "32bit")
+MSP430_MCU_FEAT("msp430f5308", "32bit")
+MSP430_MCU_FEAT("msp430f5309", "32bit")
+MSP430_MCU_FEAT("msp430f5310", "32bit")
+MSP430_MCU_FEAT("msp430f5340", "32bit")
+MSP430_MCU_FEAT("msp430f5341", "32bit")
+MSP430_MCU_FEAT("msp430f5342", "32bit")
+MSP430_MCU_FEAT("msp430f5324", "32bit")
+MSP430_MCU_FEAT("msp430f5325", "32bit")
+MSP430_MCU_FEAT("msp430f5326", "32bit")
+MSP430_MCU_FEAT("msp430f5327", "32bit")
+MSP430_MCU_FEAT("msp430f5328", "32bit")
+MSP430_MCU_FEAT("msp430f5329", "32bit")
+MSP430_MCU_FEAT("msp430f5500", "32bit")
+MSP430_MCU_FEAT("msp430f5501", "32bit")
+MSP430_MCU_FEAT("msp430f5502", "32bit")
+MSP430_MCU_FEAT("msp430f5503", "32bit")
+MSP430_MCU_FEAT("msp430f5504", "32bit")
+MSP430_MCU_FEAT("msp430f5505", "32bit")
+MSP430_MCU_FEAT("msp430f5506", "32bit")
+MSP430_MCU_FEAT("msp430f5507", "32bit")
+MSP430_MCU_FEAT("msp430f5508", "32bit")
+MSP430_MCU_FEAT("msp430f5509", "32bit")
+MSP430_MCU_FEAT("msp430f5510", "32bit")
+MSP430_MCU_FEAT("msp430f5513", "32bit")
+MSP430_MCU_FEAT("msp430f5514", "32bit")
+MSP430_MCU_FEAT("msp430f5515", "32bit")
+MSP430_MCU_FEAT("msp430f5517", "32bit")
+MSP430_MCU_FEAT("msp430f5519", "32bit")
+MSP430_MCU_FEAT("msp430f5521", "32bit")
+MSP430_MCU_FEAT("msp430f5522", "32bit")
+MSP430_MCU_FEAT("msp430f5524", "32bit")
+MSP430_MCU_FEAT("msp430f5525", "32bit")
+MSP430_MCU_FEAT("msp430f5526", "32bit")
+MSP430_MCU_FEAT("msp430f5527", "32bit")
+MSP430_MCU_FEAT("msp430f5528", "32bit")
+MSP430_MCU_FEAT("msp430f5529", "32bit")
+MSP430_MCU_FEAT("cc430f5133", "32bit")
+MSP430_MCU_FEAT("cc430f5135", "32bit")
+MSP430_MCU_FEAT("cc430f5137", "32bit")
+MSP430_MCU_FEAT("cc430f6125", "32bit")
+MSP430_MCU_FEAT("cc430f6126", "32bit")
+MSP430_MCU_FEAT("cc430f6127", "32bit")
+MSP430_MCU_FEAT("cc430f6135", "32bit")
+MSP430_MCU_FEAT("cc430f6137", "32bit")
+MSP430_MCU_FEAT("cc430f5123", "32bit")
+MSP430_MCU_FEAT("cc430f5125", "32bit")
+MSP430_MCU_FEAT("cc430f5143", "32bit")
+MSP430_MCU_FEAT("cc430f5145", "32bit")
+MSP430_MCU_FEAT("cc430f5147", "32bit")
+MSP430_MCU_FEAT("cc430f6143", "32bit")
+MSP430_MCU_FEAT("cc430f6145", "32bit")
+MSP430_MCU_FEAT("cc430f6147", "32bit")
+MSP430_MCU_FEAT("msp430f5333", "32bit")
+MSP430_MCU_FEAT("msp430f5335", "32bit")
+MSP430_MCU_FEAT("msp430f5336", "32bit")
+MSP430_MCU_FEAT("msp430f5338", "32bit")
+MSP430_MCU_FEAT("msp430f5630", "32bit")
+MSP430_MCU_FEAT("msp430f5631", "32bit")
+MSP430_MCU_FEAT("msp430f5632", "32bit")
+MSP430_MCU_FEAT("msp430f5633", "32bit")
+MSP430_MCU_FEAT("msp430f5634", "32bit")
+MSP430_MCU_FEAT("msp430f5635", "32bit")
+MSP430_MCU_FEAT("msp430f5636", "32bit")
+MSP430_MCU_FEAT("msp430f5637", "32bit")
+MSP430_MCU_FEAT("msp430f5638", "32bit")
+MSP430_MCU_FEAT("msp430f6433", "32bit")
+MSP430_MCU_FEAT("msp430f6435", "32bit")
+MSP430_MCU_FEAT("msp430f6436", "32bit")
+MSP430_MCU_FEAT("msp430f6438", "32bit")
+MSP430_MCU_FEAT("msp430f6630", "32bit")
+MSP430_MCU_FEAT("msp430f6631", "32bit")
+MSP430_MCU_FEAT("msp430f6632", "32bit")
+MSP430_MCU_FEAT("msp430f6633", "32bit")
+MSP430_MCU_FEAT("msp430f6634", "32bit")
+MSP430_MCU_FEAT("msp430f6635", "32bit")
+MSP430_MCU_FEAT("msp430f6636", "32bit")
+MSP430_MCU_FEAT("msp430f6637", "32bit")
+MSP430_MCU_FEAT("msp430f6638", "32bit")
+MSP430_MCU_FEAT("msp430f5358", "32bit")
+MSP430_MCU_FEAT("msp430f5359", "32bit")
+MSP430_MCU_FEAT("msp430f5658", "32bit")
+MSP430_MCU_FEAT("msp430f5659", "32bit")
+MSP430_MCU_FEAT("msp430f6458", "32bit")
+MSP430_MCU_FEAT("msp430f6459", "32bit")
+MSP430_MCU_FEAT("msp430f6658", "32bit")
+MSP430_MCU_FEAT("msp430f6659", "32bit")
+MSP430_MCU_FEAT("msp430fg6425", "32bit")
+MSP430_MCU_FEAT("msp430fg6426", "32bit")
+MSP430_MCU_FEAT("msp430fg6625", "32bit")
+MSP430_MCU_FEAT("msp430fg6626", "32bit")
+MSP430_MCU_FEAT("msp430f5131", "32bit")
+MSP430_MCU_FEAT("msp430f5151", "32bit")
+MSP430_MCU_FEAT("msp430f5171", "32bit")
+MSP430_MCU_FEAT("msp430f5132", "32bit")
+MSP430_MCU_FEAT("msp430f5152", "32bit")
+MSP430_MCU_FEAT("msp430f5172", "32bit")
+MSP430_MCU_FEAT("msp430f6720", "32bit")
+MSP430_MCU_FEAT("msp430f6721", "32bit")
+MSP430_MCU_FEAT("msp430f6723", "32bit")
+MSP430_MCU_FEAT("msp430f6724", "32bit")
+MSP430_MCU_FEAT("msp430f6725", "32bit")
+MSP430_MCU_FEAT("msp430f6726", "32bit")
+MSP430_MCU_FEAT("msp430f6730", "32bit")
+MSP430_MCU_FEAT("msp430f6731", "32bit")
+MSP430_MCU_FEAT("msp430f6733", "32bit")
+MSP430_MCU_FEAT("msp430f6734", "32bit")
+MSP430_MCU_FEAT("msp430f6735", "32bit")
+MSP430_MCU_FEAT("msp430f6736", "32bit")
+MSP430_MCU_FEAT("msp430f67621", "32bit")
+MSP430_MCU_FEAT("msp430f67641", "32bit")
+MSP430_MCU_FEAT("msp430f6720a", "32bit")
+MSP430_MCU_FEAT("msp430f6721a", "32bit")
+MSP430_MCU_FEAT("msp430f6723a", "32bit")
+MSP430_MCU_FEAT("msp430f6724a", "32bit")
+MSP430_MCU_FEAT("msp430f6725a", "32bit")
+MSP430_MCU_FEAT("msp430f6726a", "32bit")
+MSP430_MCU_FEAT("msp430f6730a", "32bit")
+MSP430_MCU_FEAT("msp430f6731a", "32bit")
+MSP430_MCU_FEAT("msp430f6733a", "32bit")
+MSP430_MCU_FEAT("msp430f6734a", "32bit")
+MSP430_MCU_FEAT("msp430f6735a", "32bit")
+MSP430_MCU_FEAT("msp430f6736a", "32bit")
+MSP430_MCU_FEAT("msp430f67621a", "32bit")
+MSP430_MCU_FEAT("msp430f67641a", "32bit")
+MSP430_MCU_FEAT("msp430f67451", "32bit")
+MSP430_MCU_FEAT("msp430f67651", "32bit")
+MSP430_MCU_FEAT("msp430f67751", "32bit")
+MSP430_MCU_FEAT("msp430f67461", "32bit")
+MSP430_MCU_FEAT("msp430f67661", "32bit")
+MSP430_MCU_FEAT("msp430f67761", "32bit")
+MSP430_MCU_FEAT("msp430f67471", "32bit")
+MSP430_MCU_FEAT("msp430f67671", "32bit")
+MSP430_MCU_FEAT("msp430f67771", "32bit")
+MSP430_MCU_FEAT("msp430f67481", "32bit")
+MSP430_MCU_FEAT("msp430f67681", "32bit")
+MSP430_MCU_FEAT("msp430f67781", "32bit")
+MSP430_MCU_FEAT("msp430f67491", "32bit")
+MSP430_MCU_FEAT("msp430f67691", "32bit")
+MSP430_MCU_FEAT("msp430f67791", "32bit")
+MSP430_MCU_FEAT("msp430f6745", "32bit")
+MSP430_MCU_FEAT("msp430f6765", "32bit")
+MSP430_MCU_FEAT("msp430f6775", "32bit")
+MSP430_MCU_FEAT("msp430f6746", "32bit")
+MSP430_MCU_FEAT("msp430f6766", "32bit")
+MSP430_MCU_FEAT("msp430f6776", "32bit")
+MSP430_MCU_FEAT("msp430f6747", "32bit")
+MSP430_MCU_FEAT("msp430f6767", "32bit")
+MSP430_MCU_FEAT("msp430f6777", "32bit")
+MSP430_MCU_FEAT("msp430f6748", "32bit")
+MSP430_MCU_FEAT("msp430f6768", "32bit")
+MSP430_MCU_FEAT("msp430f6778", "32bit")
+MSP430_MCU_FEAT("msp430f6749", "32bit")
+MSP430_MCU_FEAT("msp430f6769", "32bit")
+MSP430_MCU_FEAT("msp430f6779", "32bit")
+MSP430_MCU_FEAT("msp430f67451a", "32bit")
+MSP430_MCU_FEAT("msp430f67651a", "32bit")
+MSP430_MCU_FEAT("msp430f67751a", "32bit")
+MSP430_MCU_FEAT("msp430f67461a", "32bit")
+MSP430_MCU_FEAT("msp430f67661a", "32bit")
+MSP430_MCU_FEAT("msp430f67761a", "32bit")
+MSP430_MCU_FEAT("msp430f67471a", "32bit")
+MSP430_MCU_FEAT("msp430f67671a", "32bit")
+MSP430_MCU_FEAT("msp430f67771a", "32bit")
+MSP430_MCU_FEAT("msp430f67481a", "32bit")
+MSP430_MCU_FEAT("msp430f67681a", "32bit")
+MSP430_MCU_FEAT("msp430f67781a", "32bit")
+MSP430_MCU_FEAT("msp430f67491a", "32bit")
+MSP430_MCU_FEAT("msp430f67691a", "32bit")
+MSP430_MCU_FEAT("msp430f67791a", "32bit")
+MSP430_MCU_FEAT("msp430f6745a", "32bit")
+MSP430_MCU_FEAT("msp430f6765a", "32bit")
+MSP430_MCU_FEAT("msp430f6775a", "32bit")
+MSP430_MCU_FEAT("msp430f6746a", "32bit")
+MSP430_MCU_FEAT("msp430f6766a", "32bit")
+MSP430_MCU_FEAT("msp430f6776a", "32bit")
+MSP430_MCU_FEAT("msp430f6747a", "32bit")
+MSP430_MCU_FEAT("msp430f6767a", "32bit")
+MSP430_MCU_FEAT("msp430f6777a", "32bit")
+MSP430_MCU_FEAT("msp430f6748a", "32bit")
+MSP430_MCU_FEAT("msp430f6768a", "32bit")
+MSP430_MCU_FEAT("msp430f6778a", "32bit")
+MSP430_MCU_FEAT("msp430f6749a", "32bit")
+MSP430_MCU_FEAT("msp430f6769a", "32bit")
+MSP430_MCU_FEAT("msp430f6779a", "32bit")
+MSP430_MCU_FEAT("msp430fr5720", "32bit")
+MSP430_MCU_FEAT("msp430fr5721", "32bit")
+MSP430_MCU_FEAT("msp430fr5722", "32bit")
+MSP430_MCU_FEAT("msp430fr5723", "32bit")
+MSP430_MCU_FEAT("msp430fr5724", "32bit")
+MSP430_MCU_FEAT("msp430fr5725", "32bit")
+MSP430_MCU_FEAT("msp430fr5726", "32bit")
+MSP430_MCU_FEAT("msp430fr5727", "32bit")
+MSP430_MCU_FEAT("msp430fr5728", "32bit")
+MSP430_MCU_FEAT("msp430fr5729", "32bit")
+MSP430_MCU_FEAT("msp430fr5730", "32bit")
+MSP430_MCU_FEAT("msp430fr5731", "32bit")
+MSP430_MCU_FEAT("msp430fr5732", "32bit")
+MSP430_MCU_FEAT("msp430fr5733", "32bit")
+MSP430_MCU_FEAT("msp430fr5734", "32bit")
+MSP430_MCU_FEAT("msp430fr5735", "32bit")
+MSP430_MCU_FEAT("msp430fr5736", "32bit")
+MSP430_MCU_FEAT("msp430fr5737", "32bit")
+MSP430_MCU_FEAT("msp430fr5738", "32bit")
+MSP430_MCU_FEAT("msp430fr5739", "32bit")
+MSP430_MCU_FEAT("msp430bt5190", "32bit")
+MSP430_MCU_FEAT("msp430fr5857", "32bit")
+MSP430_MCU_FEAT("msp430fr5858", "32bit")
+MSP430_MCU_FEAT("msp430fr5859", "32bit")
+MSP430_MCU_FEAT("msp430fr5847", "32bit")
+MSP430_MCU_FEAT("msp430fr58471", "32bit")
+MSP430_MCU_FEAT("msp430fr5848", "32bit")
+MSP430_MCU_FEAT("msp430fr5849", "32bit")
+MSP430_MCU_FEAT("msp430fr5867", "32bit")
+MSP430_MCU_FEAT("msp430fr58671", "32bit")
+MSP430_MCU_FEAT("msp430fr5868", "32bit")
+MSP430_MCU_FEAT("msp430fr5869", "32bit")
+MSP430_MCU_FEAT("msp430fr5957", "32bit")
+MSP430_MCU_FEAT("msp430fr5958", "32bit")
+MSP430_MCU_FEAT("msp430fr5959", "32bit")
+MSP430_MCU_FEAT("msp430fr5947", "32bit")
+MSP430_MCU_FEAT("msp430fr59471", "32bit")
+MSP430_MCU_FEAT("msp430fr5948", "32bit")
+MSP430_MCU_FEAT("msp430fr5949", "32bit")
+MSP430_MCU_FEAT("msp430fr5967", "32bit")
+MSP430_MCU_FEAT("msp430fr5968", "32bit")
+MSP430_MCU_FEAT("msp430fr5969", "32bit")
+MSP430_MCU_FEAT("msp430fr59691", "32bit")
+MSP430_MCU_FEAT("rf430f5175", "32bit")
+MSP430_MCU_FEAT("rf430f5155", "32bit")
+MSP430_MCU_FEAT("rf430f5144", "32bit")
+MSP430_MCU_FEAT("msp430fr69271", "32bit")
+MSP430_MCU_FEAT("msp430fr68791", "32bit")
+MSP430_MCU_FEAT("msp430fr69791", "32bit")
+MSP430_MCU_FEAT("msp430fr6927", "32bit")
+MSP430_MCU_FEAT("msp430fr6928", "32bit")
+MSP430_MCU_FEAT("msp430fr6877", "32bit")
+MSP430_MCU_FEAT("msp430fr6977", "32bit")
+MSP430_MCU_FEAT("msp430fr6879", "32bit")
+MSP430_MCU_FEAT("msp430fr6979", "32bit")
+MSP430_MCU_FEAT("msp430fr58891", "32bit")
+MSP430_MCU_FEAT("msp430fr68891", "32bit")
+MSP430_MCU_FEAT("msp430fr59891", "32bit")
+MSP430_MCU_FEAT("msp430fr69891", "32bit")
+MSP430_MCU_FEAT("msp430fr5887", "32bit")
+MSP430_MCU_FEAT("msp430fr5888", "32bit")
+MSP430_MCU_FEAT("msp430fr5889", "32bit")
+MSP430_MCU_FEAT("msp430fr6887", "32bit")
+MSP430_MCU_FEAT("msp430fr6888", "32bit")
+MSP430_MCU_FEAT("msp430fr6889", "32bit")
+MSP430_MCU_FEAT("msp430fr5986", "32bit")
+MSP430_MCU_FEAT("msp430fr5987", "32bit")
+MSP430_MCU_FEAT("msp430fr5988", "32bit")
+MSP430_MCU_FEAT("msp430fr5989", "32bit")
+MSP430_MCU_FEAT("msp430fr6987", "32bit")
+MSP430_MCU_FEAT("msp430fr6988", "32bit")
+MSP430_MCU_FEAT("msp430fr6989", "32bit")
+MSP430_MCU_FEAT("msp430fr5922", "32bit")
+MSP430_MCU_FEAT("msp430fr5870", "32bit")
+MSP430_MCU_FEAT("msp430fr5970", "32bit")
+MSP430_MCU_FEAT("msp430fr5872", "32bit")
+MSP430_MCU_FEAT("msp430fr5972", "32bit")
+MSP430_MCU_FEAT("msp430fr6820", "32bit")
+MSP430_MCU_FEAT("msp430fr6920", "32bit")
+MSP430_MCU_FEAT("msp430fr6822", "32bit")
+MSP430_MCU_FEAT("msp430fr6922", "32bit")
+MSP430_MCU_FEAT("msp430fr6870", "32bit")
+MSP430_MCU_FEAT("msp430fr6970", "32bit")
+MSP430_MCU_FEAT("msp430fr6872", "32bit")
+MSP430_MCU_FEAT("msp430fr6972", "32bit")
+MSP430_MCU_FEAT("msp430fr59221", "32bit")
+MSP430_MCU_FEAT("msp430fr58721", "32bit")
+MSP430_MCU_FEAT("msp430fr59721", "32bit")
+MSP430_MCU_FEAT("msp430fr68221", "32bit")
+MSP430_MCU_FEAT("msp430fr69221", "32bit")
+MSP430_MCU_FEAT("msp430fr68721", "32bit")
+MSP430_MCU_FEAT("msp430fr69721", "32bit")
+MSP430_MCU_FEAT("msp430sl5438a", "32bit")
+MSP430_MCU_FEAT("msp430fr2433", "32bit")
+MSP430_MCU_FEAT("msp430fr2532", "32bit")
+MSP430_MCU_FEAT("msp430fr2533", "32bit")
+MSP430_MCU_FEAT("msp430fr2632", "32bit")
+MSP430_MCU_FEAT("msp430fr2633", "32bit")
+MSP430_MCU_FEAT("msp430f5252", "32bit")
+MSP430_MCU_FEAT("msp430f5253", "32bit")
+MSP430_MCU_FEAT("msp430f5254", "32bit")
+MSP430_MCU_FEAT("msp430f5255", "32bit")
+MSP430_MCU_FEAT("msp430f5256", "32bit")
+MSP430_MCU_FEAT("msp430f5257", "32bit")
+MSP430_MCU_FEAT("msp430f5258", "32bit")
+MSP430_MCU_FEAT("msp430f5259", "32bit")
+MSP430_MCU_FEAT("msp430fr5962", "32bit")
+MSP430_MCU_FEAT("msp430fr5964", "32bit")
+MSP430_MCU_FEAT("msp430fr5992", "32bit")
+MSP430_MCU_FEAT("msp430fr5994", "32bit")
+MSP430_MCU_FEAT("msp430fr59941", "32bit")
+MSP430_MCU_FEAT("msp430fr2355", "32bit")
+MSP430_MCU_FEAT("msp430fr2155", "32bit")
+MSP430_MCU_FEAT("msp430fr2353", "32bit")
+MSP430_MCU_FEAT("msp430fr2153", "32bit")
+MSP430_MCU_FEAT("msp430fr2522", "32bit")
+MSP430_MCU_FEAT("msp430fr2512", "32bit")
+MSP430_MCU_FEAT("msp430fr2422", "32bit")
+MSP430_MCU_FEAT("msp430fr2676", "32bit")
+MSP430_MCU_FEAT("msp430fr2476", "32bit")
+MSP430_MCU_FEAT("msp430fr2675", "32bit")
+MSP430_MCU_FEAT("msp430fr2673", "32bit")
+MSP430_MCU_FEAT("msp430fr2475", "32bit")
+MSP430_MCU_FEAT("msp430fr2672", "32bit")
+MSP430_MCU_FEAT("msp430fr6043", "32bit")
+MSP430_MCU_FEAT("msp430fr5043", "32bit")
+MSP430_MCU_FEAT("msp430fr6041", "32bit")
+MSP430_MCU_FEAT("msp430fr60431", "32bit")
+MSP430_MCU_FEAT("msp430fr5041", "32bit")
+MSP430_MCU_FEAT("msp430fr50431", "32bit")
+MSP430_MCU_FEAT("msp430fr6005", "32bit")
+MSP430_MCU_FEAT("msp430fr6047", "32bit")
+MSP430_MCU_FEAT("msp430fr6037", "32bit")
+MSP430_MCU_FEAT("msp430fr6045", "32bit")
+MSP430_MCU_FEAT("msp430fr60471", "32bit")
+MSP430_MCU_FEAT("msp430fr6035", "32bit")
+MSP430_MCU_FEAT("msp430fr6007", "32bit")
+MSP430_MCU_FEAT("msp430fr60371", "32bit")
 
 // Generic MCUs
 MSP430_MCU("msp430i2xxgeneric")
diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h
index 9f62c058ca0d..2d62d05cd919 100644
--- a/clang/include/clang/Basic/Module.h
+++ b/clang/include/clang/Basic/Module.h
@@ -284,9 +284,10 @@ public:
   /// found on the file system.
   SmallVector<UnresolvedHeaderDirective, 1> MissingHeaders;
 
-  /// An individual requirement: a feature name and a flag indicating
-  /// the required state of that feature.
-  using Requirement = std::pair<std::string, bool>;
+  struct Requirement {
+    std::string FeatureName;
+    bool RequiredState;
+  };
 
   /// The set of language features required to use this module.
   ///
diff --git a/clang/include/clang/Basic/OpenACCClauses.def b/clang/include/clang/Basic/OpenACCClauses.def
index 6c3c2db66ef0..afb7b30b7465 100644
--- a/clang/include/clang/Basic/OpenACCClauses.def
+++ b/clang/include/clang/Basic/OpenACCClauses.def
@@ -14,13 +14,39 @@
 // as used in Clang source (so `Default` instead of `default`).
 //
 // VISIT_CLAUSE(CLAUSE_NAME)
+//
+// CLAUSE_ALIAS(ALIAS_NAME, CLAUSE_NAME)
+
+#ifndef CLAUSE_ALIAS
+#define CLAUSE_ALIAS(ALIAS_NAME, CLAUSE_NAME)
+#endif
 
+VISIT_CLAUSE(Async)
+VISIT_CLAUSE(Attach)
+VISIT_CLAUSE(Copy)
+CLAUSE_ALIAS(PCopy, Copy)
+CLAUSE_ALIAS(PresentOrCopy, Copy)
+VISIT_CLAUSE(CopyIn)
+CLAUSE_ALIAS(PCopyIn, CopyIn)
+CLAUSE_ALIAS(PresentOrCopyIn, CopyIn)
+VISIT_CLAUSE(CopyOut)
+CLAUSE_ALIAS(PCopyOut, CopyOut)
+CLAUSE_ALIAS(PresentOrCopyOut, CopyOut)
+VISIT_CLAUSE(Create)
+CLAUSE_ALIAS(PCreate, Create)
+CLAUSE_ALIAS(PresentOrCreate, Create)
 VISIT_CLAUSE(Default)
+VISIT_CLAUSE(DevicePtr)
+VISIT_CLAUSE(FirstPrivate)
 VISIT_CLAUSE(If)
-VISIT_CLAUSE(Self)
+VISIT_CLAUSE(NoCreate)
 VISIT_CLAUSE(NumGangs)
 VISIT_CLAUSE(NumWorkers)
+VISIT_CLAUSE(Present)
 VISIT_CLAUSE(Private)
+VISIT_CLAUSE(Self)
 VISIT_CLAUSE(VectorLength)
+VISIT_CLAUSE(Wait)
 
 #undef VISIT_CLAUSE
+#undef CLAUSE_ALIAS
diff --git a/clang/include/clang/Basic/OpenACCKinds.h b/clang/include/clang/Basic/OpenACCKinds.h
index e3f741784332..0e38a04e7164 100644
--- a/clang/include/clang/Basic/OpenACCKinds.h
+++ b/clang/include/clang/Basic/OpenACCKinds.h
@@ -189,6 +189,10 @@ enum class OpenACCClauseKind {
   /// 'copy' clause, allowed on Compute and Combined Constructs, plus 'data' and
   /// 'declare'.
   Copy,
+  /// 'copy' clause alias 'pcopy'.  Preserved for diagnostic purposes.
+  PCopy,
+  /// 'copy' clause alias 'present_or_copy'.  Preserved for diagnostic purposes.
+  PresentOrCopy,
   /// 'use_device' clause, allowed on 'host_data' construct.
   UseDevice,
   /// 'attach' clause, allowed on Compute and Combined constructs, plus 'data'
@@ -224,12 +228,27 @@ enum class OpenACCClauseKind {
   /// 'copyout' clause, allowed on Compute and Combined constructs, plus 'data',
   /// 'exit data', and 'declare'.
   CopyOut,
+  /// 'copyout' clause alias 'pcopyout'.  Preserved for diagnostic purposes.
+  PCopyOut,
+  /// 'copyout' clause alias 'present_or_copyout'.  Preserved for diagnostic
+  /// purposes.
+  PresentOrCopyOut,
   /// 'copyin' clause, allowed on Compute and Combined constructs, plus 'data',
   /// 'enter data', and 'declare'.
   CopyIn,
-  /// 'copyin' clause, allowed on Compute and Combined constructs, plus 'data',
+  /// 'copyin' clause alias 'pcopyin'.  Preserved for diagnostic purposes.
+  PCopyIn,
+  /// 'copyin' clause alias 'present_or_copyin'.  Preserved for diagnostic
+  /// purposes.
+  PresentOrCopyIn,
+  /// 'create' clause, allowed on Compute and Combined constructs, plus 'data',
   /// 'enter data', and 'declare'.
   Create,
+  /// 'create' clause alias 'pcreate'.  Preserved for diagnostic purposes.
+  PCreate,
+  /// 'create' clause alias 'present_or_create'.  Preserved for diagnostic
+  /// purposes.
+  PresentOrCreate,
   /// 'reduction' clause, allowed on Parallel, Serial, Loop, and the combined
   /// constructs.
   Reduction,
@@ -310,6 +329,12 @@ inline StreamTy &printOpenACCClauseKind(StreamTy &Out, OpenACCClauseKind K) {
   case OpenACCClauseKind::Copy:
     return Out << "copy";
 
+  case OpenACCClauseKind::PCopy:
+    return Out << "pcopy";
+
+  case OpenACCClauseKind::PresentOrCopy:
+    return Out << "present_or_copy";
+
   case OpenACCClauseKind::UseDevice:
     return Out << "use_device";
 
@@ -352,12 +377,30 @@ inline StreamTy &printOpenACCClauseKind(StreamTy &Out, OpenACCClauseKind K) {
   case OpenACCClauseKind::CopyOut:
     return Out << "copyout";
 
+  case OpenACCClauseKind::PCopyOut:
+    return Out << "pcopyout";
+
+  case OpenACCClauseKind::PresentOrCopyOut:
+    return Out << "present_or_copyout";
+
   case OpenACCClauseKind::CopyIn:
     return Out << "copyin";
 
+  case OpenACCClauseKind::PCopyIn:
+    return Out << "pcopyin";
+
+  case OpenACCClauseKind::PresentOrCopyIn:
+    return Out << "present_or_copyin";
+
   case OpenACCClauseKind::Create:
     return Out << "create";
 
+  case OpenACCClauseKind::PCreate:
+    return Out << "pcreate";
+
+  case OpenACCClauseKind::PresentOrCreate:
+    return Out << "present_or_create";
+
   case OpenACCClauseKind::Reduction:
     return Out << "reduction";
 
diff --git a/clang/include/clang/Basic/SourceLocation.h b/clang/include/clang/Basic/SourceLocation.h
index 00b1e0fa855b..7a0f5ba8d127 100644
--- a/clang/include/clang/Basic/SourceLocation.h
+++ b/clang/include/clang/Basic/SourceLocation.h
@@ -90,6 +90,7 @@ class SourceLocation {
   friend class ASTWriter;
   friend class SourceManager;
   friend struct llvm::FoldingSetTrait<SourceLocation, void>;
+  friend class SourceLocationEncoding;
 
 public:
   using UIntTy = uint32_t;
diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h
index d2ece14da0b1..5258bab584f4 100644
--- a/clang/include/clang/Basic/SourceManager.h
+++ b/clang/include/clang/Basic/SourceManager.h
@@ -1504,7 +1504,7 @@ public:
     if (Presumed.isInvalid())
       return false;
     StringRef Filename(Presumed.getFilename());
-    return Filename.equals("<built-in>");
+    return Filename == "<built-in>";
   }
 
   /// Returns whether \p Loc is located in a <command line> file.
@@ -1513,7 +1513,7 @@ public:
     if (Presumed.isInvalid())
       return false;
     StringRef Filename(Presumed.getFilename());
-    return Filename.equals("<command line>");
+    return Filename == "<command line>";
   }
 
   /// Returns whether \p Loc is located in a <scratch space> file.
@@ -1522,7 +1522,7 @@ public:
     if (Presumed.isInvalid())
       return false;
     StringRef Filename(Presumed.getFilename());
-    return Filename.equals("<scratch space>");
+    return Filename == "<scratch space>";
   }
 
   /// Returns if a SourceLocation is in a system header.
diff --git a/clang/include/clang/Basic/Target/MSP430/gen-msp430-def.py b/clang/include/clang/Basic/Target/MSP430/gen-msp430-def.py
new file mode 100755
index 000000000000..3ae6fdd9d5c6
--- /dev/null
+++ b/clang/include/clang/Basic/Target/MSP430/gen-msp430-def.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+# ===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===----------------------------------------------------------------------===##
+"""
+Script to generate MSP430 definitions from TI's devices.csv
+
+Download the devices.csv from [1] using the link "Header and Support Files".
+
+[1]: https://www.ti.com/tool/MSP430-GCC-OPENSOURCE#downloads
+"""
+import csv
+import sys
+
+DEVICE_COLUMN = 0
+MULTIPLIER_COLUMN = 3
+
+MULTIPLIER_SW = "0"
+MULTIPLIER_HW_16 = ("1", "2")
+MULTIPLIER_HW_32 = ("4", "8")
+
+PREFIX = """//===--- MSP430Target.def - MSP430 Feature/Processor Database----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MSP430 devices and their features.
+//
+// Generated from TI's devices.csv in version {} using the script in
+// Target/MSP430/gen-msp430-def.py - use this tool rather than adding
+// new MCUs by hand.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430_MCU_FEAT
+#define MSP430_MCU_FEAT(NAME, HWMULT) MSP430_MCU(NAME)
+#endif
+
+#ifndef MSP430_MCU
+#define MSP430_MCU(NAME)
+#endif
+
+"""
+
+SUFFIX = """
+// Generic MCUs
+MSP430_MCU("msp430i2xxgeneric")
+
+#undef MSP430_MCU
+#undef MSP430_MCU_FEAT
+"""
+
+
+def csv2def(csv_path, def_path):
+    """
+    Parse the devices.csv file at the given path, generate the definitions and
+    write them to the given path.
+
+    :param csv_path: Path to the devices.csv to parse
+    :type csv_path: str
+    :param def_path: Path to the output file to write the definitions to
+    "type def_path: str
+    """
+
+    mcus_multiplier_sw = []
+    mcus_multiplier_hw_16 = []
+    mcus_multiplier_hw_32 = []
+    version = "unknown"
+
+    with open(csv_path) as csv_file:
+        csv_reader = csv.reader(csv_file)
+        while True:
+            row = next(csv_reader)
+            if len(row) < MULTIPLIER_COLUMN:
+                continue
+
+            if row[DEVICE_COLUMN] == "# Device Name":
+                assert row[MULTIPLIER_COLUMN] == "MPY_TYPE", "File format changed"
+                break
+
+            if row[0] == "Version:":
+                version = row[1]
+
+        for row in csv_reader:
+            if row[DEVICE_COLUMN].endswith("generic"):
+                continue
+            if row[MULTIPLIER_COLUMN] == MULTIPLIER_SW:
+                mcus_multiplier_sw.append(row[DEVICE_COLUMN])
+            elif row[MULTIPLIER_COLUMN] in MULTIPLIER_HW_16:
+                mcus_multiplier_hw_16.append(row[DEVICE_COLUMN])
+            elif row[MULTIPLIER_COLUMN] in MULTIPLIER_HW_32:
+                mcus_multiplier_hw_32.append(row[DEVICE_COLUMN])
+            else:
+                assert 0, "Unknown multiplier type"
+
+    with open(def_path, "w") as def_file:
+        def_file.write(PREFIX.format(version))
+
+        for mcu in mcus_multiplier_sw:
+            def_file.write(f'MSP430_MCU("{mcu}")\n')
+
+        def_file.write("\n// With 16-bit hardware multiplier\n")
+
+        for mcu in mcus_multiplier_hw_16:
+            def_file.write(f'MSP430_MCU_FEAT("{mcu}", "16bit")\n')
+
+        def_file.write("\n// With 32-bit hardware multiplier\n")
+
+        for mcu in mcus_multiplier_hw_32:
+            def_file.write(f'MSP430_MCU_FEAT("{mcu}", "32bit")\n')
+
+        def_file.write(SUFFIX)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        sys.exit(f"Usage: {sys.argv[0]} <CSV_FILE> <DEF_FILE>")
+
+    csv2def(sys.argv[1], sys.argv[2])
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 3ced2e7397a7..8a6511b9ced8 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -97,6 +97,10 @@ struct TransferrableTargetInfo {
   unsigned char LongLongWidth, LongLongAlign;
   unsigned char Int128Align;
 
+  // This is an optional parameter for targets that
+  // don't use 'LongLongAlign' for '_BitInt' max alignment
+  std::optional<unsigned> BitIntMaxAlign;
+
   // Fixed point bit widths
   unsigned char ShortAccumWidth, ShortAccumAlign;
   unsigned char AccumWidth, AccumAlign;
@@ -518,6 +522,22 @@ public:
   /// getInt128Align() - Returns the alignment of Int128.
   unsigned getInt128Align() const { return Int128Align; }
 
+  /// getBitIntMaxAlign() - Returns the maximum possible alignment of
+  /// '_BitInt' and 'unsigned _BitInt'.
+  unsigned getBitIntMaxAlign() const {
+    return BitIntMaxAlign.value_or(LongLongAlign);
+  }
+
+  /// getBitIntAlign/Width - Return aligned size of '_BitInt' and
+  /// 'unsigned _BitInt' for this target, in bits.
+  unsigned getBitIntWidth(unsigned NumBits) const {
+    return llvm::alignTo(NumBits, getBitIntAlign(NumBits));
+  }
+  unsigned getBitIntAlign(unsigned NumBits) const {
+    return std::clamp<unsigned>(llvm::PowerOf2Ceil(NumBits), getCharWidth(),
+                                getBitIntMaxAlign());
+  }
+
   /// getShortAccumWidth/Align - Return the size of 'signed short _Accum' and
   /// 'unsigned short _Accum' for this target, in bits.
   unsigned getShortAccumWidth() const { return ShortAccumWidth; }
diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index a27fbed358a6..56c4b17f769d 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -537,6 +537,7 @@ TYPE_TRAIT_1(__is_referenceable, IsReferenceable, KEYCXX)
 TYPE_TRAIT_1(__can_pass_in_regs, CanPassInRegs, KEYCXX)
 TYPE_TRAIT_2(__reference_binds_to_temporary, ReferenceBindsToTemporary, KEYCXX)
 TYPE_TRAIT_2(__reference_constructs_from_temporary, ReferenceConstructsFromTemporary, KEYCXX)
+TYPE_TRAIT_2(__reference_converts_from_temporary, ReferenceConvertsFromTemporary, KEYCXX)
 
 // Embarcadero Expression Traits
 EXPRESSION_TRAIT(__is_lvalue_expr, IsLValueExpr, KEYCXX)
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 1ac6d5170ea2..7808ee559932 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -458,6 +458,40 @@ let TargetGuard = "sme2,sme-f64f64" in {
   def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
 }
 
+let TargetGuard = "sme-f16f16" in {
+  def SVMLA_MULTI_VG1x2_F16 : Inst<"svmla_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_MULTI_VG1x4_F16 : Inst<"svmla_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x2_F16 : Inst<"svmls_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x4_F16 : Inst<"svmls_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_SINGLE_VG1x2_F16 : Inst<"svmla[_single]_za16[_f16]_vg1x2", "vm2d", "h", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_SINGLE_VG1x4_F16 : Inst<"svmla[_single]_za16[_f16]_vg1x4", "vm4d", "h", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x2_F16 : Inst<"svmls[_single]_za16[_f16]_vg1x2", "vm2d", "h", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x4_F16 : Inst<"svmls[_single]_za16[_f16]_vg1x4", "vm4d", "h", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_LANE_VG1x2_F16 : Inst<"svmla_lane_za16[_f16]_vg1x2", "vm2di", "h", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLA_LANE_VG1x4_F16 : Inst<"svmla_lane_za16[_f16]_vg1x4", "vm4di", "h", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x2_F16 : Inst<"svmls_lane_za16[_f16]_vg1x2", "vm2di", "h", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x4_F16 : Inst<"svmls_lane_za16[_f16]_vg1x4", "vm4di", "h", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
+let TargetGuard = "sme2,b16b16" in {
+  def SVMLA_MULTI_VG1x2_BF16 : Inst<"svmla_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_MULTI_VG1x4_BF16 : Inst<"svmla_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x2_BF16 : Inst<"svmls_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x4_BF16 : Inst<"svmls_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_SINGLE_VG1x2_BF16 : Inst<"svmla[_single]_za16[_bf16]_vg1x2", "vm2d", "b", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_SINGLE_VG1x4_BF16 : Inst<"svmla[_single]_za16[_bf16]_vg1x4", "vm4d", "b", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x2_BF16 : Inst<"svmls[_single]_za16[_bf16]_vg1x2", "vm2d", "b", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x4_BF16 : Inst<"svmls[_single]_za16[_bf16]_vg1x4", "vm4d", "b", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_LANE_VG1x2_BF16 : Inst<"svmla_lane_za16[_bf16]_vg1x2", "vm2di", "b", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLA_LANE_VG1x4_BF16 : Inst<"svmla_lane_za16[_bf16]_vg1x4", "vm4di", "b", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x2_BF16 : Inst<"svmls_lane_za16[_bf16]_vg1x2", "vm2di", "b", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x4_BF16 : Inst<"svmls_lane_za16[_bf16]_vg1x4", "vm4di", "b", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
 // FMLAL/FMLSL/UMLAL/SMLAL
 // SMLALL/UMLALL/USMLALL/SUMLALL
 let TargetGuard = "sme2" in {
@@ -674,3 +708,27 @@ let TargetGuard = "sme2" in {
   def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>;
   def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
 }
+
+////////////////////////////////////////////////////////////////////////////////
+// SME2p1 - FMOPA, FMOPS (non-widening)
+let TargetGuard = "sme2,b16b16" in {
+  def SVMOPA_BF16_NW : SInst<"svmopa_za16[_bf16]_m", "viPPdd", "b",
+                             MergeNone, "aarch64_sme_mopa",
+                             [IsStreaming, IsInOutZA],
+                             [ImmCheck<0, ImmCheck0_1>]>;
+  def SVMOPS_BF16_NW : SInst<"svmops_za16[_bf16]_m", "viPPdd", "b",
+                             MergeNone, "aarch64_sme_mops",
+                             [IsStreaming, IsInOutZA],
+                             [ImmCheck<0, ImmCheck0_1>]>;
+}
+
+let TargetGuard = "sme-f16f16" in {
+  def SVMOPA_F16_NW : SInst<"svmopa_za16[_f16]_m", "viPPdd", "h",
+                            MergeNone, "aarch64_sme_mopa",
+                            [IsStreaming, IsInOutZA],
+                            [ImmCheck<0, ImmCheck0_1>]>;
+  def SVMOPS_F16_NW : SInst<"svmops_za16[_f16]_m", "viPPdd", "h",
+                            MergeNone, "aarch64_sme_mops",
+                            [IsStreaming, IsInOutZA],
+                            [ImmCheck<0, ImmCheck0_1>]>;
+}
diff --git a/clang/include/clang/Driver/Distro.h b/clang/include/clang/Driver/Distro.h
index a8de94163e8b..1404e1686848 100644
--- a/clang/include/clang/Driver/Distro.h
+++ b/clang/include/clang/Driver/Distro.h
@@ -79,6 +79,7 @@ public:
     UbuntuLunar,
     UbuntuMantic,
     UbuntuNoble,
+    UbuntuOracular,
     UnknownDistro
   };
 
@@ -130,7 +131,7 @@ public:
   }
 
   bool IsUbuntu() const {
-    return DistroVal >= UbuntuHardy && DistroVal <= UbuntuNoble;
+    return DistroVal >= UbuntuHardy && DistroVal <= UbuntuOracular;
   }
 
   bool IsAlpineLinux() const { return DistroVal == AlpineLinux; }
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index 2ffc52bcb7ad..cc1538372d5f 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -424,11 +424,6 @@ public:
     return ClangExecutable.c_str();
   }
 
-  /// Get the path to where the clang executable was installed.
-  const char *getInstalledDir() const {
-    return Dir.c_str();
-  }
-
   bool isSaveTempsEnabled() const { return SaveTemps != SaveTempsNone; }
   bool isSaveTempsObj() const { return SaveTemps == SaveTempsObj; }
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 864da4e1157f..ed3f1b8b2981 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3383,10 +3383,10 @@ defm application_extension : BoolFOption<"application-extension",
           "Restrict code to those available for App Extensions">,
   NegFlag<SetFalse>>;
 defm relaxed_template_template_args : BoolFOption<"relaxed-template-template-args",
-  LangOpts<"RelaxedTemplateTemplateArgs">, DefaultFalse,
-  PosFlag<SetTrue, [], [ClangOption, CC1Option],
-          "Enable C++17 relaxed template template argument matching">,
-  NegFlag<SetFalse>>;
+  LangOpts<"RelaxedTemplateTemplateArgs">, DefaultTrue,
+  PosFlag<SetTrue, [], [], "Enable">,
+  NegFlag<SetFalse, [], [CC1Option], "Disable">,
+  BothFlags<[], [ClangOption], " C++17 relaxed template template argument matching">>;
 defm sized_deallocation : BoolFOption<"sized-deallocation",
   LangOpts<"SizedDeallocation">, DefaultFalse,
   PosFlag<SetTrue, [], [ClangOption, CC1Option],
@@ -3663,14 +3663,14 @@ defm rwpi : BoolFOption<"rwpi",
           "Generate read-write position independent code (ARM only)">,
   NegFlag<SetFalse, [], [ClangOption, FlangOption, CC1Option]>>;
 def fplugin_EQ : Joined<["-"], "fplugin=">, Group<f_Group>,
-  Flags<[NoXarchOption]>, MetaVarName<"<dsopath>">,
+  Flags<[NoXarchOption, NoArgumentUnused]>, MetaVarName<"<dsopath>">,
   HelpText<"Load the named plugin (dynamic shared object)">;
 def fplugin_arg : Joined<["-"], "fplugin-arg-">,
-  MetaVarName<"<name>-<arg>">,
+  MetaVarName<"<name>-<arg>">, Flags<[NoArgumentUnused]>,
   HelpText<"Pass <arg> to plugin <name>">;
 def fpass_plugin_EQ : Joined<["-"], "fpass-plugin=">,
   Group<f_Group>, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
-  MetaVarName<"<dsopath>">,
+  MetaVarName<"<dsopath>">, Flags<[NoArgumentUnused]>,
   HelpText<"Load pass plugin from a dynamic shared object file (only with new pass manager).">,
   MarshallingInfoStringVector<CodeGenOpts<"PassPlugins">>;
 defm tocdata : BoolOption<"m","tocdata",
@@ -3971,7 +3971,7 @@ def funroll_loops : Flag<["-"], "funroll-loops">, Group<f_Group>,
 def fno_unroll_loops : Flag<["-"], "fno-unroll-loops">, Group<f_Group>,
   HelpText<"Turn off loop unroller">, Visibility<[ClangOption, CC1Option]>;
 def ffinite_loops: Flag<["-"],  "ffinite-loops">, Group<f_Group>,
-  HelpText<"Assume all loops are finite.">, Visibility<[ClangOption, CC1Option]>;
+  HelpText<"Assume all non-trivial loops are finite.">, Visibility<[ClangOption, CC1Option]>;
 def fno_finite_loops: Flag<["-"], "fno-finite-loops">, Group<f_Group>,
   HelpText<"Do not assume that any loop is finite.">,
   Visibility<[ClangOption, CC1Option]>;
@@ -4159,6 +4159,11 @@ defm unique_section_names : BoolFOption<"unique-section-names",
   NegFlag<SetFalse, [], [ClangOption, CC1Option],
           "Don't use unique names for text and data sections">,
   PosFlag<SetTrue>>;
+defm separate_named_sections : BoolFOption<"separate-named-sections",
+  CodeGenOpts<"SeparateNamedSections">, DefaultFalse,
+  PosFlag<SetTrue, [], [ClangOption, CC1Option],
+          "Use separate unique sections for named sections (ELF Only)">,
+  NegFlag<SetFalse>>;
 
 defm split_machine_functions: BoolFOption<"split-machine-functions",
   CodeGenOpts<"SplitMachineFunctions">, DefaultFalse,
@@ -4175,6 +4180,14 @@ defm strict_return : BoolFOption<"strict-return",
 
 let Flags = [TargetSpecific] in {
 defm ptrauth_intrinsics : OptInCC1FFlag<"ptrauth-intrinsics", "Enable pointer authentication intrinsics">;
+defm ptrauth_calls : OptInCC1FFlag<"ptrauth-calls", "Enable signing and authentication of all indirect calls">;
+defm ptrauth_returns : OptInCC1FFlag<"ptrauth-returns", "Enable signing and authentication of return addresses">;
+defm ptrauth_auth_traps : OptInCC1FFlag<"ptrauth-auth-traps", "Enable traps on authentication failures">;
+defm ptrauth_vtable_pointer_address_discrimination :
+  OptInCC1FFlag<"ptrauth-vtable-pointer-address-discrimination", "Enable address discrimination of vtable pointers">;
+defm ptrauth_vtable_pointer_type_discrimination :
+  OptInCC1FFlag<"ptrauth-vtable-pointer-type-discrimination", "Enable type discrimination of vtable pointers">;
+defm ptrauth_init_fini : OptInCC1FFlag<"ptrauth-init-fini", "Enable signing of function pointers in init/fini arrays">;
 }
 
 def fenable_matrix : Flag<["-"], "fenable-matrix">, Group<f_Group>,
@@ -4895,34 +4908,34 @@ def mharden_sls_EQ : Joined<["-"], "mharden-sls=">, Group<m_Group>,
            " blr(ARM/AArch64), comdat(ARM/AArch64), nocomdat(ARM/AArch64),"
            " return(X86), indirect-jmp(X86)">;
 
-def msimd128 : Flag<["-"], "msimd128">, Group<m_wasm_Features_Group>;
-def mno_simd128 : Flag<["-"], "mno-simd128">, Group<m_wasm_Features_Group>;
-def mrelaxed_simd : Flag<["-"], "mrelaxed-simd">, Group<m_wasm_Features_Group>;
-def mno_relaxed_simd : Flag<["-"], "mno-relaxed-simd">, Group<m_wasm_Features_Group>;
-def mhalf_precision : Flag<["-"], "mhalf-precision">, Group<m_wasm_Features_Group>;
-def mno_half_precision : Flag<["-"], "mno-half-precision">, Group<m_wasm_Features_Group>;
-def mnontrapping_fptoint : Flag<["-"], "mnontrapping-fptoint">, Group<m_wasm_Features_Group>;
-def mno_nontrapping_fptoint : Flag<["-"], "mno-nontrapping-fptoint">, Group<m_wasm_Features_Group>;
-def msign_ext : Flag<["-"], "msign-ext">, Group<m_wasm_Features_Group>;
-def mno_sign_ext : Flag<["-"], "mno-sign-ext">, Group<m_wasm_Features_Group>;
-def mexception_handing : Flag<["-"], "mexception-handling">, Group<m_wasm_Features_Group>;
-def mno_exception_handing : Flag<["-"], "mno-exception-handling">, Group<m_wasm_Features_Group>;
 def matomics : Flag<["-"], "matomics">, Group<m_wasm_Features_Group>;
 def mno_atomics : Flag<["-"], "mno-atomics">, Group<m_wasm_Features_Group>;
 def mbulk_memory : Flag<["-"], "mbulk-memory">, Group<m_wasm_Features_Group>;
 def mno_bulk_memory : Flag<["-"], "mno-bulk-memory">, Group<m_wasm_Features_Group>;
-def mmutable_globals : Flag<["-"], "mmutable-globals">, Group<m_wasm_Features_Group>;
-def mno_mutable_globals : Flag<["-"], "mno-mutable-globals">, Group<m_wasm_Features_Group>;
-def mmultivalue : Flag<["-"], "mmultivalue">, Group<m_wasm_Features_Group>;
-def mno_multivalue : Flag<["-"], "mno-multivalue">, Group<m_wasm_Features_Group>;
-def mtail_call : Flag<["-"], "mtail-call">, Group<m_wasm_Features_Group>;
-def mno_tail_call : Flag<["-"], "mno-tail-call">, Group<m_wasm_Features_Group>;
-def mreference_types : Flag<["-"], "mreference-types">, Group<m_wasm_Features_Group>;
-def mno_reference_types : Flag<["-"], "mno-reference-types">, Group<m_wasm_Features_Group>;
+def mexception_handing : Flag<["-"], "mexception-handling">, Group<m_wasm_Features_Group>;
+def mno_exception_handing : Flag<["-"], "mno-exception-handling">, Group<m_wasm_Features_Group>;
 def mextended_const : Flag<["-"], "mextended-const">, Group<m_wasm_Features_Group>;
 def mno_extended_const : Flag<["-"], "mno-extended-const">, Group<m_wasm_Features_Group>;
+def mhalf_precision : Flag<["-"], "mhalf-precision">, Group<m_wasm_Features_Group>;
+def mno_half_precision : Flag<["-"], "mno-half-precision">, Group<m_wasm_Features_Group>;
 def mmultimemory : Flag<["-"], "mmultimemory">, Group<m_wasm_Features_Group>;
 def mno_multimemory : Flag<["-"], "mno-multimemory">, Group<m_wasm_Features_Group>;
+def mmultivalue : Flag<["-"], "mmultivalue">, Group<m_wasm_Features_Group>;
+def mno_multivalue : Flag<["-"], "mno-multivalue">, Group<m_wasm_Features_Group>;
+def mmutable_globals : Flag<["-"], "mmutable-globals">, Group<m_wasm_Features_Group>;
+def mno_mutable_globals : Flag<["-"], "mno-mutable-globals">, Group<m_wasm_Features_Group>;
+def mnontrapping_fptoint : Flag<["-"], "mnontrapping-fptoint">, Group<m_wasm_Features_Group>;
+def mno_nontrapping_fptoint : Flag<["-"], "mno-nontrapping-fptoint">, Group<m_wasm_Features_Group>;
+def mreference_types : Flag<["-"], "mreference-types">, Group<m_wasm_Features_Group>;
+def mno_reference_types : Flag<["-"], "mno-reference-types">, Group<m_wasm_Features_Group>;
+def mrelaxed_simd : Flag<["-"], "mrelaxed-simd">, Group<m_wasm_Features_Group>;
+def mno_relaxed_simd : Flag<["-"], "mno-relaxed-simd">, Group<m_wasm_Features_Group>;
+def msign_ext : Flag<["-"], "msign-ext">, Group<m_wasm_Features_Group>;
+def mno_sign_ext : Flag<["-"], "mno-sign-ext">, Group<m_wasm_Features_Group>;
+def msimd128 : Flag<["-"], "msimd128">, Group<m_wasm_Features_Group>;
+def mno_simd128 : Flag<["-"], "mno-simd128">, Group<m_wasm_Features_Group>;
+def mtail_call : Flag<["-"], "mtail-call">, Group<m_wasm_Features_Group>;
+def mno_tail_call : Flag<["-"], "mno-tail-call">, Group<m_wasm_Features_Group>;
 def mexec_model_EQ : Joined<["-"], "mexec-model=">, Group<m_wasm_Features_Driver_Group>,
                      Values<"command,reactor">,
                      HelpText<"Execution model (WebAssembly only)">,
@@ -5072,6 +5085,10 @@ def maix_small_local_dynamic_tls : Flag<["-"], "maix-small-local-dynamic-tls">,
            "where the offset from the TLS base is encoded as an "
            "immediate operand (AIX 64-bit only). "
            "This access sequence is not used for variables larger than 32KB.">;
+def maix_shared_lib_tls_model_opt : Flag<["-"], "maix-shared-lib-tls-model-opt">,
+  Group<m_ppc_Features_Group>,
+  HelpText<"For shared library loaded with the main program, change local-dynamic access(es) "
+           "to initial-exec access(es) at the function level (AIX 64-bit only).">;
 def maix_struct_return : Flag<["-"], "maix-struct-return">,
   Group<m_Group>, Visibility<[ClangOption, CC1Option]>,
   HelpText<"Return all structs in memory (PPC32 only)">,
@@ -5698,7 +5715,7 @@ def whatsloaded : Flag<["-"], "whatsloaded">;
 def why_load : Flag<["-"], "why_load">;
 def whyload : Flag<["-"], "whyload">, Alias<why_load>;
 def w : Flag<["-"], "w">, HelpText<"Suppress all warnings">,
-  Visibility<[ClangOption, CC1Option]>,
+  Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
   MarshallingInfoFlag<DiagnosticOpts<"IgnoreWarnings">>;
 def x : JoinedOrSeparate<["-"], "x">,
 Flags<[NoXarchOption]>,
@@ -6256,9 +6273,9 @@ def mno_gather : Flag<["-"], "mno-gather">, Group<m_Group>,
 def mno_scatter : Flag<["-"], "mno-scatter">, Group<m_Group>,
                   HelpText<"Disable generation of scatter instructions in auto-vectorization(x86 only)">;
 def mapx_features_EQ : CommaJoined<["-"], "mapx-features=">, Group<m_x86_Features_Group>,
-    HelpText<"Enable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,cf">;
+    HelpText<"Enable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf">;
 def mno_apx_features_EQ : CommaJoined<["-"], "mno-apx-features=">, Group<m_x86_Features_Group>,
-    HelpText<"Disable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,cf">;
+    HelpText<"Disable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf">;
 // Features egpr, push2pop2, ppx and ndd are validated with llvm-test-suite && cpu2017 on Intel SDE.
 // For stability, we turn on these features only for -mapxf. After a feature pass the validation,
 // we will add it to -mapxf.
@@ -7087,6 +7104,11 @@ def mlink_bitcode_file : Separate<["-"], "mlink-bitcode-file">,
 def mlink_builtin_bitcode : Separate<["-"], "mlink-builtin-bitcode">,
   HelpText<"Link and internalize needed symbols from the given bitcode file "
            "before performing optimizations.">;
+defm link_builtin_bitcode_postopt: BoolMOption<"link-builtin-bitcode-postopt",
+  CodeGenOpts<"LinkBitcodePostopt">, DefaultFalse,
+  PosFlag<SetTrue, [], [ClangOption], "Link builtin bitcodes after the "
+  "optimization pipeline">,
+  NegFlag<SetFalse, [], [ClangOption]>>;
 def vectorize_loops : Flag<["-"], "vectorize-loops">,
   HelpText<"Run the Loop vectorization passes">,
   MarshallingInfoFlag<CodeGenOpts<"VectorizeLoop">>;
diff --git a/clang/include/clang/ExtractAPI/API.h b/clang/include/clang/ExtractAPI/API.h
index d323e1668a72..bf291074fd06 100644
--- a/clang/include/clang/ExtractAPI/API.h
+++ b/clang/include/clang/ExtractAPI/API.h
@@ -266,6 +266,8 @@ struct APIRecord {
 
   AccessControl Access;
 
+  RecordKind KindForDisplay;
+
 private:
   const RecordKind Kind;
   friend class RecordContext;
@@ -277,6 +279,7 @@ public:
   APIRecord *getNextInContext() const { return NextInContext; }
 
   RecordKind getKind() const { return Kind; }
+  RecordKind getKindForDisplay() const { return KindForDisplay; }
 
   static APIRecord *castFromRecordContext(const RecordContext *Ctx);
   static RecordContext *castToRecordContext(const APIRecord *Record);
@@ -293,10 +296,10 @@ public:
         Availability(std::move(Availability)), Linkage(Linkage),
         Comment(Comment), Declaration(Declaration), SubHeading(SubHeading),
         IsFromSystemHeader(IsFromSystemHeader), Access(std::move(Access)),
-        Kind(Kind) {}
+        KindForDisplay(Kind), Kind(Kind) {}
 
   APIRecord(RecordKind Kind, StringRef USR, StringRef Name)
-      : USR(USR), Name(Name), Kind(Kind) {}
+      : USR(USR), Name(Name), KindForDisplay(Kind), Kind(Kind) {}
 
   // Pure virtual destructor to make APIRecord abstract
   virtual ~APIRecord() = 0;
diff --git a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h
index 97cc457ea2a9..8ccebe457ed5 100644
--- a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h
+++ b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h
@@ -194,6 +194,15 @@ protected:
     return Bases;
   }
 
+  APIRecord::RecordKind getKindForDisplay(const CXXRecordDecl *Decl) {
+    if (Decl->isUnion())
+      return APIRecord::RK_Union;
+    if (Decl->isStruct())
+      return APIRecord::RK_Struct;
+
+    return APIRecord::RK_CXXClass;
+  }
+
   StringRef getOwningModuleName(const Decl &D) {
     if (auto *OwningModule = D.getImportedOwningModule())
       return OwningModule->Name;
@@ -599,13 +608,6 @@ bool ExtractAPIVisitorBase<Derived>::VisitCXXRecordDecl(
   DeclarationFragments SubHeading =
       DeclarationFragmentsBuilder::getSubHeading(Decl);
 
-  APIRecord::RecordKind Kind;
-  if (Decl->isUnion())
-    Kind = APIRecord::RecordKind::RK_Union;
-  else if (Decl->isStruct())
-    Kind = APIRecord::RecordKind::RK_Struct;
-  else
-    Kind = APIRecord::RecordKind::RK_CXXClass;
   auto Access = DeclarationFragmentsBuilder::getAccessControl(Decl);
 
   CXXClassRecord *Record;
@@ -619,13 +621,15 @@ bool ExtractAPIVisitorBase<Derived>::VisitCXXRecordDecl(
         AvailabilityInfo::createFromDecl(Decl), Comment, Declaration,
         SubHeading, Template(Decl->getDescribedClassTemplate()), Access,
         isInSystemHeader(Decl));
-  } else
+  } else {
     Record = API.createRecord<CXXClassRecord>(
         USR, Name, createHierarchyInformationForDecl(*Decl), Loc,
         AvailabilityInfo::createFromDecl(Decl), Comment, Declaration,
-        SubHeading, Kind, Access, isInSystemHeader(Decl),
-        isEmbeddedInVarDeclarator(*Decl));
+        SubHeading, APIRecord::RecordKind::RK_CXXClass, Access,
+        isInSystemHeader(Decl), isEmbeddedInVarDeclarator(*Decl));
+  }
 
+  Record->KindForDisplay = getKindForDisplay(Decl);
   Record->Bases = getBases(Decl);
 
   return true;
@@ -849,6 +853,7 @@ bool ExtractAPIVisitorBase<Derived>::
       Template(Decl), DeclarationFragmentsBuilder::getAccessControl(Decl),
       isInSystemHeader(Decl));
 
+  CTPSR->KindForDisplay = getKindForDisplay(Decl);
   CTPSR->Bases = getBases(Decl);
 
   return true;
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index 48f5fb441575..74893f23210c 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -375,8 +375,25 @@ struct FormatStyle {
     ///   }
     /// \endcode
     bool AcrossComments;
-    /// Whether aligned case labels are aligned on the colon, or on the
-    /// , or on the tokens after the colon.
+    /// Whether to align the case arrows when aligning short case expressions.
+    /// \code{.java}
+    ///   true:
+    ///   i = switch (day) {
+    ///     case THURSDAY, SATURDAY -> 8;
+    ///     case WEDNESDAY          -> 9;
+    ///     default                 -> 0;
+    ///   };
+    ///
+    ///   false:
+    ///   i = switch (day) {
+    ///     case THURSDAY, SATURDAY -> 8;
+    ///     case WEDNESDAY ->          9;
+    ///     default ->                 0;
+    ///   };
+    /// \endcode
+    bool AlignCaseArrows;
+    /// Whether aligned case labels are aligned on the colon, or on the tokens
+    /// after the colon.
     /// \code
     ///   true:
     ///   switch (level) {
@@ -396,12 +413,14 @@ struct FormatStyle {
     bool operator==(const ShortCaseStatementsAlignmentStyle &R) const {
       return Enabled == R.Enabled && AcrossEmptyLines == R.AcrossEmptyLines &&
              AcrossComments == R.AcrossComments &&
+             AlignCaseArrows == R.AlignCaseArrows &&
              AlignCaseColons == R.AlignCaseColons;
     }
   };
 
   /// Style of aligning consecutive short case labels.
-  /// Only applies if ``AllowShortCaseLabelsOnASingleLine`` is ``true``.
+  /// Only applies if ``AllowShortCaseExpressionOnASingleLine`` or
+  /// ``AllowShortCaseLabelsOnASingleLine`` is ``true``.
   ///
   /// \code{.yaml}
   ///   # Example of usage:
@@ -724,6 +743,19 @@ struct FormatStyle {
   /// \version 3.5
   ShortBlockStyle AllowShortBlocksOnASingleLine;
 
+  /// Whether to merge a short switch labeled rule into a single line.
+  /// \code{.java}
+  ///   true:                               false:
+  ///   switch (a) {           vs.          switch (a) {
+  ///   case 1 -> 1;                        case 1 ->
+  ///   default -> 0;                         1;
+  ///   };                                  default ->
+  ///                                         0;
+  ///                                       };
+  /// \endcode
+  /// \version 19
+  bool AllowShortCaseExpressionOnASingleLine;
+
   /// If ``true``, short case labels will be contracted to a single line.
   /// \code
   ///   true:                                   false:
@@ -4923,6 +4955,8 @@ struct FormatStyle {
            AllowBreakBeforeNoexceptSpecifier ==
                R.AllowBreakBeforeNoexceptSpecifier &&
            AllowShortBlocksOnASingleLine == R.AllowShortBlocksOnASingleLine &&
+           AllowShortCaseExpressionOnASingleLine ==
+               R.AllowShortCaseExpressionOnASingleLine &&
            AllowShortCaseLabelsOnASingleLine ==
                R.AllowShortCaseLabelsOnASingleLine &&
            AllowShortCompoundRequirementOnASingleLine ==
diff --git a/clang/include/clang/Frontend/MultiplexConsumer.h b/clang/include/clang/Frontend/MultiplexConsumer.h
index f29c8e92fded..4ed0d86d3cdf 100644
--- a/clang/include/clang/Frontend/MultiplexConsumer.h
+++ b/clang/include/clang/Frontend/MultiplexConsumer.h
@@ -32,7 +32,7 @@ public:
   MultiplexASTDeserializationListener(
       const std::vector<ASTDeserializationListener *> &L);
   void ReaderInitialized(ASTReader *Reader) override;
-  void IdentifierRead(serialization::IdentID ID, IdentifierInfo *II) override;
+  void IdentifierRead(serialization::IdentifierID ID, IdentifierInfo *II) override;
   void MacroRead(serialization::MacroID ID, MacroInfo *MI) override;
   void TypeRead(serialization::TypeIdx Idx, QualType T) override;
   void DeclRead(GlobalDeclID ID, const Decl *D) override;
diff --git a/clang/include/clang/InstallAPI/MachO.h b/clang/include/clang/InstallAPI/MachO.h
index 9da91a62e233..1ea544412f4c 100644
--- a/clang/include/clang/InstallAPI/MachO.h
+++ b/clang/include/clang/InstallAPI/MachO.h
@@ -45,6 +45,8 @@ using SimpleSymbol = llvm::MachO::SimpleSymbol;
 using FileType = llvm::MachO::FileType;
 using PackedVersion = llvm::MachO::PackedVersion;
 using PathSeq = llvm::MachO::PathSeq;
+using PlatformType = llvm::MachO::PlatformType;
+using PathToPlatformSeq = llvm::MachO::PathToPlatformSeq;
 using Target = llvm::MachO::Target;
 using TargetList = llvm::MachO::TargetList;
 
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index daefd4f28f01..61589fb7766f 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -3553,6 +3553,23 @@ private:
   OMPClause *ParseOpenMPVarListClause(OpenMPDirectiveKind DKind,
                                       OpenMPClauseKind Kind, bool ParseOnly);
 
+  /// Parses a clause consisting of a list of expressions.
+  ///
+  /// \param Kind          The clause to parse.
+  /// \param ClauseNameLoc [out] The location of the clause name.
+  /// \param OpenLoc       [out] The location of '('.
+  /// \param CloseLoc      [out] The location of ')'.
+  /// \param Exprs         [out] The parsed expressions.
+  /// \param ReqIntConst   If true, each expression must be an integer constant.
+  ///
+  /// \return Whether the clause was parsed successfully.
+  bool ParseOpenMPExprListClause(OpenMPClauseKind Kind,
+                                 SourceLocation &ClauseNameLoc,
+                                 SourceLocation &OpenLoc,
+                                 SourceLocation &CloseLoc,
+                                 SmallVectorImpl<Expr *> &Exprs,
+                                 bool ReqIntConst = false);
+
   /// Parses and creates OpenMP 5.0 iterators expression:
   /// <iterators> = 'iterator' '(' { [ <iterator-type> ] identifier =
   /// <range-specification> }+ ')'
@@ -3632,6 +3649,13 @@ private:
     // Wait constructs, we likely want to put that information in here as well.
   };
 
+  struct OpenACCWaitParseInfo {
+    bool Failed = false;
+    Expr *DevNumExpr = nullptr;
+    SourceLocation QueuesLoc;
+    SmallVector<Expr *> QueueIdExprs;
+  };
+
   /// Represents the 'error' state of parsing an OpenACC Clause, and stores
   /// whether we can continue parsing, or should give up on the directive.
   enum class OpenACCParseCanContinue { Cannot = 0, Can = 1 };
@@ -3674,7 +3698,8 @@ private:
   /// Parses the clause-list for an OpenACC directive.
   SmallVector<OpenACCClause *>
   ParseOpenACCClauseList(OpenACCDirectiveKind DirKind);
-  bool ParseOpenACCWaitArgument(SourceLocation Loc, bool IsDirective);
+  OpenACCWaitParseInfo ParseOpenACCWaitArgument(SourceLocation Loc,
+                                                bool IsDirective);
   /// Parses the clause of the 'bind' argument, which can be a string literal or
   /// an ID expression.
   ExprResult ParseOpenACCBindClauseArgument();
@@ -3698,7 +3723,9 @@ private:
   bool ParseOpenACCDeviceTypeList();
   /// Parses the 'async-argument', which is an integral value with two
   /// 'special' values that are likely negative (but come from Macros).
-  ExprResult ParseOpenACCAsyncArgument();
+  OpenACCIntExprParseResult ParseOpenACCAsyncArgument(OpenACCDirectiveKind DK,
+                                                      OpenACCClauseKind CK,
+                                                      SourceLocation Loc);
   /// Parses the 'size-expr', which is an integral value, or an asterisk.
   bool ParseOpenACCSizeExpr();
   /// Parses a comma delimited list of 'size-expr's.
diff --git a/clang/include/clang/Sema/Scope.h b/clang/include/clang/Sema/Scope.h
index 1752a25111a7..084db7303421 100644
--- a/clang/include/clang/Sema/Scope.h
+++ b/clang/include/clang/Sema/Scope.h
@@ -159,6 +159,9 @@ public:
 
     /// This is a scope of type alias declaration.
     TypeAliasScope = 0x20000000,
+
+    /// This is a scope of friend declaration.
+    FriendScope = 0x40000000,
   };
 
 private:
@@ -586,6 +589,9 @@ public:
   /// Determine whether this scope is a type alias scope.
   bool isTypeAliasScope() const { return getFlags() & Scope::TypeAliasScope; }
 
+  /// Determine whether this scope is a friend scope.
+  bool isFriendScope() const { return getFlags() & Scope::FriendScope; }
+
   /// Returns if rhs has a higher scope depth than this.
   ///
   /// The caller is responsible for calling this only if one of the two scopes
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index a80ac6dbc761..4efd3878e861 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -9739,6 +9739,9 @@ public:
                      const PartialDiagnostic &CandidateDiag,
                      bool Complain = true, QualType TargetType = QualType());
 
+  FunctionDecl *getMoreConstrainedFunction(FunctionDecl *FD1,
+                                           FunctionDecl *FD2);
+
   ///@}
 
   //
@@ -10199,7 +10202,9 @@ public:
         S.ExprEvalContexts.back().InImmediateFunctionContext =
             FD->isImmediateFunction() ||
             S.ExprEvalContexts[S.ExprEvalContexts.size() - 2]
-                .isConstantEvaluated();
+                .isConstantEvaluated() ||
+            S.ExprEvalContexts[S.ExprEvalContexts.size() - 2]
+                .isImmediateFunctionContext();
         S.ExprEvalContexts.back().InImmediateEscalatingFunctionContext =
             S.getLangOpts().CPlusPlus20 && FD->isImmediateEscalating();
       } else
diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h
index edb0cbb7c5d5..e684ee6b2be1 100644
--- a/clang/include/clang/Sema/SemaOpenACC.h
+++ b/clang/include/clang/Sema/SemaOpenACC.h
@@ -50,10 +50,18 @@ public:
 
     struct VarListDetails {
       SmallVector<Expr *> VarList;
+      bool IsReadOnly;
+      bool IsZero;
+    };
+
+    struct WaitDetails {
+      Expr *DevNumExpr;
+      SourceLocation QueuesLoc;
+      SmallVector<Expr *> QueueIdExprs;
     };
 
     std::variant<std::monostate, DefaultDetails, ConditionDetails,
-                 IntExprDetails, VarListDetails>
+                 IntExprDetails, VarListDetails, WaitDetails>
         Details = std::monostate{};
 
   public:
@@ -99,16 +107,55 @@ public:
     unsigned getNumIntExprs() const {
       assert((ClauseKind == OpenACCClauseKind::NumGangs ||
               ClauseKind == OpenACCClauseKind::NumWorkers ||
+              ClauseKind == OpenACCClauseKind::Async ||
               ClauseKind == OpenACCClauseKind::VectorLength) &&
              "Parsed clause kind does not have a int exprs");
+
+      // 'async' and 'wait' have an optional IntExpr, so be tolerant of that.
+      if ((ClauseKind == OpenACCClauseKind::Async ||
+           ClauseKind == OpenACCClauseKind::Wait) &&
+          std::holds_alternative<std::monostate>(Details))
+        return 0;
       return std::get<IntExprDetails>(Details).IntExprs.size();
     }
 
+    SourceLocation getQueuesLoc() const {
+      assert(ClauseKind == OpenACCClauseKind::Wait &&
+             "Parsed clause kind does not have a queues location");
+
+      if (std::holds_alternative<std::monostate>(Details))
+        return SourceLocation{};
+
+      return std::get<WaitDetails>(Details).QueuesLoc;
+    }
+
+    Expr *getDevNumExpr() const {
+      assert(ClauseKind == OpenACCClauseKind::Wait &&
+             "Parsed clause kind does not have a device number expr");
+
+      if (std::holds_alternative<std::monostate>(Details))
+        return nullptr;
+
+      return std::get<WaitDetails>(Details).DevNumExpr;
+    }
+
+    ArrayRef<Expr *> getQueueIdExprs() const {
+      assert(ClauseKind == OpenACCClauseKind::Wait &&
+             "Parsed clause kind does not have a queue id expr list");
+
+      if (std::holds_alternative<std::monostate>(Details))
+        return ArrayRef<Expr *>{std::nullopt};
+
+      return std::get<WaitDetails>(Details).QueueIdExprs;
+    }
+
     ArrayRef<Expr *> getIntExprs() {
       assert((ClauseKind == OpenACCClauseKind::NumGangs ||
               ClauseKind == OpenACCClauseKind::NumWorkers ||
+              ClauseKind == OpenACCClauseKind::Async ||
               ClauseKind == OpenACCClauseKind::VectorLength) &&
              "Parsed clause kind does not have a int exprs");
+
       return std::get<IntExprDetails>(Details).IntExprs;
     }
 
@@ -117,7 +164,24 @@ public:
     }
 
     ArrayRef<Expr *> getVarList() {
-      assert(ClauseKind == OpenACCClauseKind::Private &&
+      assert((ClauseKind == OpenACCClauseKind::Private ||
+              ClauseKind == OpenACCClauseKind::NoCreate ||
+              ClauseKind == OpenACCClauseKind::Present ||
+              ClauseKind == OpenACCClauseKind::Copy ||
+              ClauseKind == OpenACCClauseKind::PCopy ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopy ||
+              ClauseKind == OpenACCClauseKind::CopyIn ||
+              ClauseKind == OpenACCClauseKind::PCopyIn ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopyIn ||
+              ClauseKind == OpenACCClauseKind::CopyOut ||
+              ClauseKind == OpenACCClauseKind::PCopyOut ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopyOut ||
+              ClauseKind == OpenACCClauseKind::Create ||
+              ClauseKind == OpenACCClauseKind::PCreate ||
+              ClauseKind == OpenACCClauseKind::PresentOrCreate ||
+              ClauseKind == OpenACCClauseKind::Attach ||
+              ClauseKind == OpenACCClauseKind::DevicePtr ||
+              ClauseKind == OpenACCClauseKind::FirstPrivate) &&
              "Parsed clause kind does not have a var-list");
       return std::get<VarListDetails>(Details).VarList;
     }
@@ -126,6 +190,25 @@ public:
       return const_cast<OpenACCParsedClause *>(this)->getVarList();
     }
 
+    bool isReadOnly() const {
+      assert((ClauseKind == OpenACCClauseKind::CopyIn ||
+              ClauseKind == OpenACCClauseKind::PCopyIn ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopyIn) &&
+             "Only copyin accepts 'readonly:' tag");
+      return std::get<VarListDetails>(Details).IsReadOnly;
+    }
+
+    bool isZero() const {
+      assert((ClauseKind == OpenACCClauseKind::CopyOut ||
+              ClauseKind == OpenACCClauseKind::PCopyOut ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopyOut ||
+              ClauseKind == OpenACCClauseKind::Create ||
+              ClauseKind == OpenACCClauseKind::PCreate ||
+              ClauseKind == OpenACCClauseKind::PresentOrCreate) &&
+             "Only copyout/create accepts 'zero' tag");
+      return std::get<VarListDetails>(Details).IsZero;
+    }
+
     void setLParenLoc(SourceLocation EndLoc) { LParenLoc = EndLoc; }
     void setEndLoc(SourceLocation EndLoc) { ClauseRange.setEnd(EndLoc); }
 
@@ -152,6 +235,7 @@ public:
     void setIntExprDetails(ArrayRef<Expr *> IntExprs) {
       assert((ClauseKind == OpenACCClauseKind::NumGangs ||
               ClauseKind == OpenACCClauseKind::NumWorkers ||
+              ClauseKind == OpenACCClauseKind::Async ||
               ClauseKind == OpenACCClauseKind::VectorLength) &&
              "Parsed clause kind does not have a int exprs");
       Details = IntExprDetails{{IntExprs.begin(), IntExprs.end()}};
@@ -159,21 +243,88 @@ public:
     void setIntExprDetails(llvm::SmallVector<Expr *> &&IntExprs) {
       assert((ClauseKind == OpenACCClauseKind::NumGangs ||
               ClauseKind == OpenACCClauseKind::NumWorkers ||
+              ClauseKind == OpenACCClauseKind::Async ||
               ClauseKind == OpenACCClauseKind::VectorLength) &&
              "Parsed clause kind does not have a int exprs");
       Details = IntExprDetails{std::move(IntExprs)};
     }
 
-    void setVarListDetails(ArrayRef<Expr *> VarList) {
-      assert(ClauseKind == OpenACCClauseKind::Private &&
+    void setVarListDetails(ArrayRef<Expr *> VarList, bool IsReadOnly,
+                           bool IsZero) {
+      assert((ClauseKind == OpenACCClauseKind::Private ||
+              ClauseKind == OpenACCClauseKind::NoCreate ||
+              ClauseKind == OpenACCClauseKind::Present ||
+              ClauseKind == OpenACCClauseKind::Copy ||
+              ClauseKind == OpenACCClauseKind::PCopy ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopy ||
+              ClauseKind == OpenACCClauseKind::CopyIn ||
+              ClauseKind == OpenACCClauseKind::PCopyIn ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopyIn ||
+              ClauseKind == OpenACCClauseKind::CopyOut ||
+              ClauseKind == OpenACCClauseKind::PCopyOut ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopyOut ||
+              ClauseKind == OpenACCClauseKind::Create ||
+              ClauseKind == OpenACCClauseKind::PCreate ||
+              ClauseKind == OpenACCClauseKind::PresentOrCreate ||
+              ClauseKind == OpenACCClauseKind::Attach ||
+              ClauseKind == OpenACCClauseKind::DevicePtr ||
+              ClauseKind == OpenACCClauseKind::FirstPrivate) &&
              "Parsed clause kind does not have a var-list");
-      Details = VarListDetails{{VarList.begin(), VarList.end()}};
+      assert((!IsReadOnly || ClauseKind == OpenACCClauseKind::CopyIn ||
+              ClauseKind == OpenACCClauseKind::PCopyIn ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopyIn) &&
+             "readonly: tag only valid on copyin");
+      assert((!IsZero || ClauseKind == OpenACCClauseKind::CopyOut ||
+              ClauseKind == OpenACCClauseKind::PCopyOut ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopyOut ||
+              ClauseKind == OpenACCClauseKind::Create ||
+              ClauseKind == OpenACCClauseKind::PCreate ||
+              ClauseKind == OpenACCClauseKind::PresentOrCreate) &&
+             "zero: tag only valid on copyout/create");
+      Details =
+          VarListDetails{{VarList.begin(), VarList.end()}, IsReadOnly, IsZero};
     }
 
-    void setVarListDetails(llvm::SmallVector<Expr *> &&VarList) {
-      assert(ClauseKind == OpenACCClauseKind::Private &&
+    void setVarListDetails(llvm::SmallVector<Expr *> &&VarList, bool IsReadOnly,
+                           bool IsZero) {
+      assert((ClauseKind == OpenACCClauseKind::Private ||
+              ClauseKind == OpenACCClauseKind::NoCreate ||
+              ClauseKind == OpenACCClauseKind::Present ||
+              ClauseKind == OpenACCClauseKind::Copy ||
+              ClauseKind == OpenACCClauseKind::PCopy ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopy ||
+              ClauseKind == OpenACCClauseKind::CopyIn ||
+              ClauseKind == OpenACCClauseKind::PCopyIn ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopyIn ||
+              ClauseKind == OpenACCClauseKind::CopyOut ||
+              ClauseKind == OpenACCClauseKind::PCopyOut ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopyOut ||
+              ClauseKind == OpenACCClauseKind::Create ||
+              ClauseKind == OpenACCClauseKind::PCreate ||
+              ClauseKind == OpenACCClauseKind::PresentOrCreate ||
+              ClauseKind == OpenACCClauseKind::Attach ||
+              ClauseKind == OpenACCClauseKind::DevicePtr ||
+              ClauseKind == OpenACCClauseKind::FirstPrivate) &&
              "Parsed clause kind does not have a var-list");
-      Details = VarListDetails{std::move(VarList)};
+      assert((!IsReadOnly || ClauseKind == OpenACCClauseKind::CopyIn ||
+              ClauseKind == OpenACCClauseKind::PCopyIn ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopyIn) &&
+             "readonly: tag only valid on copyin");
+      assert((!IsZero || ClauseKind == OpenACCClauseKind::CopyOut ||
+              ClauseKind == OpenACCClauseKind::PCopyOut ||
+              ClauseKind == OpenACCClauseKind::PresentOrCopyOut ||
+              ClauseKind == OpenACCClauseKind::Create ||
+              ClauseKind == OpenACCClauseKind::PCreate ||
+              ClauseKind == OpenACCClauseKind::PresentOrCreate) &&
+             "zero: tag only valid on copyout/create");
+      Details = VarListDetails{std::move(VarList), IsReadOnly, IsZero};
+    }
+
+    void setWaitDetails(Expr *DevNum, SourceLocation QueuesLoc,
+                        llvm::SmallVector<Expr *> &&IntExprs) {
+      assert(ClauseKind == OpenACCClauseKind::Wait &&
+             "Parsed clause kind does not have a wait-details");
+      Details = WaitDetails{DevNum, QueuesLoc, std::move(IntExprs)};
     }
   };
 
@@ -224,6 +375,10 @@ public:
   /// declaration reference to a variable of the correct type.
   ExprResult ActOnVar(Expr *VarExpr);
 
+  /// Called to check the 'var' type is a variable of pointer type, necessary
+  /// for 'deviceptr' and 'attach' clauses. Returns true on success.
+  bool CheckVarIsPointerType(OpenACCClauseKind ClauseKind, Expr *VarExpr);
+
   /// Checks and creates an Array Section used in an OpenACC construct/clause.
   ExprResult ActOnArraySectionExpr(Expr *Base, SourceLocation LBLoc,
                                    Expr *LowerBound,
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index a8df5a0bda08..d3538e43d3d7 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -23,6 +23,7 @@
 #include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/OperatorKinds.h"
 #include "clang/Basic/SourceLocation.h"
+#include "clang/Serialization/SourceLocationEncoding.h"
 #include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/Bitstream/BitCodes.h"
 #include <cassert>
@@ -60,6 +61,9 @@ const unsigned VERSION_MINOR = 1;
 /// and start at 1. 0 is reserved for NULL.
 using IdentifierID = uint32_t;
 
+/// The number of predefined identifier IDs.
+const unsigned int NUM_PREDEF_IDENT_IDS = 1;
+
 /// An ID number that refers to a declaration in an AST file. See the comments
 /// in DeclIDBase for details.
 using DeclID = DeclIDBase::DeclID;
@@ -122,12 +126,6 @@ struct UnsafeQualTypeDenseMapInfo {
   }
 };
 
-/// An ID number that refers to an identifier in an AST file.
-using IdentID = uint32_t;
-
-/// The number of predefined identifier IDs.
-const unsigned int NUM_PREDEF_IDENT_IDS = 1;
-
 /// An ID number that refers to a macro in an AST file.
 using MacroID = uint32_t;
 
@@ -165,99 +163,95 @@ using SubmoduleID = uint32_t;
 /// The number of predefined submodule IDs.
 const unsigned int NUM_PREDEF_SUBMODULE_IDS = 1;
 
+/// 32 aligned uint64_t in the AST file. Use splitted 64-bit integer into
+/// low/high parts to keep structure alignment 32-bit (it is important
+/// because blobs in bitstream are 32-bit aligned). This structure is
+/// serialized "as is" to the AST file.
+class UnalignedUInt64 {
+  uint32_t BitLow = 0;
+  uint32_t BitHigh = 0;
+
+public:
+  UnalignedUInt64() = default;
+  UnalignedUInt64(uint64_t BitOffset) { set(BitOffset); }
+
+  void set(uint64_t Offset) {
+    BitLow = Offset;
+    BitHigh = Offset >> 32;
+  }
+
+  uint64_t get() const { return BitLow | (uint64_t(BitHigh) << 32); }
+};
+
 /// Source range/offset of a preprocessed entity.
-struct PPEntityOffset {
+class PPEntityOffset {
+  using RawLocEncoding = SourceLocationEncoding::RawLocEncoding;
+
   /// Raw source location of beginning of range.
-  SourceLocation::UIntTy Begin;
+  UnalignedUInt64 Begin;
 
   /// Raw source location of end of range.
-  SourceLocation::UIntTy End;
+  UnalignedUInt64 End;
 
   /// Offset in the AST file relative to ModuleFile::MacroOffsetsBase.
   uint32_t BitOffset;
 
-  PPEntityOffset(SourceRange R, uint32_t BitOffset)
-      : Begin(R.getBegin().getRawEncoding()), End(R.getEnd().getRawEncoding()),
-        BitOffset(BitOffset) {}
+public:
+  PPEntityOffset(RawLocEncoding Begin, RawLocEncoding End, uint32_t BitOffset)
+      : Begin(Begin), End(End), BitOffset(BitOffset) {}
 
-  SourceLocation getBegin() const {
-    return SourceLocation::getFromRawEncoding(Begin);
-  }
+  RawLocEncoding getBegin() const { return Begin.get(); }
+  RawLocEncoding getEnd() const { return End.get(); }
 
-  SourceLocation getEnd() const {
-    return SourceLocation::getFromRawEncoding(End);
-  }
+  uint32_t getOffset() const { return BitOffset; }
 };
 
 /// Source range of a skipped preprocessor region
-struct PPSkippedRange {
+class PPSkippedRange {
+  using RawLocEncoding = SourceLocationEncoding::RawLocEncoding;
+
   /// Raw source location of beginning of range.
-  SourceLocation::UIntTy Begin;
+  UnalignedUInt64 Begin;
   /// Raw source location of end of range.
-  SourceLocation::UIntTy End;
+  UnalignedUInt64 End;
 
-  PPSkippedRange(SourceRange R)
-      : Begin(R.getBegin().getRawEncoding()), End(R.getEnd().getRawEncoding()) {
-  }
+public:
+  PPSkippedRange(RawLocEncoding Begin, RawLocEncoding End)
+      : Begin(Begin), End(End) {}
 
-  SourceLocation getBegin() const {
-    return SourceLocation::getFromRawEncoding(Begin);
-  }
-  SourceLocation getEnd() const {
-    return SourceLocation::getFromRawEncoding(End);
-  }
+  RawLocEncoding getBegin() const { return Begin.get(); }
+  RawLocEncoding getEnd() const { return End.get(); }
 };
 
-/// Offset in the AST file. Use splitted 64-bit integer into low/high
-/// parts to keep structure alignment 32-bit (it is important because
-/// blobs in bitstream are 32-bit aligned). This structure is serialized
-/// "as is" to the AST file.
-struct UnderalignedInt64 {
-  uint32_t BitOffsetLow = 0;
-  uint32_t BitOffsetHigh = 0;
-
-  UnderalignedInt64() = default;
-  UnderalignedInt64(uint64_t BitOffset) { setBitOffset(BitOffset); }
+/// Source location and bit offset of a declaration. Keep
+/// structure alignment 32-bit since the blob is assumed as 32-bit aligned.
+class DeclOffset {
+  using RawLocEncoding = SourceLocationEncoding::RawLocEncoding;
 
-  void setBitOffset(uint64_t Offset) {
-    BitOffsetLow = Offset;
-    BitOffsetHigh = Offset >> 32;
-  }
-
-  uint64_t getBitOffset() const {
-    return BitOffsetLow | (uint64_t(BitOffsetHigh) << 32);
-  }
-};
-
-/// Source location and bit offset of a declaration.
-struct DeclOffset {
   /// Raw source location.
-  SourceLocation::UIntTy Loc = 0;
+  UnalignedUInt64 RawLoc;
 
-  /// Offset relative to the start of the DECLTYPES_BLOCK block. Keep
-  /// structure alignment 32-bit and avoid padding gap because undefined
-  /// value in the padding affects AST hash.
-  UnderalignedInt64 BitOffset;
+  /// Offset relative to the start of the DECLTYPES_BLOCK block.
+  UnalignedUInt64 BitOffset;
 
+public:
   DeclOffset() = default;
-  DeclOffset(SourceLocation Loc, uint64_t BitOffset,
-             uint64_t DeclTypesBlockStartOffset) {
-    setLocation(Loc);
+  DeclOffset(RawLocEncoding RawLoc, uint64_t BitOffset,
+             uint64_t DeclTypesBlockStartOffset)
+      : RawLoc(RawLoc) {
     setBitOffset(BitOffset, DeclTypesBlockStartOffset);
   }
 
-  void setLocation(SourceLocation L) { Loc = L.getRawEncoding(); }
+  void setRawLoc(RawLocEncoding Loc) { RawLoc = Loc; }
 
-  SourceLocation getLocation() const {
-    return SourceLocation::getFromRawEncoding(Loc);
-  }
+  RawLocEncoding getRawLoc() const { return RawLoc.get(); }
 
   void setBitOffset(uint64_t Offset, const uint64_t DeclTypesBlockStartOffset) {
-    BitOffset.setBitOffset(Offset - DeclTypesBlockStartOffset);
+    BitOffset.set(Offset - DeclTypesBlockStartOffset);
   }
 
   uint64_t getBitOffset(const uint64_t DeclTypesBlockStartOffset) const {
-    return BitOffset.getBitOffset() + DeclTypesBlockStartOffset;
+    return BitOffset.get() + DeclTypesBlockStartOffset;
   }
 };
 
@@ -1091,6 +1085,9 @@ enum PredefinedTypeIDs {
 // \brief WebAssembly reference types with auto numeration
 #define WASM_TYPE(Name, Id, SingletonId) PREDEF_TYPE_##Id##_ID,
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+
+  /// The placeholder type for unresolved templates.
+  PREDEF_TYPE_UNRESOLVED_TEMPLATE,
   // Sentinel value. Considered a predefined type but not useable as one.
   PREDEF_TYPE_LAST_ID
 };
@@ -1100,7 +1097,7 @@ enum PredefinedTypeIDs {
 ///
 /// Type IDs for non-predefined types will start at
 /// NUM_PREDEF_TYPE_IDs.
-const unsigned NUM_PREDEF_TYPE_IDS = 502;
+const unsigned NUM_PREDEF_TYPE_IDS = 503;
 
 // Ensure we do not overrun the predefined types we reserved
 // in the enum PredefinedTypeIDs above.
diff --git a/clang/include/clang/Serialization/ASTDeserializationListener.h b/clang/include/clang/Serialization/ASTDeserializationListener.h
index 3ab7f1a91843..1d81a9ae3fe2 100644
--- a/clang/include/clang/Serialization/ASTDeserializationListener.h
+++ b/clang/include/clang/Serialization/ASTDeserializationListener.h
@@ -35,7 +35,7 @@ public:
   virtual void ReaderInitialized(ASTReader *Reader) { }
 
   /// An identifier was deserialized from the AST file.
-  virtual void IdentifierRead(serialization::IdentID ID,
+  virtual void IdentifierRead(serialization::IdentifierID ID,
                               IdentifierInfo *II) { }
   /// A macro was read from the AST file.
   virtual void MacroRead(serialization::MacroID ID, MacroInfo *MI) { }
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index 64f1ebc117b3..1bb5fa27a241 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -667,7 +667,7 @@ private:
   std::vector<IdentifierInfo *> IdentifiersLoaded;
 
   using GlobalIdentifierMapType =
-      ContinuousRangeMap<serialization::IdentID, ModuleFile *, 4>;
+      ContinuousRangeMap<serialization::IdentifierID, ModuleFile *, 4>;
 
   /// Mapping from global identifier IDs to the module in which the
   /// identifier resides along with the offset that should be added to the
@@ -1771,6 +1771,7 @@ public:
 
   /// Retrieve the module manager.
   ModuleManager &getModuleManager() { return ModuleMgr; }
+  const ModuleManager &getModuleManager() const { return ModuleMgr; }
 
   /// Retrieve the preprocessor.
   Preprocessor &getPreprocessor() const { return PP; }
@@ -2177,8 +2178,8 @@ public:
 
   /// Retrieve the global submodule ID given a module and its local ID
   /// number.
-  serialization::SubmoduleID
-  getGlobalSubmoduleID(ModuleFile &M, unsigned LocalID);
+  serialization::SubmoduleID getGlobalSubmoduleID(ModuleFile &M,
+                                                  unsigned LocalID) const;
 
   /// Retrieve the submodule that corresponds to a global submodule ID.
   ///
@@ -2191,7 +2192,7 @@ public:
 
   /// Retrieve the module file with a given local ID within the specified
   /// ModuleFile.
-  ModuleFile *getLocalModuleFile(ModuleFile &M, unsigned ID);
+  ModuleFile *getLocalModuleFile(ModuleFile &M, unsigned ID) const;
 
   /// Get an ID for the given module file.
   unsigned getModuleFileID(ModuleFile *M);
@@ -2227,33 +2228,46 @@ public:
     return Sema::AlignPackInfo::getFromRawEncoding(Raw);
   }
 
+  using RawLocEncoding = SourceLocationEncoding::RawLocEncoding;
+
   /// Read a source location from raw form and return it in its
   /// originating module file's source location space.
-  SourceLocation ReadUntranslatedSourceLocation(SourceLocation::UIntTy Raw,
-                                                LocSeq *Seq = nullptr) const {
+  std::pair<SourceLocation, unsigned>
+  ReadUntranslatedSourceLocation(RawLocEncoding Raw,
+                                 LocSeq *Seq = nullptr) const {
     return SourceLocationEncoding::decode(Raw, Seq);
   }
 
   /// Read a source location from raw form.
-  SourceLocation ReadSourceLocation(ModuleFile &ModuleFile,
-                                    SourceLocation::UIntTy Raw,
+  SourceLocation ReadSourceLocation(ModuleFile &MF, RawLocEncoding Raw,
                                     LocSeq *Seq = nullptr) const {
-    SourceLocation Loc = ReadUntranslatedSourceLocation(Raw, Seq);
-    return TranslateSourceLocation(ModuleFile, Loc);
+    if (!MF.ModuleOffsetMap.empty())
+      ReadModuleOffsetMap(MF);
+
+    auto [Loc, ModuleFileIndex] = ReadUntranslatedSourceLocation(Raw, Seq);
+    ModuleFile *OwningModuleFile =
+        ModuleFileIndex == 0 ? &MF : MF.DependentModules[ModuleFileIndex - 1];
+
+    assert(!SourceMgr.isLoadedSourceLocation(Loc) &&
+           "Run out source location space");
+
+    return TranslateSourceLocation(*OwningModuleFile, Loc);
   }
 
   /// Translate a source location from another module file's source
   /// location space into ours.
   SourceLocation TranslateSourceLocation(ModuleFile &ModuleFile,
                                          SourceLocation Loc) const {
-    if (!ModuleFile.ModuleOffsetMap.empty())
-      ReadModuleOffsetMap(ModuleFile);
-    assert(ModuleFile.SLocRemap.find(Loc.getOffset()) !=
-               ModuleFile.SLocRemap.end() &&
-           "Cannot find offset to remap.");
-    SourceLocation::IntTy Remap =
-        ModuleFile.SLocRemap.find(Loc.getOffset())->second;
-    return Loc.getLocWithOffset(Remap);
+    if (Loc.isInvalid())
+      return Loc;
+
+    // FIXME: TranslateSourceLocation is not re-enterable. It is problematic
+    // to call TranslateSourceLocation on a translated source location.
+    // We either need a method to know whether or not a source location is
+    // translated or refactor the code to make it clear that
+    // TranslateSourceLocation won't be called with translated source location.
+
+    return Loc.getLocWithOffset(ModuleFile.SLocEntryBaseOffset - 2);
   }
 
   /// Read a source location.
diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h
index 1e11d2d5e42f..d00fb182f05f 100644
--- a/clang/include/clang/Serialization/ASTRecordReader.h
+++ b/clang/include/clang/Serialization/ASTRecordReader.h
@@ -272,6 +272,9 @@ public:
   /// Read a list of Exprs used for a var-list.
   llvm::SmallVector<Expr *> readOpenACCVarList();
 
+  /// Read a list of Exprs used for a int-expr-list.
+  llvm::SmallVector<Expr *> readOpenACCIntExprList();
+
   /// Read an OpenACC clause, advancing Idx.
   OpenACCClause *readOpenACCClause();
 
diff --git a/clang/include/clang/Serialization/ASTRecordWriter.h b/clang/include/clang/Serialization/ASTRecordWriter.h
index 8b1da49bd4c5..0c8ac75fc40f 100644
--- a/clang/include/clang/Serialization/ASTRecordWriter.h
+++ b/clang/include/clang/Serialization/ASTRecordWriter.h
@@ -296,6 +296,8 @@ public:
 
   void writeOpenACCVarList(const OpenACCClauseWithVarList *C);
 
+  void writeOpenACCIntExprList(ArrayRef<Expr *> Exprs);
+
   /// Writes out a single OpenACC Clause.
   void writeOpenACCClause(const OpenACCClause *C);
 
diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index a55dfd327670..7bb0e81545bd 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -274,13 +274,13 @@ private:
 
   /// Offset of each type in the bitstream, indexed by
   /// the type's ID.
-  std::vector<serialization::UnderalignedInt64> TypeOffsets;
+  std::vector<serialization::UnalignedUInt64> TypeOffsets;
 
   /// The first ID number we can use for our own identifiers.
-  serialization::IdentID FirstIdentID = serialization::NUM_PREDEF_IDENT_IDS;
+  serialization::IdentifierID FirstIdentID = serialization::NUM_PREDEF_IDENT_IDS;
 
   /// The identifier ID that will be assigned to the next new identifier.
-  serialization::IdentID NextIdentID = FirstIdentID;
+  serialization::IdentifierID NextIdentID = FirstIdentID;
 
   /// Map that provides the ID numbers of each identifier in
   /// the output stream.
@@ -288,7 +288,7 @@ private:
   /// The ID numbers for identifiers are consecutive (in order of
   /// discovery), starting at 1. An ID of zero refers to a NULL
   /// IdentifierInfo.
-  llvm::MapVector<const IdentifierInfo *, serialization::IdentID> IdentifierIDs;
+  llvm::MapVector<const IdentifierInfo *, serialization::IdentifierID> IdentifierIDs;
 
   /// The first ID number we can use for our own macros.
   serialization::MacroID FirstMacroID = serialization::NUM_PREDEF_MACRO_IDS;
@@ -357,6 +357,13 @@ private:
   /// contexts.
   llvm::DenseMap<const Decl *, unsigned> AnonymousDeclarationNumbers;
 
+  /// The external top level module during the writing process. Used to
+  /// generate signature for the module file being written.
+  ///
+  /// Only meaningful for standard C++ named modules. See the comments in
+  /// createSignatureForNamedModule() for details.
+  llvm::DenseSet<Module *> TouchedTopLevelModules;
+
   /// An update to a Decl.
   class DeclUpdate {
     /// A DeclUpdateKind.
@@ -676,6 +683,10 @@ public:
   void AddSourceLocation(SourceLocation Loc, RecordDataImpl &Record,
                          LocSeq *Seq = nullptr);
 
+  /// Return the raw encodings for source locations.
+  SourceLocationEncoding::RawLocEncoding
+  getRawSourceLocationEncoding(SourceLocation Loc, LocSeq *Seq = nullptr);
+
   /// Emit a source range.
   void AddSourceRange(SourceRange Range, RecordDataImpl &Record,
                       LocSeq *Seq = nullptr);
@@ -687,7 +698,7 @@ public:
   serialization::SelectorID getSelectorRef(Selector Sel);
 
   /// Get the unique number used to refer to the given identifier.
-  serialization::IdentID getIdentifierRef(const IdentifierInfo *II);
+  serialization::IdentifierID getIdentifierRef(const IdentifierInfo *II);
 
   /// Get the unique number used to refer to the given macro.
   serialization::MacroID getMacroRef(MacroInfo *MI, const IdentifierInfo *Name);
@@ -844,7 +855,7 @@ public:
 private:
   // ASTDeserializationListener implementation
   void ReaderInitialized(ASTReader *Reader) override;
-  void IdentifierRead(serialization::IdentID ID, IdentifierInfo *II) override;
+  void IdentifierRead(serialization::IdentifierID ID, IdentifierInfo *II) override;
   void MacroRead(serialization::MacroID ID, MacroInfo *MI) override;
   void TypeRead(serialization::TypeIdx Idx, QualType T) override;
   void SelectorRead(serialization::SelectorID ID, Selector Sel) override;
diff --git a/clang/include/clang/Serialization/ModuleFile.h b/clang/include/clang/Serialization/ModuleFile.h
index 25f644e76edb..7d8cbe3d40f5 100644
--- a/clang/include/clang/Serialization/ModuleFile.h
+++ b/clang/include/clang/Serialization/ModuleFile.h
@@ -295,10 +295,6 @@ public:
   /// AST file.
   const uint32_t *SLocEntryOffsets = nullptr;
 
-  /// Remapping table for source locations in this module.
-  ContinuousRangeMap<SourceLocation::UIntTy, SourceLocation::IntTy, 2>
-      SLocRemap;
-
   // === Identifiers ===
 
   /// The number of identifiers in this AST file.
@@ -312,7 +308,7 @@ public:
   const uint32_t *IdentifierOffsets = nullptr;
 
   /// Base identifier ID for identifiers local to this module.
-  serialization::IdentID BaseIdentifierID = 0;
+  serialization::IdentifierID BaseIdentifierID = 0;
 
   /// Remapping table for identifier IDs in this module.
   ContinuousRangeMap<uint32_t, int, 2> IdentifierRemap;
@@ -495,7 +491,7 @@ public:
 
   /// Offset of each type within the bitstream, indexed by the
   /// type ID, or the representation of a Type*.
-  const UnderalignedInt64 *TypeOffsets = nullptr;
+  const UnalignedUInt64 *TypeOffsets = nullptr;
 
   /// Base type ID for types local to this module as represented in
   /// the global type ID space.
@@ -512,9 +508,17 @@ public:
   /// List of modules which depend on this module
   llvm::SetVector<ModuleFile *> ImportedBy;
 
-  /// List of modules which this module depends on
+  /// List of modules which this module directly imported
   llvm::SetVector<ModuleFile *> Imports;
 
+  /// List of modules which this modules dependent on. Different
+  /// from `Imports`, this includes indirectly imported modules too.
+  /// The order of DependentModules is significant. It should keep
+  /// the same order with that module file manager when we write
+  /// the current module file. The value of the member will be initialized
+  /// in `ASTReader::ReadModuleOffsetMap`.
+  llvm::SmallVector<ModuleFile *, 16> DependentModules;
+
   /// Determine whether this module was directly imported at
   /// any point during translation.
   bool isDirectlyImported() const { return DirectlyImported; }
diff --git a/clang/include/clang/Serialization/SourceLocationEncoding.h b/clang/include/clang/Serialization/SourceLocationEncoding.h
index 9bb0dbe2e4d6..33ca1728fa47 100644
--- a/clang/include/clang/Serialization/SourceLocationEncoding.h
+++ b/clang/include/clang/Serialization/SourceLocationEncoding.h
@@ -6,28 +6,33 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Source locations are stored pervasively in the AST, making up a third of
-// the size of typical serialized files. Storing them efficiently is important.
+// We wish to encode the SourceLocation from other module file not dependent
+// on the other module file. So that the source location changes from other
+// module file may not affect the contents of the current module file. Then the
+// users don't need to recompile the whole project due to a new line in a module
+// unit in the root of the dependency graph.
 //
-// We use integers optimized by VBR-encoding, because:
-//  - when abbreviations cannot be used, VBR6 encoding is our only choice
-//  - in the worst case a SourceLocation can be ~any 32-bit number, but in
-//    practice they are highly predictable
+// To achieve this, we need to encode the index of the module file into the
+// encoding of the source location. The encoding of the source location may be:
 //
-// We encode the integer so that likely values encode as small numbers that
-// turn into few VBR chunks:
-//  - the invalid sentinel location is a very common value: it encodes as 0
-//  - the "macro or not" bit is stored at the bottom of the integer
-//    (rather than at the top, as in memory), so macro locations can have
-//    small representations.
-//  - related locations (e.g. of a left and right paren pair) are usually
-//    similar, so when encoding a sequence of locations we store only
-//    differences between successive elements.
+//      |-----------------------|-----------------------|
+//      |          A            |         B         | C |
+//
+//  * A: 32 bit. The index of the module file in the module manager + 1. The +1
+//  here is necessary since we wish 0 stands for the current module file.
+//  * B: 31 bit. The offset of the source location to the module file containing
+//  it.
+//  * C: The macro bit. We rotate it to the lowest bit so that we can save some
+//  space in case the index of the module file is 0.
+//
+// Specially, if the index of the module file is 0, we allow to encode a
+// sequence of locations we store only differences between successive elements.
 //
 //===----------------------------------------------------------------------===//
 
-#include <climits>
 #include "clang/Basic/SourceLocation.h"
+#include "llvm/Support/MathExtras.h"
+#include <climits>
 
 #ifndef LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H
 #define LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H
@@ -52,9 +57,13 @@ class SourceLocationEncoding {
   friend SourceLocationSequence;
 
 public:
-  static uint64_t encode(SourceLocation Loc,
-                         SourceLocationSequence * = nullptr);
-  static SourceLocation decode(uint64_t, SourceLocationSequence * = nullptr);
+  using RawLocEncoding = uint64_t;
+
+  static RawLocEncoding encode(SourceLocation Loc, UIntTy BaseOffset,
+                               unsigned BaseModuleFileIndex,
+                               SourceLocationSequence * = nullptr);
+  static std::pair<SourceLocation, unsigned>
+  decode(RawLocEncoding, SourceLocationSequence * = nullptr);
 };
 
 /// Serialized encoding of a sequence of SourceLocations.
@@ -149,14 +158,44 @@ public:
   operator SourceLocationSequence *() { return &Seq; }
 };
 
-inline uint64_t SourceLocationEncoding::encode(SourceLocation Loc,
-                                               SourceLocationSequence *Seq) {
-  return Seq ? Seq->encode(Loc) : encodeRaw(Loc.getRawEncoding());
+inline SourceLocationEncoding::RawLocEncoding
+SourceLocationEncoding::encode(SourceLocation Loc, UIntTy BaseOffset,
+                               unsigned BaseModuleFileIndex,
+                               SourceLocationSequence *Seq) {
+  // If the source location is a local source location, we can try to optimize
+  // the similar sequences to only record the differences.
+  if (!BaseOffset)
+    return Seq ? Seq->encode(Loc) : encodeRaw(Loc.getRawEncoding());
+
+  if (Loc.isInvalid())
+    return 0;
+
+  // Otherwise, the higher bits are used to store the module file index,
+  // so it is meaningless to optimize the source locations into small
+  // integers. Let's try to always use the raw encodings.
+  assert(Loc.getOffset() >= BaseOffset);
+  Loc = Loc.getLocWithOffset(-BaseOffset);
+  RawLocEncoding Encoded = encodeRaw(Loc.getRawEncoding());
+
+  // 16 bits should be sufficient to store the module file index.
+  assert(BaseModuleFileIndex < (1 << 16));
+  Encoded |= (RawLocEncoding)BaseModuleFileIndex << 32;
+  return Encoded;
 }
-inline SourceLocation
-SourceLocationEncoding::decode(uint64_t Encoded, SourceLocationSequence *Seq) {
-  return Seq ? Seq->decode(Encoded)
-             : SourceLocation::getFromRawEncoding(decodeRaw(Encoded));
+inline std::pair<SourceLocation, unsigned>
+SourceLocationEncoding::decode(RawLocEncoding Encoded,
+                               SourceLocationSequence *Seq) {
+  unsigned ModuleFileIndex = Encoded >> 32;
+
+  if (!ModuleFileIndex)
+    return {Seq ? Seq->decode(Encoded)
+                : SourceLocation::getFromRawEncoding(decodeRaw(Encoded)),
+            ModuleFileIndex};
+
+  Encoded &= llvm::maskTrailingOnes<RawLocEncoding>(32);
+  SourceLocation Loc = SourceLocation::getFromRawEncoding(decodeRaw(Encoded));
+
+  return {Loc, ModuleFileIndex};
 }
 
 } // namespace clang
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 520286b57c9f..64414e3d37f7 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -1397,7 +1397,7 @@ def CastValueChecker : Checker<"CastValue">,
   Documentation<NotDocumented>;
 
 def ReturnValueChecker : Checker<"ReturnValue">,
-  HelpText<"Model the guaranteed boolean return value of function calls">,
+  HelpText<"Model certain Error() methods that always return true by convention">,
   Documentation<NotDocumented>;
 
 } // end "apiModeling.llvm"
diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
index 2fc825c2af9c..f008c9c581d9 100644
--- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
+++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
@@ -414,22 +414,6 @@ ANALYZER_OPTION(
     "serves as an upper bound instead.", 10000)
 
 ANALYZER_OPTION(
-    StringRef, CTUPhase1InliningMode, "ctu-phase1-inlining",
-    "Controls which functions will be inlined during the first phase of the ctu "
-    "analysis. "
-    "If the value is set to 'all' then all foreign functions are inlinied "
-    "immediately during the first phase, thus rendering the second phase a noop. "
-    "The 'ctu-max-nodes-*' budge has no effect in this case. "
-    "If the value is 'small' then only functions with a linear CFG and with a "
-    "limited number of statements would be inlined during the first phase. The "
-    "long and/or nontrivial functions are handled in the second phase and are "
-    "controlled by the 'ctu-max-nodes-*' budge. "
-    "The value 'none' means that all foreign functions are inlined only in the "
-    "second phase, 'ctu-max-nodes-*' budge limits the second phase. "
-    "Value: \"none\", \"small\", \"all\".",
-    "small")
-
-ANALYZER_OPTION(
     unsigned, RegionStoreSmallStructLimit, "region-store-small-struct-limit",
     "The largest number of fields a struct can have and still be considered "
     "small. This is currently used to decide whether or not it is worth forcing "
@@ -479,6 +463,22 @@ ANALYZER_OPTION(
     "")
 
 ANALYZER_OPTION(
+    StringRef, CTUPhase1InliningMode, "ctu-phase1-inlining",
+    "Controls which functions will be inlined during the first phase of the ctu "
+    "analysis. "
+    "If the value is set to 'all' then all foreign functions are inlinied "
+    "immediately during the first phase, thus rendering the second phase a noop. "
+    "The 'ctu-max-nodes-*' budge has no effect in this case. "
+    "If the value is 'small' then only functions with a linear CFG and with a "
+    "limited number of statements would be inlined during the first phase. The "
+    "long and/or nontrivial functions are handled in the second phase and are "
+    "controlled by the 'ctu-max-nodes-*' budge. "
+    "The value 'none' means that all foreign functions are inlined only in the "
+    "second phase, 'ctu-max-nodes-*' budge limits the second phase. "
+    "Value: \"none\", \"small\", \"all\".",
+    "small")
+
+ANALYZER_OPTION(
     StringRef, CXXMemberInliningMode, "c++-inlining",
     "Controls which C++ member functions will be considered for inlining. "
     "Value: \"constructors\", \"destructors\", \"methods\".",
diff --git a/clang/lib/ARCMigrate/ObjCMT.cpp b/clang/lib/ARCMigrate/ObjCMT.cpp
index b9dcfb8951b3..aaf41dc4039c 100644
--- a/clang/lib/ARCMigrate/ObjCMT.cpp
+++ b/clang/lib/ARCMigrate/ObjCMT.cpp
@@ -484,7 +484,7 @@ static void rewriteToObjCProperty(const ObjCMethodDecl *Getter,
 
   // Short circuit 'delegate' properties that contain the name "delegate" or
   // "dataSource", or have exact name "target" to have 'assign' attribute.
-  if (PropertyName.equals("target") || PropertyName.contains("delegate") ||
+  if (PropertyName == "target" || PropertyName.contains("delegate") ||
       PropertyName.contains("dataSource")) {
     QualType QT = Getter->getReturnType();
     if (!QT->isRealType())
diff --git a/clang/lib/AST/ASTConcept.cpp b/clang/lib/AST/ASTConcept.cpp
index b3ec99448b3e..0387fc9f6aec 100644
--- a/clang/lib/AST/ASTConcept.cpp
+++ b/clang/lib/AST/ASTConcept.cpp
@@ -15,6 +15,7 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/PrettyPrinter.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringExtras.h"
 
 using namespace clang;
 
@@ -106,9 +107,12 @@ void ConceptReference::print(llvm::raw_ostream &OS,
   ConceptName.printName(OS, Policy);
   if (hasExplicitTemplateArgs()) {
     OS << "<";
+    llvm::ListSeparator Sep(", ");
     // FIXME: Find corresponding parameter for argument
-    for (auto &ArgLoc : ArgsAsWritten->arguments())
+    for (auto &ArgLoc : ArgsAsWritten->arguments()) {
+      OS << Sep;
       ArgLoc.getArgument().print(Policy, OS, /*IncludeType*/ false);
+    }
     OS << ">";
   }
 }
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index cbf4932aff9a..4475f399a120 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1307,6 +1307,9 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target,
   // Placeholder type for bound members.
   InitBuiltinType(BoundMemberTy,       BuiltinType::BoundMember);
 
+  // Placeholder type for unresolved templates.
+  InitBuiltinType(UnresolvedTemplateTy, BuiltinType::UnresolvedTemplate);
+
   // Placeholder type for pseudo-objects.
   InitBuiltinType(PseudoObjectTy,      BuiltinType::PseudoObject);
 
@@ -1612,15 +1615,7 @@ const llvm::fltSemantics &ASTContext::getFloatTypeSemantics(QualType T) const {
   case BuiltinType::Float16:
     return Target->getHalfFormat();
   case BuiltinType::Half:
-    // For HLSL, when the native half type is disabled, half will be treat as
-    // float.
-    if (getLangOpts().HLSL)
-      if (getLangOpts().NativeHalfType)
-        return Target->getHalfFormat();
-      else
-        return Target->getFloatFormat();
-    else
-      return Target->getHalfFormat();
+    return Target->getHalfFormat();
   case BuiltinType::Float:      return Target->getFloatFormat();
   case BuiltinType::Double:     return Target->getDoubleFormat();
   case BuiltinType::Ibm128:
@@ -2263,9 +2258,8 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
   }
   case Type::BitInt: {
     const auto *EIT = cast<BitIntType>(T);
-    Align = std::clamp<unsigned>(llvm::PowerOf2Ceil(EIT->getNumBits()),
-                                 getCharWidth(), Target->getLongLongAlign());
-    Width = llvm::alignTo(EIT->getNumBits(), Align);
+    Align = Target->getBitIntAlign(EIT->getNumBits());
+    Width = Target->getBitIntWidth(EIT->getNumBits());
     break;
   }
   case Type::Record:
@@ -3802,33 +3796,33 @@ QualType ASTContext::getDependentSizedArrayType(QualType elementType,
           numElements->isValueDependent()) &&
          "Size must be type- or value-dependent!");
 
+  SplitQualType canonElementType = getCanonicalType(elementType).split();
+
+  void *insertPos = nullptr;
+  llvm::FoldingSetNodeID ID;
+  DependentSizedArrayType::Profile(
+      ID, *this, numElements ? QualType(canonElementType.Ty, 0) : elementType,
+      ASM, elementTypeQuals, numElements);
+
+  // Look for an existing type with these properties.
+  DependentSizedArrayType *canonTy =
+    DependentSizedArrayTypes.FindNodeOrInsertPos(ID, insertPos);
+
   // Dependently-sized array types that do not have a specified number
   // of elements will have their sizes deduced from a dependent
-  // initializer.  We do no canonicalization here at all, which is okay
-  // because they can't be used in most locations.
+  // initializer.
   if (!numElements) {
+    if (canonTy)
+      return QualType(canonTy, 0);
+
     auto *newType = new (*this, alignof(DependentSizedArrayType))
         DependentSizedArrayType(elementType, QualType(), numElements, ASM,
                                 elementTypeQuals, brackets);
+    DependentSizedArrayTypes.InsertNode(newType, insertPos);
     Types.push_back(newType);
     return QualType(newType, 0);
   }
 
-  // Otherwise, we actually build a new type every time, but we
-  // also build a canonical type.
-
-  SplitQualType canonElementType = getCanonicalType(elementType).split();
-
-  void *insertPos = nullptr;
-  llvm::FoldingSetNodeID ID;
-  DependentSizedArrayType::Profile(ID, *this,
-                                   QualType(canonElementType.Ty, 0),
-                                   ASM, elementTypeQuals, numElements);
-
-  // Look for an existing type with these properties.
-  DependentSizedArrayType *canonTy =
-    DependentSizedArrayTypes.FindNodeOrInsertPos(ID, insertPos);
-
   // If we don't have one, build one.
   if (!canonTy) {
     canonTy = new (*this, alignof(DependentSizedArrayType))
@@ -9554,11 +9548,6 @@ static uint64_t getSVETypeSize(ASTContext &Context, const BuiltinType *Ty) {
 
 bool ASTContext::areCompatibleSveTypes(QualType FirstType,
                                        QualType SecondType) {
-  assert(
-      ((FirstType->isSVESizelessBuiltinType() && SecondType->isVectorType()) ||
-       (FirstType->isVectorType() && SecondType->isSVESizelessBuiltinType())) &&
-      "Expected SVE builtin type and vector type!");
-
   auto IsValidCast = [this](QualType FirstType, QualType SecondType) {
     if (const auto *BT = FirstType->getAs<BuiltinType>()) {
       if (const auto *VT = SecondType->getAs<VectorType>()) {
@@ -9584,11 +9573,6 @@ bool ASTContext::areCompatibleSveTypes(QualType FirstType,
 
 bool ASTContext::areLaxCompatibleSveTypes(QualType FirstType,
                                           QualType SecondType) {
-  assert(
-      ((FirstType->isSVESizelessBuiltinType() && SecondType->isVectorType()) ||
-       (FirstType->isVectorType() && SecondType->isSVESizelessBuiltinType())) &&
-      "Expected SVE builtin type and vector type!");
-
   auto IsLaxCompatible = [this](QualType FirstType, QualType SecondType) {
     const auto *BT = FirstType->getAs<BuiltinType>();
     if (!BT)
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index e7e95c16b697..ec851c9371e1 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -5758,3 +5758,18 @@ ExportDecl *ExportDecl::Create(ASTContext &C, DeclContext *DC,
 ExportDecl *ExportDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) ExportDecl(nullptr, SourceLocation());
 }
+
+bool clang::IsArmStreamingFunction(const FunctionDecl *FD,
+                                   bool IncludeLocallyStreaming) {
+  if (IncludeLocallyStreaming)
+    if (FD->hasAttr<ArmLocallyStreamingAttr>())
+      return true;
+
+  if (const Type *Ty = FD->getType().getTypePtrOrNull())
+    if (const auto *FPT = Ty->getAs<FunctionProtoType>())
+      if (FPT->getAArch64SMEAttributes() &
+          FunctionType::SME_PStateSMEnabledMask)
+        return true;
+
+  return false;
+}
diff --git a/clang/lib/AST/Interp/ByteCodeEmitter.cpp b/clang/lib/AST/Interp/ByteCodeEmitter.cpp
index d912c101449d..918cd66c9a97 100644
--- a/clang/lib/AST/Interp/ByteCodeEmitter.cpp
+++ b/clang/lib/AST/Interp/ByteCodeEmitter.cpp
@@ -82,11 +82,13 @@ Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) {
   // InterpStack when calling the function.
   bool HasThisPointer = false;
   if (const auto *MD = dyn_cast<CXXMethodDecl>(FuncDecl)) {
-    if (MD->isImplicitObjectMemberFunction() && !IsLambdaStaticInvoker) {
-      HasThisPointer = true;
-      ParamTypes.push_back(PT_Ptr);
-      ParamOffsets.push_back(ParamOffset);
-      ParamOffset += align(primSize(PT_Ptr));
+    if (!IsLambdaStaticInvoker) {
+      HasThisPointer = MD->isInstance();
+      if (MD->isImplicitObjectMemberFunction()) {
+        ParamTypes.push_back(PT_Ptr);
+        ParamOffsets.push_back(ParamOffset);
+        ParamOffset += align(primSize(PT_Ptr));
+      }
     }
 
     // Set up lambda capture to closure record field mapping.
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index f1a51e81a92c..7b10482dff23 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -29,15 +29,11 @@ namespace interp {
 template <class Emitter> class DeclScope final : public VariableScope<Emitter> {
 public:
   DeclScope(ByteCodeExprGen<Emitter> *Ctx, const ValueDecl *VD)
-      : VariableScope<Emitter>(Ctx), Scope(Ctx->P, VD),
+      : VariableScope<Emitter>(Ctx, nullptr), Scope(Ctx->P, VD),
         OldGlobalDecl(Ctx->GlobalDecl) {
     Ctx->GlobalDecl = Context::shouldBeGloballyIndexed(VD);
   }
 
-  void addExtended(const Scope::Local &Local) override {
-    return this->addLocal(Local);
-  }
-
   ~DeclScope() { this->Ctx->GlobalDecl = OldGlobalDecl; }
 
 private:
@@ -85,8 +81,7 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
     std::optional<PrimType> SubExprT = classify(SubExpr->getType());
     // Prepare storage for the result.
     if (!Initializing && !SubExprT) {
-      std::optional<unsigned> LocalIndex =
-          allocateLocal(SubExpr, /*IsExtended=*/false);
+      std::optional<unsigned> LocalIndex = allocateLocal(SubExpr);
       if (!LocalIndex)
         return false;
       if (!this->emitGetPtrLocal(*LocalIndex, CE))
@@ -362,8 +357,7 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
     // We're creating a complex value here, so we need to
     // allocate storage for it.
     if (!Initializing) {
-      std::optional<unsigned> LocalIndex =
-          allocateLocal(CE, /*IsExtended=*/true);
+      std::optional<unsigned> LocalIndex = allocateLocal(CE);
       if (!LocalIndex)
         return false;
       if (!this->emitGetPtrLocal(*LocalIndex, CE))
@@ -390,8 +384,7 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
       return this->discard(SubExpr);
 
     if (!Initializing) {
-      std::optional<unsigned> LocalIndex =
-          allocateLocal(CE, /*IsExtended=*/true);
+      std::optional<unsigned> LocalIndex = allocateLocal(CE);
       if (!LocalIndex)
         return false;
       if (!this->emitGetPtrLocal(*LocalIndex, CE))
@@ -492,7 +485,7 @@ bool ByteCodeExprGen<Emitter>::VisitImaginaryLiteral(
     return true;
 
   if (!Initializing) {
-    std::optional<unsigned> LocalIndex = allocateLocal(E, /*IsExtended=*/false);
+    std::optional<unsigned> LocalIndex = allocateLocal(E);
     if (!LocalIndex)
       return false;
     if (!this->emitGetPtrLocal(*LocalIndex, E))
@@ -561,7 +554,7 @@ bool ByteCodeExprGen<Emitter>::VisitBinaryOperator(const BinaryOperator *BO) {
 
     // We need a temporary variable holding our return value.
     if (!Initializing) {
-      std::optional<unsigned> ResultIndex = this->allocateLocal(BO, false);
+      std::optional<unsigned> ResultIndex = this->allocateLocal(BO);
       if (!this->emitGetPtrLocal(*ResultIndex, BO))
         return false;
     }
@@ -784,7 +777,7 @@ template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitComplexBinOp(const BinaryOperator *E) {
   // Prepare storage for result.
   if (!Initializing) {
-    std::optional<unsigned> LocalIndex = allocateLocal(E, /*IsExtended=*/false);
+    std::optional<unsigned> LocalIndex = allocateLocal(E);
     if (!LocalIndex)
       return false;
     if (!this->emitGetPtrLocal(*LocalIndex, E))
@@ -1841,11 +1834,12 @@ bool ByteCodeExprGen<Emitter>::VisitCompoundAssignOperator(
 template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitExprWithCleanups(
     const ExprWithCleanups *E) {
+  ExprScope<Emitter> ES(this);
   const Expr *SubExpr = E->getSubExpr();
 
   assert(E->getNumObjects() == 0 && "TODO: Implement cleanups");
 
-  return this->delegate(SubExpr);
+  return this->delegate(SubExpr) && ES.destroyLocals();
 }
 
 template <class Emitter>
@@ -1910,9 +1904,8 @@ bool ByteCodeExprGen<Emitter>::VisitMaterializeTemporaryExpr(
     return this->emitGetPtrLocal(LocalIndex, E);
   } else {
     const Expr *Inner = E->getSubExpr()->skipRValueSubobjectAdjustments();
-
     if (std::optional<unsigned> LocalIndex =
-            allocateLocal(Inner, /*IsExtended=*/true)) {
+            allocateLocal(Inner, E->getExtendingDecl())) {
       if (!this->emitGetPtrLocal(*LocalIndex, E))
         return false;
       return this->visitInitializer(SubExpr);
@@ -2032,7 +2025,7 @@ bool ByteCodeExprGen<Emitter>::VisitLambdaExpr(const LambdaExpr *E) {
       if (!this->visit(Init))
         return false;
 
-      if (!this->emitSetField(*T, F.Offset, E))
+      if (!this->emitInitField(*T, F.Offset, E))
         return false;
     } else {
       if (!this->emitDupPtr(E))
@@ -2095,10 +2088,16 @@ bool ByteCodeExprGen<Emitter>::VisitCXXConstructExpr(
   if (T->isRecordType()) {
     const CXXConstructorDecl *Ctor = E->getConstructor();
 
-    // Trivial zero initialization.
-    if (E->requiresZeroInitialization() && Ctor->isTrivial()) {
+    // Zero initialization.
+    if (E->requiresZeroInitialization()) {
       const Record *R = getRecord(E->getType());
-      return this->visitZeroRecordInitializer(R, E);
+
+      if (!this->visitZeroRecordInitializer(R, E))
+        return false;
+
+      // If the constructor is trivial anyway, we're done.
+      if (Ctor->isTrivial())
+        return true;
     }
 
     const Function *Func = getFunction(Ctor);
@@ -2113,8 +2112,7 @@ bool ByteCodeExprGen<Emitter>::VisitCXXConstructExpr(
     // to allocate a variable and call the constructor and destructor.
     if (DiscardResult) {
       assert(!Initializing);
-      std::optional<unsigned> LocalIndex =
-          allocateLocal(E, /*IsExtended=*/true);
+      std::optional<unsigned> LocalIndex = allocateLocal(E);
 
       if (!LocalIndex)
         return false;
@@ -2161,7 +2159,9 @@ bool ByteCodeExprGen<Emitter>::VisitCXXConstructExpr(
   if (T->isArrayType()) {
     const ConstantArrayType *CAT =
         Ctx.getASTContext().getAsConstantArrayType(E->getType());
-    assert(CAT);
+    if (!CAT)
+      return false;
+
     size_t NumElems = CAT->getZExtSize();
     const Function *Func = getFunction(E->getConstructor());
     if (!Func || !Func->isConstexpr())
@@ -2294,8 +2294,7 @@ bool ByteCodeExprGen<Emitter>::VisitCXXScalarValueInitExpr(
 
   if (const auto *CT = Ty->getAs<ComplexType>()) {
     if (!Initializing) {
-      std::optional<unsigned> LocalIndex =
-          allocateLocal(E, /*IsExtended=*/false);
+      std::optional<unsigned> LocalIndex = allocateLocal(E);
       if (!LocalIndex)
         return false;
       if (!this->emitGetPtrLocal(*LocalIndex, E))
@@ -2318,8 +2317,7 @@ bool ByteCodeExprGen<Emitter>::VisitCXXScalarValueInitExpr(
   if (const auto *VT = Ty->getAs<VectorType>()) {
     // FIXME: Code duplication with the _Complex case above.
     if (!Initializing) {
-      std::optional<unsigned> LocalIndex =
-          allocateLocal(E, /*IsExtended=*/false);
+      std::optional<unsigned> LocalIndex = allocateLocal(E);
       if (!LocalIndex)
         return false;
       if (!this->emitGetPtrLocal(*LocalIndex, E))
@@ -2434,6 +2432,8 @@ bool ByteCodeExprGen<Emitter>::VisitCXXUuidofExpr(const CXXUuidofExpr *E) {
 template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitRequiresExpr(const RequiresExpr *E) {
   assert(classifyPrim(E->getType()) == PT_Bool);
+  if (DiscardResult)
+    return true;
   return this->emitConstBool(E->isSatisfied(), E);
 }
 
@@ -2441,6 +2441,8 @@ template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitConceptSpecializationExpr(
     const ConceptSpecializationExpr *E) {
   assert(classifyPrim(E->getType()) == PT_Bool);
+  if (DiscardResult)
+    return true;
   return this->emitConstBool(E->isSatisfied(), E);
 }
 
@@ -2481,10 +2483,12 @@ bool ByteCodeExprGen<Emitter>::VisitPackIndexingExpr(
   return this->delegate(E->getSelectedExpr());
 }
 
-template <class Emitter> bool ByteCodeExprGen<Emitter>::discard(const Expr *E) {
-  if (E->containsErrors())
-    return false;
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitRecoveryExpr(const RecoveryExpr *E) {
+  return this->emitError(E);
+}
 
+template <class Emitter> bool ByteCodeExprGen<Emitter>::discard(const Expr *E) {
   OptionScope<Emitter> Scope(this, /*NewDiscardResult=*/true,
                              /*NewInitializing=*/false);
   return this->Visit(E);
@@ -2492,9 +2496,6 @@ template <class Emitter> bool ByteCodeExprGen<Emitter>::discard(const Expr *E) {
 
 template <class Emitter>
 bool ByteCodeExprGen<Emitter>::delegate(const Expr *E) {
-  if (E->containsErrors())
-    return this->emitError(E);
-
   // We're basically doing:
   // OptionScope<Emitter> Scope(this, DicardResult, Initializing);
   // but that's unnecessary of course.
@@ -2502,16 +2503,13 @@ bool ByteCodeExprGen<Emitter>::delegate(const Expr *E) {
 }
 
 template <class Emitter> bool ByteCodeExprGen<Emitter>::visit(const Expr *E) {
-  if (E->containsErrors())
-    return this->emitError(E);
-
   if (E->getType()->isVoidType())
     return this->discard(E);
 
   // Create local variable to hold the return value.
   if (!E->isGLValue() && !E->getType()->isAnyComplexType() &&
       !classify(E->getType())) {
-    std::optional<unsigned> LocalIndex = allocateLocal(E, /*IsExtended=*/true);
+    std::optional<unsigned> LocalIndex = allocateLocal(E);
     if (!LocalIndex)
       return false;
 
@@ -2765,7 +2763,8 @@ unsigned ByteCodeExprGen<Emitter>::allocateLocalPrimitive(DeclTy &&Src,
 
 template <class Emitter>
 std::optional<unsigned>
-ByteCodeExprGen<Emitter>::allocateLocal(DeclTy &&Src, bool IsExtended) {
+ByteCodeExprGen<Emitter>::allocateLocal(DeclTy &&Src,
+                                        const ValueDecl *ExtendingDecl) {
   // Make sure we don't accidentally register the same decl twice.
   if ([[maybe_unused]]  const auto *VD =
           dyn_cast_if_present<ValueDecl>(Src.dyn_cast<const Decl *>())) {
@@ -2798,7 +2797,10 @@ ByteCodeExprGen<Emitter>::allocateLocal(DeclTy &&Src, bool IsExtended) {
   Scope::Local Local = this->createLocal(D);
   if (Key)
     Locals.insert({Key, Local});
-  VarScope->add(Local, IsExtended);
+  if (ExtendingDecl)
+    VarScope->addExtended(Local, ExtendingDecl);
+  else
+    VarScope->add(Local, false);
   return Local.Offset;
 }
 
@@ -2833,14 +2835,14 @@ bool ByteCodeExprGen<Emitter>::visitExpr(const Expr *E) {
   if (E->getType()->isVoidType()) {
     if (!visit(E))
       return false;
-    return this->emitRetVoid(E);
+    return this->emitRetVoid(E) && RootScope.destroyLocals();
   }
 
   // Expressions with a primitive return type.
   if (std::optional<PrimType> T = classify(E)) {
     if (!visit(E))
       return false;
-    return this->emitRet(*T, E);
+    return this->emitRet(*T, E) && RootScope.destroyLocals();
   }
 
   // Expressions with a composite return type.
@@ -2861,6 +2863,7 @@ bool ByteCodeExprGen<Emitter>::visitExpr(const Expr *E) {
     return this->emitRetValue(E) && RootScope.destroyLocals();
   }
 
+  RootScope.destroyLocals();
   return false;
 }
 
@@ -2952,7 +2955,7 @@ bool ByteCodeExprGen<Emitter>::visitVarDecl(const VarDecl *VD) {
 
     return !Init || initGlobal(*GlobalIndex);
   } else {
-    VariableScope<Emitter> LocalScope(this);
+    VariableScope<Emitter> LocalScope(this, VD);
     if (VarT) {
       unsigned Offset = this->allocateLocalPrimitive(
           VD, *VarT, VD->getType().isConstQualified());
@@ -2969,6 +2972,7 @@ bool ByteCodeExprGen<Emitter>::visitVarDecl(const VarDecl *VD) {
         return !Init || this->visitLocalInitializer(Init, *Offset);
       return false;
     }
+
     return true;
   }
 
@@ -3050,7 +3054,7 @@ bool ByteCodeExprGen<Emitter>::VisitBuiltinCallExpr(const CallExpr *E) {
 
   // Non-primitive return type. Prepare storage.
   if (!Initializing && !ReturnT && !ReturnType->isVoidType()) {
-    std::optional<unsigned> LocalIndex = allocateLocal(E, /*IsExtended=*/false);
+    std::optional<unsigned> LocalIndex = allocateLocal(E);
     if (!LocalIndex)
       return false;
     if (!this->emitGetPtrLocal(*LocalIndex, E))
@@ -3464,8 +3468,7 @@ bool ByteCodeExprGen<Emitter>::VisitComplexUnaryOperator(
   std::optional<PrimType> ResT = classify(E);
   auto prepareResult = [=]() -> bool {
     if (!ResT && !Initializing) {
-      std::optional<unsigned> LocalIndex =
-          allocateLocal(SubExpr, /*IsExtended=*/false);
+      std::optional<unsigned> LocalIndex = allocateLocal(SubExpr);
       if (!LocalIndex)
         return false;
       return this->emitGetPtrLocal(*LocalIndex, E);
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h
index a89e37c67aa6..9f83d173bbae 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.h
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.h
@@ -121,6 +121,7 @@ public:
   bool VisitCXXRewrittenBinaryOperator(const CXXRewrittenBinaryOperator *E);
   bool VisitPseudoObjectExpr(const PseudoObjectExpr *E);
   bool VisitPackIndexingExpr(const PackIndexingExpr *E);
+  bool VisitRecoveryExpr(const RecoveryExpr *E);
 
 protected:
   bool visitExpr(const Expr *E) override;
@@ -234,7 +235,8 @@ protected:
                                   bool IsExtended = false);
 
   /// Allocates a space storing a local given its type.
-  std::optional<unsigned> allocateLocal(DeclTy &&Decl, bool IsExtended = false);
+  std::optional<unsigned>
+  allocateLocal(DeclTy &&Decl, const ValueDecl *ExtendingDecl = nullptr);
 
 private:
   friend class VariableScope<Emitter>;
@@ -321,8 +323,8 @@ extern template class ByteCodeExprGen<EvalEmitter>;
 /// Scope chain managing the variable lifetimes.
 template <class Emitter> class VariableScope {
 public:
-  VariableScope(ByteCodeExprGen<Emitter> *Ctx)
-      : Ctx(Ctx), Parent(Ctx->VarScope) {
+  VariableScope(ByteCodeExprGen<Emitter> *Ctx, const ValueDecl *VD)
+      : Ctx(Ctx), Parent(Ctx->VarScope), ValDecl(VD) {
     Ctx->VarScope = this;
   }
 
@@ -345,6 +347,24 @@ public:
       this->Parent->addExtended(Local);
   }
 
+  void addExtended(const Scope::Local &Local, const ValueDecl *ExtendingDecl) {
+    // Walk up the chain of scopes until we find the one for ExtendingDecl.
+    // If there is no such scope, attach it to the parent one.
+    VariableScope *P = this;
+    while (P) {
+      if (P->ValDecl == ExtendingDecl) {
+        P->addLocal(Local);
+        return;
+      }
+      P = P->Parent;
+      if (!P)
+        break;
+    }
+
+    // Use the parent scope.
+    addExtended(Local);
+  }
+
   virtual void emitDestruction() {}
   virtual bool emitDestructors() { return true; }
   VariableScope *getParent() const { return Parent; }
@@ -354,12 +374,14 @@ protected:
   ByteCodeExprGen<Emitter> *Ctx;
   /// Link to the parent scope.
   VariableScope *Parent;
+  const ValueDecl *ValDecl = nullptr;
 };
 
 /// Generic scope for local variables.
 template <class Emitter> class LocalScope : public VariableScope<Emitter> {
 public:
-  LocalScope(ByteCodeExprGen<Emitter> *Ctx) : VariableScope<Emitter>(Ctx) {}
+  LocalScope(ByteCodeExprGen<Emitter> *Ctx)
+      : VariableScope<Emitter>(Ctx, nullptr) {}
 
   /// Emit a Destroy op for this scope.
   ~LocalScope() override {
@@ -473,16 +495,9 @@ public:
   }
 };
 
-/// Expression scope which tracks potentially lifetime extended
-/// temporaries which are hoisted to the parent scope on exit.
 template <class Emitter> class ExprScope final : public AutoScope<Emitter> {
 public:
   ExprScope(ByteCodeExprGen<Emitter> *Ctx) : AutoScope<Emitter>(Ctx) {}
-
-  void addExtended(const Scope::Local &Local) override {
-    if (this->Parent)
-      this->Parent->addLocal(Local);
-  }
 };
 
 template <class Emitter> class ArrayIndexScope final {
diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp
index 954c58c8cb37..d0466902247b 100644
--- a/clang/lib/AST/Interp/Descriptor.cpp
+++ b/clang/lib/AST/Interp/Descriptor.cpp
@@ -347,14 +347,6 @@ Descriptor::Descriptor(const DeclTy &D)
   assert(Source && "Missing source");
 }
 
-/// Dummy array.
-Descriptor::Descriptor(const DeclTy &D, UnknownSize)
-    : Source(D), ElemSize(1), Size(UnknownSizeMark), MDSize(0),
-      AllocSize(MDSize), ElemRecord(nullptr), IsConst(true), IsMutable(false),
-      IsTemporary(false), IsArray(true), IsDummy(true) {
-  assert(Source && "Missing source");
-}
-
 QualType Descriptor::getType() const {
   if (auto *E = asExpr())
     return E->getType();
diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h
index cd20495c259c..fcb14e76e7eb 100644
--- a/clang/lib/AST/Interp/Descriptor.h
+++ b/clang/lib/AST/Interp/Descriptor.h
@@ -128,7 +128,7 @@ public:
   /// Flag indicating if the block is an array.
   const bool IsArray = false;
   /// Flag indicating if this is a dummy descriptor.
-  const bool IsDummy = false;
+  bool IsDummy = false;
 
   /// Storage management methods.
   const BlockCtorFn CtorFn = nullptr;
@@ -162,8 +162,8 @@ public:
   /// Allocates a dummy descriptor.
   Descriptor(const DeclTy &D);
 
-  /// Allocates a dummy array descriptor.
-  Descriptor(const DeclTy &D, UnknownSize);
+  /// Make this descriptor a dummy descriptor.
+  void makeDummy() { IsDummy = true; }
 
   QualType getType() const;
   QualType getElemQualType() const;
diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp
index d764b4b6f6d1..388c3612f292 100644
--- a/clang/lib/AST/Interp/EvalEmitter.cpp
+++ b/clang/lib/AST/Interp/EvalEmitter.cpp
@@ -34,6 +34,7 @@ EvalEmitter::~EvalEmitter() {
 
 EvaluationResult EvalEmitter::interpretExpr(const Expr *E,
                                             bool ConvertResultToRValue) {
+  S.setEvalLocation(E->getExprLoc());
   this->ConvertResultToRValue = ConvertResultToRValue;
   EvalResult.setSource(E);
 
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 66d30cc3fbaa..a0bf87430012 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -823,9 +823,9 @@ inline bool CmpHelperEQ<Pointer>(InterpState &S, CodePtr OpPC, CompareFn Fn) {
     // element in the same array are NOT equal. They have the same Base value,
     // but a different Offset. This is a pretty rare case, so we fix this here
     // by comparing pointers to the first elements.
-    if (!LHS.isZero() && !LHS.isDummy() && LHS.isArrayRoot())
+    if (!LHS.isZero() && LHS.isArrayRoot())
       VL = LHS.atIndex(0).getByteOffset();
-    if (!RHS.isZero() && !RHS.isDummy() && RHS.isArrayRoot())
+    if (!RHS.isZero() && RHS.isArrayRoot())
       VR = RHS.atIndex(0).getByteOffset();
 
     S.Stk.push<BoolT>(BoolT::from(Fn(Compare(VL, VR))));
@@ -1241,14 +1241,16 @@ inline bool GetPtrField(InterpState &S, CodePtr OpPC, uint32_t Off) {
       !CheckNull(S, OpPC, Ptr, CSK_Field))
     return false;
 
-  if (CheckDummy(S, OpPC, Ptr)) {
-    if (!CheckExtern(S, OpPC, Ptr))
-      return false;
-    if (!CheckRange(S, OpPC, Ptr, CSK_Field))
-      return false;
-    if (!CheckSubobject(S, OpPC, Ptr, CSK_Field))
-      return false;
-  }
+  if (!CheckExtern(S, OpPC, Ptr))
+    return false;
+  if (!CheckRange(S, OpPC, Ptr, CSK_Field))
+    return false;
+  if (!CheckSubobject(S, OpPC, Ptr, CSK_Field))
+    return false;
+
+  if (Ptr.isBlockPointer() && Off > Ptr.block()->getSize())
+    return false;
+
   S.Stk.push<Pointer>(Ptr.atField(Off));
   return true;
 }
@@ -2034,11 +2036,6 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) {
   if (!Ptr.isZero()) {
     if (!CheckArray(S, OpPC, Ptr))
       return false;
-
-    if (Ptr.isDummy()) {
-      S.Stk.push<Pointer>(Ptr);
-      return true;
-    }
   }
 
   if (!OffsetHelper<T, ArithOp::Add>(S, OpPC, Offset, Ptr))
@@ -2055,11 +2052,6 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) {
   if (!Ptr.isZero()) {
     if (!CheckArray(S, OpPC, Ptr))
       return false;
-
-    if (Ptr.isDummy()) {
-      S.Stk.push<Pointer>(Ptr);
-      return true;
-    }
   }
 
   if (!OffsetHelper<T, ArithOp::Add>(S, OpPC, Offset, Ptr))
@@ -2113,12 +2105,12 @@ inline bool CopyArray(InterpState &S, CodePtr OpPC, uint32_t SrcIndex, uint32_t
 inline bool ArrayDecay(InterpState &S, CodePtr OpPC) {
   const Pointer &Ptr = S.Stk.pop<Pointer>();
 
-  if (Ptr.isZero() || Ptr.isDummy()) {
+  if (Ptr.isZero()) {
     S.Stk.push<Pointer>(Ptr);
     return true;
   }
 
-  if (!Ptr.isUnknownSizeArray()) {
+  if (!Ptr.isUnknownSizeArray() || Ptr.isDummy()) {
     S.Stk.push<Pointer>(Ptr.atIndex(0));
     return true;
   }
diff --git a/clang/lib/AST/Interp/InterpFrame.cpp b/clang/lib/AST/Interp/InterpFrame.cpp
index ba957546473e..51b0bd5c1551 100644
--- a/clang/lib/AST/Interp/InterpFrame.cpp
+++ b/clang/lib/AST/Interp/InterpFrame.cpp
@@ -191,8 +191,11 @@ Frame *InterpFrame::getCaller() const {
 }
 
 SourceRange InterpFrame::getCallRange() const {
-  if (!Caller->Func)
-    return S.getRange(nullptr, {});
+  if (!Caller->Func) {
+    if (SourceRange NullRange = S.getRange(nullptr, {}); NullRange.isValid())
+      return NullRange;
+    return S.EvalLocation;
+  }
   return S.getRange(Caller->Func, RetPC - sizeof(uintptr_t));
 }
 
@@ -209,10 +212,8 @@ Pointer InterpFrame::getLocalPointer(unsigned Offset) const {
 
 Pointer InterpFrame::getParamPointer(unsigned Off) {
   // Return the block if it was created previously.
-  auto Pt = Params.find(Off);
-  if (Pt != Params.end()) {
+  if (auto Pt = Params.find(Off); Pt != Params.end())
     return Pointer(reinterpret_cast<Block *>(Pt->second.get()));
-  }
 
   // Allocate memory to store the parameter and the block metadata.
   const auto &Desc = Func->getParamDescriptor(Off);
diff --git a/clang/lib/AST/Interp/InterpState.h b/clang/lib/AST/Interp/InterpState.h
index c17cfad11b1e..d483c60c58e2 100644
--- a/clang/lib/AST/Interp/InterpState.h
+++ b/clang/lib/AST/Interp/InterpState.h
@@ -98,6 +98,8 @@ public:
 
   Context &getContext() const { return Ctx; }
 
+  void setEvalLocation(SourceLocation SL) { this->EvalLocation = SL; }
+
 private:
   /// AST Walker state.
   State &Parent;
@@ -115,6 +117,8 @@ public:
   Context &Ctx;
   /// The current frame.
   InterpFrame *Current = nullptr;
+  /// Source location of the evaluating expression
+  SourceLocation EvalLocation;
 };
 
 } // namespace interp
diff --git a/clang/lib/AST/Interp/Pointer.cpp b/clang/lib/AST/Interp/Pointer.cpp
index 5ef31671ae7b..12bef73f7e21 100644
--- a/clang/lib/AST/Interp/Pointer.cpp
+++ b/clang/lib/AST/Interp/Pointer.cpp
@@ -63,9 +63,8 @@ Pointer::~Pointer() {
 }
 
 void Pointer::operator=(const Pointer &P) {
-
   if (!this->isIntegralPointer() || !P.isBlockPointer())
-    assert(P.StorageKind == StorageKind);
+    assert(P.StorageKind == StorageKind || (this->isZero() && P.isZero()));
 
   bool WasBlockPointer = isBlockPointer();
   StorageKind = P.StorageKind;
@@ -92,7 +91,7 @@ void Pointer::operator=(const Pointer &P) {
 
 void Pointer::operator=(Pointer &&P) {
   if (!this->isIntegralPointer() || !P.isBlockPointer())
-    assert(P.StorageKind == StorageKind);
+    assert(P.StorageKind == StorageKind || (this->isZero() && P.isZero()));
 
   bool WasBlockPointer = isBlockPointer();
   StorageKind = P.StorageKind;
diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h
index c4d701bc71b7..79fab05670e9 100644
--- a/clang/lib/AST/Interp/Pointer.h
+++ b/clang/lib/AST/Interp/Pointer.h
@@ -226,8 +226,7 @@ public:
       return *this;
 
     // If at base, point to an array of base types.
-    if (asBlockPointer().Base == 0 ||
-        asBlockPointer().Base == sizeof(InlineDescriptor))
+    if (isRoot())
       return Pointer(asBlockPointer().Pointee, RootPtrMark, 0);
 
     // Step into the containing array, if inside one.
@@ -306,10 +305,8 @@ public:
   const Descriptor *getFieldDesc() const {
     if (isIntegralPointer())
       return asIntPointer().Desc;
-    if (isBlockPointer() &&
-        (asBlockPointer().Base == 0 ||
-         asBlockPointer().Base == sizeof(InlineDescriptor) ||
-         asBlockPointer().Base == RootPtrMark))
+
+    if (isRoot())
       return getDeclDesc();
     return getInlineDesc()->Desc;
   }
@@ -390,8 +387,7 @@ public:
     // If this points inside a dummy block, return true.
     // FIXME: This might change in the future. If it does, we need
     // to set the proper Ctor/Dtor functions for dummy Descriptors.
-    if (asBlockPointer().Base != 0 &&
-        asBlockPointer().Base != sizeof(InlineDescriptor) && isDummy())
+    if (!isRoot() && isDummy())
       return true;
     return getFieldDesc()->isUnknownSizeArray();
   }
@@ -403,9 +399,11 @@ public:
   }
   /// Pointer points directly to a block.
   bool isRoot() const {
-    return (asBlockPointer().Base == 0 ||
-            asBlockPointer().Base == RootPtrMark) &&
-           Offset == 0;
+    if (isZero() || isIntegralPointer())
+      return true;
+    return (asBlockPointer().Base ==
+                asBlockPointer().Pointee->getDescriptor()->getMetadataSize() ||
+            asBlockPointer().Base == 0);
   }
   /// If this pointer has an InlineDescriptor we can use to initialize.
   bool canBeInitialized() const {
@@ -487,9 +485,7 @@ public:
   bool isActive() const {
     if (!isBlockPointer())
       return true;
-    return asBlockPointer().Base == 0 ||
-           asBlockPointer().Base == sizeof(InlineDescriptor) ||
-           getInlineDesc()->IsActive;
+    return isRoot() || getInlineDesc()->IsActive;
   }
   /// Checks if a structure is a base class.
   bool isBaseClass() const { return isField() && getInlineDesc()->IsBase; }
@@ -508,10 +504,7 @@ public:
   bool isConst() const {
     if (isIntegralPointer())
       return true;
-    return (asBlockPointer().Base == 0 ||
-            asBlockPointer().Base == sizeof(InlineDescriptor))
-               ? getDeclDesc()->IsConst
-               : getInlineDesc()->IsConst;
+    return isRoot() ? getDeclDesc()->IsConst : getInlineDesc()->IsConst;
   }
 
   /// Returns the declaration ID.
@@ -567,6 +560,9 @@ public:
 
     if (!asBlockPointer().Pointee)
       return false;
+    if (isDummy())
+      return false;
+
     return isElementPastEnd() || getSize() == getOffset();
   }
 
diff --git a/clang/lib/AST/Interp/Program.cpp b/clang/lib/AST/Interp/Program.cpp
index 02075c20cf55..6606149f1f69 100644
--- a/clang/lib/AST/Interp/Program.cpp
+++ b/clang/lib/AST/Interp/Program.cpp
@@ -144,17 +144,20 @@ std::optional<unsigned> Program::getOrCreateDummy(const ValueDecl *VD) {
   if (auto It = DummyVariables.find(VD); It != DummyVariables.end())
     return It->second;
 
-  // Create dummy descriptor.
-  // We create desriptors of 'array of unknown size' if the type is an array
-  // type _and_ the size isn't known (it's not a ConstantArrayType). If the size
-  // is known however, we create a regular dummy pointer.
   Descriptor *Desc;
-  if (const auto *AT = VD->getType()->getAsArrayTypeUnsafe();
-      AT && !isa<ConstantArrayType>(AT))
-    Desc = allocateDescriptor(VD, Descriptor::UnknownSize{});
+  if (std::optional<PrimType> T = Ctx.classify(VD->getType()))
+    Desc = createDescriptor(VD, *T, std::nullopt, true, false);
   else
+    Desc = createDescriptor(VD, VD->getType().getTypePtr(), std::nullopt, true,
+                            false);
+  if (!Desc)
     Desc = allocateDescriptor(VD);
 
+  assert(Desc);
+  Desc->makeDummy();
+
+  assert(Desc->isDummy());
+
   // Allocate a block for storage.
   unsigned I = Globals.size();
 
@@ -310,8 +313,7 @@ Record *Program::getOrCreateRecord(const RecordDecl *RD) {
   for (const FieldDecl *FD : RD->fields()) {
     // Note that we DO create fields and descriptors
     // for unnamed bitfields here, even though we later ignore
-    // them everywhere. That's because so the FieldDecl's
-    // getFieldIndex() matches.
+    // them everywhere. That's so the FieldDecl's getFieldIndex() matches.
 
     // Reserve space for the field's descriptor and the offset.
     BaseSize += align(sizeof(InlineDescriptor));
@@ -344,6 +346,7 @@ Descriptor *Program::createDescriptor(const DeclTy &D, const Type *Ty,
                                       Descriptor::MetadataSize MDSize,
                                       bool IsConst, bool IsTemporary,
                                       bool IsMutable, const Expr *Init) {
+
   // Classes and structures.
   if (const auto *RT = Ty->getAs<RecordType>()) {
     if (const auto *Record = getOrCreateRecord(RT->getDecl()))
@@ -383,7 +386,8 @@ Descriptor *Program::createDescriptor(const DeclTy &D, const Type *Ty,
 
     // Array of unknown bounds - cannot be accessed and pointer arithmetic
     // is forbidden on pointers to such objects.
-    if (isa<IncompleteArrayType>(ArrayType)) {
+    if (isa<IncompleteArrayType>(ArrayType) ||
+        isa<VariableArrayType>(ArrayType)) {
       if (std::optional<PrimType> T = Ctx.classify(ElemTy)) {
         return allocateDescriptor(D, *T, MDSize, IsTemporary,
                                   Descriptor::UnknownSize{});
diff --git a/clang/lib/AST/NSAPI.cpp b/clang/lib/AST/NSAPI.cpp
index 6f586173edb0..2d16237f5325 100644
--- a/clang/lib/AST/NSAPI.cpp
+++ b/clang/lib/AST/NSAPI.cpp
@@ -454,6 +454,7 @@ NSAPI::getNSNumberFactoryMethodKind(QualType T) const {
 #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
   case BuiltinType::BoundMember:
+  case BuiltinType::UnresolvedTemplate:
   case BuiltinType::Dependent:
   case BuiltinType::Overload:
   case BuiltinType::UnknownAny:
diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp
index 885f3b7618ec..ee13437b97b4 100644
--- a/clang/lib/AST/OpenACCClause.cpp
+++ b/clang/lib/AST/OpenACCClause.cpp
@@ -17,6 +17,33 @@
 
 using namespace clang;
 
+bool OpenACCClauseWithParams::classof(const OpenACCClause *C) {
+  return OpenACCClauseWithCondition::classof(C) ||
+         OpenACCClauseWithExprs::classof(C);
+}
+bool OpenACCClauseWithExprs::classof(const OpenACCClause *C) {
+  return OpenACCWaitClause::classof(C) || OpenACCNumGangsClause::classof(C) ||
+         OpenACCClauseWithSingleIntExpr::classof(C) ||
+         OpenACCClauseWithVarList::classof(C);
+}
+bool OpenACCClauseWithVarList::classof(const OpenACCClause *C) {
+  return OpenACCPrivateClause::classof(C) ||
+         OpenACCFirstPrivateClause::classof(C) ||
+         OpenACCDevicePtrClause::classof(C) ||
+         OpenACCDevicePtrClause::classof(C) ||
+         OpenACCAttachClause::classof(C) || OpenACCNoCreateClause::classof(C) ||
+         OpenACCPresentClause::classof(C) || OpenACCCopyClause::classof(C) ||
+         OpenACCCopyInClause::classof(C) || OpenACCCopyOutClause::classof(C) ||
+         OpenACCCreateClause::classof(C);
+}
+bool OpenACCClauseWithCondition::classof(const OpenACCClause *C) {
+  return OpenACCIfClause::classof(C) || OpenACCSelfClause::classof(C);
+}
+bool OpenACCClauseWithSingleIntExpr::classof(const OpenACCClause *C) {
+  return OpenACCNumWorkersClause::classof(C) ||
+         OpenACCVectorLengthClause::classof(C) ||
+         OpenACCAsyncClause::classof(C);
+}
 OpenACCDefaultClause *OpenACCDefaultClause::Create(const ASTContext &C,
                                                    OpenACCDefaultClauseKind K,
                                                    SourceLocation BeginLoc,
@@ -76,6 +103,9 @@ OpenACCClause::child_range OpenACCClause::children() {
 #define VISIT_CLAUSE(CLAUSE_NAME)                                              \
   case OpenACCClauseKind::CLAUSE_NAME:                                         \
     return cast<OpenACC##CLAUSE_NAME##Clause>(this)->children();
+#define CLAUSE_ALIAS(ALIAS_NAME, CLAUSE_NAME)                                  \
+  case OpenACCClauseKind::ALIAS_NAME:                                          \
+    return cast<OpenACC##CLAUSE_NAME##Clause>(this)->children();
 
 #include "clang/Basic/OpenACCClauses.def"
   }
@@ -124,6 +154,38 @@ OpenACCVectorLengthClause::Create(const ASTContext &C, SourceLocation BeginLoc,
       OpenACCVectorLengthClause(BeginLoc, LParenLoc, IntExpr, EndLoc);
 }
 
+OpenACCAsyncClause::OpenACCAsyncClause(SourceLocation BeginLoc,
+                                       SourceLocation LParenLoc, Expr *IntExpr,
+                                       SourceLocation EndLoc)
+    : OpenACCClauseWithSingleIntExpr(OpenACCClauseKind::Async, BeginLoc,
+                                     LParenLoc, IntExpr, EndLoc) {
+  assert((!IntExpr || IntExpr->isInstantiationDependent() ||
+          IntExpr->getType()->isIntegerType()) &&
+         "Condition expression type not scalar/dependent");
+}
+
+OpenACCAsyncClause *OpenACCAsyncClause::Create(const ASTContext &C,
+                                               SourceLocation BeginLoc,
+                                               SourceLocation LParenLoc,
+                                               Expr *IntExpr,
+                                               SourceLocation EndLoc) {
+  void *Mem =
+      C.Allocate(sizeof(OpenACCAsyncClause), alignof(OpenACCAsyncClause));
+  return new (Mem) OpenACCAsyncClause(BeginLoc, LParenLoc, IntExpr, EndLoc);
+}
+
+OpenACCWaitClause *OpenACCWaitClause::Create(
+    const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc,
+    Expr *DevNumExpr, SourceLocation QueuesLoc, ArrayRef<Expr *> QueueIdExprs,
+    SourceLocation EndLoc) {
+  // Allocates enough room in trailing storage for all the int-exprs, plus a
+  // placeholder for the devnum.
+  void *Mem = C.Allocate(
+      OpenACCWaitClause::totalSizeToAlloc<Expr *>(QueueIdExprs.size() + 1));
+  return new (Mem) OpenACCWaitClause(BeginLoc, LParenLoc, DevNumExpr, QueuesLoc,
+                                     QueueIdExprs, EndLoc);
+}
+
 OpenACCNumGangsClause *OpenACCNumGangsClause::Create(const ASTContext &C,
                                                      SourceLocation BeginLoc,
                                                      SourceLocation LParenLoc,
@@ -144,6 +206,98 @@ OpenACCPrivateClause *OpenACCPrivateClause::Create(const ASTContext &C,
   return new (Mem) OpenACCPrivateClause(BeginLoc, LParenLoc, VarList, EndLoc);
 }
 
+OpenACCFirstPrivateClause *OpenACCFirstPrivateClause::Create(
+    const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc,
+    ArrayRef<Expr *> VarList, SourceLocation EndLoc) {
+  void *Mem = C.Allocate(
+      OpenACCFirstPrivateClause::totalSizeToAlloc<Expr *>(VarList.size()));
+  return new (Mem)
+      OpenACCFirstPrivateClause(BeginLoc, LParenLoc, VarList, EndLoc);
+}
+
+OpenACCAttachClause *OpenACCAttachClause::Create(const ASTContext &C,
+                                                 SourceLocation BeginLoc,
+                                                 SourceLocation LParenLoc,
+                                                 ArrayRef<Expr *> VarList,
+                                                 SourceLocation EndLoc) {
+  void *Mem =
+      C.Allocate(OpenACCAttachClause::totalSizeToAlloc<Expr *>(VarList.size()));
+  return new (Mem) OpenACCAttachClause(BeginLoc, LParenLoc, VarList, EndLoc);
+}
+
+OpenACCDevicePtrClause *OpenACCDevicePtrClause::Create(const ASTContext &C,
+                                                       SourceLocation BeginLoc,
+                                                       SourceLocation LParenLoc,
+                                                       ArrayRef<Expr *> VarList,
+                                                       SourceLocation EndLoc) {
+  void *Mem = C.Allocate(
+      OpenACCDevicePtrClause::totalSizeToAlloc<Expr *>(VarList.size()));
+  return new (Mem) OpenACCDevicePtrClause(BeginLoc, LParenLoc, VarList, EndLoc);
+}
+
+OpenACCNoCreateClause *OpenACCNoCreateClause::Create(const ASTContext &C,
+                                                     SourceLocation BeginLoc,
+                                                     SourceLocation LParenLoc,
+                                                     ArrayRef<Expr *> VarList,
+                                                     SourceLocation EndLoc) {
+  void *Mem = C.Allocate(
+      OpenACCNoCreateClause::totalSizeToAlloc<Expr *>(VarList.size()));
+  return new (Mem) OpenACCNoCreateClause(BeginLoc, LParenLoc, VarList, EndLoc);
+}
+
+OpenACCPresentClause *OpenACCPresentClause::Create(const ASTContext &C,
+                                                   SourceLocation BeginLoc,
+                                                   SourceLocation LParenLoc,
+                                                   ArrayRef<Expr *> VarList,
+                                                   SourceLocation EndLoc) {
+  void *Mem = C.Allocate(
+      OpenACCPresentClause::totalSizeToAlloc<Expr *>(VarList.size()));
+  return new (Mem) OpenACCPresentClause(BeginLoc, LParenLoc, VarList, EndLoc);
+}
+
+OpenACCCopyClause *
+OpenACCCopyClause::Create(const ASTContext &C, OpenACCClauseKind Spelling,
+                          SourceLocation BeginLoc, SourceLocation LParenLoc,
+                          ArrayRef<Expr *> VarList, SourceLocation EndLoc) {
+  void *Mem =
+      C.Allocate(OpenACCCopyClause::totalSizeToAlloc<Expr *>(VarList.size()));
+  return new (Mem)
+      OpenACCCopyClause(Spelling, BeginLoc, LParenLoc, VarList, EndLoc);
+}
+
+OpenACCCopyInClause *
+OpenACCCopyInClause::Create(const ASTContext &C, OpenACCClauseKind Spelling,
+                            SourceLocation BeginLoc, SourceLocation LParenLoc,
+                            bool IsReadOnly, ArrayRef<Expr *> VarList,
+                            SourceLocation EndLoc) {
+  void *Mem =
+      C.Allocate(OpenACCCopyInClause::totalSizeToAlloc<Expr *>(VarList.size()));
+  return new (Mem) OpenACCCopyInClause(Spelling, BeginLoc, LParenLoc,
+                                       IsReadOnly, VarList, EndLoc);
+}
+
+OpenACCCopyOutClause *
+OpenACCCopyOutClause::Create(const ASTContext &C, OpenACCClauseKind Spelling,
+                             SourceLocation BeginLoc, SourceLocation LParenLoc,
+                             bool IsZero, ArrayRef<Expr *> VarList,
+                             SourceLocation EndLoc) {
+  void *Mem = C.Allocate(
+      OpenACCCopyOutClause::totalSizeToAlloc<Expr *>(VarList.size()));
+  return new (Mem) OpenACCCopyOutClause(Spelling, BeginLoc, LParenLoc, IsZero,
+                                        VarList, EndLoc);
+}
+
+OpenACCCreateClause *
+OpenACCCreateClause::Create(const ASTContext &C, OpenACCClauseKind Spelling,
+                            SourceLocation BeginLoc, SourceLocation LParenLoc,
+                            bool IsZero, ArrayRef<Expr *> VarList,
+                            SourceLocation EndLoc) {
+  void *Mem =
+      C.Allocate(OpenACCCreateClause::totalSizeToAlloc<Expr *>(VarList.size()));
+  return new (Mem) OpenACCCreateClause(Spelling, BeginLoc, LParenLoc, IsZero,
+                                       VarList, EndLoc);
+}
+
 //===----------------------------------------------------------------------===//
 //  OpenACC clauses printing methods
 //===----------------------------------------------------------------------===//
@@ -192,9 +346,108 @@ void OpenACCClausePrinter::VisitVectorLengthClause(
   OS << ")";
 }
 
+void OpenACCClausePrinter::VisitAsyncClause(const OpenACCAsyncClause &C) {
+  OS << "async";
+  if (C.hasIntExpr()) {
+    OS << "(";
+    printExpr(C.getIntExpr());
+    OS << ")";
+  }
+}
+
 void OpenACCClausePrinter::VisitPrivateClause(const OpenACCPrivateClause &C) {
   OS << "private(";
   llvm::interleaveComma(C.getVarList(), OS,
                         [&](const Expr *E) { printExpr(E); });
   OS << ")";
 }
+
+void OpenACCClausePrinter::VisitFirstPrivateClause(
+    const OpenACCFirstPrivateClause &C) {
+  OS << "firstprivate(";
+  llvm::interleaveComma(C.getVarList(), OS,
+                        [&](const Expr *E) { printExpr(E); });
+  OS << ")";
+}
+
+void OpenACCClausePrinter::VisitAttachClause(const OpenACCAttachClause &C) {
+  OS << "attach(";
+  llvm::interleaveComma(C.getVarList(), OS,
+                        [&](const Expr *E) { printExpr(E); });
+  OS << ")";
+}
+
+void OpenACCClausePrinter::VisitDevicePtrClause(
+    const OpenACCDevicePtrClause &C) {
+  OS << "deviceptr(";
+  llvm::interleaveComma(C.getVarList(), OS,
+                        [&](const Expr *E) { printExpr(E); });
+  OS << ")";
+}
+
+void OpenACCClausePrinter::VisitNoCreateClause(const OpenACCNoCreateClause &C) {
+  OS << "no_create(";
+  llvm::interleaveComma(C.getVarList(), OS,
+                        [&](const Expr *E) { printExpr(E); });
+  OS << ")";
+}
+
+void OpenACCClausePrinter::VisitPresentClause(const OpenACCPresentClause &C) {
+  OS << "present(";
+  llvm::interleaveComma(C.getVarList(), OS,
+                        [&](const Expr *E) { printExpr(E); });
+  OS << ")";
+}
+
+void OpenACCClausePrinter::VisitCopyClause(const OpenACCCopyClause &C) {
+  OS << C.getClauseKind() << '(';
+  llvm::interleaveComma(C.getVarList(), OS,
+                        [&](const Expr *E) { printExpr(E); });
+  OS << ")";
+}
+
+void OpenACCClausePrinter::VisitCopyInClause(const OpenACCCopyInClause &C) {
+  OS << C.getClauseKind() << '(';
+  if (C.isReadOnly())
+    OS << "readonly: ";
+  llvm::interleaveComma(C.getVarList(), OS,
+                        [&](const Expr *E) { printExpr(E); });
+  OS << ")";
+}
+
+void OpenACCClausePrinter::VisitCopyOutClause(const OpenACCCopyOutClause &C) {
+  OS << C.getClauseKind() << '(';
+  if (C.isZero())
+    OS << "zero: ";
+  llvm::interleaveComma(C.getVarList(), OS,
+                        [&](const Expr *E) { printExpr(E); });
+  OS << ")";
+}
+
+void OpenACCClausePrinter::VisitCreateClause(const OpenACCCreateClause &C) {
+  OS << C.getClauseKind() << '(';
+  if (C.isZero())
+    OS << "zero: ";
+  llvm::interleaveComma(C.getVarList(), OS,
+                        [&](const Expr *E) { printExpr(E); });
+  OS << ")";
+}
+
+void OpenACCClausePrinter::VisitWaitClause(const OpenACCWaitClause &C) {
+  OS << "wait";
+  if (!C.getLParenLoc().isInvalid()) {
+    OS << "(";
+    if (C.hasDevNumExpr()) {
+      OS << "devnum: ";
+      printExpr(C.getDevNumExpr());
+      OS << " : ";
+    }
+
+    if (C.hasQueuesTag())
+      OS << "queues: ";
+
+    llvm::interleaveComma(C.getQueueIdExprs(), OS,
+                          [&](const Expr *E) { printExpr(E); });
+    OS << ")";
+  }
+}
diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp
index fec8ce13e8c4..dd3b38fabb55 100644
--- a/clang/lib/AST/PrintfFormatString.cpp
+++ b/clang/lib/AST/PrintfFormatString.cpp
@@ -146,13 +146,13 @@ static PrintfSpecifierResult ParsePrintfSpecifier(FormatStringHandler &H,
           if (Warn && (Size == 0 || Size > 8))
             H.handleInvalidMaskType(MaskType);
           FS.setMaskType(MaskType);
-        } else if (MatchedStr.equals("sensitive"))
+        } else if (MatchedStr == "sensitive")
           PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsSensitive;
         else if (PrivacyFlags !=
-                 clang::analyze_os_log::OSLogBufferItem::IsSensitive &&
-                 MatchedStr.equals("private"))
+                     clang::analyze_os_log::OSLogBufferItem::IsSensitive &&
+                 MatchedStr == "private")
           PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsPrivate;
-        else if (PrivacyFlags == 0 && MatchedStr.equals("public"))
+        else if (PrivacyFlags == 0 && MatchedStr == "public")
           PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsPublic;
       } else {
         size_t CommaOrBracePos =
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 973f6f97bae0..8fb8940142eb 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -2492,6 +2492,28 @@ void OpenACCClauseProfiler::VisitIfClause(const OpenACCIfClause &Clause) {
   Profiler.VisitStmt(Clause.getConditionExpr());
 }
 
+void OpenACCClauseProfiler::VisitCopyClause(const OpenACCCopyClause &Clause) {
+  for (auto *E : Clause.getVarList())
+    Profiler.VisitStmt(E);
+}
+void OpenACCClauseProfiler::VisitCopyInClause(
+    const OpenACCCopyInClause &Clause) {
+  for (auto *E : Clause.getVarList())
+    Profiler.VisitStmt(E);
+}
+
+void OpenACCClauseProfiler::VisitCopyOutClause(
+    const OpenACCCopyOutClause &Clause) {
+  for (auto *E : Clause.getVarList())
+    Profiler.VisitStmt(E);
+}
+
+void OpenACCClauseProfiler::VisitCreateClause(
+    const OpenACCCreateClause &Clause) {
+  for (auto *E : Clause.getVarList())
+    Profiler.VisitStmt(E);
+}
+
 void OpenACCClauseProfiler::VisitSelfClause(const OpenACCSelfClause &Clause) {
   if (Clause.hasConditionExpr())
     Profiler.VisitStmt(Clause.getConditionExpr());
@@ -2515,12 +2537,54 @@ void OpenACCClauseProfiler::VisitPrivateClause(
     Profiler.VisitStmt(E);
 }
 
+void OpenACCClauseProfiler::VisitFirstPrivateClause(
+    const OpenACCFirstPrivateClause &Clause) {
+  for (auto *E : Clause.getVarList())
+    Profiler.VisitStmt(E);
+}
+
+void OpenACCClauseProfiler::VisitAttachClause(
+    const OpenACCAttachClause &Clause) {
+  for (auto *E : Clause.getVarList())
+    Profiler.VisitStmt(E);
+}
+
+void OpenACCClauseProfiler::VisitDevicePtrClause(
+    const OpenACCDevicePtrClause &Clause) {
+  for (auto *E : Clause.getVarList())
+    Profiler.VisitStmt(E);
+}
+
+void OpenACCClauseProfiler::VisitNoCreateClause(
+    const OpenACCNoCreateClause &Clause) {
+  for (auto *E : Clause.getVarList())
+    Profiler.VisitStmt(E);
+}
+
+void OpenACCClauseProfiler::VisitPresentClause(
+    const OpenACCPresentClause &Clause) {
+  for (auto *E : Clause.getVarList())
+    Profiler.VisitStmt(E);
+}
+
 void OpenACCClauseProfiler::VisitVectorLengthClause(
     const OpenACCVectorLengthClause &Clause) {
   assert(Clause.hasIntExpr() &&
          "vector_length clause requires a valid int expr");
   Profiler.VisitStmt(Clause.getIntExpr());
 }
+
+void OpenACCClauseProfiler::VisitAsyncClause(const OpenACCAsyncClause &Clause) {
+  if (Clause.hasIntExpr())
+    Profiler.VisitStmt(Clause.getIntExpr());
+}
+
+void OpenACCClauseProfiler::VisitWaitClause(const OpenACCWaitClause &Clause) {
+  if (Clause.hasDevNumExpr())
+    Profiler.VisitStmt(Clause.getDevNumExpr());
+  for (auto *E : Clause.getQueueIdExprs())
+    Profiler.VisitStmt(E);
+}
 } // namespace
 
 void StmtProfiler::VisitOpenACCComputeConstruct(
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 89f50d6dacfd..12aa5858b798 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -397,16 +397,53 @@ void TextNodeDumper::Visit(const OpenACCClause *C) {
     case OpenACCClauseKind::Default:
       OS << '(' << cast<OpenACCDefaultClause>(C)->getDefaultClauseKind() << ')';
       break;
+    case OpenACCClauseKind::Async:
+    case OpenACCClauseKind::Attach:
+    case OpenACCClauseKind::Copy:
+    case OpenACCClauseKind::PCopy:
+    case OpenACCClauseKind::PresentOrCopy:
     case OpenACCClauseKind::If:
-    case OpenACCClauseKind::Self:
+    case OpenACCClauseKind::DevicePtr:
+    case OpenACCClauseKind::FirstPrivate:
+    case OpenACCClauseKind::NoCreate:
     case OpenACCClauseKind::NumGangs:
     case OpenACCClauseKind::NumWorkers:
+    case OpenACCClauseKind::Present:
     case OpenACCClauseKind::Private:
+    case OpenACCClauseKind::Self:
     case OpenACCClauseKind::VectorLength:
       // The condition expression will be printed as a part of the 'children',
       // but print 'clause' here so it is clear what is happening from the dump.
       OS << " clause";
       break;
+    case OpenACCClauseKind::CopyIn:
+    case OpenACCClauseKind::PCopyIn:
+    case OpenACCClauseKind::PresentOrCopyIn:
+      OS << " clause";
+      if (cast<OpenACCCopyInClause>(C)->isReadOnly())
+        OS << " : readonly";
+      break;
+    case OpenACCClauseKind::CopyOut:
+    case OpenACCClauseKind::PCopyOut:
+    case OpenACCClauseKind::PresentOrCopyOut:
+      OS << " clause";
+      if (cast<OpenACCCopyOutClause>(C)->isZero())
+        OS << " : zero";
+      break;
+    case OpenACCClauseKind::Create:
+    case OpenACCClauseKind::PCreate:
+    case OpenACCClauseKind::PresentOrCreate:
+      OS << " clause";
+      if (cast<OpenACCCreateClause>(C)->isZero())
+        OS << " : zero";
+      break;
+    case OpenACCClauseKind::Wait:
+      OS << " clause";
+      if (cast<OpenACCWaitClause>(C)->hasDevNumExpr())
+        OS << " has devnum";
+      if (cast<OpenACCWaitClause>(C)->hasQueuesTag())
+        OS << " has queues tag";
+      break;
     default:
       // Nothing to do here.
       break;
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 68e81f45b4c2..e31741cd4424 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -256,7 +256,8 @@ void DependentSizedArrayType::Profile(llvm::FoldingSetNodeID &ID,
   ID.AddPointer(ET.getAsOpaquePtr());
   ID.AddInteger(llvm::to_underlying(SizeMod));
   ID.AddInteger(TypeQuals);
-  E->Profile(ID, Context, true);
+  if (E)
+    E->Profile(ID, Context, true);
 }
 
 DependentVectorType::DependentVectorType(QualType ElementType,
@@ -3393,6 +3394,8 @@ StringRef BuiltinType::getName(const PrintingPolicy &Policy) const {
     return "<overloaded function type>";
   case BoundMember:
     return "<bound member function type>";
+  case UnresolvedTemplate:
+    return "<unresolved template type>";
   case PseudoObject:
     return "<pseudo-object type>";
   case Dependent:
@@ -4685,6 +4688,7 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const {
 #include "clang/AST/BuiltinTypes.def"
       return false;
 
+    case BuiltinType::UnresolvedTemplate:
     // Dependent types that could instantiate to a pointer type.
     case BuiltinType::Dependent:
     case BuiltinType::Overload:
diff --git a/clang/lib/AST/TypeLoc.cpp b/clang/lib/AST/TypeLoc.cpp
index ce45b47d5cfe..9dd90d9bf4e5 100644
--- a/clang/lib/AST/TypeLoc.cpp
+++ b/clang/lib/AST/TypeLoc.cpp
@@ -399,6 +399,7 @@ TypeSpecifierType BuiltinTypeLoc::getWrittenTypeSpec() const {
   case BuiltinType::NullPtr:
   case BuiltinType::Overload:
   case BuiltinType::Dependent:
+  case BuiltinType::UnresolvedTemplate:
   case BuiltinType::BoundMember:
   case BuiltinType::UnknownAny:
   case BuiltinType::ARCUnbridgedCast:
diff --git a/clang/lib/ASTMatchers/Dynamic/Marshallers.cpp b/clang/lib/ASTMatchers/Dynamic/Marshallers.cpp
index cf9ae7c974a6..37c91abb5c83 100644
--- a/clang/lib/ASTMatchers/Dynamic/Marshallers.cpp
+++ b/clang/lib/ASTMatchers/Dynamic/Marshallers.cpp
@@ -21,7 +21,7 @@ getBestGuess(llvm::StringRef Search, llvm::ArrayRef<llvm::StringRef> Allowed,
   llvm::StringRef Res;
   for (const llvm::StringRef &Item : Allowed) {
     if (Item.equals_insensitive(Search)) {
-      assert(!Item.equals(Search) && "This should be handled earlier on.");
+      assert(Item != Search && "This should be handled earlier on.");
       MaxEditDistance = 1;
       Res = Item;
       continue;
@@ -41,7 +41,7 @@ getBestGuess(llvm::StringRef Search, llvm::ArrayRef<llvm::StringRef> Allowed,
       if (!NoPrefix.consume_front(DropPrefix))
         continue;
       if (NoPrefix.equals_insensitive(Search)) {
-        if (NoPrefix.equals(Search))
+        if (NoPrefix == Search)
           return Item.str();
         MaxEditDistance = 1;
         Res = Item;
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
index e94fd39c45dc..4b86daa56d7b 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
@@ -170,7 +170,7 @@ DataflowAnalysisContext::joinFlowConditions(Atom FirstToken,
 
 Solver::Result DataflowAnalysisContext::querySolver(
     llvm::SetVector<const Formula *> Constraints) {
-  return S->solve(Constraints.getArrayRef());
+  return S.solve(Constraints.getArrayRef());
 }
 
 bool DataflowAnalysisContext::flowConditionImplies(Atom Token,
@@ -338,10 +338,10 @@ static std::unique_ptr<Logger> makeLoggerFromCommandLine() {
   return Logger::html(std::move(StreamFactory));
 }
 
-DataflowAnalysisContext::DataflowAnalysisContext(std::unique_ptr<Solver> S,
-                                                 Options Opts)
-    : S(std::move(S)), A(std::make_unique<Arena>()), Opts(Opts) {
-  assert(this->S != nullptr);
+DataflowAnalysisContext::DataflowAnalysisContext(
+    Solver &S, std::unique_ptr<Solver> &&OwnedSolver, Options Opts)
+    : S(S), OwnedSolver(std::move(OwnedSolver)), A(std::make_unique<Arena>()),
+      Opts(Opts) {
   // If the -dataflow-log command-line flag was set, synthesize a logger.
   // This is ugly but provides a uniform method for ad-hoc debugging dataflow-
   // based tools.
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index d79e73440289..cb6c8b2ef107 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -350,6 +350,17 @@ public:
     return RecursiveASTVisitor<ResultObjectVisitor>::TraverseDecl(D);
   }
 
+  // Don't traverse expressions in unevaluated contexts, as we don't model
+  // fields that are only used in these.
+  // Note: The operand of the `noexcept` operator is an unevaluated operand, but
+  // nevertheless it appears in the Clang CFG, so we don't exclude it here.
+  bool TraverseDecltypeTypeLoc(DecltypeTypeLoc) { return true; }
+  bool TraverseTypeOfExprTypeLoc(TypeOfExprTypeLoc) { return true; }
+  bool TraverseCXXTypeidExpr(CXXTypeidExpr *) { return true; }
+  bool TraverseUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr *) {
+    return true;
+  }
+
   bool TraverseBindingDecl(BindingDecl *BD) {
     // `RecursiveASTVisitor` doesn't traverse holding variables for
     // `BindingDecl`s by itself, so we need to tell it to.
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
index fd224aeb79b1..4214488c98e5 100644
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -68,6 +68,14 @@ static BoolValue &evaluateBooleanEquality(const Expr &LHS, const Expr &RHS,
     if (auto *RHSBool = dyn_cast_or_null<BoolValue>(RHSValue))
       return Env.makeIff(*LHSBool, *RHSBool);
 
+  if (auto *LHSPtr = dyn_cast_or_null<PointerValue>(LHSValue))
+    if (auto *RHSPtr = dyn_cast_or_null<PointerValue>(RHSValue))
+      // If the storage locations are the same, the pointers definitely compare
+      // the same. If the storage locations are different, they may still alias,
+      // so we fall through to the case below that returns an atom.
+      if (&LHSPtr->getPointeeLoc() == &RHSPtr->getPointeeLoc())
+        return Env.getBoolLiteralValue(true);
+
   return Env.makeAtomicBoolValue();
 }
 
@@ -556,14 +564,23 @@ public:
 
       copyRecord(*LocSrc, *LocDst, Env);
 
-      // If the expr is a glvalue, we can reasonably assume the operator is
-      // returning T& and thus we can assign it `LocDst`.
-      if (S->isGLValue()) {
+      // The assignment operator can have an arbitrary return type. We model the
+      // return value only if the return type is the same as or a base class of
+      // the destination type.
+      if (S->getType().getCanonicalType().getUnqualifiedType() !=
+          LocDst->getType().getCanonicalType().getUnqualifiedType()) {
+        auto ReturnDecl = S->getType()->getAsCXXRecordDecl();
+        auto DstDecl = LocDst->getType()->getAsCXXRecordDecl();
+        if (ReturnDecl == nullptr || DstDecl == nullptr)
+          return;
+        if (!DstDecl->isDerivedFrom(ReturnDecl))
+          return;
+      }
+
+      if (S->isGLValue())
         Env.setStorageLocation(*S, *LocDst);
-      } else if (S->getType()->isRecordType()) {
-        // Assume that the assignment returns the assigned value.
+      else
         copyRecord(*LocDst, Env.getResultObjectLocation(*S), Env);
-      }
 
       return;
     }
diff --git a/clang/lib/Basic/Builtins.cpp b/clang/lib/Basic/Builtins.cpp
index 3467847ac167..b116abbe034f 100644
--- a/clang/lib/Basic/Builtins.cpp
+++ b/clang/lib/Basic/Builtins.cpp
@@ -64,7 +64,7 @@ bool Builtin::Context::isBuiltinFunc(llvm::StringRef FuncName) {
   bool InStdNamespace = FuncName.consume_front("std-");
   for (unsigned i = Builtin::NotBuiltin + 1; i != Builtin::FirstTSBuiltin;
        ++i) {
-    if (FuncName.equals(BuiltinInfo[i].Name) &&
+    if (FuncName == BuiltinInfo[i].Name &&
         (bool)strchr(BuiltinInfo[i].Attributes, 'z') == InStdNamespace)
       return strchr(BuiltinInfo[i].Attributes, 'f') != nullptr;
   }
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index 113483db5729..e8ce15eb0dec 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -14,7 +14,7 @@ struct CudaVersionMapEntry {
 };
 #define CUDA_ENTRY(major, minor)                                               \
   {                                                                            \
-#major "." #minor, CudaVersion::CUDA_##major##minor,                       \
+    #major "." #minor, CudaVersion::CUDA_##major##minor,                       \
         llvm::VersionTuple(major, minor)                                       \
   }
 
@@ -41,6 +41,7 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = {
     CUDA_ENTRY(12, 1),
     CUDA_ENTRY(12, 2),
     CUDA_ENTRY(12, 3),
+    CUDA_ENTRY(12, 4),
     {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max())},
     {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
 };
@@ -241,7 +242,7 @@ CudaVersion MaxVersionForCudaArch(CudaArch A) {
   }
 }
 
-bool CudaFeatureEnabled(llvm::VersionTuple  Version, CudaFeature Feature) {
+bool CudaFeatureEnabled(llvm::VersionTuple Version, CudaFeature Feature) {
   return CudaFeatureEnabled(ToCudaVersion(Version), Feature);
 }
 
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
index 0208ccc31bd7..10136b4cd943 100644
--- a/clang/lib/Basic/Diagnostic.cpp
+++ b/clang/lib/Basic/Diagnostic.cpp
@@ -851,8 +851,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd,
   // When the diagnostic string is only "%0", the entire string is being given
   // by an outside source.  Remove unprintable characters from this string
   // and skip all the other string processing.
-  if (DiagEnd - DiagStr == 2 &&
-      StringRef(DiagStr, DiagEnd - DiagStr).equals("%0") &&
+  if (DiagEnd - DiagStr == 2 && StringRef(DiagStr, DiagEnd - DiagStr) == "%0" &&
       getArgKind(0) == DiagnosticsEngine::ak_std_string) {
     const std::string &S = getArgStdStr(0);
     EscapeStringForDiagnostic(S, OutStr);
diff --git a/clang/lib/Basic/LangOptions.cpp b/clang/lib/Basic/LangOptions.cpp
index a0adfbf61840..2b906463931d 100644
--- a/clang/lib/Basic/LangOptions.cpp
+++ b/clang/lib/Basic/LangOptions.cpp
@@ -48,7 +48,7 @@ void LangOptions::resetNonModularOptions() {
 
 bool LangOptions::isNoBuiltinFunc(StringRef FuncName) const {
   for (unsigned i = 0, e = NoBuiltinFuncs.size(); i != e; ++i)
-    if (FuncName.equals(NoBuiltinFuncs[i]))
+    if (FuncName == NoBuiltinFuncs[i])
       return true;
   return false;
 }
diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp
index bb212cde8788..045ef580f9c3 100644
--- a/clang/lib/Basic/Module.cpp
+++ b/clang/lib/Basic/Module.cpp
@@ -140,8 +140,8 @@ bool Module::isUnimportable(const LangOptions &LangOpts,
       return true;
     }
     for (unsigned I = 0, N = Current->Requirements.size(); I != N; ++I) {
-      if (hasFeature(Current->Requirements[I].first, LangOpts, Target) !=
-              Current->Requirements[I].second) {
+      if (hasFeature(Current->Requirements[I].FeatureName, LangOpts, Target) !=
+          Current->Requirements[I].RequiredState) {
         Req = Current->Requirements[I];
         return true;
       }
@@ -319,7 +319,7 @@ bool Module::directlyUses(const Module *Requested) {
 void Module::addRequirement(StringRef Feature, bool RequiredState,
                             const LangOptions &LangOpts,
                             const TargetInfo &Target) {
-  Requirements.push_back(Requirement(std::string(Feature), RequiredState));
+  Requirements.push_back(Requirement{std::string(Feature), RequiredState});
 
   // If this feature is currently available, we're done.
   if (hasFeature(Feature, LangOpts, Target) == RequiredState)
@@ -504,9 +504,9 @@ void Module::print(raw_ostream &OS, unsigned Indent, bool Dump) const {
     for (unsigned I = 0, N = Requirements.size(); I != N; ++I) {
       if (I)
         OS << ", ";
-      if (!Requirements[I].second)
+      if (!Requirements[I].RequiredState)
         OS << "!";
-      OS << Requirements[I].first;
+      OS << Requirements[I].FeatureName;
     }
     OS << "\n";
   }
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index f96956f31d50..29f5cd14e46e 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -406,6 +406,16 @@ void TargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts) {
     LongDoubleAlign = 64;
   }
 
+  // HLSL explicitly defines the sizes and formats of some data types, and we
+  // need to conform to those regardless of what architecture you are targeting.
+  if (Opts.HLSL) {
+    LongWidth = LongAlign = 64;
+    if (!Opts.NativeHalfType) {
+      HalfFormat = &llvm::APFloat::IEEEsingle();
+      HalfWidth = HalfAlign = 32;
+    }
+  }
+
   if (Opts.OpenCL) {
     // OpenCL C requires specific widths for types, irrespective of
     // what these normally are for the target.
diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp
index e3283510c6aa..dc1792b3471e 100644
--- a/clang/lib/Basic/Targets.cpp
+++ b/clang/lib/Basic/Targets.cpp
@@ -760,7 +760,7 @@ using namespace clang::targets;
 TargetInfo *
 TargetInfo::CreateTargetInfo(DiagnosticsEngine &Diags,
                              const std::shared_ptr<TargetOptions> &Opts) {
-  llvm::Triple Triple(Opts->Triple);
+  llvm::Triple Triple(llvm::Triple::normalize(Opts->Triple));
 
   // Construct the target
   std::unique_ptr<TargetInfo> Target = AllocateTarget(Triple, *Opts);
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index c8d243a8fb7a..5db1ce78c657 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -154,6 +154,7 @@ AArch64TargetInfo::AArch64TargetInfo(const llvm::Triple &Triple,
   else
     LongWidth = LongAlign = PointerWidth = PointerAlign = 32;
 
+  BitIntMaxAlign = 128;
   MaxVectorAlign = 128;
   MaxAtomicInlineWidth = 128;
   MaxAtomicPromoteWidth = 128;
@@ -224,7 +225,7 @@ bool AArch64TargetInfo::validateBranchProtection(StringRef Spec, StringRef,
                                                  BranchProtectionInfo &BPI,
                                                  StringRef &Err) const {
   llvm::ARM::ParsedBranchProtection PBP;
-  if (!llvm::ARM::parseBranchProtection(Spec, PBP, Err))
+  if (!llvm::ARM::parseBranchProtection(Spec, PBP, Err, HasPAuthLR))
     return false;
 
   BPI.SignReturnAddr =
@@ -1480,11 +1481,11 @@ AArch64leTargetInfo::AArch64leTargetInfo(const llvm::Triple &Triple,
 void AArch64leTargetInfo::setDataLayout() {
   if (getTriple().isOSBinFormatMachO()) {
     if(getTriple().isArch32Bit())
-      resetDataLayout("e-m:o-p:32:32-i64:64-i128:128-n32:64-S128", "_");
+      resetDataLayout("e-m:o-p:32:32-i64:64-i128:128-n32:64-S128-Fn32", "_");
     else
-      resetDataLayout("e-m:o-i64:64-i128:128-n32:64-S128", "_");
+      resetDataLayout("e-m:o-i64:64-i128:128-n32:64-S128-Fn32", "_");
   } else
-    resetDataLayout("e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128");
+    resetDataLayout("e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32");
 }
 
 void AArch64leTargetInfo::getTargetDefines(const LangOptions &Opts,
@@ -1507,7 +1508,7 @@ void AArch64beTargetInfo::getTargetDefines(const LangOptions &Opts,
 
 void AArch64beTargetInfo::setDataLayout() {
   assert(!getTriple().isOSBinFormatMachO());
-  resetDataLayout("E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128");
+  resetDataLayout("E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32");
 }
 
 WindowsARM64TargetInfo::WindowsARM64TargetInfo(const llvm::Triple &Triple,
@@ -1530,8 +1531,8 @@ WindowsARM64TargetInfo::WindowsARM64TargetInfo(const llvm::Triple &Triple,
 
 void WindowsARM64TargetInfo::setDataLayout() {
   resetDataLayout(Triple.isOSBinFormatMachO()
-                      ? "e-m:o-i64:64-i128:128-n32:64-S128"
-                      : "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128",
+                      ? "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"
+                      : "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32",
                   Triple.isOSBinFormatMachO() ? "_" : "");
 }
 
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index 5742885df046..cc7be64656e5 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -232,7 +232,7 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple,
 
   HasLegalHalfType = true;
   HasFloat16 = true;
-  WavefrontSize = GPUFeatures & llvm::AMDGPU::FEATURE_WAVE32 ? 32 : 64;
+  WavefrontSize = (GPUFeatures & llvm::AMDGPU::FEATURE_WAVE32) ? 32 : 64;
   AllowAMDGPUUnsafeFPAtomics = Opts.AllowAMDGPUUnsafeFPAtomics;
 
   // Set pointer width and alignment for the generic address space.
diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp
index 877799c66ec4..7423626d7c3c 100644
--- a/clang/lib/Basic/Targets/ARM.cpp
+++ b/clang/lib/Basic/Targets/ARM.cpp
@@ -173,8 +173,7 @@ bool ARMTargetInfo::supportsThumb() const {
 }
 
 bool ARMTargetInfo::supportsThumb2() const {
-  return CPUAttr.equals("6T2") ||
-         (ArchVersion >= 7 && !CPUAttr.equals("8M_BASE"));
+  return CPUAttr == "6T2" || (ArchVersion >= 7 && CPUAttr != "8M_BASE");
 }
 
 StringRef ARMTargetInfo::getCPUAttr() const {
@@ -1162,7 +1161,7 @@ bool ARMTargetInfo::validateAsmConstraint(
     return true;
   case 'j': // An immediate integer between 0 and 65535 (valid for MOVW)
     // only available in ARMv6T2 and above
-    if (CPUAttr.equals("6T2") || ArchVersion >= 7) {
+    if (CPUAttr == "6T2" || ArchVersion >= 7) {
       Info.setRequiresImmediate(0, 65535);
       return true;
     }
diff --git a/clang/lib/Basic/Targets/DirectX.h b/clang/lib/Basic/Targets/DirectX.h
index acfcc8c47ba9..a084e2823453 100644
--- a/clang/lib/Basic/Targets/DirectX.h
+++ b/clang/lib/Basic/Targets/DirectX.h
@@ -53,7 +53,6 @@ public:
       : TargetInfo(Triple) {
     TLSSupported = false;
     VLASupported = false;
-    LongWidth = LongAlign = 64;
     AddrSpaceMap = &DirectXAddrSpaceMap;
     UseAddrSpaceMapMangling = true;
     HasLegalHalfType = true;
diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index d62a7457682e..a1e5f20f7dbe 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -91,6 +91,8 @@ bool PPCTargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       IsISA3_1 = true;
     } else if (Feature == "+quadword-atomics") {
       HasQuadwordAtomics = true;
+    } else if (Feature == "+aix-shared-lib-tls-model-opt") {
+      HasAIXShLibTLSModelOpt = true;
     }
     // TODO: Finish this list and add an assert that we've handled them
     // all.
@@ -580,6 +582,9 @@ bool PPCTargetInfo::initFeatureMap(
   Features["aix-small-local-exec-tls"] = false;
   Features["aix-small-local-dynamic-tls"] = false;
 
+  // Turn off TLS model opt by default.
+  Features["aix-shared-lib-tls-model-opt"] = false;
+
   Features["spe"] = llvm::StringSwitch<bool>(CPU)
                         .Case("8548", true)
                         .Case("e500", true)
@@ -722,6 +727,7 @@ bool PPCTargetInfo::hasFeature(StringRef Feature) const {
       .Case("isa-v30-instructions", IsISA3_0)
       .Case("isa-v31-instructions", IsISA3_1)
       .Case("quadword-atomics", HasQuadwordAtomics)
+      .Case("aix-shared-lib-tls-model-opt", HasAIXShLibTLSModelOpt)
       .Default(false);
 }
 
@@ -901,6 +907,19 @@ ArrayRef<Builtin::Info> PPCTargetInfo::getTargetBuiltins() const {
 }
 
 bool PPCTargetInfo::validateCpuSupports(StringRef FeatureStr) const {
+  llvm::Triple Triple = getTriple();
+  if (Triple.isOSAIX()) {
+#define PPC_AIX_FEATURE(NAME, DESC, SUPPORT_METHOD, INDEX, MASK, COMP_OP,      \
+                        VALUE)                                                 \
+  .Case(NAME, true)
+    return llvm::StringSwitch<bool>(FeatureStr)
+#include "llvm/TargetParser/PPCTargetParser.def"
+        .Default(false);
+  }
+
+  assert(Triple.isOSLinux() &&
+         "__builtin_cpu_supports() is only supported for AIX and Linux.");
+
 #define PPC_LNX_FEATURE(NAME, DESC, ENUMNAME, ENUMVAL, HWCAPN) .Case(NAME, true)
   return llvm::StringSwitch<bool>(FeatureStr)
 #include "llvm/TargetParser/PPCTargetParser.def"
@@ -910,7 +929,7 @@ bool PPCTargetInfo::validateCpuSupports(StringRef FeatureStr) const {
 bool PPCTargetInfo::validateCpuIs(StringRef CPUName) const {
   llvm::Triple Triple = getTriple();
   if (Triple.isOSAIX()) {
-#define PPC_AIX_CPU(NAME, SUPPORT, INDEX, OP, VALUE) .Case(NAME, true)
+#define PPC_AIX_CPU(NAME, SUPPORT_METHOD, INDEX, OP, VALUE) .Case(NAME, true)
     return llvm::StringSwitch<bool>(CPUName)
 #include "llvm/TargetParser/PPCTargetParser.def"
         .Default(false);
diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
index cd0f08dfb3bc..fc23c30c6852 100644
--- a/clang/lib/Basic/Targets/PPC.h
+++ b/clang/lib/Basic/Targets/PPC.h
@@ -81,6 +81,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
   bool IsISA3_0 = false;
   bool IsISA3_1 = false;
   bool HasQuadwordAtomics = false;
+  bool HasAIXShLibTLSModelOpt = false;
 
 protected:
   std::string ABI;
@@ -359,14 +360,21 @@ public:
   bool hasBitIntType() const override { return true; }
 
   bool isSPRegName(StringRef RegName) const override {
-    return RegName.equals("r1") || RegName.equals("x1");
+    return RegName == "r1" || RegName == "x1";
   }
 
   // We support __builtin_cpu_supports/__builtin_cpu_is on targets that
   // have Glibc since it is Glibc that provides the HWCAP[2] in the auxv.
   static constexpr int MINIMUM_AIX_OS_MAJOR = 7;
   static constexpr int MINIMUM_AIX_OS_MINOR = 2;
-  bool supportsCpuSupports() const override { return getTriple().isOSGlibc(); }
+  bool supportsCpuSupports() const override {
+    llvm::Triple Triple = getTriple();
+    // AIX 7.2 is the minimum requirement to support __builtin_cpu_supports().
+    return Triple.isOSGlibc() ||
+           (Triple.isOSAIX() &&
+            !Triple.isOSVersionLT(MINIMUM_AIX_OS_MAJOR, MINIMUM_AIX_OS_MINOR));
+  }
+
   bool supportsCpuIs() const override {
     llvm::Triple Triple = getTriple();
     // AIX 7.2 is the minimum requirement to support __builtin_cpu_is().
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
index 73d3aa01a043..3bc6f2c1d308 100644
--- a/clang/lib/Basic/Targets/SystemZ.h
+++ b/clang/lib/Basic/Targets/SystemZ.h
@@ -84,7 +84,7 @@ public:
   ArrayRef<TargetInfo::AddlRegName> getGCCAddlRegNames() const override;
 
   bool isSPRegName(StringRef RegName) const override {
-    return RegName.equals("r15");
+    return RegName == "r15";
   }
 
   bool validateAsmConstraint(const char *&Name,
diff --git a/clang/lib/Basic/Targets/WebAssembly.cpp b/clang/lib/Basic/Targets/WebAssembly.cpp
index a6d820e10808..5a000314a72c 100644
--- a/clang/lib/Basic/Targets/WebAssembly.cpp
+++ b/clang/lib/Basic/Targets/WebAssembly.cpp
@@ -45,20 +45,20 @@ bool WebAssemblyTargetInfo::setABI(const std::string &Name) {
 
 bool WebAssemblyTargetInfo::hasFeature(StringRef Feature) const {
   return llvm::StringSwitch<bool>(Feature)
-      .Case("simd128", SIMDLevel >= SIMD128)
-      .Case("relaxed-simd", SIMDLevel >= RelaxedSIMD)
+      .Case("atomics", HasAtomics)
+      .Case("bulk-memory", HasBulkMemory)
+      .Case("exception-handling", HasExceptionHandling)
+      .Case("extended-const", HasExtendedConst)
       .Case("half-precision", HasHalfPrecision)
+      .Case("multimemory", HasMultiMemory)
+      .Case("multivalue", HasMultivalue)
+      .Case("mutable-globals", HasMutableGlobals)
       .Case("nontrapping-fptoint", HasNontrappingFPToInt)
+      .Case("reference-types", HasReferenceTypes)
+      .Case("relaxed-simd", SIMDLevel >= RelaxedSIMD)
       .Case("sign-ext", HasSignExt)
-      .Case("exception-handling", HasExceptionHandling)
-      .Case("bulk-memory", HasBulkMemory)
-      .Case("atomics", HasAtomics)
-      .Case("mutable-globals", HasMutableGlobals)
-      .Case("multivalue", HasMultivalue)
+      .Case("simd128", SIMDLevel >= SIMD128)
       .Case("tail-call", HasTailCall)
-      .Case("reference-types", HasReferenceTypes)
-      .Case("extended-const", HasExtendedConst)
-      .Case("multimemory", HasMultiMemory)
       .Default(false);
 }
 
@@ -74,34 +74,34 @@ void WebAssemblyTargetInfo::fillValidCPUList(
 void WebAssemblyTargetInfo::getTargetDefines(const LangOptions &Opts,
                                              MacroBuilder &Builder) const {
   defineCPUMacros(Builder, "wasm", /*Tuning=*/false);
-  if (SIMDLevel >= SIMD128)
-    Builder.defineMacro("__wasm_simd128__");
-  if (SIMDLevel >= RelaxedSIMD)
-    Builder.defineMacro("__wasm_relaxed_simd__");
-  if (HasNontrappingFPToInt)
-    Builder.defineMacro("__wasm_nontrapping_fptoint__");
-  if (HasSignExt)
-    Builder.defineMacro("__wasm_sign_ext__");
-  if (HasExceptionHandling)
-    Builder.defineMacro("__wasm_exception_handling__");
-  if (HasBulkMemory)
-    Builder.defineMacro("__wasm_bulk_memory__");
   if (HasAtomics)
     Builder.defineMacro("__wasm_atomics__");
-  if (HasMutableGlobals)
-    Builder.defineMacro("__wasm_mutable_globals__");
-  if (HasMultivalue)
-    Builder.defineMacro("__wasm_multivalue__");
-  if (HasTailCall)
-    Builder.defineMacro("__wasm_tail_call__");
-  if (HasReferenceTypes)
-    Builder.defineMacro("__wasm_reference_types__");
+  if (HasBulkMemory)
+    Builder.defineMacro("__wasm_bulk_memory__");
+  if (HasExceptionHandling)
+    Builder.defineMacro("__wasm_exception_handling__");
   if (HasExtendedConst)
     Builder.defineMacro("__wasm_extended_const__");
   if (HasMultiMemory)
     Builder.defineMacro("__wasm_multimemory__");
   if (HasHalfPrecision)
     Builder.defineMacro("__wasm_half_precision__");
+  if (HasMultivalue)
+    Builder.defineMacro("__wasm_multivalue__");
+  if (HasMutableGlobals)
+    Builder.defineMacro("__wasm_mutable_globals__");
+  if (HasNontrappingFPToInt)
+    Builder.defineMacro("__wasm_nontrapping_fptoint__");
+  if (HasReferenceTypes)
+    Builder.defineMacro("__wasm_reference_types__");
+  if (SIMDLevel >= RelaxedSIMD)
+    Builder.defineMacro("__wasm_relaxed_simd__");
+  if (HasSignExt)
+    Builder.defineMacro("__wasm_sign_ext__");
+  if (SIMDLevel >= SIMD128)
+    Builder.defineMacro("__wasm_simd128__");
+  if (HasTailCall)
+    Builder.defineMacro("__wasm_tail_call__");
 
   Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
   Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
@@ -159,12 +159,14 @@ bool WebAssemblyTargetInfo::initFeatureMap(
     addGenericFeatures();
     Features["atomics"] = true;
     Features["bulk-memory"] = true;
+    Features["exception-handling"] = true;
+    Features["extended-const"] = true;
+    Features["half-precision"] = true;
     Features["multimemory"] = true;
     Features["nontrapping-fptoint"] = true;
     Features["reference-types"] = true;
     Features["tail-call"] = true;
-    Features["half-precision"] = true;
-    setSIMDLevel(Features, SIMD128, true);
+    setSIMDLevel(Features, RelaxedSIMD, true);
   };
   if (CPU == "generic") {
     addGenericFeatures();
@@ -178,36 +180,20 @@ bool WebAssemblyTargetInfo::initFeatureMap(
 bool WebAssemblyTargetInfo::handleTargetFeatures(
     std::vector<std::string> &Features, DiagnosticsEngine &Diags) {
   for (const auto &Feature : Features) {
-    if (Feature == "+simd128") {
-      SIMDLevel = std::max(SIMDLevel, SIMD128);
-      continue;
-    }
-    if (Feature == "-simd128") {
-      SIMDLevel = std::min(SIMDLevel, SIMDEnum(SIMD128 - 1));
-      continue;
-    }
-    if (Feature == "+relaxed-simd") {
-      SIMDLevel = std::max(SIMDLevel, RelaxedSIMD);
-      continue;
-    }
-    if (Feature == "-relaxed-simd") {
-      SIMDLevel = std::min(SIMDLevel, SIMDEnum(RelaxedSIMD - 1));
-      continue;
-    }
-    if (Feature == "+nontrapping-fptoint") {
-      HasNontrappingFPToInt = true;
+    if (Feature == "+atomics") {
+      HasAtomics = true;
       continue;
     }
-    if (Feature == "-nontrapping-fptoint") {
-      HasNontrappingFPToInt = false;
+    if (Feature == "-atomics") {
+      HasAtomics = false;
       continue;
     }
-    if (Feature == "+sign-ext") {
-      HasSignExt = true;
+    if (Feature == "+bulk-memory") {
+      HasBulkMemory = true;
       continue;
     }
-    if (Feature == "-sign-ext") {
-      HasSignExt = false;
+    if (Feature == "-bulk-memory") {
+      HasBulkMemory = false;
       continue;
     }
     if (Feature == "+exception-handling") {
@@ -218,12 +204,12 @@ bool WebAssemblyTargetInfo::handleTargetFeatures(
       HasExceptionHandling = false;
       continue;
     }
-    if (Feature == "+bulk-memory") {
-      HasBulkMemory = true;
+    if (Feature == "+extended-const") {
+      HasExtendedConst = true;
       continue;
     }
-    if (Feature == "-bulk-memory") {
-      HasBulkMemory = false;
+    if (Feature == "-extended-const") {
+      HasExtendedConst = false;
       continue;
     }
     if (Feature == "+half-precision") {
@@ -235,20 +221,12 @@ bool WebAssemblyTargetInfo::handleTargetFeatures(
       HasHalfPrecision = false;
       continue;
     }
-    if (Feature == "+atomics") {
-      HasAtomics = true;
-      continue;
-    }
-    if (Feature == "-atomics") {
-      HasAtomics = false;
-      continue;
-    }
-    if (Feature == "+mutable-globals") {
-      HasMutableGlobals = true;
+    if (Feature == "+multimemory") {
+      HasMultiMemory = true;
       continue;
     }
-    if (Feature == "-mutable-globals") {
-      HasMutableGlobals = false;
+    if (Feature == "-multimemory") {
+      HasMultiMemory = false;
       continue;
     }
     if (Feature == "+multivalue") {
@@ -259,12 +237,20 @@ bool WebAssemblyTargetInfo::handleTargetFeatures(
       HasMultivalue = false;
       continue;
     }
-    if (Feature == "+tail-call") {
-      HasTailCall = true;
+    if (Feature == "+mutable-globals") {
+      HasMutableGlobals = true;
       continue;
     }
-    if (Feature == "-tail-call") {
-      HasTailCall = false;
+    if (Feature == "-mutable-globals") {
+      HasMutableGlobals = false;
+      continue;
+    }
+    if (Feature == "+nontrapping-fptoint") {
+      HasNontrappingFPToInt = true;
+      continue;
+    }
+    if (Feature == "-nontrapping-fptoint") {
+      HasNontrappingFPToInt = false;
       continue;
     }
     if (Feature == "+reference-types") {
@@ -275,20 +261,36 @@ bool WebAssemblyTargetInfo::handleTargetFeatures(
       HasReferenceTypes = false;
       continue;
     }
-    if (Feature == "+extended-const") {
-      HasExtendedConst = true;
+    if (Feature == "+relaxed-simd") {
+      SIMDLevel = std::max(SIMDLevel, RelaxedSIMD);
       continue;
     }
-    if (Feature == "-extended-const") {
-      HasExtendedConst = false;
+    if (Feature == "-relaxed-simd") {
+      SIMDLevel = std::min(SIMDLevel, SIMDEnum(RelaxedSIMD - 1));
       continue;
     }
-    if (Feature == "+multimemory") {
-      HasMultiMemory = true;
+    if (Feature == "+sign-ext") {
+      HasSignExt = true;
       continue;
     }
-    if (Feature == "-multimemory") {
-      HasMultiMemory = false;
+    if (Feature == "-sign-ext") {
+      HasSignExt = false;
+      continue;
+    }
+    if (Feature == "+simd128") {
+      SIMDLevel = std::max(SIMDLevel, SIMD128);
+      continue;
+    }
+    if (Feature == "-simd128") {
+      SIMDLevel = std::min(SIMDLevel, SIMDEnum(SIMD128 - 1));
+      continue;
+    }
+    if (Feature == "+tail-call") {
+      HasTailCall = true;
+      continue;
+    }
+    if (Feature == "-tail-call") {
+      HasTailCall = false;
       continue;
     }
 
diff --git a/clang/lib/Basic/Targets/WebAssembly.h b/clang/lib/Basic/Targets/WebAssembly.h
index e4c18879182e..4db97867df60 100644
--- a/clang/lib/Basic/Targets/WebAssembly.h
+++ b/clang/lib/Basic/Targets/WebAssembly.h
@@ -53,18 +53,18 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyTargetInfo : public TargetInfo {
     RelaxedSIMD,
   } SIMDLevel = NoSIMD;
 
-  bool HasNontrappingFPToInt = false;
-  bool HasSignExt = false;
-  bool HasExceptionHandling = false;
-  bool HasBulkMemory = false;
   bool HasAtomics = false;
-  bool HasMutableGlobals = false;
-  bool HasMultivalue = false;
-  bool HasTailCall = false;
-  bool HasReferenceTypes = false;
+  bool HasBulkMemory = false;
+  bool HasExceptionHandling = false;
   bool HasExtendedConst = false;
-  bool HasMultiMemory = false;
   bool HasHalfPrecision = false;
+  bool HasMultiMemory = false;
+  bool HasMultivalue = false;
+  bool HasMutableGlobals = false;
+  bool HasNontrappingFPToInt = false;
+  bool HasReferenceTypes = false;
+  bool HasSignExt = false;
+  bool HasTailCall = false;
 
   std::string ABI;
 
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index bf1767c87fe1..b823eaf6ce33 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -64,6 +64,8 @@ static const char *const GCCRegNames[] = {
     "dr0",   "dr1",   "dr2",   "dr3",   "dr6",     "dr7",
     "bnd0",  "bnd1",  "bnd2",  "bnd3",
     "tmm0",  "tmm1",  "tmm2",  "tmm3",  "tmm4",    "tmm5",  "tmm6",  "tmm7",
+    "r16",   "r17",   "r18",   "r19",   "r20",     "r21",   "r22",   "r23",
+    "r24",   "r25",   "r26",   "r27",   "r28",     "r29",   "r30",   "r31",
 };
 
 const TargetInfo::AddlRegName AddlRegNames[] = {
@@ -83,8 +85,23 @@ const TargetInfo::AddlRegName AddlRegNames[] = {
     {{"r13d", "r13w", "r13b"}, 43},
     {{"r14d", "r14w", "r14b"}, 44},
     {{"r15d", "r15w", "r15b"}, 45},
+    {{"r16d", "r16w", "r16b"}, 165},
+    {{"r17d", "r17w", "r17b"}, 166},
+    {{"r18d", "r18w", "r18b"}, 167},
+    {{"r19d", "r19w", "r19b"}, 168},
+    {{"r20d", "r20w", "r20b"}, 169},
+    {{"r21d", "r21w", "r21b"}, 170},
+    {{"r22d", "r22w", "r22b"}, 171},
+    {{"r23d", "r23w", "r23b"}, 172},
+    {{"r24d", "r24w", "r24b"}, 173},
+    {{"r25d", "r25w", "r25b"}, 174},
+    {{"r26d", "r26w", "r26b"}, 175},
+    {{"r27d", "r27w", "r27b"}, 176},
+    {{"r28d", "r28w", "r28b"}, 177},
+    {{"r29d", "r29w", "r29b"}, 178},
+    {{"r30d", "r30w", "r30b"}, 179},
+    {{"r31d", "r31w", "r31b"}, 180},
 };
-
 } // namespace targets
 } // namespace clang
 
@@ -441,6 +458,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasNDD = true;
     } else if (Feature == "+ccmp") {
       HasCCMP = true;
+    } else if (Feature == "+nf") {
+      HasNF = true;
     } else if (Feature == "+cf") {
       HasCF = true;
     }
@@ -952,6 +971,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__NDD__");
   if (HasCCMP)
     Builder.defineMacro("__CCMP__");
+  if (HasNF)
+    Builder.defineMacro("__NF__");
   if (HasCF)
     Builder.defineMacro("__CF__");
   // Condition here is aligned with the feature set of mapxf in Options.td
@@ -1157,6 +1178,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
       .Case("ppx", true)
       .Case("ndd", true)
       .Case("ccmp", true)
+      .Case("nf", true)
       .Case("cf", true)
       .Default(false);
 }
@@ -1279,6 +1301,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
       .Case("ppx", HasPPX)
       .Case("ndd", HasNDD)
       .Case("ccmp", HasCCMP)
+      .Case("nf", HasNF)
       .Case("cf", HasCF)
       .Default(false);
 }
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index c14e4d5f433d..6a0a6cb84203 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -173,6 +173,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
   bool HasPPX = false;
   bool HasNDD = false;
   bool HasCCMP = false;
+  bool HasNF = false;
   bool HasCF = false;
 
 protected:
@@ -218,7 +219,7 @@ public:
   ArrayRef<TargetInfo::AddlRegName> getGCCAddlRegNames() const override;
 
   bool isSPRegName(StringRef RegName) const override {
-    return RegName.equals("esp") || RegName.equals("rsp");
+    return RegName == "esp" || RegName == "rsp";
   }
 
   bool supportsCpuSupports() const override { return true; }
@@ -246,7 +247,7 @@ public:
                                       bool &HasSizeMismatch) const override {
     // esp and ebp are the only 32-bit registers the x86 backend can currently
     // handle.
-    if (RegName.equals("esp") || RegName.equals("ebp")) {
+    if (RegName == "esp" || RegName == "ebp") {
       // Check that the register size is 32-bit.
       HasSizeMismatch = RegSize != 32;
       return true;
@@ -802,7 +803,7 @@ public:
                                       bool &HasSizeMismatch) const override {
     // rsp and rbp are the only 64-bit registers the x86 backend can currently
     // handle.
-    if (RegName.equals("rsp") || RegName.equals("rbp")) {
+    if (RegName == "rsp" || RegName == "rbp") {
       // Check that the register size is 64-bit.
       HasSizeMismatch = RegSize != 64;
       return true;
diff --git a/clang/lib/CodeGen/BackendConsumer.h b/clang/lib/CodeGen/BackendConsumer.h
index fd0f1984d6c0..0fe9929dca2b 100644
--- a/clang/lib/CodeGen/BackendConsumer.h
+++ b/clang/lib/CodeGen/BackendConsumer.h
@@ -34,7 +34,6 @@ class BackendConsumer : public ASTConsumer {
   const CodeGenOptions &CodeGenOpts;
   const TargetOptions &TargetOpts;
   const LangOptions &LangOpts;
-  const FileManager &FileMgr;
   std::unique_ptr<raw_pwrite_stream> AsmOutStream;
   ASTContext *Context;
   IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS;
@@ -76,7 +75,7 @@ public:
                   const PreprocessorOptions &PPOpts,
                   const CodeGenOptions &CodeGenOpts,
                   const TargetOptions &TargetOpts, const LangOptions &LangOpts,
-                  const FileManager &FileMgr, const std::string &InFile,
+                  const std::string &InFile,
                   SmallVector<LinkModule, 4> LinkModules,
                   std::unique_ptr<raw_pwrite_stream> OS, llvm::LLVMContext &C,
                   CoverageSourceInfo *CoverageInfo = nullptr);
@@ -90,8 +89,8 @@ public:
                   const PreprocessorOptions &PPOpts,
                   const CodeGenOptions &CodeGenOpts,
                   const TargetOptions &TargetOpts, const LangOptions &LangOpts,
-                  const FileManager &FileMgr, llvm::Module *Module,
-                  SmallVector<LinkModule, 4> LinkModules, llvm::LLVMContext &C,
+                  llvm::Module *Module, SmallVector<LinkModule, 4> LinkModules,
+                  llvm::LLVMContext &C,
                   CoverageSourceInfo *CoverageInfo = nullptr);
 
   llvm::Module *getModule() const;
@@ -115,10 +114,6 @@ public:
   // Links each entry in LinkModules into our module.  Returns true on error.
   bool LinkInModules(llvm::Module *M, bool ShouldLinkFiles = true);
 
-  // Load a bitcode module from -mlink-builtin-bitcode option using
-  // methods from a BackendConsumer instead of CompilerInstance
-  bool ReloadModules(llvm::Module *M);
-
   /// Get the best possible source location to represent a diagnostic that
   /// may have associated debug info.
   const FullSourceLoc getBestLocationFromDebugLoc(
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 22c3f8642ad8..90985c08fe7f 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -120,11 +120,6 @@ static cl::opt<PGOOptions::ColdFuncOpt> ClPGOColdFuncAttr(
                           "Mark cold functions with optnone.")));
 
 extern cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate;
-
-// Re-link builtin bitcodes after optimization
-cl::opt<bool> ClRelinkBuiltinBitcodePostop(
-    "relink-builtin-bitcode-postop", cl::Optional,
-    cl::desc("Re-link builtin bitcodes after optimization."));
 } // namespace llvm
 
 namespace {
@@ -423,6 +418,7 @@ static bool initTargetOptions(DiagnosticsEngine &Diags,
   Options.UniqueSectionNames = CodeGenOpts.UniqueSectionNames;
   Options.UniqueBasicBlockSectionNames =
       CodeGenOpts.UniqueBasicBlockSectionNames;
+  Options.SeparateNamedSections = CodeGenOpts.SeparateNamedSections;
   Options.TLSSize = CodeGenOpts.TLSSize;
   Options.EnableTLSDESC = CodeGenOpts.EnableTLSDESC;
   Options.EmulatedTLS = CodeGenOpts.EmulatedTLS;
@@ -1054,11 +1050,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
     }
   }
 
-  // Re-link against any bitcodes supplied via the -mlink-builtin-bitcode option
-  // Some optimizations may generate new function calls that would not have
-  // been linked pre-optimization (i.e. fused sincos calls generated by
-  // AMDGPULibCalls::fold_sincos.)
-  if (ClRelinkBuiltinBitcodePostop)
+  // Link against bitcodes supplied via the -mlink-builtin-bitcode option
+  if (CodeGenOpts.LinkBitcodePostopt)
     MPM.addPass(LinkInModulesPass(BC, false));
 
   // Add a verifier pass if requested. We don't have to do this if the action
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a370734e00d3..f9ee93049b12 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -513,8 +513,8 @@ static Value *emitBinaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
 
+  CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
   if (CGF.Builder.getIsFPConstrained()) {
-    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType());
     return CGF.Builder.CreateConstrainedFPCall(F, { Src0, Src1 });
   } else {
@@ -530,8 +530,8 @@ static Value *emitBinaryExpMaybeConstrainedFPBuiltin(
   llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
 
+  CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
   if (CGF.Builder.getIsFPConstrained()) {
-    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
                                        {Src0->getType(), Src1->getType()});
     return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
@@ -551,8 +551,8 @@ static Value *emitTernaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
   llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
   llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
 
+  CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
   if (CGF.Builder.getIsFPConstrained()) {
-    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
     Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType());
     return CGF.Builder.CreateConstrainedFPCall(F, { Src0, Src1, Src2 });
   } else {
@@ -704,6 +704,7 @@ static Value *EmitSignBit(CodeGenFunction &CGF, Value *V) {
 
 static RValue emitLibraryCall(CodeGenFunction &CGF, const FunctionDecl *FD,
                               const CallExpr *E, llvm::Constant *calleeValue) {
+  CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
   CGCallee callee = CGCallee::forDirect(calleeValue, GlobalDecl(FD));
   return CGF.EmitCall(E->getCallee()->getType(), callee, E, ReturnValueSlot());
 }
@@ -2660,7 +2661,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     if (OP.hasMathErrnoOverride())
       ErrnoOverriden = OP.getMathErrnoOverride();
   }
-  // True if 'atttibute__((optnone)) is used. This attibute overrides
+  // True if 'attribute__((optnone))' is used. This attribute overrides
   // fast-math which implies math-errno.
   bool OptNone = CurFuncDecl && CurFuncDecl->hasAttr<OptimizeNoneAttr>();
 
@@ -3239,8 +3240,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         Builder.getInt1(HasFallback || getTarget().isCLZForZeroUndef());
     Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
     if (Result->getType() != ResultType)
-      Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
-                                     "cast");
+      Result =
+          Builder.CreateIntCast(Result, ResultType, /*isSigned*/ false, "cast");
     if (!HasFallback)
       return RValue::get(Result);
 
@@ -3271,8 +3272,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         Builder.getInt1(HasFallback || getTarget().isCLZForZeroUndef());
     Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
     if (Result->getType() != ResultType)
-      Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
-                                     "cast");
+      Result =
+          Builder.CreateIntCast(Result, ResultType, /*isSigned*/ false, "cast");
     if (!HasFallback)
       return RValue::get(Result);
 
@@ -3351,8 +3352,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     llvm::Type *ResultType = ConvertType(E->getType());
     Value *Result = Builder.CreateCall(F, ArgValue);
     if (Result->getType() != ResultType)
-      Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
-                                     "cast");
+      Result =
+          Builder.CreateIntCast(Result, ResultType, /*isSigned*/ false, "cast");
     return RValue::get(Result);
   }
   case Builtin::BI__builtin_unpredictable: {
@@ -3821,7 +3822,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__builtin_elementwise_sin:
     return RValue::get(
         emitUnaryBuiltin(*this, E, llvm::Intrinsic::sin, "elt.sin"));
-
+  case Builtin::BI__builtin_elementwise_tan:
+    return RValue::get(
+        emitUnaryBuiltin(*this, E, llvm::Intrinsic::tan, "elt.tan"));
   case Builtin::BI__builtin_elementwise_trunc:
     return RValue::get(
         emitUnaryBuiltin(*this, E, llvm::Intrinsic::trunc, "elt.trunc"));
@@ -16783,7 +16786,7 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
 
 #include "llvm/TargetParser/PPCTargetParser.def"
   auto GenAIXPPCBuiltinCpuExpr = [&](unsigned SupportMethod, unsigned FieldIdx,
-                                     unsigned CompOp,
+                                     unsigned Mask, CmpInst::Predicate CompOp,
                                      unsigned OpValue) -> Value * {
     if (SupportMethod == AIX_BUILTIN_PPC_FALSE)
       return llvm::ConstantInt::getFalse(ConvertType(E->getType()));
@@ -16791,24 +16794,45 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     if (SupportMethod == AIX_BUILTIN_PPC_TRUE)
       return llvm::ConstantInt::getTrue(ConvertType(E->getType()));
 
-    assert(SupportMethod <= USE_SYS_CONF && "Invalid value for SupportMethod.");
-    assert((CompOp == COMP_EQ) && "Only equal comparisons are supported.");
+    assert(SupportMethod <= SYS_CALL && "Invalid value for SupportMethod.");
+
+    llvm::Value *FieldValue = nullptr;
+    if (SupportMethod == USE_SYS_CONF) {
+      llvm::Type *STy = llvm::StructType::get(PPC_SYSTEMCONFIG_TYPE);
+      llvm::Constant *SysConf =
+          CGM.CreateRuntimeVariable(STy, "_system_configuration");
+
+      // Grab the appropriate field from _system_configuration.
+      llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0),
+                             ConstantInt::get(Int32Ty, FieldIdx)};
+
+      FieldValue = Builder.CreateGEP(STy, SysConf, Idxs);
+      FieldValue = Builder.CreateAlignedLoad(Int32Ty, FieldValue,
+                                             CharUnits::fromQuantity(4));
+    } else if (SupportMethod == SYS_CALL) {
+      llvm::FunctionType *FTy =
+          llvm::FunctionType::get(Int64Ty, Int32Ty, false);
+      llvm::FunctionCallee Func =
+          CGM.CreateRuntimeFunction(FTy, "getsystemcfg");
+
+      FieldValue =
+          Builder.CreateCall(Func, {ConstantInt::get(Int32Ty, FieldIdx)});
+    }
+    assert(FieldValue &&
+           "SupportMethod value is not defined in PPCTargetParser.def.");
 
-    llvm::Type *STy = llvm::StructType::get(PPC_SYSTEMCONFIG_TYPE);
-    llvm::Constant *SysConf =
-        CGM.CreateRuntimeVariable(STy, "_system_configuration");
+    if (Mask)
+      FieldValue = Builder.CreateAnd(FieldValue, Mask);
 
-    // Grab the appropriate field from _system_configuration.
-    llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0),
-                           ConstantInt::get(Int32Ty, FieldIdx)};
+    llvm::Type *ValueType = FieldValue->getType();
+    bool IsValueType64Bit = ValueType->isIntegerTy(64);
+    assert(
+        (IsValueType64Bit || ValueType->isIntegerTy(32)) &&
+        "Only 32/64-bit integers are supported in GenAIXPPCBuiltinCpuExpr().");
 
-    llvm::Value *FieldValue = Builder.CreateGEP(STy, SysConf, Idxs);
-    FieldValue = Builder.CreateAlignedLoad(Int32Ty, FieldValue,
-                                           CharUnits::fromQuantity(4));
-    assert(FieldValue->getType()->isIntegerTy(32) &&
-           "Only 32-bit integers are supported in GenAIXPPCBuiltinCpuExpr().");
-    return Builder.CreateICmp(ICmpInst::ICMP_EQ, FieldValue,
-                              ConstantInt::get(Int32Ty, OpValue));
+    return Builder.CreateICmp(
+        CompOp, FieldValue,
+        ConstantInt::get(IsValueType64Bit ? Int64Ty : Int32Ty, OpValue));
   };
 
   switch (BuiltinID) {
@@ -16820,15 +16844,18 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     llvm::Triple Triple = getTarget().getTriple();
 
     if (Triple.isOSAIX()) {
-      unsigned IsCpuSupport, FieldIdx, CompareOp, CpuIdValue;
-      typedef std::tuple<unsigned, unsigned, unsigned, unsigned> CPUType;
-      std::tie(IsCpuSupport, FieldIdx, CompareOp, CpuIdValue) =
+      unsigned SupportMethod, FieldIdx, CpuIdValue;
+      CmpInst::Predicate CompareOp;
+      typedef std::tuple<unsigned, unsigned, CmpInst::Predicate, unsigned>
+          CPUType;
+      std::tie(SupportMethod, FieldIdx, CompareOp, CpuIdValue) =
           static_cast<CPUType>(StringSwitch<CPUType>(CPUStr)
-#define PPC_AIX_CPU(NAME, SUPPORT_MAGIC, INDEX, COMPARE_OP, VALUE)             \
-  .Case(NAME, {SUPPORT_MAGIC, INDEX, COMPARE_OP, VALUE})
+#define PPC_AIX_CPU(NAME, SUPPORT_METHOD, INDEX, COMPARE_OP, VALUE)            \
+  .Case(NAME, {SUPPORT_METHOD, INDEX, COMPARE_OP, VALUE})
 #include "llvm/TargetParser/PPCTargetParser.def"
-          );
-      return GenAIXPPCBuiltinCpuExpr(IsCpuSupport, FieldIdx, CompareOp,
+                                   .Default({AIX_BUILTIN_PPC_FALSE, 0,
+                                             CmpInst::Predicate(), 0}));
+      return GenAIXPPCBuiltinCpuExpr(SupportMethod, FieldIdx, 0, CompareOp,
                                      CpuIdValue);
     }
 
@@ -16846,10 +16873,31 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
                                 llvm::ConstantInt::get(Int32Ty, NumCPUID));
   }
   case Builtin::BI__builtin_cpu_supports: {
-    unsigned FeatureWord;
-    unsigned BitMask;
+    llvm::Triple Triple = getTarget().getTriple();
     const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
     StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString();
+    if (Triple.isOSAIX()) {
+      unsigned SupportMethod, FieldIdx, Mask, Value;
+      CmpInst::Predicate CompOp;
+      typedef std::tuple<unsigned, unsigned, unsigned, CmpInst::Predicate,
+                         unsigned>
+          CPUSupportType;
+      std::tie(SupportMethod, FieldIdx, Mask, CompOp, Value) =
+          static_cast<CPUSupportType>(StringSwitch<CPUSupportType>(CPUStr)
+#define PPC_AIX_FEATURE(NAME, DESC, SUPPORT_METHOD, INDEX, MASK, COMP_OP,      \
+                        VALUE)                                                 \
+  .Case(NAME, {SUPPORT_METHOD, INDEX, MASK, COMP_OP, VALUE})
+#include "llvm/TargetParser/PPCTargetParser.def"
+                                          .Default({AIX_BUILTIN_PPC_FALSE, 0, 0,
+                                                    CmpInst::Predicate(), 0}));
+      return GenAIXPPCBuiltinCpuExpr(SupportMethod, FieldIdx, Mask, CompOp,
+                                     Value);
+    }
+
+    assert(Triple.isOSLinux() &&
+           "__builtin_cpu_supports() is only supported for AIX and Linux.");
+    unsigned FeatureWord;
+    unsigned BitMask;
     std::tie(FeatureWord, BitMask) =
         StringSwitch<std::pair<unsigned, unsigned>>(CPUStr)
 #define PPC_LNX_FEATURE(Name, Description, EnumName, Bitmask, FA_WORD)         \
@@ -21257,6 +21305,17 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
         CGM.getIntrinsic(Intrinsic::wasm_relaxed_dot_bf16x8_add_f32);
     return Builder.CreateCall(Callee, {LHS, RHS, Acc});
   }
+  case WebAssembly::BI__builtin_wasm_loadf16_f32: {
+    Value *Addr = EmitScalarExpr(E->getArg(0));
+    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_loadf16_f32);
+    return Builder.CreateCall(Callee, {Addr});
+  }
+  case WebAssembly::BI__builtin_wasm_storef16_f32: {
+    Value *Val = EmitScalarExpr(E->getArg(0));
+    Value *Addr = EmitScalarExpr(E->getArg(1));
+    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_storef16_f32);
+    return Builder.CreateCall(Callee, {Val, Addr});
+  }
   case WebAssembly::BI__builtin_wasm_table_get: {
     assert(E->getArg(0)->getType()->isArrayType());
     Value *Table = EmitArrayToPointerDecay(E->getArg(0)).emitRawPointer(*this);
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 69548902dc43..0c7eef59db53 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5050,13 +5050,14 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
         (TargetDecl->hasAttr<TargetAttr>() ||
          (CurFuncDecl && CurFuncDecl->hasAttr<TargetAttr>())))
       checkTargetFeatures(Loc, FD);
-
-    // Some architectures (such as x86-64) have the ABI changed based on
-    // attribute-target/features. Give them a chance to diagnose.
-    CGM.getTargetCodeGenInfo().checkFunctionCallABI(
-        CGM, Loc, dyn_cast_or_null<FunctionDecl>(CurCodeDecl), FD, CallArgs);
   }
 
+  // Some architectures (such as x86-64) have the ABI changed based on
+  // attribute-target/features. Give them a chance to diagnose.
+  CGM.getTargetCodeGenInfo().checkFunctionCallABI(
+      CGM, Loc, dyn_cast_or_null<FunctionDecl>(CurCodeDecl),
+      dyn_cast_or_null<FunctionDecl>(TargetDecl), CallArgs, RetTy);
+
   // 1. Set up the arguments.
 
   // If we're using inalloca, insert the allocation after the stack save.
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 44d476976a55..6172eb9cdc1b 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -537,9 +537,12 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
       elementType.isTriviallyCopyableType(CGF.getContext())) {
     CodeGen::CodeGenModule &CGM = CGF.CGM;
     ConstantEmitter Emitter(CGF);
-    LangAS AS = ArrayQTy.getAddressSpace();
+    QualType GVArrayQTy = CGM.getContext().getAddrSpaceQualType(
+        CGM.getContext().removeAddrSpaceQualType(ArrayQTy),
+        CGM.GetGlobalConstantAddressSpace());
+    LangAS AS = GVArrayQTy.getAddressSpace();
     if (llvm::Constant *C =
-            Emitter.tryEmitForInitializer(ExprToVisit, AS, ArrayQTy)) {
+            Emitter.tryEmitForInitializer(ExprToVisit, AS, GVArrayQTy)) {
       auto GV = new llvm::GlobalVariable(
           CGM.getModule(), C->getType(),
           /* isConstant= */ true, llvm::GlobalValue::PrivateLinkage, C,
@@ -547,10 +550,10 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
           /* InsertBefore= */ nullptr, llvm::GlobalVariable::NotThreadLocal,
           CGM.getContext().getTargetAddressSpace(AS));
       Emitter.finalize(GV);
-      CharUnits Align = CGM.getContext().getTypeAlignInChars(ArrayQTy);
+      CharUnits Align = CGM.getContext().getTypeAlignInChars(GVArrayQTy);
       GV->setAlignment(Align.getAsAlign());
       Address GVAddr(GV, GV->getValueType(), Align);
-      EmitFinalDestCopy(ArrayQTy, CGF.MakeAddrLValue(GVAddr, ArrayQTy));
+      EmitFinalDestCopy(ArrayQTy, CGF.MakeAddrLValue(GVAddr, GVArrayQTy));
       return;
     }
   }
@@ -1733,7 +1736,7 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
       for (const auto *Field : record->fields())
         assert(
             (Field->isUnnamedBitField() || Field->isAnonymousStructOrUnion()) &&
-            "Only unnamed bitfields or ananymous class allowed");
+            "Only unnamed bitfields or anonymous class allowed");
 #endif
       return;
     }
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index af48e8d2b839..d84531959b50 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2801,6 +2801,19 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV,
                                   llvm::AtomicOrdering::SequentiallyConsistent);
       return isPre ? Builder.CreateBinOp(op, old, amt) : old;
     }
+    // Special case for atomic increment/decrement on floats
+    if (type->isFloatingType()) {
+      llvm::AtomicRMWInst::BinOp aop =
+          isInc ? llvm::AtomicRMWInst::FAdd : llvm::AtomicRMWInst::FSub;
+      llvm::Instruction::BinaryOps op =
+          isInc ? llvm::Instruction::FAdd : llvm::Instruction::FSub;
+      llvm::Value *amt = llvm::ConstantFP::get(
+          VMContext, llvm::APFloat(static_cast<float>(1.0)));
+      llvm::Value *old =
+          Builder.CreateAtomicRMW(aop, LV.getAddress(CGF), amt,
+                                  llvm::AtomicOrdering::SequentiallyConsistent);
+      return isPre ? Builder.CreateBinOp(op, old, amt) : old;
+    }
     value = EmitLoadOfLValue(LV, E->getExprLoc());
     input = value;
     // For every other atomic operation, we need to emit a load-op-cmpxchg loop
diff --git a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp
index 868b1ab98e04..5169be204c14 100644
--- a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp
+++ b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp
@@ -75,7 +75,7 @@ struct CGRecordLowering {
   // sentinel member type that ensures correct rounding.
   struct MemberInfo {
     CharUnits Offset;
-    enum InfoKind { VFPtr, VBPtr, Field, Base, VBase, Scissor } Kind;
+    enum InfoKind { VFPtr, VBPtr, Field, Base, VBase } Kind;
     llvm::Type *Data;
     union {
       const FieldDecl *FD;
@@ -197,7 +197,7 @@ struct CGRecordLowering {
                      const CXXRecordDecl *Query) const;
   void calculateZeroInit();
   CharUnits calculateTailClippingOffset(bool isNonVirtualBaseType) const;
-  void checkBitfieldClipping() const;
+  void checkBitfieldClipping(bool isNonVirtualBaseType) const;
   /// Determines if we need a packed llvm struct.
   void determinePacked(bool NVBaseType);
   /// Inserts padding everywhere it's needed.
@@ -299,8 +299,8 @@ void CGRecordLowering::lower(bool NVBaseType) {
       accumulateVBases();
   }
   llvm::stable_sort(Members);
+  checkBitfieldClipping(NVBaseType);
   Members.push_back(StorageInfo(Size, getIntNType(8)));
-  checkBitfieldClipping();
   determinePacked(NVBaseType);
   insertPadding();
   Members.pop_back();
@@ -894,8 +894,6 @@ CGRecordLowering::calculateTailClippingOffset(bool isNonVirtualBaseType) const {
 }
 
 void CGRecordLowering::accumulateVBases() {
-  Members.push_back(MemberInfo(calculateTailClippingOffset(false),
-                               MemberInfo::Scissor, nullptr, RD));
   for (const auto &Base : RD->vbases()) {
     const CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl();
     if (BaseDecl->isEmpty())
@@ -950,18 +948,19 @@ void CGRecordLowering::calculateZeroInit() {
 }
 
 // Verify accumulateBitfields computed the correct storage representations.
-void CGRecordLowering::checkBitfieldClipping() const {
+void CGRecordLowering::checkBitfieldClipping(bool IsNonVirtualBaseType) const {
 #ifndef NDEBUG
+  auto ScissorOffset = calculateTailClippingOffset(IsNonVirtualBaseType);
   auto Tail = CharUnits::Zero();
   for (const auto &M : Members) {
-    // Only members with data and the scissor can cut into tail padding.
-    if (!M.Data && M.Kind != MemberInfo::Scissor)
+    // Only members with data could possibly overlap.
+    if (!M.Data)
       continue;
 
     assert(M.Offset >= Tail && "Bitfield access unit is not clipped");
-    Tail = M.Offset;
-    if (M.Data)
-      Tail += getSize(M.Data);
+    Tail = M.Offset + getSize(M.Data);
+    assert((Tail <= ScissorOffset || M.Offset >= ScissorOffset) &&
+           "Bitfield straddles scissor offset");
   }
 #endif
 }
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 576fe2f7a2d4..479945e3b4cb 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -908,6 +908,69 @@ void CodeGenFunction::EmitIfStmt(const IfStmt &S) {
     incrementProfileCounter(&S);
 }
 
+bool CodeGenFunction::checkIfLoopMustProgress(const Expr *ControllingExpression,
+                                              bool HasEmptyBody) {
+  if (CGM.getCodeGenOpts().getFiniteLoops() ==
+      CodeGenOptions::FiniteLoopsKind::Never)
+    return false;
+
+  // Now apply rules for plain C (see  6.8.5.6 in C11).
+  // Loops with constant conditions do not have to make progress in any C
+  // version.
+  // As an extension, we consisider loops whose constant expression
+  // can be constant-folded.
+  Expr::EvalResult Result;
+  bool CondIsConstInt =
+      !ControllingExpression ||
+      (ControllingExpression->EvaluateAsInt(Result, getContext()) &&
+       Result.Val.isInt());
+
+  bool CondIsTrue = CondIsConstInt && (!ControllingExpression ||
+                                       Result.Val.getInt().getBoolValue());
+
+  // Loops with non-constant conditions must make progress in C11 and later.
+  if (getLangOpts().C11 && !CondIsConstInt)
+    return true;
+
+  // [C++26][intro.progress] (DR)
+  // The implementation may assume that any thread will eventually do one of the
+  // following:
+  // [...]
+  // - continue execution of a trivial infinite loop ([stmt.iter.general]).
+  if (CGM.getCodeGenOpts().getFiniteLoops() ==
+          CodeGenOptions::FiniteLoopsKind::Always ||
+      getLangOpts().CPlusPlus11) {
+    if (HasEmptyBody && CondIsTrue) {
+      CurFn->removeFnAttr(llvm::Attribute::MustProgress);
+      return false;
+    }
+    return true;
+  }
+  return false;
+}
+
+// [C++26][stmt.iter.general] (DR)
+// A trivially empty iteration statement is an iteration statement matching one
+// of the following forms:
+//  - while ( expression ) ;
+//  - while ( expression ) { }
+//  - do ; while ( expression ) ;
+//  - do { } while ( expression ) ;
+//  - for ( init-statement expression(opt); ) ;
+//  - for ( init-statement expression(opt); ) { }
+template <typename LoopStmt> static bool hasEmptyLoopBody(const LoopStmt &S) {
+  if constexpr (std::is_same_v<LoopStmt, ForStmt>) {
+    if (S.getInc())
+      return false;
+  }
+  const Stmt *Body = S.getBody();
+  if (!Body || isa<NullStmt>(Body))
+    return true;
+  if (const CompoundStmt *Compound = dyn_cast<CompoundStmt>(Body))
+    return Compound->body_empty();
+  return false;
+}
+
 void CodeGenFunction::EmitWhileStmt(const WhileStmt &S,
                                     ArrayRef<const Attr *> WhileAttrs) {
   // Emit the header for the loop, which will also become
@@ -942,13 +1005,12 @@ void CodeGenFunction::EmitWhileStmt(const WhileStmt &S,
   // while(1) is common, avoid extra exit blocks.  Be sure
   // to correctly handle break/continue though.
   llvm::ConstantInt *C = dyn_cast<llvm::ConstantInt>(BoolCondVal);
-  bool CondIsConstInt = C != nullptr;
-  bool EmitBoolCondBranch = !CondIsConstInt || !C->isOne();
+  bool EmitBoolCondBranch = !C || !C->isOne();
   const SourceRange &R = S.getSourceRange();
   LoopStack.push(LoopHeader.getBlock(), CGM.getContext(), CGM.getCodeGenOpts(),
                  WhileAttrs, SourceLocToDebugLoc(R.getBegin()),
                  SourceLocToDebugLoc(R.getEnd()),
-                 checkIfLoopMustProgress(CondIsConstInt));
+                 checkIfLoopMustProgress(S.getCond(), hasEmptyLoopBody(S)));
 
   // When single byte coverage mode is enabled, add a counter to loop condition.
   if (llvm::EnableSingleByteCoverage)
@@ -1059,14 +1121,13 @@ void CodeGenFunction::EmitDoStmt(const DoStmt &S,
   // "do {} while (0)" is common in macros, avoid extra blocks.  Be sure
   // to correctly handle break/continue though.
   llvm::ConstantInt *C = dyn_cast<llvm::ConstantInt>(BoolCondVal);
-  bool CondIsConstInt = C;
   bool EmitBoolCondBranch = !C || !C->isZero();
 
   const SourceRange &R = S.getSourceRange();
   LoopStack.push(LoopBody, CGM.getContext(), CGM.getCodeGenOpts(), DoAttrs,
                  SourceLocToDebugLoc(R.getBegin()),
                  SourceLocToDebugLoc(R.getEnd()),
-                 checkIfLoopMustProgress(CondIsConstInt));
+                 checkIfLoopMustProgress(S.getCond(), hasEmptyLoopBody(S)));
 
   // As long as the condition is true, iterate the loop.
   if (EmitBoolCondBranch) {
@@ -1109,15 +1170,11 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S,
   llvm::BasicBlock *CondBlock = CondDest.getBlock();
   EmitBlock(CondBlock);
 
-  Expr::EvalResult Result;
-  bool CondIsConstInt =
-      !S.getCond() || S.getCond()->EvaluateAsInt(Result, getContext());
-
   const SourceRange &R = S.getSourceRange();
   LoopStack.push(CondBlock, CGM.getContext(), CGM.getCodeGenOpts(), ForAttrs,
                  SourceLocToDebugLoc(R.getBegin()),
                  SourceLocToDebugLoc(R.getEnd()),
-                 checkIfLoopMustProgress(CondIsConstInt));
+                 checkIfLoopMustProgress(S.getCond(), hasEmptyLoopBody(S)));
 
   // Create a cleanup scope for the condition variable cleanups.
   LexicalScope ConditionScope(*this, S.getSourceRange());
diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp
index 862369ae009f..8d9c22546b42 100644
--- a/clang/lib/CodeGen/CGVTables.cpp
+++ b/clang/lib/CodeGen/CGVTables.cpp
@@ -131,6 +131,12 @@ static void resolveTopLevelMetadata(llvm::Function *Fn,
   // they are referencing.
   for (auto &BB : *Fn) {
     for (auto &I : BB) {
+      for (llvm::DbgVariableRecord &DVR :
+           llvm::filterDbgVars(I.getDbgRecordRange())) {
+        auto *DILocal = DVR.getVariable();
+        if (!DILocal->isResolved())
+          DILocal->resolve();
+      }
       if (auto *DII = dyn_cast<llvm::DbgVariableIntrinsic>(&I)) {
         auto *DILocal = DII->getVariable();
         if (!DILocal->isResolved())
diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp
index 1a6b628016f7..6d3efdb5ffe3 100644
--- a/clang/lib/CodeGen/CodeGenAction.cpp
+++ b/clang/lib/CodeGen/CodeGenAction.cpp
@@ -60,10 +60,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "codegenaction"
 
-namespace llvm {
-extern cl::opt<bool> ClRelinkBuiltinBitcodePostop;
-}
-
 namespace clang {
 class BackendConsumer;
 class ClangDiagnosticHandler final : public DiagnosticHandler {
@@ -118,13 +114,12 @@ BackendConsumer::BackendConsumer(
     const HeaderSearchOptions &HeaderSearchOpts,
     const PreprocessorOptions &PPOpts, const CodeGenOptions &CodeGenOpts,
     const TargetOptions &TargetOpts, const LangOptions &LangOpts,
-    const FileManager &FileMgr, const std::string &InFile,
-    SmallVector<LinkModule, 4> LinkModules,
+    const std::string &InFile, SmallVector<LinkModule, 4> LinkModules,
     std::unique_ptr<raw_pwrite_stream> OS, LLVMContext &C,
     CoverageSourceInfo *CoverageInfo)
     : Diags(Diags), Action(Action), HeaderSearchOpts(HeaderSearchOpts),
       CodeGenOpts(CodeGenOpts), TargetOpts(TargetOpts), LangOpts(LangOpts),
-      FileMgr(FileMgr), AsmOutStream(std::move(OS)), Context(nullptr), FS(VFS),
+      AsmOutStream(std::move(OS)), Context(nullptr), FS(VFS),
       LLVMIRGeneration("irgen", "LLVM IR Generation Time"),
       LLVMIRGenerationRefCount(0),
       Gen(CreateLLVMCodeGen(Diags, InFile, std::move(VFS), HeaderSearchOpts,
@@ -144,12 +139,11 @@ BackendConsumer::BackendConsumer(
     const HeaderSearchOptions &HeaderSearchOpts,
     const PreprocessorOptions &PPOpts, const CodeGenOptions &CodeGenOpts,
     const TargetOptions &TargetOpts, const LangOptions &LangOpts,
-    const FileManager &FileMgr, llvm::Module *Module,
-    SmallVector<LinkModule, 4> LinkModules, LLVMContext &C,
-    CoverageSourceInfo *CoverageInfo)
+    llvm::Module *Module, SmallVector<LinkModule, 4> LinkModules,
+    LLVMContext &C, CoverageSourceInfo *CoverageInfo)
     : Diags(Diags), Action(Action), HeaderSearchOpts(HeaderSearchOpts),
       CodeGenOpts(CodeGenOpts), TargetOpts(TargetOpts), LangOpts(LangOpts),
-      FileMgr(FileMgr), Context(nullptr), FS(VFS),
+      Context(nullptr), FS(VFS),
       LLVMIRGeneration("irgen", "LLVM IR Generation Time"),
       LLVMIRGenerationRefCount(0),
       Gen(CreateLLVMCodeGen(Diags, "", std::move(VFS), HeaderSearchOpts, PPOpts,
@@ -232,35 +226,6 @@ void BackendConsumer::HandleInterestingDecl(DeclGroupRef D) {
     HandleTopLevelDecl(D);
 }
 
-bool BackendConsumer::ReloadModules(llvm::Module *M) {
-  for (const CodeGenOptions::BitcodeFileToLink &F :
-       CodeGenOpts.LinkBitcodeFiles) {
-    auto BCBuf = FileMgr.getBufferForFile(F.Filename);
-    if (!BCBuf) {
-      Diags.Report(diag::err_cannot_open_file)
-          << F.Filename << BCBuf.getError().message();
-      LinkModules.clear();
-      return true;
-    }
-
-    LLVMContext &Ctx = getModule()->getContext();
-    Expected<std::unique_ptr<llvm::Module>> ModuleOrErr =
-        getOwningLazyBitcodeModule(std::move(*BCBuf), Ctx);
-
-    if (!ModuleOrErr) {
-      handleAllErrors(ModuleOrErr.takeError(), [&](ErrorInfoBase &EIB) {
-        Diags.Report(diag::err_cannot_open_file) << F.Filename << EIB.message();
-      });
-      LinkModules.clear();
-      return true;
-    }
-    LinkModules.push_back({std::move(ModuleOrErr.get()), F.PropagateAttrs,
-                           F.Internalize, F.LinkFlags});
-  }
-
-  return false; // success
-}
-
 // Links each entry in LinkModules into our module.  Returns true on error.
 bool BackendConsumer::LinkInModules(llvm::Module *M, bool ShouldLinkFiles) {
   for (auto &LM : LinkModules) {
@@ -362,7 +327,7 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) {
   }
 
   // Link each LinkModule into our module.
-  if (LinkInModules(getModule()))
+  if (!CodeGenOpts.LinkBitcodePostopt && LinkInModules(getModule()))
     return;
 
   for (auto &F : getModule()->functions()) {
@@ -1055,9 +1020,8 @@ CodeGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) {
   std::unique_ptr<BackendConsumer> Result(new BackendConsumer(
       BA, CI.getDiagnostics(), &CI.getVirtualFileSystem(),
       CI.getHeaderSearchOpts(), CI.getPreprocessorOpts(), CI.getCodeGenOpts(),
-      CI.getTargetOpts(), CI.getLangOpts(), CI.getFileManager(),
-      std::string(InFile), std::move(LinkModules), std::move(OS), *VMContext,
-      CoverageInfo));
+      CI.getTargetOpts(), CI.getLangOpts(), std::string(InFile),
+      std::move(LinkModules), std::move(OS), *VMContext, CoverageInfo));
   BEConsumer = Result.get();
 
   // Enable generating macro debug info only when debug info is not disabled and
@@ -1228,11 +1192,11 @@ void CodeGenAction::ExecuteAction() {
   BackendConsumer Result(BA, CI.getDiagnostics(), &CI.getVirtualFileSystem(),
                          CI.getHeaderSearchOpts(), CI.getPreprocessorOpts(),
                          CI.getCodeGenOpts(), CI.getTargetOpts(),
-                         CI.getLangOpts(), CI.getFileManager(), TheModule.get(),
+                         CI.getLangOpts(), TheModule.get(),
                          std::move(LinkModules), *VMContext, nullptr);
 
   // Link in each pending link module.
-  if (Result.LinkInModules(&*TheModule))
+  if (!CodeGenOpts.LinkBitcodePostopt && Result.LinkInModules(&*TheModule))
     return;
 
   // PR44896: Force DiscardValueNames as false. DiscardValueNames cannot be
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 87766a758311..9f16fcb43855 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -1471,6 +1471,8 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
 
   // Ensure that the function adheres to the forward progress guarantee, which
   // is required by certain optimizations.
+  // In C++11 and up, the attribute will be removed if the body contains a
+  // trivial empty loop.
   if (checkIfFunctionMustProgress())
     CurFn->addFnAttr(llvm::Attribute::MustProgress);
 
@@ -2759,8 +2761,13 @@ llvm::Value *CodeGenFunction::FormAArch64ResolverCondition(
     const MultiVersionResolverOption &RO) {
   llvm::SmallVector<StringRef, 8> CondFeatures;
   for (const StringRef &Feature : RO.Conditions.Features) {
-    // Form condition for features which are not yet enabled in target
-    if (!getContext().getTargetInfo().hasFeature(Feature))
+    // Optimize the Function Multi Versioning resolver by creating conditions
+    // only for features that are not enabled in the target. The exception is
+    // for features whose extension instructions are executed as NOP on targets
+    // without extension support.
+    if (!getContext().getTargetInfo().hasFeature(Feature) || Feature == "bti" ||
+        Feature == "memtag" || Feature == "memtag2" || Feature == "memtag3" ||
+        Feature == "dgh")
       CondFeatures.push_back(Feature);
   }
   if (!CondFeatures.empty()) {
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 6e7417fc7f52..e1e687af6a78 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -636,28 +636,7 @@ public:
   /// Returns true if a loop must make progress, which means the mustprogress
   /// attribute can be added. \p HasConstantCond indicates whether the branch
   /// condition is a known constant.
-  bool checkIfLoopMustProgress(bool HasConstantCond) {
-    if (CGM.getCodeGenOpts().getFiniteLoops() ==
-        CodeGenOptions::FiniteLoopsKind::Always)
-      return true;
-    if (CGM.getCodeGenOpts().getFiniteLoops() ==
-        CodeGenOptions::FiniteLoopsKind::Never)
-      return false;
-
-    // If the containing function must make progress, loops also must make
-    // progress (as in C++11 and later).
-    if (checkIfFunctionMustProgress())
-      return true;
-
-    // Now apply rules for plain C (see  6.8.5.6 in C11).
-    // Loops with constant conditions do not have to make progress in any C
-    // version.
-    if (HasConstantCond)
-      return false;
-
-    // Loops with non-constant conditions must make progress in C11 and later.
-    return getLangOpts().C11;
-  }
+  bool checkIfLoopMustProgress(const Expr *, bool HasEmptyBody);
 
   const CodeGen::CGBlockInfo *BlockInfo = nullptr;
   llvm::Value *BlockPointer = nullptr;
@@ -1648,8 +1627,10 @@ public:
   void incrementProfileCounter(const Stmt *S, llvm::Value *StepV = nullptr) {
     if (CGM.getCodeGenOpts().hasProfileClangInstr() &&
         !CurFn->hasFnAttribute(llvm::Attribute::NoProfile) &&
-        !CurFn->hasFnAttribute(llvm::Attribute::SkipProfile))
+        !CurFn->hasFnAttribute(llvm::Attribute::SkipProfile)) {
+      auto AL = ApplyDebugLocation::CreateArtificial(*this);
       PGO.emitCounterSetOrIncrement(Builder, S, StepV);
+    }
     PGO.setCurrentStmt(S);
   }
 
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index c8898ce196c1..489c08a4d481 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -53,6 +53,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/BinaryFormat/ELF.h"
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/IR/AttributeMask.h"
 #include "llvm/IR/CallingConv.h"
@@ -1190,6 +1191,37 @@ void CodeGenModule::Release() {
     if (!LangOpts.isSignReturnAddressWithAKey())
       getModule().addModuleFlag(llvm::Module::Min,
                                 "sign-return-address-with-bkey", 1);
+
+    if (getTriple().isOSLinux()) {
+      assert(getTriple().isOSBinFormatELF());
+      using namespace llvm::ELF;
+      uint64_t PAuthABIVersion =
+          (LangOpts.PointerAuthIntrinsics
+           << AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INTRINSICS) |
+          (LangOpts.PointerAuthCalls
+           << AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_CALLS) |
+          (LangOpts.PointerAuthReturns
+           << AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_RETURNS) |
+          (LangOpts.PointerAuthAuthTraps
+           << AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_AUTHTRAPS) |
+          (LangOpts.PointerAuthVTPtrAddressDiscrimination
+           << AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_VPTRADDRDISCR) |
+          (LangOpts.PointerAuthVTPtrTypeDiscrimination
+           << AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_VPTRTYPEDISCR) |
+          (LangOpts.PointerAuthInitFini
+           << AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INITFINI);
+      static_assert(AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INITFINI ==
+                        AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_LAST,
+                    "Update when new enum items are defined");
+      if (PAuthABIVersion != 0) {
+        getModule().addModuleFlag(llvm::Module::Error,
+                                  "aarch64-elf-pauthabi-platform",
+                                  AARCH64_PAUTH_PLATFORM_LLVM_LINUX);
+        getModule().addModuleFlag(llvm::Module::Error,
+                                  "aarch64-elf-pauthabi-version",
+                                  PAuthABIVersion);
+      }
+    }
   }
 
   if (CodeGenOpts.StackClashProtector)
diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index 733686d4946b..ce2f39aeb082 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -191,10 +191,7 @@ public:
   bool isBranch() const { return FalseCount.has_value(); }
 
   bool isMCDCDecision() const {
-    const auto *DecisionParams =
-        std::get_if<mcdc::DecisionParameters>(&MCDCParams);
-    assert(!DecisionParams || DecisionParams->NumConditions > 0);
-    return DecisionParams;
+    return std::holds_alternative<mcdc::DecisionParameters>(MCDCParams);
   }
 
   const auto &getMCDCDecisionParams() const {
diff --git a/clang/lib/CodeGen/LinkInModulesPass.cpp b/clang/lib/CodeGen/LinkInModulesPass.cpp
index 929539cc8f33..c3831aae13b6 100644
--- a/clang/lib/CodeGen/LinkInModulesPass.cpp
+++ b/clang/lib/CodeGen/LinkInModulesPass.cpp
@@ -28,12 +28,8 @@ PreservedAnalyses LinkInModulesPass::run(Module &M, ModuleAnalysisManager &AM) {
   if (!BC)
     return PreservedAnalyses::all();
 
-  // Re-load bitcode modules from files
-  if (BC->ReloadModules(&M))
-    report_fatal_error("Bitcode module re-loading failed, aborted!");
-
   if (BC->LinkInModules(&M, ShouldLinkFiles))
-    report_fatal_error("Bitcode module re-linking failed, aborted!");
+    report_fatal_error("Bitcode module postopt linking failed, aborted!");
 
-  return PreservedAnalyses::all();
+  return PreservedAnalyses::none();
 }
diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
index d47927745759..e4f798f6a97d 100644
--- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp
+++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -1122,7 +1122,22 @@ static bool isTrivialForMSVC(const CXXRecordDecl *RD, QualType Ty,
   //   No base classes
   //   No virtual functions
   // Additionally, we need to ensure that there is a trivial copy assignment
-  // operator, a trivial destructor and no user-provided constructors.
+  // operator, a trivial destructor, no user-provided constructors and no
+  // deleted copy assignment operator.
+
+  // We need to cover two cases when checking for a deleted copy assignment
+  // operator.
+  //
+  // struct S { int& r; };
+  // The above will have an implicit copy assignment operator that is deleted
+  // and there will not be a `CXXMethodDecl` for the copy assignment operator.
+  // This is handled by the `needsImplicitCopyAssignment()` check below.
+  //
+  // struct S { S& operator=(const S&) = delete; int i; };
+  // The above will not have an implicit copy assignment operator that is
+  // deleted but there is a deleted `CXXMethodDecl` for the declared copy
+  // assignment operator. This is handled by the `isDeleted()` check below.
+
   if (RD->hasProtectedFields() || RD->hasPrivateFields())
     return false;
   if (RD->getNumBases() > 0)
@@ -1131,6 +1146,8 @@ static bool isTrivialForMSVC(const CXXRecordDecl *RD, QualType Ty,
     return false;
   if (RD->hasNonTrivialCopyAssignment())
     return false;
+  if (RD->needsImplicitCopyAssignment() && !RD->hasSimpleCopyAssignment())
+    return false;
   for (const Decl *D : RD->decls()) {
     if (auto *Ctor = dyn_cast<CXXConstructorDecl>(D)) {
       if (Ctor->isUserProvided())
@@ -1138,6 +1155,9 @@ static bool isTrivialForMSVC(const CXXRecordDecl *RD, QualType Ty,
     } else if (auto *Template = dyn_cast<FunctionTemplateDecl>(D)) {
       if (isa<CXXConstructorDecl>(Template->getTemplatedDecl()))
         return false;
+    } else if (auto *MethodDecl = dyn_cast<CXXMethodDecl>(D)) {
+      if (MethodDecl->isCopyAssignmentOperator() && MethodDecl->isDeleted())
+        return false;
     }
   }
   if (RD->hasNonTrivialDestructor())
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
index b1dfe5bf8f27..f242d9e36ed4 100644
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -94,7 +94,8 @@ public:
   virtual void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
                                     const FunctionDecl *Caller,
                                     const FunctionDecl *Callee,
-                                    const CallArgList &Args) const {}
+                                    const CallArgList &Args,
+                                    QualType ReturnType) const {}
 
   /// Determines the size of struct _Unwind_Exception on this platform,
   /// in 8-bit units.  The Itanium ABI defines this as:
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 4c32f510101f..e32b060ebeb9 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -8,6 +8,7 @@
 
 #include "ABIInfoImpl.h"
 #include "TargetInfo.h"
+#include "clang/AST/Decl.h"
 #include "clang/Basic/DiagnosticFrontend.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
 
@@ -170,8 +171,22 @@ public:
 
   void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
                             const FunctionDecl *Caller,
-                            const FunctionDecl *Callee,
-                            const CallArgList &Args) const override;
+                            const FunctionDecl *Callee, const CallArgList &Args,
+                            QualType ReturnType) const override;
+
+private:
+  // Diagnose calls between functions with incompatible Streaming SVE
+  // attributes.
+  void checkFunctionCallABIStreaming(CodeGenModule &CGM, SourceLocation CallLoc,
+                                     const FunctionDecl *Caller,
+                                     const FunctionDecl *Callee) const;
+  // Diagnose calls which must pass arguments in floating-point registers when
+  // the selected target does not have floating-point registers.
+  void checkFunctionCallABISoftFloat(CodeGenModule &CGM, SourceLocation CallLoc,
+                                     const FunctionDecl *Caller,
+                                     const FunctionDecl *Callee,
+                                     const CallArgList &Args,
+                                     QualType ReturnType) const;
 };
 
 class WindowsAArch64TargetCodeGenInfo : public AArch64TargetCodeGenInfo {
@@ -838,14 +853,6 @@ Address AArch64ABIInfo::EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr,
                           /*allowHigherAlign*/ false);
 }
 
-static bool isStreaming(const FunctionDecl *F) {
-  if (F->hasAttr<ArmLocallyStreamingAttr>())
-    return true;
-  if (const auto *T = F->getType()->getAs<FunctionProtoType>())
-    return T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask;
-  return false;
-}
-
 static bool isStreamingCompatible(const FunctionDecl *F) {
   if (const auto *T = F->getType()->getAs<FunctionProtoType>())
     return T->getAArch64SMEAttributes() &
@@ -853,42 +860,49 @@ static bool isStreamingCompatible(const FunctionDecl *F) {
   return false;
 }
 
+// Report an error if an argument or return value of type Ty would need to be
+// passed in a floating-point register.
+static void diagnoseIfNeedsFPReg(DiagnosticsEngine &Diags,
+                                 const StringRef ABIName,
+                                 const AArch64ABIInfo &ABIInfo,
+                                 const QualType &Ty, const NamedDecl *D) {
+  const Type *HABase = nullptr;
+  uint64_t HAMembers = 0;
+  if (Ty->isFloatingType() || Ty->isVectorType() ||
+      ABIInfo.isHomogeneousAggregate(Ty, HABase, HAMembers)) {
+    Diags.Report(D->getLocation(), diag::err_target_unsupported_type_for_abi)
+        << D->getDeclName() << Ty << ABIName;
+  }
+}
+
+// If we are using a hard-float ABI, but do not have floating point registers,
+// then report an error for any function arguments or returns which would be
+// passed in floating-pint registers.
 void AArch64TargetCodeGenInfo::checkFunctionABI(
     CodeGenModule &CGM, const FunctionDecl *FuncDecl) const {
   const AArch64ABIInfo &ABIInfo = getABIInfo<AArch64ABIInfo>();
   const TargetInfo &TI = ABIInfo.getContext().getTargetInfo();
 
-  // If we are using a hard-float ABI, but do not have floating point
-  // registers, then report an error for any function arguments or returns
-  // which would be passed in floating-pint registers.
-  auto CheckType = [&CGM, &TI, &ABIInfo](const QualType &Ty,
-                                         const NamedDecl *D) {
-    const Type *HABase = nullptr;
-    uint64_t HAMembers = 0;
-    if (Ty->isFloatingType() || Ty->isVectorType() ||
-        ABIInfo.isHomogeneousAggregate(Ty, HABase, HAMembers)) {
-      CGM.getDiags().Report(D->getLocation(),
-                            diag::err_target_unsupported_type_for_abi)
-          << D->getDeclName() << Ty << TI.getABI();
-    }
-  };
-
   if (!TI.hasFeature("fp") && !ABIInfo.isSoftFloat()) {
-    CheckType(FuncDecl->getReturnType(), FuncDecl);
+    diagnoseIfNeedsFPReg(CGM.getDiags(), TI.getABI(), ABIInfo,
+                         FuncDecl->getReturnType(), FuncDecl);
     for (ParmVarDecl *PVD : FuncDecl->parameters()) {
-      CheckType(PVD->getType(), PVD);
+      diagnoseIfNeedsFPReg(CGM.getDiags(), TI.getABI(), ABIInfo, PVD->getType(),
+                           PVD);
     }
   }
 }
 
-void AArch64TargetCodeGenInfo::checkFunctionCallABI(
+void AArch64TargetCodeGenInfo::checkFunctionCallABIStreaming(
     CodeGenModule &CGM, SourceLocation CallLoc, const FunctionDecl *Caller,
-    const FunctionDecl *Callee, const CallArgList &Args) const {
+    const FunctionDecl *Callee) const {
   if (!Caller || !Callee || !Callee->hasAttr<AlwaysInlineAttr>())
     return;
 
-  bool CallerIsStreaming = isStreaming(Caller);
-  bool CalleeIsStreaming = isStreaming(Callee);
+  bool CallerIsStreaming =
+      IsArmStreamingFunction(Caller, /*IncludeLocallyStreaming=*/true);
+  bool CalleeIsStreaming =
+      IsArmStreamingFunction(Callee, /*IncludeLocallyStreaming=*/true);
   bool CallerIsStreamingCompatible = isStreamingCompatible(Caller);
   bool CalleeIsStreamingCompatible = isStreamingCompatible(Callee);
 
@@ -903,6 +917,37 @@ void AArch64TargetCodeGenInfo::checkFunctionCallABI(
           << Callee->getDeclName();
 }
 
+// If the target does not have floating-point registers, but we are using a
+// hard-float ABI, there is no way to pass floating-point, vector or HFA values
+// to functions, so we report an error.
+void AArch64TargetCodeGenInfo::checkFunctionCallABISoftFloat(
+    CodeGenModule &CGM, SourceLocation CallLoc, const FunctionDecl *Caller,
+    const FunctionDecl *Callee, const CallArgList &Args,
+    QualType ReturnType) const {
+  const AArch64ABIInfo &ABIInfo = getABIInfo<AArch64ABIInfo>();
+  const TargetInfo &TI = ABIInfo.getContext().getTargetInfo();
+
+  if (!Caller || TI.hasFeature("fp") || ABIInfo.isSoftFloat())
+    return;
+
+  diagnoseIfNeedsFPReg(CGM.getDiags(), TI.getABI(), ABIInfo, ReturnType,
+                       Caller);
+
+  for (const CallArg &Arg : Args)
+    diagnoseIfNeedsFPReg(CGM.getDiags(), TI.getABI(), ABIInfo, Arg.getType(),
+                         Caller);
+}
+
+void AArch64TargetCodeGenInfo::checkFunctionCallABI(CodeGenModule &CGM,
+                                                    SourceLocation CallLoc,
+                                                    const FunctionDecl *Caller,
+                                                    const FunctionDecl *Callee,
+                                                    const CallArgList &Args,
+                                                    QualType ReturnType) const {
+  checkFunctionCallABIStreaming(CGM, CallLoc, Caller, Callee);
+  checkFunctionCallABISoftFloat(CGM, CallLoc, Caller, Callee, Args, ReturnType);
+}
+
 void AArch64ABIInfo::appendAttributeMangling(TargetClonesAttr *Attr,
                                              unsigned Index,
                                              raw_ostream &Out) const {
diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index 94cf0d86f9be..29d98aad8fcb 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -792,6 +792,8 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, CCState &State,
         return ABIArgInfo::getDirect();
       return ABIArgInfo::getExpand();
     }
+    if (IsVectorCall && Ty->isBuiltinType())
+      return ABIArgInfo::getDirect();
     return getIndirectResult(Ty, /*ByVal=*/false, State);
   }
 
@@ -1482,8 +1484,8 @@ public:
 
   void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
                             const FunctionDecl *Caller,
-                            const FunctionDecl *Callee,
-                            const CallArgList &Args) const override;
+                            const FunctionDecl *Callee, const CallArgList &Args,
+                            QualType ReturnType) const override;
 };
 } // namespace
 
@@ -1558,9 +1560,15 @@ static bool checkAVXParam(DiagnosticsEngine &Diag, ASTContext &Ctx,
   return false;
 }
 
-void X86_64TargetCodeGenInfo::checkFunctionCallABI(
-    CodeGenModule &CGM, SourceLocation CallLoc, const FunctionDecl *Caller,
-    const FunctionDecl *Callee, const CallArgList &Args) const {
+void X86_64TargetCodeGenInfo::checkFunctionCallABI(CodeGenModule &CGM,
+                                                   SourceLocation CallLoc,
+                                                   const FunctionDecl *Caller,
+                                                   const FunctionDecl *Callee,
+                                                   const CallArgList &Args,
+                                                   QualType ReturnType) const {
+  if (!Callee)
+    return;
+
   llvm::StringMap<bool> CallerMap;
   llvm::StringMap<bool> CalleeMap;
   unsigned ArgIndex = 0;
diff --git a/clang/lib/Driver/Distro.cpp b/clang/lib/Driver/Distro.cpp
index a7e7f169dc14..6f49e641104c 100644
--- a/clang/lib/Driver/Distro.cpp
+++ b/clang/lib/Driver/Distro.cpp
@@ -95,6 +95,7 @@ static Distro::DistroType DetectLsbRelease(llvm::vfs::FileSystem &VFS) {
                     .Case("lunar", Distro::UbuntuLunar)
                     .Case("mantic", Distro::UbuntuMantic)
                     .Case("noble", Distro::UbuntuNoble)
+                    .Case("oracular", Distro::UbuntuOracular)
                     .Default(Distro::UnknownDistro);
   return Version;
 }
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 114320f5d314..7b36d8e5084c 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -564,9 +564,9 @@ static llvm::Triple computeTargetTriple(const Driver &D,
       StringRef ObjectMode = *ObjectModeValue;
       llvm::Triple::ArchType AT = llvm::Triple::UnknownArch;
 
-      if (ObjectMode.equals("64")) {
+      if (ObjectMode == "64") {
         AT = Target.get64BitArchVariant().getArch();
-      } else if (ObjectMode.equals("32")) {
+      } else if (ObjectMode == "32") {
         AT = Target.get32BitArchVariant().getArch();
       } else {
         D.Diag(diag::err_drv_invalid_object_mode) << ObjectMode;
@@ -6694,7 +6694,7 @@ llvm::StringRef clang::driver::getDriverMode(StringRef ProgName,
   return Opt.consume_front(OptName) ? Opt : "";
 }
 
-bool driver::IsClangCL(StringRef DriverMode) { return DriverMode.equals("cl"); }
+bool driver::IsClangCL(StringRef DriverMode) { return DriverMode == "cl"; }
 
 llvm::Error driver::expandResponseFiles(SmallVectorImpl<const char *> &Args,
                                         bool ClangCLMode,
diff --git a/clang/lib/Driver/OffloadBundler.cpp b/clang/lib/Driver/OffloadBundler.cpp
index 8cc82a0ee716..191d108e9b73 100644
--- a/clang/lib/Driver/OffloadBundler.cpp
+++ b/clang/lib/Driver/OffloadBundler.cpp
@@ -113,8 +113,11 @@ bool OffloadTargetInfo::isOffloadKindValid() const {
 
 bool OffloadTargetInfo::isOffloadKindCompatible(
     const StringRef TargetOffloadKind) const {
-  if (OffloadKind == TargetOffloadKind)
+  if ((OffloadKind == TargetOffloadKind) ||
+      (OffloadKind == "hip" && TargetOffloadKind == "hipv4") ||
+      (OffloadKind == "hipv4" && TargetOffloadKind == "hip"))
     return true;
+
   if (BundlerConfig.HipOpenmpCompatible) {
     bool HIPCompatibleWithOpenMP = OffloadKind.starts_with_insensitive("hip") &&
                                    TargetOffloadKind == "openmp";
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 6a4f2548c0bf..273f215ca94a 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -797,7 +797,8 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
         Arg->claim();
         if (LegacySanitizeCoverage != 0 && DiagnoseErrors) {
           D.Diag(diag::warn_drv_deprecated_arg)
-              << Arg->getAsString(Args) << "-fsanitize-coverage=trace-pc-guard";
+              << Arg->getAsString(Args) << /*hasReplacement=*/true
+              << "-fsanitize-coverage=trace-pc-guard";
         }
         continue;
       }
@@ -833,11 +834,11 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
     // enabled.
     if (CoverageFeatures & CoverageTraceBB)
       D.Diag(clang::diag::warn_drv_deprecated_arg)
-          << "-fsanitize-coverage=trace-bb"
+          << "-fsanitize-coverage=trace-bb" << /*hasReplacement=*/true
           << "-fsanitize-coverage=trace-pc-guard";
     if (CoverageFeatures & Coverage8bitCounters)
       D.Diag(clang::diag::warn_drv_deprecated_arg)
-          << "-fsanitize-coverage=8bit-counters"
+          << "-fsanitize-coverage=8bit-counters" << /*hasReplacement=*/true
           << "-fsanitize-coverage=trace-pc-guard";
   }
 
@@ -849,7 +850,7 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
   if ((CoverageFeatures & InsertionPointTypes) &&
       !(CoverageFeatures & InstrumentationTypes) && DiagnoseErrors) {
     D.Diag(clang::diag::warn_drv_deprecated_arg)
-        << "-fsanitize-coverage=[func|bb|edge]"
+        << "-fsanitize-coverage=[func|bb|edge]" << /*hasReplacement=*/true
         << "-fsanitize-coverage=[func|bb|edge],[trace-pc-guard|trace-pc],["
            "control-flow]";
   }
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index aab98506adb9..85825e1ea65b 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -481,8 +481,8 @@ static void addTocDataOptions(const llvm::opt::ArgList &Args,
 
   // Currently only supported for small code model.
   if (TOCDataGloballyinEffect &&
-      (Args.getLastArgValue(options::OPT_mcmodel_EQ).equals("large") ||
-       Args.getLastArgValue(options::OPT_mcmodel_EQ).equals("medium"))) {
+      (Args.getLastArgValue(options::OPT_mcmodel_EQ) == "large" ||
+       Args.getLastArgValue(options::OPT_mcmodel_EQ) == "medium")) {
     D.Diag(clang::diag::warn_drv_unsupported_tocdata);
     return;
   }
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 07965b487ea7..9ffea57b005d 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -732,7 +732,7 @@ AMDGPUToolChain::TranslateArgs(const DerivedArgList &Args, StringRef BoundArch,
 
   checkTargetID(*DAL);
 
-  if (!Args.getLastArgValue(options::OPT_x).equals("cl"))
+  if (Args.getLastArgValue(options::OPT_x) != "cl")
     return DAL;
 
   // Phase 1 (.cl -> .bc)
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index 53e26a9f8e22..8295d001ec6f 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -273,7 +273,8 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
 
       for (StringRef Value : A->getValues()) {
         if (Value == "egpr" || Value == "push2pop2" || Value == "ppx" ||
-            Value == "ndd" || Value == "ccmp" || Value == "cf") {
+            Value == "ndd" || Value == "ccmp" || Value == "nf" ||
+            Value == "cf") {
           Features.push_back(
               Args.MakeArgString((IsNegative ? "-" : "+") + Value));
           continue;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 1f08c5958dfb..f0cc018b6668 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -55,6 +55,7 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/YAMLParser.h"
+#include "llvm/TargetParser/AArch64TargetParser.h"
 #include "llvm/TargetParser/ARMTargetParserCommon.h"
 #include "llvm/TargetParser/Host.h"
 #include "llvm/TargetParser/LoongArchTargetParser.h"
@@ -1511,7 +1512,24 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args,
   } else {
     StringRef DiagMsg;
     llvm::ARM::ParsedBranchProtection PBP;
-    if (!llvm::ARM::parseBranchProtection(A->getValue(), PBP, DiagMsg))
+    bool EnablePAuthLR = false;
+
+    // To know if we need to enable PAuth-LR As part of the standard branch
+    // protection option, it needs to be determined if the feature has been
+    // activated in the `march` argument. This information is stored within the
+    // CmdArgs variable and can be found using a search.
+    if (isAArch64) {
+      auto isPAuthLR = [](const char *member) {
+        llvm::AArch64::ExtensionInfo pauthlr_extension =
+            llvm::AArch64::getExtensionByID(llvm::AArch64::AEK_PAUTHLR);
+        return (pauthlr_extension.Feature.compare(member) == 0);
+      };
+
+      if (std::any_of(CmdArgs.begin(), CmdArgs.end(), isPAuthLR))
+        EnablePAuthLR = true;
+    }
+    if (!llvm::ARM::parseBranchProtection(A->getValue(), PBP, DiagMsg,
+                                          EnablePAuthLR))
       D.Diag(diag::err_drv_unsupported_option_argument)
           << A->getSpelling() << DiagMsg;
     if (!isAArch64 && PBP.Key == "b_key")
@@ -1526,7 +1544,7 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args,
 
   CmdArgs.push_back(
       Args.MakeArgString(Twine("-msign-return-address=") + Scope));
-  if (!Scope.equals("none"))
+  if (Scope != "none")
     CmdArgs.push_back(
         Args.MakeArgString(Twine("-msign-return-address-key=") + Key));
   if (BranchProtectionPAuthLR)
@@ -1719,10 +1737,9 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
   if (Arg *A = Args.getLastArg(options::OPT_msve_vector_bits_EQ)) {
     StringRef Val = A->getValue();
     const Driver &D = getToolChain().getDriver();
-    if (Val.equals("128") || Val.equals("256") || Val.equals("512") ||
-        Val.equals("1024") || Val.equals("2048") || Val.equals("128+") ||
-        Val.equals("256+") || Val.equals("512+") || Val.equals("1024+") ||
-        Val.equals("2048+")) {
+    if (Val == "128" || Val == "256" || Val == "512" || Val == "1024" ||
+        Val == "2048" || Val == "128+" || Val == "256+" || Val == "512+" ||
+        Val == "1024+" || Val == "2048+") {
       unsigned Bits = 0;
       if (!Val.consume_back("+")) {
         bool Invalid = Val.getAsInteger(10, Bits); (void)Invalid;
@@ -1736,7 +1753,7 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
       CmdArgs.push_back(
           Args.MakeArgString("-mvscale-min=" + llvm::Twine(Bits / 128)));
     // Silently drop requests for vector-length agnostic code as it's implied.
-    } else if (!Val.equals("scalable"))
+    } else if (Val != "scalable")
       // Handle the unsupported values passed to msve-vector-bits.
       D.Diag(diag::err_drv_unsupported_option_argument)
           << A->getSpelling() << Val;
@@ -1756,6 +1773,20 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
 
   Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_intrinsics,
                     options::OPT_fno_ptrauth_intrinsics);
+  Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_calls,
+                    options::OPT_fno_ptrauth_calls);
+  Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_returns,
+                    options::OPT_fno_ptrauth_returns);
+  Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_auth_traps,
+                    options::OPT_fno_ptrauth_auth_traps);
+  Args.addOptInFlag(
+      CmdArgs, options::OPT_fptrauth_vtable_pointer_address_discrimination,
+      options::OPT_fno_ptrauth_vtable_pointer_address_discrimination);
+  Args.addOptInFlag(
+      CmdArgs, options::OPT_fptrauth_vtable_pointer_type_discrimination,
+      options::OPT_fno_ptrauth_vtable_pointer_type_discrimination);
+  Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_init_fini,
+                    options::OPT_fno_ptrauth_init_fini);
 }
 
 void Clang::AddLoongArchTargetArgs(const ArgList &Args,
@@ -2084,7 +2115,7 @@ void Clang::AddRISCVTargetArgs(const ArgList &Args,
     // If the value is "zvl", use MinVLen from march. Otherwise, try to parse
     // as integer as long as we have a MinVLen.
     unsigned Bits = 0;
-    if (Val.equals("zvl") && MinVLen >= llvm::RISCV::RVVBitsPerBlock) {
+    if (Val == "zvl" && MinVLen >= llvm::RISCV::RVVBitsPerBlock) {
       Bits = MinVLen;
     } else if (!Val.getAsInteger(10, Bits)) {
       // Only accept power of 2 values beteen RVVBitsPerBlock and 65536 that
@@ -2101,7 +2132,7 @@ void Clang::AddRISCVTargetArgs(const ArgList &Args,
           Args.MakeArgString("-mvscale-max=" + llvm::Twine(VScaleMin)));
       CmdArgs.push_back(
           Args.MakeArgString("-mvscale-min=" + llvm::Twine(VScaleMin)));
-    } else if (!Val.equals("scalable")) {
+    } else if (Val != "scalable") {
       // Handle the unsupported values passed to mrvv-vector-bits.
       D.Diag(diag::err_drv_unsupported_option_argument)
           << A->getSpelling() << Val;
@@ -2736,7 +2767,6 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
   bool TrappingMathPresent = false; // Is trapping-math in args, and not
                                     // overriden by ffp-exception-behavior?
   bool RoundingFPMath = false;
-  bool RoundingMathPresent = false; // Is rounding-math in args?
   // -ffp-model values: strict, fast, precise
   StringRef FPModel = "";
   // -ffp-exception-behavior options: strict, maytrap, ignore
@@ -2799,11 +2829,10 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
   }
 
   for (const Arg *A : Args) {
-    auto optID = A->getOption().getID();
-    bool PreciseFPModel = false;
-    switch (optID) {
-    default:
-      break;
+    switch (A->getOption().getID()) {
+    // If this isn't an FP option skip the claim below
+    default: continue;
+
     case options::OPT_fcx_limited_range:
       if (GccRangeComplexOption.empty()) {
         if (Range != LangOptions::ComplexRangeKind::CX_Basic)
@@ -2853,13 +2882,13 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
     case options::OPT_fcomplex_arithmetic_EQ: {
       LangOptions::ComplexRangeKind RangeVal;
       StringRef Val = A->getValue();
-      if (Val.equals("full"))
+      if (Val == "full")
         RangeVal = LangOptions::ComplexRangeKind::CX_Full;
-      else if (Val.equals("improved"))
+      else if (Val == "improved")
         RangeVal = LangOptions::ComplexRangeKind::CX_Improved;
-      else if (Val.equals("promoted"))
+      else if (Val == "promoted")
         RangeVal = LangOptions::ComplexRangeKind::CX_Promoted;
-      else if (Val.equals("basic"))
+      else if (Val == "basic")
         RangeVal = LangOptions::ComplexRangeKind::CX_Basic;
       else {
         D.Diag(diag::err_drv_unsupported_option_argument)
@@ -2895,50 +2924,38 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       AssociativeMath = false;
       ReciprocalMath = false;
       SignedZeros = true;
-      // -fno_fast_math restores default denormal and fpcontract handling
       FPContract = "on";
 
       StringRef Val = A->getValue();
-      if (OFastEnabled && !Val.equals("fast")) {
-          // Only -ffp-model=fast is compatible with OFast, ignore.
+      if (OFastEnabled && Val != "fast") {
+        // Only -ffp-model=fast is compatible with OFast, ignore.
         D.Diag(clang::diag::warn_drv_overriding_option)
             << Args.MakeArgString("-ffp-model=" + Val) << "-Ofast";
         break;
       }
       StrictFPModel = false;
-      PreciseFPModel = true;
-      // ffp-model= is a Driver option, it is entirely rewritten into more
-      // granular options before being passed into cc1.
-      // Use the gcc option in the switch below.
-      if (!FPModel.empty() && !FPModel.equals(Val))
+      if (!FPModel.empty() && FPModel != Val)
         D.Diag(clang::diag::warn_drv_overriding_option)
             << Args.MakeArgString("-ffp-model=" + FPModel)
             << Args.MakeArgString("-ffp-model=" + Val);
-      if (Val.equals("fast")) {
+      if (Val == "fast") {
         FPModel = Val;
         applyFastMath();
-      } else if (Val.equals("precise")) {
-        optID = options::OPT_ffp_contract;
+      } else if (Val == "precise") {
         FPModel = Val;
         FPContract = "on";
-        PreciseFPModel = true;
-      } else if (Val.equals("strict")) {
+      } else if (Val == "strict") {
         StrictFPModel = true;
-        optID = options::OPT_frounding_math;
         FPExceptionBehavior = "strict";
         FPModel = Val;
         FPContract = "off";
         TrappingMath = true;
+        RoundingFPMath = true;
       } else
         D.Diag(diag::err_drv_unsupported_option_argument)
             << A->getSpelling() << Val;
       break;
     }
-    }
-
-    switch (optID) {
-    // If this isn't an FP option skip the claim below
-    default: continue;
 
     // Options controlling individual features
     case options::OPT_fhonor_infinities:    HonorINFs = true;         break;
@@ -2957,7 +2974,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
     case options::OPT_fno_signed_zeros:     SignedZeros = false;      break;
     case options::OPT_ftrapping_math:
       if (!TrappingMathPresent && !FPExceptionBehavior.empty() &&
-          !FPExceptionBehavior.equals("strict"))
+          FPExceptionBehavior != "strict")
         // Warn that previous value of option is overridden.
         D.Diag(clang::diag::warn_drv_overriding_option)
             << Args.MakeArgString("-ffp-exception-behavior=" +
@@ -2969,7 +2986,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       break;
     case options::OPT_fno_trapping_math:
       if (!TrappingMathPresent && !FPExceptionBehavior.empty() &&
-          !FPExceptionBehavior.equals("ignore"))
+          FPExceptionBehavior != "ignore")
         // Warn that previous value of option is overridden.
         D.Diag(clang::diag::warn_drv_overriding_option)
             << Args.MakeArgString("-ffp-exception-behavior=" +
@@ -2982,12 +2999,10 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
 
     case options::OPT_frounding_math:
       RoundingFPMath = true;
-      RoundingMathPresent = true;
       break;
 
     case options::OPT_fno_rounding_math:
       RoundingFPMath = false;
-      RoundingMathPresent = false;
       break;
 
     case options::OPT_fdenormal_fp_math_EQ:
@@ -3010,13 +3025,8 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
     // Validate and pass through -ffp-contract option.
     case options::OPT_ffp_contract: {
       StringRef Val = A->getValue();
-      if (PreciseFPModel) {
-        // -ffp-model=precise enables ffp-contract=on.
-        // -ffp-model=precise sets PreciseFPModel to on and Val to
-        // "precise". FPContract is set.
-        ;
-      } else if (Val.equals("fast") || Val.equals("on") || Val.equals("off") ||
-                 Val.equals("fast-honor-pragmas")) {
+      if (Val == "fast" || Val == "on" || Val == "off" ||
+          Val == "fast-honor-pragmas") {
         FPContract = Val;
         LastSeenFfpContractOption = Val;
       } else
@@ -3025,27 +3035,20 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       break;
     }
 
-    // Validate and pass through -ffp-model option.
-    case options::OPT_ffp_model_EQ:
-      // This should only occur in the error case
-      // since the optID has been replaced by a more granular
-      // floating point option.
-      break;
-
     // Validate and pass through -ffp-exception-behavior option.
     case options::OPT_ffp_exception_behavior_EQ: {
       StringRef Val = A->getValue();
       if (!TrappingMathPresent && !FPExceptionBehavior.empty() &&
-          !FPExceptionBehavior.equals(Val))
+          FPExceptionBehavior != Val)
         // Warn that previous value of option is overridden.
         D.Diag(clang::diag::warn_drv_overriding_option)
             << Args.MakeArgString("-ffp-exception-behavior=" +
                                   FPExceptionBehavior)
             << Args.MakeArgString("-ffp-exception-behavior=" + Val);
       TrappingMath = TrappingMathPresent = false;
-      if (Val.equals("ignore") || Val.equals("maytrap"))
+      if (Val == "ignore" || Val == "maytrap")
         FPExceptionBehavior = Val;
-      else if (Val.equals("strict")) {
+      else if (Val == "strict") {
         FPExceptionBehavior = Val;
         TrappingMath = TrappingMathPresent = true;
       } else
@@ -3057,8 +3060,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
     // Validate and pass through -ffp-eval-method option.
     case options::OPT_ffp_eval_method_EQ: {
       StringRef Val = A->getValue();
-      if (Val.equals("double") || Val.equals("extended") ||
-          Val.equals("source"))
+      if (Val == "double" || Val == "extended" || Val == "source")
         FPEvalMethod = Val;
       else
         D.Diag(diag::err_drv_unsupported_option_argument)
@@ -3070,18 +3072,18 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       StringRef Val = A->getValue();
       const llvm::Triple::ArchType Arch = TC.getArch();
       if (Arch == llvm::Triple::x86 || Arch == llvm::Triple::x86_64) {
-        if (Val.equals("standard") || Val.equals("fast"))
+        if (Val == "standard" || Val == "fast")
           Float16ExcessPrecision = Val;
         // To make it GCC compatible, allow the value of "16" which
         // means disable excess precision, the same meaning than clang's
         // equivalent value "none".
-        else if (Val.equals("16"))
+        else if (Val == "16")
           Float16ExcessPrecision = "none";
         else
           D.Diag(diag::err_drv_unsupported_option_argument)
               << A->getSpelling() << Val;
       } else {
-        if (!(Val.equals("standard") || Val.equals("fast")))
+        if (!(Val == "standard" || Val == "fast"))
           D.Diag(diag::err_drv_unsupported_option_argument)
               << A->getSpelling() << Val;
       }
@@ -3152,12 +3154,18 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       }
       break;
     }
+    // The StrictFPModel local variable is needed to report warnings
+    // in the way we intend. If -ffp-model=strict has been used, we
+    // want to report a warning for the next option encountered that
+    // takes us out of the settings described by fp-model=strict, but
+    // we don't want to continue issuing warnings for other conflicting
+    // options after that.
     if (StrictFPModel) {
       // If -ffp-model=strict has been specified on command line but
       // subsequent options conflict then emit warning diagnostic.
       if (HonorINFs && HonorNaNs && !AssociativeMath && !ReciprocalMath &&
           SignedZeros && TrappingMath && RoundingFPMath && !ApproxFunc &&
-          FPContract.equals("off"))
+          FPContract == "off")
         // OK: Current Arg doesn't conflict with -ffp-model=strict
         ;
       else {
@@ -3203,7 +3211,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
 
   if (TrappingMath) {
     // FP Exception Behavior is also set to strict
-    assert(FPExceptionBehavior.equals("strict"));
+    assert(FPExceptionBehavior == "strict");
   }
 
   // The default is IEEE.
@@ -3225,11 +3233,10 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
   if (!FPContract.empty())
     CmdArgs.push_back(Args.MakeArgString("-ffp-contract=" + FPContract));
 
-  if (!RoundingFPMath)
-    CmdArgs.push_back(Args.MakeArgString("-fno-rounding-math"));
-
-  if (RoundingFPMath && RoundingMathPresent)
+  if (RoundingFPMath)
     CmdArgs.push_back(Args.MakeArgString("-frounding-math"));
+  else
+    CmdArgs.push_back(Args.MakeArgString("-fno-rounding-math"));
 
   if (!FPExceptionBehavior.empty())
     CmdArgs.push_back(Args.MakeArgString("-ffp-exception-behavior=" +
@@ -3253,8 +3260,8 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
   if (!HonorINFs && !HonorNaNs && !MathErrno && AssociativeMath && ApproxFunc &&
       ReciprocalMath && !SignedZeros && !TrappingMath && !RoundingFPMath) {
     CmdArgs.push_back("-ffast-math");
-    if (FPModel.equals("fast")) {
-      if (FPContract.equals("fast"))
+    if (FPModel == "fast") {
+      if (FPContract == "fast")
         // All set, do nothing.
         ;
       else if (FPContract.empty())
@@ -6139,6 +6146,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
 
   Args.addOptOutFlag(CmdArgs, options::OPT_funique_section_names,
                      options::OPT_fno_unique_section_names);
+  Args.addOptInFlag(CmdArgs, options::OPT_fseparate_named_sections,
+                    options::OPT_fno_separate_named_sections);
   Args.addOptInFlag(CmdArgs, options::OPT_funique_internal_linkage_names,
                     options::OPT_fno_unique_internal_linkage_names);
   Args.addOptInFlag(CmdArgs, options::OPT_funique_basic_block_section_names,
@@ -6513,7 +6522,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   if (const Arg *A =
           Args.getLastArg(options::OPT_fvisibility_global_new_delete_hidden)) {
     D.Diag(diag::warn_drv_deprecated_arg)
-        << A->getAsString(Args)
+        << A->getAsString(Args) << /*hasReplacement=*/true
         << "-fvisibility-global-new-delete=force-hidden";
   }
 
@@ -7240,11 +7249,15 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   Args.addOptOutFlag(CmdArgs, options::OPT_fassume_unique_vtables,
                      options::OPT_fno_assume_unique_vtables);
 
-  // -frelaxed-template-template-args is off by default, as it is a severe
-  // breaking change until a corresponding change to template partial ordering
-  // is provided.
-  Args.addOptInFlag(CmdArgs, options::OPT_frelaxed_template_template_args,
-                    options::OPT_fno_relaxed_template_template_args);
+  // -fno-relaxed-template-template-args is deprecated.
+  if (Arg *A = Args.getLastArg(options::OPT_frelaxed_template_template_args,
+                               options::OPT_fno_relaxed_template_template_args);
+      A &&
+      A->getOption().matches(options::OPT_fno_relaxed_template_template_args))
+    D.Diag(diag::warn_drv_deprecated_arg)
+        << A->getAsString(Args) << /*hasReplacement=*/false;
+  else
+    CmdArgs.push_back("-fno-relaxed-template-template-args");
 
   // -fsized-deallocation is off by default, as it is an ABI-breaking change for
   // most platforms.
@@ -8101,7 +8114,7 @@ static EHFlags parseClangCLEHFlags(const Driver &D, const ArgList &Args,
 
   std::vector<std::string> EHArgs =
       Args.getAllArgValues(options::OPT__SLASH_EH);
-  for (auto EHVal : EHArgs) {
+  for (const auto &EHVal : EHArgs) {
     for (size_t I = 0, E = EHVal.size(); I != E; ++I) {
       switch (EHVal[I]) {
       case 'a':
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 6796b43a1550..71e993119436 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -346,7 +346,7 @@ void tools::addDirectoryList(const ArgList &Args, ArgStringList &CmdArgs,
     return; // Nothing to do.
 
   StringRef Name(ArgName);
-  if (Name.equals("-I") || Name.equals("-L") || Name.empty())
+  if (Name == "-I" || Name == "-L" || Name.empty())
     CombinedArg = true;
 
   StringRef Dirs(DirList);
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 6634e6d818b3..d5f93c9c830f 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -82,6 +82,8 @@ CudaVersion getCudaVersion(uint32_t raw_version) {
     return CudaVersion::CUDA_122;
   if (raw_version < 12040)
     return CudaVersion::CUDA_123;
+  if (raw_version < 12050)
+    return CudaVersion::CUDA_124;
   return CudaVersion::NEW;
 }
 
@@ -688,6 +690,7 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
   case CudaVersion::CUDA_##CUDA_VER:                                           \
     PtxFeature = "+ptx" #PTX_VER;                                              \
     break;
+    CASE_CUDA_VERSION(124, 84);
     CASE_CUDA_VERSION(123, 83);
     CASE_CUDA_VERSION(122, 82);
     CASE_CUDA_VERSION(121, 81);
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 8955b9fb653c..d275528b6905 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -170,10 +170,9 @@ void Flang::AddAArch64TargetArgs(const ArgList &Args,
   if (Arg *A = Args.getLastArg(options::OPT_msve_vector_bits_EQ)) {
     StringRef Val = A->getValue();
     const Driver &D = getToolChain().getDriver();
-    if (Val.equals("128") || Val.equals("256") || Val.equals("512") ||
-        Val.equals("1024") || Val.equals("2048") || Val.equals("128+") ||
-        Val.equals("256+") || Val.equals("512+") || Val.equals("1024+") ||
-        Val.equals("2048+")) {
+    if (Val == "128" || Val == "256" || Val == "512" || Val == "1024" ||
+        Val == "2048" || Val == "128+" || Val == "256+" || Val == "512+" ||
+        Val == "1024+" || Val == "2048+") {
       unsigned Bits = 0;
       if (!Val.consume_back("+")) {
         [[maybe_unused]] bool Invalid = Val.getAsInteger(10, Bits);
@@ -187,7 +186,7 @@ void Flang::AddAArch64TargetArgs(const ArgList &Args,
       CmdArgs.push_back(
           Args.MakeArgString("-mvscale-min=" + llvm::Twine(Bits / 128)));
       // Silently drop requests for vector-length agnostic code as it's implied.
-    } else if (!Val.equals("scalable"))
+    } else if (Val != "scalable")
       // Handle the unsupported values passed to msve-vector-bits.
       D.Diag(diag::err_drv_unsupported_option_argument)
           << A->getSpelling() << Val;
@@ -214,7 +213,7 @@ void Flang::AddRISCVTargetArgs(const ArgList &Args,
     // If the value is "zvl", use MinVLen from march. Otherwise, try to parse
     // as integer as long as we have a MinVLen.
     unsigned Bits = 0;
-    if (Val.equals("zvl") && MinVLen >= llvm::RISCV::RVVBitsPerBlock) {
+    if (Val == "zvl" && MinVLen >= llvm::RISCV::RVVBitsPerBlock) {
       Bits = MinVLen;
     } else if (!Val.getAsInteger(10, Bits)) {
       // Only accept power of 2 values beteen RVVBitsPerBlock and 65536 that
@@ -231,7 +230,7 @@ void Flang::AddRISCVTargetArgs(const ArgList &Args,
           Args.MakeArgString("-mvscale-max=" + llvm::Twine(VScaleMin)));
       CmdArgs.push_back(
           Args.MakeArgString("-mvscale-min=" + llvm::Twine(VScaleMin)));
-    } else if (!Val.equals("scalable")) {
+    } else if (Val != "scalable") {
       // Handle the unsupported values passed to mrvv-vector-bits.
       D.Diag(diag::err_drv_unsupported_option_argument)
           << A->getSpelling() << Val;
@@ -748,6 +747,10 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA,
   // Add other compile options
   addOtherOptions(Args, CmdArgs);
 
+  // Disable all warnings
+  // TODO: Handle interactions between -w, -pedantic, -Wall, -WOption
+  Args.AddLastArg(CmdArgs, options::OPT_w);
+
   // Forward flags for OpenMP. We don't do this if the current action is an
   // device offloading action other than OpenMP.
   if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp
index 08c647dfcb6f..b1ff697b368b 100644
--- a/clang/lib/Driver/ToolChains/HIPUtility.cpp
+++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp
@@ -60,9 +60,9 @@ public:
         Verbose(C.getArgs().hasArg(options::OPT_v)) {
     populateSymbols();
     if (Verbose) {
-      for (auto Name : FatBinSymbols)
+      for (const auto &Name : FatBinSymbols)
         llvm::errs() << "Found undefined HIP fatbin symbol: " << Name << "\n";
-      for (auto Name : GPUBinHandleSymbols)
+      for (const auto &Name : GPUBinHandleSymbols)
         llvm::errs() << "Found undefined HIP gpubin handle symbol: " << Name
                      << "\n";
     }
diff --git a/clang/lib/Driver/ToolChains/HLSL.cpp b/clang/lib/Driver/ToolChains/HLSL.cpp
index 1169b5d8c92d..8286e3be2180 100644
--- a/clang/lib/Driver/ToolChains/HLSL.cpp
+++ b/clang/lib/Driver/ToolChains/HLSL.cpp
@@ -98,9 +98,49 @@ std::optional<std::string> tryParseProfile(StringRef Profile) {
   else if (llvm::getAsUnsignedInteger(Parts[2], 0, Minor))
     return std::nullopt;
 
-  // dxil-unknown-shadermodel-hull
+  // Determine DXIL version using the minor version number of Shader
+  // Model version specified in target profile. Prior to decoupling DXIL version
+  // numbering from that of Shader Model DXIL version 1.Y corresponds to SM 6.Y.
+  // E.g., dxilv1.Y-unknown-shadermodelX.Y-hull
   llvm::Triple T;
-  T.setArch(Triple::ArchType::dxil);
+  Triple::SubArchType SubArch = llvm::Triple::NoSubArch;
+  switch (Minor) {
+  case 0:
+    SubArch = llvm::Triple::DXILSubArch_v1_0;
+    break;
+  case 1:
+    SubArch = llvm::Triple::DXILSubArch_v1_1;
+    break;
+  case 2:
+    SubArch = llvm::Triple::DXILSubArch_v1_2;
+    break;
+  case 3:
+    SubArch = llvm::Triple::DXILSubArch_v1_3;
+    break;
+  case 4:
+    SubArch = llvm::Triple::DXILSubArch_v1_4;
+    break;
+  case 5:
+    SubArch = llvm::Triple::DXILSubArch_v1_5;
+    break;
+  case 6:
+    SubArch = llvm::Triple::DXILSubArch_v1_6;
+    break;
+  case 7:
+    SubArch = llvm::Triple::DXILSubArch_v1_7;
+    break;
+  case 8:
+    SubArch = llvm::Triple::DXILSubArch_v1_8;
+    break;
+  case OfflineLibMinor:
+    // Always consider minor version x as the latest supported DXIL version
+    SubArch = llvm::Triple::LatestDXILSubArch;
+    break;
+  default:
+    // No DXIL Version corresponding to specified Shader Model version found
+    return std::nullopt;
+  }
+  T.setArch(Triple::ArchType::dxil, SubArch);
   T.setOSName(Triple::getOSTypeName(Triple::OSType::ShaderModel).str() +
               VersionTuple(Major, Minor).getAsString());
   T.setEnvironment(Kind);
@@ -218,8 +258,7 @@ HLSLToolChain::TranslateArgs(const DerivedArgList &Args, StringRef BoundArch,
       }
     }
     if (A->getOption().getID() == options::OPT_emit_pristine_llvm) {
-      // Translate fcgl into -S -emit-llvm and -disable-llvm-passes.
-      DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_S));
+      // Translate -fcgl into -emit-llvm and -disable-llvm-passes.
       DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_emit_llvm));
       DAL->AddFlagArg(nullptr,
                       Opts.getOption(options::OPT_disable_llvm_passes));
diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp
index b7c6efab83e8..5b763df9b332 100644
--- a/clang/lib/Driver/ToolChains/WebAssembly.cpp
+++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp
@@ -347,6 +347,23 @@ void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs,
     // Backend needs -wasm-enable-eh to enable Wasm EH
     CC1Args.push_back("-mllvm");
     CC1Args.push_back("-wasm-enable-eh");
+
+    // New Wasm EH spec (adopted in Oct 2023) requires multivalue and
+    // reference-types.
+    if (DriverArgs.hasFlag(options::OPT_mno_multivalue,
+                           options::OPT_mmultivalue, false)) {
+      getDriver().Diag(diag::err_drv_argument_not_allowed_with)
+          << "-fwasm-exceptions" << "-mno-multivalue";
+    }
+    if (DriverArgs.hasFlag(options::OPT_mno_reference_types,
+                           options::OPT_mreference_types, false)) {
+      getDriver().Diag(diag::err_drv_argument_not_allowed_with)
+          << "-fwasm-exceptions" << "-mno-reference-types";
+    }
+    CC1Args.push_back("-target-feature");
+    CC1Args.push_back("+multivalue");
+    CC1Args.push_back("-target-feature");
+    CC1Args.push_back("+reference-types");
   }
 
   for (const Arg *A : DriverArgs.filtered(options::OPT_mllvm)) {
@@ -408,6 +425,23 @@ void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs,
       CC1Args.push_back("+exception-handling");
       // Backend needs '-exception-model=wasm' to use Wasm EH instructions
       CC1Args.push_back("-exception-model=wasm");
+
+      // New Wasm EH spec (adopted in Oct 2023) requires multivalue and
+      // reference-types.
+      if (DriverArgs.hasFlag(options::OPT_mno_multivalue,
+                             options::OPT_mmultivalue, false)) {
+        getDriver().Diag(diag::err_drv_argument_not_allowed_with)
+            << "-mllvm -wasm-enable-sjlj" << "-mno-multivalue";
+      }
+      if (DriverArgs.hasFlag(options::OPT_mno_reference_types,
+                             options::OPT_mreference_types, false)) {
+        getDriver().Diag(diag::err_drv_argument_not_allowed_with)
+            << "-mllvm -wasm-enable-sjlj" << "-mno-reference-types";
+      }
+      CC1Args.push_back("-target-feature");
+      CC1Args.push_back("+multivalue");
+      CC1Args.push_back("-target-feature");
+      CC1Args.push_back("+reference-types");
     }
   }
 }
diff --git a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
index 34278b5d40c4..c16d4623f115 100644
--- a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
+++ b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
@@ -514,7 +514,7 @@ Object serializeSymbolKind(APIRecord::RecordKind RK, Language Lang) {
 /// which is prefixed by the source language name, useful for tooling to parse
 /// the kind, and a \c displayName for rendering human-readable names.
 Object serializeSymbolKind(const APIRecord &Record, Language Lang) {
-  return serializeSymbolKind(Record.getKind(), Lang);
+  return serializeSymbolKind(Record.KindForDisplay, Lang);
 }
 
 /// Serialize the function signature field, as specified by the
@@ -591,8 +591,8 @@ Array generateParentContexts(const SmallVectorImpl<SymbolReference> &Parents,
     Elem["usr"] = Parent.USR;
     Elem["name"] = Parent.Name;
     if (Parent.Record)
-      Elem["kind"] =
-          serializeSymbolKind(Parent.Record->getKind(), Lang)["identifier"];
+      Elem["kind"] = serializeSymbolKind(Parent.Record->KindForDisplay,
+                                         Lang)["identifier"];
     else
       Elem["kind"] =
           serializeSymbolKind(APIRecord::RK_Unknown, Lang)["identifier"];
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index ad0e2c3c620c..6b9fbfe0ebf5 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -1422,7 +1422,7 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) {
   // the next line.
   if (State.Line->InPragmaDirective) {
     FormatToken *PragmaType = State.Line->First->Next->Next;
-    if (PragmaType && PragmaType->TokenText.equals("omp"))
+    if (PragmaType && PragmaType->TokenText == "omp")
       return CurrentState.Indent + Style.ContinuationIndentWidth;
   }
 
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index c8d8ec3afbd9..8f027ffa20cc 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -100,6 +100,7 @@ struct MappingTraits<FormatStyle::ShortCaseStatementsAlignmentStyle> {
     IO.mapOptional("Enabled", Value.Enabled);
     IO.mapOptional("AcrossEmptyLines", Value.AcrossEmptyLines);
     IO.mapOptional("AcrossComments", Value.AcrossComments);
+    IO.mapOptional("AlignCaseArrows", Value.AlignCaseArrows);
     IO.mapOptional("AlignCaseColons", Value.AlignCaseColons);
   }
 };
@@ -911,6 +912,8 @@ template <> struct MappingTraits<FormatStyle> {
                    Style.AllowBreakBeforeNoexceptSpecifier);
     IO.mapOptional("AllowShortBlocksOnASingleLine",
                    Style.AllowShortBlocksOnASingleLine);
+    IO.mapOptional("AllowShortCaseExpressionOnASingleLine",
+                   Style.AllowShortCaseExpressionOnASingleLine);
     IO.mapOptional("AllowShortCaseLabelsOnASingleLine",
                    Style.AllowShortCaseLabelsOnASingleLine);
     IO.mapOptional("AllowShortCompoundRequirementOnASingleLine",
@@ -1423,6 +1426,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
   LLVMStyle.AllowAllParametersOfDeclarationOnNextLine = true;
   LLVMStyle.AllowBreakBeforeNoexceptSpecifier = FormatStyle::BBNSS_Never;
   LLVMStyle.AllowShortBlocksOnASingleLine = FormatStyle::SBS_Never;
+  LLVMStyle.AllowShortCaseExpressionOnASingleLine = true;
   LLVMStyle.AllowShortCaseLabelsOnASingleLine = false;
   LLVMStyle.AllowShortCompoundRequirementOnASingleLine = true;
   LLVMStyle.AllowShortEnumsOnASingleLine = true;
@@ -3766,7 +3770,7 @@ reformat(const FormatStyle &Style, StringRef Code,
     tooling::Replacements NonNoOpFixes;
     for (const tooling::Replacement &Fix : Fixes) {
       StringRef OriginalCode = Code.substr(Fix.getOffset(), Fix.getLength());
-      if (!OriginalCode.equals(Fix.getReplacementText())) {
+      if (OriginalCode != Fix.getReplacementText()) {
         auto Err = NonNoOpFixes.add(Fix);
         if (Err) {
           llvm::errs() << "Error adding replacements : "
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index 28b6488e54a4..95f16fde5005 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -38,6 +38,7 @@ namespace format {
   /* l_brace of a block that is not the body of a (e.g. loop) statement. */    \
   TYPE(BlockLBrace)                                                            \
   TYPE(BracedListLBrace)                                                       \
+  TYPE(CaseLabelArrow)                                                         \
   /* The colon at the end of a case label. */                                  \
   TYPE(CaseLabelColon)                                                         \
   TYPE(CastRParen)                                                             \
@@ -148,6 +149,8 @@ namespace format {
   TYPE(StructLBrace)                                                           \
   TYPE(StructRBrace)                                                           \
   TYPE(StructuredBindingLSquare)                                               \
+  TYPE(SwitchExpressionLabel)                                                  \
+  TYPE(SwitchExpressionLBrace)                                                 \
   TYPE(TableGenBangOperator)                                                   \
   TYPE(TableGenCondOperator)                                                   \
   TYPE(TableGenCondOperatorColon)                                              \
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index d366ae2080bc..e935d3e2709c 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -5051,6 +5051,8 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
       return true; // "x! as string", "x! in y"
     }
   } else if (Style.Language == FormatStyle::LK_Java) {
+    if (Left.is(TT_CaseLabelArrow) || Right.is(TT_CaseLabelArrow))
+      return true;
     if (Left.is(tok::r_square) && Right.is(tok::l_brace))
       return true;
     // spaces inside square brackets.
diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp
index 4ae54e56331b..4d53361aaf33 100644
--- a/clang/lib/Format/UnwrappedLineFormatter.cpp
+++ b/clang/lib/Format/UnwrappedLineFormatter.cpp
@@ -515,6 +515,12 @@ private:
       }
     }
 
+    if (TheLine->First->is(TT_SwitchExpressionLabel)) {
+      return Style.AllowShortCaseExpressionOnASingleLine
+                 ? tryMergeShortCaseLabels(I, E, Limit)
+                 : 0;
+    }
+
     if (TheLine->Last->is(tok::l_brace)) {
       bool ShouldMerge = false;
       // Try to merge records.
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 854428389740..310b75485e08 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -430,9 +430,9 @@ bool UnwrappedLineParser::parseLevel(const FormatToken *OpeningBrace,
       unsigned StoredPosition = Tokens->getPosition();
       auto *Next = Tokens->getNextNonComment();
       FormatTok = Tokens->setPosition(StoredPosition);
-      if (Next->isNot(tok::colon)) {
-        // default not followed by ':' is not a case label; treat it like
-        // an identifier.
+      if (!Next->isOneOf(tok::colon, tok::arrow)) {
+        // default not followed by `:` or `->` is not a case label; treat it
+        // like an identifier.
         parseStructuralElement();
         break;
       }
@@ -451,6 +451,7 @@ bool UnwrappedLineParser::parseLevel(const FormatToken *OpeningBrace,
       }
       if (!SwitchLabelEncountered &&
           (Style.IndentCaseLabels ||
+           (OpeningBrace && OpeningBrace->is(TT_SwitchExpressionLBrace)) ||
            (Line->InPPDirective && Line->Level == 1))) {
         ++Line->Level;
       }
@@ -1519,9 +1520,9 @@ void UnwrappedLineParser::parseStructuralElement(
       // 'switch: string' field declaration.
       break;
     }
-    parseSwitch();
+    parseSwitch(/*IsExpr=*/false);
     return;
-  case tok::kw_default:
+  case tok::kw_default: {
     // In Verilog default along with other labels are handled in the next loop.
     if (Style.isVerilog())
       break;
@@ -1529,14 +1530,22 @@ void UnwrappedLineParser::parseStructuralElement(
       // 'default: string' field declaration.
       break;
     }
+    auto *Default = FormatTok;
     nextToken();
     if (FormatTok->is(tok::colon)) {
       FormatTok->setFinalizedType(TT_CaseLabelColon);
       parseLabel();
       return;
     }
+    if (FormatTok->is(tok::arrow)) {
+      FormatTok->setFinalizedType(TT_CaseLabelArrow);
+      Default->setFinalizedType(TT_SwitchExpressionLabel);
+      parseLabel();
+      return;
+    }
     // e.g. "default void f() {}" in a Java interface.
     break;
+  }
   case tok::kw_case:
     // Proto: there are no switch/case statements.
     if (Style.Language == FormatStyle::LK_Proto) {
@@ -1763,8 +1772,9 @@ void UnwrappedLineParser::parseStructuralElement(
       break;
     }
     case tok::kw_enum:
-      // Ignore if this is part of "template <enum ..." or "... -> enum".
-      if (Previous && Previous->isOneOf(tok::less, tok::arrow)) {
+      // Ignore if this is part of "template <enum ..." or "... -> enum" or
+      // "template <..., enum ...>".
+      if (Previous && Previous->isOneOf(tok::less, tok::arrow, tok::comma)) {
         nextToken();
         break;
       }
@@ -2061,6 +2071,11 @@ void UnwrappedLineParser::parseStructuralElement(
     case tok::kw_new:
       parseNew();
       break;
+    case tok::kw_switch:
+      if (Style.Language == FormatStyle::LK_Java)
+        parseSwitch(/*IsExpr=*/true);
+      nextToken();
+      break;
     case tok::kw_case:
       // Proto: there are no switch/case statements.
       if (Style.Language == FormatStyle::LK_Proto) {
@@ -2510,6 +2525,7 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) {
   assert(FormatTok->is(tok::l_paren) && "'(' expected.");
   auto *LeftParen = FormatTok;
   bool SeenEqual = false;
+  bool MightBeFoldExpr = false;
   const bool MightBeStmtExpr = Tokens->peekNextToken()->is(tok::l_brace);
   nextToken();
   do {
@@ -2521,7 +2537,7 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) {
         parseChildBlock();
       break;
     case tok::r_paren:
-      if (!MightBeStmtExpr && !Line->InMacroBody &&
+      if (!MightBeStmtExpr && !MightBeFoldExpr && !Line->InMacroBody &&
           Style.RemoveParentheses > FormatStyle::RPS_Leave) {
         const auto *Prev = LeftParen->Previous;
         const auto *Next = Tokens->peekNextToken();
@@ -2564,6 +2580,10 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) {
         parseBracedList();
       }
       break;
+    case tok::ellipsis:
+      MightBeFoldExpr = true;
+      nextToken();
+      break;
     case tok::equal:
       SeenEqual = true;
       if (Style.isCSharp() && FormatTok->is(TT_FatArrow))
@@ -2583,6 +2603,9 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) {
       else
         nextToken();
       break;
+    case tok::kw_switch:
+      parseSwitch(/*IsExpr=*/true);
+      break;
     case tok::kw_requires: {
       auto RequiresToken = FormatTok;
       nextToken();
@@ -3201,10 +3224,11 @@ void UnwrappedLineParser::parseDoWhile() {
 void UnwrappedLineParser::parseLabel(bool LeftAlignLabel) {
   nextToken();
   unsigned OldLineLevel = Line->Level;
-  if (Line->Level > 1 || (!Line->InPPDirective && Line->Level > 0))
-    --Line->Level;
+
   if (LeftAlignLabel)
     Line->Level = 0;
+  else if (Line->Level > 1 || (!Line->InPPDirective && Line->Level > 0))
+    --Line->Level;
 
   if (!Style.IndentCaseBlocks && CommentsBeforeNextToken.empty() &&
       FormatTok->is(tok::l_brace)) {
@@ -3239,6 +3263,7 @@ void UnwrappedLineParser::parseLabel(bool LeftAlignLabel) {
 
 void UnwrappedLineParser::parseCaseLabel() {
   assert(FormatTok->is(tok::kw_case) && "'case' expected");
+  auto *Case = FormatTok;
 
   // FIXME: fix handling of complex expressions here.
   do {
@@ -3247,11 +3272,16 @@ void UnwrappedLineParser::parseCaseLabel() {
       FormatTok->setFinalizedType(TT_CaseLabelColon);
       break;
     }
+    if (Style.Language == FormatStyle::LK_Java && FormatTok->is(tok::arrow)) {
+      FormatTok->setFinalizedType(TT_CaseLabelArrow);
+      Case->setFinalizedType(TT_SwitchExpressionLabel);
+      break;
+    }
   } while (!eof());
   parseLabel();
 }
 
-void UnwrappedLineParser::parseSwitch() {
+void UnwrappedLineParser::parseSwitch(bool IsExpr) {
   assert(FormatTok->is(tok::kw_switch) && "'switch' expected");
   nextToken();
   if (FormatTok->is(tok::l_paren))
@@ -3261,10 +3291,15 @@ void UnwrappedLineParser::parseSwitch() {
 
   if (FormatTok->is(tok::l_brace)) {
     CompoundStatementIndenter Indenter(this, Style, Line->Level);
-    FormatTok->setFinalizedType(TT_ControlStatementLBrace);
-    parseBlock();
+    FormatTok->setFinalizedType(IsExpr ? TT_SwitchExpressionLBrace
+                                       : TT_ControlStatementLBrace);
+    if (IsExpr)
+      parseChildBlock();
+    else
+      parseBlock();
     setPreviousRBraceType(TT_ControlStatementRBrace);
-    addUnwrappedLine();
+    if (!IsExpr)
+      addUnwrappedLine();
   } else {
     addUnwrappedLine();
     ++Line->Level;
@@ -3350,7 +3385,7 @@ void UnwrappedLineParser::parseAccessSpecifier() {
 /// \brief Parses a requires, decides if it is a clause or an expression.
 /// \pre The current token has to be the requires keyword.
 /// \returns true if it parsed a clause.
-bool clang::format::UnwrappedLineParser::parseRequires() {
+bool UnwrappedLineParser::parseRequires() {
   assert(FormatTok->is(tok::kw_requires) && "'requires' expected");
   auto RequiresToken = FormatTok;
 
diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h
index e2cf28c0c065..2a0fe19d0957 100644
--- a/clang/lib/Format/UnwrappedLineParser.h
+++ b/clang/lib/Format/UnwrappedLineParser.h
@@ -157,7 +157,7 @@ private:
   void parseDoWhile();
   void parseLabel(bool LeftAlignLabel = false);
   void parseCaseLabel();
-  void parseSwitch();
+  void parseSwitch(bool IsExpr);
   void parseNamespace();
   bool parseModuleImport();
   void parseNew();
diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp
index 44fd807ec27e..ed06d6098a9f 100644
--- a/clang/lib/Format/WhitespaceManager.cpp
+++ b/clang/lib/Format/WhitespaceManager.cpp
@@ -107,7 +107,8 @@ const tooling::Replacements &WhitespaceManager::generateReplacements() {
   llvm::sort(Changes, Change::IsBeforeInFile(SourceMgr));
   calculateLineBreakInformation();
   alignConsecutiveMacros();
-  alignConsecutiveShortCaseStatements();
+  alignConsecutiveShortCaseStatements(/*IsExpr=*/true);
+  alignConsecutiveShortCaseStatements(/*IsExpr=*/false);
   alignConsecutiveDeclarations();
   alignConsecutiveBitFields();
   alignConsecutiveAssignments();
@@ -878,22 +879,27 @@ void WhitespaceManager::alignConsecutiveColons(
       Changes, /*StartAt=*/0, AlignStyle);
 }
 
-void WhitespaceManager::alignConsecutiveShortCaseStatements() {
+void WhitespaceManager::alignConsecutiveShortCaseStatements(bool IsExpr) {
   if (!Style.AlignConsecutiveShortCaseStatements.Enabled ||
-      !Style.AllowShortCaseLabelsOnASingleLine) {
+      !(IsExpr ? Style.AllowShortCaseExpressionOnASingleLine
+               : Style.AllowShortCaseLabelsOnASingleLine)) {
     return;
   }
 
+  const auto Type = IsExpr ? TT_CaseLabelArrow : TT_CaseLabelColon;
+  const auto &Option = Style.AlignConsecutiveShortCaseStatements;
+  const bool AlignArrowOrColon =
+      IsExpr ? Option.AlignCaseArrows : Option.AlignCaseColons;
+
   auto Matches = [&](const Change &C) {
-    if (Style.AlignConsecutiveShortCaseStatements.AlignCaseColons)
-      return C.Tok->is(TT_CaseLabelColon);
+    if (AlignArrowOrColon)
+      return C.Tok->is(Type);
 
     // Ignore 'IsInsideToken' to allow matching trailing comments which
     // need to be reflowed as that causes the token to appear in two
     // different changes, which will cause incorrect alignment as we'll
     // reflow early due to detecting multiple aligning tokens per line.
-    return !C.IsInsideToken && C.Tok->Previous &&
-           C.Tok->Previous->is(TT_CaseLabelColon);
+    return !C.IsInsideToken && C.Tok->Previous && C.Tok->Previous->is(Type);
   };
 
   unsigned MinColumn = 0;
@@ -944,7 +950,7 @@ void WhitespaceManager::alignConsecutiveShortCaseStatements() {
     if (Changes[I].Tok->isNot(tok::comment))
       LineIsComment = false;
 
-    if (Changes[I].Tok->is(TT_CaseLabelColon)) {
+    if (Changes[I].Tok->is(Type)) {
       LineIsEmptyCase =
           !Changes[I].Tok->Next || Changes[I].Tok->Next->isTrailingComment();
 
diff --git a/clang/lib/Format/WhitespaceManager.h b/clang/lib/Format/WhitespaceManager.h
index 98cf4a260cc4..7b91d8bf4db7 100644
--- a/clang/lib/Format/WhitespaceManager.h
+++ b/clang/lib/Format/WhitespaceManager.h
@@ -233,7 +233,7 @@ private:
   void alignChainedConditionals();
 
   /// Align consecutive short case statements over all \c Changes.
-  void alignConsecutiveShortCaseStatements();
+  void alignConsecutiveShortCaseStatements(bool IsExpr);
 
   /// Align consecutive TableGen DAGArg colon over all \c Changes.
   void alignConsecutiveTableGenBreakingDAGArgColons();
diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp
index 1b93588553a2..755aaddc0ad7 100644
--- a/clang/lib/Frontend/ASTUnit.cpp
+++ b/clang/lib/Frontend/ASTUnit.cpp
@@ -2374,8 +2374,6 @@ bool ASTUnit::serialize(raw_ostream &OS) {
   return serializeUnit(Writer, Buffer, getSema(), OS);
 }
 
-using SLocRemap = ContinuousRangeMap<unsigned, int, 2>;
-
 void ASTUnit::TranslateStoredDiagnostics(
                           FileManager &FileMgr,
                           SourceManager &SrcMgr,
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 8312abc36039..14ee02c4cd58 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1969,7 +1969,7 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
       Diags.Report(diag::err_drv_invalid_value)
           << A->getAsString(Args) << A->getValue();
     else if (Val == llvm::FunctionReturnThunksKind::Extern &&
-             Args.getLastArgValue(OPT_mcmodel_EQ).equals("large"))
+             Args.getLastArgValue(OPT_mcmodel_EQ) == "large")
       Diags.Report(diag::err_drv_argument_not_allowed_with)
           << A->getAsString(Args)
           << Args.getLastArg(OPT_mcmodel_EQ)->getAsString(Args);
@@ -2841,6 +2841,30 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
     }
 
     Opts.ProgramAction = *ProgramAction;
+
+    // Catch common mistakes when multiple actions are specified for cc1 (e.g.
+    // -S -emit-llvm means -emit-llvm while -emit-llvm -S means -S). However, to
+    // support driver `-c -Xclang ACTION` (-cc1 -emit-llvm file -main-file-name
+    // X ACTION), we suppress the error when the two actions are separated by
+    // -main-file-name.
+    //
+    // As an exception, accept composable -ast-dump*.
+    if (!A->getSpelling().starts_with("-ast-dump")) {
+      const Arg *SavedAction = nullptr;
+      for (const Arg *AA :
+           Args.filtered(OPT_Action_Group, OPT_main_file_name)) {
+        if (AA->getOption().matches(OPT_main_file_name)) {
+          SavedAction = nullptr;
+        } else if (!SavedAction) {
+          SavedAction = AA;
+        } else {
+          if (!A->getOption().matches(OPT_ast_dump_EQ))
+            Diags.Report(diag::err_fe_invalid_multiple_actions)
+                << SavedAction->getSpelling() << A->getSpelling();
+          break;
+        }
+      }
+    }
   }
 
   if (const Arg* A = Args.getLastArg(OPT_plugin)) {
@@ -3322,11 +3346,31 @@ static void GeneratePointerAuthArgs(const LangOptions &Opts,
                                     ArgumentConsumer Consumer) {
   if (Opts.PointerAuthIntrinsics)
     GenerateArg(Consumer, OPT_fptrauth_intrinsics);
+  if (Opts.PointerAuthCalls)
+    GenerateArg(Consumer, OPT_fptrauth_calls);
+  if (Opts.PointerAuthReturns)
+    GenerateArg(Consumer, OPT_fptrauth_returns);
+  if (Opts.PointerAuthAuthTraps)
+    GenerateArg(Consumer, OPT_fptrauth_auth_traps);
+  if (Opts.PointerAuthVTPtrAddressDiscrimination)
+    GenerateArg(Consumer, OPT_fptrauth_vtable_pointer_address_discrimination);
+  if (Opts.PointerAuthVTPtrTypeDiscrimination)
+    GenerateArg(Consumer, OPT_fptrauth_vtable_pointer_type_discrimination);
+  if (Opts.PointerAuthInitFini)
+    GenerateArg(Consumer, OPT_fptrauth_init_fini);
 }
 
 static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args,
                                  DiagnosticsEngine &Diags) {
   Opts.PointerAuthIntrinsics = Args.hasArg(OPT_fptrauth_intrinsics);
+  Opts.PointerAuthCalls = Args.hasArg(OPT_fptrauth_calls);
+  Opts.PointerAuthReturns = Args.hasArg(OPT_fptrauth_returns);
+  Opts.PointerAuthAuthTraps = Args.hasArg(OPT_fptrauth_auth_traps);
+  Opts.PointerAuthVTPtrAddressDiscrimination =
+      Args.hasArg(OPT_fptrauth_vtable_pointer_address_discrimination);
+  Opts.PointerAuthVTPtrTypeDiscrimination =
+      Args.hasArg(OPT_fptrauth_vtable_pointer_type_discrimination);
+  Opts.PointerAuthInitFini = Args.hasArg(OPT_fptrauth_init_fini);
 }
 
 /// Check if input file kind and language standard are compatible.
diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp
index 9ae7664b4b49..a9c45e525c69 100644
--- a/clang/lib/Frontend/FrontendAction.cpp
+++ b/clang/lib/Frontend/FrontendAction.cpp
@@ -71,7 +71,7 @@ public:
     if (Previous)
       Previous->ReaderInitialized(Reader);
   }
-  void IdentifierRead(serialization::IdentID ID,
+  void IdentifierRead(serialization::IdentifierID ID,
                       IdentifierInfo *II) override {
     if (Previous)
       Previous->IdentifierRead(ID, II);
diff --git a/clang/lib/Frontend/InterfaceStubFunctionsConsumer.cpp b/clang/lib/Frontend/InterfaceStubFunctionsConsumer.cpp
index d58f5bb09199..d7cfd23bb0a7 100644
--- a/clang/lib/Frontend/InterfaceStubFunctionsConsumer.cpp
+++ b/clang/lib/Frontend/InterfaceStubFunctionsConsumer.cpp
@@ -33,7 +33,8 @@ class InterfaceStubFunctionsConsumer : public ASTConsumer {
 
     MangledSymbol(const std::string &ParentName, uint8_t Type, uint8_t Binding,
                   std::vector<std::string> Names)
-        : ParentName(ParentName), Type(Type), Binding(Binding), Names(Names) {}
+        : ParentName(ParentName), Type(Type), Binding(Binding),
+          Names(std::move(Names)) {}
   };
   using MangledSymbols = std::map<const NamedDecl *, MangledSymbol>;
 
@@ -295,7 +296,7 @@ public:
       OS << "Symbols:\n";
       for (const auto &E : Symbols) {
         const MangledSymbol &Symbol = E.second;
-        for (auto Name : Symbol.Names) {
+        for (const auto &Name : Symbol.Names) {
           OS << "  - { Name: \""
              << (Symbol.ParentName.empty() || Instance.getLangOpts().CPlusPlus
                      ? ""
diff --git a/clang/lib/Frontend/ModuleDependencyCollector.cpp b/clang/lib/Frontend/ModuleDependencyCollector.cpp
index b88cb60ebdd2..e2883f1e027e 100644
--- a/clang/lib/Frontend/ModuleDependencyCollector.cpp
+++ b/clang/lib/Frontend/ModuleDependencyCollector.cpp
@@ -105,7 +105,7 @@ static bool isCaseSensitivePath(StringRef Path) {
   // already expects when sensitivity isn't setup.
   for (auto &C : Path)
     UpperDest.push_back(toUppercase(C));
-  if (!llvm::sys::fs::real_path(UpperDest, RealDest) && Path.equals(RealDest))
+  if (!llvm::sys::fs::real_path(UpperDest, RealDest) && Path == RealDest)
     return false;
   return true;
 }
diff --git a/clang/lib/Frontend/MultiplexConsumer.cpp b/clang/lib/Frontend/MultiplexConsumer.cpp
index c74bfd86195f..8fdc7f55a500 100644
--- a/clang/lib/Frontend/MultiplexConsumer.cpp
+++ b/clang/lib/Frontend/MultiplexConsumer.cpp
@@ -35,7 +35,7 @@ void MultiplexASTDeserializationListener::ReaderInitialized(
 }
 
 void MultiplexASTDeserializationListener::IdentifierRead(
-    serialization::IdentID ID, IdentifierInfo *II) {
+    serialization::IdentifierID ID, IdentifierInfo *II) {
   for (size_t i = 0, e = Listeners.size(); i != e; ++i)
     Listeners[i]->IdentifierRead(ID, II);
 }
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 3416811e39de..5f02c71f6ca5 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -12,6 +12,7 @@ set(core_files
   stdarg.h
   __stdarg___gnuc_va_list.h
   __stdarg___va_copy.h
+  __stdarg_header_macro.h
   __stdarg_va_arg.h
   __stdarg_va_copy.h
   __stdarg_va_list.h
@@ -19,6 +20,7 @@ set(core_files
   stdbool.h
   stdckdint.h
   stddef.h
+  __stddef_header_macro.h
   __stddef_max_align_t.h
   __stddef_null.h
   __stddef_nullptr_t.h
diff --git a/clang/lib/Headers/__stdarg_header_macro.h b/clang/lib/Headers/__stdarg_header_macro.h
new file mode 100644
index 000000000000..beb92ee02526
--- /dev/null
+++ b/clang/lib/Headers/__stdarg_header_macro.h
@@ -0,0 +1,12 @@
+/*===---- __stdarg_header_macro.h ------------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDARG_H
+#define __STDARG_H
+#endif
diff --git a/clang/lib/Headers/__stddef_header_macro.h b/clang/lib/Headers/__stddef_header_macro.h
new file mode 100644
index 000000000000..db5fb3c0abc1
--- /dev/null
+++ b/clang/lib/Headers/__stddef_header_macro.h
@@ -0,0 +1,12 @@
+/*===---- __stddef_header_macro.h ------------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDDEF_H
+#define __STDDEF_H
+#endif
diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
index 6e557eda1ddd..5785954c9171 100644
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -109,7 +109,7 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
 #endif
 
 /* 7.7 NOP */
-#if !defined(_MSC_VER) || !defined(__aarch64__)
+#if !defined(_MSC_VER) || (!defined(__aarch64__) && !defined(__arm64ec__))
 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
   __builtin_arm_nop();
 }
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 06409c6fc774..3390f0962f67 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -1442,6 +1442,29 @@ _HLSL_BUILTIN_ALIAS(__builtin_elementwise_sqrt)
 float4 sqrt(float4);
 
 //===----------------------------------------------------------------------===//
+// tan builtins
+//===----------------------------------------------------------------------===//
+#ifdef __HLSL_ENABLE_16_BIT
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_tan)
+half tan(half);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_tan)
+half2 tan(half2);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_tan)
+half3 tan(half3);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_tan)
+half4 tan(half4);
+#endif
+
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_tan)
+float tan(float);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_tan)
+float2 tan(float2);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_tan)
+float3 tan(float3);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_tan)
+float4 tan(float4);
+
+//===----------------------------------------------------------------------===//
 // trunc builtins
 //===----------------------------------------------------------------------===//
 
diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap
index 8741968fa7f3..4abfd1d98a63 100644
--- a/clang/lib/Headers/module.modulemap
+++ b/clang/lib/Headers/module.modulemap
@@ -203,6 +203,11 @@ module _Builtin_stdarg [system] {
     export *
   }
 
+  explicit module header_macro {
+    header "__stdarg_header_macro.h"
+    export *
+  }
+
   explicit module va_arg {
     header "__stdarg_va_arg.h"
     export *
@@ -232,6 +237,10 @@ module _Builtin_stdbool [system] {
 module _Builtin_stddef [system] {
   textual header "stddef.h"
 
+  explicit module header_macro {
+    header "__stddef_header_macro.h"
+    export *
+  }
   // __stddef_max_align_t.h is always in this module, even if
   // -fbuiltin-headers-in-system-modules is passed.
   explicit module max_align_t {
diff --git a/clang/lib/Headers/stdarg.h b/clang/lib/Headers/stdarg.h
index 6e7bd604b2df..8292ab907bec 100644
--- a/clang/lib/Headers/stdarg.h
+++ b/clang/lib/Headers/stdarg.h
@@ -14,27 +14,13 @@
  * need to use some of its interfaces. Otherwise this header provides all of
  * the expected interfaces.
  *
- * When clang modules are enabled, this header is a textual header. It ignores
- * its header guard so that multiple submodules can export its interfaces.
- * Take module SM with submodules A and B, whose headers both include stdarg.h
- * When SM.A builds, __STDARG_H will be defined. When SM.B builds, the
- * definition from SM.A will leak when building without local submodule
- * visibility. stdarg.h wouldn't include any of its implementation headers, and
- * SM.B wouldn't import any of the stdarg modules, and SM.B's `export *`
- * wouldn't export any stdarg interfaces as expected. However, since stdarg.h
- * ignores its header guard when building with modules, it all works as
- * expected.
- *
- * When clang modules are not enabled, the header guards can function in the
- * normal simple fashion.
+ * When clang modules are enabled, this header is a textual header to support
+ * the multiple include behavior. As such, it doesn't directly declare anything
+ * so that it doesn't add duplicate declarations to all of its includers'
+ * modules.
  */
-#if !defined(__STDARG_H) || __has_feature(modules) ||                          \
-    defined(__need___va_list) || defined(__need_va_list) ||                    \
-    defined(__need_va_arg) || defined(__need___va_copy) ||                     \
-    defined(__need_va_copy)
-
 #if defined(__MVS__) && __has_include_next(<stdarg.h>)
-#define __STDARG_H
+#include <__stdarg_header_macro.h>
 #undef __need___va_list
 #undef __need_va_list
 #undef __need_va_arg
@@ -46,7 +32,7 @@
 #if !defined(__need___va_list) && !defined(__need_va_list) &&                  \
     !defined(__need_va_arg) && !defined(__need___va_copy) &&                   \
     !defined(__need_va_copy)
-#define __STDARG_H
+#include <__stdarg_header_macro.h>
 #define __need___va_list
 #define __need_va_list
 #define __need_va_arg
@@ -87,5 +73,3 @@
 #endif /* defined(__need_va_copy) */
 
 #endif /* __MVS__ */
-
-#endif
diff --git a/clang/lib/Headers/stddef.h b/clang/lib/Headers/stddef.h
index 9ccc0a68fbff..8985c526e8fc 100644
--- a/clang/lib/Headers/stddef.h
+++ b/clang/lib/Headers/stddef.h
@@ -14,30 +14,13 @@
  * need to use some of its interfaces. Otherwise this header provides all of
  * the expected interfaces.
  *
- * When clang modules are enabled, this header is a textual header. It ignores
- * its header guard so that multiple submodules can export its interfaces.
- * Take module SM with submodules A and B, whose headers both include stddef.h
- * When SM.A builds, __STDDEF_H will be defined. When SM.B builds, the
- * definition from SM.A will leak when building without local submodule
- * visibility. stddef.h wouldn't include any of its implementation headers, and
- * SM.B wouldn't import any of the stddef modules, and SM.B's `export *`
- * wouldn't export any stddef interfaces as expected. However, since stddef.h
- * ignores its header guard when building with modules, it all works as
- * expected.
- *
- * When clang modules are not enabled, the header guards can function in the
- * normal simple fashion.
+ * When clang modules are enabled, this header is a textual header to support
+ * the multiple include behavior. As such, it doesn't directly declare anything
+ * so that it doesn't add duplicate declarations to all of its includers'
+ * modules.
  */
-#if !defined(__STDDEF_H) || __has_feature(modules) ||                          \
-    (defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1) ||        \
-    defined(__need_ptrdiff_t) || defined(__need_size_t) ||                     \
-    defined(__need_rsize_t) || defined(__need_wchar_t) ||                      \
-    defined(__need_NULL) || defined(__need_nullptr_t) ||                       \
-    defined(__need_unreachable) || defined(__need_max_align_t) ||              \
-    defined(__need_offsetof) || defined(__need_wint_t)
-
 #if defined(__MVS__) && __has_include_next(<stddef.h>)
-#define __STDDEF_H
+#include <__stddef_header_macro.h>
 #undef __need_ptrdiff_t
 #undef __need_size_t
 #undef __need_rsize_t
@@ -57,7 +40,7 @@
     !defined(__need_NULL) && !defined(__need_nullptr_t) &&                     \
     !defined(__need_unreachable) && !defined(__need_max_align_t) &&            \
     !defined(__need_offsetof) && !defined(__need_wint_t)
-#define __STDDEF_H
+#include <__stddef_header_macro.h>
 #define __need_ptrdiff_t
 #define __need_size_t
 /* ISO9899:2011 7.20 (C11 Annex K): Define rsize_t if __STDC_WANT_LIB_EXT1__ is
@@ -137,4 +120,3 @@ __WINT_TYPE__ directly; accommodate both by requiring __need_wint_t */
 #endif /* __need_wint_t */
 
 #endif /* __MVS__ */
-#endif
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index 0b22139ebe81..8e7386449dce 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -183,7 +183,7 @@ static MacroDiag shouldWarnOnMacroDef(Preprocessor &PP, IdentifierInfo *II) {
     return isFeatureTestMacro(Text) ? MD_NoWarn : MD_ReservedMacro;
   if (II->isKeyword(Lang))
     return MD_KeywordDef;
-  if (Lang.CPlusPlus11 && (Text.equals("override") || Text.equals("final")))
+  if (Lang.CPlusPlus11 && (Text == "override" || Text == "final"))
     return MD_KeywordDef;
   return MD_NoWarn;
 }
@@ -1918,7 +1918,8 @@ bool Preprocessor::checkModuleIsAvailable(const LangOptions &LangOpts,
     // FIXME: Track the location at which the requirement was specified, and
     // use it here.
     Diags.Report(M.DefinitionLoc, diag::err_module_unavailable)
-        << M.getFullModuleName() << Requirement.second << Requirement.first;
+        << M.getFullModuleName() << Requirement.RequiredState
+        << Requirement.FeatureName;
   }
   return true;
 }
@@ -2806,7 +2807,7 @@ static bool isConfigurationPattern(Token &MacroName, MacroInfo *MI,
         if (TrimmedValue.ends_with("__"))
           TrimmedValue = TrimmedValue.drop_back(2);
       }
-      return TrimmedValue.equals(MacroText);
+      return TrimmedValue == MacroText;
     } else {
       return false;
     }
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index a5f22f01682d..a478e0badb0c 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -1714,8 +1714,6 @@ void Preprocessor::ExpandBuiltinMacro(Token &Tok) {
           return llvm::StringSwitch<bool>(II->getName())
               .Case("__array_rank", true)
               .Case("__array_extent", true)
-              .Case("__reference_binds_to_temporary", true)
-              .Case("__reference_constructs_from_temporary", true)
 #define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) .Case("__" #Trait, true)
 #include "clang/Basic/TransformTypeTraits.def"
               .Default(false);
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 4e4b05b21383..7fbaee5690bd 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -2587,25 +2587,30 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes(
     Parser &P;
     Declarator &D;
     Decl *ThisDecl;
+    bool Entered;
 
     InitializerScopeRAII(Parser &P, Declarator &D, Decl *ThisDecl)
-        : P(P), D(D), ThisDecl(ThisDecl) {
+        : P(P), D(D), ThisDecl(ThisDecl), Entered(false) {
       if (ThisDecl && P.getLangOpts().CPlusPlus) {
         Scope *S = nullptr;
         if (D.getCXXScopeSpec().isSet()) {
           P.EnterScope(0);
           S = P.getCurScope();
         }
-        P.Actions.ActOnCXXEnterDeclInitializer(S, ThisDecl);
+        if (ThisDecl && !ThisDecl->isInvalidDecl()) {
+          P.Actions.ActOnCXXEnterDeclInitializer(S, ThisDecl);
+          Entered = true;
+        }
       }
     }
-    ~InitializerScopeRAII() { pop(); }
-    void pop() {
+    ~InitializerScopeRAII() {
       if (ThisDecl && P.getLangOpts().CPlusPlus) {
         Scope *S = nullptr;
         if (D.getCXXScopeSpec().isSet())
           S = P.getCurScope();
-        P.Actions.ActOnCXXExitDeclInitializer(S, ThisDecl);
+
+        if (Entered)
+          P.Actions.ActOnCXXExitDeclInitializer(S, ThisDecl);
         if (S)
           P.ExitScope();
       }
@@ -2736,8 +2741,6 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes(
         FRI->RangeExpr = Init;
       }
 
-      InitScope.pop();
-
       if (Init.isInvalid()) {
         SmallVector<tok::TokenKind, 2> StopTokens;
         StopTokens.push_back(tok::comma);
@@ -2785,8 +2788,6 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes(
 
     bool SawError = ParseExpressionList(Exprs, ExpressionStarts);
 
-    InitScope.pop();
-
     if (SawError) {
       if (ThisVarDecl && PP.isCodeCompletionReached() && !CalledSignatureHelp) {
         Actions.ProduceConstructorSignatureHelp(
@@ -2818,8 +2819,6 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes(
     PreferredType.enterVariableInit(Tok.getLocation(), ThisDecl);
     ExprResult Init(ParseBraceInitializer());
 
-    InitScope.pop();
-
     if (Init.isInvalid()) {
       Actions.ActOnInitializerError(ThisDecl);
     } else
@@ -3890,7 +3889,7 @@ void Parser::ParseDeclarationSpecifiers(
       // parse errors if this really is a __declspec attribute. Attempt to
       // recognize that scenario and recover gracefully.
       if (!getLangOpts().DeclSpecKeyword && Tok.is(tok::identifier) &&
-          Tok.getIdentifierInfo()->getName().equals("__declspec")) {
+          Tok.getIdentifierInfo()->getName() == "__declspec") {
         Diag(Loc, diag::err_ms_attributes_not_enabled);
 
         // The next token should be an open paren. If it is, eat the entire
@@ -4332,9 +4331,12 @@ void Parser::ParseDeclarationSpecifiers(
 
     // friend
     case tok::kw_friend:
-      if (DSContext == DeclSpecContext::DSC_class)
+      if (DSContext == DeclSpecContext::DSC_class) {
         isInvalid = DS.SetFriendSpec(Loc, PrevSpec, DiagID);
-      else {
+        Scope *CurS = getCurScope();
+        if (!isInvalid && CurS)
+          CurS->setFlags(CurS->getFlags() | Scope::FriendScope);
+      } else {
         PrevSpec = ""; // not actually used by the diagnostic
         DiagID = diag::err_friend_invalid_in_context;
         isInvalid = true;
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index 8e0e86824829..65ddebca49bc 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -1779,9 +1779,8 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
           tok::kw___is_union,
           tok::kw___is_unsigned,
           tok::kw___is_void,
-          tok::kw___is_volatile,
-          tok::kw___reference_binds_to_temporary,
-          tok::kw___reference_constructs_from_temporary))
+          tok::kw___is_volatile
+      ))
     // GNU libstdc++ 4.2 and libc++ use certain intrinsic names as the
     // name of struct templates, but some are keywords in GCC >= 4.3
     // and Clang. Therefore, when we see the token sequence "struct
@@ -4549,9 +4548,9 @@ static bool IsBuiltInOrStandardCXX11Attribute(IdentifierInfo *AttrName,
   case ParsedAttr::AT_Unlikely:
     return true;
   case ParsedAttr::AT_WarnUnusedResult:
-    return !ScopeName && AttrName->getName().equals("nodiscard");
+    return !ScopeName && AttrName->getName() == "nodiscard";
   case ParsedAttr::AT_Unused:
-    return !ScopeName && AttrName->getName().equals("maybe_unused");
+    return !ScopeName && AttrName->getName() == "maybe_unused";
   default:
     return false;
   }
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index 7d6febb04a82..0551b8314f9f 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -1166,7 +1166,6 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind,
           REVERTIBLE_TYPE_TRAIT(__is_void);
           REVERTIBLE_TYPE_TRAIT(__is_volatile);
           REVERTIBLE_TYPE_TRAIT(__reference_binds_to_temporary);
-          REVERTIBLE_TYPE_TRAIT(__reference_constructs_from_temporary);
 #define TRANSFORM_TYPE_TRAIT_DEF(_, Trait)                                     \
   REVERTIBLE_TYPE_TRAIT(RTT_JOIN(__, Trait));
 #include "clang/Basic/TransformTypeTraits.def"
@@ -2039,7 +2038,8 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
         if (Tok.is(tok::colon)) {
           // Consume ':'
           ColonLocFirst = ConsumeToken();
-          Length = Actions.CorrectDelayedTyposInExpr(ParseExpression());
+          if (Tok.isNot(tok::r_square))
+            Length = Actions.CorrectDelayedTyposInExpr(ParseExpression());
         }
       } else if (ArgExprs.size() <= 1 && getLangOpts().OpenMP) {
         ColonProtectionRAIIObject RAII(*this);
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index 0d2ad980696f..825031da358a 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -407,6 +407,20 @@ bool Parser::ParseOptionalCXXScopeSpecifier(
       continue;
     }
 
+    switch (Tok.getKind()) {
+#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
+#include "clang/Basic/TransformTypeTraits.def"
+      if (!NextToken().is(tok::l_paren)) {
+        Tok.setKind(tok::identifier);
+        Diag(Tok, diag::ext_keyword_as_ident)
+            << Tok.getIdentifierInfo()->getName() << 0;
+        continue;
+      }
+      [[fallthrough]];
+    default:
+      break;
+    }
+
     // The rest of the nested-name-specifier possibilities start with
     // tok::identifier.
     if (Tok.isNot(tok::identifier))
diff --git a/clang/lib/Parse/ParseHLSL.cpp b/clang/lib/Parse/ParseHLSL.cpp
index f4cbece31f18..4b72afe9986e 100644
--- a/clang/lib/Parse/ParseHLSL.cpp
+++ b/clang/lib/Parse/ParseHLSL.cpp
@@ -174,7 +174,7 @@ void Parser::ParseHLSLAnnotations(ParsedAttributes &Attrs,
       ArgExprs.push_back(ParseIdentifierLoc());
 
       // Add numeric_constant for fix-it.
-      if (SpaceStr.equals("space") && Tok.is(tok::numeric_constant))
+      if (SpaceStr == "space" && Tok.is(tok::numeric_constant))
         fixSeparateAttrArgAndNumber(SpaceStr, SpaceLoc, Tok, ArgExprs, *this,
                                     Actions.Context, PP);
     }
@@ -183,6 +183,94 @@ void Parser::ParseHLSLAnnotations(ParsedAttributes &Attrs,
       return;
     }
   } break;
+  case ParsedAttr::AT_HLSLPackOffset: {
+    // Parse 'packoffset( c[Subcomponent][.component] )'.
+    // Check '('.
+    if (ExpectAndConsume(tok::l_paren, diag::err_expected_lparen_after)) {
+      SkipUntil(tok::r_paren, StopAtSemi); // skip through )
+      return;
+    }
+    // Check c[Subcomponent] as an identifier.
+    if (!Tok.is(tok::identifier)) {
+      Diag(Tok.getLocation(), diag::err_expected) << tok::identifier;
+      SkipUntil(tok::r_paren, StopAtSemi); // skip through )
+      return;
+    }
+    StringRef OffsetStr = Tok.getIdentifierInfo()->getName();
+    SourceLocation SubComponentLoc = Tok.getLocation();
+    if (OffsetStr[0] != 'c') {
+      Diag(Tok.getLocation(), diag::err_hlsl_packoffset_invalid_reg)
+          << OffsetStr;
+      SkipUntil(tok::r_paren, StopAtSemi); // skip through )
+      return;
+    }
+    OffsetStr = OffsetStr.substr(1);
+    unsigned SubComponent = 0;
+    if (!OffsetStr.empty()) {
+      // Make sure SubComponent is a number.
+      if (OffsetStr.getAsInteger(10, SubComponent)) {
+        Diag(SubComponentLoc.getLocWithOffset(1),
+             diag::err_hlsl_unsupported_register_number);
+        SkipUntil(tok::r_paren, StopAtSemi); // skip through )
+        return;
+      }
+    }
+    unsigned Component = 0;
+    ConsumeToken(); // consume identifier.
+    SourceLocation ComponentLoc;
+    if (Tok.is(tok::period)) {
+      ConsumeToken(); // consume period.
+      if (!Tok.is(tok::identifier)) {
+        Diag(Tok.getLocation(), diag::err_expected) << tok::identifier;
+        SkipUntil(tok::r_paren, StopAtSemi); // skip through )
+        return;
+      }
+      StringRef ComponentStr = Tok.getIdentifierInfo()->getName();
+      ComponentLoc = Tok.getLocation();
+      ConsumeToken(); // consume identifier.
+      // Make sure Component is a single character.
+      if (ComponentStr.size() != 1) {
+        Diag(ComponentLoc, diag::err_hlsl_unsupported_component)
+            << ComponentStr;
+        SkipUntil(tok::r_paren, StopAtSemi); // skip through )
+        return;
+      }
+      switch (ComponentStr[0]) {
+      case 'x':
+      case 'r':
+        Component = 0;
+        break;
+      case 'y':
+      case 'g':
+        Component = 1;
+        break;
+      case 'z':
+      case 'b':
+        Component = 2;
+        break;
+      case 'w':
+      case 'a':
+        Component = 3;
+        break;
+      default:
+        Diag(ComponentLoc, diag::err_hlsl_unsupported_component)
+            << ComponentStr;
+        SkipUntil(tok::r_paren, StopAtSemi); // skip through )
+        return;
+      }
+    }
+    ASTContext &Ctx = Actions.getASTContext();
+    QualType SizeTy = Ctx.getSizeType();
+    uint64_t SizeTySize = Ctx.getTypeSize(SizeTy);
+    ArgExprs.push_back(IntegerLiteral::Create(
+        Ctx, llvm::APInt(SizeTySize, SubComponent), SizeTy, SubComponentLoc));
+    ArgExprs.push_back(IntegerLiteral::Create(
+        Ctx, llvm::APInt(SizeTySize, Component), SizeTy, ComponentLoc));
+    if (ExpectAndConsume(tok::r_paren, diag::err_expected)) {
+      SkipUntil(tok::r_paren, StopAtSemi); // skip through )
+      return;
+    }
+  } break;
   case ParsedAttr::UnknownAttribute:
     Diag(Loc, diag::err_unknown_hlsl_semantic) << II;
     return;
diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp
index 2d1ec6539b2f..0e10632c8317 100644
--- a/clang/lib/Parse/ParseOpenACC.cpp
+++ b/clang/lib/Parse/ParseOpenACC.cpp
@@ -100,10 +100,18 @@ OpenACCClauseKind getOpenACCClauseKind(Token Tok) {
       .Case("auto", OpenACCClauseKind::Auto)
       .Case("bind", OpenACCClauseKind::Bind)
       .Case("create", OpenACCClauseKind::Create)
+      .Case("pcreate", OpenACCClauseKind::PCreate)
+      .Case("present_or_create", OpenACCClauseKind::PresentOrCreate)
       .Case("collapse", OpenACCClauseKind::Collapse)
       .Case("copy", OpenACCClauseKind::Copy)
+      .Case("pcopy", OpenACCClauseKind::PCopy)
+      .Case("present_or_copy", OpenACCClauseKind::PresentOrCopy)
       .Case("copyin", OpenACCClauseKind::CopyIn)
+      .Case("pcopyin", OpenACCClauseKind::PCopyIn)
+      .Case("present_or_copyin", OpenACCClauseKind::PresentOrCopyIn)
       .Case("copyout", OpenACCClauseKind::CopyOut)
+      .Case("pcopyout", OpenACCClauseKind::PCopyOut)
+      .Case("present_or_copyout", OpenACCClauseKind::PresentOrCopyOut)
       .Case("default", OpenACCClauseKind::Default)
       .Case("default_async", OpenACCClauseKind::DefaultAsync)
       .Case("delete", OpenACCClauseKind::Delete)
@@ -488,9 +496,17 @@ ClauseParensKind getClauseParensKind(OpenACCDirectiveKind DirKind,
   case OpenACCClauseKind::Default:
   case OpenACCClauseKind::If:
   case OpenACCClauseKind::Create:
+  case OpenACCClauseKind::PCreate:
+  case OpenACCClauseKind::PresentOrCreate:
   case OpenACCClauseKind::Copy:
+  case OpenACCClauseKind::PCopy:
+  case OpenACCClauseKind::PresentOrCopy:
   case OpenACCClauseKind::CopyIn:
+  case OpenACCClauseKind::PCopyIn:
+  case OpenACCClauseKind::PresentOrCopyIn:
   case OpenACCClauseKind::CopyOut:
+  case OpenACCClauseKind::PCopyOut:
+  case OpenACCClauseKind::PresentOrCopyOut:
   case OpenACCClauseKind::UseDevice:
   case OpenACCClauseKind::NoCreate:
   case OpenACCClauseKind::Present:
@@ -851,7 +867,6 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
   SemaOpenACC::OpenACCParsedClause ParsedClause(DirKind, ClauseKind, ClauseLoc);
 
   if (ClauseHasRequiredParens(DirKind, ClauseKind)) {
-    ParsedClause.setLParenLoc(getCurToken().getLocation());
     if (Parens.expectAndConsume()) {
       // We are missing a paren, so assume that the person just forgot the
       // parameter.  Return 'false' so we try to continue on and parse the next
@@ -860,6 +875,7 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
                 Parser::StopBeforeMatch);
       return OpenACCCanContinue();
     }
+    ParsedClause.setLParenLoc(Parens.getOpenLocation());
 
     switch (ClauseKind) {
     case OpenACCClauseKind::Default: {
@@ -897,16 +913,26 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
       break;
     }
     case OpenACCClauseKind::CopyIn:
-      tryParseAndConsumeSpecialTokenKind(
+    case OpenACCClauseKind::PCopyIn:
+    case OpenACCClauseKind::PresentOrCopyIn: {
+      bool IsReadOnly = tryParseAndConsumeSpecialTokenKind(
           *this, OpenACCSpecialTokenKind::ReadOnly, ClauseKind);
-      ParseOpenACCVarList();
+      ParsedClause.setVarListDetails(ParseOpenACCVarList(), IsReadOnly,
+                                     /*IsZero=*/false);
       break;
+    }
     case OpenACCClauseKind::Create:
+    case OpenACCClauseKind::PCreate:
+    case OpenACCClauseKind::PresentOrCreate:
     case OpenACCClauseKind::CopyOut:
-      tryParseAndConsumeSpecialTokenKind(*this, OpenACCSpecialTokenKind::Zero,
-                                         ClauseKind);
-      ParseOpenACCVarList();
+    case OpenACCClauseKind::PCopyOut:
+    case OpenACCClauseKind::PresentOrCopyOut: {
+      bool IsZero = tryParseAndConsumeSpecialTokenKind(
+          *this, OpenACCSpecialTokenKind::Zero, ClauseKind);
+      ParsedClause.setVarListDetails(ParseOpenACCVarList(),
+                                     /*IsReadOnly=*/false, IsZero);
       break;
+    }
     case OpenACCClauseKind::Reduction:
       // If we're missing a clause-kind (or it is invalid), see if we can parse
       // the var-list anyway.
@@ -919,23 +945,29 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
       // make sure we get the right differentiator.
       assert(DirKind == OpenACCDirectiveKind::Update);
       [[fallthrough]];
-    case OpenACCClauseKind::Attach:
-    case OpenACCClauseKind::Copy:
     case OpenACCClauseKind::Delete:
     case OpenACCClauseKind::Detach:
     case OpenACCClauseKind::Device:
     case OpenACCClauseKind::DeviceResident:
-    case OpenACCClauseKind::DevicePtr:
-    case OpenACCClauseKind::FirstPrivate:
     case OpenACCClauseKind::Host:
     case OpenACCClauseKind::Link:
-    case OpenACCClauseKind::NoCreate:
-    case OpenACCClauseKind::Present:
     case OpenACCClauseKind::UseDevice:
       ParseOpenACCVarList();
       break;
+    case OpenACCClauseKind::Attach:
+    case OpenACCClauseKind::DevicePtr:
+      ParsedClause.setVarListDetails(ParseOpenACCVarList(),
+                                     /*IsReadOnly=*/false, /*IsZero=*/false);
+      break;
+    case OpenACCClauseKind::Copy:
+    case OpenACCClauseKind::PCopy:
+    case OpenACCClauseKind::PresentOrCopy:
+    case OpenACCClauseKind::FirstPrivate:
+    case OpenACCClauseKind::NoCreate:
+    case OpenACCClauseKind::Present:
     case OpenACCClauseKind::Private:
-      ParsedClause.setVarListDetails(ParseOpenACCVarList());
+      ParsedClause.setVarListDetails(ParseOpenACCVarList(),
+                                     /*IsReadOnly=*/false, /*IsZero=*/false);
       break;
     case OpenACCClauseKind::Collapse: {
       tryParseAndConsumeSpecialTokenKind(*this, OpenACCSpecialTokenKind::Force,
@@ -1015,8 +1047,8 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
       return OpenACCCannotContinue();
 
   } else if (ClauseHasOptionalParens(DirKind, ClauseKind)) {
-    ParsedClause.setLParenLoc(getCurToken().getLocation());
     if (!Parens.consumeOpen()) {
+      ParsedClause.setLParenLoc(Parens.getOpenLocation());
       switch (ClauseKind) {
       case OpenACCClauseKind::Self: {
         assert(DirKind != OpenACCDirectiveKind::Update);
@@ -1048,7 +1080,12 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
         break;
       }
       case OpenACCClauseKind::Async: {
-        ExprResult AsyncArg = ParseOpenACCAsyncArgument();
+        ExprResult AsyncArg =
+            ParseOpenACCAsyncArgument(OpenACCDirectiveKind::Invalid,
+                                      OpenACCClauseKind::Async, ClauseLoc)
+                .first;
+        ParsedClause.setIntExprDetails(AsyncArg.isUsable() ? AsyncArg.get()
+                                                           : nullptr);
         if (AsyncArg.isInvalid()) {
           Parens.skipToEnd();
           return OpenACCCanContinue();
@@ -1061,19 +1098,29 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
           return OpenACCCanContinue();
         }
         break;
-      case OpenACCClauseKind::Wait:
-        if (ParseOpenACCWaitArgument(ClauseLoc,
-                                     /*IsDirective=*/false)) {
+      case OpenACCClauseKind::Wait: {
+        OpenACCWaitParseInfo Info =
+            ParseOpenACCWaitArgument(ClauseLoc,
+                                     /*IsDirective=*/false);
+        if (Info.Failed) {
           Parens.skipToEnd();
           return OpenACCCanContinue();
         }
+
+        ParsedClause.setWaitDetails(Info.DevNumExpr, Info.QueuesLoc,
+                                    std::move(Info.QueueIdExprs));
         break;
+      }
       default:
         llvm_unreachable("Not an optional parens type?");
       }
       ParsedClause.setEndLoc(getCurToken().getLocation());
       if (Parens.consumeClose())
         return OpenACCCannotContinue();
+    } else {
+      // If we have optional parens, make sure we set the end-location to the
+      // clause, as we are a 'single token' clause.
+      ParsedClause.setEndLoc(ClauseLoc);
     }
   }
   return OpenACCSuccess(
@@ -1087,15 +1134,19 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
 /// defined in the C header file and the Fortran openacc module. The special
 /// values are negative values, so as not to conflict with a user-specified
 /// nonnegative async-argument.
-ExprResult Parser::ParseOpenACCAsyncArgument() {
-  return getActions().CorrectDelayedTyposInExpr(ParseAssignmentExpression());
+Parser::OpenACCIntExprParseResult
+Parser::ParseOpenACCAsyncArgument(OpenACCDirectiveKind DK, OpenACCClauseKind CK,
+                                  SourceLocation Loc) {
+  return ParseOpenACCIntExpr(DK, CK, Loc);
 }
 
 /// OpenACC 3.3, section 2.16:
 /// In this section and throughout the specification, the term wait-argument
 /// means:
 /// [ devnum : int-expr : ] [ queues : ] async-argument-list
-bool Parser::ParseOpenACCWaitArgument(SourceLocation Loc, bool IsDirective) {
+Parser::OpenACCWaitParseInfo
+Parser::ParseOpenACCWaitArgument(SourceLocation Loc, bool IsDirective) {
+  OpenACCWaitParseInfo Result;
   // [devnum : int-expr : ]
   if (isOpenACCSpecialToken(OpenACCSpecialTokenKind::DevNum, Tok) &&
       NextToken().is(tok::colon)) {
@@ -1104,25 +1155,30 @@ bool Parser::ParseOpenACCWaitArgument(SourceLocation Loc, bool IsDirective) {
     // Consume colon.
     ConsumeToken();
 
-    ExprResult IntExpr =
-        ParseOpenACCIntExpr(IsDirective ? OpenACCDirectiveKind::Wait
-                                        : OpenACCDirectiveKind::Invalid,
-                            IsDirective ? OpenACCClauseKind::Invalid
-                                        : OpenACCClauseKind::Wait,
-                            Loc)
-            .first;
-    if (IntExpr.isInvalid())
-      return true;
+    OpenACCIntExprParseResult Res = ParseOpenACCIntExpr(
+        IsDirective ? OpenACCDirectiveKind::Wait
+                    : OpenACCDirectiveKind::Invalid,
+        IsDirective ? OpenACCClauseKind::Invalid : OpenACCClauseKind::Wait,
+        Loc);
+    if (Res.first.isInvalid() &&
+        Res.second == OpenACCParseCanContinue::Cannot) {
+      Result.Failed = true;
+      return Result;
+    }
 
-    if (ExpectAndConsume(tok::colon))
-      return true;
+    if (ExpectAndConsume(tok::colon)) {
+      Result.Failed = true;
+      return Result;
+    }
+
+    Result.DevNumExpr = Res.first.get();
   }
 
   // [ queues : ]
   if (isOpenACCSpecialToken(OpenACCSpecialTokenKind::Queues, Tok) &&
       NextToken().is(tok::colon)) {
     // Consume queues.
-    ConsumeToken();
+    Result.QueuesLoc = ConsumeToken();
     // Consume colon.
     ConsumeToken();
   }
@@ -1134,18 +1190,29 @@ bool Parser::ParseOpenACCWaitArgument(SourceLocation Loc, bool IsDirective) {
   bool FirstArg = true;
   while (!getCurToken().isOneOf(tok::r_paren, tok::annot_pragma_openacc_end)) {
     if (!FirstArg) {
-      if (ExpectAndConsume(tok::comma))
-        return true;
+      if (ExpectAndConsume(tok::comma)) {
+        Result.Failed = true;
+        return Result;
+      }
     }
     FirstArg = false;
 
-    ExprResult CurArg = ParseOpenACCAsyncArgument();
+    OpenACCIntExprParseResult Res = ParseOpenACCAsyncArgument(
+        IsDirective ? OpenACCDirectiveKind::Wait
+                    : OpenACCDirectiveKind::Invalid,
+        IsDirective ? OpenACCClauseKind::Invalid : OpenACCClauseKind::Wait,
+        Loc);
 
-    if (CurArg.isInvalid())
-      return true;
+    if (Res.first.isInvalid() &&
+        Res.second == OpenACCParseCanContinue::Cannot) {
+      Result.Failed = true;
+      return Result;
+    }
+
+    Result.QueueIdExprs.push_back(Res.first.get());
   }
 
-  return false;
+  return Result;
 }
 
 ExprResult Parser::ParseOpenACCIDExpression() {
@@ -1314,7 +1381,7 @@ Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() {
       break;
     case OpenACCDirectiveKind::Wait:
       // OpenACC has an optional paren-wrapped 'wait-argument'.
-      if (ParseOpenACCWaitArgument(StartLoc, /*IsDirective=*/true))
+      if (ParseOpenACCWaitArgument(StartLoc, /*IsDirective=*/true).Failed)
         T.skipToEnd();
       else
         T.consumeClose();
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 53d89ce2fa3e..ca2c6d69eb98 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -740,7 +740,7 @@ static bool parseDeclareSimdClauses(
       BS = Out;
       BSRange = SourceRange(Tok.getLocation(), Tok.getEndLoc());
       P.ConsumeToken();
-    } else if (ClauseName.equals("simdlen")) {
+    } else if (ClauseName == "simdlen") {
       if (SimdLen.isUsable()) {
         P.Diag(Tok, diag::err_omp_more_one_clause)
             << getOpenMPDirectiveName(OMPD_declare_simd) << ClauseName << 0;
@@ -1106,7 +1106,7 @@ static ExprResult parseContextScore(Parser &P) {
   llvm::SmallString<16> Buffer;
   StringRef SelectorName =
       P.getPreprocessor().getSpelling(P.getCurToken(), Buffer);
-  if (!SelectorName.equals("score"))
+  if (SelectorName != "score")
     return ScoreExpr;
   (void)P.ConsumeToken();
   SourceLocation RLoc;
@@ -3107,34 +3107,14 @@ bool Parser::ParseOpenMPSimpleVarList(
 }
 
 OMPClause *Parser::ParseOpenMPSizesClause() {
-  SourceLocation ClauseNameLoc = ConsumeToken();
+  SourceLocation ClauseNameLoc, OpenLoc, CloseLoc;
   SmallVector<Expr *, 4> ValExprs;
-
-  BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end);
-  if (T.consumeOpen()) {
-    Diag(Tok, diag::err_expected) << tok::l_paren;
+  if (ParseOpenMPExprListClause(OMPC_sizes, ClauseNameLoc, OpenLoc, CloseLoc,
+                                ValExprs))
     return nullptr;
-  }
-
-  while (true) {
-    ExprResult Val = ParseConstantExpression();
-    if (!Val.isUsable()) {
-      T.skipToEnd();
-      return nullptr;
-    }
-
-    ValExprs.push_back(Val.get());
-
-    if (Tok.is(tok::r_paren) || Tok.is(tok::annot_pragma_openmp_end))
-      break;
-
-    ExpectAndConsume(tok::comma);
-  }
-
-  T.consumeClose();
 
-  return Actions.OpenMP().ActOnOpenMPSizesClause(
-      ValExprs, ClauseNameLoc, T.getOpenLocation(), T.getCloseLocation());
+  return Actions.OpenMP().ActOnOpenMPSizesClause(ValExprs, ClauseNameLoc,
+                                                 OpenLoc, CloseLoc);
 }
 
 OMPClause *Parser::ParseOpenMPUsesAllocatorClause(OpenMPDirectiveKind DKind) {
@@ -4316,17 +4296,20 @@ bool Parser::parseMapTypeModifiers(SemaOpenMP::OpenMPVarListDataTy &Data) {
 }
 
 /// Checks if the token is a valid map-type.
-/// FIXME: It will return an OpenMPMapModifierKind if that's what it parses.
+/// If it is not MapType kind, OMPC_MAP_unknown is returned.
 static OpenMPMapClauseKind isMapType(Parser &P) {
   Token Tok = P.getCurToken();
   // The map-type token can be either an identifier or the C++ delete keyword.
   if (!Tok.isOneOf(tok::identifier, tok::kw_delete))
     return OMPC_MAP_unknown;
   Preprocessor &PP = P.getPreprocessor();
-  OpenMPMapClauseKind MapType =
-      static_cast<OpenMPMapClauseKind>(getOpenMPSimpleClauseType(
-          OMPC_map, PP.getSpelling(Tok), P.getLangOpts()));
-  return MapType;
+  unsigned MapType =
+      getOpenMPSimpleClauseType(OMPC_map, PP.getSpelling(Tok), P.getLangOpts());
+  if (MapType == OMPC_MAP_to || MapType == OMPC_MAP_from ||
+      MapType == OMPC_MAP_tofrom || MapType == OMPC_MAP_alloc ||
+      MapType == OMPC_MAP_delete || MapType == OMPC_MAP_release)
+    return static_cast<OpenMPMapClauseKind>(MapType);
+  return OMPC_MAP_unknown;
 }
 
 /// Parse map-type in map clause.
@@ -5023,3 +5006,38 @@ OMPClause *Parser::ParseOpenMPVarListClause(OpenMPDirectiveKind DKind,
   OMPVarListLocTy Locs(Loc, LOpen, Data.RLoc);
   return Actions.OpenMP().ActOnOpenMPVarListClause(Kind, Vars, Locs, Data);
 }
+
+bool Parser::ParseOpenMPExprListClause(OpenMPClauseKind Kind,
+                                       SourceLocation &ClauseNameLoc,
+                                       SourceLocation &OpenLoc,
+                                       SourceLocation &CloseLoc,
+                                       SmallVectorImpl<Expr *> &Exprs,
+                                       bool ReqIntConst) {
+  assert(getOpenMPClauseName(Kind) == PP.getSpelling(Tok) &&
+         "Expected parsing to start at clause name");
+  ClauseNameLoc = ConsumeToken();
+
+  // Parse inside of '(' and ')'.
+  BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end);
+  if (T.consumeOpen()) {
+    Diag(Tok, diag::err_expected) << tok::l_paren;
+    return true;
+  }
+
+  // Parse the list with interleaved commas.
+  do {
+    ExprResult Val =
+        ReqIntConst ? ParseConstantExpression() : ParseAssignmentExpression();
+    if (!Val.isUsable()) {
+      // Encountered something other than an expression; abort to ')'.
+      T.skipToEnd();
+      return true;
+    }
+    Exprs.push_back(Val.get());
+  } while (TryConsumeToken(tok::comma));
+
+  bool Result = T.consumeClose();
+  OpenLoc = T.getOpenLocation();
+  CloseLoc = T.getCloseLocation();
+  return Result;
+}
diff --git a/clang/lib/Sema/Scope.cpp b/clang/lib/Sema/Scope.cpp
index 11a41753a1bd..c08073e80ff3 100644
--- a/clang/lib/Sema/Scope.cpp
+++ b/clang/lib/Sema/Scope.cpp
@@ -229,6 +229,7 @@ void Scope::dumpImpl(raw_ostream &OS) const {
       {ClassInheritanceScope, "ClassInheritanceScope"},
       {CatchScope, "CatchScope"},
       {OpenACCComputeConstructScope, "OpenACCComputeConstructScope"},
+      {FriendScope, "FriendScope"},
   };
 
   for (auto Info : FlagInfo) {
diff --git a/clang/lib/Sema/SemaAccess.cpp b/clang/lib/Sema/SemaAccess.cpp
index 6a707eeb66d0..979a64b065f3 100644
--- a/clang/lib/Sema/SemaAccess.cpp
+++ b/clang/lib/Sema/SemaAccess.cpp
@@ -1473,12 +1473,32 @@ static Sema::AccessResult CheckAccess(Sema &S, SourceLocation Loc,
   // specifier, like this:
   //   A::private_type A::foo() { ... }
   //
-  // Or we might be parsing something that will turn out to be a friend:
-  //   void foo(A::private_type);
-  //   void B::foo(A::private_type);
+  // friend declaration should not be delayed because it may lead to incorrect
+  // redeclaration chain, such as:
+  //   class D {
+  //    class E{
+  //     class F{};
+  //     friend  void foo(D::E::F& q);
+  //    };
+  //    friend  void foo(D::E::F& q);
+  //   };
   if (S.DelayedDiagnostics.shouldDelayDiagnostics()) {
-    S.DelayedDiagnostics.add(DelayedDiagnostic::makeAccess(Loc, Entity));
-    return Sema::AR_delayed;
+    // [class.friend]p9:
+    // A member nominated by a friend declaration shall be accessible in the
+    // class containing the friend declaration. The meaning of the friend
+    // declaration is the same whether the friend declaration appears in the
+    // private, protected, or public ([class.mem]) portion of the class
+    // member-specification.
+    Scope *TS = S.getCurScope();
+    bool IsFriendDeclaration = false;
+    while (TS && !IsFriendDeclaration) {
+      IsFriendDeclaration = TS->isFriendScope();
+      TS = TS->getParent();
+    }
+    if (!IsFriendDeclaration) {
+      S.DelayedDiagnostics.add(DelayedDiagnostic::makeAccess(Loc, Entity));
+      return Sema::AR_delayed;
+    }
   }
 
   EffectiveContext EC(S.CurContext);
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index cf8840c63024..54789dde5069 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2346,10 +2346,8 @@ static bool BuiltinCpu(Sema &S, const TargetInfo &TI, CallExpr *TheCall,
   if (!SupportsBI(&TI) && SupportsBI(AuxTI))
     TheTI = AuxTI;
 
-  if (IsCPUSupports && !TheTI->supportsCpuSupports())
-    return S.Diag(TheCall->getBeginLoc(), diag::err_builtin_target_unsupported)
-           << SourceRange(TheCall->getBeginLoc(), TheCall->getEndLoc());
-  if (!IsCPUSupports && !TheTI->supportsCpuIs())
+  if ((!IsCPUSupports && !TheTI->supportsCpuIs()) ||
+      (IsCPUSupports && !TheTI->supportsCpuSupports()))
     return S.Diag(TheCall->getBeginLoc(),
                   TI.getTriple().isOSAIX()
                       ? diag::err_builtin_aix_os_unsupported
@@ -2535,18 +2533,18 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
   case Builtin::BI_bittestandset64:
   case Builtin::BI_interlockedbittestandreset64:
   case Builtin::BI_interlockedbittestandset64:
-    if (CheckBuiltinTargetInSupported(*this, BuiltinID, TheCall,
-                                      {llvm::Triple::x86_64, llvm::Triple::arm,
-                                       llvm::Triple::thumb,
-                                       llvm::Triple::aarch64}))
+    if (CheckBuiltinTargetInSupported(
+            *this, BuiltinID, TheCall,
+            {llvm::Triple::x86_64, llvm::Triple::arm, llvm::Triple::thumb,
+             llvm::Triple::aarch64, llvm::Triple::amdgcn}))
       return ExprError();
     break;
 
   case Builtin::BI__builtin_set_flt_rounds:
-    if (CheckBuiltinTargetInSupported(*this, BuiltinID, TheCall,
-                                      {llvm::Triple::x86, llvm::Triple::x86_64,
-                                       llvm::Triple::arm, llvm::Triple::thumb,
-                                       llvm::Triple::aarch64}))
+    if (CheckBuiltinTargetInSupported(
+            *this, BuiltinID, TheCall,
+            {llvm::Triple::x86, llvm::Triple::x86_64, llvm::Triple::arm,
+             llvm::Triple::thumb, llvm::Triple::aarch64, llvm::Triple::amdgcn}))
       return ExprError();
     break;
 
@@ -3049,6 +3047,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
   case Builtin::BI__builtin_elementwise_nearbyint:
   case Builtin::BI__builtin_elementwise_sin:
   case Builtin::BI__builtin_elementwise_sqrt:
+  case Builtin::BI__builtin_elementwise_tan:
   case Builtin::BI__builtin_elementwise_trunc:
   case Builtin::BI__builtin_elementwise_canonicalize: {
     if (PrepareBuiltinElementwiseMathOneArgCall(TheCall))
@@ -5679,6 +5678,7 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   case Builtin::BI__builtin_elementwise_roundeven:
   case Builtin::BI__builtin_elementwise_sin:
   case Builtin::BI__builtin_elementwise_sqrt:
+  case Builtin::BI__builtin_elementwise_tan:
   case Builtin::BI__builtin_elementwise_trunc: {
     if (CheckFloatOrHalfRepresentations(this, TheCall))
       return true;
@@ -16574,11 +16574,10 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T,
         std::string PrettySourceValue = toString(Value, 10);
         std::string PrettyTargetValue = PrettyPrintInRange(Value, TargetRange);
 
-        S.DiagRuntimeBehavior(
-            E->getExprLoc(), E,
-            S.PDiag(diag::warn_impcast_integer_precision_constant)
-                << PrettySourceValue << PrettyTargetValue << E->getType() << T
-                << E->getSourceRange() << SourceRange(CC));
+        S.Diag(E->getExprLoc(),
+               S.PDiag(diag::warn_impcast_integer_precision_constant)
+                   << PrettySourceValue << PrettyTargetValue << E->getType()
+                   << T << E->getSourceRange() << SourceRange(CC));
         return;
       }
     }
diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp
index c335017f243e..87aa0cacc249 100644
--- a/clang/lib/Sema/SemaCodeComplete.cpp
+++ b/clang/lib/Sema/SemaCodeComplete.cpp
@@ -4049,18 +4049,17 @@ unsigned clang::getMacroUsagePriority(StringRef MacroName,
   unsigned Priority = CCP_Macro;
 
   // Treat the "nil", "Nil" and "NULL" macros as null pointer constants.
-  if (MacroName.equals("nil") || MacroName.equals("NULL") ||
-      MacroName.equals("Nil")) {
+  if (MacroName == "nil" || MacroName == "NULL" || MacroName == "Nil") {
     Priority = CCP_Constant;
     if (PreferredTypeIsPointer)
       Priority = Priority / CCF_SimilarTypeMatch;
   }
   // Treat "YES", "NO", "true", and "false" as constants.
-  else if (MacroName.equals("YES") || MacroName.equals("NO") ||
-           MacroName.equals("true") || MacroName.equals("false"))
+  else if (MacroName == "YES" || MacroName == "NO" || MacroName == "true" ||
+           MacroName == "false")
     Priority = CCP_Constant;
   // Treat "bool" as a type.
-  else if (MacroName.equals("bool"))
+  else if (MacroName == "bool")
     Priority = CCP_Type + (LangOpts.ObjC ? CCD_bool_in_ObjC : 0);
 
   return Priority;
@@ -6714,14 +6713,16 @@ void Sema::CodeCompleteQualifiedId(Scope *S, CXXScopeSpec &SS,
 
   // If the scope is a concept-constrained type parameter, infer nested
   // members based on the constraints.
-  if (const auto *TTPT =
-          dyn_cast_or_null<TemplateTypeParmType>(NNS->getAsType())) {
-    for (const auto &R : ConceptInfo(*TTPT, S).members()) {
-      if (R.Operator != ConceptInfo::Member::Colons)
-        continue;
-      Results.AddResult(CodeCompletionResult(
-          R.render(*this, CodeCompleter->getAllocator(),
-                   CodeCompleter->getCodeCompletionTUInfo())));
+  if (NNS) {
+    if (const auto *TTPT =
+            dyn_cast_or_null<TemplateTypeParmType>(NNS->getAsType())) {
+      for (const auto &R : ConceptInfo(*TTPT, S).members()) {
+        if (R.Operator != ConceptInfo::Member::Colons)
+          continue;
+        Results.AddResult(CodeCompletionResult(
+            R.render(*this, CodeCompleter->getAllocator(),
+                     CodeCompleter->getCodeCompletionTUInfo())));
+      }
     }
   }
 
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index e00c97260282..7bfec4e11f7a 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -811,7 +811,7 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction(
   // this may happen while we're comparing two templates' constraint
   // equivalence.
   LocalInstantiationScope ScopeForParameters(S);
-  if (auto *FD = llvm::dyn_cast<FunctionDecl>(DeclInfo.getDecl()))
+  if (auto *FD = DeclInfo.getDecl()->getAsFunction())
     for (auto *PVD : FD->parameters())
       ScopeForParameters.InstantiatedLocal(PVD, PVD);
 
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 79fb6c0417e3..fb913034bd83 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -735,8 +735,8 @@ void Sema::DiagnoseUnknownTypeName(IdentifierInfo *&II,
                          << II, CanRecover);
       } else if (DeclContext *DC = computeDeclContext(*SS, false)) {
         std::string CorrectedStr(Corrected.getAsString(getLangOpts()));
-        bool DroppedSpecifier = Corrected.WillReplaceSpecifier() &&
-                                II->getName().equals(CorrectedStr);
+        bool DroppedSpecifier =
+            Corrected.WillReplaceSpecifier() && II->getName() == CorrectedStr;
         diagnoseTypo(Corrected,
                      PDiag(IsTemplateName
                                ? diag::err_no_member_template_suggest
@@ -1007,7 +1007,7 @@ Corrected:
         } else {// FIXME: is this even reachable? Test it.
           std::string CorrectedStr(Corrected.getAsString(getLangOpts()));
           bool DroppedSpecifier = Corrected.WillReplaceSpecifier() &&
-                                  Name->getName().equals(CorrectedStr);
+                                  Name->getName() == CorrectedStr;
           diagnoseTypo(Corrected, PDiag(QualifiedDiag)
                                     << Name << computeDeclContext(SS, false)
                                     << DroppedSpecifier << SS.getRange());
@@ -5790,6 +5790,9 @@ Decl *Sema::BuildAnonymousStructOrUnion(Scope *S, DeclSpec &DS,
     Anon = VarDecl::Create(Context, Owner, DS.getBeginLoc(),
                            Record->getLocation(), /*IdentifierInfo=*/nullptr,
                            Context.getTypeDeclType(Record), TInfo, SC);
+    if (Invalid)
+      Anon->setInvalidDecl();
+
     ProcessDeclAttributes(S, Anon, Dc);
 
     // Default-initialize the implicit variable. This initialization will be
@@ -13527,9 +13530,12 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
   }
 
   if (VDecl->isInvalidDecl()) {
-    CorrectDelayedTyposInExpr(Init, VDecl);
+    ExprResult Res = CorrectDelayedTyposInExpr(Init, VDecl);
+    SmallVector<Expr *> SubExprs;
+    if (Res.isUsable())
+      SubExprs.push_back(Res.get());
     ExprResult Recovery =
-        CreateRecoveryExpr(Init->getBeginLoc(), Init->getEndLoc(), {Init});
+        CreateRecoveryExpr(Init->getBeginLoc(), Init->getEndLoc(), SubExprs);
     if (Expr *E = Recovery.get())
       VDecl->setInit(E);
     return;
@@ -16070,7 +16076,7 @@ static void diagnoseImplicitlyRetainedSelf(Sema &S) {
 
 static bool methodHasName(const FunctionDecl *FD, StringRef Name) {
   return isa<CXXMethodDecl>(FD) && FD->param_empty() &&
-         FD->getDeclName().isIdentifier() && FD->getName().equals(Name);
+         FD->getDeclName().isIdentifier() && FD->getName() == Name;
 }
 
 bool Sema::CanBeGetReturnObject(const FunctionDecl *FD) {
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 363ae93cb62d..6d957ac09e1c 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -3538,13 +3538,6 @@ bool Sema::checkTargetAttr(SourceLocation LiteralLoc, StringRef AttrStr) {
   return false;
 }
 
-static bool hasArmStreamingInterface(const FunctionDecl *FD) {
-  if (const auto *T = FD->getType()->getAs<FunctionProtoType>())
-    if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask)
-      return true;
-  return false;
-}
-
 // Check Target Version attrs
 bool Sema::checkTargetVersionAttr(SourceLocation LiteralLoc, Decl *D,
                                   StringRef &AttrStr, bool &isDefault) {
@@ -3563,7 +3556,8 @@ bool Sema::checkTargetVersionAttr(SourceLocation LiteralLoc, Decl *D,
       return Diag(LiteralLoc, diag::warn_unsupported_target_attribute)
              << Unsupported << None << CurFeature << TargetVersion;
   }
-  if (hasArmStreamingInterface(cast<FunctionDecl>(D)))
+  if (IsArmStreamingFunction(cast<FunctionDecl>(D),
+                             /*IncludeLocallyStreaming=*/false))
     return Diag(LiteralLoc, diag::err_sme_streaming_cannot_be_multiversioned);
   return false;
 }
@@ -3665,7 +3659,8 @@ bool Sema::checkTargetClonesAttrString(
           HasNotDefault = true;
         }
       }
-      if (hasArmStreamingInterface(cast<FunctionDecl>(D)))
+      if (IsArmStreamingFunction(cast<FunctionDecl>(D),
+                                 /*IncludeLocallyStreaming=*/false))
         return Diag(LiteralLoc,
                     diag::err_sme_streaming_cannot_be_multiversioned);
     } else {
@@ -7314,6 +7309,55 @@ static void handleHLSLSV_DispatchThreadIDAttr(Sema &S, Decl *D,
   D->addAttr(::new (S.Context) HLSLSV_DispatchThreadIDAttr(S.Context, AL));
 }
 
+static void handleHLSLPackOffsetAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+  if (!isa<VarDecl>(D) || !isa<HLSLBufferDecl>(D->getDeclContext())) {
+    S.Diag(AL.getLoc(), diag::err_hlsl_attr_invalid_ast_node)
+        << AL << "shader constant in a constant buffer";
+    return;
+  }
+
+  uint32_t SubComponent;
+  if (!checkUInt32Argument(S, AL, AL.getArgAsExpr(0), SubComponent))
+    return;
+  uint32_t Component;
+  if (!checkUInt32Argument(S, AL, AL.getArgAsExpr(1), Component))
+    return;
+
+  QualType T = cast<VarDecl>(D)->getType().getCanonicalType();
+  // Check if T is an array or struct type.
+  // TODO: mark matrix type as aggregate type.
+  bool IsAggregateTy = (T->isArrayType() || T->isStructureType());
+
+  // Check Component is valid for T.
+  if (Component) {
+    unsigned Size = S.getASTContext().getTypeSize(T);
+    if (IsAggregateTy || Size > 128) {
+      S.Diag(AL.getLoc(), diag::err_hlsl_packoffset_cross_reg_boundary);
+      return;
+    } else {
+      // Make sure Component + sizeof(T) <= 4.
+      if ((Component * 32 + Size) > 128) {
+        S.Diag(AL.getLoc(), diag::err_hlsl_packoffset_cross_reg_boundary);
+        return;
+      }
+      QualType EltTy = T;
+      if (const auto *VT = T->getAs<VectorType>())
+        EltTy = VT->getElementType();
+      unsigned Align = S.getASTContext().getTypeAlign(EltTy);
+      if (Align > 32 && Component == 1) {
+        // NOTE: Component 3 will hit err_hlsl_packoffset_cross_reg_boundary.
+        // So we only need to check Component 1 here.
+        S.Diag(AL.getLoc(), diag::err_hlsl_packoffset_alignment_mismatch)
+            << Align << EltTy;
+        return;
+      }
+    }
+  }
+
+  D->addAttr(::new (S.Context)
+                 HLSLPackOffsetAttr(S.Context, AL, SubComponent, Component));
+}
+
 static void handleHLSLShaderAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   StringRef Str;
   SourceLocation ArgLoc;
@@ -9735,6 +9779,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_HLSLSV_DispatchThreadID:
     handleHLSLSV_DispatchThreadIDAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_HLSLPackOffset:
+    handleHLSLPackOffsetAttr(S, D, AL);
+    break;
   case ParsedAttr::AT_HLSLShader:
     handleHLSLShaderAttr(S, D, AL);
     break;
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 157d42c09cfc..53238d355ea0 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -12234,8 +12234,8 @@ static bool TryNamespaceTypoCorrection(Sema &S, LookupResult &R, Scope *Sc,
       DiagnoseInvisibleNamespace(Corrected, S);
     } else if (DeclContext *DC = S.computeDeclContext(SS, false)) {
       std::string CorrectedStr(Corrected.getAsString(S.getLangOpts()));
-      bool DroppedSpecifier = Corrected.WillReplaceSpecifier() &&
-                              Ident->getName().equals(CorrectedStr);
+      bool DroppedSpecifier =
+          Corrected.WillReplaceSpecifier() && Ident->getName() == CorrectedStr;
       S.diagnoseTypo(Corrected,
                      S.PDiag(diag::err_using_directive_member_suggest)
                        << Ident << DC << DroppedSpecifier << SS.getRange(),
@@ -18553,15 +18553,6 @@ void Sema::ActOnPureSpecifier(Decl *D, SourceLocation ZeroLoc) {
     Diag(D->getLocation(), diag::err_illegal_initializer);
 }
 
-/// Determine whether the given declaration is a global variable or
-/// static data member.
-static bool isNonlocalVariable(const Decl *D) {
-  if (const VarDecl *Var = dyn_cast_or_null<VarDecl>(D))
-    return Var->hasGlobalStorage();
-
-  return false;
-}
-
 /// Invoked when we are about to parse an initializer for the declaration
 /// 'Dcl'.
 ///
@@ -18570,9 +18561,7 @@ static bool isNonlocalVariable(const Decl *D) {
 /// class X. If the declaration had a scope specifier, a scope will have
 /// been created and passed in for this purpose. Otherwise, S will be null.
 void Sema::ActOnCXXEnterDeclInitializer(Scope *S, Decl *D) {
-  // If there is no declaration, there was an error parsing it.
-  if (!D || D->isInvalidDecl())
-    return;
+  assert(D && !D->isInvalidDecl());
 
   // We will always have a nested name specifier here, but this declaration
   // might not be out of line if the specifier names the current namespace:
@@ -18581,25 +18570,41 @@ void Sema::ActOnCXXEnterDeclInitializer(Scope *S, Decl *D) {
   if (S && D->isOutOfLine())
     EnterDeclaratorContext(S, D->getDeclContext());
 
-  // If we are parsing the initializer for a static data member, push a
-  // new expression evaluation context that is associated with this static
-  // data member.
-  if (isNonlocalVariable(D))
-    PushExpressionEvaluationContext(
-        ExpressionEvaluationContext::PotentiallyEvaluated, D);
+  PushExpressionEvaluationContext(
+      ExpressionEvaluationContext::PotentiallyEvaluated, D);
 }
 
 /// Invoked after we are finished parsing an initializer for the declaration D.
 void Sema::ActOnCXXExitDeclInitializer(Scope *S, Decl *D) {
-  // If there is no declaration, there was an error parsing it.
-  if (!D || D->isInvalidDecl())
-    return;
-
-  if (isNonlocalVariable(D))
-    PopExpressionEvaluationContext();
+  assert(D);
 
   if (S && D->isOutOfLine())
     ExitDeclaratorContext(S);
+
+  if (getLangOpts().CPlusPlus23) {
+    // An expression or conversion is 'manifestly constant-evaluated' if it is:
+    // [...]
+    // - the initializer of a variable that is usable in constant expressions or
+    //   has constant initialization.
+    if (auto *VD = dyn_cast<VarDecl>(D);
+        VD && (VD->isUsableInConstantExpressions(Context) ||
+               VD->hasConstantInitialization())) {
+      // An expression or conversion is in an 'immediate function context' if it
+      // is potentially evaluated and either:
+      // [...]
+      // - it is a subexpression of a manifestly constant-evaluated expression
+      //   or conversion.
+      ExprEvalContexts.back().InImmediateFunctionContext = true;
+    }
+  }
+
+  // Unless the initializer is in an immediate function context (as determined
+  // above), this will evaluate all contained immediate function calls as
+  // constant expressions. If the initializer IS an immediate function context,
+  // the initializer has been determined to be a constant expression, and all
+  // such evaluations will be elided (i.e., as if we "knew the whole time" that
+  // it was a constant expression).
+  PopExpressionEvaluationContext();
 }
 
 /// ActOnCXXConditionDeclarationExpr - Parsed a condition declaration of a
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 0c37f43f7540..bb4b116fd73c 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -4103,6 +4103,8 @@ ExprResult Sema::ActOnNumericConstant(const Token &Tok, Scope *UDLScope) {
       Ty = Context.Float16Ty;
     else if (Literal.isFloat128)
       Ty = Context.Float128Ty;
+    else if (getLangOpts().HLSL)
+      Ty = Context.FloatTy;
     else
       Ty = Context.DoubleTy;
 
@@ -4173,6 +4175,15 @@ ExprResult Sema::ActOnNumericConstant(const Token &Tok, Scope *UDLScope) {
       // be an unsigned int.
       bool AllowUnsigned = Literal.isUnsigned || Literal.getRadix() != 10;
 
+      // HLSL doesn't really have `long` or `long long`. We support the `ll`
+      // suffix for portability of code with C++, but both `l` and `ll` are
+      // 64-bit integer types, and we want the type of `1l` and `1ll` to be the
+      // same.
+      if (getLangOpts().HLSL && !Literal.isLong && Literal.isLongLong) {
+        Literal.isLong = true;
+        Literal.isLongLong = false;
+      }
+
       // Check from smallest to largest, picking the smallest type we can.
       unsigned Width = 0;
 
@@ -5766,10 +5777,9 @@ ExprResult Sema::BuildCXXDefaultArgExpr(SourceLocation CallLoc,
         Res = Immediate.TransformInitializer(Param->getInit(),
                                              /*NotCopy=*/false);
       });
-      if (Res.isInvalid())
-        return ExprError();
-      Res = ConvertParamDefaultArgument(Param, Res.get(),
-                                        Res.get()->getBeginLoc());
+      if (Res.isUsable())
+        Res = ConvertParamDefaultArgument(Param, Res.get(),
+                                          Res.get()->getBeginLoc());
       if (Res.isInvalid())
         return ExprError();
       Init = Res.get();
@@ -5805,7 +5815,7 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) {
   Expr *Init = nullptr;
 
   bool NestedDefaultChecking = isCheckingDefaultArgumentOrInitializer();
-
+  bool InLifetimeExtendingContext = isInLifetimeExtendingContext();
   EnterExpressionEvaluationContext EvalContext(
       *this, ExpressionEvaluationContext::PotentiallyEvaluated, Field);
 
@@ -5840,19 +5850,35 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) {
   ImmediateCallVisitor V(getASTContext());
   if (!NestedDefaultChecking)
     V.TraverseDecl(Field);
-  if (V.HasImmediateCalls) {
+
+  // CWG1815
+  // Support lifetime extension of temporary created by aggregate
+  // initialization using a default member initializer. We should always rebuild
+  // the initializer if it contains any temporaries (if the initializer
+  // expression is an ExprWithCleanups). Then make sure the normal lifetime
+  // extension code recurses into the default initializer and does lifetime
+  // extension when warranted.
+  bool ContainsAnyTemporaries =
+      isa_and_present<ExprWithCleanups>(Field->getInClassInitializer());
+  if (V.HasImmediateCalls || InLifetimeExtendingContext ||
+      ContainsAnyTemporaries) {
     ExprEvalContexts.back().DelayedDefaultInitializationContext = {Loc, Field,
                                                                    CurContext};
     ExprEvalContexts.back().IsCurrentlyCheckingDefaultArgumentOrInitializer =
         NestedDefaultChecking;
-
+    // Pass down lifetime extending flag, and collect temporaries in
+    // CreateMaterializeTemporaryExpr when we rewrite the call argument.
+    keepInLifetimeExtendingContext();
     EnsureImmediateInvocationInDefaultArgs Immediate(*this);
     ExprResult Res;
+
+    // Rebuild CXXDefaultInitExpr might cause diagnostics.
+    SFINAETrap Trap(*this);
     runWithSufficientStackSpace(Loc, [&] {
       Res = Immediate.TransformInitializer(Field->getInClassInitializer(),
                                            /*CXXDirectInit=*/false);
     });
-    if (!Res.isInvalid())
+    if (Res.isUsable())
       Res = ConvertMemberDefaultInitExpression(Field, Res.get(), Loc);
     if (Res.isInvalid()) {
       Field->setInvalidDecl();
@@ -6345,6 +6371,7 @@ static bool isPlaceholderToRemoveAsArg(QualType type) {
 #include "clang/AST/BuiltinTypes.def"
     return false;
 
+  case BuiltinType::UnresolvedTemplate:
   // We cannot lower out overload sets; they might validly be resolved
   // by the call machinery.
   case BuiltinType::Overload:
@@ -14651,6 +14678,22 @@ QualType Sema::CheckAddressOfOperand(ExprResult &OrigOp, SourceLocation OpLoc) {
             return QualType();
           }
 
+          // C++11 [expr.unary.op] p4:
+          // A pointer to member is only formed when an explicit & is used and
+          // its operand is a qualified-id not enclosed in parentheses.
+          if (isa<ParenExpr>(OrigOp.get())) {
+            SourceLocation LeftParenLoc = OrigOp.get()->getBeginLoc(),
+                           RightParenLoc = OrigOp.get()->getEndLoc();
+
+            Diag(LeftParenLoc,
+                 diag::err_form_ptr_to_member_from_parenthesized_expr)
+                << SourceRange(OpLoc, RightParenLoc)
+                << FixItHint::CreateRemoval(LeftParenLoc)
+                << FixItHint::CreateRemoval(RightParenLoc);
+
+            // Continuing might lead to better error recovery.
+          }
+
           while (cast<RecordDecl>(Ctx)->isAnonymousStructOrUnion())
             Ctx = Ctx->getParent();
 
@@ -17192,11 +17235,11 @@ bool Sema::DiagnoseAssignmentResult(AssignConvertType ConvTy,
     }
     CheckInferredResultType = DstType->isObjCObjectPointerType() &&
       SrcType->isObjCObjectPointerType();
-    if (!CheckInferredResultType) {
-      ConvHints.tryToFixConversion(SrcExpr, SrcType, DstType, *this);
-    } else if (CheckInferredResultType) {
+    if (CheckInferredResultType) {
       SrcType = SrcType.getUnqualifiedType();
       DstType = DstType.getUnqualifiedType();
+    } else {
+      ConvHints.tryToFixConversion(SrcExpr, SrcType, DstType, *this);
     }
     MayHaveConvFixit = true;
     break;
@@ -18011,7 +18054,7 @@ HandleImmediateInvocations(Sema &SemaRef,
                            Sema::ExpressionEvaluationContextRecord &Rec) {
   if ((Rec.ImmediateInvocationCandidates.size() == 0 &&
        Rec.ReferenceToConsteval.size() == 0) ||
-      SemaRef.RebuildingImmediateInvocation)
+      Rec.isImmediateFunctionContext() || SemaRef.RebuildingImmediateInvocation)
     return;
 
   /// When we have more than 1 ImmediateInvocationCandidates or previously
@@ -19707,18 +19750,17 @@ static ExprResult rebuildPotentialResultsAsNonOdrUsed(Sema &S, Expr *E,
       ExprResult Sub = Rebuild(LHS);
       if (!Sub.isUsable())
         return Sub;
-      LHS = Sub.get();
+      BO->setLHS(Sub.get());
     //   -- If e is a comma expression, ...
     } else if (BO->getOpcode() == BO_Comma) {
       ExprResult Sub = Rebuild(RHS);
       if (!Sub.isUsable())
         return Sub;
-      RHS = Sub.get();
+      BO->setRHS(Sub.get());
     } else {
       break;
     }
-    return S.BuildBinOp(nullptr, BO->getOperatorLoc(), BO->getOpcode(),
-                        LHS, RHS);
+    return ExprResult(BO);
   }
 
   //   -- If e has the form (e1)...
@@ -21238,6 +21280,27 @@ ExprResult Sema::CheckPlaceholderExpr(Expr *E) {
   if (!placeholderType) return E;
 
   switch (placeholderType->getKind()) {
+  case BuiltinType::UnresolvedTemplate: {
+    auto *ULE = cast<UnresolvedLookupExpr>(E);
+    const DeclarationNameInfo &NameInfo = ULE->getNameInfo();
+    // There's only one FoundDecl for UnresolvedTemplate type. See
+    // BuildTemplateIdExpr.
+    NamedDecl *Temp = *ULE->decls_begin();
+    const bool IsTypeAliasTemplateDecl = isa<TypeAliasTemplateDecl>(Temp);
+
+    if (NestedNameSpecifierLoc Loc = ULE->getQualifierLoc(); Loc.hasQualifier())
+      Diag(NameInfo.getLoc(), diag::err_template_kw_refers_to_type_template)
+          << Loc.getNestedNameSpecifier() << NameInfo.getName().getAsString()
+          << Loc.getSourceRange() << IsTypeAliasTemplateDecl;
+    else
+      Diag(NameInfo.getLoc(), diag::err_template_kw_refers_to_type_template)
+          << "" << NameInfo.getName().getAsString() << ULE->getSourceRange()
+          << IsTypeAliasTemplateDecl;
+    Diag(Temp->getLocation(), diag::note_referenced_type_template)
+        << IsTypeAliasTemplateDecl;
+
+    return CreateRecoveryExpr(NameInfo.getBeginLoc(), NameInfo.getEndLoc(), {});
+  }
 
   // Overloaded expressions.
   case BuiltinType::Overload: {
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index c1cb03e4ec7a..c181092113e1 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -5627,6 +5627,76 @@ static bool EvaluateUnaryTypeTrait(Sema &Self, TypeTrait UTT,
 static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT, const TypeSourceInfo *Lhs,
                                     const TypeSourceInfo *Rhs, SourceLocation KeyLoc);
 
+static ExprResult CheckConvertibilityForTypeTraits(
+    Sema &Self, const TypeSourceInfo *Lhs, const TypeSourceInfo *Rhs,
+    SourceLocation KeyLoc, llvm::BumpPtrAllocator &OpaqueExprAllocator) {
+
+  QualType LhsT = Lhs->getType();
+  QualType RhsT = Rhs->getType();
+
+  // C++0x [meta.rel]p4:
+  //   Given the following function prototype:
+  //
+  //     template <class T>
+  //       typename add_rvalue_reference<T>::type create();
+  //
+  //   the predicate condition for a template specialization
+  //   is_convertible<From, To> shall be satisfied if and only if
+  //   the return expression in the following code would be
+  //   well-formed, including any implicit conversions to the return
+  //   type of the function:
+  //
+  //     To test() {
+  //       return create<From>();
+  //     }
+  //
+  //   Access checking is performed as if in a context unrelated to To and
+  //   From. Only the validity of the immediate context of the expression
+  //   of the return-statement (including conversions to the return type)
+  //   is considered.
+  //
+  // We model the initialization as a copy-initialization of a temporary
+  // of the appropriate type, which for this expression is identical to the
+  // return statement (since NRVO doesn't apply).
+
+  // Functions aren't allowed to return function or array types.
+  if (RhsT->isFunctionType() || RhsT->isArrayType())
+    return ExprError();
+
+  // A function definition requires a complete, non-abstract return type.
+  if (!Self.isCompleteType(Rhs->getTypeLoc().getBeginLoc(), RhsT) ||
+      Self.isAbstractType(Rhs->getTypeLoc().getBeginLoc(), RhsT))
+    return ExprError();
+
+  // Compute the result of add_rvalue_reference.
+  if (LhsT->isObjectType() || LhsT->isFunctionType())
+    LhsT = Self.Context.getRValueReferenceType(LhsT);
+
+  // Build a fake source and destination for initialization.
+  InitializedEntity To(InitializedEntity::InitializeTemporary(RhsT));
+  Expr *From = new (OpaqueExprAllocator.Allocate<OpaqueValueExpr>())
+      OpaqueValueExpr(KeyLoc, LhsT.getNonLValueExprType(Self.Context),
+                      Expr::getValueKindForType(LhsT));
+  InitializationKind Kind =
+      InitializationKind::CreateCopy(KeyLoc, SourceLocation());
+
+  // Perform the initialization in an unevaluated context within a SFINAE
+  // trap at translation unit scope.
+  EnterExpressionEvaluationContext Unevaluated(
+      Self, Sema::ExpressionEvaluationContext::Unevaluated);
+  Sema::SFINAETrap SFINAE(Self, /*AccessCheckingSFINAE=*/true);
+  Sema::ContextRAII TUContext(Self, Self.Context.getTranslationUnitDecl());
+  InitializationSequence Init(Self, To, Kind, From);
+  if (Init.Failed())
+    return ExprError();
+
+  ExprResult Result = Init.Perform(Self, To, Kind, From);
+  if (Result.isInvalid() || SFINAE.hasErrorOccurred())
+    return ExprError();
+
+  return Result;
+}
+
 static bool EvaluateBooleanTypeTrait(Sema &S, TypeTrait Kind,
                                      SourceLocation KWLoc,
                                      ArrayRef<TypeSourceInfo *> Args,
@@ -5640,13 +5710,16 @@ static bool EvaluateBooleanTypeTrait(Sema &S, TypeTrait Kind,
 
   // Evaluate ReferenceBindsToTemporary and ReferenceConstructsFromTemporary
   // alongside the IsConstructible traits to avoid duplication.
-  if (Kind <= BTT_Last && Kind != BTT_ReferenceBindsToTemporary && Kind != BTT_ReferenceConstructsFromTemporary)
+  if (Kind <= BTT_Last && Kind != BTT_ReferenceBindsToTemporary &&
+      Kind != BTT_ReferenceConstructsFromTemporary &&
+      Kind != BTT_ReferenceConvertsFromTemporary)
     return EvaluateBinaryTypeTrait(S, Kind, Args[0],
                                    Args[1], RParenLoc);
 
   switch (Kind) {
   case clang::BTT_ReferenceBindsToTemporary:
   case clang::BTT_ReferenceConstructsFromTemporary:
+  case clang::BTT_ReferenceConvertsFromTemporary:
   case clang::TT_IsConstructible:
   case clang::TT_IsNothrowConstructible:
   case clang::TT_IsTriviallyConstructible: {
@@ -5710,8 +5783,10 @@ static bool EvaluateBooleanTypeTrait(Sema &S, TypeTrait Kind,
     Sema::ContextRAII TUContext(S, S.Context.getTranslationUnitDecl());
     InitializedEntity To(
         InitializedEntity::InitializeTemporary(S.Context, Args[0]));
-    InitializationKind InitKind(InitializationKind::CreateDirect(KWLoc, KWLoc,
-                                                                 RParenLoc));
+    InitializationKind InitKind(
+        Kind == clang::BTT_ReferenceConvertsFromTemporary
+            ? InitializationKind::CreateCopy(KWLoc, KWLoc)
+            : InitializationKind::CreateDirect(KWLoc, KWLoc, RParenLoc));
     InitializationSequence Init(S, To, InitKind, ArgExprs);
     if (Init.Failed())
       return false;
@@ -5723,7 +5798,9 @@ static bool EvaluateBooleanTypeTrait(Sema &S, TypeTrait Kind,
     if (Kind == clang::TT_IsConstructible)
       return true;
 
-    if (Kind == clang::BTT_ReferenceBindsToTemporary || Kind == clang::BTT_ReferenceConstructsFromTemporary) {
+    if (Kind == clang::BTT_ReferenceBindsToTemporary ||
+        Kind == clang::BTT_ReferenceConstructsFromTemporary ||
+        Kind == clang::BTT_ReferenceConvertsFromTemporary) {
       if (!T->isReferenceType())
         return false;
 
@@ -5737,9 +5814,13 @@ static bool EvaluateBooleanTypeTrait(Sema &S, TypeTrait Kind,
       if (U->isReferenceType())
         return false;
 
-      TypeSourceInfo *TPtr = S.Context.CreateTypeSourceInfo(S.Context.getPointerType(S.BuiltinRemoveReference(T, UnaryTransformType::RemoveCVRef, {})));
-      TypeSourceInfo *UPtr = S.Context.CreateTypeSourceInfo(S.Context.getPointerType(S.BuiltinRemoveReference(U, UnaryTransformType::RemoveCVRef, {})));
-      return EvaluateBinaryTypeTrait(S, TypeTrait::BTT_IsConvertibleTo, UPtr, TPtr, RParenLoc);
+      TypeSourceInfo *TPtr = S.Context.CreateTypeSourceInfo(
+          S.Context.getPointerType(T.getNonReferenceType()));
+      TypeSourceInfo *UPtr = S.Context.CreateTypeSourceInfo(
+          S.Context.getPointerType(U.getNonReferenceType()));
+      return !CheckConvertibilityForTypeTraits(S, UPtr, TPtr, RParenLoc,
+                                               OpaqueExprAllocator)
+                  .isInvalid();
     }
 
     if (Kind == clang::TT_IsNothrowConstructible)
@@ -5945,68 +6026,12 @@ static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT, const TypeSourceI
   case BTT_IsConvertible:
   case BTT_IsConvertibleTo:
   case BTT_IsNothrowConvertible: {
-    // C++0x [meta.rel]p4:
-    //   Given the following function prototype:
-    //
-    //     template <class T>
-    //       typename add_rvalue_reference<T>::type create();
-    //
-    //   the predicate condition for a template specialization
-    //   is_convertible<From, To> shall be satisfied if and only if
-    //   the return expression in the following code would be
-    //   well-formed, including any implicit conversions to the return
-    //   type of the function:
-    //
-    //     To test() {
-    //       return create<From>();
-    //     }
-    //
-    //   Access checking is performed as if in a context unrelated to To and
-    //   From. Only the validity of the immediate context of the expression
-    //   of the return-statement (including conversions to the return type)
-    //   is considered.
-    //
-    // We model the initialization as a copy-initialization of a temporary
-    // of the appropriate type, which for this expression is identical to the
-    // return statement (since NRVO doesn't apply).
-
-    // Functions aren't allowed to return function or array types.
-    if (RhsT->isFunctionType() || RhsT->isArrayType())
-      return false;
-
-    // A return statement in a void function must have void type.
     if (RhsT->isVoidType())
       return LhsT->isVoidType();
-
-    // A function definition requires a complete, non-abstract return type.
-    if (!Self.isCompleteType(Rhs->getTypeLoc().getBeginLoc(), RhsT) ||
-        Self.isAbstractType(Rhs->getTypeLoc().getBeginLoc(), RhsT))
-      return false;
-
-    // Compute the result of add_rvalue_reference.
-    if (LhsT->isObjectType() || LhsT->isFunctionType())
-      LhsT = Self.Context.getRValueReferenceType(LhsT);
-
-    // Build a fake source and destination for initialization.
-    InitializedEntity To(InitializedEntity::InitializeTemporary(RhsT));
-    OpaqueValueExpr From(KeyLoc, LhsT.getNonLValueExprType(Self.Context),
-                         Expr::getValueKindForType(LhsT));
-    Expr *FromPtr = &From;
-    InitializationKind Kind(InitializationKind::CreateCopy(KeyLoc,
-                                                           SourceLocation()));
-
-    // Perform the initialization in an unevaluated context within a SFINAE
-    // trap at translation unit scope.
-    EnterExpressionEvaluationContext Unevaluated(
-        Self, Sema::ExpressionEvaluationContext::Unevaluated);
-    Sema::SFINAETrap SFINAE(Self, /*AccessCheckingSFINAE=*/true);
-    Sema::ContextRAII TUContext(Self, Self.Context.getTranslationUnitDecl());
-    InitializationSequence Init(Self, To, Kind, FromPtr);
-    if (Init.Failed())
-      return false;
-
-    ExprResult Result = Init.Perform(Self, To, Kind, FromPtr);
-    if (Result.isInvalid() || SFINAE.hasErrorOccurred())
+    llvm::BumpPtrAllocator OpaqueExprAllocator;
+    ExprResult Result = CheckConvertibilityForTypeTraits(Self, Lhs, Rhs, KeyLoc,
+                                                         OpaqueExprAllocator);
+    if (Result.isInvalid())
       return false;
 
     if (BTT != BTT_IsNothrowConvertible)
diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp
index 5facb14a18b7..244488a0b562 100644
--- a/clang/lib/Sema/SemaExprMember.cpp
+++ b/clang/lib/Sema/SemaExprMember.cpp
@@ -995,8 +995,9 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType,
   // arrow operator was used with a dependent non-pointer object expression,
   // build a CXXDependentScopeMemberExpr.
   if (R.wasNotFoundInCurrentInstantiation() ||
-      (IsArrow && !BaseExprType->isPointerType() &&
-       BaseExprType->isDependentType()))
+      (R.getLookupName().getCXXOverloadedOperator() == OO_Equal &&
+       (SS.isSet() ? SS.getScopeRep()->isDependent()
+                   : BaseExprType->isDependentType())))
     return ActOnDependentMemberExpr(BaseExpr, BaseExprType, IsArrow, OpLoc, SS,
                                     TemplateKWLoc, FirstQualifierInScope,
                                     R.getLookupNameInfo(), TemplateArgs);
@@ -1319,28 +1320,28 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R,
     else if (const ObjCObjectPointerType *Ptr =
                  BaseType->getAs<ObjCObjectPointerType>())
       BaseType = Ptr->getPointeeType();
-    else if (!BaseType->isDependentType()) {
-      if (BaseType->isRecordType()) {
-        // Recover from arrow accesses to records, e.g.:
-        //   struct MyRecord foo;
-        //   foo->bar
-        // This is actually well-formed in C++ if MyRecord has an
-        // overloaded operator->, but that should have been dealt with
-        // by now--or a diagnostic message already issued if a problem
-        // was encountered while looking for the overloaded operator->.
-        if (!S.getLangOpts().CPlusPlus) {
-          S.Diag(OpLoc, diag::err_typecheck_member_reference_suggestion)
-              << BaseType << int(IsArrow) << BaseExpr.get()->getSourceRange()
-              << FixItHint::CreateReplacement(OpLoc, ".");
-        }
-        IsArrow = false;
-      } else if (BaseType->isFunctionType()) {
-        goto fail;
-      } else {
-        S.Diag(MemberLoc, diag::err_typecheck_member_reference_arrow)
-            << BaseType << BaseExpr.get()->getSourceRange();
-        return ExprError();
+    else if (BaseType->isFunctionType())
+      goto fail;
+    else if (BaseType->isDependentType())
+      BaseType = S.Context.DependentTy;
+    else if (BaseType->isRecordType()) {
+      // Recover from arrow accesses to records, e.g.:
+      //   struct MyRecord foo;
+      //   foo->bar
+      // This is actually well-formed in C++ if MyRecord has an
+      // overloaded operator->, but that should have been dealt with
+      // by now--or a diagnostic message already issued if a problem
+      // was encountered while looking for the overloaded operator->.
+      if (!S.getLangOpts().CPlusPlus) {
+        S.Diag(OpLoc, diag::err_typecheck_member_reference_suggestion)
+            << BaseType << int(IsArrow) << BaseExpr.get()->getSourceRange()
+            << FixItHint::CreateReplacement(OpLoc, ".");
       }
+      IsArrow = false;
+    } else {
+      S.Diag(MemberLoc, diag::err_typecheck_member_reference_arrow)
+          << BaseType << BaseExpr.get()->getSourceRange();
+      return ExprError();
     }
   }
 
@@ -1360,7 +1361,7 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R,
   }
 
   // Handle field access to simple records.
-  if (BaseType->getAsRecordDecl() || BaseType->isDependentType()) {
+  if (BaseType->getAsRecordDecl()) {
     TypoExpr *TE = nullptr;
     if (LookupMemberExprInRecord(S, R, BaseExpr.get(), BaseType, OpLoc, IsArrow,
                                  SS, HasTemplateArgs, TemplateKWLoc, TE))
@@ -1371,6 +1372,9 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R,
     // failed, the lookup result will have been cleared--that combined with the
     // valid-but-null ExprResult will trigger the appropriate diagnostics.
     return ExprResult(TE);
+  } else if (BaseType->isDependentType()) {
+    R.setNotFoundInCurrentInstantiation();
+    return ExprEmpty();
   }
 
   // Handle ivar access to Objective-C objects.
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index bb9e37f18d37..6a12c417e2f3 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -39,9 +39,89 @@ Decl *SemaHLSL::ActOnStartBuffer(Scope *BufferScope, bool CBuffer,
   return Result;
 }
 
+// Calculate the size of a legacy cbuffer type based on
+// https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules
+static unsigned calculateLegacyCbufferSize(const ASTContext &Context,
+                                           QualType T) {
+  unsigned Size = 0;
+  constexpr unsigned CBufferAlign = 128;
+  if (const RecordType *RT = T->getAs<RecordType>()) {
+    const RecordDecl *RD = RT->getDecl();
+    for (const FieldDecl *Field : RD->fields()) {
+      QualType Ty = Field->getType();
+      unsigned FieldSize = calculateLegacyCbufferSize(Context, Ty);
+      unsigned FieldAlign = 32;
+      if (Ty->isAggregateType())
+        FieldAlign = CBufferAlign;
+      Size = llvm::alignTo(Size, FieldAlign);
+      Size += FieldSize;
+    }
+  } else if (const ConstantArrayType *AT = Context.getAsConstantArrayType(T)) {
+    if (unsigned ElementCount = AT->getSize().getZExtValue()) {
+      unsigned ElementSize =
+          calculateLegacyCbufferSize(Context, AT->getElementType());
+      unsigned AlignedElementSize = llvm::alignTo(ElementSize, CBufferAlign);
+      Size = AlignedElementSize * (ElementCount - 1) + ElementSize;
+    }
+  } else if (const VectorType *VT = T->getAs<VectorType>()) {
+    unsigned ElementCount = VT->getNumElements();
+    unsigned ElementSize =
+        calculateLegacyCbufferSize(Context, VT->getElementType());
+    Size = ElementSize * ElementCount;
+  } else {
+    Size = Context.getTypeSize(T);
+  }
+  return Size;
+}
+
 void SemaHLSL::ActOnFinishBuffer(Decl *Dcl, SourceLocation RBrace) {
   auto *BufDecl = cast<HLSLBufferDecl>(Dcl);
   BufDecl->setRBraceLoc(RBrace);
+
+  // Validate packoffset.
+  llvm::SmallVector<std::pair<VarDecl *, HLSLPackOffsetAttr *>> PackOffsetVec;
+  bool HasPackOffset = false;
+  bool HasNonPackOffset = false;
+  for (auto *Field : BufDecl->decls()) {
+    VarDecl *Var = dyn_cast<VarDecl>(Field);
+    if (!Var)
+      continue;
+    if (Field->hasAttr<HLSLPackOffsetAttr>()) {
+      PackOffsetVec.emplace_back(Var, Field->getAttr<HLSLPackOffsetAttr>());
+      HasPackOffset = true;
+    } else {
+      HasNonPackOffset = true;
+    }
+  }
+
+  if (HasPackOffset && HasNonPackOffset)
+    Diag(BufDecl->getLocation(), diag::warn_hlsl_packoffset_mix);
+
+  if (HasPackOffset) {
+    ASTContext &Context = getASTContext();
+    // Make sure no overlap in packoffset.
+    // Sort PackOffsetVec by offset.
+    std::sort(PackOffsetVec.begin(), PackOffsetVec.end(),
+              [](const std::pair<VarDecl *, HLSLPackOffsetAttr *> &LHS,
+                 const std::pair<VarDecl *, HLSLPackOffsetAttr *> &RHS) {
+                return LHS.second->getOffset() < RHS.second->getOffset();
+              });
+
+    for (unsigned i = 0; i < PackOffsetVec.size() - 1; i++) {
+      VarDecl *Var = PackOffsetVec[i].first;
+      HLSLPackOffsetAttr *Attr = PackOffsetVec[i].second;
+      unsigned Size = calculateLegacyCbufferSize(Context, Var->getType());
+      unsigned Begin = Attr->getOffset() * 32;
+      unsigned End = Begin + Size;
+      unsigned NextBegin = PackOffsetVec[i + 1].second->getOffset() * 32;
+      if (End > NextBegin) {
+        VarDecl *NextVar = PackOffsetVec[i + 1].first;
+        Diag(NextVar->getLocation(), diag::err_hlsl_packoffset_overlap)
+            << NextVar << Var;
+      }
+    }
+  }
+
   SemaRef.PopDeclContext();
 }
 
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 7d9eaf672046..fe4a698a612e 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -6576,12 +6576,12 @@ void InitializationSequence::InitializeFrom(Sema &S,
 
     AddPassByIndirectCopyRestoreStep(DestType, ShouldCopy);
   } else if (ICS.isBad()) {
-    DeclAccessPair dap;
-    if (isLibstdcxxPointerReturnFalseHack(S, Entity, Initializer)) {
+    if (isLibstdcxxPointerReturnFalseHack(S, Entity, Initializer))
       AddZeroInitializationStep(Entity.getType());
-    } else if (Initializer->getType() == Context.OverloadTy &&
-               !S.ResolveAddressOfOverloadedFunction(Initializer, DestType,
-                                                     false, dap))
+    else if (DeclAccessPair Found;
+             Initializer->getType() == Context.OverloadTy &&
+             !S.ResolveAddressOfOverloadedFunction(Initializer, DestType,
+                                                   /*Complain=*/false, Found))
       SetFailed(InitializationSequence::FK_AddressOfOverloadFailed);
     else if (Initializer->getType()->isFunctionType() &&
              isExprAnUnaddressableFunction(S, Initializer))
@@ -8065,11 +8065,6 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path,
 enum PathLifetimeKind {
   /// Lifetime-extend along this path.
   Extend,
-  /// We should lifetime-extend, but we don't because (due to technical
-  /// limitations) we can't. This happens for default member initializers,
-  /// which we don't clone for every use, so we don't have a unique
-  /// MaterializeTemporaryExpr to update.
-  ShouldExtend,
   /// Do not lifetime extend along this path.
   NoExtend
 };
@@ -8081,7 +8076,7 @@ shouldLifetimeExtendThroughPath(const IndirectLocalPath &Path) {
   PathLifetimeKind Kind = PathLifetimeKind::Extend;
   for (auto Elem : Path) {
     if (Elem.Kind == IndirectLocalPathEntry::DefaultInit)
-      Kind = PathLifetimeKind::ShouldExtend;
+      Kind = PathLifetimeKind::Extend;
     else if (Elem.Kind != IndirectLocalPathEntry::LambdaCaptureInit)
       return PathLifetimeKind::NoExtend;
   }
@@ -8201,18 +8196,6 @@ void Sema::checkInitializerLifetime(const InitializedEntity &Entity,
                               ExtendingEntity->allocateManglingNumber());
         // Also visit the temporaries lifetime-extended by this initializer.
         return true;
-
-      case PathLifetimeKind::ShouldExtend:
-        // We're supposed to lifetime-extend the temporary along this path (per
-        // the resolution of DR1815), but we don't support that yet.
-        //
-        // FIXME: Properly handle this situation. Perhaps the easiest approach
-        // would be to clone the initializer expression on each use that would
-        // lifetime extend its temporaries.
-        Diag(DiagLoc, diag::warn_unsupported_lifetime_extension)
-            << RK << DiagRange;
-        break;
-
       case PathLifetimeKind::NoExtend:
         // If the path goes through the initialization of a variable or field,
         // it can't possibly reach a temporary created in this full-expression.
@@ -9641,6 +9624,8 @@ bool InitializationSequence::Diagnose(Sema &S,
   if (!Failed())
     return false;
 
+  QualType DestType = Entity.getType();
+
   // When we want to diagnose only one element of a braced-init-list,
   // we need to factor it out.
   Expr *OnlyArg;
@@ -9650,11 +9635,21 @@ bool InitializationSequence::Diagnose(Sema &S,
       OnlyArg = List->getInit(0);
     else
       OnlyArg = Args[0];
+
+    if (OnlyArg->getType() == S.Context.OverloadTy) {
+      DeclAccessPair Found;
+      if (FunctionDecl *FD = S.ResolveAddressOfOverloadedFunction(
+              OnlyArg, DestType.getNonReferenceType(), /*Complain=*/false,
+              Found)) {
+        if (Expr *Resolved =
+                S.FixOverloadedFunctionReference(OnlyArg, Found, FD).get())
+          OnlyArg = Resolved;
+      }
+    }
   }
   else
     OnlyArg = nullptr;
 
-  QualType DestType = Entity.getType();
   switch (Failure) {
   case FK_TooManyInitsForReference:
     // FIXME: Customize for the initialized entity?
diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp
index 2f6ad49fc08b..7251aabc6af2 100644
--- a/clang/lib/Sema/SemaLookup.cpp
+++ b/clang/lib/Sema/SemaLookup.cpp
@@ -1282,6 +1282,7 @@ bool Sema::CppLookupName(LookupResult &R, Scope *S) {
       if (DeclContext *DC = PreS->getEntity())
         DeclareImplicitMemberFunctionsWithName(*this, Name, R.getNameLoc(), DC);
   }
+
   // C++23 [temp.dep.general]p2:
   //   The component name of an unqualified-id is dependent if
   //   - it is a conversion-function-id whose conversion-type-id
@@ -1294,20 +1295,6 @@ bool Sema::CppLookupName(LookupResult &R, Scope *S) {
     return false;
   }
 
-  // If this is the name of an implicitly-declared special member function,
-  // go through the scope stack to implicitly declare
-  if (isImplicitlyDeclaredMemberFunctionName(Name)) {
-    for (Scope *PreS = S; PreS; PreS = PreS->getParent())
-      if (DeclContext *DC = PreS->getEntity()) {
-        if (DC->isDependentContext() && isa<CXXRecordDecl>(DC) &&
-            Name.getCXXOverloadedOperator() == OO_Equal) {
-          R.setNotFoundInCurrentInstantiation();
-          return false;
-        }
-        DeclareImplicitMemberFunctionsWithName(*this, Name, R.getNameLoc(), DC);
-      }
-  }
-
   // Implicitly declare member functions with the name we're looking for, if in
   // fact we are in a scope where it matters.
 
@@ -2485,10 +2472,8 @@ bool Sema::LookupQualifiedName(LookupResult &R, DeclContext *LookupCtx,
     //     is operator=, or
     //   - [...]
     if (DeclarationName Name = R.getLookupName();
-        (Name.getNameKind() == DeclarationName::CXXConversionFunctionName &&
-         Name.getCXXNameType()->isDependentType()) ||
-        (Name.getCXXOverloadedOperator() == OO_Equal && LookupRec &&
-         LookupRec->isDependentContext())) {
+        Name.getNameKind() == DeclarationName::CXXConversionFunctionName &&
+        Name.getCXXNameType()->isDependentType()) {
       R.setNotFoundInCurrentInstantiation();
       return false;
     }
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index 3ea81e0497c2..656d30947a8d 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -16,6 +16,7 @@
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Basic/OpenACCKinds.h"
 #include "clang/Sema/Sema.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Casting.h"
 
 using namespace clang;
@@ -103,6 +104,16 @@ bool doesClauseApplyToDirective(OpenACCDirectiveKind DirectiveKind,
     default:
       return false;
     }
+  case OpenACCClauseKind::FirstPrivate:
+    switch (DirectiveKind) {
+    case OpenACCDirectiveKind::Parallel:
+    case OpenACCDirectiveKind::Serial:
+    case OpenACCDirectiveKind::ParallelLoop:
+    case OpenACCDirectiveKind::SerialLoop:
+      return true;
+    default:
+      return false;
+    }
   case OpenACCClauseKind::Private:
     switch (DirectiveKind) {
     case OpenACCDirectiveKind::Parallel:
@@ -115,6 +126,113 @@ bool doesClauseApplyToDirective(OpenACCDirectiveKind DirectiveKind,
     default:
       return false;
     }
+  case OpenACCClauseKind::NoCreate:
+    switch (DirectiveKind) {
+    case OpenACCDirectiveKind::Parallel:
+    case OpenACCDirectiveKind::Serial:
+    case OpenACCDirectiveKind::Kernels:
+    case OpenACCDirectiveKind::Data:
+    case OpenACCDirectiveKind::ParallelLoop:
+    case OpenACCDirectiveKind::SerialLoop:
+    case OpenACCDirectiveKind::KernelsLoop:
+      return true;
+    default:
+      return false;
+    }
+  case OpenACCClauseKind::Present:
+    switch (DirectiveKind) {
+    case OpenACCDirectiveKind::Parallel:
+    case OpenACCDirectiveKind::Serial:
+    case OpenACCDirectiveKind::Kernels:
+    case OpenACCDirectiveKind::Data:
+    case OpenACCDirectiveKind::Declare:
+    case OpenACCDirectiveKind::ParallelLoop:
+    case OpenACCDirectiveKind::SerialLoop:
+    case OpenACCDirectiveKind::KernelsLoop:
+      return true;
+    default:
+      return false;
+    }
+
+  case OpenACCClauseKind::Copy:
+  case OpenACCClauseKind::PCopy:
+  case OpenACCClauseKind::PresentOrCopy:
+    switch (DirectiveKind) {
+    case OpenACCDirectiveKind::Parallel:
+    case OpenACCDirectiveKind::Serial:
+    case OpenACCDirectiveKind::Kernels:
+    case OpenACCDirectiveKind::Data:
+    case OpenACCDirectiveKind::Declare:
+    case OpenACCDirectiveKind::ParallelLoop:
+    case OpenACCDirectiveKind::SerialLoop:
+    case OpenACCDirectiveKind::KernelsLoop:
+      return true;
+    default:
+      return false;
+    }
+  case OpenACCClauseKind::Attach:
+    switch (DirectiveKind) {
+    case OpenACCDirectiveKind::Parallel:
+    case OpenACCDirectiveKind::Serial:
+    case OpenACCDirectiveKind::Kernels:
+    case OpenACCDirectiveKind::Data:
+    case OpenACCDirectiveKind::EnterData:
+    case OpenACCDirectiveKind::ParallelLoop:
+    case OpenACCDirectiveKind::SerialLoop:
+    case OpenACCDirectiveKind::KernelsLoop:
+      return true;
+    default:
+      return false;
+    }
+  case OpenACCClauseKind::DevicePtr:
+    switch (DirectiveKind) {
+    case OpenACCDirectiveKind::Parallel:
+    case OpenACCDirectiveKind::Serial:
+    case OpenACCDirectiveKind::Kernels:
+    case OpenACCDirectiveKind::Data:
+    case OpenACCDirectiveKind::Declare:
+    case OpenACCDirectiveKind::ParallelLoop:
+    case OpenACCDirectiveKind::SerialLoop:
+    case OpenACCDirectiveKind::KernelsLoop:
+      return true;
+    default:
+      return false;
+    }
+  case OpenACCClauseKind::Async:
+    switch (DirectiveKind) {
+    case OpenACCDirectiveKind::Parallel:
+    case OpenACCDirectiveKind::Serial:
+    case OpenACCDirectiveKind::Kernels:
+    case OpenACCDirectiveKind::Data:
+    case OpenACCDirectiveKind::EnterData:
+    case OpenACCDirectiveKind::ExitData:
+    case OpenACCDirectiveKind::Set:
+    case OpenACCDirectiveKind::Update:
+    case OpenACCDirectiveKind::Wait:
+    case OpenACCDirectiveKind::ParallelLoop:
+    case OpenACCDirectiveKind::SerialLoop:
+    case OpenACCDirectiveKind::KernelsLoop:
+      return true;
+    default:
+      return false;
+    }
+  case OpenACCClauseKind::Wait:
+    switch (DirectiveKind) {
+    case OpenACCDirectiveKind::Parallel:
+    case OpenACCDirectiveKind::Serial:
+    case OpenACCDirectiveKind::Kernels:
+    case OpenACCDirectiveKind::Data:
+    case OpenACCDirectiveKind::EnterData:
+    case OpenACCDirectiveKind::ExitData:
+    case OpenACCDirectiveKind::Update:
+    case OpenACCDirectiveKind::ParallelLoop:
+    case OpenACCDirectiveKind::SerialLoop:
+    case OpenACCDirectiveKind::KernelsLoop:
+      return true;
+    default:
+      return false;
+    }
+
   default:
     // Do nothing so we can go to the 'unimplemented' diagnostic instead.
     return true;
@@ -315,6 +433,27 @@ SemaOpenACC::ActOnClause(ArrayRef<const OpenACCClause *> ExistingClauses,
         getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(),
         Clause.getIntExprs()[0], Clause.getEndLoc());
   }
+  case OpenACCClauseKind::Async: {
+    // Restrictions only properly implemented on 'compute' constructs, and
+    // 'compute' constructs are the only construct that can do anything with
+    // this yet, so skip/treat as unimplemented in this case.
+    if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()))
+      break;
+
+    // There is no prose in the standard that says duplicates aren't allowed,
+    // but this diagnostic is present in other compilers, as well as makes
+    // sense.
+    if (checkAlreadyHasClauseOfKind(*this, ExistingClauses, Clause))
+      return nullptr;
+
+    assert(Clause.getNumIntExprs() < 2 &&
+           "Invalid number of expressions for Async");
+
+    return OpenACCAsyncClause::Create(
+        getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(),
+        Clause.getNumIntExprs() != 0 ? Clause.getIntExprs()[0] : nullptr,
+        Clause.getEndLoc());
+  }
   case OpenACCClauseKind::Private: {
     // Restrictions only properly implemented on 'compute' constructs, and
     // 'compute' constructs are the only construct that can do anything with
@@ -330,6 +469,188 @@ SemaOpenACC::ActOnClause(ArrayRef<const OpenACCClause *> ExistingClauses,
         getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(),
         Clause.getVarList(), Clause.getEndLoc());
   }
+  case OpenACCClauseKind::FirstPrivate: {
+    // Restrictions only properly implemented on 'compute' constructs, and
+    // 'compute' constructs are the only construct that can do anything with
+    // this yet, so skip/treat as unimplemented in this case.
+    if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()))
+      break;
+
+    // ActOnVar ensured that everything is a valid variable reference, so there
+    // really isn't anything to do here. GCC does some duplicate-finding, though
+    // it isn't apparent in the standard where this is justified.
+
+    return OpenACCFirstPrivateClause::Create(
+        getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(),
+        Clause.getVarList(), Clause.getEndLoc());
+  }
+  case OpenACCClauseKind::NoCreate: {
+    // Restrictions only properly implemented on 'compute' constructs, and
+    // 'compute' constructs are the only construct that can do anything with
+    // this yet, so skip/treat as unimplemented in this case.
+    if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()))
+      break;
+
+    // ActOnVar ensured that everything is a valid variable reference, so there
+    // really isn't anything to do here. GCC does some duplicate-finding, though
+    // it isn't apparent in the standard where this is justified.
+
+    return OpenACCNoCreateClause::Create(
+        getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(),
+        Clause.getVarList(), Clause.getEndLoc());
+  }
+  case OpenACCClauseKind::Present: {
+    // Restrictions only properly implemented on 'compute' constructs, and
+    // 'compute' constructs are the only construct that can do anything with
+    // this yet, so skip/treat as unimplemented in this case.
+    if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()))
+      break;
+
+    // ActOnVar ensured that everything is a valid variable reference, so there
+    // really isn't anything to do here. GCC does some duplicate-finding, though
+    // it isn't apparent in the standard where this is justified.
+
+    return OpenACCPresentClause::Create(
+        getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(),
+        Clause.getVarList(), Clause.getEndLoc());
+  }
+  case OpenACCClauseKind::PresentOrCopy:
+  case OpenACCClauseKind::PCopy:
+    Diag(Clause.getBeginLoc(), diag::warn_acc_deprecated_alias_name)
+        << Clause.getClauseKind() << OpenACCClauseKind::Copy;
+    LLVM_FALLTHROUGH;
+  case OpenACCClauseKind::Copy: {
+    // Restrictions only properly implemented on 'compute' constructs, and
+    // 'compute' constructs are the only construct that can do anything with
+    // this yet, so skip/treat as unimplemented in this case.
+    if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()))
+      break;
+
+    // ActOnVar ensured that everything is a valid variable reference, so there
+    // really isn't anything to do here. GCC does some duplicate-finding, though
+    // it isn't apparent in the standard where this is justified.
+
+    return OpenACCCopyClause::Create(
+        getASTContext(), Clause.getClauseKind(), Clause.getBeginLoc(),
+        Clause.getLParenLoc(), Clause.getVarList(), Clause.getEndLoc());
+  }
+  case OpenACCClauseKind::PresentOrCopyIn:
+  case OpenACCClauseKind::PCopyIn:
+    Diag(Clause.getBeginLoc(), diag::warn_acc_deprecated_alias_name)
+        << Clause.getClauseKind() << OpenACCClauseKind::CopyIn;
+    LLVM_FALLTHROUGH;
+  case OpenACCClauseKind::CopyIn: {
+    // Restrictions only properly implemented on 'compute' constructs, and
+    // 'compute' constructs are the only construct that can do anything with
+    // this yet, so skip/treat as unimplemented in this case.
+    if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()))
+      break;
+
+    // ActOnVar ensured that everything is a valid variable reference, so there
+    // really isn't anything to do here. GCC does some duplicate-finding, though
+    // it isn't apparent in the standard where this is justified.
+
+    return OpenACCCopyInClause::Create(
+        getASTContext(), Clause.getClauseKind(), Clause.getBeginLoc(),
+        Clause.getLParenLoc(), Clause.isReadOnly(), Clause.getVarList(),
+        Clause.getEndLoc());
+  }
+  case OpenACCClauseKind::PresentOrCopyOut:
+  case OpenACCClauseKind::PCopyOut:
+    Diag(Clause.getBeginLoc(), diag::warn_acc_deprecated_alias_name)
+        << Clause.getClauseKind() << OpenACCClauseKind::CopyOut;
+    LLVM_FALLTHROUGH;
+  case OpenACCClauseKind::CopyOut: {
+    // Restrictions only properly implemented on 'compute' constructs, and
+    // 'compute' constructs are the only construct that can do anything with
+    // this yet, so skip/treat as unimplemented in this case.
+    if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()))
+      break;
+
+    // ActOnVar ensured that everything is a valid variable reference, so there
+    // really isn't anything to do here. GCC does some duplicate-finding, though
+    // it isn't apparent in the standard where this is justified.
+
+    return OpenACCCopyOutClause::Create(
+        getASTContext(), Clause.getClauseKind(), Clause.getBeginLoc(),
+        Clause.getLParenLoc(), Clause.isZero(), Clause.getVarList(),
+        Clause.getEndLoc());
+  }
+  case OpenACCClauseKind::PresentOrCreate:
+  case OpenACCClauseKind::PCreate:
+    Diag(Clause.getBeginLoc(), diag::warn_acc_deprecated_alias_name)
+        << Clause.getClauseKind() << OpenACCClauseKind::Create;
+    LLVM_FALLTHROUGH;
+  case OpenACCClauseKind::Create: {
+    // Restrictions only properly implemented on 'compute' constructs, and
+    // 'compute' constructs are the only construct that can do anything with
+    // this yet, so skip/treat as unimplemented in this case.
+    if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()))
+      break;
+
+    // ActOnVar ensured that everything is a valid variable reference, so there
+    // really isn't anything to do here. GCC does some duplicate-finding, though
+    // it isn't apparent in the standard where this is justified.
+
+    return OpenACCCreateClause::Create(getASTContext(), Clause.getClauseKind(),
+                                       Clause.getBeginLoc(),
+                                       Clause.getLParenLoc(), Clause.isZero(),
+                                       Clause.getVarList(), Clause.getEndLoc());
+  }
+  case OpenACCClauseKind::Attach: {
+    // Restrictions only properly implemented on 'compute' constructs, and
+    // 'compute' constructs are the only construct that can do anything with
+    // this yet, so skip/treat as unimplemented in this case.
+    if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()))
+      break;
+
+    // ActOnVar ensured that everything is a valid variable reference, but we
+    // still have to make sure it is a pointer type.
+    llvm::SmallVector<Expr *> VarList{Clause.getVarList().begin(),
+                                      Clause.getVarList().end()};
+    VarList.erase(std::remove_if(VarList.begin(), VarList.end(), [&](Expr *E) {
+      return CheckVarIsPointerType(OpenACCClauseKind::Attach, E);
+    }), VarList.end());
+    Clause.setVarListDetails(VarList,
+                             /*IsReadOnly=*/false, /*IsZero=*/false);
+
+    return OpenACCAttachClause::Create(getASTContext(), Clause.getBeginLoc(),
+                                       Clause.getLParenLoc(),
+                                       Clause.getVarList(), Clause.getEndLoc());
+  }
+  case OpenACCClauseKind::DevicePtr: {
+    // Restrictions only properly implemented on 'compute' constructs, and
+    // 'compute' constructs are the only construct that can do anything with
+    // this yet, so skip/treat as unimplemented in this case.
+    if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()))
+      break;
+
+    // ActOnVar ensured that everything is a valid variable reference, but we
+    // still have to make sure it is a pointer type.
+    llvm::SmallVector<Expr *> VarList{Clause.getVarList().begin(),
+                                      Clause.getVarList().end()};
+    VarList.erase(std::remove_if(VarList.begin(), VarList.end(), [&](Expr *E) {
+      return CheckVarIsPointerType(OpenACCClauseKind::DevicePtr, E);
+    }), VarList.end());
+    Clause.setVarListDetails(VarList,
+                             /*IsReadOnly=*/false, /*IsZero=*/false);
+
+    return OpenACCDevicePtrClause::Create(
+        getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(),
+        Clause.getVarList(), Clause.getEndLoc());
+  }
+  case OpenACCClauseKind::Wait: {
+    // Restrictions only properly implemented on 'compute' constructs, and
+    // 'compute' constructs are the only construct that can do anything with
+    // this yet, so skip/treat as unimplemented in this case.
+    if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()))
+      break;
+
+    return OpenACCWaitClause::Create(
+        getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(),
+        Clause.getDevNumExpr(), Clause.getQueuesLoc(), Clause.getQueueIdExprs(),
+        Clause.getEndLoc());
+  }
   default:
     break;
   }
@@ -367,7 +688,9 @@ ExprResult SemaOpenACC::ActOnIntExpr(OpenACCDirectiveKind DK,
   assert(((DK != OpenACCDirectiveKind::Invalid &&
            CK == OpenACCClauseKind::Invalid) ||
           (DK == OpenACCDirectiveKind::Invalid &&
-           CK != OpenACCClauseKind::Invalid)) &&
+           CK != OpenACCClauseKind::Invalid) ||
+          (DK == OpenACCDirectiveKind::Invalid &&
+           CK == OpenACCClauseKind::Invalid)) &&
          "Only one of directive or clause kind should be provided");
 
   class IntExprConverter : public Sema::ICEConvertDiagnoser {
@@ -375,6 +698,16 @@ ExprResult SemaOpenACC::ActOnIntExpr(OpenACCDirectiveKind DK,
     OpenACCClauseKind ClauseKind;
     Expr *IntExpr;
 
+    // gets the index into the diagnostics so we can use this for clauses,
+    // directives, and sub array.s
+    unsigned getDiagKind() const {
+      if (ClauseKind != OpenACCClauseKind::Invalid)
+        return 0;
+      if (DirectiveKind != OpenACCDirectiveKind::Invalid)
+        return 1;
+      return 2;
+    }
+
   public:
     IntExprConverter(OpenACCDirectiveKind DK, OpenACCClauseKind CK,
                      Expr *IntExpr)
@@ -390,12 +723,8 @@ ExprResult SemaOpenACC::ActOnIntExpr(OpenACCDirectiveKind DK,
     }
     SemaBase::SemaDiagnosticBuilder diagnoseNotInt(Sema &S, SourceLocation Loc,
                                                    QualType T) override {
-      if (ClauseKind != OpenACCClauseKind::Invalid)
-        return S.Diag(Loc, diag::err_acc_int_expr_requires_integer) <<
-               /*Clause=*/0 << ClauseKind << T;
-
-      return S.Diag(Loc, diag::err_acc_int_expr_requires_integer) <<
-             /*Directive=*/1 << DirectiveKind << T;
+      return S.Diag(Loc, diag::err_acc_int_expr_requires_integer)
+             << getDiagKind() << ClauseKind << DirectiveKind << T;
     }
 
     SemaBase::SemaDiagnosticBuilder
@@ -450,6 +779,36 @@ ExprResult SemaOpenACC::ActOnIntExpr(OpenACCDirectiveKind DK,
   return IntExpr;
 }
 
+bool SemaOpenACC::CheckVarIsPointerType(OpenACCClauseKind ClauseKind,
+                                        Expr *VarExpr) {
+  // We already know that VarExpr is a proper reference to a variable, so we
+  // should be able to just take the type of the expression to get the type of
+  // the referenced variable.
+
+  // We've already seen an error, don't diagnose anything else.
+  if (!VarExpr || VarExpr->containsErrors())
+    return false;
+
+  if (isa<ArraySectionExpr>(VarExpr->IgnoreParenImpCasts()) ||
+      VarExpr->hasPlaceholderType(BuiltinType::ArraySection)) {
+    Diag(VarExpr->getExprLoc(), diag::err_array_section_use) << /*OpenACC=*/0;
+    Diag(VarExpr->getExprLoc(), diag::note_acc_expected_pointer_var);
+    return true;
+  }
+
+  QualType Ty = VarExpr->getType();
+  Ty = Ty.getNonReferenceType().getUnqualifiedType();
+
+  // Nothing we can do if this is a dependent type.
+  if (Ty->isDependentType())
+    return false;
+
+  if (!Ty->isPointerType())
+    return Diag(VarExpr->getExprLoc(), diag::err_acc_var_not_pointer_type)
+           << ClauseKind << Ty;
+  return false;
+}
+
 ExprResult SemaOpenACC::ActOnVar(Expr *VarExpr) {
   // We still need to retain the array subscript/subarray exprs, so work on a
   // copy.
@@ -503,12 +862,211 @@ ExprResult SemaOpenACC::ActOnArraySectionExpr(Expr *Base, SourceLocation LBLoc,
                                               SourceLocation RBLoc) {
   ASTContext &Context = getASTContext();
 
-  // TODO OpenACC: We likely have to reproduce a lot of the same logic from the
-  // OMP version of this, but at the moment we don't have a good way to test it,
-  // so for now we'll just create the node.
+  // Handle placeholders.
+  if (Base->hasPlaceholderType() &&
+      !Base->hasPlaceholderType(BuiltinType::ArraySection)) {
+    ExprResult Result = SemaRef.CheckPlaceholderExpr(Base);
+    if (Result.isInvalid())
+      return ExprError();
+    Base = Result.get();
+  }
+  if (LowerBound && LowerBound->getType()->isNonOverloadPlaceholderType()) {
+    ExprResult Result = SemaRef.CheckPlaceholderExpr(LowerBound);
+    if (Result.isInvalid())
+      return ExprError();
+    Result = SemaRef.DefaultLvalueConversion(Result.get());
+    if (Result.isInvalid())
+      return ExprError();
+    LowerBound = Result.get();
+  }
+  if (Length && Length->getType()->isNonOverloadPlaceholderType()) {
+    ExprResult Result = SemaRef.CheckPlaceholderExpr(Length);
+    if (Result.isInvalid())
+      return ExprError();
+    Result = SemaRef.DefaultLvalueConversion(Result.get());
+    if (Result.isInvalid())
+      return ExprError();
+    Length = Result.get();
+  }
+
+  // Check the 'base' value, it must be an array or pointer type, and not to/of
+  // a function type.
+  QualType OriginalBaseTy = ArraySectionExpr::getBaseOriginalType(Base);
+  QualType ResultTy;
+  if (!Base->isTypeDependent()) {
+    if (OriginalBaseTy->isAnyPointerType()) {
+      ResultTy = OriginalBaseTy->getPointeeType();
+    } else if (OriginalBaseTy->isArrayType()) {
+      ResultTy = OriginalBaseTy->getAsArrayTypeUnsafe()->getElementType();
+    } else {
+      return ExprError(
+          Diag(Base->getExprLoc(), diag::err_acc_typecheck_subarray_value)
+          << Base->getSourceRange());
+    }
+
+    if (ResultTy->isFunctionType()) {
+      Diag(Base->getExprLoc(), diag::err_acc_subarray_function_type)
+          << ResultTy << Base->getSourceRange();
+      return ExprError();
+    }
+
+    if (SemaRef.RequireCompleteType(Base->getExprLoc(), ResultTy,
+                                    diag::err_acc_subarray_incomplete_type,
+                                    Base))
+      return ExprError();
+
+    if (!Base->hasPlaceholderType(BuiltinType::ArraySection)) {
+      ExprResult Result = SemaRef.DefaultFunctionArrayLvalueConversion(Base);
+      if (Result.isInvalid())
+        return ExprError();
+      Base = Result.get();
+    }
+  }
+
+  auto GetRecovery = [&](Expr *E, QualType Ty) {
+    ExprResult Recovery =
+        SemaRef.CreateRecoveryExpr(E->getBeginLoc(), E->getEndLoc(), E, Ty);
+    return Recovery.isUsable() ? Recovery.get() : nullptr;
+  };
+
+  // Ensure both of the expressions are int-exprs.
+  if (LowerBound && !LowerBound->isTypeDependent()) {
+    ExprResult LBRes =
+        ActOnIntExpr(OpenACCDirectiveKind::Invalid, OpenACCClauseKind::Invalid,
+                     LowerBound->getExprLoc(), LowerBound);
+
+    if (LBRes.isUsable())
+      LBRes = SemaRef.DefaultLvalueConversion(LBRes.get());
+    LowerBound =
+        LBRes.isUsable() ? LBRes.get() : GetRecovery(LowerBound, Context.IntTy);
+  }
+
+  if (Length && !Length->isTypeDependent()) {
+    ExprResult LenRes =
+        ActOnIntExpr(OpenACCDirectiveKind::Invalid, OpenACCClauseKind::Invalid,
+                     Length->getExprLoc(), Length);
+
+    if (LenRes.isUsable())
+      LenRes = SemaRef.DefaultLvalueConversion(LenRes.get());
+    Length =
+        LenRes.isUsable() ? LenRes.get() : GetRecovery(Length, Context.IntTy);
+  }
+
+  // Length is required if the base type is not an array of known bounds.
+  if (!Length && (OriginalBaseTy.isNull() ||
+                  (!OriginalBaseTy->isDependentType() &&
+                   !OriginalBaseTy->isConstantArrayType() &&
+                   !OriginalBaseTy->isDependentSizedArrayType()))) {
+    bool IsArray = !OriginalBaseTy.isNull() && OriginalBaseTy->isArrayType();
+    Diag(ColonLoc, diag::err_acc_subarray_no_length) << IsArray;
+    // Fill in a dummy 'length' so that when we instantiate this we don't
+    // double-diagnose here.
+    ExprResult Recovery = SemaRef.CreateRecoveryExpr(
+        ColonLoc, SourceLocation(), ArrayRef<Expr *>{std::nullopt},
+        Context.IntTy);
+    Length = Recovery.isUsable() ? Recovery.get() : nullptr;
+  }
+
+  // Check the values of each of the arguments, they cannot be negative(we
+  // assume), and if the array bound is known, must be within range. As we do
+  // so, do our best to continue with evaluation, we can set the
+  // value/expression to nullptr/nullopt if they are invalid, and treat them as
+  // not present for the rest of evaluation.
+
+  // We don't have to check for dependence, because the dependent size is
+  // represented as a different AST node.
+  std::optional<llvm::APSInt> BaseSize;
+  if (!OriginalBaseTy.isNull() && OriginalBaseTy->isConstantArrayType()) {
+    const auto *ArrayTy = Context.getAsConstantArrayType(OriginalBaseTy);
+    BaseSize = ArrayTy->getSize();
+  }
+
+  auto GetBoundValue = [&](Expr *E) -> std::optional<llvm::APSInt> {
+    if (!E || E->isInstantiationDependent())
+      return std::nullopt;
+
+    Expr::EvalResult Res;
+    if (!E->EvaluateAsInt(Res, Context))
+      return std::nullopt;
+    return Res.Val.getInt();
+  };
+
+  std::optional<llvm::APSInt> LowerBoundValue = GetBoundValue(LowerBound);
+  std::optional<llvm::APSInt> LengthValue = GetBoundValue(Length);
+
+  // Check lower bound for negative or out of range.
+  if (LowerBoundValue.has_value()) {
+    if (LowerBoundValue->isNegative()) {
+      Diag(LowerBound->getExprLoc(), diag::err_acc_subarray_negative)
+          << /*LowerBound=*/0 << toString(*LowerBoundValue, /*Radix=*/10);
+      LowerBoundValue.reset();
+      LowerBound = GetRecovery(LowerBound, LowerBound->getType());
+    } else if (BaseSize.has_value() &&
+               llvm::APSInt::compareValues(*LowerBoundValue, *BaseSize) >= 0) {
+      // Lower bound (start index) must be less than the size of the array.
+      Diag(LowerBound->getExprLoc(), diag::err_acc_subarray_out_of_range)
+          << /*LowerBound=*/0 << toString(*LowerBoundValue, /*Radix=*/10)
+          << toString(*BaseSize, /*Radix=*/10);
+      LowerBoundValue.reset();
+      LowerBound = GetRecovery(LowerBound, LowerBound->getType());
+    }
+  }
+
+  // Check length for negative or out of range.
+  if (LengthValue.has_value()) {
+    if (LengthValue->isNegative()) {
+      Diag(Length->getExprLoc(), diag::err_acc_subarray_negative)
+          << /*Length=*/1 << toString(*LengthValue, /*Radix=*/10);
+      LengthValue.reset();
+      Length = GetRecovery(Length, Length->getType());
+    } else if (BaseSize.has_value() &&
+               llvm::APSInt::compareValues(*LengthValue, *BaseSize) > 0) {
+      // Length must be lessthan or EQUAL to the size of the array.
+      Diag(Length->getExprLoc(), diag::err_acc_subarray_out_of_range)
+          << /*Length=*/1 << toString(*LengthValue, /*Radix=*/10)
+          << toString(*BaseSize, /*Radix=*/10);
+      LengthValue.reset();
+      Length = GetRecovery(Length, Length->getType());
+    }
+  }
+
+  // Adding two APSInts requires matching sign, so extract that here.
+  auto AddAPSInt = [](llvm::APSInt LHS, llvm::APSInt RHS) -> llvm::APSInt {
+    if (LHS.isSigned() == RHS.isSigned())
+      return LHS + RHS;
+
+    unsigned Width = std::max(LHS.getBitWidth(), RHS.getBitWidth()) + 1;
+    return llvm::APSInt(LHS.sext(Width) + RHS.sext(Width), /*Signed=*/true);
+  };
+
+  // If we know all 3 values, we can diagnose that the total value would be out
+  // of range.
+  if (BaseSize.has_value() && LowerBoundValue.has_value() &&
+      LengthValue.has_value() &&
+      llvm::APSInt::compareValues(AddAPSInt(*LowerBoundValue, *LengthValue),
+                                  *BaseSize) > 0) {
+    Diag(Base->getExprLoc(),
+         diag::err_acc_subarray_base_plus_length_out_of_range)
+        << toString(*LowerBoundValue, /*Radix=*/10)
+        << toString(*LengthValue, /*Radix=*/10)
+        << toString(*BaseSize, /*Radix=*/10);
+
+    LowerBoundValue.reset();
+    LowerBound = GetRecovery(LowerBound, LowerBound->getType());
+    LengthValue.reset();
+    Length = GetRecovery(Length, Length->getType());
+  }
+
+  // If any part of the expression is dependent, return a dependent sub-array.
+  QualType ArrayExprTy = Context.ArraySectionTy;
+  if (Base->isTypeDependent() ||
+      (LowerBound && LowerBound->isInstantiationDependent()) ||
+      (Length && Length->isInstantiationDependent()))
+    ArrayExprTy = Context.DependentTy;
+
   return new (Context)
-      ArraySectionExpr(Base, LowerBound, Length, Context.ArraySectionTy,
-                       VK_LValue, OK_Ordinary, ColonLoc, RBLoc);
+      ArraySectionExpr(Base, LowerBound, Length, ArrayExprTy, VK_LValue,
+                       OK_Ordinary, ColonLoc, RBLoc);
 }
 
 bool SemaOpenACC::ActOnStartStmtDirective(OpenACCDirectiveKind K,
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index cf5447f223d4..7d00cf6fb5b6 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -15109,13 +15109,13 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
                                                 SourceLocation StartLoc,
                                                 SourceLocation EndLoc) {
   ASTContext &Context = getASTContext();
-  auto SizesClauses =
-      OMPExecutableDirective::getClausesOfKind<OMPSizesClause>(Clauses);
-  if (SizesClauses.empty()) {
-    // A missing 'sizes' clause is already reported by the parser.
+  Scope *CurScope = SemaRef.getCurScope();
+
+  const auto *SizesClause =
+      OMPExecutableDirective::getSingleClause<OMPSizesClause>(Clauses);
+  if (!SizesClause ||
+      llvm::any_of(SizesClause->getSizesRefs(), [](Expr *E) { return !E; }))
     return StmtError();
-  }
-  const OMPSizesClause *SizesClause = *SizesClauses.begin();
   unsigned NumLoops = SizesClause->getNumSizes();
 
   // Empty statement should only be possible if there already was an error.
@@ -15136,7 +15136,15 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
     return OMPTileDirective::Create(Context, StartLoc, EndLoc, Clauses,
                                     NumLoops, AStmt, nullptr, nullptr);
 
+  assert(LoopHelpers.size() == NumLoops &&
+         "Expecting loop iteration space dimensionality to match number of "
+         "affected loops");
+  assert(OriginalInits.size() == NumLoops &&
+         "Expecting loop iteration space dimensionality to match number of "
+         "affected loops");
+
   SmallVector<Decl *, 4> PreInits;
+  CaptureVars CopyTransformer(SemaRef);
 
   // Create iteration variables for the generated loops.
   SmallVector<VarDecl *, 4> FloorIndVars;
@@ -15194,25 +15202,69 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
   // Once the original iteration values are set, append the innermost body.
   Stmt *Inner = Body;
 
+  auto MakeDimTileSize = [&SemaRef = this->SemaRef, &CopyTransformer, &Context,
+                          SizesClause, CurScope](int I) -> Expr * {
+    Expr *DimTileSizeExpr = SizesClause->getSizesRefs()[I];
+    if (isa<ConstantExpr>(DimTileSizeExpr))
+      return AssertSuccess(CopyTransformer.TransformExpr(DimTileSizeExpr));
+
+    // When the tile size is not a constant but a variable, it is possible to
+    // pass non-positive numbers. For instance:
+    // \code{c}
+    //   int a = 0;
+    //   #pragma omp tile sizes(a)
+    //   for (int i = 0; i < 42; ++i)
+    //     body(i);
+    // \endcode
+    // Although there is no meaningful interpretation of the tile size, the body
+    // should still be executed 42 times to avoid surprises. To preserve the
+    // invariant that every loop iteration is executed exactly once and not
+    // cause an infinite loop, apply a minimum tile size of one.
+    // Build expr:
+    // \code{c}
+    //   (TS <= 0) ? 1 : TS
+    // \endcode
+    QualType DimTy = DimTileSizeExpr->getType();
+    uint64_t DimWidth = Context.getTypeSize(DimTy);
+    IntegerLiteral *Zero = IntegerLiteral::Create(
+        Context, llvm::APInt::getZero(DimWidth), DimTy, {});
+    IntegerLiteral *One =
+        IntegerLiteral::Create(Context, llvm::APInt(DimWidth, 1), DimTy, {});
+    Expr *Cond = AssertSuccess(SemaRef.BuildBinOp(
+        CurScope, {}, BO_LE,
+        AssertSuccess(CopyTransformer.TransformExpr(DimTileSizeExpr)), Zero));
+    Expr *MinOne = new (Context) ConditionalOperator(
+        Cond, {}, One, {},
+        AssertSuccess(CopyTransformer.TransformExpr(DimTileSizeExpr)), DimTy,
+        VK_PRValue, OK_Ordinary);
+    return MinOne;
+  };
+
   // Create tile loops from the inside to the outside.
   for (int I = NumLoops - 1; I >= 0; --I) {
     OMPLoopBasedDirective::HelperExprs &LoopHelper = LoopHelpers[I];
     Expr *NumIterations = LoopHelper.NumIterations;
     auto *OrigCntVar = cast<DeclRefExpr>(LoopHelper.Counters[0]);
     QualType CntTy = OrigCntVar->getType();
-    Expr *DimTileSize = SizesClause->getSizesRefs()[I];
-    Scope *CurScope = SemaRef.getCurScope();
 
-    // Commonly used variables.
-    DeclRefExpr *TileIV = buildDeclRefExpr(SemaRef, TileIndVars[I], CntTy,
-                                           OrigCntVar->getExprLoc());
-    DeclRefExpr *FloorIV = buildDeclRefExpr(SemaRef, FloorIndVars[I], CntTy,
-                                            OrigCntVar->getExprLoc());
+    // Commonly used variables. One of the constraints of an AST is that every
+    // node object must appear at most once, hence we define lamdas that create
+    // a new AST node at every use.
+    auto MakeTileIVRef = [&SemaRef = this->SemaRef, &TileIndVars, I, CntTy,
+                          OrigCntVar]() {
+      return buildDeclRefExpr(SemaRef, TileIndVars[I], CntTy,
+                              OrigCntVar->getExprLoc());
+    };
+    auto MakeFloorIVRef = [&SemaRef = this->SemaRef, &FloorIndVars, I, CntTy,
+                           OrigCntVar]() {
+      return buildDeclRefExpr(SemaRef, FloorIndVars[I], CntTy,
+                              OrigCntVar->getExprLoc());
+    };
 
     // For init-statement: auto .tile.iv = .floor.iv
-    SemaRef.AddInitializerToDecl(TileIndVars[I],
-                                 SemaRef.DefaultLvalueConversion(FloorIV).get(),
-                                 /*DirectInit=*/false);
+    SemaRef.AddInitializerToDecl(
+        TileIndVars[I], SemaRef.DefaultLvalueConversion(MakeFloorIVRef()).get(),
+        /*DirectInit=*/false);
     Decl *CounterDecl = TileIndVars[I];
     StmtResult InitStmt = new (Context)
         DeclStmt(DeclGroupRef::Create(Context, &CounterDecl, 1),
@@ -15220,10 +15272,11 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
     if (!InitStmt.isUsable())
       return StmtError();
 
-    // For cond-expression: .tile.iv < min(.floor.iv + DimTileSize,
-    // NumIterations)
-    ExprResult EndOfTile = SemaRef.BuildBinOp(
-        CurScope, LoopHelper.Cond->getExprLoc(), BO_Add, FloorIV, DimTileSize);
+    // For cond-expression:
+    //   .tile.iv < min(.floor.iv + DimTileSize, NumIterations)
+    ExprResult EndOfTile =
+        SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_Add,
+                           MakeFloorIVRef(), MakeDimTileSize(I));
     if (!EndOfTile.isUsable())
       return StmtError();
     ExprResult IsPartialTile =
@@ -15238,25 +15291,28 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
       return StmtError();
     ExprResult CondExpr =
         SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT,
-                           TileIV, MinTileAndIterSpace.get());
+                           MakeTileIVRef(), MinTileAndIterSpace.get());
     if (!CondExpr.isUsable())
       return StmtError();
 
     // For incr-statement: ++.tile.iv
     ExprResult IncrStmt = SemaRef.BuildUnaryOp(
-        CurScope, LoopHelper.Inc->getExprLoc(), UO_PreInc, TileIV);
+        CurScope, LoopHelper.Inc->getExprLoc(), UO_PreInc, MakeTileIVRef());
     if (!IncrStmt.isUsable())
       return StmtError();
 
     // Statements to set the original iteration variable's value from the
     // logical iteration number.
     // Generated for loop is:
+    // \code
     // Original_for_init;
-    // for (auto .tile.iv = .floor.iv; .tile.iv < min(.floor.iv + DimTileSize,
-    // NumIterations); ++.tile.iv) {
+    // for (auto .tile.iv = .floor.iv;
+    //      .tile.iv < min(.floor.iv + DimTileSize, NumIterations);
+    //      ++.tile.iv) {
     //   Original_Body;
     //   Original_counter_update;
     // }
+    // \endcode
     // FIXME: If the innermost body is an loop itself, inserting these
     // statements stops it being recognized  as a perfectly nested loop (e.g.
     // for applying tiling again). If this is the case, sink the expressions
@@ -15278,12 +15334,13 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
     Expr *NumIterations = LoopHelper.NumIterations;
     DeclRefExpr *OrigCntVar = cast<DeclRefExpr>(LoopHelper.Counters[0]);
     QualType CntTy = OrigCntVar->getType();
-    Expr *DimTileSize = SizesClause->getSizesRefs()[I];
-    Scope *CurScope = SemaRef.getCurScope();
 
     // Commonly used variables.
-    DeclRefExpr *FloorIV = buildDeclRefExpr(SemaRef, FloorIndVars[I], CntTy,
-                                            OrigCntVar->getExprLoc());
+    auto MakeFloorIVRef = [&SemaRef = this->SemaRef, &FloorIndVars, I, CntTy,
+                           OrigCntVar]() {
+      return buildDeclRefExpr(SemaRef, FloorIndVars[I], CntTy,
+                              OrigCntVar->getExprLoc());
+    };
 
     // For init-statement: auto .floor.iv = 0
     SemaRef.AddInitializerToDecl(
@@ -15298,15 +15355,16 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
       return StmtError();
 
     // For cond-expression: .floor.iv < NumIterations
-    ExprResult CondExpr = SemaRef.BuildBinOp(
-        CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, FloorIV, NumIterations);
+    ExprResult CondExpr =
+        SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT,
+                           MakeFloorIVRef(), NumIterations);
     if (!CondExpr.isUsable())
       return StmtError();
 
     // For incr-statement: .floor.iv += DimTileSize
     ExprResult IncrStmt =
         SemaRef.BuildBinOp(CurScope, LoopHelper.Inc->getExprLoc(), BO_AddAssign,
-                           FloorIV, DimTileSize);
+                           MakeFloorIVRef(), MakeDimTileSize(I));
     if (!IncrStmt.isUsable())
       return StmtError();
 
@@ -17407,16 +17465,53 @@ OMPClause *SemaOpenMP::ActOnOpenMPSizesClause(ArrayRef<Expr *> SizeExprs,
                                               SourceLocation StartLoc,
                                               SourceLocation LParenLoc,
                                               SourceLocation EndLoc) {
-  for (Expr *SizeExpr : SizeExprs) {
-    ExprResult NumForLoopsResult = VerifyPositiveIntegerConstantInClause(
-        SizeExpr, OMPC_sizes, /*StrictlyPositive=*/true);
-    if (!NumForLoopsResult.isUsable())
-      return nullptr;
+  SmallVector<Expr *> SanitizedSizeExprs(SizeExprs);
+
+  for (Expr *&SizeExpr : SanitizedSizeExprs) {
+    // Skip if already sanitized, e.g. during a partial template instantiation.
+    if (!SizeExpr)
+      continue;
+
+    bool IsValid = isNonNegativeIntegerValue(SizeExpr, SemaRef, OMPC_sizes,
+                                             /*StrictlyPositive=*/true);
+
+    // isNonNegativeIntegerValue returns true for non-integral types (but still
+    // emits error diagnostic), so check for the expected type explicitly.
+    QualType SizeTy = SizeExpr->getType();
+    if (!SizeTy->isIntegerType())
+      IsValid = false;
+
+    // Handling in templates is tricky. There are four possibilities to
+    // consider:
+    //
+    // 1a. The expression is valid and we are in a instantiated template or not
+    //     in a template:
+    //       Pass valid expression to be further analysed later in Sema.
+    // 1b. The expression is valid and we are in a template (including partial
+    //     instantiation):
+    //       isNonNegativeIntegerValue skipped any checks so there is no
+    //       guarantee it will be correct after instantiation.
+    //       ActOnOpenMPSizesClause will be called again at instantiation when
+    //       it is not in a dependent context anymore. This may cause warnings
+    //       to be emitted multiple times.
+    // 2a. The expression is invalid and we are in an instantiated template or
+    //     not in a template:
+    //       Invalidate the expression with a clearly wrong value (nullptr) so
+    //       later in Sema we do not have to do the same validity analysis again
+    //       or crash from unexpected data. Error diagnostics have already been
+    //       emitted.
+    // 2b. The expression is invalid and we are in a template (including partial
+    //     instantiation):
+    //       Pass the invalid expression as-is, template instantiation may
+    //       replace unexpected types/values with valid ones. The directives
+    //       with this clause must not try to use these expressions in dependent
+    //       contexts, but delay analysis until full instantiation.
+    if (!SizeExpr->isInstantiationDependent() && !IsValid)
+      SizeExpr = nullptr;
   }
 
-  DSAStack->setAssociatedLoops(SizeExprs.size());
   return OMPSizesClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc,
-                                SizeExprs);
+                                SanitizedSizeExprs);
 }
 
 OMPClause *SemaOpenMP::ActOnOpenMPFullClause(SourceLocation StartLoc,
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 04cd9e78739d..f173300b5c96 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -1303,6 +1303,8 @@ static bool IsOverloadOrOverrideImpl(Sema &SemaRef, FunctionDecl *New,
   if (New->isMSVCRTEntryPoint())
     return false;
 
+  NamedDecl *OldDecl = Old;
+  NamedDecl *NewDecl = New;
   FunctionTemplateDecl *OldTemplate = Old->getDescribedFunctionTemplate();
   FunctionTemplateDecl *NewTemplate = New->getDescribedFunctionTemplate();
 
@@ -1347,6 +1349,8 @@ static bool IsOverloadOrOverrideImpl(Sema &SemaRef, FunctionDecl *New,
   // references to non-instantiated entities during constraint substitution.
   // GH78101.
   if (NewTemplate) {
+    OldDecl = OldTemplate;
+    NewDecl = NewTemplate;
     // C++ [temp.over.link]p4:
     //   The signature of a function template consists of its function
     //   signature, its return type and its template parameter list. The names
@@ -1506,13 +1510,14 @@ static bool IsOverloadOrOverrideImpl(Sema &SemaRef, FunctionDecl *New,
     }
   }
 
-  if (!UseOverrideRules) {
+  if (!UseOverrideRules &&
+      New->getTemplateSpecializationKind() != TSK_ExplicitSpecialization) {
     Expr *NewRC = New->getTrailingRequiresClause(),
          *OldRC = Old->getTrailingRequiresClause();
     if ((NewRC != nullptr) != (OldRC != nullptr))
       return true;
-
-    if (NewRC && !SemaRef.AreConstraintExpressionsEqual(Old, OldRC, New, NewRC))
+    if (NewRC &&
+        !SemaRef.AreConstraintExpressionsEqual(OldDecl, OldRC, NewDecl, NewRC))
       return true;
   }
 
@@ -2587,7 +2592,8 @@ bool Sema::IsIntegralPromotion(Expr *From, QualType FromType, QualType ToType) {
 
   // In HLSL an rvalue of integral type can be promoted to an rvalue of a larger
   // integral type.
-  if (Context.getLangOpts().HLSL)
+  if (Context.getLangOpts().HLSL && FromType->isIntegerType() &&
+      ToType->isIntegerType())
     return Context.getTypeSize(FromType) < Context.getTypeSize(ToType);
 
   return false;
@@ -2616,6 +2622,13 @@ bool Sema::IsFloatingPointPromotion(QualType FromType, QualType ToType) {
            ToBuiltin->getKind() == BuiltinType::Ibm128))
         return true;
 
+      // In HLSL, `half` promotes to `float` or `double`, regardless of whether
+      // or not native half types are enabled.
+      if (getLangOpts().HLSL && FromBuiltin->getKind() == BuiltinType::Half &&
+          (ToBuiltin->getKind() == BuiltinType::Float ||
+           ToBuiltin->getKind() == BuiltinType::Double))
+        return true;
+
       // Half can be promoted to float.
       if (!getLangOpts().NativeHalfType &&
            FromBuiltin->getKind() == BuiltinType::Half &&
@@ -4393,6 +4406,24 @@ getFixedEnumPromtion(Sema &S, const StandardConversionSequence &SCS) {
   return FixedEnumPromotion::ToPromotedUnderlyingType;
 }
 
+static ImplicitConversionSequence::CompareKind
+HLSLCompareFloatingRank(QualType LHS, QualType RHS) {
+  assert(LHS->isVectorType() == RHS->isVectorType() &&
+         "Either both elements should be vectors or neither should.");
+  if (const auto *VT = LHS->getAs<VectorType>())
+    LHS = VT->getElementType();
+
+  if (const auto *VT = RHS->getAs<VectorType>())
+    RHS = VT->getElementType();
+
+  const auto L = LHS->getAs<BuiltinType>()->getKind();
+  const auto R = RHS->getAs<BuiltinType>()->getKind();
+  if (L == R)
+    return ImplicitConversionSequence::Indistinguishable;
+  return L < R ? ImplicitConversionSequence::Better
+               : ImplicitConversionSequence::Worse;
+}
+
 /// CompareStandardConversionSequences - Compare two standard
 /// conversion sequences to determine whether one is better than the
 /// other or if they are indistinguishable (C++ 13.3.3.2p3).
@@ -4634,6 +4665,21 @@ CompareStandardConversionSequences(Sema &S, SourceLocation Loc,
                  : ImplicitConversionSequence::Worse;
   }
 
+  if (S.getLangOpts().HLSL) {
+    // On a promotion we prefer the lower rank to disambiguate.
+    if ((SCS1.Second == ICK_Floating_Promotion &&
+         SCS2.Second == ICK_Floating_Promotion) ||
+        (SCS1.Element == ICK_Floating_Promotion &&
+         SCS2.Element == ICK_Floating_Promotion))
+      return HLSLCompareFloatingRank(SCS1.getToType(2), SCS2.getToType(2));
+    // On a conversion we prefer the higher rank to disambiguate.
+    if ((SCS1.Second == ICK_Floating_Conversion &&
+         SCS2.Second == ICK_Floating_Conversion) ||
+        (SCS1.Element == ICK_Floating_Conversion &&
+         SCS2.Element == ICK_Floating_Conversion))
+      return HLSLCompareFloatingRank(SCS2.getToType(2), SCS1.getToType(2));
+  }
+
   return ImplicitConversionSequence::Indistinguishable;
 }
 
@@ -10654,29 +10700,10 @@ bool clang::isBetterOverloadCandidate(
   //   -— F1 and F2 are non-template functions with the same
   //      parameter-type-lists, and F1 is more constrained than F2 [...],
   if (!Cand1IsSpecialization && !Cand2IsSpecialization &&
-      sameFunctionParameterTypeLists(S, Cand1, Cand2)) {
-    FunctionDecl *Function1 = Cand1.Function;
-    FunctionDecl *Function2 = Cand2.Function;
-    if (FunctionDecl *MF = Function1->getInstantiatedFromMemberFunction())
-      Function1 = MF;
-    if (FunctionDecl *MF = Function2->getInstantiatedFromMemberFunction())
-      Function2 = MF;
-
-    const Expr *RC1 = Function1->getTrailingRequiresClause();
-    const Expr *RC2 = Function2->getTrailingRequiresClause();
-    if (RC1 && RC2) {
-      bool AtLeastAsConstrained1, AtLeastAsConstrained2;
-      if (S.IsAtLeastAsConstrained(Function1, RC1, Function2, RC2,
-                                   AtLeastAsConstrained1) ||
-          S.IsAtLeastAsConstrained(Function2, RC2, Function1, RC1,
-                                   AtLeastAsConstrained2))
-        return false;
-      if (AtLeastAsConstrained1 != AtLeastAsConstrained2)
-        return AtLeastAsConstrained1;
-    } else if (RC1 || RC2) {
-      return RC1 != nullptr;
-    }
-  }
+      sameFunctionParameterTypeLists(S, Cand1, Cand2) &&
+      S.getMoreConstrainedFunction(Cand1.Function, Cand2.Function) ==
+          Cand1.Function)
+    return true;
 
   //   -- F1 is a constructor for a class D, F2 is a constructor for a base
   //      class B of D, and for all arguments the corresponding parameters of
@@ -13344,25 +13371,6 @@ Sema::resolveAddressOfSingleOverloadCandidate(Expr *E, DeclAccessPair &Pair) {
            static_cast<int>(CUDA().IdentifyPreference(Caller, FD2));
   };
 
-  auto CheckMoreConstrained = [&](FunctionDecl *FD1,
-                                  FunctionDecl *FD2) -> std::optional<bool> {
-    if (FunctionDecl *MF = FD1->getInstantiatedFromMemberFunction())
-      FD1 = MF;
-    if (FunctionDecl *MF = FD2->getInstantiatedFromMemberFunction())
-      FD2 = MF;
-    SmallVector<const Expr *, 1> AC1, AC2;
-    FD1->getAssociatedConstraints(AC1);
-    FD2->getAssociatedConstraints(AC2);
-    bool AtLeastAsConstrained1, AtLeastAsConstrained2;
-    if (IsAtLeastAsConstrained(FD1, AC1, FD2, AC2, AtLeastAsConstrained1))
-      return std::nullopt;
-    if (IsAtLeastAsConstrained(FD2, AC2, FD1, AC1, AtLeastAsConstrained2))
-      return std::nullopt;
-    if (AtLeastAsConstrained1 == AtLeastAsConstrained2)
-      return std::nullopt;
-    return AtLeastAsConstrained1;
-  };
-
   // Don't use the AddressOfResolver because we're specifically looking for
   // cases where we have one overload candidate that lacks
   // enable_if/pass_object_size/...
@@ -13399,15 +13407,14 @@ Sema::resolveAddressOfSingleOverloadCandidate(Expr *E, DeclAccessPair &Pair) {
       }
       // FD has the same CUDA prefernece than Result. Continue check
       // constraints.
-      std::optional<bool> MoreConstrainedThanPrevious =
-          CheckMoreConstrained(FD, Result);
-      if (!MoreConstrainedThanPrevious) {
-        IsResultAmbiguous = true;
-        AmbiguousDecls.push_back(FD);
+      FunctionDecl *MoreConstrained = getMoreConstrainedFunction(FD, Result);
+      if (MoreConstrained != FD) {
+        if (!MoreConstrained) {
+          IsResultAmbiguous = true;
+          AmbiguousDecls.push_back(FD);
+        }
         continue;
       }
-      if (!*MoreConstrainedThanPrevious)
-        continue;
       // FD is more constrained - replace Result with it.
     }
     FoundBetter();
@@ -13426,7 +13433,7 @@ Sema::resolveAddressOfSingleOverloadCandidate(Expr *E, DeclAccessPair &Pair) {
       // constraints.
       if (getLangOpts().CUDA && CheckCUDAPreference(Skipped, Result) != 0)
         continue;
-      if (!CheckMoreConstrained(Skipped, Result))
+      if (!getMoreConstrainedFunction(Skipped, Result))
         return nullptr;
     }
     Pair = DAP;
diff --git a/clang/lib/Sema/SemaStmtAsm.cpp b/clang/lib/Sema/SemaStmtAsm.cpp
index 83351b703c15..32d42f3c3f3b 100644
--- a/clang/lib/Sema/SemaStmtAsm.cpp
+++ b/clang/lib/Sema/SemaStmtAsm.cpp
@@ -829,7 +829,7 @@ bool Sema::LookupInlineAsmField(StringRef Base, StringRef Member,
   NamedDecl *FoundDecl = nullptr;
 
   // MS InlineAsm uses 'this' as a base
-  if (getLangOpts().CPlusPlus && Base.equals("this")) {
+  if (getLangOpts().CPlusPlus && Base == "this") {
     if (const Type *PT = getCurrentThisType().getTypePtrOrNull())
       FoundDecl = PT->getPointeeType()->getAsTagDecl();
   } else {
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index 1c84830b6ddd..36f8ecadcfab 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -670,6 +670,11 @@ ExprResult Sema::ActOnCXXAssumeAttr(Stmt *St, const ParsedAttr &A,
   }
 
   auto *Assumption = A.getArgAsExpr(0);
+
+  if (DiagnoseUnexpandedParameterPack(Assumption)) {
+    return ExprError();
+  }
+
   if (Assumption->getDependence() == ExprDependence::None) {
     ExprResult Res = BuildCXXAssumeExpr(Assumption, A.getAttrName(), Range);
     if (Res.isInvalid())
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 7f18631c6096..bae00c629270 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -750,7 +750,8 @@ Sema::ActOnDependentIdExpression(const CXXScopeSpec &SS,
   if (!MightBeCxx11UnevalField && !isAddressOfOperand && !IsEnum &&
       isa<CXXMethodDecl>(DC) &&
       cast<CXXMethodDecl>(DC)->isImplicitObjectMemberFunction()) {
-    QualType ThisType = cast<CXXMethodDecl>(DC)->getThisType().getNonReferenceType();
+    QualType ThisType =
+        cast<CXXMethodDecl>(DC)->getThisType().getNonReferenceType();
 
     // Since the 'this' expression is synthesized, we don't need to
     // perform the double-lookup check.
@@ -2491,9 +2492,6 @@ struct ConvertConstructorToDeductionGuideTransform {
       Args.addOuterRetainedLevel();
     }
 
-    if (NestedPattern)
-      Args.addOuterRetainedLevels(NestedPattern->getTemplateDepth());
-
     FunctionProtoTypeLoc FPTL = CD->getTypeSourceInfo()->getTypeLoc()
                                    .getAsAdjusted<FunctionProtoTypeLoc>();
     assert(FPTL && "no prototype for constructor declaration");
@@ -2583,11 +2581,27 @@ private:
 
     //    -- The types of the function parameters are those of the constructor.
     for (auto *OldParam : TL.getParams()) {
-      ParmVarDecl *NewParam =
-          transformFunctionTypeParam(OldParam, Args, MaterializedTypedefs);
-      if (NestedPattern && NewParam)
+      ParmVarDecl *NewParam = OldParam;
+      // Given
+      //   template <class T> struct C {
+      //     template <class U> struct D {
+      //       template <class V> D(U, V);
+      //     };
+      //   };
+      // First, transform all the references to template parameters that are
+      // defined outside of the surrounding class template. That is T in the
+      // above example.
+      if (NestedPattern) {
         NewParam = transformFunctionTypeParam(NewParam, OuterInstantiationArgs,
                                               MaterializedTypedefs);
+        if (!NewParam)
+          return QualType();
+      }
+      // Then, transform all the references to template parameters that are
+      // defined at the class template and the constructor. In this example,
+      // they're U and V, respectively.
+      NewParam =
+          transformFunctionTypeParam(NewParam, Args, MaterializedTypedefs);
       if (!NewParam)
         return QualType();
       ParamTypes.push_back(NewParam->getType());
@@ -2665,7 +2679,7 @@ private:
       // placeholder to indicate there is a default argument.
       QualType ParamTy = NewDI->getType();
       NewDefArg = new (SemaRef.Context)
-          OpaqueValueExpr(OldParam->getDefaultArg()->getBeginLoc(),
+          OpaqueValueExpr(OldParam->getDefaultArgRange().getBegin(),
                           ParamTy.getNonLValueExprType(SemaRef.Context),
                           ParamTy->isLValueReferenceType()   ? VK_LValue
                           : ParamTy->isRValueReferenceType() ? VK_XValue
@@ -2744,31 +2758,149 @@ bool hasDeclaredDeductionGuides(DeclarationName Name, DeclContext *DC) {
   return false;
 }
 
+unsigned getTemplateParameterDepth(NamedDecl *TemplateParam) {
+  if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(TemplateParam))
+    return TTP->getDepth();
+  if (auto *TTP = dyn_cast<TemplateTemplateParmDecl>(TemplateParam))
+    return TTP->getDepth();
+  if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(TemplateParam))
+    return NTTP->getDepth();
+  llvm_unreachable("Unhandled template parameter types");
+}
+
 NamedDecl *transformTemplateParameter(Sema &SemaRef, DeclContext *DC,
                                       NamedDecl *TemplateParam,
                                       MultiLevelTemplateArgumentList &Args,
-                                      unsigned NewIndex) {
+                                      unsigned NewIndex, unsigned NewDepth) {
   if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(TemplateParam))
-    return transformTemplateTypeParam(SemaRef, DC, TTP, Args, TTP->getDepth(),
+    return transformTemplateTypeParam(SemaRef, DC, TTP, Args, NewDepth,
                                       NewIndex);
   if (auto *TTP = dyn_cast<TemplateTemplateParmDecl>(TemplateParam))
-    return transformTemplateParam(SemaRef, DC, TTP, Args, NewIndex,
-                                  TTP->getDepth());
+    return transformTemplateParam(SemaRef, DC, TTP, Args, NewIndex, NewDepth);
   if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(TemplateParam))
-    return transformTemplateParam(SemaRef, DC, NTTP, Args, NewIndex,
-                                  NTTP->getDepth());
+    return transformTemplateParam(SemaRef, DC, NTTP, Args, NewIndex, NewDepth);
   llvm_unreachable("Unhandled template parameter types");
 }
 
-Expr *transformRequireClause(Sema &SemaRef, FunctionTemplateDecl *FTD,
-                             llvm::ArrayRef<TemplateArgument> TransformedArgs) {
-  Expr *RC = FTD->getTemplateParameters()->getRequiresClause();
+// Transform the require-clause of F if any.
+// The return result is expected to be the require-clause for the synthesized
+// alias deduction guide.
+Expr *transformRequireClause(Sema &SemaRef, FunctionTemplateDecl *F,
+                             TypeAliasTemplateDecl *AliasTemplate,
+                             ArrayRef<DeducedTemplateArgument> DeduceResults) {
+  Expr *RC = F->getTemplateParameters()->getRequiresClause();
   if (!RC)
     return nullptr;
+
+  auto &Context = SemaRef.Context;
+  LocalInstantiationScope Scope(SemaRef);
+
+  // In the clang AST, constraint nodes are deliberately not instantiated unless
+  // they are actively being evaluated. Consequently, occurrences of template
+  // parameters in the require-clause expression have a subtle "depth"
+  // difference compared to normal occurrences in places, such as function
+  // parameters. When transforming the require-clause, we must take this
+  // distinction into account:
+  //
+  //   1) In the transformed require-clause, occurrences of template parameters
+  //   must use the "uninstantiated" depth;
+  //   2) When substituting on the require-clause expr of the underlying
+  //   deduction guide, we must use the entire set of template argument lists;
+  //
+  // It's important to note that we're performing this transformation on an
+  // *instantiated* AliasTemplate.
+
+  // For 1), if the alias template is nested within a class template, we
+  // calcualte the 'uninstantiated' depth by adding the substitution level back.
+  unsigned AdjustDepth = 0;
+  if (auto *PrimaryTemplate =
+          AliasTemplate->getInstantiatedFromMemberTemplate())
+    AdjustDepth = PrimaryTemplate->getTemplateDepth();
+
+  // We rebuild all template parameters with the uninstantiated depth, and
+  // build template arguments refer to them.
+  SmallVector<TemplateArgument> AdjustedAliasTemplateArgs;
+
+  for (auto *TP : *AliasTemplate->getTemplateParameters()) {
+    // Rebuild any internal references to earlier parameters and reindex
+    // as we go.
+    MultiLevelTemplateArgumentList Args;
+    Args.setKind(TemplateSubstitutionKind::Rewrite);
+    Args.addOuterTemplateArguments(AdjustedAliasTemplateArgs);
+    NamedDecl *NewParam = transformTemplateParameter(
+        SemaRef, AliasTemplate->getDeclContext(), TP, Args,
+        /*NewIndex=*/AdjustedAliasTemplateArgs.size(),
+        getTemplateParameterDepth(TP) + AdjustDepth);
+
+    auto NewTemplateArgument = Context.getCanonicalTemplateArgument(
+        Context.getInjectedTemplateArg(NewParam));
+    AdjustedAliasTemplateArgs.push_back(NewTemplateArgument);
+  }
+  // Template arguments used to transform the template arguments in
+  // DeducedResults.
+  SmallVector<TemplateArgument> TemplateArgsForBuildingRC(
+      F->getTemplateParameters()->size());
+  // Transform the transformed template args
   MultiLevelTemplateArgumentList Args;
   Args.setKind(TemplateSubstitutionKind::Rewrite);
-  Args.addOuterTemplateArguments(TransformedArgs);
-  ExprResult E = SemaRef.SubstExpr(RC, Args);
+  Args.addOuterTemplateArguments(AdjustedAliasTemplateArgs);
+
+  for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) {
+    const auto &D = DeduceResults[Index];
+    if (D.isNull())
+      continue;
+    TemplateArgumentLoc Input =
+        SemaRef.getTrivialTemplateArgumentLoc(D, QualType(), SourceLocation{});
+    TemplateArgumentLoc Output;
+    if (!SemaRef.SubstTemplateArgument(Input, Args, Output)) {
+      assert(TemplateArgsForBuildingRC[Index].isNull() &&
+             "InstantiatedArgs must be null before setting");
+      TemplateArgsForBuildingRC[Index] = Output.getArgument();
+    }
+  }
+
+  // A list of template arguments for transforming the require-clause of F.
+  // It must contain the entire set of template argument lists.
+  MultiLevelTemplateArgumentList ArgsForBuildingRC;
+  ArgsForBuildingRC.setKind(clang::TemplateSubstitutionKind::Rewrite);
+  ArgsForBuildingRC.addOuterTemplateArguments(TemplateArgsForBuildingRC);
+  // For 2), if the underlying F is instantiated from a member template, we need
+  // the entire template argument list, as the constraint AST in the
+  // require-clause of F remains completely uninstantiated.
+  //
+  // For example:
+  //   template <typename T> // depth 0
+  //   struct Outer {
+  //      template <typename U>
+  //      struct Foo { Foo(U); };
+  //
+  //      template <typename U> // depth 1
+  //      requires C<U>
+  //      Foo(U) -> Foo<int>;
+  //   };
+  //   template <typename U>
+  //   using AFoo = Outer<int>::Foo<U>;
+  //
+  // In this scenario, the deduction guide for `Foo` inside `Outer<int>`:
+  //   - The occurrence of U in the require-expression is [depth:1, index:0]
+  //   - The occurrence of U in the function parameter is [depth:0, index:0]
+  //   - The template parameter of U is [depth:0, index:0]
+  //
+  // We add the outer template arguments which is [int] to the multi-level arg
+  // list to ensure that the occurrence U in `C<U>` will be replaced with int
+  // during the substitution.
+  if (F->getInstantiatedFromMemberTemplate()) {
+    auto OuterLevelArgs = SemaRef.getTemplateInstantiationArgs(
+        F, F->getLexicalDeclContext(),
+        /*Final=*/false, /*Innermost=*/std::nullopt,
+        /*RelativeToPrimary=*/true,
+        /*Pattern=*/nullptr,
+        /*ForConstraintInstantiation=*/true);
+    for (auto It : OuterLevelArgs)
+      ArgsForBuildingRC.addOuterTemplateArguments(It.Args);
+  }
+
+  ExprResult E = SemaRef.SubstExpr(RC, ArgsForBuildingRC);
   if (E.isInvalid())
     return nullptr;
   return E.getAs<Expr>();
@@ -2803,7 +2935,209 @@ getRHSTemplateDeclAndArgs(Sema &SemaRef, TypeAliasTemplateDecl *AliasTemplate) {
   return {Template, AliasRhsTemplateArgs};
 }
 
-// Build deduction guides for a type alias template.
+// Build deduction guides for a type alias template from the given underlying
+// deduction guide F.
+FunctionTemplateDecl *
+BuildDeductionGuideForTypeAlias(Sema &SemaRef,
+                                TypeAliasTemplateDecl *AliasTemplate,
+                                FunctionTemplateDecl *F, SourceLocation Loc) {
+  LocalInstantiationScope Scope(SemaRef);
+  Sema::InstantiatingTemplate BuildingDeductionGuides(
+      SemaRef, AliasTemplate->getLocation(), F,
+      Sema::InstantiatingTemplate::BuildingDeductionGuidesTag{});
+  if (BuildingDeductionGuides.isInvalid())
+    return nullptr;
+
+  auto &Context = SemaRef.Context;
+  auto [Template, AliasRhsTemplateArgs] =
+      getRHSTemplateDeclAndArgs(SemaRef, AliasTemplate);
+
+  auto RType = F->getTemplatedDecl()->getReturnType();
+  // The (trailing) return type of the deduction guide.
+  const TemplateSpecializationType *FReturnType =
+      RType->getAs<TemplateSpecializationType>();
+  if (const auto *InjectedCNT = RType->getAs<InjectedClassNameType>())
+    // implicitly-generated deduction guide.
+    FReturnType = InjectedCNT->getInjectedTST();
+  else if (const auto *ET = RType->getAs<ElaboratedType>())
+    // explicit deduction guide.
+    FReturnType = ET->getNamedType()->getAs<TemplateSpecializationType>();
+  assert(FReturnType && "expected to see a return type");
+  // Deduce template arguments of the deduction guide f from the RHS of
+  // the alias.
+  //
+  // C++ [over.match.class.deduct]p3: ...For each function or function
+  // template f in the guides of the template named by the
+  // simple-template-id of the defining-type-id, the template arguments
+  // of the return type of f are deduced from the defining-type-id of A
+  // according to the process in [temp.deduct.type] with the exception
+  // that deduction does not fail if not all template arguments are
+  // deduced.
+  //
+  //
+  //  template<typename X, typename Y>
+  //  f(X, Y) -> f<Y, X>;
+  //
+  //  template<typename U>
+  //  using alias = f<int, U>;
+  //
+  // The RHS of alias is f<int, U>, we deduced the template arguments of
+  // the return type of the deduction guide from it: Y->int, X->U
+  sema::TemplateDeductionInfo TDeduceInfo(Loc);
+  // Must initialize n elements, this is required by DeduceTemplateArguments.
+  SmallVector<DeducedTemplateArgument> DeduceResults(
+      F->getTemplateParameters()->size());
+
+  // FIXME: DeduceTemplateArguments stops immediately at the first
+  // non-deducible template argument. However, this doesn't seem to casue
+  // issues for practice cases, we probably need to extend it to continue
+  // performing deduction for rest of arguments to align with the C++
+  // standard.
+  SemaRef.DeduceTemplateArguments(
+      F->getTemplateParameters(), FReturnType->template_arguments(),
+      AliasRhsTemplateArgs, TDeduceInfo, DeduceResults,
+      /*NumberOfArgumentsMustMatch=*/false);
+
+  SmallVector<TemplateArgument> DeducedArgs;
+  SmallVector<unsigned> NonDeducedTemplateParamsInFIndex;
+  // !!NOTE: DeduceResults respects the sequence of template parameters of
+  // the deduction guide f.
+  for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) {
+    if (const auto &D = DeduceResults[Index]; !D.isNull()) // Deduced
+      DeducedArgs.push_back(D);
+    else
+      NonDeducedTemplateParamsInFIndex.push_back(Index);
+  }
+  auto DeducedAliasTemplateParams =
+      TemplateParamsReferencedInTemplateArgumentList(
+          AliasTemplate->getTemplateParameters()->asArray(), DeducedArgs);
+  // All template arguments null by default.
+  SmallVector<TemplateArgument> TemplateArgsForBuildingFPrime(
+      F->getTemplateParameters()->size());
+
+  // Create a template parameter list for the synthesized deduction guide f'.
+  //
+  // C++ [over.match.class.deduct]p3.2:
+  //   If f is a function template, f' is a function template whose template
+  //   parameter list consists of all the template parameters of A
+  //   (including their default template arguments) that appear in the above
+  //   deductions or (recursively) in their default template arguments
+  SmallVector<NamedDecl *> FPrimeTemplateParams;
+  // Store template arguments that refer to the newly-created template
+  // parameters, used for building `TemplateArgsForBuildingFPrime`.
+  SmallVector<TemplateArgument, 16> TransformedDeducedAliasArgs(
+      AliasTemplate->getTemplateParameters()->size());
+
+  for (unsigned AliasTemplateParamIdx : DeducedAliasTemplateParams) {
+    auto *TP =
+        AliasTemplate->getTemplateParameters()->getParam(AliasTemplateParamIdx);
+    // Rebuild any internal references to earlier parameters and reindex as
+    // we go.
+    MultiLevelTemplateArgumentList Args;
+    Args.setKind(TemplateSubstitutionKind::Rewrite);
+    Args.addOuterTemplateArguments(TransformedDeducedAliasArgs);
+    NamedDecl *NewParam = transformTemplateParameter(
+        SemaRef, AliasTemplate->getDeclContext(), TP, Args,
+        /*NewIndex=*/FPrimeTemplateParams.size(),
+        getTemplateParameterDepth(TP));
+    FPrimeTemplateParams.push_back(NewParam);
+
+    auto NewTemplateArgument = Context.getCanonicalTemplateArgument(
+        Context.getInjectedTemplateArg(NewParam));
+    TransformedDeducedAliasArgs[AliasTemplateParamIdx] = NewTemplateArgument;
+  }
+  //   ...followed by the template parameters of f that were not deduced
+  //   (including their default template arguments)
+  for (unsigned FTemplateParamIdx : NonDeducedTemplateParamsInFIndex) {
+    auto *TP = F->getTemplateParameters()->getParam(FTemplateParamIdx);
+    MultiLevelTemplateArgumentList Args;
+    Args.setKind(TemplateSubstitutionKind::Rewrite);
+    // We take a shortcut here, it is ok to reuse the
+    // TemplateArgsForBuildingFPrime.
+    Args.addOuterTemplateArguments(TemplateArgsForBuildingFPrime);
+    NamedDecl *NewParam = transformTemplateParameter(
+        SemaRef, F->getDeclContext(), TP, Args, FPrimeTemplateParams.size(),
+        getTemplateParameterDepth(TP));
+    FPrimeTemplateParams.push_back(NewParam);
+
+    assert(TemplateArgsForBuildingFPrime[FTemplateParamIdx].isNull() &&
+           "The argument must be null before setting");
+    TemplateArgsForBuildingFPrime[FTemplateParamIdx] =
+        Context.getCanonicalTemplateArgument(
+            Context.getInjectedTemplateArg(NewParam));
+  }
+
+  // To form a deduction guide f' from f, we leverage clang's instantiation
+  // mechanism, we construct a template argument list where the template
+  // arguments refer to the newly-created template parameters of f', and
+  // then apply instantiation on this template argument list to instantiate
+  // f, this ensures all template parameter occurrences are updated
+  // correctly.
+  //
+  // The template argument list is formed from the `DeducedArgs`, two parts:
+  //  1) appeared template parameters of alias: transfrom the deduced
+  //  template argument;
+  //  2) non-deduced template parameters of f: rebuild a
+  //  template argument;
+  //
+  // 2) has been built already (when rebuilding the new template
+  // parameters), we now perform 1).
+  MultiLevelTemplateArgumentList Args;
+  Args.setKind(TemplateSubstitutionKind::Rewrite);
+  Args.addOuterTemplateArguments(TransformedDeducedAliasArgs);
+  for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) {
+    const auto &D = DeduceResults[Index];
+    if (D.isNull()) {
+      // 2): Non-deduced template parameter has been built already.
+      assert(!TemplateArgsForBuildingFPrime[Index].isNull() &&
+             "template arguments for non-deduced template parameters should "
+             "be been set!");
+      continue;
+    }
+    TemplateArgumentLoc Input =
+        SemaRef.getTrivialTemplateArgumentLoc(D, QualType(), SourceLocation{});
+    TemplateArgumentLoc Output;
+    if (!SemaRef.SubstTemplateArgument(Input, Args, Output)) {
+      assert(TemplateArgsForBuildingFPrime[Index].isNull() &&
+             "InstantiatedArgs must be null before setting");
+      TemplateArgsForBuildingFPrime[Index] = Output.getArgument();
+    }
+  }
+
+  auto *TemplateArgListForBuildingFPrime =
+      TemplateArgumentList::CreateCopy(Context, TemplateArgsForBuildingFPrime);
+  // Form the f' by substituting the template arguments into f.
+  if (auto *FPrime = SemaRef.InstantiateFunctionDeclaration(
+          F, TemplateArgListForBuildingFPrime, AliasTemplate->getLocation(),
+          Sema::CodeSynthesisContext::BuildingDeductionGuides)) {
+    auto *GG = cast<CXXDeductionGuideDecl>(FPrime);
+
+    Expr *RequiresClause =
+        transformRequireClause(SemaRef, F, AliasTemplate, DeduceResults);
+
+    // FIXME: implement the is_deducible constraint per C++
+    // [over.match.class.deduct]p3.3:
+    //    ... and a constraint that is satisfied if and only if the arguments
+    //    of A are deducible (see below) from the return type.
+    auto *FPrimeTemplateParamList = TemplateParameterList::Create(
+        Context, AliasTemplate->getTemplateParameters()->getTemplateLoc(),
+        AliasTemplate->getTemplateParameters()->getLAngleLoc(),
+        FPrimeTemplateParams,
+        AliasTemplate->getTemplateParameters()->getRAngleLoc(),
+        /*RequiresClause=*/RequiresClause);
+    FunctionTemplateDecl *Result = buildDeductionGuide(
+        SemaRef, AliasTemplate, FPrimeTemplateParamList,
+        GG->getCorrespondingConstructor(), GG->getExplicitSpecifier(),
+        GG->getTypeSourceInfo(), AliasTemplate->getBeginLoc(),
+        AliasTemplate->getLocation(), AliasTemplate->getEndLoc(),
+        F->isImplicit());
+    cast<CXXDeductionGuideDecl>(Result->getTemplatedDecl())
+        ->setDeductionCandidateKind(GG->getDeductionCandidateKind());
+    return Result;
+  }
+  return nullptr;
+}
+
 void DeclareImplicitDeductionGuidesForTypeAlias(
     Sema &SemaRef, TypeAliasTemplateDecl *AliasTemplate, SourceLocation Loc) {
   if (AliasTemplate->isInvalidDecl())
@@ -2831,197 +3165,13 @@ void DeclareImplicitDeductionGuidesForTypeAlias(
     if (!F)
       continue;
     // The **aggregate** deduction guides are handled in a different code path
-    // (DeclareImplicitDeductionGuideFromInitList), which involves the tricky
+    // (DeclareAggregateDeductionGuideFromInitList), which involves the tricky
     // cache.
     if (cast<CXXDeductionGuideDecl>(F->getTemplatedDecl())
             ->getDeductionCandidateKind() == DeductionCandidate::Aggregate)
       continue;
 
-    auto RType = F->getTemplatedDecl()->getReturnType();
-    // The (trailing) return type of the deduction guide.
-    const TemplateSpecializationType *FReturnType =
-        RType->getAs<TemplateSpecializationType>();
-    if (const auto *InjectedCNT = RType->getAs<InjectedClassNameType>())
-      // implicitly-generated deduction guide.
-      FReturnType = InjectedCNT->getInjectedTST();
-    else if (const auto *ET = RType->getAs<ElaboratedType>())
-      // explicit deduction guide.
-      FReturnType = ET->getNamedType()->getAs<TemplateSpecializationType>();
-    assert(FReturnType && "expected to see a return type");
-    // Deduce template arguments of the deduction guide f from the RHS of
-    // the alias.
-    //
-    // C++ [over.match.class.deduct]p3: ...For each function or function
-    // template f in the guides of the template named by the
-    // simple-template-id of the defining-type-id, the template arguments
-    // of the return type of f are deduced from the defining-type-id of A
-    // according to the process in [temp.deduct.type] with the exception
-    // that deduction does not fail if not all template arguments are
-    // deduced.
-    //
-    //
-    //  template<typename X, typename Y>
-    //  f(X, Y) -> f<Y, X>;
-    //
-    //  template<typename U>
-    //  using alias = f<int, U>;
-    //
-    // The RHS of alias is f<int, U>, we deduced the template arguments of
-    // the return type of the deduction guide from it: Y->int, X->U
-    sema::TemplateDeductionInfo TDeduceInfo(Loc);
-    // Must initialize n elements, this is required by DeduceTemplateArguments.
-    SmallVector<DeducedTemplateArgument> DeduceResults(
-        F->getTemplateParameters()->size());
-
-    // FIXME: DeduceTemplateArguments stops immediately at the first
-    // non-deducible template argument. However, this doesn't seem to casue
-    // issues for practice cases, we probably need to extend it to continue
-    // performing deduction for rest of arguments to align with the C++
-    // standard.
-    SemaRef.DeduceTemplateArguments(
-        F->getTemplateParameters(), FReturnType->template_arguments(),
-        AliasRhsTemplateArgs, TDeduceInfo, DeduceResults,
-        /*NumberOfArgumentsMustMatch=*/false);
-
-    SmallVector<TemplateArgument> DeducedArgs;
-    SmallVector<unsigned> NonDeducedTemplateParamsInFIndex;
-    // !!NOTE: DeduceResults respects the sequence of template parameters of
-    // the deduction guide f.
-    for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) {
-      if (const auto &D = DeduceResults[Index]; !D.isNull()) // Deduced
-        DeducedArgs.push_back(D);
-      else
-        NonDeducedTemplateParamsInFIndex.push_back(Index);
-    }
-    auto DeducedAliasTemplateParams =
-        TemplateParamsReferencedInTemplateArgumentList(
-            AliasTemplate->getTemplateParameters()->asArray(), DeducedArgs);
-    // All template arguments null by default.
-    SmallVector<TemplateArgument> TemplateArgsForBuildingFPrime(
-        F->getTemplateParameters()->size());
-
-    Sema::InstantiatingTemplate BuildingDeductionGuides(
-        SemaRef, AliasTemplate->getLocation(), F,
-        Sema::InstantiatingTemplate::BuildingDeductionGuidesTag{});
-    if (BuildingDeductionGuides.isInvalid())
-      return;
-    LocalInstantiationScope Scope(SemaRef);
-
-    // Create a template parameter list for the synthesized deduction guide f'.
-    //
-    // C++ [over.match.class.deduct]p3.2:
-    //   If f is a function template, f' is a function template whose template
-    //   parameter list consists of all the template parameters of A
-    //   (including their default template arguments) that appear in the above
-    //   deductions or (recursively) in their default template arguments
-    SmallVector<NamedDecl *> FPrimeTemplateParams;
-    // Store template arguments that refer to the newly-created template
-    // parameters, used for building `TemplateArgsForBuildingFPrime`.
-    SmallVector<TemplateArgument, 16> TransformedDeducedAliasArgs(
-        AliasTemplate->getTemplateParameters()->size());
-
-    for (unsigned AliasTemplateParamIdx : DeducedAliasTemplateParams) {
-      auto *TP = AliasTemplate->getTemplateParameters()->getParam(
-          AliasTemplateParamIdx);
-      // Rebuild any internal references to earlier parameters and reindex as
-      // we go.
-      MultiLevelTemplateArgumentList Args;
-      Args.setKind(TemplateSubstitutionKind::Rewrite);
-      Args.addOuterTemplateArguments(TransformedDeducedAliasArgs);
-      NamedDecl *NewParam = transformTemplateParameter(
-          SemaRef, AliasTemplate->getDeclContext(), TP, Args,
-          /*NewIndex*/ FPrimeTemplateParams.size());
-      FPrimeTemplateParams.push_back(NewParam);
-
-      auto NewTemplateArgument = Context.getCanonicalTemplateArgument(
-          Context.getInjectedTemplateArg(NewParam));
-      TransformedDeducedAliasArgs[AliasTemplateParamIdx] = NewTemplateArgument;
-    }
-    //   ...followed by the template parameters of f that were not deduced
-    //   (including their default template arguments)
-    for (unsigned FTemplateParamIdx : NonDeducedTemplateParamsInFIndex) {
-      auto *TP = F->getTemplateParameters()->getParam(FTemplateParamIdx);
-      MultiLevelTemplateArgumentList Args;
-      Args.setKind(TemplateSubstitutionKind::Rewrite);
-      // We take a shortcut here, it is ok to reuse the
-      // TemplateArgsForBuildingFPrime.
-      Args.addOuterTemplateArguments(TemplateArgsForBuildingFPrime);
-      NamedDecl *NewParam = transformTemplateParameter(
-          SemaRef, F->getDeclContext(), TP, Args, FPrimeTemplateParams.size());
-      FPrimeTemplateParams.push_back(NewParam);
-
-      assert(TemplateArgsForBuildingFPrime[FTemplateParamIdx].isNull() &&
-             "The argument must be null before setting");
-      TemplateArgsForBuildingFPrime[FTemplateParamIdx] =
-          Context.getCanonicalTemplateArgument(
-              Context.getInjectedTemplateArg(NewParam));
-    }
-
-    // To form a deduction guide f' from f, we leverage clang's instantiation
-    // mechanism, we construct a template argument list where the template
-    // arguments refer to the newly-created template parameters of f', and
-    // then apply instantiation on this template argument list to instantiate
-    // f, this ensures all template parameter occurrences are updated
-    // correctly.
-    //
-    // The template argument list is formed from the `DeducedArgs`, two parts:
-    //  1) appeared template parameters of alias: transfrom the deduced
-    //  template argument;
-    //  2) non-deduced template parameters of f: rebuild a
-    //  template argument;
-    //
-    // 2) has been built already (when rebuilding the new template
-    // parameters), we now perform 1).
-    MultiLevelTemplateArgumentList Args;
-    Args.setKind(TemplateSubstitutionKind::Rewrite);
-    Args.addOuterTemplateArguments(TransformedDeducedAliasArgs);
-    for (unsigned Index = 0; Index < DeduceResults.size(); ++Index) {
-      const auto &D = DeduceResults[Index];
-      if (D.isNull()) {
-        // 2): Non-deduced template parameter has been built already.
-        assert(!TemplateArgsForBuildingFPrime[Index].isNull() &&
-               "template arguments for non-deduced template parameters should "
-               "be been set!");
-        continue;
-      }
-      TemplateArgumentLoc Input = SemaRef.getTrivialTemplateArgumentLoc(
-          D, QualType(), SourceLocation{});
-      TemplateArgumentLoc Output;
-      if (!SemaRef.SubstTemplateArgument(Input, Args, Output)) {
-        assert(TemplateArgsForBuildingFPrime[Index].isNull() &&
-               "InstantiatedArgs must be null before setting");
-        TemplateArgsForBuildingFPrime[Index] = (Output.getArgument());
-      }
-    }
-
-    auto *TemplateArgListForBuildingFPrime = TemplateArgumentList::CreateCopy(
-        Context, TemplateArgsForBuildingFPrime);
-    // Form the f' by substituting the template arguments into f.
-    if (auto *FPrime = SemaRef.InstantiateFunctionDeclaration(
-            F, TemplateArgListForBuildingFPrime, AliasTemplate->getLocation(),
-            Sema::CodeSynthesisContext::BuildingDeductionGuides)) {
-      auto *GG = cast<CXXDeductionGuideDecl>(FPrime);
-      // Substitute new template parameters into requires-clause if present.
-      Expr *RequiresClause =
-          transformRequireClause(SemaRef, F, TemplateArgsForBuildingFPrime);
-      // FIXME: implement the is_deducible constraint per C++
-      // [over.match.class.deduct]p3.3:
-      //    ... and a constraint that is satisfied if and only if the arguments
-      //    of A are deducible (see below) from the return type.
-      auto *FPrimeTemplateParamList = TemplateParameterList::Create(
-          Context, AliasTemplate->getTemplateParameters()->getTemplateLoc(),
-          AliasTemplate->getTemplateParameters()->getLAngleLoc(),
-          FPrimeTemplateParams,
-          AliasTemplate->getTemplateParameters()->getRAngleLoc(),
-          /*RequiresClause=*/RequiresClause);
-
-      buildDeductionGuide(SemaRef, AliasTemplate, FPrimeTemplateParamList,
-                          GG->getCorrespondingConstructor(),
-                          GG->getExplicitSpecifier(), GG->getTypeSourceInfo(),
-                          AliasTemplate->getBeginLoc(),
-                          AliasTemplate->getLocation(),
-                          AliasTemplate->getEndLoc(), F->isImplicit());
-    }
+    BuildDeductionGuideForTypeAlias(SemaRef, AliasTemplate, F, Loc);
   }
 }
 
@@ -3037,66 +3187,8 @@ FunctionTemplateDecl *DeclareAggregateDeductionGuideForTypeAlias(
       RHSTemplate, ParamTypes, Loc);
   if (!RHSDeductionGuide)
     return nullptr;
-
-  LocalInstantiationScope Scope(SemaRef);
-  Sema::InstantiatingTemplate BuildingDeductionGuides(
-      SemaRef, AliasTemplate->getLocation(), RHSDeductionGuide,
-      Sema::InstantiatingTemplate::BuildingDeductionGuidesTag{});
-  if (BuildingDeductionGuides.isInvalid())
-    return nullptr;
-
-  // Build a new template parameter list for the synthesized aggregate deduction
-  // guide by transforming the one from RHSDeductionGuide.
-  SmallVector<NamedDecl *> TransformedTemplateParams;
-  // Template args that refer to the rebuilt template parameters.
-  // All template arguments must be initialized in advance.
-  SmallVector<TemplateArgument> TransformedTemplateArgs(
-      RHSDeductionGuide->getTemplateParameters()->size());
-  for (auto *TP : *RHSDeductionGuide->getTemplateParameters()) {
-    // Rebuild any internal references to earlier parameters and reindex as
-    // we go.
-    MultiLevelTemplateArgumentList Args;
-    Args.setKind(TemplateSubstitutionKind::Rewrite);
-    Args.addOuterTemplateArguments(TransformedTemplateArgs);
-    NamedDecl *NewParam = transformTemplateParameter(
-        SemaRef, AliasTemplate->getDeclContext(), TP, Args,
-        /*NewIndex=*/TransformedTemplateParams.size());
-
-    TransformedTemplateArgs[TransformedTemplateParams.size()] =
-        SemaRef.Context.getCanonicalTemplateArgument(
-            SemaRef.Context.getInjectedTemplateArg(NewParam));
-    TransformedTemplateParams.push_back(NewParam);
-  }
-  // FIXME: implement the is_deducible constraint per C++
-  // [over.match.class.deduct]p3.3.
-  Expr *TransformedRequiresClause = transformRequireClause(
-      SemaRef, RHSDeductionGuide, TransformedTemplateArgs);
-  auto *TransformedTemplateParameterList = TemplateParameterList::Create(
-      SemaRef.Context, AliasTemplate->getTemplateParameters()->getTemplateLoc(),
-      AliasTemplate->getTemplateParameters()->getLAngleLoc(),
-      TransformedTemplateParams,
-      AliasTemplate->getTemplateParameters()->getRAngleLoc(),
-      TransformedRequiresClause);
-  auto *TransformedTemplateArgList = TemplateArgumentList::CreateCopy(
-      SemaRef.Context, TransformedTemplateArgs);
-
-  if (auto *TransformedDeductionGuide = SemaRef.InstantiateFunctionDeclaration(
-          RHSDeductionGuide, TransformedTemplateArgList,
-          AliasTemplate->getLocation(),
-          Sema::CodeSynthesisContext::BuildingDeductionGuides)) {
-    auto *GD =
-        llvm::dyn_cast<clang::CXXDeductionGuideDecl>(TransformedDeductionGuide);
-    FunctionTemplateDecl *Result = buildDeductionGuide(
-        SemaRef, AliasTemplate, TransformedTemplateParameterList,
-        GD->getCorrespondingConstructor(), GD->getExplicitSpecifier(),
-        GD->getTypeSourceInfo(), AliasTemplate->getBeginLoc(),
-        AliasTemplate->getLocation(), AliasTemplate->getEndLoc(),
-        GD->isImplicit());
-    cast<CXXDeductionGuideDecl>(Result->getTemplatedDecl())
-        ->setDeductionCandidateKind(DeductionCandidate::Aggregate);
-    return Result;
-  }
-  return nullptr;
+  return BuildDeductionGuideForTypeAlias(SemaRef, AliasTemplate,
+                                         RHSDeductionGuide, Loc);
 }
 
 } // namespace
@@ -4296,8 +4388,8 @@ checkBuiltinTemplateIdType(Sema &SemaRef, BuiltinTemplateDecl *BTD,
 /// Determine whether this alias template is "enable_if_t".
 /// libc++ >=14 uses "__enable_if_t" in C++11 mode.
 static bool isEnableIfAliasTemplate(TypeAliasTemplateDecl *AliasTemplate) {
-  return AliasTemplate->getName().equals("enable_if_t") ||
-         AliasTemplate->getName().equals("__enable_if_t");
+  return AliasTemplate->getName() == "enable_if_t" ||
+         AliasTemplate->getName() == "__enable_if_t";
 }
 
 /// Collect all of the separable terms in the given condition, which
@@ -5554,7 +5646,7 @@ ExprResult Sema::BuildTemplateIdExpr(const CXXScopeSpec &SS,
         R.getRepresentativeDecl(), TemplateKWLoc, TemplateArgs);
     if (Res.isInvalid() || Res.isUsable())
       return Res;
-    // Result is dependent. Carry on to build an UnresolvedLookupEpxr.
+    // Result is dependent. Carry on to build an UnresolvedLookupExpr.
     KnownDependent = true;
   }
 
@@ -5572,6 +5664,13 @@ ExprResult Sema::BuildTemplateIdExpr(const CXXScopeSpec &SS,
       TemplateKWLoc, R.getLookupNameInfo(), RequiresADL, TemplateArgs,
       R.begin(), R.end(), KnownDependent);
 
+  // Model the templates with UnresolvedTemplateTy. The expression should then
+  // either be transformed in an instantiation or be diagnosed in
+  // CheckPlaceholderExpr.
+  if (ULE->getType() == Context.OverloadTy && R.isSingleResult() &&
+      !R.getFoundDecl()->getAsFunction())
+    ULE->setType(Context.UnresolvedTemplateTy);
+
   return ULE;
 }
 
@@ -5608,8 +5707,9 @@ Sema::BuildQualifiedTemplateIdExpr(CXXScopeSpec &SS,
     Diag(NameInfo.getLoc(), diag::err_template_kw_refers_to_type_template)
         << SS.getScopeRep() << NameInfo.getName().getAsString() << SS.getRange()
         << isTypeAliasTemplateDecl;
-    Diag(Temp->getLocation(), diag::note_referenced_type_template) << 0;
-    return ExprError();
+    Diag(Temp->getLocation(), diag::note_referenced_type_template)
+        << isTypeAliasTemplateDecl;
+    return CreateRecoveryExpr(NameInfo.getBeginLoc(), NameInfo.getEndLoc(), {});
   };
 
   if (ClassTemplateDecl *Temp = R.getAsSingle<ClassTemplateDecl>())
@@ -8334,9 +8434,6 @@ bool Sema::CheckTemplateTemplateArgument(TemplateTemplateParmDecl *Param,
   // C++1z [temp.arg.template]p3: (DR 150)
   //   A template-argument matches a template template-parameter P when P
   //   is at least as specialized as the template-argument A.
-  // FIXME: We should enable RelaxedTemplateTemplateArgs by default as it is a
-  //  defect report resolution from C++17 and shouldn't be introduced by
-  //  concepts.
   if (getLangOpts().RelaxedTemplateTemplateArgs) {
     // Quick check for the common case:
     //   If P contains a parameter pack, then A [...] matches P if each of A's
@@ -10376,24 +10473,53 @@ Sema::CheckMemberSpecialization(NamedDecl *Member, LookupResult &Previous) {
   if (Previous.empty()) {
     // Nowhere to look anyway.
   } else if (FunctionDecl *Function = dyn_cast<FunctionDecl>(Member)) {
+    SmallVector<FunctionDecl *> Candidates;
+    bool Ambiguous = false;
     for (LookupResult::iterator I = Previous.begin(), E = Previous.end();
            I != E; ++I) {
-      NamedDecl *D = (*I)->getUnderlyingDecl();
-      if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(D)) {
-        QualType Adjusted = Function->getType();
-        if (!hasExplicitCallingConv(Adjusted))
-          Adjusted = adjustCCAndNoReturn(Adjusted, Method->getType());
-        // This doesn't handle deduced return types, but both function
-        // declarations should be undeduced at this point.
-        if (Context.hasSameType(Adjusted, Method->getType())) {
-          FoundInstantiation = *I;
-          Instantiation = Method;
-          InstantiatedFrom = Method->getInstantiatedFromMemberFunction();
-          MSInfo = Method->getMemberSpecializationInfo();
-          break;
-        }
+      CXXMethodDecl *Method =
+          dyn_cast<CXXMethodDecl>((*I)->getUnderlyingDecl());
+      if (!Method)
+        continue;
+      QualType Adjusted = Function->getType();
+      if (!hasExplicitCallingConv(Adjusted))
+        Adjusted = adjustCCAndNoReturn(Adjusted, Method->getType());
+      // This doesn't handle deduced return types, but both function
+      // declarations should be undeduced at this point.
+      if (!Context.hasSameType(Adjusted, Method->getType()))
+        continue;
+      if (ConstraintSatisfaction Satisfaction;
+          Method->getTrailingRequiresClause() &&
+          (CheckFunctionConstraints(Method, Satisfaction,
+                                    /*UsageLoc=*/Member->getLocation(),
+                                    /*ForOverloadResolution=*/true) ||
+           !Satisfaction.IsSatisfied))
+        continue;
+      Candidates.push_back(Method);
+      FunctionDecl *MoreConstrained =
+          Instantiation ? getMoreConstrainedFunction(
+                              Method, cast<FunctionDecl>(Instantiation))
+                        : Method;
+      if (!MoreConstrained) {
+        Ambiguous = true;
+        continue;
+      }
+      if (MoreConstrained == Method) {
+        Ambiguous = false;
+        FoundInstantiation = *I;
+        Instantiation = Method;
+        InstantiatedFrom = Method->getInstantiatedFromMemberFunction();
+        MSInfo = Method->getMemberSpecializationInfo();
       }
     }
+    if (Ambiguous) {
+      Diag(Member->getLocation(), diag::err_function_member_spec_ambiguous)
+          << Member << (InstantiatedFrom ? InstantiatedFrom : Instantiation);
+      for (FunctionDecl *Candidate : Candidates)
+        Diag(Candidate->getLocation(), diag::note_function_member_spec_matched)
+            << Candidate;
+      return true;
+    }
   } else if (isa<VarDecl>(Member)) {
     VarDecl *PrevVar;
     if (Previous.isSingleResult() &&
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index e93f7bd842e4..fe7e35d84151 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -507,10 +507,70 @@ static TemplateDeductionResult DeduceNonTypeTemplateArgument(
       S, TemplateParams, NTTP, DeducedTemplateArgument(New), T, Info, Deduced);
 }
 
+/// Create a shallow copy of a given template parameter declaration, with
+/// empty source locations and using the given TemplateArgument as it's
+/// default argument.
+///
+/// \returns The new template parameter declaration.
+static NamedDecl *getTemplateParameterWithDefault(Sema &S, NamedDecl *A,
+                                                  TemplateArgument Default) {
+  switch (A->getKind()) {
+  case Decl::TemplateTypeParm: {
+    auto *T = cast<TemplateTypeParmDecl>(A);
+    // FIXME: A TemplateTypeParmDecl's DefaultArgument can't hold a full
+    // TemplateArgument, so there is currently no way to specify a pack as a
+    // default argument for these.
+    if (T->isParameterPack())
+      return A;
+    auto *R = TemplateTypeParmDecl::Create(
+        S.Context, A->getDeclContext(), SourceLocation(), SourceLocation(),
+        T->getDepth(), T->getIndex(), T->getIdentifier(),
+        T->wasDeclaredWithTypename(), /*ParameterPack=*/false,
+        T->hasTypeConstraint());
+    R->setDefaultArgument(
+        S.Context.getTrivialTypeSourceInfo(Default.getAsType()));
+    if (R->hasTypeConstraint()) {
+      auto *C = R->getTypeConstraint();
+      R->setTypeConstraint(C->getConceptReference(),
+                           C->getImmediatelyDeclaredConstraint());
+    }
+    return R;
+  }
+  case Decl::NonTypeTemplateParm: {
+    auto *T = cast<NonTypeTemplateParmDecl>(A);
+    // FIXME: Ditto, as above for TemplateTypeParm case.
+    if (T->isParameterPack())
+      return A;
+    auto *R = NonTypeTemplateParmDecl::Create(
+        S.Context, A->getDeclContext(), SourceLocation(), SourceLocation(),
+        T->getDepth(), T->getIndex(), T->getIdentifier(), T->getType(),
+        /*ParameterPack=*/false, T->getTypeSourceInfo());
+    R->setDefaultArgument(Default.getAsExpr());
+    if (auto *PTC = T->getPlaceholderTypeConstraint())
+      R->setPlaceholderTypeConstraint(PTC);
+    return R;
+  }
+  case Decl::TemplateTemplateParm: {
+    auto *T = cast<TemplateTemplateParmDecl>(A);
+    auto *R = TemplateTemplateParmDecl::Create(
+        S.Context, A->getDeclContext(), SourceLocation(), T->getDepth(),
+        T->getIndex(), T->isParameterPack(), T->getIdentifier(),
+        T->wasDeclaredWithTypename(), T->getTemplateParameters());
+    R->setDefaultArgument(
+        S.Context,
+        S.getTrivialTemplateArgumentLoc(Default, QualType(), SourceLocation()));
+    return R;
+  }
+  default:
+    llvm_unreachable("Unexpected Decl Kind");
+  }
+}
+
 static TemplateDeductionResult
 DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
                         TemplateName Param, TemplateName Arg,
                         TemplateDeductionInfo &Info,
+                        ArrayRef<TemplateArgument> DefaultArguments,
                         SmallVectorImpl<DeducedTemplateArgument> &Deduced) {
   TemplateDecl *ParamDecl = Param.getAsTemplateDecl();
   if (!ParamDecl) {
@@ -519,13 +579,45 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
     return TemplateDeductionResult::Success;
   }
 
-  if (TemplateTemplateParmDecl *TempParam
-        = dyn_cast<TemplateTemplateParmDecl>(ParamDecl)) {
+  if (auto *TempParam = dyn_cast<TemplateTemplateParmDecl>(ParamDecl)) {
     // If we're not deducing at this depth, there's nothing to deduce.
     if (TempParam->getDepth() != Info.getDeducedDepth())
       return TemplateDeductionResult::Success;
 
-    DeducedTemplateArgument NewDeduced(S.Context.getCanonicalTemplateName(Arg));
+    auto NewDeduced = DeducedTemplateArgument(Arg);
+    // Provisional resolution for CWG2398: If Arg is also a template template
+    // param, and it names a template specialization, then we deduce a
+    // synthesized template template parameter based on A, but using the TS's
+    // arguments as defaults.
+    if (auto *TempArg = dyn_cast_or_null<TemplateTemplateParmDecl>(
+            Arg.getAsTemplateDecl())) {
+      assert(Arg.getKind() == TemplateName::Template);
+      assert(!TempArg->isExpandedParameterPack());
+
+      TemplateParameterList *As = TempArg->getTemplateParameters();
+      if (DefaultArguments.size() != 0) {
+        assert(DefaultArguments.size() <= As->size());
+        SmallVector<NamedDecl *, 4> Params(As->size());
+        for (unsigned I = 0; I < DefaultArguments.size(); ++I)
+          Params[I] = getTemplateParameterWithDefault(S, As->getParam(I),
+                                                      DefaultArguments[I]);
+        for (unsigned I = DefaultArguments.size(); I < As->size(); ++I)
+          Params[I] = As->getParam(I);
+        // FIXME: We could unique these, and also the parameters, but we don't
+        // expect programs to contain a large enough amount of these deductions
+        // for that to be worthwhile.
+        auto *TPL = TemplateParameterList::Create(
+            S.Context, SourceLocation(), SourceLocation(), Params,
+            SourceLocation(), As->getRequiresClause());
+        NewDeduced = DeducedTemplateArgument(
+            TemplateName(TemplateTemplateParmDecl::Create(
+                S.Context, TempArg->getDeclContext(), SourceLocation(),
+                TempArg->getDepth(), TempArg->getPosition(),
+                TempArg->isParameterPack(), TempArg->getIdentifier(),
+                TempArg->wasDeclaredWithTypename(), TPL)));
+      }
+    }
+
     DeducedTemplateArgument Result = checkDeducedTemplateArguments(S.Context,
                                                  Deduced[TempParam->getIndex()],
                                                                    NewDeduced);
@@ -604,7 +696,8 @@ DeduceTemplateSpecArguments(Sema &S, TemplateParameterList *TemplateParams,
 
     // Perform template argument deduction for the template name.
     if (auto Result =
-            DeduceTemplateArguments(S, TemplateParams, TNP, TNA, Info, Deduced);
+            DeduceTemplateArguments(S, TemplateParams, TNP, TNA, Info,
+                                    SA->template_arguments(), Deduced);
         Result != TemplateDeductionResult::Success)
       return Result;
     // Perform template argument deduction on each template
@@ -630,7 +723,8 @@ DeduceTemplateSpecArguments(Sema &S, TemplateParameterList *TemplateParams,
   // Perform template argument deduction for the template name.
   if (auto Result = DeduceTemplateArguments(
           S, TemplateParams, TP->getTemplateName(),
-          TemplateName(SA->getSpecializedTemplate()), Info, Deduced);
+          TemplateName(SA->getSpecializedTemplate()), Info,
+          SA->getTemplateArgs().asArray(), Deduced);
       Result != TemplateDeductionResult::Success)
     return Result;
 
@@ -1229,13 +1323,11 @@ bool Sema::isSameOrCompatibleFunctionType(QualType P, QualType A) {
     return Context.hasSameType(P, A);
 
   // Noreturn and noexcept adjustment.
-  QualType AdjustedParam;
-  if (IsFunctionConversion(P, A, AdjustedParam))
-    return Context.hasSameType(AdjustedParam, A);
+  if (QualType AdjustedParam; IsFunctionConversion(P, A, AdjustedParam))
+    P = AdjustedParam;
 
   // FIXME: Compatible calling conventions.
-
-  return Context.hasSameType(P, A);
+  return Context.hasSameFunctionTypeIgnoringExceptionSpec(P, A);
 }
 
 /// Get the index of the first template parameter that was originally from the
@@ -2323,7 +2415,8 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
   case TemplateArgument::Template:
     if (A.getKind() == TemplateArgument::Template)
       return DeduceTemplateArguments(S, TemplateParams, P.getAsTemplate(),
-                                     A.getAsTemplate(), Info, Deduced);
+                                     A.getAsTemplate(), Info,
+                                     /*DefaultArguments=*/{}, Deduced);
     Info.FirstArg = P;
     Info.SecondArg = A;
     return TemplateDeductionResult::NonDeducedMismatch;
@@ -3414,23 +3507,6 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
   if (FunctionType) {
     auto EPI = Proto->getExtProtoInfo();
     EPI.ExtParameterInfos = ExtParamInfos.getPointerOrNull(ParamTypes.size());
-
-    // In C++1z onwards, exception specifications are part of the function type,
-    // so substitution into the type must also substitute into the exception
-    // specification.
-    SmallVector<QualType, 4> ExceptionStorage;
-    if (getLangOpts().CPlusPlus17 &&
-        SubstExceptionSpec(
-            Function->getLocation(), EPI.ExceptionSpec, ExceptionStorage,
-            getTemplateInstantiationArgs(
-                FunctionTemplate, nullptr, /*Final=*/true,
-                /*Innermost=*/SugaredExplicitArgumentList->asArray(),
-                /*RelativeToPrimary=*/false,
-                /*Pattern=*/nullptr,
-                /*ForConstraintInstantiation=*/false,
-                /*SkipForSpecialization=*/true)))
-      return TemplateDeductionResult::SubstitutionFailure;
-
     *FunctionType = BuildFunctionType(ResultType, ParamTypes,
                                       Function->getLocation(),
                                       Function->getDeclName(),
@@ -4610,13 +4686,6 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
                                                Info.getLocation()))
     return TemplateDeductionResult::MiscellaneousDeductionFailure;
 
-  auto *SpecializationFPT =
-      Specialization->getType()->castAs<FunctionProtoType>();
-  if (IsAddressOfFunction && getLangOpts().CPlusPlus17 &&
-      isUnresolvedExceptionSpec(SpecializationFPT->getExceptionSpecType()) &&
-      !ResolveExceptionSpec(Info.getLocation(), SpecializationFPT))
-    return TemplateDeductionResult::MiscellaneousDeductionFailure;
-
   // Adjust the exception specification of the argument to match the
   // substituted and resolved type we just formed. (Calling convention and
   // noreturn can't be dependent, so we don't actually need this for them
@@ -5783,6 +5852,38 @@ UnresolvedSetIterator Sema::getMostSpecialized(
   return SpecEnd;
 }
 
+/// Returns the more constrained function according to the rules of
+/// partial ordering by constraints (C++ [temp.constr.order]).
+///
+/// \param FD1 the first function
+///
+/// \param FD2 the second function
+///
+/// \returns the more constrained function. If neither function is
+/// more constrained, returns NULL.
+FunctionDecl *Sema::getMoreConstrainedFunction(FunctionDecl *FD1,
+                                               FunctionDecl *FD2) {
+  assert(!FD1->getDescribedTemplate() && !FD2->getDescribedTemplate() &&
+         "not for function templates");
+  FunctionDecl *F1 = FD1;
+  if (FunctionDecl *MF = FD1->getInstantiatedFromMemberFunction())
+    F1 = MF;
+  FunctionDecl *F2 = FD2;
+  if (FunctionDecl *MF = FD2->getInstantiatedFromMemberFunction())
+    F2 = MF;
+  llvm::SmallVector<const Expr *, 1> AC1, AC2;
+  F1->getAssociatedConstraints(AC1);
+  F2->getAssociatedConstraints(AC2);
+  bool AtLeastAsConstrained1, AtLeastAsConstrained2;
+  if (IsAtLeastAsConstrained(F1, AC1, F2, AC2, AtLeastAsConstrained1))
+    return nullptr;
+  if (IsAtLeastAsConstrained(F2, AC2, F1, AC1, AtLeastAsConstrained2))
+    return nullptr;
+  if (AtLeastAsConstrained1 == AtLeastAsConstrained2)
+    return nullptr;
+  return AtLeastAsConstrained1 ? FD1 : FD2;
+}
+
 /// Determine whether one partial specialization, P1, is at least as
 /// specialized than another, P2.
 ///
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 3a9fd906b7af..07626058c797 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -275,6 +275,13 @@ Response HandleFunction(Sema &SemaRef, const FunctionDecl *Function,
                                      TemplateArgs->asArray(),
                                      /*Final=*/false);
 
+    if (RelativeToPrimary &&
+        (Function->getTemplateSpecializationKind() ==
+             TSK_ExplicitSpecialization ||
+         (Function->getFriendObjectKind() &&
+          !Function->getPrimaryTemplate()->getFriendObjectKind())))
+      return Response::UseNextDecl(Function);
+
     // If this function was instantiated from a specialized member that is
     // a function template, we're done.
     assert(Function->getPrimaryTemplate() && "No function template?");
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index d544cfac55ba..fde2d920c785 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -2269,16 +2269,18 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl(
                             TemplateArgumentList::CreateCopy(SemaRef.Context,
                                                              Innermost),
                                                 /*InsertPos=*/nullptr);
-  } else if (isFriend && D->isThisDeclarationADefinition()) {
-    // Do not connect the friend to the template unless it's actually a
-    // definition. We don't want non-template functions to be marked as being
-    // template instantiations.
-    Function->setInstantiationOfMemberFunction(D, TSK_ImplicitInstantiation);
-  } else if (!isFriend) {
-    // If this is not a function template, and this is not a friend (that is,
-    // this is a locally declared function), save the instantiation relationship
-    // for the purposes of constraint instantiation.
-    Function->setInstantiatedFromDecl(D);
+  } else if (FunctionRewriteKind == RewriteKind::None) {
+    if (isFriend && D->isThisDeclarationADefinition()) {
+      // Do not connect the friend to the template unless it's actually a
+      // definition. We don't want non-template functions to be marked as being
+      // template instantiations.
+      Function->setInstantiationOfMemberFunction(D, TSK_ImplicitInstantiation);
+    } else if (!isFriend) {
+      // If this is not a function template, and this is not a friend (that is,
+      // this is a locally declared function), save the instantiation
+      // relationship for the purposes of constraint instantiation.
+      Function->setInstantiatedFromDecl(D);
+    }
   }
 
   if (isFriend) {
@@ -2669,7 +2671,7 @@ Decl *TemplateDeclInstantiator::VisitCXXMethodDecl(
                          TemplateArgumentList::CreateCopy(SemaRef.Context,
                                                           Innermost),
                                               /*InsertPos=*/nullptr);
-  } else if (!isFriend) {
+  } else if (!isFriend && FunctionRewriteKind == RewriteKind::None) {
     // Record that this is an instantiation of a member function.
     Method->setInstantiationOfMemberFunction(D, TSK_ImplicitInstantiation);
   }
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index dff7e9df636b..126965088831 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -7964,6 +7964,11 @@ TreeTransform<Derived>::TransformIfStmt(IfStmt *S) {
   // Transform the "then" branch.
   StmtResult Then;
   if (!ConstexprConditionValue || *ConstexprConditionValue) {
+    EnterExpressionEvaluationContext Ctx(
+        getSema(), Sema::ExpressionEvaluationContext::ImmediateFunctionContext,
+        nullptr, Sema::ExpressionEvaluationContextRecord::EK_Other,
+        S->isNonNegatedConsteval());
+
     Then = getDerived().TransformStmt(S->getThen());
     if (Then.isInvalid())
       return StmtError();
@@ -7978,6 +7983,11 @@ TreeTransform<Derived>::TransformIfStmt(IfStmt *S) {
   // Transform the "else" branch.
   StmtResult Else;
   if (!ConstexprConditionValue || !*ConstexprConditionValue) {
+    EnterExpressionEvaluationContext Ctx(
+        getSema(), Sema::ExpressionEvaluationContext::ImmediateFunctionContext,
+        nullptr, Sema::ExpressionEvaluationContextRecord::EK_Other,
+        S->isNegatedConsteval());
+
     Else = getDerived().TransformStmt(S->getElse());
     if (Else.isInvalid())
       return StmtError();
@@ -11112,6 +11122,23 @@ class OpenACCClauseTransform final
   SemaOpenACC::OpenACCParsedClause &ParsedClause;
   OpenACCClause *NewClause = nullptr;
 
+  llvm::SmallVector<Expr *> VisitVarList(ArrayRef<Expr *> VarList) {
+    llvm::SmallVector<Expr *> InstantiatedVarList;
+    for (Expr *CurVar : VarList) {
+      ExprResult Res = Self.TransformExpr(CurVar);
+
+      if (!Res.isUsable())
+        continue;
+
+      Res = Self.getSema().OpenACC().ActOnVar(Res.get());
+
+      if (Res.isUsable())
+        InstantiatedVarList.push_back(Res.get());
+    }
+
+    return InstantiatedVarList;
+  }
+
 public:
   OpenACCClauseTransform(TreeTransform<Derived> &Self,
                          ArrayRef<const OpenACCClause *> ExistingClauses,
@@ -11206,22 +11233,134 @@ void OpenACCClauseTransform<Derived>::VisitNumGangsClause(
 template <typename Derived>
 void OpenACCClauseTransform<Derived>::VisitPrivateClause(
     const OpenACCPrivateClause &C) {
-  llvm::SmallVector<Expr *> InstantiatedVarList;
+  ParsedClause.setVarListDetails(VisitVarList(C.getVarList()),
+                                 /*IsReadOnly=*/false, /*IsZero=*/false);
 
-  for (Expr *CurVar : C.getVarList()) {
-    ExprResult Res = Self.TransformExpr(CurVar);
+  NewClause = OpenACCPrivateClause::Create(
+      Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
+      ParsedClause.getLParenLoc(), ParsedClause.getVarList(),
+      ParsedClause.getEndLoc());
+}
 
-    if (!Res.isUsable())
-      return;
+template <typename Derived>
+void OpenACCClauseTransform<Derived>::VisitFirstPrivateClause(
+    const OpenACCFirstPrivateClause &C) {
+  ParsedClause.setVarListDetails(VisitVarList(C.getVarList()),
+                                 /*IsReadOnly=*/false, /*IsZero=*/false);
 
-    Res = Self.getSema().OpenACC().ActOnVar(Res.get());
+  NewClause = OpenACCFirstPrivateClause::Create(
+      Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
+      ParsedClause.getLParenLoc(), ParsedClause.getVarList(),
+      ParsedClause.getEndLoc());
+}
 
-    if (Res.isUsable())
-      InstantiatedVarList.push_back(Res.get());
-  }
-  ParsedClause.setVarListDetails(std::move(InstantiatedVarList));
+template <typename Derived>
+void OpenACCClauseTransform<Derived>::VisitNoCreateClause(
+    const OpenACCNoCreateClause &C) {
+  ParsedClause.setVarListDetails(VisitVarList(C.getVarList()),
+                                 /*IsReadOnly=*/false, /*IsZero=*/false);
 
-  NewClause = OpenACCPrivateClause::Create(
+  NewClause = OpenACCNoCreateClause::Create(
+      Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
+      ParsedClause.getLParenLoc(), ParsedClause.getVarList(),
+      ParsedClause.getEndLoc());
+}
+
+template <typename Derived>
+void OpenACCClauseTransform<Derived>::VisitPresentClause(
+    const OpenACCPresentClause &C) {
+  ParsedClause.setVarListDetails(VisitVarList(C.getVarList()),
+                                 /*IsReadOnly=*/false, /*IsZero=*/false);
+
+  NewClause = OpenACCPresentClause::Create(
+      Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
+      ParsedClause.getLParenLoc(), ParsedClause.getVarList(),
+      ParsedClause.getEndLoc());
+}
+
+template <typename Derived>
+void OpenACCClauseTransform<Derived>::VisitCopyClause(
+    const OpenACCCopyClause &C) {
+  ParsedClause.setVarListDetails(VisitVarList(C.getVarList()),
+                                 /*IsReadOnly=*/false, /*IsZero=*/false);
+
+  NewClause = OpenACCCopyClause::Create(
+      Self.getSema().getASTContext(), ParsedClause.getClauseKind(),
+      ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(),
+      ParsedClause.getVarList(), ParsedClause.getEndLoc());
+}
+
+template <typename Derived>
+void OpenACCClauseTransform<Derived>::VisitCopyInClause(
+    const OpenACCCopyInClause &C) {
+  ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), C.isReadOnly(),
+                                 /*IsZero=*/false);
+
+  NewClause = OpenACCCopyInClause::Create(
+      Self.getSema().getASTContext(), ParsedClause.getClauseKind(),
+      ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(),
+      ParsedClause.isReadOnly(), ParsedClause.getVarList(),
+      ParsedClause.getEndLoc());
+}
+
+template <typename Derived>
+void OpenACCClauseTransform<Derived>::VisitCopyOutClause(
+    const OpenACCCopyOutClause &C) {
+  ParsedClause.setVarListDetails(VisitVarList(C.getVarList()),
+                                 /*IsReadOnly=*/false, C.isZero());
+
+  NewClause = OpenACCCopyOutClause::Create(
+      Self.getSema().getASTContext(), ParsedClause.getClauseKind(),
+      ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(),
+      ParsedClause.isZero(), ParsedClause.getVarList(),
+      ParsedClause.getEndLoc());
+}
+
+template <typename Derived>
+void OpenACCClauseTransform<Derived>::VisitCreateClause(
+    const OpenACCCreateClause &C) {
+  ParsedClause.setVarListDetails(VisitVarList(C.getVarList()),
+                                 /*IsReadOnly=*/false, C.isZero());
+
+  NewClause = OpenACCCreateClause::Create(
+      Self.getSema().getASTContext(), ParsedClause.getClauseKind(),
+      ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(),
+      ParsedClause.isZero(), ParsedClause.getVarList(),
+      ParsedClause.getEndLoc());
+}
+template <typename Derived>
+void OpenACCClauseTransform<Derived>::VisitAttachClause(
+    const OpenACCAttachClause &C) {
+  llvm::SmallVector<Expr *> VarList = VisitVarList(C.getVarList());
+
+  // Ensure each var is a pointer type.
+  VarList.erase(std::remove_if(VarList.begin(), VarList.end(), [&](Expr *E) {
+    return Self.getSema().OpenACC().CheckVarIsPointerType(
+        OpenACCClauseKind::Attach, E);
+  }), VarList.end());
+
+  ParsedClause.setVarListDetails(VarList,
+                                 /*IsReadOnly=*/false, /*IsZero=*/false);
+  NewClause = OpenACCAttachClause::Create(
+      Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
+      ParsedClause.getLParenLoc(), ParsedClause.getVarList(),
+      ParsedClause.getEndLoc());
+}
+
+template <typename Derived>
+void OpenACCClauseTransform<Derived>::VisitDevicePtrClause(
+    const OpenACCDevicePtrClause &C) {
+  llvm::SmallVector<Expr *> VarList = VisitVarList(C.getVarList());
+
+  // Ensure each var is a pointer type.
+  VarList.erase(std::remove_if(VarList.begin(), VarList.end(), [&](Expr *E) {
+    return Self.getSema().OpenACC().CheckVarIsPointerType(
+        OpenACCClauseKind::DevicePtr, E);
+  }), VarList.end());
+
+  ParsedClause.setVarListDetails(VarList,
+                                 /*IsReadOnly=*/false, /*IsZero=*/false);
+  NewClause = OpenACCDevicePtrClause::Create(
       Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
       ParsedClause.getLParenLoc(), ParsedClause.getVarList(),
       ParsedClause.getEndLoc());
@@ -11272,6 +11411,75 @@ void OpenACCClauseTransform<Derived>::VisitVectorLengthClause(
       ParsedClause.getLParenLoc(), ParsedClause.getIntExprs()[0],
       ParsedClause.getEndLoc());
 }
+
+template <typename Derived>
+void OpenACCClauseTransform<Derived>::VisitAsyncClause(
+    const OpenACCAsyncClause &C) {
+  if (C.hasIntExpr()) {
+    ExprResult Res = Self.TransformExpr(const_cast<Expr *>(C.getIntExpr()));
+    if (!Res.isUsable())
+      return;
+
+    Res = Self.getSema().OpenACC().ActOnIntExpr(OpenACCDirectiveKind::Invalid,
+                                                C.getClauseKind(),
+                                                C.getBeginLoc(), Res.get());
+    if (!Res.isUsable())
+      return;
+    ParsedClause.setIntExprDetails(Res.get());
+  }
+
+  NewClause = OpenACCAsyncClause::Create(
+      Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
+      ParsedClause.getLParenLoc(),
+      ParsedClause.getNumIntExprs() != 0 ? ParsedClause.getIntExprs()[0]
+                                         : nullptr,
+      ParsedClause.getEndLoc());
+}
+template <typename Derived>
+void OpenACCClauseTransform<Derived>::VisitWaitClause(
+    const OpenACCWaitClause &C) {
+  if (!C.getLParenLoc().isInvalid()) {
+    Expr *DevNumExpr = nullptr;
+    llvm::SmallVector<Expr *> InstantiatedQueueIdExprs;
+
+    // Instantiate devnum expr if it exists.
+    if (C.getDevNumExpr()) {
+      ExprResult Res = Self.TransformExpr(C.getDevNumExpr());
+      if (!Res.isUsable())
+        return;
+      Res = Self.getSema().OpenACC().ActOnIntExpr(OpenACCDirectiveKind::Invalid,
+                                                  C.getClauseKind(),
+                                                  C.getBeginLoc(), Res.get());
+      if (!Res.isUsable())
+        return;
+
+      DevNumExpr = Res.get();
+    }
+
+    // Instantiate queue ids.
+    for (Expr *CurQueueIdExpr : C.getQueueIdExprs()) {
+      ExprResult Res = Self.TransformExpr(CurQueueIdExpr);
+      if (!Res.isUsable())
+        return;
+      Res = Self.getSema().OpenACC().ActOnIntExpr(OpenACCDirectiveKind::Invalid,
+                                                  C.getClauseKind(),
+                                                  C.getBeginLoc(), Res.get());
+      if (!Res.isUsable())
+        return;
+
+      InstantiatedQueueIdExprs.push_back(Res.get());
+    }
+
+    ParsedClause.setWaitDetails(DevNumExpr, C.getQueuesLoc(),
+                                std::move(InstantiatedQueueIdExprs));
+  }
+
+  NewClause = OpenACCWaitClause::Create(
+      Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
+      ParsedClause.getLParenLoc(), ParsedClause.getDevNumExpr(),
+      ParsedClause.getQueuesLoc(), ParsedClause.getQueueIdExprs(),
+      ParsedClause.getEndLoc());
+}
 } // namespace
 template <typename Derived>
 OpenACCClause *TreeTransform<Derived>::TransformOpenACCClause(
diff --git a/clang/lib/Serialization/ASTCommon.cpp b/clang/lib/Serialization/ASTCommon.cpp
index e017f5bdb488..63c5140086d8 100644
--- a/clang/lib/Serialization/ASTCommon.cpp
+++ b/clang/lib/Serialization/ASTCommon.cpp
@@ -186,6 +186,9 @@ serialization::TypeIdxFromBuiltin(const BuiltinType *BT) {
   case BuiltinType::Overload:
     ID = PREDEF_TYPE_OVERLOAD_ID;
     break;
+  case BuiltinType::UnresolvedTemplate:
+    ID = PREDEF_TYPE_UNRESOLVED_TEMPLATE;
+    break;
   case BuiltinType::BoundMember:
     ID = PREDEF_TYPE_BOUND_MEMBER;
     break;
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 29b81c1a753c..7627996d2c32 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -829,36 +829,37 @@ bool SimpleASTReaderListener::ReadPreprocessorOptions(
                                   OptionValidateNone);
 }
 
-/// Check the header search options deserialized from the control block
-/// against the header search options in an existing preprocessor.
+/// Check that the specified and the existing module cache paths are equivalent.
 ///
 /// \param Diags If non-null, produce diagnostics for any mismatches incurred.
-static bool checkHeaderSearchOptions(const HeaderSearchOptions &HSOpts,
-                                     StringRef SpecificModuleCachePath,
-                                     StringRef ExistingModuleCachePath,
-                                     DiagnosticsEngine *Diags,
-                                     const LangOptions &LangOpts,
-                                     const PreprocessorOptions &PPOpts) {
-  if (LangOpts.Modules) {
-    if (SpecificModuleCachePath != ExistingModuleCachePath &&
-        !PPOpts.AllowPCHWithDifferentModulesCachePath) {
-      if (Diags)
-        Diags->Report(diag::err_pch_modulecache_mismatch)
-          << SpecificModuleCachePath << ExistingModuleCachePath;
-      return true;
-    }
-  }
-
-  return false;
+/// \returns true when the module cache paths differ.
+static bool checkModuleCachePath(llvm::vfs::FileSystem &VFS,
+                                 StringRef SpecificModuleCachePath,
+                                 StringRef ExistingModuleCachePath,
+                                 DiagnosticsEngine *Diags,
+                                 const LangOptions &LangOpts,
+                                 const PreprocessorOptions &PPOpts) {
+  if (!LangOpts.Modules || PPOpts.AllowPCHWithDifferentModulesCachePath ||
+      SpecificModuleCachePath == ExistingModuleCachePath)
+    return false;
+  auto EqualOrErr =
+      VFS.equivalent(SpecificModuleCachePath, ExistingModuleCachePath);
+  if (EqualOrErr && *EqualOrErr)
+    return false;
+  if (Diags)
+    Diags->Report(diag::err_pch_modulecache_mismatch)
+        << SpecificModuleCachePath << ExistingModuleCachePath;
+  return true;
 }
 
 bool PCHValidator::ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts,
                                            StringRef SpecificModuleCachePath,
                                            bool Complain) {
-  return checkHeaderSearchOptions(HSOpts, SpecificModuleCachePath,
-                                  PP.getHeaderSearchInfo().getModuleCachePath(),
-                                  Complain ? &Reader.Diags : nullptr,
-                                  PP.getLangOpts(), PP.getPreprocessorOpts());
+  return checkModuleCachePath(Reader.getFileManager().getVirtualFileSystem(),
+                              SpecificModuleCachePath,
+                              PP.getHeaderSearchInfo().getModuleCachePath(),
+                              Complain ? &Reader.Diags : nullptr,
+                              PP.getLangOpts(), PP.getPreprocessorOpts());
 }
 
 void PCHValidator::ReadCounter(const ModuleFile &M, unsigned Value) {
@@ -1004,7 +1005,7 @@ static bool readBit(unsigned &Bits) {
   return Value;
 }
 
-IdentID ASTIdentifierLookupTrait::ReadIdentifierID(const unsigned char *d) {
+IdentifierID ASTIdentifierLookupTrait::ReadIdentifierID(const unsigned char *d) {
   using namespace llvm::support;
 
   unsigned RawID = endian::readNext<uint32_t, llvm::endianness::little>(d);
@@ -1040,7 +1041,7 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k,
   markIdentifierFromAST(Reader, *II);
   Reader.markIdentifierUpToDate(II);
 
-  IdentID ID = Reader.getGlobalIdentifierID(F, RawID);
+  IdentifierID ID = Reader.getGlobalIdentifierID(F, RawID);
   if (!IsInteresting) {
     // For uninteresting identifiers, there's nothing else to do. Just notify
     // the reader that we've finished loading this identifier.
@@ -3038,8 +3039,10 @@ ASTReader::ReadControlBlock(ModuleFile &F,
         // The import location will be the local one for now; we will adjust
         // all import locations of module imports after the global source
         // location info are setup, in ReadAST.
-        SourceLocation ImportLoc =
+        auto [ImportLoc, ImportModuleFileIndex] =
             ReadUntranslatedSourceLocation(Record[Idx++]);
+        // The import location must belong to the current module file itself.
+        assert(ImportModuleFileIndex == 0);
         off_t StoredSize = !IsImportingStdCXXModule ? (off_t)Record[Idx++] : 0;
         time_t StoredModTime =
             !IsImportingStdCXXModule ? (time_t)Record[Idx++] : 0;
@@ -3347,7 +3350,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F,
         return llvm::createStringError(
             std::errc::illegal_byte_sequence,
             "duplicate TYPE_OFFSET record in AST file");
-      F.TypeOffsets = reinterpret_cast<const UnderalignedInt64 *>(Blob.data());
+      F.TypeOffsets = reinterpret_cast<const UnalignedUInt64 *>(Blob.data());
       F.LocalNumTypes = Record[0];
       unsigned LocalBaseTypeIndex = Record[1];
       F.BaseTypeIndex = getTotalNumTypes();
@@ -3661,13 +3664,6 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F,
           std::make_pair(SourceManager::MaxLoadedOffset - F.SLocEntryBaseOffset
                            - SLocSpaceSize,&F));
 
-      // Initialize the remapping table.
-      // Invalid stays invalid.
-      F.SLocRemap.insertOrReplace(std::make_pair(0U, 0));
-      // This module. Base was 2 when being compiled.
-      F.SLocRemap.insertOrReplace(std::make_pair(
-          2U, static_cast<SourceLocation::IntTy>(F.SLocEntryBaseOffset - 2)));
-
       TotalNumSLocEntries += F.LocalNumSLocEntries;
       break;
     }
@@ -4055,18 +4051,7 @@ void ASTReader::ReadModuleOffsetMap(ModuleFile &F) const {
   const unsigned char *DataEnd = Data + F.ModuleOffsetMap.size();
   F.ModuleOffsetMap = StringRef();
 
-  // If we see this entry before SOURCE_LOCATION_OFFSETS, add placeholders.
-  if (F.SLocRemap.find(0) == F.SLocRemap.end()) {
-    F.SLocRemap.insert(std::make_pair(0U, 0));
-    F.SLocRemap.insert(std::make_pair(2U, 1));
-  }
-
-  // Continuous range maps we may be updating in our module.
-  using SLocRemapBuilder =
-      ContinuousRangeMap<SourceLocation::UIntTy, SourceLocation::IntTy,
-                         2>::Builder;
   using RemapBuilder = ContinuousRangeMap<uint32_t, int, 2>::Builder;
-  SLocRemapBuilder SLocRemap(F.SLocRemap);
   RemapBuilder IdentifierRemap(F.IdentifierRemap);
   RemapBuilder MacroRemap(F.MacroRemap);
   RemapBuilder PreprocessedEntityRemap(F.PreprocessedEntityRemap);
@@ -4075,6 +4060,9 @@ void ASTReader::ReadModuleOffsetMap(ModuleFile &F) const {
   RemapBuilder DeclRemap(F.DeclRemap);
   RemapBuilder TypeRemap(F.TypeRemap);
 
+  auto &ImportedModuleVector = F.DependentModules;
+  assert(ImportedModuleVector.empty());
+
   while (Data < DataEnd) {
     // FIXME: Looking up dependency modules by filename is horrible. Let's
     // start fixing this with prebuilt, explicit and implicit modules and see
@@ -4090,15 +4078,14 @@ void ASTReader::ReadModuleOffsetMap(ModuleFile &F) const {
                           ? ModuleMgr.lookupByModuleName(Name)
                           : ModuleMgr.lookupByFileName(Name));
     if (!OM) {
-      std::string Msg =
-          "SourceLocation remap refers to unknown module, cannot find ";
+      std::string Msg = "refers to unknown module, cannot find ";
       Msg.append(std::string(Name));
       Error(Msg);
       return;
     }
 
-    SourceLocation::UIntTy SLocOffset =
-        endian::readNext<uint32_t, llvm::endianness::little>(Data);
+    ImportedModuleVector.push_back(OM);
+
     uint32_t IdentifierIDOffset =
         endian::readNext<uint32_t, llvm::endianness::little>(Data);
     uint32_t MacroIDOffset =
@@ -4122,13 +4109,6 @@ void ASTReader::ReadModuleOffsetMap(ModuleFile &F) const {
                                     static_cast<int>(BaseOffset - Offset)));
     };
 
-    constexpr SourceLocation::UIntTy SLocNone =
-        std::numeric_limits<SourceLocation::UIntTy>::max();
-    if (SLocOffset != SLocNone)
-      SLocRemap.insert(std::make_pair(
-          SLocOffset, static_cast<SourceLocation::IntTy>(
-                          OM->SLocEntryBaseOffset - SLocOffset)));
-
     mapOffset(IdentifierIDOffset, OM->BaseIdentifierID, IdentifierRemap);
     mapOffset(MacroIDOffset, OM->BaseMacroID, MacroRemap);
     mapOffset(PreprocessedEntityIDOffset, OM->BasePreprocessedEntityID,
@@ -5397,9 +5377,9 @@ namespace {
     bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts,
                                  StringRef SpecificModuleCachePath,
                                  bool Complain) override {
-      return checkHeaderSearchOptions(HSOpts, SpecificModuleCachePath,
-                                      ExistingModuleCachePath, nullptr,
-                                      ExistingLangOpts, ExistingPPOpts);
+      return checkModuleCachePath(
+          FileMgr.getVirtualFileSystem(), SpecificModuleCachePath,
+          ExistingModuleCachePath, nullptr, ExistingLangOpts, ExistingPPOpts);
     }
 
     bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts,
@@ -6264,8 +6244,8 @@ SourceRange ASTReader::ReadSkippedRange(unsigned GlobalIndex) {
   unsigned LocalIndex = GlobalIndex - M->BasePreprocessedSkippedRangeID;
   assert(LocalIndex < M->NumPreprocessedSkippedRanges);
   PPSkippedRange RawRange = M->PreprocessedSkippedRangeOffsets[LocalIndex];
-  SourceRange Range(TranslateSourceLocation(*M, RawRange.getBegin()),
-                    TranslateSourceLocation(*M, RawRange.getEnd()));
+  SourceRange Range(ReadSourceLocation(*M, RawRange.getBegin()),
+                    ReadSourceLocation(*M, RawRange.getEnd()));
   assert(Range.isValid());
   return Range;
 }
@@ -6284,7 +6264,7 @@ PreprocessedEntity *ASTReader::ReadPreprocessedEntity(unsigned Index) {
 
   SavedStreamPosition SavedPosition(M.PreprocessorDetailCursor);
   if (llvm::Error Err = M.PreprocessorDetailCursor.JumpToBit(
-          M.MacroOffsetsBase + PPOffs.BitOffset)) {
+          M.MacroOffsetsBase + PPOffs.getOffset())) {
     Error(std::move(Err));
     return nullptr;
   }
@@ -6301,8 +6281,8 @@ PreprocessedEntity *ASTReader::ReadPreprocessedEntity(unsigned Index) {
     return nullptr;
 
   // Read the record.
-  SourceRange Range(TranslateSourceLocation(M, PPOffs.getBegin()),
-                    TranslateSourceLocation(M, PPOffs.getEnd()));
+  SourceRange Range(ReadSourceLocation(M, PPOffs.getBegin()),
+                    ReadSourceLocation(M, PPOffs.getEnd()));
   PreprocessingRecord &PPRec = *PP.getPreprocessingRecord();
   StringRef Blob;
   RecordData Record;
@@ -6414,7 +6394,7 @@ struct PPEntityComp {
   }
 
   SourceLocation getLoc(const PPEntityOffset &PPE) const {
-    return Reader.TranslateSourceLocation(M, PPE.getBegin());
+    return Reader.ReadSourceLocation(M, PPE.getBegin());
   }
 };
 
@@ -6458,7 +6438,7 @@ PreprocessedEntityID ASTReader::findPreprocessedEntity(SourceLocation Loc,
       PPI = First;
       std::advance(PPI, Half);
       if (SourceMgr.isBeforeInTranslationUnit(
-              TranslateSourceLocation(M, PPI->getEnd()), Loc)) {
+              ReadSourceLocation(M, PPI->getEnd()), Loc)) {
         First = PPI;
         ++First;
         Count = Count - Half - 1;
@@ -6499,7 +6479,7 @@ std::optional<bool> ASTReader::isPreprocessedEntityInFileID(unsigned Index,
   unsigned LocalIndex = PPInfo.second;
   const PPEntityOffset &PPOffs = M.PreprocessedEntityOffsets[LocalIndex];
 
-  SourceLocation Loc = TranslateSourceLocation(M, PPOffs.getBegin());
+  SourceLocation Loc = ReadSourceLocation(M, PPOffs.getBegin());
   if (Loc.isInvalid())
     return false;
 
@@ -6690,9 +6670,8 @@ ASTReader::RecordLocation ASTReader::TypeCursorForIndex(unsigned Index) {
   GlobalTypeMapType::iterator I = GlobalTypeMap.find(Index);
   assert(I != GlobalTypeMap.end() && "Corrupted global type map");
   ModuleFile *M = I->second;
-  return RecordLocation(
-      M, M->TypeOffsets[Index - M->BaseTypeIndex].getBitOffset() +
-             M->DeclsBlockStartOffset);
+  return RecordLocation(M, M->TypeOffsets[Index - M->BaseTypeIndex].get() +
+                               M->DeclsBlockStartOffset);
 }
 
 static std::optional<Type::TypeClass> getTypeClassForCode(TypeCode code) {
@@ -7320,6 +7299,9 @@ QualType ASTReader::GetType(TypeID ID) {
     case PREDEF_TYPE_OVERLOAD_ID:
       T = Context.OverloadTy;
       break;
+    case PREDEF_TYPE_UNRESOLVED_TEMPLATE:
+      T = Context.UnresolvedTemplateTy;
+      break;
     case PREDEF_TYPE_BOUND_MEMBER:
       T = Context.BoundMemberTy;
       break;
@@ -8972,7 +8954,7 @@ MacroID ASTReader::getGlobalMacroID(ModuleFile &M, unsigned LocalID) {
 }
 
 serialization::SubmoduleID
-ASTReader::getGlobalSubmoduleID(ModuleFile &M, unsigned LocalID) {
+ASTReader::getGlobalSubmoduleID(ModuleFile &M, unsigned LocalID) const {
   if (LocalID < NUM_PREDEF_SUBMODULE_IDS)
     return LocalID;
 
@@ -9005,7 +8987,7 @@ Module *ASTReader::getModule(unsigned ID) {
   return getSubmodule(ID);
 }
 
-ModuleFile *ASTReader::getLocalModuleFile(ModuleFile &M, unsigned ID) {
+ModuleFile *ASTReader::getLocalModuleFile(ModuleFile &M, unsigned ID) const {
   if (ID & 1) {
     // It's a module, look it up by submodule ID.
     auto I = GlobalSubmoduleMap.find(getGlobalSubmoduleID(M, ID >> 1));
@@ -11784,6 +11766,14 @@ SmallVector<Expr *> ASTRecordReader::readOpenACCVarList() {
   return VarList;
 }
 
+SmallVector<Expr *> ASTRecordReader::readOpenACCIntExprList() {
+  unsigned NumExprs = readInt();
+  llvm::SmallVector<Expr *> ExprList;
+  for (unsigned I = 0; I < NumExprs; ++I)
+    ExprList.push_back(readSubExpr());
+  return ExprList;
+}
+
 OpenACCClause *ASTRecordReader::readOpenACCClause() {
   OpenACCClauseKind ClauseKind = readEnum<OpenACCClauseKind>();
   SourceLocation BeginLoc = readSourceLocation();
@@ -11835,6 +11825,87 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() {
     return OpenACCPrivateClause::Create(getContext(), BeginLoc, LParenLoc,
                                         VarList, EndLoc);
   }
+  case OpenACCClauseKind::FirstPrivate: {
+    SourceLocation LParenLoc = readSourceLocation();
+    llvm::SmallVector<Expr *> VarList = readOpenACCVarList();
+    return OpenACCFirstPrivateClause::Create(getContext(), BeginLoc, LParenLoc,
+                                             VarList, EndLoc);
+  }
+  case OpenACCClauseKind::Attach: {
+    SourceLocation LParenLoc = readSourceLocation();
+    llvm::SmallVector<Expr *> VarList = readOpenACCVarList();
+    return OpenACCAttachClause::Create(getContext(), BeginLoc, LParenLoc,
+                                       VarList, EndLoc);
+  }
+  case OpenACCClauseKind::DevicePtr: {
+    SourceLocation LParenLoc = readSourceLocation();
+    llvm::SmallVector<Expr *> VarList = readOpenACCVarList();
+    return OpenACCDevicePtrClause::Create(getContext(), BeginLoc, LParenLoc,
+                                          VarList, EndLoc);
+  }
+  case OpenACCClauseKind::NoCreate: {
+    SourceLocation LParenLoc = readSourceLocation();
+    llvm::SmallVector<Expr *> VarList = readOpenACCVarList();
+    return OpenACCNoCreateClause::Create(getContext(), BeginLoc, LParenLoc,
+                                         VarList, EndLoc);
+  }
+  case OpenACCClauseKind::Present: {
+    SourceLocation LParenLoc = readSourceLocation();
+    llvm::SmallVector<Expr *> VarList = readOpenACCVarList();
+    return OpenACCPresentClause::Create(getContext(), BeginLoc, LParenLoc,
+                                        VarList, EndLoc);
+  }
+  case OpenACCClauseKind::PCopy:
+  case OpenACCClauseKind::PresentOrCopy:
+  case OpenACCClauseKind::Copy: {
+    SourceLocation LParenLoc = readSourceLocation();
+    llvm::SmallVector<Expr *> VarList = readOpenACCVarList();
+    return OpenACCCopyClause::Create(getContext(), ClauseKind, BeginLoc,
+                                     LParenLoc, VarList, EndLoc);
+  }
+  case OpenACCClauseKind::CopyIn:
+  case OpenACCClauseKind::PCopyIn:
+  case OpenACCClauseKind::PresentOrCopyIn: {
+    SourceLocation LParenLoc = readSourceLocation();
+    bool IsReadOnly = readBool();
+    llvm::SmallVector<Expr *> VarList = readOpenACCVarList();
+    return OpenACCCopyInClause::Create(getContext(), ClauseKind, BeginLoc,
+                                       LParenLoc, IsReadOnly, VarList, EndLoc);
+  }
+  case OpenACCClauseKind::CopyOut:
+  case OpenACCClauseKind::PCopyOut:
+  case OpenACCClauseKind::PresentOrCopyOut: {
+    SourceLocation LParenLoc = readSourceLocation();
+    bool IsZero = readBool();
+    llvm::SmallVector<Expr *> VarList = readOpenACCVarList();
+    return OpenACCCopyOutClause::Create(getContext(), ClauseKind, BeginLoc,
+                                        LParenLoc, IsZero, VarList, EndLoc);
+  }
+  case OpenACCClauseKind::Create:
+  case OpenACCClauseKind::PCreate:
+  case OpenACCClauseKind::PresentOrCreate: {
+    SourceLocation LParenLoc = readSourceLocation();
+    bool IsZero = readBool();
+    llvm::SmallVector<Expr *> VarList = readOpenACCVarList();
+    return OpenACCCreateClause::Create(getContext(), ClauseKind, BeginLoc,
+                                       LParenLoc, IsZero, VarList, EndLoc);
+  }
+  case OpenACCClauseKind::Async: {
+    SourceLocation LParenLoc = readSourceLocation();
+    Expr *AsyncExpr = readBool() ? readSubExpr() : nullptr;
+    return OpenACCAsyncClause::Create(getContext(), BeginLoc, LParenLoc,
+                                      AsyncExpr, EndLoc);
+  }
+  case OpenACCClauseKind::Wait: {
+    SourceLocation LParenLoc = readSourceLocation();
+    Expr *DevNumExpr = readBool() ? readSubExpr() : nullptr;
+    SourceLocation QueuesLoc = readSourceLocation();
+    llvm::SmallVector<Expr *> QueueIdExprs = readOpenACCIntExprList();
+    return OpenACCWaitClause::Create(getContext(), BeginLoc, LParenLoc,
+                                     DevNumExpr, QueuesLoc, QueueIdExprs,
+                                     EndLoc);
+  }
+
   case OpenACCClauseKind::Finalize:
   case OpenACCClauseKind::IfPresent:
   case OpenACCClauseKind::Seq:
@@ -11843,22 +11914,13 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() {
   case OpenACCClauseKind::Worker:
   case OpenACCClauseKind::Vector:
   case OpenACCClauseKind::NoHost:
-  case OpenACCClauseKind::Copy:
   case OpenACCClauseKind::UseDevice:
-  case OpenACCClauseKind::Attach:
   case OpenACCClauseKind::Delete:
   case OpenACCClauseKind::Detach:
   case OpenACCClauseKind::Device:
-  case OpenACCClauseKind::DevicePtr:
   case OpenACCClauseKind::DeviceResident:
-  case OpenACCClauseKind::FirstPrivate:
   case OpenACCClauseKind::Host:
   case OpenACCClauseKind::Link:
-  case OpenACCClauseKind::NoCreate:
-  case OpenACCClauseKind::Present:
-  case OpenACCClauseKind::CopyOut:
-  case OpenACCClauseKind::CopyIn:
-  case OpenACCClauseKind::Create:
   case OpenACCClauseKind::Reduction:
   case OpenACCClauseKind::Collapse:
   case OpenACCClauseKind::Bind:
@@ -11866,10 +11928,8 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() {
   case OpenACCClauseKind::DefaultAsync:
   case OpenACCClauseKind::DeviceType:
   case OpenACCClauseKind::DType:
-  case OpenACCClauseKind::Async:
   case OpenACCClauseKind::Tile:
   case OpenACCClauseKind::Gang:
-  case OpenACCClauseKind::Wait:
   case OpenACCClauseKind::Invalid:
     llvm_unreachable("Clause serialization not yet implemented");
   }
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 744f11de88c2..089ede4f4926 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -805,9 +805,7 @@ void ASTDeclReader::VisitEnumDecl(EnumDecl *ED) {
 
   // If this is a definition subject to the ODR, and we already have a
   // definition, merge this one into it.
-  if (ED->isCompleteDefinition() &&
-      Reader.getContext().getLangOpts().Modules &&
-      Reader.getContext().getLangOpts().CPlusPlus) {
+  if (ED->isCompleteDefinition() && Reader.getContext().getLangOpts().Modules) {
     EnumDecl *&OldDef = Reader.EnumDefinitions[ED->getCanonicalDecl()];
     if (!OldDef) {
       // This is the first time we've seen an imported definition. Look for a
@@ -3249,7 +3247,7 @@ ASTReader::RecordLocation ASTReader::DeclCursorForID(GlobalDeclID ID,
   ModuleFile *M = I->second;
   const DeclOffset &DOffs =
       M->DeclOffsets[ID.get() - M->BaseDeclID - NUM_PREDEF_DECL_IDS];
-  Loc = TranslateSourceLocation(*M, DOffs.getLocation());
+  Loc = ReadSourceLocation(*M, DOffs.getRawLoc());
   return RecordLocation(M, DOffs.getBitOffset(M->DeclsBlockStartOffset));
 }
 
@@ -3304,8 +3302,7 @@ DeclContext *ASTDeclReader::getPrimaryContextForMerging(ASTReader &Reader,
     return RD->getDefinition();
 
   if (auto *ED = dyn_cast<EnumDecl>(DC))
-    return ED->getASTContext().getLangOpts().CPlusPlus? ED->getDefinition()
-                                                      : nullptr;
+    return ED->getDefinition();
 
   if (auto *OID = dyn_cast<ObjCInterfaceDecl>(DC))
     return OID->getDefinition();
diff --git a/clang/lib/Serialization/ASTReaderInternals.h b/clang/lib/Serialization/ASTReaderInternals.h
index 49268ad5251d..536b19f91691 100644
--- a/clang/lib/Serialization/ASTReaderInternals.h
+++ b/clang/lib/Serialization/ASTReaderInternals.h
@@ -175,7 +175,7 @@ public:
                      const unsigned char* d,
                      unsigned DataLen);
 
-  IdentID ReadIdentifierID(const unsigned char *d);
+  IdentifierID ReadIdentifierID(const unsigned char *d);
 
   ASTReader &getReader() const { return Reader; }
 };
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 80c7ce643088..6154ead589d3 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -1200,6 +1200,31 @@ ASTFileSignature ASTWriter::createSignatureForNamedModule() const {
   for (auto [ExportImported, _] : WritingModule->Exports)
     Hasher.update(ExportImported->Signature);
 
+  // We combine all the used modules to make sure the signature is precise.
+  // Consider the case like:
+  //
+  // // a.cppm
+  // export module a;
+  // export inline int a() { ... }
+  //
+  // // b.cppm
+  // export module b;
+  // import a;
+  // export inline int b() { return a(); }
+  //
+  // Since both `a()` and `b()` are inline, we need to make sure the BMI of
+  // `b.pcm` will change after the implementation of `a()` changes. We can't
+  // get that naturally since we won't record the body of `a()` during the
+  // writing process. We can't reuse ODRHash here since ODRHash won't calculate
+  // the called function recursively. So ODRHash will be problematic if `a()`
+  // calls other inline functions.
+  //
+  // Probably we can solve this by a new hash mechanism. But the safety and
+  // efficiency may a problem too. Here we just combine the hash value of the
+  // used modules conservatively.
+  for (Module *M : TouchedTopLevelModules)
+    Hasher.update(M->Signature);
+
   return ASTFileSignature::create(Hasher.result());
 }
 
@@ -2729,8 +2754,10 @@ void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec,
 
     uint64_t Offset = Stream.GetCurrentBitNo() - MacroOffsetsBase;
     assert((Offset >> 32) == 0 && "Preprocessed entity offset too large");
-    PreprocessedEntityOffsets.push_back(
-        PPEntityOffset(getAdjustedRange((*E)->getSourceRange()), Offset));
+    SourceRange R = getAdjustedRange((*E)->getSourceRange());
+    PreprocessedEntityOffsets.emplace_back(
+        getRawSourceLocationEncoding(R.getBegin()),
+        getRawSourceLocationEncoding(R.getEnd()), Offset);
 
     if (auto *MD = dyn_cast<MacroDefinitionRecord>(*E)) {
       // Record this macro definition's ID.
@@ -2797,7 +2824,9 @@ void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec,
     std::vector<PPSkippedRange> SerializedSkippedRanges;
     SerializedSkippedRanges.reserve(SkippedRanges.size());
     for (auto const& Range : SkippedRanges)
-      SerializedSkippedRanges.emplace_back(Range);
+      SerializedSkippedRanges.emplace_back(
+          getRawSourceLocationEncoding(Range.getBegin()),
+          getRawSourceLocationEncoding(Range.getEnd()));
 
     using namespace llvm;
     auto Abbrev = std::make_shared<BitCodeAbbrev>();
@@ -2963,8 +2992,8 @@ void ASTWriter::WriteSubmodules(Module *WritingModule) {
       ParentID = SubmoduleIDs[Mod->Parent];
     }
 
-    uint64_t DefinitionLoc =
-        SourceLocationEncoding::encode(getAdjustedLocation(Mod->DefinitionLoc));
+    SourceLocationEncoding::RawLocEncoding DefinitionLoc =
+        getRawSourceLocationEncoding(getAdjustedLocation(Mod->DefinitionLoc));
 
     // Emit the definition of the block.
     {
@@ -2988,8 +3017,8 @@ void ASTWriter::WriteSubmodules(Module *WritingModule) {
 
     // Emit the requirements.
     for (const auto &R : Mod->Requirements) {
-      RecordData::value_type Record[] = {SUBMODULE_REQUIRES, R.second};
-      Stream.EmitRecordWithBlob(RequiresAbbrev, Record, R.first);
+      RecordData::value_type Record[] = {SUBMODULE_REQUIRES, R.RequiredState};
+      Stream.EmitRecordWithBlob(RequiresAbbrev, Record, R.FeatureName);
     }
 
     // Emit the umbrella header, if there is one.
@@ -3247,7 +3276,7 @@ void ASTWriter::WriteType(QualType T) {
     TypeOffsets.emplace_back(Offset);
   else if (TypeOffsets.size() < Index) {
     TypeOffsets.resize(Index + 1);
-    TypeOffsets[Index].setBitOffset(Offset);
+    TypeOffsets[Index].set(Offset);
   } else {
     llvm_unreachable("Types emitted in wrong order");
   }
@@ -3717,7 +3746,7 @@ public:
   using key_type = const IdentifierInfo *;
   using key_type_ref = key_type;
 
-  using data_type = IdentID;
+  using data_type = IdentifierID;
   using data_type_ref = data_type;
 
   using hash_value_type = unsigned;
@@ -3746,7 +3775,7 @@ public:
   }
 
   std::pair<unsigned, unsigned>
-  EmitKeyDataLength(raw_ostream &Out, const IdentifierInfo *II, IdentID ID) {
+  EmitKeyDataLength(raw_ostream &Out, const IdentifierInfo *II, IdentifierID ID) {
     // Record the location of the identifier data. This is used when generating
     // the mapping from persistent IDs to strings.
     Writer.SetIdentifierOffset(II, Out.tell());
@@ -3778,7 +3807,7 @@ public:
     Out.write(II->getNameStart(), KeyLen);
   }
 
-  void EmitData(raw_ostream &Out, const IdentifierInfo *II, IdentID ID,
+  void EmitData(raw_ostream &Out, const IdentifierInfo *II, IdentifierID ID,
                 unsigned) {
     using namespace llvm::support;
 
@@ -3862,7 +3891,7 @@ void ASTWriter::WriteIdentifierTable(Preprocessor &PP,
     IdentifierOffsets.resize(NextIdentID - FirstIdentID);
     for (auto IdentIDPair : IdentifierIDs) {
       const IdentifierInfo *II = IdentIDPair.first;
-      IdentID ID = IdentIDPair.second;
+      IdentifierID ID = IdentIDPair.second;
       assert(II && "NULL identifier in identifier table");
 
       // Write out identifiers if either the ID is local or the identifier has
@@ -4750,7 +4779,7 @@ void ASTWriter::AddVersionTuple(const VersionTuple &Version,
 /// Note that the identifier II occurs at the given offset
 /// within the identifier table.
 void ASTWriter::SetIdentifierOffset(const IdentifierInfo *II, uint32_t Offset) {
-  IdentID ID = IdentifierIDs[II];
+  IdentifierID ID = IdentifierIDs[II];
   // Only store offsets new to this AST file. Other identifier names are looked
   // up earlier in the chain and thus don't need an offset.
   if (ID >= FirstIdentID)
@@ -5380,7 +5409,6 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema &SemaRef, StringRef isysroot,
 
         // These values should be unique within a chain, since they will be read
         // as keys into ContinuousRangeMaps.
-        writeBaseIDOrNone(M.SLocEntryBaseOffset, M.LocalNumSLocEntries);
         writeBaseIDOrNone(M.BaseIdentifierID, M.LocalNumIdentifiers);
         writeBaseIDOrNone(M.BaseMacroID, M.LocalNumMacros);
         writeBaseIDOrNone(M.BasePreprocessedEntityID,
@@ -5873,10 +5901,34 @@ void ASTWriter::AddFileID(FileID FID, RecordDataImpl &Record) {
   Record.push_back(getAdjustedFileID(FID).getOpaqueValue());
 }
 
+SourceLocationEncoding::RawLocEncoding
+ASTWriter::getRawSourceLocationEncoding(SourceLocation Loc, LocSeq *Seq) {
+  unsigned BaseOffset = 0;
+  unsigned ModuleFileIndex = 0;
+
+  // See SourceLocationEncoding.h for the encoding details.
+  if (Context->getSourceManager().isLoadedSourceLocation(Loc) &&
+      Loc.isValid()) {
+    assert(getChain());
+    auto SLocMapI = getChain()->GlobalSLocOffsetMap.find(
+        SourceManager::MaxLoadedOffset - Loc.getOffset() - 1);
+    assert(SLocMapI != getChain()->GlobalSLocOffsetMap.end() &&
+           "Corrupted global sloc offset map");
+    ModuleFile *F = SLocMapI->second;
+    BaseOffset = F->SLocEntryBaseOffset - 2;
+    // 0 means the location is not loaded. So we need to add 1 to the index to
+    // make it clear.
+    ModuleFileIndex = F->Index + 1;
+    assert(&getChain()->getModuleManager()[F->Index] == F);
+  }
+
+  return SourceLocationEncoding::encode(Loc, BaseOffset, ModuleFileIndex, Seq);
+}
+
 void ASTWriter::AddSourceLocation(SourceLocation Loc, RecordDataImpl &Record,
                                   SourceLocationSequence *Seq) {
   Loc = getAdjustedLocation(Loc);
-  Record.push_back(SourceLocationEncoding::encode(Loc, Seq));
+  Record.push_back(getRawSourceLocationEncoding(Loc, Seq));
 }
 
 void ASTWriter::AddSourceRange(SourceRange Range, RecordDataImpl &Record,
@@ -5893,11 +5945,11 @@ void ASTWriter::AddIdentifierRef(const IdentifierInfo *II, RecordDataImpl &Recor
   Record.push_back(getIdentifierRef(II));
 }
 
-IdentID ASTWriter::getIdentifierRef(const IdentifierInfo *II) {
+IdentifierID ASTWriter::getIdentifierRef(const IdentifierInfo *II) {
   if (!II)
     return 0;
 
-  IdentID &ID = IdentifierIDs[II];
+  IdentifierID &ID = IdentifierIDs[II];
   if (ID == 0)
     ID = NextIdentID++;
   return ID;
@@ -6085,8 +6137,12 @@ LocalDeclID ASTWriter::GetDeclRef(const Decl *D) {
 
   // If D comes from an AST file, its declaration ID is already known and
   // fixed.
-  if (D->isFromASTFile())
+  if (D->isFromASTFile()) {
+    if (isWritingStdCXXNamedModules() && D->getOwningModule())
+      TouchedTopLevelModules.insert(D->getOwningModule()->getTopLevelModule());
+
     return LocalDeclID(D->getGlobalID());
+  }
 
   assert(!(reinterpret_cast<uintptr_t>(D) & 0x01) && "Invalid decl pointer");
   LocalDeclID &ID = DeclIDs[D];
@@ -6554,9 +6610,9 @@ void ASTWriter::ReaderInitialized(ASTReader *Reader) {
   NextSubmoduleID = FirstSubmoduleID;
 }
 
-void ASTWriter::IdentifierRead(IdentID ID, IdentifierInfo *II) {
+void ASTWriter::IdentifierRead(IdentifierID ID, IdentifierInfo *II) {
   // Always keep the highest ID. See \p TypeRead() for more information.
-  IdentID &StoredID = IdentifierIDs[II];
+  IdentifierID &StoredID = IdentifierIDs[II];
   if (ID > StoredID)
     StoredID = ID;
 }
@@ -7735,6 +7791,12 @@ void ASTRecordWriter::writeOpenACCVarList(const OpenACCClauseWithVarList *C) {
     AddStmt(E);
 }
 
+void ASTRecordWriter::writeOpenACCIntExprList(ArrayRef<Expr *> Exprs) {
+  writeUInt32(Exprs.size());
+  for (Expr *E : Exprs)
+    AddStmt(E);
+}
+
 void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
   writeEnum(C->getClauseKind());
   writeSourceLocation(C->getBeginLoc());
@@ -7754,7 +7816,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
     return;
   }
   case OpenACCClauseKind::Self: {
-    const auto *SC = cast<OpenACCIfClause>(C);
+    const auto *SC = cast<OpenACCSelfClause>(C);
     writeSourceLocation(SC->getLParenLoc());
     writeBool(SC->hasConditionExpr());
     if (SC->hasConditionExpr())
@@ -7787,6 +7849,91 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
     writeOpenACCVarList(PC);
     return;
   }
+  case OpenACCClauseKind::FirstPrivate: {
+    const auto *FPC = cast<OpenACCFirstPrivateClause>(C);
+    writeSourceLocation(FPC->getLParenLoc());
+    writeOpenACCVarList(FPC);
+    return;
+  }
+  case OpenACCClauseKind::Attach: {
+    const auto *AC = cast<OpenACCAttachClause>(C);
+    writeSourceLocation(AC->getLParenLoc());
+    writeOpenACCVarList(AC);
+    return;
+  }
+  case OpenACCClauseKind::DevicePtr: {
+    const auto *DPC = cast<OpenACCDevicePtrClause>(C);
+    writeSourceLocation(DPC->getLParenLoc());
+    writeOpenACCVarList(DPC);
+    return;
+  }
+  case OpenACCClauseKind::NoCreate: {
+    const auto *NCC = cast<OpenACCNoCreateClause>(C);
+    writeSourceLocation(NCC->getLParenLoc());
+    writeOpenACCVarList(NCC);
+    return;
+  }
+  case OpenACCClauseKind::Present: {
+    const auto *PC = cast<OpenACCPresentClause>(C);
+    writeSourceLocation(PC->getLParenLoc());
+    writeOpenACCVarList(PC);
+    return;
+  }
+  case OpenACCClauseKind::Copy:
+  case OpenACCClauseKind::PCopy:
+  case OpenACCClauseKind::PresentOrCopy: {
+    const auto *CC = cast<OpenACCCopyClause>(C);
+    writeSourceLocation(CC->getLParenLoc());
+    writeOpenACCVarList(CC);
+    return;
+  }
+  case OpenACCClauseKind::CopyIn:
+  case OpenACCClauseKind::PCopyIn:
+  case OpenACCClauseKind::PresentOrCopyIn: {
+    const auto *CIC = cast<OpenACCCopyInClause>(C);
+    writeSourceLocation(CIC->getLParenLoc());
+    writeBool(CIC->isReadOnly());
+    writeOpenACCVarList(CIC);
+    return;
+  }
+  case OpenACCClauseKind::CopyOut:
+  case OpenACCClauseKind::PCopyOut:
+  case OpenACCClauseKind::PresentOrCopyOut: {
+    const auto *COC = cast<OpenACCCopyOutClause>(C);
+    writeSourceLocation(COC->getLParenLoc());
+    writeBool(COC->isZero());
+    writeOpenACCVarList(COC);
+    return;
+  }
+  case OpenACCClauseKind::Create:
+  case OpenACCClauseKind::PCreate:
+  case OpenACCClauseKind::PresentOrCreate: {
+    const auto *CC = cast<OpenACCCreateClause>(C);
+    writeSourceLocation(CC->getLParenLoc());
+    writeBool(CC->isZero());
+    writeOpenACCVarList(CC);
+    return;
+  }
+  case OpenACCClauseKind::Async: {
+    const auto *AC = cast<OpenACCAsyncClause>(C);
+    writeSourceLocation(AC->getLParenLoc());
+    writeBool(AC->hasIntExpr());
+    if (AC->hasIntExpr())
+      AddStmt(const_cast<Expr *>(AC->getIntExpr()));
+    return;
+  }
+  case OpenACCClauseKind::Wait: {
+    const auto *WC = cast<OpenACCWaitClause>(C);
+    writeSourceLocation(WC->getLParenLoc());
+    writeBool(WC->getDevNumExpr());
+    if (const Expr *DNE = WC->getDevNumExpr())
+      AddStmt(const_cast<Expr *>(DNE));
+    writeSourceLocation(WC->getQueuesLoc());
+
+    writeOpenACCIntExprList(WC->getQueueIdExprs());
+    return;
+  }
+
   case OpenACCClauseKind::Finalize:
   case OpenACCClauseKind::IfPresent:
   case OpenACCClauseKind::Seq:
@@ -7795,22 +7942,13 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
   case OpenACCClauseKind::Worker:
   case OpenACCClauseKind::Vector:
   case OpenACCClauseKind::NoHost:
-  case OpenACCClauseKind::Copy:
   case OpenACCClauseKind::UseDevice:
-  case OpenACCClauseKind::Attach:
   case OpenACCClauseKind::Delete:
   case OpenACCClauseKind::Detach:
   case OpenACCClauseKind::Device:
-  case OpenACCClauseKind::DevicePtr:
   case OpenACCClauseKind::DeviceResident:
-  case OpenACCClauseKind::FirstPrivate:
   case OpenACCClauseKind::Host:
   case OpenACCClauseKind::Link:
-  case OpenACCClauseKind::NoCreate:
-  case OpenACCClauseKind::Present:
-  case OpenACCClauseKind::CopyOut:
-  case OpenACCClauseKind::CopyIn:
-  case OpenACCClauseKind::Create:
   case OpenACCClauseKind::Reduction:
   case OpenACCClauseKind::Collapse:
   case OpenACCClauseKind::Bind:
@@ -7818,10 +7956,8 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
   case OpenACCClauseKind::DefaultAsync:
   case OpenACCClauseKind::DeviceType:
   case OpenACCClauseKind::DType:
-  case OpenACCClauseKind::Async:
   case OpenACCClauseKind::Tile:
   case OpenACCClauseKind::Gang:
-  case OpenACCClauseKind::Wait:
   case OpenACCClauseKind::Invalid:
     llvm_unreachable("Clause serialization not yet implemented");
   }
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index 0edc4feda3ef..6201d284f0e0 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -2809,14 +2809,16 @@ void ASTWriter::WriteDecl(ASTContext &Context, Decl *D) {
 
   // Record the offset for this declaration
   SourceLocation Loc = D->getLocation();
+  SourceLocationEncoding::RawLocEncoding RawLoc =
+      getRawSourceLocationEncoding(getAdjustedLocation(Loc));
+
   unsigned Index = ID.get() - FirstDeclID.get();
   if (DeclOffsets.size() == Index)
-    DeclOffsets.emplace_back(getAdjustedLocation(Loc), Offset,
-                             DeclTypesBlockStartOffset);
+    DeclOffsets.emplace_back(RawLoc, Offset, DeclTypesBlockStartOffset);
   else if (DeclOffsets.size() < Index) {
     // FIXME: Can/should this happen?
     DeclOffsets.resize(Index+1);
-    DeclOffsets[Index].setLocation(getAdjustedLocation(Loc));
+    DeclOffsets[Index].setRawLoc(RawLoc);
     DeclOffsets[Index].setBitOffset(Offset, DeclTypesBlockStartOffset);
   } else {
     llvm_unreachable("declarations should be emitted in ID order");
diff --git a/clang/lib/Serialization/ModuleFile.cpp b/clang/lib/Serialization/ModuleFile.cpp
index db896fd36115..2c42d33a8f5d 100644
--- a/clang/lib/Serialization/ModuleFile.cpp
+++ b/clang/lib/Serialization/ModuleFile.cpp
@@ -59,7 +59,6 @@ LLVM_DUMP_METHOD void ModuleFile::dump() {
   // Remapping tables.
   llvm::errs() << "  Base source location offset: " << SLocEntryBaseOffset
                << '\n';
-  dumpLocalRemap("Source location offset local -> global map", SLocRemap);
 
   llvm::errs() << "  Base identifier ID: " << BaseIdentifierID << '\n'
                << "  Number of identifiers: " << LocalNumIdentifiers << '\n';
diff --git a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
index e4373915410f..e138debd1361 100644
--- a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
@@ -148,27 +148,28 @@ using MutexDescriptor =
 class BlockInCriticalSectionChecker : public Checker<check::PostCall> {
 private:
   const std::array<MutexDescriptor, 8> MutexDescriptors{
-      MemberMutexDescriptor(
-          CallDescription(/*QualifiedName=*/{"std", "mutex", "lock"},
-                          /*RequiredArgs=*/0),
-          CallDescription({"std", "mutex", "unlock"}, 0)),
-      FirstArgMutexDescriptor(CallDescription({"pthread_mutex_lock"}, 1),
-                              CallDescription({"pthread_mutex_unlock"}, 1)),
-      FirstArgMutexDescriptor(CallDescription({"mtx_lock"}, 1),
-                              CallDescription({"mtx_unlock"}, 1)),
-      FirstArgMutexDescriptor(CallDescription({"pthread_mutex_trylock"}, 1),
-                              CallDescription({"pthread_mutex_unlock"}, 1)),
-      FirstArgMutexDescriptor(CallDescription({"mtx_trylock"}, 1),
-                              CallDescription({"mtx_unlock"}, 1)),
-      FirstArgMutexDescriptor(CallDescription({"mtx_timedlock"}, 1),
-                              CallDescription({"mtx_unlock"}, 1)),
+      MemberMutexDescriptor({/*MatchAs=*/CDM::CXXMethod,
+                             /*QualifiedName=*/{"std", "mutex", "lock"},
+                             /*RequiredArgs=*/0},
+                            {CDM::CXXMethod, {"std", "mutex", "unlock"}, 0}),
+      FirstArgMutexDescriptor({CDM::CLibrary, {"pthread_mutex_lock"}, 1},
+                              {CDM::CLibrary, {"pthread_mutex_unlock"}, 1}),
+      FirstArgMutexDescriptor({CDM::CLibrary, {"mtx_lock"}, 1},
+                              {CDM::CLibrary, {"mtx_unlock"}, 1}),
+      FirstArgMutexDescriptor({CDM::CLibrary, {"pthread_mutex_trylock"}, 1},
+                              {CDM::CLibrary, {"pthread_mutex_unlock"}, 1}),
+      FirstArgMutexDescriptor({CDM::CLibrary, {"mtx_trylock"}, 1},
+                              {CDM::CLibrary, {"mtx_unlock"}, 1}),
+      FirstArgMutexDescriptor({CDM::CLibrary, {"mtx_timedlock"}, 1},
+                              {CDM::CLibrary, {"mtx_unlock"}, 1}),
       RAIIMutexDescriptor("lock_guard"),
       RAIIMutexDescriptor("unique_lock")};
 
-  const std::array<CallDescription, 5> BlockingFunctions{
-      ArrayRef{StringRef{"sleep"}}, ArrayRef{StringRef{"getc"}},
-      ArrayRef{StringRef{"fgets"}}, ArrayRef{StringRef{"read"}},
-      ArrayRef{StringRef{"recv"}}};
+  const CallDescriptionSet BlockingFunctions{{CDM::CLibrary, {"sleep"}},
+                                             {CDM::CLibrary, {"getc"}},
+                                             {CDM::CLibrary, {"fgets"}},
+                                             {CDM::CLibrary, {"read"}},
+                                             {CDM::CLibrary, {"recv"}}};
 
   const BugType BlockInCritSectionBugType{
       this, "Call to blocking function in critical section", "Blocking Error"};
@@ -291,8 +292,7 @@ void BlockInCriticalSectionChecker::handleUnlock(
 
 bool BlockInCriticalSectionChecker::isBlockingInCritSection(
     const CallEvent &Call, CheckerContext &C) const {
-  return llvm::any_of(BlockingFunctions,
-                      [&Call](auto &&Fn) { return Fn.matches(Call); }) &&
+  return BlockingFunctions.contains(Call) &&
          !C.getState()->get<ActiveCritSections>().isEmpty();
 }
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
index 63844563de44..238e87a712a4 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
@@ -189,8 +189,8 @@ public:
   };
 
   // These require a bit of special handling.
-  CallDescription StdCopy{{"std", "copy"}, 3},
-      StdCopyBackward{{"std", "copy_backward"}, 3};
+  CallDescription StdCopy{CDM::SimpleFunc, {"std", "copy"}, 3},
+      StdCopyBackward{CDM::SimpleFunc, {"std", "copy_backward"}, 3};
 
   FnCheck identifyCall(const CallEvent &Call, CheckerContext &C) const;
   void evalMemcpy(CheckerContext &C, const CallEvent &Call, CharKind CK) const;
@@ -1338,6 +1338,9 @@ void CStringChecker::evalCopyCommon(CheckerContext &C, const CallEvent &Call,
 
   // If the size can be nonzero, we have to check the other arguments.
   if (stateNonZeroSize) {
+    // TODO: If Size is tainted and we cannot prove that it is smaller or equal
+    // to the size of the destination buffer, then emit a warning
+    // that an attacker may provoke a buffer overflow error.
     state = stateNonZeroSize;
 
     // Ensure the destination is not null. If it is NULL there will be a
diff --git a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
index 1cebfbbee77d..0355eede75ea 100644
--- a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
@@ -31,11 +31,13 @@ class DereferenceChecker
     : public Checker< check::Location,
                       check::Bind,
                       EventDispatcher<ImplicitNullDerefEvent> > {
-  enum DerefKind { NullPointer, UndefinedPointerValue };
+  enum DerefKind { NullPointer, UndefinedPointerValue, AddressOfLabel };
 
   BugType BT_Null{this, "Dereference of null pointer", categories::LogicError};
   BugType BT_Undef{this, "Dereference of undefined pointer value",
                    categories::LogicError};
+  BugType BT_Label{this, "Dereference of the address of a label",
+                   categories::LogicError};
 
   void reportBug(DerefKind K, ProgramStateRef State, const Stmt *S,
                  CheckerContext &C) const;
@@ -167,6 +169,11 @@ void DereferenceChecker::reportBug(DerefKind K, ProgramStateRef State,
     DerefStr1 = " results in an undefined pointer dereference";
     DerefStr2 = " results in a dereference of an undefined pointer value";
     break;
+  case DerefKind::AddressOfLabel:
+    BT = &BT_Label;
+    DerefStr1 = " results in an undefined pointer dereference";
+    DerefStr2 = " results in a dereference of an address of a label";
+    break;
   };
 
   // Generate an error node.
@@ -287,6 +294,12 @@ void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S,
   if (V.isUndef())
     return;
 
+  // One should never write to label addresses.
+  if (auto Label = L.getAs<loc::GotoLabel>()) {
+    reportBug(DerefKind::AddressOfLabel, C.getState(), S, C);
+    return;
+  }
+
   const MemRegion *MR = L.getAsRegion();
   const TypedValueRegion *TVR = dyn_cast_or_null<TypedValueRegion>(MR);
   if (!TVR)
diff --git a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
index 89054512d65a..a0190c30bfd2 100644
--- a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -59,13 +59,6 @@ constexpr llvm::StringLiteral MsgSanitizeSystemArgs =
     "Untrusted data is passed to a system call "
     "(CERT/STR02-C. Sanitize data passed to complex subsystems)";
 
-/// Check if tainted data is used as a buffer size in strn.. functions,
-/// and allocators.
-constexpr llvm::StringLiteral MsgTaintedBufferSize =
-    "Untrusted data is used to specify the buffer size "
-    "(CERT/STR31-C. Guarantee that storage for strings has sufficient space "
-    "for character data and the null terminator)";
-
 /// Check if tainted data is used as a custom sink's parameter.
 constexpr llvm::StringLiteral MsgCustomSink =
     "Untrusted data is passed to a user-defined sink";
@@ -298,14 +291,6 @@ public:
     return {{}, {}, std::move(SrcArgs), std::move(DstArgs)};
   }
 
-  /// Make a rule that taints all PropDstArgs if any of PropSrcArgs is tainted.
-  static GenericTaintRule
-  SinkProp(ArgSet &&SinkArgs, ArgSet &&SrcArgs, ArgSet &&DstArgs,
-           std::optional<StringRef> Msg = std::nullopt) {
-    return {
-        std::move(SinkArgs), {}, std::move(SrcArgs), std::move(DstArgs), Msg};
-  }
-
   /// Process a function which could either be a taint source, a taint sink, a
   /// taint filter or a taint propagator.
   void process(const GenericTaintChecker &Checker, const CallEvent &Call,
@@ -733,12 +718,21 @@ void GenericTaintChecker::initTaintRules(CheckerContext &C) const {
       {{CDM::CLibraryMaybeHardened, {{"stpcpy"}}},
        TR::Prop({{1}}, {{0, ReturnValueIndex}})},
       {{CDM::CLibraryMaybeHardened, {{"strcat"}}},
-       TR::Prop({{1}}, {{0, ReturnValueIndex}})},
+       TR::Prop({{0, 1}}, {{0, ReturnValueIndex}})},
       {{CDM::CLibraryMaybeHardened, {{"wcsncat"}}},
-       TR::Prop({{1}}, {{0, ReturnValueIndex}})},
+       TR::Prop({{0, 1}}, {{0, ReturnValueIndex}})},
       {{CDM::CLibrary, {{"strdup"}}}, TR::Prop({{0}}, {{ReturnValueIndex}})},
       {{CDM::CLibrary, {{"strdupa"}}}, TR::Prop({{0}}, {{ReturnValueIndex}})},
       {{CDM::CLibrary, {{"wcsdup"}}}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+      {{CDM::CLibrary, BI.getName(Builtin::BImemcpy)},
+       TR::Prop({{1, 2}}, {{0, ReturnValueIndex}})},
+      {{CDM::CLibrary, {BI.getName(Builtin::BImemmove)}},
+       TR::Prop({{1, 2}}, {{0, ReturnValueIndex}})},
+      {{CDM::CLibrary, {BI.getName(Builtin::BIstrncpy)}},
+       TR::Prop({{1, 2}}, {{0, ReturnValueIndex}})},
+      {{CDM::CLibrary, {BI.getName(Builtin::BIstrndup)}},
+       TR::Prop({{0, 1}}, {{ReturnValueIndex}})},
+      {{CDM::CLibrary, {"bcopy"}}, TR::Prop({{0, 2}}, {{1}})},
 
       // Sinks
       {{{"system"}}, TR::Sink({{0}}, MsgSanitizeSystemArgs)},
@@ -752,30 +746,15 @@ void GenericTaintChecker::initTaintRules(CheckerContext &C) const {
       {{{"execvp"}}, TR::Sink({{0, 1}}, MsgSanitizeSystemArgs)},
       {{{"execvpe"}}, TR::Sink({{0, 1, 2}}, MsgSanitizeSystemArgs)},
       {{{"dlopen"}}, TR::Sink({{0}}, MsgSanitizeSystemArgs)},
-      {{CDM::CLibrary, {{"malloc"}}}, TR::Sink({{0}}, MsgTaintedBufferSize)},
-      {{CDM::CLibrary, {{"calloc"}}}, TR::Sink({{0}}, MsgTaintedBufferSize)},
-      {{CDM::CLibrary, {{"alloca"}}}, TR::Sink({{0}}, MsgTaintedBufferSize)},
-      {{CDM::CLibrary, {{"memccpy"}}}, TR::Sink({{3}}, MsgTaintedBufferSize)},
-      {{CDM::CLibrary, {{"realloc"}}}, TR::Sink({{1}}, MsgTaintedBufferSize)},
+
+      // malloc, calloc, alloca, realloc, memccpy
+      // are intentionally not marked as taint sinks because unconditional
+      // reporting for these functions generates many false positives.
+      // These taint sinks should be implemented in other checkers with more
+      // sophisticated sanitation heuristics.
       {{{{"setproctitle"}}}, TR::Sink({{0}, 1}, MsgUncontrolledFormatString)},
       {{{{"setproctitle_fast"}}},
-       TR::Sink({{0}, 1}, MsgUncontrolledFormatString)},
-
-      // SinkProps
-      {{CDM::CLibraryMaybeHardened, BI.getName(Builtin::BImemcpy)},
-       TR::SinkProp({{2}}, {{1, 2}}, {{0, ReturnValueIndex}},
-                    MsgTaintedBufferSize)},
-      {{CDM::CLibraryMaybeHardened, {BI.getName(Builtin::BImemmove)}},
-       TR::SinkProp({{2}}, {{1, 2}}, {{0, ReturnValueIndex}},
-                    MsgTaintedBufferSize)},
-      {{CDM::CLibraryMaybeHardened, {BI.getName(Builtin::BIstrncpy)}},
-       TR::SinkProp({{2}}, {{1, 2}}, {{0, ReturnValueIndex}},
-                    MsgTaintedBufferSize)},
-      {{CDM::CLibrary, {BI.getName(Builtin::BIstrndup)}},
-       TR::SinkProp({{1}}, {{0, 1}}, {{ReturnValueIndex}},
-                    MsgTaintedBufferSize)},
-      {{CDM::CLibrary, {{"bcopy"}}},
-       TR::SinkProp({{2}}, {{0, 2}}, {{1}}, MsgTaintedBufferSize)}};
+       TR::Sink({{0}, 1}, MsgUncontrolledFormatString)}};
 
   // `getenv` returns taint only in untrusted environments.
   if (TR::UntrustedEnv(C)) {
@@ -1086,15 +1065,14 @@ void GenericTaintChecker::taintUnsafeSocketProtocol(const CallEvent &Call,
   const IdentifierInfo *ID = Call.getCalleeIdentifier();
   if (!ID)
     return;
-  if (!ID->getName().equals("socket"))
+  if (ID->getName() != "socket")
     return;
 
   SourceLocation DomLoc = Call.getArgExpr(0)->getExprLoc();
   StringRef DomName = C.getMacroNameOrSpelling(DomLoc);
   // Allow internal communication protocols.
-  bool SafeProtocol = DomName.equals("AF_SYSTEM") ||
-                      DomName.equals("AF_LOCAL") || DomName.equals("AF_UNIX") ||
-                      DomName.equals("AF_RESERVED_36");
+  bool SafeProtocol = DomName == "AF_SYSTEM" || DomName == "AF_LOCAL" ||
+                      DomName == "AF_UNIX" || DomName == "AF_RESERVED_36";
   if (SafeProtocol)
     return;
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/InnerPointerChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/InnerPointerChecker.cpp
index b673b51c4623..261db2b2a704 100644
--- a/clang/lib/StaticAnalyzer/Checkers/InnerPointerChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/InnerPointerChecker.cpp
@@ -35,9 +35,28 @@ namespace {
 class InnerPointerChecker
     : public Checker<check::DeadSymbols, check::PostCall> {
 
-  CallDescription AppendFn, AssignFn, AddressofFn, AddressofFn_, ClearFn,
-      CStrFn, DataFn, DataMemberFn, EraseFn, InsertFn, PopBackFn, PushBackFn,
-      ReplaceFn, ReserveFn, ResizeFn, ShrinkToFitFn, SwapFn;
+  CallDescriptionSet InvalidatingMemberFunctions{
+      CallDescription(CDM::CXXMethod, {"std", "basic_string", "append"}),
+      CallDescription(CDM::CXXMethod, {"std", "basic_string", "assign"}),
+      CallDescription(CDM::CXXMethod, {"std", "basic_string", "clear"}),
+      CallDescription(CDM::CXXMethod, {"std", "basic_string", "erase"}),
+      CallDescription(CDM::CXXMethod, {"std", "basic_string", "insert"}),
+      CallDescription(CDM::CXXMethod, {"std", "basic_string", "pop_back"}),
+      CallDescription(CDM::CXXMethod, {"std", "basic_string", "push_back"}),
+      CallDescription(CDM::CXXMethod, {"std", "basic_string", "replace"}),
+      CallDescription(CDM::CXXMethod, {"std", "basic_string", "reserve"}),
+      CallDescription(CDM::CXXMethod, {"std", "basic_string", "resize"}),
+      CallDescription(CDM::CXXMethod, {"std", "basic_string", "shrink_to_fit"}),
+      CallDescription(CDM::CXXMethod, {"std", "basic_string", "swap"})};
+
+  CallDescriptionSet AddressofFunctions{
+      CallDescription(CDM::SimpleFunc, {"std", "addressof"}),
+      CallDescription(CDM::SimpleFunc, {"std", "__addressof"})};
+
+  CallDescriptionSet InnerPointerAccessFunctions{
+      CallDescription(CDM::CXXMethod, {"std", "basic_string", "c_str"}),
+      CallDescription(CDM::SimpleFunc, {"std", "data"}, 1),
+      CallDescription(CDM::CXXMethod, {"std", "basic_string", "data"})};
 
 public:
   class InnerPointerBRVisitor : public BugReporterVisitor {
@@ -71,30 +90,10 @@ public:
     }
   };
 
-  InnerPointerChecker()
-      : AppendFn({"std", "basic_string", "append"}),
-        AssignFn({"std", "basic_string", "assign"}),
-        AddressofFn({"std", "addressof"}), AddressofFn_({"std", "__addressof"}),
-        ClearFn({"std", "basic_string", "clear"}),
-        CStrFn({"std", "basic_string", "c_str"}), DataFn({"std", "data"}, 1),
-        DataMemberFn({"std", "basic_string", "data"}),
-        EraseFn({"std", "basic_string", "erase"}),
-        InsertFn({"std", "basic_string", "insert"}),
-        PopBackFn({"std", "basic_string", "pop_back"}),
-        PushBackFn({"std", "basic_string", "push_back"}),
-        ReplaceFn({"std", "basic_string", "replace"}),
-        ReserveFn({"std", "basic_string", "reserve"}),
-        ResizeFn({"std", "basic_string", "resize"}),
-        ShrinkToFitFn({"std", "basic_string", "shrink_to_fit"}),
-        SwapFn({"std", "basic_string", "swap"}) {}
-
   /// Check whether the called member function potentially invalidates
   /// pointers referring to the container object's inner buffer.
   bool isInvalidatingMemberFunction(const CallEvent &Call) const;
 
-  /// Check whether the called function returns a raw inner pointer.
-  bool isInnerPointerAccessFunction(const CallEvent &Call) const;
-
   /// Mark pointer symbols associated with the given memory region released
   /// in the program state.
   void markPtrSymbolsReleased(const CallEvent &Call, ProgramStateRef State,
@@ -127,14 +126,7 @@ bool InnerPointerChecker::isInvalidatingMemberFunction(
     return false;
   }
   return isa<CXXDestructorCall>(Call) ||
-         matchesAny(Call, AppendFn, AssignFn, ClearFn, EraseFn, InsertFn,
-                    PopBackFn, PushBackFn, ReplaceFn, ReserveFn, ResizeFn,
-                    ShrinkToFitFn, SwapFn);
-}
-
-bool InnerPointerChecker::isInnerPointerAccessFunction(
-    const CallEvent &Call) const {
-  return matchesAny(Call, CStrFn, DataFn, DataMemberFn);
+         InvalidatingMemberFunctions.contains(Call);
 }
 
 void InnerPointerChecker::markPtrSymbolsReleased(const CallEvent &Call,
@@ -181,7 +173,7 @@ void InnerPointerChecker::checkFunctionArguments(const CallEvent &Call,
 
       // std::addressof functions accepts a non-const reference as an argument,
       // but doesn't modify it.
-      if (matchesAny(Call, AddressofFn, AddressofFn_))
+      if (AddressofFunctions.contains(Call))
         continue;
 
       markPtrSymbolsReleased(Call, State, ArgRegion, C);
@@ -221,7 +213,7 @@ void InnerPointerChecker::checkPostCall(const CallEvent &Call,
     }
   }
 
-  if (isInnerPointerAccessFunction(Call)) {
+  if (InnerPointerAccessFunctions.contains(Call)) {
 
     if (isa<SimpleFunctionCall>(Call)) {
       // NOTE: As of now, we only have one free access function: std::data.
diff --git a/clang/lib/StaticAnalyzer/Checkers/LLVMConventionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/LLVMConventionsChecker.cpp
index fa51aa80216b..1cb3848cfed2 100644
--- a/clang/lib/StaticAnalyzer/Checkers/LLVMConventionsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/LLVMConventionsChecker.cpp
@@ -41,7 +41,7 @@ static bool InNamespace(const Decl *D, StringRef NS) {
   if (!ND)
     return false;
   const IdentifierInfo *II = ND->getIdentifier();
-  if (!II || !II->getName().equals(NS))
+  if (!II || II->getName() != NS)
     return false;
   return isa<TranslationUnitDecl>(ND->getDeclContext());
 }
diff --git a/clang/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp
index 882eb0236a18..f524c4c067c8 100644
--- a/clang/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp
@@ -1159,7 +1159,7 @@ void EmptyLocalizationContextChecker::MethodCrawler::VisitObjCMessageExpr(
   }
 
   if (isAnyIdentifier(Result.getKind())) {
-    if (Result.getRawIdentifier().equals("nil")) {
+    if (Result.getRawIdentifier() == "nil") {
       reportEmptyContextError(ME);
       return;
     }
diff --git a/clang/lib/StaticAnalyzer/Checkers/MIGChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MIGChecker.cpp
index 153a0a51e980..9757a00f1fb2 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MIGChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MIGChecker.cpp
@@ -46,13 +46,13 @@ class MIGChecker : public Checker<check::PostCall, check::PreStmt<ReturnStmt>,
   // additionally an argument of a MIG routine, the checker keeps track of that
   // information and issues a warning when an error is returned from the
   // respective routine.
-  std::vector<std::pair<CallDescription, unsigned>> Deallocators = {
+  CallDescriptionMap<unsigned> Deallocators = {
 #define CALL(required_args, deallocated_arg, ...)                              \
-  {{{__VA_ARGS__}, required_args}, deallocated_arg}
-      // E.g., if the checker sees a C function 'vm_deallocate' that is
-      // defined on class 'IOUserClient' that has exactly 3 parameters, it knows
-      // that argument #1 (starting from 0, i.e. the second argument) is going
-      // to be consumed in the sense of the MIG consume-on-success convention.
+  {{CDM::SimpleFunc, {__VA_ARGS__}, required_args}, deallocated_arg}
+      // E.g., if the checker sees a C function 'vm_deallocate' that has
+      // exactly 3 parameters, it knows that argument #1 (starting from 0, i.e.
+      // the second argument) is going to be consumed in the sense of the MIG
+      // consume-on-success convention.
       CALL(3, 1, "vm_deallocate"),
       CALL(3, 1, "mach_vm_deallocate"),
       CALL(2, 0, "mig_deallocate"),
@@ -78,6 +78,9 @@ class MIGChecker : public Checker<check::PostCall, check::PreStmt<ReturnStmt>,
       CALL(1, 0, "thread_inspect_deallocate"),
       CALL(1, 0, "upl_deallocate"),
       CALL(1, 0, "vm_map_deallocate"),
+#undef CALL
+#define CALL(required_args, deallocated_arg, ...)                              \
+  {{CDM::CXXMethod, {__VA_ARGS__}, required_args}, deallocated_arg}
       // E.g., if the checker sees a method 'releaseAsyncReference64()' that is
       // defined on class 'IOUserClient' that takes exactly 1 argument, it knows
       // that the argument is going to be consumed in the sense of the MIG
@@ -87,7 +90,7 @@ class MIGChecker : public Checker<check::PostCall, check::PreStmt<ReturnStmt>,
 #undef CALL
   };
 
-  CallDescription OsRefRetain{{"os_ref_retain"}, 1};
+  CallDescription OsRefRetain{CDM::SimpleFunc, {"os_ref_retain"}, 1};
 
   void checkReturnAux(const ReturnStmt *RS, CheckerContext &C) const;
 
@@ -198,15 +201,12 @@ void MIGChecker::checkPostCall(const CallEvent &Call, CheckerContext &C) const {
   if (!isInMIGCall(C))
     return;
 
-  auto I = llvm::find_if(Deallocators,
-                         [&](const std::pair<CallDescription, unsigned> &Item) {
-                           return Item.first.matches(Call);
-                         });
-  if (I == Deallocators.end())
+  const unsigned *ArgIdxPtr = Deallocators.lookup(Call);
+  if (!ArgIdxPtr)
     return;
 
   ProgramStateRef State = C.getState();
-  unsigned ArgIdx = I->second;
+  unsigned ArgIdx = *ArgIdxPtr;
   SVal Arg = Call.getArgSVal(ArgIdx);
   const ParmVarDecl *PVD = getOriginParam(Arg, C);
   if (!PVD || State->contains<RefCountedParameters>(PVD))
diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
index 11651fd491f7..34af7fb131f5 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
@@ -1727,7 +1727,7 @@ static std::optional<bool> getFreeWhenDoneArg(const ObjCMethodCall &Call) {
 
   // FIXME: We should not rely on fully-constrained symbols being folded.
   for (unsigned i = 1; i < S.getNumArgs(); ++i)
-    if (S.getNameForSlot(i).equals("freeWhenDone"))
+    if (S.getNameForSlot(i) == "freeWhenDone")
       return !Call.getArgSVal(i).isZeroConstant();
 
   return std::nullopt;
@@ -1819,6 +1819,10 @@ ProgramStateRef MallocChecker::MallocMemAux(CheckerContext &C,
   if (Size.isUndef())
     Size = UnknownVal();
 
+  // TODO: If Size is tainted and we cannot prove that it is within
+  // reasonable bounds, emit a warning that an attacker may
+  // provoke a memory exhaustion error.
+
   // Set the region's extent.
   State = setDynamicExtent(State, RetVal.getAsRegion(),
                            Size.castAs<DefinedOrUnknownSVal>(), SVB);
@@ -3251,7 +3255,7 @@ bool MallocChecker::mayFreeAnyEscapedMemoryOrIsModeledExplicitly(
     if (FirstSlot.starts_with("addPointer") ||
         FirstSlot.starts_with("insertPointer") ||
         FirstSlot.starts_with("replacePointer") ||
-        FirstSlot.equals("valueWithPointer")) {
+        FirstSlot == "valueWithPointer") {
       return true;
     }
 
@@ -3447,7 +3451,7 @@ static bool isReferenceCountingPointerDestructor(const CXXDestructorDecl *DD) {
     if (N.contains_insensitive("ptr") || N.contains_insensitive("pointer")) {
       if (N.contains_insensitive("ref") || N.contains_insensitive("cnt") ||
           N.contains_insensitive("intrusive") ||
-          N.contains_insensitive("shared")) {
+          N.contains_insensitive("shared") || N.ends_with_insensitive("rc")) {
         return true;
       }
     }
@@ -3479,13 +3483,24 @@ PathDiagnosticPieceRef MallocBugVisitor::VisitNode(const ExplodedNode *N,
   // original reference count is positive, we should not report use-after-frees
   // on objects deleted in such destructors. This can probably be improved
   // through better shared pointer modeling.
-  if (ReleaseDestructorLC) {
+  if (ReleaseDestructorLC && (ReleaseDestructorLC == CurrentLC ||
+                              ReleaseDestructorLC->isParentOf(CurrentLC))) {
     if (const auto *AE = dyn_cast<AtomicExpr>(S)) {
+      // Check for manual use of atomic builtins.
       AtomicExpr::AtomicOp Op = AE->getOp();
       if (Op == AtomicExpr::AO__c11_atomic_fetch_add ||
           Op == AtomicExpr::AO__c11_atomic_fetch_sub) {
-        if (ReleaseDestructorLC == CurrentLC ||
-            ReleaseDestructorLC->isParentOf(CurrentLC)) {
+        BR.markInvalid(getTag(), S);
+      }
+    } else if (const auto *CE = dyn_cast<CallExpr>(S)) {
+      // Check for `std::atomic` and such. This covers both regular method calls
+      // and operator calls.
+      if (const auto *MD =
+              dyn_cast_or_null<CXXMethodDecl>(CE->getDirectCallee())) {
+        const CXXRecordDecl *RD = MD->getParent();
+        // A bit wobbly with ".contains()" because it may be like
+        // "__atomic_base" or something.
+        if (StringRef(RD->getNameAsString()).contains("atomic")) {
           BR.markInvalid(getTag(), S);
         }
       }
diff --git a/clang/lib/StaticAnalyzer/Checkers/ObjCContainersASTChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ObjCContainersASTChecker.cpp
index 2b008d1c775a..6978d81faf1c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ObjCContainersASTChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ObjCContainersASTChecker.cpp
@@ -101,14 +101,14 @@ void WalkAST::VisitCallExpr(CallExpr *CE) {
   const Expr *Arg = nullptr;
   unsigned ArgNum;
 
-  if (Name.equals("CFArrayCreate") || Name.equals("CFSetCreate")) {
+  if (Name == "CFArrayCreate" || Name == "CFSetCreate") {
     if (CE->getNumArgs() != 4)
       return;
     ArgNum = 1;
     Arg = CE->getArg(ArgNum)->IgnoreParenCasts();
     if (hasPointerToPointerSizedType(Arg))
         return;
-  } else if (Name.equals("CFDictionaryCreate")) {
+  } else if (Name == "CFDictionaryCreate") {
     if (CE->getNumArgs() != 6)
       return;
     // Check first argument.
diff --git a/clang/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp
index 28e88245ca95..4937af3b91c2 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp
@@ -82,7 +82,7 @@ void ObjCContainersChecker::checkPostStmt(const CallExpr *CE,
     return;
 
   // Add array size information to the state.
-  if (Name.equals("CFArrayCreate")) {
+  if (Name == "CFArrayCreate") {
     if (CE->getNumArgs() < 3)
       return;
     // Note, we can visit the Create method in the post-visit because
@@ -92,7 +92,7 @@ void ObjCContainersChecker::checkPostStmt(const CallExpr *CE,
     return;
   }
 
-  if (Name.equals("CFArrayGetCount")) {
+  if (Name == "CFArrayGetCount") {
     addSizeInfo(CE->getArg(0), CE, C);
     return;
   }
@@ -105,7 +105,7 @@ void ObjCContainersChecker::checkPreStmt(const CallExpr *CE,
     return;
 
   // Check the array access.
-  if (Name.equals("CFArrayGetValueAtIndex")) {
+  if (Name == "CFArrayGetValueAtIndex") {
     ProgramStateRef State = C.getState();
     // Retrieve the size.
     // Find out if we saw this array symbol before and have information about
diff --git a/clang/lib/StaticAnalyzer/Checkers/ReturnValueChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ReturnValueChecker.cpp
index c3112ebe4e79..3da571adfa44 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ReturnValueChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ReturnValueChecker.cpp
@@ -1,4 +1,4 @@
-//===- ReturnValueChecker - Applies guaranteed return values ----*- C++ -*-===//
+//===- ReturnValueChecker - Check methods always returning true -*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This defines ReturnValueChecker, which checks for calls with guaranteed
-// boolean return value. It ensures the return value of each function call.
+// This defines ReturnValueChecker, which models a very specific coding
+// convention within the LLVM/Clang codebase: there several classes that have
+// Error() methods which always return true.
+// This checker was introduced to eliminate false positives caused by this
+// peculiar "always returns true" invariant. (Normally, the analyzer assumes
+// that a function returning `bool` can return both `true` and `false`, because
+// otherwise it could've been a `void` function.)
 //
 //===----------------------------------------------------------------------===//
 
@@ -18,43 +23,40 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/FormatVariadic.h"
 #include <optional>
 
 using namespace clang;
 using namespace ento;
+using llvm::formatv;
 
 namespace {
-class ReturnValueChecker : public Checker<check::PostCall, check::EndFunction> {
+class ReturnValueChecker : public Checker<check::PostCall> {
 public:
-  // It sets the predefined invariant ('CDM') if the current call not break it.
   void checkPostCall(const CallEvent &Call, CheckerContext &C) const;
 
-  // It reports whether a predefined invariant ('CDM') is broken.
-  void checkEndFunction(const ReturnStmt *RS, CheckerContext &C) const;
-
 private:
-  // The pairs are in the following form: {{{class, call}}, return value}
-  const CallDescriptionMap<bool> CDM = {
+  const CallDescriptionSet Methods = {
       // These are known in the LLVM project: 'Error()'
-      {{{"ARMAsmParser", "Error"}}, true},
-      {{{"HexagonAsmParser", "Error"}}, true},
-      {{{"LLLexer", "Error"}}, true},
-      {{{"LLParser", "Error"}}, true},
-      {{{"MCAsmParser", "Error"}}, true},
-      {{{"MCAsmParserExtension", "Error"}}, true},
-      {{{"TGParser", "Error"}}, true},
-      {{{"X86AsmParser", "Error"}}, true},
+      {CDM::CXXMethod, {"ARMAsmParser", "Error"}},
+      {CDM::CXXMethod, {"HexagonAsmParser", "Error"}},
+      {CDM::CXXMethod, {"LLLexer", "Error"}},
+      {CDM::CXXMethod, {"LLParser", "Error"}},
+      {CDM::CXXMethod, {"MCAsmParser", "Error"}},
+      {CDM::CXXMethod, {"MCAsmParserExtension", "Error"}},
+      {CDM::CXXMethod, {"TGParser", "Error"}},
+      {CDM::CXXMethod, {"X86AsmParser", "Error"}},
       // 'TokError()'
-      {{{"LLParser", "TokError"}}, true},
-      {{{"MCAsmParser", "TokError"}}, true},
-      {{{"MCAsmParserExtension", "TokError"}}, true},
-      {{{"TGParser", "TokError"}}, true},
+      {CDM::CXXMethod, {"LLParser", "TokError"}},
+      {CDM::CXXMethod, {"MCAsmParser", "TokError"}},
+      {CDM::CXXMethod, {"MCAsmParserExtension", "TokError"}},
+      {CDM::CXXMethod, {"TGParser", "TokError"}},
       // 'error()'
-      {{{"MIParser", "error"}}, true},
-      {{{"WasmAsmParser", "error"}}, true},
-      {{{"WebAssemblyAsmParser", "error"}}, true},
+      {CDM::CXXMethod, {"MIParser", "error"}},
+      {CDM::CXXMethod, {"WasmAsmParser", "error"}},
+      {CDM::CXXMethod, {"WebAssemblyAsmParser", "error"}},
       // Other
-      {{{"AsmParser", "printError"}}, true}};
+      {CDM::CXXMethod, {"AsmParser", "printError"}}};
 };
 } // namespace
 
@@ -68,100 +70,32 @@ static std::string getName(const CallEvent &Call) {
   return Name;
 }
 
-// The predefinitions ('CDM') could break due to the ever growing code base.
-// Check for the expected invariants and see whether they apply.
-static std::optional<bool> isInvariantBreak(bool ExpectedValue, SVal ReturnV,
-                                            CheckerContext &C) {
-  auto ReturnDV = ReturnV.getAs<DefinedOrUnknownSVal>();
-  if (!ReturnDV)
-    return std::nullopt;
-
-  if (ExpectedValue)
-    return C.getState()->isNull(*ReturnDV).isConstrainedTrue();
-
-  return C.getState()->isNull(*ReturnDV).isConstrainedFalse();
-}
-
 void ReturnValueChecker::checkPostCall(const CallEvent &Call,
                                        CheckerContext &C) const {
-  const bool *RawExpectedValue = CDM.lookup(Call);
-  if (!RawExpectedValue)
+  if (!Methods.contains(Call))
     return;
 
-  SVal ReturnV = Call.getReturnValue();
-  bool ExpectedValue = *RawExpectedValue;
-  std::optional<bool> IsInvariantBreak =
-      isInvariantBreak(ExpectedValue, ReturnV, C);
-  if (!IsInvariantBreak)
-    return;
+  auto ReturnV = Call.getReturnValue().getAs<DefinedOrUnknownSVal>();
 
-  // If the invariant is broken it is reported by 'checkEndFunction()'.
-  if (*IsInvariantBreak)
+  if (!ReturnV)
     return;
 
-  std::string Name = getName(Call);
-  const NoteTag *CallTag = C.getNoteTag(
-      [Name, ExpectedValue](PathSensitiveBugReport &) -> std::string {
-        SmallString<128> Msg;
-        llvm::raw_svector_ostream Out(Msg);
-
-        Out << '\'' << Name << "' returns "
-            << (ExpectedValue ? "true" : "false");
-        return std::string(Out.str());
-      },
-      /*IsPrunable=*/true);
-
   ProgramStateRef State = C.getState();
-  State = State->assume(ReturnV.castAs<DefinedOrUnknownSVal>(), ExpectedValue);
-  C.addTransition(State, CallTag);
-}
-
-void ReturnValueChecker::checkEndFunction(const ReturnStmt *RS,
-                                          CheckerContext &C) const {
-  if (!RS || !RS->getRetValue())
+  if (ProgramStateRef StTrue = State->assume(*ReturnV, true)) {
+    // The return value can be true, so transition to a state where it's true.
+    std::string Msg =
+        formatv("'{0}' returns true (by convention)", getName(Call));
+    C.addTransition(StTrue, C.getNoteTag(Msg, /*IsPrunable=*/true));
     return;
-
-  // We cannot get the caller in the top-frame.
-  const StackFrameContext *SFC = C.getStackFrame();
-  if (C.getStackFrame()->inTopFrame())
-    return;
-
-  ProgramStateRef State = C.getState();
-  CallEventManager &CMgr = C.getStateManager().getCallEventManager();
-  CallEventRef<> Call = CMgr.getCaller(SFC, State);
-  if (!Call)
-    return;
-
-  const bool *RawExpectedValue = CDM.lookup(*Call);
-  if (!RawExpectedValue)
-    return;
-
-  SVal ReturnV = State->getSVal(RS->getRetValue(), C.getLocationContext());
-  bool ExpectedValue = *RawExpectedValue;
-  std::optional<bool> IsInvariantBreak =
-      isInvariantBreak(ExpectedValue, ReturnV, C);
-  if (!IsInvariantBreak)
-    return;
-
-  // If the invariant is appropriate it is reported by 'checkPostCall()'.
-  if (!*IsInvariantBreak)
-    return;
-
-  std::string Name = getName(*Call);
-  const NoteTag *CallTag = C.getNoteTag(
-      [Name, ExpectedValue](BugReport &BR) -> std::string {
-        SmallString<128> Msg;
-        llvm::raw_svector_ostream Out(Msg);
-
-        // The following is swapped because the invariant is broken.
-        Out << '\'' << Name << "' returns "
-            << (ExpectedValue ? "false" : "true");
-
-        return std::string(Out.str());
-      },
-      /*IsPrunable=*/false);
-
-  C.addTransition(State, CallTag);
+  }
+  // Paranoia: if the return value is known to be false (which is highly
+  // unlikely, it's easy to ensure that the method always returns true), then
+  // produce a note that highlights that this unusual situation.
+  // Note that this checker is 'hidden' so it cannot produce a bug report.
+  std::string Msg = formatv("'{0}' returned false, breaking the convention "
+                            "that it always returns true",
+                            getName(Call));
+  C.addTransition(State, C.getNoteTag(Msg, /*IsPrunable=*/true));
 }
 
 void ento::registerReturnValueChecker(CheckerManager &Mgr) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/SmartPtrModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/SmartPtrModeling.cpp
index 268fc742f050..505020d4bb39 100644
--- a/clang/lib/StaticAnalyzer/Checkers/SmartPtrModeling.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/SmartPtrModeling.cpp
@@ -86,14 +86,14 @@ private:
   using SmartPtrMethodHandlerFn =
       void (SmartPtrModeling::*)(const CallEvent &Call, CheckerContext &) const;
   CallDescriptionMap<SmartPtrMethodHandlerFn> SmartPtrMethodHandlers{
-      {{{"reset"}}, &SmartPtrModeling::handleReset},
-      {{{"release"}}, &SmartPtrModeling::handleRelease},
-      {{{"swap"}, 1}, &SmartPtrModeling::handleSwapMethod},
-      {{{"get"}}, &SmartPtrModeling::handleGet}};
-  const CallDescription StdSwapCall{{"std", "swap"}, 2};
-  const CallDescription StdMakeUniqueCall{{"std", "make_unique"}};
-  const CallDescription StdMakeUniqueForOverwriteCall{
-      {"std", "make_unique_for_overwrite"}};
+      {{CDM::CXXMethod, {"reset"}}, &SmartPtrModeling::handleReset},
+      {{CDM::CXXMethod, {"release"}}, &SmartPtrModeling::handleRelease},
+      {{CDM::CXXMethod, {"swap"}, 1}, &SmartPtrModeling::handleSwapMethod},
+      {{CDM::CXXMethod, {"get"}}, &SmartPtrModeling::handleGet}};
+  const CallDescription StdSwapCall{CDM::SimpleFunc, {"std", "swap"}, 2};
+  const CallDescriptionSet MakeUniqueVariants{
+      {CDM::SimpleFunc, {"std", "make_unique"}},
+      {CDM::SimpleFunc, {"std", "make_unique_for_overwrite"}}};
 };
 } // end of anonymous namespace
 
@@ -296,7 +296,7 @@ bool SmartPtrModeling::evalCall(const CallEvent &Call,
     return handleSwap(State, Call.getArgSVal(0), Call.getArgSVal(1), C);
   }
 
-  if (matchesAny(Call, StdMakeUniqueCall, StdMakeUniqueForOverwriteCall)) {
+  if (MakeUniqueVariants.contains(Call)) {
     if (!ModelSmartPtrDereference)
       return false;
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
index a0aa2316a7b4..d4e020f7a72a 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
@@ -388,17 +388,19 @@ private:
   };
 
   CallDescriptionMap<FnDescription> FnTestDescriptions = {
-      {{{"StreamTesterChecker_make_feof_stream"}, 1},
+      {{CDM::SimpleFunc, {"StreamTesterChecker_make_feof_stream"}, 1},
        {nullptr,
         std::bind(&StreamChecker::evalSetFeofFerror, _1, _2, _3, _4, ErrorFEof,
                   false),
         0}},
-      {{{"StreamTesterChecker_make_ferror_stream"}, 1},
+      {{CDM::SimpleFunc, {"StreamTesterChecker_make_ferror_stream"}, 1},
        {nullptr,
         std::bind(&StreamChecker::evalSetFeofFerror, _1, _2, _3, _4,
                   ErrorFError, false),
         0}},
-      {{{"StreamTesterChecker_make_ferror_indeterminate_stream"}, 1},
+      {{CDM::SimpleFunc,
+        {"StreamTesterChecker_make_ferror_indeterminate_stream"},
+        1},
        {nullptr,
         std::bind(&StreamChecker::evalSetFeofFerror, _1, _2, _3, _4,
                   ErrorFError, true),
@@ -1141,7 +1143,7 @@ void StreamChecker::evalFscanf(const FnDescription *Desc, const CallEvent &Call,
       return;
 
     if (auto const *Callee = Call.getCalleeIdentifier();
-        !Callee || !Callee->getName().equals("vfscanf")) {
+        !Callee || Callee->getName() != "vfscanf") {
       SmallVector<unsigned int> EscArgs;
       for (auto EscArg : llvm::seq(2u, Call.getNumArgs()))
         EscArgs.push_back(EscArg);
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
index b36fa95bc73f..f81db0e67d83 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
@@ -16,8 +16,9 @@
 
 namespace clang {
 
-std::pair<const Expr *, bool>
-tryToFindPtrOrigin(const Expr *E, bool StopAtFirstRefCountedObj) {
+bool tryToFindPtrOrigin(
+    const Expr *E, bool StopAtFirstRefCountedObj,
+    std::function<bool(const clang::Expr *, bool)> callback) {
   while (E) {
     if (auto *tempExpr = dyn_cast<MaterializeTemporaryExpr>(E)) {
       E = tempExpr->getSubExpr();
@@ -27,12 +28,22 @@ tryToFindPtrOrigin(const Expr *E, bool StopAtFirstRefCountedObj) {
       E = tempExpr->getSubExpr();
       continue;
     }
+    if (auto *tempExpr = dyn_cast<ParenExpr>(E)) {
+      E = tempExpr->getSubExpr();
+      continue;
+    }
+    if (auto *Expr = dyn_cast<ConditionalOperator>(E)) {
+      return tryToFindPtrOrigin(Expr->getTrueExpr(), StopAtFirstRefCountedObj,
+                                callback) &&
+             tryToFindPtrOrigin(Expr->getFalseExpr(), StopAtFirstRefCountedObj,
+                                callback);
+    }
     if (auto *cast = dyn_cast<CastExpr>(E)) {
       if (StopAtFirstRefCountedObj) {
         if (auto *ConversionFunc =
                 dyn_cast_or_null<FunctionDecl>(cast->getConversionFunction())) {
           if (isCtorOfRefCounted(ConversionFunc))
-            return {E, true};
+            return callback(E, true);
         }
       }
       // FIXME: This can give false "origin" that would lead to false negatives
@@ -47,7 +58,7 @@ tryToFindPtrOrigin(const Expr *E, bool StopAtFirstRefCountedObj) {
           if (IsGetterOfRefCt && *IsGetterOfRefCt) {
             E = memberCall->getImplicitObjectArgument();
             if (StopAtFirstRefCountedObj) {
-              return {E, true};
+              return callback(E, true);
             }
             continue;
           }
@@ -64,17 +75,17 @@ tryToFindPtrOrigin(const Expr *E, bool StopAtFirstRefCountedObj) {
       if (auto *callee = call->getDirectCallee()) {
         if (isCtorOfRefCounted(callee)) {
           if (StopAtFirstRefCountedObj)
-            return {E, true};
+            return callback(E, true);
 
           E = call->getArg(0);
           continue;
         }
 
         if (isReturnValueRefCounted(callee))
-          return {E, true};
+          return callback(E, true);
 
         if (isSingleton(callee))
-          return {E, true};
+          return callback(E, true);
 
         if (isPtrConversion(callee)) {
           E = call->getArg(0);
@@ -91,7 +102,7 @@ tryToFindPtrOrigin(const Expr *E, bool StopAtFirstRefCountedObj) {
     break;
   }
   // Some other expression.
-  return {E, false};
+  return callback(E, false);
 }
 
 bool isASafeCallArg(const Expr *E) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h
index e35ea4ef05dd..e972924e0c52 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h
@@ -13,6 +13,7 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/Support/Casting.h"
 
+#include <functional>
 #include <string>
 #include <utility>
 
@@ -48,10 +49,12 @@ class Expr;
 /// represents ref-counted object during the traversal we return relevant
 /// sub-expression and true.
 ///
-/// \returns subexpression that we traversed to and if \p
-/// StopAtFirstRefCountedObj is true we also return whether we stopped early.
-std::pair<const clang::Expr *, bool>
-tryToFindPtrOrigin(const clang::Expr *E, bool StopAtFirstRefCountedObj);
+/// Calls \p callback with the subexpression that we traversed to and if \p
+/// StopAtFirstRefCountedObj is true we also specify whether we stopped early.
+/// Returns false if any of calls to callbacks returned false. Otherwise true.
+bool tryToFindPtrOrigin(
+    const clang::Expr *E, bool StopAtFirstRefCountedObj,
+    std::function<bool(const clang::Expr *, bool)> callback);
 
 /// For \p E referring to a ref-countable/-counted pointer/reference we return
 /// whether it's a safe call argument. Examples: function parameter or
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp
index c753ed84a700..69a0eb3086ab 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp
@@ -34,7 +34,7 @@ private:
 public:
   NoUncountedMemberChecker()
       : Bug(this,
-            "Member variable is a raw-poiner/reference to reference-countable "
+            "Member variable is a raw-pointer/reference to reference-countable "
             "type",
             "WebKit coding guidelines") {}
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index 6901dbb415bf..ad493587affa 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -309,21 +309,8 @@ public:
   bool VisitDefaultStmt(const DefaultStmt *DS) { return VisitChildren(DS); }
 
   bool VisitUnaryOperator(const UnaryOperator *UO) {
-    // Operator '*' and '!' are allowed as long as the operand is trivial.
-    auto op = UO->getOpcode();
-    if (op == UO_Deref || op == UO_AddrOf || op == UO_LNot || op == UO_Not)
-      return Visit(UO->getSubExpr());
-
-    if (UO->isIncrementOp() || UO->isDecrementOp()) {
-      // Allow increment or decrement of a POD type.
-      if (auto *RefExpr = dyn_cast<DeclRefExpr>(UO->getSubExpr())) {
-        if (auto *Decl = dyn_cast<VarDecl>(RefExpr->getDecl()))
-          return Decl->isLocalVarDeclOrParm() &&
-                 Decl->getType().isPODType(Decl->getASTContext());
-      }
-    }
-    // Other operators are non-trivial.
-    return false;
+    // Unary operators are trivial if its operand is trivial except co_await.
+    return UO->getOpcode() != UO_Coawait && Visit(UO->getSubExpr());
   }
 
   bool VisitBinaryOperator(const BinaryOperator *BO) {
@@ -363,8 +350,11 @@ public:
     const auto &Name = safeGetName(Callee);
 
     if (Name == "WTFCrashWithInfo" || Name == "WTFBreakpointTrap" ||
-        Name == "WTFReportAssertionFailure" ||
-        Name == "compilerFenceForCrash" || Name == "__builtin_unreachable")
+        Name == "WTFReportAssertionFailure" || Name == "isMainThread" ||
+        Name == "isMainThreadOrGCThread" || Name == "isMainRunLoop" ||
+        Name == "isWebThread" || Name == "isUIThread" ||
+        Name == "compilerFenceForCrash" || Name == "bitwise_cast" ||
+        Name == "addressof" || Name.find("__builtin") == 0)
       return true;
 
     return TrivialFunctionAnalysis::isTrivialImpl(Callee, Cache);
@@ -405,6 +395,16 @@ public:
     return TrivialFunctionAnalysis::isTrivialImpl(Callee, Cache);
   }
 
+  bool VisitCXXOperatorCallExpr(const CXXOperatorCallExpr *OCE) {
+    if (!checkArguments(OCE))
+      return false;
+    auto *Callee = OCE->getCalleeDecl();
+    if (!Callee)
+      return false;
+    // Recursively descend into the callee to confirm that it's trivial as well.
+    return TrivialFunctionAnalysis::isTrivialImpl(Callee, Cache);
+  }
+
   bool VisitCXXDefaultArgExpr(const CXXDefaultArgExpr *E) {
     if (auto *Expr = E->getExpr()) {
       if (!Visit(Expr))
@@ -431,6 +431,8 @@ public:
     return TrivialFunctionAnalysis::isTrivialImpl(CE->getConstructor(), Cache);
   }
 
+  bool VisitCXXNewExpr(const CXXNewExpr *NE) { return VisitChildren(NE); }
+
   bool VisitImplicitCastExpr(const ImplicitCastExpr *ICE) {
     return Visit(ICE->getSubExpr());
   }
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
index d879c110b75d..7f4c3a7b787e 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ASTUtils.h"
 #include "DiagOutputUtils.h"
 #include "PtrTypesSemantics.h"
 #include "clang/AST/CXXInheritance.h"
@@ -90,6 +91,9 @@ public:
           const CXXRecordDecl *C = T->getAsCXXRecordDecl();
           if (!C)
             return false;
+          if (isRefCountedClass(C))
+            return false;
+
           bool AnyInconclusiveBase = false;
           const auto hasPublicRefInBase =
               [&AnyInconclusiveBase](const CXXBaseSpecifier *Base,
@@ -164,6 +168,20 @@ public:
     return false;
   }
 
+  static bool isRefCountedClass(const CXXRecordDecl *D) {
+    if (!D->getTemplateInstantiationPattern())
+      return false;
+    auto *NsDecl = D->getParent();
+    if (!NsDecl || !isa<NamespaceDecl>(NsDecl))
+      return false;
+    auto NamespaceName = safeGetName(NsDecl);
+    auto ClsNameStr = safeGetName(D);
+    StringRef ClsName = ClsNameStr; // FIXME: Make safeGetName return StringRef.
+    return NamespaceName == "WTF" &&
+           (ClsName.ends_with("RefCounted") ||
+            ClsName == "ThreadSafeRefCountedAndCanMakeThreadSafeWeakPtr");
+  }
+
   void reportBug(const CXXRecordDecl *DerivedClass,
                  const CXXBaseSpecifier *BaseSpec,
                  const CXXRecordDecl *ProblematicBaseClass) const {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
index ae494de58da3..704c082a4d1d 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
@@ -126,30 +126,31 @@ public:
   }
 
   bool isPtrOriginSafe(const Expr *Arg) const {
-    std::pair<const clang::Expr *, bool> ArgOrigin =
-        tryToFindPtrOrigin(Arg, true);
-
-    // Temporary ref-counted object created as part of the call argument
-    // would outlive the call.
-    if (ArgOrigin.second)
-      return true;
-
-    if (isa<CXXNullPtrLiteralExpr>(ArgOrigin.first)) {
-      // foo(nullptr)
-      return true;
-    }
-    if (isa<IntegerLiteral>(ArgOrigin.first)) {
-      // FIXME: Check the value.
-      // foo(NULL)
-      return true;
-    }
-
-    return isASafeCallArg(ArgOrigin.first);
+    return tryToFindPtrOrigin(Arg, /*StopAtFirstRefCountedObj=*/true,
+                              [](const clang::Expr *ArgOrigin, bool IsSafe) {
+                                if (IsSafe)
+                                  return true;
+                                if (isa<CXXNullPtrLiteralExpr>(ArgOrigin)) {
+                                  // foo(nullptr)
+                                  return true;
+                                }
+                                if (isa<IntegerLiteral>(ArgOrigin)) {
+                                  // FIXME: Check the value.
+                                  // foo(NULL)
+                                  return true;
+                                }
+                                if (isASafeCallArg(ArgOrigin))
+                                  return true;
+                                return false;
+                              });
   }
 
   bool shouldSkipCall(const CallExpr *CE) const {
     const auto *Callee = CE->getDirectCallee();
 
+    if (BR->getSourceManager().isInSystemHeader(CE->getExprLoc()))
+      return true;
+
     if (Callee && TFA.isTrivial(Callee))
       return true;
 
@@ -227,10 +228,17 @@ public:
     return NamespaceName == "WTF" &&
            (MethodName == "find" || MethodName == "findIf" ||
             MethodName == "reverseFind" || MethodName == "reverseFindIf" ||
-            MethodName == "get" || MethodName == "inlineGet" ||
-            MethodName == "contains" || MethodName == "containsIf") &&
+            MethodName == "findIgnoringASCIICase" || MethodName == "get" ||
+            MethodName == "inlineGet" || MethodName == "contains" ||
+            MethodName == "containsIf" ||
+            MethodName == "containsIgnoringASCIICase" ||
+            MethodName == "startsWith" || MethodName == "endsWith" ||
+            MethodName == "startsWithIgnoringASCIICase" ||
+            MethodName == "endsWithIgnoringASCIICase" ||
+            MethodName == "substring") &&
            (ClsName.ends_with("Vector") || ClsName.ends_with("Set") ||
-            ClsName.ends_with("Map"));
+            ClsName.ends_with("Map") || ClsName == "StringImpl" ||
+            ClsName.ends_with("String"));
   }
 
   void reportBug(const Expr *CallArg, const ParmVarDecl *Param) const {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp
index 6036ad58cf25..0d9710a5e2d8 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp
@@ -188,39 +188,50 @@ public:
       if (!InitExpr)
         return; // FIXME: later on we might warn on uninitialized vars too
 
-      const clang::Expr *const InitArgOrigin =
-          tryToFindPtrOrigin(InitExpr, /*StopAtFirstRefCountedObj=*/false)
-              .first;
-      if (!InitArgOrigin)
+      if (tryToFindPtrOrigin(
+              InitExpr, /*StopAtFirstRefCountedObj=*/false,
+              [&](const clang::Expr *InitArgOrigin, bool IsSafe) {
+                if (!InitArgOrigin)
+                  return true;
+
+                if (isa<CXXThisExpr>(InitArgOrigin))
+                  return true;
+
+                if (isa<CXXNullPtrLiteralExpr>(InitArgOrigin))
+                  return true;
+
+                if (isa<IntegerLiteral>(InitArgOrigin))
+                  return true;
+
+                if (auto *Ref = llvm::dyn_cast<DeclRefExpr>(InitArgOrigin)) {
+                  if (auto *MaybeGuardian =
+                          dyn_cast_or_null<VarDecl>(Ref->getFoundDecl())) {
+                    const auto *MaybeGuardianArgType =
+                        MaybeGuardian->getType().getTypePtr();
+                    if (MaybeGuardianArgType) {
+                      const CXXRecordDecl *const MaybeGuardianArgCXXRecord =
+                          MaybeGuardianArgType->getAsCXXRecordDecl();
+                      if (MaybeGuardianArgCXXRecord) {
+                        if (MaybeGuardian->isLocalVarDecl() &&
+                            (isRefCounted(MaybeGuardianArgCXXRecord) ||
+                             isRefcountedStringsHack(MaybeGuardian)) &&
+                            isGuardedScopeEmbeddedInGuardianScope(
+                                V, MaybeGuardian))
+                          return true;
+                      }
+                    }
+
+                    // Parameters are guaranteed to be safe for the duration of
+                    // the call by another checker.
+                    if (isa<ParmVarDecl>(MaybeGuardian))
+                      return true;
+                  }
+                }
+
+                return false;
+              }))
         return;
 
-      if (isa<CXXThisExpr>(InitArgOrigin))
-        return;
-
-      if (auto *Ref = llvm::dyn_cast<DeclRefExpr>(InitArgOrigin)) {
-        if (auto *MaybeGuardian =
-                dyn_cast_or_null<VarDecl>(Ref->getFoundDecl())) {
-          const auto *MaybeGuardianArgType =
-              MaybeGuardian->getType().getTypePtr();
-          if (MaybeGuardianArgType) {
-            const CXXRecordDecl *const MaybeGuardianArgCXXRecord =
-                MaybeGuardianArgType->getAsCXXRecordDecl();
-            if (MaybeGuardianArgCXXRecord) {
-              if (MaybeGuardian->isLocalVarDecl() &&
-                  (isRefCounted(MaybeGuardianArgCXXRecord) ||
-                   isRefcountedStringsHack(MaybeGuardian)) &&
-                  isGuardedScopeEmbeddedInGuardianScope(V, MaybeGuardian))
-                return;
-            }
-          }
-
-          // Parameters are guaranteed to be safe for the duration of the call
-          // by another checker.
-          if (isa<ParmVarDecl>(MaybeGuardian))
-            return;
-        }
-      }
-
       reportBug(V);
     }
   }
@@ -230,6 +241,9 @@ public:
     if (!V->isLocalVarDecl())
       return true;
 
+    if (BR->getSourceManager().isInSystemHeader(V->getLocation()))
+      return true;
+
     return false;
   }
 
diff --git a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
index 984755fa7e50..487a3bd16b67 100644
--- a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
+++ b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
@@ -113,6 +113,9 @@ const Expr *bugreporter::getDerefExpr(const Stmt *S) {
       // Pointer arithmetic: '*(x + 2)' -> 'x') etc.
       if (const Expr *Inner = peelOffPointerArithmetic(B)) {
         E = Inner;
+      } else if (B->isAssignmentOp()) {
+        // Follow LHS of assignments: '*p = 404' -> 'p'.
+        E = B->getLHS();
       } else {
         // Probably more arithmetic can be pattern-matched here,
         // but for now give up.
diff --git a/clang/lib/StaticAnalyzer/Core/CheckerContext.cpp b/clang/lib/StaticAnalyzer/Core/CheckerContext.cpp
index 113abcd4c2ab..96464b30c078 100644
--- a/clang/lib/StaticAnalyzer/Core/CheckerContext.cpp
+++ b/clang/lib/StaticAnalyzer/Core/CheckerContext.cpp
@@ -104,7 +104,7 @@ bool CheckerContext::isCLibraryFunction(const FunctionDecl *FD,
     return true;
 
   StringRef FName = II->getName();
-  if (FName.equals(Name))
+  if (FName == Name)
     return true;
 
   if (FName.starts_with("__inline") && FName.contains(Name))
diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp
index c5c3cdb47e92..ffacf9cf1f78 100644
--- a/clang/lib/Tooling/Tooling.cpp
+++ b/clang/lib/Tooling/Tooling.cpp
@@ -293,7 +293,7 @@ void addTargetAndModeForProgramName(std::vector<std::string> &CommandLine,
        ++Token) {
     StringRef TokenRef(*Token);
     ShouldAddTarget = ShouldAddTarget && !TokenRef.starts_with(TargetOPT) &&
-                      !TokenRef.equals(TargetOPTLegacy);
+                      TokenRef != TargetOPTLegacy;
     ShouldAddMode = ShouldAddMode && !TokenRef.starts_with(DriverModeOPT);
   }
   if (ShouldAddMode) {
diff --git a/clang/test/APINotes/export-as.c b/clang/test/APINotes/export-as.c
index 7a8a652ab755..24d9338e2993 100644
--- a/clang/test/APINotes/export-as.c
+++ b/clang/test/APINotes/export-as.c
@@ -1,5 +1,5 @@
 // RUN: rm -rf %t && mkdir -p %t
-// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalInt -x c | FileCheck %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalInt -x c | FileCheck %s
 
 #include "ExportAs.h"
 
diff --git a/clang/test/APINotes/extern-context.cpp b/clang/test/APINotes/extern-context.cpp
index 331dee002361..ffd2e3331be8 100644
--- a/clang/test/APINotes/extern-context.cpp
+++ b/clang/test/APINotes/extern-context.cpp
@@ -1,8 +1,8 @@
 // RUN: rm -rf %t && mkdir -p %t
-// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalInExternC -x c++ | FileCheck -check-prefix=CHECK-EXTERN-C %s
-// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalInExternCXX -x c++ | FileCheck -check-prefix=CHECK-EXTERN-CXX %s
-// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalFuncInExternC -x c++ | FileCheck -check-prefix=CHECK-FUNC-EXTERN-C %s
-// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalFuncInExternCXX -x c++ | FileCheck -check-prefix=CHECK-FUNC-EXTERN-CXX %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalInExternC -x c++ | FileCheck -check-prefix=CHECK-EXTERN-C %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalInExternCXX -x c++ | FileCheck -check-prefix=CHECK-EXTERN-CXX %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalFuncInExternC -x c++ | FileCheck -check-prefix=CHECK-FUNC-EXTERN-C %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalFuncInExternCXX -x c++ | FileCheck -check-prefix=CHECK-FUNC-EXTERN-CXX %s
 
 #include "ExternCtx.h"
 
diff --git a/clang/test/APINotes/namespaces.cpp b/clang/test/APINotes/namespaces.cpp
index 2f9d93c2ea0e..c19eee565c2d 100644
--- a/clang/test/APINotes/namespaces.cpp
+++ b/clang/test/APINotes/namespaces.cpp
@@ -1,17 +1,17 @@
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -x objective-c++
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::my_typedef -x objective-c++ | FileCheck -check-prefix=CHECK-TYPEDEF-IN-NAMESPACE %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::my_using_decl -x objective-c++ | FileCheck -check-prefix=CHECK-USING-DECL-IN-NAMESPACE %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::varInNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-NAMESPACE %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::funcInNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-NAMESPACE %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-NAMESPACE %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::varInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-NESTED-NAMESPACE %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested2::varInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-ANOTHER-GLOBAL-IN-NESTED-NAMESPACE %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-NESTED-NAMESPACE %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::funcInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-NESTED-NAMESPACE %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::Namespace1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter varInInlineNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-INLINE-NAMESPACE %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter funcInInlineNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-INLINE-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::my_typedef -x objective-c++ | FileCheck -check-prefix=CHECK-TYPEDEF-IN-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::my_using_decl -x objective-c++ | FileCheck -check-prefix=CHECK-USING-DECL-IN-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::varInNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::funcInNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::varInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-NESTED-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested2::varInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-ANOTHER-GLOBAL-IN-NESTED-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-NESTED-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::funcInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-NESTED-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::Namespace1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter varInInlineNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-INLINE-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter funcInInlineNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-INLINE-NAMESPACE %s
 
 #import <Namespaces.h>
 
diff --git a/clang/test/APINotes/properties.m b/clang/test/APINotes/properties.m
index f218092a66e1..79b5e2b10c47 100644
--- a/clang/test/APINotes/properties.m
+++ b/clang/test/APINotes/properties.m
@@ -1,7 +1,7 @@
 // RUN: rm -rf %t && mkdir -p %t
 
-// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules  -fblocks -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'TestProperties::' | FileCheck -check-prefix=CHECK -check-prefix=CHECK-4 %s
-// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules  -fblocks -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'TestProperties::' -fapinotes-swift-version=3 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-3 %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules  -fblocks -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'TestProperties::' | FileCheck -check-prefix=CHECK -check-prefix=CHECK-4 %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules  -fblocks -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'TestProperties::' -fapinotes-swift-version=3 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-3 %s
 
 @import VersionedKit;
 
diff --git a/clang/test/APINotes/swift-import-as.cpp b/clang/test/APINotes/swift-import-as.cpp
index 103cf02f431a..62e6450e94e1 100644
--- a/clang/test/APINotes/swift-import-as.cpp
+++ b/clang/test/APINotes/swift-import-as.cpp
@@ -1,9 +1,9 @@
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter ImmortalRefType | FileCheck -check-prefix=CHECK-IMMORTAL %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter RefCountedType | FileCheck -check-prefix=CHECK-REF-COUNTED %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter NonCopyableType | FileCheck -check-prefix=CHECK-NON-COPYABLE %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter CopyableType | FileCheck -check-prefix=CHECK-COPYABLE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter ImmortalRefType | FileCheck -check-prefix=CHECK-IMMORTAL %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter RefCountedType | FileCheck -check-prefix=CHECK-REF-COUNTED %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter NonCopyableType | FileCheck -check-prefix=CHECK-NON-COPYABLE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter CopyableType | FileCheck -check-prefix=CHECK-COPYABLE %s
 
 #include <SwiftImportAs.h>
 
diff --git a/clang/test/APINotes/templates.cpp b/clang/test/APINotes/templates.cpp
index d4dce291615e..0556eba925a5 100644
--- a/clang/test/APINotes/templates.cpp
+++ b/clang/test/APINotes/templates.cpp
@@ -1,6 +1,6 @@
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Tmpl -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -x c++
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Tmpl -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Box -x c++ | FileCheck -check-prefix=CHECK-BOX %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Tmpl -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Box -x c++ | FileCheck -check-prefix=CHECK-BOX %s
 
 #include "Templates.h"
 
diff --git a/clang/test/APINotes/versioned.m b/clang/test/APINotes/versioned.m
index 4a8da1556f87..264edde2a04f 100644
--- a/clang/test/APINotes/versioned.m
+++ b/clang/test/APINotes/versioned.m
@@ -3,12 +3,12 @@
 // Build and check the unversioned module file.
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Unversioned -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s
 // RUN: %clang_cc1 -ast-print %t/ModulesCache/Unversioned/VersionedKit.pcm | FileCheck -check-prefix=CHECK-UNVERSIONED %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Unversioned -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'DUMP' | FileCheck -check-prefix=CHECK-DUMP -check-prefix=CHECK-UNVERSIONED-DUMP %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Unversioned -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'DUMP' | FileCheck -check-prefix=CHECK-DUMP -check-prefix=CHECK-UNVERSIONED-DUMP %s
 
 // Build and check the versioned module file.
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned -fdisable-module-hash -fapinotes-modules -fapinotes-swift-version=3 -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s
 // RUN: %clang_cc1 -ast-print %t/ModulesCache/Versioned/VersionedKit.pcm | FileCheck -check-prefix=CHECK-VERSIONED %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned -fdisable-module-hash -fapinotes-modules -fapinotes-swift-version=3 -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'DUMP' | FileCheck -check-prefix=CHECK-DUMP -check-prefix=CHECK-VERSIONED-DUMP %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned -fdisable-module-hash -fapinotes-modules -fapinotes-swift-version=3 -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'DUMP' | FileCheck -check-prefix=CHECK-DUMP -check-prefix=CHECK-VERSIONED-DUMP %s
 
 #import <VersionedKit/VersionedKit.h>
 
diff --git a/clang/test/AST/HLSL/RWBuffer-AST.hlsl b/clang/test/AST/HLSL/RWBuffer-AST.hlsl
index c1613520a146..ac54194be7f0 100644
--- a/clang/test/AST/HLSL/RWBuffer-AST.hlsl
+++ b/clang/test/AST/HLSL/RWBuffer-AST.hlsl
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -fsyntax-only -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s 
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -fsyntax-only -ast-dump %s | FileCheck %s 
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s 
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s 
 
 
 // This test tests two different AST generations. The "EMPTY" test mode verifies
diff --git a/clang/test/AST/HLSL/ResourceStruct.hlsl b/clang/test/AST/HLSL/ResourceStruct.hlsl
index 34f1419180d8..04b3b9311990 100644
--- a/clang/test/AST/HLSL/ResourceStruct.hlsl
+++ b/clang/test/AST/HLSL/ResourceStruct.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -fsyntax-only -ast-dump %s | FileCheck %s 
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump %s | FileCheck %s 
 
 // CHECK: NamespaceDecl {{.*}} implicit hlsl
 // CHECK: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit <undeserialized declarations> class Resource definition
diff --git a/clang/test/AST/HLSL/packoffset.hlsl b/clang/test/AST/HLSL/packoffset.hlsl
new file mode 100644
index 000000000000..060288c2f7f7
--- /dev/null
+++ b/clang/test/AST/HLSL/packoffset.hlsl
@@ -0,0 +1,100 @@
+// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.3-library -S -finclude-default-header -fnative-half-type -ast-dump  -x hlsl %s | FileCheck %s
+
+
+// CHECK: HLSLBufferDecl {{.*}} cbuffer A
+cbuffer A
+{
+    // CHECK-NEXT: VarDecl {{.*}} A1 'float4'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 0
+    float4 A1 : packoffset(c);
+    // CHECK-NEXT: VarDecl {{.*}} col:11 A2 'float'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 1 0
+    float A2 : packoffset(c1);
+    // CHECK-NEXT: VarDecl {{.*}} col:11 A3 'float'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 1 1
+    float A3 : packoffset(c1.y);
+}
+
+// CHECK: HLSLBufferDecl {{.*}} cbuffer B
+cbuffer B
+{
+    // CHECK: VarDecl {{.*}} B0 'float'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 1
+    float B0 : packoffset(c0.g);
+    // CHECK-NEXT: VarDecl {{.*}} B1 'double'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 2
+	double B1 : packoffset(c0.b);
+    // CHECK-NEXT: VarDecl {{.*}} B2 'half'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 0
+	half B2 : packoffset(c0.r);
+}
+
+// CHECK: HLSLBufferDecl {{.*}} cbuffer C
+cbuffer C
+{
+    // CHECK: VarDecl {{.*}} C0 'float'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 1
+    float C0 : packoffset(c0.y);
+    // CHECK-NEXT: VarDecl {{.*}} C1 'float2'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 2
+	float2 C1 : packoffset(c0.z);
+    // CHECK-NEXT: VarDecl {{.*}} C2 'half'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0
+	half C2 : packoffset(c0.x);
+}
+
+
+// CHECK: HLSLBufferDecl {{.*}} cbuffer D
+cbuffer D
+{
+    // CHECK: VarDecl {{.*}} D0 'float'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 1
+    float D0 : packoffset(c0.y);
+    // CHECK-NEXT: VarDecl {{.*}} D1 'float[2]'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 1 0
+	float D1[2] : packoffset(c1.x);
+    // CHECK-NEXT: VarDecl {{.*}} D2 'half3'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 2 1
+	half3 D2 : packoffset(c2.y);
+    // CHECK-NEXT: VarDecl {{.*}} D3 'double'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 2
+	double D3 : packoffset(c0.z);
+}
+
+struct ST {
+  float a;
+  float2 b;
+  half c;
+};
+
+// CHECK: HLSLBufferDecl {{.*}} cbuffer S
+cbuffer S {
+    // CHECK: VarDecl {{.*}} S0 'float'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 1
+  float S0 : packoffset(c0.y);
+    // CHECK: VarDecl {{.*}} S1 'ST'
+    // CHECK: HLSLPackOffsetAttr {{.*}} 1 0
+  ST S1 : packoffset(c1);
+    // CHECK: VarDecl {{.*}} S2 'double2'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 2 0
+  double2 S2 : packoffset(c2);
+}
+
+struct ST2 {
+  float s0;
+  ST s1;
+  half s2;
+};
+
+// CHECK: HLSLBufferDecl {{.*}} cbuffer S2
+cbuffer S2 {
+    // CHECK: VarDecl {{.*}} S20 'float'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 3
+  float S20 : packoffset(c0.a);
+    // CHECK: VarDecl {{.*}} S21 'ST2'
+    // CHECK: HLSLPackOffsetAttr {{.*}} 1 0
+  ST2 S21 : packoffset(c1);
+    // CHECK: VarDecl {{.*}} S22 'half'
+    // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 3 1
+  half S22 : packoffset(c3.y);
+}
diff --git a/clang/test/AST/HLSL/pch.hlsl b/clang/test/AST/HLSL/pch.hlsl
index 27fae8f499da..839a13093bd1 100644
--- a/clang/test/AST/HLSL/pch.hlsl
+++ b/clang/test/AST/HLSL/pch.hlsl
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl \
 // RUN:  -finclude-default-header -emit-pch -o %t %S/Inputs/pch.hlsl
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl \
-// RUN:  -finclude-default-header -include-pch %t -fsyntax-only -ast-dump-all %s \
+// RUN:  -finclude-default-header -include-pch %t -ast-dump-all %s \
 // RUN: | FileCheck  %s
 
 // Make sure PCH works by using function declared in PCH header and declare a RWBuffer in current file.
diff --git a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl
index 0277d4756db8..e9a6ea1a1631 100644
--- a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl
+++ b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl \
 // RUN:   -emit-pch -o %t %s
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl \
-// RUN:   -include-pch %t -fsyntax-only -ast-dump-all %S/Inputs/empty.hlsl \
+// RUN:   -include-pch %t -ast-dump-all %S/Inputs/empty.hlsl \
 // RUN: | FileCheck  %s
 
 cbuffer A {
diff --git a/clang/test/AST/HLSL/pch_with_buf.hlsl b/clang/test/AST/HLSL/pch_with_buf.hlsl
index e8eae533af62..63b7ed508a5f 100644
--- a/clang/test/AST/HLSL/pch_with_buf.hlsl
+++ b/clang/test/AST/HLSL/pch_with_buf.hlsl
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -finclude-default-header -emit-pch -o %t %S/Inputs/pch_with_buf.hlsl
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl \
-// RUN:  -finclude-default-header -include-pch %t -fsyntax-only -ast-dump-all %s | FileCheck  %s
+// RUN:  -finclude-default-header -include-pch %t -ast-dump-all %s | FileCheck  %s
 
 // Make sure PCH works by using function declared in PCH header.
 // CHECK:FunctionDecl 0x[[FOO:[0-9a-f]+]] <{{.*}}:2:1, line:4:1> line:2:8 imported used foo 'float2 (float2, float2)'
diff --git a/clang/test/AST/HLSL/this-reference-template.hlsl b/clang/test/AST/HLSL/this-reference-template.hlsl
index d427e73044b7..703bf1f0ff8d 100644
--- a/clang/test/AST/HLSL/this-reference-template.hlsl
+++ b/clang/test/AST/HLSL/this-reference-template.hlsl
@@ -1,46 +1,46 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -disable-llvm-passes -o - -hlsl-entry main %s | FileCheck %s
-
-template<typename K, typename V>
-struct Pair {
-  K First;
-  V Second;
-
-  K getFirst() {
-	  return this.First;
-  }
-
-  V getSecond() {
-    return Second;
-  }
-};
-
-[numthreads(1, 1, 1)]
-void main() {
-  Pair<int, float> Vals = {1, 2.0};
-  Vals.First = Vals.getFirst();
-  Vals.Second = Vals.getSecond();
-}
-
-// CHECK:     -CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:8:3, line:10:3> line:8:5 getFirst 'K ()' implicit-inline
-// CHECK-NEXT:-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:16, line:10:3>
-// CHECK-NEXT:-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:9:4, col:16>
-// CHECK-NEXT:-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:11, col:16> 'K' lvalue .First 0x{{[0-9A-Fa-f]+}}
-// CHECK-NEXT:-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:11> 'Pair<K, V>' lvalue this
-// CHECK-NEXT:-CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:12:3, line:14:3> line:12:5 getSecond 'V ()' implicit-inline
-// CHECK-NEXT:-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:17, line:14:3>
-// CHECK-NEXT:-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:13:5, col:12>
-// CHECK-NEXT:-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'V' lvalue .Second 0x{{[0-9A-Fa-f]+}}
-// CHECK-NEXT:-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'Pair<K, V>' lvalue implicit this
-
-// CHECK:     -CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:8:3, line:10:3> line:8:5 used getFirst 'int ()' implicit_instantiation implicit-inline
-// CHECK-NEXT:-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:16, line:10:3>
-// CHECK-NEXT:-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:9:4, col:16>
-// CHECK-NEXT:-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:11, col:16> 'int' <LValueToRValue>
-// CHECK-NEXT:-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:11, col:16> 'int' lvalue .First 0x{{[0-9A-Fa-f]+}}
-// CHECK-NEXT:-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:11> 'Pair<int, float>' lvalue this
-// CHECK-NEXT:-CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:12:3, line:14:3> line:12:5 used getSecond 'float ()' implicit_instantiation implicit-inline
-// CHECK-NEXT:-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:17, line:14:3>
-// CHECK-NEXT:-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:13:5, col:12>
-// CHECK-NEXT:-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'float' <LValueToRValue>
-// CHECK-NEXT:-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'float' lvalue .Second 0x{{[0-9A-Fa-f]+}}
-// CHECK-NEXT:-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'Pair<int, float>' lvalue implicit this
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -disable-llvm-passes -o - -hlsl-entry main %s | FileCheck %s
+
+template<typename K, typename V>
+struct Pair {
+  K First;
+  V Second;
+
+  K getFirst() {
+	  return this.First;
+  }
+
+  V getSecond() {
+    return Second;
+  }
+};
+
+[numthreads(1, 1, 1)]
+void main() {
+  Pair<int, float> Vals = {1, 2.0};
+  Vals.First = Vals.getFirst();
+  Vals.Second = Vals.getSecond();
+}
+
+// CHECK:     -CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:8:3, line:10:3> line:8:5 getFirst 'K ()' implicit-inline
+// CHECK-NEXT:-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:16, line:10:3>
+// CHECK-NEXT:-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:9:4, col:16>
+// CHECK-NEXT:-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:11, col:16> 'K' lvalue .First 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT:-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:11> 'Pair<K, V>' lvalue this
+// CHECK-NEXT:-CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:12:3, line:14:3> line:12:5 getSecond 'V ()' implicit-inline
+// CHECK-NEXT:-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:17, line:14:3>
+// CHECK-NEXT:-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:13:5, col:12>
+// CHECK-NEXT:-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'V' lvalue .Second 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT:-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'Pair<K, V>' lvalue implicit this
+
+// CHECK:     -CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:8:3, line:10:3> line:8:5 used getFirst 'int ()' implicit_instantiation implicit-inline
+// CHECK-NEXT:-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:16, line:10:3>
+// CHECK-NEXT:-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:9:4, col:16>
+// CHECK-NEXT:-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:11, col:16> 'int' <LValueToRValue>
+// CHECK-NEXT:-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:11, col:16> 'int' lvalue .First 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT:-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:11> 'Pair<int, float>' lvalue this
+// CHECK-NEXT:-CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:12:3, line:14:3> line:12:5 used getSecond 'float ()' implicit_instantiation implicit-inline
+// CHECK-NEXT:-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:17, line:14:3>
+// CHECK-NEXT:-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:13:5, col:12>
+// CHECK-NEXT:-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'float' <LValueToRValue>
+// CHECK-NEXT:-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'float' lvalue .Second 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT:-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'Pair<int, float>' lvalue implicit this
diff --git a/clang/test/AST/HLSL/this-reference.hlsl b/clang/test/AST/HLSL/this-reference.hlsl
index 67d8e7b7b911..b54f8c92d7e3 100644
--- a/clang/test/AST/HLSL/this-reference.hlsl
+++ b/clang/test/AST/HLSL/this-reference.hlsl
@@ -1,62 +1,62 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -disable-llvm-passes -o - -hlsl-entry main %s | FileCheck %s
-
-class Pair {
-  int First;
-  int Second;
-
-  int getFirst() {
-	  return this.First;
-  }
-
-  int getSecond() {
-    return Second;
-  }
-};
-
-class PairInfo : Pair {
-  int Sum;
-
-  int getSum() {
-    return this.First + Second;
-  }
-};
-
-[numthreads(1, 1, 1)]
-void main() {
-  Pair Vals = {1, 2};
-  Vals.First = Vals.getFirst();
-  Vals.Second = Vals.getSecond();
-
-  PairInfo ValsInfo;
-  ValsInfo.First = Vals.First;
-  ValsInfo.Second = Vals.Second;
-  ValsInfo.Sum = ValsInfo.getSum();
-
-}
-
-// CHECK:     -CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:7:3, line:9:3> line:7:7 used getFirst 'int ()' implicit-inline
-// CHECK-NEXT:`-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:18, line:9:3>
-// CHECK-NEXT:`-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:8:4, col:16>
-// CHECK-NEXT:`-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:11, col:16> 'int' <LValueToRValue>
-// CHECK-NEXT:`-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:11, col:16> 'int' lvalue .First 0x{{[0-9A-Fa-f]+}}
-// CHECK-NEXT:`-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:11> 'Pair' lvalue this
-// CHECK-NEXT:-CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:11:3, line:13:3> line:11:7 used getSecond 'int ()' implicit-inline
-// CHECK-NEXT:`-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:19, line:13:3>
-// CHECK-NEXT:`-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:12:5, col:12>
-// CHECK-NEXT:`-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'int' <LValueToRValue>
-// CHECK-NEXT:`-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'int' lvalue .Second 0x{{[0-9A-Fa-f]+}}
-// CHECK-NEXT:`-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'Pair' lvalue implicit this
-
-
-// CHECK:     CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:19:3, line:21:3> line:19:7 used getSum 'int ()' implicit-inline
-// CHECK-NEXT:`-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:16, line:21:3>
-// CHECK-NEXT:`-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:20:5, col:25>
-// CHECK-NEXT:`-BinaryOperator 0x{{[0-9A-Fa-f]+}} <col:12, col:25> 'int' '+'
-// CHECK-NEXT:-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:12, col:17> 'int' <LValueToRValue>
-// CHECK-NEXT:`-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:12, col:17> 'int' lvalue .First 0x{{[0-9A-Fa-f]+}}
-// CHECK-NEXT:`-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'Pair' lvalue <UncheckedDerivedToBase (Pair)>
-// CHECK-NEXT:`-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'PairInfo' lvalue this
-// CHECK-NEXT:`-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:25> 'int' <LValueToRValue>
-// CHECK-NEXT:`-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:25> 'int' lvalue .Second 0x{{[0-9A-Fa-f]+}}
-// CHECK-NEXT:`-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:25> 'Pair' lvalue <UncheckedDerivedToBase (Pair)>
-// CHECK-NEXT:`-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:25> 'PairInfo' lvalue implicit this
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -disable-llvm-passes -o - -hlsl-entry main %s | FileCheck %s
+
+class Pair {
+  int First;
+  int Second;
+
+  int getFirst() {
+	  return this.First;
+  }
+
+  int getSecond() {
+    return Second;
+  }
+};
+
+class PairInfo : Pair {
+  int Sum;
+
+  int getSum() {
+    return this.First + Second;
+  }
+};
+
+[numthreads(1, 1, 1)]
+void main() {
+  Pair Vals = {1, 2};
+  Vals.First = Vals.getFirst();
+  Vals.Second = Vals.getSecond();
+
+  PairInfo ValsInfo;
+  ValsInfo.First = Vals.First;
+  ValsInfo.Second = Vals.Second;
+  ValsInfo.Sum = ValsInfo.getSum();
+
+}
+
+// CHECK:     -CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:7:3, line:9:3> line:7:7 used getFirst 'int ()' implicit-inline
+// CHECK-NEXT:`-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:18, line:9:3>
+// CHECK-NEXT:`-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:8:4, col:16>
+// CHECK-NEXT:`-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:11, col:16> 'int' <LValueToRValue>
+// CHECK-NEXT:`-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:11, col:16> 'int' lvalue .First 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT:`-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:11> 'Pair' lvalue this
+// CHECK-NEXT:-CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:11:3, line:13:3> line:11:7 used getSecond 'int ()' implicit-inline
+// CHECK-NEXT:`-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:19, line:13:3>
+// CHECK-NEXT:`-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:12:5, col:12>
+// CHECK-NEXT:`-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'int' <LValueToRValue>
+// CHECK-NEXT:`-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'int' lvalue .Second 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT:`-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'Pair' lvalue implicit this
+
+
+// CHECK:     CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:19:3, line:21:3> line:19:7 used getSum 'int ()' implicit-inline
+// CHECK-NEXT:`-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:16, line:21:3>
+// CHECK-NEXT:`-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:20:5, col:25>
+// CHECK-NEXT:`-BinaryOperator 0x{{[0-9A-Fa-f]+}} <col:12, col:25> 'int' '+'
+// CHECK-NEXT:-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:12, col:17> 'int' <LValueToRValue>
+// CHECK-NEXT:`-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:12, col:17> 'int' lvalue .First 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT:`-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'Pair' lvalue <UncheckedDerivedToBase (Pair)>
+// CHECK-NEXT:`-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:12> 'PairInfo' lvalue this
+// CHECK-NEXT:`-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:25> 'int' <LValueToRValue>
+// CHECK-NEXT:`-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:25> 'int' lvalue .Second 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT:`-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:25> 'Pair' lvalue <UncheckedDerivedToBase (Pair)>
+// CHECK-NEXT:`-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:25> 'PairInfo' lvalue implicit this
diff --git a/clang/test/AST/HLSL/vector-constructors.hlsl b/clang/test/AST/HLSL/vector-constructors.hlsl
index 7861d5209b5d..5e0900bb6236 100644
--- a/clang/test/AST/HLSL/vector-constructors.hlsl
+++ b/clang/test/AST/HLSL/vector-constructors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s 
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s
 
 typedef float float2 __attribute__((ext_vector_type(2)));
 typedef float float3 __attribute__((ext_vector_type(3)));
@@ -11,41 +11,36 @@ void entry() {
 
 // For the float2 vector, we just expect a conversion from constructor
 // parameters to an initialization list
-// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} <col:3, col:32> col:10 used Vec2 'float2':'float __attribute__((ext_vector_type(2)))' cinit
-// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:17, col:32> 'float2':'float __attribute__((ext_vector_type(2)))' functional cast to float2 <NoOp>
-// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:24, col:29> 'float2':'float __attribute__((ext_vector_type(2)))'
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:24> 'float' <FloatingCast>
-// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:24> 'double' 1.000000e+00
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:29> 'float' <FloatingCast>
-// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:29> 'double' 2.000000e+00
+// CHECK-LABEL: VarDecl 0x{{[0-9a-fA-F]+}} {{.*}} used Vec2 'float2':'float __attribute__((ext_vector_type(2)))' cinit
+// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float2':'float __attribute__((ext_vector_type(2)))' functional cast to float2 <NoOp>
+// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float2':'float __attribute__((ext_vector_type(2)))'
+// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} {{.*}} 'float' 1.000000e+00
+// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} {{.*}} 'float' 2.000000e+00
 
 
 // For the float 3 things get fun...
 // Here we expect accesses to the vec2 to provide the first and second
 // components using ArraySubscriptExpr
-// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} <col:3, col:33> col:10 Vec3 'float3':'float __attribute__((ext_vector_type(3)))' cinit
-// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:17, col:33> 'float3':'float __attribute__((ext_vector_type(3)))' functional cast to float3 <NoOp>
-// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:24, col:30> 'float3':'float __attribute__((ext_vector_type(3)))'
+// CHECK-LABEL: VarDecl 0x{{[0-9a-fA-F]+}} {{.*}} col:10 Vec3 'float3':'float __attribute__((ext_vector_type(3)))' cinit
+// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float3':'float __attribute__((ext_vector_type(3)))' functional cast to float3 <NoOp>
+// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float3':'float __attribute__((ext_vector_type(3)))'
 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:24, <invalid sloc>> 'float' <LValueToRValue>
 // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:24, <invalid sloc>> 'float' lvalue
-// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:24> 'float2':'float __attribute__((ext_vector_type(2)))' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'float __attribute__((ext_vector_type(2)))'
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float2':'float __attribute__((ext_vector_type(2)))' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'float __attribute__((ext_vector_type(2)))'
 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> 'int' 0
 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:24, <invalid sloc>> 'float' <LValueToRValue>
 // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:24, <invalid sloc>> 'float' lvalue
-// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:24> 'float2':'float __attribute__((ext_vector_type(2)))' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'float __attribute__((ext_vector_type(2)))'
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float2':'float __attribute__((ext_vector_type(2)))' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'float __attribute__((ext_vector_type(2)))'
 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> 'int' 1
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:30> 'float' <FloatingCast>
-// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:30> 'double' 3.000000e+00
-
-// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} <col:3, col:38> col:10 Vec3b 'float3':'float __attribute__((ext_vector_type(3)))' cinit
-// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:18, col:38> 'float3':'float __attribute__((ext_vector_type(3)))' functional cast to float3 <NoOp>
-// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:25, col:35> 'float3':'float __attribute__((ext_vector_type(3)))'
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:25> 'float' <FloatingCast>
-// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:25> 'double' 1.000000e+00
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:30> 'float' <FloatingCast>
-// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:30> 'double' 2.000000e+00
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:35> 'float' <FloatingCast>
-// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:35> 'double' 3.000000e+00
+// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} {{.*}} 'float' 3.000000e+00
+
+// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} {{.*}} col:10 Vec3b 'float3':'float __attribute__((ext_vector_type(3)))' cinit
+// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float3':'float __attribute__((ext_vector_type(3)))' functional cast to float3 <NoOp>
+// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float3':'float __attribute__((ext_vector_type(3)))'
+
+// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} {{.*}} 'float' 1.000000e+00
+// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} {{.*}} 'float' 2.000000e+00
+// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} {{.*}} 'float' 3.000000e+00
 
 // The tests above verify pretty explictily that the Initialization lists are
 // being constructed as expected. The next tests are bit sparser for brevity.
@@ -53,91 +48,85 @@ void entry() {
   float f = 1.0f, g = 2.0f;
   float2 foo0 = float2(f, g); // Non-literal
 
-// CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} <line:54:3, col:29>
-// CHECK-NEXT: VarDecl
+// CHECK-LABEL: VarDecl 0x{{[0-9a-fA-F]+}} {{.*}} foo0 'float2'
 // CHECK-NEXT: CXXFunctionalCastExpr
 // CHECK-NEXT: InitListExpr
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:24> 'float' <LValueToRValue>
-// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:24> 'float' lvalue Var  0x{{[0-9a-fA-F]+}} 'f' 'float'
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:27> 'float' <LValueToRValue>
-// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:27> 'float' lvalue Var  0x{{[0-9a-fA-F]+}} 'g' 'float'
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float' lvalue Var  0x{{[0-9a-fA-F]+}} 'f' 'float'
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float' lvalue Var  0x{{[0-9a-fA-F]+}} 'g' 'float'
 
   int i = 1, j = 2;
   float2 foo1 = float2(1, 2); // Integer literals
 
-// CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} <line:66:3, col:29>
-// CHECK-NEXT: VarDecl
+// CHECK-LABEL: VarDecl 0x{{[0-9a-fA-F]+}} {{.*}} foo1 'float2'
 // CHECK-NEXT: CXXFunctionalCastExpr
 // CHECK-NEXT: InitListExpr
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:24> 'float' <IntegralToFloating>
-// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:24> 'int' 1
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:27> 'float' <IntegralToFloating>
-// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:27> 'int' 2
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} {{.*}} 'int' 1
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} {{.*}} 'int' 2
 
   float2 foo2 = float2(i, j); // Integer non-literal
 
-// CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} <line:77:3, col:29>
-// CHECK-NEXT: VarDecl
+// CHECK-LABEL: VarDecl 0x{{[0-9a-fA-F]+}} {{.*}} foo2 'float2'
 // CHECK-NEXT: CXXFunctionalCastExpr
 // CHECK-NEXT: InitListExpr
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:24> 'float' <IntegralToFloating>
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:24> 'int' <LValueToRValue>
-// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:24> 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'i' 'int'
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:27> 'float' <IntegralToFloating>
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:27> 'int' <LValueToRValue>
-// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:27> 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int'
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'i' 'int'
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int'
 
   struct S { float f; } s;
   float2 foo4 = float2(s.f, s.f);
 
-// CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} <line:91:3, col:33>
-// CHECK-NEXT: VarDecl
+// CHECK-LABEL: VarDecl 0x{{[0-9a-fA-F]+}} {{.*}} foo4 'float2'
 // CHECK-NEXT: CXXFunctionalCastExpr
 // CHECK-NEXT: InitListExpr
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:24, col:26> 'float' <LValueToRValue>
-// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:24, col:26> 'float' lvalue .f 0x{{[0-9a-fA-F]+}}
-// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:24> 'struct S':'S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S':'S'
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:29, col:31> 'float' <LValueToRValue>
-// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:29, col:31> 'float' lvalue .f 0x{{[0-9a-fA-F]+}}
-// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:29> 'struct S':'S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S':'S'
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float' lvalue .f 0x{{[0-9a-fA-F]+}}
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'struct S':'S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S':'S'
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float' lvalue .f 0x{{[0-9a-fA-F]+}}
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'struct S':'S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S':'S'
 
   struct T {
     operator float() const { return 1.0f; }
   } t;
   float2 foo5 = float2(t, t); // user-defined cast operator
 
-// CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} <line:107:3, col:29>
-// CHECK-NEXT: VarDecl
+// CHECK-LABEL: VarDecl 0x{{[0-9a-fA-F]+}} {{.*}} foo5 'float2'
 // CHECK-NEXT: CXXFunctionalCastExpr
 // CHECK-NEXT: InitListExpr
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:24> 'float' <UserDefinedConversion>
-// CHECK-NEXT: CXXMemberCallExpr 0x{{[0-9a-fA-F]+}} <col:24> 'float'
-// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:24> '<bound member function type>' .operator float 0x{{[0-9a-fA-F]+}}
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:24> 'const T' lvalue <NoOp>
-// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:24> 'struct T':'T' lvalue Var 0x{{[0-9a-fA-F]+}} 't' 'struct T':'T'
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:27> 'float' <UserDefinedConversion>
-// CHECK-NEXT: CXXMemberCallExpr 0x{{[0-9a-fA-F]+}} <col:27> 'float'
-// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:27> '<bound member function type>' .operator float 0x{{[0-9a-fA-F]+}}
-// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:27> 'const T' lvalue <NoOp>
-// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:27> 'struct T':'T' lvalue Var 0x{{[0-9a-fA-F]+}} 't' 'struct T':'T'
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float' <UserDefinedConversion>
+// CHECK-NEXT: CXXMemberCallExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float'
+// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} {{.*}} '<bound member function type>' .operator float 0x{{[0-9a-fA-F]+}}
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'const T' lvalue <NoOp>
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'struct T':'T' lvalue Var 0x{{[0-9a-fA-F]+}} 't' 'struct T':'T'
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float' <UserDefinedConversion>
+// CHECK-NEXT: CXXMemberCallExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'float'
+// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} {{.*}} '<bound member function type>' .operator float 0x{{[0-9a-fA-F]+}}
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'const T' lvalue <NoOp>
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} {{.*}} 'struct T':'T' lvalue Var 0x{{[0-9a-fA-F]+}} 't' 'struct T':'T'
 
   typedef float2 second_level_of_typedefs;
   second_level_of_typedefs foo6 = float2(1.0f, 2.0f);
 
-// CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} <line:125:3, col:53>
-// CHECK-NEXT: VarDecl
+
+// CHECK-LABEL: VarDecl 0x{{[0-9a-fA-F]+}} {{.*}} foo6 'second_level_of_typedefs'
 // CHECK-NEXT: CXXFunctionalCastExpr
 // CHECK-NEXT: InitListExpr
-// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:42> 'float' 1.000000e+00
-// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:48> 'float' 2.000000e+00
+// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} {{.*}} 'float' 1.000000e+00
+// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} {{.*}} 'float' 2.000000e+00
 
   float2 foo7 = second_level_of_typedefs(1.0f, 2.0f);
 
-// CHECK: DeclStmt 0x{{[0-9a-fA-F]+}} <line:134:3, col:53>
-// CHECK-NEXT: VarDecl
+// CHECK-LABEL: VarDecl 0x{{[0-9a-fA-F]+}} {{.*}} foo7 'float2'
 // CHECK-NEXT: CXXFunctionalCastExpr
 // CHECK-NEXT: InitListExpr
-// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:42> 'float' 1.000000e+00
-// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:48> 'float' 2.000000e+00
+// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} {{.*}} 'float' 1.000000e+00
+// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} {{.*}} 'float' 2.000000e+00
 
 }
diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp
index 607d67bde020..f6d265d4b3d1 100644
--- a/clang/test/AST/Interp/arrays.cpp
+++ b/clang/test/AST/Interp/arrays.cpp
@@ -560,12 +560,22 @@ namespace LocalVLA {
      // both-note@-4 {{function parameter 'size' with unknown value}}
 #endif
   }
+
+  void f (unsigned int m) {
+    int e[2][m];
+#if __cplusplus >= 202002L
+     // both-note@-3 {{declared here}}
+     // both-warning@-3 2{{variable length array}}
+     // both-note@-4 {{function parameter 'm' with unknown value}}
+#endif
+    e[0][0] = 0;
+  }
 }
 
-char melchizedek[2200000000];
+char melchizedek[2];
 typedef decltype(melchizedek[1] - melchizedek[0]) ptrdiff_t;
-constexpr ptrdiff_t d1 = &melchizedek[0x7fffffff] - &melchizedek[0]; // ok
-constexpr ptrdiff_t d3 = &melchizedek[0] - &melchizedek[0x80000000u]; // ok
+constexpr ptrdiff_t d1 = &melchizedek[1] - &melchizedek[0]; // ok
+constexpr ptrdiff_t d3 = &melchizedek[0] - &melchizedek[1]; // ok
 
 /// GH#88018
 const int SZA[] = {};
diff --git a/clang/test/AST/Interp/builtin-align-cxx.cpp b/clang/test/AST/Interp/builtin-align-cxx.cpp
index 62d73dba929b..c4103953df02 100644
--- a/clang/test/AST/Interp/builtin-align-cxx.cpp
+++ b/clang/test/AST/Interp/builtin-align-cxx.cpp
@@ -2,19 +2,6 @@
 // RUN: %clang_cc1 -triple=x86_64-unknown-unknown -std=c++11 %s -fsyntax-only -verify=expected,both -fexperimental-new-constant-interpreter
 // RUN: %clang_cc1 -triple=x86_64-unknown-unknown -std=c++11 %s -fsyntax-only -verify=ref,both
 
-
-/// This is just a copy of the one from test/SemaCXX/ with some of the
-/// diagnostic output adapted.
-/// Also, align32array has an initializer now, which means it's not just
-/// a dummy pointer for us and we do actually have type information for it.
-/// In the future, we need to retain type information for dummy pointers as
-/// well, so here is a test that will break once we do that:
-namespace {
-  _Alignas(32) char heh[4];
-  static_assert(!__builtin_is_aligned(&heh[1], 4), ""); // expected-error {{failed}}
-}
-
-
 // Check that we don't crash when using dependent types in __builtin_align:
 template <typename a, a b>
 void *c(void *d) { // both-note{{candidate template ignored}}
@@ -177,7 +164,7 @@ static_assert(wrap_align_up(static_cast<bool>(1), const_value(1 << 21)), ""); //
 // both-note@-1{{in instantiation of function template specialization 'wrap_align_up<bool>' requested here}}
 
 // Check constant evaluation for pointers:
-_Alignas(32) char align32array[128] = {};
+_Alignas(32) char align32array[128];
 static_assert(&align32array[0] == &align32array[0], "");
 // __builtin_align_up/down can be constant evaluated as a no-op for values
 // that are known to have greater alignment:
diff --git a/clang/test/AST/Interp/builtins.cpp b/clang/test/AST/Interp/builtins.cpp
index 9095d1bf8d6a..a74b68bb9d89 100644
--- a/clang/test/AST/Interp/builtins.cpp
+++ b/clang/test/AST/Interp/builtins.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -fexperimental-new-constant-interpreter %s -Wno-constant-evaluated -verify -fms-extensions
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter %s -Wno-constant-evaluated -fms-extensions -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter %s -Wno-constant-evaluated -fms-extensions -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -verify=ref %s -Wno-constant-evaluated -fms-extensions
-// RUN: %clang_cc1 -verify=ref %s -Wno-constant-evaluated %s -fms-extensions -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify=ref %s -Wno-constant-evaluated %s -fms-extensions -emit-llvm -o - | FileCheck %s
 
 // expected-no-diagnostics
 // ref-no-diagnostics
diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index 207da5fe8126..2c675f4418ef 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -270,3 +270,6 @@ int test3(void) {
   a[0] = test3; // all-error {{incompatible pointer to integer conversion assigning to 'int' from 'int (void)'}}
   return 0;
 }
+/// This tests that we have full type info, even for values we cannot read.
+int dummyarray[5];
+_Static_assert(&dummyarray[0] < &dummyarray[1], ""); // pedantic-warning {{GNU extension}}
diff --git a/clang/test/AST/Interp/const-fpfeatures.cpp b/clang/test/AST/Interp/const-fpfeatures.cpp
index e24210810025..0764e3d8ba81 100644
--- a/clang/test/AST/Interp/const-fpfeatures.cpp
+++ b/clang/test/AST/Interp/const-fpfeatures.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -S -emit-llvm -triple i386-linux -std=c++2a -Wno-unknown-pragmas %s -o - | FileCheck %s
-// RUN: %clang_cc1 -S -emit-llvm -triple i386-linux -fexperimental-new-constant-interpreter -std=c++2a -Wno-unknown-pragmas %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple i386-linux -std=c++2a -Wno-unknown-pragmas %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple i386-linux -fexperimental-new-constant-interpreter -std=c++2a -Wno-unknown-pragmas %s -o - | FileCheck %s
 
 
 #pragma STDC FENV_ROUND FE_UPWARD
diff --git a/clang/test/AST/Interp/cxx23.cpp b/clang/test/AST/Interp/cxx23.cpp
index d1ec93e99803..c91d52c552b1 100644
--- a/clang/test/AST/Interp/cxx23.cpp
+++ b/clang/test/AST/Interp/cxx23.cpp
@@ -1,3 +1,4 @@
+// UNSUPPORTED:  target={{.*}}-zos{{.*}}
 // RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=ref20,all,all20 %s
 // RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -verify=ref23,all %s
 // RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=expected20,all,all20 %s -fexperimental-new-constant-interpreter
@@ -170,3 +171,10 @@ namespace LabelGoto {
   static_assert(foo() == 1, ""); // all-error {{not an integral constant expression}} \
                                  // all-note {{in call to}}
 }
+
+namespace ExplicitLambdaThis {
+  constexpr auto f = [x = 3]<typename Self>(this Self self) { // all20-error {{explicit object parameters are incompatible with C++ standards before C++2b}}
+      return x;
+  };
+  static_assert(f());
+}
diff --git a/clang/test/AST/Interp/eval-order.cpp b/clang/test/AST/Interp/eval-order.cpp
new file mode 100644
index 000000000000..695a43c9d235
--- /dev/null
+++ b/clang/test/AST/Interp/eval-order.cpp
@@ -0,0 +1,117 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s -fcxx-exceptions -triple=x86_64-linux-gnu
+// RUN: %clang_cc1 -std=c++1z -verify %s -fcxx-exceptions -triple=x86_64-linux-gnu -fexperimental-new-constant-interpreter
+
+// ref-no-diagnostics
+// expected-no-diagnostics
+
+/// Check that assignment operators evaluate their operands right-to-left.
+/// Copied from test/SemaCXX/constant-expression-cxx1z.cpp
+///
+/// As you can see from the FIXME comments, some of these are not yet working correctly
+/// in the new interpreter.
+namespace EvalOrder {
+  template<typename T> struct lvalue {
+    T t;
+    constexpr T &get() { return t; }
+  };
+
+  struct UserDefined {
+    int n = 0;
+    constexpr UserDefined &operator=(const UserDefined&) { return *this; }
+    constexpr UserDefined &operator+=(const UserDefined&) { return *this; }
+    constexpr void operator<<(const UserDefined&) const {}
+    constexpr void operator>>(const UserDefined&) const {}
+    constexpr void operator+(const UserDefined&) const {}
+    constexpr void operator[](int) const {}
+  };
+  constexpr UserDefined ud;
+
+  struct NonMember {};
+  constexpr void operator+=(NonMember, NonMember) {}
+  constexpr void operator<<(NonMember, NonMember) {}
+  constexpr void operator>>(NonMember, NonMember) {}
+  constexpr void operator+(NonMember, NonMember) {}
+  constexpr NonMember nm;
+
+  constexpr void f(...) {}
+
+  // Helper to ensure that 'a' is evaluated before 'b'.
+  struct seq_checker {
+    bool done_a = false;
+    bool done_b = false;
+
+    template <typename T> constexpr T &&a(T &&v) {
+      done_a = true;
+      return (T &&)v;
+    }
+    template <typename T> constexpr T &&b(T &&v) {
+      if (!done_a)
+        throw "wrong";
+      done_b = true;
+      return (T &&)v;
+    }
+
+    constexpr bool ok() { return done_a && done_b; }
+  };
+
+  // SEQ(expr), where part of the expression is tagged A(...) and part is
+  // tagged B(...), checks that A is evaluated before B.
+  #define A sc.a
+  #define B sc.b
+  #define SEQ(...) static_assert([](seq_checker sc) { void(__VA_ARGS__); return sc.ok(); }({}))
+
+  // Longstanding sequencing rules.
+  SEQ((A(1), B(2)));
+  SEQ((A(true) ? B(2) : throw "huh?"));
+  SEQ((A(false) ? throw "huh?" : B(2)));
+  SEQ(A(true) && B(true));
+  SEQ(A(false) || B(true));
+
+  // From P0145R3:
+
+  // Rules 1 and 2 have no effect ('b' is not an expression).
+
+  // Rule 3: a->*b
+  // SEQ(A(ud).*B(&UserDefined::n)); FIXME
+  // SEQ(A(&ud)->*B(&UserDefined::n)); FIXME
+
+  // Rule 4: a(b1, b2, b3)
+  // SEQ(A(f)(B(1), B(2), B(3))); FIXME
+
+  // Rule 5: b = a, b @= a
+  // SEQ(B(lvalue<int>().get()) = A(0)); FIXME
+  // SEQ(B(lvalue<UserDefined>().get()) = A(ud)); FIXME
+  SEQ(B(lvalue<int>().get()) += A(0));
+  // SEQ(B(lvalue<UserDefined>().get()) += A(ud)); FIXME
+  // SEQ(B(lvalue<NonMember>().get()) += A(nm)); FIXME
+
+  // Rule 6: a[b]
+  constexpr int arr[3] = {};
+  SEQ(A(arr)[B(0)]);
+  SEQ(A(+arr)[B(0)]);
+  // SEQ(A(0)[B(arr)]); FIXME
+  // SEQ(A(0)[B(+arr)]); FIXME
+  SEQ(A(ud)[B(0)]);
+
+  // Rule 7: a << b
+  SEQ(A(1) << B(2));
+  SEQ(A(ud) << B(ud));
+  SEQ(A(nm) << B(nm));
+
+  // Rule 8: a >> b
+  SEQ(A(1) >> B(2));
+  SEQ(A(ud) >> B(ud));
+  SEQ(A(nm) >> B(nm));
+
+  // No particular order of evaluation is specified in other cases, but we in
+  // practice evaluate left-to-right.
+  // FIXME: Technically we're expected to check for undefined behavior due to
+  // unsequenced read and modification and treat it as non-constant due to UB.
+  SEQ(A(1) + B(2));
+  SEQ(A(ud) + B(ud));
+  SEQ(A(nm) + B(nm));
+  SEQ(f(A(1), B(2)));
+  #undef SEQ
+  #undef A
+  #undef B
+}
diff --git a/clang/test/AST/Interp/lambda.cpp b/clang/test/AST/Interp/lambda.cpp
index d056bb304eeb..77e035ce2547 100644
--- a/clang/test/AST/Interp/lambda.cpp
+++ b/clang/test/AST/Interp/lambda.cpp
@@ -248,3 +248,19 @@ namespace ns2_capture_this_byval {
   constexpr auto L = S{5}.f(S{10});
   static_assert(L(S{100}) == 115, "");
 } // end test_captures_1::ns2_capture_this_byval
+
+namespace CaptureDefaults {
+  struct S {
+    int x;
+  };
+
+  constexpr auto f = [x = S{10}]() {
+      return x.x;
+  };
+  static_assert(f() == 10, "");
+
+  constexpr auto f2 = [x = 3]() {
+      return x;
+  };
+  static_assert(f2() == 3, "");
+}
diff --git a/clang/test/AST/Interp/literals.cpp b/clang/test/AST/Interp/literals.cpp
index 2688b53adde2..c160be06dd24 100644
--- a/clang/test/AST/Interp/literals.cpp
+++ b/clang/test/AST/Interp/literals.cpp
@@ -985,6 +985,8 @@ namespace DiscardExprs {
     __uuidof(GuidType);
     __uuidof(number); // both-error {{cannot call operator __uuidof on a type with no GUID}}
 
+    requires{false;};
+
     return 0;
   }
   static_assert(ignoredExprs() == 0, "");
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index 771e5adfca34..41be9b71a27f 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -999,10 +999,9 @@ namespace TemporaryObjectExpr {
       F f{12};
     };
     constexpr int foo(S x) {
-      return x.a; // expected-note {{read of uninitialized object}}
+      return x.a;
     }
-    static_assert(foo(S()) == 0, ""); // expected-error {{not an integral constant expression}} \
-                                      // expected-note {{in call to}}
+    static_assert(foo(S()) == 0, "");
   };
 #endif
 }
@@ -1425,6 +1424,11 @@ namespace ZeroInit {
   };
   constexpr S3 s3d; // both-error {{default initialization of an object of const type 'const S3' without a user-provided default constructor}}
   static_assert(s3d.n == 0, "");
+
+  struct P {
+    int a = 10;
+  };
+  static_assert(P().a == 10, "");
 }
 
 namespace {
@@ -1440,3 +1444,18 @@ namespace {
   static_assert(waldo == 4, "");
 #endif
 }
+
+
+namespace TemporaryWithInvalidDestructor {
+#if __cplusplus >= 202002L
+  struct A {
+    bool a = true;
+    constexpr ~A() noexcept(false) { // both-error {{never produces a constant expression}}
+      throw; // both-note 2{{not valid in a constant expression}} \
+             // both-error {{cannot use 'throw' with exceptions disabled}}
+    }
+  };
+  static_assert(A().a, ""); // both-error {{not an integral constant expression}} \
+                        // both-note {{in call to}}
+#endif
+}
diff --git a/clang/test/AST/alignas_maybe_odr_cleanup.cpp b/clang/test/AST/alignas_maybe_odr_cleanup.cpp
index ed34930e98a0..287eb6e2a4bb 100644
--- a/clang/test/AST/alignas_maybe_odr_cleanup.cpp
+++ b/clang/test/AST/alignas_maybe_odr_cleanup.cpp
@@ -1,5 +1,5 @@
 // Test without serialization:
-// RUN: %clang_cc1 -fsyntax-only %s -ast-dump | FileCheck %s
+// RUN: %clang_cc1 %s -ast-dump | FileCheck %s
 //
 // Test with serialization:
 // RUN: %clang_cc1 -emit-pch -o %t %s
diff --git a/clang/test/AST/arithmetic-fence-builtin.c b/clang/test/AST/arithmetic-fence-builtin.c
index acdefade0748..2450bb4d59d5 100644
--- a/clang/test/AST/arithmetic-fence-builtin.c
+++ b/clang/test/AST/arithmetic-fence-builtin.c
@@ -20,33 +20,31 @@
 // RUN: | FileCheck %s --strict-whitespace --check-prefixes=CHECK,CHECK2
 //
 // Tests with serialization:
-// RUN: %clang_cc1 -ast-dump -triple i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple i386-pc-linux-gnu -emit-pch -o %t %s
 // RUN: %clang_cc1 -triple i386-pc-linux-gnu -include-pch %t -ast-dump-all /dev/null \
 // RUN: | FileCheck %s --strict-whitespace
 //
-// RUN: %clang_cc1 -ast-dump -triple aarch64-unknown-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -emit-pch -o %t %s
 // RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -include-pch %t -ast-dump-all /dev/null \
 // RUN: | FileCheck %s --strict-whitespace
 //
-// RUN: %clang_cc1 -ast-dump -triple i386-pc-linux-gnu -DFAST -mreassociate %s \
+// RUN: %clang_cc1 -triple i386-pc-linux-gnu -DFAST -mreassociate %s \
 // RUN: -emit-pch -o %t
 // RUN: %clang_cc1 -triple i386-pc-linux-gnu -include-pch %t -ast-dump-all /dev/null \
 // RUN: | FileCheck %s --strict-whitespace --check-prefixes=CHECK,CHECK1
 //
-// RUN: %clang_cc1 -ast-dump -triple aarch64-unknown-linux-gnu -DFAST -mreassociate %s \
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -DFAST -mreassociate %s \
 // RUN: -emit-pch -o %t
 // RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -include-pch %t -ast-dump-all /dev/null \
 // RUN: | FileCheck %s --strict-whitespace --check-prefixes=CHECK,CHECK1
 //
-// RUN: %clang_cc1 -ast-dump -triple i386-pc-linux-gnu -DFAST -mreassociate %s \
-// RUN: -fprotect-parens \
-// RUN: -emit-pch -o %t
+// RUN: %clang_cc1 -triple i386-pc-linux-gnu -DFAST -mreassociate %s \
+// RUN:   -fprotect-parens -emit-pch -o %t
 // RUN: %clang_cc1 -triple i386-pc-linux-gnu -include-pch %t -ast-dump-all /dev/null -fprotect-parens\
 // RUN: | FileCheck %s --strict-whitespace --check-prefixes=CHECK,CHECK2
 //
-// RUN: %clang_cc1 -ast-dump -triple aarch64-unknown-linux-gnu -DFAST -mreassociate %s \
-// RUN: -fprotect-parens \
-// RUN: -emit-pch -o %t
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -DFAST -mreassociate %s \
+// RUN:   -fprotect-parens -emit-pch -o %t
 // RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -include-pch %t -ast-dump-all /dev/null -fprotect-parens\
 // RUN: | FileCheck %s --strict-whitespace --check-prefixes=CHECK,CHECK2
 
diff --git a/clang/test/AST/ast-crash-doc-function-template.cpp b/clang/test/AST/ast-crash-doc-function-template.cpp
index d48eb0dbe02f..a1627c7b4d54 100644
--- a/clang/test/AST/ast-crash-doc-function-template.cpp
+++ b/clang/test/AST/ast-crash-doc-function-template.cpp
@@ -1,7 +1,7 @@
 // RUN: rm -rf %t
 // RUN: split-file %s %t
 
-// RUN: %clang_cc1 -x c++ -Wdocumentation -fsyntax-only -ast-dump-all %t/t.cpp
+// RUN: %clang_cc1 -x c++ -Wdocumentation -ast-dump-all %t/t.cpp
 
 //--- t.h
 /// MyClass in the header file
diff --git a/clang/test/AST/ast-dump-attr-type.cpp b/clang/test/AST/ast-dump-attr-type.cpp
index 17e710ff7281..78a3b660bbe6 100644
--- a/clang/test/AST/ast-dump-attr-type.cpp
+++ b/clang/test/AST/ast-dump-attr-type.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ast-dump %s | FileCheck %s
 
 int * _Nonnull x;
 using Ty = decltype(x);
diff --git a/clang/test/AST/ast-dump-coroutine.cpp b/clang/test/AST/ast-dump-coroutine.cpp
index 8741c7b35b15..39a6e7e5a3ee 100644
--- a/clang/test/AST/ast-dump-coroutine.cpp
+++ b/clang/test/AST/ast-dump-coroutine.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown %s -std=c++20 \
-// RUN:    -fsyntax-only -ast-dump -ast-dump-filter test | FileCheck %s
+// RUN:    -ast-dump -ast-dump-filter test | FileCheck %s
 
 #include "Inputs/std-coroutine.h"
 
diff --git a/clang/test/AST/ast-dump-ctad-alias.cpp b/clang/test/AST/ast-dump-ctad-alias.cpp
new file mode 100644
index 000000000000..423c3454ccb7
--- /dev/null
+++ b/clang/test/AST/ast-dump-ctad-alias.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++2a -ast-dump %s | FileCheck -strict-whitespace %s
+
+template <typename, typename>
+constexpr bool Concept = true;
+template<typename T> // depth 0
+struct Out {
+  template<typename U> // depth 1
+  struct Inner {
+    U t;
+  };
+
+  template<typename V> // depth1
+  requires Concept<T, V>
+  Inner(V) -> Inner<V>;
+};
+
+template <typename X>
+struct Out2 {
+  template<typename Y> // depth1
+  using AInner = Out<int>::Inner<Y>;
+};
+Out2<double>::AInner t(1.0);
+
+// Verify that the require-clause of alias deduction guide is transformed correctly:
+//   - Occurrence T should be replaced with `int`;
+//   - Occurrence V should be replaced with the Y with depth 1
+//
+// CHECK:      |   `-FunctionTemplateDecl {{.*}} <deduction guide for AInner>
+// CHECK-NEXT: |     |-TemplateTypeParmDecl {{.*}} typename depth 0 index 0 Y
+// CHECK-NEXT: |     |-UnresolvedLookupExpr {{.*}} '<dependent type>' lvalue (no ADL) = 'Concept' 
+// CHECK-NEXT: |     | |-TemplateArgument type 'int'
+// CHECK-NEXT: |     | | `-BuiltinType {{.*}} 'int'
+// CHECK-NEXT: |     | `-TemplateArgument type 'type-parameter-1-0'
+// CHECK-NEXT: |     |   `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent depth 1 index 0
+// CHECK-NEXT: |     |-CXXDeductionGuideDecl {{.*}} <deduction guide for AInner> 'auto (type-parameter-0-0) -> Inner<type-parameter-0-0>'
+// CHECK-NEXT: |     | `-ParmVarDecl {{.*}} 'type-parameter-0-0'
+// CHECK-NEXT: |     `-CXXDeductionGuideDecl {{.*}} used <deduction guide for AInner> 'auto (double) -> Inner<double>' implicit_instantiation
+// CHECK-NEXT: |       |-TemplateArgument type 'double'
+// CHECK-NEXT: |       | `-BuiltinType {{.*}} 'double'
+// CHECK-NEXT: |       `-ParmVarDecl {{.*}} 'double'
diff --git a/clang/test/AST/ast-dump-default-arg-dep.cpp b/clang/test/AST/ast-dump-default-arg-dep.cpp
index a804ac120fca..1cfd20b233fb 100644
--- a/clang/test/AST/ast-dump-default-arg-dep.cpp
+++ b/clang/test/AST/ast-dump-default-arg-dep.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -ast-dump -frecovery-ast %s | FileCheck %s
+// RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-dump -frecovery-ast %s | FileCheck %s
 
 // CXXDefaultArgExpr should inherit dependence from the inner Expr, in this case
 // RecoveryExpr.
diff --git a/clang/test/AST/ast-dump-default-arg-recovery.cpp b/clang/test/AST/ast-dump-default-arg-recovery.cpp
index 5ced0ffd826a..980802be35f0 100644
--- a/clang/test/AST/ast-dump-default-arg-recovery.cpp
+++ b/clang/test/AST/ast-dump-default-arg-recovery.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -ast-dump -frecovery-ast %s | FileCheck %s
+// RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-dump -frecovery-ast %s | FileCheck %s
 
 void foo();
 void fun(int arg = foo());
diff --git a/clang/test/AST/ast-dump-default-init-json.cpp b/clang/test/AST/ast-dump-default-init-json.cpp
index 1058b4e3ea4d..f4949a9c9eed 100644
--- a/clang/test/AST/ast-dump-default-init-json.cpp
+++ b/clang/test/AST/ast-dump-default-init-json.cpp
@@ -789,10 +789,10 @@ void test() {
 // CHECK-NEXT:                  "valueCategory": "lvalue",
 // CHECK-NEXT:                  "extendingDecl": {
 // CHECK-NEXT:                   "id": "0x{{.*}}",
-// CHECK-NEXT:                   "kind": "FieldDecl",
-// CHECK-NEXT:                   "name": "a",
+// CHECK-NEXT:                   "kind": "VarDecl",
+// CHECK-NEXT:                   "name": "b",
 // CHECK-NEXT:                   "type": {
-// CHECK-NEXT:                    "qualType": "const A &"
+// CHECK-NEXT:                    "qualType": "B"
 // CHECK-NEXT:                   }
 // CHECK-NEXT:                  },
 // CHECK-NEXT:                  "storageDuration": "automatic",
diff --git a/clang/test/AST/ast-dump-default-init.cpp b/clang/test/AST/ast-dump-default-init.cpp
index 9fe945ee6e93..26864fbf1542 100644
--- a/clang/test/AST/ast-dump-default-init.cpp
+++ b/clang/test/AST/ast-dump-default-init.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ast-dump %s | FileCheck %s
 
 struct A {
   int arr[1];
@@ -13,7 +13,7 @@ void test() {
 }
 // CHECK: -CXXDefaultInitExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue has rewritten init
 // CHECK-NEXT:  `-ExprWithCleanups 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue
-// CHECK-NEXT:    `-MaterializeTemporaryExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue extended by Field 0x{{[^ ]*}} 'a' 'const A &'
+// CHECK-NEXT:    `-MaterializeTemporaryExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue extended by Var 0x{{[^ ]*}} 'b' 'B'
 // CHECK-NEXT:      `-ImplicitCastExpr 0x{{[^ ]*}} <{{.*}}> 'const A' <NoOp>
 // CHECK-NEXT:        `-CXXFunctionalCastExpr 0x{{[^ ]*}} <{{.*}}> 'A' functional cast to A <NoOp>
 // CHECK-NEXT:          `-InitListExpr 0x{{[^ ]*}} <{{.*}}> 'A'
diff --git a/clang/test/AST/ast-dump-fpfeatures.cpp b/clang/test/AST/ast-dump-fpfeatures.cpp
index 68144e31a930..cd00650db55c 100644
--- a/clang/test/AST/ast-dump-fpfeatures.cpp
+++ b/clang/test/AST/ast-dump-fpfeatures.cpp
@@ -1,5 +1,5 @@
 // Test without serialization:
-// RUN: %clang_cc1 -fsyntax-only -triple x86_64-pc-linux -std=c++11 -fcxx-exceptions -ast-dump %s \
+// RUN: %clang_cc1 -triple x86_64-pc-linux -std=c++11 -fcxx-exceptions -ast-dump %s \
 // RUN: | FileCheck --strict-whitespace %s
 
 // Test with serialization:
@@ -198,7 +198,7 @@ float func_19(float x, float y) {
 // CHECK-LABEL: FunctionDecl {{.*}} func_19 'float (float, float)'
 // CHECK:         CompoundStmt {{.*}} MathErrno=1
 // CHECK:           ReturnStmt
-// CHECK:             BinaryOperator {{.*}} 'float' '+' ConstRoundingMode=downward MathErrno=1
+// CHECK:             BinaryOperator {{.*}} 'float' '+' FPContractMode=1 ConstRoundingMode=downward MathErrno=1
 
 __attribute__((optnone))
 float func_20(float x, float y) try {
@@ -210,7 +210,7 @@ float func_20(float x, float y) try {
 // CHECK-LABEL: FunctionDecl {{.*}} func_20 'float (float, float)'
 // CHECK:         CompoundStmt {{.*}} ConstRoundingMode=downward MathErrno=1
 // CHECK:           ReturnStmt
-// CHECK:             BinaryOperator {{.*}} 'float' '+' ConstRoundingMode=downward MathErrno=1
+// CHECK:             BinaryOperator {{.*}} 'float' '+' FPContractMode=1 ConstRoundingMode=downward MathErrno=1
 
 struct C21 {
   C21(float x, float y);
@@ -221,15 +221,15 @@ struct C21 {
 };
 
 // CHECK-LABEL: CXXMethodDecl {{.*}} a_method 'float (float, float)'
-// CHECK:         CompoundStmt {{.*}} ConstRoundingMode=downward MathErrno=1
+// CHECK:         CompoundStmt {{.*}} FPContractMode=1 ConstRoundingMode=downward MathErrno=1
 // CHECK:           ReturnStmt
-// CHECK:             BinaryOperator {{.*}} 'float' '*' ConstRoundingMode=downward MathErrno=1
+// CHECK:             BinaryOperator {{.*}} 'float' '*' FPContractMode=1 ConstRoundingMode=downward MathErrno=1
 
 __attribute__((optnone)) C21::C21(float x, float y) : member(x + y) {}
 
 // CHECK-LABEL: CXXConstructorDecl {{.*}} C21 'void (float, float)'
 // CHECK:         CXXCtorInitializer {{.*}} 'member' 'float'
-// CHECK:           BinaryOperator {{.*}} 'float' '+' ConstRoundingMode=downward MathErrno=1
+// CHECK:           BinaryOperator {{.*}} 'float' '+' FPContractMode=1 ConstRoundingMode=downward MathErrno=1
 
 template <typename T>
 __attribute__((optnone)) T func_22(T x, T y) {
@@ -238,13 +238,13 @@ __attribute__((optnone)) T func_22(T x, T y) {
 
 // CHECK-LABEL: FunctionTemplateDecl {{.*}} func_22
 // CHECK:         FunctionDecl {{.*}} func_22 'T (T, T)'
-// CHECK:           CompoundStmt {{.*}} ConstRoundingMode=downward MathErrno=1
+// CHECK:           CompoundStmt {{.*}} FPContractMode=1 ConstRoundingMode=downward MathErrno=1
 // CHECK:             ReturnStmt
-// CHECK:               BinaryOperator {{.*}} '+' ConstRoundingMode=downward MathErrno=1
+// CHECK:               BinaryOperator {{.*}} '+' FPContractMode=1 ConstRoundingMode=downward MathErrno=1
 // CHECK:         FunctionDecl {{.*}} func_22 'float (float, float)'
-// CHECK:           CompoundStmt {{.*}} ConstRoundingMode=downward MathErrno=1
+// CHECK:           CompoundStmt {{.*}} FPContractMode=1 ConstRoundingMode=downward MathErrno=1
 // CHECK:             ReturnStmt
-// CHECK:               BinaryOperator {{.*}} 'float' '+' ConstRoundingMode=downward MathErrno=1
+// CHECK:               BinaryOperator {{.*}} 'float' '+' FPContractMode=1 ConstRoundingMode=downward MathErrno=1
 
 float func_23(float x, float y) {
   return func_22(x, y);
diff --git a/clang/test/AST/ast-dump-fpfeatures.m b/clang/test/AST/ast-dump-fpfeatures.m
index cf77529a7568..e390d5b67686 100644
--- a/clang/test/AST/ast-dump-fpfeatures.m
+++ b/clang/test/AST/ast-dump-fpfeatures.m
@@ -1,5 +1,5 @@
 // Test without serialization:
-// RUN: %clang_cc1 -fsyntax-only -triple x86_64-pc-linux -ast-dump %s \
+// RUN: %clang_cc1 -triple x86_64-pc-linux -ast-dump %s \
 // RUN: | FileCheck --strict-whitespace %s
 
 // Test with serialization:
@@ -24,6 +24,6 @@
 
 // CHECK-LABEL: ObjCImplementationDecl {{.*}} Adder
 // CHECK:         ObjCMethodDecl {{.*}} - sum:with: 'float'
-// CHECK:           CompoundStmt {{.*}} MathErrno=1
+// CHECK:           CompoundStmt {{.*}} FPContractMode=1 MathErrno=1
 // CHECK-NEXT:        ReturnStmt
-// CHECK-NEXT:          BinaryOperator {{.*}} 'float' '+' MathErrno=1
+// CHECK-NEXT:          BinaryOperator {{.*}} 'float' '+' FPContractMode=1 MathErrno=1
diff --git a/clang/test/AST/ast-dump-late-parsing.cpp b/clang/test/AST/ast-dump-late-parsing.cpp
index 760664efc5f1..5115caaa59f5 100644
--- a/clang/test/AST/ast-dump-late-parsing.cpp
+++ b/clang/test/AST/ast-dump-late-parsing.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -triple x86_64-pc-linux -std=c++11 -fcxx-exceptions -fdelayed-template-parsing -ast-dump %s \
+// RUN: %clang_cc1 -triple x86_64-pc-linux -std=c++11 -fcxx-exceptions -fdelayed-template-parsing -ast-dump %s \
 // RUN: | FileCheck %s
 
 #pragma STDC FENV_ROUND FE_DOWNWARD
@@ -11,13 +11,13 @@ __attribute__((optnone)) T func_22(T x, T y) {
 
 // CHECK-LABEL: FunctionTemplateDecl {{.*}} func_22
 // CHECK:         FunctionDecl {{.*}} func_22 'T (T, T)'
-// CHECK:           CompoundStmt {{.*}} ConstRoundingMode=downward MathErrno=1
+// CHECK:           CompoundStmt {{.*}} FPContractMode=1 ConstRoundingMode=downward MathErrno=1
 // CHECK:             ReturnStmt
-// CHECK:               BinaryOperator {{.*}} '+' ConstRoundingMode=downward MathErrno=1
+// CHECK:               BinaryOperator {{.*}} '+' FPContractMode=1 ConstRoundingMode=downward MathErrno=1
 // CHECK:         FunctionDecl {{.*}} func_22 'float (float, float)'
-// CHECK:           CompoundStmt {{.*}} ConstRoundingMode=downward MathErrno=1
+// CHECK:           CompoundStmt {{.*}} FPContractMode=1 ConstRoundingMode=downward MathErrno=1
 // CHECK:             ReturnStmt
-// CHECK:               BinaryOperator {{.*}} 'float' '+' ConstRoundingMode=downward MathErrno=1
+// CHECK:               BinaryOperator {{.*}} 'float' '+' FPContractMode=1 ConstRoundingMode=downward MathErrno=1
 
 float func_23(float x, float y) {
   return func_22(x, y);
diff --git a/clang/test/AST/ast-dump-recovery.cpp b/clang/test/AST/ast-dump-recovery.cpp
index 77527743fe85..a88dff471d9f 100644
--- a/clang/test/AST/ast-dump-recovery.cpp
+++ b/clang/test/AST/ast-dump-recovery.cpp
@@ -419,6 +419,11 @@ void InitializerOfInvalidDecl() {
   // CHECK:      VarDecl {{.*}} invalid InvalidDecl
   // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
   // CHECK-NEXT:   `-DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'ValidDecl'
+
+  Unknown InvalidDeclWithInvalidInit = Invalid;
+  // CHECK:      VarDecl {{.*}} invalid InvalidDeclWithInvalidInit
+  // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
+  // CHECK-NOT:    `-TypoExpr
 }
 
 void RecoverToAnInvalidDecl() {
diff --git a/clang/test/AST/ast-dump-types-errors.cpp b/clang/test/AST/ast-dump-types-errors.cpp
index b623fd049399..b40460611ac3 100644
--- a/clang/test/AST/ast-dump-types-errors.cpp
+++ b/clang/test/AST/ast-dump-types-errors.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-dump %s | FileCheck %s
 
 void test() {
   using ContainsErrors = int[sizeof(undef())];
diff --git a/clang/test/AST/ast-print-fp-pragmas.c b/clang/test/AST/ast-print-fp-pragmas.c
index 0200c2c8e1c9..cbc153af67c5 100644
--- a/clang/test/AST/ast-print-fp-pragmas.c
+++ b/clang/test/AST/ast-print-fp-pragmas.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -triple x86_64-pc-linux -ast-print %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux -ast-print %s -o - | FileCheck %s
 
 float func_1(float x, float y) {
 #pragma STDC FENV_ACCESS ON
diff --git a/clang/test/AST/ast-print-openacc-compute-construct.cpp b/clang/test/AST/ast-print-openacc-compute-construct.cpp
index cd39ea087b3c..0bfb90bcb587 100644
--- a/clang/test/AST/ast-print-openacc-compute-construct.cpp
+++ b/clang/test/AST/ast-print-openacc-compute-construct.cpp
@@ -1,8 +1,10 @@
-// RUN: %clang_cc1 -fopenacc -ast-print %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -Wno-openacc-deprecated-clause-alias -ast-print %s -o - | FileCheck %s
 
 void foo() {
   int i;
+  int *iPtr;
   float array[5];
+  float *arrayPtr[5];
 // CHECK: #pragma acc parallel default(none)
 // CHECK-NEXT: while (true)
 #pragma acc parallel default(none)
@@ -38,5 +40,72 @@ void foo() {
 // CHECK: #pragma acc parallel private(i, array[1], array, array[1:2])
 #pragma acc parallel private(i, array[1], array, array[1:2])
   while(true);
+
+// CHECK: #pragma acc parallel firstprivate(i, array[1], array, array[1:2])
+#pragma acc parallel firstprivate(i, array[1], array, array[1:2])
+  while(true);
+
+// CHECK: #pragma acc parallel no_create(i, array[1], array, array[1:2])
+#pragma acc parallel no_create(i, array[1], array, array[1:2])
+  while(true);
+
+// CHECK: #pragma acc parallel present(i, array[1], array, array[1:2])
+#pragma acc parallel present(i, array[1], array, array[1:2])
+  while(true);
+// CHECK: #pragma acc parallel no_create(i, array[1], array, array[1:2]) present(i, array[1], array, array[1:2])
+#pragma acc parallel no_create(i, array[1], array, array[1:2]) present(i, array[1], array, array[1:2])
+  while(true);
+
+// CHECK: #pragma acc parallel copyin(i, array[1], array, array[1:2]) pcopyin(readonly: i, array[1], array, array[1:2]) present_or_copyin(i, array[1], array, array[1:2])
+#pragma acc parallel copyin(i, array[1], array, array[1:2]) pcopyin(readonly:i, array[1], array, array[1:2]) present_or_copyin(i, array[1], array, array[1:2])
+  while(true);
+
+// CHECK: #pragma acc parallel copyout(i, array[1], array, array[1:2]) pcopyout(zero: i, array[1], array, array[1:2]) present_or_copyout(i, array[1], array, array[1:2])
+#pragma acc parallel copyout(i, array[1], array, array[1:2]) pcopyout(zero: i, array[1], array, array[1:2]) present_or_copyout(i, array[1], array, array[1:2])
+  while(true);
+
+// CHECK: #pragma acc parallel create(i, array[1], array, array[1:2]) pcreate(zero: i, array[1], array, array[1:2]) present_or_create(i, array[1], array, array[1:2])
+#pragma acc parallel create(i, array[1], array, array[1:2]) pcreate(zero: i, array[1], array, array[1:2]) present_or_create(i, array[1], array, array[1:2])
+  while(true);
+
+  // CHECK: #pragma acc serial attach(iPtr, arrayPtr[0])
+#pragma acc serial attach(iPtr, arrayPtr[0])
+  while(true);
+
+  // CHECK: #pragma acc kernels deviceptr(iPtr, arrayPtr[0])
+#pragma acc kernels deviceptr(iPtr, arrayPtr[0])
+  while(true);
+
+  // CHECK: #pragma acc kernels async(*iPtr)
+#pragma acc kernels async(*iPtr)
+  while(true);
+
+  // CHECK: #pragma acc kernels async
+#pragma acc kernels async
+  while(true);
+
+// CHECK: #pragma acc parallel wait
+#pragma acc parallel wait
+  while(true);
+
+// CHECK: #pragma acc parallel wait()
+#pragma acc parallel wait()
+  while(true);
+
+// CHECK: #pragma acc parallel wait(*iPtr, i)
+#pragma acc parallel wait(*iPtr, i)
+  while(true);
+
+// CHECK: #pragma acc parallel wait(queues: *iPtr, i)
+#pragma acc parallel wait(queues:*iPtr, i)
+  while(true);
+
+// CHECK: #pragma acc parallel wait(devnum: i : *iPtr, i)
+#pragma acc parallel wait(devnum:i:*iPtr, i)
+  while(true);
+
+// CHECK: #pragma acc parallel wait(devnum: i : queues: *iPtr, i)
+#pragma acc parallel wait(devnum:i:queues:*iPtr, i)
+  while(true);
 }
 
diff --git a/clang/test/AST/ast-print-pragmas.cpp b/clang/test/AST/ast-print-pragmas.cpp
index 5059c5710189..82d662a01549 100644
--- a/clang/test/AST/ast-print-pragmas.cpp
+++ b/clang/test/AST/ast-print-pragmas.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -ast-print %s -o - | FileCheck %s
-// RUN: %clang_cc1 -DMS_EXT -fsyntax-only -fms-extensions %s -triple x86_64-pc-win32 -ast-print | FileCheck %s --check-prefix=MS-EXT
+// RUN: %clang_cc1 -DMS_EXT -fms-extensions %s -triple x86_64-pc-win32 -ast-print | FileCheck %s --check-prefix=MS-EXT
 
 // CHECK: #pragma clang loop vectorize_width(4)
 // CHECK-NEXT: #pragma clang loop interleave_count(8){{$}}
diff --git a/clang/test/AST/attr-swift_attr.m b/clang/test/AST/attr-swift_attr.m
index 70e325723b21..6ea6775aa5a9 100644
--- a/clang/test/AST/attr-swift_attr.m
+++ b/clang/test/AST/attr-swift_attr.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -ast-dump %s | FileCheck %s
 
 __attribute__((swift_attr("@actor")))
 @interface View
diff --git a/clang/test/AST/attr-swift_bridge.m b/clang/test/AST/attr-swift_bridge.m
index 2caa86bef4c0..64cf5b834a16 100644
--- a/clang/test/AST/attr-swift_bridge.m
+++ b/clang/test/AST/attr-swift_bridge.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -ast-dump %s | FileCheck %s
 
 struct __attribute__((__swift_bridge__("BridgedS"))) S;
 // CHECK: RecordDecl {{.*}} struct S
diff --git a/clang/test/AST/attr-swift_bridged_typedef.m b/clang/test/AST/attr-swift_bridged_typedef.m
index 8c7c0987569e..0ea571c23a56 100644
--- a/clang/test/AST/attr-swift_bridged_typedef.m
+++ b/clang/test/AST/attr-swift_bridged_typedef.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -ast-dump %s | FileCheck %s
 
 typedef struct T TBridged __attribute((__swift_bridged_typedef__));
 // CHECK: TypedefDecl {{.*}} TBridged 'struct T'
diff --git a/clang/test/AST/attr-swift_bridged_typedef.mm b/clang/test/AST/attr-swift_bridged_typedef.mm
index 44fd022d5ea7..e7727a33ffe0 100644
--- a/clang/test/AST/attr-swift_bridged_typedef.mm
+++ b/clang/test/AST/attr-swift_bridged_typedef.mm
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only %s -ast-dump | FileCheck %s
+// RUN: %clang_cc1 %s -ast-dump | FileCheck %s
 
 @interface NSString
 @end
diff --git a/clang/test/AST/auto-pragma.cpp b/clang/test/AST/auto-pragma.cpp
index 1cd0781fe9a7..c22164c39be9 100644
--- a/clang/test/AST/auto-pragma.cpp
+++ b/clang/test/AST/auto-pragma.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only %s -std=c++11 -ast-dump -ast-dump-filter AutoVar | FileCheck %s
+// RUN: %clang_cc1 %s -std=c++11 -ast-dump -ast-dump-filter AutoVar | FileCheck %s
 
 namespace {
   class foo {
diff --git a/clang/test/AST/category-attribute.m b/clang/test/AST/category-attribute.m
index e74f1a1ffbc1..55835c7300ac 100644
--- a/clang/test/AST/category-attribute.m
+++ b/clang/test/AST/category-attribute.m
@@ -1,6 +1,6 @@
 // Test without serialization:
 // RUN: %clang_cc1 -fsyntax-only -verify %s
-// RUN: %clang_cc1 -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -ast-dump %s | FileCheck %s
 //
 // Test with serialization:
 // RUN: %clang_cc1 -emit-pch -o %t %s
diff --git a/clang/test/AST/const-fpfeatures.c b/clang/test/AST/const-fpfeatures.c
index 6600ea27405d..083350fdc8ce 100644
--- a/clang/test/AST/const-fpfeatures.c
+++ b/clang/test/AST/const-fpfeatures.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -emit-llvm -triple i386-linux -Wno-unknown-pragmas %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple i386-linux -Wno-unknown-pragmas %s -o - | FileCheck %s
 
 // nextUp(1.F) == 0x1.000002p0F
 
diff --git a/clang/test/AST/const-fpfeatures.cpp b/clang/test/AST/const-fpfeatures.cpp
index 9c807b34625f..95eb613df7f0 100644
--- a/clang/test/AST/const-fpfeatures.cpp
+++ b/clang/test/AST/const-fpfeatures.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -emit-llvm -triple i386-linux -std=c++2a -Wno-unknown-pragmas %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple i386-linux -std=c++2a -Wno-unknown-pragmas %s -o - | FileCheck %s
 
 // nextUp(1.F) == 0x1.000002p0F
 
diff --git a/clang/test/AST/coroutine-co_yield-source-range.cpp b/clang/test/AST/coroutine-co_yield-source-range.cpp
index c5766a6d7189..65fa245d90ad 100644
--- a/clang/test/AST/coroutine-co_yield-source-range.cpp
+++ b/clang/test/AST/coroutine-co_yield-source-range.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin9 %s -std=c++20 \
-// RUN:    -fsyntax-only -ast-dump | FileCheck %s
+// RUN:    -ast-dump | FileCheck %s
 
 #include "Inputs/std-coroutine.h"
 
diff --git a/clang/test/AST/coroutine-locals-cleanup.cpp b/clang/test/AST/coroutine-locals-cleanup.cpp
index 6264df01fa2a..a7f524b0b1ac 100644
--- a/clang/test/AST/coroutine-locals-cleanup.cpp
+++ b/clang/test/AST/coroutine-locals-cleanup.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -ast-dump %s | FileCheck %s
 
 #include "Inputs/std-coroutine.h"
 
diff --git a/clang/test/AST/coroutine-source-location-crash.cpp b/clang/test/AST/coroutine-source-location-crash.cpp
index fcf23d21d298..02dfdc29d72b 100644
--- a/clang/test/AST/coroutine-source-location-crash.cpp
+++ b/clang/test/AST/coroutine-source-location-crash.cpp
@@ -1,6 +1,6 @@
 // Test without serialization:
 // RUN: %clang_cc1 -triple x86_64-apple-darwin9 %s -std=c++20 \
-// RUN:    -fsyntax-only -ast-dump | FileCheck %s
+// RUN:    -ast-dump | FileCheck %s
 //
 // Test with serialization:
 // RUN: %clang_cc1 -triple x86_64-apple-darwin9 -std=c++20 -emit-pch -o %t %s
diff --git a/clang/test/AST/deduction-guides.cpp b/clang/test/AST/deduction-guides.cpp
index ffcde2b27016..d96c7e6bd5e3 100644
--- a/clang/test/AST/deduction-guides.cpp
+++ b/clang/test/AST/deduction-guides.cpp
@@ -1,5 +1,5 @@
 // Test without serialization:
-// RUN: %clang_cc1 -fsyntax-only %s -ast-dump -std=c++17 | FileCheck %s
+// RUN: %clang_cc1 %s -ast-dump -std=c++17 | FileCheck %s
 //
 // Test with serialization:
 // RUN: %clang_cc1 -std=c++17 -emit-pch -o %t %s
diff --git a/clang/test/AST/foreachtemplatized.mm b/clang/test/AST/foreachtemplatized.mm
index ab2770a7cefb..62b5a1f20aee 100644
--- a/clang/test/AST/foreachtemplatized.mm
+++ b/clang/test/AST/foreachtemplatized.mm
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fobjc-arc -Wno-objc-root-class -std=c++11 -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -fobjc-arc -Wno-objc-root-class -std=c++11 -ast-dump %s | FileCheck %s
 
 // CHECK-NOT: ImplicitValueInitExpr
 
diff --git a/clang/test/AST/loop-recovery.cpp b/clang/test/AST/loop-recovery.cpp
index 0561846c611f..b1e4ecf1c2ce 100644
--- a/clang/test/AST/loop-recovery.cpp
+++ b/clang/test/AST/loop-recovery.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++17 %s
-// RUN: not %clang_cc1 -fsyntax-only -ast-dump %s -std=c++17 | FileCheck %s
+// RUN: not %clang_cc1 -ast-dump %s -std=c++17 | FileCheck %s
 
 void test() {
   while(!!!) // expected-error {{expected expression}}
diff --git a/clang/test/AST/multistep-explicit-cast.c b/clang/test/AST/multistep-explicit-cast.c
index 7297b6285186..5d85669c22ab 100644
--- a/clang/test/AST/multistep-explicit-cast.c
+++ b/clang/test/AST/multistep-explicit-cast.c
@@ -1,5 +1,5 @@
 // Test without serialization:
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -ast-dump %s | FileCheck %s
 //
 // Test with serialization:
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-pch -o %t %s
diff --git a/clang/test/AST/multistep-explicit-cast.cpp b/clang/test/AST/multistep-explicit-cast.cpp
index d9dadde5539c..0901c485367c 100644
--- a/clang/test/AST/multistep-explicit-cast.cpp
+++ b/clang/test/AST/multistep-explicit-cast.cpp
@@ -1,5 +1,5 @@
 // Test without serialization:
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -ast-dump %s | FileCheck %s
 //
 // Test with serialization:
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-pch -o %t %s
diff --git a/clang/test/AST/objc-default-ctor-init.mm b/clang/test/AST/objc-default-ctor-init.mm
index a14a243a31cc..a01dcd790b9a 100644
--- a/clang/test/AST/objc-default-ctor-init.mm
+++ b/clang/test/AST/objc-default-ctor-init.mm
@@ -1,21 +1,21 @@
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.10 -std=c++11 -ast-dump %s | FileCheck %s
-// CHECK: CXXCtorInitializer Field {{.*}} 'ptr' 'void *'
-// CHECK: CXXCtorInitializer Field {{.*}} 'q' 'Q'
-
-@interface NSObject
-@end
-
-@interface I : NSObject
-@end
-
-struct Q { Q(); };
-
-struct S {
-  S();
-  void *ptr = nullptr;
-  Q q;
-};
-
-@implementation I
-S::S() {}
-@end
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.10 -std=c++11 -ast-dump %s | FileCheck %s
+// CHECK: CXXCtorInitializer Field {{.*}} 'ptr' 'void *'
+// CHECK: CXXCtorInitializer Field {{.*}} 'q' 'Q'
+
+@interface NSObject
+@end
+
+@interface I : NSObject
+@end
+
+struct Q { Q(); };
+
+struct S {
+  S();
+  void *ptr = nullptr;
+  Q q;
+};
+
+@implementation I
+S::S() {}
+@end
diff --git a/clang/test/AST/pr43983.cpp b/clang/test/AST/pr43983.cpp
index c27d825dbb1b..f11df5545f28 100644
--- a/clang/test/AST/pr43983.cpp
+++ b/clang/test/AST/pr43983.cpp
@@ -1,5 +1,5 @@
 // Test without serialization:
-// RUN: %clang_cc1 -fsyntax-only %s -ast-dump | FileCheck %s
+// RUN: %clang_cc1 %s -ast-dump | FileCheck %s
 //
 // Test with serialization:
 // RUN: %clang_cc1 -emit-pch -o %t %s
diff --git a/clang/test/AST/pr47636.cpp b/clang/test/AST/pr47636.cpp
index 5311edb892c5..28c2b7802fe6 100644
--- a/clang/test/AST/pr47636.cpp
+++ b/clang/test/AST/pr47636.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only %s -ast-dump | FileCheck %s
+// RUN: %clang_cc1 %s -ast-dump | FileCheck %s
 
 int(&&intu_rvref)[] {1,2,3,4};
 // CHECK: VarDecl 0x[[GLOB_ADDR:[0-9a-f]+]] {{.*}} intu_rvref 'int (&&)[4]' listinit
diff --git a/clang/test/AST/pragma-attribute-cxx-subject-match-rules.cpp b/clang/test/AST/pragma-attribute-cxx-subject-match-rules.cpp
index 18dfb43a384d..37283c8ad90c 100644
--- a/clang/test/AST/pragma-attribute-cxx-subject-match-rules.cpp
+++ b/clang/test/AST/pragma-attribute-cxx-subject-match-rules.cpp
@@ -1,17 +1,17 @@
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=namespace" %s | FileCheck --check-prefix=CHECK-NAMESPACE %s
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=type_alias" %s | FileCheck --check-prefix=CHECK-TYPE_ALIAS %s
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=enum" %s | FileCheck --check-prefix=CHECK-ENUM %s
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=enum_constant" %s | FileCheck --check-prefix=CHECK-ENUM_CONSTANT %s
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=record" %s | FileCheck --check-prefix=CHECK-RECORD %s
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=record(unless(is_union))" %s | FileCheck --check-prefix=CHECK-RECORD_UNLESS_IS_UNION %s
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=field" %s | FileCheck --check-prefix=CHECK-FIELD %s
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=function" %s | FileCheck --check-prefix=CHECK-FUNCTION %s
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=hasType(functionType)" %s | FileCheck --check-prefix=CHECK-HAS_TYPE_FUNCTION_TYPE %s
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=function(is_member)" %s | FileCheck --check-prefix=CHECK-FUNCTION_IS_MEMBER %s
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=variable" %s | FileCheck --check-prefix=CHECK-VARIABLE %s
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=variable(is_global)" %s | FileCheck --check-prefix=CHECK-VARIABLE_IS_GLOBAL %s
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=variable(is_parameter)" %s | FileCheck --check-prefix=CHECK-VARIABLE_IS_PARAMETER %s
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=variable(unless(is_parameter))" %s | FileCheck --check-prefix=CHECK-VARIABLE_UNLESS_IS_PARAMETER %s
+// RUN: %clang_cc1 -std=c++11 -ast-dump -ast-dump-filter test "-DSUBJECT=namespace" %s | FileCheck --check-prefix=CHECK-NAMESPACE %s
+// RUN: %clang_cc1 -std=c++11 -ast-dump -ast-dump-filter test "-DSUBJECT=type_alias" %s | FileCheck --check-prefix=CHECK-TYPE_ALIAS %s
+// RUN: %clang_cc1 -std=c++11 -ast-dump -ast-dump-filter test "-DSUBJECT=enum" %s | FileCheck --check-prefix=CHECK-ENUM %s
+// RUN: %clang_cc1 -std=c++11 -ast-dump -ast-dump-filter test "-DSUBJECT=enum_constant" %s | FileCheck --check-prefix=CHECK-ENUM_CONSTANT %s
+// RUN: %clang_cc1 -std=c++11 -ast-dump -ast-dump-filter test "-DSUBJECT=record" %s | FileCheck --check-prefix=CHECK-RECORD %s
+// RUN: %clang_cc1 -std=c++11 -ast-dump -ast-dump-filter test "-DSUBJECT=record(unless(is_union))" %s | FileCheck --check-prefix=CHECK-RECORD_UNLESS_IS_UNION %s
+// RUN: %clang_cc1 -std=c++11 -ast-dump -ast-dump-filter test "-DSUBJECT=field" %s | FileCheck --check-prefix=CHECK-FIELD %s
+// RUN: %clang_cc1 -std=c++11 -ast-dump -ast-dump-filter test "-DSUBJECT=function" %s | FileCheck --check-prefix=CHECK-FUNCTION %s
+// RUN: %clang_cc1 -std=c++11 -ast-dump -ast-dump-filter test "-DSUBJECT=hasType(functionType)" %s | FileCheck --check-prefix=CHECK-HAS_TYPE_FUNCTION_TYPE %s
+// RUN: %clang_cc1 -std=c++11 -ast-dump -ast-dump-filter test "-DSUBJECT=function(is_member)" %s | FileCheck --check-prefix=CHECK-FUNCTION_IS_MEMBER %s
+// RUN: %clang_cc1 -std=c++11 -ast-dump -ast-dump-filter test "-DSUBJECT=variable" %s | FileCheck --check-prefix=CHECK-VARIABLE %s
+// RUN: %clang_cc1 -std=c++11 -ast-dump -ast-dump-filter test "-DSUBJECT=variable(is_global)" %s | FileCheck --check-prefix=CHECK-VARIABLE_IS_GLOBAL %s
+// RUN: %clang_cc1 -std=c++11 -ast-dump -ast-dump-filter test "-DSUBJECT=variable(is_parameter)" %s | FileCheck --check-prefix=CHECK-VARIABLE_IS_PARAMETER %s
+// RUN: %clang_cc1 -std=c++11 -ast-dump -ast-dump-filter test "-DSUBJECT=variable(unless(is_parameter))" %s | FileCheck --check-prefix=CHECK-VARIABLE_UNLESS_IS_PARAMETER %s
 
 #pragma clang attribute push (__attribute__((annotate("test"))), apply_to = any(SUBJECT))
 
diff --git a/clang/test/AST/pragma-attribute-objc-subject-match-rules.m b/clang/test/AST/pragma-attribute-objc-subject-match-rules.m
index f2a8bdde6d0c..ef5e6bf783c6 100644
--- a/clang/test/AST/pragma-attribute-objc-subject-match-rules.m
+++ b/clang/test/AST/pragma-attribute-objc-subject-match-rules.m
@@ -1,12 +1,12 @@
-// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -fsyntax-only -ast-dump "-DSUBJECT=objc_interface" %s | FileCheck --check-prefix=CHECK-OBJC_INTERFACE %s
-// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=objc_protocol" %s | FileCheck --check-prefix=CHECK-OBJC_PROTOCOL %s
-// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -fsyntax-only -ast-dump "-DSUBJECT=objc_category" %s | FileCheck --check-prefix=CHECK-OBJC_CATEGORY %s
-// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=objc_method" %s | FileCheck --check-prefix=CHECK-OBJC_METHOD %s
-// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=objc_method(is_instance)" %s | FileCheck --check-prefix=CHECK-OBJC_METHOD_IS_INSTANCE %s
-// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=field" %s | FileCheck --check-prefix=CHECK-FIELD %s
-// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=objc_property" %s | FileCheck --check-prefix=CHECK-OBJC_PROPERTY %s
-// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=block" %s | FileCheck --check-prefix=CHECK-BLOCK %s
-// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -fsyntax-only -ast-dump -ast-dump-filter test "-DSUBJECT=hasType(functionType)" %s | FileCheck --check-prefix=CHECK-HAS_TYPE_FUNCTION_TYPE %s
+// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -ast-dump "-DSUBJECT=objc_interface" %s | FileCheck --check-prefix=CHECK-OBJC_INTERFACE %s
+// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -ast-dump -ast-dump-filter test "-DSUBJECT=objc_protocol" %s | FileCheck --check-prefix=CHECK-OBJC_PROTOCOL %s
+// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -ast-dump "-DSUBJECT=objc_category" %s | FileCheck --check-prefix=CHECK-OBJC_CATEGORY %s
+// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -ast-dump -ast-dump-filter test "-DSUBJECT=objc_method" %s | FileCheck --check-prefix=CHECK-OBJC_METHOD %s
+// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -ast-dump -ast-dump-filter test "-DSUBJECT=objc_method(is_instance)" %s | FileCheck --check-prefix=CHECK-OBJC_METHOD_IS_INSTANCE %s
+// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -ast-dump -ast-dump-filter test "-DSUBJECT=field" %s | FileCheck --check-prefix=CHECK-FIELD %s
+// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -ast-dump -ast-dump-filter test "-DSUBJECT=objc_property" %s | FileCheck --check-prefix=CHECK-OBJC_PROPERTY %s
+// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -ast-dump -ast-dump-filter test "-DSUBJECT=block" %s | FileCheck --check-prefix=CHECK-BLOCK %s
+// RUN: %clang_cc1 -fblocks -fobjc-arc -Wno-objc-root-class -ast-dump -ast-dump-filter test "-DSUBJECT=hasType(functionType)" %s | FileCheck --check-prefix=CHECK-HAS_TYPE_FUNCTION_TYPE %s
 
 #pragma clang attribute push (__attribute__((annotate("test"))), apply_to = any(SUBJECT))
 
diff --git a/clang/test/AST/pragma-multiple-attributes-declspec.cpp b/clang/test/AST/pragma-multiple-attributes-declspec.cpp
index 68d697fbf3ef..60722b3c886a 100644
--- a/clang/test/AST/pragma-multiple-attributes-declspec.cpp
+++ b/clang/test/AST/pragma-multiple-attributes-declspec.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple i386-pc-win32 -fms-extensions -fms-compatibility -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -triple i386-pc-win32 -fms-extensions -fms-compatibility -ast-dump %s | FileCheck %s
 
 #pragma clang attribute push (__declspec(dllexport, noinline), apply_to=function)
 void func1();
diff --git a/clang/test/AST/pragma-multiple-attributes.cpp b/clang/test/AST/pragma-multiple-attributes.cpp
index b717b3a15864..db3824913728 100644
--- a/clang/test/AST/pragma-multiple-attributes.cpp
+++ b/clang/test/AST/pragma-multiple-attributes.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -ast-dump %s | FileCheck %s
 
 #pragma clang attribute push (__attribute__((disable_sanitizer_instrumentation, annotate("test1"))), apply_to=variable(is_global))
 int var1;
diff --git a/clang/test/AST/spurious-regparm.c b/clang/test/AST/spurious-regparm.c
index 4ae23f017241..d7b09ec8f87e 100644
--- a/clang/test/AST/spurious-regparm.c
+++ b/clang/test/AST/spurious-regparm.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple armv8.1m.main-eabi -mcmse -fsyntax-only %s -ast-dump | FileCheck %s
+// RUN: %clang_cc1 -triple armv8.1m.main-eabi -mcmse %s -ast-dump | FileCheck %s
 // REQUIRES: arm-registered-target
 typedef int (*fn_t)(int) __attribute__((cmse_nonsecure_call));
 // CHECK-NOT: regparm 0
diff --git a/clang/test/AST/template-implicit-vars.cpp b/clang/test/AST/template-implicit-vars.cpp
index fb3e45d77e05..846c4d95c91a 100644
--- a/clang/test/AST/template-implicit-vars.cpp
+++ b/clang/test/AST/template-implicit-vars.cpp
@@ -1,5 +1,5 @@
 // Test without serialization:
-// RUN: %clang_cc1 -fsyntax-only %s -std=c++11 -ast-dump | FileCheck %s
+// RUN: %clang_cc1 %s -std=c++11 -ast-dump | FileCheck %s
 //
 // Test with serialization:
 // RUN: %clang_cc1 -std=c++11 -emit-pch -o %t %s
diff --git a/clang/test/ASTMerge/codegen-exprs/test.c b/clang/test/ASTMerge/codegen-exprs/test.c
index b5069f993be5..47889b91742e 100644
--- a/clang/test/ASTMerge/codegen-exprs/test.c
+++ b/clang/test/ASTMerge/codegen-exprs/test.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -emit-pch -o %t.1.ast %S/Inputs/exprs1.c
 // RUN: %clang_cc1 -triple %itanium_abi_triple -emit-pch -o %t.2.ast %S/Inputs/exprs2.c
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-obj -o /dev/null -ast-merge %t.1.ast -ast-merge %t.2.ast -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -o /dev/null -ast-merge %t.1.ast -ast-merge %t.2.ast -fsyntax-only -verify %s
 // expected-no-diagnostics
 
diff --git a/clang/test/Analysis/Checkers/WebKit/call-args-wtf-containers.cpp b/clang/test/Analysis/Checkers/WebKit/call-args-wtf-containers.cpp
index 0a63a7898561..17e25d9a6270 100644
--- a/clang/test/Analysis/Checkers/WebKit/call-args-wtf-containers.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/call-args-wtf-containers.cpp
@@ -4,6 +4,92 @@
 
 namespace WTF {
 
+  constexpr unsigned long notFound = static_cast<unsigned long>(-1);
+
+  class String;
+  class StringImpl;
+
+  class StringView {
+  public:
+    StringView(const String&);
+  private:
+    RefPtr<StringImpl> m_impl;
+  };
+
+  class StringImpl {
+  public:
+    void ref() const { ++m_refCount; }
+    void deref() const {
+      if (!--m_refCount)
+        delete this;
+    }
+
+    static constexpr unsigned s_flagIs8Bit = 1u << 0;
+    bool is8Bit() const { return m_hashAndFlags & s_flagIs8Bit; }
+    const char* characters8() const { return m_char8; }
+    const short* characters16() const { return m_char16; }
+    unsigned length() const { return m_length; }
+    Ref<StringImpl> substring(unsigned position, unsigned length) const;
+
+    unsigned long find(char) const;
+    unsigned long find(StringView) const;
+    unsigned long contains(StringView) const;
+    unsigned long findIgnoringASCIICase(StringView) const;
+
+    bool startsWith(StringView) const;
+    bool startsWithIgnoringASCIICase(StringView) const;
+    bool endsWith(StringView) const;
+    bool endsWithIgnoringASCIICase(StringView) const;
+
+  private:
+    mutable unsigned m_refCount { 0 };
+    unsigned m_length { 0 };
+    union {
+      const char* m_char8;
+      const short* m_char16;
+    };
+    unsigned m_hashAndFlags { 0 };
+  };
+
+  class String {
+  public:
+    String() = default;
+    String(StringImpl& impl) : m_impl(&impl) { }
+    String(StringImpl* impl) : m_impl(impl) { }
+    String(Ref<StringImpl>&& impl) : m_impl(impl.get()) { }
+    StringImpl* impl() { return m_impl.get(); }
+    unsigned length() const { return m_impl ? m_impl->length() : 0; }
+    const char* characters8() const { return m_impl ? m_impl->characters8() : nullptr; }
+    const short* characters16() const { return m_impl ? m_impl->characters16() : nullptr; }
+
+    bool is8Bit() const { return !m_impl || m_impl->is8Bit(); }
+
+    unsigned long find(char character) const { return m_impl ? m_impl->find(character) : notFound; }
+    unsigned long find(StringView str) const { return m_impl ? m_impl->find(str) : notFound; }
+    unsigned long findIgnoringASCIICase(StringView) const;
+
+    bool contains(char character) const { return find(character) != notFound; }
+    bool contains(StringView) const;
+    bool containsIgnoringASCIICase(StringView) const;
+
+    bool startsWith(StringView) const;
+    bool startsWithIgnoringASCIICase(StringView) const;
+    bool endsWith(StringView) const;
+    bool endsWithIgnoringASCIICase(StringView) const;
+
+    String substring(unsigned position, unsigned length) const
+    {
+      if (!m_impl)
+        return { };
+      if (!position && length >= m_impl->length())
+        return *this;
+      return m_impl->substring(position, length);
+    }
+
+  private:
+    RefPtr<StringImpl> m_impl;
+  };
+
   template <typename T>
   class HashSet {
   public:
@@ -89,6 +175,9 @@ namespace WTF {
 
 }
 
+using WTF::StringView;
+using WTF::StringImpl;
+using WTF::String;
 using WTF::HashSet;
 using WTF::HashMap;
 using WTF::WeakHashSet;
@@ -101,8 +190,37 @@ public:
 };
 
 RefCounted* object();
+StringImpl* strImpl();
+String* str();
+StringView strView();
 
 void test() {
+  strImpl()->is8Bit();
+  strImpl()->characters8();
+  strImpl()->characters16();
+  strImpl()->length();
+  strImpl()->substring(2, 4);
+  strImpl()->find(strView());
+  strImpl()->contains(strView());
+  strImpl()->findIgnoringASCIICase(strView());
+  strImpl()->startsWith(strView());
+  strImpl()->startsWithIgnoringASCIICase(strView());
+  strImpl()->endsWith(strView());
+  strImpl()->endsWithIgnoringASCIICase(strView());
+
+  str()->is8Bit();
+  str()->characters8();
+  str()->characters16();
+  str()->length();
+  str()->substring(2, 4);
+  str()->find(strView());
+  str()->contains(strView());
+  str()->findIgnoringASCIICase(strView());
+  str()->startsWith(strView());
+  str()->startsWithIgnoringASCIICase(strView());
+  str()->endsWith(strView());
+  str()->endsWithIgnoringASCIICase(strView());
+
   HashSet<RefPtr<RefCounted>> set;
   set.find(*object());
   set.contains(*object());
diff --git a/clang/test/Analysis/Checkers/WebKit/call-args.cpp b/clang/test/Analysis/Checkers/WebKit/call-args.cpp
index 2a4b6bb1f106..e1bee8a23a25 100644
--- a/clang/test/Analysis/Checkers/WebKit/call-args.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/call-args.cpp
@@ -313,6 +313,17 @@ namespace default_arg {
   }
 }
 
+namespace cxx_member_func {
+  Ref<RefCountable> provideProtected();
+  void foo() {
+    provide()->trivial();
+    provide()->method();
+    // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+    provideProtected()->method();
+    (provideProtected())->method();
+  };
+}
+
 namespace cxx_member_operator_call {
   // The hidden this-pointer argument without a corresponding parameter caused couple bugs in parameter <-> argument attribution.
   struct Foo {
@@ -333,3 +344,17 @@ namespace cxx_member_operator_call {
     // expected-warning@-1{{Call argument for parameter 'bad' is uncounted and unsafe}}
   }
 }
+
+namespace call_with_ptr_on_ref {
+  Ref<RefCountable> provideProtected();
+  void bar(RefCountable* bad);
+  bool baz();
+  void foo(bool v) {
+    bar(v ? nullptr : provideProtected().ptr());
+    bar(baz() ? provideProtected().ptr() : nullptr);
+    bar(v ? provide() : provideProtected().ptr());
+    // expected-warning@-1{{Call argument for parameter 'bad' is uncounted and unsafe}}
+    bar(v ? provideProtected().ptr() : provide());
+    // expected-warning@-1{{Call argument for parameter 'bad' is uncounted and unsafe}}
+  }
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/mock-system-header.h b/clang/test/Analysis/Checkers/WebKit/mock-system-header.h
new file mode 100644
index 000000000000..a1d30957b19c
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/mock-system-header.h
@@ -0,0 +1,17 @@
+#pragma clang system_header
+
+template <typename T, typename CreateFunction>
+void callMethod(CreateFunction createFunction) {
+  createFunction()->method();
+}
+
+template <typename T, typename CreateFunction>
+inline void localVar(CreateFunction createFunction) {
+  T* obj = createFunction();
+  obj->method();
+}
+
+template <typename T>
+struct MemberVariable {
+    T* obj { nullptr };
+};
diff --git a/clang/test/Analysis/Checkers/WebKit/mock-types.h b/clang/test/Analysis/Checkers/WebKit/mock-types.h
index c27ea9baaf3b..c427b22fd683 100644
--- a/clang/test/Analysis/Checkers/WebKit/mock-types.h
+++ b/clang/test/Analysis/Checkers/WebKit/mock-types.h
@@ -47,7 +47,7 @@ template <typename T, typename PtrTraits = RawPtrTraits<T>, typename RefDerefTra
   typename PtrTraits::StorageType t;
 
   Ref() : t{} {};
-  Ref(T &t) : t(RefDerefTraits::refIfNotNull(t)) { }
+  Ref(T &t) : t(&RefDerefTraits::ref(t)) { }
   Ref(const Ref& o) : t(RefDerefTraits::refIfNotNull(PtrTraits::unwrap(o.t))) { }
   ~Ref() { RefDerefTraits::derefIfNotNull(PtrTraits::exchange(t, nullptr)); }
   T &get() { return *PtrTraits::unwrap(t); }
@@ -55,7 +55,7 @@ template <typename T, typename PtrTraits = RawPtrTraits<T>, typename RefDerefTra
   T *operator->() { return PtrTraits::unwrap(t); }
   operator const T &() const { return *PtrTraits::unwrap(t); }
   operator T &() { return *PtrTraits::unwrap(t); }
-  T* leakRef() { PtrTraits::exchange(t, nullptr); }
+  T* leakRef() { return PtrTraits::exchange(t, nullptr); }
 };
 
 template <typename T> struct RefPtr {
@@ -67,6 +67,9 @@ template <typename T> struct RefPtr {
     if (t)
       t->ref();
   }
+  RefPtr(Ref<T>&& o)
+    : t(o.leakRef())
+  { }
   ~RefPtr() {
     if (t)
       t->deref();
@@ -76,7 +79,7 @@ template <typename T> struct RefPtr {
   const T *operator->() const { return t; }
   T &operator*() { return *t; }
   RefPtr &operator=(T *) { return *this; }
-  operator bool() { return t; }
+  operator bool() const { return t; }
 };
 
 template <typename T> bool operator==(const RefPtr<T> &, const RefPtr<T> &) {
diff --git a/clang/test/Analysis/Checkers/WebKit/ref-cntbl-base-virtual-dtor-templates.cpp b/clang/test/Analysis/Checkers/WebKit/ref-cntbl-base-virtual-dtor-templates.cpp
index 3338fa9368e4..eeb62d5d89ec 100644
--- a/clang/test/Analysis/Checkers/WebKit/ref-cntbl-base-virtual-dtor-templates.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/ref-cntbl-base-virtual-dtor-templates.cpp
@@ -28,3 +28,63 @@ struct DerivedClassTmpl3 : T { };
 
 typedef DerivedClassTmpl3<RefCntblBase> Foo;
 Foo c;
+
+
+namespace WTF {
+
+class RefCountedBase {
+public:
+  void ref() const { ++count; }
+
+protected:
+  bool derefBase() const
+  {
+    return !--count;
+  }
+
+private:
+  mutable unsigned count;
+};
+
+template <typename T>
+class RefCounted : public RefCountedBase {
+public:
+  void deref() const {
+    if (derefBase())
+      delete const_cast<T*>(static_cast<const T*>(this));
+  }
+
+protected:
+  RefCounted() { }
+};
+
+template <typename T>
+class ThreadSafeRefCounted {
+public:
+  void ref() const;
+  bool deref() const;
+};
+
+template <typename T>
+class ThreadSafeRefCountedAndCanMakeThreadSafeWeakPtr {
+public:
+  void ref() const;
+  bool deref() const;
+};
+
+} // namespace WTF
+
+class DerivedClass4 : public WTF::RefCounted<DerivedClass4> { };
+
+class DerivedClass5 : public DerivedClass4 { };
+// expected-warning@-1{{Class 'DerivedClass4' is used as a base of class 'DerivedClass5' but doesn't have virtual destructor}}
+
+class DerivedClass6 : public WTF::ThreadSafeRefCounted<DerivedClass6> { };
+
+class DerivedClass7 : public DerivedClass6 { };
+// expected-warning@-1{{Class 'DerivedClass6' is used as a base of class 'DerivedClass7' but doesn't have virtual destructor}}
+
+class DerivedClass8 : public WTF::ThreadSafeRefCountedAndCanMakeThreadSafeWeakPtr<DerivedClass8> { };
+
+class DerivedClass9 : public DerivedClass8 { };
+// expected-warning@-1{{Class 'DerivedClass8' is used as a base of class 'DerivedClass9' but doesn't have virtual destructor}}
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
index 00673e91f471..632a82eb0d8d 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
@@ -1,6 +1,7 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedLocalVarsChecker -verify %s
 
 #include "mock-types.h"
+#include "mock-system-header.h"
 
 void someFunction();
 
@@ -187,3 +188,31 @@ void bar() {
 }
 
 } // namespace ignore_for_if
+
+namespace ignore_system_headers {
+
+RefCountable *provide_ref_ctnbl();
+
+void system_header() {
+  localVar<RefCountable>(provide_ref_ctnbl);
+}
+
+} // ignore_system_headers
+
+namespace conditional_op {
+RefCountable *provide_ref_ctnbl();
+bool bar();
+
+void foo() {
+  RefCountable *a = bar() ? nullptr : provide_ref_ctnbl();
+  // expected-warning@-1{{Local variable 'a' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+  RefPtr<RefCountable> b = provide_ref_ctnbl();
+  {
+    RefCountable* c = bar() ? nullptr : b.get();
+    c->method();
+    RefCountable* d = bar() ? b.get() : nullptr;
+    d->method();
+  }
+}
+
+} // namespace conditional_op
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-members.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-members.cpp
index 108d5effdd2e..bca7b3bad3a1 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-members.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-members.cpp
@@ -1,6 +1,7 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=webkit.NoUncountedMemberChecker -verify %s
 
 #include "mock-types.h"
+#include "mock-system-header.h"
 
 namespace members {
   struct Foo {
@@ -50,3 +51,12 @@ namespace ignore_unions {
 
   void forceTmplToInstantiate(RefPtr<RefCountable>) {}
 }
+
+namespace ignore_system_header {
+
+void foo(RefCountable* t) {
+  MemberVariable<RefCountable> var { t };
+  var.obj->method();
+}
+
+} // ignore_system_header
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
index 63a68a994a5c..073f3252160e 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
@@ -1,6 +1,7 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
 
 #include "mock-types.h"
+#include "mock-system-header.h"
 
 void WTFBreakpointTrap();
 void WTFCrashWithInfo(int, const char*, const char*, int);
@@ -58,6 +59,18 @@ void WTFCrashWithInfo(int line, const char* file, const char* function, int coun
     WTFCrashWithInfoImpl(line, file, function, counter, wtfCrashArg(reason));
 }
 
+template<typename ToType, typename FromType>
+ToType bitwise_cast(FromType from);
+
+template<typename T>
+T* addressof(T& arg);
+
+bool isMainThread();
+bool isMainThreadOrGCThread();
+bool isMainRunLoop();
+bool isWebThread();
+bool isUIThread();
+
 enum class Flags : unsigned short {
   Flag1 = 1 << 0,
   Flag2 = 1 << 1,
@@ -137,16 +150,35 @@ public:
   Number(int v) : v(v) { }
   Number(double);
   Number operator+(const Number&);
+  Number& operator++() { ++v; return *this; }
+  Number operator++(int) { Number returnValue(v); ++v; return returnValue; }
   const int& value() const { return v; }
+  void someMethod();
+
 private:
   int v;
 };
 
+class ComplexNumber {
+public:
+  ComplexNumber() : real(0), complex(0) { }
+  ComplexNumber(const ComplexNumber&);
+  ComplexNumber& operator++() { real.someMethod(); return *this; }
+  ComplexNumber operator++(int);
+  ComplexNumber& operator<<(int);
+  ComplexNumber& operator+();
+
+private:
+  Number real;
+  Number complex;
+};
+
 class RefCounted {
 public:
   void ref() const;
   void deref() const;
 
+  void method();
   void someFunction();
   int otherFunction();
 
@@ -208,6 +240,22 @@ public:
   unsigned trivial32() { return sizeof(int); }
   unsigned trivial33() { return ~0xff; }
   template <unsigned v> unsigned trivial34() { return v; }
+  void trivial35() { v++; }
+  void trivial36() { ++(*number); }
+  void trivial37() { (*number)++; }
+  void trivial38() { v++; if (__builtin_expect(!!(number), 1)) (*number)++; }
+  int trivial39() { return -v; }
+  int trivial40() { return v << 2; }
+  unsigned trivial41() { v = ++s_v; return v; }
+  unsigned trivial42() { return bitwise_cast<unsigned long>(nullptr); }
+  Number* trivial43() { return addressof(*number); }
+  Number* trivial44() { return new Number(1); }
+  ComplexNumber* trivial45() { return new ComplexNumber(); }
+  void trivial46() { ASSERT(isMainThread()); }
+  void trivial47() { ASSERT(isMainThreadOrGCThread()); }
+  void trivial48() { ASSERT(isMainRunLoop()); }
+  void trivial49() { ASSERT(isWebThread()); }
+  void trivial50() { ASSERT(isUIThread()); }
 
   static RefCounted& singleton() {
     static RefCounted s_RefCounted;
@@ -282,12 +330,21 @@ public:
 
   int nonTrivial13() { return ~otherFunction(); }
   int nonTrivial14() { int r = 0xff; r |= otherFunction(); return r; }
+  void nonTrivial15() { ++complex; }
+  void nonTrivial16() { complex++; }
+  ComplexNumber nonTrivial17() { return complex << 2; }
+  ComplexNumber nonTrivial18() { return +complex; }
+  ComplexNumber* nonTrivial19() { return new ComplexNumber(complex); }
 
+  static unsigned s_v;
   unsigned v { 0 };
   Number* number { nullptr };
+  ComplexNumber complex;
   Enum enumValue { Enum::Value1 };
 };
 
+unsigned RefCounted::s_v = 0;
+
 RefCounted* refCountedObj();
 
 void test()
@@ -340,6 +397,22 @@ public:
     getFieldTrivial().trivial32(); // no-warning
     getFieldTrivial().trivial33(); // no-warning
     getFieldTrivial().trivial34<7>(); // no-warning
+    getFieldTrivial().trivial35(); // no-warning
+    getFieldTrivial().trivial36(); // no-warning
+    getFieldTrivial().trivial37(); // no-warning
+    getFieldTrivial().trivial38(); // no-warning
+    getFieldTrivial().trivial39(); // no-warning
+    getFieldTrivial().trivial40(); // no-warning
+    getFieldTrivial().trivial41(); // no-warning
+    getFieldTrivial().trivial42(); // no-warning
+    getFieldTrivial().trivial43(); // no-warning
+    getFieldTrivial().trivial44(); // no-warning
+    getFieldTrivial().trivial45(); // no-warning
+    getFieldTrivial().trivial46(); // no-warning
+    getFieldTrivial().trivial47(); // no-warning
+    getFieldTrivial().trivial48(); // no-warning
+    getFieldTrivial().trivial49(); // no-warning
+    getFieldTrivial().trivial50(); // no-warning
 
     RefCounted::singleton().trivial18(); // no-warning
     RefCounted::singleton().someFunction(); // no-warning
@@ -374,6 +447,16 @@ public:
     // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
     getFieldTrivial().nonTrivial14();
     // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+    getFieldTrivial().nonTrivial15();
+    // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+    getFieldTrivial().nonTrivial16();
+    // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+    getFieldTrivial().nonTrivial17();
+    // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+    getFieldTrivial().nonTrivial18();
+    // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+    getFieldTrivial().nonTrivial19();
+    // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
   }
 };
 
@@ -399,3 +482,7 @@ void someFunction(const RefCounted&);
 void test2() {
     someFunction(*object());
 }
+
+void system_header() {
+  callMethod<RefCountable>(object);
+}
diff --git a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h
index 1c2be322f83c..29326ec1f928 100644
--- a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h
+++ b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h
@@ -1260,6 +1260,13 @@ template<
     iterator end() const { return iterator(val + 1); }
 };
 
+template <typename T>
+class atomic {
+public:
+  T operator++();
+  T operator--();
+};
+
 namespace execution {
 class sequenced_policy {};
 }
diff --git a/clang/test/Analysis/NewDelete-atomics.cpp b/clang/test/Analysis/NewDelete-atomics.cpp
index 54fce17ea7bd..1425acab7489 100644
--- a/clang/test/Analysis/NewDelete-atomics.cpp
+++ b/clang/test/Analysis/NewDelete-atomics.cpp
@@ -20,7 +20,7 @@ typedef enum memory_order {
   memory_order_seq_cst = __ATOMIC_SEQ_CST
 } memory_order;
 
-class Obj {
+class RawObj {
   int RefCnt;
 
 public:
@@ -37,11 +37,27 @@ public:
   void foo();
 };
 
+class StdAtomicObj {
+  std::atomic<int> RefCnt;
+
+public:
+  int incRef() {
+    return ++RefCnt;
+  }
+
+  int decRef() {
+    return --RefCnt;
+  }
+
+  void foo();
+};
+
+template <typename T>
 class IntrusivePtr {
-  Obj *Ptr;
+  T *Ptr;
 
 public:
-  IntrusivePtr(Obj *Ptr) : Ptr(Ptr) {
+  IntrusivePtr(T *Ptr) : Ptr(Ptr) {
     Ptr->incRef();
   }
 
@@ -55,22 +71,106 @@ public:
       delete Ptr;
   }
 
-  Obj *getPtr() const { return Ptr; } // no-warning
+  T *getPtr() const { return Ptr; } // no-warning
+};
+
+// Also IntrusivePtr but let's dodge name-based heuristics.
+template <typename T>
+class DifferentlyNamed {
+  T *Ptr;
+
+public:
+  DifferentlyNamed(T *Ptr) : Ptr(Ptr) {
+    Ptr->incRef();
+  }
+
+  DifferentlyNamed(const DifferentlyNamed &Other) : Ptr(Other.Ptr) {
+    Ptr->incRef();
+  }
+
+  ~DifferentlyNamed() {
+  // We should not take the path on which the object is deleted.
+    if (Ptr->decRef() == 1)
+      delete Ptr;
+  }
+
+  T *getPtr() const { return Ptr; } // no-warning
 };
 
 void testDestroyLocalRefPtr() {
-  IntrusivePtr p1(new Obj());
+  IntrusivePtr<RawObj> p1(new RawObj());
+  {
+    IntrusivePtr<RawObj> p2(p1);
+  }
+
+  // p1 still maintains ownership. The object is not deleted.
+  p1.getPtr()->foo(); // no-warning
+}
+
+void testDestroySymbolicRefPtr(const IntrusivePtr<RawObj> &p1) {
+  {
+    IntrusivePtr<RawObj> p2(p1);
+  }
+
+  // p1 still maintains ownership. The object is not deleted.
+  p1.getPtr()->foo(); // no-warning
+}
+
+void testDestroyLocalRefPtrWithAtomics() {
+  IntrusivePtr<StdAtomicObj> p1(new StdAtomicObj());
+  {
+    IntrusivePtr<StdAtomicObj> p2(p1);
+  }
+
+  // p1 still maintains ownership. The object is not deleted.
+  p1.getPtr()->foo(); // no-warning
+}
+
+
+void testDestroyLocalRefPtrWithAtomics(const IntrusivePtr<StdAtomicObj> &p1) {
   {
-    IntrusivePtr p2(p1);
+    IntrusivePtr<StdAtomicObj> p2(p1);
   }
 
   // p1 still maintains ownership. The object is not deleted.
   p1.getPtr()->foo(); // no-warning
 }
 
-void testDestroySymbolicRefPtr(const IntrusivePtr &p1) {
+void testDestroyLocalRefPtrDifferentlyNamed() {
+  DifferentlyNamed<RawObj> p1(new RawObj());
+  {
+    DifferentlyNamed<RawObj> p2(p1);
+  }
+
+  // p1 still maintains ownership. The object is not deleted.
+  p1.getPtr()->foo(); // no-warning
+}
+
+void testDestroySymbolicRefPtrDifferentlyNamed(
+    const DifferentlyNamed<RawObj> &p1) {
+  {
+    DifferentlyNamed<RawObj> p2(p1);
+  }
+
+  // p1 still maintains ownership. The object is not deleted.
+  p1.getPtr()->foo(); // no-warning
+}
+
+void testDestroyLocalRefPtrWithAtomicsDifferentlyNamed() {
+  DifferentlyNamed<StdAtomicObj> p1(new StdAtomicObj());
+  {
+    DifferentlyNamed<StdAtomicObj> p2(p1);
+  }
+
+  // p1 still maintains ownership. The object is not deleted.
+  p1.getPtr()->foo(); // no-warning
+}
+
+
+void testDestroyLocalRefPtrWithAtomicsDifferentlyNamed(
+    const DifferentlyNamed<StdAtomicObj> &p1) {
   {
-    IntrusivePtr p2(p1);
+    DifferentlyNamed<StdAtomicObj> p2(p1);
   }
 
   // p1 still maintains ownership. The object is not deleted.
diff --git a/clang/test/Analysis/addrspace-null.c b/clang/test/Analysis/addrspace-null.c
index 54d96b277db0..259d1e289872 100644
--- a/clang/test/Analysis/addrspace-null.c
+++ b/clang/test/Analysis/addrspace-null.c
@@ -1,12 +1,12 @@
 // RUN: %clang_analyze_cc1 -triple amdgcn-unknown-unknown \
-// RUN: -analyze -analyzer-checker=core -DAMDGCN_TRIPLE \
-// RUN: -analyze -analyzer-checker=debug.ExprInspection \
-// RUN: -Wno-implicit-int -Wno-int-conversion -verify %s
+// RUN:   -analyzer-checker=core -DAMDGCN_TRIPLE \
+// RUN:   -analyzer-checker=debug.ExprInspection \
+// RUN:   -Wno-implicit-int -Wno-int-conversion -verify %s
 //
 // RUN: %clang_analyze_cc1 -triple amdgcn-unknown-unknown \
-// RUN: -analyze -analyzer-checker=core -DDEFAULT_TRIPLE \
-// RUN: -analyze -analyzer-checker=debug.ExprInspection \
-// RUN: -Wno-implicit-int -Wno-int-conversion -verify %s
+// RUN:   -analyzer-checker=core -DDEFAULT_TRIPLE \
+// RUN:   -analyzer-checker=debug.ExprInspection \
+// RUN:   -Wno-implicit-int -Wno-int-conversion -verify %s
 
 // From https://llvm.org/docs/AMDGPUUsage.html#address-spaces,
 // select address space 3 (local), since the pointer size is
diff --git a/clang/test/Analysis/ctu-cxxdefaultinitexpr.cpp b/clang/test/Analysis/ctu-cxxdefaultinitexpr.cpp
index e785a76310bb..18f03ee6b010 100644
--- a/clang/test/Analysis/ctu-cxxdefaultinitexpr.cpp
+++ b/clang/test/Analysis/ctu-cxxdefaultinitexpr.cpp
@@ -3,7 +3,7 @@
 // RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++17 \
 // RUN:   -emit-pch -o %t/ctudir/ctu-cxxdefaultinitexpr-import.cpp.ast %S/Inputs/ctu-cxxdefaultinitexpr-import.cpp
 // RUN: cp %S/Inputs/ctu-cxxdefaultinitexpr-import.cpp.externalDefMap.ast-dump.txt %t/ctudir/externalDefMap.txt
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fsyntax-only -std=c++17 -analyze \
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++17 -analyze \
 // RUN:   -analyzer-checker=core \
 // RUN:   -analyzer-config experimental-enable-naive-ctu-analysis=true \
 // RUN:   -analyzer-config ctu-dir=%t/ctudir \
diff --git a/clang/test/Analysis/ctu-lookup-name-with-space.cpp b/clang/test/Analysis/ctu-lookup-name-with-space.cpp
index 8e28fcca3611..ef733d5c52e3 100644
--- a/clang/test/Analysis/ctu-lookup-name-with-space.cpp
+++ b/clang/test/Analysis/ctu-lookup-name-with-space.cpp
@@ -7,7 +7,7 @@
 // RUN: %clang_cc1 -emit-pch %/S/Inputs/ctu-lookup-name-with-space.cpp -o %t/importee.ast
 
 // RUN: cd %t
-// RUN: %clang_cc1 -fsyntax-only -analyze \
+// RUN: %clang_cc1 -analyze \
 // RUN:   -analyzer-checker=core \
 // RUN:   -analyzer-config experimental-enable-naive-ctu-analysis=true \
 // RUN:   -analyzer-config ctu-dir=. \
diff --git a/clang/test/Analysis/ctu-main.c b/clang/test/Analysis/ctu-main.c
index 46ae5e13ba46..0193a7e6d135 100644
--- a/clang/test/Analysis/ctu-main.c
+++ b/clang/test/Analysis/ctu-main.c
@@ -4,7 +4,7 @@
 // RUN:   -emit-pch -o %t/ctudir2/ctu-other.c.ast %S/Inputs/ctu-other.c
 // RUN: cp %S/Inputs/ctu-other.c.externalDefMap.ast-dump.txt %t/ctudir2/externalDefMap.txt
 
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fsyntax-only -std=c89 -analyze \
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c89 -analyze \
 // RUN:   -analyzer-checker=core,debug.ExprInspection \
 // RUN:   -analyzer-config eagerly-assume=false \
 // RUN:   -analyzer-config experimental-enable-naive-ctu-analysis=true \
@@ -14,7 +14,7 @@
 
 // Simulate the behavior of the previous CTU implementation by inlining all
 // functions during the first phase. This way, the second phase is a noop.
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fsyntax-only -std=c89 -analyze \
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c89 -analyze \
 // RUN:   -analyzer-checker=core,debug.ExprInspection \
 // RUN:   -analyzer-config eagerly-assume=false \
 // RUN:   -analyzer-config experimental-enable-naive-ctu-analysis=true \
diff --git a/clang/test/Analysis/ctu-on-demand-parsing.c b/clang/test/Analysis/ctu-on-demand-parsing.c
index 815921060350..72288def61b1 100644
--- a/clang/test/Analysis/ctu-on-demand-parsing.c
+++ b/clang/test/Analysis/ctu-on-demand-parsing.c
@@ -11,7 +11,7 @@
 //
 // RUN: cd "%t" && %clang_extdef_map "%t/ctu-other.c" > externalDefMap.txt
 //
-// RUN: cd "%t" && %clang_cc1 -fsyntax-only -std=c89 -analyze \
+// RUN: cd "%t" && %clang_cc1 -std=c89 -analyze \
 // RUN:   -analyzer-checker=core,debug.ExprInspection \
 // RUN:   -analyzer-config experimental-enable-naive-ctu-analysis=true \
 // RUN:   -analyzer-config ctu-dir=. \
diff --git a/clang/test/Analysis/gh-issue-89185.c b/clang/test/Analysis/gh-issue-89185.c
index 8a907f198a5f..49526d2daa86 100644
--- a/clang/test/Analysis/gh-issue-89185.c
+++ b/clang/test/Analysis/gh-issue-89185.c
@@ -1,14 +1,13 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -analyzer-output text -verify %s 
 
-void clang_analyzer_dump(char);
-void clang_analyzer_dump_ptr(char*);
+void clang_analyzer_warnIfReached(void);
 
 // https://github.com/llvm/llvm-project/issues/89185
 void binding_to_label_loc() {
-  char *b = &&MyLabel;
+  char *b = &&MyLabel; // expected-note {{'b' initialized here}}
 MyLabel:
-  *b = 0; // no-crash
-  clang_analyzer_dump_ptr(b); // expected-warning {{&&MyLabel}}
-  clang_analyzer_dump(*b); // expected-warning {{Unknown}}
-  // FIXME: We should never reach here, as storing to a label is invalid.
+  *b = 0;
+  // expected-warning@-1 {{Dereference of the address of a label}}
+  // expected-note@-2    {{Dereference of the address of a label}}
+  clang_analyzer_warnIfReached(); // no-warning: Unreachable due to fatal error.
 }
diff --git a/clang/test/Analysis/handle_constructors_for_default_arguments.cpp b/clang/test/Analysis/handle_constructors_for_default_arguments.cpp
index c54d86526ec7..e20ec5e57d3a 100644
--- a/clang/test/Analysis/handle_constructors_for_default_arguments.cpp
+++ b/clang/test/Analysis/handle_constructors_for_default_arguments.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -analyze \
+// RUN: %clang_cc1 -analyze \
 // RUN:   -analyzer-checker=core,debug.ExprInspection %s -verify
 
 // These test cases demonstrate lack of Static Analyzer features.
diff --git a/clang/test/Analysis/handle_constructors_with_new_array.cpp b/clang/test/Analysis/handle_constructors_with_new_array.cpp
index f0027ede9371..d55d354f9c5c 100644
--- a/clang/test/Analysis/handle_constructors_with_new_array.cpp
+++ b/clang/test/Analysis/handle_constructors_with_new_array.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -analyze \
+// RUN: %clang_cc1 -analyze \
 // RUN:   -analyzer-checker=core,debug.ExprInspection %s -verify
 
 // These test cases demonstrate lack of Static Analyzer features.
diff --git a/clang/test/Analysis/html_diagnostics/relevant_lines/goto.c b/clang/test/Analysis/html_diagnostics/relevant_lines/goto.c
index fc90f1334f07..e77a4aa1b6e2 100644
--- a/clang/test/Analysis/html_diagnostics/relevant_lines/goto.c
+++ b/clang/test/Analysis/html_diagnostics/relevant_lines/goto.c
@@ -8,6 +8,6 @@ mylabel:
 }
 
 // RUN: rm -rf %t.output
-// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -analyzer-output html -o %t.output %s
 // RUN: cat %t.output/* | FileCheck %s --match-full-lines
 // CHECK: var relevant_lines = {"1": {"1": 1,  "2": 1, "3": 1, "4": 1, "6": 1, "7": 1}};
diff --git a/clang/test/Analysis/html_diagnostics/relevant_lines/macros_same_file.c b/clang/test/Analysis/html_diagnostics/relevant_lines/macros_same_file.c
index 5b3556526add..e0929dd97032 100644
--- a/clang/test/Analysis/html_diagnostics/relevant_lines/macros_same_file.c
+++ b/clang/test/Analysis/html_diagnostics/relevant_lines/macros_same_file.c
@@ -10,6 +10,6 @@ int f(int coin) {
 }
 
 // RUN: rm -rf %t.output
-// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -analyzer-output html -o %t.output %s
 // RUN: cat %t.output/* | FileCheck %s --match-full-lines
 // CHECK: var relevant_lines = {"1": {"3": 1, "4": 1, "5": 1, "6": 1}};
diff --git a/clang/test/Analysis/html_diagnostics/relevant_lines/multifile.c b/clang/test/Analysis/html_diagnostics/relevant_lines/multifile.c
index 1998c9383d9d..bee417a795f8 100644
--- a/clang/test/Analysis/html_diagnostics/relevant_lines/multifile.c
+++ b/clang/test/Analysis/html_diagnostics/relevant_lines/multifile.c
@@ -9,6 +9,6 @@ int f(int coin) {
 }
 
 // RUN: rm -rf %t.output
-// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -analyzer-output html -o %t.output %s
 // RUN: cat %t.output/* | FileCheck %s --match-full-lines
 // CHECK: var relevant_lines = {"1": {"3": 1, "4": 1, "5": 1, "6": 1}, "4": {"3": 1, "4": 1, "5": 1, "6": 1, "7": 1}};
diff --git a/clang/test/Analysis/html_diagnostics/relevant_lines/multiline_func_def.c b/clang/test/Analysis/html_diagnostics/relevant_lines/multiline_func_def.c
index 35158a46868c..f9321b9fa037 100644
--- a/clang/test/Analysis/html_diagnostics/relevant_lines/multiline_func_def.c
+++ b/clang/test/Analysis/html_diagnostics/relevant_lines/multiline_func_def.c
@@ -11,6 +11,6 @@ int f(
 }
 
 // RUN: rm -rf %t.output
-// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -analyzer-output html -o %t.output %s
 // RUN: cat %t.output/* | FileCheck %s --match-full-lines
 // CHECK: var relevant_lines = {"1": {"1": 1, "2": 1, "3": 1, "4": 1, "5": 1, "6": 1, "7": 1}};
diff --git a/clang/test/Analysis/html_diagnostics/relevant_lines/notexecutedlines.c b/clang/test/Analysis/html_diagnostics/relevant_lines/notexecutedlines.c
index 5b5f29809166..cce5e5a93d7b 100644
--- a/clang/test/Analysis/html_diagnostics/relevant_lines/notexecutedlines.c
+++ b/clang/test/Analysis/html_diagnostics/relevant_lines/notexecutedlines.c
@@ -7,6 +7,6 @@ int f(void) {
 // Show line with the warning even if it wasn't executed (e.g. warning given
 // by path-insensitive analysis).
 // RUN: rm -rf %t.output
-// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core,deadcode -analyzer-output html -o %t.output %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,deadcode -analyzer-output html -o %t.output %s
 // RUN: cat %t.output/* | FileCheck %s --match-full-lines
 // CHECK: var relevant_lines = {"1": {"3": 1}};
diff --git a/clang/test/Analysis/html_diagnostics/relevant_lines/objcmethods.m b/clang/test/Analysis/html_diagnostics/relevant_lines/objcmethods.m
index 41a4c1d2e097..43596a90b678 100644
--- a/clang/test/Analysis/html_diagnostics/relevant_lines/objcmethods.m
+++ b/clang/test/Analysis/html_diagnostics/relevant_lines/objcmethods.m
@@ -14,6 +14,6 @@ void foo(I *i) {
 }
 
 // RUN: rm -rf %t.output
-// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output -Wno-objc-root-class %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -analyzer-output html -o %t.output -Wno-objc-root-class %s
 // RUN: cat %t.output/* | FileCheck %s
 // CHECK: var relevant_lines = {"1": {"6": 1, "7": 1, "11": 1, "12": 1, "13": 1}};
diff --git a/clang/test/Analysis/html_diagnostics/relevant_lines/simple_conditional.c b/clang/test/Analysis/html_diagnostics/relevant_lines/simple_conditional.c
index 769859dea5ef..ffe26a332a5a 100644
--- a/clang/test/Analysis/html_diagnostics/relevant_lines/simple_conditional.c
+++ b/clang/test/Analysis/html_diagnostics/relevant_lines/simple_conditional.c
@@ -8,6 +8,6 @@ int f(int coin) {
 }
 
 // RUN: rm -rf %t.output
-// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -analyzer-output html -o %t.output %s
 // RUN: cat %t.output/* | FileCheck %s --match-full-lines
 // CHECK: var relevant_lines = {"1": {"1": 1, "2": 1, "3": 1, "4": 1}};
diff --git a/clang/test/Analysis/html_diagnostics/relevant_lines/switch.c b/clang/test/Analysis/html_diagnostics/relevant_lines/switch.c
index e9032cdece9d..72a2ce1786c2 100644
--- a/clang/test/Analysis/html_diagnostics/relevant_lines/switch.c
+++ b/clang/test/Analysis/html_diagnostics/relevant_lines/switch.c
@@ -15,6 +15,6 @@ int f(enum E input) {
 }
 
 // RUN: rm -rf %t.output
-// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -analyzer-output html -o %t.output %s
 // RUN: cat %t.output/* | FileCheck %s --match-full-lines
 // CHECK: var relevant_lines = {"1": {"5": 1, "6": 1, "7": 1, "12": 1, "13": 1}};
diff --git a/clang/test/Analysis/html_diagnostics/relevant_lines/switch_default.c b/clang/test/Analysis/html_diagnostics/relevant_lines/switch_default.c
index b14e3f9fa8fd..72b13a32c352 100644
--- a/clang/test/Analysis/html_diagnostics/relevant_lines/switch_default.c
+++ b/clang/test/Analysis/html_diagnostics/relevant_lines/switch_default.c
@@ -15,6 +15,6 @@ int f(enum E input) {
 }
 
 // RUN: rm -rf %t.output
-// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -analyzer-output html -o %t.output %s
 // RUN: cat %t.output/* | FileCheck %s --match-full-lines
 // CHECK: var relevant_lines = {"1": {"5": 1, "6": 1, "7": 1, "12": 1, "13": 1}};
diff --git a/clang/test/Analysis/html_diagnostics/relevant_lines/synthesized_body.cpp b/clang/test/Analysis/html_diagnostics/relevant_lines/synthesized_body.cpp
index 2ed8355f6e60..bf01b448582e 100644
--- a/clang/test/Analysis/html_diagnostics/relevant_lines/synthesized_body.cpp
+++ b/clang/test/Analysis/html_diagnostics/relevant_lines/synthesized_body.cpp
@@ -20,6 +20,6 @@ void call_deref_once() {
 
 
 // RUN: rm -rf %t.output
-// RUN: %clang_analyze_cc1 -std=c++11 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
+// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core -analyzer-output html -o %t.output %s
 // RUN: cat %t.output/* | FileCheck %s --match-full-lines
 // CHECK: var relevant_lines = {"1": {"3": 1,  "8": 1, "11": 1, "12": 1, "15": 1, "16": 1, "17": 1, "18": 1}};
diff --git a/clang/test/Analysis/html_diagnostics/relevant_lines/unused_header.c b/clang/test/Analysis/html_diagnostics/relevant_lines/unused_header.c
index 4b77c651eddd..3b4994472838 100644
--- a/clang/test/Analysis/html_diagnostics/relevant_lines/unused_header.c
+++ b/clang/test/Analysis/html_diagnostics/relevant_lines/unused_header.c
@@ -14,6 +14,6 @@ int v(int coin) {
 }
 
 // RUN: rm -rf %t.output
-// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -analyzer-output html -o %t.output %s
 // RUN: cat %t.output/* | FileCheck %s --match-full-lines
 // CHECK: var relevant_lines = {"1": {"3": 1, "4": 1, "5": 1, "6": 1}};
diff --git a/clang/test/Analysis/inlining/temp-dtors-path-notes.cpp b/clang/test/Analysis/inlining/temp-dtors-path-notes.cpp
index 40b26b5c91ae..3580f1195f8d 100644
--- a/clang/test/Analysis/inlining/temp-dtors-path-notes.cpp
+++ b/clang/test/Analysis/inlining/temp-dtors-path-notes.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyze -analyzer-checker core -analyzer-config cfg-temporary-dtors=true,c++-temp-dtor-inlining=true -analyzer-output=text -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker core -analyzer-config cfg-temporary-dtors=true,c++-temp-dtor-inlining=true -analyzer-output=text -verify %s
 
 namespace test_simple_temporary {
 class C {
diff --git a/clang/test/Analysis/lifetime-extended-regions.cpp b/clang/test/Analysis/lifetime-extended-regions.cpp
index 4e98bd4b0403..4458ad294af7 100644
--- a/clang/test/Analysis/lifetime-extended-regions.cpp
+++ b/clang/test/Analysis/lifetime-extended-regions.cpp
@@ -120,10 +120,11 @@ void aggregateWithReferences() {
   clang_analyzer_dump(viaReference);    // expected-warning-re {{&lifetime_extended_object{RefAggregate, viaReference, S{{[0-9]+}}} }}
   clang_analyzer_dump(viaReference.rx); // expected-warning-re {{&lifetime_extended_object{int, viaReference, S{{[0-9]+}}} }}
   clang_analyzer_dump(viaReference.ry); // expected-warning-re {{&lifetime_extended_object{Composite, viaReference, S{{[0-9]+}}} }}
-
-  // clang does not currently implement extending lifetime of object bound to reference members of aggregates,
-  // that are created from default member initializer (see `warn_unsupported_lifetime_extension` from `-Wdangling`)
-  RefAggregate defaultInitExtended{i}; // clang-bug does not extend `Composite`
+  
+  // FIXME: clang currently support extending lifetime of object bound to reference members of aggregates,
+  // that are created from default member initializer. But CFG and ExprEngine need to be updated to address this change.
+  // The following expect warning: {{&lifetime_extended_object{Composite, defaultInitExtended, S{{[0-9]+}}} }}
+  RefAggregate defaultInitExtended{i};
   clang_analyzer_dump(defaultInitExtended.ry); // expected-warning {{Unknown }}
 }
 
diff --git a/clang/test/Analysis/new-aligned.cpp b/clang/test/Analysis/new-aligned.cpp
index fae1f4864856..041e63ec24de 100644
--- a/clang/test/Analysis/new-aligned.cpp
+++ b/clang/test/Analysis/new-aligned.cpp
@@ -1,4 +1,4 @@
-//RUN: %clang_analyze_cc1 -std=c++17 -analyze -analyzer-checker=core -verify %s
+//RUN: %clang_analyze_cc1 -std=c++17 -analyzer-checker=core -verify %s
 
 // expected-no-diagnostics
 
diff --git a/clang/test/Analysis/osobject-retain-release.cpp b/clang/test/Analysis/osobject-retain-release.cpp
index 2ae5752f4402..63fd784f6dd0 100644
--- a/clang/test/Analysis/osobject-retain-release.cpp
+++ b/clang/test/Analysis/osobject-retain-release.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -std=c++14 -fblocks -analyze -analyzer-output=text\
+// RUN: %clang_analyze_cc1 -std=c++14 -fblocks -analyzer-output=text\
 // RUN:   -analyzer-checker=core,osx,debug.ExprInspection -verify %s
 
 #include "os_object_base.h"
diff --git a/clang/test/Analysis/ptr-cmp-const-trunc.cl b/clang/test/Analysis/ptr-cmp-const-trunc.cl
index 4483ef68397c..f70eeb738834 100644
--- a/clang/test/Analysis/ptr-cmp-const-trunc.cl
+++ b/clang/test/Analysis/ptr-cmp-const-trunc.cl
@@ -1,4 +1,4 @@
-//RUN: %clang_analyze_cc1 -triple amdgcn-unknown-unknown -analyze -analyzer-checker=core -verify %s
+//RUN: %clang_analyze_cc1 -triple amdgcn-unknown-unknown -analyzer-checker=core -verify %s
 // expected-no-diagnostics
 
 #include <stdint.h>
diff --git a/clang/test/Analysis/region_store_overflow.c b/clang/test/Analysis/region_store_overflow.c
index 505965301bc2..3789c32b6c87 100644
--- a/clang/test/Analysis/region_store_overflow.c
+++ b/clang/test/Analysis/region_store_overflow.c
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -verify %s
 
 int **h;
 int overflow_in_memregion(long j) {
diff --git a/clang/test/Analysis/return-value-guaranteed.cpp b/clang/test/Analysis/return-value-guaranteed.cpp
index 367a8e5906af..3b010ffba360 100644
--- a/clang/test/Analysis/return-value-guaranteed.cpp
+++ b/clang/test/Analysis/return-value-guaranteed.cpp
@@ -1,91 +1,91 @@
 // RUN: %clang_analyze_cc1 \
 // RUN:  -analyzer-checker=core,apiModeling.llvm.ReturnValue \
-// RUN:  -analyzer-output=text -verify=class %s
+// RUN:  -analyzer-output=text -verify %s
 
 struct Foo { int Field; };
 bool problem();
 void doSomething();
 
-// We predefined the return value of 'MCAsmParser::Error' as true and we cannot
-// take the false-branches which leads to a "garbage value" false positive.
-namespace test_classes {
+// Test the normal case when the implementation of MCAsmParser::Error() (one of
+// the methods modeled by this checker) is opaque.
+namespace test_normal {
 struct MCAsmParser {
   static bool Error();
 };
 
 bool parseFoo(Foo &F) {
   if (problem()) {
-    // class-note@-1 {{Assuming the condition is false}}
-    // class-note@-2 {{Taking false branch}}
+    // expected-note@-1 {{Assuming the condition is false}}
+    // expected-note@-2 {{Taking false branch}}
     return MCAsmParser::Error();
   }
 
   F.Field = 0;
-  // class-note@-1 {{The value 0 is assigned to 'F.Field'}}
-  return !MCAsmParser::Error();
-  // class-note@-1 {{'MCAsmParser::Error' returns true}}
+  // expected-note@-1 {{The value 0 is assigned to 'F.Field'}}
+  return false;
 }
 
 bool parseFile() {
   Foo F;
   if (parseFoo(F)) {
-    // class-note@-1 {{Calling 'parseFoo'}}
-    // class-note@-2 {{Returning from 'parseFoo'}}
-    // class-note@-3 {{Taking false branch}}
+    // expected-note@-1 {{Calling 'parseFoo'}}
+    // expected-note@-2 {{Returning from 'parseFoo'}}
+    // expected-note@-3 {{Taking false branch}}
     return true;
   }
 
+  // The following expression would produce the false positive report
+  //    "The left operand of '==' is a garbage value"
+  // without the modeling done by apiModeling.llvm.ReturnValue:
   if (F.Field == 0) {
-    // class-note@-1 {{Field 'Field' is equal to 0}}
-    // class-note@-2 {{Taking true branch}}
-
-    // no-warning: "The left operand of '==' is a garbage value" was here.
+    // expected-note@-1 {{Field 'Field' is equal to 0}}
+    // expected-note@-2 {{Taking true branch}}
     doSomething();
   }
 
+  // Trigger a zero division to get path notes:
   (void)(1 / F.Field);
-  // class-warning@-1 {{Division by zero}}
-  // class-note@-2 {{Division by zero}}
+  // expected-warning@-1 {{Division by zero}}
+  // expected-note@-2 {{Division by zero}}
   return false;
 }
-} // namespace test_classes
+} // namespace test_normal
 
 
-// We predefined 'MCAsmParser::Error' as returning true, but now it returns
-// false, which breaks our invariant. Test the notes.
+// Sanity check for the highly unlikely case where the implementation of the
+// method breaks the convention.
 namespace test_break {
 struct MCAsmParser {
   static bool Error() {
-    return false; // class-note {{'MCAsmParser::Error' returns false}}
+    return false;
   }
 };
 
 bool parseFoo(Foo &F) {
   if (problem()) {
-    // class-note@-1 {{Assuming the condition is false}}
-    // class-note@-2 {{Taking false branch}}
+    // expected-note@-1 {{Assuming the condition is false}}
+    // expected-note@-2 {{Taking false branch}}
     return !MCAsmParser::Error();
   }
 
   F.Field = 0;
-  // class-note@-1 {{The value 0 is assigned to 'F.Field'}}
+  // expected-note@-1 {{The value 0 is assigned to 'F.Field'}}
   return MCAsmParser::Error();
-  // class-note@-1 {{Calling 'MCAsmParser::Error'}}
-  // class-note@-2 {{Returning from 'MCAsmParser::Error'}}
+  // expected-note@-1 {{'MCAsmParser::Error' returned false, breaking the convention that it always returns true}}
 }
 
 bool parseFile() {
   Foo F;
   if (parseFoo(F)) {
-    // class-note@-1 {{Calling 'parseFoo'}}
-    // class-note@-2 {{Returning from 'parseFoo'}}
-    // class-note@-3 {{Taking false branch}}
+    // expected-note@-1 {{Calling 'parseFoo'}}
+    // expected-note@-2 {{Returning from 'parseFoo'}}
+    // expected-note@-3 {{Taking false branch}}
     return true;
   }
 
   (void)(1 / F.Field);
-  // class-warning@-1 {{Division by zero}}
-  // class-note@-2 {{Division by zero}}
+  // expected-warning@-1 {{Division by zero}}
+  // expected-note@-2 {{Division by zero}}
   return false;
 }
-} // namespace test_classes
+} // namespace test_break
diff --git a/clang/test/Analysis/scan-build/deduplication.test b/clang/test/Analysis/scan-build/deduplication.test
index 56d888e5fc12..2ec3061701fc 100644
--- a/clang/test/Analysis/scan-build/deduplication.test
+++ b/clang/test/Analysis/scan-build/deduplication.test
@@ -1,4 +1,3 @@
-// FIXME: Actually, "perl".
 REQUIRES: shell
 
 RUN: rm -rf %t.output_dir && mkdir %t.output_dir
diff --git a/clang/test/Analysis/scan-build/exclude_directories.test b/clang/test/Analysis/scan-build/exclude_directories.test
index c161e51b6d26..2c79ed842af1 100644
--- a/clang/test/Analysis/scan-build/exclude_directories.test
+++ b/clang/test/Analysis/scan-build/exclude_directories.test
@@ -1,6 +1,3 @@
-// FIXME: Actually, "perl".
-REQUIRES: shell
-
 RUN: rm -rf %t.output_dir && mkdir %t.output_dir
 RUN: %scan-build -o %t.output_dir %clang -S \
 RUN:     %S/Inputs/multidirectory_project/directory1/file1.c \
diff --git a/clang/test/Analysis/scan-build/help.test b/clang/test/Analysis/scan-build/help.test
index 61915d326094..d1f17cd69f51 100644
--- a/clang/test/Analysis/scan-build/help.test
+++ b/clang/test/Analysis/scan-build/help.test
@@ -1,6 +1,3 @@
-// FIXME: Actually, "perl".
-REQUIRES: shell
-
 RUN: %scan-build -h | FileCheck %s
 RUN: %scan-build --help | FileCheck %s
 
diff --git a/clang/test/Analysis/scan-build/html_output.test b/clang/test/Analysis/scan-build/html_output.test
index add35d83b958..c2b509d9ef66 100644
--- a/clang/test/Analysis/scan-build/html_output.test
+++ b/clang/test/Analysis/scan-build/html_output.test
@@ -1,4 +1,3 @@
-// FIXME: Actually, "perl".
 REQUIRES: shell
 
 RUN: rm -rf %t.output_dir && mkdir %t.output_dir
diff --git a/clang/test/Analysis/scan-build/lit.local.cfg b/clang/test/Analysis/scan-build/lit.local.cfg
index fab52b1c7bd6..aed76ca0e808 100644
--- a/clang/test/Analysis/scan-build/lit.local.cfg
+++ b/clang/test/Analysis/scan-build/lit.local.cfg
@@ -1,8 +1,8 @@
 # -*- Python -*-
 
-import lit.util
 import lit.formats
 import os
+import platform
 
 use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL")
 config.test_format = lit.formats.ShTest(use_lit_shell == "0")
@@ -12,13 +12,16 @@ clang_path = config.clang if config.have_llvm_driver else os.path.realpath(confi
 config.substitutions.append(
     (
         "%scan-build",
-        "'%s' --use-analyzer=%s "
+        "'%s' '%s' --use-analyzer=%s "
         % (
-            lit.util.which(
-                "scan-build",
-                os.path.join(config.clang_src_dir, "tools", "scan-build", "bin"),
+            config.perl_executable,
+            os.path.join(
+                config.clang_src_dir, "tools", "scan-build", "bin", "scan-build"
             ),
             clang_path,
         ),
     )
 )
+
+if not config.perl_executable or platform.system() == "Windows":
+    config.unsupported = True
diff --git a/clang/test/Analysis/scan-build/plist_html_output.test b/clang/test/Analysis/scan-build/plist_html_output.test
index c07891e35fbf..ca9c5256b9d7 100644
--- a/clang/test/Analysis/scan-build/plist_html_output.test
+++ b/clang/test/Analysis/scan-build/plist_html_output.test
@@ -1,4 +1,3 @@
-// FIXME: Actually, "perl".
 REQUIRES: shell
 
 RUN: rm -rf %t.output_dir && mkdir %t.output_dir
diff --git a/clang/test/Analysis/scan-build/plist_output.test b/clang/test/Analysis/scan-build/plist_output.test
index 0112e84630ed..4d01640bff6e 100644
--- a/clang/test/Analysis/scan-build/plist_output.test
+++ b/clang/test/Analysis/scan-build/plist_output.test
@@ -1,4 +1,3 @@
-// FIXME: Actually, "perl".
 REQUIRES: shell
 
 RUN: rm -rf %t.output_dir && mkdir %t.output_dir
diff --git a/clang/test/Analysis/scan-build/rebuild_index/rebuild_index.test b/clang/test/Analysis/scan-build/rebuild_index/rebuild_index.test
index ab70435c6054..711a74f3fd02 100644
--- a/clang/test/Analysis/scan-build/rebuild_index/rebuild_index.test
+++ b/clang/test/Analysis/scan-build/rebuild_index/rebuild_index.test
@@ -1,6 +1,3 @@
-// FIXME: Actually, "perl".
-REQUIRES: shell
-
 RUN: rm -rf %t.output_dir && mkdir %t.output_dir
 RUN: cp %S/report-1.html %t.output_dir
 RUN: cp %S/report-2.html %t.output_dir
diff --git a/clang/test/Analysis/scan-build/silence-core-checkers.test b/clang/test/Analysis/scan-build/silence-core-checkers.test
index 6d9a3017fcd6..7ffa744a545c 100644
--- a/clang/test/Analysis/scan-build/silence-core-checkers.test
+++ b/clang/test/Analysis/scan-build/silence-core-checkers.test
@@ -1,6 +1,3 @@
-// FIXME: Actually, "perl".
-REQUIRES: shell
-
 RUN: rm -rf %t.output_dir && mkdir %t.output_dir
 RUN: %scan-build -o %t.output_dir \
 RUN:   %clang -S %S/Inputs/null_dereference_and_division_by_zero.c \
diff --git a/clang/test/Analysis/solver-sym-simplification-bool.cpp b/clang/test/Analysis/solver-sym-simplification-bool.cpp
index 0e7633dfb87e..f9496633beab 100644
--- a/clang/test/Analysis/solver-sym-simplification-bool.cpp
+++ b/clang/test/Analysis/solver-sym-simplification-bool.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core \
+// RUN: %clang_analyze_cc1 -analyzer-checker=core \
 // RUN: -analyzer-checker=debug.ExprInspection -verify %s
 
 void clang_analyzer_dump(bool);
diff --git a/clang/test/Analysis/solver-sym-simplification-ptr-bool.cl b/clang/test/Analysis/solver-sym-simplification-ptr-bool.cl
index be8edbf51eba..f9c1c3a064dc 100644
--- a/clang/test/Analysis/solver-sym-simplification-ptr-bool.cl
+++ b/clang/test/Analysis/solver-sym-simplification-ptr-bool.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -triple amdgcn-unknown-unknown -analyze -analyzer-checker=core %s
+// RUN: %clang_analyze_cc1 -triple amdgcn-unknown-unknown -analyzer-checker=core %s
 
 // expected-no-diagnostics
 
diff --git a/clang/test/Analysis/taint-diagnostic-visitor.c b/clang/test/Analysis/taint-diagnostic-visitor.c
index 2ba7d9938fc3..b8b3710a7013 100644
--- a/clang/test/Analysis/taint-diagnostic-visitor.c
+++ b/clang/test/Analysis/taint-diagnostic-visitor.c
@@ -10,7 +10,8 @@ int scanf(const char *restrict format, ...);
 int system(const char *command);
 char* getenv( const char* env_var );
 size_t strlen( const char* str );
-int atoi( const char* str );
+char *strcat( char *dest, const char *src );
+char* strcpy( char* dest, const char* src );
 void *malloc(size_t size );
 void free( void *ptr );
 char *fgets(char *str, int n, FILE *stream);
@@ -53,34 +54,32 @@ void taintDiagnosticVLA(void) {
 
 // Tests if the originated note is correctly placed even if the path is
 // propagating through variables and expressions
-char *taintDiagnosticPropagation(){
-  char *pathbuf;
-  char *size=getenv("SIZE"); // expected-note {{Taint originated here}}
-                                 // expected-note@-1 {{Taint propagated to the return value}}
-  if (size){ // expected-note {{Assuming 'size' is non-null}}
-	               // expected-note@-1 {{Taking true branch}}
-    pathbuf=(char*) malloc(atoi(size)); // expected-warning{{Untrusted data is used to specify the buffer size}}
-                                                // expected-note@-1{{Untrusted data is used to specify the buffer size}}
-                                                // expected-note@-2 {{Taint propagated to the return value}}
-    return pathbuf;
+int taintDiagnosticPropagation(){
+  int res;
+  char *cmd=getenv("CMD"); // expected-note {{Taint originated here}}
+                           // expected-note@-1 {{Taint propagated to the return value}}
+  if (cmd){ // expected-note {{Assuming 'cmd' is non-null}}
+            // expected-note@-1 {{Taking true branch}}
+    res = system(cmd); // expected-warning{{Untrusted data is passed to a system call}}
+                       // expected-note@-1{{Untrusted data is passed to a system call}}
+    return res;
   }
-  return 0;
+  return -1;
 }
 
 // Taint origin should be marked correctly even if there are multiple taint
 // sources in the function
-char *taintDiagnosticPropagation2(){
-  char *pathbuf;
+int taintDiagnosticPropagation2(){
+  int res;
   char *user_env2=getenv("USER_ENV_VAR2");//unrelated taint source
-  char *size=getenv("SIZE"); // expected-note {{Taint originated here}}
-                                 // expected-note@-1 {{Taint propagated to the return value}}
+  char *cmd=getenv("CMD"); // expected-note {{Taint originated here}}
+                           // expected-note@-1 {{Taint propagated to the return value}}
   char *user_env=getenv("USER_ENV_VAR");//unrelated taint source
-  if (size){ // expected-note {{Assuming 'size' is non-null}}
-	               // expected-note@-1 {{Taking true branch}}
-    pathbuf=(char*) malloc(atoi(size)+1); // expected-warning{{Untrusted data is used to specify the buffer size}}
-                                                // expected-note@-1{{Untrusted data is used to specify the buffer size}}
-                                                // expected-note@-2 {{Taint propagated to the return value}}
-    return pathbuf;
+  if (cmd){ // expected-note {{Assuming 'cmd' is non-null}}
+	          // expected-note@-1 {{Taking true branch}}
+    res = system(cmd); // expected-warning{{Untrusted data is passed to a system call}}
+                       // expected-note@-1{{Untrusted data is passed to a system call}}
+    return res;
   }
   return 0;
 }
@@ -95,22 +94,24 @@ void testReadStdIn(){
 }
 
 void multipleTaintSources(void) {
-  int x,y,z;
-  scanf("%d", &x); // expected-note {{Taint originated here}}
+  char cmd[2048], file[1024];
+  scanf ("%1022[^\n] ", cmd); // expected-note {{Taint originated here}}
                    // expected-note@-1 {{Taint propagated to the 2nd argument}}
-  scanf("%d", &y); // expected-note {{Taint originated here}}
+  scanf ("%1023[^\n]", file); // expected-note {{Taint originated here}}
                    // expected-note@-1 {{Taint propagated to the 2nd argument}}
-  scanf("%d", &z);
-  int* ptr = (int*) malloc(y + x); // expected-warning {{Untrusted data is used to specify the buffer size}}
-                                   // expected-note@-1{{Untrusted data is used to specify the buffer size}}
-  free (ptr);
+  strcat(cmd, file); // expected-note {{Taint propagated to the 1st argument}}
+  strcat(cmd, " "); // expected-note {{Taint propagated to the 1st argument}}
+  system(cmd); // expected-warning {{Untrusted data is passed to a system call}}
+               // expected-note@-1{{Untrusted data is passed to a system call}}
 }
 
 void multipleTaintedArgs(void) {
-  int x,y;
-  scanf("%d %d", &x, &y); // expected-note {{Taint originated here}}
+  char cmd[1024], file[1024], buf[2048];
+  scanf("%1022s %1023s", cmd, file); // expected-note {{Taint originated here}}
                           // expected-note@-1 {{Taint propagated to the 2nd argument, 3rd argument}}
-  int* ptr = (int*) malloc(x + y); // expected-warning {{Untrusted data is used to specify the buffer size}}
-                                   // expected-note@-1{{Untrusted data is used to specify the buffer size}}
-  free (ptr);
+  strcpy(buf, cmd);// expected-note {{Taint propagated to the 1st argument}}
+  strcat(buf, " ");// expected-note {{Taint propagated to the 1st argument}}
+  strcat(buf, file);// expected-note {{Taint propagated to the 1st argument}}
+  system(buf); // expected-warning {{Untrusted data is passed to a system call}}
+               // expected-note@-1{{Untrusted data is passed to a system call}}
 }
diff --git a/clang/test/Analysis/taint-generic.c b/clang/test/Analysis/taint-generic.c
index e85b4106a580..b0df85f23729 100644
--- a/clang/test/Analysis/taint-generic.c
+++ b/clang/test/Analysis/taint-generic.c
@@ -305,15 +305,21 @@ void testGets_s(void) {
 
 void testTaintedBufferSize(void) {
   size_t ts;
+  // The functions malloc, calloc, bcopy and memcpy are not taint sinks in the
+  // default config of GenericTaintChecker (because that would cause too many
+  // false positives).
+  // FIXME: We should generate warnings when a value passed to these functions
+  // is tainted and _can be very large_ (because that's exploitable). This
+  // functionality probably belongs to the checkers that do more detailed
+  // modeling of these functions (MallocChecker and CStringChecker).
   scanf("%zd", &ts);
-
-  int *buf1 = (int*)malloc(ts*sizeof(int)); // expected-warning {{Untrusted data is used to specify the buffer size}}
-  char *dst = (char*)calloc(ts, sizeof(char)); //expected-warning {{Untrusted data is used to specify the buffer size}}
-  bcopy(buf1, dst, ts); // expected-warning {{Untrusted data is used to specify the buffer size}}
-  __builtin_memcpy(dst, buf1, (ts + 4)*sizeof(char)); // expected-warning {{Untrusted data is used to specify the buffer size}}
+  int *buf1 = (int*)malloc(ts*sizeof(int)); // warn here, ts is unbounded and tainted
+  char *dst = (char*)calloc(ts, sizeof(char)); // warn here, ts is unbounded tainted
+  bcopy(buf1, dst, ts); // no warning here, since the size of buf1, dst equals ts. Cannot overflow.
+  __builtin_memcpy(dst, buf1, (ts + 4)*sizeof(char)); // warn here, dst overflows (whatever the value of ts)
 
   // If both buffers are trusted, do not issue a warning.
-  char *dst2 = (char*)malloc(ts*sizeof(char)); // expected-warning {{Untrusted data is used to specify the buffer size}}
+  char *dst2 = (char*)malloc(ts*sizeof(char)); // warn here, ts in unbounded
   strncat(dst2, dst, ts); // no-warning
 }
 
@@ -353,7 +359,7 @@ void testStruct(void) {
 
   sock = socket(AF_INET, SOCK_STREAM, 0);
   read(sock, &tainted, sizeof(tainted));
-  __builtin_memcpy(buffer, tainted.buf, tainted.length); // expected-warning {{Untrusted data is used to specify the buffer size}}
+  clang_analyzer_isTainted_int(tainted.length); // expected-warning {{YES }}
 }
 
 void testStructArray(void) {
@@ -368,17 +374,17 @@ void testStructArray(void) {
   __builtin_memset(srcbuf, 0, sizeof(srcbuf));
 
   read(sock, &tainted[0], sizeof(tainted));
-  __builtin_memcpy(dstbuf, srcbuf, tainted[0].length); // expected-warning {{Untrusted data is used to specify the buffer size}}
+  clang_analyzer_isTainted_int(tainted[0].length); // expected-warning {{YES}}
 
   __builtin_memset(&tainted, 0, sizeof(tainted));
   read(sock, &tainted, sizeof(tainted));
-  __builtin_memcpy(dstbuf, srcbuf, tainted[0].length); // expected-warning {{Untrusted data is used to specify the buffer size}}
+  clang_analyzer_isTainted_int(tainted[0].length); // expected-warning {{YES}}
 
   __builtin_memset(&tainted, 0, sizeof(tainted));
   // If we taint element 1, we should not raise an alert on taint for element 0 or element 2
   read(sock, &tainted[1], sizeof(tainted));
-  __builtin_memcpy(dstbuf, srcbuf, tainted[0].length); // no-warning
-  __builtin_memcpy(dstbuf, srcbuf, tainted[2].length); // no-warning
+  clang_analyzer_isTainted_int(tainted[0].length); // expected-warning {{NO}}
+  clang_analyzer_isTainted_int(tainted[2].length); // expected-warning {{NO}}
 }
 
 void testUnion(void) {
diff --git a/clang/test/Analysis/transparent_union_bug.c b/clang/test/Analysis/transparent_union_bug.c
index 5f8a5a4a706e..abb43e97a175 100644
--- a/clang/test/Analysis/transparent_union_bug.c
+++ b/clang/test/Analysis/transparent_union_bug.c
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyze -triple x86_64-apple-darwin10 \
+// RUN: %clang_analyze_cc1 -triple x86_64-apple-darwin10 \
 // RUN:  -analyzer-checker=core,debug.ExprInspection -verify %s
 
 void clang_analyzer_warnIfReached(void);
diff --git a/clang/test/Analysis/trustnonnullchecker_test.m b/clang/test/Analysis/trustnonnullchecker_test.m
index cb0c1cdcab61..b18734b6652a 100644
--- a/clang/test/Analysis/trustnonnullchecker_test.m
+++ b/clang/test/Analysis/trustnonnullchecker_test.m
@@ -1,7 +1,7 @@
 // Temporarily disabling the test, it failes the "system is over-constrained" (part of expensive checks)
 // assertion in *non* optimized builds.
 // REQUIRES: rdar44992170
-// RUN: %clang_analyze_cc1 -fblocks -analyze -analyzer-checker=core,nullability,apiModeling,debug.ExprInspection  -verify %s
+// RUN: %clang_analyze_cc1 -fblocks -analyzer-checker=core,nullability,apiModeling,debug.ExprInspection -verify %s
 
 #include "Inputs/system-header-simulator-for-nullability.h"
 
diff --git a/clang/test/Analysis/trustnonnullchecker_test.mm b/clang/test/Analysis/trustnonnullchecker_test.mm
index fa84673492be..bd8b9a7759cb 100644
--- a/clang/test/Analysis/trustnonnullchecker_test.mm
+++ b/clang/test/Analysis/trustnonnullchecker_test.mm
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -fblocks -analyze -analyzer-checker=core,nullability,apiModeling  -verify %s
+// RUN: %clang_analyze_cc1 -fblocks -analyzer-checker=core,nullability,apiModeling -verify %s
 
 #include "Inputs/system-header-simulator-for-nullability-cxx.h"
 
diff --git a/clang/test/Analysis/undef-call.c b/clang/test/Analysis/undef-call.c
index f560da104e56..8019028282aa 100644
--- a/clang/test/Analysis/undef-call.c
+++ b/clang/test/Analysis/undef-call.c
@@ -1,6 +1,6 @@
 // RUN: rm -rf %T/ctudir
 // RUN: mkdir %T/ctudir
-// RUN: %clang_cc1 -fsyntax-only -analyze -analyzer-checker=debug.ExprInspection -analyzer-config experimental-enable-naive-ctu-analysis=true -analyzer-config ctu-dir=%T/ctudir -verify %s
+// RUN: %clang_cc1 -analyze -analyzer-checker=debug.ExprInspection -analyzer-config experimental-enable-naive-ctu-analysis=true -analyzer-config ctu-dir=%T/ctudir -verify %s
 // expected-no-diagnostics
 
 struct S {
diff --git a/clang/test/C/drs/dr290.c b/clang/test/C/drs/dr290.c
index 3a6fd1d0dab6..92b7d77c7468 100644
--- a/clang/test/C/drs/dr290.c
+++ b/clang/test/C/drs/dr290.c
@@ -1,4 +1,4 @@
-/* RUN: %clang_cc1 -fsyntax-only -ast-dump %s | FileCheck %s
+/* RUN: %clang_cc1 -ast-dump %s | FileCheck %s
  */
 
 /* WG14 DR290: no
diff --git a/clang/test/CXX/class/class.compare/class.compare.default/p4.cpp b/clang/test/CXX/class/class.compare/class.compare.default/p4.cpp
index 534c3b34d883..53a8bfc9a4f4 100644
--- a/clang/test/CXX/class/class.compare/class.compare.default/p4.cpp
+++ b/clang/test/CXX/class/class.compare/class.compare.default/p4.cpp
@@ -18,14 +18,22 @@ namespace std {
 
 namespace N {
   struct A {
-    friend constexpr std::strong_ordering operator<=>(const A&, const A&) = default;
+    friend constexpr std::strong_ordering operator<=>(const A&, const A&) = default; // expected-note 2{{declared here}}
   };
 
-  constexpr bool (*test_a_not_found)(const A&, const A&) = &operator==; // expected-error {{undeclared}}
+  constexpr std::strong_ordering (*test_a_threeway_not_found)(const A&, const A&) = &operator<=>; // expected-error {{undeclared}}
+
+  constexpr std::strong_ordering operator<=>(const A&, const A&) noexcept;
+  constexpr std::strong_ordering (*test_a_threeway)(const A&, const A&) = &operator<=>;
+  static_assert(!(*test_a_threeway)(A(), A())); // expected-error {{static assertion expression is not an integral constant expression}}
+                                               // expected-note@-1 {{undefined function 'operator<=>' cannot be used in a constant expression}}
+
+  constexpr bool (*test_a_equal_not_found)(const A&, const A&) = &operator==; // expected-error {{undeclared}}
 
   constexpr bool operator==(const A&, const A&) noexcept;
-  constexpr bool (*test_a)(const A&, const A&) noexcept = &operator==;
-  static_assert((*test_a)(A(), A()));
+  constexpr bool (*test_a_equal)(const A&, const A&) noexcept = &operator==;
+  static_assert((*test_a_equal)(A(), A())); // expected-error {{static assertion expression is not an integral constant expression}}
+                                            // expected-note@-1 {{undefined function 'operator==' cannot be used in a constant expression}}
 }
 
 struct B1 {
@@ -161,3 +169,14 @@ struct non_constexpr_type {
 
 my_struct<non_constexpr_type> obj; // cxx2a-note {{in instantiation of template class 'GH61238::my_struct<GH61238::non_constexpr_type>' requested here}}
 }
+
+namespace Constrained {
+  template<typename T>
+  struct A {
+    std::strong_ordering operator<=>(const A&) const requires true = default;
+  };
+
+  bool f(A<int> a) {
+    return a != A<int>();
+  }
+}
diff --git a/clang/test/CXX/class/class.friend/p7-cxx20.cpp b/clang/test/CXX/class/class.friend/p7-cxx20.cpp
index 054e6fb3e076..8843d55910ea 100644
--- a/clang/test/CXX/class/class.friend/p7-cxx20.cpp
+++ b/clang/test/CXX/class/class.friend/p7-cxx20.cpp
@@ -3,7 +3,7 @@
 // RUN: split-file %s %t
 // RUN: cd %t
 //
-// RUN: %clang_cc1 -std=c++20 no-modules.cpp -fsyntax-only -ast-dump | \
+// RUN: %clang_cc1 -std=c++20 no-modules.cpp -ast-dump | \
 // RUN: FileCheck --match-full-lines --check-prefix=CHECK-NM %s
 // RUN: %clang_cc1 -std=c++20 -xc++-user-header header-unit.h -ast-dump | \
 // RUN: FileCheck --match-full-lines --check-prefix=CHECK-HU %s
diff --git a/clang/test/CXX/class/class.mfct/p1-cxx20.cpp b/clang/test/CXX/class/class.mfct/p1-cxx20.cpp
index 096617f4853b..5b24668d7b66 100644
--- a/clang/test/CXX/class/class.mfct/p1-cxx20.cpp
+++ b/clang/test/CXX/class/class.mfct/p1-cxx20.cpp
@@ -3,7 +3,7 @@
 // RUN: split-file %s %t
 // RUN: cd %t
 //
-// RUN: %clang_cc1 -std=c++20 no-modules.cpp -fsyntax-only -ast-dump | \
+// RUN: %clang_cc1 -std=c++20 no-modules.cpp -ast-dump | \
 // RUN: FileCheck --match-full-lines --check-prefix=CHECK-NM %s
 // RUN: %clang_cc1 -std=c++20 -xc++-user-header header-unit.h -ast-dump | \
 // RUN: FileCheck --match-full-lines --check-prefix=CHECK-HU %s
diff --git a/clang/test/CXX/dcl.decl/dcl.init/dcl.init.ref/p5-examples.cpp b/clang/test/CXX/dcl.decl/dcl.init/dcl.init.ref/p5-examples.cpp
index 46593b7e2adb..77aef99eaa7c 100644
--- a/clang/test/CXX/dcl.decl/dcl.init/dcl.init.ref/p5-examples.cpp
+++ b/clang/test/CXX/dcl.decl/dcl.init/dcl.init.ref/p5-examples.cpp
@@ -1,56 +1,56 @@
-// RUN: %clang_cc1 -ast-dump %s 2>&1 | FileCheck %s
-
-// CHECK-LABEL: example0
-void example0() {
-  double d = 2.0;
-  // CHECK: VarDecl{{.*}}rd 'double &'
-  // CHECK-NEXT: DeclRefExpr
-  double &rd = d;
-  // CHECK: VarDecl{{.*}}rcd 'const double &'
-  // CHECK-NEXT: ImplicitCastExpr{{.*}}'const double' lvalue <NoOp>
-  const double &rcd = d;
-}
-
-struct A { };
-struct B : A { } b;
-
-// CHECK-LABEL: example1
-void example1() {
-  // CHECK: VarDecl{{.*}}ra 'A &'
-  // CHECK: ImplicitCastExpr{{.*}}'A' lvalue <DerivedToBase (A)>
-  A &ra = b;
-  // CHECK: VarDecl{{.*}}rca 'const A &'
-  // CHECK: ImplicitCastExpr{{.*}}'const A' lvalue <DerivedToBase (A)>
-  // CHECK-NOT: MaterializeTemporaryExpr
-  // CHECK: ImplicitCastExpr{{.*}}'const B' lvalue <NoOp>
-  const A& rca = b;
-}
-
-extern B f();
-
-struct X {
-  operator B();
-} x;
-
-// CHECK-LABEL: example2
-void example2() {
-  // CHECK: VarDecl{{.*}}rca 'const A &'
-  // CHECK: ImplicitCastExpr{{.*}}'const A' lvalue <DerivedToBase (A)>
-  // CHECK: MaterializeTemporaryExpr{{.*}}'const B'
-  // CHECK: ImplicitCastExpr{{.*}}'const B' <NoOp>
-  // CHECK: CallExpr{{.*}}B
-  const A &rca = f();
-  // CHECK: VarDecl{{.*}}r 'const A &'
-  // CHECK: ImplicitCastExpr{{.*}}'const A' lvalue <DerivedToBase (A)>
-  // CHECK: MaterializeTemporaryExpr{{.*}}'const B'
-  // CHECK: ImplicitCastExpr{{.*}}'const B' <NoOp>
-  // CHECK: CXXMemberCallExpr{{.*}}'B'
-  const A& r = x;
-}
-
-// CHECK-LABEL: example3
-void example3() {
-  // CHECK: VarDecl{{.*}}rcd2 'const double &'
-  // CHECK: ImplicitCastExpr{{.*}} <IntegralToFloating>
-  const double& rcd2 = 2;
-}
+// RUN: %clang_cc1 -ast-dump %s 2>&1 | FileCheck %s
+
+// CHECK-LABEL: example0
+void example0() {
+  double d = 2.0;
+  // CHECK: VarDecl{{.*}}rd 'double &'
+  // CHECK-NEXT: DeclRefExpr
+  double &rd = d;
+  // CHECK: VarDecl{{.*}}rcd 'const double &'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'const double' lvalue <NoOp>
+  const double &rcd = d;
+}
+
+struct A { };
+struct B : A { } b;
+
+// CHECK-LABEL: example1
+void example1() {
+  // CHECK: VarDecl{{.*}}ra 'A &'
+  // CHECK: ImplicitCastExpr{{.*}}'A' lvalue <DerivedToBase (A)>
+  A &ra = b;
+  // CHECK: VarDecl{{.*}}rca 'const A &'
+  // CHECK: ImplicitCastExpr{{.*}}'const A' lvalue <DerivedToBase (A)>
+  // CHECK-NOT: MaterializeTemporaryExpr
+  // CHECK: ImplicitCastExpr{{.*}}'const B' lvalue <NoOp>
+  const A& rca = b;
+}
+
+extern B f();
+
+struct X {
+  operator B();
+} x;
+
+// CHECK-LABEL: example2
+void example2() {
+  // CHECK: VarDecl{{.*}}rca 'const A &'
+  // CHECK: ImplicitCastExpr{{.*}}'const A' lvalue <DerivedToBase (A)>
+  // CHECK: MaterializeTemporaryExpr{{.*}}'const B'
+  // CHECK: ImplicitCastExpr{{.*}}'const B' <NoOp>
+  // CHECK: CallExpr{{.*}}B
+  const A &rca = f();
+  // CHECK: VarDecl{{.*}}r 'const A &'
+  // CHECK: ImplicitCastExpr{{.*}}'const A' lvalue <DerivedToBase (A)>
+  // CHECK: MaterializeTemporaryExpr{{.*}}'const B'
+  // CHECK: ImplicitCastExpr{{.*}}'const B' <NoOp>
+  // CHECK: CXXMemberCallExpr{{.*}}'B'
+  const A& r = x;
+}
+
+// CHECK-LABEL: example3
+void example3() {
+  // CHECK: VarDecl{{.*}}rcd2 'const double &'
+  // CHECK: ImplicitCastExpr{{.*}} <IntegralToFloating>
+  const double& rcd2 = 2;
+}
diff --git a/clang/test/CXX/dcl/dcl.init/aggr.cpp b/clang/test/CXX/dcl/dcl.init/aggr.cpp
new file mode 100644
index 000000000000..3206d2e7f616
--- /dev/null
+++ b/clang/test/CXX/dcl/dcl.init/aggr.cpp
@@ -0,0 +1,294 @@
+// RUN:  %clang_cc1 -std=c++2c -verify %s
+
+namespace ex1 {
+struct C {
+  union {
+    int a;
+    const char* p;
+  };
+  int x;
+};
+
+constexpr C c = { .a = 1, .x = 3 };
+static_assert(c.a == 1);
+static_assert(c.x == 3);
+
+static constexpr C c2 = { .a = 1.0, .x = 3 };
+// expected-error@-1 {{type 'double' cannot be narrowed to 'int' in initializer list}}
+//   expected-note@-2 {{insert an explicit cast to silence this issue}}
+} // namespace ex1
+
+namespace ex2 {
+struct A {
+  int x;
+  struct B {
+    int i;
+    int j;
+  } b;
+};
+
+constexpr A a = { 1, { 2, 3 } };
+static_assert(a.x == 1);
+static_assert(a.b.i == 2);
+static_assert(a.b.j == 3);
+
+struct base1 { int b1, b2 = 42; };
+struct base2 {
+  constexpr base2() {
+    b3 = 43;
+  }
+  int b3;
+};
+struct derived : base1, base2 {
+  int d;
+};
+
+constexpr derived d1{{1, 2}, {}, 4};
+static_assert(d1.b1 == 1);
+static_assert(d1.b2 == 2);
+static_assert(d1.b3 == 43);
+static_assert(d1.d == 4);
+
+constexpr derived d2{{}, {}, 4};
+static_assert(d2.b1 == 0);
+static_assert(d2.b2 == 42);
+static_assert(d2.b3 == 43);
+static_assert(d2.d == 4);
+} // namespace ex2
+
+namespace ex3 {
+struct S {
+  int a;
+  const char* b;
+  int c;
+  int d = b[a];
+};
+
+constexpr S ss = { 1, "asdf" };
+static_assert(ss.a == 1);
+static_assert(__builtin_strcmp(ss.b, "asdf") == 0);
+static_assert(ss.c == int{});
+static_assert(ss.d == ss.b[ss.a]);
+
+struct string {
+  int d = 43;
+};
+
+struct A {
+  string a;
+  int b = 42;
+  int c = -1;
+};
+
+constexpr A a{.c = 21};
+static_assert(a.a.d == string{}.d);
+static_assert(a.b == 42);
+static_assert(a.c == 21);
+} // namespace ex3
+
+namespace ex4 {
+int x[] = { 1, 3, 5 };
+static_assert(sizeof(x) / sizeof(int) == 3);
+} // namespace ex4
+
+namespace ex5 {
+struct X { int i, j, k; };
+
+constexpr X a[] = { 1, 2, 3, 4, 5, 6 };
+constexpr X b[2] = { { 1, 2, 3 }, { 4, 5, 6 } };
+static_assert(sizeof(a) == sizeof(b));
+static_assert(a[0].i == b[0].i);
+static_assert(a[0].j == b[0].j);
+static_assert(a[0].k == b[0].k);
+static_assert(a[1].i == b[1].i);
+static_assert(a[1].j == b[1].j);
+static_assert(a[1].k == b[1].k);
+} // namespace ex5
+
+namespace ex6 {
+struct S {
+  int y[] = { 0 };
+  // expected-error@-1 {{array bound cannot be deduced from a default member initializer}}
+};
+} // namespace ex6
+
+namespace ex7 {
+struct A {
+  int i;
+  static int s;
+  int j;
+  int :17;
+  int k;
+};
+
+constexpr A a = { 1, 2, 3 };
+static_assert(a.i == 1);
+static_assert(a.j == 2);
+static_assert(a.k == 3);
+} // namespace ex7
+
+namespace ex8 {
+struct A;
+extern A a;
+struct A {
+  const A& a1 { A{a,a} };
+  const A& a2 { A{} };
+  // expected-error@-1 {{default member initializer for 'a2' needed within definition of enclosing class 'A' outside of member functions}}
+  //   expected-note@-2 {{default member initializer declared here}}
+};
+A a{a,a};
+
+struct B {
+  int n = B{}.n;
+  // expected-error@-1 {{default member initializer for 'n' needed within definition of enclosing class 'B' outside of member functions}}
+  //   expected-note@-2 {{default member initializer declared here}}
+};
+} // namespace ex8
+
+namespace ex9 {
+constexpr int x[2][2] = { 3, 1, 4, 2 };
+static_assert(x[0][0] == 3);
+static_assert(x[0][1] == 1);
+static_assert(x[1][0] == 4);
+static_assert(x[1][1] == 2);
+
+constexpr float y[4][3] = {
+  { 1 }, { 2 }, { 3 }, { 4 }
+};
+static_assert(y[0][0] == 1);
+static_assert(y[0][1] == 0);
+static_assert(y[0][2] == 0);
+static_assert(y[1][0] == 2);
+static_assert(y[1][1] == 0);
+static_assert(y[1][2] == 0);
+static_assert(y[2][0] == 3);
+static_assert(y[2][1] == 0);
+static_assert(y[2][2] == 0);
+static_assert(y[3][0] == 4);
+static_assert(y[3][1] == 0);
+static_assert(y[3][2] == 0);
+} // namespace ex9
+
+namespace ex10 {
+struct S1 { int a, b; };
+struct S2 { S1 s, t; };
+
+constexpr S2 x[2] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+constexpr S2 y[2] = {
+  {
+    { 1, 2 },
+    { 3, 4 }
+  },
+  {
+    { 5, 6 },
+    { 7, 8 }
+  }
+};
+static_assert(x[0].s.a == 1);
+static_assert(x[0].s.b == 2);
+static_assert(x[0].t.a == 3);
+static_assert(x[0].t.b == 4);
+static_assert(x[1].s.a == 5);
+static_assert(x[1].s.b == 6);
+static_assert(x[1].t.a == 7);
+static_assert(x[1].t.b == 8);
+} // namespace ex10
+
+namespace ex11 {
+char cv[4] = { 'a', 's', 'd', 'f', 0 };
+// expected-error@-1 {{excess elements in array initializer}}
+} // namespace ex11
+
+namespace ex12 {
+constexpr float y[4][3] = {
+  { 1, 3, 5 },
+  { 2, 4, 6 },
+  { 3, 5, 7 },
+};
+static_assert(y[0][0] == 1);
+static_assert(y[0][1] == 3);
+static_assert(y[0][2] == 5);
+static_assert(y[1][0] == 2);
+static_assert(y[1][1] == 4);
+static_assert(y[1][2] == 6);
+static_assert(y[2][0] == 3);
+static_assert(y[2][1] == 5);
+static_assert(y[2][2] == 7);
+static_assert(y[3][0] == 0.0);
+static_assert(y[3][1] == 0.0);
+static_assert(y[3][2] == 0.0);
+
+constexpr float z[4][3] = {
+  1, 3, 5, 2, 4, 6, 3, 5, 7
+};
+static_assert(z[0][0] == 1);
+static_assert(z[0][1] == 3);
+static_assert(z[0][2] == 5);
+static_assert(z[1][0] == 2);
+static_assert(z[1][1] == 4);
+static_assert(z[1][2] == 6);
+static_assert(z[2][0] == 3);
+static_assert(z[2][1] == 5);
+static_assert(z[2][2] == 7);
+static_assert(z[3][0] == 0.0);
+static_assert(z[3][1] == 0.0);
+static_assert(z[3][2] == 0.0);
+} // namespace ex12
+
+namespace ex13 {
+struct S { } s;
+struct A {
+  S s1;
+  int i1;
+  S s2;
+  int i2;
+  S s3;
+  int i3;
+} a = {
+  { },              // Required initialization
+  0,
+  s,                // Required initialization
+  0
+};                  // Initialization not required for A::s3 because A::i3 is also not initialized
+} // namespace ex13
+
+namespace ex14 {
+struct A {
+  int i;
+  constexpr operator int() const { return 42; };
+};
+struct B {
+  A a1, a2;
+  int z;
+};
+constexpr A a{};
+constexpr B b = { 4, a, a };
+static_assert(b.a1.i == 4);
+static_assert(b.a2.i == a.i);
+static_assert(b.z == a.operator int());
+} // namespace ex14
+
+namespace ex15 {
+union u { // #ex15-u
+  int a;
+  const char* b;
+};
+
+u a = { 1 };
+u b = a;
+u c = 1;
+// expected-error@-1 {{no viable conversion from 'int' to 'u'}}
+//   expected-note@#ex15-u {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const u &' for 1st argument}}
+//   expected-note@#ex15-u {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int' to 'u &&' for 1st argument}}
+u d = { 0, "asdf" };
+// expected-error@-1 {{excess elements in union initializer}}
+u e = { "asdf" };
+// expected-error@-1 {{cannot initialize a member subobject of type 'int' with an lvalue of type 'const char[5]'}}
+u f = { .b = "asdf" };
+u g = {
+  .a = 1, // #ex15-g-a
+  .b = "asdf"
+  // expected-error@-1 {{initializer partially overrides prior initialization of this subobject}}
+  //   expected-note@#ex15-g-a {{previous initialization is here}}
+};
+} // namespace ex15
diff --git a/clang/test/CXX/drs/dr0xx.cpp b/clang/test/CXX/drs/cwg0xx.cpp
index 6c600bbc7c3f..6c600bbc7c3f 100644
--- a/clang/test/CXX/drs/dr0xx.cpp
+++ b/clang/test/CXX/drs/cwg0xx.cpp
diff --git a/clang/test/CXX/drs/dr10xx.cpp b/clang/test/CXX/drs/cwg10xx.cpp
index 58d552942c77..58d552942c77 100644
--- a/clang/test/CXX/drs/dr10xx.cpp
+++ b/clang/test/CXX/drs/cwg10xx.cpp
diff --git a/clang/test/CXX/drs/dr11xx.cpp b/clang/test/CXX/drs/cwg11xx.cpp
index 46a0e526be39..46a0e526be39 100644
--- a/clang/test/CXX/drs/dr11xx.cpp
+++ b/clang/test/CXX/drs/cwg11xx.cpp
diff --git a/clang/test/CXX/drs/dr12xx.cpp b/clang/test/CXX/drs/cwg12xx.cpp
index cdfbc6d67265..cdfbc6d67265 100644
--- a/clang/test/CXX/drs/dr12xx.cpp
+++ b/clang/test/CXX/drs/cwg12xx.cpp
diff --git a/clang/test/CXX/drs/dr13xx.cpp b/clang/test/CXX/drs/cwg13xx.cpp
index dad82c4e2829..a334b6d01acf 100644
--- a/clang/test/CXX/drs/dr13xx.cpp
+++ b/clang/test/CXX/drs/cwg13xx.cpp
@@ -281,13 +281,10 @@ namespace cwg1330 { // cwg1330: 4 c++11
   decltype(f<char>()) f2; // #cwg1330-f-char
   bool f3 = noexcept(f<float>()); /// #cwg1330-f-float
 #endif
-  // In C++17 onwards, substituting explicit template arguments into the
-  // function type substitutes into the exception specification (because it's
-  // part of the type). In earlier languages, we don't notice there's a problem
-  // until we've already started to instantiate.
   template int f<short>(); // #cwg1330-f-short
-  // since-cxx17-error@-1 {{explicit instantiation of 'f' does not refer to a function template, variable template, member function, member class, or static data member}}
-  //   since-cxx17-note@#cwg1330-f {{candidate template ignored: substitution failure [with T = short]: type 'short' cannot be used prior to '::' because it has no members}}
+  // since-cxx17-error@#cwg1330-f {{type 'short' cannot be used prior to '::' because it has no members}}
+  //   since-cxx17-note@#cwg1330-f {{in instantiation of exception specification for 'f<short>' requested here}}
+  //   since-cxx17-note@#cwg1330-f-short {{in instantiation of function template specialization 'cwg1330::f<short>' requested here}}
 
   template<typename T> struct C {
     C() throw(typename T::type); // #cwg1330-C
@@ -500,7 +497,7 @@ namespace cwg1359 { // cwg1359: 3.5
   union B { constexpr B() = default; int a; }; // #cwg1359-B
   // cxx11-17-error@-1 {{defaulted definition of default constructor cannot be marked constexpr before C++23}}
   union C { constexpr C() = default; int a, b; }; // #cwg1359-C
-  // cxx11-17-error@-1 {{defaulted definition of default constructor cannot be marked constexpr}} 
+  // cxx11-17-error@-1 {{defaulted definition of default constructor cannot be marked constexpr}}
   struct X { constexpr X() = default; union {}; };
   // since-cxx11-error@-1 {{declaration does not declare anything}}
   struct Y { constexpr Y() = default; union { int a; }; }; // #cwg1359-Y
@@ -720,7 +717,7 @@ struct A {
 } // namespace cwg1397
 
 namespace cwg1399 { // cwg1399: dup 1388
-  template<typename ...T> void f(T..., int, T...) {} // #cwg1399-f 
+  template<typename ...T> void f(T..., int, T...) {} // #cwg1399-f
   // cxx98-error@-1 {{variadic templates are a C++11 extension}}
   void g() {
     f(0);
diff --git a/clang/test/CXX/drs/dr14xx.cpp b/clang/test/CXX/drs/cwg14xx.cpp
index 9ff9a68dc13c..9ff9a68dc13c 100644
--- a/clang/test/CXX/drs/dr14xx.cpp
+++ b/clang/test/CXX/drs/cwg14xx.cpp
diff --git a/clang/test/CXX/drs/dr15xx.cpp b/clang/test/CXX/drs/cwg15xx.cpp
index 21a392a5141e..21a392a5141e 100644
--- a/clang/test/CXX/drs/dr15xx.cpp
+++ b/clang/test/CXX/drs/cwg15xx.cpp
diff --git a/clang/test/CXX/drs/dr16xx.cpp b/clang/test/CXX/drs/cwg16xx.cpp
index cf6b45ceabf2..82ef871939d2 100644
--- a/clang/test/CXX/drs/dr16xx.cpp
+++ b/clang/test/CXX/drs/cwg16xx.cpp
@@ -483,8 +483,6 @@ namespace cwg1696 { // cwg1696: 7
     const A &a = A(); // #cwg1696-D1-a
   };
   D1 d1 = {}; // #cwg1696-d1
-  // since-cxx14-warning@-1 {{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported; lifetime of temporary will end at the end of the full-expression}}
-  //   since-cxx14-note@#cwg1696-D1-a {{initializing field 'a' with default member initializer}}
 
   struct D2 {
     const A &a = A(); // #cwg1696-D2-a
diff --git a/clang/test/CXX/drs/dr17xx.cpp b/clang/test/CXX/drs/cwg17xx.cpp
index fb53a56923b1..fb53a56923b1 100644
--- a/clang/test/CXX/drs/dr17xx.cpp
+++ b/clang/test/CXX/drs/cwg17xx.cpp
diff --git a/clang/test/CXX/drs/dr18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp
index 3a2248a1af55..9eb749153e57 100644
--- a/clang/test/CXX/drs/dr18xx.cpp
+++ b/clang/test/CXX/drs/cwg18xx.cpp
@@ -206,22 +206,78 @@ namespace cwg1814 { // cwg1814: yes
 #endif
 }
 
-namespace cwg1815 { // cwg1815: no
+namespace cwg1815 { // cwg1815: yes
 #if __cplusplus >= 201402L
-  // FIXME: needs codegen test
-  struct A { int &&r = 0; }; // #cwg1815-A 
+  struct A { int &&r = 0; };
   A a = {};
-  // since-cxx14-warning@-1 {{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported; lifetime of temporary will end at the end of the full-expression}} FIXME
-  //   since-cxx14-note@#cwg1815-A {{initializing field 'r' with default member initializer}}
 
   struct B { int &&r = 0; }; // #cwg1815-B
   // since-cxx14-error@-1 {{reference member 'r' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}}
   //   since-cxx14-note@#cwg1815-B {{initializing field 'r' with default member initializer}}
   //   since-cxx14-note@#cwg1815-b {{in implicit default constructor for 'cwg1815::B' first required here}}
   B b; // #cwg1815-b
+
+#if __cplusplus >= 201703L
+  struct C { const int &r = 0; };
+  constexpr C c = {}; // OK, since cwg1815
+  static_assert(c.r == 0);
+
+  constexpr int f() {
+    A a = {}; // OK, since cwg1815
+    return a.r;
+  }
+  static_assert(f() == 0);
 #endif
+#endif
+}
+
+namespace cwg1820 { // cwg1820: 3.5
+typedef int A;
+typedef int cwg1820::A;
+// expected-warning@-1 {{extra qualification on member 'A'}}
+// expected-error@-2 {{typedef declarator cannot be qualified}}
+
+namespace B {
+typedef int cwg1820::A;
+// expected-error@-1 {{cannot define or redeclare 'A' here because namespace 'B' does not enclose namespace 'cwg1820'}}
+// expected-error@-2 {{typedef declarator cannot be qualified}}
+}
+
+class C1 {
+  typedef int cwg1820::A;
+  // expected-error@-1 {{non-friend class member 'A' cannot have a qualified name}}
+  // expected-error@-2 {{typedef declarator cannot be qualified}}
+};
+
+template <typename>
+class C2 {
+  typedef int cwg1820::A;
+  // expected-error@-1 {{non-friend class member 'A' cannot have a qualified name}}
+  // expected-error@-2 {{typedef declarator cannot be qualified}}
+};
+
+void d1() {
+  typedef int cwg1820::A;
+  // expected-error@-1 {{definition or redeclaration of 'A' not allowed inside a function}}
+  // expected-error@-2 {{typedef declarator cannot be qualified}}
+}
+
+template<typename>
+void d2() {
+  typedef int cwg1820::A;
+  // expected-error@-1 {{definition or redeclaration of 'A' not allowed inside a function}}
+  // expected-error@-2 {{typedef declarator cannot be qualified}}
 }
 
+#if __cplusplus >= 201103L
+auto e = [] {
+  typedef int cwg1820::A;
+  // expected-error@-1 {{definition or redeclaration of 'A' not allowed inside a function}}
+  // expected-error@-2 {{typedef declarator cannot be qualified}}
+};
+#endif
+} // namespace cwg1820
+
 namespace cwg1821 { // cwg1821: 2.9
 struct A {
   template <typename> struct B {
diff --git a/clang/test/CXX/drs/dr19xx.cpp b/clang/test/CXX/drs/cwg19xx.cpp
index 3e4f82813f51..3e4f82813f51 100644
--- a/clang/test/CXX/drs/dr19xx.cpp
+++ b/clang/test/CXX/drs/cwg19xx.cpp
diff --git a/clang/test/CXX/drs/dr1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp
index a8f9b705a986..a8f9b705a986 100644
--- a/clang/test/CXX/drs/dr1xx.cpp
+++ b/clang/test/CXX/drs/cwg1xx.cpp
diff --git a/clang/test/CXX/drs/dr20xx.cpp b/clang/test/CXX/drs/cwg20xx.cpp
index 9797097acce7..9797097acce7 100644
--- a/clang/test/CXX/drs/dr20xx.cpp
+++ b/clang/test/CXX/drs/cwg20xx.cpp
diff --git a/clang/test/CXX/drs/dr21xx.cpp b/clang/test/CXX/drs/cwg21xx.cpp
index 082deb42e4fa..082deb42e4fa 100644
--- a/clang/test/CXX/drs/dr21xx.cpp
+++ b/clang/test/CXX/drs/cwg21xx.cpp
diff --git a/clang/test/CXX/drs/dr22xx.cpp b/clang/test/CXX/drs/cwg22xx.cpp
index 797c3ed8546e..797c3ed8546e 100644
--- a/clang/test/CXX/drs/dr22xx.cpp
+++ b/clang/test/CXX/drs/cwg22xx.cpp
diff --git a/clang/test/CXX/drs/dr23xx.cpp b/clang/test/CXX/drs/cwg23xx.cpp
index db5b7c3cd3c9..db5b7c3cd3c9 100644
--- a/clang/test/CXX/drs/dr23xx.cpp
+++ b/clang/test/CXX/drs/cwg23xx.cpp
diff --git a/clang/test/CXX/drs/dr24xx.cpp b/clang/test/CXX/drs/cwg24xx.cpp
index 9f876cd87083..9f876cd87083 100644
--- a/clang/test/CXX/drs/dr24xx.cpp
+++ b/clang/test/CXX/drs/cwg24xx.cpp
diff --git a/clang/test/CXX/drs/dr25xx.cpp b/clang/test/CXX/drs/cwg25xx.cpp
index 8bca58f44944..8bca58f44944 100644
--- a/clang/test/CXX/drs/dr25xx.cpp
+++ b/clang/test/CXX/drs/cwg25xx.cpp
diff --git a/clang/test/CXX/drs/dr26xx.cpp b/clang/test/CXX/drs/cwg26xx.cpp
index f7a05b9827a2..f7a05b9827a2 100644
--- a/clang/test/CXX/drs/dr26xx.cpp
+++ b/clang/test/CXX/drs/cwg26xx.cpp
diff --git a/clang/test/CXX/drs/dr27xx.cpp b/clang/test/CXX/drs/cwg27xx.cpp
index 0434427d6c92..0434427d6c92 100644
--- a/clang/test/CXX/drs/dr27xx.cpp
+++ b/clang/test/CXX/drs/cwg27xx.cpp
diff --git a/clang/test/CXX/drs/dr28xx.cpp b/clang/test/CXX/drs/cwg28xx.cpp
index be35d366bdd6..be35d366bdd6 100644
--- a/clang/test/CXX/drs/dr28xx.cpp
+++ b/clang/test/CXX/drs/cwg28xx.cpp
diff --git a/clang/test/CXX/drs/dr2xx.cpp b/clang/test/CXX/drs/cwg2xx.cpp
index 2b3131be3305..2b3131be3305 100644
--- a/clang/test/CXX/drs/dr2xx.cpp
+++ b/clang/test/CXX/drs/cwg2xx.cpp
diff --git a/clang/test/CXX/drs/dr3xx.cpp b/clang/test/CXX/drs/cwg3xx.cpp
index 94227dc031c6..94227dc031c6 100644
--- a/clang/test/CXX/drs/dr3xx.cpp
+++ b/clang/test/CXX/drs/cwg3xx.cpp
diff --git a/clang/test/CXX/drs/dr4xx.cpp b/clang/test/CXX/drs/cwg4xx.cpp
index 07162cc28f6b..07162cc28f6b 100644
--- a/clang/test/CXX/drs/dr4xx.cpp
+++ b/clang/test/CXX/drs/cwg4xx.cpp
diff --git a/clang/test/CXX/drs/dr5xx.cpp b/clang/test/CXX/drs/cwg5xx.cpp
index 9d890f981348..9d890f981348 100644
--- a/clang/test/CXX/drs/dr5xx.cpp
+++ b/clang/test/CXX/drs/cwg5xx.cpp
diff --git a/clang/test/CXX/drs/dr6xx.cpp b/clang/test/CXX/drs/cwg6xx.cpp
index 069102d9c597..069102d9c597 100644
--- a/clang/test/CXX/drs/dr6xx.cpp
+++ b/clang/test/CXX/drs/cwg6xx.cpp
diff --git a/clang/test/CXX/drs/dr7xx.cpp b/clang/test/CXX/drs/cwg7xx.cpp
index 69ee6d6d4e6a..0300dae08d6d 100644
--- a/clang/test/CXX/drs/dr7xx.cpp
+++ b/clang/test/CXX/drs/cwg7xx.cpp
@@ -28,10 +28,8 @@ namespace cwg712 { // cwg712: partial
         use(a);
         use((a));
         use(cond ? a : a);
-        // FIXME: should only warn once
         use((cond, a));
         // expected-warning@-1 {{left operand of comma operator has no effect}}
-        // expected-warning@-2 {{left operand of comma operator has no effect}}
 
         (void)a;
         // expected-error@-1 {{reference to local variable 'a' declared in enclosing function 'cwg712::f'}} FIXME
diff --git a/clang/test/CXX/drs/dr8xx.cpp b/clang/test/CXX/drs/cwg8xx.cpp
index eba601300584..eba601300584 100644
--- a/clang/test/CXX/drs/dr8xx.cpp
+++ b/clang/test/CXX/drs/cwg8xx.cpp
diff --git a/clang/test/CXX/drs/dr9xx.cpp b/clang/test/CXX/drs/cwg9xx.cpp
index 8ecb149c355f..8ecb149c355f 100644
--- a/clang/test/CXX/drs/dr9xx.cpp
+++ b/clang/test/CXX/drs/cwg9xx.cpp
diff --git a/clang/test/CXX/except/except.spec/p13.cpp b/clang/test/CXX/except/except.spec/p13.cpp
index 61cdb74f21ec..29390c277c52 100644
--- a/clang/test/CXX/except/except.spec/p13.cpp
+++ b/clang/test/CXX/except/except.spec/p13.cpp
@@ -72,3 +72,30 @@ template<>
 void f(A, int***); // expected-error {{'f<A, int>' is missing exception specification 'noexcept'}}
 
 }
+
+namespace N3 {
+
+template<typename T, typename U>
+void f(T, U) noexcept(T::y); // #1
+
+template<typename T, typename U> // #2
+void f(T, U*) noexcept(T::x);
+
+// Deduction should succeed for both candidates, and #2 should be selected by overload resolution.
+// Only the exception specification of #2 should be instantiated.
+void (*x)(A, int*) = f;
+}
+
+namespace N4 {
+
+template<typename T, typename U>
+void f(T, U) noexcept(T::x); // #1
+
+template<typename T, typename U>
+void f(T, U*) noexcept(T::y); // #2
+// expected-error@-1 {{no member named 'y' in 'A'}}
+
+// Deduction should succeed for both candidates, and #2 should be selected by overload resolution.
+// Only the exception specification of #2 should be instantiated.
+void (*x)(A, int*) = f; // expected-note {{in instantiation of exception specification for 'f<A, int>' requested here}}
+}
diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.lambda/p15-star-this-capture.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.lambda/p15-star-this-capture.cpp
index bae1e25add35..69974fcb4dde 100644
--- a/clang/test/CXX/expr/expr.prim/expr.prim.lambda/p15-star-this-capture.cpp
+++ b/clang/test/CXX/expr/expr.prim/expr.prim.lambda/p15-star-this-capture.cpp
@@ -1,22 +1,22 @@
-// RUN: %clang_cc1 -fsyntax-only -std=c++1z %s -verify
-
-class NonCopyable {
-  NonCopyable(const NonCopyable&) = delete; //expected-note3{{explicitly marked deleted here}}
-  int x = 10;
-  void foo() {
-    auto L = [this] { return x; };
-    const auto &M = [*this] { return x; };//expected-error{{call to deleted}}
-    const auto &M2 = [this] () -> auto&& {
-      ++x;
-      return [*this] {  //expected-error{{call to deleted}} expected-warning{{reference to local}}
-         return ++x; //expected-error{{read-only}}
-      }; 
-    };
-    const auto &M3 = [*this] () mutable -> auto&& { //expected-error{{call to deleted}} 
-      ++x;
-      return [this] {  // expected-warning{{reference to local}}
-         return x;
-      }; 
-    };
-  }  
-};
+// RUN: %clang_cc1 -fsyntax-only -std=c++1z %s -verify
+
+class NonCopyable {
+  NonCopyable(const NonCopyable&) = delete; //expected-note3{{explicitly marked deleted here}}
+  int x = 10;
+  void foo() {
+    auto L = [this] { return x; };
+    const auto &M = [*this] { return x; };//expected-error{{call to deleted}}
+    const auto &M2 = [this] () -> auto&& {
+      ++x;
+      return [*this] {  //expected-error{{call to deleted}} expected-warning{{reference to local}}
+         return ++x; //expected-error{{read-only}}
+      };
+    };
+    const auto &M3 = [*this] () mutable -> auto&& { //expected-error{{call to deleted}}
+      ++x;
+      return [this] {  // expected-warning{{reference to local}}
+         return x;
+      };
+    };
+  }
+};
diff --git a/clang/test/CXX/expr/expr.unary/expr.unary.noexcept/cg.cpp b/clang/test/CXX/expr/expr.unary/expr.unary.noexcept/cg.cpp
index e299705a4c11..043169e822fa 100644
--- a/clang/test/CXX/expr/expr.unary/expr.unary.noexcept/cg.cpp
+++ b/clang/test/CXX/expr/expr.unary/expr.unary.noexcept/cg.cpp
@@ -1,7 +1,7 @@
 // REQUIRES: x86-registered-target
-// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple x86_64-apple-darwin10 -S -emit-llvm -std=c++11 -include %S/ser.h %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple x86_64-apple-darwin10 -emit-llvm -std=c++11 -include %S/ser.h %s -o - | FileCheck %s
 // RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple x86_64-apple-darwin10 -emit-pch -o %t-ser.pch -std=c++11 -x c++ %S/ser.h
-// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple x86_64-apple-darwin10 -S -emit-llvm -std=c++11 -include-pch %t-ser.pch %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple x86_64-apple-darwin10 -emit-llvm -std=c++11 -include-pch %t-ser.pch %s -o - | FileCheck %s
 
 struct D {
   ~D() throw();
diff --git a/clang/test/CXX/expr/expr.unary/expr.unary.op/p4.cpp b/clang/test/CXX/expr/expr.unary/expr.unary.op/p4.cpp
index 537d676738be..162d59439d08 100644
--- a/clang/test/CXX/expr/expr.unary/expr.unary.op/p4.cpp
+++ b/clang/test/CXX/expr/expr.unary/expr.unary.op/p4.cpp
@@ -41,3 +41,20 @@ namespace test2 {
     int (A::*ptr)(int) = &(A::foo); // expected-error {{cannot create a non-constant pointer to member function}}
   }
 }
+
+namespace GH40906 {
+  struct A {
+    int val;
+    void func() {}
+  };
+
+  void test() {
+    decltype(&(A::val)) ptr1; // expected-error {{cannot form pointer to member from a parenthesized expression; did you mean to remove the parentheses?}}
+    int A::* ptr2 = &(A::val); // expected-error {{invalid use of non-static data member 'val'}}
+
+    // FIXME: Error messages in these cases are less than clear, we can do
+    // better.
+    int size = sizeof(&(A::func)); // expected-error {{call to non-static member function without an object argument}}
+    void (A::* ptr3)() = &(A::func); // expected-error {{call to non-static member function without an object argument}}
+  }
+}
diff --git a/clang/test/CXX/lex/lex.literal/lex.string/p4.cpp b/clang/test/CXX/lex/lex.literal/lex.string/p4.cpp
index f8561ba17bcf..b73c8ed711a9 100644
--- a/clang/test/CXX/lex/lex.literal/lex.string/p4.cpp
+++ b/clang/test/CXX/lex/lex.literal/lex.string/p4.cpp
@@ -1,17 +1,17 @@
-// RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify %s
-// expected-no-diagnostics
-
-// NOTE: This file intentionally uses DOS-style line endings to test
-// that we don't propagate them into string literals as per [lex.string]p4.
-
-constexpr const char* p = R"(a\
-b
-c)";
-
-static_assert(p[0] == 'a',  "");
-static_assert(p[1] == '\\', "");
-static_assert(p[2] == '\n', "");
-static_assert(p[3] == 'b',  "");
-static_assert(p[4] == '\n', "");
-static_assert(p[5] == 'c',  "");
-static_assert(p[6] == '\0', "");
+// RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify %s
+// expected-no-diagnostics
+
+// NOTE: This file intentionally uses DOS-style line endings to test
+// that we don't propagate them into string literals as per [lex.string]p4.
+
+constexpr const char* p = R"(a\
+b
+c)";
+
+static_assert(p[0] == 'a',  "");
+static_assert(p[1] == '\\', "");
+static_assert(p[2] == '\n', "");
+static_assert(p[3] == 'b',  "");
+static_assert(p[4] == '\n', "");
+static_assert(p[5] == 'c',  "");
+static_assert(p[6] == '\0', "");
diff --git a/clang/test/CXX/module/module.private.frag/p1.cpp b/clang/test/CXX/module/module.private.frag/p1.cpp
index af3d52f3bd32..ff89140b42b2 100644
--- a/clang/test/CXX/module/module.private.frag/p1.cpp
+++ b/clang/test/CXX/module/module.private.frag/p1.cpp
@@ -2,9 +2,9 @@
 // RUN: mkdir %t
 // RUN: split-file %s %t
 
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/parta.cppm -o %t/mod-parta.pcm -fsyntax-only -verify
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/impl.cppm -o %t/mod-impl.pcm -fsyntax-only -verify
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/primary.cppm -o %t/mod.pcm -fsyntax-only -verify
+// RUN: %clang_cc1 -std=c++20 %t/parta.cppm -o %t/mod-parta.pcm -fsyntax-only -verify
+// RUN: %clang_cc1 -std=c++20 %t/impl.cppm -o %t/mod-impl.pcm -fsyntax-only -verify
+// RUN: %clang_cc1 -std=c++20 %t/primary.cppm -o %t/mod.pcm -fsyntax-only -verify
 
 //--- parta.cppm
 export module mod:parta;
diff --git a/clang/test/CXX/special/class.temporary/p6.cpp b/clang/test/CXX/special/class.temporary/p6.cpp
index 5554363cc69a..a6d2adfd1fd2 100644
--- a/clang/test/CXX/special/class.temporary/p6.cpp
+++ b/clang/test/CXX/special/class.temporary/p6.cpp
@@ -269,6 +269,40 @@ void init_capture_init_list() {
   // CHECK: }
 }
 
+void check_dr1815() { // dr1815: yes
+#if __cplusplus >= 201402L
+
+  struct A {
+    int &&r = 0;
+    ~A() {}
+  };
+
+  struct B {
+    A &&a = A{};
+    ~B() {}
+  };
+  B a = {};
+  
+  // CHECK: call {{.*}}block_scope_begin_function
+  extern void block_scope_begin_function();
+  extern void block_scope_end_function();
+  block_scope_begin_function();
+  {
+    // CHECK: call void @_ZZ12check_dr1815vEN1BD1Ev
+    // CHECK: call void @_ZZ12check_dr1815vEN1AD1Ev
+    B b = {};
+  }
+  // CHECK: call {{.*}}block_scope_end_function
+  block_scope_end_function();
+
+  // CHECK: call {{.*}}some_other_function
+  extern void some_other_function();
+  some_other_function();
+  // CHECK: call void @_ZZ12check_dr1815vEN1BD1Ev
+  // CHECK: call void @_ZZ12check_dr1815vEN1AD1Ev
+#endif
+}
+
 namespace P2718R0 {
 namespace basic {
 template <typename E> using T2 = std::list<E>;
diff --git a/clang/test/CXX/temp/temp.arg/temp.arg.template/p3-2a.cpp b/clang/test/CXX/temp/temp.arg/temp.arg.template/p3-2a.cpp
index f58606963861..342ffba53dbf 100644
--- a/clang/test/CXX/temp/temp.arg/temp.arg.template/p3-2a.cpp
+++ b/clang/test/CXX/temp/temp.arg/temp.arg.template/p3-2a.cpp
@@ -1,4 +1,4 @@
-// RUN:  %clang_cc1 -std=c++2a -frelaxed-template-template-args -verify %s
+// RUN:  %clang_cc1 -std=c++2a -verify %s
 
 template<typename T> concept C = T::f(); // #C
 template<typename T> concept D = C<T> && T::g();
diff --git a/clang/test/CXX/temp/temp.deduct/p7.cpp b/clang/test/CXX/temp/temp.deduct/p7.cpp
new file mode 100644
index 000000000000..cf6d17fc51ac
--- /dev/null
+++ b/clang/test/CXX/temp/temp.deduct/p7.cpp
@@ -0,0 +1,14 @@
+// RUN:  %clang_cc1 -verify %s
+
+struct A {
+  static constexpr bool x = true;
+};
+
+template<typename T, typename U>
+void f(T, U) noexcept(T::x);
+
+template<typename T, typename U>
+void f(T, U*) noexcept(T::y); // expected-error {{no member named 'y' in 'A'}}
+
+template<>
+void f<A>(A, int*); // expected-note {{in instantiation of exception specification}}
diff --git a/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp b/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp
index 0f24d716a7b7..3ca7c6c7eb8e 100644
--- a/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp
+++ b/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp
@@ -453,6 +453,36 @@ namespace N3 {
       this->A::operator=(*this);
     }
   };
+
+  template<typename T>
+  struct C {
+    template<typename U>
+    void operator=(int);
+
+    void not_instantiated() {
+      operator=<int>(0);
+      C::operator=<int>(0);
+      this->operator=<int>(0);
+      this->C::operator=<int>(0);
+
+      operator=(*this);
+      C::operator=(*this);
+      this->operator=(*this);
+      this->C::operator=(*this);
+    }
+  };
+
+  template<typename T>
+  struct D {
+    auto not_instantiated() -> decltype(operator=(0)); // expected-error {{use of undeclared 'operator='}}
+  };
+
+  template<typename T>
+  struct E {
+    auto instantiated(E& e) -> decltype(operator=(e)); // expected-error {{use of undeclared 'operator='}}
+  };
+
+  template struct E<int>; // expected-note {{in instantiation of template class 'N3::E<int>' requested here}}
 } // namespace N3
 
 namespace N4 {
@@ -509,6 +539,17 @@ namespace N4 {
       a->y;
       a->f();
       a->g();
+
+      a->T::x;
+      a->T::y;
+      a->T::f();
+      a->T::g();
+
+      // FIXME: 'U' should be a dependent name, and its lookup context should be 'a.operator->()'!
+      a->U::x; // expected-error {{use of undeclared identifier 'U'}}
+      a->U::y; // expected-error {{use of undeclared identifier 'U'}}
+      a->U::f(); // expected-error {{use of undeclared identifier 'U'}}
+      a->U::g(); // expected-error {{use of undeclared identifier 'U'}}
     }
 
     void instantiated(D a) {
@@ -516,8 +557,25 @@ namespace N4 {
       a->y; // expected-error {{no member named 'y' in 'N4::B'}}
       a->f();
       a->g(); // expected-error {{no member named 'g' in 'N4::B'}}
+
+      a->T::x;
+      a->T::y; // expected-error {{no member named 'y' in 'N4::B'}}
+      a->T::f();
+      a->T::g(); // expected-error {{no member named 'g' in 'N4::B'}}
     }
   };
 
   template void D<B>::instantiated(D); // expected-note {{in instantiation of}}
+
+  template<typename T>
+  struct Typo {
+    T *operator->();
+
+    void not_instantiated(Typo a) {
+      a->Not_instantiated;
+      a->typo;
+      a->T::Not_instantiated;
+      a->T::typo;
+    }
+  };
 } // namespace N4
diff --git a/clang/test/CXX/temp/temp.spec/temp.expl.spec/p14-23.cpp b/clang/test/CXX/temp/temp.spec/temp.expl.spec/p14-23.cpp
new file mode 100644
index 000000000000..dc17cea99d43
--- /dev/null
+++ b/clang/test/CXX/temp/temp.spec/temp.expl.spec/p14-23.cpp
@@ -0,0 +1,60 @@
+// RUN: %clang_cc1 -std=c++20 -verify %s
+
+template<int I>
+concept C = I >= 4;
+
+template<int I>
+concept D = I < 8;
+
+template<int I>
+struct A {
+  constexpr static int f() { return 0; }
+  constexpr static int f() requires C<I> && D<I> { return 1; }
+  constexpr static int f() requires C<I> { return 2; }
+
+  constexpr static int g() requires C<I> { return 0; } // #candidate-0
+  constexpr static int g() requires D<I> { return 1; } // #candidate-1
+
+  constexpr static int h() requires C<I> { return 0; } // expected-note {{member declaration nearly matches}}
+};
+
+template<>
+constexpr int A<2>::f() { return 3; }
+
+template<>
+constexpr int A<4>::f() { return 4; }
+
+template<>
+constexpr int A<8>::f() { return 5; }
+
+static_assert(A<3>::f() == 0);
+static_assert(A<5>::f() == 1);
+static_assert(A<9>::f() == 2);
+static_assert(A<2>::f() == 3);
+static_assert(A<4>::f() == 4);
+static_assert(A<8>::f() == 5);
+
+template<>
+constexpr int A<0>::g() { return 2; }
+
+template<>
+constexpr int A<8>::g() { return 3; }
+
+template<>
+constexpr int A<6>::g() { return 4; } // expected-error {{ambiguous member function specialization 'A<6>::g' of 'A::g'}}
+                                      // expected-note@#candidate-0 {{member function specialization matches 'g'}}
+                                      // expected-note@#candidate-1 {{member function specialization matches 'g'}}
+
+static_assert(A<9>::g() == 0);
+static_assert(A<1>::g() == 1);
+static_assert(A<0>::g() == 2);
+static_assert(A<8>::g() == 3);
+
+template<>
+constexpr int A<4>::h() { return 1; }
+
+template<>
+constexpr int A<0>::h() { return 2; } // expected-error {{out-of-line definition of 'h' does not match any declaration in 'A<0>'}}
+
+static_assert(A<5>::h() == 0);
+static_assert(A<4>::h() == 1);
diff --git a/clang/test/CXX/temp/temp.spec/temp.expl.spec/p8.cpp b/clang/test/CXX/temp/temp.spec/temp.expl.spec/p8.cpp
new file mode 100644
index 000000000000..87e10d10e4b4
--- /dev/null
+++ b/clang/test/CXX/temp/temp.spec/temp.expl.spec/p8.cpp
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+template<typename T>
+concept C = sizeof(T) <= sizeof(long);
+
+template<typename T>
+struct A {
+  template<typename U>
+  void f(U) requires C<U>;
+
+  void g() requires C<T>;
+
+  template<typename U>
+  void h(U) requires C<T>;
+
+  constexpr int i() requires C<T> {
+    return 0;
+  }
+
+  constexpr int i() requires C<T> && true {
+    return 1;
+  }
+
+  template<>
+  void f(char);
+};
+
+template<>
+template<typename U>
+void A<short>::f(U) requires C<U>;
+
+template<>
+template<typename U>
+void A<short>::h(U) requires C<short>;
+
+template<>
+template<>
+void A<int>::f(int);
+
+template<>
+void A<long>::g();
+
+template<>
+constexpr int A<long>::i() {
+  return 2;
+}
+
+static_assert(A<long>().i() == 2);
+
+template<typename T>
+struct D {
+  template<typename U>
+  static constexpr int f(U);
+
+  template<typename U>
+  static constexpr int f(U) requires (sizeof(T) == 1);
+
+  template<>
+  constexpr int f(int) {
+    return 1;
+  }
+};
+
+template<>
+template<typename U>
+constexpr int D<signed char>::f(U) requires (sizeof(signed char) == 1) {
+  return 0;
+}
+
+static_assert(D<char>::f(0) == 1);
+static_assert(D<char[2]>::f(0) == 1);
+static_assert(D<signed char>::f(0) == 1);
+static_assert(D<signed char>::f(0.0) == 0);
diff --git a/clang/test/CodeGen/2010-08-10-DbgConstant.c b/clang/test/CodeGen/2010-08-10-DbgConstant.c
index de3ff8ffe8bd..7220f3f8a15b 100644
--- a/clang/test/CodeGen/2010-08-10-DbgConstant.c
+++ b/clang/test/CodeGen/2010-08-10-DbgConstant.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -emit-llvm -debug-info-kind=limited  %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited  %s -o - | FileCheck %s
 // CHECK: !DIGlobalVariableExpression(var: [[VAR:.*]], expr: !DIExpression(DW_OP_constu, 201, DW_OP_stack_value))
 
 static const unsigned int ro = 201;
diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la32-error.c b/clang/test/CodeGen/LoongArch/intrinsic-la32-error.c
index 026a2db00889..90aa35a32e2c 100644
--- a/clang/test/CodeGen/LoongArch/intrinsic-la32-error.c
+++ b/clang/test/CodeGen/LoongArch/intrinsic-la32-error.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple loongarch32 -emit-llvm -S -verify %s -o /dev/null
+// RUN: %clang_cc1 -triple loongarch32 -emit-llvm -verify %s -o /dev/null
 // RUN: not %clang_cc1 -triple loongarch32 -DFEATURE_CHECK -emit-llvm %s -o /dev/null 2>&1 \
 // RUN:   | FileCheck %s
 
diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c b/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c
index a3242dfd41e9..3b33d954f8e4 100644
--- a/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c
+++ b/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple loongarch64 -emit-llvm -S -verify %s -o /dev/null
+// RUN: %clang_cc1 -triple loongarch64 -emit-llvm -verify %s -o /dev/null
 // RUN: not %clang_cc1 -triple loongarch64 -DFEATURE_CHECK -emit-llvm %s -o /dev/null 2>&1 \
 // RUN:   | FileCheck %s
 
diff --git a/clang/test/CodeGen/PR32874.c b/clang/test/CodeGen/PR32874.c
index ff05ad1f4aea..234eebcbe457 100644
--- a/clang/test/CodeGen/PR32874.c
+++ b/clang/test/CodeGen/PR32874.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -x c -S -emit-llvm -o - -triple x86_64-apple-darwin10 %s \
+// RUN: %clang_cc1 -x c -emit-llvm -o - -triple x86_64-apple-darwin10 %s \
 // RUN:   -w -fsanitize=signed-integer-overflow,unsigned-integer-overflow,integer-divide-by-zero,float-divide-by-zero \
 // RUN:   | FileCheck %s
 
diff --git a/clang/test/CodeGen/PR44896.ll b/clang/test/CodeGen/PR44896.ll
index b155bfcb8293..8c000d04b735 100644
--- a/clang/test/CodeGen/PR44896.ll
+++ b/clang/test/CodeGen/PR44896.ll
@@ -1,6 +1,6 @@
 ; RUN: %clang -fdiscard-value-names -S %s -o /dev/null 2>&1 | FileCheck --check-prefix=WARNING %s
 ; RUN: %clang -S %s -o /dev/null 2>&1 | FileCheck --check-prefix=NOWARNING %s
-; RUN: %clang_cc1 -S -emit-llvm %s -discard-value-names -o /dev/null
+; RUN: %clang_cc1 -emit-llvm %s -discard-value-names -o /dev/null
 ; PR 44896
 
 ; WARNING: ignoring -fdiscard-value-names for LLVM Bitcode
diff --git a/clang/test/CodeGen/PowerPC/aix_alloca_align.c b/clang/test/CodeGen/PowerPC/aix_alloca_align.c
index 61021f0a05a4..411787de6659 100644
--- a/clang/test/CodeGen/PowerPC/aix_alloca_align.c
+++ b/clang/test/CodeGen/PowerPC/aix_alloca_align.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple=powerpc-ibm-aix-xcoff -S -emit-llvm < %s | \
+// RUN: %clang_cc1 -triple=powerpc-ibm-aix-xcoff -emit-llvm < %s | \
 // RUN:   FileCheck --check-prefix=32BIT %s
 
-// RUN: %clang_cc1 -triple=powerpc64-ibm-aix-xcoff -S -emit-llvm < %s | \
+// RUN: %clang_cc1 -triple=powerpc64-ibm-aix-xcoff -emit-llvm < %s | \
 // RUN:   FileCheck --check-prefix=64BIT %s
 
 typedef __SIZE_TYPE__ size_t;
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c b/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c
index b4f6fa0471aa..838db02415fe 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c
@@ -11,7 +11,7 @@
 // RUN: -S -ffp-exception-behavior=strict \
 // RUN: -o - %s | FileCheck --check-prefix=CHECK-ASM \
 // RUN: --check-prefix=FIXME-CHECK  %s
-// RUN: %clang_cc1 -triple powerpcspe -S -ffp-exception-behavior=strict \
+// RUN: %clang_cc1 -triple powerpcspe -ffp-exception-behavior=strict \
 // RUN: -target-feature +vsx -fexperimental-strict-floating-point -emit-llvm \
 // RUN: %s -o - | FileCheck --check-prefix=CHECK-CONSTRAINED %s
 
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-vec-ins-error.c b/clang/test/CodeGen/PowerPC/builtins-ppc-vec-ins-error.c
index 485ef84df086..8f275db876b6 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-vec-ins-error.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-vec-ins-error.c
@@ -1,16 +1,16 @@
 // REQUIRES: powerpc-registered-target
 
-// RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 -fsyntax-only \
+// RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 \
 // RUN:   -triple powerpc64le-unknown-unknown -emit-llvm-only -ferror-limit 10 %s -verify -D __TEST_ELT_SI
-// RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 -fsyntax-only \
+// RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 \
 // RUN:   -triple powerpc64-unknown-unknown -emit-llvm-only -ferror-limit 10 %s -verify -D __TEST_ELT_F
-// RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 -fsyntax-only \
+// RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 \
 // RUN:   -triple powerpc64le-unknown-unknown -emit-llvm-only -ferror-limit 10 %s -verify -D __TEST_ELT_SLL
-// RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 -fsyntax-only \
+// RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 \
 // RUN:   -triple powerpc64-unknown-unknown -emit-llvm-only -ferror-limit 10 %s -verify -D __TEST_ELT_D
-// RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 -fsyntax-only \
+// RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 \
 // RUN:   -triple powerpc64le-unknown-unknown -emit-llvm-only -ferror-limit 10 %s -verify -D __TEST_UNALIGNED_UI
-// RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 -fsyntax-only \
+// RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 \
 // RUN:   -triple powerpc64-unknown-unknown -emit-llvm-only -ferror-limit 10 %s -verify
 
 #include <altivec.h>
diff --git a/clang/test/CodeGen/PowerPC/ibm128-cast.c b/clang/test/CodeGen/PowerPC/ibm128-cast.c
index d744ba7a8fc8..0a859c5819c1 100644
--- a/clang/test/CodeGen/PowerPC/ibm128-cast.c
+++ b/clang/test/CodeGen/PowerPC/ibm128-cast.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -emit-llvm -triple powerpc64le-unknown-unknown -verify \
+// RUN: %clang_cc1 -triple powerpc64le-unknown-unknown -verify \
 // RUN:   -target-feature +float128 -mabi=ieeelongdouble -fsyntax-only -Wno-unused %s
-// RUN: %clang_cc1 -emit-llvm -triple powerpc64le-unknown-unknown -verify \
+// RUN: %clang_cc1 -triple powerpc64le-unknown-unknown -verify \
 // RUN:   -target-feature +float128 -fsyntax-only -Wno-unused %s
 
 __float128 cast1(__ibm128 x) { return x; } // expected-error {{returning '__ibm128' from a function with incompatible result type '__float128'}}
diff --git a/clang/test/CodeGen/PowerPC/toc-data-attribute.c b/clang/test/CodeGen/PowerPC/toc-data-attribute.c
index db23d74759ee..d3a2d43b064f 100644
--- a/clang/test/CodeGen/PowerPC/toc-data-attribute.c
+++ b/clang/test/CodeGen/PowerPC/toc-data-attribute.c
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -S -mtocdata=f,g,h,i,j,k,l,m,n,o,p -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,CHECK32 --match-full-lines
-// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -S -mtocdata -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,CHECK32 --match-full-lines
+// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -mtocdata=f,g,h,i,j,k,l,m,n,o,p -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,CHECK32 --match-full-lines
+// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -mtocdata -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,CHECK32 --match-full-lines
 
-// RUN: %clang_cc1 %s -triple powerpc64-ibm-aix-xcoff -S -mtocdata=f,g,h,i,j,k,l,m,n,o,p -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,CHECK64 --match-full-lines
-// RUN: %clang_cc1 %s -triple powerpc64-ibm-aix-xcoff -S -mtocdata -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,CHECK64 --match-full-lines
+// RUN: %clang_cc1 %s -triple powerpc64-ibm-aix-xcoff -mtocdata=f,g,h,i,j,k,l,m,n,o,p -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,CHECK64 --match-full-lines
+// RUN: %clang_cc1 %s -triple powerpc64-ibm-aix-xcoff -mtocdata -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,CHECK64 --match-full-lines
 
 extern int f;
 long long g = 5;
diff --git a/clang/test/CodeGen/PowerPC/toc-data-attribute.cpp b/clang/test/CodeGen/PowerPC/toc-data-attribute.cpp
index 8183e3b727e7..61eb16beefb6 100644
--- a/clang/test/CodeGen/PowerPC/toc-data-attribute.cpp
+++ b/clang/test/CodeGen/PowerPC/toc-data-attribute.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -S -mtocdata -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,ALLTOC
-// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -S -mtocdata=n,_ZN11MyNamespace10myVariableE,_ZL1s,_ZZ4testvE7counter -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,TOCLIST
-// RUN: %clang_cc1 %s -triple powerpc64-ibm-aix-xcoff -S -mtocdata -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,ALLTOC
-// RUN: %clang_cc1 %s -triple powerpc64-ibm-aix-xcoff -S -mtocdata=n,_ZN11MyNamespace10myVariableE,_ZL1s,_ZZ4testvE7counter -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,TOCLIST
+// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -mtocdata -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,ALLTOC
+// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -mtocdata=n,_ZN11MyNamespace10myVariableE,_ZL1s,_ZZ4testvE7counter -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,TOCLIST
+// RUN: %clang_cc1 %s -triple powerpc64-ibm-aix-xcoff -mtocdata -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,ALLTOC
+// RUN: %clang_cc1 %s -triple powerpc64-ibm-aix-xcoff -mtocdata=n,_ZN11MyNamespace10myVariableE,_ZL1s,_ZZ4testvE7counter -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,TOCLIST
 
 extern int n;
 static int s = 100;
diff --git a/clang/test/CodeGen/PowerPC/toc-data-diagnostics.c b/clang/test/CodeGen/PowerPC/toc-data-diagnostics.c
index ba8955530e46..169c35513699 100644
--- a/clang/test/CodeGen/PowerPC/toc-data-diagnostics.c
+++ b/clang/test/CodeGen/PowerPC/toc-data-diagnostics.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -triple=powerpc-ibm-aix-xcoff -S -mtocdata=h,g,f,e,d,c,b,a,globalOneWithAlias,globalTwoWithAlias,ll,t3 -verify -emit-llvm -o - | FileCheck %s -check-prefix=CHECK --match-full-lines
-// RUN: %clang_cc1 %s -triple=powerpc-ibm-aix-xcoff -S -mtocdata -verify=none -emit-llvm -o - | FileCheck %s -check-prefix=CHECK --match-full-lines
+// RUN: %clang_cc1 %s -triple=powerpc-ibm-aix-xcoff -mtocdata=h,g,f,e,d,c,b,a,globalOneWithAlias,globalTwoWithAlias,ll,t3 -verify -emit-llvm -o - | FileCheck %s -check-prefix=CHECK --match-full-lines
+// RUN: %clang_cc1 %s -triple=powerpc-ibm-aix-xcoff -mtocdata -verify=none -emit-llvm -o - | FileCheck %s -check-prefix=CHECK --match-full-lines
 
 // none-no-diagnostics
 
diff --git a/clang/test/CodeGen/PowerPC/toc-data-structs-arrays.cpp b/clang/test/CodeGen/PowerPC/toc-data-structs-arrays.cpp
index a717995cdceb..2d988fcbc0f8 100644
--- a/clang/test/CodeGen/PowerPC/toc-data-structs-arrays.cpp
+++ b/clang/test/CodeGen/PowerPC/toc-data-structs-arrays.cpp
@@ -1,11 +1,11 @@
-// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -S -mtocdata=a4,a5,a8,a9,b,c,d,e,v -emit-llvm -o - 2>&1 \
+// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -mtocdata=a4,a5,a8,a9,b,c,d,e,v -emit-llvm -o - 2>&1 \
 // RUN:          | FileCheck %s -check-prefixes=CHECK32 --match-full-lines
-// RUN: %clang_cc1  %s -triple powerpc-ibm-aix-xcoff -S -mtocdata -emit-llvm -o - 2>&1 \
+// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -mtocdata -emit-llvm -o - 2>&1 \
 // RUN:          | FileCheck %s -check-prefixes=CHECK32 --match-full-lines
 
-// RUN: %clang_cc1  %s -triple powerpc64-ibm-aix-xcoff -S -mtocdata=a4,a5,a8,a9,b,c,d,e,v -emit-llvm -o - 2>&1 \
+// RUN: %clang_cc1 %s -triple powerpc64-ibm-aix-xcoff -mtocdata=a4,a5,a8,a9,b,c,d,e,v -emit-llvm -o - 2>&1 \
 // RUN:          | FileCheck %s -check-prefixes=CHECK64 --match-full-lines
-// RUN: %clang_cc1  %s -triple powerpc64-ibm-aix-xcoff -S -mtocdata -emit-llvm -o - 2>&1 \
+// RUN: %clang_cc1 %s -triple powerpc64-ibm-aix-xcoff -mtocdata -emit-llvm -o - 2>&1 \
 // RUN:          | FileCheck %s -check-prefixes=CHECK64 --match-full-lines
 
 struct size4_struct {
diff --git a/clang/test/CodeGen/PowerPC/vector-bool-pixel-altivec-init-no-parentheses.c b/clang/test/CodeGen/PowerPC/vector-bool-pixel-altivec-init-no-parentheses.c
index 071f66a8f0ab..1c6dea34f5c3 100644
--- a/clang/test/CodeGen/PowerPC/vector-bool-pixel-altivec-init-no-parentheses.c
+++ b/clang/test/CodeGen/PowerPC/vector-bool-pixel-altivec-init-no-parentheses.c
@@ -1,20 +1,20 @@
 // RUN: not %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=mixed -triple powerpc-unknown-unknown -S \
+// RUN:   -faltivec-src-compat=mixed -triple powerpc-unknown-unknown \
 // RUN:   -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=MIXED-ERR
 // RUN: not %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=mixed -triple powerpc64le-unknown-unknown -S \
+// RUN:   -faltivec-src-compat=mixed -triple powerpc64le-unknown-unknown \
 // RUN:   -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=MIXED-ERR
 // RUN: not %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=gcc -triple powerpc-unknown-unknown -S \
+// RUN:   -faltivec-src-compat=gcc -triple powerpc-unknown-unknown \
 // RUN:   -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=GCC-ERR
 // RUN: not %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=gcc -triple powerpc64le-unknown-unknown -S \
+// RUN:   -faltivec-src-compat=gcc -triple powerpc64le-unknown-unknown \
 // RUN:   -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=GCC-ERR
 // RUN: %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=xl -triple powerpc-unknown-unknown -S \
+// RUN:   -faltivec-src-compat=xl -triple powerpc-unknown-unknown \
 // RUN:   -emit-llvm %s -o - | FileCheck %s --check-prefix=XL
 // RUN: %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=xl -triple powerpc64le-unknown-unknown -S \
+// RUN:   -faltivec-src-compat=xl -triple powerpc64le-unknown-unknown \
 // RUN:   -emit-llvm %s -o - | FileCheck %s --check-prefix=XL
 // RUN: not %clang -mcpu=pwr8 -faltivec-src-compat=mixed --target=powerpc-unknown-unknown \
 // RUN:   -S -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=MIXED-ERR
diff --git a/clang/test/CodeGen/PowerPC/vector-bool-pixel-altivec-init.c b/clang/test/CodeGen/PowerPC/vector-bool-pixel-altivec-init.c
index 71860c643604..5057a4d593da 100644
--- a/clang/test/CodeGen/PowerPC/vector-bool-pixel-altivec-init.c
+++ b/clang/test/CodeGen/PowerPC/vector-bool-pixel-altivec-init.c
@@ -1,20 +1,20 @@
 // RUN: %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=mixed -triple powerpc-unknown-unknown -S \
+// RUN:   -faltivec-src-compat=mixed -triple powerpc-unknown-unknown \
 // RUN:   -emit-llvm %s -o - | FileCheck %s --check-prefix=MIXED
 // RUN: %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=mixed -triple powerpc64le-unknown-unknown -S \
+// RUN:   -faltivec-src-compat=mixed -triple powerpc64le-unknown-unknown \
 // RUN:   -emit-llvm %s -o - | FileCheck %s --check-prefix=MIXED
 // RUN: %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=xl -triple powerpc-unknown-unknown -S \
+// RUN:   -faltivec-src-compat=xl -triple powerpc-unknown-unknown \
 // RUN:   -emit-llvm %s -o - | FileCheck %s --check-prefix=XL
 // RUN: %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=xl -triple powerpc64le-unknown-unknown -S \
+// RUN:   -faltivec-src-compat=xl -triple powerpc64le-unknown-unknown \
 // RUN:   -emit-llvm %s -o - | FileCheck %s --check-prefix=XL
 // RUN: not %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=gcc -triple powerpc-unknown-unknown -S \
+// RUN:   -faltivec-src-compat=gcc -triple powerpc-unknown-unknown \
 // RUN:   -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=GCC
 // RUN: not %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=gcc -triple powerpc64le-unknown-unknown -S \
+// RUN:   -faltivec-src-compat=gcc -triple powerpc64le-unknown-unknown \
 // RUN:   -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=GCC
 // RUN: %clang -mcpu=pwr8 -faltivec-src-compat=mixed --target=powerpc-unknown-unknown \
 // RUN:   -S -emit-llvm %s -o - | FileCheck %s --check-prefix=MIXED
diff --git a/clang/test/CodeGen/PowerPC/vector-compat-pixel-bool-ternary.c b/clang/test/CodeGen/PowerPC/vector-compat-pixel-bool-ternary.c
index d04875d58380..da4e3040135a 100644
--- a/clang/test/CodeGen/PowerPC/vector-compat-pixel-bool-ternary.c
+++ b/clang/test/CodeGen/PowerPC/vector-compat-pixel-bool-ternary.c
@@ -1,9 +1,9 @@
 // RUN: not %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=mixed -triple powerpc-unknown-unknown -S -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
+// RUN:   -faltivec-src-compat=mixed -triple powerpc-unknown-unknown -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
 // RUN: not %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=gcc -triple powerpc-unknown-unknown -S -emit-llvm %s -o - 2>&1| FileCheck %s --check-prefix=ERROR
+// RUN:   -faltivec-src-compat=gcc -triple powerpc-unknown-unknown -emit-llvm %s -o - 2>&1| FileCheck %s --check-prefix=ERROR
 // RUN: %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=xl -triple powerpc-unknown-unknown -S -emit-llvm %s -o - | FileCheck %s
+// RUN:   -faltivec-src-compat=xl -triple powerpc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang -mcpu=pwr8 -faltivec-src-compat=xl --target=powerpc-unknown-unknown -S -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang -mcpu=pwr9 -faltivec-src-compat=xl --target=powerpc-unknown-unknown -S -emit-llvm %s -o - | FileCheck %s
 
diff --git a/clang/test/CodeGen/PowerPC/vector-compat-pixel-bool.c b/clang/test/CodeGen/PowerPC/vector-compat-pixel-bool.c
index 58b355a5f296..ee22b3bb29d4 100644
--- a/clang/test/CodeGen/PowerPC/vector-compat-pixel-bool.c
+++ b/clang/test/CodeGen/PowerPC/vector-compat-pixel-bool.c
@@ -1,9 +1,9 @@
 // RUN: %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=mixed -triple powerpc-unknown-unknown -S -emit-llvm %s -o - | FileCheck %s
+// RUN:   -faltivec-src-compat=mixed -triple powerpc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=gcc -triple powerpc-unknown-unknown -S -emit-llvm %s -o - | FileCheck %s
+// RUN:   -faltivec-src-compat=gcc -triple powerpc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
 // RUN: not %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=xl -triple powerpc-unknown-unknown -S -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
+// RUN:   -faltivec-src-compat=xl -triple powerpc-unknown-unknown -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
 // RUN: %clang -mcpu=pwr8 -faltivec-src-compat=gcc --target=powerpc-unknown-unknown -S -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang -mcpu=pwr9 -faltivec-src-compat=gcc --target=powerpc-unknown-unknown -S -emit-llvm %s -o - | FileCheck %s
 
diff --git a/clang/test/CodeGen/PowerPC/vector-compat-ternary.c b/clang/test/CodeGen/PowerPC/vector-compat-ternary.c
index c5bf227f80e3..77c82118fc4f 100644
--- a/clang/test/CodeGen/PowerPC/vector-compat-ternary.c
+++ b/clang/test/CodeGen/PowerPC/vector-compat-ternary.c
@@ -1,9 +1,9 @@
 // RUN: %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=mixed -triple powerpc-unknown-unknown -S -emit-llvm %s -o - | FileCheck %s
+// RUN:   -faltivec-src-compat=mixed -triple powerpc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
 // RUN: not %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=gcc -triple powerpc-unknown-unknown -S -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
+// RUN:   -faltivec-src-compat=gcc -triple powerpc-unknown-unknown -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
 // RUN: %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=xl -triple powerpc-unknown-unknown -S -emit-llvm %s -o - | FileCheck %s
+// RUN:   -faltivec-src-compat=xl -triple powerpc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang -mcpu=pwr8 -faltivec-src-compat=xl --target=powerpc-unknown-unknown -S -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang -mcpu=pwr9 -faltivec-src-compat=xl --target=powerpc-unknown-unknown -S -emit-llvm %s -o - | FileCheck %s
 
diff --git a/clang/test/CodeGen/PowerPC/vector-compat.c b/clang/test/CodeGen/PowerPC/vector-compat.c
index 4cf607dd84fd..334d7e24fbd3 100644
--- a/clang/test/CodeGen/PowerPC/vector-compat.c
+++ b/clang/test/CodeGen/PowerPC/vector-compat.c
@@ -1,9 +1,9 @@
 // RUN: not %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=mixed -triple powerpc-unknown-unknown -S -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
+// RUN:   -faltivec-src-compat=mixed -triple powerpc-unknown-unknown -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
 // RUN: %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=gcc -triple powerpc-unknown-unknown -S -emit-llvm %s -o - | FileCheck %s
+// RUN:   -faltivec-src-compat=gcc -triple powerpc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
 // RUN: not %clang_cc1 -target-feature +altivec -target-feature +vsx \
-// RUN:   -faltivec-src-compat=xl -triple powerpc-unknown-unknown -S -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
+// RUN:   -faltivec-src-compat=xl -triple powerpc-unknown-unknown -emit-llvm %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
 // RUN: %clang -mcpu=pwr8 -faltivec-src-compat=gcc --target=powerpc-unknown-unknown -S -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang -mcpu=pwr9 -faltivec-src-compat=gcc --target=powerpc-unknown-unknown -S -emit-llvm %s -o - | FileCheck %s
 
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast.c
index 20fb4a04564c..edb3386664c8 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=1 -mvscale-max=1 -S -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-64
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=2 -mvscale-max=2 -S -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-128
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -S -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-256
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=1 -mvscale-max=1 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-64
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=2 -mvscale-max=2 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-128
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-256
 
 // REQUIRES: riscv-registered-target
 
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-call.c
index 1824d97d04dd..e0efb9dc4fb0 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-call.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-call.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -S -O1 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -O1 -emit-llvm -o - %s | FileCheck %s
 
 // REQUIRES: riscv-registered-target
 
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c
index 3806c3e1b30b..6fd393098182 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-cast.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -S -O1 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -O1 -emit-llvm -o - %s | FileCheck %s
 
 // REQUIRES: riscv-registered-target
 
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-codegen.c
index eb769fadda9a..a6b3f3732f28 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-codegen.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-codegen.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -S -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
 
 // REQUIRES: riscv-registered-target
 
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c
index 31a245dcb224..973a25ee9665 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=1 -mvscale-max=1 -S -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-64
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -S -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-256
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=1 -mvscale-max=1 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-64
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-256
 
 // REQUIRES: riscv-registered-target
 
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-types.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-types.c
index 027f7ab24aa1..cae42ec76c79 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-types.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-types.c
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=1 -mvscale-max=1 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-64
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=2 -mvscale-max=2 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-128
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-256
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=8 -mvscale-max=8 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-512
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=16 -mvscale-max=16 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-1024
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=1 -mvscale-max=1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-64
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=2 -mvscale-max=2 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-128
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-256
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=8 -mvscale-max=8 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-512
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=16 -mvscale-max=16 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-1024
 
 // REQUIRES: riscv-registered-target
 
diff --git a/clang/test/CodeGen/riscv32-ilp32d-abi.cpp b/clang/test/CodeGen/RISCV/riscv32-ilp32d-abi.cpp
index b98f5f5bc87b..b98f5f5bc87b 100644
--- a/clang/test/CodeGen/riscv32-ilp32d-abi.cpp
+++ b/clang/test/CodeGen/RISCV/riscv32-ilp32d-abi.cpp
diff --git a/clang/test/CodeGen/riscv-rvv-vls-arith-ops.c b/clang/test/CodeGen/RISCV/rvv-vls-arith-ops.c
index 76fcf38a0d98..76fcf38a0d98 100644
--- a/clang/test/CodeGen/riscv-rvv-vls-arith-ops.c
+++ b/clang/test/CodeGen/RISCV/rvv-vls-arith-ops.c
diff --git a/clang/test/CodeGen/riscv-rvv-vls-bitwise-ops.c b/clang/test/CodeGen/RISCV/rvv-vls-bitwise-ops.c
index bb97707a7a9a..bb97707a7a9a 100644
--- a/clang/test/CodeGen/riscv-rvv-vls-bitwise-ops.c
+++ b/clang/test/CodeGen/RISCV/rvv-vls-bitwise-ops.c
diff --git a/clang/test/CodeGen/riscv-rvv-vls-compare-ops.c b/clang/test/CodeGen/RISCV/rvv-vls-compare-ops.c
index f22b4fbb5251..f22b4fbb5251 100644
--- a/clang/test/CodeGen/riscv-rvv-vls-compare-ops.c
+++ b/clang/test/CodeGen/RISCV/rvv-vls-compare-ops.c
diff --git a/clang/test/CodeGen/riscv-rvv-vls-shift-ops.c b/clang/test/CodeGen/RISCV/rvv-vls-shift-ops.c
index d88a5ba239e8..d88a5ba239e8 100644
--- a/clang/test/CodeGen/riscv-rvv-vls-shift-ops.c
+++ b/clang/test/CodeGen/RISCV/rvv-vls-shift-ops.c
diff --git a/clang/test/CodeGen/riscv-rvv-vls-subscript-ops.c b/clang/test/CodeGen/RISCV/rvv-vls-subscript-ops.c
index aed783ba7148..aed783ba7148 100644
--- a/clang/test/CodeGen/riscv-rvv-vls-subscript-ops.c
+++ b/clang/test/CodeGen/RISCV/rvv-vls-subscript-ops.c
diff --git a/clang/test/CodeGen/riscv-vector-bits-vscale-range.c b/clang/test/CodeGen/RISCV/vector-bits-vscale-range.c
index 8b2c1d9ce66b..4dc0b7108539 100644
--- a/clang/test/CodeGen/riscv-vector-bits-vscale-range.c
+++ b/clang/test/CodeGen/RISCV/vector-bits-vscale-range.c
@@ -1,20 +1,20 @@
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +zve64x -mvscale-min=1 -mvscale-max=1 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=1
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=2 -mvscale-max=2 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=2
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=4 -mvscale-max=4 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=4
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=8 -mvscale-max=8 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=8
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=16 -mvscale-max=16 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=16
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +zve64x -mvscale-min=1 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=1 --check-prefix=CHECK-NOMAX
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=2 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=2 --check-prefix=CHECK-NOMAX
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=4 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=4 --check-prefix=CHECK-NOMAX
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=8 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=8 --check-prefix=CHECK-NOMAX
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=16 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=16 --check-prefix=CHECK-NOMAX
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +zve64x -mvscale-min=1 -mvscale-max=0 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-UNBOUNDED
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-V
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -target-feature +zvl512b -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ZVL
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +zve64x -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ZVE64
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +zve64f -target-feature +f -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ZVE64
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +zve64d -target-feature +f -target-feature +d -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ZVE64
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +zve32x -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ZVE32
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +zve64x -mvscale-min=1 -mvscale-max=1 -emit-llvm -o - %s | FileCheck %s -D#VBITS=1
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=2 -mvscale-max=2 -emit-llvm -o - %s | FileCheck %s -D#VBITS=2
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=4 -mvscale-max=4 -emit-llvm -o - %s | FileCheck %s -D#VBITS=4
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=8 -mvscale-max=8 -emit-llvm -o - %s | FileCheck %s -D#VBITS=8
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=16 -mvscale-max=16 -emit-llvm -o - %s | FileCheck %s -D#VBITS=16
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +zve64x -mvscale-min=1 -emit-llvm -o - %s | FileCheck %s -D#VBITS=1 --check-prefix=CHECK-NOMAX
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=2 -emit-llvm -o - %s | FileCheck %s -D#VBITS=2 --check-prefix=CHECK-NOMAX
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=4 -emit-llvm -o - %s | FileCheck %s -D#VBITS=4 --check-prefix=CHECK-NOMAX
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=8 -emit-llvm -o - %s | FileCheck %s -D#VBITS=8 --check-prefix=CHECK-NOMAX
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -mvscale-min=16 -emit-llvm -o - %s | FileCheck %s -D#VBITS=16 --check-prefix=CHECK-NOMAX
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +zve64x -mvscale-min=1 -mvscale-max=0 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-UNBOUNDED
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-V
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +v -target-feature +zvl512b -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ZVL
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +zve64x -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ZVE64
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +zve64f -target-feature +f -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ZVE64
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +zve64d -target-feature +f -target-feature +d -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ZVE64
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +zve32x -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ZVE32
 
 // CHECK-LABEL: @func() #0
 // CHECK: attributes #0 = { {{.*}} vscale_range([[#VBITS]],[[#VBITS]]) {{.*}} }
diff --git a/clang/test/CodeGen/SystemZ/align-systemz-02.c b/clang/test/CodeGen/SystemZ/align-systemz-02.c
index 013faea61ada..4b2d32649226 100644
--- a/clang/test/CodeGen/SystemZ/align-systemz-02.c
+++ b/clang/test/CodeGen/SystemZ/align-systemz-02.c
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -triple s390x-linux-gnu %s -o - -target-feature +vector -emit-llvm \
 // RUN:    | FileCheck %s -check-prefix=VECIR
-// RUN: %clang_cc1 -triple s390x-linux-gnu %s -o - -target-feature +vector -emit-obj -S \
+// RUN: %clang_cc1 -triple s390x-linux-gnu %s -o - -target-feature +vector -S \
 // RUN:    | FileCheck %s -check-prefix=VECASM
 // RUN: %clang_cc1 -triple s390x-linux-gnu %s -o - -target-feature -vector -emit-llvm \
 // RUN:    | FileCheck %s -check-prefix=SCALIR
-// RUN: %clang_cc1 -triple s390x-linux-gnu %s -o - -target-feature -vector -emit-obj -S \
+// RUN: %clang_cc1 -triple s390x-linux-gnu %s -o - -target-feature -vector -S \
 // RUN:    | FileCheck %s -check-prefix=SCALASM
 // REQUIRES: systemz-registered-target
 
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-error2.c b/clang/test/CodeGen/SystemZ/builtins-systemz-error2.c
index 312a9a156d21..4b18357bb33c 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-error2.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-error2.c
@@ -1,5 +1,5 @@
 // REQUIRES: systemz-registered-target
-// RUN: %clang_cc1 -triple s390x-ibm-linux -S -emit-llvm %s -verify -o -
+// RUN: %clang_cc1 -triple s390x-ibm-linux -emit-llvm-only %s -verify
 
 __int128 f0(__int128 a, __int128 b) {
   __builtin_tbegin ((void *)0);    // expected-error {{'__builtin_tbegin' needs target feature transactional-execution}}
diff --git a/clang/test/CodeGen/VE/ve-velintrin.c b/clang/test/CodeGen/VE/ve-velintrin.c
index 9f68b79e5476..9a235b61045e 100644
--- a/clang/test/CodeGen/VE/ve-velintrin.c
+++ b/clang/test/CodeGen/VE/ve-velintrin.c
@@ -1,6 +1,6 @@
 // REQUIRES: ve-registered-target
 
-// RUN: %clang_cc1 -S -emit-llvm -triple ve-unknown-linux-gnu \
+// RUN: %clang_cc1 -emit-llvm -triple ve-unknown-linux-gnu \
 // RUN:   -ffreestanding %s -o - | FileCheck %s
 
 #include <velintrin.h>
diff --git a/clang/test/CodeGen/X86/amx_errors.c b/clang/test/CodeGen/X86/amx_errors.c
index 52f54617a23a..7e326b6f39f2 100644
--- a/clang/test/CodeGen/X86/amx_errors.c
+++ b/clang/test/CodeGen/X86/amx_errors.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile   \
-// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-fp16 -emit-llvm -fsyntax-only -verify
+// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-fp16 -fsyntax-only -verify
 
 #include <immintrin.h>
 
diff --git a/clang/test/CodeGen/X86/amxcomplex-errors.c b/clang/test/CodeGen/X86/amxcomplex-errors.c
index 3dd5ea5b01fb..d8f3a6aa66d9 100644
--- a/clang/test/CodeGen/X86/amxcomplex-errors.c
+++ b/clang/test/CodeGen/X86/amxcomplex-errors.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
-// RUN: -target-feature +amx-complex -emit-llvm -fsyntax-only -verify
+// RUN: -target-feature +amx-complex -fsyntax-only -verify
 
 #include <immintrin.h>
 #include <stddef.h>
diff --git a/clang/test/CodeGen/X86/bfloat16.cpp b/clang/test/CodeGen/X86/bfloat16.cpp
index 6726e42db133..d202de22716f 100644
--- a/clang/test/CodeGen/X86/bfloat16.cpp
+++ b/clang/test/CodeGen/X86/bfloat16.cpp
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-feature +fullbf16 -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-NBF16 %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-feature +fullbf16 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-NBF16 %s
 
 // CHECK-LABEL: define dso_local void @_Z11test_scalarDF16bDF16b
 // CHECK-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
diff --git a/clang/test/CodeGen/X86/cmp-avx-builtins-error.c b/clang/test/CodeGen/X86/cmp-avx-builtins-error.c
index bad2606021b6..82b041c73fc9 100644
--- a/clang/test/CodeGen/X86/cmp-avx-builtins-error.c
+++ b/clang/test/CodeGen/X86/cmp-avx-builtins-error.c
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
-// RUN: -target-feature +avx -emit-llvm -fsyntax-only -verify
+// RUN: -target-feature +avx -fsyntax-only -verify
 // RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown \
-// RUN: -target-feature +avx -emit-llvm -fsyntax-only -verify
+// RUN: -target-feature +avx -fsyntax-only -verify
 
 #include <immintrin.h>
 
diff --git a/clang/test/CodeGen/X86/cmpccxadd-builtins-error-32.c b/clang/test/CodeGen/X86/cmpccxadd-builtins-error-32.c
index 512365b689b7..aa6a0842aa65 100644
--- a/clang/test/CodeGen/X86/cmpccxadd-builtins-error-32.c
+++ b/clang/test/CodeGen/X86/cmpccxadd-builtins-error-32.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown \
-// RUN: -target-feature +cmpccxadd  -emit-llvm -fsyntax-only -verify
+// RUN: -target-feature +cmpccxadd  -fsyntax-only -verify
 
 #include <immintrin.h>
 
diff --git a/clang/test/CodeGen/X86/cmpccxadd-builtins-error.c b/clang/test/CodeGen/X86/cmpccxadd-builtins-error.c
index 30515b48e858..8d9ca671f30f 100644
--- a/clang/test/CodeGen/X86/cmpccxadd-builtins-error.c
+++ b/clang/test/CodeGen/X86/cmpccxadd-builtins-error.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
-// RUN: -target-feature +cmpccxadd  -emit-llvm -fsyntax-only -verify
+// RUN: -target-feature +cmpccxadd  -fsyntax-only -verify
 
 #include <immintrin.h>
 
diff --git a/clang/test/CodeGen/X86/inline-asm-gcc-regs.c b/clang/test/CodeGen/X86/inline-asm-gcc-regs.c
new file mode 100644
index 000000000000..17adbdc20a40
--- /dev/null
+++ b/clang/test/CodeGen/X86/inline-asm-gcc-regs.c
@@ -0,0 +1,121 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -O2 %s -o - | FileCheck %s
+
+// CHECK-LABEL: @test_r15
+// CHECK: call void asm sideeffect "", "{r15},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r15() {
+    register int a asm ("r15");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r16
+// CHECK: call void asm sideeffect "", "{r16},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r16() {
+    register int a asm ("r16");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r17
+// CHECK: call void asm sideeffect "", "{r17},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r17() {
+    register int a asm ("r17");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r18
+// CHECK: call void asm sideeffect "", "{r18},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r18() {
+    register int a asm ("r18");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r19
+// CHECK: call void asm sideeffect "", "{r19},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r19() {
+    register int a asm ("r19");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r20
+// CHECK: call void asm sideeffect "", "{r20},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r20() {
+    register int a asm ("r20");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r21
+// CHECK: call void asm sideeffect "", "{r21},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r21() {
+    register int a asm ("r21");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r22
+// CHECK: call void asm sideeffect "", "{r22},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r22() {
+    register int a asm ("r22");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r23
+// CHECK: call void asm sideeffect "", "{r23},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r23() {
+    register int a asm ("r23");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r24
+// CHECK: call void asm sideeffect "", "{r24},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r24() {
+    register int a asm ("r24");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r25
+// CHECK: call void asm sideeffect "", "{r25},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r25() {
+    register int a asm ("r25");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r26
+// CHECK: call void asm sideeffect "", "{r26},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r26() {
+    register int a asm ("r26");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r27
+// CHECK: call void asm sideeffect "", "{r27},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r27() {
+    register int a asm ("r27");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r28
+// CHECK: call void asm sideeffect "", "{r28},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r28() {
+    register int a asm ("r28");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r29
+// CHECK: call void asm sideeffect "", "{r29},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r29() {
+    register int a asm ("r29");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r30
+// CHECK: call void asm sideeffect "", "{r30},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r30() {
+    register int a asm ("r30");
+    asm ("" :: "r" (a));
+}
+
+// CHECK-LABEL: @test_r31
+// CHECK: call void asm sideeffect "", "{r31},~{dirflag},~{fpsr},~{flags}"(i32 undef)
+void test_r31() {
+    register int a asm ("r31");
+    asm ("" :: "r" (a));
+}
+
diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c
index 554c60421995..093239b44826 100644
--- a/clang/test/CodeGen/X86/math-builtins.c
+++ b/clang/test/CodeGen/X86/math-builtins.c
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm              %s | FileCheck %s -check-prefix=NO__ERRNO
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO
-//  RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm -disable-llvm-passes -O2              %s | FileCheck %s -check-prefix=NO__ERRNO
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm -disable-llvm-passes -O2 -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown-gnu -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_GNU
-// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_WIN
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm %s | FileCheck %s -check-prefix=NO__ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm -disable-llvm-passes -O2 %s | FileCheck %s -check-prefix=NO__ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm -disable-llvm-passes -O2 -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown-gnu -w -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_GNU
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -w -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_WIN
 
 // Test attributes and codegen of math builtins.
 
diff --git a/clang/test/CodeGen/X86/ms-inline-asm-prefix.c b/clang/test/CodeGen/X86/ms-inline-asm-prefix.c
index bb89599877ec..e7e95b41f2af 100644
--- a/clang/test/CodeGen/X86/ms-inline-asm-prefix.c
+++ b/clang/test/CodeGen/X86/ms-inline-asm-prefix.c
@@ -1,6 +1,6 @@
 // REQUIRES: x86-registered-target
-// RUN:%clang_cc1 %s -ferror-limit 0 -triple=x86_64-pc-windows-msvc -target-feature +avx512f -target-feature +avx2 -target-feature +avx512vl -fasm-blocks -mllvm -x86-asm-syntax=intel -S -emit-llvm -o -  | FileCheck %s -check-prefix=INTEL
-// RUN:%clang_cc1 %s -ferror-limit 0 -triple=x86_64-pc-windows-msvc -target-feature +avx512f -target-feature +avx2 -target-feature +avx512vl -fasm-blocks -mllvm -x86-asm-syntax=att -S -emit-llvm -o -  | FileCheck %s -check-prefix=ATT
+// RUN:%clang_cc1 %s -ferror-limit 0 -triple=x86_64-pc-windows-msvc -target-feature +avx512f -target-feature +avx2 -target-feature +avx512vl -fasm-blocks -mllvm -x86-asm-syntax=intel -emit-llvm -o -  | FileCheck %s -check-prefix=INTEL
+// RUN:%clang_cc1 %s -ferror-limit 0 -triple=x86_64-pc-windows-msvc -target-feature +avx512f -target-feature +avx2 -target-feature +avx512vl -fasm-blocks -mllvm -x86-asm-syntax=att -emit-llvm -o -  | FileCheck %s -check-prefix=ATT
 
 void check_inline_prefix(void) {
   __asm {
diff --git a/clang/test/CodeGen/X86/sm3-error.c b/clang/test/CodeGen/X86/sm3-error.c
index 230ebe7036be..9a38b4c8b8e9 100644
--- a/clang/test/CodeGen/X86/sm3-error.c
+++ b/clang/test/CodeGen/X86/sm3-error.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -ffreestanding -triple=i686-unknown-unknown -target-feature +sm3  -emit-llvm -fsyntax-only -verify
+// RUN: %clang_cc1 %s -ffreestanding -triple=i686-unknown-unknown -target-feature +sm3  -fsyntax-only -verify
 
 #include <immintrin.h>
 
diff --git a/clang/test/CodeGen/X86/usermsr-builtins-error-32.c b/clang/test/CodeGen/X86/usermsr-builtins-error-32.c
index 180b99a4212a..8bfc00a1ea84 100644
--- a/clang/test/CodeGen/X86/usermsr-builtins-error-32.c
+++ b/clang/test/CodeGen/X86/usermsr-builtins-error-32.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +usermsr \
-// RUN: -emit-llvm -fsyntax-only -verify
+// RUN: -fsyntax-only -verify
 
 #include <x86gprintrin.h>
 
diff --git a/clang/test/CodeGen/X86/x86-atomic-float.c b/clang/test/CodeGen/X86/x86-atomic-float.c
new file mode 100644
index 000000000000..2d3c72d2a029
--- /dev/null
+++ b/clang/test/CodeGen/X86/x86-atomic-float.c
@@ -0,0 +1,69 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK,CHECK64 %s
+// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK,CHECK32 %s
+
+
+// CHECK-LABEL: define dso_local i32 @test_int_inc(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw add ptr @test_int_inc.n, i32 1 seq_cst, align 4
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int test_int_inc()
+{
+    static _Atomic int n;
+    return n++;
+}
+
+// CHECK-LABEL: define dso_local float @test_float_post_inc(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr @test_float_post_inc.n, float 1.000000e+00 seq_cst, align 4
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float test_float_post_inc()
+{
+    static _Atomic float n;
+    return n++;
+}
+
+// CHECK-LABEL: define dso_local float @test_float_post_dc(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr @test_float_post_dc.n, float 1.000000e+00 seq_cst, align 4
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float test_float_post_dc()
+{
+    static _Atomic float n;
+    return n--;
+}
+
+// CHECK-LABEL: define dso_local float @test_float_pre_dc(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr @test_float_pre_dc.n, float 1.000000e+00 seq_cst, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// CHECK-NEXT:    ret float [[TMP1]]
+//
+float test_float_pre_dc()
+{
+    static _Atomic float n;
+    return --n;
+}
+
+// CHECK-LABEL: define dso_local float @test_float_pre_inc(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr @test_float_pre_inc.n, float 1.000000e+00 seq_cst, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// CHECK-NEXT:    ret float [[TMP1]]
+//
+float test_float_pre_inc()
+{
+    static _Atomic float n;
+    return ++n;
+}
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// CHECK32: {{.*}}
+// CHECK64: {{.*}}
diff --git a/clang/test/CodeGen/X86/x86-atomic-long_double.c b/clang/test/CodeGen/X86/x86-atomic-long_double.c
index ca3fb6730fb6..74a22d5db151 100644
--- a/clang/test/CodeGen/X86/x86-atomic-long_double.c
+++ b/clang/test/CodeGen/X86/x86-atomic-long_double.c
@@ -1,351 +1,342 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -S -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -S -emit-llvm -o - | FileCheck -check-prefix=CHECK32 %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefix=CHECK32 %s
 
+// CHECK-LABEL: define dso_local x86_fp80 @testinc(
+// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
+// CHECK-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
+// CHECK-NEXT:    ret x86_fp80 [[TMP3]]
+//
+// CHECK32-LABEL: define dso_local x86_fp80 @testinc(
+// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
+// CHECK32-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
+// CHECK32-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 4
+// CHECK32-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
+// CHECK32-NEXT:    ret x86_fp80 [[TMP3]]
+//
 long double testinc(_Atomic long double *addr) {
-  // CHECK-LABEL: @testinc
-  // CHECK: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 8
-  // CHECK: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 8
-  // CHECK: [[INT_VALUE:%.+]] = load atomic i128, ptr [[ADDR]] seq_cst, align 16
-  // CHECK: store i128 [[INT_VALUE]], ptr [[LD_ADDR:%.+]], align 16
-  // CHECK: [[LD_VALUE:%.+]] = load x86_fp80, ptr [[LD_ADDR]], align 16
-  // CHECK: br label %[[ATOMIC_OP:.+]]
-  // CHECK: [[ATOMIC_OP]]
-  // CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
-  // CHECK: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
-  // CHECK: call void @llvm.memset.p0.i64(ptr align 16 [[OLD_VALUE_ADDR:%.+]], i8 0, i64 16, i1 false)
-  // CHECK: store x86_fp80 [[OLD_VALUE]], ptr [[OLD_VALUE_ADDR]], align 16
-  // CHECK: [[OLD_INT:%.+]] = load i128, ptr [[OLD_VALUE_ADDR]], align 16
-  // CHECK: call void @llvm.memset.p0.i64(ptr align 16 [[NEW_VALUE_ADDR:%.+]], i8 0, i64 16, i1 false)
-  // CHECK: store x86_fp80 [[INC_VALUE]], ptr [[NEW_VALUE_ADDR]], align 16
-  // CHECK: [[NEW_INT:%.+]] = load i128, ptr [[NEW_VALUE_ADDR]], align 16
-  // CHECK: [[RES:%.+]] = cmpxchg ptr [[ADDR]], i128 [[OLD_INT]], i128 [[NEW_INT]] seq_cst seq_cst, align 16
-  // CHECK: [[OLD_VALUE:%.+]] = extractvalue { i128, i1 } [[RES]], 0
-  // CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i128, i1 } [[RES]], 1
-  // CHECK: store i128 [[OLD_VALUE]], ptr [[OLD_VALUE_RES_PTR:%.+]], align 16
-  // CHECK: [[LD_VALUE]] = load x86_fp80, ptr [[OLD_VALUE_RES_PTR]], align 16
-  // CHECK: br i1 [[FAIL_SUCCESS]], label %[[ATOMIC_CONT:.+]], label %[[ATOMIC_OP]]
-  // CHECK: [[ATOMIC_CONT]]
-  // CHECK: ret x86_fp80 [[INC_VALUE]]
-  // CHECK32-LABEL: @testinc
-  // CHECK32: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 4
-  // CHECK32: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 4
-  // CHECK32: call void @__atomic_load(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[TEMP_LD_ADDR:%.+]], i32 noundef 5)
-  // CHECK32: [[LD_VALUE:%.+]] = load x86_fp80, ptr [[TEMP_LD_ADDR]], align 4
-  // CHECK32: br label %[[ATOMIC_OP:.+]]
-  // CHECK32: [[ATOMIC_OP]]
-  // CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
-  // CHECK32: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
-  // CHECK32: call void @llvm.memset.p0.i64(ptr align 4 [[OLD_VALUE_ADDR:%.+]], i8 0, i64 12, i1 false)
-  // CHECK32: store x86_fp80 [[OLD_VALUE]], ptr [[OLD_VALUE_ADDR]], align 4
-  // CHECK32: call void @llvm.memset.p0.i64(ptr align 4 [[DESIRED_VALUE_ADDR:%.+]], i8 0, i64 12, i1 false)
-  // CHECK32: store x86_fp80 [[INC_VALUE]], ptr [[DESIRED_VALUE_ADDR]], align 4
-  // CHECK32: [[FAIL_SUCCESS:%.+]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[OLD_VALUE_ADDR]], ptr noundef [[DESIRED_VALUE_ADDR]], i32 noundef 5, i32 noundef 5)
-  // CHECK32: [[LD_VALUE:%.+]] = load x86_fp80, ptr [[OLD_VALUE_ADDR]], align 4
-  // CHECK32: br i1 [[FAIL_SUCCESS]], label %[[ATOMIC_CONT:.+]], label %[[ATOMIC_OP]]
-  // CHECK32: [[ATOMIC_CONT]]
-  // CHECK32: ret x86_fp80 [[INC_VALUE]]
 
   return ++*addr;
 }
 
+// CHECK-LABEL: define dso_local x86_fp80 @testdec(
+// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
+// CHECK-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
+// CHECK-NEXT:    ret x86_fp80 [[TMP2]]
+//
+// CHECK32-LABEL: define dso_local x86_fp80 @testdec(
+// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
+// CHECK32-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 4
+// CHECK32-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
+// CHECK32-NEXT:    ret x86_fp80 [[TMP2]]
+//
 long double testdec(_Atomic long double *addr) {
-  // CHECK-LABEL: @testdec
-  // CHECK: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 8
-  // CHECK: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 8
-  // CHECK: [[INT_VALUE:%.+]] = load atomic i128, ptr [[ADDR]] seq_cst, align 16
-  // CHECK: store i128 [[INT_VALUE]], ptr [[LD_ADDR:%.+]], align 16
-  // CHECK: [[ORIG_LD_VALUE:%.+]] = load x86_fp80, ptr [[LD_ADDR]], align 16
-  // CHECK: br label %[[ATOMIC_OP:.+]]
-  // CHECK: [[ATOMIC_OP]]
-  // CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
-  // CHECK: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
-  // CHECK: call void @llvm.memset.p0.i64(ptr align 16 [[OLD_VALUE_ADDR:%.+]], i8 0, i64 16, i1 false)
-  // CHECK: store x86_fp80 [[OLD_VALUE]], ptr [[OLD_VALUE_ADDR]], align 16
-  // CHECK: [[OLD_INT:%.+]] = load i128, ptr [[OLD_VALUE_ADDR]], align 16
-  // CHECK: call void @llvm.memset.p0.i64(ptr align 16 [[NEW_VALUE_ADDR:%.+]], i8 0, i64 16, i1 false)
-  // CHECK: store x86_fp80 [[DEC_VALUE]], ptr [[NEW_VALUE_ADDR]], align 16
-  // CHECK: [[NEW_INT:%.+]] = load i128, ptr [[NEW_VALUE_ADDR]], align 16
-  // CHECK: [[RES:%.+]] = cmpxchg ptr [[ADDR]], i128 [[OLD_INT]], i128 [[NEW_INT]] seq_cst seq_cst, align 16
-  // CHECK: [[OLD_VALUE:%.+]] = extractvalue { i128, i1 } [[RES]], 0
-  // CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i128, i1 } [[RES]], 1
-  // CHECK: store i128 [[OLD_VALUE]], ptr [[OLD_VALUE_RES_PTR:%.+]], align 16
-  // CHECK: [[LD_VALUE]] = load x86_fp80, ptr [[OLD_VALUE_RES_PTR]], align 16
-  // CHECK: br i1 [[FAIL_SUCCESS]], label %[[ATOMIC_CONT:.+]], label %[[ATOMIC_OP]]
-  // CHECK: [[ATOMIC_CONT]]
-  // CHECK: ret x86_fp80 [[ORIG_LD_VALUE]]
-  // CHECK32-LABEL: @testdec
-  // CHECK32: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 4
-  // CHECK32: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 4
-  // CHECK32: call void @__atomic_load(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[TEMP_LD_ADDR:%.+]], i32 noundef 5)
-  // CHECK32: [[ORIG_LD_VALUE:%.+]] = load x86_fp80, ptr [[TEMP_LD_ADDR]], align 4
-  // CHECK32: br label %[[ATOMIC_OP:.+]]
-  // CHECK32: [[ATOMIC_OP]]
-  // CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
-  // CHECK32: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
-  // CHECK32: call void @llvm.memset.p0.i64(ptr align 4 [[OLD_VALUE_ADDR:%.+]], i8 0, i64 12, i1 false)
-  // CHECK32: store x86_fp80 [[OLD_VALUE]], ptr [[OLD_VALUE_ADDR]], align 4
-  // CHECK32: call void @llvm.memset.p0.i64(ptr align 4 [[DESIRED_VALUE_ADDR:%.+]], i8 0, i64 12, i1 false)
-  // CHECK32: store x86_fp80 [[DEC_VALUE]], ptr [[DESIRED_VALUE_ADDR]], align 4
-  // CHECK32: [[FAIL_SUCCESS:%.+]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[OLD_VALUE_ADDR]], ptr noundef [[DESIRED_VALUE_ADDR]], i32 noundef 5, i32 noundef 5)
-  // CHECK32: [[LD_VALUE]] = load x86_fp80, ptr [[OLD_VALUE_ADDR]], align 4
-  // CHECK32: br i1 [[FAIL_SUCCESS]], label %[[ATOMIC_CONT:.+]], label %[[ATOMIC_OP]]
-  // CHECK32: [[ATOMIC_CONT]]
-  // CHECK32: ret x86_fp80 [[ORIG_LD_VALUE]]
 
   return (*addr)--;
 }
 
+// CHECK-LABEL: define dso_local x86_fp80 @testcompassign(
+// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    [[ATOMIC_TEMP5:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16
+// CHECK-NEXT:    br label [[ATOMIC_OP:%.*]]
+// CHECK:       atomic_op:
+// CHECK-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[ATOMIC_OP]] ]
+// CHECK-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false)
+// CHECK-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false)
+// CHECK-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1
+// CHECK-NEXT:    store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16
+// CHECK-NEXT:    [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16
+// CHECK-NEXT:    br i1 [[TMP7]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]]
+// CHECK:       atomic_cont:
+// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[ATOMIC_LOAD4:%.*]] = load atomic i128, ptr [[TMP9]] seq_cst, align 16
+// CHECK-NEXT:    store i128 [[ATOMIC_LOAD4]], ptr [[ATOMIC_TEMP5]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP5]], align 16
+// CHECK-NEXT:    ret x86_fp80 [[TMP10]]
+//
+// CHECK32-LABEL: define dso_local x86_fp80 @testcompassign(
+// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK32-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
+// CHECK32-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4
+// CHECK32-NEXT:    br label [[ATOMIC_OP:%.*]]
+// CHECK32:       atomic_op:
+// CHECK32-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[ATOMIC_OP]] ]
+// CHECK32-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
+// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false)
+// CHECK32-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4
+// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false)
+// CHECK32-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 4
+// CHECK32-NEXT:    [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5)
+// CHECK32-NEXT:    [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
+// CHECK32-NEXT:    br i1 [[CALL]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]]
+// CHECK32:       atomic_cont:
+// CHECK32-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP4]], ptr noundef [[ATOMIC_TEMP3]], i32 noundef 5)
+// CHECK32-NEXT:    [[TMP5:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 4
+// CHECK32-NEXT:    ret x86_fp80 [[TMP5]]
+//
 long double testcompassign(_Atomic long double *addr) {
   *addr -= 25;
-  // CHECK-LABEL: @testcompassign
-  // CHECK: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 8
-  // CHECK: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 8
-  // CHECK: [[INT_VALUE:%.+]] = load atomic i128, ptr [[ADDR]] seq_cst, align 16
-  // CHECK: store i128 [[INT_VALUE]], ptr [[LD_ADDR:%.+]], align 16
-  // CHECK: [[LD_VALUE:%.+]] = load x86_fp80, ptr [[LD_ADDR]], align 16
-  // CHECK: br label %[[ATOMIC_OP:.+]]
-  // CHECK: [[ATOMIC_OP]]
-  // CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
-  // CHECK: [[SUB_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
-  // CHECK: call void @llvm.memset.p0.i64(ptr align 16 [[OLD_VALUE_ADDR:%.+]], i8 0, i64 16, i1 false)
-  // CHECK: store x86_fp80 [[OLD_VALUE]], ptr [[OLD_VALUE_ADDR]], align 16
-  // CHECK: [[OLD_INT:%.+]] = load i128, ptr [[OLD_VALUE_ADDR]], align 16
-  // CHECK: call void @llvm.memset.p0.i64(ptr align 16 [[NEW_VALUE_ADDR:%.+]], i8 0, i64 16, i1 false)
-  // CHECK: store x86_fp80 [[SUB_VALUE]], ptr [[NEW_VALUE_ADDR]], align 16
-  // CHECK: [[NEW_INT:%.+]] = load i128, ptr [[NEW_VALUE_ADDR]], align 16
-  // CHECK: [[RES:%.+]] = cmpxchg ptr [[ADDR]], i128 [[OLD_INT]], i128 [[NEW_INT]] seq_cst seq_cst, align 16
-  // CHECK: [[OLD_VALUE:%.+]] = extractvalue { i128, i1 } [[RES]], 0
-  // CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i128, i1 } [[RES]], 1
-  // CHECK: store i128 [[OLD_VALUE]], ptr [[OLD_VALUE_RES_PTR:%.+]], align 16
-  // CHECK: [[LD_VALUE]] = load x86_fp80, ptr [[OLD_VALUE_RES_PTR]], align 16
-  // CHECK: br i1 [[FAIL_SUCCESS]], label %[[ATOMIC_CONT:.+]], label %[[ATOMIC_OP]]
-  // CHECK: [[ATOMIC_CONT]]
-  // CHECK: [[ADDR:%.+]] = load ptr, ptr %{{.+}}, align 8
-  // CHECK: [[INT_VAL:%.+]] = load atomic i128, ptr [[ADDR]] seq_cst, align 16
-  // CHECK: store i128 [[INT_VAL]], ptr [[INT_LD_TEMP:%.+]], align 16
-  // CHECK: [[RET_VAL:%.+]] = load x86_fp80, ptr [[LD_TEMP:%.+]], align 16
-  // CHECK: ret x86_fp80 [[RET_VAL]]
-  // CHECK32-LABEL: @testcompassign
-  // CHECK32: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 4
-  // CHECK32: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 4
-  // CHECK32: call void @__atomic_load(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[TEMP_LD_ADDR:%.+]], i32 noundef 5)
-  // CHECK32: [[LD_VALUE:%.+]] = load x86_fp80, ptr [[TEMP_LD_ADDR]], align 4
-  // CHECK32: br label %[[ATOMIC_OP:.+]]
-  // CHECK32: [[ATOMIC_OP]]
-  // CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
-  // CHECK32: [[INC_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
-  // CHECK32: call void @llvm.memset.p0.i64(ptr align 4 [[OLD_VALUE_ADDR:%.+]], i8 0, i64 12, i1 false)
-  // CHECK32: store x86_fp80 [[OLD_VALUE]], ptr [[OLD_VALUE_ADDR]], align 4
-  // CHECK32: call void @llvm.memset.p0.i64(ptr align 4 [[DESIRED_VALUE_ADDR:%.+]], i8 0, i64 12, i1 false)
-  // CHECK32: store x86_fp80 [[INC_VALUE]], ptr [[DESIRED_VALUE_ADDR]], align 4
-  // CHECK32: [[FAIL_SUCCESS:%.+]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[OLD_VALUE_ADDR]], ptr noundef [[DESIRED_VALUE_ADDR]], i32 noundef 5, i32 noundef 5)
-  // CHECK32: [[LD_VALUE]] = load x86_fp80, ptr [[OLD_VALUE_ADDR]], align 4
-  // CHECK32: br i1 [[FAIL_SUCCESS]], label %[[ATOMIC_CONT:.+]], label %[[ATOMIC_OP]]
-  // CHECK32: [[ATOMIC_CONT]]
-  // CHECK32: [[ADDR:%.+]] = load ptr, ptr %{{.+}}, align 4
-  // CHECK32: call void @__atomic_load(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[GET_ADDR:%.+]], i32 noundef 5)
-  // CHECK32: [[RET_VAL:%.+]] = load x86_fp80, ptr [[GET_ADDR]], align 4
-  // CHECK32: ret x86_fp80 [[RET_VAL]]
   return *addr;
 }
 
+// CHECK-LABEL: define dso_local x86_fp80 @testassign(
+// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP]], i8 0, i64 16, i1 false)
+// CHECK-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[ATOMIC_TEMP]], align 16
+// CHECK-NEXT:    store atomic i128 [[TMP1]], ptr [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP2]] seq_cst, align 16
+// CHECK-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP1]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 16
+// CHECK-NEXT:    ret x86_fp80 [[TMP3]]
+//
+// CHECK32-LABEL: define dso_local x86_fp80 @testassign(
+// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK32-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 12, i1 false)
+// CHECK32-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 4
+// CHECK32-NEXT:    call void @__atomic_store(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
+// CHECK32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP1]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 5)
+// CHECK32-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
+// CHECK32-NEXT:    ret x86_fp80 [[TMP2]]
+//
 long double testassign(_Atomic long double *addr) {
-  // CHECK-LABEL: @testassign
-  // CHECK: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 8
-  // CHECK: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 8
-  // CHECK: call void @llvm.memset.p0.i64(ptr align 16 [[STORE_TEMP_PTR:%.+]], i8 0, i64 16, i1 false)
-  // CHECK: store x86_fp80 {{.+}}, ptr [[STORE_TEMP_PTR]], align 16
-  // CHECK: [[STORE_TEMP_INT:%.+]] = load i128, ptr [[STORE_TEMP_PTR]], align 16
-  // CHECK: store atomic i128 [[STORE_TEMP_INT]], ptr [[ADDR]] seq_cst, align 16
-  // CHECK32-LABEL: @testassign
-  // CHECK32: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 4
-  // CHECK32: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 4
-  // CHECK32: call void @llvm.memset.p0.i64(ptr align 4 [[STORE_TEMP_PTR:%.+]], i8 0, i64 12, i1 false)
-  // CHECK32: store x86_fp80 {{.+}}, ptr [[STORE_TEMP_PTR]], align 4
-  // CHECK32: call void @__atomic_store(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[STORE_TEMP_PTR]], i32 noundef 5)
   *addr = 115;
-  // CHECK: [[ADDR:%.+]] = load ptr, ptr %{{.+}}, align 8
-  // CHECK: [[INT_VAL:%.+]] = load atomic i128, ptr [[ADDR]] seq_cst, align 16
-  // CHECK: store i128 [[INT_VAL]], ptr [[INT_LD_TEMP:%.+]], align 16
-  // CHECK: [[RET_VAL:%.+]] = load x86_fp80, ptr [[LD_TEMP:%.+]], align 16
-  // CHECK: ret x86_fp80 [[RET_VAL]]
-  // CHECK32: [[ADDR:%.+]] = load ptr, ptr %{{.+}}, align 4
-  // CHECK32: call void @__atomic_load(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[LD_TEMP:%.+]], i32 noundef 5)
-  // CHECK32: [[RET_VAL:%.+]] = load x86_fp80, ptr [[LD_TEMP]], align 4
-  // CHECK32: ret x86_fp80 [[RET_VAL]]
 
   return *addr;
 }
 
+// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_inc(
+// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
+// CHECK-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
+// CHECK-NEXT:    ret x86_fp80 [[TMP3]]
+//
+// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_inc(
+// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
+// CHECK32-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
+// CHECK32-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 4
+// CHECK32-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
+// CHECK32-NEXT:    ret x86_fp80 [[TMP3]]
+//
 long double test_volatile_inc(volatile _Atomic long double *addr) {
-  // CHECK-LABEL: @test_volatile_inc
-  // CHECK: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 8
-  // CHECK: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 8
-  // CHECK: [[INT_VALUE:%.+]] = load atomic volatile i128, ptr [[ADDR]] seq_cst, align 16
-  // CHECK: store i128 [[INT_VALUE]], ptr [[LD_ADDR:%.+]], align 16
-  // CHECK: [[LD_VALUE:%.+]] = load x86_fp80, ptr [[LD_ADDR]], align 16
-  // CHECK: br label %[[ATOMIC_OP:.+]]
-  // CHECK: [[ATOMIC_OP]]
-  // CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
-  // CHECK: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
-  // CHECK: call void @llvm.memset.p0.i64(ptr align 16 [[OLD_VALUE_ADDR:%.+]], i8 0, i64 16, i1 false)
-  // CHECK: store x86_fp80 [[OLD_VALUE]], ptr [[OLD_VALUE_ADDR]], align 16
-  // CHECK: [[OLD_INT:%.+]] = load i128, ptr [[OLD_VALUE_ADDR]], align 16
-  // CHECK: call void @llvm.memset.p0.i64(ptr align 16 [[NEW_VALUE_ADDR:%.+]], i8 0, i64 16, i1 false)
-  // CHECK: store x86_fp80 [[INC_VALUE]], ptr [[NEW_VALUE_ADDR]], align 16
-  // CHECK: [[NEW_INT:%.+]] = load i128, ptr [[NEW_VALUE_ADDR]], align 16
-  // CHECK: [[RES:%.+]] = cmpxchg volatile ptr [[ADDR]], i128 [[OLD_INT]], i128 [[NEW_INT]] seq_cst seq_cst, align 16
-  // CHECK: [[OLD_VALUE:%.+]] = extractvalue { i128, i1 } [[RES]], 0
-  // CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i128, i1 } [[RES]], 1
-  // CHECK: store i128 [[OLD_VALUE]], ptr [[OLD_VALUE_RES_PTR:%.+]], align 16
-  // CHECK: [[LD_VALUE]] = load x86_fp80, ptr [[OLD_VALUE_RES_PTR]], align 16
-  // CHECK: br i1 [[FAIL_SUCCESS]], label %[[ATOMIC_CONT:.+]], label %[[ATOMIC_OP]]
-  // CHECK: [[ATOMIC_CONT]]
-  // CHECK: ret x86_fp80 [[INC_VALUE]]
-  // CHECK32-LABEL: @test_volatile_inc
-  // CHECK32: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 4
-  // CHECK32: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 4
-  // CHECK32: call void @__atomic_load(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[TEMP_LD_ADDR:%.+]], i32 noundef 5)
-  // CHECK32: [[LD_VALUE:%.+]] = load x86_fp80, ptr [[TEMP_LD_ADDR]], align 4
-  // CHECK32: br label %[[ATOMIC_OP:.+]]
-  // CHECK32: [[ATOMIC_OP]]
-  // CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
-  // CHECK32: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
-  // CHECK32: call void @llvm.memset.p0.i64(ptr align 4 [[OLD_VALUE_ADDR:%.+]], i8 0, i64 12, i1 false)
-  // CHECK32: store x86_fp80 [[OLD_VALUE]], ptr [[OLD_VALUE_ADDR]], align 4
-  // CHECK32: call void @llvm.memset.p0.i64(ptr align 4 [[DESIRED_VALUE_ADDR:%.+]], i8 0, i64 12, i1 false)
-  // CHECK32: store x86_fp80 [[INC_VALUE]], ptr [[DESIRED_VALUE_ADDR]], align 4
-  // CHECK32: [[FAIL_SUCCESS:%.+]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[OLD_VALUE_ADDR]], ptr noundef [[DESIRED_VALUE_ADDR]], i32 noundef 5, i32 noundef 5)
-  // CHECK32: [[LD_VALUE]] = load x86_fp80, ptr [[OLD_VALUE_ADDR]], align 4
-  // CHECK32: br i1 [[FAIL_SUCCESS]], label %[[ATOMIC_CONT:.+]], label %[[ATOMIC_OP]]
-  // CHECK32: [[ATOMIC_CONT]]
-  // CHECK32: ret x86_fp80 [[INC_VALUE]]
   return ++*addr;
 }
 
+// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_dec(
+// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
+// CHECK-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
+// CHECK-NEXT:    ret x86_fp80 [[TMP2]]
+//
+// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_dec(
+// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
+// CHECK32-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 4
+// CHECK32-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
+// CHECK32-NEXT:    ret x86_fp80 [[TMP2]]
+//
 long double test_volatile_dec(volatile _Atomic long double *addr) {
-  // CHECK-LABEL: @test_volatile_dec
-  // CHECK: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 8
-  // CHECK: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 8
-  // CHECK: [[INT_VALUE:%.+]] = load atomic volatile i128, ptr [[ADDR]] seq_cst, align 16
-  // CHECK: store i128 [[INT_VALUE]], ptr [[LD_ADDR:%.+]], align 16
-  // CHECK: [[ORIG_LD_VALUE:%.+]] = load x86_fp80, ptr [[LD_ADDR]], align 16
-  // CHECK: br label %[[ATOMIC_OP:.+]]
-  // CHECK: [[ATOMIC_OP]]
-  // CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
-  // CHECK: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
-  // CHECK: call void @llvm.memset.p0.i64(ptr align 16 [[OLD_VALUE_ADDR:%.+]], i8 0, i64 16, i1 false)
-  // CHECK: store x86_fp80 [[OLD_VALUE]], ptr [[OLD_VALUE_ADDR]], align 16
-  // CHECK: [[OLD_INT:%.+]] = load i128, ptr [[OLD_VALUE_ADDR]], align 16
-  // CHECK: call void @llvm.memset.p0.i64(ptr align 16 [[NEW_VALUE_ADDR:%.+]], i8 0, i64 16, i1 false)
-  // CHECK: store x86_fp80 [[DEC_VALUE]], ptr [[NEW_VALUE_ADDR]], align 16
-  // CHECK: [[NEW_INT:%.+]] = load i128, ptr [[NEW_VALUE_ADDR]], align 16
-  // CHECK: [[RES:%.+]] = cmpxchg volatile ptr [[ADDR]], i128 [[OLD_INT]], i128 [[NEW_INT]] seq_cst seq_cst, align 16
-  // CHECK: [[OLD_VALUE:%.+]] = extractvalue { i128, i1 } [[RES]], 0
-  // CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i128, i1 } [[RES]], 1
-  // CHECK: store i128 [[OLD_VALUE]], ptr [[OLD_VALUE_RES_PTR:%.+]], align 16
-  // CHECK: [[LD_VALUE]] = load x86_fp80, ptr [[OLD_VALUE_RES_PTR]], align 16
-  // CHECK: br i1 [[FAIL_SUCCESS]], label %[[ATOMIC_CONT:.+]], label %[[ATOMIC_OP]]
-  // CHECK: [[ATOMIC_CONT]]
-  // CHECK: ret x86_fp80 [[ORIG_LD_VALUE]]
-  // CHECK32-LABEL: @test_volatile_dec
-  // CHECK32: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 4
-  // CHECK32: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 4
-  // CHECK32: call void @__atomic_load(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[TEMP_LD_ADDR:%.+]], i32 noundef 5)
-  // CHECK32: [[ORIG_LD_VALUE:%.+]] = load x86_fp80, ptr [[TEMP_LD_ADDR]], align 4
-  // CHECK32: br label %[[ATOMIC_OP:.+]]
-  // CHECK32: [[ATOMIC_OP]]
-  // CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
-  // CHECK32: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
-  // CHECK32: call void @llvm.memset.p0.i64(ptr align 4 [[OLD_VALUE_ADDR:%.+]], i8 0, i64 12, i1 false)
-  // CHECK32: store x86_fp80 [[OLD_VALUE]], ptr [[OLD_VALUE_ADDR]], align 4
-  // CHECK32: call void @llvm.memset.p0.i64(ptr align 4 [[DESIRED_VALUE_ADDR:%.+]], i8 0, i64 12, i1 false)
-  // CHECK32: store x86_fp80 [[DEC_VALUE]], ptr [[DESIRED_VALUE_ADDR]], align 4
-  // CHECK32: [[FAIL_SUCCESS:%.+]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[OLD_VALUE_ADDR]], ptr noundef [[DESIRED_VALUE_ADDR]], i32 noundef 5, i32 noundef 5)
-  // CHECK32: [[LD_VALUE]] = load x86_fp80, ptr [[OLD_VALUE_ADDR]], align 4
-  // CHECK32: br i1 [[FAIL_SUCCESS]], label %[[ATOMIC_CONT:.+]], label %[[ATOMIC_OP]]
-  // CHECK32: [[ATOMIC_CONT]]
-  // CHECK32: ret x86_fp80 [[ORIG_LD_VALUE]]
   return (*addr)--;
 }
 
+// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_compassign(
+// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    [[ATOMIC_TEMP5:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16
+// CHECK-NEXT:    br label [[ATOMIC_OP:%.*]]
+// CHECK:       atomic_op:
+// CHECK-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[ATOMIC_OP]] ]
+// CHECK-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false)
+// CHECK-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false)
+// CHECK-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg volatile ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1
+// CHECK-NEXT:    store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16
+// CHECK-NEXT:    [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16
+// CHECK-NEXT:    br i1 [[TMP7]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]]
+// CHECK:       atomic_cont:
+// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[ATOMIC_LOAD4:%.*]] = load atomic volatile i128, ptr [[TMP9]] seq_cst, align 16
+// CHECK-NEXT:    store i128 [[ATOMIC_LOAD4]], ptr [[ATOMIC_TEMP5]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP5]], align 16
+// CHECK-NEXT:    ret x86_fp80 [[TMP10]]
+//
+// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_compassign(
+// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK32-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
+// CHECK32-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4
+// CHECK32-NEXT:    br label [[ATOMIC_OP:%.*]]
+// CHECK32:       atomic_op:
+// CHECK32-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[ATOMIC_OP]] ]
+// CHECK32-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
+// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false)
+// CHECK32-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4
+// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false)
+// CHECK32-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 4
+// CHECK32-NEXT:    [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5)
+// CHECK32-NEXT:    [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
+// CHECK32-NEXT:    br i1 [[CALL]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]]
+// CHECK32:       atomic_cont:
+// CHECK32-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP4]], ptr noundef [[ATOMIC_TEMP3]], i32 noundef 5)
+// CHECK32-NEXT:    [[TMP5:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 4
+// CHECK32-NEXT:    ret x86_fp80 [[TMP5]]
+//
 long double test_volatile_compassign(volatile _Atomic long double *addr) {
   *addr -= 25;
-  // CHECK-LABEL: @test_volatile_compassign
-  // CHECK: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 8
-  // CHECK: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 8
-  // CHECK: [[INT_VALUE:%.+]] = load atomic volatile i128, ptr [[ADDR]] seq_cst, align 16
-  // CHECK: store i128 [[INT_VALUE]], ptr [[LD_ADDR:%.+]], align 16
-  // CHECK: [[LD_VALUE:%.+]] = load x86_fp80, ptr [[LD_ADDR]], align 16
-  // CHECK: br label %[[ATOMIC_OP:.+]]
-  // CHECK: [[ATOMIC_OP]]
-  // CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
-  // CHECK: [[SUB_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
-  // CHECK: call void @llvm.memset.p0.i64(ptr align 16 [[OLD_VALUE_ADDR:%.+]], i8 0, i64 16, i1 false)
-  // CHECK: store x86_fp80 [[OLD_VALUE]], ptr [[OLD_VALUE_ADDR]], align 16
-  // CHECK: [[OLD_INT:%.+]] = load i128, ptr [[OLD_VALUE_ADDR]], align 16
-  // CHECK: call void @llvm.memset.p0.i64(ptr align 16 [[NEW_VALUE_ADDR:%.+]], i8 0, i64 16, i1 false)
-  // CHECK: store x86_fp80 [[SUB_VALUE]], ptr [[NEW_VALUE_ADDR]], align 16
-  // CHECK: [[NEW_INT:%.+]] = load i128, ptr [[NEW_VALUE_ADDR]], align 16
-  // CHECK: [[RES:%.+]] = cmpxchg volatile ptr [[ADDR]], i128 [[OLD_INT]], i128 [[NEW_INT]] seq_cst seq_cst, align 16
-  // CHECK: [[OLD_VALUE:%.+]] = extractvalue { i128, i1 } [[RES]], 0
-  // CHECK: [[FAIL_SUCCESS:%.+]] = extractvalue { i128, i1 } [[RES]], 1
-  // CHECK: store i128 [[OLD_VALUE]], ptr [[OLD_VALUE_RES_PTR:%.+]], align 16
-  // CHECK: [[LD_VALUE]] = load x86_fp80, ptr [[OLD_VALUE_RES_PTR]], align 16
-  // CHECK: br i1 [[FAIL_SUCCESS]], label %[[ATOMIC_CONT:.+]], label %[[ATOMIC_OP]]
-  // CHECK: [[ATOMIC_CONT]]
-  // CHECK: [[ADDR:%.+]] = load ptr, ptr %{{.+}}, align 8
-  // CHECK: [[INT_VAL:%.+]] = load atomic volatile i128, ptr [[ADDR]] seq_cst, align 16
-  // CHECK: store i128 [[INT_VAL]], ptr [[INT_LD_TEMP:%.+]], align 16
-  // CHECK: [[RET_VAL:%.+]] = load x86_fp80, ptr [[LD_TEMP:%.+]], align 16
-  // CHECK32-LABEL: @test_volatile_compassign
-  // CHECK32: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 4
-  // CHECK32: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 4
-  // CHECK32: call void @__atomic_load(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[TEMP_LD_ADDR:%.+]], i32 noundef 5)
-  // CHECK32: [[LD_VALUE:%.+]] = load x86_fp80, ptr [[TEMP_LD_ADDR]], align 4
-  // CHECK32: br label %[[ATOMIC_OP:.+]]
-  // CHECK32: [[ATOMIC_OP]]
-  // CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
-  // CHECK32: [[INC_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
-  // CHECK32: call void @llvm.memset.p0.i64(ptr align 4 [[OLD_VALUE_ADDR:%.+]], i8 0, i64 12, i1 false)
-  // CHECK32: store x86_fp80 [[OLD_VALUE]], ptr [[OLD_VALUE_ADDR]], align 4
-  // CHECK32: call void @llvm.memset.p0.i64(ptr align 4 [[DESIRED_VALUE_ADDR:%.+]], i8 0, i64 12, i1 false)
-  // CHECK32: store x86_fp80 [[INC_VALUE]], ptr [[DESIRED_VALUE_ADDR]], align 4
-  // CHECK32: [[FAIL_SUCCESS:%.+]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[OLD_VALUE_ADDR]], ptr noundef [[DESIRED_VALUE_ADDR]], i32 noundef 5, i32 noundef 5)
-  // CHECK32: [[LD_VALUE]] = load x86_fp80, ptr [[OLD_VALUE_ADDR]], align 4
-  // CHECK32: br i1 [[FAIL_SUCCESS]], label %[[ATOMIC_CONT:.+]], label %[[ATOMIC_OP]]
-  // CHECK32: [[ATOMIC_CONT]]
-  // CHECK32: [[ADDR:%.+]] = load ptr, ptr %{{.+}}, align 4
-  // CHECK32: call void @__atomic_load(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[GET_ADDR:%.+]], i32 noundef 5)
-  // CHECK32: [[RET_VAL:%.+]] = load x86_fp80, ptr [[GET_ADDR]], align 4
-  // CHECK32: ret x86_fp80 [[RET_VAL]]
   return *addr;
 }
 
+// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_assign(
+// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
+// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP]], i8 0, i64 16, i1 false)
+// CHECK-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[ATOMIC_TEMP]], align 16
+// CHECK-NEXT:    store atomic volatile i128 [[TMP1]], ptr [[TMP0]] seq_cst, align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP2]] seq_cst, align 16
+// CHECK-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP1]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 16
+// CHECK-NEXT:    ret x86_fp80 [[TMP3]]
+//
+// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_assign(
+// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// CHECK32-NEXT:  entry:
+// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK32-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
+// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 12, i1 false)
+// CHECK32-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 4
+// CHECK32-NEXT:    call void @__atomic_store(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
+// CHECK32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP1]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 5)
+// CHECK32-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
+// CHECK32-NEXT:    ret x86_fp80 [[TMP2]]
+//
 long double test_volatile_assign(volatile _Atomic long double *addr) {
-  // CHECK-LABEL: @test_volatile_assign
-  // CHECK: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 8
-  // CHECK: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 8
-  // CHECK: call void @llvm.memset.p0.i64(ptr align 16 [[STORE_TEMP_PTR:%.+]], i8 0, i64 16, i1 false)
-  // CHECK: store x86_fp80 {{.+}}, ptr [[STORE_TEMP_PTR]], align 16
-  // CHECK: [[STORE_TEMP_INT:%.+]] = load i128, ptr [[STORE_TEMP_PTR]], align 16
-  // CHECK: store atomic volatile i128 [[STORE_TEMP_INT]], ptr [[ADDR]] seq_cst, align 16
-  // CHECK32-LABEL: @test_volatile_assign
-  // CHECK32: store ptr %{{.+}}, ptr [[ADDR_ADDR:%.+]], align 4
-  // CHECK32: [[ADDR:%.+]] = load ptr, ptr [[ADDR_ADDR]], align 4
-  // CHECK32: call void @llvm.memset.p0.i64(ptr align 4 [[STORE_TEMP_PTR:%.+]], i8 0, i64 12, i1 false)
-  // CHECK32: store x86_fp80 {{.+}}, ptr [[STORE_TEMP_PTR]], align 4
-  // CHECK32: call void @__atomic_store(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[STORE_TEMP_PTR]], i32 noundef 5)
   *addr = 115;
-  // CHECK: [[ADDR:%.+]] = load ptr, ptr %{{.+}}, align 8
-  // CHECK: [[INT_VAL:%.+]] = load atomic volatile i128, ptr [[ADDR]] seq_cst, align 16
-  // CHECK: store i128 [[INT_VAL]], ptr [[INT_LD_TEMP:%.+]], align 16
-  // CHECK: [[RET_VAL:%.+]] = load x86_fp80, ptr [[LD_TEMP:%.+]], align 16
-  // CHECK: ret x86_fp80 [[RET_VAL]]
-  // CHECK32: [[ADDR:%.+]] = load ptr, ptr %{{.+}}, align 4
-  // CHECK32: call void @__atomic_load(i32 noundef 12, ptr noundef [[ADDR]], ptr noundef [[LD_TEMP:%.+]], i32 noundef 5)
-  // CHECK32: [[RET_VAL:%.+]] = load x86_fp80, ptr [[LD_TEMP]], align 4
-  // CHECK32: ret x86_fp80 [[RET_VAL]]
 
   return *addr;
 }
diff --git a/clang/test/CodeGen/X86/x86-vec-i128.c b/clang/test/CodeGen/X86/x86-vec-i128.c
index 31853998e900..ee58cb92da6b 100644
--- a/clang/test/CodeGen/X86/x86-vec-i128.c
+++ b/clang/test/CodeGen/X86/x86-vec-i128.c
@@ -1,23 +1,23 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu %s -target-feature +sse2 -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG10ABI128,MEM256ALIGN32,MEM512ALIGN64
-// RUN: %clang_cc1 -triple x86_64-netbsd %s -target-feature +sse2 -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG10ABI128,MEM256ALIGN32,MEM512ALIGN64
-// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -target-feature +sse2 -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,MEM256ALIGN16,MEM512ALIGN16
-// RUN: %clang_cc1 -triple x86_64-scei-ps4 %s -target-feature +sse2 -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,MEM256ALIGN32,MEM512ALIGN64
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd10.0 %s -target-feature +sse2 -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,MEM256ALIGN32,MEM512ALIGN64
-// RUN: %clang_cc1 -triple x86_64-linux-gnu %s -target-feature +sse2 -S -emit-llvm -o - -fclang-abi-compat=9 | FileCheck %s --check-prefixes=CLANG9ABI128,MEM256ALIGN32,MEM512ALIGN64
-
-// RUN: %clang_cc1 -triple x86_64-linux-gnu %s -target-feature +avx -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG10ABI128,CLANG10ABI256,MEM512ALIGN64
-// RUN: %clang_cc1 -triple x86_64-netbsd %s -target-feature +avx -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG10ABI128,CLANG10ABI256,MEM512ALIGN64
-// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -target-feature +avx -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,MEM512ALIGN32
-// RUN: %clang_cc1 -triple x86_64-scei-ps4 %s -target-feature +avx -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,MEM512ALIGN64
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd10.0 %s -target-feature +avx -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,MEM512ALIGN64
-// RUN: %clang_cc1 -triple x86_64-linux-gnu %s -target-feature +avx -S -emit-llvm -o - -fclang-abi-compat=9 | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,MEM512ALIGN64
-
-// RUN: %clang_cc1 -triple x86_64-linux-gnu %s -target-feature +avx512f -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG10ABI128,CLANG10ABI256,CLANG10ABI512
-// RUN: %clang_cc1 -triple x86_64-netbsd %s -target-feature +avx512f -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG10ABI128,CLANG10ABI256,CLANG10ABI512
-// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -target-feature +avx512f -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,CLANG9ABI512
-// RUN: %clang_cc1 -triple x86_64-scei-ps4 %s -target-feature +avx512f -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,CLANG9ABI512
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd10.0 %s -target-feature +avx512f -S -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,CLANG9ABI512
-// RUN: %clang_cc1 -triple x86_64-linux-gnu %s -target-feature +avx512f -S -emit-llvm -o - -fclang-abi-compat=9 | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,CLANG9ABI512
+// RUN: %clang_cc1 -triple x86_64-linux-gnu %s -target-feature +sse2 -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG10ABI128,MEM256ALIGN32,MEM512ALIGN64
+// RUN: %clang_cc1 -triple x86_64-netbsd %s -target-feature +sse2 -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG10ABI128,MEM256ALIGN32,MEM512ALIGN64
+// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -target-feature +sse2 -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,MEM256ALIGN16,MEM512ALIGN16
+// RUN: %clang_cc1 -triple x86_64-scei-ps4 %s -target-feature +sse2 -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,MEM256ALIGN32,MEM512ALIGN64
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd10.0 %s -target-feature +sse2 -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,MEM256ALIGN32,MEM512ALIGN64
+// RUN: %clang_cc1 -triple x86_64-linux-gnu %s -target-feature +sse2 -emit-llvm -o - -fclang-abi-compat=9 | FileCheck %s --check-prefixes=CLANG9ABI128,MEM256ALIGN32,MEM512ALIGN64
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu %s -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG10ABI128,CLANG10ABI256,MEM512ALIGN64
+// RUN: %clang_cc1 -triple x86_64-netbsd %s -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG10ABI128,CLANG10ABI256,MEM512ALIGN64
+// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,MEM512ALIGN32
+// RUN: %clang_cc1 -triple x86_64-scei-ps4 %s -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,MEM512ALIGN64
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd10.0 %s -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,MEM512ALIGN64
+// RUN: %clang_cc1 -triple x86_64-linux-gnu %s -target-feature +avx -emit-llvm -o - -fclang-abi-compat=9 | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,MEM512ALIGN64
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu %s -target-feature +avx512f -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG10ABI128,CLANG10ABI256,CLANG10ABI512
+// RUN: %clang_cc1 -triple x86_64-netbsd %s -target-feature +avx512f -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG10ABI128,CLANG10ABI256,CLANG10ABI512
+// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -target-feature +avx512f -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,CLANG9ABI512
+// RUN: %clang_cc1 -triple x86_64-scei-ps4 %s -target-feature +avx512f -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,CLANG9ABI512
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd10.0 %s -target-feature +avx512f -emit-llvm -o - | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,CLANG9ABI512
+// RUN: %clang_cc1 -triple x86_64-linux-gnu %s -target-feature +avx512f -emit-llvm -o - -fclang-abi-compat=9 | FileCheck %s --check-prefixes=CLANG9ABI128,CLANG9ABI256,CLANG9ABI512
 
 typedef unsigned long long v16u64 __attribute__((vector_size(16)));
 typedef unsigned __int128 v16u128 __attribute__((vector_size(16)));
diff --git a/clang/test/CodeGen/X86/x86_64-atomic-128.c b/clang/test/CodeGen/X86/x86_64-atomic-128.c
index 2bc53d949a0f..f682ffc75f82 100644
--- a/clang/test/CodeGen/X86/x86_64-atomic-128.c
+++ b/clang/test/CodeGen/X86/x86_64-atomic-128.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck %s
 
 // All atomics up to 16 bytes should be emitted inline on x86_64. The
 // backend can reform __sync_whatever calls if necessary (e.g. the CPU
diff --git a/clang/test/CodeGen/X86/x86_64-instrument-functions.c b/clang/test/CodeGen/X86/x86_64-instrument-functions.c
index b2cef133ff04..215e629a604f 100644
--- a/clang/test/CodeGen/X86/x86_64-instrument-functions.c
+++ b/clang/test/CodeGen/X86/x86_64-instrument-functions.c
@@ -1,7 +1,7 @@
 // REQUIRES: x86-registered-target
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -S -finstrument-functions -O0 -o - -emit-llvm %s | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -S -finstrument-functions -O2 -o - -emit-llvm %s | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -S -finstrument-functions-after-inlining -O2 -o - -emit-llvm %s | FileCheck -check-prefix=NOINLINE %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -finstrument-functions -O0 -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -finstrument-functions -O2 -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -finstrument-functions-after-inlining -O2 -o - -emit-llvm %s | FileCheck -check-prefix=NOINLINE %s
 
 __attribute__((always_inline)) int leaf(int x) {
   return x;
diff --git a/clang/test/CodeGen/X86/x86_inlineasm_curly_bracket_escape.c b/clang/test/CodeGen/X86/x86_inlineasm_curly_bracket_escape.c
index 6599a657fbfe..d1ac654284f0 100644
--- a/clang/test/CodeGen/X86/x86_inlineasm_curly_bracket_escape.c
+++ b/clang/test/CodeGen/X86/x86_inlineasm_curly_bracket_escape.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -O0  -S -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -O0  -emit-llvm -o - -Wall -Werror | FileCheck %s
 // This test checks validity of inline assembly using curly brackets syntax
 // for extended inline asm.
 
diff --git a/clang/test/CodeGen/aapcs-bitfield.c b/clang/test/CodeGen/aapcs-bitfield.c
index 0df250d4ebc5..152cfc1f5929 100644
--- a/clang/test/CodeGen/aapcs-bitfield.c
+++ b/clang/test/CodeGen/aapcs-bitfield.c
@@ -1,14 +1,14 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple armv8-none-linux-eabi   -fno-aapcs-bitfield-width -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=LE
+// RUN: %clang_cc1 -triple armv8-none-linux-eabi -fno-aapcs-bitfield-width -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=LE
 // RUN: %clang_cc1 -triple armebv8-none-linux-eabi -fno-aapcs-bitfield-width -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=BE
 
-// RUN: %clang_cc1 -triple armv8-none-linux-eabi   -faapcs-bitfield-load -fno-aapcs-bitfield-width -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=LENUMLOADS
+// RUN: %clang_cc1 -triple armv8-none-linux-eabi -faapcs-bitfield-load -fno-aapcs-bitfield-width -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=LENUMLOADS
 // RUN: %clang_cc1 -triple armebv8-none-linux-eabi -faapcs-bitfield-load -fno-aapcs-bitfield-width -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=BENUMLOADS
 
-// RUN: %clang_cc1 -triple armv8-none-linux-eabi   -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=LEWIDTH
+// RUN: %clang_cc1 -triple armv8-none-linux-eabi -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=LEWIDTH
 // RUN: %clang_cc1 -triple armebv8-none-linux-eabi -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=BEWIDTH
 
-// RUN: %clang_cc1 -triple armv8-none-linux-eabi   -faapcs-bitfield-load -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=LEWIDTHNUM
+// RUN: %clang_cc1 -triple armv8-none-linux-eabi -faapcs-bitfield-load -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=LEWIDTHNUM
 // RUN: %clang_cc1 -triple armebv8-none-linux-eabi -faapcs-bitfield-load -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=BEWIDTHNUM
 
 struct st0 {
diff --git a/clang/test/CodeGen/aapcs64-align.cpp b/clang/test/CodeGen/aapcs64-align.cpp
index de231f2123b9..7a8151022852 100644
--- a/clang/test/CodeGen/aapcs64-align.cpp
+++ b/clang/test/CodeGen/aapcs64-align.cpp
@@ -1,7 +1,7 @@
 // REQUIRES: arm-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-elf \
 // RUN:   -O2 \
-// RUN:   -emit-llvm -o - %s | FileCheck %s
+// RUN:   -emit-llvm -fexperimental-max-bitint-width=1024 -o - %s | FileCheck %s
 
 extern "C" {
 
@@ -100,4 +100,66 @@ void f5m(int, int, int, int, int, P16);
 // CHECK: declare void @f5(i32 noundef, [2 x i64])
 // CHECK: declare void @f5m(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64])
 
+//BitInt alignment
+struct BITINT129 {
+    char ch;
+    unsigned _BitInt(129) v;
+};
+
+int test_bitint129(){
+  return __builtin_offsetof(struct BITINT129, v);
 }
+// CHECK:  ret i32 16 
+
+struct BITINT127 {
+    char ch;
+    _BitInt(127) v;
+};
+
+int test_bitint127(){
+  return __builtin_offsetof(struct BITINT127, v);
+}
+// CHECK:  ret i32 16 
+
+struct BITINT63 {
+    char ch;
+    _BitInt(63) v;
+};
+
+int test_bitint63(){
+  return __builtin_offsetof(struct BITINT63, v);
+}
+// CHECK:  ret i32 8 
+
+struct BITINT32 {
+    char ch;
+    unsigned _BitInt(32) v;
+};
+
+int test_bitint32(){
+  return __builtin_offsetof(struct BITINT32, v);
+}
+// CHECK:  ret i32 4
+
+struct BITINT9 {
+    char ch;
+    unsigned _BitInt(9) v;
+};
+
+int test_bitint9(){
+  return __builtin_offsetof(struct BITINT9, v);
+}
+// CHECK:  ret i32 2
+
+struct BITINT8 {
+    char ch;
+    unsigned _BitInt(8) v;
+};
+
+int test_bitint8(){
+  return __builtin_offsetof(struct BITINT8, v);
+}
+// CHECK:  ret i32 1
+
+}
+
diff --git a/clang/test/CodeGen/aarch64-ABI-align-packed-assembly.c b/clang/test/CodeGen/aarch64-ABI-align-packed-assembly.c
index 5ac8fd138914..d6e96a6004f6 100644
--- a/clang/test/CodeGen/aarch64-ABI-align-packed-assembly.c
+++ b/clang/test/CodeGen/aarch64-ABI-align-packed-assembly.c
@@ -1,5 +1,5 @@
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fsyntax-only -triple aarch64 -target-feature +neon -S -O2 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +neon -S -O2 -o - %s | FileCheck %s
 #include <stdarg.h>
 #include <arm_neon.h>
 
diff --git a/clang/test/CodeGen/aarch64-ABI-align-packed.c b/clang/test/CodeGen/aarch64-ABI-align-packed.c
index 13c68fe54b84..0349ebc8cc63 100644
--- a/clang/test/CodeGen/aarch64-ABI-align-packed.c
+++ b/clang/test/CodeGen/aarch64-ABI-align-packed.c
@@ -1,5 +1,5 @@
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fsyntax-only -triple aarch64 -target-feature +neon -emit-llvm -O2 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +neon -emit-llvm -O2 -o - %s | FileCheck %s
 #include <stdarg.h>
 #include <arm_neon.h>
 
diff --git a/clang/test/CodeGen/aarch64-bf16-reinterpret-intrinsics.c b/clang/test/CodeGen/aarch64-bf16-reinterpret-intrinsics.c
index 850d8fa5bbab..2b271ac88462 100644
--- a/clang/test/CodeGen/aarch64-bf16-reinterpret-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-bf16-reinterpret-intrinsics.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \
-// RUN: -disable-O0-optnone -S -emit-llvm -o - %s \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg \
 // RUN: | FileCheck %s
 
diff --git a/clang/test/CodeGen/aarch64-elf-pauthabi.c b/clang/test/CodeGen/aarch64-elf-pauthabi.c
new file mode 100644
index 000000000000..aa83ee3e0d7b
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-elf-pauthabi.c
@@ -0,0 +1,59 @@
+// RUN: %clang_cc1 -triple aarch64-linux -emit-llvm -o - \
+// RUN:   -fptrauth-intrinsics \
+// RUN:   -fptrauth-calls \
+// RUN:   -fptrauth-returns \
+// RUN:   -fptrauth-auth-traps \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-init-fini %s | \
+// RUN:   FileCheck %s --check-prefix=ALL
+
+// RUN: %clang_cc1 -triple aarch64-linux -emit-llvm -o - \
+// RUN:   -fptrauth-intrinsics %s | FileCheck %s --check-prefix=INTRIN
+
+// RUN: %clang_cc1 -triple aarch64-linux -emit-llvm -o - \
+// RUN:   -fptrauth-calls %s | FileCheck %s --check-prefix=CALL
+
+// RUN: %clang_cc1 -triple aarch64-linux -emit-llvm -o - \
+// RUN:   -fptrauth-returns %s | FileCheck %s --check-prefix=RET
+
+// RUN: %clang_cc1 -triple aarch64-linux -emit-llvm -o - \
+// RUN:   -fptrauth-auth-traps %s | FileCheck %s --check-prefix=TRAP
+
+// RUN: %clang_cc1 -triple aarch64-linux -emit-llvm -o - \
+// RUN:   -fptrauth-calls -fptrauth-vtable-pointer-address-discrimination %s | \
+// RUN:   FileCheck %s --check-prefix=VPTRADDR
+
+// RUN: %clang_cc1 -triple aarch64-linux -emit-llvm -o - \
+// RUN:   -fptrauth-calls -fptrauth-vtable-pointer-type-discrimination %s | \
+// RUN:   FileCheck %s --check-prefix=VPTRTYPE
+
+// RUN: %clang_cc1 -triple aarch64-linux -emit-llvm -o - \
+// RUN:   -fptrauth-calls -fptrauth-init-fini %s | \
+// RUN:   FileCheck %s --check-prefix=INITFINI
+
+// ALL: !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
+// ALL: !{i32 1, !"aarch64-elf-pauthabi-version", i32 127}
+
+// INTRIN: !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
+// INTRIN: !{i32 1, !"aarch64-elf-pauthabi-version", i32 1}
+
+// CALL: !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
+// CALL: !{i32 1, !"aarch64-elf-pauthabi-version", i32 2}
+
+// RET: !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
+// RET: !{i32 1, !"aarch64-elf-pauthabi-version", i32 4}
+
+// TRAP: !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
+// TRAP: !{i32 1, !"aarch64-elf-pauthabi-version", i32 8}
+
+// VPTRADDR: !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
+// VPTRADDR: !{i32 1, !"aarch64-elf-pauthabi-version", i32 18}
+
+// VPTRTYPE: !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
+// VPTRTYPE: !{i32 1, !"aarch64-elf-pauthabi-version", i32 34}
+
+// INITFINI: !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
+// INITFINI: !{i32 1, !"aarch64-elf-pauthabi-version", i32 66}
+
+void foo() {}
diff --git a/clang/test/CodeGen/aarch64-fmv-dependencies.c b/clang/test/CodeGen/aarch64-fmv-dependencies.c
new file mode 100644
index 000000000000..ec599e1b3fa7
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-fmv-dependencies.c
@@ -0,0 +1,240 @@
+// Test/document all of the dependencies between possible AArch64 FMV extensions.
+// Also test the name mangling.
+
+// RUN: %clang --target=aarch64-linux-gnu --rtlib=compiler-rt -emit-llvm -S -o - %s | FileCheck %s
+
+// CHECK: define dso_local i32 @fmv._Maes() #[[ATTR0:[0-9]+]] {
+__attribute__((target_version("aes"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mbf16() #[[bf16_ebf16:[0-9]+]] {
+__attribute__((target_version("bf16"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mbti() #[[bti:[0-9]+]] {
+__attribute__((target_version("bti"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mcrc() #[[crc:[0-9]+]] {
+__attribute__((target_version("crc"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mdgh() #[[ATTR0:[0-9]+]] {
+__attribute__((target_version("dgh"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mdit() #[[dit:[0-9]+]] {
+__attribute__((target_version("dit"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mdotprod() #[[dotprod:[0-9]+]] {
+__attribute__((target_version("dotprod"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mdpb() #[[dpb:[0-9]+]] {
+__attribute__((target_version("dpb"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mdpb2() #[[dpb2:[0-9]+]] {
+__attribute__((target_version("dpb2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mebf16() #[[bf16_ebf16:[0-9]+]] {
+__attribute__((target_version("ebf16"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mf32mm() #[[f32mm:[0-9]+]] {
+__attribute__((target_version("f32mm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mf64mm() #[[f64mm:[0-9]+]] {
+__attribute__((target_version("f64mm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfcma() #[[fcma:[0-9]+]] {
+__attribute__((target_version("fcma"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mflagm() #[[flagm:[0-9]+]] {
+__attribute__((target_version("flagm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mflagm2() #[[flagm2:[0-9]+]] {
+__attribute__((target_version("flagm2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfp() #[[ATTR0:[0-9]+]] {
+__attribute__((target_version("fp"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfp16() #[[fp16:[0-9]+]] {
+__attribute__((target_version("fp16"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfp16fml() #[[fp16fml:[0-9]+]] {
+__attribute__((target_version("fp16fml"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfrintts() #[[frintts:[0-9]+]] {
+__attribute__((target_version("frintts"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mi8mm() #[[i8mm:[0-9]+]] {
+__attribute__((target_version("i8mm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mjscvt() #[[jscvt:[0-9]+]] {
+__attribute__((target_version("jscvt"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mls64() #[[ATTR0:[0-9]+]] {
+__attribute__((target_version("ls64"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mls64_accdata() #[[ls64_accdata:[0-9]+]] {
+__attribute__((target_version("ls64_accdata"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mls64_v() #[[ATTR0:[0-9]+]] {
+__attribute__((target_version("ls64_v"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mlse() #[[lse:[0-9]+]] {
+__attribute__((target_version("lse"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mmemtag() #[[ATTR0:[0-9]+]] {
+__attribute__((target_version("memtag"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mmemtag2() #[[memtag2:[0-9]+]] {
+__attribute__((target_version("memtag2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mmemtag3() #[[memtag2:[0-9]+]] {
+__attribute__((target_version("memtag3"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mmops() #[[mops:[0-9]+]] {
+__attribute__((target_version("mops"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mpmull() #[[pmull:[0-9]+]] {
+__attribute__((target_version("pmull"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mpredres() #[[predres:[0-9]+]] {
+__attribute__((target_version("predres"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrcpc() #[[rcpc:[0-9]+]] {
+__attribute__((target_version("rcpc"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrcpc2() #[[rcpc:[0-9]+]] {
+__attribute__((target_version("rcpc2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrcpc3() #[[rcpc3:[0-9]+]] {
+__attribute__((target_version("rcpc3"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrdm() #[[rdm:[0-9]+]] {
+__attribute__((target_version("rdm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrng() #[[rng:[0-9]+]] {
+__attribute__((target_version("rng"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrpres() #[[ATTR0:[0-9]+]] {
+__attribute__((target_version("rpres"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msb() #[[sb:[0-9]+]] {
+__attribute__((target_version("sb"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msha1() #[[ATTR0:[0-9]+]] {
+__attribute__((target_version("sha1"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msha2() #[[sha2:[0-9]+]] {
+__attribute__((target_version("sha2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msha3() #[[sha3:[0-9]+]] {
+__attribute__((target_version("sha3"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msimd() #[[ATTR0:[0-9]+]] {
+__attribute__((target_version("simd"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msm4() #[[sm4:[0-9]+]] {
+__attribute__((target_version("sm4"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msme() #[[sme:[0-9]+]] {
+__attribute__((target_version("sme"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msme-f64f64() #[[sme_f64f64:[0-9]+]] {
+__attribute__((target_version("sme-f64f64"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msme-i16i64() #[[sme_i16i64:[0-9]+]] {
+__attribute__((target_version("sme-i16i64"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msme2() #[[sme2:[0-9]+]] {
+__attribute__((target_version("sme2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mssbs() #[[ATTR0:[0-9]+]] {
+__attribute__((target_version("ssbs"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mssbs2() #[[ssbs2:[0-9]+]] {
+__attribute__((target_version("ssbs2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve() #[[sve:[0-9]+]] {
+__attribute__((target_version("sve"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve-bf16() #[[sve_bf16_ebf16:[0-9]+]] {
+__attribute__((target_version("sve-bf16"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve-ebf16() #[[sve_bf16_ebf16:[0-9]+]] {
+__attribute__((target_version("sve-ebf16"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve-i8mm() #[[sve_i8mm:[0-9]+]] {
+__attribute__((target_version("sve-i8mm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2() #[[sve2:[0-9]+]] {
+__attribute__((target_version("sve2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2-aes() #[[sve2_aes_sve2_pmull128:[0-9]+]] {
+__attribute__((target_version("sve2-aes"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2-bitperm() #[[sve2_bitperm:[0-9]+]] {
+__attribute__((target_version("sve2-bitperm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2-pmull128() #[[sve2_aes_sve2_pmull128:[0-9]+]] {
+__attribute__((target_version("sve2-pmull128"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2-sha3() #[[sve2_sha3:[0-9]+]] {
+__attribute__((target_version("sve2-sha3"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2-sm4() #[[sve2_sm4:[0-9]+]] {
+__attribute__((target_version("sve2-sm4"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mwfxt() #[[wfxt:[0-9]+]] {
+__attribute__((target_version("wfxt"))) int fmv(void) { return 0; }
+
+// CHECK-NOT: define dso_local i32 @fmv._M{{.*}}
+__attribute__((target_version("non_existent_extension"))) int fmv(void);
+
+__attribute__((target_version("default"))) int fmv(void);
+
+int caller() {
+  return fmv();
+}
+
+// CHECK: attributes #[[ATTR0:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[bf16_ebf16:[0-9]+]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[bti:[0-9]+]] = { {{.*}} "target-features"="+bti,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[crc:[0-9]+]] = { {{.*}} "target-features"="+crc,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[dit:[0-9]+]] = { {{.*}} "target-features"="+dit,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[dotprod:[0-9]+]] = { {{.*}} "target-features"="+dotprod,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[dpb:[0-9]+]] = { {{.*}} "target-features"="+ccpp,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[dpb2:[0-9]+]] = { {{.*}} "target-features"="+ccdp,+ccpp,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[f32mm:[0-9]+]] = { {{.*}} "target-features"="+f32mm,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
+// CHECK: attributes #[[f64mm:[0-9]+]] = { {{.*}} "target-features"="+f64mm,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
+// CHECK: attributes #[[fcma:[0-9]+]] = { {{.*}} "target-features"="+complxnum,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[flagm:[0-9]+]] = { {{.*}} "target-features"="+flagm,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[flagm2:[0-9]+]] = { {{.*}} "target-features"="+altnzcv,+flagm,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[fp16:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[fp16fml:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+fp16fml,+fullfp16,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[frintts:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+fptoint,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[i8mm:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+i8mm,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[jscvt:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+jsconv,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[ls64_accdata:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+ls64,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[lse:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+lse,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[memtag2:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+mte,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[mops:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+mops,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[pmull:[0-9]+]] = { {{.*}} "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[predres:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+predres,+v8a"
+// CHECK: attributes #[[rcpc:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rcpc,+v8a"
+// CHECK: attributes #[[rcpc3:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rcpc,+rcpc3,+v8a"
+// CHECK: attributes #[[rdm:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rdm,+v8a"
+// CHECK: attributes #[[rng:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rand,+v8a"
+// CHECK: attributes #[[sb:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+sb,+v8a"
+// CHECK: attributes #[[sha2:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+sha2,+v8a"
+// CHECK: attributes #[[sha3:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+sha2,+sha3,+v8a"
+// CHECK: attributes #[[sm4:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+sm4,+v8a"
+// CHECK: attributes #[[sme:[0-9]+]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+neon,+outline-atomics,+sme,+v8a"
+// CHECK: attributes #[[sme_f64f64:[0-9]+]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+neon,+outline-atomics,+sme,+sme-f64f64,+v8a"
+// CHECK: attributes #[[sme_i16i64:[0-9]+]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+neon,+outline-atomics,+sme,+sme-i16i64,+v8a"
+// CHECK: attributes #[[sme2:[0-9]+]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+neon,+outline-atomics,+sme,+sme2,+v8a"
+// CHECK: attributes #[[ssbs2:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+ssbs,+v8a"
+// CHECK: attributes #[[sve:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
+// CHECK: attributes #[[sve_bf16_ebf16:[0-9]+]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
+// CHECK: attributes #[[sve_i8mm:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+i8mm,+neon,+outline-atomics,+sve,+v8a"
+// CHECK: attributes #[[sve2:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a"
+// CHECK: attributes #[[sve2_aes_sve2_pmull128:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-aes,+v8a"
+// CHECK: attributes #[[sve2_bitperm:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-bitperm,+v8a"
+// CHECK: attributes #[[sve2_sha3:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-sha3,+v8a"
+// CHECK: attributes #[[sve2_sm4:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-sm4,+v8a"
+// CHECK: attributes #[[wfxt:[0-9]+]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+wfxt"
diff --git a/clang/test/CodeGen/aarch64-ls64-inline-asm.c b/clang/test/CodeGen/aarch64-ls64-inline-asm.c
index 0ba12ab47ae5..a01393525bcd 100644
--- a/clang/test/CodeGen/aarch64-ls64-inline-asm.c
+++ b/clang/test/CodeGen/aarch64-ls64-inline-asm.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64 -target-feature +ls64 -O1 -S -emit-llvm -x c %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +ls64 -O1 -emit-llvm -x c %s -o - | FileCheck %s
 
 struct foo { unsigned long long x[8]; };
 
diff --git a/clang/test/CodeGen/aarch64-ls64.c b/clang/test/CodeGen/aarch64-ls64.c
index c20be13ed13c..23894f3e399d 100644
--- a/clang/test/CodeGen/aarch64-ls64.c
+++ b/clang/test/CodeGen/aarch64-ls64.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64 -target-feature +ls64 -S -emit-llvm -x c %s -o - | FileCheck --check-prefixes=CHECK-C %s
-// RUN: %clang_cc1 -triple aarch64 -target-feature +ls64 -S -emit-llvm -x c++ %s -o - | FileCheck --check-prefixes=CHECK-CXX %s
-// RUN: %clang_cc1 -triple aarch64_be-eabi -target-feature +ls64 -S -emit-llvm -x c %s -o - | FileCheck  --check-prefixes=CHECK-C %s
-// RUN: %clang_cc1 -triple aarch64_be-eabi -target-feature +ls64 -S -emit-llvm -x c++ %s -o - | FileCheck  --check-prefixes=CHECK-CXX %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +ls64 -emit-llvm -x c %s -o - | FileCheck --check-prefixes=CHECK-C %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +ls64 -emit-llvm -x c++ %s -o - | FileCheck --check-prefixes=CHECK-CXX %s
+// RUN: %clang_cc1 -triple aarch64_be-eabi -target-feature +ls64 -emit-llvm -x c %s -o - | FileCheck  --check-prefixes=CHECK-C %s
+// RUN: %clang_cc1 -triple aarch64_be-eabi -target-feature +ls64 -emit-llvm -x c++ %s -o - | FileCheck  --check-prefixes=CHECK-CXX %s
 
 #include <arm_acle.h>
 
diff --git a/clang/test/CodeGen/aarch64-matmul.cpp b/clang/test/CodeGen/aarch64-matmul.cpp
index 58deda1c612c..04bb56fdcb79 100644
--- a/clang/test/CodeGen/aarch64-matmul.cpp
+++ b/clang/test/CodeGen/aarch64-matmul.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1             -triple aarch64 -target-feature +neon -target-feature +i8mm -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1             -triple aarch64 -target-feature +neon -target-feature +i8mm -emit-llvm %s -o - | FileCheck %s
 
 #ifdef __ARM_FEATURE_MATMUL_INT8
 extern "C" void arm_feature_matmulint8_defined() {}
diff --git a/clang/test/CodeGen/aarch64-mixed-target-attributes.c b/clang/test/CodeGen/aarch64-mixed-target-attributes.c
index aef6ce36ab1c..6aa747d4cb46 100644
--- a/clang/test/CodeGen/aarch64-mixed-target-attributes.c
+++ b/clang/test/CodeGen/aarch64-mixed-target-attributes.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -v9.5a -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fmv -S -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-NOFMV
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -v9.5a -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fmv -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-NOFMV
 
 // The following is guarded because in NOFMV we get an error for redefining the default.
 #ifdef __HAVE_FUNCTION_MULTI_VERSIONING
diff --git a/clang/test/CodeGen/aarch64-mops.c b/clang/test/CodeGen/aarch64-mops.c
index 36e1e0af6640..ca877aa8b6c6 100644
--- a/clang/test/CodeGen/aarch64-mops.c
+++ b/clang/test/CodeGen/aarch64-mops.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple aarch64 -Wno-int-conversion -target-feature +mops -target-feature +mte -w -S -emit-llvm -o - %s  | FileCheck --check-prefix=CHECK-MOPS   %s
-// RUN: not %clang_cc1 -triple aarch64 -Wno-int-conversion -target-feature +mops -Wno-implicit-function-declaration -w -S -emit-llvm -o - %s 2>&1  | FileCheck --check-prefix=CHECK-NOMOPS %s
-// RUN: not %clang_cc1 -triple aarch64 -Wno-int-conversion -Wno-implicit-function-declaration -target-feature +mte -w -S -emit-llvm -o - %s 2>&1 | FileCheck --check-prefix=CHECK-NOMOPS %s
-// RUN: not %clang_cc1 -triple aarch64 -Wno-int-conversion -Wno-implicit-function-declaration -w -S -emit-llvm -o - %s 2>&1 | FileCheck --check-prefix=CHECK-NOMOPS %s
+// RUN: %clang_cc1 -triple aarch64 -Wno-int-conversion -target-feature +mops -target-feature +mte -w -emit-llvm -o - %s  | FileCheck --check-prefix=CHECK-MOPS   %s
+// RUN: not %clang_cc1 -triple aarch64 -Wno-int-conversion -target-feature +mops -Wno-implicit-function-declaration -w -emit-llvm -o - %s 2>&1  | FileCheck --check-prefix=CHECK-NOMOPS %s
+// RUN: not %clang_cc1 -triple aarch64 -Wno-int-conversion -Wno-implicit-function-declaration -target-feature +mte -w -emit-llvm -o - %s 2>&1 | FileCheck --check-prefix=CHECK-NOMOPS %s
+// RUN: not %clang_cc1 -triple aarch64 -Wno-int-conversion -Wno-implicit-function-declaration -w -emit-llvm -o - %s 2>&1 | FileCheck --check-prefix=CHECK-NOMOPS %s
 
 #include <arm_acle.h>
 #include <stddef.h>
diff --git a/clang/test/CodeGen/aarch64-neon-3v.c b/clang/test/CodeGen/aarch64-neon-3v.c
index 74b1c5331fb6..9ed439379722 100644
--- a/clang/test/CodeGen/aarch64-neon-3v.c
+++ b/clang/test/CodeGen/aarch64-neon-3v.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon  -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-neon-fma.c b/clang/test/CodeGen/aarch64-neon-fma.c
index 074e22d98fcb..b87c531b8b23 100644
--- a/clang/test/CodeGen/aarch64-neon-fma.c
+++ b/clang/test/CodeGen/aarch64-neon-fma.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-neon-intrinsics-constrained.c b/clang/test/CodeGen/aarch64-neon-intrinsics-constrained.c
index 33700f0d0d34..15ae7eea820e 100644
--- a/clang/test/CodeGen/aarch64-neon-intrinsics-constrained.c
+++ b/clang/test/CodeGen/aarch64-neon-intrinsics-constrained.c
@@ -1,9 +1,9 @@
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:     -S -disable-O0-optnone \
+// RUN:     -disable-O0-optnone \
 // RUN:  -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg \
 // RUN: | FileCheck --check-prefixes=COMMON,COMMONIR,UNCONSTRAINED %s
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:     -S -disable-O0-optnone \
+// RUN:     -disable-O0-optnone \
 // RUN:  -ffp-exception-behavior=strict \
 // RUN:  -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg \
 // RUN: | FileCheck --check-prefixes=COMMON,COMMONIR,CONSTRAINED %s
diff --git a/clang/test/CodeGen/aarch64-neon-intrinsics.c b/clang/test/CodeGen/aarch64-neon-intrinsics.c
index eeb50d095a5c..145d4302bb12 100644
--- a/clang/test/CodeGen/aarch64-neon-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-neon-intrinsics.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:     -S -disable-O0-optnone \
+// RUN:     -disable-O0-optnone \
 // RUN:  -flax-vector-conversions=none -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg \
 // RUN: | FileCheck %s
diff --git a/clang/test/CodeGen/aarch64-neon-sha3.c b/clang/test/CodeGen/aarch64-neon-sha3.c
index 0fbfb4690373..46c964884b88 100644
--- a/clang/test/CodeGen/aarch64-neon-sha3.c
+++ b/clang/test/CodeGen/aarch64-neon-sha3.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
-// RUN:  -target-feature +sha3 -S -emit-llvm -o - %s \
+// RUN:  -target-feature +sha3 -emit-llvm -o - %s \
 // RUN:  | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
diff --git a/clang/test/CodeGen/aarch64-neon-shifts.c b/clang/test/CodeGen/aarch64-neon-shifts.c
index 4d0e13379a68..cf1bbef7f8ad 100644
--- a/clang/test/CodeGen/aarch64-neon-shifts.c
+++ b/clang/test/CodeGen/aarch64-neon-shifts.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:  -disable-O0-optnone -ffp-contract=fast -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN:  -disable-O0-optnone -ffp-contract=fast -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-neon-sm4-sm3.c b/clang/test/CodeGen/aarch64-neon-sm4-sm3.c
index 47f784b8175d..c1ee06939cdf 100644
--- a/clang/test/CodeGen/aarch64-neon-sm4-sm3.c
+++ b/clang/test/CodeGen/aarch64-neon-sm4-sm3.c
@@ -1,9 +1,9 @@
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
-// RUN:  -target-feature +sm4 -S -emit-llvm -o - %s \
+// RUN:  -target-feature +sm4 -emit-llvm -o - %s \
 // RUN:  | FileCheck %s
 
 // RUN: not %clang_cc1 -Wno-error=implicit-function-declaration -triple aarch64-linux-gnu -target-feature +neon \
-// RUN: -S -emit-llvm -o - %s 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s
+// RUN: -emit-llvm -o - %s 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-neon-vcadd.c b/clang/test/CodeGen/aarch64-neon-vcadd.c
index 9000e2dbb5b4..972e2485acb7 100644
--- a/clang/test/CodeGen/aarch64-neon-vcadd.c
+++ b/clang/test/CodeGen/aarch64-neon-vcadd.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
-// RUN:  -target-feature +v8.3a -target-feature +fullfp16 -S -emit-llvm -o - %s \
+// RUN:  -target-feature +v8.3a -target-feature +fullfp16 -emit-llvm -o - %s \
 // RUN:  | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
diff --git a/clang/test/CodeGen/aarch64-neon-vsqadd-float-conversion.c b/clang/test/CodeGen/aarch64-neon-vsqadd-float-conversion.c
index 9ce8f848ef4c..520488a5b794 100644
--- a/clang/test/CodeGen/aarch64-neon-vsqadd-float-conversion.c
+++ b/clang/test/CodeGen/aarch64-neon-vsqadd-float-conversion.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:  -S -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,dce \
+// RUN:  -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,dce \
 // RUN: | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
diff --git a/clang/test/CodeGen/aarch64-neon-vuqadd-float-conversion-warning.c b/clang/test/CodeGen/aarch64-neon-vuqadd-float-conversion-warning.c
index 27ce00174252..247bd4c6749a 100644
--- a/clang/test/CodeGen/aarch64-neon-vuqadd-float-conversion-warning.c
+++ b/clang/test/CodeGen/aarch64-neon-vuqadd-float-conversion-warning.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:  -S -disable-O0-optnone -emit-llvm -o - %s 2>&1 | FileCheck %s
+// RUN:  -disable-O0-optnone -emit-llvm -o - %s 2>&1 | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
index fdd2de11365d..af8933d93d6c 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \
-// RUN:   -S -disable-O0-optnone -Werror -emit-llvm -o - %s \
+// RUN:   -disable-O0-optnone -Werror -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg \
 // RUN: | opt -S -passes=inline \
 // RUN: | FileCheck %s
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
index a333d85818d2..9541e44c9141 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
index 7617dcef7ea9..b911c2791441 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme-i16i64 -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c
index 5fa4c35ed770..c0b3e1a06b0f 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
index b26e32e5ff83..6f84e7b36b14 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
index 02d4d034befb..fcbd17559dc7 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
index c2c89aee03b5..4c102f38fd30 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
index e036cb45feff..824c43e6d247 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
index 84338597cdb3..37d5d73e97a6 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
index 7b1a8b0a0201..509ad9ec17f7 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
index 3d2a4e4d2b38..72c63bc3389a 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
index 9e44d1c92534..508fad09ea71 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
index d8e4b853308d..d742292ad9d5 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
index 467cf9fd092a..b2c609b7c224 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
index e80a965394e7..9ba1527f2696 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
index e58021bf8bf4..b6ab6b07fb2b 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
index 483e81327502..86a691f14623 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c
index 1baa43b7187b..60b783d96673 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/aarch64-sme2-attrs.cpp b/clang/test/CodeGen/aarch64-sme2-intrinsics/aarch64-sme2-attrs.cpp
index 1916f00e0eb9..ccf05de7f4e9 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/aarch64-sme2-attrs.cpp
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/aarch64-sme2-attrs.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
-// RUN:   -S -disable-O0-optnone -Werror -emit-llvm -o - %s \
+// RUN:   -disable-O0-optnone -Werror -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg \
 // RUN: | opt -S -passes=inline \
 // RUN: | FileCheck %s
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
index 2cc99d4fb88d..1dec2d6957a5 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
@@ -2,11 +2,11 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c
index 1ff7a7fedf1b..47ff02eb9fb5 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c
@@ -2,11 +2,11 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
index 257cb5952501..57ea4d2a1ac4 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
-// RUN:  -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:  -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \
-// RUN:  -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:  -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
-// RUN:  -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:  -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \
-// RUN:  -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:  -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
 // RUN:  -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
index 79a11c2ec153..4a5ee7e021f7 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
@@ -2,11 +2,11 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c
index 2b2b2e5c0f41..7b6b72fefe10 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c
@@ -2,11 +2,11 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
new file mode 100644
index 000000000000..ecc415545414
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
@@ -0,0 +1,592 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1                               -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1                        -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS        -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+#include <arm_sme.h>
+
+#ifdef SME_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED, A5) A1##A3##A5
+#else
+#define SME_ACLE_FUNC(A1, A2, A3, A4, A5) A1##A2##A3##A4##A5
+#endif
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmla_single_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_single_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,_single,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmla_single_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_single_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,_single,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmls_single_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_single_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls,_single,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmls_single_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_single_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls,_single,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmla_za16_f16_vg1x2j13svfloat16x2_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmla_za16_f16_vg1x4j13svfloat16x4_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmls_za16_f16_vg1x2j13svfloat16x2_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls,,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmls_za16_f16_vg1x4j13svfloat16x4_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls,,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmla_lane_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_lane_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla_lane,,_za16,_f16,_vg1x2)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmla_lane_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_lane_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla_lane,,_za16,_f16,_vg1x4)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmls_lane_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_lane_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls_lane,,_za16,_f16,_vg1x2)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmls_lane_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_lane_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls_lane,,_za16,_f16,_vg1x4)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmla_single_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_single_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla, _single, _za16, _bf16, _vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmla_single_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_single_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla, _single, _za16, _bf16, _vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmls_single_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_single_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls, _single, _za16, _bf16, _vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmls_single_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_single_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls, _single, _za16, _bf16, _vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 16 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmla_za16_bf16_vg1x2j14svbfloat16x2_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 16 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla, , _za16, _bf16, _vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 32 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmla_za16_bf16_vg1x4j14svbfloat16x4_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 32 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla, , _za16, _bf16, _vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 16 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmls_za16_bf16_vg1x2j14svbfloat16x2_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 16 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls, , _za16, _bf16, _vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 32 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmls_za16_bf16_vg1x4j14svbfloat16x4_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 32 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls, , _za16, _bf16, _vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmla_lane_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_lane_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla_lane, , _za16, _bf16, _vg1x2)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmla_lane_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_lane_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla_lane, , _za16, _bf16, _vg1x4)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_bf16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmls_lane_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_lane_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls_lane, , _za16, _bf16, _vg1x2)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmls_lane_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_lane_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls_lane, , _za16, _bf16, _vg1x4)(slice, zn, zm, 7);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c
index 3093eaf74586..cfffa1517c41 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c
index 8d1e358176c3..670ed2ba2149 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c
@@ -2,11 +2,11 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
index 51c07f3cf38a..07e2a024b6d1 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c
index 3e4454d94335..b7ad0e19e53e 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c
@@ -2,9 +2,9 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c
index 70c31a4a87e7..8aad9114a541 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c
@@ -2,9 +2,9 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c
index 5bc9c9088517..b6ab013286a2 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c
@@ -2,9 +2,9 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c
index 82c004e3105a..f1272b32349f 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c
@@ -2,9 +2,9 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c
index e8706f957691..34838ffcc95f 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c
@@ -2,9 +2,9 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c
index 99feafbd682a..800a3d34f474 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c
@@ -2,9 +2,9 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c
index 0f0c33e48bd9..aa5bbf0029c6 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c
@@ -2,9 +2,9 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c
index a4e2616784ef..8e696f1d1fa4 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // REQUIRES: aarch64-registered-target
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c
index 3e554212cb70..a956235034e1 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c
index a438fd395219..860e6237bb9c 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // REQUIRES: aarch64-registered-target
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c
index b0cbdc748dc8..e34b9d98eff3 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
index 597efff0eab9..e4aad372e64d 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
index 252da9af33be..4fe1979b8c09 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
index a4da4e5410ae..a43a3848d271 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
index 74511d971400..990482dd5fa3 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
index ce74f5d307b4..4d63644597d3 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c
index 5cc0e0e1d36e..9aae8cc68dbd 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c
@@ -2,11 +2,11 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mopa_nonwide.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mopa_nonwide.c
new file mode 100644
index 000000000000..626bb6d3cf6f
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mopa_nonwide.c
@@ -0,0 +1,97 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1                               -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature  +b16b16 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK
+// RUN: %clang_cc1                        -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature  +b16b16 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK-CXX
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS        -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature  +b16b16 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature  +b16b16 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK-CXX
+
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature  +b16b16 -target-feature +sme-f16f16 -S -O2 -Werror -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sme.h>
+
+#ifdef SME_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: define dso_local void @test_svmopa_za16_bf16(
+// CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv8bf16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z21test_svmopa_za16_bf16u10__SVBool_tS_u14__SVBfloat16_tS0_(
+// CHECK-CXX-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv8bf16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmopa_za16_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmopa_za16, _bf16, _m)(0, pn, pm, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmops_za16_bf16(
+// CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv8bf16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z21test_svmops_za16_bf16u10__SVBool_tS_u14__SVBfloat16_tS0_(
+// CHECK-CXX-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv8bf16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmops_za16_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmops_za16, _bf16, _m)(0, pn, pm, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmopa_za16_f16(
+// CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv8f16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z20test_svmopa_za16_f16u10__SVBool_tS_u13__SVFloat16_tS0_(
+// CHECK-CXX-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv8f16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmopa_za16_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmopa_za16, _f16, _m)(0, pn, pm, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmops_za16_f16(
+// CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv8f16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z20test_svmops_za16_f16u10__SVBool_tS_u13__SVFloat16_tS0_(
+// CHECK-CXX-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PM:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PN]])
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PM]])
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv8f16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmops_za16_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmops_za16, _f16, _m)(0, pn, pm, zn, zm);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c
index 5fd4b0405652..761ace80078d 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
index b86cb19c01e3..b3d5f4a4c4a5 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
@@ -2,10 +2,10 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c
index 7cf53e9a1452..6cdc4e01dd55 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
index 7af8c589994f..da625b78e263 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
@@ -2,11 +2,11 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c
index 5937a288dd84..fcab37fb2efe 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c
@@ -2,12 +2,12 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c
index f54c09d5ef2c..bbafda69f217 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c
@@ -2,12 +2,12 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c
index 08431b3ceabe..28348002e62b 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c
index 2fb5d3bea27c..f205ce7ef76b 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c
@@ -2,11 +2,11 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c
index eee927acc22e..9ddf8a411488 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c
index 6308d6c596f1..2a70d5dda4df 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c
index c2ecbf93bfaa..680538ddf59e 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c
index 784c24c8e7cb..5487262e7570 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c
index 6349cec77119..6cd6ffd93743 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c
index 3d56948e25f7..c6a18dba99e6 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c
index 4cc1f3af32ec..33a3606c877b 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c
index cc356600ab53..24d79adabd10 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c
index 069bf13ff8d2..e7204351015f 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c
index 4105cc3e78ec..75612aa9fdfc 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c
@@ -2,9 +2,9 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
 
diff --git a/clang/test/CodeGen/aarch64-soft-float-abi-errors.c b/clang/test/CodeGen/aarch64-soft-float-abi-errors.c
index 551e53bcd63d..95b7668aca1b 100644
--- a/clang/test/CodeGen/aarch64-soft-float-abi-errors.c
+++ b/clang/test/CodeGen/aarch64-soft-float-abi-errors.c
@@ -1,9 +1,9 @@
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +fp-armv8 -S -o /dev/null -target-abi aapcs      -verify=fp-hard %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fp-armv8 -S -o /dev/null -target-abi aapcs-soft -verify=nofp-soft %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fp-armv8 -S -o /dev/null -target-abi aapcs      -verify=nofp-hard %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fp-armv8 -S -o /dev/null -target-abi aapcs -O1  -verify=nofp-hard,nofp-hard-opt -emit-llvm %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +fp-armv8 -S -o /dev/null -target-abi aapcs -verify=fp-hard %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature -fp-armv8 -S -o /dev/null -target-abi aapcs-soft -verify=nofp-soft %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature -fp-armv8 -S -o /dev/null -target-abi aapcs -verify=nofp-hard %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature -fp-armv8 -o /dev/null -target-abi aapcs -O1 -verify=nofp-hard,nofp-hard-opt -emit-llvm %s
 // No run line needed for soft-float ABI with an FPU because that is rejected by the driver
 
 // With the hard-float ABI and a target with an FPU, FP arguments are passed in
@@ -69,6 +69,7 @@ inline void test_float_arg_inline(float a) {}
 inline void test_float_arg_inline_used(float a) {}
 // nofp-hard-opt-error@-1 {{'a' requires 'float' type support, but ABI 'aapcs' does not support it}}
 void use_inline() { test_float_arg_inline_used(1.0f); }
+// nofp-hard-error@-1 {{'use_inline' requires 'float' type support, but ABI 'aapcs' does not support it}}
 
 // The always_inline attribute causes an inline function to always be
 // code-genned, even at -O0, so we always emit the error.
@@ -76,6 +77,7 @@ __attribute((always_inline))
 inline void test_float_arg_always_inline_used(float a) {}
 // nofp-hard-error@-1 {{'a' requires 'float' type support, but ABI 'aapcs' does not support it}}
 void use_always_inline() { test_float_arg_always_inline_used(1.0f); }
+// nofp-hard-error@-1 {{'use_always_inline' requires 'float' type support, but ABI 'aapcs' does not support it}}
 
 // Floating-point expressions, global variables and local variables do not
 // affect the ABI, so are allowed. GCC does reject some uses of floating point
@@ -97,3 +99,25 @@ int test_var_double(int a) {
   d *= 6.0;
   return (int)d;
 }
+
+extern void extern_float_arg(float);
+extern float extern_float_ret(void);
+void call_extern_float_arg() { extern_float_arg(1.0f); }
+// nofp-hard-error@-1 {{'call_extern_float_arg' requires 'float' type support, but ABI 'aapcs' does not support it}}
+void call_extern_float_ret() { extern_float_ret(); }
+// nofp-hard-error@-1 {{'call_extern_float_ret' requires 'float' type support, but ABI 'aapcs' does not support it}}
+
+// Definitions of variadic functions, and calls to them which only use integer
+// argument registers, are both fine.
+void variadic(int, ...);
+void call_variadic_int() { variadic(0, 1); }
+
+// Calls to variadic functions with floating-point arguments are an error,
+// since this would require floating-point registers.
+void call_variadic_double() { variadic(0, 1.0); }
+// nofp-hard-error@-1 {{'call_variadic_double' requires 'double' type support, but ABI 'aapcs' does not support it}}
+
+// Calls through function pointers are also diagnosed.
+void (*fptr)(float);
+void call_indirect() { fptr(1.0f); }
+// nofp-hard-error@-1 {{'call_indirect' requires 'float' type support, but ABI 'aapcs' does not support it}}
diff --git a/clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c b/clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c
index 1e6a4500cc88..a4abe96cc08a 100644
--- a/clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c
+++ b/clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=1 -mvscale-max=1  | FileCheck %s -D#VBITS=128  --check-prefixes=CHECK128
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=2 -mvscale-max=2  | FileCheck %s -D#VBITS=256  --check-prefixes=CHECK,CHECK256
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=4 -mvscale-max=4  | FileCheck %s -D#VBITS=512  --check-prefixes=CHECK,CHECK512
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=8 -mvscale-max=8 | FileCheck %s -D#VBITS=1024 --check-prefixes=CHECK,CHECK1024
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=16 -mvscale-max=16 | FileCheck %s -D#VBITS=2048 --check-prefixes=CHECK,CHECK2048
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=1 -mvscale-max=1 | FileCheck %s -D#VBITS=128 --check-prefixes=CHECK128
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=2 -mvscale-max=2 | FileCheck %s -D#VBITS=256 --check-prefixes=CHECK,CHECK256
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=4 -mvscale-max=4 | FileCheck %s -D#VBITS=512 --check-prefixes=CHECK,CHECK512
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=8 -mvscale-max=8 | FileCheck %s -D#VBITS=1024 --check-prefixes=CHECK,CHECK1024
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=16 -mvscale-max=16 | FileCheck %s -D#VBITS=2048 --check-prefixes=CHECK,CHECK2048
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp b/clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp
index 8c41fb956145..05587fd9e7fe 100644
--- a/clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp
+++ b/clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=1 -mvscale-max=1  | FileCheck %s -D#VBITS=128  --check-prefixes=CHECK,CHECK128
-// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=2 -mvscale-max=2  | FileCheck %s -D#VBITS=256  --check-prefixes=CHECK,CHECKWIDE
-// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=4 -mvscale-max=4  | FileCheck %s -D#VBITS=512  --check-prefixes=CHECK,CHECKWIDE
-// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=8 -mvscale-max=8 | FileCheck %s -D#VBITS=1024 --check-prefixes=CHECK,CHECKWIDE
-// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=16 -mvscale-max=16 | FileCheck %s -D#VBITS=2048 --check-prefixes=CHECK,CHECKWIDE
+// RUN: %clang_cc1 -x c++ -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=1 -mvscale-max=1 | FileCheck %s -D#VBITS=128 --check-prefixes=CHECK,CHECK128
+// RUN: %clang_cc1 -x c++ -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=2 -mvscale-max=2 | FileCheck %s -D#VBITS=256 --check-prefixes=CHECK,CHECKWIDE
+// RUN: %clang_cc1 -x c++ -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=4 -mvscale-max=4 | FileCheck %s -D#VBITS=512 --check-prefixes=CHECK,CHECKWIDE
+// RUN: %clang_cc1 -x c++ -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=8 -mvscale-max=8 | FileCheck %s -D#VBITS=1024 --check-prefixes=CHECK,CHECKWIDE
+// RUN: %clang_cc1 -x c++ -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s -mvscale-min=16 -mvscale-max=16 | FileCheck %s -D#VBITS=2048 --check-prefixes=CHECK,CHECKWIDE
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-inline-asm-datatypes.c b/clang/test/CodeGen/aarch64-sve-inline-asm-datatypes.c
index 14a29dfac2c7..37513f8569ef 100644
--- a/clang/test/CodeGen/aarch64-sve-inline-asm-datatypes.c
+++ b/clang/test/CodeGen/aarch64-sve-inline-asm-datatypes.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \
-// RUN:   -target-feature +neon -S -O1 -o - -emit-llvm %s | FileCheck %s
+// RUN:   -target-feature +neon -O1 -o - -emit-llvm %s | FileCheck %s
 
 // Tests to check that all sve datatypes can be passed in as input operands
 // and passed out as output operands.
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_abd.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_abd.c
index 88aba4fee7f1..18ef16feff3e 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_abd.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_abd.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_abs.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_abs.c
index 116534025d85..fab20023844e 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_abs.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_abs.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acge.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acge.c
index bc1e58bb5c99..0133223fb43e 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acge.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acge.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acgt.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acgt.c
index 8898cc2df65b..1b1e6cbca859 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acgt.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acgt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acle.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acle.c
index 1b41ee0b4d6e..9cfb26eef206 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acle.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_acle.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_aclt.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_aclt.c
index 4652b451efd0..8c901d14a26f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_aclt.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_aclt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_add.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_add.c
index 38792c8a7d29..8844297ff9d8 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_add.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_add.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c
index 364c33fc40a2..f78c3d7303c2 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_addv.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_addv.c
index 2e94575beabb..fdac8aafe2a1 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_addv.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_addv.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrb.c
index 40ff792e5ab2..e5cea9e6f84e 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrd.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrd.c
index 857b18a44a13..676aca7c0b52 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrd.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrd.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrh.c
index c0f436038917..b2691680ae36 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrh.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrh.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrw.c
index bf148ffab1de..416d4bcd3904 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adrw.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_and.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_and.c
index 1da5cab782c4..6e02b018834f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_and.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_and.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_andv.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_andv.c
index 9e3e588a844a..ed5c9da5a47c 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_andv.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_andv.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_asr.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_asr.c
index b303f3027732..073d3bb52c71 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_asr.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_asr.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_asrd.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_asrd.c
index 2d486c398caf..87091eebe851 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_asrd.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_asrd.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfdot.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfdot.c
index 00b9f91c8ee1..6424c5fd6d03 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfdot.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfdot.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalb.c
index f7d797416217..57c121742af6 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16  -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16  -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalt.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalt.c
index 730527da91dd..b1904ae4c046 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalt.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmmla.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmmla.c
index 74b1454be459..7e93514c51ec 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmmla.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmmla.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bic.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bic.c
index f7751ecf8ca8..bf5cf17dafa6 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bic.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bic.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brka.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brka.c
index 9e980fa1c01c..c07325f769db 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brka.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brka.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkb.c
index 41e09b80d788..0dbc7474bd3e 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkn.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkn.c
index 8c36dcf846f7..75fb8bf38eab 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkn.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkn.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkpa.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkpa.c
index 7b96afd78c62..d48a63f27499 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkpa.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkpa.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkpb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkpb.c
index 2634fdc68c1c..bbe7963555d8 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkpb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_brkpb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cadd.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cadd.c
index 7e93c9ef52c7..35da7a47bab8 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cadd.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cadd.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta-bfloat.c
index 5df1d6eb75ac..66478f8bd002 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta-bfloat.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c
index ef75153e2b66..2c53197a37a1 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb-bfloat.c
index f1ab4379bd6a..5c78db73e6cc 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb-bfloat.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
index 9807d140095b..0d123f75e221 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cls.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cls.c
index 54529a112078..5936b976e0ac 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cls.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cls.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clz.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clz.c
index 66241c6a4a80..239e6ad5584b 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clz.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clz.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmla.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmla.c
index 5c83fc00633d..4dc2c79cdb89 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmla.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmla.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpeq.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpeq.c
index 9e3fc1eec805..fa80e58a9e37 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpeq.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpeq.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpge.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpge.c
index edc2e39f31e3..478fb34f8ceb 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpge.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpge.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpgt.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpgt.c
index c235db026dde..8721acdfd3fd 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpgt.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpgt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmple.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmple.c
index ecdb41626bc8..688ea57cc732 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmple.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmple.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmplt.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmplt.c
index 799c16090bd0..5919ba72a390 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmplt.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmplt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpne.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpne.c
index ba4244433c5c..5cedc1c71e3b 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpne.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpne.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpuo.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpuo.c
index 0e1eeb501876..98f0e0442936 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpuo.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cmpuo.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnot.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnot.c
index 028cc6528bfe..eb80d68eeae3 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnot.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnot.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnt-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnt-bfloat.c
index 87a6629ce8f9..0d164fe76393 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnt-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnt-bfloat.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnt.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnt.c
index 89a0ad758605..45ccccb2fba1 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnt.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntb.c
index 659b6af01f10..70a9360f9a32 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntb.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svcntb(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntd.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntd.c
index c520c5d3dae7..5b05fca3c78a 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntd.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntd.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svcntd(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnth.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnth.c
index 6bd4cfd39513..82d374302096 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnth.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cnth.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svcnth(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntp.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntp.c
index 65da709a2ef5..a5208d4dc5b3 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntp.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntp.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svcntp_b8(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntw.c
index f56f64064340..a0a2931211b0 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cntw.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svcntw(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_compact.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_compact.c
index 5201dcfbdad6..4c18969e78f0 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_compact.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_compact.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2-bfloat.c
index 1ed09cc5965f..4dbefd6c8634 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2-bfloat.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2.c
index 1aead4e5572f..836bfe1ecbba 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3-bfloat.c
index 90176ff0dbd4..00a6d4e34d78 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3-bfloat.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3.c
index 2fe1a88441b2..3b6e91d59ccb 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4-bfloat.c
index 8ad801912345..214e81db14d4 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4-bfloat.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4.c
index 5953cdd4e207..a6d7e611af83 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvt-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvt-bfloat.c
index ed5974b54c7c..12b804b37ebc 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvt-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvt-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvt.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvt.c
index 558664101980..aa2de6412e6e 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvt.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvtnt.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvtnt.c
index 59053c4706eb..8772917715a7 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvtnt.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvtnt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_div.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_div.c
index d854a6e37e95..441ce1b71854 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_div.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_div.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_divr.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_divr.c
index 2ce49d7311ee..d4d0364400e7 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_divr.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_divr.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dot.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dot.c
index 96d0b2ed55d4..924dd8f0af26 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dot.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dot.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup-bfloat.c
index 5af625461f93..3c459531b7ff 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup-bfloat.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup.c
index 15806362b3cb..5294ccbc4ef5 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq-bfloat.c
index 9b85fee8fce4..6534beaf59d2 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq-bfloat.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c
index d9c7fdde969f..9c3f4420d449 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_eor.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_eor.c
index 0d0adffe10c9..e39012eaed1f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_eor.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_eor.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_eorv.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_eorv.c
index da2bc9fc86bf..98748dade7bb 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_eorv.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_eorv.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_expa.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_expa.c
index c82f79ce4b0a..52b6822a833f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_expa.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_expa.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ext-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ext-bfloat.c
index cdf057e97e5c..ca2525799698 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ext-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ext-bfloat.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest  -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ext.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ext.c
index 0d4e07958cd4..1ccfa8ffd8fc 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ext.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ext.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_extb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_extb.c
index 995c95266526..e9080bc0982f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_extb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_extb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_exth.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_exth.c
index eca5a2100bc6..9063c284e036 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_exth.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_exth.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_extw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_extw.c
index cabacd2635f7..e4ec2e1a5556 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_extw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_extw.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2-bfloat.c
index b9c46b2261f5..4e18eb6725e0 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2-bfloat.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c
index 8cd887aaff40..2603de90a98e 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3-bfloat.c
index 7a991bc7431d..847455b0cb69 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3-bfloat.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c
index de7c3c303ffc..f2547d929da8 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4-bfloat.c
index 3a5e282bfdfa..b39a46999d58 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4-bfloat.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c
index 9b4f9e5332a5..b5109184c657 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_index.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_index.c
index 1398d6913b0f..0a7158e0b92c 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_index.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_index.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svindex_s8(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr-bfloat.c
index a413c3bc981e..d7e59bf4c72d 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr-bfloat.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr.c
index 54e2e27607f6..cf3409bbfafb 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lasta-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lasta-bfloat.c
index 22e0a7bc5446..6d74362e1530 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lasta-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lasta-bfloat.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lasta.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lasta.c
index 6f333b64e12a..ba87f343db47 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lasta.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lasta.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lastb-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lastb-bfloat.c
index 0ece3339a9a0..3532f628593a 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lastb-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lastb-bfloat.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lastb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lastb.c
index 2d3e88572ce9..669c07511f63 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lastb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lastb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c
index cc9a3186da08..38d88b483968 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1.c
index 12a2e95cf957..d355ea192802 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ro-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ro-bfloat.c
index 8a40bc67a486..5107877ae361 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ro-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ro-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -target-feature +f64mm -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -target-feature +f64mm -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -target-feature +f64mm -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -target-feature +f64mm -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -target-feature +f64mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -target-feature +f64mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -target-feature +f64mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -target-feature +f64mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ro.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ro.c
index 408552b7a589..2baba98cc505 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ro.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ro.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +f64mm -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +f64mm -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +f64mm -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +f64mm -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +f64mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +f64mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +f64mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +f64mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1rq-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1rq-bfloat.c
index fc0559640597..c015614056e7 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1rq-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1rq-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1rq.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1rq.c
index a7f14dc78b78..66528ae771af 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1rq.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1rq.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sb.c
index 1275ab07fd9f..560a18146b08 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c
index 5384c432e69d..b2f164635769 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sw.c
index eeac9cdddf02..42cc5bf83a4a 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sw.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ub.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ub.c
index 82852ebb16d5..50f81d1614af 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ub.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ub.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uh.c
index 8028f6e544f4..2efccced81f9 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uh.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uh.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uw.c
index 41afe4fe6b4b..ee5a41565056 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uw.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld2-bfloat.c
index 38ae15f858f6..0a7649a23f9a 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld2-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld2-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld2.c
index 84946783cd9f..50bbc144be4f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld2.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld2.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld3-bfloat.c
index 90b3674cb1d2..ff04431fb87f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld3-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld3-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld3.c
index c56ef67007c8..753bf39d6561 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld3.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld3.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld4-bfloat.c
index be00d117523b..c6063872c63f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld4-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld4-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld4.c
index c75c85f939df..6920813c44a2 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld4.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld4.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1-bfloat.c
index e9d516d63a14..dce5839ebd75 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1.c
index ef0e4de8da2e..a647eb0469f9 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sb.c
index 00077bde953f..e76475a2b376 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sh.c
index fd326def0a26..1b53e7f05f69 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sh.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sh.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sw.c
index 848be449bff8..392f4174b9d0 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1sw.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1ub.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1ub.c
index 33283973575b..c843ba720ab8 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1ub.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1ub.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1uh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1uh.c
index 81f14ae7ff72..81397adcc2e4 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1uh.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1uh.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1uw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1uw.c
index c1a647ba2d17..1f0038cf12e4 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1uw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1uw.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1-bfloat.c
index 23893e1e0eda..a8ebc5d63d89 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // REQUIRES: aarch64-registered-target
 
 #include <arm_sve.h>
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1.c
index 7b47c2c12b5f..017f960d3706 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sb.c
index f5c89606eed2..1e587fd002e0 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sb.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svldnf1sb_s16(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sh.c
index 73f8a7db982f..d82976b0c3c3 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sh.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sh.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svldnf1sh_s32(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sw.c
index 812a4bcd1199..0b40bbd1bc2f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1sw.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svldnf1sw_s64(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1ub.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1ub.c
index 119b9ee954bf..ec2b5966b8c8 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1ub.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1ub.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svldnf1ub_s16(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1uh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1uh.c
index f2bb7982d798..18e700abedd8 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1uh.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1uh.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svldnf1uh_s32(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1uw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1uw.c
index 2e8fc486384c..2648b23010c2 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1uw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1uw.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svldnf1uw_s64(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnt1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnt1-bfloat.c
index 2ce42fe44128..ee0b46db5ebc 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnt1-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnt1-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnt1.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnt1.c
index d5d77ebed5a3..37a41d5fd4ed 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnt1.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnt1.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_len-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_len-bfloat.c
index d30506c38c18..1128a7310938 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_len-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_len-bfloat.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_len.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_len.c
index d942a3991fae..10675a2cc08c 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_len.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_len.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lsl.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lsl.c
index 98525699359e..1c32eea466fd 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lsl.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lsl.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lsr.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lsr.c
index 7d13ff8c873e..5efba57d4541 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lsr.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lsr.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mad.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mad.c
index dc9afccb7d2a..0070faba95e3 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mad.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mad.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_matmul_fp32.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_matmul_fp32.c
index b1622ab9484e..10442f4e3115 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_matmul_fp32.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_matmul_fp32.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f32mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f32mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f32mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f32mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f32mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f32mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f32mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f32mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_matmul_fp64.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_matmul_fp64.c
index 284271ef9d8b..8586a65fa240 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_matmul_fp64.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_matmul_fp64.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_max.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_max.c
index 156f84073b7f..2cf6cf3439b0 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_max.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_max.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_maxnm.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_maxnm.c
index f18baa5862cf..530717887d39 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_maxnm.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_maxnm.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_maxnmv.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_maxnmv.c
index 68ba7e0baae6..803bce2ee72c 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_maxnmv.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_maxnmv.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_maxv.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_maxv.c
index e9047ea92746..a49e6cb669c8 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_maxv.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_maxv.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_min.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_min.c
index f12553f704af..80c3dd15e8bd 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_min.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_min.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_minnm.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_minnm.c
index 30c516d7fe12..127294f939af 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_minnm.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_minnm.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_minnmv.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_minnmv.c
index 4ff1d4196d63..d4bc5fcb7185 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_minnmv.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_minnmv.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_minv.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_minv.c
index ca6e17f088cf..e01e50340181 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_minv.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_minv.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mla.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mla.c
index 20c62c0bec4c..6946c5b472da 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mla.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mla.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mls.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mls.c
index ce00014b53e1..650b844e8ed0 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mls.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mls.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mmla.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mmla.c
index 76720d1d5f0d..e34849c10218 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mmla.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mmla.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +i8mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +i8mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +i8mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +i8mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +i8mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +i8mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +i8mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +i8mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mov.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mov.c
index 172cd026249c..79e68be49b2b 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mov.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mov.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_msb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_msb.c
index e4f7fb44c130..888b8331b6b5 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_msb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_msb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mul.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mul.c
index 4799c6b17598..f58f78eacb98 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mul.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mul.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mulh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mulh.c
index 31e82a0ac1dc..6698fdfca836 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mulh.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mulh.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mulx.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mulx.c
index 0e4f0b34cac4..e615b3eab17c 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mulx.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_mulx.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nand.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nand.c
index 32c6307aa0d3..26e29149df2c 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nand.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nand.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_neg.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_neg.c
index e12352bd8524..f591b84930d4 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_neg.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_neg.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmad.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmad.c
index 431686138a5b..1a3edeae057c 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmad.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmad.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmla.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmla.c
index fd2a374d61d2..9545ec736f0d 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmla.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmla.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmls.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmls.c
index 55b2cbc16cc6..12d564e04feb 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmls.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmls.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmsb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmsb.c
index 6522cf3e12fe..e12b13d0469f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmsb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nmsb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nor.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nor.c
index be82cfc9f3b3..d10b592dae81 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nor.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_nor.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_not.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_not.c
index 5579264d006a..68e538b758a9 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_not.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_not.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_orn.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_orn.c
index e69bda5c0f27..6cec937818a1 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_orn.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_orn.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_orr.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_orr.c
index 9c13da2d737e..111ed80f96e6 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_orr.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_orr.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_orv.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_orv.c
index 56d0e020f80a..74759eba210b 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_orv.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_orv.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_pfalse.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_pfalse.c
index 380057ad47b8..846ec490e986 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_pfalse.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_pfalse.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_pfirst.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_pfirst.c
index 83ccd9bdfa04..82960d0bad41 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_pfirst.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_pfirst.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_pnext.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_pnext.c
index 55791ea0de1e..9b23c760700a 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_pnext.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_pnext.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svpnext_b8(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfb.c
index 4c584de625b9..ee183399671c 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfd.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfd.c
index 4034a2b98776..360fb5be0109 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfd.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfd.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfh.c
index adb4a051f906..40c0993f1452 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfh.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfh.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfw.c
index 13b63f7a2d84..a0d8203e64f6 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_prfw.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ptest.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ptest.c
index d1346fbdb1cf..4a640dd69ada 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ptest.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ptest.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svptest_any(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ptrue.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ptrue.c
index ddd35a1ef487..808f6aa061f5 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ptrue.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ptrue.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svptrue_b8(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qadd.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qadd.c
index 7bb452fcf490..b536bcb52a50 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qadd.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qadd.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecb.c
index 10ecce4fd309..70fff215aa3b 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecd.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecd.c
index 26d569c91dea..3b1fcf11ef50 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecd.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecd.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdech.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdech.c
index a040227b4523..847113c986fb 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdech.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdech.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecp.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecp.c
index 218516c0b502..ca24390d6cf7 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecp.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecp.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecw.c
index 286920a3357f..740f48f4fa0d 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qdecw.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincb.c
index ebd418fee980..30f9d9f9423e 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincd.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincd.c
index a6e29814cf0f..706fbbd9e167 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincd.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincd.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qinch.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qinch.c
index 39e17c8c113f..5fdb898d6350 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qinch.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qinch.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincp.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincp.c
index aedc1ef63ad0..a0f0991302a4 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincp.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincp.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincw.c
index b904a6503b66..664328a82687 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qincw.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qsub.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qsub.c
index a3e271f6a279..d5d413ebdff6 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qsub.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_qsub.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rbit.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rbit.c
index c64343e75515..997d53755ab2 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rbit.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rbit.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rdffr.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rdffr.c
index 52d116ffac4b..6bf56bdea505 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rdffr.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rdffr.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svrdffr(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_recpe.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_recpe.c
index 3058b32273ee..344ea90299dd 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_recpe.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_recpe.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_recps.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_recps.c
index 4070c951997e..7be5b15674a0 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_recps.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_recps.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_recpx.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_recpx.c
index 653bf414f252..bfccfb840c47 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_recpx.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_recpx.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c
index 75d8feb8a847..bf2cd23e4080 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c
@@ -1,23 +1,23 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x2 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x2 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c
index 24167a8f5ce1..3d9d5c3ce45a 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c
@@ -1,22 +1,22 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
-// RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
-// RUN: %clang_cc1 -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
-// RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
-// RUN: %clang_cc1 -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
+// RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
+// RUN: %clang_cc1 -DTUPLE=x4 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
+// RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
+// RUN: %clang_cc1 -DTUPLE=x4 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x2 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x2 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c
index b430a0ba3cc7..f27875836193 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -O1 -Werror -Wall -o /dev/null %s
 
 // Note: We need to run this test with '-O1' because oddly enough the svreinterpret is always inlined at -O0.
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rev-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rev-bfloat.c
index a9121ba6c55b..ff436a62f807 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rev-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rev-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rev.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rev.c
index dbd6149d7899..0bd7fe5e7d22 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rev.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rev.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_revb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_revb.c
index 2d61030bbc90..f5508d2c8d9d 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_revb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_revb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_revh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_revh.c
index 28c59b40cae8..e1ceeda3423e 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_revh.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_revh.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_revw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_revw.c
index e60041cbf247..8798f11adf27 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_revw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_revw.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rinta.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rinta.c
index 45dba4db7d83..124cbce2dbc9 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rinta.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rinta.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rinti.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rinti.c
index 129fe4ce02a2..4086d4f65072 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rinti.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rinti.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintm.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintm.c
index fe74c22a7d92..03f3a667812f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintm.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintm.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintn.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintn.c
index c35abcc0cfa4..4f00f7d4a7f5 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintn.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintn.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintp.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintp.c
index 8187fc5c8135..fb5e9a5f55ee 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintp.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintp.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintx.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintx.c
index 02278cd44da9..fc257faaf83e 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintx.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintx.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintz.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintz.c
index ae0d953cb9df..2e405ae59996 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintz.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rintz.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rsqrte.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rsqrte.c
index 888c675bbd88..b124a6d586f7 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rsqrte.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rsqrte.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rsqrts.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rsqrts.c
index 1aaf205a33f7..4a32324f007c 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rsqrts.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_rsqrts.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_scale.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_scale.c
index 3687a56b1896..e4cc4cb287da 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_scale.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_scale.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sel-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sel-bfloat.c
index 81344fce5b53..0665b4411830 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sel-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sel-bfloat.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sel.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sel.c
index 39126b5c0afe..af7f8da11399 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sel.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sel.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2-bfloat.c
index 8d683784ddf8..ab19feb26163 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2-bfloat.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2.c
index b2bf4ad08aa9..2d7d408375f1 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3-bfloat.c
index d488576c4be7..1c0774e0c392 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3-bfloat.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3.c
index 9d10e6afca93..8d5cf3e6a6dc 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4-bfloat.c
index f7124ac2ac4b..7907ee0c8146 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4-bfloat.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4.c
index ce35bfb83c88..ab74a8d79bd9 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_setffr.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_setffr.c
index dad5c592a38d..7e6729e80aa7 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_setffr.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_setffr.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svsetffr(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_splice-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_splice-bfloat.c
index d05ec6780eff..e3959ac34f00 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_splice-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_splice-bfloat.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_splice.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_splice.c
index 9635ae4a4ec0..6070da575850 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_splice.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_splice.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sqrt.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sqrt.c
index 756c7112e551..2c51c473bdc2 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sqrt.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sqrt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c
index 49acb38a2c96..7075e96d62c5 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1.c
index 7adaece58c5f..987ec6c3e878 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1b.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1b.c
index 7500c7718e25..10187fba3aa6 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1b.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1b.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o - -emit-llvm %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o - -emit-llvm %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -o - -emit-llvm %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -o - -emit-llvm %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1h.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1h.c
index 7394b9c0fe54..598b42a5e9c2 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1h.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1h.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o - -emit-llvm %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o - -emit-llvm %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -o - -emit-llvm %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -o - -emit-llvm %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1w.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1w.c
index 9dfa096552eb..e224d944f7b9 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1w.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1w.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o - -emit-llvm %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o - -emit-llvm %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -o - -emit-llvm %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -o - -emit-llvm %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c
index 89a7d0112936..dae405bc2f29 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c
index 7848cbc5d9ab..b6cb6dbdc0c4 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c
index fb66c1e961ac..fb56b830e615 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c
index cf85c72ed71c..fef8c86822e4 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c
index 03ff3b4d51b6..d112624d61b3 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c
index c6b49361fdfb..2d25599fe175 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_stnt1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_stnt1-bfloat.c
index d1be3024d9e2..03c0b6cb1f61 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_stnt1-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_stnt1-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_stnt1.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_stnt1.c
index 2ddb154091a1..9749737b0e3f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_stnt1.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_stnt1.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sub.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sub.c
index d95efdfcd941..7f5c8aae031d 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sub.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sub.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_subr.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_subr.c
index a4f7498c24ee..d34108c323bd 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_subr.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_subr.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sudot.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sudot.c
index 399d5da0a5c6..4066c71c9ec3 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sudot.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_sudot.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -target-feature +i8mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -target-feature +i8mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -target-feature +i8mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -target-feature +i8mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -target-feature +i8mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -target-feature +i8mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -target-feature +i8mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -target-feature +i8mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tbl-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tbl-bfloat.c
index 077b5779e34f..4e1f7cb709ee 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tbl-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tbl-bfloat.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tbl.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tbl.c
index 5eaeddf00247..e8daca4c600d 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tbl.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tbl.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tmad.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tmad.c
index 2430d520fff7..ec02e1e4cbed 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tmad.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tmad.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1-bfloat.c
index 432fda432459..6fce83b6c046 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1-fp64-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1-fp64-bfloat.c
index 369e045d71f5..ea1d515a9b4b 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1-fp64-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1-fp64-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1-fp64.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1-fp64.c
index b0b09908b8ce..04028c7850ce 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1-fp64.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1-fp64.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1.c
index 4d20fda2efc3..70f97236bab8 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn1.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2-bfloat.c
index 51668b93b77c..fb07f3c911b1 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2-fp64-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2-fp64-bfloat.c
index 21c27e2c1dab..d8171cc24055 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2-fp64-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2-fp64-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2-fp64.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2-fp64.c
index 84d56b7e42fb..e9859192333f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2-fp64.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2-fp64.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2.c
index 721e89f2db66..0bf72c919820 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_trn2.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tsmul.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tsmul.c
index ab249b305a0d..496cae8e1bbb 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tsmul.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tsmul.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tssel.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tssel.c
index 6e38034479fb..2b00c6667380 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tssel.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_tssel.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef-bfloat.c
index c723137d5958..4a2f512c95ee 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef-bfloat.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef.c
index d92dbd9df0dd..d9eea5ab0413 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svundef_s8(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef2-bfloat.c
index d86a1e67b00d..98ae82dc4909 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef2-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef2-bfloat.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef2.c
index 6d64f2efdb1f..5f9471d42b5d 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef2.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef2.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef3-bfloat.c
index 7705e25c1e91..cc02e9a3c76d 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef3-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef3-bfloat.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef3.c
index 606c173a5849..e4b3a5e6860a 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef3.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef3.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef4-bfloat.c
index 675b5ac269dd..0c7130af9ba6 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef4-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef4-bfloat.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef4.c
index dad37e98998a..a574f8454144 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef4.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_undef4.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_unpkhi.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_unpkhi.c
index 6d4e1adcf9ae..d18c1c8b5275 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_unpkhi.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_unpkhi.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_unpklo.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_unpklo.c
index 31f052aaabf0..aaadcb69f273 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_unpklo.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_unpklo.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_usdot.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_usdot.c
index 18153f332d38..2ef08d496e17 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_usdot.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_usdot.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -target-feature +i8mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -target-feature +i8mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -target-feature +i8mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -target-feature +i8mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -target-feature +i8mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -target-feature +i8mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -target-feature +i8mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -target-feature +i8mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1-bfloat.c
index 1ff6d28eb648..82e720d254ab 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1-fp64-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1-fp64-bfloat.c
index f25a273e3898..0ce3685db507 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1-fp64-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1-fp64-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1-fp64.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1-fp64.c
index 3f4513fa3a0d..c8e36d8ca2bb 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1-fp64.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1-fp64.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1.c
index b54f67184074..f5c6268cf5cb 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp1.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2-bfloat.c
index 85f83216be72..7bb7a93f8142 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2-fp64-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2-fp64-bfloat.c
index 1082a0aa89ce..2cd16655fbb1 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2-fp64-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2-fp64-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2-fp64.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2-fp64.c
index 3d9c88dbddf7..c5ced8c80c98 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2-fp64.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2-fp64.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2.c
index 31708e60c187..965d10ea9efc 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_uzp2.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_whilele.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_whilele.c
index 07803e61c4e6..eee0096882cf 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_whilele.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_whilele.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_whilelt.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_whilelt.c
index 9393d0eacba7..4994eb27e630 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_whilelt.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_whilelt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_wrffr.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_wrffr.c
index 61182ca52607..8454c3cf1d0f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_wrffr.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_wrffr.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 // CHECK-LABEL: @test_svwrffr(
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1-bfloat.c
index ceb6afbe9fdc..dd1533c508bb 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1-fp64-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1-fp64-bfloat.c
index 87988126ecfa..f7ea7e177d69 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1-fp64-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1-fp64-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1-fp64.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1-fp64.c
index 6b06e2c882fc..b5aef2270c3c 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1-fp64.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1-fp64.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1.c
index c284674d85f1..2d6c8d1ed6bf 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip1.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2-bfloat.c
index 4d4fea731ea8..275ab9073f81 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2-fp64-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2-fp64-bfloat.c
index bc0a002482fa..b3ffe987830e 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2-fp64-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2-fp64-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2-fp64.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2-fp64.c
index 71db1c1cfab7..a890d8c77e9e 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2-fp64.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2-fp64.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2.c
index 0a9cb7b12d11..2a7418fb518f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_zip2.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aba.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aba.c
index eadb4e03478e..60b941e320c8 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aba.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aba.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abalb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abalb.c
index 168aa394808a..a883867fc671 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abalb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abalb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abalt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abalt.c
index 27cba1bd5d7c..a0788814bee8 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abalt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abalt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abdlb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abdlb.c
index 02bc478555c3..310e1a769af4 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abdlb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abdlb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abdlt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abdlt.c
index b843cdae902e..6782612685f2 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abdlt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_abdlt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_adalp.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_adalp.c
index 6d6425263ec4..cd79eaecc67e 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_adalp.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_adalp.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_adclb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_adclb.c
index 403e76c70432..0d5c5416a9ae 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_adclb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_adclb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_adclt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_adclt.c
index ad224e4f6fe7..6ea0b9255a8d 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_adclt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_adclt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addhnb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addhnb.c
index 3b8a45debc77..39076cbae940 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addhnb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addhnb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addhnt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addhnt.c
index 2b894fda185c..25214c968637 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addhnt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addhnt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addlb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addlb.c
index 72d891930403..5effaf680c10 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addlb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addlb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addlbt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addlbt.c
index 9941739559ee..378dad5ed668 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addlbt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addlbt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addlt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addlt.c
index 231fac76b166..2acc28a42da9 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addlt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addlt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addp.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addp.c
index dd12127b4272..7d0c20149ba7 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addp.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addp.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addwb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addwb.c
index 3e786fabf8c7..c0143454e939 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addwb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addwb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addwt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addwt.c
index eff54a38c089..f12bb84d6b3f 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addwt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_addwt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesd.c
index 57b260a1d409..5f862df0dce1 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesd.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesd.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aese.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aese.c
index af1c22f8ca1f..458437eda564 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aese.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aese.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesimc.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesimc.c
index 11fc0cb80f9a..fef2008887df 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesimc.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesimc.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesmc.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesmc.c
index 3636255eb198..8af1183785ce 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesmc.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesmc.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bcax.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bcax.c
index 866b3f6b85a6..663881ff18d7 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bcax.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bcax.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bdep.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bdep.c
index 2a43e91fe20e..d3b01e2b0e97 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bdep.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bdep.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-bitperm -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-bitperm -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-bitperm -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-bitperm -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bext.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bext.c
index 1e4e658f170f..895dd648fe1b 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bext.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bext.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-bitperm -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-bitperm -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-bitperm -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-bitperm -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bgrp.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bgrp.c
index 7f91acfb7483..9675865ef8df 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bgrp.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bgrp.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-bitperm -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-bitperm -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-bitperm -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-bitperm -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bsl.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bsl.c
index 870b1cd4b0e9..90a86726b9b5 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bsl.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bsl.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bsl1n.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bsl1n.c
index 021d798b9c2b..3575184a5107 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bsl1n.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bsl1n.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bsl2n.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bsl2n.c
index 176f43038f11..659d44f1ff18 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bsl2n.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_bsl2n.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cadd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cadd.c
index 76230351d1d0..bd2baa474b60 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cadd.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cadd.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cdot.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cdot.c
index 0a30d5a1ab64..667b497b4ab0 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cdot.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cdot.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cmla.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cmla.c
index bed8dd43b9f1..62ae7b6386b1 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cmla.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cmla.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtlt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtlt.c
index d9f8d54bd2a8..1c33d677344c 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtlt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtlt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtnt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtnt.c
index 7da4dc11698b..5bfd9acd9c55 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtnt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtnt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtx.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtx.c
index d114d72a7653..02f7fcf842f0 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtx.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtx.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtxnt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtxnt.c
index 180bf96a9e66..e66d545af32a 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtxnt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_cvtxnt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_eor3.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_eor3.c
index f5f2bdb99921..ec3a7307efaa 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_eor3.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_eor3.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_eorbt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_eorbt.c
index aec3628534b3..4344ea662435 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_eorbt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_eorbt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_eortb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_eortb.c
index 503c3ff1fa35..44c4b2c4690e 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_eortb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_eortb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_hadd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_hadd.c
index a5bf4df70801..c99c384ced50 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_hadd.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_hadd.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_histcnt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_histcnt.c
index 0e85e259da4f..1259a501fb67 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_histcnt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_histcnt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_histseg.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_histseg.c
index 97d0c1299984..1d65f90b60df 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_histseg.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_histseg.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_hsub.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_hsub.c
index d614b5513660..3689530d5019 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_hsub.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_hsub.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_hsubr.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_hsubr.c
index 019c8e0d25f0..6582f22e41f4 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_hsubr.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_hsubr.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1.c
index e02d1a77ec6f..ae49dd34461f 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1sb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1sb.c
index b64c7999ba7a..9e447b1e2cd7 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1sb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1sb.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1sh.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1sh.c
index f823311efbb6..956699cdf406 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1sh.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1sh.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1sw.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1sw.c
index 1b7825ad9745..8db5e0e5bc60 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1sw.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1sw.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1ub.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1ub.c
index 5b800a447fd3..1471849090f9 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1ub.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1ub.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1uh.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1uh.c
index c50febdbc549..6947d204add5 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1uh.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1uh.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1uw.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1uw.c
index 1030f125e191..859ea7825bc7 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1uw.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_ldnt1uw.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_logb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_logb.c
index ee8b3bf755f2..742a4f4ec893 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_logb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_logb.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_match.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_match.c
index 6803a7e635fa..27f8ac8c9ddc 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_match.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_match.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_maxnmp.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_maxnmp.c
index 0df693a62aca..1b048791f7e7 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_maxnmp.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_maxnmp.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_maxp.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_maxp.c
index a65c23e066af..2aeb84f88fc8 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_maxp.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_maxp.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_minnmp.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_minnmp.c
index e591f01113f3..f74f2654f4a2 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_minnmp.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_minnmp.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_minp.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_minp.c
index 7e946fb77dd4..7a6677353320 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_minp.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_minp.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mla.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mla.c
index 92f0adbe4fe6..26d5ee37693e 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mla.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mla.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlalb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlalb.c
index f6052236427c..5630989f5ee7 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlalb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlalb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlalt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlalt.c
index 5f2621518b7f..6fbef8a834fd 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlalt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlalt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mls.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mls.c
index 650a762478d8..9e577868884e 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mls.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mls.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlslb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlslb.c
index 691dc651908a..13bb36135ade 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlslb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlslb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlslt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlslt.c
index 415ed7a1b320..dc3714ea7ba0 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlslt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mlslt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_movlb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_movlb.c
index b4a9819138b8..46bcdc4577f6 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_movlb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_movlb.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_movlt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_movlt.c
index dd45ee8f4574..986a3308434c 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_movlt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_movlt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mul.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mul.c
index 2ce320a3e214..7edda59102f8 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mul.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mul.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mullb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mullb.c
index 2b4eee899e75..cdff9fc03213 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mullb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mullb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mullt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mullt.c
index 1d7114da72d5..80a3193c3757 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mullt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_mullt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_nbsl.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_nbsl.c
index 48ff44ba7cfe..27db555ef59d 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_nbsl.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_nbsl.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_nmatch.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_nmatch.c
index 3c380c0d6bf5..41c013d1302b 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_nmatch.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_nmatch.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmul.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmul.c
index 3b7c5fa5a0b5..e117c91ead1e 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmul.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmul.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb.c
index 634929a95b15..dfce12aaa3e1 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb_128.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb_128.c
index af87ec3cd32e..bf82d1a97286 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb_128.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb_128.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt.c
index da71669234be..d9444c604d82 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt_128.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt_128.c
index e3ade4db03cd..0e1e2fe51638 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt_128.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt_128.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-aes -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qabs.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qabs.c
index fff93b68f135..8ad4a07f0867 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qabs.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qabs.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qadd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qadd.c
index 7ce2a25f2061..f6b6028fffc7 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qadd.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qadd.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qcadd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qcadd.c
index 4f970e5e8efc..2d2acc80085a 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qcadd.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qcadd.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlalb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlalb.c
index 66ed96e12c43..73f981d2796e 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlalb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlalb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlalbt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlalbt.c
index 61475e301a60..c2bcda60f349 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlalbt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlalbt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlalt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlalt.c
index b044f837cfe0..d911057bd0ed 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlalt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlalt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlslb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlslb.c
index 7d5ea37f98a8..5500f6361ef5 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlslb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlslb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlslbt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlslbt.c
index fa7cb693609c..0dd200f6d79f 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlslbt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlslbt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlslt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlslt.c
index ce11ea0d4a4b..5734420b9d0c 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlslt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmlslt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmulh.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmulh.c
index ad8d8e81f1ac..0aa2d1fe1321 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmulh.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmulh.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmullb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmullb.c
index b62993df37b4..81f26c74456c 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmullb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmullb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmullt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmullt.c
index c1a1ba96512e..c4c8df3210d8 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmullt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qdmullt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qneg.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qneg.c
index 9c8384378844..5703e79809e1 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qneg.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qneg.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdcmlah.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdcmlah.c
index 2903417c7a3c..be624499b6f8 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdcmlah.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdcmlah.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdmlah.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdmlah.c
index 74881fb8e635..38aeccf8995f 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdmlah.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdmlah.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdmlsh.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdmlsh.c
index 36d8c727a252..81038c8209b1 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdmlsh.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdmlsh.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdmulh.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdmulh.c
index 1ee44db958c1..5986f90d83e3 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdmulh.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrdmulh.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshl.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshl.c
index 9e30b20ec5de..f3e066012623 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshl.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshl.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrnb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrnb.c
index 6f714e52a7bb..23e647524b27 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrnb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrnb.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrnt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrnt.c
index 157a0b7237e7..903e388adc14 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrnt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrnt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrunb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrunb.c
index a0ddbd2f3569..098268f09f4a 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrunb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrunb.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrunt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrunt.c
index 9f6f7eef8f17..b747203d5afd 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrunt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qrshrunt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshl.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshl.c
index 604d28986cc9..baf82eff4d5a 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshl.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshl.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshlu.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshlu.c
index bc3c8d274469..48c2586dc448 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshlu.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshlu.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrnb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrnb.c
index ca526d94ff19..bb7322f3464c 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrnb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrnb.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrnt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrnt.c
index 0467df724446..6984957045ba 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrnt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrnt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrunb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrunb.c
index 4e4d43ee124a..90a5265867c3 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrunb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrunb.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrunt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrunt.c
index 0c00b4a85cfb..fee8ca046c87 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrunt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qshrunt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qsub.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qsub.c
index f38cfc143872..e4113ba6046e 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qsub.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qsub.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qsubr.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qsubr.c
index 4fdb9c658189..dbe1eada9b37 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qsubr.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qsubr.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtnb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtnb.c
index 2cdf35117e79..e0e79fd5763e 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtnb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtnb.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtnt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtnt.c
index bf2fd5977562..d7a052ccda6f 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtnt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtnt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtunb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtunb.c
index 3d04d15a458e..d51e3a42f76b 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtunb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtunb.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtunt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtunt.c
index 7e7c1ef55ee5..ac182b2c6265 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtunt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_qxtunt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_raddhnb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_raddhnb.c
index 3d757fedeef6..3c359989ad42 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_raddhnb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_raddhnb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_raddhnt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_raddhnt.c
index 0a5f5b6f853c..2eed004fd267 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_raddhnt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_raddhnt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rax1.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rax1.c
index dc51df0f885d..aa1b8d1b7659 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rax1.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rax1.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-sha3 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-sha3 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-sha3 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-sha3 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-sha3 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-sha3 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-sha3 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-sha3 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_recpe.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_recpe.c
index cd37d29064a1..0cc5f7a574af 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_recpe.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_recpe.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c
index d82d69442b8f..128a7eb102da 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN:   -target-feature +sme2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN:   -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -o /dev/null %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rhadd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rhadd.c
index 71189e6aeaa1..fef839578e83 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rhadd.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rhadd.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshl.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshl.c
index 099636450fc7..68a3a223c4b1 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshl.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshl.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshr.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshr.c
index 40210a8a279b..a06673cd822b 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshr.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshr.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshrnb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshrnb.c
index 2155f333a73d..23ade9dfb3f4 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshrnb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshrnb.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshrnt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshrnt.c
index 7f9537e15ed0..1a18cc1045b8 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshrnt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rshrnt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsqrte.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsqrte.c
index ae216389af0f..89da67f81e29 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsqrte.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsqrte.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsra.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsra.c
index 9d236c50860c..2cc07a716972 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsra.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsra.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsubhnb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsubhnb.c
index 9957b0c15309..2a9b7ed4674e 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsubhnb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsubhnb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsubhnt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsubhnt.c
index a579d5b53fd0..726eda3325f0 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsubhnt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_rsubhnt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sbclb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sbclb.c
index 1eaa38525943..0edb4ca6bf9a 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sbclb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sbclb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sbclt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sbclt.c
index 60145a5a2750..5d9ac8343654 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sbclt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sbclt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shllb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shllb.c
index bf3237bde69c..c3adb4f789fa 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shllb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shllb.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shllt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shllt.c
index 2266fc43d956..c62a96150888 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shllt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shllt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shrnb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shrnb.c
index 89640cf68ed4..c45a38e16f9e 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shrnb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shrnb.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shrnt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shrnt.c
index c6abfea46975..6b23004c29b1 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shrnt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_shrnt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sli.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sli.c
index 4c608fc9d71b..7c2085d13744 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sli.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sli.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sm4e.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sm4e.c
index 75148aa7b06b..d1c3dd47c58d 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sm4e.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sm4e.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-sm4 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-sm4 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-sm4 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-sm4 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-sm4 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-sm4 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-sm4 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-sm4 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sm4ekey.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sm4ekey.c
index d50c3aeee66a..3bc971fb9eba 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sm4ekey.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sm4ekey.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-sm4 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2-sm4 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-sm4 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2-sm4 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-sm4 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2-sm4 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-sm4 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2-sm4 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sqadd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sqadd.c
index d8cffdd81ced..824b65d541e3 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sqadd.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sqadd.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sra.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sra.c
index 0ab643b98707..1975753ac866 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sra.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sra.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sri.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sri.c
index 03f5f7027126..001d83f2f1fa 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sri.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sri.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1.c
index b0ef72a4af6e..2f315b4c1709 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1b.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1b.c
index 6ac43c01dc2f..fee489cb0180 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1b.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1b.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1h.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1h.c
index 7419adf6b40c..18c0d2d077eb 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1h.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1h.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1w.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1w.c
index 48d374ee7249..0f376eb26228 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1w.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_stnt1w.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subhnb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subhnb.c
index 44cdfd20327b..1ab0d5539b01 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subhnb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subhnb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subhnt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subhnt.c
index 1f5f92639abd..ab063e0e3b74 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subhnt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subhnt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sublb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sublb.c
index f012d4b6cf51..c583ef0df6f2 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sublb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sublb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sublbt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sublbt.c
index ea1d48d0e66f..d0c9a591d65a 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sublbt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sublbt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sublt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sublt.c
index 96b05c6c4f9b..0e3cf98b9175 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sublt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_sublt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subltb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subltb.c
index 03f836c6639f..5e27e4171244 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subltb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subltb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subwb.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subwb.c
index ea92ff819af0..584cb458fb58 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subwb.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subwb.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subwt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subwt.c
index 90b0cd19ae1e..e3a86405fad6 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subwt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_subwt.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbl2-bfloat.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbl2-bfloat.c
index 9026e6cc8616..97d7719297ea 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbl2-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbl2-bfloat.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbl2.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbl2.c
index 13f967a6cc24..90708577ab7e 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbl2.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbl2.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbx-bfloat.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbx-bfloat.c
index 1807a6bc252a..a046b837df4c 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbx-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbx-bfloat.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbx.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbx.c
index 9fb6a9439781..5782feb01126 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbx.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_tbx.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_uqadd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_uqadd.c
index 9f7c1ecd9adc..a0fc7fd0e4c3 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_uqadd.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_uqadd.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilege.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilege.c
index 971c65a28a69..eec570826f35 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilege.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilege.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilegt.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilegt.c
index 7d6caeea18eb..90a527705062 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilegt.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilegt.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilerw-bfloat.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilerw-bfloat.c
index 78e47e07d222..4e151f56cd7a 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilerw-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilerw-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilerw.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilerw.c
index 9dcb59905a0b..080b698a5161 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilerw.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilerw.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilewr-bfloat.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilewr-bfloat.c
index d7969f3fd6d8..36c2dddd768c 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilewr-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilewr-bfloat.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilewr.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilewr.c
index 3cd13311f696..75fb3e15fb08 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilewr.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_whilewr.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_xar.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_xar.c
index 78f7b96bf034..4683a8a7ea70 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_xar.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_xar.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c
index 2af8995b6fc9..9d01ba773694 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c
index ddb279147bd1..f672451812ec 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c
index b0534753b1be..ce28a0b0653d 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c
index ddf0a5711bab..d8fec8c5145e 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c
index 42d29e6dbc39..5efaa37a1464 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c
index 27d85374aadc..98fd12b3a839 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c
index 4928147767d8..0a70466a540d 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c
index e408c20e325b..3f9983a18682 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c
index e952d1b17ae8..ed71d4e490bb 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c
index f4d3f9e9bd60..e958bbe847cc 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c
index 22d951c069bc..24e484019dac 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c
@@ -1,14 +1,14 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -target-feature +sve -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c
index 078ea58408ad..f0a3664426de 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c
index 44cdf49c57bb..a14fb6f1a531 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c
index 442562e9ed20..6f08ea84ab82 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c
index 9bf55eaa6a08..b957abd9b2b8 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_cntp.c
@@ -1,12 +1,12 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create2_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create2_bool.c
index c4cbec11bf32..d441c934bad0 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create2_bool.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create2_bool.c
@@ -1,18 +1,18 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create4_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create4_bool.c
index bc889d9f6018..650273af8b66 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create4_bool.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create4_bool.c
@@ -1,18 +1,18 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_dot.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_dot.c
index 97099c1f1061..88a1a6d9fe47 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_dot.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_dot.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifndef TEST_SME2
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_dupq.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_dupq.c
index 587a67aa6b7c..fceb45768ad7 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_dupq.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_dupq.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_extq.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_extq.c
index 738b290b76cf..43b70544596b 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_extq.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_extq.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c
index 7687257701a6..39478aa9c09e 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fclamp.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
 // RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve \
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fp_reduce.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fp_reduce.c
index 9d5ffdafe866..7f2278e0ff5d 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fp_reduce.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fp_reduce.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_neon.h>
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get2_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get2_bool.c
index 35e6f1b84ab1..c0b65616aa6c 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get2_bool.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get2_bool.c
@@ -1,18 +1,18 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // REQUIRES: aarch64-registered-target
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get4_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get4_bool.c
index 8a5f9568e367..e5016d6cb3dc 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get4_bool.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get4_bool.c
@@ -1,18 +1,18 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_int_reduce.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_int_reduce.c
index b395b4d1323e..694e7f6f13dc 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_int_reduce.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_int_reduce.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c
index 28373bd80177..e7e302fdeb73 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wno-unknown-attributes -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wno-unknown-attributes -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c
index 1cc8c7e469de..f5802af0ca7c 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
 // RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c
index 668f2b8b8113..549f1a2fff8e 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
index 8922b19eed42..a97028ae87a0 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c
index 8f08b32618b0..8b6d57a62db0 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pext.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme2 -DTEST_SME2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -DTEST_SME2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c
index afdb038fb931..0da3b61838fb 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pfalse.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pmov_to_pred.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pmov_to_pred.c
index 84f058ad8c16..1cb8f05de10f 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pmov_to_pred.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pmov_to_pred.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\
 // RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pmov_to_vector.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pmov_to_vector.c
index 1e45f1ecedce..6e3ef44ad804 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pmov_to_vector.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_pmov_to_vector.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1\
 // RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c
index a61d874a1d9e..5cdc3bf43b01 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c
@@ -1,17 +1,17 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN:   -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN:   -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +sme -S -DTEST_SME -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -target-feature +sme -DTEST_SME -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -DTEST_SME -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel_svcount.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel_svcount.c
index 4b8a582be301..be8b56eb2a9a 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel_svcount.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel_svcount.c
@@ -1,17 +1,17 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN:   -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN:   -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu \
-// RUN:   -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -target-feature +sme2 -DTEST_SME2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ptrue.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ptrue.c
index 7f9d60a3d320..0d6b6e5baebe 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ptrue.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ptrue.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qcvtn.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qcvtn.c
index fb53ea456c81..7447a1f970a4 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qcvtn.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qcvtn.c
@@ -2,15 +2,15 @@
 
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1  -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1  -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sve -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sve -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c
index 6ebf224db923..3d8c956df417 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c
@@ -1,12 +1,12 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S  -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S  -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S  -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S  -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S  -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c
index 04869fd550ec..828c70aaffd4 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_sclamp.c
@@ -1,19 +1,19 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
 // RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \
-// RUN:   -S -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \
-// RUN:   -S -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \
 // RUN:   -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_set2_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_set2_bool.c
index 75c8d035aedb..910fb62f9358 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_set2_bool.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_set2_bool.c
@@ -1,18 +1,18 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_set4_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_set4_bool.c
index d68810352693..5507bf9b349d 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_set4_bool.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_set4_bool.c
@@ -1,18 +1,18 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // REQUIRES: aarch64-registered-target
 
 #include <arm_sve.h>
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c
index 9fe716950ddc..4e0dcc523925 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c
index 677ec8c81d24..2874683d90c3 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
 // RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c
index 4c7e824ac3fb..ba8ffc4b1640 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
index bd63065d4e55..04c822b2ac4f 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_tblq.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_tblq.c
index 6c04413c238a..119c1195e613 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_tblq.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_tblq.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
 // RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_tbxq.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_tbxq.c
index 0ad7107b6767..e9bea301c4e9 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_tbxq.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_tbxq.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
 // RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c
index 37bfd4265a43..f2b7cb7cb35c 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uclamp.c
@@ -1,19 +1,19 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
 // RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \
-// RUN:   -S -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \
-// RUN:   -S -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -DTEST_SME2 -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 \
 // RUN:   -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_undef_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_undef_bool.c
index 1cc7476ac538..5197e41c1ffd 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_undef_bool.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_undef_bool.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpq1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpq1.c
index c0fec3951ff5..433ec30ddac4 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpq1.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpq1.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
 // RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpq2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpq2.c
index 0477ed2c6c28..368aec34384e 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpq2.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_uzpq2.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
 // RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c
index 11ebec9e7cbf..db106c04155d 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_pn.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -DTEST_SME2 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_x2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_x2.c
index 475fa14e1165..07f4b4cded79 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_x2.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_x2.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -DTEST_SME -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipq1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipq1.c
index 06297651471f..aa1abc23637c 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipq1.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipq1.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
 // RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipq2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipq2.c
index 04cb6c69de6c..fe6d2322b479 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipq2.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_zipq2.c
@@ -1,13 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
-// RUN:   -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN:   -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16\
 // RUN:   -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
diff --git a/clang/test/CodeGen/aarch64-sysregs-target.c b/clang/test/CodeGen/aarch64-sysregs-target.c
index ec69a67a0814..a2af67bddf1b 100644
--- a/clang/test/CodeGen/aarch64-sysregs-target.c
+++ b/clang/test/CodeGen/aarch64-sysregs-target.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +v8a -fsyntax-only -verify -emit-llvm -o - %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +d128 -fsyntax-only -verify=d128 -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +v8a -verify -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +d128 -verify=d128 -emit-llvm -o - %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-targetattr.c b/clang/test/CodeGen/aarch64-targetattr.c
index bf4c1476d881..3e7a20924560 100644
--- a/clang/test/CodeGen/aarch64-targetattr.c
+++ b/clang/test/CodeGen/aarch64-targetattr.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64 -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -emit-llvm %s -o - | FileCheck %s
 
 // CHECK-LABEL: @v82() #0
 __attribute__((target("arch=armv8.2-a")))
diff --git a/clang/test/CodeGen/aarch64-tme.cpp b/clang/test/CodeGen/aarch64-tme.cpp
index 096a8e4248f6..0e0cfeed7b75 100644
--- a/clang/test/CodeGen/aarch64-tme.cpp
+++ b/clang/test/CodeGen/aarch64-tme.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1             -triple aarch64 -target-feature +tme -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -DUSE_ACLE  -triple aarch64 -target-feature +tme -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1             -triple aarch64 -target-feature +tme -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -DUSE_ACLE  -triple aarch64 -target-feature +tme -emit-llvm %s -o - | FileCheck %s
 
 #define A -1
 constexpr int f() { return 65536; }
diff --git a/clang/test/CodeGen/aarch64-type-sizes.c b/clang/test/CodeGen/aarch64-type-sizes.c
index 7a2508c6e158..a40423c1f8de 100644
--- a/clang/test/CodeGen/aarch64-type-sizes.c
+++ b/clang/test/CodeGen/aarch64-type-sizes.c
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -triple aarch64_be-none-linux-gnu -emit-llvm -w -o - %s | FileCheck %s
 // char by definition has size 1
 
-// CHECK: target datalayout = "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+// CHECK: target datalayout = "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
 
 int check_short(void) {
   return sizeof(short);
diff --git a/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c
index 6f2a0173a9b6..bc985efa6bc9 100644
--- a/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
-// RUN:  -target-feature +v8.1a -S -emit-llvm -disable-O0-optnone -o - %s | opt -passes=mem2reg,dce -S | FileCheck %s
+// RUN:  -target-feature +v8.1a -emit-llvm -disable-O0-optnone -o - %s | opt -passes=mem2reg,dce -S | FileCheck %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics-constrained.c b/clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics-constrained.c
index 536713402b5d..9109626cea9c 100644
--- a/clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics-constrained.c
+++ b/clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics-constrained.c
@@ -1,9 +1,9 @@
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +fullfp16 \
-// RUN: -S -disable-O0-optnone \
+// RUN: -disable-O0-optnone \
 // RUN: -emit-llvm -o - %s | opt -S -passes=mem2reg \
 // RUN: | FileCheck --check-prefixes=COMMON,COMMONIR,UNCONSTRAINED %s
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +fullfp16 \
-// RUN: -S -disable-O0-optnone \
+// RUN: -disable-O0-optnone \
 // RUN: -ffp-exception-behavior=strict -emit-llvm -o - %s | opt -S -passes=mem2reg \
 // RUN: | FileCheck --check-prefixes=COMMON,COMMONIR,CONSTRAINED %s
 
diff --git a/clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c b/clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
index d745a7789326..90ee74e459eb 100644
--- a/clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +fullfp16\
-// RUN: -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg \
 // RUN: | FileCheck %s
 
diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c
index e8798c1b8d91..b51e6f7e6e1a 100644
--- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c
+++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c
@@ -1,19 +1,19 @@
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
-// RUN: -flax-vector-conversions=none -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg \
 // RUN: | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=UNCONSTRAINED %s
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
 // RUN: -ffp-exception-behavior=maytrap -DEXCEPT=1 \
-// RUN: -flax-vector-conversions=none -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg \
 // RUN: | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=CONSTRAINED --implicit-check-not=fpexcept.maytrap %s
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
-// RUN: -flax-vector-conversions=none -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg | llc -o=- - \
 // RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
 // RUN: -ffp-exception-behavior=maytrap -DEXCEPT=1 \
-// RUN: -flax-vector-conversions=none -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg | llc -o=- - \
 // RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM --implicit-check-not=fpexcept.maytrap  %s
 
diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-generic.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-generic.c
index 78391808fafa..9c15e6dc6a62 100644
--- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-generic.c
+++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-generic.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature -fullfp16 -target-feature +v8a\
-// RUN: -flax-vector-conversions=none -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg \
 // RUN: | FileCheck %s
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
-// RUN: -flax-vector-conversions=none -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg \
 // RUN: | FileCheck %s
 
diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
index 617d515504fe..1cce977b60e6 100644
--- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
-// RUN: -flax-vector-conversions=none -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg \
 // RUN: | FileCheck %s
 
diff --git a/clang/test/CodeGen/aarch64-v8.5a-neon-frint3264-intrinsic.c b/clang/test/CodeGen/aarch64-v8.5a-neon-frint3264-intrinsic.c
index b6362d4bc21f..c44dd333c975 100644
--- a/clang/test/CodeGen/aarch64-v8.5a-neon-frint3264-intrinsic.c
+++ b/clang/test/CodeGen/aarch64-v8.5a-neon-frint3264-intrinsic.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v8.5a\
-// RUN: -flax-vector-conversions=none -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg \
 // RUN: | FileCheck %s
 
diff --git a/clang/test/CodeGen/aarch64-v8.5a-scalar-frint3264-intrinsic.c b/clang/test/CodeGen/aarch64-v8.5a-scalar-frint3264-intrinsic.c
index ebe0f43f0ccb..169743051d6d 100644
--- a/clang/test/CodeGen/aarch64-v8.5a-scalar-frint3264-intrinsic.c
+++ b/clang/test/CodeGen/aarch64-v8.5a-scalar-frint3264-intrinsic.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +v8.5a\
-// RUN: -flax-vector-conversions=none -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg \
 // RUN: | FileCheck %s
 
diff --git a/clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c
index 4001776fcb3c..7bfeb7939edb 100644
--- a/clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
-// RUN: -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,sroa \
 // RUN: | FileCheck %s
 
diff --git a/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/acle_neon_sve_bridge_dup_neonq.c b/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/acle_neon_sve_bridge_dup_neonq.c
index 990169329825..516e9aba9849 100644
--- a/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/acle_neon_sve_bridge_dup_neonq.c
+++ b/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/acle_neon_sve_bridge_dup_neonq.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 #include <arm_neon_sve_bridge.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/acle_neon_sve_bridge_get_neonq.c b/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/acle_neon_sve_bridge_get_neonq.c
index 5675f4e4db03..94657cec86cd 100644
--- a/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/acle_neon_sve_bridge_get_neonq.c
+++ b/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/acle_neon_sve_bridge_get_neonq.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 #include <arm_neon_sve_bridge.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/acle_neon_sve_bridge_set_neonq.c b/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/acle_neon_sve_bridge_set_neonq.c
index b8271860c195..a32c1446b2df 100644
--- a/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/acle_neon_sve_bridge_set_neonq.c
+++ b/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/acle_neon_sve_bridge_set_neonq.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -S -O1 -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +neon -target-feature +sve -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 #include <arm_neon_sve_bridge.h>
 
 #ifdef SVE_OVERLOADED_FORMS
diff --git a/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/target.c b/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/target.c
index b5cbd24d5ff4..a08c452fdc7f 100644
--- a/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/target.c
+++ b/clang/test/CodeGen/aarch64_neon_sve_bridge_intrinsics/target.c
@@ -1,5 +1,5 @@
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -S -verify -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +neon -verify -emit-llvm -o - %s
 
 #include <arm_neon_sve_bridge.h>
 
diff --git a/clang/test/CodeGen/aggregate-assign-call.c b/clang/test/CodeGen/aggregate-assign-call.c
index d6571269456a..7d972397ffaa 100644
--- a/clang/test/CodeGen/aggregate-assign-call.c
+++ b/clang/test/CodeGen/aggregate-assign-call.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O1 -S -emit-llvm -o - %s | FileCheck %s --check-prefixes=O1
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O0 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=O0
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O1 -emit-llvm -o - %s | FileCheck %s --check-prefixes=O1
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O0 -emit-llvm -o - %s | FileCheck %s --check-prefix=O0
 //
 // Ensure that we place appropriate lifetime markers around indirectly returned
 // temporaries, and that the lifetime.ends appear in a timely manner.
diff --git a/clang/test/CodeGen/aix-builtin-cpu-is.c b/clang/test/CodeGen/aix-builtin-cpu-is.c
index b0a0dec41b56..e17cf7353511 100644
--- a/clang/test/CodeGen/aix-builtin-cpu-is.c
+++ b/clang/test/CodeGen/aix-builtin-cpu-is.c
@@ -57,12 +57,12 @@
 // CHECK-NEXT:   ret i32 0
 // CHECK-NEXT: }
 
-// CHECKOP: @_system_configuration = external global { i32, i32, i32 }
+// CHECKOP: @_system_configuration = external global { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, i32, i32, i32, i64, i64, i64, i64, i32, i32, i32, i32, i32, i32, i64, i32, i8, i8, i8, i8, i32, i32, i16, i16, [3 x i32], i32 }
 // CHECKOP:   define i32 @main() #0 {
 // CHECKOP-NEXT: entry:
 // CHECKOP-NEXT:   %retval = alloca i32, align 4
 // CHECKOP-NEXT:   store i32 0, ptr %retval, align 4
-// CHECKOP-NEXT:   %0 = load i32, ptr getelementptr inbounds ({ i32, i32, i32 }, ptr @_system_configuration, i32 0, i32 1), align 4
+// CHECKOP-NEXT:   %0 = load i32, ptr getelementptr inbounds ({ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, i32, i32, i32, i64, i64, i64, i64, i32, i32, i32, i32, i32, i32, i64, i32, i8, i8, i8, i8, i32, i32, i16, i16, [3 x i32], i32 }, ptr @_system_configuration, i32 0, i32 1), align 4
 // CHECKOP-NEXT:   %1 = icmp eq i32 %0, [[VALUE]] 
 // CHECKOP-NEXT:  %conv = zext i1 %1 to i32
 // CHECKOP-NEXT:   ret i32 %conv
diff --git a/clang/test/CodeGen/aix-builtin-cpu-supports.c b/clang/test/CodeGen/aix-builtin-cpu-supports.c
new file mode 100644
index 000000000000..52073ddfe0fd
--- /dev/null
+++ b/clang/test/CodeGen/aix-builtin-cpu-supports.c
@@ -0,0 +1,171 @@
+// RUN: echo "int main() { return __builtin_cpu_supports(\"4xxmac\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"altivec\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=46 -DOP=ugt -DBIT=i32 -DVALUE=0 \
+// RUN:   --check-prefixes=CHECKOP,OPRT,SYSCONF
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"archpmu\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"booke\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"cellbe\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"darn\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=1 -DOP=uge -DBIT=i32 -DVALUE=131072 \
+// RUN:   --check-prefixes=CHECKOP,OPRT,SYSCONF
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"dscr\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=1 -DOP=uge -DBIT=i32 -DVALUE=65536 \
+// RUN:   --check-prefixes=CHECKOP,OPRT,SYSCONF
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"ebb\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=1 -DOP=uge -DBIT=i32 -DVALUE=65536 \
+// RUN:   --check-prefixes=CHECKOP,OPRT,SYSCONF
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"efpdouble\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"efpsingle\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"pa6t\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"fpu\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=1 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"htm\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=1 -DOP=ugt -DLABLE=59  -DBIT=i64 -DVALUE=0 \
+// RUN:   --check-prefixes=CHECKOP,OPRT,SYSCALL
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"htm-nosc\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"htm-no-suspend\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"ic_snoop\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"isel\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=1 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"mma\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=1 -DOP=ugt -DLABLE=62 -DBIT=i64 -DVALUE=0 \
+// RUN:   --check-prefixes=CHECKOP,OPRT,SYSCALL
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"mmu\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=1 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"notb\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"arch_2_05\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=1 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"arch_2_06\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=1 -DOP=uge -DBIT=i32 -DVALUE=32768 \
+// RUN:   --check-prefixes=CHECKOP,OPRT,SYSCONF
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"arch_2_07\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=1 -DOP=uge -DBIT=i32 -DVALUE=65536 \
+// RUN:   --check-prefixes=CHECKOP,OPRT,SYSCONF
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"arch_3_00\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=1 -DOP=uge -DBIT=i32 -DVALUE=131072 \
+// RUN:   --check-prefixes=CHECKOP,OPRT,SYSCONF
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"arch_3_1\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=1 -DOP=uge -DBIT=i32 -DVALUE=262144 \
+// RUN:   --check-prefixes=CHECKOP,OPRT,SYSCONF
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"dfp\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=53 -DOP=ne -DBIT=i32 -DVALUE=0 \
+// RUN:   --check-prefixes=CHECKOP,OPRT,SYSCONF
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"power4\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=1 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"power5\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=1 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"power5+\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=1 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"power6x\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"ppc32\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=1 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"ppc601\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"ppc64\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=1 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"ppcle\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"smt\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=44 -DMASK=3 -DOP=eq -DBIT=i32 -DVALUE=3 \
+// RUN:   --check-prefixes=CHECKOP,OPMASK,SYSCONF
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"spe\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"scv\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"tar\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=1 -DOP=uge -DBIT=i32 -DVALUE=65536 \
+// RUN:   --check-prefixes=CHECKOP,OPRT,SYSCONF
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"true_le\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=1 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"ucache\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=5 -DMASK=2 -DOP=eq -DBIT=i32 -DVALUE=2 \
+// RUN:   --check-prefixes=CHECKOP,OPMASK,SYSCONF
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"vcrypto\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck -DBOOL=0 %s
+
+// RUN: echo "int main() { return __builtin_cpu_supports(\"vsx\");}" > %t.c
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DPOS=46 -DOP=ugt -DBIT=i32 -DVALUE=1 \
+// RUN:   --check-prefixes=CHECKOP,OPRT,SYSCONF
+
+// CHECK:     define i32 @main() #0 {
+// CHECK-NEXT: entry:
+// CHECK-NEXT:   %retval = alloca i32, align 4
+// CHECK-NEXT:   store i32 0, ptr %retval, align 4
+// CHECK-NEXT:   ret i32 [[BOOL]]
+// CHECK-NEXT: }
+
+// SYSCONF: @_system_configuration = external global { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, i32, i32, i32, i64, i64, i64, i64, i32, i32, i32, i32, i32, i32, i64, i32, i8, i8, i8, i8, i32, i32, i16, i16, [3 x i32], i32 }
+
+// CHECKOP:   define i32 @main() #0 {
+// CHECKOP-NEXT: entry:
+// CHECKOP-NEXT:   %retval = alloca i32, align 4
+// CHECKOP-NEXT:   store i32 0, ptr %retval, align 4
+
+// SYSCONF-NEXT:   %0 = load i32, ptr getelementptr inbounds ({ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, i32, i32, i32, i64, i64, i64, i64, i32, i32, i32, i32, i32, i32, i64, i32, i8, i8, i8, i8, i32, i32, i16, i16, [3 x i32], i32 }, ptr @_system_configuration, i32 0, i32 [[POS]]), align 4
+// SYSCALL-NEXT:  %0 = call i64 @getsystemcfg(i32 [[LABLE]])
+
+// OPRT-NEXT:  %1 = icmp [[OP]] [[BIT]] %0, [[VALUE]]
+// OPRT-NEXT:     %conv = zext i1 %1 to i32
+
+// OPMASK-NEXT:  %1 = and i32 %0, [[MASK]]
+// OPMASK-NEXT:  %2 = icmp [[OP]] i32 %1, [[VALUE]]
+// OPMASK-NEXT:  %conv = zext i1 %2 to i32
+
+// CHECKOP-NEXT:   ret i32 %conv
+// CHECKOP-NEXT: }
+
+// SYSCALL: declare i64 @getsystemcfg(i32)
+
+
diff --git a/clang/test/CodeGen/align-global-large.c b/clang/test/CodeGen/align-global-large.c
index e53323f65f52..14b068e8f60d 100644
--- a/clang/test/CodeGen/align-global-large.c
+++ b/clang/test/CodeGen/align-global-large.c
@@ -1,5 +1,5 @@
 // PR13606 - Clang crashes with large alignment attribute
-// RUN: %clang_cc1 -S -emit-llvm %s -o - -triple i686-pc-gnu | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple i686-pc-gnu | FileCheck %s
 
 // CHECK: x
 // CHECK: align
diff --git a/clang/test/CodeGen/aligned-sret.c b/clang/test/CodeGen/aligned-sret.c
index 4e1f86e7f07a..2716cecc63db 100644
--- a/clang/test/CodeGen/aligned-sret.c
+++ b/clang/test/CodeGen/aligned-sret.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-apple-macos %s -S -emit-llvm -o- | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macos %s -emit-llvm -o- | FileCheck %s
 
 typedef __attribute__((__ext_vector_type__(4),__aligned__(16))) double simd_double4;
 typedef struct { simd_double4 columns[4]; } simd_double4x4;
diff --git a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
index 161f1a75ca63..c4f0b78fc6a5 100644
--- a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -triple armv8-arm-none-eabi \
 // RUN:   -target-feature +neon -target-feature +bf16 -mfloat-abi soft \
-// RUN:   -disable-O0-optnone -S -emit-llvm -o - %s \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg | FileCheck %s
 // RUN: %clang_cc1 -triple armv8-arm-none-eabi \
 // RUN:   -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
-// RUN:   -disable-O0-optnone -S -emit-llvm -o - %s \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
diff --git a/clang/test/CodeGen/arm-bf16-reinterpret-intrinsics.c b/clang/test/CodeGen/arm-bf16-reinterpret-intrinsics.c
index 9c93f3773559..f8c3a9413313 100644
--- a/clang/test/CodeGen/arm-bf16-reinterpret-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-reinterpret-intrinsics.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple armv8.2a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
-// RUN: -disable-O0-optnone -S -emit-llvm -o - %s \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=instcombine \
 // RUN: | FileCheck %s
 
diff --git a/clang/test/CodeGen/arm-byval-align.c b/clang/test/CodeGen/arm-byval-align.c
index 0de6a4e8544c..c1b697a784ec 100644
--- a/clang/test/CodeGen/arm-byval-align.c
+++ b/clang/test/CodeGen/arm-byval-align.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple=armv7-none-eabi < %s -S -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 -triple=armv7-none-eabi < %s -emit-llvm | FileCheck %s
 
 struct foo {
   long long a;
diff --git a/clang/test/CodeGen/arm-cde-gpr.c b/clang/test/CodeGen/arm-cde-gpr.c
index 8807853c30f5..cf69b50d8cc4 100644
--- a/clang/test/CodeGen/arm-cde-gpr.c
+++ b/clang/test/CodeGen/arm-cde-gpr.c
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi \
 // RUN:   -target-feature +cdecp0 -target-feature +cdecp1 \
 // RUN:   -mfloat-abi hard -O0 -disable-O0-optnone \
-// RUN:   -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-cde-reinterpret.c b/clang/test/CodeGen/arm-cde-reinterpret.c
index 443eaebc507e..ca6d2c6a545c 100644
--- a/clang/test/CodeGen/arm-cde-reinterpret.c
+++ b/clang/test/CodeGen/arm-cde-reinterpret.c
@@ -1,11 +1,11 @@
 // RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi \
 // RUN:   -target-feature +cdecp0 -target-feature +mve.fp \
 // RUN:   -mfloat-abi hard -O0 -disable-O0-optnone \
-// RUN:   -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s --check-prefixes=CHECK,CHECK-LE
+// RUN:   -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s --check-prefixes=CHECK,CHECK-LE
 // RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi \
 // RUN:   -target-feature +cdecp0 -target-feature +mve.fp \
 // RUN:   -mfloat-abi hard -O0 -disable-O0-optnone \
-// RUN:   -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s --check-prefixes=CHECK,CHECK-BE
+// RUN:   -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s --check-prefixes=CHECK,CHECK-BE
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-cde-vec.c b/clang/test/CodeGen/arm-cde-vec.c
index 2ea6e58b793b..7938588868cc 100644
--- a/clang/test/CodeGen/arm-cde-vec.c
+++ b/clang/test/CodeGen/arm-cde-vec.c
@@ -3,7 +3,7 @@
 // RUN:   -target-feature +cdecp0 -target-feature +cdecp1 \
 // RUN:   -target-feature +mve.fp \
 // RUN:   -mfloat-abi hard -O0 -disable-O0-optnone \
-// RUN:   -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-cde-vfp.c b/clang/test/CodeGen/arm-cde-vfp.c
index 118e4f9d5881..0e219fc76ce7 100644
--- a/clang/test/CodeGen/arm-cde-vfp.c
+++ b/clang/test/CodeGen/arm-cde-vfp.c
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi \
 // RUN:   -target-feature +cdecp0 -target-feature +cdecp1 \
 // RUN:   -mfloat-abi hard -O0 -disable-O0-optnone \
-// RUN:   -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/absneg.c b/clang/test/CodeGen/arm-mve-intrinsics/absneg.c
index 755e2e12dded..bf56c0685bb6 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/absneg.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/absneg.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/admin.c b/clang/test/CodeGen/arm-mve-intrinsics/admin.c
index cb2ab0715587..51cfd6ce310e 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/admin.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/admin.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefixes=CHECK,CHECK-LE
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefixes=CHECK,CHECK-LE
-// RUN: %clang_cc1 -triple thumbebv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefixes=CHECK,CHECK-BE
-// RUN: %clang_cc1 -triple thumbebv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefixes=CHECK,CHECK-BE
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefixes=CHECK,CHECK-LE
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefixes=CHECK,CHECK-LE
+// RUN: %clang_cc1 -triple thumbebv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefixes=CHECK,CHECK-BE
+// RUN: %clang_cc1 -triple thumbebv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefixes=CHECK,CHECK-BE
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/bitwise-imm.c b/clang/test/CodeGen/arm-mve-intrinsics/bitwise-imm.c
index 08c5ebe24f67..b038322bae5b 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/bitwise-imm.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/bitwise-imm.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/compare.c b/clang/test/CodeGen/arm-mve-intrinsics/compare.c
index 9f68735e8df3..8f190990a658 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/compare.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/compare.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/cplusplus.cpp b/clang/test/CodeGen/arm-mve-intrinsics/cplusplus.cpp
index 77170745756c..35af174e1f6f 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/cplusplus.cpp
+++ b/clang/test/CodeGen/arm-mve-intrinsics/cplusplus.cpp
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/dup.c b/clang/test/CodeGen/arm-mve-intrinsics/dup.c
index 0361e03335be..c2c7a9c278f6 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/dup.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/dup.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/get-set-lane.c b/clang/test/CodeGen/arm-mve-intrinsics/get-set-lane.c
index f4eac01249e9..201352431f3a 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/get-set-lane.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/get-set-lane.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/idup.c b/clang/test/CodeGen/arm-mve-intrinsics/idup.c
index 337f58f946db..061f4495407f 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/idup.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/idup.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/load-store.c b/clang/test/CodeGen/arm-mve-intrinsics/load-store.c
index 022daccfde33..2dde75fa5586 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/load-store.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/load-store.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/predicates.c b/clang/test/CodeGen/arm-mve-intrinsics/predicates.c
index a143ee318889..7c5311873b4f 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/predicates.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/predicates.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/reinterpret.c b/clang/test/CodeGen/arm-mve-intrinsics/reinterpret.c
index ad47905ecc66..894bbb7a9049 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/reinterpret.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/reinterpret.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefix=BOTH --check-prefix=LE
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefix=BOTH --check-prefix=LE
-// RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefix=BOTH --check-prefix=BE
-// RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefix=BOTH --check-prefix=BE
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefix=BOTH --check-prefix=LE
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefix=BOTH --check-prefix=LE
+// RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefix=BOTH --check-prefix=BE
+// RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s --check-prefix=BOTH --check-prefix=BE
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c b/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c
index b063b8ffbf1d..d5d7b9be8d95 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c b/clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c
index 6f14d048809b..adae01727650 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/ternary.c b/clang/test/CodeGen/arm-mve-intrinsics/ternary.c
index afdddebcf2aa..36b2ce063cb1 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/ternary.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/ternary.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vabavq.c b/clang/test/CodeGen/arm-mve-intrinsics/vabavq.c
index 98546a7b4cd1..790ff8b6f7e4 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vabavq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vabavq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c b/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c
index b4f2e0356c73..c03bdd059d12 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vadc.c b/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
index 6511d9ebbecb..21087b83300c 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
index 516e26b2698b..238cb4056d4f 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vaddv.c b/clang/test/CodeGen/arm-mve-intrinsics/vaddv.c
index 9e16b1d0ab22..f4a5fd59db79 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vaddv.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vaddv.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
- // RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
- // RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+ // RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+ // RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vandq.c b/clang/test/CodeGen/arm-mve-intrinsics/vandq.c
index 6fb52bd3a6d2..4f769505be91 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vandq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vandq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vbicq.c b/clang/test/CodeGen/arm-mve-intrinsics/vbicq.c
index 70f3cdaba7e1..dc70647a9c94 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vbicq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vbicq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vbrsrq.c b/clang/test/CodeGen/arm-mve-intrinsics/vbrsrq.c
index c802fd95bf65..a52415a112b5 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vbrsrq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vbrsrq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vcaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vcaddq.c
index 37b8458144ae..114ede2363e3 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vcaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vcaddq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vclz.c b/clang/test/CodeGen/arm-mve-intrinsics/vclz.c
index 776fe3a00dc5..1b9e221150c7 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vclz.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vclz.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vcmlaq.c b/clang/test/CodeGen/arm-mve-intrinsics/vcmlaq.c
index a142cd7282ea..1a9e7fdd0149 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vcmlaq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vcmlaq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vcmulq.c b/clang/test/CodeGen/arm-mve-intrinsics/vcmulq.c
index af5d573a6457..d648f7c55749 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vcmulq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vcmulq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c b/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c
index ffb0ae301c50..b2a6d0c1ea66 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -DPOLYMORPHIC -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -DPOLYMORPHIC -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vcvt_anpm.c b/clang/test/CodeGen/arm-mve-intrinsics/vcvt_anpm.c
index 39966e612011..cee865592ee5 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vcvt_anpm.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vcvt_anpm.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -DPOLYMORPHIC -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -DPOLYMORPHIC -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-imm-dyadic.c b/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-imm-dyadic.c
index f30269d83721..b56cefba8b66 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-imm-dyadic.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-imm-dyadic.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-imm.c b/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-imm.c
index 1c59851f1a7c..72a03ed8def3 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-imm.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-imm.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-var.c b/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-var.c
index 273f79a17a10..090d2c30c546 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-var.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vector-shift-var.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/veorq.c b/clang/test/CodeGen/arm-mve-intrinsics/veorq.c
index bdea310bca97..dbb1d225be7e 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/veorq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/veorq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
index 3cb641a8b2cf..46fc5c4b781b 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vhaddq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vhcaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vhcaddq.c
index 3bf0e0922f22..c4476d3756cf 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vhcaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vhcaddq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c
index 20bd665e0af5..3020d9267901 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vhsubq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c
index bcf176e4b854..03c870e28154 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vldr.c b/clang/test/CodeGen/arm-mve-intrinsics/vldr.c
index 33600a3d9a65..ec1f3616d078 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vldr.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vldr.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxaq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxaq.c
index 3caf51d5ccfc..99bdd187c899 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxaq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxaq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmaq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmaq.c
index 4443f88eeb44..613a390bc6d3 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmaq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmaq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c
index 93c7cf7ffefc..bad7cd903ab1 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
index 38c8239942c6..940c1e2133eb 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminaq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminaq.c
index 93ba9638bc3f..19c669542e46 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vminaq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vminaq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminnmaq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminnmaq.c
index 8fcd29f49769..0182cf7c5b6b 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vminnmaq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vminnmaq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c
index 131fffcf01ad..b48ff9d84b8f 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminq.c
index eb9c7284d443..7b7bad30bb15 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vminq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vminq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c
index 759065d8f179..df51dda8468d 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmldav.c b/clang/test/CodeGen/arm-mve-intrinsics/vmldav.c
index 1713e04c9920..1c6dba7e388f 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmldav.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmldav.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c b/clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c
index 76b9c815e607..50f101a2b3e9 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmlldav.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmovl.c b/clang/test/CodeGen/arm-mve-intrinsics/vmovl.c
index 35f45ec652b3..d22c068192ff 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmovl.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmovl.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmovn.c b/clang/test/CodeGen/arm-mve-intrinsics/vmovn.c
index 64611bbf6e9b..c290f9c7792b 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmovn.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmovn.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck --check-prefix=LE %s
-// RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck --check-prefix=BE %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck --check-prefix=LE %s
-// RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck --check-prefix=BE %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck --check-prefix=LE %s
+// RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck --check-prefix=BE %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck --check-prefix=LE %s
+// RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck --check-prefix=BE %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c
index 4fdf2dfda6b4..78b9bcada52d 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c
index beafd9cc3706..34ba9e0cce64 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmullbq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c
index 804cf07552f9..a368fa80a1a3 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulltq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
index 332985dec9e8..9d36a2d33486 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vornq.c b/clang/test/CodeGen/arm-mve-intrinsics/vornq.c
index 0813867cc085..f8db91cad6b8 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vornq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vornq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vorrq.c b/clang/test/CodeGen/arm-mve-intrinsics/vorrq.c
index f6482c9da38c..bc7921ecd144 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vorrq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vorrq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c
index b0f45388e562..8072c8c9627e 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqaddq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqdmlad.c b/clang/test/CodeGen/arm-mve-intrinsics/vqdmlad.c
index 9c4ed0792255..02324131584d 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqdmlad.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqdmlad.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c
index d9457376186b..ec2a5e18f623 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqdmulhq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqdmullbq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqdmullbq.c
index 361fcf8019d1..7b67ef281b7b 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqdmullbq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqdmullbq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqdmulltq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqdmulltq.c
index acc2b3c04cc3..b93dee02c4e0 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqdmulltq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqdmulltq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqmovn.c b/clang/test/CodeGen/arm-mve-intrinsics/vqmovn.c
index 9f8fd244d214..96a8aa2f9f8b 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqmovn.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqmovn.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -DPOLYMORPHIC -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -DPOLYMORPHIC -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c
index 3538b901f5eb..fb581b92816d 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqrdmulhq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c
index 51ab22ad991f..d95a828e45aa 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vqsubq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vrev.c b/clang/test/CodeGen/arm-mve-intrinsics/vrev.c
index 317789f72d41..d584e7e858b6 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vrev.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vrev.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c
index 119edb73e310..f6b55d736c5d 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vrhaddq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c
index 64f7e1a99ddf..9ad99c5f93b0 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vrnd.c b/clang/test/CodeGen/arm-mve-intrinsics/vrnd.c
index cca56e263584..3e625c739bde 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vrnd.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vrnd.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes='mem2reg,sroa,early-cse<>' | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vshlc.c b/clang/test/CodeGen/arm-mve-intrinsics/vshlc.c
index d4a1412faff4..8a84ff30caae 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vshlc.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vshlc.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c b/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
index 0a042b4e8c93..3d8777088970 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vsubq.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-none-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -emit-llvm -o - %s | opt -S -passes=sroa | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm-neon-vcadd.c b/clang/test/CodeGen/arm-neon-vcadd.c
index 9280d0b184d5..a1ef99a1f5a1 100644
--- a/clang/test/CodeGen/arm-neon-vcadd.c
+++ b/clang/test/CodeGen/arm-neon-vcadd.c
@@ -1,6 +1,6 @@
 // REQUIRES: arm-registered-target
 // RUN: %clang_cc1 -triple armv8.3a-arm-none-eabi -target-cpu generic \
-// RUN: -target-feature +fullfp16 -mfloat-abi soft -S -emit-llvm -o - %s | \
+// RUN: -target-feature +fullfp16 -mfloat-abi soft -emit-llvm -o - %s | \
 // RUN: opt -S -passes=sroa -o - | FileCheck %s
 
 #include <arm_neon.h>
diff --git a/clang/test/CodeGen/arm-neon-vld.c b/clang/test/CodeGen/arm-neon-vld.c
index 959c07cc2219..4ea44777345e 100644
--- a/clang/test/CodeGen/arm-neon-vld.c
+++ b/clang/test/CodeGen/arm-neon-vld.c
@@ -1,8 +1,8 @@
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:     -S -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | \
+// RUN:     -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | \
 // RUN:     FileCheck -check-prefixes=CHECK,CHECK-A64 %s
 // RUN: %clang_cc1 -triple armv8-none-linux-gnueabi -target-feature +neon \
-// RUN:     -target-feature +fp16 -S -disable-O0-optnone -emit-llvm -o - %s | \
+// RUN:     -target-feature +fp16 -disable-O0-optnone -emit-llvm -o - %s | \
 // RUN:     opt -S -passes=mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A32 %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
diff --git a/clang/test/CodeGen/arm-neon-vst.c b/clang/test/CodeGen/arm-neon-vst.c
index e68c95acd4ef..847c54bab3f2 100644
--- a/clang/test/CodeGen/arm-neon-vst.c
+++ b/clang/test/CodeGen/arm-neon-vst.c
@@ -1,8 +1,8 @@
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:     -S -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | \
+// RUN:     -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | \
 // RUN:     FileCheck -check-prefixes=CHECK,CHECK-A64 %s
 // RUN: %clang_cc1 -triple armv8-none-linux-gnueabi -target-feature +neon \
-// RUN:     -target-feature +fp16 -S -disable-O0-optnone -emit-llvm -o - %s | \
+// RUN:     -target-feature +fp16 -disable-O0-optnone -emit-llvm -o - %s | \
 // RUN:     opt -S -passes=mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A32 %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
diff --git a/clang/test/CodeGen/arm-sve-vector-bits-vscale-range.c b/clang/test/CodeGen/arm-sve-vector-bits-vscale-range.c
index 3ee819ac1578..bd424172a186 100644
--- a/clang/test/CodeGen/arm-sve-vector-bits-vscale-range.c
+++ b/clang/test/CodeGen/arm-sve-vector-bits-vscale-range.c
@@ -1,18 +1,18 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=1 -mvscale-max=1 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=1
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=2 -mvscale-max=2 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=2
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=4 -mvscale-max=4 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=4
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=8 -mvscale-max=8 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=8
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=16 -mvscale-max=16 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=16
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -mvscale-min=1 -mvscale-max=1 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=1
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -mvscale-min=2 -mvscale-max=2 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=2
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=1 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=1 --check-prefix=CHECK-NOMAX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=2 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=2 --check-prefix=CHECK-NOMAX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=4 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=4 --check-prefix=CHECK-NOMAX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=8 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=8 --check-prefix=CHECK-NOMAX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=16 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=16 --check-prefix=CHECK-NOMAX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -mvscale-min=1 -mvscale-max=0 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-UNBOUNDED
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=1 -mvscale-max=0 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-UNBOUNDED
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-NONE
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=1 -mvscale-max=1 -emit-llvm -o - %s | FileCheck %s -D#VBITS=1
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=2 -mvscale-max=2 -emit-llvm -o - %s | FileCheck %s -D#VBITS=2
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=4 -mvscale-max=4 -emit-llvm -o - %s | FileCheck %s -D#VBITS=4
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=8 -mvscale-max=8 -emit-llvm -o - %s | FileCheck %s -D#VBITS=8
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=16 -mvscale-max=16 -emit-llvm -o - %s | FileCheck %s -D#VBITS=16
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -mvscale-min=1 -mvscale-max=1 -emit-llvm -o - %s | FileCheck %s -D#VBITS=1
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -mvscale-min=2 -mvscale-max=2 -emit-llvm -o - %s | FileCheck %s -D#VBITS=2
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=1 -emit-llvm -o - %s | FileCheck %s -D#VBITS=1 --check-prefix=CHECK-NOMAX
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=2 -emit-llvm -o - %s | FileCheck %s -D#VBITS=2 --check-prefix=CHECK-NOMAX
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=4 -emit-llvm -o - %s | FileCheck %s -D#VBITS=4 --check-prefix=CHECK-NOMAX
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=8 -emit-llvm -o - %s | FileCheck %s -D#VBITS=8 --check-prefix=CHECK-NOMAX
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=16 -emit-llvm -o - %s | FileCheck %s -D#VBITS=16 --check-prefix=CHECK-NOMAX
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -mvscale-min=1 -mvscale-max=0 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-UNBOUNDED
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=1 -mvscale-max=0 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-UNBOUNDED
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-NONE
 
 // CHECK-LABEL: @func() #0
 // CHECK: attributes #0 = { {{.*}} vscale_range([[#VBITS]],[[#VBITS]]) {{.*}} }
diff --git a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
index 7910d6c496a6..555f8ccba7c3 100644
--- a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
+++ b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -triple armv8.1a-linux-gnu -target-abi apcs-gnu -target-feature +neon \
-// RUN:  -S -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,dce -S \
+// RUN:  -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,dce -S \
 // RUN:  | FileCheck %s --check-prefix=CHECK-ARM
 
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
-// RUN:  -target-feature +v8.1a -S -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,dce -S \
+// RUN:  -target-feature +v8.1a -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,dce -S \
 // RUN:  | FileCheck %s --check-prefix=CHECK-AARCH64
 
 // REQUIRES: arm-registered-target,aarch64-registered-target
diff --git a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c
index f8d83332ab01..5f1cb34e6603 100644
--- a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c
+++ b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
 // RUN: %clang_cc1 -triple armv8.2a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature -fullfp16 \
-// RUN: -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=sroa \
 // RUN: | FileCheck %s --check-prefixes=CHECK-NOFP16
 // RUN: %clang_cc1 -triple armv8a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature +fullfp16 \
-// RUN: -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=sroa \
 // RUN: | FileCheck %s --check-prefixes=CHECK-FP16
 
diff --git a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
index c62d1c9de0cb..59f56b988d2a 100644
--- a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple armv8.2a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature +fullfp16 \
-// RUN: -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg \
 // RUN: | FileCheck %s
 
diff --git a/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c
index 6ec078774b59..947f42cdd0de 100644
--- a/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c
+++ b/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +fullfp16 -target-feature +i8mm \
-// RUN: -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,sroa \
 // RUN: | FileCheck %s
 
diff --git a/clang/test/CodeGen/arm-vector_type-params-returns.c b/clang/test/CodeGen/arm-vector_type-params-returns.c
index a55aba9ce066..e21eb48fd4fe 100644
--- a/clang/test/CodeGen/arm-vector_type-params-returns.c
+++ b/clang/test/CodeGen/arm-vector_type-params-returns.c
@@ -1,16 +1,16 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
 
 // RUN: %clang_cc1 -DSVE_HEADER -triple aarch64 -target-feature +sve -emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
-// RUN: %clang_cc1 -DSVE_HEADER -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s
+// RUN: %clang_cc1 -DSVE_HEADER -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s
 
 // RUN: %clang_cc1 -DNEON_HEADER -triple aarch64 -target-feature +sve -emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
-// RUN: %clang_cc1 -DNEON_HEADER -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s
+// RUN: %clang_cc1 -DNEON_HEADER -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s
 
 // RUN: %clang_cc1 -DSVE_HEADER -DNEON_HEADER -triple aarch64 -target-feature +sve -emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
-// RUN: %clang_cc1 -DSVE_HEADER -DNEON_HEADER -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s
+// RUN: %clang_cc1 -DSVE_HEADER -DNEON_HEADER -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s
 
-// RUN: %clang_cc1 -DNEON_HEADER -DSVE_HEADER2  -triple aarch64 -target-feature +sve -emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
-// RUN: %clang_cc1 -DNEON_HEADER -DSVE_HEADER2 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s
+// RUN: %clang_cc1 -DNEON_HEADER -DSVE_HEADER2 -triple aarch64 -target-feature +sve -emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -DNEON_HEADER -DSVE_HEADER2 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/arm64-crc32.c b/clang/test/CodeGen/arm64-crc32.c
index bf0f77b4eb8d..63c422dcf36c 100644
--- a/clang/test/CodeGen/arm64-crc32.c
+++ b/clang/test/CodeGen/arm64-crc32.c
@@ -1,8 +1,8 @@
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +crc \
-// RUN:  -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN:  -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-windows -target-feature +crc \
-// RUN:  -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN:  -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 // RUN: %clang_cc1 -verify -emit-llvm-only -triple aarch64 -target-feature -crc %s
 
 #include <stdint.h>
diff --git a/clang/test/CodeGen/arm64-microsoft-status-reg.cpp b/clang/test/CodeGen/arm64-microsoft-status-reg.cpp
index f63265e2c4ca..5a942169394e 100644
--- a/clang/test/CodeGen/arm64-microsoft-status-reg.cpp
+++ b/clang/test/CodeGen/arm64-microsoft-status-reg.cpp
@@ -1,9 +1,9 @@
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple arm64-windows -fms-compatibility -emit-llvm -S \
+// RUN: %clang_cc1 -triple arm64-windows -fms-compatibility -S \
 // RUN: -o - %s | FileCheck %s -check-prefix CHECK-ASM
 
-// RUN: %clang_cc1 -triple arm64-darwin -fms-compatibility -emit-llvm -S \
+// RUN: %clang_cc1 -triple arm64-darwin -fms-compatibility -S \
 // RUN: -o - %s | FileCheck %s -check-prefix CHECK-ASM
 
 // RUN: %clang_cc1 -triple arm64-windows -fms-compatibility -emit-llvm \
diff --git a/clang/test/CodeGen/arm64-mte.c b/clang/test/CodeGen/arm64-mte.c
index 7dde23cfd8e7..ff6f8de5d4bb 100644
--- a/clang/test/CodeGen/arm64-mte.c
+++ b/clang/test/CodeGen/arm64-mte.c
@@ -1,6 +1,6 @@
 // Test memory tagging extension intrinsics
-// RUN: %clang_cc1 -triple aarch64 -target-feature +mte -O3 -S -emit-llvm -o - %s  | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64 -DMTE -O3 -S -emit-llvm -o - %s  | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +mte -O3 -emit-llvm -o - %s  | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -DMTE -O3 -emit-llvm -o - %s  | FileCheck %s
 #include <stddef.h>
 #include <arm_acle.h>
 
diff --git a/clang/test/CodeGen/arm64_vcopy.c b/clang/test/CodeGen/arm64_vcopy.c
index 436f104c910a..639bc864ecb7 100644
--- a/clang/test/CodeGen/arm64_vcopy.c
+++ b/clang/test/CodeGen/arm64_vcopy.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -disable-O0-optnone -emit-llvm %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -o - -disable-O0-optnone -emit-llvm %s | opt -S -passes=mem2reg | FileCheck %s
 
 // Test ARM64 SIMD copy vector element to vector element: vcopyq_lane*
 
diff --git a/clang/test/CodeGen/arm64_vcreate.c b/clang/test/CodeGen/arm64_vcreate.c
index 291ba4de92c1..2b6e8e443916 100644
--- a/clang/test/CodeGen/arm64_vcreate.c
+++ b/clang/test/CodeGen/arm64_vcreate.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -o - -emit-llvm %s | opt -S -passes=mem2reg | FileCheck %s
 // Test ARM64 SIMD vcreate intrinsics
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
diff --git a/clang/test/CodeGen/arm64_vdup.c b/clang/test/CodeGen/arm64_vdup.c
index 06f111dc97d2..f81ee5031495 100644
--- a/clang/test/CodeGen/arm64_vdup.c
+++ b/clang/test/CodeGen/arm64_vdup.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -o - -emit-llvm %s | FileCheck %s
 // Test ARM64 SIMD duplicate lane and n intrinsics
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
diff --git a/clang/test/CodeGen/arm64_vdupq_n_f64.c b/clang/test/CodeGen/arm64_vdupq_n_f64.c
index 35d4457cd31e..2da2d3bc8d07 100644
--- a/clang/test/CodeGen/arm64_vdupq_n_f64.c
+++ b/clang/test/CodeGen/arm64_vdupq_n_f64.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -disable-O0-optnone -emit-llvm %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -o - -disable-O0-optnone -emit-llvm %s | opt -S -passes=mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGen/arm_acle.c b/clang/test/CodeGen/arm_acle.c
index 00afaf15fded..1c41f1b5d23f 100644
--- a/clang/test/CodeGen/arm_acle.c
+++ b/clang/test/CodeGen/arm_acle.c
@@ -1,9 +1,9 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -ffreestanding -triple armv8a-none-eabi -target-feature +crc -target-feature +dsp -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch32
-// RUN: %clang_cc1 -ffreestanding -Wno-error=implicit-function-declaration -triple aarch64-none-elf -target-feature +neon -target-feature +crc -target-feature +crypto -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch64
-// RUN: %clang_cc1 -ffreestanding -triple aarch64-none-elf -target-feature +v8.3a -target-feature +crc -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch64,AArch6483
-// RUN: %clang_cc1 -ffreestanding -triple aarch64-none-elf -target-feature +v8.5a -target-feature +crc -target-feature +rand -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch64,AArch6483,AArch6485
-// RUN: %clang_cc1 -ffreestanding -triple aarch64-none-elf -target-feature +v9.4a -target-feature +crc -target-feature +rand -target-feature +d128 -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch64,AArch6483,AArch6485,AArch6494D128
+// RUN: %clang_cc1 -ffreestanding -triple armv8a-none-eabi -target-feature +crc -target-feature +dsp -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch32
+// RUN: %clang_cc1 -ffreestanding -Wno-error=implicit-function-declaration -triple aarch64-none-elf -target-feature +neon -target-feature +crc -target-feature +crypto -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch64
+// RUN: %clang_cc1 -ffreestanding -triple aarch64-none-elf -target-feature +v8.3a -target-feature +crc -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch64,AArch6483
+// RUN: %clang_cc1 -ffreestanding -triple aarch64-none-elf -target-feature +v8.5a -target-feature +crc -target-feature +rand -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch64,AArch6483,AArch6485
+// RUN: %clang_cc1 -ffreestanding -triple aarch64-none-elf -target-feature +v9.4a -target-feature +crc -target-feature +rand -target-feature +d128 -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch64,AArch6483,AArch6485,AArch6494D128
 
 
 #include <arm_acle.h>
diff --git a/clang/test/CodeGen/asan-new-pm.ll b/clang/test/CodeGen/asan-new-pm.ll
index 78d195b0ea24..93a5afad6ff8 100644
--- a/clang/test/CodeGen/asan-new-pm.ll
+++ b/clang/test/CodeGen/asan-new-pm.ll
@@ -1,6 +1,6 @@
 ; Test that ASan runs with the new pass manager
-; RUN: %clang_cc1 -triple x86_64-unknown-unknown -S -emit-llvm -o - -fsanitize=address %s | FileCheck %s
-; RUN: %clang_cc1 -triple x86_64-unknown-unknown -S -emit-llvm -o - -O1 -fsanitize=address %s | FileCheck %s
+; RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - -fsanitize=address %s | FileCheck %s
+; RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - -O1 -fsanitize=address %s | FileCheck %s
 
 ; CHECK-DAG: @llvm.global_ctors = {{.*}}@asan.module_ctor
 
diff --git a/clang/test/CodeGen/asm-label-inline-builtins.c b/clang/test/CodeGen/asm-label-inline-builtins.c
index ab9afc29411d..b6f046ee7895 100644
--- a/clang/test/CodeGen/asm-label-inline-builtins.c
+++ b/clang/test/CodeGen/asm-label-inline-builtins.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64 -S -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64 -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
 //
 // Verifies that clang-generated *.inline carry the same name at call and callee
 // site, in spite of asm labels.
diff --git a/clang/test/CodeGen/attr-alwaysinline.cpp b/clang/test/CodeGen/attr-alwaysinline.cpp
index 2873a3f5de96..7d2a219a9329 100644
--- a/clang/test/CodeGen/attr-alwaysinline.cpp
+++ b/clang/test/CodeGen/attr-alwaysinline.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
 
 bool bar();
 void f(bool, bool);
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c
index 323afb645912..f924b34bff55 100644
--- a/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c
+++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=1 -mvscale-max=1 -S -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-128
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=2 -mvscale-max=2 -S -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-256
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=4 -mvscale-max=4 -S -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-512
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -mvscale-min=1 -mvscale-max=1 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-128
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -mvscale-min=2 -mvscale-max=2 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-256
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -mvscale-min=4 -mvscale-max=4 -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-512
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-call.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-call.c
index 6685fe0b27b4..b1bc52b9578a 100644
--- a/clang/test/CodeGen/attr-arm-sve-vector-bits-call.c
+++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-call.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=4 -mvscale-max=4 -S -O1 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -mvscale-min=4 -mvscale-max=4 -O1 -emit-llvm -o - %s | FileCheck %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
index 640031a7b980..8be1123a2305 100644
--- a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
+++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -mvscale-min=4 -mvscale-max=4 -S -O1 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -mvscale-min=4 -mvscale-max=4 -O1 -emit-llvm -o - %s | FileCheck %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
index cc414bf766ef..5db75547abda 100644
--- a/clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
+++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-codegen.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=4 -mvscale-max=4 -S -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -mvscale-min=4 -mvscale-max=4 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c
index 11d4e8ae8a10..7858a7bf0026 100644
--- a/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c
+++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=1 -mvscale-max=1 -S -O1 -emit-llvm -o - %s -fhalf-no-semantic-interposition | FileCheck %s --check-prefix=CHECK-128
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=4 -mvscale-max=4 -S -O1 -emit-llvm -o - %s -fhalf-no-semantic-interposition | FileCheck %s --check-prefix=CHECK-512
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -mvscale-min=1 -mvscale-max=1 -O1 -emit-llvm -o - %s -fhalf-no-semantic-interposition | FileCheck %s --check-prefix=CHECK-128
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -mvscale-min=4 -mvscale-max=4 -O1 -emit-llvm -o - %s -fhalf-no-semantic-interposition | FileCheck %s --check-prefix=CHECK-512
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-types.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-types.c
index 8e662c22b303..c6d5d2d2cffd 100644
--- a/clang/test/CodeGen/attr-arm-sve-vector-bits-types.c
+++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-types.c
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=1 -mvscale-max=1 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-128
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=2 -mvscale-max=2 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-256
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=4 -mvscale-max=4 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-512
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=8 -mvscale-max=8 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-1024
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=16 -mvscale-max=16 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-2048
-// RUN: %clang_cc1 -triple aarch64_32-unknown-darwin -target-feature +sve -target-feature +bf16 -mvscale-min=4 -mvscale-max=4 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ILP32
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=1 -mvscale-max=1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-128
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=2 -mvscale-max=2 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-256
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=4 -mvscale-max=4 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-512
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=8 -mvscale-max=8 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-1024
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -mvscale-min=16 -mvscale-max=16 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-2048
+// RUN: %clang_cc1 -triple aarch64_32-unknown-darwin -target-feature +sve -target-feature +bf16 -mvscale-min=4 -mvscale-max=4 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ILP32
 
 // REQUIRES: aarch64-registered-target
 
diff --git a/clang/test/CodeGen/attr-btf_tag-typedef.c b/clang/test/CodeGen/attr-btf_tag-typedef.c
index 57aafde15893..1f00246ed36c 100644
--- a/clang/test/CodeGen/attr-btf_tag-typedef.c
+++ b/clang/test/CodeGen/attr-btf_tag-typedef.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -debug-info-kind=limited -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s
 
 #define __tag1 __attribute__((btf_decl_tag("tag1")))
 typedef struct { int a; } __s __tag1;
diff --git a/clang/test/CodeGen/attr-btf_type_tag-func-ptr.c b/clang/test/CodeGen/attr-btf_type_tag-func-ptr.c
index 29ca5f58e4b8..26935c882a01 100644
--- a/clang/test/CodeGen/attr-btf_type_tag-func-ptr.c
+++ b/clang/test/CodeGen/attr-btf_type_tag-func-ptr.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -debug-info-kind=limited -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s
 
 struct t {
  int (__attribute__((btf_type_tag("rcu"))) *f)();
diff --git a/clang/test/CodeGen/attr-btf_type_tag-func.c b/clang/test/CodeGen/attr-btf_type_tag-func.c
index c573d1147ccd..dbb886475914 100644
--- a/clang/test/CodeGen/attr-btf_type_tag-func.c
+++ b/clang/test/CodeGen/attr-btf_type_tag-func.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -debug-info-kind=limited -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple %itanium_abi_triple -DDOUBLE_BRACKET_ATTRS=1 -debug-info-kind=limited -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -DDOUBLE_BRACKET_ATTRS=1 -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s
 
 #if DOUBLE_BRACKET_ATTRS
 #define __tag1 [[clang::btf_type_tag("tag1")]]
diff --git a/clang/test/CodeGen/attr-btf_type_tag-similar-type.c b/clang/test/CodeGen/attr-btf_type_tag-similar-type.c
index ad9d16f3f631..3960d6f5c93f 100644
--- a/clang/test/CodeGen/attr-btf_type_tag-similar-type.c
+++ b/clang/test/CodeGen/attr-btf_type_tag-similar-type.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -debug-info-kind=limited -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s
 
 struct map_value {
         int __attribute__((btf_type_tag("tag1"))) __attribute__((btf_type_tag("tag3"))) *a;
diff --git a/clang/test/CodeGen/attr-btf_type_tag-typedef-field.c b/clang/test/CodeGen/attr-btf_type_tag-typedef-field.c
index c80c7e9b45d9..5c8955fbf89a 100644
--- a/clang/test/CodeGen/attr-btf_type_tag-typedef-field.c
+++ b/clang/test/CodeGen/attr-btf_type_tag-typedef-field.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -debug-info-kind=limited -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s
 
 #define __tag1 __attribute__((btf_type_tag("tag1")))
 #define __tag2 __attribute__((btf_type_tag("tag2")))
diff --git a/clang/test/CodeGen/attr-btf_type_tag-var.c b/clang/test/CodeGen/attr-btf_type_tag-var.c
index ccc4edd22c1c..ed729e245fbc 100644
--- a/clang/test/CodeGen/attr-btf_type_tag-var.c
+++ b/clang/test/CodeGen/attr-btf_type_tag-var.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -debug-info-kind=limited -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple %itanium_abi_triple -DDOUBLE_BRACKET_ATTRS=1 -debug-info-kind=limited -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -DDOUBLE_BRACKET_ATTRS=1 -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s
 
 #if DOUBLE_BRACKET_ATTRS
 #define __tag1 [[clang::btf_type_tag("tag1")]]
diff --git a/clang/test/CodeGen/attr-ifunc.c b/clang/test/CodeGen/attr-ifunc.c
index 2ad41edf20df..24d66433ae09 100644
--- a/clang/test/CodeGen/attr-ifunc.c
+++ b/clang/test/CodeGen/attr-ifunc.c
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -triple x86_64-windows -fsyntax-only -verify %s
-// RUN: %clang_cc1 -triple x86_64-linux -fsyntax-only -verify -emit-llvm-only -DCHECK_ALIASES %s
-// RUN: %clang_cc1 -triple x86_64-linux -fsyntax-only -verify -emit-llvm-only %s
-// RUN: %clang_cc1 -triple x86_64-apple-macosx -fsyntax-only -verify -emit-llvm-only %s
+// RUN: %clang_cc1 -triple x86_64-linux -verify -emit-llvm-only -DCHECK_ALIASES %s
+// RUN: %clang_cc1 -triple x86_64-linux -verify -emit-llvm-only %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx -verify -emit-llvm-only %s
 
 #if defined(_WIN32)
 void foo(void) {}
diff --git a/clang/test/CodeGen/attr-ifunc.cpp b/clang/test/CodeGen/attr-ifunc.cpp
index b6e342df46eb..9e6cd7312122 100644
--- a/clang/test/CodeGen/attr-ifunc.cpp
+++ b/clang/test/CodeGen/attr-ifunc.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -triple x86_64-linux -fsyntax-only -verify -emit-llvm-only %s
-// RUN: %clang_cc1 -triple x86_64-apple-macosx -fsyntax-only -verify -emit-llvm-only %s
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify -emit-llvm-only %s
+// RUN: %clang_cc1 -triple x86_64-linux -verify -emit-llvm-only %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx -verify -emit-llvm-only %s
+// RUN: %clang_cc1 -triple arm64-apple-macosx -verify -emit-llvm-only %s
 // RUN: not %clang_cc1 -triple x86_64-linux -emit-llvm-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
 // RUN: not %clang_cc1 -triple x86_64-apple-macosx -emit-llvm-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
 // RUN: not %clang_cc1 -triple arm64-apple-macosx -emit-llvm-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
diff --git a/clang/test/CodeGen/attr-mustprogress.c b/clang/test/CodeGen/attr-mustprogress.c
index b4f8710a9d69..2e8b871912e3 100644
--- a/clang/test/CodeGen/attr-mustprogress.c
+++ b/clang/test/CodeGen/attr-mustprogress.c
@@ -1,22 +1,22 @@
-// RUN: %clang_cc1 -std=c89 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
-// RUN: %clang_cc1 -std=c99 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
-// RUN: %clang_cc1 -std=c11 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C11 %s
-// RUN: %clang_cc1 -std=c18 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C11 %s
-// RUN: %clang_cc1 -std=c2x -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C11 %s
+// RUN: %clang_cc1 -std=c89 -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
+// RUN: %clang_cc1 -std=c99 -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
+// RUN: %clang_cc1 -std=c11 -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C11 %s
+// RUN: %clang_cc1 -std=c18 -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C11 %s
+// RUN: %clang_cc1 -std=c2x -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C11 %s
 //
 // Check -ffinite-loops option in combination with various standard versions.
-// RUN: %clang_cc1 -std=c89 -ffinite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=FINITE %s
-// RUN: %clang_cc1 -std=c99 -ffinite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=FINITE %s
-// RUN: %clang_cc1 -std=c11 -ffinite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=FINITE %s
-// RUN: %clang_cc1 -std=c18 -ffinite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=FINITE %s
-// RUN: %clang_cc1 -std=c2x -ffinite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=FINITE %s
+// RUN: %clang_cc1 -std=c89 -ffinite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=FINITE %s
+// RUN: %clang_cc1 -std=c99 -ffinite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=FINITE %s
+// RUN: %clang_cc1 -std=c11 -ffinite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=FINITE %s
+// RUN: %clang_cc1 -std=c18 -ffinite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=FINITE %s
+// RUN: %clang_cc1 -std=c2x -ffinite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=FINITE %s
 //
 // Check -fno-finite-loops option in combination with various standard versions.
-// RUN: %clang_cc1 -std=c89 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
-// RUN: %clang_cc1 -std=c99 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
-// RUN: %clang_cc1 -std=c11 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
-// RUN: %clang_cc1 -std=c18 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
-// RUN: %clang_cc1 -std=c2x -fno-finite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
+// RUN: %clang_cc1 -std=c89 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
+// RUN: %clang_cc1 -std=c99 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
+// RUN: %clang_cc1 -std=c11 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
+// RUN: %clang_cc1 -std=c18 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
+// RUN: %clang_cc1 -std=c2x -fno-finite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
 
 int a = 0;
 int b = 0;
@@ -30,7 +30,7 @@ int b = 0;
 // CHECK:       for.cond:
 // C99-NOT:       br {{.*}}!llvm.loop
 // C11-NOT:       br {{.*}}!llvm.loop
-// FINITE-NEXT:   br {{.*}}!llvm.loop
+// FINITE-NOR:    br {{.*}}!llvm.loop
 //
 void f0(void) {
   for (; ;) ;
@@ -45,7 +45,7 @@ void f0(void) {
 // CHECK:       for.body:
 // C99-NOT:       br {{.*}}, !llvm.loop
 // C11-NOT:       br {{.*}}, !llvm.loop
-// FINITE-NEXT:   br {{.*}}, !llvm.loop
+// FINITE-NOT:    br {{.*}}, !llvm.loop
 // CHECK:       for.end:
 // CHECK-NEXT:    ret void
 //
@@ -84,7 +84,7 @@ void f2(void) {
 // CHECK:       for.body:
 // C99-NOT:       br {{.*}}, !llvm.loop
 // C11-NOT:       br {{.*}}, !llvm.loop
-// FINITE-NEXT:   br {{.*}}, !llvm.loop
+// FINITE-NOT:    br {{.*}}, !llvm.loop
 // CHECK:       for.end:
 // CHECK-NEXT:    br label %for.cond1
 // CHECK:       for.cond1:
@@ -113,7 +113,7 @@ void F(void) {
 // CHECK:       while.body:
 // C99-NOT:       br {{.*}}, !llvm.loop
 // C11-NOT:       br {{.*}}, !llvm.loop
-// FINITE-NEXT:   br {{.*}}, !llvm.loop
+// FINITE-NOT:    br {{.*}}, !llvm.loop
 //
 void w1(void) {
   while (1) {
@@ -159,7 +159,7 @@ void w2(void) {
 // CHECK:       while.body2:
 // C99-NOT:       br {{.*}} !llvm.loop
 // C11-NOT:       br {{.*}} !llvm.loop
-// FINITE-NEXT:   br {{.*}} !llvm.loop
+// FINITE-NOT:    br {{.*}} !llvm.loop
 //
 void W(void) {
   while (a == b) {
@@ -177,7 +177,7 @@ void W(void) {
 // CHECK:       do.cond:
 // C99-NOT:       br {{.*}}, !llvm.loop
 // C11-NOT:       br {{.*}}, !llvm.loop
-// FINITE-NEXT:   br {{.*}}, !llvm.loop
+// FINITE-NOT:    br {{.*}}, !llvm.loop
 // CHECK:       do.end:
 // CHECK-NEXT:    ret void
 //
diff --git a/clang/test/CodeGen/attr-noinline.cpp b/clang/test/CodeGen/attr-noinline.cpp
index 93779c6873e5..f0588cfecf46 100644
--- a/clang/test/CodeGen/attr-noinline.cpp
+++ b/clang/test/CodeGen/attr-noinline.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
 
 bool bar();
 void f(bool, bool);
diff --git a/clang/test/CodeGen/attr-nomerge.cpp b/clang/test/CodeGen/attr-nomerge.cpp
index 2d015f616455..7305fb73cf1d 100644
--- a/clang/test/CodeGen/attr-nomerge.cpp
+++ b/clang/test/CodeGen/attr-nomerge.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
 
 class A {
 public:
diff --git a/clang/test/CodeGen/attr-noundef.cpp b/clang/test/CodeGen/attr-noundef.cpp
index d236b35fdfd7..e1cab091bfcb 100644
--- a/clang/test/CodeGen/attr-noundef.cpp
+++ b/clang/test/CodeGen/attr-noundef.cpp
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -triple x86_64-gnu-linux -x c++ -S -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-INTEL
-// RUN: %clang_cc1 -triple aarch64-gnu-linux -x c++ -S -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH
-// RUN: %clang_cc1 -triple x86_64-gnu-linux -x c++ -S -emit-llvm -fsanitize-memory-param-retval %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-INTEL
-// RUN: %clang_cc1 -triple aarch64-gnu-linux -x c++ -S -emit-llvm -fsanitize-memory-param-retval %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-INTEL
+// RUN: %clang_cc1 -triple aarch64-gnu-linux -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -x c++ -emit-llvm -fsanitize-memory-param-retval %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-INTEL
+// RUN: %clang_cc1 -triple aarch64-gnu-linux -x c++ -emit-llvm -fsanitize-memory-param-retval %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH
 
 // no-sanitize-memory-param-retval does NOT conflict with enable-noundef-analysis
-// RUN: %clang_cc1 -triple x86_64-gnu-linux -x c++ -S -emit-llvm -fno-sanitize-memory-param-retval %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-INTEL
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -x c++ -emit-llvm -fno-sanitize-memory-param-retval %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-INTEL
 
 //************ Passing structs by value
 // TODO: No structs may currently be marked noundef
diff --git a/clang/test/CodeGen/attr-nouwtable.c b/clang/test/CodeGen/attr-nouwtable.c
index a0c6d9232ef3..faf0b83f9ad6 100644
--- a/clang/test/CodeGen/attr-nouwtable.c
+++ b/clang/test/CodeGen/attr-nouwtable.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -funwind-tables=2 -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -funwind-tables=2 -emit-llvm %s -o - | FileCheck %s
 
 __attribute__((nouwtable))
 int test1(void) { return 0; }
diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c
index 8c8b951e9118..ad6079a91fcd 100644
--- a/clang/test/CodeGen/attr-target-clones-aarch64.c
+++ b/clang/test/CodeGen/attr-target-clones-aarch64.c
@@ -1,13 +1,15 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --check-globals --include-generated-funcs
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fmv -S -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-NOFMV
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fmv -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-NOFMV
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +mte -target-feature +bti -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-MTE-BTI
 
 int __attribute__((target_clones("lse+aes", "sve2"))) ftc(void) { return 0; }
 int __attribute__((target_clones("sha2", "sha2+memtag2", " default "))) ftc_def(void) { return 1; }
 int __attribute__((target_clones("sha2", "default"))) ftc_dup1(void) { return 2; }
 int __attribute__((target_clones("fp", "crc+dotprod"))) ftc_dup2(void) { return 3; }
+int __attribute__((target_clones("memtag2", "bti"))) ftc_dup3(void) { return 4; }
 int foo() {
-  return ftc() + ftc_def() + ftc_dup1() + ftc_dup2();
+  return ftc() + ftc_def() + ftc_dup1() + ftc_dup2() + ftc_dup3();
 }
 
 inline int __attribute__((target_clones("rng+simd", "rcpc+predres", "sve2-aes+wfxt"))) ftc_inline1(void) { return 1; }
@@ -29,6 +31,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK: @ftc_def.ifunc = weak_odr alias i32 (), ptr @ftc_def
 // CHECK: @ftc_dup1.ifunc = weak_odr alias i32 (), ptr @ftc_dup1
 // CHECK: @ftc_dup2.ifunc = weak_odr alias i32 (), ptr @ftc_dup2
+// CHECK: @ftc_dup3.ifunc = weak_odr alias i32 (), ptr @ftc_dup3
 // CHECK: @ftc_inline2.ifunc = weak_odr alias i32 (), ptr @ftc_inline2
 // CHECK: @ftc_inline1.ifunc = weak_odr alias i32 (), ptr @ftc_inline1
 // CHECK: @ftc_inline3.ifunc = weak_odr alias i32 (), ptr @ftc_inline3
@@ -36,10 +39,29 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK: @ftc_def = weak_odr ifunc i32 (), ptr @ftc_def.resolver
 // CHECK: @ftc_dup1 = weak_odr ifunc i32 (), ptr @ftc_dup1.resolver
 // CHECK: @ftc_dup2 = weak_odr ifunc i32 (), ptr @ftc_dup2.resolver
+// CHECK: @ftc_dup3 = weak_odr ifunc i32 (), ptr @ftc_dup3.resolver
 // CHECK: @ftc_inline1 = weak_odr ifunc i32 (), ptr @ftc_inline1.resolver
 // CHECK: @ftc_inline2 = weak_odr ifunc i32 (), ptr @ftc_inline2.resolver
 // CHECK: @ftc_inline3 = weak_odr ifunc i32 (), ptr @ftc_inline3.resolver
 //.
+// CHECK-MTE-BTI: @__aarch64_cpu_features = external dso_local global { i64 }
+// CHECK-MTE-BTI: @ftc.ifunc = weak_odr alias i32 (), ptr @ftc
+// CHECK-MTE-BTI: @ftc_def.ifunc = weak_odr alias i32 (), ptr @ftc_def
+// CHECK-MTE-BTI: @ftc_dup1.ifunc = weak_odr alias i32 (), ptr @ftc_dup1
+// CHECK-MTE-BTI: @ftc_dup2.ifunc = weak_odr alias i32 (), ptr @ftc_dup2
+// CHECK-MTE-BTI: @ftc_dup3.ifunc = weak_odr alias i32 (), ptr @ftc_dup3
+// CHECK-MTE-BTI: @ftc_inline2.ifunc = weak_odr alias i32 (), ptr @ftc_inline2
+// CHECK-MTE-BTI: @ftc_inline1.ifunc = weak_odr alias i32 (), ptr @ftc_inline1
+// CHECK-MTE-BTI: @ftc_inline3.ifunc = weak_odr alias i32 (), ptr @ftc_inline3
+// CHECK-MTE-BTI: @ftc = weak_odr ifunc i32 (), ptr @ftc.resolver
+// CHECK-MTE-BTI: @ftc_def = weak_odr ifunc i32 (), ptr @ftc_def.resolver
+// CHECK-MTE-BTI: @ftc_dup1 = weak_odr ifunc i32 (), ptr @ftc_dup1.resolver
+// CHECK-MTE-BTI: @ftc_dup2 = weak_odr ifunc i32 (), ptr @ftc_dup2.resolver
+// CHECK-MTE-BTI: @ftc_dup3 = weak_odr ifunc i32 (), ptr @ftc_dup3.resolver
+// CHECK-MTE-BTI: @ftc_inline1 = weak_odr ifunc i32 (), ptr @ftc_inline1.resolver
+// CHECK-MTE-BTI: @ftc_inline2 = weak_odr ifunc i32 (), ptr @ftc_inline2.resolver
+// CHECK-MTE-BTI: @ftc_inline3 = weak_odr ifunc i32 (), ptr @ftc_inline3.resolver
+//.
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: @ftc._MaesMlse(
 // CHECK-NEXT:  entry:
@@ -155,6 +177,40 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: @ftc_dup3._Mmemtag2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 4
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: @ftc_dup3._Mbti(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 4
+//
+//
+// CHECK-LABEL: @ftc_dup3.resolver(
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1125899906842624
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1125899906842624
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @ftc_dup3._Mbti
+// CHECK:       resolver_else:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 17592186044416
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 17592186044416
+// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK:       resolver_return1:
+// CHECK-NEXT:    ret ptr @ftc_dup3._Mmemtag2
+// CHECK:       resolver_else2:
+// CHECK-NEXT:    ret ptr @ftc_dup3.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: @foo(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CALL:%.*]] = call i32 @ftc()
@@ -164,7 +220,9 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]]
 // CHECK-NEXT:    [[CALL4:%.*]] = call i32 @ftc_dup2()
 // CHECK-NEXT:    [[ADD5:%.*]] = add nsw i32 [[ADD3]], [[CALL4]]
-// CHECK-NEXT:    ret i32 [[ADD5]]
+// CHECK-NEXT:    [[CALL6:%.*]] = call i32 @ftc_dup3()
+// CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 [[ADD5]], [[CALL6]]
+// CHECK-NEXT:    ret i32 [[ADD7]]
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
@@ -299,6 +357,12 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: @ftc_dup3.default(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 4
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: @ftc_inline2.default(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 2
@@ -371,6 +435,12 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
+// CHECK-NOFMV-LABEL: @ftc_dup3(
+// CHECK-NOFMV-NEXT:  entry:
+// CHECK-NOFMV-NEXT:    ret i32 4
+//
+//
+// CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: @foo(
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    [[CALL:%.*]] = call i32 @ftc()
@@ -380,7 +450,9 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-NOFMV-NEXT:    [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]]
 // CHECK-NOFMV-NEXT:    [[CALL4:%.*]] = call i32 @ftc_dup2()
 // CHECK-NOFMV-NEXT:    [[ADD5:%.*]] = add nsw i32 [[ADD3]], [[CALL4]]
-// CHECK-NOFMV-NEXT:    ret i32 [[ADD5]]
+// CHECK-NOFMV-NEXT:    [[CALL6:%.*]] = call i32 @ftc_dup3()
+// CHECK-NOFMV-NEXT:    [[ADD7:%.*]] = add nsw i32 [[ADD5]], [[CALL6]]
+// CHECK-NOFMV-NEXT:    ret i32 [[ADD7]]
 //
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
@@ -403,6 +475,354 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-NOFMV-NEXT:    [[ADD5:%.*]] = add nsw i32 [[ADD3]], [[CALL4]]
 // CHECK-NOFMV-NEXT:    ret i32 [[ADD5]]
 //
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc._MaesMlse(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 0
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc._Msve2(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 0
+//
+//
+// CHECK-MTE-BTI-LABEL: @ftc.resolver(
+// CHECK-MTE-BTI-NEXT:  resolver_entry:
+// CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 16512
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 16512
+// CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK-MTE-BTI:       resolver_return:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc._MaesMlse
+// CHECK-MTE-BTI:       resolver_else:
+// CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 68719476736
+// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 68719476736
+// CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK-MTE-BTI:       resolver_return1:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc._Msve2
+// CHECK-MTE-BTI:       resolver_else2:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc.default
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_def._Msha2(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 1
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_def._Mmemtag2Msha2(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 1
+//
+//
+// CHECK-MTE-BTI-LABEL: @ftc_def.resolver(
+// CHECK-MTE-BTI-NEXT:  resolver_entry:
+// CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 17592186048512
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 17592186048512
+// CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK-MTE-BTI:       resolver_return:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_def._Mmemtag2Msha2
+// CHECK-MTE-BTI:       resolver_else:
+// CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 4096
+// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 4096
+// CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK-MTE-BTI:       resolver_return1:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_def._Msha2
+// CHECK-MTE-BTI:       resolver_else2:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_def.default
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_dup1._Msha2(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 2
+//
+//
+// CHECK-MTE-BTI-LABEL: @ftc_dup1.resolver(
+// CHECK-MTE-BTI-NEXT:  resolver_entry:
+// CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 4096
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4096
+// CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK-MTE-BTI:       resolver_return:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_dup1._Msha2
+// CHECK-MTE-BTI:       resolver_else:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_dup1.default
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_dup2._Mfp(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 3
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_dup2._McrcMdotprod(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 3
+//
+//
+// CHECK-MTE-BTI-LABEL: @ftc_dup2.resolver(
+// CHECK-MTE-BTI-NEXT:  resolver_entry:
+// CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1040
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1040
+// CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK-MTE-BTI:       resolver_return:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_dup2._McrcMdotprod
+// CHECK-MTE-BTI:       resolver_else:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_dup2._Mfp
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_dup3._Mmemtag2(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 4
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_dup3._Mbti(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 4
+//
+//
+// CHECK-MTE-BTI-LABEL: @ftc_dup3.resolver(
+// CHECK-MTE-BTI-NEXT:  resolver_entry:
+// CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1125899906842624
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1125899906842624
+// CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK-MTE-BTI:       resolver_return:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_dup3._Mbti
+// CHECK-MTE-BTI:       resolver_else:
+// CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 17592186044416
+// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 17592186044416
+// CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK-MTE-BTI:       resolver_return1:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_dup3._Mmemtag2
+// CHECK-MTE-BTI:       resolver_else2:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_dup3.default
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @foo(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    [[CALL:%.*]] = call i32 @ftc()
+// CHECK-MTE-BTI-NEXT:    [[CALL1:%.*]] = call i32 @ftc_def()
+// CHECK-MTE-BTI-NEXT:    [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
+// CHECK-MTE-BTI-NEXT:    [[CALL2:%.*]] = call i32 @ftc_dup1()
+// CHECK-MTE-BTI-NEXT:    [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]]
+// CHECK-MTE-BTI-NEXT:    [[CALL4:%.*]] = call i32 @ftc_dup2()
+// CHECK-MTE-BTI-NEXT:    [[ADD5:%.*]] = add nsw i32 [[ADD3]], [[CALL4]]
+// CHECK-MTE-BTI-NEXT:    [[CALL6:%.*]] = call i32 @ftc_dup3()
+// CHECK-MTE-BTI-NEXT:    [[ADD7:%.*]] = add nsw i32 [[ADD5]], [[CALL6]]
+// CHECK-MTE-BTI-NEXT:    ret i32 [[ADD7]]
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_inline2._Mfp16(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 2
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_direct(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 4
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @main(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-MTE-BTI-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// CHECK-MTE-BTI-NEXT:    [[CALL:%.*]] = call i32 @ftc_inline1()
+// CHECK-MTE-BTI-NEXT:    [[CALL1:%.*]] = call i32 @ftc_inline2()
+// CHECK-MTE-BTI-NEXT:    [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
+// CHECK-MTE-BTI-NEXT:    [[CALL2:%.*]] = call i32 @ftc_inline3()
+// CHECK-MTE-BTI-NEXT:    [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]]
+// CHECK-MTE-BTI-NEXT:    [[CALL4:%.*]] = call i32 @ftc_direct()
+// CHECK-MTE-BTI-NEXT:    [[ADD5:%.*]] = add nsw i32 [[ADD3]], [[CALL4]]
+// CHECK-MTE-BTI-NEXT:    ret i32 [[ADD5]]
+//
+//
+// CHECK-MTE-BTI-LABEL: @ftc_inline1.resolver(
+// CHECK-MTE-BTI-NEXT:  resolver_entry:
+// CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 18014535948435456
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 18014535948435456
+// CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK-MTE-BTI:       resolver_return:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline1._Msve2-aesMwfxt
+// CHECK-MTE-BTI:       resolver_else:
+// CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632
+// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632
+// CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK-MTE-BTI:       resolver_return1:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline1._MpredresMrcpc
+// CHECK-MTE-BTI:       resolver_else2:
+// CHECK-MTE-BTI-NEXT:    [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 513
+// CHECK-MTE-BTI-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 513
+// CHECK-MTE-BTI-NEXT:    [[TMP11:%.*]] = and i1 true, [[TMP10]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]]
+// CHECK-MTE-BTI:       resolver_return3:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline1._MrngMsimd
+// CHECK-MTE-BTI:       resolver_else4:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline1.default
+//
+//
+// CHECK-MTE-BTI-LABEL: @ftc_inline2.resolver(
+// CHECK-MTE-BTI-NEXT:  resolver_entry:
+// CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 549757911040
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 549757911040
+// CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK-MTE-BTI:       resolver_return:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline2._MfcmaMsve2-bitperm
+// CHECK-MTE-BTI:       resolver_else:
+// CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 65536
+// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 65536
+// CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK-MTE-BTI:       resolver_return1:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline2._Mfp16
+// CHECK-MTE-BTI:       resolver_else2:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline2.default
+//
+//
+// CHECK-MTE-BTI-LABEL: @ftc_inline3.resolver(
+// CHECK-MTE-BTI-NEXT:  resolver_entry:
+// CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 70369817919488
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817919488
+// CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK-MTE-BTI:       resolver_return:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3._MsbMsve
+// CHECK-MTE-BTI:       resolver_else:
+// CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624
+// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624
+// CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK-MTE-BTI:       resolver_return1:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3._Mbti
+// CHECK-MTE-BTI:       resolver_else2:
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3.default
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_inline2._MfcmaMsve2-bitperm(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 2
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc.default(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 0
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_def.default(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 1
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_dup1.default(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 2
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_dup2.default(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 3
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_dup3.default(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 4
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_inline2.default(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 2
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_inline1._MrngMsimd(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 1
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_inline1._MpredresMrcpc(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 1
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_inline1._Msve2-aesMwfxt(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 1
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_inline1.default(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 1
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_inline3._Mbti(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 3
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_inline3._MsbMsve(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 3
+//
+//
+// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
+// CHECK-MTE-BTI-LABEL: @ftc_inline3.default(
+// CHECK-MTE-BTI-NEXT:  entry:
+// CHECK-MTE-BTI-NEXT:    ret i32 3
+//
 //.
 // CHECK: attributes #[[ATTR0:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon" }
 // CHECK: attributes #[[ATTR1:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2" }
@@ -410,21 +830,38 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK: attributes #[[ATTR3:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+mte,+neon,+sha2" }
 // CHECK: attributes #[[ATTR4:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" }
 // CHECK: attributes #[[ATTR5:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+dotprod,+fp-armv8,+neon" }
-// CHECK: attributes #[[ATTR6:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #[[ATTR7:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" }
-// CHECK: attributes #[[ATTR8:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+complxnum,+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-bitperm" }
-// CHECK: attributes #[[ATTR9:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rand" }
-// CHECK: attributes #[[ATTR10:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+predres,+rcpc" }
-// CHECK: attributes #[[ATTR11:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+wfxt" }
-// CHECK: attributes #[[ATTR12:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti" }
-// CHECK: attributes #[[ATTR13:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sb,+sve" }
+// CHECK: attributes #[[ATTR6:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mte" }
+// CHECK: attributes #[[ATTR7:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti" }
+// CHECK: attributes #[[ATTR8:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR9:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" }
+// CHECK: attributes #[[ATTR10:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+complxnum,+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-bitperm" }
+// CHECK: attributes #[[ATTR11:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rand" }
+// CHECK: attributes #[[ATTR12:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+predres,+rcpc" }
+// CHECK: attributes #[[ATTR13:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+wfxt" }
+// CHECK: attributes #[[ATTR14:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sb,+sve" }
 //.
 // CHECK-NOFMV: attributes #[[ATTR0:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" }
 // CHECK-NOFMV: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" }
 //.
+// CHECK-MTE-BTI: attributes #[[ATTR0:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,+fp-armv8,+lse,+mte,+neon" }
+// CHECK-MTE-BTI: attributes #[[ATTR1:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,+fp-armv8,+fullfp16,+mte,+neon,+sve,+sve2" }
+// CHECK-MTE-BTI: attributes #[[ATTR2:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,+fp-armv8,+mte,+neon,+sha2" }
+// CHECK-MTE-BTI: attributes #[[ATTR3:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,+fp-armv8,+mte,+neon" }
+// CHECK-MTE-BTI: attributes #[[ATTR4:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,+crc,+dotprod,+fp-armv8,+mte,+neon" }
+// CHECK-MTE-BTI: attributes #[[ATTR5:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,+mte" }
+// CHECK-MTE-BTI: attributes #[[ATTR6:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,+fp-armv8,+fullfp16,+mte,+neon" }
+// CHECK-MTE-BTI: attributes #[[ATTR7:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,+complxnum,+fp-armv8,+fullfp16,+mte,+neon,+sve,+sve2,+sve2-bitperm" }
+// CHECK-MTE-BTI: attributes #[[ATTR8:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,+fp-armv8,+mte,+neon,+rand" }
+// CHECK-MTE-BTI: attributes #[[ATTR9:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,+mte,+predres,+rcpc" }
+// CHECK-MTE-BTI: attributes #[[ATTR10:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,+fp-armv8,+fullfp16,+mte,+neon,+sve,+sve2,+sve2-aes,+wfxt" }
+// CHECK-MTE-BTI: attributes #[[ATTR11:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,+fp-armv8,+fullfp16,+mte,+neon,+sb,+sve" }
+//.
 // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
 // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
 //.
 // CHECK-NOFMV: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
 // CHECK-NOFMV: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
 //.
+// CHECK-MTE-BTI: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK-MTE-BTI: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c
index dd4cbbf5a898..3597711333d3 100644
--- a/clang/test/CodeGen/attr-target-version.c
+++ b/clang/test/CodeGen/attr-target-version.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -v9.5a -target-feature -fp-armv8 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fmv -S -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-NOFMV
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -v9.5a -target-feature -fp-armv8 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fmv -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-NOFMV
 
 int __attribute__((target_version("rng+flagm+fp16fml"))) fmv(void) { return 1; }
 int __attribute__((target_version("flagm2+sme-i16i64"))) fmv(void) { return 2; }
diff --git a/clang/test/CodeGen/blocks-windows.c b/clang/test/CodeGen/blocks-windows.c
index 315ae0475002..4379cd2e6b63 100644
--- a/clang/test/CodeGen/blocks-windows.c
+++ b/clang/test/CodeGen/blocks-windows.c
@@ -1,44 +1,44 @@
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
 
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
-// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
 
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
 
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
-// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
 
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
 
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
-// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -Os -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
 
 void *_Block_copy(void *);
 
diff --git a/clang/test/CodeGen/bpf-preserve-static-offset-arr.c b/clang/test/CodeGen/bpf-preserve-static-offset-arr.c
index 295bd2919fc6..4ea8c6f76f62 100644
--- a/clang/test/CodeGen/bpf-preserve-static-offset-arr.c
+++ b/clang/test/CodeGen/bpf-preserve-static-offset-arr.c
@@ -1,7 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: bpf-registered-target
-// RUN: %clang -cc1 -triple bpf -disable-llvm-passes -S -emit-llvm -o - %s \
-// RUN: | FileCheck %s
+// RUN: %clang_cc1 -triple bpf -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
 
 // Check that call to preserve.static.offset is generated when array
 // member of a struct marked with __attribute__((preserve_static_offset))
diff --git a/clang/test/CodeGen/bpf-preserve-static-offset-bitfield.c b/clang/test/CodeGen/bpf-preserve-static-offset-bitfield.c
index 5983e6d27721..b10bd6de1a57 100644
--- a/clang/test/CodeGen/bpf-preserve-static-offset-bitfield.c
+++ b/clang/test/CodeGen/bpf-preserve-static-offset-bitfield.c
@@ -1,7 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: bpf-registered-target
-// RUN: %clang -cc1 -triple bpfel -disable-llvm-passes -S -emit-llvm -o - %s \
-// RUN: | FileCheck %s
+// RUN: %clang_cc1 -triple bpfel -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
 
 // Check that call to preserve.static.offset is generated when bitfield
 // from a struct marked with __attribute__((preserve_static_offset)) is
diff --git a/clang/test/CodeGen/bpf-preserve-static-offset-lvalue.c b/clang/test/CodeGen/bpf-preserve-static-offset-lvalue.c
index 4f0c359366f5..c82ea192bcba 100644
--- a/clang/test/CodeGen/bpf-preserve-static-offset-lvalue.c
+++ b/clang/test/CodeGen/bpf-preserve-static-offset-lvalue.c
@@ -1,7 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: bpf-registered-target
-// RUN: %clang -cc1 -triple bpf -disable-llvm-passes -S -emit-llvm -o - %s \
-// RUN: | FileCheck %s
+// RUN: %clang_cc1 -triple bpf -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
 
 // Check that call to preserve.static.offset is generated when field of
 // a struct marked with __attribute__((preserve_static_offset)) is accessed.
diff --git a/clang/test/CodeGen/bpf-preserve-static-offset-non-bpf.c b/clang/test/CodeGen/bpf-preserve-static-offset-non-bpf.c
index 3fe8d2517fe3..0ddf603856a4 100644
--- a/clang/test/CodeGen/bpf-preserve-static-offset-non-bpf.c
+++ b/clang/test/CodeGen/bpf-preserve-static-offset-non-bpf.c
@@ -1,6 +1,5 @@
 // REQUIRES: x86-registered-target
-// RUN: %clang -cc1 -triple x86_64 -disable-llvm-passes -S -emit-llvm  -o - %s \
-// RUN: | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
 
 // Verify that __attribute__((preserve_static_offset))
 // has no effect for non-BPF target.
diff --git a/clang/test/CodeGen/bpf-preserve-static-offset-pai.c b/clang/test/CodeGen/bpf-preserve-static-offset-pai.c
index df1f33b1a664..e0d868df616e 100644
--- a/clang/test/CodeGen/bpf-preserve-static-offset-pai.c
+++ b/clang/test/CodeGen/bpf-preserve-static-offset-pai.c
@@ -1,7 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: bpf-registered-target
-// RUN: %clang -cc1 -triple bpf -disable-llvm-passes -S -emit-llvm -o - %s \
-// RUN: | FileCheck %s
+// RUN: %clang_cc1 -triple bpf -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
 
 // Verify that preserve_static_offset does not interfere with
 // preserve_access_index at IR generation stage.
diff --git a/clang/test/CodeGen/builtin-complex.c b/clang/test/CodeGen/builtin-complex.c
index 7f459b69b99b..93b25c56e01d 100644
--- a/clang/test/CodeGen/builtin-complex.c
+++ b/clang/test/CodeGen/builtin-complex.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple x86_64-linux -w -S -o - -emit-llvm -DT=float %s | FileCheck %s --check-prefixes=CHECK,CHECK-FLOAT
-// RUN: %clang_cc1 -triple x86_64-linux -w -S -o - -emit-llvm -DT=double %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOUBLE
-// RUN: %clang_cc1 -triple x86_64-linux -w -S -o - -emit-llvm -DT="long double" %s | FileCheck %s --check-prefixes=CHECK,CHECK-FP80
-// RUN: %clang_cc1 -triple x86_64-linux -w -S -o - -emit-llvm -DT=__float128 %s | FileCheck %s --check-prefixes=CHECK,CHECK-FP128
+// RUN: %clang_cc1 -triple x86_64-linux -w -o - -emit-llvm -DT=float %s | FileCheck %s --check-prefixes=CHECK,CHECK-FLOAT
+// RUN: %clang_cc1 -triple x86_64-linux -w -o - -emit-llvm -DT=double %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOUBLE
+// RUN: %clang_cc1 -triple x86_64-linux -w -o - -emit-llvm -DT="long double" %s | FileCheck %s --check-prefixes=CHECK,CHECK-FP80
+// RUN: %clang_cc1 -triple x86_64-linux -w -o - -emit-llvm -DT=__float128 %s | FileCheck %s --check-prefixes=CHECK,CHECK-FP128
 // FIXME: If we start to support _Complex __fp16 or _Complex _Float16, add tests for them too.
 
 // CHECK-FLOAT: @global ={{.*}} global { [[T:float]], [[T]] } { [[T]] 1.0{{.*}}, [[T]] 2.0{{.*}} }
diff --git a/clang/test/CodeGen/builtins-arm64.c b/clang/test/CodeGen/builtins-arm64.c
index 8bd68d9ceb48..0913295b0c5f 100644
--- a/clang/test/CodeGen/builtins-arm64.c
+++ b/clang/test/CodeGen/builtins-arm64.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple arm64-unknown-linux -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LINUX
-// RUN: %clang_cc1 -triple aarch64-windows -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-WIN
+// RUN: %clang_cc1 -triple aarch64-windows -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-WIN
 // RUN: %clang_cc1 -triple arm64_32-apple-ios13 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
 #include <stdint.h>
 
diff --git a/clang/test/CodeGen/builtins-bitint.c b/clang/test/CodeGen/builtins-bitint.c
new file mode 100644
index 000000000000..804e49712877
--- /dev/null
+++ b/clang/test/CodeGen/builtins-bitint.c
@@ -0,0 +1,126 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple arm-unknown-unknown -O0 -std=c23 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK-O0
+// RUN: %clang_cc1 -triple arm-unknown-unknown -O1 -std=c23 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK-O1
+
+// Verify that the result from the intrinsic call is zero extended to avoid that
+// we get a negative result from popcountg/ctzg/clzg.
+
+// CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_popcountg_ubi1(
+// CHECK-O0-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-O0-NEXT:  entry:
+// CHECK-O0-NEXT:    [[A:%.*]] = alloca i1, align 1
+// CHECK-O0-NEXT:    store i1 true, ptr [[A]], align 1
+// CHECK-O0-NEXT:    [[TMP0:%.*]] = load i1, ptr [[A]], align 1
+// CHECK-O0-NEXT:    [[TMP1:%.*]] = call i1 @llvm.ctpop.i1(i1 [[TMP0]])
+// CHECK-O0-NEXT:    [[CAST:%.*]] = zext i1 [[TMP1]] to i32
+// CHECK-O0-NEXT:    ret i32 [[CAST]]
+//
+// CHECK-O1-LABEL: define dso_local arm_aapcscc noundef i32 @test_popcountg_ubi1(
+// CHECK-O1-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-O1-NEXT:  entry:
+// CHECK-O1-NEXT:    ret i32 1
+//
+int test_popcountg_ubi1() {
+  unsigned _BitInt(1) a = 1uwb;
+  return __builtin_popcountg(a);
+}
+
+// CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_popcountg_ubi2(
+// CHECK-O0-SAME: ) #[[ATTR0]] {
+// CHECK-O0-NEXT:  entry:
+// CHECK-O0-NEXT:    [[A:%.*]] = alloca i2, align 1
+// CHECK-O0-NEXT:    store i2 -1, ptr [[A]], align 1
+// CHECK-O0-NEXT:    [[TMP0:%.*]] = load i2, ptr [[A]], align 1
+// CHECK-O0-NEXT:    [[TMP1:%.*]] = call i2 @llvm.ctpop.i2(i2 [[TMP0]])
+// CHECK-O0-NEXT:    [[CAST:%.*]] = zext i2 [[TMP1]] to i32
+// CHECK-O0-NEXT:    ret i32 [[CAST]]
+//
+// CHECK-O1-LABEL: define dso_local arm_aapcscc noundef i32 @test_popcountg_ubi2(
+// CHECK-O1-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-O1-NEXT:  entry:
+// CHECK-O1-NEXT:    ret i32 2
+//
+int test_popcountg_ubi2() {
+  unsigned _BitInt(2) a = 3uwb;
+  return __builtin_popcountg(a);
+}
+
+// CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_ctzg_ubi1(
+// CHECK-O0-SAME: ) #[[ATTR0]] {
+// CHECK-O0-NEXT:  entry:
+// CHECK-O0-NEXT:    [[A:%.*]] = alloca i1, align 1
+// CHECK-O0-NEXT:    store i1 false, ptr [[A]], align 1
+// CHECK-O0-NEXT:    [[TMP0:%.*]] = load i1, ptr [[A]], align 1
+// CHECK-O0-NEXT:    [[TMP1:%.*]] = call i1 @llvm.cttz.i1(i1 [[TMP0]], i1 false)
+// CHECK-O0-NEXT:    [[CAST:%.*]] = zext i1 [[TMP1]] to i32
+// CHECK-O0-NEXT:    ret i32 [[CAST]]
+//
+// CHECK-O1-LABEL: define dso_local arm_aapcscc noundef i32 @test_ctzg_ubi1(
+// CHECK-O1-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-O1-NEXT:  entry:
+// CHECK-O1-NEXT:    ret i32 1
+//
+int test_ctzg_ubi1() {
+  unsigned _BitInt(1) a = 0uwb;
+  return __builtin_ctzg(a);
+}
+
+// CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_ctzg_ubi2(
+// CHECK-O0-SAME: ) #[[ATTR0]] {
+// CHECK-O0-NEXT:  entry:
+// CHECK-O0-NEXT:    [[A:%.*]] = alloca i2, align 1
+// CHECK-O0-NEXT:    store i2 0, ptr [[A]], align 1
+// CHECK-O0-NEXT:    [[TMP0:%.*]] = load i2, ptr [[A]], align 1
+// CHECK-O0-NEXT:    [[TMP1:%.*]] = call i2 @llvm.cttz.i2(i2 [[TMP0]], i1 false)
+// CHECK-O0-NEXT:    [[CAST:%.*]] = zext i2 [[TMP1]] to i32
+// CHECK-O0-NEXT:    ret i32 [[CAST]]
+//
+// CHECK-O1-LABEL: define dso_local arm_aapcscc noundef i32 @test_ctzg_ubi2(
+// CHECK-O1-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-O1-NEXT:  entry:
+// CHECK-O1-NEXT:    ret i32 2
+//
+int test_ctzg_ubi2() {
+  unsigned _BitInt(2) a = 0uwb;
+  return __builtin_ctzg(a);
+}
+
+// CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_clzg_ubi1(
+// CHECK-O0-SAME: ) #[[ATTR0]] {
+// CHECK-O0-NEXT:  entry:
+// CHECK-O0-NEXT:    [[A:%.*]] = alloca i1, align 1
+// CHECK-O0-NEXT:    store i1 false, ptr [[A]], align 1
+// CHECK-O0-NEXT:    [[TMP0:%.*]] = load i1, ptr [[A]], align 1
+// CHECK-O0-NEXT:    [[TMP1:%.*]] = call i1 @llvm.ctlz.i1(i1 [[TMP0]], i1 false)
+// CHECK-O0-NEXT:    [[CAST:%.*]] = zext i1 [[TMP1]] to i32
+// CHECK-O0-NEXT:    ret i32 [[CAST]]
+//
+// CHECK-O1-LABEL: define dso_local arm_aapcscc noundef i32 @test_clzg_ubi1(
+// CHECK-O1-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-O1-NEXT:  entry:
+// CHECK-O1-NEXT:    ret i32 1
+//
+int test_clzg_ubi1() {
+  unsigned _BitInt(1) a = 0uwb;
+  return __builtin_clzg(a);
+}
+
+// CHECK-O0-LABEL: define dso_local arm_aapcscc i32 @test_clzg_ubi2(
+// CHECK-O0-SAME: ) #[[ATTR0]] {
+// CHECK-O0-NEXT:  entry:
+// CHECK-O0-NEXT:    [[A:%.*]] = alloca i2, align 1
+// CHECK-O0-NEXT:    store i2 0, ptr [[A]], align 1
+// CHECK-O0-NEXT:    [[TMP0:%.*]] = load i2, ptr [[A]], align 1
+// CHECK-O0-NEXT:    [[TMP1:%.*]] = call i2 @llvm.ctlz.i2(i2 [[TMP0]], i1 false)
+// CHECK-O0-NEXT:    [[CAST:%.*]] = zext i2 [[TMP1]] to i32
+// CHECK-O0-NEXT:    ret i32 [[CAST]]
+//
+// CHECK-O1-LABEL: define dso_local arm_aapcscc noundef i32 @test_clzg_ubi2(
+// CHECK-O1-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-O1-NEXT:  entry:
+// CHECK-O1-NEXT:    ret i32 2
+//
+int test_clzg_ubi2() {
+  unsigned _BitInt(2) a = 0uwb;
+  return __builtin_clzg(a);
+}
diff --git a/clang/test/CodeGen/builtins-elementwise-math.c b/clang/test/CodeGen/builtins-elementwise-math.c
index 1c667e5bff1e..1b5466abd347 100644
--- a/clang/test/CodeGen/builtins-elementwise-math.c
+++ b/clang/test/CodeGen/builtins-elementwise-math.c
@@ -604,6 +604,22 @@ void test_builtin_elementwise_sqrt(float f1, float f2, double d1, double d2,
   vf2 = __builtin_elementwise_sqrt(vf1);
 }
 
+void test_builtin_elementwise_tan(float f1, float f2, double d1, double d2,
+                                  float4 vf1, float4 vf2) {
+  // CHECK-LABEL: define void @test_builtin_elementwise_tan(
+  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
+  // CHECK-NEXT:  call float @llvm.tan.f32(float [[F1]])
+  f2 = __builtin_elementwise_tan(f1);
+
+  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
+  // CHECK-NEXT: call double @llvm.tan.f64(double [[D1]])
+  d2 = __builtin_elementwise_tan(d1);
+
+  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
+  // CHECK-NEXT: call <4 x float> @llvm.tan.v4f32(<4 x float> [[VF1]])
+  vf2 = __builtin_elementwise_tan(vf1);
+}
+
 void test_builtin_elementwise_trunc(float f1, float f2, double d1, double d2,
                                     float4 vf1, float4 vf2) {
   // CHECK-LABEL: define void @test_builtin_elementwise_trunc(
diff --git a/clang/test/CodeGen/builtins-nvptx-mma.cu b/clang/test/CodeGen/builtins-nvptx-mma.cu
index 5375d88032b7..cd71690d7e05 100644
--- a/clang/test/CodeGen/builtins-nvptx-mma.cu
+++ b/clang/test/CodeGen/builtins-nvptx-mma.cu
@@ -9,7 +9,7 @@
 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_80 \
 // RUN:            -fcuda-is-device -target-feature +ptx71 \
 // RUN:            -DPTX=71 -DSM=80 \
-// RUN:            -S -emit-llvm -o - -x cuda %s \
+// RUN:            -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefixes=CHECK_PTX70_SM80,CHECK_PTX60_SM70,CHECK_PTX63_SM72,CHECK_PTX61_SM70,CHECK_PTX63_SM75,CHECK_PTX71_SM80 %s
 // Verify that all builtins have correct constraints.
 // RUN: %clang_cc1 -triple nvptx-unknown-unknown \
diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c b/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
index 7e23d3354b02..3b9413ddd4a4 100644
--- a/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
+++ b/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
@@ -1,6 +1,6 @@
 // REQUIRES: nvptx-registered-target
 //
-// RUN: not %clang_cc1 -fsyntax-only -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
+// RUN: not %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
 // RUN:   sm_86 -target-feature +ptx72 -fcuda-is-device -x cuda -emit-llvm -o - %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK_ERROR %s
 
diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type.c b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
index 670127f6eb61..4aeae953bc16 100644
--- a/clang/test/CodeGen/builtins-nvptx-native-half-type.c
+++ b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
@@ -1,38 +1,38 @@
 // REQUIRES: nvptx-registered-target
 //
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
-// RUN:   sm_75 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \
+// RUN:   sm_75 -target-feature +ptx70 -fcuda-is-device -fnative-half-type \
 // RUN:   -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM75 %s
 
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
-// RUN:   sm_80 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \
+// RUN:   sm_80 -target-feature +ptx70 -fcuda-is-device -fnative-half-type \
 // RUN:   -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 %s
 
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \
 // RUN:   -target-cpu sm_80 -target-feature +ptx70 -fcuda-is-device \
-// RUN:   -fnative-half-type -S -emit-llvm -o - -x cuda %s \
+// RUN:   -fnative-half-type -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 %s
 
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
-// RUN:   sm_86 -target-feature +ptx72 -fcuda-is-device -fnative-half-type -S \
+// RUN:   sm_86 -target-feature +ptx72 -fcuda-is-device -fnative-half-type \
 // RUN:   -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 %s
 
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \
 // RUN:   -target-cpu sm_86 -target-feature +ptx72 -fcuda-is-device \
-// RUN:   -fnative-half-type -S -emit-llvm -o - -x cuda %s \
+// RUN:   -fnative-half-type -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 %s
 
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
-// RUN:   sm_53 -target-feature +ptx42 -fcuda-is-device -fnative-half-type -S \
+// RUN:   sm_53 -target-feature +ptx42 -fcuda-is-device -fnative-half-type \
 // RUN:   -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX42_SM53 %s
 
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \
 // RUN:   -target-cpu sm_53 -target-feature +ptx42 -fcuda-is-device \
-// RUN:   -fnative-half-type -S -emit-llvm -o - -x cuda %s \
+// RUN:   -fnative-half-type -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX42_SM53 %s
 
 #define __device__ __attribute__((device))
diff --git a/clang/test/CodeGen/builtins-nvptx-ptx50.cu b/clang/test/CodeGen/builtins-nvptx-ptx50.cu
index c297c0a58abf..a2d527537aed 100644
--- a/clang/test/CodeGen/builtins-nvptx-ptx50.cu
+++ b/clang/test/CodeGen/builtins-nvptx-ptx50.cu
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_60 \
-// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK %s
 //
 // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_50 \
diff --git a/clang/test/CodeGen/builtins-nvptx-ptx60.cu b/clang/test/CodeGen/builtins-nvptx-ptx60.cu
index afbe0a45b091..599d09a20e04 100644
--- a/clang/test/CodeGen/builtins-nvptx-ptx60.cu
+++ b/clang/test/CodeGen/builtins-nvptx-ptx60.cu
@@ -1,14 +1,14 @@
 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_70 \
 // RUN:            -fcuda-is-device -target-feature +ptx60 \
-// RUN:            -S -emit-llvm -o - -x cuda %s \
+// RUN:            -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK %s
 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_80 \
 // RUN:            -fcuda-is-device -target-feature +ptx65 \
-// RUN:            -S -emit-llvm -o - -x cuda %s \
+// RUN:            -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK %s
 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_80 \
 // RUN:            -fcuda-is-device -target-feature +ptx70 \
-// RUN:            -S -emit-llvm -o - -x cuda %s \
+// RUN:            -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK %s
 // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_70 \
 // RUN:   -fcuda-is-device -S -o /dev/null -x cuda -verify %s
diff --git a/clang/test/CodeGen/builtins-nvptx-sm_70.cu b/clang/test/CodeGen/builtins-nvptx-sm_70.cu
index 9de9a70190e2..3b3ee803f464 100644
--- a/clang/test/CodeGen/builtins-nvptx-sm_70.cu
+++ b/clang/test/CodeGen/builtins-nvptx-sm_70.cu
@@ -1,15 +1,15 @@
 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_70 \
 // RUN:            -fcuda-is-device -target-feature +ptx60 \
-// RUN:            -S -emit-llvm -o - -x cuda %s \
+// RUN:            -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK_M16 %s
 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_70 \
 // RUN:            -fcuda-is-device -target-feature +ptx61 -DPTX61 \
-// RUN:            -S -emit-llvm -o - -x cuda %s \
+// RUN:            -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefixes=CHECK_M16,CHECK_M32_M8 %s
 // Make sure builtins still work with the latest combination of GPU & PTX.
 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_86 \
 // RUN:            -fcuda-is-device -target-feature +ptx72 -DPTX61 \
-// RUN:            -S -emit-llvm -o - -x cuda %s \
+// RUN:            -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefixes=CHECK_M16,CHECK_M32_M8 %s
 // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \
 // RUN:   -DPTX61 -fcuda-is-device -S -o /dev/null -x cuda -verify=pre-sm_70 %s
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index 4dba7670b5c4..75b9d6d1fe19 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -1,26 +1,26 @@
 // REQUIRES: nvptx-registered-target
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
-// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
-// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
-// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
-// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 -target-feature +ptx62 \
-// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
 // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -target-feature +ptx62 \
 // RUN:   -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
-// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
-// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
 
 #define __device__ __attribute__((device))
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
index 9a323da9a8e8..bcb15969de1c 100644
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple wasm32-unknown-unknown -target-feature +reference-types -target-feature +simd128 -target-feature +relaxed-simd -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY32
-// RUN: %clang_cc1 -triple wasm64-unknown-unknown -target-feature +reference-types -target-feature +simd128 -target-feature +relaxed-simd -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY64
+// RUN: %clang_cc1 -triple wasm32-unknown-unknown -target-feature +reference-types -target-feature +simd128 -target-feature +relaxed-simd -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -target-feature +half-precision -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY32
+// RUN: %clang_cc1 -triple wasm64-unknown-unknown -target-feature +reference-types -target-feature +simd128 -target-feature +relaxed-simd -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -target-feature +half-precision -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY64
 // RUN: not %clang_cc1 -triple wasm64-unknown-unknown -target-feature +reference-types -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -flax-vector-conversions=none -O3 -emit-llvm -o - %s 2>&1 | FileCheck %s -check-prefixes MISSING-SIMD
 
 // SIMD convenience types
@@ -802,6 +802,17 @@ f32x4 relaxed_dot_bf16x8_add_f32_f32x4(u16x8 a, u16x8 b, f32x4 c) {
   // WEBASSEMBLY-NEXT: ret
 }
 
+float load_f16_f32(__fp16 *addr) {
+  return __builtin_wasm_loadf16_f32(addr);
+  // WEBASSEMBLY: call float @llvm.wasm.loadf16.f32(ptr %{{.*}})
+}
+
+void store_f16_f32(float val, __fp16 *addr) {
+  return __builtin_wasm_storef16_f32(val, addr);
+  // WEBASSEMBLY: tail call void @llvm.wasm.storef16.f32(float %val, ptr %{{.*}})
+  // WEBASSEMBLY-NEXT: ret
+}
+
 __externref_t externref_null() {
   return __builtin_wasm_ref_null_extern();
   // WEBASSEMBLY: tail call ptr addrspace(10) @llvm.wasm.ref.null.extern()
diff --git a/clang/test/CodeGen/builtins.c b/clang/test/CodeGen/builtins.c
index 407e0857d223..b41efb59e61d 100644
--- a/clang/test/CodeGen/builtins.c
+++ b/clang/test/CodeGen/builtins.c
@@ -949,12 +949,12 @@ void test_builtin_popcountg(unsigned char uc, unsigned short us,
   pop = __builtin_popcountg(uc);
   // CHECK: %1 = load i8, ptr %uc.addr, align 1
   // CHECK-NEXT: %2 = call i8 @llvm.ctpop.i8(i8 %1)
-  // CHECK-NEXT: %cast = sext i8 %2 to i32
+  // CHECK-NEXT: %cast = zext i8 %2 to i32
   // CHECK-NEXT: store volatile i32 %cast, ptr %pop, align 4
   pop = __builtin_popcountg(us);
   // CHECK-NEXT: %3 = load i16, ptr %us.addr, align 2
   // CHECK-NEXT: %4 = call i16 @llvm.ctpop.i16(i16 %3)
-  // CHECK-NEXT: %cast1 = sext i16 %4 to i32
+  // CHECK-NEXT: %cast1 = zext i16 %4 to i32
   // CHECK-NEXT: store volatile i32 %cast1, ptr %pop, align 4
   pop = __builtin_popcountg(ui);
   // CHECK-NEXT: %5 = load i32, ptr %ui.addr, align 4
@@ -992,12 +992,12 @@ void test_builtin_clzg(unsigned char uc, unsigned short us, unsigned int ui,
   lz = __builtin_clzg(uc);
   // CHECK: %1 = load i8, ptr %uc.addr, align 1
   // CHECK-NEXT: %2 = call i8 @llvm.ctlz.i8(i8 %1, i1 true)
-  // CHECK-NEXT: %cast = sext i8 %2 to i32
+  // CHECK-NEXT: %cast = zext i8 %2 to i32
   // CHECK-NEXT: store volatile i32 %cast, ptr %lz, align 4
   lz = __builtin_clzg(us);
   // CHECK-NEXT: %3 = load i16, ptr %us.addr, align 2
   // CHECK-NEXT: %4 = call i16 @llvm.ctlz.i16(i16 %3, i1 true)
-  // CHECK-NEXT: %cast1 = sext i16 %4 to i32
+  // CHECK-NEXT: %cast1 = zext i16 %4 to i32
   // CHECK-NEXT: store volatile i32 %cast1, ptr %lz, align 4
   lz = __builtin_clzg(ui);
   // CHECK-NEXT: %5 = load i32, ptr %ui.addr, align 4
@@ -1026,7 +1026,7 @@ void test_builtin_clzg(unsigned char uc, unsigned short us, unsigned int ui,
   lz = __builtin_clzg(uc, sc);
   // CHECK-NEXT: %15 = load i8, ptr %uc.addr, align 1
   // CHECK-NEXT: %16 = call i8 @llvm.ctlz.i8(i8 %15, i1 true)
-  // CHECK-NEXT: %cast6 = sext i8 %16 to i32
+  // CHECK-NEXT: %cast6 = zext i8 %16 to i32
   // CHECK-NEXT: %iszero = icmp eq i8 %15, 0
   // CHECK-NEXT: %17 = load i8, ptr %sc.addr, align 1
   // CHECK-NEXT: %conv = sext i8 %17 to i32
@@ -1035,7 +1035,7 @@ void test_builtin_clzg(unsigned char uc, unsigned short us, unsigned int ui,
   lz = __builtin_clzg(us, uc);
   // CHECK-NEXT: %18 = load i16, ptr %us.addr, align 2
   // CHECK-NEXT: %19 = call i16 @llvm.ctlz.i16(i16 %18, i1 true)
-  // CHECK-NEXT: %cast7 = sext i16 %19 to i32
+  // CHECK-NEXT: %cast7 = zext i16 %19 to i32
   // CHECK-NEXT: %iszero8 = icmp eq i16 %18, 0
   // CHECK-NEXT: %20 = load i8, ptr %uc.addr, align 1
   // CHECK-NEXT: %conv9 = zext i8 %20 to i32
@@ -1094,12 +1094,12 @@ void test_builtin_ctzg(unsigned char uc, unsigned short us, unsigned int ui,
   tz = __builtin_ctzg(uc);
   // CHECK: %1 = load i8, ptr %uc.addr, align 1
   // CHECK-NEXT: %2 = call i8 @llvm.cttz.i8(i8 %1, i1 true)
-  // CHECK-NEXT: %cast = sext i8 %2 to i32
+  // CHECK-NEXT: %cast = zext i8 %2 to i32
   // CHECK-NEXT: store volatile i32 %cast, ptr %tz, align 4
   tz = __builtin_ctzg(us);
   // CHECK-NEXT: %3 = load i16, ptr %us.addr, align 2
   // CHECK-NEXT: %4 = call i16 @llvm.cttz.i16(i16 %3, i1 true)
-  // CHECK-NEXT: %cast1 = sext i16 %4 to i32
+  // CHECK-NEXT: %cast1 = zext i16 %4 to i32
   // CHECK-NEXT: store volatile i32 %cast1, ptr %tz, align 4
   tz = __builtin_ctzg(ui);
   // CHECK-NEXT: %5 = load i32, ptr %ui.addr, align 4
@@ -1128,7 +1128,7 @@ void test_builtin_ctzg(unsigned char uc, unsigned short us, unsigned int ui,
   tz = __builtin_ctzg(uc, sc);
   // CHECK-NEXT: %15 = load i8, ptr %uc.addr, align 1
   // CHECK-NEXT: %16 = call i8 @llvm.cttz.i8(i8 %15, i1 true)
-  // CHECK-NEXT: %cast6 = sext i8 %16 to i32
+  // CHECK-NEXT: %cast6 = zext i8 %16 to i32
   // CHECK-NEXT: %iszero = icmp eq i8 %15, 0
   // CHECK-NEXT: %17 = load i8, ptr %sc.addr, align 1
   // CHECK-NEXT: %conv = sext i8 %17 to i32
@@ -1137,7 +1137,7 @@ void test_builtin_ctzg(unsigned char uc, unsigned short us, unsigned int ui,
   tz = __builtin_ctzg(us, uc);
   // CHECK-NEXT: %18 = load i16, ptr %us.addr, align 2
   // CHECK-NEXT: %19 = call i16 @llvm.cttz.i16(i16 %18, i1 true)
-  // CHECK-NEXT: %cast7 = sext i16 %19 to i32
+  // CHECK-NEXT: %cast7 = zext i16 %19 to i32
   // CHECK-NEXT: %iszero8 = icmp eq i16 %18, 0
   // CHECK-NEXT: %20 = load i8, ptr %uc.addr, align 1
   // CHECK-NEXT: %conv9 = zext i8 %20 to i32
diff --git a/clang/test/CodeGen/callback_pthread_create.c b/clang/test/CodeGen/callback_pthread_create.c
index 31bc2e9098f4..a1f16f3dd9a3 100644
--- a/clang/test/CodeGen/callback_pthread_create.c
+++ b/clang/test/CodeGen/callback_pthread_create.c
@@ -2,7 +2,7 @@
 // RUN: false
 // XFAIL: *
 
-// RUN: %clang_cc1 %s -S -emit-llvm -o - -disable-llvm-optzns | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -disable-llvm-optzns | FileCheck %s
 
 // CHECK: declare !callback ![[cid:[0-9]+]] {{.*}}i32 @pthread_create
 // CHECK: ![[cid]] = !{![[cidb:[0-9]+]]}
diff --git a/clang/test/CodeGen/cf-runtime-abi.c b/clang/test/CodeGen/cf-runtime-abi.c
index 2ff9103e68bb..508c5bbf09c9 100644
--- a/clang/test/CodeGen/cf-runtime-abi.c
+++ b/clang/test/CodeGen/cf-runtime-abi.c
@@ -1,42 +1,42 @@
-// RUN: %clang_cc1 -triple x86_64-apple-macosx -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC
-// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC-LLP64
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC
+// RUN: %clang_cc1 -triple x86_64-apple-macosx -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC-LLP64
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC
 
-// RUN: %clang_cc1 -triple x86_64-apple-macosx -fcf-runtime-abi=objc -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC
-// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fcf-runtime-abi=objc -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC-LLP64
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcf-runtime-abi=objc -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC
+// RUN: %clang_cc1 -triple x86_64-apple-macosx -fcf-runtime-abi=objc -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fcf-runtime-abi=objc -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC-LLP64
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcf-runtime-abi=objc -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC
 
-// RUN: %clang_cc1 -triple x86_64-apple-macosx -fcf-runtime-abi=standalone -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC
-// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fcf-runtime-abi=standalone -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC-LLP64
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcf-runtime-abi=standalone -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC
+// RUN: %clang_cc1 -triple x86_64-apple-macosx -fcf-runtime-abi=standalone -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fcf-runtime-abi=standalone -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC-LLP64
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcf-runtime-abi=standalone -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-OBJC
 
-// RUN: %clang_cc1 -triple x86_64-apple-macosx -fcf-runtime-abi=swift -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-64
-// RUN: %clang_cc1 -triple aarch64-apple-ios -fcf-runtime-abi=swift -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-64
-// RUN: %clang_cc1 -triple armv7k-apple-watchos -fcf-runtime-abi=swift -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-32
-// RUN: %clang_cc1 -triple armv7-apple-tvos -fcf-runtime-abi=swift -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-32
-// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fcf-runtime-abi=swift -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-5_0-64
-// RUN: %clang_cc1 -triple armv7-unknown-linux-android -fcf-runtime-abi=swift -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-5_0-32
+// RUN: %clang_cc1 -triple x86_64-apple-macosx -fcf-runtime-abi=swift -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-64
+// RUN: %clang_cc1 -triple aarch64-apple-ios -fcf-runtime-abi=swift -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-64
+// RUN: %clang_cc1 -triple armv7k-apple-watchos -fcf-runtime-abi=swift -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-32
+// RUN: %clang_cc1 -triple armv7-apple-tvos -fcf-runtime-abi=swift -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-32
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fcf-runtime-abi=swift -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-5_0-64
+// RUN: %clang_cc1 -triple armv7-unknown-linux-android -fcf-runtime-abi=swift -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-5_0-32
 
-// RUN: %clang_cc1 -triple x86_64-apple-macosx -fcf-runtime-abi=swift-5.0 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-64
-// RUN: %clang_cc1 -triple aarch64-apple-ios -fcf-runtime-abi=swift-5.0 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-64
-// RUN: %clang_cc1 -triple armv7k-apple-watchos -fcf-runtime-abi=swift-5.0 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-32
-// RUN: %clang_cc1 -triple armv7-apple-tvos -fcf-runtime-abi=swift-5.0 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-32
-// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fcf-runtime-abi=swift-5.0 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-5_0-64
-// RUN: %clang_cc1 -triple armv7-unknown-linux-android -fcf-runtime-abi=swift-5.0 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-5_0-32
+// RUN: %clang_cc1 -triple x86_64-apple-macosx -fcf-runtime-abi=swift-5.0 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-64
+// RUN: %clang_cc1 -triple aarch64-apple-ios -fcf-runtime-abi=swift-5.0 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-64
+// RUN: %clang_cc1 -triple armv7k-apple-watchos -fcf-runtime-abi=swift-5.0 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-32
+// RUN: %clang_cc1 -triple armv7-apple-tvos -fcf-runtime-abi=swift-5.0 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-5_0-32
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fcf-runtime-abi=swift-5.0 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-5_0-64
+// RUN: %clang_cc1 -triple armv7-unknown-linux-android -fcf-runtime-abi=swift-5.0 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-5_0-32
 
-// RUN: %clang_cc1 -triple x86_64-apple-macosx -fcf-runtime-abi=swift-4.2 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_2-64
-// RUN: %clang_cc1 -triple aarch64-apple-ios -fcf-runtime-abi=swift-4.2 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_2-64
-// RUN: %clang_cc1 -triple armv7k-apple-watchos -fcf-runtime-abi=swift-4.2 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_2-32
-// RUN: %clang_cc1 -triple armv7-apple-tvos -fcf-runtime-abi=swift-4.2 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_2-32
-// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fcf-runtime-abi=swift-4.2 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-4_2-64
-// RUN: %clang_cc1 -triple armv7-unknown-linux-android -fcf-runtime-abi=swift-4.2 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-4_2-32
+// RUN: %clang_cc1 -triple x86_64-apple-macosx -fcf-runtime-abi=swift-4.2 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_2-64
+// RUN: %clang_cc1 -triple aarch64-apple-ios -fcf-runtime-abi=swift-4.2 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_2-64
+// RUN: %clang_cc1 -triple armv7k-apple-watchos -fcf-runtime-abi=swift-4.2 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_2-32
+// RUN: %clang_cc1 -triple armv7-apple-tvos -fcf-runtime-abi=swift-4.2 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_2-32
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fcf-runtime-abi=swift-4.2 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-4_2-64
+// RUN: %clang_cc1 -triple armv7-unknown-linux-android -fcf-runtime-abi=swift-4.2 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-4_2-32
 
-// RUN: %clang_cc1 -triple x86_64-apple-macosx -fcf-runtime-abi=swift-4.1 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_1-64
-// RUN: %clang_cc1 -triple aarch64-apple-ios -fcf-runtime-abi=swift-4.1 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_1-64
-// RUN: %clang_cc1 -triple armv7k-apple-watchos -fcf-runtime-abi=swift-4.1 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_1-32
-// RUN: %clang_cc1 -triple armv7-apple-tvos -fcf-runtime-abi=swift-4.1 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_1-32
-// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fcf-runtime-abi=swift-4.1 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-4_1-64
-// RUN: %clang_cc1 -triple armv7-unknown-linux-android -fcf-runtime-abi=swift-4.1 -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-4_1-32
+// RUN: %clang_cc1 -triple x86_64-apple-macosx -fcf-runtime-abi=swift-4.1 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_1-64
+// RUN: %clang_cc1 -triple aarch64-apple-ios -fcf-runtime-abi=swift-4.1 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_1-64
+// RUN: %clang_cc1 -triple armv7k-apple-watchos -fcf-runtime-abi=swift-4.1 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_1-32
+// RUN: %clang_cc1 -triple armv7-apple-tvos -fcf-runtime-abi=swift-4.1 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-DARWIN-4_1-32
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fcf-runtime-abi=swift-4.1 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-4_1-64
+// RUN: %clang_cc1 -triple armv7-unknown-linux-android -fcf-runtime-abi=swift-4.1 -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-SWIFT-4_1-32
 
 const __NSConstantString *s = __builtin___CFStringMakeConstantString("");
 
diff --git a/clang/test/CodeGen/cfstring-elf-cfbuild-x86_64.c b/clang/test/CodeGen/cfstring-elf-cfbuild-x86_64.c
index 63a1803b8318..8ead538a52e9 100644
--- a/clang/test/CodeGen/cfstring-elf-cfbuild-x86_64.c
+++ b/clang/test/CodeGen/cfstring-elf-cfbuild-x86_64.c
@@ -1,14 +1,14 @@
 // REQUIRES: x86-registered-target
 
-// RUN: %clang_cc1 -triple x86_64-elf -DCF_BUILDING_CF -DDECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DECL
-// RUN: %clang_cc1 -triple x86_64-elf -DCF_BUILDING_CF -DDEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DEFN
-// RUN: %clang_cc1 -triple x86_64-elf -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF
-// RUN: %clang_cc1 -triple x86_64-elf -DEXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN
+// RUN: %clang_cc1 -triple x86_64-elf -DCF_BUILDING_CF -DDECL -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DECL
+// RUN: %clang_cc1 -triple x86_64-elf -DCF_BUILDING_CF -DDEFN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DEFN
+// RUN: %clang_cc1 -triple x86_64-elf -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF
+// RUN: %clang_cc1 -triple x86_64-elf -DEXTERN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN
 
-// RUN: %clang_cc1 -Os -triple x86_64-elf -DCF_BUILDING_CF -DDECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DECL
-// RUN: %clang_cc1 -Os -triple x86_64-elf -DCF_BUILDING_CF -DDEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DEFN
-// RUN: %clang_cc1 -Os -triple x86_64-elf -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF
-// RUN: %clang_cc1 -Os -triple x86_64-elf -DEXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN
+// RUN: %clang_cc1 -Os -triple x86_64-elf -DCF_BUILDING_CF -DDECL -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DECL
+// RUN: %clang_cc1 -Os -triple x86_64-elf -DCF_BUILDING_CF -DDEFN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DEFN
+// RUN: %clang_cc1 -Os -triple x86_64-elf -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF
+// RUN: %clang_cc1 -Os -triple x86_64-elf -DEXTERN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN
 
 
 #if defined(CF_BUILDING_CF)
diff --git a/clang/test/CodeGen/cfstring-windows.c b/clang/test/CodeGen/cfstring-windows.c
index 7c17f31d3dd4..2f132838bb1b 100644
--- a/clang/test/CodeGen/cfstring-windows.c
+++ b/clang/test/CodeGen/cfstring-windows.c
@@ -1,16 +1,16 @@
-// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DECL
-// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DEFN
-// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF
-// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DEXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN
-// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DEXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN-DLLIMPORT
-// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DDLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDECL -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DECL
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDEFN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DEFN
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DEXTERN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DEXTERN_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DDLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-DLLIMPORT
 
-// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DECL
-// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DEFN
-// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF
-// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DEXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN
-// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DEXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN-DLLIMPORT
-// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DDLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-DLLIMPORT
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDECL -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DECL
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDEFN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DEFN
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DEXTERN -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DEXTERN_DLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DDLLIMPORT -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-DLLIMPORT
 
 #if defined(CF_BUILDING_CF)
 #if defined(DECL)
diff --git a/clang/test/CodeGen/cfstring3.c b/clang/test/CodeGen/cfstring3.c
index 1172d2b79a62..d7e07ffaf8a8 100644
--- a/clang/test/CodeGen/cfstring3.c
+++ b/clang/test/CodeGen/cfstring3.c
@@ -1,16 +1,16 @@
 // REQUIRES: arm-registered-target,x86-registered-target
 
-// RUN: %clang_cc1 -triple thumbv7-windows -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-COFF
-// RUN: %clang_cc1 -triple i686-windows -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-COFF
-// RUN: %clang_cc1 -triple x86_64-windows -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-COFF
+// RUN: %clang_cc1 -triple thumbv7-windows -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-COFF
+// RUN: %clang_cc1 -triple i686-windows -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-COFF
+// RUN: %clang_cc1 -triple x86_64-windows -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-COFF
 
-// RUN: %clang_cc1 -triple armv7-elf -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ELF -check-prefix CHECK-ELF32
-// RUN: %clang_cc1 -triple i686-elf -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ELF -check-prefix CHECK-ELF32
-// RUN: %clang_cc1 -triple x86_64-elf -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ELF -check-prefix CHECK-ELF64
+// RUN: %clang_cc1 -triple armv7-elf -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ELF -check-prefix CHECK-ELF32
+// RUN: %clang_cc1 -triple i686-elf -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ELF -check-prefix CHECK-ELF32
+// RUN: %clang_cc1 -triple x86_64-elf -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ELF -check-prefix CHECK-ELF64
 
-// RUN: %clang_cc1 -triple armv7-macho -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACHO -check-prefix CHECK-MACHO32
-// RUN: %clang_cc1 -triple i386-apple-macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACHO -check-prefix CHECK-MACHO32
-// RUN: %clang_cc1 -triple x86_64-macho -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACHO -check-prefix CHECK-MACHO64
+// RUN: %clang_cc1 -triple armv7-macho -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACHO -check-prefix CHECK-MACHO32
+// RUN: %clang_cc1 -triple i386-apple-macosx -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACHO -check-prefix CHECK-MACHO32
+// RUN: %clang_cc1 -triple x86_64-macho -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACHO -check-prefix CHECK-MACHO64
 
 // RUN: %clang_cc1 -triple thumbv7-windows -S %s -o - | FileCheck %s -check-prefix CHECK-ASM-COFF
 // RUN: %clang_cc1 -triple thumbv7-elf -S %s -o - | FileCheck %s -check-prefix CHECK-ASM-ELF
diff --git a/clang/test/CodeGen/cmse-clear-arg.c b/clang/test/CodeGen/cmse-clear-arg.c
index b393a0a26abb..de77d637ef43 100644
--- a/clang/test/CodeGen/cmse-clear-arg.c
+++ b/clang/test/CodeGen/cmse-clear-arg.c
@@ -1,13 +1,13 @@
-// RUN: %clang_cc1 -triple thumbv8m.main   -O0 -mcmse -S -emit-llvm %s -o - | \
+// RUN: %clang_cc1 -triple thumbv8m.main   -O0 -mcmse -emit-llvm %s -o - | \
 // RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-LE,CHECK-SOFTFP
-// RUN: %clang_cc1 -triple thumbebv8m.main -O0 -mcmse -S -emit-llvm %s -o - | \
+// RUN: %clang_cc1 -triple thumbebv8m.main -O0 -mcmse -emit-llvm %s -o - | \
 // RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-BE,CHECK-SOFTFP
-// RUN: %clang_cc1 -triple thumbv8m.main   -O2 -mcmse -S -emit-llvm %s -o - | \
+// RUN: %clang_cc1 -triple thumbv8m.main   -O2 -mcmse -emit-llvm %s -o - | \
 // RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-LE,CHECK-SOFTFP
-// RUN: %clang_cc1 -triple thumbebv8m.main -O2 -mcmse -S -emit-llvm %s -o - | \
+// RUN: %clang_cc1 -triple thumbebv8m.main -O2 -mcmse -emit-llvm %s -o - | \
 // RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-BE,CHECK-SOFTFP
 // RUN: %clang_cc1 -triple thumbv8m.main   -O0 -mcmse -mfloat-abi hard  \
-// RUN:            -S -emit-llvm %s -o - | \
+// RUN:            -emit-llvm %s -o - | \
 // RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-LE,CHECK-HARDFP
 
 // We don't really need to repeat *all* the test cases from cmse-clear-return.c
diff --git a/clang/test/CodeGen/cmse-clear-return.c b/clang/test/CodeGen/cmse-clear-return.c
index c43377522119..f406a3553287 100644
--- a/clang/test/CodeGen/cmse-clear-return.c
+++ b/clang/test/CodeGen/cmse-clear-return.c
@@ -1,12 +1,12 @@
-// RUN: %clang_cc1 -triple thumbv8m.main   -O0 -mcmse -S -emit-llvm %s -o - | \
+// RUN: %clang_cc1 -triple thumbv8m.main   -O0 -mcmse -emit-llvm %s -o - | \
 // RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-LE,CHECK-LE-NOPT,CHECK-SOFT
-// RUN: %clang_cc1 -triple thumbebv8m.main -O0 -mcmse -S -emit-llvm %s -o - | \
+// RUN: %clang_cc1 -triple thumbebv8m.main -O0 -mcmse -emit-llvm %s -o - | \
 // RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-BE,CHECK-SOFT
-// RUN: %clang_cc1 -triple thumbv8m.main   -O2 -mcmse -S -emit-llvm %s -o - | \
+// RUN: %clang_cc1 -triple thumbv8m.main   -O2 -mcmse -emit-llvm %s -o - | \
 // RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-LE,CHECK-LE-OPT,CHECK-SOFT
-// RUN: %clang_cc1 -triple thumbebv8m.main -O2 -mcmse -S -emit-llvm %s -o - | \
+// RUN: %clang_cc1 -triple thumbebv8m.main -O2 -mcmse -emit-llvm %s -o - | \
 // RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-BE,CHECK-BE-OPT,CHECK-SOFT
-// RUN: %clang_cc1 -triple thumbv8m.main   -O0 -mcmse -S -emit-llvm %s -o - \
+// RUN: %clang_cc1 -triple thumbv8m.main   -O0 -mcmse -emit-llvm %s -o - \
 // RUN:            -mfloat-abi hard | \
 // RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-LE,CHECK-LE-NOPT,CHECK-HARD
 
diff --git a/clang/test/CodeGen/code_align.c b/clang/test/CodeGen/code_align.c
index f6d86ec969ae..e9c3baebcc09 100644
--- a/clang/test/CodeGen/code_align.c
+++ b/clang/test/CodeGen/code_align.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x c %s %s -o - | FileCheck -check-prefix=CHECK-C %s
-// RUN: %clang_cc1 -fsyntax-only -emit-llvm -x c++ -std=c++11 %s -o - | FileCheck %s --check-prefixes CHECK-C,CHECK-CPP
+// RUN: %clang_cc1 -emit-llvm -x c++ -std=c++11 %s -o - | FileCheck %s --check-prefixes CHECK-C,CHECK-CPP
 
 // CHECK-C: br label %for.cond, !llvm.loop ![[MD_FP:[0-9]+]]
 // CHECK-C: br label %while.cond, !llvm.loop ![[MD_FP_1:[0-9]+]]
diff --git a/clang/test/CodeGen/coff-aarch64-type-sizes.c b/clang/test/CodeGen/coff-aarch64-type-sizes.c
index f8286618fc8f..9cb0ddbaef3f 100644
--- a/clang/test/CodeGen/coff-aarch64-type-sizes.c
+++ b/clang/test/CodeGen/coff-aarch64-type-sizes.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -triple aarch64-windows -emit-llvm -w -o - %s | FileCheck %s
 
-// CHECK: target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+// CHECK: target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32"
 // CHECK: target triple = "aarch64-unknown-windows-msvc"
 
 int check_short(void) {
diff --git a/clang/test/CodeGen/complex-builtins-2.c b/clang/test/CodeGen/complex-builtins-2.c
index d112637a6cb5..2a3ea7c2dda2 100644
--- a/clang/test/CodeGen/complex-builtins-2.c
+++ b/clang/test/CodeGen/complex-builtins-2.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm              %s | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm -fmath-errno %s | FileCheck %s
 
 float _Complex test__builtin_conjf(float _Complex x) {
 // CHECK-LABEL: @test__builtin_conjf(
diff --git a/clang/test/CodeGen/complex-builtins.c b/clang/test/CodeGen/complex-builtins.c
index 29d6e7ba909f..d2dca400c53b 100644
--- a/clang/test/CodeGen/complex-builtins.c
+++ b/clang/test/CodeGen/complex-builtins.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm              %s | FileCheck %s -check-prefix=NO__ERRNO
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm %s | FileCheck %s -check-prefix=NO__ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO
 
 // Test attributes and codegen of complex builtins.
 
diff --git a/clang/test/CodeGen/complex-libcalls-2.c b/clang/test/CodeGen/complex-libcalls-2.c
index 8867eb8ff6e7..cdac28ee5079 100644
--- a/clang/test/CodeGen/complex-libcalls-2.c
+++ b/clang/test/CodeGen/complex-libcalls-2.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm              %s | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm -fmath-errno %s | FileCheck %s
 
 _Complex float conjf(_Complex float);
 _Complex double conj(_Complex double);
diff --git a/clang/test/CodeGen/complex-libcalls.c b/clang/test/CodeGen/complex-libcalls.c
index 7d4e3d04cf64..be2ccc3efbfc 100644
--- a/clang/test/CodeGen/complex-libcalls.c
+++ b/clang/test/CodeGen/complex-libcalls.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -S -o - -emit-llvm              %s | FileCheck %s -check-prefix=NO__ERRNO
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -o - -emit-llvm %s | FileCheck %s -check-prefix=NO__ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -o - -emit-llvm -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO
 
 // Test attributes and builtin codegen of complex library calls.
 
diff --git a/clang/test/CodeGen/complex-math-mixed.c b/clang/test/CodeGen/complex-math-mixed.c
index 050163cca80a..761b62ec9fa7 100644
--- a/clang/test/CodeGen/complex-math-mixed.c
+++ b/clang/test/CodeGen/complex-math-mixed.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown -o - | FileCheck %s --check-prefix=X86
-// RUN: %clang_cc1 %s -O0 -triple x86_64-unknown-unknown -fsyntax-only -ast-dump | FileCheck %s --check-prefix=AST
+// RUN: %clang_cc1 %s -O0 -triple x86_64-unknown-unknown -ast-dump | FileCheck %s --check-prefix=AST
 
 // Check that for 'F _Complex + int' (F = real floating-point type), we emit an
 // implicit cast from 'int' to 'F', but NOT to 'F _Complex' (i.e. that we do
diff --git a/clang/test/CodeGen/constrained-math-builtins.c b/clang/test/CodeGen/constrained-math-builtins.c
index 981cc3ac36bd..2de832dd2b6c 100644
--- a/clang/test/CodeGen/constrained-math-builtins.c
+++ b/clang/test/CodeGen/constrained-math-builtins.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-linux -ffp-exception-behavior=maytrap -w -S -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux -ffp-exception-behavior=maytrap -w -o - -emit-llvm %s | FileCheck %s
 
 // Test codegen of constrained math builtins.
 //
diff --git a/clang/test/CodeGen/dbg-const-int128.c b/clang/test/CodeGen/dbg-const-int128.c
index c780c8f4ffe6..7f8aaef5a08d 100644
--- a/clang/test/CodeGen/dbg-const-int128.c
+++ b/clang/test/CodeGen/dbg-const-int128.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -S -emit-llvm -debug-info-kind=limited  %s -o - | FileCheck %s
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -S -emit-llvm -debug-info-kind=limited  %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -emit-llvm -debug-info-kind=limited  %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -emit-llvm -debug-info-kind=limited  %s -o - | FileCheck %s
 // CHECK: !DIGlobalVariable({{.*}}
 // CHECK-NOT: expr:
 
diff --git a/clang/test/CodeGen/debug-info-codeview-heapallocsite.c b/clang/test/CodeGen/debug-info-codeview-heapallocsite.c
index 6cc34f688e4d..95d4cc9749c1 100644
--- a/clang/test/CodeGen/debug-info-codeview-heapallocsite.c
+++ b/clang/test/CodeGen/debug-info-codeview-heapallocsite.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-windows-msvc -debug-info-kind=limited -gcodeview -fdeclspec -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -debug-info-kind=limited -gcodeview -fdeclspec -emit-llvm %s -o - | FileCheck %s
 
 struct Foo;
 struct Bar;
diff --git a/clang/test/CodeGen/debug-info-codeview-unnamed.c b/clang/test/CodeGen/debug-info-codeview-unnamed.c
index bd2a7543e56b..0df6e1a0419b 100644
--- a/clang/test/CodeGen/debug-info-codeview-unnamed.c
+++ b/clang/test/CodeGen/debug-info-codeview-unnamed.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -debug-info-kind=limited -S -emit-llvm -o - %s | FileCheck --check-prefix LINUX %s
-// RUN: %clang_cc1 -triple x86_64-windows-msvc -debug-info-kind=limited -gcodeview -S -emit-llvm -o - %s | FileCheck --check-prefix MSVC %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -debug-info-kind=limited -emit-llvm -o - %s | FileCheck --check-prefix LINUX %s
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -debug-info-kind=limited -gcodeview -emit-llvm -o - %s | FileCheck --check-prefix MSVC %s
 
 int main(int argc, char* argv[], char* arge[]) {
 
diff --git a/clang/test/CodeGen/debug-info-gline-tables-only.c b/clang/test/CodeGen/debug-info-gline-tables-only.c
index 6f832be82875..6321edd3600b 100644
--- a/clang/test/CodeGen/debug-info-gline-tables-only.c
+++ b/clang/test/CodeGen/debug-info-gline-tables-only.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -debug-info-kind=line-tables-only -S -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 %s -debug-info-kind=line-directives-only -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -debug-info-kind=line-tables-only -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -debug-info-kind=line-directives-only -emit-llvm -o - | FileCheck %s
 // Checks that clang with "-gline-tables-only" or "-gline-directives-only" doesn't emit debug info
 // for variables and types.
 
diff --git a/clang/test/CodeGen/debug-info-gline-tables-only2.c b/clang/test/CodeGen/debug-info-gline-tables-only2.c
index d5d62351a9b6..204fd6ddf15b 100644
--- a/clang/test/CodeGen/debug-info-gline-tables-only2.c
+++ b/clang/test/CodeGen/debug-info-gline-tables-only2.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -debug-info-kind=line-tables-only -S -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 %s -debug-info-kind=line-directives-only -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -debug-info-kind=line-tables-only -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -debug-info-kind=line-directives-only -emit-llvm -o - | FileCheck %s
 // Checks that clang with "-gline-tables-only" or "-gline-directives-only" emits metadata for
 // compile unit, subprogram and file.
 
diff --git a/clang/test/CodeGen/debug-info-line.c b/clang/test/CodeGen/debug-info-line.c
index 39595d148c12..13cab2e453a2 100644
--- a/clang/test/CodeGen/debug-info-line.c
+++ b/clang/test/CodeGen/debug-info-line.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -w -debug-info-kind=line-tables-only -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -w -debug-info-kind=line-directives-only -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -w -debug-info-kind=line-tables-only -fexceptions -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -w -debug-info-kind=line-directives-only -fexceptions -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s
 
 int f1(int a, int b) {
   // CHECK: icmp {{.*}}, !dbg [[DBG_F1:!.*]]
diff --git a/clang/test/CodeGen/debug-info-line3.c b/clang/test/CodeGen/debug-info-line3.c
index 042571e790b2..b2da4ef613f5 100644
--- a/clang/test/CodeGen/debug-info-line3.c
+++ b/clang/test/CodeGen/debug-info-line3.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -debug-info-kind=limited -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -debug-info-kind=limited -emit-llvm %s -o - | FileCheck %s
 
 void func(char c, char* d)
 {
diff --git a/clang/test/CodeGen/debug-info-macro.c b/clang/test/CodeGen/debug-info-macro.c
index 9d0464102c10..23fd67515e84 100644
--- a/clang/test/CodeGen/debug-info-macro.c
+++ b/clang/test/CodeGen/debug-info-macro.c
@@ -4,7 +4,7 @@
 // RUN: %clang_cc1 -emit-llvm -debug-info-kind=standalone       -debug-info-macro %s -o - "-DC1(x)=( x  + 5 )" -DA -include %S/Inputs/debug-info-macro.h -UC1 | FileCheck -check-prefixes=CHECK,NO_PCH %s 
 // RUN: %clang_cc1 -emit-llvm                                   -debug-info-macro %s -o - "-DC1(x)=( x  + 5 )" -DA -include %S/Inputs/debug-info-macro.h -UC1 | FileCheck -check-prefixes=NO_MACRO %s 
 
-// RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited -debug-info-macro %S/Inputs/debug-info-macro.h -emit-pch -o %t.pch -DC3
+// RUN: %clang_cc1 -debug-info-kind=limited -debug-info-macro %S/Inputs/debug-info-macro.h -emit-pch -o %t.pch -DC3
 // RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited -debug-info-macro %s -o - -include-pch %t.pch "-DC1(x)=( x  + 5 )" -DA -include %S/Inputs/debug-info-macro.h -UC1 | FileCheck -check-prefixes=CHECK,PCH %s 
 
 // This test checks that macro Debug info is correctly generated.
diff --git a/clang/test/CodeGen/debug-info-pseudo-probe.cpp b/clang/test/CodeGen/debug-info-pseudo-probe.cpp
index b1b9c8e342a0..75c10840d95d 100644
--- a/clang/test/CodeGen/debug-info-pseudo-probe.cpp
+++ b/clang/test/CodeGen/debug-info-pseudo-probe.cpp
@@ -1,6 +1,6 @@
 // This test checks if a symbol gets mangled dwarf names with -fpseudo-probe-for-profiling option.
-// RUN: %clang_cc1 -triple x86_64 -x c++ -S -emit-llvm -debug-info-kind=line-tables-only -o - < %s | FileCheck %s --check-prefix=PLAIN
-// RUN: %clang_cc1 -triple x86_64 -x c++  -S -emit-llvm -debug-info-kind=line-tables-only -fpseudo-probe-for-profiling -o - < %s | FileCheck %s --check-prefix=MANGLE
+// RUN: %clang_cc1 -triple x86_64 -x c++ -emit-llvm -debug-info-kind=line-tables-only -o - < %s | FileCheck %s --check-prefix=PLAIN
+// RUN: %clang_cc1 -triple x86_64 -x c++  -emit-llvm -debug-info-kind=line-tables-only -fpseudo-probe-for-profiling -o - < %s | FileCheck %s --check-prefix=MANGLE
 
 int foo() {
   return 0;
diff --git a/clang/test/CodeGen/debug-info-variables.c b/clang/test/CodeGen/debug-info-variables.c
index 8ec60ff7c1d9..9fbace7d7b35 100644
--- a/clang/test/CodeGen/debug-info-variables.c
+++ b/clang/test/CodeGen/debug-info-variables.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -debug-info-kind=standalone -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -debug-info-kind=standalone -emit-llvm -o - | FileCheck %s
 
 // CHECK: DIGlobalVariable(name: "global",{{.*}} line: [[@LINE+1]]
 int global = 42;
diff --git a/clang/test/CodeGen/debug-label.c b/clang/test/CodeGen/debug-label.c
index 20efa49b0a4c..179132ecd6f0 100644
--- a/clang/test/CodeGen/debug-label.c
+++ b/clang/test/CodeGen/debug-label.c
@@ -1,7 +1,7 @@
 // This test will test the correstness of generating DILabel and
 // llvm.dbg.label for labels.
 //
-// RUN: %clang_cc1 -emit-llvm %s -o - -emit-llvm -debug-info-kind=limited | FileCheck %s
+// RUN: %clang_cc1 %s -o - -emit-llvm -debug-info-kind=limited | FileCheck %s
 
 int f1(int a, int b) {
   int sum;
diff --git a/clang/test/CodeGen/debug-nvptx.c b/clang/test/CodeGen/debug-nvptx.c
index 8780c5db6801..3b6ef7046e4c 100644
--- a/clang/test/CodeGen/debug-nvptx.c
+++ b/clang/test/CodeGen/debug-nvptx.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -S -o - -debug-info-kind=limited %s -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -o - -debug-info-kind=limited %s -emit-llvm | FileCheck %s
 
 // CHECK: DICompileUnit({{.*}}, nameTableKind: None
 
diff --git a/clang/test/CodeGen/debug-prefix-map.cpp b/clang/test/CodeGen/debug-prefix-map.cpp
index 5e90aedd8ed7..174bef5a0769 100644
--- a/clang/test/CodeGen/debug-prefix-map.cpp
+++ b/clang/test/CodeGen/debug-prefix-map.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -debug-info-kind=standalone -fdebug-prefix-map=%p=./UNLIKELY_PATH/empty -S %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -debug-info-kind=standalone -fdebug-prefix-map=%p=./UNLIKELY_PATH/empty %s -emit-llvm -o - | FileCheck %s
 
 struct alignas(64) an {
   struct {
diff --git a/clang/test/CodeGen/denormalfpmode-f32.c b/clang/test/CodeGen/denormalfpmode-f32.c
index 2e2306f5a347..312d1c927772 100644
--- a/clang/test/CodeGen/denormalfpmode-f32.c
+++ b/clang/test/CodeGen/denormalfpmode-f32.c
@@ -1,30 +1,30 @@
-// RUN: %clang_cc1 -S %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-NONE
-// RUN: %clang_cc1 -S -fdenormal-fp-math=ieee %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-NONE
-// RUN: %clang_cc1 -S -fdenormal-fp-math=preserve-sign %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PS,CHECK-F32-NONE
-// RUN: %clang_cc1 -S -fdenormal-fp-math=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PZ,CHECK-F32-NONE
-// RUN: %clang_cc1 -S -fdenormal-fp-math=dynamic %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-DYNAMIC,CHECK-F32-NONE
-
-// RUN: %clang_cc1 -S -fdenormal-fp-math-f32=ieee %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-NONE
-// RUN: %clang_cc1 -S -fdenormal-fp-math=ieee -fdenormal-fp-math-f32=ieee %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-NONE
-// RUN: %clang_cc1 -S -fdenormal-fp-math=preserve-sign -fdenormal-fp-math-f32=ieee %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PS,CHECK-F32-IEEE
-// RUN: %clang_cc1 -S -fdenormal-fp-math=positive-zero -fdenormal-fp-math-f32=ieee %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PZ,CHECK-F32-IEEE
-// RUN: %clang_cc1 -S -fdenormal-fp-math=positive-zero -fdenormal-fp-math-f32=dynamic %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PZ,CHECK-F32-DYNAMIC
-
-
-// RUN: %clang_cc1 -S -fdenormal-fp-math-f32=preserve-sign %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-PS
-// RUN: %clang_cc1 -S -fdenormal-fp-math=ieee -fdenormal-fp-math-f32=preserve-sign %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-PS
-// RUN: %clang_cc1 -S -fdenormal-fp-math=preserve-sign -fdenormal-fp-math-f32=preserve-sign %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PS,CHECK-F32-NONE
-// RUN: %clang_cc1 -S -fdenormal-fp-math=positive-zero -fdenormal-fp-math-f32=preserve-sign %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PZ,CHECK-F32-PS
-// RUN: %clang_cc1 -S -fdenormal-fp-math=ieee -fdenormal-fp-math-f32=dynamic %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-DYNAMIC
-
-
-// RUN: %clang_cc1 -S -fdenormal-fp-math-f32=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-PZ
-// RUN: %clang_cc1 -S -fdenormal-fp-math-f32=dynamic %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-DYNAMIC
-// RUN: %clang_cc1 -S -fdenormal-fp-math=ieee -fdenormal-fp-math-f32=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-PZ
-// RUN: %clang_cc1 -S -fdenormal-fp-math=dynamic -fdenormal-fp-math-f32=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-DYNAMIC,CHECK-F32-PZ
-// RUN: %clang_cc1 -S -fdenormal-fp-math=preserve-sign -fdenormal-fp-math-f32=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PS,CHECK-F32-PZ
-// RUN: %clang_cc1 -S -fdenormal-fp-math=positive-zero -fdenormal-fp-math-f32=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PZ,CHECK-F32-NONE
-// RUN: %clang_cc1 -S -fdenormal-fp-math=dynamic -fdenormal-fp-math-f32=dynamic %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-DYNAMIC,CHECK-F32-NONE
+// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-NONE
+// RUN: %clang_cc1 -fdenormal-fp-math=ieee %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-NONE
+// RUN: %clang_cc1 -fdenormal-fp-math=preserve-sign %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PS,CHECK-F32-NONE
+// RUN: %clang_cc1 -fdenormal-fp-math=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PZ,CHECK-F32-NONE
+// RUN: %clang_cc1 -fdenormal-fp-math=dynamic %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-DYNAMIC,CHECK-F32-NONE
+
+// RUN: %clang_cc1 -fdenormal-fp-math-f32=ieee %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-NONE
+// RUN: %clang_cc1 -fdenormal-fp-math=ieee -fdenormal-fp-math-f32=ieee %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-NONE
+// RUN: %clang_cc1 -fdenormal-fp-math=preserve-sign -fdenormal-fp-math-f32=ieee %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PS,CHECK-F32-IEEE
+// RUN: %clang_cc1 -fdenormal-fp-math=positive-zero -fdenormal-fp-math-f32=ieee %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PZ,CHECK-F32-IEEE
+// RUN: %clang_cc1 -fdenormal-fp-math=positive-zero -fdenormal-fp-math-f32=dynamic %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PZ,CHECK-F32-DYNAMIC
+
+
+// RUN: %clang_cc1 -fdenormal-fp-math-f32=preserve-sign %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-PS
+// RUN: %clang_cc1 -fdenormal-fp-math=ieee -fdenormal-fp-math-f32=preserve-sign %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-PS
+// RUN: %clang_cc1 -fdenormal-fp-math=preserve-sign -fdenormal-fp-math-f32=preserve-sign %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PS,CHECK-F32-NONE
+// RUN: %clang_cc1 -fdenormal-fp-math=positive-zero -fdenormal-fp-math-f32=preserve-sign %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PZ,CHECK-F32-PS
+// RUN: %clang_cc1 -fdenormal-fp-math=ieee -fdenormal-fp-math-f32=dynamic %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-DYNAMIC
+
+
+// RUN: %clang_cc1 -fdenormal-fp-math-f32=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-PZ
+// RUN: %clang_cc1 -fdenormal-fp-math-f32=dynamic %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-DYNAMIC
+// RUN: %clang_cc1 -fdenormal-fp-math=ieee -fdenormal-fp-math-f32=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-NONE,CHECK-F32-PZ
+// RUN: %clang_cc1 -fdenormal-fp-math=dynamic -fdenormal-fp-math-f32=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-DYNAMIC,CHECK-F32-PZ
+// RUN: %clang_cc1 -fdenormal-fp-math=preserve-sign -fdenormal-fp-math-f32=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PS,CHECK-F32-PZ
+// RUN: %clang_cc1 -fdenormal-fp-math=positive-zero -fdenormal-fp-math-f32=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-PZ,CHECK-F32-NONE
+// RUN: %clang_cc1 -fdenormal-fp-math=dynamic -fdenormal-fp-math-f32=dynamic %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK-ATTR,CHECK-DYNAMIC,CHECK-F32-NONE
 
 
 // CHECK-LABEL: main
diff --git a/clang/test/CodeGen/denormalfpmode.c b/clang/test/CodeGen/denormalfpmode.c
index 36f25038ce2b..cffff90d6fbe 100644
--- a/clang/test/CodeGen/denormalfpmode.c
+++ b/clang/test/CodeGen/denormalfpmode.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -S -fdenormal-fp-math=ieee %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-IEEE
-// RUN: %clang_cc1 -S -fdenormal-fp-math=preserve-sign %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-PS
-// RUN: %clang_cc1 -S -fdenormal-fp-math=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-PZ
-// RUN: %clang_cc1 -S -fdenormal-fp-math=dynamic %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-DYNAMIC
+// RUN: %clang_cc1 -fdenormal-fp-math=ieee %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-IEEE
+// RUN: %clang_cc1 -fdenormal-fp-math=preserve-sign %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-PS
+// RUN: %clang_cc1 -fdenormal-fp-math=positive-zero %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-PZ
+// RUN: %clang_cc1 -fdenormal-fp-math=dynamic %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-DYNAMIC
 
 // CHECK-LABEL: main
 
diff --git a/clang/test/CodeGen/dominating-store-infinite-cycle.c b/clang/test/CodeGen/dominating-store-infinite-cycle.c
index cedd9be0b090..2a9dbd384e6c 100644
--- a/clang/test/CodeGen/dominating-store-infinite-cycle.c
+++ b/clang/test/CodeGen/dominating-store-infinite-cycle.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s
 
 // Test for PR62830 where there are 2 infinite cycles using goto. Make sure
 // clang codegen doesn't hang.
diff --git a/clang/test/CodeGen/enum-bool.cpp b/clang/test/CodeGen/enum-bool.cpp
index 2dcb3373bb36..4bf3b91361d2 100644
--- a/clang/test/CodeGen/enum-bool.cpp
+++ b/clang/test/CodeGen/enum-bool.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
 
 namespace dr2338 {
 namespace A {
diff --git a/clang/test/CodeGen/ffp-contract-option.c b/clang/test/CodeGen/ffp-contract-option.c
index cd777ac9b43c..2a6443032a4e 100644
--- a/clang/test/CodeGen/ffp-contract-option.c
+++ b/clang/test/CodeGen/ffp-contract-option.c
@@ -1,4 +1,5 @@
 // REQUIRES: x86-registered-target
+// UNSUPPORTED: target={{.*}}-zos{{.*}}
 // RUN: %clang_cc1 -triple=x86_64 %s -emit-llvm -o - \
 // RUN:| FileCheck --check-prefixes CHECK,CHECK-DEFAULT  %s
 
diff --git a/clang/test/CodeGen/ffp-model.c b/clang/test/CodeGen/ffp-model.c
index 780603284a99..4ed9b9dc0a78 100644
--- a/clang/test/CodeGen/ffp-model.c
+++ b/clang/test/CodeGen/ffp-model.c
@@ -1,4 +1,5 @@
 // REQUIRES: x86-registered-target
+// UNSUPPORTED: target={{.*}}-zos{{.*}}
 // RUN: %clang -S -emit-llvm -fenable-matrix -ffp-model=fast %s -o - \
 // RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-FAST
 
diff --git a/clang/test/CodeGen/fp-matrix-pragma.c b/clang/test/CodeGen/fp-matrix-pragma.c
index 45ad6e657daf..5c9909bf60e0 100644
--- a/clang/test/CodeGen/fp-matrix-pragma.c
+++ b/clang/test/CodeGen/fp-matrix-pragma.c
@@ -1,4 +1,5 @@
 // RUN: %clang -emit-llvm -S -fenable-matrix -mllvm -disable-llvm-optzns %s -o - | FileCheck %s
+// UNSUPPORTED: target={{.*}}-zos{{.*}}
 
 typedef float fx2x2_t __attribute__((matrix_type(2, 2)));
 typedef int ix2x2_t __attribute__((matrix_type(2, 2)));
diff --git a/clang/test/CodeGen/fpconstrained.c b/clang/test/CodeGen/fpconstrained.c
index dd853527c215..97a5d23449a1 100644
--- a/clang/test/CodeGen/fpconstrained.c
+++ b/clang/test/CodeGen/fpconstrained.c
@@ -1,25 +1,25 @@
-// RUN: %clang_cc1 -frounding-math -ffp-exception-behavior=strict -fexperimental-strict-floating-point -emit-llvm -o - %s | FileCheck %s -check-prefix=FPMODELSTRICT
-// RUN: %clang_cc1 -ffp-contract=fast -emit-llvm -o - %s | FileCheck %s -check-prefix=PRECISE
-// RUN: %clang_cc1 -ffast-math -ffp-contract=fast -emit-llvm -o - %s | FileCheck %s -check-prefix=FAST
-// RUN: %clang_cc1 -ffast-math -emit-llvm -o - %s | FileCheck %s -check-prefix=FASTNOCONTRACT
-// RUN: %clang_cc1 -ffast-math -ffp-contract=fast -ffp-exception-behavior=ignore -emit-llvm -o - %s | FileCheck %s -check-prefix=FAST
-// RUN: %clang_cc1 -ffast-math -ffp-contract=fast -ffp-exception-behavior=strict -fexperimental-strict-floating-point -emit-llvm -o - %s | FileCheck %s -check-prefix=EXCEPT
-// RUN: %clang_cc1 -ffast-math -ffp-contract=fast -ffp-exception-behavior=maytrap -fexperimental-strict-floating-point -emit-llvm -o - %s | FileCheck %s -check-prefix=MAYTRAP
-
-float f0, f1, f2;
-
-void foo(void) {
-  // CHECK-LABEL: define {{.*}}void @foo()
-
-  // MAYTRAP: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
-  // EXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  // FPMODELSTRICT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.strict")
-  // STRICTEXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.strict")
-  // STRICTNOEXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-  // PRECISE: fadd contract float %{{.*}}, %{{.*}}
-  // FAST: fadd fast
-  // FASTNOCONTRACT: fadd reassoc nnan ninf nsz arcp afn float
-  f0 = f1 + f2;
-
-  // CHECK: ret
-}
+// RUN: %clang_cc1 -frounding-math -ffp-exception-behavior=strict -fexperimental-strict-floating-point -emit-llvm -o - %s | FileCheck %s -check-prefix=FPMODELSTRICT
+// RUN: %clang_cc1 -ffp-contract=fast -emit-llvm -o - %s | FileCheck %s -check-prefix=PRECISE
+// RUN: %clang_cc1 -ffast-math -ffp-contract=fast -emit-llvm -o - %s | FileCheck %s -check-prefix=FAST
+// RUN: %clang_cc1 -ffast-math -emit-llvm -o - %s | FileCheck %s -check-prefix=FASTNOCONTRACT
+// RUN: %clang_cc1 -ffast-math -ffp-contract=fast -ffp-exception-behavior=ignore -emit-llvm -o - %s | FileCheck %s -check-prefix=FAST
+// RUN: %clang_cc1 -ffast-math -ffp-contract=fast -ffp-exception-behavior=strict -fexperimental-strict-floating-point -emit-llvm -o - %s | FileCheck %s -check-prefix=EXCEPT
+// RUN: %clang_cc1 -ffast-math -ffp-contract=fast -ffp-exception-behavior=maytrap -fexperimental-strict-floating-point -emit-llvm -o - %s | FileCheck %s -check-prefix=MAYTRAP
+
+float f0, f1, f2;
+
+void foo(void) {
+  // CHECK-LABEL: define {{.*}}void @foo()
+
+  // MAYTRAP: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
+  // EXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  // FPMODELSTRICT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  // STRICTEXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  // STRICTNOEXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+  // PRECISE: fadd contract float %{{.*}}, %{{.*}}
+  // FAST: fadd fast
+  // FASTNOCONTRACT: fadd reassoc nnan ninf nsz arcp afn float
+  f0 = f1 + f2;
+
+  // CHECK: ret
+}
diff --git a/clang/test/CodeGen/fpconstrained.cpp b/clang/test/CodeGen/fpconstrained.cpp
index 884c359acab3..222a0989cf6e 100644
--- a/clang/test/CodeGen/fpconstrained.cpp
+++ b/clang/test/CodeGen/fpconstrained.cpp
@@ -1,49 +1,49 @@
-// RUN: %clang_cc1 -x c++ -fexceptions -fcxx-exceptions -frounding-math -ffp-exception-behavior=strict -fexperimental-strict-floating-point -emit-llvm -o - %s | FileCheck %s -check-prefix=FPMODELSTRICT
-// RUN: %clang_cc1 -x c++ -ffp-contract=fast -fexceptions -fcxx-exceptions -emit-llvm -o - %s | FileCheck %s -check-prefix=PRECISE
-// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -ffp-contract=fast -emit-llvm -o - %s | FileCheck %s -check-prefix=FAST
-// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -emit-llvm -o - %s | FileCheck %s -check-prefix=FASTNOCONTRACT
-// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -ffp-contract=fast -ffp-exception-behavior=ignore -emit-llvm -o - %s | FileCheck %s -check-prefix=FAST
-// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -ffp-contract=fast -ffp-exception-behavior=strict -fexperimental-strict-floating-point -emit-llvm -o - %s | FileCheck %s -check-prefix=EXCEPT
-// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -ffp-contract=fast -ffp-exception-behavior=maytrap -fexperimental-strict-floating-point -emit-llvm -o - %s | FileCheck %s -check-prefix=MAYTRAP
-
-float f0, f1, f2;
-
-  template <class>
-  class aaaa {
-   public:
-    ~aaaa();
-    void b();
-  };
-  
-  template <class c>
-  aaaa<c>::~aaaa() { try {
-    b();
-  // CHECK-LABEL: define {{.*}}void @_ZN4aaaaIiED2Ev{{.*}}
-
-  } catch (...) {
-    // MAYTRAP: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
-    // EXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
-    // FPMODELSTRICT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.strict")
-    // STRICTEXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.strict")
-    // STRICTNOEXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore")
-    // PRECISE: fadd contract float %{{.*}}, %{{.*}}
-    // FAST: fadd fast
-    // FASTNOCONTRACT: fadd reassoc nnan ninf nsz arcp afn float
-    f0 = f1 + f2;
-
-    // CHECK: ret void
-  }
-  }
-  
-  class d {
-   public:
-    d(const char *, int);
-    aaaa<int> e;
-  };
-  
-float foo() {
-  d x("", 1);
-  aaaa<int> a;
-  return f0;
-}
-
+// RUN: %clang_cc1 -x c++ -fexceptions -fcxx-exceptions -frounding-math -ffp-exception-behavior=strict -fexperimental-strict-floating-point -emit-llvm -o - %s | FileCheck %s -check-prefix=FPMODELSTRICT
+// RUN: %clang_cc1 -x c++ -ffp-contract=fast -fexceptions -fcxx-exceptions -emit-llvm -o - %s | FileCheck %s -check-prefix=PRECISE
+// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -ffp-contract=fast -emit-llvm -o - %s | FileCheck %s -check-prefix=FAST
+// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -emit-llvm -o - %s | FileCheck %s -check-prefix=FASTNOCONTRACT
+// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -ffp-contract=fast -ffp-exception-behavior=ignore -emit-llvm -o - %s | FileCheck %s -check-prefix=FAST
+// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -ffp-contract=fast -ffp-exception-behavior=strict -fexperimental-strict-floating-point -emit-llvm -o - %s | FileCheck %s -check-prefix=EXCEPT
+// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -ffp-contract=fast -ffp-exception-behavior=maytrap -fexperimental-strict-floating-point -emit-llvm -o - %s | FileCheck %s -check-prefix=MAYTRAP
+
+float f0, f1, f2;
+
+  template <class>
+  class aaaa {
+   public:
+    ~aaaa();
+    void b();
+  };
+
+  template <class c>
+  aaaa<c>::~aaaa() { try {
+    b();
+  // CHECK-LABEL: define {{.*}}void @_ZN4aaaaIiED2Ev{{.*}}
+
+  } catch (...) {
+    // MAYTRAP: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
+    // EXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+    // FPMODELSTRICT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    // STRICTEXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    // STRICTNOEXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+    // PRECISE: fadd contract float %{{.*}}, %{{.*}}
+    // FAST: fadd fast
+    // FASTNOCONTRACT: fadd reassoc nnan ninf nsz arcp afn float
+    f0 = f1 + f2;
+
+    // CHECK: ret void
+  }
+  }
+
+  class d {
+   public:
+    d(const char *, int);
+    aaaa<int> e;
+  };
+
+float foo() {
+  d x("", 1);
+  aaaa<int> a;
+  return f0;
+}
+
diff --git a/clang/test/CodeGen/fread-inline-builtin-late-redecl.c b/clang/test/CodeGen/fread-inline-builtin-late-redecl.c
index bc629fa09f49..ce964a7a6f6f 100644
--- a/clang/test/CodeGen/fread-inline-builtin-late-redecl.c
+++ b/clang/test/CodeGen/fread-inline-builtin-late-redecl.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64 -S -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64 -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
 //
 // Verifies that clang-generated *.inline are removed when shadowed by an
 // external definition, even when that definition appears at the end of the
diff --git a/clang/test/CodeGen/fseparate-named-sections.c b/clang/test/CodeGen/fseparate-named-sections.c
new file mode 100644
index 000000000000..7a247dbd085c
--- /dev/null
+++ b/clang/test/CodeGen/fseparate-named-sections.c
@@ -0,0 +1,28 @@
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 -triple x86_64-pc-linux -S -o - < %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux -S -fseparate-named-sections -o - < %s | FileCheck %s --check-prefix=SEPARATE
+
+__attribute__((section("custom_text"))) void f(void) {}
+__attribute__((section("custom_text"))) void g(void) {}
+
+// CHECK: .section custom_text,"ax",@progbits{{$}}
+// CHECK: f:
+// CHECK: g:
+
+// SEPARATE: .section custom_text,"ax",@progbits,unique,1{{$}}
+// SEPARATE: f:
+// SEPARATE: .section custom_text,"ax",@progbits,unique,2{{$}}
+// SEPARATE: g:
+
+__attribute__((section("custom_data"))) int i = 0;
+__attribute__((section("custom_data"))) int j = 0;
+
+// CHECK: .section custom_data,"aw",@progbits{{$}}
+// CHECK: i:
+// CHECK: j:
+
+// SEPARATE: .section custom_data,"aw",@progbits,unique,3{{$}}
+// SEPARATE: i:
+// SEPARATE: .section custom_data,"aw",@progbits,unique,4{{$}}
+// SEPARATE: j:
diff --git a/clang/test/CodeGen/gh64876.cpp b/clang/test/CodeGen/gh64876.cpp
index de46c4922768..bbaaf4096a96 100644
--- a/clang/test/CodeGen/gh64876.cpp
+++ b/clang/test/CodeGen/gh64876.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64 -S -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64 -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
 
 void f(const char* C, const wchar_t *WC) {
   int x1 = __builtin_strncmp(C, "b", 0xffffffffffffffff);
diff --git a/clang/test/CodeGen/hwasan-new-pm.c b/clang/test/CodeGen/hwasan-new-pm.c
index 47014698f6df..085ae76c1f6a 100644
--- a/clang/test/CodeGen/hwasan-new-pm.c
+++ b/clang/test/CodeGen/hwasan-new-pm.c
@@ -2,11 +2,11 @@
 // We run them under different optimizations to ensure the IR is still
 // being instrumented properly.
 
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -o - -fsanitize=hwaddress %s | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -o - -O1 -fsanitize=hwaddress %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - -fsanitize=hwaddress %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - -O1 -fsanitize=hwaddress %s | FileCheck %s
 
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -o - -fsanitize=kernel-hwaddress %s | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -o - -O1 -fsanitize=kernel-hwaddress %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - -fsanitize=kernel-hwaddress %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - -O1 -fsanitize=kernel-hwaddress %s | FileCheck %s
 
 int foo(int *a) { return *a; }
 
diff --git a/clang/test/CodeGen/inline-asm-mixed-style.c b/clang/test/CodeGen/inline-asm-mixed-style.c
index 97410f028ab5..3c1252622443 100644
--- a/clang/test/CodeGen/inline-asm-mixed-style.c
+++ b/clang/test/CodeGen/inline-asm-mixed-style.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -ffreestanding -triple i386-unknown-unknown -fasm-blocks -O0 -emit-llvm -S %s -o - | FileCheck %s
-// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -fasm-blocks -O0 -emit-llvm -S %s -o - | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding -triple i386-unknown-unknown -fasm-blocks -O0 -S %s -o - | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -fasm-blocks -O0 -S %s -o - | FileCheck %s
 // REQUIRES: x86-registered-target
 
 #include <immintrin.h>
diff --git a/clang/test/CodeGen/inline-builtin-comdat.c b/clang/test/CodeGen/inline-builtin-comdat.c
index 1b00711de43e..059dd2d165c3 100644
--- a/clang/test/CodeGen/inline-builtin-comdat.c
+++ b/clang/test/CodeGen/inline-builtin-comdat.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-windows -S -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s
 // Inline builtin are not supported for odr linkage
 // CHECK-NOT: .inline
 
diff --git a/clang/test/CodeGen/instrument-functions.c b/clang/test/CodeGen/instrument-functions.c
index c075c3972dd7..b73278d61ce7 100644
--- a/clang/test/CodeGen/instrument-functions.c
+++ b/clang/test/CodeGen/instrument-functions.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -S -debug-info-kind=standalone -emit-llvm -o - %s -finstrument-functions -disable-llvm-passes | FileCheck %s
-// RUN: %clang_cc1 -S -debug-info-kind=standalone -emit-llvm -o - %s -finstrument-function-entry-bare -disable-llvm-passes | FileCheck -check-prefix=BARE %s
+// RUN: %clang_cc1 -debug-info-kind=standalone -emit-llvm -o - %s -finstrument-functions -disable-llvm-passes | FileCheck %s
+// RUN: %clang_cc1 -debug-info-kind=standalone -emit-llvm -o - %s -finstrument-function-entry-bare -disable-llvm-passes | FileCheck -check-prefix=BARE %s
 
 int test1(int x) {
 // CHECK: @test1(i32 {{.*}}%x) #[[ATTR1:[0-9]+]]
diff --git a/clang/test/CodeGen/instrument-objc-method.m b/clang/test/CodeGen/instrument-objc-method.m
index 82457beb94a3..7758e001e514 100644
--- a/clang/test/CodeGen/instrument-objc-method.m
+++ b/clang/test/CodeGen/instrument-objc-method.m
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -S -triple x86_64-apple-darwin10 -debug-info-kind=standalone -emit-llvm -o - %s -finstrument-functions | FileCheck %s
-// RUN: %clang_cc1 -S -triple x86_64-apple-darwin10 -debug-info-kind=standalone -emit-llvm -o - %s -finstrument-function-entry-bare | FileCheck -check-prefix=BARE %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -debug-info-kind=standalone -emit-llvm -o - %s -finstrument-functions | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -debug-info-kind=standalone -emit-llvm -o - %s -finstrument-function-entry-bare | FileCheck -check-prefix=BARE %s
 
 @interface ObjCClass
 @end
diff --git a/clang/test/CodeGen/isfpclass.c b/clang/test/CodeGen/isfpclass.c
index 6633db88f71a..fd35182a5dbb 100644
--- a/clang/test/CodeGen/isfpclass.c
+++ b/clang/test/CodeGen/isfpclass.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
-// RUN: %clang_cc1 -triple aarch64-linux-gnu -S -O1 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -O1 -emit-llvm %s -o - | FileCheck %s
 
 // CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_finite
 // CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
diff --git a/clang/test/CodeGen/kmsan-param-retval.c b/clang/test/CodeGen/kmsan-param-retval.c
index dd7e1f5786c1..9cd6d8be6e79 100644
--- a/clang/test/CodeGen/kmsan-param-retval.c
+++ b/clang/test/CodeGen/kmsan-param-retval.c
@@ -1,12 +1,12 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -O2 -fsanitize=kernel-memory -no-enable-noundef-analysis -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -O2 -fsanitize=kernel-memory -no-enable-noundef-analysis -o - %s | \
 // RUN:     FileCheck %s --check-prefix=CLEAN
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -O2 -fsanitize=kernel-memory -fno-sanitize-memory-param-retval -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -O2 -fsanitize=kernel-memory -fno-sanitize-memory-param-retval -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,NOUNDEF_ONLY
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -O2 -fsanitize=kernel-memory -mllvm -msan-eager-checks -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -O2 -fsanitize=kernel-memory -mllvm -msan-eager-checks -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,EAGER
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -O2 -fsanitize=kernel-memory -no-enable-noundef-analysis -fsanitize-memory-param-retval -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -O2 -fsanitize=kernel-memory -no-enable-noundef-analysis -fsanitize-memory-param-retval -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=CLEAN
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -O2 -fsanitize=kernel-memory -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -O2 -fsanitize=kernel-memory -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,EAGER
 
 void foo();
diff --git a/clang/test/CodeGen/libcall-declarations.c b/clang/test/CodeGen/libcall-declarations.c
index ebdb05d7ff10..36c447d20ba7 100644
--- a/clang/test/CodeGen/libcall-declarations.c
+++ b/clang/test/CodeGen/libcall-declarations.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple x86_64-apple-darwin12 -S -o - -emit-llvm %s | FileCheck %s -check-prefix=CHECK-NOERRNO
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -o - -emit-llvm -fmath-errno %s | FileCheck %s -check-prefix=CHECK-ERRNO
-// RUN: %clang_cc1 -triple x86_64-apple-darwin12 -S -o - -emit-llvm -x c++ %s | FileCheck %s -check-prefix=CHECK-NOERRNO
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -o - -emit-llvm -x c++ -fmath-errno %s | FileCheck %s -check-prefix=CHECK-ERRNO
+// RUN: %clang_cc1 -triple x86_64-apple-darwin12 -o - -emit-llvm %s | FileCheck %s -check-prefix=CHECK-NOERRNO
+// RUN: %clang_cc1 -triple x86_64 -o - -emit-llvm -fmath-errno %s | FileCheck %s -check-prefix=CHECK-ERRNO
+// RUN: %clang_cc1 -triple x86_64-apple-darwin12 -o - -emit-llvm -x c++ %s | FileCheck %s -check-prefix=CHECK-NOERRNO
+// RUN: %clang_cc1 -triple x86_64 -o - -emit-llvm -x c++ -fmath-errno %s | FileCheck %s -check-prefix=CHECK-ERRNO
 
 // Prototypes.
 #ifdef __cplusplus
diff --git a/clang/test/CodeGen/libcalls-fno-builtin.c b/clang/test/CodeGen/libcalls-fno-builtin.c
index 967a9f375565..6aef0efa1d9b 100644
--- a/clang/test/CodeGen/libcalls-fno-builtin.c
+++ b/clang/test/CodeGen/libcalls-fno-builtin.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -S -emit-llvm -fno-builtin -o - %s | FileCheck --check-prefixes=GLOBAL,CHECK %s
-// RUN: %clang_cc1 -S -emit-llvm -fno-builtin-ceil -fno-builtin-copysign -fno-builtin-cos \
+// RUN: %clang_cc1 -emit-llvm -fno-builtin -o - %s | FileCheck --check-prefixes=GLOBAL,CHECK %s
+// RUN: %clang_cc1 -emit-llvm -fno-builtin-ceil -fno-builtin-copysign -fno-builtin-cos \
 // RUN:  -fno-builtin-fabs -fno-builtin-floor -fno-builtin-strcat -fno-builtin-strncat \
 // RUN:  -fno-builtin-strchr -fno-builtin-strrchr -fno-builtin-strcmp -fno-builtin-strncmp \
 // RUN:  -fno-builtin-strcpy -fno-builtin-stpcpy -fno-builtin-strncpy -fno-builtin-strlen \
diff --git a/clang/test/CodeGen/lifetime2.c b/clang/test/CodeGen/lifetime2.c
index 29c8061e0f05..88c35fca029d 100644
--- a/clang/test/CodeGen/lifetime2.c
+++ b/clang/test/CodeGen/lifetime2.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -S -emit-llvm -o - -O2 -disable-llvm-passes %s      | FileCheck %s --implicit-check-not="call void @llvm.lifetime" --check-prefixes=CHECK,O2
-// RUN: %clang_cc1 -S -emit-llvm -o - -O2 -disable-lifetime-markers %s | FileCheck %s --implicit-check-not="call void @llvm.lifetime" --check-prefixes=CHECK
-// RUN: %clang_cc1 -S -emit-llvm -o - -O0 %s                           | FileCheck %s --implicit-check-not="call void @llvm.lifetime" --check-prefixes=CHECK 
+// RUN: %clang_cc1 -emit-llvm -o - -O2 -disable-llvm-passes %s      | FileCheck %s --implicit-check-not="call void @llvm.lifetime" --check-prefixes=CHECK,O2
+// RUN: %clang_cc1 -emit-llvm -o - -O2 -disable-lifetime-markers %s | FileCheck %s --implicit-check-not="call void @llvm.lifetime" --check-prefixes=CHECK
+// RUN: %clang_cc1 -emit-llvm -o - -O0 %s                           | FileCheck %s --implicit-check-not="call void @llvm.lifetime" --check-prefixes=CHECK 
 
 extern int bar(char *A, int n);
 
diff --git a/clang/test/CodeGen/lifetime3.cpp b/clang/test/CodeGen/lifetime3.cpp
index 37ed5f193811..64a097cfb859 100644
--- a/clang/test/CodeGen/lifetime3.cpp
+++ b/clang/test/CodeGen/lifetime3.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -S -emit-llvm -o - -O2 -disable-llvm-passes %s      | FileCheck %s --implicit-check-not="call void @llvm.lifetime" --check-prefixes=CHECK,O2
-// RUN: %clang_cc1 -S -emit-llvm -o - -O2 -disable-lifetime-markers %s | FileCheck %s --implicit-check-not="call void @llvm.lifetime" --check-prefixes=CHECK
-// RUN: %clang_cc1 -S -emit-llvm -o - -O0 %s                           | FileCheck %s --implicit-check-not="call void @llvm.lifetime" --check-prefixes=CHECK 
+// RUN: %clang_cc1 -emit-llvm -o - -O2 -disable-llvm-passes %s      | FileCheck %s --implicit-check-not="call void @llvm.lifetime" --check-prefixes=CHECK,O2
+// RUN: %clang_cc1 -emit-llvm -o - -O2 -disable-lifetime-markers %s | FileCheck %s --implicit-check-not="call void @llvm.lifetime" --check-prefixes=CHECK
+// RUN: %clang_cc1 -emit-llvm -o - -O0 %s                           | FileCheck %s --implicit-check-not="call void @llvm.lifetime" --check-prefixes=CHECK 
 
 extern int bar(char *A, int n);
 
diff --git a/clang/test/CodeGen/lineno-dbginfo.c b/clang/test/CodeGen/lineno-dbginfo.c
index 5fe64ec3469b..82f54d0fc0bc 100644
--- a/clang/test/CodeGen/lineno-dbginfo.c
+++ b/clang/test/CodeGen/lineno-dbginfo.c
@@ -1,5 +1,5 @@
 // RUN: echo "#include <stddef.h>" > %t.h
-// RUN: %clang_cc1 -S -debug-info-kind=limited -include %t.h %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -debug-info-kind=limited -include %t.h %s -emit-llvm -o - | FileCheck %s
 
 // CHECK: !DIGlobalVariable(name: "outer",
 // CHECK-NOT:               linkageName:
diff --git a/clang/test/CodeGen/linking-bitcode-postopt.cpp b/clang/test/CodeGen/linking-bitcode-postopt.cpp
new file mode 100644
index 000000000000..a0486ed0c9a8
--- /dev/null
+++ b/clang/test/CodeGen/linking-bitcode-postopt.cpp
@@ -0,0 +1,31 @@
+// REQUIRES: amdgpu-registered-target
+
+// Test that -mlink-bitcode-postopt correctly enables LinkInModulesPass
+
+// RUN: %clang_cc1 -triple amdgcn-- -emit-llvm-bc -o /dev/null \
+// RUN:   -mllvm -print-pipeline-passes \
+// RUN: %s 2>&1 | FileCheck --check-prefixes=DEFAULT %s
+
+// DEFAULT-NOT: LinkInModulesPass
+
+// RUN: %clang_cc1 -triple amdgcn-- -emit-llvm-bc -o /dev/null \
+// RUN:   -mllvm -print-pipeline-passes \
+// RUN:   -mlink-builtin-bitcode-postopt \
+// RUN: %s 2>&1 | FileCheck --check-prefixes=OPTION-POSITIVE %s
+
+// OPTION-POSITIVE: LinkInModulesPass
+
+// RUN: %clang_cc1 -triple amdgcn-- -emit-llvm-bc -o /dev/null \
+// RUN:   -mllvm -print-pipeline-passes \
+// RUN:   -mno-link-builtin-bitcode-postopt \
+// RUN: %s 2>&1 | FileCheck --check-prefixes=OPTION-NEGATIVE %s
+
+// OPTION-NEGATIVE-NOT: LinkInModulesPass
+
+// RUN: %clang_cc1 -triple amdgcn-- -emit-llvm-bc -o /dev/null \
+// RUN:   -mllvm -print-pipeline-passes \
+// RUN:   -mlink-builtin-bitcode-postopt \
+// RUN:   -mno-link-builtin-bitcode-postopt \
+// RUN: %s 2>&1 | FileCheck --check-prefixes=OPTION-POSITIVE-NEGATIVE %s
+
+// OPTION-POSITIVE-NEGATIVE-NOT: LinkInModulesPass
diff --git a/clang/test/CodeGen/loop-unroll.c b/clang/test/CodeGen/loop-unroll.c
index 65eb5878e7f1..18d5b46908aa 100644
--- a/clang/test/CodeGen/loop-unroll.c
+++ b/clang/test/CodeGen/loop-unroll.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64 -target-cpu x86-64 -S -O1 -funroll-loops -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-ENABLE-UNROLL
-// RUN: %clang_cc1 -triple x86_64 -target-cpu x86-64 -S -O1 -fno-unroll-loops -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-DISABLE-UNROLL
+// RUN: %clang_cc1 -triple x86_64 -target-cpu x86-64 -O1 -funroll-loops -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-ENABLE-UNROLL
+// RUN: %clang_cc1 -triple x86_64 -target-cpu x86-64 -O1 -fno-unroll-loops -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-DISABLE-UNROLL
 // REQUIRES: x86-registered-target
 
 // CHECK-ENABLE-UNROLL-LABEL: @for_test()
diff --git a/clang/test/CodeGen/loop-vectorize.c b/clang/test/CodeGen/loop-vectorize.c
index ebebbbdfcbdc..1aa79b51aed3 100644
--- a/clang/test/CodeGen/loop-vectorize.c
+++ b/clang/test/CodeGen/loop-vectorize.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64 -target-cpu x86-64 -S -O1 -vectorize-loops -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-ENABLE-VECT
-// RUN: %clang_cc1 -triple x86_64 -target-cpu x86-64 -S -O1 -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-DISABLE-VECT
+// RUN: %clang_cc1 -triple x86_64 -target-cpu x86-64 -O1 -vectorize-loops -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-ENABLE-VECT
+// RUN: %clang_cc1 -triple x86_64 -target-cpu x86-64 -O1 -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-DISABLE-VECT
 // REQUIRES: x86-registered-target
 
 // CHECK-ENABLE-VECT-LABEL: @for_test()
diff --git a/clang/test/CodeGen/math-builtins-long.c b/clang/test/CodeGen/math-builtins-long.c
index ad0d2122b597..183349e0f017 100644
--- a/clang/test/CodeGen/math-builtins-long.c
+++ b/clang/test/CodeGen/math-builtins-long.c
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm %s \
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -o - -emit-llvm %s \
 // RUN:   -fmath-errno | FileCheck %s -check-prefix=F80
-// RUN: %clang_cc1 -triple ppc64le-unknown-unknown -w -S -o - -emit-llvm %s \
+// RUN: %clang_cc1 -triple ppc64le-unknown-unknown -w -o - -emit-llvm %s \
 // RUN:   -fmath-errno | FileCheck %s -check-prefix=PPC
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -mlong-double-128 -w -S \
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -mlong-double-128 -w \
 // RUN:   -o - -emit-llvm %s -fmath-errno | FileCheck %s -check-prefix=X86F128
-// RUN: %clang_cc1 -triple ppc64le-unknown-unknown -mabi=ieeelongdouble -w -S \
+// RUN: %clang_cc1 -triple ppc64le-unknown-unknown -mabi=ieeelongdouble -w \
 // RUN:   -o - -emit-llvm %s -fmath-errno | FileCheck %s -check-prefix=PPCF128
 
 void bar(long double);
diff --git a/clang/test/CodeGen/math-errno.c b/clang/test/CodeGen/math-errno.c
index b5354e47e26b..15340a11150c 100644
--- a/clang/test/CodeGen/math-errno.c
+++ b/clang/test/CodeGen/math-errno.c
@@ -27,7 +27,7 @@ float f1(float x) {
 // CHECK: tail call float @sqrtf(float noundef {{.*}}) #[[ATTR4_O2:[0-9]+]]
 
 // FAST-LABEL: define {{.*}} nofpclass(nan inf) float @f1
-// FAST: call fast nofpclass(nan inf) float @sqrtf(float noundef nofpclass(nan inf) {{.*}}) #[[ATTR3_FAST:[0-9]+]]
+// FAST: call nofpclass(nan inf) float @sqrtf(float noundef nofpclass(nan inf) {{.*}}) #[[ATTR3_FAST:[0-9]+]]
 
 // NOOPT-LABEL: define {{.*}} float @f1
 // NOOPT: call float @sqrtf(float noundef {{.*}}) #[[ATTR4_NOOPT:[0-9]+]]
@@ -44,7 +44,7 @@ float f2(float x) {
 // FAST: call fast float @llvm.sqrt.f32(float {{.*}})
 
 // NOOPT-LABEL: define {{.*}} float @f2
-// NOOPT: call float @sqrtf(float {{.*}}) #[[ATTR4_NOOPT:[0-9]+]]
+// NOOPT: call fast float @sqrtf(float {{.*}}) #[[ATTR4_NOOPT:[0-9]+]]
 
 __attribute__((optnone))
 float f3(float x) {
@@ -56,7 +56,7 @@ float f3(float x) {
 // CHECK: call float @sqrtf(float noundef {{.*}})
 
 // FAST-LABEL: define {{.*}} nofpclass(nan inf) float @f3
-// FAST: call fast nofpclass(nan inf) float @sqrtf(float noundef nofpclass(nan inf) {{.*}}) #[[ATTR4_FAST:[0-9]+]]
+// FAST: call nofpclass(nan inf) float @sqrtf(float noundef nofpclass(nan inf) {{.*}}) #[[ATTR4_FAST:[0-9]+]]
 
 // NOOPT-LABEL: define {{.*}} float @f3
 // NOOPT:  call float @sqrtf(float noundef %0) #[[ATTR4_NOOPT:[0-9]+]]
diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c
index 02df4fe5fea6..29c312ba0eca 100644
--- a/clang/test/CodeGen/math-libcalls.c
+++ b/clang/test/CodeGen/math-libcalls.c
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -S -o - -emit-llvm              %s | FileCheck %s --check-prefix=NO__ERRNO
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -S -o - -emit-llvm -disable-llvm-passes -O2              %s | FileCheck %s --check-prefix=NO__ERRNO
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -S -o - -emit-llvm -disable-llvm-passes -O2 -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -S -o - -emit-llvm -ffp-exception-behavior=maytrap %s | FileCheck %s --check-prefix=HAS_MAYTRAP
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown-gnu -Wno-implicit-function-declaration -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_GNU
-// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -Wno-implicit-function-declaration -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_WIN
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -o - -emit-llvm %s | FileCheck %s --check-prefix=NO__ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -o - -emit-llvm -disable-llvm-passes -O2 %s | FileCheck %s --check-prefix=NO__ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -o - -emit-llvm -disable-llvm-passes -O2 -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -o - -emit-llvm -ffp-exception-behavior=maytrap %s | FileCheck %s --check-prefix=HAS_MAYTRAP
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown-gnu -Wno-implicit-function-declaration -w -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_GNU
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -Wno-implicit-function-declaration -w -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_WIN
 
 // Test attributes and builtin codegen of math library calls.
 
diff --git a/clang/test/CodeGen/mcount-aix.c b/clang/test/CodeGen/mcount-aix.c
index 6f5d9b3322e9..17ce0af476db 100644
--- a/clang/test/CodeGen/mcount-aix.c
+++ b/clang/test/CodeGen/mcount-aix.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -pg -triple powerpc-ibm-aix7.2.0.0 -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -pg -triple powerpc64-ibm-aix7.2.0.0 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK64
+// RUN: %clang_cc1 -pg -triple powerpc-ibm-aix7.2.0.0 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -pg -triple powerpc64-ibm-aix7.2.0.0 -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK64
 
 void foo() {
 }
diff --git a/clang/test/CodeGen/mdefault-visibility-export-mapping.c b/clang/test/CodeGen/mdefault-visibility-export-mapping.c
index 7f39050907ff..92c9177c7e69 100644
--- a/clang/test/CodeGen/mdefault-visibility-export-mapping.c
+++ b/clang/test/CodeGen/mdefault-visibility-export-mapping.c
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=UNSPECIFIED-DEF,EXPLICIT-DEF %s
-// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=none -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=none -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=UNSPECIFIED-DEF,EXPLICIT-DEF %s
-// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=explicit -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=explicit -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=UNSPECIFIED-DEF,EXPLICIT-EXP %s
-// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=all -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=all -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=UNSPECIFIED-EXP,EXPLICIT-EXP %s
 // RUN: %clang -target powerpc-ibm-aix %s -mdefault-visibility-export-mapping=all -fvisibility=hidden -S -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=UNSPECIFIED-HID,EXPLICIT-EXP %s
diff --git a/clang/test/CodeGen/memcmp-inline-builtin-to-asm.c b/clang/test/CodeGen/memcmp-inline-builtin-to-asm.c
index 169acfad6490..1dae1d8f0143 100644
--- a/clang/test/CodeGen/memcmp-inline-builtin-to-asm.c
+++ b/clang/test/CodeGen/memcmp-inline-builtin-to-asm.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 
-// RUN: %clang_cc1 -triple x86_64 -S -emit-llvm -o - %s | opt -S -passes=verify | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64 -emit-llvm -o - %s | opt -S -passes=verify | FileCheck %s
 //
 // Verifies that clang detects memcmp inline version and uses it instead of the builtin.
 
diff --git a/clang/test/CodeGen/memcpy-inline-builtin-mutliple-decl.c b/clang/test/CodeGen/memcpy-inline-builtin-mutliple-decl.c
index 39a964c51c7d..e108297d42a1 100644
--- a/clang/test/CodeGen/memcpy-inline-builtin-mutliple-decl.c
+++ b/clang/test/CodeGen/memcpy-inline-builtin-mutliple-decl.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple i686-w64-mingw32 -S -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple i686-w64-mingw32 -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
 //
 // Verifies that clang detects memcpy inline version and uses it instead of the builtin.
 // Checks that clang correctly walks through multiple forward declaration.
diff --git a/clang/test/CodeGen/memcpy-inline-builtin-no-extern.c b/clang/test/CodeGen/memcpy-inline-builtin-no-extern.c
index bb46e48f847b..f6edbbf6aeb4 100644
--- a/clang/test/CodeGen/memcpy-inline-builtin-no-extern.c
+++ b/clang/test/CodeGen/memcpy-inline-builtin-no-extern.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64 -S -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64 -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
 //
 // Verifies that clang-generated *.inline are flagged as internal.
 
diff --git a/clang/test/CodeGen/memcpy-inline-builtin.c b/clang/test/CodeGen/memcpy-inline-builtin.c
index 8fce67a81a40..abb6457caa33 100644
--- a/clang/test/CodeGen/memcpy-inline-builtin.c
+++ b/clang/test/CodeGen/memcpy-inline-builtin.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 
-// RUN: %clang_cc1 -triple x86_64 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64 -emit-llvm -o - %s | FileCheck %s
 //
 // Verifies that clang detects memcpy inline version and uses it instead of the builtin.
 // Checks alternate version with the `artificial` attribute.
diff --git a/clang/test/CodeGen/memcpy-no-nobuiltin-if-not-emitted.cpp b/clang/test/CodeGen/memcpy-no-nobuiltin-if-not-emitted.cpp
index 73da4ae41958..dc7b32073d5d 100644
--- a/clang/test/CodeGen/memcpy-no-nobuiltin-if-not-emitted.cpp
+++ b/clang/test/CodeGen/memcpy-no-nobuiltin-if-not-emitted.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple i686-linux-gnu -std=c++11 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple i686-linux-gnu -std=c++11 -emit-llvm -o - %s | FileCheck %s
 //
 // Regression test for the issue reported at
 // https://reviews.llvm.org/D78162#1986104
diff --git a/clang/test/CodeGen/memcpy-nobuiltin.c b/clang/test/CodeGen/memcpy-nobuiltin.c
index c08212f6810f..6129c530d434 100644
--- a/clang/test/CodeGen/memcpy-nobuiltin.c
+++ b/clang/test/CodeGen/memcpy-nobuiltin.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -S -emit-llvm -o- %s -isystem %S -DWITH_DECL | FileCheck --check-prefix=CHECK-WITH-DECL %s
-// RUN: %clang_cc1 -verify -S -emit-llvm -o- %s -isystem %S -UWITH_DECL | FileCheck --check-prefix=CHECK-NO-DECL %s
-// RUN: %clang_cc1 -verify -S -emit-llvm -o- %s -isystem %S -disable-llvm-passes -DWITH_SELF_REFERENCE_DECL | FileCheck --check-prefix=CHECK-SELF-REF-DECL %s
+// RUN: %clang_cc1 -verify -emit-llvm -o- %s -isystem %S -DWITH_DECL | FileCheck --check-prefix=CHECK-WITH-DECL %s
+// RUN: %clang_cc1 -verify -emit-llvm -o- %s -isystem %S -UWITH_DECL | FileCheck --check-prefix=CHECK-NO-DECL %s
+// RUN: %clang_cc1 -verify -emit-llvm -o- %s -isystem %S -disable-llvm-passes -DWITH_SELF_REFERENCE_DECL | FileCheck --check-prefix=CHECK-SELF-REF-DECL %s
 //
 // CHECK-WITH-DECL-NOT: @llvm.memcpy
 // CHECK-NO-DECL: @llvm.memcpy
diff --git a/clang/test/CodeGen/mips-byval-arg.c b/clang/test/CodeGen/mips-byval-arg.c
index a620ef5dd8c8..239124929074 100644
--- a/clang/test/CodeGen/mips-byval-arg.c
+++ b/clang/test/CodeGen/mips-byval-arg.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple mipsel-unknown-linux -O3 -S -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
-// RUN: %clang_cc1 -triple mips64el-unknown-linux -O3 -S -target-abi n64 -o - -emit-llvm %s | FileCheck %s -check-prefix=N64
+// RUN: %clang_cc1 -triple mipsel-unknown-linux -O3 -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
+// RUN: %clang_cc1 -triple mips64el-unknown-linux -O3 -target-abi n64 -o - -emit-llvm %s | FileCheck %s -check-prefix=N64
 
 typedef struct {
   float f[3];
diff --git a/clang/test/CodeGen/mips-transparent-union.c b/clang/test/CodeGen/mips-transparent-union.c
index bb87a17e87ae..555dbbf08480 100644
--- a/clang/test/CodeGen/mips-transparent-union.c
+++ b/clang/test/CodeGen/mips-transparent-union.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple mips64-linux-gnu -S -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang_cc1 -triple mips64-linux-gnu -o - -emit-llvm %s | FileCheck %s
 //
 // Transparent unions are passed according to the calling convention rules of
 // the first member. In this case, it is as if it were a void pointer so we
diff --git a/clang/test/CodeGen/mips-unsigned-ext-var.c b/clang/test/CodeGen/mips-unsigned-ext-var.c
index e3f1ea6b88e5..dee6a92279f6 100644
--- a/clang/test/CodeGen/mips-unsigned-ext-var.c
+++ b/clang/test/CodeGen/mips-unsigned-ext-var.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -triple mips64-unknown-linux -O2 -target-abi n64 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=N64
-// RUN: %clang_cc1 -triple mips64-unknown-linux -O2 -target-abi n32 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=N32
-// RUN: %clang_cc1 -triple mips-unknown-linux -O2 -target-abi o32 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=O32
+// RUN: %clang_cc1 -triple mips64-unknown-linux -O2 -target-abi n64 -emit-llvm %s -o - | FileCheck %s -check-prefix=N64
+// RUN: %clang_cc1 -triple mips64-unknown-linux -O2 -target-abi n32 -emit-llvm %s -o - | FileCheck %s -check-prefix=N32
+// RUN: %clang_cc1 -triple mips-unknown-linux -O2 -target-abi o32 -emit-llvm %s -o - | FileCheck %s -check-prefix=O32
 
 #include <stdarg.h>
 
diff --git a/clang/test/CodeGen/mips-unsigned-extend.c b/clang/test/CodeGen/mips-unsigned-extend.c
index 00fae3d02433..31bf60b22c2d 100644
--- a/clang/test/CodeGen/mips-unsigned-extend.c
+++ b/clang/test/CodeGen/mips-unsigned-extend.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -triple mips64-unknown-linux -O0 -target-abi n64 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=N64
-// RUN: %clang_cc1 -triple mips64-unknown-linux -O0 -target-abi n32 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=N32
-// RUN: %clang_cc1 -triple mips-unknown-linux -O0 -target-abi o32 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=O32
+// RUN: %clang_cc1 -triple mips64-unknown-linux -O0 -target-abi n64 -emit-llvm %s -o - | FileCheck %s -check-prefix=N64
+// RUN: %clang_cc1 -triple mips64-unknown-linux -O0 -target-abi n32 -emit-llvm %s -o - | FileCheck %s -check-prefix=N32
+// RUN: %clang_cc1 -triple mips-unknown-linux -O0 -target-abi o32 -emit-llvm %s -o - | FileCheck %s -check-prefix=O32
 
 void foo(unsigned a) {
 }
diff --git a/clang/test/CodeGen/mips-vector-arg.c b/clang/test/CodeGen/mips-vector-arg.c
index d73e6ce0c9a8..9bf4324d395d 100644
--- a/clang/test/CodeGen/mips-vector-arg.c
+++ b/clang/test/CodeGen/mips-vector-arg.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple mipsel-unknown-linux -O3 -S -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
-// RUN: %clang_cc1 -triple mips64el-unknown-linux -O3 -S -target-abi n64 -o - -emit-llvm %s | FileCheck %s -check-prefix=N64
+// RUN: %clang_cc1 -triple mipsel-unknown-linux -O3 -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
+// RUN: %clang_cc1 -triple mips64el-unknown-linux -O3 -target-abi n64 -o - -emit-llvm %s | FileCheck %s -check-prefix=N64
 
 // check that
 // 1. vector arguments are passed in integer registers
diff --git a/clang/test/CodeGen/mips-vector-return.c b/clang/test/CodeGen/mips-vector-return.c
index d2103d1def74..c2a9bbfe9db0 100644
--- a/clang/test/CodeGen/mips-vector-return.c
+++ b/clang/test/CodeGen/mips-vector-return.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple mipsel-unknown-linux -O3 -S -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
-// RUN: %clang_cc1 -triple mips64el-unknown-linux -O3 -S -target-abi n64 -o - -emit-llvm %s | FileCheck %s -check-prefix=N64
+// RUN: %clang_cc1 -triple mipsel-unknown-linux -O3 -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
+// RUN: %clang_cc1 -triple mips64el-unknown-linux -O3 -target-abi n64 -o - -emit-llvm %s | FileCheck %s -check-prefix=N64
 
 // vectors larger than 16-bytes are returned via the hidden pointer argument. 
 // N64/N32 returns vectors whose size is equal to or smaller than 16-bytes in
diff --git a/clang/test/CodeGen/mips-zero-sized-struct.c b/clang/test/CodeGen/mips-zero-sized-struct.c
index b40ff59f73fb..a4c5fc87cd9f 100644
--- a/clang/test/CodeGen/mips-zero-sized-struct.c
+++ b/clang/test/CodeGen/mips-zero-sized-struct.c
@@ -1,23 +1,23 @@
-// RUN: %clang_cc1 -triple mips-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=O32 %s
-// RUN: %clang_cc1 -triple mipsel-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=O32 %s
-// RUN: %clang_cc1 -triple mipsisa32r6-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=O32 %s
-// RUN: %clang_cc1 -triple mipsisa32r6el-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=O32 %s
-// RUN: %clang_cc1 -triple mips64-unknown-linux-gnu -S -emit-llvm -o - %s  -target-abi n32 | FileCheck -check-prefix=N32 %s
-// RUN: %clang_cc1 -triple mips64el-unknown-linux-gnu -S -emit-llvm -o - %s  -target-abi n32 | FileCheck -check-prefix=N32 %s
-// RUN: %clang_cc1 -triple mipsisa64r6-unknown-linux-gnu -S -emit-llvm -o - %s  -target-abi n32 | FileCheck -check-prefix=N32 %s
-// RUN: %clang_cc1 -triple mipsisa64r6el-unknown-linux-gnu -S -emit-llvm -o - %s  -target-abi n32 | FileCheck -check-prefix=N32 %s
-// RUN: %clang_cc1 -triple mips64-unknown-linux-gnuabin32 -S -emit-llvm -o - %s  | FileCheck -check-prefix=N32 %s
-// RUN: %clang_cc1 -triple mips64el-unknown-linux-gnuabin32 -S -emit-llvm -o - %s  | FileCheck -check-prefix=N32 %s
-// RUN: %clang_cc1 -triple mipsisa64r6-unknown-linux-gnuabin32 -S -emit-llvm -o - %s  | FileCheck -check-prefix=N32 %s
-// RUN: %clang_cc1 -triple mipsisa64r6el-unknown-linux-gnuabin32 -S -emit-llvm -o - %s  | FileCheck -check-prefix=N32 %s
-// RUN: %clang_cc1 -triple mips64-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
-// RUN: %clang_cc1 -triple mips64el-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
-// RUN: %clang_cc1 -triple mipsisa64r6-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
-// RUN: %clang_cc1 -triple mipsisa64r6el-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
-// RUN: %clang_cc1 -triple mips64-unknown-linux-gnuabi64 -S -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
-// RUN: %clang_cc1 -triple mips64el-unknown-linux-gnuabi64 -S -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
-// RUN: %clang_cc1 -triple mipsisa64r6-unknown-linux-gnuabi64 -S -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
-// RUN: %clang_cc1 -triple mipsisa64r6el-unknown-linux-gnuabi64 -S -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
+// RUN: %clang_cc1 -triple mips-unknown-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=O32 %s
+// RUN: %clang_cc1 -triple mipsel-unknown-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=O32 %s
+// RUN: %clang_cc1 -triple mipsisa32r6-unknown-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=O32 %s
+// RUN: %clang_cc1 -triple mipsisa32r6el-unknown-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=O32 %s
+// RUN: %clang_cc1 -triple mips64-unknown-linux-gnu -emit-llvm -o - %s  -target-abi n32 | FileCheck -check-prefix=N32 %s
+// RUN: %clang_cc1 -triple mips64el-unknown-linux-gnu -emit-llvm -o - %s  -target-abi n32 | FileCheck -check-prefix=N32 %s
+// RUN: %clang_cc1 -triple mipsisa64r6-unknown-linux-gnu -emit-llvm -o - %s  -target-abi n32 | FileCheck -check-prefix=N32 %s
+// RUN: %clang_cc1 -triple mipsisa64r6el-unknown-linux-gnu -emit-llvm -o - %s  -target-abi n32 | FileCheck -check-prefix=N32 %s
+// RUN: %clang_cc1 -triple mips64-unknown-linux-gnuabin32 -emit-llvm -o - %s  | FileCheck -check-prefix=N32 %s
+// RUN: %clang_cc1 -triple mips64el-unknown-linux-gnuabin32 -emit-llvm -o - %s  | FileCheck -check-prefix=N32 %s
+// RUN: %clang_cc1 -triple mipsisa64r6-unknown-linux-gnuabin32 -emit-llvm -o - %s  | FileCheck -check-prefix=N32 %s
+// RUN: %clang_cc1 -triple mipsisa64r6el-unknown-linux-gnuabin32 -emit-llvm -o - %s  | FileCheck -check-prefix=N32 %s
+// RUN: %clang_cc1 -triple mips64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
+// RUN: %clang_cc1 -triple mips64el-unknown-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
+// RUN: %clang_cc1 -triple mipsisa64r6-unknown-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
+// RUN: %clang_cc1 -triple mipsisa64r6el-unknown-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
+// RUN: %clang_cc1 -triple mips64-unknown-linux-gnuabi64 -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
+// RUN: %clang_cc1 -triple mips64el-unknown-linux-gnuabi64 -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
+// RUN: %clang_cc1 -triple mipsisa64r6-unknown-linux-gnuabi64 -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
+// RUN: %clang_cc1 -triple mipsisa64r6el-unknown-linux-gnuabi64 -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
 
 // O32: define{{.*}} void @fn28(ptr dead_on_unwind noalias writable sret(%struct.T2) align 1 %agg.result, i8 noundef signext %arg0)
 // N32: define{{.*}} void @fn28(i8 noundef signext %arg0)
diff --git a/clang/test/CodeGen/mips64-class-return.cpp b/clang/test/CodeGen/mips64-class-return.cpp
index f3fddf1189ec..160d55d19bfd 100644
--- a/clang/test/CodeGen/mips64-class-return.cpp
+++ b/clang/test/CodeGen/mips64-class-return.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple mips64el-unknown-linux -O3 -S -target-abi n64 -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang_cc1 -triple mips64el-unknown-linux -O3 -target-abi n64 -o - -emit-llvm %s | FileCheck %s
 
 class B0 {
   double d;
diff --git a/clang/test/CodeGen/mips64-padding-arg.c b/clang/test/CodeGen/mips64-padding-arg.c
index 038103b1df3a..bb92a841c3f6 100644
--- a/clang/test/CodeGen/mips64-padding-arg.c
+++ b/clang/test/CodeGen/mips64-padding-arg.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -triple mipsel-unknown-linux -O3 -S -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
-// RUN: %clang_cc1 -triple mips64el-unknown-linux -O3 -S -target-abi n64 -o - -emit-llvm %s | FileCheck %s -check-prefix=N64
-// RUN: %clang_cc1 -triple mipsel-unknown-linux -target-feature "+fp64" -O3 -S -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
+// RUN: %clang_cc1 -triple mipsel-unknown-linux -O3 -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
+// RUN: %clang_cc1 -triple mips64el-unknown-linux -O3 -target-abi n64 -o - -emit-llvm %s | FileCheck %s -check-prefix=N64
+// RUN: %clang_cc1 -triple mipsel-unknown-linux -target-feature "+fp64" -O3 -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
 
 typedef struct {
   double d;
diff --git a/clang/test/CodeGen/msan-param-retval.c b/clang/test/CodeGen/msan-param-retval.c
index 744d70620bdc..269a759fac10 100644
--- a/clang/test/CodeGen/msan-param-retval.c
+++ b/clang/test/CodeGen/msan-param-retval.c
@@ -1,12 +1,12 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=CLEAN,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -fno-sanitize-memory-param-retval -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -fno-sanitize-memory-param-retval -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,NOUNDEF_ONLY,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -mllvm -msan-eager-checks -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -mllvm -msan-eager-checks -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,EAGER,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -fsanitize-memory-param-retval -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -fsanitize-memory-param-retval -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=CLEAN,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,EAGER,CHECK
 
 void bar(int x) {
diff --git a/clang/test/CodeGen/msan-param-retval.cpp b/clang/test/CodeGen/msan-param-retval.cpp
index d34dafaa8eb6..c4960a4702f6 100644
--- a/clang/test/CodeGen/msan-param-retval.cpp
+++ b/clang/test/CodeGen/msan-param-retval.cpp
@@ -1,12 +1,12 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=CLEAN,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -fno-sanitize-memory-param-retval -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -fno-sanitize-memory-param-retval -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,NOUNDEF_ONLY,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -mllvm -msan-eager-checks -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -mllvm -msan-eager-checks -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,EAGER,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -fsanitize-memory-param-retval -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -fsanitize-memory-param-retval -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=CLEAN,CHECK
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -o - %s | \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -fsanitize=memory -o - %s | \
 // RUN:     FileCheck %s --check-prefixes=NOUNDEF,EAGER,CHECK
 
 void bar(int x) {
diff --git a/clang/test/CodeGen/msvc_pragma_alloc_text.cpp b/clang/test/CodeGen/msvc_pragma_alloc_text.cpp
index 2703fbe6efc0..f53b54c26b8c 100644
--- a/clang/test/CodeGen/msvc_pragma_alloc_text.cpp
+++ b/clang/test/CodeGen/msvc_pragma_alloc_text.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fms-extensions -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fms-extensions -emit-llvm -o - %s | FileCheck %s
 
 extern "C" {
 
diff --git a/clang/test/CodeGen/multi-aix-builtin-cpu-supports.c b/clang/test/CodeGen/multi-aix-builtin-cpu-supports.c
new file mode 100644
index 000000000000..aba57547ff83
--- /dev/null
+++ b/clang/test/CodeGen/multi-aix-builtin-cpu-supports.c
@@ -0,0 +1,54 @@
+// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %s | FileCheck %s
+
+int main() { 
+  int ret = 0; 
+  ret += __builtin_cpu_supports("vsx");     // Test reading `vsx` information from the system variable `_system_configuration`.
+  ret += __builtin_cpu_supports("htm");     // Test getting `htm` information from the function call `getsystemcfg`
+  ret += __builtin_cpu_supports("cellbe");  // The test always returns false for the feature 'cellbe.
+  ret += __builtin_cpu_supports("power4");  // The test always returns false for the feature `power4`.
+  ret += __builtin_cpu_supports("fpu");     // The test always returns true for the feature `fpu`.
+  ret += __builtin_cpu_supports("mma");     // Test getting `mma` information from the function call `getsystemcfg`.
+  return ret;
+}
+
+// CHECK:     @_system_configuration = external global { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, i32, i32, i32, i64, i64, i64, i64, i32, i32, i32, i32, i32, i32, i64, i32, i8, i8, i8, i8, i32, i32, i16, i16, [3 x i32], i32 }
+// CHECK-EMPTY: 
+// CHECK-NEXT: ; Function Attrs: noinline nounwind optnone
+// CHECK-NEXT: define i32 @main() #0 {
+// CHECK-NEXT: entry:
+// CHECK-NEXT:   %retval = alloca i32, align 4
+// CHECK-NEXT:   %ret = alloca i32, align 4
+// CHECK-NEXT:   store i32 0, ptr %retval, align 4
+// CHECK-NEXT:   store i32 0, ptr %ret, align 4
+// CHECK-NEXT:   %0 = load i32, ptr getelementptr inbounds ({ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, i32, i32, i32, i64, i64, i64, i64, i32, i32, i32, i32, i32, i32, i64, i32, i8, i8, i8, i8, i32, i32, i16, i16, [3 x i32], i32 }, ptr @_system_configuration, i32 0, i32 46), align 4
+// CHECK-NEXT:   %1 = icmp ugt i32 %0, 1
+// CHECK-NEXT:   %conv = zext i1 %1 to i32
+// CHECK-NEXT:   %2 = load i32, ptr %ret, align 4
+// CHECK-NEXT:   %add = add nsw i32 %2, %conv
+// CHECK-NEXT:   store i32 %add, ptr %ret, align 4
+// CHECK-NEXT:   %3 = call i64 @getsystemcfg(i32 59)
+// CHECK-NEXT:   %4 = icmp ugt i64 %3, 0
+// CHECK-NEXT:   %conv1 = zext i1 %4 to i32
+// CHECK-NEXT:   %5 = load i32, ptr %ret, align 4
+// CHECK-NEXT:   %add2 = add nsw i32 %5, %conv1
+// CHECK-NEXT:   store i32 %add2, ptr %ret, align 4
+// CHECK-NEXT:   %6 = load i32, ptr %ret, align 4
+// CHECK-NEXT:   %add3 = add nsw i32 %6, 0
+// CHECK-NEXT:   store i32 %add3, ptr %ret, align 4
+// CHECK-NEXT:   %7 = load i32, ptr %ret, align 4
+// CHECK-NEXT:   %add4 = add nsw i32 %7, 1
+// CHECK-NEXT:   store i32 %add4, ptr %ret, align 4
+// CHECK-NEXT:   %8 = load i32, ptr %ret, align 4
+// CHECK-NEXT:   %add5 = add nsw i32 %8, 1
+// CHECK-NEXT:   store i32 %add5, ptr %ret, align 4
+// CHECK-NEXT:   %9 = call i64 @getsystemcfg(i32 62)
+// CHECK-NEXT:   %10 = icmp ugt i64 %9, 0
+// CHECK-NEXT:   %conv6 = zext i1 %10 to i32
+// CHECK-NEXT:   %11 = load i32, ptr %ret, align 4
+// CHECK-NEXT:   %add7 = add nsw i32 %11, %conv6
+// CHECK-NEXT:   store i32 %add7, ptr %ret, align 4
+// CHECK-NEXT:   %12 = load i32, ptr %ret, align 4
+// CHECK-NEXT:   ret i32 %12
+// CHECK-NEXT: }
+// CHECK-EMPTY: 
+// CHECK-NEXT: declare i64 @getsystemcfg(i32)
diff --git a/clang/test/CodeGen/named_reg_global.c b/clang/test/CodeGen/named_reg_global.c
index eab98797feb1..6d9efe453c82 100644
--- a/clang/test/CodeGen/named_reg_global.c
+++ b/clang/test/CodeGen/named_reg_global.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-X86-64
-// RUN: %clang_cc1 -triple arm64-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM
-// RUN: %clang_cc1 -triple armv7-linux-gnu -target-abi apcs-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-X86-64
+// RUN: %clang_cc1 -triple arm64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM
+// RUN: %clang_cc1 -triple armv7-linux-gnu -target-abi apcs-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM
 
 // CHECK-NOT: @sp = common global
 
diff --git a/clang/test/CodeGen/no-builtin.cpp b/clang/test/CodeGen/no-builtin.cpp
index bfad88e4ec32..54354bddc8b6 100644
--- a/clang/test/CodeGen/no-builtin.cpp
+++ b/clang/test/CodeGen/no-builtin.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s
 
 // CHECK-LABEL: define{{.*}} void @foo_no_mempcy() #0
 extern "C" void foo_no_mempcy() __attribute__((no_builtin("memcpy"))) {}
diff --git a/clang/test/CodeGen/no-junk-ftrunc.c b/clang/test/CodeGen/no-junk-ftrunc.c
index ad1219391843..44638e064426 100644
--- a/clang/test/CodeGen/no-junk-ftrunc.c
+++ b/clang/test/CodeGen/no-junk-ftrunc.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -fno-strict-float-cast-overflow %s -emit-llvm -o - | FileCheck %s --check-prefix=NOSTRICT
+// RUN: %clang_cc1 -fno-strict-float-cast-overflow %s -emit-llvm -o - | FileCheck %s --check-prefix=NOSTRICT
 
 // When compiling with non-standard semantics, use intrinsics to inhibit the optimizer.
 // This used to require a function attribute, so we check that it is NOT here anymore.
@@ -10,7 +10,7 @@
 
 // The workaround attribute is not applied by default.
 
-// RUN: %clang_cc1 -S %s -emit-llvm -o - | FileCheck %s --check-prefix=STRICT
+// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s --check-prefix=STRICT
 // STRICT-LABEL: main
 // STRICT: = fptosi
 // STRICT: = fptoui
diff --git a/clang/test/CodeGen/noexceptionsfpmath.c b/clang/test/CodeGen/noexceptionsfpmath.c
index 13a02b957705..0b7c750c24fa 100644
--- a/clang/test/CodeGen/noexceptionsfpmath.c
+++ b/clang/test/CodeGen/noexceptionsfpmath.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s
 
 // CHECK-LABEL: main
 // CHECK: attributes #0 = {{.*}}"no-trapping-math"="true"{{.*}}
diff --git a/clang/test/CodeGen/nousejumptable.c b/clang/test/CodeGen/nousejumptable.c
index fb6a2a28339b..40ba0e2f7fb0 100644
--- a/clang/test/CodeGen/nousejumptable.c
+++ b/clang/test/CodeGen/nousejumptable.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -fno-jump-tables %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fno-jump-tables %s -emit-llvm -o - | FileCheck %s
 
 // CHECK-LABEL: main
 // CHECK: attributes #0 = {{.*}}"no-jump-tables"="true"{{.*}}
diff --git a/clang/test/CodeGen/nullptr-arithmetic.c b/clang/test/CodeGen/nullptr-arithmetic.c
index ce9c9765b0f7..59a7c9f94678 100644
--- a/clang/test/CodeGen/nullptr-arithmetic.c
+++ b/clang/test/CodeGen/nullptr-arithmetic.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -S %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -S %s -emit-llvm -triple i686-unknown-unknown -o - | FileCheck %s
-// RUN: %clang_cc1 -S %s -emit-llvm -triple x86_64-unknown-unknown -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple i686-unknown-unknown -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple x86_64-unknown-unknown -o - | FileCheck %s
 
 #include <stdint.h>
 
diff --git a/clang/test/CodeGen/nullptr.c b/clang/test/CodeGen/nullptr.c
index 7ea951213df0..1e3510134db7 100644
--- a/clang/test/CodeGen/nullptr.c
+++ b/clang/test/CodeGen/nullptr.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S %s -std=c2x -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -std=c2x -emit-llvm -o - | FileCheck %s
 
 // Test that null <-> nullptr_t conversions work as expected.
 typedef typeof(nullptr) nullptr_t;
diff --git a/clang/test/CodeGen/nvptx-abi.c b/clang/test/CodeGen/nvptx-abi.c
index f2af88fafa64..f9c1de5713f2 100644
--- a/clang/test/CodeGen/nvptx-abi.c
+++ b/clang/test/CodeGen/nvptx-abi.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -S -o - %s -emit-llvm | FileCheck %s
-// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -S -o - %s -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -o - %s -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -o - %s -emit-llvm | FileCheck %s
 
 typedef struct float4_s {
   float x, y, z, w;
diff --git a/clang/test/CodeGen/nvptx-cc.c b/clang/test/CodeGen/nvptx-cc.c
index 1c0d943f956b..f30587ae3853 100644
--- a/clang/test/CodeGen/nvptx-cc.c
+++ b/clang/test/CodeGen/nvptx-cc.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -O3 -S -o %t %s -emit-llvm
-// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -O3 -S -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -O3 -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -O3 -o %t %s -emit-llvm
 
 // Just make sure Clang uses the proper calling convention for the NVPTX back-end.
 // If something is wrong, the back-end will fail.
diff --git a/clang/test/CodeGen/nvptx-cpus.c b/clang/test/CodeGen/nvptx-cpus.c
index 76c55c0edf63..94731248d336 100644
--- a/clang/test/CodeGen/nvptx-cpus.c
+++ b/clang/test/CodeGen/nvptx-cpus.c
@@ -1,11 +1,11 @@
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_20 -O3 -S -o %t %s -emit-llvm
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_21 -O3 -S -o %t %s -emit-llvm
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_30 -O3 -S -o %t %s -emit-llvm
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_35 -O3 -S -o %t %s -emit-llvm
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_37 -O3 -S -o %t %s -emit-llvm
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_50 -O3 -S -o %t %s -emit-llvm
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_52 -O3 -S -o %t %s -emit-llvm
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -O3 -S -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_20 -O3 -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_21 -O3 -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_30 -O3 -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_35 -O3 -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_37 -O3 -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_50 -O3 -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_52 -O3 -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -O3 -o %t %s -emit-llvm
 
 // Make sure clang accepts all supported architectures.
 
diff --git a/clang/test/CodeGen/nvptx-inlineasm-ptx.c b/clang/test/CodeGen/nvptx-inlineasm-ptx.c
index ad1567fb3020..f9cf49b3ebcc 100644
--- a/clang/test/CodeGen/nvptx-inlineasm-ptx.c
+++ b/clang/test/CodeGen/nvptx-inlineasm-ptx.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -O3 -S -o - %s -emit-llvm | FileCheck %s
-// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -O3 -S -o - %s -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -O3 -o - %s -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -O3 -o - %s -emit-llvm | FileCheck %s
 
 void constraints(void) {
   char           c;
diff --git a/clang/test/CodeGen/nvptx-inlineasm.c b/clang/test/CodeGen/nvptx-inlineasm.c
index 860b50ff5852..c9d4facc52e8 100644
--- a/clang/test/CodeGen/nvptx-inlineasm.c
+++ b/clang/test/CodeGen/nvptx-inlineasm.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -O3 -S -o - %s -emit-llvm | FileCheck %s
-// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -O3 -S -o - %s -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -O3 -o - %s -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -O3 -o - %s -emit-llvm | FileCheck %s
 
 int bar(int a) {
   int result;
diff --git a/clang/test/CodeGen/pch-dllexport.cpp b/clang/test/CodeGen/pch-dllexport.cpp
index 1470c9c251b0..b0b5c247d4fd 100644
--- a/clang/test/CodeGen/pch-dllexport.cpp
+++ b/clang/test/CodeGen/pch-dllexport.cpp
@@ -1,20 +1,20 @@
 // Build PCH without object file, then use it.
 // RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-pch -o %t %s
-// RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-obj -emit-llvm -include-pch %t -o - %s | FileCheck -check-prefix=PCH %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-llvm -include-pch %t -o - %s | FileCheck -check-prefix=PCH %s
 
 // Build PCH with object file, then use it.
 // RUN: %clang_cc1 -triple i686-pc-win32 -O1 -fms-extensions -emit-pch -building-pch-with-obj -o %t %s
-// RUN: %clang_cc1 -triple i686-pc-win32 -O1 -disable-llvm-optzns -fms-extensions -emit-obj -emit-llvm -include-pch %t -building-pch-with-obj -o - %s | FileCheck -check-prefix=OBJ %s
-// RUN: %clang_cc1 -triple i686-pc-win32 -O1 -disable-llvm-optzns -fms-extensions -emit-obj -emit-llvm -include-pch %t -o - %s | FileCheck -check-prefix=PCHWITHOBJ -check-prefix=PCHWITHOBJ-O1 %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -O1 -disable-llvm-optzns -fms-extensions -emit-llvm -include-pch %t -building-pch-with-obj -o - %s | FileCheck -check-prefix=OBJ %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -O1 -disable-llvm-optzns -fms-extensions -emit-llvm -include-pch %t -o - %s | FileCheck -check-prefix=PCHWITHOBJ -check-prefix=PCHWITHOBJ-O1 %s
 
 // Check for vars separately to avoid having to reorder the check statements.
-// RUN: %clang_cc1 -triple i686-pc-win32 -O1 -disable-llvm-optzns -fms-extensions -emit-obj -emit-llvm -include-pch %t -o - %s | FileCheck -check-prefix=PCHWITHOBJVARS %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -O1 -disable-llvm-optzns -fms-extensions -emit-llvm -include-pch %t -o - %s | FileCheck -check-prefix=PCHWITHOBJVARS %s
 
 // Test the PCHWITHOBJ at -O0 where available_externally definitions are not
 // provided:
 // RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-pch -building-pch-with-obj -o %t %s
-// RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-obj -emit-llvm -include-pch %t -o - %s | FileCheck -check-prefix=PCHWITHOBJ -check-prefix=PCHWITHOBJ-O0 %s
-// RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-obj -emit-llvm -include-pch %t -o - %s | FileCheck -check-prefix=PCHWITHOBJVARS %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-llvm -include-pch %t -o - %s | FileCheck -check-prefix=PCHWITHOBJ -check-prefix=PCHWITHOBJ-O0 %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-llvm -include-pch %t -o - %s | FileCheck -check-prefix=PCHWITHOBJVARS %s
 
 
 #ifndef IN_HEADER
@@ -85,7 +85,7 @@ void useTemplate() { implicitInstantiation(42); }
 // PCHWITHOBJ: define weak_odr dso_local dllexport void @"??$implicitInstantiation@H@@YAXH@Z"
 
 template<> inline void __declspec(dllexport) explicitSpecialization<int>(int) {}
-// PCHWITHOBJ: define weak_odr dso_local  dllexport void @"??$explicitSpecialization@H@@YAXH@Z"
+// PCHWITHOBJ: define weak_odr dso_local dllexport void @"??$explicitSpecialization@H@@YAXH@Z"
 
 template void __declspec(dllexport) explicitInstantiationDef<int>(int);
 // PCHWITHOBJ: define weak_odr dso_local dllexport void @"??$explicitInstantiationDef@H@@YAXH@Z"
diff --git a/clang/test/CodeGen/personality.c b/clang/test/CodeGen/personality.c
index 0cfbdb0d978f..54eca92305f5 100644
--- a/clang/test/CodeGen/personality.c
+++ b/clang/test/CodeGen/personality.c
@@ -1,16 +1,16 @@
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-DWARF
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-SEH
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-SJLJ
-
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -D __SEH_EXCEPTIONS__ -fms-extensions -fexceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-SEH-X86
-// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -D __SEH_EXCEPTIONS__ -fms-extensions -fexceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-SEH-X64
-
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-DWARF
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-SEH
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-SJLJ
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fblocks -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fblocks -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-DWARF
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fblocks -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-SEH
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fblocks -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-SJLJ
+
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fblocks -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -D __SEH_EXCEPTIONS__ -fms-extensions -fexceptions -fblocks -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-SEH-X86
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -D __SEH_EXCEPTIONS__ -fms-extensions -fexceptions -fblocks -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-SEH-X64
+
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fblocks -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fblocks -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-DWARF
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fblocks -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-SEH
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fblocks -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-SJLJ
 
 
 extern void g(void (^)(void));
diff --git a/clang/test/CodeGen/pr87758.c b/clang/test/CodeGen/pr87758.c
new file mode 100644
index 000000000000..1357449187ec
--- /dev/null
+++ b/clang/test/CodeGen/pr87758.c
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// precise mode
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fmath-errno -ffp-contract=on \
+// RUN: -fno-rounding-math -emit-llvm  -o - %s | FileCheck \
+// RUN: --check-prefix=CHECK-PRECISE %s
+
+// fast mode
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -ffast-math -ffp-contract=fast \
+// RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-FAST %s
+
+// Reproducer for issue #87758
+// The testcase below verifies that the "fast" flag are set on the calls.
+
+float sqrtf(float x); // unary fp builtin
+float powf(float x, float y); // binary fp builtin
+float fmaf(float x, float y, float z); // ternary fp builtin
+char *rindex(const char *s, int c); // not a fp builtin
+
+#pragma float_control(push)
+#pragma float_control(precise, off)
+// CHECK: define dso_local float @fp_precise_off_libm_calls(
+// CHECK: call fast float @llvm.sqrt.f32(
+// CHECK: call fast float @llvm.pow.f32(
+// CHECK: call fast float @llvm.fma.f32(
+// CHECK: call ptr @rindex(
+
+// CHECK-PRECISE: define dso_local float @fp_precise_off_libm_calls(
+// CHECK-PRECISE: call fast float @sqrtf(
+// CHECK-PRECISE: call fast float @powf(
+// CHECK-PRECISE: call fast float @llvm.fma.f32(
+// CHECK-PRECISE: call ptr @rindex(
+
+// CHECK-FAST: define dso_local nofpclass(nan inf) float @fp_precise_off_libm_calls(
+// CHECK-FAST: call fast float @llvm.sqrt.f32(
+// CHECK-FAST: call fast float @llvm.pow.f32(
+// CHECK-FAST: call fast float @llvm.fma.f32(
+// CHECK-FAST: call ptr @rindex(
+
+float fp_precise_off_libm_calls(float a, float b, float c, const char *d, char *e, unsigned char f) {
+  a = sqrtf(a);
+  a = powf(a,b);
+  a = fmaf(a,b,c);
+  e = rindex(d, 75);
+  return a;
+}
+#pragma float_control(pop)
+
+#pragma float_control(push)
+#pragma float_control(precise, on)
+// CHECK: define dso_local float @fp_precise_on_libm_calls(
+// CHECK: call float @sqrtf(
+// CHECK: call float @powf(
+// CHECK: call float @llvm.fma.f32(
+// CHECK: call ptr @rindex(
+
+// CHECK-PRECISE: define dso_local float @fp_precise_on_libm_calls(
+// CHECK-PRECISE: call float @sqrtf(
+// CHECK-PRECISE: call float @powf(
+// CHECK-PRECISE: call float @llvm.fma.f32(
+// CHECK-PRECISE: call ptr @rindex(
+
+// CHECK-FAST: define dso_local nofpclass(nan inf) float @fp_precise_on_libm_calls(
+// CHECK-FAST: call nofpclass(nan inf) float @sqrtf(
+// CHECK-FAST: call nofpclass(nan inf) float @powf(
+// CHECK-FAST: call float @llvm.fma.f32(
+// CHECK-FAST: call ptr @rindex(
+
+float fp_precise_on_libm_calls(float a, float b, float c, const char *d, char *e, unsigned char f) {
+  a = sqrtf(a);
+  a = powf(a,b);
+  a = fmaf(a,b,c);
+  e = rindex(d, 75);
+  return a;
+}
+#pragma float_control(pop)
diff --git a/clang/test/CodeGen/rounding-math.c b/clang/test/CodeGen/rounding-math.c
index 6673dfba8a95..fbbf06b64733 100644
--- a/clang/test/CodeGen/rounding-math.c
+++ b/clang/test/CodeGen/rounding-math.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -fexperimental-strict-floating-point -S -emit-llvm -ffp-exception-behavior=strict -Wno-unknown-pragmas %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fexperimental-strict-floating-point -S -emit-llvm -frounding-math -Wno-unknown-pragmas %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fexperimental-strict-floating-point -emit-llvm -ffp-exception-behavior=strict -Wno-unknown-pragmas %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fexperimental-strict-floating-point -emit-llvm -frounding-math -Wno-unknown-pragmas %s -o - | FileCheck %s
 
 float PR47807 = -8.6563630030e-03;
 
diff --git a/clang/test/CodeGen/rounding-math.cpp b/clang/test/CodeGen/rounding-math.cpp
index c0ed9fce8824..264031dc9daa 100644
--- a/clang/test/CodeGen/rounding-math.cpp
+++ b/clang/test/CodeGen/rounding-math.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -emit-llvm -triple i386-linux -Wno-unknown-pragmas -frounding-math %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple i386-linux -Wno-unknown-pragmas -frounding-math %s -o - | FileCheck %s
 
 constexpr float func_01(float x, float y) {
   return x + y;
diff --git a/clang/test/CodeGen/sanitize-metadata-ignorelist.c b/clang/test/CodeGen/sanitize-metadata-ignorelist.c
index b5656fd0781d..24fb4fa62cc5 100644
--- a/clang/test/CodeGen/sanitize-metadata-ignorelist.c
+++ b/clang/test/CodeGen/sanitize-metadata-ignorelist.c
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -O -fexperimental-sanitize-metadata=atomics -triple x86_64-gnu-linux -x c -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALLOW
+// RUN: %clang_cc1 -O -fexperimental-sanitize-metadata=atomics -triple x86_64-gnu-linux -x c -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALLOW
 // RUN: echo "fun:foo" > %t.fun
-// RUN: %clang_cc1 -O -fexperimental-sanitize-metadata=atomics -fexperimental-sanitize-metadata-ignorelist=%t.fun -triple x86_64-gnu-linux -x c -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=FUN
+// RUN: %clang_cc1 -O -fexperimental-sanitize-metadata=atomics -fexperimental-sanitize-metadata-ignorelist=%t.fun -triple x86_64-gnu-linux -x c -emit-llvm %s -o - | FileCheck %s --check-prefixes=FUN
 // RUN: echo "src:*sanitize-metadata-ignorelist.c" > %t.src
-// RUN: %clang_cc1 -O -fexperimental-sanitize-metadata=atomics -fexperimental-sanitize-metadata-ignorelist=%t.src -triple x86_64-gnu-linux -x c -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=SRC
+// RUN: %clang_cc1 -O -fexperimental-sanitize-metadata=atomics -fexperimental-sanitize-metadata-ignorelist=%t.src -triple x86_64-gnu-linux -x c -emit-llvm %s -o - | FileCheck %s --check-prefixes=SRC
 
 int y;
 
diff --git a/clang/test/CodeGen/sanitize-metadata-nosanitize.c b/clang/test/CodeGen/sanitize-metadata-nosanitize.c
index 488714fe6078..60f93476b050 100644
--- a/clang/test/CodeGen/sanitize-metadata-nosanitize.c
+++ b/clang/test/CodeGen/sanitize-metadata-nosanitize.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2
-// RUN: %clang_cc1 -O -fexperimental-sanitize-metadata=covered -fexperimental-sanitize-metadata=atomics -fexperimental-sanitize-metadata=uar -triple x86_64-gnu-linux -x c -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -O -fexperimental-sanitize-metadata=covered -fexperimental-sanitize-metadata=atomics -fexperimental-sanitize-metadata=uar -triple x86_64-gnu-linux -x c -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK
 
 //.
 // CHECK: @__start_sanmd_covered = extern_weak hidden global ptr
diff --git a/clang/test/CodeGen/sanitize-metadata.c b/clang/test/CodeGen/sanitize-metadata.c
index 0dbcd886f262..7e1de0c20884 100644
--- a/clang/test/CodeGen/sanitize-metadata.c
+++ b/clang/test/CodeGen/sanitize-metadata.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -O -fexperimental-sanitize-metadata=atomics -triple x86_64-gnu-linux -x c -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,ATOMICS
-// RUN: %clang_cc1 -O -fexperimental-sanitize-metadata=atomics -triple aarch64-gnu-linux -x c -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,ATOMICS
+// RUN: %clang_cc1 -O -fexperimental-sanitize-metadata=atomics -triple x86_64-gnu-linux -x c -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,ATOMICS
+// RUN: %clang_cc1 -O -fexperimental-sanitize-metadata=atomics -triple aarch64-gnu-linux -x c -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,ATOMICS
 
 // CHECK: @__start_sanmd_atomics = extern_weak hidden global ptr
 // CHECK: @__stop_sanmd_atomics = extern_weak hidden global ptr
diff --git a/clang/test/CodeGen/split-debug-filename.c b/clang/test/CodeGen/split-debug-filename.c
index 5a0562c8456b..ea710cc5d944 100644
--- a/clang/test/CodeGen/split-debug-filename.c
+++ b/clang/test/CodeGen/split-debug-filename.c
@@ -1,5 +1,5 @@
 // REQUIRES: x86-registered-target
-// RUN: %clang_cc1 -debug-info-kind=limited -split-dwarf-file foo.dwo -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -debug-info-kind=limited -split-dwarf-file foo.dwo -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple x86_64-unknown-linux -debug-info-kind=limited -split-dwarf-file %t.dwo -split-dwarf-output %t.dwo -emit-obj -o - %s | llvm-readobj -S - | FileCheck --check-prefix=O %s
 // RUN: llvm-readobj -S %t.dwo | FileCheck --check-prefix=DWO %s
 
diff --git a/clang/test/CodeGen/split-debug-inlining.c b/clang/test/CodeGen/split-debug-inlining.c
index 4730891e8450..b1c9814c8d11 100644
--- a/clang/test/CodeGen/split-debug-inlining.c
+++ b/clang/test/CodeGen/split-debug-inlining.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -debug-info-kind=limited -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -debug-info-kind=limited -fsplit-dwarf-inlining -S -emit-llvm -o - %s | FileCheck --check-prefix=ABSENT %s
+// RUN: %clang_cc1 -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -debug-info-kind=limited -fsplit-dwarf-inlining -emit-llvm -o - %s | FileCheck --check-prefix=ABSENT %s
 void f(void) {}
 // Verify that disabling split debug inlining info is propagated to the debug
 // info metadata.
diff --git a/clang/test/CodeGen/stack-clash-protection.c b/clang/test/CodeGen/stack-clash-protection.c
index dab9ee768c28..b07e4c4ce908 100644
--- a/clang/test/CodeGen/stack-clash-protection.c
+++ b/clang/test/CodeGen/stack-clash-protection.c
@@ -1,9 +1,9 @@
 // Check the correct function attributes are generated
-// RUN: %clang_cc1 -triple x86_64-linux -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
-// RUN: %clang_cc1 -triple s390x-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
-// RUN: %clang_cc1 -triple powerpc64le-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
-// RUN: %clang_cc1 -triple powerpc64-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux -O0 -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
+// RUN: %clang_cc1 -triple s390x-linux-gnu -O0 -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc64le-linux-gnu -O0 -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc64-linux-gnu -O0 -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -O0 -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
 
 // CHECK: define{{.*}} void @large_stack() #[[A:.*]] {
 void large_stack(void) {
diff --git a/clang/test/CodeGen/strictfp-elementwise-bulitins.cpp b/clang/test/CodeGen/strictfp-elementwise-bulitins.cpp
index fdf865ebbe89..c72d59499169 100644
--- a/clang/test/CodeGen/strictfp-elementwise-bulitins.cpp
+++ b/clang/test/CodeGen/strictfp-elementwise-bulitins.cpp
@@ -187,6 +187,16 @@ float4 strict_elementwise_sqrt(float4 a) {
   return __builtin_elementwise_sqrt(a);
 }
 
+// CHECK-LABEL: define dso_local noundef <4 x float> @_Z22strict_elementwise_tanDv4_f
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ELT_TAN:%.*]] = tail call <4 x float> @llvm.tan.v4f32(<4 x float> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x float> [[ELT_TAN]]
+//
+float4 strict_elementwise_tan(float4 a) {
+  return __builtin_elementwise_tan(a);
+}
+
 // CHECK-LABEL: define dso_local noundef <4 x float> @_Z24strict_elementwise_truncDv4_f
 // CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // CHECK-NEXT:  entry:
diff --git a/clang/test/CodeGen/strlen-inline-builtin-redecl.c b/clang/test/CodeGen/strlen-inline-builtin-redecl.c
index c89f843a71a3..ecb062eaaf18 100644
--- a/clang/test/CodeGen/strlen-inline-builtin-redecl.c
+++ b/clang/test/CodeGen/strlen-inline-builtin-redecl.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64 -S -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64 -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
 //
 // Verifies that clang-generated *.inline are removed when shadowed by an external definition
 
diff --git a/clang/test/CodeGen/struct-matching-constraint.c b/clang/test/CodeGen/struct-matching-constraint.c
index dfc3014c5d9a..0cd2ce48f447 100644
--- a/clang/test/CodeGen/struct-matching-constraint.c
+++ b/clang/test/CodeGen/struct-matching-constraint.c
@@ -1,5 +1,5 @@
 // REQUIRES: arm-registered-target
-// RUN: %clang_cc1 -S -emit-llvm -triple armv7a-apple-darwin -target-feature +neon %s -o /dev/null
+// RUN: %clang_cc1 -emit-llvm -triple armv7a-apple-darwin -target-feature +neon %s -o /dev/null
 typedef unsigned short uint16_t;
 typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
 
diff --git a/clang/test/CodeGen/struct-union-BE.c b/clang/test/CodeGen/struct-union-BE.c
index d2b6e98c0277..02efdb5a84be 100644
--- a/clang/test/CodeGen/struct-union-BE.c
+++ b/clang/test/CodeGen/struct-union-BE.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -triple mips-linux-gnu  -S -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS
-// RUN: %clang_cc1 -triple mips64-linux-gnu  -S -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS64
-// RUN: %clang_cc1 -triple armebv7-linux-gnueabihf -S -emit-llvm %s -o - | FileCheck %s -check-prefix=ARM
+// RUN: %clang_cc1 -triple mips-linux-gnu  -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS
+// RUN: %clang_cc1 -triple mips64-linux-gnu  -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS64
+// RUN: %clang_cc1 -triple armebv7-linux-gnueabihf -emit-llvm %s -o - | FileCheck %s -check-prefix=ARM
 
 #include <stdarg.h>
 
diff --git a/clang/test/CodeGen/svboolx2_t.cpp b/clang/test/CodeGen/svboolx2_t.cpp
index 060940d8755f..069d4f1fc46c 100644
--- a/clang/test/CodeGen/svboolx2_t.cpp
+++ b/clang/test/CodeGen/svboolx2_t.cpp
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -emit-llvm -o - %s | FileCheck %s
 
 // CHECK-LABEL: @_Z3foo10svboolx2_t(
 // CHECK-NEXT:  entry:
diff --git a/clang/test/CodeGen/svboolx4_t.cpp b/clang/test/CodeGen/svboolx4_t.cpp
index 8360786c06d7..ef20dc0302c2 100644
--- a/clang/test/CodeGen/svboolx4_t.cpp
+++ b/clang/test/CodeGen/svboolx4_t.cpp
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -emit-llvm -o - %s | FileCheck %s
 
 // CHECK-LABEL: @_Z3foo10svboolx4_t(
 // CHECK-NEXT:  entry:
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index c184f314f68f..9d86880d6513 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -185,15 +185,15 @@
 
 // RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=AARCH64
-// AARCH64: target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+// AARCH64: target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
 
 // RUN: %clang_cc1 -triple arm64_32-apple-ios7.0 -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=AARCH64-ILP32
-// AARCH64-ILP32: target datalayout = "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128"
+// AARCH64-ILP32: target datalayout = "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128-Fn32"
 
 // RUN: %clang_cc1 -triple arm64-pc-win32-macho -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=AARCH64-WIN32-MACHO
-// AARCH64-WIN32-MACHO: target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+// AARCH64-WIN32-MACHO: target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"
 
 // RUN: %clang_cc1 -triple thumb-unknown-gnueabi -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=THUMB
diff --git a/clang/test/CodeGen/thinlto-distributed-cfi-devirt.ll b/clang/test/CodeGen/thinlto-distributed-cfi-devirt.ll
index 433fd1fe2043..acbcdcdb4fd0 100644
--- a/clang/test/CodeGen/thinlto-distributed-cfi-devirt.ll
+++ b/clang/test/CodeGen/thinlto-distributed-cfi-devirt.ll
@@ -38,12 +38,12 @@
 ; CHECK-DIS: ^2 = typeid: (name: "_ZTS1A", summary: (typeTestRes: (kind: allOnes, sizeM1BitWidth: 7), wpdResolutions: ((offset: 0, wpdRes: (kind: branchFunnel)), (offset: 8, wpdRes: (kind: singleImpl, singleImplName: "_ZN1A1nEi"))))) ; guid = 7004155349499253778
 
 ; RUN: %clang_cc1 -triple x86_64-grtev4-linux-gnu \
-; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc -O2 -Rpass=wholeprogramdevirt \
+; RUN:   -fthinlto-index=%t.o.thinlto.bc -O2 -Rpass=wholeprogramdevirt \
 ; RUN:   -emit-llvm -o - -x ir %t.o 2>&1 | FileCheck %s --check-prefixes=CHECK-IR --check-prefixes=REMARKS
 
 ; Check that the devirtualization is suppressed via -wholeprogramdevirt-skip
 ; RUN: %clang_cc1 -triple x86_64-grtev4-linux-gnu -mllvm -wholeprogramdevirt-skip=_ZN1A1nEi \
-; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc -O2 -Rpass=wholeprogramdevirt \
+; RUN:   -fthinlto-index=%t.o.thinlto.bc -O2 -Rpass=wholeprogramdevirt \
 ; RUN:   -emit-llvm -o - -x ir %t.o 2>&1 | FileCheck %s --check-prefixes=SKIP-IR --check-prefixes=SKIP-REMARKS
 
 ; REMARKS: single-impl: devirtualized a call to _ZN1A1nEi
diff --git a/clang/test/CodeGen/thinlto-distributed-cfi.ll b/clang/test/CodeGen/thinlto-distributed-cfi.ll
index 47e56c091a61..6023ba8f32df 100644
--- a/clang/test/CodeGen/thinlto-distributed-cfi.ll
+++ b/clang/test/CodeGen/thinlto-distributed-cfi.ll
@@ -28,7 +28,7 @@
 ; CHECK-DIS: ^2 = typeid: (name: "_ZTS1A", summary: (typeTestRes: (kind: single, sizeM1BitWidth: 0))) ; guid = 7004155349499253778
 
 ; RUN: %clang_cc1 -triple x86_64-grtev4-linux-gnu \
-; RUN:   -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN:   -fthinlto-index=%t.o.thinlto.bc \
 ; RUN:   -emit-llvm -o - -x ir %t.o | FileCheck %s --check-prefixes=CHECK-IR
 
 ; Ensure that backend does not fail generating native code.
diff --git a/clang/test/CodeGen/thinlto-loop-vectorize-pm.c b/clang/test/CodeGen/thinlto-loop-vectorize-pm.c
index 68891506a811..18fcf7595cd2 100644
--- a/clang/test/CodeGen/thinlto-loop-vectorize-pm.c
+++ b/clang/test/CodeGen/thinlto-loop-vectorize-pm.c
@@ -8,9 +8,9 @@
 // "-mllvm -vectorize-loops=false" will disable loop vectorization, overriding
 // the cc1 option.
 //
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-obj -O2 -vectorize-loops -mllvm -force-vector-width=2 -mllvm -force-vector-interleave=1 -emit-llvm -o - -x ir %t.o -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s --check-prefix=O2-LPV
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-obj -O2 -vectorize-loops -mllvm -vectorize-loops=false -mllvm -force-vector-width=2 -mllvm -force-vector-interleave=1 -emit-llvm -o - -x ir %t.o -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s --check-prefix=O2-NOLPV
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-obj -O0 -vectorize-loops -mllvm -force-vector-width=2 -mllvm -force-vector-interleave=1 -emit-llvm -o - -x ir %t.o -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s --check-prefix=O0-LPV
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -vectorize-loops -mllvm -force-vector-width=2 -mllvm -force-vector-interleave=1 -emit-llvm -o - -x ir %t.o -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s --check-prefix=O2-LPV
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -vectorize-loops -mllvm -vectorize-loops=false -mllvm -force-vector-width=2 -mllvm -force-vector-interleave=1 -emit-llvm -o - -x ir %t.o -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s --check-prefix=O2-NOLPV
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O0 -vectorize-loops -mllvm -force-vector-width=2 -mllvm -force-vector-interleave=1 -emit-llvm -o - -x ir %t.o -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s --check-prefix=O0-LPV
 // O2-LPV: = !{!"llvm.loop.isvectorized", i32 1}
 // O2-NOLPV-NOT: = !{!"llvm.loop.isvectorized", i32 1}
 // O0-LPV-NOT: = !{!"llvm.loop.isvectorized", i32 1}
@@ -21,9 +21,9 @@
 // "-mllvm -interleave-loops=false" will disable the interleaving, overriding
 // the cc1 option.
 //
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-obj -O2 -vectorize-loops -mllvm -force-vector-width=2 -emit-llvm -o - -x ir %t.o -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s --check-prefix=O2-InterLeave
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-obj -O2 -vectorize-loops -mllvm -interleave-loops=false -mllvm -force-vector-width=2 -emit-llvm -o - -x ir %t.o -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s --check-prefix=O2-NoInterLeave
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-obj -O0 -vectorize-loops -mllvm -force-vector-width=2 -emit-llvm -o - -x ir %t.o -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s --check-prefix=O0-InterLeave
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -vectorize-loops -mllvm -force-vector-width=2 -emit-llvm -o - -x ir %t.o -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s --check-prefix=O2-InterLeave
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -vectorize-loops -mllvm -interleave-loops=false -mllvm -force-vector-width=2 -emit-llvm -o - -x ir %t.o -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s --check-prefix=O2-NoInterLeave
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O0 -vectorize-loops -mllvm -force-vector-width=2 -emit-llvm -o - -x ir %t.o -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s --check-prefix=O0-InterLeave
 // O2-InterLeave-COUNT-2: store <2 x double>
 // O2-InterLeave: = !{!"llvm.loop.isvectorized", i32 1}
 // O2-NoInterLeave-COUNT-1: store <2 x double>
diff --git a/clang/test/CodeGen/tls-maxalign-modflag.c b/clang/test/CodeGen/tls-maxalign-modflag.c
index d2936b66eda6..685057c3551a 100644
--- a/clang/test/CodeGen/tls-maxalign-modflag.c
+++ b/clang/test/CodeGen/tls-maxalign-modflag.c
@@ -1,8 +1,8 @@
 // REQUIRES: x86-registered-target
 
 // Test that we get the module flag TLSMaxAlign on the PS platforms.
-// RUN: %clang_cc1 -triple x86_64-scei-ps4 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-scei-ps5 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-scei-ps4 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-scei-ps5 -emit-llvm -o - %s | FileCheck %s
 
 int main(void) {
   return 0;
diff --git a/clang/test/CodeGen/ubsan-volatile.c b/clang/test/CodeGen/ubsan-volatile.c
index ce54aada81fd..66e045bcc6b4 100644
--- a/clang/test/CodeGen/ubsan-volatile.c
+++ b/clang/test/CodeGen/ubsan-volatile.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsanitize=null,alignment,object-size,vptr -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsanitize=null,alignment,object-size,vptr -emit-llvm %s -o - | FileCheck %s
 
 // CHECK: @volatile_null_deref
 void volatile_null_deref(volatile int *p) {
diff --git a/clang/test/CodeGen/unique-internal-linkage-names.c b/clang/test/CodeGen/unique-internal-linkage-names.c
index 6f8f243ef14a..0fd5e516eec4 100644
--- a/clang/test/CodeGen/unique-internal-linkage-names.c
+++ b/clang/test/CodeGen/unique-internal-linkage-names.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -S -emit-llvm -funique-internal-linkage-names -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -funique-internal-linkage-names -o - | FileCheck %s
 
 // Check that we do not crash when overloading extern functions.
 
diff --git a/clang/test/CodeGen/unique-internal-linkage-names.cpp b/clang/test/CodeGen/unique-internal-linkage-names.cpp
index 65cf9db80b91..e847cea9d273 100644
--- a/clang/test/CodeGen/unique-internal-linkage-names.cpp
+++ b/clang/test/CodeGen/unique-internal-linkage-names.cpp
@@ -1,7 +1,7 @@
 // This test checks if internal linkage symbols get unique names with
 // -funique-internal-linkage-names option.
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -x c++ -S -emit-llvm -o - < %s | FileCheck %s --check-prefix=PLAIN
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -x c++  -S -emit-llvm -funique-internal-linkage-names -o - < %s | FileCheck %s --check-prefix=UNIQUE
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -x c++ -emit-llvm -o - < %s | FileCheck %s --check-prefix=PLAIN
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -x c++  -emit-llvm -funique-internal-linkage-names -o - < %s | FileCheck %s --check-prefix=UNIQUE
 
 static int glob;
 static int foo() {
diff --git a/clang/test/CodeGen/user-func-gnu-inline-redecl.c b/clang/test/CodeGen/user-func-gnu-inline-redecl.c
index 0415cbe1e6c7..d706f690048b 100644
--- a/clang/test/CodeGen/user-func-gnu-inline-redecl.c
+++ b/clang/test/CodeGen/user-func-gnu-inline-redecl.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64 -S -emit-llvm -O1 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64 -emit-llvm -O1 -o - %s | FileCheck %s
 //
 // Verifies that the gnu_inline version is ignored in favor of the redecl
 
diff --git a/clang/test/CodeGen/vectorcall.c b/clang/test/CodeGen/vectorcall.c
index cb53ecc70351..71dc3b0b9585 100644
--- a/clang/test/CodeGen/vectorcall.c
+++ b/clang/test/CodeGen/vectorcall.c
@@ -140,4 +140,20 @@ void __vectorcall vectorcall_indirect_vec(
 // X86-SAME: ptr inreg noundef %0,
 // X86-SAME: i32 inreg noundef %edx,
 // X86-SAME: ptr noundef %1)
+
+void __vectorcall vectorcall_indirect_fp(
+    double xmm0, double xmm1, double xmm2, double xmm3, double xmm4,
+    v4f32 xmm5, v4f32 ecx, int edx, double mem) {
+}
+
+// X86: define dso_local x86_vectorcallcc void @"\01vectorcall_indirect_fp@@{{[0-9]+}}"
+// X86-SAME: (double inreg noundef %xmm0,
+// X86-SAME: double inreg noundef %xmm1,
+// X86-SAME: double inreg noundef %xmm2,
+// X86-SAME: double inreg noundef %xmm3,
+// X86-SAME: double inreg noundef %xmm4,
+// X86-SAME: <4 x float> inreg noundef %xmm5,
+// X86-SAME: ptr inreg noundef %0,
+// X86-SAME: i32 inreg noundef %edx,
+// X86-SAME: double noundef %mem)
 #endif
diff --git a/clang/test/CodeGen/windows-seh-EHa-CppCatchDotDotDot.cpp b/clang/test/CodeGen/windows-seh-EHa-CppCatchDotDotDot.cpp
index c9e6c9925edd..db23252d0d95 100644
--- a/clang/test/CodeGen/windows-seh-EHa-CppCatchDotDotDot.cpp
+++ b/clang/test/CodeGen/windows-seh-EHa-CppCatchDotDotDot.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-windows -fasync-exceptions -fcxx-exceptions -fexceptions -fms-extensions -x c++ -Wno-implicit-function-declaration -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows -fasync-exceptions -fcxx-exceptions -fexceptions -fms-extensions -x c++ -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s
 
 // CHECK: define dso_local void @"?crash@@YAXH@Z
 // CHECK: invoke void @llvm.seh.try.begin()
diff --git a/clang/test/CodeGen/windows-seh-EHa-CppCatchReturn.cpp b/clang/test/CodeGen/windows-seh-EHa-CppCatchReturn.cpp
index 822c629bc207..6102e0ce0eda 100644
--- a/clang/test/CodeGen/windows-seh-EHa-CppCatchReturn.cpp
+++ b/clang/test/CodeGen/windows-seh-EHa-CppCatchReturn.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-windows -fasync-exceptions -fcxx-exceptions -fexceptions -fms-extensions -x c++ -Wno-implicit-function-declaration -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows -fasync-exceptions -fcxx-exceptions -fexceptions -fms-extensions -x c++ -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s
 
 // CHECK: define dso_local void @"?foo@@YAXXZ
 // CHECK: invoke void @llvm.seh.try.begin()
diff --git a/clang/test/CodeGen/windows-seh-EHa-CppCondiTemps.cpp b/clang/test/CodeGen/windows-seh-EHa-CppCondiTemps.cpp
index 504826f8a7c0..4b6bb4a94855 100644
--- a/clang/test/CodeGen/windows-seh-EHa-CppCondiTemps.cpp
+++ b/clang/test/CodeGen/windows-seh-EHa-CppCondiTemps.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-windows -fasync-exceptions -fcxx-exceptions -fexceptions -fms-extensions -x c++ -Wno-implicit-function-declaration -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows -fasync-exceptions -fcxx-exceptions -fexceptions -fms-extensions -x c++ -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s
 
 // CHECK: define dso_local noundef i32 @"?bar@@YAHHVB1@@VB2@@@Z"
 // CHECK: %coerce.dive1 = getelementptr inbounds %class.B2
diff --git a/clang/test/CodeGen/windows-seh-EHa-CppDtors01.cpp b/clang/test/CodeGen/windows-seh-EHa-CppDtors01.cpp
index 26676f513d28..c12ae65c2361 100644
--- a/clang/test/CodeGen/windows-seh-EHa-CppDtors01.cpp
+++ b/clang/test/CodeGen/windows-seh-EHa-CppDtors01.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-windows -fasync-exceptions -fcxx-exceptions -fexceptions -fms-extensions -x c++ -Wno-implicit-function-declaration -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows -fasync-exceptions -fcxx-exceptions -fexceptions -fms-extensions -x c++ -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s
 
 // CHECK: invoke void @llvm.seh.scope.begin()
 // CHECK: invoke void @llvm.seh.scope.begin()
diff --git a/clang/test/CodeGen/windows-seh-EHa-TryInFinally.cpp b/clang/test/CodeGen/windows-seh-EHa-TryInFinally.cpp
index ce2a9528e190..462ba9afb5b3 100644
--- a/clang/test/CodeGen/windows-seh-EHa-TryInFinally.cpp
+++ b/clang/test/CodeGen/windows-seh-EHa-TryInFinally.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-windows -fasync-exceptions -fcxx-exceptions -fexceptions -fms-extensions -x c++ -Wno-implicit-function-declaration -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows -fasync-exceptions -fcxx-exceptions -fexceptions -fms-extensions -x c++ -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s
 
 // CHECK-LABEL: @main()
 // CHECK: invoke void @llvm.seh.try.begin()
diff --git a/clang/test/CodeGen/windows-seh-abnormal-exits.c b/clang/test/CodeGen/windows-seh-abnormal-exits.c
index d718ae7370ac..1f1f29f67679 100644
--- a/clang/test/CodeGen/windows-seh-abnormal-exits.c
+++ b/clang/test/CodeGen/windows-seh-abnormal-exits.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-windows -fms-extensions -Wno-implicit-function-declaration -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows -fms-extensions -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s
 
 // CHECK: %[[src:[0-9-]+]] = call ptr @llvm.localaddress()
 // CHECK-NEXT: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4
diff --git a/clang/test/CodeGen/windows-seh-filter-inFinally.c b/clang/test/CodeGen/windows-seh-filter-inFinally.c
index 2e6fc501512d..fa926c236be8 100644
--- a/clang/test/CodeGen/windows-seh-filter-inFinally.c
+++ b/clang/test/CodeGen/windows-seh-filter-inFinally.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-windows -fms-extensions -Wno-implicit-function-declaration -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows -fms-extensions -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s
 
 // CHECK: %[[dst:[0-9-]+]] = call ptr @llvm.eh.recoverfp(ptr @"?fin$0@0@main@@", ptr %frame_pointer)
 // CHECK-NEXT: %[[dst1:[0-9-]+]] = call ptr @llvm.localrecover(ptr @"?fin$0@0@main@@", ptr %[[dst]], i32 0)
diff --git a/clang/test/CodeGen/x64-microsoft-arguments.cpp b/clang/test/CodeGen/x64-microsoft-arguments.cpp
new file mode 100644
index 000000000000..c666c92ad2db
--- /dev/null
+++ b/clang/test/CodeGen/x64-microsoft-arguments.cpp
@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -ffreestanding -emit-llvm -O0 \
+// RUN: -x c++ -o - %s | FileCheck %s
+
+int global_i = 0;
+
+// Pass and return object with a reference type (pass directly, return indirectly).
+// CHECK: define dso_local void @"?f1@@YA?AUS1@@XZ"(ptr dead_on_unwind noalias writable sret(%struct.S1) align 8 {{.*}})
+// CHECK: call void @"?func1@@YA?AUS1@@U1@@Z"(ptr dead_on_unwind writable sret(%struct.S1) align 8 {{.*}}, i64 {{.*}})
+struct S1 {
+  int& r;
+};
+
+S1 func1(S1 x);
+S1 f1() {
+  S1 x{ global_i };
+  return func1(x);
+}
+
+// Pass and return object with a reference type within an inner struct (pass directly, return indirectly).
+// CHECK: define dso_local void @"?f2@@YA?AUS2@@XZ"(ptr dead_on_unwind noalias writable sret(%struct.S2) align 8 {{.*}})
+// CHECK: call void @"?func2@@YA?AUS2@@U1@@Z"(ptr dead_on_unwind writable sret(%struct.S2) align 8 {{.*}}, i64 {{.*}})
+struct Inner {
+  int& r;
+};
+
+struct S2 {
+  Inner i;
+};
+
+S2 func2(S2 x);
+S2 f2() {
+  S2 x{ { global_i } };
+  return func2(x);
+}
+
+// Pass and return object with a reference type (pass directly, return indirectly).
+// CHECK: define dso_local void @"?f3@@YA?AUS3@@XZ"(ptr dead_on_unwind noalias writable sret(%struct.S3) align 8 {{.*}})
+// CHECK: call void @"?func3@@YA?AUS3@@U1@@Z"(ptr dead_on_unwind writable sret(%struct.S3) align 8 {{.*}}, i64 {{.*}})
+struct S3 {
+  const int& r;
+};
+
+S3 func3(S3 x);
+S3 f3() {
+  S3 x{ global_i };
+  return func3(x);
+}
+
+// Pass and return object with a reference type within an inner struct (pass directly, return indirectly).
+// CHECK: define dso_local void @"?f4@@YA?AUS4@@XZ"(ptr dead_on_unwind noalias writable sret(%struct.S4) align 8 {{.*}})
+// CHECK: call void @"?func4@@YA?AUS4@@U1@@Z"(ptr dead_on_unwind writable sret(%struct.S4) align 8 {{.*}}, i64 {{.*}})
+struct InnerConst {
+  const int& r;
+};
+
+struct S4 {
+  InnerConst i;
+};
+
+S4 func4(S4 x);
+S4 f4() {
+  S4 x{ { global_i } };
+  return func4(x);
+}
+
+// Pass and return an object with an explicitly deleted copy assignment operator (pass directly, return indirectly).
+// CHECK: define dso_local void @"?f5@@YA?AUS5@@XZ"(ptr dead_on_unwind noalias writable sret(%struct.S5) align 4 {{.*}})
+// CHECK: call void @"?func5@@YA?AUS5@@U1@@Z"(ptr dead_on_unwind writable sret(%struct.S5) align 4 {{.*}}, i32 {{.*}})
+struct S5 {
+  S5& operator=(const S5&) = delete;
+  int i;
+};
+
+S5 func5(S5 x);
+S5 f5() {
+  S5 x{ 1 };
+  return func5(x);
+}
+
+// Pass and return an object with an explicitly defaulted copy assignment operator that is implicitly deleted (pass directly, return indirectly).
+// CHECK: define dso_local void @"?f6@@YA?AUS6@@XZ"(ptr dead_on_unwind noalias writable sret(%struct.S6) align 8 {{.*}})
+// CHECK: call void @"?func6@@YA?AUS6@@U1@@Z"(ptr dead_on_unwind writable sret(%struct.S6) align 8 {{.*}}, i64 {{.*}})
+struct S6 {
+  S6& operator=(const S6&) = default;
+  int& i;
+};
+
+S6 func6(S6 x);
+S6 f6() {
+  S6 x{ global_i };
+  return func6(x);
+}
diff --git a/clang/test/CodeGenCXX/OmitRTTIComponentABI/simple-vtable-definition.cpp b/clang/test/CodeGenCXX/OmitRTTIComponentABI/simple-vtable-definition.cpp
index 99395ba0e05e..31eaf3f2f1ab 100644
--- a/clang/test/CodeGenCXX/OmitRTTIComponentABI/simple-vtable-definition.cpp
+++ b/clang/test/CodeGenCXX/OmitRTTIComponentABI/simple-vtable-definition.cpp
@@ -1,8 +1,8 @@
 /// Check that -fexperimental-omit-vtable-rtti omits the RTTI component from
 /// the vtable.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-linux-gnu -fno-rtti -fexperimental-omit-vtable-rtti -S -o - -emit-llvm | FileCheck -check-prefixes=POINTER,RTTI %s
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-linux-gnu -fexperimental-relative-c++-abi-vtables -fno-rtti -fexperimental-omit-vtable-rtti -S -o - -emit-llvm | FileCheck -check-prefixes=RELATIVE,RTTI %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-linux-gnu -fno-rtti -fexperimental-omit-vtable-rtti -o - -emit-llvm | FileCheck -check-prefixes=POINTER,RTTI %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-linux-gnu -fexperimental-relative-c++-abi-vtables -fno-rtti -fexperimental-omit-vtable-rtti -o - -emit-llvm | FileCheck -check-prefixes=RELATIVE,RTTI %s
 
 /// Normally, the vtable would contain at least three components:
 /// - An offset to top
diff --git a/clang/test/CodeGenCXX/OmitRTTIComponentABI/vbase-offset.cpp b/clang/test/CodeGenCXX/OmitRTTIComponentABI/vbase-offset.cpp
index d490cc2dbebe..db35b6c67b43 100644
--- a/clang/test/CodeGenCXX/OmitRTTIComponentABI/vbase-offset.cpp
+++ b/clang/test/CodeGenCXX/OmitRTTIComponentABI/vbase-offset.cpp
@@ -1,8 +1,8 @@
 /// Check that the offset to top calculation is adjusted to account for the
 /// omitted RTTI entry.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-linux-gnu -fexperimental-omit-vtable-rtti -fno-rtti -S -o - -emit-llvm | FileCheck -check-prefixes=POINTER %s
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-linux-gnu -fexperimental-relative-c++-abi-vtables -fexperimental-omit-vtable-rtti -fno-rtti -S -o - -emit-llvm | FileCheck -check-prefixes=RELATIVE %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-linux-gnu -fexperimental-omit-vtable-rtti -fno-rtti -o - -emit-llvm | FileCheck -check-prefixes=POINTER %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-linux-gnu -fexperimental-relative-c++-abi-vtables -fexperimental-omit-vtable-rtti -fno-rtti -o - -emit-llvm | FileCheck -check-prefixes=RELATIVE %s
 
 /// Some important things to check:
 /// - The n16 here represents the virtual thunk size. Normally this would be 24
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/available_externally-vtable.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/available_externally-vtable.cpp
index 883d79ddabed..db74f50b99a3 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/available_externally-vtable.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/available_externally-vtable.cpp
@@ -2,7 +2,7 @@
 // We check this specifically under the legacy pass manager because the new pass
 // manager seems to remove available_externally vtables from the IR entirely.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -disable-llvm-passes -S -o - -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -disable-llvm-passes -o - -emit-llvm | FileCheck %s
 
 // The VTable for A is available_externally, meaning it can have a definition in
 // IR, but is never emitted in this compilation unit. Because it won't be
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/child-inheritted-from-parent-in-comdat.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/child-inheritted-from-parent-in-comdat.cpp
index c3a7a33f1fec..62b09c85ce40 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/child-inheritted-from-parent-in-comdat.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/child-inheritted-from-parent-in-comdat.cpp
@@ -1,7 +1,7 @@
 // Cross comdat example
 // Parent VTable is in a comdat section.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -S -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
 
 // The inline function is emitted in each module with the same comdat
 // CHECK: $_ZTS1A = comdat any
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/child-vtable-in-comdat.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/child-vtable-in-comdat.cpp
index 950921f67509..2b6dde11f071 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/child-vtable-in-comdat.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/child-vtable-in-comdat.cpp
@@ -1,7 +1,7 @@
 // Cross comdat example
 // Child VTable is in a comdat section.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -S -o - -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm | FileCheck %s
 
 // A comdat is emitted for B but not A
 // CHECK-DAG: $_ZTV1B = comdat any
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/cross-translation-unit-1.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/cross-translation-unit-1.cpp
index 0d482c353b04..660897f75099 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/cross-translation-unit-1.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/cross-translation-unit-1.cpp
@@ -1,7 +1,7 @@
 // Check the vtable layout for classes with key functions defined in different
 // translation units. This TU only manifests the vtable for A.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -S -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
 
 #include "cross-tu-header.h"
 
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/cross-translation-unit-2.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/cross-translation-unit-2.cpp
index e09ba014bcd4..0c3cf9464606 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/cross-translation-unit-2.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/cross-translation-unit-2.cpp
@@ -1,7 +1,7 @@
 // Check the vtable layout for classes with key functions defined in different
 // translation units. This TU manifests the vtable for B.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -S -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
 
 #include "cross-tu-header.h"
 
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/diamond-inheritance.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/diamond-inheritance.cpp
index cd3ce22fc8e9..96bbd197ac1c 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/diamond-inheritance.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/diamond-inheritance.cpp
@@ -1,7 +1,7 @@
 // Diamond inheritance.
 // A more complicated multiple inheritance example that includes longer chain of inheritance and a common ancestor.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -S -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
 
 // VTable for B should contain offset to top (0), RTTI pointer, A::foo(), and B::barB().
 // CHECK: @_ZTV1B.local = private unnamed_addr constant { [4 x i32] } { [4 x i32] [i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1B.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32] }, ptr @_ZTV1B.local, i32 0, i32 0, i32 2) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1A3fooEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32] }, ptr @_ZTV1B.local, i32 0, i32 0, i32 2) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1B4barBEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32] }, ptr @_ZTV1B.local, i32 0, i32 0, i32 2) to i64)) to i32)] }, align 4
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/diamond-virtual-inheritance.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/diamond-virtual-inheritance.cpp
index f03bb747b2a7..75a3e21e81ff 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/diamond-virtual-inheritance.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/diamond-virtual-inheritance.cpp
@@ -1,7 +1,7 @@
 // Diamond virtual inheritance.
 // This should cover virtual inheritance, construction vtables, and VTTs.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -S -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
 
 // Class A contains a vtable ptr, then int, then padding
 
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp
index 4fef80f051ac..83daf57be22f 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp
@@ -1,7 +1,7 @@
 // dynamic_cast
 // Ensure that dynamic casting works normally
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O3 -S -o - -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O3 -o - -emit-llvm | FileCheck %s
 
 // CHECK:      define{{.*}} ptr @_Z6upcastP1B(ptr noundef readnone returned %b) local_unnamed_addr
 // CHECK-NEXT: entry:
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/inheritted-virtual-function.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/inheritted-virtual-function.cpp
index 877d496b09ac..4cd657ce2453 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/inheritted-virtual-function.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/inheritted-virtual-function.cpp
@@ -1,7 +1,7 @@
 // Check the layout of the vtable for a child class that inherits a virtual
 // function but does not override it.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -S -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
 
 class A {
 public:
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/inline-virtual-function.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/inline-virtual-function.cpp
index e1af2f36d6df..332ba8dd03e1 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/inline-virtual-function.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/inline-virtual-function.cpp
@@ -1,8 +1,8 @@
 // The VTable is not in a comdat but the inline methods are.
 // This doesn’t affect the vtable or the stubs we emit.
 
-// RUN: %clang_cc1 %s -triple=aarch64 -O1 -S -o - -emit-llvm -fexperimental-relative-c++-abi-vtables -fhalf-no-semantic-interposition | FileCheck %s
-// RUN: %clang_cc1 %s -triple=x86_64 -O1 -S -o - -emit-llvm -fexperimental-relative-c++-abi-vtables -fhalf-no-semantic-interposition | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64 -O1 -o - -emit-llvm -fexperimental-relative-c++-abi-vtables -fhalf-no-semantic-interposition | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64 -O1 -o - -emit-llvm -fexperimental-relative-c++-abi-vtables -fhalf-no-semantic-interposition | FileCheck %s
 
 // CHECK: $_ZTI1A.rtti_proxy = comdat any
 
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/inlined-key-function.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/inlined-key-function.cpp
index c542290de5f7..d5d9a85d4e22 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/inlined-key-function.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/inlined-key-function.cpp
@@ -1,7 +1,7 @@
 // Inline comdat method definition example.
 // The VTable is in a comdat and defined anywhere the inline definition is.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -S -o - -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm | FileCheck %s
 
 // CHECK: $_ZTV1A = comdat any
 // CHECK: $_ZTS1A = comdat any
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/member-function-pointer.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/member-function-pointer.cpp
index 000568b3b6bf..cf0f7392f5b0 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/member-function-pointer.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/member-function-pointer.cpp
@@ -1,6 +1,6 @@
 // Member pointer to virtual function.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O3 -S -o - -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O3 -o - -emit-llvm | FileCheck %s
 
 // CHECK:      define{{.*}} void @_Z4funcP1AMS_FvvE(ptr noundef %a, [2 x i64] %fn.coerce) local_unnamed_addr
 // CHECK-NEXT: entry:
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/multiple-inheritance.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/multiple-inheritance.cpp
index 632cf34a49c1..0b90000f9add 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/multiple-inheritance.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/multiple-inheritance.cpp
@@ -1,6 +1,6 @@
 // Multiple inheritance.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -S -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
 
 // VTable for C contains 2 sub-vtables (represented as 2 structs). The first contains the components for B and the second contains the components for C. The RTTI ptr in both arrays still point to the RTTI struct for C.
 // The component for bar() instead points to a thunk which redirects to C::bar() which overrides B::bar().
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/no-alias-when-dso-local.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/no-alias-when-dso-local.cpp
index b2013b34948e..721351393ffd 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/no-alias-when-dso-local.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/no-alias-when-dso-local.cpp
@@ -1,8 +1,8 @@
 // Check that no alias is emitted when the vtable is already dso_local. This can
 // happen if the class is hidden.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -S -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s --check-prefix=DEFAULT-VIS
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -S -o - -emit-llvm -fvisibility=hidden | FileCheck %s --check-prefix=HIDDEN-VIS
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s --check-prefix=DEFAULT-VIS
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -o - -emit-llvm -fvisibility=hidden | FileCheck %s --check-prefix=HIDDEN-VIS
 
 // DEFAULT-VIS: @_ZTV1A.local = private unnamed_addr constant
 // DEFAULT-VIS: @_ZTV1A ={{.*}} unnamed_addr alias { [3 x i32] }, ptr @_ZTV1A.local
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/override-pure-virtual-method.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/override-pure-virtual-method.cpp
index 2ddcf33d1419..6fed01e08bfd 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/override-pure-virtual-method.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/override-pure-virtual-method.cpp
@@ -2,7 +2,7 @@
 // We instead emit zero for the pure virtual function component. See PR43094 for
 // details.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -S -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
 
 // CHECK: @_ZTV1A.local = private unnamed_addr constant { [4 x i32] } { [4 x i32] [i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1A.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32), i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1A3barEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32)] }, align 4
 
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/overriden-virtual-function.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/overriden-virtual-function.cpp
index fe8cd7e63847..e53135abb6e6 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/overriden-virtual-function.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/overriden-virtual-function.cpp
@@ -1,7 +1,7 @@
 // Check the layout of the vtable for a child class that inherits a virtual
 // function but does override it.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -S -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
 
 // CHECK: @_ZTV1B.local = private unnamed_addr constant { [4 x i32] } { [4 x i32] [i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1B.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32] }, ptr @_ZTV1B.local, i32 0, i32 0, i32 2) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1B3fooEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32] }, ptr @_ZTV1B.local, i32 0, i32 0, i32 2) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1B3barEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32] }, ptr @_ZTV1B.local, i32 0, i32 0, i32 2) to i64)) to i32)] }, align 4
 
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/parent-and-child-in-comdats.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/parent-and-child-in-comdats.cpp
index 995510f92dc9..a033ac41868f 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/parent-and-child-in-comdats.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/parent-and-child-in-comdats.cpp
@@ -1,8 +1,8 @@
 // Cross comdat example
 // Both the parent and child VTablea are in their own comdat sections.
 
-// RUN: %clang_cc1 %s -triple=aarch64 -S -o - -emit-llvm -fexperimental-relative-c++-abi-vtables | FileCheck %s
-// RUN: %clang_cc1 %s -triple=x86_64 -S -o - -emit-llvm -fexperimental-relative-c++-abi-vtables | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64 -o - -emit-llvm -fexperimental-relative-c++-abi-vtables | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64 -o - -emit-llvm -fexperimental-relative-c++-abi-vtables | FileCheck %s
 
 // Comdats are emitted for both A and B in this module and for their respective implementations of foo().
 // CHECK: $_ZN1A3fooEv = comdat any
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp
index ee710100152b..341c53146d47 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp
@@ -1,8 +1,8 @@
 // Cross comdat example
 // Parent VTable is in a comdat section.
 
-// RUN: %clang_cc1 %s -triple=aarch64 -S -o - -emit-llvm -fexperimental-relative-c++-abi-vtables | FileCheck %s
-// RUN: %clang_cc1 %s -triple=x86_64 -S -o - -emit-llvm -fexperimental-relative-c++-abi-vtables | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64 -o - -emit-llvm -fexperimental-relative-c++-abi-vtables | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64 -o - -emit-llvm -fexperimental-relative-c++-abi-vtables | FileCheck %s
 
 // A::foo() has a comdat since it is an inline function
 // CHECK: $_ZN1A3fooEv = comdat any
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/relative-vtables-flag.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/relative-vtables-flag.cpp
index 3dc2b740af0a..a7054cb48e9d 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/relative-vtables-flag.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/relative-vtables-flag.cpp
@@ -3,8 +3,8 @@
 // of a soft incremental rollout. This ABI should only be used if the flag for
 // it is passed on Fuchsia.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -S -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck --check-prefix=RELATIVE-ABI %s
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -S -o - -emit-llvm -fno-experimental-relative-c++-abi-vtables | FileCheck --check-prefix=DEFAULT-ABI %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck --check-prefix=RELATIVE-ABI %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -o - -emit-llvm -fno-experimental-relative-c++-abi-vtables | FileCheck --check-prefix=DEFAULT-ABI %s
 
 // VTable contains offsets and references to the hidden symbols
 // RELATIVE-ABI: @_ZTV1A.local = private unnamed_addr constant { [3 x i32] } { [3 x i32] [i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1A.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1A3fooEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32)] }, align 4
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/relative-vtables-hwasan.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/relative-vtables-hwasan.cpp
index 7657a3bd0efd..6b459a16402b 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/relative-vtables-hwasan.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/relative-vtables-hwasan.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -S -o - -emit-llvm -fsanitize=hwaddress | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -o - -emit-llvm -fsanitize=hwaddress | FileCheck %s
 
 /// The usual vtable will have default visibility. In this case, the actual
 /// vtable is hidden and the alias is made public. With hwasan enabled, we want
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp
index 5ed4745b9a06..0e88015ab6f3 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp
@@ -1,6 +1,6 @@
 // Check the layout of the vtable for a normal class.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -S -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s
 
 // We should be emitting comdats for each of the virtual function RTTI proxies
 // CHECK: $_ZTI1A.rtti_proxy = comdat any
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/thunk-mangling.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/thunk-mangling.cpp
index 33f3b98faa92..f8ff8189badd 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/thunk-mangling.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/thunk-mangling.cpp
@@ -5,7 +5,7 @@
 // Running that linked binary still won't work since we're using conflicting
 // ABIs, but we should still be able to link.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -S -o - -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm | FileCheck %s
 
 // This would be normally n24 (3 ptr widths) but is 12 since the vtable is
 // entierely made of i32s now.
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp
index 83c8956f9ff1..c6ccae587bbf 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp
@@ -1,6 +1,6 @@
 // Check typeid() + type_info
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O3 -S -o - -emit-llvm -fcxx-exceptions -fexceptions | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O3 -o - -emit-llvm -fcxx-exceptions -fexceptions | FileCheck %s
 
 // CHECK: $_ZTI1A.rtti_proxy = comdat any
 // CHECK: $_ZTI1B.rtti_proxy = comdat any
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/vbase-offset.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/vbase-offset.cpp
index 7c5f2db72d4e..bf9b1da99fdd 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/vbase-offset.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/vbase-offset.cpp
@@ -1,7 +1,7 @@
 // Check that the pointer adjustment from the virtual base offset is loaded as a
 // 32-bit int.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -S -o - -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -o - -emit-llvm | FileCheck %s
 
 // CHECK-LABEL: @_ZTv0_n12_N7Derived1fEi(
 // CHECK-NEXT:  entry:
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/virtual-function-call.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/virtual-function-call.cpp
index abb11d7915ec..a0d40355e340 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/virtual-function-call.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/virtual-function-call.cpp
@@ -1,6 +1,6 @@
 // Check that we call llvm.load.relative() on a vtable function call.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O3 -S -o - -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O3 -o - -emit-llvm | FileCheck %s
 
 // CHECK:      define{{.*}} void @_Z5A_fooP1A(ptr noundef %a) local_unnamed_addr
 // CHECK-NEXT: entry:
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/vtable-hidden-when-in-comdat.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/vtable-hidden-when-in-comdat.cpp
index 1852b0c151d5..0bcbc55ca0de 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/vtable-hidden-when-in-comdat.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/vtable-hidden-when-in-comdat.cpp
@@ -2,7 +2,7 @@
 // is not dso_local. The vtable will need to be hidden and not private so it can
 // be used as acomdat key signature.
 
-// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -S -o - -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -o - -emit-llvm | FileCheck %s
 
 // CHECK: @_ZTV1B.local = linkonce_odr hidden unnamed_addr constant
 // CHECK: @_ZTV1B = linkonce_odr unnamed_addr alias { [3 x i32] }, ptr @_ZTV1B.local
diff --git a/clang/test/CodeGenCXX/aix-static-init-temp-spec-and-inline-var.cpp b/clang/test/CodeGenCXX/aix-static-init-temp-spec-and-inline-var.cpp
index 8c257e5c31c7..5582ede0ebf5 100644
--- a/clang/test/CodeGenCXX/aix-static-init-temp-spec-and-inline-var.cpp
+++ b/clang/test/CodeGenCXX/aix-static-init-temp-spec-and-inline-var.cpp
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -S -emit-llvm -x c++ \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -emit-llvm -x c++ \
 // RUN:     -std=c++2a < %s | \
 // RUN:   FileCheck --check-prefixes=CHECK,CHECK32 %s
 
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -S -emit-llvm -x c++ \
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -emit-llvm -x c++ \
 // RUN:     -std=c++2a < %s | \
 // RUN:   FileCheck --check-prefixes=CHECK,CHECK64 %s
 
diff --git a/clang/test/CodeGenCXX/aix-static-init.cpp b/clang/test/CodeGenCXX/aix-static-init.cpp
index 1a42fd1af212..4711360879dd 100644
--- a/clang/test/CodeGenCXX/aix-static-init.cpp
+++ b/clang/test/CodeGenCXX/aix-static-init.cpp
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -S -emit-llvm -x c++ \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -emit-llvm -x c++ \
 // RUN:     -std=c++2a < %s | \
 // RUN:   FileCheck --check-prefixes=CHECK,CHECK32 %s
 
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -S -emit-llvm -x c++ \
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -emit-llvm -x c++ \
 // RUN:     -std=c++2a < %s | \
 // RUN:   FileCheck --check-prefixes=CHECK,CHECK64 %s
 
diff --git a/clang/test/CodeGenCXX/annotate-type.cpp b/clang/test/CodeGenCXX/annotate-type.cpp
index 456888b1ecd9..a4985c73c95b 100644
--- a/clang/test/CodeGenCXX/annotate-type.cpp
+++ b/clang/test/CodeGenCXX/annotate-type.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm-only %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple-only %s -emit-llvm -o - | FileCheck %s
 
 // Test that `annotate_type` does not affect mangled names.
 
diff --git a/clang/test/CodeGenCXX/attr-annotate-destructor.cpp b/clang/test/CodeGenCXX/attr-annotate-destructor.cpp
index dbe686b04861..4e5a2190a458 100644
--- a/clang/test/CodeGenCXX/attr-annotate-destructor.cpp
+++ b/clang/test/CodeGenCXX/attr-annotate-destructor.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -S -emit-llvm -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
 
 // Test annotation attributes on destructors do not crash.
 
diff --git a/clang/test/CodeGenCXX/attr-annotate.cpp b/clang/test/CodeGenCXX/attr-annotate.cpp
index fd08f208fc9b..64627a6b83e1 100644
--- a/clang/test/CodeGenCXX/attr-annotate.cpp
+++ b/clang/test/CodeGenCXX/attr-annotate.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -S -emit-llvm -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
 
 //CHECK: @[[STR1:.*]] = private unnamed_addr constant [{{.*}} x i8] c"{{.*}}attr-annotate.cpp\00", section "llvm.metadata"
 //CHECK: @[[STR2:.*]] = private unnamed_addr constant [4 x i8] c"abc\00", align 1
diff --git a/clang/test/CodeGenCXX/attr-annotate2.cpp b/clang/test/CodeGenCXX/attr-annotate2.cpp
index dca6c009d107..1c9c39c9efb8 100644
--- a/clang/test/CodeGenCXX/attr-annotate2.cpp
+++ b/clang/test/CodeGenCXX/attr-annotate2.cpp
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 %s -S -emit-llvm -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
 
 // CHECK: @[[STR:.*]] = private unnamed_addr constant [45 x i8] c"_Generic selection expression should be fine\00", section "llvm.metadata"
 // CHECK-NEXT: @[[FILENAME:.*]] = private unnamed_addr constant {{.*}}, section "llvm.metadata"
diff --git a/clang/test/CodeGenCXX/attr-mustprogress.cpp b/clang/test/CodeGenCXX/attr-mustprogress.cpp
index 843f5460426c..43a8164d9391 100644
--- a/clang/test/CodeGenCXX/attr-mustprogress.cpp
+++ b/clang/test/CodeGenCXX/attr-mustprogress.cpp
@@ -1,22 +1,22 @@
-// RUN: %clang_cc1 -std=c++98 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX98 %s
-// RUN: %clang_cc1 -std=c++11 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
-// RUN: %clang_cc1 -std=c++14 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
-// RUN: %clang_cc1 -std=c++17 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
-// RUN: %clang_cc1 -std=c++20 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
+// RUN: %clang_cc1 -std=c++98 -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX98 %s
+// RUN: %clang_cc1 -std=c++11 -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
+// RUN: %clang_cc1 -std=c++14 -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
+// RUN: %clang_cc1 -std=c++17 -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
+// RUN: %clang_cc1 -std=c++20 -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
 
 // Check -ffinite-loops option in combination with various standard versions.
-// RUN: %clang_cc1 -std=c++98 -ffinite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=FINITE %s
-// RUN: %clang_cc1 -std=c++11 -ffinite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
-// RUN: %clang_cc1 -std=c++14 -ffinite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
-// RUN: %clang_cc1 -std=c++17 -ffinite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
-// RUN: %clang_cc1 -std=c++20 -ffinite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
+// RUN: %clang_cc1 -std=c++98 -ffinite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=FINITE %s
+// RUN: %clang_cc1 -std=c++11 -ffinite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
+// RUN: %clang_cc1 -std=c++14 -ffinite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
+// RUN: %clang_cc1 -std=c++17 -ffinite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
+// RUN: %clang_cc1 -std=c++20 -ffinite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
 
 // Check -fno-finite-loops option in combination with various standard versions.
-// RUN: %clang_cc1 -std=c++98 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX98 %s
-// RUN: %clang_cc1 -std=c++11 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX98 %s
-// RUN: %clang_cc1 -std=c++14 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX98 %s
-// RUN: %clang_cc1 -std=c++17 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX98 %s
-// RUN: %clang_cc1 -std=c++20 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX98 %s
+// RUN: %clang_cc1 -std=c++98 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX98 %s
+// RUN: %clang_cc1 -std=c++11 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX98 %s
+// RUN: %clang_cc1 -std=c++14 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX98 %s
+// RUN: %clang_cc1 -std=c++17 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX98 %s
+// RUN: %clang_cc1 -std=c++20 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX98 %s
 
 int a = 0;
 int b = 0;
@@ -24,21 +24,21 @@ int b = 0;
 // CHECK: datalayout
 
 // CXX98-NOT:  mustprogress
-// CXX11:      mustprogress
+// CXX11-NOT:  mustprogress
 // FINITE-NOT: mustprogress
 // CHECK-LABEL: @_Z2f0v(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    br label %for.cond
 // CHECK:       for.cond:
 // CXX98-NOT:    br {{.*}} llvm.loop
-// CXX11-NEXT:   br label %for.cond, !llvm.loop [[LOOP1:!.*]]
-// FINITE-NEXT:  br label %for.cond, !llvm.loop [[LOOP1:!.*]]
+// CXX11-NOT:    br {{.*}} llvm.loop
+// FINITE-NOT:   br {{.*}} llvm.loop
 void f0() {
   for (; ;) ;
 }
 
 // CXX98-NOT:  mustprogress
-// CXX11:      mustprogress
+// CXX11-NOT:  mustprogress
 // FINITE-NOT: mustprogress
 // CHECK-LABEL: @_Z2f1v(
 // CHECK-NEXT:  entry:
@@ -46,9 +46,9 @@ void f0() {
 // CHECK:       for.cond:
 // CHECK-NEXT:    br i1 true, label %for.body, label %for.end
 // CHECK:       for.body:
-// CXX98-NOT:     br {{.*}}, !llvm.loop
-// CXX11-NEXT:    br label %for.cond, !llvm.loop [[LOOP2:!.*]]
-// FINITE-NEXT:  br label %for.cond, !llvm.loop [[LOOP2:!.*]]
+// CXX98-NOT:    br {{.*}}, !llvm.loop
+// CXX11-NOT:    br {{.*}} llvm.loop
+// FINITE-NOT:   br {{.*}} llvm.loop
 // CHECK:       for.end:
 // CHECK-NEXT:    ret void
 //
@@ -81,7 +81,7 @@ void f2() {
 }
 
 // CXX98-NOT:  mustprogress
-// CXX11:      mustprogress
+// CXX11-NOT:  mustprogress
 // FINITE-NOT: mustprogress
 // CHECK-LABEL: @_Z1Fv(
 // CHECK-NEXT:  entry:
@@ -90,8 +90,8 @@ void f2() {
 // CHECK-NEXT:    br i1 true, label %for.body, label %for.end
 // CHECK:       for.body:
 // CXX98-NOT:     br {{.*}}, !llvm.loop
-// CXX11-NEXT:    br label %for.cond, !llvm.loop [[LOOP4:!.*]]
-// FINITE-NEXT:   br label %for.cond, !llvm.loop [[LOOP4:!.*]]
+// CXX11-NOT:     br {{.*}}, !llvm.loop
+// FINITE-NOT:    br {{.*}}, !llvm.loop
 // CHECK:       for.end:
 // CHECK-NEXT:    br label %for.cond1
 // CHECK:       for.cond1:
@@ -114,7 +114,7 @@ void F() {
 }
 
 // CXX98-NOT:  mustprogress
-// CXX11:      mustprogress
+// CXX11-NOT:  mustprogress
 // FINITE-NOT: mustprogress
 // CHECK-LABEL: @_Z2F2v(
 // CHECK-NEXT:  entry:
@@ -134,8 +134,8 @@ void F() {
 // CHECK-NEXT:    br i1 true, label %for.body2, label %for.end3
 // CHECK:       for.body2:
 // CXX98-NOT:     br {{.*}}, !llvm.loop
-// CXX11-NEXT:    br label %for.cond1, !llvm.loop [[LOOP7:!.*]]
-// FINITE-NEXT:   br label %for.cond1, !llvm.loop [[LOOP7:!.*]]
+// CXX11-NOT:     br {{.*}}, !llvm.loop
+// FINITE-NOT:    br {{.*}}, !llvm.loop
 // CHECK:       for.end3:
 // CHECK-NEXT:    ret void
 //
@@ -147,15 +147,15 @@ void F2() {
 }
 
 // CXX98-NOT:  mustprogress
-// CXX11:      mustprogress
+// CXX11-NOT:  mustprogress
 // FINITE-NOT: mustprogress
 // CHECK-LABEL: @_Z2w1v(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    br label %while.body
 // CHECK:       while.body:
 // CXX98-NOT:     br {{.*}}, !llvm.loop
-// CXX11-NEXT:    br label %while.body, !llvm.loop [[LOOP8:!.*]]
-// FINITE-NEXT:   br label %while.body, !llvm.loop [[LOOP8:!.*]]
+// CXX11-NOT:     br {{.*}}, !llvm.loop
+// FINITE-NOT:    br {{.*}}, !llvm.loop
 //
 void w1() {
   while (1)
@@ -186,7 +186,7 @@ void w2() {
 }
 
 // CXX98-NOT:  mustprogress
-// CXX11:      mustprogress
+// CXX11-NOT:  mustprogress
 // FINITE-NOT: mustprogress
 // CHECK-LABEL: @_Z1Wv(
 // CHECK-NEXT:  entry:
@@ -204,8 +204,8 @@ void w2() {
 // CHECK-NEXT:    br label %while.body2
 // CHECK:       while.body2:
 // CXX98-NOT:    br {{.*}}, !llvm.loop
-// CXX11-NEXT:   br label %while.body2, !llvm.loop [[LOOP11:!.*]]
-// FINITE-NEXT:  br label %while.body2, !llvm.loop [[LOOP11:!.*]]
+// CXX11-NOT:    br {{.*}}, !llvm.loop
+// FINITE-NOT:   br {{.*}}, !llvm.loop
 //
 void W() {
   while (a == b)
@@ -215,15 +215,15 @@ void W() {
 }
 
 // CXX98-NOT:  mustprogress
-// CXX11:      mustprogress
+// CXX11-NOT:  mustprogress
 // FINITE-NOT: mustprogress
 // CHECK-LABEL: @_Z2W2v(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    br label %while.body
 // CHECK:       while.body:
 // CXX98-NOT:     br {{.*}}, !llvm.loop
-// CXX11-NEXT:    br label %while.body, !llvm.loop [[LOOP12:!.*]]
-// FINITE-NEXT:   br label %while.body, !llvm.loop [[LOOP12:!.*]]
+// CXX11-NOT:     br {{.*}}, !llvm.loop
+// FINITE-NOT:    br {{.*}}, !llvm.loop
 //
 void W2() {
   while (1)
@@ -233,7 +233,7 @@ void W2() {
 }
 
 // CXX98-NOT:  mustprogress
-// CXX11:      mustprogress
+// CXX11-NOT:  mustprogress
 // FINITE-NOT: mustprogress
 // CHECK-LABEL: @_Z2d1v(
 // CHECK-NEXT:  entry:
@@ -242,8 +242,8 @@ void W2() {
 // CHECK-NEXT:    br label %do.cond
 // CHECK:       do.cond:
 // CXX98-NOT:     br {{.*}}, !llvm.loop
-// CXX11-NEXT:    br i1 true, label %do.body, label %do.end, !llvm.loop [[LOOP13:!.*]]
-// FINITE-NEXT:   br i1 true, label %do.body, label %do.end, !llvm.loop [[LOOP13:!.*]]
+// CXX11-NOT:     br {{.*}}, !llvm.loop
+// FINITE-NOT:    br {{.*}}, !llvm.loop
 // CHECK:       do.end:
 // CHECK-NEXT:    ret void
 //
@@ -278,7 +278,7 @@ void d2() {
 }
 
 // CXX98-NOT:  mustprogress
-// CXX11:      mustprogress
+// CXX11-NOT:  mustprogress
 // FINITE-NOT: mustprogress
 // CHECK-LABEL: @_Z1Dv(
 // CHECK-NEXT:  entry:
@@ -287,8 +287,8 @@ void d2() {
 // CHECK-NEXT:    br label %do.cond
 // CHECK:       do.cond:
 // CXX98-NOT:     br {{.*}}, !llvm.loop
-// CXX11-NEXT:    br i1 true, label %do.body, label %do.end, !llvm.loop [[LOOP15:!.*]]
-// FINITE-NEXT:   br i1 true, label %do.body, label %do.end, !llvm.loop [[LOOP15:!.*]]
+// CXX11-NOT:     br {{.*}}, !llvm.loop
+// FINITE-NOT:    br {{.*}}, !llvm.loop
 // CHECK:       do.end:
 // CHECK-NEXT:    br label %do.body1
 // CHECK:       do.body1:
@@ -312,8 +312,8 @@ void D() {
   while (a == b);
 }
 
-// CXX98-NOT : mustprogress
-// CXX11:      mustprogress
+// CXX98-NOT:  mustprogress
+// CXX11-NOT:  mustprogress
 // FINITE-NOT: mustprogress
 // CHECK-LABEL: @_Z2D2v(
 // CHECK-NEXT:  entry:
@@ -333,8 +333,8 @@ void D() {
 // CHECK-NEXT:    br label %do.cond2
 // CHECK:       do.cond2:
 // CXX98-NOT:     br {{.*}}, !llvm.loop
-// CXX11-NEXT:    br i1 true, label %do.body1, label %do.end3, !llvm.loop [[LOOP18:!.*]]
-// FINITE-NEXT:   br i1 true, label %do.body1, label %do.end3, !llvm.loop [[LOOP18:!.*]]
+// CXX11-NOT:     br {{.*}}, !llvm.loop
+// FINITE-NOT:    br {{.*}}, !llvm.loop
 // CHECK:       do.end3:
 // CHECK-NEXT:    ret void
 //
@@ -347,22 +347,75 @@ void D2() {
   while (1);
 }
 
-// CXX11: [[LOOP1]] = distinct !{[[LOOP1]], [[MP:!.*]]}
+// CXX98-NOT:  mustprogress
+// CXX11-NOT:  mustprogress
+// FINITE-NOT: mustprogress
+// CHECK-LABEL: @_Z9compound0v(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %for.cond
+// CHECK:       for.cond:
+// CXX98-NOT:    br {{.*}}, !llvm.loop
+// CXX11-NOT:    br {{.*}}, !llvm.loop
+// FINITE-NOT:   br {{.*}}, !llvm.loop
+void compound0() {
+  for (; ;) {}
+}
+
+// CXX98-NOT:  mustprogress
+// CXX11-NOT:  mustprogress
+// FINITE-NOT: mustprogress
+// CHECK-LABEL: @_Z9compound1v(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %for.cond
+// CHECK:       for.cond:
+// CXX98-NOT:    br {{.*}}, llvm.loop
+// CXX11-NOT:    br {{.*}}, llvm.loop
+// FINITE-NOT:   br {{.*}}, !llvm.loop
+void compound1() {
+  for (; ;) {/*! */}
+}
+
+// CXX98-NOT:  mustprogress
+// CXX11-NOT:  mustprogress
+// FINITE-NOT: mustprogress
+// CHECK-LABEL: @_Z9compound2v(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %do.body
+// CHECK:       do.body:
+// CHECK-NEXT:    br label %do.cond
+// CHECK:       do.cond:
+// CXX98-NOT:    br {{.*}}, !llvm.loop
+// CXX11-NOT:    br {{.*}}, !llvm.loop
+// FINITE-NOT:   br {{.*}}, !llvm.loop
+// CHECK:       do.end:
+// CHECK-NEXT:    ret void
+//
+void compound2() {
+  do {} while (1+1);
+}
+
+// CXX98-NOT:  mustprogress
+// CXX11    :  mustprogress
+// FINITE   :  mustprogress
+// CHECK-LABEL: @_Z5Falsev(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %do.body
+// CHECK:       do.body:
+// CHECK-NEXT:    br label %do.end
+// CHECK:       do.end:
+// CHECK-NEXT:    ret void
+//
+void False() {
+  do {} while (1-1);
+}
+
+
+// CXX11: [[LOOP3]] = distinct !{[[LOOP3]], [[MP:.*]]}
 // CXX11: [[MP]] = !{!"llvm.loop.mustprogress"}
-// CXX11: [[LOOP2]] = distinct !{[[LOOP2]], [[MP]]}
-// CXX11: [[LOOP3]] = distinct !{[[LOOP3]], [[MP]]}
-// CXX11: [[LOOP4]] = distinct !{[[LOOP4]], [[MP]]}
 // CXX11: [[LOOP5]] = distinct !{[[LOOP5]], [[MP]]}
 // CXX11: [[LOOP6]] = distinct !{[[LOOP6]], [[MP]]}
-// CXX11: [[LOOP7]] = distinct !{[[LOOP7]], [[MP]]}
-// CXX11: [[LOOP8]] = distinct !{[[LOOP8]], [[MP]]}
 // CXX11: [[LOOP9]] = distinct !{[[LOOP9]], [[MP]]}
 // CXX11: [[LOOP10]] = distinct !{[[LOOP10]], [[MP]]}
-// CXX11: [[LOOP11]] = distinct !{[[LOOP11]], [[MP]]}
-// CXX11: [[LOOP12]] = distinct !{[[LOOP12]], [[MP]]}
-// CXX11: [[LOOP13]] = distinct !{[[LOOP13]], [[MP]]}
 // CXX11: [[LOOP14]] = distinct !{[[LOOP14]], [[MP]]}
-// CXX11: [[LOOP15]] = distinct !{[[LOOP15]], [[MP]]}
 // CXX11: [[LOOP16]] = distinct !{[[LOOP16]], [[MP]]}
 // CXX11: [[LOOP17]] = distinct !{[[LOOP17]], [[MP]]}
-// CXX11: [[LOOP18]] = distinct !{[[LOOP18]], [[MP]]}
diff --git a/clang/test/CodeGenCXX/attr-musttail.cpp b/clang/test/CodeGenCXX/attr-musttail.cpp
index 720e50c5a240..c0081ec232e4 100644
--- a/clang/test/CodeGenCXX/attr-musttail.cpp
+++ b/clang/test/CodeGenCXX/attr-musttail.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -fno-elide-constructors -S -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
-// RUN: %clang_cc1 -fno-elide-constructors -S -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - | opt -passes=verify
+// RUN: %clang_cc1 -fno-elide-constructors -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s
+// RUN: %clang_cc1 -fno-elide-constructors -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - | opt -passes=verify
 // FIXME: remove the call to "opt" once the tests are running the Clang verifier automatically again.
 
 int Bar(int);
diff --git a/clang/test/CodeGenCXX/attr-x86-no_caller_saved_registers.cpp b/clang/test/CodeGenCXX/attr-x86-no_caller_saved_registers.cpp
index f90f9361fb43..68fc10305218 100644
--- a/clang/test/CodeGenCXX/attr-x86-no_caller_saved_registers.cpp
+++ b/clang/test/CodeGenCXX/attr-x86-no_caller_saved_registers.cpp
@@ -1,31 +1,31 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-pc-win32 %s -emit-llvm -o - | FileCheck %s
-
-// CHECK: foo{{[^#]*}}#[[ATTRS:[0-9]+]]
-__attribute__((no_caller_saved_registers)) void foo() {}
-namespace S {
-// CHECK: bar{{[^#]*}}#[[ATTRS]]
-__attribute__((no_caller_saved_registers)) void bar(int *a) { foo(); }
-}
-
-struct St {
-  static void baz(int *a) __attribute__((no_caller_saved_registers)) { S::bar(a); }
-};
-
-__attribute((no_caller_saved_registers)) void (*foobar)(void);
-
-// CHECK-LABEL: @main
-int main(int argc, char **argv) {
-  St::baz(&argc);
-  // CHECK: [[FOOBAR:%.+]] = load ptr, ptr @{{.*}}foobar{{.*}},
-  // CHECK-NEXT: call void [[FOOBAR]]() #[[ATTRS1:.+]]
-  foobar();
-  return 0;
-}
-
-// CHECK: baz{{[^#]*}}#[[ATTRS]]
-
-// CHECK: attributes #[[ATTRS]] = {
-// CHECK-SAME: "no_caller_saved_registers"
-// CHECK-SAME: }
-// CHECK: attributes #[[ATTRS1]] = { "no_caller_saved_registers" }
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-win32 %s -emit-llvm -o - | FileCheck %s
+
+// CHECK: foo{{[^#]*}}#[[ATTRS:[0-9]+]]
+__attribute__((no_caller_saved_registers)) void foo() {}
+namespace S {
+// CHECK: bar{{[^#]*}}#[[ATTRS]]
+__attribute__((no_caller_saved_registers)) void bar(int *a) { foo(); }
+}
+
+struct St {
+  static void baz(int *a) __attribute__((no_caller_saved_registers)) { S::bar(a); }
+};
+
+__attribute((no_caller_saved_registers)) void (*foobar)(void);
+
+// CHECK-LABEL: @main
+int main(int argc, char **argv) {
+  St::baz(&argc);
+  // CHECK: [[FOOBAR:%.+]] = load ptr, ptr @{{.*}}foobar{{.*}},
+  // CHECK-NEXT: call void [[FOOBAR]]() #[[ATTRS1:.+]]
+  foobar();
+  return 0;
+}
+
+// CHECK: baz{{[^#]*}}#[[ATTRS]]
+
+// CHECK: attributes #[[ATTRS]] = {
+// CHECK-SAME: "no_caller_saved_registers"
+// CHECK-SAME: }
+// CHECK: attributes #[[ATTRS1]] = { "no_caller_saved_registers" }
diff --git a/clang/test/CodeGenCXX/builtin-bit-cast-no-tbaa.cpp b/clang/test/CodeGenCXX/builtin-bit-cast-no-tbaa.cpp
index 41b36d29a28d..b4425a8556b7 100644
--- a/clang/test/CodeGenCXX/builtin-bit-cast-no-tbaa.cpp
+++ b/clang/test/CodeGenCXX/builtin-bit-cast-no-tbaa.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -O3 -std=c++2a -S -emit-llvm -o - -disable-llvm-passes -triple x86_64-apple-macos10.14 %s | FileCheck %s
+// RUN: %clang_cc1 -O3 -std=c++2a -emit-llvm -o - -disable-llvm-passes -triple x86_64-apple-macos10.14 %s | FileCheck %s
 
 void test_scalar() {
   // CHECK-LABEL: define{{.*}} void @_Z11test_scalarv
diff --git a/clang/test/CodeGenCXX/builtin-bit-cast.cpp b/clang/test/CodeGenCXX/builtin-bit-cast.cpp
index 5f2f48b8fff6..637590341c98 100644
--- a/clang/test/CodeGenCXX/builtin-bit-cast.cpp
+++ b/clang/test/CodeGenCXX/builtin-bit-cast.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++2a -S -emit-llvm -o - -disable-llvm-passes -triple x86_64-apple-macos10.14 %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++2a -emit-llvm -o - -disable-llvm-passes -triple x86_64-apple-macos10.14 %s | FileCheck %s
 
 void test_scalar(int &oper) {
   // CHECK-LABEL: define{{.*}} void @_Z11test_scalarRi
diff --git a/clang/test/CodeGenCXX/constexpr-late-instantiation.cpp b/clang/test/CodeGenCXX/constexpr-late-instantiation.cpp
index 1c8eef73f2dd..7e8d584c0be6 100644
--- a/clang/test/CodeGenCXX/constexpr-late-instantiation.cpp
+++ b/clang/test/CodeGenCXX/constexpr-late-instantiation.cpp
@@ -1,5 +1,5 @@
 // Make sure foo is instantiated and we don't get a link error
-// RUN: %clang_cc1 -S -emit-llvm -triple %itanium_abi_triple %s -o- | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple %s -o- | FileCheck %s
 
 template <typename T>
 constexpr T foo(T a);
diff --git a/clang/test/CodeGenCXX/cxx0x-initializer-array.cpp b/clang/test/CodeGenCXX/cxx0x-initializer-array.cpp
index 79b6df515a0e..e76fe818c608 100644
--- a/clang/test/CodeGenCXX/cxx0x-initializer-array.cpp
+++ b/clang/test/CodeGenCXX/cxx0x-initializer-array.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple i386-unknown-unknown -std=c++11 -S -emit-llvm -o - %s -Wno-address-of-temporary | FileCheck %s
+// RUN: %clang_cc1 -triple i386-unknown-unknown -std=c++11 -emit-llvm -o - %s -Wno-address-of-temporary | FileCheck %s
 
 // CHECK: @[[THREE_NULL_MEMPTRS:.*]] = private constant [3 x i32] [i32 -1, i32 -1, i32 -1]
 
diff --git a/clang/test/CodeGenCXX/cxx0x-initializer-constructors.cpp b/clang/test/CodeGenCXX/cxx0x-initializer-constructors.cpp
index 806c19f5dde8..9a1d1f9e0c38 100644
--- a/clang/test/CodeGenCXX/cxx0x-initializer-constructors.cpp
+++ b/clang/test/CodeGenCXX/cxx0x-initializer-constructors.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -S -triple x86_64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -triple x86_64 -emit-llvm -o - %s | FileCheck %s
 
 struct S {
   S(int x) { }
diff --git a/clang/test/CodeGenCXX/cxx0x-initializer-references.cpp b/clang/test/CodeGenCXX/cxx0x-initializer-references.cpp
index bbf525be873c..b19ca1d861b1 100644
--- a/clang/test/CodeGenCXX/cxx0x-initializer-references.cpp
+++ b/clang/test/CodeGenCXX/cxx0x-initializer-references.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -S -triple armv7-none-eabi -fmerge-all-constants -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -triple armv7-none-eabi -fmerge-all-constants -emit-llvm -o - %s | FileCheck %s
 
 // This creates and lifetime-extends a 'const char[5]' temporary.
 // CHECK: @_ZGR19extended_string_ref_ = internal constant [5 x i8] c"hi\00\00\00",
diff --git a/clang/test/CodeGenCXX/cxx0x-initializer-scalars.cpp b/clang/test/CodeGenCXX/cxx0x-initializer-scalars.cpp
index 10c696604867..2f6a6820a758 100644
--- a/clang/test/CodeGenCXX/cxx0x-initializer-scalars.cpp
+++ b/clang/test/CodeGenCXX/cxx0x-initializer-scalars.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -emit-llvm -o - %s | FileCheck %s
 
 void f()
 {
diff --git a/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist-startend.cpp b/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist-startend.cpp
index 36c46fdcb86e..a95a458220b0 100644
--- a/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist-startend.cpp
+++ b/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist-startend.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -S -triple x86_64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -triple x86_64 -emit-llvm -o - %s | FileCheck %s
 
 namespace std {
   typedef decltype(sizeof(int)) size_t;
diff --git a/clang/test/CodeGenCXX/cxx11-initializer-aggregate.cpp b/clang/test/CodeGenCXX/cxx11-initializer-aggregate.cpp
index 0c9246d4e411..6fb8f526c1b6 100644
--- a/clang/test/CodeGenCXX/cxx11-initializer-aggregate.cpp
+++ b/clang/test/CodeGenCXX/cxx11-initializer-aggregate.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -std=c++11 -S -emit-llvm -o - %s -triple x86_64-linux-gnu | FileCheck %s
-// RUN: %clang_cc1 -std=c++17 -S -emit-llvm -o - %s -triple x86_64-linux-gnu | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -emit-llvm -o - %s -triple x86_64-linux-gnu | FileCheck %s
+// RUN: %clang_cc1 -std=c++17 -emit-llvm -o - %s -triple x86_64-linux-gnu | FileCheck %s
 
 struct A { int a, b; int f(); };
 
diff --git a/clang/test/CodeGenCXX/cxx11-trivial-initializer-struct.cpp b/clang/test/CodeGenCXX/cxx11-trivial-initializer-struct.cpp
index 5b6bf6cca73a..83767ec59e23 100644
--- a/clang/test/CodeGenCXX/cxx11-trivial-initializer-struct.cpp
+++ b/clang/test/CodeGenCXX/cxx11-trivial-initializer-struct.cpp
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -std=c++11 -S -emit-llvm -o %t-c++11.ll %s -triple x86_64-apple-darwin10
+// RUN: %clang_cc1 -std=c++11 -emit-llvm -o %t-c++11.ll %s -triple x86_64-apple-darwin10
 // RUN: FileCheck %s < %t-c++11.ll
-// RUN: %clang_cc1 -std=c++17 -S -emit-llvm -o %t-c++17.ll %s -triple x86_64-apple-darwin10
+// RUN: %clang_cc1 -std=c++17 -emit-llvm -o %t-c++17.ll %s -triple x86_64-apple-darwin10
 // RUN: FileCheck %s < %t-c++17.ll
-// RUN: %clang_cc1  -std=c++98 -S -emit-llvm -o %t.ll %s -triple x86_64-apple-darwin10
-// RUN: %clang_cc1  -std=c++03 -S -emit-llvm -o %t-c++03.ll %s -triple x86_64-apple-darwin10
+// RUN: %clang_cc1  -std=c++98 -emit-llvm -o %t.ll %s -triple x86_64-apple-darwin10
+// RUN: %clang_cc1  -std=c++03 -emit-llvm -o %t-c++03.ll %s -triple x86_64-apple-darwin10
 // RUN: diff %t-c++11.ll  %t-c++17.ll
 // RUN: diff %t.ll  %t-c++03.ll
 
diff --git a/clang/test/CodeGenCXX/debug-info-atexit-stub.cpp b/clang/test/CodeGenCXX/debug-info-atexit-stub.cpp
index 0282bdd1a283..ca9bc3a13a1e 100644
--- a/clang/test/CodeGenCXX/debug-info-atexit-stub.cpp
+++ b/clang/test/CodeGenCXX/debug-info-atexit-stub.cpp
@@ -1,22 +1,22 @@
-// RUN: %clang_cc1 -emit-llvm %s -triple x86_64-windows-msvc -gcodeview -debug-info-kind=limited -o - | FileCheck %s
-
-struct a {
-  ~a();
-};
-template <typename b> struct c : a {
-  c(void (b::*)());
-};
-struct B {
-  virtual void e();
-};
-c<B> *d() {
-  static c<B> f(&B::e);
-  return &f;
-}
-
-// CHECK: define internal void @"??__Ff@?1??d@@YAPEAU?$c@UB@@@@XZ@YAXXZ"()
-// CHECK-SAME: !dbg ![[SUBPROGRAM:[0-9]+]] {
-// CHECK: call void @"??1?$c@UB@@@@QEAA@XZ"(ptr @"?f@?1??d@@YAPEAU?$c@UB@@@@XZ@4U2@A"), !dbg ![[LOCATION:[0-9]+]]
-// CHECK: ![[SUBPROGRAM]] = distinct !DISubprogram(name: "`dynamic atexit destructor for 'f'"
-// CHECK-SAME: flags: DIFlagArtificial
-// CHECK: ![[LOCATION]] = !DILocation(line: 0, scope: ![[SUBPROGRAM]])
+// RUN: %clang_cc1 -emit-llvm %s -triple x86_64-windows-msvc -gcodeview -debug-info-kind=limited -o - | FileCheck %s
+
+struct a {
+  ~a();
+};
+template <typename b> struct c : a {
+  c(void (b::*)());
+};
+struct B {
+  virtual void e();
+};
+c<B> *d() {
+  static c<B> f(&B::e);
+  return &f;
+}
+
+// CHECK: define internal void @"??__Ff@?1??d@@YAPEAU?$c@UB@@@@XZ@YAXXZ"()
+// CHECK-SAME: !dbg ![[SUBPROGRAM:[0-9]+]] {
+// CHECK: call void @"??1?$c@UB@@@@QEAA@XZ"(ptr @"?f@?1??d@@YAPEAU?$c@UB@@@@XZ@4U2@A"), !dbg ![[LOCATION:[0-9]+]]
+// CHECK: ![[SUBPROGRAM]] = distinct !DISubprogram(name: "`dynamic atexit destructor for 'f'"
+// CHECK-SAME: flags: DIFlagArtificial
+// CHECK: ![[LOCATION]] = !DILocation(line: 0, scope: ![[SUBPROGRAM]])
diff --git a/clang/test/CodeGenCXX/debug-info-blocks.cpp b/clang/test/CodeGenCXX/debug-info-blocks.cpp
index e22594cb5d6a..e05e2ba23383 100644
--- a/clang/test/CodeGenCXX/debug-info-blocks.cpp
+++ b/clang/test/CodeGenCXX/debug-info-blocks.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -debug-info-kind=line-tables-only -fblocks -S -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 %s -debug-info-kind=line-directives-only -fblocks -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -debug-info-kind=line-tables-only -fblocks -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -debug-info-kind=line-directives-only -fblocks -emit-llvm -o - | FileCheck %s
 
 struct A {
   A();
diff --git a/clang/test/CodeGenCXX/debug-info-codeview-heapallocsite.cpp b/clang/test/CodeGenCXX/debug-info-codeview-heapallocsite.cpp
index b22e28fc1501..6468b9f5db53 100644
--- a/clang/test/CodeGenCXX/debug-info-codeview-heapallocsite.cpp
+++ b/clang/test/CodeGenCXX/debug-info-codeview-heapallocsite.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fexceptions -triple x86_64-windows-msvc -debug-info-kind=limited -gcodeview -fdeclspec -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fexceptions -triple x86_64-windows-msvc -debug-info-kind=limited -gcodeview -fdeclspec -emit-llvm %s -o - | FileCheck %s
 
 struct Foo {
   int x;
diff --git a/clang/test/CodeGenCXX/debug-info-codeview-unnamed.cpp b/clang/test/CodeGenCXX/debug-info-codeview-unnamed.cpp
index b4c79936ab33..9fcb1c68d7ef 100644
--- a/clang/test/CodeGenCXX/debug-info-codeview-unnamed.cpp
+++ b/clang/test/CodeGenCXX/debug-info-codeview-unnamed.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -debug-info-kind=limited -S -emit-llvm -std=c++11 -o - %s | FileCheck --check-prefix LINUX %s
-// RUN: %clang_cc1 -triple x86_64-windows-msvc -debug-info-kind=limited -gcodeview -S -emit-llvm -std=c++11 -o - %s | FileCheck --check-prefix MSVC %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -debug-info-kind=limited -emit-llvm -std=c++11 -o - %s | FileCheck --check-prefix LINUX %s
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -debug-info-kind=limited -gcodeview -emit-llvm -std=c++11 -o - %s | FileCheck --check-prefix MSVC %s
 
 int main(int argc, char* argv[], char* arge[]) {
   //
diff --git a/clang/test/CodeGenCXX/debug-info-cxx1y.cpp b/clang/test/CodeGenCXX/debug-info-cxx1y.cpp
index 42c801ad6530..012eb38cc403 100644
--- a/clang/test/CodeGenCXX/debug-info-cxx1y.cpp
+++ b/clang/test/CodeGenCXX/debug-info-cxx1y.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm-only -std=c++14 -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple-only -std=c++14 -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s
 
 // CHECK: imports: [[IMPS:![0-9]*]]
 
diff --git a/clang/test/CodeGenCXX/debug-info-determinism.cpp b/clang/test/CodeGenCXX/debug-info-determinism.cpp
index ea88b8042a1d..c0a70a09c95d 100644
--- a/clang/test/CodeGenCXX/debug-info-determinism.cpp
+++ b/clang/test/CodeGenCXX/debug-info-determinism.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -S -emit-llvm -debug-info-kind=limited -o %t1.ll %s
-// RUN: %clang_cc1 -S -emit-llvm -debug-info-kind=limited -o %t2.ll %s
+// RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited -o %t1.ll %s
+// RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited -o %t2.ll %s
 // RUN: diff %t1.ll %t2.ll
 
 template <int N> struct C {
diff --git a/clang/test/CodeGenCXX/debug-info-gline-tables-only.cpp b/clang/test/CodeGenCXX/debug-info-gline-tables-only.cpp
index ceb7856addc6..9b86a49d69f5 100644
--- a/clang/test/CodeGenCXX/debug-info-gline-tables-only.cpp
+++ b/clang/test/CodeGenCXX/debug-info-gline-tables-only.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -fno-rtti -debug-info-kind=line-tables-only -S -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 %s -fno-rtti -debug-info-kind=line-directives-only -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -fno-rtti -debug-info-kind=line-tables-only -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -fno-rtti -debug-info-kind=line-directives-only -emit-llvm -o - | FileCheck %s
 // Checks that clang with "-gline-tables-only" or "-gline-directives-only" doesn't emit debug info
 // for variables and types.
 
diff --git a/clang/test/CodeGenCXX/debug-info-global-ctor-dtor.cpp b/clang/test/CodeGenCXX/debug-info-global-ctor-dtor.cpp
index ebbd5e5bc8c8..62647615a3dd 100644
--- a/clang/test/CodeGenCXX/debug-info-global-ctor-dtor.cpp
+++ b/clang/test/CodeGenCXX/debug-info-global-ctor-dtor.cpp
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 %s -debug-info-kind=limited -triple %itanium_abi_triple -fno-use-cxa-atexit -S -disable-O0-optnone  -emit-llvm -o - \
+// RUN: %clang_cc1 %s -debug-info-kind=limited -triple %itanium_abi_triple -fno-use-cxa-atexit -disable-O0-optnone -emit-llvm -o - \
 // RUN:     | FileCheck %s --check-prefix=CHECK-NOKEXT
-// RUN: %clang_cc1 %s -debug-info-kind=limited -triple %itanium_abi_triple -fno-use-cxa-atexit -fapple-kext -S -disable-O0-optnone -emit-llvm -o - \
+// RUN: %clang_cc1 %s -debug-info-kind=limited -triple %itanium_abi_triple -fno-use-cxa-atexit -fapple-kext -disable-O0-optnone -emit-llvm -o - \
 // RUN:     | FileCheck %s --check-prefix=CHECK-KEXT
-// RUN: %clang_cc1 %s -gcodeview -debug-info-kind=limited -triple x86_64-windows-msvc -fno-use-cxa-atexit -S -disable-O0-optnone  -emit-llvm -o - \
+// RUN: %clang_cc1 %s -gcodeview -debug-info-kind=limited -triple x86_64-windows-msvc -fno-use-cxa-atexit -disable-O0-optnone -emit-llvm -o - \
 // RUN:     | FileCheck %s --check-prefix=CHECK-MSVC
 
 class A {
diff --git a/clang/test/CodeGenCXX/debug-info-line-if.cpp b/clang/test/CodeGenCXX/debug-info-line-if.cpp
index 442d705c3d7f..8f52428ce584 100644
--- a/clang/test/CodeGenCXX/debug-info-line-if.cpp
+++ b/clang/test/CodeGenCXX/debug-info-line-if.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -debug-info-kind=limited -gno-column-info -std=c++11 -S -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s
+// RUN: %clang_cc1 -debug-info-kind=limited -gno-column-info -std=c++11 -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s
 // PR19864
 extern int v[2];
 int a = 0, b = 0;
diff --git a/clang/test/CodeGenCXX/debug-info-line.cpp b/clang/test/CodeGenCXX/debug-info-line.cpp
index 09abb0fd1a42..8ef0e024395d 100644
--- a/clang/test/CodeGenCXX/debug-info-line.cpp
+++ b/clang/test/CodeGenCXX/debug-info-line.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -w -debug-info-kind=line-tables-only -std=c++11 -fexceptions -fcxx-exceptions -S -mllvm -no-discriminators -emit-llvm %s -o - -triple %itanium_abi_triple | FileCheck %s
-// RUN: %clang_cc1 -w -debug-info-kind=line-tables-only -std=c++11 -fexceptions -fcxx-exceptions -S -mllvm -no-discriminators -emit-llvm %s -o - -triple i686-linux-gnu | FileCheck %s
-// RUN: %clang_cc1 -w -debug-info-kind=line-directives-only -std=c++11 -fexceptions -fcxx-exceptions -S -mllvm -no-discriminators -emit-llvm %s -o - -triple %itanium_abi_triple | FileCheck %s
-// RUN: %clang_cc1 -w -debug-info-kind=line-directives-only -std=c++11 -fexceptions -fcxx-exceptions -S -mllvm -no-discriminators -emit-llvm %s -o - -triple i686-linux-gnu | FileCheck %s
+// RUN: %clang_cc1 -w -debug-info-kind=line-tables-only -std=c++11 -fexceptions -fcxx-exceptions -mllvm -no-discriminators -emit-llvm %s -o - -triple %itanium_abi_triple | FileCheck %s
+// RUN: %clang_cc1 -w -debug-info-kind=line-tables-only -std=c++11 -fexceptions -fcxx-exceptions -mllvm -no-discriminators -emit-llvm %s -o - -triple i686-linux-gnu | FileCheck %s
+// RUN: %clang_cc1 -w -debug-info-kind=line-directives-only -std=c++11 -fexceptions -fcxx-exceptions -mllvm -no-discriminators -emit-llvm %s -o - -triple %itanium_abi_triple | FileCheck %s
+// RUN: %clang_cc1 -w -debug-info-kind=line-directives-only -std=c++11 -fexceptions -fcxx-exceptions -mllvm -no-discriminators -emit-llvm %s -o - -triple i686-linux-gnu | FileCheck %s
 
 int &src();
 int *sink();
diff --git a/clang/test/CodeGenCXX/debug-info-method2.cpp b/clang/test/CodeGenCXX/debug-info-method2.cpp
index 1879b1a364dd..cc19184d0697 100644
--- a/clang/test/CodeGenCXX/debug-info-method2.cpp
+++ b/clang/test/CodeGenCXX/debug-info-method2.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -x c++ -debug-info-kind=limited -S -emit-llvm < %s | FileCheck %s
+// RUN: %clang_cc1 -x c++ -debug-info-kind=limited -emit-llvm < %s | FileCheck %s
 // Preserve type qualifiers in -flimit-debug-info mode.
 
 // CHECK:  DW_TAG_const_type
diff --git a/clang/test/CodeGenCXX/debug-info-namespace.cpp b/clang/test/CodeGenCXX/debug-info-namespace.cpp
index e3cf6507e161..788588348817 100644
--- a/clang/test/CodeGenCXX/debug-info-namespace.cpp
+++ b/clang/test/CodeGenCXX/debug-info-namespace.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -std=c++11 -debug-info-kind=limited -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c++11 -debug-info-kind=line-tables-only -S -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-GMLT %s
-// RUN: %clang_cc1 -std=c++11 -debug-info-kind=line-directives-only -S -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-GMLI %s
-// RUN: %clang_cc1 -std=c++11 -debug-info-kind=standalone -S -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-NOLIMIT %s
+// RUN: %clang_cc1 -std=c++11 -debug-info-kind=limited -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -debug-info-kind=line-tables-only -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-GMLT %s
+// RUN: %clang_cc1 -std=c++11 -debug-info-kind=line-directives-only -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-GMLI %s
+// RUN: %clang_cc1 -std=c++11 -debug-info-kind=standalone -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-NOLIMIT %s
 
 namespace A {
 #line 1 "foo.cpp"
diff --git a/clang/test/CodeGenCXX/debug-info-struct-align.cpp b/clang/test/CodeGenCXX/debug-info-struct-align.cpp
index 6d75c71476ba..1269cbce83ef 100644
--- a/clang/test/CodeGenCXX/debug-info-struct-align.cpp
+++ b/clang/test/CodeGenCXX/debug-info-struct-align.cpp
@@ -1,5 +1,5 @@
 //  Test for debug info related to DW_AT_alignment attribute in the struct type.
-// RUN: %clang_cc1 -dwarf-version=5 -debug-info-kind=standalone -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -dwarf-version=5 -debug-info-kind=standalone -emit-llvm %s -o - | FileCheck %s
 
 // CHECK-DAG: DICompositeType(tag: DW_TAG_structure_type, name: "MyType", {{.*}}, align: 32
 // CHECK-DAG: DICompositeType(tag: DW_TAG_structure_type, name: "MyType1", {{.*}}, align: 8
diff --git a/clang/test/CodeGenCXX/debug-info-thunk.cpp b/clang/test/CodeGenCXX/debug-info-thunk.cpp
index f6130cc28ced..f48adce6634a 100644
--- a/clang/test/CodeGenCXX/debug-info-thunk.cpp
+++ b/clang/test/CodeGenCXX/debug-info-thunk.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -triple=x86_64-pc-windows-msvc -debug-info-kind=limited -S -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 %s -triple %itanium_abi_triple -debug-info-kind=limited -S -emit-llvm -o - | FileCheck %s -check-prefix=ITANIUM
+// RUN: %clang_cc1 %s -triple=x86_64-pc-windows-msvc -debug-info-kind=limited -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple %itanium_abi_triple -debug-info-kind=limited -emit-llvm -o - | FileCheck %s -check-prefix=ITANIUM
 //
 // Validate we emit a "DIFlagThunk" flag on DISubprogram entries for thunks.
 // This flag is used for emitting S_THUNK32 symbols for CodeView debugging.
diff --git a/clang/test/CodeGenCXX/destructor-debug-info.cpp b/clang/test/CodeGenCXX/destructor-debug-info.cpp
index 7b10f8339420..d30c6c3938c2 100644
--- a/clang/test/CodeGenCXX/destructor-debug-info.cpp
+++ b/clang/test/CodeGenCXX/destructor-debug-info.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -debug-info-kind=limited -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -debug-info-kind=limited -emit-llvm %s -o - | FileCheck %s
 
 class A { int a; };
 class B {
diff --git a/clang/test/CodeGenCXX/dllexport-alias.cpp b/clang/test/CodeGenCXX/dllexport-alias.cpp
index 6f659e5fd3b8..2a14427cddc7 100644
--- a/clang/test/CodeGenCXX/dllexport-alias.cpp
+++ b/clang/test/CodeGenCXX/dllexport-alias.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-windows-gnu -mconstructor-aliases %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -mconstructor-aliases %s -emit-llvm -o - | FileCheck %s
 
 // This test assumes that the C1 constructor will be aliased to the C2
 // constructor, and the D1 destructor to the D2. It then checks that the aliases
diff --git a/clang/test/CodeGenCXX/duplicate-mangled-name.cpp b/clang/test/CodeGenCXX/duplicate-mangled-name.cpp
index 1eb63f7a27f3..04e6fee506eb 100644
--- a/clang/test/CodeGenCXX/duplicate-mangled-name.cpp
+++ b/clang/test/CodeGenCXX/duplicate-mangled-name.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm-only %s -verify -DTEST1
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm-only %s -verify -DTEST2 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple-only %s -verify -DTEST2 -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm-only %s -verify -DTEST3
 // RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm-only %s -verify -DTEST4
 
diff --git a/clang/test/CodeGenCXX/fixed-point-mangle.cpp b/clang/test/CodeGenCXX/fixed-point-mangle.cpp
index 103990a61316..1b06b349ca03 100644
--- a/clang/test/CodeGenCXX/fixed-point-mangle.cpp
+++ b/clang/test/CodeGenCXX/fixed-point-mangle.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -ffixed-point -S -emit-llvm %s -o - -triple=x86_64-unknown-linux-gnu | FileCheck %s
+// RUN: %clang_cc1 -ffixed-point -emit-llvm %s -o - -triple=x86_64-unknown-linux-gnu | FileCheck %s
 
 // Primary fixed point types
 void func(signed short _Accum){}    // CHECK: @_Z4funcDAs
diff --git a/clang/test/CodeGenCXX/funcattrs-global-ctor-dtor.cpp b/clang/test/CodeGenCXX/funcattrs-global-ctor-dtor.cpp
index b98cb24d5691..0911dd5cd151 100644
--- a/clang/test/CodeGenCXX/funcattrs-global-ctor-dtor.cpp
+++ b/clang/test/CodeGenCXX/funcattrs-global-ctor-dtor.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple x86_64-apple-darwin -S -stack-protector 2 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple x86_64-apple-darwin -stack-protector 2 -emit-llvm -o - | FileCheck %s
 
 class A {
  public:
diff --git a/clang/test/CodeGenCXX/header-unit-friend-within-class-linkage.cpp b/clang/test/CodeGenCXX/header-unit-friend-within-class-linkage.cpp
index cfee317ba6b2..f3a87ef58bb9 100644
--- a/clang/test/CodeGenCXX/header-unit-friend-within-class-linkage.cpp
+++ b/clang/test/CodeGenCXX/header-unit-friend-within-class-linkage.cpp
@@ -4,7 +4,7 @@
 //
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -xc++-user-header -emit-header-unit %t/foo.h -o %t/foo.pcm
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -fmodule-file=%t/foo.pcm %t/user.cpp \
-// RUN:   -S -emit-llvm -disable-llvm-passes -o - | FileCheck %t/user.cpp
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %t/user.cpp
 
 //--- foo.h
 class foo {
diff --git a/clang/test/CodeGenCXX/header-unit-member-func-linkage.cpp b/clang/test/CodeGenCXX/header-unit-member-func-linkage.cpp
index 5ab15dff0158..52a8d58565f8 100644
--- a/clang/test/CodeGenCXX/header-unit-member-func-linkage.cpp
+++ b/clang/test/CodeGenCXX/header-unit-member-func-linkage.cpp
@@ -4,7 +4,7 @@
 //
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -xc++-user-header -emit-header-unit %t/foo.h -o %t/foo.pcm
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -fmodule-file=%t/foo.pcm %t/user.cpp \
-// RUN:   -S -emit-llvm -disable-llvm-passes -o - | FileCheck %t/user.cpp
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %t/user.cpp
 
 //--- foo.h
 class foo {
diff --git a/clang/test/CodeGenCXX/instrument-functions.cpp b/clang/test/CodeGenCXX/instrument-functions.cpp
index 45ae48235a9d..d775ad95376e 100644
--- a/clang/test/CodeGenCXX/instrument-functions.cpp
+++ b/clang/test/CodeGenCXX/instrument-functions.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -emit-llvm -triple %itanium_abi_triple -o - %s -finstrument-functions -disable-llvm-passes | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -o - %s -finstrument-functions -disable-llvm-passes | FileCheck %s
 
 int test1(int x) {
 // CHECK: @_Z5test1i(i32 {{.*}}%x) #[[ATTR1:[0-9]+]]
diff --git a/clang/test/CodeGenCXX/mangle-concept.cpp b/clang/test/CodeGenCXX/mangle-concept.cpp
index bbd2cf6555e3..e9c46d87635a 100644
--- a/clang/test/CodeGenCXX/mangle-concept.cpp
+++ b/clang/test/CodeGenCXX/mangle-concept.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -verify -frelaxed-template-template-args -std=c++20 -emit-llvm -triple %itanium_abi_triple -o - %s -fclang-abi-compat=latest | FileCheck %s
-// RUN: %clang_cc1 -verify -frelaxed-template-template-args -std=c++20 -emit-llvm -triple %itanium_abi_triple -o - %s -fclang-abi-compat=16 | FileCheck %s --check-prefix=CLANG16
+// RUN: %clang_cc1 -verify -std=c++20 -emit-llvm -triple %itanium_abi_triple -o - %s -fclang-abi-compat=latest | FileCheck %s
+// RUN: %clang_cc1 -verify -std=c++20 -emit-llvm -triple %itanium_abi_triple -o - %s -fclang-abi-compat=16 | FileCheck %s --check-prefix=CLANG16
 // expected-no-diagnostics
 
 namespace test1 {
diff --git a/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-alias.cpp b/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-alias.cpp
index 0d6c4ee1e2ee..947103fa625a 100644
--- a/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-alias.cpp
+++ b/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-alias.cpp
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=none -mconstructor-aliases -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=none -mconstructor-aliases -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=UNSPECIFIED-DEF,EXPLICIT-DEF %s
-// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=explicit -mconstructor-aliases -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=explicit -mconstructor-aliases -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=UNSPECIFIED-DEF,EXPLICIT-EXP %s
-// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=all -mconstructor-aliases -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=all -mconstructor-aliases -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=UNSPECIFIED-EXP,EXPLICIT-EXP %s
 
 class A {
diff --git a/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-rtti.cpp b/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-rtti.cpp
index 9e9449f15685..1af105e915e6 100644
--- a/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-rtti.cpp
+++ b/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-rtti.cpp
@@ -1,14 +1,14 @@
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix %s -internal-isystem %S -mdefault-visibility-export-mapping=none -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix %s -internal-isystem %S -mdefault-visibility-export-mapping=none -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=CHECK,UNSPECIFIED-DEF,EXPLICIT-DEF,FUND-DEF %s
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix %s -internal-isystem %S -mdefault-visibility-export-mapping=explicit -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix %s -internal-isystem %S -mdefault-visibility-export-mapping=explicit -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=CHECK,UNSPECIFIED-DEF,EXPLICIT-EXP,FUND-DEF %s
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix %s -internal-isystem %S -mdefault-visibility-export-mapping=explicit -DFUNDAMENTAL_IS_EXPLICIT -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix %s -internal-isystem %S -mdefault-visibility-export-mapping=explicit -DFUNDAMENTAL_IS_EXPLICIT -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=CHECK,UNSPECIFIED-DEF,EXPLICIT-EXP,FUND-EXP %s
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix %s -internal-isystem %S -mdefault-visibility-export-mapping=all -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix %s -internal-isystem %S -mdefault-visibility-export-mapping=all -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=CHECK,UNSPECIFIED-EXP,EXPLICIT-EXP,FUND-EXP %s
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix %s -internal-isystem %S -mdefault-visibility-export-mapping=all -fvisibility=hidden -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix %s -internal-isystem %S -mdefault-visibility-export-mapping=all -fvisibility=hidden -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=CHECK,UNSPECIFIED-HID,EXPLICIT-EXP,FUND-HID %s
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix %s -internal-isystem %S -mdefault-visibility-export-mapping=all -DFUNDAMENTAL_IS_EXPLICIT -fvisibility=hidden -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix %s -internal-isystem %S -mdefault-visibility-export-mapping=all -DFUNDAMENTAL_IS_EXPLICIT -fvisibility=hidden -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=CHECK,UNSPECIFIED-HID,EXPLICIT-EXP,FUND-EXP %s
 
 #include <typeinfo>
diff --git a/clang/test/CodeGenCXX/mdefault-visibility-export-mapping.cpp b/clang/test/CodeGenCXX/mdefault-visibility-export-mapping.cpp
index 6e61f1812f57..66634e547129 100644
--- a/clang/test/CodeGenCXX/mdefault-visibility-export-mapping.cpp
+++ b/clang/test/CodeGenCXX/mdefault-visibility-export-mapping.cpp
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=CHECK,UNSPECIFIED-DEF,EXPLICIT-DEF %s
-// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=none -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=none -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=CHECK,UNSPECIFIED-DEF,EXPLICIT-DEF %s
-// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=explicit -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=explicit -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=CHECK,UNSPECIFIED-DEF,EXPLICIT-EXP %s
-// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=all -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix %s -mdefault-visibility-export-mapping=all -emit-llvm -o - | \
 // RUN:   FileCheck -check-prefixes=CHECK,UNSPECIFIED-EXP,EXPLICIT-EXP %s
 
 struct A {};
diff --git a/clang/test/CodeGenCXX/member-alignment.cpp b/clang/test/CodeGenCXX/member-alignment.cpp
index 8e7a353dfe6f..d5c9a5a02b16 100644
--- a/clang/test/CodeGenCXX/member-alignment.cpp
+++ b/clang/test/CodeGenCXX/member-alignment.cpp
@@ -5,11 +5,11 @@
 // RUN: %clang_cc1 -emit-llvm -triple ppc64le-unknown-linux-gnu %s -o - | \
 // RUN: FileCheck -check-prefix CHECK-NOEXTRAALIGN %s
 // RUN: %clang_cc1 -emit-llvm -triple arm64-unknown-linux-gnu %s -o - | \
-// RUN: FileCheck -check-prefix CHECK-EXTRAALIGN %s
+// RUN: FileCheck -check-prefix CHECK-NOEXTRAALIGN %s
 // RUN: %clang_cc1 -emit-llvm -triple arm64-apple-ios %s -o - | \
-// RUN: FileCheck -check-prefix CHECK-EXTRAALIGN %s
+// RUN: FileCheck -check-prefix CHECK-NOEXTRAALIGN %s
 // RUN: %clang_cc1 -emit-llvm -triple aarch64-unknown-linux-gnu %s -o - | \
-// RUN: FileCheck -check-prefix CHECK-EXTRAALIGN %s
+// RUN: FileCheck -check-prefix CHECK-NOEXTRAALIGN %s
 // RUN: %clang_cc1 -emit-llvm -triple mips-unknown-linux-gnu %s -o - | \
 // RUN: FileCheck -check-prefix CHECK-EXTRAALIGN %s
 // RUN: %clang_cc1 -emit-llvm -triple x86_64-unknown-fuchsia %s -o - | \
diff --git a/clang/test/CodeGenCXX/module-funcs-from-imports.cppm b/clang/test/CodeGenCXX/module-funcs-from-imports.cppm
index a2a9122fc391..850d0f6bcae1 100644
--- a/clang/test/CodeGenCXX/module-funcs-from-imports.cppm
+++ b/clang/test/CodeGenCXX/module-funcs-from-imports.cppm
@@ -5,14 +5,14 @@
 // RUN:    -emit-module-interface -o %t/M.pcm
 // RUN: %clang_cc1 -std=c++20 %t/Use.cpp -fprebuilt-module-path=%t \
 // RUN:    -triple %itanium_abi_triple \
-// RUN:    -S -emit-llvm -o - -disable-llvm-passes \
+// RUN:    -emit-llvm -o - -disable-llvm-passes \
 // RUN:    | FileCheck %t/Use.cpp --check-prefix=CHECK-O0
 //
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 -O1 %t/M.cppm \
 // RUN:    -emit-module-interface -o %t/M.pcm
 // RUN: %clang_cc1 -std=c++20 %t/Use.cpp -fprebuilt-module-path=%t -O1 \
 // RUN:    -triple %itanium_abi_triple \
-// RUN:    -S -emit-llvm -o - -disable-llvm-passes | \
+// RUN:    -emit-llvm -o - -disable-llvm-passes | \
 // RUN:    FileCheck %t/Use.cpp --check-prefix=CHECK-O1
 
 //--- foo.h
diff --git a/clang/test/CodeGenCXX/module-initializer-guard-elision.cpp b/clang/test/CodeGenCXX/module-initializer-guard-elision.cpp
index 53e4b909ee2a..c368ad579405 100644
--- a/clang/test/CodeGenCXX/module-initializer-guard-elision.cpp
+++ b/clang/test/CodeGenCXX/module-initializer-guard-elision.cpp
@@ -4,37 +4,37 @@
 
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 O.cpp \
 // RUN:    -emit-module-interface -o O.pcm
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 O.pcm -S -emit-llvm \
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 O.pcm -emit-llvm \
 // RUN:  -o - | FileCheck %s --check-prefix=CHECK-O
 
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 P.cpp \
 // RUN:    -emit-module-interface -fprebuilt-module-path=%t -o P.pcm
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 P.pcm -S -emit-llvm \
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 P.pcm -emit-llvm \
 // RUN:   -fprebuilt-module-path=%t -o - | FileCheck %s --check-prefix=CHECK-P
 
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 Q.cpp \
 // RUN:    -emit-module-interface -o Q.pcm
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 Q.pcm -S -emit-llvm \
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 Q.pcm -emit-llvm \
 // RUN:    -o - | FileCheck %s --check-prefix=CHECK-Q
 
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 R.cpp \
 // RUN:    -emit-module-interface -fprebuilt-module-path=%t -o R.pcm
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 R.pcm -S -emit-llvm \
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 R.pcm -emit-llvm \
 // RUN:    -fprebuilt-module-path=%t -o - | FileCheck %s --check-prefix=CHECK-R
 
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 S.cpp \
 // RUN:    -emit-module-interface -fprebuilt-module-path=%t -o S.pcm
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 S.pcm -S -emit-llvm \
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 S.pcm -emit-llvm \
 // RUN:    -fprebuilt-module-path=%t -o - | FileCheck %s --check-prefix=CHECK-S
 
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 T.cpp \
 // RUN:    -emit-module-interface -fprebuilt-module-path=%t -o T.pcm
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 T.pcm -S -emit-llvm \
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 T.pcm -emit-llvm \
 // RUN:    -fprebuilt-module-path=%t -o - | FileCheck %s --check-prefix=CHECK-T
 
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 U.cpp \
 // RUN:    -emit-module-interface -fprebuilt-module-path=%t -o U.pcm
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 U.pcm -S -emit-llvm \
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 U.pcm -emit-llvm \
 // RUN:    -fprebuilt-module-path=%t -o - | FileCheck %s --check-prefix=CHECK-U
 
 // Testing cases where we can elide the module initializer guard variable.
diff --git a/clang/test/CodeGenCXX/module-initializer-header.cppm b/clang/test/CodeGenCXX/module-initializer-header.cppm
index 5cd93529bb5e..5c09100ad7cb 100644
--- a/clang/test/CodeGenCXX/module-initializer-header.cppm
+++ b/clang/test/CodeGenCXX/module-initializer-header.cppm
@@ -3,8 +3,8 @@
 // RUN: split-file %s %t
 //
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -xc++-user-header -emit-header-unit %t/header.h -o %t/header.pcm
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -fmodule-file=%t/header.pcm %t/M.cppm -S -emit-llvm -o - | FileCheck %t/M.cppm
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -fmodule-file=%t/header.pcm %t/Use.cpp -S -emit-llvm -o - | FileCheck %t/Use.cpp
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -fmodule-file=%t/header.pcm %t/M.cppm -emit-llvm -o - | FileCheck %t/M.cppm
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -fmodule-file=%t/header.pcm %t/Use.cpp -emit-llvm -o - | FileCheck %t/Use.cpp
 //
 //--- header.h
 int foo();
diff --git a/clang/test/CodeGenCXX/module-intializer-pmf.cpp b/clang/test/CodeGenCXX/module-intializer-pmf.cpp
index 7ab4a2e2bd78..b553839784c9 100644
--- a/clang/test/CodeGenCXX/module-intializer-pmf.cpp
+++ b/clang/test/CodeGenCXX/module-intializer-pmf.cpp
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %s \
 // RUN:    -emit-module-interface -o %T/HasPMF.pcm
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %T/HasPMF.pcm \
-// RUN:  -S -emit-llvm -o - | FileCheck %s
+// RUN:  -emit-llvm -o - | FileCheck %s
 
 module;
 
diff --git a/clang/test/CodeGenCXX/module-intializer.cpp b/clang/test/CodeGenCXX/module-intializer.cpp
index 8a464ae7403d..318250a653a7 100644
--- a/clang/test/CodeGenCXX/module-intializer.cpp
+++ b/clang/test/CodeGenCXX/module-intializer.cpp
@@ -4,43 +4,43 @@
 
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 N.cpp \
 // RUN:    -emit-module-interface -o N.pcm
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 N.pcm -S -emit-llvm \
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 N.pcm -emit-llvm \
 // RUN:  -o - | FileCheck %s --check-prefix=CHECK-N
 
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 O.cpp \
 // RUN:    -emit-module-interface -o O.pcm
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 O.pcm -S -emit-llvm \
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 O.pcm -emit-llvm \
 // RUN:  -o - | FileCheck %s --check-prefix=CHECK-O
 
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 M-Part.cpp \
 // RUN:    -emit-module-interface -o M-Part.pcm
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 M-Part.pcm -S \
-// RUN:    -emit-module-interface  -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-P
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 M-Part.pcm \
+// RUN:    -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-P
 
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 M.cpp \
 // RUN:    -fprebuilt-module-path=%t -emit-module-interface -o M.pcm
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 M.pcm -S -emit-llvm \
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 M.pcm -emit-llvm \
 // RUN:    -fprebuilt-module-path=%t -o - | FileCheck %s --check-prefix=CHECK-M
 
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 useM.cpp \
-// RUN:   -fprebuilt-module-path=%t -S -emit-llvm  -o - \
+// RUN:   -fprebuilt-module-path=%t -emit-llvm -o - \
 // RUN:   | FileCheck %s --check-prefix=CHECK-USE
 
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 M-impl.cpp \
-// RUN:   -fprebuilt-module-path=%t -S -emit-llvm  -o - \
+// RUN:   -fprebuilt-module-path=%t -emit-llvm -o - \
 // RUN:   | FileCheck %s --check-prefix=CHECK-IMPL
 
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 N.cpp -S -emit-llvm \
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 N.cpp -emit-llvm \
 // RUN:   -o - | FileCheck %s --check-prefix=CHECK-N
 
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 O.cpp -S -emit-llvm \
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 O.cpp -emit-llvm \
 // RUN:   -o - | FileCheck %s --check-prefix=CHECK-O
 
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 M-Part.cpp -S -emit-llvm \
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 M-Part.cpp -emit-llvm \
 // RUN:   -o - | FileCheck %s --check-prefix=CHECK-P
 
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 M.cpp \
-// RUN:   -fprebuilt-module-path=%t -S -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-M
+// RUN:   -fprebuilt-module-path=%t -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-M
 
 //--- N-h.h
 
diff --git a/clang/test/CodeGenCXX/msabi-blocks.cpp b/clang/test/CodeGenCXX/msabi-blocks.cpp
index 02d0958f635c..dc30bdd99f7b 100644
--- a/clang/test/CodeGenCXX/msabi-blocks.cpp
+++ b/clang/test/CodeGenCXX/msabi-blocks.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -std=c++11 -fblocks -S -o - -emit-llvm %s | FileCheck %s -check-prefix CHECK-X86
-// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -std=c++11 -fblocks -S -o - -emit-llvm %s | FileCheck %s -check-prefix CHECK-X64
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -std=c++11 -fblocks -o - -emit-llvm %s | FileCheck %s -check-prefix CHECK-X86
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -std=c++11 -fblocks -o - -emit-llvm %s | FileCheck %s -check-prefix CHECK-X64
 
 extern int e(void);
 
diff --git a/clang/test/CodeGenCXX/partial-init.cpp b/clang/test/CodeGenCXX/partial-init.cpp
index cb94660915ca..232a384c3e75 100644
--- a/clang/test/CodeGenCXX/partial-init.cpp
+++ b/clang/test/CodeGenCXX/partial-init.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -std=c++11 -fcxx-exceptions -fexceptions -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -std=c++11 -fcxx-exceptions -fexceptions -o - %s | FileCheck %s
 
 namespace std {
   struct string {
diff --git a/clang/test/CodeGenCXX/partitions.cpp b/clang/test/CodeGenCXX/partitions.cpp
index 3b3e69271e7c..d283dd071f6b 100644
--- a/clang/test/CodeGenCXX/partitions.cpp
+++ b/clang/test/CodeGenCXX/partitions.cpp
@@ -6,11 +6,11 @@
 // RUN: %clang_cc1 -std=c++20 -emit-module-interface -triple %itanium_abi_triple %t/partb.cppm -o %t/mod-partb.pcm
 // RUN: %clang_cc1 -std=c++20 -emit-module-interface -triple %itanium_abi_triple %t/mod.cppm \
 // RUN:   -fprebuilt-module-path=%t -o %t/mod.pcm
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/mod.pcm -S -emit-llvm -disable-llvm-passes -o - \
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/mod.pcm -emit-llvm -disable-llvm-passes -o - \
 // RUN:   -fprebuilt-module-path=%t | FileCheck %t/mod.cppm
 // RUN: %clang_cc1 -std=c++20 -O2 -emit-module-interface -triple %itanium_abi_triple \
 // RUN:   -fprebuilt-module-path=%t %t/mod.cppm -o %t/mod.pcm
-// RUN: %clang_cc1 -std=c++20 -O2 -triple %itanium_abi_triple %t/mod.pcm -S -emit-llvm \
+// RUN: %clang_cc1 -std=c++20 -O2 -triple %itanium_abi_triple %t/mod.pcm -emit-llvm \
 // RUN:   -fprebuilt-module-path=%t -disable-llvm-passes -o - | FileCheck %t/mod.cppm  -check-prefix=CHECK-OPT
 
 //--- parta.cppm
diff --git a/clang/test/CodeGenCXX/personality.cpp b/clang/test/CodeGenCXX/personality.cpp
index 42ba2c227f02..21a477fb705f 100644
--- a/clang/test/CodeGenCXX/personality.cpp
+++ b/clang/test/CodeGenCXX/personality.cpp
@@ -1,22 +1,22 @@
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU-DWARF
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU-SEH
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU-SJLJ
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU-DWARF
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU-SEH
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU-SJLJ
 
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -D __SEH_EXCEPTIONS__ -fms-extensions -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-SEH-X86
-// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -D __SEH_EXCEPTIONS__ -fms-extensions -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-SEH-X64
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -D __SEH_EXCEPTIONS__ -fms-extensions -fexceptions -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-SEH-X86
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -D __SEH_EXCEPTIONS__ -fms-extensions -fexceptions -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-SEH-X64
 
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU-DWARF
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU-SEH
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU-SJLJ
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU-DWARF
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU-SEH
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU-SJLJ
 
-// RUN: %clang_cc1 -triple powerpc-unknown-aix-xcoff -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-AIX
-// RUN: %clang_cc1 -triple powerpc64-unknown-aix-xcoff -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-AIX
+// RUN: %clang_cc1 -triple powerpc-unknown-aix-xcoff -fexceptions -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-AIX
+// RUN: %clang_cc1 -triple powerpc64-unknown-aix-xcoff -fexceptions -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-AIX
 
-// RUN: %clang_cc1 -triple s390x-unknown-zos -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ZOS
-// RUN: %clang_cc1 -triple systemz-unknown-zos -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ZOS
+// RUN: %clang_cc1 -triple s390x-unknown-zos -fexceptions -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ZOS
+// RUN: %clang_cc1 -triple systemz-unknown-zos -fexceptions -fcxx-exceptions -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ZOS
 
 extern void g();
 
diff --git a/clang/test/CodeGenCXX/poly-unsigned.cpp b/clang/test/CodeGenCXX/poly-unsigned.cpp
index 31970e495777..1ffcdda915bf 100644
--- a/clang/test/CodeGenCXX/poly-unsigned.cpp
+++ b/clang/test/CodeGenCXX/poly-unsigned.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -target-feature +neon -ffreestanding -S -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-UNSIGNED-POLY %s
-// RUN: %clang_cc1 -triple arm64-linux-gnu -target-feature +neon -ffreestanding -S -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-UNSIGNED-POLY %s
-// RUN: %clang_cc1 -triple armv7-apple-ios -ffreestanding -target-cpu cortex-a8 -S -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-SIGNED-POLY %s
+// RUN: %clang_cc1 -triple arm64-apple-ios -target-feature +neon -ffreestanding -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-UNSIGNED-POLY %s
+// RUN: %clang_cc1 -triple arm64-linux-gnu -target-feature +neon -ffreestanding -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-UNSIGNED-POLY %s
+// RUN: %clang_cc1 -triple armv7-apple-ios -ffreestanding -target-cpu cortex-a8 -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-SIGNED-POLY %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
diff --git a/clang/test/CodeGenCXX/pr29160.cpp b/clang/test/CodeGenCXX/pr29160.cpp
index 09cf2a637072..c2c32f98f917 100644
--- a/clang/test/CodeGenCXX/pr29160.cpp
+++ b/clang/test/CodeGenCXX/pr29160.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -triple i686-linux-gnu %s -o /dev/null -S -emit-llvm
+// RUN: %clang_cc1 -std=c++11 -triple i686-linux-gnu %s -o /dev/null -emit-llvm
 //
 // This test's failure mode is running ~forever. (For some value of "forever"
 // that's greater than 25 minutes on my machine)
diff --git a/clang/test/CodeGenCXX/static-init-4.cpp b/clang/test/CodeGenCXX/static-init-4.cpp
index e482a79fc001..c0adac4afafe 100644
--- a/clang/test/CodeGenCXX/static-init-4.cpp
+++ b/clang/test/CodeGenCXX/static-init-4.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -S -emit-llvm -o - %s -triple x86_64-linux-gnu | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -emit-llvm -o - %s -triple x86_64-linux-gnu | FileCheck %s
 
 typedef __attribute__((vector_size(4*4))) float float32x4_t;
 union QDSUnion { float32x4_t q; float s[4]; };
diff --git a/clang/test/CodeGenCXX/static-init-inline-variable.cpp b/clang/test/CodeGenCXX/static-init-inline-variable.cpp
index 873e32544c7e..77aed61431cc 100644
--- a/clang/test/CodeGenCXX/static-init-inline-variable.cpp
+++ b/clang/test/CodeGenCXX/static-init-inline-variable.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++17 -S -emit-llvm -disable-llvm-passes -o - %s -triple x86_64-linux-gnu | FileCheck %s
+// RUN: %clang_cc1 -std=c++17 -emit-llvm -disable-llvm-passes -o - %s -triple x86_64-linux-gnu | FileCheck %s
 
 struct A {
   int x;
diff --git a/clang/test/CodeGenCXX/static-init-variable-template.cpp b/clang/test/CodeGenCXX/static-init-variable-template.cpp
index 672ab11059a3..b2a2cb58013d 100644
--- a/clang/test/CodeGenCXX/static-init-variable-template.cpp
+++ b/clang/test/CodeGenCXX/static-init-variable-template.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++14 -S -emit-llvm -disable-llvm-passes -o - %s -triple x86_64-linux-gnu | FileCheck %s
+// RUN: %clang_cc1 -std=c++14 -emit-llvm -disable-llvm-passes -o - %s -triple x86_64-linux-gnu | FileCheck %s
 
 template<int N> int Fib = Fib<N-2> + Fib<N-1>;
 template<> int Fib<0> = 0;
diff --git a/clang/test/CodeGenCXX/this-nonnull.cpp b/clang/test/CodeGenCXX/this-nonnull.cpp
index d1f8e0b2e34a..975a42291e2b 100644
--- a/clang/test/CodeGenCXX/this-nonnull.cpp
+++ b/clang/test/CodeGenCXX/this-nonnull.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -S -emit-llvm -o - -triple=x86_64-linux-gnu %s | FileCheck %s -check-prefix=CHECK-YES
-// RUN: %clang_cc1 -S -emit-llvm -o - -fno-delete-null-pointer-checks -triple=x86_64-linux-gnu %s | FileCheck %s -check-prefix=CHECK-NO
+// RUN: %clang_cc1 -emit-llvm -o - -triple=x86_64-linux-gnu %s | FileCheck %s -check-prefix=CHECK-YES
+// RUN: %clang_cc1 -emit-llvm -o - -fno-delete-null-pointer-checks -triple=x86_64-linux-gnu %s | FileCheck %s -check-prefix=CHECK-NO
 
 struct Struct {
   int many;
diff --git a/clang/test/CodeGenCXX/tls-init-funcs.cpp b/clang/test/CodeGenCXX/tls-init-funcs.cpp
index 5f66f6f95259..e98b425b2670 100644
--- a/clang/test/CodeGenCXX/tls-init-funcs.cpp
+++ b/clang/test/CodeGenCXX/tls-init-funcs.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.8 -std=c++1y -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-windows-gnu -std=c++1y -S -emit-llvm %s -o - | FileCheck %s --check-prefix=MINGW
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.8 -std=c++1y -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -std=c++1y -emit-llvm %s -o - | FileCheck %s --check-prefix=MINGW
 
 // CHECK: @a = internal thread_local global
 // CHECK: @_Z2vtIiE = linkonce_odr thread_local global i32 5
diff --git a/clang/test/CodeGenCXX/tmp-md-nodes1.cpp b/clang/test/CodeGenCXX/tmp-md-nodes1.cpp
index 41a0159b0fc4..524b2c08c1ad 100644
--- a/clang/test/CodeGenCXX/tmp-md-nodes1.cpp
+++ b/clang/test/CodeGenCXX/tmp-md-nodes1.cpp
@@ -1,5 +1,5 @@
 // REQUIRES: asserts
-// RUN: %clang_cc1 -O0 -triple %itanium_abi_triple -debug-info-kind=limited -S -emit-llvm %s -o - | \
+// RUN: %clang_cc1 -O0 -triple %itanium_abi_triple -debug-info-kind=limited -emit-llvm %s -o - | \
 // RUN: FileCheck %s
 
 // This test simply checks that the varargs thunk is created. The failing test
diff --git a/clang/test/CodeGenCXX/tmp-md-nodes2.cpp b/clang/test/CodeGenCXX/tmp-md-nodes2.cpp
index e50220cfb7c3..e5081884447b 100644
--- a/clang/test/CodeGenCXX/tmp-md-nodes2.cpp
+++ b/clang/test/CodeGenCXX/tmp-md-nodes2.cpp
@@ -1,5 +1,7 @@
 // REQUIRES: asserts
-// RUN: %clang_cc1 -O0 -triple %itanium_abi_triple -debug-info-kind=limited -S -emit-llvm %s -o - | \
+// RUN: %clang_cc1 -O0 -triple %itanium_abi_triple -debug-info-kind=limited -emit-llvm %s -o - | \
+// RUN: FileCheck %s
+// RUN: %clang_cc1 -O0 -triple %itanium_abi_triple -debug-info-kind=limited -emit-llvm -mllvm --experimental-debuginfo-iterators=true %s -o - | \
 // RUN: FileCheck %s
 
 // This test simply checks that the varargs thunk is created. The failing test
diff --git a/clang/test/CodeGenCXX/ubsan-new-checks.cpp b/clang/test/CodeGenCXX/ubsan-new-checks.cpp
index 352f56b734de..60edd323648a 100644
--- a/clang/test/CodeGenCXX/ubsan-new-checks.cpp
+++ b/clang/test/CodeGenCXX/ubsan-new-checks.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++11 -S -emit-llvm -fsanitize=alignment %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++11 -emit-llvm -fsanitize=alignment %s -o - | FileCheck %s
 
 struct alignas(32) S1 {
   int x;
diff --git a/clang/test/CodeGenCXX/union-dtor.cpp b/clang/test/CodeGenCXX/union-dtor.cpp
index a0b822aa54dd..6b742a83f821 100644
--- a/clang/test/CodeGenCXX/union-dtor.cpp
+++ b/clang/test/CodeGenCXX/union-dtor.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 %s -S -o - -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 %s -o - -emit-llvm | FileCheck %s
 
 // PR10304: destructors should not call destructors for variant members.
 
diff --git a/clang/test/CodeGenCXX/visibility-dllstorageclass.cpp b/clang/test/CodeGenCXX/visibility-dllstorageclass.cpp
index 3ea00bd48899..eecdb4e35dc1 100644
--- a/clang/test/CodeGenCXX/visibility-dllstorageclass.cpp
+++ b/clang/test/CodeGenCXX/visibility-dllstorageclass.cpp
@@ -7,7 +7,7 @@
 // RUN:     -fvisibility=hidden \
 // RUN:     -fapply-global-visibility-to-externs \
 // RUN:     -fvisibility-from-dllstorageclass \
-// RUN:     -x c++ %s -S -emit-llvm -o - | \
+// RUN:     -x c++ %s -emit-llvm -o - | \
 // RUN:   FileCheck %s --check-prefixes=DEFAULTS
 
 // RUN: %clang_cc1 -triple x86_64-unknown-windows-itanium -fdeclspec \
@@ -18,7 +18,7 @@
 // RUN:     -fvisibility-nodllstorageclass=protected \
 // RUN:     -fvisibility-externs-dllimport=hidden \
 // RUN:     -fvisibility-externs-nodllstorageclass=protected \
-// RUN:     -x c++  %s -S -emit-llvm -o - | \
+// RUN:     -x c++  %s -emit-llvm -o - | \
 // RUN:   FileCheck %s --check-prefixes=EXPLICIT
 
 // RUN: %clang_cc1 -triple x86_64-unknown-windows-itanium -fdeclspec \
@@ -29,7 +29,7 @@
 // RUN:     -fvisibility-nodllstorageclass=default \
 // RUN:     -fvisibility-externs-dllimport=default \
 // RUN:     -fvisibility-externs-nodllstorageclass=default \
-// RUN:     -x c++  %s -S -emit-llvm -o - | \
+// RUN:     -x c++  %s -emit-llvm -o - | \
 // RUN:   FileCheck %s --check-prefixes=ALL_DEFAULT
 
 // RUN: %clang_cc1 -triple x86_64-unknown-windows-itanium -fdeclspec \
@@ -40,7 +40,7 @@
 // RUN:     -fvisibility-nodllstorageclass=keep \
 // RUN:     -fvisibility-externs-dllimport=keep \
 // RUN:     -fvisibility-externs-nodllstorageclass=keep \
-// RUN:     -x c++  %s -S -emit-llvm -o - | \
+// RUN:     -x c++  %s -emit-llvm -o - | \
 // RUN:   FileCheck %s --check-prefixes=ALL_KEEP
 
 // Local
diff --git a/clang/test/CodeGenCXX/weak-external.cpp b/clang/test/CodeGenCXX/weak-external.cpp
index 5fc37f73bb46..5eb262cdbead 100644
--- a/clang/test/CodeGenCXX/weak-external.cpp
+++ b/clang/test/CodeGenCXX/weak-external.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple %itanium_abi_triple %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple %itanium_abi_triple %s -emit-llvm -o - | FileCheck %s
 // PR4262
 
 // CHECK-NOT: _ZNSs12_S_constructIPKcEEPcT_S3_RKSaIcESt20forward_iterator_tag
diff --git a/clang/test/CodeGenCXX/windows-on-arm-stack-probe-size.cpp b/clang/test/CodeGenCXX/windows-on-arm-stack-probe-size.cpp
index 235d8a0c465f..850323183fba 100644
--- a/clang/test/CodeGenCXX/windows-on-arm-stack-probe-size.cpp
+++ b/clang/test/CodeGenCXX/windows-on-arm-stack-probe-size.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple thumbv7--windows-msvc -S -emit-llvm -o - -x c++ %s | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv7--windows-itanium -fno-use-cxa-atexit -S -emit-llvm -o - -x c++ %s | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv7--windows-msvc -emit-llvm -o - -x c++ %s | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv7--windows-itanium -fno-use-cxa-atexit -emit-llvm -o - -x c++ %s | FileCheck %s
 
 class C {
 public:
diff --git a/clang/test/CodeGenCoroutines/coro-aligned-alloc-2.cpp b/clang/test/CodeGenCoroutines/coro-aligned-alloc-2.cpp
index a3ee964a22a1..21c2e45b890f 100644
--- a/clang/test/CodeGenCoroutines/coro-aligned-alloc-2.cpp
+++ b/clang/test/CodeGenCoroutines/coro-aligned-alloc-2.cpp
@@ -2,7 +2,7 @@
 // Test the compiler will chose sized deallocation correctly.
 // This is only enabled with `-fsized-deallocation` which is off by default.
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 \
-// RUN:   -fcoro-aligned-allocation -S -emit-llvm %s -o - -disable-llvm-passes \
+// RUN:   -fcoro-aligned-allocation -emit-llvm %s -o - -disable-llvm-passes \
 // RUN:   -fsized-deallocation \
 // RUN:   | FileCheck %s
 
diff --git a/clang/test/CodeGenCoroutines/coro-aligned-alloc.cpp b/clang/test/CodeGenCoroutines/coro-aligned-alloc.cpp
index d14c3d372ddb..8019926b730c 100644
--- a/clang/test/CodeGenCoroutines/coro-aligned-alloc.cpp
+++ b/clang/test/CodeGenCoroutines/coro-aligned-alloc.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 \
-// RUN:   -fcoro-aligned-allocation -S -emit-llvm %s -o - -disable-llvm-passes \
+// RUN:   -fcoro-aligned-allocation -emit-llvm %s -o - -disable-llvm-passes \
 // RUN:   | FileCheck %s
 
 #include "Inputs/coroutine.h"
diff --git a/clang/test/CodeGenCoroutines/coro-alloc-2.cpp b/clang/test/CodeGenCoroutines/coro-alloc-2.cpp
index 8f4b8c5241f6..9c60c32a5c54 100644
--- a/clang/test/CodeGenCoroutines/coro-alloc-2.cpp
+++ b/clang/test/CodeGenCoroutines/coro-alloc-2.cpp
@@ -1,5 +1,5 @@
 // Tests that we wouldn't generate an allocation call in global scope with (std::size_t, p0, ..., pn)
-// RUN: %clang_cc1 %s -std=c++20 -S -triple x86_64-unknown-linux-gnu -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s
+// RUN: %clang_cc1 %s -std=c++20 -triple x86_64 -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s
 #include "Inputs/coroutine.h"
 
 namespace std {
diff --git a/clang/test/CodeGenCoroutines/coro-dealloc.cpp b/clang/test/CodeGenCoroutines/coro-dealloc.cpp
index 1f7d04b3689e..3cdba6cafdc0 100644
--- a/clang/test/CodeGenCoroutines/coro-dealloc.cpp
+++ b/clang/test/CodeGenCoroutines/coro-dealloc.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 \
-// RUN:   -S -emit-llvm %s -o - -disable-llvm-passes \
+// RUN:   -emit-llvm %s -o - -disable-llvm-passes \
 // RUN:   -fsized-deallocation \
 // RUN:   | FileCheck %s
 
diff --git a/clang/test/CodeGenCoroutines/coro-destructor-of-final_suspend.cpp b/clang/test/CodeGenCoroutines/coro-destructor-of-final_suspend.cpp
index 079a0d723eb1..bbc16a196d30 100644
--- a/clang/test/CodeGenCoroutines/coro-destructor-of-final_suspend.cpp
+++ b/clang/test/CodeGenCoroutines/coro-destructor-of-final_suspend.cpp
@@ -1,6 +1,6 @@
 // This addresses https://github.com/llvm/llvm-project/issues/57339
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 -fcxx-exceptions \
-// RUN:     -fexceptions -S -emit-llvm -o - %s -O1 | FileCheck %s
+// RUN:     -fexceptions -emit-llvm -o - %s -O1 | FileCheck %s
 
 #include "Inputs/coroutine.h"
 
diff --git a/clang/test/CodeGenCoroutines/coro-dwarf.cpp b/clang/test/CodeGenCoroutines/coro-dwarf.cpp
index f951b63dc117..0ab70ef55c1d 100644
--- a/clang/test/CodeGenCoroutines/coro-dwarf.cpp
+++ b/clang/test/CodeGenCoroutines/coro-dwarf.cpp
@@ -71,14 +71,10 @@ void f_coro(int val, MoveOnly moParam, MoveAndCopy mcParam) {
 // CHECK: !{{[0-9]+}} = !DILocalVariable(name: "mcParam", arg: 3, scope: ![[SP]], file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}})
 // CHECK: !{{[0-9]+}} = !DILocalVariable(name: "__promise",
 
-// CHECK: !{{[0-9]+}} = distinct !DISubprogram(linkageName: "_Z6f_coroi8MoveOnly11MoveAndCopy.__await_suspend_wrapper__init"
-// CHECK-NEXT: !{{[0-9]+}} = !DIFile
-// CHECK-NEXT: !{{[0-9]+}} = !DISubroutineType
-// CHECK-NEXT: !{{[0-9]+}} = !DILocalVariable(arg: 1,
-// CHECK-NEXT: !{{[0-9]+}} = !DILocation
-// CHECK-NEXT: !{{[0-9]+}} = !DILocalVariable(arg: 2,
+// CHECK: ![[INIT:[0-9]+]] = distinct !DISubprogram(linkageName: "_Z6f_coroi8MoveOnly11MoveAndCopy.__await_suspend_wrapper__init"
+// CHECK: !{{[0-9]+}} = !DILocalVariable(arg: 1, scope: ![[INIT]]
+// CHECK: !{{[0-9]+}} = !DILocalVariable(arg: 2, scope: ![[INIT]]
 
-// CHECK: !{{[0-9]+}} = distinct !DISubprogram(linkageName: "_Z6f_coroi8MoveOnly11MoveAndCopy.__await_suspend_wrapper__final"
-// CHECK-NEXT: !{{[0-9]+}} = !DILocalVariable(arg: 1,
-// CHECK-NEXT: !{{[0-9]+}} = !DILocation
-// CHECK-NEXT: !{{[0-9]+}} = !DILocalVariable(arg: 2,
+// CHECK: ![[FINAL:[0-9]+]] = distinct !DISubprogram(linkageName: "_Z6f_coroi8MoveOnly11MoveAndCopy.__await_suspend_wrapper__final"
+// CHECK: !{{[0-9]+}} = !DILocalVariable(arg: 1, scope: ![[FINAL]]
+// CHECK: !{{[0-9]+}} = !DILocalVariable(arg: 2, scope: ![[FINAL]]
diff --git a/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp b/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp
deleted file mode 100644
index 5b2d01465784..000000000000
--- a/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-// REQUIRES: x86_64-linux
-// This tests that the coroutine elide optimization could happen succesfully with ThinLTO.
-// This test is adapted from coro-elide.cpp and splits functions into two files.
-//
-// RUN: split-file %s %t
-// RUN: %clang --target=x86_64-linux -std=c++20 -O2 -flto=thin -I %S -c %t/coro-elide-callee.cpp -o %t/coro-elide-callee.o
-// RUN: %clang --target=x86_64-linux -std=c++20 -O2 -flto=thin -I %S -c %t/coro-elide-caller.cpp -o %t/coro-elide-caller.o
-// RUN: llvm-lto -thinlto %t/coro-elide-callee.o %t/coro-elide-caller.o -o %t/summary
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -O2 -x ir %t/coro-elide-caller.o -fthinlto-index=%t/summary.thinlto.bc -emit-llvm -o - | FileCheck %s
-
-//--- coro-elide-task.h
-#pragma once
-#include "Inputs/coroutine.h"
-
-struct Task {
-  struct promise_type {
-    struct FinalAwaiter {
-      bool await_ready() const noexcept { return false; }
-      template <typename PromiseType>
-      std::coroutine_handle<> await_suspend(std::coroutine_handle<PromiseType> h) noexcept {
-        if (!h)
-          return std::noop_coroutine();
-        return h.promise().continuation;
-      }
-      void await_resume() noexcept {}
-    };
-    Task get_return_object() noexcept {
-      return std::coroutine_handle<promise_type>::from_promise(*this);
-    }
-    std::suspend_always initial_suspend() noexcept { return {}; }
-    FinalAwaiter final_suspend() noexcept { return {}; }
-    void unhandled_exception() noexcept {}
-    void return_value(int x) noexcept {
-      _value = x;
-    }
-    std::coroutine_handle<> continuation;
-    int _value;
-  };
-
-  Task(std::coroutine_handle<promise_type> handle) : handle(handle) {}
-  ~Task() {
-    if (handle)
-      handle.destroy();
-  }
-
-  struct Awaiter {
-    bool await_ready() const noexcept { return false; }
-    void await_suspend(std::coroutine_handle<void> continuation) noexcept {}
-    int await_resume() noexcept {
-      return 43;
-    }
-  };
-
-  auto operator co_await() {
-    return Awaiter{};
-  }
-
-private:
-  std::coroutine_handle<promise_type> handle;
-};
-
-//--- coro-elide-callee.cpp
-#include "coro-elide-task.h"
-Task task0() {
-  co_return 43;
-}
-
-//--- coro-elide-caller.cpp
-#include "coro-elide-task.h"
-
-Task task0();
-
-Task task1() {
-  co_return co_await task0();
-}
-
-// CHECK-LABEL: define{{.*}} void @_Z5task1v.resume
-// CHECK-NOT: {{.*}}_Znwm
diff --git a/clang/test/CodeGenCoroutines/pr56329.cpp b/clang/test/CodeGenCoroutines/pr56329.cpp
index ad8b1b990179..d00022a6c4de 100644
--- a/clang/test/CodeGenCoroutines/pr56329.cpp
+++ b/clang/test/CodeGenCoroutines/pr56329.cpp
@@ -1,6 +1,6 @@
 // Test for PR56919. Tests the we won't contain the resumption of final suspend point.
 //
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %s -O3 -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %s -O3 -emit-llvm -o - | FileCheck %s
 // This test is expected to fail on PowerPC.
 // XFAIL: target=powerpc{{.*}}
 
diff --git a/clang/test/CodeGenCoroutines/pr59221.cpp b/clang/test/CodeGenCoroutines/pr59221.cpp
index e0e3de559a40..c27afa260316 100644
--- a/clang/test/CodeGenCoroutines/pr59221.cpp
+++ b/clang/test/CodeGenCoroutines/pr59221.cpp
@@ -2,7 +2,7 @@
 //
 // REQUIRES: x86-registered-target
 //
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 %s -O1 -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 %s -O1 -emit-llvm -o - | FileCheck %s
 
 #include "Inputs/coroutine.h"
 
diff --git a/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl b/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl
index d65dec4a1ddf..f954c9d2f029 100644
--- a/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl
+++ b/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -S -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s
 
 int i;
 
diff --git a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl
index e7fe4e0c4caf..2c5c4e19c329 100644
--- a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl
+++ b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -S -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s
 
 // Make sure global variable for ctors exist for lib profile.
 // CHECK:@llvm.global_ctors
diff --git a/clang/test/CodeGenHLSL/GlobalConstructors.hlsl b/clang/test/CodeGenHLSL/GlobalConstructors.hlsl
index 332302e1a8bb..7e2f288726c9 100644
--- a/clang/test/CodeGenHLSL/GlobalConstructors.hlsl
+++ b/clang/test/CodeGenHLSL/GlobalConstructors.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -S -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s
 
 RWBuffer<float> Buffer;
 
diff --git a/clang/test/CodeGenHLSL/GlobalDestructors.hlsl b/clang/test/CodeGenHLSL/GlobalDestructors.hlsl
index b245af7c0f7b..24c3c039fc61 100644
--- a/clang/test/CodeGenHLSL/GlobalDestructors.hlsl
+++ b/clang/test/CodeGenHLSL/GlobalDestructors.hlsl
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -S -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CS,CHECK
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -std=hlsl202x -S -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=LIB,CHECK
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CS,CHECK
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -std=hlsl202x -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=LIB,CHECK
 
 // Make sure global variable for dtors exist for lib profile.
 // LIB:@llvm.global_dtors
diff --git a/clang/test/CodeGenHLSL/basic-target.c b/clang/test/CodeGenHLSL/basic-target.c
index 8db711c3f2a5..b97ebf90a7a1 100644
--- a/clang/test/CodeGenHLSL/basic-target.c
+++ b/clang/test/CodeGenHLSL/basic-target.c
@@ -7,4 +7,4 @@
 // RUN: %clang -target dxil-pc-shadermodel6.0-geometry -S -emit-llvm -o - %s | FileCheck %s
 
 // CHECK: target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
-// CHECK: target triple = "dxil-pc-shadermodel6.0-{{[a-z]+}}"
+// CHECK: target triple = "dxilv1.0-pc-shadermodel6.0-{{[a-z]+}}"
diff --git a/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl b/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl
index 36f71f6860c0..6395ddc2fee2 100644
--- a/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl
@@ -55,7 +55,7 @@ vector<uint64_t,4> FillOneUnsignedLong(){
 // CHECK: [[vec2:%.*]] = shufflevector <1 x double> [[vec1]], <1 x double> poison, <2 x i32> zeroinitializer
 // CHECK: ret <2 x double> [[vec2]]
 double2 FillTwoPointFive(){
-  return 2.5.rr;
+  return 2.5l.rr;
 }
 
 // CHECK-LABEL: FillOneHalf
@@ -65,7 +65,7 @@ double2 FillTwoPointFive(){
 // CHECK: [[vec3:%.*]] = shufflevector <1 x double> [[vec1]], <1 x double> poison, <3 x i32> zeroinitializer
 // CHECK: ret <3 x double> [[vec3]]
 double3 FillOneHalf(){
-  return .5.rrr;
+  return .5l.rrr;
 }
 
 // CHECK-LABEL: FillTwoPointFiveFloat
@@ -110,7 +110,7 @@ float2 HowManyFloats(float V) {
   return V.rr.rr;
 }
 
-// This codegen is gnarly because `1.` is a double, so this creates double
+// This codegen is gnarly because `1.l` is a double, so this creates double
 // vectors that need to be truncated down to floats. The optimizer cleans this
 // up nicely too.
 
@@ -123,6 +123,17 @@ float2 HowManyFloats(float V) {
 // CHECK: ret <3 x float> [[vec3f]]
 
 float3 AllRighty() {
+  return 1.l.rrr;
+}
+
+// CHECK-LABEL: AllRighty2
+// CHECK: [[vec1Ptr:%.*]] = alloca <1 x float>, align 4
+// CHECK: store <1 x float> <float 1.000000e+00>, ptr [[vec1Ptr]], align 4
+// CHECK: [[vec1:%.*]] = load <1 x float>, ptr [[vec1Ptr]], align 4
+// CHECK: [[vec3:%.*]] = shufflevector <1 x float> [[vec1]], <1 x float> poison, <3 x i32>
+// CHECK: ret <3 x float> [[vec3]]
+
+float3 AllRighty2() {
   return 1..rrr;
 }
 
diff --git a/clang/test/CodeGenHLSL/builtins/clamp.hlsl b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
index 029e48ffe258..186114581e9c 100644
--- a/clang/test/CodeGenHLSL/builtins/clamp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
@@ -1,134 +1,134 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
-
-#ifdef __HLSL_ENABLE_16_BIT
-// NATIVE_HALF: define noundef i16 @
-// NATIVE_HALF: call i16 @llvm.dx.clamp.i16(
-int16_t test_clamp_short(int16_t p0, int16_t p1) { return clamp(p0, p1,p1); }
-// NATIVE_HALF: define noundef <2 x i16> @
-// NATIVE_HALF: call <2 x i16> @llvm.dx.clamp.v2i16(
-int16_t2 test_clamp_short2(int16_t2 p0, int16_t2 p1) { return clamp(p0, p1,p1); }
-// NATIVE_HALF: define noundef <3 x i16> @
-// NATIVE_HALF: call <3 x i16> @llvm.dx.clamp.v3i16
-int16_t3 test_clamp_short3(int16_t3 p0, int16_t3 p1) { return clamp(p0, p1,p1); }
-// NATIVE_HALF: define noundef <4 x i16> @
-// NATIVE_HALF: call <4 x i16> @llvm.dx.clamp.v4i16
-int16_t4 test_clamp_short4(int16_t4 p0, int16_t4 p1) { return clamp(p0, p1,p1); }
-
-// NATIVE_HALF: define noundef i16 @
-// NATIVE_HALF: call i16 @llvm.dx.uclamp.i16(
-uint16_t test_clamp_ushort(uint16_t p0, uint16_t p1) { return clamp(p0, p1,p1); }
-// NATIVE_HALF: define noundef <2 x i16> @
-// NATIVE_HALF: call <2 x i16> @llvm.dx.uclamp.v2i16
-uint16_t2 test_clamp_ushort2(uint16_t2 p0, uint16_t2 p1) { return clamp(p0, p1,p1); }
-// NATIVE_HALF: define noundef <3 x i16> @
-// NATIVE_HALF: call <3 x i16> @llvm.dx.uclamp.v3i16
-uint16_t3 test_clamp_ushort3(uint16_t3 p0, uint16_t3 p1) { return clamp(p0, p1,p1); }
-// NATIVE_HALF: define noundef <4 x i16> @
-// NATIVE_HALF: call <4 x i16> @llvm.dx.uclamp.v4i16
-uint16_t4 test_clamp_ushort4(uint16_t4 p0, uint16_t4 p1) { return clamp(p0, p1,p1); }
-#endif
-
-// CHECK: define noundef i32 @
-// CHECK: call i32 @llvm.dx.clamp.i32(
-int test_clamp_int(int p0, int p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <2 x i32> @
-// CHECK: call <2 x i32> @llvm.dx.clamp.v2i32
-int2 test_clamp_int2(int2 p0, int2 p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <3 x i32> @
-// CHECK: call <3 x i32> @llvm.dx.clamp.v3i32
-int3 test_clamp_int3(int3 p0, int3 p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <4 x i32> @
-// CHECK: call <4 x i32> @llvm.dx.clamp.v4i32
-int4 test_clamp_int4(int4 p0, int4 p1) { return clamp(p0, p1,p1); }
-
-// CHECK: define noundef i32 @
-// CHECK: call i32 @llvm.dx.uclamp.i32(
-int test_clamp_uint(uint p0, uint p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <2 x i32> @
-// CHECK: call <2 x i32> @llvm.dx.uclamp.v2i32
-uint2 test_clamp_uint2(uint2 p0, uint2 p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <3 x i32> @
-// CHECK: call <3 x i32> @llvm.dx.uclamp.v3i32
-uint3 test_clamp_uint3(uint3 p0, uint3 p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <4 x i32> @
-// CHECK: call <4 x i32> @llvm.dx.uclamp.v4i32
-uint4 test_clamp_uint4(uint4 p0, uint4 p1) { return clamp(p0, p1,p1); }
-
-// CHECK: define noundef i64 @
-// CHECK: call i64 @llvm.dx.clamp.i64(
-int64_t test_clamp_long(int64_t p0, int64_t p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <2 x i64> @
-// CHECK: call <2 x i64> @llvm.dx.clamp.v2i64
-int64_t2 test_clamp_long2(int64_t2 p0, int64_t2 p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <3 x i64> @
-// CHECK: call <3 x i64> @llvm.dx.clamp.v3i64
-int64_t3 test_clamp_long3(int64_t3 p0, int64_t3 p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <4 x i64> @
-// CHECK: call <4 x i64> @llvm.dx.clamp.v4i64
-int64_t4 test_clamp_long4(int64_t4 p0, int64_t4 p1) { return clamp(p0, p1,p1); }
-
-// CHECK: define noundef i64 @
-// CHECK: call i64 @llvm.dx.uclamp.i64(
-uint64_t test_clamp_long(uint64_t p0, uint64_t p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <2 x i64> @
-// CHECK: call <2 x i64> @llvm.dx.uclamp.v2i64
-uint64_t2 test_clamp_long2(uint64_t2 p0, uint64_t2 p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <3 x i64> @
-// CHECK: call <3 x i64> @llvm.dx.uclamp.v3i64
-uint64_t3 test_clamp_long3(uint64_t3 p0, uint64_t3 p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <4 x i64> @
-// CHECK: call <4 x i64> @llvm.dx.uclamp.v4i64
-uint64_t4 test_clamp_long4(uint64_t4 p0, uint64_t4 p1) { return clamp(p0, p1,p1); }
-
-// NATIVE_HALF: define noundef half @
-// NATIVE_HALF: call half @llvm.dx.clamp.f16(
-// NO_HALF: define noundef float @"?test_clamp_half
-// NO_HALF: call float @llvm.dx.clamp.f32(
-half test_clamp_half(half p0, half p1) { return clamp(p0, p1,p1); }
-// NATIVE_HALF: define noundef <2 x half> @
-// NATIVE_HALF: call <2 x half> @llvm.dx.clamp.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_clamp_half2
-// NO_HALF: call <2 x float> @llvm.dx.clamp.v2f32(
-half2 test_clamp_half2(half2 p0, half2 p1) { return clamp(p0, p1,p1); }
-// NATIVE_HALF: define noundef <3 x half> @
-// NATIVE_HALF: call <3 x half> @llvm.dx.clamp.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_clamp_half3
-// NO_HALF: call <3 x float> @llvm.dx.clamp.v3f32(
-half3 test_clamp_half3(half3 p0, half3 p1) { return clamp(p0, p1,p1); }
-// NATIVE_HALF: define noundef <4 x half> @
-// NATIVE_HALF: call <4 x half> @llvm.dx.clamp.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_clamp_half4
-// NO_HALF: call <4 x float> @llvm.dx.clamp.v4f32(
-half4 test_clamp_half4(half4 p0, half4 p1) { return clamp(p0, p1,p1); }
-
-// CHECK: define noundef float @"?test_clamp_float
-// CHECK: call float @llvm.dx.clamp.f32(
-float test_clamp_float(float p0, float p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <2 x float> @"?test_clamp_float2
-// CHECK: call <2 x float> @llvm.dx.clamp.v2f32
-float2 test_clamp_float2(float2 p0, float2 p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <3 x float> @"?test_clamp_float3
-// CHECK: call <3 x float> @llvm.dx.clamp.v3f32
-float3 test_clamp_float3(float3 p0, float3 p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <4 x float> @"?test_clamp_float4
-// CHECK: call <4 x float> @llvm.dx.clamp.v4f32
-float4 test_clamp_float4(float4 p0, float4 p1) { return clamp(p0, p1,p1); }
-
-// CHECK: define noundef double @
-// CHECK: call double @llvm.dx.clamp.f64(
-double test_clamp_double(double p0, double p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <2 x double> @
-// CHECK: call <2 x double> @llvm.dx.clamp.v2f64
-double2 test_clamp_double2(double2 p0, double2 p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <3 x double> @
-// CHECK: call <3 x double> @llvm.dx.clamp.v3f64
-double3 test_clamp_double3(double3 p0, double3 p1) { return clamp(p0, p1,p1); }
-// CHECK: define noundef <4 x double> @
-// CHECK: call <4 x double> @llvm.dx.clamp.v4f64
-double4 test_clamp_double4(double4 p0, double4 p1) { return clamp(p0, p1,p1); }
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+#ifdef __HLSL_ENABLE_16_BIT
+// NATIVE_HALF: define noundef i16 @
+// NATIVE_HALF: call i16 @llvm.dx.clamp.i16(
+int16_t test_clamp_short(int16_t p0, int16_t p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <2 x i16> @
+// NATIVE_HALF: call <2 x i16> @llvm.dx.clamp.v2i16(
+int16_t2 test_clamp_short2(int16_t2 p0, int16_t2 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <3 x i16> @
+// NATIVE_HALF: call <3 x i16> @llvm.dx.clamp.v3i16
+int16_t3 test_clamp_short3(int16_t3 p0, int16_t3 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <4 x i16> @
+// NATIVE_HALF: call <4 x i16> @llvm.dx.clamp.v4i16
+int16_t4 test_clamp_short4(int16_t4 p0, int16_t4 p1) { return clamp(p0, p1,p1); }
+
+// NATIVE_HALF: define noundef i16 @
+// NATIVE_HALF: call i16 @llvm.dx.uclamp.i16(
+uint16_t test_clamp_ushort(uint16_t p0, uint16_t p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <2 x i16> @
+// NATIVE_HALF: call <2 x i16> @llvm.dx.uclamp.v2i16
+uint16_t2 test_clamp_ushort2(uint16_t2 p0, uint16_t2 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <3 x i16> @
+// NATIVE_HALF: call <3 x i16> @llvm.dx.uclamp.v3i16
+uint16_t3 test_clamp_ushort3(uint16_t3 p0, uint16_t3 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <4 x i16> @
+// NATIVE_HALF: call <4 x i16> @llvm.dx.uclamp.v4i16
+uint16_t4 test_clamp_ushort4(uint16_t4 p0, uint16_t4 p1) { return clamp(p0, p1,p1); }
+#endif
+
+// CHECK: define noundef i32 @
+// CHECK: call i32 @llvm.dx.clamp.i32(
+int test_clamp_int(int p0, int p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x i32> @
+// CHECK: call <2 x i32> @llvm.dx.clamp.v2i32
+int2 test_clamp_int2(int2 p0, int2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x i32> @
+// CHECK: call <3 x i32> @llvm.dx.clamp.v3i32
+int3 test_clamp_int3(int3 p0, int3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x i32> @
+// CHECK: call <4 x i32> @llvm.dx.clamp.v4i32
+int4 test_clamp_int4(int4 p0, int4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef i32 @
+// CHECK: call i32 @llvm.dx.uclamp.i32(
+int test_clamp_uint(uint p0, uint p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x i32> @
+// CHECK: call <2 x i32> @llvm.dx.uclamp.v2i32
+uint2 test_clamp_uint2(uint2 p0, uint2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x i32> @
+// CHECK: call <3 x i32> @llvm.dx.uclamp.v3i32
+uint3 test_clamp_uint3(uint3 p0, uint3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x i32> @
+// CHECK: call <4 x i32> @llvm.dx.uclamp.v4i32
+uint4 test_clamp_uint4(uint4 p0, uint4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef i64 @
+// CHECK: call i64 @llvm.dx.clamp.i64(
+int64_t test_clamp_long(int64_t p0, int64_t p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x i64> @
+// CHECK: call <2 x i64> @llvm.dx.clamp.v2i64
+int64_t2 test_clamp_long2(int64_t2 p0, int64_t2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x i64> @
+// CHECK: call <3 x i64> @llvm.dx.clamp.v3i64
+int64_t3 test_clamp_long3(int64_t3 p0, int64_t3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x i64> @
+// CHECK: call <4 x i64> @llvm.dx.clamp.v4i64
+int64_t4 test_clamp_long4(int64_t4 p0, int64_t4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef i64 @
+// CHECK: call i64 @llvm.dx.uclamp.i64(
+uint64_t test_clamp_long(uint64_t p0, uint64_t p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x i64> @
+// CHECK: call <2 x i64> @llvm.dx.uclamp.v2i64
+uint64_t2 test_clamp_long2(uint64_t2 p0, uint64_t2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x i64> @
+// CHECK: call <3 x i64> @llvm.dx.uclamp.v3i64
+uint64_t3 test_clamp_long3(uint64_t3 p0, uint64_t3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x i64> @
+// CHECK: call <4 x i64> @llvm.dx.uclamp.v4i64
+uint64_t4 test_clamp_long4(uint64_t4 p0, uint64_t4 p1) { return clamp(p0, p1,p1); }
+
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.dx.clamp.f16(
+// NO_HALF: define noundef float @"?test_clamp_half
+// NO_HALF: call float @llvm.dx.clamp.f32(
+half test_clamp_half(half p0, half p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.dx.clamp.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_clamp_half2
+// NO_HALF: call <2 x float> @llvm.dx.clamp.v2f32(
+half2 test_clamp_half2(half2 p0, half2 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.dx.clamp.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_clamp_half3
+// NO_HALF: call <3 x float> @llvm.dx.clamp.v3f32(
+half3 test_clamp_half3(half3 p0, half3 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.dx.clamp.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_clamp_half4
+// NO_HALF: call <4 x float> @llvm.dx.clamp.v4f32(
+half4 test_clamp_half4(half4 p0, half4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef float @"?test_clamp_float
+// CHECK: call float @llvm.dx.clamp.f32(
+float test_clamp_float(float p0, float p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x float> @"?test_clamp_float2
+// CHECK: call <2 x float> @llvm.dx.clamp.v2f32
+float2 test_clamp_float2(float2 p0, float2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x float> @"?test_clamp_float3
+// CHECK: call <3 x float> @llvm.dx.clamp.v3f32
+float3 test_clamp_float3(float3 p0, float3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x float> @"?test_clamp_float4
+// CHECK: call <4 x float> @llvm.dx.clamp.v4f32
+float4 test_clamp_float4(float4 p0, float4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef double @
+// CHECK: call double @llvm.dx.clamp.f64(
+double test_clamp_double(double p0, double p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x double> @
+// CHECK: call <2 x double> @llvm.dx.clamp.v2f64
+double2 test_clamp_double2(double2 p0, double2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x double> @
+// CHECK: call <3 x double> @llvm.dx.clamp.v3f64
+double3 test_clamp_double3(double3 p0, double3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x double> @
+// CHECK: call <4 x double> @llvm.dx.clamp.v4f64
+double4 test_clamp_double4(double4 p0, double4 p1) { return clamp(p0, p1,p1); }
diff --git a/clang/test/CodeGenHLSL/builtins/cos.hlsl b/clang/test/CodeGenHLSL/builtins/cos.hlsl
index fb416fcaa49d..58b630977881 100644
--- a/clang/test/CodeGenHLSL/builtins/cos.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/cos.hlsl
@@ -1,41 +1,41 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
-
-// NATIVE_HALF: define noundef half @
-// NATIVE_HALF: call half @llvm.cos.f16(
-// NO_HALF: define noundef float @"?test_cos_half
-// NO_HALF: call float @llvm.cos.f32(
-half test_cos_half(half p0) { return cos(p0); }
-// NATIVE_HALF: define noundef <2 x half> @
-// NATIVE_HALF: call <2 x half> @llvm.cos.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_cos_half2
-// NO_HALF: call <2 x float> @llvm.cos.v2f32(
-half2 test_cos_half2(half2 p0) { return cos(p0); }
-// NATIVE_HALF: define noundef <3 x half> @
-// NATIVE_HALF: call <3 x half> @llvm.cos.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_cos_half3
-// NO_HALF: call <3 x float> @llvm.cos.v3f32(
-half3 test_cos_half3(half3 p0) { return cos(p0); }
-// NATIVE_HALF: define noundef <4 x half> @
-// NATIVE_HALF: call <4 x half> @llvm.cos.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_cos_half4
-// NO_HALF: call <4 x float> @llvm.cos.v4f32(
-half4 test_cos_half4(half4 p0) { return cos(p0); }
-
-// CHECK: define noundef float @"?test_cos_float
-// CHECK: call float @llvm.cos.f32(
-float test_cos_float(float p0) { return cos(p0); }
-// CHECK: define noundef <2 x float> @"?test_cos_float2
-// CHECK: call <2 x float> @llvm.cos.v2f32
-float2 test_cos_float2(float2 p0) { return cos(p0); }
-// CHECK: define noundef <3 x float> @"?test_cos_float3
-// CHECK: call <3 x float> @llvm.cos.v3f32
-float3 test_cos_float3(float3 p0) { return cos(p0); }
-// CHECK: define noundef <4 x float> @"?test_cos_float4
-// CHECK: call <4 x float> @llvm.cos.v4f32
-float4 test_cos_float4(float4 p0) { return cos(p0); }
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.cos.f16(
+// NO_HALF: define noundef float @"?test_cos_half
+// NO_HALF: call float @llvm.cos.f32(
+half test_cos_half(half p0) { return cos(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.cos.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_cos_half2
+// NO_HALF: call <2 x float> @llvm.cos.v2f32(
+half2 test_cos_half2(half2 p0) { return cos(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.cos.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_cos_half3
+// NO_HALF: call <3 x float> @llvm.cos.v3f32(
+half3 test_cos_half3(half3 p0) { return cos(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.cos.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_cos_half4
+// NO_HALF: call <4 x float> @llvm.cos.v4f32(
+half4 test_cos_half4(half4 p0) { return cos(p0); }
+
+// CHECK: define noundef float @"?test_cos_float
+// CHECK: call float @llvm.cos.f32(
+float test_cos_float(float p0) { return cos(p0); }
+// CHECK: define noundef <2 x float> @"?test_cos_float2
+// CHECK: call <2 x float> @llvm.cos.v2f32
+float2 test_cos_float2(float2 p0) { return cos(p0); }
+// CHECK: define noundef <3 x float> @"?test_cos_float3
+// CHECK: call <3 x float> @llvm.cos.v3f32
+float3 test_cos_float3(float3 p0) { return cos(p0); }
+// CHECK: define noundef <4 x float> @"?test_cos_float4
+// CHECK: call <4 x float> @llvm.cos.v4f32
+float4 test_cos_float4(float4 p0) { return cos(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/floor.hlsl b/clang/test/CodeGenHLSL/builtins/floor.hlsl
index 07803bfae3be..48ddf713bcf5 100644
--- a/clang/test/CodeGenHLSL/builtins/floor.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/floor.hlsl
@@ -1,43 +1,43 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
-
-using hlsl::floor;
-
-// NATIVE_HALF: define noundef half @
-// NATIVE_HALF: call half @llvm.floor.f16(
-// NO_HALF: define noundef float @"?test_floor_half@@YA$halff@$halff@@Z"(
-// NO_HALF: call float @llvm.floor.f32(float %0)
-half test_floor_half(half p0) { return floor(p0); }
-// NATIVE_HALF: define noundef <2 x half> @
-// NATIVE_HALF: call <2 x half> @llvm.floor.v2f16(
-// NO_HALF: define noundef <2 x float> @"?test_floor_half2@@YAT?$__vector@$halff@$01@__clang@@T12@@Z"(
-// NO_HALF: call <2 x float> @llvm.floor.v2f32(
-half2 test_floor_half2(half2 p0) { return floor(p0); }
-// NATIVE_HALF: define noundef <3 x half> @
-// NATIVE_HALF: call <3 x half> @llvm.floor.v3f16(
-// NO_HALF: define noundef <3 x float> @"?test_floor_half3@@YAT?$__vector@$halff@$02@__clang@@T12@@Z"(
-// NO_HALF: call <3 x float> @llvm.floor.v3f32(
-half3 test_floor_half3(half3 p0) { return floor(p0); }
-// NATIVE_HALF: define noundef <4 x half> @
-// NATIVE_HALF: call <4 x half> @llvm.floor.v4f16(
-// NO_HALF: define noundef <4 x float> @"?test_floor_half4@@YAT?$__vector@$halff@$03@__clang@@T12@@Z"(
-// NO_HALF: call <4 x float> @llvm.floor.v4f32(
-half4 test_floor_half4(half4 p0) { return floor(p0); }
-
-// CHECK: define noundef float @
-// CHECK: call float @llvm.floor.f32(
-float test_floor_float(float p0) { return floor(p0); }
-// CHECK: define noundef <2 x float> @
-// CHECK: call <2 x float> @llvm.floor.v2f32(
-float2 test_floor_float2(float2 p0) { return floor(p0); }
-// CHECK: define noundef <3 x float> @
-// CHECK: call <3 x float> @llvm.floor.v3f32(
-float3 test_floor_float3(float3 p0) { return floor(p0); }
-// CHECK: define noundef <4 x float> @
-// CHECK: call <4 x float> @llvm.floor.v4f32(
-float4 test_floor_float4(float4 p0) { return floor(p0); }
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+using hlsl::floor;
+
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.floor.f16(
+// NO_HALF: define noundef float @"?test_floor_half@@YA$halff@$halff@@Z"(
+// NO_HALF: call float @llvm.floor.f32(float %0)
+half test_floor_half(half p0) { return floor(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.floor.v2f16(
+// NO_HALF: define noundef <2 x float> @"?test_floor_half2@@YAT?$__vector@$halff@$01@__clang@@T12@@Z"(
+// NO_HALF: call <2 x float> @llvm.floor.v2f32(
+half2 test_floor_half2(half2 p0) { return floor(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.floor.v3f16(
+// NO_HALF: define noundef <3 x float> @"?test_floor_half3@@YAT?$__vector@$halff@$02@__clang@@T12@@Z"(
+// NO_HALF: call <3 x float> @llvm.floor.v3f32(
+half3 test_floor_half3(half3 p0) { return floor(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.floor.v4f16(
+// NO_HALF: define noundef <4 x float> @"?test_floor_half4@@YAT?$__vector@$halff@$03@__clang@@T12@@Z"(
+// NO_HALF: call <4 x float> @llvm.floor.v4f32(
+half4 test_floor_half4(half4 p0) { return floor(p0); }
+
+// CHECK: define noundef float @
+// CHECK: call float @llvm.floor.f32(
+float test_floor_float(float p0) { return floor(p0); }
+// CHECK: define noundef <2 x float> @
+// CHECK: call <2 x float> @llvm.floor.v2f32(
+float2 test_floor_float2(float2 p0) { return floor(p0); }
+// CHECK: define noundef <3 x float> @
+// CHECK: call <3 x float> @llvm.floor.v3f32(
+float3 test_floor_float3(float3 p0) { return floor(p0); }
+// CHECK: define noundef <4 x float> @
+// CHECK: call <4 x float> @llvm.floor.v4f32(
+float4 test_floor_float4(float4 p0) { return floor(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
index cdc9abbd70e4..f9b3cbcddfb6 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
@@ -1,15 +1,15 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
-
-// CHECK-LABEL: builtin_lerp_half_vector
-// CHECK: %hlsl.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
-// CHECK: ret <3 x half> %hlsl.lerp
-half3 builtin_lerp_half_vector (half3 p0) {
-  return __builtin_hlsl_lerp ( p0, p0, p0 );
-}
-
-// CHECK-LABEL: builtin_lerp_floar_vector
-// CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
-// CHECK: ret <2 x float> %hlsl.lerp
-float2 builtin_lerp_floar_vector ( float2 p0) {
-  return __builtin_hlsl_lerp ( p0, p0, p0 );
-}
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// CHECK-LABEL: builtin_lerp_half_vector
+// CHECK: %hlsl.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
+// CHECK: ret <3 x half> %hlsl.lerp
+half3 builtin_lerp_half_vector (half3 p0) {
+  return __builtin_hlsl_lerp ( p0, p0, p0 );
+}
+
+// CHECK-LABEL: builtin_lerp_floar_vector
+// CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// CHECK: ret <2 x float> %hlsl.lerp
+float2 builtin_lerp_floar_vector ( float2 p0) {
+  return __builtin_hlsl_lerp ( p0, p0, p0 );
+}
diff --git a/clang/test/CodeGenHLSL/builtins/lerp.hlsl b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
index 634b20be3a28..87b2e3af5765 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
@@ -1,102 +1,102 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
-// RUN:   --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF,DXIL_NO_HALF
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF,SPIR_NO_HALF,SPIR_CHECK
-
-
-// DXIL_NATIVE_HALF: %hlsl.lerp = call half @llvm.dx.lerp.f16(half %0, half %1, half %2)
-// SPIR_NATIVE_HALF: %hlsl.lerp = call half @llvm.spv.lerp.f16(half %0, half %1, half %2)
-// NATIVE_HALF: ret half %hlsl.lerp
-// DXIL_NO_HALF: %hlsl.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
-// SPIR_NO_HALF: %hlsl.lerp = call float @llvm.spv.lerp.f32(float %0, float %1, float %2)
-// NO_HALF: ret float %hlsl.lerp
-half test_lerp_half(half p0) { return lerp(p0, p0, p0); }
-
-// DXIL_NATIVE_HALF: %hlsl.lerp = call <2 x half> @llvm.dx.lerp.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2)
-// SPIR_NATIVE_HALF: %hlsl.lerp = call <2 x half> @llvm.spv.lerp.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2)
-// NATIVE_HALF: ret <2 x half> %hlsl.lerp
-// DXIL_NO_HALF: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
-// SPIR_NO_HALF: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
-// NO_HALF: ret <2 x float> %hlsl.lerp
-half2 test_lerp_half2(half2 p0) { return lerp(p0, p0, p0); }
-
-// DXIL_NATIVE_HALF: %hlsl.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
-// SPIR_NATIVE_HALF: %hlsl.lerp = call <3 x half> @llvm.spv.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
-// NATIVE_HALF: ret <3 x half> %hlsl.lerp
-// DXIL_NO_HALF: %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
-// SPIR_NO_HALF: %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
-// NO_HALF: ret <3 x float> %hlsl.lerp
-half3 test_lerp_half3(half3 p0) { return lerp(p0, p0, p0); }
-
-// DXIL_NATIVE_HALF: %hlsl.lerp = call <4 x half> @llvm.dx.lerp.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
-// SPIR_NATIVE_HALF: %hlsl.lerp = call <4 x half> @llvm.spv.lerp.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
-// NATIVE_HALF: ret <4 x half> %hlsl.lerp
-// DXIL_NO_HALF: %hlsl.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
-// SPIR_NO_HALF: %hlsl.lerp = call <4 x float> @llvm.spv.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
-// NO_HALF: ret <4 x float> %hlsl.lerp
-half4 test_lerp_half4(half4 p0) { return lerp(p0, p0, p0); }
-
-// DXIL_CHECK: %hlsl.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
-// SPIR_CHECK: %hlsl.lerp = call float @llvm.spv.lerp.f32(float %0, float %1, float %2)
-// CHECK: ret float %hlsl.lerp
-float test_lerp_float(float p0) { return lerp(p0, p0, p0); }
-
-// DXIL_CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
-// SPIR_CHECK: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
-// CHECK: ret <2 x float> %hlsl.lerp
-float2 test_lerp_float2(float2 p0) { return lerp(p0, p0, p0); }
-
-// DXIL_CHECK: %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
-// SPIR_CHECK: %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
-// CHECK: ret <3 x float> %hlsl.lerp
-float3 test_lerp_float3(float3 p0) { return lerp(p0, p0, p0); }
-
-// DXIL_CHECK: %hlsl.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
-// SPIR_CHECK: %hlsl.lerp = call <4 x float> @llvm.spv.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
-// CHECK: ret <4 x float> %hlsl.lerp
-float4 test_lerp_float4(float4 p0) { return lerp(p0, p0, p0); }
-
-// DXIL_CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
-// SPIR_CHECK: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
-// CHECK: ret <2 x float> %hlsl.lerp
-float2 test_lerp_float2_splat(float p0, float2 p1) { return lerp(p0, p1, p1); }
-
-// DXIL_CHECK: %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2)
-// SPIR_CHECK: %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2)
-// CHECK: ret <3 x float> %hlsl.lerp
-float3 test_lerp_float3_splat(float p0, float3 p1) { return lerp(p0, p1, p1); }
-
-// DXIL_CHECK:  %hlsl.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2)
-// SPIR_CHECK:  %hlsl.lerp = call <4 x float> @llvm.spv.lerp.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2)
-// CHECK:  ret <4 x float> %hlsl.lerp
-float4 test_lerp_float4_splat(float p0, float4 p1) { return lerp(p0, p1, p1); }
-
-// CHECK: %conv = sitofp i32 %2 to float
-// CHECK: %splat.splatinsert = insertelement <2 x float> poison, float %conv, i64 0
-// CHECK: %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer
-// DXIL_CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat)
-// SPIR_CHECK: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat)
-// CHECK: ret <2 x float> %hlsl.lerp
-float2 test_lerp_float2_int_splat(float2 p0, int p1) {
-  return lerp(p0, p0, p1);
-}
-
-// CHECK: %conv = sitofp i32 %2 to float
-// CHECK: %splat.splatinsert = insertelement <3 x float> poison, float %conv, i64 0
-// CHECK: %splat.splat = shufflevector <3 x float> %splat.splatinsert, <3 x float> poison, <3 x i32> zeroinitializer
-// DXIL_CHECK:  %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat)
-// SPIR_CHECK:  %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat)
-// CHECK: ret <3 x float> %hlsl.lerp
-float3 test_lerp_float3_int_splat(float3 p0, int p1) {
-  return lerp(p0, p0, p1);
-}
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF,DXIL_NO_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF,SPIR_NO_HALF,SPIR_CHECK
+
+
+// DXIL_NATIVE_HALF: %hlsl.lerp = call half @llvm.dx.lerp.f16(half %0, half %1, half %2)
+// SPIR_NATIVE_HALF: %hlsl.lerp = call half @llvm.spv.lerp.f16(half %0, half %1, half %2)
+// NATIVE_HALF: ret half %hlsl.lerp
+// DXIL_NO_HALF: %hlsl.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
+// SPIR_NO_HALF: %hlsl.lerp = call float @llvm.spv.lerp.f32(float %0, float %1, float %2)
+// NO_HALF: ret float %hlsl.lerp
+half test_lerp_half(half p0) { return lerp(p0, p0, p0); }
+
+// DXIL_NATIVE_HALF: %hlsl.lerp = call <2 x half> @llvm.dx.lerp.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+// SPIR_NATIVE_HALF: %hlsl.lerp = call <2 x half> @llvm.spv.lerp.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+// NATIVE_HALF: ret <2 x half> %hlsl.lerp
+// DXIL_NO_HALF: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// SPIR_NO_HALF: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// NO_HALF: ret <2 x float> %hlsl.lerp
+half2 test_lerp_half2(half2 p0) { return lerp(p0, p0, p0); }
+
+// DXIL_NATIVE_HALF: %hlsl.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
+// SPIR_NATIVE_HALF: %hlsl.lerp = call <3 x half> @llvm.spv.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
+// NATIVE_HALF: ret <3 x half> %hlsl.lerp
+// DXIL_NO_HALF: %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// SPIR_NO_HALF: %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// NO_HALF: ret <3 x float> %hlsl.lerp
+half3 test_lerp_half3(half3 p0) { return lerp(p0, p0, p0); }
+
+// DXIL_NATIVE_HALF: %hlsl.lerp = call <4 x half> @llvm.dx.lerp.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
+// SPIR_NATIVE_HALF: %hlsl.lerp = call <4 x half> @llvm.spv.lerp.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
+// NATIVE_HALF: ret <4 x half> %hlsl.lerp
+// DXIL_NO_HALF: %hlsl.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// SPIR_NO_HALF: %hlsl.lerp = call <4 x float> @llvm.spv.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// NO_HALF: ret <4 x float> %hlsl.lerp
+half4 test_lerp_half4(half4 p0) { return lerp(p0, p0, p0); }
+
+// DXIL_CHECK: %hlsl.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
+// SPIR_CHECK: %hlsl.lerp = call float @llvm.spv.lerp.f32(float %0, float %1, float %2)
+// CHECK: ret float %hlsl.lerp
+float test_lerp_float(float p0) { return lerp(p0, p0, p0); }
+
+// DXIL_CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// CHECK: ret <2 x float> %hlsl.lerp
+float2 test_lerp_float2(float2 p0) { return lerp(p0, p0, p0); }
+
+// DXIL_CHECK: %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// CHECK: ret <3 x float> %hlsl.lerp
+float3 test_lerp_float3(float3 p0) { return lerp(p0, p0, p0); }
+
+// DXIL_CHECK: %hlsl.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <4 x float> @llvm.spv.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// CHECK: ret <4 x float> %hlsl.lerp
+float4 test_lerp_float4(float4 p0) { return lerp(p0, p0, p0); }
+
+// DXIL_CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
+// CHECK: ret <2 x float> %hlsl.lerp
+float2 test_lerp_float2_splat(float p0, float2 p1) { return lerp(p0, p1, p1); }
+
+// DXIL_CHECK: %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2)
+// CHECK: ret <3 x float> %hlsl.lerp
+float3 test_lerp_float3_splat(float p0, float3 p1) { return lerp(p0, p1, p1); }
+
+// DXIL_CHECK:  %hlsl.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2)
+// SPIR_CHECK:  %hlsl.lerp = call <4 x float> @llvm.spv.lerp.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2)
+// CHECK:  ret <4 x float> %hlsl.lerp
+float4 test_lerp_float4_splat(float p0, float4 p1) { return lerp(p0, p1, p1); }
+
+// CHECK: %conv = sitofp i32 %2 to float
+// CHECK: %splat.splatinsert = insertelement <2 x float> poison, float %conv, i64 0
+// CHECK: %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer
+// DXIL_CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat)
+// SPIR_CHECK: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat)
+// CHECK: ret <2 x float> %hlsl.lerp
+float2 test_lerp_float2_int_splat(float2 p0, int p1) {
+  return lerp(p0, p0, p1);
+}
+
+// CHECK: %conv = sitofp i32 %2 to float
+// CHECK: %splat.splatinsert = insertelement <3 x float> poison, float %conv, i64 0
+// CHECK: %splat.splat = shufflevector <3 x float> %splat.splatinsert, <3 x float> poison, <3 x i32> zeroinitializer
+// DXIL_CHECK:  %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat)
+// SPIR_CHECK:  %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat)
+// CHECK: ret <3 x float> %hlsl.lerp
+float3 test_lerp_float3_int_splat(float3 p0, int p1) {
+  return lerp(p0, p0, p1);
+}
diff --git a/clang/test/CodeGenHLSL/builtins/log.hlsl b/clang/test/CodeGenHLSL/builtins/log.hlsl
index ecbdf1e98ac3..c89eda683403 100644
--- a/clang/test/CodeGenHLSL/builtins/log.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log.hlsl
@@ -1,41 +1,41 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
-
-// NATIVE_HALF: define noundef half @
-// NATIVE_HALF: call half @llvm.log.f16(
-// NO_HALF: define noundef float @"?test_log_half@@YA$halff@$halff@@Z"(
-// NO_HALF: call float @llvm.log.f32(
-half test_log_half(half p0) { return log(p0); }
-// NATIVE_HALF: define noundef <2 x half> @
-// NATIVE_HALF: call <2 x half> @llvm.log.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_log_half2
-// NO_HALF: call <2 x float> @llvm.log.v2f32(
-half2 test_log_half2(half2 p0) { return log(p0); }
-// NATIVE_HALF: define noundef <3 x half> @
-// NATIVE_HALF: call <3 x half> @llvm.log.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_log_half3
-// NO_HALF: call <3 x float> @llvm.log.v3f32(
-half3 test_log_half3(half3 p0) { return log(p0); }
-// NATIVE_HALF: define noundef <4 x half> @
-// NATIVE_HALF: call <4 x half> @llvm.log.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_log_half4
-// NO_HALF: call <4 x float> @llvm.log.v4f32(
-half4 test_log_half4(half4 p0) { return log(p0); }
-
-// CHECK: define noundef float @"?test_log_float
-// CHECK: call float @llvm.log.f32(
-float test_log_float(float p0) { return log(p0); }
-// CHECK: define noundef <2 x float> @"?test_log_float2
-// CHECK: call <2 x float> @llvm.log.v2f32
-float2 test_log_float2(float2 p0) { return log(p0); }
-// CHECK: define noundef <3 x float> @"?test_log_float3
-// CHECK: call <3 x float> @llvm.log.v3f32
-float3 test_log_float3(float3 p0) { return log(p0); }
-// CHECK: define noundef <4 x float> @"?test_log_float4
-// CHECK: call <4 x float> @llvm.log.v4f32
-float4 test_log_float4(float4 p0) { return log(p0); }
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.log.f16(
+// NO_HALF: define noundef float @"?test_log_half@@YA$halff@$halff@@Z"(
+// NO_HALF: call float @llvm.log.f32(
+half test_log_half(half p0) { return log(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.log.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_log_half2
+// NO_HALF: call <2 x float> @llvm.log.v2f32(
+half2 test_log_half2(half2 p0) { return log(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.log.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_log_half3
+// NO_HALF: call <3 x float> @llvm.log.v3f32(
+half3 test_log_half3(half3 p0) { return log(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.log.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_log_half4
+// NO_HALF: call <4 x float> @llvm.log.v4f32(
+half4 test_log_half4(half4 p0) { return log(p0); }
+
+// CHECK: define noundef float @"?test_log_float
+// CHECK: call float @llvm.log.f32(
+float test_log_float(float p0) { return log(p0); }
+// CHECK: define noundef <2 x float> @"?test_log_float2
+// CHECK: call <2 x float> @llvm.log.v2f32
+float2 test_log_float2(float2 p0) { return log(p0); }
+// CHECK: define noundef <3 x float> @"?test_log_float3
+// CHECK: call <3 x float> @llvm.log.v3f32
+float3 test_log_float3(float3 p0) { return log(p0); }
+// CHECK: define noundef <4 x float> @"?test_log_float4
+// CHECK: call <4 x float> @llvm.log.v4f32
+float4 test_log_float4(float4 p0) { return log(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/log2.hlsl b/clang/test/CodeGenHLSL/builtins/log2.hlsl
index 9ed8185a06b0..31c7bff214c6 100644
--- a/clang/test/CodeGenHLSL/builtins/log2.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log2.hlsl
@@ -1,41 +1,41 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
-
-// NATIVE_HALF: define noundef half @
-// NATIVE_HALF: call half @llvm.log2.f16(
-// NO_HALF: define noundef float @"?test_log2_half
-// NO_HALF: call float @llvm.log2.f32(
-half test_log2_half(half p0) { return log2(p0); }
-// NATIVE_HALF: define noundef <2 x half> @
-// NATIVE_HALF: call <2 x half> @llvm.log2.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_log2_half2
-// NO_HALF: call <2 x float> @llvm.log2.v2f32(
-half2 test_log2_half2(half2 p0) { return log2(p0); }
-// NATIVE_HALF: define noundef <3 x half> @
-// NATIVE_HALF: call <3 x half> @llvm.log2.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_log2_half3
-// NO_HALF: call <3 x float> @llvm.log2.v3f32(
-half3 test_log2_half3(half3 p0) { return log2(p0); }
-// NATIVE_HALF: define noundef <4 x half> @
-// NATIVE_HALF: call <4 x half> @llvm.log2.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_log2_half4
-// NO_HALF: call <4 x float> @llvm.log2.v4f32(
-half4 test_log2_half4(half4 p0) { return log2(p0); }
-
-// CHECK: define noundef float @"?test_log2_float
-// CHECK: call float @llvm.log2.f32(
-float test_log2_float(float p0) { return log2(p0); }
-// CHECK: define noundef <2 x float> @"?test_log2_float2
-// CHECK: call <2 x float> @llvm.log2.v2f32
-float2 test_log2_float2(float2 p0) { return log2(p0); }
-// CHECK: define noundef <3 x float> @"?test_log2_float3
-// CHECK: call <3 x float> @llvm.log2.v3f32
-float3 test_log2_float3(float3 p0) { return log2(p0); }
-// CHECK: define noundef <4 x float> @"?test_log2_float4
-// CHECK: call <4 x float> @llvm.log2.v4f32
-float4 test_log2_float4(float4 p0) { return log2(p0); }
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.log2.f16(
+// NO_HALF: define noundef float @"?test_log2_half
+// NO_HALF: call float @llvm.log2.f32(
+half test_log2_half(half p0) { return log2(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.log2.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_log2_half2
+// NO_HALF: call <2 x float> @llvm.log2.v2f32(
+half2 test_log2_half2(half2 p0) { return log2(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.log2.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_log2_half3
+// NO_HALF: call <3 x float> @llvm.log2.v3f32(
+half3 test_log2_half3(half3 p0) { return log2(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.log2.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_log2_half4
+// NO_HALF: call <4 x float> @llvm.log2.v4f32(
+half4 test_log2_half4(half4 p0) { return log2(p0); }
+
+// CHECK: define noundef float @"?test_log2_float
+// CHECK: call float @llvm.log2.f32(
+float test_log2_float(float p0) { return log2(p0); }
+// CHECK: define noundef <2 x float> @"?test_log2_float2
+// CHECK: call <2 x float> @llvm.log2.v2f32
+float2 test_log2_float2(float2 p0) { return log2(p0); }
+// CHECK: define noundef <3 x float> @"?test_log2_float3
+// CHECK: call <3 x float> @llvm.log2.v3f32
+float3 test_log2_float3(float3 p0) { return log2(p0); }
+// CHECK: define noundef <4 x float> @"?test_log2_float4
+// CHECK: call <4 x float> @llvm.log2.v4f32
+float4 test_log2_float4(float4 p0) { return log2(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/mad.hlsl b/clang/test/CodeGenHLSL/builtins/mad.hlsl
index bd4f38067a5c..b4dc636b00b7 100644
--- a/clang/test/CodeGenHLSL/builtins/mad.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/mad.hlsl
@@ -1,247 +1,247 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
-// RUN:   --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF
-
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF,SPIR_CHECK
-
-#ifdef __HLSL_ENABLE_16_BIT
-// DXIL_NATIVE_HALF: %dx.umad = call i16 @llvm.dx.umad.i16(i16 %0, i16 %1, i16 %2)
-// DXIL_NATIVE_HALF: ret i16 %dx.umad
-// SPIR_NATIVE_HALF: mul nuw i16 %{{.*}}, %{{.*}}
-// SPIR_NATIVE_HALF: add nuw i16 %{{.*}}, %{{.*}}
-uint16_t test_mad_uint16_t(uint16_t p0, uint16_t p1, uint16_t p2) { return mad(p0, p1, p2); }
-
-// DXIL_NATIVE_HALF: %dx.umad = call <2 x i16>  @llvm.dx.umad.v2i16(<2 x i16> %0, <2 x i16> %1, <2 x i16> %2)
-// DXIL_NATIVE_HALF: ret <2 x i16> %dx.umad
-// SPIR_NATIVE_HALF: mul nuw <2 x i16>  %{{.*}}, %{{.*}}
-// SPIR_NATIVE_HALF: add nuw <2 x i16>  %{{.*}}, %{{.*}}
-uint16_t2 test_mad_uint16_t2(uint16_t2 p0, uint16_t2 p1, uint16_t2 p2) { return mad(p0, p1, p2); }
-
-// DXIL_NATIVE_HALF: %dx.umad = call <3 x i16>  @llvm.dx.umad.v3i16(<3 x i16> %0, <3 x i16> %1, <3 x i16> %2)
-// DXIL_NATIVE_HALF: ret <3 x i16> %dx.umad
-// SPIR_NATIVE_HALF: mul nuw <3 x i16>  %{{.*}}, %{{.*}}
-// SPIR_NATIVE_HALF: add nuw <3 x i16>  %{{.*}}, %{{.*}}
-uint16_t3 test_mad_uint16_t3(uint16_t3 p0, uint16_t3 p1, uint16_t3 p2) { return mad(p0, p1, p2); }
-
-// DXIL_NATIVE_HALF: %dx.umad = call <4 x i16>  @llvm.dx.umad.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2)
-// DXIL_NATIVE_HALF: ret <4 x i16> %dx.umad
-// SPIR_NATIVE_HALF: mul nuw <4 x i16>  %{{.*}}, %{{.*}}
-// SPIR_NATIVE_HALF: add nuw <4 x i16>  %{{.*}}, %{{.*}}
-uint16_t4 test_mad_uint16_t4(uint16_t4 p0, uint16_t4 p1, uint16_t4 p2) { return mad(p0, p1, p2); }
-
-// DXIL_NATIVE_HALF: %dx.imad = call i16 @llvm.dx.imad.i16(i16 %0, i16 %1, i16 %2)
-// DXIL_NATIVE_HALF: ret i16 %dx.imad
-// SPIR_NATIVE_HALF: mul nsw i16 %{{.*}}, %{{.*}}
-// SPIR_NATIVE_HALF: add nsw i16 %{{.*}}, %{{.*}}
-int16_t test_mad_int16_t(int16_t p0, int16_t p1, int16_t p2) { return mad(p0, p1, p2); }
-
-// DXIL_NATIVE_HALF: %dx.imad = call <2 x i16>  @llvm.dx.imad.v2i16(<2 x i16> %0, <2 x i16> %1, <2 x i16> %2)
-// DXIL_NATIVE_HALF: ret <2 x i16> %dx.imad
-// SPIR_NATIVE_HALF: mul nsw <2 x i16>  %{{.*}}, %{{.*}}
-// SPIR_NATIVE_HALF: add nsw <2 x i16>  %{{.*}}, %{{.*}}
-int16_t2 test_mad_int16_t2(int16_t2 p0, int16_t2 p1, int16_t2 p2) { return mad(p0, p1, p2); }
-
-// DXIL_NATIVE_HALF: %dx.imad = call <3 x i16>  @llvm.dx.imad.v3i16(<3 x i16> %0, <3 x i16> %1, <3 x i16> %2)
-// DXIL_NATIVE_HALF: ret <3 x i16> %dx.imad
-// SPIR_NATIVE_HALF: mul nsw <3 x i16>  %{{.*}}, %{{.*}}
-// SPIR_NATIVE_HALF: add nsw <3 x i16>  %{{.*}}, %{{.*}}
-int16_t3 test_mad_int16_t3(int16_t3 p0, int16_t3 p1, int16_t3 p2) { return mad(p0, p1, p2); }
-
-// DXIL_NATIVE_HALF: %dx.imad = call <4 x i16>  @llvm.dx.imad.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2)
-// DXIL_NATIVE_HALF: ret <4 x i16> %dx.imad
-// SPIR_NATIVE_HALF: mul nsw <4 x i16>  %{{.*}}, %{{.*}}
-// SPIR_NATIVE_HALF: add nsw <4 x i16>  %{{.*}}, %{{.*}}
-int16_t4 test_mad_int16_t4(int16_t4 p0, int16_t4 p1, int16_t4 p2) { return mad(p0, p1, p2); }
-#endif // __HLSL_ENABLE_16_BIT
-
-// NATIVE_HALF: %hlsl.fmad = call half @llvm.fmuladd.f16(half %0, half %1, half %2)
-// NATIVE_HALF: ret half %hlsl.fmad
-// NO_HALF: %hlsl.fmad = call float @llvm.fmuladd.f32(float %0, float %1, float %2)
-// NO_HALF: ret float %hlsl.fmad
-half test_mad_half(half p0, half p1, half p2) { return mad(p0, p1, p2); }
-
-// NATIVE_HALF: %hlsl.fmad = call <2 x half>  @llvm.fmuladd.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2)
-// NATIVE_HALF: ret <2 x half> %hlsl.fmad
-// NO_HALF: %hlsl.fmad = call <2 x float>  @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
-// NO_HALF: ret <2 x float> %hlsl.fmad
-half2 test_mad_half2(half2 p0, half2 p1, half2 p2) { return mad(p0, p1, p2); }
-
-// NATIVE_HALF: %hlsl.fmad = call <3 x half>  @llvm.fmuladd.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
-// NATIVE_HALF: ret <3 x half> %hlsl.fmad
-// NO_HALF: %hlsl.fmad = call <3 x float>  @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
-// NO_HALF: ret <3 x float> %hlsl.fmad
-half3 test_mad_half3(half3 p0, half3 p1, half3 p2) { return mad(p0, p1, p2); }
-
-// NATIVE_HALF: %hlsl.fmad = call <4 x half>  @llvm.fmuladd.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
-// NATIVE_HALF: ret <4 x half> %hlsl.fmad
-// NO_HALF: %hlsl.fmad = call <4 x float>  @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
-// NO_HALF: ret <4 x float> %hlsl.fmad
-half4 test_mad_half4(half4 p0, half4 p1, half4 p2) { return mad(p0, p1, p2); }
-
-// CHECK: %hlsl.fmad = call float @llvm.fmuladd.f32(float %0, float %1, float %2)
-// CHECK: ret float %hlsl.fmad
-float test_mad_float(float p0, float p1, float p2) { return mad(p0, p1, p2); }
-
-// CHECK: %hlsl.fmad = call <2 x float>  @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
-// CHECK: ret <2 x float> %hlsl.fmad
-float2 test_mad_float2(float2 p0, float2 p1, float2 p2) { return mad(p0, p1, p2); }
-
-// CHECK: %hlsl.fmad = call <3 x float>  @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
-// CHECK: ret <3 x float> %hlsl.fmad
-float3 test_mad_float3(float3 p0, float3 p1, float3 p2) { return mad(p0, p1, p2); }
-
-// CHECK: %hlsl.fmad = call <4 x float>  @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
-// CHECK: ret <4 x float> %hlsl.fmad
-float4 test_mad_float4(float4 p0, float4 p1, float4 p2) { return mad(p0, p1, p2); }
-
-// CHECK: %hlsl.fmad = call double @llvm.fmuladd.f64(double %0, double %1, double %2)
-// CHECK: ret double %hlsl.fmad
-double test_mad_double(double p0, double p1, double p2) { return mad(p0, p1, p2); }
-
-// CHECK: %hlsl.fmad = call <2 x double>  @llvm.fmuladd.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %2)
-// CHECK: ret <2 x double> %hlsl.fmad
-double2 test_mad_double2(double2 p0, double2 p1, double2 p2) { return mad(p0, p1, p2); }
-
-// CHECK: %hlsl.fmad = call <3 x double>  @llvm.fmuladd.v3f64(<3 x double> %0, <3 x double> %1, <3 x double> %2)
-// CHECK: ret <3 x double> %hlsl.fmad
-double3 test_mad_double3(double3 p0, double3 p1, double3 p2) { return mad(p0, p1, p2); }
-
-// CHECK: %hlsl.fmad = call <4 x double>  @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)
-// CHECK: ret <4 x double> %hlsl.fmad
-double4 test_mad_double4(double4 p0, double4 p1, double4 p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.imad = call i32 @llvm.dx.imad.i32(i32 %0, i32 %1, i32 %2)
-// DXIL_CHECK: ret i32 %dx.imad
-// SPIR_CHECK: mul nsw i32 %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nsw i32 %{{.*}}, %{{.*}}
-int test_mad_int(int p0, int p1, int p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.imad = call <2 x i32>  @llvm.dx.imad.v2i32(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2)
-// DXIL_CHECK: ret <2 x i32> %dx.imad
-// SPIR_CHECK: mul nsw <2 x i32>  %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nsw <2 x i32>  %{{.*}}, %{{.*}}
-int2 test_mad_int2(int2 p0, int2 p1, int2 p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.imad = call <3 x i32>  @llvm.dx.imad.v3i32(<3 x i32> %0, <3 x i32> %1, <3 x i32> %2)
-// DXIL_CHECK: ret <3 x i32> %dx.imad
-// SPIR_CHECK: mul nsw <3 x i32>  %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nsw <3 x i32>  %{{.*}}, %{{.*}}
-int3 test_mad_int3(int3 p0, int3 p1, int3 p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.imad = call <4 x i32>  @llvm.dx.imad.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
-// DXIL_CHECK: ret <4 x i32> %dx.imad
-// SPIR_CHECK: mul nsw <4 x i32>  %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nsw <4 x i32>  %{{.*}}, %{{.*}}
-int4 test_mad_int4(int4 p0, int4 p1, int4 p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.imad = call i64 @llvm.dx.imad.i64(i64 %0, i64 %1, i64 %2)
-// DXIL_CHECK: ret i64 %dx.imad
-// SPIR_CHECK: mul nsw i64 %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nsw i64 %{{.*}}, %{{.*}}
-int64_t test_mad_int64_t(int64_t p0, int64_t p1, int64_t p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.imad = call <2 x i64>  @llvm.dx.imad.v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2)
-// DXIL_CHECK: ret <2 x i64> %dx.imad
-// SPIR_CHECK: mul nsw <2 x i64>  %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nsw <2 x i64>  %{{.*}}, %{{.*}}
-int64_t2 test_mad_int64_t2(int64_t2 p0, int64_t2 p1, int64_t2 p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.imad = call <3 x i64>  @llvm.dx.imad.v3i64(<3 x i64> %0, <3 x i64> %1, <3 x i64> %2)
-// DXIL_CHECK: ret <3 x i64> %dx.imad
-// SPIR_CHECK: mul nsw <3 x i64>  %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nsw <3 x i64>  %{{.*}}, %{{.*}}
-int64_t3 test_mad_int64_t3(int64_t3 p0, int64_t3 p1, int64_t3 p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.imad = call <4 x i64>  @llvm.dx.imad.v4i64(<4 x i64> %0, <4 x i64> %1, <4 x i64> %2)
-// DXIL_CHECK: ret <4 x i64> %dx.imad
-// SPIR_CHECK: mul nsw <4 x i64>  %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nsw <4 x i64>  %{{.*}}, %{{.*}}
-int64_t4 test_mad_int64_t4(int64_t4 p0, int64_t4 p1, int64_t4 p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.umad = call i32 @llvm.dx.umad.i32(i32 %0, i32 %1, i32 %2)
-// DXIL_CHECK: ret i32 %dx.umad
-// SPIR_CHECK: mul nuw i32 %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nuw i32 %{{.*}}, %{{.*}}
-uint test_mad_uint(uint p0, uint p1, uint p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.umad = call <2 x i32>  @llvm.dx.umad.v2i32(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2)
-// DXIL_CHECK: ret <2 x i32> %dx.umad
-// SPIR_CHECK: mul nuw <2 x i32>  %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nuw <2 x i32>  %{{.*}}, %{{.*}}
-uint2 test_mad_uint2(uint2 p0, uint2 p1, uint2 p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.umad = call <3 x i32>  @llvm.dx.umad.v3i32(<3 x i32> %0, <3 x i32> %1, <3 x i32> %2)
-// DXIL_CHECK: ret <3 x i32> %dx.umad
-// SPIR_CHECK: mul nuw <3 x i32>  %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nuw <3 x i32>  %{{.*}}, %{{.*}}
-uint3 test_mad_uint3(uint3 p0, uint3 p1, uint3 p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.umad = call <4 x i32>  @llvm.dx.umad.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
-// DXIL_CHECK: ret <4 x i32> %dx.umad
-// SPIR_CHECK: mul nuw <4 x i32>  %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nuw <4 x i32>  %{{.*}}, %{{.*}}
-uint4 test_mad_uint4(uint4 p0, uint4 p1, uint4 p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.umad = call i64 @llvm.dx.umad.i64(i64 %0, i64 %1, i64 %2)
-// DXIL_CHECK: ret i64 %dx.umad
-// SPIR_CHECK: mul nuw i64 %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nuw i64 %{{.*}}, %{{.*}}
-uint64_t test_mad_uint64_t(uint64_t p0, uint64_t p1, uint64_t p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.umad = call <2 x i64>  @llvm.dx.umad.v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2)
-// DXIL_CHECK: ret <2 x i64> %dx.umad
-// SPIR_CHECK: mul nuw <2 x i64>  %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nuw <2 x i64>  %{{.*}}, %{{.*}}
-uint64_t2 test_mad_uint64_t2(uint64_t2 p0, uint64_t2 p1, uint64_t2 p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.umad = call <3 x i64>  @llvm.dx.umad.v3i64(<3 x i64> %0, <3 x i64> %1, <3 x i64> %2)
-// DXIL_CHECK: ret <3 x i64> %dx.umad
-// SPIR_CHECK: mul nuw <3 x i64>  %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nuw <3 x i64>  %{{.*}}, %{{.*}}
-uint64_t3 test_mad_uint64_t3(uint64_t3 p0, uint64_t3 p1, uint64_t3 p2) { return mad(p0, p1, p2); }
-
-// DXIL_CHECK: %dx.umad = call <4 x i64>  @llvm.dx.umad.v4i64(<4 x i64> %0, <4 x i64> %1, <4 x i64> %2)
-// DXIL_CHECK: ret <4 x i64> %dx.umad
-// SPIR_CHECK: mul nuw <4 x i64>  %{{.*}}, %{{.*}}
-// SPIR_CHECK: add nuw <4 x i64>  %{{.*}}, %{{.*}}
-uint64_t4 test_mad_uint64_t4(uint64_t4 p0, uint64_t4 p1, uint64_t4 p2) { return mad(p0, p1, p2); }
-
-// CHECK: %hlsl.fmad = call <2 x float>  @llvm.fmuladd.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
-// CHECK: ret <2 x float> %hlsl.fmad
-float2 test_mad_float2_splat(float p0, float2 p1, float2 p2) { return mad(p0, p1, p2); }
-
-// CHECK: %hlsl.fmad = call <3 x float>  @llvm.fmuladd.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2)
-// CHECK: ret <3 x float> %hlsl.fmad
-float3 test_mad_float3_splat(float p0, float3 p1, float3 p2) { return mad(p0, p1, p2); }
-
-// CHECK:  %hlsl.fmad = call <4 x float>  @llvm.fmuladd.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2)
-// CHECK:  ret <4 x float> %hlsl.fmad
-float4 test_mad_float4_splat(float p0, float4 p1, float4 p2) { return mad(p0, p1, p2); }
-
-// CHECK: %conv = sitofp i32 %2 to float
-// CHECK: %splat.splatinsert = insertelement <2 x float> poison, float %conv, i64 0
-// CHECK: %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer
-// CHECK: %hlsl.fmad = call <2 x float>  @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat)
-// CHECK: ret <2 x float> %hlsl.fmad
-float2 test_mad_float2_int_splat(float2 p0, float2 p1, int p2) {
-  return mad(p0, p1, p2);
-}
-
-// CHECK: %conv = sitofp i32 %2 to float
-// CHECK: %splat.splatinsert = insertelement <3 x float> poison, float %conv, i64 0
-// CHECK: %splat.splat = shufflevector <3 x float> %splat.splatinsert, <3 x float> poison, <3 x i32> zeroinitializer
-// CHECK:  %hlsl.fmad = call <3 x float>  @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat)
-// CHECK: ret <3 x float> %hlsl.fmad
-float3 test_mad_float3_int_splat(float3 p0, float3 p1, int p2) {
-  return mad(p0, p1, p2);
-}
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF
+
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF,SPIR_CHECK
+
+#ifdef __HLSL_ENABLE_16_BIT
+// DXIL_NATIVE_HALF: %dx.umad = call i16 @llvm.dx.umad.i16(i16 %0, i16 %1, i16 %2)
+// DXIL_NATIVE_HALF: ret i16 %dx.umad
+// SPIR_NATIVE_HALF: mul nuw i16 %{{.*}}, %{{.*}}
+// SPIR_NATIVE_HALF: add nuw i16 %{{.*}}, %{{.*}}
+uint16_t test_mad_uint16_t(uint16_t p0, uint16_t p1, uint16_t p2) { return mad(p0, p1, p2); }
+
+// DXIL_NATIVE_HALF: %dx.umad = call <2 x i16>  @llvm.dx.umad.v2i16(<2 x i16> %0, <2 x i16> %1, <2 x i16> %2)
+// DXIL_NATIVE_HALF: ret <2 x i16> %dx.umad
+// SPIR_NATIVE_HALF: mul nuw <2 x i16>  %{{.*}}, %{{.*}}
+// SPIR_NATIVE_HALF: add nuw <2 x i16>  %{{.*}}, %{{.*}}
+uint16_t2 test_mad_uint16_t2(uint16_t2 p0, uint16_t2 p1, uint16_t2 p2) { return mad(p0, p1, p2); }
+
+// DXIL_NATIVE_HALF: %dx.umad = call <3 x i16>  @llvm.dx.umad.v3i16(<3 x i16> %0, <3 x i16> %1, <3 x i16> %2)
+// DXIL_NATIVE_HALF: ret <3 x i16> %dx.umad
+// SPIR_NATIVE_HALF: mul nuw <3 x i16>  %{{.*}}, %{{.*}}
+// SPIR_NATIVE_HALF: add nuw <3 x i16>  %{{.*}}, %{{.*}}
+uint16_t3 test_mad_uint16_t3(uint16_t3 p0, uint16_t3 p1, uint16_t3 p2) { return mad(p0, p1, p2); }
+
+// DXIL_NATIVE_HALF: %dx.umad = call <4 x i16>  @llvm.dx.umad.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2)
+// DXIL_NATIVE_HALF: ret <4 x i16> %dx.umad
+// SPIR_NATIVE_HALF: mul nuw <4 x i16>  %{{.*}}, %{{.*}}
+// SPIR_NATIVE_HALF: add nuw <4 x i16>  %{{.*}}, %{{.*}}
+uint16_t4 test_mad_uint16_t4(uint16_t4 p0, uint16_t4 p1, uint16_t4 p2) { return mad(p0, p1, p2); }
+
+// DXIL_NATIVE_HALF: %dx.imad = call i16 @llvm.dx.imad.i16(i16 %0, i16 %1, i16 %2)
+// DXIL_NATIVE_HALF: ret i16 %dx.imad
+// SPIR_NATIVE_HALF: mul nsw i16 %{{.*}}, %{{.*}}
+// SPIR_NATIVE_HALF: add nsw i16 %{{.*}}, %{{.*}}
+int16_t test_mad_int16_t(int16_t p0, int16_t p1, int16_t p2) { return mad(p0, p1, p2); }
+
+// DXIL_NATIVE_HALF: %dx.imad = call <2 x i16>  @llvm.dx.imad.v2i16(<2 x i16> %0, <2 x i16> %1, <2 x i16> %2)
+// DXIL_NATIVE_HALF: ret <2 x i16> %dx.imad
+// SPIR_NATIVE_HALF: mul nsw <2 x i16>  %{{.*}}, %{{.*}}
+// SPIR_NATIVE_HALF: add nsw <2 x i16>  %{{.*}}, %{{.*}}
+int16_t2 test_mad_int16_t2(int16_t2 p0, int16_t2 p1, int16_t2 p2) { return mad(p0, p1, p2); }
+
+// DXIL_NATIVE_HALF: %dx.imad = call <3 x i16>  @llvm.dx.imad.v3i16(<3 x i16> %0, <3 x i16> %1, <3 x i16> %2)
+// DXIL_NATIVE_HALF: ret <3 x i16> %dx.imad
+// SPIR_NATIVE_HALF: mul nsw <3 x i16>  %{{.*}}, %{{.*}}
+// SPIR_NATIVE_HALF: add nsw <3 x i16>  %{{.*}}, %{{.*}}
+int16_t3 test_mad_int16_t3(int16_t3 p0, int16_t3 p1, int16_t3 p2) { return mad(p0, p1, p2); }
+
+// DXIL_NATIVE_HALF: %dx.imad = call <4 x i16>  @llvm.dx.imad.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2)
+// DXIL_NATIVE_HALF: ret <4 x i16> %dx.imad
+// SPIR_NATIVE_HALF: mul nsw <4 x i16>  %{{.*}}, %{{.*}}
+// SPIR_NATIVE_HALF: add nsw <4 x i16>  %{{.*}}, %{{.*}}
+int16_t4 test_mad_int16_t4(int16_t4 p0, int16_t4 p1, int16_t4 p2) { return mad(p0, p1, p2); }
+#endif // __HLSL_ENABLE_16_BIT
+
+// NATIVE_HALF: %hlsl.fmad = call half @llvm.fmuladd.f16(half %0, half %1, half %2)
+// NATIVE_HALF: ret half %hlsl.fmad
+// NO_HALF: %hlsl.fmad = call float @llvm.fmuladd.f32(float %0, float %1, float %2)
+// NO_HALF: ret float %hlsl.fmad
+half test_mad_half(half p0, half p1, half p2) { return mad(p0, p1, p2); }
+
+// NATIVE_HALF: %hlsl.fmad = call <2 x half>  @llvm.fmuladd.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+// NATIVE_HALF: ret <2 x half> %hlsl.fmad
+// NO_HALF: %hlsl.fmad = call <2 x float>  @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// NO_HALF: ret <2 x float> %hlsl.fmad
+half2 test_mad_half2(half2 p0, half2 p1, half2 p2) { return mad(p0, p1, p2); }
+
+// NATIVE_HALF: %hlsl.fmad = call <3 x half>  @llvm.fmuladd.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
+// NATIVE_HALF: ret <3 x half> %hlsl.fmad
+// NO_HALF: %hlsl.fmad = call <3 x float>  @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// NO_HALF: ret <3 x float> %hlsl.fmad
+half3 test_mad_half3(half3 p0, half3 p1, half3 p2) { return mad(p0, p1, p2); }
+
+// NATIVE_HALF: %hlsl.fmad = call <4 x half>  @llvm.fmuladd.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
+// NATIVE_HALF: ret <4 x half> %hlsl.fmad
+// NO_HALF: %hlsl.fmad = call <4 x float>  @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// NO_HALF: ret <4 x float> %hlsl.fmad
+half4 test_mad_half4(half4 p0, half4 p1, half4 p2) { return mad(p0, p1, p2); }
+
+// CHECK: %hlsl.fmad = call float @llvm.fmuladd.f32(float %0, float %1, float %2)
+// CHECK: ret float %hlsl.fmad
+float test_mad_float(float p0, float p1, float p2) { return mad(p0, p1, p2); }
+
+// CHECK: %hlsl.fmad = call <2 x float>  @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// CHECK: ret <2 x float> %hlsl.fmad
+float2 test_mad_float2(float2 p0, float2 p1, float2 p2) { return mad(p0, p1, p2); }
+
+// CHECK: %hlsl.fmad = call <3 x float>  @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// CHECK: ret <3 x float> %hlsl.fmad
+float3 test_mad_float3(float3 p0, float3 p1, float3 p2) { return mad(p0, p1, p2); }
+
+// CHECK: %hlsl.fmad = call <4 x float>  @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// CHECK: ret <4 x float> %hlsl.fmad
+float4 test_mad_float4(float4 p0, float4 p1, float4 p2) { return mad(p0, p1, p2); }
+
+// CHECK: %hlsl.fmad = call double @llvm.fmuladd.f64(double %0, double %1, double %2)
+// CHECK: ret double %hlsl.fmad
+double test_mad_double(double p0, double p1, double p2) { return mad(p0, p1, p2); }
+
+// CHECK: %hlsl.fmad = call <2 x double>  @llvm.fmuladd.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %2)
+// CHECK: ret <2 x double> %hlsl.fmad
+double2 test_mad_double2(double2 p0, double2 p1, double2 p2) { return mad(p0, p1, p2); }
+
+// CHECK: %hlsl.fmad = call <3 x double>  @llvm.fmuladd.v3f64(<3 x double> %0, <3 x double> %1, <3 x double> %2)
+// CHECK: ret <3 x double> %hlsl.fmad
+double3 test_mad_double3(double3 p0, double3 p1, double3 p2) { return mad(p0, p1, p2); }
+
+// CHECK: %hlsl.fmad = call <4 x double>  @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2)
+// CHECK: ret <4 x double> %hlsl.fmad
+double4 test_mad_double4(double4 p0, double4 p1, double4 p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.imad = call i32 @llvm.dx.imad.i32(i32 %0, i32 %1, i32 %2)
+// DXIL_CHECK: ret i32 %dx.imad
+// SPIR_CHECK: mul nsw i32 %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nsw i32 %{{.*}}, %{{.*}}
+int test_mad_int(int p0, int p1, int p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.imad = call <2 x i32>  @llvm.dx.imad.v2i32(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2)
+// DXIL_CHECK: ret <2 x i32> %dx.imad
+// SPIR_CHECK: mul nsw <2 x i32>  %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nsw <2 x i32>  %{{.*}}, %{{.*}}
+int2 test_mad_int2(int2 p0, int2 p1, int2 p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.imad = call <3 x i32>  @llvm.dx.imad.v3i32(<3 x i32> %0, <3 x i32> %1, <3 x i32> %2)
+// DXIL_CHECK: ret <3 x i32> %dx.imad
+// SPIR_CHECK: mul nsw <3 x i32>  %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nsw <3 x i32>  %{{.*}}, %{{.*}}
+int3 test_mad_int3(int3 p0, int3 p1, int3 p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.imad = call <4 x i32>  @llvm.dx.imad.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+// DXIL_CHECK: ret <4 x i32> %dx.imad
+// SPIR_CHECK: mul nsw <4 x i32>  %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nsw <4 x i32>  %{{.*}}, %{{.*}}
+int4 test_mad_int4(int4 p0, int4 p1, int4 p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.imad = call i64 @llvm.dx.imad.i64(i64 %0, i64 %1, i64 %2)
+// DXIL_CHECK: ret i64 %dx.imad
+// SPIR_CHECK: mul nsw i64 %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nsw i64 %{{.*}}, %{{.*}}
+int64_t test_mad_int64_t(int64_t p0, int64_t p1, int64_t p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.imad = call <2 x i64>  @llvm.dx.imad.v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2)
+// DXIL_CHECK: ret <2 x i64> %dx.imad
+// SPIR_CHECK: mul nsw <2 x i64>  %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nsw <2 x i64>  %{{.*}}, %{{.*}}
+int64_t2 test_mad_int64_t2(int64_t2 p0, int64_t2 p1, int64_t2 p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.imad = call <3 x i64>  @llvm.dx.imad.v3i64(<3 x i64> %0, <3 x i64> %1, <3 x i64> %2)
+// DXIL_CHECK: ret <3 x i64> %dx.imad
+// SPIR_CHECK: mul nsw <3 x i64>  %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nsw <3 x i64>  %{{.*}}, %{{.*}}
+int64_t3 test_mad_int64_t3(int64_t3 p0, int64_t3 p1, int64_t3 p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.imad = call <4 x i64>  @llvm.dx.imad.v4i64(<4 x i64> %0, <4 x i64> %1, <4 x i64> %2)
+// DXIL_CHECK: ret <4 x i64> %dx.imad
+// SPIR_CHECK: mul nsw <4 x i64>  %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nsw <4 x i64>  %{{.*}}, %{{.*}}
+int64_t4 test_mad_int64_t4(int64_t4 p0, int64_t4 p1, int64_t4 p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.umad = call i32 @llvm.dx.umad.i32(i32 %0, i32 %1, i32 %2)
+// DXIL_CHECK: ret i32 %dx.umad
+// SPIR_CHECK: mul nuw i32 %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nuw i32 %{{.*}}, %{{.*}}
+uint test_mad_uint(uint p0, uint p1, uint p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.umad = call <2 x i32>  @llvm.dx.umad.v2i32(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2)
+// DXIL_CHECK: ret <2 x i32> %dx.umad
+// SPIR_CHECK: mul nuw <2 x i32>  %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nuw <2 x i32>  %{{.*}}, %{{.*}}
+uint2 test_mad_uint2(uint2 p0, uint2 p1, uint2 p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.umad = call <3 x i32>  @llvm.dx.umad.v3i32(<3 x i32> %0, <3 x i32> %1, <3 x i32> %2)
+// DXIL_CHECK: ret <3 x i32> %dx.umad
+// SPIR_CHECK: mul nuw <3 x i32>  %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nuw <3 x i32>  %{{.*}}, %{{.*}}
+uint3 test_mad_uint3(uint3 p0, uint3 p1, uint3 p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.umad = call <4 x i32>  @llvm.dx.umad.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+// DXIL_CHECK: ret <4 x i32> %dx.umad
+// SPIR_CHECK: mul nuw <4 x i32>  %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nuw <4 x i32>  %{{.*}}, %{{.*}}
+uint4 test_mad_uint4(uint4 p0, uint4 p1, uint4 p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.umad = call i64 @llvm.dx.umad.i64(i64 %0, i64 %1, i64 %2)
+// DXIL_CHECK: ret i64 %dx.umad
+// SPIR_CHECK: mul nuw i64 %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nuw i64 %{{.*}}, %{{.*}}
+uint64_t test_mad_uint64_t(uint64_t p0, uint64_t p1, uint64_t p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.umad = call <2 x i64>  @llvm.dx.umad.v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2)
+// DXIL_CHECK: ret <2 x i64> %dx.umad
+// SPIR_CHECK: mul nuw <2 x i64>  %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nuw <2 x i64>  %{{.*}}, %{{.*}}
+uint64_t2 test_mad_uint64_t2(uint64_t2 p0, uint64_t2 p1, uint64_t2 p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.umad = call <3 x i64>  @llvm.dx.umad.v3i64(<3 x i64> %0, <3 x i64> %1, <3 x i64> %2)
+// DXIL_CHECK: ret <3 x i64> %dx.umad
+// SPIR_CHECK: mul nuw <3 x i64>  %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nuw <3 x i64>  %{{.*}}, %{{.*}}
+uint64_t3 test_mad_uint64_t3(uint64_t3 p0, uint64_t3 p1, uint64_t3 p2) { return mad(p0, p1, p2); }
+
+// DXIL_CHECK: %dx.umad = call <4 x i64>  @llvm.dx.umad.v4i64(<4 x i64> %0, <4 x i64> %1, <4 x i64> %2)
+// DXIL_CHECK: ret <4 x i64> %dx.umad
+// SPIR_CHECK: mul nuw <4 x i64>  %{{.*}}, %{{.*}}
+// SPIR_CHECK: add nuw <4 x i64>  %{{.*}}, %{{.*}}
+uint64_t4 test_mad_uint64_t4(uint64_t4 p0, uint64_t4 p1, uint64_t4 p2) { return mad(p0, p1, p2); }
+
+// CHECK: %hlsl.fmad = call <2 x float>  @llvm.fmuladd.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
+// CHECK: ret <2 x float> %hlsl.fmad
+float2 test_mad_float2_splat(float p0, float2 p1, float2 p2) { return mad(p0, p1, p2); }
+
+// CHECK: %hlsl.fmad = call <3 x float>  @llvm.fmuladd.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2)
+// CHECK: ret <3 x float> %hlsl.fmad
+float3 test_mad_float3_splat(float p0, float3 p1, float3 p2) { return mad(p0, p1, p2); }
+
+// CHECK:  %hlsl.fmad = call <4 x float>  @llvm.fmuladd.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2)
+// CHECK:  ret <4 x float> %hlsl.fmad
+float4 test_mad_float4_splat(float p0, float4 p1, float4 p2) { return mad(p0, p1, p2); }
+
+// CHECK: %conv = sitofp i32 %2 to float
+// CHECK: %splat.splatinsert = insertelement <2 x float> poison, float %conv, i64 0
+// CHECK: %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK: %hlsl.fmad = call <2 x float>  @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat)
+// CHECK: ret <2 x float> %hlsl.fmad
+float2 test_mad_float2_int_splat(float2 p0, float2 p1, int p2) {
+  return mad(p0, p1, p2);
+}
+
+// CHECK: %conv = sitofp i32 %2 to float
+// CHECK: %splat.splatinsert = insertelement <3 x float> poison, float %conv, i64 0
+// CHECK: %splat.splat = shufflevector <3 x float> %splat.splatinsert, <3 x float> poison, <3 x i32> zeroinitializer
+// CHECK:  %hlsl.fmad = call <3 x float>  @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat)
+// CHECK: ret <3 x float> %hlsl.fmad
+float3 test_mad_float3_int_splat(float3 p0, float3 p1, int p2) {
+  return mad(p0, p1, p2);
+}
diff --git a/clang/test/CodeGenHLSL/builtins/max.hlsl b/clang/test/CodeGenHLSL/builtins/max.hlsl
index 272d1e8a10bd..f17062f7bb01 100644
--- a/clang/test/CodeGenHLSL/builtins/max.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/max.hlsl
@@ -1,134 +1,134 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
-
-#ifdef __HLSL_ENABLE_16_BIT
-// NATIVE_HALF: define noundef i16 @
-// NATIVE_HALF: call i16 @llvm.smax.i16(
-int16_t test_max_short(int16_t p0, int16_t p1) { return max(p0, p1); }
-// NATIVE_HALF: define noundef <2 x i16> @
-// NATIVE_HALF: call <2 x i16> @llvm.smax.v2i16(
-int16_t2 test_max_short2(int16_t2 p0, int16_t2 p1) { return max(p0, p1); }
-// NATIVE_HALF: define noundef <3 x i16> @
-// NATIVE_HALF: call <3 x i16> @llvm.smax.v3i16
-int16_t3 test_max_short3(int16_t3 p0, int16_t3 p1) { return max(p0, p1); }
-// NATIVE_HALF: define noundef <4 x i16> @
-// NATIVE_HALF: call <4 x i16> @llvm.smax.v4i16
-int16_t4 test_max_short4(int16_t4 p0, int16_t4 p1) { return max(p0, p1); }
-
-// NATIVE_HALF: define noundef i16 @
-// NATIVE_HALF: call i16 @llvm.umax.i16(
-uint16_t test_max_ushort(uint16_t p0, uint16_t p1) { return max(p0, p1); }
-// NATIVE_HALF: define noundef <2 x i16> @
-// NATIVE_HALF: call <2 x i16> @llvm.umax.v2i16
-uint16_t2 test_max_ushort2(uint16_t2 p0, uint16_t2 p1) { return max(p0, p1); }
-// NATIVE_HALF: define noundef <3 x i16> @
-// NATIVE_HALF: call <3 x i16> @llvm.umax.v3i16
-uint16_t3 test_max_ushort3(uint16_t3 p0, uint16_t3 p1) { return max(p0, p1); }
-// NATIVE_HALF: define noundef <4 x i16> @
-// NATIVE_HALF: call <4 x i16> @llvm.umax.v4i16
-uint16_t4 test_max_ushort4(uint16_t4 p0, uint16_t4 p1) { return max(p0, p1); }
-#endif
-
-// CHECK: define noundef i32 @
-// CHECK: call i32 @llvm.smax.i32(
-int test_max_int(int p0, int p1) { return max(p0, p1); }
-// CHECK: define noundef <2 x i32> @
-// CHECK: call <2 x i32> @llvm.smax.v2i32
-int2 test_max_int2(int2 p0, int2 p1) { return max(p0, p1); }
-// CHECK: define noundef <3 x i32> @
-// CHECK: call <3 x i32> @llvm.smax.v3i32
-int3 test_max_int3(int3 p0, int3 p1) { return max(p0, p1); }
-// CHECK: define noundef <4 x i32> @
-// CHECK: call <4 x i32> @llvm.smax.v4i32
-int4 test_max_int4(int4 p0, int4 p1) { return max(p0, p1); }
-
-// CHECK: define noundef i32 @
-// CHECK: call i32 @llvm.umax.i32(
-int test_max_uint(uint p0, uint p1) { return max(p0, p1); }
-// CHECK: define noundef <2 x i32> @
-// CHECK: call <2 x i32> @llvm.umax.v2i32
-uint2 test_max_uint2(uint2 p0, uint2 p1) { return max(p0, p1); }
-// CHECK: define noundef <3 x i32> @
-// CHECK: call <3 x i32> @llvm.umax.v3i32
-uint3 test_max_uint3(uint3 p0, uint3 p1) { return max(p0, p1); }
-// CHECK: define noundef <4 x i32> @
-// CHECK: call <4 x i32> @llvm.umax.v4i32
-uint4 test_max_uint4(uint4 p0, uint4 p1) { return max(p0, p1); }
-
-// CHECK: define noundef i64 @
-// CHECK: call i64 @llvm.smax.i64(
-int64_t test_max_long(int64_t p0, int64_t p1) { return max(p0, p1); }
-// CHECK: define noundef <2 x i64> @
-// CHECK: call <2 x i64> @llvm.smax.v2i64
-int64_t2 test_max_long2(int64_t2 p0, int64_t2 p1) { return max(p0, p1); }
-// CHECK: define noundef <3 x i64> @
-// CHECK: call <3 x i64> @llvm.smax.v3i64
-int64_t3 test_max_long3(int64_t3 p0, int64_t3 p1) { return max(p0, p1); }
-// CHECK: define noundef <4 x i64> @
-// CHECK: call <4 x i64> @llvm.smax.v4i64
-int64_t4 test_max_long4(int64_t4 p0, int64_t4 p1) { return max(p0, p1); }
-
-// CHECK: define noundef i64 @
-// CHECK: call i64 @llvm.umax.i64(
-uint64_t test_max_long(uint64_t p0, uint64_t p1) { return max(p0, p1); }
-// CHECK: define noundef <2 x i64> @
-// CHECK: call <2 x i64> @llvm.umax.v2i64
-uint64_t2 test_max_long2(uint64_t2 p0, uint64_t2 p1) { return max(p0, p1); }
-// CHECK: define noundef <3 x i64> @
-// CHECK: call <3 x i64> @llvm.umax.v3i64
-uint64_t3 test_max_long3(uint64_t3 p0, uint64_t3 p1) { return max(p0, p1); }
-// CHECK: define noundef <4 x i64> @
-// CHECK: call <4 x i64> @llvm.umax.v4i64
-uint64_t4 test_max_long4(uint64_t4 p0, uint64_t4 p1) { return max(p0, p1); }
-
-// NATIVE_HALF: define noundef half @
-// NATIVE_HALF: call half @llvm.maxnum.f16(
-// NO_HALF: define noundef float @"?test_max_half
-// NO_HALF: call float @llvm.maxnum.f32(
-half test_max_half(half p0, half p1) { return max(p0, p1); }
-// NATIVE_HALF: define noundef <2 x half> @
-// NATIVE_HALF: call <2 x half> @llvm.maxnum.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_max_half2
-// NO_HALF: call <2 x float> @llvm.maxnum.v2f32(
-half2 test_max_half2(half2 p0, half2 p1) { return max(p0, p1); }
-// NATIVE_HALF: define noundef <3 x half> @
-// NATIVE_HALF: call <3 x half> @llvm.maxnum.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_max_half3
-// NO_HALF: call <3 x float> @llvm.maxnum.v3f32(
-half3 test_max_half3(half3 p0, half3 p1) { return max(p0, p1); }
-// NATIVE_HALF: define noundef <4 x half> @
-// NATIVE_HALF: call <4 x half> @llvm.maxnum.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_max_half4
-// NO_HALF: call <4 x float> @llvm.maxnum.v4f32(
-half4 test_max_half4(half4 p0, half4 p1) { return max(p0, p1); }
-
-// CHECK: define noundef float @"?test_max_float
-// CHECK: call float @llvm.maxnum.f32(
-float test_max_float(float p0, float p1) { return max(p0, p1); }
-// CHECK: define noundef <2 x float> @"?test_max_float2
-// CHECK: call <2 x float> @llvm.maxnum.v2f32
-float2 test_max_float2(float2 p0, float2 p1) { return max(p0, p1); }
-// CHECK: define noundef <3 x float> @"?test_max_float3
-// CHECK: call <3 x float> @llvm.maxnum.v3f32
-float3 test_max_float3(float3 p0, float3 p1) { return max(p0, p1); }
-// CHECK: define noundef <4 x float> @"?test_max_float4
-// CHECK: call <4 x float> @llvm.maxnum.v4f32
-float4 test_max_float4(float4 p0, float4 p1) { return max(p0, p1); }
-
-// CHECK: define noundef double @
-// CHECK: call double @llvm.maxnum.f64(
-double test_max_double(double p0, double p1) { return max(p0, p1); }
-// CHECK: define noundef <2 x double> @
-// CHECK: call <2 x double> @llvm.maxnum.v2f64
-double2 test_max_double2(double2 p0, double2 p1) { return max(p0, p1); }
-// CHECK: define noundef <3 x double> @
-// CHECK: call <3 x double> @llvm.maxnum.v3f64
-double3 test_max_double3(double3 p0, double3 p1) { return max(p0, p1); }
-// CHECK: define noundef <4 x double> @
-// CHECK: call <4 x double> @llvm.maxnum.v4f64
-double4 test_max_double4(double4 p0, double4 p1) { return max(p0, p1); }
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+#ifdef __HLSL_ENABLE_16_BIT
+// NATIVE_HALF: define noundef i16 @
+// NATIVE_HALF: call i16 @llvm.smax.i16(
+int16_t test_max_short(int16_t p0, int16_t p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <2 x i16> @
+// NATIVE_HALF: call <2 x i16> @llvm.smax.v2i16(
+int16_t2 test_max_short2(int16_t2 p0, int16_t2 p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <3 x i16> @
+// NATIVE_HALF: call <3 x i16> @llvm.smax.v3i16
+int16_t3 test_max_short3(int16_t3 p0, int16_t3 p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <4 x i16> @
+// NATIVE_HALF: call <4 x i16> @llvm.smax.v4i16
+int16_t4 test_max_short4(int16_t4 p0, int16_t4 p1) { return max(p0, p1); }
+
+// NATIVE_HALF: define noundef i16 @
+// NATIVE_HALF: call i16 @llvm.umax.i16(
+uint16_t test_max_ushort(uint16_t p0, uint16_t p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <2 x i16> @
+// NATIVE_HALF: call <2 x i16> @llvm.umax.v2i16
+uint16_t2 test_max_ushort2(uint16_t2 p0, uint16_t2 p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <3 x i16> @
+// NATIVE_HALF: call <3 x i16> @llvm.umax.v3i16
+uint16_t3 test_max_ushort3(uint16_t3 p0, uint16_t3 p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <4 x i16> @
+// NATIVE_HALF: call <4 x i16> @llvm.umax.v4i16
+uint16_t4 test_max_ushort4(uint16_t4 p0, uint16_t4 p1) { return max(p0, p1); }
+#endif
+
+// CHECK: define noundef i32 @
+// CHECK: call i32 @llvm.smax.i32(
+int test_max_int(int p0, int p1) { return max(p0, p1); }
+// CHECK: define noundef <2 x i32> @
+// CHECK: call <2 x i32> @llvm.smax.v2i32
+int2 test_max_int2(int2 p0, int2 p1) { return max(p0, p1); }
+// CHECK: define noundef <3 x i32> @
+// CHECK: call <3 x i32> @llvm.smax.v3i32
+int3 test_max_int3(int3 p0, int3 p1) { return max(p0, p1); }
+// CHECK: define noundef <4 x i32> @
+// CHECK: call <4 x i32> @llvm.smax.v4i32
+int4 test_max_int4(int4 p0, int4 p1) { return max(p0, p1); }
+
+// CHECK: define noundef i32 @
+// CHECK: call i32 @llvm.umax.i32(
+int test_max_uint(uint p0, uint p1) { return max(p0, p1); }
+// CHECK: define noundef <2 x i32> @
+// CHECK: call <2 x i32> @llvm.umax.v2i32
+uint2 test_max_uint2(uint2 p0, uint2 p1) { return max(p0, p1); }
+// CHECK: define noundef <3 x i32> @
+// CHECK: call <3 x i32> @llvm.umax.v3i32
+uint3 test_max_uint3(uint3 p0, uint3 p1) { return max(p0, p1); }
+// CHECK: define noundef <4 x i32> @
+// CHECK: call <4 x i32> @llvm.umax.v4i32
+uint4 test_max_uint4(uint4 p0, uint4 p1) { return max(p0, p1); }
+
+// CHECK: define noundef i64 @
+// CHECK: call i64 @llvm.smax.i64(
+int64_t test_max_long(int64_t p0, int64_t p1) { return max(p0, p1); }
+// CHECK: define noundef <2 x i64> @
+// CHECK: call <2 x i64> @llvm.smax.v2i64
+int64_t2 test_max_long2(int64_t2 p0, int64_t2 p1) { return max(p0, p1); }
+// CHECK: define noundef <3 x i64> @
+// CHECK: call <3 x i64> @llvm.smax.v3i64
+int64_t3 test_max_long3(int64_t3 p0, int64_t3 p1) { return max(p0, p1); }
+// CHECK: define noundef <4 x i64> @
+// CHECK: call <4 x i64> @llvm.smax.v4i64
+int64_t4 test_max_long4(int64_t4 p0, int64_t4 p1) { return max(p0, p1); }
+
+// CHECK: define noundef i64 @
+// CHECK: call i64 @llvm.umax.i64(
+uint64_t test_max_long(uint64_t p0, uint64_t p1) { return max(p0, p1); }
+// CHECK: define noundef <2 x i64> @
+// CHECK: call <2 x i64> @llvm.umax.v2i64
+uint64_t2 test_max_long2(uint64_t2 p0, uint64_t2 p1) { return max(p0, p1); }
+// CHECK: define noundef <3 x i64> @
+// CHECK: call <3 x i64> @llvm.umax.v3i64
+uint64_t3 test_max_long3(uint64_t3 p0, uint64_t3 p1) { return max(p0, p1); }
+// CHECK: define noundef <4 x i64> @
+// CHECK: call <4 x i64> @llvm.umax.v4i64
+uint64_t4 test_max_long4(uint64_t4 p0, uint64_t4 p1) { return max(p0, p1); }
+
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.maxnum.f16(
+// NO_HALF: define noundef float @"?test_max_half
+// NO_HALF: call float @llvm.maxnum.f32(
+half test_max_half(half p0, half p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.maxnum.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_max_half2
+// NO_HALF: call <2 x float> @llvm.maxnum.v2f32(
+half2 test_max_half2(half2 p0, half2 p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.maxnum.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_max_half3
+// NO_HALF: call <3 x float> @llvm.maxnum.v3f32(
+half3 test_max_half3(half3 p0, half3 p1) { return max(p0, p1); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.maxnum.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_max_half4
+// NO_HALF: call <4 x float> @llvm.maxnum.v4f32(
+half4 test_max_half4(half4 p0, half4 p1) { return max(p0, p1); }
+
+// CHECK: define noundef float @"?test_max_float
+// CHECK: call float @llvm.maxnum.f32(
+float test_max_float(float p0, float p1) { return max(p0, p1); }
+// CHECK: define noundef <2 x float> @"?test_max_float2
+// CHECK: call <2 x float> @llvm.maxnum.v2f32
+float2 test_max_float2(float2 p0, float2 p1) { return max(p0, p1); }
+// CHECK: define noundef <3 x float> @"?test_max_float3
+// CHECK: call <3 x float> @llvm.maxnum.v3f32
+float3 test_max_float3(float3 p0, float3 p1) { return max(p0, p1); }
+// CHECK: define noundef <4 x float> @"?test_max_float4
+// CHECK: call <4 x float> @llvm.maxnum.v4f32
+float4 test_max_float4(float4 p0, float4 p1) { return max(p0, p1); }
+
+// CHECK: define noundef double @
+// CHECK: call double @llvm.maxnum.f64(
+double test_max_double(double p0, double p1) { return max(p0, p1); }
+// CHECK: define noundef <2 x double> @
+// CHECK: call <2 x double> @llvm.maxnum.v2f64
+double2 test_max_double2(double2 p0, double2 p1) { return max(p0, p1); }
+// CHECK: define noundef <3 x double> @
+// CHECK: call <3 x double> @llvm.maxnum.v3f64
+double3 test_max_double3(double3 p0, double3 p1) { return max(p0, p1); }
+// CHECK: define noundef <4 x double> @
+// CHECK: call <4 x double> @llvm.maxnum.v4f64
+double4 test_max_double4(double4 p0, double4 p1) { return max(p0, p1); }
diff --git a/clang/test/CodeGenHLSL/builtins/pow.hlsl b/clang/test/CodeGenHLSL/builtins/pow.hlsl
index 057cd7215aa5..9a2264e74075 100644
--- a/clang/test/CodeGenHLSL/builtins/pow.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/pow.hlsl
@@ -1,41 +1,41 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
-
-// NATIVE_HALF: define noundef half @
-// NATIVE_HALF: call half @llvm.pow.f16(
-// NO_HALF: define noundef float @"?test_pow_half
-// NO_HALF: call float @llvm.pow.f32(
-half test_pow_half(half p0, half p1) { return pow(p0, p1); }
-// NATIVE_HALF: define noundef <2 x half> @"?test_pow_half2
-// NATIVE_HALF: call <2 x half> @llvm.pow.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_pow_half2
-// NO_HALF: call <2 x float> @llvm.pow.v2f32(
-half2 test_pow_half2(half2 p0, half2 p1) { return pow(p0, p1); }
-// NATIVE_HALF: define noundef <3 x half> @"?test_pow_half3
-// NATIVE_HALF: call <3 x half> @llvm.pow.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_pow_half3
-// NO_HALF: call <3 x float> @llvm.pow.v3f32(
-half3 test_pow_half3(half3 p0, half3 p1) { return pow(p0, p1); }
-// NATIVE_HALF: define noundef <4 x half> @"?test_pow_half4
-// NATIVE_HALF: call <4 x half> @llvm.pow.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_pow_half4
-// NO_HALF: call <4 x float> @llvm.pow.v4f32(
-half4 test_pow_half4(half4 p0, half4 p1) { return pow(p0, p1); }
-
-// CHECK: define noundef float @"?test_pow_float
-// CHECK: call float @llvm.pow.f32(
-float test_pow_float(float p0, float p1) { return pow(p0, p1); }
-// CHECK: define noundef <2 x float> @"?test_pow_float2
-// CHECK: call <2 x float> @llvm.pow.v2f32
-float2 test_pow_float2(float2 p0, float2 p1) { return pow(p0, p1); }
-// CHECK: define noundef <3 x float> @"?test_pow_float3
-// CHECK: call <3 x float> @llvm.pow.v3f32
-float3 test_pow_float3(float3 p0, float3 p1) { return pow(p0, p1); }
-// CHECK: define noundef <4 x float> @"?test_pow_float4
-// CHECK: call <4 x float> @llvm.pow.v4f32
-float4 test_pow_float4(float4 p0, float4 p1) { return pow(p0, p1); }
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.pow.f16(
+// NO_HALF: define noundef float @"?test_pow_half
+// NO_HALF: call float @llvm.pow.f32(
+half test_pow_half(half p0, half p1) { return pow(p0, p1); }
+// NATIVE_HALF: define noundef <2 x half> @"?test_pow_half2
+// NATIVE_HALF: call <2 x half> @llvm.pow.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_pow_half2
+// NO_HALF: call <2 x float> @llvm.pow.v2f32(
+half2 test_pow_half2(half2 p0, half2 p1) { return pow(p0, p1); }
+// NATIVE_HALF: define noundef <3 x half> @"?test_pow_half3
+// NATIVE_HALF: call <3 x half> @llvm.pow.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_pow_half3
+// NO_HALF: call <3 x float> @llvm.pow.v3f32(
+half3 test_pow_half3(half3 p0, half3 p1) { return pow(p0, p1); }
+// NATIVE_HALF: define noundef <4 x half> @"?test_pow_half4
+// NATIVE_HALF: call <4 x half> @llvm.pow.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_pow_half4
+// NO_HALF: call <4 x float> @llvm.pow.v4f32(
+half4 test_pow_half4(half4 p0, half4 p1) { return pow(p0, p1); }
+
+// CHECK: define noundef float @"?test_pow_float
+// CHECK: call float @llvm.pow.f32(
+float test_pow_float(float p0, float p1) { return pow(p0, p1); }
+// CHECK: define noundef <2 x float> @"?test_pow_float2
+// CHECK: call <2 x float> @llvm.pow.v2f32
+float2 test_pow_float2(float2 p0, float2 p1) { return pow(p0, p1); }
+// CHECK: define noundef <3 x float> @"?test_pow_float3
+// CHECK: call <3 x float> @llvm.pow.v3f32
+float3 test_pow_float3(float3 p0, float3 p1) { return pow(p0, p1); }
+// CHECK: define noundef <4 x float> @"?test_pow_float4
+// CHECK: call <4 x float> @llvm.pow.v4f32
+float4 test_pow_float4(float4 p0, float4 p1) { return pow(p0, p1); }
diff --git a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
index a319417e97a4..fe137b9cae4e 100644
--- a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
@@ -1,80 +1,80 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
-
-#ifdef __HLSL_ENABLE_16_BIT
-// CHECK: define noundef i16 @
-// CHECK: call i16 @llvm.bitreverse.i16(
-uint16_t test_bitreverse_ushort(uint16_t p0)
-{
-	return reversebits(p0);
-}
-// CHECK: define noundef <2 x i16> @
-// CHECK: call <2 x i16> @llvm.bitreverse.v2i16
-uint16_t2 test_bitreverse_ushort2(uint16_t2 p0)
-{
-	return reversebits(p0);
-}
-// CHECK: define noundef <3 x i16> @
-// CHECK: call <3 x i16> @llvm.bitreverse.v3i16
-uint16_t3 test_bitreverse_ushort3(uint16_t3 p0)
-{
-	return reversebits(p0);
-}
-// CHECK: define noundef <4 x i16> @
-// CHECK: call <4 x i16> @llvm.bitreverse.v4i16
-uint16_t4 test_bitreverse_ushort4(uint16_t4 p0)
-{
-	return reversebits(p0);
-}
-#endif
-
-// CHECK: define noundef i32 @
-// CHECK: call i32 @llvm.bitreverse.i32(
-int test_bitreverse_uint(uint p0)
-{
-	return reversebits(p0);
-}
-// CHECK: define noundef <2 x i32> @
-// CHECK: call <2 x i32> @llvm.bitreverse.v2i32
-uint2 test_bitreverse_uint2(uint2 p0)
-{
-	return reversebits(p0);
-}
-// CHECK: define noundef <3 x i32> @
-// CHECK: call <3 x i32> @llvm.bitreverse.v3i32
-uint3 test_bitreverse_uint3(uint3 p0)
-{
-	return reversebits(p0);
-}
-// CHECK: define noundef <4 x i32> @
-// CHECK: call <4 x i32> @llvm.bitreverse.v4i32
-uint4 test_bitreverse_uint4(uint4 p0)
-{
-	return reversebits(p0);
-}
-
-// CHECK: define noundef i64 @
-// CHECK: call i64 @llvm.bitreverse.i64(
-uint64_t test_bitreverse_long(uint64_t p0)
-{
-	return reversebits(p0);
-}
-// CHECK: define noundef <2 x i64> @
-// CHECK: call <2 x i64> @llvm.bitreverse.v2i64
-uint64_t2 test_bitreverse_long2(uint64_t2 p0)
-{
-	return reversebits(p0);
-}
-// CHECK: define noundef <3 x i64> @
-// CHECK: call <3 x i64> @llvm.bitreverse.v3i64
-uint64_t3 test_bitreverse_long3(uint64_t3 p0)
-{
-	return reversebits(p0);
-}
-// CHECK: define noundef <4 x i64> @
-// CHECK: call <4 x i64> @llvm.bitreverse.v4i64
-uint64_t4 test_bitreverse_long4(uint64_t4 p0)
-{
-	return reversebits(p0);
-}
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
+
+#ifdef __HLSL_ENABLE_16_BIT
+// CHECK: define noundef i16 @
+// CHECK: call i16 @llvm.bitreverse.i16(
+uint16_t test_bitreverse_ushort(uint16_t p0)
+{
+	return reversebits(p0);
+}
+// CHECK: define noundef <2 x i16> @
+// CHECK: call <2 x i16> @llvm.bitreverse.v2i16
+uint16_t2 test_bitreverse_ushort2(uint16_t2 p0)
+{
+	return reversebits(p0);
+}
+// CHECK: define noundef <3 x i16> @
+// CHECK: call <3 x i16> @llvm.bitreverse.v3i16
+uint16_t3 test_bitreverse_ushort3(uint16_t3 p0)
+{
+	return reversebits(p0);
+}
+// CHECK: define noundef <4 x i16> @
+// CHECK: call <4 x i16> @llvm.bitreverse.v4i16
+uint16_t4 test_bitreverse_ushort4(uint16_t4 p0)
+{
+	return reversebits(p0);
+}
+#endif
+
+// CHECK: define noundef i32 @
+// CHECK: call i32 @llvm.bitreverse.i32(
+int test_bitreverse_uint(uint p0)
+{
+	return reversebits(p0);
+}
+// CHECK: define noundef <2 x i32> @
+// CHECK: call <2 x i32> @llvm.bitreverse.v2i32
+uint2 test_bitreverse_uint2(uint2 p0)
+{
+	return reversebits(p0);
+}
+// CHECK: define noundef <3 x i32> @
+// CHECK: call <3 x i32> @llvm.bitreverse.v3i32
+uint3 test_bitreverse_uint3(uint3 p0)
+{
+	return reversebits(p0);
+}
+// CHECK: define noundef <4 x i32> @
+// CHECK: call <4 x i32> @llvm.bitreverse.v4i32
+uint4 test_bitreverse_uint4(uint4 p0)
+{
+	return reversebits(p0);
+}
+
+// CHECK: define noundef i64 @
+// CHECK: call i64 @llvm.bitreverse.i64(
+uint64_t test_bitreverse_long(uint64_t p0)
+{
+	return reversebits(p0);
+}
+// CHECK: define noundef <2 x i64> @
+// CHECK: call <2 x i64> @llvm.bitreverse.v2i64
+uint64_t2 test_bitreverse_long2(uint64_t2 p0)
+{
+	return reversebits(p0);
+}
+// CHECK: define noundef <3 x i64> @
+// CHECK: call <3 x i64> @llvm.bitreverse.v3i64
+uint64_t3 test_bitreverse_long3(uint64_t3 p0)
+{
+	return reversebits(p0);
+}
+// CHECK: define noundef <4 x i64> @
+// CHECK: call <4 x i64> @llvm.bitreverse.v4i64
+uint64_t4 test_bitreverse_long4(uint64_t4 p0)
+{
+	return reversebits(p0);
+}
diff --git a/clang/test/CodeGenHLSL/builtins/sin.hlsl b/clang/test/CodeGenHLSL/builtins/sin.hlsl
index ffb522149138..83e8a5be39d0 100644
--- a/clang/test/CodeGenHLSL/builtins/sin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sin.hlsl
@@ -1,41 +1,41 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes  -o - | FileCheck %s \
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
-
-// NATIVE_HALF: define noundef half @
-// NATIVE_HALF: call half @llvm.sin.f16(
-// NO_HALF: define noundef float @"?test_sin_half@@YA$halff@$halff@@Z"(
-// NO_HALF: call float @llvm.sin.f32(
-half test_sin_half(half p0) { return sin(p0); }
-// NATIVE_HALF: define noundef <2 x half> @
-// NATIVE_HALF: call <2 x half> @llvm.sin.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_sin_half2
-// NO_HALF: call <2 x float> @llvm.sin.v2f32(
-half2 test_sin_half2(half2 p0) { return sin(p0); }
-// NATIVE_HALF: define noundef <3 x half> @
-// NATIVE_HALF: call <3 x half> @llvm.sin.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_sin_half3
-// NO_HALF: call <3 x float> @llvm.sin.v3f32(
-half3 test_sin_half3(half3 p0) { return sin(p0); }
-// NATIVE_HALF: define noundef <4 x half> @
-// NATIVE_HALF: call <4 x half> @llvm.sin.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_sin_half4
-// NO_HALF: call <4 x float> @llvm.sin.v4f32(
-half4 test_sin_half4(half4 p0) { return sin(p0); }
-
-// CHECK: define noundef float @
-// CHECK: call float @llvm.sin.f32(
-float test_sin_float(float p0) { return sin(p0); }
-// CHECK: define noundef <2 x float> @
-// CHECK: call <2 x float> @llvm.sin.v2f32
-float2 test_sin_float2(float2 p0) { return sin(p0); }
-// CHECK: define noundef <3 x float> @
-// CHECK: call <3 x float> @llvm.sin.v3f32
-float3 test_sin_float3(float3 p0) { return sin(p0); }
-// CHECK: define noundef <4 x float> @
-// CHECK: call <4 x float> @llvm.sin.v4f32
-float4 test_sin_float4(float4 p0) { return sin(p0); }
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes  -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.sin.f16(
+// NO_HALF: define noundef float @"?test_sin_half@@YA$halff@$halff@@Z"(
+// NO_HALF: call float @llvm.sin.f32(
+half test_sin_half(half p0) { return sin(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.sin.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_sin_half2
+// NO_HALF: call <2 x float> @llvm.sin.v2f32(
+half2 test_sin_half2(half2 p0) { return sin(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.sin.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_sin_half3
+// NO_HALF: call <3 x float> @llvm.sin.v3f32(
+half3 test_sin_half3(half3 p0) { return sin(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.sin.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_sin_half4
+// NO_HALF: call <4 x float> @llvm.sin.v4f32(
+half4 test_sin_half4(half4 p0) { return sin(p0); }
+
+// CHECK: define noundef float @
+// CHECK: call float @llvm.sin.f32(
+float test_sin_float(float p0) { return sin(p0); }
+// CHECK: define noundef <2 x float> @
+// CHECK: call <2 x float> @llvm.sin.v2f32
+float2 test_sin_float2(float2 p0) { return sin(p0); }
+// CHECK: define noundef <3 x float> @
+// CHECK: call <3 x float> @llvm.sin.v3f32
+float3 test_sin_float3(float3 p0) { return sin(p0); }
+// CHECK: define noundef <4 x float> @
+// CHECK: call <4 x float> @llvm.sin.v4f32
+float4 test_sin_float4(float4 p0) { return sin(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/tan.hlsl b/clang/test/CodeGenHLSL/builtins/tan.hlsl
new file mode 100644
index 000000000000..aa542fac226d
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/tan.hlsl
@@ -0,0 +1,59 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// CHECK-LABEL: test_tan_half
+// NATIVE_HALF: call half @llvm.tan.f16
+// NO_HALF: call float @llvm.tan.f32
+half test_tan_half ( half p0 ) {
+  return tan ( p0 );
+}
+
+// CHECK-LABEL: test_tan_half2
+// NATIVE_HALF: call <2 x half> @llvm.tan.v2f16
+// NO_HALF: call <2 x float> @llvm.tan.v2f32
+half2 test_tan_half2 ( half2 p0 ) {
+  return tan ( p0 );
+}
+
+// CHECK-LABEL: test_tan_half3
+// NATIVE_HALF: call <3 x half> @llvm.tan.v3f16
+// NO_HALF: call <3 x float> @llvm.tan.v3f32
+half3 test_tan_half3 ( half3 p0 ) {
+  return tan ( p0 );
+}
+
+// CHECK-LABEL: test_tan_half4
+// NATIVE_HALF: call <4 x half> @llvm.tan.v4f16
+// NO_HALF: call <4 x float> @llvm.tan.v4f32
+half4 test_tan_half4 ( half4 p0 ) {
+  return tan ( p0 );
+}
+
+// CHECK-LABEL: test_tan_float
+// CHECK: call float @llvm.tan.f32
+float test_tan_float ( float p0 ) {
+  return tan ( p0 );
+}
+
+// CHECK-LABEL: test_tan_float2
+// CHECK: call <2 x float> @llvm.tan.v2f32
+float2 test_tan_float2 ( float2 p0 ) {
+  return tan ( p0 );
+}
+
+// CHECK-LABEL: test_tan_float3
+// CHECK: call <3 x float> @llvm.tan.v3f32
+float3 test_tan_float3 ( float3 p0 ) {
+  return tan ( p0 );
+}
+
+// CHECK-LABEL: test_tan_float4
+// CHECK: call <4 x float> @llvm.tan.v4f32
+float4 test_tan_float4 ( float4 p0 ) {
+  return tan ( p0 );
+}
diff --git a/clang/test/CodeGenHLSL/builtins/trunc.hlsl b/clang/test/CodeGenHLSL/builtins/trunc.hlsl
index 6078aae5f873..40b71f45a9cc 100644
--- a/clang/test/CodeGenHLSL/builtins/trunc.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/trunc.hlsl
@@ -1,47 +1,47 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
-
-// NATIVE_HALF: define noundef half @"?test_trunc_half
-// NATIVE_HALF: call half @llvm.trunc.f16(
-// NO_HALF: define noundef float @"?test_trunc_half
-// NO_HALF: call float @llvm.trunc.f32(
-half test_trunc_half(half p0) { return trunc(p0); }
-
-// NATIVE_HALF: define noundef <2 x half> @"?test_trunc_half2
-// NATIVE_HALF: call <2 x half> @llvm.trunc.v2f16
-// NO_HALF: define noundef <2 x float> @"?test_trunc_half2
-// NO_HALF: call <2 x float> @llvm.trunc.v2f32(
-half2 test_trunc_half2(half2 p0) { return trunc(p0); }
-
-// NATIVE_HALF: define noundef <3 x half> @"?test_trunc_half3
-// NATIVE_HALF: call <3 x half> @llvm.trunc.v3f16
-// NO_HALF: define noundef <3 x float> @"?test_trunc_half3
-// NO_HALF: call <3 x float> @llvm.trunc.v3f32(
-half3 test_trunc_half3(half3 p0) { return trunc(p0); }
-
-// NATIVE_HALF: define noundef <4 x half> @"?test_trunc_half4
-// NATIVE_HALF: call <4 x half> @llvm.trunc.v4f16
-// NO_HALF: define noundef <4 x float> @"?test_trunc_half4
-// NO_HALF: call <4 x float> @llvm.trunc.v4f32(
-half4 test_trunc_half4(half4 p0) { return trunc(p0); }
-
-// CHECK: define noundef float @"?test_trunc_float
-// CHECK: call float @llvm.trunc.f32(
-float test_trunc_float(float p0) { return trunc(p0); }
-
-// CHECK: define noundef <2 x float> @"?test_trunc_float2
-// CHECK: call <2 x float> @llvm.trunc.v2f32
-float2 test_trunc_float2(float2 p0) { return trunc(p0); }
-
-// CHECK: define noundef <3 x float> @"?test_trunc_float3
-// CHECK: call <3 x float> @llvm.trunc.v3f32
-float3 test_trunc_float3(float3 p0) { return trunc(p0); }
-
-// CHECK: define noundef <4 x float> @"?test_trunc_float4
-// CHECK: call <4 x float> @llvm.trunc.v4f32
-float4 test_trunc_float4(float4 p0) { return trunc(p0); }
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// NATIVE_HALF: define noundef half @"?test_trunc_half
+// NATIVE_HALF: call half @llvm.trunc.f16(
+// NO_HALF: define noundef float @"?test_trunc_half
+// NO_HALF: call float @llvm.trunc.f32(
+half test_trunc_half(half p0) { return trunc(p0); }
+
+// NATIVE_HALF: define noundef <2 x half> @"?test_trunc_half2
+// NATIVE_HALF: call <2 x half> @llvm.trunc.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_trunc_half2
+// NO_HALF: call <2 x float> @llvm.trunc.v2f32(
+half2 test_trunc_half2(half2 p0) { return trunc(p0); }
+
+// NATIVE_HALF: define noundef <3 x half> @"?test_trunc_half3
+// NATIVE_HALF: call <3 x half> @llvm.trunc.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_trunc_half3
+// NO_HALF: call <3 x float> @llvm.trunc.v3f32(
+half3 test_trunc_half3(half3 p0) { return trunc(p0); }
+
+// NATIVE_HALF: define noundef <4 x half> @"?test_trunc_half4
+// NATIVE_HALF: call <4 x half> @llvm.trunc.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_trunc_half4
+// NO_HALF: call <4 x float> @llvm.trunc.v4f32(
+half4 test_trunc_half4(half4 p0) { return trunc(p0); }
+
+// CHECK: define noundef float @"?test_trunc_float
+// CHECK: call float @llvm.trunc.f32(
+float test_trunc_float(float p0) { return trunc(p0); }
+
+// CHECK: define noundef <2 x float> @"?test_trunc_float2
+// CHECK: call <2 x float> @llvm.trunc.v2f32
+float2 test_trunc_float2(float2 p0) { return trunc(p0); }
+
+// CHECK: define noundef <3 x float> @"?test_trunc_float3
+// CHECK: call <3 x float> @llvm.trunc.v3f32
+float3 test_trunc_float3(float3 p0) { return trunc(p0); }
+
+// CHECK: define noundef <4 x float> @"?test_trunc_float4
+// CHECK: call <4 x float> @llvm.trunc.v4f32
+float4 test_trunc_float4(float4 p0) { return trunc(p0); }
diff --git a/clang/test/CodeGenHLSL/disable_opt.hlsl b/clang/test/CodeGenHLSL/disable_opt.hlsl
index 9bd92a797cb8..bfffe76cfa9d 100644
--- a/clang/test/CodeGenHLSL/disable_opt.hlsl
+++ b/clang/test/CodeGenHLSL/disable_opt.hlsl
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -S -triple dxil-pc-shadermodel6.3-library -O0 -emit-llvm -xhlsl  -o - %s | FileCheck %s
-// RUN: %clang_cc1 -S -triple dxil-pc-shadermodel6.3-library -O3 -emit-llvm -xhlsl  -o - %s | FileCheck %s --check-prefix=OPT
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -O0 -emit-llvm -xhlsl -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -O3 -emit-llvm -xhlsl -o - %s | FileCheck %s --check-prefix=OPT
 
 // CHECK:!"dx.disable_optimizations", i32 1}
 
diff --git a/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl b/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl
index 3efc36baa35b..2004a9d894a5 100644
--- a/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl
+++ b/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl
@@ -1,25 +1,25 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
-// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
-
-// Make sure SV_DispatchThreadID translated into dx.thread.id.
-
-// CHECK:       define void @foo()
-// CHECK-DXIL:  %[[#ID:]] = call i32 @llvm.dx.thread.id(i32 0)
-// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.spv.thread.id(i32 0)
-// CHECK:       call void @{{.*}}foo{{.*}}(i32 %[[#ID]])
-[shader("compute")]
-[numthreads(8,8,1)]
-void foo(uint Idx : SV_DispatchThreadID) {}
-
-// CHECK:       define void @bar()
-// CHECK-DXIL:  %[[#ID_X:]] = call i32 @llvm.dx.thread.id(i32 0)
-// CHECK-SPIRV: %[[#ID_X:]] = call i32 @llvm.spv.thread.id(i32 0)
-// CHECK:       %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0
-// CHECK-DXIL:  %[[#ID_Y:]] = call i32 @llvm.dx.thread.id(i32 1)
-// CHECK-SPIRV: %[[#ID_Y:]] = call i32 @llvm.spv.thread.id(i32 1)
-// CHECK:       %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1
-// CHECK-DXIL:  call void @{{.*}}bar{{.*}}(<2 x i32> %[[#ID_XY]])
-[shader("compute")]
-[numthreads(8,8,1)]
-void bar(uint2 Idx : SV_DispatchThreadID) {}
-
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
+// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
+
+// Make sure SV_DispatchThreadID translated into dx.thread.id.
+
+// CHECK:       define void @foo()
+// CHECK-DXIL:  %[[#ID:]] = call i32 @llvm.dx.thread.id(i32 0)
+// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.spv.thread.id(i32 0)
+// CHECK:       call void @{{.*}}foo{{.*}}(i32 %[[#ID]])
+[shader("compute")]
+[numthreads(8,8,1)]
+void foo(uint Idx : SV_DispatchThreadID) {}
+
+// CHECK:       define void @bar()
+// CHECK-DXIL:  %[[#ID_X:]] = call i32 @llvm.dx.thread.id(i32 0)
+// CHECK-SPIRV: %[[#ID_X:]] = call i32 @llvm.spv.thread.id(i32 0)
+// CHECK:       %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0
+// CHECK-DXIL:  %[[#ID_Y:]] = call i32 @llvm.dx.thread.id(i32 1)
+// CHECK-SPIRV: %[[#ID_Y:]] = call i32 @llvm.spv.thread.id(i32 1)
+// CHECK:       %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1
+// CHECK-DXIL:  call void @{{.*}}bar{{.*}}(<2 x i32> %[[#ID_XY]])
+[shader("compute")]
+[numthreads(8,8,1)]
+void bar(uint2 Idx : SV_DispatchThreadID) {}
+
diff --git a/clang/test/CodeGenHLSL/sret_output.hlsl b/clang/test/CodeGenHLSL/sret_output.hlsl
index 33f88c639525..c44914f963a9 100644
--- a/clang/test/CodeGenHLSL/sret_output.hlsl
+++ b/clang/test/CodeGenHLSL/sret_output.hlsl
@@ -1,22 +1,22 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.3-library %s  \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
-
-// FIXME: add semantic to a.
-// See https://github.com/llvm/llvm-project/issues/57874
-struct S {
-  float a;
-};
-
-
-// Make sure sret parameter is generated.
-// CHECK:define internal void @"?ps_main@@YA?AUS@@XZ"(ptr dead_on_unwind noalias writable sret(%struct.S) align 4 %agg.result)
-// FIXME: change it to real value instead of poison value once semantic is add to a.
-// Make sure the function with sret is called.
-// CHECK:call void @"?ps_main@@YA?AUS@@XZ"(ptr poison)
-[shader("pixel")]
-S ps_main() {
-  S s;
-  s.a = 0;
-  return s;
-};
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s  \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// FIXME: add semantic to a.
+// See https://github.com/llvm/llvm-project/issues/57874
+struct S {
+  float a;
+};
+
+
+// Make sure sret parameter is generated.
+// CHECK:define internal void @"?ps_main@@YA?AUS@@XZ"(ptr dead_on_unwind noalias writable sret(%struct.S) align 4 %agg.result)
+// FIXME: change it to real value instead of poison value once semantic is add to a.
+// Make sure the function with sret is called.
+// CHECK:call void @"?ps_main@@YA?AUS@@XZ"(ptr poison)
+[shader("pixel")]
+S ps_main() {
+  S s;
+  s.a = 0;
+  return s;
+};
diff --git a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl
index 92504dfbd626..d2c630a1fb13 100644
--- a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl
+++ b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl
@@ -1,55 +1,55 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -disable-llvm-passes -o - -std=hlsl202x %s | FileCheck %s
-
-struct Pair {
-  int First;
-  int Second;
-  int getFirst() {
-    Pair Another = {5, 10};
-    this = Another;
-      return this.First;
-  }
-  int getSecond() {
-    this = Pair();
-    return Second;
-  }
-  void operator=(Pair P) {
-    First = P.First;
-    Second = 2;
-  }
-};
-[numthreads(1, 1, 1)]
-void main() {
-  Pair Vals = {1, 2};
-  Vals.First = Vals.getFirst();
-  Vals.Second = Vals.getSecond();
-}
-
-// This test makes a probably safe assumption that HLSL 202x includes operator overloading for assignment operators.
-// CHECK:     define linkonce_odr noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #2 align 2 {
-// CHECK-NEXT:entry:
-// CHECK-NEXT:%this.addr = alloca ptr, align 4
-// CHECK-NEXT:%Another = alloca %struct.Pair, align 4
-// CHECK-NEXT:%agg.tmp = alloca %struct.Pair, align 4
-// CHECK-NEXT:store ptr %this, ptr %this.addr, align 4
-// CHECK-NEXT:%this1 = load ptr, ptr %this.addr, align 4
-// CHECK-NEXT:%First = getelementptr inbounds %struct.Pair, ptr %Another, i32 0, i32 0
-// CHECK-NEXT:store i32 5, ptr %First, align 4
-// CHECK-NEXT:%Second = getelementptr inbounds %struct.Pair, ptr %Another, i32 0, i32 1
-// CHECK-NEXT:store i32 10, ptr %Second, align 4
-// CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %agg.tmp, ptr align 4 %Another, i32 8, i1 false)
-// CHECK-NEXT:call void @"??4Pair@@QAAXU0@@Z"(ptr noundef nonnull align 4 dereferenceable(8) %this1, ptr noundef byval(%struct.Pair) align 4 %agg.tmp)
-// CHECK-NEXT:%First2 = getelementptr inbounds %struct.Pair, ptr %this1, i32 0, i32 0
-// CHECK-NEXT:%0 = load i32, ptr %First2, align 4
-// CHECK-NEXT:ret i32 %0
-
-// CHECK:     define linkonce_odr noundef i32 @"?getSecond@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #2 align 2 {
-// CHECK-NEXT:entry:
-// CHECK-NEXT:%this.addr = alloca ptr, align 4
-// CHECK-NEXT:%agg.tmp = alloca %struct.Pair, align 4
-// CHECK-NEXT:store ptr %this, ptr %this.addr, align 4
-// CHECK-NEXT:%this1 = load ptr, ptr %this.addr, align 4
-// CHECK-NEXT:call void @llvm.memset.p0.i32(ptr align 4 %agg.tmp, i8 0, i32 8, i1 false)
-// CHECK-NEXT:call void @"??4Pair@@QAAXU0@@Z"(ptr noundef nonnull align 4 dereferenceable(8) %this1, ptr noundef byval(%struct.Pair) align 4 %agg.tmp)
-// CHECK-NEXT:%Second = getelementptr inbounds %struct.Pair, ptr %this1, i32 0, i32 1
-// CHECK-NEXT:%0 = load i32, ptr %Second, align 4
-// CHECK-NEXT:ret i32 %0
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -disable-llvm-passes -o - -std=hlsl202x %s | FileCheck %s
+
+struct Pair {
+  int First;
+  int Second;
+  int getFirst() {
+    Pair Another = {5, 10};
+    this = Another;
+      return this.First;
+  }
+  int getSecond() {
+    this = Pair();
+    return Second;
+  }
+  void operator=(Pair P) {
+    First = P.First;
+    Second = 2;
+  }
+};
+[numthreads(1, 1, 1)]
+void main() {
+  Pair Vals = {1, 2};
+  Vals.First = Vals.getFirst();
+  Vals.Second = Vals.getSecond();
+}
+
+// This test makes a probably safe assumption that HLSL 202x includes operator overloading for assignment operators.
+// CHECK:     define linkonce_odr noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #2 align 2 {
+// CHECK-NEXT:entry:
+// CHECK-NEXT:%this.addr = alloca ptr, align 4
+// CHECK-NEXT:%Another = alloca %struct.Pair, align 4
+// CHECK-NEXT:%agg.tmp = alloca %struct.Pair, align 4
+// CHECK-NEXT:store ptr %this, ptr %this.addr, align 4
+// CHECK-NEXT:%this1 = load ptr, ptr %this.addr, align 4
+// CHECK-NEXT:%First = getelementptr inbounds %struct.Pair, ptr %Another, i32 0, i32 0
+// CHECK-NEXT:store i32 5, ptr %First, align 4
+// CHECK-NEXT:%Second = getelementptr inbounds %struct.Pair, ptr %Another, i32 0, i32 1
+// CHECK-NEXT:store i32 10, ptr %Second, align 4
+// CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %agg.tmp, ptr align 4 %Another, i32 8, i1 false)
+// CHECK-NEXT:call void @"??4Pair@@QAAXU0@@Z"(ptr noundef nonnull align 4 dereferenceable(8) %this1, ptr noundef byval(%struct.Pair) align 4 %agg.tmp)
+// CHECK-NEXT:%First2 = getelementptr inbounds %struct.Pair, ptr %this1, i32 0, i32 0
+// CHECK-NEXT:%0 = load i32, ptr %First2, align 4
+// CHECK-NEXT:ret i32 %0
+
+// CHECK:     define linkonce_odr noundef i32 @"?getSecond@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #2 align 2 {
+// CHECK-NEXT:entry:
+// CHECK-NEXT:%this.addr = alloca ptr, align 4
+// CHECK-NEXT:%agg.tmp = alloca %struct.Pair, align 4
+// CHECK-NEXT:store ptr %this, ptr %this.addr, align 4
+// CHECK-NEXT:%this1 = load ptr, ptr %this.addr, align 4
+// CHECK-NEXT:call void @llvm.memset.p0.i32(ptr align 4 %agg.tmp, i8 0, i32 8, i1 false)
+// CHECK-NEXT:call void @"??4Pair@@QAAXU0@@Z"(ptr noundef nonnull align 4 dereferenceable(8) %this1, ptr noundef byval(%struct.Pair) align 4 %agg.tmp)
+// CHECK-NEXT:%Second = getelementptr inbounds %struct.Pair, ptr %this1, i32 0, i32 1
+// CHECK-NEXT:%0 = load i32, ptr %Second, align 4
+// CHECK-NEXT:ret i32 %0
diff --git a/clang/test/CodeGenHLSL/this-assignment.hlsl b/clang/test/CodeGenHLSL/this-assignment.hlsl
index bb67fb6e103c..74b4a2eb8150 100644
--- a/clang/test/CodeGenHLSL/this-assignment.hlsl
+++ b/clang/test/CodeGenHLSL/this-assignment.hlsl
@@ -1,45 +1,45 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - -hlsl-entry main %s | FileCheck %s
-
-struct Pair {
-  int First;
-  int Second;
-
-  int getFirst() {
-    Pair Another = {5, 10};
-    this = Another;
-	  return this.First;
-  }
-
-  int getSecond() {
-    this = Pair();
-    return Second;
-  }
-};
-
-[numthreads(1, 1, 1)]
-void main() {
-  Pair Vals = {1, 2.0};
-  Vals.First = Vals.getFirst();
-  Vals.Second = Vals.getSecond();
-}
-
-// This tests reference like implicit this in HLSL
-// CHECK:     define linkonce_odr noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #3 align 2 {
-// CHECK-NEXT:entry:
-// CHECK-NEXT:%this.addr = alloca ptr, align 4
-// CHECK-NEXT:%Another = alloca %struct.Pair, align 4
-// CHECK-NEXT:store ptr %this, ptr %this.addr, align 4
-// CHECK-NEXT:%this1 = load ptr, ptr %this.addr, align 4
-// CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %Another, ptr align 4 @"__const.?getFirst@Pair@@QAAHXZ.Another", i32 8, i1 false)
-// CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %this1, ptr align 4 %Another, i32 8, i1 false)
-// CHECK-NEXT:%First = getelementptr inbounds %struct.Pair, ptr %this1, i32 0, i32 0
-
-// CHECK:     define linkonce_odr noundef i32 @"?getSecond@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #3 align 2 {
-// CHECK-NEXT:entry:
-// CHECK-NEXT:%this.addr = alloca ptr, align 4
-// CHECK-NEXT:%ref.tmp = alloca %struct.Pair, align 4
-// CHECK-NEXT:store ptr %this, ptr %this.addr, align 4
-// CHECK-NEXT:%this1 = load ptr, ptr %this.addr, align 4
-// CHECK-NEXT:call void @llvm.memset.p0.i32(ptr align 4 %ref.tmp, i8 0, i32 8, i1 false)
-// CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %this1, ptr align 4 %ref.tmp, i32 8, i1 false)
-// CHECK-NEXT:%Second = getelementptr inbounds %struct.Pair, ptr %this1, i32 0, i32 1
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - -hlsl-entry main %s | FileCheck %s
+
+struct Pair {
+  int First;
+  int Second;
+
+  int getFirst() {
+    Pair Another = {5, 10};
+    this = Another;
+	  return this.First;
+  }
+
+  int getSecond() {
+    this = Pair();
+    return Second;
+  }
+};
+
+[numthreads(1, 1, 1)]
+void main() {
+  Pair Vals = {1, 2.0};
+  Vals.First = Vals.getFirst();
+  Vals.Second = Vals.getSecond();
+}
+
+// This tests reference like implicit this in HLSL
+// CHECK:     define linkonce_odr noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #3 align 2 {
+// CHECK-NEXT:entry:
+// CHECK-NEXT:%this.addr = alloca ptr, align 4
+// CHECK-NEXT:%Another = alloca %struct.Pair, align 4
+// CHECK-NEXT:store ptr %this, ptr %this.addr, align 4
+// CHECK-NEXT:%this1 = load ptr, ptr %this.addr, align 4
+// CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %Another, ptr align 4 @"__const.?getFirst@Pair@@QAAHXZ.Another", i32 8, i1 false)
+// CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %this1, ptr align 4 %Another, i32 8, i1 false)
+// CHECK-NEXT:%First = getelementptr inbounds %struct.Pair, ptr %this1, i32 0, i32 0
+
+// CHECK:     define linkonce_odr noundef i32 @"?getSecond@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #3 align 2 {
+// CHECK-NEXT:entry:
+// CHECK-NEXT:%this.addr = alloca ptr, align 4
+// CHECK-NEXT:%ref.tmp = alloca %struct.Pair, align 4
+// CHECK-NEXT:store ptr %this, ptr %this.addr, align 4
+// CHECK-NEXT:%this1 = load ptr, ptr %this.addr, align 4
+// CHECK-NEXT:call void @llvm.memset.p0.i32(ptr align 4 %ref.tmp, i8 0, i32 8, i1 false)
+// CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %this1, ptr align 4 %ref.tmp, i32 8, i1 false)
+// CHECK-NEXT:%Second = getelementptr inbounds %struct.Pair, ptr %this1, i32 0, i32 1
diff --git a/clang/test/CodeGenHLSL/this-reference.hlsl b/clang/test/CodeGenHLSL/this-reference.hlsl
index e57f48ccaf3a..1addc51da323 100644
--- a/clang/test/CodeGenHLSL/this-reference.hlsl
+++ b/clang/test/CodeGenHLSL/this-reference.hlsl
@@ -1,34 +1,34 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - -hlsl-entry main %s -debug-info-kind=standalone -dwarf-version=4 | FileCheck %s
-
-struct Pair {
-  int First;
-  float Second;
-
-  int getFirst() {
-	  return this.First;
-  }
-
-  float getSecond() {
-    return Second;
-  }
-};
-
-[numthreads(1, 1, 1)]
-void main() {
-  Pair Vals = {1, 2.0};
-  Vals.First = Vals.getFirst();
-  Vals.Second = Vals.getSecond();
-}
-
-// This tests reference like `this` in HLSL
-  // CHECK:       %call = call noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %Vals)
-  // CHECK-NEXT:  %First = getelementptr inbounds %struct.Pair, ptr %Vals, i32 0, i32 0
-  // CHECK-NEXT:  store i32 %call, ptr %First, align 4
-  // CHECK-NEXT:  %call1 = call noundef float @"?getSecond@Pair@@QAAMXZ"(ptr noundef nonnull align 4 dereferenceable(8) %Vals)
-  // CHECK-NEXT:  %Second = getelementptr inbounds %struct.Pair, ptr %Vals, i32 0, i32 1
-
-// CHECK: [[Pair:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Pair"
-// CHECK: [[getFirst:![0-9]+]] = distinct !DISubprogram(name: "getFirst"
-// CHECK-SAME: scope: [[Pair]]
-// CHECK: [[FirstThis:![0-9]+]] = !DILocalVariable(name: "this", arg: 1, scope: [[getFirst]], type: [[thisType:![0-9]+]]
-// CHECK: [[thisType]] = !DIDerivedType(tag: DW_TAG_reference_type, baseType: [[Pair]], size: 32)
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - -hlsl-entry main %s -debug-info-kind=standalone -dwarf-version=4 | FileCheck %s
+
+struct Pair {
+  int First;
+  float Second;
+
+  int getFirst() {
+	  return this.First;
+  }
+
+  float getSecond() {
+    return Second;
+  }
+};
+
+[numthreads(1, 1, 1)]
+void main() {
+  Pair Vals = {1, 2.0};
+  Vals.First = Vals.getFirst();
+  Vals.Second = Vals.getSecond();
+}
+
+// This tests reference like `this` in HLSL
+  // CHECK:       %call = call noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %Vals)
+  // CHECK-NEXT:  %First = getelementptr inbounds %struct.Pair, ptr %Vals, i32 0, i32 0
+  // CHECK-NEXT:  store i32 %call, ptr %First, align 4
+  // CHECK-NEXT:  %call1 = call noundef float @"?getSecond@Pair@@QAAMXZ"(ptr noundef nonnull align 4 dereferenceable(8) %Vals)
+  // CHECK-NEXT:  %Second = getelementptr inbounds %struct.Pair, ptr %Vals, i32 0, i32 1
+
+// CHECK: [[Pair:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Pair"
+// CHECK: [[getFirst:![0-9]+]] = distinct !DISubprogram(name: "getFirst"
+// CHECK-SAME: scope: [[Pair]]
+// CHECK: [[FirstThis:![0-9]+]] = !DILocalVariable(name: "this", arg: 1, scope: [[getFirst]], type: [[thisType:![0-9]+]]
+// CHECK: [[thisType]] = !DIDerivedType(tag: DW_TAG_reference_type, baseType: [[Pair]], size: 32)
diff --git a/clang/test/CodeGenHLSL/validator_version.hlsl b/clang/test/CodeGenHLSL/validator_version.hlsl
index cb5309dd05cb..b33a78cee993 100644
--- a/clang/test/CodeGenHLSL/validator_version.hlsl
+++ b/clang/test/CodeGenHLSL/validator_version.hlsl
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -S -triple dxil-pc-shadermodel6.3-library -S -emit-llvm -xhlsl -validator-version 1.1 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -xhlsl -validator-version 1.1 -o - %s | FileCheck %s
 
 // FIXME:The following line should work once SPIR-V support for HLSL is added.
 // See: https://github.com/llvm/llvm-project/issues/57877
-// DISABLED: %clang_cc1 -S -triple spirv32 -S -emit-llvm -xhlsl -validator-version 1.1 -o - %s | FileCheck %s --check-prefix=NOT_DXIL
+// DISABLED: %clang_cc1 -triple spirv32 -emit-llvm -xhlsl -validator-version 1.1 -o - %s | FileCheck %s --check-prefix=NOT_DXIL
 
 // CHECK:!dx.valver = !{![[valver:[0-9]+]]}
 // CHECK:![[valver]] = !{i32 1, i32 1}
diff --git a/clang/test/CodeGenObjC/attr-used-on-method.m b/clang/test/CodeGenObjC/attr-used-on-method.m
index d8b2a5d29184..0e31713ae63b 100644
--- a/clang/test/CodeGenObjC/attr-used-on-method.m
+++ b/clang/test/CodeGenObjC/attr-used-on-method.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.10 %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.10 %s -emit-llvm -o - | FileCheck %s
 
 // CHECK: @llvm.used =
 // CHECK-SAME: @"\01-[X m]"
diff --git a/clang/test/CodeGenObjC/debug-info-impl.m b/clang/test/CodeGenObjC/debug-info-impl.m
index a648ea17f64f..0e08295fe4c3 100644
--- a/clang/test/CodeGenObjC/debug-info-impl.m
+++ b/clang/test/CodeGenObjC/debug-info-impl.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -debug-info-kind=limited -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -debug-info-kind=limited -emit-llvm %s -o - | FileCheck %s
 @interface NSObject {
   struct objc_object *isa;
 }
diff --git a/clang/test/CodeGenObjC/debug-info-property-class-extension.m b/clang/test/CodeGenObjC/debug-info-property-class-extension.m
index ea2551799f30..58f72f519ded 100644
--- a/clang/test/CodeGenObjC/debug-info-property-class-extension.m
+++ b/clang/test/CodeGenObjC/debug-info-property-class-extension.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s
 
 // Checks debug info for properties from class extensions for a few cases.
 
diff --git a/clang/test/CodeGenObjC/debug-info-property-class-instance-same-name.m b/clang/test/CodeGenObjC/debug-info-property-class-instance-same-name.m
index 68423fc07f8a..930544e1f359 100644
--- a/clang/test/CodeGenObjC/debug-info-property-class-instance-same-name.m
+++ b/clang/test/CodeGenObjC/debug-info-property-class-instance-same-name.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s
 
 // Both properties should be emitted as having a class and an instance property
 // with the same name is allowed.
diff --git a/clang/test/CodeGenObjC/debug-info-property3.m b/clang/test/CodeGenObjC/debug-info-property3.m
index 20880600a781..d76988d93ed1 100644
--- a/clang/test/CodeGenObjC/debug-info-property3.m
+++ b/clang/test/CodeGenObjC/debug-info-property3.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s
 
 @interface I1
 // CHECK: !DIObjCProperty(name: "p1"
diff --git a/clang/test/CodeGenObjC/dllstorage.m b/clang/test/CodeGenObjC/dllstorage.m
index f45eb7bb6aee..c94f4c9b5804 100644
--- a/clang/test/CodeGenObjC/dllstorage.m
+++ b/clang/test/CodeGenObjC/dllstorage.m
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fdeclspec -fobjc-runtime=ios -fobjc-exceptions -S -emit-llvm -o - %s | FileCheck -allow-deprecated-dag-overlap -check-prefix CHECK-IR %s
-// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fdeclspec -fobjc-runtime=gnustep-2.0 -fobjc-exceptions -S -emit-llvm -o - %s | FileCheck -allow-deprecated-dag-overlap -check-prefix CHECK-NF %s
-// RUN: %clang_cc1 -triple i686-windows-itanium -fms-extensions -fobjc-runtime=macosx -fdeclspec -fobjc-exceptions -S -emit-llvm -o - %s | FileCheck -allow-deprecated-dag-overlap -check-prefix CHECK-IR %s
-// RUN: %clang_cc1 -triple i686-windows-itanium -fms-extensions -fobjc-runtime=objfw -fdeclspec -fobjc-exceptions -S -emit-llvm -o - %s | FileCheck -allow-deprecated-dag-overlap -check-prefix CHECK-FW %s
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fdeclspec -fobjc-runtime=ios -fobjc-exceptions -emit-llvm -o - %s | FileCheck -allow-deprecated-dag-overlap -check-prefix CHECK-IR %s
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -fdeclspec -fobjc-runtime=gnustep-2.0 -fobjc-exceptions -emit-llvm -o - %s | FileCheck -allow-deprecated-dag-overlap -check-prefix CHECK-NF %s
+// RUN: %clang_cc1 -triple i686-windows-itanium -fms-extensions -fobjc-runtime=macosx -fdeclspec -fobjc-exceptions -emit-llvm -o - %s | FileCheck -allow-deprecated-dag-overlap -check-prefix CHECK-IR %s
+// RUN: %clang_cc1 -triple i686-windows-itanium -fms-extensions -fobjc-runtime=objfw -fdeclspec -fobjc-exceptions -emit-llvm -o - %s | FileCheck -allow-deprecated-dag-overlap -check-prefix CHECK-FW %s
 
 // CHECK-IR-DAG: @_objc_empty_cache = external dllimport global %struct._objc_cache
 
diff --git a/clang/test/CodeGenObjC/exceptions-personality.m b/clang/test/CodeGenObjC/exceptions-personality.m
index 9c25ee38b6d7..77ca6c2baecb 100644
--- a/clang/test/CodeGenObjC/exceptions-personality.m
+++ b/clang/test/CodeGenObjC/exceptions-personality.m
@@ -1,53 +1,53 @@
-// RUN: %clang_cc1 -triple x86_64-w64-windows-gnu  -emit-llvm -fobjc-runtime=gnustep-2.0 -fexceptions -fobjc-exceptions -o %t %s
-// RUN: FileCheck --check-prefixes=CHECK-MINGW-OBJC2 < %t %s
-
-// RUN: %clang_cc1 -triple x86_64-w64-windows-gnu  -emit-llvm -fobjc-runtime=gcc -fexceptions -fobjc-exceptions -o %t %s
-// RUN: FileCheck --check-prefixes=CHECK-MINGW-GCC < %t %s
-
-// RUN: %clang_cc1 -triple x86_64-w64-windows-msvc  -emit-llvm -fobjc-runtime=gnustep-2.0 -fexceptions -fobjc-exceptions -o %t %s
-// RUN: FileCheck --check-prefixes=CHECK-MSVC-OBJC2 < %t %s
-
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu  -emit-llvm -fobjc-runtime=gnustep-2.0 -fexceptions -fobjc-exceptions -o %t %s
-// RUN: FileCheck --check-prefixes=CHECK-LINUX-OBJC2 < %t %s
-
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu  -emit-llvm -fobjc-runtime=gcc -fexceptions -fobjc-exceptions -o %t %s
-// RUN: FileCheck --check-prefixes=CHECK-LINUX-GCC < %t %s
-@interface Foo @end
-
-void throwing(void) {
-  @try
-  {
-    // CHECK-MINGW-OBJC2: personality ptr @__gxx_personality_seh0
-    // CHECK-MINGW-OBJC2: invoke void @objc_exception_throw
-
-    // CHECK-MINGW-GCC: personality ptr @__gnu_objc_personality_v0
-    // CHECK-MINGW-GCC: invoke void @objc_exception_throw
-
-    // CHECK-MSVC-OBJC2: personality ptr @__CxxFrameHandler3
-    // CHECK-MSVC-OBJC2: invoke void @objc_exception_throw
-
-    // CHECK-LINUX-OBJC2: personality ptr @__gnustep_objc_personality_v0
-    // CHECK-LINUX-OBJC2: invoke void @objc_exception_throw
-
-    // CHECK-LINUX-GCC: personality ptr @__gnu_objc_personality_v0
-    @throw(@"error!");
-  }
-  @catch(...) 
-  {
-    // CHECK-MINGW-OBJC2: call ptr @__cxa_begin_catch
-    // CHECK-MINGW-OBJC2: invoke ptr @__cxa_rethrow
-    // CHECK-MINGW-OBJC2: invoke void @__cxa_end_catch
-    
-    // CHECK-MINGW-GCC: call void @objc_exception_throw
-
-    // CHECK-MSVC-OBJC2: call void @objc_exception_rethrow
-
-    // CHECK-LINUX-OBJC2: call ptr @objc_begin_catch
-    // CHECK-LINUX-OBJC2: invoke void @objc_exception_throw
-    // CHECK-LINUX-OBJC2: invoke void @objc_end_catch()
-
-    // CHECK-LINUX-GCC: invoke void @objc_exception_throw
-    
-    @throw;
-  }
-}
+// RUN: %clang_cc1 -triple x86_64-w64-windows-gnu  -emit-llvm -fobjc-runtime=gnustep-2.0 -fexceptions -fobjc-exceptions -o %t %s
+// RUN: FileCheck --check-prefixes=CHECK-MINGW-OBJC2 < %t %s
+
+// RUN: %clang_cc1 -triple x86_64-w64-windows-gnu  -emit-llvm -fobjc-runtime=gcc -fexceptions -fobjc-exceptions -o %t %s
+// RUN: FileCheck --check-prefixes=CHECK-MINGW-GCC < %t %s
+
+// RUN: %clang_cc1 -triple x86_64-w64-windows-msvc  -emit-llvm -fobjc-runtime=gnustep-2.0 -fexceptions -fobjc-exceptions -o %t %s
+// RUN: FileCheck --check-prefixes=CHECK-MSVC-OBJC2 < %t %s
+
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu  -emit-llvm -fobjc-runtime=gnustep-2.0 -fexceptions -fobjc-exceptions -o %t %s
+// RUN: FileCheck --check-prefixes=CHECK-LINUX-OBJC2 < %t %s
+
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu  -emit-llvm -fobjc-runtime=gcc -fexceptions -fobjc-exceptions -o %t %s
+// RUN: FileCheck --check-prefixes=CHECK-LINUX-GCC < %t %s
+@interface Foo @end
+
+void throwing(void) {
+  @try
+  {
+    // CHECK-MINGW-OBJC2: personality ptr @__gxx_personality_seh0
+    // CHECK-MINGW-OBJC2: invoke void @objc_exception_throw
+
+    // CHECK-MINGW-GCC: personality ptr @__gnu_objc_personality_v0
+    // CHECK-MINGW-GCC: invoke void @objc_exception_throw
+
+    // CHECK-MSVC-OBJC2: personality ptr @__CxxFrameHandler3
+    // CHECK-MSVC-OBJC2: invoke void @objc_exception_throw
+
+    // CHECK-LINUX-OBJC2: personality ptr @__gnustep_objc_personality_v0
+    // CHECK-LINUX-OBJC2: invoke void @objc_exception_throw
+
+    // CHECK-LINUX-GCC: personality ptr @__gnu_objc_personality_v0
+    @throw(@"error!");
+  }
+  @catch(...)
+  {
+    // CHECK-MINGW-OBJC2: call ptr @__cxa_begin_catch
+    // CHECK-MINGW-OBJC2: invoke ptr @__cxa_rethrow
+    // CHECK-MINGW-OBJC2: invoke void @__cxa_end_catch
+
+    // CHECK-MINGW-GCC: call void @objc_exception_throw
+
+    // CHECK-MSVC-OBJC2: call void @objc_exception_rethrow
+
+    // CHECK-LINUX-OBJC2: call ptr @objc_begin_catch
+    // CHECK-LINUX-OBJC2: invoke void @objc_exception_throw
+    // CHECK-LINUX-OBJC2: invoke void @objc_end_catch()
+
+    // CHECK-LINUX-GCC: invoke void @objc_exception_throw
+
+    @throw;
+  }
+}
diff --git a/clang/test/CodeGenObjC/externally-retained.m b/clang/test/CodeGenObjC/externally-retained.m
index 0c0085e169ab..fedf858bae27 100644
--- a/clang/test/CodeGenObjC/externally-retained.m
+++ b/clang/test/CodeGenObjC/externally-retained.m
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.13.0 -fobjc-arc -fblocks -Wno-objc-root-class -O0 %s -S -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.13.0 -fobjc-arc -fblocks -Wno-objc-root-class -O0 -xobjective-c++ -std=c++11 %s -S -emit-llvm -o - | FileCheck %s --check-prefix CHECKXX
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.13.0 -fobjc-arc -fblocks -Wno-objc-root-class -O0 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.13.0 -fobjc-arc -fblocks -Wno-objc-root-class -O0 -xobjective-c++ -std=c++11 %s -emit-llvm -o - | FileCheck %s --check-prefix CHECKXX
 
 #define EXT_RET __attribute__((objc_externally_retained))
 
diff --git a/clang/test/CodeGenObjC/gnu-init.m b/clang/test/CodeGenObjC/gnu-init.m
index d0aa6fdc4ac9..341c8c6d95eb 100644
--- a/clang/test/CodeGenObjC/gnu-init.m
+++ b/clang/test/CodeGenObjC/gnu-init.m
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -S -emit-llvm -fno-use-init-array -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s -check-prefix=CHECK-NEW
-// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -S -emit-llvm -fno-use-init-array -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s -check-prefix=CHECK-WIN
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -S -emit-llvm -fno-use-init-array -fobjc-runtime=gnustep-1.8 -o - %s | FileCheck %s -check-prefix=CHECK-OLD
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -S -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s -check-prefix=CHECK-INIT_ARRAY
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -emit-llvm -fno-use-init-array -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s -check-prefix=CHECK-NEW
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -emit-llvm -fno-use-init-array -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s -check-prefix=CHECK-WIN
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -emit-llvm -fno-use-init-array -fobjc-runtime=gnustep-1.8 -o - %s | FileCheck %s -check-prefix=CHECK-OLD
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s -check-prefix=CHECK-INIT_ARRAY
 
 // Almost minimal Objective-C file, check that it emits calls to the correct
 // runtime entry points.
diff --git a/clang/test/CodeGenObjC/gnu-method-only-once.m b/clang/test/CodeGenObjC/gnu-method-only-once.m
index 639209e02116..3675758770ab 100644
--- a/clang/test/CodeGenObjC/gnu-method-only-once.m
+++ b/clang/test/CodeGenObjC/gnu-method-only-once.m
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -S -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s -check-prefix=CHECK-NEW
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -S -emit-llvm -fobjc-runtime=gnustep-1.8 -o - %s | FileCheck %s -check-prefix=CHECK-OLD
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s -check-prefix=CHECK-NEW
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -emit-llvm -fobjc-runtime=gnustep-1.8 -o - %s | FileCheck %s -check-prefix=CHECK-OLD
 
 // Clang 9 or 10 changed the handling of method lists so that methods provided
 // from synthesised properties showed up in the method list, where previously
diff --git a/clang/test/CodeGenObjC/gnu-nil-receiver.m b/clang/test/CodeGenObjC/gnu-nil-receiver.m
index 7a1ee8afca6d..8d4027a93814 100644
--- a/clang/test/CodeGenObjC/gnu-nil-receiver.m
+++ b/clang/test/CodeGenObjC/gnu-nil-receiver.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -fobjc-arc -S -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -fobjc-arc -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s
 
 typedef struct {
   int x[12];
diff --git a/clang/test/CodeGenObjC/gnustep2-category-protocol.m b/clang/test/CodeGenObjC/gnustep2-category-protocol.m
index 750b6e079205..9a17c9f5997b 100644
--- a/clang/test/CodeGenObjC/gnustep2-category-protocol.m
+++ b/clang/test/CodeGenObjC/gnustep2-category-protocol.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -S -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s
 
 // Regression test.  We weren't emitting definitions for protocols used in
 // categories, causing linker errors when the category was the only reference
diff --git a/clang/test/CodeGenObjC/gnustep2-category.m b/clang/test/CodeGenObjC/gnustep2-category.m
index 6114a5979dbc..d1784e5dfc1f 100644
--- a/clang/test/CodeGenObjC/gnustep2-category.m
+++ b/clang/test/CodeGenObjC/gnustep2-category.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -S -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s
 
 
 // Check that we have a method list that refers to the correct thing method:
diff --git a/clang/test/CodeGenObjC/gnustep2-class.m b/clang/test/CodeGenObjC/gnustep2-class.m
index 28f9ad12c6b4..94eafe0f404f 100644
--- a/clang/test/CodeGenObjC/gnustep2-class.m
+++ b/clang/test/CodeGenObjC/gnustep2-class.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -S -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s
 
 @interface Super @end
 
diff --git a/clang/test/CodeGenObjC/gnustep2-direct-method.m b/clang/test/CodeGenObjC/gnustep2-direct-method.m
index 7fa2775eee39..3caed89fb3cf 100644
--- a/clang/test/CodeGenObjC/gnustep2-direct-method.m
+++ b/clang/test/CodeGenObjC/gnustep2-direct-method.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -S -emit-llvm -fobjc-runtime=gnustep-2.2 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -emit-llvm -fobjc-runtime=gnustep-2.2 -o - %s | FileCheck %s
 
 @interface X
 @end
diff --git a/clang/test/CodeGenObjC/gnustep2-ivar-offset.m b/clang/test/CodeGenObjC/gnustep2-ivar-offset.m
index 1f838cdd7563..dd133ba04e30 100644
--- a/clang/test/CodeGenObjC/gnustep2-ivar-offset.m
+++ b/clang/test/CodeGenObjC/gnustep2-ivar-offset.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -S -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s
 
 
 @class NSString;
diff --git a/clang/test/CodeGenObjC/gnustep2-nontrivial-destructor-argument.mm b/clang/test/CodeGenObjC/gnustep2-nontrivial-destructor-argument.mm
index a7de79bf7994..db27292db203 100644
--- a/clang/test/CodeGenObjC/gnustep2-nontrivial-destructor-argument.mm
+++ b/clang/test/CodeGenObjC/gnustep2-nontrivial-destructor-argument.mm
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknow-windows-msvc -S -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s 
+// RUN: %clang_cc1 -triple x86_64-unknow-windows-msvc -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s 
 
 // Regression test.  Ensure that C++ arguments with non-trivial destructors
 // don't crash the compiler.
diff --git a/clang/test/CodeGenObjC/gnustep2-proto.m b/clang/test/CodeGenObjC/gnustep2-proto.m
index d5b61a43655e..3a1fc58d6624 100644
--- a/clang/test/CodeGenObjC/gnustep2-proto.m
+++ b/clang/test/CodeGenObjC/gnustep2-proto.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -S -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s
 
 @protocol X
 @optional
diff --git a/clang/test/CodeGenObjC/ivar-type-encoding.m b/clang/test/CodeGenObjC/ivar-type-encoding.m
index 3d9cf2549a77..207c2316955b 100644
--- a/clang/test/CodeGenObjC/ivar-type-encoding.m
+++ b/clang/test/CodeGenObjC/ivar-type-encoding.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -emit-llvm -fobjc-runtime=gcc -o - %s | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -fobjc-runtime=gcc -o - %s | FileCheck %s
 
 @protocol NSCopying
 @end
diff --git a/clang/test/CodeGenObjC/null-check-bool-ret.m b/clang/test/CodeGenObjC/null-check-bool-ret.m
index 381d136450fc..85d3fc7f7ff1 100644
--- a/clang/test/CodeGenObjC/null-check-bool-ret.m
+++ b/clang/test/CodeGenObjC/null-check-bool-ret.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple arm64e-apple-ios15.0.0 -emit-llvm-bc -fobjc-arc -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple arm64e-apple-ios15.0.0-bc -fobjc-arc -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
 
 @protocol NSObject
 @end
diff --git a/clang/test/CodeGenObjC/personality.m b/clang/test/CodeGenObjC/personality.m
index ede0aa34eaf6..0c4beeda9018 100644
--- a/clang/test/CodeGenObjC/personality.m
+++ b/clang/test/CodeGenObjC/personality.m
@@ -1,40 +1,40 @@
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=ios -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=watchos -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=gnustep-1.7 -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNUSTEP-1_7
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=gnustep -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNUSTEP
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SEH
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SJLJ
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SEH
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SJLJ
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=ios -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=macosx -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=watchos -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=gnustep-1.7 -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNUSTEP-1_7
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=gnustep -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNUSTEP
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SEH
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SJLJ
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SEH
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SJLJ
 
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=ios -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=watchos -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=gnustep-1.7 -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=gnustep -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=ios -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=macosx -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=watchos -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=gnustep-1.7 -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=gnustep -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
 
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE-MINGW-DWARF
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE-MINGW-SEH
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE-MINGW-SJLJ
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=ios -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=watchos -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=gnustep-1.7 -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-GNU
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=gnustep -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-GNU
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SEH
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SJLJ
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SEH
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SJLJ
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE-MINGW-DWARF
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE-MINGW-SEH
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE-MINGW-SJLJ
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=ios -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=macosx -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=watchos -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=gnustep-1.7 -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-GNU
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=gnustep -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-GNU
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SEH
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SJLJ
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SEH
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SJLJ
 
 void g(void);
 
diff --git a/clang/test/CodeGenObjC/stret-lifetime.m b/clang/test/CodeGenObjC/stret-lifetime.m
index 55b19aa871b5..03e810ea1e09 100644
--- a/clang/test/CodeGenObjC/stret-lifetime.m
+++ b/clang/test/CodeGenObjC/stret-lifetime.m
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple arm64-apple-darwin    -S -emit-llvm -o - -O2 -disable-llvm-passes %s | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -S -emit-llvm -o - -O2 -disable-llvm-passes %s | FileCheck %s
-// RUN: %clang_cc1 -triple arm64-apple-darwin    -fobjc-arc -S -emit-llvm -o - -O2 -disable-llvm-passes %s | FileCheck %s --check-prefixes=CHECK,ARC
-// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fobjc-arc -S -emit-llvm -o - -O2 -disable-llvm-passes %s | FileCheck %s --check-prefixes=CHECK,ARC
+// RUN: %clang_cc1 -triple arm64-apple-darwin    -emit-llvm -o - -O2 -disable-llvm-passes %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -emit-llvm -o - -O2 -disable-llvm-passes %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-apple-darwin    -fobjc-arc -emit-llvm -o - -O2 -disable-llvm-passes %s | FileCheck %s --check-prefixes=CHECK,ARC
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fobjc-arc -emit-llvm -o - -O2 -disable-llvm-passes %s | FileCheck %s --check-prefixes=CHECK,ARC
 
 struct stret { int x[100]; };
 struct stret one = {{1}};
diff --git a/clang/test/CodeGenObjC/unqual-copy-restore.m b/clang/test/CodeGenObjC/unqual-copy-restore.m
index c59aa59e82a8..f853e6d74436 100644
--- a/clang/test/CodeGenObjC/unqual-copy-restore.m
+++ b/clang/test/CodeGenObjC/unqual-copy-restore.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -fobjc-arc -S -emit-llvm -o /dev/null
+// RUN: %clang_cc1 %s -fobjc-arc -emit-llvm -o /dev/null
 
 // Don't crash if the argument type and the parameter type in an indirect copy
 // restore expression have different qualification.
diff --git a/clang/test/CodeGenObjCXX/msabi-stret.mm b/clang/test/CodeGenObjCXX/msabi-stret.mm
index fbed6f7bb8cb..267f9053e461 100644
--- a/clang/test/CodeGenObjCXX/msabi-stret.mm
+++ b/clang/test/CodeGenObjCXX/msabi-stret.mm
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fobjc-runtime=ios-6.0 -Os -S -emit-llvm -o - %s -mframe-pointer=all | FileCheck %s
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fobjc-runtime=ios-6.0 -Os -emit-llvm -o - %s -mframe-pointer=all | FileCheck %s
 
 struct S {
   S() = default;
diff --git a/clang/test/CodeGenObjCXX/personality.mm b/clang/test/CodeGenObjCXX/personality.mm
index b8c7af962bd0..b1a5e4c246cc 100644
--- a/clang/test/CodeGenObjCXX/personality.mm
+++ b/clang/test/CodeGenObjCXX/personality.mm
@@ -1,65 +1,65 @@
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE-SEH
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE-SJLJ
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gnustep-1.7 -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNUSTEP-1_7
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gnustep -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNUSTEP
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SEH
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SJLJ
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SEH
-// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SJLJ
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE-SEH
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE-SJLJ
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gnustep-1.7 -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNUSTEP-1_7
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gnustep -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNUSTEP
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SEH
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SJLJ
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SEH
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SJLJ
 
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gnustep-1.7 -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gnustep -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
-// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gnustep-1.7 -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gnustep -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-MSVC
 
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE-SEH
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE-SJLJ
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gnustep-1.7 -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-GNU
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gnustep -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-GNU
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SEH
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SJLJ
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SEH
-// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SJLJ
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE-SEH
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx-fragile -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACOSX-FRAGILE-SJLJ
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=macosx -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=ios -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=watchos -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NS
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gnustep-1.7 -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-GNU
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gnustep -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-GNU
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SEH
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=gcc -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GCC-SJLJ
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=dwarf -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=seh -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SEH
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -exception-model=sjlj -fobjc-exceptions -fcxx-exceptions -fobjc-runtime=objfw -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-OBJFW-SJLJ
 
 void g(void);
 
diff --git a/clang/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl b/clang/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl
index 3a98e9099f52..619a9a99568e 100644
--- a/clang/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl
+++ b/clang/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl
@@ -1,5 +1,5 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck %s
 
 // CHECK-LABEL: @use_flat_scratch_name
 kernel void use_flat_scratch_name()
diff --git a/clang/test/CodeGenOpenCL/amdgcn-large-globals.cl b/clang/test/CodeGenOpenCL/amdgcn-large-globals.cl
index 48b9158388f5..d99b5b925360 100644
--- a/clang/test/CodeGenOpenCL/amdgcn-large-globals.cl
+++ b/clang/test/CodeGenOpenCL/amdgcn-large-globals.cl
@@ -1,5 +1,5 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck %s
 
 // CHECK: @One ={{.*}} local_unnamed_addr addrspace(1) global [6442450944 x i8] zeroinitializer, align 1
 unsigned char One[6442450944];
diff --git a/clang/test/CodeGenOpenCL/amdgcn-non-temporal-store.cl b/clang/test/CodeGenOpenCL/amdgcn-non-temporal-store.cl
index 34938607a5e9..c8fe43cf8d03 100644
--- a/clang/test/CodeGenOpenCL/amdgcn-non-temporal-store.cl
+++ b/clang/test/CodeGenOpenCL/amdgcn-non-temporal-store.cl
@@ -1,5 +1,5 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck %s
 // CHECK-LABEL: @test_non_temporal_store_kernel
 // CHECK: store i32 0, ptr addrspace(1) %{{.*}}, align 4, !tbaa !{{.*}}, !nontemporal {{.*}}
 
diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
index 90e4b65c6f1f..5cb8af6fc6df 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
@@ -1,6 +1,6 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple r600-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple r600-unknown-unknown -emit-llvm -o - %s | FileCheck %s
 
 typedef __attribute__(( ext_vector_type(2) )) char char2;
 typedef __attribute__(( ext_vector_type(3) )) char char3;
diff --git a/clang/test/CodeGenOpenCL/amdgpu-alignment.cl b/clang/test/CodeGenOpenCL/amdgpu-alignment.cl
index 0245dbed5c36..8f57713fe1f0 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-alignment.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-alignment.cl
@@ -1,6 +1,6 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown-opencl -S -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown-opencl -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
diff --git a/clang/test/CodeGenOpenCL/amdgpu-call-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-call-kernel.cl
index 8ad0beac3dde..e4678abf3350 100755
--- a/clang/test/CodeGenOpenCL/amdgpu-call-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-call-kernel.cl
@@ -1,5 +1,5 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck %s
 // CHECK: define{{.*}} amdgpu_kernel void @test_call_kernel(ptr addrspace(1) nocapture noundef writeonly align 4 %out)
 // CHECK: store i32 4, ptr addrspace(1) %out, align 4
 
diff --git a/clang/test/CodeGenOpenCL/amdgpu-calling-conv.cl b/clang/test/CodeGenOpenCL/amdgpu-calling-conv.cl
index 835990c7b1ad..e6ef27435288 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-calling-conv.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-calling-conv.cl
@@ -1,5 +1,5 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck %s
 
 // CHECK: define{{.*}} amdgpu_kernel void @calling_conv_amdgpu_kernel()
 kernel void calling_conv_amdgpu_kernel()
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 7387f9a22f0d..2fda52dcd2dc 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -3,56 +3,56 @@
 // Check that appropriate features are defined for every supported AMDGPU
 // "-target" and "-mcpu" options.
 
-// RUN: %clang_cc1 -triple amdgcn -S -emit-llvm -o - %s | FileCheck --check-prefix=NOCPU %s
-// RUN: %clang_cc1 -triple amdgcn -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck --check-prefix=NOCPU-WAVE32 %s
-// RUN: %clang_cc1 -triple amdgcn -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck --check-prefix=NOCPU-WAVE64 %s
+// RUN: %clang_cc1 -triple amdgcn -emit-llvm -o - %s | FileCheck --check-prefix=NOCPU %s
+// RUN: %clang_cc1 -triple amdgcn -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck --check-prefix=NOCPU-WAVE32 %s
+// RUN: %clang_cc1 -triple amdgcn -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck --check-prefix=NOCPU-WAVE64 %s
 
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx600 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX600 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx601 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX601 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx602 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX602 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx700 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX700 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx701 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX701 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx702 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX702 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx703 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX703 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx704 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX704 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx705 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX705 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx801 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX801 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx802 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX802 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx803 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX803 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx805 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX805 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx810 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX810 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx900 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX900 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx902 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX902 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx904 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX904 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx906 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX906 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx908 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX908 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx909 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX909 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90a -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX90A %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90c -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX90C %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX940 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx941 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX941 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx942 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX942 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1011 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1012 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1012 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1013 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1013 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1030 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1030 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1031 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1031 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1032 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1032 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1033 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1033 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1034 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1034 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1035 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1035 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1036 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1036 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1100 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1100 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1101 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1101 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1102 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1102 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1150 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1150 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1151 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1151 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1200 %s
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1201 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx600 -emit-llvm -o - %s | FileCheck --check-prefix=GFX600 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx601 -emit-llvm -o - %s | FileCheck --check-prefix=GFX601 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx602 -emit-llvm -o - %s | FileCheck --check-prefix=GFX602 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx700 -emit-llvm -o - %s | FileCheck --check-prefix=GFX700 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx701 -emit-llvm -o - %s | FileCheck --check-prefix=GFX701 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx702 -emit-llvm -o - %s | FileCheck --check-prefix=GFX702 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx703 -emit-llvm -o - %s | FileCheck --check-prefix=GFX703 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx704 -emit-llvm -o - %s | FileCheck --check-prefix=GFX704 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx705 -emit-llvm -o - %s | FileCheck --check-prefix=GFX705 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx801 -emit-llvm -o - %s | FileCheck --check-prefix=GFX801 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx802 -emit-llvm -o - %s | FileCheck --check-prefix=GFX802 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx803 -emit-llvm -o - %s | FileCheck --check-prefix=GFX803 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx805 -emit-llvm -o - %s | FileCheck --check-prefix=GFX805 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx810 -emit-llvm -o - %s | FileCheck --check-prefix=GFX810 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx900 -emit-llvm -o - %s | FileCheck --check-prefix=GFX900 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx902 -emit-llvm -o - %s | FileCheck --check-prefix=GFX902 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx904 -emit-llvm -o - %s | FileCheck --check-prefix=GFX904 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx906 -emit-llvm -o - %s | FileCheck --check-prefix=GFX906 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx908 -emit-llvm -o - %s | FileCheck --check-prefix=GFX908 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx909 -emit-llvm -o - %s | FileCheck --check-prefix=GFX909 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90a -emit-llvm -o - %s | FileCheck --check-prefix=GFX90A %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90c -emit-llvm -o - %s | FileCheck --check-prefix=GFX90C %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx940 -emit-llvm -o - %s | FileCheck --check-prefix=GFX940 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx941 -emit-llvm -o - %s | FileCheck --check-prefix=GFX941 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx942 -emit-llvm -o - %s | FileCheck --check-prefix=GFX942 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1011 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1012 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1013 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1013 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1030 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1030 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1031 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1031 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1032 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1032 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1033 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1033 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1034 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1034 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1035 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1035 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1036 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1036 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1100 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1101 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1101 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1102 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1102 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1150 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1150 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1151 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1151 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1200 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1201 %s
 
-// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103-W64 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103-W64 %s
 
 // NOCPU-NOT: "target-features"
 // NOCPU-WAVE32: "target-features"="+wavefrontsize32"
diff --git a/clang/test/CodeGenOpenCL/atomics-cas-remarks-gfx90a.cl b/clang/test/CodeGenOpenCL/atomics-cas-remarks-gfx90a.cl
index a5321ea7c158..d23005e018f3 100644
--- a/clang/test/CodeGenOpenCL/atomics-cas-remarks-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/atomics-cas-remarks-gfx90a.cl
@@ -3,7 +3,7 @@
 // RUN:     FileCheck %s --check-prefix=REMARK
 
 // RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \
-// RUN:     -Rpass=atomic-expand -S -emit-llvm -o - 2>&1 | \
+// RUN:     -Rpass=atomic-expand -emit-llvm -o - 2>&1 | \
 // RUN:     FileCheck %s --check-prefix=GFX90A-CAS
 
 // REQUIRES: amdgpu-registered-target
diff --git a/clang/test/CodeGenOpenCL/atomics-unsafe-hw-remarks-gfx90a.cl b/clang/test/CodeGenOpenCL/atomics-unsafe-hw-remarks-gfx90a.cl
index 1243745c17b5..80ad9b4df8f6 100644
--- a/clang/test/CodeGenOpenCL/atomics-unsafe-hw-remarks-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/atomics-unsafe-hw-remarks-gfx90a.cl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \
-// RUN:     -Rpass=si-lower -munsafe-fp-atomics %s -S -emit-llvm -o - 2>&1 | \
+// RUN:     -Rpass=si-lower -munsafe-fp-atomics %s -emit-llvm -o - 2>&1 | \
 // RUN:     FileCheck %s --check-prefix=GFX90A-HW
 
 // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-ci.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-ci.cl
index da989ecba941..e8c17fd39a07 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-ci.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-ci.cl
@@ -1,8 +1,8 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu hawaii -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu fiji -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx906 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu hawaii -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu fiji -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx906 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
 
 typedef unsigned int uint;
 typedef unsigned long ulong;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err-clamp.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err-clamp.cl
index d056f02d2437..e051701f1698 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err-clamp.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err-clamp.cl
@@ -1,7 +1,7 @@
 // REQUIRES: amdgpu-registered-target
 
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx906 -verify -S -emit-llvm -o - %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -verify -S -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx906 -verify -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -verify -emit-llvm -o - %s
 
 typedef unsigned int uint;
 typedef half __attribute__((ext_vector_type(2))) half2;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
index ce36a807a6c0..5db280f339e7 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
@@ -1,7 +1,7 @@
 // REQUIRES: amdgpu-registered-target
 
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -verify -S -emit-llvm -o - %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -verify -S -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -verify -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -verify -emit-llvm -o - %s
 
 typedef unsigned int uint;
 typedef half __attribute__((ext_vector_type(2))) half2;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
index 1ada16610d0b..2afe39ecc9c6 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
@@ -1,6 +1,6 @@
 // REQUIRES: amdgpu-registered-target
 
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s
 
 typedef unsigned int uint;
 typedef half __attribute__((ext_vector_type(2))) half2;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
index 087883e9f560..2eb8b6d5f106 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
@@ -1,6 +1,6 @@
 // REQUIRES: amdgpu-registered-target
 
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s
 
 typedef unsigned int uint;
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts.cl
index a7d635a6d711..d5478609851b 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts.cl
@@ -1,8 +1,8 @@
 // REQUIRES: amdgpu-registered-target
 
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx906 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1011 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx906 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1011 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck %s
 
 typedef unsigned int uint;
 typedef half __attribute__((ext_vector_type(2))) half2;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-flat-address-space.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-flat-address-space.cl
index 57ee1c1a6471..28f418ffb9e6 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-flat-address-space.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-flat-address-space.cl
@@ -1,5 +1,5 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -S -emit-llvm -disable-llvm-passes -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -disable-llvm-passes -o - %s | FileCheck -enable-var-scope %s
 
 // SI did not actually support flat addressing, but we can codegen the address
 // space test builtins. The target specfic part is a load from the implicit
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
index 4e3a56b4201b..6593a8de566f 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
@@ -1,6 +1,6 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940  -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940  -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s
 
 typedef float  v2f   __attribute__((ext_vector_type(2)));
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
index c2ded5c20238..3c40370e7f10 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -1,7 +1,7 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1011 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1011 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck %s
 
 typedef unsigned int uint;
 typedef unsigned long ulong;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11-err.cl
index 622e9dd2eed4..08f70a25276f 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11-err.cl
@@ -1,6 +1,6 @@
 // REQUIRES: amdgpu-registered-target
 
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -verify -S -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -verify -emit-llvm -o - %s
 
 void test_s_sleep_var(int d)
 {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11-param-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11-param-err.cl
index 00ecf32d9492..fc835e1a0933 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11-param-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11-param-err.cl
@@ -1,6 +1,6 @@
 // REQUIRES: amdgpu-registered-target
 
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -verify -S -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -verify -emit-llvm -o - %s
 
 typedef unsigned int uint;
 typedef uint uint2 __attribute__((ext_vector_type(2)));
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
index cddc83c013c0..d17ff81e5d43 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -1,10 +1,10 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1101 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1102 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1103 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1150 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1151 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1101 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1102 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1103 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1150 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1151 -emit-llvm -o - %s | FileCheck %s
 
 typedef unsigned int uint;
 typedef unsigned long ulong;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
index f91fea171451..0c5a39c2c852 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
@@ -1,6 +1,6 @@
 // REQUIRES: amdgpu-registered-target
 
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -verify -S -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -verify -emit-llvm -o - %s
 
 typedef unsigned int uint;
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl
index 5e0153c42825..cd6bfbe647ff 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl
@@ -1,6 +1,6 @@
 // REQUIRES: amdgpu-registered-target
 
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -verify -S -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -verify -emit-llvm -o - %s
 
 kernel void builtins_amdgcn_s_barrier_signal_err(global int* in, global int* out, int barrier) {
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
index 11747af7ea74..cddc323cb27a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
 
 typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef float  v8f   __attribute__((ext_vector_type(8)));
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl
index ef32648743ca..1c1d273eda77 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
 
 typedef float  v4f   __attribute__((ext_vector_type(4)));
 typedef half   v4h   __attribute__((ext_vector_type(4)));
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index 26c0ee483062..d9ec258e644c 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s
 
 // REQUIRES: amdgpu-registered-target
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx9.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx9.cl
index 3bc1811a35b7..87f2da20a21a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx9.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx9.cl
@@ -1,6 +1,6 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 typedef unsigned int uint;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
index 1e78ab283486..1fcb1d721ad7 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1100 -emit-llvm \
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1100 \
 // RUN:   -verify -S -o - %s
 
 // REQUIRES: amdgpu-registered-target
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
index 1acc4cd7adc9..7a36881c051b 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm \
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize32 \
 // RUN:   -verify -S -o - %s
 
 // REQUIRES: amdgpu-registered-target
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
index 96b0e4c3993a..9155ee6e6182 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm \
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize64 \
 // RUN:   -verify -S -o - %s
 
 // REQUIRES: amdgpu-registered-target
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
index 126d7d6fb7b0..ce8b2c2c7c5b 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
 
 typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef short  v8s   __attribute__((ext_vector_type(8)));
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
index 7c70ccf73ad3..b0eed07627f4 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
 
 typedef short  v4s   __attribute__((ext_vector_type(4)));
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl
index 0f59b3120288..45d2fa18efd5 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl
@@ -1,14 +1,14 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: amdgpu-registered-target
 
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx803 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx906 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90c -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx803 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx906 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90c -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s
 
 typedef unsigned int uint;
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-interp.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-interp.cl
index 39d913e9026e..ca4e890af691 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-interp.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-interp.cl
@@ -1,5 +1,5 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index 1819ff0a6177..dcdeee6b6acc 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -1,7 +1,7 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
 
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
index 3c90c9a495e0..7f73cdd61c80 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
@@ -1,5 +1,5 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 -S \
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 \
 // RUN:   -emit-llvm -cl-std=CL2.0 -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 -S \
 // RUN:   -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=ISA %s
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl
index b303c2f25ddd..56495c85bf1f 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
 
 typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef int    v4i   __attribute__((ext_vector_type(4)));
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl
index 855fa7351e15..89b26edb2f02 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
 
 typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef int    v4i   __attribute__((ext_vector_type(4)));
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index ff8618a5c727..d135d33d7dec 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -1,8 +1,8 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck %s
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
index da1ae2444315..5e587cb87e07 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
@@ -1,8 +1,8 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -D__AMDGCN_WAVEFRONT_SIZE=32 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -D__AMDGCN_WAVEFRONT_SIZE=32 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck -enable-var-scope %s
 
 typedef unsigned int uint;
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
index 5875f6fef2f2..1fc2ac0d3141 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck -enable-var-scope %s
 
 typedef unsigned long ulong;
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
index 3c6aaf5e3828..2f9a367ecab8 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
 
 typedef float  v8f   __attribute__((ext_vector_type(8)));
 typedef half   v16h  __attribute__((ext_vector_type(16)));
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
index af0d4ce37108..8dfe69bb9a74 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -target-feature +wavefrontsize64 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -target-feature +wavefrontsize64 -DWMMA_GFX1100_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
 
 typedef float  v4f   __attribute__((ext_vector_type(4)));
 typedef half   v8h   __attribute__((ext_vector_type(8)));
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index bdca97c88786..c2ef9ea947e9 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -1,5 +1,5 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope %s
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
@@ -839,6 +839,18 @@ unsigned test_wavefrontsize() {
   return __builtin_amdgcn_wavefrontsize();
 }
 
+// CHECK-LABEL test_flt_rounds(
+unsigned test_flt_rounds() {
+
+  // CHECK: call i32 @llvm.get.rounding()
+  unsigned mode = __builtin_flt_rounds();
+
+  // CHECK: call void @llvm.set.rounding(i32 %0)
+  __builtin_set_flt_rounds(mode);
+
+  return mode;
+}
+
 // CHECK-LABEL test_get_fpenv(
 unsigned long test_get_fpenv() {
   // CHECK: call i64 @llvm.get.fpenv.i64()
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 20ff12c33763..0b4038a2adc5 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
-// RUN:   %s -S -emit-llvm -o - | FileCheck %s
+// RUN:   %s -emit-llvm -o - | FileCheck %s
 
 // RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
 // RUN:   -S -o - %s | FileCheck -check-prefix=GFX12 %s
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
index f9782c16ab34..823316da20df 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx810 \
-// RUN:   %s -S -emit-llvm -o - | FileCheck %s
+// RUN:   %s -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx810 \
 // RUN:   -S -o - %s | FileCheck -check-prefix=GFX8 %s
 
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index afe80b17e511..8e816509341d 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
-// RUN:   %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
+// RUN:   %s -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
 
 // RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
 // RUN:   -S -o - %s | FileCheck -check-prefix=GFX90A %s
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
index 8413d24372a2..e415a95eadd2 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 \
-// RUN:   %s -S -emit-llvm -o - | FileCheck %s
+// RUN:   %s -emit-llvm -o - | FileCheck %s
 
 // RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 \
 // RUN:   -S -o - %s | FileCheck -check-prefix=GFX940 %s
diff --git a/clang/test/CodeGenOpenCL/builtins-generic-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-generic-amdgcn.cl
index 37bea1ff9330..643625652f9e 100644
--- a/clang/test/CodeGenOpenCL/builtins-generic-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-generic-amdgcn.cl
@@ -1,5 +1,5 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -Wno-error=int-conversion -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -Wno-error=int-conversion -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck %s
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
diff --git a/clang/test/CodeGenOpenCL/builtins-r600.cl b/clang/test/CodeGenOpenCL/builtins-r600.cl
index 7ceb8d68cae6..c6b40f079b3f 100644
--- a/clang/test/CodeGenOpenCL/builtins-r600.cl
+++ b/clang/test/CodeGenOpenCL/builtins-r600.cl
@@ -1,5 +1,5 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple r600-unknown-unknown -target-cpu cypress -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple r600-unknown-unknown -target-cpu cypress -emit-llvm -o - %s | FileCheck %s
 
 // CHECK-LABEL: @test_recipsqrt_ieee_f32
 // CHECK: call float @llvm.r600.recipsqrt.ieee.f32
diff --git a/clang/test/CodeGenOpenCL/numbered-address-space.cl b/clang/test/CodeGenOpenCL/numbered-address-space.cl
index 13f81330d4e7..bfbc1d687355 100644
--- a/clang/test/CodeGenOpenCL/numbered-address-space.cl
+++ b/clang/test/CodeGenOpenCL/numbered-address-space.cl
@@ -1,5 +1,5 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tonga -S -emit-llvm -O0 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -O0 -o - %s | FileCheck %s
 
 // Make sure using numbered address spaces doesn't trigger crashes when a
 // builtin has an address space parameter.
diff --git a/clang/test/CodeGenOpenCL/sqrt-fpmath.cl b/clang/test/CodeGenOpenCL/sqrt-fpmath.cl
index 7afde7f91bdf..d0637283a7ec 100644
--- a/clang/test/CodeGenOpenCL/sqrt-fpmath.cl
+++ b/clang/test/CodeGenOpenCL/sqrt-fpmath.cl
@@ -3,22 +3,22 @@
 // depending on -cl-fp32-correctly-rounded-divide-sqrt
 
 // Test with -fdeclare-opencl-builtins
-// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -S -emit-llvm -o %t.ll %s
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -emit-llvm -o %t.ll %s
 // RUN: FileCheck -check-prefixes=CHECK,DEFAULT %s < %t.ll
-// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -S -emit-llvm -o %t.ll %s
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -emit-llvm -o %t.ll %s
 // RUN: FileCheck -check-prefixes=CHECK,CORRECTLYROUNDED %s < %t.ll
 
-// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-unsafe-math-optimizations -S -emit-llvm -o %t.ll %s
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-unsafe-math-optimizations -emit-llvm -o %t.ll %s
 // RUN: FileCheck -check-prefixes=CHECK,DEFAULT-UNSAFE %s < %t.ll
-// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -cl-unsafe-math-optimizations -S -emit-llvm -o %t.ll %s
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -cl-unsafe-math-optimizations -emit-llvm -o %t.ll %s
 // RUN: FileCheck -check-prefixes=CHECK,CORRECTLYROUNDED-UNSAFE %s < %t.ll
 
 // Test without -fdeclare-opencl-builtins
-// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -finclude-default-header -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT %s
-// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,CORRECTLYROUNDED %s
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -finclude-default-header -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT %s
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,CORRECTLYROUNDED %s
 
-// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -finclude-default-header -cl-unsafe-math-optimizations -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT-UNSAFE %s
-// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -cl-unsafe-math-optimizations -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,CORRECTLYROUNDED-UNSAFE %s
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -finclude-default-header -cl-unsafe-math-optimizations -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT-UNSAFE %s
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -cl-unsafe-math-optimizations -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,CORRECTLYROUNDED-UNSAFE %s
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
diff --git a/clang/test/CodeGenOpenCL/visibility.cl b/clang/test/CodeGenOpenCL/visibility.cl
index b1e09fbc5158..addfe33377f9 100644
--- a/clang/test/CodeGenOpenCL/visibility.cl
+++ b/clang/test/CodeGenOpenCL/visibility.cl
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -std=cl2.0 -fapply-global-visibility-to-externs -fvisibility=default -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck --check-prefix=FVIS-DEFAULT %s
-// RUN: %clang_cc1 -std=cl2.0 -fapply-global-visibility-to-externs -fvisibility=protected -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck --check-prefix=FVIS-PROTECTED %s
-// RUN: %clang_cc1 -std=cl2.0 -fapply-global-visibility-to-externs -fvisibility=hidden -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck --check-prefix=FVIS-HIDDEN %s
+// RUN: %clang_cc1 -std=cl2.0 -fapply-global-visibility-to-externs -fvisibility=default -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck --check-prefix=FVIS-DEFAULT %s
+// RUN: %clang_cc1 -std=cl2.0 -fapply-global-visibility-to-externs -fvisibility=protected -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck --check-prefix=FVIS-PROTECTED %s
+// RUN: %clang_cc1 -std=cl2.0 -fapply-global-visibility-to-externs -fvisibility=hidden -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck --check-prefix=FVIS-HIDDEN %s
 
 // REQUIRES: amdgpu-registered-target
 
diff --git a/clang/test/CodeGenOpenCLCXX/addrspace-with-class.clcpp b/clang/test/CodeGenOpenCLCXX/addrspace-with-class.clcpp
index 18d97a989a43..a0ed03b25535 100644
--- a/clang/test/CodeGenOpenCLCXX/addrspace-with-class.clcpp
+++ b/clang/test/CodeGenOpenCLCXX/addrspace-with-class.clcpp
@@ -5,7 +5,7 @@
 // for constructors, member functions and destructors.
 // See also atexit.cl and global_init.cl for other specific tests.
 
-// CHECK: %struct.MyType = type { i32 }
+// CHECK: %struct.MyType = type { i32, [5 x i32] }
 struct MyType {
   MyType(int i) : i(i) {}
   MyType(int i) __constant : i(i) {}
@@ -14,6 +14,7 @@ struct MyType {
   int bar() { return i + 2; }
   int bar() __constant { return i + 1; }
   int i;
+  int a[5] = {42, 43, 44, 45, 46};
 };
 
 // CHECK: @const1 ={{.*}} addrspace(2) global %struct.MyType zeroinitializer
@@ -23,6 +24,8 @@ __constant MyType const2(2);
 // CHECK: @glob ={{.*}} addrspace(1) global %struct.MyType zeroinitializer
 MyType glob(1);
 
+// CHECK: @constinit ={{.*}} addrspace(2) constant [5 x i32] [i32 42, i32 43, i32 44, i32 45, i32 46]
+
 // CHECK: call spir_func void @_ZNU3AS26MyTypeC1Ei(ptr addrspace(2) {{[^,]*}} @const1, i32 noundef 1)
 // CHECK: call spir_func void @_ZNU3AS26MyTypeC1Ei(ptr addrspace(2) {{[^,]*}} @const2, i32 noundef 2)
 // CHECK: call spir_func void @_ZNU3AS46MyTypeC1Ei(ptr addrspace(4) {{[^,]*}} addrspacecast (ptr addrspace(1) @glob to ptr addrspace(4)), i32 noundef 1)
diff --git a/clang/test/CodeGenSYCL/function-attrs.cpp b/clang/test/CodeGenSYCL/function-attrs.cpp
index 1606f961f2d3..83a77a617240 100644
--- a/clang/test/CodeGenSYCL/function-attrs.cpp
+++ b/clang/test/CodeGenSYCL/function-attrs.cpp
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --version 3
-// RUN: %clang_cc1 -fsycl-is-device -emit-llvm -disable-llvm-passes \
+// RUN: %clang_cc1 -fsycl-is-device -disable-llvm-passes \
 // RUN:  -triple spir64 -fexceptions -emit-llvm -fno-ident %s -o - | FileCheck %s
 
 int foo();
diff --git a/clang/test/CodeGenSYCL/functionptr-addrspace.cpp b/clang/test/CodeGenSYCL/functionptr-addrspace.cpp
index 340caa1e62c5..a477b4c7d03a 100644
--- a/clang/test/CodeGenSYCL/functionptr-addrspace.cpp
+++ b/clang/test/CodeGenSYCL/functionptr-addrspace.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -emit-llvm -triple spir64 -verify -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fsycl-is-device -triple spir64 -verify -emit-llvm %s -o - | FileCheck %s
 
 // expected-no-diagnostics
 
diff --git a/clang/test/CoverageMapping/pr32679.cpp b/clang/test/CoverageMapping/pr32679.cpp
index 8e000765710c..639a8304fe84 100644
--- a/clang/test/CoverageMapping/pr32679.cpp
+++ b/clang/test/CoverageMapping/pr32679.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -cc1 -triple i686-pc-windows-msvc19.0.0 -emit-obj -fprofile-instrument=clang -std=c++14 -fdelayed-template-parsing -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name pr32679.cpp -o - %s | FileCheck %s -check-prefix=MSABI -implicit-check-not=f2
-// RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -cc1 -triple %itanium_abi_triple -emit-obj -fprofile-instrument=clang -std=c++14 -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name pr32679.cpp -o - %s | FileCheck %s -check-prefix=ITANIUM -implicit-check-not=f2
+// RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -cc1 -triple i686-pc-windows-msvc19.0.0 -fprofile-instrument=clang -std=c++14 -fdelayed-template-parsing -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name pr32679.cpp -o - %s | FileCheck %s -check-prefix=MSABI -implicit-check-not=f2
+// RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -cc1 -triple %itanium_abi_triple -fprofile-instrument=clang -std=c++14 -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name pr32679.cpp -o - %s | FileCheck %s -check-prefix=ITANIUM -implicit-check-not=f2
 
 template <typename T, int S1>
 struct CreateSpecialization;
diff --git a/clang/test/Driver/aarch64-fmv.c b/clang/test/Driver/aarch64-fmv.c
new file mode 100644
index 000000000000..e7d01d1d5906
--- /dev/null
+++ b/clang/test/Driver/aarch64-fmv.c
@@ -0,0 +1,27 @@
+// Test which driver flags enable/disable Function Multiversioning on aarch64.
+
+// FMV is enabled for non-android aarch64 targets:
+// RUN: %clang --target=aarch64 --rtlib=compiler-rt -### -c %s 2>&1 | FileCheck -check-prefix=FMV-ENABLED %s
+// RUN: %clang --target=aarch64-linux-gnu --rtlib=compiler-rt -### -c %s 2>&1 | FileCheck -check-prefix=FMV-ENABLED %s
+// RUN: %clang --target=arm64-apple-ios --rtlib=compiler-rt -### -c %s 2>&1 | FileCheck -check-prefix=FMV-ENABLED %s
+// RUN: %clang --target=arm64-apple-macosx --rtlib=compiler-rt -### -c %s 2>&1 | FileCheck -check-prefix=FMV-ENABLED %s
+
+// android23 defaults to --rtlib=compiler-rt:
+// RUN: %clang --target=aarch64-linux-android23 -### -c %s 2>&1 | FileCheck -check-prefix=FMV-ENABLED %s
+// RUN: %clang --target=aarch64-linux-android23 --rtlib=compiler-rt  -### -c %s 2>&1 | FileCheck -check-prefix=FMV-ENABLED %s
+
+// FMV is disabled without compiler-rt:
+// RUN: %clang --rtlib=libgcc --target=aarch64 -### -c %s 2>&1 | FileCheck -check-prefix=FMV-DISABLED %s
+// RUN: %clang --rtlib=libgcc --target=aarch64-linux-gnu -### -c %s 2>&1 | FileCheck -check-prefix=FMV-DISABLED %s
+
+// Disabled for older android versions:
+// RUN: %clang --rtlib=compiler-rt --target=aarch64-linux-android               -### -c %s 2>&1 | FileCheck -check-prefix=FMV-DISABLED %s
+// RUN: %clang --rtlib=compiler-rt --target=aarch64-linux-android22             -### -c %s 2>&1 | FileCheck -check-prefix=FMV-DISABLED %s
+// RUN: %clang --rtlib=compiler-rt --target=aarch64-linux-android22  -mno-fmv   -### -c %s 2>&1 | FileCheck -check-prefix=FMV-DISABLED %s
+
+// Disabled explicitly:
+// RUN: %clang --rtlib=compiler-rt --target=aarch64 -mno-fmv -### -c %s 2>&1 | FileCheck -check-prefix=FMV-DISABLED %s
+// RUN: %clang --rtlib=compiler-rt --target=aarch64-linux-android23  -mno-fmv   -### -c %s 2>&1 | FileCheck -check-prefix=FMV-DISABLED %s
+
+// FMV-ENABLED-NOT: "-target-feature" "-fmv"
+// FMV-DISABLED: "-target-feature" "-fmv"
diff --git a/clang/test/Driver/aarch64-implied-sme-features.c b/clang/test/Driver/aarch64-implied-sme-features.c
index 67836f42f2c0..eca62e2563b7 100644
--- a/clang/test/Driver/aarch64-implied-sme-features.c
+++ b/clang/test/Driver/aarch64-implied-sme-features.c
@@ -14,7 +14,7 @@
 // SME-CONFLICT: "-target-feature" "-bf16"{{.*}} "-target-feature" "-sme"
 
 // RUN: %clang -target aarch64-linux-gnu -march=armv8-a+sme-i16i64 %s -### 2>&1 | FileCheck %s --check-prefix=SME-I16I64
-// SME-I16I64: "-target-feature" "+bf16"{{.*}} "-target-feature" "+sme-i16i64" "-target-feature" "+sme"
+// SME-I16I64: "-target-feature" "+bf16"{{.*}} "-target-feature" "+sme" "-target-feature" "+sme-i16i64"
 
 // RUN: %clang -target aarch64-linux-gnu -march=armv8-a+nosme-i16i64 %s -### 2>&1 | FileCheck %s --check-prefix=NOSME-I16I64
 // NOSME-I16I64-NOT: "-target-feature" "+sme-i16i64"
@@ -23,7 +23,7 @@
 // NOSME-I16I64-NOT: sme-i16i64"
 
 // RUN: %clang -target aarch64-linux-gnu -march=armv8-a+sme-i16i64+nosme-i16i64 %s -### 2>&1 | FileCheck %s --check-prefix=SME-I16I64-REVERT
-// SME-I16I64-REVERT: "-target-feature" "+bf16"{{.*}} "-target-feature" "-sme-i16i64" "-target-feature" "+sme"
+// SME-I16I64-REVERT: "-target-feature" "+bf16"{{.*}} "-target-feature" "+sme" "-target-feature" "-sme-i16i64"
 
 // RUN: %clang -target aarch64-linux-gnu -march=armv8-a+nosme-f64f64 %s -### 2>&1 | FileCheck %s --check-prefix=NOSME-F64F64
 // NOSME-F64F64-NOT: "-target-feature" "+sme-f64f64"
@@ -32,15 +32,15 @@
 // NOSME-F64F64-NOT: sme-f64f64"
 
 // RUN: %clang -target aarch64-linux-gnu -march=armv8-a+sme-f64f64+nosme-f64f64 %s -### 2>&1 | FileCheck %s --check-prefix=SME-F64F64-REVERT
-// SME-F64F64-REVERT: "-target-feature" "+bf16"{{.*}} "-target-feature" "-sme-f64f64" "-target-feature" "+sme"
+// SME-F64F64-REVERT: "-target-feature" "+bf16"{{.*}} "-target-feature" "+sme" "-target-feature" "-sme-f64f64"
 
 // RUN: %clang -target aarch64-linux-gnu -march=armv8-a+sme-f64f64+nosme-i16i64 %s -### 2>&1 | FileCheck %s --check-prefix=SME-SUBFEATURE-MIX
 // SME-SUBFEATURE-MIX-NOT: "+sme-i16i64"
-// SME-SUBFEATURE-MIX: "-target-feature" "+bf16"{{.*}} "-target-feature" "+sme-f64f64" "-target-feature" "+sme"
+// SME-SUBFEATURE-MIX: "-target-feature" "+bf16"{{.*}} "-target-feature" "+sme" "-target-feature" "+sme-f64f64"
 // SME-SUBFEATURE-MIX-NOT: "+sme-i16i64"
 
 // RUN: %clang -target aarch64-linux-gnu -march=armv8-a+sme-i16i64+nosme %s -### 2>&1 | FileCheck %s --check-prefix=SME-SUBFEATURE-CONFLICT1
-// SME-SUBFEATURE-CONFLICT1: "-target-feature" "+bf16"{{.*}} "-target-feature" "-sme-i16i64" "-target-feature" "-sme"
+// SME-SUBFEATURE-CONFLICT1: "-target-feature" "+bf16"{{.*}} "-target-feature" "-sme" "-target-feature" "-sme-i16i64"
 
 // RUN: %clang -target aarch64-linux-gnu -march=armv8-a+sme-f64f64+nobf16 %s -### 2>&1 | FileCheck %s --check-prefix=SME-SUBFEATURE-CONFLICT2
 // SME-SUBFEATURE-CONFLICT2-NOT: "-target-feature" "+bf16"
@@ -48,4 +48,4 @@
 // SME-SUBFEATURE-CONFLICT2-NOT: "-target-feature" "+sme-f64f64"
 
 // RUN: %clang -target aarch64-linux-gnu -march=armv8-a+nosme+sme-i16i64 %s -### 2>&1 | FileCheck %s --check-prefix=SME-SUBFEATURE-CONFLICT-REV
-// SME-SUBFEATURE-CONFLICT-REV: "-target-feature" "+bf16"{{.*}} "-target-feature" "+sme-i16i64" "-target-feature" "+sme"
+// SME-SUBFEATURE-CONFLICT-REV: "-target-feature" "+bf16"{{.*}} "-target-feature" "+sme" "-target-feature" "+sme-i16i64"
diff --git a/clang/test/Driver/aarch64-implied-sve-features.c b/clang/test/Driver/aarch64-implied-sve-features.c
index 9227cd4981c2..f04e1a785673 100644
--- a/clang/test/Driver/aarch64-implied-sve-features.c
+++ b/clang/test/Driver/aarch64-implied-sve-features.c
@@ -24,7 +24,7 @@
 // SVE-SVE2: "-target-feature" "+sve" "-target-feature" "+sve2"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-BITPERM
-// SVE2-BITPERM: "-target-feature" "+sve" "-target-feature" "+sve2-bitperm" "-target-feature" "+sve2"
+// SVE2-BITPERM: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-bitperm"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+nosve2-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=NOSVE2-BITPERM
 // NOSVE2-BITPERM-NOT: "-target-feature" "+sve2-bitperm"
@@ -33,32 +33,32 @@
 // NOSVE2-BITPERM-NOT: sve2-bitperm"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-bitperm+nosve2-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-BITPERM-REVERT
-// SVE2-BITPERM-REVERT: "-target-feature" "+sve" "-target-feature" "-sve2-bitperm" "-target-feature" "+sve2"
+// SVE2-BITPERM-REVERT: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "-sve2-bitperm"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-aes+nosve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-AES-REVERT
-// SVE2-AES-REVERT: "-target-feature" "+sve" "-target-feature" "-sve2-aes" "-target-feature" "+sve2"
+// SVE2-AES-REVERT: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "-sve2-aes"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-sha3+nosve2-sha3 %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SHA3-REVERT
-// SVE2-SHA3-REVERT: "-target-feature" "+sve" "-target-feature" "-sve2-sha3" "-target-feature" "+sve2"
+// SVE2-SHA3-REVERT: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "-sve2-sha3"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-sm4+nosve2-sm4 %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SM4-REVERT
-// SVE2-SM4-REVERT: "-target-feature" "+sve" "-target-feature" "-sve2-sm4" "-target-feature" "+sve2"
+// SVE2-SM4-REVERT: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "-sve2-sm4"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-sha3 %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SHA3
-// SVE2-SHA3: "-target-feature" "+sve" "-target-feature" "+sve2-sha3" "-target-feature" "+sve2"
+// SVE2-SHA3: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-sha3"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-AES
-// SVE2-AES: "-target-feature" "+sve" "-target-feature" "+sve2-aes" "-target-feature" "+sve2"
+// SVE2-AES: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-aes"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-sm4 %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SM4
-// SVE2-SM4: "-target-feature" "+sve" "-target-feature" "+sve2-sm4" "-target-feature" "+sve2"
+// SVE2-SM4: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-sm4"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-bitperm+nosve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SUBFEATURE-MIX
-// SVE2-SUBFEATURE-MIX: "-target-feature" "+sve" "-target-feature" "+sve2-bitperm" "-target-feature" "+sve2"
+// SVE2-SUBFEATURE-MIX: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-bitperm"
 // SVE2-SUBFEATURE-NOT: sve2-aes
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-sm4+nosve2 %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SUBFEATURE-CONFLICT
-// SVE2-SUBFEATURE-CONFLICT: "-target-feature" "+sve" "-target-feature" "-sve2-sm4" "-target-feature" "-sve2"
+// SVE2-SUBFEATURE-CONFLICT: "-target-feature" "+sve" "-target-feature" "-sve2" "-target-feature" "-sve2-sm4"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-aes+nosve %s -### 2>&1 | FileCheck %s --check-prefix=SVE-SUBFEATURE-CONFLICT
 // SVE-SUBFEATURE-CONFLICT-NOT: "-target-feature" "+sve2-aes"
@@ -66,7 +66,7 @@
 // SVE-SUBFEATURE-CONFLICT-NOT: "-target-feature" "+sve"
 
 // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+nosve+sve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE-SUBFEATURE-CONFLICT-REV
-// SVE-SUBFEATURE-CONFLICT-REV: "-target-feature" "+sve" "-target-feature" "+sve2-aes" "-target-feature" "+sve2"
+// SVE-SUBFEATURE-CONFLICT-REV: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-aes"
 
 // RUN: %clang --target=aarch64-linux-gnu -mcpu=neoverse-n2+nosve2 %s -### 2>&1 | FileCheck %s --check-prefix=SVE-MCPU-FEATURES
 // SVE-MCPU-FEATURES-NOT: "-target-feature" "+sve2-bitperm"
diff --git a/clang/test/Driver/aarch64-ptrauth.c b/clang/test/Driver/aarch64-ptrauth.c
index 1a69b2c6edfb..fa0125f4b22a 100644
--- a/clang/test/Driver/aarch64-ptrauth.c
+++ b/clang/test/Driver/aarch64-ptrauth.c
@@ -1,5 +1,25 @@
-// RUN: %clang -### -c --target=aarch64 -fno-ptrauth-intrinsics -fptrauth-intrinsics %s 2>&1 | FileCheck %s --check-prefix=INTRIN
-// INTRIN: "-cc1"{{.*}} "-fptrauth-intrinsics"
+// RUN: %clang -### -c --target=aarch64 %s 2>&1 | FileCheck %s --check-prefix NONE
+// NONE: "-cc1"
+// NONE-NOT: "-fptrauth-
 
-// RUN: not %clang -### -c --target=x86_64 -fptrauth-intrinsics %s 2>&1 | FileCheck %s --check-prefix=ERR
-// ERR: error: unsupported option '-fptrauth-intrinsics' for target '{{.*}}'
+// RUN: %clang -### -c --target=aarch64 \
+// RUN:   -fno-ptrauth-intrinsics -fptrauth-intrinsics \
+// RUN:   -fno-ptrauth-calls -fptrauth-calls \
+// RUN:   -fno-ptrauth-returns -fptrauth-returns \
+// RUN:   -fno-ptrauth-auth-traps -fptrauth-auth-traps \
+// RUN:   -fno-ptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -fno-ptrauth-vtable-pointer-type-discrimination -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fno-ptrauth-init-fini -fptrauth-init-fini \
+// RUN:   %s 2>&1 | FileCheck %s --check-prefix=ALL
+// ALL: "-cc1"{{.*}} "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-init-fini"
+
+// RUN: not %clang -### -c --target=x86_64 -fptrauth-intrinsics -fptrauth-calls -fptrauth-returns -fptrauth-auth-traps \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=ERR
+// ERR:      error: unsupported option '-fptrauth-intrinsics' for target '{{.*}}'
+// ERR-NEXT: error: unsupported option '-fptrauth-calls' for target '{{.*}}'
+// ERR-NEXT: error: unsupported option '-fptrauth-returns' for target '{{.*}}'
+// ERR-NEXT: error: unsupported option '-fptrauth-auth-traps' for target '{{.*}}'
+// ERR-NEXT: error: unsupported option '-fptrauth-vtable-pointer-address-discrimination' for target '{{.*}}'
+// ERR-NEXT: error: unsupported option '-fptrauth-vtable-pointer-type-discrimination' for target '{{.*}}'
+// ERR-NEXT: error: unsupported option '-fptrauth-init-fini' for target '{{.*}}'
diff --git a/clang/test/Driver/arm-cortex-cpus-1.c b/clang/test/Driver/arm-cortex-cpus-1.c
index 25abbe1e3a8a..6f0b64910f9b 100644
--- a/clang/test/Driver/arm-cortex-cpus-1.c
+++ b/clang/test/Driver/arm-cortex-cpus-1.c
@@ -153,23 +153,23 @@
 // RUN: %clang -target armv8r-linux-gnueabi -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V8R %s
 // RUN: %clang -target arm -march=armv8r -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V8R %s
 // RUN: %clang -target arm -march=armv8-r -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V8R %s
-// CHECK-V8R: "-cc1"{{.*}} "-triple" "armv8r-{{.*}} "-target-cpu" "cortex-r52"
+// CHECK-V8R: "-cc1"{{.*}} "-triple" "armv8r-{{.*}} "-target-cpu" "generic"
 
 // RUN: %clang -target armv8r-linux-gnueabi -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V8R-BIG %s
 // RUN: %clang -target arm -march=armv8r -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V8R-BIG %s
 // RUN: %clang -target arm -march=armv8-r -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V8R-BIG %s
-// CHECK-V8R-BIG: "-cc1"{{.*}} "-triple" "armebv8r-{{.*}} "-target-cpu" "cortex-r52"
+// CHECK-V8R-BIG: "-cc1"{{.*}} "-triple" "armebv8r-{{.*}} "-target-cpu" "generic"
 
 // RUN: %clang -target armv8r-linux-gnueabi -mthumb -### -c %s 2>&1 | \
 // RUN:     FileCheck -check-prefix=CHECK-V8R-THUMB %s
 // RUN: %clang -target arm -march=armv8r -mthumb -### -c %s 2>&1 | \
 // RUN:     FileCheck -check-prefix=CHECK-V8R-THUMB %s
-// CHECK-V8R-THUMB: "-cc1"{{.*}} "-triple" "thumbv8r-{{.*}} "-target-cpu" "cortex-r52"
+// CHECK-V8R-THUMB: "-cc1"{{.*}} "-triple" "thumbv8r-{{.*}} "-target-cpu" "generic"
 // RUN: %clang -target armv8r-linux-gnueabi -mthumb -mbig-endian -### -c %s 2>&1 | \
 // RUN:     FileCheck -check-prefix=CHECK-V8R-THUMB-BIG %s
 // RUN: %clang -target arm -march=armv8r -mthumb -mbig-endian -### -c %s 2>&1 | \
 // RUN:     FileCheck -check-prefix=CHECK-V8R-THUMB-BIG %s
-// CHECK-V8R-THUMB-BIG: "-cc1"{{.*}} "-triple" "thumbebv8r-{{.*}} "-target-cpu" "cortex-r52"
+// CHECK-V8R-THUMB-BIG: "-cc1"{{.*}} "-triple" "thumbebv8r-{{.*}} "-target-cpu" "generic"
 
 // RUN: %clang -mcpu=generic -target armv8 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V8A-GENERIC %s
 // RUN: %clang -mcpu=generic -target arm -march=armv8 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V8A-GENERIC %s
diff --git a/clang/test/Driver/arm-features.c b/clang/test/Driver/arm-features.c
index e043244f18a6..eb424f5f6111 100644
--- a/clang/test/Driver/arm-features.c
+++ b/clang/test/Driver/arm-features.c
@@ -74,7 +74,7 @@
 // Check +crypto for M and R profiles:
 //
 // RUN: %clang -target arm-arm-none-eabi -march=armv8-r+crypto   -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CRYPTO-R %s
-// CHECK-CRYPTO-R: "-cc1"{{.*}} "-target-cpu" "cortex-r52"{{.*}} "-target-feature" "+sha2" "-target-feature" "+aes"
+// CHECK-CRYPTO-R: "-cc1"{{.*}} "-target-cpu" "generic"{{.*}} "-target-feature" "+sha2" "-target-feature" "+aes"
 // RUN: %clang -target arm-arm-none-eabi -march=armv8-m.base+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NOCRYPTO5 %s
 // RUN: %clang -target arm-arm-none-eabi -march=armv8-m.main+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NOCRYPTO5 %s
 // RUN: %clang -target arm-arm-none-eabi -mcpu=cortex-m23+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NOCRYPTO5 %s
diff --git a/clang/test/Driver/clang-offload-bundler-asserts-on.c b/clang/test/Driver/clang-offload-bundler-asserts-on.c
index eb11d5fbbee4..55060c2c42e7 100644
--- a/clang/test/Driver/clang-offload-bundler-asserts-on.c
+++ b/clang/test/Driver/clang-offload-bundler-asserts-on.c
@@ -1,6 +1,6 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: asserts
-// UNSUPPORTED: target={{.*}}-macosx{{.*}}, target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}
+// UNSUPPORTED: target={{.*}}-macosx{{.*}}, target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}, target={{.*}}-zos{{.*}}
 
 // Generate the file we can bundle.
 // RUN: %clang -O0 -target %itanium_abi_triple %s -c -o %t.o
diff --git a/clang/test/Driver/clang-offload-bundler-standardize.c b/clang/test/Driver/clang-offload-bundler-standardize.c
index 91dc8947aabb..52f5ea038e47 100644
--- a/clang/test/Driver/clang-offload-bundler-standardize.c
+++ b/clang/test/Driver/clang-offload-bundler-standardize.c
@@ -1,6 +1,6 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: asserts
-// UNSUPPORTED: target={{.*}}-macosx{{.*}}, target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}
+// UNSUPPORTED: target={{.*}}-macosx{{.*}}, target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}, target={{.*}}-zos{{.*}}
 // REQUIRES: asserts
 
 // Generate the file we can bundle.
diff --git a/clang/test/Driver/clang-offload-bundler-zlib.c b/clang/test/Driver/clang-offload-bundler-zlib.c
index 15b60341a8db..fff7a0f54568 100644
--- a/clang/test/Driver/clang-offload-bundler-zlib.c
+++ b/clang/test/Driver/clang-offload-bundler-zlib.c
@@ -1,6 +1,6 @@
 // REQUIRES: zlib && !zstd
 // REQUIRES: x86-registered-target
-// UNSUPPORTED: target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}
+// UNSUPPORTED: target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}, target={{.*}}-zos{{.*}}
 
 //
 // Generate the host binary to be bundled.
diff --git a/clang/test/Driver/clang-offload-bundler-zstd.c b/clang/test/Driver/clang-offload-bundler-zstd.c
index a424981c6971..d01d9659a68d 100644
--- a/clang/test/Driver/clang-offload-bundler-zstd.c
+++ b/clang/test/Driver/clang-offload-bundler-zstd.c
@@ -1,6 +1,6 @@
 // REQUIRES: zstd
 // REQUIRES: x86-registered-target
-// UNSUPPORTED: target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}
+// UNSUPPORTED: target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}, target={{.*}}-zos{{.*}}
 
 //
 // Generate the host binary to be bundled.
diff --git a/clang/test/Driver/clang-offload-bundler.c b/clang/test/Driver/clang-offload-bundler.c
index a56a5424abf8..1909ff2d71d0 100644
--- a/clang/test/Driver/clang-offload-bundler.c
+++ b/clang/test/Driver/clang-offload-bundler.c
@@ -1,5 +1,5 @@
 // REQUIRES: x86-registered-target
-// UNSUPPORTED: target={{.*}}-macosx{{.*}}, target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}
+// UNSUPPORTED: target={{.*}}-macosx{{.*}}, target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}, target={{.*}}-zos{{.*}}
 
 //
 // Generate all the types of files we can bundle.
@@ -506,6 +506,17 @@
 // NOGFX906: error: Can't find bundles for hip-amdgcn-amd-amdhsa--gfx906
 
 //
+// Check hip and hipv4 are compatible as offload kind.
+//
+// RUN: clang-offload-bundler -type=o -targets=hip-amdgcn-amd-amdhsa--gfx90a -input=%t.tgt1 -output=%t.bundle3.o
+// RUN: clang-offload-bundler -type=o -targets=hipv4-amdgcn-amd-amdhsa--gfx90a:sramecc-:xnack+ -output=%t.res.tgt1 -input=%t.bundle3.o -unbundle
+// RUN: diff %t.tgt1 %t.res.tgt1
+
+// RUN: clang-offload-bundler -type=o -targets=hipv4-amdgcn-amd-amdhsa--gfx90a -input=%t.tgt1 -output=%t.bundle3.o
+// RUN: clang-offload-bundler -type=o -targets=hip-amdgcn-amd-amdhsa--gfx90a:sramecc-:xnack+ -output=%t.res.tgt1 -input=%t.bundle3.o -unbundle
+// RUN: diff %t.tgt1 %t.res.tgt1
+
+//
 // Check archive unbundling
 //
 // Create few code object bundles and archive them to create an input archive
diff --git a/clang/test/Driver/dxc_dxv_path.hlsl b/clang/test/Driver/dxc_dxv_path.hlsl
index 3d8e90d0d919..4845de11d5b0 100644
--- a/clang/test/Driver/dxc_dxv_path.hlsl
+++ b/clang/test/Driver/dxc_dxv_path.hlsl
@@ -7,12 +7,12 @@
 // DXV_PATH:dxv{{(.exe)?}}" "-" "-o" "-"
 
 // RUN: %clang_dxc -I test -Vd -Tlib_6_3  -### %s 2>&1 | FileCheck %s --check-prefix=VD
-// VD:"-cc1"{{.*}}"-triple" "dxil-unknown-shadermodel6.3-library"
+// VD:"-cc1"{{.*}}"-triple" "dxilv1.3-unknown-shadermodel6.3-library"
 // VD-NOT:dxv not found
 
 // RUN: %clang_dxc -Tlib_6_3 -ccc-print-bindings --dxv-path=%T -Fo %t.dxo  %s 2>&1 | FileCheck %s --check-prefix=BINDINGS
-// BINDINGS: "dxil-unknown-shadermodel6.3-library" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[DXC:.+]].dxo"
-// BINDINGS-NEXT: "dxil-unknown-shadermodel6.3-library" - "hlsl::Validator", inputs: ["[[DXC]].dxo"]
+// BINDINGS: "dxilv1.3-unknown-shadermodel6.3-library" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[DXC:.+]].dxo"
+// BINDINGS-NEXT: "dxilv1.3-unknown-shadermodel6.3-library" - "hlsl::Validator", inputs: ["[[DXC]].dxo"]
 
 // RUN: %clang_dxc -Tlib_6_3 -ccc-print-phases --dxv-path=%T -Fo %t.dxc  %s 2>&1 | FileCheck %s --check-prefix=PHASES
 
diff --git a/clang/test/Driver/dxc_fcgl.hlsl b/clang/test/Driver/dxc_fcgl.hlsl
index 567bad1bc13b..cfbf2503ddaa 100644
--- a/clang/test/Driver/dxc_fcgl.hlsl
+++ b/clang/test/Driver/dxc_fcgl.hlsl
@@ -1,6 +1,6 @@
 // RUN: not %clang_dxc -fcgl -T lib_6_7 foo.hlsl -### %s 2>&1 | FileCheck %s
 
-// Make sure fcgl option flag which translated into "-S" "-emit-llvm" "-disable-llvm-passes".
+// Make sure fcgl option flag which translated into "-emit-llvm" "-disable-llvm-passes".
 // CHECK:"-S"
 // CHECK-SAME:"-emit-llvm" "-disable-llvm-passes"
 
diff --git a/clang/test/Driver/fp-contract.c b/clang/test/Driver/fp-contract.c
index 660f67fad3cc..e2691dc211cc 100644
--- a/clang/test/Driver/fp-contract.c
+++ b/clang/test/Driver/fp-contract.c
@@ -10,112 +10,112 @@
 // RUN: %clang -### -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -ffast-math -ffp-contract=on -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffast-math -ffp-contract=on -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 // CHECK-FPC-ON:       "-ffp-contract=on"
 
-// RUN: %clang -### -ffast-math -ffp-contract=off -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffast-math -ffp-contract=off -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
 // CHECK-FPC-OFF:      "-ffp-contract=off"
 
-// RUN: %clang -### -ffast-math -ffp-contract=fast -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffast-math -ffp-contract=fast -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffast-math -ffp-contract=fast-honor-pragmas -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffast-math -ffp-contract=fast-honor-pragmas -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST-HONOR %s
 // CHECK-FPC-FAST-HONOR:     "-ffp-contract=fast-honor-pragmas"
 
-// RUN: %clang -### -ffp-contract=fast -ffast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=fast -ffast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
-// RUN: %clang -### -ffp-contract=on -ffast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=on -ffast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
-// RUN: %clang -### -ffp-contract=off -ffast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=off -ffast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffp-contract=fast -fno-fast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=fast -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
-// RUN: %clang -### -ffp-contract=on -fno-fast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=on -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
-// RUN: %clang -### -ffp-contract=off -fno-fast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=off -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
 
-// RUN: %clang -### -ffast-math -ffp-contract=fast -ffp-contract=on -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffast-math -ffp-contract=fast -ffp-contract=on -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -ffast-math -ffp-contract=on -ffp-contract=off -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffast-math -ffp-contract=on -ffp-contract=off -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
-// RUN: %clang -### -ffast-math -ffp-contract=on -ffp-contract=fast -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffast-math -ffp-contract=on -ffp-contract=fast -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffast-math -ffp-contract=off -ffp-contract=on -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffast-math -ffp-contract=off -ffp-contract=on -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -ffast-math -ffp-contract=off -ffp-contract=fast \
+// RUN: %clang -### -Werror -ffast-math -ffp-contract=off -ffp-contract=fast \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffast-math -ffp-contract=on -fno-fast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffast-math -ffp-contract=on -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -ffast-math -ffp-contract=off -fno-fast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffast-math -ffp-contract=off -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
-// RUN: %clang -### -ffast-math -ffp-contract=fast -fno-fast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffast-math -ffp-contract=fast -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffast-math -fno-fast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffast-math -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -fno-fast-math -ffast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -fno-fast-math -ffast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -fno-fast-math -ffp-contract=on -c %s 2>&1 \
+// RUN: %clang -### -Werror -fno-fast-math -ffp-contract=on -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -fno-fast-math -ffp-contract=off -c %s 2>&1 \
+// RUN: %clang -### -Werror -fno-fast-math -ffp-contract=off -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
-// RUN: %clang -### -fno-fast-math -ffp-contract=fast -c %s 2>&1 \
+// RUN: %clang -### -Werror -fno-fast-math -ffp-contract=fast -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffp-contract=fast -fno-fast-math -ffp-contract=on \
+// RUN: %clang -### -Werror -ffp-contract=fast -fno-fast-math -ffp-contract=on \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -ffp-contract=fast -fno-fast-math -ffp-contract=off \
+// RUN: %clang -### -Werror -ffp-contract=fast -fno-fast-math -ffp-contract=off \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
-// RUN: %clang -### -ffp-contract=off -fno-fast-math -ffp-contract=fast \
+// RUN: %clang -### -Werror -ffp-contract=off -fno-fast-math -ffp-contract=fast \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffp-contract=off -fno-fast-math -ffp-contract=on \
+// RUN: %clang -### -Werror -ffp-contract=off -fno-fast-math -ffp-contract=on \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -ffp-contract=on -ffast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=on -ffast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffp-contract=off -ffast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=off -ffast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffp-contract=fast -ffast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=fast -ffast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffp-contract=on -ffast-math -fno-fast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=on -ffast-math -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -ffp-contract=off -ffast-math -fno-fast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=off -ffast-math -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
-// RUN: %clang -### -ffp-contract=fast -ffast-math -fno-fast-math -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=fast -ffast-math -fno-fast-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -fno-fast-math -ffast-math -ffp-contract=fast \
+// RUN: %clang -### -Werror -fno-fast-math -ffast-math -ffp-contract=fast \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -fno-fast-math -ffast-math -ffp-contract=on \
+// RUN: %clang -### -Werror -fno-fast-math -ffast-math -ffp-contract=on \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -fno-fast-math -ffast-math -ffp-contract=off \
+// RUN: %clang -### -Werror -fno-fast-math -ffast-math -ffp-contract=off \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
 // funsafe-math-optimizations, fno-unsafe-math-optimizations
@@ -125,116 +125,113 @@
 // RUN: %clang -### -fno-unsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=on -c %s 2>&1 \
+// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=on -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=off -c %s 2>&1 \
+// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=off -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
-// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=fast -c %s 2>&1 \
+// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=fast -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffp-contract=fast -funsafe-math-optimizations -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=fast -funsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
-// RUN: %clang -### -ffp-contract=on -funsafe-math-optimizations -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=on -funsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
-// RUN: %clang -### -ffp-contract=off -funsafe-math-optimizations -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=off -funsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffp-contract=fast -fno-unsafe-math-optimizations -c \
+// RUN: %clang -### -Werror -ffp-contract=fast -fno-unsafe-math-optimizations -c \
 // RUN: %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
-// RUN: %clang -### -ffp-contract=on -fno-unsafe-math-optimizations -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=on -fno-unsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
-// RUN: %clang -### -ffp-contract=off -fno-unsafe-math-optimizations -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=off -fno-unsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
-// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=fast \
+// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=fast \
 // RUN: -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=on \
+// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=on \
 // RUN: -ffp-contract=off -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
-// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=on \
+// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=on \
 // RUN: -ffp-contract=fast -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=off \
+// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=off \
 // RUN: -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=off \
+// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=off \
 // RUN: -ffp-contract=fast \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=on \
+// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=on \
 // RUN: -fno-unsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=off \
+// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=off \
 // RUN: -fno-unsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
-// RUN: %clang -### -funsafe-math-optimizations -ffp-contract=fast \
+// RUN: %clang -### -Werror -funsafe-math-optimizations -ffp-contract=fast \
 // RUN: -fno-unsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -funsafe-math-optimizations -fno-unsafe-math-optimizations \
+// RUN: %clang -### -Werror -funsafe-math-optimizations -fno-unsafe-math-optimizations \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -fno-unsafe-math-optimizations -funsafe-math-optimizations \
+// RUN: %clang -### -Werror -fno-unsafe-math-optimizations -funsafe-math-optimizations \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -fno-unsafe-math-optimizations -ffp-contract=on -c %s 2>&1 \
+// RUN: %clang -### -Werror -fno-unsafe-math-optimizations -ffp-contract=on -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -fno-unsafe-math-optimizations -ffp-contract=off -c %s 2>&1 \
+// RUN: %clang -### -Werror -fno-unsafe-math-optimizations -ffp-contract=off -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
-// RUN: %clang -### -fno-unsafe-math-optimizations -ffp-contract=fast -c %s 2>&1 \
+// RUN: %clang -### -Werror -fno-unsafe-math-optimizations -ffp-contract=fast -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffp-contract=fast -fno-unsafe-math-optimizations \
+// RUN: %clang -### -Werror -ffp-contract=fast -fno-unsafe-math-optimizations \
 // RUN: -ffp-contract=on \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -ffp-contract=fast -fno-unsafe-math-optimizations \
+// RUN: %clang -### -Werror -ffp-contract=fast -fno-unsafe-math-optimizations \
 // RUN: -ffp-contract=off \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
-// RUN: %clang -### -ffp-contract=off -fno-unsafe-math-optimizations \
+// RUN: %clang -### -Werror -ffp-contract=off -fno-unsafe-math-optimizations \
 // RUN: -ffp-contract=fast \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffp-contract=off -fno-unsafe-math-optimizations \
+// RUN: %clang -### -Werror -ffp-contract=off -fno-unsafe-math-optimizations \
 // RUN: -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -ffp-contract=on -funsafe-math-optimizations -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=on -funsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffp-contract=off -funsafe-math-optimizations -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=off -funsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffp-contract=fast -funsafe-math-optimizations -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=fast -funsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -ffp-contract=on -funsafe-math-optimizations \
-// RUN: -fno-unsafe-math-optimizations -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=on -funsafe-math-optimizations -fno-unsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -ffp-contract=off -funsafe-math-optimizations \
-// RUN: -fno-unsafe-math-optimizations -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=off -funsafe-math-optimizations -fno-unsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
-// RUN: %clang -### -ffp-contract=fast -funsafe-math-optimizations \
-// RUN: -fno-unsafe-math-optimizations -c %s 2>&1 \
+// RUN: %clang -### -Werror -ffp-contract=fast -funsafe-math-optimizations -fno-unsafe-math-optimizations -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -fno-unsafe-math-optimizations -funsafe-math-optimizations \
-// RUN: -ffp-contract=fast \
+// RUN: %clang -### -Werror -fno-unsafe-math-optimizations -funsafe-math-optimizations \
+// RUN:   -ffp-contract=fast \
 // RUN: -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-FAST %s
 
-// RUN: %clang -### -fno-unsafe-math-optimizations -funsafe-math-optimizations \
-// RUN: -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
+// RUN: %clang -### -Werror -fno-unsafe-math-optimizations -funsafe-math-optimizations \
+// RUN:   -ffp-contract=on -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-ON %s
 
-// RUN: %clang -### -fno-unsafe-math-optimizations -funsafe-math-optimizations \
-// RUN: -ffp-contract=off -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-OFF %s
+// RUN: %clang -### -Werror -fno-unsafe-math-optimizations -funsafe-math-optimizations \
+// RUN:   -ffp-contract=off -c %s 2>&1 | FileCheck --check-prefix=CHECK-FPC-OFF %s
 
diff --git a/clang/test/Driver/fp-model.c b/clang/test/Driver/fp-model.c
index 9d1245239911..644523394d6b 100644
--- a/clang/test/Driver/fp-model.c
+++ b/clang/test/Driver/fp-model.c
@@ -150,7 +150,7 @@
 // CHECK-FEB-IGNORE: "-fno-rounding-math"
 // CHECK-FEB-IGNORE: "-ffp-exception-behavior=ignore"
 
-// RUN: %clang -### -nostdinc -ffast-math -ffp-model=fast -c %s 2>&1 \
+// RUN: %clang -### -nostdinc -Werror -ffast-math -ffp-model=fast -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FASTMATH-FPM-FAST %s
 // CHECK-FASTMATH-FPM-FAST: "-cc1"
 // CHECK-FASTMATH-FPM-FAST: "-menable-no-infs"
@@ -165,7 +165,7 @@
 // CHECK-FASTMATH-FPM-FAST: "-ffast-math"
 // CHECK-FASTMATH-FPM-FAST: "-ffinite-math-only"
 
-// RUN: %clang -### -nostdinc -ffast-math -ffp-model=precise -c %s 2>&1 \
+// RUN: %clang -### -nostdinc -Werror -ffast-math -ffp-model=precise -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FASTMATH-FPM-PRECISE %s
 // CHECK-FASTMATH-FPM-PRECISE:     "-cc1"
 // CHECK-FASTMATH-FPM-PRECISE-NOT: "-menable-no-infs"
@@ -180,7 +180,7 @@
 // CHECK-FASTMATH-FPM-PRECISE-NOT: "-ffast-math"
 // CHECK-FASTMATH-FPM-PRECISE-NOT: "-ffinite-math-only"
 
-// RUN: %clang -### -nostdinc -ffast-math -ffp-model=strict -c %s 2>&1 \
+// RUN: %clang -### -nostdinc -Werror -ffast-math -ffp-model=strict -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-FASTMATH-FPM-STRICT %s
 // CHECK-FASTMATH-FPM-STRICT:     "-cc1"
 // CHECK-FASTMATH-FPM-STRICT-NOT: "-menable-no-infs"
diff --git a/clang/test/Driver/frelaxed-template-template-args.cpp b/clang/test/Driver/frelaxed-template-template-args.cpp
new file mode 100644
index 000000000000..136c360276a1
--- /dev/null
+++ b/clang/test/Driver/frelaxed-template-template-args.cpp
@@ -0,0 +1,7 @@
+// RUN: %clang -fsyntax-only -### %s 2>&1 | FileCheck --check-prefix=CHECK-DEF %s
+// RUN: %clang -fsyntax-only -frelaxed-template-template-args %s 2>&1 | FileCheck --check-prefix=CHECK-ON --allow-empty %s
+// RUN: %clang -fsyntax-only -fno-relaxed-template-template-args %s 2>&1 | FileCheck --check-prefix=CHECK-OFF %s
+
+// CHECK-DEF: "-cc1"{{.*}} "-fno-relaxed-template-template-args"
+// CHECK-ON-NOT: warning: argument '-frelaxed-template-template-args' is deprecated [-Wdeprecated]
+// CHECK-OFF: warning: argument '-fno-relaxed-template-template-args' is deprecated [-Wdeprecated]
diff --git a/clang/test/Driver/fseparate-named-sections.c b/clang/test/Driver/fseparate-named-sections.c
new file mode 100644
index 000000000000..6264b8fcf0d8
--- /dev/null
+++ b/clang/test/Driver/fseparate-named-sections.c
@@ -0,0 +1,4 @@
+// RUN: %clang -### -fseparate-named-sections %s -c 2>&1 | FileCheck -check-prefix=CHECK-OPT %s
+// RUN: %clang -### -fseparate-named-sections -fno-separate-named-sections %s -c 2>&1 | FileCheck -check-prefix=CHECK-NOOPT %s
+// CHECK-OPT: "-fseparate-named-sections"
+// CHECK-NOOPT-NOT: "-fseparate-named-sections"
diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c
index d01445e3aed0..5d5d62805e17 100644
--- a/clang/test/Driver/linker-wrapper-image.c
+++ b/clang/test/Driver/linker-wrapper-image.c
@@ -30,8 +30,8 @@
 
 //      OPENMP: define internal void @.omp_offloading.descriptor_reg() section ".text.startup" {
 // OPENMP-NEXT: entry:
-// OPENMP-NEXT:   %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg)
 // OPENMP-NEXT:   call void @__tgt_register_lib(ptr @.omp_offloading.descriptor)
+// OPENMP-NEXT:   %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg)
 // OPENMP-NEXT:   ret void
 // OPENMP-NEXT: }
 
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index cbf24d4ce3a8..51bf98b2ed39 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -120,7 +120,7 @@ __attribute__((visibility("protected"), used)) int x;
 
 // HIP: clang{{.*}} -o [[IMG_GFX908:.+]] --target=amdgcn-amd-amdhsa -mcpu=gfx908
 // HIP: clang{{.*}} -o [[IMG_GFX90A:.+]] --target=amdgcn-amd-amdhsa -mcpu=gfx90a
-// HIP: clang-offload-bundler{{.*}}-type=o -bundle-align=4096 -compress -compression-level=6 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx908 -input=/dev/null -input=[[IMG_GFX90A]] -input=[[IMG_GFX908]] -output={{.*}}.hipfb
+// HIP: clang-offload-bundler{{.*}}-type=o -bundle-align=4096 -compress -compression-level=6 -targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx90a,hip-amdgcn-amd-amdhsa--gfx908 -input=/dev/null -input=[[IMG_GFX90A]] -input=[[IMG_GFX908]] -output={{.*}}.hipfb
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \
@@ -210,7 +210,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN:   %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=RELOCATABLE-LINK-HIP
 
 // RELOCATABLE-LINK-HIP: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa
-// RELOCATABLE-LINK-HIP: clang-offload-bundler{{.*}} -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx90a -input=/dev/null -input={{.*}} -output={{.*}}
+// RELOCATABLE-LINK-HIP: clang-offload-bundler{{.*}} -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx90a -input=/dev/null -input={{.*}} -output={{.*}}
 // RELOCATABLE-LINK-HIP: /usr/bin/ld.lld{{.*}}-r
 // RELOCATABLE-LINK-HIP: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading
 
diff --git a/clang/test/Driver/msvc-link.c b/clang/test/Driver/msvc-link.c
index b5c32b173785..82659cbb9b49 100644
--- a/clang/test/Driver/msvc-link.c
+++ b/clang/test/Driver/msvc-link.c
@@ -1,34 +1,29 @@
-// RUN: %clang -target i686-pc-windows-msvc -fuse-ld=link -### %s 2>&1 | FileCheck --check-prefix=BASIC %s
-// BASIC: link.exe"
-// BASIC: "-out:a.exe"
-// BASIC: "-defaultlib:libcmt"
-// BASIC: "-defaultlib:oldnames"
-// BASIC: "-nologo"
-// BASIC-NOT: "-Brepro"
-
-// RUN: %clang -target i686-pc-windows-msvc -shared -o a.dll -fuse-ld=link -### %s 2>&1 | FileCheck --check-prefix=DLL %s
-// DLL: link.exe"
-// DLL: "-out:a.dll"
-// DLL: "-defaultlib:libcmt"
-// DLL: "-defaultlib:oldnames"
-// DLL: "-nologo"
-// DLL: "-dll"
-
-// RUN: %clang -target i686-pc-windows-msvc -L/var/empty -L/usr/lib -### %s 2>&1 | FileCheck --check-prefix LIBPATH %s
-// LIBPATH: "-libpath:/var/empty"
-// LIBPATH: "-libpath:/usr/lib"
-// LIBPATH: "-nologo"
+// RUN: %clang --target=i686-pc-windows-msvc -fuse-ld=link -L/var/empty -L/usr/lib -### %s 2>&1 | FileCheck --check-prefix=BASIC %s
+// BASIC:      link.exe"
+// BASIC-SAME: "-out:a.exe"
+// BASIC-SAME: "-defaultlib:libcmt" "-defaultlib:oldnames"
+// BASIC-SAME: "-libpath:/var/empty" "-libpath:/usr/lib"
+// BASIC-SAME: "-nologo"
+// BASIC-NOT:  "-Brepro"
+// BASIC-NOT:  "-dll"
+// BASIC-NOT:  subsystem:console"
+
+// RUN: %clang --target=i686-pc-windows-msvc -shared -o a.dll -fuse-ld=link -### %s 2>&1 | FileCheck --check-prefix=DLL %s
+// DLL:      link.exe"
+// DLL-SAME: "-out:a.dll"
+// DLL-SAME: "-defaultlib:libcmt" "-defaultlib:oldnames"
+// DLL-SAME: "-nologo" "-dll"
 
 // RUN: %clang_cl /Brepro -fuse-ld=link -### -- %s 2>&1 | FileCheck --check-prefix=REPRO %s
-// REPRO: link.exe"
-// REPRO: "-out:msvc-link.exe"
-// REPRO: "-nologo"
-// REPRO: "-Brepro"
+// REPRO:      link.exe"
+// REPRO-SAME: "-out:msvc-link.exe"
+// REPRO-SAME: "-nologo"
+// REPRO-SAME: "-Brepro"
 
 // RUN: %clang_cl /Brepro- -fuse-ld=link -### -- %s 2>&1 | FileCheck --check-prefix=NOREPRO %s
-// NOREPRO: link.exe"
-// NOREPRO: "-out:msvc-link.exe"
-// NOREPRO: "-nologo"
+// NOREPRO:      link.exe"
+// NOREPRO-SAME: "-out:msvc-link.exe"
+// NOREPRO-SAME: "-nologo"
 // NOREPRO-NOT: "-Brepro"
 
 // RUN: %clang_cl -fuse-ld=lld --vfsoverlay %s -### -- %s 2>&1 | FileCheck --check-prefix=VFSOVERLAY %s
@@ -37,22 +32,22 @@
 // VFSOVERLAY: lld-link
 // VFSOVERLAY: "/vfsoverlay:{{.*}}" "{{.*}}.obj"
 
-// RUN: %clang -target arm64ec-pc-windows-msvc -fuse-ld=link -### %s 2>&1 | FileCheck --check-prefix=ARM64EC %s
-// RUN: %clang_cl -target arm64ec-pc-windows-msvc -fuse-ld=link -### -- %s 2>&1 | FileCheck --check-prefix=ARM64EC %s
+// RUN: %clang --target=arm64ec-pc-windows-msvc -fuse-ld=link -### %s 2>&1 | FileCheck --check-prefix=ARM64EC %s
+// RUN: %clang_cl --target=arm64ec-pc-windows-msvc -fuse-ld=link -### -- %s 2>&1 | FileCheck --check-prefix=ARM64EC %s
 // RUN: %clang_cl -arm64EC -fuse-ld=link -### -- %s 2>&1 | FileCheck --check-prefix=ARM64EC %s
 // ARM64EC: "-machine:arm64ec"
 
-// RUN: %clang -target arm64ec-pc-windows-msvc -fuse-ld=link -marm64x -### %s 2>&1 | \
+// RUN: %clang --target=arm64ec-pc-windows-msvc -fuse-ld=link -marm64x -### %s 2>&1 | \
 // RUN:        FileCheck --check-prefix=ARM64X %s
-// RUN: %clang -target aarch64-pc-windows-msvc -fuse-ld=link -marm64x -### %s 2>&1 | \
+// RUN: %clang --target=aarch64-pc-windows-msvc -fuse-ld=link -marm64x -### %s 2>&1 | \
 // RUN:        FileCheck --check-prefix=ARM64X %s
 // RUN: %clang_cl -marm64x -fuse-ld=link -### -- %s 2>&1 | FileCheck --check-prefix=ARM64X %s
 // RUN: %clang_cl -arm64EC -marm64x -fuse-ld=link -### -- %s 2>&1 | FileCheck --check-prefix=ARM64X %s
 // ARM64X: "-machine:arm64x"
 
-// RUN: not %clang -target x86_64-linux-gnu -marm64x -### %s 2>&1 | FileCheck --check-prefix=HYBRID-ERR %s
+// RUN: not %clang --target=x86_64-linux-gnu -marm64x -### %s 2>&1 | FileCheck --check-prefix=HYBRID-ERR %s
 // HYBRID-ERR: error: unsupported option '-marm64x' for target 'x86_64-linux-gnu'
 
-// RUN: %clang -c -marm64x  -target arm64ec-pc-windows-msvc -fuse-ld=link -### %s 2>&1 | \
+// RUN: %clang -c -marm64x  --target=arm64ec-pc-windows-msvc -fuse-ld=link -### %s 2>&1 | \
 // RUN:        FileCheck --check-prefix=HYBRID-WARN %s
 // HYBRID-WARN: warning: argument unused during compilation: '-marm64x' [-Wunused-command-line-argument]
diff --git a/clang/test/Driver/plugin-driver-args.cpp b/clang/test/Driver/plugin-driver-args.cpp
index d6475b4b3d73..6efd859f9d08 100644
--- a/clang/test/Driver/plugin-driver-args.cpp
+++ b/clang/test/Driver/plugin-driver-args.cpp
@@ -20,3 +20,8 @@
 
 // RUN: %clang -fplugin=%llvmshlibdir/CallSuperAttr%pluginext -fplugin-arg-testname- -fsyntax-only %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-PLUGIN-ARG2
 // CHECK-NO-PLUGIN-ARG2: missing plugin argument for plugin testname in -fplugin-arg-testname-
+
+// Plugins are only relevant for the -cc1 phase. No warning should be raised
+// when only using the assembler. See GH #88173.
+// RUN: %clang -c -fpass-plugin=bar.so -fplugin=bar.so -fplugin-arg-bar-option -Wunused-command-line-argument -x assembler %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-PLUGIN-ASM
+// CHECK-PLUGIN-ASM-NOT: argument unused during compilation: '-f{{[a-z-]*plugin[^']*}}'
diff --git a/clang/test/Driver/ps4-ps5-relax-relocations.c b/clang/test/Driver/ps4-ps5-relax-relocations.c
index 41ed3f22b19c..afb83d61cd05 100644
--- a/clang/test/Driver/ps4-ps5-relax-relocations.c
+++ b/clang/test/Driver/ps4-ps5-relax-relocations.c
@@ -1,29 +1,29 @@
-// RUN: %clang -### -target x86_64-scei-ps4 %s -o - 2>&1 | \
-// RUN:   FileCheck %s
-// RUN: %clang -### -target x86_64-scei-ps4 -Wa,-mrelax-relocations=yes %s -o - 2>&1 | \
-// RUN:   FileCheck %s
-// RUN: %clang -### -target x86_64-scei-ps4 -Wa,-mrelax-relocations=no %s -o - 2>&1 | \
-// RUN:   FileCheck -check-prefix=UNSET %s
-// RUN: %clang -### -x assembler -target x86_64-scei-ps4 %s -o - 2>&1 | \
-// RUN:   FileCheck %s
-// RUN: %clang -### -x assembler -target x86_64-scei-ps4 -Wa,-mrelax-relocations=yes %s -o - 2>&1 | \
-// RUN:   FileCheck %s
-// RUN: %clang -### -x assembler -target x86_64-scei-ps4 -Wa,-mrelax-relocations=no %s -o - 2>&1 | \
-// RUN:   FileCheck -check-prefix=UNSET %s
-
-// RUN: %clang -### -target x86_64-sie-ps5 %s -o - 2>&1 | \
-// RUN:   FileCheck %s
-// RUN: %clang -### -target x86_64-sie-ps5 -Wa,-mrelax-relocations=yes %s -o - 2>&1 | \
-// RUN:   FileCheck %s
-// RUN: %clang -### -target x86_64-sie-ps5 -Wa,-mrelax-relocations=no %s -o - 2>&1 | \
-// RUN:   FileCheck -check-prefix=UNSET %s
-// RUN: %clang -### -x assembler -target x86_64-sie-ps5 %s -o - 2>&1 | \
-// RUN:   FileCheck %s
-// RUN: %clang -### -x assembler -target x86_64-sie-ps5 -Wa,-mrelax-relocations=yes %s -o - 2>&1 | \
-// RUN:   FileCheck %s
-// RUN: %clang -### -x assembler -target x86_64-sie-ps5 -Wa,-mrelax-relocations=no %s -o - 2>&1 | \
-// RUN:   FileCheck -check-prefix=UNSET %s
-
-// CHECK-NOT: "-mrelax-relocations
-
-// UNSET: "-mrelax-relocations=no"
+// RUN: %clang -### -target x86_64-scei-ps4 %s -o - 2>&1 | \
+// RUN:   FileCheck %s
+// RUN: %clang -### -target x86_64-scei-ps4 -Wa,-mrelax-relocations=yes %s -o - 2>&1 | \
+// RUN:   FileCheck %s
+// RUN: %clang -### -target x86_64-scei-ps4 -Wa,-mrelax-relocations=no %s -o - 2>&1 | \
+// RUN:   FileCheck -check-prefix=UNSET %s
+// RUN: %clang -### -x assembler -target x86_64-scei-ps4 %s -o - 2>&1 | \
+// RUN:   FileCheck %s
+// RUN: %clang -### -x assembler -target x86_64-scei-ps4 -Wa,-mrelax-relocations=yes %s -o - 2>&1 | \
+// RUN:   FileCheck %s
+// RUN: %clang -### -x assembler -target x86_64-scei-ps4 -Wa,-mrelax-relocations=no %s -o - 2>&1 | \
+// RUN:   FileCheck -check-prefix=UNSET %s
+
+// RUN: %clang -### -target x86_64-sie-ps5 %s -o - 2>&1 | \
+// RUN:   FileCheck %s
+// RUN: %clang -### -target x86_64-sie-ps5 -Wa,-mrelax-relocations=yes %s -o - 2>&1 | \
+// RUN:   FileCheck %s
+// RUN: %clang -### -target x86_64-sie-ps5 -Wa,-mrelax-relocations=no %s -o - 2>&1 | \
+// RUN:   FileCheck -check-prefix=UNSET %s
+// RUN: %clang -### -x assembler -target x86_64-sie-ps5 %s -o - 2>&1 | \
+// RUN:   FileCheck %s
+// RUN: %clang -### -x assembler -target x86_64-sie-ps5 -Wa,-mrelax-relocations=yes %s -o - 2>&1 | \
+// RUN:   FileCheck %s
+// RUN: %clang -### -x assembler -target x86_64-sie-ps5 -Wa,-mrelax-relocations=no %s -o - 2>&1 | \
+// RUN:   FileCheck -check-prefix=UNSET %s
+
+// CHECK-NOT: "-mrelax-relocations
+
+// UNSET: "-mrelax-relocations=no"
diff --git a/clang/test/Driver/rewrite-legacy-objc.m b/clang/test/Driver/rewrite-legacy-objc.m
index 413a7a7a61f0..d45fb8c405c5 100644
--- a/clang/test/Driver/rewrite-legacy-objc.m
+++ b/clang/test/Driver/rewrite-legacy-objc.m
@@ -3,11 +3,11 @@
 // TEST0: "-cc1"
 // TEST0: "-rewrite-objc"
 // FIXME: CHECK-NOT is broken somehow, it doesn't work here. Check adjacency instead.
-// TEST0: "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1"{{.*}} "-fobjc-runtime=macosx-fragile" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fexceptions" "-fmax-type-align=16"
+// TEST0: "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1"{{.*}} "-fobjc-runtime=macosx-fragile" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fexceptions" "-fno-relaxed-template-template-args" "-fmax-type-align=16"
 // TEST0: rewrite-legacy-objc.m"
 // RUN: %clang --target=i386-apple-macosx10.9.0 -rewrite-legacy-objc %s -o - -### 2>&1 | \
 // RUN:   FileCheck -check-prefix=TEST1 %s
 // RUN: %clang --target=i386-apple-macosx10.6.0 -rewrite-legacy-objc %s -o - -### 2>&1 | \
 // RUN:   FileCheck -check-prefix=TEST2 %s
-// TEST1: "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1"{{.*}} "-fobjc-runtime=macosx-fragile" "-fobjc-subscripting-legacy-runtime" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fmax-type-align=16"
-// TEST2: "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1"{{.*}} "-fobjc-runtime=macosx-fragile" "-fobjc-subscripting-legacy-runtime" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fmax-type-align=16"
+// TEST1: "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1"{{.*}} "-fobjc-runtime=macosx-fragile" "-fobjc-subscripting-legacy-runtime" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fno-relaxed-template-template-args" "-fmax-type-align=16"
+// TEST2: "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1"{{.*}} "-fobjc-runtime=macosx-fragile" "-fobjc-subscripting-legacy-runtime" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fno-relaxed-template-template-args" "-fmax-type-align=16"
diff --git a/clang/test/Driver/rewrite-objc.m b/clang/test/Driver/rewrite-objc.m
index de3577a770df..d19d38d8ab83 100644
--- a/clang/test/Driver/rewrite-objc.m
+++ b/clang/test/Driver/rewrite-objc.m
@@ -2,4 +2,4 @@
 // RUN:   FileCheck -check-prefix=TEST0 %s
 // TEST0: "-cc1" {{.*}} "-rewrite-objc"
 // FIXME: CHECK-NOT is broken somehow, it doesn't work here. Check adjacency instead.
-// TEST0: "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1"{{.*}} "-fobjc-runtime=macosx" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fexceptions" "-fmax-type-align=16"
+// TEST0: "-stack-protector" "1" "-fblocks" "-fencode-extended-block-signature" "-fregister-global-dtors-with-atexit" "-fgnuc-version=4.2.1"{{.*}} "-fobjc-runtime=macosx" "-fno-objc-infer-related-result-type" "-fobjc-exceptions" "-fexceptions" "-fno-relaxed-template-template-args" "-fmax-type-align=16"
diff --git a/clang/test/Driver/riscv-arch.c b/clang/test/Driver/riscv-arch.c
index 8c701a736fc7..ddf617bbb623 100644
--- a/clang/test/Driver/riscv-arch.c
+++ b/clang/test/Driver/riscv-arch.c
@@ -199,7 +199,7 @@
 // RUN: not %clang --target=riscv32-unknown-elf -march=rv32imC -### %s \
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-LOWER %s
 // RV32-LOWER: error: invalid arch name 'rv32imC',
-// RV32-LOWER: string must be lowercase
+// RV32-LOWER: string may only contain [a-z0-9_]
 
 // RUN: not %clang --target=riscv32-unknown-elf -march=unknown -### %s \
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-STR %s
diff --git a/clang/test/Driver/std-trigraph-override.c b/clang/test/Driver/std-trigraph-override.c
new file mode 100644
index 000000000000..e4b83ffcf823
--- /dev/null
+++ b/clang/test/Driver/std-trigraph-override.c
@@ -0,0 +1,7 @@
+// UNSUPPORTED: target={{.*-zos.*}}
+// RUN: %clang -w -std=c99 -trigraphs -std=gnu99 %s -E -o - | FileCheck -check-prefix=OVERRIDE %s
+// OVERRIDE: ??(??)
+// RUN: %clang -w -std=c99 -ftrigraphs -std=gnu99 %s -E -o - | FileCheck -check-prefix=FOVERRIDE %s
+// FOVERRIDE: ??(??)
+
+??(??)
diff --git a/clang/test/Driver/std.c b/clang/test/Driver/std.c
index 54f746cc63d0..fe0c4671d9d6 100644
--- a/clang/test/Driver/std.c
+++ b/clang/test/Driver/std.c
@@ -1,7 +1,3 @@
-// RUN: %clang -w -std=c99 -trigraphs -std=gnu99 %s -E -o - | FileCheck -check-prefix=OVERRIDE %s
-// OVERRIDE: ??(??)
-// RUN: %clang -w -std=c99 -ftrigraphs -std=gnu99 %s -E -o - | FileCheck -check-prefix=FOVERRIDE %s
-// FOVERRIDE: ??(??)
 // RUN: %clang -w -ansi %s -E -o - | FileCheck -check-prefix=ANSI %s
 // ANSI: []
 // RUN: %clang -w -ansi %s -fno-trigraphs -E -o - | FileCheck -check-prefix=ANSI-OVERRIDE %s
diff --git a/clang/test/Driver/wasm-features.c b/clang/test/Driver/wasm-features.c
index 1f7fb2134982..b77cb5ea9b49 100644
--- a/clang/test/Driver/wasm-features.c
+++ b/clang/test/Driver/wasm-features.c
@@ -11,35 +11,35 @@
 // GENERIC: "-target-cpu" "generic"
 // BLEEDING-EDGE: "-target-cpu" "bleeding-edge"
 
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -matomics 2>&1 | FileCheck %s -check-prefix=ATOMICS
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-atomics 2>&1 | FileCheck %s -check-prefix=NO-ATOMICS
+
+// ATOMICS: "-target-feature" "+atomics"
+// NO-ATOMICS: "-target-feature" "-atomics"
+
 // RUN: %clang --target=wasm32-unknown-unknown -### %s -mbulk-memory 2>&1 | FileCheck %s -check-prefix=BULK-MEMORY
 // RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-bulk-memory 2>&1 | FileCheck %s -check-prefix=NO-BULK-MEMORY
 
 // BULK-MEMORY: "-target-feature" "+bulk-memory"
 // NO-BULK-MEMORY: "-target-feature" "-bulk-memory"
 
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mmutable-globals 2>&1 | FileCheck %s -check-prefix=MUTABLE-GLOBALS
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-mutable-globals 2>&1 | FileCheck %s -check-prefix=NO-MUTABLE-GLOBALS
-
-// MUTABLE-GLOBALS: "-target-feature" "+mutable-globals"
-// NO-MUTABLE-GLOBALS: "-target-feature" "-mutable-globals"
-
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -msign-ext 2>&1 | FileCheck %s -check-prefix=SIGN-EXT
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-sign-ext 2>&1 | FileCheck %s -check-prefix=NO-SIGN-EXT
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mexception-handling 2>&1 | FileCheck %s -check-prefix=EXCEPTION-HANDLING
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-exception-handling 2>&1 | FileCheck %s -check-prefix=NO-EXCEPTION-HANDLING
 
-// SIGN-EXT: "-target-feature" "+sign-ext"
-// NO-SIGN-EXT: "-target-feature" "-sign-ext"
+// EXCEPTION-HANDLING: "-target-feature" "+exception-handling"
+// NO-EXCEPTION-HANDLING: "-target-feature" "-exception-handling"
 
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mnontrapping-fptoint 2>&1 | FileCheck %s -check-prefix=NONTRAPPING-FPTOINT
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-nontrapping-fptoint 2>&1 | FileCheck %s -check-prefix=NO-NONTRAPPING-FPTOINT
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mextended-const 2>&1 | FileCheck %s -check-prefix=EXTENDED-CONST
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-extended-const 2>&1 | FileCheck %s -check-prefix=NO-EXTENDED-CONST
 
-// NONTRAPPING-FPTOINT: "-target-feature" "+nontrapping-fptoint"
-// NO-NONTRAPPING-FPTOINT: "-target-feature" "-nontrapping-fptoint"
+// EXTENDED-CONST: "-target-feature" "+extended-const"
+// NO-EXTENDED-CONST: "-target-feature" "-extended-const"
 
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mmultivalue 2>&1 | FileCheck %s -check-prefix=MULTIVALUE
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-multivalue 2>&1 | FileCheck %s -check-prefix=NO-MULTIVALUE
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mhalf-precision 2>&1 | FileCheck %s -check-prefix=HALF-PRECISION
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-half-precision 2>&1 | FileCheck %s -check-prefix=NO-HALF-PRECISION
 
-// MULTIVALUE: "-target-feature" "+multivalue"
-// NO-MULTIVALUE: "-target-feature" "-multivalue"
+// HALF-PRECISION: "-target-feature" "+half-precision"
+// NO-HALF-PRECISION: "-target-feature" "-half-precision"
 
 // RUN: %clang --target=wasm32-unknown-unknown -### %s -mmultimemory 2>&1 | FileCheck %s -check-prefix=MULTIMEMORY
 // RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-multimemory 2>&1 | FileCheck %s -check-prefix=NO-MULTIMEMORY
@@ -47,17 +47,23 @@
 // MULTIMEMORY: "-target-feature" "+multimemory"
 // NO-MULTIMEMORY: "-target-feature" "-multimemory"
 
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -matomics 2>&1 | FileCheck %s -check-prefix=ATOMICS
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-atomics 2>&1 | FileCheck %s -check-prefix=NO-ATOMICS
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mmultivalue 2>&1 | FileCheck %s -check-prefix=MULTIVALUE
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-multivalue 2>&1 | FileCheck %s -check-prefix=NO-MULTIVALUE
 
-// ATOMICS: "-target-feature" "+atomics"
-// NO-ATOMICS: "-target-feature" "-atomics"
+// MULTIVALUE: "-target-feature" "+multivalue"
+// NO-MULTIVALUE: "-target-feature" "-multivalue"
 
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mtail-call 2>&1 | FileCheck %s -check-prefix=TAIL-CALL
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-tail-call 2>&1 | FileCheck %s -check-prefix=NO-TAIL-CALL
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mmutable-globals 2>&1 | FileCheck %s -check-prefix=MUTABLE-GLOBALS
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-mutable-globals 2>&1 | FileCheck %s -check-prefix=NO-MUTABLE-GLOBALS
 
-// TAIL-CALL: "-target-feature" "+tail-call"
-// NO-TAIL-CALL: "-target-feature" "-tail-call"
+// MUTABLE-GLOBALS: "-target-feature" "+mutable-globals"
+// NO-MUTABLE-GLOBALS: "-target-feature" "-mutable-globals"
+
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mnontrapping-fptoint 2>&1 | FileCheck %s -check-prefix=NONTRAPPING-FPTOINT
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-nontrapping-fptoint 2>&1 | FileCheck %s -check-prefix=NO-NONTRAPPING-FPTOINT
+
+// NONTRAPPING-FPTOINT: "-target-feature" "+nontrapping-fptoint"
+// NO-NONTRAPPING-FPTOINT: "-target-feature" "-nontrapping-fptoint"
 
 // RUN: %clang --target=wasm32-unknown-unknown -### %s -mreference-types 2>&1 | FileCheck %s -check-prefix=REFERENCE-TYPES
 // RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-reference-types 2>&1 | FileCheck %s -check-prefix=NO-REFERENCE-TYPES
@@ -65,32 +71,26 @@
 // REFERENCE-TYPES: "-target-feature" "+reference-types"
 // NO-REFERENCE-TYPES: "-target-feature" "-reference-types"
 
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -msimd128 2>&1 | FileCheck %s -check-prefix=SIMD128
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-simd128 2>&1 | FileCheck %s -check-prefix=NO-SIMD128
-
-// SIMD128: "-target-feature" "+simd128"
-// NO-SIMD128: "-target-feature" "-simd128"
-
 // RUN: %clang --target=wasm32-unknown-unknown -### %s -mrelaxed-simd 2>&1 | FileCheck %s -check-prefix=RELAXED-SIMD
 // RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-relaxed-simd 2>&1 | FileCheck %s -check-prefix=NO-RELAXED-SIMD
 
 // RELAXED-SIMD: "-target-feature" "+relaxed-simd"
 // NO-RELAXED-SIMD: "-target-feature" "-relaxed-simd"
 
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mhalf-precision 2>&1 | FileCheck %s -check-prefix=HALF-PRECISION
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-half-precision 2>&1 | FileCheck %s -check-prefix=NO-HALF-PRECISION
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -msign-ext 2>&1 | FileCheck %s -check-prefix=SIGN-EXT
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-sign-ext 2>&1 | FileCheck %s -check-prefix=NO-SIGN-EXT
 
-// HALF-PRECISION: "-target-feature" "+half-precision"
-// NO-HALF-PRECISION: "-target-feature" "-half-precision"
+// SIGN-EXT: "-target-feature" "+sign-ext"
+// NO-SIGN-EXT: "-target-feature" "-sign-ext"
 
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mexception-handling 2>&1 | FileCheck %s -check-prefix=EXCEPTION-HANDLING
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-exception-handling 2>&1 | FileCheck %s -check-prefix=NO-EXCEPTION-HANDLING
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -msimd128 2>&1 | FileCheck %s -check-prefix=SIMD128
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-simd128 2>&1 | FileCheck %s -check-prefix=NO-SIMD128
 
-// EXCEPTION-HANDLING: "-target-feature" "+exception-handling"
-// NO-EXCEPTION-HANDLING: "-target-feature" "-exception-handling"
+// SIMD128: "-target-feature" "+simd128"
+// NO-SIMD128: "-target-feature" "-simd128"
 
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mextended-const 2>&1 | FileCheck %s -check-prefix=EXTENDED-CONST
-// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-extended-const 2>&1 | FileCheck %s -check-prefix=NO-EXTENDED-CONST
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mtail-call 2>&1 | FileCheck %s -check-prefix=TAIL-CALL
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-tail-call 2>&1 | FileCheck %s -check-prefix=NO-TAIL-CALL
 
-// EXTENDED-CONST: "-target-feature" "+extended-const"
-// NO-EXTENDED-CONST: "-target-feature" "-extended-const"
+// TAIL-CALL: "-target-feature" "+tail-call"
+// NO-TAIL-CALL: "-target-feature" "-tail-call"
diff --git a/clang/test/Driver/wasm-toolchain.c b/clang/test/Driver/wasm-toolchain.c
index dabf0ac2433b..7c26c2c13c0b 100644
--- a/clang/test/Driver/wasm-toolchain.c
+++ b/clang/test/Driver/wasm-toolchain.c
@@ -120,11 +120,12 @@
 // RUN:   | FileCheck -check-prefix=EMSCRIPTEN_EH_ALLOWED_WO_ENABLE %s
 // EMSCRIPTEN_EH_ALLOWED_WO_ENABLE: invalid argument '-mllvm -emscripten-cxx-exceptions-allowed' only allowed with '-mllvm -enable-emscripten-cxx-exceptions'
 
-// '-fwasm-exceptions' sets +exception-handling and '-mllvm -wasm-enable-eh'
+// '-fwasm-exceptions' sets +exception-handling, -multivalue, -reference-types
+// and '-mllvm -wasm-enable-eh'
 // RUN: %clang -### --target=wasm32-unknown-unknown \
 // RUN:    --sysroot=/foo %s -fwasm-exceptions 2>&1 \
 // RUN:  | FileCheck -check-prefix=WASM_EXCEPTIONS %s
-// WASM_EXCEPTIONS: "-cc1" {{.*}} "-target-feature" "+exception-handling" "-mllvm" "-wasm-enable-eh"
+// WASM_EXCEPTIONS: "-cc1" {{.*}} "-target-feature" "+exception-handling" "-mllvm" "-wasm-enable-eh" "-target-feature" "+multivalue" "-target-feature" "+reference-types"
 
 // '-fwasm-exceptions' not allowed with '-mno-exception-handling'
 // RUN: not %clang -### --target=wasm32-unknown-unknown \
@@ -132,19 +133,32 @@
 // RUN:   | FileCheck -check-prefix=WASM_EXCEPTIONS_NO_EH %s
 // WASM_EXCEPTIONS_NO_EH: invalid argument '-fwasm-exceptions' not allowed with '-mno-exception-handling'
 
-// '-fwasm-exceptions' not allowed with '-mllvm -enable-emscripten-cxx-exceptions'
+// '-fwasm-exceptions' not allowed with
+// '-mllvm -enable-emscripten-cxx-exceptions'
 // RUN: not %clang -### --target=wasm32-unknown-unknown \
 // RUN:     --sysroot=/foo %s -fwasm-exceptions \
 // RUN:     -mllvm -enable-emscripten-cxx-exceptions 2>&1 \
 // RUN:   | FileCheck -check-prefix=WASM_EXCEPTIONS_EMSCRIPTEN_EH %s
 // WASM_EXCEPTIONS_EMSCRIPTEN_EH: invalid argument '-fwasm-exceptions' not allowed with '-mllvm -enable-emscripten-cxx-exceptions'
 
-// '-mllvm -wasm-enable-sjlj' sets +exception-handling and
-// '-exception-model=wasm'
+// '-fwasm-exceptions' not allowed with '-mno-multivalue'
+// RUN: not %clang -### --target=wasm32-unknown-unknown \
+// RUN:     --sysroot=/foo %s -fwasm-exceptions -mno-multivalue 2>&1 \
+// RUN:   | FileCheck -check-prefix=WASM_EXCEPTIONS_NO_MULTIVALUE %s
+// WASM_EXCEPTIONS_NO_MULTIVALUE: invalid argument '-fwasm-exceptions' not allowed with '-mno-multivalue'
+
+// '-fwasm-exceptions' not allowed with '-mno-reference-types'
+// RUN: not %clang -### --target=wasm32-unknown-unknown \
+// RUN:     --sysroot=/foo %s -fwasm-exceptions -mno-reference-types 2>&1 \
+// RUN:   | FileCheck -check-prefix=WASM_EXCEPTIONS_NO_REFERENCE_TYPES %s
+// WASM_EXCEPTIONS_NO_REFERENCE_TYPES: invalid argument '-fwasm-exceptions' not allowed with '-mno-reference-types'
+
+// '-mllvm -wasm-enable-sjlj' sets +exception-handling, +multivalue,
+// +reference-types  and '-exception-model=wasm'
 // RUN: %clang -### --target=wasm32-unknown-unknown \
 // RUN:    --sysroot=/foo %s -mllvm -wasm-enable-sjlj 2>&1 \
 // RUN:  | FileCheck -check-prefix=WASM_SJLJ %s
-// WASM_SJLJ: "-cc1" {{.*}} "-target-feature" "+exception-handling" "-exception-model=wasm"
+// WASM_SJLJ: "-cc1" {{.*}} "-target-feature" "+exception-handling" "-exception-model=wasm" "-target-feature" "+multivalue" "-target-feature" "+reference-types"
 
 // '-mllvm -wasm-enable-sjlj' not allowed with '-mno-exception-handling'
 // RUN: not %clang -### --target=wasm32-unknown-unknown \
@@ -168,6 +182,19 @@
 // RUN:   | FileCheck -check-prefix=WASM_SJLJ_EMSCRIPTEN_SJLJ %s
 // WASM_SJLJ_EMSCRIPTEN_SJLJ: invalid argument '-mllvm -wasm-enable-sjlj' not allowed with '-mllvm -enable-emscripten-sjlj'
 
+// '-mllvm -wasm-enable-sjlj' not allowed with '-mno-multivalue'
+// RUN: not %clang -### --target=wasm32-unknown-unknown \
+// RUN:     --sysroot=/foo %s -mllvm -wasm-enable-sjlj -mno-multivalue 2>&1 \
+// RUN:   | FileCheck -check-prefix=WASM_SJLJ_NO_MULTIVALUE %s
+// WASM_SJLJ_NO_MULTIVALUE: invalid argument '-mllvm -wasm-enable-sjlj' not allowed with '-mno-multivalue'
+
+// '-mllvm -wasm-enable-sjlj' not allowed with '-mno-reference-types'
+// RUN: not %clang -### --target=wasm32-unknown-unknown \
+// RUN:     --sysroot=/foo %s -mllvm -wasm-enable-sjlj \
+// RUN:     -mno-reference-types 2>&1 \
+// RUN:   | FileCheck -check-prefix=WASM_SJLJ_NO_REFERENCE_TYPES %s
+// WASM_SJLJ_NO_REFERENCE_TYPES: invalid argument '-mllvm -wasm-enable-sjlj' not allowed with '-mno-reference-types'
+
 // RUN: %clang -### %s -fsanitize=address --target=wasm32-unknown-emscripten 2>&1 | FileCheck -check-prefix=CHECK-ASAN-EMSCRIPTEN %s
 // CHECK-ASAN-EMSCRIPTEN: "-fsanitize=address"
 // CHECK-ASAN-EMSCRIPTEN: "-fsanitize-address-globals-dead-stripping"
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index 998d5f37da69..25f8f66bc321 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -436,12 +436,14 @@
 // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=ppx %s -### -o %t.o 2>&1 | FileCheck -check-prefix=PPX %s
 // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=ndd %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NDD %s
 // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=ccmp %s -### -o %t.o 2>&1 | FileCheck -check-prefix=CCMP %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=nf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NF %s
 // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=cf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=CF %s
 // EGPR: "-target-feature" "+egpr"
 // PUSH2POP2: "-target-feature" "+push2pop2"
 // PPX: "-target-feature" "+ppx"
 // NDD: "-target-feature" "+ndd"
 // CCMP: "-target-feature" "+ccmp"
+// NF: "-target-feature" "+nf"
 // CF: "-target-feature" "+cf"
 
 // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=egpr,ndd %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EGPR-NDD %s
diff --git a/clang/test/FixIt/dereference-addressof.c b/clang/test/FixIt/dereference-addressof.c
index 037622ea995f..f01fc32b7970 100644
--- a/clang/test/FixIt/dereference-addressof.c
+++ b/clang/test/FixIt/dereference-addressof.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
 // RUN: cp %s %t
-// RUN: not %clang_cc1 -fsyntax-only -fixit -x c %t
+// RUN: not %clang_cc1 -fixit -x c %t
 // RUN: %clang_cc1 -fsyntax-only -pedantic -x c %t
 
 void ip(int *aPtr) {}   // expected-note{{passing argument to parameter 'aPtr' here}}
diff --git a/clang/test/FixIt/fixit-c++2a-tls.cpp b/clang/test/FixIt/fixit-c++2a-tls.cpp
new file mode 100644
index 000000000000..97f2899c9083
--- /dev/null
+++ b/clang/test/FixIt/fixit-c++2a-tls.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -verify -std=c++2a -pedantic-errors %s
+// RUN: cp %s %t
+// RUN: %clang_cc1 -x c++ -std=c++2a -fixit %t
+// RUN: %clang_cc1 -Wall -pedantic-errors -x c++ -std=c++2a %t
+// RUN: cat %t | FileCheck %s
+// UNSUPPORTED: target={{.*-zos.*}}
+
+/* This is a test of the various code modification hints that only
+   apply in C++2a. */
+
+namespace constinit_mismatch {
+  extern thread_local constinit int a; // expected-note {{declared constinit here}}
+  thread_local int a = 123; // expected-error {{'constinit' specifier missing on initializing declaration of 'a'}}
+  // CHECK: {{^}}  constinit thread_local int a = 123;
+}
+
diff --git a/clang/test/FixIt/fixit-c++2a.cpp b/clang/test/FixIt/fixit-c++2a.cpp
index 6fe05dabf079..a21dd701ec74 100644
--- a/clang/test/FixIt/fixit-c++2a.cpp
+++ b/clang/test/FixIt/fixit-c++2a.cpp
@@ -16,10 +16,6 @@ template<typename ...T> void init_capture_pack(T ...a) {
 }
 
 namespace constinit_mismatch {
-  extern thread_local constinit int a; // expected-note {{declared constinit here}}
-  thread_local int a = 123; // expected-error {{'constinit' specifier missing on initializing declaration of 'a'}}
-  // CHECK: {{^}}  constinit thread_local int a = 123;
-
   int b = 123; // expected-note {{add the 'constinit' specifier}}
   extern constinit int b; // expected-error {{'constinit' specifier added after initialization of variable}}
   // CHECK: {{^}}  extern int b;
diff --git a/clang/test/FixIt/fixit-format-darwin.m b/clang/test/FixIt/fixit-format-darwin.m
index f3981a7c0152..e8a650d79176 100644
--- a/clang/test/FixIt/fixit-format-darwin.m
+++ b/clang/test/FixIt/fixit-format-darwin.m
@@ -1,5 +1,5 @@
 // RUN: cp %s %t
-// RUN: %clang_cc1 -triple x86_64-apple-darwin9 -fsyntax-only -fblocks -Wformat -fixit %t
+// RUN: %clang_cc1 -triple x86_64-apple-darwin9 -fblocks -Wformat -fixit %t
 // RUN: grep -v CHECK %t | FileCheck %s
 
 /* This is a test of code modifications created by darwin format fix-its hints 
diff --git a/clang/test/FixIt/fixit-format-ios-nopedantic.m b/clang/test/FixIt/fixit-format-ios-nopedantic.m
index 740a86bc82a1..db9ac797c247 100644
--- a/clang/test/FixIt/fixit-format-ios-nopedantic.m
+++ b/clang/test/FixIt/fixit-format-ios-nopedantic.m
@@ -1,5 +1,5 @@
 // RUN: cp %s %t
-// RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -fsyntax-only -Wformat -Werror -fixit %t
+// RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -Wformat -Werror -fixit %t
 
 int printf(const char *restrict, ...);
 typedef unsigned int NSUInteger;
diff --git a/clang/test/FixIt/fixit-format-ios.m b/clang/test/FixIt/fixit-format-ios.m
index c4592c80042d..3db75dfc19e3 100644
--- a/clang/test/FixIt/fixit-format-ios.m
+++ b/clang/test/FixIt/fixit-format-ios.m
@@ -1,5 +1,5 @@
 // RUN: cp %s %t
-// RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -fsyntax-only -Wformat-pedantic -fixit %t
+// RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -Wformat-pedantic -fixit %t
 // RUN: grep -v CHECK %t | FileCheck %s
 
 int printf(const char * restrict, ...);
diff --git a/clang/test/FixIt/fixit-include.c b/clang/test/FixIt/fixit-include.c
index 455455687c0d..f970f6faae7b 100644
--- a/clang/test/FixIt/fixit-include.c
+++ b/clang/test/FixIt/fixit-include.c
@@ -2,7 +2,7 @@
 // RUN: mkdir -p %t-dir
 // RUN: cp %s %t-dir/fixit-include.c
 // RUN: cp %S/fixit-include.h %t-dir/fixit-include.h
-// RUN: not %clang_cc1 -fsyntax-only -fixit %t-dir/fixit-include.c
+// RUN: not %clang_cc1 -fixit %t-dir/fixit-include.c
 // RUN: %clang_cc1 -Wall -pedantic %t-dir/fixit-include.c
 // RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
 
diff --git a/clang/test/FixIt/fixit-unused-lambda-capture.cpp b/clang/test/FixIt/fixit-unused-lambda-capture.cpp
index c3120093ed1f..ce0c78d67709 100644
--- a/clang/test/FixIt/fixit-unused-lambda-capture.cpp
+++ b/clang/test/FixIt/fixit-unused-lambda-capture.cpp
@@ -1,5 +1,5 @@
 // RUN: cp %s %t
-// RUN: %clang_cc1 -x c++ -fsyntax-only -Wunused-lambda-capture -Wno-unused-value -std=c++1z -fixit %t
+// RUN: %clang_cc1 -x c++ -Wunused-lambda-capture -Wno-unused-value -std=c++1z -fixit %t
 // RUN: grep -v CHECK %t | FileCheck %s
 
 void test() {
diff --git a/clang/test/FixIt/objc-literals.m b/clang/test/FixIt/objc-literals.m
index dbed87b75733..866b00983292 100644
--- a/clang/test/FixIt/objc-literals.m
+++ b/clang/test/FixIt/objc-literals.m
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
 // RUN: cp %s %t
-// RUN: not %clang_cc1 -fsyntax-only -fixit -x objective-c %t
+// RUN: not %clang_cc1 -fixit -x objective-c %t
 // RUN: %clang_cc1 -fsyntax-only -pedantic -Werror -x objective-c %t
 
 typedef unsigned char BOOL;
diff --git a/clang/test/FixIt/typo-location-bugs.cpp b/clang/test/FixIt/typo-location-bugs.cpp
index c7111a801066..fb60fd6d0209 100644
--- a/clang/test/FixIt/typo-location-bugs.cpp
+++ b/clang/test/FixIt/typo-location-bugs.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
 // RUN: cp %s %t
-// RUN: not %clang_cc1 -fsyntax-only -fixit -x c++ %t
+// RUN: not %clang_cc1 -fixit -x c++ %t
 // RUN: %clang_cc1 -fsyntax-only -pedantic -Werror -x c++ %t
 
 namespace dcl_fct_default_p10 {
diff --git a/clang/test/FixIt/typo-using.cpp b/clang/test/FixIt/typo-using.cpp
index e676b1074f9f..a96effefb432 100644
--- a/clang/test/FixIt/typo-using.cpp
+++ b/clang/test/FixIt/typo-using.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
 // RUN: cp %s %t
-// RUN: not %clang_cc1 -fsyntax-only -fixit -x c++ %t
+// RUN: not %clang_cc1 -fixit -x c++ %t
 // RUN: %clang_cc1 -fsyntax-only -pedantic -Werror -x c++ %t
 // RUN: grep using_suggestion_tyname_ty_dropped_specifier %t
 
diff --git a/clang/test/FixIt/typo.c b/clang/test/FixIt/typo.c
index 295d2cbd45fa..524e40068638 100644
--- a/clang/test/FixIt/typo.c
+++ b/clang/test/FixIt/typo.c
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
 // RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
 // RUN: cp %s %t
-// RUN: not %clang_cc1 -fsyntax-only -fixit -x c %t
+// RUN: not %clang_cc1 -fixit -x c %t
 // RUN: %clang_cc1 -fsyntax-only -pedantic -Werror -x c %t
 
 struct Point {
diff --git a/clang/test/FixIt/typo.cpp b/clang/test/FixIt/typo.cpp
index 2a743991be41..e489fbbcaa1d 100644
--- a/clang/test/FixIt/typo.cpp
+++ b/clang/test/FixIt/typo.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
 // RUN: cp %s %t
-// RUN: not %clang_cc1 -fsyntax-only -fixit -x c++ %t
+// RUN: not %clang_cc1 -fixit -x c++ %t
 // RUN: %clang_cc1 -fsyntax-only -pedantic -Werror -x c++ %t
 // RUN: grep test_string %t
 
diff --git a/clang/test/FixIt/typo.m b/clang/test/FixIt/typo.m
index 5544eab05eb9..9f777070b634 100644
--- a/clang/test/FixIt/typo.m
+++ b/clang/test/FixIt/typo.m
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -triple x86_64-apple-darwin10 -DNON_FIXITS -verify -Wno-objc-root-class %s
 // RUN: cp %s %t
-// RUN: not %clang_cc1 -x objective-c -fsyntax-only -triple x86_64-apple-darwin10 -fixit -Wno-objc-root-class %t
+// RUN: not %clang_cc1 -x objective-c -triple x86_64-apple-darwin10 -fixit -Wno-objc-root-class %t
 // RUN: %clang_cc1 -x objective-c -fsyntax-only -triple x86_64-apple-darwin10 -pedantic -Werror -Wno-objc-root-class %t
 // RUN: grep "@implementation Sub3" %t
 
diff --git a/clang/test/Frontend/fixed_point_add.c b/clang/test/Frontend/fixed_point_add.c
index 1f1b8d642707..757b68cb6e73 100644
--- a/clang/test/Frontend/fixed_point_add.c
+++ b/clang/test/Frontend/fixed_point_add.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 short _Accum sa;
 _Accum a, a2, a3, a4;
diff --git a/clang/test/Frontend/fixed_point_add_const.c b/clang/test/Frontend/fixed_point_add_const.c
index 6c8c7cb86b5d..8fa025ab49ac 100644
--- a/clang/test/Frontend/fixed_point_add_const.c
+++ b/clang/test/Frontend/fixed_point_add_const.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 // Addition between different fixed point types
 short _Accum sa_const = 1.0hk + 2.0hk;
diff --git a/clang/test/Frontend/fixed_point_comparisons.c b/clang/test/Frontend/fixed_point_comparisons.c
index a59f06b9c942..8cd2aa2dbc65 100644
--- a/clang/test/Frontend/fixed_point_comparisons.c
+++ b/clang/test/Frontend/fixed_point_comparisons.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNPADDED
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,PADDED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNPADDED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,PADDED
 
 // Fixed point against other fixed point
 _Bool b_eq_true = 2.5hk == 2.5uhk;  // CHECK-DAG: @b_eq_true  = {{.*}}global i8 1, align 1
diff --git a/clang/test/Frontend/fixed_point_compound.c b/clang/test/Frontend/fixed_point_compound.c
index a1d6ff19912b..b507cbf1f6db 100644
--- a/clang/test/Frontend/fixed_point_compound.c
+++ b/clang/test/Frontend/fixed_point_compound.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 short _Fract shf;
 _Accum a;
diff --git a/clang/test/Frontend/fixed_point_conversions.c b/clang/test/Frontend/fixed_point_conversions.c
index efa3f1b34724..1d053068232b 100644
--- a/clang/test/Frontend/fixed_point_conversions.c
+++ b/clang/test/Frontend/fixed_point_conversions.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -x c -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -x c -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - -fpadding-on-unsigned-fixed-point | FileCheck %s --check-prefixes=CHECK,UNSIGNED
-// RUN: %clang_cc1 -x c++ -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -x c++ -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - -fpadding-on-unsigned-fixed-point | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -x c -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -x c -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - -fpadding-on-unsigned-fixed-point | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -x c++ -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -x c++ -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - -fpadding-on-unsigned-fixed-point | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/clang/test/Frontend/fixed_point_conversions_const.c b/clang/test/Frontend/fixed_point_conversions_const.c
index 30aefbd564f3..e6e89ded534f 100644
--- a/clang/test/Frontend/fixed_point_conversions_const.c
+++ b/clang/test/Frontend/fixed_point_conversions_const.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - -fpadding-on-unsigned-fixed-point | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - -fpadding-on-unsigned-fixed-point | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 // Between different fixed point types
 short _Accum sa_const = 2.5hk;
diff --git a/clang/test/Frontend/fixed_point_conversions_half.c b/clang/test/Frontend/fixed_point_conversions_half.c
index 4094b6af5dee..38b99123b867 100644
--- a/clang/test/Frontend/fixed_point_conversions_half.c
+++ b/clang/test/Frontend/fixed_point_conversions_half.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -ffixed-point -triple arm64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -ffixed-point -triple arm64-unknown-linux-gnu -S -emit-llvm %s -o - -fpadding-on-unsigned-fixed-point | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -ffixed-point -triple arm64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple arm64-unknown-linux-gnu -emit-llvm %s -o - -fpadding-on-unsigned-fixed-point | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 short _Fract sf;
 long _Fract lf;
diff --git a/clang/test/Frontend/fixed_point_div.c b/clang/test/Frontend/fixed_point_div.c
index cf11b75233c8..1bb7e2f8db1f 100644
--- a/clang/test/Frontend/fixed_point_div.c
+++ b/clang/test/Frontend/fixed_point_div.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 short _Accum sa;
 _Accum a, a2, a3, a4;
diff --git a/clang/test/Frontend/fixed_point_div_const.c b/clang/test/Frontend/fixed_point_div_const.c
index 0f89605e7939..46935207d186 100644
--- a/clang/test/Frontend/fixed_point_div_const.c
+++ b/clang/test/Frontend/fixed_point_div_const.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 // Division between different fixed point types
 short _Accum sa_const = 1.0hk / 2.0hk;
diff --git a/clang/test/Frontend/fixed_point_mul.c b/clang/test/Frontend/fixed_point_mul.c
index fc6584871041..e9e802fdf724 100644
--- a/clang/test/Frontend/fixed_point_mul.c
+++ b/clang/test/Frontend/fixed_point_mul.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 short _Accum sa;
 _Accum a, a2, a3, a4;
diff --git a/clang/test/Frontend/fixed_point_mul_const.c b/clang/test/Frontend/fixed_point_mul_const.c
index c5c863692897..f11366abaedb 100644
--- a/clang/test/Frontend/fixed_point_mul_const.c
+++ b/clang/test/Frontend/fixed_point_mul_const.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 // Multiplication between different fixed point types
 short _Accum sa_const = 2.0hk * 2.0hk;
diff --git a/clang/test/Frontend/fixed_point_same_fbits.c b/clang/test/Frontend/fixed_point_same_fbits.c
index 27762e5ca12a..e6bf5f698907 100644
--- a/clang/test/Frontend/fixed_point_same_fbits.c
+++ b/clang/test/Frontend/fixed_point_same_fbits.c
@@ -1,5 +1,5 @@
 // RUN: %clang -ffixed-point -S -emit-llvm -o - %s | FileCheck %s -check-prefix=DEFAULT
-// RUN: %clang_cc1 -ffixed-point -fpadding-on-unsigned-fixed-point -S -emit-llvm -o - %s | FileCheck %s -check-prefix=SAME
+// RUN: %clang_cc1 -ffixed-point -fpadding-on-unsigned-fixed-point -emit-llvm -o - %s | FileCheck %s -check-prefix=SAME
 
 /* The scale for unsigned fixed point types should be the same as that of signed
  * fixed point types when -fsame-fbits is enabled. */
diff --git a/clang/test/Frontend/fixed_point_shift.c b/clang/test/Frontend/fixed_point_shift.c
index cbfc9181c762..e870f98b4bed 100644
--- a/clang/test/Frontend/fixed_point_shift.c
+++ b/clang/test/Frontend/fixed_point_shift.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 short _Accum sa;
 _Accum a;
diff --git a/clang/test/Frontend/fixed_point_shift_const.c b/clang/test/Frontend/fixed_point_shift_const.c
index 10860efd188b..ea85d6f2319a 100644
--- a/clang/test/Frontend/fixed_point_shift_const.c
+++ b/clang/test/Frontend/fixed_point_shift_const.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 short _Accum sa_const1 = 1.0hk << 2;
 // CHECK-DAG: @sa_const1 = {{.*}}global i16 512
diff --git a/clang/test/Frontend/fixed_point_sub.c b/clang/test/Frontend/fixed_point_sub.c
index ecadeccd8bb5..fb330afe1003 100644
--- a/clang/test/Frontend/fixed_point_sub.c
+++ b/clang/test/Frontend/fixed_point_sub.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 short _Accum sa;
 _Accum a, a2, a3, a4;
diff --git a/clang/test/Frontend/fixed_point_sub_const.c b/clang/test/Frontend/fixed_point_sub_const.c
index dc6ad92ec798..c98cfd185205 100644
--- a/clang/test/Frontend/fixed_point_sub_const.c
+++ b/clang/test/Frontend/fixed_point_sub_const.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -ffixed-point -triple x86_64-unknown-linux-gnu -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 // Subtraction between different fixed point types
 short _Accum sa_const = 1.0hk - 2.0hk;
diff --git a/clang/test/Frontend/fixed_point_to_bool.c b/clang/test/Frontend/fixed_point_to_bool.c
index 4b9ed6cebe32..b80ced68e976 100644
--- a/clang/test/Frontend/fixed_point_to_bool.c
+++ b/clang/test/Frontend/fixed_point_to_bool.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -ffixed-point -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -ffixed-point -S -emit-llvm %s -o - -fpadding-on-unsigned-fixed-point | FileCheck %s
+// RUN: %clang_cc1 -ffixed-point -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -ffixed-point -emit-llvm %s -o - -fpadding-on-unsigned-fixed-point | FileCheck %s
 
 _Bool global_b = 1.0k;  // @global_b = {{*.}}global i8 1, align 1
 _Bool global_b2 = 0.0k; // @global_b2 = {{*.}}global i8 0, align 1
diff --git a/clang/test/Frontend/fixed_point_unary.c b/clang/test/Frontend/fixed_point_unary.c
index a398a4be203d..e84b763f3250 100644
--- a/clang/test/Frontend/fixed_point_unary.c
+++ b/clang/test/Frontend/fixed_point_unary.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -ffixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -ffixed-point -fpadding-on-unsigned-fixed-point -S -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -ffixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,SIGNED
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -ffixed-point -fpadding-on-unsigned-fixed-point -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,UNSIGNED
 
 _Accum a;
 _Fract f;
diff --git a/clang/test/Frontend/gnu-inline.c b/clang/test/Frontend/gnu-inline.c
index ef9966931372..46472878ed66 100644
--- a/clang/test/Frontend/gnu-inline.c
+++ b/clang/test/Frontend/gnu-inline.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -fgnuc-version=4.2.1 -std=c89 -fsyntax-only -x c -E -dM %s | FileCheck --check-prefix=GNU-INLINE %s
-// RUN: %clang_cc1 -fgnuc-version=4.2.1 -std=c99 -fsyntax-only -x c -E -dM %s | FileCheck --check-prefix=STDC-INLINE %s
-// RUN: %clang_cc1 -fgnuc-version=4.2.1 -std=c99 -fgnu89-inline -fsyntax-only -x c -E -dM %s | FileCheck --check-prefix=GNU-INLINE %s
-// RUN: %clang_cc1 -fgnuc-version=4.2.1 -fsyntax-only -x c++ -E -dM %s | FileCheck --check-prefix=GNU-INLINE %s
+// RUN: %clang_cc1 -fgnuc-version=4.2.1 -std=c89 -x c -E -dM %s | FileCheck --check-prefix=GNU-INLINE %s
+// RUN: %clang_cc1 -fgnuc-version=4.2.1 -std=c99 -x c -E -dM %s | FileCheck --check-prefix=STDC-INLINE %s
+// RUN: %clang_cc1 -fgnuc-version=4.2.1 -std=c99 -fgnu89-inline -x c -E -dM %s | FileCheck --check-prefix=GNU-INLINE %s
+// RUN: %clang_cc1 -fgnuc-version=4.2.1 -x c++ -E -dM %s | FileCheck --check-prefix=GNU-INLINE %s
 // RUN: not %clang_cc1 -fgnu89-inline -fgnuc-version=4.2.1 -fsyntax-only -x c++ %s 2>&1 | FileCheck --check-prefix=CXX %s
 // RUN: not %clang_cc1 -fgnu89-inline -fgnuc-version=4.2.1 -fsyntax-only -x objective-c++ %s 2>&1 | FileCheck --check-prefix=OBJCXX %s
 
diff --git a/clang/test/Frontend/hexagon-target-basic.c b/clang/test/Frontend/hexagon-target-basic.c
index 5f95fa2df5a0..25fb19c9b2fb 100644
--- a/clang/test/Frontend/hexagon-target-basic.c
+++ b/clang/test/Frontend/hexagon-target-basic.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm -triple hexagon-unknown-unknown %s -S -o /dev/null
+// RUN: %clang_cc1 -emit-llvm -triple hexagon-unknown-unknown %s -o /dev/null
 // REQUIRES: hexagon-registered-target
 
 // Testcase for bug 14744.  Empty file is sufficient, since the problem
diff --git a/clang/test/Frontend/invalid-cxx-abi.cpp b/clang/test/Frontend/invalid-cxx-abi.cpp
index 923d9d205739..031ccc20ad0a 100644
--- a/clang/test/Frontend/invalid-cxx-abi.cpp
+++ b/clang/test/Frontend/invalid-cxx-abi.cpp
@@ -1,9 +1,9 @@
 // These shouldn't be valid -fc++-abi values.
-// RUN: not %clang_cc1 -S -emit-llvm -o /dev/null -fc++-abi=InvalidABI %s 2>&1 | FileCheck %s -check-prefix=INVALID
-// RUN: not %clang_cc1 -S -emit-llvm -o /dev/null -fc++-abi=Fuchsia %s 2>&1 | FileCheck %s -check-prefix=CASE-SENSITIVE
+// RUN: not %clang_cc1 -emit-llvm -o /dev/null -fc++-abi=InvalidABI %s 2>&1 | FileCheck %s -check-prefix=INVALID
+// RUN: not %clang_cc1 -emit-llvm -o /dev/null -fc++-abi=Fuchsia %s 2>&1 | FileCheck %s -check-prefix=CASE-SENSITIVE
 // INVALID: error: invalid C++ ABI name 'InvalidABI'
 // CASE-SENSITIVE: error: invalid C++ ABI name 'Fuchsia'
 
 // Some C++ ABIs are not supported on some platforms.
-// RUN: not %clang_cc1 -S -emit-llvm -o /dev/null -fc++-abi=fuchsia -triple i386 %s 2>&1 | FileCheck %s -check-prefix=UNSUPPORTED-FUCHSIA
+// RUN: not %clang_cc1 -emit-llvm -o /dev/null -fc++-abi=fuchsia -triple i386 %s 2>&1 | FileCheck %s -check-prefix=UNSUPPORTED-FUCHSIA
 // UNSUPPORTED-FUCHSIA: error: C++ ABI 'fuchsia' is not supported on target triple 'i386'
diff --git a/clang/test/Frontend/llvmplugins.c b/clang/test/Frontend/llvmplugins.c
index 182029814beb..2a0a4426acdb 100644
--- a/clang/test/Frontend/llvmplugins.c
+++ b/clang/test/Frontend/llvmplugins.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -load %llvmshlibdir/LLVMPrintFunctionNames%pluginext -S -o /dev/null -emit-llvm %s 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -load %llvmshlibdir/LLVMPrintFunctionNames%pluginext -o /dev/null -emit-llvm %s 2>&1 | FileCheck %s
 // REQUIRES: plugins, examples
 
 // CHECK: [PrintPass] Found function: x 
diff --git a/clang/test/Frontend/multiple-actions.c b/clang/test/Frontend/multiple-actions.c
new file mode 100644
index 000000000000..284e8f0467bf
--- /dev/null
+++ b/clang/test/Frontend/multiple-actions.c
@@ -0,0 +1,7 @@
+// RUN: not %clang_cc1 -S -emit-llvm -main-file-name %s 2>&1 | FileCheck %s --check-prefix=ERR1 --implicit-check-not=error:
+// ERR1: error: '-S' action ignored; '-emit-llvm' action specified previously
+
+// RUN: not %clang_cc1 -main-file-name %s -emit-llvm-only -emit-llvm -S 2>&1 | FileCheck %s --check-prefix=ERR2 --implicit-check-not=error:
+// ERR2: error: '-emit-llvm-only' action ignored; '-S' action specified previously
+
+// RUN: %clang_cc1 -S -main-file-name %s -emit-llvm -o /dev/null
diff --git a/clang/test/Frontend/objc-bool-is-bool.m b/clang/test/Frontend/objc-bool-is-bool.m
index b7051f37ee2b..a457f29e1c47 100644
--- a/clang/test/Frontend/objc-bool-is-bool.m
+++ b/clang/test/Frontend/objc-bool-is-bool.m
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -fsyntax-only -E -dM -triple=armv7k-apple-watchos %s | FileCheck --check-prefix=BOOL %s
-// RUN: %clang_cc1 -fsyntax-only -E -dM -triple=x86_64-apple-darwin16 %s | FileCheck --check-prefix=CHAR %s
-// RUN: %clang_cc1 -x c -fsyntax-only -E -dM -triple=x86_64-apple-darwin16 %s | FileCheck --check-prefix=CHAR %s
-// RUN: %clang_cc1 -x objective-c++ -fsyntax-only -E -dM -triple=x86_64-apple-darwin16 %s | FileCheck --check-prefix=CHAR %s
-// RUN: %clang_cc1 -x c++ -fsyntax-only -E -dM -triple=x86_64-apple-darwin16 %s | FileCheck --check-prefix=CHAR %s
+// RUN: %clang_cc1 -E -dM -triple=armv7k-apple-watchos %s | FileCheck --check-prefix=BOOL %s
+// RUN: %clang_cc1 -E -dM -triple=x86_64-apple-darwin16 %s | FileCheck --check-prefix=CHAR %s
+// RUN: %clang_cc1 -x c -E -dM -triple=x86_64-apple-darwin16 %s | FileCheck --check-prefix=CHAR %s
+// RUN: %clang_cc1 -x objective-c++ -E -dM -triple=x86_64-apple-darwin16 %s | FileCheck --check-prefix=CHAR %s
+// RUN: %clang_cc1 -x c++ -E -dM -triple=x86_64-apple-darwin16 %s | FileCheck --check-prefix=CHAR %s
 
 // BOOL: #define __OBJC_BOOL_IS_BOOL 1
 // BOOL-NOT: #define __OBJC_BOOL_IS_BOOL 0
diff --git a/clang/test/Frontend/output-paths.c b/clang/test/Frontend/output-paths.c
index 836fe971de5e..22d46f67b08b 100644
--- a/clang/test/Frontend/output-paths.c
+++ b/clang/test/Frontend/output-paths.c
@@ -6,7 +6,7 @@
 // Check that -working-directory is respected when diagnosing output failures.
 //
 // RUN: rm -rf %t.d && mkdir -p %t.d/%basename_t-inner.d
-// RUN: %clang_cc1 -emit-llvm -working-directory %t.d -E -o %basename_t-inner.d/somename %s -verify
+// RUN: %clang_cc1 -working-directory %t.d -E -o %basename_t-inner.d/somename %s -verify
 // expected-no-diagnostics
 
 // RUN: %clang_cc1 -working-directory %t.d -E %s -o - | FileCheck %s
diff --git a/clang/test/Frontend/rewrite-includes-macros.cpp b/clang/test/Frontend/rewrite-includes-macros.cpp
index 1c2bfd440342..f374708a76d3 100644
--- a/clang/test/Frontend/rewrite-includes-macros.cpp
+++ b/clang/test/Frontend/rewrite-includes-macros.cpp
@@ -1,15 +1,15 @@
-// RUN: %clang_cl /E -Xclang -frewrite-includes -- %s | %clang_cl /c -Xclang -verify /Tp -
-// expected-no-diagnostics
-
-// This test uses dos-style \r\n line endings.
-// Make sure your editor doesn't rewrite them to unix-style \n line endings.
-int foo();
-int bar();
-#define HELLO \
-  foo(); \
-  bar();
-
-int main() {
-  HELLO
-  return 0;
-}
+// RUN: %clang_cl /E -Xclang -frewrite-includes -- %s | %clang_cl /c -Xclang -verify /Tp -
+// expected-no-diagnostics
+
+// This test uses dos-style \r\n line endings.
+// Make sure your editor doesn't rewrite them to unix-style \n line endings.
+int foo();
+int bar();
+#define HELLO \
+  foo(); \
+  bar();
+
+int main() {
+  HELLO
+  return 0;
+}
diff --git a/clang/test/Frontend/valid-cxx-abi.cpp b/clang/test/Frontend/valid-cxx-abi.cpp
index 99f8dc869e0d..e54dee79cad8 100644
--- a/clang/test/Frontend/valid-cxx-abi.cpp
+++ b/clang/test/Frontend/valid-cxx-abi.cpp
@@ -1,4 +1,4 @@
 // These should be valid cxx abi flags for these targets.
-// RUN: %clang_cc1 -fc++-abi=itanium -triple x86_64-unknown-linux-gnu %s -S -emit-llvm -o /dev/null
-// RUN: %clang_cc1 -fc++-abi=fuchsia -triple x86_64-unknown-fuchsia %s -S -emit-llvm -o /dev/null
-// RUN: %clang_cc1 -fc++-abi=microsoft -triple x86_64-windows-msvc %s -S -emit-llvm -o /dev/null
+// RUN: %clang_cc1 -fc++-abi=itanium -triple x86_64-unknown-linux-gnu %s -emit-llvm -o /dev/null
+// RUN: %clang_cc1 -fc++-abi=fuchsia -triple x86_64-unknown-fuchsia %s -emit-llvm -o /dev/null
+// RUN: %clang_cc1 -fc++-abi=microsoft -triple x86_64-windows-msvc %s -emit-llvm -o /dev/null
diff --git a/clang/test/Headers/arm-acle-header.c b/clang/test/Headers/arm-acle-header.c
index f04c7e1f0f35..fea8472183c8 100644
--- a/clang/test/Headers/arm-acle-header.c
+++ b/clang/test/Headers/arm-acle-header.c
@@ -7,6 +7,7 @@
 // RUN: %clang_cc1 -x c++ -triple thumbv7-windows -target-cpu cortex-a15 -fsyntax-only -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=19.11 %s
 // RUN: %clang_cc1 -x c++ -triple aarch64-windows -target-cpu cortex-a53 -fsyntax-only -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=19.11 %s
 // RUN: %clang_cc1 -x c++ -triple arm64-apple-ios -target-cpu apple-a7 -fsyntax-only -ffreestanding -fms-extensions %s
+// RUN: %clang_cc1 -x c++ -triple arm64ec-windows -target-cpu cortex-a53 -fsyntax-only -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=19.11 %s
 // expected-no-diagnostics
 
 #include <arm_acle.h>
diff --git a/clang/test/Headers/ms-arm64-intrin.cpp b/clang/test/Headers/ms-arm64-intrin.cpp
index 3c37b8ad2f4f..4be9576539cd 100644
--- a/clang/test/Headers/ms-arm64-intrin.cpp
+++ b/clang/test/Headers/ms-arm64-intrin.cpp
@@ -2,8 +2,8 @@
 
 // RUN: %clang_cc1 -triple arm64-windows -O1 \
 // RUN: -fms-compatibility -fms-compatibility-version=17.00 \
-// RUN: -fsyntax-only -Werror \
-// RUN: -isystem %S/Inputs/include %s -S -o - -emit-llvm 2>&1 \
+// RUN: -Werror \
+// RUN: -isystem %S/Inputs/include %s -o - -emit-llvm 2>&1 \
 // RUN: | FileCheck %s
 
 #include <intrin.h>
diff --git a/clang/test/InstallAPI/Inputs/Foundation/Foundation.framework/Modules/module.modulemap b/clang/test/InstallAPI/Inputs/Foundation/Foundation.framework/Modules/module.modulemap
new file mode 100644
index 000000000000..2bb688da1fa4
--- /dev/null
+++ b/clang/test/InstallAPI/Inputs/Foundation/Foundation.framework/Modules/module.modulemap
@@ -0,0 +1,3 @@
+framework module Foundation [system] {
+    umbrella header "Foundation.h"
+}
diff --git a/clang/test/InstallAPI/Inputs/LibFoo/usr/include/foo.h b/clang/test/InstallAPI/Inputs/LibFoo/usr/include/foo.h
new file mode 100644
index 000000000000..e131da67ab7e
--- /dev/null
+++ b/clang/test/InstallAPI/Inputs/LibFoo/usr/include/foo.h
@@ -0,0 +1,15 @@
+#ifndef FOO_H
+#define FOO_H 
+#include <macro_defs.h> 
+
+#if defined(Foo) 
+  #define FOO "FooLib$" 
+#else 
+  #define FOO 
+#endif 
+
+#define __STRING(x)     #x
+#define PLATFORM_ALIAS(sym)	__asm("_" FOO __STRING(sym) DARWIN LINUX)
+extern int foo() PLATFORM_ALIAS(foo);
+
+#endif 
diff --git a/clang/test/InstallAPI/Inputs/LibFoo/usr/include/macro_defs.h b/clang/test/InstallAPI/Inputs/LibFoo/usr/include/macro_defs.h
new file mode 100644
index 000000000000..25566909da06
--- /dev/null
+++ b/clang/test/InstallAPI/Inputs/LibFoo/usr/include/macro_defs.h
@@ -0,0 +1,15 @@
+#ifndef MACRO_DEFS_H
+#define MACRO_DEFS_H 
+
+#if defined(NONDarwin) 
+  #define LINUX "$linux"
+  #define DARWIN 
+#elif defined(Darwin) 
+  #define LINUX 
+  #define DARWIN "$darwin" 
+#else 
+  #define LINUX 
+  #define DARWIN 
+#endif 
+
+#endif // MACRO_DEFS_H
diff --git a/clang/test/InstallAPI/Inputs/LibFoo/usr/include/public.h b/clang/test/InstallAPI/Inputs/LibFoo/usr/include/public.h
new file mode 100644
index 000000000000..fcc6d91adb77
--- /dev/null
+++ b/clang/test/InstallAPI/Inputs/LibFoo/usr/include/public.h
@@ -0,0 +1,9 @@
+#ifndef PUBLIC_H
+#define PUBLIC_H 
+#include <macro_defs.h>
+
+#define __STRING(x)     #x
+#define PLATFORM_ALIAS(sym)	__asm("_" __STRING(sym) DARWIN LINUX)
+extern int foo() PLATFORM_ALIAS(foo);
+
+#endif 
diff --git a/clang/test/InstallAPI/Inputs/Zippered/Zippered.framework/Headers/Zippered.h b/clang/test/InstallAPI/Inputs/Zippered/Zippered.framework/Headers/Zippered.h
new file mode 100644
index 000000000000..ec1b03318be1
--- /dev/null
+++ b/clang/test/InstallAPI/Inputs/Zippered/Zippered.framework/Headers/Zippered.h
@@ -0,0 +1,20 @@
+#if !__is_target_environment(macabi)
+typedef int MyType;
+#else
+typedef float MyType;
+#endif
+
+extern MyType invalidAPI();
+
+#define OS_AVAILABLE(_target, _availability)                                   \
+  __attribute__((availability(_target, _availability)))
+extern int macOSAPI() OS_AVAILABLE(macos, introduced=10.14) OS_AVAILABLE(ios, unavailable);
+extern int iOSAPI() OS_AVAILABLE(ios, introduced=12.0) OS_AVAILABLE(macos, unavailable);
+extern int commonAPI() OS_AVAILABLE(macos, introduced=10.14) OS_AVAILABLE(ios, introduced=12.0);
+
+extern int obsoletedMacOSAPI() OS_AVAILABLE(macos, obsoleted=10.14) OS_AVAILABLE(ios, unavailable);
+
+#if !__is_target_environment(macabi)
+extern int macOSAPI2() OS_AVAILABLE(macos, introduced = 10.14)
+    OS_AVAILABLE(ios, unavailable);
+#endif
diff --git a/clang/test/InstallAPI/Inputs/Zippered/Zippered.framework/PrivateHeaders/Zippered_Private.h b/clang/test/InstallAPI/Inputs/Zippered/Zippered.framework/PrivateHeaders/Zippered_Private.h
new file mode 100644
index 000000000000..2182a17275cb
--- /dev/null
+++ b/clang/test/InstallAPI/Inputs/Zippered/Zippered.framework/PrivateHeaders/Zippered_Private.h
@@ -0,0 +1,9 @@
+#if __is_target_environment(macabi)
+extern int a;
+@class UIImage;
+UIImage *image;
+#else
+extern long a;
+@class NSImage;
+NSImage *image;
+#endif
diff --git a/clang/test/InstallAPI/Inputs/Zippered/Zippered.tbd b/clang/test/InstallAPI/Inputs/Zippered/Zippered.tbd
new file mode 100644
index 000000000000..6ceb589ff0cd
--- /dev/null
+++ b/clang/test/InstallAPI/Inputs/Zippered/Zippered.tbd
@@ -0,0 +1,47 @@
+{
+  "main_library": {
+    "exported_symbols": [
+      {
+        "data": {
+          "global": [
+            "_image", "_a"
+          ]
+        },
+        "text": {
+          "global": [
+            "_invalidAPI", "_commonAPI"
+          ]
+        }
+      },
+      {
+        "targets": [ "x86_64-maccatalyst" ],
+        "text": {
+          "global": [ "_iOSAPI"]
+        }
+      },
+      {
+        "targets": [ "x86_64-macos" ],
+        "text": {
+          "global": [ "_macOSAPI", "_macOSAPI2" ]
+        }
+      }
+    ],
+    "flags": [
+      {
+        "attributes": ["not_app_extension_safe"]
+      }
+    ],
+    "install_names": [
+      {"name": "/System/Library/Frameworks/Zippered.framework/Versions/A/Zippered"}
+    ],
+    "target_info": [
+      {
+        "min_deployment": "13", "target": "x86_64-macos"
+      },
+      {
+        "min_deployment": "16", "target": "x86_64-maccatalyst"
+      }
+    ]
+  },
+  "tapi_tbd_version": 5
+}
diff --git a/clang/test/InstallAPI/Inputs/Zippered/Zippered.yaml b/clang/test/InstallAPI/Inputs/Zippered/Zippered.yaml
new file mode 100644
index 000000000000..284cd46416fd
--- /dev/null
+++ b/clang/test/InstallAPI/Inputs/Zippered/Zippered.yaml
@@ -0,0 +1,383 @@
+# Generated from: 
+# xcrun -sdk macosx clang --target=x86_64-apple-macos13 --target-variant=x86_64-apple-ios16-macabi
+# -dynamiclib 
+#
+#import "Zippered.h"
+#import "Zippered_Private.h"
+# MyType invalidAPI() { return 0; }
+# int macOSAPI() { return 0; }
+# int macOSAPI2() { return 0; }
+# int iOSAPI() { return 0; }
+# int commonAPI() { return 0; }
+# int obsoletedMacOSAPI() { return 0; }
+# 
+# #if __is_target_environment(macabi)
+# int a = 0;
+# UIImage *image = 0;
+# #else
+# long a = 0;
+# NSImage *image = 0;
+# #endif
+
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x1000007
+  cpusubtype:      0x3
+  filetype:        0x6
+  ncmds:           15
+  sizeofcmds:      1584
+  flags:           0x100085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         312
+    segname:         __TEXT
+    vmaddr:          0
+    vmsize:          12288
+    fileoff:         0
+    filesize:        12288
+    maxprot:         5
+    initprot:        5
+    nsects:          3
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x1090
+        size:            88
+        offset:          0x1090
+        align:           4
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         554889E531C05DC30F1F840000000000554889E531C05DC30F1F840000000000554889E531C05DC30F1F840000000000554889E531C05DC30F1F840000000000554889E531C05DC30F1F840000000000554889E531C05DC3
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0x10E8
+        size:            4152
+        offset:          0x10E8
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         010000001C000000010000002000000000000000200000000200000000000001901000003800000038000000E81000000000000038000000030000000C0001001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+      - sectname:        __eh_frame
+        segname:         __TEXT
+        addr:            0x2120
+        size:            24
+        offset:          0x2120
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x6000000B
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         1400000000000000017A520001781001100C070890010000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __DATA_CONST
+    vmaddr:          12288
+    vmsize:          4096
+    fileoff:         12288
+    filesize:        4096
+    maxprot:         3
+    initprot:        3
+    nsects:          1
+    flags:           16
+    Sections:
+      - sectname:        __objc_imageinfo
+        segname:         __DATA_CONST
+        addr:            0x3000
+        size:            8
+        offset:          0x3000
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '0000000040000000'
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __DATA
+    vmaddr:          16384
+    vmsize:          4096
+    fileoff:         16384
+    filesize:        0
+    maxprot:         3
+    initprot:        3
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __common
+        segname:         __DATA
+        addr:            0x4000
+        size:            16
+        offset:          0x0
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x1
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          20480
+    vmsize:          384
+    fileoff:         16384
+    filesize:        384
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_DYLD_INFO_ONLY
+    cmdsize:         48
+    rebase_off:      0
+    rebase_size:     0
+    bind_off:        0
+    bind_size:       0
+    weak_bind_off:   0
+    weak_bind_size:  0
+    lazy_bind_off:   0
+    lazy_bind_size:  0
+    export_off:      16384
+    export_size:     128
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          16520
+    nsyms:           9
+    stroff:          16664
+    strsize:         104
+  - cmd:             LC_DYSYMTAB
+    cmdsize:         80
+    ilocalsym:       0
+    nlocalsym:       0
+    iextdefsym:      0
+    nextdefsym:      8
+    iundefsym:       8
+    nundefsym:       1
+    tocoff:          0
+    ntoc:            0
+    modtaboff:       0
+    nmodtab:         0
+    extrefsymoff:    0
+    nextrefsyms:     0
+    indirectsymoff:  0
+    nindirectsyms:   0
+    extreloff:       0
+    nextrel:         0
+    locreloff:       0
+    nlocrel:         0
+  - cmd:             LC_ID_DYLIB
+    cmdsize:         96
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 65536
+      compatibility_version: 65536
+    Content:         '/System/Library/Frameworks/Zippered.framework/Versions/A/Zippered'
+    ZeroPadBytes:    7
+  - cmd:             LC_UUID
+    cmdsize:         24
+    uuid:            4C4C44B0-5555-3144-A126-166C8AB77CD1
+  - cmd:             LC_BUILD_VERSION
+    cmdsize:         32
+    platform:        1
+    minos:           851968
+    sdk:             983040
+    ntools:          1
+    Tools:
+      - tool:            4
+        version:         1245184
+  - cmd:             LC_BUILD_VERSION
+    cmdsize:         32
+    platform:        6
+    minos:           1048576
+    sdk:             1048576
+    ntools:          1
+    Tools:
+      - tool:            4
+        version:         1245184
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 14942208
+      compatibility_version: 65536
+    Content:         '/usr/lib/libobjc.A.dylib'
+    ZeroPadBytes:    8
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 88539136
+      compatibility_version: 65536
+    Content:         '/usr/lib/libSystem.B.dylib'
+    ZeroPadBytes:    6
+  - cmd:             LC_FUNCTION_STARTS
+    cmdsize:         16
+    dataoff:         16512
+    datasize:        8
+  - cmd:             LC_DATA_IN_CODE
+    cmdsize:         16
+    dataoff:         16520
+    datasize:        0
+LinkEditData:
+  ExportTrie:
+    TerminalSize:    0
+    NodeOffset:      0
+    Name:            ''
+    Flags:           0x0
+    Address:         0x0
+    Other:           0x0
+    ImportName:      ''
+    Children:
+      - TerminalSize:    0
+        NodeOffset:      5
+        Name:            _
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    4
+            NodeOffset:      53
+            Name:            a
+            Flags:           0x0
+            Address:         0x4000
+            Other:           0x0
+            ImportName:      ''
+          - TerminalSize:    0
+            NodeOffset:      59
+            Name:            i
+            Flags:           0x0
+            Address:         0x0
+            Other:           0x0
+            ImportName:      ''
+            Children:
+              - TerminalSize:    3
+                NodeOffset:      85
+                Name:            OSAPI
+                Flags:           0x0
+                Address:         0x10C0
+                Other:           0x0
+                ImportName:      ''
+              - TerminalSize:    4
+                NodeOffset:      90
+                Name:            mage
+                Flags:           0x0
+                Address:         0x4008
+                Other:           0x0
+                ImportName:      ''
+              - TerminalSize:    3
+                NodeOffset:      96
+                Name:            nvalidAPI
+                Flags:           0x0
+                Address:         0x1090
+                Other:           0x0
+                ImportName:      ''
+          - TerminalSize:    3
+            NodeOffset:      101
+            Name:            obsoletedMacOSAPI
+            Flags:           0x0
+            Address:         0x10E0
+            Other:           0x0
+            ImportName:      ''
+          - TerminalSize:    3
+            NodeOffset:      106
+            Name:            macOSAPI
+            Flags:           0x0
+            Address:         0x10A0
+            Other:           0x0
+            ImportName:      ''
+            Children:
+              - TerminalSize:    3
+                NodeOffset:      114
+                Name:            '2'
+                Flags:           0x0
+                Address:         0x10B0
+                Other:           0x0
+                ImportName:      ''
+          - TerminalSize:    3
+            NodeOffset:      119
+            Name:            commonAPI
+            Flags:           0x0
+            Address:         0x10D0
+            Other:           0x0
+            ImportName:      ''
+  NameList:
+    - n_strx:          2
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4240
+    - n_strx:          14
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4256
+    - n_strx:          24
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4272
+    - n_strx:          35
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4288
+    - n_strx:          43
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4304
+    - n_strx:          54
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4320
+    - n_strx:          73
+      n_type:          0xF
+      n_sect:          5
+      n_desc:          0
+      n_value:         16384
+    - n_strx:          76
+      n_type:          0xF
+      n_sect:          5
+      n_desc:          0
+      n_value:         16392
+    - n_strx:          83
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          512
+      n_value:         0
+  StringTable:
+    - ' '
+    - _invalidAPI
+    - _macOSAPI
+    - _macOSAPI2
+    - _iOSAPI
+    - _commonAPI
+    - _obsoletedMacOSAPI
+    - _a
+    - _image
+    - dyld_stub_binder
+    - ''
+    - ''
+    - ''
+    - ''
+  FunctionStarts:  [ 0x1090, 0x10A0, 0x10B0, 0x10C0, 0x10D0, 0x10E0 ]
+...
diff --git a/clang/test/InstallAPI/exclusive-passes-2.test b/clang/test/InstallAPI/exclusive-passes-2.test
new file mode 100644
index 000000000000..3e7a6d777d5a
--- /dev/null
+++ b/clang/test/InstallAPI/exclusive-passes-2.test
@@ -0,0 +1,58 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+
+// All passes should include Foo macro definition.
+; RUN: clang-installapi -target arm64-apple-macos12 \
+; RUN: -install_name @rpath/libfoo.dylib -current_version 1 \
+; RUN: -compatibility_version 1 \
+; RUN: -I%S/Inputs/LibFoo/usr/include -dynamiclib \
+; RUN: -extra-public-header %S/Inputs/LibFoo/usr/include/foo.h \
+; RUN: -o %t/output.tbd \
+; RUN: -DFoo -XApple -DDarwin=1 -XElf -DNONDarwin=1 2>&1 | FileCheck -allow-empty %s 
+; RUN: llvm-readtapi --compare %t/output.tbd %t/expected.tbd 2>&1 | FileCheck -allow-empty %s
+
+; CHECK-NOT: error
+; CHECK-NOT: warning
+
+
+//--- expected.tbd
+{
+  "main_library": {
+    "exported_symbols": [
+      {
+        "text": {
+          "global": [
+            "_FooLib$foo$darwin",
+            "_FooLib$foo$linux",
+            "_FooLib$foo"
+          ]
+        }
+      }
+    ],
+    "flags": [
+      {
+        "attributes": [
+          "not_app_extension_safe"
+        ]
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/libfoo.dylib"
+      }
+    ],
+    "target_info": [
+      {
+        "min_deployment": "12",
+        "target": "arm64-macos"
+      }
+    ]
+  },
+  "tapi_tbd_version": 5
+}
+
+//--- options.json
+{
+  "Apple" : ["-DDarwin=1"],
+  "Elf" : ["-DNONDarwin"]
+}
diff --git a/clang/test/InstallAPI/exclusive-passes-platform.test b/clang/test/InstallAPI/exclusive-passes-platform.test
new file mode 100644
index 000000000000..c5a79cf9a30d
--- /dev/null
+++ b/clang/test/InstallAPI/exclusive-passes-platform.test
@@ -0,0 +1,286 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+; RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+; RUN: yaml2obj %t/Xplatform.yaml -o=%t/Xplatform
+
+// Check that in zippered mode, a successful pass runs in both macos & maccatalyst.
+; RUN: clang-installapi %t/inputs.json \
+; RUN: --target=x86_64-apple-macos10.15 -darwin-target-variant x86_64-apple-ios13.1-macabi \
+; RUN: -Xplatform_ios-macabi -iframework %t/System/iOSSupport/System/Library/Frameworks \
+; RUN: -install_name /System/Library/Frameworks/Xplatform.framework/Versions/A/Xplatform \
+; RUN: -fdefine-target-os-macros --verify-against=%t/Xplatform --verify-mode=Pedantic \
+; RUN: -o Xplatform.tbd  -F%t/Frameworks \
+: RUN: -current_version 1 -compatibility_version 1 2>&1 | FileCheck --allow-empty %s\
+; RUN: --implicit-check-not warning: --implicit-check-not error:
+
+// A missing header error should be invoked in macos pass because it wasn't given the needed search path.
+; RUN: mv %t/Xplatform-macosx.h %t/Frameworks/Xplatform.framework/Headers/Xplatform.h
+
+; RUN: not clang-installapi %t/inputs.json \
+; RUN: --target=x86_64-apple-macos10.15 -darwin-target-variant x86_64-apple-ios13.1-macabi \
+; RUN: -Xplatform_ios-macabi -iframework %t/System/iOSSupport/System/Library/Frameworks \
+; RUN: -install_name /System/Library/Frameworks/Xplatform.framework/Versions/A/Xplatform \
+; RUN: -fdefine-target-os-macros --verify-against=%t/Xplatform --verify-mode=Pedantic \
+; RUN: -o Xplatform.tbd  -F%t/Frameworks \
+: RUN: -current_version 1 -compatibility_version 1 2>&1 | FileCheck -check-prefix=MACOSFAIL %s 
+
+; MACOSFAIL: fatal error: 'IOSMac/IOSMac.h' file not found
+
+;--- Frameworks/Xplatform.framework/Headers/Xplatform.h
+#if TARGET_OS_MACCATALYST
+#include <IOSMac/IOSMac.h>
+#endif
+
+inline int foo() {
+  int x = 1;
+#if TARGET_OS_MACCATALYST
+  x += iOSAPI();
+#endif
+  return x;
+}
+
+extern int bar();
+
+;--- Xplatform-macosx.h
+#include <IOSMac/IOSMac.h>
+inline int foo() {
+  int x = 1;
+  return x;
+}
+
+extern int bar();
+
+;--- System/iOSSupport/System/Library/Frameworks/IOSMac.framework/Headers/IOSMac.h
+extern int iOSAPI();
+
+;--- Xplatform.yaml
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x1000007
+  cpusubtype:      0x3
+  filetype:        0x6
+  ncmds:           16
+  sizeofcmds:      968
+  flags:           0x100085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __TEXT
+    vmaddr:          0
+    vmsize:          32768
+    fileoff:         0
+    filesize:        32768
+    maxprot:         5
+    initprot:        5
+    nsects:          2
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x4FAD
+        size:            11
+        offset:          0x4FAD
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         554889E5B8010000005DC3
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0x4FB8
+        size:            72
+        offset:          0x4FB8
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         010000001C000000000000001C000000000000001C00000002000000AD4F00003400000034000000B94F00000000000034000000030000000C000100100001000000000000000001
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __DATA
+    vmaddr:          32768
+    vmsize:          16384
+    fileoff:         32768
+    filesize:        16384
+    maxprot:         3
+    initprot:        3
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __objc_imageinfo
+        segname:         __DATA
+        addr:            0x8000
+        size:            8
+        offset:          0x8000
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '0000000040000000'
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          49152
+    vmsize:          16384
+    fileoff:         49152
+    filesize:        88
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_ID_DYLIB
+    cmdsize:         96
+    dylib:
+      name:            24
+      timestamp:       1
+      current_version: 65536
+      compatibility_version: 65536
+    Content:   '/System/Library/Frameworks/Xplatform.framework/Versions/A/Xplatform'
+    ZeroPadBytes:    5
+  - cmd:             LC_DYLD_INFO_ONLY
+    cmdsize:         48
+    rebase_off:      0
+    rebase_size:     0
+    bind_off:        0
+    bind_size:       0
+    weak_bind_off:   0
+    weak_bind_size:  0
+    lazy_bind_off:   0
+    lazy_bind_size:  0
+    export_off:      49152
+    export_size:     16
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          49184
+    nsyms:           2
+    stroff:          49216
+    strsize:         24
+  - cmd:             LC_DYSYMTAB
+    cmdsize:         80
+    ilocalsym:       0
+    nlocalsym:       0
+    iextdefsym:      0
+    nextdefsym:      1
+    iundefsym:       1
+    nundefsym:       1
+    tocoff:          0
+    ntoc:            0
+    modtaboff:       0
+    nmodtab:         0
+    extrefsymoff:    0
+    nextrefsyms:     0
+    indirectsymoff:  0
+    nindirectsyms:   0
+    extreloff:       0
+    nextrel:         0
+    locreloff:       0
+    nlocrel:         0
+  - cmd:             LC_UUID
+    cmdsize:         24
+    uuid:            4AA4F126-BD02-359C-B3EF-E53AD399B590
+  - cmd:             LC_BUILD_VERSION
+    cmdsize:         32
+    platform:        1
+    minos:           659200
+    sdk:             721152
+    ntools:          1
+    Tools:
+      - tool:            3
+        version:         46008832
+  - cmd:             LC_BUILD_VERSION
+    cmdsize:         32
+    platform:        6
+    minos:           0x00d0100 
+    sdk:             851968
+    ntools:          1
+    Tools:
+      - tool:            3
+        version:         46008832
+  - cmd:             LC_SOURCE_VERSION
+    cmdsize:         16
+    version:         0
+  - cmd:             LC_SEGMENT_SPLIT_INFO
+    cmdsize:         16
+    dataoff:         49168
+    datasize:        8
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       2
+      current_version: 14942208
+      compatibility_version: 65536
+    Content:   '/usr/lib/libobjc.A.dylib'
+    ZeroPadBytes:    8
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       2
+      current_version: 84687873
+      compatibility_version: 65536
+    Content:   '/usr/lib/libSystem.B.dylib'
+    ZeroPadBytes:    6
+  - cmd:             LC_FUNCTION_STARTS
+    cmdsize:         16
+    dataoff:         49176
+    datasize:        8
+  - cmd:             LC_DATA_IN_CODE
+    cmdsize:         16
+    dataoff:         49184
+    datasize:        0
+LinkEditData:
+  ExportTrie:
+    TerminalSize:    0
+    NodeOffset:      0
+    Name:            ''
+    Flags:           0x0
+    Address:         0x0
+    Other:           0x0
+    ImportName:      ''
+    Children:
+      - TerminalSize:    4
+        NodeOffset:      8
+        Name:            _bar
+        Flags:           0x0
+        Address:         0x4FAD
+        Other:           0x0
+        ImportName:      ''
+  NameList:
+    - n_strx:          2
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         20397
+    - n_strx:          7
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          512
+      n_value:         0
+  StringTable:
+    - ' '
+    - _bar
+    - dyld_stub_binder
+...
+
+;--- inputs.json.in
+{
+  "headers": [ 
+  {
+    "path" : "DSTROOT/Frameworks/Xplatform.framework/Headers/Xplatform.h",
+    "type" : "public"
+  }
+  ],
+  "version": "3"
+}
diff --git a/clang/test/InstallAPI/exclusive-passes-zippered.test b/clang/test/InstallAPI/exclusive-passes-zippered.test
new file mode 100644
index 000000000000..73d9361d553e
--- /dev/null
+++ b/clang/test/InstallAPI/exclusive-passes-zippered.test
@@ -0,0 +1,56 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+; RUN: mkdir -p %t/Frameworks/
+; RUN: cp -r %S/Inputs/Zippered/Zippered.framework %t/Frameworks/
+; RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+; RUN: yaml2obj %S/Inputs/Zippered/Zippered.yaml -o %t/Frameworks/Zippered.framework/Zippered
+
+; RUN: clang-installapi \
+; RUN: --target=x86_64-apple-macos13 -darwin-target-variant x86_64-apple-ios16-macabi \
+; RUN: -install_name /System/Library/Frameworks/Zippered.framework/Versions/A/Zippered \
+; RUN: -current_version 1 -compatibility_version 1 %t/inputs.json \
+; RUN: --verify-against=%t/Frameworks/Zippered.framework/Zippered \
+; RUN: -isysroot %S/Inputs/MacOSX13.0.sdk -F%t/Frameworks \
+; RUN: --verify-mode=Pedantic -o %t/Zippered.tbd \
+; RUN: --extra-private-header=%t/Extra.h 2>&1 | FileCheck -allow-empty %s \
+; RUN: --implicit-check-not warning: --implicit-check-not error:
+; RUN: llvm-readtapi -compare %t/Zippered.tbd %S/Inputs/Zippered/Zippered.tbd
+
+// Flag extra symbols exposed by macro definition.
+; RUN: not clang-installapi \
+; RUN: --target=x86_64-apple-macos13 -darwin-target-variant x86_64-apple-ios16-macabi \
+; RUN: -install_name /System/Library/Frameworks/Zippered.framework/Versions/A/Zippered \
+; RUN: -current_version 1 -compatibility_version 1 %t/inputs.json \
+; RUN: -isysroot %S/Inputs/MacOSX13.0.sdk -F%t/Frameworks \
+; RUN: --verify-mode=Pedantic -o %t/Zippered.tbd -v \
+; RUN: --verify-against=%t/Frameworks/Zippered.framework/Zippered \
+; RUN: --extra-private-header=%t/Extra.h -XExtra -DExtra 2>&1 | FileCheck %s --check-prefix=MACRO_DEF
+
+; MACRO_DEF-COUNT-2: "-D" "Extra"
+; MACRO_DEF: violations found for x86_64-apple-macos13
+; MACRO_DEF: Extra.h:5:12: error: declaration has external linkage, but dynamic library doesn't have symbol 'foo$bar'
+; MACRO_DEF-COUNT-2: "-D" "Extra"
+; MACRO_DEF-NOT: violations found for x86_64-apple-ios16-macabi
+
+;--- Extra.h
+#define __STRING(x)     #x
+
+#if defined(Extra)
+  #define MACRO_DEF "$bar"
+extern int foo() __asm("_" __STRING(foo) MACRO_DEF) __attribute__((availability(ios, unavailable)));
+#endif
+
+;--- inputs.json.in
+{
+  "headers": [ 
+  {
+    "path" : "DSTROOT/Frameworks/Zippered.framework/PrivateHeaders/Zippered_Private.h",
+    "type" : "private"
+  }, 
+  {
+    "path" : "DSTROOT/Frameworks/Zippered.framework/Headers/Zippered.h",
+    "type" : "public"
+  }
+  ],
+  "version": "3"
+}
diff --git a/clang/test/InstallAPI/exclusive-passes.test b/clang/test/InstallAPI/exclusive-passes.test
new file mode 100644
index 000000000000..29b0fc3d7a2a
--- /dev/null
+++ b/clang/test/InstallAPI/exclusive-passes.test
@@ -0,0 +1,54 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+
+; RUN: clang-installapi \
+; RUN: -target arm64-apple-macos12 -install_name @rpath/libfoo.dylib \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -XApple -DDarwin=1 -XElf -DNONDarwin=1 \
+; RUN: -I%S/Inputs/LibFoo/usr/include -dynamiclib \
+; RUN: -extra-public-header %S/Inputs/LibFoo/usr/include/public.h \
+; RUN: -o %t/output.tbd -v 2>&1 | FileCheck %s --check-prefix=INSTALLAPI
+; RUN: llvm-readtapi --compare %t/output.tbd %t/expected.tbd 2>&1 | FileCheck -allow-empty %s
+
+; CHECK-NOT: error
+; CHECK-NOT: warning
+
+; INSTALLAPI: Public Headers:
+; INSTALLAPI: Apple Public Headers:
+; INSTALLAPI: Elf Public Headers:
+
+;--- expected.tbd
+{
+  "main_library": {
+    "exported_symbols": [
+      {
+        "text": {
+          "global": [
+            "_foo$darwin",
+            "_foo$linux",
+            "_foo"
+          ]
+        }
+      }
+    ],
+    "flags": [
+      {
+        "attributes": [
+          "not_app_extension_safe"
+        ]
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/libfoo.dylib"
+      }
+    ],
+    "target_info": [
+      {
+        "min_deployment": "12",
+        "target": "arm64-macos"
+      }
+    ]
+  },
+  "tapi_tbd_version": 5
+}
diff --git a/clang/test/InstallAPI/invalid-exclusive-passes.test b/clang/test/InstallAPI/invalid-exclusive-passes.test
new file mode 100644
index 000000000000..c23c918f0bfb
--- /dev/null
+++ b/clang/test/InstallAPI/invalid-exclusive-passes.test
@@ -0,0 +1,37 @@
+; RUN: rm -rf %t 
+; RUN: split-file %s %t
+
+// Validate arguments not allowed with -X
+; RUN: not clang-installapi \
+; RUN: -target arm64-apple-macos12 \
+; RUN: -install_name @rpath/libfoo.dylib \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -XApple -I/fake/path -I%t %t/inputs.json \
+; RUN: -dynamiclib -o %t/output.tbd  2>&1 | FileCheck %s --check-prefix=INVALID_OPT
+; INVALID_OPT: error: invalid argument '-XApple' not allowed with '-I/fake/path'
+
+// Validate reserved labels.
+; RUN: not clang-installapi \
+; RUN: -target arm64-apple-macos12 \
+; RUN: -install_name @rpath/libfoo.dylib \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -XApple -DDarwin=1 -XElf -DNONDarwin=1 \
+; RUN: -I%t -dynamiclib -o %t/output.tbd %t/inputs.json \
+; RUN: -XPrivate -DInvalid=1 2>&1 | FileCheck %s --check-prefix=INVALID_LABELS
+; INVALID_LABELS: error: label 'Private' is reserved: use a different label name for -X<label>
+
+// Validate arguments not allowed with -Xproject
+; RUN: not clang-installapi \
+; RUN: -target arm64-apple-macos12 \
+; RUN: -install_name @rpath/libfoo.dylib \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -Xproject -fprofile-instr-generate \
+; RUN: %t/inputs.json -I%t -dynamiclib \
+; RUN: -o %t/output.tbd 2>&1 | FileCheck %s --check-prefix=INVALID_PROJECT_OPT
+; INVALID_PROJECT_OPT: error: invalid argument '-Xproject' not allowed with '-fprofile-instr-generate'
+
+;--- inputs.json
+{
+  "headers": [ ],
+  "version": "3"
+}
diff --git a/clang/test/InstallAPI/project-header-only-args.test b/clang/test/InstallAPI/project-header-only-args.test
new file mode 100644
index 000000000000..76fecce5b4a2
--- /dev/null
+++ b/clang/test/InstallAPI/project-header-only-args.test
@@ -0,0 +1,84 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+; RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+; RUN: mkdir -p %t/modules.cache
+
+; RUN: clang-installapi \
+; RUN: -target arm64-apple-macos12 -install_name @rpath/libfoo.dylib \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -Xproject -fmodules -I%t/usr/include \
+; RUN: -Xproject -fmodules-cache-path=%t/modules.cache \
+; RUN: -F %S/Inputs/Foundation/ \
+; RUN: -exclude-public-header %t/usr/include/public.h \
+; RUN: -extra-project-header %t/project.h -I%t -dynamiclib \
+; RUN: %t/inputs.json \
+; RUN: -o %t/output.tbd 2>&1 | FileCheck %s --allow-empty
+; RUN: llvm-readtapi --compare %t/output.tbd %t/expected.tbd 2>&1 | FileCheck %s --allow-empty
+
+; RUN: not clang-installapi \
+; RUN: -target arm64-apple-macos12 -install_name @rpath/libfoo.dylib \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -Xproject -fmodules -I%t/usr/include \
+; RUN: -Xproject -fmodules-cache-path=%t/modules.cache \
+; RUN: -extra-project-header %t/project.h \
+; RUN: -F %S/Inputs/Foundation/ \
+; RUN: %t/inputs.json \
+; RUN: -I%t -dynamiclib -o %t/output.tbd 2>&1 | FileCheck %s --check-prefix=PUBLIC
+
+; CHECK-NOT: error
+; CHECK-NOT: warning
+
+; PUBLIC: public.h:1:1: error: use of '@import' when modules are disabled
+; PUBLIC-NEXT: @import Foundation;
+
+//--- usr/include/public.h
+@import Foundation;
+extern int foo();
+
+//--- project.h
+@import Foundation;
+extern int bar();
+
+//--- expected.tbd
+{
+  "main_library": {
+    "exported_symbols": [
+      {
+        "text": {
+          "global": [
+            "_bar"
+          ]
+        }
+      }
+    ],
+    "flags": [
+      {
+        "attributes": [
+          "not_app_extension_safe"
+        ]
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/libfoo.dylib"
+      }
+    ],
+    "target_info": [
+      {
+        "min_deployment": "12",
+        "target": "arm64-macos"
+      }
+    ]
+  },
+  "tapi_tbd_version": 5
+}
+
+;--- inputs.json.in
+{
+  "headers": [ {
+    "path" : "DSTROOT/usr/include/public.h",
+    "type" : "public"
+  }
+  ],
+  "version": "3"
+}
diff --git a/clang/test/Interpreter/const.cpp b/clang/test/Interpreter/const.cpp
index 86358c1a54fb..57fd880400e6 100644
--- a/clang/test/Interpreter/const.cpp
+++ b/clang/test/Interpreter/const.cpp
@@ -1,4 +1,4 @@
-// UNSUPPORTED: system-aix
+// UNSUPPORTED: system-aix, system-zos
 // see https://github.com/llvm/llvm-project/issues/68092
 // XFAIL: host={{.*}}-windows-msvc
 
diff --git a/clang/test/Layout/aix-power-alignment-typedef-2.cpp b/clang/test/Layout/aix-power-alignment-typedef-2.cpp
index b814be9c06cf..67e2863f2edc 100644
--- a/clang/test/Layout/aix-power-alignment-typedef-2.cpp
+++ b/clang/test/Layout/aix-power-alignment-typedef-2.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -S -emit-llvm -x c++ < %s | \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -emit-llvm -x c++ < %s | \
 // RUN:   FileCheck %s
 
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -S -emit-llvm -x c++ < %s | \
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -emit-llvm -x c++ < %s | \
 // RUN:   FileCheck %s
 
 namespace test1 {
diff --git a/clang/test/Layout/aix-type-align-and-pack-attr.cpp b/clang/test/Layout/aix-type-align-and-pack-attr.cpp
index d119511b141f..45ba55266593 100644
--- a/clang/test/Layout/aix-type-align-and-pack-attr.cpp
+++ b/clang/test/Layout/aix-type-align-and-pack-attr.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -S -emit-llvm -x c++ < %s | \
+// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -emit-llvm -x c++ < %s | \
 // RUN:   FileCheck %s
 
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -S -emit-llvm -x c++ < %s | \
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -emit-llvm -x c++ < %s | \
 // RUN:   FileCheck %s
 
 namespace test1 {
diff --git a/clang/test/Layout/ms-aligned-array.c b/clang/test/Layout/ms-aligned-array.c
index de3887f8242c..c088a0af12f7 100644
--- a/clang/test/Layout/ms-aligned-array.c
+++ b/clang/test/Layout/ms-aligned-array.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s -check-prefix CHECK
 
 // Before PR45420, we would only find the alignment on this record. Afterwards,
diff --git a/clang/test/Layout/ms-vtordisp-local.cpp b/clang/test/Layout/ms-vtordisp-local.cpp
index 048f4e58297b..852d29b78b92 100644
--- a/clang/test/Layout/ms-vtordisp-local.cpp
+++ b/clang/test/Layout/ms-vtordisp-local.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fms-extensions -fexceptions -fcxx-exceptions -emit-llvm-only -triple x86_64-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -fms-extensions -fexceptions -fcxx-exceptions -triple x86_64-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>&1 | FileCheck %s
 
 struct Base {
   virtual ~Base() {}
diff --git a/clang/test/Layout/ms-x86-alias-avoidance-padding.cpp b/clang/test/Layout/ms-x86-alias-avoidance-padding.cpp
index 0e1a5fdac817..678537bb514f 100644
--- a/clang/test/Layout/ms-x86-alias-avoidance-padding.cpp
+++ b/clang/test/Layout/ms-x86-alias-avoidance-padding.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s --strict-whitespace
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s -check-prefix CHECK-X64 --strict-whitespace
 
 extern "C" int printf(const char *fmt, ...);
diff --git a/clang/test/Layout/ms-x86-aligned-tail-padding.cpp b/clang/test/Layout/ms-x86-aligned-tail-padding.cpp
index 81d283163672..1acbd62f89ff 100644
--- a/clang/test/Layout/ms-x86-aligned-tail-padding.cpp
+++ b/clang/test/Layout/ms-x86-aligned-tail-padding.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s --strict-whitespace
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s -check-prefix CHECK-X64 --strict-whitespace
 
 extern "C" int printf(const char *fmt, ...);
diff --git a/clang/test/Layout/ms-x86-basic-layout.cpp b/clang/test/Layout/ms-x86-basic-layout.cpp
index f2135496b1ef..688d03bba9bb 100644
--- a/clang/test/Layout/ms-x86-basic-layout.cpp
+++ b/clang/test/Layout/ms-x86-basic-layout.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s --strict-whitespace
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s -check-prefix CHECK-X64 --strict-whitespace
 
 extern "C" int printf(const char *fmt, ...);
diff --git a/clang/test/Layout/ms-x86-bitfields-vbases.cpp b/clang/test/Layout/ms-x86-bitfields-vbases.cpp
index a78fdad7e2eb..f8cd79ca3f37 100644
--- a/clang/test/Layout/ms-x86-bitfields-vbases.cpp
+++ b/clang/test/Layout/ms-x86-bitfields-vbases.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>&1 \
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>&1 \
 // RUN:            | FileCheck %s
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s -check-prefix CHECK-X64
 
 struct B0 { int a; };
diff --git a/clang/test/Layout/ms-x86-declspec-empty_bases.cpp b/clang/test/Layout/ms-x86-declspec-empty_bases.cpp
index 4738ce5720f7..4a4c1a5da38d 100644
--- a/clang/test/Layout/ms-x86-declspec-empty_bases.cpp
+++ b/clang/test/Layout/ms-x86-declspec-empty_bases.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s
 
 namespace test1 {
diff --git a/clang/test/Layout/ms-x86-empty-layout.c b/clang/test/Layout/ms-x86-empty-layout.c
index dfcbb851977e..48eb811561f5 100644
--- a/clang/test/Layout/ms-x86-empty-layout.c
+++ b/clang/test/Layout/ms-x86-empty-layout.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s
 
 struct EmptyIntMemb {
diff --git a/clang/test/Layout/ms-x86-empty-nonvirtual-bases.cpp b/clang/test/Layout/ms-x86-empty-nonvirtual-bases.cpp
index 41658f6a11e5..bfb971d021bc 100644
--- a/clang/test/Layout/ms-x86-empty-nonvirtual-bases.cpp
+++ b/clang/test/Layout/ms-x86-empty-nonvirtual-bases.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s --strict-whitespace
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s --strict-whitespace
 
 extern "C" int printf(const char *fmt, ...);
diff --git a/clang/test/Layout/ms-x86-empty-virtual-base.cpp b/clang/test/Layout/ms-x86-empty-virtual-base.cpp
index 8c350a2a56e1..c23495896384 100644
--- a/clang/test/Layout/ms-x86-empty-virtual-base.cpp
+++ b/clang/test/Layout/ms-x86-empty-virtual-base.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s --strict-whitespace
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s -check-prefix CHECK-X64 --strict-whitespace
 
 extern "C" int printf(const char *fmt, ...);
diff --git a/clang/test/Layout/ms-x86-lazy-empty-nonvirtual-base.cpp b/clang/test/Layout/ms-x86-lazy-empty-nonvirtual-base.cpp
index 7eb6958ce446..66e89e446d84 100644
--- a/clang/test/Layout/ms-x86-lazy-empty-nonvirtual-base.cpp
+++ b/clang/test/Layout/ms-x86-lazy-empty-nonvirtual-base.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s --strict-whitespace
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s -check-prefix CHECK-X64 --strict-whitespace
 
 extern "C" int printf(const char *fmt, ...);
diff --git a/clang/test/Layout/ms-x86-member-pointers.cpp b/clang/test/Layout/ms-x86-member-pointers.cpp
index 89dd211a3515..a45359f23a9c 100644
--- a/clang/test/Layout/ms-x86-member-pointers.cpp
+++ b/clang/test/Layout/ms-x86-member-pointers.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fdump-record-layouts -fms-extensions -fsyntax-only %s 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fdump-record-layouts -fms-extensions -fsyntax-only %s 2>&1 | FileCheck %s
 
 struct __single_inheritance S;
 struct __multiple_inheritance M;
diff --git a/clang/test/Layout/ms-x86-misalignedarray.cpp b/clang/test/Layout/ms-x86-misalignedarray.cpp
index 189ba4d25774..a9f9d1e1a2cc 100644
--- a/clang/test/Layout/ms-x86-misalignedarray.cpp
+++ b/clang/test/Layout/ms-x86-misalignedarray.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fsyntax-only -verify %s
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s -check-prefix CHECK-X64
 
 struct T0 { char c; };
diff --git a/clang/test/Layout/ms-x86-pack-and-align.cpp b/clang/test/Layout/ms-x86-pack-and-align.cpp
index fc4fe17b29e3..98eba930ee53 100644
--- a/clang/test/Layout/ms-x86-pack-and-align.cpp
+++ b/clang/test/Layout/ms-x86-pack-and-align.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only -Wno-inaccessible-base %s 2>&1 \
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only -Wno-inaccessible-base %s 2>&1 \
 // RUN:            | FileCheck %s --strict-whitespace
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only -Wno-inaccessible-base %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only -Wno-inaccessible-base %s 2>/dev/null \
 // RUN:            | FileCheck %s -check-prefix CHECK-X64 --strict-whitespace
 
 extern "C" int printf(const char *fmt, ...);
diff --git a/clang/test/Layout/ms-x86-primary-bases.cpp b/clang/test/Layout/ms-x86-primary-bases.cpp
index 624819a8ebaf..b0dbd091b6c7 100644
--- a/clang/test/Layout/ms-x86-primary-bases.cpp
+++ b/clang/test/Layout/ms-x86-primary-bases.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s --strict-whitespace
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s -check-prefix CHECK-X64 --strict-whitespace
 
 extern "C" int printf(const char *fmt, ...);
diff --git a/clang/test/Layout/ms-x86-vfvb-alignment.cpp b/clang/test/Layout/ms-x86-vfvb-alignment.cpp
index e3c556bc3a58..4d8768437a1b 100644
--- a/clang/test/Layout/ms-x86-vfvb-alignment.cpp
+++ b/clang/test/Layout/ms-x86-vfvb-alignment.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>&1 \
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>&1 \
 // RUN:            | FileCheck %s --strict-whitespace
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s -check-prefix CHECK-X64 --strict-whitespace
 
 extern "C" int printf(const char *fmt, ...);
diff --git a/clang/test/Layout/ms-x86-vfvb-sharing.cpp b/clang/test/Layout/ms-x86-vfvb-sharing.cpp
index 042f9b1f8782..d9ee3f07768c 100644
--- a/clang/test/Layout/ms-x86-vfvb-sharing.cpp
+++ b/clang/test/Layout/ms-x86-vfvb-sharing.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>&1 \
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>&1 \
 // RUN:            | FileCheck %s --strict-whitespace
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -fno-rtti -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s -check-prefix CHECK-X64 --strict-whitespace
 
 extern "C" int printf(const char *fmt, ...);
diff --git a/clang/test/Layout/ms-x86-vtordisp.cpp b/clang/test/Layout/ms-x86-vtordisp.cpp
index 9f85edff5850..78bcece6930f 100644
--- a/clang/test/Layout/ms-x86-vtordisp.cpp
+++ b/clang/test/Layout/ms-x86-vtordisp.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -std=c++14 -fno-rtti -fms-extensions -emit-llvm-only -triple i686-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>&1 \
+// RUN: %clang_cc1 -std=c++14 -fno-rtti -fms-extensions -triple i686-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>&1 \
 // RUN:            | FileCheck %s
-// RUN: %clang_cc1 -std=c++14 -fno-rtti -fms-extensions -emit-llvm-only -triple x86_64-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN: %clang_cc1 -std=c++14 -fno-rtti -fms-extensions -triple x86_64-pc-win32 -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
 // RUN:            | FileCheck %s -check-prefix CHECK-X64
 
 extern "C" int printf(const char *fmt, ...);
diff --git a/clang/test/Lexer/cxx-features.cpp b/clang/test/Lexer/cxx-features.cpp
index 4a08eb61cd39..41550cf02aa3 100644
--- a/clang/test/Lexer/cxx-features.cpp
+++ b/clang/test/Lexer/cxx-features.cpp
@@ -7,7 +7,7 @@
 // RUN: %clang_cc1 -std=c++2c -fcxx-exceptions -fsized-deallocation -verify %s
 
 //
-// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -fsized-deallocation -frelaxed-template-template-args -DRELAXED_TEMPLATE_TEMPLATE_ARGS=1 -verify %s
+// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -fsized-deallocation -fno-relaxed-template-template-args -DNO_RELAXED_TEMPLATE_TEMPLATE_ARGS=1 -verify %s
 // RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -fsized-deallocation -DCONCEPTS_TS=1 -verify %s
 // RUN: %clang_cc1 -std=c++14 -fno-rtti -fno-threadsafe-statics -verify %s -DNO_EXCEPTIONS -DNO_RTTI -DNO_THREADSAFE_STATICS -fsized-deallocation
 // RUN: %clang_cc1 -std=c++14 -fchar8_t -DNO_EXCEPTIONS -DCHAR8_T -verify -fsized-deallocation %s
@@ -231,8 +231,8 @@
 #error "wrong value for __cpp_nontype_template_args"
 #endif
 
-#if defined(RELAXED_TEMPLATE_TEMPLATE_ARGS) \
-    ? check(template_template_args, 0, 0, 0, 201611, 201611, 201611, 201611) \
+#if !defined(NO_RELAXED_TEMPLATE_TEMPLATE_ARGS) \
+    ? check(template_template_args, 201611, 201611, 201611, 201611, 201611, 201611, 201611) \
     : check(template_template_args, 0, 0, 0, 0, 0, 0, 0)
 #error "wrong value for __cpp_template_template_args"
 #endif
diff --git a/clang/test/Lexer/minimize_source_to_dependency_directives_include.c b/clang/test/Lexer/minimize_source_to_dependency_directives_include.c
index 678753dd4559..1d3dd158f2cd 100644
--- a/clang/test/Lexer/minimize_source_to_dependency_directives_include.c
+++ b/clang/test/Lexer/minimize_source_to_dependency_directives_include.c
@@ -1,8 +1,8 @@
-// Test double slashes in #include directive along with angle brackets. Previously, this was interpreted as comments.
-// RUN: %clang_cc1 -DTEST -print-dependency-directives-minimized-source %s 2>&1 | FileCheck %s
-
-#include "a//b.h"
-#include <a//b.h>
-
-// CHECK: #include "a//b.h"
-// CHECK: #include <a//b.h>
+// Test double slashes in #include directive along with angle brackets. Previously, this was interpreted as comments.
+// RUN: %clang_cc1 -DTEST -print-dependency-directives-minimized-source %s 2>&1 | FileCheck %s
+
+#include "a//b.h"
+#include <a//b.h>
+
+// CHECK: #include "a//b.h"
+// CHECK: #include <a//b.h>
diff --git a/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c b/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c
index 305442fbd28c..46aba914441b 100644
--- a/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c
+++ b/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c
@@ -1,10 +1,10 @@
-// Test UTF8 BOM at start of file
-// RUN: printf '\xef\xbb\xbf' > %t.c
-// RUN: echo '#ifdef TEST\n' >> %t.c
-// RUN: echo '#include <string>' >> %t.c
-// RUN: echo '#endif' >> %t.c
-// RUN: %clang_cc1 -DTEST -print-dependency-directives-minimized-source %t.c 2>&1 | FileCheck %s
-
-// CHECK:      #ifdef TEST
-// CHECK-NEXT: #include <string>
-// CHECK-NEXT: #endif
+// Test UTF8 BOM at start of file
+// RUN: printf '\xef\xbb\xbf' > %t.c
+// RUN: echo '#ifdef TEST\n' >> %t.c
+// RUN: echo '#include <string>' >> %t.c
+// RUN: echo '#endif' >> %t.c
+// RUN: %clang_cc1 -DTEST -print-dependency-directives-minimized-source %t.c 2>&1 | FileCheck %s
+
+// CHECK:      #ifdef TEST
+// CHECK-NEXT: #include <string>
+// CHECK-NEXT: #endif
diff --git a/clang/test/Lexer/ms-compatibility.c b/clang/test/Lexer/ms-compatibility.c
index d159ad1b8d92..2981b8e062e9 100644
--- a/clang/test/Lexer/ms-compatibility.c
+++ b/clang/test/Lexer/ms-compatibility.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -fsyntax-only -E -fms-compatibility %s | FileCheck --check-prefix=CHECK-MS-COMPAT %s
-// RUN: %clang_cc1 -fsyntax-only -E %s | FileCheck --check-prefix=CHECK-NO-MS-COMPAT %s
+// RUN: %clang_cc1 -E -fms-compatibility %s | FileCheck --check-prefix=CHECK-MS-COMPAT %s
+// RUN: %clang_cc1 -E %s | FileCheck --check-prefix=CHECK-NO-MS-COMPAT %s
 
 #define FN(x) L#x
 #define F L "aaa"
diff --git a/clang/test/Lexer/raw-string-dlim-invalid.cpp b/clang/test/Lexer/raw-string-dlim-invalid.cpp
index da797f00a1d6..8928b398ceb7 100644
--- a/clang/test/Lexer/raw-string-dlim-invalid.cpp
+++ b/clang/test/Lexer/raw-string-dlim-invalid.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -E -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify %s
 
 // expected-error@+2{{invalid character ')' in raw string delimiter; use PREFIX( )PREFIX to delimit raw string}}
 // expected-error@+1{{expected expression}}
diff --git a/clang/test/Lexer/unicode.c b/clang/test/Lexer/unicode.c
index 909b5b424443..e7c7d4b5dad5 100644
--- a/clang/test/Lexer/unicode.c
+++ b/clang/test/Lexer/unicode.c
@@ -3,6 +3,7 @@
 // RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx -x c++ -std=c++11 %s
 // RUN: %clang_cc1 -std=c99 -E -DPP_ONLY=1 %s | FileCheck %s --strict-whitespace
 // RUN: %clang_cc1 -E -DPP_ONLY=1 %s | FileCheck %s --strict-whitespace
+// UNSUPPORTED: system-zos
 
 // This file contains Unicode characters; please do not "fix" them!
 
diff --git a/clang/test/Lexer/wchar-signedness.c b/clang/test/Lexer/wchar-signedness.c
index ea46da015490..8363a709ce51 100644
--- a/clang/test/Lexer/wchar-signedness.c
+++ b/clang/test/Lexer/wchar-signedness.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -fsyntax-only -dM -E %s -triple x86_64-none-linux-gnu | FileCheck %s --check-prefix=CHECK-X86
-// RUN: %clang_cc1 -fsyntax-only -dM -E %s -triple armv7-none-eabi | FileCheck %s --check-prefix=CHECK-ARM
-// RUN: %clang_cc1 -fsyntax-only -dM -E %s -triple thumbv7-none-eabi | FileCheck %s --check-prefix=CHECK-ARM
-// RUN: %clang_cc1 -fsyntax-only -dM -E %s -triple s390x-none-zos | FileCheck %s --check-prefix=CHECK-ZOS
+// RUN: %clang_cc1 -dM -E %s -triple x86_64-none-linux-gnu | FileCheck %s --check-prefix=CHECK-X86
+// RUN: %clang_cc1 -dM -E %s -triple armv7-none-eabi | FileCheck %s --check-prefix=CHECK-ARM
+// RUN: %clang_cc1 -dM -E %s -triple thumbv7-none-eabi | FileCheck %s --check-prefix=CHECK-ARM
+// RUN: %clang_cc1 -dM -E %s -triple s390x-none-zos | FileCheck %s --check-prefix=CHECK-ZOS
 
 // CHECK-X86-NOT: #define __WCHAR_UNSIGNED__
 // CHECK-X86: #define __WINT_UNSIGNED__ 1
diff --git a/clang/test/Misc/backend-optimization-failure-nodbg.cpp b/clang/test/Misc/backend-optimization-failure-nodbg.cpp
index 02b0889d0208..970db1904236 100644
--- a/clang/test/Misc/backend-optimization-failure-nodbg.cpp
+++ b/clang/test/Misc/backend-optimization-failure-nodbg.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown %s -O3 -emit-llvm -S -verify -o /dev/null
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown %s -O3 -emit-llvm -verify -o /dev/null
 // REQUIRES: x86-registered-target
 
 // Test verifies optimization failures generated by the backend are handled
diff --git a/clang/test/Misc/backend-optimization-failure.cpp b/clang/test/Misc/backend-optimization-failure.cpp
index 18b0ccb74e8c..ba15434874aa 100644
--- a/clang/test/Misc/backend-optimization-failure.cpp
+++ b/clang/test/Misc/backend-optimization-failure.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown %s -O3 -emit-llvm -debug-info-kind=line-tables-only -S -verify -o /dev/null
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown %s -O3 -emit-llvm -debug-info-kind=line-tables-only -verify -o /dev/null
 // REQUIRES: x86-registered-target
 
 // Test verifies optimization failures generated by the backend are handled
diff --git a/clang/test/Misc/loop-opt-setup.c b/clang/test/Misc/loop-opt-setup.c
index 2ef9f25fb0dd..01643e6073b5 100644
--- a/clang/test/Misc/loop-opt-setup.c
+++ b/clang/test/Misc/loop-opt-setup.c
@@ -1,6 +1,6 @@
 // This tests loop unrolling and loop deletion (enabled under -O1)
-// RUN: %clang_cc1 -std=c11 -O1 -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s
-// RUN: %clang_cc1 -std=c99 -O1 -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s --check-prefix C99
+// RUN: %clang_cc1 -std=c11 -O1 -fno-unroll-loops -o - %s -emit-llvm | FileCheck %s
+// RUN: %clang_cc1 -std=c99 -O1 -fno-unroll-loops -o - %s -emit-llvm | FileCheck %s --check-prefix C99
 
 extern int a[16];
 int b = 0;
diff --git a/clang/test/Misc/pragma-attribute-cxx.cpp b/clang/test/Misc/pragma-attribute-cxx.cpp
index 38b025e47691..f48c25c824f6 100644
--- a/clang/test/Misc/pragma-attribute-cxx.cpp
+++ b/clang/test/Misc/pragma-attribute-cxx.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -fcxx-exceptions %s
-// RUN: %clang_cc1 -fsyntax-only -ast-dump -ast-dump-filter test -std=c++11 -fcxx-exceptions %s | FileCheck %s
+// RUN: %clang_cc1 -ast-dump -ast-dump-filter test -std=c++11 -fcxx-exceptions %s | FileCheck %s
 // expected-no-diagnostics
 
 class testClass1 {
diff --git a/clang/test/Misc/pragma-attribute-objc.m b/clang/test/Misc/pragma-attribute-objc.m
index 541cfa9ad3bc..f34077b4c920 100644
--- a/clang/test/Misc/pragma-attribute-objc.m
+++ b/clang/test/Misc/pragma-attribute-objc.m
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -Wno-objc-root-class %s
-// RUN: %clang_cc1 -fsyntax-only -Wno-objc-root-class -ast-dump -ast-dump-filter test %s | FileCheck %s
+// RUN: %clang_cc1 -Wno-objc-root-class -ast-dump -ast-dump-filter test %s | FileCheck %s
 
 #pragma clang attribute push (__attribute__((annotate("test"))), apply_to = any(objc_interface, objc_protocol, objc_property, field, objc_method, variable))
 #pragma clang attribute push (__attribute__((objc_subclassing_restricted)), apply_to = objc_interface)
diff --git a/clang/test/Misc/pragma-attribute-strict-subjects.c b/clang/test/Misc/pragma-attribute-strict-subjects.c
index ecd551bee6c7..7c2548c7dfc2 100644
--- a/clang/test/Misc/pragma-attribute-strict-subjects.c
+++ b/clang/test/Misc/pragma-attribute-strict-subjects.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -Wno-pragma-clang-attribute -verify %s
-// RUN: not %clang_cc1 -fsyntax-only -ast-dump -ast-dump-filter test %s | FileCheck %s
+// RUN: not %clang_cc1 -ast-dump -ast-dump-filter test %s | FileCheck %s
 
 // Check for contradictions in rules for attribute without a strict subject set:
 
diff --git a/clang/test/Modules/DebugInfo-fmodule-name.c b/clang/test/Modules/DebugInfo-fmodule-name.c
index 7f2730ddc97b..32dacc1d2cb0 100644
--- a/clang/test/Modules/DebugInfo-fmodule-name.c
+++ b/clang/test/Modules/DebugInfo-fmodule-name.c
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -fmodules -fmodule-format=obj -fmodule-name=MainA \
 // RUN:     -debug-info-kind=limited -dwarf-ext-refs \
 // RUN:     -fimplicit-module-maps -x c -fmodules-cache-path=%t -F %S/Inputs \
-// RUN:     %s -S -emit-llvm -debugger-tuning=lldb -o - | FileCheck %s
+// RUN:     %s -emit-llvm -debugger-tuning=lldb -o - | FileCheck %s
 
 #include "MainA/MainPriv.h"
 
diff --git a/clang/test/Modules/cstd.m b/clang/test/Modules/cstd.m
index 6b81b9013e9d..2155037400bd 100644
--- a/clang/test/Modules/cstd.m
+++ b/clang/test/Modules/cstd.m
@@ -1,5 +1,6 @@
 // RUN: rm -rf %t
 // RUN: %clang_cc1 -fsyntax-only -internal-isystem %S/Inputs/System/usr/include -fmodules -fimplicit-module-maps -fbuiltin-headers-in-system-modules -fmodules-cache-path=%t -D__need_wint_t -Werror=implicit-function-declaration %s
+// UNSUPPORTED: target={{.*}}-zos{{.*}}
 
 @import uses_other_constants;
 const double other_value = DBL_MAX;
diff --git a/clang/test/Modules/cxx20-10-5-ex1.cpp b/clang/test/Modules/cxx20-10-5-ex1.cpp
index 0435b3a64c07..e87f4b78a0be 100644
--- a/clang/test/Modules/cxx20-10-5-ex1.cpp
+++ b/clang/test/Modules/cxx20-10-5-ex1.cpp
@@ -2,24 +2,24 @@
 // RUN: split-file %s %t
 // RUN: cd %t
 
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface std-10-5-ex1-interface.cpp \
+// RUN: %clang_cc1 -std=c++20 std-10-5-ex1-interface.cpp \
 // RUN: -DBAD_FWD_DECL  -fsyntax-only -verify
 
 // RUN: %clang_cc1 -std=c++20 -emit-module-interface std-10-5-ex1-interface.cpp \
 // RUN: -o A.pcm
 
-// RUN: %clang_cc1 -std=c++20 std-10-5-ex1-use.cpp  -fmodule-file=A=A.pcm \
+// RUN: %clang_cc1 -std=c++20 std-10-5-ex1-use.cpp -fmodule-file=A=A.pcm \
 // RUN:    -fsyntax-only -verify
 
 // Test again with reduced BMI.
 // RUN: rm A.pcm
-// RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface std-10-5-ex1-interface.cpp \
+// RUN: %clang_cc1 -std=c++20 std-10-5-ex1-interface.cpp \
 // RUN: -DBAD_FWD_DECL  -fsyntax-only -verify
 
 // RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface std-10-5-ex1-interface.cpp \
 // RUN: -o A.pcm
 
-// RUN: %clang_cc1 -std=c++20 std-10-5-ex1-use.cpp  -fmodule-file=A=A.pcm \
+// RUN: %clang_cc1 -std=c++20 std-10-5-ex1-use.cpp -fmodule-file=A=A.pcm \
 // RUN:    -fsyntax-only -verify
 
 
diff --git a/clang/test/Modules/cxx20-importing-function-bodies.cppm b/clang/test/Modules/cxx20-importing-function-bodies.cppm
index c34e48aaa3f6..fc75587b3cc5 100644
--- a/clang/test/Modules/cxx20-importing-function-bodies.cppm
+++ b/clang/test/Modules/cxx20-importing-function-bodies.cppm
@@ -8,7 +8,7 @@
 // RUN:     -emit-module-interface -fprebuilt-module-path=%t -o %t/b.pcm
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/c.cppm \
 // RUN:     -emit-module-interface -fprebuilt-module-path=%t -o %t/c.pcm
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/c.pcm -S \
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/c.pcm \
 // RUN:     -fprebuilt-module-path=%t -emit-llvm -disable-llvm-passes -o - \
 // RUN:     | FileCheck %t/c.cppm
 //
@@ -20,7 +20,7 @@
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -O3 %t/c.cppm \
 // RUN:     -emit-module-interface -fprebuilt-module-path=%t -o %t/c.pcm
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -O3 %t/c.pcm \
-// RUN:     -fprebuilt-module-path=%t -S -emit-llvm -disable-llvm-passes \
+// RUN:     -fprebuilt-module-path=%t -emit-llvm -disable-llvm-passes \
 // RUN:     -o - | FileCheck %t/c.cppm
 
 //--- a.cppm
diff --git a/clang/test/Modules/cxx20-include-translation.cpp b/clang/test/Modules/cxx20-include-translation.cpp
index b36eb176c40a..7bf318432582 100644
--- a/clang/test/Modules/cxx20-include-translation.cpp
+++ b/clang/test/Modules/cxx20-include-translation.cpp
@@ -8,17 +8,17 @@
 // RUN: %clang_cc1 -std=c++20 -xc++-user-header h3.h -emit-header-unit -o h3.pcm
 // RUN: %clang_cc1 -std=c++20 -xc++-user-header h4.h -emit-header-unit -o h4.pcm
 
-// RUN: %clang_cc1 -std=c++20 Xlate.cpp -emit-module-interface -o Xlate.pcm \
+// RUN: %clang_cc1 -std=c++20 Xlate.cpp -o Xlate.pcm \
 // RUN: -fmodule-file=h1.pcm -fmodule-file=h2.pcm -fmodule-file=h3.pcm \
 // RUN: -fmodule-file=h4.pcm -fsyntax-only -Rmodule-include-translation -verify
 
 // Check that we do the intended translation and not more.
 // RUN: %clang_cc1 -std=c++20 Xlate.cpp \
 // RUN: -fmodule-file=h1.pcm -fmodule-file=h2.pcm -fmodule-file=h3.pcm \
-// RUN: -fmodule-file=h4.pcm  -E -undef | FileCheck %s
+// RUN: -fmodule-file=h4.pcm -E -undef | FileCheck %s
 
 // We expect no diagnostics here, the used functions should all be available.
-// RUN: %clang_cc1 -std=c++20 Xlate.cpp -emit-module-interface \
+// RUN: %clang_cc1 -std=c++20 Xlate.cpp \
 // RUN: -fmodule-file=h1.pcm -fmodule-file=h2.pcm -fmodule-file=h3.pcm \
 // RUN: -fmodule-file=h4.pcm -fsyntax-only
 
diff --git a/clang/test/Modules/eagerly-load-cxx-named-modules.cppm b/clang/test/Modules/eagerly-load-cxx-named-modules.cppm
index ab2ac891fb40..6bf4b2e7f9f4 100644
--- a/clang/test/Modules/eagerly-load-cxx-named-modules.cppm
+++ b/clang/test/Modules/eagerly-load-cxx-named-modules.cppm
@@ -7,7 +7,7 @@
 // RUN:    2>&1 | FileCheck %t/user.cpp
 // RUN: %clang_cc1 -std=c++20 %t/b.cppm -emit-module-interface -o %t/b.pcm \
 // RUN:    -fprebuilt-module-path=%t
-// RUN: %clang_cc1 -std=c++20 %t/b.pcm -S \
+// RUN: %clang_cc1 -std=c++20 %t/b.pcm \
 // RUN:    -fprebuilt-module-path=%t -emit-llvm 2>&1 -o - | FileCheck %t/b.cppm
 
 //--- a.cppm
diff --git a/clang/test/Modules/function-transitive-change.cppm b/clang/test/Modules/function-transitive-change.cppm
new file mode 100644
index 000000000000..cfce669e3a7b
--- /dev/null
+++ b/clang/test/Modules/function-transitive-change.cppm
@@ -0,0 +1,94 @@
+// Test that, in C++20 modules reduced BMI, the implementation detail changes
+// in non-inline function may not propagate while the inline function changes
+// can get propagate.
+//
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: cd %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-reduced-module-interface -o %t/a.pcm
+// RUN: %clang_cc1 -std=c++20 %t/a.v1.cppm -emit-reduced-module-interface -o %t/a.v1.pcm
+//
+// The BMI of A should differ since the different implementation.
+// RUN: not diff %t/a.pcm %t/a.v1.pcm &> /dev/null
+//
+// The BMI of B should change since the dependent inline function changes
+// RUN: %clang_cc1 -std=c++20 %t/b.cppm -emit-reduced-module-interface -fmodule-file=a=%t/a.pcm \
+// RUN:     -o %t/b.pcm
+// RUN: %clang_cc1 -std=c++20 %t/b.cppm -emit-reduced-module-interface -fmodule-file=a=%t/a.v1.pcm \
+// RUN:     -o %t/b.v1.pcm
+// RUN: not diff %t/b.v1.pcm %t/b.pcm  &> /dev/null
+//
+// Test the case with unused partitions.
+// RUN: %clang_cc1 -std=c++20 %t/M-A.cppm -emit-reduced-module-interface -o %t/M-A.pcm
+// RUN: %clang_cc1 -std=c++20 %t/M-B.cppm -emit-reduced-module-interface -o %t/M-B.pcm
+// RUN: %clang_cc1 -std=c++20 %t/M.cppm -emit-reduced-module-interface -o %t/M.pcm \
+// RUN:     -fmodule-file=M:partA=%t/M-A.pcm \
+// RUN:     -fmodule-file=M:partB=%t/M-B.pcm
+// RUN: %clang_cc1 -std=c++20 %t/N.cppm -emit-reduced-module-interface -o %t/N.pcm \
+// RUN:     -fmodule-file=M:partA=%t/M-A.pcm \
+// RUN:     -fmodule-file=M:partB=%t/M-B.pcm \
+// RUN:     -fmodule-file=M=%t/M.pcm
+//
+// Now we change `M-A.cppm` to `M-A.v1.cppm`.
+// RUN: %clang_cc1 -std=c++20 %t/M-A.v1.cppm -emit-reduced-module-interface -o %t/M-A.v1.pcm
+// RUN: %clang_cc1 -std=c++20 %t/M.cppm -emit-reduced-module-interface -o %t/M.v1.pcm \
+// RUN:     -fmodule-file=M:partA=%t/M-A.v1.pcm \
+// RUN:     -fmodule-file=M:partB=%t/M-B.pcm
+// RUN: %clang_cc1 -std=c++20 %t/N.cppm -emit-reduced-module-interface -o %t/N.v1.pcm \
+// RUN:     -fmodule-file=M:partA=%t/M-A.v1.pcm \
+// RUN:     -fmodule-file=M:partB=%t/M-B.pcm \
+// RUN:     -fmodule-file=M=%t/M.v1.pcm
+//
+// The BMI of N can keep unchanged since the N didn't use the changed partition unit 'M:A'.
+// RUN: diff %t/N.v1.pcm %t/N.pcm  &> /dev/null
+
+//--- a.cppm
+export module a;
+export inline int a() {
+    return 48;
+}
+
+//--- a.v1.cppm
+export module a;
+export inline int a() {
+    return 50;
+}
+
+//--- b.cppm
+export module b;
+import a;
+export inline int b() {
+    return a();
+}
+
+//--- M-A.cppm
+export module M:partA;
+export inline int a() {
+    return 43;
+}
+
+//--- M-A.v1.cppm
+export module M:partA;
+export inline int a() {
+    return 50;
+}
+
+//--- M-B.cppm
+export module M:partB;
+export inline int b() {
+    return 44;
+}
+
+//--- M.cppm
+export module M;
+export import :partA;
+export import :partB;
+
+//--- N.cppm
+export module N;
+import M;
+
+export inline int n() {
+    return b();
+}
diff --git a/clang/test/Modules/getSourceDescriptor-crash.cpp b/clang/test/Modules/getSourceDescriptor-crash.cpp
index 53111786472f..103e8ca23912 100644
--- a/clang/test/Modules/getSourceDescriptor-crash.cpp
+++ b/clang/test/Modules/getSourceDescriptor-crash.cpp
@@ -1,5 +1,5 @@
 // RUN: rm -rf %t
-// RUN: %clang_cc1 -I %S/Inputs/getSourceDescriptor-crash -S -emit-llvm -debug-info-kind=limited -debugger-tuning=lldb -fmodules -fmodules-cache-path=%t -fimplicit-module-maps %s -o - | FileCheck %s
+// RUN: %clang_cc1 -I %S/Inputs/getSourceDescriptor-crash -emit-llvm -debug-info-kind=limited -debugger-tuning=lldb -fmodules -fmodules-cache-path=%t -fimplicit-module-maps %s -o - | FileCheck %s
 
 #include "h1.h"
 #include "h1.h"
diff --git a/clang/test/Modules/implicit-module-no-timestamp.cpp b/clang/test/Modules/implicit-module-no-timestamp.cpp
index 1b681a610bab..50767b4a1146 100644
--- a/clang/test/Modules/implicit-module-no-timestamp.cpp
+++ b/clang/test/Modules/implicit-module-no-timestamp.cpp
@@ -1,4 +1,3 @@
-// UNSUPPORTED: system-windows
 // RUN: rm -rf %t
 // RUN: split-file %s %t
 // RUN: cd %t
diff --git a/clang/test/Modules/language-linkage.cppm b/clang/test/Modules/language-linkage.cppm
index a5db9e9ebc07..7933ed8bdd0e 100644
--- a/clang/test/Modules/language-linkage.cppm
+++ b/clang/test/Modules/language-linkage.cppm
@@ -5,7 +5,7 @@
 // RUN: mkdir %t
 //
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %s -emit-module-interface -o %t/M.pcm
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/M.pcm -S -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/M.pcm -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 export module M;
 
 extern "C++" {
diff --git a/clang/test/Modules/load-module-with-errors.m b/clang/test/Modules/load-module-with-errors.m
index 6991d0feb010..1f8e483a19e9 100644
--- a/clang/test/Modules/load-module-with-errors.m
+++ b/clang/test/Modules/load-module-with-errors.m
@@ -29,7 +29,7 @@ void test(Error *x) {
 // RUN:   -x objective-c -emit-module %S/Inputs/error/module.modulemap
 
 // Prebuilt modules
-// RUN: %clang_cc1 -fsyntax-only -fmodules -fallow-pcm-with-compiler-errors \
+// RUN: %clang_cc1 -fmodules -fallow-pcm-with-compiler-errors \
 // RUN:   -fprebuilt-module-path=%t/prebuilt -fmodules-cache-path=%t \
 // RUN:   -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fsyntax-only -fmodules \
@@ -37,7 +37,7 @@ void test(Error *x) {
 // RUN:   -verify=pcherror %s
 
 // Explicit prebuilt modules (loaded when needed)
-// RUN: %clang_cc1 -fsyntax-only -fmodules -fallow-pcm-with-compiler-errors \
+// RUN: %clang_cc1 -fmodules -fallow-pcm-with-compiler-errors \
 // RUN:   -fmodule-file=error=%t/prebuilt/error.pcm \
 // RUN:   -fmodule-file=use_error_a=%t/prebuilt/use_error_a.pcm \
 // RUN:   -fmodule-file=use_error_b=%t/prebuilt/use_error_b.pcm \
@@ -49,7 +49,7 @@ void test(Error *x) {
 // RUN:   -fmodules-cache-path=%t -verify=pcherror %s
 
 // Explicit prebuilt modules without name (always loaded)
-// RUN: %clang_cc1 -fsyntax-only -fmodules -fallow-pcm-with-compiler-errors \
+// RUN: %clang_cc1 -fmodules -fallow-pcm-with-compiler-errors \
 // RUN:   -fmodule-file=%t/prebuilt/error.pcm \
 // RUN:   -fmodule-file=%t/prebuilt/use_error_a.pcm \
 // RUN:   -fmodule-file=%t/prebuilt/use_error_b.pcm \
@@ -87,7 +87,7 @@ void test(Error *x) {
 // the verify would fail as it would be the PCH error instead)
 // RUN: %clang_cc1 -fsyntax-only -fmodules \
 // RUN:   -fmodules-cache-path=%t -fimplicit-module-maps -I %S/Inputs/error \
-// RUN:   -x objective-c  %s -verify=notallowerror
+// RUN:   -x objective-c %s -verify=notallowerror
 
 // allow-pcm-with-compiler-errors should also allow errors in PCH
 // RUN: %clang_cc1 -fallow-pcm-with-compiler-errors -x objective-c \
diff --git a/clang/test/Modules/merge-objc-protocol-visibility.m b/clang/test/Modules/merge-objc-protocol-visibility.m
index f5f048b36902..074c3b1ca668 100644
--- a/clang/test/Modules/merge-objc-protocol-visibility.m
+++ b/clang/test/Modules/merge-objc-protocol-visibility.m
@@ -1,4 +1,4 @@
-// UNSUPPORTED: target={{.*}}-aix{{.*}}
+// UNSUPPORTED: target={{.*}}-aix{{.*}}, target={{.*}}-zos{{.*}}
 // RUN: rm -rf %t
 // RUN: split-file %s %t
 // RUN: %clang_cc1 -emit-llvm -o %t/test.bc -F%t/Frameworks %t/test.m -Werror=objc-method-access -DHIDDEN_FIRST=1 \
diff --git a/clang/test/Modules/module-init-duplicated-import.cppm b/clang/test/Modules/module-init-duplicated-import.cppm
index 1326402bb4de..bccf55c555bb 100644
--- a/clang/test/Modules/module-init-duplicated-import.cppm
+++ b/clang/test/Modules/module-init-duplicated-import.cppm
@@ -7,7 +7,7 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/m.cppm \
 // RUN:      -emit-module-interface -fmodule-file=a=%t/a.pcm -o %t/m.pcm
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/m.pcm  \
-// RUN:      -fmodule-file=a=%t/a.pcm -S -emit-llvm -o - | FileCheck %t/m.cppm
+// RUN:      -fmodule-file=a=%t/a.pcm -emit-llvm -o - | FileCheck %t/m.cppm
 
 // Test again with reduced BMI.
 // Note that we can't use reduced BMI here for m.cppm since it is required
@@ -18,7 +18,7 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/m.cppm \
 // RUN:      -emit-module-interface -fmodule-file=a=%t/a.pcm -o %t/m.pcm
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/m.pcm  \
-// RUN:      -fmodule-file=a=%t/a.pcm -S -emit-llvm -o - | FileCheck %t/m.cppm
+// RUN:      -fmodule-file=a=%t/a.pcm -emit-llvm -o - | FileCheck %t/m.cppm
 
 //--- a.cppm
 export module a;
diff --git a/clang/test/Modules/module-symlink.m b/clang/test/Modules/module-symlink.m
new file mode 100644
index 000000000000..efdaf3db0dfe
--- /dev/null
+++ b/clang/test/Modules/module-symlink.m
@@ -0,0 +1,14 @@
+// REQUIRES: shell
+
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules-cache-path=%t/modules -fmodules -fimplicit-module-maps -I %S/Inputs -emit-pch -o %t.pch %s -verify
+
+// RUN: ln -s %t/modules %t/modules.symlink
+// RUN: %clang_cc1 -fmodules-cache-path=%t/modules.symlink -fmodules -fimplicit-module-maps -I %S/Inputs -include-pch %t.pch %s -verify
+// RUN: not %clang_cc1 -fmodules-cache-path=%t/modules.dne -fmodules -fimplicit-module-maps -I %S/Inputs -include-pch %t.pch %s -verify
+
+// expected-no-diagnostics
+
+@import ignored_macros;
+
+struct Point p;
diff --git a/clang/test/Modules/modules-reduced-bmi.cppm b/clang/test/Modules/modules-reduced-bmi.cppm
index 9b84220ae030..a2636f6df988 100644
--- a/clang/test/Modules/modules-reduced-bmi.cppm
+++ b/clang/test/Modules/modules-reduced-bmi.cppm
@@ -4,7 +4,7 @@
 //
 // RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-reduced-module-interface -o %t/a.reduced.pcm
 // RUN: %clang_cc1 -std=c++20 %t/a.cppm -fexperimental-modules-reduced-bmi -fmodule-output=%t/a.pcm \
-// RUN:     -S -emit-llvm -o %t/a.ll
+// RUN:     -emit-llvm -o %t/a.ll
 //
 // Test that the generated BMI from `-fexperimental-modules-reduced-bmi -fmodule-output=` is same with
 // `-emit-reduced-module-interface`.
diff --git a/clang/test/Modules/no-duplicate-codegen-in-GMF.cppm b/clang/test/Modules/no-duplicate-codegen-in-GMF.cppm
index 36a2d8bc8c95..dea04fdce0c6 100644
--- a/clang/test/Modules/no-duplicate-codegen-in-GMF.cppm
+++ b/clang/test/Modules/no-duplicate-codegen-in-GMF.cppm
@@ -7,7 +7,7 @@
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/A.cppm -emit-module-interface -o %t/A.pcm
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/B.cppm -emit-module-interface -o %t/B.pcm \
 // RUN:     -fprebuilt-module-path=%t
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/B.pcm -S -emit-llvm -o - \
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/B.pcm -emit-llvm -o - \
 // RUN:     -fprebuilt-module-path=%t | FileCheck %t/B.cppm
 
 // Test again with reduced BMI. Note that we need to generate full BMI for B.cppm
@@ -16,7 +16,7 @@
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/A.cppm -emit-reduced-module-interface -o %t/A.pcm
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/B.cppm -emit-module-interface -o %t/B.pcm \
 // RUN:     -fprebuilt-module-path=%t
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/B.pcm -S -emit-llvm -o - \
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/B.pcm -emit-llvm -o - \
 // RUN:     -fprebuilt-module-path=%t | FileCheck %t/B.cppm
 
 
diff --git a/clang/test/Modules/no-import-func-body.cppm b/clang/test/Modules/no-import-func-body.cppm
index af7c3a3ad84b..4923dbacb819 100644
--- a/clang/test/Modules/no-import-func-body.cppm
+++ b/clang/test/Modules/no-import-func-body.cppm
@@ -8,7 +8,7 @@
 // RUN:     -emit-module-interface -fprebuilt-module-path=%t -o %t/b.pcm
 // RUN: %clang_cc1 -std=c++20 -O1 -triple %itanium_abi_triple %t/c.cppm \
 // RUN:     -emit-module-interface -fprebuilt-module-path=%t -o %t/c.pcm
-// RUN: %clang_cc1 -std=c++20 -O1 -triple %itanium_abi_triple %t/c.pcm -S \
+// RUN: %clang_cc1 -std=c++20 -O1 -triple %itanium_abi_triple %t/c.pcm \
 // RUN:     -fprebuilt-module-path=%t -emit-llvm -disable-llvm-passes -o - \
 // RUN:     | FileCheck %t/c.cppm
 
diff --git a/clang/test/Modules/no-transitive-source-location-change.cppm b/clang/test/Modules/no-transitive-source-location-change.cppm
new file mode 100644
index 000000000000..2a84ef6a912f
--- /dev/null
+++ b/clang/test/Modules/no-transitive-source-location-change.cppm
@@ -0,0 +1,66 @@
+// Testing that adding a new line in a module interface unit won't cause the BMI
+// of consuming module unit changes.
+//
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-reduced-module-interface -o %t/A.pcm
+// RUN: %clang_cc1 -std=c++20 %t/A.v1.cppm -emit-reduced-module-interface -o %t/A.v1.pcm
+//
+// The BMI may not be the same since the source location differs.
+// RUN: not diff %t/A.pcm %t/A.v1.pcm &> /dev/null
+//
+// The BMI of B shouldn't change since all the locations remain the same.
+// RUN: %clang_cc1 -std=c++20 %t/B.cppm -emit-reduced-module-interface -fmodule-file=A=%t/A.pcm \
+// RUN:     -o %t/B.pcm
+// RUN: %clang_cc1 -std=c++20 %t/B.cppm -emit-reduced-module-interface -fmodule-file=A=%t/A.v1.pcm \
+// RUN:     -o %t/B.v1.pcm
+// RUN: diff %t/B.v1.pcm %t/B.pcm  &> /dev/null
+//
+// The BMI of C may change since the locations for instantiations changes.
+// RUN: %clang_cc1 -std=c++20 %t/C.cppm -emit-reduced-module-interface -fmodule-file=A=%t/A.pcm \
+// RUN:     -o %t/C.pcm
+// RUN: %clang_cc1 -std=c++20 %t/C.cppm -emit-reduced-module-interface -fmodule-file=A=%t/A.v1.pcm \
+// RUN:     -o %t/C.v1.pcm
+// RUN: not diff %t/C.v1.pcm %t/C.pcm  &> /dev/null
+
+//--- A.cppm
+export module A;
+export template <class T>
+struct C {
+    T func() {
+        return T(43);
+    }
+};
+export int funcA() {
+    return 43;
+}
+
+//--- A.v1.cppm
+export module A;
+
+export template <class T>
+struct C {
+    T func() {
+        return T(43);
+    }
+};
+export int funcA() {
+    return 43;
+}
+
+//--- B.cppm
+export module B;
+import A;
+
+export int funcB() {
+    return funcA();
+}
+
+//--- C.cppm
+export module C;
+import A;
+export inline void testD() {
+    C<int> c;
+    c.func();
+}
diff --git a/clang/test/Modules/odr_hash-enum.c b/clang/test/Modules/odr_hash-enum.c
new file mode 100644
index 000000000000..f8ede923fe2c
--- /dev/null
+++ b/clang/test/Modules/odr_hash-enum.c
@@ -0,0 +1,75 @@
+// Clear and create directories
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: mkdir %t/cache
+// RUN: mkdir %t/Inputs
+
+// Build first header file
+// RUN: echo "#define FIRST" >> %t/Inputs/first.h
+// RUN: cat %s               >> %t/Inputs/first.h
+
+// Build second header file
+// RUN: echo "#define SECOND" >> %t/Inputs/second.h
+// RUN: cat %s                >> %t/Inputs/second.h
+
+// Test that each header can compile
+// RUN: %clang_cc1 -fsyntax-only -x c %t/Inputs/first.h
+// RUN: %clang_cc1 -fsyntax-only -x c %t/Inputs/second.h
+
+// Build module map file
+// RUN: echo "module FirstModule {"     >> %t/Inputs/module.modulemap
+// RUN: echo "    header \"first.h\""   >> %t/Inputs/module.modulemap
+// RUN: echo "}"                        >> %t/Inputs/module.modulemap
+// RUN: echo "module SecondModule {"    >> %t/Inputs/module.modulemap
+// RUN: echo "    header \"second.h\""  >> %t/Inputs/module.modulemap
+// RUN: echo "}"                        >> %t/Inputs/module.modulemap
+
+// Run test
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/cache -x c -I%t/Inputs -verify %s
+
+#if !defined(FIRST) && !defined(SECOND)
+#include "first.h"
+#include "second.h"
+#endif
+
+#if defined(FIRST)
+enum DifferentEnumConstants { kDifferentEnumConstantsValueFirst };
+#elif defined(SECOND)
+enum DifferentEnumConstants { kDifferentEnumConstantsValueSecond };
+#else
+enum DifferentEnumConstants differentEnumConstants;
+// expected-error@second.h:* {{'kDifferentEnumConstantsValueSecond' from module 'SecondModule' is not present in definition of 'enum DifferentEnumConstants' in module 'FirstModule'}}
+// expected-note@first.h:* {{definition has no member 'kDifferentEnumConstantsValueSecond'}}
+#endif
+
+#if defined(FIRST)
+enum DifferentEnumValues { kDifferentEnumValue = 0 };
+#elif defined(SECOND)
+enum DifferentEnumValues { kDifferentEnumValue = 1 };
+#else
+enum DifferentEnumValues differentEnumValue;
+// expected-error@first.h:* {{'DifferentEnumValues' has different definitions in different modules; definition in module 'FirstModule' first difference is 1st element 'kDifferentEnumValue' has an initializer}}
+// expected-note@second.h:* {{but in 'SecondModule' found 1st element 'kDifferentEnumValue' has different initializer}}
+#endif
+
+#if defined(FIRST)
+enum {
+    kAnonymousEnumValueFirst = 1,
+};
+#elif defined(SECOND)
+enum {
+    kAnonymousEnumValueSecond = 2,
+};
+#else
+// Anonymous enums don't have to match, no errors expected.
+int anonymousEnumValue = kAnonymousEnumValueFirst + kAnonymousEnumValueSecond;
+#endif
+
+// Keep macros contained to one file.
+#ifdef FIRST
+#undef FIRST
+#endif
+
+#ifdef SECOND
+#undef SECOND
+#endif
diff --git a/clang/test/Modules/odr_using_dependent_name.cppm b/clang/test/Modules/odr_using_dependent_name.cppm
index c2938855fdbe..8816c444d86b 100644
--- a/clang/test/Modules/odr_using_dependent_name.cppm
+++ b/clang/test/Modules/odr_using_dependent_name.cppm
@@ -1,7 +1,7 @@
 // RUN: rm -rf %t
 // RUN: mkdir -p %t
 // RUN: %clang_cc1 -std=c++20 %S/Inputs/odr_using_dependent_name/X.cppm -emit-module-interface -o %t/X.pcm
-// RUN: %clang_cc1 -std=c++20 -I%S/Inputs/odr_using_dependent_name -fprebuilt-module-path=%t %s -emit-module-interface -fsyntax-only -verify
+// RUN: %clang_cc1 -std=c++20 -I%S/Inputs/odr_using_dependent_name -fprebuilt-module-path=%t %s -fsyntax-only -verify
 // expected-no-diagnostics
 module;
 #include "foo.h"
diff --git a/clang/test/Modules/pr58716.cppm b/clang/test/Modules/pr58716.cppm
index 177802fe3afc..cd3db2c19f48 100644
--- a/clang/test/Modules/pr58716.cppm
+++ b/clang/test/Modules/pr58716.cppm
@@ -7,7 +7,7 @@
 // RUN: split-file %s %t
 //
 // RUN: %clang_cc1 -triple=x86_64-linux-gnu -std=c++20 -emit-module-interface %t/m.cppm -o %t/m.pcm
-// RUN: %clang_cc1 -triple=x86_64-linux-gnu -std=c++20 %t/m.pcm -S -emit-llvm -o - | FileCheck %t/m.cppm
+// RUN: %clang_cc1 -triple=x86_64-linux-gnu -std=c++20 %t/m.pcm -emit-llvm -o - | FileCheck %t/m.cppm
 
 //--- m.cppm
 module;
diff --git a/clang/test/Modules/pr59780.cppm b/clang/test/Modules/pr59780.cppm
index ee81ca575d7b..ea5fdd5c4ce2 100644
--- a/clang/test/Modules/pr59780.cppm
+++ b/clang/test/Modules/pr59780.cppm
@@ -5,7 +5,7 @@
 // RUN: split-file %s %t
 //
 // RUN: %clang_cc1 -std=c++20 %t/a.cppm -triple %itanium_abi_triple -emit-module-interface -o %t/a.pcm
-// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fprebuilt-module-path=%t -S \
+// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fprebuilt-module-path=%t \
 // RUN:     -triple %itanium_abi_triple -emit-llvm -o - | FileCheck %t/use.cpp
 // RUN: %clang_cc1 -std=c++20 %t/a.pcm -triple %itanium_abi_triple -emit-llvm -o - | FileCheck %t/a.cppm
 
@@ -14,7 +14,7 @@
 // RUN:     -o %t/a.full.pcm
 // RUN: %clang_cc1 -std=c++20 %t/a.cppm -triple %itanium_abi_triple -emit-reduced-module-interface \
 // RUN:     -o %t/a.pcm
-// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fprebuilt-module-path=%t -S \
+// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fprebuilt-module-path=%t \
 // RUN:     -triple %itanium_abi_triple -emit-llvm -o - | FileCheck %t/use.cpp
 // RUN: %clang_cc1 -std=c++20 %t/a.full.pcm -triple %itanium_abi_triple -emit-llvm -o - | FileCheck %t/a.cppm
 
diff --git a/clang/test/Modules/pr59999.cppm b/clang/test/Modules/pr59999.cppm
index 54452c26de47..d6e6fff2b7c5 100644
--- a/clang/test/Modules/pr59999.cppm
+++ b/clang/test/Modules/pr59999.cppm
@@ -9,7 +9,7 @@
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/Object.cppm \
 // RUN:     -fmodule-file=Module=%t/Module.pcm -emit-module-interface -o %t/Object.pcm
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/Object.pcm \
-// RUN:     -fmodule-file=Module=%t/Module.pcm -S -emit-llvm -o - | FileCheck %t/Object.cppm
+// RUN:     -fmodule-file=Module=%t/Module.pcm -emit-llvm -o - | FileCheck %t/Object.cppm
 
 // Test again with reduced BMI.
 // RUN: rm -rf %t
@@ -21,7 +21,7 @@
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/Object.cppm \
 // RUN:     -fmodule-file=Module=%t/Module.pcm -emit-module-interface -o %t/Object.pcm
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/Object.pcm \
-// RUN:     -fmodule-file=Module=%t/Module.pcm -S -emit-llvm -o - | FileCheck %t/Object.cppm
+// RUN:     -fmodule-file=Module=%t/Module.pcm -emit-llvm -o - | FileCheck %t/Object.cppm
 
 
 //--- Module.cppm
diff --git a/clang/test/Modules/pr60085.cppm b/clang/test/Modules/pr60085.cppm
index 37d8b09350b4..5cd5dcb683d7 100644
--- a/clang/test/Modules/pr60085.cppm
+++ b/clang/test/Modules/pr60085.cppm
@@ -11,7 +11,7 @@
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/a.cppm \
 // RUN:     -emit-module-interface -o %t/a.pcm -fprebuilt-module-path=%t
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/a.pcm \
-// RUN:     -S -emit-llvm -disable-llvm-passes -o - -fprebuilt-module-path=%t \
+// RUN:     -emit-llvm -disable-llvm-passes -o - -fprebuilt-module-path=%t \
 // RUN:     | FileCheck %t/a.cppm
 
 // Test again with reduced BMI.
@@ -28,7 +28,7 @@
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/a.cppm \
 // RUN:     -emit-module-interface -o %t/a.pcm -fprebuilt-module-path=%t
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/a.pcm \
-// RUN:     -S -emit-llvm -disable-llvm-passes -o - -fprebuilt-module-path=%t \
+// RUN:     -emit-llvm -disable-llvm-passes -o - -fprebuilt-module-path=%t \
 // RUN:		| FileCheck %t/a.cppm
 
 //--- d.cppm
diff --git a/clang/test/Modules/pr60693.cppm b/clang/test/Modules/pr60693.cppm
index 6fb3de60e59b..7ca9f8d33161 100644
--- a/clang/test/Modules/pr60693.cppm
+++ b/clang/test/Modules/pr60693.cppm
@@ -5,11 +5,11 @@
 // RUN: split-file %s %t
 //
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/a.cppm -emit-module-interface -o %t/a.pcm
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -fmodule-file=a=%t/a.pcm %t/c.cpp -S -emit-llvm -disable-llvm-passes -o - | FileCheck %t/c.cpp
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -fmodule-file=a=%t/a.pcm %t/c.cpp -emit-llvm -disable-llvm-passes -o - | FileCheck %t/c.cpp
 
 // Test again with reduced BMI
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/a.cppm -emit-reduced-module-interface -o %t/a.pcm
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -fmodule-file=a=%t/a.pcm %t/c.cpp -S -emit-llvm -disable-llvm-passes -o - | FileCheck %t/c.cpp
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple -fmodule-file=a=%t/a.pcm %t/c.cpp -emit-llvm -disable-llvm-passes -o - | FileCheck %t/c.cpp
 
 //--- a.cppm
 export module a;
diff --git a/clang/test/Modules/pr60890.cppm b/clang/test/Modules/pr60890.cppm
index 488b512aaac2..b1d9114bf1eb 100644
--- a/clang/test/Modules/pr60890.cppm
+++ b/clang/test/Modules/pr60890.cppm
@@ -7,13 +7,13 @@
 // RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/a.cppm -o %t/a.pcm
 // RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/b.cppm -fprebuilt-module-path=%t -o %t/b.pcm
 // RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/c.cppm -fprebuilt-module-path=%t -o %t/c.pcm
-// RUN: %clang_cc1 -std=c++20 %t/d.cpp -fprebuilt-module-path=%t -S -emit-llvm -o -
+// RUN: %clang_cc1 -std=c++20 %t/d.cpp -fprebuilt-module-path=%t -emit-llvm-only
 
 // Test again with reduced BMI
 // RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface %t/a.cppm -o %t/a.pcm
 // RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface %t/b.cppm -fprebuilt-module-path=%t -o %t/b.pcm
 // RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface %t/c.cppm -fprebuilt-module-path=%t -o %t/c.pcm
-// RUN: %clang_cc1 -std=c++20 %t/d.cpp -fprebuilt-module-path=%t -S -emit-llvm -o -
+// RUN: %clang_cc1 -std=c++20 %t/d.cpp -fprebuilt-module-path=%t -emit-llvm-only
 
 //--- a.cppm
 export module a;
diff --git a/clang/test/Modules/pr61067.cppm b/clang/test/Modules/pr61067.cppm
index b7f9d22e2538..50ab7ba20129 100644
--- a/clang/test/Modules/pr61067.cppm
+++ b/clang/test/Modules/pr61067.cppm
@@ -7,10 +7,10 @@
 // RUN:     -emit-module-interface -o %t/a.pcm
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/b.cppm \
 // RUN:     -emit-module-interface -fmodule-file=a=%t/a.pcm -o %t/b.pcm
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/b.pcm -S \
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/b.pcm \
 // RUN:     -emit-llvm -fmodule-file=a=%t/a.pcm -disable-llvm-passes -o - | FileCheck %t/b.cppm
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/c.cpp -fmodule-file=a=%t/a.pcm \
-// RUN:     -S -emit-llvm -disable-llvm-passes -o - | FileCheck %t/c.cpp
+// RUN:     -emit-llvm -disable-llvm-passes -o - | FileCheck %t/c.cpp
 
 // Test again with reduced BMI
 // RUN: rm -rf %t
@@ -21,10 +21,10 @@
 // RUN:     -emit-reduced-module-interface -o %t/a.pcm
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/b.cppm \
 // RUN:     -emit-module-interface -fmodule-file=a=%t/a.pcm -o %t/b.pcm
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/b.pcm -S \
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/b.pcm \
 // RUN:     -emit-llvm -fmodule-file=a=%t/a.pcm -disable-llvm-passes -o - | FileCheck %t/b.cppm
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/c.cpp -fmodule-file=a=%t/a.pcm \
-// RUN:     -S -emit-llvm -disable-llvm-passes -o - | FileCheck %t/c.cpp
+// RUN:     -emit-llvm -disable-llvm-passes -o - | FileCheck %t/c.cpp
 
 //--- a.cppm
 export module a;
diff --git a/clang/test/Modules/pr61783.cppm b/clang/test/Modules/pr61783.cppm
index c3bc853d2dee..4c55fca4757a 100644
--- a/clang/test/Modules/pr61783.cppm
+++ b/clang/test/Modules/pr61783.cppm
@@ -4,18 +4,18 @@
 //
 // RUN: %clang_cc1 -std=c++20 -triple x86_64-pc-windows-msvc19.11.0 -fms-extensions %t/mod.cppm -emit-module-interface \
 // RUN:     -o %t/mod.pcm
-// RUN: %clang_cc1 -std=c++20 -triple x86_64-pc-windows-msvc19.11.0 -fms-extensions %t/mod.pcm -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-pc-windows-msvc19.11.0 -fms-extensions %t/mod.pcm -emit-llvm -o - | \
 // RUN:     FileCheck %t/mod.cppm
 // RUN: %clang_cc1 -std=c++20 -triple x86_64-pc-windows-msvc19.11.0 -fms-extensions %t/user.cpp -fmodule-file=mod=%t/mod.pcm \
-// RUN:     -S -emit-llvm -o - | FileCheck %t/user.cpp
+// RUN:     -emit-llvm -o - | FileCheck %t/user.cpp
 
 // Test again with reduced BMI
 // RUN: %clang_cc1 -std=c++20 -triple x86_64-pc-windows-msvc19.11.0 -fms-extensions %t/mod.cppm -emit-reduced-module-interface \
 // RUN:     -o %t/mod.pcm
-// RUN: %clang_cc1 -std=c++20 -triple x86_64-pc-windows-msvc19.11.0 -fms-extensions %t/mod.pcm -S -emit-llvm -o - | \
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-pc-windows-msvc19.11.0 -fms-extensions %t/mod.pcm -emit-llvm -o - | \
 // RUN:     FileCheck %t/mod.cppm
 // RUN: %clang_cc1 -std=c++20 -triple x86_64-pc-windows-msvc19.11.0 -fms-extensions %t/user.cpp -fmodule-file=mod=%t/mod.pcm \
-// RUN:     -S -emit-llvm -o - | FileCheck %t/user.cpp
+// RUN:     -emit-llvm -o - | FileCheck %t/user.cpp
 
 //--- mod.cppm
 module;
diff --git a/clang/test/Modules/pr64091.cpp b/clang/test/Modules/pr64091.cpp
index 6ff45e3c41ae..22984f6bfa77 100644
--- a/clang/test/Modules/pr64091.cpp
+++ b/clang/test/Modules/pr64091.cpp
@@ -15,7 +15,7 @@
 // RUN: %clang_cc1 -fmodules -fno-implicit-modules -fmodule-name=test \
 // RUN:     -fmodule-map-file=test.cppmap -fmodule-map-file=a.cppmap \
 // RUN:     -fmodule-map-file=b.cppmap -fmodule-file=a.pcm -fmodule-file=b.pcm -xc++ \
-// RUN:     test.cc -S -emit-llvm -o - | FileCheck test.cc
+// RUN:     test.cc -emit-llvm -o - | FileCheck test.cc
 
 //--- a.cppmap
 module "a" {
diff --git a/clang/test/Modules/pr67893.cppm b/clang/test/Modules/pr67893.cppm
index 95479193f8ea..488114203144 100644
--- a/clang/test/Modules/pr67893.cppm
+++ b/clang/test/Modules/pr67893.cppm
@@ -7,7 +7,7 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/m.cppm \
 // RUN:      -emit-module-interface -fprebuilt-module-path=%t -o %t/m.pcm
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/m.pcm  \
-// RUN:      -fprebuilt-module-path=%t -S -emit-llvm -o - | FileCheck %t/m.cppm
+// RUN:      -fprebuilt-module-path=%t -emit-llvm -o - | FileCheck %t/m.cppm
 
 // Test again with reduced BMI
 //
@@ -16,7 +16,7 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/m.cppm \
 // RUN:      -emit-reduced-module-interface -fprebuilt-module-path=%t -o %t/m.pcm
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/m.pcm  \
-// RUN:      -fprebuilt-module-path=%t -S -emit-llvm -o - | FileCheck %t/m.cppm
+// RUN:      -fprebuilt-module-path=%t -emit-llvm -o - | FileCheck %t/m.cppm
 
 //--- a.cppm
 export module a;
diff --git a/clang/test/Modules/pr72828.cppm b/clang/test/Modules/pr72828.cppm
index 7432f2831f24..cc324bc63429 100644
--- a/clang/test/Modules/pr72828.cppm
+++ b/clang/test/Modules/pr72828.cppm
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -std=c++23 -triple %itanium_abi_triple \
 // RUN:     %s -emit-module-interface -o %t/m.pcm
 // RUN: %clang_cc1 -std=c++23 -triple %itanium_abi_triple \
-// RUN:     -S -emit-llvm -disable-llvm-passes %t/m.pcm \
+// RUN:     -emit-llvm -disable-llvm-passes %t/m.pcm \
 // RUN:     -o - | FileCheck %s
 
 export module m;
diff --git a/clang/test/Modules/pr78830.cppm b/clang/test/Modules/pr78830.cppm
index a3b1a8021ebe..b8dbe92f5416 100644
--- a/clang/test/Modules/pr78830.cppm
+++ b/clang/test/Modules/pr78830.cppm
@@ -15,7 +15,7 @@
 // RUN:     -fmodule-file=MyVec:Vec2=%t/MyVec-Vec2.pcm \
 // RUN:     -fmodule-file=MyVec:Type=%t/MyVec-Type.pcm \
 // RUN:     -triple=x86_64-linux-gnu 
-// RUN: %clang_cc1 -std=c++20 %t/MyVec-Calculator.pcm -S -emit-llvm \
+// RUN: %clang_cc1 -std=c++20 %t/MyVec-Calculator.pcm -emit-llvm \
 // RUN:     -fmodule-file=MyVec:Vec=%t/MyVec-Vec.pcm   \
 // RUN:     -fmodule-file=MyVec:Vec2=%t/MyVec-Vec2.pcm \
 // RUN:     -fmodule-file=MyVec:Type=%t/MyVec-Type.pcm \
diff --git a/clang/test/Modules/pr91105.cppm b/clang/test/Modules/pr91105.cppm
new file mode 100644
index 000000000000..0873962c3773
--- /dev/null
+++ b/clang/test/Modules/pr91105.cppm
@@ -0,0 +1,47 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/bar.cppm -emit-module-interface -o %t/bar.pcm
+// RUN: %clang_cc1 -std=c++20 %t/foo.cc -fmodule-file=bar=%t/bar.pcm -fsyntax-only -verify
+//
+// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/bar.cppm -emit-module-interface \
+// RUN:     -o %t/bar.pcm
+// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/foo.cc \
+// RUN:     -fmodule-file=bar=%t/bar.pcm -fsyntax-only -verify
+//
+// RUN: %clang_cc1 -std=c++20 %t/bar.cppm -emit-reduced-module-interface -o %t/bar.pcm
+// RUN: %clang_cc1 -std=c++20 %t/foo.cc -fmodule-file=bar=%t/bar.pcm -fsyntax-only -verify
+//
+// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/bar.cppm -emit-reduced-module-interface \
+// RUN:     -o %t/bar.pcm
+// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/foo.cc \
+// RUN:     -fmodule-file=bar=%t/bar.pcm -fsyntax-only -verify
+
+//--- h.hpp
+#pragma once
+
+struct T {
+    constexpr T(const char *) {}
+};
+template <char... c>
+struct t {
+    inline constexpr operator T() const { return {s}; }
+
+private:
+    inline static constexpr char s[]{c..., '\0'};
+};
+
+//--- bar.cppm
+module;
+#include "h.hpp"
+export module bar;
+export inline constexpr auto k = t<'k'>{};
+
+//--- foo.cc
+// expected-no-diagnostics
+#include "h.hpp"
+import bar;
+void f() {
+  T x = k;
+}
diff --git a/clang/test/Modules/prune-non-affecting-module-map-files-textual.c b/clang/test/Modules/prune-non-affecting-module-map-files-textual.c
index fce325d4774c..90ceb089eb5d 100644
--- a/clang/test/Modules/prune-non-affecting-module-map-files-textual.c
+++ b/clang/test/Modules/prune-non-affecting-module-map-files-textual.c
@@ -21,12 +21,12 @@ module A { header "A.h" export * }
 
 // RUN: %clang_cc1 -fmodules -emit-module %t/A.modulemap -fmodule-name=A -o %t/A0.pcm \
 // RUN:   -fmodule-map-file=%t/X.modulemap
-// RUN: %clang_cc1 -fsyntax-only -module-file-info %t/A0.pcm | FileCheck %s --check-prefix=A0 --implicit-check-not=Y.modulemap
+// RUN: %clang_cc1 -module-file-info %t/A0.pcm | FileCheck %s --check-prefix=A0 --implicit-check-not=Y.modulemap
 // A0: Input file: {{.*}}X.modulemap
 
 // RUN: %clang_cc1 -fmodules -emit-module %t/A.modulemap -fmodule-name=A -o %t/A1.pcm \
 // RUN:   -fmodule-map-file=%t/X.modulemap -fmodule-map-file=%t/Y.modulemap
-// RUN: %clang_cc1 -fsyntax-only -module-file-info %t/A0.pcm | FileCheck %s --check-prefix=A1 \
+// RUN: %clang_cc1 -module-file-info %t/A0.pcm | FileCheck %s --check-prefix=A1 \
 // RUN:   --implicit-check-not=Y.modulemap
 // A1: Input file: {{.*}}X.modulemap
 
@@ -41,6 +41,6 @@ typedef X_int B_int;
 // RUN: %clang_cc1 -fmodules -emit-module %t/B.modulemap -fmodule-name=B -o %t/B.pcm \
 // RUN:   -fmodule-file=A=%t/A0.pcm \
 // RUN:   -fmodule-map-file=%t/A.modulemap -fmodule-map-file=%t/X.modulemap -fmodule-map-file=%t/Y.modulemap
-// RUN: %clang_cc1 -fsyntax-only -module-file-info %t/B.pcm | FileCheck %s --check-prefix=B \
+// RUN: %clang_cc1 -module-file-info %t/B.pcm | FileCheck %s --check-prefix=B \
 // RUN:   --implicit-check-not=X.modulemap --implicit-check-not=Y.modulemap
 // B: Input file: {{.*}}B.modulemap
diff --git a/clang/test/Modules/reduced-bmi-generating-codes.cppm b/clang/test/Modules/reduced-bmi-generating-codes.cppm
index 13dcda06437b..357476703279 100644
--- a/clang/test/Modules/reduced-bmi-generating-codes.cppm
+++ b/clang/test/Modules/reduced-bmi-generating-codes.cppm
@@ -8,7 +8,7 @@
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/a.cppm \
 // RUN:     -emit-reduced-module-interface -o %t/a.pcm
 // RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/b.cpp \
-// RUN:     -fmodule-file=a=%t/a.pcm -S -emit-llvm -o - \
+// RUN:     -fmodule-file=a=%t/a.pcm -emit-llvm -o - \
 // RUN:     | FileCheck %t/b.cpp
 
 //--- a.cppm
diff --git a/clang/test/Modules/stress1.cpp b/clang/test/Modules/stress1.cpp
index 4da1edd092e3..c47e04dcf509 100644
--- a/clang/test/Modules/stress1.cpp
+++ b/clang/test/Modules/stress1.cpp
@@ -90,7 +90,7 @@
 // RUN:   -fmodule-file=%t/m02.pcm \
 // RUN:   -fmodule-file=%t/m03.pcm \
 // RUN:   -fmodule-file=%t/merge00.pcm \
-// RUN:   -verify stress1.cpp -S -emit-llvm -o %t/stress1.ll
+// RUN:   -verify stress1.cpp -emit-llvm -o %t/stress1.ll
 //
 // RUN: %clang_cc1 -fmodules -x c++ -std=c++11 \
 // RUN:   -I Inputs/stress1 \
@@ -103,7 +103,7 @@
 // RUN:   -fmodule-file=%t/m02.pcm \
 // RUN:   -fmodule-file=%t/m03.pcm \
 // RUN:   -fmodule-file=%t/merge00.pcm \
-// RUN:   -verify stress1.cpp -S -emit-llvm -o %t/stress1_check.ll
+// RUN:   -verify stress1.cpp -emit-llvm -o %t/stress1_check.ll
 //
 // RUN: diff -u %t/stress1.ll %t/stress1_check.ll
 //
diff --git a/clang/test/OpenMP/allocate_allocator_ast_print.cpp b/clang/test/OpenMP/allocate_allocator_ast_print.cpp
index 93ea06f03f02..87a344eeb28c 100644
--- a/clang/test/OpenMP/allocate_allocator_ast_print.cpp
+++ b/clang/test/OpenMP/allocate_allocator_ast_print.cpp
@@ -1,16 +1,16 @@
 // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-apple-darwin10.6.0 -ast-print %s -o - | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10.6.0 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-linux-gnu -ast-print %s -o - | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -ast-print -o - | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -ast-print %s -o - | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-linux-gnu -ast-print %s -o - | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -ast-print -o - | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/allocate_ast_print.cpp b/clang/test/OpenMP/allocate_ast_print.cpp
index bd0a7e899441..2a4ab2175900 100644
--- a/clang/test/OpenMP/allocate_ast_print.cpp
+++ b/clang/test/OpenMP/allocate_ast_print.cpp
@@ -1,16 +1,16 @@
 // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-apple-darwin10.6.0 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10.6.0 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print
+// RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -verify %s -ast-print
 // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-linux-gnu -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print
+// RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -ast-print
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print
+// RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -verify %s -ast-print
 // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-linux-gnu -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print
+// RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -ast-print
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/amdgpu_exceptions.cpp b/clang/test/OpenMP/amdgpu_exceptions.cpp
index 2bc1d4340f75..ae7bf072680d 100644
--- a/clang/test/OpenMP/amdgpu_exceptions.cpp
+++ b/clang/test/OpenMP/amdgpu_exceptions.cpp
@@ -7,10 +7,10 @@
  * target region but emit a warning instead.
 */
 
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device %s -verify=with -Wopenmp-target-exception -analyze
 
 /**
  * The following four lines test that no warning is emitted when providing 
@@ -18,17 +18,17 @@
  * -fcxx-exceptions.
 */
 
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fexceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fexceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device %s -verify=without -Wno-openmp-target-exception -analyze
 
 /**
  * Finally we should test that we only ignore exceptions in the OpenMP 
  * offloading tool-chain
 */
 
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa %s -emit-llvm -S -verify=noexceptions -o -
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa %s -emit-llvm-only -verify=noexceptions
 
 // noexceptions-error@39 {{cannot use 'try' with exceptions disabled}}
 // noexceptions-error@40 {{cannot use 'throw' with exceptions disabled}}
diff --git a/clang/test/OpenMP/amdgpu_throw.cpp b/clang/test/OpenMP/amdgpu_throw.cpp
index c7248222d7ef..84d861371a82 100644
--- a/clang/test/OpenMP/amdgpu_throw.cpp
+++ b/clang/test/OpenMP/amdgpu_throw.cpp
@@ -7,10 +7,10 @@
  * target region but emit a warning instead.
 */
 
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device %s -verify=with -Wopenmp-target-exception -analyze
 
 /**
  * The following four lines test that no warning is emitted when providing 
@@ -18,17 +18,17 @@
  * -fcxx-exceptions.
 */
 
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fexceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fexceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device %s -verify=without -Wno-openmp-target-exception -analyze
 
 /**
  * Finally we should test that we only ignore exceptions in the OpenMP 
  * offloading tool-chain
 */
 
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa %s -emit-llvm -S -verify=noexceptions -o -
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa %s -emit-llvm-only -verify=noexceptions
 
 // noexceptions-error@37 {{cannot use 'throw' with exceptions disabled}}
 
diff --git a/clang/test/OpenMP/amdgpu_throw_trap.cpp b/clang/test/OpenMP/amdgpu_throw_trap.cpp
index 82b325275ac3..50ead7532c5a 100644
--- a/clang/test/OpenMP/amdgpu_throw_trap.cpp
+++ b/clang/test/OpenMP/amdgpu_throw_trap.cpp
@@ -1,7 +1,7 @@
 // REQUIRES: amdgpu-registered-target, x86-registered-target
 
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device %s -emit-llvm -S -Wno-openmp-target-exception -o - | FileCheck -check-prefix=DEVICE %s
-// RUN: %clang_cc1 -fopenmp -triple x86_64-pc-linux-gnu -fopenmp-is-target-device -fcxx-exceptions %s -emit-llvm -S -Wno-openmp-target-exception -o - | FileCheck -check-prefix=HOST %s
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device %s -S -Wno-openmp-target-exception -o - | FileCheck -check-prefix=DEVICE %s
+// RUN: %clang_cc1 -fopenmp -triple x86_64-pc-linux-gnu -fopenmp-is-target-device -fcxx-exceptions %s -S -Wno-openmp-target-exception -o - | FileCheck -check-prefix=HOST %s
 // DEVICE: s_trap
 // DEVICE-NOT: __cxa_throw
 // HOST: __cxa_throw
diff --git a/clang/test/OpenMP/amdgpu_try_catch.cpp b/clang/test/OpenMP/amdgpu_try_catch.cpp
index 76568f2cc696..3ea1a76f2a68 100644
--- a/clang/test/OpenMP/amdgpu_try_catch.cpp
+++ b/clang/test/OpenMP/amdgpu_try_catch.cpp
@@ -7,10 +7,10 @@
  * target region but emit a warning instead.
 */
 
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device %s -verify=with -Wopenmp-target-exception -analyze
 
 /**
  * The following four lines test that no warning is emitted when providing 
@@ -18,17 +18,17 @@
  * -fcxx-exceptions.
 */
 
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fexceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fcxx-exceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -fexceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple amdgcn-amd-amdhsa -fopenmp-is-target-device %s -verify=without -Wno-openmp-target-exception -analyze
 
 /**
  * Finally we should test that we only ignore exceptions in the OpenMP 
  * offloading tool-chain
 */
 
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa %s -emit-llvm -S -verify=noexceptions -o -
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa %s -emit-llvm-only -verify=noexceptions
 
 // noexceptions-error@38 {{cannot use 'try' with exceptions disabled}}
 
diff --git a/clang/test/OpenMP/assumes_print.cpp b/clang/test/OpenMP/assumes_print.cpp
index da3629f70408..d8bdaaaf4518 100644
--- a/clang/test/OpenMP/assumes_print.cpp
+++ b/clang/test/OpenMP/assumes_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/assumes_template_print.cpp b/clang/test/OpenMP/assumes_template_print.cpp
index e0bc3e9884ca..614138b2ee0b 100644
--- a/clang/test/OpenMP/assumes_template_print.cpp
+++ b/clang/test/OpenMP/assumes_template_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 // It is unclear if we want to annotate the template instantiations, e.g., S<int>::foo, or not in the two
diff --git a/clang/test/OpenMP/atomic_ast_print.cpp b/clang/test/OpenMP/atomic_ast_print.cpp
index b16a5fcdbdd3..c97606797304 100644
--- a/clang/test/OpenMP/atomic_ast_print.cpp
+++ b/clang/test/OpenMP/atomic_ast_print.cpp
@@ -1,18 +1,18 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -DOMP51 -verify -fopenmp -ast-print %s | FileCheck --check-prefixes=CHECK,CHECK-51 %s
 // RUN: %clang_cc1 -DOMP51 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP51 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck --check-prefixes=CHECK,CHECK-51 %s
+// RUN: %clang_cc1 -DOMP51 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck --check-prefixes=CHECK,CHECK-51 %s
 
 // RUN: %clang_cc1 -DOMP51 -verify -fopenmp-simd -ast-print %s | FileCheck --check-prefixes=CHECK,CHECK-51 %s
 // RUN: %clang_cc1 -DOMP51 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP51 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck --check-prefixes=CHECK,CHECK-51 %s
+// RUN: %clang_cc1 -DOMP51 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck --check-prefixes=CHECK,CHECK-51 %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/barrier_ast_print.cpp b/clang/test/OpenMP/barrier_ast_print.cpp
index 9b7398b3d5b4..3a82de2aa4a6 100644
--- a/clang/test/OpenMP/barrier_ast_print.cpp
+++ b/clang/test/OpenMP/barrier_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/cancel_ast_print.cpp b/clang/test/OpenMP/cancel_ast_print.cpp
index f5173ed4ca51..4ee8c8d1446e 100644
--- a/clang/test/OpenMP/cancel_ast_print.cpp
+++ b/clang/test/OpenMP/cancel_ast_print.cpp
@@ -1,18 +1,18 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // expected-no-diagnostics
 
diff --git a/clang/test/OpenMP/cancel_codegen.cpp b/clang/test/OpenMP/cancel_codegen.cpp
index 03024cf331b2..186f0ab7356d 100644
--- a/clang/test/OpenMP/cancel_codegen.cpp
+++ b/clang/test/OpenMP/cancel_codegen.cpp
@@ -1,27 +1,27 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK1
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple x86_64-apple-darwin13.4.0 -emit-pch -o %t.0 %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t.0 -fsyntax-only -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t.0 -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-enable-irbuilder -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK3
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple x86_64-apple-darwin13.4.0 -emit-pch -o %t.1 %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-enable-irbuilder -std=c++11 -include-pch %t.1 -fsyntax-only -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-enable-irbuilder -std=c++11 -include-pch %t.1 -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - %s | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple x86_64-apple-darwin13.4.0 -emit-pch -o %t.2 %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t.2 -fsyntax-only -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t.2 -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
 
 // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK1
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin13.4.0 -emit-pch -o %t.3 %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t.3 -fsyntax-only -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t.3 -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-enable-irbuilder -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK3
 // RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -std=c++11 -triple x86_64-apple-darwin13.4.0 -emit-pch -o %t.4 %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -std=c++11 -include-pch %t.4 -fsyntax-only -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3
+// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -std=c++11 -include-pch %t.4 -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - %s | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-apple-darwin13.4.0 -emit-pch -o %t.5 %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t.5 -fsyntax-only -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t.5 -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
 
 // expected-no-diagnostics
 #ifndef HEADER
diff --git a/clang/test/OpenMP/cancellation_point_ast_print.cpp b/clang/test/OpenMP/cancellation_point_ast_print.cpp
index d27d2b75fd4c..23b1765048f1 100644
--- a/clang/test/OpenMP/cancellation_point_ast_print.cpp
+++ b/clang/test/OpenMP/cancellation_point_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/cancellation_point_codegen.cpp b/clang/test/OpenMP/cancellation_point_codegen.cpp
index 85c7585b357c..4fa73bff064e 100644
--- a/clang/test/OpenMP/cancellation_point_codegen.cpp
+++ b/clang/test/OpenMP/cancellation_point_codegen.cpp
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
 // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK1
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin13.4.0 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - %s | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-apple-darwin13.4.0 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -triple x86_64-apple-darwin13.4.0 -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
diff --git a/clang/test/OpenMP/constexpr_capture.cpp b/clang/test/OpenMP/constexpr_capture.cpp
index a85442e11ac3..ee5283e66154 100644
--- a/clang/test/OpenMP/constexpr_capture.cpp
+++ b/clang/test/OpenMP/constexpr_capture.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-linux -S -emit-llvm %s -o - -std=c++11 2>&1 | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++  -fopenmp-targets=x86_64-pc-linux-gnu -triple powerpc64le-unknown-linux -S -emit-llvm %s -o - -std=c++11 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-linux -emit-llvm %s -o - -std=c++11 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++  -fopenmp-targets=x86_64-pc-linux-gnu -triple powerpc64le-unknown-linux -emit-llvm %s -o - -std=c++11 2>&1 | FileCheck %s
 // expected-no-diagnostics
 
 template <int __v> struct integral_constant {
diff --git a/clang/test/OpenMP/critical_ast_print.cpp b/clang/test/OpenMP/critical_ast_print.cpp
index 20cb9bf99a6f..74491395450c 100644
--- a/clang/test/OpenMP/critical_ast_print.cpp
+++ b/clang/test/OpenMP/critical_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/declare_mapper_ast_print.c b/clang/test/OpenMP/declare_mapper_ast_print.c
index e6a0546fe5c9..3c554a106fe4 100644
--- a/clang/test/OpenMP/declare_mapper_ast_print.c
+++ b/clang/test/OpenMP/declare_mapper_ast_print.c
@@ -1,26 +1,26 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -DOMP51 -verify -fopenmp -ast-print %s | FileCheck -check-prefixes=CHECK,OMP51 %s
 // RUN: %clang_cc1 -DOMP51 -fopenmp -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP51 -fopenmp -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s
+// RUN: %clang_cc1 -DOMP51 -fopenmp -include-pch %t -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s
 
 // RUN: %clang_cc1 -DOMP51 -verify -fopenmp-simd -ast-print %s | FileCheck -check-prefixes=CHECK,OMP51 %s
 // RUN: %clang_cc1 -DOMP51 -fopenmp-simd -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP51 -fopenmp-simd -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s
+// RUN: %clang_cc1 -DOMP51 -fopenmp-simd -include-pch %t -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s
 
 // RUN: %clang_cc1 -DOMP52 -verify -fopenmp -fopenmp-version=52 -ast-print %s | FileCheck -check-prefixes=CHECK,OMP52 %s
 // RUN: %clang_cc1 -DOMP52 -fopenmp -fopenmp-version=52 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP52 -fopenmp -fopenmp-version=52 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP52 %s
+// RUN: %clang_cc1 -DOMP52 -fopenmp -fopenmp-version=52 -include-pch %t -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP52 %s
 
 // RUN: %clang_cc1 -DOMP52 -verify -fopenmp-simd -fopenmp-version=52 -ast-print %s | FileCheck -check-prefixes=CHECK,OMP52 %s
 // RUN: %clang_cc1 -DOMP52 -fopenmp-simd -fopenmp-version=52 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP52 -fopenmp-simd -fopenmp-version=52 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP52 %s
+// RUN: %clang_cc1 -DOMP52 -fopenmp-simd -fopenmp-version=52 -include-pch %t -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP52 %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/declare_mapper_ast_print.cpp b/clang/test/OpenMP/declare_mapper_ast_print.cpp
index 9761fac70cab..422fa9981672 100644
--- a/clang/test/OpenMP/declare_mapper_ast_print.cpp
+++ b/clang/test/OpenMP/declare_mapper_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/declare_reduction_ast_print.c b/clang/test/OpenMP/declare_reduction_ast_print.c
index 74c28b3219f5..e1bbf99398a8 100644
--- a/clang/test/OpenMP/declare_reduction_ast_print.c
+++ b/clang/test/OpenMP/declare_reduction_ast_print.c
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/declare_reduction_ast_print.cpp b/clang/test/OpenMP/declare_reduction_ast_print.cpp
index 15bee2800ba0..dcb78ff1c63b 100644
--- a/clang/test/OpenMP/declare_reduction_ast_print.cpp
+++ b/clang/test/OpenMP/declare_reduction_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/declare_simd_ast_print.c b/clang/test/OpenMP/declare_simd_ast_print.c
index 414b01038793..990a6cd8c650 100644
--- a/clang/test/OpenMP/declare_simd_ast_print.c
+++ b/clang/test/OpenMP/declare_simd_ast_print.c
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/declare_simd_ast_print.cpp b/clang/test/OpenMP/declare_simd_ast_print.cpp
index 565dc2dfc04d..2704aae8617f 100644
--- a/clang/test/OpenMP/declare_simd_ast_print.cpp
+++ b/clang/test/OpenMP/declare_simd_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/declare_target_ast_print.cpp b/clang/test/OpenMP/declare_target_ast_print.cpp
index 43cccf763e97..3334b7491fab 100644
--- a/clang/test/OpenMP/declare_target_ast_print.cpp
+++ b/clang/test/OpenMP/declare_target_ast_print.cpp
@@ -1,19 +1,19 @@
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -I %S/Inputs -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -I %S/Inputs -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -I %S/Inputs -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -I %S/Inputs -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -I %S/Inputs -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -I %S/Inputs -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -I %S/Inputs -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -I %S/Inputs -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -I %S/Inputs -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -x c++ -std=c++11 -I %S/Inputs -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -I %S/Inputs -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -I %S/Inputs -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -I %S/Inputs -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -I %S/Inputs -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -I %S/Inputs -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -I %S/Inputs -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/declare_target_codegen.cpp b/clang/test/OpenMP/declare_target_codegen.cpp
index a5a9b790b468..ba93772ede3e 100644
--- a/clang/test/OpenMP/declare_target_codegen.cpp
+++ b/clang/test/OpenMP/declare_target_codegen.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc -DLOAD
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DLOAD | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -DLOAD | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -DOMP5 | FileCheck %s --check-prefix HOST5
@@ -15,7 +15,7 @@
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=45
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-version=45 | FileCheck %s --check-prefix SIMD-ONLY
-// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=45
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=45
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify -o - -fopenmp-version=45 | FileCheck %s --check-prefix SIMD-ONLY
 
 // expected-no-diagnostics
diff --git a/clang/test/OpenMP/declare_target_link_codegen.cpp b/clang/test/OpenMP/declare_target_link_codegen.cpp
index dd1ac813efaa..189c9ac59c15 100644
--- a/clang/test/OpenMP/declare_target_link_codegen.cpp
+++ b/clang/test/OpenMP/declare_target_link_codegen.cpp
@@ -2,13 +2,13 @@
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-windows-gnu -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix HOST-COFF --check-prefix CHECK
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix DEVICE --check-prefix CHECK
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --check-prefix DEVICE --check-prefix CHECK
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix SIMD-ONLY
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o -| FileCheck %s --check-prefix SIMD-ONLY
-// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify -o - | FileCheck %s --check-prefix SIMD-ONLY
 
 // expected-no-diagnostics
diff --git a/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp b/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp
index 4f9a86f1e008..9335df10f957 100644
--- a/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp
+++ b/clang/test/OpenMP/declare_variant_device_kind_codegen.cpp
@@ -20,22 +20,22 @@
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -fopenmp-targets=x86_64-unknown-linux -emit-llvm-bc %s -o %t-host.bc -DCPU
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -o - -DCPU | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DCPU
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DCPU
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - -DCPU | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -fopenmp-targets=ppc64le-unknown-linux -emit-llvm-bc %s -o %t-host.bc -DCPU
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -o - -DCPU | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DCPU
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DCPU
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - -DCPU | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -fopenmp-targets=x86_64-unknown-linux -emit-llvm-bc %s -o %t-host.bc -DNOHOST
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -o - -DNOHOST | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DNOHOST
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DNOHOST
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - -DNOHOST | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -fopenmp-targets=ppc64le-unknown-linux -emit-llvm-bc %s -o %t-host.bc -DNOHOST
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -o - -DNOHOST | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DNOHOST
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DNOHOST
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - -DNOHOST | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fexceptions -fcxx-exceptions -o - -fsanitize-address-use-after-scope -DHOST | FileCheck %s
@@ -60,22 +60,22 @@
 
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -fopenmp-targets=x86_64-unknown-linux -emit-llvm-bc %s -o %t-host.bc -DCPU
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -o - -DCPU | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DCPU
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DCPU
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - -DCPU | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple ppc64le-unknown-linux -fopenmp-targets=ppc64le-unknown-linux -emit-llvm-bc %s -o %t-host.bc -DCPU
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -o - -DCPU | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DCPU
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple ppc64le-unknown-linux %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DCPU
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - -DCPU | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -fopenmp-targets=x86_64-unknown-linux -emit-llvm-bc %s -o %t-host.bc -DNOHOST
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -o - -DNOHOST | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DNOHOST
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DNOHOST
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - -DNOHOST | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple ppc64le-unknown-linux -fopenmp-targets=ppc64le-unknown-linux -emit-llvm-bc %s -o %t-host.bc -DNOHOST
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -o - -DNOHOST | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DNOHOST
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple ppc64le-unknown-linux %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t -DNOHOST
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple ppc64le-unknown-linux -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - -DNOHOST | FileCheck %s
 
 // expected-no-diagnostics
diff --git a/clang/test/OpenMP/declare_variant_mixed_codegen.c b/clang/test/OpenMP/declare_variant_mixed_codegen.c
index 2ee8a4e184c9..ee44ead25b74 100644
--- a/clang/test/OpenMP/declare_variant_mixed_codegen.c
+++ b/clang/test/OpenMP/declare_variant_mixed_codegen.c
@@ -3,7 +3,7 @@
 // RUN: %clang_cc1 -fopenmp -x c -triple x86_64-unknown-linux -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 | FileCheck %s --check-prefix HOST
 // RUN: %clang_cc1 -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=45
 // RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-version=45 | FileCheck %s --check-prefix GPU
-// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=45
+// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=45
 // RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -fopenmp-version=45 | FileCheck %s --check-prefix GPU
 
 // RUN: %clang_cc1 -verify -fopenmp -x c -triple x86_64-unknown-linux -emit-llvm %s -o - | FileCheck %s --check-prefix HOST
@@ -11,7 +11,7 @@
 // RUN: %clang_cc1 -fopenmp -x c -triple x86_64-unknown-linux -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix HOST
 // RUN: %clang_cc1 -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix GPU
-// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
+// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
 // RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --check-prefix GPU
 // expected-no-diagnostics
 
diff --git a/clang/test/OpenMP/depobj_ast_print.cpp b/clang/test/OpenMP/depobj_ast_print.cpp
index 3959396f8095..f536799922a0 100644
--- a/clang/test/OpenMP/depobj_ast_print.cpp
+++ b/clang/test/OpenMP/depobj_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/distribute_ast_print.cpp b/clang/test/OpenMP/distribute_ast_print.cpp
index fb90505e6efd..7467a081f39f 100644
--- a/clang/test/OpenMP/distribute_ast_print.cpp
+++ b/clang/test/OpenMP/distribute_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wno-openmp-mapping | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wno-openmp-mapping | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/distribute_dist_schedule_ast_print.cpp b/clang/test/OpenMP/distribute_dist_schedule_ast_print.cpp
index de5e59aef73b..16fda05b9112 100644
--- a/clang/test/OpenMP/distribute_dist_schedule_ast_print.cpp
+++ b/clang/test/OpenMP/distribute_dist_schedule_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/distribute_parallel_for_ast_print.cpp b/clang/test/OpenMP/distribute_parallel_for_ast_print.cpp
index b3424e8928c7..7af4e5f6b2b4 100644
--- a/clang/test/OpenMP/distribute_parallel_for_ast_print.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_ast_print.cpp
@@ -1,22 +1,22 @@
 // RUN: %clang_cc1 -verify -std=c++11 -fopenmp -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -std=c++11 -fopenmp -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -std=c++11 -fopenmp -fopenmp-version=51 -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 
 // RUN: %clang_cc1 -verify -std=c++11 -fopenmp-simd -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -std=c++11 -fopenmp-simd -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -std=c++11 -fopenmp-simd -fopenmp-version=51 -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp
index dfe79d6a9493..2bbf29acba16 100644
--- a/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp
@@ -2728,7 +2728,7 @@ int main() {
 //
 //
 // CHECK9-LABEL: define {{[^@]+}}@_ZN1SC1El
-// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -2741,7 +2741,7 @@ int main() {
 //
 //
 // CHECK9-LABEL: define {{[^@]+}}@_ZN1ScvcEv
-// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 {
+// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -3214,7 +3214,7 @@ int main() {
 //
 //
 // CHECK9-LABEL: define {{[^@]+}}@_ZN1SD1Ev
-// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 {
+// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -3224,7 +3224,7 @@ int main() {
 //
 //
 // CHECK9-LABEL: define {{[^@]+}}@_ZN1SC2El
-// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 {
+// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -3238,7 +3238,7 @@ int main() {
 //
 //
 // CHECK9-LABEL: define {{[^@]+}}@_ZN1SD2Ev
-// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 {
+// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -3980,7 +3980,7 @@ int main() {
 //
 //
 // CHECK13-LABEL: define {{[^@]+}}@_ZN1SC1El
-// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat {
 // CHECK13-NEXT:  entry:
 // CHECK13-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK13-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -3993,7 +3993,7 @@ int main() {
 //
 //
 // CHECK13-LABEL: define {{[^@]+}}@_ZN1ScvcEv
-// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 {
+// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat {
 // CHECK13-NEXT:  entry:
 // CHECK13-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK13-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -4466,7 +4466,7 @@ int main() {
 //
 //
 // CHECK13-LABEL: define {{[^@]+}}@_ZN1SD1Ev
-// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 {
+// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat {
 // CHECK13-NEXT:  entry:
 // CHECK13-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK13-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -4476,7 +4476,7 @@ int main() {
 //
 //
 // CHECK13-LABEL: define {{[^@]+}}@_ZN1SC2El
-// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 {
+// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat {
 // CHECK13-NEXT:  entry:
 // CHECK13-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK13-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -5093,7 +5093,7 @@ int main() {
 //
 //
 // CHECK13-LABEL: define {{[^@]+}}@_ZN1SD2Ev
-// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 {
+// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat {
 // CHECK13-NEXT:  entry:
 // CHECK13-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK13-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_ast_print.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_ast_print.cpp
index 94ba8494d712..ae3caa88d454 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_ast_print.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_ast_print.cpp
@@ -1,22 +1,22 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix CHECK --check-prefix OMP51
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=51 -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix CHECK --check-prefix OMP51
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp
index 5c1ca41b1c62..ff89ea8342dc 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp
@@ -3235,7 +3235,7 @@ int main() {
 //
 //
 // CHECK9-LABEL: define {{[^@]+}}@_ZN1SC1El
-// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -3248,7 +3248,7 @@ int main() {
 //
 //
 // CHECK9-LABEL: define {{[^@]+}}@_ZN1ScvcEv
-// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 {
+// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -3749,7 +3749,7 @@ int main() {
 //
 //
 // CHECK9-LABEL: define {{[^@]+}}@_ZN1SD1Ev
-// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 {
+// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -3759,7 +3759,7 @@ int main() {
 //
 //
 // CHECK9-LABEL: define {{[^@]+}}@_ZN1SC2El
-// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 {
+// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -3773,7 +3773,7 @@ int main() {
 //
 //
 // CHECK9-LABEL: define {{[^@]+}}@_ZN1SD2Ev
-// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 {
+// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat {
 // CHECK9-NEXT:  entry:
 // CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -4558,7 +4558,7 @@ int main() {
 //
 //
 // CHECK11-LABEL: define {{[^@]+}}@_ZN1SC1El
-// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK11-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -4571,7 +4571,7 @@ int main() {
 //
 //
 // CHECK11-LABEL: define {{[^@]+}}@_ZN1ScvcEv
-// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 {
+// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK11-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -4747,7 +4747,7 @@ int main() {
 //
 //
 // CHECK11-LABEL: define {{[^@]+}}@_ZN1SD1Ev
-// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK11-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -4757,7 +4757,7 @@ int main() {
 //
 //
 // CHECK11-LABEL: define {{[^@]+}}@_ZN1SC2El
-// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR4]] comdat {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK11-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -4771,7 +4771,7 @@ int main() {
 //
 //
 // CHECK11-LABEL: define {{[^@]+}}@_ZN1SD2Ev
-// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat {
 // CHECK11-NEXT:  entry:
 // CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK11-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -4910,7 +4910,7 @@ int main() {
 //
 //
 // CHECK13-LABEL: define {{[^@]+}}@_ZN1SC1El
-// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat {
 // CHECK13-NEXT:  entry:
 // CHECK13-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK13-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -4923,7 +4923,7 @@ int main() {
 //
 //
 // CHECK13-LABEL: define {{[^@]+}}@_ZN1ScvcEv
-// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 {
+// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat {
 // CHECK13-NEXT:  entry:
 // CHECK13-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK13-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -5424,7 +5424,7 @@ int main() {
 //
 //
 // CHECK13-LABEL: define {{[^@]+}}@_ZN1SD1Ev
-// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 {
+// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat {
 // CHECK13-NEXT:  entry:
 // CHECK13-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK13-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -5434,7 +5434,7 @@ int main() {
 //
 //
 // CHECK13-LABEL: define {{[^@]+}}@_ZN1SC2El
-// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 {
+// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat {
 // CHECK13-NEXT:  entry:
 // CHECK13-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK13-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
@@ -6107,7 +6107,7 @@ int main() {
 //
 //
 // CHECK13-LABEL: define {{[^@]+}}@_ZN1SD2Ev
-// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 {
+// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat {
 // CHECK13-NEXT:  entry:
 // CHECK13-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK13-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
diff --git a/clang/test/OpenMP/distribute_simd_ast_print.cpp b/clang/test/OpenMP/distribute_simd_ast_print.cpp
index aed9d6376ef5..913020300831 100644
--- a/clang/test/OpenMP/distribute_simd_ast_print.cpp
+++ b/clang/test/OpenMP/distribute_simd_ast_print.cpp
@@ -1,29 +1,29 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -ast-print %s -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP52
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=51 -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // expected-no-diagnostics
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -ast-print %s -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP52
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/error_ast_print.cpp b/clang/test/OpenMP/error_ast_print.cpp
index c6d361b1a3b8..8e40f7da7132 100644
--- a/clang/test/OpenMP/error_ast_print.cpp
+++ b/clang/test/OpenMP/error_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -fopenmp-version=51 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -fopenmp-version=51 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -fopenmp-version=51 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=51 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -fopenmp-version=51 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -fopenmp-version=51 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -fopenmp-version=51 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/flush_ast_print.cpp b/clang/test/OpenMP/flush_ast_print.cpp
index f3d81d111b46..9578ada02022 100644
--- a/clang/test/OpenMP/flush_ast_print.cpp
+++ b/clang/test/OpenMP/flush_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/for_ast_print.cpp b/clang/test/OpenMP/for_ast_print.cpp
index abc27caa9ae8..f793050067d8 100644
--- a/clang/test/OpenMP/for_ast_print.cpp
+++ b/clang/test/OpenMP/for_ast_print.cpp
@@ -1,22 +1,22 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wsign-conversion | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -ast-print %s -Wsign-conversion -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -ast-print %s -Wsign-conversion -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP52
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wsign-conversion | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=51 -ast-print %s -Wsign-conversion -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -ast-print %s -Wsign-conversion -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP52
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/for_ast_print_cxx20.cpp b/clang/test/OpenMP/for_ast_print_cxx20.cpp
index 800ce1230e4e..c3c8326e7db5 100644
--- a/clang/test/OpenMP/for_ast_print_cxx20.cpp
+++ b/clang/test/OpenMP/for_ast_print_cxx20.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp --std=c++20 -ast-print %s -Wsign-conversion | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++20 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++20 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++20 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd --std=c++20 -ast-print %s -Wsign-conversion | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++20 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++20 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++20 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/for_loop_auto.cpp b/clang/test/OpenMP/for_loop_auto.cpp
index 4467de6bba18..2215f9bd651a 100644
--- a/clang/test/OpenMP/for_loop_auto.cpp
+++ b/clang/test/OpenMP/for_loop_auto.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print -std=c++20 %s -Wsign-conversion | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++20 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++20 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++20 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print -std=c++20 %s -Wsign-conversion | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++20 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++20 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++20 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/for_simd_ast_print.cpp b/clang/test/OpenMP/for_simd_ast_print.cpp
index 0e8c13f7cc8f..7f77107b8ed7 100644
--- a/clang/test/OpenMP/for_simd_ast_print.cpp
+++ b/clang/test/OpenMP/for_simd_ast_print.cpp
@@ -1,29 +1,29 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -ast-print %s -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -ast-print %s -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP52
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=51 -ast-print %s -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // expected-no-diagnostics
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -ast-print %s -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP52
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/masked_ast_print.cpp b/clang/test/OpenMP/masked_ast_print.cpp
index 719a2fbf8a94..1deef7583f9b 100644
--- a/clang/test/OpenMP/masked_ast_print.cpp
+++ b/clang/test/OpenMP/masked_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/masked_taskloop_ast_print.cpp b/clang/test/OpenMP/masked_taskloop_ast_print.cpp
index b33b0f6c6ab6..3b3f74828ba4 100644
--- a/clang/test/OpenMP/masked_taskloop_ast_print.cpp
+++ b/clang/test/OpenMP/masked_taskloop_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/masked_taskloop_simd_ast_print.cpp b/clang/test/OpenMP/masked_taskloop_simd_ast_print.cpp
index 68890e614f24..e35cc7bb7b7d 100644
--- a/clang/test/OpenMP/masked_taskloop_simd_ast_print.cpp
+++ b/clang/test/OpenMP/masked_taskloop_simd_ast_print.cpp
@@ -1,16 +1,16 @@
 // RUN: %clang_cc1 -verify -fopenmp -DOMP5 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -fopenmp -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
+// RUN: %clang_cc1 -fopenmp -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -DOMP5 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -fopenmp-simd -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
+// RUN: %clang_cc1 -fopenmp-simd -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/master_ast_print.cpp b/clang/test/OpenMP/master_ast_print.cpp
index 40b14c5c5086..cf247ac90bbb 100644
--- a/clang/test/OpenMP/master_ast_print.cpp
+++ b/clang/test/OpenMP/master_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/master_taskloop_ast_print.cpp b/clang/test/OpenMP/master_taskloop_ast_print.cpp
index 4f58d03250de..a35fd033daed 100644
--- a/clang/test/OpenMP/master_taskloop_ast_print.cpp
+++ b/clang/test/OpenMP/master_taskloop_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/master_taskloop_simd_ast_print.cpp b/clang/test/OpenMP/master_taskloop_simd_ast_print.cpp
index cc3de60938e7..84fa1eebc164 100644
--- a/clang/test/OpenMP/master_taskloop_simd_ast_print.cpp
+++ b/clang/test/OpenMP/master_taskloop_simd_ast_print.cpp
@@ -1,18 +1,18 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -DOMP5 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -DOMP51 | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix CHECK --check-prefix OMP51
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -DOMP5 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -fopenmp-simd -DOMP5 -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -DOMP5 -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
+// RUN: %clang_cc1 -fopenmp-simd -DOMP5 -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -DOMP51 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // RUN: %clang_cc1 -fopenmp-simd -DOMP51 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -DOMP51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP51
+// RUN: %clang_cc1 -fopenmp-simd -DOMP51 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/nvptx_declare_target_var_ctor_dtor_codegen.cpp b/clang/test/OpenMP/nvptx_declare_target_var_ctor_dtor_codegen.cpp
index 1d9ef0c39816..767fb3a6fbe1 100644
--- a/clang/test/OpenMP/nvptx_declare_target_var_ctor_dtor_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_declare_target_var_ctor_dtor_codegen.cpp
@@ -1,13 +1,13 @@
 // RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s --check-prefix HOST --check-prefix CHECK
 // RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix DEVICE --check-prefix CHECK
-// RUN: %clang_cc1 -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
+// RUN: %clang_cc1 -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
 // RUN: %clang_cc1 -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --check-prefix DEVICE --check-prefix CHECK
 
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o - | FileCheck %s --check-prefix SIMD-ONLY
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -o -| FileCheck %s --check-prefix SIMD-ONLY
-// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --check-prefix SIMD-ONLY
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp b/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp
index 1cfa992fe200..964982035981 100644
--- a/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_declare_variant_device_kind_codegen.cpp
@@ -1,21 +1,21 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -DGPU
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DGPU | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}'
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -DGPU
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -DGPU
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -DGPU | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}'
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -DNOHOST
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DNOHOST | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}'
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -DNOHOST
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -DNOHOST
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -DNOHOST | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}'
 
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -DGPU
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DGPU | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}'
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -DGPU
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -DGPU
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -DGPU | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}'
 
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -DNOHOST
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DNOHOST | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}'
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -DNOHOST
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -DNOHOST
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -DNOHOST | FileCheck %s --implicit-check-not='ret i32 {{6|7|9|10|12|14|17|18|20|21|22|23|24|26}}'
 // expected-no-diagnostics
 
diff --git a/clang/test/OpenMP/nvptx_declare_variant_implementation_vendor_codegen.cpp b/clang/test/OpenMP/nvptx_declare_variant_implementation_vendor_codegen.cpp
index 8f1c0479c358..d17080ded521 100644
--- a/clang/test/OpenMP/nvptx_declare_variant_implementation_vendor_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_declare_variant_implementation_vendor_codegen.cpp
@@ -1,11 +1,11 @@
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=45
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-version=45 | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|9|10|12|13|14|15|17|18|19|20|21|22|23|24|26}}'
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=45
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t -fopenmp-version=45
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - -fopenmp-version=45 | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|9|10|12|13|14|15|17|18|19|20|21|22|23|24|26}}'
 
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|9|10|12|13|14|15|17|18|19|20|21|22|23|24|26}}'
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --implicit-check-not='ret i32 {{6|7|8|9|10|12|13|14|15|17|18|19|20|21|22|23|24|26}}'
 // expected-no-diagnostics
 
diff --git a/clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp b/clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp
index 9b232f52e31e..0a5690e77a7b 100644
--- a/clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp
+++ b/clang/test/OpenMP/nvptx_declare_variant_name_mangling.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --implicit-check-not='call i32 {@_Z3bazv|@_Z3barv}'
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --implicit-check-not='call i32 {@_Z3bazv|@_Z3barv}'
 // expected-no-diagnostics
 
diff --git a/clang/test/OpenMP/nvptx_exceptions.cpp b/clang/test/OpenMP/nvptx_exceptions.cpp
index af283b0649d8..12447fa0ada8 100644
--- a/clang/test/OpenMP/nvptx_exceptions.cpp
+++ b/clang/test/OpenMP/nvptx_exceptions.cpp
@@ -7,10 +7,10 @@
  * target region but emit a warning instead.
 */
 
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device %s -verify=with -Wopenmp-target-exception -analyze
 
 /**
  * The following four lines test that no warning is emitted when providing 
@@ -18,17 +18,17 @@
  * -fcxx-exceptions.
 */
 
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fexceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fexceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device %s -verify=without -Wno-openmp-target-exception -analyze
 
 /**
  * Finally we should test that we only ignore exceptions in the OpenMP 
  * offloading tool-chain
 */
 
-// RUN: %clang_cc1 -triple nvptx64 %s -emit-llvm -S -verify=noexceptions -o -
+// RUN: %clang_cc1 -triple nvptx64 %s -emit-llvm-only -verify=noexceptions
 
 // noexceptions-error@39 {{cannot use 'try' with exceptions disabled}}
 // noexceptions-error@40 {{cannot use 'throw' with exceptions disabled}}
diff --git a/clang/test/OpenMP/nvptx_lambda_capturing.cpp b/clang/test/OpenMP/nvptx_lambda_capturing.cpp
index 86035e17ef3b..641fbc38dd6b 100644
--- a/clang/test/OpenMP/nvptx_lambda_capturing.cpp
+++ b/clang/test/OpenMP/nvptx_lambda_capturing.cpp
@@ -5,7 +5,7 @@
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK2
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --check-prefix=CHECK3
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -o - | FileCheck %s --check-prefix=CHECK3
 
diff --git a/clang/test/OpenMP/nvptx_throw.cpp b/clang/test/OpenMP/nvptx_throw.cpp
index 8bc4366fcf76..e85046ea5dc1 100644
--- a/clang/test/OpenMP/nvptx_throw.cpp
+++ b/clang/test/OpenMP/nvptx_throw.cpp
@@ -7,10 +7,10 @@
  * target region but emit a warning instead.
 */
 
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device %s -verify=with -Wopenmp-target-exception -analyze
 
 /**
  * The following four lines test that no warning is emitted when providing 
@@ -18,17 +18,17 @@
  * -fcxx-exceptions.
 */
 
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fexceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fexceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device %s -verify=without -Wno-openmp-target-exception -analyze
 
 /**
  * Finally we should test that we only ignore exceptions in the OpenMP 
  * offloading tool-chain
 */
 
-// RUN: %clang_cc1 -triple nvptx64 %s -emit-llvm -S -verify=noexceptions -o -
+// RUN: %clang_cc1 -triple nvptx64 %s -emit-llvm-only -verify=noexceptions
 
 // noexceptions-error@37 {{cannot use 'throw' with exceptions disabled}}
 
diff --git a/clang/test/OpenMP/nvptx_throw_trap.cpp b/clang/test/OpenMP/nvptx_throw_trap.cpp
index c1c76c4e1b18..b13a09136c17 100644
--- a/clang/test/OpenMP/nvptx_throw_trap.cpp
+++ b/clang/test/OpenMP/nvptx_throw_trap.cpp
@@ -1,7 +1,7 @@
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device %s -emit-llvm -S -Wno-openmp-target-exception -o - | FileCheck -check-prefix=DEVICE %s
-// RUN: %clang_cc1 -fopenmp -triple x86_64-pc-linux-gnu -fopenmp-is-target-device -fcxx-exceptions %s -emit-llvm -S -Wno-openmp-target-exception -o - | FileCheck -check-prefix=HOST %s
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device %s -S -Wno-openmp-target-exception -o - | FileCheck -check-prefix=DEVICE %s
+// RUN: %clang_cc1 -fopenmp -triple x86_64-pc-linux-gnu -fopenmp-is-target-device -fcxx-exceptions %s -S -Wno-openmp-target-exception -o - | FileCheck -check-prefix=HOST %s
 // DEVICE: trap;
 // DEVICE-NOT: __cxa_throw
 // HOST: __cxa_throw
diff --git a/clang/test/OpenMP/nvptx_try_catch.cpp b/clang/test/OpenMP/nvptx_try_catch.cpp
index 0e9954c6e223..9bd30d166459 100644
--- a/clang/test/OpenMP/nvptx_try_catch.cpp
+++ b/clang/test/OpenMP/nvptx_try_catch.cpp
@@ -7,10 +7,10 @@
  * target region but emit a warning instead.
 */
 
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fexceptions %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device %s -emit-llvm -S -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fexceptions %s -verify=with -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device %s -verify=with -Wopenmp-target-exception -analyze
 
 /**
  * The following four lines test that no warning is emitted when providing 
@@ -18,17 +18,17 @@
  * -fcxx-exceptions.
 */
 
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fexceptions %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
-// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device %s -emit-llvm -S -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fcxx-exceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device -fexceptions %s -verify=without -Wno-openmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple nvptx64 -fopenmp-is-target-device %s -verify=without -Wno-openmp-target-exception -analyze
 
 /**
  * Finally we should test that we only ignore exceptions in the OpenMP 
  * offloading tool-chain
 */
 
-// RUN: %clang_cc1 -triple nvptx64 %s -emit-llvm -S -verify=noexceptions -o -
+// RUN: %clang_cc1 -triple nvptx64 %s -emit-llvm-only -verify=noexceptions
 
 // noexceptions-error@38 {{cannot use 'try' with exceptions disabled}}
 
diff --git a/clang/test/OpenMP/openmp-read-only-feature.c b/clang/test/OpenMP/openmp-read-only-feature.c
index 4d1ec84075c8..0cebaae32338 100644
--- a/clang/test/OpenMP/openmp-read-only-feature.c
+++ b/clang/test/OpenMP/openmp-read-only-feature.c
@@ -8,7 +8,7 @@
 
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1030 \
 // RUN:   -fopenmp -nogpulib -target-feature -image-insts \
-// RUN:   -fopenmp-is-target-device -emit-llvm -S -o - %s 2>&1 | FileCheck %s
+// RUN:   -fopenmp-is-target-device -S -o - %s 2>&1 | FileCheck %s
 // CHECK: warning: feature flag '-image-insts' is ignored since the feature is read only
 
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
diff --git a/clang/test/OpenMP/openmp_capture_const_var_ast_print.cpp b/clang/test/OpenMP/openmp_capture_const_var_ast_print.cpp
index 4d4b0b00cbaf..c46e433dabaf 100644
--- a/clang/test/OpenMP/openmp_capture_const_var_ast_print.cpp
+++ b/clang/test/OpenMP/openmp_capture_const_var_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/openmp_offload_codegen.cpp b/clang/test/OpenMP/openmp_offload_codegen.cpp
index 6987b2de74f1..5246c7f6d782 100644
--- a/clang/test/OpenMP/openmp_offload_codegen.cpp
+++ b/clang/test/OpenMP/openmp_offload_codegen.cpp
@@ -1,9 +1,9 @@
 // Test device for mapping codegen.
 ///==========================================================================///
 
-// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -S -emit-llvm %s -o - 2>&1 | FileCheck -check-prefix=CK1 %s
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -o - 2>&1 | FileCheck -check-prefix=CK1 %s
 
-// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -S -emit-llvm %s -o - 2>&1 | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -o - 2>&1 | FileCheck --check-prefix SIMD-ONLY0 %s
 // SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
 
 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
diff --git a/clang/test/OpenMP/ordered_ast_print.cpp b/clang/test/OpenMP/ordered_ast_print.cpp
index e64c22abc3c7..cddb69a5c903 100644
--- a/clang/test/OpenMP/ordered_ast_print.cpp
+++ b/clang/test/OpenMP/ordered_ast_print.cpp
@@ -1,14 +1,14 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck -check-prefixes CHECK,OMP51 %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes CHECK,OMP51 %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck -check-prefixes CHECK,OMP51 %s
 
 // RUN: %clang_cc1 -verify -fopenmp-version=52 -fopenmp -ast-print %s | FileCheck -check-prefixes CHECK,OMP52 %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes CHECK,OMP52 %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck -check-prefixes CHECK,OMP52 %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck -check-prefixes CHECK,OMP51 %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes CHECK,OMP51 %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck -check-prefixes CHECK,OMP51 %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/parallel_ast_print.cpp b/clang/test/OpenMP/parallel_ast_print.cpp
index dcfec564ed07..83afedcb740d 100644
--- a/clang/test/OpenMP/parallel_ast_print.cpp
+++ b/clang/test/OpenMP/parallel_ast_print.cpp
@@ -1,18 +1,18 @@
 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -DOMP51 -verify -Wno-vla -fopenmp -ast-print %s | FileCheck -check-prefixes=CHECK,OMP51 %s
 // RUN: %clang_cc1 -DOMP51 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP51 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s
+// RUN: %clang_cc1 -DOMP51 -fopenmp -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s
 
 // RUN: %clang_cc1 -DOMP51 -verify -Wno-vla -fopenmp-simd -ast-print %s | FileCheck -check-prefixes=CHECK,OMP51 %s
 // RUN: %clang_cc1 -DOMP51 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP51 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s
+// RUN: %clang_cc1 -DOMP51 -fopenmp-simd -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/parallel_for_ast_print.cpp b/clang/test/OpenMP/parallel_for_ast_print.cpp
index df5e7596f6a9..699dcd2a23ab 100644
--- a/clang/test/OpenMP/parallel_for_ast_print.cpp
+++ b/clang/test/OpenMP/parallel_for_ast_print.cpp
@@ -1,23 +1,23 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -ast-print %s -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -ast-print %s -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP52
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=51 -ast-print %s -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // expected-no-diagnostics
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -ast-print %s -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP52
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/parallel_for_simd_ast_print.cpp b/clang/test/OpenMP/parallel_for_simd_ast_print.cpp
index 69c21f5bdfdf..b1fe98e17113 100644
--- a/clang/test/OpenMP/parallel_for_simd_ast_print.cpp
+++ b/clang/test/OpenMP/parallel_for_simd_ast_print.cpp
@@ -1,22 +1,22 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp -DOMP5 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -DOMP51 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -DOMP51 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -DOMP51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -DOMP51 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp-simd -DOMP5 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=51 -DOMP51 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -DOMP51 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -DOMP51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -DOMP51 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/parallel_masked_ast_print.cpp b/clang/test/OpenMP/parallel_masked_ast_print.cpp
index c56c7aa14c93..c97df54f47ab 100644
--- a/clang/test/OpenMP/parallel_masked_ast_print.cpp
+++ b/clang/test/OpenMP/parallel_masked_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/parallel_masked_taskloop_ast_print.cpp b/clang/test/OpenMP/parallel_masked_taskloop_ast_print.cpp
index 1c9fcff047f9..8f4cd670caeb 100644
--- a/clang/test/OpenMP/parallel_masked_taskloop_ast_print.cpp
+++ b/clang/test/OpenMP/parallel_masked_taskloop_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/parallel_masked_taskloop_simd_ast_print.cpp b/clang/test/OpenMP/parallel_masked_taskloop_simd_ast_print.cpp
index 1d01ac9a668c..3b4d71289d5b 100644
--- a/clang/test/OpenMP/parallel_masked_taskloop_simd_ast_print.cpp
+++ b/clang/test/OpenMP/parallel_masked_taskloop_simd_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/parallel_master_ast_print.cpp b/clang/test/OpenMP/parallel_master_ast_print.cpp
index c2d3a6a25b9b..150a09cd73e2 100644
--- a/clang/test/OpenMP/parallel_master_ast_print.cpp
+++ b/clang/test/OpenMP/parallel_master_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/parallel_master_taskloop_ast_print.cpp b/clang/test/OpenMP/parallel_master_taskloop_ast_print.cpp
index 35fdddc0a7be..291c097d8a6a 100644
--- a/clang/test/OpenMP/parallel_master_taskloop_ast_print.cpp
+++ b/clang/test/OpenMP/parallel_master_taskloop_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/parallel_master_taskloop_simd_ast_print.cpp b/clang/test/OpenMP/parallel_master_taskloop_simd_ast_print.cpp
index 441e16c268b1..96f2f2501c24 100644
--- a/clang/test/OpenMP/parallel_master_taskloop_simd_ast_print.cpp
+++ b/clang/test/OpenMP/parallel_master_taskloop_simd_ast_print.cpp
@@ -1,16 +1,16 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -DOMP5 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -verify -fopenmp -DOMP51 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // RUN: %clang_cc1 -fopenmp -DOMP51 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -DOMP51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP51
+// RUN: %clang_cc1 -fopenmp -DOMP51 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -DOMP5 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -DOMP51 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // RUN: %clang_cc1 -fopenmp-simd -DOMP51 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -DOMP51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP51
+// RUN: %clang_cc1 -fopenmp-simd -DOMP51 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/parallel_sections_ast_print.cpp b/clang/test/OpenMP/parallel_sections_ast_print.cpp
index e3183f9a7a99..3dca1ebf10ff 100644
--- a/clang/test/OpenMP/parallel_sections_ast_print.cpp
+++ b/clang/test/OpenMP/parallel_sections_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/parallel_sections_codegen.cpp b/clang/test/OpenMP/parallel_sections_codegen.cpp
index 7b8427d97288..9343250d08a5 100644
--- a/clang/test/OpenMP/parallel_sections_codegen.cpp
+++ b/clang/test/OpenMP/parallel_sections_codegen.cpp
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -emit-llvm -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -o - %s | FileCheck %s --check-prefix=CHECK1
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -triple x86_64-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -include-pch %t -fsyntax-only -verify %s -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -include-pch %t -verify %s -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -emit-llvm -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -o - %s | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -fexceptions -fcxx-exceptions -triple x86_64-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -include-pch %t -fsyntax-only -verify %s -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -include-pch %t -verify %s -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
diff --git a/clang/test/OpenMP/requires_acq_rel_codegen.cpp b/clang/test/OpenMP/requires_acq_rel_codegen.cpp
index e44d1cd93d61..0256d25f850f 100644
--- a/clang/test/OpenMP/requires_acq_rel_codegen.cpp
+++ b/clang/test/OpenMP/requires_acq_rel_codegen.cpp
@@ -4,7 +4,7 @@
 
 // RUN: %clang_cc1 -verify -fopenmp-simd %s -x c++ -emit-llvm -triple x86_64-apple-darwin10 -o -| FileCheck %s --check-prefix SIMD-ONLY0
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -triple x86_64-apple-darwin10
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -emit-llvm -x c++ -emit-llvm -triple x86_64-apple-darwin10 -o -| FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -x c++ -emit-llvm -triple x86_64-apple-darwin10 -o -| FileCheck %s --check-prefix SIMD-ONLY0
 // SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
 // expected-no-diagnostics
 
diff --git a/clang/test/OpenMP/requires_acq_rel_print.cpp b/clang/test/OpenMP/requires_acq_rel_print.cpp
index f4102416f084..9e7e0d5ef5f6 100644
--- a/clang/test/OpenMP/requires_acq_rel_print.cpp
+++ b/clang/test/OpenMP/requires_acq_rel_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/requires_ast_print.cpp b/clang/test/OpenMP/requires_ast_print.cpp
index 8343608070c1..efa1551e7589 100644
--- a/clang/test/OpenMP/requires_ast_print.cpp
+++ b/clang/test/OpenMP/requires_ast_print.cpp
@@ -1,18 +1,18 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=99 -DOMP99 -ast-print %s | FileCheck --check-prefixes=CHECK,REV %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=99 -DOMP99 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=99 -DOMP99 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck --check-prefixes=CHECK,REV %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=99 -DOMP99 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck --check-prefixes=CHECK,REV %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=99 -DOMP99 -ast-print %s | FileCheck --check-prefixes=CHECK,REV %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=99 -DOMP99 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=99 -DOMP99 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck --check-prefixes=CHECK,REV %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=99 -DOMP99 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck --check-prefixes=CHECK,REV %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/requires_relaxed_codegen.cpp b/clang/test/OpenMP/requires_relaxed_codegen.cpp
index 5cabab6d80c9..00a66b3967c9 100644
--- a/clang/test/OpenMP/requires_relaxed_codegen.cpp
+++ b/clang/test/OpenMP/requires_relaxed_codegen.cpp
@@ -4,7 +4,7 @@
 
 // RUN: %clang_cc1 -verify -fopenmp-simd %s -x c++ -emit-llvm -triple x86_64-apple-darwin10 -o -| FileCheck %s --check-prefix SIMD-ONLY0
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -triple x86_64-apple-darwin10
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -emit-llvm -x c++ -emit-llvm -triple x86_64-apple-darwin10 -o -| FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -x c++ -emit-llvm -triple x86_64-apple-darwin10 -o -| FileCheck %s --check-prefix SIMD-ONLY0
 // SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
 // expected-no-diagnostics
 
diff --git a/clang/test/OpenMP/requires_relaxed_print.cpp b/clang/test/OpenMP/requires_relaxed_print.cpp
index 98786e56605f..16939b3c7805 100644
--- a/clang/test/OpenMP/requires_relaxed_print.cpp
+++ b/clang/test/OpenMP/requires_relaxed_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/requires_seq_cst_codegen.cpp b/clang/test/OpenMP/requires_seq_cst_codegen.cpp
index 20b0b0d3ca8b..b7825e7d8503 100644
--- a/clang/test/OpenMP/requires_seq_cst_codegen.cpp
+++ b/clang/test/OpenMP/requires_seq_cst_codegen.cpp
@@ -4,7 +4,7 @@
 
 // RUN: %clang_cc1 -verify -fopenmp-simd %s -x c++ -emit-llvm -triple x86_64-apple-darwin10 -o -| FileCheck %s --check-prefix SIMD-ONLY0
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -triple x86_64-apple-darwin10
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -emit-llvm -x c++ -emit-llvm -triple x86_64-apple-darwin10 -o -| FileCheck %s --check-prefix SIMD-ONLY0
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -x c++ -emit-llvm -triple x86_64-apple-darwin10 -o -| FileCheck %s --check-prefix SIMD-ONLY0
 // SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
 // expected-no-diagnostics
 
diff --git a/clang/test/OpenMP/scan_ast_print.cpp b/clang/test/OpenMP/scan_ast_print.cpp
index 82cb13eb6e70..ff4f2a0e6ba3 100644
--- a/clang/test/OpenMP/scan_ast_print.cpp
+++ b/clang/test/OpenMP/scan_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/sections_ast_print.cpp b/clang/test/OpenMP/sections_ast_print.cpp
index 4759b4df0849..26bd47685ea7 100644
--- a/clang/test/OpenMP/sections_ast_print.cpp
+++ b/clang/test/OpenMP/sections_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/sections_codegen.cpp b/clang/test/OpenMP/sections_codegen.cpp
index 68269e52b33f..5a5e32751c67 100644
--- a/clang/test/OpenMP/sections_codegen.cpp
+++ b/clang/test/OpenMP/sections_codegen.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -emit-llvm -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -o - %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -triple x86_64-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -include-pch %t -fsyntax-only -verify %s -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -include-pch %t -verify %s -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-llvm -o - | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -emit-llvm -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -fexceptions -fcxx-exceptions -triple x86_64-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -include-pch %t -fsyntax-only -verify %s -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -include-pch %t -verify %s -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
 // SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
 // expected-no-diagnostics
 #ifndef HEADER
diff --git a/clang/test/OpenMP/simd_ast_print.cpp b/clang/test/OpenMP/simd_ast_print.cpp
index 057f1e7c03c0..9a4174c35d7c 100644
--- a/clang/test/OpenMP/simd_ast_print.cpp
+++ b/clang/test/OpenMP/simd_ast_print.cpp
@@ -1,29 +1,29 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -ast-print %s -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP52
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // expected-no-diagnostics
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -ast-print %s -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP52
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/single_ast_print.cpp b/clang/test/OpenMP/single_ast_print.cpp
index b8dfc51377f6..4c77da8d137a 100644
--- a/clang/test/OpenMP/single_ast_print.cpp
+++ b/clang/test/OpenMP/single_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_ast_print.cpp b/clang/test/OpenMP/target_ast_print.cpp
index 4e066bcf5e43..ec6cf2130d7a 100644
--- a/clang/test/OpenMP/target_ast_print.cpp
+++ b/clang/test/OpenMP/target_ast_print.cpp
@@ -6,11 +6,11 @@
 
 // RUN: %clang_cc1 -DOMP45 -verify -Wno-vla -fopenmp -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix=OMP45
 // RUN: %clang_cc1 -DOMP45 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP45 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix=OMP45
+// RUN: %clang_cc1 -DOMP45 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix=OMP45
 
 // RUN: %clang_cc1 -DOMP45 -verify -Wno-vla -fopenmp-simd -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix=OMP45
 // RUN: %clang_cc1 -DOMP45 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP45 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix=OMP45
+// RUN: %clang_cc1 -DOMP45 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix=OMP45
 #ifdef OMP45
 
 void foo() {}
@@ -337,19 +337,19 @@ int main (int argc, char **argv) {
 ///==========================================================================///
 // RUN: %clang_cc1 -DOMP5 -verify -Wno-vla -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s --check-prefix OMP5
 // RUN: %clang_cc1 -DOMP5 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP5 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP5
+// RUN: %clang_cc1 -DOMP5 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP5
 
 // RUN: %clang_cc1 -DOMP5 -verify -Wno-vla -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s --check-prefix OMP5
 // RUN: %clang_cc1 -DOMP5 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP5 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP5
+// RUN: %clang_cc1 -DOMP5 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP5
 
 // RUN: %clang_cc1 -DOMP5 -verify -Wno-vla -fopenmp -fopenmp-version=99 -DOMP99 -ast-print %s | FileCheck %s --check-prefixes=OMP5,REV
 // RUN: %clang_cc1 -DOMP5 -fopenmp -fopenmp-version=99 -DOMP99 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP5 -fopenmp -fopenmp-version=99 -DOMP99 -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefixes=OMP5,REV
+// RUN: %clang_cc1 -DOMP5 -fopenmp -fopenmp-version=99 -DOMP99 -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefixes=OMP5,REV
 
 // RUN: %clang_cc1 -DOMP5 -verify -Wno-vla -fopenmp-simd -fopenmp-version=99 -DOMP99 -ast-print %s | FileCheck %s --check-prefixes=OMP5,REV
 // RUN: %clang_cc1 -DOMP5 -fopenmp-simd -fopenmp-version=99 -DOMP99 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP5 -fopenmp-simd -fopenmp-version=99 -DOMP99 -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefixes=OMP5,REV
+// RUN: %clang_cc1 -DOMP5 -fopenmp-simd -fopenmp-version=99 -DOMP99 -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefixes=OMP5,REV
 
 #ifdef OMP99
 #pragma omp requires reverse_offload
@@ -1092,11 +1092,11 @@ int main (int argc, char **argv) {
 ///==========================================================================///
 // RUN: %clang_cc1 -DOMP51 -verify -Wno-vla -fopenmp -ast-print %s | FileCheck %s --check-prefix OMP51
 // RUN: %clang_cc1 -DOMP51 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP51 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP51
+// RUN: %clang_cc1 -DOMP51 -fopenmp -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP51
 
 // RUN: %clang_cc1 -DOMP51 -verify -Wno-vla -fopenmp-simd -ast-print %s | FileCheck %s --check-prefix OMP51
 // RUN: %clang_cc1 -DOMP51 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP51 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP51
+// RUN: %clang_cc1 -DOMP51 -fopenmp-simd -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP51
 
 void foo() {}
 
@@ -1152,11 +1152,11 @@ int main (int argc, char **argv) {
 ///==========================================================================///
 // RUN: %clang_cc1 -DOMP52 -verify -Wno-vla -fopenmp -fopenmp-version=52 -ast-print %s | FileCheck %s --check-prefix OMP52
 // RUN: %clang_cc1 -DOMP52 -fopenmp -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP52 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP52
+// RUN: %clang_cc1 -DOMP52 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP52
 
 // RUN: %clang_cc1 -DOMP52 -verify -Wno-vla -fopenmp-simd -fopenmp-version=52 -ast-print %s | FileCheck %s --check-prefix OMP52
 // RUN: %clang_cc1 -DOMP52 -fopenmp-simd -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP52 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP52
+// RUN: %clang_cc1 -DOMP52 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP52
 
 void foo() {}
 
@@ -1206,11 +1206,11 @@ foo();
 ///==========================================================================///
 // RUN: %clang_cc1 -DOMP60 -verify -Wno-vla -fopenmp -fopenmp-version=60 -ast-print %s | FileCheck %s --check-prefix OMP60
 // RUN: %clang_cc1 -DOMP60 -fopenmp -fopenmp-version=60 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP60 -fopenmp -fopenmp-version=60 -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP60
+// RUN: %clang_cc1 -DOMP60 -fopenmp -fopenmp-version=60 -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP60
 
 // RUN: %clang_cc1 -DOMP60 -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -ast-print %s | FileCheck %s --check-prefix OMP60
 // RUN: %clang_cc1 -DOMP60 -fopenmp-simd -fopenmp-version=60 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP60 -fopenmp-simd -fopenmp-version=60 -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP60
+// RUN: %clang_cc1 -DOMP60 -fopenmp-simd -fopenmp-version=60 -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP60
 
 void foo() {}
 template <typename T, int C>
@@ -1263,11 +1263,11 @@ int main (int argc, char **argv) {
 
 // RUN: %clang_cc1 -DOMPX -verify -Wno-vla -fopenmp -fopenmp-extensions -ast-print %s | FileCheck %s --check-prefix=OMPX
 // RUN: %clang_cc1 -DOMPX -fopenmp -fopenmp-extensions -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMPX -fopenmp -fopenmp-extensions -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix=OMPX
+// RUN: %clang_cc1 -DOMPX -fopenmp -fopenmp-extensions -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix=OMPX
 
 // RUN: %clang_cc1 -DOMPX -verify -Wno-vla -fopenmp-simd -fopenmp-extensions -ast-print %s | FileCheck %s --check-prefix=OMPX
 // RUN: %clang_cc1 -DOMPX -fopenmp-simd -fopenmp-extensions -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMPX -fopenmp-simd -fopenmp-extensions -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix=OMPX
+// RUN: %clang_cc1 -DOMPX -fopenmp-simd -fopenmp-extensions -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix=OMPX
 
 void foo() {}
 
diff --git a/clang/test/OpenMP/target_data_ast_print.cpp b/clang/test/OpenMP/target_data_ast_print.cpp
index 5bddc2374d18..9e883b25f11d 100644
--- a/clang/test/OpenMP/target_data_ast_print.cpp
+++ b/clang/test/OpenMP/target_data_ast_print.cpp
@@ -1,18 +1,18 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -DOMP51 -DOMPX -verify -fopenmp -fopenmp-extensions -ast-print %s | FileCheck -check-prefixes=CHECK,OMP51,OMPX %s
 // RUN: %clang_cc1 -DOMP51 -DOMPX -fopenmp -fopenmp-extensions -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP51 -DOMPX -fopenmp -fopenmp-extensions -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51,OMPX %s
+// RUN: %clang_cc1 -DOMP51 -DOMPX -fopenmp -fopenmp-extensions -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51,OMPX %s
 
 // RUN: %clang_cc1 -DOMP51 -DOMPX -verify -fopenmp-simd -fopenmp-extensions -ast-print %s | FileCheck -check-prefixes=CHECK,OMP51,OMPX %s
 // RUN: %clang_cc1 -DOMP51 -DOMPX -fopenmp-simd -fopenmp-extensions -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP51 -DOMPX -fopenmp-simd -fopenmp-extensions -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51,OMPX %s
+// RUN: %clang_cc1 -DOMP51 -DOMPX -fopenmp-simd -fopenmp-extensions -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51,OMPX %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_data_use_device_ptr_addr_ast_print.cpp b/clang/test/OpenMP/target_data_use_device_ptr_addr_ast_print.cpp
index 147def727ca4..43ff9896d3fc 100644
--- a/clang/test/OpenMP/target_data_use_device_ptr_addr_ast_print.cpp
+++ b/clang/test/OpenMP/target_data_use_device_ptr_addr_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_enter_data_ast_print.cpp b/clang/test/OpenMP/target_enter_data_ast_print.cpp
index b11d5de13de6..80acf56bd370 100644
--- a/clang/test/OpenMP/target_enter_data_ast_print.cpp
+++ b/clang/test/OpenMP/target_enter_data_ast_print.cpp
@@ -1,22 +1,22 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // expected-no-diagnostics
 
diff --git a/clang/test/OpenMP/target_enter_data_ast_print_openmp52.cpp b/clang/test/OpenMP/target_enter_data_ast_print_openmp52.cpp
index 578f9a254274..0cfe665586b4 100644
--- a/clang/test/OpenMP/target_enter_data_ast_print_openmp52.cpp
+++ b/clang/test/OpenMP/target_enter_data_ast_print_openmp52.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck --check-prefix=CHECK --check-prefix=CHECK-52 %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck --check-prefix=CHECK --check-prefix=CHECK-52 %s
 
 // expected-no-diagnostics
 
diff --git a/clang/test/OpenMP/target_exit_data_ast_print.cpp b/clang/test/OpenMP/target_exit_data_ast_print.cpp
index f482f379361b..f0b4c68af5ae 100644
--- a/clang/test/OpenMP/target_exit_data_ast_print.cpp
+++ b/clang/test/OpenMP/target_exit_data_ast_print.cpp
@@ -1,22 +1,22 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // expected-no-diagnostics
 
diff --git a/clang/test/OpenMP/target_exit_data_ast_print_openmp52.cpp b/clang/test/OpenMP/target_exit_data_ast_print_openmp52.cpp
index fbc431eadbcc..5b9daab61bb0 100644
--- a/clang/test/OpenMP/target_exit_data_ast_print_openmp52.cpp
+++ b/clang/test/OpenMP/target_exit_data_ast_print_openmp52.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // expected-no-diagnostics
 
diff --git a/clang/test/OpenMP/target_has_device_addr_ast_print.cpp b/clang/test/OpenMP/target_has_device_addr_ast_print.cpp
index 118010036813..24033e5a395b 100644
--- a/clang/test/OpenMP/target_has_device_addr_ast_print.cpp
+++ b/clang/test/OpenMP/target_has_device_addr_ast_print.cpp
@@ -5,7 +5,7 @@
 // RUN:  -emit-pch -o %t %s
 
 // RUN: %clang_cc1 -fopenmp -std=c++11 \
-// RUN:  -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN:  -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd \
 // RUN:  -std=c++11 -ast-print %s | FileCheck %s
@@ -14,7 +14,7 @@
 // RUN:  -emit-pch -o %t %s
 
 // RUN: %clang_cc1 -fopenmp-simd -std=c++11 \
-// RUN:  -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN:  -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // expected-no-diagnostics
 
diff --git a/clang/test/OpenMP/target_indirect_codegen.cpp b/clang/test/OpenMP/target_indirect_codegen.cpp
index bc0aa4541703..974f8b20c0bf 100644
--- a/clang/test/OpenMP/target_indirect_codegen.cpp
+++ b/clang/test/OpenMP/target_indirect_codegen.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --check-prefix=HOST
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-host.bc
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -triple amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-host.bc -o - | FileCheck %s --check-prefix=DEVICE
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -triple amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -triple amdgcn-amd-amdhsa %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-host.bc -emit-pch -o %t
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -x c++ -triple amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fvisibility=protected -fopenmp-host-ir-file-path %t-host.bc -include-pch %t -o - | FileCheck %s --check-prefix=DEVICE
 
 // expected-no-diagnostics
diff --git a/clang/test/OpenMP/target_is_device_ptr_ast_print.cpp b/clang/test/OpenMP/target_is_device_ptr_ast_print.cpp
index 4e8bf8143542..583793a06caf 100644
--- a/clang/test/OpenMP/target_is_device_ptr_ast_print.cpp
+++ b/clang/test/OpenMP/target_is_device_ptr_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_parallel_ast_print.cpp b/clang/test/OpenMP/target_parallel_ast_print.cpp
index f246f74f0805..7e27ac7b92ca 100644
--- a/clang/test/OpenMP/target_parallel_ast_print.cpp
+++ b/clang/test/OpenMP/target_parallel_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_parallel_for_ast_print.cpp b/clang/test/OpenMP/target_parallel_for_ast_print.cpp
index 16c2175099ac..ce9c3beeb95f 100644
--- a/clang/test/OpenMP/target_parallel_for_ast_print.cpp
+++ b/clang/test/OpenMP/target_parallel_for_ast_print.cpp
@@ -1,16 +1,16 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP51
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP51
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_parallel_for_is_device_ptr_ast_print.cpp b/clang/test/OpenMP/target_parallel_for_is_device_ptr_ast_print.cpp
index fe600101b8ff..fb1bcc58c6de 100644
--- a/clang/test/OpenMP/target_parallel_for_is_device_ptr_ast_print.cpp
+++ b/clang/test/OpenMP/target_parallel_for_is_device_ptr_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_parallel_for_simd_ast_print.cpp b/clang/test/OpenMP/target_parallel_for_simd_ast_print.cpp
index c3b115dafa09..6b7e197d6b29 100644
--- a/clang/test/OpenMP/target_parallel_for_simd_ast_print.cpp
+++ b/clang/test/OpenMP/target_parallel_for_simd_ast_print.cpp
@@ -1,29 +1,29 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP5
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP51
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -ast-print %s -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP52
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP5
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP5 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP51
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // expected-no-diagnostics
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -ast-print %s -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP52
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_parallel_for_simd_is_device_ptr_ast_print.cpp b/clang/test/OpenMP/target_parallel_for_simd_is_device_ptr_ast_print.cpp
index feeaa1ef72ac..f8b4c8cdbe3e 100644
--- a/clang/test/OpenMP/target_parallel_for_simd_is_device_ptr_ast_print.cpp
+++ b/clang/test/OpenMP/target_parallel_for_simd_is_device_ptr_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_parallel_is_device_ptr_ast_print.cpp b/clang/test/OpenMP/target_parallel_is_device_ptr_ast_print.cpp
index 16cc188e872e..b435142f4694 100644
--- a/clang/test/OpenMP/target_parallel_is_device_ptr_ast_print.cpp
+++ b/clang/test/OpenMP/target_parallel_is_device_ptr_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_simd_ast_print.cpp b/clang/test/OpenMP/target_simd_ast_print.cpp
index ec9de55392b4..91117e76186f 100644
--- a/clang/test/OpenMP/target_simd_ast_print.cpp
+++ b/clang/test/OpenMP/target_simd_ast_print.cpp
@@ -1,29 +1,29 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -DOMP5 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP51
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -ast-print %s -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP52
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -DOMP5 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=51 -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP51
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // expected-no-diagnostics
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -ast-print %s -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP52
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP52 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_teams_ast_print.cpp b/clang/test/OpenMP/target_teams_ast_print.cpp
index 8eaf4cbf2493..2ff34e4498bf 100644
--- a/clang/test/OpenMP/target_teams_ast_print.cpp
+++ b/clang/test/OpenMP/target_teams_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_teams_distribute_ast_print.cpp b/clang/test/OpenMP/target_teams_distribute_ast_print.cpp
index 9901c72c6522..cfe602541e1b 100644
--- a/clang/test/OpenMP/target_teams_distribute_ast_print.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wno-openmp-mapping | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wno-openmp-mapping | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_ast_print.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_ast_print.cpp
index 164a82be2416..9187d41e78e1 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_ast_print.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_ast_print.cpp
@@ -1,16 +1,16 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping -Wsign-conversion | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wno-openmp-mapping -Wsign-conversion -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP51
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping -Wsign-conversion | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wno-openmp-mapping -Wsign-conversion -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP51
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_is_device_ptr_ast_print.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_is_device_ptr_ast_print.cpp
index b3944bc19530..04dc2f2be0b7 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_is_device_ptr_ast_print.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_is_device_ptr_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_ast_print.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_ast_print.cpp
index 4f901b75e578..cc1bdbbe25a3 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_ast_print.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_ast_print.cpp
@@ -1,16 +1,16 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP51
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping -DOMP51
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_is_device_ptr_ast_print.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_is_device_ptr_ast_print.cpp
index 510751967230..c12edaff9f02 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_is_device_ptr_ast_print.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_is_device_ptr_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_ast_print.cpp b/clang/test/OpenMP/target_teams_distribute_simd_ast_print.cpp
index 1db0d8b8effe..58425367fb3a 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_ast_print.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_ast_print.cpp
@@ -1,22 +1,22 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -DOMP5 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -DOMP51 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -DOMP51 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp -DOMP51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -DOMP51 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -DOMP5 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -DOMP51 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -DOMP51 -x c++ -std=c++11 -emit-pch -o %t %s -Wno-openmp-mapping
-// RUN: %clang_cc1 -fopenmp-simd -DOMP51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -DOMP51 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_is_device_ptr_ast_print.cpp b/clang/test/OpenMP/target_teams_distribute_simd_is_device_ptr_ast_print.cpp
index eb5347d61d65..621402d945a4 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_is_device_ptr_ast_print.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_is_device_ptr_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_teams_is_device_ptr_ast_print.cpp b/clang/test/OpenMP/target_teams_is_device_ptr_ast_print.cpp
index 19fe955ee76b..a125ad7ad639 100644
--- a/clang/test/OpenMP/target_teams_is_device_ptr_ast_print.cpp
+++ b/clang/test/OpenMP/target_teams_is_device_ptr_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -std=c++11 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/target_update_ast_print.cpp b/clang/test/OpenMP/target_update_ast_print.cpp
index be00c4fd9fb2..d4cc84bc8b73 100644
--- a/clang/test/OpenMP/target_update_ast_print.cpp
+++ b/clang/test/OpenMP/target_update_ast_print.cpp
@@ -1,18 +1,18 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -DOMP51 -verify -fopenmp -ast-print %s | FileCheck -check-prefixes=CHECK,OMP51 %s
 // RUN: %clang_cc1 -DOMP51 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP51 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s
+// RUN: %clang_cc1 -DOMP51 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s
 
 // RUN: %clang_cc1 -DOMP51 -verify -fopenmp-simd -ast-print %s | FileCheck -check-prefixes=CHECK,OMP51 %s
 // RUN: %clang_cc1 -DOMP51 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -DOMP51 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s
+// RUN: %clang_cc1 -DOMP51 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/task_ast_print.cpp b/clang/test/OpenMP/task_ast_print.cpp
index 79f56b0362f1..12923e6ab424 100644
--- a/clang/test/OpenMP/task_ast_print.cpp
+++ b/clang/test/OpenMP/task_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -Wno-vla -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify -Wno-vla %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/task_depend_template_call_ast_print.cpp b/clang/test/OpenMP/task_depend_template_call_ast_print.cpp
index 2b50eefbcec1..5186e4bd5b4c 100644
--- a/clang/test/OpenMP/task_depend_template_call_ast_print.cpp
+++ b/clang/test/OpenMP/task_depend_template_call_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/taskgroup_ast_print.cpp b/clang/test/OpenMP/taskgroup_ast_print.cpp
index a15600cb779b..137d28679269 100644
--- a/clang/test/OpenMP/taskgroup_ast_print.cpp
+++ b/clang/test/OpenMP/taskgroup_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/taskloop_ast_print.cpp b/clang/test/OpenMP/taskloop_ast_print.cpp
index 076889846223..1b6d7240fa66 100644
--- a/clang/test/OpenMP/taskloop_ast_print.cpp
+++ b/clang/test/OpenMP/taskloop_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/taskloop_simd_ast_print.cpp b/clang/test/OpenMP/taskloop_simd_ast_print.cpp
index 622aeb835c24..dac87d167d45 100644
--- a/clang/test/OpenMP/taskloop_simd_ast_print.cpp
+++ b/clang/test/OpenMP/taskloop_simd_ast_print.cpp
@@ -1,16 +1,16 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -DOMP5 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -verify -fopenmp -DOMP51 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // RUN: %clang_cc1 -fopenmp -DOMP51 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -DOMP51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP51
+// RUN: %clang_cc1 -fopenmp -DOMP51 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -DOMP5 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -DOMP51 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // RUN: %clang_cc1 -fopenmp-simd -DOMP51 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -DOMP51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP51
+// RUN: %clang_cc1 -fopenmp-simd -DOMP51 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/taskloop_strict_modifier_ast_print.cpp b/clang/test/OpenMP/taskloop_strict_modifier_ast_print.cpp
index 84538582108c..a34c632c48f5 100644
--- a/clang/test/OpenMP/taskloop_strict_modifier_ast_print.cpp
+++ b/clang/test/OpenMP/taskloop_strict_modifier_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=51 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/taskwait_ast_print.cpp b/clang/test/OpenMP/taskwait_ast_print.cpp
index aefba76678a9..b487bb4697a9 100644
--- a/clang/test/OpenMP/taskwait_ast_print.cpp
+++ b/clang/test/OpenMP/taskwait_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/taskyield_ast_print.cpp b/clang/test/OpenMP/taskyield_ast_print.cpp
index 7acf53dd6afe..d85944f48337 100644
--- a/clang/test/OpenMP/taskyield_ast_print.cpp
+++ b/clang/test/OpenMP/taskyield_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/teams_ast_print.cpp b/clang/test/OpenMP/teams_ast_print.cpp
index 110b4ac174fe..0087f71ac9f7 100644
--- a/clang/test/OpenMP/teams_ast_print.cpp
+++ b/clang/test/OpenMP/teams_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/teams_distribute_ast_print.cpp b/clang/test/OpenMP/teams_distribute_ast_print.cpp
index 417562c45bbf..df6c3f89eb11 100644
--- a/clang/test/OpenMP/teams_distribute_ast_print.cpp
+++ b/clang/test/OpenMP/teams_distribute_ast_print.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wno-openmp-mapping | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wno-openmp-mapping | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_ast_print.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_ast_print.cpp
index b289af7b1c72..5927d1509824 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_ast_print.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_ast_print.cpp
@@ -1,16 +1,16 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s -DOMP51
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping -DOMP51 | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_ast_print.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_ast_print.cpp
index f8b49f2b3d37..8ba36a05f93e 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_ast_print.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_ast_print.cpp
@@ -1,22 +1,22 @@
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -DOMP5 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP50
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -verify -fopenmp -DOMP51 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // RUN: %clang_cc1 -fopenmp -DOMP51 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -DOMP51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP51
+// RUN: %clang_cc1 -fopenmp -DOMP51 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP45
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -DOMP5 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP50
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -DOMP51 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // RUN: %clang_cc1 -fopenmp-simd -DOMP51 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -DOMP51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP51
+// RUN: %clang_cc1 -fopenmp-simd -DOMP51 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix CHECK --check-prefix OMP51
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/teams_distribute_simd_ast_print.cpp b/clang/test/OpenMP/teams_distribute_simd_ast_print.cpp
index 059091c1ac29..4836d772da1e 100644
--- a/clang/test/OpenMP/teams_distribute_simd_ast_print.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_ast_print.cpp
@@ -1,22 +1,22 @@
 // RUN: %clang_cc1 -verify -fopenmp -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -DOMP5 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp -DOMP51 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp -DOMP51 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -DOMP51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp -DOMP51 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
+// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP45
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -DOMP5 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -DOMP5 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
 // RUN: %clang_cc1 -verify -fopenmp-simd -DOMP51 -ast-print %s -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // RUN: %clang_cc1 -fopenmp-simd -DOMP51 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -DOMP51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
+// RUN: %clang_cc1 -fopenmp-simd -DOMP51 -std=c++11 -include-pch %t -verify %s -ast-print -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/threadprivate_ast_print.cpp b/clang/test/OpenMP/threadprivate_ast_print.cpp
index a1482031f5bf..6794265ec59b 100644
--- a/clang/test/OpenMP/threadprivate_ast_print.cpp
+++ b/clang/test/OpenMP/threadprivate_ast_print.cpp
@@ -1,16 +1,16 @@
 // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-apple-darwin10.6.0 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10.6.0 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print
+// RUN: %clang_cc1 -fopenmp -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -verify %s -ast-print
 // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-unknown-linux-gnu -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print
+// RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -ast-print
 
 // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print
+// RUN: %clang_cc1 -fopenmp-simd -triple x86_64-apple-darwin10.6.0 -std=c++11 -include-pch %t -verify %s -ast-print
 // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-linux-gnu -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print
+// RUN: %clang_cc1 -fopenmp-simd -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -ast-print
 // expected-no-diagnostics
 
 #ifndef HEADER
diff --git a/clang/test/OpenMP/tile_ast_print.cpp b/clang/test/OpenMP/tile_ast_print.cpp
index afc8b34911e3..c4dff2c4be44 100644
--- a/clang/test/OpenMP/tile_ast_print.cpp
+++ b/clang/test/OpenMP/tile_ast_print.cpp
@@ -183,4 +183,21 @@ void tfoo7() {
 }
 
 
+// PRINT-LABEL: void foo8(
+// DUMP-LABEL:  FunctionDecl {{.*}} foo8
+void foo8(int a) {
+  // PRINT:     #pragma omp tile sizes(a)
+  // DUMP:      OMPTileDirective
+  // DUMP-NEXT:   OMPSizesClause
+  // DUMP-NEXT:     ImplicitCastExpr
+  // DUMP-NEXT:       DeclRefExpr {{.*}} 'a'
+  #pragma omp tile sizes(a)
+  // PRINT-NEXT: for (int i = 7; i < 19; i += 3)
+  // DUMP-NEXT: ForStmt
+  for (int i = 7; i < 19; i += 3)
+    // PRINT: body(i);
+    // DUMP:  CallExpr
+    body(i);
+}
+
 #endif
diff --git a/clang/test/OpenMP/tile_codegen.cpp b/clang/test/OpenMP/tile_codegen.cpp
index 76cf2d8f1992..93a3a14133ab 100644
--- a/clang/test/OpenMP/tile_codegen.cpp
+++ b/clang/test/OpenMP/tile_codegen.cpp
@@ -83,6 +83,14 @@ extern "C" void tfoo7() {
   foo7<int,3,5>(0, 42);
 }
 
+
+extern "C" void foo8(int a) {
+#pragma omp tile sizes(a)
+  for (int i = 7; i < 17; i += 3)
+    body(i);
+}
+
+
 #endif /* HEADER */
 // CHECK1-LABEL: define {{[^@]+}}@body
 // CHECK1-SAME: (...) #[[ATTR0:[0-9]+]] {
@@ -98,7 +106,7 @@ extern "C" void tfoo7() {
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_ZN1SC1Ev
-// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR2:[0-9]+]] comdat align 2 {
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -108,7 +116,7 @@ extern "C" void tfoo7() {
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_ZN1SC2Ev
-// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat align 2 {
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[I:%.*]] = alloca ptr, align 8
@@ -885,7 +893,7 @@ extern "C" void tfoo7() {
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@foo6.omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -1071,6 +1079,95 @@ extern "C" void tfoo7() {
 // CHECK1-NEXT:    ret void
 //
 //
+// CHECK1-LABEL: define {{[^@]+}}@foo8
+// CHECK1-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTTILE_0_IV_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT:    store i32 7, ptr [[I]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK1:       for.cond:
+// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4
+// CHECK1-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END24:%.*]]
+// CHECK1:       for.body:
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND1:%.*]]
+// CHECK1:       for.cond1:
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP4]], 0
+// CHECK1-NEXT:    br i1 [[CMP2]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1:       cond.true:
+// CHECK1-NEXT:    br label [[COND_END:%.*]]
+// CHECK1:       cond.false:
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT:    br label [[COND_END]]
+// CHECK1:       cond.end:
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP3]], [[COND]]
+// CHECK1-NEXT:    [[CMP3:%.*]] = icmp slt i32 4, [[ADD]]
+// CHECK1-NEXT:    br i1 [[CMP3]], label [[COND_TRUE4:%.*]], label [[COND_FALSE5:%.*]]
+// CHECK1:       cond.true4:
+// CHECK1-NEXT:    br label [[COND_END12:%.*]]
+// CHECK1:       cond.false5:
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP7]], 0
+// CHECK1-NEXT:    br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
+// CHECK1:       cond.true7:
+// CHECK1-NEXT:    br label [[COND_END9:%.*]]
+// CHECK1:       cond.false8:
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT:    br label [[COND_END9]]
+// CHECK1:       cond.end9:
+// CHECK1-NEXT:    [[COND10:%.*]] = phi i32 [ 1, [[COND_TRUE7]] ], [ [[TMP8]], [[COND_FALSE8]] ]
+// CHECK1-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP6]], [[COND10]]
+// CHECK1-NEXT:    br label [[COND_END12]]
+// CHECK1:       cond.end12:
+// CHECK1-NEXT:    [[COND13:%.*]] = phi i32 [ 4, [[COND_TRUE4]] ], [ [[ADD11]], [[COND_END9]] ]
+// CHECK1-NEXT:    [[CMP14:%.*]] = icmp slt i32 [[TMP2]], [[COND13]]
+// CHECK1-NEXT:    br i1 [[CMP14]], label [[FOR_BODY15:%.*]], label [[FOR_END:%.*]]
+// CHECK1:       for.body15:
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 3
+// CHECK1-NEXT:    [[ADD16:%.*]] = add nsw i32 7, [[MUL]]
+// CHECK1-NEXT:    store i32 [[ADD16]], ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT:    call void (...) @body(i32 noundef [[TMP10]])
+// CHECK1-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK1:       for.inc:
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK1-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK1-NEXT:    store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND1]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK1:       for.end:
+// CHECK1-NEXT:    br label [[FOR_INC17:%.*]]
+// CHECK1:       for.inc17:
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT:    [[CMP18:%.*]] = icmp sle i32 [[TMP12]], 0
+// CHECK1-NEXT:    br i1 [[CMP18]], label [[COND_TRUE19:%.*]], label [[COND_FALSE20:%.*]]
+// CHECK1:       cond.true19:
+// CHECK1-NEXT:    br label [[COND_END21:%.*]]
+// CHECK1:       cond.false20:
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT:    br label [[COND_END21]]
+// CHECK1:       cond.end21:
+// CHECK1-NEXT:    [[COND22:%.*]] = phi i32 [ 1, [[COND_TRUE19]] ], [ [[TMP13]], [[COND_FALSE20]] ]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP14]], [[COND22]]
+// CHECK1-NEXT:    store i32 [[ADD23]], ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP24:![0-9]+]]
+// CHECK1:       for.end24:
+// CHECK1-NEXT:    ret void
+//
+//
 // CHECK1-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_tile_codegen.cpp
 // CHECK1-SAME: () #[[ATTR1]] section ".text.startup" {
 // CHECK1-NEXT:  entry:
@@ -1159,13 +1256,13 @@ extern "C" void tfoo7() {
 //
 //
 // CHECK2-LABEL: define {{[^@]+}}@body
-// CHECK2-SAME: (...) #[[ATTR2:[0-9]+]] {
+// CHECK2-SAME: (...) #[[ATTR1]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    ret void
 //
 //
 // CHECK2-LABEL: define {{[^@]+}}@foo1
-// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR2]] {
+// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR1]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[START_ADDR:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[END_ADDR:%.*]] = alloca i32, align 4
@@ -1255,7 +1352,7 @@ extern "C" void tfoo7() {
 //
 //
 // CHECK2-LABEL: define {{[^@]+}}@foo2
-// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR2]] {
+// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR1]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[START_ADDR:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[END_ADDR:%.*]] = alloca i32, align 4
@@ -1368,7 +1465,7 @@ extern "C" void tfoo7() {
 //
 //
 // CHECK2-LABEL: define {{[^@]+}}@foo3
-// CHECK2-SAME: () #[[ATTR2]] {
+// CHECK2-SAME: () #[[ATTR1]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[TMP:%.*]] = alloca i32, align 4
@@ -1510,7 +1607,7 @@ extern "C" void tfoo7() {
 //
 //
 // CHECK2-LABEL: define {{[^@]+}}@foo4
-// CHECK2-SAME: () #[[ATTR2]] {
+// CHECK2-SAME: () #[[ATTR1]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[TMP:%.*]] = alloca i32, align 4
@@ -1663,7 +1760,7 @@ extern "C" void tfoo7() {
 //
 //
 // CHECK2-LABEL: define {{[^@]+}}@foo5
-// CHECK2-SAME: () #[[ATTR2]] {
+// CHECK2-SAME: () #[[ATTR1]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTOMP_IV:%.*]] = alloca i64, align 8
 // CHECK2-NEXT:    [[TMP:%.*]] = alloca i32, align 4
@@ -1872,14 +1969,14 @@ extern "C" void tfoo7() {
 //
 //
 // CHECK2-LABEL: define {{[^@]+}}@foo6
-// CHECK2-SAME: () #[[ATTR2]] {
+// CHECK2-SAME: () #[[ATTR1]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 0, ptr @foo6.omp_outlined)
 // CHECK2-NEXT:    ret void
 //
 //
 // CHECK2-LABEL: define {{[^@]+}}@foo6.omp_outlined
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -1974,15 +2071,104 @@ extern "C" void tfoo7() {
 // CHECK2-NEXT:    ret void
 //
 //
+// CHECK2-LABEL: define {{[^@]+}}@foo8
+// CHECK2-SAME: (i32 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT:  entry:
+// CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    [[DOTTILE_0_IV_I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK2-NEXT:    store i32 7, ptr [[I]], align 4
+// CHECK2-NEXT:    store i32 0, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK2:       for.cond:
+// CHECK2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4
+// CHECK2-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END24:%.*]]
+// CHECK2:       for.body:
+// CHECK2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT:    store i32 [[TMP1]], ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND1:%.*]]
+// CHECK2:       for.cond1:
+// CHECK2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK2-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP4]], 0
+// CHECK2-NEXT:    br i1 [[CMP2]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2:       cond.true:
+// CHECK2-NEXT:    br label [[COND_END:%.*]]
+// CHECK2:       cond.false:
+// CHECK2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK2-NEXT:    br label [[COND_END]]
+// CHECK2:       cond.end:
+// CHECK2-NEXT:    [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP3]], [[COND]]
+// CHECK2-NEXT:    [[CMP3:%.*]] = icmp slt i32 4, [[ADD]]
+// CHECK2-NEXT:    br i1 [[CMP3]], label [[COND_TRUE4:%.*]], label [[COND_FALSE5:%.*]]
+// CHECK2:       cond.true4:
+// CHECK2-NEXT:    br label [[COND_END12:%.*]]
+// CHECK2:       cond.false5:
+// CHECK2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK2-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP7]], 0
+// CHECK2-NEXT:    br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
+// CHECK2:       cond.true7:
+// CHECK2-NEXT:    br label [[COND_END9:%.*]]
+// CHECK2:       cond.false8:
+// CHECK2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK2-NEXT:    br label [[COND_END9]]
+// CHECK2:       cond.end9:
+// CHECK2-NEXT:    [[COND10:%.*]] = phi i32 [ 1, [[COND_TRUE7]] ], [ [[TMP8]], [[COND_FALSE8]] ]
+// CHECK2-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP6]], [[COND10]]
+// CHECK2-NEXT:    br label [[COND_END12]]
+// CHECK2:       cond.end12:
+// CHECK2-NEXT:    [[COND13:%.*]] = phi i32 [ 4, [[COND_TRUE4]] ], [ [[ADD11]], [[COND_END9]] ]
+// CHECK2-NEXT:    [[CMP14:%.*]] = icmp slt i32 [[TMP2]], [[COND13]]
+// CHECK2-NEXT:    br i1 [[CMP14]], label [[FOR_BODY15:%.*]], label [[FOR_END:%.*]]
+// CHECK2:       for.body15:
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK2-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 3
+// CHECK2-NEXT:    [[ADD16:%.*]] = add nsw i32 7, [[MUL]]
+// CHECK2-NEXT:    store i32 [[ADD16]], ptr [[I]], align 4
+// CHECK2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT:    call void (...) @body(i32 noundef [[TMP10]])
+// CHECK2-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK2:       for.inc:
+// CHECK2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK2-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK2-NEXT:    store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND1]], !llvm.loop [[LOOP21:![0-9]+]]
+// CHECK2:       for.end:
+// CHECK2-NEXT:    br label [[FOR_INC17:%.*]]
+// CHECK2:       for.inc17:
+// CHECK2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK2-NEXT:    [[CMP18:%.*]] = icmp sle i32 [[TMP12]], 0
+// CHECK2-NEXT:    br i1 [[CMP18]], label [[COND_TRUE19:%.*]], label [[COND_FALSE20:%.*]]
+// CHECK2:       cond.true19:
+// CHECK2-NEXT:    br label [[COND_END21:%.*]]
+// CHECK2:       cond.false20:
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK2-NEXT:    br label [[COND_END21]]
+// CHECK2:       cond.end21:
+// CHECK2-NEXT:    [[COND22:%.*]] = phi i32 [ 1, [[COND_TRUE19]] ], [ [[TMP13]], [[COND_FALSE20]] ]
+// CHECK2-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP14]], [[COND22]]
+// CHECK2-NEXT:    store i32 [[ADD23]], ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
+// CHECK2:       for.end24:
+// CHECK2-NEXT:    ret void
+//
+//
 // CHECK2-LABEL: define {{[^@]+}}@tfoo7
-// CHECK2-SAME: () #[[ATTR2]] {
+// CHECK2-SAME: () #[[ATTR1]] {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    call void @_Z4foo7IiTnT_Li3ETnS0_Li5EEvS0_S0_(i32 noundef 0, i32 noundef 42)
 // CHECK2-NEXT:    ret void
 //
 //
 // CHECK2-LABEL: define {{[^@]+}}@_Z4foo7IiTnT_Li3ETnS0_Li5EEvS0_S0_
-// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]]) #[[ATTR2]] comdat {
+// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]]) #[[ATTR1]] comdat {
 // CHECK2-NEXT:  entry:
 // CHECK2-NEXT:    [[START_ADDR:%.*]] = alloca i32, align 4
 // CHECK2-NEXT:    [[END_ADDR:%.*]] = alloca i32, align 4
@@ -2053,14 +2239,14 @@ extern "C" void tfoo7() {
 // CHECK2-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
 // CHECK2-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP16]], 1
 // CHECK2-NEXT:    store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT:    br label [[FOR_COND6]], !llvm.loop [[LOOP21:![0-9]+]]
+// CHECK2-NEXT:    br label [[FOR_COND6]], !llvm.loop [[LOOP23:![0-9]+]]
 // CHECK2:       for.end:
 // CHECK2-NEXT:    br label [[FOR_INC15:%.*]]
 // CHECK2:       for.inc15:
 // CHECK2-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
 // CHECK2-NEXT:    [[ADD16:%.*]] = add nsw i32 [[TMP17]], 5
 // CHECK2-NEXT:    store i32 [[ADD16]], ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
+// CHECK2-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP24:![0-9]+]]
 // CHECK2:       for.end17:
 // CHECK2-NEXT:    ret void
 //
diff --git a/clang/test/OpenMP/tile_messages.cpp b/clang/test/OpenMP/tile_messages.cpp
index adeef617b75c..5268dfe97e0c 100644
--- a/clang/test/OpenMP/tile_messages.cpp
+++ b/clang/test/OpenMP/tile_messages.cpp
@@ -43,13 +43,7 @@ void func() {
 
   // expected-error@+1 {{argument to 'sizes' clause must be a strictly positive integer value}}
   #pragma omp tile sizes(0)
-    ;
-
-  // expected-error@+4 {{expression is not an integral constant expression}}
-  // expected-note@+3 {{read of non-const variable 'a' is not allowed in a constant expression}}
-  // expected-note@+1 {{declared here}}
-  int a;
-  #pragma omp tile sizes(a)
+  for (int i = 0; i < 7; ++i)
     ;
 
   // expected-warning@+2 {{extra tokens at the end of '#pragma omp tile' are ignored}}
@@ -124,4 +118,46 @@ void func() {
   #pragma omp tile sizes(5)
   for (int i = 0; i/3<7; ++i)
     ;
+
+  // expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'struct S'}}
+  struct S{} s;
+  #pragma omp tile sizes(s)
+  for (int i = 0; i < 7; ++i)
+    ;
+}
+
+
+template <typename T>
+static void templated_func() {
+  // In a template context, but expression itself not instantiation-dependent
+
+  // expected-error@+1 {{argument to 'sizes' clause must be a strictly positive integer value}}
+  #pragma omp tile sizes(0)
+  for (int i = 0; i < 7; ++i)
+    ;
+}
+
+template <int S>
+static void templated_func_value_dependent() {
+  // expected-error@+1 {{argument to 'sizes' clause must be a strictly positive integer value}}
+  #pragma omp tile sizes(S)
+  for (int i = 0; i < 7; ++i)
+    ;
+}
+
+template <typename T>
+static void templated_func_type_dependent() {
+  constexpr T s = 0;
+  // expected-error@+1 {{argument to 'sizes' clause must be a strictly positive integer value}}
+  #pragma omp tile sizes(s)
+  for (int i = 0; i < 7; ++i)
+    ;
+}
+
+void template_inst() {
+  templated_func<int>();
+  // expected-note@+1 {{in instantiation of function template specialization 'templated_func_value_dependent<0>' requested here}}
+  templated_func_value_dependent<0>();
+  // expected-note@+1 {{in instantiation of function template specialization 'templated_func_type_dependent<int>' requested here}}
+  templated_func_type_dependent<int>();
 }
diff --git a/clang/test/OpenMP/x86_target_exceptions.cpp b/clang/test/OpenMP/x86_target_exceptions.cpp
index 5c02bcb92621..490b62441bd1 100644
--- a/clang/test/OpenMP/x86_target_exceptions.cpp
+++ b/clang/test/OpenMP/x86_target_exceptions.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: x86-registered-target, staticanalyzer
 
-// RUN: %clang_cc1 -fopenmp -triple x86_64-pc-linux-gnu -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple x86_64-pc-linux-gnu -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify -Wopenmp-target-exception -analyze
 #pragma omp declare target
 int foo(void) {
 	int error = -1;
diff --git a/clang/test/OpenMP/x86_target_throw.cpp b/clang/test/OpenMP/x86_target_throw.cpp
index a9186bac43d6..c1fef3cdb596 100644
--- a/clang/test/OpenMP/x86_target_throw.cpp
+++ b/clang/test/OpenMP/x86_target_throw.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: x86-registered-target, staticanalyzer
 
-// RUN: %clang_cc1 -fopenmp -triple x86_64-pc-linux-gnu -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple x86_64-pc-linux-gnu -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify -Wopenmp-target-exception -analyze
 #pragma omp declare target
 void foo(void) {
 	throw 404;
diff --git a/clang/test/OpenMP/x86_target_try_catch.cpp b/clang/test/OpenMP/x86_target_try_catch.cpp
index 698af912c6ac..36389a511dc3 100644
--- a/clang/test/OpenMP/x86_target_try_catch.cpp
+++ b/clang/test/OpenMP/x86_target_try_catch.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: x86-registered-target, staticanalyzer
 
-// RUN: %clang_cc1 -fopenmp -triple x86_64-pc-linux-gnu -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -emit-llvm -S -verify -Wopenmp-target-exception -analyze
+// RUN: %clang_cc1 -fopenmp -triple x86_64-pc-linux-gnu -fopenmp-is-target-device -fcxx-exceptions -fexceptions %s -verify -Wopenmp-target-exception -analyze
 #pragma omp declare target
 int foo(void) {
 	int error = -1;
diff --git a/clang/test/Options/HV.hlsl b/clang/test/Options/HV.hlsl
index 9f7e1ebc02f2..f88eb6977f4f 100644
--- a/clang/test/Options/HV.hlsl
+++ b/clang/test/Options/HV.hlsl
@@ -1,20 +1,20 @@
-// RUN: %clang_dxc -T lib_6_4 -HV 2016 %s 2>&1 -###   | FileCheck -check-prefix=2016 %s
-// RUN: %clang_dxc -T lib_6_4 -HV 2017 %s 2>&1 -###   | FileCheck -check-prefix=2017 %s
-// RUN: %clang_dxc -T lib_6_4 /HV 2018 %s 2>&1 -###   | FileCheck -check-prefix=2018 %s
-// RUN: %clang_dxc -T lib_6_4 /HV 2021 %s 2>&1 -###   | FileCheck -check-prefix=2021 %s
-// RUN: %clang_dxc -T lib_6_4 /HV 202x %s 2>&1 -###   | FileCheck -check-prefix=202x %s
-// RUN: %clang_dxc -T lib_6_4 %s 2>&1 -###   | FileCheck -check-prefix=NO_HV %s
-// RUN: not %clang_dxc -T lib_6_4 /HV gibberish -### %s 2>&1 | FileCheck -check-prefix=CHECK-ERR %s
-
-// 2016: "-std=hlsl2016"
-// 2017: "-std=hlsl2017"
-// 2018: "-std=hlsl2018"
-// 2021: "-std=hlsl2021"
-// 202x: "-std=hlsl202x"
-// NO_HV-NOT: "-std="
-// CHECK-ERR: error: invalid value 'gibberish' in 'HV'
-float4 main(float4 a : A) : SV_TARGET
-{
-  return -a.yxxx;
-}
-
+// RUN: %clang_dxc -T lib_6_4 -HV 2016 %s 2>&1 -###   | FileCheck -check-prefix=2016 %s
+// RUN: %clang_dxc -T lib_6_4 -HV 2017 %s 2>&1 -###   | FileCheck -check-prefix=2017 %s
+// RUN: %clang_dxc -T lib_6_4 /HV 2018 %s 2>&1 -###   | FileCheck -check-prefix=2018 %s
+// RUN: %clang_dxc -T lib_6_4 /HV 2021 %s 2>&1 -###   | FileCheck -check-prefix=2021 %s
+// RUN: %clang_dxc -T lib_6_4 /HV 202x %s 2>&1 -###   | FileCheck -check-prefix=202x %s
+// RUN: %clang_dxc -T lib_6_4 %s 2>&1 -###   | FileCheck -check-prefix=NO_HV %s
+// RUN: not %clang_dxc -T lib_6_4 /HV gibberish -### %s 2>&1 | FileCheck -check-prefix=CHECK-ERR %s
+
+// 2016: "-std=hlsl2016"
+// 2017: "-std=hlsl2017"
+// 2018: "-std=hlsl2018"
+// 2021: "-std=hlsl2021"
+// 202x: "-std=hlsl202x"
+// NO_HV-NOT: "-std="
+// CHECK-ERR: error: invalid value 'gibberish' in 'HV'
+float4 main(float4 a : A) : SV_TARGET
+{
+  return -a.yxxx;
+}
+
diff --git a/clang/test/Options/enable_16bit_types_validation.hlsl b/clang/test/Options/enable_16bit_types_validation.hlsl
index 71d336f6f503..bcb217e8982e 100644
--- a/clang/test/Options/enable_16bit_types_validation.hlsl
+++ b/clang/test/Options/enable_16bit_types_validation.hlsl
@@ -1,25 +1,25 @@
-// RUN: not %clang_dxc -enable-16bit-types -T cs_6_0 -HV 2016 %s 2>&1  | FileCheck -check-prefix=both_invalid %s
-// RUN: not %clang_dxc -enable-16bit-types -T lib_6_4 -HV 2017 %s 2>&1 | FileCheck -check-prefix=HV_invalid_2017 %s
-// RUN: not %clang_dxc -enable-16bit-types -T cs_6_0 /HV 2021 %s 2>&1  | FileCheck -check-prefix=TP_invalid %s
-// RUN: %clang_dxc -enable-16bit-types -T lib_6_4 /HV 2018 %s 2>&1 -###   | FileCheck -check-prefix=valid_2018 %s
-// RUN: %clang_dxc -enable-16bit-types -T lib_6_4 /HV 2021 %s 2>&1 -###   | FileCheck -check-prefix=valid_2021 %s
-
-
-// both_invalid: error: '-enable-16bit-types' option requires target HLSL Version >= 2018 and shader model >= 6.2, but HLSL Version is 'hlsl2016' and shader model is '6.0'
-// HV_invalid_2017: error: '-enable-16bit-types' option requires target HLSL Version >= 2018 and shader model >= 6.2, but HLSL Version is 'hlsl2017' and shader model is '6.4'
-// TP_invalid: error: '-enable-16bit-types' option requires target HLSL Version >= 2018 and shader model >= 6.2, but HLSL Version is 'hlsl2021' and shader model is '6.0'
-
-// valid_2021: "dxil-unknown-shadermodel6.4-library"
-// valid_2021-SAME: "-std=hlsl2021"
-// valid_2021-SAME: "-fnative-half-type"
-
-// valid_2018: "dxil-unknown-shadermodel6.4-library"
-// valid_2018-SAME: "-std=hlsl2018"
-// valid_2018-SAME: "-fnative-half-type"
-
-[numthreads(1,1,1)]
-void main()
-{
-  return;
-}
-
+// RUN: not %clang_dxc -enable-16bit-types -T cs_6_0 -HV 2016 %s 2>&1  | FileCheck -check-prefix=both_invalid %s
+// RUN: not %clang_dxc -enable-16bit-types -T lib_6_4 -HV 2017 %s 2>&1 | FileCheck -check-prefix=HV_invalid_2017 %s
+// RUN: not %clang_dxc -enable-16bit-types -T cs_6_0 /HV 2021 %s 2>&1  | FileCheck -check-prefix=TP_invalid %s
+// RUN: %clang_dxc -enable-16bit-types -T lib_6_4 /HV 2018 %s 2>&1 -###   | FileCheck -check-prefix=valid_2018 %s
+// RUN: %clang_dxc -enable-16bit-types -T lib_6_4 /HV 2021 %s 2>&1 -###   | FileCheck -check-prefix=valid_2021 %s
+
+
+// both_invalid: error: '-enable-16bit-types' option requires target HLSL Version >= 2018 and shader model >= 6.2, but HLSL Version is 'hlsl2016' and shader model is '6.0'
+// HV_invalid_2017: error: '-enable-16bit-types' option requires target HLSL Version >= 2018 and shader model >= 6.2, but HLSL Version is 'hlsl2017' and shader model is '6.4'
+// TP_invalid: error: '-enable-16bit-types' option requires target HLSL Version >= 2018 and shader model >= 6.2, but HLSL Version is 'hlsl2021' and shader model is '6.0'
+
+// valid_2021: "dxilv1.4-unknown-shadermodel6.4-library"
+// valid_2021-SAME: "-std=hlsl2021"
+// valid_2021-SAME: "-fnative-half-type"
+
+// valid_2018: "dxilv1.4-unknown-shadermodel6.4-library"
+// valid_2018-SAME: "-std=hlsl2018"
+// valid_2018-SAME: "-fnative-half-type"
+
+[numthreads(1,1,1)]
+void main()
+{
+  return;
+}
+
diff --git a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
index a9700ef87a27..aeb7a8369f40 100644
--- a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
+++ b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
@@ -1,14 +1,14 @@
-// RUN: not %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2016 -fnative-half-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=SPIRV
-// RUN: %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2021 -fnative-half-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=valid
-
-// SPIRV: error: '-fnative-half-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016'
-
-// valid: "spirv-unknown-vulkan-library"
-// valid: define spir_func void @main() #0 {
-
-[numthreads(1,1,1)]
-void main()
-{
-  return;
-}
-
+// RUN: not %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2016 -fnative-half-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=SPIRV
+// RUN: %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2021 -fnative-half-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=valid
+
+// SPIRV: error: '-fnative-half-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016'
+
+// valid: "spirv-unknown-vulkan-library"
+// valid: define spir_func void @main() #0 {
+
+[numthreads(1,1,1)]
+void main()
+{
+  return;
+}
+
diff --git a/clang/test/PCH/arc.m b/clang/test/PCH/arc.m
index 63c77778f40a..32069e231416 100644
--- a/clang/test/PCH/arc.m
+++ b/clang/test/PCH/arc.m
@@ -1,15 +1,15 @@
 // REQUIRES: x86-registered-target
 // Test this without pch.
-// RUN: %clang_cc1 -fblocks -triple x86_64-apple-darwin11 -fobjc-arc -include %S/Inputs/arc.h -fsyntax-only -emit-llvm-only %s
+// RUN: %clang_cc1 -fblocks -triple x86_64-apple-darwin11 -fobjc-arc -include %S/Inputs/arc.h -emit-llvm-only %s
 
 // Test with pch.
 // RUN: %clang_cc1 -emit-pch -fblocks -triple x86_64-apple-darwin11 -fobjc-arc -x objective-c-header -o %t %S/Inputs/arc.h
-// RUN: %clang_cc1 -fblocks -triple x86_64-apple-darwin11 -fobjc-arc -include-pch %t -fsyntax-only -emit-llvm-only %s 
+// RUN: %clang_cc1 -fblocks -triple x86_64-apple-darwin11 -fobjc-arc -include-pch %t -emit-llvm-only %s 
 
 // Test error when pch's -fobjc-arc state is different.
-// RUN: not %clang_cc1 -fblocks -triple x86_64-apple-darwin11 -include-pch %t -fsyntax-only -emit-llvm-only %s 2>&1 | FileCheck -check-prefix=CHECK-ERR1 %s 
+// RUN: not %clang_cc1 -fblocks -triple x86_64-apple-darwin11 -include-pch %t -emit-llvm-only %s 2>&1 | FileCheck -check-prefix=CHECK-ERR1 %s 
 // RUN: %clang_cc1 -emit-pch -fblocks -triple x86_64-apple-darwin11 -x objective-c-header -o %t %S/Inputs/arc.h
-// RUN: not %clang_cc1 -fblocks -triple x86_64-apple-darwin11 -fobjc-arc -include-pch %t -fsyntax-only -emit-llvm-only %s 2>&1 | FileCheck -check-prefix=CHECK-ERR2 %s
+// RUN: not %clang_cc1 -fblocks -triple x86_64-apple-darwin11 -fobjc-arc -include-pch %t -emit-llvm-only %s 2>&1 | FileCheck -check-prefix=CHECK-ERR2 %s
 
 array0 a0;
 array1 a1;
diff --git a/clang/test/PCH/blocks.c b/clang/test/PCH/blocks.c
index e7498865bd88..496e415b7884 100644
--- a/clang/test/PCH/blocks.c
+++ b/clang/test/PCH/blocks.c
@@ -1,9 +1,9 @@
 // Test this without pch.
-// RUN: %clang_cc1 -fblocks -include %S/blocks.h -fsyntax-only -emit-llvm -o - %s
+// RUN: %clang_cc1 -fblocks -include %S/blocks.h -emit-llvm -o - %s
 
 // Test with pch.
 // RUN: %clang_cc1 -emit-pch -fblocks -o %t %S/blocks.h
-// RUN: %clang_cc1 -fblocks -include-pch %t -fsyntax-only -emit-llvm -o - %s 
+// RUN: %clang_cc1 -fblocks -include-pch %t -emit-llvm -o - %s 
 
 int do_add(int x, int y) { return add(x, y); }
 
diff --git a/clang/test/PCH/chain-openmp-threadprivate.cpp b/clang/test/PCH/chain-openmp-threadprivate.cpp
index 05cd65063789..21b9f6868cc3 100644
--- a/clang/test/PCH/chain-openmp-threadprivate.cpp
+++ b/clang/test/PCH/chain-openmp-threadprivate.cpp
@@ -8,6 +8,7 @@
 // with PCH
 // RUN: %clang_cc1 -fopenmp -emit-llvm -chain-include %s -chain-include %s %s -o - | FileCheck %s -check-prefix=CHECK-TLS-1
 // RUN: %clang_cc1 -fopenmp -emit-llvm -chain-include %s -chain-include %s %s -o - | FileCheck %s -check-prefix=CHECK-TLS-2
+// // UNSUPPORTED: target={{.*}}-zos{{.*}}
 
 #if !defined(PASS1)
 #define PASS1
diff --git a/clang/test/PCH/cxx-alias-decl.cpp b/clang/test/PCH/cxx-alias-decl.cpp
index 2fdf40c0ae83..f2b4b79134a1 100644
--- a/clang/test/PCH/cxx-alias-decl.cpp
+++ b/clang/test/PCH/cxx-alias-decl.cpp
@@ -1,12 +1,12 @@
 // Test this without pch.
-// RUN: %clang_cc1 -x c++ -std=c++11 -include %S/cxx-alias-decl.h -fsyntax-only -emit-llvm -o - %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -include %S/cxx-alias-decl.h -emit-llvm -o - %s
 
 // Test with pch.
 // RUN: %clang_cc1 -x c++ -std=c++11 -emit-pch -o %t %S/cxx-alias-decl.h
-// RUN: %clang_cc1 -x c++ -std=c++11 -include-pch %t -fsyntax-only -emit-llvm -o - %s 
+// RUN: %clang_cc1 -x c++ -std=c++11 -include-pch %t -emit-llvm -o - %s 
 
 // RUN: %clang_cc1 -x c++ -std=c++11 -emit-pch -fpch-instantiate-templates -o %t %S/cxx-alias-decl.h
-// RUN: %clang_cc1 -x c++ -std=c++11 -include-pch %t -fsyntax-only -emit-llvm -o - %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -include-pch %t -emit-llvm -o - %s
 
 template struct T<S>;
 C<A>::A<char> a;
diff --git a/clang/test/PCH/cxx-for-range.cpp b/clang/test/PCH/cxx-for-range.cpp
index 48310dbc55ce..285442faf4bd 100644
--- a/clang/test/PCH/cxx-for-range.cpp
+++ b/clang/test/PCH/cxx-for-range.cpp
@@ -1,9 +1,9 @@
 // Test this without pch.
-// RUN: %clang_cc1 -x c++ -std=c++11 -include %S/cxx-for-range.h -fsyntax-only -emit-llvm -o - %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -include %S/cxx-for-range.h -emit-llvm -o - %s
 
 // Test with pch.
 // RUN: %clang_cc1 -x c++ -std=c++11 -emit-pch -o %t %S/cxx-for-range.h
-// RUN: %clang_cc1 -x c++ -std=c++11 -include-pch %t -fsyntax-only -emit-llvm -o - %s 
+// RUN: %clang_cc1 -x c++ -std=c++11 -include-pch %t -emit-llvm -o - %s 
 
 void h() {
   f();
diff --git a/clang/test/PCH/cxx-member-init.cpp b/clang/test/PCH/cxx-member-init.cpp
index 1bced567b9de..52ea7bb174a0 100644
--- a/clang/test/PCH/cxx-member-init.cpp
+++ b/clang/test/PCH/cxx-member-init.cpp
@@ -1,12 +1,12 @@
 // Test this without pch.
-// RUN: %clang_cc1 -x c++ -std=c++11 -DHEADER -DSOURCE -fsyntax-only -emit-llvm -o - %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -DHEADER -DSOURCE -emit-llvm -o - %s
 
 // Test with pch.
 // RUN: %clang_cc1 -x c++ -std=c++11 -DHEADER -emit-pch -o %t %s
-// RUN: %clang_cc1 -x c++ -std=c++11 -DHEADER -include-pch %t -fsyntax-only -emit-llvm -o - %s 
+// RUN: %clang_cc1 -x c++ -std=c++11 -DHEADER -include-pch %t -emit-llvm -o - %s 
 
 // RUN: %clang_cc1 -x c++ -std=c++11 -DHEADER -emit-pch -fpch-instantiate-templates -o %t %s
-// RUN: %clang_cc1 -x c++ -std=c++11 -DHEADER -include-pch %t -fsyntax-only -emit-llvm -o - %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -DHEADER -include-pch %t -emit-llvm -o - %s
 
 #ifdef HEADER
 int n;
diff --git a/clang/test/PCH/cxx-namespaces.cpp b/clang/test/PCH/cxx-namespaces.cpp
index d1bf98b4ca17..ebbc387746ec 100644
--- a/clang/test/PCH/cxx-namespaces.cpp
+++ b/clang/test/PCH/cxx-namespaces.cpp
@@ -4,12 +4,12 @@
 // Test with pch.
 // RUN: %clang_cc1 -x c++-header -emit-pch -o %t %S/cxx-namespaces.h
 // RUN: %clang_cc1 -include-pch %t -fsyntax-only -verify %s
-// RUN: %clang_cc1 -include-pch %t -fsyntax-only -ast-dump-lookups -ast-dump-filter N %s | FileCheck %s
+// RUN: %clang_cc1 -include-pch %t -ast-dump-lookups -ast-dump-filter N %s | FileCheck %s
 
 // Test with modules.
 // RUN: %clang_cc1 -fmodules -x c++-header -emit-pch -o %t %S/cxx-namespaces.h
 // RUN: %clang_cc1 -fmodules -include-pch %t -fsyntax-only -verify %s
-// RUN: %clang_cc1 -fmodules -include-pch %t -fsyntax-only -ast-dump-lookups -ast-dump-filter N %s | FileCheck %s
+// RUN: %clang_cc1 -fmodules -include-pch %t -ast-dump-lookups -ast-dump-filter N %s | FileCheck %s
 
 // expected-no-diagnostics
 
diff --git a/clang/test/PCH/cxx-reference.cpp b/clang/test/PCH/cxx-reference.cpp
index becb9356731b..7dc8363e02f5 100644
--- a/clang/test/PCH/cxx-reference.cpp
+++ b/clang/test/PCH/cxx-reference.cpp
@@ -1,6 +1,6 @@
 // Test this without pch.
-// RUN: %clang_cc1 -x c++ -triple %itanium_abi_triple -std=c++11 -include %S/cxx-reference.h -fsyntax-only -emit-llvm -o - %s
+// RUN: %clang_cc1 -x c++ -triple %itanium_abi_triple -std=c++11 -include %S/cxx-reference.h -emit-llvm -o - %s
 
 // Test with pch.
 // RUN: %clang_cc1 -x c++ -triple %itanium_abi_triple -std=c++11 -emit-pch -o %t %S/cxx-reference.h
-// RUN: %clang_cc1 -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -fsyntax-only -emit-llvm -o - %s 
+// RUN: %clang_cc1 -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -emit-llvm -o - %s 
diff --git a/clang/test/PCH/cxx1z-init-statement.cpp b/clang/test/PCH/cxx1z-init-statement.cpp
index d08fb7c56b71..bd478829d2db 100644
--- a/clang/test/PCH/cxx1z-init-statement.cpp
+++ b/clang/test/PCH/cxx1z-init-statement.cpp
@@ -1,9 +1,9 @@
 // Test this without pch.
-// RUN: %clang_cc1 -std=c++1z -include %S/cxx1z-init-statement.h -fsyntax-only -emit-llvm -o - %s
+// RUN: %clang_cc1 -std=c++1z -include %S/cxx1z-init-statement.h -emit-llvm -o - %s
 
 // Test with pch.
 // RUN: %clang_cc1 -x c++ -std=c++1z -emit-pch -o %t %S/cxx1z-init-statement.h
-// RUN: %clang_cc1 -std=c++1z -include-pch %t -fsyntax-only -emit-llvm -o - %s 
+// RUN: %clang_cc1 -std=c++1z -include-pch %t -emit-llvm -o - %s 
 
 void g0(void) {
   static_assert(test_if(-1) == -1, "");
diff --git a/clang/test/PCH/cxx_exprs.cpp b/clang/test/PCH/cxx_exprs.cpp
index c901bd7fc71c..0b3194e9939d 100644
--- a/clang/test/PCH/cxx_exprs.cpp
+++ b/clang/test/PCH/cxx_exprs.cpp
@@ -1,9 +1,9 @@
 // Test this without pch.
-// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -include %S/cxx_exprs.h -std=c++11 -fsyntax-only -verify %s -ast-dump | FileCheck %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -include %S/cxx_exprs.h -std=c++11 -verify %s -ast-dump | FileCheck %s
 
 // Test with pch. Use '-ast-dump' to force deserialization of function bodies.
 // RUN: %clang_cc1 -fcxx-exceptions -fexceptions -x c++-header -std=c++11 -emit-pch -o %t %S/cxx_exprs.h
-// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-dump-all | FileCheck %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -std=c++11 -include-pch %t -verify %s -ast-dump-all | FileCheck %s
 
 // expected-no-diagnostics
 
diff --git a/clang/test/PCH/cxx_paren_init.cpp b/clang/test/PCH/cxx_paren_init.cpp
index 3150f8d9835d..9731ea7737c1 100644
--- a/clang/test/PCH/cxx_paren_init.cpp
+++ b/clang/test/PCH/cxx_paren_init.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -x c++ -std=c++20 -triple x86_64-unknown-linux-gnu -emit-pch -o %t %S/cxx_paren_init.h
-// RUN: %clang_cc1 -x c++ -std=c++20 -triple x86_64-unknown-linux-gnu -include-pch %t %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -std=c++20 -triple x86_64-unknown-linux-gnu -include-pch %t %s -emit-llvm -o - | FileCheck %s
 
 // CHECK-DAG: [[STRUCT_S:%.*]] = type { i32, i32 }
 // CHECK-DAG: @{{.*s.*}} = {{(dso_local )?}}global [[STRUCT_S]] { i32 1, i32 2 }, align 4
diff --git a/clang/test/PCH/empty-with-headers.c b/clang/test/PCH/empty-with-headers.c
index b51f0ce25848..18f0266cef52 100644
--- a/clang/test/PCH/empty-with-headers.c
+++ b/clang/test/PCH/empty-with-headers.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -std=c99 -pedantic-errors %s
-// RUN: %clang_cc1 -fsyntax-only -std=c99 -emit-pch -o %t %s
+// RUN: %clang_cc1 -std=c99 -emit-pch -o %t %s
 // RUN: %clang_cc1 -fsyntax-only -std=c99 -pedantic-errors -include-pch %t %s
 
 // RUN: %clang_cc1 -fsyntax-only -std=c99 -pedantic-errors -DINCLUDED %s -verify
diff --git a/clang/test/PCH/fixed-point-literal.c b/clang/test/PCH/fixed-point-literal.c
index 996d8c117089..de5e25adce5e 100644
--- a/clang/test/PCH/fixed-point-literal.c
+++ b/clang/test/PCH/fixed-point-literal.c
@@ -1,10 +1,10 @@
 
 // Test this without pch.
-// RUN: %clang_cc1 -ffixed-point -include %S/Inputs/fixed-point-literal.h -fsyntax-only -ast-print -o - %s | FileCheck %s
+// RUN: %clang_cc1 -ffixed-point -include %S/Inputs/fixed-point-literal.h -ast-print -o - %s | FileCheck %s
 
 // Test with pch.
 // RUN: %clang_cc1 -ffixed-point -emit-pch -o %t %S/Inputs/fixed-point-literal.h
-// RUN: %clang_cc1 -ffixed-point -include-pch %t -fsyntax-only -ast-print -o - %s | FileCheck %s
+// RUN: %clang_cc1 -ffixed-point -include-pch %t -ast-print -o - %s | FileCheck %s
 
 // CHECK: const short _Fract sf = -0.25r;
 // CHECK: const _Fract f = 0.75r;
diff --git a/clang/test/PCH/local_static.cpp b/clang/test/PCH/local_static.cpp
index d198d84e0b98..92d75da9c204 100644
--- a/clang/test/PCH/local_static.cpp
+++ b/clang/test/PCH/local_static.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: x86-registered-target
 // Test this without PCH.
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9.0 -include %S/local_static.h -fsyntax-only %s -emit-llvm -o %t.no_pch.ll %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9.0 -include %S/local_static.h %s -emit-llvm -o %t.no_pch.ll %s
 // RUN: FileCheck --input-file %t.no_pch.ll %s
 
 // Test with PCH.
diff --git a/clang/test/PCH/ms-if-exists.cpp b/clang/test/PCH/ms-if-exists.cpp
index c875b1db7245..36c21832982e 100644
--- a/clang/test/PCH/ms-if-exists.cpp
+++ b/clang/test/PCH/ms-if-exists.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -x c++ -fms-extensions -fsyntax-only -emit-pch -o %t %s
+// RUN: %clang_cc1 -x c++ -fms-extensions -emit-pch -o %t %s
 // RUN: %clang_cc1 -x c++ -fms-extensions -fsyntax-only -include-pch %t %s -verify
 
-// RUN: %clang_cc1 -x c++ -fms-extensions -fsyntax-only -emit-pch -fpch-instantiate-templates -o %t %s
+// RUN: %clang_cc1 -x c++ -fms-extensions -emit-pch -fpch-instantiate-templates -o %t %s
 // RUN: %clang_cc1 -x c++ -fms-extensions -fsyntax-only -include-pch %t %s -verify
 
 #ifndef HEADER
diff --git a/clang/test/PCH/multiple_decls.c b/clang/test/PCH/multiple_decls.c
index e2cc552336ea..376210e71a77 100644
--- a/clang/test/PCH/multiple_decls.c
+++ b/clang/test/PCH/multiple_decls.c
@@ -1,9 +1,9 @@
 // Test this without pch.
-// RUN: %clang_cc1 -include %S/multiple_decls.h -fsyntax-only -ast-print -o - %s
+// RUN: %clang_cc1 -include %S/multiple_decls.h -ast-print -o - %s
 
 // Test with pch.
 // RUN: %clang_cc1 -emit-pch -o %t %S/multiple_decls.h
-// RUN: %clang_cc1 -include-pch %t -fsyntax-only -ast-print -o - %s 
+// RUN: %clang_cc1 -include-pch %t -ast-print -o - %s 
 
 void f0(char c) {
   wide(c);
diff --git a/clang/test/PCH/ocl_types.cl b/clang/test/PCH/ocl_types.cl
index 60323e1609a7..1f51ce50361f 100644
--- a/clang/test/PCH/ocl_types.cl
+++ b/clang/test/PCH/ocl_types.cl
@@ -3,7 +3,7 @@
 
 // Test with pch.
 // RUN: %clang_cc1 -triple spir-unknown-unknown -x cl -emit-pch -o %t %S/ocl_types.h -cl-std=CL2.0 -D__OPENCL_VERSION__=200
-// RUN: %clang_cc1 -triple spir-unknown-unknown -include-pch %t -fsyntax-only %s -ast-print -cl-std=CL2.0 -D__OPENCL_VERSION__=200
+// RUN: %clang_cc1 -triple spir-unknown-unknown -include-pch %t %s -ast-print -cl-std=CL2.0 -D__OPENCL_VERSION__=200
 
 void foo1(img1d_t img);
 
diff --git a/clang/test/PCH/rdar10830559.cpp b/clang/test/PCH/rdar10830559.cpp
index 90aba9ce261d..8a5571a6d6fb 100644
--- a/clang/test/PCH/rdar10830559.cpp
+++ b/clang/test/PCH/rdar10830559.cpp
@@ -1,5 +1,5 @@
 // Test this without pch.
-// RUN: %clang_cc1 -fsyntax-only -emit-llvm-only %s
+// RUN: %clang_cc1 -emit-llvm-only %s
 
 // Test with pch.
 // RUN: touch %t.empty.cpp
diff --git a/clang/test/PCH/stmt-openmp_structured_block-bit.cpp b/clang/test/PCH/stmt-openmp_structured_block-bit.cpp
index cb2d3638c5bd..8c78a69d1722 100644
--- a/clang/test/PCH/stmt-openmp_structured_block-bit.cpp
+++ b/clang/test/PCH/stmt-openmp_structured_block-bit.cpp
@@ -1,9 +1,9 @@
 // Test this without pch.
-// RUN: %clang_cc1 -std=c++11 -fopenmp -fsyntax-only -verify %s -ast-dump-all | FileCheck %s -implicit-check-not=openmp_structured_block
+// RUN: %clang_cc1 -std=c++11 -fopenmp -verify %s -ast-dump-all | FileCheck %s -implicit-check-not=openmp_structured_block
 
 // Test with pch. Use '-ast-dump' to force deserialization of function bodies.
 // RUN: %clang_cc1 -std=c++11 -fopenmp -emit-pch -o %t %s
-// RUN: echo "// expected-no-diagnostics" | %clang_cc1 -x c++ -std=c++11 -include-pch %t -fopenmp -fsyntax-only -verify - -ast-dump-all | FileCheck %s -implicit-check-not=openmp_structured_block
+// RUN: echo "// expected-no-diagnostics" | %clang_cc1 -x c++ -std=c++11 -include-pch %t -fopenmp -verify - -ast-dump-all | FileCheck %s -implicit-check-not=openmp_structured_block
 
 void test() {
 #pragma omp parallel
diff --git a/clang/test/PCH/stmts.c b/clang/test/PCH/stmts.c
index 6def453c86e4..21c6605d592c 100644
--- a/clang/test/PCH/stmts.c
+++ b/clang/test/PCH/stmts.c
@@ -1,9 +1,9 @@
 // Test this without pch.
-// RUN: %clang_cc1 -include %S/stmts.h -fsyntax-only -emit-llvm -o - %s
+// RUN: %clang_cc1 -include %S/stmts.h -emit-llvm -o - %s
 
 // Test with pch.
 // RUN: %clang_cc1 -emit-pch -o %t %S/stmts.h
-// RUN: %clang_cc1 -include-pch %t -fsyntax-only -emit-llvm -o - %s 
+// RUN: %clang_cc1 -include-pch %t -emit-llvm -o - %s 
 
 void g0(void) { f0(5); }
 int g1(int x) { return f1(x); }
diff --git a/clang/test/PCH/types.c b/clang/test/PCH/types.c
index 45f71611175e..a9617b6a7cc6 100644
--- a/clang/test/PCH/types.c
+++ b/clang/test/PCH/types.c
@@ -3,7 +3,7 @@
 
 // Test with pch.
 // RUN: %clang_cc1 -emit-pch -fblocks -Wno-strict-prototypes -o %t %S/types.h
-// RUN: %clang_cc1 -fblocks -include-pch %t -fsyntax-only -verify -Wno-strict-prototypes %s -ast-print
+// RUN: %clang_cc1 -fblocks -include-pch %t -verify -Wno-strict-prototypes %s -ast-print
 
 typedef int INT;
 INT int_value;
diff --git a/clang/test/PCH/uuidof.cpp b/clang/test/PCH/uuidof.cpp
index 207a8dafee3b..baccd97ee6d8 100644
--- a/clang/test/PCH/uuidof.cpp
+++ b/clang/test/PCH/uuidof.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fms-extensions -x c++-header -emit-pch -o %t %s
-// RUN: %clang_cc1 -fms-extensions -include-pch %t -fsyntax-only %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fms-extensions -include-pch %t %s -emit-llvm -o - | FileCheck %s
 
 #ifndef HEADER
 #define HEADER
diff --git a/clang/test/Parser/extra-semi.cpp b/clang/test/Parser/extra-semi.cpp
index 7287f856d8c9..c08756149ef7 100644
--- a/clang/test/Parser/extra-semi.cpp
+++ b/clang/test/Parser/extra-semi.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
 // RUN: cp %s %t.cpp
-// RUN: not %clang_cc1 -fsyntax-only %t.cpp -fixit
+// RUN: not %clang_cc1 %t.cpp -fixit
 // RUN: %clang_cc1 -fsyntax-only %t.cpp
 
 void test1(int a;) { // expected-error{{unexpected ';' before ')'}}
diff --git a/clang/test/Parser/objc-attr.m b/clang/test/Parser/objc-attr.m
index e214cf574a4a..b29ddf5fbde5 100644
--- a/clang/test/Parser/objc-attr.m
+++ b/clang/test/Parser/objc-attr.m
@@ -1,28 +1,28 @@
-// RUN: %clang_cc1 -fsyntax-only -triple x86_64-apple-macosx10.10.0 -verify %s
-// expected-no-diagnostics
-
-@interface NSObject
-@end
-
-[[clang::objc_exception]]
-@interface Foo {
-  [[clang::iboutlet]] NSObject *h;
-}
-@property (readonly) [[clang::objc_returns_inner_pointer]] void *i, *j;
-@property (readonly) [[clang::iboutlet]] NSObject *k;
-@end
-
-[[clang::objc_runtime_name("name")]] @protocol Bar;
-
-[[clang::objc_protocol_requires_explicit_implementation]] 
-@protocol Baz
-@end
-
-@interface Quux
--(void)g1 [[clang::ns_consumes_self]];
--(void)g2 __attribute__((ns_consumes_self));
--(void)h1: (int)x [[clang::ns_consumes_self]];
--(void)h2: (int)x __attribute__((ns_consumes_self));
--(void) [[clang::ns_consumes_self]] i1;
--(void) __attribute__((ns_consumes_self)) i2;
-@end
+// RUN: %clang_cc1 -fsyntax-only -triple x86_64-apple-macosx10.10.0 -verify %s
+// expected-no-diagnostics
+
+@interface NSObject
+@end
+
+[[clang::objc_exception]]
+@interface Foo {
+  [[clang::iboutlet]] NSObject *h;
+}
+@property (readonly) [[clang::objc_returns_inner_pointer]] void *i, *j;
+@property (readonly) [[clang::iboutlet]] NSObject *k;
+@end
+
+[[clang::objc_runtime_name("name")]] @protocol Bar;
+
+[[clang::objc_protocol_requires_explicit_implementation]]
+@protocol Baz
+@end
+
+@interface Quux
+-(void)g1 [[clang::ns_consumes_self]];
+-(void)g2 __attribute__((ns_consumes_self));
+-(void)h1: (int)x [[clang::ns_consumes_self]];
+-(void)h2: (int)x __attribute__((ns_consumes_self));
+-(void) [[clang::ns_consumes_self]] i1;
+-(void) __attribute__((ns_consumes_self)) i2;
+@end
diff --git a/clang/test/ParserOpenACC/parse-cache-construct.c b/clang/test/ParserOpenACC/parse-cache-construct.c
index de26fc2b277a..8937aa095d5e 100644
--- a/clang/test/ParserOpenACC/parse-cache-construct.c
+++ b/clang/test/ParserOpenACC/parse-cache-construct.c
@@ -134,9 +134,8 @@ void func() {
   }
 
   for (int i = 0; i < 10; ++i) {
-    // expected-error@+2{{expected expression}}
     // expected-warning@+1{{OpenACC construct 'cache' not yet implemented, pragma ignored}}
-    #pragma acc cache(readonly:ArrayPtr[5:])
+    #pragma acc cache(readonly:ArrayPtr[5:1])
   }
 
   for (int i = 0; i < 10; ++i) {
diff --git a/clang/test/ParserOpenACC/parse-cache-construct.cpp b/clang/test/ParserOpenACC/parse-cache-construct.cpp
index f1c71e8b5847..374fe2697b63 100644
--- a/clang/test/ParserOpenACC/parse-cache-construct.cpp
+++ b/clang/test/ParserOpenACC/parse-cache-construct.cpp
@@ -74,12 +74,12 @@ void use() {
   for (int i = 0; i < 10; ++i) {
     // expected-error@+2{{OpenACC sub-array is not allowed here}}
     // expected-warning@+1{{OpenACC construct 'cache' not yet implemented, pragma ignored}}
-    #pragma acc cache(Arrs.MemArr[3:4].array[1:4])
+    #pragma acc cache(Arrs.MemArr[2:1].array[1:4])
   }
   for (int i = 0; i < 10; ++i) {
     // expected-error@+2{{OpenACC sub-array is not allowed here}}
     // expected-warning@+1{{OpenACC construct 'cache' not yet implemented, pragma ignored}}
-    #pragma acc cache(Arrs.MemArr[3:4].array[4])
+    #pragma acc cache(Arrs.MemArr[2:1].array[4])
   }
   for (int i = 0; i < 10; ++i) {
     // expected-error@+3{{expected ']'}}
@@ -88,7 +88,7 @@ void use() {
     #pragma acc cache(Arrs.MemArr[3:4:].array[4])
   }
   for (int i = 0; i < 10; ++i) {
-    // expected-error@+2{{expected expression}}
+    // expected-error@+2{{OpenACC sub-array is not allowed here}}
     // expected-warning@+1{{OpenACC construct 'cache' not yet implemented, pragma ignored}}
     #pragma acc cache(Arrs.MemArr[:].array[4])
   }
@@ -105,7 +105,7 @@ void use() {
     #pragma acc cache(Arrs.MemArr[: :].array[4])
   }
   for (int i = 0; i < 10; ++i) {
-    // expected-error@+2{{expected expression}}
+    // expected-error@+2{{OpenACC sub-array is not allowed here}}
     // expected-warning@+1{{OpenACC construct 'cache' not yet implemented, pragma ignored}}
     #pragma acc cache(Arrs.MemArr[3:].array[4])
   }
diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c
index 8a439a5ccd4b..51858b441e93 100644
--- a/clang/test/ParserOpenACC/parse-clauses.c
+++ b/clang/test/ParserOpenACC/parse-clauses.c
@@ -453,13 +453,11 @@ void VarListClauses() {
 #pragma acc serial copy(, seq
   for(;;){}
 
-  // expected-error@+2{{expected expression}}
-  // expected-warning@+1{{OpenACC clause 'copy' not yet implemented, clause ignored}}
+  // expected-error@+1{{expected expression}}
 #pragma acc serial copy()
   for(;;){}
 
-  // expected-error@+3{{expected expression}}
-  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
+  // expected-error@+2{{expected expression}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(), seq
   for(;;){}
@@ -467,64 +465,60 @@ void VarListClauses() {
   struct Members s;
   struct HasMembersArray HasMem;
 
-  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(s.array[s.value]), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(s.array[s.value], s.array[s.value :5] ), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(HasMem.MemArr[3].array[1]), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(HasMem.MemArr[3].array[1:4]), seq
   for(;;){}
 
-  // expected-error@+3{{OpenACC sub-array is not allowed here}}
-  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
+  // expected-error@+2{{OpenACC sub-array is not allowed here}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(HasMem.MemArr[1:3].array[1]), seq
   for(;;){}
 
-  // expected-error@+3{{OpenACC sub-array is not allowed here}}
-  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
+  // expected-error@+2{{OpenACC sub-array is not allowed here}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(HasMem.MemArr[1:3].array[1:2]), seq
   for(;;){}
 
-  // expected-error@+3{{expected expression}}
-  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(HasMem.MemArr[:]), seq
   for(;;){}
 
-  // expected-error@+3{{expected expression}}
-  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
+  // expected-error@+2{{expected expression}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(HasMem.MemArr[::]), seq
   for(;;){}
 
-  // expected-error@+5{{expected expression}}
-  // expected-error@+4{{expected ']'}}
-  // expected-note@+3{{to match this '['}}
-  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
+  // expected-error@+4{{expected expression}}
+  // expected-error@+3{{expected ']'}}
+  // expected-note@+2{{to match this '['}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(HasMem.MemArr[: :]), seq
   for(;;){}
 
-  // expected-error@+3{{expected expression}}
-  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(HasMem.MemArr[3:]), seq
   for(;;){}
 
+  // expected-warning@+1{{OpenACC clause name 'pcopy' is a deprecated clause name and is now an alias for 'copy'}}
+#pragma acc serial pcopy(HasMem.MemArr[3:])
+  for(;;){}
+
+  // expected-warning@+1{{OpenACC clause name 'present_or_copy' is a deprecated clause name and is now an alias for 'copy'}}
+#pragma acc serial present_or_copy(HasMem.MemArr[3:])
+  for(;;){}
+
   // expected-error@+3{{expected ','}}
   // expected-warning@+2{{OpenACC clause 'use_device' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
@@ -536,48 +530,48 @@ void VarListClauses() {
 #pragma acc serial use_device(s.array[s.value : 5]), seq
   for(;;){}
 
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'no_create' not yet implemented, clause ignored}}
+  // expected-error@+2{{expected ','}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial no_create(s.array[s.value] s.array[s.value :5] ), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'no_create' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial no_create(s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'present' not yet implemented, clause ignored}}
+  // expected-error@+2{{expected ','}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial present(s.array[s.value] s.array[s.value :5] ), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'present' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial present(s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'deviceptr' not yet implemented, clause ignored}}
+
+  void *IsPointer;
+  // expected-error@+5{{expected ','}}
+  // expected-error@+4{{expected pointer in 'deviceptr' clause, type is 'char'}}
+  // expected-error@+3{{OpenACC sub-array is not allowed here}}
+  // expected-note@+2{{expected variable of pointer type}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial deviceptr(s.array[s.value] s.array[s.value :5] ), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'deviceptr' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
-#pragma acc serial deviceptr(s.array[s.value : 5], s.value), seq
+#pragma acc serial deviceptr(IsPointer), seq
   for(;;){}
 
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'attach' not yet implemented, clause ignored}}
+  // expected-error@+5{{expected ','}}
+  // expected-error@+4{{expected pointer in 'attach' clause, type is 'char'}}
+  // expected-error@+3{{OpenACC sub-array is not allowed here}}
+  // expected-note@+2{{expected variable of pointer type}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial attach(s.array[s.value] s.array[s.value :5] ), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'attach' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
-#pragma acc serial attach(s.array[s.value : 5], s.value), seq
+#pragma acc serial attach(IsPointer), seq
   for(;;){}
 
   // expected-error@+3{{expected ','}}
@@ -600,13 +594,11 @@ void VarListClauses() {
 #pragma acc serial private(s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'firstprivate' not yet implemented, clause ignored}}
+  // expected-error@+2{{expected ','}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial firstprivate(s.array[s.value] s.array[s.value :5] ), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'firstprivate' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial firstprivate(s.array[s.value : 5], s.value), seq
   for(;;){}
@@ -677,161 +669,158 @@ void VarListClauses() {
 #pragma acc serial device(s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'copyout' not yet implemented, clause ignored}}
+  // expected-error@+2{{expected ','}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyout(s.array[s.value] s.array[s.value :5] ), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'copyout' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyout(s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'copyout' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyout(zero:s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'copyout' not yet implemented, clause ignored}}
+  // expected-warning@+1{{OpenACC clause name 'pcopyout' is a deprecated clause name and is now an alias for 'copyout'}}
+#pragma acc serial pcopyout(s.array[s.value : 5], s.value)
+  for(;;){}
+
+  // expected-warning@+1{{OpenACC clause name 'present_or_copyout' is a deprecated clause name and is now an alias for 'copyout'}}
+#pragma acc serial present_or_copyout(zero:s.array[s.value : 5], s.value)
+  for(;;){}
+
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyout(zero : s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+4{{use of undeclared identifier 'zero'}}
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'copyout' not yet implemented, clause ignored}}
+  // expected-error@+3{{use of undeclared identifier 'zero'}}
+  // expected-error@+2{{expected ','}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyout(zero s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{invalid tag 'readonly' on 'copyout' clause}}
-  // expected-warning@+2{{OpenACC clause 'copyout' not yet implemented, clause ignored}}
+  // expected-error@+2{{invalid tag 'readonly' on 'copyout' clause}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyout(readonly:s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{invalid tag 'invalid' on 'copyout' clause}}
-  // expected-warning@+2{{OpenACC clause 'copyout' not yet implemented, clause ignored}}
+  // expected-error@+2{{invalid tag 'invalid' on 'copyout' clause}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyout(invalid:s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{invalid tag 'invalid' on 'copyout' clause}}
-  // expected-warning@+2{{OpenACC clause 'copyout' not yet implemented, clause ignored}}
+  // expected-error@+2{{invalid tag 'invalid' on 'copyout' clause}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyout(invalid:s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+4{{use of undeclared identifier 'invalid'}}
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'copyout' not yet implemented, clause ignored}}
+  // expected-error@+3{{use of undeclared identifier 'invalid'}}
+  // expected-error@+2{{expected ','}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyout(invalid s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'create' not yet implemented, clause ignored}}
+  // expected-error@+2{{expected ','}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial create(s.array[s.value] s.array[s.value :5] ), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'create' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial create(s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'create' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial create(zero:s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'create' not yet implemented, clause ignored}}
+  // expected-warning@+1{{OpenACC clause name 'pcreate' is a deprecated clause name and is now an alias for 'create'}}
+#pragma acc serial pcreate(s.array[s.value : 5], s.value)
+  for(;;){}
+
+  // expected-warning@+1{{OpenACC clause name 'present_or_create' is a deprecated clause name and is now an alias for 'create'}}
+#pragma acc serial present_or_create(zero:s.array[s.value : 5], s.value)
+  for(;;){}
+
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial create(zero : s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+4{{use of undeclared identifier 'zero'}}
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'create' not yet implemented, clause ignored}}
+  // expected-error@+3{{use of undeclared identifier 'zero'}}
+  // expected-error@+2{{expected ','}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial create(zero s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{invalid tag 'readonly' on 'create' clause}}
-  // expected-warning@+2{{OpenACC clause 'create' not yet implemented, clause ignored}}
+  // expected-error@+2{{invalid tag 'readonly' on 'create' clause}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial create(readonly:s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{invalid tag 'invalid' on 'create' clause}}
-  // expected-warning@+2{{OpenACC clause 'create' not yet implemented, clause ignored}}
+  // expected-error@+2{{invalid tag 'invalid' on 'create' clause}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial create(invalid:s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{invalid tag 'invalid' on 'create' clause}}
-  // expected-warning@+2{{OpenACC clause 'create' not yet implemented, clause ignored}}
+  // expected-error@+2{{invalid tag 'invalid' on 'create' clause}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial create(invalid:s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+4{{use of undeclared identifier 'invalid'}}
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'create' not yet implemented, clause ignored}}
+  // expected-error@+3{{use of undeclared identifier 'invalid'}}
+  // expected-error@+2{{expected ','}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial create(invalid s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'copyin' not yet implemented, clause ignored}}
+  // expected-error@+2{{expected ','}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyin(s.array[s.value] s.array[s.value :5] ), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'copyin' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyin(s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'copyin' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyin(readonly:s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'copyin' not yet implemented, clause ignored}}
+  // expected-warning@+1{{OpenACC clause name 'pcopyin' is a deprecated clause name and is now an alias for 'copyin'}}
+#pragma acc serial pcopyin(s.array[s.value : 5], s.value)
+  for(;;){}
+
+  // expected-warning@+1{{OpenACC clause name 'present_or_copyin' is a deprecated clause name and is now an alias for 'copyin'}}
+#pragma acc serial present_or_copyin(readonly:s.array[s.value : 5], s.value)
+  for(;;){}
+
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyin(readonly : s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+4{{use of undeclared identifier 'readonly'}}
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'copyin' not yet implemented, clause ignored}}
+  // expected-error@+3{{use of undeclared identifier 'readonly'}}
+  // expected-error@+2{{expected ','}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyin(readonly s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{invalid tag 'zero' on 'copyin' clause}}
-  // expected-warning@+2{{OpenACC clause 'copyin' not yet implemented, clause ignored}}
+  // expected-error@+2{{invalid tag 'zero' on 'copyin' clause}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyin(zero :s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{invalid tag 'invalid' on 'copyin' clause}}
-  // expected-warning@+2{{OpenACC clause 'copyin' not yet implemented, clause ignored}}
+  // expected-error@+2{{invalid tag 'invalid' on 'copyin' clause}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyin(invalid:s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{invalid tag 'invalid' on 'copyin' clause}}
-  // expected-warning@+2{{OpenACC clause 'copyin' not yet implemented, clause ignored}}
+  // expected-error@+2{{invalid tag 'invalid' on 'copyin' clause}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyin(invalid:s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+4{{use of undeclared identifier 'invalid'}}
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'copyin' not yet implemented, clause ignored}}
+  // expected-error@+3{{use of undeclared identifier 'invalid'}}
+  // expected-error@+2{{expected ','}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyin(invalid s.array[s.value : 5], s.value), seq
   for(;;){}
@@ -1244,7 +1233,6 @@ void device_type() {
 
 #define acc_async_sync -1
 void AsyncArgument() {
-  // expected-warning@+1{{OpenACC clause 'async' not yet implemented, clause ignored}}
 #pragma acc parallel async
   {}
 
@@ -1261,15 +1249,12 @@ void AsyncArgument() {
 #pragma acc parallel async(4, 3)
   {}
 
-  // expected-warning@+1{{OpenACC clause 'async' not yet implemented, clause ignored}}
 #pragma acc parallel async(returns_int())
   {}
 
-  // expected-warning@+1{{OpenACC clause 'async' not yet implemented, clause ignored}}
 #pragma acc parallel async(5)
   {}
 
-  // expected-warning@+1{{OpenACC clause 'async' not yet implemented, clause ignored}}
 #pragma acc parallel async(acc_async_sync)
   {}
 }
diff --git a/clang/test/ParserOpenACC/parse-clauses.cpp b/clang/test/ParserOpenACC/parse-clauses.cpp
index 8c1d64374799..702eb75ca890 100644
--- a/clang/test/ParserOpenACC/parse-clauses.cpp
+++ b/clang/test/ParserOpenACC/parse-clauses.cpp
@@ -18,13 +18,14 @@ void templ() {
 #pragma acc parallel vector_length(I)
   for(;;){}
 
-  // expected-warning@+1{{OpenACC clause 'async' not yet implemented, clause ignored}}
 #pragma acc parallel async(T::value)
   for(;;){}
 
-  // expected-warning@+1{{OpenACC clause 'async' not yet implemented, clause ignored}}
 #pragma acc parallel async(I)
   for(;;){}
+
+#pragma acc parallel async
+  for(;;){}
 }
 
 struct S {
diff --git a/clang/test/ParserOpenACC/parse-sub-array.cpp b/clang/test/ParserOpenACC/parse-sub-array.cpp
new file mode 100644
index 000000000000..c0d3f89159e8
--- /dev/null
+++ b/clang/test/ParserOpenACC/parse-sub-array.cpp
@@ -0,0 +1,89 @@
+// RUN: %clang_cc1 %s -verify -fopenacc
+
+void Func(int i, int j) {
+  int array[5];
+#pragma acc parallel private(array[:])
+  while (true);
+#pragma acc parallel private(array[i:])
+  while (true);
+#pragma acc parallel private(array[:j])
+  while (true);
+#pragma acc parallel private(array[i:j])
+  while (true);
+#pragma acc parallel private(array[1:2])
+  while (true);
+
+  // expected-error@+1{{expected unqualified-id}}
+#pragma acc parallel private(array[::])
+  while (true);
+  // expected-error@+2{{expected ']'}}
+  // expected-note@+1{{to match this '['}}
+#pragma acc parallel private(array[1::])
+  while (true);
+  // expected-error@+2{{expected ']'}}
+  // expected-note@+1{{to match this '['}}
+#pragma acc parallel private(array[:2:])
+  while (true);
+  // expected-error@+3{{expected unqualified-id}}
+  // expected-error@+2{{expected ']'}}
+  // expected-note@+1{{to match this '['}}
+#pragma acc parallel private(array[::3])
+  while (true);
+  // expected-error@+2{{expected ']'}}
+  // expected-note@+1{{to match this '['}}
+#pragma acc parallel private(array[1:2:3])
+  while (true);
+}
+
+template<typename T, unsigned I, auto &IPtr>// #IPTR
+void TemplFunc() {
+  T array[I];
+  T array2[2*I];
+  T t; // #tDecl
+#pragma acc parallel private(array[:])
+  while (true);
+#pragma acc parallel private(array[t:])
+  while (true);
+#pragma acc parallel private(array[I-1:])
+  while (true);
+#pragma acc parallel private(array[IPtr:])
+  while (true);
+#pragma acc parallel private(array[:t])
+  while (true);
+#pragma acc parallel private(array[:I])
+  while (true);
+#pragma acc parallel private(array[:IPtr])
+  while (true);
+#pragma acc parallel private(array[t:t])
+  while (true);
+#pragma acc parallel private(array2[I:I])
+  while (true);
+#pragma acc parallel private(array[IPtr:IPtr])
+  while (true);
+
+  // expected-error@+1{{expected unqualified-id}}
+#pragma acc parallel private(array[::])
+  while (true);
+  // expected-error@+3{{'t' is not a class, namespace, or enumeration}}
+  // expected-note@#tDecl{{'t' declared here}}
+  // expected-error@+1{{expected unqualified-id}}
+#pragma acc parallel private(array[t::])
+  while (true);
+  // expected-error@+2{{expected ']'}}
+  // expected-note@+1{{to match this '['}}
+#pragma acc parallel private(array[:I:])
+  while (true);
+  // expected-error@+2{{no member named 'IPtr' in the global namespace}}
+  // expected-note@#IPTR{{'IPtr' declared here}}
+#pragma acc parallel private(array[::IPtr])
+  while (true);
+  // expected-error@+2{{expected ']'}}
+  // expected-note@+1{{to match this '['}}
+#pragma acc parallel private(array[IPtr:I:t])
+  while (true);
+}
+
+void use() {
+  static constexpr int SomeI = 1;
+  TemplFunc<int, 5, SomeI>();
+}
diff --git a/clang/test/ParserOpenACC/parse-wait-clause.c b/clang/test/ParserOpenACC/parse-wait-clause.c
index f3e651de4583..9c7faa5c02eb 100644
--- a/clang/test/ParserOpenACC/parse-wait-clause.c
+++ b/clang/test/ParserOpenACC/parse-wait-clause.c
@@ -3,12 +3,10 @@
 void func() {
   int i, j;
 
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
   #pragma acc parallel wait
   {}
 
-  // expected-error@+2{{invalid OpenACC clause 'clause'}}
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
+  // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc parallel wait clause-list
   {}
 
@@ -17,12 +15,10 @@ void func() {
   #pragma acc parallel wait (
       {}
 
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
   #pragma acc parallel wait ()
       {}
 
-  // expected-error@+2{{invalid OpenACC clause 'clause'}}
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
+  // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc parallel wait () clause-list
       {}
 
@@ -61,12 +57,10 @@ void func() {
   #pragma acc parallel wait (queues:
     {}
 
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
   #pragma acc parallel wait (queues:)
     {}
 
-  // expected-error@+2{{invalid OpenACC clause 'clause'}}
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
+  // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc parallel wait (queues:) clause-list
     {}
 
@@ -75,40 +69,42 @@ void func() {
   #pragma acc parallel wait (devnum: i + j:queues:
     {}
 
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
   #pragma acc parallel wait (devnum: i + j:queues:)
     {}
 
-  // expected-error@+2{{invalid OpenACC clause 'clause'}}
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
+  // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc parallel wait (devnum: i + j:queues:) clause-list
     {}
 
-  // expected-error@+3{{use of undeclared identifier 'devnum'}}
+  // expected-error@+4{{use of undeclared identifier 'devnum'}}
+  // expected-error@+3{{expected ','}}
   // expected-error@+2{{expected ')'}}
   // expected-note@+1{{to match this '('}}
   #pragma acc parallel wait (queues:devnum: i + j
     {}
 
+  // expected-error@+2{{expected ','}}
   // expected-error@+1{{use of undeclared identifier 'devnum'}}
   #pragma acc parallel wait (queues:devnum: i + j)
     {}
 
+  // expected-error@+3{{expected ','}}
   // expected-error@+2{{use of undeclared identifier 'devnum'}}
   // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc parallel wait (queues:devnum: i + j) clause-list
     {}
 
+  // expected-error@+3{{OpenACC clause 'wait' requires expression of integer type ('double' invalid)}}
   // expected-error@+2{{expected ')'}}
   // expected-note@+1{{to match this '('}}
   #pragma acc parallel wait(i, j, 1+1, 3.3
     {}
 
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
+  // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('double' invalid)}}
   #pragma acc parallel wait(i, j, 1+1, 3.3)
     {}
-  // expected-error@+2{{invalid OpenACC clause 'clause'}}
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
+  // expected-error@+2{{OpenACC clause 'wait' requires expression of integer type ('double' invalid)}}
+  // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc parallel wait(i, j, 1+1, 3.3) clause-list
     {}
 
@@ -127,47 +123,51 @@ void func() {
   #pragma acc parallel wait(,) clause-list
     {}
 
+  // expected-error@+3{{OpenACC clause 'wait' requires expression of integer type ('double' invalid)}}
   // expected-error@+2{{expected ')'}}
   // expected-note@+1{{to match this '('}}
   #pragma acc parallel wait(queues:i, j, 1+1, 3.3
     {}
 
+  // expected-error@+4{{OpenACC clause 'wait' requires expression of integer type ('double' invalid)}}
   // expected-error@+3{{expected expression}}
   // expected-error@+2{{expected ')'}}
   // expected-note@+1{{to match this '('}}
   #pragma acc parallel wait(queues:i, j, 1+1, 3.3,
     {}
 
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
+  // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('double' invalid)}}
   #pragma acc parallel wait(queues:i, j, 1+1, 3.3)
     {}
 
-  // expected-error@+2{{invalid OpenACC clause 'clause'}}
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
+  // expected-error@+2{{OpenACC clause 'wait' requires expression of integer type ('double' invalid)}}
+  // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc parallel wait(queues:i, j, 1+1, 3.3) clause-list
     {}
 
+  // expected-error@+3{{OpenACC clause 'wait' requires expression of integer type ('double' invalid)}}
   // expected-error@+2{{expected ')'}}
   // expected-note@+1{{to match this '('}}
   #pragma acc parallel wait(devnum:3:i, j, 1+1, 3.3
     {}
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
+  // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('double' invalid)}}
   #pragma acc parallel wait(devnum:3:i, j, 1+1, 3.3)
     {}
-  // expected-error@+2{{invalid OpenACC clause 'clause'}}
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
+  // expected-error@+2{{OpenACC clause 'wait' requires expression of integer type ('double' invalid)}}
+  // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc parallel wait(devnum:3:i, j, 1+1, 3.3) clause-list
     {}
 
+  // expected-error@+3{{OpenACC clause 'wait' requires expression of integer type ('double' invalid)}}
   // expected-error@+2{{expected ')'}}
   // expected-note@+1{{to match this '('}}
   #pragma acc parallel wait(devnum:3:queues:i, j, 1+1, 3.3
     {}
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
+  // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('double' invalid)}}
   #pragma acc parallel wait(devnum:3:queues:i, j, 1+1, 3.3)
     {}
-  // expected-error@+2{{invalid OpenACC clause 'clause'}}
-  // expected-warning@+1{{OpenACC clause 'wait' not yet implemented, clause ignored}}
+  // expected-error@+2{{OpenACC clause 'wait' requires expression of integer type ('double' invalid)}}
+  // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc parallel wait(devnum:3:queues:i, j, 1+1, 3.3) clause-list
     {}
 }
diff --git a/clang/test/ParserOpenACC/parse-wait-construct.c b/clang/test/ParserOpenACC/parse-wait-construct.c
index 30a9fc8c12a4..8f7ea8efd576 100644
--- a/clang/test/ParserOpenACC/parse-wait-construct.c
+++ b/clang/test/ParserOpenACC/parse-wait-construct.c
@@ -76,28 +76,34 @@ void func() {
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait (devnum: i + j:queues:) clause-list
 
-  // expected-error@+4{{use of undeclared identifier 'devnum'}}
+  // expected-error@+5{{use of undeclared identifier 'devnum'}}
+  // expected-error@+4{{expected ','}}
   // expected-error@+3{{expected ')'}}
   // expected-note@+2{{to match this '('}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait (queues:devnum: i + j
 
-  // expected-error@+2{{use of undeclared identifier 'devnum'}}
+  // expected-error@+3{{use of undeclared identifier 'devnum'}}
+  // expected-error@+2{{expected ','}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait (queues:devnum: i + j)
 
-  // expected-error@+3{{use of undeclared identifier 'devnum'}}
+  // expected-error@+4{{use of undeclared identifier 'devnum'}}
+  // expected-error@+3{{expected ','}}
   // expected-error@+2{{invalid OpenACC clause 'clause'}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait (queues:devnum: i + j) clause-list
 
+  // expected-error@+4{{OpenACC directive 'wait' requires expression of integer type ('double' invalid)}}
   // expected-error@+3{{expected ')'}}
   // expected-note@+2{{to match this '('}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait(i, j, 1+1, 3.3
 
+  // expected-error@+2{{OpenACC directive 'wait' requires expression of integer type ('double' invalid)}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait(i, j, 1+1, 3.3)
+  // expected-error@+3{{OpenACC directive 'wait' requires expression of integer type ('double' invalid)}}
   // expected-error@+2{{invalid OpenACC clause 'clause'}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait(i, j, 1+1, 3.3) clause-list
@@ -117,40 +123,50 @@ void func() {
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait(,) clause-list
 
+  // expected-error@+4{{OpenACC directive 'wait' requires expression of integer type ('double' invalid)}}
   // expected-error@+3{{expected ')'}}
   // expected-note@+2{{to match this '('}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait(queues:i, j, 1+1, 3.3
 
+  // expected-error@+5{{OpenACC directive 'wait' requires expression of integer type ('double' invalid)}}
   // expected-error@+4{{expected expression}}
   // expected-error@+3{{expected ')'}}
   // expected-note@+2{{to match this '('}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait(queues:i, j, 1+1, 3.3,
 
+  // expected-error@+2{{OpenACC directive 'wait' requires expression of integer type ('double' invalid)}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait(queues:i, j, 1+1, 3.3)
 
+  // expected-error@+3{{OpenACC directive 'wait' requires expression of integer type ('double' invalid)}}
   // expected-error@+2{{invalid OpenACC clause 'clause'}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait(queues:i, j, 1+1, 3.3) clause-list
 
+  // expected-error@+4{{OpenACC directive 'wait' requires expression of integer type ('double' invalid)}}
   // expected-error@+3{{expected ')'}}
   // expected-note@+2{{to match this '('}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait(devnum:3:i, j, 1+1, 3.3
+  // expected-error@+2{{OpenACC directive 'wait' requires expression of integer type ('double' invalid)}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait(devnum:3:i, j, 1+1, 3.3)
+  // expected-error@+3{{OpenACC directive 'wait' requires expression of integer type ('double' invalid)}}
   // expected-error@+2{{invalid OpenACC clause 'clause'}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait(devnum:3:i, j, 1+1, 3.3) clause-list
 
+  // expected-error@+4{{OpenACC directive 'wait' requires expression of integer type ('double' invalid)}}
   // expected-error@+3{{expected ')'}}
   // expected-note@+2{{to match this '('}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait(devnum:3:queues:i, j, 1+1, 3.3
+  // expected-error@+2{{OpenACC directive 'wait' requires expression of integer type ('double' invalid)}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait(devnum:3:queues:i, j, 1+1, 3.3)
+  // expected-error@+3{{OpenACC directive 'wait' requires expression of integer type ('double' invalid)}}
   // expected-error@+2{{invalid OpenACC clause 'clause'}}
   // expected-warning@+1{{OpenACC construct 'wait' not yet implemented, pragma ignored}}
   #pragma acc wait(devnum:3:queues:i, j, 1+1, 3.3) clause-list
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index 4d10eeafa884..82304a15a04a 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -616,6 +616,9 @@
 // ================== Check Armv9.5-A Pointer Authentication Enhancements(PAuth_LR).
 // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-LR-OFF %s
 // RUN: %clang -target arm64-none-linux-gnu -march=armv9.5-a -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-LR-OFF %s
+// RUN: %clang -target arm64-none-linux-gnu -march=armv9.5-a -mbranch-protection=standard -x c -E -dM %s -o - | FileCheck -check-prefixes=CHECK-PAUTH-LR-OFF,CHECK-BRANCH-PROTECTION-NO-PC %s
+// RUN: %clang -target arm64-none-linux-gnu -march=armv9.5-a+pauth-lr -mbranch-protection=standard -x c -E -dM %s -o - | FileCheck -check-prefixes=CHECK-PAUTH-LR,CHECK-BRANCH-PROTECTION-PC %s
+// RUN: %clang -target arm64-none-linux-gnu -march=armv9.5-a+nopauth-lr -mbranch-protection=standard -x c -E -dM %s -o - | FileCheck -check-prefixes=CHECK-PAUTH-LR-OFF,CHECK-BRANCH-PROTECTION-NO-PC %s
 // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a+pauth -mbranch-protection=none -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-LR-OFF %s
 // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a+pauth-lr -mbranch-protection=none -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-LR %s
 // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a+pauth-lr -mbranch-protection=bti -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-PAUTH-LR %s
@@ -636,6 +639,7 @@
 // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a+pauth-lr -mbranch-protection=pac-ret+pc+b-key -x c -E -dM %s -o - | FileCheck -check-prefixes=CHECK-PAUTH-LR,CHECK-BRANCH-PROTECTION-PC-BKEY %s
 // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a+pauth-lr -mbranch-protection=pac-ret+pc+leaf -x c -E -dM %s -o - | FileCheck -check-prefixes=CHECK-PAUTH-LR,CHECK-BRANCH-PROTECTION-PC-LEAF %s
 // RUN: %clang -target arm64-none-linux-gnu -march=armv8-a+pauth-lr -mbranch-protection=pac-ret+pc+leaf+b-key -x c -E -dM %s -o - | FileCheck -check-prefixes=CHECK-PAUTH-LR,CHECK-BRANCH-PROTECTION-PC-LEAF-BKEY %s
+// CHECK-BRANCH-PROTECTION-NO-PC:        #define __ARM_FEATURE_PAC_DEFAULT 1
 // CHECK-BRANCH-PROTECTION-PC:           #define __ARM_FEATURE_PAC_DEFAULT 9
 // CHECK-BRANCH-PROTECTION-PC-BKEY:      #define __ARM_FEATURE_PAC_DEFAULT 10
 // CHECK-BRANCH-PROTECTION-PC-LEAF:      #define __ARM_FEATURE_PAC_DEFAULT 13
diff --git a/clang/test/Preprocessor/arm-target-features.c b/clang/test/Preprocessor/arm-target-features.c
index 236c9f2479b7..2d65bfd4f439 100644
--- a/clang/test/Preprocessor/arm-target-features.c
+++ b/clang/test/Preprocessor/arm-target-features.c
@@ -88,8 +88,8 @@
 // CHECK-V8R: #define __ARM_FEATURE_NUMERIC_MAXMIN 1
 // CHECK-V8R-NOT: #define __ARM_FP 0x
 
-// RUN: %clang -target armv8r-none-linux-gnueabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V8R-ALLOW-FP-INSTR %s
-// RUN: %clang -target armv8r-none-linux-gnueabihf -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V8R-ALLOW-FP-INSTR %s
+// RUN: %clang -target armv8r-none-linux-gnueabi -mcpu=cortex-r52 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V8R-ALLOW-FP-INSTR %s
+// RUN: %clang -target armv8r-none-linux-gnueabihf -mcpu=cortex-r52 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V8R-ALLOW-FP-INSTR %s
 // CHECK-V8R-ALLOW-FP-INSTR: #define __ARMEL__ 1
 // CHECK-V8R-ALLOW-FP-INSTR: #define __ARM_ARCH 8
 // CHECK-V8R-ALLOW-FP-INSTR: #define __ARM_ARCH_8R__ 1
diff --git a/clang/test/Preprocessor/macro_vaopt_check.cpp b/clang/test/Preprocessor/macro_vaopt_check.cpp
index 28f92fc17c6c..f318a08327e7 100644
--- a/clang/test/Preprocessor/macro_vaopt_check.cpp
+++ b/clang/test/Preprocessor/macro_vaopt_check.cpp
@@ -1,76 +1,76 @@
-// RUN: %clang_cc1 %s -Eonly -verify -Wno-all -Wno-c++23-extensions -pedantic -std=c++20
-// RUN: %clang_cc1 %s -Eonly -verify -Wno-all -Wno-c++23-extensions -pedantic -std=c++11
-// RUN: %clang_cc1 -x c %s -Eonly -verify -Wno-all -Wno-c2x-extensions -pedantic -std=c99
-
-//expected-error@+1{{missing '('}}
-#define V1(...) __VA_OPT__  
-#undef V1
-// OK
-#define V1(...) __VA_OPT__  ()
-#undef V1 
-
-//expected-warning@+1{{can only appear in the expansion of a variadic macro}}
-#define V2() __VA_OPT__(x) 
-#undef V2
-
-//expected-error@+2{{missing ')' after}}
-//expected-note@+1{{to match this '('}}
-#define V3(...) __VA_OPT__(
-#undef V3
-
-#define V4(...) __VA_OPT__(__VA_ARGS__)
-#undef V4
-
-//expected-error@+1{{nested}}
-#define V5(...) __VA_OPT__(__VA_OPT__())
-#undef V5
-
-//expected-error@+1{{not followed by}}
-#define V1(...) __VA_OPT__  (#)
-#undef V1
-
-//expected-error@+1{{cannot appear at start}}
-#define V1(...) __VA_OPT__  (##)
-#undef V1
-
-//expected-error@+1{{cannot appear at start}}
-#define V1(...) __VA_OPT__  (## X) x
-#undef V1
-
-//expected-error@+1{{cannot appear at end}}
-#define V1(...) y __VA_OPT__  (X ##)
-#undef V1
-                            
-
-#define FOO(x,...) # __VA_OPT__(x) #x #__VA_OPT__(__VA_ARGS__) //OK
-
-//expected-error@+1{{not followed by a macro parameter}}
-#define V1(...) __VA_OPT__(#)
-#undef V1
-
-//expected-error@+1{{cannot appear at start}}
-#define V1(...) a __VA_OPT__(##) b
-#undef V1
-
-//expected-error@+1{{cannot appear at start}}
-#define V1(...) a __VA_OPT__(a ## b) b __VA_OPT__(##)
-#undef V1
-
-#define V1(x,...) # __VA_OPT__(b x) // OK
-#undef V1
-
-//expected-error@+2{{missing ')' after}}
-//expected-note@+1{{to match this '('}}
-#define V1(...) __VA_OPT__  ((())
-#undef V1
-
-// __VA_OPT__ can't appear anywhere else.
-#if __VA_OPT__ // expected-warning {{__VA_OPT__ can only appear in the expansion of a variadic macro}}
-#endif
-
-// expected-warning@+2 {{__VA_OPT__ can only appear in the expansion of a variadic macro}}
-#ifdef __VA_OPT__ // expected-warning {{__VA_OPT__ can only appear in the expansion of a variadic macro}}
-#elifdef __VA_OPT__
-#endif
-
-#define BAD __VA_OPT__ // expected-warning {{__VA_OPT__ can only appear in the expansion of a variadic macro}}
+// RUN: %clang_cc1 %s -Eonly -verify -Wno-all -Wno-c++23-extensions -pedantic -std=c++20
+// RUN: %clang_cc1 %s -Eonly -verify -Wno-all -Wno-c++23-extensions -pedantic -std=c++11
+// RUN: %clang_cc1 -x c %s -Eonly -verify -Wno-all -Wno-c2x-extensions -pedantic -std=c99
+
+//expected-error@+1{{missing '('}}
+#define V1(...) __VA_OPT__
+#undef V1
+// OK
+#define V1(...) __VA_OPT__  ()
+#undef V1
+
+//expected-warning@+1{{can only appear in the expansion of a variadic macro}}
+#define V2() __VA_OPT__(x)
+#undef V2
+
+//expected-error@+2{{missing ')' after}}
+//expected-note@+1{{to match this '('}}
+#define V3(...) __VA_OPT__(
+#undef V3
+
+#define V4(...) __VA_OPT__(__VA_ARGS__)
+#undef V4
+
+//expected-error@+1{{nested}}
+#define V5(...) __VA_OPT__(__VA_OPT__())
+#undef V5
+
+//expected-error@+1{{not followed by}}
+#define V1(...) __VA_OPT__  (#)
+#undef V1
+
+//expected-error@+1{{cannot appear at start}}
+#define V1(...) __VA_OPT__  (##)
+#undef V1
+
+//expected-error@+1{{cannot appear at start}}
+#define V1(...) __VA_OPT__  (## X) x
+#undef V1
+
+//expected-error@+1{{cannot appear at end}}
+#define V1(...) y __VA_OPT__  (X ##)
+#undef V1
+
+
+#define FOO(x,...) # __VA_OPT__(x) #x #__VA_OPT__(__VA_ARGS__) //OK
+
+//expected-error@+1{{not followed by a macro parameter}}
+#define V1(...) __VA_OPT__(#)
+#undef V1
+
+//expected-error@+1{{cannot appear at start}}
+#define V1(...) a __VA_OPT__(##) b
+#undef V1
+
+//expected-error@+1{{cannot appear at start}}
+#define V1(...) a __VA_OPT__(a ## b) b __VA_OPT__(##)
+#undef V1
+
+#define V1(x,...) # __VA_OPT__(b x) // OK
+#undef V1
+
+//expected-error@+2{{missing ')' after}}
+//expected-note@+1{{to match this '('}}
+#define V1(...) __VA_OPT__  ((())
+#undef V1
+
+// __VA_OPT__ can't appear anywhere else.
+#if __VA_OPT__ // expected-warning {{__VA_OPT__ can only appear in the expansion of a variadic macro}}
+#endif
+
+// expected-warning@+2 {{__VA_OPT__ can only appear in the expansion of a variadic macro}}
+#ifdef __VA_OPT__ // expected-warning {{__VA_OPT__ can only appear in the expansion of a variadic macro}}
+#elifdef __VA_OPT__
+#endif
+
+#define BAD __VA_OPT__ // expected-warning {{__VA_OPT__ can only appear in the expansion of a variadic macro}}
diff --git a/clang/test/Preprocessor/macro_vaopt_expand.cpp b/clang/test/Preprocessor/macro_vaopt_expand.cpp
index 5eb0facb83f7..b34a28288cd8 100644
--- a/clang/test/Preprocessor/macro_vaopt_expand.cpp
+++ b/clang/test/Preprocessor/macro_vaopt_expand.cpp
@@ -1,150 +1,150 @@
-// RUN: %clang_cc1 -E %s -pedantic -std=c++20 | FileCheck -strict-whitespace %s
-// RUN: %clang_cc1 -E %s -pedantic -std=c++11 | FileCheck -strict-whitespace %s
-// RUN: %clang_cc1 -E -x c %s -pedantic -std=c99 | FileCheck -strict-whitespace %s
-
-#define LPAREN ( 
-#define RPAREN ) 
-
-#define A0 expandedA0
-#define A1  expandedA1 A0
-#define A2  expandedA2 A1
-#define A3  expandedA3 A2
-
-#define A() B LPAREN )
-#define B() C LPAREN )
-#define C() D LPAREN )
-
-
-#define F(x, y) x + y 
-#define ELLIP_FUNC(...) __VA_OPT__(__VA_ARGS__)
-
-1: ELLIP_FUNC(F, LPAREN, 'a', 'b', RPAREN); 
-2: ELLIP_FUNC(F LPAREN 'a', 'b' RPAREN); 
-#undef F
-#undef ELLIP_FUNC
-
-// CHECK: 1: F, (, 'a', 'b', );
-// CHECK: 2: 'a' + 'b';
-
-#define F(...) f(0 __VA_OPT__(,) __VA_ARGS__)
-3: F(a, b, c) // replaced by f(0, a, b, c) 
-4: F() // replaced by f(0)
-
-// CHECK: 3: f(0 , a, b, c) 
-// CHECK: 4: f(0 )
-#undef F
-
-#define G(X, ...) f(0, X __VA_OPT__(,) __VA_ARGS__)
-
-5: G(a, b, c) // replaced by f(0, a , b, c) 
-6: G(a) // replaced by f(0, a) 
-7: G(a,) // replaced by f(0, a) 
-7.1: G(a,,)
-
-
-// CHECK: 5: f(0, a , b, c) 
-// CHECK: 6: f(0, a ) 
-// CHECK: 7: f(0, a ) 
-// CHECK: 7.1: f(0, a , ,)
-#undef G 
-
-#define HT_B() TONG
-
-#define F(x, ...) HT_ ## __VA_OPT__(x x A()  #x)
-
-8: F(1)
-9: F(A(),1)
-
-// CHECK: 8: HT_
-// CHECK: 9: TONG C ( ) B ( ) "A()"
-#undef HT_B
-#undef F
-
-#define F(a,...) #__VA_OPT__(A1 a)
-
-10: F(A())
-11: F(A1 A(), 1)
-// CHECK: 10: ""
-// CHECK: 11: "A1 expandedA1 expandedA0 B ( )"
-#undef F
-
-
-#define F(a,...) a ## __VA_OPT__(A1 a) ## __VA_ARGS__ ## a
-12.0: F()
-12: F(,)
-13: F(B,)
-// CHECK: 12.0: 
-// CHECK: 12: 
-// CHECK: 13: BB 
-#undef F
-
-#define F(...) #__VA_OPT__()  X ## __VA_OPT__()  #__VA_OPT__(        )
-
-14: F()
-15: F(1)
-
-// CHECK: 14: "" X ""
-// CHECK: 15: "" X ""
-
-#undef F
-
-#define SDEF(sname, ...) S sname __VA_OPT__(= { __VA_ARGS__ })
-
-16: SDEF(foo); // replaced by S foo; 
-17: SDEF(bar, 1, 2); // replaced by S bar = { 1, 2 }; 
-
-// CHECK: 16: S foo ;
-// CHECK: 17: S bar = { 1, 2 }; 
-#undef SDEF
-
-#define F(a,...) A() #__VA_OPT__(A3 __VA_ARGS__ a ## __VA_ARGS__ ## a ## C A3) A()
-
-18: F()
-19: F(,)
-20: F(,A3)
-21: F(A3, A(),A0)
-
-
-// CHECK: 18: B ( ) "" B ( ) 
-// CHECK: 19: B ( ) "" B ( ) 
-// CHECK: 20: B ( ) "A3 expandedA3 expandedA2 expandedA1 expandedA0 A3C A3" B ( )
-// CHECK: 21: B ( ) "A3 B ( ),expandedA0 A3A(),A0A3C A3" B ( )
-
-#undef F
-
-#define F(a,...) A() #__VA_OPT__(A3 __VA_ARGS__ a ## __VA_ARGS__ ## a ## C A3) a __VA_OPT__(A0 __VA_ARGS__ a ## __VA_ARGS__ ## a ## C A0) A()
-
-22: F()
-23: F(,)
-24: F(,A0)
-25: F(A0, A(),A0)
-
-
-// CHECK: 22: B ( ) "" B ( ) 
-// CHECK: 23: B ( ) "" B ( ) 
-// CHECK: 24: B ( ) "A3 expandedA0 A0C A3" expandedA0 expandedA0 A0C expandedA0 B ( )
-// CHECK: 25: B ( ) "A3 B ( ),expandedA0 A0A(),A0A0C A3" expandedA0 expandedA0 C ( ),expandedA0 A0A(),A0A0C expandedA0 B ( )
-
-#undef F
-
-#define F(a,...)  __VA_OPT__(B a ## a) ## 1
-#define G(a,...)  __VA_OPT__(B a) ## 1
-26: F(,1)
-26_1: G(,1)
-// CHECK: 26: B 1
-// CHECK: 26_1: B 1
-#undef F
-#undef G
-
-#define F(a,...)  B ## __VA_OPT__(a 1) ## 1
-#define G(a,...)  B ## __VA_OPT__(a ## a 1) ## 1
-
-27: F(,1)
-27_1: F(A0,1)
-28: G(,1)
-// CHECK: 27: B 11
-// CHECK: 27_1: BexpandedA0 11
-// CHECK: 28: B 11
-
-#undef F
-#undef G
+// RUN: %clang_cc1 -E %s -pedantic -std=c++20 | FileCheck -strict-whitespace %s
+// RUN: %clang_cc1 -E %s -pedantic -std=c++11 | FileCheck -strict-whitespace %s
+// RUN: %clang_cc1 -E -x c %s -pedantic -std=c99 | FileCheck -strict-whitespace %s
+
+#define LPAREN (
+#define RPAREN )
+
+#define A0 expandedA0
+#define A1  expandedA1 A0
+#define A2  expandedA2 A1
+#define A3  expandedA3 A2
+
+#define A() B LPAREN )
+#define B() C LPAREN )
+#define C() D LPAREN )
+
+
+#define F(x, y) x + y
+#define ELLIP_FUNC(...) __VA_OPT__(__VA_ARGS__)
+
+1: ELLIP_FUNC(F, LPAREN, 'a', 'b', RPAREN);
+2: ELLIP_FUNC(F LPAREN 'a', 'b' RPAREN);
+#undef F
+#undef ELLIP_FUNC
+
+// CHECK: 1: F, (, 'a', 'b', );
+// CHECK: 2: 'a' + 'b';
+
+#define F(...) f(0 __VA_OPT__(,) __VA_ARGS__)
+3: F(a, b, c) // replaced by f(0, a, b, c)
+4: F() // replaced by f(0)
+
+// CHECK: 3: f(0 , a, b, c)
+// CHECK: 4: f(0 )
+#undef F
+
+#define G(X, ...) f(0, X __VA_OPT__(,) __VA_ARGS__)
+
+5: G(a, b, c) // replaced by f(0, a , b, c)
+6: G(a) // replaced by f(0, a)
+7: G(a,) // replaced by f(0, a)
+7.1: G(a,,)
+
+
+// CHECK: 5: f(0, a , b, c)
+// CHECK: 6: f(0, a )
+// CHECK: 7: f(0, a )
+// CHECK: 7.1: f(0, a , ,)
+#undef G
+
+#define HT_B() TONG
+
+#define F(x, ...) HT_ ## __VA_OPT__(x x A()  #x)
+
+8: F(1)
+9: F(A(),1)
+
+// CHECK: 8: HT_
+// CHECK: 9: TONG C ( ) B ( ) "A()"
+#undef HT_B
+#undef F
+
+#define F(a,...) #__VA_OPT__(A1 a)
+
+10: F(A())
+11: F(A1 A(), 1)
+// CHECK: 10: ""
+// CHECK: 11: "A1 expandedA1 expandedA0 B ( )"
+#undef F
+
+
+#define F(a,...) a ## __VA_OPT__(A1 a) ## __VA_ARGS__ ## a
+12.0: F()
+12: F(,)
+13: F(B,)
+// CHECK: 12.0:
+// CHECK: 12:
+// CHECK: 13: BB
+#undef F
+
+#define F(...) #__VA_OPT__()  X ## __VA_OPT__()  #__VA_OPT__(        )
+
+14: F()
+15: F(1)
+
+// CHECK: 14: "" X ""
+// CHECK: 15: "" X ""
+
+#undef F
+
+#define SDEF(sname, ...) S sname __VA_OPT__(= { __VA_ARGS__ })
+
+16: SDEF(foo); // replaced by S foo;
+17: SDEF(bar, 1, 2); // replaced by S bar = { 1, 2 };
+
+// CHECK: 16: S foo ;
+// CHECK: 17: S bar = { 1, 2 };
+#undef SDEF
+
+#define F(a,...) A() #__VA_OPT__(A3 __VA_ARGS__ a ## __VA_ARGS__ ## a ## C A3) A()
+
+18: F()
+19: F(,)
+20: F(,A3)
+21: F(A3, A(),A0)
+
+
+// CHECK: 18: B ( ) "" B ( )
+// CHECK: 19: B ( ) "" B ( )
+// CHECK: 20: B ( ) "A3 expandedA3 expandedA2 expandedA1 expandedA0 A3C A3" B ( )
+// CHECK: 21: B ( ) "A3 B ( ),expandedA0 A3A(),A0A3C A3" B ( )
+
+#undef F
+
+#define F(a,...) A() #__VA_OPT__(A3 __VA_ARGS__ a ## __VA_ARGS__ ## a ## C A3) a __VA_OPT__(A0 __VA_ARGS__ a ## __VA_ARGS__ ## a ## C A0) A()
+
+22: F()
+23: F(,)
+24: F(,A0)
+25: F(A0, A(),A0)
+
+
+// CHECK: 22: B ( ) "" B ( )
+// CHECK: 23: B ( ) "" B ( )
+// CHECK: 24: B ( ) "A3 expandedA0 A0C A3" expandedA0 expandedA0 A0C expandedA0 B ( )
+// CHECK: 25: B ( ) "A3 B ( ),expandedA0 A0A(),A0A0C A3" expandedA0 expandedA0 C ( ),expandedA0 A0A(),A0A0C expandedA0 B ( )
+
+#undef F
+
+#define F(a,...)  __VA_OPT__(B a ## a) ## 1
+#define G(a,...)  __VA_OPT__(B a) ## 1
+26: F(,1)
+26_1: G(,1)
+// CHECK: 26: B 1
+// CHECK: 26_1: B 1
+#undef F
+#undef G
+
+#define F(a,...)  B ## __VA_OPT__(a 1) ## 1
+#define G(a,...)  B ## __VA_OPT__(a ## a 1) ## 1
+
+27: F(,1)
+27_1: F(A0,1)
+28: G(,1)
+// CHECK: 27: B 11
+// CHECK: 27_1: BexpandedA0 11
+// CHECK: 28: B 11
+
+#undef F
+#undef G
diff --git a/clang/test/Preprocessor/print-pragma-microsoft.c b/clang/test/Preprocessor/print-pragma-microsoft.c
index 5c4fb4ffc965..d121d93f0d85 100644
--- a/clang/test/Preprocessor/print-pragma-microsoft.c
+++ b/clang/test/Preprocessor/print-pragma-microsoft.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -fsyntax-only -fms-extensions -E -o - | FileCheck %s
+// RUN: %clang_cc1 %s -fms-extensions -E -o - | FileCheck %s
 
 #define BAR "2"
 #pragma comment(linker, "bar=" BAR)
diff --git a/clang/test/Preprocessor/ptrauth_feature.c b/clang/test/Preprocessor/ptrauth_feature.c
index e45c6ea90fd1..80e239110ffc 100644
--- a/clang/test/Preprocessor/ptrauth_feature.c
+++ b/clang/test/Preprocessor/ptrauth_feature.c
@@ -1,5 +1,59 @@
-// RUN: %clang_cc1 %s -E -triple=arm64-- | FileCheck %s --check-prefixes=NOINTRIN
-// RUN: %clang_cc1 %s -E -triple=arm64-- -fptrauth-intrinsics | FileCheck %s --check-prefixes=INTRIN
+// RUN: %clang_cc1 -E %s -triple=aarch64 \
+// RUN:   -fptrauth-intrinsics \
+// RUN:   -fptrauth-calls \
+// RUN:   -fptrauth-returns \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-init-fini | \
+// RUN:   FileCheck %s --check-prefixes=INTRIN,CALLS,RETS,VPTR_ADDR_DISCR,VPTR_TYPE_DISCR,INITFINI
+
+// RUN: %clang_cc1 -E %s -triple=aarch64 \
+// RUN:   -fptrauth-calls \
+// RUN:   -fptrauth-returns \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-init-fini | \
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN,CALLS,RETS,VPTR_ADDR_DISCR,VPTR_TYPE_DISCR,INITFINI
+
+// RUN: %clang_cc1 -E %s -triple=aarch64 \
+// RUN:   -fptrauth-intrinsics \
+// RUN:   -fptrauth-returns \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-init-fini | \
+// RUN:   FileCheck %s --check-prefixes=INTRIN,NOCALLS,RETS,VPTR_ADDR_DISCR,VPTR_TYPE_DISCR,INITFINI
+
+// RUN: %clang_cc1 -E %s -triple=aarch64 \
+// RUN:   -fptrauth-intrinsics \
+// RUN:   -fptrauth-calls \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-init-fini | \
+// RUN:   FileCheck %s --check-prefixes=INTRIN,CALLS,NORETS,VPTR_ADDR_DISCR,VPTR_TYPE_DISCR,INITFINI
+
+// RUN: %clang_cc1 -E %s -triple=aarch64 \
+// RUN:   -fptrauth-intrinsics \
+// RUN:   -fptrauth-calls \
+// RUN:   -fptrauth-returns \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination \
+// RUN:   -fptrauth-init-fini | \
+// RUN:   FileCheck %s --check-prefixes=INTRIN,CALLS,RETS,NOVPTR_ADDR_DISCR,VPTR_TYPE_DISCR,INITFINI
+
+// RUN: %clang_cc1 -E %s -triple=aarch64 \
+// RUN:   -fptrauth-intrinsics \
+// RUN:   -fptrauth-calls \
+// RUN:   -fptrauth-returns \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -fptrauth-init-fini | \
+// RUN:   FileCheck %s --check-prefixes=INTRIN,CALLS,RETS,VPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,INITFINI
+
+// RUN: %clang_cc1 -E %s -triple=aarch64 \
+// RUN:   -fptrauth-intrinsics \
+// RUN:   -fptrauth-calls \
+// RUN:   -fptrauth-returns \
+// RUN:   -fptrauth-vtable-pointer-address-discrimination \
+// RUN:   -fptrauth-vtable-pointer-type-discrimination | \
+// RUN:   FileCheck %s --check-prefixes=INTRIN,CALLS,RETS,VPTR_ADDR_DISCR,VPTR_TYPE_DISCR,NOINITFINI
 
 #if __has_feature(ptrauth_intrinsics)
 // INTRIN: has_ptrauth_intrinsics
@@ -8,3 +62,52 @@ void has_ptrauth_intrinsics() {}
 // NOINTRIN: no_ptrauth_intrinsics
 void no_ptrauth_intrinsics() {}
 #endif
+
+#if __has_feature(ptrauth_calls)
+// CALLS: has_ptrauth_calls
+void has_ptrauth_calls() {}
+#else
+// NOCALLS: no_ptrauth_calls
+void no_ptrauth_calls() {}
+#endif
+
+// This is always enabled when ptrauth_calls is enabled
+#if __has_feature(ptrauth_member_function_pointer_type_discrimination)
+// CALLS: has_ptrauth_member_function_pointer_type_discrimination
+void has_ptrauth_member_function_pointer_type_discrimination() {}
+#else
+// NOCALLS: no_ptrauth_member_function_pointer_type_discrimination
+void no_ptrauth_member_function_pointer_type_discrimination() {}
+#endif
+
+#if __has_feature(ptrauth_returns)
+// RETS: has_ptrauth_returns
+void has_ptrauth_returns() {}
+#else
+// NORETS: no_ptrauth_returns
+void no_ptrauth_returns() {}
+#endif
+
+#if __has_feature(ptrauth_vtable_pointer_address_discrimination)
+// VPTR_ADDR_DISCR: has_ptrauth_vtable_pointer_address_discrimination
+void has_ptrauth_vtable_pointer_address_discrimination() {}
+#else
+// NOVPTR_ADDR_DISCR: no_ptrauth_vtable_pointer_address_discrimination
+void no_ptrauth_vtable_pointer_address_discrimination() {}
+#endif
+
+#if __has_feature(ptrauth_vtable_pointer_type_discrimination)
+// VPTR_TYPE_DISCR: has_ptrauth_vtable_pointer_type_discrimination
+void has_ptrauth_vtable_pointer_type_discrimination() {}
+#else
+// NOVPTR_TYPE_DISCR: no_ptrauth_vtable_pointer_type_discrimination
+void no_ptrauth_vtable_pointer_type_discrimination() {}
+#endif
+
+#if __has_feature(ptrauth_init_fini)
+// INITFINI: has_ptrauth_init_fini
+void has_ptrauth_init_fini() {}
+#else
+// NOINITFINI: no_ptrauth_init_fini
+void no_ptrauth_init_fini() {}
+#endif
diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
index ee4f81cd654b..913093bb51db 100644
--- a/clang/test/Preprocessor/riscv-target-features.c
+++ b/clang/test/Preprocessor/riscv-target-features.c
@@ -27,6 +27,7 @@
 // CHECK-NOT: __riscv_shvstvecd {{.*$}}
 // CHECK-NOT: __riscv_smaia {{.*$}}
 // CHECK-NOT: __riscv_smepmp {{.*$}}
+// CHECK-NOT: __riscv_smstateen {{.*$}}
 // CHECK-NOT: __riscv_ssaia {{.*$}}
 // CHECK-NOT: __riscv_ssccptr {{.*$}}
 // CHECK-NOT: __riscv_sscofpmf {{.*$}}
@@ -374,6 +375,14 @@
 // CHECK-SSCOUNTERENW-EXT: __riscv_sscounterenw 1000000{{$}}
 
 // RUN: %clang --target=riscv32-unknown-linux-gnu \
+// RUN:   -march=rv32ismstateen -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-SMSTATEEN-EXT %s
+// RUN: %clang --target=riscv64-unknown-linux-gnu \
+// RUN:   -march=rv64ismstateen -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-SMSTATEEN-EXT %s
+// CHECK-SMSTATEEN-EXT: __riscv_smstateen 1000000{{$}}
+
+// RUN: %clang --target=riscv32-unknown-linux-gnu \
 // RUN:   -march=rv32issstateen -E -dM %s \
 // RUN:   -o - | FileCheck --check-prefix=CHECK-SSSTATEEN-EXT %s
 // RUN: %clang --target=riscv64-unknown-linux-gnu \
diff --git a/clang/test/Preprocessor/wasm-target-features.c b/clang/test/Preprocessor/wasm-target-features.c
index 5a4f85461d5a..9d49e3af603f 100644
--- a/clang/test/Preprocessor/wasm-target-features.c
+++ b/clang/test/Preprocessor/wasm-target-features.c
@@ -194,26 +194,18 @@
 //
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_atomics__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_bulk_memory__ 1{{$}}
+// BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_exception_handling__ 1{{$}}
+// BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_extended_const__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_half_precision__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_multimemory__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_multivalue__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_mutable_globals__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_nontrapping_fptoint__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_reference_types__ 1{{$}}
+// BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_relaxed_simd__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_sign_ext__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_simd128__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_tail_call__ 1{{$}}
-//
-// RUN: %clang -E -dM %s -o - 2>&1 \
-// RUN:     -target wasm32-unknown-unknown -mcpu=bleeding-edge \
-// RUN:   | FileCheck %s -check-prefix=BLEEDING-EDGE
-// RUN: %clang -E -dM %s -o - 2>&1 \
-// RUN:     -target wasm64-unknown-unknown -mcpu=bleeding-edge \
-// RUN:   | FileCheck %s -check-prefix=BLEEDING-EDGE
-//
-// BLEEDING-EDGE-NOT: #define __wasm_exception_handling__ 1{{$}}
-// BLEEDING-EDGE-NOT: #define __wasm_extended_const__ 1{{$}}
-// BLEEDING-EDGE-NOT: #define __wasm_relaxed_simd__ 1{{$}}
 
 // RUN: %clang -E -dM %s -o - 2>&1 \
 // RUN:     -target wasm32-unknown-unknown -mcpu=bleeding-edge -mno-simd128 \
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 5602c59158fe..57104c9e7a50 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -802,6 +802,7 @@
 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=ppx -x c -E -dM -o - %s | FileCheck --check-prefix=PPX %s
 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=ndd -x c -E -dM -o - %s | FileCheck --check-prefix=NDD %s
 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=ccmp -x c -E -dM -o - %s | FileCheck --check-prefix=CCMP %s
+// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=nf -x c -E -dM -o - %s | FileCheck --check-prefix=NF %s
 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=cf -x c -E -dM -o - %s | FileCheck --check-prefix=CF %s
 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD,APXF %s
 // APXF: #define __APX_F__ 1
@@ -809,5 +810,6 @@
 // CF: #define __CF__ 1
 // EGPR: #define __EGPR__ 1
 // NDD: #define __NDD__ 1
+// NF: #define __NF__ 1
 // PPX: #define __PPX__ 1
 // PUSH2POP2: #define __PUSH2POP2__ 1
diff --git a/clang/test/Profile/cxx-missing-bodies.cpp b/clang/test/Profile/cxx-missing-bodies.cpp
index fe926b3b2182..6d34fca482c9 100644
--- a/clang/test/Profile/cxx-missing-bodies.cpp
+++ b/clang/test/Profile/cxx-missing-bodies.cpp
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -emit-llvm %s -std=c++11 -S -emit-llvm -o - -triple=i386-pc-win32 -fno-rtti -fprofile-instrument=clang | FileCheck %s --check-prefix=GEN
+// RUN: %clang_cc1 %s -std=c++11 -emit-llvm -o - -triple=i386-pc-win32 -fno-rtti -fprofile-instrument=clang | FileCheck %s --check-prefix=GEN
 //
 // Don't crash when presented profile data for functions without bodies:
 // RUN: llvm-profdata merge %S/Inputs/cxx-missing-bodies.proftext -o %t.profdata
-// RUN: %clang_cc1 -emit-llvm %s -std=c++11 -S -emit-llvm -o /dev/null -triple=i386-pc-win32 -fno-rtti -fprofile-instrument-use-path=%t.profdata -w
+// RUN: %clang_cc1 %s -std=c++11 -emit-llvm-only -triple=i386-pc-win32 -fno-rtti -fprofile-instrument-use-path=%t.profdata -w
 
 // GEN-NOT: __profn{{.*}}??_GA@@UAEPAXI@Z
 // GEN-NOT: __profn{{.*}}??_DA@@QAEXXZ
diff --git a/clang/test/Profile/debug-info-instr_profile_switch.cpp b/clang/test/Profile/debug-info-instr_profile_switch.cpp
new file mode 100644
index 000000000000..a78a1812a5a2
--- /dev/null
+++ b/clang/test/Profile/debug-info-instr_profile_switch.cpp
@@ -0,0 +1,40 @@
+// Tests that we don't attach misleading debug locations to llvm.instrprof.increment
+// counters.
+
+// RUN: %clang_cc1 -x c++ %s -debug-info-kind=standalone -triple %itanium_abi_triple -main-file-name debug-info-instr_profile_switch.cpp -std=c++11 -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s
+
+int main(int argc, const char *argv[]) {
+  switch(argc) {
+    case 0:
+      return 0;
+    case 1:
+      return 1;
+  }
+}
+
+// CHECK: define {{.*}} @main({{.*}}) #0 !dbg ![[MAIN_SCOPE:[0-9]+]]
+
+// CHECK:        switch i32 {{.*}}, label {{.*}} [
+// CHECK-NEXT:     i32 0, label %[[CASE1_LBL:[a-z0-9.]+]]
+// CHECK-NEXT:     i32 1, label %[[CASE2_LBL:[a-z0-9.]+]]
+// CHECK-NEXT:   ], !dbg ![[SWITCH_LOC:[0-9]+]]
+
+// CHECK:       [[CASE1_LBL]]:
+// CHECK-NEXT:     %{{.*}} = load i64, ptr getelementptr inbounds ({{.*}}, ptr @__profc_main, {{.*}}), align {{.*}}, !dbg ![[CTR_LOC:[0-9]+]]
+// CHECK-NEXT:     %{{.*}} = add {{.*}}, !dbg ![[CTR_LOC]]
+// CHECK-NEXT:     store i64 {{.*}}, ptr getelementptr inbounds ({{.*}}, ptr @__profc_main, {{.*}}), align {{.*}}, !dbg ![[CTR_LOC]]
+// CHECK-NEXT:     store i32 0, {{.*}} !dbg ![[CASE1_LOC:[0-9]+]]
+// CHECK-NEXT:     br label {{.*}}, !dbg ![[CASE1_LOC]]
+
+// CHECK:       [[CASE2_LBL]]:
+// CHECK-NEXT:     %{{.*}} = load i64, ptr getelementptr inbounds ({{.*}}, ptr @__profc_main, {{.*}}), align {{.*}}, !dbg ![[CTR_LOC]]
+// CHECK-NEXT:     %{{.*}} = add {{.*}}, !dbg ![[CTR_LOC]]
+// CHECK-NEXT:     store i64 {{.*}}, ptr getelementptr inbounds ({{.*}}, ptr @__profc_main, {{.*}}), align {{.*}}, !dbg ![[CTR_LOC]]
+// CHECK-NEXT:     store i32 1, {{.*}} !dbg ![[CASE2_LOC:[0-9]+]]
+// CHECK-NEXT:     br label {{.*}}, !dbg ![[CASE2_LOC]]
+
+// CHECK: ![[SWITCH_LOC]] = !DILocation({{.*}}, scope: ![[MAIN_SCOPE]])
+// CHECK: ![[CTR_LOC]] = !DILocation(line: 0, scope: ![[BLOCK_SCOPE:[0-9]+]])
+// CHECK: ![[BLOCK_SCOPE]] = distinct !DILexicalBlock(scope: ![[MAIN_SCOPE]]
+// CHECK: ![[CASE1_LOC]] = !DILocation(line: {{.*}}, column: {{.*}}, scope: ![[BLOCK_SCOPE]])
+// CHECK: ![[CASE2_LOC]] = !DILocation(line: {{.*}}, column: {{.*}}, scope: ![[BLOCK_SCOPE]])
diff --git a/clang/test/Rewriter/rewrite-super-message.mm b/clang/test/Rewriter/rewrite-super-message.mm
index c1d0888b5fff..0b9272117edc 100644
--- a/clang/test/Rewriter/rewrite-super-message.mm
+++ b/clang/test/Rewriter/rewrite-super-message.mm
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -x objective-c++ -Wno-return-type -fblocks -fms-extensions -rewrite-objc -fobjc-runtime=macosx-fragile-10.5 %s -o %t-rw.cpp
-// RUN: %clang_cc1 -fsyntax-only -Wno-address-of-temporary -DKEEP_ATTRIBUTES -D"id=struct objc_object *" -D"Class=struct objc_class *" -D"SEL=void*" -D"__declspec(X)=" -emit-llvm -o - %t-rw.cpp | FileCheck %t-rw.cpp
+// RUN: %clang_cc1 -Wno-address-of-temporary -DKEEP_ATTRIBUTES -D"id=struct objc_object *" -D"Class=struct objc_class *" -D"SEL=void*" -D"__declspec(X)=" -emit-llvm -o - %t-rw.cpp | FileCheck %t-rw.cpp
 
 void *sel_registerName(const char *);
 
diff --git a/clang/test/Sema/aarch64-fp16-target.c b/clang/test/Sema/aarch64-fp16-target.c
index 9d84cca84482..af359dc9ecf1 100644
--- a/clang/test/Sema/aarch64-fp16-target.c
+++ b/clang/test/Sema/aarch64-fp16-target.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -fsyntax-only -verify -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -verify -emit-llvm -o - %s
 // REQUIRES: aarch64-registered-target
 
 // Test that functions with the correct target attributes can use the correct FP16 intrinsics.
diff --git a/clang/test/Sema/aarch64-neon-target.c b/clang/test/Sema/aarch64-neon-target.c
index 18e4f981acc5..fa45fff1d183 100644
--- a/clang/test/Sema/aarch64-neon-target.c
+++ b/clang/test/Sema/aarch64-neon-target.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -fsyntax-only -verify -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -verify -emit-llvm -o - %s
 // REQUIRES: aarch64-registered-target
 
 // Test that functions with the correct target attributes can use the correct NEON intrinsics.
diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
index 6af115beba8e..7bf8fce1fcb2 100644
--- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
+++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fsyntax-only -verify -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -emit-llvm -o - %s
 // REQUIRES: aarch64-registered-target
 
 // Test that functions with the correct target attributes can use the correct SME intrinsics.
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
new file mode 100644
index 000000000000..b1582569971d
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
@@ -0,0 +1,90 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -verify -emit-llvm %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sme.h>
+
+
+void test_features_f16f16(uint32_t slice,
+                          svfloat16_t zm,
+                          svfloat16x2_t zn2, svfloat16x2_t zm2,
+                          svfloat16x4_t zn4, svfloat16x4_t zm4,
+                          svbfloat16_t bzm,
+                          svbfloat16x2_t bzn2, svbfloat16x2_t bzm2,
+                          svbfloat16x4_t bzn4, svbfloat16x4_t bzm4)
+
+   __arm_streaming __arm_inout("za") {
+  // expected-error@+1 {{'svmla_single_za16_f16_vg1x2' needs target feature sme-f16f16}}
+  svmla_single_za16_f16_vg1x2(slice, zn2, zm);
+  // expected-error@+1 {{'svmla_single_za16_f16_vg1x4' needs target feature sme-f16f16}}
+  svmla_single_za16_f16_vg1x4(slice, zn4, zm);
+  // expected-error@+1 {{'svmls_single_za16_f16_vg1x2' needs target feature sme-f16f16}}
+  svmls_single_za16_f16_vg1x2(slice, zn2, zm);
+  // expected-error@+1 {{'svmls_single_za16_f16_vg1x4' needs target feature sme-f16f16}}
+  svmls_single_za16_f16_vg1x4(slice, zn4, zm);
+  // expected-error@+1 {{'svmla_za16_f16_vg1x2' needs target feature sme-f16f16}}
+  svmla_za16_f16_vg1x2(slice, zn2, zm2);
+  // expected-error@+1 {{'svmla_za16_f16_vg1x4' needs target feature sme-f16f16}}
+  svmla_za16_f16_vg1x4(slice, zn4, zm4);
+  // expected-error@+1 {{'svmls_za16_f16_vg1x2' needs target feature sme-f16f16}}
+  svmls_za16_f16_vg1x2(slice, zn2, zm2);
+  // expected-error@+1 {{'svmls_za16_f16_vg1x4' needs target feature sme-f16f16}}
+  svmls_za16_f16_vg1x4(slice, zn4, zm4);
+  // expected-error@+1 {{'svmla_lane_za16_f16_vg1x2' needs target feature sme-f16f16}}
+  svmla_lane_za16_f16_vg1x2(slice, zn2, zm, 7);
+  // expected-error@+1 {{'svmla_lane_za16_f16_vg1x4' needs target feature sme-f16f16}}
+  svmla_lane_za16_f16_vg1x4(slice, zn4, zm, 7);
+  // expected-error@+1 {{'svmls_lane_za16_f16_vg1x2' needs target feature sme-f16f16}}
+  svmls_lane_za16_f16_vg1x2(slice, zn2, zm, 7);
+  // expected-error@+1 {{'svmls_lane_za16_f16_vg1x4' needs target feature sme-f16f16}}
+  svmls_lane_za16_f16_vg1x4(slice, zn4, zm, 7);
+
+  // expected-error@+1 {{'svmla_single_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+  svmla_single_za16_bf16_vg1x2(slice, bzn2, bzm);
+  // expected-error@+1 {{'svmla_single_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+  svmla_single_za16_bf16_vg1x4(slice, bzn4, bzm);
+  // expected-error@+1 {{'svmls_single_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+  svmls_single_za16_bf16_vg1x2(slice, bzn2, bzm);
+  // expected-error@+1 {{'svmls_single_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+  svmls_single_za16_bf16_vg1x4(slice, bzn4, bzm);
+  // expected-error@+1 {{'svmla_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+  svmla_za16_bf16_vg1x2(slice, bzn2, bzm2);
+  // expected-error@+1 {{'svmla_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+  svmla_za16_bf16_vg1x4(slice, bzn4, bzm4);
+  // expected-error@+1 {{'svmls_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+  svmls_za16_bf16_vg1x2(slice, bzn2, bzm2);
+  // expected-error@+1 {{'svmls_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+  svmls_za16_bf16_vg1x4(slice, bzn4, bzm4);
+  // expected-error@+1 {{'svmla_lane_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+  svmla_lane_za16_bf16_vg1x2(slice, bzn2, bzm, 7);
+  // expected-error@+1 {{'svmla_lane_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+  svmla_lane_za16_bf16_vg1x4(slice, bzn4, bzm, 7);
+  // expected-error@+1 {{'svmls_lane_za16_bf16_vg1x2' needs target feature sme2,b16b16}}
+  svmls_lane_za16_bf16_vg1x2(slice, bzn2, bzm, 7);
+  // expected-error@+1 {{'svmls_lane_za16_bf16_vg1x4' needs target feature sme2,b16b16}}
+  svmls_lane_za16_bf16_vg1x4(slice, bzn4, bzm, 7);
+}
+
+
+void test_imm(uint32_t slice, svfloat16_t zm, svfloat16x2_t zn2,svfloat16x4_t zn4,
+              svbfloat16_t bzm, svbfloat16x2_t bzn2, svbfloat16x4_t bzn4)
+  __arm_streaming __arm_inout("za") {
+
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmla_lane_za16_f16_vg1x2(slice, zn2, zm, -1);
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmla_lane_za16_f16_vg1x4(slice, zn4, zm, -1);
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmls_lane_za16_f16_vg1x2(slice, zn2, zm, -1);
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmls_lane_za16_f16_vg1x4(slice, zn4, zm, -1);
+
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmla_lane_za16_bf16_vg1x2(slice, bzn2, bzm, -1);
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmla_lane_za16_bf16_vg1x4(slice, bzn4, bzm, -1);
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmls_lane_za16_bf16_vg1x2(slice, bzn2, bzm, -1);
+  // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+  svmls_lane_za16_bf16_vg1x4(slice, bzn4, bzm, -1);
+}
+\ No newline at end of file
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_mopa_nonwide.c b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_mopa_nonwide.c
new file mode 100644
index 000000000000..201ad4b8ff7f
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_mopa_nonwide.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -verify -emit-llvm %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sme.h>
+
+void test_features(svbool_t pn, svbool_t pm,
+                   svfloat16_t zn, svfloat16_t zm,
+                   svbfloat16_t znb, svbfloat16_t zmb)
+  __arm_streaming __arm_inout("za") {
+// expected-error@+1 {{'svmopa_za16_bf16_m' needs target feature sme2,b16b16}}
+  svmopa_za16_bf16_m(0, pn, pm, znb, zmb);
+// expected-error@+1 {{'svmops_za16_bf16_m' needs target feature sme2,b16b16}}
+  svmops_za16_bf16_m(0, pn, pm, znb, zmb);
+// expected-error@+1 {{'svmopa_za16_f16_m' needs target feature sme-f16f16}}
+  svmopa_za16_f16_m(0, pn, pm, zn, zm);
+// expected-error@+1 {{'svmops_za16_f16_m' needs target feature sme-f16f16}}
+  svmops_za16_f16_m(0, pn, pm, zn, zm);
+}
+
+void test_imm(svbool_t pn, svbool_t pm,
+              svfloat16_t zn, svfloat16_t zm,
+              svbfloat16_t znb, svbfloat16_t zmb)
+  __arm_streaming __arm_inout("za") {
+// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+  svmopa_za16_bf16_m(-1, pn, pm, znb, zmb);
+// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+  svmops_za16_bf16_m(-1, pn, pm, znb, zmb);
+// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+  svmopa_za16_f16_m(-1, pn, pm, zn, zm);
+// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+  svmops_za16_f16_m(-1, pn, pm, zn, zm);
+}
+
diff --git a/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bfloat.cpp b/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bfloat.cpp
index 8ee2037da1c3..1e6401ed50e4 100644
--- a/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bfloat.cpp
+++ b/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bfloat.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fsyntax-only -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s
 
 #include <arm_sve.h>
 
diff --git a/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_target.cpp b/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_target.cpp
index 2f771ca170e7..f1578488fa0a 100644
--- a/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_target.cpp
+++ b/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_target.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -fsyntax-only -verify -emit-llvm -o - -ferror-limit 100 %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -verify -emit-llvm -o - -ferror-limit 100 %s
 // REQUIRES: aarch64-registered-target
 
 // Test that functions with the correct target attributes can use the correct SVE intrinsics.
diff --git a/clang/test/Sema/aarch64-sve-vector-log-ops.c b/clang/test/Sema/aarch64-sve-vector-log-ops.c
index 2beb616c1edb..ef16e8581844 100644
--- a/clang/test/Sema/aarch64-sve-vector-log-ops.c
+++ b/clang/test/Sema/aarch64-sve-vector-log-ops.c
@@ -1,23 +1,23 @@
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sve \
-// RUN:   -disable-O0-optnone -o - -fsyntax-only %s -verify
-// REQUIRES: aarch64-registered-target
-
-#include <arm_sve.h>
-
-svfloat32_t test_log_vv_i8mf8(svfloat32_t v) {
-
-  return __builtin_elementwise_log(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-svfloat32_t test_log10_vv_i8mf8(svfloat32_t v) {
-
-  return __builtin_elementwise_log10(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-svfloat32_t test_log2_vv_i8mf8(svfloat32_t v) {
-
-  return __builtin_elementwise_log2(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve \
+// RUN:   -disable-O0-optnone -o - -fsyntax-only %s -verify
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+svfloat32_t test_log_vv_i8mf8(svfloat32_t v) {
+
+  return __builtin_elementwise_log(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+svfloat32_t test_log10_vv_i8mf8(svfloat32_t v) {
+
+  return __builtin_elementwise_log10(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+svfloat32_t test_log2_vv_i8mf8(svfloat32_t v) {
+
+  return __builtin_elementwise_log2(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
diff --git a/clang/test/Sema/aarch64-sve-vector-trig-ops.c b/clang/test/Sema/aarch64-sve-vector-trig-ops.c
index 7ca941f578c7..6863f32b5948 100644
--- a/clang/test/Sema/aarch64-sve-vector-trig-ops.c
+++ b/clang/test/Sema/aarch64-sve-vector-trig-ops.c
@@ -6,7 +6,7 @@
 
 
 svfloat32_t test_sin_vv_i8mf8(svfloat32_t v) {
-  
+
   return __builtin_elementwise_sin(v);
   // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
 }
@@ -16,3 +16,9 @@ svfloat32_t test_cos_vv_i8mf8(svfloat32_t v) {
   return __builtin_elementwise_cos(v);
   // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
 }
+
+svfloat32_t test_tan_vv_i8mf8(svfloat32_t v) {
+
+  return __builtin_elementwise_tan(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2.cpp
index 28fb1bf78d5f..a12b57db56a4 100644
--- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2.cpp
+++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2.cpp
@@ -1,7 +1,7 @@
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fsyntax-only -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -fsyntax-only -verify=overload -verify-ignore-unexpected=error,note -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -verify=overload -verify-ignore-unexpected=error,note -emit-llvm -o - %s
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp
index d7703b4d2f6e..898f18efa938 100644
--- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp
+++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp
@@ -1,7 +1,7 @@
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -fsyntax-only -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -fsyntax-only -verify=overload -verify-ignore-unexpected=error,note -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2 -verify=overload -verify-ignore-unexpected=error,note -emit-llvm -o - %s
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_bfloat.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_bfloat.cpp
index b82592a12b83..78593b2f9c9e 100644
--- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_bfloat.cpp
+++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_bfloat.cpp
@@ -1,7 +1,7 @@
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -fsyntax-only -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fsyntax-only -verify=overload -verify-ignore-unexpected=error,note -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -verify=overload -verify-ignore-unexpected=error,note -emit-llvm -o - %s
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/Sema/aix-builtin-cpu-unsupports.c b/clang/test/Sema/aix-builtin-cpu-unsupports.c
index 10e21867c393..e7ac780e4d0e 100644
--- a/clang/test/Sema/aix-builtin-cpu-unsupports.c
+++ b/clang/test/Sema/aix-builtin-cpu-unsupports.c
@@ -1,6 +1,30 @@
-// RUN: %clang_cc1 -fsyntax-only -triple  powerpc-ibm-aix7.1.0.0 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple powerpc-ibm-aix7.2.0.0 -verify %s
 
 int main(void) {
-  if (__builtin_cpu_is("power8")) // expected-error {{this builtin is available only on AIX 7.2 and later operating systems}}
+  if (__builtin_cpu_supports("aes")) // expected-warning {{invalid cpu feature string for builtin}}
+    return 1;
+
+  if (__builtin_cpu_supports("archpmu")) // expected-warning {{invalid cpu feature string for builtin}}
+    return 1;
+
+  if (__builtin_cpu_supports("htm-nosc")) // expected-warning {{invalid cpu feature string for builtin}}
+    return 1;
+
+  if (__builtin_cpu_supports("htm-no-suspend")) // expected-warning {{invalid cpu feature string for builtin}}
+    return 1;
+
+  if (__builtin_cpu_supports("ic_snoop")) // expected-warning {{invalid cpu feature string for builtin}}
+    return 1;
+
+  if (__builtin_cpu_supports("ieee128")) // expected-warning {{invalid cpu feature string for builtin}}
+    return 1;
+
+  if (__builtin_cpu_supports("notb")) // expected-warning {{invalid cpu feature string for builtin}}
+    return 1;
+
+  if (__builtin_cpu_supports("scv")) // expected-warning {{invalid cpu feature string for builtin}}
+    return 1;
+
+  if (__builtin_cpu_supports("vcrypto")) // expected-warning {{invalid cpu feature string for builtin}}
     return 1;
 }
diff --git a/clang/test/Sema/arm-neon-target.c b/clang/test/Sema/arm-neon-target.c
index f1f17418f36f..1dc2b00925d6 100644
--- a/clang/test/Sema/arm-neon-target.c
+++ b/clang/test/Sema/arm-neon-target.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple armv8a-none-linux-gnu -target-feature +neon -fsyntax-only -verify -emit-llvm -o - %s
+// RUN: %clang_cc1 -triple armv8a-none-linux-gnu -target-feature +neon -verify -emit-llvm -o - %s
 // REQUIRES: arm-registered-target
 
 // Test that functions with the correct target attributes can use the correct NEON intrinsics.
diff --git a/clang/test/Sema/attr-alias-elf.c b/clang/test/Sema/attr-alias-elf.c
index 6b629e1b509a..d2674d1db031 100644
--- a/clang/test/Sema/attr-alias-elf.c
+++ b/clang/test/Sema/attr-alias-elf.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-pc-linux -Wno-strict-prototypes -fsyntax-only -verify -emit-llvm-only %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux -Wno-strict-prototypes -verify -emit-llvm-only %s
 
 void f1(void) __attribute__((alias("g1")));
 void g1(void) {
diff --git a/clang/test/Sema/attr-availability-swift.c b/clang/test/Sema/attr-availability-swift.c
index d77094cb2163..a0ff5302af3e 100644
--- a/clang/test/Sema/attr-availability-swift.c
+++ b/clang/test/Sema/attr-availability-swift.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin9 -fsyntax-only -fblocks -verify %s
-// RUN: %clang_cc1 -triple x86_64-apple-darwin9 -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin9 -ast-dump %s | FileCheck %s
 //
 
 #if !__has_feature(attribute_availability_with_message)
diff --git a/clang/test/Sema/attr-self-alias.c b/clang/test/Sema/attr-self-alias.c
index 7c50458be37a..ff75326f6b0c 100644
--- a/clang/test/Sema/attr-self-alias.c
+++ b/clang/test/Sema/attr-self-alias.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-pc-linux  -fsyntax-only -verify -emit-llvm-only %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux  -verify -emit-llvm-only %s
 
 int self_alias(void) __attribute__((weak, alias("self_alias"))); // expected-error {{alias definition is part of a cycle}}
 
diff --git a/clang/test/Sema/bpf-attr-preserve-static-offset.c b/clang/test/Sema/bpf-attr-preserve-static-offset.c
index 5f53469869f3..f1519a3f1f3c 100644
--- a/clang/test/Sema/bpf-attr-preserve-static-offset.c
+++ b/clang/test/Sema/bpf-attr-preserve-static-offset.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -ast-dump -triple bpf-pc-linux-gnu %s | FileCheck %s
+// RUN: %clang_cc1 -ast-dump -triple bpf-pc-linux-gnu %s | FileCheck %s
 
 // The 'preserve_static_offset' attribute should be propagated to
 // inline declarations (foo's 'b', 'bb', 'c' but not 'd').
diff --git a/clang/test/Sema/builtin-cpu-unsupports-AIX-Os.c b/clang/test/Sema/builtin-cpu-unsupports-AIX-Os.c
new file mode 100644
index 000000000000..25d25b2ac4c9
--- /dev/null
+++ b/clang/test/Sema/builtin-cpu-unsupports-AIX-Os.c
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -fsyntax-only -triple  powerpc-ibm-aix7.1.0.0 -verify %s
+
+int main(void) {
+  if (__builtin_cpu_is("power8")) // expected-error {{this builtin is available only on AIX 7.2 and later operating systems}}
+    return 1;
+
+  if (__builtin_cpu_supports("power8")) // expected-error {{this builtin is available only on AIX 7.2 and later operating systems}}
+    return 1;
+}
diff --git a/clang/test/Sema/builtin-setjmp.c b/clang/test/Sema/builtin-setjmp.c
index 5092d1665c19..a71f87162612 100644
--- a/clang/test/Sema/builtin-setjmp.c
+++ b/clang/test/Sema/builtin-setjmp.c
@@ -1,13 +1,13 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=c,expected -DNO_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=c,expected -DWRONG_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=c,expected -DRIGHT_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=c,expected -DONLY_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=c,expected -DNO_SETJMP %s -ast-dump 2>&1 | FileCheck %s --check-prefixes=CHECK1,CHECK2
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=cxx,expected -x c++ -DNO_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=cxx,expected -x c++ -DWRONG_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=cxx,expected -x c++ -DRIGHT_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=cxx,expected -x c++ -DONLY_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK2
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=cxx,expected -x c++ -DNO_SETJMP %s -ast-dump | FileCheck %s --check-prefixes=CHECK2
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -verify=c,expected -DNO_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -verify=c,expected -DWRONG_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -verify=c,expected -DRIGHT_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -verify=c,expected -DONLY_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -verify=c,expected -DNO_SETJMP %s -ast-dump 2>&1 | FileCheck %s --check-prefixes=CHECK1,CHECK2
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -verify=cxx,expected -x c++ -DNO_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -verify=cxx,expected -x c++ -DWRONG_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -verify=cxx,expected -x c++ -DRIGHT_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -verify=cxx,expected -x c++ -DONLY_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK2
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -verify=cxx,expected -x c++ -DNO_SETJMP %s -ast-dump | FileCheck %s --check-prefixes=CHECK2
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/clang/test/Sema/builtins-arm64-mte.c b/clang/test/Sema/builtins-arm64-mte.c
index 49db0484a101..1b0621f3c5e3 100644
--- a/clang/test/Sema/builtins-arm64-mte.c
+++ b/clang/test/Sema/builtins-arm64-mte.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -triple arm64-arm-eabi %s -target-feature +mte -fsyntax-only -verify
 // RUN: %clang_cc1 -triple arm64-arm-eabi %s -target-feature +mte -x c++ -fsyntax-only -verify
-// RUN: %clang_cc1 -triple arm64-arm-eabi %s -DNO_MTE -x c++ -S -emit-llvm -verify -o -
+// RUN: %clang_cc1 -triple arm64-arm-eabi %s -DNO_MTE -x c++ -emit-llvm-only -verify
 #include <stddef.h>
 #include <arm_acle.h>
 
diff --git a/clang/test/Sema/builtins-elementwise-math.c b/clang/test/Sema/builtins-elementwise-math.c
index 2e05337273ee..2e4319d158e7 100644
--- a/clang/test/Sema/builtins-elementwise-math.c
+++ b/clang/test/Sema/builtins-elementwise-math.c
@@ -626,6 +626,27 @@ void test_builtin_elementwise_sqrt(int i, float f, double d, float4 v, int3 iv,
   // expected-error@-1 {{1st argument must be a floating point type (was 'unsigned4' (vector of 4 'unsigned int' values))}}
 }
 
+void test_builtin_elementwise_tan(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) {
+
+  struct Foo s = __builtin_elementwise_tan(f);
+  // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'float'}}
+
+  i = __builtin_elementwise_tan();
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+
+  i = __builtin_elementwise_tan(i);
+  // expected-error@-1 {{1st argument must be a floating point type (was 'int')}}
+
+  i = __builtin_elementwise_tan(f, f);
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+
+  u = __builtin_elementwise_tan(u);
+  // expected-error@-1 {{1st argument must be a floating point type (was 'unsigned int')}}
+
+  uv = __builtin_elementwise_tan(uv);
+  // expected-error@-1 {{1st argument must be a floating point type (was 'unsigned4' (vector of 4 'unsigned int' values))}}
+}
+
 void test_builtin_elementwise_trunc(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) {
 
   struct Foo s = __builtin_elementwise_trunc(f);
diff --git a/clang/test/Sema/code_align_ast.c b/clang/test/Sema/code_align_ast.c
index 2cfbf11f1fd0..c9b6466f0ae9 100644
--- a/clang/test/Sema/code_align_ast.c
+++ b/clang/test/Sema/code_align_ast.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -fsyntax-only -ast-dump -verify -x c %s | FileCheck -check-prefix=CHECK-C %s
-// RUN: %clang_cc1 -fsyntax-only -ast-dump -x c++ -std=c++11 %s | FileCheck %s --check-prefixes CHECK-C,CHECK-CPP
+// RUN: %clang_cc1 -ast-dump -verify -x c %s | FileCheck -check-prefix=CHECK-C %s
+// RUN: %clang_cc1 -ast-dump -x c++ -std=c++11 %s | FileCheck %s --check-prefixes CHECK-C,CHECK-CPP
 
 // expected-no-diagnostics
 
diff --git a/clang/test/Sema/format-strings-no-fixit.c b/clang/test/Sema/format-strings-no-fixit.c
index dd323bf5c5de..9cfec49157a7 100644
--- a/clang/test/Sema/format-strings-no-fixit.c
+++ b/clang/test/Sema/format-strings-no-fixit.c
@@ -1,5 +1,5 @@
 // RUN: cp %s %t
-// RUN: %clang_cc1 -fsyntax-only -fixit %t
+// RUN: %clang_cc1 -fixit %t
 // RUN: %clang_cc1 -E -o - %t | FileCheck %s
 
 /* This is a test of the various code modification hints that are
diff --git a/clang/test/Sema/incomplete-struct-decl.cpp b/clang/test/Sema/incomplete-struct-decl.cpp
new file mode 100644
index 000000000000..bc3bd6b2eae2
--- /dev/null
+++ b/clang/test/Sema/incomplete-struct-decl.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -x c++ -fsyntax-only -verify=cxx,expected %s
+
+template <class a> using __impl_of = a; // expected-note {{'__impl_of' declared here}} \
+                                           expected-note {{template is declared here}}
+struct {                                // expected-error {{anonymous structs and classes must be class members}} \
+                                           expected-note {{to match this '{'}}
+  __impl_;                              // expected-error {{no template named '__impl_'; did you mean '__impl_of'?}} \
+                                           expected-error {{cannot specify deduction guide for alias template '__impl_of'}} \
+                                           expected-error {{expected ';' after struct}}
+                                        // expected-error {{expected '}'}}
diff --git a/clang/test/Sema/incorrect_pure.cpp b/clang/test/Sema/incorrect_pure.cpp
index 69ae41c42130..acafb7ffabaa 100644
--- a/clang/test/Sema/incorrect_pure.cpp
+++ b/clang/test/Sema/incorrect_pure.cpp
@@ -1,14 +1,14 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-
-[[gnu::pure]] void foo(); // expected-warning{{'pure' attribute on function returning 'void'; attribute ignored}}
-
-[[gnu::const]] void bar(); // expected-warning{{'const' attribute on function returning 'void'; attribute ignored}}
-
-struct A {
-    [[gnu::pure]] A(); // expected-warning{{'pure' attribute on function returning 'void'; attribute ignored}}
-
-    [[gnu::const]] A(int); // expected-warning{{'const' attribute on function returning 'void'; attribute ignored}}
-    [[gnu::pure]] ~A(); // expected-warning{{'pure' attribute on function returning 'void'; attribute ignored}}
-
-    [[gnu::const]] [[gnu::pure]] int m(); // expected-warning{{'const' attribute imposes more restrictions; 'pure' attribute ignored}}
-};
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+[[gnu::pure]] void foo(); // expected-warning{{'pure' attribute on function returning 'void'; attribute ignored}}
+
+[[gnu::const]] void bar(); // expected-warning{{'const' attribute on function returning 'void'; attribute ignored}}
+
+struct A {
+    [[gnu::pure]] A(); // expected-warning{{'pure' attribute on function returning 'void'; attribute ignored}}
+
+    [[gnu::const]] A(int); // expected-warning{{'const' attribute on function returning 'void'; attribute ignored}}
+    [[gnu::pure]] ~A(); // expected-warning{{'pure' attribute on function returning 'void'; attribute ignored}}
+
+    [[gnu::const]] [[gnu::pure]] int m(); // expected-warning{{'const' attribute imposes more restrictions; 'pure' attribute ignored}}
+};
diff --git a/clang/test/Sema/ppc-attr-target-inline.c b/clang/test/Sema/ppc-attr-target-inline.c
index ad198b842bb0..6f84cf773f23 100644
--- a/clang/test/Sema/ppc-attr-target-inline.c
+++ b/clang/test/Sema/ppc-attr-target-inline.c
@@ -1,5 +1,5 @@
 // REQUIRES: powerpc-registered-target
-// RUN: %clang_cc1 -triple powerpc64le -target-feature +htm -fsyntax-only -emit-llvm-only %s -verify
+// RUN: %clang_cc1 -triple powerpc64le -target-feature +htm -emit-llvm-only %s -verify
 
 __attribute__((always_inline))
 int test1(int *x) {
diff --git a/clang/test/Sema/riscv-rvv-vector-log-ops.c b/clang/test/Sema/riscv-rvv-vector-log-ops.c
index dfbfa0664fde..970cfe3fd68b 100644
--- a/clang/test/Sema/riscv-rvv-vector-log-ops.c
+++ b/clang/test/Sema/riscv-rvv-vector-log-ops.c
@@ -1,25 +1,25 @@
-// RUN: %clang_cc1 -triple riscv64 -target-feature +f -target-feature +d \
-// RUN:   -target-feature +v -target-feature +zfh -target-feature +zvfh \
-// RUN:   -disable-O0-optnone -o - -fsyntax-only %s -verify 
-// REQUIRES: riscv-registered-target
-
-#include <riscv_vector.h>
-
-
-vfloat32mf2_t test_log_vv_i8mf8(vfloat32mf2_t v) {
-
-  return __builtin_elementwise_log(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-vfloat32mf2_t test_log10_vv_i8mf8(vfloat32mf2_t v) {
-
-  return __builtin_elementwise_log10(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
-
-vfloat32mf2_t test_log2_vv_i8mf8(vfloat32mf2_t v) {
-
-  return __builtin_elementwise_log2(v);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
-}
+// RUN: %clang_cc1 -triple riscv64 -target-feature +f -target-feature +d \
+// RUN:   -target-feature +v -target-feature +zfh -target-feature +zvfh \
+// RUN:   -disable-O0-optnone -o - -fsyntax-only %s -verify
+// REQUIRES: riscv-registered-target
+
+#include <riscv_vector.h>
+
+
+vfloat32mf2_t test_log_vv_i8mf8(vfloat32mf2_t v) {
+
+  return __builtin_elementwise_log(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+vfloat32mf2_t test_log10_vv_i8mf8(vfloat32mf2_t v) {
+
+  return __builtin_elementwise_log10(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
+
+vfloat32mf2_t test_log2_vv_i8mf8(vfloat32mf2_t v) {
+
+  return __builtin_elementwise_log2(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
diff --git a/clang/test/Sema/riscv-rvv-vector-trig-ops.c b/clang/test/Sema/riscv-rvv-vector-trig-ops.c
index a457e4848606..459582fe2839 100644
--- a/clang/test/Sema/riscv-rvv-vector-trig-ops.c
+++ b/clang/test/Sema/riscv-rvv-vector-trig-ops.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -triple riscv64 -target-feature +f -target-feature +d \
 // RUN:   -target-feature +v -target-feature +zfh -target-feature +zvfh \
-// RUN:   -disable-O0-optnone -o - -fsyntax-only %s -verify 
+// RUN:   -disable-O0-optnone -o - -fsyntax-only %s -verify
 // REQUIRES: riscv-registered-target
 
 #include <riscv_vector.h>
@@ -17,3 +17,9 @@ vfloat32mf2_t test_cos_vv_i8mf8(vfloat32mf2_t v) {
   return __builtin_elementwise_cos(v);
   // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
 }
+
+vfloat32mf2_t test_tan_vv_i8mf8(vfloat32mf2_t v) {
+
+  return __builtin_elementwise_tan(v);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type}}
+}
diff --git a/clang/test/Sema/sizeof-struct-non-zero-as-member.cl b/clang/test/Sema/sizeof-struct-non-zero-as-member.cl
index ecc545b4d899..2fa7d1f109a4 100644
--- a/clang/test/Sema/sizeof-struct-non-zero-as-member.cl
+++ b/clang/test/Sema/sizeof-struct-non-zero-as-member.cl
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -verify -fsyntax-only -triple amdgcn -target-cpu verde -S -emit-llvm -o - %s
-// RUN: %clang_cc1 -verify -fsyntax-only -triple amdgcn--opencl -target-cpu verde -S -emit-llvm -o - %s
+// RUN: %clang_cc1 -verify -triple amdgcn -target-cpu verde -emit-llvm -o - %s
+// RUN: %clang_cc1 -verify -triple amdgcn--opencl -target-cpu verde -emit-llvm -o - %s
 // expected-no-diagnostics
 
 // Record lowering was crashing on SI and newer targets, because it
diff --git a/clang/test/Sema/test-wunaligned-access.c b/clang/test/Sema/test-wunaligned-access.c
index 909cda45f489..4680ff4cf19c 100644
--- a/clang/test/Sema/test-wunaligned-access.c
+++ b/clang/test/Sema/test-wunaligned-access.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=armv7-none-none-eabi -verify -Wunaligned-access -S -emit-llvm -o %t
+// RUN: %clang_cc1 %s -triple=armv7-none-none-eabi -verify -Wunaligned-access -emit-llvm -o %t
 // REQUIRES: arm-registered-target
 //
 // This test suite tests the warning triggered by the -Wunaligned-access option.
diff --git a/clang/test/Sema/test-wunaligned-access.cpp b/clang/test/Sema/test-wunaligned-access.cpp
index 33f518310b0b..a27fd9cba85d 100644
--- a/clang/test/Sema/test-wunaligned-access.cpp
+++ b/clang/test/Sema/test-wunaligned-access.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=armv7-none-none-eabi -verify -Wunaligned-access -S -emit-llvm -o %t
+// RUN: %clang_cc1 %s -triple=armv7-none-none-eabi -verify -Wunaligned-access -emit-llvm -o %t
 // REQUIRES: arm-registered-target
 //
 // This test suite tests the warning triggered by the -Wunaligned-access option.
diff --git a/clang/test/Sema/thread_local.c b/clang/test/Sema/thread_local.c
index a0de0aa4e39a..b65f1119c738 100644
--- a/clang/test/Sema/thread_local.c
+++ b/clang/test/Sema/thread_local.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -std=c23 %s -verify
+// UNSUPPORTED: target={{.*}}-zos{{.*}}
 
 // Ensure that thread_local and _Thread_local are synonyms in C23 and both
 // restrict local variables to be explicitly static or extern.
diff --git a/clang/test/Sema/uninit-variables-riscv-vector.c b/clang/test/Sema/uninit-variables-riscv-vector.c
index 91af7514656b..0653c77dc472 100644
--- a/clang/test/Sema/uninit-variables-riscv-vector.c
+++ b/clang/test/Sema/uninit-variables-riscv-vector.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple riscv64 -fsyntax-only -Wuninitialized -fsyntax-only -target-feature +v %s -verify
+// RUN: %clang_cc1 -triple riscv64 -Wuninitialized -fsyntax-only -target-feature +v %s -verify
 
 #pragma clang riscv intrinsic vector
 
diff --git a/clang/test/Sema/uninit-variables-vectors.c b/clang/test/Sema/uninit-variables-vectors.c
index 10a8ecc378f0..942248babdb3 100644
--- a/clang/test/Sema/uninit-variables-vectors.c
+++ b/clang/test/Sema/uninit-variables-vectors.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-apple-darwin10.0.0 -fsyntax-only -Wuninitialized -fsyntax-only %s -verify
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10.0.0 -Wuninitialized -fsyntax-only %s -verify
 
 typedef int __v4si __attribute__((__vector_size__(16)));
 typedef float __m128 __attribute__((__vector_size__(16)));
diff --git a/clang/test/Sema/uninit-variables.c b/clang/test/Sema/uninit-variables.c
index cba8ee7b1998..70a00793fd29 100644
--- a/clang/test/Sema/uninit-variables.c
+++ b/clang/test/Sema/uninit-variables.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -fsyntax-only -Wuninitialized -Wconditional-uninitialized -fsyntax-only -fblocks %s -verify
-// RUN: %clang_cc1 -fsyntax-only -Wuninitialized -Wconditional-uninitialized -ftrivial-auto-var-init=pattern -fsyntax-only -fblocks %s -verify
+// RUN: %clang_cc1 -Wuninitialized -Wconditional-uninitialized -fsyntax-only -fblocks %s -verify
+// RUN: %clang_cc1 -Wuninitialized -Wconditional-uninitialized -ftrivial-auto-var-init=pattern -fsyntax-only -fblocks %s -verify
 
 typedef __typeof(sizeof(int)) size_t;
 void *malloc(size_t);
diff --git a/clang/test/Sema/warn-documentation-almost-trailing.c b/clang/test/Sema/warn-documentation-almost-trailing.c
index 9ff71a3bce36..3eafb1daef2b 100644
--- a/clang/test/Sema/warn-documentation-almost-trailing.c
+++ b/clang/test/Sema/warn-documentation-almost-trailing.c
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -fsyntax-only -Wdocumentation -verify %s
 // RUN: %clang_cc1 -fsyntax-only -Wdocumentation -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
 // RUN: cp %s %t
-// RUN: %clang_cc1 -fsyntax-only -Wdocumentation -fixit %t
+// RUN: %clang_cc1 -Wdocumentation -fixit %t
 // RUN: %clang_cc1 -fsyntax-only -Wdocumentation -Werror %t
 
 struct a {
diff --git a/clang/test/Sema/warn-strncat-size.c b/clang/test/Sema/warn-strncat-size.c
index 215eb0d079da..f343465d65bc 100644
--- a/clang/test/Sema/warn-strncat-size.c
+++ b/clang/test/Sema/warn-strncat-size.c
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -Wstrncat-size -verify -fsyntax-only %s
 // RUN: %clang_cc1 -DUSE_BUILTINS -Wstrncat-size -verify -fsyntax-only %s
-// RUN: %clang_cc1 -fsyntax-only -Wstrncat-size -fixit -x c %s
-// RUN: %clang_cc1 -DUSE_BUILTINS -fsyntax-only -Wstrncat-size -fixit -x c %s
+// RUN: %clang_cc1 -Wstrncat-size -fixit -x c %s
+// RUN: %clang_cc1 -DUSE_BUILTINS -Wstrncat-size -fixit -x c %s
 
 typedef __SIZE_TYPE__ size_t;
 size_t strlen (const char *s);
diff --git a/clang/test/SemaCUDA/call-kernel-from-kernel.cu b/clang/test/SemaCUDA/call-kernel-from-kernel.cu
index 900efcef43b8..5f8832f3cd07 100644
--- a/clang/test/SemaCUDA/call-kernel-from-kernel.cu
+++ b/clang/test/SemaCUDA/call-kernel-from-kernel.cu
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s --std=c++11 -triple nvptx -emit-llvm -o - \
+// RUN: %clang_cc1 %s --std=c++11 -triple nvptx -o - \
 // RUN:   -verify -fcuda-is-device -fsyntax-only -verify-ignore-unexpected=note
 
 #include "Inputs/cuda.h"
diff --git a/clang/test/SemaCUDA/constexpr-variables.cu b/clang/test/SemaCUDA/constexpr-variables.cu
index aa88cbadb73f..1c54714e42f9 100644
--- a/clang/test/SemaCUDA/constexpr-variables.cu
+++ b/clang/test/SemaCUDA/constexpr-variables.cu
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -std=c++14 %s -emit-llvm -o - -triple nvptx64-nvidia-cuda \
+// RUN: %clang_cc1 -std=c++14 %s -triple nvptx64-nvidia-cuda \
 // RUN:   -fcuda-is-device -verify -fsyntax-only
-// RUN: %clang_cc1 -std=c++17 %s -emit-llvm -o - -triple nvptx64-nvidia-cuda \
+// RUN: %clang_cc1 -std=c++17 %s -triple nvptx64-nvidia-cuda \
 // RUN:   -fcuda-is-device -verify -fsyntax-only
-// RUN: %clang_cc1 -std=c++14 %s -emit-llvm -o - \
+// RUN: %clang_cc1 -std=c++14 %s \
 // RUN:   -triple x86_64-unknown-linux-gnu -verify -fsyntax-only
-// RUN: %clang_cc1 -std=c++17 %s -emit-llvm -o - \
+// RUN: %clang_cc1 -std=c++17 %s \
 // RUN:   -triple x86_64-unknown-linux-gnu -verify -fsyntax-only
 #include "Inputs/cuda.h"
 
diff --git a/clang/test/SemaCUDA/fp16-arg-return.cu b/clang/test/SemaCUDA/fp16-arg-return.cu
index 23a9613b18b2..46d543f44445 100644
--- a/clang/test/SemaCUDA/fp16-arg-return.cu
+++ b/clang/test/SemaCUDA/fp16-arg-return.cu
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn-amd-amdhsa -fcuda-is-device -fsyntax-only -verify %s
+// RUN: %clang_cc1 -o - -triple amdgcn-amd-amdhsa -fcuda-is-device -fsyntax-only -verify %s
 
 // expected-no-diagnostics
 
diff --git a/clang/test/SemaCUDA/qualifiers.cu b/clang/test/SemaCUDA/qualifiers.cu
index 4be850586fbf..0097d3900385 100644
--- a/clang/test/SemaCUDA/qualifiers.cu
+++ b/clang/test/SemaCUDA/qualifiers.cu
@@ -5,9 +5,9 @@
 // intentional errors. CC1 failure is expected and must be ignored
 // here. We're interested in what ends up in AST and that's what
 // FileCheck verifies.
-// RUN: not %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -ast-dump %s \
+// RUN: not %clang_cc1 -triple x86_64-unknown-linux-gnu -ast-dump %s \
 // RUN:   | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-HOST
-// RUN: not %clang_cc1 -triple nvptx-unknown-cuda -fsyntax-only -ast-dump -fcuda-is-device %s \
+// RUN: not %clang_cc1 -triple nvptx-unknown-cuda -ast-dump -fcuda-is-device %s \
 // RUN:   | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-DEVICE
 
 #include "Inputs/cuda.h"
diff --git a/clang/test/SemaCUDA/static-device-var.cu b/clang/test/SemaCUDA/static-device-var.cu
index 8027f265266e..42be40aaae9f 100644
--- a/clang/test/SemaCUDA/static-device-var.cu
+++ b/clang/test/SemaCUDA/static-device-var.cu
@@ -2,10 +2,10 @@
 // REQUIRES: amdgpu-registered-target
 
 // RUN: %clang_cc1 -triple nvptx -fcuda-is-device -std=c++11 \
-// RUN:    -emit-llvm -o - %s -fsyntax-only -verify=dev,com
+// RUN:    -o - %s -fsyntax-only -verify=dev,com
 
 // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
-// RUN:    -emit-llvm -o - %s -fsyntax-only -verify=host,com
+// RUN:    -o - %s -fsyntax-only -verify=host,com
 
 // Checks allowed usage of file-scope and function-scope static variables.
 
diff --git a/clang/test/SemaCXX/PR12361.cpp b/clang/test/SemaCXX/PR12361.cpp
new file mode 100644
index 000000000000..95ceb45b7ba0
--- /dev/null
+++ b/clang/test/SemaCXX/PR12361.cpp
@@ -0,0 +1,30 @@
+ // RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+ // RUN: %clang_cc1 -fsyntax-only -verify -std=c++17 %s
+ 
+class D {
+    class E{
+        class F{}; // expected-note{{implicitly declared private here}}
+        friend  void foo(D::E::F& q);
+        };
+    friend  void foo(D::E::F& q); // expected-error{{'F' is a private member of 'D::E'}}
+    };
+
+void foo(D::E::F& q) {}
+
+class D1 {
+    class E1{
+        class F1{}; // expected-note{{implicitly declared private here}}
+        friend  D1::E1::F1 foo1();
+        };
+    friend  D1::E1::F1 foo1(); // expected-error{{'F1' is a private member of 'D1::E1'}}
+    };
+
+D1::E1::F1 foo1() { return D1::E1::F1(); }
+
+class D2 {
+    class E2{
+        class F2{};
+        friend  void foo2();
+        };
+    friend  void foo2(){ D2::E2::F2 c;}
+    };
diff --git a/clang/test/SemaCXX/PR62533.cpp b/clang/test/SemaCXX/PR62533.cpp
index 920ea54d4b00..0753156813f8 100644
--- a/clang/test/SemaCXX/PR62533.cpp
+++ b/clang/test/SemaCXX/PR62533.cpp
@@ -2,7 +2,7 @@
 
 template<typename T>
 struct test {
-  template<typename> using fun_diff = char; // expected-note 2{{class template declared here}}
+  template<typename> using fun_diff = char; // expected-note 2{{type alias template declared here}}
 };
 
 template<typename T, typename V>
diff --git a/clang/test/SemaCXX/attr-non-x86-no_caller_saved_registers.cpp b/clang/test/SemaCXX/attr-non-x86-no_caller_saved_registers.cpp
index e31a16e3400d..00fa5bd7336b 100644
--- a/clang/test/SemaCXX/attr-non-x86-no_caller_saved_registers.cpp
+++ b/clang/test/SemaCXX/attr-non-x86-no_caller_saved_registers.cpp
@@ -1,29 +1,29 @@
-// RUN: %clang_cc1 -std=c++11 -triple armv7-unknown-linux-gnueabi -fsyntax-only -verify %s
-
-struct a {
-  int __attribute__((no_caller_saved_registers)) b;                     // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
-  static void foo(int *a) __attribute__((no_caller_saved_registers)) {} // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
-};
-
-struct a test __attribute__((no_caller_saved_registers)); // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
-
-__attribute__((no_caller_saved_registers(999))) void bar(int *) {} // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
-
-__attribute__((no_caller_saved_registers)) void foo(int *){} // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
-
-[[gnu::no_caller_saved_registers]] void foo2(int *) {} // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
-
-typedef __attribute__((no_caller_saved_registers)) void (*foo3)(int *); // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
-
-typedef void (*foo5)(int *);
-
-int (*foo4)(double a, __attribute__((no_caller_saved_registers)) float b); // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
-
-int main(int argc, char **argv) {
-  void (*fp)(int *) = foo;
-  a::foo(&argc);
-  foo3 func = foo2;
-  func(&argc);
-  foo5 __attribute__((no_caller_saved_registers)) func2 = foo2; // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
-  return 0;
-}
+// RUN: %clang_cc1 -std=c++11 -triple armv7-unknown-linux-gnueabi -fsyntax-only -verify %s
+
+struct a {
+  int __attribute__((no_caller_saved_registers)) b;                     // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
+  static void foo(int *a) __attribute__((no_caller_saved_registers)) {} // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
+};
+
+struct a test __attribute__((no_caller_saved_registers)); // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
+
+__attribute__((no_caller_saved_registers(999))) void bar(int *) {} // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
+
+__attribute__((no_caller_saved_registers)) void foo(int *){} // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
+
+[[gnu::no_caller_saved_registers]] void foo2(int *) {} // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
+
+typedef __attribute__((no_caller_saved_registers)) void (*foo3)(int *); // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
+
+typedef void (*foo5)(int *);
+
+int (*foo4)(double a, __attribute__((no_caller_saved_registers)) float b); // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
+
+int main(int argc, char **argv) {
+  void (*fp)(int *) = foo;
+  a::foo(&argc);
+  foo3 func = foo2;
+  func(&argc);
+  foo5 __attribute__((no_caller_saved_registers)) func2 = foo2; // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
+  return 0;
+}
diff --git a/clang/test/SemaCXX/attr-x86-no_caller_saved_registers.cpp b/clang/test/SemaCXX/attr-x86-no_caller_saved_registers.cpp
index 55500519c49e..b88f38ff40f6 100644
--- a/clang/test/SemaCXX/attr-x86-no_caller_saved_registers.cpp
+++ b/clang/test/SemaCXX/attr-x86-no_caller_saved_registers.cpp
@@ -1,33 +1,33 @@
-// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
-
-struct a {
-  int b __attribute__((no_caller_saved_registers)); // expected-warning {{'no_caller_saved_registers' only applies to function types; type here is 'int'}}
-  static void foo(int *a) __attribute__((no_caller_saved_registers)) {}
-};
-
-struct a test __attribute__((no_caller_saved_registers)); // expected-warning {{'no_caller_saved_registers' only applies to function types; type here is 'struct a'}}
-
-__attribute__((no_caller_saved_registers(999))) void bar(int *) {} // expected-error {{'no_caller_saved_registers' attribute takes no arguments}}
-
-void __attribute__((no_caller_saved_registers)) foo(int *){}
-
-[[gnu::no_caller_saved_registers]] void foo2(int *) {}
-
-typedef __attribute__((no_caller_saved_registers)) void (*foo3)(int *);
-
-int (*foo4)(double a, __attribute__((no_caller_saved_registers)) float b); // expected-warning {{'no_caller_saved_registers' only applies to function types; type here is 'float'}}
-
-typedef void (*foo5)(int *);
-
-void foo6(){} // expected-note {{previous declaration is here}}
-
-void __attribute__((no_caller_saved_registers)) foo6(); // expected-error {{function declared with 'no_caller_saved_registers' attribute was previously declared without the 'no_caller_saved_registers' attribute}} 
-
-int main(int argc, char **argv) {
-  void (*fp)(int *) = foo; // expected-error {{cannot initialize a variable of type 'void (*)(int *)' with an lvalue of type 'void (int *) __attribute__((no_caller_saved_registers))'}} 
-  a::foo(&argc);
-  foo3 func = foo2;
-  func(&argc);
-  foo5 __attribute__((no_caller_saved_registers)) func2 = foo2;
-  return 0;
-}
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
+
+struct a {
+  int b __attribute__((no_caller_saved_registers)); // expected-warning {{'no_caller_saved_registers' only applies to function types; type here is 'int'}}
+  static void foo(int *a) __attribute__((no_caller_saved_registers)) {}
+};
+
+struct a test __attribute__((no_caller_saved_registers)); // expected-warning {{'no_caller_saved_registers' only applies to function types; type here is 'struct a'}}
+
+__attribute__((no_caller_saved_registers(999))) void bar(int *) {} // expected-error {{'no_caller_saved_registers' attribute takes no arguments}}
+
+void __attribute__((no_caller_saved_registers)) foo(int *){}
+
+[[gnu::no_caller_saved_registers]] void foo2(int *) {}
+
+typedef __attribute__((no_caller_saved_registers)) void (*foo3)(int *);
+
+int (*foo4)(double a, __attribute__((no_caller_saved_registers)) float b); // expected-warning {{'no_caller_saved_registers' only applies to function types; type here is 'float'}}
+
+typedef void (*foo5)(int *);
+
+void foo6(){} // expected-note {{previous declaration is here}}
+
+void __attribute__((no_caller_saved_registers)) foo6(); // expected-error {{function declared with 'no_caller_saved_registers' attribute was previously declared without the 'no_caller_saved_registers' attribute}}
+
+int main(int argc, char **argv) {
+  void (*fp)(int *) = foo; // expected-error {{cannot initialize a variable of type 'void (*)(int *)' with an lvalue of type 'void (int *) __attribute__((no_caller_saved_registers))'}}
+  a::foo(&argc);
+  foo3 func = foo2;
+  func(&argc);
+  foo5 __attribute__((no_caller_saved_registers)) func2 = foo2;
+  return 0;
+}
diff --git a/clang/test/SemaCXX/builtins-elementwise-math.cpp b/clang/test/SemaCXX/builtins-elementwise-math.cpp
index 44a44ab055e9..499f2795ddb2 100644
--- a/clang/test/SemaCXX/builtins-elementwise-math.cpp
+++ b/clang/test/SemaCXX/builtins-elementwise-math.cpp
@@ -111,6 +111,13 @@ void test_builtin_elementwise_sin() {
   static_assert(!is_const<decltype(__builtin_elementwise_sin(b))>::value);
 }
 
+void test_builtin_elementwise_tan() {
+  const float a = 42.0;
+  float b = 42.3;
+  static_assert(!is_const<decltype(__builtin_elementwise_tan(a))>::value);
+  static_assert(!is_const<decltype(__builtin_elementwise_tan(b))>::value);
+}
+
 void test_builtin_elementwise_sqrt() {
   const float a = 42.0;
   float b = 42.3;
diff --git a/clang/test/SemaCXX/co_await-ast.cpp b/clang/test/SemaCXX/co_await-ast.cpp
index 10cee21da0e8..f792a2cb7b1a 100644
--- a/clang/test/SemaCXX/co_await-ast.cpp
+++ b/clang/test/SemaCXX/co_await-ast.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -std=c++20 -fsyntax-only -ast-dump -ast-dump-filter=foo %s | FileCheck %s --strict-whitespace
-// RUN: %clang_cc1 -std=c++20 -triple i386-windows-pc -fsyntax-only -ast-dump -ast-dump-filter=foo %s | FileCheck %s --strict-whitespace
+// RUN: %clang_cc1 -std=c++20 -ast-dump -ast-dump-filter=foo %s | FileCheck %s --strict-whitespace
+// RUN: %clang_cc1 -std=c++20 -triple i386-windows-pc -ast-dump -ast-dump-filter=foo %s | FileCheck %s --strict-whitespace
 
 namespace std {
 template <typename, typename...> struct coroutine_traits;
diff --git a/clang/test/SemaCXX/compound-literal.cpp b/clang/test/SemaCXX/compound-literal.cpp
index a3d3b9faa9fe..a62e4f79b5a0 100644
--- a/clang/test/SemaCXX/compound-literal.cpp
+++ b/clang/test/SemaCXX/compound-literal.cpp
@@ -1,131 +1,131 @@
-// RUN: %clang_cc1 -fsyntax-only -std=c++03 -verify -ast-dump %s > %t-03
-// RUN: FileCheck --input-file=%t-03 %s
-// RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify -ast-dump %s > %t-11
-// RUN: FileCheck --input-file=%t-11 %s
-// RUN: FileCheck --input-file=%t-11 %s --check-prefix=CHECK-CXX11
-// RUN: %clang_cc1 -verify -std=c++17 %s
-
-// http://llvm.org/PR7905
-namespace PR7905 {
-struct S; // expected-note {{forward declaration}}
-void foo1() {
-  (void)(S[]) {{3}}; // expected-error {{array has incomplete element type}}
-}
-
-template <typename T> struct M { T m; };
-void foo2() {
-  (void)(M<short> []) {{3}};
-}
-}
-
-// Check compound literals mixed with C++11 list-initialization.
-namespace brace_initializers {
-  struct POD {
-    int x, y;
-  };
-  struct HasCtor {
-    HasCtor(int x, int y);
-  };
-  struct HasDtor {
-    int x, y;
-    ~HasDtor();
-  };
-  struct HasCtorDtor {
-    HasCtorDtor(int x, int y);
-    ~HasCtorDtor();
-  };
-
-  POD p = (POD){1, 2};
-  // CHECK-NOT: CXXBindTemporaryExpr {{.*}} 'brace_initializers::POD'
-  // CHECK: CompoundLiteralExpr {{.*}} 'POD':'brace_initializers::POD'
-  // CHECK-NEXT: InitListExpr {{.*}} 'POD':'brace_initializers::POD'
-  // CHECK-NEXT: ConstantExpr {{.*}}
-  // CHECK-NEXT: IntegerLiteral {{.*}} 1{{$}}
-  // CHECK-NEXT: ConstantExpr {{.*}}
-  // CHECK-NEXT: IntegerLiteral {{.*}} 2{{$}}
-
-  void test() {
-    (void)(POD){1, 2};
-    // CHECK-NOT: CXXBindTemporaryExpr {{.*}} 'POD':'brace_initializers::POD'
-    // CHECK-NOT: ConstantExpr {{.*}} 'POD':'brace_initializers::POD'
-    // CHECK: CompoundLiteralExpr {{.*}} 'POD':'brace_initializers::POD'
-    // CHECK-NEXT: InitListExpr {{.*}} 'POD':'brace_initializers::POD'
-    // CHECK-NEXT: IntegerLiteral {{.*}} 1{{$}}
-    // CHECK-NEXT: IntegerLiteral {{.*}} 2{{$}}
-
-    (void)(HasDtor){1, 2};
-    // CHECK: CXXBindTemporaryExpr {{.*}} 'HasDtor':'brace_initializers::HasDtor'
-    // CHECK-NEXT: CompoundLiteralExpr {{.*}} 'HasDtor':'brace_initializers::HasDtor'
-    // CHECK-NEXT: InitListExpr {{.*}} 'HasDtor':'brace_initializers::HasDtor'
-    // CHECK-NEXT: IntegerLiteral {{.*}} 1{{$}}
-    // CHECK-NEXT: IntegerLiteral {{.*}} 2{{$}}
-
-#if __cplusplus >= 201103L
-    (void)(HasCtor){1, 2};
-    // CHECK-CXX11-NOT: CXXBindTemporaryExpr {{.*}} 'HasCtor':'brace_initializers::HasCtor'
-    // CHECK-CXX11-NOT: ConstantExpr {{.*}} 'HasCtor':'brace_initializers::HasCtor'
-    // CHECK-CXX11: CompoundLiteralExpr {{.*}} 'HasCtor':'brace_initializers::HasCtor'
-    // CHECK-CXX11-NEXT: CXXTemporaryObjectExpr {{.*}} 'HasCtor':'brace_initializers::HasCtor'
-    // CHECK-CXX11-NEXT: IntegerLiteral {{.*}} 1{{$}}
-    // CHECK-CXX11-NEXT: IntegerLiteral {{.*}} 2{{$}}
-
-    (void)(HasCtorDtor){1, 2};
-    // CHECK-CXX11: CXXBindTemporaryExpr {{.*}} 'HasCtorDtor':'brace_initializers::HasCtorDtor'
-    // CHECK-CXX11-NOT: ConstantExpr {{.*}} 'HasCtorDtor':'brace_initializers::HasCtorDtor'
-    // CHECK-CXX11: CompoundLiteralExpr {{.*}} 'HasCtorDtor':'brace_initializers::HasCtorDtor'
-    // CHECK-CXX11-NEXT: CXXTemporaryObjectExpr {{.*}} 'HasCtorDtor':'brace_initializers::HasCtorDtor'
-    // CHECK-CXX11-NEXT: IntegerLiteral {{.*}} 1{{$}}
-    // CHECK-CXX11-NEXT: IntegerLiteral {{.*}} 2{{$}}
-#endif
-  }
-
-  struct PrivateDtor {
-    int x, y;
-  private:
-    ~PrivateDtor(); // expected-note {{declared private here}}
-  };
-
-  void testPrivateDtor() {
-    (void)(PrivateDtor){1, 2}; // expected-error {{temporary of type 'PrivateDtor' has private destructor}}
-  }
-}
-
-// This doesn't necessarily need to be an error, but CodeGen can't handle it
-// at the moment.
-int PR17415 = (int){PR17415}; // expected-error {{initializer element is not a compile-time constant}}
-
-// Make sure we accept this.  (Not sure if we actually should... but we do
-// at the moment.)
-template<unsigned> struct Value { };
-template<typename T>
-int &check_narrowed(Value<sizeof((T){1.1})>);
-
-#if __cplusplus >= 201103L
-// Compound literals in global lambdas have automatic storage duration
-// and are not subject to the constant-initialization rules.
-int computed_with_lambda = [] {
-  int x = 5;
-  int result = ((int[]) { x, x + 2, x + 4, x + 6 })[0];
-  return result;
-}();
-#endif
-
-namespace DynamicFileScopeLiteral {
-// This covers the case where we have a file-scope compound literal with a
-// non-constant initializer in C++. Previously, we had a bug where Clang forgot
-// to consider initializer list elements for bases.
-struct Empty {};
-struct Foo : Empty { // expected-note 0+ {{candidate constructor}}
-  int x;
-  int y;
-};
-int f();
-#if __cplusplus < 201103L
-// expected-error@+6 {{non-aggregate type 'Foo' cannot be initialized with an initializer list}}
-#elif __cplusplus < 201703L
-// expected-error@+4 {{no matching constructor}}
-#else
-// expected-error@+2 {{initializer element is not a compile-time constant}}
-#endif
-Foo o = (Foo){ {}, 1, f() };
-}
+// RUN: %clang_cc1 -std=c++03 -verify -ast-dump %s > %t-03
+// RUN: FileCheck --input-file=%t-03 %s
+// RUN: %clang_cc1 -std=c++11 -verify -ast-dump %s > %t-11
+// RUN: FileCheck --input-file=%t-11 %s
+// RUN: FileCheck --input-file=%t-11 %s --check-prefix=CHECK-CXX11
+// RUN: %clang_cc1 -verify -std=c++17 %s
+
+// http://llvm.org/PR7905
+namespace PR7905 {
+struct S; // expected-note {{forward declaration}}
+void foo1() {
+  (void)(S[]) {{3}}; // expected-error {{array has incomplete element type}}
+}
+
+template <typename T> struct M { T m; };
+void foo2() {
+  (void)(M<short> []) {{3}};
+}
+}
+
+// Check compound literals mixed with C++11 list-initialization.
+namespace brace_initializers {
+  struct POD {
+    int x, y;
+  };
+  struct HasCtor {
+    HasCtor(int x, int y);
+  };
+  struct HasDtor {
+    int x, y;
+    ~HasDtor();
+  };
+  struct HasCtorDtor {
+    HasCtorDtor(int x, int y);
+    ~HasCtorDtor();
+  };
+
+  POD p = (POD){1, 2};
+  // CHECK-NOT: CXXBindTemporaryExpr {{.*}} 'brace_initializers::POD'
+  // CHECK: CompoundLiteralExpr {{.*}} 'POD':'brace_initializers::POD'
+  // CHECK-NEXT: InitListExpr {{.*}} 'POD':'brace_initializers::POD'
+  // CHECK-NEXT: ConstantExpr {{.*}}
+  // CHECK-NEXT: IntegerLiteral {{.*}} 1{{$}}
+  // CHECK-NEXT: ConstantExpr {{.*}}
+  // CHECK-NEXT: IntegerLiteral {{.*}} 2{{$}}
+
+  void test() {
+    (void)(POD){1, 2};
+    // CHECK-NOT: CXXBindTemporaryExpr {{.*}} 'POD':'brace_initializers::POD'
+    // CHECK-NOT: ConstantExpr {{.*}} 'POD':'brace_initializers::POD'
+    // CHECK: CompoundLiteralExpr {{.*}} 'POD':'brace_initializers::POD'
+    // CHECK-NEXT: InitListExpr {{.*}} 'POD':'brace_initializers::POD'
+    // CHECK-NEXT: IntegerLiteral {{.*}} 1{{$}}
+    // CHECK-NEXT: IntegerLiteral {{.*}} 2{{$}}
+
+    (void)(HasDtor){1, 2};
+    // CHECK: CXXBindTemporaryExpr {{.*}} 'HasDtor':'brace_initializers::HasDtor'
+    // CHECK-NEXT: CompoundLiteralExpr {{.*}} 'HasDtor':'brace_initializers::HasDtor'
+    // CHECK-NEXT: InitListExpr {{.*}} 'HasDtor':'brace_initializers::HasDtor'
+    // CHECK-NEXT: IntegerLiteral {{.*}} 1{{$}}
+    // CHECK-NEXT: IntegerLiteral {{.*}} 2{{$}}
+
+#if __cplusplus >= 201103L
+    (void)(HasCtor){1, 2};
+    // CHECK-CXX11-NOT: CXXBindTemporaryExpr {{.*}} 'HasCtor':'brace_initializers::HasCtor'
+    // CHECK-CXX11-NOT: ConstantExpr {{.*}} 'HasCtor':'brace_initializers::HasCtor'
+    // CHECK-CXX11: CompoundLiteralExpr {{.*}} 'HasCtor':'brace_initializers::HasCtor'
+    // CHECK-CXX11-NEXT: CXXTemporaryObjectExpr {{.*}} 'HasCtor':'brace_initializers::HasCtor'
+    // CHECK-CXX11-NEXT: IntegerLiteral {{.*}} 1{{$}}
+    // CHECK-CXX11-NEXT: IntegerLiteral {{.*}} 2{{$}}
+
+    (void)(HasCtorDtor){1, 2};
+    // CHECK-CXX11: CXXBindTemporaryExpr {{.*}} 'HasCtorDtor':'brace_initializers::HasCtorDtor'
+    // CHECK-CXX11-NOT: ConstantExpr {{.*}} 'HasCtorDtor':'brace_initializers::HasCtorDtor'
+    // CHECK-CXX11: CompoundLiteralExpr {{.*}} 'HasCtorDtor':'brace_initializers::HasCtorDtor'
+    // CHECK-CXX11-NEXT: CXXTemporaryObjectExpr {{.*}} 'HasCtorDtor':'brace_initializers::HasCtorDtor'
+    // CHECK-CXX11-NEXT: IntegerLiteral {{.*}} 1{{$}}
+    // CHECK-CXX11-NEXT: IntegerLiteral {{.*}} 2{{$}}
+#endif
+  }
+
+  struct PrivateDtor {
+    int x, y;
+  private:
+    ~PrivateDtor(); // expected-note {{declared private here}}
+  };
+
+  void testPrivateDtor() {
+    (void)(PrivateDtor){1, 2}; // expected-error {{temporary of type 'PrivateDtor' has private destructor}}
+  }
+}
+
+// This doesn't necessarily need to be an error, but CodeGen can't handle it
+// at the moment.
+int PR17415 = (int){PR17415}; // expected-error {{initializer element is not a compile-time constant}}
+
+// Make sure we accept this.  (Not sure if we actually should... but we do
+// at the moment.)
+template<unsigned> struct Value { };
+template<typename T>
+int &check_narrowed(Value<sizeof((T){1.1})>);
+
+#if __cplusplus >= 201103L
+// Compound literals in global lambdas have automatic storage duration
+// and are not subject to the constant-initialization rules.
+int computed_with_lambda = [] {
+  int x = 5;
+  int result = ((int[]) { x, x + 2, x + 4, x + 6 })[0];
+  return result;
+}();
+#endif
+
+namespace DynamicFileScopeLiteral {
+// This covers the case where we have a file-scope compound literal with a
+// non-constant initializer in C++. Previously, we had a bug where Clang forgot
+// to consider initializer list elements for bases.
+struct Empty {};
+struct Foo : Empty { // expected-note 0+ {{candidate constructor}}
+  int x;
+  int y;
+};
+int f();
+#if __cplusplus < 201103L
+// expected-error@+6 {{non-aggregate type 'Foo' cannot be initialized with an initializer list}}
+#elif __cplusplus < 201703L
+// expected-error@+4 {{no matching constructor}}
+#else
+// expected-error@+2 {{initializer element is not a compile-time constant}}
+#endif
+Foo o = (Foo){ {}, 1, f() };
+}
diff --git a/clang/test/SemaCXX/constexpr-default-arg.cpp b/clang/test/SemaCXX/constexpr-default-arg.cpp
index 7c8836928295..901123bfb359 100644
--- a/clang/test/SemaCXX/constexpr-default-arg.cpp
+++ b/clang/test/SemaCXX/constexpr-default-arg.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -std=c++1y -S -o - -emit-llvm -verify %s
-// RUN: %clang_cc1 -std=c++1y -fexperimental-new-constant-interpreter -S -o - -emit-llvm -verify %s
+// RUN: %clang_cc1 -std=c++1y -o - -emit-llvm -verify %s
+// RUN: %clang_cc1 -std=c++1y -fexperimental-new-constant-interpreter -o - -emit-llvm -verify %s
 
 namespace default_arg_temporary {
 
@@ -32,8 +32,8 @@ void test_default_arg2() {
 }
 
 // Check that multiple CXXDefaultInitExprs don't cause an assertion failure.
-struct A { int &&r = 0; }; // expected-note 2{{default member initializer}}
+struct A { int &&r = 0; };
 struct B { A x, y; };
-B b = {}; // expected-warning 2{{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported}}
+B b = {}; // expected-no-diagnostics
 
 }
diff --git a/clang/test/SemaCXX/cxx1y-generic-lambdas-capturing.cpp b/clang/test/SemaCXX/cxx1y-generic-lambdas-capturing.cpp
index dcc964cd60b3..b234c541a203 100644
--- a/clang/test/SemaCXX/cxx1y-generic-lambdas-capturing.cpp
+++ b/clang/test/SemaCXX/cxx1y-generic-lambdas-capturing.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only -fblocks -emit-llvm-only %s
-// RUN: %clang_cc1 -std=c++2a -verify -verify=expected-cxx2a -fsyntax-only -fblocks -emit-llvm-only -Wno-deprecated-this-capture %s
-// RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only -fblocks -emit-llvm-only -triple i386-windows-pc %s
-// RUN: %clang_cc1 -std=c++2a -verify -verify=expected-cxx2a -fsyntax-only -fblocks -emit-llvm-only -triple i386-windows-pc -Wno-deprecated-this-capture %s
+// RUN: %clang_cc1 -std=c++1y -verify -fblocks -emit-llvm-only %s
+// RUN: %clang_cc1 -std=c++2a -verify -verify=expected-cxx2a -fblocks -emit-llvm-only -Wno-deprecated-this-capture %s
+// RUN: %clang_cc1 -std=c++1y -verify -fblocks -emit-llvm-only -triple i386-windows-pc %s
+// RUN: %clang_cc1 -std=c++2a -verify -verify=expected-cxx2a -fblocks -emit-llvm-only -triple i386-windows-pc -Wno-deprecated-this-capture %s
 // DONTRUNYET: %clang_cc1 -std=c++1y -verify -fsyntax-only -fblocks -fdelayed-template-parsing %s -DDELAYED_TEMPLATE_PARSING
 // DONTRUNYET: %clang_cc1 -std=c++1y -verify -fsyntax-only -fblocks -fms-extensions %s -DMS_EXTENSIONS
 // DONTRUNYET: %clang_cc1 -std=c++1y -verify -fsyntax-only -fblocks -fdelayed-template-parsing -fms-extensions %s -DMS_EXTENSIONS -DDELAYED_TEMPLATE_PARSING
diff --git a/clang/test/SemaCXX/cxx1y-generic-lambdas.cpp b/clang/test/SemaCXX/cxx1y-generic-lambdas.cpp
index 22205db6984f..22765542b1aa 100644
--- a/clang/test/SemaCXX/cxx1y-generic-lambdas.cpp
+++ b/clang/test/SemaCXX/cxx1y-generic-lambdas.cpp
@@ -1,8 +1,8 @@
-// RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only -fblocks -emit-llvm-only %s
+// RUN: %clang_cc1 -std=c++1y -verify -fblocks -emit-llvm-only %s
 // RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only -fblocks -fdelayed-template-parsing %s -DDELAYED_TEMPLATE_PARSING
 // RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only -fblocks -fms-extensions %s -DMS_EXTENSIONS
 // RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only -fblocks -fdelayed-template-parsing -fms-extensions %s -DMS_EXTENSIONS -DDELAYED_TEMPLATE_PARSING
-// RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only -fblocks -triple i386-windows-pc -emit-llvm-only %s
+// RUN: %clang_cc1 -std=c++1y -verify -fblocks -triple i386-windows-pc -emit-llvm-only %s
 // RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only -fblocks -triple i386-windows-pc -fdelayed-template-parsing %s -DDELAYED_TEMPLATE_PARSING
 // RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only -fblocks -triple i386-windows-pc -fms-extensions %s -DMS_EXTENSIONS
 // RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only -fblocks -triple i386-windows-pc -fdelayed-template-parsing -fms-extensions %s -DMS_EXTENSIONS -DDELAYED_TEMPLATE_PARSING
diff --git a/clang/test/SemaCXX/cxx1z-lambda-star-this.cpp b/clang/test/SemaCXX/cxx1z-lambda-star-this.cpp
index 45b78139d0b0..1edeb89da171 100644
--- a/clang/test/SemaCXX/cxx1z-lambda-star-this.cpp
+++ b/clang/test/SemaCXX/cxx1z-lambda-star-this.cpp
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -emit-llvm-only %s
+// RUN: %clang_cc1 -std=c++1z -verify -fblocks -emit-llvm-only %s
 // RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing %s -DDELAYED_TEMPLATE_PARSING
 // RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fms-extensions %s -DMS_EXTENSIONS
 // RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing -fms-extensions %s -DMS_EXTENSIONS -DDELAYED_TEMPLATE_PARSING
 
-// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -emit-llvm-only %s -fexperimental-new-constant-interpreter
+// RUN: %clang_cc1 -std=c++1z -verify -fblocks -emit-llvm-only %s -fexperimental-new-constant-interpreter
 // RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing %s -DDELAYED_TEMPLATE_PARSING -fexperimental-new-constant-interpreter
 // RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fms-extensions %s -DMS_EXTENSIONS -fexperimental-new-constant-interpreter
 // RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing -fms-extensions %s -DMS_EXTENSIONS -DDELAYED_TEMPLATE_PARSING -fexperimental-new-constant-interpreter
diff --git a/clang/test/SemaCXX/cxx1z-noexcept-function-type.cpp b/clang/test/SemaCXX/cxx1z-noexcept-function-type.cpp
index 5e56f19477d6..c8204c21523a 100644
--- a/clang/test/SemaCXX/cxx1z-noexcept-function-type.cpp
+++ b/clang/test/SemaCXX/cxx1z-noexcept-function-type.cpp
@@ -18,7 +18,7 @@ template<typename A, typename B> void redecl3() throw(B); // expected-error {{do
 
 typedef int I;
 template<bool B> void redecl4(I) noexcept(B);
-template<bool B> void redecl4(I) noexcept(B); // expected-note {{could not match 'void (I) noexcept(false)' (aka 'void (int) noexcept(false)') against 'void (int) noexcept'}}
+template<bool B> void redecl4(I) noexcept(B);
 
 void (*init_with_exact_type_a)(int) noexcept = redecl4<true>;
 void (*init_with_mismatched_type_a)(int) = redecl4<true>;
@@ -27,7 +27,7 @@ using DeducedType_a = decltype(deduce_auto_from_noexcept_function_ptr_a);
 using DeducedType_a = void (*)(int) noexcept;
 
 void (*init_with_exact_type_b)(int) = redecl4<false>;
-void (*init_with_mismatched_type_b)(int) noexcept = redecl4<false>; // expected-error {{does not match required type}}
+void (*init_with_mismatched_type_b)(int) noexcept = redecl4<false>; // expected-error {{cannot initialize a variable of type}}
 auto deduce_auto_from_noexcept_function_ptr_b = redecl4<false>;
 using DeducedType_b = decltype(deduce_auto_from_noexcept_function_ptr_b);
 using DeducedType_b = void (*)(int);
diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
index 508a3a5da76a..4c5595e409f2 100644
--- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
+++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
@@ -307,3 +307,85 @@ using AFoo = Foo<int, Derived<U>>;
 
 AFoo a(Derived<int>{});
 } // namespace test22
+
+namespace test23 {
+// We have an aggregate deduction guide "G(T) -> G<T>".
+template<typename T>
+struct G { T t1; };
+
+template<typename X = int>
+using AG = G<int>;
+
+AG ag(1.0);
+// Verify that the aggregate deduction guide "AG(int) -> AG<int>" is built and
+// choosen.
+static_assert(__is_same(decltype(ag.t1), int));
+} // namespace test23
+
+// GH90177
+// verify that the transformed require-clause of the alias deduction gudie has
+// the right depth info.
+namespace test24 {
+class Forward;
+class Key {};
+
+template <typename D>
+constexpr bool C = sizeof(D);
+
+// Case1: the alias template and the underlying deduction guide are in the same
+// scope.
+template <typename T>
+struct Case1 {
+  template <typename U>
+  struct Foo {
+    Foo(U);
+  };
+
+  template <typename V>
+  requires (C<V>)
+  Foo(V) -> Foo<V>;
+
+  template <typename Y>
+  using Alias = Foo<Y>;
+};
+// The require-clause should be evaluated on the type Key.
+Case1<Forward>::Alias t2 = Key();
+
+
+// Case2: the alias template and underlying deduction guide are in different
+// scope.
+template <typename T>
+struct Foo {
+  Foo(T);
+};
+template <typename U>
+requires (C<U>)
+Foo(U) -> Foo<U>;
+
+template <typename T>
+struct Case2 {
+  template <typename Y>
+  using Alias = Foo<Y>;
+};
+// The require-caluse should be evaluated on the type Key.
+Case2<Forward>::Alias t1 = Key();
+
+// Case3: crashes on the constexpr evaluator due to the mixed-up depth in
+// require-expr.
+template <class T1>
+struct A1 {
+  template<class T2>
+  struct A2 {
+    template <class T3>
+    struct Foo {
+      Foo(T3);
+    };
+    template <class T3>
+    requires C<T3>
+    Foo(T3) -> Foo<T3>;
+  };
+};
+template <typename U>
+using AFoo = A1<int>::A2<int>::Foo<U>;
+AFoo case3(1);
+} // namespace test24
diff --git a/clang/test/SemaCXX/cxx20-lambda-decltype-this.cpp b/clang/test/SemaCXX/cxx20-lambda-decltype-this.cpp
index 161a2bcb25d7..4f66ec415b72 100644
--- a/clang/test/SemaCXX/cxx20-lambda-decltype-this.cpp
+++ b/clang/test/SemaCXX/cxx20-lambda-decltype-this.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++2a -fsyntax-only -emit-llvm-only %s
+// RUN: %clang_cc1 -std=c++2a -emit-llvm-only %s
 // RUN: %clang_cc1 -std=c++2a -fsyntax-only -fdelayed-template-parsing %s
 // RUN: %clang_cc1 -std=c++2a -fsyntax-only -fms-extensions %s
 // RUN: %clang_cc1 -std=c++2a -fsyntax-only -fdelayed-template-parsing -fms-extensions %s
diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp
index 8676970de14f..e67d72ae0a99 100644
--- a/clang/test/SemaCXX/cxx23-assume.cpp
+++ b/clang/test/SemaCXX/cxx23-assume.cpp
@@ -138,3 +138,8 @@ constexpr int foo() {
 }
 
 static_assert(foo() == 0);
+
+template <bool ...val>
+void f() {
+    [[assume(val)]]; // expected-error {{expression contains unexpanded parameter pack}}
+}
diff --git a/clang/test/SemaCXX/cxx23-static-callop-lambda-expression.cpp b/clang/test/SemaCXX/cxx23-static-callop-lambda-expression.cpp
index 2b89e7a3a712..84fac9edad10 100644
--- a/clang/test/SemaCXX/cxx23-static-callop-lambda-expression.cpp
+++ b/clang/test/SemaCXX/cxx23-static-callop-lambda-expression.cpp
@@ -1,33 +1,33 @@
-// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s
-// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter
-
-namespace ns1 {
-  auto lstatic = []() static { return 3; }; 
-  int (*f2)(void) = lstatic;   
-    
-}
-
-namespace ns1_1 {
-  
-  auto lstatic = []() static consteval  //expected-error{{cannot take address of consteval call}} \
-                                          expected-note {{declared here}} 
-  { return 3; };   
-  
-  // FIXME: the above error should indicate that it was triggered below.
-  int (*f2)(void) = lstatic;   
-    
-}
-
-
-namespace ns2 {
-  auto lstatic = []() static { return 3; }; 
-  constexpr int (*f2)(void) = lstatic;                              
-  static_assert(lstatic() == f2());
-}
-
-namespace ns3 {
-  void main() {
-    static int x = 10;
-    auto L = []() static { return x; };
-  }
-}
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter
+
+namespace ns1 {
+  auto lstatic = []() static { return 3; };
+  int (*f2)(void) = lstatic;
+
+}
+
+namespace ns1_1 {
+
+  auto lstatic = []() static consteval  //expected-error{{cannot take address of consteval call}} \
+                                          expected-note {{declared here}}
+  { return 3; };
+
+  // FIXME: the above error should indicate that it was triggered below.
+  int (*f2)(void) = lstatic;
+
+}
+
+
+namespace ns2 {
+  auto lstatic = []() static { return 3; };
+  constexpr int (*f2)(void) = lstatic;
+  static_assert(lstatic() == f2());
+}
+
+namespace ns3 {
+  void main() {
+    static int x = 10;
+    auto L = []() static { return x; };
+  }
+}
diff --git a/clang/test/SemaCXX/cxx2a-consteval.cpp b/clang/test/SemaCXX/cxx2a-consteval.cpp
index e19807437207..622ec31c459d 100644
--- a/clang/test/SemaCXX/cxx2a-consteval.cpp
+++ b/clang/test/SemaCXX/cxx2a-consteval.cpp
@@ -1068,6 +1068,14 @@ void test() {
   constexpr int (*f2)(void) = lstatic; // expected-error {{constexpr variable 'f2' must be initialized by a constant expression}} \
                                        // expected-note  {{pointer to a consteval declaration is not a constant expression}}
 
+  int (*f3)(void) = []() consteval { return 3; };  // expected-error {{cannot take address of consteval call operator of '(lambda at}} \
+                                                   // expected-note {{declared here}}
+}
+
+consteval void consteval_test() {
+  constexpr auto l1 = []() consteval { return 3; };
+
+  int (*f1)(void) = l1;  // ok
 }
 }
 
@@ -1098,11 +1106,11 @@ int bad = 10; // expected-note 6{{declared here}}
 tester glob1(make_name("glob1"));
 tester glob2(make_name("glob2"));
 constexpr tester cglob(make_name("cglob"));
-tester paddedglob(make_name(pad(bad))); // expected-error {{call to consteval function 'GH58207::make_name' is not a constant expression}} \
+tester paddedglob(make_name(pad(bad))); // expected-error {{call to consteval function 'GH58207::tester::tester' is not a constant expression}} \
                                         // expected-note {{read of non-const variable 'bad' is not allowed in a constant expression}}
 
 constexpr tester glob3 = { make_name("glob3") };
-constexpr tester glob4 = { make_name(pad(bad)) }; // expected-error {{call to consteval function 'GH58207::make_name' is not a constant expression}} \
+constexpr tester glob4 = { make_name(pad(bad)) }; // expected-error {{call to consteval function 'GH58207::tester::tester' is not a constant expression}} \
                                                   // expected-error {{constexpr variable 'glob4' must be initialized by a constant expression}} \
                                                   // expected-note 2{{read of non-const variable 'bad' is not allowed in a constant expression}}
 
@@ -1114,12 +1122,12 @@ auto V1 = make_name(pad(bad)); // expected-error {{call to consteval function 'G
 void foo() {
   static tester loc1(make_name("loc1"));
   static constexpr tester loc2(make_name("loc2"));
-  static tester paddedloc(make_name(pad(bad))); // expected-error {{call to consteval function 'GH58207::make_name' is not a constant expression}} \
+  static tester paddedloc(make_name(pad(bad))); // expected-error {{call to consteval function 'GH58207::tester::tester' is not a constant expression}} \
                                                 // expected-note {{read of non-const variable 'bad' is not allowed in a constant expression}}
 }
 
 void bar() {
-  static tester paddedloc(make_name(pad(bad))); // expected-error {{call to consteval function 'GH58207::make_name' is not a constant expression}} \
+  static tester paddedloc(make_name(pad(bad))); // expected-error {{call to consteval function 'GH58207::tester::tester' is not a constant expression}} \
                                                 // expected-note {{read of non-const variable 'bad' is not allowed in a constant expression}}
 }
 }
diff --git a/clang/test/SemaCXX/cxx2b-ast-print.cpp b/clang/test/SemaCXX/cxx2b-ast-print.cpp
index a245647d2eb5..37a2bf98742e 100644
--- a/clang/test/SemaCXX/cxx2b-ast-print.cpp
+++ b/clang/test/SemaCXX/cxx2b-ast-print.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++23 -fsyntax-only -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++23 -ast-print %s | FileCheck %s
 
 template <template <class...> class C>
 void test_auto_expr(long long y, auto &&z) {
diff --git a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
index 4a75392045d0..07937deb6673 100644
--- a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
+++ b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
@@ -394,3 +394,55 @@ static_assert(none_of(
 ));
 
 }
+
+#if __cplusplus >= 202302L
+namespace lvalue_to_rvalue_init_from_heap {
+
+struct S {
+    int *value;
+    constexpr S(int v) : value(new int {v}) {}  // expected-note 2 {{heap allocation performed here}}
+    constexpr ~S() { delete value; }
+};
+consteval S fn() { return S(5); }
+int fn2() { return 2; }  // expected-note {{declared here}}
+
+constexpr int a = *fn().value;
+constinit int b = *fn().value;
+const int c = *fn().value;
+int d = *fn().value;
+
+constexpr int e = *fn().value + fn2(); // expected-error {{must be initialized by a constant expression}} \
+                                       // expected-error {{call to consteval function 'lvalue_to_rvalue_init_from_heap::fn' is not a constant expression}} \
+                                       // expected-note {{non-constexpr function 'fn2'}} \
+                                       // expected-note {{pointer to heap-allocated object}}
+
+int f = *fn().value + fn2();  // expected-error {{call to consteval function 'lvalue_to_rvalue_init_from_heap::fn' is not a constant expression}} \
+                              // expected-note {{pointer to heap-allocated object}}
+}
+#endif
+
+
+#if __cplusplus >= 202302L
+
+namespace GH91509 {
+
+consteval int f(int) { return 0; }
+
+template<typename T>
+constexpr int g(int x) {
+    if consteval {
+        return f(x);
+    }
+    if !consteval {}
+    else {
+        return f(x);
+    }
+    return 1;
+}
+
+int h(int x) {
+    return g<void>(x);
+}
+}
+
+#endif
diff --git a/clang/test/SemaCXX/cxx2b-deducing-this-constexpr.cpp b/clang/test/SemaCXX/cxx2b-deducing-this-constexpr.cpp
index 9dbea17dd2ca..191fb013e031 100644
--- a/clang/test/SemaCXX/cxx2b-deducing-this-constexpr.cpp
+++ b/clang/test/SemaCXX/cxx2b-deducing-this-constexpr.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -std=c++2b %s -verify
+// RUN: %clang_cc1 -fsyntax-only -std=c++2b %s -verify -fexperimental-new-constant-interpreter
 // expected-no-diagnostics
 
 template <typename Base>
diff --git a/clang/test/SemaCXX/delete-and-function-templates.cpp b/clang/test/SemaCXX/delete-and-function-templates.cpp
index 0232b5bc6f12..bc46f17a8c17 100644
--- a/clang/test/SemaCXX/delete-and-function-templates.cpp
+++ b/clang/test/SemaCXX/delete-and-function-templates.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only  -emit-llvm-only %s
+// RUN: %clang_cc1 -std=c++11 -verify  -emit-llvm-only %s
 // RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only  -fdelayed-template-parsing %s 
 // RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only  -fms-extensions %s 
 // RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only  -fdelayed-template-parsing -fms-extensions %s 
diff --git a/clang/test/SemaCXX/delete.cpp b/clang/test/SemaCXX/delete.cpp
index 0c853f68c061..08cc1766e9f7 100644
--- a/clang/test/SemaCXX/delete.cpp
+++ b/clang/test/SemaCXX/delete.cpp
@@ -3,7 +3,7 @@
 
 // Test with PCH
 // RUN: %clang_cc1 -x c++-header -std=c++11 -emit-pch -o %t %S/delete-mismatch.h
-// RUN: %clang_cc1 -std=c++11 -include-pch %t -DWITH_PCH -fsyntax-only -verify %s -ast-dump
+// RUN: %clang_cc1 -std=c++11 -include-pch %t -DWITH_PCH -verify %s -ast-dump
 
 void f(int a[10][20]) {
   delete a; // expected-warning {{'delete' applied to a pointer-to-array type}}
diff --git a/clang/test/SemaCXX/enum-scoped.cpp b/clang/test/SemaCXX/enum-scoped.cpp
index b1d9a215c437..d7b7923430af 100644
--- a/clang/test/SemaCXX/enum-scoped.cpp
+++ b/clang/test/SemaCXX/enum-scoped.cpp
@@ -53,6 +53,7 @@ enum class E4 {
   e1 = -2147483648, // ok
   e2 = 2147483647, // ok
   e3 = 2147483648 // expected-error{{enumerator value evaluates to 2147483648, which cannot be narrowed to type 'int'}}
+                  // expected-warning@-1{{changes value}}
 };
 
 enum class E5 {
diff --git a/clang/test/SemaCXX/eval-crashes.cpp b/clang/test/SemaCXX/eval-crashes.cpp
index 017df977b26b..a06f60f71e9c 100644
--- a/clang/test/SemaCXX/eval-crashes.cpp
+++ b/clang/test/SemaCXX/eval-crashes.cpp
@@ -25,11 +25,9 @@ namespace pr33140_0b {
 }
 
 namespace pr33140_2 {
-  // FIXME: The declaration of 'b' below should lifetime-extend two int
-  // temporaries.
-  struct A { int &&r = 0; }; // expected-note 2{{initializing field 'r' with default member initializer}}
+  struct A { int &&r = 0; };
   struct B { A x, y; };
-  B b = {}; // expected-warning 2{{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported}}
+  B b = {};
 }
 
 namespace pr33140_3 {
diff --git a/clang/test/SemaCXX/friend3.cpp b/clang/test/SemaCXX/friend3.cpp
index 8b83ca78d403..3368fdb07615 100644
--- a/clang/test/SemaCXX/friend3.cpp
+++ b/clang/test/SemaCXX/friend3.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -S -triple %itanium_abi_triple -std=c++11 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -emit-llvm %s -o - | FileCheck %s
 
 namespace pr8852 {
 void foo();
diff --git a/clang/test/SemaCXX/lambda-conversion-op-cc.cpp b/clang/test/SemaCXX/lambda-conversion-op-cc.cpp
index 16ca5535019d..3632f8c8c80a 100644
--- a/clang/test/SemaCXX/lambda-conversion-op-cc.cpp
+++ b/clang/test/SemaCXX/lambda-conversion-op-cc.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -fsyntax-only -triple x86_64-linux-pc %s -verify -DBAD_CONVERSION
 // RUN: %clang_cc1 -fsyntax-only -triple i386-windows-pc %s -verify -DBAD_CONVERSION -DWIN32
-// RUN: %clang_cc1 -fsyntax-only -triple x86_64-linux-pc %s -ast-dump | FileCheck %s --check-prefixes=CHECK,LIN64,NODEF
-// RUN: %clang_cc1 -fsyntax-only -triple i386-windows-pc %s -ast-dump -DWIN32 | FileCheck %s --check-prefixes=CHECK,WIN32,NODEF
+// RUN: %clang_cc1 -triple x86_64-linux-pc %s -ast-dump | FileCheck %s --check-prefixes=CHECK,LIN64,NODEF
+// RUN: %clang_cc1 -triple i386-windows-pc %s -ast-dump -DWIN32 | FileCheck %s --check-prefixes=CHECK,WIN32,NODEF
 
 // RUN: %clang_cc1 -fsyntax-only -triple x86_64-linux-pc -fdefault-calling-conv=vectorcall %s -verify -DBAD_VEC_CONVERS
-// RUN: %clang_cc1 -fsyntax-only -triple x86_64-linux-pc -fdefault-calling-conv=vectorcall %s -ast-dump | FileCheck %s --check-prefixes=CHECK,VECTDEF
+// RUN: %clang_cc1 -triple x86_64-linux-pc -fdefault-calling-conv=vectorcall %s -ast-dump | FileCheck %s --check-prefixes=CHECK,VECTDEF
 
 void useage() {
   auto normal = [](int, float, double) {};                                // #1
diff --git a/clang/test/SemaCXX/ms_wide_bitfield.cpp b/clang/test/SemaCXX/ms_wide_bitfield.cpp
index 0dcc787928b0..a5b7a433c7c4 100644
--- a/clang/test/SemaCXX/ms_wide_bitfield.cpp
+++ b/clang/test/SemaCXX/ms_wide_bitfield.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fdump-record-layouts -fsyntax-only -mms-bitfields -verify %s 2>&1
+// RUN: %clang_cc1 -fno-rtti -triple i686-pc-win32 -fdump-record-layouts -fsyntax-only -mms-bitfields -verify %s 2>&1
 
 struct A {
   char a : 9; // expected-error{{width of bit-field 'a' (9 bits) exceeds the size of its type (8 bits)}}
diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp
index 01991887b284..f2fd45762abf 100644
--- a/clang/test/SemaCXX/type-traits.cpp
+++ b/clang/test/SemaCXX/type-traits.cpp
@@ -2908,6 +2908,12 @@ struct ConvertsToRef {
   operator RefType() const { return static_cast<RefType>(obj); }
   mutable T obj = 42;
 };
+template <class T, class RefType = T &>
+class ConvertsToRefPrivate {
+  operator RefType() const { return static_cast<RefType>(obj); }
+  mutable T obj = 42;
+};
+
 
 void reference_binds_to_temporary_checks() {
   static_assert(!(__reference_binds_to_temporary(int &, int &)));
@@ -2937,6 +2943,8 @@ void reference_binds_to_temporary_checks() {
 
   static_assert((__is_constructible(int const &, LongRef)));
   static_assert((__reference_binds_to_temporary(int const &, LongRef)));
+  static_assert(!__reference_binds_to_temporary(int const &, ConvertsToRefPrivate<long, long &>));
+
 
   // Test that it doesn't accept non-reference types as input.
   static_assert(!(__reference_binds_to_temporary(int, long)));
@@ -2944,6 +2952,17 @@ void reference_binds_to_temporary_checks() {
   static_assert((__reference_binds_to_temporary(const int &, long)));
 }
 
+
+struct ExplicitConversionRvalueRef {
+    operator int();
+    explicit operator int&&();
+};
+
+struct ExplicitConversionRef {
+    operator int();
+    explicit operator int&();
+};
+
 void reference_constructs_from_temporary_checks() {
   static_assert(!__reference_constructs_from_temporary(int &, int &));
   static_assert(!__reference_constructs_from_temporary(int &, int &&));
@@ -2973,6 +2992,8 @@ void reference_constructs_from_temporary_checks() {
 
   static_assert(__is_constructible(int const &, LongRef));
   static_assert(__reference_constructs_from_temporary(int const &, LongRef));
+  static_assert(!__reference_constructs_from_temporary(int const &, ConvertsToRefPrivate<long, long &>));
+
 
   // Test that it doesn't accept non-reference types as input.
   static_assert(!__reference_constructs_from_temporary(int, long));
@@ -2987,6 +3008,65 @@ void reference_constructs_from_temporary_checks() {
   static_assert(!__reference_constructs_from_temporary(const int&, int&&));
   static_assert(__reference_constructs_from_temporary(int&&, long&&));
   static_assert(__reference_constructs_from_temporary(int&&, long));
+
+
+  static_assert(!__reference_constructs_from_temporary(int&, ExplicitConversionRef));
+  static_assert(!__reference_constructs_from_temporary(const int&, ExplicitConversionRef));
+  static_assert(!__reference_constructs_from_temporary(int&&, ExplicitConversionRvalueRef));
+
+
+}
+
+void reference_converts_from_temporary_checks() {
+  static_assert(!__reference_converts_from_temporary(int &, int &));
+  static_assert(!__reference_converts_from_temporary(int &, int &&));
+
+  static_assert(!__reference_converts_from_temporary(int const &, int &));
+  static_assert(!__reference_converts_from_temporary(int const &, int const &));
+  static_assert(!__reference_converts_from_temporary(int const &, int &&));
+
+  static_assert(!__reference_converts_from_temporary(int &, long &)); // doesn't construct
+
+  static_assert(__reference_converts_from_temporary(int const &, long &));
+  static_assert(__reference_converts_from_temporary(int const &, long &&));
+  static_assert(__reference_converts_from_temporary(int &&, long &));
+
+  using LRef = ConvertsToRef<int, int &>;
+  using RRef = ConvertsToRef<int, int &&>;
+  using CLRef = ConvertsToRef<int, const int &>;
+  using LongRef = ConvertsToRef<long, long &>;
+  static_assert(__is_constructible(int &, LRef));
+  static_assert(!__reference_converts_from_temporary(int &, LRef));
+
+  static_assert(__is_constructible(int &&, RRef));
+  static_assert(!__reference_converts_from_temporary(int &&, RRef));
+
+  static_assert(__is_constructible(int const &, CLRef));
+  static_assert(!__reference_converts_from_temporary(int &&, CLRef));
+
+  static_assert(__is_constructible(int const &, LongRef));
+  static_assert(__reference_converts_from_temporary(int const &, LongRef));
+  static_assert(!__reference_converts_from_temporary(int const &, ConvertsToRefPrivate<long, long &>));
+
+
+  // Test that it doesn't accept non-reference types as input.
+  static_assert(!__reference_converts_from_temporary(int, long));
+
+  static_assert(__reference_converts_from_temporary(const int &, long));
+
+  // Additional checks
+  static_assert(__reference_converts_from_temporary(POD const&, Derives));
+  static_assert(__reference_converts_from_temporary(int&&, int));
+  static_assert(__reference_converts_from_temporary(const int&, int));
+  static_assert(!__reference_converts_from_temporary(int&&, int&&));
+  static_assert(!__reference_converts_from_temporary(const int&, int&&));
+  static_assert(__reference_converts_from_temporary(int&&, long&&));
+  static_assert(__reference_converts_from_temporary(int&&, long));
+
+  static_assert(!__reference_converts_from_temporary(int&, ExplicitConversionRef));
+  static_assert(__reference_converts_from_temporary(const int&, ExplicitConversionRef));
+  static_assert(__reference_converts_from_temporary(int&&, ExplicitConversionRvalueRef));
+
 }
 
 void array_rank() {
diff --git a/clang/test/SemaCXX/uninit-variables-conditional.cpp b/clang/test/SemaCXX/uninit-variables-conditional.cpp
index 3c44c7249d51..3e3af672b8fa 100644
--- a/clang/test/SemaCXX/uninit-variables-conditional.cpp
+++ b/clang/test/SemaCXX/uninit-variables-conditional.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -Wconditional-uninitialized -fsyntax-only %s -verify
+// RUN: %clang_cc1 -Wconditional-uninitialized -fsyntax-only %s -verify
 
 class Foo {
 public:
diff --git a/clang/test/SemaCXX/uninit-variables.cpp b/clang/test/SemaCXX/uninit-variables.cpp
index 90d1ddb31718..36f3e04d250a 100644
--- a/clang/test/SemaCXX/uninit-variables.cpp
+++ b/clang/test/SemaCXX/uninit-variables.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -Wuninitialized -Wno-uninitialized-const-reference -fsyntax-only -fcxx-exceptions %s -verify -std=c++1y
+// RUN: %clang_cc1 -Wuninitialized -Wno-uninitialized-const-reference -fsyntax-only -fcxx-exceptions %s -verify -std=c++1y
 
 // Stub out types for 'typeid' to work.
 namespace std { class type_info {}; }
diff --git a/clang/test/SemaCXX/vla-ext-diag.cpp b/clang/test/SemaCXX/vla-ext-diag.cpp
index 7492bbae6c46..08b78c1f3e11 100644
--- a/clang/test/SemaCXX/vla-ext-diag.cpp
+++ b/clang/test/SemaCXX/vla-ext-diag.cpp
@@ -1,40 +1,40 @@
-// RUN: %clang_cc1 -verify=gnu -std=gnu++11 %s
-// RUN: %clang_cc1 -verify=expected,cxx11 -Wvla -std=gnu++11 %s
-// RUN: %clang_cc1 -verify=expected,cxx11 -std=c++11 %s
-// RUN: %clang_cc1 -verify=expected,cxx98 -std=c++98 %s
-// RUN: %clang_cc1 -verify=expected,off -std=c++11 -Wno-vla-extension-static-assert %s
-// gnu-no-diagnostics
-
-// Demonstrate that we do not diagnose use of VLAs by default in GNU mode, but
-// we do diagnose them in C++ mode. Also note that we suggest use of
-// static_assert, but only in C++11 and later and only if the warning group is
-// not disabled.
-
-// C++98 mode does not emit the same notes as C++11 mode because in C++98,
-// we're looking for an integer constant expression, whereas in C++11 and later,
-// we're looking for a constant expression that is of integer type (these are
-// different operations; ICE looks at the syntactic form of the expression, but
-// C++11 constant expressions require calculating the expression value).
-void func(int n) { // cxx11-note {{declared here}} off-note {{declared here}}
-  int vla[n]; // expected-warning {{variable length arrays in C++ are a Clang extension}} \
-                 cxx11-note {{function parameter 'n' with unknown value cannot be used in a constant expression}} \
-                 off-note {{function parameter 'n' with unknown value cannot be used in a constant expression}}
-}
-
-void old_style_static_assert(int n) { // cxx11-note 5 {{declared here}} off-note 2 {{declared here}}
-  int array1[n != 12 ? 1 : -1]; // cxx11-warning {{variable length arrays in C++ are a Clang extension; did you mean to use 'static_assert'?}} \
-                                   cxx98-warning {{variable length arrays in C++ are a Clang extension}} \
-                                   cxx11-note {{function parameter 'n' with unknown value cannot be used in a constant expression}}
-  int array2[n != 12 ? -1 : 1]; // cxx11-warning {{variable length arrays in C++ are a Clang extension; did you mean to use 'static_assert'?}} \
-                                   cxx98-warning {{variable length arrays in C++ are a Clang extension}} \
-                                   cxx11-note {{function parameter 'n' with unknown value cannot be used in a constant expression}}
-  int array3[n != 12 ? 1 : n];  // expected-warning {{variable length arrays in C++ are a Clang extension}} \
-                                   cxx11-note {{function parameter 'n' with unknown value cannot be used in a constant expression}} \
-                                   off-note {{function parameter 'n' with unknown value cannot be used in a constant expression}}
-  int array4[(n ? 1 : -1)];     // cxx11-warning {{variable length arrays in C++ are a Clang extension; did you mean to use 'static_assert'?}} \
-                                   cxx98-warning {{variable length arrays in C++ are a Clang extension}} \
-                                   cxx11-note {{function parameter 'n' with unknown value cannot be used in a constant expression}}
-  int array5[n ? 1 : 0];        // expected-warning {{variable length arrays in C++ are a Clang extension}} \
-                                   cxx11-note {{function parameter 'n' with unknown value cannot be used in a constant expression}} \
-                                   off-note {{function parameter 'n' with unknown value cannot be used in a constant expression}}
-}
+// RUN: %clang_cc1 -verify=gnu -std=gnu++11 %s
+// RUN: %clang_cc1 -verify=expected,cxx11 -Wvla -std=gnu++11 %s
+// RUN: %clang_cc1 -verify=expected,cxx11 -std=c++11 %s
+// RUN: %clang_cc1 -verify=expected,cxx98 -std=c++98 %s
+// RUN: %clang_cc1 -verify=expected,off -std=c++11 -Wno-vla-extension-static-assert %s
+// gnu-no-diagnostics
+
+// Demonstrate that we do not diagnose use of VLAs by default in GNU mode, but
+// we do diagnose them in C++ mode. Also note that we suggest use of
+// static_assert, but only in C++11 and later and only if the warning group is
+// not disabled.
+
+// C++98 mode does not emit the same notes as C++11 mode because in C++98,
+// we're looking for an integer constant expression, whereas in C++11 and later,
+// we're looking for a constant expression that is of integer type (these are
+// different operations; ICE looks at the syntactic form of the expression, but
+// C++11 constant expressions require calculating the expression value).
+void func(int n) { // cxx11-note {{declared here}} off-note {{declared here}}
+  int vla[n]; // expected-warning {{variable length arrays in C++ are a Clang extension}} \
+                 cxx11-note {{function parameter 'n' with unknown value cannot be used in a constant expression}} \
+                 off-note {{function parameter 'n' with unknown value cannot be used in a constant expression}}
+}
+
+void old_style_static_assert(int n) { // cxx11-note 5 {{declared here}} off-note 2 {{declared here}}
+  int array1[n != 12 ? 1 : -1]; // cxx11-warning {{variable length arrays in C++ are a Clang extension; did you mean to use 'static_assert'?}} \
+                                   cxx98-warning {{variable length arrays in C++ are a Clang extension}} \
+                                   cxx11-note {{function parameter 'n' with unknown value cannot be used in a constant expression}}
+  int array2[n != 12 ? -1 : 1]; // cxx11-warning {{variable length arrays in C++ are a Clang extension; did you mean to use 'static_assert'?}} \
+                                   cxx98-warning {{variable length arrays in C++ are a Clang extension}} \
+                                   cxx11-note {{function parameter 'n' with unknown value cannot be used in a constant expression}}
+  int array3[n != 12 ? 1 : n];  // expected-warning {{variable length arrays in C++ are a Clang extension}} \
+                                   cxx11-note {{function parameter 'n' with unknown value cannot be used in a constant expression}} \
+                                   off-note {{function parameter 'n' with unknown value cannot be used in a constant expression}}
+  int array4[(n ? 1 : -1)];     // cxx11-warning {{variable length arrays in C++ are a Clang extension; did you mean to use 'static_assert'?}} \
+                                   cxx98-warning {{variable length arrays in C++ are a Clang extension}} \
+                                   cxx11-note {{function parameter 'n' with unknown value cannot be used in a constant expression}}
+  int array5[n ? 1 : 0];        // expected-warning {{variable length arrays in C++ are a Clang extension}} \
+                                   cxx11-note {{function parameter 'n' with unknown value cannot be used in a constant expression}} \
+                                   off-note {{function parameter 'n' with unknown value cannot be used in a constant expression}}
+}
diff --git a/clang/test/SemaCXX/warn-redundant-move.cpp b/clang/test/SemaCXX/warn-redundant-move.cpp
index 2bfc8c9312f0..9853361367e9 100644
--- a/clang/test/SemaCXX/warn-redundant-move.cpp
+++ b/clang/test/SemaCXX/warn-redundant-move.cpp
@@ -1,116 +1,116 @@
-// RUN: %clang_cc1 -fsyntax-only -Wredundant-move -std=c++11 -verify %s
-// RUN: %clang_cc1 -fsyntax-only -Wredundant-move -std=c++11 -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
-// RUN: %clang_cc1 -fsyntax-only -std=c++11 %s -ast-dump | FileCheck %s --check-prefix=CHECK-AST
-
-// definitions for std::move
-namespace std {
-inline namespace foo {
-template <class T> struct remove_reference { typedef T type; };
-template <class T> struct remove_reference<T&> { typedef T type; };
-template <class T> struct remove_reference<T&&> { typedef T type; };
-
-template <class T> typename remove_reference<T>::type &&move(T &&t);
-}
-}
-
-// test1 and test2 should not warn until after implementation of DR1579.
-struct A {};
-struct B : public A {};
-
-A test1(B b1) {
-  B b2;
-  return b1;
-  return b2;
-  return std::move(b1);
-  return std::move(b2);
-}
-
-struct C {
-  C() {}
-  C(A) {}
-};
-
-C test2(A a1, B b1) {
-  A a2;
-  B b2;
-
-  return a1;
-  return a2;
-  return b1;
-  return b2;
-
-  return std::move(a1);
-  return std::move(a2);
-  return std::move(b1);
-  return std::move(b2);
-}
-
-// Copy of tests above with types changed to reference types.
-A test3(B& b1) {
-  B& b2 = b1;
-  return b1;
-  return b2;
-  return std::move(b1);
-  return std::move(b2);
-}
-
-C test4(A& a1, B& b1) {
-  A& a2 = a1;
-  B& b2 = b1;
-
-  return a1;
-  return a2;
-  return b1;
-  return b2;
-
-  return std::move(a1);
-  return std::move(a2);
-  return std::move(b1);
-  return std::move(b2);
-}
-
-// PR23819, case 2
-struct D {};
-D test5(D d) {
-  return d;
-  // Verify the implicit move from the AST dump
-  // CHECK-AST: ReturnStmt{{.*}}line:[[@LINE-2]]
-  // CHECK-AST-NEXT: CXXConstructExpr{{.*}}D{{.*}}void (D &&)
-  // CHECK-AST-NEXT: ImplicitCastExpr
-  // CHECK-AST-NEXT: DeclRefExpr{{.*}}ParmVar{{.*}}'d'
-
-  return std::move(d);
-  // expected-warning@-1{{redundant move in return statement}}
-  // expected-note@-2{{remove std::move call here}}
-  // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:10-[[@LINE-3]]:20}:""
-  // CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:21-[[@LINE-4]]:22}:""
-}
-
-namespace templates {
-  struct A {};
-  struct B { B(A); };
-
-  // Warn once here since the type is not dependent.
-  template <typename T>
-  A test1(A a) {
-    return std::move(a);
-    // expected-warning@-1{{redundant move in return statement}}
-    // expected-note@-2{{remove std::move call here}}
-    // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:12-[[@LINE-3]]:22}:""
-    // CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:23-[[@LINE-4]]:24}:""
-  }
-  void run_test1() {
-    test1<A>(A());
-    test1<B>(A());
-  }
-
-  // T1 and T2 may not be the same, the warning may not always apply.
-  template <typename T1, typename T2>
-  T1 test2(T2 t) {
-    return std::move(t);
-  }
-  void run_test2() {
-    test2<A, A>(A());
-    test2<B, A>(A());
-  }
-}
+// RUN: %clang_cc1 -fsyntax-only -Wredundant-move -std=c++11 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -Wredundant-move -std=c++11 -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 %s -ast-dump | FileCheck %s --check-prefix=CHECK-AST
+
+// definitions for std::move
+namespace std {
+inline namespace foo {
+template <class T> struct remove_reference { typedef T type; };
+template <class T> struct remove_reference<T&> { typedef T type; };
+template <class T> struct remove_reference<T&&> { typedef T type; };
+
+template <class T> typename remove_reference<T>::type &&move(T &&t);
+}
+}
+
+// test1 and test2 should not warn until after implementation of DR1579.
+struct A {};
+struct B : public A {};
+
+A test1(B b1) {
+  B b2;
+  return b1;
+  return b2;
+  return std::move(b1);
+  return std::move(b2);
+}
+
+struct C {
+  C() {}
+  C(A) {}
+};
+
+C test2(A a1, B b1) {
+  A a2;
+  B b2;
+
+  return a1;
+  return a2;
+  return b1;
+  return b2;
+
+  return std::move(a1);
+  return std::move(a2);
+  return std::move(b1);
+  return std::move(b2);
+}
+
+// Copy of tests above with types changed to reference types.
+A test3(B& b1) {
+  B& b2 = b1;
+  return b1;
+  return b2;
+  return std::move(b1);
+  return std::move(b2);
+}
+
+C test4(A& a1, B& b1) {
+  A& a2 = a1;
+  B& b2 = b1;
+
+  return a1;
+  return a2;
+  return b1;
+  return b2;
+
+  return std::move(a1);
+  return std::move(a2);
+  return std::move(b1);
+  return std::move(b2);
+}
+
+// PR23819, case 2
+struct D {};
+D test5(D d) {
+  return d;
+  // Verify the implicit move from the AST dump
+  // CHECK-AST: ReturnStmt{{.*}}line:[[@LINE-2]]
+  // CHECK-AST-NEXT: CXXConstructExpr{{.*}}D{{.*}}void (D &&)
+  // CHECK-AST-NEXT: ImplicitCastExpr
+  // CHECK-AST-NEXT: DeclRefExpr{{.*}}ParmVar{{.*}}'d'
+
+  return std::move(d);
+  // expected-warning@-1{{redundant move in return statement}}
+  // expected-note@-2{{remove std::move call here}}
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:10-[[@LINE-3]]:20}:""
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:21-[[@LINE-4]]:22}:""
+}
+
+namespace templates {
+  struct A {};
+  struct B { B(A); };
+
+  // Warn once here since the type is not dependent.
+  template <typename T>
+  A test1(A a) {
+    return std::move(a);
+    // expected-warning@-1{{redundant move in return statement}}
+    // expected-note@-2{{remove std::move call here}}
+    // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:12-[[@LINE-3]]:22}:""
+    // CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:23-[[@LINE-4]]:24}:""
+  }
+  void run_test1() {
+    test1<A>(A());
+    test1<B>(A());
+  }
+
+  // T1 and T2 may not be the same, the warning may not always apply.
+  template <typename T1, typename T2>
+  T1 test2(T2 t) {
+    return std::move(t);
+  }
+  void run_test2() {
+    test2<A, A>(A());
+    test2<B, A>(A());
+  }
+}
diff --git a/clang/test/SemaCXX/warn-shadow.cpp b/clang/test/SemaCXX/warn-shadow.cpp
index ca7f9624c08e..2969bd39fed4 100644
--- a/clang/test/SemaCXX/warn-shadow.cpp
+++ b/clang/test/SemaCXX/warn-shadow.cpp
@@ -61,13 +61,13 @@ class A {
   // expected-warning-re@+1 4 {{constructor parameter 'f{{[0-4]}}' shadows the field 'f{{[0-9]}}' of 'A'}}
   A(int f1, int f2, int f3, int f4, double overload_dummy) {}
 
-  void test() {
-    char *field; // expected-warning {{declaration shadows a field of 'A'}}
-    char *data; // expected-warning {{declaration shadows a static data member of 'A'}}
-    char *a1; // no warning 
-    char *a2; // no warning
-    char *jj; // no warning
-    char *jjj; // no warning
+  void test() {
+    char *field; // expected-warning {{declaration shadows a field of 'A'}}
+    char *data; // expected-warning {{declaration shadows a static data member of 'A'}}
+    char *a1; // no warning
+    char *a2; // no warning
+    char *jj; // no warning
+    char *jjj; // no warning
     static char *f1; // expected-warning {{declaration shadows a field of 'A'}}
   }
 
@@ -197,14 +197,14 @@ void avoidWarningWhenRedefining(int b) { // expected-note {{previous definition
   int k; // expected-note {{previous definition is here}}
   typedef int k; // expected-error {{redefinition of 'k'}}
 
-  using l=char; // no warning or error.
-  using l=char; // no warning or error.
-  typedef char l; // no warning or error.
- 
-  typedef char n; // no warning or error. 
+  using l=char; // no warning or error.
+  using l=char; // no warning or error.
+  typedef char l; // no warning or error.
+
+  typedef char n; // no warning or error.
   typedef char n; // no warning or error.
-  using n=char; // no warning or error.
-}
+  using n=char; // no warning or error.
+}
 
 }
 
@@ -220,42 +220,42 @@ void f(int a) {
   struct A {
     void g(int a) {}
     A() { int a; }
-  };
-}
-}
-
-namespace PR34120 {
-struct A {
-  int B; // expected-note 2 {{declared here}}
-};
-
-class C : public A {
-  void D(int B) {} // expected-warning {{parameter 'B' shadows member inherited from type 'A'}}
-  void E() {
-    extern void f(int B); // Ok
-  }
-  void F(int B); // Ok, declaration; not definition.
-  void G(int B);
-};
-
-void C::G(int B) { // expected-warning {{parameter 'B' shadows member inherited from type 'A'}}
-}
-
-class Private {
-  int B;
-};
-class Derived : Private {
-  void D(int B) {} // Ok
-};
-
-struct Static {
-  static int B;
-};
-
-struct Derived2 : Static {
-  void D(int B) {}
-};
-}
+  };
+}
+}
+
+namespace PR34120 {
+struct A {
+  int B; // expected-note 2 {{declared here}}
+};
+
+class C : public A {
+  void D(int B) {} // expected-warning {{parameter 'B' shadows member inherited from type 'A'}}
+  void E() {
+    extern void f(int B); // Ok
+  }
+  void F(int B); // Ok, declaration; not definition.
+  void G(int B);
+};
+
+void C::G(int B) { // expected-warning {{parameter 'B' shadows member inherited from type 'A'}}
+}
+
+class Private {
+  int B;
+};
+class Derived : Private {
+  void D(int B) {} // Ok
+};
+
+struct Static {
+  static int B;
+};
+
+struct Derived2 : Static {
+  void D(int B) {}
+};
+}
 
 int PR24718;
 enum class X { PR24718 }; // Ok, not shadowing
diff --git a/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
index f669098ef515..8e0709eb0302 100644
--- a/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
@@ -1,91 +1,91 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
-
-float2 test_no_second_arg(float2 p0) {
-  return __builtin_hlsl_elementwise_clamp(p0);
-  // expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
-}
-
-float2 test_no_third_arg(float2 p0) {
-  return __builtin_hlsl_elementwise_clamp(p0, p0);
-  // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
-}
-
-float2 test_too_many_arg(float2 p0) {
-  return __builtin_hlsl_elementwise_clamp(p0, p0, p0, p0);
-  // expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
-}
-
-float2 test_clamp_no_second_arg(float2 p0) {
-  return clamp(p0);
-  // expected-error@-1 {{no matching function for call to 'clamp'}}
-}
-
-float2 test_clamp_vector_size_mismatch(float3 p0, float2 p1) {
-  return clamp(p0, p0, p1);
-  // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}}
-}
-
-float2 test_clamp_builtin_vector_size_mismatch(float3 p0, float2 p1) {
-  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must have the same type}}
-}
-
-float test_clamp_scalar_mismatch(float p0, half p1) {
-  return clamp(p1, p0, p1);
-  // expected-error@-1 {{call to 'clamp' is ambiguous}}
-}
-
-float2 test_clamp_element_type_mismatch(half2 p0, float2 p1) {
-  return clamp(p1, p0, p1);
-  // expected-error@-1 {{call to 'clamp' is ambiguous}}
-}
-
-float2 test_builtin_clamp_float2_splat(float p0, float2 p1) {
-  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
-}
-
-float3 test_builtin_clamp_float3_splat(float p0, float3 p1) {
-  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
-}
-
-float4 test_builtin_clamp_float4_splat(float p0, float4 p1) {
-  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
-}
-
-float2 test_clamp_float2_int_splat(float2 p0, int p1) {
-  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
-}
-
-float3 test_clamp_float3_int_splat(float3 p0, int p1) {
-  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
-}
-
-float2 test_builtin_clamp_int_vect_to_float_vec_promotion(int2 p0, float p1) {
-  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
-}
-
-float test_builtin_clamp_bool_type_promotion(bool p0) {
-  return __builtin_hlsl_elementwise_clamp(p0, p0, p0);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
-}
-
-float builtin_bool_to_float_type_promotion(float p0, bool p1) {
-  return __builtin_hlsl_elementwise_clamp(p0, p0, p1);
-  // expected-error@-1 {{3rd argument must be a floating point type (was 'bool')}}
-}
-
-float builtin_bool_to_float_type_promotion2(bool p0, float p1) {
-  return __builtin_hlsl_elementwise_clamp(p1, p0, p1);
-  // expected-error@-1 {{2nd argument must be a floating point type (was 'bool')}}
-}
-
-float builtin_clamp_int_to_float_promotion(float p0, int p1) {
-  return __builtin_hlsl_elementwise_clamp(p0, p0, p1);
-  // expected-error@-1 {{3rd argument must be a floating point type (was 'int')}}
-}
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
+
+float2 test_no_second_arg(float2 p0) {
+  return __builtin_hlsl_elementwise_clamp(p0);
+  // expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
+}
+
+float2 test_no_third_arg(float2 p0) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0);
+  // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
+}
+
+float2 test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0, p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
+}
+
+float2 test_clamp_no_second_arg(float2 p0) {
+  return clamp(p0);
+  // expected-error@-1 {{no matching function for call to 'clamp'}}
+}
+
+float2 test_clamp_vector_size_mismatch(float3 p0, float2 p1) {
+  return clamp(p0, p0, p1);
+  // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}}
+}
+
+float2 test_clamp_builtin_vector_size_mismatch(float3 p0, float2 p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must have the same type}}
+}
+
+float test_clamp_scalar_mismatch(float p0, half p1) {
+  return clamp(p1, p0, p1);
+  // expected-error@-1 {{call to 'clamp' is ambiguous}}
+}
+
+float2 test_clamp_element_type_mismatch(half2 p0, float2 p1) {
+  return clamp(p1, p0, p1);
+  // expected-error@-1 {{call to 'clamp' is ambiguous}}
+}
+
+float2 test_builtin_clamp_float2_splat(float p0, float2 p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float3 test_builtin_clamp_float3_splat(float p0, float3 p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float4 test_builtin_clamp_float4_splat(float p0, float4 p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float2 test_clamp_float2_int_splat(float2 p0, int p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float3 test_clamp_float3_int_splat(float3 p0, int p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float2 test_builtin_clamp_int_vect_to_float_vec_promotion(int2 p0, float p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float test_builtin_clamp_bool_type_promotion(bool p0) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0, p0);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
+}
+
+float builtin_bool_to_float_type_promotion(float p0, bool p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0, p1);
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'bool')}}
+}
+
+float builtin_bool_to_float_type_promotion2(bool p0, float p1) {
+  return __builtin_hlsl_elementwise_clamp(p1, p0, p1);
+  // expected-error@-1 {{2nd argument must be a floating point type (was 'bool')}}
+}
+
+float builtin_clamp_int_to_float_promotion(float p0, int p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0, p1);
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'int')}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
index 095f3c12ba87..58722aaeb924 100644
--- a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
@@ -1,119 +1,119 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
-
-float test_no_second_arg(float2 p0) {
-  return __builtin_hlsl_dot(p0);
-  // expected-error@-1 {{too few arguments to function call, expected 2, have 1}}
-}
-
-float test_too_many_arg(float2 p0) {
-  return __builtin_hlsl_dot(p0, p0, p0);
-  // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
-}
-
-float test_dot_no_second_arg(float2 p0) {
-  return dot(p0);
-  // expected-error@-1 {{no matching function for call to 'dot'}}
-}
-
-float test_dot_vector_size_mismatch(float3 p0, float2 p1) {
-  return dot(p0, p1);
-  // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}}
-}
-
-float test_dot_builtin_vector_size_mismatch(float3 p0, float2 p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
-}
-
-float test_dot_scalar_mismatch(float p0, int p1) {
-  return dot(p0, p1);
-  // expected-error@-1 {{call to 'dot' is ambiguous}}
-}
-
-float test_dot_element_type_mismatch(int2 p0, float2 p1) {
-  return dot(p0, p1);
-  // expected-error@-1 {{call to 'dot' is ambiguous}}
-}
-
-//NOTE: for all the *_promotion we are intentionally not handling type promotion in builtins
-float test_builtin_dot_vec_int_to_float_promotion(int2 p0, float2 p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
-}
-
-int64_t test_builtin_dot_vec_int_to_int64_promotion(int64_t2 p0, int2 p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
-}
-
-float test_builtin_dot_vec_half_to_float_promotion(float2 p0, half2 p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
-}
-
-#ifdef __HLSL_ENABLE_16_BIT
-float test_builtin_dot_vec_int16_to_float_promotion(float2 p0, int16_t2 p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
-}
-
-half test_builtin_dot_vec_int16_to_half_promotion(half2 p0, int16_t2 p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
-}
-
-int test_builtin_dot_vec_int16_to_int_promotion(int2 p0, int16_t2 p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
-}
-
-int64_t test_builtin_dot_vec_int16_to_int64_promotion(int64_t2 p0,
-                                                      int16_t2 p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
-}
-#endif
-
-float test_builtin_dot_float2_splat(float p0, float2 p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
-}
-
-float test_builtin_dot_float3_splat(float p0, float3 p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
-}
-
-float test_builtin_dot_float4_splat(float p0, float4 p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
-}
-
-float test_dot_float2_int_splat(float2 p0, int p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
-}
-
-float test_dot_float3_int_splat(float3 p0, int p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
-}
-
-float test_builtin_dot_int_vect_to_float_vec_promotion(int2 p0, float p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
-}
-
-int test_builtin_dot_bool_type_promotion(bool p0, bool p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
-}
-
-double test_dot_double(double2 p0, double2 p1) {
-  return dot(p0, p1);
-  // expected-error@-1 {{call to 'dot' is ambiguous}}
-}
-double test_dot_double_builtin(double2 p0, double2 p1) {
-  return __builtin_hlsl_dot(p0, p1);
-  // expected-error@-1 {{passing 'double2' (aka 'vector<double, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
-}
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
+
+float test_no_second_arg(float2 p0) {
+  return __builtin_hlsl_dot(p0);
+  // expected-error@-1 {{too few arguments to function call, expected 2, have 1}}
+}
+
+float test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_dot(p0, p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
+}
+
+float test_dot_no_second_arg(float2 p0) {
+  return dot(p0);
+  // expected-error@-1 {{no matching function for call to 'dot'}}
+}
+
+float test_dot_vector_size_mismatch(float3 p0, float2 p1) {
+  return dot(p0, p1);
+  // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}}
+}
+
+float test_dot_builtin_vector_size_mismatch(float3 p0, float2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
+}
+
+float test_dot_scalar_mismatch(float p0, int p1) {
+  return dot(p0, p1);
+  // expected-error@-1 {{call to 'dot' is ambiguous}}
+}
+
+float test_dot_element_type_mismatch(int2 p0, float2 p1) {
+  return dot(p0, p1);
+  // expected-error@-1 {{call to 'dot' is ambiguous}}
+}
+
+//NOTE: for all the *_promotion we are intentionally not handling type promotion in builtins
+float test_builtin_dot_vec_int_to_float_promotion(int2 p0, float2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
+}
+
+int64_t test_builtin_dot_vec_int_to_int64_promotion(int64_t2 p0, int2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
+}
+
+float test_builtin_dot_vec_half_to_float_promotion(float2 p0, half2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
+}
+
+#ifdef __HLSL_ENABLE_16_BIT
+float test_builtin_dot_vec_int16_to_float_promotion(float2 p0, int16_t2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
+}
+
+half test_builtin_dot_vec_int16_to_half_promotion(half2 p0, int16_t2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
+}
+
+int test_builtin_dot_vec_int16_to_int_promotion(int2 p0, int16_t2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
+}
+
+int64_t test_builtin_dot_vec_int16_to_int64_promotion(int64_t2 p0,
+                                                      int16_t2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must have the same type}}
+}
+#endif
+
+float test_builtin_dot_float2_splat(float p0, float2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
+}
+
+float test_builtin_dot_float3_splat(float p0, float3 p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
+}
+
+float test_builtin_dot_float4_splat(float p0, float4 p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
+}
+
+float test_dot_float2_int_splat(float2 p0, int p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
+}
+
+float test_dot_float3_int_splat(float3 p0, int p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
+}
+
+float test_builtin_dot_int_vect_to_float_vec_promotion(int2 p0, float p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_dot' must be vectors}}
+}
+
+int test_builtin_dot_bool_type_promotion(bool p0, bool p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
+}
+
+double test_dot_double(double2 p0, double2 p1) {
+  return dot(p0, p1);
+  // expected-error@-1 {{call to 'dot' is ambiguous}}
+}
+double test_dot_double_builtin(double2 p0, double2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{passing 'double2' (aka 'vector<double, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl
index ef0928f8fef0..4089188134d3 100644
--- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl
@@ -9,6 +9,7 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sin
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sqrt
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_roundeven
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tan
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_trunc
 
 double2 test_double_builtin(double2 p0) {
diff --git a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
index d23357239b7e..868ba8a1a471 100644
--- a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
@@ -1,109 +1,109 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
-
-float2 test_no_second_arg(float2 p0) {
-  return __builtin_hlsl_lerp(p0);
-  // expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
-}
-
-float2 test_no_third_arg(float2 p0) {
-  return __builtin_hlsl_lerp(p0, p0);
-  // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
-}
-
-float2 test_too_many_arg(float2 p0) {
-  return __builtin_hlsl_lerp(p0, p0, p0, p0);
-  // expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
-}
-
-float2 test_lerp_no_second_arg(float2 p0) {
-  return lerp(p0);
-  // expected-error@-1 {{no matching function for call to 'lerp'}}
-}
-
-float2 test_lerp_vector_size_mismatch(float3 p0, float2 p1) {
-  return lerp(p0, p0, p1);
-  // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}}
-}
-
-float2 test_lerp_builtin_vector_size_mismatch(float3 p0, float2 p1) {
-  return __builtin_hlsl_lerp(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must have the same type}}
-}
-
-float test_lerp_scalar_mismatch(float p0, half p1) {
-  return lerp(p1, p0, p1);
-  // expected-error@-1 {{call to 'lerp' is ambiguous}}
-}
-
-float2 test_lerp_element_type_mismatch(half2 p0, float2 p1) {
-  return lerp(p1, p0, p1);
-  // expected-error@-1 {{call to 'lerp' is ambiguous}}
-}
-
-float2 test_builtin_lerp_float2_splat(float p0, float2 p1) {
-  return __builtin_hlsl_lerp(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
-}
-
-float3 test_builtin_lerp_float3_splat(float p0, float3 p1) {
-  return __builtin_hlsl_lerp(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
-}
-
-float4 test_builtin_lerp_float4_splat(float p0, float4 p1) {
-  return __builtin_hlsl_lerp(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
-}
-
-float2 test_lerp_float2_int_splat(float2 p0, int p1) {
-  return __builtin_hlsl_lerp(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
-}
-
-float3 test_lerp_float3_int_splat(float3 p0, int p1) {
-  return __builtin_hlsl_lerp(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
-}
-
-float2 test_builtin_lerp_int_vect_to_float_vec_promotion(int2 p0, float p1) {
-  return __builtin_hlsl_lerp(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
-}
-
-float test_builtin_lerp_bool_type_promotion(bool p0) {
-  return __builtin_hlsl_lerp(p0, p0, p0);
-  // expected-error@-1 {{1st argument must be a floating point type (was 'bool')}}
-}
-
-float builtin_bool_to_float_type_promotion(float p0, bool p1) {
-  return __builtin_hlsl_lerp(p0, p0, p1);
-  // expected-error@-1 {{3rd argument must be a floating point type (was 'bool')}}
-}
-
-float builtin_bool_to_float_type_promotion2(bool p0, float p1) {
-  return __builtin_hlsl_lerp(p1, p0, p1);
-  // expected-error@-1 {{2nd argument must be a floating point type (was 'bool')}}
-}
-
-float builtin_lerp_int_to_float_promotion(float p0, int p1) {
-  return __builtin_hlsl_lerp(p0, p0, p1);
-  // expected-error@-1 {{3rd argument must be a floating point type (was 'int')}}
-}
-
-float4 test_lerp_int4(int4 p0, int4 p1, int4 p2) {
-  return __builtin_hlsl_lerp(p0, p1, p2);
-  // expected-error@-1 {{1st argument must be a floating point type (was 'int4' (aka 'vector<int, 4>'))}}
-}
-
-// note: DefaultVariadicArgumentPromotion --> DefaultArgumentPromotion has already promoted to double
-// we don't know anymore that the input was half when __builtin_hlsl_lerp is called so we default to float
-// for expected type
-half builtin_lerp_half_scalar (half p0) {
-  return __builtin_hlsl_lerp ( p0, p0, p0 );
-  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
-}
-
-float builtin_lerp_float_scalar ( float p0) {
-  return __builtin_hlsl_lerp ( p0, p0, p0 );
-  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
-}
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
+
+float2 test_no_second_arg(float2 p0) {
+  return __builtin_hlsl_lerp(p0);
+  // expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
+}
+
+float2 test_no_third_arg(float2 p0) {
+  return __builtin_hlsl_lerp(p0, p0);
+  // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
+}
+
+float2 test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_lerp(p0, p0, p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
+}
+
+float2 test_lerp_no_second_arg(float2 p0) {
+  return lerp(p0);
+  // expected-error@-1 {{no matching function for call to 'lerp'}}
+}
+
+float2 test_lerp_vector_size_mismatch(float3 p0, float2 p1) {
+  return lerp(p0, p0, p1);
+  // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}}
+}
+
+float2 test_lerp_builtin_vector_size_mismatch(float3 p0, float2 p1) {
+  return __builtin_hlsl_lerp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must have the same type}}
+}
+
+float test_lerp_scalar_mismatch(float p0, half p1) {
+  return lerp(p1, p0, p1);
+  // expected-error@-1 {{call to 'lerp' is ambiguous}}
+}
+
+float2 test_lerp_element_type_mismatch(half2 p0, float2 p1) {
+  return lerp(p1, p0, p1);
+  // expected-error@-1 {{call to 'lerp' is ambiguous}}
+}
+
+float2 test_builtin_lerp_float2_splat(float p0, float2 p1) {
+  return __builtin_hlsl_lerp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
+}
+
+float3 test_builtin_lerp_float3_splat(float p0, float3 p1) {
+  return __builtin_hlsl_lerp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
+}
+
+float4 test_builtin_lerp_float4_splat(float p0, float4 p1) {
+  return __builtin_hlsl_lerp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
+}
+
+float2 test_lerp_float2_int_splat(float2 p0, int p1) {
+  return __builtin_hlsl_lerp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
+}
+
+float3 test_lerp_float3_int_splat(float3 p0, int p1) {
+  return __builtin_hlsl_lerp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
+}
+
+float2 test_builtin_lerp_int_vect_to_float_vec_promotion(int2 p0, float p1) {
+  return __builtin_hlsl_lerp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_lerp' must be vectors}}
+}
+
+float test_builtin_lerp_bool_type_promotion(bool p0) {
+  return __builtin_hlsl_lerp(p0, p0, p0);
+  // expected-error@-1 {{1st argument must be a floating point type (was 'bool')}}
+}
+
+float builtin_bool_to_float_type_promotion(float p0, bool p1) {
+  return __builtin_hlsl_lerp(p0, p0, p1);
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'bool')}}
+}
+
+float builtin_bool_to_float_type_promotion2(bool p0, float p1) {
+  return __builtin_hlsl_lerp(p1, p0, p1);
+  // expected-error@-1 {{2nd argument must be a floating point type (was 'bool')}}
+}
+
+float builtin_lerp_int_to_float_promotion(float p0, int p1) {
+  return __builtin_hlsl_lerp(p0, p0, p1);
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'int')}}
+}
+
+float4 test_lerp_int4(int4 p0, int4 p1, int4 p2) {
+  return __builtin_hlsl_lerp(p0, p1, p2);
+  // expected-error@-1 {{1st argument must be a floating point type (was 'int4' (aka 'vector<int, 4>'))}}
+}
+
+// note: DefaultVariadicArgumentPromotion --> DefaultArgumentPromotion has already promoted to double
+// we don't know anymore that the input was half when __builtin_hlsl_lerp is called so we default to float
+// for expected type
+half builtin_lerp_half_scalar (half p0) {
+  return __builtin_hlsl_lerp ( p0, p0, p0 );
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
+
+float builtin_lerp_float_scalar ( float p0) {
+  return __builtin_hlsl_lerp ( p0, p0, p0 );
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
index 636910b7ac8a..5dfbc23f8def 100644
--- a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
@@ -1,86 +1,86 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
-
-float2 test_no_second_arg(float2 p0) {
-  return __builtin_hlsl_mad(p0);
-  // expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
-}
-
-float2 test_no_third_arg(float2 p0) {
-  return __builtin_hlsl_mad(p0, p0);
-  // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
-}
-
-float2 test_too_many_arg(float2 p0) {
-  return __builtin_hlsl_mad(p0, p0, p0, p0);
-  // expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
-}
-
-float2 test_mad_no_second_arg(float2 p0) {
-  return mad(p0);
-  // expected-error@-1 {{no matching function for call to 'mad'}}
-}
-
-float2 test_mad_vector_size_mismatch(float3 p0, float2 p1) {
-  return mad(p0, p0, p1);
-  // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}}
-}
-
-float2 test_mad_builtin_vector_size_mismatch(float3 p0, float2 p1) {
-  return __builtin_hlsl_mad(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must have the same type}}
-}
-
-float test_mad_scalar_mismatch(float p0, half p1) {
-  return mad(p1, p0, p1);
-  // expected-error@-1 {{call to 'mad' is ambiguous}}
-}
-
-float2 test_mad_element_type_mismatch(half2 p0, float2 p1) {
-  return mad(p1, p0, p1);
-  // expected-error@-1 {{call to 'mad' is ambiguous}}
-}
-
-float2 test_builtin_mad_float2_splat(float p0, float2 p1) {
-  return __builtin_hlsl_mad(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}}
-}
-
-float3 test_builtin_mad_float3_splat(float p0, float3 p1) {
-  return __builtin_hlsl_mad(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}}
-}
-
-float4 test_builtin_mad_float4_splat(float p0, float4 p1) {
-  return __builtin_hlsl_mad(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}}
-}
-
-float2 test_mad_float2_int_splat(float2 p0, int p1) {
-  return __builtin_hlsl_mad(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}}
-}
-
-float3 test_mad_float3_int_splat(float3 p0, int p1) {
-  return __builtin_hlsl_mad(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}}
-}
-
-float2 test_builtin_mad_int_vect_to_float_vec_promotion(int2 p0, float p1) {
-  return __builtin_hlsl_mad(p0, p1, p1);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}}
-}
-
-float builtin_bool_to_float_type_promotion(float p0, bool p1) {
-  return __builtin_hlsl_mad(p0, p0, p1);
-  // expected-error@-1 {{3rd argument must be a floating point type (was 'bool')}}
-}
-
-float builtin_bool_to_float_type_promotion2(bool p0, float p1) {
-  return __builtin_hlsl_mad(p1, p0, p1);
-  // expected-error@-1 {{2nd argument must be a floating point type (was 'bool')}}
-}
-
-float builtin_mad_int_to_float_promotion(float p0, int p1) {
-  return __builtin_hlsl_mad(p0, p0, p1);
-  // expected-error@-1 {{3rd argument must be a floating point type (was 'int')}}
-}
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
+
+float2 test_no_second_arg(float2 p0) {
+  return __builtin_hlsl_mad(p0);
+  // expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
+}
+
+float2 test_no_third_arg(float2 p0) {
+  return __builtin_hlsl_mad(p0, p0);
+  // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
+}
+
+float2 test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_mad(p0, p0, p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
+}
+
+float2 test_mad_no_second_arg(float2 p0) {
+  return mad(p0);
+  // expected-error@-1 {{no matching function for call to 'mad'}}
+}
+
+float2 test_mad_vector_size_mismatch(float3 p0, float2 p1) {
+  return mad(p0, p0, p1);
+  // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}}
+}
+
+float2 test_mad_builtin_vector_size_mismatch(float3 p0, float2 p1) {
+  return __builtin_hlsl_mad(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must have the same type}}
+}
+
+float test_mad_scalar_mismatch(float p0, half p1) {
+  return mad(p1, p0, p1);
+  // expected-error@-1 {{call to 'mad' is ambiguous}}
+}
+
+float2 test_mad_element_type_mismatch(half2 p0, float2 p1) {
+  return mad(p1, p0, p1);
+  // expected-error@-1 {{call to 'mad' is ambiguous}}
+}
+
+float2 test_builtin_mad_float2_splat(float p0, float2 p1) {
+  return __builtin_hlsl_mad(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}}
+}
+
+float3 test_builtin_mad_float3_splat(float p0, float3 p1) {
+  return __builtin_hlsl_mad(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}}
+}
+
+float4 test_builtin_mad_float4_splat(float p0, float4 p1) {
+  return __builtin_hlsl_mad(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}}
+}
+
+float2 test_mad_float2_int_splat(float2 p0, int p1) {
+  return __builtin_hlsl_mad(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}}
+}
+
+float3 test_mad_float3_int_splat(float3 p0, int p1) {
+  return __builtin_hlsl_mad(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}}
+}
+
+float2 test_builtin_mad_int_vect_to_float_vec_promotion(int2 p0, float p1) {
+  return __builtin_hlsl_mad(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_mad' must be vectors}}
+}
+
+float builtin_bool_to_float_type_promotion(float p0, bool p1) {
+  return __builtin_hlsl_mad(p0, p0, p1);
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'bool')}}
+}
+
+float builtin_bool_to_float_type_promotion2(bool p0, float p1) {
+  return __builtin_hlsl_mad(p1, p0, p1);
+  // expected-error@-1 {{2nd argument must be a floating point type (was 'bool')}}
+}
+
+float builtin_mad_int_to_float_promotion(float p0, int p1) {
+  return __builtin_hlsl_mad(p0, p0, p1);
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'int')}}
+}
diff --git a/clang/test/SemaHLSL/OverloadResolutionBugs.hlsl b/clang/test/SemaHLSL/OverloadResolutionBugs.hlsl
index c13cb299127a..30de00063f54 100644
--- a/clang/test/SemaHLSL/OverloadResolutionBugs.hlsl
+++ b/clang/test/SemaHLSL/OverloadResolutionBugs.hlsl
@@ -4,15 +4,6 @@
 // https://github.com/llvm/llvm-project/issues/81047
 
 // expected-no-diagnostics
-void Fn3(double2 D);
-void Fn3(float2 F);
-
-void Call3(half2 H) { Fn3(H); }
-
-void Fn5(double2 D);
-
-void Call5(half2 H) { Fn5(H); }
-
 void Fn4(int64_t2 L);
 void Fn4(int2 I);
 
@@ -61,13 +52,12 @@ float test_frac_int(int p0) { return frac(p0); }
 
 float test_frac_bool(bool p0) { return frac(p0); }
 
-// https://github.com/llvm/llvm-project/issues/81049
+// This resolves the wrong overload. In clang this converts down to an int, in
+// DXC it extends the scalar to a vector.
+void Fn(int) {}
+void Fn(vector<int64_t,2>) {}
 
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
-// RUN:   dxil-pc-shadermodel6.2-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefix=NO_HALF
-
-half sqrt_h(half x) { return sqrt(x); }
-
-// NO_HALF: define noundef float @"?sqrt_h@@YA$halff@$halff@@Z"(
-// NO_HALF: call float @llvm.sqrt.f32(float %0)
+void Call() {
+  int64_t V;
+  Fn(V);
+}
diff --git a/clang/test/SemaHLSL/ScalarOverloadResolution.hlsl b/clang/test/SemaHLSL/ScalarOverloadResolution.hlsl
new file mode 100644
index 000000000000..41702ef17532
--- /dev/null
+++ b/clang/test/SemaHLSL/ScalarOverloadResolution.hlsl
@@ -0,0 +1,229 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -finclude-default-header -Wconversion -verify -o - %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -finclude-default-header -ast-dump %s | FileCheck %s
+
+// This test verifies floating point type implicit conversion ranks for overload
+// resolution. In HLSL the built-in type ranks are half < float < double. This
+// applies to both scalar and vector types.
+
+// HLSL allows implicit truncation fo types, so it differentiates between
+// promotions (converting to larger types) and conversions (converting to
+// smaller types). Promotions are preferred over conversions. Promotions prefer
+// promoting to the next lowest type in the ranking order. Conversions prefer
+// converting to the next highest type in the ranking order.
+
+void HalfFloatDouble(double D);
+void HalfFloatDouble(float F);
+void HalfFloatDouble(half H);
+
+// CHECK: FunctionDecl {{.*}} used HalfFloatDouble 'void (double)'
+// CHECK: FunctionDecl {{.*}} used HalfFloatDouble 'void (float)'
+// CHECK: FunctionDecl {{.*}} used HalfFloatDouble 'void (half)'
+
+void FloatDouble(double D);
+void FloatDouble(float F);
+
+// CHECK: FunctionDecl {{.*}} used FloatDouble 'void (double)'
+// CHECK: FunctionDecl {{.*}} used FloatDouble 'void (float)'
+
+void HalfDouble(double D);
+void HalfDouble(half H);
+
+// CHECK: FunctionDecl {{.*}} used HalfDouble 'void (double)'
+// CHECK: FunctionDecl {{.*}} used HalfDouble 'void (half)'
+
+void HalfFloat(float F);
+void HalfFloat(half H);
+
+// CHECK: FunctionDecl {{.*}} used HalfFloat 'void (float)'
+// CHECK: FunctionDecl {{.*}} used HalfFloat 'void (half)'
+
+void Double(double D);
+void Float(float F);
+void Half(half H);
+
+// CHECK: FunctionDecl {{.*}} used Double 'void (double)'
+// CHECK: FunctionDecl {{.*}} used Float 'void (float)'
+// CHECK: FunctionDecl {{.*}} used Half 'void (half)'
+
+
+// Case 1: A function declared with overloads for half float and double types.
+//   (a) When called with half, it will resolve to half because half is an exact
+//   match.
+//   (b) When called with float it will resolve to float because float is an
+//   exact match.
+//   (c) When called with double it will resolve to double because it is an
+//   exact match.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case1 'void (half, float, double)'
+void Case1(half H, float F, double D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half)' lvalue Function {{.*}} 'HalfFloatDouble' 'void (half)'
+  HalfFloatDouble(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float)' lvalue Function {{.*}} 'HalfFloatDouble' 'void (float)'
+  HalfFloatDouble(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double)' lvalue Function {{.*}} 'HalfFloatDouble' 'void (double)'
+  HalfFloatDouble(D);
+}
+
+// Case 2: A function declared with double and float overlaods.
+//   (a) When called with half, it will resolve to float because float is lower
+//   ranked than double.
+//   (b) When called with float it will resolve to float because float is an
+//   exact match.
+//   (c) When called with double it will resolve to double because it is an
+//   exact match.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case2 'void (half, float, double)'
+void Case2(half H, float F, double D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float)' lvalue Function {{.*}} 'FloatDouble' 'void (float)'
+  FloatDouble(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float)' lvalue Function {{.*}} 'FloatDouble' 'void (float)'
+  FloatDouble(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double)' lvalue Function {{.*}} 'FloatDouble' 'void (double)'
+  FloatDouble(D);
+}
+
+// Case 3: A function declared with half and double overloads
+//   (a) When called with half, it will resolve to half because it is an exact
+//   match.
+//   (b) When called with flaot, it will resolve to double because double is a
+//   valid promotion.
+//   (c) When called with double, it will resolve to double because it is an
+//   exact match.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case3 'void (half, float, double)'
+void Case3(half H, float F, double D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half)' lvalue Function {{.*}} 'HalfDouble' 'void (half)'
+  HalfDouble(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double)' lvalue Function {{.*}} 'HalfDouble' 'void (double)'
+  HalfDouble(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double)' lvalue Function {{.*}} 'HalfDouble' 'void (double)'
+  HalfDouble(D);
+}
+
+// Case 4: A function declared with half and float overloads.
+//   (a) When called with half, it will resolve to half because half is an exact
+//   match.
+//   (b) When called with float it will resolve to float because float is an
+//   exact match.
+//   (c) When called with double it will resolve to float because it is the
+//   float is higher rank than half.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case4 'void (half, float, double)'
+void Case4(half H, float F, double D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half)' lvalue Function {{.*}} 'HalfFloat' 'void (half)'
+  HalfFloat(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float)' lvalue Function {{.*}} 'HalfFloat' 'void (float)'
+  HalfFloat(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float)' lvalue Function {{.*}} 'HalfFloat' 'void (float)'
+  HalfFloat(D); // expected-warning{{implicit conversion loses floating-point precision: 'double' to 'float'}}
+}
+
+// Case 5: A function declared with only a double overload.
+//   (a) When called with half, it will resolve to double because double is a
+//   valid promotion.
+//   (b) When called with float it will resolve to double because double is a
+//   valid promotion.
+//   (c) When called with double it will resolve to double because it is an
+//   exact match.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case5 'void (half, float, double)'
+void Case5(half H, float F, double D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double)' lvalue Function {{.*}} 'Double' 'void (double)'
+  Double(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double)' lvalue Function {{.*}} 'Double' 'void (double)'
+  Double(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double)' lvalue Function {{.*}} 'Double' 'void (double)'
+  Double(D);
+}
+
+// Case 6: A function declared with only a float overload.
+//   (a) When called with half, it will resolve to float because float is a
+//   valid promotion.
+//   (b) When called with float it will resolve to float because float is an
+//   exact match.
+//   (c) When called with double it will resolve to float because it is a
+//   valid conversion.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case6 'void (half, float, double)'
+void Case6(half H, float F, double D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float)' lvalue Function {{.*}} 'Float' 'void (float)'
+  Float(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float)' lvalue Function {{.*}} 'Float' 'void (float)'
+  Float(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float)' lvalue Function {{.*}} 'Float' 'void (float)'
+  Float(D); // expected-warning{{implicit conversion loses floating-point precision: 'double' to 'float'}}
+}
+
+// Case 7: A function declared with only a half overload.
+//   (a) When called with half, it will resolve to half because half is an
+//   exact match
+//   (b) When called with float it will resolve to half because half is a
+//   valid conversion.
+//   (c) When called with double it will resolve to float because it is a
+//   valid conversion.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case7 'void (half, float, double)'
+void Case7(half H, float F, double D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half)' lvalue Function {{.*}} 'Half' 'void (half)'
+  Half(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half)' lvalue Function {{.*}} 'Half' 'void (half)'
+  Half(F); // expected-warning{{implicit conversion loses floating-point precision: 'float' to 'half'}}
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half)' lvalue Function {{.*}} 'Half' 'void (half)'
+  Half(D); // expected-warning{{implicit conversion loses floating-point precision: 'double' to 'half'}}
+}
diff --git a/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl b/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl
new file mode 100644
index 000000000000..7de467469993
--- /dev/null
+++ b/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -verify %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -verify -fnative-half-type %s
+// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -verify %s
+// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -verify -fnative-half-type %s
+
+// expected-no-diagnostics
+#ifdef __HLSL_ENABLE_16_BIT
+_Static_assert(sizeof(half) == 2, "half is 2 bytes");
+#else
+_Static_assert(sizeof(half) == 4, "half is 4 bytes");
+#endif
diff --git a/clang/test/SemaHLSL/literal_suffixes.hlsl b/clang/test/SemaHLSL/Types/Arithmetic/literal_suffixes.hlsl
index 25a4d3b5103c..91324e57ce69 100644
--- a/clang/test/SemaHLSL/literal_suffixes.hlsl
+++ b/clang/test/SemaHLSL/Types/Arithmetic/literal_suffixes.hlsl
@@ -49,10 +49,7 @@ struct is_same<T, T> {
   static const bool value = true;
 };
 
-// The no-suffix behavior is currently wrong. The behavior in DXC is complicated
-// and undocumented. We have a language change planned to address this, and an
-// issue tracking: https://github.com/llvm/llvm-project/issues/85714.
-_Static_assert(is_same<double, __decltype(1.0)>::value, "1.0f literal is double (should be float)");
+_Static_assert(is_same<float, __decltype(1.0)>::value, "1.0 literal is float");
 
 _Static_assert(is_same<half, __decltype(1.0h)>::value, "1.0h literal is half");
 _Static_assert(is_same<float, __decltype(1.0f)>::value, "1.0f literal is float");
diff --git a/clang/test/SemaHLSL/Types/Arithmetic/literal_suffixes_202x.hlsl b/clang/test/SemaHLSL/Types/Arithmetic/literal_suffixes_202x.hlsl
new file mode 100644
index 000000000000..2aeb4047565d
--- /dev/null
+++ b/clang/test/SemaHLSL/Types/Arithmetic/literal_suffixes_202x.hlsl
@@ -0,0 +1,115 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -finclude-default-header -verify %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -finclude-default-header -verify -fnative-half-type %s
+// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -finclude-default-header -verify %s
+// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -finclude-default-header -verify -fnative-half-type %s
+
+// This test is adapted from the test in DXC:
+// tools/clang/test/SemaHLSL/v202x/conforming-literals/valid-literals.hlsl
+
+template <typename T, typename U>
+struct is_same {
+  static const bool value = false;
+};
+
+template <typename T>
+struct is_same<T, T> {
+  static const bool value = true;
+};
+
+bool B; // Used for ternary operator tests below
+
+////////////////////////////////////////////////////////////////////////////////
+// Literals Without Suffixes
+////////////////////////////////////////////////////////////////////////////////
+
+_Static_assert(is_same<__decltype(1.0), float>::value, "Literals are now float");
+
+_Static_assert(is_same<__decltype(0), int>::value, "0 is int");
+_Static_assert(is_same<__decltype(1), int>::value, "1 is int");
+
+// Decimal literals are always signed.
+_Static_assert(is_same<__decltype(2147483647), int>::value, "2147483647 is int");
+_Static_assert(is_same<__decltype(2147483648), int64_t>::value, "2147483648 is int64_t");
+_Static_assert(is_same<__decltype(4294967296), int64_t>::value, "4294967296 is int64_t");
+
+// This is an anomaly that exists in C as well as HLSL. This value can't be
+// represented as a signed integer, but base-10 literals are always signed.
+// Clang emits a warning that it is interpreting it as unsigned because that is
+// not conforming to the C standard.
+
+// expected-warning@+1{{integer literal is too large to be represented in type 'long' and is subject to undefined behavior under C++98, interpreting as 'unsigned long'; this literal will be ill-formed in C++11 onwards}}
+static const uint64_t V = 9223372036854775808;
+
+_Static_assert(is_same<__decltype(0x0), int>::value, "0x0 is int");
+_Static_assert(is_same<__decltype(0x70000000), int>::value, "0x70000000 is int");
+_Static_assert(is_same<__decltype(0xF0000000), uint>::value, "0xF0000000 is uint");
+
+_Static_assert(is_same<__decltype(0x7000000000000000), int64_t>::value, "0x7000000000000000 is int64_t");
+_Static_assert(is_same<__decltype(0xF000000000000000), uint64_t>::value, "0xF000000000000000 is uint64_t");
+
+////////////////////////////////////////////////////////////////////////////////
+// Integer literals With Suffixes
+////////////////////////////////////////////////////////////////////////////////
+
+_Static_assert(is_same<__decltype(1l), int64_t>::value, "1l is int64_t");
+_Static_assert(is_same<__decltype(1ul), uint64_t>::value, "1ul is uint64_t");
+_Static_assert(is_same<__decltype(1lu), uint64_t>::value, "1lu is uint64_t");
+
+// HLSL 2021 does not define a `long long` type, so the suffix should be
+// invalid.
+_Static_assert(is_same<__decltype(1ll), int64_t>::value, "1ll is int64_t");
+_Static_assert(is_same<__decltype(1ull), uint64_t>::value, "1ull is uint64_t");
+_Static_assert(is_same<__decltype(1llu), uint64_t>::value, "1llu is uint64_t");
+
+// Verify that the size of `long long` is the same as the size of `int64_t`.
+_Static_assert(sizeof(__decltype(1ll)) == sizeof(int64_t), "sizeof(1ll) == sizeof(int64_t)");
+_Static_assert(sizeof(__decltype(1llu)) == sizeof(uint64_t), "sizeof(1llu) == sizeof(uint64_t)");
+
+////////////////////////////////////////////////////////////////////////////////
+// Ternary operators on integer literals
+////////////////////////////////////////////////////////////////////////////////
+
+_Static_assert(is_same<__decltype(B ? 1 : 1), int>::value, "B ? 1 : 1 is int");
+
+_Static_assert(is_same<__decltype(B ? 1l : 1), int64_t>::value, "B ? 1l : 1 is int64_t");
+_Static_assert(is_same<__decltype(B ? 1 : 1l), int64_t>::value, "B ? 1 : 1l is int64_t");
+
+_Static_assert(is_same<__decltype(B ? 1ul : 1), uint64_t>::value, "B ? 1ul : 1 is uint64_t");
+_Static_assert(is_same<__decltype(B ? 1 : 1ul), uint64_t>::value, "B ? 1 : 1ul is uint64_t");
+
+////////////////////////////////////////////////////////////////////////////////
+// Floating point literals With Suffixes
+////////////////////////////////////////////////////////////////////////////////
+
+_Static_assert(is_same<__decltype(1.0h), half>::value, "1.0h is half");
+_Static_assert(is_same<__decltype(1.0f), float>::value, "1.0f is float");
+_Static_assert(is_same<__decltype(1.0l), double>::value, "1.0l is double");
+
+////////////////////////////////////////////////////////////////////////////////
+// Ternary operators on floating point literals
+////////////////////////////////////////////////////////////////////////////////
+
+_Static_assert(is_same<__decltype(B ? 1.0 : 1.0), float>::value, "B ? 1.0 : 1.0 is float");
+
+_Static_assert(is_same<__decltype(B ? 1.0l : 1.0l), double>::value, "B ? 1.0l : 1.0l is double");
+_Static_assert(is_same<__decltype(B ? 1.0f : 1.0f), float>::value, "B ? 1.0f : 1.0f is float");
+
+
+_Static_assert(is_same<__decltype(B ? 1.0f : 1.0l), double>::value, "B ? 1.0f : 1.0l is double");
+_Static_assert(is_same<__decltype(B ? 1.0l : 1.0f), double>::value, "B ? 1.0l : 1.0f is double");
+
+_Static_assert(is_same<__decltype(B ? 1.0l : 1.0), double>::value, "B ? 1.0l : 1.0 is double");
+_Static_assert(is_same<__decltype(B ? 1.0 : 1.0l), double>::value, "B ? 1.0 : 1.0l is double");
+_Static_assert(is_same<__decltype(B ? 1.0f : 1.0), float>::value, "B ? 1.0f : 1.0 is float");
+_Static_assert(is_same<__decltype(B ? 1.0 : 1.0f), float>::value, "B ? 1.0 : 1.0f is float");
+
+_Static_assert(is_same<__decltype(B ? 1.0h : 1.0h), half>::value, "B ? 1.0h : 1.0h is half");
+
+_Static_assert(is_same<__decltype(B ? 1.0f : 1.0h), float>::value, "B ? 1.0f : 1.0h is float");
+_Static_assert(is_same<__decltype(B ? 1.0h : 1.0f), float>::value, "B ? 1.0h : 1.0f is float");
+
+_Static_assert(is_same<__decltype(B ? 1.0l : 1.0h), double>::value, "B ? 1.0l : 1.0h is double");
+_Static_assert(is_same<__decltype(B ? 1.0h : 1.0l), double>::value, "B ? 1.0h : 1.0l is double");
+
+_Static_assert(is_same<__decltype(B ? 1.0h : 1.0), float>::value, "B ? 1.0h : 1.0 is float");
+_Static_assert(is_same<__decltype(B ? 1.0 : 1.0h), float>::value, "B ? 1.0 : 1.0h is float");
diff --git a/clang/test/SemaHLSL/literal_suffixes_no_16bit.hlsl b/clang/test/SemaHLSL/Types/Arithmetic/literal_suffixes_no_16bit.hlsl
index 73e57041329e..f7e3e6ba577d 100644
--- a/clang/test/SemaHLSL/literal_suffixes_no_16bit.hlsl
+++ b/clang/test/SemaHLSL/Types/Arithmetic/literal_suffixes_no_16bit.hlsl
@@ -49,10 +49,7 @@ struct is_same<T, T> {
   static const bool value = true;
 };
 
-// The no-suffix behavior is currently wrong. The behavior in DXC is complicated
-// and undocumented. We have a language change planned to address this, and an
-// issue tracking: https://github.com/llvm/llvm-project/issues/85714.
-_Static_assert(is_same<double, __decltype(1.0)>::value, "1.0f literal is double (should be float)");
+_Static_assert(is_same<float, __decltype(1.0)>::value, "1.0 literal is float");
 
 _Static_assert(is_same<half, __decltype(1.0h)>::value, "1.0h literal is half");
 _Static_assert(is_same<float, __decltype(1.0f)>::value, "1.0f literal is float");
diff --git a/clang/test/SemaHLSL/Types/BuiltinVector/ScalarSwizzles.hlsl b/clang/test/SemaHLSL/Types/BuiltinVector/ScalarSwizzles.hlsl
index a2e9a5f865ec..4fa04f3d5988 100644
--- a/clang/test/SemaHLSL/Types/BuiltinVector/ScalarSwizzles.hlsl
+++ b/clang/test/SemaHLSL/Types/BuiltinVector/ScalarSwizzles.hlsl
@@ -54,7 +54,7 @@ vector<uint64_t,4> FillOneUnsignedLong(){
 // CHECK-NEXT: FloatingLiteral {{.*}} 'double' 2.500000e+00
 
 double2 FillTwoPointFive(){
-  return 2.5.rr;
+  return 2.5l.rr;
 }
 
 // CHECK-LABEL: FillOneHalf
@@ -63,7 +63,7 @@ double2 FillTwoPointFive(){
 // CHECK-NEXT: FloatingLiteral {{.*}} 'double' 5.000000e-01
 
 double3 FillOneHalf(){
-  return .5.rrr;
+  return .5l.rrr;
 }
 
 // CHECK-LABEL: FillTwoPointFiveFloat
@@ -119,5 +119,14 @@ int64_t4 HooBoy() {
 // CHECK-NEXT: FloatingLiteral {{.*}} 'double' 1.000000e+00
 
 float3 AllRighty() {
+  return 1.l.rrr;
+}
+
+// CHECK-LABEL: AllRighty2
+// CHECK: ExtVectorElementExpr {{.*}} 'float __attribute__((ext_vector_type(3)))' rrr
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float __attribute__((ext_vector_type(1)))' <VectorSplat>
+// CHECK-NEXT: FloatingLiteral {{.*}} 'float' 1.000000e+00
+
+float3 AllRighty2() {
   return 1..rrr;
 }
diff --git a/clang/test/SemaHLSL/VectorElementOverloadResolution.hlsl b/clang/test/SemaHLSL/VectorElementOverloadResolution.hlsl
new file mode 100644
index 000000000000..12575084ead2
--- /dev/null
+++ b/clang/test/SemaHLSL/VectorElementOverloadResolution.hlsl
@@ -0,0 +1,228 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -finclude-default-header -Wconversion -verify -o - %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -finclude-default-header -Wno-conversion -ast-dump %s | FileCheck %s
+
+// This test verifies floating point type implicit conversion ranks for overload
+// resolution. In HLSL the built-in type ranks are half < float < double. This
+// applies to both scalar and vector types.
+
+// HLSL allows implicit truncation fo types, so it differentiates between
+// promotions (converting to larger types) and conversions (converting to
+// smaller types). Promotions are preferred over conversions. Promotions prefer
+// promoting to the next lowest type in the ranking order. Conversions prefer
+// converting to the next highest type in the ranking order.
+
+void HalfFloatDouble(double2 D);
+void HalfFloatDouble(float2 F);
+void HalfFloatDouble(half2 H);
+
+// CHECK: FunctionDecl {{.*}} used HalfFloatDouble 'void (double2)'
+// CHECK: FunctionDecl {{.*}} used HalfFloatDouble 'void (float2)'
+// CHECK: FunctionDecl {{.*}} used HalfFloatDouble 'void (half2)'
+
+void FloatDouble(double2 D);
+void FloatDouble(float2 F);
+
+// CHECK: FunctionDecl {{.*}} used FloatDouble 'void (double2)'
+// CHECK: FunctionDecl {{.*}} used FloatDouble 'void (float2)'
+
+void HalfDouble(double2 D);
+void HalfDouble(half2 H);
+
+// CHECK: FunctionDecl {{.*}} used HalfDouble 'void (double2)'
+// CHECK: FunctionDecl {{.*}} used HalfDouble 'void (half2)'
+
+void HalfFloat(float2 F);
+void HalfFloat(half2 H);
+
+// CHECK: FunctionDecl {{.*}} used HalfFloat 'void (float2)'
+// CHECK: FunctionDecl {{.*}} used HalfFloat 'void (half2)'
+
+void Double(double2 D);
+void Float(float2 F);
+void Half(half2 H);
+
+// CHECK: FunctionDecl {{.*}} used Double 'void (double2)'
+// CHECK: FunctionDecl {{.*}} used Float 'void (float2)'
+// CHECK: FunctionDecl {{.*}} used Half 'void (half2)'
+
+// Case 1: A function declared with overloads for half float and double types.
+//   (a) When called with half, it will resolve to half because half is an exact
+//   match.
+//   (b) When called with float it will resolve to float because float is an
+//   exact match.
+//   (c) When called with double it will resolve to double because it is an
+//   exact match.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case1 'void (half2, float2, double2)'
+void Case1(half2 H, float2 F, double2 D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half2)' lvalue Function {{.*}} 'HalfFloatDouble' 'void (half2)'
+  HalfFloatDouble(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2)' lvalue Function {{.*}} 'HalfFloatDouble' 'void (float2)'
+  HalfFloatDouble(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double2)' lvalue Function {{.*}} 'HalfFloatDouble' 'void (double2)'
+  HalfFloatDouble(D);
+}
+
+// Case 2: A function declared with double and float overlaods.
+//   (a) When called with half, it will resolve to float because float is lower
+//   ranked than double.
+//   (b) When called with float it will resolve to float because float is an
+//   exact match.
+//   (c) When called with double it will resolve to double because it is an
+//   exact match.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case2 'void (half2, float2, double2)'
+void Case2(half2 H, float2 F, double2 D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2)' lvalue Function {{.*}} 'FloatDouble' 'void (float2)'
+  FloatDouble(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2)' lvalue Function {{.*}} 'FloatDouble' 'void (float2)'
+  FloatDouble(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double2)' lvalue Function {{.*}} 'FloatDouble' 'void (double2)'
+  FloatDouble(D);
+}
+
+// Case 3: A function declared with half and double overloads
+//   (a) When called with half, it will resolve to half because it is an exact
+//   match.
+//   (b) When called with flaot, it will resolve to double because double is a
+//   valid promotion.
+//   (c) When called with double, it will resolve to double because it is an
+//   exact match.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case3 'void (half2, float2, double2)'
+void Case3(half2 H, float2 F, double2 D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half2)' lvalue Function {{.*}} 'HalfDouble' 'void (half2)'
+  HalfDouble(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double2)' lvalue Function {{.*}} 'HalfDouble' 'void (double2)'
+  HalfDouble(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double2)' lvalue Function {{.*}} 'HalfDouble' 'void (double2)'
+  HalfDouble(D);
+}
+
+// Case 4: A function declared with half and float overloads.
+//   (a) When called with half, it will resolve to half because half is an exact
+//   match.
+//   (b) When called with float it will resolve to float because float is an
+//   exact match.
+//   (c) When called with double it will resolve to float because it is the
+//   float is higher rank than half.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case4 'void (half2, float2, double2)'
+void Case4(half2 H, float2 F, double2 D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half2)' lvalue Function {{.*}} 'HalfFloat' 'void (half2)'
+  HalfFloat(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2)' lvalue Function {{.*}} 'HalfFloat' 'void (float2)'
+  HalfFloat(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2)' lvalue Function {{.*}} 'HalfFloat' 'void (float2)'
+  HalfFloat(D); // expected-warning{{implicit conversion loses floating-point precision: 'double2' (aka 'vector<double, 2>') to 'float2' (aka 'vector<float, 2>')}}
+}
+
+// Case 5: A function declared with only a double overload.
+//   (a) When called with half, it will resolve to double because double is a
+//   valid promotion.
+//   (b) When called with float it will resolve to double because double is a
+//   valid promotion.
+//   (c) When called with double it will resolve to double because it is an
+//   exact match.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case5 'void (half2, float2, double2)'
+void Case5(half2 H, float2 F, double2 D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double2)' lvalue Function {{.*}} 'Double' 'void (double2)'
+  Double(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double2)' lvalue Function {{.*}} 'Double' 'void (double2)'
+  Double(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(double2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (double2)' lvalue Function {{.*}} 'Double' 'void (double2)'
+  Double(D);
+}
+
+// Case 6: A function declared with only a float overload.
+//   (a) When called with half, it will resolve to float because float is a
+//   valid promotion.
+//   (b) When called with float it will resolve to float because float is an
+//   exact match.
+//   (c) When called with double it will resolve to float because it is a
+//   valid conversion.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case6 'void (half2, float2, double2)'
+void Case6(half2 H, float2 F, double2 D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2)' lvalue Function {{.*}} 'Float' 'void (float2)'
+  Float(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2)' lvalue Function {{.*}} 'Float' 'void (float2)'
+  Float(F);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2)' lvalue Function {{.*}} 'Float' 'void (float2)'
+  Float(D); // expected-warning{{implicit conversion loses floating-point precision: 'double2' (aka 'vector<double, 2>') to 'float2' (aka 'vector<float, 2>')}}
+}
+
+// Case 7: A function declared with only a half overload.
+//   (a) When called with half, it will resolve to half because half is an
+//   exact match
+//   (b) When called with float it will resolve to half because half is a
+//   valid conversion.
+//   (c) When called with double it will resolve to float because it is a
+//   valid conversion.
+
+// CHECK-LABEL: FunctionDecl {{.*}} Case7 'void (half2, float2, double2)'
+void Case7(half2 H, float2 F, double2 D) {
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half2)' lvalue Function {{.*}} 'Half' 'void (half2)'
+  Half(H);
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half2)' lvalue Function {{.*}} 'Half' 'void (half2)'
+  Half(F); // expected-warning{{implicit conversion loses floating-point precision: 'float2' (aka 'vector<float, 2>') to 'half2' (aka 'vector<half, 2>')}}
+
+  // CHECK: CallExpr {{.*}} 'void'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(half2)' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr {{.*}} 'void (half2)' lvalue Function {{.*}} 'Half' 'void (half2)'
+  Half(D); // expected-warning{{implicit conversion loses floating-point precision: 'double2' (aka 'vector<double, 2>') to 'half2' (aka 'vector<half, 2>')}}
+}
diff --git a/clang/test/SemaHLSL/packoffset-invalid.hlsl b/clang/test/SemaHLSL/packoffset-invalid.hlsl
new file mode 100644
index 000000000000..526a511edf1f
--- /dev/null
+++ b/clang/test/SemaHLSL/packoffset-invalid.hlsl
@@ -0,0 +1,122 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -fnative-half-type -verify %s
+
+// expected-warning@+1{{cannot mix packoffset elements with nonpackoffset elements in a cbuffer}}
+cbuffer Mix
+{
+    float4 M1 : packoffset(c0);
+    float M2;
+    float M3 : packoffset(c1.y);
+}
+
+// expected-warning@+1{{cannot mix packoffset elements with nonpackoffset elements in a cbuffer}}
+cbuffer Mix2
+{
+    float4 M4;
+    float M5 : packoffset(c1.y);
+    float M6 ;
+}
+
+// expected-error@+1{{attribute 'packoffset' only applies to shader constant in a constant buffer}}
+float4 g : packoffset(c0);
+
+cbuffer IllegalOffset
+{
+    // expected-error@+1{{invalid resource class specifier 't2' for packoffset, expected 'c'}}
+    float4 i1 : packoffset(t2);
+    // expected-error@+1{{invalid component 'm' used; expected 'x', 'y', 'z', or 'w'}}
+    float i2 : packoffset(c1.m);
+}
+
+cbuffer Overlap
+{
+    float4 o1 : packoffset(c0);
+    // expected-error@+1{{packoffset overlap between 'o2', 'o1'}}
+    float2 o2 : packoffset(c0.z);
+}
+
+cbuffer CrossReg
+{
+    // expected-error@+1{{packoffset cannot cross register boundary}}
+    float4 c1 : packoffset(c0.y);
+    // expected-error@+1{{packoffset cannot cross register boundary}}
+    float2 c2 : packoffset(c1.w);
+}
+
+struct ST {
+  float s;
+};
+
+cbuffer Aggregate
+{
+    // expected-error@+1{{packoffset cannot cross register boundary}}
+    ST A1 : packoffset(c0.y);
+    // expected-error@+1{{packoffset cannot cross register boundary}}
+    float A2[2] : packoffset(c1.w);
+}
+
+cbuffer Double {
+    // expected-error@+1{{packoffset at 'y' not match alignment 64 required by 'double'}}
+    double d : packoffset(c.y);
+    // expected-error@+1{{packoffset cannot cross register boundary}}
+	double2 d2 : packoffset(c.z);
+    // expected-error@+1{{packoffset cannot cross register boundary}}
+	double3 d3 : packoffset(c.z);
+}
+
+cbuffer ParsingFail {
+// expected-error@+1{{expected identifier}}
+float pf0 : packoffset();
+// expected-error@+1{{expected identifier}}
+float pf1 : packoffset((c0));
+// expected-error@+1{{expected ')'}}
+float pf2 : packoffset(c0, x);
+// expected-error@+1{{invalid component 'X' used}}
+float pf3 : packoffset(c.X);
+// expected-error@+1{{expected '(' after ''}}
+float pf4 : packoffset;
+// expected-error@+1{{expected identifier}}
+float pf5 : packoffset(;
+// expected-error@+1{{expected '(' after '}}
+float pf6 : packoffset);
+// expected-error@+1{{expected '(' after '}}
+float pf7 : packoffset c0.x;
+
+// expected-error@+1{{invalid component 'xy' used}}
+float pf8 : packoffset(c0.xy);
+// expected-error@+1{{invalid component 'rg' used}}
+float pf9 : packoffset(c0.rg);
+// expected-error@+1{{invalid component 'yes' used}}
+float pf10 : packoffset(c0.yes);
+// expected-error@+1{{invalid component 'woo'}}
+float pf11 : packoffset(c0.woo);
+// expected-error@+1{{invalid component 'xr' used}}
+float pf12 : packoffset(c0.xr);
+}
+
+struct ST2 {
+  float a;
+  float2 b;
+};
+
+cbuffer S {
+  float S0 : packoffset(c0.y);
+  ST2 S1[2] : packoffset(c1);
+  // expected-error@+1{{packoffset overlap between 'S2', 'S1'}}
+  half2 S2 : packoffset(c1.w);
+  half2 S3 : packoffset(c2.w);
+}
+
+struct ST23 {
+  float s0;
+  ST2 s1;
+};
+
+cbuffer S2 {
+  float S20 : packoffset(c0.y);
+  ST2 S21 : packoffset(c1);
+  half2 S22 : packoffset(c2.w);
+  double S23[2] : packoffset(c3);
+  // expected-error@+1{{packoffset overlap between 'S24', 'S23'}}
+  float S24 : packoffset(c3.z);
+  float S25 : packoffset(c4.z);
+}
diff --git a/clang/test/SemaObjC/uninit-variables.m b/clang/test/SemaObjC/uninit-variables.m
index 34bc337a1614..7c02c6bc822f 100644
--- a/clang/test/SemaObjC/uninit-variables.m
+++ b/clang/test/SemaObjC/uninit-variables.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -Wuninitialized -fsyntax-only -fblocks %s -verify
+// RUN: %clang_cc1 -Wuninitialized -fsyntax-only -fblocks %s -verify
 
 #include <stdarg.h>
 
diff --git a/clang/test/SemaObjCXX/block-cleanup.mm b/clang/test/SemaObjCXX/block-cleanup.mm
index 53b2c224ab5e..56bbf952d967 100644
--- a/clang/test/SemaObjCXX/block-cleanup.mm
+++ b/clang/test/SemaObjCXX/block-cleanup.mm
@@ -1,16 +1,16 @@
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.11.0 -std=gnu++11 -o /dev/null -x objective-c++ -fblocks -ast-dump %s 2>&1 | FileCheck %s
-
-// CHECK:      -FunctionDecl {{.*}} test 'id ()'
-// CHECK-NEXT:   -CompoundStmt
-// CHECK-NEXT:     -ReturnStmt
-// CHECK-NEXT:       -ExprWithCleanups
-// CHECK-NEXT:         -cleanup Block
-// CHECK-NEXT:         -cleanup Block
-
-@interface NSDictionary
-+ (id)dictionaryWithObjects:(const id [])objects forKeys:(const id [])keys count:(unsigned long)cnt;
-@end
-
-id test() {
-  return @{@"a": [](){}, @"b": [](){}};
-}
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.11.0 -std=gnu++11 -o /dev/null -x objective-c++ -fblocks -ast-dump %s 2>&1 | FileCheck %s
+
+// CHECK:      -FunctionDecl {{.*}} test 'id ()'
+// CHECK-NEXT:   -CompoundStmt
+// CHECK-NEXT:     -ReturnStmt
+// CHECK-NEXT:       -ExprWithCleanups
+// CHECK-NEXT:         -cleanup Block
+// CHECK-NEXT:         -cleanup Block
+
+@interface NSDictionary
++ (id)dictionaryWithObjects:(const id [])objects forKeys:(const id [])keys count:(unsigned long)cnt;
+@end
+
+id test() {
+  return @{@"a": [](){}, @"b": [](){}};
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-async-clause.c b/clang/test/SemaOpenACC/compute-construct-async-clause.c
new file mode 100644
index 000000000000..a8af06bc0afd
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-async-clause.c
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+short getS();
+
+void Test() {
+#pragma acc parallel async
+  while(1);
+#pragma acc parallel async(1)
+  while(1);
+#pragma acc kernels async(1)
+  while(1);
+#pragma acc kernels async(-51)
+  while(1);
+
+#pragma acc serial async(1)
+  while(1);
+
+  // expected-error@+2{{expected ')'}}
+  // expected-note@+1{{to match this '('}}
+#pragma acc serial async(1, 2)
+  while(1);
+
+  struct NotConvertible{} NC;
+  // expected-error@+1{{OpenACC clause 'async' requires expression of integer type ('struct NotConvertible' invalid)}}
+#pragma acc parallel async(NC)
+  while(1);
+
+#pragma acc kernels async(getS())
+  while(1);
+
+  struct Incomplete *SomeIncomplete;
+
+  // expected-error@+1{{OpenACC clause 'async' requires expression of integer type ('struct Incomplete' invalid)}}
+#pragma acc kernels async(*SomeIncomplete)
+  while(1);
+
+  enum E{A} SomeE;
+
+#pragma acc kernels async(SomeE)
+  while(1);
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-async-clause.cpp b/clang/test/SemaOpenACC/compute-construct-async-clause.cpp
new file mode 100644
index 000000000000..a5da7c8f4e56
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-async-clause.cpp
@@ -0,0 +1,135 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+struct NotConvertible{} NC;
+struct Incomplete *SomeIncomplete; // #INCOMPLETE
+enum E{} SomeE;
+enum class E2{} SomeE2;
+
+struct CorrectConvert {
+  operator int();
+} Convert;
+
+struct ExplicitConvertOnly {
+  explicit operator int() const; // #EXPL_CONV
+} Explicit;
+
+struct AmbiguousConvert{
+  operator int(); // #AMBIG_INT
+  operator short(); // #AMBIG_SHORT
+  operator float();
+} Ambiguous;
+
+void Test() {
+#pragma acc parallel async
+  while(1);
+#pragma acc parallel async(1)
+  while(1);
+#pragma acc kernels async(-51)
+  while(1);
+
+  // expected-error@+1{{OpenACC clause 'async' requires expression of integer type ('struct NotConvertible' invalid}}
+#pragma acc parallel async(NC)
+  while(1);
+
+  // expected-error@+2{{OpenACC integer expression has incomplete class type 'struct Incomplete'}}
+  // expected-note@#INCOMPLETE{{forward declaration of 'Incomplete'}}
+#pragma acc kernels async(*SomeIncomplete)
+  while(1);
+
+#pragma acc parallel async(SomeE)
+  while(1);
+
+  // expected-error@+1{{OpenACC clause 'async' requires expression of integer type ('enum E2' invalid}}
+#pragma acc kernels async(SomeE2)
+  while(1);
+
+#pragma acc parallel async(Convert)
+  while(1);
+
+  // expected-error@+2{{OpenACC integer expression type 'struct ExplicitConvertOnly' requires explicit conversion to 'int'}}
+  // expected-note@#EXPL_CONV{{conversion to integral type 'int'}}
+#pragma acc kernels async(Explicit)
+  while(1);
+
+  // expected-error@+3{{multiple conversions from expression type 'struct AmbiguousConvert' to an integral type}}
+  // expected-note@#AMBIG_INT{{conversion to integral type 'int'}}
+  // expected-note@#AMBIG_SHORT{{conversion to integral type 'short'}}
+#pragma acc parallel async(Ambiguous)
+  while(1);
+}
+
+struct HasInt {
+  using IntTy = int;
+  using ShortTy = short;
+  static constexpr int value = 1;
+  static constexpr AmbiguousConvert ACValue;
+  static constexpr ExplicitConvertOnly EXValue;
+
+  operator char();
+};
+
+template<typename T>
+void TestInst() {
+
+  // expected-error@+1{{no member named 'Invalid' in 'HasInt'}}
+#pragma acc parallel async(HasInt::Invalid)
+  while (1);
+
+  // expected-error@+2{{no member named 'Invalid' in 'HasInt'}}
+  // expected-note@#INST{{in instantiation of function template specialization 'TestInst<HasInt>' requested here}}
+#pragma acc kernels async(T::Invalid)
+  while (1);
+
+  // expected-error@+3{{multiple conversions from expression type 'const AmbiguousConvert' to an integral type}}
+  // expected-note@#AMBIG_INT{{conversion to integral type 'int'}}
+  // expected-note@#AMBIG_SHORT{{conversion to integral type 'short'}}
+#pragma acc parallel async(HasInt::ACValue)
+  while (1);
+
+  // expected-error@+3{{multiple conversions from expression type 'const AmbiguousConvert' to an integral type}}
+  // expected-note@#AMBIG_INT{{conversion to integral type 'int'}}
+  // expected-note@#AMBIG_SHORT{{conversion to integral type 'short'}}
+#pragma acc kernels async(T::ACValue)
+  while (1);
+
+  // expected-error@+2{{OpenACC integer expression type 'const ExplicitConvertOnly' requires explicit conversion to 'int'}}
+  // expected-note@#EXPL_CONV{{conversion to integral type 'int'}}
+#pragma acc parallel async(HasInt::EXValue)
+  while (1);
+
+  // expected-error@+2{{OpenACC integer expression type 'const ExplicitConvertOnly' requires explicit conversion to 'int'}}
+  // expected-note@#EXPL_CONV{{conversion to integral type 'int'}}
+#pragma acc kernels async(T::EXValue)
+  while (1);
+
+#pragma acc parallel async(HasInt::value)
+  while (1);
+
+#pragma acc kernels async(T::value)
+  while (1);
+
+#pragma acc parallel async(HasInt::IntTy{})
+  while (1);
+
+#pragma acc kernels async(typename T::ShortTy{})
+  while (1);
+
+#pragma acc parallel async(HasInt::IntTy{})
+  while (1);
+
+#pragma acc kernels async(typename T::ShortTy{})
+  while (1);
+
+  HasInt HI{};
+  T MyT{};
+
+#pragma acc parallel async(HI)
+  while (1);
+
+#pragma acc kernels async(MyT)
+  while (1);
+}
+
+void Inst() {
+  TestInst<HasInt>(); // #INST
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-attach-clause.c b/clang/test/SemaOpenACC/compute-construct-attach-clause.c
new file mode 100644
index 000000000000..de735308528a
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-attach-clause.c
@@ -0,0 +1,61 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+struct S {
+  int IntMem;
+  int *PtrMem;
+};
+
+void uses() {
+  int LocalInt;
+  int *LocalPtr;
+  int Array[5];
+  int *PtrArray[5];
+  struct S s;
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int'}}
+#pragma acc parallel attach(LocalInt)
+  while (1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel attach(&LocalInt)
+  while (1);
+
+#pragma acc serial attach(LocalPtr)
+  while (1);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int[5]'}}
+#pragma acc kernels attach(Array)
+  while (1);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int'}}
+#pragma acc parallel attach(Array[0])
+  while (1);
+
+  // expected-error@+2{{OpenACC sub-array is not allowed here}}
+  // expected-note@+1{{expected variable of pointer type}}
+#pragma acc parallel attach(Array[0:1])
+  while (1);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int *[5]'}}
+#pragma acc parallel attach(PtrArray)
+  while (1);
+
+#pragma acc parallel attach(PtrArray[0])
+  while (1);
+
+  // expected-error@+2{{OpenACC sub-array is not allowed here}}
+  // expected-note@+1{{expected variable of pointer type}}
+#pragma acc parallel attach(PtrArray[0:1])
+  while (1);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'struct S'}}
+#pragma acc parallel attach(s)
+  while (1);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int'}}
+#pragma acc parallel attach(s.IntMem)
+  while (1);
+
+#pragma acc parallel attach(s.PtrMem)
+  while (1);
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-attach-clause.cpp b/clang/test/SemaOpenACC/compute-construct-attach-clause.cpp
new file mode 100644
index 000000000000..a89d346c2645
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-attach-clause.cpp
@@ -0,0 +1,120 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+struct S {
+  int IntMem;
+  int *PtrMem;
+  operator int*();
+};
+
+void uses() {
+  int LocalInt;
+  int *LocalPtr;
+  int Array[5];
+  int *PtrArray[5];
+  struct S s;
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int'}}
+#pragma acc parallel attach(LocalInt)
+  while (true);
+
+#pragma acc parallel attach(LocalPtr)
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int[5]'}}
+#pragma acc parallel attach(Array)
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int'}}
+#pragma acc parallel attach(Array[0])
+  while (true);
+
+  // expected-error@+2{{OpenACC sub-array is not allowed here}}
+  // expected-note@+1{{expected variable of pointer type}}
+#pragma acc parallel attach(Array[0:1])
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int *[5]'}}
+#pragma acc parallel attach(PtrArray)
+  while (true);
+
+#pragma acc parallel attach(PtrArray[0])
+  while (true);
+
+  // expected-error@+2{{OpenACC sub-array is not allowed here}}
+  // expected-note@+1{{expected variable of pointer type}}
+#pragma acc parallel attach(PtrArray[0:1])
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'struct S'}}
+#pragma acc parallel attach(s)
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int'}}
+#pragma acc parallel attach(s.IntMem)
+  while (true);
+
+#pragma acc parallel attach(s.PtrMem)
+  while (true);
+}
+
+template<typename T, typename TPtr, typename TStruct, auto &R1>
+void Templ() {
+  T SomeInt;
+  TPtr SomePtr;
+  T SomeIntArray[5];
+  TPtr SomeIntPtrArray[5];
+  TStruct SomeStruct;
+
+  // expected-error@+2{{expected pointer in 'attach' clause, type is 'int'}}
+  // expected-note@#INST{{in instantiation of function template specialization}}
+#pragma acc parallel attach(SomeInt)
+  while (true);
+
+#pragma acc parallel attach(SomePtr)
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int[5]'}}
+#pragma acc parallel attach(SomeIntArray)
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int'}}
+#pragma acc parallel attach(SomeIntArray[0])
+  while (true);
+
+  // expected-error@+2{{OpenACC sub-array is not allowed here}}
+  // expected-note@+1{{expected variable of pointer type}}
+#pragma acc parallel attach(SomeIntArray[0:1])
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int *[5]'}}
+#pragma acc parallel attach(SomeIntPtrArray)
+  while (true);
+
+#pragma acc parallel attach(SomeIntPtrArray[0])
+  while (true);
+
+  // expected-error@+2{{OpenACC sub-array is not allowed here}}
+  // expected-note@+1{{expected variable of pointer type}}
+#pragma acc parallel attach(SomeIntPtrArray[0:1])
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'S'}}
+#pragma acc parallel attach(SomeStruct)
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int'}}
+#pragma acc parallel attach(SomeStruct.IntMem)
+  while (true);
+
+#pragma acc parallel attach(SomeStruct.PtrMem)
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'attach' clause, type is 'int'}}
+#pragma acc parallel attach(R1)
+  while (true);
+}
+
+void inst() {
+  static constexpr int CEVar = 1;
+  Templ<int, int*, S, CEVar>(); // #INST
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-copy-clause.c b/clang/test/SemaOpenACC/compute-construct-copy-clause.c
new file mode 100644
index 000000000000..accbe43cea40
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-copy-clause.c
@@ -0,0 +1,62 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  void *PointerMember;
+} Complete;
+void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete CompositeParam) {
+  int LocalInt;
+  short *LocalPointer;
+  float LocalArray[5];
+  Complete LocalComposite;
+  // Check Appertainment:
+#pragma acc parallel copy(LocalInt)
+  while(1);
+#pragma acc serial copy(LocalInt)
+  while(1);
+#pragma acc kernels copy(LocalInt)
+  while(1);
+
+  // expected-warning@+1{{OpenACC clause name 'pcopy' is a deprecated clause name and is now an alias for 'copy'}}
+#pragma acc parallel pcopy(LocalInt)
+  while(1);
+
+  // expected-warning@+1{{OpenACC clause name 'present_or_copy' is a deprecated clause name and is now an alias for 'copy'}}
+#pragma acc parallel present_or_copy(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel copy(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel copy(LocalArray[2:1])
+  while(1);
+
+#pragma acc parallel copy(LocalComposite.ScalarMember, LocalComposite.ScalarMember)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copy(1 + IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copy(+IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel copy(PointerParam[2:])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel copy(ArrayParam[2:5])
+  while(1);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copy((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copy((float)ArrayParam[2])
+  while(1);
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-copy-clause.cpp b/clang/test/SemaOpenACC/compute-construct-copy-clause.cpp
new file mode 100644
index 000000000000..16e78a43026a
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-copy-clause.cpp
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+enum SomeE{};
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  SomeE EnumMember;
+  char *PointerMember;
+} Complete;
+
+void uses(int IntParam, char *PointerParam, float ArrayParam[5], Complete CompositeParam, int &IntParamRef) {
+  int LocalInt;
+  char *LocalPointer;
+  float LocalArray[5];
+  // Check Appertainment:
+#pragma acc parallel copy(LocalInt)
+  while(1);
+#pragma acc serial copy(LocalInt)
+  while(1);
+#pragma acc kernels copy(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel copy(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel copy(LocalArray[2:1])
+  while(1);
+
+  Complete LocalComposite2;
+#pragma acc parallel copy(LocalComposite2.ScalarMember, LocalComposite2.ScalarMember)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copy(1 + IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copy(+IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel copy(PointerParam[2:])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel copy(ArrayParam[2:5])
+  while(1);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copy((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copy((float)ArrayParam[2])
+  while(1);
+}
+
+template<typename T, unsigned I, typename V>
+void TemplUses(T t, T (&arrayT)[I], V TemplComp) {
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copy(+t)
+  while(true);
+
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#TEMPL_USES_INST{{in instantiation of}}
+#pragma acc parallel copy(I)
+  while(true);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copy(t, I)
+  while(true);
+
+#pragma acc parallel copy(arrayT)
+  while(true);
+
+#pragma acc parallel copy(TemplComp)
+  while(true);
+
+#pragma acc parallel copy(TemplComp.PointerMember[5])
+  while(true);
+ int *Pointer;
+#pragma acc parallel copy(Pointer[:I])
+  while(true);
+#pragma acc parallel copy(Pointer[:t])
+  while(true);
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel copy(Pointer[1:])
+  while(true);
+}
+
+template<unsigned I, auto &NTTP_REF>
+void NTTP() {
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#NTTP_INST{{in instantiation of}}
+#pragma acc parallel copy(I)
+  while(true);
+
+#pragma acc parallel copy(NTTP_REF)
+  while(true);
+}
+
+void Inst() {
+  static constexpr int NTTP_REFed = 1;
+  int i;
+  int Arr[5];
+  Complete C;
+  TemplUses(i, Arr, C); // #TEMPL_USES_INST
+  NTTP<5, NTTP_REFed>(); // #NTTP_INST
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-copyin-clause.c b/clang/test/SemaOpenACC/compute-construct-copyin-clause.c
new file mode 100644
index 000000000000..6f200b357f52
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-copyin-clause.c
@@ -0,0 +1,68 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  void *PointerMember;
+} Complete;
+void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete CompositeParam) {
+  int LocalInt;
+  short *LocalPointer;
+  float LocalArray[5];
+  Complete LocalComposite;
+  // Check Appertainment:
+#pragma acc parallel copyin(LocalInt)
+  while(1);
+#pragma acc serial copyin(LocalInt)
+  while(1);
+#pragma acc kernels copyin(LocalInt)
+  while(1);
+
+  // expected-warning@+1{{OpenACC clause name 'pcopyin' is a deprecated clause name and is now an alias for 'copyin'}}
+#pragma acc parallel pcopyin(LocalInt)
+  while(1);
+
+  // expected-warning@+1{{OpenACC clause name 'present_or_copyin' is a deprecated clause name and is now an alias for 'copyin'}}
+#pragma acc parallel present_or_copyin(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel copyin(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel copyin(LocalArray[2:1])
+  while(1);
+#pragma acc parallel copyin(readonly:LocalArray[2:1])
+  while(1);
+
+#pragma acc parallel copyin(LocalComposite.ScalarMember, LocalComposite.ScalarMember)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyin(1 + IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyin(+IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel copyin(PointerParam[2:])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel copyin(ArrayParam[2:5])
+  while(1);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyin((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyin((float)ArrayParam[2])
+  while(1);
+  // expected-error@+2{{invalid tag 'invalid' on 'copyin' clause}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyin(invalid:(float)ArrayParam[2])
+  while(1);
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-copyin-clause.cpp b/clang/test/SemaOpenACC/compute-construct-copyin-clause.cpp
new file mode 100644
index 000000000000..79275e701161
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-copyin-clause.cpp
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+enum SomeE{};
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  SomeE EnumMember;
+  char *PointerMember;
+} Complete;
+
+void uses(int IntParam, char *PointerParam, float ArrayParam[5], Complete CompositeParam, int &IntParamRef) {
+  int LocalInt;
+  char *LocalPointer;
+  float LocalArray[5];
+  // Check Appertainment:
+#pragma acc parallel copyin(LocalInt)
+  while(1);
+#pragma acc serial copyin(LocalInt)
+  while(1);
+#pragma acc kernels copyin(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel copyin(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel copyin(LocalArray[2:1])
+  while(1);
+
+  Complete LocalComposite2;
+#pragma acc parallel copyin(LocalComposite2.ScalarMember, LocalComposite2.ScalarMember)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyin(1 + IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyin(+IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel copyin(PointerParam[2:])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel copyin(ArrayParam[2:5])
+  while(1);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyin((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyin((float)ArrayParam[2])
+  while(1);
+}
+
+template<typename T, unsigned I, typename V>
+void TemplUses(T t, T (&arrayT)[I], V TemplComp) {
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyin(+t)
+  while(true);
+
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#TEMPL_USES_INST{{in instantiation of}}
+#pragma acc parallel copyin(I)
+  while(true);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyin(t, I)
+  while(true);
+
+#pragma acc parallel copyin(arrayT)
+  while(true);
+
+#pragma acc parallel copyin(TemplComp)
+  while(true);
+
+#pragma acc parallel copyin(TemplComp.PointerMember[5])
+  while(true);
+ int *Pointer;
+#pragma acc parallel copyin(Pointer[:I])
+  while(true);
+#pragma acc parallel copyin(Pointer[:t])
+  while(true);
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel copyin(Pointer[1:])
+  while(true);
+}
+
+template<unsigned I, auto &NTTP_REF>
+void NTTP() {
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#NTTP_INST{{in instantiation of}}
+#pragma acc parallel copyin(I)
+  while(true);
+
+#pragma acc parallel copyin(NTTP_REF)
+  while(true);
+}
+
+void Inst() {
+  static constexpr int NTTP_REFed = 1;
+  int i;
+  int Arr[5];
+  Complete C;
+  TemplUses(i, Arr, C); // #TEMPL_USES_INST
+  NTTP<5, NTTP_REFed>(); // #NTTP_INST
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-copyout-clause.c b/clang/test/SemaOpenACC/compute-construct-copyout-clause.c
new file mode 100644
index 000000000000..38a50f8373e8
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-copyout-clause.c
@@ -0,0 +1,68 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  void *PointerMember;
+} Complete;
+void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete CompositeParam) {
+  int LocalInt;
+  short *LocalPointer;
+  float LocalArray[5];
+  Complete LocalComposite;
+  // Check Appertainment:
+#pragma acc parallel copyout(LocalInt)
+  while(1);
+#pragma acc serial copyout(LocalInt)
+  while(1);
+#pragma acc kernels copyout(LocalInt)
+  while(1);
+
+  // expected-warning@+1{{OpenACC clause name 'pcopyout' is a deprecated clause name and is now an alias for 'copyout'}}
+#pragma acc parallel pcopyout(LocalInt)
+  while(1);
+
+  // expected-warning@+1{{OpenACC clause name 'present_or_copyout' is a deprecated clause name and is now an alias for 'copyout'}}
+#pragma acc parallel present_or_copyout(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel copyout(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel copyout(LocalArray[2:1])
+  while(1);
+#pragma acc parallel copyout(zero:LocalArray[2:1])
+  while(1);
+
+#pragma acc parallel copyout(LocalComposite.ScalarMember, LocalComposite.ScalarMember)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyout(1 + IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyout(+IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel copyout(PointerParam[2:])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel copyout(ArrayParam[2:5])
+  while(1);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyout((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyout((float)ArrayParam[2])
+  while(1);
+  // expected-error@+2{{invalid tag 'invalid' on 'copyout' clause}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyout(invalid:(float)ArrayParam[2])
+  while(1);
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-copyout-clause.cpp b/clang/test/SemaOpenACC/compute-construct-copyout-clause.cpp
new file mode 100644
index 000000000000..3d05a5670092
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-copyout-clause.cpp
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+enum SomeE{};
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  SomeE EnumMember;
+  char *PointerMember;
+} Complete;
+
+void uses(int IntParam, char *PointerParam, float ArrayParam[5], Complete CompositeParam, int &IntParamRef) {
+  int LocalInt;
+  char *LocalPointer;
+  float LocalArray[5];
+  // Check Appertainment:
+#pragma acc parallel copyout(LocalInt)
+  while(1);
+#pragma acc serial copyout(LocalInt)
+  while(1);
+#pragma acc kernels copyout(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel copyout(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel copyout(LocalArray[2:1])
+  while(1);
+
+  Complete LocalComposite2;
+#pragma acc parallel copyout(LocalComposite2.ScalarMember, LocalComposite2.ScalarMember)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyout(1 + IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyout(+IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel copyout(PointerParam[2:])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel copyout(ArrayParam[2:5])
+  while(1);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyout((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyout((float)ArrayParam[2])
+  while(1);
+}
+
+template<typename T, unsigned I, typename V>
+void TemplUses(T t, T (&arrayT)[I], V TemplComp) {
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyout(+t)
+  while(true);
+
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#TEMPL_USES_INST{{in instantiation of}}
+#pragma acc parallel copyout(I)
+  while(true);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel copyout(t, I)
+  while(true);
+
+#pragma acc parallel copyout(arrayT)
+  while(true);
+
+#pragma acc parallel copyout(TemplComp)
+  while(true);
+
+#pragma acc parallel copyout(TemplComp.PointerMember[5])
+  while(true);
+ int *Pointer;
+#pragma acc parallel copyout(Pointer[:I])
+  while(true);
+#pragma acc parallel copyout(Pointer[:t])
+  while(true);
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel copyout(Pointer[1:])
+  while(true);
+}
+
+template<unsigned I, auto &NTTP_REF>
+void NTTP() {
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#NTTP_INST{{in instantiation of}}
+#pragma acc parallel copyout(I)
+  while(true);
+
+#pragma acc parallel copyout(NTTP_REF)
+  while(true);
+}
+
+void Inst() {
+  static constexpr int NTTP_REFed = 1;
+  int i;
+  int Arr[5];
+  Complete C;
+  TemplUses(i, Arr, C); // #TEMPL_USES_INST
+  NTTP<5, NTTP_REFed>(); // #NTTP_INST
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-create-clause.c b/clang/test/SemaOpenACC/compute-construct-create-clause.c
new file mode 100644
index 000000000000..9c94e3a1a407
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-create-clause.c
@@ -0,0 +1,69 @@
+
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  void *PointerMember;
+} Complete;
+void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete CompositeParam) {
+  int LocalInt;
+  short *LocalPointer;
+  float LocalArray[5];
+  Complete LocalComposite;
+  // Check Appertainment:
+#pragma acc parallel create(LocalInt)
+  while(1);
+#pragma acc serial create(LocalInt)
+  while(1);
+#pragma acc kernels create(LocalInt)
+  while(1);
+
+  // expected-warning@+1{{OpenACC clause name 'pcreate' is a deprecated clause name and is now an alias for 'create'}}
+#pragma acc parallel pcreate(LocalInt)
+  while(1);
+
+  // expected-warning@+1{{OpenACC clause name 'present_or_create' is a deprecated clause name and is now an alias for 'create'}}
+#pragma acc parallel present_or_create(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel create(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel create(LocalArray[2:1])
+  while(1);
+#pragma acc parallel create(zero:LocalArray[2:1])
+  while(1);
+
+#pragma acc parallel create(LocalComposite.ScalarMember, LocalComposite.ScalarMember)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel create(1 + IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel create(+IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel create(PointerParam[2:])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel create(ArrayParam[2:5])
+  while(1);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel create((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel create((float)ArrayParam[2])
+  while(1);
+  // expected-error@+2{{invalid tag 'invalid' on 'create' clause}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel create(invalid:(float)ArrayParam[2])
+  while(1);
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-create-clause.cpp b/clang/test/SemaOpenACC/compute-construct-create-clause.cpp
new file mode 100644
index 000000000000..d0323620b8f7
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-create-clause.cpp
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+enum SomeE{};
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  SomeE EnumMember;
+  char *PointerMember;
+} Complete;
+
+void uses(int IntParam, char *PointerParam, float ArrayParam[5], Complete CompositeParam, int &IntParamRef) {
+  int LocalInt;
+  char *LocalPointer;
+  float LocalArray[5];
+  // Check Appertainment:
+#pragma acc parallel create(LocalInt)
+  while(1);
+#pragma acc serial create(LocalInt)
+  while(1);
+#pragma acc kernels create(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel create(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel create(LocalArray[2:1])
+  while(1);
+
+  Complete LocalComposite2;
+#pragma acc parallel create(LocalComposite2.ScalarMember, LocalComposite2.ScalarMember)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel create(1 + IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel create(+IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel create(PointerParam[2:])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel create(ArrayParam[2:5])
+  while(1);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel create((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel create((float)ArrayParam[2])
+  while(1);
+}
+
+template<typename T, unsigned I, typename V>
+void TemplUses(T t, T (&arrayT)[I], V TemplComp) {
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel create(+t)
+  while(true);
+
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#TEMPL_USES_INST{{in instantiation of}}
+#pragma acc parallel create(I)
+  while(true);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel create(t, I)
+  while(true);
+
+#pragma acc parallel create(arrayT)
+  while(true);
+
+#pragma acc parallel create(TemplComp)
+  while(true);
+
+#pragma acc parallel create(TemplComp.PointerMember[5])
+  while(true);
+ int *Pointer;
+#pragma acc parallel create(Pointer[:I])
+  while(true);
+#pragma acc parallel create(Pointer[:t])
+  while(true);
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel create(Pointer[1:])
+  while(true);
+}
+
+template<unsigned I, auto &NTTP_REF>
+void NTTP() {
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#NTTP_INST{{in instantiation of}}
+#pragma acc parallel create(I)
+  while(true);
+
+#pragma acc parallel create(NTTP_REF)
+  while(true);
+}
+
+void Inst() {
+  static constexpr int NTTP_REFed = 1;
+  int i;
+  int Arr[5];
+  Complete C;
+  TemplUses(i, Arr, C); // #TEMPL_USES_INST
+  NTTP<5, NTTP_REFed>(); // #NTTP_INST
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-deviceptr-clause.c b/clang/test/SemaOpenACC/compute-construct-deviceptr-clause.c
new file mode 100644
index 000000000000..e5d328eb0b28
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-deviceptr-clause.c
@@ -0,0 +1,61 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+struct S {
+  int IntMem;
+  int *PtrMem;
+};
+
+void uses() {
+  int LocalInt;
+  int *LocalPtr;
+  int Array[5];
+  int *PtrArray[5];
+  struct S s;
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int'}}
+#pragma acc parallel deviceptr(LocalInt)
+  while (1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel deviceptr(&LocalInt)
+  while (1);
+
+#pragma acc serial deviceptr(LocalPtr)
+  while (1);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int[5]'}}
+#pragma acc kernels deviceptr(Array)
+  while (1);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int'}}
+#pragma acc parallel deviceptr(Array[0])
+  while (1);
+
+  // expected-error@+2{{OpenACC sub-array is not allowed here}}
+  // expected-note@+1{{expected variable of pointer type}}
+#pragma acc parallel deviceptr(Array[0:1])
+  while (1);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int *[5]'}}
+#pragma acc parallel deviceptr(PtrArray)
+  while (1);
+
+#pragma acc parallel deviceptr(PtrArray[0])
+  while (1);
+
+  // expected-error@+2{{OpenACC sub-array is not allowed here}}
+  // expected-note@+1{{expected variable of pointer type}}
+#pragma acc parallel deviceptr(PtrArray[0:1])
+  while (1);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'struct S'}}
+#pragma acc parallel deviceptr(s)
+  while (1);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int'}}
+#pragma acc parallel deviceptr(s.IntMem)
+  while (1);
+
+#pragma acc parallel deviceptr(s.PtrMem)
+  while (1);
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-deviceptr-clause.cpp b/clang/test/SemaOpenACC/compute-construct-deviceptr-clause.cpp
new file mode 100644
index 000000000000..83409c91d481
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-deviceptr-clause.cpp
@@ -0,0 +1,120 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+struct S {
+  int IntMem;
+  int *PtrMem;
+  operator int*();
+};
+
+void uses() {
+  int LocalInt;
+  int *LocalPtr;
+  int Array[5];
+  int *PtrArray[5];
+  struct S s;
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int'}}
+#pragma acc parallel deviceptr(LocalInt)
+  while (true);
+
+#pragma acc parallel deviceptr(LocalPtr)
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int[5]'}}
+#pragma acc parallel deviceptr(Array)
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int'}}
+#pragma acc parallel deviceptr(Array[0])
+  while (true);
+
+  // expected-error@+2{{OpenACC sub-array is not allowed here}}
+  // expected-note@+1{{expected variable of pointer type}}
+#pragma acc parallel deviceptr(Array[0:1])
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int *[5]'}}
+#pragma acc parallel deviceptr(PtrArray)
+  while (true);
+
+#pragma acc parallel deviceptr(PtrArray[0])
+  while (true);
+
+  // expected-error@+2{{OpenACC sub-array is not allowed here}}
+  // expected-note@+1{{expected variable of pointer type}}
+#pragma acc parallel deviceptr(PtrArray[0:1])
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'struct S'}}
+#pragma acc parallel deviceptr(s)
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int'}}
+#pragma acc parallel deviceptr(s.IntMem)
+  while (true);
+
+#pragma acc parallel deviceptr(s.PtrMem)
+  while (true);
+}
+
+template<typename T, typename TPtr, typename TStruct, auto &R1>
+void Templ() {
+  T SomeInt;
+  TPtr SomePtr;
+  T SomeIntArray[5];
+  TPtr SomeIntPtrArray[5];
+  TStruct SomeStruct;
+
+  // expected-error@+2{{expected pointer in 'deviceptr' clause, type is 'int'}}
+  // expected-note@#INST{{in instantiation of function template specialization}}
+#pragma acc parallel deviceptr(SomeInt)
+  while (true);
+
+#pragma acc parallel deviceptr(SomePtr)
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int[5]'}}
+#pragma acc parallel deviceptr(SomeIntArray)
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int'}}
+#pragma acc parallel deviceptr(SomeIntArray[0])
+  while (true);
+
+  // expected-error@+2{{OpenACC sub-array is not allowed here}}
+  // expected-note@+1{{expected variable of pointer type}}
+#pragma acc parallel deviceptr(SomeIntArray[0:1])
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int *[5]'}}
+#pragma acc parallel deviceptr(SomeIntPtrArray)
+  while (true);
+
+#pragma acc parallel deviceptr(SomeIntPtrArray[0])
+  while (true);
+
+  // expected-error@+2{{OpenACC sub-array is not allowed here}}
+  // expected-note@+1{{expected variable of pointer type}}
+#pragma acc parallel deviceptr(SomeIntPtrArray[0:1])
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'S'}}
+#pragma acc parallel deviceptr(SomeStruct)
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int'}}
+#pragma acc parallel deviceptr(SomeStruct.IntMem)
+  while (true);
+
+#pragma acc parallel deviceptr(SomeStruct.PtrMem)
+  while (true);
+
+  // expected-error@+1{{expected pointer in 'deviceptr' clause, type is 'int'}}
+#pragma acc parallel deviceptr(R1)
+  while (true);
+}
+
+void inst() {
+  static constexpr int CEVar = 1;
+  Templ<int, int*, S, CEVar>(); // #INST
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-firstprivate-clause.c b/clang/test/SemaOpenACC/compute-construct-firstprivate-clause.c
new file mode 100644
index 000000000000..4e057bf32c2d
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-firstprivate-clause.c
@@ -0,0 +1,55 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  void *PointerMember;
+} Complete;
+void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete CompositeParam) {
+  int LocalInt;
+  short *LocalPointer;
+  float LocalArray[5];
+  Complete LocalComposite;
+  // Check Appertainment:
+#pragma acc parallel firstprivate(LocalInt)
+  while(1);
+#pragma acc serial firstprivate(LocalInt)
+  while(1);
+  // expected-error@+1{{OpenACC 'firstprivate' clause is not valid on 'kernels' directive}}
+#pragma acc kernels firstprivate(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel firstprivate(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel firstprivate(LocalArray[2:1])
+  while(1);
+
+#pragma acc parallel firstprivate(LocalComposite.ScalarMember, LocalComposite.ScalarMember)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel firstprivate(1 + IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel firstprivate(+IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel firstprivate(PointerParam[2:])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel firstprivate(ArrayParam[2:5])
+  while(1);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel firstprivate((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel firstprivate((float)ArrayParam[2])
+  while(1);
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-firstprivate-clause.cpp b/clang/test/SemaOpenACC/compute-construct-firstprivate-clause.cpp
new file mode 100644
index 000000000000..2fbb80f7b2fb
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-firstprivate-clause.cpp
@@ -0,0 +1,113 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+enum SomeE{};
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  SomeE EnumMember;
+  char *PointerMember;
+} Complete;
+
+void uses(int IntParam, char *PointerParam, float ArrayParam[5], Complete CompositeParam, int &IntParamRef) {
+  int LocalInt;
+  char *LocalPointer;
+  float LocalArray[5];
+  // Check Appertainment:
+#pragma acc parallel firstprivate(LocalInt)
+  while(1);
+#pragma acc serial firstprivate(LocalInt)
+  while(1);
+  // expected-error@+1{{OpenACC 'firstprivate' clause is not valid on 'kernels' directive}}
+#pragma acc kernels firstprivate(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel firstprivate(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel firstprivate(LocalArray[2:1])
+  while(1);
+
+  Complete LocalComposite2;
+#pragma acc parallel firstprivate(LocalComposite2.ScalarMember, LocalComposite2.ScalarMember)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel firstprivate(1 + IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel firstprivate(+IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel firstprivate(PointerParam[2:])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel firstprivate(ArrayParam[2:5])
+  while(1);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel firstprivate((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel firstprivate((float)ArrayParam[2])
+  while(1);
+}
+
+template<typename T, unsigned I, typename V>
+void TemplUses(T t, T (&arrayT)[I], V TemplComp) {
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel private(+t)
+  while(true);
+
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#TEMPL_USES_INST{{in instantiation of}}
+#pragma acc parallel private(I)
+  while(true);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel private(t, I)
+  while(true);
+
+#pragma acc parallel private(arrayT)
+  while(true);
+
+#pragma acc parallel private(TemplComp)
+  while(true);
+
+#pragma acc parallel private(TemplComp.PointerMember[5])
+  while(true);
+ int *Pointer;
+#pragma acc parallel private(Pointer[:I])
+  while(true);
+#pragma acc parallel private(Pointer[:t])
+  while(true);
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel private(Pointer[1:])
+  while(true);
+}
+
+template<unsigned I, auto &NTTP_REF>
+void NTTP() {
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#NTTP_INST{{in instantiation of}}
+#pragma acc parallel private(I)
+  while(true);
+
+#pragma acc parallel private(NTTP_REF)
+  while(true);
+}
+
+void Inst() {
+  static constexpr int NTTP_REFed = 1;
+  int i;
+  int Arr[5];
+  Complete C;
+  TemplUses(i, Arr, C); // #TEMPL_USES_INST
+  NTTP<5, NTTP_REFed>(); // #NTTP_INST
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-intexpr-clause-ast.cpp b/clang/test/SemaOpenACC/compute-construct-intexpr-clause-ast.cpp
index 5a4c9f05ee08..56c3512dec3b 100644
--- a/clang/test/SemaOpenACC/compute-construct-intexpr-clause-ast.cpp
+++ b/clang/test/SemaOpenACC/compute-construct-intexpr-clause-ast.cpp
@@ -116,8 +116,115 @@ void NormalUses() {
   // CHECK-NEXT: WhileStmt
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: CompoundStmt
+
+#pragma acc kernels async(some_int())
+  while(true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}kernels
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc kernels async
+  while(true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}kernels
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel wait
+  while (true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+#pragma acc parallel wait()
+  while (true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+#pragma acc parallel wait(some_int(), some_long())
+  while (true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'long'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'long (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'long ()' lvalue Function{{.*}} 'some_long' 'long ()'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+#pragma acc parallel wait(queues:some_int(), some_long())
+  while (true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause has queues tag
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'long'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'long (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'long ()' lvalue Function{{.*}} 'some_long' 'long ()'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+#pragma acc parallel wait(devnum: some_int() :some_int(), some_long())
+  while (true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause has devnum
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'long'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'long (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'long ()' lvalue Function{{.*}} 'some_long' 'long ()'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+#pragma acc parallel wait(devnum: some_int() : queues :some_int(), some_long()) wait(devnum: some_int() : queues :some_int(), some_long())
+  while (true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause has devnum has queues tag
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'long'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'long (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'long ()' lvalue Function{{.*}} 'some_long' 'long ()'
+  // CHECK-NEXT: wait clause has devnum has queues tag
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'long'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'long (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'long ()' lvalue Function{{.*}} 'some_long' 'long ()'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
 }
 
+
 template<typename T, typename U>
 void TemplUses(T t, U u) {
   // CHECK-NEXT: FunctionTemplateDecl
@@ -235,6 +342,99 @@ void TemplUses(T t, U u) {
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: CompoundStmt
 
+#pragma acc kernels async
+  while(true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}kernels
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc kernels async(u)
+  while(true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}kernels
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel async (U::value)
+  while(true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '<dependent type>' lvalue
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'U'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel wait
+  while (true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel wait()
+  while (true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel wait(U::value, u)
+  while (true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '<dependent type>' lvalue
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'U'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel wait(queues: U::value, u)
+  while (true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause has queues tag
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '<dependent type>' lvalue
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'U'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel wait(devnum:u:queues: U::value, u)
+  while (true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause has devnum has queues tag
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '<dependent type>' lvalue
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'U'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel wait(devnum:u: U::value, u)
+  while (true){}
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause has devnum
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '<dependent type>' lvalue
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'U'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
 
   // CHECK-NEXT: DeclStmt
   // CHECK-NEXT: VarDecl{{.*}}EndMarker
@@ -365,6 +565,107 @@ void TemplUses(T t, U u) {
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: CompoundStmt
 
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}kernels
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}kernels
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char' <UserDefinedConversion>
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator char
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'const int' lvalue Var{{.*}} 'value' 'const int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'HasInt'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'const int' lvalue Var{{.*}} 'value' 'const int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'HasInt'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char' <UserDefinedConversion>
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator char
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause has queues tag
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'const int' lvalue Var{{.*}} 'value' 'const int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'HasInt'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char' <UserDefinedConversion>
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator char
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause has devnum has queues tag
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char' <UserDefinedConversion>
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator char
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'const int' lvalue Var{{.*}} 'value' 'const int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'HasInt'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char' <UserDefinedConversion>
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator char
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel
+  // CHECK-NEXT: wait clause has devnum
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char' <UserDefinedConversion>
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator char
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'const int' lvalue Var{{.*}} 'value' 'const int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'HasInt'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char' <UserDefinedConversion>
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator char
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: CompoundStmt
+
   // CHECK-NEXT: DeclStmt
   // CHECK-NEXT: VarDecl{{.*}}EndMarker
 }
diff --git a/clang/test/SemaOpenACC/compute-construct-no_create-clause.c b/clang/test/SemaOpenACC/compute-construct-no_create-clause.c
new file mode 100644
index 000000000000..07a60b73c34f
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-no_create-clause.c
@@ -0,0 +1,54 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  void *PointerMember;
+} Complete;
+void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete CompositeParam) {
+  int LocalInt;
+  short *LocalPointer;
+  float LocalArray[5];
+  Complete LocalComposite;
+  // Check Appertainment:
+#pragma acc parallel no_create(LocalInt)
+  while(1);
+#pragma acc serial no_create(LocalInt)
+  while(1);
+#pragma acc kernels no_create(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel no_create(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel no_create(LocalArray[2:1])
+  while(1);
+
+#pragma acc parallel no_create(LocalComposite.ScalarMember, LocalComposite.ScalarMember)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel no_create(1 + IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel no_create(+IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel no_create(PointerParam[2:])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel no_create(ArrayParam[2:5])
+  while(1);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel no_create((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel no_create((float)ArrayParam[2])
+  while(1);
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-no_create-clause.cpp b/clang/test/SemaOpenACC/compute-construct-no_create-clause.cpp
new file mode 100644
index 000000000000..3820d5e3999d
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-no_create-clause.cpp
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+enum SomeE{};
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  SomeE EnumMember;
+  char *PointerMember;
+} Complete;
+
+void uses(int IntParam, char *PointerParam, float ArrayParam[5], Complete CompositeParam, int &IntParamRef) {
+  int LocalInt;
+  char *LocalPointer;
+  float LocalArray[5];
+  // Check Appertainment:
+#pragma acc parallel no_create(LocalInt)
+  while(1);
+#pragma acc serial no_create(LocalInt)
+  while(1);
+#pragma acc kernels no_create(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel no_create(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel no_create(LocalArray[2:1])
+  while(1);
+
+  Complete LocalComposite2;
+#pragma acc parallel no_create(LocalComposite2.ScalarMember, LocalComposite2.ScalarMember)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel no_create(1 + IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel no_create(+IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel no_create(PointerParam[2:])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel no_create(ArrayParam[2:5])
+  while(1);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel no_create((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel no_create((float)ArrayParam[2])
+  while(1);
+}
+
+template<typename T, unsigned I, typename V>
+void TemplUses(T t, T (&arrayT)[I], V TemplComp) {
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel no_create(+t)
+  while(true);
+
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#TEMPL_USES_INST{{in instantiation of}}
+#pragma acc parallel no_create(I)
+  while(true);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel no_create(t, I)
+  while(true);
+
+#pragma acc parallel no_create(arrayT)
+  while(true);
+
+#pragma acc parallel no_create(TemplComp)
+  while(true);
+
+#pragma acc parallel no_create(TemplComp.PointerMember[5])
+  while(true);
+ int *Pointer;
+#pragma acc parallel no_create(Pointer[:I])
+  while(true);
+#pragma acc parallel no_create(Pointer[:t])
+  while(true);
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel no_create(Pointer[1:])
+  while(true);
+}
+
+template<unsigned I, auto &NTTP_REF>
+void NTTP() {
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#NTTP_INST{{in instantiation of}}
+#pragma acc parallel no_create(I)
+  while(true);
+
+#pragma acc parallel no_create(NTTP_REF)
+  while(true);
+}
+
+void Inst() {
+  static constexpr int NTTP_REFed = 1;
+  int i;
+  int Arr[5];
+  Complete C;
+  TemplUses(i, Arr, C); // #TEMPL_USES_INST
+  NTTP<5, NTTP_REFed>(); // #NTTP_INST
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-present-clause.c b/clang/test/SemaOpenACC/compute-construct-present-clause.c
new file mode 100644
index 000000000000..99c4b1dcd19b
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-present-clause.c
@@ -0,0 +1,54 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  void *PointerMember;
+} Complete;
+void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete CompositeParam) {
+  int LocalInt;
+  short *LocalPointer;
+  float LocalArray[5];
+  Complete LocalComposite;
+  // Check Appertainment:
+#pragma acc parallel present(LocalInt)
+  while(1);
+#pragma acc serial present(LocalInt)
+  while(1);
+#pragma acc kernels present(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel present(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel present(LocalArray[2:1])
+  while(1);
+
+#pragma acc parallel present(LocalComposite.ScalarMember, LocalComposite.ScalarMember)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel present(1 + IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel present(+IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel present(PointerParam[2:])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel present(ArrayParam[2:5])
+  while(1);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel present((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel present((float)ArrayParam[2])
+  while(1);
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-present-clause.cpp b/clang/test/SemaOpenACC/compute-construct-present-clause.cpp
new file mode 100644
index 000000000000..62e481dea3e2
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-present-clause.cpp
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+enum SomeE{};
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  SomeE EnumMember;
+  char *PointerMember;
+} Complete;
+
+void uses(int IntParam, char *PointerParam, float ArrayParam[5], Complete CompositeParam, int &IntParamRef) {
+  int LocalInt;
+  char *LocalPointer;
+  float LocalArray[5];
+  // Check Appertainment:
+#pragma acc parallel present(LocalInt)
+  while(1);
+#pragma acc serial present(LocalInt)
+  while(1);
+#pragma acc kernels present(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel present(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel present(LocalArray[2:1])
+  while(1);
+
+  Complete LocalComposite2;
+#pragma acc parallel present(LocalComposite2.ScalarMember, LocalComposite2.ScalarMember)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel present(1 + IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel present(+IntParam)
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel present(PointerParam[2:])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel present(ArrayParam[2:5])
+  while(1);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel present((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel present((float)ArrayParam[2])
+  while(1);
+}
+
+template<typename T, unsigned I, typename V>
+void TemplUses(T t, T (&arrayT)[I], V TemplComp) {
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel present(+t)
+  while(true);
+
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#TEMPL_USES_INST{{in instantiation of}}
+#pragma acc parallel present(I)
+  while(true);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel present(t, I)
+  while(true);
+
+#pragma acc parallel present(arrayT)
+  while(true);
+
+#pragma acc parallel present(TemplComp)
+  while(true);
+
+#pragma acc parallel present(TemplComp.PointerMember[5])
+  while(true);
+ int *Pointer;
+#pragma acc parallel present(Pointer[:I])
+  while(true);
+#pragma acc parallel present(Pointer[:t])
+  while(true);
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel present(Pointer[1:])
+  while(true);
+}
+
+template<unsigned I, auto &NTTP_REF>
+void NTTP() {
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#NTTP_INST{{in instantiation of}}
+#pragma acc parallel present(I)
+  while(true);
+
+#pragma acc parallel present(NTTP_REF)
+  while(true);
+}
+
+void Inst() {
+  static constexpr int NTTP_REFed = 1;
+  int i;
+  int Arr[5];
+  Complete C;
+  TemplUses(i, Arr, C); // #TEMPL_USES_INST
+  NTTP<5, NTTP_REFed>(); // #NTTP_INST
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-private-clause.c b/clang/test/SemaOpenACC/compute-construct-private-clause.c
index 15775279fc86..d2615c384cdb 100644
--- a/clang/test/SemaOpenACC/compute-construct-private-clause.c
+++ b/clang/test/SemaOpenACC/compute-construct-private-clause.c
@@ -12,12 +12,12 @@ typedef struct IsComplete {
 
 int GlobalInt;
 float GlobalArray[5];
-void *GlobalPointer;
+short *GlobalPointer;
 Complete GlobalComposite;
 
-void uses(int IntParam, void *PointerParam, float ArrayParam[5], Complete CompositeParam) {
+void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete CompositeParam) {
   int LocalInt;
-  void *LocalPointer;
+  short *LocalPointer;
   float LocalArray[5];
   Complete LocalComposite;
 
@@ -35,17 +35,13 @@ void uses(int IntParam, void *PointerParam, float ArrayParam[5], Complete Compos
   while(1);
 #pragma acc parallel private(LocalArray)
   while(1);
-  // TODO OpenACC: Fix array sections, this should be allowed.
-  // expected-error@+1{{expected expression}}
 #pragma acc parallel private(LocalArray[:])
   while(1);
 #pragma acc parallel private(LocalArray[:5])
   while(1);
-  // TODO OpenACC: Fix array sections, this should be allowed.
-  // expected-error@+1{{expected expression}}
 #pragma acc parallel private(LocalArray[2:])
   while(1);
-#pragma acc parallel private(LocalArray[2:5])
+#pragma acc parallel private(LocalArray[2:1])
   while(1);
 #pragma acc parallel private(LocalArray[2])
   while(1);
@@ -103,40 +99,36 @@ void uses(int IntParam, void *PointerParam, float ArrayParam[5], Complete Compos
 #pragma acc parallel private(+GlobalInt)
   while(1);
 
-  // TODO OpenACC: Fix array sections, this should be allowed.
-  // expected-error@+1{{expected expression}}
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
 #pragma acc parallel private(PointerParam[:])
   while(1);
 #pragma acc parallel private(PointerParam[:5])
   while(1);
 #pragma acc parallel private(PointerParam[:IntParam])
   while(1);
-  // TODO OpenACC: Fix array sections, this should be allowed.
-  // expected-error@+1{{expected expression}}
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
 #pragma acc parallel private(PointerParam[2:])
   while(1);
 #pragma acc parallel private(PointerParam[2:5])
   while(1);
 #pragma acc parallel private(PointerParam[2])
   while(1);
-  // TODO OpenACC: Fix array sections, this should be allowed.
-  // expected-error@+1{{expected expression}}
 #pragma acc parallel private(ArrayParam[:])
   while(1);
 #pragma acc parallel private(ArrayParam[:5])
   while(1);
 #pragma acc parallel private(ArrayParam[:IntParam])
   while(1);
-  // TODO OpenACC: Fix array sections, this should be allowed.
-  // expected-error@+1{{expected expression}}
 #pragma acc parallel private(ArrayParam[2:])
   while(1);
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
 #pragma acc parallel private(ArrayParam[2:5])
   while(1);
 #pragma acc parallel private(ArrayParam[2])
   while(1);
 
-  // expected-error@+1{{OpenACC sub-array is not allowed here}}
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
 #pragma acc parallel private((float*)ArrayParam[2:5])
   while(1);
   // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
diff --git a/clang/test/SemaOpenACC/compute-construct-private-clause.cpp b/clang/test/SemaOpenACC/compute-construct-private-clause.cpp
index 4dd4e0d8029d..a776b16f0feb 100644
--- a/clang/test/SemaOpenACC/compute-construct-private-clause.cpp
+++ b/clang/test/SemaOpenACC/compute-construct-private-clause.cpp
@@ -112,8 +112,7 @@ void TemplUses(T t, T (&arrayT)[I], V TemplComp) {
   while(true);
 #pragma acc parallel private(Pointer[:t])
   while(true);
-  // TODO OpenACC: When fixing sub-arrays, this should be permitted}}
-  // expected-error@+1{{expected expression}}
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
 #pragma acc parallel private(Pointer[1:])
   while(true);
 }
diff --git a/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp b/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp
index 341be3c58ebd..e057678d9249 100644
--- a/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp
+++ b/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s
+// RUN: %clang_cc1 %s -fopenacc -Wno-openacc-deprecated-clause-alias -ast-dump | FileCheck %s
 
 int Global;
 short GlobalArray[5];
@@ -35,6 +35,48 @@ void NormalUses(float *PointerParam) {
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
 
+#pragma acc parallel firstprivate(GlobalArray, PointerParam[Global])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: firstprivate clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel present(GlobalArray, PointerParam[Global])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: present clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel no_create(GlobalArray, PointerParam[Global])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: no_create clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
 #pragma acc parallel private(GlobalArray) private(PointerParam[Global])
   while(true);
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
@@ -50,18 +92,116 @@ void NormalUses(float *PointerParam) {
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
 
+#pragma acc parallel copy(GlobalArray) pcopy(PointerParam[Global]) present_or_copy(Global)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: copy clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: pcopy clause
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: present_or_copy clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel copyin(GlobalArray) pcopyin(readonly: PointerParam[Global]) present_or_copyin(Global)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: copyin clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: pcopyin clause : readonly
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: present_or_copyin clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel copyout(GlobalArray) pcopyout(zero:PointerParam[Global]) present_or_copyout(Global)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: copyout clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: pcopyout clause : zero
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: present_or_copyout clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel create(GlobalArray) pcreate(zero:PointerParam[Global]) present_or_create(Global)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: create clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: pcreate clause : zero
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: present_or_create clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
 #pragma acc parallel private(GlobalArray, PointerParam[Global : Global])
   while(true);
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
   // CHECK-NEXT: private clause
   // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
   // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' <LValueToRValue>
   // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}} 'PointerParam' 'float *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
   // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
   // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
   // CHECK-NEXT: WhileStmt
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
+
+#pragma acc parallel firstprivate(GlobalArray, PointerParam[Global : Global])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: firstprivate clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}} 'PointerParam' 'float *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel attach(PointerParam) deviceptr(PointerParam)
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: attach clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}} 'PointerParam' 'float *'
+  // CHECK-NEXT: deviceptr clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}} 'PointerParam' 'float *'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
 }
 
 // This example is an error typically, but we want to make sure we're properly
@@ -80,6 +220,14 @@ void UnInstTempl() {
   // CHECK-NEXT: WhileStmt
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
+#pragma acc parallel firstprivate(I)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: firstprivate clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'unsigned int' NonTypeTemplateParm{{.*}}'I' 'unsigned int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
 }
 
 template<auto &NTTP, typename T, typename U>
@@ -117,6 +265,16 @@ void TemplUses(T t, U u, T*PointerParam) {
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
 
+#pragma acc parallel firstprivate(t, u)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: firstprivate clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
 #pragma acc parallel private(t) private(u)
   while(true);
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
@@ -140,6 +298,98 @@ void TemplUses(T t, U u, T*PointerParam) {
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
 
+#pragma acc parallel private(t) firstprivate(NTTP, u)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: firstprivate clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel no_create(t) present(NTTP, u)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: no_create clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: present clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel copy(t) pcopy(NTTP, u) present_or_copy(u[0:t])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: copy clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: pcopy clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: present_or_copy clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel copyin(t) pcopyin(readonly:NTTP, u) present_or_copyin(u[0:t])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: copyin clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: pcopyin clause : readonly
+  // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: present_or_copyin clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel copyout(t) pcopyout(zero:NTTP, u) present_or_copyout(u[0:t])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: copyout clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: pcopyout clause : zero
+  // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: present_or_copyout clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel create(t) pcreate(zero: NTTP, u) present_or_create(u[0:t])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: create clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: pcreate clause : zero
+  // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: present_or_create clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
 #pragma acc parallel private(u[0])
   while(true);
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
@@ -163,6 +413,17 @@ void TemplUses(T t, U u, T*PointerParam) {
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
 
+#pragma acc parallel attach(PointerParam) deviceptr(PointerParam)
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: attach clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T *' lvalue ParmVar{{.*}} 'PointerParam' 'T *'
+  // CHECK-NEXT: deviceptr clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T *' lvalue ParmVar{{.*}} 'PointerParam' 'T *'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
   // CHECK-NEXT: DeclStmt
   // CHECK-NEXT: VarDecl{{.*}}EndMarker
   int EndMarker;
@@ -203,6 +464,15 @@ void TemplUses(T t, U u, T*PointerParam) {
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
 
+// #pragma acc parallel firstprivate(t, u)
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: firstprivate clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
 // #pragma acc parallel private(t) private(u)
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
   // CHECK-NEXT: private clause
@@ -226,6 +496,112 @@ void TemplUses(T t, U u, T*PointerParam) {
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
 
+// #pragma acc parallel private(t) firstprivate(NTTP, u)
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: firstprivate clause
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP
+  // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+// #pragma acc parallel no_create(t) present(NTTP, u)
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: no_create clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: present clause
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP
+  // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel copy(t) pcopy(NTTP, u) copy_or_present(u[0:t])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: copy clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: pcopy clause
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP
+  // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: present_or_copy clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel copyin(t) pcopyin(readonly:NTTP, u) present_or_copyin(u[0:t])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: copyin clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: pcopyin clause : readonly
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP
+  // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: present_or_copyin clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel copyout(t) pcopyout(zero:NTTP, u) present_or_copyout(u[0:t])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: copyout clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: pcopyout clause : zero
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP
+  // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: present_or_copyout clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel create(t) pcreate(zero: NTTP, u) present_or_create(u[0:t])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: create clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: pcreate clause : zero
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP
+  // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: present_or_create clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
 // #pragma acc parallel private(u[0])
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
   // CHECK-NEXT: private clause
@@ -241,13 +617,25 @@ void TemplUses(T t, U u, T*PointerParam) {
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
   // CHECK-NEXT: private clause
   // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <LValueToRValue>
   // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
   // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
   // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
   // CHECK-NEXT: WhileStmt
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
 
+//#pragma acc parallel attach(PointerParam) deviceptr(PointerParam)
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: attach clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'PointerParam' 'int *'
+  // CHECK-NEXT: deviceptr clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'PointerParam' 'int *'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
   // CHECK-NEXT: DeclStmt
   // CHECK-NEXT: VarDecl{{.*}}EndMarker
 }
@@ -257,6 +645,8 @@ struct S {
   // CHECK: CXXRecordDecl{{.*}} implicit struct S
   int ThisMember;
   // CHECK-NEXT: FieldDecl{{.*}} ThisMember 'int'
+  int *ThisMemberPtr;
+  // CHECK-NEXT: FieldDecl{{.*}} ThisMemberPtr 'int *'
   int ThisMemberArray[5];
   // CHECK-NEXT: FieldDecl{{.*}} ThisMemberArray 'int[5]'
 
@@ -264,10 +654,11 @@ struct S {
   // CHECK-NEXT: CXXMethodDecl{{.*}} foo 'void ()'
 
   template<typename T>
-  void bar() {
+  void bar(T *PointerParam) {
   // CHECK-NEXT: FunctionTemplateDecl{{.*}}bar
   // CHECK-NEXT: TemplateTypeParmDecl{{.*}}typename depth 0 index 0 T
-  // CHECK-NEXT: CXXMethodDecl{{.*}} bar 'void ()' implicit-inline
+  // CHECK-NEXT: CXXMethodDecl{{.*}} bar 'void (T *)' implicit-inline
+  // CHECK-NEXT: ParmVarDecl{{.*}} PointerParam 'T *'
   // CHECK-NEXT: CompoundStmt
 
 #pragma acc parallel private(ThisMember, this->ThisMemberArray[1])
@@ -290,6 +681,7 @@ struct S {
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
   // CHECK-NEXT: private clause
   // CHECK-NEXT: ArraySectionExpr{{.*}}
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
   // CHECK-NEXT: MemberExpr{{.*}} 'int[5]' lvalue ->ThisMemberArray
   // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' implicit this
   // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
@@ -307,10 +699,28 @@ struct S {
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
 
+#pragma acc parallel attach(PointerParam, this, this->ThisMemberPtr) deviceptr(PointerParam, this, ThisMemberPtr)
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: attach clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'T *' lvalue ParmVar{{.*}} 'PointerParam' 'T *'
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' this
+  // CHECK-NEXT: MemberExpr{{.*}} 'int *' lvalue ->ThisMemberPtr
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' this
+  // CHECK-NEXT: deviceptr clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'T *' lvalue ParmVar{{.*}} 'PointerParam' 'T *'
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' this
+  // CHECK-NEXT: MemberExpr{{.*}} 'int *' lvalue ->ThisMemberPtr
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' implicit this
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
   // Check Instantiations:
-  // CHECK-NEXT: CXXMethodDecl{{.*}} used bar 'void ()' implicit_instantiation implicit-inline
+  // CHECK-NEXT: CXXMethodDecl{{.*}} used bar 'void (int *)' implicit_instantiation implicit-inline
   // CHECK-NEXT: TemplateArgument type 'int'
   // CHECK-NEXT: BuiltinType{{.*}} 'int'
+  // CHECK-NEXT: ParmVarDecl{{.*}} PointerParam 'int *'
   // CHECK-NEXT: CompoundStmt
 
 // #pragma acc parallel private(ThisMember, this->ThisMemberArray[1])
@@ -331,6 +741,7 @@ struct S {
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
   // CHECK-NEXT: private clause
   // CHECK-NEXT: ArraySectionExpr{{.*}}
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
   // CHECK-NEXT: MemberExpr{{.*}} 'int[5]' lvalue ->ThisMemberArray
   // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' implicit this
   // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
@@ -346,6 +757,22 @@ struct S {
   // CHECK-NEXT: WhileStmt
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel attach(PointerParam, this, this->ThisMemberPtr) deviceptr(PointerParam, this, ThisMemberPtr)
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: attach clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int *' lvalue ParmVar{{.*}} 'PointerParam' 'int *'
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' this
+  // CHECK-NEXT: MemberExpr{{.*}} 'int *' lvalue ->ThisMemberPtr
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' this
+  // CHECK-NEXT: deviceptr clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int *' lvalue ParmVar{{.*}} 'PointerParam' 'int *'
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' this
+  // CHECK-NEXT: MemberExpr{{.*}} 'int *' lvalue ->ThisMemberPtr
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' implicit this
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
 }
 };
 
@@ -372,6 +799,7 @@ void S::foo() {
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
   // CHECK-NEXT: private clause
   // CHECK-NEXT: ArraySectionExpr{{.*}}
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
   // CHECK-NEXT: MemberExpr{{.*}} 'int[5]' lvalue ->ThisMemberArray
   // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' implicit this
   // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
@@ -522,6 +950,7 @@ struct STempl {
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
   // CHECK-NEXT: private clause
   // CHECK-NEXT: ArraySectionExpr{{.*}}
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
   // CHECK-NEXT: MemberExpr{{.*}} 'int[5]' lvalue ->ThisMemberArray
   // CHECK-NEXT: CXXThisExpr{{.*}} 'STempl<int> *' implicit this
   // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
@@ -546,7 +975,7 @@ void Inst() {
   TemplUses<CEVar, int, int[1]>({}, {}, &i);
 
   S s;
-  s.bar<int>();
+  s.bar<int>(&i);
   STempl<int> stempl;
   stempl.bar<int>();
 }
diff --git a/clang/test/SemaOpenACC/compute-construct-wait-clause.c b/clang/test/SemaOpenACC/compute-construct-wait-clause.c
new file mode 100644
index 000000000000..254aba8442fe
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-wait-clause.c
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+struct NotConvertible{} NC;
+short getS();
+int getI();
+
+void uses() {
+  int arr[5];
+
+#pragma acc parallel wait
+  while(1);
+
+#pragma acc serial wait()
+  while(1);
+
+#pragma acc kernels wait(getS(), getI())
+  while(1);
+
+#pragma acc parallel wait(devnum:getS(): getI())
+  while(1);
+
+#pragma acc parallel wait(devnum:getS(): queues: getI()) wait(devnum:getI(): queues: getS(), getI(), 5)
+  while(1);
+
+  // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}}
+#pragma acc parallel wait(devnum:NC : 5)
+  while(1);
+
+  // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}}
+#pragma acc parallel wait(devnum:5 : NC)
+  while(1);
+
+  // expected-error@+3{{OpenACC clause 'wait' requires expression of integer type ('int[5]' invalid)}}
+  // expected-error@+2{{OpenACC clause 'wait' requires expression of integer type ('int[5]' invalid)}}
+  // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}}
+#pragma acc parallel wait(devnum:arr : queues: arr, NC, 5)
+  while(1);
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-wait-clause.cpp b/clang/test/SemaOpenACC/compute-construct-wait-clause.cpp
new file mode 100644
index 000000000000..94f669be0f67
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-wait-clause.cpp
@@ -0,0 +1,104 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+struct ExplicitConvertOnly {
+  explicit operator int() const; // #EXPL_CONV
+} Explicit;
+
+struct AmbiguousConvert{
+  operator int(); // #AMBIG_INT
+  operator short(); // #AMBIG_SHORT
+  operator float();
+} Ambiguous;
+
+void Test() {
+
+  // expected-error@+3{{multiple conversions from expression type 'struct AmbiguousConvert' to an integral type}}
+  // expected-note@#AMBIG_INT{{conversion to integral type 'int'}}
+  // expected-note@#AMBIG_SHORT{{conversion to integral type 'short'}}
+#pragma acc parallel wait(Ambiguous)
+  while (true);
+
+  // expected-error@+2{{OpenACC integer expression type 'struct ExplicitConvertOnly' requires explicit conversion to 'int'}}
+  // expected-note@#EXPL_CONV{{conversion to integral type 'int'}}
+#pragma acc parallel wait(4, Explicit, 5)
+  while (true);
+
+  // expected-error@+3{{multiple conversions from expression type 'struct AmbiguousConvert' to an integral type}}
+  // expected-note@#AMBIG_INT{{conversion to integral type 'int'}}
+  // expected-note@#AMBIG_SHORT{{conversion to integral type 'short'}}
+#pragma acc parallel wait(queues: Ambiguous, 5)
+  while (true);
+
+  // expected-error@+2{{OpenACC integer expression type 'struct ExplicitConvertOnly' requires explicit conversion to 'int'}}
+  // expected-note@#EXPL_CONV{{conversion to integral type 'int'}}
+#pragma acc parallel wait(devnum: Explicit: 5)
+  while (true);
+
+  // expected-error@+2{{OpenACC integer expression type 'struct ExplicitConvertOnly' requires explicit conversion to 'int'}}
+  // expected-note@#EXPL_CONV{{conversion to integral type 'int'}}
+#pragma acc parallel wait(devnum: Explicit:queues:  5)
+  while (true);
+
+  // expected-error@+1{{use of undeclared identifier 'queues'}}
+#pragma acc parallel wait(devnum: queues:  5)
+  while (true);
+}
+
+struct HasInt {
+  using IntTy = int;
+  using ShortTy = short;
+  static constexpr int value = 1;
+  static constexpr AmbiguousConvert ACValue;
+  static constexpr ExplicitConvertOnly EXValue;
+
+  operator char();
+};
+
+template<typename T>
+void TestInst() {
+
+#pragma acc parallel wait(T{})
+  while (true);
+
+#pragma acc parallel wait(devnum:typename T::ShortTy{}:queues:typename T::IntTy{})
+  while (true);
+
+  // expected-error@+4{{multiple conversions from expression type 'const AmbiguousConvert' to an integral type}}
+  // expected-note@#INST{{in instantiation of function template specialization}}
+  // expected-note@#AMBIG_INT{{conversion to integral type 'int'}}
+  // expected-note@#AMBIG_SHORT{{conversion to integral type 'short'}}
+#pragma acc parallel wait(devnum:T::value :queues:T::ACValue)
+  while (true);
+
+  // expected-error@+5{{OpenACC integer expression type 'const ExplicitConvertOnly' requires explicit conversion to 'int'}}
+  // expected-note@#EXPL_CONV{{conversion to integral type 'int'}}
+  // expected-error@+3{{multiple conversions from expression type 'const AmbiguousConvert' to an integral type}}
+  // expected-note@#AMBIG_INT{{conversion to integral type 'int'}}
+  // expected-note@#AMBIG_SHORT{{conversion to integral type 'short'}}
+#pragma acc parallel wait(devnum:T::EXValue :queues:T::ACValue)
+  while (true);
+
+  // expected-error@+5{{OpenACC integer expression type 'const ExplicitConvertOnly' requires explicit conversion to 'int'}}
+  // expected-note@#EXPL_CONV{{conversion to integral type 'int'}}
+  // expected-error@+3{{multiple conversions from expression type 'const AmbiguousConvert' to an integral type}}
+  // expected-note@#AMBIG_INT{{conversion to integral type 'int'}}
+  // expected-note@#AMBIG_SHORT{{conversion to integral type 'short'}}
+#pragma acc parallel wait(T::EXValue, T::ACValue)
+  while (true);
+
+  // expected-error@+5{{OpenACC integer expression type 'const ExplicitConvertOnly' requires explicit conversion to 'int'}}
+  // expected-note@#EXPL_CONV{{conversion to integral type 'int'}}
+  // expected-error@+3{{multiple conversions from expression type 'const AmbiguousConvert' to an integral type}}
+  // expected-note@#AMBIG_INT{{conversion to integral type 'int'}}
+  // expected-note@#AMBIG_SHORT{{conversion to integral type 'short'}}
+#pragma acc parallel wait(queues: T::EXValue, T::ACValue)
+  while (true);
+
+  // expected-error@+1{{no member named 'Invalid' in 'HasInt'}}
+#pragma acc parallel wait(queues: T::Invalid, T::Invalid2)
+  while (true);
+}
+
+void Inst() {
+  TestInst<HasInt>(); // #INST
+}
diff --git a/clang/test/SemaOpenACC/sub-array-ast.cpp b/clang/test/SemaOpenACC/sub-array-ast.cpp
new file mode 100644
index 000000000000..094976e16427
--- /dev/null
+++ b/clang/test/SemaOpenACC/sub-array-ast.cpp
@@ -0,0 +1,566 @@
+// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s
+
+// Test this with PCH.
+// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s
+// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s
+
+#ifndef PCH_HELPER
+#define PCH_HELPER
+
+constexpr int returns_3() { return 3; }
+
+void Func(int i, int j) {
+  // CHECK: FunctionDecl{{.*}}Func
+  // CHECK-NEXT: ParmVarDecl{{.*}} i 'int'
+  // CHECK-NEXT: ParmVarDecl{{.*}} j 'int'
+  // CHECK-NEXT: CompoundStmt
+  int array[5];
+  // CHECK-NEXT: DeclStmt
+  // CHECK-NEXT: VarDecl{{.*}} array 'int[5]'
+  int VLA[i];
+  // CHECK-NEXT: DeclStmt
+  // CHECK-NEXT: VarDecl{{.*}} VLA 'int[i]'
+  int *ptr;
+  // CHECK-NEXT: DeclStmt
+  // CHECK-NEXT: VarDecl{{.*}} ptr 'int *'
+
+#pragma acc parallel private(array[returns_3():])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int[5]' lvalue Var{{.*}} 'array' 'int[5]'
+  // CHECK-NEXT: CallExpr{{.*}} 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int ()' lvalue Function{{.*}} 'returns_3' 'int ()'
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(array[:1])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int[5]' lvalue Var{{.*}} 'array' 'int[5]'
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 1
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(array[returns_3():1])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int[5]' lvalue Var{{.*}} 'array' 'int[5]'
+  // CHECK-NEXT: CallExpr{{.*}} 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int ()' lvalue Function{{.*}} 'returns_3' 'int ()'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 1
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(array[i:j])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int[5]' lvalue Var{{.*}} 'array' 'int[5]'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'j' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(VLA[:1])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int[i]' lvalue Var{{.*}} 'VLA' 'int[i]'
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 1
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(VLA[returns_3():1])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int[i]' lvalue Var{{.*}} 'VLA' 'int[i]'
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int ()' lvalue Function{{.*}} 'returns_3' 'int ()'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 1
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(VLA[i:j])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int[i]' lvalue Var{{.*}} 'VLA' 'int[i]'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'j' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(ptr[:1])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int *' lvalue Var{{.*}} 'ptr' 'int *'
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 1
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(ptr[returns_3():1])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int *' lvalue Var{{.*}} 'ptr' 'int *'
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int ()' lvalue Function{{.*}} 'returns_3' 'int ()'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 1
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(ptr[i:j])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int *' lvalue Var{{.*}} 'ptr' 'int *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'j' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+}
+
+template<typename T, unsigned I, auto &CEArray>
+void Templ(int i){
+  // CHECK-NEXT: FunctionTemplateDecl{{.*}}Templ
+  // CHECK-NEXT: TemplateTypeParmDecl{{.*}} typename depth 0 index 0 T
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'unsigned int' depth 0 index 1 I
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'auto &' depth 0 index 2 CEArray
+  // CHECK-NEXT: FunctionDecl{{.*}}Templ 'void (int)'
+  // CHECK-NEXT: ParmVarDecl{{.*}} i 'int'
+  // CHECK-NEXT: CompoundStmt
+  T array[I+2];
+  // CHECK-NEXT: DeclStmt
+  // CHECK-NEXT: VarDecl{{.*}} array 'T[I + 2]'
+  T VLA[i];
+  // CHECK-NEXT: DeclStmt
+  // CHECK-NEXT: VarDecl{{.*}} VLA 'T[i]'
+  T *ptr;
+  // CHECK-NEXT: DeclStmt
+  // CHECK-NEXT: VarDecl{{.*}} ptr 'T *'
+
+#pragma acc parallel private(array[returns_3():])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'T[I + 2]' lvalue Var{{.*}} 'array' 'T[I + 2]'
+  // CHECK-NEXT: CallExpr{{.*}} 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int ()' lvalue Function{{.*}}'returns_3' 'int ()'
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(array[:I])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'T[I + 2]' lvalue Var{{.*}} 'array' 'T[I + 2]'
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'unsigned int' NonTypeTemplateParm{{.*}} 'I' 'unsigned int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(array[returns_3()-2:I])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'T[I + 2]' lvalue Var{{.*}} 'array' 'T[I + 2]'
+  // CHECK-NEXT: BinaryOperator{{.*}} 'int' '-'
+  // CHECK-NEXT: CallExpr{{.*}} 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int ()' lvalue Function{{.*}} 'returns_3' 'int ()'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 2
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'unsigned int' NonTypeTemplateParm{{.*}} 'I' 'unsigned int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(array[i:i])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'T[I + 2]' lvalue Var{{.*}} 'array' 'T[I + 2]'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(VLA[:I])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'T[i]' lvalue Var{{.*}} 'VLA' 'T[i]'
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'unsigned int' NonTypeTemplateParm{{.*}} 'I' 'unsigned int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(VLA[returns_3():I])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'T[i]' lvalue Var{{.*}} 'VLA' 'T[i]'
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int ()' lvalue Function{{.*}} 'returns_3' 'int ()'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'unsigned int' NonTypeTemplateParm{{.*}} 'I' 'unsigned int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(VLA[i:i])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'T[i]' lvalue Var{{.*}} 'VLA' 'T[i]'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(ptr[:I])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'T *' lvalue Var{{.*}} 'ptr' 'T *'
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'unsigned int' NonTypeTemplateParm{{.*}} 'I' 'unsigned int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(ptr[returns_3():I])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'T *' lvalue Var{{.*}} 'ptr' 'T *'
+  // CHECK-NEXT: CallExpr{{.*}} 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int ()' lvalue Function{{.*}} 'returns_3' 'int ()'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'unsigned int' NonTypeTemplateParm{{.*}} 'I' 'unsigned int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(ptr[i:i])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'T *' lvalue Var{{.*}} 'ptr' 'T *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(CEArray[returns_3() - 2: I])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'auto' lvalue NonTypeTemplateParm{{.*}} 'CEArray' 'auto &'
+  // CHECK-NEXT: BinaryOperator{{.*}} 'int' '-'
+  // CHECK-NEXT: CallExpr{{.*}} 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int ()' lvalue Function{{.*}} 'returns_3' 'int ()'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 2
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'unsigned int' NonTypeTemplateParm{{.*}} 'I' 'unsigned int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(CEArray[: I])
+  while (true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'auto' lvalue NonTypeTemplateParm{{.*}} 'CEArray' 'auto &'
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'unsigned int' NonTypeTemplateParm{{.*}} 'I' 'unsigned int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+
+  // Instantiation:
+  // CHECK-NEXT: FunctionDecl{{.*}} Templ 'void (int)' implicit_instantiation
+  // CHECK-NEXT: TemplateArgument{{.*}} 'int'
+  // CHECK-NEXT: BuiltinType{{.*}} 'int'
+  // CHECK-NEXT: TemplateArgument integral 3
+  // CHECK-NEXT: TemplateArgument decl
+  // CHECK-NEXT: Var{{.*}} 'CEArray' 'const int[5]'
+  // CHECK-NEXT: ParmVarDecl{{.*}} i 'int'
+  // CHECK-NEXT: CompoundStmt
+
+  // T array[I+2];
+  // CHECK-NEXT: DeclStmt
+  // CHECK-NEXT: VarDecl{{.*}} array 'int[5]'
+  // T VLA[i];
+  // CHECK-NEXT: DeclStmt
+  // CHECK-NEXT: VarDecl{{.*}} VLA 'int[i]'
+  // T *ptr;
+  // CHECK-NEXT: DeclStmt
+  // CHECK-NEXT: VarDecl{{.*}} ptr 'int *'
+
+//#pragma acc parallel private(array[returns_3():])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int[5]' lvalue Var{{.*}} 'array' 'int[5]'
+  // CHECK-NEXT: CallExpr{{.*}} 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int ()' lvalue Function{{.*}}'returns_3' 'int ()'
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel private(array[:I])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int[5]' lvalue Var{{.*}} 'array' 'int[5]'
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}} 'unsigned int'
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'unsigned int' depth 0 index 1 I
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'unsigned int' 3
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel private(array[returns_3()-2:I])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int[5]' lvalue Var{{.*}} 'array' 'int[5]'
+  // CHECK-NEXT: BinaryOperator{{.*}} 'int' '-'
+  // CHECK-NEXT: CallExpr{{.*}} 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int ()' lvalue Function{{.*}} 'returns_3' 'int ()'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 2
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}} 'unsigned int'
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'unsigned int' depth 0 index 1 I
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'unsigned int' 3
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel private(array[i:i])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int[5]' lvalue Var{{.*}} 'array' 'int[5]'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel private(VLA[:I])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int[i]' lvalue Var{{.*}} 'VLA' 'int[i]'
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}} 'unsigned int'
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'unsigned int' depth 0 index 1 I
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'unsigned int' 3
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel private(VLA[returns_3():I])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int[i]' lvalue Var{{.*}} 'VLA' 'int[i]'
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int ()' lvalue Function{{.*}} 'returns_3' 'int ()'
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}} 'unsigned int'
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'unsigned int' depth 0 index 1 I
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'unsigned int' 3
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel private(VLA[i:i])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int[i]' lvalue Var{{.*}} 'VLA' 'int[i]'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel private(ptr[:I])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int *' lvalue Var{{.*}} 'ptr' 'int *'
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}} 'unsigned int'
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'unsigned int' depth 0 index 1 I
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'unsigned int' 3
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel private(ptr[returns_3():I])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int *' lvalue Var{{.*}} 'ptr' 'int *'
+  // CHECK-NEXT: CallExpr{{.*}} 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int ()' lvalue Function{{.*}} 'returns_3' 'int ()'
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}} 'unsigned int'
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'unsigned int' depth 0 index 1 I
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'unsigned int' 3
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel private(ptr[i:i])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int *' lvalue Var{{.*}} 'ptr' 'int *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'i' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel private(CEArray[returns_3() - 2: I])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'const int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}} 'const int[5]' lvalue
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'auto &' depth 0 index 2 CEArray
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'const int[5]' lvalue Var{{.*}}'CEArray' 'const int[5]'
+  // CHECK-NEXT: BinaryOperator{{.*}} 'int' '-'
+  // CHECK-NEXT: CallExpr{{.*}} 'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int (*)()' <FunctionToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'int ()' lvalue Function{{.*}} 'returns_3' 'int ()'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 2
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}} 'unsigned int'
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'unsigned int' depth 0 index 1 I
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'unsigned int' 3
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel private(CEArray[: I])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'const int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}} 'const int[5]' lvalue
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'auto &' depth 0 index 2 CEArray
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'const int[5]' lvalue Var{{.*}}'CEArray' 'const int[5]'
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}} 'unsigned int'
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'unsigned int' depth 0 index 1 I
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'unsigned int' 3
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+}
+
+// CHECK-NEXT: FunctionDecl{{.*}}inst
+void inst() {
+  static constexpr int CEArray[5]={1,2,3,4,5};
+  Templ<int, 3, CEArray>(5);
+}
+#endif
diff --git a/clang/test/SemaOpenACC/sub-array.cpp b/clang/test/SemaOpenACC/sub-array.cpp
new file mode 100644
index 000000000000..355ac5ef1d3c
--- /dev/null
+++ b/clang/test/SemaOpenACC/sub-array.cpp
@@ -0,0 +1,208 @@
+// RUN: %clang_cc1 %s -verify -fopenacc
+
+struct Incomplete; // #INCOMPLETE
+struct NotConvertible{} NC;
+
+struct CorrectConvert {
+  operator int();
+} Convert;
+
+constexpr int returns_3() { return 3; }
+
+using FuncPtrTy = void (*)();
+FuncPtrTy FuncPtrTyArray[2];
+
+void Func(int i, int j) {
+  int array[5];
+  int VLA[i];
+  int *ptr;
+  void *void_ptr;
+
+  // Follows int-expr rules, so only convertible to int.
+  // expected-error@+1{{OpenACC sub-array bound requires expression of integer type ('struct NotConvertible' invalid}}
+#pragma acc parallel private(array[NC:])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array bound requires expression of integer type ('struct NotConvertible' invalid}}
+#pragma acc parallel private(array[:NC])
+  while (true);
+
+  // expected-error@+2{{OpenACC sub-array bound requires expression of integer type ('struct NotConvertible' invalid}}
+  // expected-error@+1{{OpenACC sub-array bound requires expression of integer type ('struct NotConvertible' invalid}}
+#pragma acc parallel private(array[NC:NC])
+  while (true);
+
+  // expected-error@+2{{OpenACC sub-array bound requires expression of integer type ('struct NotConvertible' invalid}}
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel private(ptr[NC:])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array bound requires expression of integer type ('struct NotConvertible' invalid}}
+#pragma acc parallel private(ptr[:NC])
+  while (true);
+
+  // expected-error@+2{{OpenACC sub-array bound requires expression of integer type ('struct NotConvertible' invalid}}
+  // expected-error@+1{{OpenACC sub-array bound requires expression of integer type ('struct NotConvertible' invalid}}
+#pragma acc parallel private(ptr[NC:NC])
+  while (true);
+
+  // These are convertible, so they work.
+#pragma acc parallel private(array[Convert:Convert])
+  while (true);
+
+#pragma acc parallel private(ptr[Convert:Convert])
+  while (true);
+
+
+  // The length for "dynamically" allocated dimensions of an array must be
+  // explicitly specified.
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel private(ptr[3:])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is an array of unknown bound}}
+#pragma acc parallel private(VLA[3:])
+  while (true);
+
+#pragma acc parallel private(ptr[:3])
+  while (true);
+
+#pragma acc parallel private(VLA[:3])
+  while (true);
+
+  // Error if the length of the array + the initializer is bigger the the array
+  // with known bounds.
+
+  // expected-error@+1{{OpenACC sub-array length evaluated to a value (6) that would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel private(array[i:returns_3() + 3])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array length evaluated to a value (6) that would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel private(array[:returns_3() + 3])
+  while (true);
+
+#pragma acc parallel private(array[:returns_3()])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array specified range [3:3] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel private(array[returns_3():returns_3()])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array lower bound evaluated to a value (6) that would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel private(array[returns_3() + 3:])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array lower bound evaluated to a value (6) that would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel private(array[returns_3() + 3:1])
+  while (true);
+
+  // Standard doesn't specify this, but negative values are likely not
+  // permitted, so disallow them here until we come up with a good reason to do
+  // otherwise.
+
+  // expected-error@+1{{OpenACC sub-array lower bound evaluated to negative value -1}}
+#pragma acc parallel private(array[returns_3() - 4 : ])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array length evaluated to negative value -1}}
+#pragma acc parallel private(array[: -1])
+  while (true);
+
+  Incomplete *IncompletePtr;
+  // expected-error@+2{{OpenACC sub-array base is of incomplete type 'Incomplete'}}
+  // expected-note@#INCOMPLETE{{forward declaration of 'Incomplete'}}
+#pragma acc parallel private(IncompletePtr[0 :1])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array base is of incomplete type 'void'}}
+#pragma acc parallel private(void_ptr[0:1])
+  while (true);
+
+  // OK: these are function pointers.
+#pragma acc parallel private(FuncPtrTyArray[0 :1])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array cannot be of function type 'void ()'}}
+#pragma acc parallel private(FuncPtrTyArray[0][0 :1])
+  while (true);
+
+
+  // expected-error@+1{{OpenACC sub-array subscripted value is not an array or pointer}}
+#pragma acc parallel private(i[0:1])
+  while (true);
+}
+
+template<typename T, typename U, typename V, unsigned I, auto &CEArray>
+void Templ(int i){
+  T array[I];
+  T VLA[i];
+  T *ptr;
+  U NC;
+  V Conv;
+
+  // Convertible:
+  // expected-error@+2{{OpenACC sub-array bound requires expression of integer type ('NotConvertible' invalid}}
+  // expected-note@#INST{{in instantiation of function template specialization}}
+#pragma acc parallel private(array[NC:])
+  while (true);
+  // expected-error@+1{{OpenACC sub-array bound requires expression of integer type ('NotConvertible' invalid}}
+#pragma acc parallel private(array[:NC])
+  while (true);
+
+#pragma acc parallel private(array[Conv:])
+  while (true);
+#pragma acc parallel private(array[:Conv])
+  while (true);
+
+  // Need a length for unknown size.
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel private(ptr[Conv:])
+  while (true);
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is an array of unknown bound}}
+#pragma acc parallel private(VLA[Conv:])
+  while (true);
+#pragma acc parallel private(ptr[:Conv])
+  while (true);
+#pragma acc parallel private(VLA[:Conv])
+  while (true);
+
+  // Out of bounds.
+  // expected-error@+1{{OpenACC sub-array lower bound evaluated to a value (2) that would be out of the range of the subscripted array size of 2}}
+#pragma acc parallel private(array[I:])
+  while (true);
+
+  // OK, don't know the value.
+#pragma acc parallel private(array[i:])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array length evaluated to a value (3) that would be out of the range of the subscripted array size of 2}}
+#pragma acc parallel private(array[:I + 1])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array lower bound evaluated to a value (5) that would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel private(CEArray[5:])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array length evaluated to a value (6) that would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel private(CEArray[:2 + I + I])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array length evaluated to a value (4294967295) that would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel private(CEArray[:1 - I])
+  while (true);
+
+  // expected-error@+1{{OpenACC sub-array lower bound evaluated to a value (4294967295) that would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel private(CEArray[1 - I:])
+  while (true);
+
+  T not_ptr;
+  // expected-error@+1{{OpenACC sub-array subscripted value is not an array or pointer}}
+#pragma acc parallel private(not_ptr[0:1])
+  while (true);
+}
+
+void inst() {
+  static constexpr int CEArray[5]={1,2,3,4,5};
+  Templ<int, NotConvertible, CorrectConvert, 2, CEArray>(5); // #INST
+}
diff --git a/clang/test/SemaOpenCL/as_type.cl b/clang/test/SemaOpenCL/as_type.cl
index ece216047481..95f782400f6d 100644
--- a/clang/test/SemaOpenCL/as_type.cl
+++ b/clang/test/SemaOpenCL/as_type.cl
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -emit-llvm -DBITS=32 -triple spir-unknown-unknown -finclude-default-header -fdeclare-opencl-builtins -o - -verify -fsyntax-only
-// RUN: %clang_cc1 %s -emit-llvm -DBITS=64 -triple spir64-unknown-unknown -finclude-default-header -fdeclare-opencl-builtins -o - -verify -fsyntax-only
+// RUN: %clang_cc1 %s -DBITS=32 -triple spir-unknown-unknown -finclude-default-header -fdeclare-opencl-builtins -o - -verify -fsyntax-only
+// RUN: %clang_cc1 %s -DBITS=64 -triple spir64-unknown-unknown -finclude-default-header -fdeclare-opencl-builtins -o - -verify -fsyntax-only
 
 char3 f1(char16 x) {
   return  __builtin_astype(x, char3); // expected-error{{invalid reinterpretation: sizes of 'char3' (vector of 3 'char' values) and '__private char16' (vector of 16 'char' values) must match}}
diff --git a/clang/test/SemaOpenCL/multistep-explicit-cast.cl b/clang/test/SemaOpenCL/multistep-explicit-cast.cl
index 5e3d12a0e449..d5ec9d9deb10 100644
--- a/clang/test/SemaOpenCL/multistep-explicit-cast.cl
+++ b/clang/test/SemaOpenCL/multistep-explicit-cast.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -ast-dump %s | FileCheck %s
 // expected-no-diagnostics
 
 typedef __attribute__((ext_vector_type(2)))  char char2;
diff --git a/clang/test/SemaTemplate/ctad.cpp b/clang/test/SemaTemplate/ctad.cpp
index ec144d4f44ba..e981ea8d5ecf 100644
--- a/clang/test/SemaTemplate/ctad.cpp
+++ b/clang/test/SemaTemplate/ctad.cpp
@@ -54,3 +54,18 @@ template<class T, class B> struct Y { Y(T); };
 template<class T, class B=void> struct Y ;
 Y y(1);
 }
+
+namespace NoCrashOnGettingDefaultArgLoc {
+template <typename>
+class A {
+  A(int = 1); // expected-note {{candidate template ignored: couldn't infer template argumen}}
+};
+class C : A<int> {
+  using A::A;
+};
+template <typename>
+class D : C { // expected-note {{candidate function template not viable: requires 1 argument}}
+  using C::C;
+};
+D abc; // expected-error {{no viable constructor or deduction guide}}
+}
diff --git a/clang/test/SemaTemplate/cwg2398.cpp b/clang/test/SemaTemplate/cwg2398.cpp
new file mode 100644
index 000000000000..a20155486b12
--- /dev/null
+++ b/clang/test/SemaTemplate/cwg2398.cpp
@@ -0,0 +1,139 @@
+// RUN: %clang_cc1 %s -fsyntax-only -std=c++23                                     -verify=expected,new
+// RUN: %clang_cc1 %s -fsyntax-only -std=c++23 -fno-relaxed-template-template-args -verify=expected,old
+
+namespace issue1 {
+  template<class T, class U = T> class B {};
+  template<template<class> class P, class T> void f(P<T>);
+  // new-note@-1 {{deduced type 'B<[...], (default) int>' of 1st parameter does not match adjusted type 'B<[...], float>' of argument [with P = issue1::B, T = int]}}
+  // old-note@-2 2{{template template argument has different template parameters}}
+
+  void g() {
+    f(B<int>()); // old-error {{no matching function for call}}
+    f(B<int,float>()); // expected-error {{no matching function for call}}
+  }
+} // namespace issue1
+
+namespace issue2 {
+  template<typename> struct match;
+
+  template<template<typename> class t,typename T> struct match<t<T>>;
+
+  template<template<typename,typename> class t,typename T0,typename T1>
+  struct match<t<T0,T1>> {};
+
+  template<typename,typename = void> struct other {};
+  template struct match<other<void,void>>;
+} // namespace issue2
+
+namespace type {
+  template<class T1, class T2 = float> struct A;
+
+  template<class T3> struct B;
+  template<template<class T4          > class TT1, class T5          > struct B<TT1<T5    >>   ;
+  template<template<class T6, class T7> class TT2, class T8, class T9> struct B<TT2<T8, T9>> {};
+  template struct B<A<int>>;
+} // namespace type
+
+namespace value {
+  template<class T1, int V1 = 1> struct A;
+
+  template<class T2> struct B;
+  template<template<class T3        > class TT1, class T4        > struct B<TT1<T4    >>   ;
+  template<template<class T5, int V2> class TT2, class T6, int V3> struct B<TT2<T6, V3>> {};
+  template struct B<A<int>>;
+} // namespace value
+
+namespace templ {
+  template <class T1> struct A;
+
+  template<class T2, template <class T3> class T4 = A> struct B {};
+
+  template<class T5> struct C;
+
+  template<template<class T6> class TT1, class T7> struct C<TT1<T7>>;
+
+  template<template<class T8, template <class T9> class> class TT2,
+    class T10, template <class T11> class TT3>
+  struct C<TT2<T10, TT3>> {};
+
+  template struct C<B<int>>;
+} // namespace templ
+
+namespace type_pack1 {
+  template<class T2> struct A;
+  template<template<class ...T3s> class TT1, class T4> struct A<TT1<T4>>   ;
+  // new-note@-1 {{partial specialization matches}}
+  template<template<class    T5 > class TT2, class T6> struct A<TT2<T6>> {};
+  // new-note@-1 {{partial specialization matches}}
+
+  template<class T1> struct B;
+  template struct A<B<char>>;
+  // new-error@-1 {{ambiguous partial specialization}}
+} // namespace type_pack1
+
+namespace type_pack2 {
+  template<class T2> struct A;
+  template<template<class ...T3s> class TT1, class ...T4> struct A<TT1<T4...>>   ;
+  // new-note@-1 {{partial specialization matches}}
+  template<template<class    T5 > class TT2, class ...T6> struct A<TT2<T6...>> {};
+  // new-note@-1 {{partial specialization matches}}
+
+  template<class T1> struct B;
+  template struct A<B<char>>;
+  // new-error@-1 {{ambiguous partial specialization}}
+} // namespace type_pack2
+
+namespace type_pack3 {
+  template<class T1, class T2 = float> struct A;
+
+  template<class T3> struct B;
+
+  template<template<class T4              > class TT1, class T5              > struct B<TT1<T5        >>;
+  // new-note@-1 {{template is declared here}}
+  template<template<class T6, class ...T7s> class TT2, class T8, class ...T9s> struct B<TT2<T8, T9s...>>;
+  // old-note@-1 {{template is declared here}}
+
+  template struct B<A<int>>;
+  // expected-error@-1 {{explicit instantiation of undefined template}}
+} // namespace type_pack3
+
+namespace gcc_issue {
+  template<class T1, class T2> struct A;
+
+  template<template<class T1> class TT1, class T2> struct A<TT1<T2>, typename TT1<T2>::type>;
+  // new-note@-1 {{partial specialization matches}}
+
+  template<template<class T3, class T4> class TT2, class T5, class T6>
+  struct A<TT2<T5, T6>, typename TT2<T5, T5>::type>;
+  // new-note@-1 {{partial specialization matches}}
+  // old-note@-2 {{template is declared here}}
+
+  template <class T7, class T8 = T7> struct B { using type = int; };
+
+  template struct A<B<int>, int>;
+  // new-error@-1 {{ambiguous partial specializations}}
+  // old-error@-2 {{explicit instantiation of undefined template}}
+} // namespace gcc_issue
+
+namespace ttp_defaults {
+  template <template <class T1> class TT1> struct A {};
+  // old-note@-1 2{{previous template template parameter}}
+
+  template <template <class T2> class TT2> void f(A<TT2>);
+  // new-note@-1 {{explicit instantiation candidate}}
+  // old-note@-2 {{invalid explicitly-specified argument for template parameter 'TT2'}}
+
+  // FIXME: The default arguments on the TTP are not available during partial ordering.
+  template <template <class T3, class T4 = float> class TT3> void f(A<TT3>) {};
+  // new-note@-1 {{explicit instantiation candidate}}
+  // old-error@-2 {{template template argument has different template parameters}}
+  // old-note@-3 {{too many template parameters}}
+
+  template <class T5, class T6 = int> struct B;
+  // old-note@-1 {{too many template parameters}}
+
+  template void f<B>(A<B>);
+  // new-error@-1 {{partial ordering for explicit instantiation of 'f' is ambiguous}}
+  // old-error@-2 {{template template argument has different template parameters}}
+  // old-error@-3 {{explicit instantiation of 'f' does not refer to a function template}}
+} // namespace ttp_defaults
diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp
index ff5e39216762..51e1eb49c5de 100644
--- a/clang/test/SemaTemplate/deduction-guide.cpp
+++ b/clang/test/SemaTemplate/deduction-guide.cpp
@@ -261,6 +261,13 @@ AG ag = {1};
 // CHECK:   | `-BuiltinType {{.*}} 'int'
 // CHECK:   `-ParmVarDecl {{.*}} 'int'
 
+template <typename X = int>
+using BG = G<int>;
+BG bg(1.0);
+// CHECK-LABEL: Dumping <deduction guide for BG>
+// CHECK: FunctionTemplateDecl {{.*}} implicit <deduction guide for BG>
+// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (int) -> G<int>' aggregate
+
 template <typename D>
 requires (sizeof(D) == 4)
 struct Foo {
diff --git a/clang/test/SemaTemplate/default-arguments-ast-print.cpp b/clang/test/SemaTemplate/default-arguments-ast-print.cpp
index 4623f0a8cdf4..1491df40588b 100644
--- a/clang/test/SemaTemplate/default-arguments-ast-print.cpp
+++ b/clang/test/SemaTemplate/default-arguments-ast-print.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -ast-print %s | FileCheck %s
 
 template <typename T, typename U = double> class Foo;
 
diff --git a/clang/test/SemaTemplate/default-arguments.cpp b/clang/test/SemaTemplate/default-arguments.cpp
index a850d273ccba..d5d9687cc90f 100644
--- a/clang/test/SemaTemplate/default-arguments.cpp
+++ b/clang/test/SemaTemplate/default-arguments.cpp
@@ -112,15 +112,14 @@ template<typename T, template<typename> class X = T::template apply>
 int array4[is_same<X4<add_pointer>, 
                    X4<add_pointer, add_pointer::apply> >::value? 1 : -1];
 
-template<int> struct X5 {}; // expected-note{{has a different type 'int'}}
+template<int> struct X5 {};
 template<long> struct X5b {};
 template<typename T, 
-         template<T> class B = X5> // expected-error{{template template argument has different}} \
-                                   // expected-note{{previous non-type template parameter}}
+         template<T> class B = X5>
   struct X6 {};
 
 X6<int> x6a;
-X6<long> x6b; // expected-note{{while checking a default template argument}}
+X6<long> x6b;
 X6<long, X5b> x6c;
 
 
diff --git a/clang/test/SemaTemplate/default-expr-arguments-3.cpp b/clang/test/SemaTemplate/default-expr-arguments-3.cpp
index 4d04209e110b..09fb7b290a1a 100644
--- a/clang/test/SemaTemplate/default-expr-arguments-3.cpp
+++ b/clang/test/SemaTemplate/default-expr-arguments-3.cpp
@@ -1,55 +1,55 @@
-// RUN: %clang_cc1 -std=c++14 -verify -ast-dump %s | FileCheck %s
-// expected-no-diagnostics
-
-// CHECK: FunctionDecl {{.*}} used func 'void ()'
-// CHECK-NEXT: TemplateArgument type 'int'
-// CHECK: LambdaExpr {{.*}} '(lambda at
-// CHECK: ParmVarDecl {{.*}} used f 'foo' cinit
-// CHECK-NEXT: DeclRefExpr {{.*}} 'foo' EnumConstant {{.*}} 'a' 'foo'
-
-namespace PR28795 {
-  template<typename T>
-  void func() {
-    enum class foo { a, b };
-    auto bar = [](foo f = foo::a) { return f; };
-    bar();
-  }
-
-  void foo() {
-    func<int>();
-  }
-}
-
-// CHECK: ClassTemplateSpecializationDecl {{.*}} struct class2 definition
-// CHECK: TemplateArgument type 'int'
-// CHECK: LambdaExpr {{.*}} '(lambda at
-// CHECK: ParmVarDecl {{.*}} used f 'foo' cinit
-// CHECK-NEXT: DeclRefExpr {{.*}} 'foo' EnumConstant {{.*}} 'a' 'foo'
-
-// Template struct case:
-template <class T> struct class2 {
-  void bar() {
-    enum class foo { a, b };
-    [](foo f = foo::a) { return f; }();
-  }
-};
-
-template struct class2<int>;
-
-// CHECK: FunctionTemplateDecl {{.*}} f1
-// CHECK-NEXT: TemplateTypeParmDecl {{.*}} typename depth 0 index 0 T
-// CHECK-NEXT: FunctionDecl {{.*}} f1 'void ()'
-// CHECK: FunctionDecl {{.*}} f1 'void ()'
-// CHECK-NEXT: TemplateArgument type 'int'
-// CHECK: ParmVarDecl {{.*}} n 'foo' cinit
-// CHECK-NEXT: DeclRefExpr {{.*}} 'foo' EnumConstant {{.*}} 'a' 'foo'
-
-template<typename T>
-void f1() {
-  enum class foo { a, b };
-  struct S {
-    int g1(foo n = foo::a);
-  };
-}
-
-template void f1<int>();
+// RUN: %clang_cc1 -std=c++14 -verify -ast-dump %s | FileCheck %s
+// expected-no-diagnostics
+
+// CHECK: FunctionDecl {{.*}} used func 'void ()'
+// CHECK-NEXT: TemplateArgument type 'int'
+// CHECK: LambdaExpr {{.*}} '(lambda at
+// CHECK: ParmVarDecl {{.*}} used f 'foo' cinit
+// CHECK-NEXT: DeclRefExpr {{.*}} 'foo' EnumConstant {{.*}} 'a' 'foo'
+
+namespace PR28795 {
+  template<typename T>
+  void func() {
+    enum class foo { a, b };
+    auto bar = [](foo f = foo::a) { return f; };
+    bar();
+  }
+
+  void foo() {
+    func<int>();
+  }
+}
+
+// CHECK: ClassTemplateSpecializationDecl {{.*}} struct class2 definition
+// CHECK: TemplateArgument type 'int'
+// CHECK: LambdaExpr {{.*}} '(lambda at
+// CHECK: ParmVarDecl {{.*}} used f 'foo' cinit
+// CHECK-NEXT: DeclRefExpr {{.*}} 'foo' EnumConstant {{.*}} 'a' 'foo'
+
+// Template struct case:
+template <class T> struct class2 {
+  void bar() {
+    enum class foo { a, b };
+    [](foo f = foo::a) { return f; }();
+  }
+};
+
+template struct class2<int>;
+
+// CHECK: FunctionTemplateDecl {{.*}} f1
+// CHECK-NEXT: TemplateTypeParmDecl {{.*}} typename depth 0 index 0 T
+// CHECK-NEXT: FunctionDecl {{.*}} f1 'void ()'
+// CHECK: FunctionDecl {{.*}} f1 'void ()'
+// CHECK-NEXT: TemplateArgument type 'int'
+// CHECK: ParmVarDecl {{.*}} n 'foo' cinit
+// CHECK-NEXT: DeclRefExpr {{.*}} 'foo' EnumConstant {{.*}} 'a' 'foo'
+
+template<typename T>
+void f1() {
+  enum class foo { a, b };
+  struct S {
+    int g1(foo n = foo::a);
+  };
+}
+
+template void f1<int>();
diff --git a/clang/test/SemaTemplate/instantiate-friend-function.cpp b/clang/test/SemaTemplate/instantiate-friend-function.cpp
index 1a923a2e92e0..1b14068599d9 100644
--- a/clang/test/SemaTemplate/instantiate-friend-function.cpp
+++ b/clang/test/SemaTemplate/instantiate-friend-function.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
-// RUN: %clang_cc1 -S -triple %itanium_abi_triple -std=c++11 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -emit-llvm %s -o - | FileCheck %s
 // expected-no-diagnostics
 
 namespace PR10856 {
diff --git a/clang/test/SemaTemplate/instantiate-template-template-parm.cpp b/clang/test/SemaTemplate/instantiate-template-template-parm.cpp
index a70c7e8b081a..39aeeb1c1a6a 100644
--- a/clang/test/SemaTemplate/instantiate-template-template-parm.cpp
+++ b/clang/test/SemaTemplate/instantiate-template-template-parm.cpp
@@ -20,30 +20,29 @@ apply<add_reference, int>::type ir = i;
 apply<add_reference, float>::type fr = i; // expected-error{{non-const lvalue reference to type 'float' cannot bind to a value of unrelated type 'int'}}
 
 // Template template parameters
-template<int> struct B; // expected-note{{has a different type 'int'}}
+template<int> struct B;
 
-template<typename T, 
-         template<T Value> class X> // expected-error{{cannot have type 'float'}} \
-                                    // expected-note{{with type 'long'}}
+template<typename T,
+         template<T Value> class X> // expected-error{{cannot have type 'float'}}
 struct X0 { };
 
 X0<int, B> x0b1;
 X0<float, B> x0b2; // expected-note{{while substituting}}
-X0<long, B> x0b3; // expected-error{{template template argument has different template parameters}}
+X0<long, B> x0b3;
 
-template<template<int V> class TT> // expected-note{{parameter with type 'int'}}
+template<template<int V> class TT>
 struct X1 { };
 
 template<typename T, template<T V> class TT>
 struct X2 {
-  X1<TT> x1; // expected-error{{has different template parameters}}
+  X1<TT> x1;
 };
 
 template<int V> struct X3i { };
-template<long V> struct X3l { }; // expected-note{{different type 'long'}}
+template<long V> struct X3l { };
 
 X2<int, X3i> x2okay;
-X2<long, X3l> x2bad; // expected-note{{instantiation}}
+X2<long, X3l> x2okay2;
 
 template <typename T, template <T, T> class TT, class R = TT<1, 2> >
 struct Comp {
diff --git a/clang/test/SemaTemplate/instantiation-depth-default.cpp b/clang/test/SemaTemplate/instantiation-depth-default.cpp
index f5835b86b3a3..5934d4e542ee 100644
--- a/clang/test/SemaTemplate/instantiation-depth-default.cpp
+++ b/clang/test/SemaTemplate/instantiation-depth-default.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -ftemplate-backtrace-limit=2 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -ftemplate-backtrace-limit=2 %if {{ubsan}} %{ -Wno-stack-exhausted %} %s
 //
 // FIXME: Disable this test when Clang was built with ASan, because ASan
 // increases our per-frame stack usage enough that this test no longer fits
diff --git a/clang/test/SemaTemplate/make_integer_seq.cpp b/clang/test/SemaTemplate/make_integer_seq.cpp
index 644bf41f8614..3a692f5ae2bf 100644
--- a/clang/test/SemaTemplate/make_integer_seq.cpp
+++ b/clang/test/SemaTemplate/make_integer_seq.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++23 -fsyntax-only -triple x86_64-linux-gnu -ast-dump -verify -xc++ < %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++23 -triple x86_64-linux-gnu -ast-dump -verify -xc++ < %s | FileCheck %s
 
 template <class A1, A1... A2> struct A {};
 
diff --git a/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp b/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp
index 38b6706595a1..f289dc045286 100644
--- a/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp
+++ b/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp
@@ -84,3 +84,17 @@ nested_init_list<int>::concept_fail nil_invalid{1, ""};
 // expected-note@#INIT_LIST_INNER_INVALID {{candidate template ignored: substitution failure [with F = const char *]: constraints not satisfied for class template 'concept_fail' [with F = const char *]}}
 // expected-note@#INIT_LIST_INNER_INVALID {{candidate function template not viable: requires 1 argument, but 2 were provided}}
 // expected-note@#INIT_LIST_INNER_INVALID {{candidate function template not viable: requires 0 arguments, but 2 were provided}}
+
+namespace GH88142 {
+
+template <typename, typename...> struct X {
+  template <typename> struct Y {
+    template <typename T> Y(T) {}
+  };
+
+  template <typename T> Y(T) -> Y<T>;
+};
+
+X<int>::Y y(42);
+
+} // namespace PR88142
diff --git a/clang/test/SemaTemplate/nested-template.cpp b/clang/test/SemaTemplate/nested-template.cpp
index efbde2076b9f..5bd388d4dff3 100644
--- a/clang/test/SemaTemplate/nested-template.cpp
+++ b/clang/test/SemaTemplate/nested-template.cpp
@@ -112,18 +112,16 @@ template struct X1<int>::B<bool>;
 // Template template parameters
 template<typename T>
 struct X2 {
-  template<template<class U, T Value> class>  // expected-error{{cannot have type 'float'}} \
-                                              // expected-note{{previous non-type template}}
+  template<template<class U, T Value> class>  // expected-error{{cannot have type 'float'}}
     struct Inner { };
 };
 
-template<typename T, 
-         int Value> // expected-note{{template non-type parameter}}
+template<typename T, int Value>
   struct X2_arg;
 
 X2<int>::Inner<X2_arg> x2i1;
 X2<float> x2a; // expected-note{{instantiation}}
-X2<long>::Inner<X2_arg> x2i3; // expected-error{{template template argument has different}}
+X2<long>::Inner<X2_arg> x2i3;
 
 namespace PR10896 {
   template<typename TN>
diff --git a/clang/test/SemaTemplate/pr47676.cpp b/clang/test/SemaTemplate/pr47676.cpp
index b62b62ca2b7f..536cea785c82 100644
--- a/clang/test/SemaTemplate/pr47676.cpp
+++ b/clang/test/SemaTemplate/pr47676.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple=powerpc64le-unknown-linux-gnu \
-// RUN:            -target-feature +altivec -fsyntax-only -ast-dump \
+// RUN:            -target-feature +altivec -ast-dump \
 // RUN:            -xc++ < %s \
 // RUN:   | FileCheck %s
 
diff --git a/clang/test/SemaTemplate/temp_arg_enum_printing.cpp b/clang/test/SemaTemplate/temp_arg_enum_printing.cpp
index 5a486edd2d30..bf343af96359 100644
--- a/clang/test/SemaTemplate/temp_arg_enum_printing.cpp
+++ b/clang/test/SemaTemplate/temp_arg_enum_printing.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -ast-print %s | FileCheck %s
 
 namespace NamedEnumNS
 {
diff --git a/clang/test/SemaTemplate/temp_arg_enum_printing_more.cpp b/clang/test/SemaTemplate/temp_arg_enum_printing_more.cpp
index a3a7158412e4..b7ed28dfa00c 100644
--- a/clang/test/SemaTemplate/temp_arg_enum_printing_more.cpp
+++ b/clang/test/SemaTemplate/temp_arg_enum_printing_more.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -ast-print %s -std=c++11 | FileCheck %s
+// RUN: %clang_cc1 -ast-print %s -std=c++11 | FileCheck %s
 
 // Make sure that for template value arguments that are unscoped enumerators,
 // no qualified enum information is included in their name, as their visibility
diff --git a/clang/test/SemaTemplate/temp_arg_string_printing.cpp b/clang/test/SemaTemplate/temp_arg_string_printing.cpp
index 2851e4f84ce2..60e89bb09d75 100644
--- a/clang/test/SemaTemplate/temp_arg_string_printing.cpp
+++ b/clang/test/SemaTemplate/temp_arg_string_printing.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++20 -fsyntax-only -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++20 -ast-print %s | FileCheck %s
 
 using size_t = __SIZE_TYPE__;
 static_assert(__has_builtin(__make_integer_seq));
diff --git a/clang/test/SemaTemplate/temp_arg_template.cpp b/clang/test/SemaTemplate/temp_arg_template.cpp
index 3c2697329212..a7236669276a 100644
--- a/clang/test/SemaTemplate/temp_arg_template.cpp
+++ b/clang/test/SemaTemplate/temp_arg_template.cpp
@@ -5,11 +5,11 @@ template<template<typename T> class X> struct A; // expected-note 2{{previous te
 
 template<template<typename T, int I> class X> struct B; // expected-note{{previous template template parameter is here}}
 
-template<template<int I> class X> struct C;  // expected-note 2{{previous non-type template parameter with type 'int' is here}}
+template<template<int I> class X> struct C;  // expected-note {{previous non-type template parameter with type 'int' is here}}
 
 template<class> struct X; // expected-note{{too few template parameters in template template argument}}
 template<int N> struct Y; // expected-note{{template parameter has a different kind in template argument}}
-template<long N> struct Ylong; // expected-note{{template non-type parameter has a different type 'long' in template argument}}
+template<long N> struct Ylong;
 template<const int &N> struct Yref; // expected-note{{template non-type parameter has a different type 'const int &' in template argument}}
 
 namespace N {
@@ -26,7 +26,7 @@ A<Y> *a4; // expected-error{{template template argument has different template p
 A<TooMany> *a5; // expected-error{{template template argument has different template parameters than its corresponding template template parameter}}
 B<X> *a6; // expected-error{{template template argument has different template parameters than its corresponding template template parameter}}
 C<Y> *a7;
-C<Ylong> *a8; // expected-error{{template template argument has different template parameters than its corresponding template template parameter}}
+C<Ylong> *a8;
 C<Yref> *a9; // expected-error{{template template argument has different template parameters than its corresponding template template parameter}}
 
 template<typename T> void f(int);
diff --git a/clang/test/SemaTemplate/temp_arg_template_cxx1z.cpp b/clang/test/SemaTemplate/temp_arg_template_cxx1z.cpp
index 03ef78f8cf14..372a00efc601 100644
--- a/clang/test/SemaTemplate/temp_arg_template_cxx1z.cpp
+++ b/clang/test/SemaTemplate/temp_arg_template_cxx1z.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z -frelaxed-template-template-args %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z %s
 
 // expected-note@temp_arg_template_cxx1z.cpp:* 1+{{}}
 
diff --git a/clang/test/SemaTemplate/temp_arg_type.cpp b/clang/test/SemaTemplate/temp_arg_type.cpp
index 9069f63e0224..cdbcf281125e 100644
--- a/clang/test/SemaTemplate/temp_arg_type.cpp
+++ b/clang/test/SemaTemplate/temp_arg_type.cpp
@@ -11,7 +11,7 @@ A<0> *a1; // expected-error{{template argument for template type parameter must
 A<A> *a2; // expected-error{{use of class template 'A' requires template arguments}}
 
 A<int> *a3;
-A<int()> *a4; 
+A<int()> *a4;
 A<int(float)> *a5;
 A<A<int> > *a6;
 
@@ -95,15 +95,13 @@ namespace deduce_noexcept {
   template void dep() noexcept(true); // expected-error {{does not refer to a function template}}
   template void dep() noexcept(false); // expected-error {{does not refer to a function template}}
 
-  // FIXME: It's also not clear whether this should be valid: do we substitute
-  // into the function type (including the exception specification) or not?
-  template<typename T> typename T::type1 f() noexcept(T::a);
-  template<typename T> typename T::type2 f() noexcept(T::b) {}
+  template<typename T> typename T::type1 f() noexcept(T::a); // expected-note {{candidate}}
+  template<typename T> typename T::type2 f() noexcept(T::b) {} // expected-note {{candidate}}
   struct X {
     static constexpr bool b = true;
     using type1 = void;
     using type2 = void;
   };
-  template void f<X>();
+  template void f<X>(); // expected-error {{partial ordering for explicit instantiation of 'f' is ambiguous}}
 }
 #endif
diff --git a/clang/test/SemaTemplate/template-id-expr.cpp b/clang/test/SemaTemplate/template-id-expr.cpp
index 0555d8b94504..ce40aade9cf1 100644
--- a/clang/test/SemaTemplate/template-id-expr.cpp
+++ b/clang/test/SemaTemplate/template-id-expr.cpp
@@ -186,3 +186,93 @@ class E {
 #endif
 template<typename T> using D = int; // expected-note {{declared here}} 
 E<D> ed; // expected-note {{instantiation of}}
+
+namespace non_functions {
+
+#if __cplusplus >= 201103L
+namespace PR88832 {
+template <typename T> struct O {
+  static const T v = 0;
+};
+
+struct P {
+  template <typename T> using I = typename O<T>::v; // #TypeAlias
+};
+
+struct Q {
+  template <typename T> int foo() {
+    return T::template I<int>;
+    // expected-error@-1 {{'P::I' is expected to be a non-type template, but instantiated to a type alias template}}
+    // expected-note@#TypeAlias {{type alias template declared here}}
+  }
+};
+
+int bar() {
+  return Q().foo<P>(); // expected-note-re {{function template specialization {{.*}} requested here}}
+}
+
+} // namespace PR88832
+#endif
+
+namespace PR63243 {
+
+namespace std {
+template <class T> struct add_pointer { // #add_pointer
+};
+} // namespace std
+
+class A {};
+
+int main() {
+  std::__add_pointer<A>::type ptr;
+  // expected-warning@-1 {{keyword '__add_pointer' will be made available as an identifier here}}
+  // expected-error@-2 {{no template named '__add_pointer'}}
+  // expected-note@#add_pointer {{'add_pointer' declared here}}
+  // expected-error-re@-4 {{no type named 'type' in '{{.*}}std::add_pointer<{{.*}}A>'}}
+
+  __add_pointer<A>::type ptr2;
+  // expected-error@-1 {{no template named '__add_pointer'}}
+  // expected-error-re@-2 {{no type named 'type' in '{{.*}}std::add_pointer<{{.*}}A>'}}
+  // expected-note@#add_pointer {{'std::add_pointer' declared here}}
+}
+
+} // namespace PR63243
+
+namespace PR48673 {
+
+template <typename T> struct C {
+  template <int TT> class Type {}; // #ClassTemplate
+};
+
+template <typename T1> struct A {
+
+  template <typename T2>
+  void foo(T2) {}
+
+  void foo() {
+    C<T1>::template Type<2>;
+    // expected-error@-1 {{'C<float>::Type' is expected to be a non-type template, but instantiated to a class template}}}
+    // expected-note@#ClassTemplate {{class template declared here}}
+
+    foo(C<T1>::Type<2>); // expected-error {{expected expression}}
+
+    foo(C<T1>::template Type<2>);
+    // expected-error@-1 {{'C<float>::Type' is expected to be a non-type template, but instantiated to a class template}}
+    // expected-note@#ClassTemplate {{class template declared here}}
+
+    foo(C<T1>::template Type<2>());
+    // expected-error@-1 {{'C<float>::Type' is expected to be a non-type template, but instantiated to a class template}}
+    // expected-error@-2 {{called object type '<dependent type>' is not a function or function pointer}}
+    // expected-note@#ClassTemplate {{class template declared here}}
+
+    foo(typename C<T1>::template Type<2>());
+  }
+};
+
+void test() {
+  A<float>().foo(); // expected-note-re {{instantiation of member function {{.*}} requested here}}
+}
+
+} // namespace PR48673
+
+}
diff --git a/clang/test/SemaTemplate/template-id-printing.cpp b/clang/test/SemaTemplate/template-id-printing.cpp
index 047589b1ce43..d9fc7201eee0 100644
--- a/clang/test/SemaTemplate/template-id-printing.cpp
+++ b/clang/test/SemaTemplate/template-id-printing.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -ast-print %s | FileCheck %s
 namespace N {
   template<typename T, typename U> void f(U);
   template<int> void f();
diff --git a/clang/test/SemaTemplate/type_pack_element.cpp b/clang/test/SemaTemplate/type_pack_element.cpp
index 9bca846e6659..9e23ef1ff3cf 100644
--- a/clang/test/SemaTemplate/type_pack_element.cpp
+++ b/clang/test/SemaTemplate/type_pack_element.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++23 -fsyntax-only -triple x86_64-linux-gnu -ast-dump -verify -xc++ < %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++23 -triple x86_64-linux-gnu -ast-dump -verify -xc++ < %s | FileCheck %s
 
 using test1 = __type_pack_element<0, int>;
 //      CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <<stdin>:3:1, col:41> col:7 test1 '__type_pack_element<0, int>':'int'
diff --git a/clang/test/VFS/directory.c b/clang/test/VFS/directory.c
index b850ace54c93..6b01f4986c30 100644
--- a/clang/test/VFS/directory.c
+++ b/clang/test/VFS/directory.c
@@ -12,10 +12,10 @@
 
 // 1) Underlying -> Overlay (C.h found, B.h falling back to Underlying)
 // RUN: sed -e "s@INPUT_DIR@%{/t:regex_replacement}/Overlay@g" -e "s@OUT_DIR@%{/t:regex_replacement}/Underlying@g" %S/Inputs/vfsoverlay-directory.yaml > %t/vfs.yaml
-// RUN: %clang_cc1 -Werror -I %t/Underlying -ivfsoverlay %t/vfs.yaml -fsyntax-only -E -C %s 2>&1 | FileCheck --check-prefix=DIRECT %s
-// RUN: %clang_cc1 -Werror -I %t/Underlying -ivfsoverlay %t/vfs.yaml -fsyntax-only -DNESTED -E -C %s 2>&1 | FileCheck --check-prefix=DIRECT %s
+// RUN: %clang_cc1 -Werror -I %t/Underlying -ivfsoverlay %t/vfs.yaml -E -C %s 2>&1 | FileCheck --check-prefix=DIRECT %s
+// RUN: %clang_cc1 -Werror -I %t/Underlying -ivfsoverlay %t/vfs.yaml -DNESTED -E -C %s 2>&1 | FileCheck --check-prefix=DIRECT %s
 // RUN: sed -e "s@INPUT_DIR@Overlay@g" -e "s@OUT_DIR@%{/t:regex_replacement}/Underlying@g" %S/Inputs/vfsoverlay-directory-relative.yaml > %t/vfs-relative.yaml
-// RUN: %clang_cc1 -Werror -I %t/Underlying -ivfsoverlay %t/vfs-relative.yaml -fsyntax-only -E -C %s 2>&1 | FileCheck --check-prefix=DIRECT %s
+// RUN: %clang_cc1 -Werror -I %t/Underlying -ivfsoverlay %t/vfs-relative.yaml -E -C %s 2>&1 | FileCheck --check-prefix=DIRECT %s
 
 // DIRECT: {{^}}// B.h in Underlying
 // DIRECT: {{^}}// C.h in Overlay
@@ -23,21 +23,21 @@
 // 2) Underlying -> Middle -> Overlay (C.h found, B.h falling back to Underlying)
 // RUN: sed -e "s@INPUT_DIR@%{/t:regex_replacement}/Overlay@g" -e "s@OUT_DIR@%{/t:regex_replacement}/Middle@g" %S/Inputs/vfsoverlay-directory.yaml > %t/vfs.yaml
 // RUN: sed -e "s@INPUT_DIR@%{/t:regex_replacement}/Middle@g" -e "s@OUT_DIR@%{/t:regex_replacement}/Underlying@g" %S/Inputs/vfsoverlay-directory.yaml > %t/vfs2.yaml
-// RUN: %clang_cc1 -Werror -I %t/Underlying -ivfsoverlay %t/vfs.yaml -ivfsoverlay %t/vfs2.yaml -fsyntax-only -E -C %s 2>&1 | FileCheck --check-prefix=DIRECT %s
-// RUN: %clang_cc1 -Werror -I %t/Underlying -ivfsoverlay %t/vfs.yaml -ivfsoverlay %t/vfs2.yaml -DNESTED -fsyntax-only -E -C %s 2>&1 | FileCheck --check-prefix=DIRECT %s
+// RUN: %clang_cc1 -Werror -I %t/Underlying -ivfsoverlay %t/vfs.yaml -ivfsoverlay %t/vfs2.yaml -E -C %s 2>&1 | FileCheck --check-prefix=DIRECT %s
+// RUN: %clang_cc1 -Werror -I %t/Underlying -ivfsoverlay %t/vfs.yaml -ivfsoverlay %t/vfs2.yaml -DNESTED -E -C %s 2>&1 | FileCheck --check-prefix=DIRECT %s
 
 // Same as direct above
 
 // 3) Underlying -> Middle -> Overlay (C.h falling back to Middle, B.h falling back to Underlying)
 // RUN: rm -f %t/Overlay/C.h
-// RUN: %clang_cc1 -Werror -I %t/Underlying -ivfsoverlay %t/vfs.yaml -ivfsoverlay %t/vfs2.yaml -fsyntax-only -E -C %s 2>&1 | FileCheck --check-prefix=FALLBACK %s
+// RUN: %clang_cc1 -Werror -I %t/Underlying -ivfsoverlay %t/vfs.yaml -ivfsoverlay %t/vfs2.yaml -E -C %s 2>&1 | FileCheck --check-prefix=FALLBACK %s
 
 // FALLBACK: {{^}}// B.h in Underlying
 // FALLBACK: {{^}}// C.h in Middle
 
 // 3) Underlying -> Middle -> Overlay (C.h falling back to Underlying, B.h falling back to Underlying)
 // RUN: rm -f %t/Middle/C.h
-// RUN: %clang_cc1 -Werror -I %t/Underlying -ivfsoverlay %t/vfs.yaml -ivfsoverlay %t/vfs2.yaml -fsyntax-only -E -C %s 2>&1 | FileCheck --check-prefix=FALLBACK2 %s
+// RUN: %clang_cc1 -Werror -I %t/Underlying -ivfsoverlay %t/vfs.yaml -ivfsoverlay %t/vfs2.yaml -E -C %s 2>&1 | FileCheck --check-prefix=FALLBACK2 %s
 
 // FALLBACK2: {{^}}// B.h in Underlying
 // FALLBACK2: {{^}}// C.h in Underlying
diff --git a/clang/test/VFS/external-names-multi-overlay.c b/clang/test/VFS/external-names-multi-overlay.c
index 338131b90589..f4e6c8643788 100644
--- a/clang/test/VFS/external-names-multi-overlay.c
+++ b/clang/test/VFS/external-names-multi-overlay.c
@@ -5,10 +5,10 @@
 
 // Check that the external name is given when multiple overlays are provided
 
-// RUN: %clang_cc1 -Werror -I %t/A -ivfsoverlay %t/vfs/a-b-ft.yaml -ivfsoverlay %t/vfs/empty.yaml -fsyntax-only -E -C %t/main.c 2>&1 | FileCheck --check-prefix=FROM_B %s
-// RUN: %clang_cc1 -Werror -I %t/A -ivfsoverlay %t/vfs/a-b-fb.yaml -ivfsoverlay %t/vfs/empty.yaml -fsyntax-only -E -C %t/main.c 2>&1 | FileCheck --check-prefix=FROM_B %s
-// RUN: %clang_cc1 -Werror -I %t/B -ivfsoverlay %t/vfs/a-b-ft.yaml -ivfsoverlay %t/vfs/empty.yaml -fsyntax-only -E -C %t/main.c 2>&1 | FileCheck --check-prefix=FROM_B %s
-// RUN: %clang_cc1 -Werror -I %t/B -ivfsoverlay %t/vfs/a-b-fb.yaml -ivfsoverlay %t/vfs/empty.yaml -fsyntax-only -E -C %t/main.c 2>&1 | FileCheck --check-prefix=FROM_B %s
+// RUN: %clang_cc1 -Werror -I %t/A -ivfsoverlay %t/vfs/a-b-ft.yaml -ivfsoverlay %t/vfs/empty.yaml -E -C %t/main.c 2>&1 | FileCheck --check-prefix=FROM_B %s
+// RUN: %clang_cc1 -Werror -I %t/A -ivfsoverlay %t/vfs/a-b-fb.yaml -ivfsoverlay %t/vfs/empty.yaml -E -C %t/main.c 2>&1 | FileCheck --check-prefix=FROM_B %s
+// RUN: %clang_cc1 -Werror -I %t/B -ivfsoverlay %t/vfs/a-b-ft.yaml -ivfsoverlay %t/vfs/empty.yaml -E -C %t/main.c 2>&1 | FileCheck --check-prefix=FROM_B %s
+// RUN: %clang_cc1 -Werror -I %t/B -ivfsoverlay %t/vfs/a-b-fb.yaml -ivfsoverlay %t/vfs/empty.yaml -E -C %t/main.c 2>&1 | FileCheck --check-prefix=FROM_B %s
 // FROM_B: # 1 "{{.*(/|\\\\)B(/|\\\\)}}Header.h"
 // FROM_B: // Header.h in B
 
diff --git a/clang/test/VFS/fallback.c b/clang/test/VFS/fallback.c
index 11392bdc4e44..add2190d7c81 100644
--- a/clang/test/VFS/fallback.c
+++ b/clang/test/VFS/fallback.c
@@ -23,7 +23,7 @@
 // RUN: sed -e "s@EXTERNAL_DIR@%{/t:regex_replacement}/CFallback/Base@g" -e "s@NAME_DIR@%{/t:regex_replacement}/CFallback/UseFirst@g" %t/vfs/base.yaml > %t/vfs/c-fallback.yaml
 
 // Both B.h and C.h are in both folders
-// RUN: %clang_cc1 -Werror -I %t/Both/UseFirst -ivfsoverlay %t/vfs/both.yaml -fsyntax-only -E -C %t/main.c 2>&1 | FileCheck --check-prefix=IN_UF %s
+// RUN: %clang_cc1 -Werror -I %t/Both/UseFirst -ivfsoverlay %t/vfs/both.yaml -E -C %t/main.c 2>&1 | FileCheck --check-prefix=IN_UF %s
 
 // IN_UF: # 1 "{{.*(/|\\\\)UseFirst(/|\\\\)}}B.h"
 // IN_UF-NEXT: // B.h in UseFirst
@@ -31,10 +31,10 @@
 // IN_UF-NEXT: // C.h in UseFirst
 
 // Base missing, so now they are only in UseFirst
-// RUN: %clang_cc1 -Werror -I %t/UseFirstOnly/UseFirst -ivfsoverlay %t/vfs/use-first-only.yaml -fsyntax-only -E -C %t/main.c 2>&1 | FileCheck --check-prefix=IN_UF %s
+// RUN: %clang_cc1 -Werror -I %t/UseFirstOnly/UseFirst -ivfsoverlay %t/vfs/use-first-only.yaml -E -C %t/main.c 2>&1 | FileCheck --check-prefix=IN_UF %s
 
 // UseFirst missing, fallback to Base
-// RUN: %clang_cc1 -Werror -I %t/BaseOnly/UseFirst -ivfsoverlay %t/vfs/base-only.yaml -fsyntax-only -E -C %t/main.c 2>&1 | FileCheck --check-prefix=IN_BASE %s
+// RUN: %clang_cc1 -Werror -I %t/BaseOnly/UseFirst -ivfsoverlay %t/vfs/base-only.yaml -E -C %t/main.c 2>&1 | FileCheck --check-prefix=IN_BASE %s
 
 // IN_BASE: # 1 "{{.*(/|\\\\)Base(/|\\\\)}}B.h"
 // IN_BASE-NEXT: // B.h in Base
@@ -42,7 +42,7 @@
 // IN_BASE-NEXT: // C.h in Base
 
 // B.h missing from UseFirst
-// RUN: %clang_cc1 -Werror -I %t/BFallback/UseFirst -ivfsoverlay %t/vfs/b-fallback.yaml -fsyntax-only -E -C %t/main.c 2>&1 | FileCheck --check-prefix=B_FALLBACK %s
+// RUN: %clang_cc1 -Werror -I %t/BFallback/UseFirst -ivfsoverlay %t/vfs/b-fallback.yaml -E -C %t/main.c 2>&1 | FileCheck --check-prefix=B_FALLBACK %s
 
 // B_FALLBACK: # 1 "{{.*(/|\\\\)Base(/|\\\\)}}B.h"
 // B_FALLBACK-NEXT: // B.h in Base
@@ -50,7 +50,7 @@
 // B_FALLBACK-NEXT: // C.h in UseFirst
 
 // C.h missing from UseFirst
-// RUN: %clang_cc1 -Werror -I %t/CFallback/UseFirst -ivfsoverlay %t/vfs/c-fallback.yaml -fsyntax-only -E -C %t/main.c 2>&1 | FileCheck --check-prefix=C_FALLBACK %s
+// RUN: %clang_cc1 -Werror -I %t/CFallback/UseFirst -ivfsoverlay %t/vfs/c-fallback.yaml -E -C %t/main.c 2>&1 | FileCheck --check-prefix=C_FALLBACK %s
 
 // C_FALLBACK: # 1 "{{.*(/|\\\\)UseFirst(/|\\\\)}}B.h"
 // C_FALLBACK-NEXT: // B.h in UseFirst
diff --git a/clang/test/lit.site.cfg.py.in b/clang/test/lit.site.cfg.py.in
index 6641811c5883..ec6d30e6c220 100644
--- a/clang/test/lit.site.cfg.py.in
+++ b/clang/test/lit.site.cfg.py.in
@@ -34,6 +34,7 @@ config.enable_backtrace = @ENABLE_BACKTRACES@
 config.enable_threads = @LLVM_ENABLE_THREADS@
 config.reverse_iteration = @LLVM_ENABLE_REVERSE_ITERATION@
 config.host_arch = "@HOST_ARCH@"
+config.perl_executable = "@PERL_EXECUTABLE@"
 config.python_executable = "@Python3_EXECUTABLE@"
 config.use_z3_solver = lit_config.params.get('USE_Z3_SOLVER', "@USE_Z3_SOLVER@")
 config.has_plugins = @CLANG_PLUGIN_SUPPORT@
diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp
index add28ab4fcda..308e5285e325 100644
--- a/clang/tools/clang-installapi/ClangInstallAPI.cpp
+++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp
@@ -39,7 +39,7 @@ using namespace clang::driver::options;
 using namespace llvm::opt;
 using namespace llvm::MachO;
 
-static bool runFrontend(StringRef ProgName, bool Verbose,
+static bool runFrontend(StringRef ProgName, Twine Label, bool Verbose,
                         InstallAPIContext &Ctx,
                         llvm::vfs::InMemoryFileSystem *FS,
                         const ArrayRef<std::string> InitialArgs) {
@@ -50,7 +50,7 @@ static bool runFrontend(StringRef ProgName, bool Verbose,
     return true;
 
   if (Verbose)
-    llvm::errs() << getName(Ctx.Type) << " Headers:\n"
+    llvm::errs() << Label << " Headers:\n"
                  << ProcessedInput->getBuffer() << "\n\n";
 
   std::string InputFile = ProcessedInput->getBufferIdentifier().str();
@@ -65,7 +65,6 @@ static bool runFrontend(StringRef ProgName, bool Verbose,
   // Create & run invocation.
   clang::tooling::ToolInvocation Invocation(
       std::move(Args), std::make_unique<InstallAPIAction>(Ctx), Ctx.FM);
-
   return Invocation.run();
 }
 
@@ -127,10 +126,23 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
     Ctx.Slice = std::make_shared<FrontendRecordsSlice>(Trip);
     for (const HeaderType Type :
          {HeaderType::Public, HeaderType::Private, HeaderType::Project}) {
+      std::vector<std::string> ArgStrings = Opts.getClangFrontendArgs();
+      Opts.addConditionalCC1Args(ArgStrings, Trip, Type);
       Ctx.Type = Type;
-      if (!runFrontend(ProgName, Opts.DriverOpts.Verbose, Ctx,
-                       InMemoryFileSystem.get(), Opts.getClangFrontendArgs()))
+      StringRef HeaderLabel = getName(Ctx.Type);
+      if (!runFrontend(ProgName, HeaderLabel, Opts.DriverOpts.Verbose, Ctx,
+                       InMemoryFileSystem.get(), ArgStrings))
         return EXIT_FAILURE;
+
+      // Run extra passes for unique compiler arguments.
+      for (const auto &[Label, ExtraArgs] : Opts.FEOpts.UniqueArgs) {
+        std::vector<std::string> FinalArguments = ArgStrings;
+        llvm::append_range(FinalArguments, ExtraArgs);
+        if (!runFrontend(ProgName, Label + " " + HeaderLabel,
+                         Opts.DriverOpts.Verbose, Ctx, InMemoryFileSystem.get(),
+                         FinalArguments))
+          return EXIT_FAILURE;
+      }
     }
     FrontendRecords.emplace_back(std::move(Ctx.Slice));
   }
diff --git a/clang/tools/clang-installapi/InstallAPIOpts.td b/clang/tools/clang-installapi/InstallAPIOpts.td
index 8b1998c280dd..a95a7a80a9d2 100644
--- a/clang/tools/clang-installapi/InstallAPIOpts.td
+++ b/clang/tools/clang-installapi/InstallAPIOpts.td
@@ -91,6 +91,15 @@ def project_umbrella_header_EQ : Joined<["--"], "project-umbrella-header=">,
   Alias<project_umbrella_header>;
 
 //
+/// X<label> overrides.
+//
+def Xplatform__ : Joined<["-"], "Xplatform_">;
+def Xproject : Joined<["-"], "Xproject">;
+def X__ : Joined<["-"], "X">,
+  HelpText<"Pass <arg> to run unique clang invocation identified as <label>">, 
+  MetaVarName<"<label> <arg>">;
+
+//
 /// Overidden clang options for different behavior.
 //
 
@@ -108,4 +117,25 @@ def reexport_library : Separate<["-"], "reexport_library">, MetaVarName<"<path>"
 def reexport_framework : Separate<["-"], "reexport_framework">,
   HelpText<"Re-export the specified framework">;
 
+// Xproject supported options.
+def fobjc_arc : Flag<["-"], "fobjc-arc">,
+  HelpText<"Synthesize retain and release calls for Objective-C pointers">;
+def include_ : JoinedOrSeparate<["-", "--"], "include">,
+  MetaVarName<"<file>">, HelpText<"Include file before parsing, can only be used with -Xproject">;
+def fvisibility_EQ : Joined<["-"], "fvisibility=">,
+  HelpText<"Set the default symbol visibility for all global declarations">;
+def fmodules : Flag <["-"], "fmodules">,
+  HelpText<"Enable the 'modules' language feature">;
+def fmodules_cache_path : Joined<["-"], "fmodules-cache-path=">,
+  MetaVarName<"<directory>">,
+  HelpText<"Specify the module cache path">;
+
+// Xplatform supported options.
+def iframework : JoinedOrSeparate<["-"], "iframework">,
+  HelpText<"Add directory to SYSTEM framework search path">;
+
+// X<label> prefixes supported options.
+def D : JoinedOrSeparate<["-"], "D">, HelpText<"Define macro">;
+def U : JoinedOrSeparate<["-"], "U">, HelpText<"Undefine macro">;
+
 
diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp
index 21f04a291b2f..53340da704fc 100644
--- a/clang/tools/clang-installapi/Options.cpp
+++ b/clang/tools/clang-installapi/Options.cpp
@@ -189,12 +189,123 @@ bool Options::processDriverOptions(InputArgList &Args) {
 
 bool Options::processInstallAPIXOptions(InputArgList &Args) {
   for (arg_iterator It = Args.begin(), End = Args.end(); It != End; ++It) {
-    if ((*It)->getOption().matches(OPT_Xarch__)) {
+    Arg *A = *It;
+    if (A->getOption().matches(OPT_Xarch__)) {
       if (!processXarchOption(Args, It))
         return false;
+      continue;
+    } else if (A->getOption().matches(OPT_Xplatform__)) {
+      if (!processXplatformOption(Args, It))
+        return false;
+      continue;
+    } else if (A->getOption().matches(OPT_Xproject)) {
+      if (!processXprojectOption(Args, It))
+        return false;
+      continue;
+    } else if (!A->getOption().matches(OPT_X__))
+      continue;
+
+    // Handle any user defined labels.
+    const StringRef Label = A->getValue(0);
+
+    // Ban "public" and "private" labels.
+    if ((Label.lower() == "public") || (Label.lower() == "private")) {
+      Diags->Report(diag::err_invalid_label) << Label;
+      return false;
     }
+
+    auto NextIt = std::next(It);
+    if (NextIt == End) {
+      Diags->Report(clang::diag::err_drv_missing_argument)
+          << A->getAsString(Args) << 1;
+      return false;
+    }
+    Arg *NextA = *NextIt;
+    switch ((ID)NextA->getOption().getID()) {
+    case OPT_D:
+    case OPT_U:
+      break;
+    default:
+      Diags->Report(clang::diag::err_drv_argument_not_allowed_with)
+          << A->getAsString(Args) << NextA->getAsString(Args);
+      return false;
+    }
+    const StringRef ASpelling = NextA->getSpelling();
+    const auto &AValues = NextA->getValues();
+    if (AValues.empty())
+      FEOpts.UniqueArgs[Label].emplace_back(ASpelling.str());
+    else
+      for (const StringRef Val : AValues)
+        FEOpts.UniqueArgs[Label].emplace_back((ASpelling + Val).str());
+
+    A->claim();
+    NextA->claim();
+  }
+
+  return true;
+}
+
+bool Options::processXplatformOption(InputArgList &Args, arg_iterator Curr) {
+  Arg *A = *Curr;
+
+  PlatformType Platform = getPlatformFromName(A->getValue(0));
+  if (Platform == PLATFORM_UNKNOWN) {
+    Diags->Report(diag::err_unsupported_os)
+        << getPlatformName(Platform) << A->getAsString(Args);
+    return false;
+  }
+  auto NextIt = std::next(Curr);
+  if (NextIt == Args.end()) {
+    Diags->Report(diag::err_drv_missing_argument) << A->getAsString(Args) << 1;
+    return false;
+  }
+
+  Arg *NextA = *NextIt;
+  switch ((ID)NextA->getOption().getID()) {
+  case OPT_iframework:
+    FEOpts.SystemFwkPaths.emplace_back(NextA->getValue(), Platform);
+    break;
+  default:
+    Diags->Report(diag::err_drv_invalid_argument_to_option)
+        << A->getAsString(Args) << NextA->getAsString(Args);
+    return false;
+  }
+
+  A->claim();
+  NextA->claim();
+
+  return true;
+}
+
+bool Options::processXprojectOption(InputArgList &Args, arg_iterator Curr) {
+  Arg *A = *Curr;
+  auto NextIt = std::next(Curr);
+  if (NextIt == Args.end()) {
+    Diags->Report(diag::err_drv_missing_argument) << A->getAsString(Args) << 1;
+    return false;
   }
-  // TODO: Add support for the all of the X* options installapi supports.
+
+  Arg *NextA = *NextIt;
+  switch ((ID)NextA->getOption().getID()) {
+  case OPT_fobjc_arc:
+  case OPT_fmodules:
+  case OPT_fmodules_cache_path:
+  case OPT_include_:
+  case OPT_fvisibility_EQ:
+    break;
+  default:
+    Diags->Report(diag::err_drv_argument_not_allowed_with)
+        << A->getAsString(Args) << NextA->getAsString(Args);
+    return false;
+  }
+
+  std::string ArgString = NextA->getSpelling().str();
+  for (const StringRef Val : NextA->getValues())
+    ArgString += Val.str();
+
+  ProjectLevelArgs.push_back(ArgString);
+  A->claim();
+  NextA->claim();
 
   return true;
 }
@@ -333,10 +444,10 @@ bool Options::processFrontendOptions(InputArgList &Args) {
     }
   }
 
-  // Capture system frameworks.
-  // TODO: Support passing framework paths per platform.
+  // Capture system frameworks for all platforms.
   for (const Arg *A : Args.filtered(drv::OPT_iframework))
-    FEOpts.SystemFwkPaths.emplace_back(A->getValue());
+    FEOpts.SystemFwkPaths.emplace_back(A->getValue(),
+                                       std::optional<PlatformType>{});
 
   // Capture framework paths.
   PathSeq FrameworkPaths;
@@ -359,7 +470,8 @@ bool Options::processFrontendOptions(InputArgList &Args) {
   for (const StringRef FwkPath : DefaultFrameworkPaths) {
     SmallString<PATH_MAX> Path(FEOpts.ISysroot);
     sys::path::append(Path, FwkPath);
-    FEOpts.SystemFwkPaths.emplace_back(Path.str());
+    FEOpts.SystemFwkPaths.emplace_back(Path.str(),
+                                       std::optional<PlatformType>{});
   }
 
   return true;
@@ -510,7 +622,11 @@ Options::processAndFilterOutInstallAPIOptions(ArrayRef<const char *> Args) {
   for (const Arg *A : ParsedArgs) {
     if (A->isClaimed())
       continue;
-    llvm::copy(A->getValues(), std::back_inserter(ClangDriverArgs));
+    // Forward along unclaimed but overlapping arguments to the clang driver.
+    if (A->getOption().getID() > (unsigned)OPT_UNKNOWN) {
+      ClangDriverArgs.push_back(A->getSpelling().data());
+    } else
+      llvm::copy(A->getValues(), std::back_inserter(ClangDriverArgs));
   }
   return ClangDriverArgs;
 }
@@ -622,12 +738,30 @@ std::pair<LibAttrs, ReexportedInterfaces> Options::getReexportedLibraries() {
     return true;
   };
 
+  PlatformSet Platforms;
+  llvm::for_each(DriverOpts.Targets,
+                 [&](const auto &T) { Platforms.insert(T.first.Platform); });
   // Populate search paths by looking at user paths before system ones.
   PathSeq FwkSearchPaths(FEOpts.FwkPaths.begin(), FEOpts.FwkPaths.end());
-  // FIXME: System framework paths need to reset if installapi is invoked with
-  // different platforms.
-  FwkSearchPaths.insert(FwkSearchPaths.end(), FEOpts.SystemFwkPaths.begin(),
-                        FEOpts.SystemFwkPaths.end());
+  for (const PlatformType P : Platforms) {
+    PathSeq PlatformSearchPaths = getPathsForPlatform(FEOpts.SystemFwkPaths, P);
+    FwkSearchPaths.insert(FwkSearchPaths.end(), PlatformSearchPaths.begin(),
+                          PlatformSearchPaths.end());
+    for (const StringMapEntry<ArchitectureSet> &Lib :
+         LinkerOpts.ReexportedFrameworks) {
+      std::string Name = (Lib.getKey() + ".framework/" + Lib.getKey()).str();
+      std::string Path = findLibrary(Name, *FM, FwkSearchPaths, {}, {});
+      if (Path.empty()) {
+        Diags->Report(diag::err_cannot_find_reexport) << false << Lib.getKey();
+        return {};
+      }
+      if (DriverOpts.TraceLibraryLocation)
+        errs() << Path << "\n";
+
+      AccumulateReexports(Path, Lib.getValue());
+    }
+    FwkSearchPaths.resize(FwkSearchPaths.size() - PlatformSearchPaths.size());
+  }
 
   for (const StringMapEntry<ArchitectureSet> &Lib :
        LinkerOpts.ReexportedLibraries) {
@@ -647,20 +781,6 @@ std::pair<LibAttrs, ReexportedInterfaces> Options::getReexportedLibraries() {
        LinkerOpts.ReexportedLibraryPaths)
     AccumulateReexports(Lib.getKey(), Lib.getValue());
 
-  for (const StringMapEntry<ArchitectureSet> &Lib :
-       LinkerOpts.ReexportedFrameworks) {
-    std::string Name = (Lib.getKey() + ".framework/" + Lib.getKey()).str();
-    std::string Path = findLibrary(Name, *FM, FwkSearchPaths, {}, {});
-    if (Path.empty()) {
-      Diags->Report(diag::err_cannot_find_reexport) << false << Lib.getKey();
-      return {};
-    }
-    if (DriverOpts.TraceLibraryLocation)
-      errs() << Path << "\n";
-
-    AccumulateReexports(Path, Lib.getValue());
-  }
-
   return {std::move(Reexports), std::move(ReexportIFs)};
 }
 
@@ -876,5 +996,25 @@ InstallAPIContext Options::createContext() {
   return Ctx;
 }
 
+void Options::addConditionalCC1Args(std::vector<std::string> &ArgStrings,
+                                    const llvm::Triple &Targ,
+                                    const HeaderType Type) {
+  // Unique to architecture (Xarch) options hold no arguments to pass along for
+  // frontend.
+
+  // Add specific to platform arguments.
+  PathSeq PlatformSearchPaths =
+      getPathsForPlatform(FEOpts.SystemFwkPaths, mapToPlatformType(Targ));
+  llvm::for_each(PlatformSearchPaths, [&ArgStrings](const StringRef Path) {
+    ArgStrings.push_back("-iframework");
+    ArgStrings.push_back(Path.str());
+  });
+
+  // Add specific to header type arguments.
+  if (Type == HeaderType::Project)
+    for (const StringRef A : ProjectLevelArgs)
+      ArgStrings.emplace_back(A);
+}
+
 } // namespace installapi
 } // namespace clang
diff --git a/clang/tools/clang-installapi/Options.h b/clang/tools/clang-installapi/Options.h
index e9ac75889ad3..fd1e10065d10 100644
--- a/clang/tools/clang-installapi/Options.h
+++ b/clang/tools/clang-installapi/Options.h
@@ -133,6 +133,9 @@ struct LinkerOptions {
 };
 
 struct FrontendOptions {
+  /// \brief Unique clang options to pass per key in map.
+  llvm::StringMap<std::vector<std::string>> UniqueArgs;
+
   /// \brief The language mode to parse headers in.
   Language LangMode = Language::ObjC;
 
@@ -143,7 +146,7 @@ struct FrontendOptions {
   PathSeq FwkPaths;
 
   /// \brief Additional SYSTEM framework search paths.
-  PathSeq SystemFwkPaths;
+  PathToPlatformSeq SystemFwkPaths;
 };
 
 using arg_iterator = llvm::opt::arg_iterator<llvm::opt::Arg **>;
@@ -156,6 +159,8 @@ private:
   processAndFilterOutInstallAPIOptions(ArrayRef<const char *> Args);
   bool processInstallAPIXOptions(llvm::opt::InputArgList &Args);
   bool processXarchOption(llvm::opt::InputArgList &Args, arg_iterator Curr);
+  bool processXplatformOption(llvm::opt::InputArgList &Args, arg_iterator Curr);
+  bool processXprojectOption(llvm::opt::InputArgList &Args, arg_iterator Curr);
 
 public:
   /// The various options grouped together.
@@ -176,6 +181,11 @@ public:
   /// ones.
   std::vector<std::string> &getClangFrontendArgs() { return FrontendArgs; }
 
+  /// \brief Add relevant, but conditionalized by active target and header type,
+  /// arguments for constructing a CC1 invocation.
+  void addConditionalCC1Args(std::vector<std::string> &ArgStrings,
+                             const llvm::Triple &Targ, const HeaderType Type);
+
 private:
   bool addFilePaths(llvm::opt::InputArgList &Args, PathSeq &Headers,
                     llvm::opt::OptSpecifier ID);
@@ -186,6 +196,7 @@ private:
   FileManager *FM;
   std::vector<std::string> FrontendArgs;
   llvm::DenseMap<const llvm::opt::Arg *, Architecture> ArgToArchMap;
+  std::vector<std::string> ProjectLevelArgs;
 };
 
 enum ID {
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index a1879fc7712d..69d8cb446fad 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -413,7 +413,7 @@ fatbinary(ArrayRef<std::pair<StringRef, StringRef>> InputFiles,
 
   SmallVector<StringRef> Targets = {"-targets=host-x86_64-unknown-linux"};
   for (const auto &[File, Arch] : InputFiles)
-    Targets.push_back(Saver.save("hipv4-amdgcn-amd-amdhsa--" + Arch));
+    Targets.push_back(Saver.save("hip-amdgcn-amd-amdhsa--" + Arch));
   CmdArgs.push_back(Saver.save(llvm::join(Targets, ",")));
 
 #ifdef _WIN32
diff --git a/clang/tools/diagtool/ShowEnabledWarnings.cpp b/clang/tools/diagtool/ShowEnabledWarnings.cpp
index 285efe6ae05b..66a295db054c 100644
--- a/clang/tools/diagtool/ShowEnabledWarnings.cpp
+++ b/clang/tools/diagtool/ShowEnabledWarnings.cpp
@@ -90,11 +90,11 @@ int ShowEnabledWarnings::run(unsigned int argc, char **argv, raw_ostream &Out) {
   bool ShouldShowLevels = true;
   if (argc > 0) {
     StringRef FirstArg(*argv);
-    if (FirstArg.equals("--no-levels")) {
+    if (FirstArg == "--no-levels") {
       ShouldShowLevels = false;
       --argc;
       ++argv;
-    } else if (FirstArg.equals("--levels")) {
+    } else if (FirstArg == "--levels") {
       ShouldShowLevels = true;
       --argc;
       ++argv;
diff --git a/clang/tools/diagtool/TreeView.cpp b/clang/tools/diagtool/TreeView.cpp
index 00d1097b5fbf..8d1ce14b0f52 100644
--- a/clang/tools/diagtool/TreeView.cpp
+++ b/clang/tools/diagtool/TreeView.cpp
@@ -144,7 +144,7 @@ int TreeView::run(unsigned int argc, char **argv, llvm::raw_ostream &out) {
   bool Internal = false;
   if (argc > 0) {
     StringRef FirstArg(*argv);
-    if (FirstArg.equals("--internal")) {
+    if (FirstArg == "--internal") {
       Internal = true;
       --argc;
       ++argv;
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index eb0ba09c5b91..ae6659fe95e8 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2816,6 +2816,47 @@ void OpenACCClauseEnqueue::VisitNumGangsClause(const OpenACCNumGangsClause &C) {
 void OpenACCClauseEnqueue::VisitPrivateClause(const OpenACCPrivateClause &C) {
   VisitVarList(C);
 }
+
+void OpenACCClauseEnqueue::VisitFirstPrivateClause(
+    const OpenACCFirstPrivateClause &C) {
+  VisitVarList(C);
+}
+
+void OpenACCClauseEnqueue::VisitPresentClause(const OpenACCPresentClause &C) {
+  VisitVarList(C);
+}
+void OpenACCClauseEnqueue::VisitNoCreateClause(const OpenACCNoCreateClause &C) {
+  VisitVarList(C);
+}
+void OpenACCClauseEnqueue::VisitCopyClause(const OpenACCCopyClause &C) {
+  VisitVarList(C);
+}
+void OpenACCClauseEnqueue::VisitCopyInClause(const OpenACCCopyInClause &C) {
+  VisitVarList(C);
+}
+void OpenACCClauseEnqueue::VisitCopyOutClause(const OpenACCCopyOutClause &C) {
+  VisitVarList(C);
+}
+void OpenACCClauseEnqueue::VisitCreateClause(const OpenACCCreateClause &C) {
+  VisitVarList(C);
+}
+void OpenACCClauseEnqueue::VisitAttachClause(const OpenACCAttachClause &C) {
+  VisitVarList(C);
+}
+void OpenACCClauseEnqueue::VisitDevicePtrClause(
+    const OpenACCDevicePtrClause &C) {
+  VisitVarList(C);
+}
+void OpenACCClauseEnqueue::VisitAsyncClause(const OpenACCAsyncClause &C) {
+  if (C.hasIntExpr())
+    Visitor.AddStmt(C.getIntExpr());
+}
+void OpenACCClauseEnqueue::VisitWaitClause(const OpenACCWaitClause &C) {
+  if (const Expr *DevNumExpr = C.getDevNumExpr())
+    Visitor.AddStmt(DevNumExpr);
+  for (Expr *QE : C.getQueueIdExprs())
+    Visitor.AddStmt(QE);
+}
 } // namespace
 
 void EnqueueVisitor::EnqueueChildren(const OpenACCClause *C) {
diff --git a/clang/unittests/AST/CMakeLists.txt b/clang/unittests/AST/CMakeLists.txt
index 54765e36db00..29d2b39cff8b 100644
--- a/clang/unittests/AST/CMakeLists.txt
+++ b/clang/unittests/AST/CMakeLists.txt
@@ -24,6 +24,7 @@ add_clang_unittest(ASTTests
   CommentLexer.cpp
   CommentParser.cpp
   CommentTextTest.cpp
+  ConceptPrinterTest.cpp
   DataCollectionTest.cpp
   DeclPrinterTest.cpp
   DeclTest.cpp
diff --git a/clang/unittests/AST/ConceptPrinterTest.cpp b/clang/unittests/AST/ConceptPrinterTest.cpp
new file mode 100644
index 000000000000..bd1f6bbfa565
--- /dev/null
+++ b/clang/unittests/AST/ConceptPrinterTest.cpp
@@ -0,0 +1,57 @@
+//===- unittests/AST/ConceptPrinterTest.cpp --- Concept printer tests -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ASTPrint.h"
+#include "clang/AST/ASTConcept.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/ExprConcepts.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/Tooling/Tooling.h"
+#include "llvm/ADT/SmallString.h"
+#include "gtest/gtest.h"
+
+using namespace clang;
+using namespace ast_matchers;
+using namespace tooling;
+
+namespace {
+
+static void PrintConceptReference(raw_ostream &Out, const ASTContext *Context,
+                                  const ConceptSpecializationExpr *T,
+                                  PrintingPolicyAdjuster PolicyAdjuster) {
+  assert(T && T->getConceptReference() &&
+         "Expected non-null concept reference");
+
+  PrintingPolicy Policy = Context->getPrintingPolicy();
+  if (PolicyAdjuster)
+    PolicyAdjuster(Policy);
+  T->getConceptReference()->print(Out, Policy);
+}
+
+::testing::AssertionResult
+PrintedConceptMatches(StringRef Code, const std::vector<std::string> &Args,
+                      const StatementMatcher &NodeMatch,
+                      StringRef ExpectedPrinted) {
+  return PrintedNodeMatches<ConceptSpecializationExpr>(
+      Code, Args, NodeMatch, ExpectedPrinted, "", PrintConceptReference);
+}
+const internal::VariadicDynCastAllOfMatcher<Stmt, ConceptSpecializationExpr>
+    conceptSpecializationExpr;
+} // unnamed namespace
+
+TEST(ConceptPrinter, ConceptReference) {
+  std::string Code = R"cpp(
+    template <typename, typename> concept D = true;
+    template<typename T, typename U>
+    requires D<T, U>
+    void g(T);
+  )cpp";
+  auto Matcher = conceptSpecializationExpr().bind("id");
+
+  ASSERT_TRUE(PrintedConceptMatches(Code, {"-std=c++20"}, Matcher, "D<T, U>"));
+}
diff --git a/clang/unittests/AST/Interp/Descriptor.cpp b/clang/unittests/AST/Interp/Descriptor.cpp
index 4ea0fbc285a9..053d579ea391 100644
--- a/clang/unittests/AST/Interp/Descriptor.cpp
+++ b/clang/unittests/AST/Interp/Descriptor.cpp
@@ -115,7 +115,7 @@ TEST(Descriptor, Primitives) {
 
   // Check pointer stuff.
   // Global variables have an inline descriptor.
-  ASSERT_FALSE(GlobalPtr.isRoot());
+  ASSERT_TRUE(GlobalPtr.isRoot());
   ASSERT_TRUE(GlobalPtr.isLive());
   ASSERT_FALSE(GlobalPtr.isZero());
   ASSERT_FALSE(GlobalPtr.isField());
diff --git a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
index 94160d949637..cfabb80576bc 100644
--- a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
+++ b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
@@ -19,7 +19,6 @@ add_clang_unittest(ClangAnalysisFlowSensitiveTests
   SignAnalysisTest.cpp
   SimplifyConstraintsTest.cpp
   SingleVarConstantPropagationTest.cpp
-  SolverTest.cpp
   TestingSupport.cpp
   TestingSupportTest.cpp
   TransferBranchTest.cpp
@@ -27,6 +26,7 @@ add_clang_unittest(ClangAnalysisFlowSensitiveTests
   TypeErasedDataflowAnalysisTest.cpp
   UncheckedOptionalAccessModelTest.cpp
   ValueTest.cpp
+  WatchedLiteralsSolverTest.cpp
   )
 
 clang_target_link_libraries(ClangAnalysisFlowSensitiveTests
diff --git a/clang/unittests/Analysis/FlowSensitive/SolverTest.cpp b/clang/unittests/Analysis/FlowSensitive/SolverTest.h
index 71f6da93594e..b37534438121 100644
--- a/clang/unittests/Analysis/FlowSensitive/SolverTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/SolverTest.h
@@ -1,4 +1,4 @@
-//===- unittests/Analysis/FlowSensitive/SolverTest.cpp --------------------===//
+//===--- SolverTest.h - Type-parameterized test for solvers ---------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,43 +6,53 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <utility>
+#ifndef LLVM_CLANG_ANALYSIS_FLOW_SENSITIVE_SOLVER_TEST_H_
+#define LLVM_CLANG_ANALYSIS_FLOW_SENSITIVE_SOLVER_TEST_H_
 
 #include "TestingSupport.h"
-#include "clang/Analysis/FlowSensitive/Arena.h"
-#include "clang/Analysis/FlowSensitive/Formula.h"
 #include "clang/Analysis/FlowSensitive/Solver.h"
-#include "clang/Analysis/FlowSensitive/WatchedLiteralsSolver.h"
-#include "clang/Basic/LLVM.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include <vector>
+
+namespace clang::dataflow::test {
 
 namespace {
 
-using namespace clang;
-using namespace dataflow;
+constexpr auto AssignedTrue = Solver::Result::Assignment::AssignedTrue;
+constexpr auto AssignedFalse = Solver::Result::Assignment::AssignedFalse;
 
-using test::ConstraintContext;
-using test::parseFormulas;
 using testing::_;
 using testing::AnyOf;
 using testing::Pair;
 using testing::UnorderedElementsAre;
 
-constexpr auto AssignedTrue = Solver::Result::Assignment::AssignedTrue;
-constexpr auto AssignedFalse = Solver::Result::Assignment::AssignedFalse;
+} // namespace
 
-// Checks if the conjunction of `Vals` is satisfiable and returns the
-// corresponding result.
-Solver::Result solve(llvm::ArrayRef<const Formula *> Vals) {
-  return WatchedLiteralsSolver().solve(Vals);
-}
+/// Type-parameterized test for implementations of the `Solver` interface.
+/// To use:
+/// 1.  Implement a specialization of `createSolverWithLowTimeout()` for the
+///     solver you want to test.
+/// 2.  Instantiate the test suite for the solver you want to test using
+///     `INSTANTIATE_TYPED_TEST_SUITE_P()`.
+/// See WatchedLiteralsSolverTest.cpp for an example.
+template <typename SolverT> class SolverTest : public ::testing::Test {
+protected:
+  // Checks if the conjunction of `Vals` is satisfiable and returns the
+  // corresponding result.
+  Solver::Result solve(llvm::ArrayRef<const Formula *> Vals) {
+    return SolverT().solve(Vals);
+  }
+
+  // Create a specialization for the solver type to test.
+  SolverT createSolverWithLowTimeout();
+};
+
+TYPED_TEST_SUITE_P(SolverTest);
 
 MATCHER(unsat, "") {
   return arg.getStatus() == Solver::Result::Status::Unsatisfiable;
 }
+
 MATCHER_P(sat, SolutionMatcher,
           "is satisfiable, where solution " +
               (testing::DescribeMatcher<
@@ -55,57 +65,57 @@ MATCHER_P(sat, SolutionMatcher,
                                      result_listener);
 }
 
-TEST(SolverTest, Var) {
+TYPED_TEST_P(SolverTest, Var) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
 
   // X
-  EXPECT_THAT(solve({X}),
+  EXPECT_THAT(this->solve({X}),
               sat(UnorderedElementsAre(Pair(X->getAtom(), AssignedTrue))));
 }
 
-TEST(SolverTest, NegatedVar) {
+TYPED_TEST_P(SolverTest, NegatedVar) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto NotX = Ctx.neg(X);
 
   // !X
-  EXPECT_THAT(solve({NotX}),
+  EXPECT_THAT(this->solve({NotX}),
               sat(UnorderedElementsAre(Pair(X->getAtom(), AssignedFalse))));
 }
 
-TEST(SolverTest, UnitConflict) {
+TYPED_TEST_P(SolverTest, UnitConflict) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto NotX = Ctx.neg(X);
 
   // X ^ !X
-  EXPECT_THAT(solve({X, NotX}), unsat());
+  EXPECT_THAT(this->solve({X, NotX}), unsat());
 }
 
-TEST(SolverTest, DistinctVars) {
+TYPED_TEST_P(SolverTest, DistinctVars) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto Y = Ctx.atom();
   auto NotY = Ctx.neg(Y);
 
   // X ^ !Y
-  EXPECT_THAT(solve({X, NotY}),
+  EXPECT_THAT(this->solve({X, NotY}),
               sat(UnorderedElementsAre(Pair(X->getAtom(), AssignedTrue),
                                        Pair(Y->getAtom(), AssignedFalse))));
 }
 
-TEST(SolverTest, DoubleNegation) {
+TYPED_TEST_P(SolverTest, DoubleNegation) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto NotX = Ctx.neg(X);
   auto NotNotX = Ctx.neg(NotX);
 
   // !!X ^ !X
-  EXPECT_THAT(solve({NotNotX, NotX}), unsat());
+  EXPECT_THAT(this->solve({NotNotX, NotX}), unsat());
 }
 
-TEST(SolverTest, NegatedDisjunction) {
+TYPED_TEST_P(SolverTest, NegatedDisjunction) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto Y = Ctx.atom();
@@ -113,10 +123,10 @@ TEST(SolverTest, NegatedDisjunction) {
   auto NotXOrY = Ctx.neg(XOrY);
 
   // !(X v Y) ^ (X v Y)
-  EXPECT_THAT(solve({NotXOrY, XOrY}), unsat());
+  EXPECT_THAT(this->solve({NotXOrY, XOrY}), unsat());
 }
 
-TEST(SolverTest, NegatedConjunction) {
+TYPED_TEST_P(SolverTest, NegatedConjunction) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto Y = Ctx.atom();
@@ -124,48 +134,48 @@ TEST(SolverTest, NegatedConjunction) {
   auto NotXAndY = Ctx.neg(XAndY);
 
   // !(X ^ Y) ^ (X ^ Y)
-  EXPECT_THAT(solve({NotXAndY, XAndY}), unsat());
+  EXPECT_THAT(this->solve({NotXAndY, XAndY}), unsat());
 }
 
-TEST(SolverTest, DisjunctionSameVarWithNegation) {
+TYPED_TEST_P(SolverTest, DisjunctionSameVarWithNegation) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto NotX = Ctx.neg(X);
   auto XOrNotX = Ctx.disj(X, NotX);
 
   // X v !X
-  EXPECT_THAT(solve({XOrNotX}), sat(_));
+  EXPECT_THAT(this->solve({XOrNotX}), sat(_));
 }
 
-TEST(SolverTest, DisjunctionSameVar) {
+TYPED_TEST_P(SolverTest, DisjunctionSameVar) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto XOrX = Ctx.disj(X, X);
 
   // X v X
-  EXPECT_THAT(solve({XOrX}), sat(_));
+  EXPECT_THAT(this->solve({XOrX}), sat(_));
 }
 
-TEST(SolverTest, ConjunctionSameVarsConflict) {
+TYPED_TEST_P(SolverTest, ConjunctionSameVarsConflict) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto NotX = Ctx.neg(X);
   auto XAndNotX = Ctx.conj(X, NotX);
 
   // X ^ !X
-  EXPECT_THAT(solve({XAndNotX}), unsat());
+  EXPECT_THAT(this->solve({XAndNotX}), unsat());
 }
 
-TEST(SolverTest, ConjunctionSameVar) {
+TYPED_TEST_P(SolverTest, ConjunctionSameVar) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto XAndX = Ctx.conj(X, X);
 
   // X ^ X
-  EXPECT_THAT(solve({XAndX}), sat(_));
+  EXPECT_THAT(this->solve({XAndX}), sat(_));
 }
 
-TEST(SolverTest, PureVar) {
+TYPED_TEST_P(SolverTest, PureVar) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto Y = Ctx.atom();
@@ -175,12 +185,12 @@ TEST(SolverTest, PureVar) {
   auto NotXOrNotY = Ctx.disj(NotX, NotY);
 
   // (!X v Y) ^ (!X v !Y)
-  EXPECT_THAT(solve({NotXOrY, NotXOrNotY}),
+  EXPECT_THAT(this->solve({NotXOrY, NotXOrNotY}),
               sat(UnorderedElementsAre(Pair(X->getAtom(), AssignedFalse),
                                        Pair(Y->getAtom(), _))));
 }
 
-TEST(SolverTest, MustAssumeVarIsFalse) {
+TYPED_TEST_P(SolverTest, MustAssumeVarIsFalse) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto Y = Ctx.atom();
@@ -191,12 +201,12 @@ TEST(SolverTest, MustAssumeVarIsFalse) {
   auto NotXOrNotY = Ctx.disj(NotX, NotY);
 
   // (X v Y) ^ (!X v Y) ^ (!X v !Y)
-  EXPECT_THAT(solve({XOrY, NotXOrY, NotXOrNotY}),
+  EXPECT_THAT(this->solve({XOrY, NotXOrY, NotXOrNotY}),
               sat(UnorderedElementsAre(Pair(X->getAtom(), AssignedFalse),
                                        Pair(Y->getAtom(), AssignedTrue))));
 }
 
-TEST(SolverTest, DeepConflict) {
+TYPED_TEST_P(SolverTest, DeepConflict) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto Y = Ctx.atom();
@@ -208,10 +218,10 @@ TEST(SolverTest, DeepConflict) {
   auto XOrNotY = Ctx.disj(X, NotY);
 
   // (X v Y) ^ (!X v Y) ^ (!X v !Y) ^ (X v !Y)
-  EXPECT_THAT(solve({XOrY, NotXOrY, NotXOrNotY, XOrNotY}), unsat());
+  EXPECT_THAT(this->solve({XOrY, NotXOrY, NotXOrNotY, XOrNotY}), unsat());
 }
 
-TEST(SolverTest, IffIsEquivalentToDNF) {
+TYPED_TEST_P(SolverTest, IffIsEquivalentToDNF) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto Y = Ctx.atom();
@@ -222,19 +232,19 @@ TEST(SolverTest, IffIsEquivalentToDNF) {
   auto NotEquivalent = Ctx.neg(Ctx.iff(XIffY, XIffYDNF));
 
   // !((X <=> Y) <=> ((X ^ Y) v (!X ^ !Y)))
-  EXPECT_THAT(solve({NotEquivalent}), unsat());
+  EXPECT_THAT(this->solve({NotEquivalent}), unsat());
 }
 
-TEST(SolverTest, IffSameVars) {
+TYPED_TEST_P(SolverTest, IffSameVars) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto XEqX = Ctx.iff(X, X);
 
   // X <=> X
-  EXPECT_THAT(solve({XEqX}), sat(_));
+  EXPECT_THAT(this->solve({XEqX}), sat(_));
 }
 
-TEST(SolverTest, IffDistinctVars) {
+TYPED_TEST_P(SolverTest, IffDistinctVars) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto Y = Ctx.atom();
@@ -242,36 +252,36 @@ TEST(SolverTest, IffDistinctVars) {
 
   // X <=> Y
   EXPECT_THAT(
-      solve({XEqY}),
+      this->solve({XEqY}),
       sat(AnyOf(UnorderedElementsAre(Pair(X->getAtom(), AssignedTrue),
                                      Pair(Y->getAtom(), AssignedTrue)),
                 UnorderedElementsAre(Pair(X->getAtom(), AssignedFalse),
                                      Pair(Y->getAtom(), AssignedFalse)))));
 }
 
-TEST(SolverTest, IffWithUnits) {
+TYPED_TEST_P(SolverTest, IffWithUnits) {
   ConstraintContext Ctx;
   auto X = Ctx.atom();
   auto Y = Ctx.atom();
   auto XEqY = Ctx.iff(X, Y);
 
   // (X <=> Y) ^ X ^ Y
-  EXPECT_THAT(solve({XEqY, X, Y}),
+  EXPECT_THAT(this->solve({XEqY, X, Y}),
               sat(UnorderedElementsAre(Pair(X->getAtom(), AssignedTrue),
                                        Pair(Y->getAtom(), AssignedTrue))));
 }
 
-TEST(SolverTest, IffWithUnitsConflict) {
+TYPED_TEST_P(SolverTest, IffWithUnitsConflict) {
   Arena A;
   auto Constraints = parseFormulas(A, R"(
      (V0 = V1)
      V0
      !V1
   )");
-  EXPECT_THAT(solve(Constraints), unsat());
+  EXPECT_THAT(this->solve(Constraints), unsat());
 }
 
-TEST(SolverTest, IffTransitiveConflict) {
+TYPED_TEST_P(SolverTest, IffTransitiveConflict) {
   Arena A;
   auto Constraints = parseFormulas(A, R"(
      (V0 = V1)
@@ -279,63 +289,63 @@ TEST(SolverTest, IffTransitiveConflict) {
      V2
      !V0
   )");
-  EXPECT_THAT(solve(Constraints), unsat());
+  EXPECT_THAT(this->solve(Constraints), unsat());
 }
 
-TEST(SolverTest, DeMorgan) {
+TYPED_TEST_P(SolverTest, DeMorgan) {
   Arena A;
   auto Constraints = parseFormulas(A, R"(
      (!(V0 | V1) = (!V0 & !V1))
      (!(V2 & V3) = (!V2 | !V3))
   )");
-  EXPECT_THAT(solve(Constraints), sat(_));
+  EXPECT_THAT(this->solve(Constraints), sat(_));
 }
 
-TEST(SolverTest, RespectsAdditionalConstraints) {
+TYPED_TEST_P(SolverTest, RespectsAdditionalConstraints) {
   Arena A;
   auto Constraints = parseFormulas(A, R"(
      (V0 = V1)
      V0
      !V1
   )");
-  EXPECT_THAT(solve(Constraints), unsat());
+  EXPECT_THAT(this->solve(Constraints), unsat());
 }
 
-TEST(SolverTest, ImplicationIsEquivalentToDNF) {
+TYPED_TEST_P(SolverTest, ImplicationIsEquivalentToDNF) {
   Arena A;
   auto Constraints = parseFormulas(A, R"(
      !((V0 => V1) = (!V0 | V1))
   )");
-  EXPECT_THAT(solve(Constraints), unsat());
+  EXPECT_THAT(this->solve(Constraints), unsat());
 }
 
-TEST(SolverTest, ImplicationConflict) {
+TYPED_TEST_P(SolverTest, ImplicationConflict) {
   Arena A;
   auto Constraints = parseFormulas(A, R"(
      (V0 => V1)
      (V0 & !V1)
   )");
-  EXPECT_THAT(solve(Constraints), unsat());
+  EXPECT_THAT(this->solve(Constraints), unsat());
 }
 
-TEST(SolverTest, ReachedLimitsReflectsTimeouts) {
+TYPED_TEST_P(SolverTest, ReachedLimitsReflectsTimeouts) {
   Arena A;
   auto Constraints = parseFormulas(A, R"(
      (!(V0 | V1) = (!V0 & !V1))
      (!(V2 & V3) = (!V2 & !V3))
   )");
-  WatchedLiteralsSolver solver(10);
+  TypeParam solver = this->createSolverWithLowTimeout();
   ASSERT_EQ(solver.solve(Constraints).getStatus(),
             Solver::Result::Status::TimedOut);
   EXPECT_TRUE(solver.reachedLimit());
 }
 
-TEST(SolverTest, SimpleButLargeContradiction) {
+TYPED_TEST_P(SolverTest, SimpleButLargeContradiction) {
   // This test ensures that the solver takes a short-cut on known
   // contradictory inputs, without using max_iterations. At the time
   // this test is added, formulas that are easily recognized to be
   // contradictory at CNF construction time would lead to timeout.
-  WatchedLiteralsSolver solver(10);
+  TypeParam solver = this->createSolverWithLowTimeout();
   ConstraintContext Ctx;
   auto first = Ctx.atom();
   auto last = first;
@@ -358,4 +368,16 @@ TEST(SolverTest, SimpleButLargeContradiction) {
   EXPECT_FALSE(solver.reachedLimit());
 }
 
-} // namespace
+REGISTER_TYPED_TEST_SUITE_P(
+    SolverTest, Var, NegatedVar, UnitConflict, DistinctVars, DoubleNegation,
+    NegatedDisjunction, NegatedConjunction, DisjunctionSameVarWithNegation,
+    DisjunctionSameVar, ConjunctionSameVarsConflict, ConjunctionSameVar,
+    PureVar, MustAssumeVarIsFalse, DeepConflict, IffIsEquivalentToDNF,
+    IffSameVars, IffDistinctVars, IffWithUnits, IffWithUnitsConflict,
+    IffTransitiveConflict, DeMorgan, RespectsAdditionalConstraints,
+    ImplicationIsEquivalentToDNF, ImplicationConflict,
+    ReachedLimitsReflectsTimeouts, SimpleButLargeContradiction);
+
+} // namespace clang::dataflow::test
+
+#endif // LLVM_CLANG_ANALYSIS_FLOW_SENSITIVE_TESTING_SUPPORT_H_
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
index 301bec32c0cf..e1fb16b64fd6 100644
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -2228,7 +2228,7 @@ TEST(TransferTest, AssignmentOperator) {
       A Foo = { 1 };
       A Bar = { 2 };
       // [[p1]]
-      Foo = Bar;
+      A &Rval = (Foo = Bar);
       // [[p2]]
       Foo.Baz = 3;
       // [[p3]]
@@ -2274,6 +2274,7 @@ TEST(TransferTest, AssignmentOperator) {
               cast<RecordStorageLocation>(Env2.getStorageLocation(*BarDecl));
 
           EXPECT_TRUE(recordsEqual(*FooLoc2, *BarLoc2, Env2));
+          EXPECT_EQ(&getLocForDecl(ASTCtx, Env2, "Rval"), FooLoc2);
 
           const auto *FooBazVal2 =
               cast<IntegerValue>(getFieldValue(FooLoc2, *BazDecl, Env2));
@@ -2441,7 +2442,75 @@ TEST(TransferTest, AssignmentOperatorReturnsByValue) {
   // This is a crash repro.
   std::string Code = R"(
     struct S {
-      S operator=(S&& other);
+      S operator=(const S&);
+      int i;
+    };
+    void target() {
+      S S1 = { 1 };
+      S S2 = { 2 };
+      S S3 = { 3 };
+      // [[before]]
+      // Test that the returned value is modeled by assigning to another value.
+      S1 = (S2 = S3);
+      (void)0;
+      // [[after]]
+    }
+  )";
+  runDataflow(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        const ValueDecl *S1Decl = findValueDecl(ASTCtx, "S1");
+        const ValueDecl *S2Decl = findValueDecl(ASTCtx, "S2");
+        const ValueDecl *S3Decl = findValueDecl(ASTCtx, "S3");
+
+        const Environment &EnvBefore =
+            getEnvironmentAtAnnotation(Results, "before");
+
+        EXPECT_FALSE(recordsEqual(
+            *EnvBefore.get<RecordStorageLocation>(*S1Decl),
+            *EnvBefore.get<RecordStorageLocation>(*S2Decl), EnvBefore));
+        EXPECT_FALSE(recordsEqual(
+            *EnvBefore.get<RecordStorageLocation>(*S2Decl),
+            *EnvBefore.get<RecordStorageLocation>(*S3Decl), EnvBefore));
+
+        const Environment &EnvAfter =
+            getEnvironmentAtAnnotation(Results, "after");
+
+        EXPECT_TRUE(recordsEqual(*EnvAfter.get<RecordStorageLocation>(*S1Decl),
+                                 *EnvAfter.get<RecordStorageLocation>(*S2Decl),
+                                 EnvAfter));
+        EXPECT_TRUE(recordsEqual(*EnvAfter.get<RecordStorageLocation>(*S2Decl),
+                                 *EnvAfter.get<RecordStorageLocation>(*S3Decl),
+                                 EnvAfter));
+      });
+}
+
+TEST(TransferTest, AssignmentOperatorReturnsDifferentTypeByRef) {
+  // This is a crash repro.
+  std::string Code = R"(
+    struct DifferentType {};
+    struct S {
+      DifferentType& operator=(const S&);
+    };
+    void target() {
+      S s;
+      s = S();
+      // [[p]]
+    }
+  )";
+  runDataflow(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {});
+}
+
+TEST(TransferTest, AssignmentOperatorReturnsDifferentTypeByValue) {
+  // This is a crash repro.
+  std::string Code = R"(
+    struct DifferentType {};
+    struct S {
+      DifferentType operator=(const S&);
     };
     void target() {
       S s;
@@ -3331,6 +3400,60 @@ TEST(TransferTest, ResultObjectLocationDontVisitNestedRecordDecl) {
          ASTContext &ASTCtx) {});
 }
 
+TEST(TransferTest, ResultObjectLocationDontVisitUnevaluatedContexts) {
+  // This is a crash repro.
+  // We used to crash because when propagating result objects, we would visit
+  // unevaluated contexts, but we don't model fields used only in these.
+
+  auto testFunction = [](llvm::StringRef Code, llvm::StringRef TargetFun) {
+    runDataflow(
+        Code,
+        [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+           ASTContext &ASTCtx) {},
+        LangStandard::lang_gnucxx17,
+        /* ApplyBuiltinTransfer= */ true, TargetFun);
+  };
+
+  std::string Code = R"cc(
+    // Definitions needed for `typeid`.
+    namespace std {
+      class type_info {};
+      class bad_typeid {};
+    }  // namespace std
+
+    struct S1 {};
+    struct S2 { S1 s1; };
+
+    // We test each type of unevaluated context from a different target
+    // function. Some types of unevaluated contexts may actually cause the
+    // field `s1` to be modeled, and we don't want this to "pollute" the tests
+    // for the other unevaluated contexts.
+    void decltypeTarget() {
+        decltype(S2{}) Dummy;
+    }
+    void typeofTarget() {
+        typeof(S2{}) Dummy;
+    }
+    void typeidTarget() {
+#if __has_feature(cxx_rtti)
+        typeid(S2{});
+#endif
+    }
+    void sizeofTarget() {
+        sizeof(S2{});
+    }
+    void noexceptTarget() {
+        noexcept(S2{});
+    }
+  )cc";
+
+  testFunction(Code, "decltypeTarget");
+  testFunction(Code, "typeofTarget");
+  testFunction(Code, "typeidTarget");
+  testFunction(Code, "sizeofTarget");
+  testFunction(Code, "noexceptTarget");
+}
+
 TEST(TransferTest, StaticCast) {
   std::string Code = R"(
     void target(int Foo) {
@@ -4586,6 +4709,94 @@ TEST(TransferTest, BooleanInequality) {
       });
 }
 
+TEST(TransferTest, PointerEquality) {
+  std::string Code = R"cc(
+    void target() {
+      int i = 0;
+      int i_other = 0;
+      int *p1 = &i;
+      int *p2 = &i;
+      int *p_other = &i_other;
+      int *null = nullptr;
+
+      bool p1_eq_p1 = (p1 == p1);
+      bool p1_eq_p2 = (p1 == p2);
+      bool p1_eq_p_other = (p1 == p_other);
+
+      bool p1_eq_null = (p1 == null);
+      bool p1_eq_nullptr = (p1 == nullptr);
+      bool null_eq_nullptr = (null == nullptr);
+      bool nullptr_eq_nullptr = (nullptr == nullptr);
+
+      // We won't duplicate all of the tests above with `!=`, as we know that
+      // the implementation simply negates the result of the `==` comparison.
+      // Instaed, just spot-check one case.
+      bool p1_ne_p1 = (p1 != p1);
+
+      (void)0; // [[p]]
+    }
+  )cc";
+  runDataflow(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+
+        // Check the we have indeed set things up so that `p1` and `p2` have
+        // different pointer values.
+        EXPECT_NE(&getValueForDecl<PointerValue>(ASTCtx, Env, "p1"),
+                  &getValueForDecl<PointerValue>(ASTCtx, Env, "p2"));
+
+        EXPECT_EQ(&getValueForDecl<BoolValue>(ASTCtx, Env, "p1_eq_p1"),
+                  &Env.getBoolLiteralValue(true));
+        EXPECT_EQ(&getValueForDecl<BoolValue>(ASTCtx, Env, "p1_eq_p2"),
+                  &Env.getBoolLiteralValue(true));
+        EXPECT_TRUE(isa<AtomicBoolValue>(
+            getValueForDecl<BoolValue>(ASTCtx, Env, "p1_eq_p_other")));
+
+        EXPECT_TRUE(isa<AtomicBoolValue>(
+            getValueForDecl<BoolValue>(ASTCtx, Env, "p1_eq_null")));
+        EXPECT_TRUE(isa<AtomicBoolValue>(
+            getValueForDecl<BoolValue>(ASTCtx, Env, "p1_eq_nullptr")));
+        EXPECT_EQ(&getValueForDecl<BoolValue>(ASTCtx, Env, "null_eq_nullptr"),
+                  &Env.getBoolLiteralValue(true));
+        EXPECT_EQ(
+            &getValueForDecl<BoolValue>(ASTCtx, Env, "nullptr_eq_nullptr"),
+            &Env.getBoolLiteralValue(true));
+
+        EXPECT_EQ(&getValueForDecl<BoolValue>(ASTCtx, Env, "p1_ne_p1"),
+                  &Env.getBoolLiteralValue(false));
+      });
+}
+
+TEST(TransferTest, PointerEqualityUnionMembers) {
+  std::string Code = R"cc(
+    union U {
+      int i1;
+      int i2;
+    };
+    void target() {
+      U u;
+      bool i1_eq_i2 = (&u.i1 == &u.i2);
+
+      (void)0; // [[p]]
+    }
+  )cc";
+  runDataflow(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+
+        // FIXME: By the standard, `u.i1` and `u.i2` should have the same
+        // address, but we don't yet model this property of union members
+        // correctly. The result is therefore weaker than it could be (just an
+        // atom rather than a true literal), though not wrong.
+        EXPECT_TRUE(isa<AtomicBoolValue>(
+            getValueForDecl<BoolValue>(ASTCtx, Env, "i1_eq_i2")));
+      });
+}
+
 TEST(TransferTest, IntegerLiteralEquality) {
   std::string Code = R"(
     void target() {
diff --git a/clang/unittests/Analysis/FlowSensitive/WatchedLiteralsSolverTest.cpp b/clang/unittests/Analysis/FlowSensitive/WatchedLiteralsSolverTest.cpp
new file mode 100644
index 000000000000..d194742dbea7
--- /dev/null
+++ b/clang/unittests/Analysis/FlowSensitive/WatchedLiteralsSolverTest.cpp
@@ -0,0 +1,26 @@
+//===- unittests/Analysis/FlowSensitive/WatchedLiteralsSolverTest.cpp -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/FlowSensitive/WatchedLiteralsSolver.h"
+#include "SolverTest.h"
+
+namespace clang::dataflow::test {
+
+template <>
+WatchedLiteralsSolver
+SolverTest<WatchedLiteralsSolver>::createSolverWithLowTimeout() {
+  return WatchedLiteralsSolver(10);
+}
+
+namespace {
+
+INSTANTIATE_TYPED_TEST_SUITE_P(WatchedLiteralsSolverTest, SolverTest,
+                               WatchedLiteralsSolver, );
+
+} // namespace
+} // namespace clang::dataflow::test
diff --git a/clang/unittests/CodeGen/IRMatchers.h b/clang/unittests/CodeGen/IRMatchers.h
index 47e420498036..3572a317f07a 100644
--- a/clang/unittests/CodeGen/IRMatchers.h
+++ b/clang/unittests/CodeGen/IRMatchers.h
@@ -317,7 +317,7 @@ public:
   NameMetaMatcher(StringRef N) : Name(N) {}
   bool matchEntity(const Metadata &M, MatcherContext &C) override {
     if (auto *MDS = dyn_cast<MDString>(&M))
-      return MDS->getString().equals(Name);
+      return MDS->getString() == Name;
     return false;
   }
 };
diff --git a/clang/unittests/Driver/DXCModeTest.cpp b/clang/unittests/Driver/DXCModeTest.cpp
index b3767c042edb..416723d498a2 100644
--- a/clang/unittests/Driver/DXCModeTest.cpp
+++ b/clang/unittests/Driver/DXCModeTest.cpp
@@ -68,25 +68,27 @@ TEST(DxcModeTest, TargetProfileValidation) {
   IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts = new DiagnosticOptions();
   DiagnosticsEngine Diags(DiagID, &*DiagOpts, DiagConsumer);
 
-  validateTargetProfile("-Tvs_6_0", "dxil--shadermodel6.0-vertex",
+  validateTargetProfile("-Tvs_6_0", "dxilv1.0--shadermodel6.0-vertex",
                         InMemoryFileSystem, Diags);
-  validateTargetProfile("-Ths_6_1", "dxil--shadermodel6.1-hull",
+  validateTargetProfile("-Ths_6_1", "dxilv1.1--shadermodel6.1-hull",
                         InMemoryFileSystem, Diags);
-  validateTargetProfile("-Tds_6_2", "dxil--shadermodel6.2-domain",
+  validateTargetProfile("-Tds_6_2", "dxilv1.2--shadermodel6.2-domain",
                         InMemoryFileSystem, Diags);
-  validateTargetProfile("-Tds_6_2", "dxil--shadermodel6.2-domain",
+  validateTargetProfile("-Tds_6_2", "dxilv1.2--shadermodel6.2-domain",
                         InMemoryFileSystem, Diags);
-  validateTargetProfile("-Tgs_6_3", "dxil--shadermodel6.3-geometry",
+  validateTargetProfile("-Tgs_6_3", "dxilv1.3--shadermodel6.3-geometry",
                         InMemoryFileSystem, Diags);
-  validateTargetProfile("-Tps_6_4", "dxil--shadermodel6.4-pixel",
+  validateTargetProfile("-Tps_6_4", "dxilv1.4--shadermodel6.4-pixel",
                         InMemoryFileSystem, Diags);
-  validateTargetProfile("-Tcs_6_5", "dxil--shadermodel6.5-compute",
+  validateTargetProfile("-Tcs_6_5", "dxilv1.5--shadermodel6.5-compute",
                         InMemoryFileSystem, Diags);
-  validateTargetProfile("-Tms_6_6", "dxil--shadermodel6.6-mesh",
+  validateTargetProfile("-Tms_6_6", "dxilv1.6--shadermodel6.6-mesh",
                         InMemoryFileSystem, Diags);
-  validateTargetProfile("-Tas_6_7", "dxil--shadermodel6.7-amplification",
+  validateTargetProfile("-Tas_6_7", "dxilv1.7--shadermodel6.7-amplification",
                         InMemoryFileSystem, Diags);
-  validateTargetProfile("-Tlib_6_x", "dxil--shadermodel6.15-library",
+  validateTargetProfile("-Tcs_6_8", "dxilv1.8--shadermodel6.8-compute",
+                        InMemoryFileSystem, Diags);
+  validateTargetProfile("-Tlib_6_x", "dxilv1.8--shadermodel6.15-library",
                         InMemoryFileSystem, Diags);
 
   // Invalid tests.
diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp
index 8c74ed2d119a..82e72f08ffb5 100644
--- a/clang/unittests/Format/ConfigParseTest.cpp
+++ b/clang/unittests/Format/ConfigParseTest.cpp
@@ -153,6 +153,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) {
   Style.Language = FormatStyle::LK_Cpp;
   CHECK_PARSE_BOOL(AllowAllArgumentsOnNextLine);
   CHECK_PARSE_BOOL(AllowAllParametersOfDeclarationOnNextLine);
+  CHECK_PARSE_BOOL(AllowShortCaseExpressionOnASingleLine);
   CHECK_PARSE_BOOL(AllowShortCaseLabelsOnASingleLine);
   CHECK_PARSE_BOOL(AllowShortCompoundRequirementOnASingleLine);
   CHECK_PARSE_BOOL(AllowShortEnumsOnASingleLine);
@@ -205,6 +206,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) {
   CHECK_PARSE_NESTED_BOOL(AlignConsecutiveShortCaseStatements,
                           AcrossEmptyLines);
   CHECK_PARSE_NESTED_BOOL(AlignConsecutiveShortCaseStatements, AcrossComments);
+  CHECK_PARSE_NESTED_BOOL(AlignConsecutiveShortCaseStatements, AlignCaseArrows);
   CHECK_PARSE_NESTED_BOOL(AlignConsecutiveShortCaseStatements, AlignCaseColons);
   CHECK_PARSE_NESTED_BOOL(BraceWrapping, AfterCaseLabel);
   CHECK_PARSE_NESTED_BOOL(BraceWrapping, AfterClass);
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 32ba6b6853c7..e6f8e4a06515 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -27204,8 +27204,14 @@ TEST_F(FormatTest, RemoveParentheses) {
                "if ((({ a; })))\n"
                "  b;",
                Style);
+  verifyFormat("static_assert((std::is_constructible_v<T, Args &&> && ...));",
+               "static_assert(((std::is_constructible_v<T, Args &&> && ...)));",
+               Style);
   verifyFormat("return (0);", "return (((0)));", Style);
   verifyFormat("return (({ 0; }));", "return ((({ 0; })));", Style);
+  verifyFormat("return ((... && std::is_convertible_v<TArgsLocal, TArgs>));",
+               "return (((... && std::is_convertible_v<TArgsLocal, TArgs>)));",
+               Style);
 
   Style.RemoveParentheses = FormatStyle::RPS_ReturnStatement;
   verifyFormat("#define Return0 return (0);", Style);
@@ -27213,6 +27219,9 @@ TEST_F(FormatTest, RemoveParentheses) {
   verifyFormat("co_return 0;", "co_return ((0));", Style);
   verifyFormat("return 0;", "return (((0)));", Style);
   verifyFormat("return ({ 0; });", "return ((({ 0; })));", Style);
+  verifyFormat("return (... && std::is_convertible_v<TArgsLocal, TArgs>);",
+               "return (((... && std::is_convertible_v<TArgsLocal, TArgs>)));",
+               Style);
   verifyFormat("inline decltype(auto) f() {\n"
                "  if (a) {\n"
                "    return (a);\n"
diff --git a/clang/unittests/Format/FormatTestJava.cpp b/clang/unittests/Format/FormatTestJava.cpp
index 6da5f4fa2543..33998bc7ff85 100644
--- a/clang/unittests/Format/FormatTestJava.cpp
+++ b/clang/unittests/Format/FormatTestJava.cpp
@@ -618,6 +618,177 @@ TEST_F(FormatTestJava, ConfigurableSpacesInSquareBrackets) {
   verifyFormat("types[ i ] = arguments[ i ].getClass();", Spaces);
 }
 
+TEST_F(FormatTestJava, SwitchExpression) {
+  auto Style = getLLVMStyle(FormatStyle::LK_Java);
+  EXPECT_TRUE(Style.AllowShortCaseExpressionOnASingleLine);
+
+  verifyFormat("foo(switch (day) {\n"
+               "  case THURSDAY, SATURDAY -> 8;\n"
+               "  case WEDNESDAY -> 9;\n"
+               "  default -> 1;\n"
+               "});",
+               Style);
+
+  constexpr StringRef Code1{"i = switch (day) {\n"
+                            "  case THURSDAY, SATURDAY -> 8;\n"
+                            "  case WEDNESDAY -> 9;\n"
+                            "  default -> 0;\n"
+                            "};"};
+  verifyFormat(Code1, Style);
+
+  Style.IndentCaseLabels = true;
+  verifyFormat(Code1, Style);
+
+  constexpr StringRef Code2{"i = switch (day) {\n"
+                            "  case THURSDAY, SATURDAY -> {\n"
+                            "    foo();\n"
+                            "    yield 8;\n"
+                            "  }\n"
+                            "  case WEDNESDAY -> {\n"
+                            "    bar();\n"
+                            "    yield 9;\n"
+                            "  }\n"
+                            "  default -> {\n"
+                            "    yield 0;\n"
+                            "  }\n"
+                            "};"};
+  verifyFormat(Code2, Style);
+
+  Style.IndentCaseLabels = false;
+  verifyFormat(Code2, Style);
+
+  constexpr StringRef Code3{"switch (day) {\n"
+                            "case THURSDAY, SATURDAY -> i = 8;\n"
+                            "case WEDNESDAY -> i = 9;\n"
+                            "default -> i = 0;\n"
+                            "};"};
+  verifyFormat(Code3, Style);
+
+  Style.IndentCaseLabels = true;
+  verifyFormat("switch (day) {\n"
+               "  case THURSDAY, SATURDAY -> i = 8;\n"
+               "  case WEDNESDAY -> i = 9;\n"
+               "  default -> i = 0;\n"
+               "};",
+               Code3, Style);
+}
+
+TEST_F(FormatTestJava, ShortCaseExpression) {
+  auto Style = getLLVMStyle(FormatStyle::LK_Java);
+
+  verifyFormat("i = switch (a) {\n"
+               "  case 1 -> 1;\n"
+               "  case 2 -> // comment\n"
+               "    2;\n"
+               "  case 3 ->\n"
+               "    // comment\n"
+               "    3;\n"
+               "  case 4 -> 4; // comment\n"
+               "  default -> 0;\n"
+               "};",
+               Style);
+
+  verifyNoChange("i = switch (a) {\n"
+                 "  case 1 -> 1;\n"
+                 "  // comment\n"
+                 "  case 2 -> 2;\n"
+                 "  // comment 1\n"
+                 "  // comment 2\n"
+                 "  case 3 -> 3; /* comment */\n"
+                 "  case 4 -> /* comment */ 4;\n"
+                 "  case 5 -> x + /* comment */ 1;\n"
+                 "  default ->\n"
+                 "    0; // comment line 1\n"
+                 "       // comment line 2\n"
+                 "};",
+                 Style);
+
+  Style.ColumnLimit = 18;
+  verifyFormat("i = switch (a) {\n"
+               "  case Monday ->\n"
+               "    1;\n"
+               "  default -> 9999;\n"
+               "};",
+               Style);
+
+  Style.ColumnLimit = 80;
+  Style.AllowShortCaseExpressionOnASingleLine = false;
+  Style.IndentCaseLabels = true;
+  verifyFormat("i = switch (n) {\n"
+               "  default /*comments*/ ->\n"
+               "    1;\n"
+               "  case 0 ->\n"
+               "    0;\n"
+               "};",
+               Style);
+
+  Style.AllowShortCaseExpressionOnASingleLine = true;
+  Style.BreakBeforeBraces = FormatStyle::BS_Custom;
+  Style.BraceWrapping.AfterCaseLabel = true;
+  Style.BraceWrapping.AfterControlStatement = FormatStyle::BWACS_Always;
+  verifyFormat("i = switch (n)\n"
+               "{\n"
+               "  case 0 ->\n"
+               "  {\n"
+               "    yield 0;\n"
+               "  }\n"
+               "  default ->\n"
+               "  {\n"
+               "    yield 1;\n"
+               "  }\n"
+               "};",
+               Style);
+}
+
+TEST_F(FormatTestJava, AlignCaseArrows) {
+  auto Style = getLLVMStyle(FormatStyle::LK_Java);
+  Style.AlignConsecutiveShortCaseStatements.Enabled = true;
+
+  verifyFormat("foo(switch (day) {\n"
+               "  case THURSDAY, SATURDAY -> 8;\n"
+               "  case WEDNESDAY ->          9;\n"
+               "  default ->                 1;\n"
+               "});",
+               Style);
+
+  verifyFormat("i = switch (day) {\n"
+               "  case THURSDAY, SATURDAY -> 8;\n"
+               "  case WEDNESDAY ->          9;\n"
+               "  default ->                 0;\n"
+               "};",
+               Style);
+
+  verifyFormat("switch (day) {\n"
+               "case THURSDAY, SATURDAY -> i = 8;\n"
+               "case WEDNESDAY ->          i = 9;\n"
+               "default ->                 i = 0;\n"
+               "};",
+               Style);
+
+  Style.AlignConsecutiveShortCaseStatements.AlignCaseArrows = true;
+
+  verifyFormat("foo(switch (day) {\n"
+               "  case THURSDAY, SATURDAY -> 8;\n"
+               "  case WEDNESDAY          -> 9;\n"
+               "  default                 -> 1;\n"
+               "});",
+               Style);
+
+  verifyFormat("i = switch (day) {\n"
+               "  case THURSDAY, SATURDAY -> 8;\n"
+               "  case WEDNESDAY          -> 9;\n"
+               "  default                 -> 0;\n"
+               "};",
+               Style);
+
+  verifyFormat("switch (day) {\n"
+               "case THURSDAY, SATURDAY -> i = 8;\n"
+               "case WEDNESDAY          -> i = 9;\n"
+               "default                 -> i = 0;\n"
+               "};",
+               Style);
+}
+
 } // namespace
 } // namespace test
 } // namespace format
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 01daf8dee505..51b475d37977 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -489,6 +489,10 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) {
   EXPECT_TOKEN(Tokens[24], tok::amp, TT_UnaryOperator);
   EXPECT_TOKEN(Tokens[27], tok::l_square, TT_ArraySubscriptLSquare);
   EXPECT_TOKEN(Tokens[32], tok::r_brace, TT_StructRBrace);
+
+  Tokens = annotate("template <typename T, enum E e> struct S {};");
+  ASSERT_EQ(Tokens.size(), 15u) << Tokens;
+  EXPECT_TOKEN(Tokens[11], tok::l_brace, TT_StructLBrace);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandsUnions) {
@@ -2977,6 +2981,24 @@ TEST_F(TokenAnnotatorTest, BlockLBrace) {
   EXPECT_BRACE_KIND(Tokens[5], BK_Block);
 }
 
+TEST_F(TokenAnnotatorTest, SwitchExpression) {
+  auto Style = getLLVMStyle(FormatStyle::LK_Java);
+  auto Tokens = annotate("i = switch (day) {\n"
+                         "  case THURSDAY, SATURDAY -> 8;\n"
+                         "  case WEDNESDAY -> 9;\n"
+                         "  default -> 1;\n"
+                         "};",
+                         Style);
+  ASSERT_EQ(Tokens.size(), 26u) << Tokens;
+  EXPECT_TOKEN(Tokens[6], tok::l_brace, TT_SwitchExpressionLBrace);
+  EXPECT_TOKEN(Tokens[7], tok::kw_case, TT_SwitchExpressionLabel);
+  EXPECT_TOKEN(Tokens[11], tok::arrow, TT_CaseLabelArrow);
+  EXPECT_TOKEN(Tokens[14], tok::kw_case, TT_SwitchExpressionLabel);
+  EXPECT_TOKEN(Tokens[16], tok::arrow, TT_CaseLabelArrow);
+  EXPECT_TOKEN(Tokens[19], tok::kw_default, TT_SwitchExpressionLabel);
+  EXPECT_TOKEN(Tokens[20], tok::arrow, TT_CaseLabelArrow);
+}
+
 } // namespace
 } // namespace format
 } // namespace clang
diff --git a/clang/unittests/Serialization/SourceLocationEncodingTest.cpp b/clang/unittests/Serialization/SourceLocationEncodingTest.cpp
index 141da4c27f8d..c80a8fd0e52b 100644
--- a/clang/unittests/Serialization/SourceLocationEncodingTest.cpp
+++ b/clang/unittests/Serialization/SourceLocationEncodingTest.cpp
@@ -23,13 +23,14 @@ using LocSeq = SourceLocationSequence;
 // Loc is the raw (in-memory) form of SourceLocation.
 void roundTrip(SourceLocation::UIntTy Loc,
                std::optional<uint64_t> ExpectedEncoded = std::nullopt) {
-  uint64_t ActualEncoded =
-      SourceLocationEncoding::encode(SourceLocation::getFromRawEncoding(Loc));
+  uint64_t ActualEncoded = SourceLocationEncoding::encode(
+      SourceLocation::getFromRawEncoding(Loc), /*BaseOffset=*/0,
+      /*BaseModuleFileIndex=*/0);
   if (ExpectedEncoded) {
     ASSERT_EQ(ActualEncoded, *ExpectedEncoded) << "Encoding " << Loc;
   }
   SourceLocation::UIntTy DecodedEncoded =
-      SourceLocationEncoding::decode(ActualEncoded).getRawEncoding();
+      SourceLocationEncoding::decode(ActualEncoded).first.getRawEncoding();
   ASSERT_EQ(DecodedEncoded, Loc) << "Decoding " << ActualEncoded;
 }
 
@@ -41,7 +42,8 @@ void roundTrip(std::vector<SourceLocation::UIntTy> Locs,
     LocSeq::State Seq;
     for (auto L : Locs)
       ActualEncoded.push_back(SourceLocationEncoding::encode(
-          SourceLocation::getFromRawEncoding(L), Seq));
+          SourceLocation::getFromRawEncoding(L), /*BaseOffset=*/0,
+          /*BaseModuleFileIndex=*/0, Seq));
     if (!ExpectedEncoded.empty()) {
       ASSERT_EQ(ActualEncoded, ExpectedEncoded)
           << "Encoding " << testing::PrintToString(Locs);
@@ -51,7 +53,7 @@ void roundTrip(std::vector<SourceLocation::UIntTy> Locs,
   {
     LocSeq::State Seq;
     for (auto L : ActualEncoded) {
-      SourceLocation Loc = SourceLocationEncoding::decode(L, Seq);
+      SourceLocation Loc = SourceLocationEncoding::decode(L, Seq).first;
       DecodedEncoded.push_back(Loc.getRawEncoding());
     }
     ASSERT_EQ(DecodedEncoded, Locs)
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 875521bd505d..92fdcf5556ed 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -10698,7 +10698,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/1815.html">1815</a></td>
     <td>CD4</td>
     <td>Lifetime extension in aggregate initialization</td>
-    <td class="none" align="center">No</td>
+    <td class="unreleased" align="center">Clang 19</td>
   </tr>
   <tr id="1816">
     <td><a href="https://cplusplus.github.io/CWG/issues/1816.html">1816</a></td>
@@ -10728,7 +10728,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/1820.html">1820</a></td>
     <td>CD6</td>
     <td>Qualified typedef names</td>
-    <td class="unknown" align="center">Unknown</td>
+    <td class="full" align="center">Clang 3.5</td>
   </tr>
   <tr id="1821">
     <td><a href="https://cplusplus.github.io/CWG/issues/1821.html">1821</a></td>
@@ -16823,7 +16823,7 @@ objects</td>
   </tr>
   <tr class="open" id="2836">
     <td><a href="https://cplusplus.github.io/CWG/issues/2836.html">2836</a></td>
-    <td>review</td>
+    <td>tentatively ready</td>
     <td>Conversion rank of <TT>long double</TT> and extended floating-point types</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -16997,7 +16997,7 @@ objects</td>
   </tr>
   <tr class="open" id="2865">
     <td><a href="https://cplusplus.github.io/CWG/issues/2865.html">2865</a></td>
-    <td>open</td>
+    <td>tentatively ready</td>
     <td>Regression on result of conditional operator</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -17009,7 +17009,7 @@ objects</td>
   </tr>
   <tr class="open" id="2867">
     <td><a href="https://cplusplus.github.io/CWG/issues/2867.html">2867</a></td>
-    <td>open</td>
+    <td>review</td>
     <td>Order of initialization for structured bindings</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -17126,6 +17126,18 @@ objects</td>
     <td>open</td>
     <td>Temporaries and trivial potentially-throwing special member functions</td>
     <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2887">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2887.html">2887</a></td>
+    <td>open</td>
+    <td>Missing compatibility entries for xvalues</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2888">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2888.html">2888</a></td>
+    <td>open</td>
+    <td>Missing cases for reference and array types for argument-dependent lookup</td>
+    <td align="center">Not resolved</td>
   </tr></table>
 
 </div>
diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html
index 260f74ded93c..1338f544ffcb 100755
--- a/clang/www/cxx_status.html
+++ b/clang/www/cxx_status.html
@@ -187,7 +187,7 @@ C++23, informally referred to as C++26.</p>
  <tr>
   <td>Trivial infinite loops are not Undefined Behavior</td>
   <td><a href="https://wg21.link/P2809R3">P2809R3</a> (<a href="#dr">DR</a>)</td>
-  <td class="none" align="center">No</td>
+  <td class="unreleased" align="center">Clang 19</td>
  </tr>
  <tr>
   <td>Erroneous behaviour for uninitialized reads</td>
@@ -347,15 +347,7 @@ C++23, informally referred to as C++26.</p>
     <tr>
       <td>Type trait to determine if a reference binds to a temporary</td>
       <td><a href="https://wg21.link/P2255R2">P2255R2</a></td>
-      <td class="partial" align="center">
-        <details><summary>Partial</summary>
-          Clang provides <tt>__reference_constructs_from_temporary</tt> type
-          trait builtin, with which <tt>std::reference_constructs_from_temporary</tt>
-          is implemented. <tt>__reference_converts_from_temporary</tt> needs to be
-          provided, following the normal cross-vendor convention to implement
-          traits requiring compiler support directly.
-        </details></td>
-      </td>
+      <td class="unreleased" align="center">Clang 19</td>
     </tr>
     <!-- July 2022 papers -->
     <tr>
@@ -466,7 +458,7 @@ C++23, informally referred to as C++26.</p>
         <details>
           <summary>Clang 19 (Partial)</summary>
             The lifetime extension of temporaries bound to member references
-            by default member initializers in aggregate initialization was 
+            by default member initializers in aggregate initialization was
             not supported now.
         </details>
       </td>
@@ -923,7 +915,7 @@ C++23, informally referred to as C++26.</p>
 You can use Clang in C++17 mode with the <code>-std=c++17</code> option
 (use <code>-std=c++1z</code> in Clang 4 and earlier).</p>
 
-<details open>
+<details>
 <summary>List of features and minimum Clang version with support</summary>
 
 <table width="689" border="1" cellspacing="0">
@@ -1140,8 +1132,8 @@ You can use Clang in C++17 mode with the <code>-std=c++17</code> option
     <!-- Issaquah 2016 papers -->
     <tr>
       <td>Matching template template parameters to compatible arguments</td>
-      <td><a href="https://wg21.link/p0522r0">P0522R0</a></td>
-      <td class="partial" align="center">Partial <a href="#p0522">(10)</a></td>
+      <td><a href="https://wg21.link/p0522r0">P0522R0</a> (<a href="#dr">DR</a>)</td>
+      <td class="unreleased" align="center">Clang 19 <a href="#p0522">(10)</a></td>
     </tr>
     <tr>
       <td>Removing deprecated dynamic exception specifications</td>
@@ -1169,13 +1161,11 @@ functions using expression syntax are no longer guaranteed to be destroyed in
 reverse construction order in that ABI.
 This is not fully supported during constant expression evaluation until Clang 12.
 </span><br>
-<span id="p0522">(10): Despite being the resolution to a Defect Report, this
-feature is disabled by default in all language versions, and can be enabled
-explicitly with the flag <tt>-frelaxed-template-template-args</tt> in Clang 4
-onwards.
-The change to the standard lacks a corresponding change for template partial
-ordering, resulting in ambiguity errors for reasonable and previously-valid
-code. This issue is expected to be rectified soon.
+<span id="p0522">(10): While this feature was initially implemented in Clang 4,
+it was not enabled by default prior to clang 19, but could be enabled with
+<tt>-frelaxed-template-template-args</tt>.
+Starting from Clang 19, the flag is deprecated and will be removed in a future
+version.
 </span>
 </p>
 </details>
diff --git a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake
index a6c6ef93500d..8e331634b76b 100644
--- a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake
+++ b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake
@@ -369,7 +369,11 @@ macro(construct_compiler_rt_default_triple)
   endif()
 
   if ("${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
-    execute_process(COMMAND ${CMAKE_C_COMPILER} --target=${COMPILER_RT_DEFAULT_TARGET_TRIPLE} -print-target-triple
+    set(option_prefix "")
+    if (CMAKE_C_SIMULATE_ID MATCHES "MSVC")
+      set(option_prefix "/clang:")
+    endif()
+    execute_process(COMMAND ${CMAKE_C_COMPILER} ${option_prefix}--target=${COMPILER_RT_DEFAULT_TARGET_TRIPLE} ${option_prefix}-print-target-triple
                     OUTPUT_VARIABLE COMPILER_RT_DEFAULT_TARGET_TRIPLE
                     OUTPUT_STRIP_TRAILING_WHITESPACE)
   endif()
diff --git a/compiler-rt/lib/ctx_profile/CMakeLists.txt b/compiler-rt/lib/ctx_profile/CMakeLists.txt
index 80e71acc38f8..1fa70594b28a 100644
--- a/compiler-rt/lib/ctx_profile/CMakeLists.txt
+++ b/compiler-rt/lib/ctx_profile/CMakeLists.txt
@@ -5,6 +5,7 @@ set(CTX_PROFILE_SOURCES
   )
 
 set(CTX_PROFILE_HEADERS
+  CtxInstrContextNode.h
   CtxInstrProfiling.h
   )
 
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
new file mode 100644
index 000000000000..a916f197aa14
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
@@ -0,0 +1,116 @@
+//===--- CtxInstrContextNode.h - Contextual Profile Node --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//==============================================================================
+//
+// NOTE!
+// llvm/lib/ProfileData/CtxInstrContextNode.h and
+//   compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
+// must be exact copies of eachother
+//
+// compiler-rt creates these objects as part of the instrumentation runtime for
+// contextual profiling. LLVM only consumes them to convert a contextual tree
+// to a bitstream.
+//
+//==============================================================================
+
+/// The contextual profile is a directed tree where each node has one parent. A
+/// node (ContextNode) corresponds to a function activation. The root of the
+/// tree is at a function that was marked as entrypoint to the compiler. A node
+/// stores counter values for edges and a vector of subcontexts. These are the
+/// contexts of callees. The index in the subcontext vector corresponds to the
+/// index of the callsite (as was instrumented via llvm.instrprof.callsite). At
+/// that index we find a linked list, potentially empty, of ContextNodes. Direct
+/// calls will have 0 or 1 values in the linked list, but indirect callsites may
+/// have more.
+///
+/// The ContextNode has a fixed sized header describing it - the GUID of the
+/// function, the size of the counter and callsite vectors. It is also an
+/// (intrusive) linked list for the purposes of the indirect call case above.
+///
+/// Allocation is expected to happen on an Arena. The allocation lays out inline
+/// the counter and subcontexts vectors. The class offers APIs to correctly
+/// reference the latter.
+///
+/// The layout is as follows:
+///
+/// [[declared fields][counters vector][vector of ptrs to subcontexts]]
+///
+/// See also documentation on the counters and subContexts members below.
+///
+/// The structure of the ContextNode is known to LLVM, because LLVM needs to:
+///   (1) increment counts, and
+///   (2) form a GEP for the position in the subcontext list of a callsite
+/// This means changes to LLVM contextual profile lowering and changes here
+/// must be coupled.
+/// Note: the header content isn't interesting to LLVM (other than its size)
+///
+/// Part of contextual collection is the notion of "scratch contexts". These are
+/// buffers that are "large enough" to allow for memory-safe acceses during
+/// counter increments - meaning the counter increment code in LLVM doesn't need
+/// to be concerned with memory safety. Their subcontexts never get populated,
+/// though. The runtime code here produces and recognizes them.
+
+#ifndef LLVM_PROFILEDATA_CTXINSTRCONTEXTNODE_H
+#define LLVM_PROFILEDATA_CTXINSTRCONTEXTNODE_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+namespace llvm {
+namespace ctx_profile {
+using GUID = uint64_t;
+
+class ContextNode final {
+  const GUID Guid;
+  ContextNode *const Next;
+  const uint32_t NrCounters;
+  const uint32_t NrCallsites;
+
+public:
+  ContextNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites,
+              ContextNode *Next = nullptr)
+      : Guid(Guid), Next(Next), NrCounters(NrCounters),
+        NrCallsites(NrCallsites) {}
+
+  static inline size_t getAllocSize(uint32_t NrCounters, uint32_t NrCallsites) {
+    return sizeof(ContextNode) + sizeof(uint64_t) * NrCounters +
+           sizeof(ContextNode *) * NrCallsites;
+  }
+
+  // The counters vector starts right after the static header.
+  uint64_t *counters() {
+    ContextNode *addr_after = &(this[1]);
+    return reinterpret_cast<uint64_t *>(addr_after);
+  }
+
+  uint32_t counters_size() const { return NrCounters; }
+  uint32_t callsites_size() const { return NrCallsites; }
+
+  const uint64_t *counters() const {
+    return const_cast<ContextNode *>(this)->counters();
+  }
+
+  // The subcontexts vector starts right after the end of the counters vector.
+  ContextNode **subContexts() {
+    return reinterpret_cast<ContextNode **>(&(counters()[NrCounters]));
+  }
+
+  ContextNode *const *subContexts() const {
+    return const_cast<ContextNode *>(this)->subContexts();
+  }
+
+  GUID guid() const { return Guid; }
+  ContextNode *next() const { return Next; }
+
+  size_t size() const { return getAllocSize(NrCounters, NrCallsites); }
+
+  uint64_t entrycount() const { return counters()[0]; }
+};
+} // namespace ctx_profile
+} // namespace llvm
+#endif
+\ No newline at end of file
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index 7620ce92f7eb..cff39eeafba6 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -10,20 +10,139 @@
 #include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_dense_map.h"
+#include "sanitizer_common/sanitizer_libc.h"
 #include "sanitizer_common/sanitizer_mutex.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
 #include "sanitizer_common/sanitizer_thread_safety.h"
+#include "sanitizer_common/sanitizer_vector.h"
 
 #include <assert.h>
 
 using namespace __ctx_profile;
 
+namespace {
+// Keep track of all the context roots we actually saw, so we can then traverse
+// them when the user asks for the profile in __llvm_ctx_profile_fetch
+__sanitizer::SpinMutex AllContextsMutex;
+SANITIZER_GUARDED_BY(AllContextsMutex)
+__sanitizer::Vector<ContextRoot *> AllContextRoots;
+
+// utility to taint a pointer by setting the LSB. There is an assumption
+// throughout that the addresses of contexts are even (really, they should be
+// align(8), but "even"-ness is the minimum assumption)
+// "scratch contexts" are buffers that we return in certain cases - they are
+// large enough to allow for memory safe counter access, but they don't link
+// subcontexts below them (the runtime recognizes them and enforces that)
+ContextNode *markAsScratch(const ContextNode *Ctx) {
+  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
+}
+
+// Used when getting the data from TLS. We don't *really* need to reset, but
+// it's a simpler system if we do.
+template <typename T> inline T consume(T &V) {
+  auto R = V;
+  V = {0};
+  return R;
+}
+
+// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
+// large.
+constexpr size_t kPower = 20;
+constexpr size_t kBuffSize = 1 << kPower;
+
+// Highly unlikely we need more than kBuffSize for a context.
+size_t getArenaAllocSize(size_t Needed) {
+  if (Needed >= kBuffSize)
+    return 2 * Needed;
+  return kBuffSize;
+}
+
+// verify the structural integrity of the context
+bool validate(const ContextRoot *Root) {
+  // all contexts should be laid out in some arena page. Go over each arena
+  // allocated for this Root, and jump over contained contexts based on
+  // self-reported sizes.
+  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
+  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
+    const auto *Pos = Mem->start();
+    while (Pos < Mem->pos()) {
+      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
+      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
+               .second)
+        return false;
+      Pos += Ctx->size();
+    }
+  }
+
+  // Now traverse the contexts again the same way, but validate all nonull
+  // subcontext addresses appear in the set computed above.
+  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
+    const auto *Pos = Mem->start();
+    while (Pos < Mem->pos()) {
+      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
+      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
+        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
+          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
+            return false;
+
+      Pos += Ctx->size();
+    }
+  }
+  return true;
+}
+
+inline ContextNode *allocContextNode(char *Place, GUID Guid,
+                                     uint32_t NrCounters, uint32_t NrCallsites,
+                                     ContextNode *Next = nullptr) {
+  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
+  return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
+}
+
+void resetContextNode(ContextNode &Node) {
+  // FIXME(mtrofin): this is std::memset, which we can probably use if we
+  // drop/reduce the dependency on sanitizer_common.
+  for (uint32_t I = 0; I < Node.counters_size(); ++I)
+    Node.counters()[I] = 0;
+  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
+    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
+      resetContextNode(*Next);
+}
+
+void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }
+
+} // namespace
+
+// the scratch buffer - what we give when we can't produce a real context (the
+// scratch isn't "real" in that it's expected to be clobbered carelessly - we
+// don't read it). The other important thing is that the callees from a scratch
+// context also get a scratch context.
+// Eventually this can be replaced with per-function buffers, a'la the typical
+// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
+// the part about determining the nature of the subcontexts does.
+__thread char __Buffer[kBuffSize] = {0};
+
+#define TheScratchContext                                                      \
+  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))
+
+// init the TLSes
+__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
+                                                                 nullptr};
+__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};
+
+__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
+    nullptr;
+
+Arena::Arena(uint32_t Size) : Size(Size) {
+  __sanitizer::internal_memset(start(), 0, Size);
+}
+
 // FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
 // the dependency on the latter.
 Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
   assert(!Prev || Prev->Next == nullptr);
-  Arena *NewArena =
-      new (__sanitizer::InternalAlloc(Size + sizeof(Arena))) Arena(Size);
+  Arena *NewArena = new (__sanitizer::InternalAlloc(
+      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
+      Arena(Size);
   if (Prev)
     Prev->Next = NewArena;
   return NewArena;
@@ -38,3 +157,169 @@ void Arena::freeArenaList(Arena *&A) {
   }
   A = nullptr;
 }
+
+// If this is the first time we hit a callsite with this (Guid) particular
+// callee, we need to allocate.
+ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
+                             uint32_t NrCounters, uint32_t NrCallsites) {
+  auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
+  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
+  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
+  if (!AllocPlace) {
+    // if we failed to allocate on the current arena, allocate a new arena,
+    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
+    // find it from now on for other cases when we need to getCallsiteSlow.
+    // Note that allocateNewArena will link the allocated memory in the list of
+    // Arenas.
+    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
+        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
+    AllocPlace = Mem->tryBumpAllocate(AllocSize);
+  }
+  auto *Ret = allocContextNode(AllocPlace, Guid, NrCounters, NrCallsites,
+                               *InsertionPoint);
+  *InsertionPoint = Ret;
+  return Ret;
+}
+
+ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
+                                            uint32_t NrCounters,
+                                            uint32_t NrCallsites) {
+  // fast "out" if we're not even doing contextual collection.
+  if (!__llvm_ctx_profile_current_context_root)
+    return TheScratchContext;
+
+  // also fast "out" if the caller is scratch. We can see if it's scratch by
+  // looking at the interior pointer into the subcontexts vector that the caller
+  // provided, which, if the context is scratch, so is that interior pointer
+  // (because all the address calculations are using even values. Or more
+  // precisely, aligned - 8 values)
+  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
+  if (!CallsiteContext || isScratch(CallsiteContext))
+    return TheScratchContext;
+
+  // if the callee isn't the expected one, return scratch.
+  // Signal handler(s) could have been invoked at any point in the execution.
+  // Should that have happened, and had it (the handler) be built with
+  // instrumentation, its __llvm_ctx_profile_get_context would have failed here.
+  // Its sub call graph would have then populated
+  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
+  // The normal call graph may be impacted in that, if the signal handler
+  // happened somewhere before we read the TLS here, we'd see the TLS reset and
+  // we'd also fail here. That would just mean we would loose counter values for
+  // the normal subgraph, this time around. That should be very unlikely, but if
+  // it happens too frequently, we should be able to detect discrepancies in
+  // entry counts (caller-callee). At the moment, the design goes on the
+  // assumption that is so unfrequent, though, that it's not worth doing more
+  // for that case.
+  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
+  if (ExpectedCallee != Callee)
+    return TheScratchContext;
+
+  auto *Callsite = *CallsiteContext;
+  // in the case of indirect calls, we will have all seen targets forming a
+  // linked list here. Find the one corresponding to this callee.
+  while (Callsite && Callsite->guid() != Guid) {
+    Callsite = Callsite->next();
+  }
+  auto *Ret = Callsite ? Callsite
+                       : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
+                                         NrCallsites);
+  if (Ret->callsites_size() != NrCallsites ||
+      Ret->counters_size() != NrCounters)
+    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
+                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
+                        Ret, Guid, NrCallsites, NrCounters, Ret->guid(),
+                        Ret->callsites_size(), Ret->counters_size());
+  onContextEnter(*Ret);
+  return Ret;
+}
+
+// This should be called once for a Root. Allocate the first arena, set up the
+// first context.
+void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
+                  uint32_t NrCallsites) {
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  // Re-check - we got here without having had taken a lock.
+  if (Root->FirstMemBlock)
+    return;
+  const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
+  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
+  Root->FirstMemBlock = M;
+  Root->CurrentMem = M;
+  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
+                                     NrCounters, NrCallsites);
+  AllContextRoots.PushBack(Root);
+}
+
+ContextNode *__llvm_ctx_profile_start_context(
+    ContextRoot *Root, GUID Guid, uint32_t Counters,
+    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  if (!Root->FirstMemBlock) {
+    setupContext(Root, Guid, Counters, Callsites);
+  }
+  if (Root->Taken.TryLock()) {
+    __llvm_ctx_profile_current_context_root = Root;
+    onContextEnter(*Root->FirstNode);
+    return Root->FirstNode;
+  }
+  // If this thread couldn't take the lock, return scratch context.
+  __llvm_ctx_profile_current_context_root = nullptr;
+  return TheScratchContext;
+}
+
+void __llvm_ctx_profile_release_context(ContextRoot *Root)
+    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  if (__llvm_ctx_profile_current_context_root) {
+    __llvm_ctx_profile_current_context_root = nullptr;
+    Root->Taken.Unlock();
+  }
+}
+
+void __llvm_ctx_profile_start_collection() {
+  size_t NrMemUnits = 0;
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
+    auto *Root = AllContextRoots[I];
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
+        &Root->Taken);
+    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
+      ++NrMemUnits;
+
+    resetContextNode(*Root->FirstNode);
+  }
+  __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
+}
+
+bool __llvm_ctx_profile_fetch(void *Data,
+                              bool (*Writer)(void *W, const ContextNode &)) {
+  assert(Writer);
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+
+  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
+    auto *Root = AllContextRoots[I];
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
+        &Root->Taken);
+    if (!validate(Root)) {
+      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
+      return false;
+    }
+    if (!Writer(Data, *Root->FirstNode))
+      return false;
+  }
+  return true;
+}
+
+void __llvm_ctx_profile_free() {
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
+    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
+      auto *C = A;
+      A = A->next();
+      __sanitizer::InternalFree(C);
+    }
+  AllContextRoots.Reset();
+}
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
index c1789c32a64c..f55068e98dd4 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -9,10 +9,23 @@
 #ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
 #define CTX_PROFILE_CTXINSTRPROFILING_H_
 
+#include "CtxInstrContextNode.h"
+#include "sanitizer_common/sanitizer_mutex.h"
 #include <sanitizer/common_interface_defs.h>
 
+using namespace llvm::ctx_profile;
+
+// Forward-declare for the one unittest checking Arena construction zeroes out
+// its allocatable space.
+class ArenaTest_ZeroInit_Test;
 namespace __ctx_profile {
 
+static constexpr size_t ExpectedAlignment = 8;
+// We really depend on this, see further below. We currently support x86_64.
+// When we want to support other archs, we need to trace the places Alignment is
+// used and adjust accordingly.
+static_assert(sizeof(void *) == ExpectedAlignment);
+
 /// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
 /// Allocation and de-allocation happen using sanitizer APIs. We make that
 /// explicit.
@@ -41,7 +54,8 @@ public:
   const char *pos() const { return start() + Pos; }
 
 private:
-  explicit Arena(uint32_t Size) : Size(Size) {}
+  friend class ::ArenaTest_ZeroInit_Test;
+  explicit Arena(uint32_t Size);
   ~Arena() = delete;
 
   char *start() { return reinterpret_cast<char *>(&this[1]); }
@@ -51,5 +65,111 @@ private:
   const uint64_t Size;
 };
 
+// The memory available for allocation follows the Arena header, and we expect
+// it to be thus aligned.
+static_assert(alignof(Arena) == ExpectedAlignment);
+
+// Verify maintenance to ContextNode doesn't change this invariant, which makes
+// sure the inlined vectors are appropriately aligned.
+static_assert(alignof(ContextNode) == ExpectedAlignment);
+
+/// ContextRoots are allocated by LLVM for entrypoints. LLVM is only concerned
+/// with allocating and zero-initializing the global value (as in, GlobalValue)
+/// for it.
+struct ContextRoot {
+  ContextNode *FirstNode = nullptr;
+  Arena *FirstMemBlock = nullptr;
+  Arena *CurrentMem = nullptr;
+  // This is init-ed by the static zero initializer in LLVM.
+  // Taken is used to ensure only one thread traverses the contextual graph -
+  // either to read it or to write it. On server side, the same entrypoint will
+  // be entered by numerous threads, but over time, the profile aggregated by
+  // collecting sequentially on one thread at a time is expected to converge to
+  // the aggregate profile that may have been observable on all the threads.
+  // Note that this is node-by-node aggregation, i.e. summing counters of nodes
+  // at the same position in the graph, not flattening.
+  // Threads that cannot lock Taken (fail TryLock) are given a "scratch context"
+  // - a buffer they can clobber, safely from a memory access perspective.
+  //
+  // Note about "scratch"-ness: we currently ignore the data written in them
+  // (which is anyway clobbered). The design allows for that not be the case -
+  // because "scratch"-ness is first and foremost about not trying to build
+  // subcontexts, and is captured by tainting the pointer value (pointer to the
+  // memory treated as context), but right now, we drop that info.
+  //
+  // We could consider relaxing the requirement of more than one thread
+  // entering by holding a few context trees per entrypoint and then aggregating
+  // them (as explained above) at the end of the profile collection - it's a
+  // tradeoff between collection time and memory use: higher precision can be
+  // obtained with either less concurrent collections but more collection time,
+  // or with more concurrent collections (==more memory) and less collection
+  // time. Note that concurrent collection does happen for different
+  // entrypoints, regardless.
+  ::__sanitizer::StaticSpinMutex Taken;
+
+  // If (unlikely) StaticSpinMutex internals change, we need to modify the LLVM
+  // instrumentation lowering side because it is responsible for allocating and
+  // zero-initializing ContextRoots.
+  static_assert(sizeof(Taken) == 1);
+};
+
+/// This API is exposed for testing. See the APIs below about the contract with
+/// LLVM.
+inline bool isScratch(const void *Ctx) {
+  return (reinterpret_cast<uint64_t>(Ctx) & 1);
+}
+
 } // namespace __ctx_profile
+
+extern "C" {
+
+// LLVM fills these in when lowering a llvm.instrprof.callsite intrinsic.
+// position 0 is used when the current context isn't scratch, 1 when it is. They
+// are volatile because of signal handlers - we mean to specifically control
+// when the data is loaded.
+//
+/// TLS where LLVM stores the pointer of the called value, as part of lowering a
+/// llvm.instrprof.callsite
+extern __thread void *volatile __llvm_ctx_profile_expected_callee[2];
+/// TLS where LLVM stores the pointer inside a caller's subcontexts vector that
+/// corresponds to the callsite being lowered.
+extern __thread ContextNode **volatile __llvm_ctx_profile_callsite[2];
+
+// __llvm_ctx_profile_current_context_root is exposed for unit testing,
+// othwerise it's only used internally by compiler-rt/ctx_profile.
+extern __thread __ctx_profile::ContextRoot
+    *volatile __llvm_ctx_profile_current_context_root;
+
+/// called by LLVM in the entry BB of a "entry point" function. The returned
+/// pointer may be "tainted" - its LSB set to 1 - to indicate it's scratch.
+ContextNode *__llvm_ctx_profile_start_context(__ctx_profile::ContextRoot *Root,
+                                              GUID Guid, uint32_t Counters,
+                                              uint32_t Callsites);
+
+/// paired with __llvm_ctx_profile_start_context, and called at the exit of the
+/// entry point function.
+void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root);
+
+/// called for any other function than entry points, in the entry BB of such
+/// function. Same consideration about LSB of returned value as .._start_context
+ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
+                                            uint32_t NrCounters,
+                                            uint32_t NrCallsites);
+
+/// Prepares for collection. Currently this resets counter values but preserves
+/// internal context tree structure.
+void __llvm_ctx_profile_start_collection();
+
+/// Completely free allocated memory.
+void __llvm_ctx_profile_free();
+
+/// Used to obtain the profile. The Writer is called for each root ContextNode,
+/// with the ContextRoot::Taken taken. The Writer is responsible for traversing
+/// the structure underneath.
+/// The Writer's first parameter plays the role of closure for Writer, and is
+/// what the caller of __llvm_ctx_profile_fetch passes as the Data parameter.
+/// The second parameter is the root of a context tree.
+bool __llvm_ctx_profile_fetch(void *Data,
+                              bool (*Writer)(void *, const ContextNode &));
+}
 #endif // CTX_PROFILE_CTXINSTRPROFILING_H_
diff --git a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
index 44f37d257632..d9f08b1e7efe 100644
--- a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
+++ b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
@@ -1,8 +1,26 @@
 #include "../CtxInstrProfiling.h"
 #include "gtest/gtest.h"
+#include <thread>
 
 using namespace __ctx_profile;
 
+class ContextTest : public ::testing::Test {
+  void SetUp() override { memset(&Root, 0, sizeof(ContextRoot)); }
+  void TearDown() override { __llvm_ctx_profile_free(); }
+
+public:
+  ContextRoot Root;
+};
+
+TEST(ArenaTest, ZeroInit) {
+  char Buffer[1024];
+  memset(Buffer, 1, 1024);
+  Arena *A = new (Buffer) Arena(10);
+  for (auto I = 0U; I < A->size(); ++I)
+    EXPECT_EQ(A->pos()[I], static_cast<char>(0));
+  EXPECT_EQ(A->size(), 10U);
+}
+
 TEST(ArenaTest, Basic) {
   Arena *A = Arena::allocateNewArena(1024);
   EXPECT_EQ(A->size(), 1024U);
@@ -20,3 +38,186 @@ TEST(ArenaTest, Basic) {
   Arena::freeArenaList(A);
   EXPECT_EQ(A, nullptr);
 }
+
+TEST_F(ContextTest, Basic) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  ASSERT_NE(Ctx, nullptr);
+  EXPECT_NE(Root.CurrentMem, nullptr);
+  EXPECT_EQ(Root.FirstMemBlock, Root.CurrentMem);
+  EXPECT_EQ(Ctx->size(), sizeof(ContextNode) + 10 * sizeof(uint64_t) +
+                             4 * sizeof(ContextNode *));
+  EXPECT_EQ(Ctx->counters_size(), 10U);
+  EXPECT_EQ(Ctx->callsites_size(), 4U);
+  EXPECT_EQ(__llvm_ctx_profile_current_context_root, &Root);
+  Root.Taken.CheckLocked();
+  EXPECT_FALSE(Root.Taken.TryLock());
+  __llvm_ctx_profile_release_context(&Root);
+  EXPECT_EQ(__llvm_ctx_profile_current_context_root, nullptr);
+  EXPECT_TRUE(Root.Taken.TryLock());
+  Root.Taken.Unlock();
+}
+
+TEST_F(ContextTest, Callsite) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  int FakeCalleeAddress = 0;
+  const bool IsScratch = isScratch(Ctx);
+  EXPECT_FALSE(IsScratch);
+  // This is the sequence the caller performs - it's the lowering of the
+  // instrumentation of the callsite "2". "2" is arbitrary here.
+  __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
+  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  // This is what the callee does
+  auto *Subctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+  // We expect the subcontext to be appropriately placed and dimensioned
+  EXPECT_EQ(Ctx->subContexts()[2], Subctx);
+  EXPECT_EQ(Subctx->counters_size(), 3U);
+  EXPECT_EQ(Subctx->callsites_size(), 1U);
+  // We reset these in _get_context.
+  EXPECT_EQ(__llvm_ctx_profile_expected_callee[0], nullptr);
+  EXPECT_EQ(__llvm_ctx_profile_callsite[0], nullptr);
+
+  EXPECT_EQ(Subctx->size(), sizeof(ContextNode) + 3 * sizeof(uint64_t) +
+                                1 * sizeof(ContextNode *));
+  __llvm_ctx_profile_release_context(&Root);
+}
+
+TEST_F(ContextTest, ScratchNoCollection) {
+  EXPECT_EQ(__llvm_ctx_profile_current_context_root, nullptr);
+  int FakeCalleeAddress = 0;
+  // this would be the very first function executing this. the TLS is empty,
+  // too.
+  auto *Ctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+  // We never entered a context (_start_context was never called) - so the
+  // returned context must be scratch.
+  EXPECT_TRUE(isScratch(Ctx));
+}
+
+TEST_F(ContextTest, ScratchDuringCollection) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  int FakeCalleeAddress = 0;
+  int OtherFakeCalleeAddress = 0;
+  __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
+  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  auto *Subctx =
+      __llvm_ctx_profile_get_context(&OtherFakeCalleeAddress, 2, 3, 1);
+  // We expected a different callee - so return scratch. It mimics what happens
+  // in the case of a signal handler - in this case, OtherFakeCalleeAddress is
+  // the signal handler.
+  EXPECT_TRUE(isScratch(Subctx));
+  EXPECT_EQ(__llvm_ctx_profile_expected_callee[0], nullptr);
+  EXPECT_EQ(__llvm_ctx_profile_callsite[0], nullptr);
+
+  int ThirdFakeCalleeAddress = 0;
+  __llvm_ctx_profile_expected_callee[1] = &ThirdFakeCalleeAddress;
+  __llvm_ctx_profile_callsite[1] = &Subctx->subContexts()[0];
+
+  auto *Subctx2 =
+      __llvm_ctx_profile_get_context(&ThirdFakeCalleeAddress, 3, 0, 0);
+  // We again expect scratch because the '0' position is where the runtime
+  // looks, so it doesn't matter the '1' position is populated correctly.
+  EXPECT_TRUE(isScratch(Subctx2));
+
+  __llvm_ctx_profile_expected_callee[0] = &ThirdFakeCalleeAddress;
+  __llvm_ctx_profile_callsite[0] = &Subctx->subContexts()[0];
+  auto *Subctx3 =
+      __llvm_ctx_profile_get_context(&ThirdFakeCalleeAddress, 3, 0, 0);
+  // We expect scratch here, too, because the value placed in
+  // __llvm_ctx_profile_callsite is scratch
+  EXPECT_TRUE(isScratch(Subctx3));
+
+  __llvm_ctx_profile_release_context(&Root);
+}
+
+TEST_F(ContextTest, NeedMoreMemory) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  int FakeCalleeAddress = 0;
+  const bool IsScratch = isScratch(Ctx);
+  EXPECT_FALSE(IsScratch);
+  const auto *CurrentMem = Root.CurrentMem;
+  __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
+  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  // Allocate a massive subcontext to force new arena allocation
+  auto *Subctx =
+      __llvm_ctx_profile_get_context(&FakeCalleeAddress, 3, 1 << 20, 1);
+  EXPECT_EQ(Ctx->subContexts()[2], Subctx);
+  EXPECT_NE(CurrentMem, Root.CurrentMem);
+  EXPECT_NE(Root.CurrentMem, nullptr);
+}
+
+TEST_F(ContextTest, ConcurrentRootCollection) {
+  std::atomic<int> NonScratch = 0;
+  std::atomic<int> Executions = 0;
+
+  __sanitizer::Semaphore GotCtx;
+
+  auto Entrypoint = [&]() {
+    ++Executions;
+    auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+    GotCtx.Post();
+    const bool IS = isScratch(Ctx);
+    NonScratch += (!IS);
+    if (!IS) {
+      GotCtx.Wait();
+      GotCtx.Wait();
+    }
+    __llvm_ctx_profile_release_context(&Root);
+  };
+  std::thread T1(Entrypoint);
+  std::thread T2(Entrypoint);
+  T1.join();
+  T2.join();
+  EXPECT_EQ(NonScratch, 1);
+  EXPECT_EQ(Executions, 2);
+}
+
+TEST_F(ContextTest, Dump) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  int FakeCalleeAddress = 0;
+  __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
+  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  auto *Subctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+  (void)Subctx;
+  __llvm_ctx_profile_release_context(&Root);
+
+  struct Writer {
+    ContextRoot *const Root;
+    const size_t Entries;
+    bool State = false;
+    Writer(ContextRoot *Root, size_t Entries) : Root(Root), Entries(Entries) {}
+
+    bool write(const ContextNode &Node) {
+      EXPECT_FALSE(Root->Taken.TryLock());
+      EXPECT_EQ(Node.guid(), 1U);
+      EXPECT_EQ(Node.counters()[0], Entries);
+      EXPECT_EQ(Node.counters_size(), 10U);
+      EXPECT_EQ(Node.callsites_size(), 4U);
+      EXPECT_EQ(Node.subContexts()[0], nullptr);
+      EXPECT_EQ(Node.subContexts()[1], nullptr);
+      EXPECT_NE(Node.subContexts()[2], nullptr);
+      EXPECT_EQ(Node.subContexts()[3], nullptr);
+      const auto &SN = *Node.subContexts()[2];
+      EXPECT_EQ(SN.guid(), 2U);
+      EXPECT_EQ(SN.counters()[0], Entries);
+      EXPECT_EQ(SN.counters_size(), 3U);
+      EXPECT_EQ(SN.callsites_size(), 1U);
+      EXPECT_EQ(SN.subContexts()[0], nullptr);
+      State = true;
+      return true;
+    }
+  };
+  Writer W(&Root, 1);
+  EXPECT_FALSE(W.State);
+  __llvm_ctx_profile_fetch(&W, [](void *W, const ContextNode &Node) -> bool {
+    return reinterpret_cast<Writer *>(W)->write(Node);
+  });
+  EXPECT_TRUE(W.State);
+
+  // this resets all counters but not the internal structure.
+  __llvm_ctx_profile_start_collection();
+  Writer W2(&Root, 0);
+  EXPECT_FALSE(W2.State);
+  __llvm_ctx_profile_fetch(&W2, [](void *W, const ContextNode &Node) -> bool {
+    return reinterpret_cast<Writer *>(W)->write(Node);
+  });
+  EXPECT_TRUE(W2.State);
+}
diff --git a/compiler-rt/lib/fuzzer/FuzzerDriver.cpp b/compiler-rt/lib/fuzzer/FuzzerDriver.cpp
index 8674d788932f..3771abf5f532 100644
--- a/compiler-rt/lib/fuzzer/FuzzerDriver.cpp
+++ b/compiler-rt/lib/fuzzer/FuzzerDriver.cpp
@@ -229,6 +229,7 @@ static void PulseThread() {
 
 static void WorkerThread(const Command &BaseCmd, std::atomic<unsigned> *Counter,
                          unsigned NumJobs, std::atomic<bool> *HasErrors) {
+  ScopedDisableMsanInterceptorChecks S;
   while (true) {
     unsigned C = (*Counter)++;
     if (C >= NumJobs) break;
diff --git a/compiler-rt/lib/xray/xray_utils.h b/compiler-rt/lib/xray/xray_utils.h
index 333826168c0d..5dc73d7fa8cd 100644
--- a/compiler-rt/lib/xray/xray_utils.h
+++ b/compiler-rt/lib/xray/xray_utils.h
@@ -61,7 +61,7 @@ constexpr size_t gcd(size_t a, size_t b) {
 constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
 
 constexpr size_t nearest_boundary(size_t number, size_t multiple) {
-  return multiple * ((number / multiple) + (number % multiple ? 1 : 0));
+  return multiple * ((number / multiple) + ((number % multiple) ? 1 : 0));
 }
 
 constexpr size_t next_pow2_helper(size_t num, size_t acc) {
diff --git a/compiler-rt/test/asan/TestCases/Posix/fake_stack_gc.cpp b/compiler-rt/test/asan/TestCases/Posix/fake_stack_gc.cpp
index 524ca29f2fc5..8c368b9b1b94 100644
--- a/compiler-rt/test/asan/TestCases/Posix/fake_stack_gc.cpp
+++ b/compiler-rt/test/asan/TestCases/Posix/fake_stack_gc.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx_asan -O0 -pthread %s -o %t && %env_asan_opts=use_sigaltstack=0 %run not --crash %t 2>&1 | FileCheck %s
+// RUN: %clangxx_asan -O0 -pthread %s -o %t && %env_asan_opts=use_sigaltstack=0 not --crash %run %t 2>&1 | FileCheck %s
 
 // Check that fake stack does not discard frames on the main stack, when GC is
 // triggered from high alt stack.
diff --git a/compiler-rt/test/ctx_profile/CMakeLists.txt b/compiler-rt/test/ctx_profile/CMakeLists.txt
index 23c6fb16ed1f..371f1a2dcbb0 100644
--- a/compiler-rt/test/ctx_profile/CMakeLists.txt
+++ b/compiler-rt/test/ctx_profile/CMakeLists.txt
@@ -2,6 +2,28 @@ set(CTX_PROFILE_LIT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 
 set(CTX_PROFILE_TESTSUITES)
 
+macro(get_bits_for_arch arch bits)
+  if (${arch} MATCHES "x86_64")
+    set(${bits} 64)
+  else()
+    message(FATAL_ERROR "Unexpected target architecture: ${arch}")
+  endif()
+endmacro()
+
+set(CTX_PROFILE_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS} ctx_profile)
+
+foreach(arch ${CTX_PROFILE_SUPPORTED_ARCH})
+  set(CTX_PROFILE_TEST_TARGET_ARCH ${arch})
+  string(TOLOWER "-${arch}-${OS_NAME}" CTX_PROFILE_TEST_CONFIG_SUFFIX)
+  string(TOUPPER ${arch} ARCH_UPPER_CASE)
+  set(CONFIG_NAME ${ARCH_UPPER_CASE}${OS_NAME}Config)
+  configure_lit_site_cfg(
+    ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
+    ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg.py
+    )
+  list(APPEND CTX_PROFILE_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME})
+endforeach()
+
 # Add unit tests.
 if(COMPILER_RT_INCLUDE_TESTS)
   foreach(arch ${CTX_PROFILE_SUPPORTED_ARCH})
diff --git a/compiler-rt/test/ctx_profile/TestCases/check-same-ctx-node.test b/compiler-rt/test/ctx_profile/TestCases/check-same-ctx-node.test
new file mode 100644
index 000000000000..4ad7b23d458f
--- /dev/null
+++ b/compiler-rt/test/ctx_profile/TestCases/check-same-ctx-node.test
@@ -0,0 +1,5 @@
+;
+; NOTE: if this test fails, please make sure the two files are identical copies
+; of eachother.
+;
+; RUN: diff %crt_src/lib/ctx_profile/CtxInstrContextNode.h %llvm_src/include/llvm/ProfileData/CtxInstrContextNode.h
diff --git a/compiler-rt/test/ctx_profile/lit.cfg.py b/compiler-rt/test/ctx_profile/lit.cfg.py
new file mode 100644
index 000000000000..a56dabb8ebeb
--- /dev/null
+++ b/compiler-rt/test/ctx_profile/lit.cfg.py
@@ -0,0 +1,31 @@
+# -*- Python -*-
+
+import os
+import platform
+import re
+
+import lit.formats
+
+# Only run the tests on supported OSs.
+if config.host_os not in ["Linux"]:
+    config.unsupported = True
+
+
+def get_required_attr(config, attr_name):
+    attr_value = getattr(config, attr_name, None)
+    if attr_value == None:
+        lit_config.fatal(
+            "No attribute %r in test configuration! You may need to run "
+            "tests from your build directory or add this attribute "
+            "to lit.site.cfg.py " % attr_name
+        )
+    return attr_value
+
+
+# Setup config name.
+config.name = "CtxProfile" + config.name_suffix
+
+# Setup source root.
+config.test_source_root = os.path.dirname(__file__)
+# Default test suffixes.
+config.suffixes = [".c", ".cpp", ".test"]
diff --git a/compiler-rt/test/ctx_profile/lit.site.cfg.py.in b/compiler-rt/test/ctx_profile/lit.site.cfg.py.in
new file mode 100644
index 000000000000..e8df42d097d8
--- /dev/null
+++ b/compiler-rt/test/ctx_profile/lit.site.cfg.py.in
@@ -0,0 +1,14 @@
+@LIT_SITE_CFG_IN_HEADER@
+
+# Tool-specific config options.
+config.name_suffix = "@CTX_PROFILE_TEST_CONFIG_SUFFIX@"
+config.target_cflags = "@CTX_PROFILE_TEST_TARGET_CFLAGS@"
+config.clang = "@CTX_PROFILE_TEST_TARGET_CC@"
+config.bits = "@CTX_PROFILE_TEST_BITS@"
+config.target_arch = "@CTX_PROFILE_TEST_TARGET_ARCH@"
+
+# Load common config for all compiler-rt lit tests.
+lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/test/lit.common.configured")
+
+# Load tool-specific config that would do the real work.
+lit_config.load_config(config, "@CTX_PROFILE_LIT_SOURCE_DIR@/lit.cfg.py")
diff --git a/compiler-rt/test/dfsan/release_shadow_space.c b/compiler-rt/test/dfsan/release_shadow_space.c
index 675640a1c296..60dec98ebec4 100644
--- a/compiler-rt/test/dfsan/release_shadow_space.c
+++ b/compiler-rt/test/dfsan/release_shadow_space.c
@@ -3,6 +3,9 @@
 // DFSAN_OPTIONS=no_huge_pages_for_shadow=false RUN: %clang_dfsan %s -DORIGIN_TRACKING -mllvm -dfsan-track-origins=1 -o %t && %run %t
 // DFSAN_OPTIONS=no_huge_pages_for_shadow=true RUN: %clang_dfsan %s -DORIGIN_TRACKING -mllvm -dfsan-track-origins=1 -o %t && %run %t
 
+// This test is flaky right now: https://github.com/llvm/llvm-project/issues/91287
+// UNSUPPORTED:  target={{.*}}
+
 #include <assert.h>
 #include <sanitizer/dfsan_interface.h>
 #include <stdbool.h>
diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py
index 28f126a11b16..fae1d1686e56 100644
--- a/compiler-rt/test/lit.common.cfg.py
+++ b/compiler-rt/test/lit.common.cfg.py
@@ -987,3 +987,9 @@ if config.compiler_id == "GNU":
     gcc_dir = os.path.dirname(config.clang)
     libasan_dir = os.path.join(gcc_dir, "..", "lib" + config.bits)
     push_dynamic_library_lookup_path(config, libasan_dir)
+
+
+# Help tests that make sure certain files are in-sync between compiler-rt and
+# llvm.
+config.substitutions.append(("%crt_src", config.compiler_rt_src_root))
+config.substitutions.append(("%llvm_src", config.llvm_src_root))
diff --git a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cpp b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cpp
index e46c2edac4ce..ee47a1228fcc 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cpp
@@ -3,7 +3,7 @@
 // REQUIRES: has_sancovcc
 // UNSUPPORTED: ubsan,i386-darwin,target={{(powerpc64|s390x|thumb).*}}
 // This test is failing for lsan on darwin on x86_64h.
-// UNSUPPORTED: x86_64h && lsan && darwin
+// UNSUPPORTED: darwin && x86-target-arch && lsan
 // XFAIL: tsan
 // XFAIL: android && asan
 
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 9030207d9bda..43ed35e36a6e 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -107,12 +107,6 @@ end
   These definitions yield fairly poor results due to floating-point
   cancellation, and every Fortran compiler (including this one)
   uses better algorithms.
-* When an index variable of a `FORALL` or `DO CONCURRENT` is present
-  in the enclosing scope, and the construct does not have an explicit
-  type specification for its index variables, some weird restrictions
-  in F'2023 subclause 19.4 paragraphs 6 & 8 should apply.  Since this
-  compiler properly scopes these names, violations of these restrictions
-  elicit only portability warnings by default.
 * The rules for pairwise distinguishing the specific procedures of a
   generic interface are inadequate, as admitted in note C.11.6 of F'2023.
   Generic interfaces whose specific procedures can be easily proven by
@@ -120,6 +114,10 @@ end
   appear in real applications, but are still non-conforming under the
   incomplete tests in F'2023 15.4.3.4.5.
   These cases are compiled with optional portability warnings.
+* `PROCEDURE(), BIND(C) :: PROC` is not conforming, as there is no
+  procedure interface.  This compiler accepts it, since there is otherwise
+  no way to declare an interoperable dummy procedure with an arbitrary
+  interface like `void (*)()`.
 
 ## Extensions, deletions, and legacy features supported by default
 
@@ -351,6 +349,9 @@ end
   when necessary to the type of the result.
   An `OPTIONAL`, `POINTER`, or `ALLOCATABLE` argument after
   the first two cannot be converted, as it may not be present.
+* A derived type that meets (most of) the requirements of an interoperable
+  derived type can be used as such where an interoperable type is
+  required, with warnings, even if it lacks the BIND(C) attribute.
 
 ### Extensions supported when enabled by options
 
@@ -728,6 +729,23 @@ end
   array and structure constructors not to be finalized, so it also makes sense
   not to finalize their allocatable components when releasing their storage).
 
+* F'2023 19.4 paragraph 5: "If integer-type-spec appears in data-implied-do or
+  ac-implied-do-control it has the specified type and type parameters; otherwise
+  it has the type and type parameters that it would have if it were the name of
+  a variable in the innermost executable construct or scoping unit that includes
+  the DATA statement or array constructor, and this type shall be integer type."
+  Reading "would have if it were" as being the subjunctive, this would mean that
+  an untyped implied DO index variable should be implicitly typed according to
+  the rules active in the enclosing scope.  But all other Fortran compilers interpret
+  the "would have if it were" as meaning "has if it is" -- i.e., if the name
+  is visible in the enclosing scope, the type of that name is used as the
+  type of the implied DO index.  So this is an error, not a simple application
+  of the default implicit typing rule:
+```
+character j
+print *, [(j,j=1,10)]
+```
+
 ## De Facto Standard Features
 
 * `EXTENDS_TYPE_OF()` returns `.TRUE.` if both of its arguments have the
diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md
index 351595ac0afd..e1c110621250 100644
--- a/flang/docs/FlangDriver.md
+++ b/flang/docs/FlangDriver.md
@@ -518,6 +518,16 @@ to re-analyze expressions and modify scope or symbols. You can check
 [Semantics.md](Semantics.md) for more details on how `ParseTree` is edited
 e.g. during the semantic checks.
 
+## FIR Optimizer Pass Pipeline Extension Points
+
+The default FIR optimizer pass pipeline `createDefaultFIROptimizerPassPipeline`
+in `flang/include/flang/Tools/CLOptions.inc` contains extension point callback
+invocations `invokeFIROptEarlyEPCallbacks`, `invokeFIRInlinerCallback`, and
+`invokeFIROptLastEPCallbacks` for Flang drivers to be able to insert additonal
+passes at different points of the default pass pipeline. An example use of these
+extension point callbacks is shown in `registerDefaultInlinerPass` to invoke the
+default inliner pass in `flang-new`.
+
 ## LLVM Pass Plugins
 
 Pass plugins are dynamic shared objects that consist of one or more LLVM IR
diff --git a/flang/docs/OpenMP-descriptor-management.md b/flang/docs/OpenMP-descriptor-management.md
index 368ff3e911fc..d0eb01b00f9b 100644
--- a/flang/docs/OpenMP-descriptor-management.md
+++ b/flang/docs/OpenMP-descriptor-management.md
@@ -44,7 +44,7 @@ Currently, Flang will lower these descriptor types in the OpenMP lowering (lower
 to all other map types, generating an omp.MapInfoOp containing relevant information required for lowering
 the OpenMP dialect to LLVM-IR during the final stages of the MLIR lowering. However, after 
 the lowering to FIR/HLFIR has been performed an OpenMP dialect specific pass for Fortran, 
-`OMPDescriptorMapInfoGenPass` (Optimizer/OMPDescriptorMapInfoGen.cpp) will expand the 
+`OMPMapInfoFinalizationPass` (Optimizer/OMPMapInfoFinalization.cpp) will expand the 
 `omp.MapInfoOp`'s containing descriptors (which currently will be a `BoxType` or `BoxAddrOp`) into multiple 
 mappings, with one extra per pointer member in the descriptor that is supported on top of the original
 descriptor map operation. These pointers members are linked to the parent descriptor by adding them to 
@@ -53,7 +53,7 @@ owning operation's (`omp.TargetOp`, `omp.TargetDataOp` etc.) map operand list an
 operation is `IsolatedFromAbove`, it also inserts them as `BlockArgs` to canonicalize the mappings and
 simplify lowering.
 
-An example transformation by the `OMPDescriptorMapInfoGenPass`:
+An example transformation by the `OMPMapInfoFinalizationPass`:
 
 ```
 
diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h
index 6b3e37cd9c25..f57fcdc895ad 100644
--- a/flang/include/flang/Common/Fortran-features.h
+++ b/flang/include/flang/Common/Fortran-features.h
@@ -48,7 +48,8 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
     ImpliedDoIndexScope, DistinctCommonSizes, OddIndexVariableRestrictions,
     IndistinguishableSpecifics, SubroutineAndFunctionSpecifics,
     EmptySequenceType, NonSequenceCrayPointee, BranchIntoConstruct,
-    BadBranchTarget, ConvertedArgument, HollerithPolymorphic, ListDirectedSize)
+    BadBranchTarget, ConvertedArgument, HollerithPolymorphic, ListDirectedSize,
+    NonBindCInteroperability, CudaManaged, CudaUnified)
 
 // Portability and suspicious usage warnings
 ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
@@ -80,6 +81,8 @@ public:
     disable_.set(LanguageFeature::OpenACC);
     disable_.set(LanguageFeature::OpenMP);
     disable_.set(LanguageFeature::CUDA); // !@cuf
+    disable_.set(LanguageFeature::CudaManaged);
+    disable_.set(LanguageFeature::CudaUnified);
     disable_.set(LanguageFeature::ImplicitNoneTypeNever);
     disable_.set(LanguageFeature::ImplicitNoneTypeAlways);
     disable_.set(LanguageFeature::DefaultSave);
diff --git a/flang/include/flang/Common/Fortran.h b/flang/include/flang/Common/Fortran.h
index 3b965fe60c2f..0701e3e8b64c 100644
--- a/flang/include/flang/Common/Fortran.h
+++ b/flang/include/flang/Common/Fortran.h
@@ -19,6 +19,7 @@
 #include <string>
 
 namespace Fortran::common {
+class LanguageFeatureControl;
 
 // Fortran has five kinds of intrinsic data types, plus the derived types.
 ENUM_CLASS(TypeCategory, Integer, Real, Complex, Character, Logical, Derived)
@@ -115,7 +116,8 @@ static constexpr IgnoreTKRSet ignoreTKRAll{IgnoreTKR::Type, IgnoreTKR::Kind,
 std::string AsFortran(IgnoreTKRSet);
 
 bool AreCompatibleCUDADataAttrs(std::optional<CUDADataAttr>,
-    std::optional<CUDADataAttr>, IgnoreTKRSet, bool allowUnifiedMatchingRule);
+    std::optional<CUDADataAttr>, IgnoreTKRSet, bool allowUnifiedMatchingRule,
+    const LanguageFeatureControl *features = nullptr);
 
 static constexpr char blankCommonObjectName[] = "__BLNK__";
 
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index ca14c144af2d..cb750d5e82d8 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -152,9 +152,11 @@ std::optional<Expr<SomeType>> AsGenericExpr(const Symbol &);
 // Propagate std::optional from input to output.
 template <typename A>
 std::optional<Expr<SomeType>> AsGenericExpr(std::optional<A> &&x) {
-  if (!x)
+  if (x) {
+    return AsGenericExpr(std::move(*x));
+  } else {
     return std::nullopt;
-  return AsGenericExpr(std::move(*x));
+  }
 }
 
 template <typename A>
diff --git a/flang/include/flang/Frontend/CompilerInvocation.h b/flang/include/flang/Frontend/CompilerInvocation.h
index 4924d090eaf9..0fefaecfe4f0 100644
--- a/flang/include/flang/Frontend/CompilerInvocation.h
+++ b/flang/include/flang/Frontend/CompilerInvocation.h
@@ -114,8 +114,10 @@ class CompilerInvocation : public CompilerInvocationBase {
   // Fortran Dialect options
   Fortran::common::IntrinsicTypeDefaultKinds defaultKinds;
 
+  // Fortran Warning options
   bool enableConformanceChecks = false;
   bool enableUsageChecks = false;
+  bool disableWarnings = false;
 
   /// Used in e.g. unparsing to dump the analyzed rather than the original
   /// parse-tree objects.
@@ -197,6 +199,9 @@ public:
   bool &getEnableUsageChecks() { return enableUsageChecks; }
   const bool &getEnableUsageChecks() const { return enableUsageChecks; }
 
+  bool &getDisableWarnings() { return disableWarnings; }
+  const bool &getDisableWarnings() const { return disableWarnings; }
+
   Fortran::parser::AnalyzedObjectsAsFortran &getAsFortran() {
     return asFortran;
   }
@@ -226,6 +231,9 @@ public:
   // Enables the usage checks
   void setEnableUsageChecks() { enableUsageChecks = true; }
 
+  // Disables all Warnings
+  void setDisableWarnings() { disableWarnings = true; }
+
   /// Useful setters
   void setArgv0(const char *dir) { argv0 = dir; }
 
diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h
index d5dab9040d22..0bc68de6938d 100644
--- a/flang/include/flang/Lower/AbstractConverter.h
+++ b/flang/include/flang/Lower/AbstractConverter.h
@@ -134,9 +134,12 @@ public:
   virtual bool isPresentShallowLookup(Fortran::semantics::Symbol &sym) = 0;
 
   /// Collect the set of symbols with \p flag in \p eval
-  /// region if \p collectSymbols is true. Likewise, collect the
+  /// region if \p collectSymbols is true. Otherwise, collect the
   /// set of the host symbols with \p flag of the associated symbols in \p eval
-  /// region if collectHostAssociatedSymbols is true.
+  /// region if collectHostAssociatedSymbols is true. This allows gathering
+  /// host association details of symbols particularly in nested directives
+  /// irrespective of \p flag \p, and can be useful where host
+  /// association details are needed in flag-agnostic manner.
   virtual void collectSymbolSet(
       pft::Evaluation &eval,
       llvm::SetVector<const Fortran::semantics::Symbol *> &symbolSet,
@@ -216,6 +219,18 @@ public:
   /// function.
   virtual void bindHostAssocTuple(mlir::Value val) = 0;
 
+  /// Returns fir.dummy_scope operation's result value to be used
+  /// as dummy_scope operand of hlfir.declare operations for the dummy
+  /// arguments of this function.
+  virtual mlir::Value dummyArgsScopeValue() const = 0;
+
+  /// Returns true if the given symbol is a dummy argument of this function.
+  /// Note that it returns false for all the symbols after all the variables
+  /// are instantiated for this function, i.e. it can only be used reliably
+  /// during the instatiation of the variables.
+  virtual bool
+  isRegisteredDummySymbol(Fortran::semantics::SymbolRef symRef) const = 0;
+
   //===--------------------------------------------------------------------===//
   // Types
   //===--------------------------------------------------------------------===//
diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index e4c954159f71..0d650f830b64 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -708,6 +708,13 @@ mlir::Value createNullBoxProc(fir::FirOpBuilder &builder, mlir::Location loc,
 
 /// Set internal linkage attribute on a function.
 void setInternalLinkage(mlir::func::FuncOp);
+
+llvm::SmallVector<mlir::Value>
+elideExtentsAlreadyInType(mlir::Type type, mlir::ValueRange shape);
+
+llvm::SmallVector<mlir::Value>
+elideLengthsAlreadyInType(mlir::Type type, mlir::ValueRange lenParams);
+
 } // namespace fir::factory
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_FIRBUILDER_H
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index 6c36f7e84db6..6cc8e71b3b18 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -223,7 +223,7 @@ public:
 using CleanupFunction = std::function<void()>;
 std::pair<fir::ExtendedValue, std::optional<CleanupFunction>>
 translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
-                         Entity entity);
+                         Entity entity, bool contiguousHint = false);
 
 /// Function to translate FortranVariableOpInterface to fir::ExtendedValue.
 /// It may generates IR to unbox fir.boxchar, but has otherwise no side effects
@@ -238,6 +238,7 @@ fir::FortranVariableOpInterface
 genDeclare(mlir::Location loc, fir::FirOpBuilder &builder,
            const fir::ExtendedValue &exv, llvm::StringRef name,
            fir::FortranVariableFlagsAttr flags,
+           mlir::Value dummyScope = nullptr,
            fir::CUDADataAttributeAttr cudaAttr = {});
 
 /// Generate an hlfir.associate to build a variable from an expression value.
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index dc38e56d93c6..64c5e360b28f 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3364,6 +3364,8 @@ def fir_CUDAAllocOp : fir_Op<"cuda_alloc", [AttrSizedOperandSegments,
       CArg<"mlir::ValueRange", "{}">:$typeparams,
       CArg<"mlir::ValueRange", "{}">:$shape,
       CArg<"llvm::ArrayRef<mlir::NamedAttribute>", "{}">:$attributes)>];
+
+  let hasVerifier = 1;
 }
 
 def fir_CUDAFreeOp : fir_Op<"cuda_free", [MemoryEffects<[MemFree]>]> {
@@ -3381,6 +3383,8 @@ def fir_CUDAFreeOp : fir_Op<"cuda_free", [MemoryEffects<[MemFree]>]> {
   );
 
   let assemblyFormat = "$devptr `:` qualified(type($devptr)) attr-dict";
+
+  let hasVerifier = 1;
 }
 
 #endif
diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
index ee3c26800ae3..9558a6832972 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
@@ -104,6 +104,7 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments,
   let builders = [
     OpBuilder<(ins "mlir::Value":$memref, "llvm::StringRef":$uniq_name,
       CArg<"mlir::Value", "{}">:$shape, CArg<"mlir::ValueRange", "{}">:$typeparams,
+      CArg<"mlir::Value", "{}">:$dummy_scope,
       CArg<"fir::FortranVariableFlagsAttr", "{}">:$fortran_attrs,
       CArg<"fir::CUDADataAttributeAttr", "{}">:$cuda_attr)>];
 
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index 470ed8a125ac..ae1d72a3526b 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -68,7 +68,7 @@ std::unique_ptr<mlir::Pass> createAlgebraicSimplificationPass();
 std::unique_ptr<mlir::Pass>
 createAlgebraicSimplificationPass(const mlir::GreedyRewriteConfig &config);
 
-std::unique_ptr<mlir::Pass> createOMPDescriptorMapInfoGenPass();
+std::unique_ptr<mlir::Pass> createOMPMapInfoFinalizationPass();
 std::unique_ptr<mlir::Pass> createOMPFunctionFilteringPass();
 std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
 createOMPMarkDeclareTargetPass();
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index 1eaaa32a508a..e22c1b5f338b 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -321,15 +321,15 @@ def LoopVersioning : Pass<"loop-versioning", "mlir::func::FuncOp"> {
   let dependentDialects = [ "fir::FIROpsDialect" ];
 }
 
-def OMPDescriptorMapInfoGenPass
-    : Pass<"omp-descriptor-map-info-gen", "mlir::func::FuncOp"> {
+def OMPMapInfoFinalizationPass
+    : Pass<"omp-map-info-finalization", "mlir::func::FuncOp"> {
   let summary = "expands OpenMP MapInfo operations containing descriptors";
   let description = [{
     Expands MapInfo operations containing descriptor types into multiple 
     MapInfo's for each pointer element in the descriptor that requires 
     explicit individual mapping by the OpenMP runtime.
   }];
-  let constructor = "::fir::createOMPDescriptorMapInfoGenPass()";
+  let constructor = "::fir::createOMPMapInfoFinalizationPass()";
   let dependentDialects = ["mlir::omp::OpenMPDialect"];
 }
 
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 4641f9d20d5b..c06354458379 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -2905,7 +2905,8 @@ struct ModuleSubprogram {
   UNION_CLASS_BOILERPLATE(ModuleSubprogram);
   std::variant<common::Indirection<FunctionSubprogram>,
       common::Indirection<SubroutineSubprogram>,
-      common::Indirection<SeparateModuleSubprogram>>
+      common::Indirection<SeparateModuleSubprogram>,
+      common::Indirection<CompilerDirective>>
       u;
 };
 
diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index 4d0a993d0275..50f7b68d80cb 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -745,7 +745,8 @@ public:
       OmpCommonBlock, OmpReduction, OmpAligned, OmpNontemporal, OmpAllocate,
       OmpDeclarativeAllocateDirective, OmpExecutableAllocateDirective,
       OmpDeclareSimd, OmpDeclareTarget, OmpThreadprivate, OmpDeclareReduction,
-      OmpFlushed, OmpCriticalLock, OmpIfSpecified, OmpNone, OmpPreDetermined);
+      OmpFlushed, OmpCriticalLock, OmpIfSpecified, OmpNone, OmpPreDetermined,
+      OmpImplicit);
   using Flags = common::EnumSet<Flag, Flag_enumSize>;
 
   const Scope &owner() const { return *owner_; }
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index efb5c9ba1077..46978441a640 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -222,6 +222,25 @@ inline bool HasCUDAAttr(const Symbol &sym) {
   return false;
 }
 
+inline bool NeedCUDAAlloc(const Symbol &sym) {
+  bool inDeviceSubprogram{IsCUDADeviceContext(&sym.owner())};
+  if (Fortran::semantics::IsDummy(sym))
+    return false;
+  if (const auto *details{
+          sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()}) {
+    if (details->cudaDataAttr() &&
+        (*details->cudaDataAttr() == common::CUDADataAttr::Device ||
+            *details->cudaDataAttr() == common::CUDADataAttr::Managed ||
+            *details->cudaDataAttr() == common::CUDADataAttr::Unified)) {
+      // Descriptor is allocated on host when in host context.
+      if (Fortran::semantics::IsAllocatable(sym))
+        return inDeviceSubprogram;
+      return true;
+    }
+  }
+  return false;
+}
+
 const Scope *FindCUDADeviceContext(const Scope *);
 std::optional<common::CUDADataAttr> GetCUDADataAttr(const Symbol *);
 
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index d85436489870..cc3431d5b71d 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -232,12 +232,27 @@ inline void addExternalNameConversionPass(
   });
 }
 
+// Use inliner extension point callback to register the default inliner pass.
+inline void registerDefaultInlinerPass(MLIRToLLVMPassPipelineConfig &config) {
+  config.registerFIRInlinerCallback(
+      [](mlir::PassManager &pm, llvm::OptimizationLevel level) {
+        llvm::StringMap<mlir::OpPassManager> pipelines;
+        // The default inliner pass adds the canonicalizer pass with the default
+        // configuration.
+        pm.addPass(mlir::createInlinerPass(
+            pipelines, addCanonicalizerPassWithoutRegionSimplification));
+      });
+}
+
 /// Create a pass pipeline for running default optimization passes for
 /// incremental conversion of FIR.
 ///
 /// \param pm - MLIR pass manager that will hold the pipeline definition
 inline void createDefaultFIROptimizerPassPipeline(
-    mlir::PassManager &pm, const MLIRToLLVMPassPipelineConfig &pc) {
+    mlir::PassManager &pm, MLIRToLLVMPassPipelineConfig &pc) {
+  // Early Optimizer EP Callback
+  pc.invokeFIROptEarlyEPCallbacks(pm, pc.OptLevel);
+
   // simplify the IR
   mlir::GreedyRewriteConfig config;
   config.enableRegionSimplification = false;
@@ -262,11 +277,9 @@ inline void createDefaultFIROptimizerPassPipeline(
   else
     fir::addMemoryAllocationOpt(pm);
 
-  // The default inliner pass adds the canonicalizer pass with the default
-  // configuration. Create the inliner pass with tco config.
-  llvm::StringMap<mlir::OpPassManager> pipelines;
-  pm.addPass(mlir::createInlinerPass(
-      pipelines, addCanonicalizerPassWithoutRegionSimplification));
+  // FIR Inliner Callback
+  pc.invokeFIRInlinerCallback(pm, pc.OptLevel);
+
   pm.addPass(fir::createSimplifyRegionLite());
   pm.addPass(mlir::createCSEPass());
 
@@ -283,6 +296,9 @@ inline void createDefaultFIROptimizerPassPipeline(
   pm.addPass(mlir::createCanonicalizerPass(config));
   pm.addPass(fir::createSimplifyRegionLite());
   pm.addPass(mlir::createCSEPass());
+
+  // Last Optimizer EP Callback
+  pc.invokeFIROptLastEPCallbacks(pm, pc.OptLevel);
 }
 
 /// Create a pass pipeline for lowering from HLFIR to FIR
@@ -319,7 +335,7 @@ inline void createHLFIRToFIRPassPipeline(
 /// rather than the host device.
 inline void createOpenMPFIRPassPipeline(
     mlir::PassManager &pm, bool isTargetDevice) {
-  pm.addPass(fir::createOMPDescriptorMapInfoGenPass());
+  pm.addPass(fir::createOMPMapInfoFinalizationPass());
   pm.addPass(fir::createOMPMarkDeclareTargetPass());
   if (isTargetDevice)
     pm.addPass(fir::createOMPFunctionFilteringPass());
@@ -375,8 +391,7 @@ inline void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm,
 /// \param optLevel - optimization level used for creating FIR optimization
 ///   passes pipeline
 inline void createMLIRToLLVMPassPipeline(mlir::PassManager &pm,
-    const MLIRToLLVMPassPipelineConfig &config,
-    llvm::StringRef inputFilename = {}) {
+    MLIRToLLVMPassPipelineConfig &config, llvm::StringRef inputFilename = {}) {
   fir::createHLFIRToFIRPassPipeline(pm, config.OptLevel);
 
   // Add default optimizer pass pipeline.
diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h
index cebdd6d181c3..f79520707714 100644
--- a/flang/include/flang/Tools/CrossToolHelpers.h
+++ b/flang/include/flang/Tools/CrossToolHelpers.h
@@ -20,11 +20,66 @@
 
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Frontend/Debug/Options.h"
 #include "llvm/Passes/OptimizationLevel.h"
 
+// Flang Extension Point Callbacks
+class FlangEPCallBacks {
+public:
+  void registerFIROptEarlyEPCallbacks(
+      const std::function<void(mlir::PassManager &, llvm::OptimizationLevel)>
+          &C) {
+    FIROptEarlyEPCallbacks.push_back(C);
+  }
+
+  void registerFIRInlinerCallback(
+      const std::function<void(mlir::PassManager &, llvm::OptimizationLevel)>
+          &C) {
+    FIRInlinerCallback.push_back(C);
+  }
+
+  void registerFIROptLastEPCallbacks(
+      const std::function<void(mlir::PassManager &, llvm::OptimizationLevel)>
+          &C) {
+    FIROptLastEPCallbacks.push_back(C);
+  }
+
+  void invokeFIROptEarlyEPCallbacks(
+      mlir::PassManager &pm, llvm::OptimizationLevel optLevel) {
+    for (auto &C : FIROptEarlyEPCallbacks)
+      C(pm, optLevel);
+  };
+
+  void invokeFIRInlinerCallback(
+      mlir::PassManager &pm, llvm::OptimizationLevel optLevel) {
+    for (auto &C : FIRInlinerCallback)
+      C(pm, optLevel);
+  };
+
+  void invokeFIROptLastEPCallbacks(
+      mlir::PassManager &pm, llvm::OptimizationLevel optLevel) {
+    for (auto &C : FIROptLastEPCallbacks)
+      C(pm, optLevel);
+  };
+
+private:
+  llvm::SmallVector<
+      std::function<void(mlir::PassManager &, llvm::OptimizationLevel)>, 1>
+      FIROptEarlyEPCallbacks;
+
+  llvm::SmallVector<
+      std::function<void(mlir::PassManager &, llvm::OptimizationLevel)>, 1>
+      FIRInlinerCallback;
+
+  llvm::SmallVector<
+      std::function<void(mlir::PassManager &, llvm::OptimizationLevel)>, 1>
+      FIROptLastEPCallbacks;
+};
+
 /// Configuriation for the MLIR to LLVM pass pipeline.
-struct MLIRToLLVMPassPipelineConfig {
+struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks {
   explicit MLIRToLLVMPassPipelineConfig(llvm::OptimizationLevel level) {
     OptLevel = level;
   }
diff --git a/flang/lib/Common/Fortran.cpp b/flang/lib/Common/Fortran.cpp
index 170ce8c22509..c014b1263a67 100644
--- a/flang/lib/Common/Fortran.cpp
+++ b/flang/lib/Common/Fortran.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Common/Fortran.h"
+#include "flang/Common/Fortran-features.h"
 
 namespace Fortran::common {
 
@@ -102,7 +103,13 @@ std::string AsFortran(IgnoreTKRSet tkr) {
 /// dummy argument attribute while `y` represents the actual argument attribute.
 bool AreCompatibleCUDADataAttrs(std::optional<CUDADataAttr> x,
     std::optional<CUDADataAttr> y, IgnoreTKRSet ignoreTKR,
-    bool allowUnifiedMatchingRule) {
+    bool allowUnifiedMatchingRule, const LanguageFeatureControl *features) {
+  bool isCudaManaged{features
+          ? features->IsEnabled(common::LanguageFeature::CudaManaged)
+          : false};
+  bool isCudaUnified{features
+          ? features->IsEnabled(common::LanguageFeature::CudaUnified)
+          : false};
   if (!x && !y) {
     return true;
   } else if (x && y && *x == *y) {
@@ -120,19 +127,27 @@ bool AreCompatibleCUDADataAttrs(std::optional<CUDADataAttr> x,
     return true;
   } else if (allowUnifiedMatchingRule) {
     if (!x) { // Dummy argument has no attribute -> host
-      if (y && (*y == CUDADataAttr::Managed || *y == CUDADataAttr::Unified)) {
+      if ((y && (*y == CUDADataAttr::Managed || *y == CUDADataAttr::Unified)) ||
+          (!y && (isCudaUnified || isCudaManaged))) {
         return true;
       }
     } else {
-      if (*x == CUDADataAttr::Device && y &&
-          (*y == CUDADataAttr::Managed || *y == CUDADataAttr::Unified)) {
-        return true;
-      } else if (*x == CUDADataAttr::Managed && y &&
-          *y == CUDADataAttr::Unified) {
-        return true;
-      } else if (*x == CUDADataAttr::Unified && y &&
-          *y == CUDADataAttr::Managed) {
-        return true;
+      if (*x == CUDADataAttr::Device) {
+        if ((y &&
+                (*y == CUDADataAttr::Managed || *y == CUDADataAttr::Unified)) ||
+            (!y && (isCudaUnified || isCudaManaged))) {
+          return true;
+        }
+      } else if (*x == CUDADataAttr::Managed) {
+        if ((y && *y == CUDADataAttr::Unified) ||
+            (!y && (isCudaUnified || isCudaManaged))) {
+          return true;
+        }
+      } else if (*x == CUDADataAttr::Unified) {
+        if ((y && *y == CUDADataAttr::Managed) ||
+            (!y && (isCudaUnified || isCudaManaged))) {
+          return true;
+        }
       }
     }
     return false;
diff --git a/flang/lib/Evaluate/fold-designator.cpp b/flang/lib/Evaluate/fold-designator.cpp
index 6952436681f7..0d8c22fb2977 100644
--- a/flang/lib/Evaluate/fold-designator.cpp
+++ b/flang/lib/Evaluate/fold-designator.cpp
@@ -273,9 +273,8 @@ static std::optional<DataRef> OffsetToDataRef(FoldingContext &context,
   if (IsAllocatableOrPointer(symbol)) {
     return entity.IsSymbol() ? DataRef{symbol}
                              : DataRef{std::move(entity.GetComponent())};
-  }
-  std::optional<DataRef> result;
-  if (std::optional<DynamicType> type{DynamicType::From(symbol)}) {
+  } else if (std::optional<DynamicType> type{DynamicType::From(symbol)}) {
+    std::optional<DataRef> result;
     if (!type->IsUnlimitedPolymorphic()) {
       if (std::optional<Shape> shape{GetShape(context, symbol)}) {
         if (GetRank(*shape) > 0) {
@@ -289,7 +288,7 @@ static std::optional<DataRef> OffsetToDataRef(FoldingContext &context,
               : DataRef{std::move(entity.GetComponent())};
         }
         if (result && type->category() == TypeCategory::Derived &&
-            size < result->GetLastSymbol().size()) {
+            size <= result->GetLastSymbol().size()) {
           if (const Symbol *
               component{OffsetToUniqueComponent(
                   type->GetDerivedTypeSpec(), offset)}) {
@@ -298,25 +297,32 @@ static std::optional<DataRef> OffsetToDataRef(FoldingContext &context,
                 NamedEntity{Component{std::move(*result), *component}}, offset,
                 size);
           }
-          result.reset();
         }
       }
     }
+    return result;
+  } else {
+    return std::nullopt;
   }
-  return result;
 }
 
 // Reconstructs a Designator from a symbol, an offset, and a size.
+// Returns a ProcedureDesignator in the case of a whole procedure pointer.
 std::optional<Expr<SomeType>> OffsetToDesignator(FoldingContext &context,
     const Symbol &baseSymbol, ConstantSubscript offset, std::size_t size) {
   if (offset < 0) {
     return std::nullopt;
-  }
-  if (std::optional<DataRef> dataRef{
-          OffsetToDataRef(context, NamedEntity{baseSymbol}, offset, size)}) {
+  } else if (std::optional<DataRef> dataRef{OffsetToDataRef(
+                 context, NamedEntity{baseSymbol}, offset, size)}) {
     const Symbol &symbol{dataRef->GetLastSymbol()};
-    if (std::optional<Expr<SomeType>> result{
-            AsGenericExpr(std::move(*dataRef))}) {
+    if (IsProcedurePointer(symbol)) {
+      if (std::holds_alternative<SymbolRef>(dataRef->u)) {
+        return Expr<SomeType>{ProcedureDesignator{symbol}};
+      } else if (auto *component{std::get_if<Component>(&dataRef->u)}) {
+        return Expr<SomeType>{ProcedureDesignator{std::move(*component)}};
+      }
+    } else if (std::optional<Expr<SomeType>> result{
+                   AsGenericExpr(std::move(*dataRef))}) {
       if (IsAllocatableOrPointer(symbol)) {
       } else if (auto type{DynamicType::From(symbol)}) {
         if (auto elementBytes{
diff --git a/flang/lib/Evaluate/fold-real.cpp b/flang/lib/Evaluate/fold-real.cpp
index 1ccf3f979ece..238ce34adfb7 100644
--- a/flang/lib/Evaluate/fold-real.cpp
+++ b/flang/lib/Evaluate/fold-real.cpp
@@ -202,10 +202,10 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
     }
   } else if (name == "abs") { // incl. zabs & cdabs
     // Argument can be complex or real
-    if (auto *x{UnwrapExpr<Expr<SomeReal>>(args[0])}) {
+    if (UnwrapExpr<Expr<SomeReal>>(args[0])) {
       return FoldElementalIntrinsic<T, T>(
           context, std::move(funcRef), &Scalar<T>::ABS);
-    } else if (auto *z{UnwrapExpr<Expr<SomeComplex>>(args[0])}) {
+    } else if (UnwrapExpr<Expr<SomeComplex>>(args[0])) {
       return FoldElementalIntrinsic<T, ComplexT>(context, std::move(funcRef),
           ScalarFunc<T, ComplexT>([&name, &context](
                                       const Scalar<ComplexT> &z) -> Scalar<T> {
diff --git a/flang/lib/Evaluate/formatting.cpp b/flang/lib/Evaluate/formatting.cpp
index 5f822bbcbb04..20193b006bf2 100644
--- a/flang/lib/Evaluate/formatting.cpp
+++ b/flang/lib/Evaluate/formatting.cpp
@@ -539,10 +539,10 @@ std::string DynamicType::AsFortran() const {
       result += length->AsFortran();
     }
     return result + ')';
-  } else if (IsUnlimitedPolymorphic()) {
-    return "CLASS(*)";
   } else if (IsAssumedType()) {
     return "TYPE(*)";
+  } else if (IsUnlimitedPolymorphic()) {
+    return "CLASS(*)";
   } else if (IsTypelessIntrinsicArgument()) {
     return "(typeless intrinsic function argument)";
   } else {
diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp
index 9a5f9130632e..826b97b87bf3 100644
--- a/flang/lib/Evaluate/tools.cpp
+++ b/flang/lib/Evaluate/tools.cpp
@@ -28,11 +28,11 @@ namespace Fortran::evaluate {
 static constexpr bool allowOperandDuplication{false};
 
 std::optional<Expr<SomeType>> AsGenericExpr(DataRef &&ref) {
-  const Symbol &symbol{ref.GetLastSymbol()};
-  if (auto dyType{DynamicType::From(symbol)}) {
+  if (auto dyType{DynamicType::From(ref.GetLastSymbol())}) {
     return TypedWrapper<Designator, DataRef>(*dyType, std::move(ref));
+  } else {
+    return std::nullopt;
   }
-  return std::nullopt;
 }
 
 std::optional<Expr<SomeType>> AsGenericExpr(const Symbol &symbol) {
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index f1b7b5397539..db7fd3cccc7a 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -883,7 +883,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
 
   // -x cuda
   auto language = args.getLastArgValue(clang::driver::options::OPT_x);
-  if (language.equals("cuda")) {
+  if (language == "cuda") {
     res.getFrontendOpts().features.Enable(
         Fortran::common::LanguageFeature::CUDA);
   }
@@ -975,13 +975,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
     res.setEnableConformanceChecks();
     res.setEnableUsageChecks();
   }
+
+  // -w
+  if (args.hasArg(clang::driver::options::OPT_w))
+    res.setDisableWarnings();
+
   // -std=f2018
   // TODO: Set proper options when more fortran standards
   // are supported.
   if (args.hasArg(clang::driver::options::OPT_std_EQ)) {
     auto standard = args.getLastArgValue(clang::driver::options::OPT_std_EQ);
     // We only allow f2018 as the given standard
-    if (standard.equals("f2018")) {
+    if (standard == "f2018") {
       res.setEnableConformanceChecks();
     } else {
       const unsigned diagID =
@@ -1403,6 +1408,11 @@ void CompilerInvocation::setFortranOpts() {
 
   if (getEnableUsageChecks())
     fortranOptions.features.WarnOnAllUsage();
+
+  if (getDisableWarnings()) {
+    fortranOptions.features.DisableAllNonstandardWarnings();
+    fortranOptions.features.DisableAllUsageWarnings();
+  }
 }
 
 std::unique_ptr<Fortran::semantics::SemanticsContext>
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index b96e2c87ae05..2f65ab6102f4 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -802,6 +802,7 @@ void CodeGenAction::generateLLVMIR() {
   pm.enableVerifier(/*verifyPasses=*/true);
 
   MLIRToLLVMPassPipelineConfig config(level, opts, mathOpts);
+  fir::registerDefaultInlinerPass(config);
 
   if (auto vsr = getVScaleRange(ci)) {
     config.VScaleMin = vsr->first;
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index fb01789d3f8a..79d6bbf65cbf 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -810,7 +810,7 @@ public:
                               bool collectSymbol) {
             if (collectSymbol && oriSymbol.test(flag))
               symbolSet.insert(&oriSymbol);
-            if (checkHostAssociatedSymbols)
+            else if (checkHostAssociatedSymbols)
               if (const auto *details{
                       oriSymbol
                           .detailsIf<Fortran::semantics::HostAssocDetails>()})
@@ -900,6 +900,16 @@ public:
     hostAssocTuple = val;
   }
 
+  mlir::Value dummyArgsScopeValue() const override final {
+    return dummyArgsScope;
+  }
+
+  bool isRegisteredDummySymbol(
+      Fortran::semantics::SymbolRef symRef) const override final {
+    auto *sym = &*symRef;
+    return registeredDummySymbols.contains(sym);
+  }
+
   void registerTypeInfo(mlir::Location loc,
                         Fortran::lower::SymbolRef typeInfoSym,
                         const Fortran::semantics::DerivedTypeSpec &typeSpec,
@@ -1145,10 +1155,11 @@ private:
   /// yet. The final mapping will be done using this pre-mapping in
   /// Fortran::lower::mapSymbolAttributes.
   bool mapBlockArgToDummyOrResult(const Fortran::semantics::SymbolRef sym,
-                                  mlir::Value val, bool forced = false) {
-    if (!forced && lookupSymbol(sym))
-      return false;
-    localSymbols.addSymbol(sym, val, forced);
+                                  mlir::Value val, bool isResult) {
+    localSymbols.addSymbol(sym, val);
+    if (!isResult)
+      registerDummySymbol(sym);
+
     return true;
   }
 
@@ -2055,17 +2066,19 @@ private:
   /// Generate structured or unstructured FIR for an IF construct.
   /// The initial statement may be either an IfStmt or an IfThenStmt.
   void genFIR(const Fortran::parser::IfConstruct &) {
-    mlir::Location loc = toLocation();
     Fortran::lower::pft::Evaluation &eval = getEval();
+
+    // Structured fir.if nest.
     if (eval.lowerAsStructured()) {
-      // Structured fir.if nest.
       fir::IfOp topIfOp, currentIfOp;
       for (Fortran::lower::pft::Evaluation &e : eval.getNestedEvaluations()) {
         auto genIfOp = [&](mlir::Value cond) {
-          auto ifOp = builder->create<fir::IfOp>(loc, cond, /*withElse=*/true);
+          auto ifOp =
+              builder->create<fir::IfOp>(toLocation(), cond, /*withElse=*/true);
           builder->setInsertionPointToStart(&ifOp.getThenRegion().front());
           return ifOp;
         };
+        setCurrentPosition(e.position);
         if (auto *s = e.getIf<Fortran::parser::IfThenStmt>()) {
           topIfOp = currentIfOp = genIfOp(genIfCondition(s, e.negateCondition));
         } else if (auto *s = e.getIf<Fortran::parser::IfStmt>()) {
@@ -2096,6 +2109,7 @@ private:
         else // non-empty block
           genConditionalBranch(cond, e.lexicalSuccessor, e.controlSuccessor);
       };
+      setCurrentPosition(e.position);
       if (auto *s = e.getIf<Fortran::parser::IfThenStmt>()) {
         maybeStartBlock(e.block);
         genIfBranch(genIfCondition(s, e.negateCondition));
@@ -2582,11 +2596,10 @@ private:
     llvm::SmallVector<mlir::Type> ivTypes;
     llvm::SmallVector<mlir::Location> ivLocs;
     llvm::SmallVector<mlir::Value> ivValues;
+    Fortran::lower::pft::Evaluation *loopEval =
+        &getEval().getFirstNestedEvaluation();
     for (unsigned i = 0; i < nestedLoops; ++i) {
       const Fortran::parser::LoopControl *loopControl;
-      Fortran::lower::pft::Evaluation *loopEval =
-          &getEval().getFirstNestedEvaluation();
-
       mlir::Location crtLoc = loc;
       if (i == 0) {
         loopControl = &*outerDoConstruct->GetLoopControl();
@@ -2863,6 +2876,7 @@ private:
     Fortran::lower::StatementContext stmtCtx;
     pushActiveConstruct(eval, stmtCtx);
     for (Fortran::lower::pft::Evaluation &e : eval.getNestedEvaluations()) {
+      setCurrentPosition(e.position);
       if (auto *stmt = e.getIf<Fortran::parser::AssociateStmt>()) {
         if (eval.lowerAsUnstructured())
           maybeStartBlock(e.block);
@@ -2891,10 +2905,10 @@ private:
     Fortran::lower::StatementContext stmtCtx;
     pushActiveConstruct(eval, stmtCtx);
     for (Fortran::lower::pft::Evaluation &e : eval.getNestedEvaluations()) {
+      setCurrentPosition(e.position);
       if (e.getIf<Fortran::parser::BlockStmt>()) {
         if (eval.lowerAsUnstructured())
           maybeStartBlock(e.block);
-        setCurrentPosition(e.position);
         const Fortran::parser::CharBlock &endPosition =
             eval.getLastNestedEvaluation().position;
         localSymbols.pushScope();
@@ -2921,7 +2935,6 @@ private:
       } else if (e.getIf<Fortran::parser::EndBlockStmt>()) {
         if (eval.lowerAsUnstructured())
           maybeStartBlock(e.block);
-        setCurrentPosition(e.position);
         localSymbols.popScope();
       } else {
         genFIR(e);
@@ -2963,7 +2976,6 @@ private:
   }
 
   void genFIR(const Fortran::parser::SelectTypeConstruct &selectTypeConstruct) {
-    mlir::Location loc = toLocation();
     mlir::MLIRContext *context = builder->getContext();
     Fortran::lower::StatementContext stmtCtx;
     fir::ExtendedValue selector;
@@ -2989,6 +3001,8 @@ private:
     pushActiveConstruct(getEval(), stmtCtx);
     for (Fortran::lower::pft::Evaluation &eval :
          getEval().getNestedEvaluations()) {
+      setCurrentPosition(eval.position);
+      mlir::Location loc = toLocation();
       if (auto *selectTypeStmt =
               eval.getIf<Fortran::parser::SelectTypeStmt>()) {
         // A genFIR(SelectTypeStmt) call would have unwanted side effects.
@@ -4534,9 +4548,13 @@ private:
     // constructs, this can be done for either the end construct statement,
     // or for the construct itself, which will skip this code if the
     // end statement was visited first and generated a branch.
-    Fortran::lower::pft::Evaluation *successor =
-        eval.isConstruct() ? eval.getLastNestedEvaluation().lexicalSuccessor
-                           : eval.lexicalSuccessor;
+    Fortran::lower::pft::Evaluation *successor = [&]() {
+      if (eval.isConstruct() ||
+          (eval.isDirective() && eval.hasNestedEvaluations()))
+        return eval.getLastNestedEvaluation().lexicalSuccessor;
+      return eval.lexicalSuccessor;
+    }();
+
     if (successor && blockIsUnterminated()) {
       if (successor->isIntermediateConstructStmt() &&
           successor->parentConstruct->lowerAsUnstructured())
@@ -4556,7 +4574,7 @@ private:
                             const Fortran::lower::CalleeInterface &callee) {
     assert(builder && "require a builder object at this point");
     using PassBy = Fortran::lower::CalleeInterface::PassEntityBy;
-    auto mapPassedEntity = [&](const auto arg) {
+    auto mapPassedEntity = [&](const auto arg, bool isResult = false) {
       if (arg.passBy == PassBy::AddressAndLength) {
         if (callee.characterize().IsBindC())
           return;
@@ -4566,10 +4584,11 @@ private:
         fir::factory::CharacterExprHelper charHelp{*builder, loc};
         mlir::Value box =
             charHelp.createEmboxChar(arg.firArgument, arg.firLength);
-        mapBlockArgToDummyOrResult(arg.entity->get(), box);
+        mapBlockArgToDummyOrResult(arg.entity->get(), box, isResult);
       } else {
         if (arg.entity.has_value()) {
-          mapBlockArgToDummyOrResult(arg.entity->get(), arg.firArgument);
+          mapBlockArgToDummyOrResult(arg.entity->get(), arg.firArgument,
+                                     isResult);
         } else {
           assert(funit.parentHasTupleHostAssoc() && "expect tuple argument");
         }
@@ -4578,15 +4597,19 @@ private:
     for (const Fortran::lower::CalleeInterface::PassedEntity &arg :
          callee.getPassedArguments())
       mapPassedEntity(arg);
+    if (lowerToHighLevelFIR() && !callee.getPassedArguments().empty()) {
+      mlir::Value scopeOp = builder->create<fir::DummyScopeOp>(toLocation());
+      setDummyArgsScope(scopeOp);
+    }
     if (std::optional<Fortran::lower::CalleeInterface::PassedEntity>
             passedResult = callee.getPassedResult()) {
-      mapPassedEntity(*passedResult);
+      mapPassedEntity(*passedResult, /*isResult=*/true);
       // FIXME: need to make sure things are OK here. addSymbol may not be OK
       if (funit.primaryResult &&
           passedResult->entity->get() != *funit.primaryResult)
         mapBlockArgToDummyOrResult(
-            *funit.primaryResult,
-            getSymbolAddress(passedResult->entity->get()));
+            *funit.primaryResult, getSymbolAddress(passedResult->entity->get()),
+            /*isResult=*/true);
     }
   }
 
@@ -4763,7 +4786,8 @@ private:
       Fortran::lower::StatementContext stmtCtx;
       if (std::optional<Fortran::lower::CalleeInterface::PassedEntity>
               passedResult = callee.getPassedResult()) {
-        mapBlockArgToDummyOrResult(altResult.getSymbol(), resultArg.getAddr());
+        mapBlockArgToDummyOrResult(altResult.getSymbol(), resultArg.getAddr(),
+                                   /*isResult=*/true);
         Fortran::lower::mapSymbolAttributes(*this, altResult, localSymbols,
                                             stmtCtx);
       } else {
@@ -4807,6 +4831,11 @@ private:
     if (!funit.getHostAssoc().empty())
       funit.getHostAssoc().hostProcedureBindings(*this, localSymbols);
 
+    // Unregister all dummy symbols, so that their cloning (e.g. for OpenMP
+    // privatization) does not create the cloned hlfir.declare operations
+    // with dummy_scope operands.
+    resetRegisteredDummySymbols();
+
     // Create most function blocks in advance.
     createEmptyBlocks(funit.evaluationList);
 
@@ -4926,6 +4955,8 @@ private:
     hostAssocTuple = mlir::Value{};
     localSymbols.clear();
     blockId = 0;
+    dummyArgsScope = mlir::Value{};
+    resetRegisteredDummySymbols();
   }
 
   /// Helper to generate GlobalOps when the builder is not positioned in any
@@ -4954,6 +4985,7 @@ private:
     delete builder;
     builder = nullptr;
     localSymbols.clear();
+    resetRegisteredDummySymbols();
   }
 
   /// Instantiate the data from a BLOCK DATA unit.
@@ -5371,6 +5403,23 @@ private:
                                         globalOmpRequiresSymbol);
   }
 
+  /// Record fir.dummy_scope operation for this function.
+  /// It will be used to set dummy_scope operand of the hlfir.declare
+  /// operations.
+  void setDummyArgsScope(mlir::Value val) {
+    assert(!dummyArgsScope && val);
+    dummyArgsScope = val;
+  }
+
+  /// Record the given symbol as a dummy argument of this function.
+  void registerDummySymbol(Fortran::semantics::SymbolRef symRef) {
+    auto *sym = &*symRef;
+    registeredDummySymbols.insert(sym);
+  }
+
+  /// Reset all registered dummy symbols.
+  void resetRegisteredDummySymbols() { registeredDummySymbols.clear(); }
+
   //===--------------------------------------------------------------------===//
 
   Fortran::lower::LoweringBridge &bridge;
@@ -5397,6 +5446,15 @@ private:
   /// Tuple of host associated variables
   mlir::Value hostAssocTuple;
 
+  /// Value of fir.dummy_scope operation for this function.
+  mlir::Value dummyArgsScope;
+
+  /// A set of dummy argument symbols for this function.
+  /// The set is only preserved during the instatiation
+  /// of variables for this function.
+  llvm::SmallPtrSet<const Fortran::semantics::Symbol *, 16>
+      registeredDummySymbols;
+
   /// A map of unique names for constant expressions.
   /// The names are used for representing the constant expressions
   /// with global constant initialized objects.
diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt
index f92d1a2bc7de..1546409752e7 100644
--- a/flang/lib/Lower/CMakeLists.txt
+++ b/flang/lib/Lower/CMakeLists.txt
@@ -27,6 +27,7 @@ add_flang_library(FortranLower
   OpenMP/ClauseProcessor.cpp
   OpenMP/Clauses.cpp
   OpenMP/DataSharingProcessor.cpp
+  OpenMP/Decomposer.cpp
   OpenMP/OpenMP.cpp
   OpenMP/ReductionProcessor.cpp
   OpenMP/Utils.cpp
diff --git a/flang/lib/Lower/ConvertArrayConstructor.cpp b/flang/lib/Lower/ConvertArrayConstructor.cpp
index a5b5838fe6b6..341fad9a5e43 100644
--- a/flang/lib/Lower/ConvertArrayConstructor.cpp
+++ b/flang/lib/Lower/ConvertArrayConstructor.cpp
@@ -318,7 +318,7 @@ public:
       mlir::Value shape = builder.genShape(loc, extents);
       declare = builder.create<hlfir::DeclareOp>(
           loc, tempStorage, tempName, shape, lengths,
-          fir::FortranVariableFlagsAttr{});
+          /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{});
       initialBoxValue =
           builder.createBox(loc, boxType, declare->getOriginalBase(), shape,
                             /*slice=*/mlir::Value{}, lengths, /*tdesc=*/{});
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index e4a0cc8d4730..3659dad367b4 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -1184,12 +1184,15 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
   // actual argument shape information. A descriptor with the dummy shape
   // information will be created later when all actual arguments are ready.
   mlir::Type dummyTypeWithActualRank = dummyType;
-  if (auto baseBoxDummy = mlir::dyn_cast<fir::BaseBoxType>(dummyType))
+  if (auto baseBoxDummy = mlir::dyn_cast<fir::BaseBoxType>(dummyType)) {
     if (baseBoxDummy.isAssumedRank() ||
         arg.testTKR(Fortran::common::IgnoreTKR::Rank) ||
-        arg.isSequenceAssociatedDescriptor())
-      dummyTypeWithActualRank =
-          baseBoxDummy.getBoxTypeWithNewShape(actual.getType());
+        arg.isSequenceAssociatedDescriptor()) {
+      mlir::Type actualTy =
+          hlfir::getFortranElementOrSequenceType(actual.getType());
+      dummyTypeWithActualRank = baseBoxDummy.getBoxTypeWithNewShape(actualTy);
+    }
+  }
   // Preserve the actual type in the argument preparation in case IgnoreTKR(t)
   // is set (descriptors must be created with the actual type in this case, and
   // copy-in/copy-out should be driven by the contiguity with regard to the
diff --git a/flang/lib/Lower/ConvertExprToHLFIR.cpp b/flang/lib/Lower/ConvertExprToHLFIR.cpp
index 93bdf650f9ff..3c305955520e 100644
--- a/flang/lib/Lower/ConvertExprToHLFIR.cpp
+++ b/flang/lib/Lower/ConvertExprToHLFIR.cpp
@@ -1676,7 +1676,8 @@ private:
     mlir::Value storagePtr = builder.createTemporary(loc, recTy);
     auto varOp = hlfir::EntityWithAttributes{builder.create<hlfir::DeclareOp>(
         loc, storagePtr, "ctor.temp", /*shape=*/nullptr,
-        /*typeparams=*/mlir::ValueRange{}, fir::FortranVariableFlagsAttr{})};
+        /*typeparams=*/mlir::ValueRange{}, /*dummy_scope=*/nullptr,
+        fir::FortranVariableFlagsAttr{})};
 
     // Initialize any components that need initialization.
     mlir::Value box = builder.createBox(loc, fir::ExtendedValue{varOp});
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 413563fe95ca..5ddd8a6a9d41 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -693,6 +693,22 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter,
   if (ultimateSymbol.test(Fortran::semantics::Symbol::Flag::CrayPointee))
     return builder.create<fir::ZeroOp>(loc, fir::ReferenceType::get(ty));
 
+  if (Fortran::semantics::NeedCUDAAlloc(ultimateSymbol)) {
+    fir::CUDADataAttributeAttr cudaAttr =
+        Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
+                                                         ultimateSymbol);
+    llvm::SmallVector<mlir::Value> indices;
+    llvm::SmallVector<mlir::Value> elidedShape =
+        fir::factory::elideExtentsAlreadyInType(ty, shape);
+    llvm::SmallVector<mlir::Value> elidedLenParams =
+        fir::factory::elideLengthsAlreadyInType(ty, lenParams);
+    auto idxTy = builder.getIndexType();
+    for (mlir::Value sh : elidedShape)
+      indices.push_back(builder.createConvert(loc, idxTy, sh));
+    return builder.create<fir::CUDAAllocOp>(loc, ty, nm, symNm, cudaAttr,
+                                            lenParams, indices);
+  }
+
   // Let the builder do all the heavy lifting.
   if (!Fortran::semantics::IsProcedurePointer(ultimateSymbol))
     return builder.allocateLocal(loc, ty, nm, symNm, shape, lenParams, isTarg);
@@ -927,6 +943,19 @@ static void instantiateLocal(Fortran::lower::AbstractConverter &converter,
       });
     }
   }
+  if (Fortran::semantics::NeedCUDAAlloc(var.getSymbol())) {
+    auto *builder = &converter.getFirOpBuilder();
+    mlir::Location loc = converter.getCurrentLocation();
+    fir::ExtendedValue exv =
+        converter.getSymbolExtendedValue(var.getSymbol(), &symMap);
+    auto *sym = &var.getSymbol();
+    converter.getFctCtx().attachCleanup([builder, loc, exv, sym]() {
+      fir::CUDADataAttributeAttr cudaAttr =
+          Fortran::lower::translateSymbolCUDADataAttribute(
+              builder->getContext(), *sym);
+      builder->create<fir::CUDAFreeOp>(loc, fir::getBase(exv), cudaAttr);
+    });
+  }
 }
 
 //===----------------------------------------------------------------===//
@@ -1654,7 +1683,8 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter,
 
       // Declare a local pointer variable.
       auto newBase = builder.create<hlfir::DeclareOp>(
-          loc, boxAlloc, name, /*shape=*/nullptr, lenParams, attributes);
+          loc, boxAlloc, name, /*shape=*/nullptr, lenParams,
+          /*dummy_scope=*/nullptr, attributes);
       mlir::Value nullAddr = builder.createNullConstant(
           loc, llvm::cast<fir::BaseBoxType>(ptrBoxType).getEleTy());
 
@@ -1681,8 +1711,12 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter,
       symMap.addVariableDefinition(sym, newBase, force);
       return;
     }
+    mlir::Value dummyScope;
+    if (converter.isRegisteredDummySymbol(sym))
+      dummyScope = converter.dummyArgsScopeValue();
     auto newBase = builder.create<hlfir::DeclareOp>(
-        loc, base, name, shapeOrShift, lenParams, attributes, cudaAttr);
+        loc, base, name, shapeOrShift, lenParams, dummyScope, attributes,
+        cudaAttr);
     symMap.addVariableDefinition(sym, newBase, force);
     return;
   }
@@ -1732,8 +1766,11 @@ void Fortran::lower::genDeclareSymbol(
         Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
                                                          sym.GetUltimate());
     auto name = converter.mangleName(sym);
-    hlfir::EntityWithAttributes declare =
-        hlfir::genDeclare(loc, builder, exv, name, attributes, cudaAttr);
+    mlir::Value dummyScope;
+    if (converter.isRegisteredDummySymbol(sym))
+      dummyScope = converter.dummyArgsScopeValue();
+    hlfir::EntityWithAttributes declare = hlfir::genDeclare(
+        loc, builder, exv, name, attributes, dummyScope, cudaAttr);
     symMap.addVariableDefinition(sym, declare.getIfVariableInterface(), force);
     return;
   }
@@ -1993,7 +2030,9 @@ void Fortran::lower::mapSymbolAttributes(
           fir::factory::genMutableBoxRead(
               builder, loc,
               fir::factory::createTempMutableBox(builder, loc, ty, {}, {},
-                                                 isPolymorphic)));
+                                                 isPolymorphic)),
+          fir::FortranVariableFlagsEnum::None,
+          converter.isRegisteredDummySymbol(sym));
       return true;
     }
     return false;
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index eae2afc760e6..b02e7be75d20 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -425,7 +425,8 @@ static void genPrivateLikeInitRegion(mlir::OpBuilder &builder, RecipeOp recipe,
       auto alloca = builder.create<fir::AllocaOp>(loc, refTy.getEleTy());
       auto declareOp = builder.create<hlfir::DeclareOp>(
           loc, alloca, accPrivateInitName, /*shape=*/nullptr,
-          llvm::ArrayRef<mlir::Value>{}, fir::FortranVariableFlagsAttr{});
+          llvm::ArrayRef<mlir::Value>{}, /*dummy_scope=*/nullptr,
+          fir::FortranVariableFlagsAttr{});
       retVal = declareOp.getBase();
     } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(
                    refTy.getEleTy())) {
@@ -446,7 +447,8 @@ static void genPrivateLikeInitRegion(mlir::OpBuilder &builder, RecipeOp recipe,
             loc, seqTy, /*typeparams=*/mlir::ValueRange{}, extents);
         auto declareOp = builder.create<hlfir::DeclareOp>(
             loc, alloca, accPrivateInitName, shape,
-            llvm::ArrayRef<mlir::Value>{}, fir::FortranVariableFlagsAttr{});
+            llvm::ArrayRef<mlir::Value>{}, /*dummy_scope=*/nullptr,
+            fir::FortranVariableFlagsAttr{});
         retVal = declareOp.getBase();
       }
     }
@@ -666,10 +668,12 @@ mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe(
 
     auto leftDeclOp = builder.create<hlfir::DeclareOp>(
         loc, recipe.getCopyRegion().getArgument(0), llvm::StringRef{}, shape,
-        llvm::ArrayRef<mlir::Value>{}, fir::FortranVariableFlagsAttr{});
+        llvm::ArrayRef<mlir::Value>{}, /*dummy_scope=*/nullptr,
+        fir::FortranVariableFlagsAttr{});
     auto rightDeclOp = builder.create<hlfir::DeclareOp>(
         loc, recipe.getCopyRegion().getArgument(1), llvm::StringRef{}, shape,
-        llvm::ArrayRef<mlir::Value>{}, fir::FortranVariableFlagsAttr{});
+        llvm::ArrayRef<mlir::Value>{}, /*dummy_scope=*/nullptr,
+        fir::FortranVariableFlagsAttr{});
 
     hlfir::DesignateOp::Subscripts triplets =
         getSubscriptsFromArgs(recipe.getCopyRegion().getArguments());
@@ -975,7 +979,8 @@ static mlir::Value genReductionInitRegion(fir::FirOpBuilder &builder,
     mlir::Value alloca = builder.create<fir::AllocaOp>(loc, ty);
     auto declareOp = builder.create<hlfir::DeclareOp>(
         loc, alloca, accReductionInitName, /*shape=*/nullptr,
-        llvm::ArrayRef<mlir::Value>{}, fir::FortranVariableFlagsAttr{});
+        llvm::ArrayRef<mlir::Value>{}, /*dummy_scope=*/nullptr,
+        fir::FortranVariableFlagsAttr{});
     builder.create<fir::StoreOp>(loc, builder.createConvert(loc, ty, initValue),
                                  declareOp.getBase());
     return declareOp.getBase();
@@ -991,7 +996,8 @@ static mlir::Value genReductionInitRegion(fir::FirOpBuilder &builder,
           loc, seqTy, /*typeparams=*/mlir::ValueRange{}, extents);
       auto declareOp = builder.create<hlfir::DeclareOp>(
           loc, alloca, accReductionInitName, shape,
-          llvm::ArrayRef<mlir::Value>{}, fir::FortranVariableFlagsAttr{});
+          llvm::ArrayRef<mlir::Value>{}, /*dummy_scope=*/nullptr,
+          fir::FortranVariableFlagsAttr{});
       mlir::Type idxTy = builder.getIndexType();
       mlir::Type refTy = fir::ReferenceType::get(seqTy.getEleTy());
       llvm::SmallVector<fir::DoLoopOp> loops;
@@ -1143,10 +1149,10 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc,
                                    recipe.getCombinerRegion().getArguments());
       auto v1DeclareOp = builder.create<hlfir::DeclareOp>(
           loc, value1, llvm::StringRef{}, shape, llvm::ArrayRef<mlir::Value>{},
-          fir::FortranVariableFlagsAttr{});
+          /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{});
       auto v2DeclareOp = builder.create<hlfir::DeclareOp>(
           loc, value2, llvm::StringRef{}, shape, llvm::ArrayRef<mlir::Value>{},
-          fir::FortranVariableFlagsAttr{});
+          /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{});
       hlfir::DesignateOp::Subscripts triplets = getTripletsFromArgs(recipe);
 
       llvm::SmallVector<mlir::Value> lenParamsLeft;
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 79525d6dfe7a..0ea87314d571 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -555,9 +555,16 @@ bool ClauseProcessor::processCopyin() const {
   // synchronize threads and avoid data races on propagation master's thread
   // values of threadprivate variables to local instances of that variables of
   // all other implicit threads.
+
+  // All copies are inserted at either "insPt" (i.e. immediately before it),
+  // or at some earlier point (as determined by "copyHostAssociateVar").
+  // Unless the insertion point is given to "copyHostAssociateVar" explicitly,
+  // it will not restore the builder's insertion point. Since the copies may be
+  // inserted in any order (not following the execution order), make sure the
+  // barrier is inserted following all of them.
+  firOpBuilder.restoreInsertionPoint(insPt);
   if (hasCopyin)
     firOpBuilder.create<mlir::omp::BarrierOp>(converter.getCurrentLocation());
-  firOpBuilder.restoreInsertionPoint(insPt);
   return hasCopyin;
 }
 
@@ -650,12 +657,12 @@ createCopyFunc(mlir::Location loc, Fortran::lower::AbstractConverter &converter,
           builder.createIntegerConstant(loc, builder.getIndexType(), extent));
     shape = builder.create<fir::ShapeOp>(loc, extents);
   }
-  auto declDst = builder.create<hlfir::DeclareOp>(loc, funcOp.getArgument(0),
-                                                  copyFuncName + "_dst", shape,
-                                                  typeparams, attrs);
-  auto declSrc = builder.create<hlfir::DeclareOp>(loc, funcOp.getArgument(1),
-                                                  copyFuncName + "_src", shape,
-                                                  typeparams, attrs);
+  auto declDst = builder.create<hlfir::DeclareOp>(
+      loc, funcOp.getArgument(0), copyFuncName + "_dst", shape, typeparams,
+      /*dummy_scope=*/nullptr, attrs);
+  auto declSrc = builder.create<hlfir::DeclareOp>(
+      loc, funcOp.getArgument(1), copyFuncName + "_src", shape, typeparams,
+      /*dummy_scope=*/nullptr, attrs);
   converter.copyVar(loc, declDst.getBase(), declSrc.getBase());
   builder.create<mlir::func::ReturnOp>(loc);
   return funcOp;
@@ -807,30 +814,6 @@ bool ClauseProcessor::processLink(
       });
 }
 
-mlir::omp::MapInfoOp
-createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc,
-                mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name,
-                llvm::ArrayRef<mlir::Value> bounds,
-                llvm::ArrayRef<mlir::Value> members, uint64_t mapType,
-                mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy,
-                bool isVal) {
-  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(baseAddr.getType())) {
-    baseAddr = builder.create<fir::BoxAddrOp>(loc, baseAddr);
-    retTy = baseAddr.getType();
-  }
-
-  mlir::TypeAttr varType = mlir::TypeAttr::get(
-      llvm::cast<mlir::omp::PointerLikeType>(retTy).getElementType());
-
-  mlir::omp::MapInfoOp op = builder.create<mlir::omp::MapInfoOp>(
-      loc, retTy, baseAddr, varType, varPtrPtr, members, bounds,
-      builder.getIntegerAttr(builder.getIntegerType(64, false), mapType),
-      builder.getAttr<mlir::omp::VariableCaptureKindAttr>(mapCaptureType),
-      builder.getStringAttr(name));
-
-  return op;
-}
-
 bool ClauseProcessor::processMap(
     mlir::Location currentLocation, Fortran::lower::StatementContext &stmtCtx,
     mlir::omp::MapClauseOps &result,
@@ -838,7 +821,17 @@ bool ClauseProcessor::processMap(
     llvm::SmallVectorImpl<mlir::Location> *mapSymLocs,
     llvm::SmallVectorImpl<mlir::Type> *mapSymTypes) const {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-  return findRepeatableClause<omp::clause::Map>(
+  // We always require tracking of symbols, even if the caller does not,
+  // so we create an optionally used local set of symbols when the mapSyms
+  // argument is not present.
+  llvm::SmallVector<const Fortran::semantics::Symbol *> localMapSyms;
+  llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *ptrMapSyms =
+      mapSyms ? mapSyms : &localMapSyms;
+  std::map<const Fortran::semantics::Symbol *,
+           llvm::SmallVector<OmpMapMemberIndicesData>>
+      parentMemberIndices;
+
+  bool clauseFound = findRepeatableClause<omp::clause::Map>(
       [&](const omp::clause::Map &clause,
           const Fortran::parser::CharBlock &source) {
         using Map = omp::clause::Map;
@@ -903,24 +896,33 @@ bool ClauseProcessor::processMap(
           // Explicit map captures are captured ByRef by default,
           // optimisation passes may alter this to ByCopy or other capture
           // types to optimise
-          mlir::Value mapOp = createMapInfoOp(
-              firOpBuilder, clauseLocation, symAddr, mlir::Value{},
-              asFortran.str(), bounds, {},
+          mlir::omp::MapInfoOp mapOp = createMapInfoOp(
+              firOpBuilder, clauseLocation, symAddr,
+              /*varPtrPtr=*/mlir::Value{}, asFortran.str(), bounds,
+              /*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{},
               static_cast<
                   std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>(
                   mapTypeBits),
               mlir::omp::VariableCaptureKind::ByRef, symAddr.getType());
 
-          result.mapVars.push_back(mapOp);
-
-          if (mapSyms)
-            mapSyms->push_back(object.id());
-          if (mapSymLocs)
-            mapSymLocs->push_back(symAddr.getLoc());
-          if (mapSymTypes)
-            mapSymTypes->push_back(symAddr.getType());
+          if (object.id()->owner().IsDerivedType()) {
+            addChildIndexAndMapToParent(object, parentMemberIndices, mapOp,
+                                        semaCtx);
+          } else {
+            result.mapVars.push_back(mapOp);
+            ptrMapSyms->push_back(object.id());
+            if (mapSymTypes)
+              mapSymTypes->push_back(symAddr.getType());
+            if (mapSymLocs)
+              mapSymLocs->push_back(symAddr.getLoc());
+          }
         }
       });
+
+  insertChildMapInfoIntoParent(converter, parentMemberIndices, result.mapVars,
+                               *ptrMapSyms, mapSymTypes, mapSymLocs);
+
+  return clauseFound;
 }
 
 bool ClauseProcessor::processReduction(
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index 78c148ab0216..54ffcd0c964b 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -185,7 +185,12 @@ template <typename T>
 bool ClauseProcessor::processMotionClauses(
     Fortran::lower::StatementContext &stmtCtx,
     mlir::omp::MapClauseOps &result) {
-  return findRepeatableClause<T>(
+  std::map<const Fortran::semantics::Symbol *,
+           llvm::SmallVector<OmpMapMemberIndicesData>>
+      parentMemberIndices;
+  llvm::SmallVector<const Fortran::semantics::Symbol *> mapSymbols;
+
+  bool clauseFound = findRepeatableClause<T>(
       [&](const T &clause, const Fortran::parser::CharBlock &source) {
         mlir::Location clauseLocation = converter.genLocation(source);
         fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
@@ -203,6 +208,7 @@ bool ClauseProcessor::processMotionClauses(
         for (const omp::Object &object : objects) {
           llvm::SmallVector<mlir::Value> bounds;
           std::stringstream asFortran;
+
           Fortran::lower::AddrAndBoundsInfo info =
               Fortran::lower::gatherDataOperandAddrAndBounds<
                   mlir::omp::MapBoundsOp, mlir::omp::MapBoundsType>(
@@ -218,17 +224,29 @@ bool ClauseProcessor::processMotionClauses(
           // Explicit map captures are captured ByRef by default,
           // optimisation passes may alter this to ByCopy or other capture
           // types to optimise
-          mlir::Value mapOp = createMapInfoOp(
-              firOpBuilder, clauseLocation, symAddr, mlir::Value{},
-              asFortran.str(), bounds, {},
+          mlir::omp::MapInfoOp mapOp = createMapInfoOp(
+              firOpBuilder, clauseLocation, symAddr,
+              /*varPtrPtr=*/mlir::Value{}, asFortran.str(), bounds,
+              /*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{},
               static_cast<
                   std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>(
                   mapTypeBits),
               mlir::omp::VariableCaptureKind::ByRef, symAddr.getType());
 
-          result.mapVars.push_back(mapOp);
+          if (object.id()->owner().IsDerivedType()) {
+            addChildIndexAndMapToParent(object, parentMemberIndices, mapOp,
+                                        semaCtx);
+          } else {
+            result.mapVars.push_back(mapOp);
+            mapSymbols.push_back(object.id());
+          }
         }
       });
+
+  insertChildMapInfoIntoParent(converter, parentMemberIndices, result.mapVars,
+                               mapSymbols,
+                               /*mapSymTypes=*/nullptr, /*mapSymLocs=*/nullptr);
+  return clauseFound;
 }
 
 template <typename... Ts>
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 97337cfc08c7..87370c92964a 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -1227,4 +1227,27 @@ List<Clause> makeClauses(const parser::OmpClauseList &clauses,
     return makeClause(s, semaCtx);
   });
 }
+
+bool transferLocations(const List<Clause> &from, List<Clause> &to) {
+  bool allDone = true;
+
+  for (Clause &clause : to) {
+    if (!clause.source.empty())
+      continue;
+    auto found =
+        llvm::find_if(from, [&](const Clause &c) { return c.id == clause.id; });
+    // This is not completely accurate, but should be good enough for now.
+    // It can be improved in the future if necessary, but in cases of
+    // synthesized clauses getting accurate location may be impossible.
+    if (found != from.end()) {
+      clause.source = found->source;
+    } else {
+      // Found a clause that won't have "source".
+      allDone = false;
+    }
+  }
+
+  return allDone;
+}
+
 } // namespace Fortran::lower::omp
diff --git a/flang/lib/Lower/OpenMP/Clauses.h b/flang/lib/Lower/OpenMP/Clauses.h
index 3e776425c733..ca610c652896 100644
--- a/flang/lib/Lower/OpenMP/Clauses.h
+++ b/flang/lib/Lower/OpenMP/Clauses.h
@@ -23,11 +23,15 @@
 
 namespace Fortran::lower::omp {
 using namespace Fortran;
-using SomeType = evaluate::SomeType;
 using SomeExpr = semantics::SomeExpr;
 using MaybeExpr = semantics::MaybeExpr;
 
-using TypeTy = SomeType;
+// evaluate::SomeType doesn't provide == operation. It's not really used in
+// flang's clauses so far, so a trivial implementation is sufficient.
+struct TypeTy : public evaluate::SomeType {
+  bool operator==(const TypeTy &t) const { return true; }
+};
+
 using IdTy = semantics::Symbol *;
 using ExprTy = SomeExpr;
 
@@ -222,6 +226,8 @@ using When = tomp::clause::WhenT<TypeTy, IdTy, ExprTy>;
 using Write = tomp::clause::WriteT<TypeTy, IdTy, ExprTy>;
 } // namespace clause
 
+using tomp::type::operator==;
+
 struct CancellationConstructType {
   using EmptyTrait = std::true_type;
 };
@@ -244,13 +250,16 @@ using ClauseBase = tomp::ClauseT<TypeTy, IdTy, ExprTy,
                                  MemoryOrder, Threadprivate>;
 
 struct Clause : public ClauseBase {
+  Clause(ClauseBase &&base, const parser::CharBlock source = {})
+      : ClauseBase(std::move(base)), source(source) {}
+  // "source" will be ignored by tomp::type::operator==.
   parser::CharBlock source;
 };
 
 template <typename Specific>
 Clause makeClause(llvm::omp::Clause id, Specific &&specific,
                   parser::CharBlock source = {}) {
-  return Clause{{id, specific}, source};
+  return Clause(typename Clause::BaseT{id, specific}, source);
 }
 
 Clause makeClause(const Fortran::parser::OmpClause &cls,
@@ -258,6 +267,8 @@ Clause makeClause(const Fortran::parser::OmpClause &cls,
 
 List<Clause> makeClauses(const parser::OmpClauseList &clauses,
                          semantics::SemanticsContext &semaCtx);
+
+bool transferLocations(const List<Clause> &from, List<Clause> &to);
 } // namespace Fortran::lower::omp
 
 #endif // FORTRAN_LOWER_OPENMP_CLAUSES_H
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index f63a774fa44b..82d8d8dd98ea 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -15,6 +15,7 @@
 #include "Utils.h"
 #include "flang/Lower/PFTBuilder.h"
 #include "flang/Lower/SymbolMap.h"
+#include "flang/Optimizer/Builder/HLFIRTools.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Semantics/tools.h"
 
@@ -27,15 +28,20 @@ void DataSharingProcessor::processStep1(
     llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *privateSyms) {
   collectSymbolsForPrivatization();
   collectDefaultSymbols();
+  collectImplicitSymbols();
   privatize(clauseOps, privateSyms);
   defaultPrivatize(clauseOps, privateSyms);
+  implicitPrivatize(clauseOps, privateSyms);
   insertBarrier();
 }
 
 void DataSharingProcessor::processStep2(mlir::Operation *op, bool isLoop) {
-  insPt = firOpBuilder.saveInsertionPoint();
-  copyLastPrivatize(op);
-  firOpBuilder.restoreInsertionPoint(insPt);
+  // 'sections' lastprivate is handled by genOMP()
+  if (!mlir::isa<mlir::omp::SectionsOp>(op)) {
+    insPt = firOpBuilder.saveInsertionPoint();
+    copyLastPrivatize(op);
+    firOpBuilder.restoreInsertionPoint(insPt);
+  }
 
   if (isLoop) {
     // push deallocs out of the loop
@@ -140,6 +146,10 @@ void DataSharingProcessor::collectSymbolsForPrivatization() {
 }
 
 bool DataSharingProcessor::needBarrier() {
+  // Emit implicit barrier to synchronize threads and avoid data races on
+  // initialization of firstprivate variables and post-update of lastprivate
+  // variables.
+  // Emit implicit barrier for linear clause. Maybe on somewhere else.
   for (const Fortran::semantics::Symbol *sym : privatizedSymbols) {
     if (sym->test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate) &&
         sym->test(Fortran::semantics::Symbol::Flag::OmpLastPrivate))
@@ -149,13 +159,6 @@ bool DataSharingProcessor::needBarrier() {
 }
 
 void DataSharingProcessor::insertBarrier() {
-  // Emit implicit barrier to synchronize threads and avoid data races on
-  // initialization of firstprivate variables and post-update of lastprivate
-  // variables.
-  // FIXME: Emit barrier for lastprivate clause when 'sections' directive has
-  // 'nowait' clause. Otherwise, emit barrier when 'sections' directive has
-  // both firstprivate and lastprivate clause.
-  // Emit implicit barrier for linear clause. Maybe on somewhere else.
   if (needBarrier())
     firOpBuilder.create<mlir::omp::BarrierOp>(converter.getCurrentLocation());
 }
@@ -173,76 +176,7 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) {
     if (clause.id != llvm::omp::OMPC_lastprivate)
       continue;
     // TODO: Add lastprivate support for simd construct
-    if (mlir::isa<mlir::omp::SectionOp>(op)) {
-      if (&eval == &eval.parentConstruct->getLastNestedEvaluation()) {
-        // For `omp.sections`, lastprivatized variables occur in
-        // lexically final `omp.section` operation. The following FIR
-        // shall be generated for the same:
-        //
-        // omp.sections lastprivate(...) {
-        //  omp.section {...}
-        //  omp.section {...}
-        //  omp.section {
-        //      fir.allocate for `private`/`firstprivate`
-        //      <More operations here>
-        //      fir.if %true {
-        //          ^%lpv_update_blk
-        //      }
-        //  }
-        // }
-        //
-        // To keep code consistency while handling privatization
-        // through this control flow, add a `fir.if` operation
-        // that always evaluates to true, in order to create
-        // a dedicated sub-region in `omp.section` where
-        // lastprivate FIR can reside. Later canonicalizations
-        // will optimize away this operation.
-        if (!eval.lowerAsUnstructured()) {
-          auto ifOp = firOpBuilder.create<fir::IfOp>(
-              op->getLoc(),
-              firOpBuilder.createIntegerConstant(
-                  op->getLoc(), firOpBuilder.getIntegerType(1), 0x1),
-              /*else*/ false);
-          firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-
-          const Fortran::parser::OpenMPConstruct *parentOmpConstruct =
-              eval.parentConstruct->getIf<Fortran::parser::OpenMPConstruct>();
-          assert(parentOmpConstruct &&
-                 "Expected a valid enclosing OpenMP construct");
-          const Fortran::parser::OpenMPSectionsConstruct *sectionsConstruct =
-              std::get_if<Fortran::parser::OpenMPSectionsConstruct>(
-                  &parentOmpConstruct->u);
-          assert(sectionsConstruct &&
-                 "Expected an enclosing omp.sections construct");
-          const Fortran::parser::OmpClauseList &sectionsEndClauseList =
-              std::get<Fortran::parser::OmpClauseList>(
-                  std::get<Fortran::parser::OmpEndSectionsDirective>(
-                      sectionsConstruct->t)
-                      .t);
-          for (const Fortran::parser::OmpClause &otherClause :
-               sectionsEndClauseList.v)
-            if (std::get_if<Fortran::parser::OmpClause::Nowait>(&otherClause.u))
-              // Emit implicit barrier to synchronize threads and avoid data
-              // races on post-update of lastprivate variables when `nowait`
-              // clause is present.
-              firOpBuilder.create<mlir::omp::BarrierOp>(
-                  converter.getCurrentLocation());
-          firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-          lastPrivIP = firOpBuilder.saveInsertionPoint();
-          firOpBuilder.setInsertionPoint(ifOp);
-          insPt = firOpBuilder.saveInsertionPoint();
-        } else {
-          // Lastprivate operation is inserted at the end
-          // of the lexically last section in the sections
-          // construct
-          mlir::OpBuilder::InsertionGuard unstructuredSectionsGuard(
-              firOpBuilder);
-          mlir::Operation *lastOper = op->getRegion(0).back().getTerminator();
-          firOpBuilder.setInsertionPoint(lastOper);
-          lastPrivIP = firOpBuilder.saveInsertionPoint();
-        }
-      }
-    } else if (mlir::isa<mlir::omp::WsloopOp>(op)) {
+    if (mlir::isa<mlir::omp::WsloopOp>(op)) {
       // Update the original variable just before exiting the worksharing
       // loop. Conversion as follows:
       //
@@ -294,6 +228,8 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) {
       assert(loopIV && "loopIV was not set");
       firOpBuilder.create<fir::StoreOp>(loopOp.getLoc(), v, loopIV);
       lastPrivIP = firOpBuilder.saveInsertionPoint();
+    } else if (mlir::isa<mlir::omp::SectionsOp>(op)) {
+      // Already handled by genOMP()
     } else {
       TODO(converter.getCurrentLocation(),
            "lastprivate clause in constructs other than "
@@ -302,20 +238,117 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) {
   }
 }
 
+static const Fortran::parser::CharBlock *
+getSource(const Fortran::semantics::SemanticsContext &semaCtx,
+          const Fortran::lower::pft::Evaluation &eval) {
+  const Fortran::parser::CharBlock *source = nullptr;
+
+  auto ompConsVisit = [&](const Fortran::parser::OpenMPConstruct &x) {
+    std::visit(Fortran::common::visitors{
+                   [&](const Fortran::parser::OpenMPSectionsConstruct &x) {
+                     source = &std::get<0>(x.t).source;
+                   },
+                   [&](const Fortran::parser::OpenMPLoopConstruct &x) {
+                     source = &std::get<0>(x.t).source;
+                   },
+                   [&](const Fortran::parser::OpenMPBlockConstruct &x) {
+                     source = &std::get<0>(x.t).source;
+                   },
+                   [&](const Fortran::parser::OpenMPCriticalConstruct &x) {
+                     source = &std::get<0>(x.t).source;
+                   },
+                   [&](const Fortran::parser::OpenMPAtomicConstruct &x) {
+                     std::visit([&](const auto &x) { source = &x.source; },
+                                x.u);
+                   },
+                   [&](const auto &x) { source = &x.source; },
+               },
+               x.u);
+  };
+
+  eval.visit(Fortran::common::visitors{
+      [&](const Fortran::parser::OpenMPConstruct &x) { ompConsVisit(x); },
+      [&](const Fortran::parser::OpenMPDeclarativeConstruct &x) {
+        source = &x.source;
+      },
+      [&](const Fortran::parser::OmpEndLoopDirective &x) {
+        source = &x.source;
+      },
+      [&](const auto &x) {},
+  });
+
+  return source;
+}
+
+void DataSharingProcessor::collectSymbolsInNestedRegions(
+    Fortran::lower::pft::Evaluation &eval,
+    Fortran::semantics::Symbol::Flag flag,
+    llvm::SetVector<const Fortran::semantics::Symbol *>
+        &symbolsInNestedRegions) {
+  for (Fortran::lower::pft::Evaluation &nestedEval :
+       eval.getNestedEvaluations()) {
+    if (nestedEval.hasNestedEvaluations()) {
+      if (nestedEval.isConstruct())
+        // Recursively look for OpenMP constructs within `nestedEval`'s region
+        collectSymbolsInNestedRegions(nestedEval, flag, symbolsInNestedRegions);
+      else
+        converter.collectSymbolSet(nestedEval, symbolsInNestedRegions, flag,
+                                   /*collectSymbols=*/true,
+                                   /*collectHostAssociatedSymbols=*/false);
+    }
+  }
+}
+
+// Collect symbols to be default privatized in two steps.
+// In step 1, collect all symbols in `eval` that match `flag` into
+// `defaultSymbols`. In step 2, for nested constructs (if any), if and only if
+// the nested construct is an OpenMP construct, collect those nested
+// symbols skipping host associated symbols into `symbolsInNestedRegions`.
+// Later, in current context, all symbols in the set
+// `defaultSymbols` - `symbolsInNestedRegions` will be privatized.
 void DataSharingProcessor::collectSymbols(
-    Fortran::semantics::Symbol::Flag flag) {
-  converter.collectSymbolSet(eval, defaultSymbols, flag,
+    Fortran::semantics::Symbol::Flag flag,
+    llvm::SetVector<const Fortran::semantics::Symbol *> &symbols) {
+  // Collect all scopes associated with 'eval'.
+  llvm::SetVector<const Fortran::semantics::Scope *> clauseScopes;
+  std::function<void(const Fortran::semantics::Scope *)> collectScopes =
+      [&](const Fortran::semantics::Scope *scope) {
+        clauseScopes.insert(scope);
+        for (const Fortran::semantics::Scope &child : scope->children())
+          collectScopes(&child);
+      };
+  const Fortran::parser::CharBlock *source =
+      clauses.empty() ? getSource(semaCtx, eval) : &clauses.front().source;
+  const Fortran::semantics::Scope *curScope = nullptr;
+  if (source && !source->empty()) {
+    curScope = &semaCtx.FindScope(*source);
+    collectScopes(curScope);
+  }
+  // Collect all symbols referenced in the evaluation being processed,
+  // that matches 'flag'.
+  llvm::SetVector<const Fortran::semantics::Symbol *> allSymbols;
+  converter.collectSymbolSet(eval, allSymbols, flag,
                              /*collectSymbols=*/true,
                              /*collectHostAssociatedSymbols=*/true);
-  for (Fortran::lower::pft::Evaluation &e : eval.getNestedEvaluations()) {
-    if (e.hasNestedEvaluations())
-      converter.collectSymbolSet(e, symbolsInNestedRegions, flag,
-                                 /*collectSymbols=*/true,
-                                 /*collectHostAssociatedSymbols=*/false);
-    else
-      converter.collectSymbolSet(e, symbolsInParentRegions, flag,
-                                 /*collectSymbols=*/false,
-                                 /*collectHostAssociatedSymbols=*/true);
+  llvm::SetVector<const Fortran::semantics::Symbol *> symbolsInNestedRegions;
+  collectSymbolsInNestedRegions(eval, flag, symbolsInNestedRegions);
+  // Filter-out symbols that must not be privatized.
+  bool collectImplicit = flag == Fortran::semantics::Symbol::Flag::OmpImplicit;
+  auto isPrivatizable = [](const Fortran::semantics::Symbol &sym) -> bool {
+    return !Fortran::semantics::IsProcedure(sym) &&
+           !sym.GetUltimate().has<Fortran::semantics::DerivedTypeDetails>() &&
+           !sym.GetUltimate().has<Fortran::semantics::NamelistDetails>() &&
+           !Fortran::semantics::IsImpliedDoIndex(sym.GetUltimate());
+  };
+  for (const auto *sym : allSymbols) {
+    assert(curScope && "couldn't find current scope");
+    if (isPrivatizable(*sym) && !symbolsInNestedRegions.contains(sym) &&
+        !privatizedSymbols.contains(sym) &&
+        !sym->test(Fortran::semantics::Symbol::Flag::OmpPreDetermined) &&
+        (collectImplicit ||
+         !sym->test(Fortran::semantics::Symbol::Flag::OmpImplicit)) &&
+        clauseScopes.contains(&sym->owner()))
+      symbols.insert(sym);
   }
 }
 
@@ -325,13 +358,22 @@ void DataSharingProcessor::collectDefaultSymbols() {
     if (const auto *defaultClause =
             std::get_if<omp::clause::Default>(&clause.u)) {
       if (defaultClause->v == DataSharingAttribute::Private)
-        collectSymbols(Fortran::semantics::Symbol::Flag::OmpPrivate);
+        collectSymbols(Fortran::semantics::Symbol::Flag::OmpPrivate,
+                       defaultSymbols);
       else if (defaultClause->v == DataSharingAttribute::Firstprivate)
-        collectSymbols(Fortran::semantics::Symbol::Flag::OmpFirstPrivate);
+        collectSymbols(Fortran::semantics::Symbol::Flag::OmpFirstPrivate,
+                       defaultSymbols);
     }
   }
 }
 
+void DataSharingProcessor::collectImplicitSymbols() {
+  // There will be no implicit symbols when a default clause is present.
+  if (defaultSymbols.empty())
+    collectSymbols(Fortran::semantics::Symbol::Flag::OmpImplicit,
+                   implicitSymbols);
+}
+
 void DataSharingProcessor::privatize(
     mlir::omp::PrivateClauseOps *clauseOps,
     llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *privateSyms) {
@@ -361,16 +403,15 @@ void DataSharingProcessor::copyLastPrivatize(mlir::Operation *op) {
 void DataSharingProcessor::defaultPrivatize(
     mlir::omp::PrivateClauseOps *clauseOps,
     llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *privateSyms) {
-  for (const Fortran::semantics::Symbol *sym : defaultSymbols) {
-    if (!Fortran::semantics::IsProcedure(*sym) &&
-        !sym->GetUltimate().has<Fortran::semantics::DerivedTypeDetails>() &&
-        !sym->GetUltimate().has<Fortran::semantics::NamelistDetails>() &&
-        !Fortran::semantics::IsImpliedDoIndex(sym->GetUltimate()) &&
-        !symbolsInNestedRegions.contains(sym) &&
-        !symbolsInParentRegions.contains(sym) &&
-        !privatizedSymbols.contains(sym))
-      doPrivatize(sym, clauseOps, privateSyms);
-  }
+  for (const Fortran::semantics::Symbol *sym : defaultSymbols)
+    doPrivatize(sym, clauseOps, privateSyms);
+}
+
+void DataSharingProcessor::implicitPrivatize(
+    mlir::omp::PrivateClauseOps *clauseOps,
+    llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *privateSyms) {
+  for (const Fortran::semantics::Symbol *sym : implicitSymbols)
+    doPrivatize(sym, clauseOps, privateSyms);
 }
 
 void DataSharingProcessor::doPrivatize(
@@ -422,8 +463,16 @@ void DataSharingProcessor::doPrivatize(
           &allocRegion, /*insertPt=*/{}, symType, symLoc);
 
       firOpBuilder.setInsertionPointToEnd(allocEntryBlock);
-      symTable->addSymbol(*sym,
-                          fir::substBase(symExV, allocRegion.getArgument(0)));
+
+      fir::ExtendedValue localExV =
+          hlfir::translateToExtendedValue(
+              symLoc, firOpBuilder, hlfir::Entity{allocRegion.getArgument(0)},
+              /*contiguousHint=*/
+              Fortran::evaluate::IsSimplyContiguous(
+                  *sym, converter.getFoldingContext()))
+              .first;
+
+      symTable->addSymbol(*sym, localExV);
       symTable->pushScope();
       cloneSymbol(sym);
       firOpBuilder.create<mlir::omp::YieldOp>(
@@ -440,12 +489,23 @@ void DataSharingProcessor::doPrivatize(
       mlir::Block *copyEntryBlock = firOpBuilder.createBlock(
           &copyRegion, /*insertPt=*/{}, {symType, symType}, {symLoc, symLoc});
       firOpBuilder.setInsertionPointToEnd(copyEntryBlock);
-      symTable->addSymbol(*sym,
-                          fir::substBase(symExV, copyRegion.getArgument(0)),
-                          /*force=*/true);
+
+      auto addSymbol = [&](unsigned argIdx, bool force = false) {
+        symExV.match(
+            [&](const fir::MutableBoxValue &box) {
+              symTable->addSymbol(
+                  *sym, fir::substBase(box, copyRegion.getArgument(argIdx)),
+                  force);
+            },
+            [&](const auto &box) {
+              symTable->addSymbol(*sym, copyRegion.getArgument(argIdx), force);
+            });
+      };
+
+      addSymbol(0, true);
       symTable->pushScope();
-      symTable->addSymbol(*sym,
-                          fir::substBase(symExV, copyRegion.getArgument(1)));
+      addSymbol(1);
+
       auto ip = firOpBuilder.saveInsertionPoint();
       copyFirstPrivateSymbol(sym, &ip);
 
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
index f709a64211a8..ec6848f7bba3 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
@@ -39,11 +39,11 @@ private:
   // Symbols in private, firstprivate, and/or lastprivate clauses.
   llvm::SetVector<const Fortran::semantics::Symbol *> privatizedSymbols;
   llvm::SetVector<const Fortran::semantics::Symbol *> defaultSymbols;
-  llvm::SetVector<const Fortran::semantics::Symbol *> symbolsInNestedRegions;
-  llvm::SetVector<const Fortran::semantics::Symbol *> symbolsInParentRegions;
+  llvm::SetVector<const Fortran::semantics::Symbol *> implicitSymbols;
   llvm::DenseMap<const Fortran::semantics::Symbol *, mlir::omp::PrivateClauseOp>
       symToPrivatizer;
   Fortran::lower::AbstractConverter &converter;
+  Fortran::semantics::SemanticsContext &semaCtx;
   fir::FirOpBuilder &firOpBuilder;
   omp::List<omp::Clause> clauses;
   Fortran::lower::pft::Evaluation &eval;
@@ -51,19 +51,30 @@ private:
   Fortran::lower::SymMap *symTable;
 
   bool needBarrier();
-  void collectSymbols(Fortran::semantics::Symbol::Flag flag);
+  void
+  collectSymbols(Fortran::semantics::Symbol::Flag flag,
+                 llvm::SetVector<const Fortran::semantics::Symbol *> &symbols);
+  void collectSymbolsInNestedRegions(
+      Fortran::lower::pft::Evaluation &eval,
+      Fortran::semantics::Symbol::Flag flag,
+      llvm::SetVector<const Fortran::semantics::Symbol *>
+          &symbolsInNestedRegions);
   void collectOmpObjectListSymbol(
       const omp::ObjectList &objects,
       llvm::SetVector<const Fortran::semantics::Symbol *> &symbolSet);
   void collectSymbolsForPrivatization();
   void insertBarrier();
   void collectDefaultSymbols();
+  void collectImplicitSymbols();
   void privatize(
       mlir::omp::PrivateClauseOps *clauseOps,
       llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *privateSyms);
   void defaultPrivatize(
       mlir::omp::PrivateClauseOps *clauseOps,
       llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *privateSyms);
+  void implicitPrivatize(
+      mlir::omp::PrivateClauseOps *clauseOps,
+      llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *privateSyms);
   void doPrivatize(
       const Fortran::semantics::Symbol *sym,
       mlir::omp::PrivateClauseOps *clauseOps,
@@ -85,7 +96,7 @@ public:
                        Fortran::lower::pft::Evaluation &eval,
                        bool useDelayedPrivatization = false,
                        Fortran::lower::SymMap *symTable = nullptr)
-      : hasLastPrivateOp(false), converter(converter),
+      : hasLastPrivateOp(false), converter(converter), semaCtx(semaCtx),
         firOpBuilder(converter.getFirOpBuilder()), clauses(clauses), eval(eval),
         useDelayedPrivatization(useDelayedPrivatization), symTable(symTable) {}
 
diff --git a/flang/lib/Lower/OpenMP/Decomposer.cpp b/flang/lib/Lower/OpenMP/Decomposer.cpp
new file mode 100644
index 000000000000..e6897cb81e94
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/Decomposer.cpp
@@ -0,0 +1,126 @@
+//===-- Decomposer.cpp -- Compound directive decomposition ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#include "Decomposer.h"
+
+#include "Clauses.h"
+#include "Utils.h"
+#include "flang/Lower/PFTBuilder.h"
+#include "flang/Semantics/semantics.h"
+#include "flang/Tools/CrossToolHelpers.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Frontend/OpenMP/ClauseT.h"
+#include "llvm/Frontend/OpenMP/ConstructCompositionT.h"
+#include "llvm/Frontend/OpenMP/ConstructDecompositionT.h"
+#include "llvm/Frontend/OpenMP/OMP.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <optional>
+#include <utility>
+#include <variant>
+
+using namespace Fortran;
+
+namespace {
+using namespace Fortran::lower::omp;
+
+struct ConstructDecomposition {
+  ConstructDecomposition(mlir::ModuleOp modOp,
+                         semantics::SemanticsContext &semaCtx,
+                         lower::pft::Evaluation &ev,
+                         llvm::omp::Directive compound,
+                         const List<Clause> &clauses)
+      : semaCtx(semaCtx), mod(modOp), eval(ev) {
+    tomp::ConstructDecompositionT decompose(getOpenMPVersionAttribute(modOp),
+                                            *this, compound,
+                                            llvm::ArrayRef(clauses));
+    output = std::move(decompose.output);
+  }
+
+  // Given an object, return its base object if one exists.
+  std::optional<Object> getBaseObject(const Object &object) {
+    return lower::omp::getBaseObject(object, semaCtx);
+  }
+
+  // Return the iteration variable of the associated loop if any.
+  std::optional<Object> getLoopIterVar() {
+    if (semantics::Symbol *symbol = getIterationVariableSymbol(eval))
+      return Object{symbol, /*designator=*/{}};
+    return std::nullopt;
+  }
+
+  semantics::SemanticsContext &semaCtx;
+  mlir::ModuleOp mod;
+  lower::pft::Evaluation &eval;
+  List<UnitConstruct> output;
+};
+} // namespace
+
+static UnitConstruct mergeConstructs(uint32_t version,
+                                     llvm::ArrayRef<UnitConstruct> units) {
+  tomp::ConstructCompositionT compose(version, units);
+  return compose.merged;
+}
+
+namespace Fortran::lower::omp {
+LLVM_DUMP_METHOD llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
+                                               const UnitConstruct &uc) {
+  os << llvm::omp::getOpenMPDirectiveName(uc.id);
+  for (auto [index, clause] : llvm::enumerate(uc.clauses)) {
+    os << (index == 0 ? '\t' : ' ');
+    os << llvm::omp::getOpenMPClauseName(clause.id);
+  }
+  return os;
+}
+
+ConstructQueue buildConstructQueue(
+    mlir::ModuleOp modOp, Fortran::semantics::SemanticsContext &semaCtx,
+    Fortran::lower::pft::Evaluation &eval, const parser::CharBlock &source,
+    llvm::omp::Directive compound, const List<Clause> &clauses) {
+
+  List<UnitConstruct> constructs;
+
+  ConstructDecomposition decompose(modOp, semaCtx, eval, compound, clauses);
+  assert(!decompose.output.empty() && "Construct decomposition failed");
+
+  llvm::SmallVector<llvm::omp::Directive> loweringUnits;
+  std::ignore =
+      llvm::omp::getLeafOrCompositeConstructs(compound, loweringUnits);
+  uint32_t version = getOpenMPVersionAttribute(modOp);
+
+  int leafIndex = 0;
+  for (llvm::omp::Directive dir_id : loweringUnits) {
+    llvm::ArrayRef<llvm::omp::Directive> leafsOrSelf =
+        llvm::omp::getLeafConstructsOrSelf(dir_id);
+    size_t numLeafs = leafsOrSelf.size();
+
+    llvm::ArrayRef<UnitConstruct> toMerge{&decompose.output[leafIndex],
+                                          numLeafs};
+    auto &uc = constructs.emplace_back(mergeConstructs(version, toMerge));
+
+    if (!transferLocations(clauses, uc.clauses)) {
+      // If some clauses are left without source information, use the
+      // directive's source.
+      for (auto &clause : uc.clauses) {
+        if (clause.source.empty())
+          clause.source = source;
+      }
+    }
+    leafIndex += numLeafs;
+  }
+
+  return constructs;
+}
+} // namespace Fortran::lower::omp
diff --git a/flang/lib/Lower/OpenMP/Decomposer.h b/flang/lib/Lower/OpenMP/Decomposer.h
new file mode 100644
index 000000000000..f42d8f5c1740
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/Decomposer.h
@@ -0,0 +1,51 @@
+//===-- Decomposer.h -- Compound directive decomposition ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef FORTRAN_LOWER_OPENMP_DECOMPOSER_H
+#define FORTRAN_LOWER_OPENMP_DECOMPOSER_H
+
+#include "Clauses.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "llvm/Frontend/OpenMP/ConstructCompositionT.h"
+#include "llvm/Frontend/OpenMP/ConstructDecompositionT.h"
+#include "llvm/Frontend/OpenMP/OMP.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class raw_ostream;
+}
+
+namespace Fortran {
+namespace semantics {
+class SemanticsContext;
+}
+namespace lower::pft {
+struct Evaluation;
+}
+} // namespace Fortran
+
+namespace Fortran::lower::omp {
+using UnitConstruct = tomp::DirectiveWithClauses<lower::omp::Clause>;
+using ConstructQueue = List<UnitConstruct>;
+
+LLVM_DUMP_METHOD llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
+                                               const UnitConstruct &uc);
+
+// Given a potentially compound construct with a list of clauses that
+// apply to it, break it up into individual sub-constructs each with
+// the subset of applicable clauses (plus implicit clauses, if any).
+// From that create a work queue where each work item corresponds to
+// the sub-construct with its clauses.
+ConstructQueue buildConstructQueue(mlir::ModuleOp modOp,
+                                   semantics::SemanticsContext &semaCtx,
+                                   lower::pft::Evaluation &eval,
+                                   const parser::CharBlock &source,
+                                   llvm::omp::Directive compound,
+                                   const List<Clause> &clauses);
+} // namespace Fortran::lower::omp
+
+#endif // FORTRAN_LOWER_OPENMP_DECOMPOSER_H
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index c54f100b73da..f9ba2fcbbca7 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -15,6 +15,7 @@
 #include "ClauseProcessor.h"
 #include "Clauses.h"
 #include "DataSharingProcessor.h"
+#include "Decomposer.h"
 #include "DirectivesCommon.h"
 #include "ReductionProcessor.h"
 #include "Utils.h"
@@ -44,6 +45,13 @@ using namespace Fortran::lower::omp;
 // Code generation helper functions
 //===----------------------------------------------------------------------===//
 
+static void genOMPDispatch(Fortran::lower::AbstractConverter &converter,
+                           Fortran::lower::SymMap &symTable,
+                           Fortran::semantics::SemanticsContext &semaCtx,
+                           Fortran::lower::pft::Evaluation &eval,
+                           mlir::Location loc, const ConstructQueue &queue,
+                           ConstructQueue::iterator item);
+
 static Fortran::lower::pft::Evaluation *
 getCollapsedLoopEval(Fortran::lower::pft::Evaluation &eval, int collapseValue) {
   // Return the Evaluation of the innermost collapsed loop, or the current one
@@ -460,81 +468,6 @@ markDeclareTarget(mlir::Operation *op,
   declareTargetOp.setDeclareTarget(deviceType, captureClause);
 }
 
-/// Split a combined directive into an outer leaf directive and the (possibly
-/// combined) rest of the combined directive. Composite directives and
-/// non-compound directives are not split, in which case it will return the
-/// input directive as its first output and an empty value as its second output.
-static std::pair<llvm::omp::Directive, std::optional<llvm::omp::Directive>>
-splitCombinedDirective(llvm::omp::Directive dir) {
-  using D = llvm::omp::Directive;
-  switch (dir) {
-  case D::OMPD_masked_taskloop:
-    return {D::OMPD_masked, D::OMPD_taskloop};
-  case D::OMPD_masked_taskloop_simd:
-    return {D::OMPD_masked, D::OMPD_taskloop_simd};
-  case D::OMPD_master_taskloop:
-    return {D::OMPD_master, D::OMPD_taskloop};
-  case D::OMPD_master_taskloop_simd:
-    return {D::OMPD_master, D::OMPD_taskloop_simd};
-  case D::OMPD_parallel_do:
-    return {D::OMPD_parallel, D::OMPD_do};
-  case D::OMPD_parallel_do_simd:
-    return {D::OMPD_parallel, D::OMPD_do_simd};
-  case D::OMPD_parallel_masked:
-    return {D::OMPD_parallel, D::OMPD_masked};
-  case D::OMPD_parallel_masked_taskloop:
-    return {D::OMPD_parallel, D::OMPD_masked_taskloop};
-  case D::OMPD_parallel_masked_taskloop_simd:
-    return {D::OMPD_parallel, D::OMPD_masked_taskloop_simd};
-  case D::OMPD_parallel_master:
-    return {D::OMPD_parallel, D::OMPD_master};
-  case D::OMPD_parallel_master_taskloop:
-    return {D::OMPD_parallel, D::OMPD_master_taskloop};
-  case D::OMPD_parallel_master_taskloop_simd:
-    return {D::OMPD_parallel, D::OMPD_master_taskloop_simd};
-  case D::OMPD_parallel_sections:
-    return {D::OMPD_parallel, D::OMPD_sections};
-  case D::OMPD_parallel_workshare:
-    return {D::OMPD_parallel, D::OMPD_workshare};
-  case D::OMPD_target_parallel:
-    return {D::OMPD_target, D::OMPD_parallel};
-  case D::OMPD_target_parallel_do:
-    return {D::OMPD_target, D::OMPD_parallel_do};
-  case D::OMPD_target_parallel_do_simd:
-    return {D::OMPD_target, D::OMPD_parallel_do_simd};
-  case D::OMPD_target_simd:
-    return {D::OMPD_target, D::OMPD_simd};
-  case D::OMPD_target_teams:
-    return {D::OMPD_target, D::OMPD_teams};
-  case D::OMPD_target_teams_distribute:
-    return {D::OMPD_target, D::OMPD_teams_distribute};
-  case D::OMPD_target_teams_distribute_parallel_do:
-    return {D::OMPD_target, D::OMPD_teams_distribute_parallel_do};
-  case D::OMPD_target_teams_distribute_parallel_do_simd:
-    return {D::OMPD_target, D::OMPD_teams_distribute_parallel_do_simd};
-  case D::OMPD_target_teams_distribute_simd:
-    return {D::OMPD_target, D::OMPD_teams_distribute_simd};
-  case D::OMPD_teams_distribute:
-    return {D::OMPD_teams, D::OMPD_distribute};
-  case D::OMPD_teams_distribute_parallel_do:
-    return {D::OMPD_teams, D::OMPD_distribute_parallel_do};
-  case D::OMPD_teams_distribute_parallel_do_simd:
-    return {D::OMPD_teams, D::OMPD_distribute_parallel_do_simd};
-  case D::OMPD_teams_distribute_simd:
-    return {D::OMPD_teams, D::OMPD_distribute_simd};
-  case D::OMPD_parallel_loop:
-    return {D::OMPD_parallel, D::OMPD_loop};
-  case D::OMPD_target_parallel_loop:
-    return {D::OMPD_target, D::OMPD_parallel_loop};
-  case D::OMPD_target_teams_loop:
-    return {D::OMPD_target, D::OMPD_teams_loop};
-  case D::OMPD_teams_loop:
-    return {D::OMPD_teams, D::OMPD_loop};
-  default:
-    return {dir, std::nullopt};
-  }
-}
-
 //===----------------------------------------------------------------------===//
 // Op body generation helper structures and functions
 //===----------------------------------------------------------------------===//
@@ -555,11 +488,6 @@ struct OpWithBodyGenInfo {
       : converter(converter), symTable(symTable), semaCtx(semaCtx), loc(loc),
         eval(eval), dir(dir) {}
 
-  OpWithBodyGenInfo &setGenNested(bool value) {
-    genNested = value;
-    return *this;
-  }
-
   OpWithBodyGenInfo &setOuterCombined(bool value) {
     outerCombined = value;
     return *this;
@@ -600,8 +528,6 @@ struct OpWithBodyGenInfo {
   Fortran::lower::pft::Evaluation &eval;
   /// [in] leaf directive for which to generate the op body.
   llvm::omp::Directive dir;
-  /// [in] whether to generate FIR for nested evaluations
-  bool genNested = true;
   /// [in] is this an outer operation - prevents privatization.
   bool outerCombined = false;
   /// [in] list of clauses to process.
@@ -620,9 +546,13 @@ struct OpWithBodyGenInfo {
 
 /// Create the body (block) for an OpenMP Operation.
 ///
-/// \param [in]   op - the operation the body belongs to.
-/// \param [in] info - options controlling code-gen for the construction.
-static void createBodyOfOp(mlir::Operation &op, OpWithBodyGenInfo &info) {
+/// \param [in]   op  - the operation the body belongs to.
+/// \param [in] info  - options controlling code-gen for the construction.
+/// \param [in] queue - work queue with nested constructs.
+/// \param [in] item  - item in the queue to generate body for.
+static void createBodyOfOp(mlir::Operation &op, const OpWithBodyGenInfo &info,
+                           const ConstructQueue &queue,
+                           ConstructQueue::iterator item) {
   fir::FirOpBuilder &firOpBuilder = info.converter.getFirOpBuilder();
 
   auto insertMarker = [](fir::FirOpBuilder &builder) {
@@ -678,7 +608,10 @@ static void createBodyOfOp(mlir::Operation &op, OpWithBodyGenInfo &info) {
     }
   }
 
-  if (info.genNested) {
+  if (ConstructQueue::iterator next = std::next(item); next != queue.end()) {
+    genOMPDispatch(info.converter, info.symTable, info.semaCtx, info.eval,
+                   info.loc, queue, next);
+  } else {
     // genFIR(Evaluation&) tries to patch up unterminated blocks, causing
     // a lot of complications for our approach if the terminator generation
     // is delayed past this point. Insert a temporary terminator here, then
@@ -769,11 +702,12 @@ static void genBodyOfTargetDataOp(
     Fortran::lower::AbstractConverter &converter,
     Fortran::lower::SymMap &symTable,
     Fortran::semantics::SemanticsContext &semaCtx,
-    Fortran::lower::pft::Evaluation &eval, bool genNested,
-    mlir::omp::TargetDataOp &dataOp, llvm::ArrayRef<mlir::Type> useDeviceTypes,
+    Fortran::lower::pft::Evaluation &eval, mlir::omp::TargetDataOp &dataOp,
+    llvm::ArrayRef<mlir::Type> useDeviceTypes,
     llvm::ArrayRef<mlir::Location> useDeviceLocs,
     llvm::ArrayRef<const Fortran::semantics::Symbol *> useDeviceSymbols,
-    const mlir::Location &currentLocation) {
+    const mlir::Location &currentLocation, const ConstructQueue &queue,
+    ConstructQueue::iterator item) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   mlir::Region &region = dataOp.getRegion();
 
@@ -826,8 +760,13 @@ static void genBodyOfTargetDataOp(
 
   // Set the insertion point after the marker.
   firOpBuilder.setInsertionPointAfter(undefMarker.getDefiningOp());
-  if (genNested)
+
+  if (ConstructQueue::iterator next = std::next(item); next != queue.end()) {
+    genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue,
+                   next);
+  } else {
     genNestedEvaluations(converter, eval);
+  }
 }
 
 // This functions creates a block for the body of the targetOp's region. It adds
@@ -836,12 +775,13 @@ static void
 genBodyOfTargetOp(Fortran::lower::AbstractConverter &converter,
                   Fortran::lower::SymMap &symTable,
                   Fortran::semantics::SemanticsContext &semaCtx,
-                  Fortran::lower::pft::Evaluation &eval, bool genNested,
+                  Fortran::lower::pft::Evaluation &eval,
                   mlir::omp::TargetOp &targetOp,
                   llvm::ArrayRef<const Fortran::semantics::Symbol *> mapSyms,
                   llvm::ArrayRef<mlir::Location> mapSymLocs,
                   llvm::ArrayRef<mlir::Type> mapSymTypes,
-                  const mlir::Location &currentLocation) {
+                  const mlir::Location &currentLocation,
+                  const ConstructQueue &queue, ConstructQueue::iterator item) {
   assert(mapSymTypes.size() == mapSymLocs.size());
 
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
@@ -939,8 +879,10 @@ genBodyOfTargetOp(Fortran::lower::AbstractConverter &converter,
         std::stringstream name;
         firOpBuilder.setInsertionPoint(targetOp);
         mlir::Value mapOp = createMapInfoOp(
-            firOpBuilder, copyVal.getLoc(), copyVal, mlir::Value{}, name.str(),
-            bounds, llvm::SmallVector<mlir::Value>{},
+            firOpBuilder, copyVal.getLoc(), copyVal,
+            /*varPtrPtr=*/mlir::Value{}, name.str(), bounds,
+            /*members=*/llvm::SmallVector<mlir::Value>{},
+            /*membersIndex=*/mlir::DenseIntElementsAttr{},
             static_cast<
                 std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>(
                 llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT),
@@ -981,15 +923,22 @@ genBodyOfTargetOp(Fortran::lower::AbstractConverter &converter,
 
   // Create the insertion point after the marker.
   firOpBuilder.setInsertionPointAfter(undefMarker.getDefiningOp());
-  if (genNested)
+
+  if (ConstructQueue::iterator next = std::next(item); next != queue.end()) {
+    genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue,
+                   next);
+  } else {
     genNestedEvaluations(converter, eval);
+  }
 }
 
 template <typename OpTy, typename... Args>
-static OpTy genOpWithBody(OpWithBodyGenInfo &info, Args &&...args) {
+static OpTy genOpWithBody(const OpWithBodyGenInfo &info,
+                          const ConstructQueue &queue,
+                          ConstructQueue::iterator item, Args &&...args) {
   auto op = info.converter.getFirOpBuilder().create<OpTy>(
       info.loc, std::forward<Args>(args)...);
-  createBodyOfOp(*op, info);
+  createBodyOfOp(*op, info, queue, item);
   return op;
 }
 
@@ -1274,7 +1223,8 @@ static mlir::omp::BarrierOp
 genBarrierOp(Fortran::lower::AbstractConverter &converter,
              Fortran::lower::SymMap &symTable,
              Fortran::semantics::SemanticsContext &semaCtx,
-             Fortran::lower::pft::Evaluation &eval, mlir::Location loc) {
+             Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+             const ConstructQueue &queue, ConstructQueue::iterator item) {
   return converter.getFirOpBuilder().create<mlir::omp::BarrierOp>(loc);
 }
 
@@ -1282,8 +1232,9 @@ static mlir::omp::CriticalOp
 genCriticalOp(Fortran::lower::AbstractConverter &converter,
               Fortran::lower::SymMap &symTable,
               Fortran::semantics::SemanticsContext &semaCtx,
-              Fortran::lower::pft::Evaluation &eval, bool genNested,
-              mlir::Location loc, const List<Clause> &clauses,
+              Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+              const List<Clause> &clauses, const ConstructQueue &queue,
+              ConstructQueue::iterator item,
               const std::optional<Fortran::parser::Name> &name) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   mlir::FlatSymbolRefAttr nameAttr;
@@ -1306,17 +1257,17 @@ genCriticalOp(Fortran::lower::AbstractConverter &converter,
 
   return genOpWithBody<mlir::omp::CriticalOp>(
       OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
-                        llvm::omp::Directive::OMPD_critical)
-          .setGenNested(genNested),
-      nameAttr);
+                        llvm::omp::Directive::OMPD_critical),
+      queue, item, nameAttr);
 }
 
 static mlir::omp::DistributeOp
 genDistributeOp(Fortran::lower::AbstractConverter &converter,
                 Fortran::lower::SymMap &symTable,
                 Fortran::semantics::SemanticsContext &semaCtx,
-                Fortran::lower::pft::Evaluation &eval, bool genNested,
-                mlir::Location loc, const List<Clause> &clauses) {
+                Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+                const List<Clause> &clauses, const ConstructQueue &queue,
+                ConstructQueue::iterator item) {
   TODO(loc, "Distribute construct");
   return nullptr;
 }
@@ -1326,7 +1277,8 @@ genFlushOp(Fortran::lower::AbstractConverter &converter,
            Fortran::lower::SymMap &symTable,
            Fortran::semantics::SemanticsContext &semaCtx,
            Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
-           const ObjectList &objects, const List<Clause> &clauses) {
+           const ObjectList &objects, const List<Clause> &clauses,
+           const ConstructQueue &queue, ConstructQueue::iterator item) {
   llvm::SmallVector<mlir::Value> operandRange;
   genFlushClauses(converter, semaCtx, objects, clauses, loc, operandRange);
 
@@ -1338,12 +1290,13 @@ static mlir::omp::MasterOp
 genMasterOp(Fortran::lower::AbstractConverter &converter,
             Fortran::lower::SymMap &symTable,
             Fortran::semantics::SemanticsContext &semaCtx,
-            Fortran::lower::pft::Evaluation &eval, bool genNested,
-            mlir::Location loc) {
+            Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+            const List<Clause> &clauses, const ConstructQueue &queue,
+            ConstructQueue::iterator item) {
   return genOpWithBody<mlir::omp::MasterOp>(
       OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
-                        llvm::omp::Directive::OMPD_master)
-          .setGenNested(genNested));
+                        llvm::omp::Directive::OMPD_master),
+      queue, item);
 }
 
 static mlir::omp::OrderedOp
@@ -1351,7 +1304,8 @@ genOrderedOp(Fortran::lower::AbstractConverter &converter,
              Fortran::lower::SymMap &symTable,
              Fortran::semantics::SemanticsContext &semaCtx,
              Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
-             const List<Clause> &clauses) {
+             const List<Clause> &clauses, const ConstructQueue &queue,
+             ConstructQueue::iterator item) {
   TODO(loc, "OMPD_ordered");
   return nullptr;
 }
@@ -1360,25 +1314,25 @@ static mlir::omp::OrderedRegionOp
 genOrderedRegionOp(Fortran::lower::AbstractConverter &converter,
                    Fortran::lower::SymMap &symTable,
                    Fortran::semantics::SemanticsContext &semaCtx,
-                   Fortran::lower::pft::Evaluation &eval, bool genNested,
-                   mlir::Location loc, const List<Clause> &clauses) {
+                   Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+                   const List<Clause> &clauses, const ConstructQueue &queue,
+                   ConstructQueue::iterator item) {
   mlir::omp::OrderedRegionClauseOps clauseOps;
   genOrderedRegionClauses(converter, semaCtx, clauses, loc, clauseOps);
 
   return genOpWithBody<mlir::omp::OrderedRegionOp>(
       OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
-                        llvm::omp::Directive::OMPD_ordered)
-          .setGenNested(genNested),
-      clauseOps);
+                        llvm::omp::Directive::OMPD_ordered),
+      queue, item, clauseOps);
 }
 
 static mlir::omp::ParallelOp
 genParallelOp(Fortran::lower::AbstractConverter &converter,
               Fortran::lower::SymMap &symTable,
               Fortran::semantics::SemanticsContext &semaCtx,
-              Fortran::lower::pft::Evaluation &eval, bool genNested,
-              mlir::Location loc, const List<Clause> &clauses,
-              bool outerCombined = false) {
+              Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+              const List<Clause> &clauses, const ConstructQueue &queue,
+              ConstructQueue::iterator item, bool outerCombined = false) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   Fortran::lower::StatementContext stmtCtx;
   mlir::omp::ParallelClauseOps clauseOps;
@@ -1397,14 +1351,14 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
   OpWithBodyGenInfo genInfo =
       OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_parallel)
-          .setGenNested(genNested)
           .setOuterCombined(outerCombined)
           .setClauses(&clauses)
           .setReductions(&reductionSyms, &reductionTypes)
           .setGenRegionEntryCb(reductionCallback);
 
   if (!enableDelayedPrivatization)
-    return genOpWithBody<mlir::omp::ParallelOp>(genInfo, clauseOps);
+    return genOpWithBody<mlir::omp::ParallelOp>(genInfo, queue, item,
+                                                clauseOps);
 
   bool privatize = !outerCombined;
   DataSharingProcessor dsp(converter, semaCtx, clauses, eval,
@@ -1439,30 +1393,36 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
         reductionSyms;
     allSymbols.append(privateSyms);
     for (auto [arg, prv] : llvm::zip_equal(allSymbols, region.getArguments())) {
-      converter.bindSymbol(*arg, prv);
+      fir::ExtendedValue hostExV = converter.getSymbolExtendedValue(*arg);
+      converter.bindSymbol(*arg, hlfir::translateToExtendedValue(
+                                     loc, firOpBuilder, hlfir::Entity{prv},
+                                     /*contiguousHint=*/
+                                     Fortran::evaluate::IsSimplyContiguous(
+                                         *arg, converter.getFoldingContext()))
+                                     .first);
     }
 
     return allSymbols;
   };
 
-  // TODO Merge with the reduction CB.
   genInfo.setGenRegionEntryCb(genRegionEntryCB).setDataSharingProcessor(&dsp);
-  return genOpWithBody<mlir::omp::ParallelOp>(genInfo, clauseOps);
+  return genOpWithBody<mlir::omp::ParallelOp>(genInfo, queue, item, clauseOps);
 }
 
 static mlir::omp::SectionOp
 genSectionOp(Fortran::lower::AbstractConverter &converter,
              Fortran::lower::SymMap &symTable,
              Fortran::semantics::SemanticsContext &semaCtx,
-             Fortran::lower::pft::Evaluation &eval, bool genNested,
-             mlir::Location loc, const List<Clause> &clauses) {
+             Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+             const List<Clause> &clauses, const ConstructQueue &queue,
+             ConstructQueue::iterator item) {
   // Currently only private/firstprivate clause is handled, and
   // all privatization is done within `omp.section` operations.
   return genOpWithBody<mlir::omp::SectionOp>(
       OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_section)
-          .setGenNested(genNested)
-          .setClauses(&clauses));
+          .setClauses(&clauses),
+      queue, item);
 }
 
 static mlir::omp::SectionsOp
@@ -1470,12 +1430,77 @@ genSectionsOp(Fortran::lower::AbstractConverter &converter,
               Fortran::lower::SymMap &symTable,
               Fortran::semantics::SemanticsContext &semaCtx,
               Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
-              const mlir::omp::SectionsClauseOps &clauseOps) {
-  return genOpWithBody<mlir::omp::SectionsOp>(
+              const List<Clause> &clauses, const ConstructQueue &queue,
+              ConstructQueue::iterator item) {
+  mlir::omp::SectionsClauseOps clauseOps;
+  genSectionsClauses(converter, semaCtx, clauses, loc, clauseOps);
+
+  auto &builder = converter.getFirOpBuilder();
+
+  // Insert privatizations before SECTIONS
+  symTable.pushScope();
+  DataSharingProcessor dsp(converter, semaCtx, clauses, eval);
+  dsp.processStep1();
+
+  List<Clause> nonDsaClauses;
+  List<const clause::Lastprivate *> lastprivates;
+
+  for (const Clause &clause : clauses) {
+    if (clause.id == llvm::omp::Clause::OMPC_lastprivate) {
+      lastprivates.push_back(&std::get<clause::Lastprivate>(clause.u));
+    } else {
+      switch (clause.id) {
+      case llvm::omp::Clause::OMPC_firstprivate:
+      case llvm::omp::Clause::OMPC_private:
+      case llvm::omp::Clause::OMPC_shared:
+        break;
+      default:
+        nonDsaClauses.push_back(clause);
+      }
+    }
+  }
+
+  // SECTIONS construct.
+  mlir::omp::SectionsOp sectionsOp = genOpWithBody<mlir::omp::SectionsOp>(
       OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_sections)
-          .setGenNested(false),
-      clauseOps);
+          .setClauses(&nonDsaClauses),
+      queue, item, clauseOps);
+
+  if (!lastprivates.empty()) {
+    mlir::Region &sectionsBody = sectionsOp.getRegion();
+    assert(sectionsBody.hasOneBlock());
+    mlir::Block &body = sectionsBody.front();
+
+    auto lastSectionOp = llvm::find_if(
+        llvm::reverse(body.getOperations()), [](const mlir::Operation &op) {
+          return llvm::isa<mlir::omp::SectionOp>(op);
+        });
+    assert(lastSectionOp != body.rend());
+
+    for (const clause::Lastprivate *lastp : lastprivates) {
+      builder.setInsertionPoint(
+          lastSectionOp->getRegion(0).back().getTerminator());
+      mlir::OpBuilder::InsertPoint insp = builder.saveInsertionPoint();
+      const auto &objList = std::get<ObjectList>(lastp->t);
+      for (const Object &object : objList) {
+        Fortran::semantics::Symbol *sym = object.id();
+        converter.copyHostAssociateVar(*sym, &insp);
+      }
+    }
+  }
+
+  // Perform DataSharingProcessor's step2 out of SECTIONS
+  builder.setInsertionPointAfter(sectionsOp.getOperation());
+  dsp.processStep2(sectionsOp, false);
+  // Emit implicit barrier to synchronize threads and avoid data
+  // races on post-update of lastprivate variables when `nowait`
+  // clause is present.
+  if (clauseOps.nowaitAttr && !lastprivates.empty())
+    builder.create<mlir::omp::BarrierOp>(loc);
+
+  symTable.popScope();
+  return sectionsOp;
 }
 
 static mlir::omp::SimdOp
@@ -1483,7 +1508,8 @@ genSimdOp(Fortran::lower::AbstractConverter &converter,
           Fortran::lower::SymMap &symTable,
           Fortran::semantics::SemanticsContext &semaCtx,
           Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
-          const List<Clause> &clauses) {
+          const List<Clause> &clauses, const ConstructQueue &queue,
+          ConstructQueue::iterator item) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   DataSharingProcessor dsp(converter, semaCtx, clauses, eval);
   dsp.processStep1();
@@ -1518,7 +1544,8 @@ genSimdOp(Fortran::lower::AbstractConverter &converter,
                                    *nestedEval, llvm::omp::Directive::OMPD_simd)
                      .setClauses(&clauses)
                      .setDataSharingProcessor(&dsp)
-                     .setGenRegionEntryCb(ivCallback));
+                     .setGenRegionEntryCb(ivCallback),
+                 queue, item);
 
   return simdOp;
 }
@@ -1527,26 +1554,26 @@ static mlir::omp::SingleOp
 genSingleOp(Fortran::lower::AbstractConverter &converter,
             Fortran::lower::SymMap &symTable,
             Fortran::semantics::SemanticsContext &semaCtx,
-            Fortran::lower::pft::Evaluation &eval, bool genNested,
-            mlir::Location loc, const List<Clause> &clauses) {
+            Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+            const List<Clause> &clauses, const ConstructQueue &queue,
+            ConstructQueue::iterator item) {
   mlir::omp::SingleClauseOps clauseOps;
   genSingleClauses(converter, semaCtx, clauses, loc, clauseOps);
 
   return genOpWithBody<mlir::omp::SingleOp>(
       OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_single)
-          .setGenNested(genNested)
           .setClauses(&clauses),
-      clauseOps);
+      queue, item, clauseOps);
 }
 
 static mlir::omp::TargetOp
 genTargetOp(Fortran::lower::AbstractConverter &converter,
             Fortran::lower::SymMap &symTable,
             Fortran::semantics::SemanticsContext &semaCtx,
-            Fortran::lower::pft::Evaluation &eval, bool genNested,
-            mlir::Location loc, const List<Clause> &clauses,
-            bool outerCombined = false) {
+            Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+            const List<Clause> &clauses, const ConstructQueue &queue,
+            ConstructQueue::iterator item, bool outerCombined = false) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   Fortran::lower::StatementContext stmtCtx;
 
@@ -1635,8 +1662,9 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
         }
 
         mlir::Value mapOp = createMapInfoOp(
-            firOpBuilder, baseOp.getLoc(), baseOp, mlir::Value{}, name.str(),
-            bounds, {},
+            firOpBuilder, baseOp.getLoc(), baseOp, /*varPtrPtr=*/mlir::Value{},
+            name.str(), bounds, /*members=*/{},
+            /*membersIndex=*/mlir::DenseIntElementsAttr{},
             static_cast<
                 std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>(
                 mapFlag),
@@ -1652,8 +1680,8 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
   Fortran::lower::pft::visitAllSymbols(eval, captureImplicitMap);
 
   auto targetOp = firOpBuilder.create<mlir::omp::TargetOp>(loc, clauseOps);
-  genBodyOfTargetOp(converter, symTable, semaCtx, eval, genNested, targetOp,
-                    mapSyms, mapLocs, mapTypes, loc);
+  genBodyOfTargetOp(converter, symTable, semaCtx, eval, targetOp, mapSyms,
+                    mapLocs, mapTypes, loc, queue, item);
   return targetOp;
 }
 
@@ -1661,8 +1689,9 @@ static mlir::omp::TargetDataOp
 genTargetDataOp(Fortran::lower::AbstractConverter &converter,
                 Fortran::lower::SymMap &symTable,
                 Fortran::semantics::SemanticsContext &semaCtx,
-                Fortran::lower::pft::Evaluation &eval, bool genNested,
-                mlir::Location loc, const List<Clause> &clauses) {
+                Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+                const List<Clause> &clauses, const ConstructQueue &queue,
+                ConstructQueue::iterator item) {
   Fortran::lower::StatementContext stmtCtx;
   mlir::omp::TargetDataClauseOps clauseOps;
   llvm::SmallVector<mlir::Type> useDeviceTypes;
@@ -1674,9 +1703,9 @@ genTargetDataOp(Fortran::lower::AbstractConverter &converter,
   auto targetDataOp =
       converter.getFirOpBuilder().create<mlir::omp::TargetDataOp>(loc,
                                                                   clauseOps);
-  genBodyOfTargetDataOp(converter, symTable, semaCtx, eval, genNested,
-                        targetDataOp, useDeviceTypes, useDeviceLocs,
-                        useDeviceSyms, loc);
+  genBodyOfTargetDataOp(converter, symTable, semaCtx, eval, targetDataOp,
+                        useDeviceTypes, useDeviceLocs, useDeviceSyms, loc,
+                        queue, item);
   return targetDataOp;
 }
 
@@ -1685,8 +1714,9 @@ static OpTy
 genTargetEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter,
                                Fortran::lower::SymMap &symTable,
                                Fortran::semantics::SemanticsContext &semaCtx,
-                               mlir::Location loc,
-                               const List<Clause> &clauses) {
+                               mlir::Location loc, const List<Clause> &clauses,
+                               const ConstructQueue &queue,
+                               ConstructQueue::iterator item) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   Fortran::lower::StatementContext stmtCtx;
 
@@ -1713,8 +1743,9 @@ static mlir::omp::TaskOp
 genTaskOp(Fortran::lower::AbstractConverter &converter,
           Fortran::lower::SymMap &symTable,
           Fortran::semantics::SemanticsContext &semaCtx,
-          Fortran::lower::pft::Evaluation &eval, bool genNested,
-          mlir::Location loc, const List<Clause> &clauses) {
+          Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+          const List<Clause> &clauses, const ConstructQueue &queue,
+          ConstructQueue::iterator item) {
   Fortran::lower::StatementContext stmtCtx;
   mlir::omp::TaskClauseOps clauseOps;
   genTaskClauses(converter, semaCtx, stmtCtx, clauses, loc, clauseOps);
@@ -1722,26 +1753,25 @@ genTaskOp(Fortran::lower::AbstractConverter &converter,
   return genOpWithBody<mlir::omp::TaskOp>(
       OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_task)
-          .setGenNested(genNested)
           .setClauses(&clauses),
-      clauseOps);
+      queue, item, clauseOps);
 }
 
 static mlir::omp::TaskgroupOp
 genTaskgroupOp(Fortran::lower::AbstractConverter &converter,
                Fortran::lower::SymMap &symTable,
                Fortran::semantics::SemanticsContext &semaCtx,
-               Fortran::lower::pft::Evaluation &eval, bool genNested,
-               mlir::Location loc, const List<Clause> &clauses) {
+               Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+               const List<Clause> &clauses, const ConstructQueue &queue,
+               ConstructQueue::iterator item) {
   mlir::omp::TaskgroupClauseOps clauseOps;
   genTaskgroupClauses(converter, semaCtx, clauses, loc, clauseOps);
 
   return genOpWithBody<mlir::omp::TaskgroupOp>(
       OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_taskgroup)
-          .setGenNested(genNested)
           .setClauses(&clauses),
-      clauseOps);
+      queue, item, clauseOps);
 }
 
 static mlir::omp::TaskloopOp
@@ -1749,7 +1779,8 @@ genTaskloopOp(Fortran::lower::AbstractConverter &converter,
               Fortran::lower::SymMap &symTable,
               Fortran::semantics::SemanticsContext &semaCtx,
               Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
-              const List<Clause> &clauses) {
+              const List<Clause> &clauses, const ConstructQueue &queue,
+              ConstructQueue::iterator item) {
   TODO(loc, "Taskloop construct");
 }
 
@@ -1758,7 +1789,8 @@ genTaskwaitOp(Fortran::lower::AbstractConverter &converter,
               Fortran::lower::SymMap &symTable,
               Fortran::semantics::SemanticsContext &semaCtx,
               Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
-              const List<Clause> &clauses) {
+              const List<Clause> &clauses, const ConstructQueue &queue,
+              ConstructQueue::iterator item) {
   mlir::omp::TaskwaitClauseOps clauseOps;
   genTaskwaitClauses(converter, semaCtx, clauses, loc, clauseOps);
   return converter.getFirOpBuilder().create<mlir::omp::TaskwaitOp>(loc,
@@ -1769,7 +1801,8 @@ static mlir::omp::TaskyieldOp
 genTaskyieldOp(Fortran::lower::AbstractConverter &converter,
                Fortran::lower::SymMap &symTable,
                Fortran::semantics::SemanticsContext &semaCtx,
-               Fortran::lower::pft::Evaluation &eval, mlir::Location loc) {
+               Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+               const ConstructQueue &queue, ConstructQueue::iterator item) {
   return converter.getFirOpBuilder().create<mlir::omp::TaskyieldOp>(loc);
 }
 
@@ -1777,9 +1810,9 @@ static mlir::omp::TeamsOp
 genTeamsOp(Fortran::lower::AbstractConverter &converter,
            Fortran::lower::SymMap &symTable,
            Fortran::semantics::SemanticsContext &semaCtx,
-           Fortran::lower::pft::Evaluation &eval, bool genNested,
-           mlir::Location loc, const List<Clause> &clauses,
-           bool outerCombined = false) {
+           Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+           const List<Clause> &clauses, const ConstructQueue &queue,
+           ConstructQueue::iterator item, bool outerCombined = false) {
   Fortran::lower::StatementContext stmtCtx;
   mlir::omp::TeamsClauseOps clauseOps;
   genTeamsClauses(converter, semaCtx, stmtCtx, clauses, loc, clauseOps);
@@ -1787,10 +1820,9 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter,
   return genOpWithBody<mlir::omp::TeamsOp>(
       OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_teams)
-          .setGenNested(genNested)
           .setOuterCombined(outerCombined)
           .setClauses(&clauses),
-      clauseOps);
+      queue, item, clauseOps);
 }
 
 static mlir::omp::WsloopOp
@@ -1798,7 +1830,8 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter,
             Fortran::lower::SymMap &symTable,
             Fortran::semantics::SemanticsContext &semaCtx,
             Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
-            const List<Clause> &clauses) {
+            const List<Clause> &clauses, const ConstructQueue &queue,
+            ConstructQueue::iterator item) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   DataSharingProcessor dsp(converter, semaCtx, clauses, eval);
   dsp.processStep1();
@@ -1839,7 +1872,8 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter,
                      .setClauses(&clauses)
                      .setDataSharingProcessor(&dsp)
                      .setReductions(&reductionSyms, &reductionTypes)
-                     .setGenRegionEntryCb(ivCallback));
+                     .setGenRegionEntryCb(ivCallback),
+                 queue, item);
   return wsloopOp;
 }
 
@@ -1847,13 +1881,13 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter,
 // Code generation functions for composite constructs
 //===----------------------------------------------------------------------===//
 
-static void
-genCompositeDistributeParallelDo(Fortran::lower::AbstractConverter &converter,
-                                 Fortran::lower::SymMap &symTable,
-                                 Fortran::semantics::SemanticsContext &semaCtx,
-                                 Fortran::lower::pft::Evaluation &eval,
-                                 const List<Clause> &clauses,
-                                 mlir::Location loc) {
+static void genCompositeDistributeParallelDo(
+    Fortran::lower::AbstractConverter &converter,
+    Fortran::lower::SymMap &symTable,
+    Fortran::semantics::SemanticsContext &semaCtx,
+    Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+    const List<Clause> &clauses, const ConstructQueue &queue,
+    ConstructQueue::iterator item) {
   TODO(loc, "Composite DISTRIBUTE PARALLEL DO");
 }
 
@@ -1861,8 +1895,9 @@ static void genCompositeDistributeParallelDoSimd(
     Fortran::lower::AbstractConverter &converter,
     Fortran::lower::SymMap &symTable,
     Fortran::semantics::SemanticsContext &semaCtx,
-    Fortran::lower::pft::Evaluation &eval, const List<Clause> &clauses,
-    mlir::Location loc) {
+    Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+    const List<Clause> &clauses, const ConstructQueue &queue,
+    ConstructQueue::iterator item) {
   TODO(loc, "Composite DISTRIBUTE PARALLEL DO SIMD");
 }
 
@@ -1871,7 +1906,9 @@ genCompositeDistributeSimd(Fortran::lower::AbstractConverter &converter,
                            Fortran::lower::SymMap &symTable,
                            Fortran::semantics::SemanticsContext &semaCtx,
                            Fortran::lower::pft::Evaluation &eval,
-                           const List<Clause> &clauses, mlir::Location loc) {
+                           mlir::Location loc, const List<Clause> &clauses,
+                           const ConstructQueue &queue,
+                           ConstructQueue::iterator item) {
   TODO(loc, "Composite DISTRIBUTE SIMD");
 }
 
@@ -1879,8 +1916,9 @@ static void genCompositeDoSimd(Fortran::lower::AbstractConverter &converter,
                                Fortran::lower::SymMap &symTable,
                                Fortran::semantics::SemanticsContext &semaCtx,
                                Fortran::lower::pft::Evaluation &eval,
-                               const List<Clause> &clauses,
-                               mlir::Location loc) {
+                               mlir::Location loc, const List<Clause> &clauses,
+                               const ConstructQueue &queue,
+                               ConstructQueue::iterator item) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processTODO<clause::Aligned, clause::Allocate, clause::Linear,
                  clause::Order, clause::Safelen, clause::Simdlen>(
@@ -1893,7 +1931,7 @@ static void genCompositeDoSimd(Fortran::lower::AbstractConverter &converter,
   // When support for vectorization is enabled, then we need to add handling of
   // if clause. Currently if clause can be skipped because we always assume
   // SIMD length = 1.
-  genWsloopOp(converter, symTable, semaCtx, eval, loc, clauses);
+  genWsloopOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item);
 }
 
 static void
@@ -1901,11 +1939,129 @@ genCompositeTaskloopSimd(Fortran::lower::AbstractConverter &converter,
                          Fortran::lower::SymMap &symTable,
                          Fortran::semantics::SemanticsContext &semaCtx,
                          Fortran::lower::pft::Evaluation &eval,
-                         const List<Clause> &clauses, mlir::Location loc) {
+                         mlir::Location loc, const List<Clause> &clauses,
+                         const ConstructQueue &queue,
+                         ConstructQueue::iterator item) {
   TODO(loc, "Composite TASKLOOP SIMD");
 }
 
 //===----------------------------------------------------------------------===//
+// Dispatch
+//===----------------------------------------------------------------------===//
+
+static void genOMPDispatch(Fortran::lower::AbstractConverter &converter,
+                           Fortran::lower::SymMap &symTable,
+                           Fortran::semantics::SemanticsContext &semaCtx,
+                           Fortran::lower::pft::Evaluation &eval,
+                           mlir::Location loc, const ConstructQueue &queue,
+                           ConstructQueue::iterator item) {
+  assert(item != queue.end());
+  const List<Clause> &clauses = item->clauses;
+
+  switch (llvm::omp::Directive dir = item->id) {
+  case llvm::omp::Directive::OMPD_distribute:
+    genDistributeOp(converter, symTable, semaCtx, eval, loc, clauses, queue,
+                    item);
+    break;
+  case llvm::omp::Directive::OMPD_do:
+    genWsloopOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item);
+    break;
+  case llvm::omp::Directive::OMPD_loop:
+  case llvm::omp::Directive::OMPD_masked:
+  case llvm::omp::Directive::OMPD_tile:
+  case llvm::omp::Directive::OMPD_unroll:
+    TODO(loc, "Unhandled loop directive (" +
+                  llvm::omp::getOpenMPDirectiveName(dir) + ")");
+    break;
+  case llvm::omp::Directive::OMPD_master:
+    genMasterOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item);
+    break;
+  case llvm::omp::Directive::OMPD_ordered:
+    genOrderedRegionOp(converter, symTable, semaCtx, eval, loc, clauses, queue,
+                       item);
+    break;
+  case llvm::omp::Directive::OMPD_parallel:
+    genParallelOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item,
+                  /*outerCombined=*/false);
+    break;
+  case llvm::omp::Directive::OMPD_sections:
+    genSectionsOp(converter, symTable, semaCtx, eval, loc, clauses, queue,
+                  item);
+    break;
+  case llvm::omp::Directive::OMPD_simd:
+    genSimdOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item);
+    break;
+  case llvm::omp::Directive::OMPD_single:
+    genSingleOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item);
+    break;
+  case llvm::omp::Directive::OMPD_target:
+    genTargetOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item,
+                /*outerCombined=*/false);
+    break;
+  case llvm::omp::Directive::OMPD_target_data:
+    genTargetDataOp(converter, symTable, semaCtx, eval, loc, clauses, queue,
+                    item);
+    break;
+  case llvm::omp::Directive::OMPD_target_enter_data:
+    genTargetEnterExitUpdateDataOp<mlir::omp::TargetEnterDataOp>(
+        converter, symTable, semaCtx, loc, clauses, queue, item);
+    break;
+  case llvm::omp::Directive::OMPD_target_exit_data:
+    genTargetEnterExitUpdateDataOp<mlir::omp::TargetExitDataOp>(
+        converter, symTable, semaCtx, loc, clauses, queue, item);
+    break;
+  case llvm::omp::Directive::OMPD_target_update:
+    genTargetEnterExitUpdateDataOp<mlir::omp::TargetUpdateOp>(
+        converter, symTable, semaCtx, loc, clauses, queue, item);
+    break;
+  case llvm::omp::Directive::OMPD_task:
+    genTaskOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item);
+    break;
+  case llvm::omp::Directive::OMPD_taskgroup:
+    genTaskgroupOp(converter, symTable, semaCtx, eval, loc, clauses, queue,
+                   item);
+    break;
+  case llvm::omp::Directive::OMPD_taskloop:
+    genTaskloopOp(converter, symTable, semaCtx, eval, loc, clauses, queue,
+                  item);
+    break;
+  case llvm::omp::Directive::OMPD_teams:
+    genTeamsOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item);
+    break;
+  // case llvm::omp::Directive::OMPD_workdistribute:
+  case llvm::omp::Directive::OMPD_workshare:
+    // FIXME: Workshare is not a commonly used OpenMP construct, an
+    // implementation for this feature will come later. For the codes
+    // that use this construct, add a single construct for now.
+    genSingleOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item);
+    break;
+  // Composite constructs
+  case llvm::omp::Directive::OMPD_distribute_parallel_do:
+    genCompositeDistributeParallelDo(converter, symTable, semaCtx, eval, loc,
+                                     clauses, queue, item);
+    break;
+  case llvm::omp::Directive::OMPD_distribute_parallel_do_simd:
+    genCompositeDistributeParallelDoSimd(converter, symTable, semaCtx, eval,
+                                         loc, clauses, queue, item);
+    break;
+  case llvm::omp::Directive::OMPD_distribute_simd:
+    genCompositeDistributeSimd(converter, symTable, semaCtx, eval, loc, clauses,
+                               queue, item);
+    break;
+  case llvm::omp::Directive::OMPD_do_simd:
+    genCompositeDoSimd(converter, symTable, semaCtx, eval, loc, clauses, queue,
+                       item);
+    break;
+  case llvm::omp::Directive::OMPD_taskloop_simd:
+    genCompositeTaskloopSimd(converter, symTable, semaCtx, eval, loc, clauses,
+                             queue, item);
+    break;
+  default:
+    break;
+  }
+}
+
+//===----------------------------------------------------------------------===//
 // OpenMPDeclarativeConstruct visitors
 //===----------------------------------------------------------------------===//
 
@@ -2015,36 +2171,47 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
       semaCtx);
   mlir::Location currentLocation = converter.genLocation(directive.source);
 
+  ConstructQueue queue{
+      buildConstructQueue(converter.getFirOpBuilder().getModule(), semaCtx,
+                          eval, directive.source, directive.v, clauses)};
+
   switch (directive.v) {
   default:
     break;
   case llvm::omp::Directive::OMPD_barrier:
-    genBarrierOp(converter, symTable, semaCtx, eval, currentLocation);
+    genBarrierOp(converter, symTable, semaCtx, eval, currentLocation, queue,
+                 queue.begin());
     break;
   case llvm::omp::Directive::OMPD_taskwait:
-    genTaskwaitOp(converter, symTable, semaCtx, eval, currentLocation, clauses);
+    genTaskwaitOp(converter, symTable, semaCtx, eval, currentLocation, clauses,
+                  queue, queue.begin());
     break;
   case llvm::omp::Directive::OMPD_taskyield:
-    genTaskyieldOp(converter, symTable, semaCtx, eval, currentLocation);
+    genTaskyieldOp(converter, symTable, semaCtx, eval, currentLocation, queue,
+                   queue.begin());
     break;
   case llvm::omp::Directive::OMPD_target_data:
-    genTargetDataOp(converter, symTable, semaCtx, eval, /*genNested=*/true,
-                    currentLocation, clauses);
+    genTargetDataOp(converter, symTable, semaCtx, eval, currentLocation,
+                    clauses, queue, queue.begin());
     break;
   case llvm::omp::Directive::OMPD_target_enter_data:
     genTargetEnterExitUpdateDataOp<mlir::omp::TargetEnterDataOp>(
-        converter, symTable, semaCtx, currentLocation, clauses);
+        converter, symTable, semaCtx, currentLocation, clauses, queue,
+        queue.begin());
     break;
   case llvm::omp::Directive::OMPD_target_exit_data:
     genTargetEnterExitUpdateDataOp<mlir::omp::TargetExitDataOp>(
-        converter, symTable, semaCtx, currentLocation, clauses);
+        converter, symTable, semaCtx, currentLocation, clauses, queue,
+        queue.begin());
     break;
   case llvm::omp::Directive::OMPD_target_update:
     genTargetEnterExitUpdateDataOp<mlir::omp::TargetUpdateOp>(
-        converter, symTable, semaCtx, currentLocation, clauses);
+        converter, symTable, semaCtx, currentLocation, clauses, queue,
+        queue.begin());
     break;
   case llvm::omp::Directive::OMPD_ordered:
-    genOrderedOp(converter, symTable, semaCtx, eval, currentLocation, clauses);
+    genOrderedOp(converter, symTable, semaCtx, eval, currentLocation, clauses,
+                 queue, queue.begin());
     break;
   }
 }
@@ -2068,8 +2235,12 @@ genOMP(Fortran::lower::AbstractConverter &converter,
                             [&](auto &&s) { return makeClause(s.v, semaCtx); })
                  : List<Clause>{};
   mlir::Location currentLocation = converter.genLocation(verbatim.source);
+
+  ConstructQueue queue{buildConstructQueue(
+      converter.getFirOpBuilder().getModule(), semaCtx, eval, verbatim.source,
+      llvm::omp::Directive::OMPD_flush, clauses)};
   genFlushOp(converter, symTable, semaCtx, eval, currentLocation, objects,
-             clauses);
+             clauses, queue, queue.begin());
 }
 
 static void
@@ -2181,6 +2352,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
 
   assert(llvm::omp::blockConstructSet.test(origDirective) &&
          "Expected block construct");
+  (void)origDirective;
 
   for (const Clause &clause : clauses) {
     mlir::Location clauseLocation = converter.genLocation(clause.source);
@@ -2212,75 +2384,15 @@ genOMP(Fortran::lower::AbstractConverter &converter,
     }
   }
 
-  std::optional<llvm::omp::Directive> nextDir = origDirective;
-  bool outermostLeafConstruct = true;
-  while (nextDir) {
-    llvm::omp::Directive leafDir;
-    std::tie(leafDir, nextDir) = splitCombinedDirective(*nextDir);
-    const bool genNested = !nextDir;
-    const bool outerCombined = outermostLeafConstruct && nextDir.has_value();
-    switch (leafDir) {
-    case llvm::omp::Directive::OMPD_master:
-      // 2.16 MASTER construct.
-      genMasterOp(converter, symTable, semaCtx, eval, genNested,
-                  currentLocation);
-      break;
-    case llvm::omp::Directive::OMPD_ordered:
-      // 2.17.9 ORDERED construct.
-      genOrderedRegionOp(converter, symTable, semaCtx, eval, genNested,
-                         currentLocation, clauses);
-      break;
-    case llvm::omp::Directive::OMPD_parallel:
-      // 2.6 PARALLEL construct.
-      genParallelOp(converter, symTable, semaCtx, eval, genNested,
-                    currentLocation, clauses, outerCombined);
-      break;
-    case llvm::omp::Directive::OMPD_single:
-      // 2.8.2 SINGLE construct.
-      genSingleOp(converter, symTable, semaCtx, eval, genNested,
-                  currentLocation, clauses);
-      break;
-    case llvm::omp::Directive::OMPD_target:
-      // 2.12.5 TARGET construct.
-      genTargetOp(converter, symTable, semaCtx, eval, genNested,
-                  currentLocation, clauses, outerCombined);
-      break;
-    case llvm::omp::Directive::OMPD_target_data:
-      // 2.12.2 TARGET DATA construct.
-      genTargetDataOp(converter, symTable, semaCtx, eval, genNested,
-                      currentLocation, clauses);
-      break;
-    case llvm::omp::Directive::OMPD_task:
-      // 2.10.1 TASK construct.
-      genTaskOp(converter, symTable, semaCtx, eval, genNested, currentLocation,
-                clauses);
-      break;
-    case llvm::omp::Directive::OMPD_taskgroup:
-      // 2.17.6 TASKGROUP construct.
-      genTaskgroupOp(converter, symTable, semaCtx, eval, genNested,
-                     currentLocation, clauses);
-      break;
-    case llvm::omp::Directive::OMPD_teams:
-      // 2.7 TEAMS construct.
-      // FIXME Pass the outerCombined argument or rename it to better describe
-      // what it represents if it must always be `false` in this context.
-      genTeamsOp(converter, symTable, semaCtx, eval, genNested, currentLocation,
-                 clauses);
-      break;
-    case llvm::omp::Directive::OMPD_workshare:
-      // 2.8.3 WORKSHARE construct.
-      // FIXME: Workshare is not a commonly used OpenMP construct, an
-      // implementation for this feature will come later. For the codes
-      // that use this construct, add a single construct for now.
-      genSingleOp(converter, symTable, semaCtx, eval, genNested,
-                  currentLocation, clauses);
-      break;
-    default:
-      llvm_unreachable("Unexpected block construct");
-      break;
-    }
-    outermostLeafConstruct = false;
-  }
+  llvm::omp::Directive directive =
+      std::get<parser::OmpBlockDirective>(beginBlockDirective.t).v;
+  const parser::CharBlock &source =
+      std::get<parser::OmpBlockDirective>(beginBlockDirective.t).source;
+  ConstructQueue queue{
+      buildConstructQueue(converter.getFirOpBuilder().getModule(), semaCtx,
+                          eval, source, directive, clauses)};
+  genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue,
+                 queue.begin());
 }
 
 static void
@@ -2293,10 +2405,15 @@ genOMP(Fortran::lower::AbstractConverter &converter,
       std::get<Fortran::parser::OmpCriticalDirective>(criticalConstruct.t);
   List<Clause> clauses =
       makeClauses(std::get<Fortran::parser::OmpClauseList>(cd.t), semaCtx);
+
+  ConstructQueue queue{buildConstructQueue(
+      converter.getFirOpBuilder().getModule(), semaCtx, eval, cd.source,
+      llvm::omp::Directive::OMPD_critical, clauses)};
+
   const auto &name = std::get<std::optional<Fortran::parser::Name>>(cd.t);
   mlir::Location currentLocation = converter.getCurrentLocation();
-  genCriticalOp(converter, symTable, semaCtx, eval, /*genNested=*/true,
-                currentLocation, clauses, name);
+  genCriticalOp(converter, symTable, semaCtx, eval, currentLocation, clauses,
+                queue, queue.begin(), name);
 }
 
 static void
@@ -2317,14 +2434,6 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
       std::get<Fortran::parser::OmpBeginLoopDirective>(loopConstruct.t);
   List<Clause> clauses = makeClauses(
       std::get<Fortran::parser::OmpClauseList>(beginLoopDirective.t), semaCtx);
-  mlir::Location currentLocation =
-      converter.genLocation(beginLoopDirective.source);
-  const auto origDirective =
-      std::get<Fortran::parser::OmpLoopDirective>(beginLoopDirective.t).v;
-
-  assert(llvm::omp::loopConstructSet.test(origDirective) &&
-         "Expected loop construct");
-
   if (auto &endLoopDirective =
           std::get<std::optional<Fortran::parser::OmpEndLoopDirective>>(
               loopConstruct.t)) {
@@ -2333,101 +2442,18 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
         semaCtx));
   }
 
-  std::optional<llvm::omp::Directive> nextDir = origDirective;
-  while (nextDir) {
-    llvm::omp::Directive leafDir;
-    std::tie(leafDir, nextDir) = splitCombinedDirective(*nextDir);
-    if (llvm::omp::compositeConstructSet.test(leafDir)) {
-      assert(!nextDir && "Composite construct cannot be split");
-      switch (leafDir) {
-      case llvm::omp::Directive::OMPD_distribute_parallel_do:
-        // 2.9.4.3 DISTRIBUTE PARALLEL Worksharing-Loop construct.
-        genCompositeDistributeParallelDo(converter, symTable, semaCtx, eval,
-                                         clauses, currentLocation);
-        break;
-      case llvm::omp::Directive::OMPD_distribute_parallel_do_simd:
-        // 2.9.4.4 DISTRIBUTE PARALLEL Worksharing-Loop SIMD construct.
-        genCompositeDistributeParallelDoSimd(converter, symTable, semaCtx, eval,
-                                             clauses, currentLocation);
-        break;
-      case llvm::omp::Directive::OMPD_distribute_simd:
-        // 2.9.4.2 DISTRIBUTE SIMD construct.
-        genCompositeDistributeSimd(converter, symTable, semaCtx, eval, clauses,
-                                   currentLocation);
-        break;
-      case llvm::omp::Directive::OMPD_do_simd:
-        // 2.9.3.2 Worksharing-Loop SIMD construct.
-        genCompositeDoSimd(converter, symTable, semaCtx, eval, clauses,
-                           currentLocation);
-        break;
-      case llvm::omp::Directive::OMPD_taskloop_simd:
-        // 2.10.3 TASKLOOP SIMD construct.
-        genCompositeTaskloopSimd(converter, symTable, semaCtx, eval, clauses,
-                                 currentLocation);
-        break;
-      default:
-        llvm_unreachable("Unexpected composite construct");
-      }
-    } else {
-      const bool genNested = !nextDir;
-      switch (leafDir) {
-      case llvm::omp::Directive::OMPD_distribute:
-        // 2.9.4.1 DISTRIBUTE construct.
-        genDistributeOp(converter, symTable, semaCtx, eval, genNested,
-                        currentLocation, clauses);
-        break;
-      case llvm::omp::Directive::OMPD_do:
-        // 2.9.2 Worksharing-Loop construct.
-        genWsloopOp(converter, symTable, semaCtx, eval, currentLocation,
-                    clauses);
-        break;
-      case llvm::omp::Directive::OMPD_parallel:
-        // 2.6 PARALLEL construct.
-        // FIXME This is not necessarily always the outer leaf construct of a
-        // combined construct in this constext (e.g. distribute parallel do).
-        // Maybe rename the argument if it represents something else or
-        // initialize it properly.
-        genParallelOp(converter, symTable, semaCtx, eval, genNested,
-                      currentLocation, clauses,
-                      /*outerCombined=*/true);
-        break;
-      case llvm::omp::Directive::OMPD_simd:
-        // 2.9.3.1 SIMD construct.
-        genSimdOp(converter, symTable, semaCtx, eval, currentLocation, clauses);
-        break;
-      case llvm::omp::Directive::OMPD_target:
-        // 2.12.5 TARGET construct.
-        genTargetOp(converter, symTable, semaCtx, eval, genNested,
-                    currentLocation, clauses, /*outerCombined=*/true);
-        break;
-      case llvm::omp::Directive::OMPD_taskloop:
-        // 2.10.2 TASKLOOP construct.
-        genTaskloopOp(converter, symTable, semaCtx, eval, currentLocation,
-                      clauses);
-        break;
-      case llvm::omp::Directive::OMPD_teams:
-        // 2.7 TEAMS construct.
-        // FIXME This is not necessarily always the outer leaf construct of a
-        // combined construct in this constext (e.g. target teams distribute).
-        // Maybe rename the argument if it represents something else or
-        // initialize it properly.
-        genTeamsOp(converter, symTable, semaCtx, eval, genNested,
-                   currentLocation, clauses, /*outerCombined=*/true);
-        break;
-      case llvm::omp::Directive::OMPD_loop:
-      case llvm::omp::Directive::OMPD_masked:
-      case llvm::omp::Directive::OMPD_master:
-      case llvm::omp::Directive::OMPD_tile:
-      case llvm::omp::Directive::OMPD_unroll:
-        TODO(currentLocation, "Unhandled loop directive (" +
-                                  llvm::omp::getOpenMPDirectiveName(leafDir) +
-                                  ")");
-        break;
-      default:
-        llvm_unreachable("Unexpected loop construct");
-      }
-    }
-  }
+  mlir::Location currentLocation =
+      converter.genLocation(beginLoopDirective.source);
+
+  llvm::omp::Directive directive =
+      std::get<parser::OmpLoopDirective>(beginLoopDirective.t).v;
+  const parser::CharBlock &source =
+      std::get<parser::OmpLoopDirective>(beginLoopDirective.t).source;
+  ConstructQueue queue{
+      buildConstructQueue(converter.getFirOpBuilder().getModule(), semaCtx,
+                          eval, source, directive, clauses)};
+  genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue,
+                 queue.begin());
 }
 
 static void
@@ -2436,8 +2462,12 @@ genOMP(Fortran::lower::AbstractConverter &converter,
        Fortran::semantics::SemanticsContext &semaCtx,
        Fortran::lower::pft::Evaluation &eval,
        const Fortran::parser::OpenMPSectionConstruct &sectionConstruct) {
-  // SECTION constructs are handled as a part of SECTIONS.
-  llvm_unreachable("Unexpected standalone OMP SECTION");
+  mlir::Location loc = converter.getCurrentLocation();
+  ConstructQueue queue{buildConstructQueue(
+      converter.getFirOpBuilder().getModule(), semaCtx, eval,
+      sectionConstruct.source, llvm::omp::Directive::OMPD_section, {})};
+  genSectionOp(converter, symTable, semaCtx, eval, loc,
+               /*clauses=*/{}, queue, queue.begin());
 }
 
 static void
@@ -2456,39 +2486,17 @@ genOMP(Fortran::lower::AbstractConverter &converter,
   clauses.append(makeClauses(
       std::get<Fortran::parser::OmpClauseList>(endSectionsDirective.t),
       semaCtx));
-
-  // Process clauses before optional omp.parallel, so that new variables are
-  // allocated outside of the parallel region
   mlir::Location currentLocation = converter.getCurrentLocation();
-  mlir::omp::SectionsClauseOps clauseOps;
-  genSectionsClauses(converter, semaCtx, clauses, currentLocation, clauseOps);
-
-  // Parallel wrapper of PARALLEL SECTIONS construct
-  llvm::omp::Directive dir =
-      std::get<Fortran::parser::OmpSectionsDirective>(beginSectionsDirective.t)
-          .v;
-  if (dir == llvm::omp::Directive::OMPD_parallel_sections) {
-    genParallelOp(converter, symTable, semaCtx, eval,
-                  /*genNested=*/false, currentLocation, clauses,
-                  /*outerCombined=*/true);
-  }
 
-  // SECTIONS construct.
-  genSectionsOp(converter, symTable, semaCtx, eval, currentLocation, clauseOps);
-
-  // Generate nested SECTION operations recursively.
-  const auto &sectionBlocks =
-      std::get<Fortran::parser::OmpSectionBlocks>(sectionsConstruct.t);
-  auto &firOpBuilder = converter.getFirOpBuilder();
-  auto ip = firOpBuilder.saveInsertionPoint();
-  for (const auto &[nblock, neval] :
-       llvm::zip(sectionBlocks.v, eval.getNestedEvaluations())) {
-    symTable.pushScope();
-    genSectionOp(converter, symTable, semaCtx, neval, /*genNested=*/true,
-                 currentLocation, clauses);
-    symTable.popScope();
-    firOpBuilder.restoreInsertionPoint(ip);
-  }
+  llvm::omp::Directive directive =
+      std::get<parser::OmpSectionsDirective>(beginSectionsDirective.t).v;
+  const parser::CharBlock &source =
+      std::get<parser::OmpSectionsDirective>(beginSectionsDirective.t).source;
+  ConstructQueue queue{
+      buildConstructQueue(converter.getFirOpBuilder().getModule(), semaCtx,
+                          eval, source, directive, clauses)};
+  genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue,
+                 queue.begin());
 }
 
 static void genOMP(Fortran::lower::AbstractConverter &converter,
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index c38e0c18cac8..cb1d1a5a7f3d 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -22,6 +22,9 @@
 #include <flang/Semantics/tools.h>
 #include <llvm/Support/CommandLine.h>
 
+#include <algorithm>
+#include <numeric>
+
 llvm::cl::opt<bool> treatIndexAsSection(
     "openmp-treat-index-as-section",
     llvm::cl::desc("In the OpenMP data clauses treat `a(N)` as `a(N:N)`."),
@@ -48,12 +51,6 @@ int64_t getCollapseValue(const List<Clause> &clauses) {
   return 1;
 }
 
-uint32_t getOpenMPVersion(mlir::ModuleOp mod) {
-  if (mlir::Attribute verAttr = mod->getAttr("omp.version"))
-    return llvm::cast<mlir::omp::VersionAttr>(verAttr).getVersion();
-  llvm_unreachable("Expecting OpenMP version attribute in module");
-}
-
 void genObjectList(const ObjectList &objects,
                    Fortran::lower::AbstractConverter &converter,
                    llvm::SmallVectorImpl<mlir::Value> &operands) {
@@ -116,6 +113,216 @@ void gatherFuncAndVarSyms(
     symbolAndClause.emplace_back(clause, *object.id());
 }
 
+mlir::omp::MapInfoOp
+createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc,
+                mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name,
+                llvm::ArrayRef<mlir::Value> bounds,
+                llvm::ArrayRef<mlir::Value> members,
+                mlir::DenseIntElementsAttr membersIndex, uint64_t mapType,
+                mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy,
+                bool partialMap) {
+  if (auto boxTy = llvm::dyn_cast<fir::BaseBoxType>(baseAddr.getType())) {
+    baseAddr = builder.create<fir::BoxAddrOp>(loc, baseAddr);
+    retTy = baseAddr.getType();
+  }
+
+  mlir::TypeAttr varType = mlir::TypeAttr::get(
+      llvm::cast<mlir::omp::PointerLikeType>(retTy).getElementType());
+
+  mlir::omp::MapInfoOp op = builder.create<mlir::omp::MapInfoOp>(
+      loc, retTy, baseAddr, varType, varPtrPtr, members, membersIndex, bounds,
+      builder.getIntegerAttr(builder.getIntegerType(64, false), mapType),
+      builder.getAttr<mlir::omp::VariableCaptureKindAttr>(mapCaptureType),
+      builder.getStringAttr(name), builder.getBoolAttr(partialMap));
+
+  return op;
+}
+
+static int
+getComponentPlacementInParent(const Fortran::semantics::Symbol *componentSym) {
+  const auto *derived =
+      componentSym->owner()
+          .derivedTypeSpec()
+          ->typeSymbol()
+          .detailsIf<Fortran::semantics::DerivedTypeDetails>();
+  assert(derived &&
+         "expected derived type details when processing component symbol");
+  for (auto [placement, name] : llvm::enumerate(derived->componentNames()))
+    if (name == componentSym->name())
+      return placement;
+  return -1;
+}
+
+static std::optional<Object>
+getComponentObject(std::optional<Object> object,
+                   Fortran::semantics::SemanticsContext &semaCtx) {
+  if (!object)
+    return std::nullopt;
+
+  auto ref = evaluate::ExtractDataRef(*object.value().ref());
+  if (!ref)
+    return std::nullopt;
+
+  if (std::holds_alternative<evaluate::Component>(ref->u))
+    return object;
+
+  auto baseObj = getBaseObject(object.value(), semaCtx);
+  if (!baseObj)
+    return std::nullopt;
+
+  return getComponentObject(baseObj.value(), semaCtx);
+}
+
+static void
+generateMemberPlacementIndices(const Object &object,
+                               llvm::SmallVectorImpl<int> &indices,
+                               Fortran::semantics::SemanticsContext &semaCtx) {
+  auto compObj = getComponentObject(object, semaCtx);
+  while (compObj) {
+    indices.push_back(getComponentPlacementInParent(compObj->id()));
+    compObj =
+        getComponentObject(getBaseObject(compObj.value(), semaCtx), semaCtx);
+  }
+
+  indices = llvm::SmallVector<int>{llvm::reverse(indices)};
+}
+
+void addChildIndexAndMapToParent(
+    const omp::Object &object,
+    std::map<const Fortran::semantics::Symbol *,
+             llvm::SmallVector<OmpMapMemberIndicesData>> &parentMemberIndices,
+    mlir::omp::MapInfoOp &mapOp,
+    Fortran::semantics::SemanticsContext &semaCtx) {
+  std::optional<Fortran::evaluate::DataRef> dataRef =
+      ExtractDataRef(object.designator);
+  assert(dataRef.has_value() &&
+         "DataRef could not be extracted during mapping of derived type "
+         "cannot proceed");
+  const Fortran::semantics::Symbol *parentSym = &dataRef->GetFirstSymbol();
+  assert(parentSym && "Could not find parent symbol during lower of "
+                      "a component member in OpenMP map clause");
+  llvm::SmallVector<int> indices;
+  generateMemberPlacementIndices(object, indices, semaCtx);
+  parentMemberIndices[parentSym].push_back({indices, mapOp});
+}
+
+static void calculateShapeAndFillIndices(
+    llvm::SmallVectorImpl<int64_t> &shape,
+    llvm::SmallVectorImpl<OmpMapMemberIndicesData> &memberPlacementData) {
+  shape.push_back(memberPlacementData.size());
+  size_t largestIndicesSize =
+      std::max_element(memberPlacementData.begin(), memberPlacementData.end(),
+                       [](auto a, auto b) {
+                         return a.memberPlacementIndices.size() <
+                                b.memberPlacementIndices.size();
+                       })
+          ->memberPlacementIndices.size();
+  shape.push_back(largestIndicesSize);
+
+  // DenseElementsAttr expects a rectangular shape for the data, so all
+  // index lists have to be of the same length, this emplaces -1 as filler.
+  for (auto &v : memberPlacementData) {
+    if (v.memberPlacementIndices.size() < largestIndicesSize) {
+      auto *prevEnd = v.memberPlacementIndices.end();
+      v.memberPlacementIndices.resize(largestIndicesSize);
+      std::fill(prevEnd, v.memberPlacementIndices.end(), -1);
+    }
+  }
+}
+
+static mlir::DenseIntElementsAttr createDenseElementsAttrFromIndices(
+    llvm::SmallVectorImpl<OmpMapMemberIndicesData> &memberPlacementData,
+    fir::FirOpBuilder &builder) {
+  llvm::SmallVector<int64_t> shape;
+  calculateShapeAndFillIndices(shape, memberPlacementData);
+
+  llvm::SmallVector<int> indicesFlattened = std::accumulate(
+      memberPlacementData.begin(), memberPlacementData.end(),
+      llvm::SmallVector<int>(),
+      [](llvm::SmallVector<int> &x, OmpMapMemberIndicesData y) {
+        x.insert(x.end(), y.memberPlacementIndices.begin(),
+                 y.memberPlacementIndices.end());
+        return x;
+      });
+
+  return mlir::DenseIntElementsAttr::get(
+      mlir::VectorType::get(shape,
+                            mlir::IntegerType::get(builder.getContext(), 32)),
+      indicesFlattened);
+}
+
+void insertChildMapInfoIntoParent(
+    Fortran::lower::AbstractConverter &converter,
+    std::map<const Fortran::semantics::Symbol *,
+             llvm::SmallVector<OmpMapMemberIndicesData>> &parentMemberIndices,
+    llvm::SmallVectorImpl<mlir::Value> &mapOperands,
+    llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> &mapSyms,
+    llvm::SmallVectorImpl<mlir::Type> *mapSymTypes,
+    llvm::SmallVectorImpl<mlir::Location> *mapSymLocs) {
+  for (auto indices : parentMemberIndices) {
+    bool parentExists = false;
+    size_t parentIdx;
+    for (parentIdx = 0; parentIdx < mapSyms.size(); ++parentIdx) {
+      if (mapSyms[parentIdx] == indices.first) {
+        parentExists = true;
+        break;
+      }
+    }
+
+    if (parentExists) {
+      auto mapOp = llvm::cast<mlir::omp::MapInfoOp>(
+          mapOperands[parentIdx].getDefiningOp());
+
+      // NOTE: To maintain appropriate SSA ordering, we move the parent map
+      // which will now have references to its children after the last
+      // of its members to be generated. This is necessary when a user
+      // has defined a series of parent and children maps where the parent
+      // precedes the children. An alternative, may be to do
+      // delayed generation of map info operations from the clauses and
+      // organize them first before generation.
+      mapOp->moveAfter(indices.second.back().memberMap);
+
+      for (auto memberIndicesData : indices.second)
+        mapOp.getMembersMutable().append(
+            memberIndicesData.memberMap.getResult());
+
+      mapOp.setMembersIndexAttr(createDenseElementsAttrFromIndices(
+          indices.second, converter.getFirOpBuilder()));
+    } else {
+      // NOTE: We take the map type of the first child, this may not
+      // be the correct thing to do, however, we shall see. For the moment
+      // it allows this to work with enter and exit without causing MLIR
+      // verification issues. The more appropriate thing may be to take
+      // the "main" map type clause from the directive being used.
+      uint64_t mapType = indices.second[0].memberMap.getMapType().value_or(0);
+
+      // create parent to emplace and bind members
+      mlir::Value origSymbol = converter.getSymbolAddress(*indices.first);
+
+      llvm::SmallVector<mlir::Value> members;
+      for (OmpMapMemberIndicesData memberIndicesData : indices.second)
+        members.push_back((mlir::Value)memberIndicesData.memberMap);
+
+      mlir::Value mapOp = createMapInfoOp(
+          converter.getFirOpBuilder(), origSymbol.getLoc(), origSymbol,
+          /*varPtrPtr=*/mlir::Value(), indices.first->name().ToString(),
+          /*bounds=*/{}, members,
+          createDenseElementsAttrFromIndices(indices.second,
+                                             converter.getFirOpBuilder()),
+          mapType, mlir::omp::VariableCaptureKind::ByRef, origSymbol.getType(),
+          /*partialMap=*/true);
+
+      mapOperands.push_back(mapOp);
+      mapSyms.push_back(indices.first);
+
+      if (mapSymTypes)
+        mapSymTypes->push_back(mapOp.getType());
+      if (mapSymLocs)
+        mapSymLocs->push_back(mapOp.getLoc());
+    }
+  }
+}
+
 Fortran::semantics::Symbol *
 getOmpObjectSymbol(const Fortran::parser::OmpObject &ompObject) {
   Fortran::semantics::Symbol *sym = nullptr;
@@ -125,7 +332,11 @@ getOmpObjectSymbol(const Fortran::parser::OmpObject &ompObject) {
             if (auto *arrayEle =
                     Fortran::parser::Unwrap<Fortran::parser::ArrayElement>(
                         designator)) {
-              sym = GetFirstName(arrayEle->base).symbol;
+              // Use getLastName to retrieve the arrays symbol, this will
+              // provide the farthest right symbol (the last) in a designator,
+              // i.e. providing something like the following:
+              // "dtype1%dtype2%array[2:10]", will result in "array"
+              sym = GetLastName(arrayEle->base).symbol;
             } else if (auto *structComp = Fortran::parser::Unwrap<
                            Fortran::parser::StructureComponent>(designator)) {
               sym = structComp->component.symbol;
diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h
index 5e0ebba23bf3..345ce55620ee 100644
--- a/flang/lib/Lower/OpenMP/Utils.h
+++ b/flang/lib/Lower/OpenMP/Utils.h
@@ -21,7 +21,6 @@ extern llvm::cl::opt<bool> enableDelayedPrivatization;
 namespace fir {
 class FirOpBuilder;
 } // namespace fir
-
 namespace Fortran {
 
 namespace semantics {
@@ -46,13 +45,42 @@ using DeclareTargetCapturePair =
     std::pair<mlir::omp::DeclareTargetCaptureClause,
               const Fortran::semantics::Symbol &>;
 
+// A small helper structure for keeping track of a component members MapInfoOp
+// and index data when lowering OpenMP map clauses. Keeps track of the
+// placement of the component in the derived type hierarchy it rests within,
+// alongside the generated mlir::omp::MapInfoOp for the mapped component.
+struct OmpMapMemberIndicesData {
+  // The indices representing the component members placement in its derived
+  // type parents hierarchy.
+  llvm::SmallVector<int> memberPlacementIndices;
+
+  // Placement of the member in the member vector.
+  mlir::omp::MapInfoOp memberMap;
+};
+
 mlir::omp::MapInfoOp
 createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc,
                 mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name,
                 mlir::ArrayRef<mlir::Value> bounds,
-                mlir::ArrayRef<mlir::Value> members, uint64_t mapType,
+                mlir::ArrayRef<mlir::Value> members,
+                mlir::DenseIntElementsAttr membersIndex, uint64_t mapType,
                 mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy,
-                bool isVal = false);
+                bool partialMap = false);
+
+void addChildIndexAndMapToParent(
+    const omp::Object &object,
+    std::map<const Fortran::semantics::Symbol *,
+             llvm::SmallVector<OmpMapMemberIndicesData>> &parentMemberIndices,
+    mlir::omp::MapInfoOp &mapOp, Fortran::semantics::SemanticsContext &semaCtx);
+
+void insertChildMapInfoIntoParent(
+    Fortran::lower::AbstractConverter &converter,
+    std::map<const Fortran::semantics::Symbol *,
+             llvm::SmallVector<OmpMapMemberIndicesData>> &parentMemberIndices,
+    llvm::SmallVectorImpl<mlir::Value> &mapOperands,
+    llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> &mapSyms,
+    llvm::SmallVectorImpl<mlir::Type> *mapSymTypes,
+    llvm::SmallVectorImpl<mlir::Location> *mapSymLocs);
 
 mlir::Type getLoopVarType(Fortran::lower::AbstractConverter &converter,
                           std::size_t loopVarTypeSize);
@@ -65,7 +93,6 @@ void gatherFuncAndVarSyms(
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &symbolAndClause);
 
 int64_t getCollapseValue(const List<Clause> &clauses);
-uint32_t getOpenMPVersion(mlir::ModuleOp mod);
 
 Fortran::semantics::Symbol *
 getOmpObjectSymbol(const Fortran::parser::OmpObject &ompObject);
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index a6da38763726..bd018d7f015b 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -176,8 +176,9 @@ mlir::Value fir::FirOpBuilder::createRealConstant(mlir::Location loc,
   llvm_unreachable("should use builtin floating-point type");
 }
 
-static llvm::SmallVector<mlir::Value>
-elideExtentsAlreadyInType(mlir::Type type, mlir::ValueRange shape) {
+llvm::SmallVector<mlir::Value>
+fir::factory::elideExtentsAlreadyInType(mlir::Type type,
+                                        mlir::ValueRange shape) {
   auto arrTy = mlir::dyn_cast<fir::SequenceType>(type);
   if (shape.empty() || !arrTy)
     return {};
@@ -191,8 +192,9 @@ elideExtentsAlreadyInType(mlir::Type type, mlir::ValueRange shape) {
   return dynamicShape;
 }
 
-static llvm::SmallVector<mlir::Value>
-elideLengthsAlreadyInType(mlir::Type type, mlir::ValueRange lenParams) {
+llvm::SmallVector<mlir::Value>
+fir::factory::elideLengthsAlreadyInType(mlir::Type type,
+                                        mlir::ValueRange lenParams) {
   if (lenParams.empty())
     return {};
   if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(type))
@@ -211,9 +213,9 @@ mlir::Value fir::FirOpBuilder::allocateLocal(
   // Convert the shape extents to `index`, as needed.
   llvm::SmallVector<mlir::Value> indices;
   llvm::SmallVector<mlir::Value> elidedShape =
-      elideExtentsAlreadyInType(ty, shape);
+      fir::factory::elideExtentsAlreadyInType(ty, shape);
   llvm::SmallVector<mlir::Value> elidedLenParams =
-      elideLengthsAlreadyInType(ty, lenParams);
+      fir::factory::elideLengthsAlreadyInType(ty, lenParams);
   auto idxTy = getIndexType();
   for (mlir::Value sh : elidedShape)
     indices.push_back(createConvert(loc, idxTy, sh));
@@ -283,9 +285,9 @@ fir::FirOpBuilder::createTemporary(mlir::Location loc, mlir::Type type,
                                    mlir::ValueRange lenParams,
                                    llvm::ArrayRef<mlir::NamedAttribute> attrs) {
   llvm::SmallVector<mlir::Value> dynamicShape =
-      elideExtentsAlreadyInType(type, shape);
+      fir::factory::elideExtentsAlreadyInType(type, shape);
   llvm::SmallVector<mlir::Value> dynamicLength =
-      elideLengthsAlreadyInType(type, lenParams);
+      fir::factory::elideLengthsAlreadyInType(type, lenParams);
   InsertPoint insPt;
   const bool hoistAlloc = dynamicShape.empty() && dynamicLength.empty();
   if (hoistAlloc) {
@@ -306,9 +308,9 @@ mlir::Value fir::FirOpBuilder::createHeapTemporary(
     mlir::ValueRange shape, mlir::ValueRange lenParams,
     llvm::ArrayRef<mlir::NamedAttribute> attrs) {
   llvm::SmallVector<mlir::Value> dynamicShape =
-      elideExtentsAlreadyInType(type, shape);
+      fir::factory::elideExtentsAlreadyInType(type, shape);
   llvm::SmallVector<mlir::Value> dynamicLength =
-      elideLengthsAlreadyInType(type, lenParams);
+      fir::factory::elideLengthsAlreadyInType(type, lenParams);
 
   assert(!mlir::isa<fir::ReferenceType>(type) && "cannot be a reference");
   return create<fir::AllocMemOp>(loc, type, /*unique_name=*/llvm::StringRef{},
@@ -660,7 +662,8 @@ mlir::Value fir::FirOpBuilder::createBox(mlir::Location loc, mlir::Type boxType,
   mlir::Type valueOrSequenceType = fir::unwrapPassByRefType(boxType);
   return create<fir::EmboxOp>(
       loc, boxType, addr, shape, slice,
-      elideLengthsAlreadyInType(valueOrSequenceType, lengths), tdesc);
+      fir::factory::elideLengthsAlreadyInType(valueOrSequenceType, lengths),
+      tdesc);
 }
 
 void fir::FirOpBuilder::dumpFunc() { getFunction().dump(); }
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 44779427ab55..8fdab2a57181 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -198,7 +198,7 @@ mlir::Value hlfir::Entity::getFirBase() const {
 fir::FortranVariableOpInterface
 hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder,
                   const fir::ExtendedValue &exv, llvm::StringRef name,
-                  fir::FortranVariableFlagsAttr flags,
+                  fir::FortranVariableFlagsAttr flags, mlir::Value dummyScope,
                   fir::CUDADataAttributeAttr cudaAttr) {
 
   mlir::Value base = fir::getBase(exv);
@@ -229,7 +229,7 @@ hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder,
       },
       [](const auto &) {});
   auto declareOp = builder.create<hlfir::DeclareOp>(
-      loc, base, name, shapeOrShift, lenParams, flags, cudaAttr);
+      loc, base, name, shapeOrShift, lenParams, dummyScope, flags, cudaAttr);
   return mlir::cast<fir::FortranVariableOpInterface>(declareOp.getOperation());
 }
 
@@ -844,10 +844,9 @@ hlfir::LoopNest hlfir::genLoopNest(mlir::Location loc,
   return loopNest;
 }
 
-static fir::ExtendedValue
-translateVariableToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
-                                 hlfir::Entity variable,
-                                 bool forceHlfirBase = false) {
+static fir::ExtendedValue translateVariableToExtendedValue(
+    mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity variable,
+    bool forceHlfirBase = false, bool contiguousHint = false) {
   assert(variable.isVariable() && "must be a variable");
   /// When going towards FIR, use the original base value to avoid
   /// introducing descriptors at runtime when they are not required.
@@ -858,7 +857,8 @@ translateVariableToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
                                 fir::MutableProperties{});
 
   if (mlir::isa<fir::BaseBoxType>(base.getType())) {
-    if (!variable.isSimplyContiguous() || variable.isPolymorphic() ||
+    bool contiguous = variable.isSimplyContiguous() || contiguousHint;
+    if (!contiguous || variable.isPolymorphic() ||
         variable.isDerivedWithLengthParameters() || variable.isOptional()) {
       llvm::SmallVector<mlir::Value> nonDefaultLbounds =
           getNonDefaultLowerBounds(loc, builder, variable);
@@ -907,9 +907,10 @@ hlfir::translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
 
 std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>>
 hlfir::translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
-                                hlfir::Entity entity) {
+                                hlfir::Entity entity, bool contiguousHint) {
   if (entity.isVariable())
-    return {translateVariableToExtendedValue(loc, builder, entity),
+    return {translateVariableToExtendedValue(loc, builder, entity, false,
+                                             contiguousHint),
             std::nullopt};
 
   if (entity.isProcedure()) {
@@ -1095,8 +1096,9 @@ hlfir::createTempFromMold(mlir::Location loc, fir::FirOpBuilder &builder,
                                     /*shape=*/std::nullopt, lenParams);
     isHeapAlloc = builder.createBool(loc, false);
   }
-  auto declareOp = builder.create<hlfir::DeclareOp>(loc, alloc, tmpName, shape,
-                                                    lenParams, declAttrs);
+  auto declareOp =
+      builder.create<hlfir::DeclareOp>(loc, alloc, tmpName, shape, lenParams,
+                                       /*dummy_scope=*/nullptr, declAttrs);
   if (mold.isPolymorphic()) {
     int rank = mold.getRank();
     // TODO: should probably read rank from the mold.
@@ -1133,8 +1135,9 @@ hlfir::Entity hlfir::createStackTempFromMold(mlir::Location loc,
     alloc = builder.createTemporary(loc, mold.getFortranElementType(), tmpName,
                                     /*shape=*/std::nullopt, lenParams);
   }
-  auto declareOp = builder.create<hlfir::DeclareOp>(loc, alloc, tmpName, shape,
-                                                    lenParams, declAttrs);
+  auto declareOp =
+      builder.create<hlfir::DeclareOp>(loc, alloc, tmpName, shape, lenParams,
+                                       /*dummy_scope=*/nullptr, declAttrs);
   return hlfir::Entity{declareOp.getBase()};
 }
 
@@ -1152,7 +1155,7 @@ hlfir::convertCharacterKind(mlir::Location loc, fir::FirOpBuilder &builder,
   return hlfir::EntityWithAttributes{builder.create<hlfir::DeclareOp>(
       loc, res.getAddr(), ".temp.kindconvert", /*shape=*/nullptr,
       /*typeparams=*/mlir::ValueRange{res.getLen()},
-      fir::FortranVariableFlagsAttr{})};
+      /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{})};
 }
 
 std::pair<hlfir::Entity, std::optional<hlfir::CleanupFunction>>
@@ -1224,7 +1227,8 @@ hlfir::genTypeAndKindConvert(mlir::Location loc, fir::FirOpBuilder &builder,
         builder.create<fir::ShapeShiftOp>(loc, shapeShiftType, lbAndExtents);
     auto declareOp = builder.create<hlfir::DeclareOp>(
         loc, associate.getFirBase(), *associate.getUniqName(), shapeShift,
-        associate.getTypeparams(), /*flags=*/fir::FortranVariableFlagsAttr{});
+        associate.getTypeparams(), /*dummy_scope=*/nullptr,
+        /*flags=*/fir::FortranVariableFlagsAttr{});
     hlfir::Entity castWithLbounds =
         mlir::cast<fir::FortranVariableOpInterface>(declareOp.getOperation());
     fir::FirOpBuilder *bldr = &builder;
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 9d72e76e2369..58064d23eb08 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -1607,7 +1607,7 @@ static bool isIntrinsicModuleProcedure(llvm::StringRef name) {
 static bool isCoarrayIntrinsic(llvm::StringRef name) {
   return name.starts_with("atomic_") || name.starts_with("co_") ||
          name.contains("image") || name.ends_with("cobound") ||
-         name.equals("team_number");
+         name == "team_number";
 }
 
 /// Return the generic name of an intrinsic module procedure specific name.
@@ -6306,11 +6306,9 @@ void IntrinsicLibrary::genSystem(llvm::ArrayRef<fir::ExtendedValue> args) {
   // Create a dummmy cmdstat to prevent EXECUTE_COMMAND_LINE terminate itself
   // when cmdstat is assigned with a non-zero value but not present
   mlir::Value tempValue =
-      builder.createIntegerConstant(loc, builder.getI2Type(), 0);
+      builder.createIntegerConstant(loc, builder.getI16Type(), 0);
   mlir::Value temp = builder.createTemporary(loc, builder.getI16Type());
-  mlir::Value castVal =
-      builder.createConvert(loc, builder.getI16Type(), tempValue);
-  builder.create<fir::StoreOp>(loc, castVal, temp);
+  builder.create<fir::StoreOp>(loc, tempValue, temp);
   mlir::Value cmdstatBox = builder.createBox(loc, temp);
 
   mlir::Value cmdmsgBox =
diff --git a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
index dbc285ce9e22..d34dad52c28b 100644
--- a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
+++ b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
@@ -83,7 +83,8 @@ fir::factory::HomogeneousScalarStack::HomogeneousScalarStack(
   mlir::Value shape = builder.genShape(loc, extents);
   temp = builder
              .create<hlfir::DeclareOp>(loc, tempStorage, tempName, shape,
-                                       lengths, fir::FortranVariableFlagsAttr{})
+                                       lengths, /*dummy_scope=*/nullptr,
+                                       fir::FortranVariableFlagsAttr{})
              .getBase();
 }
 
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index b4705aa47992..21154902d23f 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -3497,7 +3497,7 @@ public:
     rewriter.startOpModification(op);
     auto callee = op.getCallee();
     if (callee)
-      if (callee->equals("hypotf"))
+      if (*callee == "hypotf")
         op.setCalleeAttr(mlir::SymbolRefAttr::get(op.getContext(), "_hypotf"));
 
     rewriter.finalizeOpModification(op);
@@ -3514,7 +3514,7 @@ public:
   matchAndRewrite(mlir::LLVM::LLVMFuncOp op,
                   mlir::PatternRewriter &rewriter) const override {
     rewriter.startOpModification(op);
-    if (op.getSymName().equals("hypotf"))
+    if (op.getSymName() == "hypotf")
       op.setSymNameAttr(rewriter.getStringAttr("_hypotf"));
     rewriter.finalizeOpModification(op);
     return mlir::success();
@@ -3629,11 +3629,11 @@ public:
             auto callee = op.getCallee();
             if (!callee)
               return true;
-            return !callee->equals("hypotf");
+            return *callee != "hypotf";
           });
       target.addDynamicallyLegalOp<mlir::LLVM::LLVMFuncOp>(
           [](mlir::LLVM::LLVMFuncOp op) {
-            return !op.getSymName().equals("hypotf");
+            return op.getSymName() != "hypotf";
           });
     }
 
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 6773d0adced0..edf7f7f4b1a9 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -4033,6 +4033,34 @@ mlir::LogicalResult fir::CUDADeallocateOp::verify() {
   return mlir::success();
 }
 
+void fir::CUDAAllocOp::build(
+    mlir::OpBuilder &builder, mlir::OperationState &result, mlir::Type inType,
+    llvm::StringRef uniqName, llvm::StringRef bindcName,
+    fir::CUDADataAttributeAttr cudaAttr, mlir::ValueRange typeparams,
+    mlir::ValueRange shape, llvm::ArrayRef<mlir::NamedAttribute> attributes) {
+  mlir::StringAttr nameAttr =
+      uniqName.empty() ? mlir::StringAttr{} : builder.getStringAttr(uniqName);
+  mlir::StringAttr bindcAttr =
+      bindcName.empty() ? mlir::StringAttr{} : builder.getStringAttr(bindcName);
+  build(builder, result, wrapAllocaResultType(inType),
+        mlir::TypeAttr::get(inType), nameAttr, bindcAttr, typeparams, shape,
+        cudaAttr);
+  result.addAttributes(attributes);
+}
+
+template <typename Op>
+static mlir::LogicalResult checkCudaAttr(Op op) {
+  if (op.getCudaAttr() == fir::CUDADataAttribute::Device ||
+      op.getCudaAttr() == fir::CUDADataAttribute::Managed ||
+      op.getCudaAttr() == fir::CUDADataAttribute::Unified)
+    return mlir::success();
+  return op.emitOpError("expect device, managed or unified cuda attribute");
+}
+
+mlir::LogicalResult fir::CUDAAllocOp::verify() { return checkCudaAttr(*this); }
+
+mlir::LogicalResult fir::CUDAFreeOp::verify() { return checkCudaAttr(*this); }
+
 //===----------------------------------------------------------------------===//
 // FIROpsDialect
 //===----------------------------------------------------------------------===//
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
index 4b586ad1d3a4..c232ae165d4c 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
@@ -125,6 +125,7 @@ void hlfir::DeclareOp::build(mlir::OpBuilder &builder,
                              mlir::OperationState &result, mlir::Value memref,
                              llvm::StringRef uniq_name, mlir::Value shape,
                              mlir::ValueRange typeparams,
+                             mlir::Value dummy_scope,
                              fir::FortranVariableFlagsAttr fortran_attrs,
                              fir::CUDADataAttributeAttr cuda_attr) {
   auto nameAttr = builder.getStringAttr(uniq_name);
@@ -133,8 +134,7 @@ void hlfir::DeclareOp::build(mlir::OpBuilder &builder,
   mlir::Type hlfirVariableType =
       getHLFIRVariableType(inputType, hasExplicitLbs);
   build(builder, result, {hlfirVariableType, inputType}, memref, shape,
-        typeparams, /*dummy_scope=*/nullptr, nameAttr, fortran_attrs,
-        cuda_attr);
+        typeparams, dummy_scope, nameAttr, fortran_attrs, cuda_attr);
 }
 
 mlir::LogicalResult hlfir::DeclareOp::verify() {
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
index d4e4835ee726..76b42c57277b 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
@@ -122,9 +122,10 @@ createArrayTemp(mlir::Location loc, fir::FirOpBuilder &builder,
         fir::FortranVariableFlagsAttr::get(
             builder.getContext(), fir::FortranVariableFlagsEnum::allocatable);
 
-    auto declareOp = builder.create<hlfir::DeclareOp>(loc, alloc, tmpName,
-                                                      /*shape=*/nullptr,
-                                                      lenParams, declAttrs);
+    auto declareOp =
+        builder.create<hlfir::DeclareOp>(loc, alloc, tmpName,
+                                         /*shape=*/nullptr, lenParams,
+                                         /*dummy_scope=*/nullptr, declAttrs);
 
     int rank = extents.size();
     fir::runtime::genAllocatableApplyMold(builder, loc, alloc,
@@ -152,9 +153,9 @@ createArrayTemp(mlir::Location loc, fir::FirOpBuilder &builder,
 
   mlir::Value allocmem = builder.createHeapTemporary(loc, sequenceType, tmpName,
                                                      extents, lenParams);
-  auto declareOp =
-      builder.create<hlfir::DeclareOp>(loc, allocmem, tmpName, shape, lenParams,
-                                       fir::FortranVariableFlagsAttr{});
+  auto declareOp = builder.create<hlfir::DeclareOp>(
+      loc, allocmem, tmpName, shape, lenParams,
+      /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{});
   mlir::Value trueVal = builder.createBool(loc, true);
   return {hlfir::Entity{declareOp.getBase()}, trueVal};
 }
@@ -331,7 +332,7 @@ struct SetLengthOpConversion
                                           /*shape=*/std::nullopt, lenParams);
     auto declareOp = builder.create<hlfir::DeclareOp>(
         loc, alloca, tmpName, /*shape=*/mlir::Value{}, lenParams,
-        fir::FortranVariableFlagsAttr{});
+        /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{});
     hlfir::Entity temp{declareOp.getBase()};
     // Assign string value to the created temp.
     builder.create<hlfir::AssignOp>(loc, string, temp,
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 5a542f237f8f..308b5ed06623 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -17,8 +17,8 @@ add_flang_library(FIRTransforms
   AddDebugInfo.cpp
   PolymorphicOpConversion.cpp
   LoopVersioning.cpp
-  OMPDescriptorMapInfoGen.cpp
   OMPFunctionFiltering.cpp
+  OMPMapInfoFinalization.cpp
   OMPMarkDeclareTarget.cpp
   VScaleAttr.cpp
   FunctionAttr.cpp
diff --git a/flang/lib/Optimizer/Transforms/OMPDescriptorMapInfoGen.cpp b/flang/lib/Optimizer/Transforms/OMPDescriptorMapInfoGen.cpp
deleted file mode 100644
index 6ffcf0746c76..000000000000
--- a/flang/lib/Optimizer/Transforms/OMPDescriptorMapInfoGen.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-//===- OMPDescriptorMapInfoGen.cpp
-//---------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-/// \file
-/// An OpenMP dialect related pass for FIR/HLFIR which expands MapInfoOp's
-/// containing descriptor related types (fir::BoxType's) into multiple
-/// MapInfoOp's containing the parent descriptor and pointer member components
-/// for individual mapping, treating the descriptor type as a record type for
-/// later lowering in the OpenMP dialect.
-//===----------------------------------------------------------------------===//
-
-#include "flang/Optimizer/Builder/FIRBuilder.h"
-#include "flang/Optimizer/Dialect/FIRType.h"
-#include "flang/Optimizer/Dialect/Support/KindMapping.h"
-#include "flang/Optimizer/Transforms/Passes.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
-#include "mlir/IR/BuiltinDialect.h"
-#include "mlir/IR/BuiltinOps.h"
-#include "mlir/IR/Operation.h"
-#include "mlir/IR/SymbolTable.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Support/LLVM.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include <iterator>
-
-namespace fir {
-#define GEN_PASS_DEF_OMPDESCRIPTORMAPINFOGENPASS
-#include "flang/Optimizer/Transforms/Passes.h.inc"
-} // namespace fir
-
-namespace {
-class OMPDescriptorMapInfoGenPass
-    : public fir::impl::OMPDescriptorMapInfoGenPassBase<
-          OMPDescriptorMapInfoGenPass> {
-
-  void genDescriptorMemberMaps(mlir::omp::MapInfoOp op,
-                               fir::FirOpBuilder &builder,
-                               mlir::Operation *target) {
-    mlir::Location loc = builder.getUnknownLoc();
-    mlir::Value descriptor = op.getVarPtr();
-
-    // If we enter this function, but the mapped type itself is not the
-    // descriptor, then it's likely the address of the descriptor so we
-    // must retrieve the descriptor SSA.
-    if (!fir::isTypeWithDescriptor(op.getVarType())) {
-      if (auto addrOp = mlir::dyn_cast_if_present<fir::BoxAddrOp>(
-              op.getVarPtr().getDefiningOp())) {
-        descriptor = addrOp.getVal();
-      }
-    }
-
-    // The fir::BoxOffsetOp only works with !fir.ref<!fir.box<...>> types, as
-    // allowing it to access non-reference box operations can cause some
-    // problematic SSA IR. However, in the case of assumed shape's the type
-    // is not a !fir.ref, in these cases to retrieve the appropriate
-    // !fir.ref<!fir.box<...>> to access the data we need to map we must
-    // perform an alloca and then store to it and retrieve the data from the new
-    // alloca.
-    if (mlir::isa<fir::BaseBoxType>(descriptor.getType())) {
-      mlir::OpBuilder::InsertPoint insPt = builder.saveInsertionPoint();
-      builder.setInsertionPointToStart(builder.getAllocaBlock());
-      auto alloca = builder.create<fir::AllocaOp>(loc, descriptor.getType());
-      builder.restoreInsertionPoint(insPt);
-      builder.create<fir::StoreOp>(loc, descriptor, alloca);
-      descriptor = alloca;
-    }
-
-    mlir::Value baseAddrAddr = builder.create<fir::BoxOffsetOp>(
-        loc, descriptor, fir::BoxFieldAttr::base_addr);
-
-    // Member of the descriptor pointing at the allocated data
-    mlir::Value baseAddr = builder.create<mlir::omp::MapInfoOp>(
-        loc, baseAddrAddr.getType(), descriptor,
-        llvm::cast<mlir::omp::PointerLikeType>(
-            fir::unwrapRefType(baseAddrAddr.getType()))
-            .getElementType(),
-        baseAddrAddr, mlir::SmallVector<mlir::Value>{}, op.getBounds(),
-        builder.getIntegerAttr(builder.getIntegerType(64, false),
-                               op.getMapType().value()),
-        builder.getAttr<mlir::omp::VariableCaptureKindAttr>(
-            mlir::omp::VariableCaptureKind::ByRef),
-        builder.getStringAttr("") /*name*/);
-
-    // TODO: map the addendum segment of the descriptor, similarly to the
-    // above base address/data pointer member.
-
-    if (auto mapClauseOwner =
-            llvm::dyn_cast<mlir::omp::MapClauseOwningOpInterface>(target)) {
-      llvm::SmallVector<mlir::Value> newMapOps;
-      mlir::OperandRange mapOperandsArr = mapClauseOwner.getMapOperands();
-
-      for (size_t i = 0; i < mapOperandsArr.size(); ++i) {
-        if (mapOperandsArr[i] == op) {
-          // Push new implicit maps generated for the descriptor.
-          newMapOps.push_back(baseAddr);
-
-          // for TargetOp's which have IsolatedFromAbove we must align the
-          // new additional map operand with an appropriate BlockArgument,
-          // as the printing and later processing currently requires a 1:1
-          // mapping of BlockArgs to MapInfoOp's at the same placement in
-          // each array (BlockArgs and MapOperands).
-          if (auto targetOp = llvm::dyn_cast<mlir::omp::TargetOp>(target))
-            targetOp.getRegion().insertArgument(i, baseAddr.getType(), loc);
-        }
-        newMapOps.push_back(mapOperandsArr[i]);
-      }
-      mapClauseOwner.getMapOperandsMutable().assign(newMapOps);
-    }
-
-    mlir::Value newDescParentMapOp = builder.create<mlir::omp::MapInfoOp>(
-        op->getLoc(), op.getResult().getType(), descriptor,
-        fir::unwrapRefType(descriptor.getType()), mlir::Value{},
-        mlir::SmallVector<mlir::Value>{baseAddr},
-        mlir::SmallVector<mlir::Value>{},
-        builder.getIntegerAttr(builder.getIntegerType(64, false),
-                               op.getMapType().value()),
-        op.getMapCaptureTypeAttr(), op.getNameAttr());
-    op.replaceAllUsesWith(newDescParentMapOp);
-    op->erase();
-  }
-
-  // This pass executes on mlir::ModuleOp's finding omp::MapInfoOp's containing
-  // descriptor based types (allocatables, pointers, assumed shape etc.) and
-  // expanding them into multiple omp::MapInfoOp's for each pointer member
-  // contained within the descriptor.
-  void runOnOperation() override {
-    mlir::func::FuncOp func = getOperation();
-    mlir::ModuleOp module = func->getParentOfType<mlir::ModuleOp>();
-    fir::KindMapping kindMap = fir::getKindMapping(module);
-    fir::FirOpBuilder builder{module, std::move(kindMap)};
-
-    func->walk([&](mlir::omp::MapInfoOp op) {
-      if (fir::isTypeWithDescriptor(op.getVarType()) ||
-          mlir::isa_and_present<fir::BoxAddrOp>(
-              op.getVarPtr().getDefiningOp())) {
-        builder.setInsertionPoint(op);
-        // TODO: Currently only supports a single user for the MapInfoOp, this
-        // is fine for the moment as the Fortran Frontend will generate a
-        // new MapInfoOp per Target operation for the moment. However, when/if
-        // we optimise/cleanup the IR, it likely isn't too difficult to
-        // extend this function, it would require some modification to create a
-        // single new MapInfoOp per new MapInfoOp generated and share it across
-        // all users appropriately, making sure to only add a single member link
-        // per new generation for the original originating descriptor MapInfoOp.
-        assert(llvm::hasSingleElement(op->getUsers()) &&
-               "OMPDescriptorMapInfoGen currently only supports single users "
-               "of a MapInfoOp");
-        genDescriptorMemberMaps(op, builder, *op->getUsers().begin());
-      }
-    });
-  }
-};
-
-} // namespace
-
-namespace fir {
-std::unique_ptr<mlir::Pass> createOMPDescriptorMapInfoGenPass() {
-  return std::make_unique<OMPDescriptorMapInfoGenPass>();
-}
-} // namespace fir
diff --git a/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp b/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp
new file mode 100644
index 000000000000..5a5d4b9e0da4
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp
@@ -0,0 +1,261 @@
+//===- OMPMapInfoFinalization.cpp
+//---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+/// \file
+/// An OpenMP dialect related pass for FIR/HLFIR which performs some
+/// pre-processing of MapInfoOp's after the module has been lowered to
+/// finalize them.
+///
+/// For example, it expands MapInfoOp's containing descriptor related
+/// types (fir::BoxType's) into multiple MapInfoOp's containing the parent
+/// descriptor and pointer member components for individual mapping,
+/// treating the descriptor type as a record type for later lowering in the
+/// OpenMP dialect.
+///
+/// The pass also adds MapInfoOp's that are members of a parent object but are
+/// not directly used in the body of a target region to its BlockArgument list
+/// to maintain consistency across all MapInfoOp's tied to a region directly or
+/// indirectly via a parent object.
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Dialect/Support/KindMapping.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/BuiltinDialect.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include <iterator>
+
+namespace fir {
+#define GEN_PASS_DEF_OMPMAPINFOFINALIZATIONPASS
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+namespace {
+class OMPMapInfoFinalizationPass
+    : public fir::impl::OMPMapInfoFinalizationPassBase<
+          OMPMapInfoFinalizationPass> {
+
+  void genDescriptorMemberMaps(mlir::omp::MapInfoOp op,
+                               fir::FirOpBuilder &builder,
+                               mlir::Operation *target) {
+    mlir::Location loc = op.getLoc();
+    mlir::Value descriptor = op.getVarPtr();
+
+    // If we enter this function, but the mapped type itself is not the
+    // descriptor, then it's likely the address of the descriptor so we
+    // must retrieve the descriptor SSA.
+    if (!fir::isTypeWithDescriptor(op.getVarType())) {
+      if (auto addrOp = mlir::dyn_cast_if_present<fir::BoxAddrOp>(
+              op.getVarPtr().getDefiningOp())) {
+        descriptor = addrOp.getVal();
+      }
+    }
+
+    // The fir::BoxOffsetOp only works with !fir.ref<!fir.box<...>> types, as
+    // allowing it to access non-reference box operations can cause some
+    // problematic SSA IR. However, in the case of assumed shape's the type
+    // is not a !fir.ref, in these cases to retrieve the appropriate
+    // !fir.ref<!fir.box<...>> to access the data we need to map we must
+    // perform an alloca and then store to it and retrieve the data from the new
+    // alloca.
+    if (mlir::isa<fir::BaseBoxType>(descriptor.getType())) {
+      mlir::OpBuilder::InsertPoint insPt = builder.saveInsertionPoint();
+      builder.setInsertionPointToStart(builder.getAllocaBlock());
+      auto alloca = builder.create<fir::AllocaOp>(loc, descriptor.getType());
+      builder.restoreInsertionPoint(insPt);
+      builder.create<fir::StoreOp>(loc, descriptor, alloca);
+      descriptor = alloca;
+    }
+
+    mlir::Value baseAddrAddr = builder.create<fir::BoxOffsetOp>(
+        loc, descriptor, fir::BoxFieldAttr::base_addr);
+
+    // Member of the descriptor pointing at the allocated data
+    mlir::Value baseAddr = builder.create<mlir::omp::MapInfoOp>(
+        loc, baseAddrAddr.getType(), descriptor,
+        mlir::TypeAttr::get(llvm::cast<mlir::omp::PointerLikeType>(
+                                fir::unwrapRefType(baseAddrAddr.getType()))
+                                .getElementType()),
+        baseAddrAddr, /*members=*/mlir::SmallVector<mlir::Value>{},
+        /*member_index=*/mlir::DenseIntElementsAttr{}, op.getBounds(),
+        builder.getIntegerAttr(builder.getIntegerType(64, false),
+                               op.getMapType().value()),
+        builder.getAttr<mlir::omp::VariableCaptureKindAttr>(
+            mlir::omp::VariableCaptureKind::ByRef),
+        /*name=*/builder.getStringAttr(""),
+        /*partial_map=*/builder.getBoolAttr(false));
+
+    // TODO: map the addendum segment of the descriptor, similarly to the
+    // above base address/data pointer member.
+
+    if (auto mapClauseOwner =
+            llvm::dyn_cast<mlir::omp::MapClauseOwningOpInterface>(target)) {
+      llvm::SmallVector<mlir::Value> newMapOps;
+      mlir::OperandRange mapOperandsArr = mapClauseOwner.getMapOperands();
+
+      for (size_t i = 0; i < mapOperandsArr.size(); ++i) {
+        if (mapOperandsArr[i] == op) {
+          // Push new implicit maps generated for the descriptor.
+          newMapOps.push_back(baseAddr);
+
+          // for TargetOp's which have IsolatedFromAbove we must align the
+          // new additional map operand with an appropriate BlockArgument,
+          // as the printing and later processing currently requires a 1:1
+          // mapping of BlockArgs to MapInfoOp's at the same placement in
+          // each array (BlockArgs and MapOperands).
+          if (auto targetOp = llvm::dyn_cast<mlir::omp::TargetOp>(target))
+            targetOp.getRegion().insertArgument(i, baseAddr.getType(), loc);
+        }
+        newMapOps.push_back(mapOperandsArr[i]);
+      }
+      mapClauseOwner.getMapOperandsMutable().assign(newMapOps);
+    }
+
+    mlir::Value newDescParentMapOp = builder.create<mlir::omp::MapInfoOp>(
+        op->getLoc(), op.getResult().getType(), descriptor,
+        mlir::TypeAttr::get(fir::unwrapRefType(descriptor.getType())),
+        /*varPtrPtr=*/mlir::Value{},
+        /*members=*/mlir::SmallVector<mlir::Value>{baseAddr},
+        /*members_index=*/
+        mlir::DenseIntElementsAttr::get(
+            mlir::VectorType::get(
+                llvm::ArrayRef<int64_t>({1, 1}),
+                mlir::IntegerType::get(builder.getContext(), 32)),
+            llvm::ArrayRef<int32_t>({0})),
+        /*bounds=*/mlir::SmallVector<mlir::Value>{},
+        builder.getIntegerAttr(builder.getIntegerType(64, false),
+                               op.getMapType().value()),
+        op.getMapCaptureTypeAttr(), op.getNameAttr(), op.getPartialMapAttr());
+    op.replaceAllUsesWith(newDescParentMapOp);
+    op->erase();
+  }
+
+  // We add all mapped record members not directly used in the target region
+  // to the block arguments in front of their parent and we place them into
+  // the map operands list for consistency.
+  //
+  // These indirect uses (via accesses to their parent) will still be
+  // mapped individually in most cases, and a parent mapping doesn't
+  // guarantee the parent will be mapped in its totality, partial
+  // mapping is common.
+  //
+  // For example:
+  //    map(tofrom: x%y)
+  //
+  // Will generate a mapping for "x" (the parent) and "y" (the member).
+  // The parent "x" will not be mapped, but the member "y" will.
+  // However, we must have the parent as a BlockArg and MapOperand
+  // in these cases, to maintain the correct uses within the region and
+  // to help tracking that the member is part of a larger object.
+  //
+  // In the case of:
+  //    map(tofrom: x%y, x%z)
+  //
+  // The parent member becomes more critical, as we perform a partial
+  // structure mapping where we link the mapping of the members y
+  // and z together via the parent x. We do this at a kernel argument
+  // level in LLVM IR and not just MLIR, which is important to maintain
+  // similarity to Clang and for the runtime to do the correct thing.
+  // However, we still do not map the structure in its totality but
+  // rather we generate an un-sized "binding" map entry for it.
+  //
+  // In the case of:
+  //    map(tofrom: x, x%y, x%z)
+  //
+  // We do actually map the entirety of "x", so the explicit mapping of
+  // x%y, x%z becomes unnecessary. It is redundant to write this from a
+  // Fortran OpenMP perspective (although it is legal), as even if the
+  // members were allocatables or pointers, we are mandated by the
+  // specification to map these (and any recursive components) in their
+  // entirety, which is different to the C++ equivalent, which requires
+  // explicit mapping of these segments.
+  void addImplicitMembersToTarget(mlir::omp::MapInfoOp op,
+                                  fir::FirOpBuilder &builder,
+                                  mlir::Operation *target) {
+    auto mapClauseOwner =
+        llvm::dyn_cast<mlir::omp::MapClauseOwningOpInterface>(target);
+    if (!mapClauseOwner)
+      return;
+
+    llvm::SmallVector<mlir::Value> newMapOps;
+    mlir::OperandRange mapOperandsArr = mapClauseOwner.getMapOperands();
+    auto targetOp = llvm::dyn_cast<mlir::omp::TargetOp>(target);
+
+    for (size_t i = 0; i < mapOperandsArr.size(); ++i) {
+      if (mapOperandsArr[i] == op) {
+        for (auto [j, mapMember] : llvm::enumerate(op.getMembers())) {
+          newMapOps.push_back(mapMember);
+          // for TargetOp's which have IsolatedFromAbove we must align the
+          // new additional map operand with an appropriate BlockArgument,
+          // as the printing and later processing currently requires a 1:1
+          // mapping of BlockArgs to MapInfoOp's at the same placement in
+          // each array (BlockArgs and MapOperands).
+          if (targetOp) {
+            targetOp.getRegion().insertArgument(i + j, mapMember.getType(),
+                                                targetOp->getLoc());
+          }
+        }
+      }
+      newMapOps.push_back(mapOperandsArr[i]);
+    }
+    mapClauseOwner.getMapOperandsMutable().assign(newMapOps);
+  }
+
+  // This pass executes on mlir::ModuleOp's finding omp::MapInfoOp's containing
+  // descriptor based types (allocatables, pointers, assumed shape etc.) and
+  // expanding them into multiple omp::MapInfoOp's for each pointer member
+  // contained within the descriptor.
+  void runOnOperation() override {
+    mlir::func::FuncOp func = getOperation();
+    mlir::ModuleOp module = func->getParentOfType<mlir::ModuleOp>();
+    fir::KindMapping kindMap = fir::getKindMapping(module);
+    fir::FirOpBuilder builder{module, std::move(kindMap)};
+
+    func->walk([&](mlir::omp::MapInfoOp op) {
+      // TODO: Currently only supports a single user for the MapInfoOp, this
+      // is fine for the moment as the Fortran Frontend will generate a
+      // new MapInfoOp per Target operation for the moment. However, when/if
+      // we optimise/cleanup the IR, it likely isn't too difficult to
+      // extend this function, it would require some modification to create a
+      // single new MapInfoOp per new MapInfoOp generated and share it across
+      // all users appropriately, making sure to only add a single member link
+      // per new generation for the original originating descriptor MapInfoOp.
+      assert(llvm::hasSingleElement(op->getUsers()) &&
+             "OMPMapInfoFinalization currently only supports single users "
+             "of a MapInfoOp");
+
+      if (!op.getMembers().empty()) {
+        addImplicitMembersToTarget(op, builder, *op->getUsers().begin());
+      } else if (fir::isTypeWithDescriptor(op.getVarType()) ||
+                 mlir::isa_and_present<fir::BoxAddrOp>(
+                     op.getVarPtr().getDefiningOp())) {
+        builder.setInsertionPoint(op);
+        genDescriptorMemberMaps(op, builder, *op->getUsers().begin());
+      }
+    });
+  }
+};
+
+} // namespace
+
+namespace fir {
+std::unique_ptr<mlir::Pass> createOMPMapInfoFinalizationPass() {
+  return std::make_unique<OMPMapInfoFinalizationPass>();
+}
+} // namespace fir
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index eae478416914..48f213794247 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -634,18 +634,20 @@ TYPE_PARSER(
 
 // Declarative constructs
 TYPE_PARSER(startOmpLine >>
-    sourced(construct<OpenMPDeclarativeConstruct>(
-                Parser<OpenMPDeclareReductionConstruct>{}) ||
-        construct<OpenMPDeclarativeConstruct>(
-            Parser<OpenMPDeclareSimdConstruct>{}) ||
-        construct<OpenMPDeclarativeConstruct>(
-            Parser<OpenMPDeclareTargetConstruct>{}) ||
-        construct<OpenMPDeclarativeConstruct>(
-            Parser<OpenMPDeclarativeAllocate>{}) ||
-        construct<OpenMPDeclarativeConstruct>(
-            Parser<OpenMPRequiresConstruct>{}) ||
-        construct<OpenMPDeclarativeConstruct>(Parser<OpenMPThreadprivate>{})) /
-        endOmpLine)
+    withMessage("expected OpenMP construct"_err_en_US,
+        sourced(construct<OpenMPDeclarativeConstruct>(
+                    Parser<OpenMPDeclareReductionConstruct>{}) ||
+            construct<OpenMPDeclarativeConstruct>(
+                Parser<OpenMPDeclareSimdConstruct>{}) ||
+            construct<OpenMPDeclarativeConstruct>(
+                Parser<OpenMPDeclareTargetConstruct>{}) ||
+            construct<OpenMPDeclarativeConstruct>(
+                Parser<OpenMPDeclarativeAllocate>{}) ||
+            construct<OpenMPDeclarativeConstruct>(
+                Parser<OpenMPRequiresConstruct>{}) ||
+            construct<OpenMPDeclarativeConstruct>(
+                Parser<OpenMPThreadprivate>{})) /
+            endOmpLine))
 
 // Block Construct
 TYPE_PARSER(construct<OpenMPBlockConstruct>(
@@ -681,17 +683,18 @@ TYPE_PARSER(construct<OpenMPSectionsConstruct>(
 
 TYPE_CONTEXT_PARSER("OpenMP construct"_en_US,
     startOmpLine >>
-        first(construct<OpenMPConstruct>(Parser<OpenMPSectionsConstruct>{}),
-            construct<OpenMPConstruct>(Parser<OpenMPLoopConstruct>{}),
-            construct<OpenMPConstruct>(Parser<OpenMPBlockConstruct>{}),
-            // OpenMPBlockConstruct is attempted before
-            // OpenMPStandaloneConstruct to resolve !$OMP ORDERED
-            construct<OpenMPConstruct>(Parser<OpenMPStandaloneConstruct>{}),
-            construct<OpenMPConstruct>(Parser<OpenMPAtomicConstruct>{}),
-            construct<OpenMPConstruct>(Parser<OpenMPExecutableAllocate>{}),
-            construct<OpenMPConstruct>(Parser<OpenMPAllocatorsConstruct>{}),
-            construct<OpenMPConstruct>(Parser<OpenMPDeclarativeAllocate>{}),
-            construct<OpenMPConstruct>(Parser<OpenMPCriticalConstruct>{})))
+        withMessage("expected OpenMP construct"_err_en_US,
+            first(construct<OpenMPConstruct>(Parser<OpenMPSectionsConstruct>{}),
+                construct<OpenMPConstruct>(Parser<OpenMPLoopConstruct>{}),
+                construct<OpenMPConstruct>(Parser<OpenMPBlockConstruct>{}),
+                // OpenMPBlockConstruct is attempted before
+                // OpenMPStandaloneConstruct to resolve !$OMP ORDERED
+                construct<OpenMPConstruct>(Parser<OpenMPStandaloneConstruct>{}),
+                construct<OpenMPConstruct>(Parser<OpenMPAtomicConstruct>{}),
+                construct<OpenMPConstruct>(Parser<OpenMPExecutableAllocate>{}),
+                construct<OpenMPConstruct>(Parser<OpenMPAllocatorsConstruct>{}),
+                construct<OpenMPConstruct>(Parser<OpenMPDeclarativeAllocate>{}),
+                construct<OpenMPConstruct>(Parser<OpenMPCriticalConstruct>{}))))
 
 // END OMP Block directives
 TYPE_PARSER(
diff --git a/flang/lib/Parser/program-parsers.cpp b/flang/lib/Parser/program-parsers.cpp
index e24559bf14f7..ff5e58ebc721 100644
--- a/flang/lib/Parser/program-parsers.cpp
+++ b/flang/lib/Parser/program-parsers.cpp
@@ -247,7 +247,8 @@ TYPE_CONTEXT_PARSER("module subprogram part"_en_US,
 //         separate-module-subprogram
 TYPE_PARSER(construct<ModuleSubprogram>(indirect(functionSubprogram)) ||
     construct<ModuleSubprogram>(indirect(subroutineSubprogram)) ||
-    construct<ModuleSubprogram>(indirect(Parser<SeparateModuleSubprogram>{})))
+    construct<ModuleSubprogram>(indirect(Parser<SeparateModuleSubprogram>{})) ||
+    construct<ModuleSubprogram>(indirect(compilerDirective)))
 
 // R1410 module-nature -> INTRINSIC | NON_INTRINSIC
 constexpr auto moduleNature{
diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index 94afcbb68b34..8f51ef5ebeba 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -914,7 +914,7 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
     }
     if (!common::AreCompatibleCUDADataAttrs(dummyDataAttr, actualDataAttr,
             dummy.ignoreTKR,
-            /*allowUnifiedMatchingRule=*/true)) {
+            /*allowUnifiedMatchingRule=*/true, &context.languageFeatures())) {
       auto toStr{[](std::optional<common::CUDADataAttr> x) {
         return x ? "ATTRIBUTES("s +
                 parser::ToUpperCaseLetters(common::EnumToString(*x)) + ")"s
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index c1d9538e557f..ce7870b8d54e 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -87,6 +87,7 @@ private:
   bool CheckDefinedAssignmentArg(const Symbol &, const DummyArgument &, int);
   void CheckSpecifics(const Symbol &, const GenericDetails &);
   void CheckEquivalenceSet(const EquivalenceSet &);
+  void CheckEquivalenceObject(const EquivalenceObject &);
   void CheckBlockData(const Scope &);
   void CheckGenericOps(const Scope &);
   bool CheckConflicting(const Symbol &, Attr, Attr);
@@ -115,11 +116,16 @@ private:
     }
     return msg;
   }
+  bool InModuleFile() const {
+    return FindModuleFileContaining(context_.FindScope(messages_.at())) !=
+        nullptr;
+  }
   template <typename... A> parser::Message *WarnIfNotInModuleFile(A &&...x) {
-    if (FindModuleFileContaining(context_.FindScope(messages_.at()))) {
+    if (InModuleFile()) {
       return nullptr;
+    } else {
+      return messages_.Say(std::forward<A>(x)...);
     }
-    return messages_.Say(std::forward<A>(x)...);
   }
   template <typename... A>
   parser::Message *WarnIfNotInModuleFile(parser::CharBlock source, A &&...x) {
@@ -132,6 +138,7 @@ private:
   void CheckGlobalName(const Symbol &);
   void CheckProcedureAssemblyName(const Symbol &symbol);
   void CheckExplicitSave(const Symbol &);
+  parser::Messages WhyNotInteroperableDerivedType(const Symbol &, bool isError);
   void CheckBindC(const Symbol &);
   void CheckBindCFunctionResult(const Symbol &);
   // Check functions for defined I/O procedures
@@ -182,6 +189,8 @@ private:
   // Collection of target dependent assembly names of external and BIND(C)
   // procedures.
   std::map<std::string, SymbolRef> procedureAssemblyNames_;
+  // Derived types that have been examined by WhyNotInteroperableDerivedType
+  UnorderedSymbolSet examinedByWhyNotInteroperableDerivedType_;
 };
 
 class DistinguishabilityHelper {
@@ -1352,7 +1361,7 @@ void CheckHelper::CheckSubprogram(
     SubprogramMatchHelper{*this}.Check(symbol, *iface);
   }
   if (const Scope *entryScope{details.entryScope()}) {
-    // ENTRY 15.6.2.6, esp. C1571
+    // ENTRY F'2023 15.6.2.6
     std::optional<parser::MessageFixedText> error;
     const Symbol *subprogram{entryScope->symbol()};
     const SubprogramDetails *subprogramDetails{nullptr};
@@ -1384,6 +1393,27 @@ void CheckHelper::CheckSubprogram(
       }
     }
   }
+  if (details.isFunction() &&
+      details.result().name() != symbol.name()) { // F'2023 C1569 & C1583
+    if (auto iter{symbol.owner().find(details.result().name())};
+        iter != symbol.owner().end()) {
+      const Symbol &resNameSym{*iter->second};
+      if (const auto *resNameSubp{resNameSym.detailsIf<SubprogramDetails>()}) {
+        if (const Scope * resNameEntryScope{resNameSubp->entryScope()}) {
+          const Scope *myScope{
+              details.entryScope() ? details.entryScope() : symbol.scope()};
+          if (resNameEntryScope == myScope) {
+            if (auto *msg{messages_.Say(symbol.name(),
+                    "Explicit RESULT('%s') of function '%s' cannot have the same name as a distinct ENTRY into the same scope"_err_en_US,
+                    details.result().name(), symbol.name())}) {
+              msg->Attach(
+                  resNameSym.name(), "ENTRY with conflicting name"_en_US);
+            }
+          }
+        }
+      }
+    }
+  }
   if (const MaybeExpr & stmtFunction{details.stmtFunction()}) {
     if (auto msg{evaluate::CheckStatementFunction(
             symbol, *stmtFunction, context_.foldingContext())}) {
@@ -1440,6 +1470,13 @@ void CheckHelper::CheckSubprogram(
     messages_.Say(symbol.name(),
         "A function may not have ATTRIBUTES(GLOBAL) or ATTRIBUTES(GRID_GLOBAL)"_err_en_US);
   }
+  if (cudaAttrs &&
+      (*cudaAttrs == common::CUDASubprogramAttrs::Global ||
+          *cudaAttrs == common::CUDASubprogramAttrs::Grid_Global) &&
+      symbol.attrs().HasAny({Attr::RECURSIVE, Attr::PURE, Attr::ELEMENTAL})) {
+    messages_.Say(symbol.name(),
+        "A kernel subprogram may not be RECURSIVE, PURE, or ELEMENTAL"_err_en_US);
+  }
   if (cudaAttrs && *cudaAttrs != common::CUDASubprogramAttrs::Host) {
     // CUDA device subprogram checks
     if (ClassifyProcedure(symbol) == ProcedureDefinitionClass::Internal) {
@@ -2558,14 +2595,77 @@ void CheckHelper::CheckEquivalenceSet(const EquivalenceSet &set) {
       }
     }
   }
-  // TODO: Move C8106 (&al.) checks here from resolve-names-utils.cpp
   for (const EquivalenceObject &object : set) {
-    if (object.symbol.test(Symbol::Flag::CrayPointee)) {
+    CheckEquivalenceObject(object);
+  }
+}
+
+static bool InCommonWithBind(const Symbol &symbol) {
+  if (const auto *details{symbol.detailsIf<ObjectEntityDetails>()}) {
+    const Symbol *commonBlock{details->commonBlock()};
+    return commonBlock && commonBlock->attrs().test(Attr::BIND_C);
+  } else {
+    return false;
+  }
+}
+
+void CheckHelper::CheckEquivalenceObject(const EquivalenceObject &object) {
+  parser::MessageFixedText msg;
+  const Symbol &symbol{object.symbol};
+  if (symbol.owner().IsDerivedType()) {
+    msg =
+        "Derived type component '%s' is not allowed in an equivalence set"_err_en_US;
+  } else if (IsDummy(symbol)) {
+    msg = "Dummy argument '%s' is not allowed in an equivalence set"_err_en_US;
+  } else if (symbol.IsFuncResult()) {
+    msg = "Function result '%s' is not allow in an equivalence set"_err_en_US;
+  } else if (IsPointer(symbol)) {
+    msg = "Pointer '%s' is not allowed in an equivalence set"_err_en_US;
+  } else if (IsAllocatable(symbol)) {
+    msg =
+        "Allocatable variable '%s' is not allowed in an equivalence set"_err_en_US;
+  } else if (symbol.Corank() > 0) {
+    msg = "Coarray '%s' is not allowed in an equivalence set"_err_en_US;
+  } else if (symbol.has<UseDetails>()) {
+    msg =
+        "Use-associated variable '%s' is not allowed in an equivalence set"_err_en_US;
+  } else if (symbol.attrs().test(Attr::BIND_C)) {
+    msg =
+        "Variable '%s' with BIND attribute is not allowed in an equivalence set"_err_en_US;
+  } else if (symbol.attrs().test(Attr::TARGET)) {
+    msg =
+        "Variable '%s' with TARGET attribute is not allowed in an equivalence set"_err_en_US;
+  } else if (IsNamedConstant(symbol)) {
+    msg = "Named constant '%s' is not allowed in an equivalence set"_err_en_US;
+  } else if (InCommonWithBind(symbol)) {
+    msg =
+        "Variable '%s' in common block with BIND attribute is not allowed in an equivalence set"_err_en_US;
+  } else if (!symbol.has<ObjectEntityDetails>()) {
+    msg = "'%s' in equivalence set is not a data object"_err_en_US;
+  } else if (const auto *type{symbol.GetType()}) {
+    const auto *derived{type->AsDerived()};
+    if (derived && !derived->IsVectorType()) {
+      if (const auto *comp{
+              FindUltimateComponent(*derived, IsAllocatableOrPointer)}) {
+        msg = IsPointer(*comp)
+            ? "Derived type object '%s' with pointer ultimate component is not allowed in an equivalence set"_err_en_US
+            : "Derived type object '%s' with allocatable ultimate component is not allowed in an equivalence set"_err_en_US;
+      } else if (!derived->typeSymbol().get<DerivedTypeDetails>().sequence()) {
+        msg =
+            "Nonsequence derived type object '%s' is not allowed in an equivalence set"_err_en_US;
+      }
+    } else if (IsAutomatic(symbol)) {
+      msg =
+          "Automatic object '%s' is not allowed in an equivalence set"_err_en_US;
+    } else if (symbol.test(Symbol::Flag::CrayPointee)) {
       messages_.Say(object.symbol.name(),
           "Cray pointee '%s' may not be a member of an EQUIVALENCE group"_err_en_US,
           object.symbol.name());
     }
   }
+  if (!msg.text().empty()) {
+    context_.Say(object.source, std::move(msg), symbol.name());
+  }
 }
 
 void CheckHelper::CheckBlockData(const Scope &scope) {
@@ -2758,11 +2858,129 @@ void CheckHelper::CheckProcedureAssemblyName(const Symbol &symbol) {
   }
 }
 
+parser::Messages CheckHelper::WhyNotInteroperableDerivedType(
+    const Symbol &symbol, bool isError) {
+  parser::Messages msgs;
+  if (examinedByWhyNotInteroperableDerivedType_.find(symbol) !=
+      examinedByWhyNotInteroperableDerivedType_.end()) {
+    return msgs;
+  }
+  isError |= symbol.attrs().test(Attr::BIND_C);
+  examinedByWhyNotInteroperableDerivedType_.insert(symbol);
+  if (const auto *derived{symbol.detailsIf<DerivedTypeDetails>()}) {
+    if (derived->sequence()) { // C1801
+      msgs.Say(symbol.name(),
+          "An interoperable derived type cannot have the SEQUENCE attribute"_err_en_US);
+    } else if (!derived->paramDecls().empty()) { // C1802
+      msgs.Say(symbol.name(),
+          "An interoperable derived type cannot have a type parameter"_err_en_US);
+    } else if (const auto *parent{
+                   symbol.scope()->GetDerivedTypeParent()}) { // C1803
+      if (isError) {
+        msgs.Say(symbol.name(),
+            "A derived type with the BIND attribute cannot be an extended derived type"_err_en_US);
+      } else {
+        bool interoperableParent{true};
+        if (parent->symbol()) {
+          auto bad{WhyNotInteroperableDerivedType(*parent->symbol(), false)};
+          if (bad.AnyFatalError()) {
+            auto &msg{msgs.Say(symbol.name(),
+                "The parent of an interoperable type is not interoperable"_err_en_US)};
+            bad.AttachTo(msg, parser::Severity::None);
+            interoperableParent = false;
+          }
+        }
+        if (interoperableParent) {
+          msgs.Say(symbol.name(),
+              "An interoperable type should not be an extended derived type"_warn_en_US);
+        }
+      }
+    }
+    const Symbol *parentComponent{symbol.scope()
+            ? derived->GetParentComponent(*symbol.scope())
+            : nullptr};
+    for (const auto &pair : *symbol.scope()) {
+      const Symbol &component{*pair.second};
+      if (&component == parentComponent) {
+        continue; // was checked above
+      }
+      if (IsProcedure(component)) { // C1804
+        msgs.Say(component.name(),
+            "An interoperable derived type cannot have a type bound procedure"_err_en_US);
+      } else if (IsAllocatableOrPointer(component)) { // C1806
+        msgs.Say(component.name(),
+            "An interoperable derived type cannot have a pointer or allocatable component"_err_en_US);
+      } else if (const auto *type{component.GetType()}) {
+        if (const auto *derived{type->AsDerived()}) {
+          auto bad{
+              WhyNotInteroperableDerivedType(derived->typeSymbol(), isError)};
+          if (bad.AnyFatalError()) {
+            auto &msg{msgs.Say(component.name(),
+                "Component '%s' of an interoperable derived type must have an interoperable type but does not"_err_en_US,
+                component.name())};
+            bad.AttachTo(msg, parser::Severity::None);
+          } else if (!derived->typeSymbol().GetUltimate().attrs().test(
+                         Attr::BIND_C)) {
+            auto &msg{
+                msgs.Say(component.name(),
+                        "Derived type of component '%s' of an interoperable derived type should have the BIND attribute"_warn_en_US,
+                        component.name())
+                    .Attach(derived->typeSymbol().name(),
+                        "Non-BIND(C) component type"_en_US)};
+            bad.AttachTo(msg, parser::Severity::None);
+          } else {
+            msgs.Annex(std::move(bad));
+          }
+        } else if (!IsInteroperableIntrinsicType(
+                       *type, context_.languageFeatures())) {
+          auto maybeDyType{evaluate::DynamicType::From(*type)};
+          if (type->category() == DeclTypeSpec::Logical) {
+            if (context_.ShouldWarn(common::UsageWarning::LogicalVsCBool)) {
+              msgs.Say(component.name(),
+                  "A LOGICAL component of an interoperable type should have the interoperable KIND=C_BOOL"_port_en_US);
+            }
+          } else if (type->category() == DeclTypeSpec::Character &&
+              maybeDyType && maybeDyType->kind() == 1) {
+            if (context_.ShouldWarn(common::UsageWarning::BindCCharLength)) {
+              msgs.Say(component.name(),
+                  "A CHARACTER component of an interoperable type should have length 1"_port_en_US);
+            }
+          } else {
+            msgs.Say(component.name(),
+                "Each component of an interoperable derived type must have an interoperable type"_err_en_US);
+          }
+        }
+      }
+      if (auto extents{
+              evaluate::GetConstantExtents(foldingContext_, &component)};
+          extents && evaluate::GetSize(*extents) == 0) {
+        msgs.Say(component.name(),
+            "An array component of an interoperable type must have at least one element"_err_en_US);
+      }
+    }
+    if (derived->componentNames().empty()) { // F'2023 C1805
+      if (context_.ShouldWarn(common::LanguageFeature::EmptyBindCDerivedType)) {
+        msgs.Say(symbol.name(),
+            "A derived type with the BIND attribute should not be empty"_port_en_US);
+      }
+    }
+  }
+  if (isError) {
+    for (auto &m : msgs.messages()) {
+      if (!m.IsFatal()) {
+        m.set_severity(parser::Severity::Error);
+      }
+    }
+  }
+  return msgs;
+}
+
 void CheckHelper::CheckBindC(const Symbol &symbol) {
   bool isExplicitBindC{symbol.attrs().test(Attr::BIND_C)};
   if (isExplicitBindC) {
-    CheckConflicting(symbol, Attr::BIND_C, Attr::PARAMETER);
     CheckConflicting(symbol, Attr::BIND_C, Attr::ELEMENTAL);
+    CheckConflicting(symbol, Attr::BIND_C, Attr::INTRINSIC);
+    CheckConflicting(symbol, Attr::BIND_C, Attr::PARAMETER);
   } else {
     // symbol must be interoperable (e.g., dummy argument of interoperable
     // procedure interface) but is not itself BIND(C).
@@ -2832,13 +3050,30 @@ void CheckHelper::CheckBindC(const Symbol &symbol) {
     }
     if (const auto *type{symbol.GetType()}) {
       const auto *derived{type->AsDerived()};
-      if (derived && !derived->typeSymbol().attrs().test(Attr::BIND_C)) {
-        if (auto *msg{messages_.Say(symbol.name(),
-                "The derived type of a BIND(C) object must also be BIND(C)"_err_en_US)}) {
-          msg->Attach(
-              derived->typeSymbol().name(), "Non-interoperable type"_en_US);
+      if (derived) {
+        if (derived->typeSymbol().attrs().test(Attr::BIND_C)) {
+        } else if (isExplicitBindC) {
+          if (auto *msg{messages_.Say(symbol.name(),
+                  "The derived type of a BIND(C) object must also be BIND(C)"_err_en_US)}) {
+            msg->Attach(derived->typeSymbol().name(), "Non-BIND(C) type"_en_US);
+          }
+          context_.SetError(symbol);
+        } else if (auto bad{WhyNotInteroperableDerivedType(
+                       derived->typeSymbol(), false)};
+                   !bad.empty()) {
+          if (auto *msg{messages_.Say(symbol.name(),
+                  "The derived type of an interoperable object must be interoperable, but is not"_err_en_US)}) {
+            msg->Attach(
+                derived->typeSymbol().name(), "Non-interoperable type"_en_US);
+            bad.AttachTo(*msg, parser::Severity::None);
+          }
+          context_.SetError(symbol);
+        } else {
+          if (auto *msg{messages_.Say(symbol.name(),
+                  "The derived type of an interoperable object should be BIND(C)"_warn_en_US)}) {
+            msg->Attach(derived->typeSymbol().name(), "Non-BIND(C) type"_en_US);
+          }
         }
-        context_.SetError(symbol);
       }
       if (type->IsAssumedType() || IsAssumedLengthCharacter(symbol)) {
         // ok
@@ -2881,17 +3116,20 @@ void CheckHelper::CheckBindC(const Symbol &symbol) {
           "An interoperable pointer must not be CONTIGUOUS"_err_en_US);
     }
   } else if (const auto *proc{symbol.detailsIf<ProcEntityDetails>()}) {
-    if (!proc->procInterface() ||
-        !proc->procInterface()->attrs().test(Attr::BIND_C)) {
-      if (proc->isDummy()) {
-        messages_.Say(symbol.name(),
-            "A dummy procedure to an interoperable procedure must also be interoperable"_err_en_US);
-        context_.SetError(symbol);
-      } else {
-        messages_.Say(symbol.name(),
-            "An interface name with BIND attribute must be specified if the BIND attribute is specified in a procedure declaration statement"_err_en_US);
-        context_.SetError(symbol);
+    if (!IsBindCProcedure(symbol) && proc->isDummy()) {
+      messages_.Say(symbol.name(),
+          "A dummy procedure to an interoperable procedure must also be interoperable"_err_en_US);
+      context_.SetError(symbol);
+    } else if (!proc->procInterface()) {
+      if (context_.ShouldWarn(
+              common::LanguageFeature::NonBindCInteroperability)) {
+        WarnIfNotInModuleFile(symbol.name(),
+            "An interface name with BIND attribute should be specified if the BIND attribute is specified in a procedure declaration statement"_warn_en_US);
       }
+    } else if (!proc->procInterface()->attrs().test(Attr::BIND_C)) {
+      messages_.Say(symbol.name(),
+          "An interface name with BIND attribute must be specified if the BIND attribute is specified in a procedure declaration statement"_err_en_US);
+      context_.SetError(symbol);
     }
   } else if (const auto *subp{symbol.detailsIf<SubprogramDetails>()}) {
     for (const Symbol *dummy : subp->dummyArgs()) {
@@ -2903,77 +3141,18 @@ void CheckHelper::CheckBindC(const Symbol &symbol) {
         context_.SetError(symbol);
       }
     }
-  } else if (const auto *derived{symbol.detailsIf<DerivedTypeDetails>()}) {
-    if (derived->sequence()) { // C1801
-      messages_.Say(symbol.name(),
-          "A derived type with the BIND attribute cannot have the SEQUENCE attribute"_err_en_US);
-      context_.SetError(symbol);
-    } else if (!derived->paramDecls().empty()) { // C1802
-      messages_.Say(symbol.name(),
-          "A derived type with the BIND attribute has type parameter(s)"_err_en_US);
-      context_.SetError(symbol);
-    } else if (symbol.scope()->GetDerivedTypeParent()) { // C1803
-      messages_.Say(symbol.name(),
-          "A derived type with the BIND attribute cannot extend from another derived type"_err_en_US);
-      context_.SetError(symbol);
-    } else {
-      for (const auto &pair : *symbol.scope()) {
-        const Symbol *component{&*pair.second};
-        if (IsProcedure(*component)) { // C1804
-          messages_.Say(component->name(),
-              "A derived type with the BIND attribute cannot have a type bound procedure"_err_en_US);
-          context_.SetError(symbol);
-        }
-        if (IsAllocatableOrPointer(*component)) { // C1806
-          messages_.Say(component->name(),
-              "A derived type with the BIND attribute cannot have a pointer or allocatable component"_err_en_US);
-          context_.SetError(symbol);
-        }
-        if (const auto *type{component->GetType()}) {
-          if (const auto *derived{type->AsDerived()}) {
-            if (!derived->typeSymbol().attrs().test(Attr::BIND_C)) {
-              if (auto *msg{messages_.Say(component->name(),
-                      "Component '%s' of an interoperable derived type must have the BIND attribute"_err_en_US,
-                      component->name())}) {
-                msg->Attach(derived->typeSymbol().name(),
-                    "Non-interoperable component type"_en_US);
-              }
-              context_.SetError(symbol);
-            }
-          } else if (!IsInteroperableIntrinsicType(
-                         *type, context_.languageFeatures())) {
-            auto maybeDyType{evaluate::DynamicType::From(*type)};
-            if (type->category() == DeclTypeSpec::Logical) {
-              if (context_.ShouldWarn(common::UsageWarning::LogicalVsCBool)) {
-                WarnIfNotInModuleFile(component->name(),
-                    "A LOGICAL component of a BIND(C) type should have the interoperable KIND=C_BOOL"_port_en_US);
-              }
-            } else if (type->category() == DeclTypeSpec::Character &&
-                maybeDyType && maybeDyType->kind() == 1) {
-              if (context_.ShouldWarn(common::UsageWarning::BindCCharLength)) {
-                WarnIfNotInModuleFile(component->name(),
-                    "A CHARACTER component of a BIND(C) type should have length 1"_port_en_US);
-              }
-            } else {
-              messages_.Say(component->name(),
-                  "Each component of an interoperable derived type must have an interoperable type"_err_en_US);
-              context_.SetError(symbol);
-            }
-          }
-        }
-        if (auto extents{
-                evaluate::GetConstantExtents(foldingContext_, component)};
-            extents && evaluate::GetSize(*extents) == 0) {
-          messages_.Say(component->name(),
-              "An array component of an interoperable type must have at least one element"_err_en_US);
-          context_.SetError(symbol);
-        }
+  } else if (symbol.has<DerivedTypeDetails>()) {
+    if (auto msgs{WhyNotInteroperableDerivedType(symbol, false)};
+        !msgs.empty()) {
+      bool anyFatal{msgs.AnyFatalError()};
+      if (msgs.AnyFatalError() ||
+          (!InModuleFile() &&
+              context_.ShouldWarn(
+                  common::LanguageFeature::NonBindCInteroperability))) {
+        context_.messages().Annex(std::move(msgs));
       }
-    }
-    if (derived->componentNames().empty()) { // F'2023 C1805
-      if (context_.ShouldWarn(common::LanguageFeature::EmptyBindCDerivedType)) {
-        WarnIfNotInModuleFile(symbol.name(),
-            "A derived type with the BIND attribute is empty"_port_en_US);
+      if (anyFatal) {
+        context_.SetError(symbol);
       }
     }
   }
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index ab76fe59911b..2493eb3ed367 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -84,48 +84,70 @@ private:
   parser::CharBlock source_;
 };
 
-class OmpCycleChecker {
+class OmpCycleAndExitChecker {
 public:
-  OmpCycleChecker(SemanticsContext &context, std::int64_t cycleLevel)
-      : context_{context}, cycleLevel_{cycleLevel} {}
+  OmpCycleAndExitChecker(SemanticsContext &context, std::int64_t level)
+      : context_{context}, level_{level} {}
 
   template <typename T> bool Pre(const T &) { return true; }
   template <typename T> void Post(const T &) {}
 
   bool Pre(const parser::DoConstruct &dc) {
-    cycleLevel_--;
-    const auto &labelName{std::get<0>(std::get<0>(dc.t).statement.t)};
-    if (labelName) {
-      labelNamesandLevels_.emplace(labelName.value().ToString(), cycleLevel_);
+    level_--;
+    const auto &constructName{std::get<0>(std::get<0>(dc.t).statement.t)};
+    if (constructName) {
+      constructNamesAndLevels_.emplace(
+          constructName.value().ToString(), level_);
     }
     return true;
   }
 
+  void Post(const parser::DoConstruct &dc) { level_++; }
+
   bool Pre(const parser::CycleStmt &cyclestmt) {
     std::map<std::string, std::int64_t>::iterator it;
     bool err{false};
     if (cyclestmt.v) {
-      it = labelNamesandLevels_.find(cyclestmt.v->source.ToString());
-      err = (it != labelNamesandLevels_.end() && it->second > 0);
+      it = constructNamesAndLevels_.find(cyclestmt.v->source.ToString());
+      err = (it != constructNamesAndLevels_.end() && it->second > 0);
+    } else { // If there is no label then use the level of the last enclosing DO
+      err = level_ > 0;
     }
-    if (cycleLevel_ > 0 || err) {
-      context_.Say(*cycleSource_,
+    if (err) {
+      context_.Say(*source_,
           "CYCLE statement to non-innermost associated loop of an OpenMP DO "
           "construct"_err_en_US);
     }
     return true;
   }
 
+  bool Pre(const parser::ExitStmt &exitStmt) {
+    std::map<std::string, std::int64_t>::iterator it;
+    bool err{false};
+    if (exitStmt.v) {
+      it = constructNamesAndLevels_.find(exitStmt.v->source.ToString());
+      err = (it != constructNamesAndLevels_.end() && it->second >= 0);
+    } else { // If there is no label then use the level of the last enclosing DO
+      err = level_ >= 0;
+    }
+    if (err) {
+      context_.Say(*source_,
+          "EXIT statement terminates associated loop of an OpenMP DO "
+          "construct"_err_en_US);
+    }
+    return true;
+  }
+
   bool Pre(const parser::Statement<parser::ActionStmt> &actionstmt) {
-    cycleSource_ = &actionstmt.source;
+    source_ = &actionstmt.source;
     return true;
   }
 
 private:
   SemanticsContext &context_;
-  const parser::CharBlock *cycleSource_;
-  std::int64_t cycleLevel_;
-  std::map<std::string, std::int64_t> labelNamesandLevels_;
+  const parser::CharBlock *source_;
+  std::int64_t level_;
+  std::map<std::string, std::int64_t> constructNamesAndLevels_;
 };
 
 bool OmpStructureChecker::IsCloselyNestedRegion(const OmpDirectiveSet &set) {
@@ -652,8 +674,8 @@ std::int64_t OmpStructureChecker::GetOrdCollapseLevel(
 void OmpStructureChecker::CheckCycleConstraints(
     const parser::OpenMPLoopConstruct &x) {
   std::int64_t ordCollapseLevel{GetOrdCollapseLevel(x)};
-  OmpCycleChecker ompCycleChecker{context_, ordCollapseLevel};
-  parser::Walk(x, ompCycleChecker);
+  OmpCycleAndExitChecker checker{context_, ordCollapseLevel};
+  parser::Walk(x, checker);
 }
 
 void OmpStructureChecker::CheckDistLinear(
diff --git a/flang/lib/Semantics/compute-offsets.cpp b/flang/lib/Semantics/compute-offsets.cpp
index 2eb3a34ad806..d9a9576e9d67 100644
--- a/flang/lib/Semantics/compute-offsets.cpp
+++ b/flang/lib/Semantics/compute-offsets.cpp
@@ -277,20 +277,22 @@ std::size_t ComputeOffsetsHelper::ComputeOffset(
     const EquivalenceObject &object) {
   std::size_t offset{0};
   if (!object.subscripts.empty()) {
-    const ArraySpec &shape{object.symbol.get<ObjectEntityDetails>().shape()};
-    auto lbound{[&](std::size_t i) {
-      return *ToInt64(shape[i].lbound().GetExplicit());
-    }};
-    auto ubound{[&](std::size_t i) {
-      return *ToInt64(shape[i].ubound().GetExplicit());
-    }};
-    for (std::size_t i{object.subscripts.size() - 1};;) {
-      offset += object.subscripts[i] - lbound(i);
-      if (i == 0) {
-        break;
+    if (const auto *details{object.symbol.detailsIf<ObjectEntityDetails>()}) {
+      const ArraySpec &shape{details->shape()};
+      auto lbound{[&](std::size_t i) {
+        return *ToInt64(shape[i].lbound().GetExplicit());
+      }};
+      auto ubound{[&](std::size_t i) {
+        return *ToInt64(shape[i].ubound().GetExplicit());
+      }};
+      for (std::size_t i{object.subscripts.size() - 1};;) {
+        offset += object.subscripts[i] - lbound(i);
+        if (i == 0) {
+          break;
+        }
+        --i;
+        offset *= ubound(i) - lbound(i) + 1;
       }
-      --i;
-      offset *= ubound(i) - lbound(i) + 1;
     }
   }
   auto result{offset * GetSizeAndAlignment(object.symbol, false).size};
diff --git a/flang/lib/Semantics/data-to-inits.cpp b/flang/lib/Semantics/data-to-inits.cpp
index 64050874bcde..605a9f10712e 100644
--- a/flang/lib/Semantics/data-to-inits.cpp
+++ b/flang/lib/Semantics/data-to-inits.cpp
@@ -903,7 +903,13 @@ void ConstructInitializer(const Symbol &symbol,
       if (const auto *procDesignator{
               std::get_if<evaluate::ProcedureDesignator>(&expr->u)}) {
         CHECK(!procDesignator->GetComponent());
-        mutableProc.set_init(DEREF(procDesignator->GetSymbol()));
+        if (const auto *intrin{procDesignator->GetSpecificIntrinsic()}) {
+          const Symbol *intrinSymbol{
+              symbol.owner().FindSymbol(SourceName{intrin->name})};
+          mutableProc.set_init(DEREF(intrinSymbol));
+        } else {
+          mutableProc.set_init(DEREF(procDesignator->GetSymbol()));
+        }
       } else {
         CHECK(evaluate::IsNullProcedurePointer(*expr));
         mutableProc.set_init(nullptr);
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index f677973ca275..06e38da6626a 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -1805,10 +1805,13 @@ void ArrayConstructorContext::Add(const parser::AcImpliedDo &impliedDo) {
   const auto &bounds{std::get<parser::AcImpliedDoControl::Bounds>(control.t)};
   exprAnalyzer_.Analyze(bounds.name);
   parser::CharBlock name{bounds.name.thing.thing.source};
-  const Symbol *symbol{bounds.name.thing.thing.symbol};
   int kind{ImpliedDoIntType::kind};
-  if (const auto dynamicType{DynamicType::From(symbol)}) {
-    kind = dynamicType->kind();
+  if (const Symbol * symbol{bounds.name.thing.thing.symbol}) {
+    if (auto dynamicType{DynamicType::From(symbol)}) {
+      if (dynamicType->category() == TypeCategory::Integer) {
+        kind = dynamicType->kind();
+      }
+    }
   }
   std::optional<Expr<ImpliedDoIntType>> lower{
       GetSpecificIntExpr<ImpliedDoIntType::kind>(bounds.lower)};
@@ -2494,6 +2497,109 @@ static bool CheckCompatibleArguments(
   return true;
 }
 
+static constexpr int cudaInfMatchingValue{std::numeric_limits<int>::max()};
+
+// Compute the matching distance as described in section 3.2.3 of the CUDA
+// Fortran references.
+static int GetMatchingDistance(const common::LanguageFeatureControl &features,
+    const characteristics::DummyArgument &dummy,
+    const std::optional<ActualArgument> &actual) {
+  bool isCudaManaged{features.IsEnabled(common::LanguageFeature::CudaManaged)};
+  bool isCudaUnified{features.IsEnabled(common::LanguageFeature::CudaUnified)};
+  CHECK(!(isCudaUnified && isCudaManaged) && "expect only one enabled.");
+
+  std::optional<common::CUDADataAttr> actualDataAttr, dummyDataAttr;
+  if (actual) {
+    if (auto *expr{actual->UnwrapExpr()}) {
+      const auto *actualLastSymbol{evaluate::GetLastSymbol(*expr)};
+      if (actualLastSymbol) {
+        actualLastSymbol = &semantics::ResolveAssociations(*actualLastSymbol);
+        if (const auto *actualObject{actualLastSymbol
+                    ? actualLastSymbol
+                          ->detailsIf<semantics::ObjectEntityDetails>()
+                    : nullptr}) {
+          actualDataAttr = actualObject->cudaDataAttr();
+        }
+      }
+    }
+  }
+
+  common::visit(common::visitors{
+                    [&](const characteristics::DummyDataObject &object) {
+                      dummyDataAttr = object.cudaDataAttr;
+                    },
+                    [&](const auto &) {},
+                },
+      dummy.u);
+
+  if (!dummyDataAttr) {
+    if (!actualDataAttr) {
+      if (isCudaUnified || isCudaManaged) {
+        return 3;
+      }
+      return 0;
+    } else if (*actualDataAttr == common::CUDADataAttr::Device) {
+      return cudaInfMatchingValue;
+    } else if (*actualDataAttr == common::CUDADataAttr::Managed ||
+        *actualDataAttr == common::CUDADataAttr::Unified) {
+      return 3;
+    }
+  } else if (*dummyDataAttr == common::CUDADataAttr::Device) {
+    if (!actualDataAttr) {
+      if (isCudaUnified || isCudaManaged) {
+        return 2;
+      }
+      return cudaInfMatchingValue;
+    } else if (*actualDataAttr == common::CUDADataAttr::Device) {
+      return 0;
+    } else if (*actualDataAttr == common::CUDADataAttr::Managed ||
+        *actualDataAttr == common::CUDADataAttr::Unified) {
+      return 2;
+    }
+  } else if (*dummyDataAttr == common::CUDADataAttr::Managed) {
+    if (!actualDataAttr) {
+      return isCudaUnified ? 1 : isCudaManaged ? 0 : cudaInfMatchingValue;
+    }
+    if (*actualDataAttr == common::CUDADataAttr::Device) {
+      return cudaInfMatchingValue;
+    } else if (*actualDataAttr == common::CUDADataAttr::Managed) {
+      return 0;
+    } else if (*actualDataAttr == common::CUDADataAttr::Unified) {
+      return 1;
+    }
+  } else if (*dummyDataAttr == common::CUDADataAttr::Unified) {
+    if (!actualDataAttr) {
+      return isCudaUnified ? 0 : isCudaManaged ? 1 : cudaInfMatchingValue;
+    }
+    if (*actualDataAttr == common::CUDADataAttr::Device) {
+      return cudaInfMatchingValue;
+    } else if (*actualDataAttr == common::CUDADataAttr::Managed) {
+      return 1;
+    } else if (*actualDataAttr == common::CUDADataAttr::Unified) {
+      return 0;
+    }
+  }
+  return cudaInfMatchingValue;
+}
+
+static int ComputeCudaMatchingDistance(
+    const common::LanguageFeatureControl &features,
+    const characteristics::Procedure &procedure,
+    const ActualArguments &actuals) {
+  const auto &dummies{procedure.dummyArguments};
+  CHECK(dummies.size() == actuals.size());
+  int distance{0};
+  for (std::size_t i{0}; i < dummies.size(); ++i) {
+    const characteristics::DummyArgument &dummy{dummies[i]};
+    const std::optional<ActualArgument> &actual{actuals[i]};
+    int d{GetMatchingDistance(features, dummy, actual)};
+    if (d == cudaInfMatchingValue)
+      return d;
+    distance += d;
+  }
+  return distance;
+}
+
 // Handles a forward reference to a module function from what must
 // be a specification expression.  Return false if the symbol is
 // an invalid forward reference.
@@ -2541,6 +2647,7 @@ std::pair<const Symbol *, bool> ExpressionAnalyzer::ResolveGeneric(
   const Symbol *elemental{nullptr}; // matching elemental specific proc
   const Symbol *nonElemental{nullptr}; // matching non-elemental specific
   const Symbol &ultimate{symbol.GetUltimate()};
+  int crtMatchingDistance{cudaInfMatchingValue};
   // Check for a match with an explicit INTRINSIC
   if (ultimate.attrs().test(semantics::Attr::INTRINSIC)) {
     parser::Messages buffer;
@@ -2577,12 +2684,23 @@ std::pair<const Symbol *, bool> ExpressionAnalyzer::ResolveGeneric(
             CheckCompatibleArguments(*procedure, localActuals)) {
           if ((procedure->IsElemental() && elemental) ||
               (!procedure->IsElemental() && nonElemental)) {
-            // 16.9.144(6): a bare NULL() is not allowed as an actual
-            // argument to a generic procedure if the specific procedure
-            // cannot be unambiguously distinguished
-            // Underspecified external procedure actual arguments can
-            // also lead to ambiguity.
-            return {nullptr, true /* due to ambiguity */};
+            int d{ComputeCudaMatchingDistance(
+                context_.languageFeatures(), *procedure, localActuals)};
+            llvm::errs() << "matching distance: " << d << "\n";
+            if (d != crtMatchingDistance) {
+              if (d > crtMatchingDistance) {
+                continue;
+              }
+              // Matching distance is smaller than the previously matched
+              // specific. Let it go thourgh so the current procedure is picked.
+            } else {
+              // 16.9.144(6): a bare NULL() is not allowed as an actual
+              // argument to a generic procedure if the specific procedure
+              // cannot be unambiguously distinguished
+              // Underspecified external procedure actual arguments can
+              // also lead to ambiguity.
+              return {nullptr, true /* due to ambiguity */};
+            }
           }
           if (!procedure->IsElemental()) {
             // takes priority over elemental match
@@ -2590,6 +2708,8 @@ std::pair<const Symbol *, bool> ExpressionAnalyzer::ResolveGeneric(
           } else {
             elemental = &specific;
           }
+          crtMatchingDistance = ComputeCudaMatchingDistance(
+              context_.languageFeatures(), *procedure, localActuals);
         }
       }
     }
@@ -4210,7 +4330,9 @@ MaybeExpr ArgumentAnalyzer::TryDefinedOp(
     if (Symbol *symbol{scope.FindSymbol(oprName)}) {
       anyPossibilities = true;
       parser::Name name{symbol->name(), symbol};
-      result = context_.AnalyzeDefinedOp(name, GetActuals());
+      if (!fatalErrors_) {
+        result = context_.AnalyzeDefinedOp(name, GetActuals());
+      }
       if (result) {
         inaccessible = CheckAccessibleSymbol(scope, *symbol);
         if (inaccessible) {
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 318687508ff1..2add2056f658 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -633,6 +633,8 @@ public:
               [&](const auto &name) {},
           },
           ompObj.u);
+
+      ResolveOmpObject(ompObj, ompFlag);
     }
   }
 
@@ -1811,6 +1813,7 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPSectionsConstruct &x) {
   case llvm::omp::Directive::OMPD_parallel_sections:
   case llvm::omp::Directive::OMPD_sections:
     PushContext(beginDir.source, beginDir.v);
+    GetContext().withinConstruct = true;
     break;
   default:
     break;
@@ -1823,6 +1826,7 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPCriticalConstruct &x) {
   const auto &beginCriticalDir{std::get<parser::OmpCriticalDirective>(x.t)};
   const auto &endCriticalDir{std::get<parser::OmpEndCriticalDirective>(x.t)};
   PushContext(beginCriticalDir.source, llvm::omp::Directive::OMPD_critical);
+  GetContext().withinConstruct = true;
   if (const auto &criticalName{
           std::get<std::optional<parser::Name>>(beginCriticalDir.t)}) {
     ResolveOmpName(*criticalName, Symbol::Flag::OmpCriticalLock);
@@ -2026,34 +2030,108 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
       if (found->test(semantics::Symbol::Flag::OmpThreadprivate))
         return;
     }
-    std::vector<Symbol *> defaultDSASymbols;
+
+    // Implicitly determined DSAs
+    // OMP 5.2 5.1.1 - Variables Referenced in a Construct
+    Symbol *lastDeclSymbol = nullptr;
+    std::optional<Symbol::Flag> prevDSA;
     for (int dirDepth{0}; dirDepth < (int)dirContext_.size(); ++dirDepth) {
       DirContext &dirContext = dirContext_[dirDepth];
-      bool hasDataSharingAttr{false};
+      std::optional<Symbol::Flag> dsa;
+
       for (auto symMap : dirContext.objectWithDSA) {
         // if the `symbol` already has a data-sharing attribute
         if (symMap.first->name() == name.symbol->name()) {
-          hasDataSharingAttr = true;
+          dsa = symMap.second;
           break;
         }
       }
-      if (hasDataSharingAttr) {
-        if (defaultDSASymbols.size())
-          symbol = &MakeAssocSymbol(symbol->name(), *defaultDSASymbols.back(),
+
+      // When handling each implicit rule, either a new private symbol is
+      // declared or the last declared symbol is used.
+      // In the latter case, it's necessary to insert a new symbol in the scope
+      // being processed, associated with the last declared symbol.
+      // This captures the fact that, although we are using the last declared
+      // symbol, its DSA could be different in this scope.
+      // Also, because of how symbols are collected in lowering, not inserting
+      // a new symbol in this scope could lead to the conclusion that the
+      // symbol was declared in this construct, which would result in wrong
+      // privatization code being generated.
+      // Consider the following example:
+      //
+      // !$omp parallel default(private)              ! p1
+      //   !$omp parallel default(private) shared(x)  ! p2
+      //     x = 10
+      //   !$omp end parallel
+      // !$omp end parallel
+      //
+      // If a new x symbol was not inserted in the inner parallel construct
+      // (p2), it would use the x symbol definition from the enclosing scope.
+      // Then, when p2's default symbols were collected in lowering, the x
+      // symbol from the outer parallel construct (p1) would be collected, as
+      // it would have the private flag set (note that symbols that don't have
+      // any private flag are considered as shared).
+      // This would make x appear to be defined in p2, causing it to be
+      // privatized in p2 and its privatization in p1 to be skipped.
+      auto declNewSymbol = [&](Symbol::Flag flag) {
+        Symbol *hostSymbol =
+            lastDeclSymbol ? lastDeclSymbol : &symbol->GetUltimate();
+        lastDeclSymbol = DeclarePrivateAccessEntity(
+            *hostSymbol, flag, context_.FindScope(dirContext.directiveSource));
+        return lastDeclSymbol;
+      };
+      auto useLastDeclSymbol = [&]() {
+        if (lastDeclSymbol)
+          MakeAssocSymbol(symbol->name(), *lastDeclSymbol,
               context_.FindScope(dirContext.directiveSource));
+      };
+
+      if (dsa.has_value()) {
+        useLastDeclSymbol();
+        prevDSA = dsa;
         continue;
       }
 
-      if (dirContext.defaultDSA == semantics::Symbol::Flag::OmpPrivate ||
-          dirContext.defaultDSA == semantics::Symbol::Flag::OmpFirstPrivate) {
-        Symbol *hostSymbol = defaultDSASymbols.size() ? defaultDSASymbols.back()
-                                                      : &symbol->GetUltimate();
-        defaultDSASymbols.push_back(
-            DeclarePrivateAccessEntity(*hostSymbol, dirContext.defaultDSA,
-                context_.FindScope(dirContext.directiveSource)));
-      } else if (defaultDSASymbols.size())
-        symbol = &MakeAssocSymbol(symbol->name(), *defaultDSASymbols.back(),
-            context_.FindScope(dirContext.directiveSource));
+      bool taskGenDir = llvm::omp::taskGeneratingSet.test(dirContext.directive);
+      bool targetDir = llvm::omp::allTargetSet.test(dirContext.directive);
+      bool parallelDir = llvm::omp::allParallelSet.test(dirContext.directive);
+
+      if (dirContext.defaultDSA == Symbol::Flag::OmpPrivate ||
+          dirContext.defaultDSA == Symbol::Flag::OmpFirstPrivate ||
+          dirContext.defaultDSA == Symbol::Flag::OmpShared) {
+        // 1) default
+        // Allowed only with parallel, teams and task generating constructs.
+        assert(parallelDir || taskGenDir ||
+            llvm::omp::allTeamsSet.test(dirContext.directive));
+        if (dirContext.defaultDSA != Symbol::Flag::OmpShared)
+          declNewSymbol(dirContext.defaultDSA);
+        else
+          useLastDeclSymbol();
+        dsa = dirContext.defaultDSA;
+      } else if (parallelDir) {
+        // 2) parallel -> shared
+        useLastDeclSymbol();
+        dsa = Symbol::Flag::OmpShared;
+      } else if (!taskGenDir && !targetDir) {
+        // 3) enclosing context
+        useLastDeclSymbol();
+        dsa = prevDSA;
+      } else if (targetDir) {
+        // TODO 4) not mapped target variable -> firstprivate
+        dsa = prevDSA;
+      } else if (taskGenDir) {
+        // TODO 5) dummy arg in orphaned taskgen construct -> firstprivate
+        if (prevDSA == Symbol::Flag::OmpShared) {
+          // 6) shared in enclosing context -> shared
+          useLastDeclSymbol();
+          dsa = Symbol::Flag::OmpShared;
+        } else {
+          // 7) firstprivate
+          dsa = Symbol::Flag::OmpFirstPrivate;
+          declNewSymbol(*dsa)->set(Symbol::Flag::OmpImplicit);
+        }
+      }
+      prevDSA = dsa;
     }
   } // within OpenMP construct
 }
diff --git a/flang/lib/Semantics/resolve-names-utils.cpp b/flang/lib/Semantics/resolve-names-utils.cpp
index 801473876e7e..3ca460b8e46a 100644
--- a/flang/lib/Semantics/resolve-names-utils.cpp
+++ b/flang/lib/Semantics/resolve-names-utils.cpp
@@ -568,75 +568,9 @@ bool EquivalenceSets::CheckDataRef(
       x.u);
 }
 
-static bool InCommonWithBind(const Symbol &symbol) {
-  if (const auto *details{symbol.detailsIf<ObjectEntityDetails>()}) {
-    const Symbol *commonBlock{details->commonBlock()};
-    return commonBlock && commonBlock->attrs().test(Attr::BIND_C);
-  } else {
-    return false;
-  }
-}
-
-// If symbol can't be in equivalence set report error and return false;
 bool EquivalenceSets::CheckObject(const parser::Name &name) {
-  if (!name.symbol) {
-    return false; // an error has already occurred
-  }
   currObject_.symbol = name.symbol;
-  parser::MessageFixedText msg;
-  const Symbol &symbol{*name.symbol};
-  if (symbol.owner().IsDerivedType()) { // C8107
-    msg = "Derived type component '%s'"
-          " is not allowed in an equivalence set"_err_en_US;
-  } else if (IsDummy(symbol)) { // C8106
-    msg = "Dummy argument '%s' is not allowed in an equivalence set"_err_en_US;
-  } else if (symbol.IsFuncResult()) { // C8106
-    msg = "Function result '%s' is not allow in an equivalence set"_err_en_US;
-  } else if (IsPointer(symbol)) { // C8106
-    msg = "Pointer '%s' is not allowed in an equivalence set"_err_en_US;
-  } else if (IsAllocatable(symbol)) { // C8106
-    msg = "Allocatable variable '%s'"
-          " is not allowed in an equivalence set"_err_en_US;
-  } else if (symbol.Corank() > 0) { // C8106
-    msg = "Coarray '%s' is not allowed in an equivalence set"_err_en_US;
-  } else if (symbol.has<UseDetails>()) { // C8115
-    msg = "Use-associated variable '%s'"
-          " is not allowed in an equivalence set"_err_en_US;
-  } else if (symbol.attrs().test(Attr::BIND_C)) { // C8106
-    msg = "Variable '%s' with BIND attribute"
-          " is not allowed in an equivalence set"_err_en_US;
-  } else if (symbol.attrs().test(Attr::TARGET)) { // C8108
-    msg = "Variable '%s' with TARGET attribute"
-          " is not allowed in an equivalence set"_err_en_US;
-  } else if (IsNamedConstant(symbol)) { // C8106
-    msg = "Named constant '%s' is not allowed in an equivalence set"_err_en_US;
-  } else if (InCommonWithBind(symbol)) { // C8106
-    msg = "Variable '%s' in common block with BIND attribute"
-          " is not allowed in an equivalence set"_err_en_US;
-  } else if (const auto *type{symbol.GetType()}) {
-    const auto *derived{type->AsDerived()};
-    if (derived && !derived->IsVectorType()) {
-      if (const auto *comp{FindUltimateComponent(
-              *derived, IsAllocatableOrPointer)}) { // C8106
-        msg = IsPointer(*comp)
-            ? "Derived type object '%s' with pointer ultimate component"
-              " is not allowed in an equivalence set"_err_en_US
-            : "Derived type object '%s' with allocatable ultimate component"
-              " is not allowed in an equivalence set"_err_en_US;
-      } else if (!derived->typeSymbol().get<DerivedTypeDetails>().sequence()) {
-        msg = "Nonsequence derived type object '%s'"
-              " is not allowed in an equivalence set"_err_en_US;
-      }
-    } else if (IsAutomatic(symbol)) {
-      msg = "Automatic object '%s'"
-            " is not allowed in an equivalence set"_err_en_US;
-    }
-  }
-  if (!msg.text().empty()) {
-    context_.Say(name.source, std::move(msg), name.source);
-    return false;
-  }
-  return true;
+  return currObject_.symbol != nullptr;
 }
 
 bool EquivalenceSets::CheckArrayBound(const parser::Expr &bound) {
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 61394b0f41de..e2875081b732 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -4048,27 +4048,10 @@ void SubprogramVisitor::CreateEntry(
       attrs = extant->attrs();
     }
   }
-  bool badResultName{false};
   std::optional<SourceName> distinctResultName;
   if (suffix && suffix->resultName &&
       suffix->resultName->source != entryName.source) {
     distinctResultName = suffix->resultName->source;
-    const parser::Name &resultName{*suffix->resultName};
-    if (resultName.source == subprogram.name()) { // C1574
-      Say2(resultName.source,
-          "RESULT(%s) may not have the same name as the function"_err_en_US,
-          subprogram, "Containing function"_en_US);
-      badResultName = true;
-    } else if (const Symbol * extant{FindSymbol(outer, resultName)}) { // C1574
-      if (const auto *details{extant->detailsIf<SubprogramDetails>()}) {
-        if (details->entryScope() == &currScope()) {
-          Say2(resultName.source,
-              "RESULT(%s) may not have the same name as an ENTRY in the function"_err_en_US,
-              extant->name(), "Conflicting ENTRY"_en_US);
-          badResultName = true;
-        }
-      }
-    }
   }
   if (outer.IsModule() && !attrs.test(Attr::PRIVATE)) {
     attrs.set(Attr::PUBLIC);
@@ -4104,17 +4087,24 @@ void SubprogramVisitor::CreateEntry(
     EntityDetails resultDetails;
     resultDetails.set_funcResult(true);
     if (distinctResultName) {
-      if (!badResultName) {
-        // RESULT(x) can be the same explicitly-named RESULT(x) as
-        // the enclosing function or another ENTRY.
-        if (auto iter{currScope().find(suffix->resultName->source)};
-            iter != currScope().end()) {
-          result = &*iter->second;
-        }
-        if (!result) {
-          result = &MakeSymbol(
-              *distinctResultName, Attrs{}, std::move(resultDetails));
-        }
+      // An explicit RESULT() can also be an explicit RESULT()
+      // of the function or another ENTRY.
+      if (auto iter{currScope().find(suffix->resultName->source)};
+          iter != currScope().end()) {
+        result = &*iter->second;
+      }
+      if (!result) {
+        result =
+            &MakeSymbol(*distinctResultName, Attrs{}, std::move(resultDetails));
+      } else if (!result->has<EntityDetails>()) {
+        Say(*distinctResultName,
+            "ENTRY cannot have RESULT(%s) that is not a variable"_err_en_US,
+            *distinctResultName)
+            .Attach(result->name(), "Existing declaration of '%s'"_en_US,
+                result->name());
+        result = nullptr;
+      }
+      if (result) {
         Resolve(*suffix->resultName, *result);
       }
     } else {
@@ -4124,8 +4114,7 @@ void SubprogramVisitor::CreateEntry(
       entryDetails.set_result(*result);
     }
   }
-  if (subpFlag == Symbol::Flag::Subroutine ||
-      (distinctResultName && !badResultName)) {
+  if (subpFlag == Symbol::Flag::Subroutine || distinctResultName) {
     Symbol &assoc{MakeSymbol(entryName.source)};
     assoc.set_details(HostAssocDetails{*entrySymbol});
     assoc.set(Symbol::Flag::Subroutine);
@@ -5550,8 +5539,7 @@ void DeclarationVisitor::Post(const parser::TypeParamDefStmt &x) {
       SetType(name, *type);
       if (auto &init{
               std::get<std::optional<parser::ScalarIntConstantExpr>>(decl.t)}) {
-        if (auto maybeExpr{EvaluateNonPointerInitializer(
-                *symbol, *init, init->thing.thing.thing.value().source)}) {
+        if (auto maybeExpr{AnalyzeExpr(context(), *init)}) {
           if (auto *intExpr{std::get_if<SomeIntExpr>(&maybeExpr->u)}) {
             symbol->get<TypeParamDetails>().set_init(std::move(*intExpr));
           }
@@ -6556,6 +6544,7 @@ Symbol *DeclarationVisitor::DeclareStatementEntity(
       return nullptr;
     }
     name.symbol = nullptr;
+    // F'2023 19.4 p5 ambiguous rule about outer declarations
     declTypeSpec = prev->GetType();
   }
   Symbol &symbol{DeclareEntity<ObjectEntityDetails>(name, {})};
@@ -6574,9 +6563,7 @@ Symbol *DeclarationVisitor::DeclareStatementEntity(
   } else {
     ApplyImplicitRules(symbol);
   }
-  Symbol *result{Resolve(name, &symbol)};
-  AnalyzeExpr(context(), doVar); // enforce INTEGER type
-  return result;
+  return Resolve(name, &symbol);
 }
 
 // Set the type of an entity or report an error.
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index 2d0caff82eb2..99381918fc63 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -256,15 +256,17 @@ static const Symbol &FollowHostAssoc(const Symbol &symbol) {
 }
 
 bool IsHostAssociated(const Symbol &symbol, const Scope &scope) {
-  return DoesScopeContain(
-      &GetProgramUnitOrBlockConstructContaining(FollowHostAssoc(symbol)),
-      GetProgramUnitOrBlockConstructContaining(scope));
+  const Symbol &base{FollowHostAssoc(symbol)};
+  return base.owner().IsTopLevel() ||
+      DoesScopeContain(&GetProgramUnitOrBlockConstructContaining(base),
+          GetProgramUnitOrBlockConstructContaining(scope));
 }
 
 bool IsHostAssociatedIntoSubprogram(const Symbol &symbol, const Scope &scope) {
-  return DoesScopeContain(
-      &GetProgramUnitOrBlockConstructContaining(FollowHostAssoc(symbol)),
-      GetProgramUnitContaining(scope));
+  const Symbol &base{FollowHostAssoc(symbol)};
+  return base.owner().IsTopLevel() ||
+      DoesScopeContain(&GetProgramUnitOrBlockConstructContaining(base),
+          GetProgramUnitContaining(scope));
 }
 
 bool IsInStmtFunction(const Symbol &symbol) {
diff --git a/flang/runtime/extensions.cpp b/flang/runtime/extensions.cpp
index 4b110cc10c84..be3833db88b0 100644
--- a/flang/runtime/extensions.cpp
+++ b/flang/runtime/extensions.cpp
@@ -23,6 +23,12 @@
 #include <thread>
 
 #ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+
+#include <synchapi.h>
+
 inline void CtimeBuffer(char *buffer, size_t bufsize, const time_t cur_time,
     Fortran::runtime::Terminator terminator) {
   int error{ctime_s(buffer, bufsize, &cur_time)};
@@ -136,7 +142,11 @@ void RTNAME(Sleep)(std::int64_t seconds) {
   if (seconds < 1) {
     return;
   }
-  std::this_thread::sleep_for(std::chrono::seconds(seconds));
+#if _WIN32
+  Sleep(seconds * 1000);
+#else
+  sleep(seconds);
+#endif
 }
 
 // TODO: not supported on Windows
diff --git a/flang/runtime/product.cpp b/flang/runtime/product.cpp
index 4c3b8c33a12e..7fc0fcd3b107 100644
--- a/flang/runtime/product.cpp
+++ b/flang/runtime/product.cpp
@@ -107,7 +107,7 @@ CppTypeFor<TypeCategory::Integer, 16> RTDEF(ProductInteger16)(
 CppTypeFor<TypeCategory::Real, 4> RTDEF(ProductReal4)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return GetTotalReduction<TypeCategory::Real, 4>(x, source, line, dim, mask,
-      NonComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 8>>{x},
+      NonComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 4>>{x},
       "PRODUCT");
 }
 CppTypeFor<TypeCategory::Real, 8> RTDEF(ProductReal8)(const Descriptor &x,
@@ -137,7 +137,7 @@ void RTDEF(CppProductComplex4)(CppTypeFor<TypeCategory::Complex, 4> &result,
     const Descriptor &x, const char *source, int line, int dim,
     const Descriptor *mask) {
   result = GetTotalReduction<TypeCategory::Complex, 4>(x, source, line, dim,
-      mask, ComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 8>>{x},
+      mask, ComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 4>>{x},
       "PRODUCT");
 }
 void RTDEF(CppProductComplex8)(CppTypeFor<TypeCategory::Complex, 8> &result,
@@ -169,8 +169,8 @@ void RTDEF(CppProductComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
 void RTDEF(ProductDim)(Descriptor &result, const Descriptor &x, int dim,
     const char *source, int line, const Descriptor *mask) {
   TypedPartialNumericReduction<NonComplexProductAccumulator,
-      NonComplexProductAccumulator, ComplexProductAccumulator>(
-      result, x, dim, source, line, mask, "PRODUCT");
+      NonComplexProductAccumulator, ComplexProductAccumulator,
+      /*MIN_REAL_KIND=*/4>(result, x, dim, source, line, mask, "PRODUCT");
 }
 
 RT_EXT_API_GROUP_END
diff --git a/flang/runtime/reduction-templates.h b/flang/runtime/reduction-templates.h
index f8e6f6095509..d102e5642547 100644
--- a/flang/runtime/reduction-templates.h
+++ b/flang/runtime/reduction-templates.h
@@ -240,11 +240,10 @@ inline RT_API_ATTRS void PartialIntegerReduction(Descriptor &result,
       kind, terminator, result, x, dim, mask, terminator, intrinsic);
 }
 
-template <TypeCategory CAT, template <typename> class ACCUM>
+template <TypeCategory CAT, template <typename> class ACCUM, int MIN_KIND>
 struct PartialFloatingReductionHelper {
   template <int KIND> struct Functor {
-    static constexpr int Intermediate{
-        std::max(KIND, 8)}; // use at least "double" for intermediate results
+    static constexpr int Intermediate{std::max(KIND, MIN_KIND)};
     RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
         int dim, const Descriptor *mask, Terminator &terminator,
         const char *intrinsic) const {
@@ -260,7 +259,7 @@ struct PartialFloatingReductionHelper {
 
 template <template <typename> class INTEGER_ACCUM,
     template <typename> class REAL_ACCUM,
-    template <typename> class COMPLEX_ACCUM>
+    template <typename> class COMPLEX_ACCUM, int MIN_REAL_KIND>
 inline RT_API_ATTRS void TypedPartialNumericReduction(Descriptor &result,
     const Descriptor &x, int dim, const char *source, int line,
     const Descriptor *mask, const char *intrinsic) {
@@ -274,13 +273,13 @@ inline RT_API_ATTRS void TypedPartialNumericReduction(Descriptor &result,
     break;
   case TypeCategory::Real:
     ApplyFloatingPointKind<PartialFloatingReductionHelper<TypeCategory::Real,
-                               REAL_ACCUM>::template Functor,
+                               REAL_ACCUM, MIN_REAL_KIND>::template Functor,
         void>(catKind->second, terminator, result, x, dim, mask, terminator,
         intrinsic);
     break;
   case TypeCategory::Complex:
     ApplyFloatingPointKind<PartialFloatingReductionHelper<TypeCategory::Complex,
-                               COMPLEX_ACCUM>::template Functor,
+                               COMPLEX_ACCUM, MIN_REAL_KIND>::template Functor,
         void>(catKind->second, terminator, result, x, dim, mask, terminator,
         intrinsic);
     break;
diff --git a/flang/runtime/sum.cpp b/flang/runtime/sum.cpp
index d2495e3e956f..63d8c9029a0e 100644
--- a/flang/runtime/sum.cpp
+++ b/flang/runtime/sum.cpp
@@ -134,7 +134,7 @@ CppTypeFor<TypeCategory::Integer, 16> RTDEF(SumInteger16)(const Descriptor &x,
 CppTypeFor<TypeCategory::Real, 4> RTDEF(SumReal4)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return GetTotalReduction<TypeCategory::Real, 4>(
-      x, source, line, dim, mask, RealSumAccumulator<double>{x}, "SUM");
+      x, source, line, dim, mask, RealSumAccumulator<float>{x}, "SUM");
 }
 CppTypeFor<TypeCategory::Real, 8> RTDEF(SumReal8)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
@@ -160,7 +160,7 @@ void RTDEF(CppSumComplex4)(CppTypeFor<TypeCategory::Complex, 4> &result,
     const Descriptor &x, const char *source, int line, int dim,
     const Descriptor *mask) {
   result = GetTotalReduction<TypeCategory::Complex, 4>(
-      x, source, line, dim, mask, ComplexSumAccumulator<double>{x}, "SUM");
+      x, source, line, dim, mask, ComplexSumAccumulator<float>{x}, "SUM");
 }
 void RTDEF(CppSumComplex8)(CppTypeFor<TypeCategory::Complex, 8> &result,
     const Descriptor &x, const char *source, int line, int dim,
@@ -188,7 +188,8 @@ void RTDEF(CppSumComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
 void RTDEF(SumDim)(Descriptor &result, const Descriptor &x, int dim,
     const char *source, int line, const Descriptor *mask) {
   TypedPartialNumericReduction<IntegerSumAccumulator, RealSumAccumulator,
-      ComplexSumAccumulator>(result, x, dim, source, line, mask, "SUM");
+      ComplexSumAccumulator, /*MIN_REAL_KIND=*/4>(
+      result, x, dim, source, line, mask, "SUM");
 }
 
 RT_EXT_API_GROUP_END
diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90
deleted file mode 100644
index 706b2cb6c245..000000000000
--- a/flang/test/Driver/driver-help-hidden.f90
+++ /dev/null
@@ -1,172 +0,0 @@
-
-!--------------------------
-! FLANG DRIVER (flang-new)
-!--------------------------
-! RUN: %flang --help-hidden 2>&1 | FileCheck %s
-! RUN: not %flang  -help-hidden 2>&1 | FileCheck %s --check-prefix=ERROR-FLANG
-
-!----------------------------------------
-! FLANG FRONTEND DRIVER (flang-new -fc1)
-!----------------------------------------
-! RUN: not %flang_fc1 --help-hidden 2>&1 | FileCheck %s --check-prefix=ERROR-FLANG-FC1
-! RUN: not %flang_fc1  -help-hidden 2>&1 | FileCheck %s --check-prefix=ERROR-FLANG-FC1
-
-! CHECK:USAGE: flang-new
-! CHECK-EMPTY:
-! CHECK-NEXT: DRIVER OPTIONS:
-! CHECK-NEXT:  --driver-mode=<value> Set the driver mode to either 'gcc', 'g++', 'cpp', 'cl' or 'flang'
-! CHECK-EMPTY:
-! CHECK-NEXT:OPTIONS:
-! CHECK-NEXT: -###                    Print (but do not run) the commands to run for this compilation
-! CHECK-NEXT: -ccc-print-phases       Dump list of actions to perform
-! CHECK-NEXT: -cpp                    Enable predefined and command line preprocessor macros
-! CHECK-NEXT: -c                      Only run preprocess, compile, and assemble steps
-! CHECK-NEXT: -dM                     Print macro definitions in -E mode instead of normal output
-! CHECK-NEXT: -dumpmachine            Display the compiler's target processor
-! CHECK-NEXT: -dumpversion            Display the version of the compiler
-! CHECK-NEXT: -D <macro>=<value>      Define <macro> to <value> (or 1 if <value> omitted)
-! CHECK-NEXT: -emit-llvm              Use the LLVM representation for assembler and object files
-! CHECK-NEXT: -E                      Only run the preprocessor
-! CHECK-NEXT: -falternative-parameter-statement
-! CHECK-NEXT:                         Enable the old style PARAMETER statement
-! CHECK-NEXT: -fapprox-func           Allow certain math function calls to be replaced with an approximately equivalent calculation
-! CHECK-NEXT: -fbackslash             Specify that backslash in string introduces an escape character
-! CHECK-NEXT: -fcolor-diagnostics     Enable colors in diagnostics
-! CHECK-NEXT: -fconvert=<value>       Set endian conversion of data for unformatted files
-! CHECK-NEXT: -fdefault-double-8      Set the default double precision kind to an 8 byte wide type
-! CHECK-NEXT: -fdefault-integer-8     Set the default integer and logical kind to an 8 byte wide type
-! CHECK-NEXT: -fdefault-real-8        Set the default real kind to an 8 byte wide type
-! CHECK-NEXT: -ffast-math             Allow aggressive, lossy floating-point optimizations
-! CHECK-NEXT: -ffixed-form            Process source files in fixed form
-! CHECK-NEXT: -ffixed-line-length=<value>
-! CHECK-NEXT:                         Use <value> as character line width in fixed mode
-! CHECK-NEXT: -ffp-contract=<value>   Form fused FP ops (e.g. FMAs)
-! CHECK-NEXT: -ffree-form             Process source files in free form
-! CHECK-NEXT: -fhonor-infinities      Specify that floating-point optimizations are not allowed that assume arguments and results are not +-inf.
-! CHECK-NEXT: -fhonor-nans            Specify that floating-point optimizations are not allowed that assume arguments and results are not NANs.
-! CHECK-NEXT: -fimplicit-none         No implicit typing allowed unless overridden by IMPLICIT statements
-! CHECK-NEXT: -finput-charset=<value> Specify the default character set for source files
-! CHECK-NEXT: -fintegrated-as         Enable the integrated assembler
-! CHECK-NEXT: -fintrinsic-modules-path <dir>
-! CHECK-NEXT:                         Specify where to find the compiled intrinsic modules
-! CHECK-NEXT: -flang-deprecated-no-hlfir
-! CHECK-NEXT:                         Do not use HLFIR lowering (deprecated)
-! CHECK-NEXT: -flang-experimental-hlfir
-! CHECK-NEXT:                         Use HLFIR lowering (experimental)
-! CHECK-NEXT: -flarge-sizes           Use INTEGER(KIND=8) for the result type in size-related intrinsics
-! CHECK-NEXT: -flogical-abbreviations Enable logical abbreviations
-! CHECK-NEXT: -flto=auto              Enable LTO in 'full' mode
-! CHECK-NEXT: -flto=jobserver         Enable LTO in 'full' mode
-! CHECK-NEXT: -flto=<value>           Set LTO mode
-! CHECK-NEXT: -flto                   Enable LTO in 'full' mode
-! CHECK-NEXT: -fms-runtime-lib=<value>
-! CHECK-NEXT:                         Select Windows run-time library
-! CHECK-NEXT: -fno-automatic          Implies the SAVE attribute for non-automatic local objects in subprograms unless RECURSIVE
-! CHECK-NEXT: -fno-color-diagnostics  Disable colors in diagnostics
-! CHECK-NEXT: -fno-integrated-as      Disable the integrated assembler
-! CHECK-NEXT: -fno-lto                Disable LTO mode (default)
-! CHECK-NEXT: -fno-ppc-native-vector-element-order
-! CHECK-NEXT:                         Specifies PowerPC non-native vector element order
-! CHECK-NEXT: -fno-rtlib-add-rpath Do not add -rpath with architecture-specific resource directory to the linker flags. When --hip-link is specified, do not add -rpath with HIP runtime library directory to the linker flags
-! CHECK-NEXT: -fno-signed-zeros       Allow optimizations that ignore the sign of floating point zeros
-! CHECK-NEXT: -fno-stack-arrays       Allocate array temporaries on the heap (default)
-! CHECK-NEXT: -fno-version-loops-for-stride
-! CHECK-NEXT:                         Do not create unit-strided loops (default)
-! CHECK-NEXT: -fomit-frame-pointer    Omit the frame pointer from functions that don't need it. Some stack unwinding cases, such as profilers and sanitizers, may prefer specifying -fno-omit-frame-pointer. On many targets, -O1 and higher omit the frame pointer by default. -m[no-]omit-leaf-frame-pointer takes precedence for leaf functions
-! CHECK-NEXT: -fopenacc               Enable OpenACC
-! CHECK-NEXT: -fopenmp-assume-no-nested-parallelism
-! CHECK-NEXT:                         Assert no nested parallel regions in the GPU
-! CHECK-NEXT: -fopenmp-assume-no-thread-state
-! CHECK-NEXT:                         Assert no thread in a parallel region modifies an ICV
-! CHECK-NEXT: -fopenmp-target-debug   Enable debugging in the OpenMP offloading device RTL
-! CHECK-NEXT: -fopenmp-targets=<value>
-! CHECK-NEXT:                         Specify comma-separated list of triples OpenMP offloading targets to be supported
-! CHECK-NEXT: -fopenmp-version=<value>
-! CHECK-NEXT:                         Set OpenMP version (e.g. 45 for OpenMP 4.5, 51 for OpenMP 5.1). Default value is 11 for Flang
-! CHECK-NEXT: -fopenmp                Parse OpenMP pragmas and generate parallel code.
-! CHECK-NEXT: -foptimization-record-file=<file>
-! CHECK-NEXT:                         Specify the output name of the file containing the optimization remarks. Implies -fsave-optimization-record. On Darwin platforms, this cannot be used with multiple -arch <arch> options.
-! CHECK-NEXT: -foptimization-record-passes=<regex>
-! CHECK-NEXT:                         Only include passes which match a specified regular expression in the generated optimization record (by default, include all passes)
-! CHECK-NEXT: -fpass-plugin=<dsopath> Load pass plugin from a dynamic shared object file (only with new pass manager).
-! CHECK-NEXT: -fppc-native-vector-element-order
-! CHECK-NEXT:                         Specifies PowerPC native vector element order (default)
-! CHECK-NEXT: -freciprocal-math       Allow division operations to be reassociated
-! CHECK-NEXT: -fropi                  Generate read-only position independent code (ARM only)
-! CHECK-NEXT: -frtlib-add-rpath Add -rpath with architecture-specific resource directory to the linker flags. When --hip-link is specified, also add -rpath with HIP runtime library directory to the linker flags
-! CHECK-NEXT: -frwpi                  Generate read-write position independent code (ARM only)
-! CHECK-NEXT: -fsave-optimization-record=<format>
-! CHECK-NEXT:                         Generate an optimization record file in a specific format
-! CHECK-NEXT: -fsave-optimization-record
-! CHECK-NEXT:                         Generate a YAML optimization record file
-! CHECK-NEXT: -fstack-arrays          Attempt to allocate array temporaries on the stack, no matter their size
-! CHECK-NEXT: -fsyntax-only           Run the preprocessor, parser and semantic analysis stages
-! CHECK-NEXT: -funderscoring          Appends one trailing underscore to external names
-! CHECK-NEXT: -fveclib=<value>        Use the given vector functions library
-! CHECK-NEXT: -fversion-loops-for-stride
-! CHECK-NEXT:                         Create unit-strided versions of loops
-! CHECK-NEXT: -fxor-operator          Enable .XOR. as a synonym of .NEQV.
-! CHECK-NEXT: --gcc-install-dir=<value>
-! CHECK-NEXT:                         Use GCC installation in the specified directory. The directory ends with path components like 'lib{,32,64}/gcc{,-cross}/$triple/$version'. Note: executables (e.g. ld) used by the compiler are not overridden by the selected GCC installation
-! CHECK-NEXT: --gcc-toolchain=<value> Specify a directory where Flang can find 'lib{,32,64}/gcc{,-cross}/$triple/$version'. Flang will use the GCC installation with the largest version
-! CHECK-NEXT: -gline-directives-only  Emit debug line info directives only
-! CHECK-NEXT: -gline-tables-only      Emit debug line number tables only
-! CHECK-NEXT: -gpulibc                Link the LLVM C Library for GPUs
-! CHECK-NEXT: -g                      Generate source-level debug information
-! CHECK-NEXT: --help-hidden           Display help for hidden options
-! CHECK-NEXT: -help                   Display available options
-! CHECK-NEXT: -isysroot <dir>         Set the system root directory (usually /)
-! CHECK-NEXT: -I <dir>                Add directory to the end of the list of include search paths
-! CHECK-NEXT: -L <dir>                Add directory to library search path
-! CHECK-NEXT: -march=<value>          For a list of available architectures for the target use '-mcpu=help'
-! CHECK-NEXT: -mcode-object-version=<value>
-! CHECK-NEXT:                         Specify code object ABI version. Defaults to 5. (AMDGPU only)
-! CHECK-NEXT: -mcpu=<value>           For a list of available CPUs for the target use '-mcpu=help'
-! CHECK-NEXT: -mllvm=<arg>            Alias for -mllvm
-! CHECK-NEXT: -mllvm <value>          Additional arguments to forward to LLVM's option processing
-! CHECK-NEXT: -mmlir <value>          Additional arguments to forward to MLIR's option processing
-! CHECK-NEXT: -mno-outline-atomics    Don't generate local calls to out-of-line atomic operations
-! CHECK-NEXT: -module-dir <dir>       Put MODULE files in <dir>
-! CHECK-NEXT: -moutline-atomics       Generate local calls to out-of-line atomic operations
-! CHECK-NEXT: -mrvv-vector-bits=<value>
-! CHECK-NEXT:                         Specify the size in bits of an RVV vector register
-! CHECK-NEXT: -msve-vector-bits=<value>
-! CHECK-NEXT:                          Specify the size in bits of an SVE vector register. Defaults to the vector length agnostic value of "scalable". (AArch64 only)
-! CHECK-NEXT: --no-offload-arch=<value>
-! CHECK-NEXT:                         Remove CUDA/HIP offloading device architecture (e.g. sm_35, gfx906) from the list of devices to compile for. 'all' resets the list to its default value.
-! CHECK-NEXT: -nocpp                  Disable predefined and command line preprocessor macros
-! CHECK-NEXT: -nogpulib               Do not link device library for CUDA/HIP device compilation
-! CHECK-NEXT: --offload-arch=<value>  Specify an offloading device architecture for CUDA, HIP, or OpenMP. (e.g. sm_35). If 'native' is used the compiler will detect locally installed architectures. For HIP offloading, the device architecture can be followed by target ID features delimited by a colon (e.g. gfx908:xnack+:sramecc-). May be specified more than once.
-! CHECK-NEXT: --offload-device-only   Only compile for the offloading device.
-! CHECK-NEXT: --offload-host-device   Compile for both the offloading host and device (default).
-! CHECK-NEXT: --offload-host-only     Only compile for the offloading host.
-! CHECK-NEXT: -o <file>               Write output to <file>
-! CHECK-NEXT: -pedantic               Warn on language extensions
-! CHECK-NEXT: -print-effective-triple Print the effective target triple
-! CHECK-NEXT: -print-target-triple    Print the normalized target triple
-! CHECK-NEXT: -pthread                Support POSIX threads in generated code
-! CHECK-NEXT: -P                      Disable linemarker output in -E mode
-! CHECK-NEXT: -resource-dir <value>   The directory which holds the compiler resource files
-! CHECK-NEXT: --rocm-path=<value> ROCm installation path, used for finding and automatically linking required bitcode libraries.
-! CHECK-NEXT: -Rpass-analysis=<value> Report transformation analysis from optimization passes whose name matches the given POSIX regular expression
-! CHECK-NEXT: -Rpass-missed=<value>   Report missed transformations by optimization passes whose name matches the given POSIX regular expression
-! CHECK-NEXT: -Rpass=<value>          Report transformations performed by optimization passes whose name matches the given POSIX regular expression
-! CHECK-NEXT: -R<remark>              Enable the specified remark
-! CHECK-NEXT: -save-temps=<value>     Save intermediate compilation results.
-! CHECK-NEXT: -save-temps             Alias for --save-temps=cwd
-! CHECK-NEXT: -std=<value>            Language standard to compile for
-! CHECK-NEXT: -S                      Only run preprocess and compilation steps
-! CHECK-NEXT: --target=<value>        Generate code for the given target
-! CHECK-NEXT: -U <macro>              Undefine macro <macro>
-! CHECK-NEXT: --version               Print version information
-! CHECK-NEXT: -v                      Show commands to run and use verbose output
-! CHECK-NEXT: -Wl,<arg>               Pass the comma separated arguments in <arg> to the linker
-! CHECK-NEXT: -W<warning>             Enable the specified warning
-! CHECK-NEXT: -Xflang <arg>           Pass <arg> to the flang compiler
-! CHECK-NEXT: -x <language>           Treat subsequent input files as having type <language>
-
-
-! ERROR-FLANG: error: unknown argument '-help-hidden'; did you mean '--help-hidden'?
-
-! Frontend driver -help-hidden is not supported
-! ERROR-FLANG-FC1: error: unknown argument: '{{.*}}'
diff --git a/flang/test/Driver/w-option.f90 b/flang/test/Driver/w-option.f90
new file mode 100644
index 000000000000..e34cddaab373
--- /dev/null
+++ b/flang/test/Driver/w-option.f90
@@ -0,0 +1,31 @@
+! Test the default setting. Emit warnings only.
+! RUN: %flang -c %s 2>&1 | FileCheck %s -check-prefix=DEFAULT
+
+! Test that the warnings are not generated with `-w` option.
+! RUN: %flang -c -w %s 2>&1 | FileCheck --allow-empty %s -check-prefix=WARNING
+
+! Test that warnings are portability messages are generated.
+! RUN: %flang -c -pedantic %s 2>&1 | FileCheck %s -check-prefixes=DEFAULT,PORTABILITY
+
+! Test that warnings and portability messages are not generated.
+! TODO: Support the last flag wins behaviour.
+! RUN: %flang -c -pedantic -w %s 2>&1 | FileCheck --allow-empty %s -check-prefixes=WARNING,PORTABILITY-WARNING
+! RUN: %flang -c -w -pedantic %s 2>&1 | FileCheck --allow-empty %s -check-prefixes=WARNING,PORTABILITY-WARNING
+! DEFAULT: warning: Label '40' is in a construct that should not be used as a branch target here
+! DEFAULT: warning: Label '50' is in a construct that should not be used as a branch target here
+! WARNING-NOT: warning
+! PORTABILITY: portability: Statement function 'sf1' should not contain an array constructor
+! PORTABILITY-WARNING-NOT: portability
+
+subroutine sub01(n)
+  integer n
+  GOTO (40,50,60) n
+  if (n .eq. 1) then
+40   print *, "xyz"
+50 end if
+60 continue
+end subroutine sub01
+
+subroutine sub02
+  sf1(n) = sum([(j,j=1,n)])
+end subroutine sub02
diff --git a/flang/test/Evaluate/rewrite-out_of_range.F90 b/flang/test/Evaluate/rewrite-out_of_range.F90
index a5cd09cb2853..b5df610ff2fb 100644
--- a/flang/test/Evaluate/rewrite-out_of_range.F90
+++ b/flang/test/Evaluate/rewrite-out_of_range.F90
@@ -1,5 +1,5 @@
 ! Tests rewriting of OUT_OF_RANGE()
-! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -fdebug-unparse -cpp %s 2>&1 | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-X86-64%}
 
 logical round
 
@@ -194,12 +194,12 @@ end
 !CHECK:   PRINT *, " real", 8_4, "real", 8_4, .false._4
 !CHECK:   PRINT *, " real", 8_4, "real", 10_4, .false._4
 !CHECK:   PRINT *, " real", 8_4, "real", 16_4, .false._4
-!CHECK:   PRINT *, " real", 10_4, "real", 2_4, blt(transfer(abs(x)-6.5504e4_10,0_16)-1_16,604444463063240877801471_16)
-!CHECK:   PRINT *, " real", 10_4, "real", 3_4, blt(transfer(abs(x)-3.3895313892515354759047080037148786688e38_10,0_16)-1_16,604444463063240877801471_16)
-!CHECK:   PRINT *, " real", 10_4, "real", 4_4, blt(transfer(abs(x)-3.4028234663852885981170418348451692544e38_10,0_16)-1_16,604444463063240877801471_16)
-!CHECK:   PRINT *, " real", 10_4, "real", 8_4, blt(transfer(abs(x)-1.79769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368e308_10,0_16)-1_16,604444463063240877801471_16)
-!CHECK:   PRINT *, " real", 10_4, "real", 10_4, .false._4
-!CHECK:   PRINT *, " real", 10_4, "real", 16_4, .false._4
+!CHECK-X86-64:   PRINT *, " real", 10_4, "real", 2_4, blt(transfer(abs(x)-6.5504e4_10,0_16)-1_16,604444463063240877801471_16)
+!CHECK-X86-64:   PRINT *, " real", 10_4, "real", 3_4, blt(transfer(abs(x)-3.3895313892515354759047080037148786688e38_10,0_16)-1_16,604444463063240877801471_16)
+!CHECK-X86-64:   PRINT *, " real", 10_4, "real", 4_4, blt(transfer(abs(x)-3.4028234663852885981170418348451692544e38_10,0_16)-1_16,604444463063240877801471_16)
+!CHECK-X86-64:   PRINT *, " real", 10_4, "real", 8_4, blt(transfer(abs(x)-1.79769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368e308_10,0_16)-1_16,604444463063240877801471_16)
+!CHECK-X86-64:   PRINT *, " real", 10_4, "real", 10_4, .false._4
+!CHECK-X86-64:   PRINT *, " real", 10_4, "real", 16_4, .false._4
 !CHECK:   PRINT *, " real", 16_4, "real", 2_4, blt(transfer(abs(x)-6.5504e4_16,0_16)-1_16,170135991163610696904058773219554885631_16)
 !CHECK:   PRINT *, " real", 16_4, "real", 3_4, blt(transfer(abs(x)-3.3895313892515354759047080037148786688e38_16,0_16)-1_16,170135991163610696904058773219554885631_16)
 !CHECK:   PRINT *, " real", 16_4, "real", 4_4, blt(transfer(abs(x)-3.4028234663852885981170418348451692544e38_16,0_16)-1_16,170135991163610696904058773219554885631_16)
diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
index 8cf4f566964f..72cd0a763e71 100644
--- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
+++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
@@ -937,11 +937,68 @@ func.func @omp_map_info_descriptor_type_conversion(%arg0 : !fir.ref<!fir.box<!fi
   %0 = fir.box_offset %arg0 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.llvm_ptr<!fir.ref<i32>>
   // CHECK: %[[MEMBER_MAP:.*]] = omp.map.info var_ptr(%[[GEP]] : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
   %1 = omp.map.info var_ptr(%0 : !fir.llvm_ptr<!fir.ref<i32>>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr<!fir.ref<i32>> {name = ""}
-  // CHECK: %[[DESC_MAP:.*]] = omp.map.info var_ptr(%[[ARG_0]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(always, delete) capture(ByRef) members(%[[MEMBER_MAP]] : !llvm.ptr) -> !llvm.ptr {name = ""}
-  %2 = omp.map.info var_ptr(%arg0 : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.box<!fir.heap<i32>>) map_clauses(always, delete) capture(ByRef) members(%1 : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>> {name = ""}
+  // CHECK: %[[DESC_MAP:.*]] = omp.map.info var_ptr(%[[ARG_0]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(always, delete) capture(ByRef) members(%[[MEMBER_MAP]] : [0] : !llvm.ptr) -> !llvm.ptr {name = ""}
+  %2 = omp.map.info var_ptr(%arg0 : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.box<!fir.heap<i32>>) map_clauses(always, delete) capture(ByRef) members(%1 : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>> {name = ""}
   // CHECK: omp.target_exit_data map_entries(%[[DESC_MAP]] : !llvm.ptr) 
   omp.target_exit_data   map_entries(%2 : !fir.ref<!fir.box<!fir.heap<i32>>>)
   return 
 }
 
 // -----
+
+// CHECK-LABEL:  llvm.func @omp_map_info_derived_type_explicit_member_conversion
+// CHECK-SAME:   %[[ARG_0:.*]]: !llvm.ptr)
+
+func.func @omp_map_info_derived_type_explicit_member_conversion(%arg0 : !fir.ref<!fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>>) {
+  // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ARG_0]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFderived_type", (f32, array<10 x i32>, i32)>
+  %0 = fir.field_index int, !fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>
+  %1 = fir.coordinate_of %arg0, %0 : (!fir.ref<!fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.field) -> !fir.ref<i32>
+  // CHECK: %[[MAP_MEMBER_1:.*]] = omp.map.info var_ptr(%[[GEP]] : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "dtype%int"}
+  %2 = omp.map.info var_ptr(%1 : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "dtype%int"}
+  // CHECK: %[[GEP_2:.*]] = llvm.getelementptr %[[ARG_0]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFderived_type", (f32, array<10 x i32>, i32)>
+  %3 = fir.field_index real, !fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>
+  %4 = fir.coordinate_of %arg0, %3 : (!fir.ref<!fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.field) -> !fir.ref<f32>
+  // CHECK: %[[MAP_MEMBER_2:.*]] = omp.map.info var_ptr(%[[GEP_2]] : !llvm.ptr, f32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "dtype%real"}
+  %5 = omp.map.info var_ptr(%4 : !fir.ref<f32>, f32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<f32> {name = "dtype%real"}    
+  // CHECK: %[[MAP_PARENT:.*]] = omp.map.info var_ptr(%[[ARG_0]] : !llvm.ptr, !llvm.struct<"_QFderived_type", (f32, array<10 x i32>, i32)>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBER_1]], %[[MAP_MEMBER_2]] : [2], [0] : !llvm.ptr, !llvm.ptr) -> !llvm.ptr {name = "dtype", partial_map = true} 
+  %6 = omp.map.info var_ptr(%arg0 : !fir.ref<!fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>) map_clauses(tofrom) capture(ByRef) members(%2, %5 : [2], [0] : !fir.ref<i32>, !fir.ref<f32>) -> !fir.ref<!fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>> {name = "dtype", partial_map = true}
+  // CHECK: omp.target map_entries(%[[MAP_MEMBER_1]] -> %[[ARG_1:.*]], %[[MAP_MEMBER_2]] -> %[[ARG_2:.*]], %[[MAP_PARENT]] -> %[[ARG_3:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+  // CHECK: ^bb0(%[[ARG_1]]: !llvm.ptr, %[[ARG_2]]: !llvm.ptr, %[[ARG_3]]: !llvm.ptr):
+  omp.target map_entries(%2 -> %arg1, %5 -> %arg2, %6 -> %arg3 : !fir.ref<i32>, !fir.ref<f32>, !fir.ref<!fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>>) {
+  ^bb0(%arg1: !fir.ref<f32>, %arg2: !fir.ref<i32>, %arg3: !fir.ref<!fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>>):
+    omp.terminator
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL:  llvm.func @omp_map_info_nested_derived_type_explicit_member_conversion
+// CHECK-SAME:   %[[ARG_0:.*]]: !llvm.ptr)
+
+func.func @omp_map_info_nested_derived_type_explicit_member_conversion(%arg0 : !fir.ref<!fir.type<_QFTtop_layer{array_i:!fir.array<10xi32>,nested:!fir.type<_QFTbottom_layer{array_i2:!fir.array<10xf32>,i2:f64}>,k:i32}>>) {
+    // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ARG_0]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFTtop_layer", (array<10 x i32>, struct<"_QFTbottom_layer", (array<10 x f32>, f64)>, i32)>
+    %0 = fir.field_index nested, !fir.type<_QFTtop_layer{array_i:!fir.array<10xi32>,nested:!fir.type<_QFTbottom_layer{array_i2:!fir.array<10xf32>,i2:f64}>,k:i32}>
+    %1 = fir.coordinate_of %arg0, %0 : (!fir.ref<!fir.type<_QFTtop_layer{array_i:!fir.array<10xi32>,nested:!fir.type<_QFTbottom_layer{array_i2:!fir.array<10xf32>,i2:f64}>,k:i32}>>, !fir.field) -> !fir.ref<!fir.type<_QFTbottom_layer{array_i2:!fir.array<10xf32>,i2:f64}>>
+    // CHECK: %[[GEP_2:.*]] = llvm.getelementptr %[[GEP]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFTbottom_layer", (array<10 x f32>, f64)>
+    %2 = fir.field_index i2, !fir.type<_QFTbottom_layer{array_i2:!fir.array<10xf32>,i2:f64}>
+    %3 = fir.coordinate_of %1, %2 : (!fir.ref<!fir.type<_QFTbottom_layer{array_i2:!fir.array<10xf32>,i2:f64}>>, !fir.field) -> !fir.ref<f64>
+    // CHECK: %[[MAP_MEMBER_1:.*]] = omp.map.info var_ptr(%[[GEP_2]] : !llvm.ptr, f64) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr
+    %4 = omp.map.info var_ptr(%3 : !fir.ref<f64>, f64) map_clauses(tofrom) capture(ByRef) -> !fir.ref<f64>
+    // CHECK: %[[GEP_3:.*]] = llvm.getelementptr %[[ARG_0]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFTtop_layer", (array<10 x i32>, struct<"_QFTbottom_layer", (array<10 x f32>, f64)>, i32)>
+    %5 = fir.field_index k, !fir.type<_QFTtop_layer{array_i:!fir.array<10xi32>,nested:!fir.type<_QFTbottom_layer{array_i2:!fir.array<10xf32>,i2:f64}>,k:i32}>
+    %6 = fir.coordinate_of %arg0, %5 : (!fir.ref<!fir.type<_QFTtop_layer{array_i:!fir.array<10xi32>,nested:!fir.type<_QFTbottom_layer{array_i2:!fir.array<10xf32>,i2:f64}>,k:i32}>>, !fir.field) -> !fir.ref<i32>
+    // CHECK: %[[MAP_MEMBER_2:.*]] = omp.map.info var_ptr(%[[GEP_3]] : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr
+    %7 = omp.map.info var_ptr(%6 : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32>
+    // CHECK: %[[PARENT_MAP:.*]] = omp.map.info var_ptr(%[[ARG_0]] : !llvm.ptr, !llvm.struct<"_QFTtop_layer", (array<10 x i32>, struct<"_QFTbottom_layer", (array<10 x f32>, f64)>, i32)>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBER_1]], %[[MAP_MEMBER_2]] : [1,1], [2,-1] : !llvm.ptr, !llvm.ptr) -> !llvm.ptr {partial_map = true}
+    %9 = omp.map.info var_ptr(%arg0 : !fir.ref<!fir.type<_QFTtop_layer{array_i:!fir.array<10xi32>,nested:!fir.type<_QFTbottom_layer{array_i2:!fir.array<10xf32>,i2:f64}>,k:i32}>>, !fir.type<_QFTtop_layer{array_i:!fir.array<10xi32>,nested:!fir.type<_QFTbottom_layer{array_i2:!fir.array<10xf32>,i2:f64}>,k:i32}>) map_clauses(tofrom) capture(ByRef) members(%4, %7 : [1,1], [2,-1] : !fir.ref<f64>, !fir.ref<i32>) -> !fir.ref<!fir.type<_QFTtop_layer{array_i:!fir.array<10xi32>,nested:!fir.type<_QFTbottom_layer{array_i2:!fir.array<10xf32>,i2:f64}>,k:i32}>> {partial_map = true}
+    // CHECK: omp.target map_entries(%[[MAP_MEMBER_1]] -> %{{.*}}, %[[MAP_MEMBER_2]] -> %{{.*}}, %[[PARENT_MAP]] -> %{{.*}} : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+    // CHECK: ^bb0(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr):
+    omp.target map_entries(%4 -> %arg1, %7 -> %arg2, %9 -> %arg3 : !fir.ref<f64>, !fir.ref<i32>, !fir.ref<!fir.type<_QFTtop_layer{array_i:!fir.array<10xi32>,nested:!fir.type<_QFTbottom_layer{array_i2:!fir.array<10xf32>,i2:f64}>,k:i32}>>) {
+     ^bb0(%arg1: !fir.ref<i32>, %arg2: !fir.ref<f64>, %arg3: !fir.ref<!fir.type<_QFTtop_layer{array_i:!fir.array<10xi32>,nested:!fir.type<_QFTbottom_layer{array_i2:!fir.array<10xf32>,i2:f64}>,k:i32}>>):
+      omp.terminator
+    }
+  return
+}
+
+// -----
diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir
index 6c533a32ccf9..5a12e3c1a4bf 100644
--- a/flang/test/Fir/cuf-invalid.fir
+++ b/flang/test/Fir/cuf-invalid.fir
@@ -85,3 +85,21 @@ func.func @_QPsub1() {
   %13 = fir.cuda_deallocate %11 : !fir.ref<!fir.box<none>> errmsg(%16 : !fir.box<none>) {cuda_attr = #fir.cuda<device>} -> i32
   return
 }
+
+// -----
+
+func.func @_QPsub1() {
+  // expected-error@+1{{'fir.cuda_alloc' op expect device, managed or unified cuda attribute}}
+  %0 = fir.cuda_alloc f32 {bindc_name = "r", cuda_attr = #fir.cuda<pinned>, uniq_name = "_QFsub1Er"} -> !fir.ref<f32>
+  fir.cuda_free %0 : !fir.ref<f32> {cuda_attr = #fir.cuda<constant>}
+  return
+}
+
+// -----
+
+func.func @_QPsub1() {
+  %0 = fir.cuda_alloc f32 {bindc_name = "r", cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub1Er"} -> !fir.ref<f32>
+  // expected-error@+1{{'fir.cuda_free' op expect device, managed or unified cuda attribute}}
+  fir.cuda_free %0 : !fir.ref<f32> {cuda_attr = #fir.cuda<constant>}
+  return
+}
diff --git a/flang/test/Fir/dispatch.f90 b/flang/test/Fir/dispatch.f90
index 1479d611b986..fc935217defa 100644
--- a/flang/test/Fir/dispatch.f90
+++ b/flang/test/Fir/dispatch.f90
@@ -184,7 +184,7 @@ end
 
 ! CHECK-LABEL: func.func @_QMdispatch1Pdisplay_class(
 ! CHECK-SAME: %[[ARG:.*]]: [[CLASS:!fir.class<.*>>]]
-! CHECK: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]] {uniq_name = "_QMdispatch1Fdisplay_classEp"} : (!fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>) -> (!fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>, !fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>)
+! CHECK: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMdispatch1Fdisplay_classEp"} : (!fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>, !fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>)
 
 ! Check dynamic dispatch equal to `call p%display2()` with binding index = 2.
 ! CHECK: %[[BOXDESC:.*]] = fir.box_tdesc %[[ARG_DECL]]#0 : ([[CLASS]]) -> !fir.tdesc<none>
diff --git a/flang/test/HLFIR/assumed-type-actual-args.f90 b/flang/test/HLFIR/assumed-type-actual-args.f90
index dbdfc1785ce9..7ce1067d7acd 100644
--- a/flang/test/HLFIR/assumed-type-actual-args.f90
+++ b/flang/test/HLFIR/assumed-type-actual-args.f90
@@ -104,30 +104,34 @@ end subroutine
 
 ! CHECK-LABEL:   func.func @_QPtest1(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<none> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest1Ex"} : (!fir.ref<none>) -> (!fir.ref<none>, !fir.ref<none>)
+! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest1Ex"} : (!fir.ref<none>, !fir.dscope) -> (!fir.ref<none>, !fir.ref<none>)
 ! CHECK:           fir.call @_QPs1(%[[VAL_1]]#1) fastmath<contract> : (!fir.ref<none>) -> ()
 ! CHECK:           return
 ! CHECK:         }
 
 ! CHECK-LABEL:   func.func @_QPtest2(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<!fir.array<?xnone>> {fir.bindc_name = "x"}) {
+! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK:           %[[VAL_1:.*]] = arith.constant -1 : index
 ! CHECK:           %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFtest2Ex"} : (!fir.ref<!fir.array<?xnone>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.array<?xnone>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest2Ex"} : (!fir.ref<!fir.array<?xnone>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.array<?xnone>>)
 ! CHECK:           fir.call @_QPs2(%[[VAL_3]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xnone>>) -> ()
 ! CHECK:           return
 ! CHECK:         }
 
 ! CHECK-LABEL:   func.func @_QPtest3(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest3Ex"} : (!fir.box<!fir.array<?xnone>>) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
+! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest3Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
 ! CHECK:           fir.call @_QPs3(%[[VAL_1]]#0) fastmath<contract> : (!fir.box<!fir.array<?xnone>>) -> ()
 ! CHECK:           return
 ! CHECK:         }
 
 ! CHECK-LABEL:   func.func @_QPtest4(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest4Ex"} : (!fir.box<!fir.array<?xnone>>) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
+! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest4Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> (!fir.box<!fir.array<?xnone>>, i1)
 ! CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.box<!fir.array<?xnone>>) -> !fir.ref<!fir.array<?xnone>>
 ! CHECK:           fir.call @_QPs4(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.array<?xnone>>) -> ()
@@ -137,7 +141,8 @@ end subroutine
 
 ! CHECK-LABEL:   func.func @_QPtest3b(
 ! CHECK-SAME:                         %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.optional}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest3bEx"} : (!fir.box<!fir.array<?xnone>>) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
+! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest3bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> i1
 ! CHECK:           %[[VAL_3:.*]]:4 = fir.if %[[VAL_2]] -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>, i1, !fir.box<!fir.array<?xnone>>) {
 ! CHECK:             %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> (!fir.box<!fir.array<?xnone>>, i1)
@@ -156,7 +161,8 @@ end subroutine
 
 ! CHECK-LABEL:   func.func @_QPtest4b(
 ! CHECK-SAME:                         %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.optional}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest4bEx"} : (!fir.box<!fir.array<?xnone>>) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
+! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest4bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> i1
 ! CHECK:           %[[VAL_3:.*]]:4 = fir.if %[[VAL_2]] -> (!fir.ref<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>, i1, !fir.box<!fir.array<?xnone>>) {
 ! CHECK:             %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> (!fir.box<!fir.array<?xnone>>, i1)
@@ -176,7 +182,8 @@ end subroutine
 
 ! CHECK-LABEL:   func.func @_QPtest4c(
 ! CHECK-SAME:                         %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.contiguous, fir.optional}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFtest4cEx"} : (!fir.box<!fir.array<?xnone>>) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
+! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFtest4cEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> i1
 ! CHECK:           %[[VAL_3:.*]] = fir.if %[[VAL_2]] -> (!fir.ref<!fir.array<?xnone>>) {
 ! CHECK:             %[[VAL_4:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.box<!fir.array<?xnone>>) -> !fir.ref<!fir.array<?xnone>>
@@ -191,7 +198,8 @@ end subroutine
 
 ! CHECK-LABEL:   func.func @_QPtest4d(
 ! CHECK-SAME:                         %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.contiguous}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QFtest4dEx"} : (!fir.box<!fir.array<?xnone>>) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
+! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QFtest4dEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.box<!fir.array<?xnone>>) -> !fir.ref<!fir.array<?xnone>>
 ! CHECK:           fir.call @_QPs4d(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?xnone>>) -> ()
 ! CHECK:           return
@@ -199,7 +207,8 @@ end subroutine
 
 ! CHECK-LABEL:   func.func @_QPtest5(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest5Ex"} : (!fir.box<!fir.array<?xnone>>) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
+! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest5Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> !fir.box<!fir.array<*:none>>
 ! CHECK:           fir.call @_QPs5(%[[VAL_2]]) fastmath<contract> : (!fir.box<!fir.array<*:none>>) -> ()
 ! CHECK:           return
@@ -207,7 +216,8 @@ end subroutine
 
 ! CHECK-LABEL:   func.func @_QPtest5b(
 ! CHECK-SAME:                         %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.optional}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest5bEx"} : (!fir.box<!fir.array<?xnone>>) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
+! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest5bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> i1
 ! CHECK:           %[[VAL_3:.*]]:4 = fir.if %[[VAL_2]] -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>, i1, !fir.box<!fir.array<?xnone>>) {
 ! CHECK:             %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> (!fir.box<!fir.array<?xnone>>, i1)
diff --git a/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 b/flang/test/HLFIR/assumed_shape_with_value_keyword.f90
index b5080d9bedca..da3dff16382c 100644
--- a/flang/test/HLFIR/assumed_shape_with_value_keyword.f90
+++ b/flang/test/HLFIR/assumed_shape_with_value_keyword.f90
@@ -9,7 +9,7 @@ end
 
 ! CHECK-LABEL:  func.func @_QPtest_integer_value1(
 ! CHECK-SAME:     %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "x"}) {
-! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_integer_value1Ex"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_integer_value1Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:          %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, i1)
 ! CHECK:          %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
 ! CHECK:          fir.call @_QPinternal_call1(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?xi32>>) -> ()
@@ -23,7 +23,7 @@ subroutine test_integer_value2(x)
 end
 ! CHECK-LABEL:  func.func @_QPtest_integer_value2(
 ! CHECK-SAME:     %[[ARG0:.*]]: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "x"}) {
-! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_integer_value2Ex"} : (!fir.box<!fir.array<?x?xi32>>) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
+! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_integer_value2Ex"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
 ! CHECK:          %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 : (!fir.box<!fir.array<?x?xi32>>) -> (!fir.box<!fir.array<?x?xi32>>, i1)
 ! CHECK:          %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?xi32>>) -> !fir.ref<!fir.array<?x?xi32>>
 ! CHECK:          fir.call @_QPinternal_call2(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?x?xi32>>) -> ()
@@ -37,7 +37,7 @@ subroutine test_real_value1(x)
 end
 ! CHECK-LABEL:  func.func @_QPtest_real_value1(
 ! CHECK-SAME:     %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
-! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_real_value1Ex"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_real_value1Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:          %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, i1)
 ! CHECK:          %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
 ! CHECK:          fir.call @_QPinternal_call3(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?xf32>>) -> ()
@@ -51,7 +51,7 @@ subroutine test_real_value2(x)
 end
 ! CHECK-LABEL:  func.func @_QPtest_real_value2(
 ! CHECK-SAME:     %[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"}) {
-! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_real_value2Ex"} : (!fir.box<!fir.array<?x?xf32>>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_real_value2Ex"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
 ! CHECK:          %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 : (!fir.box<!fir.array<?x?xf32>>) -> (!fir.box<!fir.array<?x?xf32>>, i1)
 ! CHECK:          %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 ! CHECK:          fir.call @_QPinternal_call4(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?x?xf32>>) -> ()
@@ -65,7 +65,7 @@ subroutine test_complex_value1(x)
 end
 ! CHECK-LABEL:  func.func @_QPtest_complex_value1(
 ! CHECK-SAME:     %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.complex<4>>> {fir.bindc_name = "x"}) {
-! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_complex_value1Ex"} : (!fir.box<!fir.array<?x!fir.complex<4>>>) -> (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.box<!fir.array<?x!fir.complex<4>>>)
+! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_complex_value1Ex"} : (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.box<!fir.array<?x!fir.complex<4>>>)
 ! CHECK:          %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 : (!fir.box<!fir.array<?x!fir.complex<4>>>) -> (!fir.box<!fir.array<?x!fir.complex<4>>>, i1)
 ! CHECK:          %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?x!fir.complex<4>>>) -> !fir.ref<!fir.array<?x!fir.complex<4>>>
 ! CHECK:          fir.call @_QPinternal_call5(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?x!fir.complex<4>>>) -> ()
@@ -79,7 +79,7 @@ subroutine test_complex_value2(x)
 end
 ! CHECK-LABEL:  func.func @_QPtest_complex_value2(
 ! CHECK-SAME:     %[[ARG0:.*]]: !fir.box<!fir.array<?x?x!fir.complex<4>>> {fir.bindc_name = "x"}) {
-! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_complex_value2Ex"} : (!fir.box<!fir.array<?x?x!fir.complex<4>>>) -> (!fir.box<!fir.array<?x?x!fir.complex<4>>>, !fir.box<!fir.array<?x?x!fir.complex<4>>>)
+! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_complex_value2Ex"} : (!fir.box<!fir.array<?x?x!fir.complex<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x!fir.complex<4>>>, !fir.box<!fir.array<?x?x!fir.complex<4>>>)
 ! CHECK:          %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 : (!fir.box<!fir.array<?x?x!fir.complex<4>>>) -> (!fir.box<!fir.array<?x?x!fir.complex<4>>>, i1)
 ! CHECK:          %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?x!fir.complex<4>>>) -> !fir.ref<!fir.array<?x?x!fir.complex<4>>>
 ! CHECK:          fir.call @_QPinternal_call6(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?x?x!fir.complex<4>>>) -> ()
@@ -95,7 +95,7 @@ subroutine test_optional1(x)
 end
 ! CHECK-LABEL:  func.func @_QPtest_optional1(
 ! CHECK-SAME:     %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x", fir.optional}) {
-! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional1Ex"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional1Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:          %[[VAL_1:.*]] = fir.is_present %[[VAL_0]]#1 : (!fir.box<!fir.array<?xf32>>) -> i1
 ! CHECK:          fir.if %[[VAL_1:.*]] {
 ! CHECK:            %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, i1)
@@ -115,7 +115,7 @@ subroutine test_optional2(x)
 end
 ! CHECK-LABEL:  func.func @_QPtest_optional2(
 ! CHECK-SAME:     %[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x", fir.optional}) {
-! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional2Ex"} : (!fir.box<!fir.array<?x?xf32>>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional2Ex"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
 ! CHECK:          %[[VAL_1:.*]] = fir.is_present %[[VAL_0]]#1 : (!fir.box<!fir.array<?x?xf32>>) -> i1
 ! CHECK:          fir.if %[[VAL_1:.*]] {
 ! CHECK:            %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 : (!fir.box<!fir.array<?x?xf32>>) -> (!fir.box<!fir.array<?x?xf32>>, i1)
@@ -135,7 +135,7 @@ subroutine test_optional3(x)
 end
 ! CHECK-LABEL:  func.func @_QPtest_optional3(
 ! CHECK-SAME:     %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x", fir.optional}) {
-! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional3Ex"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:          %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional3Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:          %[[VAL_1:.*]] = fir.is_present %[[VAL_0]]#1 : (!fir.box<!fir.array<?xf32>>) -> i1
 ! CHECK:          cf.cond_br %[[VAL_1]], ^bb1, ^bb2
 ! CHECK:          b1:  // pred: ^bb0
@@ -146,4 +146,4 @@ end
 ! CHECK:          fir.unreachable
 ! CHECK:          b2:  // pred: ^bb0
 ! CHECK:          return
-! CHECK:        }
-\ No newline at end of file
+! CHECK:        }
diff --git a/flang/test/HLFIR/boxchar_emboxing.f90 b/flang/test/HLFIR/boxchar_emboxing.f90
index fbc41bbea72d..c25a5c283e36 100644
--- a/flang/test/HLFIR/boxchar_emboxing.f90
+++ b/flang/test/HLFIR/boxchar_emboxing.f90
@@ -2,7 +2,7 @@
 
 ! CHECK-LABEL:   func.func @_QPtest1(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.class<none> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest1Ex"} : (!fir.class<none>) -> (!fir.class<none>, !fir.class<none>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>)
 ! CHECK:           fir.select_type %[[VAL_1]]#1 : !fir.class<none> [#fir.type_is<!fir.char<1,?>>, ^bb1, unit, ^bb2]
 ! CHECK:         ^bb1:
 ! CHECK:           %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.class<none>) -> !fir.ref<!fir.char<1,?>>
@@ -44,7 +44,7 @@ end subroutine test1
 
 ! CHECK-LABEL:   func.func @_QPtest2(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.class<!fir.array<10xnone>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest2Ex"} : (!fir.class<!fir.array<10xnone>>) -> (!fir.class<!fir.array<10xnone>>, !fir.class<!fir.array<10xnone>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.class<!fir.array<10xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10xnone>>, !fir.class<!fir.array<10xnone>>)
 ! CHECK:           fir.select_type %[[VAL_1]]#1 : !fir.class<!fir.array<10xnone>> [#fir.type_is<!fir.char<1,?>>, ^bb1, unit, ^bb2]
 ! CHECK:         ^bb1:
 ! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#1 : (!fir.class<!fir.array<10xnone>>) -> !fir.box<!fir.array<10x!fir.char<1,?>>>
diff --git a/flang/test/HLFIR/c_ptr_byvalue.f90 b/flang/test/HLFIR/c_ptr_byvalue.f90
index 45e17c0ff630..377c9fccbee3 100644
--- a/flang/test/HLFIR/c_ptr_byvalue.f90
+++ b/flang/test/HLFIR/c_ptr_byvalue.f90
@@ -22,7 +22,8 @@ end
 
 ! CHECK-LABEL:   func.func @_QPtest2(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "cptr"}) {
-! CHECK:           %[[VAL_97:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest2Ecptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_97:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest2Ecptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
 ! CHECK:           %[[VAL_98:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
 ! CHECK:           %[[VAL_99:.*]] = fir.coordinate_of %[[VAL_97]]#0, %[[VAL_98]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
 ! CHECK:           %[[VAL_100:.*]] = fir.load %[[VAL_99]] : !fir.ref<i64>
diff --git a/flang/test/HLFIR/call_with_poly_dummy.f90 b/flang/test/HLFIR/call_with_poly_dummy.f90
index 00a795c5b1fb..93cd410428f7 100644
--- a/flang/test/HLFIR/call_with_poly_dummy.f90
+++ b/flang/test/HLFIR/call_with_poly_dummy.f90
@@ -22,7 +22,8 @@ end subroutine test1
 
 ! CHECK-LABEL:   func.func @_QPtest2(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest2Ex"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest2Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<f32>
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32
 ! CHECK:           %[[VAL_4:.*]] = arith.cmpf oeq, %[[VAL_2]], %[[VAL_3]] {{.*}} : f32
diff --git a/flang/test/HLFIR/optional_dummy.f90 b/flang/test/HLFIR/optional_dummy.f90
index 0f1a8d5b9c39..8534a414eaaf 100644
--- a/flang/test/HLFIR/optional_dummy.f90
+++ b/flang/test/HLFIR/optional_dummy.f90
@@ -5,7 +5,7 @@
 
 ! CHECK-LABEL:   func.func @_QPtest(
 ! CHECK-SAME:        %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "ext_buf", fir.contiguous, fir.optional}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFtestEext_buf"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFtestEext_buf"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#1 : (!fir.box<!fir.array<?xi32>>) -> i1
 ! CHECK:           cf.cond_br %[[VAL_2]], ^bb1, ^bb2
 ! CHECK:         ^bb1:
diff --git a/flang/test/HLFIR/order_assignments/where-scheduling.f90 b/flang/test/HLFIR/order_assignments/where-scheduling.f90
index 0f79058a6ae9..d3665d234a71 100644
--- a/flang/test/HLFIR/order_assignments/where-scheduling.f90
+++ b/flang/test/HLFIR/order_assignments/where-scheduling.f90
@@ -134,7 +134,7 @@ end subroutine
 !CHECK-NEXT: run 1 save    : where/mask
 !CHECK-NEXT: run 2 evaluate: where/region_assign1
 !CHECK-LABEL: ------------ scheduling where in _QPonly_once ------------
-!CHECK-NEXT: unknown effect: %9 = fir.call @llvm.stacksave.p0() fastmath<contract> : () -> !fir.ref<i8>
+!CHECK-NEXT: unknown effect: %{{[0-9]+}} = fir.call @llvm.stacksave.p0() fastmath<contract> : () -> !fir.ref<i8>
 !CHECK-NEXT: run 1 save  (w): where/mask
 !CHECK-NEXT: run 2 evaluate: where/region_assign1
 !CHECK-NEXT: run 3 evaluate: where/region_assign2
@@ -172,12 +172,12 @@ end subroutine
 !CHECK-NEXT: run 1 save    : forall/where1/region_assign1/rhs
 !CHECK-NEXT: run 2 evaluate: forall/where1/region_assign1
 !CHECK-LABEL: ------------ scheduling where in _QFno_need_to_make_lhs_tempPinternal ------------
-!CHECK-NEXT: conflict: R/W: %7 = fir.load %6 : !fir.llvm_ptr<!fir.ref<i32>> W:%13 = fir.load %12 : !fir.ref<!fir.box<!fir.array<?x?xi32>>>
+!CHECK-NEXT: conflict: R/W: %{{[0-9]+}} = fir.load %{{[0-9]+}} : !fir.llvm_ptr<!fir.ref<i32>> W:%{{[0-9]+}} = fir.load %{{[0-9]+}} : !fir.ref<!fir.box<!fir.array<?x?xi32>>>
 !CHECK-NEXT: run 1 save    : where/mask
 !CHECK-NEXT: run 2 evaluate: where/region_assign1
 !CHECK-NEXT: ------------ scheduling where in _QPwhere_construct_unknown_conflict ------------
 !CHECK-NEXT: unknown effect: %{{.*}} = fir.call @_QPf() fastmath<contract> : () -> f32
-!CHECK-NEXT: conflict: R/W: %{{.*}} = hlfir.declare %{{.*}} {uniq_name = "_QFwhere_construct_unknown_conflictEmask"} : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) W:<unknown>
+!CHECK-NEXT: conflict: R/W: %{{.*}} = hlfir.declare %{{.*}} {uniq_name = "_QFwhere_construct_unknown_conflictEmask"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) W:<unknown>
 !CHECK-NEXT: run 1 save    : where/mask
 !CHECK-NEXT: unknown effect: %{{.*}} = fir.call @_QPf() fastmath<contract> : () -> f32
 !CHECK-NEXT: run 2 save  (w): where/region_assign1/rhs
@@ -185,9 +185,9 @@ end subroutine
 !CHECK-NEXT: ------------ scheduling where in _QPelsewhere_construct_unknown_conflict ------------
 !CHECK-NEXT: run 1 evaluate: where/region_assign1
 !CHECK-NEXT: unknown effect: %{{.*}} = fir.call @_QPf() fastmath<contract> : () -> f32
-!CHECK-NEXT: conflict: R/W: %{{.*}} = hlfir.declare %{{.*}} {uniq_name = "_QFelsewhere_construct_unknown_conflictEmask1"} : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) W:<unknown>
+!CHECK-NEXT: conflict: R/W: %{{.*}} = hlfir.declare %{{.*}} {uniq_name = "_QFelsewhere_construct_unknown_conflictEmask1"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) W:<unknown>
 !CHECK-NEXT: run 2 save    : where/mask
-!CHECK-NEXT: conflict: R/W: %{{.*}} = hlfir.declare %{{.*}} {uniq_name = "_QFelsewhere_construct_unknown_conflictEmask2"} : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) W:<unknown>
+!CHECK-NEXT: conflict: R/W: %{{.*}} = hlfir.declare %{{.*}} {uniq_name = "_QFelsewhere_construct_unknown_conflictEmask2"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) W:<unknown>
 !CHECK-NEXT: run 2 save    : where/elsewhere1/mask
 !CHECK-NEXT: unknown effect: %{{.*}} = fir.call @_QPf() fastmath<contract> : () -> f32
 !CHECK-NEXT: run 3 save  (w): where/elsewhere1/region_assign1/rhs
diff --git a/flang/test/Integration/OpenMP/map-types-and-sizes.f90 b/flang/test/Integration/OpenMP/map-types-and-sizes.f90
index 283ac227f343..f3a20690f05a 100644
--- a/flang/test/Integration/OpenMP/map-types-and-sizes.f90
+++ b/flang/test/Integration/OpenMP/map-types-and-sizes.f90
@@ -69,6 +69,147 @@ subroutine mapType_allocatable_explicit
   !$omp end target
   deallocate(a)
 end subroutine mapType_allocatable_explicit
+ 
+!CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 48]
+!CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 547]
+subroutine mapType_derived_implicit
+  type :: scalar_and_array
+    real(4) :: real
+    integer(4) :: array(10)
+    integer(4) :: int
+  end type scalar_and_array
+  type(scalar_and_array) :: scalar_arr 
+  
+  !$omp target
+     scalar_arr%int = 1
+  !$omp end target
+end subroutine mapType_derived_implicit
+
+!CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 48]
+!CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 35]
+subroutine mapType_derived_explicit
+  type :: scalar_and_array
+    real(4) :: real
+    integer(4) :: array(10)
+    integer(4) :: int
+  end type scalar_and_array
+  type(scalar_and_array) :: scalar_arr 
+  
+  !$omp target map(tofrom: scalar_arr)
+     scalar_arr%int = 1
+  !$omp end target
+end subroutine mapType_derived_explicit
+
+!CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 40]
+!CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 35]
+subroutine mapType_derived_explicit_single_member
+  type :: scalar_and_array
+    real(4) :: real
+    integer(4) :: array(10)
+    integer(4) :: int
+  end type scalar_and_array
+  type(scalar_and_array) :: scalar_arr 
+  
+  !$omp target map(tofrom: scalar_arr%array)
+     scalar_arr%array(1) = 1
+  !$omp end target
+end subroutine mapType_derived_explicit_single_member
+
+!CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [3 x i64] [i64 0, i64 4, i64 4]
+!CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [3 x i64] [i64 32, i64 281474976710659, i64 281474976710659]
+subroutine mapType_derived_explicit_multiple_members
+  type :: scalar_and_array
+    real(4) :: real
+    integer(4) :: array(10)
+    integer(4) :: int
+  end type scalar_and_array
+  type(scalar_and_array) :: scalar_arr 
+  
+  !$omp target map(tofrom: scalar_arr%int, scalar_arr%real)
+     scalar_arr%int = 1
+  !$omp end target
+end subroutine mapType_derived_explicit_multiple_members
+
+!CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 16]
+!CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 35]
+subroutine mapType_derived_explicit_member_with_bounds
+  type :: scalar_and_array
+    real(4) :: real
+    integer(4) :: array(10)
+    integer(4) :: int
+  end type scalar_and_array
+  type(scalar_and_array) :: scalar_arr 
+  
+  !$omp target map(tofrom: scalar_arr%array(2:5))
+     scalar_arr%array(3) = 3
+  !$omp end target
+end subroutine mapType_derived_explicit_member_with_bounds
+
+!CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 4]
+!CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 35]
+subroutine mapType_derived_explicit_nested_single_member
+  type :: nested
+    integer(4) :: int
+    real(4) :: real
+    integer(4) :: array(10)
+  end type nested
+
+  type :: scalar_and_array
+    real(4) :: real
+    integer(4) :: array(10)
+    type(nested) :: nest
+    integer(4) :: int
+  end type scalar_and_array
+  type(scalar_and_array) :: scalar_arr 
+  
+  !$omp target map(tofrom: scalar_arr%nest%real)
+    scalar_arr%nest%real = 1
+  !$omp end target
+end subroutine mapType_derived_explicit_nested_single_member
+
+!CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [3 x i64] [i64 0, i64 4, i64 4]
+!CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [3 x i64] [i64 32, i64 281474976710659, i64 281474976710659]
+subroutine mapType_derived_explicit_multiple_nested_members
+  type :: nested
+    integer(4) :: int
+    real(4) :: real
+    integer(4) :: array(10)
+  end type nested
+
+  type :: scalar_and_array
+    real(4) :: real
+    integer(4) :: array(10)
+    type(nested) :: nest
+    integer(4) :: int
+  end type scalar_and_array
+  type(scalar_and_array) :: scalar_arr 
+  
+!$omp target map(tofrom: scalar_arr%nest%int, scalar_arr%nest%real)
+  scalar_arr%nest%int = 1
+!$omp end target
+end subroutine mapType_derived_explicit_multiple_nested_members
+
+!CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 16]
+!CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 35]
+subroutine mapType_derived_explicit_nested_member_with_bounds
+  type :: nested
+    integer(4) :: int
+    real(4) :: real
+    integer(4) :: array(10)
+  end type nested
+
+  type :: scalar_and_array
+    real(4) :: real
+    integer(4) :: array(10)
+    type(nested) :: nest
+    integer(4) :: int
+  end type scalar_and_array
+  type(scalar_and_array) :: scalar_arr 
+  
+!$omp target map(tofrom: scalar_arr%nest%array(2:5))
+    scalar_arr%nest%array(3) = 3
+!$omp end target
+end subroutine mapType_derived_explicit_nested_member_with_bounds
 
 !CHECK: @.offload_sizes{{.*}} = private unnamed_addr constant [2 x i64] [i64 8, i64 4]
 !CHECK: @.offload_maptypes{{.*}} = private unnamed_addr constant [2 x i64] [i64 544, i64 800]
@@ -100,7 +241,6 @@ end subroutine mapType_char
 !CHECK: %[[OFFLOAD_SIZE_ARR:.*]] = getelementptr inbounds [3 x i64], ptr %.offload_sizes, i32 0, i32 0
 !CHECK: store i64 %[[DIV]], ptr %[[OFFLOAD_SIZE_ARR]], align 8
 
-
 !CHECK-LABEL: define {{.*}} @{{.*}}maptype_allocatable_explicit_{{.*}}
 !CHECK: %[[ALLOCA:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8
 !CHECK: %[[ALLOCA_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[ALLOCA]], i32 1
@@ -110,3 +250,99 @@ end subroutine mapType_char
 !CHECK: %[[DIV:.*]] = sdiv exact i64 %[[SIZE_DIFF]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
 !CHECK: %[[OFFLOAD_SIZE_ARR:.*]] = getelementptr inbounds [3 x i64], ptr %.offload_sizes, i32 0, i32 0
 !CHECK: store i64 %[[DIV]], ptr %[[OFFLOAD_SIZE_ARR]], align 8
+
+!CHECK-LABEL: define {{.*}} @{{.*}}maptype_derived_implicit_{{.*}}
+!CHECK: %[[ALLOCA:.*]] = alloca %_QFmaptype_derived_implicitTscalar_and_array, i64 1, align 8
+!CHECK: %[[BASE_PTR_ARR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+!CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR]], align 8
+!CHECK: %[[OFFLOAD_PTR_ARR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+!CHECK: store ptr %[[ALLOCA]], ptr %[[OFFLOAD_PTR_ARR]], align 8
+
+!CHECK-LABEL: define {{.*}} @{{.*}}maptype_derived_explicit_{{.*}}
+!CHECK: %[[ALLOCA:.*]] = alloca %_QFmaptype_derived_explicitTscalar_and_array, i64 1, align 8
+!CHECK: %[[BASE_PTR_ARR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+!CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR]], align 8
+!CHECK: %[[OFFLOAD_PTR_ARR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+!CHECK: store ptr %[[ALLOCA]], ptr %[[OFFLOAD_PTR_ARR]], align 8
+
+!CHECK-LABEL: define {{.*}} @{{.*}}maptype_derived_explicit_single_member_{{.*}}
+!CHECK: %[[ALLOCA:.*]] = alloca %_QFmaptype_derived_explicit_single_memberTscalar_and_array, i64 1, align 8
+!CHECK: %[[MEMBER_ACCESS:.*]] = getelementptr %_QFmaptype_derived_explicit_single_memberTscalar_and_array, ptr %[[ALLOCA]], i32 0, i32 1
+!CHECK: %[[ARR_OFF:.*]] = getelementptr inbounds [10 x i32], ptr %[[MEMBER_ACCESS]], i64 0, i64 0
+!CHECK: %[[BASE_PTR_ARR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+!CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR]], align 8
+!CHECK: %[[OFFLOAD_PTR_ARR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+!CHECK: store ptr %[[ARR_OFF]], ptr %[[OFFLOAD_PTR_ARR]], align 8
+
+!CHECK-LABEL: define {{.*}} @{{.*}}maptype_derived_explicit_multiple_members_{{.*}}
+!CHECK: %[[ALLOCA:.*]] = alloca %_QFmaptype_derived_explicit_multiple_membersTscalar_and_array, i64 1, align 8
+!CHECK: %[[MEMBER_ACCESS_1:.*]] = getelementptr %_QFmaptype_derived_explicit_multiple_membersTscalar_and_array, ptr %[[ALLOCA]], i32 0, i32 2
+!CHECK: %[[MEMBER_ACCESS_2:.*]] = getelementptr %_QFmaptype_derived_explicit_multiple_membersTscalar_and_array, ptr %[[ALLOCA]], i32 0, i32 0
+!CHECK: %[[ARR_END_OFF:.*]] = getelementptr i32, ptr %[[MEMBER_ACCESS_1]], i64 1
+!CHECK: %[[ARR_END:.*]] = ptrtoint ptr %[[ARR_END_OFF]] to i64
+!CHECK: %[[FIRST_MEMBER:.*]] = ptrtoint ptr %[[MEMBER_ACCESS_2]] to i64
+!CHECK: %[[SIZE_DIFF:.*]] = sub i64 %[[ARR_END]], %[[FIRST_MEMBER]]
+!CHECK: %[[SIZE:.*]] = sdiv exact i64 %[[SIZE_DIFF]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+!CHECK: %[[BASE_PTR_ARR:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+!CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR]], align 8
+!CHECK: %[[OFFLOAD_PTR_ARR:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+!CHECK: store ptr %[[MEMBER_ACCESS_2]], ptr %[[OFFLOAD_PTR_ARR]], align 8
+!CHECK: %[[OFFLOAD_SIZE_ARR:.*]] = getelementptr inbounds [3 x i64], ptr %.offload_sizes, i32 0, i32 0
+!CHECK: store i64 %[[SIZE]], ptr %[[OFFLOAD_SIZE_ARR]], align 8
+!CHECK: %[[BASE_PTR_ARR_2:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_baseptrs, i32 0, i32 1
+!CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR_2]], align 8
+!CHECK: %[[OFFLOAD_PTR_ARR_2:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_ptrs, i32 0, i32 1
+!CHECK: store ptr %[[MEMBER_ACCESS_1]], ptr %[[OFFLOAD_PTR_ARR_2]], align 8
+!CHECK: %[[BASE_PTR_ARR_3:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_baseptrs, i32 0, i32 2
+!CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR_3]], align 8
+!CHECK: %[[OFFLOAD_PTR_ARR_3:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_ptrs, i32 0, i32 2
+!CHECK: store ptr %[[MEMBER_ACCESS_2]], ptr %[[OFFLOAD_PTR_ARR_3]], align 8
+
+!CHECK-LABEL: define {{.*}} @{{.*}}maptype_derived_explicit_member_with_bounds_{{.*}}
+!CHECK: %[[ALLOCA:.*]] = alloca %_QFmaptype_derived_explicit_member_with_boundsTscalar_and_array, i64 1, align 8
+!CHECK: %[[MEMBER_ACCESS:.*]] = getelementptr %_QFmaptype_derived_explicit_member_with_boundsTscalar_and_array, ptr %[[ALLOCA]], i32 0, i32 1
+!CHECK: %[[ARR_OFF:.*]] = getelementptr inbounds [10 x i32], ptr %[[MEMBER_ACCESS]], i64 0, i64 1
+!CHECK: %[[BASE_PTR_ARR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+!CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR]], align 8
+!CHECK: %[[OFFLOAD_PTR_ARR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+!CHECK: store ptr %[[ARR_OFF]], ptr %[[OFFLOAD_PTR_ARR]], align 8
+
+!CHECK-LABEL: define {{.*}} @{{.*}}maptype_derived_explicit_nested_single_member_{{.*}}
+!CHECK: %[[ALLOCA:.*]] = alloca %_QFmaptype_derived_explicit_nested_single_memberTscalar_and_array, i64 1, align 8
+!CHECK: %[[MEMBER_ACCESS:.*]] = getelementptr %_QFmaptype_derived_explicit_nested_single_memberTscalar_and_array, ptr %[[ALLOCA]], i32 0, i32 2, i32 1
+!CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR]], align 8
+!CHECK: %[[OFFLOAD_PTR_ARR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+!CHECK: store ptr %[[MEMBER_ACCESS]], ptr %[[OFFLOAD_PTR_ARR]], align 8
+
+!CHECK-LABEL: define {{.*}} @{{.*}}maptype_derived_explicit_multiple_nested_members_{{.*}}
+!CHECK: %[[ALLOCA:.*]] = alloca %_QFmaptype_derived_explicit_multiple_nested_membersTscalar_and_array, i64 1, align 8
+!CHECK: %[[MEMBER_ACCESS_1:.*]] = getelementptr %_QFmaptype_derived_explicit_multiple_nested_membersTscalar_and_array, ptr %[[ALLOCA]], i32 0, i32 2, i32 0
+!CHECK: %[[MEMBER_ACCESS_2:.*]] = getelementptr %_QFmaptype_derived_explicit_multiple_nested_membersTscalar_and_array, ptr %[[ALLOCA]], i32 0, i32 2, i32 1
+!CHECK: %[[ARR_END_OFF:.*]] = getelementptr float, ptr %[[MEMBER_ACCESS_2]], i64 1
+!CHECK: %[[ARR_END:.*]] = ptrtoint ptr %[[ARR_END_OFF]] to i64
+!CHECK: %[[FIRST_MEMBER:.*]] = ptrtoint ptr %[[MEMBER_ACCESS_1]] to i64
+!CHECK: %[[SIZE_DIFF:.*]] = sub i64 %[[ARR_END]], %[[FIRST_MEMBER]]
+!CHECK: %[[SIZE:.*]] = sdiv exact i64 %[[SIZE_DIFF]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+!CHECK: %[[BASE_PTR_ARR:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+!CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR]], align 8
+!CHECK: %[[OFFLOAD_PTR_ARR:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+!CHECK: store ptr %[[MEMBER_ACCESS_1]], ptr %[[OFFLOAD_PTR_ARR]], align 8
+!CHECK: %[[OFFLOAD_SIZE_ARR:.*]] = getelementptr inbounds [3 x i64], ptr %.offload_sizes, i32 0, i32 0
+!CHECK: store i64 %[[SIZE]], ptr %[[OFFLOAD_SIZE_ARR]], align 8
+!CHECK: %[[BASE_PTR_ARR_2:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_baseptrs, i32 0, i32 1
+!CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR_2]], align 8
+!CHECK: %[[OFFLOAD_PTR_ARR_2:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_ptrs, i32 0, i32 1
+!CHECK: store ptr %[[MEMBER_ACCESS_1]], ptr %[[OFFLOAD_PTR_ARR_2]], align 8
+!CHECK: %[[BASE_PTR_ARR_3:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_baseptrs, i32 0, i32 2
+!CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR_3]], align 8
+!CHECK: %[[OFFLOAD_PTR_ARR_3:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_ptrs, i32 0, i32 2
+!CHECK: store ptr %[[MEMBER_ACCESS_2]], ptr %[[OFFLOAD_PTR_ARR_3]], align 8
+
+!CHECK-LABEL: define {{.*}} @{{.*}}maptype_derived_explicit_nested_member_with_bounds_{{.*}}
+!CHECK: %[[ALLOCA:.*]] = alloca %_QFmaptype_derived_explicit_nested_member_with_boundsTscalar_and_array, i64 1, align 8
+!CHECK: %[[MEMBER_ACCESS:.*]] = getelementptr %_QFmaptype_derived_explicit_nested_member_with_boundsTscalar_and_array, ptr %[[ALLOCA]], i32 0, i32 2, i32 2
+!CHECK: %[[ARR_OFF:.*]] = getelementptr inbounds [10 x i32], ptr %[[MEMBER_ACCESS]], i64 0, i64 1
+!CHECK: %[[BASE_PTR_ARR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+!CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR]], align 8
+!CHECK: %[[OFFLOAD_PTR_ARR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+!CHECK: store ptr %[[ARR_OFF]], ptr %[[OFFLOAD_PTR_ARR]], align 8
diff --git a/flang/test/Integration/debug-loc-1.f90 b/flang/test/Integration/debug-loc-1.f90
new file mode 100644
index 000000000000..5fe2c8e31dd9
--- /dev/null
+++ b/flang/test/Integration/debug-loc-1.f90
@@ -0,0 +1,30 @@
+!RUN: %flang_fc1 -emit-llvm -debug-info-kind=line-tables-only -fopenmp %s -o - | FileCheck %s
+
+! Test that this file builds without an error.
+
+module debugloc
+contains
+subroutine test1
+implicit none
+ integer :: i
+ real, save :: var
+
+! CHECK: DILocation(line: [[@LINE+1]], {{.*}})
+!$omp parallel do
+do i=1,100
+  var = var + 0.1
+end do
+!$omp end parallel do
+
+end subroutine test1
+
+subroutine test2
+
+real, save :: tp
+!$omp threadprivate (tp)
+! CHECK: DILocation(line: [[@LINE+1]], {{.*}})
+  tp = tp + 1
+
+end subroutine test2
+
+end module debugloc
diff --git a/flang/test/Lower/CUDA/cuda-data-attribute.cuf b/flang/test/Lower/CUDA/cuda-data-attribute.cuf
index 937c981bddd3..3eb42a6a5d40 100644
--- a/flang/test/Lower/CUDA/cuda-data-attribute.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-attribute.cuf
@@ -39,27 +39,59 @@ subroutine dummy_arg_device(dd)
 end subroutine
 ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_device(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "dd", fir.cuda_attr = #fir.cuda<device>}) {
-! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<device>, uniq_name = "_QMcuda_varFdummy_arg_deviceEdd"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QMcuda_varFdummy_arg_deviceEdd"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 
 subroutine dummy_arg_managed(dm)
   real, allocatable, managed :: dm
 end subroutine
 ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_managed(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "dm", fir.cuda_attr = #fir.cuda<managed>}) {
-! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_managedEdm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_managedEdm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
 
 subroutine dummy_arg_pinned(dp)
   real, allocatable, pinned :: dp
 end subroutine
 ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_pinned(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "dp", fir.cuda_attr = #fir.cuda<pinned>}) {
-! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_pinnedEdp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_pinnedEdp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
 
 subroutine dummy_arg_unified(du)
   real, unified :: du
 end subroutine
 ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_unified(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "du", fir.cuda_attr = #fir.cuda<unified>})
-! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda<unified>, uniq_name = "_QMcuda_varFdummy_arg_unifiedEdu"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {cuda_attr = #fir.cuda<unified>, uniq_name = "_QMcuda_varFdummy_arg_unifiedEdu"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+subroutine cuda_alloc_free(n)
+  integer :: n
+  real, device :: a(10)
+  integer, unified :: u
+  real, managed :: b(n)
+end
+
+! CHECK-LABEL: func.func @_QMcuda_varPcuda_alloc_free
+! CHECK: %[[ALLOC_A:.*]] = fir.cuda_alloc !fir.array<10xf32> {bindc_name = "a", cuda_attr = #fir.cuda<device>, uniq_name = "_QMcuda_varFcuda_alloc_freeEa"} -> !fir.ref<!fir.array<10xf32>>
+! CHECK: %[[SHAPE:.*]] = fir.shape %c10 : (index) -> !fir.shape<1>
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ALLOC_A]](%[[SHAPE]]) {cuda_attr = #fir.cuda<device>, uniq_name = "_QMcuda_varFcuda_alloc_freeEa"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
+
+! CHECK: %[[ALLOC_U:.*]] = fir.cuda_alloc i32 {bindc_name = "u", cuda_attr = #fir.cuda<unified>, uniq_name = "_QMcuda_varFcuda_alloc_freeEu"} -> !fir.ref<i32>
+! CHECK: %[[DECL_U:.*]]:2 = hlfir.declare %[[ALLOC_U]] {cuda_attr = #fir.cuda<unified>, uniq_name = "_QMcuda_varFcuda_alloc_freeEu"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+! CHECK: %[[ALLOC_B:.*]] = fir.cuda_alloc !fir.array<?xf32>, %{{.*}} : index {bindc_name = "b", cuda_attr = #fir.cuda<managed>, uniq_name = "_QMcuda_varFcuda_alloc_freeEb"} -> !fir.ref<!fir.array<?xf32>>
+! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
+! CHECK: %[[DECL_B:.*]]:2 = hlfir.declare %[[ALLOC_B]](%[[SHAPE]]) {cuda_attr = #fir.cuda<managed>, uniq_name = "_QMcuda_varFcuda_alloc_freeEb"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+
+! CHECK: fir.cuda_free %[[DECL_B]]#1 : !fir.ref<!fir.array<?xf32>> {cuda_attr = #fir.cuda<managed>}
+! CHECK: fir.cuda_free %[[DECL_U]]#1 : !fir.ref<i32> {cuda_attr = #fir.cuda<unified>}
+! CHECK: fir.cuda_free %[[DECL_A]]#1 : !fir.ref<!fir.array<10xf32>> {cuda_attr = #fir.cuda<device>}
+
+subroutine dummy(x)
+  real, target, device :: x
+end subroutine
+
+! CHECK: func.func @_QMcuda_varPdummy
+! CHECK-NOT: fir.cuda_free
 
 end module
+
+
diff --git a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
index d80542f76c92..e1cc35772618 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
@@ -11,7 +11,7 @@ subroutine sub1()
 
 ! CHECK-LABEL: func.func @_QPsub1()
 ! CHECK: %[[IV:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-
+! CHECK: %[[IV_J:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub1Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
   !$cuf kernel do <<< 1, 2 >>>
   do i = 1, n
     a(i) = a(i) * b(i)
@@ -41,7 +41,11 @@ subroutine sub1()
     end do
   end do
 
-! CHECK: fir.cuda_kernel<<<%c1{{.*}}, (%c256{{.*}}, %c1{{.*}})>>> (%{{.*}} : index, %{{.*}} : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index) step (%{{.*}}, %{{.*}} : index, index)
+! CHECK: fir.cuda_kernel<<<%c1{{.*}}, (%c256{{.*}}, %c1{{.*}})>>> (%[[ARG0:.*]] : index, %[[ARG1:.*]] : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index) step (%{{.*}}, %{{.*}} : index, index)
+! CHECK: %[[ARG0_I32:.*]] = fir.convert %[[ARG0]] : (index) -> i32
+! CHECK: fir.store %[[ARG0_I32]] to %[[IV]]#1 : !fir.ref<i32>
+! CHECK: %[[ARG1_I32:.*]] = fir.convert %[[ARG1]] : (index) -> i32
+! CHECK: fir.store %[[ARG1_I32]] to %[[IV_J]]#1 : !fir.ref<i32>
 ! CHECK: {n = 2 : i64}
 
   !$cuf kernel do(2) <<< (1,*), (256,1) >>>
diff --git a/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 b/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90
index 129aa49b811d..e6c247205c39 100644
--- a/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90
+++ b/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90
@@ -50,7 +50,7 @@ end subroutine integer_assumed_shape_array
 ! CHECK-SAME:                                              %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "i", fir.target}) {
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>>
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFinteger_assumed_shape_arrayEi"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFinteger_assumed_shape_arrayEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_4:.*]] = fir.rebox %[[VAL_3]]#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
 ! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
 ! CHECK:           fir.call @_QPinteger_assumed_shape_array_callee(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> ()
@@ -159,8 +159,8 @@ end subroutine char_assumed_shape_array
 ! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>>
 ! CHECK:           %[[VAL_7:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>>
 ! CHECK:           %[[VAL_8:.*]] = arith.constant 2 : index
-! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_8]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_assumed_shape_arrayEa1"} : (!fir.box<!fir.array<?x!fir.char<1,2>>>, index) -> (!fir.box<!fir.array<?x!fir.char<1,2>>>, !fir.box<!fir.array<?x!fir.char<1,2>>>)
-! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_assumed_shape_arrayEa2"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_8]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_assumed_shape_arrayEa1"} : (!fir.box<!fir.array<?x!fir.char<1,2>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,2>>>, !fir.box<!fir.array<?x!fir.char<1,2>>>)
+! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_assumed_shape_arrayEa2"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
 ! CHECK:           %[[VAL_11:.*]] = fir.rebox %[[VAL_9]]#1 : (!fir.box<!fir.array<?x!fir.char<1,2>>>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>>
 ! CHECK:           fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>>>
 ! CHECK:           fir.call @_QPchar_assumed_shape_array_explicit_len_callee(%[[VAL_7]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>>>) -> ()
@@ -220,7 +220,7 @@ end subroutine char_explicit_shape_array
 ! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_12]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,?>>>
 ! CHECK:           %[[VAL_14:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_15:.*]] = fir.shape %[[VAL_14]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_13]](%[[VAL_15]]) typeparams %[[VAL_12]]#1 {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_explicit_shape_arrayEa2"} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_13]](%[[VAL_15]]) typeparams %[[VAL_12]]#1 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_explicit_shape_arrayEa2"} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>)
 ! CHECK:           %[[VAL_17:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
 ! CHECK:           %[[VAL_18:.*]] = fir.convert %[[VAL_11]]#1 : (!fir.ref<!fir.array<100x!fir.char<1,2>>>) -> !fir.ref<!fir.array<?x!fir.char<1,2>>>
 ! CHECK:           %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_17]]) : (!fir.ref<!fir.array<?x!fir.char<1,2>>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>>
@@ -317,7 +317,7 @@ end subroutine type_assumed_shape_array
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>>
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtype_assumed_shape_arrayEt"} : (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) -> (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtype_assumed_shape_arrayEt"} : (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>)
 ! CHECK:           %[[VAL_5:.*]] = fir.rebox %[[VAL_4]]#1 : (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>
 ! CHECK:           fir.store %[[VAL_5]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>>
 ! CHECK:           fir.call @_QPtype_assumed_shape_array_callee(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>>) -> ()
@@ -400,7 +400,7 @@ end subroutine class_scalar
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<none>>
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>>
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>>
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_scalarEt"} : (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>) -> (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>, !fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_scalarEt"} : (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>, !fir.dscope) -> (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>, !fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>)
 ! CHECK:           %[[VAL_5:.*]] = fir.rebox %[[VAL_4]]#1 : (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>) -> !fir.box<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>>
 ! CHECK:           fir.store %[[VAL_5]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>>>
 ! CHECK:           fir.call @_QPclass_scalar_callee(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>>>) -> ()
@@ -439,7 +439,7 @@ end subroutine class_assumed_shape_array
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>>
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_assumed_shape_arrayEt"} : (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) -> (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_assumed_shape_arrayEt"} : (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>)
 ! CHECK:           %[[VAL_5:.*]] = fir.rebox %[[VAL_4]]#1 : (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>
 ! CHECK:           fir.store %[[VAL_5]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>>
 ! CHECK:           fir.call @_QPclass_assumed_shape_array_callee(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>>) -> ()
@@ -478,7 +478,7 @@ end subroutine class_explicit_shape_array
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>>
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_explicit_shape_arrayEt"} : (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>) -> (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_explicit_shape_arrayEt"} : (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>)
 ! CHECK:           %[[VAL_5:.*]] = fir.rebox %[[VAL_4]]#1 : (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>
 ! CHECK:           fir.store %[[VAL_5]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>>
 ! CHECK:           fir.call @_QPclass_explicit_shape_array_callee(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>>) -> ()
@@ -505,7 +505,7 @@ end subroutine uclass_scalar
 ! CHECK-LABEL:   func.func @_QPuclass_scalar(
 ! CHECK-SAME:                                %[[VAL_0:.*]]: !fir.class<none> {fir.bindc_name = "t", fir.target}) {
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<none>>
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_scalarEt"} : (!fir.class<none>) -> (!fir.class<none>, !fir.class<none>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_scalarEt"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>)
 ! CHECK:           %[[VAL_3:.*]] = fir.rebox %[[VAL_2]]#1 : (!fir.class<none>) -> !fir.class<!fir.ptr<none>>
 ! CHECK:           fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<!fir.class<!fir.ptr<none>>>
 ! CHECK:           fir.call @_QPuclass_scalar_uclass_callee(%[[VAL_1]]) fastmath<contract> : (!fir.ref<!fir.class<!fir.ptr<none>>>) -> ()
@@ -526,7 +526,7 @@ end subroutine uclass_assumed_shape_array
 ! CHECK-LABEL:   func.func @_QPuclass_assumed_shape_array(
 ! CHECK-SAME:                                             %[[VAL_0:.*]]: !fir.class<!fir.array<?xnone>> {fir.bindc_name = "t", fir.target}) {
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>>
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_assumed_shape_arrayEt"} : (!fir.class<!fir.array<?xnone>>) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_assumed_shape_arrayEt"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>)
 ! CHECK:           %[[VAL_3:.*]] = fir.rebox %[[VAL_2]]#1 : (!fir.class<!fir.array<?xnone>>) -> !fir.class<!fir.ptr<!fir.array<?xnone>>>
 ! CHECK:           fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>
 ! CHECK:           fir.call @_QPuclass_assumed_shape_array_uclass_callee(%[[VAL_1]]) fastmath<contract> : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>) -> ()
@@ -547,7 +547,7 @@ end subroutine uclass_explicit_shape_array
 ! CHECK-LABEL:   func.func @_QPuclass_explicit_shape_array(
 ! CHECK-SAME:                                              %[[VAL_0:.*]]: !fir.class<!fir.array<100xnone>> {fir.bindc_name = "t", fir.target}) {
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>>
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_explicit_shape_arrayEt"} : (!fir.class<!fir.array<100xnone>>) -> (!fir.class<!fir.array<100xnone>>, !fir.class<!fir.array<100xnone>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_explicit_shape_arrayEt"} : (!fir.class<!fir.array<100xnone>>, !fir.dscope) -> (!fir.class<!fir.array<100xnone>>, !fir.class<!fir.array<100xnone>>)
 ! CHECK:           %[[VAL_3:.*]] = fir.rebox %[[VAL_2]]#1 : (!fir.class<!fir.array<100xnone>>) -> !fir.class<!fir.ptr<!fir.array<?xnone>>>
 ! CHECK:           fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>
 ! CHECK:           fir.call @_QPuclass_explicit_shape_array_uclass_callee(%[[VAL_1]]) fastmath<contract> : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>) -> ()
diff --git a/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90 b/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90
index f5ae6592faa4..328fb778eaf8 100644
--- a/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90
+++ b/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90
@@ -5,7 +5,7 @@
 subroutine allocation(x)
   character(*), allocatable :: x(:)
 ! CHECK-LABEL: func.func @_QPallocation(
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_2:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<allocatable>,  {{.*}}Ex
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_2:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>,  {{.*}}Ex
   deallocate(x)
 ! CHECK:  %[[VAL_4:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
 ! CHECK:  %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>) -> !fir.heap<!fir.array<?x!fir.char<1,?>>>
@@ -30,8 +30,8 @@ subroutine pointer_assignment(p, ziel)
   real, pointer :: p(:)
   real, target :: ziel(42:)
 ! CHECK-LABEL: func.func @_QPpointer_assignment(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<pointer>,  {{.*}}Ep
-! CHECK:  %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_5:[a-z0-9]*]]) {fortran_attrs = #fir.var_attrs<target>,  {{.*}}Eziel
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>,  {{.*}}Ep
+! CHECK:  %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_5:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>,  {{.*}}Eziel
   p => ziel
 ! CHECK:  %[[VAL_7:.*]] = fir.shift %[[VAL_4:.*]] : (index) -> !fir.shift<1>
 ! CHECK:  %[[VAL_8:.*]] = fir.rebox %[[VAL_6]]#1(%[[VAL_7]]) : (!fir.box<!fir.array<?xf32>>, !fir.shift<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
@@ -46,8 +46,8 @@ subroutine pointer_remapping(p, ziel)
   real, pointer :: p(:, :)
   real, target :: ziel(10, 20, 30)
 ! CHECK-LABEL: func.func @_QPpointer_remapping(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<pointer>,  {{.*}}Ep
-! CHECK:  %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_6:[a-z0-9]*]]) {fortran_attrs = #fir.var_attrs<target>,  {{.*}}Eziel
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>,  {{.*}}Ep
+! CHECK:  %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_6:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>,  {{.*}}Eziel
   p(2:7, 3:102) => ziel
 ! CHECK:  %[[VAL_8:.*]] = arith.constant 2 : i64
 ! CHECK:  %[[VAL_9:.*]] = arith.constant 7 : i64
@@ -101,7 +101,7 @@ subroutine ptr_comp_assign(x, ziel)
   x(9_8)%p => ziel
 ! CHECK:  %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:  %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:  %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_6:[a-z0-9]*]]) {fortran_attrs = #fir.var_attrs<target>,  {{.*}}Eziel
+! CHECK:  %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_6:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>,  {{.*}}Eziel
 ! CHECK:  %[[VAL_8:.*]] = arith.constant 9 : index
 ! CHECK:  %[[VAL_9:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_8]])  : (!fir.ref<!fir.array<10x!fir.type<_QFptr_comp_assignTt{p:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>>, index) -> !fir.ref<!fir.type<_QFptr_comp_assignTt{p:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>
 ! CHECK:  %[[VAL_10:.*]] = hlfir.designate %[[VAL_9]]{"p"}   {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ref<!fir.type<_QFptr_comp_assignTt{p:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
diff --git a/flang/test/Lower/HLFIR/allocatables-and-pointers.f90 b/flang/test/Lower/HLFIR/allocatables-and-pointers.f90
index ad6b2cf932e3..eb278508eba2 100644
--- a/flang/test/Lower/HLFIR/allocatables-and-pointers.f90
+++ b/flang/test/Lower/HLFIR/allocatables-and-pointers.f90
@@ -15,7 +15,7 @@ subroutine passing_allocatable(x)
   call takes_array(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPpassing_allocatable(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name =  {{.*}}Ex"}
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name =  {{.*}}Ex"}
 ! CHECK:  fir.call @_QPtakes_allocatable(%[[VAL_1]]#0) {{.*}} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> ()
 ! CHECK:  %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 ! CHECK:  %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
@@ -34,7 +34,7 @@ subroutine passing_pointer(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPpassing_pointer(
 ! CHECK:  %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xf32>>>
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name =  {{.*}}Ex"}
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name =  {{.*}}Ex"}
 ! CHECK:  fir.call @_QPtakes_pointer(%[[VAL_2]]#0) {{.*}} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> ()
 ! CHECK:  %[[VAL_3:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xf32>>
 ! CHECK:  %[[VAL_4:.*]] = arith.constant 0 : index
@@ -53,7 +53,7 @@ subroutine passing_contiguous_pointer(x)
   call takes_array(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPpassing_contiguous_pointer(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<contiguous, pointer>, uniq_name =  {{.*}}Ex"}
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, pointer>, uniq_name =  {{.*}}Ex"}
 ! CHECK:  %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
 ! CHECK:  %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.ptr<!fir.array<?xf32>>
 ! CHECK:  %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ptr<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
@@ -66,7 +66,7 @@ subroutine character_allocatable_cst_len(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPcharacter_allocatable_cst_len(
 ! CHECK:  %[[VAL_1:.*]] = arith.constant 10 : index
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_1:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name =  {{.*}}Ex"}
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name =  {{.*}}Ex"}
 ! CHECK:  %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>>
 ! CHECK:  %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.heap<!fir.char<1,10>>>) -> !fir.heap<!fir.char<1,10>>
 ! CHECK:  %[[VAL_5:.*]] = arith.constant 10 : index
@@ -87,12 +87,12 @@ subroutine character_allocatable_dyn_len(x, l)
   call takes_char(x//"hello")
 end subroutine
 ! CHECK-LABEL: func.func @_QPcharacter_allocatable_dyn_len(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] {uniq_name =  {{.*}}El"}
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name =  {{.*}}El"}
 ! CHECK:  %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i64>
 ! CHECK:  %[[VAL_4:.*]] = arith.constant 0 : i64
 ! CHECK:  %[[VAL_5:.*]] = arith.cmpi sgt, %[[VAL_3]], %[[VAL_4]] : i64
 ! CHECK:  %[[VAL_6:.*]] = arith.select %[[VAL_5]], %[[VAL_3]], %[[VAL_4]] : i64
-! CHECK:  %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_6:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name =  {{.*}}Ex"}
+! CHECK:  %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_6:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name =  {{.*}}Ex"}
 ! CHECK:  %[[VAL_8:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
 ! CHECK:  %[[VAL_9:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>>
 ! CHECK:  %[[VAL_10:.*]] = fir.emboxchar %[[VAL_9]], %[[VAL_6]] : (!fir.heap<!fir.char<1,?>>, i64) -> !fir.boxchar<1>
@@ -110,7 +110,7 @@ subroutine print_allocatable(x)
   print *, x
 end subroutine
 ! CHECK-LABEL: func.func @_QPprint_allocatable(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name =  {{.*}}Ex"}
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name =  {{.*}}Ex"}
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_1]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 ! CHECK:  %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.box<none>
 ! CHECK:  %[[VAL_9:.*]] = fir.call @_FortranAioOutputDescriptor(%{{.*}}, %[[VAL_8]])
@@ -120,7 +120,7 @@ subroutine print_pointer(x)
   print *, x
 end subroutine
 ! CHECK-LABEL: func.func @_QPprint_pointer(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name =  {{.*}}Ex"}
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name =  {{.*}}Ex"}
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_1]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
 ! CHECK:  %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.box<none>
 ! CHECK:  %[[VAL_9:.*]] = fir.call @_FortranAioOutputDescriptor(%{{.*}}, %[[VAL_8]])
@@ -130,7 +130,7 @@ subroutine elemental_expr(x)
   call takes_array_2(x+42)
 end subroutine
 ! CHECK-LABEL: func.func @_QPelemental_expr(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name =  {{.*}}Ex"}
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name =  {{.*}}Ex"}
 ! CHECK:  %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xi32>>>>
 ! CHECK:  %[[VAL_3:.*]] = arith.constant 42 : i32
 ! CHECK:  %[[VAL_4:.*]] = arith.constant 0 : index
diff --git a/flang/test/Lower/HLFIR/array-ctor-as-elemental-nested.f90 b/flang/test/Lower/HLFIR/array-ctor-as-elemental-nested.f90
index f3f5653a7c48..a30c6c6e4a22 100644
--- a/flang/test/Lower/HLFIR/array-ctor-as-elemental-nested.f90
+++ b/flang/test/Lower/HLFIR/array-ctor-as-elemental-nested.f90
@@ -9,14 +9,14 @@
 ! CHECK-SAME:                       %[[VAL_1:.*]]: !fir.ref<!fir.array<2xf32>> {fir.bindc_name = "h1"}) {
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 2 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_3]]) {uniq_name = "_QFtestEh1"} : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<2xf32>>, !fir.ref<!fir.array<2xf32>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtestEh1"} : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<2xf32>>, !fir.ref<!fir.array<2xf32>>)
 ! CHECK:           %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFtestEk"}
 ! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFtestEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "l", uniq_name = "_QFtestEl"}
 ! CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFtestEl"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_9:.*]] = fir.address_of(@_QFtestECn) : !fir.ref<i32>
 ! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_9]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QFtestECn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtestEpi"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtestEpi"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK:           %[[VAL_12:.*]] = arith.constant 2 : index
 ! CHECK:           %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
 ! CHECK:           %[[VAL_14:.*]] = hlfir.elemental %[[VAL_13]] unordered : (!fir.shape<1>) -> !hlfir.expr<2xf32> {
diff --git a/flang/test/Lower/HLFIR/array-ctor-as-elemental.f90 b/flang/test/Lower/HLFIR/array-ctor-as-elemental.f90
index 7cbc052ea709..277e2683c64f 100644
--- a/flang/test/Lower/HLFIR/array-ctor-as-elemental.f90
+++ b/flang/test/Lower/HLFIR/array-ctor-as-elemental.f90
@@ -7,7 +7,7 @@ subroutine test_as_simple_elemental(n)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_as_simple_elemental(
 ! CHECK-SAME:                                           %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_as_simple_elementalEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_as_simple_elementalEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 1 : i64
@@ -41,9 +41,10 @@ end subroutine
 ! CHECK-SAME:                                            %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "lb"},
 ! CHECK-SAME:                                            %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "ub"},
 ! CHECK-SAME:                                            %[[VAL_2:.*]]: !fir.ref<i64> {fir.bindc_name = "stride"}) {
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_as_strided_elementalElb"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFtest_as_strided_elementalEstride"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_as_strided_elementalEub"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest_as_strided_elementalElb"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest_as_strided_elementalEstride"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest_as_strided_elementalEub"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i64
 ! CHECK:           %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i64>
 ! CHECK:           %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i64>
@@ -91,7 +92,7 @@ subroutine test_as_elemental_with_pure_call(n)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_as_elemental_with_pure_call(
 ! CHECK-SAME:                                                   %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_as_elemental_with_pure_callEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_as_elemental_with_pure_callEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 1 : i64
diff --git a/flang/test/Lower/HLFIR/array-ctor-as-inlined-temp.f90 b/flang/test/Lower/HLFIR/array-ctor-as-inlined-temp.f90
index 1fc882015108..a7c2faa410fb 100644
--- a/flang/test/Lower/HLFIR/array-ctor-as-inlined-temp.f90
+++ b/flang/test/Lower/HLFIR/array-ctor-as-inlined-temp.f90
@@ -116,7 +116,7 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_implied_do(
 ! CHECK-SAME:                                  %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) {
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca index
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_implied_doEn"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_implied_doEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 0 : i64
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 2 : i64
 ! CHECK:           %[[VAL_5:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i64>
@@ -178,9 +178,9 @@ end subroutine
 ! CHECK-SAME:                                          %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "ub"},
 ! CHECK-SAME:                                          %[[VAL_2:.*]]: !fir.ref<i64> {fir.bindc_name = "stride"}) {
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca index
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_strided_implied_doElb"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFtest_strided_implied_doEstride"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_strided_implied_doEub"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doElb"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doEstride"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doEub"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK:           %[[VAL_7:.*]] = arith.constant 0 : i64
 ! CHECK:           %[[VAL_8:.*]] = arith.constant 2 : i64
 ! CHECK:           %[[VAL_9:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i64>
@@ -241,8 +241,8 @@ end subroutine
 ! CHECK-SAME:                                         %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "n"},
 ! CHECK-SAME:                                         %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "m"}) {
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca index
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_nested_implied_doEm"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_nested_implied_doEn"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_nested_implied_doEm"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_nested_implied_doEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 0 : i64
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i64
 ! CHECK:           %[[VAL_7:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i64>
diff --git a/flang/test/Lower/HLFIR/array-ctor-index.f90 b/flang/test/Lower/HLFIR/array-ctor-index.f90
index 83eb2cd3a408..f0c7cf620e9a 100644
--- a/flang/test/Lower/HLFIR/array-ctor-index.f90
+++ b/flang/test/Lower/HLFIR/array-ctor-index.f90
@@ -8,7 +8,7 @@ function test1(k)
 end function test1
 ! CHECK-LABEL:   func.func @_QPtest1(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<i8> {fir.bindc_name = "k"}) -> !fir.array<4xi8> {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest1Ek"} : (!fir.ref<i8>) -> (!fir.ref<i8>, !fir.ref<i8>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ek"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>)
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.array<4xi8> {bindc_name = "test1", uniq_name = "_QFtest1Etest1"}
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
@@ -58,7 +58,7 @@ function test2(k)
 end function test2
 ! CHECK-LABEL:   func.func @_QPtest2(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<i16> {fir.bindc_name = "k"}) -> !fir.array<4xi16> {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest2Ek"} : (!fir.ref<i16>) -> (!fir.ref<i16>, !fir.ref<i16>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ek"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>)
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.array<4xi16> {bindc_name = "test2", uniq_name = "_QFtest2Etest2"}
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
@@ -108,7 +108,7 @@ function test3(k)
 end function test3
 ! CHECK-LABEL:   func.func @_QPtest3(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "k"}) -> !fir.array<4xi32> {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest3Ek"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3Ek"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.array<4xi32> {bindc_name = "test3", uniq_name = "_QFtest3Etest3"}
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
@@ -158,7 +158,7 @@ function test4(k)
 end function test4
 ! CHECK-LABEL:   func.func @_QPtest4(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "k"}) -> !fir.array<4xi64> {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest4Ek"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest4Ek"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.array<4xi64> {bindc_name = "test4", uniq_name = "_QFtest4Etest4"}
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/HLFIR/assignment-intrinsics.f90 b/flang/test/Lower/HLFIR/assignment-intrinsics.f90
index 984395d8f90d..544815e88140 100644
--- a/flang/test/Lower/HLFIR/assignment-intrinsics.f90
+++ b/flang/test/Lower/HLFIR/assignment-intrinsics.f90
@@ -10,8 +10,8 @@ subroutine scalar_int(x, y)
   x = y
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_int(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_intEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_intEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_intEy"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:  %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0
 ! CHECK:  hlfir.assign %[[VAL_4]] to %[[VAL_2]]#0 : i32, !fir.ref<i32>
 
@@ -20,8 +20,8 @@ subroutine scalar_logical(x, y)
   x = y
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_logical(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_logicalEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_logicalEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_logicalEx"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_logicalEy"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:  %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0
 ! CHECK:  hlfir.assign %[[VAL_4]] to %[[VAL_2]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
 
@@ -30,8 +30,8 @@ subroutine scalar_real(x, y)
   x = y
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_real(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_realEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_realEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_realEy"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK:  %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0
 ! CHECK:  hlfir.assign %[[VAL_4]] to %[[VAL_2]]#0 : f32, !fir.ref<f32>
 
@@ -40,8 +40,8 @@ subroutine scalar_complex(x, y)
   x = y
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_complex(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_complexEx"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_complexEy"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_complexEx"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_complexEy"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
 ! CHECK:  %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0
 ! CHECK:  hlfir.assign %[[VAL_4]] to %[[VAL_2]]#0 : !fir.complex<4>, !fir.ref<!fir.complex<4>>
 
@@ -50,8 +50,8 @@ subroutine scalar_character(x, y)
   x = y
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_character(
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_characterEx"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_characterEy"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_characterEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_characterEy"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  hlfir.assign %[[VAL_5]]#0 to %[[VAL_3]]#0 : !fir.boxchar<1>, !fir.boxchar<1>
 
 ! -----------------------------------------------------------------------------
@@ -63,7 +63,7 @@ subroutine scalar_int_2(x)
   x = 42
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_int_2(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_int_2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_int_2Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:  %[[VAL_2:.*]] = arith.constant 42 : i32
 ! CHECK:  hlfir.assign %[[VAL_2]] to %[[VAL_1]]#0 : i32, !fir.ref<i32>
 
@@ -72,7 +72,7 @@ subroutine scalar_logical_2(x)
   x = .true.
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_logical_2(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_logical_2Ex"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_logical_2Ex"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:  %[[VAL_2:.*]] = arith.constant true
 ! CHECK:  %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i1) -> !fir.logical<4>
 ! CHECK:  hlfir.assign %[[VAL_3]] to %[[VAL_1]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -82,7 +82,7 @@ subroutine scalar_real_2(x)
   x = 3.14
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_real_2(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_real_2Ex"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_real_2Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK:  %[[VAL_2:.*]] = arith.constant 3.140000e+00 : f32
 ! CHECK:  hlfir.assign %[[VAL_2]] to %[[VAL_1]]#0 : f32, !fir.ref<f32>
 
@@ -91,7 +91,7 @@ subroutine scalar_complex_2(x)
   x = (1., -1.)
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_complex_2(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_complex_2Ex"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFscalar_complex_2Ex"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
 ! CHECK:  %[[VAL_2:.*]] = arith.constant 1.000000e+00 : f32
 ! CHECK:  %[[VAL_3:.*]] = arith.constant -1.000000e+00 : f32
 ! CHECK:  %[[VAL_4:.*]] = fir.undefined !fir.complex<4>
@@ -104,7 +104,7 @@ subroutine scalar_character_2(x)
   x = "hello"
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_character_2(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFscalar_character_2Ex"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFscalar_character_2Ex"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare {{.*}} {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX68656C6C6F"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>)
 ! CHECK:  hlfir.assign %[[VAL_5]]#0 to %[[VAL_2]]#0 : !fir.ref<!fir.char<1,5>>, !fir.boxchar<1>
 
@@ -117,8 +117,8 @@ subroutine array(x, y)
   x = y
 end subroutine
 ! CHECK-LABEL: func.func @_QParray(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarrayEx"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarrayEy"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarrayEx"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarrayEy"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
 ! CHECK:  hlfir.assign %[[VAL_5]]#0 to %[[VAL_2]]#0 : !fir.ref<!fir.array<100xi32>>, !fir.box<!fir.array<?xi32>>
 
 subroutine array_lbs(x, y)
@@ -126,8 +126,8 @@ subroutine array_lbs(x, y)
   x = y
 end subroutine
 ! CHECK-LABEL: func.func @_QParray_lbs(
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarray_lbsEx"} : (!fir.ref<!fir.array<20x!fir.logical<4>>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<20x!fir.logical<4>>>, !fir.ref<!fir.array<20x!fir.logical<4>>>)
-! CHECK:  %[[VAL_9:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarray_lbsEy"} : (!fir.ref<!fir.array<20x!fir.logical<4>>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<20x!fir.logical<4>>>, !fir.ref<!fir.array<20x!fir.logical<4>>>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarray_lbsEx"} : (!fir.ref<!fir.array<20x!fir.logical<4>>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<20x!fir.logical<4>>>, !fir.ref<!fir.array<20x!fir.logical<4>>>)
+! CHECK:  %[[VAL_9:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarray_lbsEy"} : (!fir.ref<!fir.array<20x!fir.logical<4>>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<20x!fir.logical<4>>>, !fir.ref<!fir.array<20x!fir.logical<4>>>)
 ! CHECK:  hlfir.assign %[[VAL_9]]#0 to %[[VAL_5]]#0 : !fir.box<!fir.array<20x!fir.logical<4>>>, !fir.box<!fir.array<20x!fir.logical<4>>>
 
 
@@ -136,8 +136,8 @@ subroutine array_character(x, y)
   x = y
 end subroutine
 ! CHECK-LABEL: func.func @_QParray_character(
-! CHECK:  %[[VAL_6:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarray_characterEx"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>)
-! CHECK:  %[[VAL_11:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarray_characterEy"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>)
+! CHECK:  %[[VAL_6:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarray_characterEx"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>)
+! CHECK:  %[[VAL_11:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarray_characterEy"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>)
 ! CHECK:  hlfir.assign %[[VAL_11]]#0 to %[[VAL_6]]#0 : !fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.box<!fir.array<10x!fir.char<1,?>>>
 
 subroutine array_pointer(x, y)
@@ -160,8 +160,8 @@ subroutine array_scalar(x, y)
   x = y
 end subroutine
 ! CHECK-LABEL: func.func @_QParray_scalar(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarray_scalarEx"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarray_scalarEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarray_scalarEx"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare {{.*}}  {uniq_name = "_QFarray_scalarEy"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0
 ! CHECK:  hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<!fir.array<100xi32>>
 
diff --git a/flang/test/Lower/HLFIR/assumed-rank-iface-alloc-ptr.f90 b/flang/test/Lower/HLFIR/assumed-rank-iface-alloc-ptr.f90
index 1bb5c001ece8..fb1385f87f1b 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-iface-alloc-ptr.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-iface-alloc-ptr.f90
@@ -23,7 +23,7 @@ subroutine scalar_alloc_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPscalar_alloc_to_assumed_rank(
 ! CHECK-SAME:                                               %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFscalar_alloc_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFscalar_alloc_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
 ! CHECK:           fir.call @_QPalloc_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> ()
 
@@ -34,7 +34,7 @@ subroutine r2_alloc_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPr2_alloc_to_assumed_rank(
 ! CHECK-SAME:                                           %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFr2_alloc_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFr2_alloc_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
 ! CHECK:           fir.call @_QPalloc_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> ()
 
@@ -45,7 +45,7 @@ subroutine scalar_pointer_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPscalar_pointer_to_assumed_rank(
 ! CHECK-SAME:                                                 %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<f32>>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFscalar_pointer_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.ptr<f32>>>) -> (!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.ref<!fir.box<!fir.ptr<f32>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFscalar_pointer_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.ref<!fir.box<!fir.ptr<f32>>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.box<!fir.ptr<f32>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK:           fir.call @_QPpointer_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) -> ()
 
@@ -56,7 +56,7 @@ subroutine r2_pointer_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPr2_pointer_to_assumed_rank(
 ! CHECK-SAME:                                             %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFr2_pointer_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFr2_pointer_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK:           fir.call @_QPpointer_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) -> ()
 
@@ -68,7 +68,7 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPr2_target_to_pointer_assumed_rank(
 ! CHECK-SAME:                                                    %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x", fir.target}) {
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFr2_target_to_pointer_assumed_rankEx"} : (!fir.box<!fir.array<?x?xf32>>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFr2_target_to_pointer_assumed_rankEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
 ! CHECK:           %[[VAL_3:.*]] = fir.rebox %[[VAL_2]]#1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
 ! CHECK:           fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
 ! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
diff --git a/flang/test/Lower/HLFIR/assumed-rank-iface.f90 b/flang/test/Lower/HLFIR/assumed-rank-iface.f90
index 155ce8fb55f2..2d1d941238b1 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-iface.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-iface.f90
@@ -23,7 +23,7 @@ subroutine int_scalar_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPint_scalar_to_assumed_rank(
 ! CHECK-SAME:                                             %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFint_scalar_to_assumed_rankEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_scalar_to_assumed_rankEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_2:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
 ! CHECK:           %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.box<i32>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK:           fir.call @_QPint_assumed_rank(%[[VAL_3]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
@@ -35,7 +35,7 @@ subroutine int_scalar_to_assumed_rank_bindc(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPint_scalar_to_assumed_rank_bindc(
 ! CHECK-SAME:                                                   %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFint_scalar_to_assumed_rank_bindcEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_scalar_to_assumed_rank_bindcEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_2:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
 ! CHECK:           %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.box<i32>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK:           fir.call @int_assumed_rank_bindc(%[[VAL_3]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
@@ -49,7 +49,7 @@ end subroutine
 ! CHECK-SAME:                                         %[[VAL_0:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "x"}) {
 ! CHECK:           %[[VAL_1:.*]] = arith.constant 10 : index
 ! CHECK:           %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFint_r1_to_assumed_rankEx"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_r1_to_assumed_rankEx"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK:           %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<10xi32>>
 ! CHECK:           %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.box<!fir.array<10xi32>>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK:           fir.call @_QPint_assumed_rank(%[[VAL_5]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
@@ -66,7 +66,7 @@ end subroutine
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 4 : index
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 5 : index
 ! CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] : (index, index, index, index) -> !fir.shape<4>
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) {uniq_name = "_QFint_r4_to_assumed_rankEx"} : (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.shape<4>) -> (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.ref<!fir.array<2x3x4x5xi32>>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_r4_to_assumed_rankEx"} : (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.shape<4>, !fir.dscope) -> (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.ref<!fir.array<2x3x4x5xi32>>)
 ! CHECK:           %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[VAL_5]]) : (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.shape<4>) -> !fir.box<!fir.array<2x3x4x5xi32>>
 ! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.array<2x3x4x5xi32>>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK:           fir.call @_QPint_assumed_rank(%[[VAL_8]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
@@ -78,7 +78,7 @@ subroutine int_assumed_shape_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPint_assumed_shape_to_assumed_rank(
 ! CHECK-SAME:                                                    %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFint_assumed_shape_to_assumed_rankEx"} : (!fir.box<!fir.array<?x?xi32>>) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_assumed_shape_to_assumed_rankEx"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK:           fir.call @_QPint_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
 
@@ -89,7 +89,7 @@ subroutine int_assumed_shape_to_assumed_rank_bindc(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPint_assumed_shape_to_assumed_rank_bindc(
 ! CHECK-SAME:                                                          %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFint_assumed_shape_to_assumed_rank_bindcEx"} : (!fir.box<!fir.array<?x?xi32>>) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_assumed_shape_to_assumed_rank_bindcEx"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.shift %[[VAL_2]], %[[VAL_2]] : (index, index) -> !fir.shift<2>
 ! CHECK:           %[[VAL_4:.*]] = fir.rebox %[[VAL_1]]#0(%[[VAL_3]]) : (!fir.box<!fir.array<?x?xi32>>, !fir.shift<2>) -> !fir.box<!fir.array<?x?xi32>>
@@ -103,7 +103,7 @@ subroutine int_allocatable_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPint_allocatable_to_assumed_rank(
 ! CHECK-SAME:                                                  %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFint_allocatable_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFint_allocatable_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>
 ! CHECK:           %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>) -> !fir.box<!fir.array<?x?xi32>>
 ! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<!fir.array<*:i32>>
@@ -116,7 +116,7 @@ subroutine int_allocatable_to_assumed_rank_opt(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPint_allocatable_to_assumed_rank_opt(
 ! CHECK-SAME:                                                      %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFint_allocatable_to_assumed_rank_optEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFint_allocatable_to_assumed_rank_optEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>
 ! CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>) -> !fir.heap<!fir.array<?x?xi32>>
 ! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.heap<!fir.array<?x?xi32>>) -> i64
@@ -147,6 +147,6 @@ end subroutine
 ! CHECK:           %[[VAL_5:.*]] = arith.select %[[VAL_4]], %[[VAL_2]], %[[VAL_3]] : index
 ! CHECK:           %[[VAL_6:.*]] = arith.constant -1 : index
 ! CHECK:           %[[VAL_7:.*]] = fir.shape %[[VAL_5]], %[[VAL_6]] : (index, index) -> !fir.shape<2>
-! CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) {uniq_name = "_QFint_r2_assumed_size_to_assumed_rankEx"} : (!fir.ref<!fir.array<10x?xi32>>, !fir.shape<2>) -> (!fir.box<!fir.array<10x?xi32>>, !fir.ref<!fir.array<10x?xi32>>)
+! CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_r2_assumed_size_to_assumed_rankEx"} : (!fir.ref<!fir.array<10x?xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<10x?xi32>>, !fir.ref<!fir.array<10x?xi32>>)
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.box<!fir.array<10x?xi32>>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK:           fir.call @_QPint_assumed_rank(%[[VAL_9]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
diff --git a/flang/test/Lower/HLFIR/binary-ops.f90 b/flang/test/Lower/HLFIR/binary-ops.f90
index e0af9258cda3..912cea0f5e0e 100644
--- a/flang/test/Lower/HLFIR/binary-ops.f90
+++ b/flang/test/Lower/HLFIR/binary-ops.f90
@@ -6,8 +6,8 @@ subroutine int_add(x, y, z)
  x = y + z
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_add(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_8:.*]] = arith.addi %[[VAL_6]], %[[VAL_7]] : i32
@@ -17,8 +17,8 @@ subroutine real_add(x, y, z)
  x = y + z
 end subroutine
 ! CHECK-LABEL: func.func @_QPreal_add(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<f32>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<f32>
 ! CHECK:  %[[VAL_8:.*]] = arith.addf %[[VAL_6]], %[[VAL_7]] fastmath<contract> : f32
@@ -28,8 +28,8 @@ subroutine complex_add(x, y, z)
  x = y + z
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_add(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.complex<4>>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.complex<4>>
 ! CHECK:  %[[VAL_8:.*]] = fir.addc %[[VAL_6]], %[[VAL_7]] {fastmath = #arith.fastmath<contract>} : !fir.complex<4>
@@ -39,8 +39,8 @@ subroutine int_sub(x, y, z)
  x = y - z
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_sub(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_8:.*]] = arith.subi %[[VAL_6]], %[[VAL_7]] : i32
@@ -50,8 +50,8 @@ subroutine real_sub(x, y, z)
  x = y - z
 end subroutine
 ! CHECK-LABEL: func.func @_QPreal_sub(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<f32>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<f32>
 ! CHECK:  %[[VAL_8:.*]] = arith.subf %[[VAL_6]], %[[VAL_7]] fastmath<contract> : f32
@@ -61,8 +61,8 @@ subroutine complex_sub(x, y, z)
  x = y - z
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_sub(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.complex<4>>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.complex<4>>
 ! CHECK:  %[[VAL_8:.*]] = fir.subc %[[VAL_6]], %[[VAL_7]] {fastmath = #arith.fastmath<contract>} : !fir.complex<4>
@@ -72,8 +72,8 @@ subroutine int_mul(x, y, z)
  x = y * z
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_mul(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_8:.*]] = arith.muli %[[VAL_6]], %[[VAL_7]] : i32
@@ -83,8 +83,8 @@ subroutine real_mul(x, y, z)
  x = y * z
 end subroutine
 ! CHECK-LABEL: func.func @_QPreal_mul(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<f32>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<f32>
 ! CHECK:  %[[VAL_8:.*]] = arith.mulf %[[VAL_6]], %[[VAL_7]] fastmath<contract> : f32
@@ -94,8 +94,8 @@ subroutine complex_mul(x, y, z)
  x = y * z
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_mul(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.complex<4>>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.complex<4>>
 ! CHECK:  %[[VAL_8:.*]] = fir.mulc %[[VAL_6]], %[[VAL_7]] {fastmath = #arith.fastmath<contract>} : !fir.complex<4>
@@ -105,8 +105,8 @@ subroutine int_div(x, y, z)
  x = y / z
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_div(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_8:.*]] = arith.divsi %[[VAL_6]], %[[VAL_7]] : i32
@@ -116,8 +116,8 @@ subroutine real_div(x, y, z)
  x = y / z
 end subroutine
 ! CHECK-LABEL: func.func @_QPreal_div(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<f32>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<f32>
 ! CHECK:  %[[VAL_8:.*]] = arith.divf %[[VAL_6]], %[[VAL_7]] fastmath<contract> : f32
@@ -127,8 +127,8 @@ subroutine complex_div(x, y, z)
  x = y / z
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_div(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.complex<4>>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.complex<4>>
 ! CHECK:  %[[VAL_8:.*]] = fir.extract_value %[[VAL_6]], [0 : index] : (!fir.complex<4>) -> f32
@@ -142,8 +142,8 @@ subroutine int_power(x, y, z)
   x = y**z
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_power(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_8:.*]] = math.ipowi %[[VAL_6]], %[[VAL_7]] : i32
@@ -153,8 +153,8 @@ subroutine real_power(x, y, z)
   x = y**z
 end subroutine
 ! CHECK-LABEL: func.func @_QPreal_power(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<f32>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<f32>
 ! CHECK:  %[[VAL_8:.*]] = math.powf %[[VAL_6]], %[[VAL_7]] fastmath<contract> : f32
@@ -164,8 +164,8 @@ subroutine complex_power(x, y, z)
   x = y**z
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_power(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.complex<4>>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.complex<4>>
 ! CHECK:  %[[VAL_8:.*]] = fir.call @cpowf(%[[VAL_6]], %[[VAL_7]]) fastmath<contract> : (!fir.complex<4>, !fir.complex<4>) -> !fir.complex<4>
@@ -177,8 +177,8 @@ subroutine real_to_int_power(x, y, z)
   x = y**z
 end subroutine
 ! CHECK-LABEL: func.func @_QPreal_to_int_power(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<f32>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_8:.*]] = math.fpowi %[[VAL_6]], %[[VAL_7]] fastmath<contract> : f32, i32
@@ -189,8 +189,8 @@ subroutine complex_to_int_power(x, y, z)
   x = y**z
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_to_int_power(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.complex<4>>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_8:.*]] = fir.call @_FortranAcpowi(%[[VAL_6]], %[[VAL_7]]) fastmath<contract> : (!fir.complex<4>, i32) -> !fir.complex<4>
@@ -203,7 +203,7 @@ subroutine extremum(c, n, l)
   n = len(c, 8)
 end subroutine
 ! CHECK-LABEL: func.func @_QPextremum(
-! CHECK:  hlfir.declare {{.*}}c
+! CHECK:  hlfir.declare {{.*}}c"}
 ! CHECK:  %[[VAL_11:.*]] = arith.constant 0 : i64
 ! CHECK:  %[[VAL_12:.*]] = fir.load %{{.*}} : !fir.ref<i64>
 ! CHECK:  %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_12]] : i64
@@ -281,8 +281,8 @@ subroutine cmp_char(l, x, y)
   l = x .eq. y
 end subroutine
 ! CHECK-LABEL: func.func @_QPcmp_char(
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_4:.*]]#1 {uniq_name = "_QFcmp_charEx"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
-! CHECK:  %[[VAL_7:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_6:.*]]#1 {uniq_name = "_QFcmp_charEy"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_4:.*]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcmp_charEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_7:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_6:.*]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcmp_charEy"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_8:.*]] = fir.convert %[[VAL_5]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
 ! CHECK:  %[[VAL_9:.*]] = fir.convert %[[VAL_7]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
 ! CHECK:  %[[VAL_10:.*]] = fir.convert %[[VAL_4]]#1 : (index) -> i64
@@ -296,8 +296,8 @@ subroutine logical_and(x, y, z)
   x = y.and.z
 end subroutine
 ! CHECK-LABEL: func.func @_QPlogical_and(
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}}z"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.logical<4>>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.logical<4>>
 ! CHECK:  %[[VAL_8:.*]] = fir.convert %[[VAL_6]] : (!fir.logical<4>) -> i1
@@ -331,8 +331,8 @@ subroutine cmplx_ctor(z, x, y)
   z = cmplx(x, y)
 end subroutine
 ! CHECK-LABEL: func.func @_QPcmplx_ctor(
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}}y"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK:  %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<f32>
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<f32>
 ! CHECK:  %[[VAL_8:.*]] = fir.undefined !fir.complex<4>
@@ -345,7 +345,7 @@ subroutine cmplx_ctor_2(z, x)
   z = cmplx(x, 1._8, kind=8)
 end subroutine
 ! CHECK-LABEL: func.func @_QPcmplx_ctor_2(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>)
 ! CHECK:  %[[VAL_4:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<f64>
 ! CHECK:  %[[VAL_5:.*]] = arith.constant 1.000000e+00 : f64
 ! CHECK:  %[[VAL_6:.*]] = fir.undefined !fir.complex<8>
diff --git a/flang/test/Lower/HLFIR/bindc-value-derived.f90 b/flang/test/Lower/HLFIR/bindc-value-derived.f90
index 671e6d45b9a9..7103d54c3e3d 100644
--- a/flang/test/Lower/HLFIR/bindc-value-derived.f90
+++ b/flang/test/Lower/HLFIR/bindc-value-derived.f90
@@ -17,7 +17,7 @@ contains
 ! CHECK-SAME:                    %[[VAL_0:.*]]: !fir.type<_QMbindc_byvalTt{i:i32}> {fir.bindc_name = "x"}) attributes {fir.bindc_name = "test"} {
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.type<_QMbindc_byvalTt{i:i32}>
 ! CHECK:           fir.store %[[VAL_0]] to %[[VAL_1]] : !fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QMbindc_byvalFtestEx"} : (!fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>) -> (!fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>, !fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QMbindc_byvalFtestEx"} : (!fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>, !fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>)
 ! CHECK:           %[[VAL_3:.*]] = hlfir.designate %[[VAL_2]]#0{"i"}   : (!fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>) -> !fir.ref<i32>
 ! CHECK:           fir.call @_QPuse_it(%[[VAL_3]]) fastmath<contract> : (!fir.ref<i32>) -> ()
 ! CHECK:           return
@@ -29,7 +29,7 @@ contains
   end subroutine
 ! CHECK-LABEL:   func.func @_QMbindc_byvalPcall_it(
 ! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QMbindc_byvalFcall_itEx"} : (!fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>) -> (!fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>, !fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMbindc_byvalFcall_itEx"} : (!fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>, !fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#1 : !fir.ref<!fir.type<_QMbindc_byvalTt{i:i32}>>
 ! CHECK:           fir.call @test(%[[VAL_2]]) fastmath<contract> : (!fir.type<_QMbindc_byvalTt{i:i32}>) -> ()
 ! CHECK:           return
diff --git a/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 b/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90
index 7a2ea5cc14b6..bad647bb3ac9 100644
--- a/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90
+++ b/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90
@@ -23,7 +23,7 @@ contains
     call takes_char(x, 100)
   end subroutine
 ! CHECK-LABEL:   func.func @_QMbindc_seq_assocPtest_char_1(
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 {uniq_name = "_QMbindc_seq_assocFtest_char_1Ex"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_1Ex"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>)
 ! CHECK:           %[[VAL_7:.*]] = arith.constant 100 : i32
 ! CHECK:           %[[VAL_8:.*]] = arith.constant 0 : index
 ! CHECK:           %[[VAL_9:.*]] = fir.shift %[[VAL_8]], %[[VAL_8]] : (index, index) -> !fir.shift<2>
@@ -56,7 +56,7 @@ contains
     call takes_char(x, 100)
   end subroutine
 ! CHECK-LABEL:   func.func @_QMbindc_seq_assocPtest_char_copy_in_copy_out(
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMbindc_seq_assocFtest_char_copy_in_copy_outEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_copy_in_copy_outEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>)
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 100 : i32
 ! CHECK:           %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, i1)
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
@@ -91,7 +91,7 @@ contains
     call takes_char_assumed_size(x)
   end subroutine
 ! CHECK-LABEL:   func.func @_QMbindc_seq_assocPtest_char_assumed_size(
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMbindc_seq_assocFtest_char_assumed_sizeEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_assumed_sizeEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>)
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, i1)
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
 ! CHECK:           %[[VAL_4:.*]] = fir.shift %[[VAL_3]], %[[VAL_3]] : (index, index) -> !fir.shift<2>
@@ -123,7 +123,7 @@ contains
     call takes_optional_char(x, 100)
   end subroutine
 ! CHECK-LABEL:   func.func @_QMbindc_seq_assocPtest_optional_char(
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMbindc_seq_assocFtest_optional_charEx"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMbindc_seq_assocFtest_optional_charEx"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>)
 ! CHECK:           %[[VAL_7:.*]] = fir.is_present %[[VAL_6]]#0 : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) -> i1
 ! CHECK:           %[[VAL_8:.*]] = arith.constant 100 : i32
 ! CHECK:           %[[VAL_9:.*]] = fir.if %[[VAL_7]] -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) {
@@ -186,7 +186,7 @@ contains
   end subroutine
 ! CHECK-LABEL:   func.func @_QMpoly_seq_assocPtest_poly_1(
 ! CHECK-SAME:                                             %[[VAL_0:.*]]: !fir.class<!fir.array<10x20xnone>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMpoly_seq_assocFtest_poly_1Ex"} : (!fir.class<!fir.array<10x20xnone>>) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_1Ex"} : (!fir.class<!fir.array<10x20xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>)
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 100 : i32
 ! CHECK:           %[[VAL_3:.*]]:3 = hlfir.associate %[[VAL_2]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]]#1 {uniq_name = "_QMpoly_seq_assocFtakes_polyEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
@@ -214,7 +214,7 @@ contains
     call takes_poly(x, 100)
   end subroutine
 ! CHECK-LABEL:   func.func @_QMpoly_seq_assocPtest_poly_copy_in_copy_out(
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMpoly_seq_assocFtest_poly_copy_in_copy_outEx"} : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_copy_in_copy_outEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 100 : i32
 ! CHECK:           %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, i1)
 ! CHECK:           %[[VAL_4:.*]]:3 = hlfir.associate %[[VAL_2]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
@@ -244,7 +244,7 @@ contains
     call takes_poly_assumed_size(x)
   end subroutine
 ! CHECK-LABEL:   func.func @_QMpoly_seq_assocPtest_poly_assumed_size(
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMpoly_seq_assocFtest_poly_assumed_sizeEx"} : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_assumed_sizeEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, i1)
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 10 : i64
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 1 : i64
@@ -271,7 +271,7 @@ contains
     call takes_optional_poly(x, 100)
   end subroutine
 ! CHECK-LABEL:   func.func @_QMpoly_seq_assocPtest_optional_poly(
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMpoly_seq_assocFtest_optional_polyEx"} : (!fir.class<!fir.array<10x20xnone>>) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMpoly_seq_assocFtest_optional_polyEx"} : (!fir.class<!fir.array<10x20xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.class<!fir.array<10x20xnone>>) -> i1
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : i32
 ! CHECK:           %[[VAL_4:.*]] = fir.if %[[VAL_2]] -> (!fir.class<!fir.array<10x20xnone>>) {
diff --git a/flang/test/Lower/HLFIR/calls-assumed-shape.f90 b/flang/test/Lower/HLFIR/calls-assumed-shape.f90
index a2094f1f1f0e..cfe607a69102 100644
--- a/flang/test/Lower/HLFIR/calls-assumed-shape.f90
+++ b/flang/test/Lower/HLFIR/calls-assumed-shape.f90
@@ -12,7 +12,7 @@ subroutine test_assumed_to_assumed(x)
   call takes_assumed(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_assumed_to_assumed(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {uniq_name = "_QFtest_assumed_to_assumedEx"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_assumed_to_assumedEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:  fir.call @_QPtakes_assumed(%[[VAL_1]]#0) {{.*}} : (!fir.box<!fir.array<?xf32>>) -> ()
 
 subroutine test_ptr_to_assumed(p)
@@ -25,7 +25,7 @@ subroutine test_ptr_to_assumed(p)
   call takes_assumed(p)
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_ptr_to_assumed(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_assumedEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_assumedEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
 ! CHECK:  %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
 ! CHECK:  %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.box<!fir.array<?xf32>>
 ! CHECK:  fir.call @_QPtakes_assumed(%[[VAL_3]]) {{.*}} : (!fir.box<!fir.array<?xf32>>) -> ()
@@ -40,7 +40,7 @@ subroutine test_ptr_to_contiguous_assumed(p)
   call takes_contiguous_assumed(p)
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_ptr_to_contiguous_assumed(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_contiguous_assumedEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_contiguous_assumedEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
 ! CHECK:  %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
 ! CHECK:  %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> (!fir.box<!fir.ptr<!fir.array<?xf32>>>, i1)
 ! CHECK:  %[[VAL_4:.*]] = fir.rebox %[[VAL_3]]#0 : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.box<!fir.array<?xf32>>
@@ -57,7 +57,7 @@ subroutine test_ptr_to_contiguous_assumed_classstar(p)
   call takes_contiguous_assumed_classstar(p)
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_ptr_to_contiguous_assumed_classstar(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_contiguous_assumed_classstarEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_contiguous_assumed_classstarEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
 ! CHECK:  %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
 ! CHECK:  %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> (!fir.box<!fir.ptr<!fir.array<?xf32>>>, i1)
 ! CHECK:  %[[VAL_4:.*]] = fir.rebox %[[VAL_3]]#0 : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.class<!fir.array<?xnone>>
@@ -74,7 +74,7 @@ subroutine test_ptr_to_assumed_typestar(p)
   call takes_assumed_typestar(p)
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_ptr_to_assumed_typestar(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_assumed_typestarEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_assumed_typestarEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
 ! CHECK:  %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
 ! CHECK:  %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.box<!fir.array<?xnone>>
 ! CHECK:  fir.call @_QPtakes_assumed_typestar(%[[VAL_3]]) {{.*}} : (!fir.box<!fir.array<?xnone>>) -> ()
@@ -94,7 +94,7 @@ end subroutine
 ! CHECK:  %[[VAL_2:.*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_4:.*]] = arith.constant 20 : index
 ! CHECK:  %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK:  %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5:[a-z0-9]*]]) typeparams %[[VAL_2:[a-z0-9]*]] {uniq_name = "_QFtest_explicit_char_to_boxEe"} : (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.ref<!fir.array<20x!fir.char<1,10>>>)
+! CHECK:  %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5:[a-z0-9]*]]) typeparams %[[VAL_2:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_explicit_char_to_boxEe"} : (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.ref<!fir.array<20x!fir.char<1,10>>>)
 ! CHECK:  %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[VAL_5]]) : (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.shape<1>) -> !fir.box<!fir.array<20x!fir.char<1,10>>>
 ! CHECK:  %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.array<20x!fir.char<1,10>>>) -> !fir.box<!fir.array<?x!fir.char<1,?>>>
 ! CHECK:  fir.call @_QPtakes_assumed_character(%[[VAL_8]]) {{.*}} : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> ()
@@ -109,7 +109,7 @@ subroutine test_explicit_by_val(x)
   call takes_explicit_by_value(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_explicit_by_val(
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]](%[[VAL_2:[a-z0-9]*]]) {uniq_name = "_QFtest_explicit_by_valEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]](%[[VAL_2:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_explicit_by_valEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
 ! CHECK:  %[[VAL_4:.*]] = hlfir.as_expr %[[VAL_3]]#0 : (!fir.ref<!fir.array<10xf32>>) -> !hlfir.expr<10xf32>
 ! CHECK:  %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]](%[[VAL_2]]) {adapt.valuebyref} : (!hlfir.expr<10xf32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>, i1)
 ! CHECK:  fir.call @_QPtakes_explicit_by_value(%[[VAL_5]]#1) {{.*}} : (!fir.ref<!fir.array<10xf32>>) -> ()
diff --git a/flang/test/Lower/HLFIR/calls-constant-expr-arg.f90 b/flang/test/Lower/HLFIR/calls-constant-expr-arg.f90
index cfe8cf726045..7c8faf4fca8f 100644
--- a/flang/test/Lower/HLFIR/calls-constant-expr-arg.f90
+++ b/flang/test/Lower/HLFIR/calls-constant-expr-arg.f90
@@ -18,7 +18,7 @@ end subroutine sub
 ! CHECK-LABEL:   func.func @_QPsub(
 ! CHECK-SAME:                      %[[VAL_0:.*]]: !fir.ref<!fir.array<?xi32>> {fir.bindc_name = "i"},
 ! CHECK-SAME:                      %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsubEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsubEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
 ! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i32) -> i64
 ! CHECK:           %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
@@ -26,7 +26,7 @@ end subroutine sub
 ! CHECK:           %[[VAL_7:.*]] = arith.cmpi sgt, %[[VAL_5]], %[[VAL_6]] : index
 ! CHECK:           %[[VAL_8:.*]] = arith.select %[[VAL_7]], %[[VAL_5]], %[[VAL_6]] : index
 ! CHECK:           %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) {uniq_name = "_QFsubEi"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsubEi"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_11:.*]] = arith.constant 3 : index
 ! CHECK:           %[[VAL_12:.*]] = arith.constant 2 : index
 ! CHECK:           %[[VAL_13:.*]] = arith.constant 0 : index
diff --git a/flang/test/Lower/HLFIR/calls-f77.f90 b/flang/test/Lower/HLFIR/calls-f77.f90
index cefe379a45d3..a970deb056f5 100644
--- a/flang/test/Lower/HLFIR/calls-f77.f90
+++ b/flang/test/Lower/HLFIR/calls-f77.f90
@@ -18,7 +18,7 @@ subroutine call_int_arg_var(n)
 end subroutine
 ! CHECK-LABEL: func.func @_QPcall_int_arg_var(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<i32>
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFcall_int_arg_varEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_int_arg_varEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:  fir.call @_QPtake_i4(%[[VAL_1]]#1) fastmath<contract> : (!fir.ref<i32>) -> ()
 
 subroutine call_int_arg_expr()
@@ -45,7 +45,7 @@ subroutine call_real_arg_var(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPcall_real_arg_var(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<f32>
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFcall_real_arg_varEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_real_arg_varEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK:  fir.call @_QPtake_r4(%[[VAL_1]]#1) fastmath<contract> : (!fir.ref<f32>) -> ()
 
 subroutine call_logical_arg_var(x)
@@ -54,7 +54,7 @@ subroutine call_logical_arg_var(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPcall_logical_arg_var(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFcall_logical_arg_varEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_logical_arg_varEx"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:  fir.call @_QPtake_l4(%[[VAL_1]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> ()
 
 subroutine call_logical_arg_expr()
@@ -84,7 +84,7 @@ end subroutine
 ! CHECK-LABEL: func.func @_QPcall_char_arg_var(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.boxchar<1>
 ! CHECK:  %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 {uniq_name = "_QFcall_char_arg_varEx"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_char_arg_varEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  fir.call @_QPtake_c(%[[VAL_2]]#0) fastmath<contract> : (!fir.boxchar<1>) -> ()
 
 subroutine call_char_arg_var_expr(x)
@@ -94,7 +94,7 @@ end subroutine
 ! CHECK-LABEL: func.func @_QPcall_char_arg_var_expr(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.boxchar<1>
 ! CHECK:  %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 {uniq_name = "_QFcall_char_arg_var_exprEx"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_char_arg_var_exprEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_3:.*]] = arith.addi %[[VAL_1]]#1, %[[VAL_1]]#1 : index
 ! CHECK:  %[[VAL_4:.*]] = hlfir.concat %[[VAL_2]]#0, %[[VAL_2]]#0 len %[[VAL_3]] : (!fir.boxchar<1>, !fir.boxchar<1>, index) -> !hlfir.expr<!fir.char<1,?>>
 ! CHECK:  %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] typeparams %[[VAL_3]] {adapt.valuebyref} : (!hlfir.expr<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>, i1)
@@ -110,7 +110,7 @@ end subroutine
 ! CHECK:  %[[VAL_1:.*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_2:.*]] = arith.constant 20 : index
 ! CHECK:  %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2>
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) {uniq_name = "_QFcall_arg_array_varEn"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_arg_array_varEn"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>)
 ! CHECK:  fir.call @_QPtake_arr(%[[VAL_4]]#1) fastmath<contract> : (!fir.ref<!fir.array<10x20xi32>>) -> ()
 
 subroutine call_arg_array_2(n)
@@ -119,7 +119,7 @@ subroutine call_arg_array_2(n)
 end subroutine
 ! CHECK-LABEL: func.func @_QPcall_arg_array_2(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xi32>>
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFcall_arg_array_2En"} : (!fir.box<!fir.array<?x?xi32>>) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFcall_arg_array_2En"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
 ! CHECK:  %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.box<!fir.array<?x?xi32>>) -> !fir.ref<!fir.array<?x?xi32>>
 ! CHECK:  fir.call @_QPtake_arr_2(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?x?xi32>>) -> ()
 
diff --git a/flang/test/Lower/HLFIR/calls-optional.f90 b/flang/test/Lower/HLFIR/calls-optional.f90
index df9519a24fb7..1ada5b198aed 100644
--- a/flang/test/Lower/HLFIR/calls-optional.f90
+++ b/flang/test/Lower/HLFIR/calls-optional.f90
@@ -14,7 +14,7 @@ subroutine optional_copy_in_out(x)
   call  takes_optional_explicit(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPoptional_copy_in_out(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_copy_in_outEx"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_copy_in_outEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:  %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xf32>>) -> i1
 ! CHECK:  %[[VAL_3:.*]]:4 = fir.if %[[VAL_2]] -> (!fir.ref<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>, i1, !fir.box<!fir.array<?xf32>>) {
 ! CHECK:    %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, i1)
@@ -40,7 +40,7 @@ subroutine optional_value_copy(x)
   call  takes_optional_explicit_value(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPoptional_value_copy(
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]](%[[VAL_2:[a-z0-9]*]]) {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_value_copyEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]](%[[VAL_2:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_value_copyEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
 ! CHECK:  %[[VAL_4:.*]] = fir.is_present %[[VAL_3]]#0 : (!fir.ref<!fir.array<100xf32>>) -> i1
 ! CHECK:  %[[VAL_5:.*]]:3 = fir.if %[[VAL_4]] -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>, i1) {
 ! CHECK:    %[[VAL_6:.*]] = hlfir.as_expr %[[VAL_3]]#0 : (!fir.ref<!fir.array<100xf32>>) -> !hlfir.expr<100xf32>
@@ -66,8 +66,8 @@ subroutine elem_pointer_to_optional(x, y)
   call elem_takes_two_optional(x, y)
 end subroutine
 ! CHECK-LABEL: func.func @_QPelem_pointer_to_optional(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {uniq_name = "_QFelem_pointer_to_optionalEx"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFelem_pointer_to_optionalEy"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFelem_pointer_to_optionalEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFelem_pointer_to_optionalEy"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
 ! CHECK:  %[[VAL_4:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
 ! CHECK:  %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.ptr<!fir.array<?xf32>>
 ! CHECK:  %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.ptr<!fir.array<?xf32>>) -> i64
@@ -105,7 +105,7 @@ subroutine optional_cannot_be_absent_optional(x)
   call elem_takes_one_optional(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPoptional_cannot_be_absent_optional(
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_cannot_be_absent_optionalEx"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_cannot_be_absent_optionalEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:  %[[VAL_2:.*]] = arith.constant 0 : index
 ! CHECK:  %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_1]]#0, %[[VAL_2]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
 ! CHECK:  %[[VAL_4:.*]] = arith.constant 1 : index
@@ -125,8 +125,8 @@ subroutine optional_elem_poly(x, y)
   call elem_optional_poly(x, y)
 end subroutine
 ! CHECK-LABEL: func.func @_QPoptional_elem_poly(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {uniq_name = "_QFoptional_elem_polyEx"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_elem_polyEy"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFoptional_elem_polyEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_elem_polyEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:  %[[VAL_4:.*]] = fir.is_present %[[VAL_3]]#0 : (!fir.box<!fir.array<?xf32>>) -> i1
 ! CHECK:  %[[VAL_5:.*]] = arith.constant 0 : index
 ! CHECK:  %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_2]]#0, %[[VAL_5]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
diff --git a/flang/test/Lower/HLFIR/calls-percent-val-ref.f90 b/flang/test/Lower/HLFIR/calls-percent-val-ref.f90
index c6acc42455f1..c8724e6d7bee 100644
--- a/flang/test/Lower/HLFIR/calls-percent-val-ref.f90
+++ b/flang/test/Lower/HLFIR/calls-percent-val-ref.f90
@@ -7,7 +7,7 @@ subroutine test_val_1(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_val_1(
 ! CHECK-SAME:                             %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_val_1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_val_1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32>
 ! CHECK:           fir.call @_QPval1(%[[VAL_2]]) fastmath<contract> : (i32) -> ()
 
@@ -17,7 +17,7 @@ subroutine test_val_2(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_val_2(
 ! CHECK-SAME:                             %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.complex<4>>>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_val_2Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.complex<4>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.complex<4>>>>, !fir.ref<!fir.box<!fir.heap<!fir.complex<4>>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_val_2Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.complex<4>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.complex<4>>>>, !fir.ref<!fir.box<!fir.heap<!fir.complex<4>>>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.complex<4>>>>
 ! CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.heap<!fir.complex<4>>>) -> !fir.heap<!fir.complex<4>>
 ! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.heap<!fir.complex<4>>
@@ -32,7 +32,7 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_ref_char(
 ! CHECK-SAME:                                %[[VAL_0:.*]]: !fir.boxchar<1> {fir.bindc_name = "x"}) {
 ! CHECK:           %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 {uniq_name = "_QFtest_ref_charEx"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_ref_charEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:           %[[VAL_3:.*]]:2 = fir.unboxchar %[[VAL_2]]#0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
 ! CHECK:           fir.call @_QPref_char(%[[VAL_3]]#0) fastmath<contract> : (!fir.ref<!fir.char<1,?>>) -> ()
 
@@ -42,7 +42,7 @@ subroutine test_ref_1(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_ref_1(
 ! CHECK-SAME:                             %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_ref_1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_ref_1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           fir.call @_QPref1(%[[VAL_1]]#1) fastmath<contract> : (!fir.ref<i32>) -> ()
 
 subroutine test_ref_2(x)
@@ -51,7 +51,7 @@ subroutine test_ref_2(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_ref_2(
 ! CHECK-SAME:                             %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ref_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ref_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
 ! CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.complex<4>>>) -> !fir.ptr<!fir.complex<4>>
 ! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ptr<!fir.complex<4>>) -> !fir.ref<!fir.complex<4>>
@@ -63,7 +63,7 @@ subroutine test_skip_copy_in_out(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_skip_copy_in_out(
 ! CHECK-SAME:                                        %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_skip_copy_in_outEx"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_skip_copy_in_outEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
 ! CHECK:           %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<?xf32>>) -> i64
 ! CHECK:           fir.call @_QPval3(%[[VAL_3]]) fastmath<contract> : (i64) -> ()
diff --git a/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90 b/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90
index b14f1bb1f443..05885e729f93 100644
--- a/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90
+++ b/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90
@@ -12,7 +12,7 @@ subroutine pass_poly_to_assumed_type_assumed_size(x)
   call assumed_type_assumed_size(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPpass_poly_to_assumed_type_assumed_size(
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFpass_poly_to_assumed_type_assumed_sizeEx"} : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFpass_poly_to_assumed_type_assumed_sizeEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, i1)
 ! CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.class<!fir.array<?x?xnone>>) -> !fir.ref<!fir.array<?x?xnone>>
 ! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.array<?x?xnone>>) -> !fir.ref<!fir.array<?xnone>>
diff --git a/flang/test/Lower/HLFIR/char_extremum.f03 b/flang/test/Lower/HLFIR/char_extremum.f03
index cc7b80184935..4996128a3753 100644
--- a/flang/test/Lower/HLFIR/char_extremum.f03
+++ b/flang/test/Lower/HLFIR/char_extremum.f03
@@ -8,11 +8,11 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPmax1
 ! CHECK:  %[[VAL_0:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %{{.*}} : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_1:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 {{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_1:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_2:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %{{.*}} : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_3:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 {{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_3:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_4:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %{{.*}} : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_5:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 {{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_5:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_6:[a-zA-Z0-9_]*]] = hlfir.char_extremum max, %[[VAL_3]]#0, %[[VAL_5]]#0 : (!fir.boxchar<1>, !fir.boxchar<1>) -> !hlfir.expr<!fir.char<1,?>>
 ! CHECK:  hlfir.assign %[[VAL_6]] to %[[VAL_1]]#0 : !hlfir.expr<!fir.char<1,?>>, !fir.boxchar<1>
 ! CHECK:  hlfir.destroy %[[VAL_6]] : !hlfir.expr<!fir.char<1,?>>
@@ -23,11 +23,11 @@ subroutine min1(c1, c2, c3)
 end subroutine
 ! CHECK-LABEL: func.func @_QPmin1
 ! CHECK:  %[[VAL_0:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %{{.*}} : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_1:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 {{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_1:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_2:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %{{.*}} : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_3:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 {{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_3:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_4:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %{{.*}} : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_5:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 {{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_5:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_6:[a-zA-Z0-9_]*]] = hlfir.char_extremum min, %[[VAL_3]]#0, %[[VAL_5]]#0 : (!fir.boxchar<1>, !fir.boxchar<1>) -> !hlfir.expr<!fir.char<1,?>>
 ! CHECK:  hlfir.assign %[[VAL_6]] to %[[VAL_1]]#0 : !hlfir.expr<!fir.char<1,?>>, !fir.boxchar<1>
 ! CHECK:  hlfir.destroy %[[VAL_6]] : !hlfir.expr<!fir.char<1,?>>
@@ -43,19 +43,19 @@ end subroutine
 ! CHECK:  %[[VAL_1:[a-zA-Z0-9_]*]] = fir.convert %[[VAL_0]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,?>>>
 ! CHECK:  %[[VAL_C100:[a-zA-Z0-9_]*]] = arith.constant 100 : index
 ! CHECK:  %[[VAL_2:[a-zA-Z0-9_]*]]  = fir.shape %[[VAL_C100]] : (index) -> !fir.shape<1>
-! CHECK:  %[[VAL_3:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_2]]) typeparams %[[VAL_0]]#1 {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>)
+! CHECK:  %[[VAL_3:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_2]]) typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>)
 ! CHECK:  %[[VAL_4:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %{{.*}} : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
 ! CHECK:  %[[VAL_5:[a-zA-Z0-9_]*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,10>>>
 ! CHECK:  %[[VAL_C10:[a-zA-Z0-9_]*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_C100_0:[a-zA-Z0-9_]*]] = arith.constant 100 : index
 ! CHECK:  %[[VAL_6:[a-zA-Z0-9_]*]] = fir.shape %[[VAL_C100_0]] : (index) -> !fir.shape<1>
-! CHECK:  %[[VAL_7:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_6]]) typeparams %[[VAL_C10]] {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,10>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<100x!fir.char<1,10>>>, !fir.ref<!fir.array<100x!fir.char<1,10>>>)
+! CHECK:  %[[VAL_7:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_6]]) typeparams %[[VAL_C10]] dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.char<1,10>>>, !fir.ref<!fir.array<100x!fir.char<1,10>>>)
 ! CHECK:  %[[VAL_8:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %{{.*}} : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
 ! CHECK:  %[[VAL_9:[a-zA-Z0-9_]*]] = fir.convert %[[VAL_8]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,20>>>
 ! CHECK:  %[[VAL_C20:[a-zA-Z0-9_]*]] = arith.constant 20 : index
 ! CHECK:  %[[VAL_C100_1:[a-zA-Z0-9_]*]] = arith.constant 100 : index
 ! CHECK:  %[[VAL_10:[a-zA-Z0-9_]*]] = fir.shape %[[VAL_C100_1]] : (index) -> !fir.shape<1>
-! CHECK:  %[[VAL_11:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_9]](%[[VAL_10]]) typeparams %[[VAL_C20]] {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,20>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<100x!fir.char<1,20>>>, !fir.ref<!fir.array<100x!fir.char<1,20>>>)
+! CHECK:  %[[VAL_11:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_9]](%[[VAL_10]]) typeparams %[[VAL_C20]] dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,20>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.char<1,20>>>, !fir.ref<!fir.array<100x!fir.char<1,20>>>)
 ! CHECK:  %[[VAL_C1:[a-zA-Z0-9_]*]] = arith.constant 1 : index
 ! CHECK:  %[[VAL_12:[a-zA-Z0-9_]*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_C1]])  typeparams %[[VAL_C10]] : (!fir.ref<!fir.array<100x!fir.char<1,10>>>, index, index) -> !fir.ref<!fir.char<1,10>>
 ! CHECK:  %[[VAL_C1_2:[a-zA-Z0-9_]*]] = arith.constant 1 : index
@@ -76,19 +76,19 @@ end subroutine
 ! CHECK:  %[[VAL_1:[a-zA-Z0-9_]*]] = fir.convert %[[VAL_0]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,?>>>
 ! CHECK:  %[[VAL_C100:[a-zA-Z0-9_]*]] = arith.constant 100 : index
 ! CHECK:  %[[VAL_2:[a-zA-Z0-9_]*]] = fir.shape %[[VAL_C100]] : (index) -> !fir.shape<1>
-! CHECK:  %[[VAL_3:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_2]]) typeparams %[[VAL_0]]#1 {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>)
+! CHECK:  %[[VAL_3:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_2]]) typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>)
 ! CHECK:  %[[VAL_4:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
 ! CHECK:  %[[VAL_5:[a-zA-Z0-9_]*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,10>>>
 ! CHECK:  %[[VAL_C10:[a-zA-Z0-9_]*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_C100_0:[a-zA-Z0-9_]*]] = arith.constant 100 : index
 ! CHECK:  %[[VAL_6:[a-zA-Z0-9_]*]] = fir.shape %[[VAL_C100_0]] : (index) -> !fir.shape<1>
-! CHECK:  %[[VAL_7:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_6]]) typeparams %[[VAL_C10]] {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,10>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<100x!fir.char<1,10>>>, !fir.ref<!fir.array<100x!fir.char<1,10>>>)
+! CHECK:  %[[VAL_7:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_6]]) typeparams %[[VAL_C10]] dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.char<1,10>>>, !fir.ref<!fir.array<100x!fir.char<1,10>>>)
 ! CHECK:  %[[VAL_8:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %arg2 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
 ! CHECK:  %[[VAL_C9:[a-zA-Z0-9_]*]] = fir.convert %[[VAL_8]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,20>>>
 ! CHECK:  %[[VAL_C20:[a-zA-Z0-9_]*]] = arith.constant 20 : index
 ! CHECK:  %[[VAL_C100_1:[a-zA-Z0-9_]*]] = arith.constant 100 : index
 ! CHECK:  %[[VAL_10:[a-zA-Z0-9_]*]] = fir.shape %[[VAL_C100_1]] : (index) -> !fir.shape<1>
-! CHECK:  %[[VAL_11:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_C9]](%[[VAL_10]]) typeparams %[[VAL_C20]] {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,20>>>, !fir.shape<1>, index) -> (!fir.ref<!fir.array<100x!fir.char<1,20>>>, !fir.ref<!fir.array<100x!fir.char<1,20>>>)
+! CHECK:  %[[VAL_11:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_C9]](%[[VAL_10]]) typeparams %[[VAL_C20]] dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.array<100x!fir.char<1,20>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.char<1,20>>>, !fir.ref<!fir.array<100x!fir.char<1,20>>>)
 ! CHECK:  %[[VAL_C1:[a-zA-Z0-9_]*]] = arith.constant 1 : index
 ! CHECK:  %[[VAL_12:[a-zA-Z0-9_]*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_C1]])  typeparams %[[VAL_C10]] : (!fir.ref<!fir.array<100x!fir.char<1,10>>>, index, index) -> !fir.ref<!fir.char<1,10>>
 ! CHECK:  %[[VAL_C1_2:[a-zA-Z0-9_]*]] = arith.constant 1 : index
@@ -105,13 +105,13 @@ subroutine max3(c1, c2, c3, c4)
 end subroutine
 ! CHECK-LABEL: func.func @_QPmax3
 ! CHECK:  %[[VAL_0:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_1:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 {{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_1:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_2:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_3:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 {{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_3:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_4:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %arg2 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_5:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 {{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_5:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_6:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %arg3 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_7:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_6]]#0 typeparams %[[VAL_6]]#1 {{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_7:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_6]]#0 typeparams %[[VAL_6]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_8:[a-zA-Z0-9_]*]] = hlfir.char_extremum max, %[[VAL_3]]#0, %[[VAL_5]]#0, %[[VAL_7]]#0 : (!fir.boxchar<1>, !fir.boxchar<1>, !fir.boxchar<1>) -> !hlfir.expr<!fir.char<1,?>>
 ! CHECK:  hlfir.assign %[[VAL_8]] to %[[VAL_1]]#0 : !hlfir.expr<!fir.char<1,?>>, !fir.boxchar<1>
 ! CHECK:  hlfir.destroy %[[VAL_8]] : !hlfir.expr<!fir.char<1,?>>
@@ -122,13 +122,13 @@ subroutine min3(c1, c2, c3, c4)
 end subroutine
 ! CHECK-LABEL: func.func @_QPmin3
 ! CHECK:  %[[VAL_0:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_1:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 {{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_1:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_2:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_3:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 {{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_3:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_4:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %arg2 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_5:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 {{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_5:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_6:[a-zA-Z0-9_]*]]:2 = fir.unboxchar %arg3 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_7:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_6]]#0 typeparams %[[VAL_6]]#1 {{.*}} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_7:[a-zA-Z0-9_]*]]:2 = hlfir.declare %[[VAL_6]]#0 typeparams %[[VAL_6]]#1 dummy_scope %{{[0-9]+}} {{.*}} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:  %[[VAL_8:[a-zA-Z0-9_]*]] = hlfir.char_extremum min, %[[VAL_3]]#0, %[[VAL_5]]#0, %[[VAL_7]]#0 : (!fir.boxchar<1>, !fir.boxchar<1>, !fir.boxchar<1>) -> !hlfir.expr<!fir.char<1,?>>
 ! CHECK:  hlfir.assign %[[VAL_8]] to %[[VAL_1]]#0 : !hlfir.expr<!fir.char<1,?>>, !fir.boxchar<1>
 ! CHECK:  hlfir.destroy %[[VAL_8]] : !hlfir.expr<!fir.char<1,?>>
diff --git a/flang/test/Lower/HLFIR/charconvert.f90 b/flang/test/Lower/HLFIR/charconvert.f90
index 117fdc0d3ad4..45b0f356617a 100644
--- a/flang/test/Lower/HLFIR/charconvert.f90
+++ b/flang/test/Lower/HLFIR/charconvert.f90
@@ -13,7 +13,7 @@ subroutine charconvert1(c,n)
 end subroutine charconvert1
 
 ! CHECK-LABEL: func.func @_QPcharconvert1
-! CHECK:   %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFcharconvert1Ec"} : (!fir.box<!fir.array<?x!fir.char<4,?>>>) -> (!fir.box<!fir.array<?x!fir.char<4,?>>>, !fir.box<!fir.array<?x!fir.char<4,?>>>)
+! CHECK:   %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFcharconvert1Ec"} : (!fir.box<!fir.array<?x!fir.char<4,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<4,?>>>, !fir.box<!fir.array<?x!fir.char<4,?>>>)
 ! CHECK:   ^bb0(%[[ARG2:.*]]: index):
 ! CHECK:     %[[VAL_37:.*]] = fir.box_elesize %[[VAL_2]]#1 : (!fir.box<!fir.array<?x!fir.char<4,?>>>) -> index
 ! CHECK:     %[[C4_4:.*]] = arith.constant 4 : index
@@ -36,7 +36,7 @@ end subroutine charconvert2
 ! CHECK:   %[[C1:.*]] = arith.constant 1 : index
 ! CHECK:   %[[VAL_1:.*]] = fir.alloca !fir.char<4> {bindc_name = "cx", uniq_name = "_QFcharconvert2Ecx"}
 ! CHECK:   %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] typeparams %[[C1]] {uniq_name = "_QFcharconvert2Ecx"} : (!fir.ref<!fir.char<4>>, index) -> (!fir.ref<!fir.char<4>>, !fir.ref<!fir.char<4>>)
-! CHECK:   %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFcharconvert2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:   %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFcharconvert2Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:   %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
 ! CHECK:   %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> i64
 ! CHECK:   %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i64) -> i8
@@ -58,9 +58,9 @@ end subroutine
 ! CHECK-LABEL: func.func @_QPcharconvert3
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {{.*}}, %[[ARG1:.*]]: !fir.boxchar<4> 
 ! CHECK:   %[[VAL_0:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:   %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 {uniq_name = "_QFcharconvert3Ec"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:   %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcharconvert3Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:   %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index)
-! CHECK:   %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 {uniq_name = "_QFcharconvert3Ec4"} : (!fir.ref<!fir.char<4,?>>, index) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>)
+! CHECK:   %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcharconvert3Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>)
 ! CHECK:   %[[VAL_4:.*]] = arith.addi %[[VAL_0]]#1, %[[VAL_0]]#1 : index
 ! CHECK:   %[[VAL_5:.*]] = hlfir.concat %[[VAL_1]]#0, %[[VAL_1]]#0 len %[[VAL_4]] : (!fir.boxchar<1>, !fir.boxchar<1>, index) -> !hlfir.expr<!fir.char<1,?>>
 ! CHECK:   %[[VAL_7:.*]]:3 = hlfir.associate %[[VAL_5]] typeparams %[[VAL_4]] {adapt.valuebyref} : (!hlfir.expr<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>, i1)
diff --git a/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 b/flang/test/Lower/HLFIR/convert-mbox-to-value.f90
index b9d55d3fde4f..ef9c12102a56 100644
--- a/flang/test/Lower/HLFIR/convert-mbox-to-value.f90
+++ b/flang/test/Lower/HLFIR/convert-mbox-to-value.f90
@@ -7,7 +7,7 @@ subroutine test_int_allocatable(a)
 end subroutine test_int_allocatable
 ! CHECK-LABEL:   func.func @_QPtest_int_allocatable(
 ! CHECK-SAME:                                       %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>> {fir.bindc_name = "a"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_int_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_int_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 6 : i32
 ! CHECK:           %[[VAL_3:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>>
 ! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.char<1,{{[0-9]*}}>>) -> !fir.ref<i8>
@@ -27,7 +27,7 @@ subroutine test_int_pointer(p)
 end subroutine test_int_pointer
 ! CHECK-LABEL:   func.func @_QPtest_int_pointer(
 ! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>> {fir.bindc_name = "p"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_int_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_int_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 6 : i32
 ! CHECK:           %[[VAL_3:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>>
 ! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.char<1,{{[0-9]*}}>>) -> !fir.ref<i8>
@@ -49,7 +49,7 @@ end subroutine test_char_allocatable
 ! CHECK-LABEL:   func.func @_QPtest_char_allocatable(
 ! CHECK-SAME:                                        %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>> {fir.bindc_name = "a"}) {
 ! CHECK:           %[[VAL_1:.*]] = arith.constant 11 : index
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_char_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>, index) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_char_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_char_allocatableEi"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtest_char_allocatableEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_5:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>
@@ -86,7 +86,7 @@ end subroutine test_char_pointer
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_char_pointerEi"}
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_char_pointerEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 11 : index
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_char_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>, index) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_char_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>)
 ! CHECK:           %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>
 ! CHECK:           %[[VAL_6:.*]] = fir.box_addr %[[VAL_5]] : (!fir.box<!fir.ptr<!fir.char<1,11>>>) -> !fir.ptr<!fir.char<1,11>>
 ! CHECK:           %[[VAL_3B:.*]] = arith.constant 11 : index
@@ -120,7 +120,7 @@ end subroutine test_dyn_char_allocatable
 ! CHECK-SAME:                                            %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "a"}) {
 ! CHECK:           %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
 ! CHECK:           %[[VAL_2:.*]] = fir.box_elesize %[[VAL_1]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> index
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_dyn_char_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, index) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_dyn_char_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
 ! CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_dyn_char_allocatableEi"}
 ! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFtest_dyn_char_allocatableEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
@@ -157,7 +157,7 @@ end subroutine test_dyn_char_pointer
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_dyn_char_pointerEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
 ! CHECK:           %[[VAL_4:.*]] = fir.box_elesize %[[VAL_3]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> index
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_4]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_dyn_char_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, index) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_4]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_dyn_char_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>)
 ! CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
 ! CHECK:           %[[VAL_7:.*]] = fir.box_addr %[[VAL_6]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>>
 ! CHECK:           %[[VAL_8:.*]] = arith.constant 1 : index
@@ -201,7 +201,7 @@ end subroutine test_derived_allocatable
 ! CHECK:           %[[VAL_7:.*]] = fir.embox %[[VAL_6]] : (!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>) -> !fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>>
 ! CHECK:           fir.store %[[VAL_7]] to %[[VAL_5]] : !fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>>>
 ! CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_5]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_derived_allocatableEa2"} : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>>>, !fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>>>)
-! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_derived_allocatableEl"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_derived_allocatableEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_10:.*]] = fir.alloca !fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>> {bindc_name = "r", uniq_name = "_QFtest_derived_allocatableEr"}
 ! CHECK:           %[[VAL_11:.*]] = fir.zero_bits !fir.heap<!fir.type<_QFtest_derived_allocatableTt>>
 ! CHECK:           %[[VAL_12:.*]] = fir.embox %[[VAL_11]] : (!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>) -> !fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>>
@@ -241,7 +241,7 @@ end subroutine test_derived_pointer
 ! CHECK:           %[[VAL_7:.*]] = fir.embox %[[VAL_6]] : (!fir.heap<!fir.type<_QFtest_derived_pointerTt>>) -> !fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>>
 ! CHECK:           fir.store %[[VAL_7]] to %[[VAL_5]] : !fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>>>
 ! CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_5]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_derived_pointerEa2"} : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>>>, !fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>>>)
-! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_derived_pointerEl"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_derived_pointerEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_10:.*]] = fir.alloca !fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>> {bindc_name = "r", uniq_name = "_QFtest_derived_pointerEr"}
 ! CHECK:           %[[VAL_11:.*]] = fir.zero_bits !fir.heap<!fir.type<_QFtest_derived_pointerTt>>
 ! CHECK:           %[[VAL_12:.*]] = fir.embox %[[VAL_11]] : (!fir.heap<!fir.type<_QFtest_derived_pointerTt>>) -> !fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>>
diff --git a/flang/test/Lower/HLFIR/convert-variable-block.f90 b/flang/test/Lower/HLFIR/convert-variable-block.f90
index 30f8eacaaed1..dad6bc14fbdb 100644
--- a/flang/test/Lower/HLFIR/convert-variable-block.f90
+++ b/flang/test/Lower/HLFIR/convert-variable-block.f90
@@ -12,7 +12,7 @@ subroutine test(n)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest(
 ! CHECK-SAME:                       %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtestEn"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtestEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK:           fir.call @_QPbefore_block() {{.*}}: () -> ()
 ! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i64>
 ! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index
diff --git a/flang/test/Lower/HLFIR/convert-variable.f90 b/flang/test/Lower/HLFIR/convert-variable.f90
index e7487ef870d1..7acb1be578b9 100644
--- a/flang/test/Lower/HLFIR/convert-variable.f90
+++ b/flang/test/Lower/HLFIR/convert-variable.f90
@@ -6,7 +6,7 @@ subroutine scalar_numeric(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_numeric(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<i32>
-! CHECK:  %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] {uniq_name = "_QFscalar_numericEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFscalar_numericEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 
 subroutine scalar_character(c)
   character(*) :: c
@@ -14,7 +14,7 @@ end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_character(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.boxchar<1>
 ! CHECK:  %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:  %[[VAL_2:.*]] = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 {uniq_name = "_QFscalar_characterEc"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:  %[[VAL_2:.*]] = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFscalar_characterEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 
 subroutine scalar_character_cst_len(c)
   character(10) :: c
@@ -24,7 +24,7 @@ end subroutine
 ! CHECK:  %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
 ! CHECK:  %[[VAL_3:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,10>>
 ! CHECK:  %[[VAL_2:.*]] = arith.constant 10 : index
-! CHECK:  %[[VAL_4:.*]] = hlfir.declare %[[VAL_3]] typeparams %[[VAL_2]] {uniq_name = "_QFscalar_character_cst_lenEc"} : (!fir.ref<!fir.char<1,10>>, index) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>)
+! CHECK:  %[[VAL_4:.*]] = hlfir.declare %[[VAL_3]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFscalar_character_cst_lenEc"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>)
 
 subroutine array_numeric(x)
   integer :: x(10, 20)
@@ -34,7 +34,7 @@ end subroutine
 ! CHECK:  %[[VAL_1:.*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_2:.*]] = arith.constant 20 : index
 ! CHECK:  %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2>
-! CHECK:  %[[VAL_4:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_3]]) {uniq_name = "_QFarray_numericEx"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>)
+! CHECK:  %[[VAL_4:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_numericEx"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>)
 
 
 subroutine array_numeric_lbounds(x)
@@ -47,7 +47,7 @@ end subroutine
 ! CHECK:  %[[VAL_3:.*]] = arith.constant -2 : index
 ! CHECK:  %[[VAL_4:.*]] = arith.constant 23 : index
 ! CHECK:  %[[VAL_5:.*]] = fir.shape_shift %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] : (index, index, index, index) -> !fir.shapeshift<2>
-! CHECK:  %[[VAL_6:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_5]]) {uniq_name = "_QFarray_numeric_lboundsEx"} : (!fir.ref<!fir.array<12x23xi32>>, !fir.shapeshift<2>) -> (!fir.box<!fir.array<12x23xi32>>, !fir.ref<!fir.array<12x23xi32>>)
+! CHECK:  %[[VAL_6:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}}  {uniq_name = "_QFarray_numeric_lboundsEx"} : (!fir.ref<!fir.array<12x23xi32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<12x23xi32>>, !fir.ref<!fir.array<12x23xi32>>)
 
 subroutine array_character(c)
   character(*) :: c(50)
@@ -58,14 +58,14 @@ end subroutine
 ! CHECK:  %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<50x!fir.char<1,?>>>
 ! CHECK:  %[[VAL_3:.*]] = arith.constant 50 : index
 ! CHECK:  %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK:  %[[VAL_5:.*]] = hlfir.declare %[[VAL_2]](%[[VAL_4]]) typeparams %[[VAL_1]]#1 {uniq_name = "_QFarray_characterEc"} : (!fir.ref<!fir.array<50x!fir.char<1,?>>>, !fir.shape<1>, index) -> (!fir.box<!fir.array<50x!fir.char<1,?>>>, !fir.ref<!fir.array<50x!fir.char<1,?>>>)
+! CHECK:  %[[VAL_5:.*]] = hlfir.declare %[[VAL_2]](%[[VAL_4]]) typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}}  {uniq_name = "_QFarray_characterEc"} : (!fir.ref<!fir.array<50x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<50x!fir.char<1,?>>>, !fir.ref<!fir.array<50x!fir.char<1,?>>>)
 
 subroutine scalar_numeric_attributes(x)
   integer, optional, target, intent(in) :: x
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_numeric_attributes(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<i32>
-! CHECK:  %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in, optional, target>, uniq_name = "_QFscalar_numeric_attributesEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in, optional, target>, uniq_name = "_QFscalar_numeric_attributesEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 
 subroutine scalar_numeric_attributes_2(x)
   real(16), value :: x(100)
@@ -74,21 +74,21 @@ end subroutine
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<!fir.array<100xf128>>
 ! CHECK:  %[[VAL_1:.*]] = arith.constant 100 : index
 ! CHECK:  %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
-! CHECK:  %[[VAL_3:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFscalar_numeric_attributes_2Ex"} : (!fir.ref<!fir.array<100xf128>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf128>>, !fir.ref<!fir.array<100xf128>>)
+! CHECK:  %[[VAL_3:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFscalar_numeric_attributes_2Ex"} : (!fir.ref<!fir.array<100xf128>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf128>>, !fir.ref<!fir.array<100xf128>>)
 
 subroutine scalar_numeric_attributes_3(x)
   real, intent(in) :: x
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_numeric_attributes_3(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<f32>
-! CHECK:  %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFscalar_numeric_attributes_3Ex"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFscalar_numeric_attributes_3Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 
 subroutine scalar_numeric_attributes_4(x)
   logical(8), intent(out) :: x
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_numeric_attributes_4(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<!fir.logical<8>>
-! CHECK:  %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QFscalar_numeric_attributes_4Ex"} : (!fir.ref<!fir.logical<8>>) -> (!fir.ref<!fir.logical<8>>, !fir.ref<!fir.logical<8>>)
+! CHECK:  %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QFscalar_numeric_attributes_4Ex"} : (!fir.ref<!fir.logical<8>>, !fir.dscope) -> (!fir.ref<!fir.logical<8>>, !fir.ref<!fir.logical<8>>)
 
 subroutine scalar_numeric_parameter()
   integer, parameter :: p = 42
diff --git a/flang/test/Lower/HLFIR/cray-pointers.f90 b/flang/test/Lower/HLFIR/cray-pointers.f90
index d969aa5d747a..ae903c8b44be 100644
--- a/flang/test/Lower/HLFIR/cray-pointers.f90
+++ b/flang/test/Lower/HLFIR/cray-pointers.f90
@@ -62,8 +62,8 @@ end subroutine test3
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "cp"},
 ! CHECK-SAME:                        %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,11>>>>
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest3En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest3Ecp"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3Ecp"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 11 : index
 ! CHECK:           %[[VAL_8:.*]] = arith.constant 11 : index
 ! CHECK:           %[[VAL_24:.*]] = fir.shape_shift %{{.*}}, %{{.*}} : (index, index) -> !fir.shapeshift<1>
@@ -88,7 +88,7 @@ end subroutine test4
 ! CHECK-LABEL:   func.func @_QPtest4(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.char<1,?>>>
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest4En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest4En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca i64 {bindc_name = "cp", uniq_name = "_QFtest4Ecp"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtest4Ecp"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK:           %[[VAL_5:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
@@ -153,7 +153,7 @@ end subroutine test6
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xf32>>>
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>>
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest6En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest6En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_4:.*]] = fir.alloca i64 {bindc_name = "cp", uniq_name = "_QFtest6Ecp"}
 ! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFtest6Ecp"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK:           %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
@@ -379,7 +379,7 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_craypointer_capture(
 ! CHECK-SAME:                                           %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.char<1,?>>>
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_craypointer_captureEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_craypointer_captureEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca i64 {bindc_name = "cray_pointer", uniq_name = "_QFtest_craypointer_captureEcray_pointer"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtest_craypointer_captureEcray_pointer"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK:           %[[VAL_5:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/HLFIR/custom-intrinsic.f90 b/flang/test/Lower/HLFIR/custom-intrinsic.f90
index cf91ea332cdc..f4af94cfee2f 100644
--- a/flang/test/Lower/HLFIR/custom-intrinsic.f90
+++ b/flang/test/Lower/HLFIR/custom-intrinsic.f90
@@ -7,8 +7,9 @@ end function
 ! CHECK-LABEL: func.func @_QPmax_simple(
 ! CHECK-SAME:      %[[A_ARG:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}
 ! CHECK-SAME:      %[[B_ARG:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}
-! CHECK-NEXT:    %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ARG]] {uniq_name = "_QFmax_simpleEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK-NEXT:    %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ARG]] {uniq_name = "_QFmax_simpleEb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-NEXT:    %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK-NEXT:    %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ARG]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFmax_simpleEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-NEXT:    %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ARG]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFmax_simpleEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK-NEXT:    %[[RES_ALLOC:.*]] = fir.alloca i32 {bindc_name = "max_simple", uniq_name = "_QFmax_simpleEmax_simple"}
 ! CHECK-NEXT:    %[[RES_DECL:.*]]:2 = hlfir.declare %[[RES_ALLOC]] {uniq_name = "_QFmax_simpleEmax_simple"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK-NEXT:    %[[A_LD:.*]] = fir.load %[[A_DECL]]#0 : !fir.ref<i32>
@@ -29,9 +30,9 @@ end function
 ! CHECK-SAME:                                              %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME:                                              %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"},
 ! CHECK-SAME:                                              %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "c", fir.optional}) -> i32 {
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmax_dynamic_optional_scalarEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmax_dynamic_optional_scalarEb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalarEc"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalarEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalarEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}}  {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalarEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "max_dynamic_optional_scalar", uniq_name = "_QFmax_dynamic_optional_scalarEmax_dynamic_optional_scalar"}
 ! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmax_dynamic_optional_scalarEmax_dynamic_optional_scalar"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
@@ -62,10 +63,10 @@ end function
 ! CHECK-SAME:                                               %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"},
 ! CHECK-SAME:                                               %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "c", fir.optional},
 ! CHECK-SAME:                                               %[[VAL_3:.*]]: !fir.ref<i32> {fir.bindc_name = "d", fir.optional}) -> i32 {
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmax_dynamic_optional_scalar2Ea"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmax_dynamic_optional_scalar2Eb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalar2Ec"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalar2Ed"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalar2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalar2Eb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalar2Ec"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalar2Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "max_dynamic_optional_scalar2", uniq_name = "_QFmax_dynamic_optional_scalar2Emax_dynamic_optional_scalar2"}
 ! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmax_dynamic_optional_scalar2Emax_dynamic_optional_scalar2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_10:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
@@ -104,10 +105,10 @@ end function
 ! CHECK-SAME:                            %[[VAL_1:.*]]: !fir.ref<!fir.array<42xi32>> {fir.bindc_name = "b"}) -> !fir.array<42xi32> {
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 42 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) {uniq_name = "_QFmax_arrayEa"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_arrayEa"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 42 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) {uniq_name = "_QFmax_arrayEb"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_arrayEb"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant 42 : index
 ! CHECK:           %[[VAL_9:.*]] = fir.alloca !fir.array<42xi32> {bindc_name = "max_array", uniq_name = "_QFmax_arrayEmax_array"}
 ! CHECK:           %[[VAL_10:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
@@ -137,13 +138,13 @@ end function
 ! CHECK-SAME:                                             %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME:                                             %[[VAL_1:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "b"},
 ! CHECK-SAME:                                             %[[VAL_2:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "c", fir.optional}) -> !fir.array<10xi32> {
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmax_dynamic_optional_arrayEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_arrayEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 10 : index
 ! CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) {uniq_name = "_QFmax_dynamic_optional_arrayEb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_arrayEb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK:           %[[VAL_7:.*]] = arith.constant 10 : index
 ! CHECK:           %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_8]]) {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_arrayEc"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_arrayEc"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK:           %[[VAL_10:.*]] = arith.constant 10 : index
 ! CHECK:           %[[VAL_11:.*]] = fir.alloca !fir.array<10xi32> {bindc_name = "max_dynamic_optional_array", uniq_name = "_QFmax_dynamic_optional_arrayEmax_dynamic_optional_array"}
 ! CHECK:           %[[VAL_12:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1>
@@ -180,8 +181,8 @@ end function
 ! CHECK-LABEL:   func.func @_QPmin_simple(
 ! CHECK-SAME:                             %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME:                             %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) -> i32 {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmin_simpleEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmin_simpleEb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_simpleEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_simpleEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "min_simple", uniq_name = "_QFmin_simpleEmin_simple"}
 ! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmin_simpleEmin_simple"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
@@ -202,9 +203,9 @@ end function
 ! CHECK-SAME:                                              %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME:                                              %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"},
 ! CHECK-SAME:                                              %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "c", fir.optional}) -> i32 {
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmin_dynamic_optional_scalarEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmin_dynamic_optional_scalarEb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalarEc"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalarEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalarEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}}  {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalarEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "min_dynamic_optional_scalar", uniq_name = "_QFmin_dynamic_optional_scalarEmin_dynamic_optional_scalar"}
 ! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmin_dynamic_optional_scalarEmin_dynamic_optional_scalar"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
@@ -235,10 +236,10 @@ end function
 ! CHECK-SAME:                                               %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"},
 ! CHECK-SAME:                                               %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "c", fir.optional},
 ! CHECK-SAME:                                               %[[VAL_3:.*]]: !fir.ref<i32> {fir.bindc_name = "d", fir.optional}) -> i32 {
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmin_dynamic_optional_scalar2Ea"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmin_dynamic_optional_scalar2Eb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalar2Ec"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalar2Ed"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalar2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalar2Eb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalar2Ec"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalar2Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "min_dynamic_optional_scalar2", uniq_name = "_QFmin_dynamic_optional_scalar2Emin_dynamic_optional_scalar2"}
 ! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmin_dynamic_optional_scalar2Emin_dynamic_optional_scalar2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_10:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
@@ -277,10 +278,10 @@ end function
 ! CHECK-SAME:                            %[[VAL_1:.*]]: !fir.ref<!fir.array<42xi32>> {fir.bindc_name = "b"}) -> !fir.array<42xi32> {
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 42 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) {uniq_name = "_QFmin_arrayEa"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_arrayEa"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 42 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) {uniq_name = "_QFmin_arrayEb"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_arrayEb"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant 42 : index
 ! CHECK:           %[[VAL_9:.*]] = fir.alloca !fir.array<42xi32> {bindc_name = "min_array", uniq_name = "_QFmin_arrayEmin_array"}
 ! CHECK:           %[[VAL_10:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
@@ -310,13 +311,13 @@ end function
 ! CHECK-SAME:                                             %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME:                                             %[[VAL_1:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "b"},
 ! CHECK-SAME:                                             %[[VAL_2:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "c", fir.optional}) -> !fir.array<10xi32> {
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmin_dynamic_optional_arrayEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_arrayEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 10 : index
 ! CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) {uniq_name = "_QFmin_dynamic_optional_arrayEb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_arrayEb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK:           %[[VAL_7:.*]] = arith.constant 10 : index
 ! CHECK:           %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_8]]) {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_arrayEc"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_arrayEc"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK:           %[[VAL_10:.*]] = arith.constant 10 : index
 ! CHECK:           %[[VAL_11:.*]] = fir.alloca !fir.array<10xi32> {bindc_name = "min_dynamic_optional_array", uniq_name = "_QFmin_dynamic_optional_arrayEmin_dynamic_optional_array"}
 ! CHECK:           %[[VAL_12:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1>
@@ -355,7 +356,7 @@ end function
 ! CHECK-SAME:                                    %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>> {fir.bindc_name = "pointer"}) -> !fir.logical<4> {
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.logical<4> {bindc_name = "associated_simple", uniq_name = "_QFassociated_simpleEassociated_simple"}
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFassociated_simpleEassociated_simple"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_simpleEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_simpleEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
 ! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK:           %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
 ! CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.ptr<i32>) -> i64
@@ -378,8 +379,8 @@ end function
 ! CHECK-SAME:                                    %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "target", fir.target}) -> !fir.logical<4> {
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "associated_target", uniq_name = "_QFassociated_targetEassociated_target"}
 ! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFassociated_targetEassociated_target"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_targetEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFassociated_targetEtarget"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_targetEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFassociated_targetEtarget"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_6:.*]] = fir.embox %[[VAL_5]]#1 : (!fir.ref<i32>) -> !fir.box<i32>
 ! CHECK:           %[[VAL_7:.*]] = fir.load %[[VAL_4]]#1 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.ptr<i32>>) -> !fir.box<none>
@@ -402,8 +403,8 @@ end function
 ! CHECK-SAME:                                     %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>> {fir.bindc_name = "target"}) -> !fir.logical<4> {
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "associated_pointer", uniq_name = "_QFassociated_pointerEassociated_pointer"}
 ! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFassociated_pointerEassociated_pointer"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_pointerEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_pointerEtarget"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_pointerEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_pointerEtarget"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
 ! CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_5]]#1 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK:           %[[VAL_7:.*]] = fir.load %[[VAL_4]]#1 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.ptr<i32>>) -> !fir.box<none>
@@ -426,8 +427,8 @@ end function
 ! CHECK-SAME:                                   %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {fir.bindc_name = "target"}) -> !fir.logical<4> {
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "associated_array", uniq_name = "_QFassociated_arrayEassociated_array"}
 ! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFassociated_arrayEassociated_array"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_arrayEpointer"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_arrayEtarget"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_arrayEpointer"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_arrayEtarget"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
 ! CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_5]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
 ! CHECK:           %[[VAL_7:.*]] = fir.load %[[VAL_4]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
 ! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>) -> !fir.box<none>
@@ -447,11 +448,11 @@ end function
 ! CHECK-SAME:                                %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
 ! CHECK-SAME:                                %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "shift"},
 ! CHECK-SAME:                                %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "size"}) -> i32 {
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFishftc_simpleEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_simpleEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "ishftc_simple", uniq_name = "_QFishftc_simpleEishftc_simple"}
 ! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFishftc_simpleEishftc_simple"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFishftc_simpleEshift"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFishftc_simpleEsize"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_simpleEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_simpleEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
 ! CHECK:           %[[VAL_9:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
 ! CHECK:           %[[VAL_10:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
@@ -498,11 +499,11 @@ end function
 ! CHECK-SAME:                                                     %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
 ! CHECK-SAME:                                                     %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "shift"},
 ! CHECK-SAME:                                                     %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "size", fir.optional}) -> i32 {
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFishftc_dynamically_optional_scalarEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_scalarEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "ishftc_dynamically_optional_scalar", uniq_name = "_QFishftc_dynamically_optional_scalarEishftc_dynamically_optional_scalar"}
 ! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFishftc_dynamically_optional_scalarEishftc_dynamically_optional_scalar"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFishftc_dynamically_optional_scalarEshift"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFishftc_dynamically_optional_scalarEsize"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_scalarEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFishftc_dynamically_optional_scalarEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
 ! CHECK:           %[[VAL_9:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
 ! CHECK:           %[[VAL_10:.*]] = fir.is_present %[[VAL_7]]#0 : (!fir.ref<i32>) -> i1
@@ -557,17 +558,17 @@ end function
 ! CHECK-SAME:                               %[[VAL_2:.*]]: !fir.ref<!fir.array<42xi32>> {fir.bindc_name = "size"}) -> !fir.array<42xi32> {
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 42 : index
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFishftc_arrayEi"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_arrayEi"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 42 : index
 ! CHECK:           %[[VAL_7:.*]] = fir.alloca !fir.array<42xi32> {bindc_name = "ishftc_array", uniq_name = "_QFishftc_arrayEishftc_array"}
 ! CHECK:           %[[VAL_8:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
 ! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_8]]) {uniq_name = "_QFishftc_arrayEishftc_array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK:           %[[VAL_10:.*]] = arith.constant 42 : index
 ! CHECK:           %[[VAL_11:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_11]]) {uniq_name = "_QFishftc_arrayEshift"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK:           %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_11]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_arrayEshift"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK:           %[[VAL_13:.*]] = arith.constant 42 : index
 ! CHECK:           %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_14]]) {uniq_name = "_QFishftc_arrayEsize"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK:           %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_14]]) dummy_scope %{{[0-9]+}}  {uniq_name = "_QFishftc_arrayEsize"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK:           %[[VAL_16:.*]] = hlfir.elemental %[[VAL_4]] unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
 ! CHECK:           ^bb0(%[[VAL_17:.*]]: index):
 ! CHECK:             %[[VAL_18:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_17]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
@@ -624,13 +625,13 @@ end function
 ! CHECK-SAME:                                                    %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "size", fir.optional}) -> !fir.array<42xi32> {
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 42 : index
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFishftc_dynamically_optional_arrayEi"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_arrayEi"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 42 : index
 ! CHECK:           %[[VAL_7:.*]] = fir.alloca !fir.array<42xi32> {bindc_name = "ishftc_dynamically_optional_array", uniq_name = "_QFishftc_dynamically_optional_arrayEishftc_dynamically_optional_array"}
 ! CHECK:           %[[VAL_8:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
 ! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_8]]) {uniq_name = "_QFishftc_dynamically_optional_arrayEishftc_dynamically_optional_array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
-! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFishftc_dynamically_optional_arrayEshift"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_2]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFishftc_dynamically_optional_arrayEsize"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_arrayEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFishftc_dynamically_optional_arrayEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_12:.*]] = fir.is_present %[[VAL_11]]#0 : (!fir.ref<i32>) -> i1
 ! CHECK:           %[[VAL_13:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref<i32>
 ! CHECK:           %[[VAL_14:.*]] = hlfir.elemental %[[VAL_4]] unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
@@ -698,9 +699,9 @@ end subroutine
 ! CHECK-SAME:                                    %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>> {fir.bindc_name = "a"},
 ! CHECK-SAME:                                    %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>> {fir.bindc_name = "b"},
 ! CHECK-SAME:                                    %[[VAL_2:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>> {fir.bindc_name = "c"}) {
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEc"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEc"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
 ! CHECK:           %[[VAL_6:.*]] = fir.address_of(@_QFallocatables_testECnx) : !fir.ref<i32>
 ! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QFallocatables_testECnx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_8:.*]] = fir.address_of(@_QFallocatables_testECny) : !fir.ref<i32>
@@ -840,4 +841,4 @@ end subroutine
 ! CHECK:           hlfir.assign %[[VAL_136:.*]] to %[[VAL_5]]#0 realloc : !hlfir.expr<?x?x?xi32>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>
 ! CHECK:           hlfir.destroy %[[VAL_136]] : !hlfir.expr<?x?x?xi32>
 ! CHECK:           return
-! CHECK:         }
-\ No newline at end of file
+! CHECK:         }
diff --git a/flang/test/Lower/HLFIR/designators-component-ref.f90 b/flang/test/Lower/HLFIR/designators-component-ref.f90
index 392eda66fd03..69cc7d2e5aa6 100644
--- a/flang/test/Lower/HLFIR/designators-component-ref.f90
+++ b/flang/test/Lower/HLFIR/designators-component-ref.f90
@@ -340,7 +340,7 @@ subroutine test_scalar_array_complex_chain(a)
   type(t_complex) :: a
   print *, a%array_comp%im
 ! CHECK-LABEL:   func.func @_QPtest_scalar_array_complex_chain(
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_scalar_array_complex_chainEa"} : (!fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20x!fir.complex<4>>}>>) -> (!fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20x!fir.complex<4>>}>>, !fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20x!fir.complex<4>>}>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_scalar_array_complex_chainEa"} : (!fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20x!fir.complex<4>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20x!fir.complex<4>>}>>, !fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20x!fir.complex<4>>}>>)
 ! CHECK:           %[[VAL_7:.*]] = arith.constant 10 : index
 ! CHECK:           %[[VAL_8:.*]] = arith.constant 20 : index
 ! CHECK:           %[[VAL_9:.*]] = arith.constant 2 : index
@@ -379,13 +379,13 @@ end subroutine test_poly_array_vector_subscript
 ! CHECK-SAME:      %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>> {fir.bindc_name = "p"},
 ! CHECK-SAME:      %[[VAL_1:.*]]: !fir.ref<!fir.array<3xi32>> {fir.bindc_name = "v"},
 ! CHECK-SAME:      %[[VAL_2:.*]]: !fir.ref<!fir.array<3xi32>> {fir.bindc_name = "r"}) {
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_poly_array_vector_subscriptEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_poly_array_vector_subscriptEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>)
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 3 : index
 ! CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_5]]) {uniq_name = "_QFtest_poly_array_vector_subscriptEr"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_poly_array_vector_subscriptEr"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
 ! CHECK:           %[[VAL_7:.*]] = arith.constant 3 : index
 ! CHECK:           %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) {uniq_name = "_QFtest_poly_array_vector_subscriptEv"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_poly_array_vector_subscriptEv"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
 ! CHECK:           %[[VAL_10:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>
 ! CHECK:           %[[VAL_11:.*]] = hlfir.elemental %[[VAL_8]] unordered : (!fir.shape<1>) -> !hlfir.expr<3xi64> {
 ! CHECK:           ^bb0(%[[VAL_12:.*]]: index):
diff --git a/flang/test/Lower/HLFIR/designators.f90 b/flang/test/Lower/HLFIR/designators.f90
index de1ec6e5b3cf..09753d06cc27 100644
--- a/flang/test/Lower/HLFIR/designators.f90
+++ b/flang/test/Lower/HLFIR/designators.f90
@@ -7,8 +7,8 @@ subroutine array_ref(x, n)
   print *, x(n)
 end subroutine
 ! CHECK-LABEL: func.func @_QParray_ref(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFarray_refEn"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFarray_refEx"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_refEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_refEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:  %[[VAL_9:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i64>
 ! CHECK:  %[[VAL_10:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_9]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
 
@@ -17,8 +17,8 @@ subroutine char_array_ref(x, n)
   print *, x(10)
 end subroutine
 ! CHECK-LABEL: func.func @_QPchar_array_ref(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFchar_array_refEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFchar_array_refEx"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_refEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_refEx"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
 ! CHECK:  %[[VAL_9:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> index
 ! CHECK:  %[[VAL_10:.*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_11:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_10]])  typeparams %[[VAL_9]] : (!fir.box<!fir.array<?x!fir.char<1,?>>>, index, index) -> !fir.boxchar<1>
@@ -28,9 +28,9 @@ subroutine char_array_ref_cst_len(x, n)
   print *, x(10)
 end subroutine
 ! CHECK-LABEL: func.func @_QPchar_array_ref_cst_len(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFchar_array_ref_cst_lenEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_ref_cst_lenEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:  %[[VAL_3:.*]] = arith.constant 5 : index
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_3]] {uniq_name = "_QFchar_array_ref_cst_lenEx"} : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index) -> (!fir.box<!fir.array<?x!fir.char<1,5>>>, !fir.box<!fir.array<?x!fir.char<1,5>>>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_ref_cst_lenEx"} : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,5>>>, !fir.box<!fir.array<?x!fir.char<1,5>>>)
 ! CHECK:  %[[VAL_10:.*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_11:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_10]])  typeparams %[[VAL_3]] : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index, index) -> !fir.ref<!fir.char<1,5>>
 
@@ -41,7 +41,7 @@ end subroutine
 ! CHECK-LABEL: func.func @_QParray_section(
 ! CHECK:  %[[VAL_1:.*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}(%[[VAL_2]]) {uniq_name = "_QFarray_sectionEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}(%[[VAL_2]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_sectionEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
 ! CHECK:  %[[VAL_9:.*]] = arith.constant 2 : index
 ! CHECK:  %[[VAL_10:.*]] = arith.constant 8 : index
 ! CHECK:  %[[VAL_11:.*]] = arith.constant 3 : index
@@ -55,8 +55,8 @@ subroutine array_section_2(x, n)
   print *, x(n::3)
 end subroutine
 ! CHECK-LABEL: func.func @_QParray_section_2(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFarray_section_2En"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFarray_section_2Ex"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_section_2En"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_section_2Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:  %[[VAL_9:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i64>
 ! CHECK:  %[[VAL_10:.*]] = arith.constant 0 : index
 ! CHECK:  %[[VAL_11:.*]]:3 = fir.box_dims %[[VAL_3]]#1, %[[VAL_10]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
@@ -76,8 +76,8 @@ subroutine char_array_section(x, n)
   print *, x(::3)
 end subroutine
 ! CHECK-LABEL: func.func @_QPchar_array_section(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFchar_array_sectionEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFchar_array_sectionEx"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_sectionEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_sectionEx"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
 ! CHECK:  %[[VAL_9:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> index
 ! CHECK:  %[[VAL_10:.*]] = arith.constant 1 : index
 ! CHECK:  %[[VAL_11:.*]] = arith.constant 0 : index
@@ -97,9 +97,9 @@ subroutine char_array_section_cst_len(x, n)
   print *, x(::3)
 end subroutine
 ! CHECK-LABEL: func.func @_QPchar_array_section_cst_len(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFchar_array_section_cst_lenEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_section_cst_lenEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:  %[[VAL_3:.*]] = arith.constant 5 : index
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_3]] {uniq_name = "_QFchar_array_section_cst_lenEx"} : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index) -> (!fir.box<!fir.array<?x!fir.char<1,5>>>, !fir.box<!fir.array<?x!fir.char<1,5>>>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_section_cst_lenEx"} : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,5>>>, !fir.box<!fir.array<?x!fir.char<1,5>>>)
 ! CHECK:  %[[VAL_10:.*]] = arith.constant 1 : index
 ! CHECK:  %[[VAL_11:.*]] = arith.constant 0 : index
 ! CHECK:  %[[VAL_12:.*]]:3 = fir.box_dims %[[VAL_4]]#1, %[[VAL_11]] : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index) -> (index, index, index)
@@ -120,7 +120,7 @@ subroutine complex_imag_ref(x)
   print *, x%im
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_imag_ref(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFcomplex_imag_refEx"} : (!fir.box<!fir.array<?x!fir.complex<4>>>) -> (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.box<!fir.array<?x!fir.complex<4>>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_imag_refEx"} : (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.box<!fir.array<?x!fir.complex<4>>>)
 ! CHECK:  %[[VAL_3:.*]] = fir.shape %[[VAL_4:.*]]#1 : (index) -> !fir.shape<1>
 ! CHECK:  %[[VAL_5:.*]] = hlfir.designate %[[VAL_2]]#0  imag shape %[[VAL_3]] : (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
 
@@ -129,7 +129,7 @@ subroutine complex_real_ref(x)
   print *, x%re
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_real_ref(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFcomplex_real_refEx"} : (!fir.box<!fir.array<?x!fir.complex<4>>>) -> (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.box<!fir.array<?x!fir.complex<4>>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_real_refEx"} : (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.box<!fir.array<?x!fir.complex<4>>>)
 ! CHECK:  %[[VAL_3:.*]] = fir.shape %[[VAL_4:.*]]#1 : (index) -> !fir.shape<1>
 ! CHECK:  %[[VAL_5:.*]] = hlfir.designate %[[VAL_2]]#0  real shape %[[VAL_3]] : (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
 
@@ -139,11 +139,11 @@ subroutine complex_individual_ref(x, n)
   print *, x(n)%im
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_individual_ref(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFcomplex_individual_refEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFcomplex_individual_refEx"} : (!fir.box<!fir.array<?x!fir.complex<4>>>) -> (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.box<!fir.array<?x!fir.complex<4>>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_individual_refEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_individual_refEx"} : (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.box<!fir.array<?x!fir.complex<4>>>)
 ! CHECK:  %[[VAL_4:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> i64
-! CHECK:  %[[VAL_6:.*]] = hlfir.designate %1#0 (%[[VAL_5]]) imag : (!fir.box<!fir.array<?x!fir.complex<4>>>, i64) -> !fir.ref<f32>
+! CHECK:  %[[VAL_6:.*]] = hlfir.designate %{{[0-9]+}}#0 (%[[VAL_5]]) imag : (!fir.box<!fir.array<?x!fir.complex<4>>>, i64) -> !fir.ref<f32>
 
 subroutine complex_slice_ref(x, start, end)
   complex :: x(:)
@@ -151,9 +151,9 @@ subroutine complex_slice_ref(x, start, end)
   print *, x(start:end)%re
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_slice_ref(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFcomplex_slice_refEend"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFcomplex_slice_refEstart"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %arg0 {uniq_name = "_QFcomplex_slice_refEx"} : (!fir.box<!fir.array<?x!fir.complex<4>>>) -> (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.box<!fir.array<?x!fir.complex<4>>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEend"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEstart"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_4:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEx"} : (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.box<!fir.array<?x!fir.complex<4>>>)
 ! CHECK:  %[[VAL_5:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i32) -> i64
 ! CHECK:  %[[VAL_7:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/HLFIR/dot_product.f90 b/flang/test/Lower/HLFIR/dot_product.f90
index 890dc4abca49..2d3ee97b7e40 100644
--- a/flang/test/Lower/HLFIR/dot_product.f90
+++ b/flang/test/Lower/HLFIR/dot_product.f90
@@ -72,10 +72,10 @@ endsubroutine
 ! CHECK-NEXT:   }
 
 ! CHECK-LABEL: func.func @_QPdot_product5
-! CHECK:    %[[LHS:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFdot_product5Elhs"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:    %[[LHS:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFdot_product5Elhs"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:    %[[C3:.*]] = arith.constant 3 : index
 ! CHECK:    %[[RHS_SHAPE:.*]] = fir.shape %[[C3]] : (index) -> !fir.shape<1>
-! CHECK:    %[[RHS:.*]]:2 = hlfir.declare %{{.*}}(%[[RHS_SHAPE]]) {uniq_name = "_QFdot_product5Erhs"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CHECK:    %[[RHS:.*]]:2 = hlfir.declare %{{.*}}(%[[RHS_SHAPE]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFdot_product5Erhs"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
 ! CHECK:    {{.*}} = hlfir.dot_product %[[LHS]]#0 %[[RHS]]#0 {fastmath = #arith.fastmath<contract>} : (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<3xi32>>) -> i32
 subroutine dot_product5(lhs, rhs, res)
   integer :: lhs(:), rhs(3)
diff --git a/flang/test/Lower/HLFIR/elemental-array-ops.f90 b/flang/test/Lower/HLFIR/elemental-array-ops.f90
index 9778adeb6179..80801fdde0d7 100644
--- a/flang/test/Lower/HLFIR/elemental-array-ops.f90
+++ b/flang/test/Lower/HLFIR/elemental-array-ops.f90
@@ -166,9 +166,9 @@ end subroutine char_return
 ! CHECK:           fir.store %[[VAL_7]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>
 ! CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_3]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFchar_returnEl"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>)
 ! CHECK:           %[[VAL_9:.*]] = arith.constant 3 : index
-! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_9]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFchar_returnEx"} : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index) -> (!fir.box<!fir.array<?x!fir.char<1,3>>>, !fir.box<!fir.array<?x!fir.char<1,3>>>)
+! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_9]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFchar_returnEx"} : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,3>>>, !fir.box<!fir.array<?x!fir.char<1,3>>>)
 ! CHECK:           %[[VAL_11:.*]] = arith.constant 3 : index
-! CHECK:           %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]] typeparams %[[VAL_11]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFchar_returnEy"} : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index) -> (!fir.box<!fir.array<?x!fir.char<1,3>>>, !fir.box<!fir.array<?x!fir.char<1,3>>>)
+! CHECK:           %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]] typeparams %[[VAL_11]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFchar_returnEy"} : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,3>>>, !fir.box<!fir.array<?x!fir.char<1,3>>>)
 ! CHECK:           %[[VAL_13:.*]] = arith.constant 0 : index
 ! CHECK:           %[[VAL_14:.*]]:3 = fir.box_dims %[[VAL_12]]#0, %[[VAL_13]] : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index) -> (index, index, index)
 ! CHECK:           %[[VAL_15:.*]] = fir.shape %[[VAL_14]]#1 : (index) -> !fir.shape<1>
@@ -225,8 +225,8 @@ end subroutine polymorphic_parenthesis
 ! CHECK-LABEL:   func.func @_QPpolymorphic_parenthesis(
 ! CHECK-SAME:        %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>> {fir.bindc_name = "x"},
 ! CHECK-SAME:        %[[VAL_1:.*]]: !fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFpolymorphic_parenthesisEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFpolymorphic_parenthesisEy"} : (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>) -> (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>, !fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFpolymorphic_parenthesisEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFpolymorphic_parenthesisEy"} : (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>, !fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>)
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
 ! CHECK:           %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_4]] : (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>, index) -> (index, index, index)
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1>
@@ -249,8 +249,8 @@ end subroutine unlimited_polymorphic_parenthesis
 ! CHECK-LABEL:   func.func @_QPunlimited_polymorphic_parenthesis(
 ! CHECK-SAME:        %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>> {fir.bindc_name = "x"},
 ! CHECK-SAME:        %[[VAL_1:.*]]: !fir.class<!fir.array<?xnone>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFunlimited_polymorphic_parenthesisEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFunlimited_polymorphic_parenthesisEy"} : (!fir.class<!fir.array<?xnone>>) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFunlimited_polymorphic_parenthesisEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFunlimited_polymorphic_parenthesisEy"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>)
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
 ! CHECK:           %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_4]] : (!fir.class<!fir.array<?xnone>>, index) -> (index, index, index)
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90 b/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
index eb93099b3890..36762d47100c 100644
--- a/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
+++ b/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
@@ -14,10 +14,10 @@ end subroutine test_polymorphic_merge
 ! CHECK-SAME:        %[[VAL_1:.*]]: !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>> {fir.bindc_name = "y"},
 ! CHECK-SAME:        %[[VAL_2:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>> {fir.bindc_name = "r"},
 ! CHECK-SAME:        %[[VAL_3:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "m"}) {
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtest_polymorphic_mergeEm"} : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_mergeEr"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>)
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphic_mergeEx"} : (!fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>) -> (!fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>, !fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>)
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphic_mergeEy"} : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>) -> (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>, !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_polymorphic_mergeEm"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_mergeEr"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphic_mergeEx"} : (!fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>, !fir.dscope) -> (!fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>, !fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphic_mergeEy"} : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>, !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant 0 : index
 ! CHECK:           %[[VAL_9:.*]]:3 = fir.box_dims %[[VAL_7]]#0, %[[VAL_8]] : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>, index) -> (index, index, index)
 ! CHECK:           %[[VAL_10:.*]] = fir.shape %[[VAL_9]]#1 : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/HLFIR/elemental-user-procedure-ref.f90 b/flang/test/Lower/HLFIR/elemental-user-procedure-ref.f90
index d015ba3b0707..aea23d8d9467 100644
--- a/flang/test/Lower/HLFIR/elemental-user-procedure-ref.f90
+++ b/flang/test/Lower/HLFIR/elemental-user-procedure-ref.f90
@@ -111,7 +111,7 @@ end subroutine
 ! CHECK:           %[[VAL_1:.*]] = arith.constant 10 : index
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 20 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2>
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) {uniq_name = "_QFimpure_elementalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFimpure_elementalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 1 : index
 ! CHECK:           fir.do_loop %[[VAL_6:.*]] = %[[VAL_5]] to %[[VAL_2]] step %[[VAL_5]] {
 ! CHECK:             fir.do_loop %[[VAL_7:.*]] = %[[VAL_5]] to %[[VAL_1]] step %[[VAL_5]] {
@@ -136,7 +136,7 @@ end subroutine
 ! CHECK:           %[[VAL_1:.*]] = arith.constant 10 : index
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 20 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2>
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) {uniq_name = "_QFordered_elementalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFordered_elementalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 1 : index
 ! CHECK:           fir.do_loop %[[VAL_6:.*]] = %[[VAL_5]] to %[[VAL_2]] step %[[VAL_5]] {
 ! CHECK:             fir.do_loop %[[VAL_7:.*]] = %[[VAL_5]] to %[[VAL_1]] step %[[VAL_5]] {
@@ -161,7 +161,7 @@ end subroutine
 ! CHECK:           %[[VAL_1:.*]] = arith.constant 10 : index
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 20 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2>
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) {uniq_name = "_QFimpure_elemental_arg_evalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFimpure_elemental_arg_evalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>)
 ! CHECK:           %[[VAL_5:.*]] = hlfir.elemental %[[VAL_3]] unordered : (!fir.shape<2>) -> !hlfir.expr<10x20xf32> {
 ! CHECK:           ^bb0(%[[VAL_6:.*]]: index, %[[VAL_7:.*]]: index):
 ! CHECK:             %[[VAL_8:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_6]], %[[VAL_7]])  : (!fir.ref<!fir.array<10x20xf32>>, index, index) -> !fir.ref<f32>
diff --git a/flang/test/Lower/HLFIR/expr-addr.f90 b/flang/test/Lower/HLFIR/expr-addr.f90
index 876aad8925d7..917a68d59910 100644
--- a/flang/test/Lower/HLFIR/expr-addr.f90
+++ b/flang/test/Lower/HLFIR/expr-addr.f90
@@ -6,7 +6,7 @@
 subroutine foo(x)
   integer :: x
   read (*,*) x
-  ! CHECK: %[[x:.]]:2 = hlfir.declare %[[arg0]] {uniq_name = "_QFfooEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[x:.]]:2 = hlfir.declare %[[arg0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfooEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   ! CHECK: %[[x_cast:.*]] = fir.convert %[[x]]#1 : (!fir.ref<i32>) -> !fir.ref<i64>
   ! CHECK: fir.call @_FortranAioInputInteger(%{{.*}}, %[[x_cast]], %{{.*}}) {{.*}}: (!fir.ref<i8>, !fir.ref<i64>, i32) -> i1
 end subroutine
diff --git a/flang/test/Lower/HLFIR/expr-box.f90 b/flang/test/Lower/HLFIR/expr-box.f90
index e7ab006751a0..f0de381c7457 100644
--- a/flang/test/Lower/HLFIR/expr-box.f90
+++ b/flang/test/Lower/HLFIR/expr-box.f90
@@ -9,7 +9,7 @@ subroutine foo(x)
 ! CHECK-DAG:  %[[VAL_3:.*]] = arith.constant 21 : index
 ! CHECK-DAG:  %[[VAL_4:.*]] = arith.constant 10 : index
 ! CHECK:  %[[VAL_5:.*]] = fir.shape_shift %[[VAL_3]], %[[VAL_4]] : (index, index) -> !fir.shapeshift<1>
-! CHECK:  %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) {uniq_name = "_QFfooEx"} : (!fir.ref<!fir.array<10xi32>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK:  %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFfooEx"} : (!fir.ref<!fir.array<10xi32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK:  fir.embox %[[VAL_6]]#1(%[[VAL_5]]) : (!fir.ref<!fir.array<10xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<10xi32>>
 end subroutine
 
diff --git a/flang/test/Lower/HLFIR/expr-value.f90 b/flang/test/Lower/HLFIR/expr-value.f90
index cd2f42533c27..c692ec72bf7e 100644
--- a/flang/test/Lower/HLFIR/expr-value.f90
+++ b/flang/test/Lower/HLFIR/expr-value.f90
@@ -11,7 +11,7 @@ end subroutine
 ! CHECK-LABEL: func.func @_QPfoo_designator(
 ! CHECK-SAME: %[[arg0:.*]]: !fir.ref<i32>
 subroutine foo_designator(n)
-  !CHECK:  %[[n:.*]]:2 = hlfir.declare %[[arg0]] {uniq_name = "_QFfoo_designatorEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  !CHECK:  %[[n:.*]]:2 = hlfir.declare %[[arg0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfoo_designatorEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   print *, n
   ! CHECK: %[[nval:.*]] = fir.load %[[n]]#0 : !fir.ref<i32>
   ! CHECK: fir.call @_FortranAioOutputInteger32(%{{.*}}, %[[nval]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
diff --git a/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 b/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90
index 43986c8198b9..c2118432a981 100644
--- a/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90
+++ b/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90
@@ -49,7 +49,7 @@ subroutine test_logical_assumed_shape_array(x)
 end subroutine test_logical_assumed_shape_array
 ! CHECK-LABEL:   func.func @_QPtest_logical_assumed_shape_array(
 ! CHECK-SAME:                                                   %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_logical_assumed_shape_arrayEx"} : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_logical_assumed_shape_arrayEx"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.rebox %[[VAL_1]]#0 : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.class<!fir.array<?xnone>>
 ! CHECK:           %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.class<!fir.array<?xnone>>) -> !fir.class<none>
 ! CHECK:           fir.call @_QPcallee(%[[VAL_3]]) fastmath<contract> : (!fir.class<none>) -> ()
@@ -63,7 +63,7 @@ subroutine test_real_2d_pointer(x)
 end subroutine test_real_2d_pointer
 ! CHECK-LABEL:   func.func @_QPtest_real_2d_pointer(
 ! CHECK-SAME:                                       %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_real_2d_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_real_2d_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
 ! CHECK:           %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?x?xf32>>>) -> !fir.class<!fir.array<?x?xnone>>
 ! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.class<!fir.array<?x?xnone>>) -> !fir.class<none>
@@ -78,7 +78,7 @@ subroutine test_up_assumed_shape_1d_array(x)
 end subroutine test_up_assumed_shape_1d_array
 ! CHECK-LABEL:   func.func @_QPtest_up_assumed_shape_1d_array(
 ! CHECK-SAME:                                                 %[[VAL_0:.*]]: !fir.class<!fir.array<?xnone>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_up_assumed_shape_1d_arrayEx"} : (!fir.class<!fir.array<?xnone>>) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_up_assumed_shape_1d_arrayEx"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.class<!fir.array<?xnone>>) -> !fir.class<none>
 ! CHECK:           fir.call @_QPcallee(%[[VAL_2]]) fastmath<contract> : (!fir.class<none>) -> ()
 ! CHECK:           return
@@ -115,7 +115,7 @@ subroutine test_up_allocatable_2d_array(x)
 end subroutine test_up_allocatable_2d_array
 ! CHECK-LABEL:   func.func @_QPtest_up_allocatable_2d_array(
 ! CHECK-SAME:                                               %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_up_allocatable_2d_arrayEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_up_allocatable_2d_arrayEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>
 ! CHECK:           %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.class<!fir.heap<!fir.array<?x?xnone>>>) -> !fir.class<!fir.array<?x?xnone>>
 ! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.class<!fir.array<?x?xnone>>) -> !fir.class<none>
@@ -130,7 +130,7 @@ subroutine test_up_pointer_1d_array(x)
 end subroutine test_up_pointer_1d_array
 ! CHECK-LABEL:   func.func @_QPtest_up_pointer_1d_array(
 ! CHECK-SAME:                                           %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_up_pointer_1d_arrayEx"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_up_pointer_1d_arrayEx"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>
 ! CHECK:           %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.class<!fir.ptr<!fir.array<?xnone>>>) -> !fir.class<!fir.array<?xnone>>
 ! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.class<!fir.array<?xnone>>) -> !fir.class<none>
diff --git a/flang/test/Lower/HLFIR/implicit-type-conversion.f90 b/flang/test/Lower/HLFIR/implicit-type-conversion.f90
index ec0fb6e3bb12..dc2d111a8f7f 100644
--- a/flang/test/Lower/HLFIR/implicit-type-conversion.f90
+++ b/flang/test/Lower/HLFIR/implicit-type-conversion.f90
@@ -3,8 +3,8 @@
 ! CHECK-LABEL:   func.func @_QPtest1(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"},
 ! CHECK-SAME:                        %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest1Ey"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.logical<4>>
 ! CHECK:           %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.logical<4>) -> i32
 ! CHECK:           hlfir.assign %[[VAL_5]] to %[[VAL_2]]#0 : i32, !fir.ref<i32>
@@ -19,8 +19,8 @@ end subroutine test1
 ! CHECK-LABEL:   func.func @_QPtest2(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"},
 ! CHECK-SAME:                        %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest2Ey"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
 ! CHECK:           %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_5]] to %[[VAL_3]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -35,8 +35,8 @@ end subroutine test2
 ! CHECK-LABEL:   func.func @_QPtest3(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "x"},
 ! CHECK-SAME:                        %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest3Ex"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest3Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3Ex"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3Ey"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 1 : i32
 ! CHECK:           %[[VAL_6:.*]] = arith.cmpi eq, %[[VAL_4]], %[[VAL_5]] : i32
@@ -54,8 +54,8 @@ end subroutine test3
 ! CHECK-LABEL:   func.func @_QPtest4(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"},
 ! CHECK-SAME:                        %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest4Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest4Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest4Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest4Ey"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 1 : i32
 ! CHECK:           %[[VAL_6:.*]] = arith.cmpi eq, %[[VAL_4]], %[[VAL_5]] : i32
@@ -73,8 +73,8 @@ end subroutine test4
 ! CHECK-LABEL:   func.func @_QPtest5(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "x"},
 ! CHECK-SAME:                        %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest5Ex"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest5Ey"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest5Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest5Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.logical<4>>
 ! CHECK:           %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.logical<4>) -> i32
 ! CHECK:           hlfir.assign %[[VAL_5]] to %[[VAL_2]]#0 : i32, !fir.box<!fir.array<?xi32>>
@@ -89,8 +89,8 @@ end subroutine test5
 ! CHECK-LABEL:   func.func @_QPtest6(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "x"},
 ! CHECK-SAME:                        %[[VAL_1:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest6Ex"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest6Ey"} : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest6Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest6Ey"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
 ! CHECK:           %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_4]] : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> (index, index, index)
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1>
@@ -114,8 +114,8 @@ end subroutine test6
 ! CHECK-LABEL:   func.func @_QPtest7(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "x"},
 ! CHECK-SAME:                        %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest7Ex"} : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest7Ey"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest7Ex"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest7Ey"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
 ! CHECK:           %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_4]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 b/flang/test/Lower/HLFIR/intentout-allocatable-components.f90
index 797e4c89ae23..9d4bedbd9be6 100644
--- a/flang/test/Lower/HLFIR/intentout-allocatable-components.f90
+++ b/flang/test/Lower/HLFIR/intentout-allocatable-components.f90
@@ -10,7 +10,7 @@ subroutine test_intentout_component_deallocate(a)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_intentout_component_deallocate(
 ! CHECK-SAME:      %[[VAL_0:.*]]: !fir.ref<!fir.type<_QFtest_intentout_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QFtest_intentout_component_deallocateEa"}
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QFtest_intentout_component_deallocateEa"}
 ! CHECK:  %[[VAL_2:.*]] = fir.embox %[[VAL_1]]#1 : (!fir.ref<!fir.type<_QFtest_intentout_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<!fir.type<_QFtest_intentout_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>
 ! CHECK:  %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.box<!fir.type<_QFtest_intentout_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<none>
 ! CHECK:  %[[VAL_4:.*]] = fir.call @_FortranADestroy(%[[VAL_3]]) fastmath<contract> : (!fir.box<none>) -> none
@@ -23,7 +23,7 @@ subroutine test_intentout_optional_component_deallocate(a)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_intentout_optional_component_deallocate(
 ! CHECK-SAME:      %[[VAL_0:.*]]: !fir.ref<!fir.type<_QFtest_intentout_optional_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_out, optional>, uniq_name = "_QFtest_intentout_optional_component_deallocateEa"}
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out, optional>, uniq_name = "_QFtest_intentout_optional_component_deallocateEa"}
 ! CHECK:  %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#1 : (!fir.ref<!fir.type<_QFtest_intentout_optional_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>) -> i1
 ! CHECK:  fir.if %[[VAL_2]] {
 ! CHECK:    %[[VAL_3:.*]] = fir.embox %[[VAL_1]]#1 : (!fir.ref<!fir.type<_QFtest_intentout_optional_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<!fir.type<_QFtest_intentout_optional_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>
diff --git a/flang/test/Lower/HLFIR/internal-procedures.f90 b/flang/test/Lower/HLFIR/internal-procedures.f90
index 3c4439911809..f0df1a7f6e64 100644
--- a/flang/test/Lower/HLFIR/internal-procedures.f90
+++ b/flang/test/Lower/HLFIR/internal-procedures.f90
@@ -64,7 +64,7 @@ contains
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_proc_pointer(
 ! CHECK-SAME:                                    %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointerEp"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointerEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca tuple<!fir.ref<!fir.boxproc<() -> ()>>>
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 0 : i32
 ! CHECK:           %[[VAL_4:.*]] = fir.coordinate_of %[[VAL_2]], %[[VAL_3]] : (!fir.ref<tuple<!fir.ref<!fir.boxproc<() -> ()>>>>, i32) -> !fir.llvm_ptr<!fir.ref<!fir.boxproc<() -> ()>>>
diff --git a/flang/test/Lower/HLFIR/intrinsic-dynamically-optional.f90 b/flang/test/Lower/HLFIR/intrinsic-dynamically-optional.f90
index 39671d7931a1..d1969049828c 100644
--- a/flang/test/Lower/HLFIR/intrinsic-dynamically-optional.f90
+++ b/flang/test/Lower/HLFIR/intrinsic-dynamically-optional.f90
@@ -166,10 +166,10 @@ end function
 ! CHECK-SAME:                                                   %[[VAL_1:.*]]: !fir.ref<!fir.array<3xf32>> {fir.bindc_name = "imaginary", fir.optional}) -> !fir.array<3x!fir.complex<4>> {
 ! CHECK:           %[[VAL_2:.*]] = arith.constant 3 : index
 ! CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_3]]) {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_elemental_optional_as_valueEimaginary"} : (!fir.ref<!fir.array<3xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xf32>>, !fir.ref<!fir.array<3xf32>>)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_elemental_optional_as_valueEimaginary"} : (!fir.ref<!fir.array<3xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xf32>>, !fir.ref<!fir.array<3xf32>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 3 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFtest_elemental_optional_as_valueEreal"} : (!fir.ref<!fir.array<3xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xf32>>, !fir.ref<!fir.array<3xf32>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_elemental_optional_as_valueEreal"} : (!fir.ref<!fir.array<3xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xf32>>, !fir.ref<!fir.array<3xf32>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant 3 : index
 ! CHECK:           %[[VAL_9:.*]] = fir.alloca !fir.array<3x!fir.complex<4>> {bindc_name = "test_elemental_optional_as_value", uniq_name = "_QFtest_elemental_optional_as_valueEtest_elemental_optional_as_value"}
 ! CHECK:           %[[VAL_10:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/HLFIR/issue80884.f90 b/flang/test/Lower/HLFIR/issue80884.f90
index 2a7792b6004c..725ed1982975 100644
--- a/flang/test/Lower/HLFIR/issue80884.f90
+++ b/flang/test/Lower/HLFIR/issue80884.f90
@@ -12,8 +12,8 @@ subroutine issue80884(p, targ)
   p(1:100) => targ%array
 end subroutine
 ! CHECK-LABEL:   func.func @_QPissue80884(
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFissue80884Ep"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFissue80884Etarg"} : (!fir.ref<!fir.type<_QFissue80884Tt{t0:!fir.type<_QFissue80884Tt0{array:!fir.array<10x10xf32>}>}>>) -> (!fir.ref<!fir.type<_QFissue80884Tt{t0:!fir.type<_QFissue80884Tt0{array:!fir.array<10x10xf32>}>}>>, !fir.ref<!fir.type<_QFissue80884Tt{t0:!fir.type<_QFissue80884Tt0{array:!fir.array<10x10xf32>}>}>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFissue80884Ep"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFissue80884Etarg"} : (!fir.ref<!fir.type<_QFissue80884Tt{t0:!fir.type<_QFissue80884Tt0{array:!fir.array<10x10xf32>}>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QFissue80884Tt{t0:!fir.type<_QFissue80884Tt0{array:!fir.array<10x10xf32>}>}>>, !fir.ref<!fir.type<_QFissue80884Tt{t0:!fir.type<_QFissue80884Tt0{array:!fir.array<10x10xf32>}>}>>)
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 1 : i64
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : i64
 ! CHECK:           %[[VAL_6:.*]] = hlfir.designate %[[VAL_3]]#0{"t0"}   : (!fir.ref<!fir.type<_QFissue80884Tt{t0:!fir.type<_QFissue80884Tt0{array:!fir.array<10x10xf32>}>}>>) -> !fir.ref<!fir.type<_QFissue80884Tt0{array:!fir.array<10x10xf32>}>>
diff --git a/flang/test/Lower/HLFIR/maxloc.f90 b/flang/test/Lower/HLFIR/maxloc.f90
index ea3cce92ae90..166a1b9db172 100644
--- a/flang/test/Lower/HLFIR/maxloc.f90
+++ b/flang/test/Lower/HLFIR/maxloc.f90
@@ -357,11 +357,12 @@ subroutine scalar_dim1(a, d, m, b, s)
 end subroutine
 ! CHECK-LABEL:  func.func @_QPscalar_dim1(
 ! CHECK:            %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "d"}, %[[ARG2:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "m"}, %[[ARG3:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}, %[[ARG4:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "s"}) {
-! CHECK-NEXT:    %[[V0:.*]]:2 = hlfir.declare %[[ARG0]]
-! CHECK-NEXT:    %[[V1:.*]]:2 = hlfir.declare %[[ARG3]]
-! CHECK-NEXT:    %[[V2:.*]]:2 = hlfir.declare %[[ARG1]]
-! CHECK-NEXT:    %[[V3:.*]]:2 = hlfir.declare %[[ARG2]]
-! CHECK-NEXT:    %[[V4:.*]]:2 = hlfir.declare %[[ARG4]]
+! CHECK-NEXT:    %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK-NEXT:    %[[V0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[DSCOPE]]
+! CHECK-NEXT:    %[[V1:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[DSCOPE]]
+! CHECK-NEXT:    %[[V2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[DSCOPE]]
+! CHECK-NEXT:    %[[V3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[DSCOPE]]
+! CHECK-NEXT:    %[[V4:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %[[DSCOPE]]
 ! CHECK-NEXT:    %[[V5:.*]] = fir.load %[[V1]]#0 : !fir.ref<!fir.logical<4>>
 ! CHECK-NEXT:    %[[V6:.*]] = fir.load %[[V2]]#0 : !fir.ref<i32>
 ! CHECK-NEXT:    %[[V7:.*]] = hlfir.maxloc %[[V0]]#0 dim %[[V6]] mask %[[V3]]#0 back %[[V5]] {fastmath = #arith.fastmath<contract>} : (!fir.box<!fir.array<?xi32>>, i32, !fir.box<!fir.array<?x!fir.logical<4>>>, !fir.logical<4>) -> i16
diff --git a/flang/test/Lower/HLFIR/minloc.f90 b/flang/test/Lower/HLFIR/minloc.f90
index c27430689ee0..f835cf54b2a7 100644
--- a/flang/test/Lower/HLFIR/minloc.f90
+++ b/flang/test/Lower/HLFIR/minloc.f90
@@ -357,11 +357,12 @@ subroutine scalar_dim1(a, d, m, b, s)
 end subroutine
 ! CHECK-LABEL:  func.func @_QPscalar_dim1(
 ! CHECK:            %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "d"}, %[[ARG2:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "m"}, %[[ARG3:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}, %[[ARG4:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "s"}) {
-! CHECK-NEXT:    %[[V0:.*]]:2 = hlfir.declare %[[ARG0]]
-! CHECK-NEXT:    %[[V1:.*]]:2 = hlfir.declare %[[ARG3]]
-! CHECK-NEXT:    %[[V2:.*]]:2 = hlfir.declare %[[ARG1]]
-! CHECK-NEXT:    %[[V3:.*]]:2 = hlfir.declare %[[ARG2]]
-! CHECK-NEXT:    %[[V4:.*]]:2 = hlfir.declare %[[ARG4]]
+! CHECK-NEXT:    %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK-NEXT:    %[[V0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[DSCOPE]]
+! CHECK-NEXT:    %[[V1:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[DSCOPE]]
+! CHECK-NEXT:    %[[V2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[DSCOPE]]
+! CHECK-NEXT:    %[[V3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[DSCOPE]]
+! CHECK-NEXT:    %[[V4:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %[[DSCOPE]]
 ! CHECK-NEXT:    %[[V5:.*]] = fir.load %[[V1]]#0 : !fir.ref<!fir.logical<4>>
 ! CHECK-NEXT:    %[[V6:.*]] = fir.load %[[V2]]#0 : !fir.ref<i32>
 ! CHECK-NEXT:    %[[V7:.*]] = hlfir.minloc %[[V0]]#0 dim %[[V6]] mask %[[V3]]#0 back %[[V5]] {fastmath = #arith.fastmath<contract>} : (!fir.box<!fir.array<?xi32>>, i32, !fir.box<!fir.array<?x!fir.logical<4>>>, !fir.logical<4>) -> i16
diff --git a/flang/test/Lower/HLFIR/procedure-pointer.f90 b/flang/test/Lower/HLFIR/procedure-pointer.f90
index 28965b22de97..ce20f19322b4 100644
--- a/flang/test/Lower/HLFIR/procedure-pointer.f90
+++ b/flang/test/Lower/HLFIR/procedure-pointer.f90
@@ -186,10 +186,10 @@ end subroutine
 subroutine sub7(p1, p2)
 use m
   procedure(real_func), pointer :: p1
-! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub7Ep1"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub7Ep1"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
 
   procedure(char_func), pointer :: p2
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %arg1 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub7Ep2"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %arg1 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub7Ep2"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
 
   call foo1(p1)
 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
@@ -265,7 +265,7 @@ contains
   function reffunc(arg) result(pp)
     integer :: arg
     procedure(real_func), pointer :: pp
-! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %arg0 {uniq_name = "_QFsub10FreffuncEarg"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} {uniq_name = "_QFsub10FreffuncEarg"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.boxproc<(!fir.ref<f32>) -> f32> {bindc_name = "pp", uniq_name = "_QFsub10FreffuncEpp"}
 ! CHECK: %[[VAL_2:.*]] = fir.zero_bits (!fir.ref<f32>) -> f32
 ! CHECK: %[[VAL_3:.*]] = fir.emboxproc %[[VAL_2]] : ((!fir.ref<f32>) -> f32) -> !fir.boxproc<(!fir.ref<f32>) -> f32>
diff --git a/flang/test/Lower/HLFIR/statement-functions.f90 b/flang/test/Lower/HLFIR/statement-functions.f90
index d19b912e0fe2..4f91c947690c 100644
--- a/flang/test/Lower/HLFIR/statement-functions.f90
+++ b/flang/test/Lower/HLFIR/statement-functions.f90
@@ -43,7 +43,7 @@ subroutine char_test2(c)
   call test(stmt_func(c))
 end subroutine
 ! CHECK-LABEL:  func.func @_QPchar_test2(
-! CHECK:    %[[C:.*]]:2 = hlfir.declare %1 typeparams %c10 {uniq_name = "_QFchar_test2Ec"} : (!fir.ref<!fir.char<1,10>>, index) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>)
+! CHECK:    %[[C:.*]]:2 = hlfir.declare %{{.*}} typeparams %c10 dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_test2Ec"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>)
 ! CHECK:    %[[CAST:.*]] = fir.convert %[[C]]#0 : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<!fir.char<1,5>>
 ! CHECK:    %[[C_STMT_FUNC:.*]]:2 = hlfir.declare %[[CAST]] typeparams %c5{{.*}} {uniq_name = "_QFchar_test2Fstmt_funcEc_stmt_func"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>)
 ! CHECK:    hlfir.concat %[[C_STMT_FUNC]]#0, %{{.*}} len %{{.*}} : (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,7>>, index) -> !hlfir.expr<!fir.char<1,12>>
diff --git a/flang/test/Lower/HLFIR/structure-constructor.f90 b/flang/test/Lower/HLFIR/structure-constructor.f90
index d02427d2ff67..41d08c14f5fa 100644
--- a/flang/test/Lower/HLFIR/structure-constructor.f90
+++ b/flang/test/Lower/HLFIR/structure-constructor.f90
@@ -43,7 +43,7 @@ end subroutine test1
 ! CHECK:           %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
 ! CHECK:           %[[VAL_5:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,4>>
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 4 : index
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] {uniq_name = "_QFtest1Ex"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
 ! CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>) -> (!fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>, !fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>)
 ! CHECK:           %[[VAL_9:.*]] = fir.embox %[[VAL_8]]#0 : (!fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>) -> !fir.box<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>
 ! CHECK:           %[[VAL_10:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>>
@@ -71,7 +71,7 @@ end subroutine test2
 ! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFtest2Eres"} : (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>) -> (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>, !fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>)
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 10 : index
 ! CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) {uniq_name = "_QFtest2Ex"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>) -> (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>, !fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>)
 ! CHECK:           %[[VAL_8:.*]] = fir.embox %[[VAL_7]]#0 : (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>) -> !fir.box<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>
 ! CHECK:           %[[VAL_9:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>>
@@ -104,7 +104,7 @@ end subroutine test3
 ! CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_4]] : (!fir.box<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> !fir.box<none>
 ! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_5]] : (!fir.ref<!fir.char<1,{{[0-9]*}}>>) -> !fir.ref<i8>
 ! CHECK:           %[[VAL_9:.*]] = fir.call @_FortranAInitialize(%[[VAL_7]], %[[VAL_8]], %[[VAL_6]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> none
-! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
 ! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>)
 ! CHECK:           %[[VAL_12:.*]] = fir.embox %[[VAL_11]]#0 : (!fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> !fir.box<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>
 ! CHECK:           %[[VAL_13:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>>
@@ -141,7 +141,7 @@ end subroutine test4
 ! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_5]] : (!fir.ref<!fir.char<1,{{[0-9]*}}>>) -> !fir.ref<i8>
 ! CHECK:           %[[VAL_9:.*]] = fir.call @_FortranAInitialize(%[[VAL_7]], %[[VAL_8]], %[[VAL_6]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> none
 ! CHECK:           %[[VAL_10:.*]] = arith.constant 2 : index
-! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_10]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest4Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>, index) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>)
+! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_10]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest4Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>)
 ! CHECK:           %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>, !fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>)
 ! CHECK:           %[[VAL_13:.*]] = fir.embox %[[VAL_12]]#0 : (!fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>) -> !fir.box<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>
 ! CHECK:           %[[VAL_14:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>>
@@ -184,7 +184,7 @@ end subroutine test5
 ! CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_4]] : (!fir.box<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>) -> !fir.box<none>
 ! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_5]] : (!fir.ref<!fir.char<1,{{[0-9]*}}>>) -> !fir.ref<i8>
 ! CHECK:           %[[VAL_9:.*]] = fir.call @_FortranAInitialize(%[[VAL_7]], %[[VAL_8]], %[[VAL_6]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> none
-! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest5Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>)
+! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest5Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>)
 ! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>, !fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>)
 ! CHECK:           %[[VAL_12:.*]] = fir.embox %[[VAL_11]]#0 : (!fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>) -> !fir.box<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>
 ! CHECK:           %[[VAL_13:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>>
@@ -226,7 +226,7 @@ end subroutine test6
 ! CHECK:           %[[VAL_7:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
 ! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,4>>
 ! CHECK:           %[[VAL_9:.*]] = arith.constant 4 : index
-! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_9]] {uniq_name = "_QFtest6Ec"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
+! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_9]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest6Ec"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
 ! CHECK:           %[[VAL_11:.*]] = fir.alloca !fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}> {bindc_name = "res", uniq_name = "_QFtest6Eres"}
 ! CHECK:           %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFtest6Eres"} : (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>, !fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>)
 ! CHECK:           %[[VAL_13:.*]] = fir.embox %[[VAL_12]]#1 : (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> !fir.box<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>
@@ -235,7 +235,7 @@ end subroutine test6
 ! CHECK:           %[[VAL_16:.*]] = fir.convert %[[VAL_13]] : (!fir.box<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> !fir.box<none>
 ! CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_14]] : (!fir.ref<!fir.char<1,{{[0-9]*}}>>) -> !fir.ref<i8>
 ! CHECK:           %[[VAL_18:.*]] = fir.call @_FortranAInitialize(%[[VAL_16]], %[[VAL_17]], %[[VAL_15]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> none
-! CHECK:           %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest6Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>)
+! CHECK:           %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest6Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>)
 ! CHECK:           %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>, !fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>)
 ! CHECK:           %[[VAL_21:.*]] = fir.embox %[[VAL_20]]#0 : (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> !fir.box<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>
 ! CHECK:           %[[VAL_22:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>>
@@ -316,7 +316,7 @@ end subroutine test7
 ! CHECK-LABEL:   func.func @_QPtest7(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest7En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest7En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}> {bindc_name = "x", uniq_name = "_QFtest7Ex"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtest7Ex"} : (!fir.ref<!fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>)
 ! CHECK:           %[[VAL_5:.*]] = fir.embox %[[VAL_4]]#1 : (!fir.ref<!fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>) -> !fir.box<!fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>
diff --git a/flang/test/Lower/HLFIR/transformational.f90 b/flang/test/Lower/HLFIR/transformational.f90
index 5f1137277336..96cda5daaacb 100644
--- a/flang/test/Lower/HLFIR/transformational.f90
+++ b/flang/test/Lower/HLFIR/transformational.f90
@@ -16,7 +16,7 @@ subroutine test_transformational_implemented_with_runtime_allocation(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_transformational_implemented_with_runtime_allocation(
 ! CHECK-SAME:                                                                          %[[ARG0:.*]]: !fir.ref<!fir.array<10x10xf32>> {fir.bindc_name = "x"}) {
-! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) {uniq_name = "_QFtest_transformational_implemented_with_runtime_allocationEx"}
+! CHECK:  %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_transformational_implemented_with_runtime_allocationEx"}
 ! CHECK:  %[[VAL_2:.*]] = hlfir.minloc %[[VAL_1]]#0
 ! CHECK:  %[[VAL_3:.*]] = hlfir.shape_of %[[VAL_2]]
 ! CHECK:  %[[VAL_4:.*]]:3 = hlfir.associate %[[VAL_2]](%[[VAL_3]]) {adapt.valuebyref}
diff --git a/flang/test/Lower/HLFIR/transpose.f90 b/flang/test/Lower/HLFIR/transpose.f90
index e37e83c7a501..6d8e337f1ac8 100644
--- a/flang/test/Lower/HLFIR/transpose.f90
+++ b/flang/test/Lower/HLFIR/transpose.f90
@@ -8,8 +8,8 @@ endsubroutine
 ! CHECK-LABEL: func.func @_QPtranspose1
 ! CHECK:           %[[M_ARG:.*]]: !fir.ref<!fir.array<1x2xi32>>
 ! CHECK:           %[[RES_ARG:.*]]: !fir.ref<!fir.array<2x1xi32>>
-! CHECK-DAG:     %[[ARG:.*]]:2 = hlfir.declare %[[M_ARG]](%[[M_SHAPE:.*]]) {[[NAME:.*]]} : (!fir.ref<!fir.array<1x2xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<1x2xi32>>, !fir.ref<!fir.array<1x2xi32>>)
-! CHECK-DAG:     %[[RES:.*]]:2 = hlfir.declare %[[RES_ARG]](%[[RES_SHAPE:.*]]) {[[NAME2:.*]]} : (!fir.ref<!fir.array<2x1xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<2x1xi32>>, !fir.ref<!fir.array<2x1xi32>>)
+! CHECK-DAG:     %[[ARG:.*]]:2 = hlfir.declare %[[M_ARG]](%[[M_SHAPE:.*]]) dummy_scope %{{[0-9]+}} {[[NAME:.*]]} : (!fir.ref<!fir.array<1x2xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<1x2xi32>>, !fir.ref<!fir.array<1x2xi32>>)
+! CHECK-DAG:     %[[RES:.*]]:2 = hlfir.declare %[[RES_ARG]](%[[RES_SHAPE:.*]]) dummy_scope %{{[0-9]+}} {[[NAME2:.*]]} : (!fir.ref<!fir.array<2x1xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<2x1xi32>>, !fir.ref<!fir.array<2x1xi32>>)
 ! CHECK:         %[[EXPR:.*]] = hlfir.transpose %[[ARG]]#0 : (!fir.ref<!fir.array<1x2xi32>>) -> !hlfir.expr<2x1xi32>
 ! CHECK-NEXT:    hlfir.assign %[[EXPR]] to %[[RES]]#0
 ! CHECK-NEXT:    hlfir.destroy %[[EXPR]]
@@ -38,7 +38,7 @@ endsubroutine
 ! CHECK:           %[[M_ARG:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>
 ! CHECK:           %[[RES_ARG:.*]]: !fir.ref<!fir.array<2x1xi32>>
 ! CHECK-DAG:     %[[ARG:.*]]:2 = hlfir.declare %[[M_ARG]]
-! CHECK-DAG:     %[[RES:.*]]:2 = hlfir.declare %[[RES_ARG]](%[[RES_SHAPE:.*]]) {[[NAME2:.*]]} : (!fir.ref<!fir.array<2x1xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<2x1xi32>>, !fir.ref<!fir.array<2x1xi32>>)
+! CHECK-DAG:     %[[RES:.*]]:2 = hlfir.declare %[[RES_ARG]](%[[RES_SHAPE:.*]]) dummy_scope %{{[0-9]+}} {[[NAME2:.*]]} : (!fir.ref<!fir.array<2x1xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<2x1xi32>>, !fir.ref<!fir.array<2x1xi32>>)
 ! CHECK:         %[[ARG_LOADED:.*]] = fir.load %[[ARG]]#0
 ! CHECK:         %[[EXPR:.*]] = hlfir.transpose %[[ARG_LOADED]] : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>) -> !hlfir.expr<?x?xi32>
 ! CHECK-NEXT:    hlfir.assign %[[EXPR]] to %[[RES]]#0
@@ -54,8 +54,8 @@ end subroutine test_polymorphic_result
 ! CHECK-LABEL:   func.func @_QPtest_polymorphic_result(
 ! CHECK-SAME:        %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> {fir.bindc_name = "m"},
 ! CHECK-SAME:        %[[VAL_1:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> {fir.bindc_name = "res"}) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_resultEm"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_resultEres"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_resultEm"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_resultEres"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>)
 ! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>
 ! CHECK:           %[[VAL_5:.*]] = hlfir.transpose %[[VAL_4]] : (!fir.class<!fir.heap<!fir.array<?x?xnone>>>) -> !hlfir.expr<?x?xnone?>
 ! CHECK:           hlfir.assign %[[VAL_5]] to %[[VAL_3]]#0 realloc : !hlfir.expr<?x?xnone?>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>
diff --git a/flang/test/Lower/HLFIR/unary-ops.f90 b/flang/test/Lower/HLFIR/unary-ops.f90
index db2c1ceefaa9..b04d6b4cf949 100644
--- a/flang/test/Lower/HLFIR/unary-ops.f90
+++ b/flang/test/Lower/HLFIR/unary-ops.f90
@@ -39,7 +39,7 @@ subroutine test_not(l, x)
   l = .not.x
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_not(
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:  %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.logical<4>>
 ! CHECK:  %[[VAL_5:.*]] = arith.constant true
 ! CHECK:  %[[VAL_6:.*]] = fir.convert %[[VAL_4]] : (!fir.logical<4>) -> i1
@@ -50,7 +50,7 @@ subroutine test_negate_int(res, x)
   res = -x
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_negate_int(
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:  %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
 ! CHECK:  %[[VAL_5:.*]] = arith.constant 0 : i32
 ! CHECK:  %[[VAL_6:.*]] = arith.subi %[[VAL_5]], %[[VAL_4]] : i32
@@ -60,7 +60,7 @@ subroutine test_negate_real(res, x)
   res = -x
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_negate_real(
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK:  %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<f32>
 ! CHECK:  %[[VAL_5:.*]] = arith.negf %[[VAL_4]] fastmath<contract> : f32
 
@@ -69,7 +69,7 @@ subroutine test_negate_complex(res, x)
   res = -x
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_negate_complex(
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
 ! CHECK:  %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.complex<4>>
 ! CHECK:  %[[VAL_5:.*]] = fir.negc %[[VAL_4]] : !fir.complex<4>
 
@@ -79,7 +79,7 @@ subroutine test_complex_component_real(res, x)
   res = real(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_complex_component_real(
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
 ! CHECK:  %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.complex<4>>
 ! CHECK:  %[[VAL_5:.*]] = fir.extract_value %[[VAL_4]], [0 : index] : (!fir.complex<4>) -> f32
 
@@ -89,6 +89,6 @@ subroutine test_complex_component_imag(res, x)
   res = aimag(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_complex_component_imag(
-! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+! CHECK:  %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}x"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
 ! CHECK:  %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.complex<4>>
 ! CHECK:  %[[VAL_5:.*]] = fir.extract_value %[[VAL_4]], [1 : index] : (!fir.complex<4>) -> f32
diff --git a/flang/test/Lower/HLFIR/user-defined-assignment.f90 b/flang/test/Lower/HLFIR/user-defined-assignment.f90
index 6f887cb00de3..f0e24f11c5ab 100644
--- a/flang/test/Lower/HLFIR/user-defined-assignment.f90
+++ b/flang/test/Lower/HLFIR/user-defined-assignment.f90
@@ -35,8 +35,8 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QMuser_defPtest_user_defined_elemental_array(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "i"},
 ! CHECK-SAME:    %[[VAL_1:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "l"}) {
-! CHECK:    %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QMuser_defFtest_user_defined_elemental_arrayEi"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-! CHECK:    %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QMuser_defFtest_user_defined_elemental_arrayEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
+! CHECK:    %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_arrayEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:    %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_arrayEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
 ! CHECK:    hlfir.region_assign {
 ! CHECK:      hlfir.yield %[[VAL_3]]#0 : !fir.box<!fir.array<?x!fir.logical<4>>>
 ! CHECK:    } to {
@@ -53,8 +53,8 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QMuser_defPtest_user_defined_elemental_array_value(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.complex<4>>> {fir.bindc_name = "z"},
 ! CHECK-SAME:    %[[VAL_1:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "l"}) {
-! CHECK:    %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QMuser_defFtest_user_defined_elemental_array_valueEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
-! CHECK:    %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QMuser_defFtest_user_defined_elemental_array_valueEz"} : (!fir.box<!fir.array<?x!fir.complex<4>>>) -> (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.box<!fir.array<?x!fir.complex<4>>>)
+! CHECK:    %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_array_valueEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
+! CHECK:    %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_array_valueEz"} : (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.complex<4>>>, !fir.box<!fir.array<?x!fir.complex<4>>>)
 ! CHECK:    hlfir.region_assign {
 ! CHECK:      hlfir.yield %[[VAL_2]]#0 : !fir.box<!fir.array<?x!fir.logical<4>>>
 ! CHECK:    } to {
@@ -72,8 +72,8 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QMuser_defPtest_user_defined_scalar(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
 ! CHECK-SAME:    %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "l"}) {
-! CHECK:    %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QMuser_defFtest_user_defined_scalarEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:    %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QMuser_defFtest_user_defined_scalarEl"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:    %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_scalarEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:    %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_scalarEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:    hlfir.region_assign {
 ! CHECK:      %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.logical<4>>
 ! CHECK:      hlfir.yield %[[VAL_4]] : !fir.logical<4>
@@ -91,7 +91,7 @@ subroutine test_non_elemental_array(x)
 end subroutine
 ! CHECK-LABEL:   func.func @_QMuser_defPtest_non_elemental_array(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
-! CHECK:    %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QMuser_defFtest_non_elemental_arrayEx"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:    %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_non_elemental_arrayEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:    hlfir.region_assign {
 ! CHECK:      %[[VAL_2:.*]] = arith.constant 4.200000e+01 : f32
 ! CHECK:      %[[VAL_3:.*]] = arith.constant 0 : index
@@ -126,9 +126,9 @@ end subroutine
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "i"},
 ! CHECK-SAME:    %[[VAL_1:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "l"},
 ! CHECK-SAME:    %[[VAL_2:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "l2"}) {
-! CHECK:    %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEi"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-! CHECK:    %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
-! CHECK:    %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEl2"} : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
+! CHECK:    %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:    %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
+! CHECK:    %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEl2"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
 ! CHECK:    hlfir.where {
 ! CHECK:      hlfir.yield %[[VAL_4]]#0 : !fir.box<!fir.array<?x!fir.logical<4>>>
 ! CHECK:    } do {
@@ -171,11 +171,11 @@ end subroutine
 ! CHECK:    %[[VAL_2:.*]] = arith.constant 20 : index
 ! CHECK:    %[[VAL_3:.*]] = arith.constant 10 : index
 ! CHECK:    %[[VAL_4:.*]] = fir.shape %[[VAL_2]], %[[VAL_3]] : (index, index) -> !fir.shape<2>
-! CHECK:    %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QMuser_defFtest_forall_user_def_assignmentEi"} : (!fir.ref<!fir.array<20x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<20x10xi32>>, !fir.ref<!fir.array<20x10xi32>>)
+! CHECK:    %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignmentEi"} : (!fir.ref<!fir.array<20x10xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10xi32>>, !fir.ref<!fir.array<20x10xi32>>)
 ! CHECK:    %[[VAL_6:.*]] = arith.constant 20 : index
 ! CHECK:    %[[VAL_7:.*]] = arith.constant 10 : index
 ! CHECK:    %[[VAL_8:.*]] = fir.shape %[[VAL_6]], %[[VAL_7]] : (index, index) -> !fir.shape<2>
-! CHECK:    %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) {uniq_name = "_QMuser_defFtest_forall_user_def_assignmentEl"} : (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.shape<2>) -> (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.ref<!fir.array<20x10x!fir.logical<4>>>)
+! CHECK:    %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignmentEl"} : (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.ref<!fir.array<20x10x!fir.logical<4>>>)
 ! CHECK:    %[[VAL_10:.*]] = arith.constant 1 : i32
 ! CHECK:    %[[VAL_11:.*]] = arith.constant 10 : i32
 ! CHECK:    hlfir.forall lb {
@@ -218,11 +218,11 @@ end subroutine
 ! CHECK:    %[[VAL_2:.*]] = arith.constant 20 : index
 ! CHECK:    %[[VAL_3:.*]] = arith.constant 10 : index
 ! CHECK:    %[[VAL_4:.*]] = fir.shape %[[VAL_2]], %[[VAL_3]] : (index, index) -> !fir.shape<2>
-! CHECK:    %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_4]]) {uniq_name = "_QMuser_defFtest_forall_user_def_assignment_non_elemental_arrayEl"} : (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.shape<2>) -> (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.ref<!fir.array<20x10x!fir.logical<4>>>)
+! CHECK:    %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignment_non_elemental_arrayEl"} : (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.ref<!fir.array<20x10x!fir.logical<4>>>)
 ! CHECK:    %[[VAL_6:.*]] = arith.constant 20 : index
 ! CHECK:    %[[VAL_7:.*]] = arith.constant 10 : index
 ! CHECK:    %[[VAL_8:.*]] = fir.shape %[[VAL_6]], %[[VAL_7]] : (index, index) -> !fir.shape<2>
-! CHECK:    %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_8]]) {uniq_name = "_QMuser_defFtest_forall_user_def_assignment_non_elemental_arrayEx"} : (!fir.ref<!fir.array<20x10xf32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<20x10xf32>>, !fir.ref<!fir.array<20x10xf32>>)
+! CHECK:    %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignment_non_elemental_arrayEx"} : (!fir.ref<!fir.array<20x10xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10xf32>>, !fir.ref<!fir.array<20x10xf32>>)
 ! CHECK:    %[[VAL_10:.*]] = arith.constant 1 : i32
 ! CHECK:    %[[VAL_11:.*]] = arith.constant 10 : i32
 ! CHECK:    hlfir.forall lb {
@@ -269,8 +269,8 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QMuser_defPtest_pointer(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {fir.bindc_name = "p"},
 ! CHECK-SAME:    %[[VAL_1:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"}) {
-! CHECK:    %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMuser_defFtest_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
-! CHECK:    %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QMuser_defFtest_pointerEx"} : (!fir.box<!fir.array<?x?xf32>>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+! CHECK:    %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMuser_defFtest_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+! CHECK:    %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_pointerEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
 ! CHECK:    hlfir.region_assign {
 ! CHECK:      hlfir.yield %[[VAL_3]]#0 : !fir.box<!fir.array<?x?xf32>>
 ! CHECK:    } to {
@@ -287,8 +287,8 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QMuser_defPtest_allocatable(
 ! CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {fir.bindc_name = "a"},
 ! CHECK-SAME:    %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
-! CHECK:    %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMuser_defFtest_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
-! CHECK:    %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QMuser_defFtest_allocatableEx"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:    %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMuser_defFtest_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
+! CHECK:    %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_allocatableEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:    hlfir.region_assign {
 ! CHECK:      hlfir.yield %[[VAL_3]]#0 : !fir.box<!fir.array<?xf32>>
 ! CHECK:    } to {
@@ -313,7 +313,7 @@ end subroutine test_char_get_length
 ! CHECK-LABEL:   func.func @_QPtest_char_get_length(
 ! CHECK-SAME:                                       %[[VAL_0:.*]]: !fir.boxchar<1> {fir.bindc_name = "ch"}) {
 ! CHECK:           %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 {uniq_name = "_QFtest_char_get_lengthEch"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_char_get_lengthEch"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtest_char_get_lengthEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtest_char_get_lengthEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           hlfir.region_assign {
diff --git a/flang/test/Lower/HLFIR/vector-subscript-as-value.f90 b/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
index ee8ded197c95..7161ee088b57 100644
--- a/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
+++ b/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
@@ -68,7 +68,7 @@ subroutine foo3(x, y)
   call bar2(x(1:8:2, 5, y))
 end subroutine
 ! CHECK-LABEL:   func.func @_QPfoo3(
-! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFfoo3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>)
+! CHECK:  %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFfoo3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>)
 ! CHECK:  %[[VAL_3:.*]] = arith.constant 20 : index
 ! CHECK:  %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
 ! CHECK:  %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_4:[a-z0-9]*]])  {{.*}}Ey
@@ -196,8 +196,8 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_passing_subscripted_poly(
 ! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.class<!fir.array<?x?xnone>>
 ! CHECK-SAME:                                                %[[VAL_1:.*]]: !fir.box<!fir.array<?xi64>>
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_passing_subscripted_polyEvector"} : (!fir.box<!fir.array<?xi64>>) -> (!fir.box<!fir.array<?xi64>>, !fir.box<!fir.array<?xi64>>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_passing_subscripted_polyEx"} : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_passing_subscripted_polyEvector"} : (!fir.box<!fir.array<?xi64>>, !fir.dscope) -> (!fir.box<!fir.array<?xi64>>, !fir.box<!fir.array<?xi64>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_passing_subscripted_polyEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
 ! CHECK:           %[[VAL_4:.*]] = arith.constant 314 : index
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 0 : index
 ! CHECK:           %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_2]]#0, %[[VAL_5]] : (!fir.box<!fir.array<?xi64>>, index) -> (index, index, index)
diff --git a/flang/test/Lower/Intrinsics/associated-proc-pointers.f90 b/flang/test/Lower/Intrinsics/associated-proc-pointers.f90
index 1772b9afdfc0..e07e61b7e597 100644
--- a/flang/test/Lower/Intrinsics/associated-proc-pointers.f90
+++ b/flang/test/Lower/Intrinsics/associated-proc-pointers.f90
@@ -9,7 +9,7 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_proc_pointer_1(
 ! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
 ! CHECK-SAME:                                      %[[VAL_1:.*]]: !fir.boxproc<() -> ()>) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_1Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_1Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
 ! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.boxproc<() -> ()>>
 ! CHECK:           %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ())
 ! CHECK:           %[[VAL_5:.*]] = fir.box_addr %[[VAL_1]] : (!fir.boxproc<() -> ()>) -> (() -> ())
@@ -28,8 +28,8 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_proc_pointer_2(
 ! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
 ! CHECK-SAME:                                      %[[VAL_1:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep_target"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep_target"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
 ! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.boxproc<() -> ()>>
 ! CHECK:           %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.boxproc<() -> ()>) -> (() -> ())
 ! CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<!fir.boxproc<() -> ()>>
@@ -50,7 +50,7 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_proc_pointer_3(
 ! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
 ! CHECK-SAME:                                      %[[VAL_1:.*]]: !fir.boxproc<() -> ()>) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_3Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_3Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
 ! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.boxproc<() -> ()>>
 ! CHECK:           %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ())
 ! CHECK:           %[[VAL_5:.*]] = fir.box_addr %[[VAL_1]] : (!fir.boxproc<() -> ()>) -> (() -> ())
@@ -69,7 +69,7 @@ subroutine test_proc_pointer_4(p)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_proc_pointer_4(
 ! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_4Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_4Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.address_of(@_QPsome_external) : () -> ()
 ! CHECK:           %[[VAL_3:.*]] = fir.emboxproc %[[VAL_2]] : (() -> ()) -> !fir.boxproc<() -> ()>
 ! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_1]]#1 : !fir.ref<!fir.boxproc<() -> ()>>
@@ -95,7 +95,7 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_proc_pointer_5(
 ! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
 ! CHECK-SAME:                                      %[[VAL_1:.*]]: tuple<!fir.boxproc<() -> ()>, i64> {fir.char_proc}) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_5Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_5Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
 ! CHECK:           %[[VAL_3:.*]] = fir.extract_value %[[VAL_1]], [0 : index] : (tuple<!fir.boxproc<() -> ()>, i64>) -> !fir.boxproc<() -> ()>
 ! CHECK:           %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ())
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 10 : i64
diff --git a/flang/test/Lower/Intrinsics/c_f_procpointer.f90 b/flang/test/Lower/Intrinsics/c_f_procpointer.f90
index f70a56c91b91..f8792e4c1be0 100644
--- a/flang/test/Lower/Intrinsics/c_f_procpointer.f90
+++ b/flang/test/Lower/Intrinsics/c_f_procpointer.f90
@@ -10,8 +10,8 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_c_funloc(
 ! CHECK-SAME:                                %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
 ! CHECK-SAME:                                %[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>> {fir.bindc_name = "cptr"}) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_c_funlocEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c_funlocEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
 ! CHECK:           %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
 ! CHECK:           %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_2]]#1, %[[VAL_4]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
 ! CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
@@ -32,8 +32,8 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_c_funloc_char(
 ! CHECK-SAME:                                     %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
 ! CHECK-SAME:                                     %[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>> {fir.bindc_name = "cptr"}) {
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_c_funloc_charEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c_funloc_charEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
 ! CHECK:           %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
 ! CHECK:           %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_2]]#1, %[[VAL_4]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
 ! CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
diff --git a/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90 b/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90
index c9578b17ac52..0f398a346d45 100644
--- a/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90
+++ b/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90
@@ -8,7 +8,7 @@ subroutine test_c_funloc(p)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_c_funloc(
 ! CHECK-SAME:                                %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEp"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
 ! CHECK:           %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
@@ -28,7 +28,7 @@ subroutine test_c_funloc_char(p)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_c_funloc_char(
 ! CHECK-SAME:                                     %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEp"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
 ! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
 ! CHECK:           %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
diff --git a/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90 b/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90
index 38468739ead5..c6a2f186e4c1 100644
--- a/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90
+++ b/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90
@@ -10,8 +10,8 @@ end
 
 ! CHECK-LABEL: func.func @_QPtest_c_ptr_eq(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr1"}, %[[ARG1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr2"}) -> !fir.logical<4> {
-! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
-! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
 ! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4> {bindc_name = "test_c_ptr_eq", uniq_name = "_QFtest_c_ptr_eqEtest_c_ptr_eq"}
 ! CHECK: %[[DECL_RET:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFtest_c_ptr_eqEtest_c_ptr_eq"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK: %[[FIELD_ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
@@ -37,8 +37,8 @@ end
 
 ! CHECK-LABEL: func.func @_QPtest_c_ptr_ne(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr1"}, %[[ARG1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr2"}) -> !fir.logical<4> {
-! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
-! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
 ! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4> {bindc_name = "test_c_ptr_ne", uniq_name = "_QFtest_c_ptr_neEtest_c_ptr_ne"}
 ! CHECK: %[[DECL_RET:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFtest_c_ptr_neEtest_c_ptr_ne"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK: %[[FIELD_ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
diff --git a/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 b/flang/test/Lower/Intrinsics/execute_command_line-optional.f90
index 0b75a20216af..e4f9a241197c 100644
--- a/flang/test/Lower/Intrinsics/execute_command_line-optional.f90
+++ b/flang/test/Lower/Intrinsics/execute_command_line-optional.f90
@@ -15,14 +15,15 @@ subroutine all_args_optional(command, isWait, exitVal, cmdVal, msg)
 ! CHECK-NEXT:    %[[c14:.*]] = arith.constant 14 : i32 
 ! CHECK-NEXT:    %true = arith.constant true 
 ! CHECK-NEXT:    %[[c0:.*]] = arith.constant 0 : i64 
-! CHECK-NEXT:    %[[cmdstatDeclare:.*]] = fir.declare %[[cmdstatArg]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEcmdval"} : (!fir.ref<i32>) -> !fir.ref<i32>
+! CHECK-NEXT:    %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK-NEXT:    %[[cmdstatDeclare:.*]] = fir.declare %[[cmdstatArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEcmdval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
 ! CHECK-NEXT:    %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK-NEXT:    %[[commandDeclare:.*]] = fir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEcommand"} : (!fir.ref<!fir.char<1,?>>, index) -> !fir.ref<!fir.char<1,?>>
+! CHECK-NEXT:    %[[commandDeclare:.*]] = fir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,?>>
 ! CHECK-NEXT:    %[[commandBoxTemp:.*]] = fir.emboxchar %[[commandDeclare]], %[[commandUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
-! CHECK-NEXT:    %[[exitstatDeclare:.*]] = fir.declare %[[exitstatArg]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEexitval"} : (!fir.ref<i32>) -> !fir.ref<i32>
-! CHECK-NEXT:    %[[waitDeclare:.*]] = fir.declare %[[waitArg]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEiswait"} : (!fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>>
+! CHECK-NEXT:    %[[exitstatDeclare:.*]] = fir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEexitval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+! CHECK-NEXT:    %[[waitDeclare:.*]] = fir.declare %[[waitArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEiswait"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> !fir.ref<!fir.logical<4>>
 ! CHECK-NEXT:    %[[cmdmsgUnbox:.*]]:2 = fir.unboxchar %[[cmdmsgArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK-NEXT:    %[[cmdmsgDeclare:.*]] = fir.declare %[[cmdmsgUnbox]]#0 typeparams %[[cmdmsgUnbox]]#1 {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEmsg"} : (!fir.ref<!fir.char<1,?>>, index) -> !fir.ref<!fir.char<1,?>>
+! CHECK-NEXT:    %[[cmdmsgDeclare:.*]] = fir.declare %[[cmdmsgUnbox]]#0 typeparams %[[cmdmsgUnbox]]#1 dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEmsg"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,?>>
 ! CHECK-NEXT:    %[[cmdmsgBoxTemp:.*]] = fir.emboxchar %[[cmdmsgDeclare]], %[[cmdmsgUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
 ! CHECK-NEXT:    %[[exitstatIsPresent:.*]] = fir.is_present %[[exitstatDeclare]] : (!fir.ref<i32>) -> i1
 ! CHECK-NEXT:    %[[cmdstatIsPresent:.*]] = fir.is_present %[[cmdstatDeclare]] : (!fir.ref<i32>) -> i1
diff --git a/flang/test/Lower/Intrinsics/execute_command_line.f90 b/flang/test/Lower/Intrinsics/execute_command_line.f90
index 8aacd34346b4..6bde50e807b2 100644
--- a/flang/test/Lower/Intrinsics/execute_command_line.f90
+++ b/flang/test/Lower/Intrinsics/execute_command_line.f90
@@ -15,15 +15,16 @@ call execute_command_line(command, isWait, exitVal, cmdVal, msg)
 ! CHECK-NEXT:        %true = arith.constant true 
 ! CHECK-NEXT:        %[[c0:.*]] = arith.constant 0 : i64 
 ! CHECK-NEXT:        %[[c30:.*]] = arith.constant 30 : index
-! CHECK-NEXT:        %[[cmdstatsDeclare:.*]] = fir.declare %[[cmdstatArg]] {uniq_name = "_QFall_argsEcmdval"} : (!fir.ref<i32>) -> !fir.ref<i32>
+! CHECK-NEXT:        %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK-NEXT:        %[[cmdstatsDeclare:.*]] = fir.declare %[[cmdstatArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEcmdval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
 ! CHECK-NEXT:        %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
 ! CHECK-NEXT:        %[[commandCast:.*]] = fir.convert %[[commandUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,30>>
-! CHECK-NEXT:        %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] {uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,30>>, index) -> !fir.ref<!fir.char<1,30>>
-! CHECK-NEXT:        %[[exitstatDeclare:.*]] = fir.declare %[[exitstatArg]] {uniq_name = "_QFall_argsEexitval"} : (!fir.ref<i32>) -> !fir.ref<i32>
-! CHECK-NEXT:        %[[waitDeclare:.*]] = fir.declare %[[waitArg]] {uniq_name = "_QFall_argsEiswait"} : (!fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>>
+! CHECK-NEXT:        %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
+! CHECK-NEXT:        %[[exitstatDeclare:.*]] = fir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEexitval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+! CHECK-NEXT:        %[[waitDeclare:.*]] = fir.declare %[[waitArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEiswait"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> !fir.ref<!fir.logical<4>>
 ! CHECK-NEXT:        %[[cmdmsgUnbox:.*]]:2 = fir.unboxchar %[[cmdmsgArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
 ! CHECK-NEXT:        %[[cmdmsgCast:.*]] = fir.convert %[[cmdmsgUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,30>>
-! CHECK-NEXT:        %[[cmdmsgDeclare:.*]] = fir.declare %[[cmdmsgCast]] typeparams %[[c30]] {uniq_name = "_QFall_argsEmsg"} : (!fir.ref<!fir.char<1,30>>, index) -> !fir.ref<!fir.char<1,30>>
+! CHECK-NEXT:        %[[cmdmsgDeclare:.*]] = fir.declare %[[cmdmsgCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEmsg"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
 ! CHECK-NEXT:        %[[commandBox:.*]] = fir.embox %[[commandDeclare]] : (!fir.ref<!fir.char<1,30>>) -> !fir.box<!fir.char<1,30>>
 ! CHECK-NEXT:        %[[exitstatBox:.*]] = fir.embox %[[exitstatDeclare]] : (!fir.ref<i32>) -> !fir.box<i32>
 ! CHECK-NEXT:        %[[cmdstatBox:.*]] = fir.embox %[[cmdstatsDeclare]] : (!fir.ref<i32>) -> !fir.box<i32>
@@ -50,12 +51,13 @@ end subroutine all_args
 subroutine only_command_default_wait_true(command)
 CHARACTER(30) :: command
 call execute_command_line(command)
-! CHECK-NEXT:     %[[c52:.*]] = arith.constant 52 : i32 
+! CHECK-NEXT:     %[[c52:.*]] = arith.constant 53 : i32 
 ! CHECK-NEXT:     %true = arith.constant true 
 ! CHECK-NEXT:     %[[c30:.*]] = arith.constant 30 : index
+! CHECK-NEXT:        %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK-NEXT:     %[[commandUnbox:.*]]:2 = fir.unboxchar %[[cmdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
 ! CHECK-NEXT:     %[[commandCast:.*]] = fir.convert %[[commandUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,30>>
-! CHECK-NEXT:     %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] {uniq_name = "_QFonly_command_default_wait_trueEcommand"} : (!fir.ref<!fir.char<1,30>>, index) -> !fir.ref<!fir.char<1,30>>
+! CHECK-NEXT:     %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFonly_command_default_wait_trueEcommand"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
 ! CHECK-NEXT:     %[[commandBox:.*]] = fir.embox %[[commandDeclare]] : (!fir.ref<!fir.char<1,30>>) -> !fir.box<!fir.char<1,30>>
 ! CHECK-NEXT:     %[[absent:.*]] = fir.absent !fir.box<none>
 ! CHECK:          %[[command:.*]] = fir.convert %[[commandBox]] : (!fir.box<!fir.char<1,30>>) -> !fir.box<none> 
diff --git a/flang/test/Lower/Intrinsics/ieee_logb.f90 b/flang/test/Lower/Intrinsics/ieee_logb.f90
index df15661d51b2..4195ac7af245 100644
--- a/flang/test/Lower/Intrinsics/ieee_logb.f90
+++ b/flang/test/Lower/Intrinsics/ieee_logb.f90
@@ -9,7 +9,7 @@ subroutine out(x)
   ! CHECK:     %[[V_61:[0-9]+]] = fir.declare %[[V_60]] {uniq_name = "_QFoutEl"} : (!fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>>
   ! CHECK:     %[[V_62:[0-9]+]] = fir.alloca f64 {bindc_name = "r", uniq_name = "_QFoutEr"}
   ! CHECK:     %[[V_63:[0-9]+]] = fir.declare %[[V_62]] {uniq_name = "_QFoutEr"} : (!fir.ref<f64>) -> !fir.ref<f64>
-  ! CHECK:     %[[V_64:[0-9]+]] = fir.declare %arg0 {uniq_name = "_QFoutEx"} : (!fir.ref<f64>) -> !fir.ref<f64>
+  ! CHECK:     %[[V_64:[0-9]+]] = fir.declare %arg0 dummy_scope %{{[0-9]+}} {uniq_name = "_QFoutEx"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
   real(k) :: x, r
   logical :: L
 
diff --git a/flang/test/Lower/Intrinsics/product.f90 b/flang/test/Lower/Intrinsics/product.f90
index e7f7c0d39ee0..ddefa7a37184 100644
--- a/flang/test/Lower/Intrinsics/product.f90
+++ b/flang/test/Lower/Intrinsics/product.f90
@@ -58,7 +58,7 @@ product_test4 = product(x)
 ! CHECK-DAG: %[[a5:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?x!fir.complex<10>>>) -> !fir.box<none>
 ! CHECK-DAG:  %[[a7:.*]] = fir.convert %[[c0]] : (index) -> i32
 ! CHECK-DAG:  %[[a8:.*]] = fir.convert %[[a2]] : (!fir.box<i1>) -> !fir.box<none>
-! CHECK: fir.call @_FortranACppProductComplex10(%[[a4]], %[[a5]], %{{.*}}, %{{.*}}, %[[a7]], %8) {{.*}}: (!fir.ref<complex<f80>>, !fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> ()
+! CHECK: fir.call @_FortranACppProductComplex10(%[[a4]], %[[a5]], %{{.*}}, %{{.*}}, %[[a7]], %{{[0-9]+}}) {{.*}}: (!fir.ref<complex<f80>>, !fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> ()
 end
 
 ! CHECK-LABEL: func @_QPproduct_test_optional(
diff --git a/flang/test/Lower/Intrinsics/signal.f90 b/flang/test/Lower/Intrinsics/signal.f90
index d6678000677e..5d20bb5c5c07 100644
--- a/flang/test/Lower/Intrinsics/signal.f90
+++ b/flang/test/Lower/Intrinsics/signal.f90
@@ -23,7 +23,7 @@ contains
     integer, optional, intent(out) :: optional_status
 
 ! CHECK:           %[[VAL_1:.*]] = fir.alloca i32
-! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_out, optional>, uniq_name = "_QMmFsetup_signalsEoptional_status"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out, optional>, uniq_name = "_QMmFsetup_signalsEoptional_status"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_14:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QMmFsetup_signalsEstat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 
     call signal(SIGFPE, handler)
diff --git a/flang/test/Lower/Intrinsics/sizeof.f90 b/flang/test/Lower/Intrinsics/sizeof.f90
index e10cb79981a6..7e749f096112 100644
--- a/flang/test/Lower/Intrinsics/sizeof.f90
+++ b/flang/test/Lower/Intrinsics/sizeof.f90
@@ -6,7 +6,7 @@ integer(8) function test1(x)
   test1 = sizeof(x)
 end function
 ! CHECK-LABEL:   func.func @_QPtest1(
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest1Ex"} : (!fir.class<none>) -> (!fir.class<none>, !fir.class<none>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>)
 ! CHECK:           %[[VAL_4:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.class<none>) -> i64
 ! CHECK:           hlfir.assign %[[VAL_4]] to %{{.*}} : i64, !fir.ref<i64>
 
@@ -15,7 +15,7 @@ integer(8) function test2(x)
   test2 = sizeof(x)
 end function
 ! CHECK-LABEL:   func.func @_QPtest2(
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest2Ex"} : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
 ! CHECK:           %[[VAL_4:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.class<!fir.array<?x?xnone>>) -> i64
 ! CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.class<!fir.array<?x?xnone>>) -> !fir.box<none>
 ! CHECK:           %[[VAL_9:.*]] = fir.call @_FortranASize(%[[VAL_7]], %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i64
diff --git a/flang/test/Lower/Intrinsics/sum.f90 b/flang/test/Lower/Intrinsics/sum.f90
index cafcc0828df8..696892d29126 100644
--- a/flang/test/Lower/Intrinsics/sum.f90
+++ b/flang/test/Lower/Intrinsics/sum.f90
@@ -58,7 +58,7 @@ sum_test4 = sum(x)
 ! CHECK-DAG: %[[a5:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?x!fir.complex<10>>>) -> !fir.box<none>
 ! CHECK-DAG:  %[[a7:.*]] = fir.convert %[[c0]] : (index) -> i32
 ! CHECK-DAG:  %[[a8:.*]] = fir.convert %[[a2]] : (!fir.box<i1>) -> !fir.box<none>
-! CHECK: fir.call @_FortranACppSumComplex10(%[[a4]], %[[a5]], %{{.*}}, %{{.*}}, %[[a7]], %8) {{.*}}: (!fir.ref<complex<f80>>, !fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> ()
+! CHECK: fir.call @_FortranACppSumComplex10(%[[a4]], %[[a5]], %{{.*}}, %{{.*}}, %[[a7]], %{{[0-9]+}}) {{.*}}: (!fir.ref<complex<f80>>, !fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> ()
 end
 
 ! CHECK-LABEL: func @_QPsum_test_optional(
diff --git a/flang/test/Lower/Intrinsics/system-optional.f90 b/flang/test/Lower/Intrinsics/system-optional.f90
index 5047437c5c3c..8001e76fb93b 100644
--- a/flang/test/Lower/Intrinsics/system-optional.f90
+++ b/flang/test/Lower/Intrinsics/system-optional.f90
@@ -9,17 +9,17 @@ INTEGER, OPTIONAL :: exitstat
 call system(command, exitstat)
 
 ! CHECK-NEXT:    %[[cmdstatVal:.*]] = fir.alloca i16
+! CHECK-NEXT:    %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK-NEXT:    %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK-NEXT:    %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
-! CHECK-NEXT:    %[[exitstatDeclare:.*]]:2 = hlfir.declare %[[exitstatArg]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_argsEexitstat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-NEXT:    %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK-NEXT:    %[[exitstatDeclare:.*]]:2 = hlfir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_argsEexitstat"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK-NEXT:    %[[exitstatIsPresent:.*]] = fir.is_present %[[exitstatDeclare]]#0 : (!fir.ref<i32>) -> i1
 ! CHECK-NEXT:    %[[commandBox:.*]] = fir.embox %[[commandDeclare]]#1 typeparams %[[commandUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
 ! CHECK-NEXT:    %[[exitstatBox:.*]] = fir.embox %[[exitstatDeclare]]#1 : (!fir.ref<i32>) -> !fir.box<i32>
 ! CHECK-NEXT:    %[[absentIntBox:.*]] = fir.absent !fir.box<i32>
 ! CHECK-NEXT:    %[[exitstatRealBox:.*]] = arith.select %[[exitstatIsPresent]], %[[exitstatBox]], %[[absentIntBox]] : !fir.box<i32>
 ! CHECK-NEXT:    %[[true:.*]] = arith.constant true
-! CHECK-NEXT:    %[[c0_i2:.*]] = arith.constant 0 : i2
-! CHECK-NEXT:    %[[c0_i16:.*]] = fir.convert %[[c0_i2]] : (i2) -> i16
+! CHECK-NEXT:    %[[c0_i16:.*]] = arith.constant 0 : i16
 ! CHECK-NEXT:    fir.store %[[c0_i16]] to %[[cmdstatVal]] : !fir.ref<i16>
 ! CHECK-NEXT:    %[[cmdstatBox:.*]] = fir.embox %[[cmdstatVal]] : (!fir.ref<i16>) -> !fir.box<i16>
 ! CHECK-NEXT:    %[[absentBox:.*]] = fir.absent !fir.box<none>
diff --git a/flang/test/Lower/Intrinsics/system.f90 b/flang/test/Lower/Intrinsics/system.f90
index 0cafc0b2a9cf..71655938113f 100644
--- a/flang/test/Lower/Intrinsics/system.f90
+++ b/flang/test/Lower/Intrinsics/system.f90
@@ -8,14 +8,14 @@ CHARACTER(*) :: command
 INTEGER :: exitstat
 call system(command, exitstat)
 ! CHECK-NEXT:   %[[cmdstatVal:.*]] = fir.alloca i16
+! CHECK-NEXT:   %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK-NEXT:   %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK-NEXT:   %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 {uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
-! CHECK-NEXT:   %[[exitstatDeclare:.*]]:2 = hlfir.declare %[[exitstatArg]] {uniq_name = "_QFall_argsEexitstat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-NEXT:   %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK-NEXT:   %[[exitstatDeclare:.*]]:2 = hlfir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEexitstat"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK-NEXT:   %[[commandBox:.*]] = fir.embox %[[commandDeclare]]#1 typeparams %[[commandUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
 ! CHECK-NEXT:   %[[exitstatBox:.*]] = fir.embox %[[exitstatDeclare]]#1 : (!fir.ref<i32>) -> !fir.box<i32>
 ! CHECK-NEXT:   %[[true:.*]] = arith.constant true
-! CHECK-NEXT:   %[[c0_i2:.*]] = arith.constant 0 : i2
-! CHECK-NEXT:   %[[c0_i16:.*]] = fir.convert %[[c0_i2]] : (i2) -> i16
+! CHECK-NEXT:   %[[c0_i16:.*]] = arith.constant 0 : i16
 ! CHECK-NEXT:   fir.store %[[c0_i16]] to %[[cmdstatVal]] : !fir.ref<i16>
 ! CHECK-NEXT:   %[[cmdstatBox:.*]] = fir.embox %[[cmdstatVal]] : (!fir.ref<i16>) -> !fir.box<i16>
 ! CHECK-NEXT:   %[[absentBox:.*]] = fir.absent !fir.box<none>
@@ -34,17 +34,17 @@ subroutine only_command(command)
 CHARACTER(*) :: command
 call system(command)
 ! CHECK-NEXT:   %[[cmdstatVal:.*]] = fir.alloca i16
+! CHECK-NEXT:   %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK-NEXT:   %[[commandUnbox:.*]]:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK-NEXT:   %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 {uniq_name = "_QFonly_commandEcommand"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK-NEXT:   %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] {uniq_name = "_QFonly_commandEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK-NEXT:   %[[commandBox:.*]] = fir.embox %[[commandDeclare]]#1 typeparams %[[commandUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
 ! CHECK-NEXT:   %[[true:.*]] = arith.constant true
 ! CHECK-NEXT:   %[[absentBox:.*]] = fir.absent !fir.box<none>
-! CHECK-NEXT:   %[[c0_i2:.*]] = arith.constant 0 : i2
-! CHECK-NEXT:   %[[c0_i16:.*]] = fir.convert %[[c0_i2]] : (i2) -> i16
+! CHECK-NEXT:   %[[c0_i16:.*]] = arith.constant 0 : i16
 ! CHECK-NEXT:   fir.store %[[c0_i16]] to %[[cmdstatVal]] : !fir.ref<i16>
 ! CHECK-NEXT:   %[[cmdstatBox:.*]] = fir.embox %[[cmdstatVal]] : (!fir.ref<i16>) -> !fir.box<i16>
 ! CHECK-NEXT:   %[[absentBox2:.*]] = fir.absent !fir.box<none>
-! CHECK:        %[[c35_i32:.*]] = arith.constant 35 : i32
+! CHECK:        %[[c35_i32:.*]] = arith.constant {{[0-9]+}} : i32
 ! CHECK-NEXT:   %[[command:.*]] = fir.convert %[[commandBox]] : (!fir.box<!fir.char<1,?>>) -> !fir.box<none>
 ! CHECK-NEXT:   %[[cmdstat:.*]] = fir.convert %[[cmdstatBox]] : (!fir.box<i16>) -> !fir.box<none>
 ! CHECK:        %[[VAL_12:.*]] = fir.call @_FortranAExecuteCommandLine(%[[command]], %[[true]], %[[absentBox]], %[[cmdstat]], %[[absentBox2]], %[[VAL_11:.*]], %[[c35_i32]]) fastmath<contract> : (!fir.box<none>, i1, !fir.box<none>, !fir.box<none>, !fir.box<none>, !fir.ref<i8>, i32) -> none
diff --git a/flang/test/Lower/OpenACC/acc-atomic-update-array.f90 b/flang/test/Lower/OpenACC/acc-atomic-update-array.f90
index e36c39c830ec..eeb7ea299408 100644
--- a/flang/test/Lower/OpenACC/acc-atomic-update-array.f90
+++ b/flang/test/Lower/OpenACC/acc-atomic-update-array.f90
@@ -20,8 +20,8 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPatomic_update_array1(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "r"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}, %[[ARG2:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) {
-! CHECK: %[[DECL_ARG2:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFatomic_update_array1Ex"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) {uniq_name = "_QFatomic_update_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[DECL_ARG2:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_update_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_update_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
 ! CHECK: %[[ARRAY_REF:.*]] = hlfir.designate %[[DECL_ARG0]]#0 (%{{.*}})  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
 ! CHECK: %[[LOAD_X:.*]] = fir.load %[[DECL_ARG2]]#0 : !fir.ref<f32>
 ! CHECK: acc.atomic.update %[[ARRAY_REF]] : !fir.ref<f32> {
@@ -42,8 +42,8 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPatomic_read_array1(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "r"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}, %[[ARG2:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) {
-! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFatomic_read_array1Ex"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) {uniq_name = "_QFatomic_read_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_read_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_read_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
 ! CHECK: %[[DES:.*]] = hlfir.designate %[[DECL_R]]#0 (%{{.*}})  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
 ! CHECK: acc.atomic.read %[[DECL_X]]#1 = %[[DES]] : !fir.ref<f32>, f32
 
@@ -58,8 +58,8 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPatomic_write_array1(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "r"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}, %[[ARG2:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) {
-! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFatomic_write_array1Ex"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) {uniq_name = "_QFatomic_write_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_write_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_write_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
 ! CHECK: %[[DES:.*]] = hlfir.designate %[[DECL_R]]#0 (%{{.*}})  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
 ! CHECK: %[[LOAD:.*]] = fir.load %[[DES]] : !fir.ref<f32> 
 ! CHECK: acc.atomic.write %[[DECL_X]]#1 = %[[LOAD]] : !fir.ref<f32>, f32
@@ -77,9 +77,9 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPatomic_capture_array1(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "r"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}, %[[ARG2:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}, %[[ARG3:.*]]: !fir.ref<f32> {fir.bindc_name = "y"}) {
-! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFatomic_capture_array1Ex"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[DECL_Y:.*]]:2 = hlfir.declare %[[ARG3]] {uniq_name = "_QFatomic_capture_array1Ey"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) {uniq_name = "_QFatomic_capture_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[DECL_Y:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
 ! CHECK: %[[R_I:.*]] = hlfir.designate %[[DECL_R]]#0 (%{{.*}})  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
 ! CHECK: %[[LOAD:.*]] = fir.load %[[DECL_X]]#0 : !fir.ref<f32>
 ! CHECK: acc.atomic.capture {
diff --git a/flang/test/Lower/OpenACC/acc-bounds.f90 b/flang/test/Lower/OpenACC/acc-bounds.f90
index c275d4f1b1d5..a83de91a67ae 100644
--- a/flang/test/Lower/OpenACC/acc-bounds.f90
+++ b/flang/test/Lower/OpenACC/acc-bounds.f90
@@ -88,7 +88,7 @@ contains
 
 ! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_undefined_extent(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "a"}) {
-! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) {uniq_name = "_QMopenacc_boundsFacc_undefined_extentEa"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QMopenacc_boundsFacc_undefined_extentEa"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
 ! CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[DECL_ARG0]]#0, %c0{{.*}} : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
 ! CHECK: %[[UB:.*]] = arith.subi %[[DIMS0]]#1, %c1{{.*}} : index
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%[[UB]] : index) extent(%[[DIMS0]]#1 : index) stride(%[[DIMS0]]#2 : index) startIdx(%c1{{.*}} : index) {strideInBytes = true}
@@ -105,7 +105,7 @@ contains
 
 ! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_multi_strides(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?x?xf32>> {fir.bindc_name = "a"})
-! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QMopenacc_boundsFacc_multi_stridesEa"} : (!fir.box<!fir.array<?x?x?xf32>>) -> (!fir.box<!fir.array<?x?x?xf32>>, !fir.box<!fir.array<?x?x?xf32>>)
+! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMopenacc_boundsFacc_multi_stridesEa"} : (!fir.box<!fir.array<?x?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf32>>, !fir.box<!fir.array<?x?x?xf32>>)
 ! CHECK: %[[BOX_DIMS0:.*]]:3 = fir.box_dims %[[DECL_ARG0]]#0, %c0{{.*}} : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index)
 ! CHECK: %[[BOUNDS0:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[BOX_DIMS0]]#1 : index) stride(%[[BOX_DIMS0]]#2 : index) startIdx(%{{.*}} : index) {strideInBytes = true}
 ! CHECK: %[[STRIDE1:.*]] = arith.muli %[[BOX_DIMS0]]#2, %[[BOX_DIMS0]]#1 : index
@@ -126,7 +126,7 @@ contains
   
 ! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "a", fir.optional}) {
-! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMopenacc_boundsFacc_optional_dataEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMopenacc_boundsFacc_optional_dataEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
 ! CHECK: %[[IS_PRESENT:.*]] = fir.is_present %[[ARG0_DECL]]#1 : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> i1
 ! CHECK: %[[BOX:.*]] = fir.if %[[IS_PRESENT]] -> (!fir.box<!fir.ptr<!fir.array<?xf32>>>) {
 ! CHECK:   %[[LOAD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
@@ -162,8 +162,8 @@ contains
 
 ! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data2(
 ! CHECK-SAME: %[[A:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "a", fir.optional}, %[[N:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMopenacc_boundsFacc_optional_data2Ea"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
-! CHECK: %[[NO_CREATE:.*]] = acc.nocreate varPtr(%[[DECL_A]]#1 : !fir.ref<!fir.array<?xf32>>) bounds(%10) -> !fir.ref<!fir.array<?xf32>> {name = "a"}
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMopenacc_boundsFacc_optional_data2Ea"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[NO_CREATE:.*]] = acc.nocreate varPtr(%[[DECL_A]]#1 : !fir.ref<!fir.array<?xf32>>) bounds(%{{[0-9]+}}) -> !fir.ref<!fir.array<?xf32>> {name = "a"}
 ! CHECK: acc.data dataOperands(%[[NO_CREATE]] : !fir.ref<!fir.array<?xf32>>) {
 
   subroutine acc_optional_data3(a, n)
@@ -175,7 +175,7 @@ contains
 
 ! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data3(
 ! CHECK-SAME: %[[A:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "a", fir.optional}, %[[N:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMopenacc_boundsFacc_optional_data3Ea"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMopenacc_boundsFacc_optional_data3Ea"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
 ! CHECK: %[[PRES:.*]] = fir.is_present %[[DECL_A]]#1 : (!fir.ref<!fir.array<?xf32>>) -> i1
 ! CHECK: %[[STRIDE:.*]] = fir.if %[[PRES]] -> (index) {
 ! CHECK:   %[[DIMS:.*]]:3 = fir.box_dims %[[DECL_A]]#0, %c0{{.*}} : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
diff --git a/flang/test/Lower/OpenACC/acc-declare.f90 b/flang/test/Lower/OpenACC/acc-declare.f90
index 5d3f9e3fe97e..ff1e756c20e1 100644
--- a/flang/test/Lower/OpenACC/acc-declare.f90
+++ b/flang/test/Lower/OpenACC/acc-declare.f90
@@ -62,7 +62,7 @@ module acc_declare
 ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_present(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"})
 ! CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-! CHECK-DAG: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) {acc.declare = #acc.declare<dataClause =  acc_present>, uniq_name = "_QMacc_declareFacc_declare_presentEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
+! CHECK-DAG: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause =  acc_present>, uniq_name = "_QMacc_declareFacc_declare_presentEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[PRESENT:.*]] = acc.present varPtr(%[[DECL]]#0 : !fir.ref<!fir.array<100xi32>>)   bounds(%[[BOUND]]) -> !fir.ref<!fir.array<100xi32>> {name = "a"}
 ! CHECK: acc.declare_enter dataOperands(%[[PRESENT]] : !fir.ref<!fir.array<100xi32>>)
@@ -119,7 +119,7 @@ module acc_declare
 
 ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_deviceptr(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"}) {
-! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) {acc.declare = #acc.declare<dataClause =  acc_deviceptr>, uniq_name = "_QMacc_declareFacc_declare_deviceptrEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause =  acc_deviceptr>, uniq_name = "_QMacc_declareFacc_declare_deviceptrEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
 ! CHECK: %[[DEVICEPTR:.*]] = acc.deviceptr varPtr(%[[DECL]]#0 : !fir.ref<!fir.array<100xi32>>)   bounds(%{{.*}}) -> !fir.ref<!fir.array<100xi32>> {name = "a"}
 ! CHECK: acc.declare_enter dataOperands(%[[DEVICEPTR]] : !fir.ref<!fir.array<100xi32>>)
 ! CHECK: %{{.*}}:2 = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%arg{{.*}} = %{{.*}}) -> (index, i32)
@@ -135,7 +135,7 @@ module acc_declare
 
 ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_link(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"})
-! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) {acc.declare = #acc.declare<dataClause =  acc_declare_link>, uniq_name = "_QMacc_declareFacc_declare_linkEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause =  acc_declare_link>, uniq_name = "_QMacc_declareFacc_declare_linkEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
 ! CHECK: %[[LINK:.*]] = acc.declare_link varPtr(%[[DECL]]#0 : !fir.ref<!fir.array<100xi32>>)   bounds(%{{.*}}) -> !fir.ref<!fir.array<100xi32>> {name = "a"}
 ! CHECK: acc.declare_enter dataOperands(%[[LINK]] : !fir.ref<!fir.array<100xi32>>)
 ! CHECK: %{{.*}}:2 = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%arg{{.*}} = %{{.*}}) -> (index, i32)
@@ -151,7 +151,7 @@ module acc_declare
 
 ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_device_resident(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"})
-! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) {acc.declare = #acc.declare<dataClause =  acc_declare_device_resident>, uniq_name = "_QMacc_declareFacc_declare_device_residentEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause =  acc_declare_device_resident>, uniq_name = "_QMacc_declareFacc_declare_device_residentEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
 ! CHECK: %[[DEVICERES:.*]] = acc.declare_device_resident varPtr(%[[DECL]]#0 : !fir.ref<!fir.array<100xi32>>)   bounds(%{{.*}}) -> !fir.ref<!fir.array<100xi32>> {name = "a"}
 ! CHECK: %[[TOKEN:.*]] = acc.declare_enter dataOperands(%[[DEVICERES]] : !fir.ref<!fir.array<100xi32>>)
 ! CHECK: %{{.*}}:2 = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%arg{{.*}} = %{{.*}}) -> (index, i32)
@@ -220,12 +220,12 @@ module acc_declare
 ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_in_func2(%arg0: !fir.ref<i32> {fir.bindc_name = "i"}) -> f32 {
 ! CHECK: %[[ALLOCA_A:.*]] = fir.alloca !fir.array<1024xf32> {bindc_name = "a", uniq_name = "_QMacc_declareFacc_declare_in_func2Ea"}
 ! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ALLOCA_A]](%{{.*}}) {acc.declare = #acc.declare<dataClause =  acc_create>, uniq_name = "_QMacc_declareFacc_declare_in_func2Ea"} : (!fir.ref<!fir.array<1024xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<1024xf32>>, !fir.ref<!fir.array<1024xf32>>)
-! CHECK: %[[CREATE:.*]] = acc.create varPtr(%[[DECL_A]]#0 : !fir.ref<!fir.array<1024xf32>>) bounds(%7) -> !fir.ref<!fir.array<1024xf32>> {name = "a"}
+! CHECK: %[[CREATE:.*]] = acc.create varPtr(%[[DECL_A]]#0 : !fir.ref<!fir.array<1024xf32>>) bounds(%{{[0-9]+}}) -> !fir.ref<!fir.array<1024xf32>> {name = "a"}
 ! CHECK: %[[TOKEN:.*]] = acc.declare_enter dataOperands(%[[CREATE]] : !fir.ref<!fir.array<1024xf32>>)
 ! CHECK:   cf.br ^bb1
 ! CHECK: ^bb1:
 ! CHECK: acc.declare_exit token(%[[TOKEN]]) dataOperands(%[[CREATE]] : !fir.ref<!fir.array<1024xf32>>)
-! CHECK: acc.delete accPtr(%[[CREATE]] : !fir.ref<!fir.array<1024xf32>>) bounds(%7) {dataClause = #acc<data_clause acc_create>, name = "a"}
+! CHECK: acc.delete accPtr(%[[CREATE]] : !fir.ref<!fir.array<1024xf32>>) bounds(%{{[0-9]+}}) {dataClause = #acc<data_clause acc_create>, name = "a"}
 ! CHECK:   return %{{.*}} : f32
 ! CHECK: }
 
@@ -294,8 +294,8 @@ module acc_declare
 
 ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_multiple_directive(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "b"}) {
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) {acc.declare = #acc.declare<dataClause =  acc_copy>, uniq_name = "_QMacc_declareFacc_declare_multiple_directiveEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
-! CHECK: %[[DECL_B:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) {acc.declare = #acc.declare<dataClause =  acc_copyout>, uniq_name = "_QMacc_declareFacc_declare_multiple_directiveEb"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause =  acc_copy>, uniq_name = "_QMacc_declareFacc_declare_multiple_directiveEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
+! CHECK: %[[DECL_B:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause =  acc_copyout>, uniq_name = "_QMacc_declareFacc_declare_multiple_directiveEb"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
 ! CHECK: %[[COPYIN:.*]] = acc.copyin varPtr(%[[DECL_A]]#0 : !fir.ref<!fir.array<100xi32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<100xi32>> {dataClause = #acc<data_clause acc_copy>, name = "a"}
 ! CHECK: %[[CREATE:.*]] = acc.create varPtr(%[[DECL_B]]#0 : !fir.ref<!fir.array<100xi32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<100xi32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"}
 ! CHECK: acc.declare_enter dataOperands(%[[COPYIN]], %[[CREATE]] : !fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
@@ -316,7 +316,7 @@ module acc_declare
 
 ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_array_section(
 ! CHECK-SAME:    %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) {
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QMacc_declareFacc_declare_array_sectionEa"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMacc_declareFacc_declare_array_sectionEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[DECL_A]]#0 {acc.declare = #acc.declare<dataClause =  acc_copy>} : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
 ! CHECK: %[[COPYIN:.*]] = acc.copyin varPtr(%[[BOX_ADDR]] : !fir.ref<!fir.array<?xi32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<?xi32>> {dataClause = #acc<data_clause acc_copy>, name = "a(1:10)"}
 ! CHECK: acc.declare_enter dataOperands(%[[COPYIN]] : !fir.ref<!fir.array<?xi32>>)
diff --git a/flang/test/Lower/OpenACC/acc-loop-exit.f90 b/flang/test/Lower/OpenACC/acc-loop-exit.f90
index c1ea057af667..85394e4a5b74 100644
--- a/flang/test/Lower/OpenACC/acc-loop-exit.f90
+++ b/flang/test/Lower/OpenACC/acc-loop-exit.f90
@@ -14,9 +14,9 @@ subroutine sub1(x, a)
 end 
 
 ! CHECK-LABEL: func.func @_QPsub1
-! CHECK: %[[A:.*]]:2 = hlfir.declare %arg1 {uniq_name = "_QFsub1Ea"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[I:.*]]:2 = hlfir.declare %2 {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[I:.*]]:2 = hlfir.declare %6 {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[A:.*]]:2 = hlfir.declare %arg1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFsub1Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[I:.*]]:2 = hlfir.declare %{{[0-9]+}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[I:.*]]:2 = hlfir.declare %{{[0-9]+}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[EXIT_COND:.*]] = acc.loop
 ! CHECK: ^bb{{.*}}:
 ! CHECK: ^bb{{.*}}:
diff --git a/flang/test/Lower/OpenACC/acc-private.f90 b/flang/test/Lower/OpenACC/acc-private.f90
index 4d9f84b1fa74..a299a7486c3b 100644
--- a/flang/test/Lower/OpenACC/acc-private.f90
+++ b/flang/test/Lower/OpenACC/acc-private.f90
@@ -271,7 +271,7 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPacc_private_assumed_shape(
 ! CHECK-SAME:    %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFacc_private_assumed_shapeEa"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_private_assumed_shapeEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK: acc.parallel {{.*}} {
 ! CHECK: %[[ADDR:.*]] = fir.box_addr %[[DECL_A]]#0 : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
 ! CHECK: %[[PRIVATE:.*]] = acc.private varPtr(%[[ADDR]] : !fir.ref<!fir.array<?xi32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<?xi32>> {name = "a"}
@@ -293,7 +293,7 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPacc_private_allocatable_array(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {fir.bindc_name = "a"}
-! CHECK: %[[DECLA_A:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_private_allocatable_arrayEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+! CHECK: %[[DECLA_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_private_allocatable_arrayEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 ! CHECK: acc.parallel {{.*}} {
 ! CHECK: %[[BOX:.*]] = fir.load %[[DECLA_A]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
@@ -313,7 +313,7 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPacc_private_pointer_array(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {fir.bindc_name = "a"}, %arg1: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFacc_private_pointer_arrayEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFacc_private_pointer_arrayEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
 ! CHECK: acc.parallel {{.*}} {
 ! CHECK: %[[BOX:.*]] = fir.load %[[DECLA_A]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
 ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>) -> !fir.ptr<!fir.array<?xi32>>
@@ -332,8 +332,8 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPacc_private_dynamic_extent(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?x?x2xi32>> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK: %[[DECL_N:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFacc_private_dynamic_extentEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) {uniq_name = "_QFacc_private_dynamic_extentEa"} : (!fir.ref<!fir.array<?x?x2xi32>>, !fir.shape<3>) -> (!fir.box<!fir.array<?x?x2xi32>>, !fir.ref<!fir.array<?x?x2xi32>>)
+! CHECK: %[[DECL_N:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_private_dynamic_extentEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_private_dynamic_extentEa"} : (!fir.ref<!fir.array<?x?x2xi32>>, !fir.shape<3>, !fir.dscope) -> (!fir.box<!fir.array<?x?x2xi32>>, !fir.ref<!fir.array<?x?x2xi32>>)
 ! CHECK: acc.parallel {{.*}} {
 ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[DECL_A]]#0 : (!fir.box<!fir.array<?x?x2xi32>>) -> !fir.ref<!fir.array<?x?x2xi32>>
 ! CHECK: %[[PRIV:.*]] = acc.private varPtr(%[[BOX_ADDR]] : !fir.ref<!fir.array<?x?x2xi32>>) bounds(%{{.*}}, %{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<?x?x2xi32>> {name = "a"}
diff --git a/flang/test/Lower/OpenACC/acc-reduction.f90 b/flang/test/Lower/OpenACC/acc-reduction.f90
index 6918bc1ec7d6..545c4f217577 100644
--- a/flang/test/Lower/OpenACC/acc-reduction.f90
+++ b/flang/test/Lower/OpenACC/acc-reduction.f90
@@ -1162,7 +1162,7 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPacc_reduction_add_dynamic_extent_add_with_section(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"})
-! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFacc_reduction_add_dynamic_extent_add_with_sectionEa"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_reduction_add_dynamic_extent_add_with_sectionEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%c1{{.*}} : index) upperbound(%c3{{.*}} : index) extent(%{{.*}}#1 : index) stride(%{{.*}}#2 : index) startIdx(%{{.*}} : index) {strideInBytes = true}
 ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[DECL]]#0 : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[BOX_ADDR]] : !fir.ref<!fir.array<?xi32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<?xi32>> {name = "a(2:4)"}
@@ -1176,11 +1176,11 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPacc_reduction_add_allocatable(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {fir.bindc_name = "a"})
-! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_reduction_add_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_reduction_add_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 ! CHECK: %[[BOX:.*]] = fir.load %[[DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}}#1 : index) stride(%{{.*}}#2 : index) startIdx(%{{.*}}#0 : index) {strideInBytes = true}
 ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
-! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[BOX_ADDR]] : !fir.heap<!fir.array<?xf32>>)   bounds(%6) -> !fir.heap<!fir.array<?xf32>> {name = "a"}
+! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[BOX_ADDR]] : !fir.heap<!fir.array<?xf32>>)   bounds(%{{[0-9]+}}) -> !fir.heap<!fir.array<?xf32>> {name = "a"}
 ! CHECK: acc.parallel reduction(@reduction_max_box_heap_Uxf32 -> %[[RED]] : !fir.heap<!fir.array<?xf32>>)
 
 subroutine acc_reduction_add_pointer_array(a)
@@ -1191,7 +1191,7 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPacc_reduction_add_pointer_array(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "a"})
-! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFacc_reduction_add_pointer_arrayEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFacc_reduction_add_pointer_arrayEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
 ! CHECK: %[[BOX:.*]] = fir.load %[[DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}}#1 : index) stride(%{{.*}}#2 : index) startIdx(%{{.*}}#0 : index) {strideInBytes = true}
 ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.ptr<!fir.array<?xf32>>
@@ -1207,7 +1207,7 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPacc_reduction_max_dynamic_extent_max(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?x?xf32>> {fir.bindc_name = "a"}, %{{.*}}: !fir.ref<i32> {fir.bindc_name = "n"})
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) {uniq_name = "_QFacc_reduction_max_dynamic_extent_maxEa"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>)
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_reduction_max_dynamic_extent_maxEa"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>)
 ! CHECK: %[[ADDR:.*]] = fir.box_addr %[[DECL_A]]#0 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[ADDR]] : !fir.ref<!fir.array<?x?xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<?x?xf32>> {name = "a"}
 ! CHECK: acc.parallel reduction(@reduction_max_box_UxUxf32 -> %[[RED]] : !fir.ref<!fir.array<?x?xf32>>)
diff --git a/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 b/flang/test/Lower/OpenMP/allocatable-array-bounds.f90
index aeb56a0427e3..87ca400e82e2 100644
--- a/flang/test/Lower/OpenMP/allocatable-array-bounds.f90
+++ b/flang/test/Lower/OpenMP/allocatable-array-bounds.f90
@@ -24,7 +24,7 @@
 !HOST: %[[BOUNDS_1:.*]] = omp.map.bounds lower_bound(%[[LB_1]] : index) upper_bound(%[[UB_1]] : index) extent(%[[BOX_3]]#1 : index) stride(%[[BOX_2]]#2 : index) start_idx(%[[BOX_1]]#0 : index) {stride_in_bytes = true}
 !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %[[DECLARE_1]]#1 base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
 !HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE_1]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}    
-!HOST: %[[MAP_INFO_1:.*]] = omp.map.info var_ptr(%[[DECLARE_1]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "sp_read(2:5)"}
+!HOST: %[[MAP_INFO_1:.*]] = omp.map.info var_ptr(%[[DECLARE_1]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "sp_read(2:5)"}
 
 !HOST: %[[LOAD_3:.*]] = fir.load %[[DECLARE_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 !HOST: %[[LOAD_4:.*]] = fir.load %[[DECLARE_2]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
@@ -42,7 +42,7 @@
 !HOST: %[[BOUNDS_2:.*]] = omp.map.bounds lower_bound(%[[LB_2]] : index) upper_bound(%[[UB_2]] : index) extent(%[[BOX_5]]#1 : index) stride(%[[BOX_4]]#2 : index) start_idx(%[[BOX_3]]#0 : index) {stride_in_bytes = true}
 !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %[[DECLARE_2]]#1 base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
 !HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE_2]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}    
-!HOST: %[[MAP_INFO_2:.*]] = omp.map.info var_ptr(%[[DECLARE_2]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "sp_write(2:5)"}
+!HOST: %[[MAP_INFO_2:.*]] = omp.map.info var_ptr(%[[DECLARE_2]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "sp_write(2:5)"}
 
 subroutine read_write_section()
     integer, allocatable :: sp_read(:)
@@ -64,7 +64,7 @@ module assumed_allocatable_array_routines
 
 !HOST-LABEL: func.func @_QMassumed_allocatable_array_routinesPassumed_shape_array(
 
-!HOST: %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG:.*]] {fortran_attrs = #fir.var_attrs<allocatable, intent_inout>, uniq_name = "_QMassumed_allocatable_array_routinesFassumed_shape_arrayEarr_read_write"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+!HOST: %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG:.*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable, intent_inout>, uniq_name = "_QMassumed_allocatable_array_routinesFassumed_shape_arrayEarr_read_write"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 !HOST: %[[LOAD_1:.*]] = fir.load %[[DECLARE]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 !HOST: %[[LOAD_2:.*]] = fir.load %[[DECLARE]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 !HOST: %[[CONSTANT_1:.*]] = arith.constant 0 : index
@@ -81,7 +81,7 @@ module assumed_allocatable_array_routines
 !HOST: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[LB]] : index) upper_bound(%[[UB]] : index) extent(%[[BOX_3]]#1 : index) stride(%[[BOX_2]]#2 : index) start_idx(%[[BOX_1]]#0 : index) {stride_in_bytes = true}
 !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %[[DECLARE]]#1 base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
 !HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}    
-!HOST: %[[MAP_INFO:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "arr_read_write(2:5)"}    
+!HOST: %[[MAP_INFO:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "arr_read_write(2:5)"}    
 subroutine assumed_shape_array(arr_read_write)
     integer, allocatable, intent(inout) :: arr_read_write(:)
 
diff --git a/flang/test/Lower/OpenMP/allocatable-map.f90 b/flang/test/Lower/OpenMP/allocatable-map.f90
index 396d45373b84..a9f576a6f099 100644
--- a/flang/test/Lower/OpenMP/allocatable-map.f90
+++ b/flang/test/Lower/OpenMP/allocatable-map.f90
@@ -2,12 +2,12 @@
 
 !HLFIRDIALECT: %[[POINTER:.*]]:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFpointer_routineEpoint"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
 !HLFIRDIALECT: %[[BOX_OFF:.*]] = fir.box_offset %[[POINTER]]#1 base_addr : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> !fir.llvm_ptr<!fir.ref<i32>>
-!HLFIRDIALECT: %[[POINTER_MAP_MEMBER:.*]] = omp.map.info var_ptr(%[[POINTER]]#1 : !fir.ref<!fir.box<!fir.ptr<i32>>>, i32) var_ptr_ptr(%[[BOX_OFF]] : !fir.llvm_ptr<!fir.ref<i32>>)  map_clauses(implicit, tofrom) capture(ByRef) -> !fir.llvm_ptr<!fir.ref<i32>> {name = ""}
-!HLFIRDIALECT: %[[POINTER_MAP:.*]] = omp.map.info var_ptr(%[[POINTER]]#1 : !fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.box<!fir.ptr<i32>>) map_clauses(implicit, tofrom) capture(ByRef) members(%[[POINTER_MAP_MEMBER]] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.ptr<i32>>> {name = "point"}
-!HLFIRDIALECT: omp.target map_entries({{.*}}, %[[POINTER_MAP_MEMBER]] -> {{.*}}, %[[POINTER_MAP]] -> {{.*}} : {{.*}}, !fir.llvm_ptr<!fir.ref<i32>>, !fir.ref<!fir.box<!fir.ptr<i32>>>) {
+!HLFIRDIALECT: %[[POINTER_MAP_MEMBER:.*]] = omp.map.info var_ptr(%[[POINTER]]#1 : !fir.ref<!fir.box<!fir.ptr<i32>>>, i32) var_ptr_ptr(%[[BOX_OFF]] : !fir.llvm_ptr<!fir.ref<i32>>)  map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr<!fir.ref<i32>> {name = ""}
+!HLFIRDIALECT: %[[POINTER_MAP:.*]] = omp.map.info var_ptr(%[[POINTER]]#1 : !fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.box<!fir.ptr<i32>>) map_clauses(tofrom) capture(ByRef) members(%[[POINTER_MAP_MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.ptr<i32>>> {name = "point"}
+!HLFIRDIALECT: omp.target map_entries(%[[POINTER_MAP_MEMBER]] -> {{.*}}, %[[POINTER_MAP]] -> {{.*}} : !fir.llvm_ptr<!fir.ref<i32>>, !fir.ref<!fir.box<!fir.ptr<i32>>>) {
 subroutine pointer_routine()
     integer, pointer :: point 
-!$omp target map(tofrom:pointer)
+!$omp target map(tofrom:point)
     point = 1
 !$omp end target
 end subroutine pointer_routine
diff --git a/flang/test/Lower/OpenMP/array-bounds.f90 b/flang/test/Lower/OpenMP/array-bounds.f90
index 2c8a8999a2cc..f235d5041ab2 100644
--- a/flang/test/Lower/OpenMP/array-bounds.f90
+++ b/flang/test/Lower/OpenMP/array-bounds.f90
@@ -41,7 +41,7 @@ module assumed_array_routines
 !HOST-LABEL: func.func @_QMassumed_array_routinesPassumed_shape_array(
 !HOST-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "arr_read_write"}) {
 !HOST: %[[INTERMEDIATE_ALLOCA:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
-!HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMassumed_array_routinesFassumed_shape_arrayEarr_read_write"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+!HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMassumed_array_routinesFassumed_shape_arrayEarr_read_write"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 !HOST: %[[C0:.*]] = arith.constant 1 : index
 !HOST: %[[C1:.*]] = arith.constant 0 : index
 !HOST: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0_DECL]]#0, %[[C1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
@@ -52,7 +52,7 @@ module assumed_array_routines
 !HOST: %[[BOUNDS:.*]] = omp.map.bounds   lower_bound(%[[C3]] : index) upper_bound(%[[C4]] : index) extent(%[[DIMS1]]#1 : index) stride(%[[DIMS0]]#2 : index) start_idx(%[[C0]] : index) {stride_in_bytes = true}
 !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %0 base_addr : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
 !HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.array<?xi32>) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
-!HOST: %[[MAP:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.box<!fir.array<?xi32>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.array<?xi32>> {name = "arr_read_write(2:5)"}
+!HOST: %[[MAP:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.box<!fir.array<?xi32>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.array<?xi32>> {name = "arr_read_write(2:5)"}
 !HOST: omp.target   map_entries(%[[MAP_INFO_MEMBER]] -> %{{.*}}, %[[MAP]] -> %{{.*}}, {{.*}} -> {{.*}} : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>, !fir.ref<!fir.array<?xi32>>, !fir.ref<i32>) {
     subroutine assumed_shape_array(arr_read_write)
             integer, intent(inout) :: arr_read_write(:)
@@ -69,7 +69,7 @@ module assumed_array_routines
 !HOST-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xi32>> {fir.bindc_name = "arr_read_write"}) {
 !HOST: %[[INTERMEDIATE_ALLOCA:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
 !HOST: %[[ARG0_SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-!HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]](%[[ARG0_SHAPE]]) {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEarr_read_write"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>)
+!HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]](%[[ARG0_SHAPE]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEarr_read_write"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>)
 !HOST: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEi"}
 !HOST: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0_DECL]]#0, %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
 !HOST: %[[C4_1:.*]] = arith.subi %c4, %c1{{.*}} : index
@@ -77,7 +77,7 @@ module assumed_array_routines
 !HOST: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%c1{{.*}} : index) upper_bound(%c4{{.*}} : index) extent(%[[EXT]] : index) stride(%[[DIMS0]]#2 : index) start_idx(%c1{{.*}} : index) {stride_in_bytes = true}
 !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %[[INTERMEDIATE_ALLOCA]] base_addr : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
 !HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.array<?xi32>) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
-!HOST: %[[MAP:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.box<!fir.array<?xi32>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.array<?xi32>> {name = "arr_read_write(2:5)"}
+!HOST: %[[MAP:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.box<!fir.array<?xi32>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.array<?xi32>> {name = "arr_read_write(2:5)"}
 !HOST: omp.target map_entries(%[[MAP_INFO_MEMBER]] -> %{{.*}}, %[[MAP]] -> %{{.*}}, {{.*}} -> {{.*}} : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>, !fir.ref<!fir.array<?xi32>>, !fir.ref<i32>) {
     subroutine assumed_size_array(arr_read_write)
         integer, intent(inout) :: arr_read_write(*)
diff --git a/flang/test/Lower/OpenMP/copyin-order.f90 b/flang/test/Lower/OpenMP/copyin-order.f90
new file mode 100644
index 000000000000..0620d89ffb67
--- /dev/null
+++ b/flang/test/Lower/OpenMP/copyin-order.f90
@@ -0,0 +1,31 @@
+!RUN: bbc -fopenmp -emit-hlfir -o - %s | FileCheck %s
+
+!https://github.com/llvm/llvm-project/issues/91205
+
+!CHECK: omp.parallel if(%{{[0-9]+}} : i1) {
+!CHECK:   %[[THP1:[0-9]+]] = omp.threadprivate %{{[0-9]+}}#1
+!CHECK:   %[[DCL1:[0-9]+]]:2 = hlfir.declare %[[THP1]] {uniq_name = "_QFcopyin_scalar_arrayEx1"}
+!CHECK:   %[[LD1:[0-9]+]] = fir.load %{{[0-9]+}}#0
+!CHECK:   hlfir.assign %[[LD1]] to %[[DCL1]]#0 temporary_lhs
+!CHECK:   %[[THP2:[0-9]+]] = omp.threadprivate %{{[0-9]+}}#1
+!CHECK:   %[[SHP2:[0-9]+]] = fir.shape %c{{[0-9]+}}
+!CHECK:   %[[DCL2:[0-9]+]]:2 = hlfir.declare %[[THP2]](%[[SHP2]]) {uniq_name = "_QFcopyin_scalar_arrayEx2"}
+!CHECK:   hlfir.assign %{{[0-9]+}}#0 to %[[DCL2]]#0 temporary_lhs
+!CHECK:   omp.barrier
+!CHECK:   fir.call @_QPsub1(%[[DCL1]]#1, %[[DCL2]]#1)
+!CHECK:   omp.terminator
+!CHECK: }
+
+subroutine copyin_scalar_array()
+  integer(kind=4), save :: x1
+  integer(kind=8), save :: x2(10)
+  !$omp threadprivate(x1, x2)
+
+  ! Have x1 appear before x2 in the AST node for the `parallel` construct,
+  ! but at the same time have them in a different order in `copyin`.
+  !$omp parallel if (x1 .eq. x2(1)) copyin(x2, x1)
+    call sub1(x1, x2)
+  !$omp end parallel
+
+end
+
diff --git a/flang/test/Lower/OpenMP/default-clause-byref.f90 b/flang/test/Lower/OpenMP/default-clause-byref.f90
index 6a91927ab02d..7cc2bc2e0c71 100644
--- a/flang/test/Lower/OpenMP/default-clause-byref.f90
+++ b/flang/test/Lower/OpenMP/default-clause-byref.f90
@@ -161,12 +161,12 @@ subroutine nested_default_clause_tests
 !CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFnested_default_clause_testsEz"}
 !CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel   {
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
 !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
 !CHECK: %[[PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[PRIVATE_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned, uniq_name = "_QFnested_default_clause_testsEk"}
@@ -221,13 +221,12 @@ subroutine nested_default_clause_tests
     
     
 !CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
 !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
 !CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
 !CHECK: %[[PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
-!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel {
 !CHECK: %[[PRIVATE_INNER_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
 !CHECK: %[[PRIVATE_INNER_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_INNER_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
diff --git a/flang/test/Lower/OpenMP/default-clause.f90 b/flang/test/Lower/OpenMP/default-clause.f90
index d3c6550821f0..843ee6bb7910 100644
--- a/flang/test/Lower/OpenMP/default-clause.f90
+++ b/flang/test/Lower/OpenMP/default-clause.f90
@@ -148,34 +148,33 @@ program default_clause_lowering
 
 end program default_clause_lowering
 
-subroutine nested_default_clause_tests
-    integer :: x, y, z, w, k, a
-!CHECK: %[[K:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFnested_default_clause_testsEk"}
-!CHECK: %[[K_DECL:.*]]:2 = hlfir.declare %[[K]] {uniq_name = "_QFnested_default_clause_testsEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFnested_default_clause_testsEw"}
-!CHECK: %[[W_DECL:.*]]:2 = hlfir.declare %[[W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-LABEL: func @_QPnested_default_clause_test1
+!CHECK: %[[K:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFnested_default_clause_test1Ek"}
+!CHECK: %[[K_DECL:.*]]:2 = hlfir.declare %[[K]] {uniq_name = "_QFnested_default_clause_test1Ek"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFnested_default_clause_test1Ew"}
+!CHECK: %[[W_DECL:.*]]:2 = hlfir.declare %[[W]] {uniq_name = "_QFnested_default_clause_test1Ew"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFnested_default_clause_test1Ex"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFnested_default_clause_test1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFnested_default_clause_test1Ey"}
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFnested_default_clause_test1Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFnested_default_clause_test1Ez"}
+!CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFnested_default_clause_test1Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel   {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_test1Ey"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_test1Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_test1Ex"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_test1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: %[[PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[PRIVATE_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned, uniq_name = "_QFnested_default_clause_testsEk"}
-!CHECK: %[[PRIVATE_K_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_K]] {uniq_name = "_QFnested_default_clause_testsEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_test1Ez"}
+!CHECK: %[[PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_test1Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned, uniq_name = "_QFnested_default_clause_test1Ek"}
+!CHECK: %[[PRIVATE_K_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_K]] {uniq_name = "_QFnested_default_clause_test1Ek"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel {
-!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[INNER_PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[INNER_PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_test1Ey"}
+!CHECK: %[[INNER_PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_test1Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_test1Ex"}
+!CHECK: %[[INNER_PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_X]] {uniq_name = "_QFnested_default_clause_test1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[CONST:.*]] = arith.constant 20 : i32
 !CHECK: hlfir.assign %[[CONST]] to %[[INNER_PRIVATE_Y_DECL]]#0 : i32, !fir.ref<i32>
 !CHECK: %[[CONST:.*]] = arith.constant 10 : i32
@@ -183,14 +182,14 @@ subroutine nested_default_clause_tests
 !CHECK: omp.terminator
 !CHECK: }
 !CHECK: omp.parallel   {
-!CHECK: %[[INNER_PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
-!CHECK: %[[INNER_PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[INNER_PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: %[[INNER_PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[INNER_PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_test1Ew"}
+!CHECK: %[[INNER_PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_W]] {uniq_name = "_QFnested_default_clause_test1Ew"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[INNER_PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_test1Ez"}
+!CHECK: %[[INNER_PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_test1Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Z_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_Z_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
-!CHECK: %[[INNER_PRIVATE_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned, uniq_name = "_QFnested_default_clause_testsEk"}
-!CHECK: %[[INNER_PRIVATE_K_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_K]] {uniq_name = "_QFnested_default_clause_testsEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[INNER_PRIVATE_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned, uniq_name = "_QFnested_default_clause_test1Ek"}
+!CHECK: %[[INNER_PRIVATE_K_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_K]] {uniq_name = "_QFnested_default_clause_test1Ek"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_K_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_K_DECL]]#0 temporary_lhs : i32, !fir.ref<i32> 
 !CHECK: %[[CONST:.*]] = arith.constant 30 : i32
@@ -205,79 +204,95 @@ subroutine nested_default_clause_tests
 !CHECK: }
 !CHECK: omp.terminator
 !CHECK: }
-    !$omp parallel  firstprivate(x) private(y) shared(w) default(private)  
-        !$omp parallel default(private)
-           y = 20
-           x = 10 
-        !$omp end parallel 
-
-        !$omp parallel default(firstprivate) shared(y) private(w) 
-            y = 30
-            w = 40 
-            z = 50
-            k = 40
-        !$omp end parallel
+subroutine nested_default_clause_test1
+  integer :: x, y, z, w, k
+
+  !$omp parallel  firstprivate(x) private(y) shared(w) default(private)
+    !$omp parallel default(private)
+     y = 20
+     x = 10
     !$omp end parallel
-    
-    
+
+    !$omp parallel default(firstprivate) shared(y) private(w)
+      y = 30
+      w = 40
+      z = 50
+      k = 40
+    !$omp end parallel
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func @_QPnested_default_clause_test2
 !CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: %[[PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
-!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_test2Ex"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_test2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_test2Ey"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_test2Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_test2Ew"}
+!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFnested_default_clause_test2Ew"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_test2Ez"}
+!CHECK: %[[PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_test2Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_INNER_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[PRIVATE_INNER_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_INNER_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_INNER_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_test2Ex"}
+!CHECK: %[[PRIVATE_INNER_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_INNER_X]] {uniq_name = "_QFnested_default_clause_test2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_INNER_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
-!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[INNER_PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_test2Ey"}
+!CHECK: %[[INNER_PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_test2Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[INNER_PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_test2Ew"}
+!CHECK: %[[INNER_PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_W]] {{.*}}
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_W_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_W_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
 !CHECK: %[[TEMP:.*]] = fir.load %[[INNER_PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_INNER_X_DECL]]#0 : i32, !fir.ref<i32>
 !CHECK: omp.terminator
 !CHECK: }
 !CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_INNER_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
-!CHECK: %[[PRIVATE_INNER_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_INNER_W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[PRIVATE_INNER_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[PRIVATE_INNER_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_INNER_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_INNER_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_test2Ew"}
+!CHECK: %[[PRIVATE_INNER_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_INNER_W]] {uniq_name = "_QFnested_default_clause_test2Ew"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_INNER_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_test2Ex"}
+!CHECK: %[[PRIVATE_INNER_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_INNER_X]] {uniq_name = "_QFnested_default_clause_test2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[TEMP_1:.*]] = fir.load %[[PRIVATE_INNER_X_DECL]]#0 : !fir.ref<i32>
 !CHECK: %[[TEMP_2:.*]] = fir.load %[[PRIVATE_Z_DECL]]#0 : !fir.ref<i32>
 !CHECK: %[[RESULT:.*]] = arith.addi %{{.*}}, %{{.*}} : i32
 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_INNER_W_DECL]]#0 : i32, !fir.ref<i32>
 !CHECK: omp.terminator
 !CHECK: }
-    !$omp parallel default(private)
-        !$omp parallel default(firstprivate)
-            x = y
-        !$omp end parallel
+!CHECK: }
+subroutine nested_default_clause_test2
+  integer :: x, y, z, w
 
-        !$omp parallel default(private) shared(z)
-            w = x + z
-        !$omp end parallel
-    !$omp end parallel    
-    
+  !$omp parallel default(private)
+    !$omp parallel default(firstprivate)
+      x = y
+      w = w + 1
+    !$omp end parallel
+
+    !$omp parallel default(private) shared(z)
+      w = x + z
+    !$omp end parallel
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func @_QPnested_default_clause_test3
 !CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
-!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: %[[PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_test3Ex"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_test3Ey"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_test3Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_test3Ew"}
+!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFnested_default_clause_test3Ew"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_test3Ez"}
+!CHECK: %[[PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_test3Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel {
-!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[INNER_PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_test3Ex"}
+!CHECK: %[[INNER_PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_X]] {uniq_name = "_QFnested_default_clause_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
-!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[INNER_PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_test3Ey"}
+!CHECK: %[[INNER_PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_test3Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
 !CHECK: %[[TEMP:.*]] = fir.load %[[INNER_PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
@@ -292,23 +307,32 @@ subroutine nested_default_clause_tests
 !CHECK: omp.terminator
 !CHECK: }
 !CHECK: }
-    !$omp parallel default(private)
-		!$omp parallel default(firstprivate)
-			x = y
-		!$omp end parallel
+subroutine nested_default_clause_test3
+  integer :: x, y, z, w
 
-		!$omp parallel default(shared)
-			w = x + z
-		!$omp end parallel
-	!$omp end parallel
+  !$omp parallel default(private)
+    !$omp parallel default(firstprivate)
+      x = y
+    !$omp end parallel
+
+    !$omp parallel default(shared)
+      w = x + z
+    !$omp end parallel
+  !$omp end parallel
+end subroutine
 
+!CHECK-LABEL: func @_QPnested_default_clause_test4
+!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFnested_default_clause_test4Ex"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFnested_default_clause_test4Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFnested_default_clause_test4Ey"}
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFnested_default_clause_test4Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_test4Ex"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_test4Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_test4Ey"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_test4Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
 !CHECK: omp.single {
@@ -318,13 +342,96 @@ subroutine nested_default_clause_tests
 !CHECK: }
 !CHECK: omp.terminator
 !CHECK: }
+subroutine nested_default_clause_test4
+  integer :: x, y
+
+  !$omp parallel default(firstprivate)
+    !$omp single
+      x = y
+    !$omp end single
+  !$omp end parallel
+end subroutine
+
+!CHECK-LABEL: func @_QPnested_default_clause_test5
+!CHECK: omp.parallel {
+!CHECK: %[[LOOP_VAR_ALLOCA:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK: %[[LOOP_VAR_DECLARE:.*]]:2 = hlfir.declare %[[LOOP_VAR_ALLOCA]] {{.*}}
+!CHECK: %[[X_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_test5Ex"}
+!CHECK: %[[X_DECLARE:.*]]:2 = hlfir.declare %[[X_ALLOCA]] {{.*}}
+!CHECK: %[[CONST_LB:.*]] = arith.constant 1 : i32
+!CHECK: %[[CONST_UB:.*]] = arith.constant 50 : i32
+!CHECK: %[[CONST_STEP:.*]] = arith.constant 1 : i32
+!CHECK: omp.loop_nest (%[[ARG:.*]]) : i32 = (%[[CONST_LB]]) to (%[[CONST_UB]]) inclusive step (%[[CONST_STEP]]) {
+!CHECK: fir.store %[[ARG]] to %[[LOOP_VAR_DECLARE]]#1 : !fir.ref<i32>
+!CHECK: %[[LOADED_X:.*]] = fir.load %[[X_DECLARE]]#0 : !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 1 : i32
+!CHECK: %[[RESULT:.*]] = arith.addi %[[LOADED_X]], %[[CONST]] : i32
+!CHECK: hlfir.assign %[[RESULT]] to %[[X_DECLARE]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.yield
+!CHECK: }
+!CHECK: omp.terminator
+!CHECK: }
+subroutine nested_default_clause_test5
+  integer :: i, x
+
+  !$omp parallel do private(x)
+    do i=1, 50
+      x = x + 1
+    end do
+  !$omp end parallel do
+end subroutine
+
+!CHECK-LABEL: func @_QPnested_default_clause_test6
+!CHECK: omp.parallel {
+!CHECK: %[[LOOP_VAR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK: %[[LOOP_VAR_DECLARE:.*]]:2 = hlfir.declare %[[LOOP_VAR]] {{.*}}
+!CHECK: %[[X_VAR:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_test6Ex"}
+!CHECK: %[[X_VAR_DECLARE:.*]]:2 = hlfir.declare %[[X_VAR]] {{.*}}
+!CHECK: %[[Y_VAR:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_test6Ey"}
+!CHECK: %[[Y_VAR_DECLARE:.*]]:2 = hlfir.declare %[[Y_VAR]] {{.*}}
+!CHECK: %[[Z_VAR:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_test6Ez"}
+!CHECK: %[[Z_VAR_DECLARE:.*]]:2 = hlfir.declare %[[Z_VAR]] {{.*}}
+!CHECK: %[[CONST_LB:.*]] = arith.constant 1 : i32
+!CHECK: %[[CONST_UB:.*]] = arith.constant 10 : i32
+!CHECK: %[[CONST_STEP:.*]] = arith.constant 1 : i32
+!CHECK: omp.loop_nest (%[[ARG:.*]]) : i32 = (%[[CONST_LB]]) to (%[[CONST_UB]]) inclusive step (%[[CONST_STEP]]) {
+!CHECK: fir.store %[[ARG]] to %[[LOOP_VAR_DECLARE]]#1 : !fir.ref<i32>
+!CHECK: %[[LOADED_X:.*]] = fir.load %[[X_VAR_DECLARE]]#0 : !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 1 : i32
+!CHECK: %[[ADD:.*]] = arith.addi %[[LOADED_X]], %[[CONST]] : i32
+!CHECK: hlfir.assign %[[ADD]] to %[[X_VAR_DECLARE]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.parallel {
+!CHECK: %[[INNER_Y_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_test6Ey"}
+!CHECK: %[[INNER_Y_DECLARE:.*]]:2 = hlfir.declare %[[INNER_Y_ALLOCA]] {{.}}
+!CHECK: %[[TEMP:.*]] = fir.load %[[Y_VAR_DECLARE]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_Y_DECLARE]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[INNER_Z_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_test6Ez"}
+!CHECK: %[[INNER_Z_DECLARE:.*]]:2 = hlfir.declare %[[INNER_Z_ALLOCA]] {{.}}
+!CHECK: %[[TEMP:.*]] = fir.load %[[Z_VAR_DECLARE]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_Z_DECLARE]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[LOADED_Y:.*]] = fir.load %[[INNER_Y_DECLARE]]#0 : !fir.ref<i32>
+!CHECK: %[[LOADED_Z:.*]] = fir.load %[[INNER_Z_DECLARE]]#0 : !fir.ref<i32>
+!CHECK: %[[RESULT:.*]] = arith.addi %[[LOADED_Y]], %[[LOADED_Z]] : i32
+!CHECK: hlfir.assign %[[RESULT]] to %[[INNER_Y_DECLARE]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.yield
+!CHECK: }
+!CHECK: omp.terminator
+!CHECK: }
 !CHECK: return
-!CHECK: } 
-	!$omp parallel default(firstprivate)
-		!$omp single
-			x = y
-		!$omp end single
-	!$omp end parallel
+!CHECK: }
+subroutine nested_default_clause_test6
+  integer :: i, x, y, z
+
+  !$omp parallel do default(private)
+    do i = 1, 10
+      x  = x + 1
+      !$omp parallel default(firstprivate)
+         y = y + z
+      !$omp end parallel
+    end do
+  !$omp end parallel do
 end subroutine
 
 !CHECK: func.func @_QPskipped_default_clause_checks() {
@@ -415,3 +522,49 @@ subroutine threadprivate_with_default
            end do
         !$omp end parallel do
 end subroutine
+
+subroutine nested_constructs
+!CHECK: %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFnested_constructsEi"}
+!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I]] {{.*}}
+!CHECK: %[[J:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFnested_constructsEj"}
+!CHECK: %[[J_DECL:.*]]:2 = hlfir.declare %[[J]] {{.*}}
+!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFnested_constructsEy"}
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {{.*}}
+!CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFnested_constructsEz"}
+!CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {{.*}}
+
+    integer :: y, z
+!CHECK: omp.parallel {
+!CHECK: %[[INNER_J:.*]] = fir.alloca i32 {bindc_name = "j", pinned}
+!CHECK: %[[INNER_J_DECL:.*]]:2 = hlfir.declare %[[INNER_J]] {{.*}}
+!CHECK: %[[INNER_I:.*]] = fir.alloca i32 {bindc_name = "i", pinned}
+!CHECK: %[[INNER_I_DECL:.*]]:2 = hlfir.declare %[[INNER_I]] {{.*}}
+!CHECK: %[[INNER_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_constructsEy"}
+!CHECK: %[[INNER_Y_DECL:.*]]:2 = hlfir.declare %[[INNER_Y]] {{.*}}
+!CHECK: %[[TEMP:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[INNER_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_constructsEz"}
+!CHECK: %[[INNER_Z_DECL:.*]]:2 = hlfir.declare %[[INNER_Z]] {{.*}}
+    !$omp parallel default(private) firstprivate(y)
+!CHECK: {{.*}} = fir.do_loop {{.*}} {
+      do i = 1, 10
+!CHECK: %[[CONST_1:.*]] = arith.constant 1 : i32
+!CHECK: hlfir.assign %[[CONST_1]] to %[[INNER_Y_DECL]]#0 : i32, !fir.ref<i32>
+        y = 1
+!CHECK: {{.*}} = fir.do_loop {{.*}} {
+        do j = 1, 10
+!CHECK: %[[CONST_20:.*]] = arith.constant 20 : i32
+!CHECK: hlfir.assign %[[CONST_20]] to %[[INNER_Z_DECL]]#0 : i32, !fir.ref<i32>
+          z = 20
+!CHECK: omp.parallel {
+!CHECK: %[[NESTED_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_constructsEy"}
+!CHECK: %[[NESTED_Y_DECL:.*]]:2 = hlfir.declare %[[NESTED_Y]] {{.*}}
+!CHECK: %[[CONST_2:.*]] = arith.constant 2 : i32
+!CHECK: hlfir.assign %[[CONST_2]] to %[[NESTED_Y_DECL]]#0 : i32, !fir.ref<i32>
+          !$omp parallel default(private)
+             y = 2
+          !$omp end parallel
+        end do
+      end do
+    !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90 b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90
new file mode 100644
index 000000000000..47e163014fe8
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90
@@ -0,0 +1,67 @@
+! Test delayed privatization for allocatable arrays.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 |\
+! RUN:   FileCheck %s
+
+subroutine delayed_privatization_private(var1, l1)
+  implicit none
+  integer(8):: l1
+  integer, allocatable, dimension(:) :: var1
+
+!$omp parallel firstprivate(var1)
+  var1(l1 + 1) = 10
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.heap<!fir.array<\?xi32>>>>]] alloc {
+
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! CHECK-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<{{\?}}xi32>>> {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_privateEvar1"}
+
+! CHECK-NEXT:   %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]]
+! CHECK-NEXT:   %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]]
+! CHECK-NEXT:   %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]]
+! CHECK-NEXT:   %[[C0:.*]] = arith.constant 0 : i64
+! CHECK-NEXT:   %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
+
+! CHECK-NEXT:   fir.if %[[ALLOC_COND]] {
+! CHECK-NEXT:     %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]]
+! CHECK-NEXT:     %[[C0:.*]] = arith.constant 0 : index
+! CHECK-NEXT:     %[[DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0]]
+! CHECK-NEXT:     fir.box_addr %[[PRIV_ARG_VAL]]
+! CHECK-NEXT:     %[[C0_2:.*]] = arith.constant 0 : index 
+! CHECK-NEXT:     %[[CMP:.*]] = arith.cmpi sgt, %[[DIMS]]#1, %[[C0_2]] : index
+! CHECK-NEXT:     %[[SELECT:.*]] = arith.select %[[CMP]], %[[DIMS]]#1, %[[C0_2]] : index
+! CHECK-NEXT:     %[[MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[SELECT]]
+! CHECK-NEXT:     %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[SELECT]] : (index, index) -> !fir.shapeshift<1>
+! CHECK-NEXT:     %[[EMBOX:.*]] = fir.embox %[[MEM]](%[[SHAPE_SHIFT]])
+! CHECK-NEXT:     fir.store %[[EMBOX]] to %[[PRIV_ALLOC]]
+! CHECK-NEXT:   } else {
+! CHECK-NEXT:     %[[ZEROS:.*]] = fir.zero_bits
+! CHECK-NEXT:     %[[C0_3:.*]] = arith.constant 0 : index
+! CHECK-NEXT:     %[[SHAPE:.*]] = fir.shape %[[C0_3]] : (index) -> !fir.shape<1>
+! CHECK-NEXT:     %[[EMBOX_2:.*]] = fir.embox %[[ZEROS]](%[[SHAPE]])
+! CHECK-NEXT:     fir.store %[[EMBOX_2]] to %[[PRIV_ALLOC]]
+! CHECK-NEXT:   }
+
+! CHECK-NEXT:   hlfir.declare
+! CHECK-NEXT:   omp.yield
+
+! CHECK-NEXT: } copy {
+! CHECK-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+! CHECK-NEXT:  %[[PRIV_BASE_VAL:.*]] = fir.load %[[PRIV_PRIV_ARG]]
+! CHECK-NEXT:  %[[PRIV_BASE_BOX:.*]] = fir.box_addr %[[PRIV_BASE_VAL]]
+! CHECK-NEXT:  %[[PRIV_BASE_ADDR:.*]] = fir.convert %[[PRIV_BASE_BOX]]
+! CHECK-NEXT:  %[[C0:.*]] = arith.constant 0 : i64
+! CHECK-NEXT:  %[[COPY_COND:.*]] = arith.cmpi ne, %[[PRIV_BASE_ADDR]], %[[C0]] : i64
+
+
+! CHECK-NEXT:  fir.if %[[COPY_COND]] {
+! CHECK-NEXT:    %[[PRIV_ORIG_ARG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]]
+! CHECK-NEXT:    hlfir.assign %[[PRIV_ORIG_ARG_VAL]] to %[[PRIV_BASE_VAL]] temporary_lhs
+! CHECK-NEXT:   }
+! CHECK-NEXT:   omp.yield
+! CHECK-NEXT: }
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-array.f90 b/flang/test/Lower/OpenMP/delayed-privatization-array.f90
new file mode 100644
index 000000000000..1d291b2ac0fe
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-array.f90
@@ -0,0 +1,100 @@
+! Test delayed privatization for arrays.
+
+! RUN: split-file %s %t
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %t/one_dim_array.f90 2>&1 | FileCheck %s --check-prefix=ONE_DIM
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - \
+! RUN:   %t/one_dim_array.f90 2>&1 | FileCheck %s --check-prefix=ONE_DIM
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %t/two_dim_array.f90 2>&1 | FileCheck %s --check-prefix=TWO_DIM
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - \
+! RUN:   %t/two_dim_array.f90 2>&1 | FileCheck %s --check-prefix=TWO_DIM
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %t/one_dim_array_default_lb.f90 2>&1 | FileCheck %s --check-prefix=ONE_DIM_DEFAULT_LB
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - \
+! RUN:   %t/one_dim_array_default_lb.f90 2>&1 | FileCheck %s --check-prefix=ONE_DIM_DEFAULT_LB
+
+!--- one_dim_array.f90
+subroutine delayed_privatization_private_1d(var1, l1, u1)
+  implicit none
+  integer(8):: l1, u1
+  integer, dimension(l1:u1) :: var1
+
+!$omp parallel firstprivate(var1)
+  var1(l1 + 1) = 10
+!$omp end parallel
+end subroutine
+
+! ONE_DIM-LABEL: omp.private {type = firstprivate}
+! ONE_DIM-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.box<!fir.array<\?xi32>>]] alloc {
+
+! ONE_DIM-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! ONE_DIM:   %[[C0:.*]] = arith.constant 0 : index
+! ONE_DIM-NEXT:   %[[DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG]], %[[C0]] : ([[TYPE]], index) -> (index, index, index)
+! ONE_DIM:   %[[PRIV_ALLOCA:.*]] = fir.alloca !fir.array<{{\?}}xi32>
+! ONE_DIM-NEXT:   %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1>
+! ONE_DIM-NEXT:   %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOCA]](%[[SHAPE_SHIFT]]) {uniq_name = "_QFdelayed_privatization_private_1dEvar1"}
+! ONE_DIM-NEXT:  omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]])
+
+! ONE_DIM-NEXT: } copy {
+! ONE_DIM-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+! ONE_DIM-NEXT:  hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]] temporary_lhs
+! ONE_DIM-NEXT:   omp.yield(%[[PRIV_PRIV_ARG]] : [[TYPE]])
+! ONE_DIM-NEXT: }
+
+!--- two_dim_array.f90
+subroutine delayed_privatization_private_2d(var1, l1, u1, l2, u2)
+  implicit none
+  integer(8):: l1, u1, l2, u2
+  integer, dimension(l1:u1, l2:u2) :: var1
+
+!$omp parallel firstprivate(var1)
+  var1(l1 + 1, u2) = 10
+!$omp end parallel
+end subroutine
+
+! TWO_DIM-LABEL: omp.private {type = firstprivate}
+! TWO_DIM-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.box<!fir.array<\?x\?xi32>>]] alloc {
+
+! TWO_DIM-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! TWO_DIM:        %[[C0:.*]] = arith.constant 0 : index
+! TWO_DIM-NEXT:   %[[DIMS0:.*]]:3 = fir.box_dims %[[PRIV_ARG]], %[[C0]] : ([[TYPE]], index) -> (index, index, index)
+
+! TWO_DIM-NEXT:   %[[C1:.*]] = arith.constant 1 : index
+! TWO_DIM-NEXT:   %[[DIMS1:.*]]:3 = fir.box_dims %[[PRIV_ARG]], %[[C1]] : ([[TYPE]], index) -> (index, index, index)
+
+! TWO_DIM-NEXT:   %[[PRIV_ALLOCA:.*]] = fir.alloca !fir.array<{{\?}}x{{\?}}xi32>, %[[DIMS0]]#1, %[[DIMS1]]#1 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_private_2dEvar1"}
+! TWO_DIM-NEXT:   %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS0]]#0, %[[DIMS0]]#1, %[[DIMS1]]#0, %[[DIMS1]]#1 : (index, index, index, index) -> !fir.shapeshift<2>
+
+! TWO_DIM-NEXT:   %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOCA]](%[[SHAPE_SHIFT]]) {uniq_name = "_QFdelayed_privatization_private_2dEvar1"}
+! TWO_DIM-NEXT:  omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]])
+
+! TWO_DIM-NEXT: } copy {
+! TWO_DIM-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+! TWO_DIM-NEXT:  hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]] temporary_lhs
+! TWO_DIM-NEXT:   omp.yield(%[[PRIV_PRIV_ARG]] : [[TYPE]])
+! TWO_DIM-NEXT: }
+
+!--- one_dim_array_default_lb.f90
+program main
+  implicit none
+  integer, dimension(10) :: var1
+
+!$omp parallel private(var1)
+  var1(1) = 10
+!$omp end parallel
+end program
+
+! ONE_DIM_DEFAULT_LB-LABEL: omp.private {type = private}
+! ONE_DIM_DEFAULT_LB-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.array<10xi32>>]] alloc {
+
+! ONE_DIM_DEFAULT_LB-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! ONE_DIM_DEFAULT_LB:   %[[C10:.*]] = arith.constant 10 : index
+! ONE_DIM_DEFAULT_LB:   %[[PRIV_ALLOCA:.*]] = fir.alloca !fir.array<10xi32>
+! ONE_DIM_DEFAULT_LB:   %[[SHAPE:.*]] = fir.shape %[[C10]] : (index) -> !fir.shape<1>
+! ONE_DIM_DEFAULT_LB:   hlfir.declare %[[PRIV_ALLOCA]](%[[SHAPE]])
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-character-array.f90 b/flang/test/Lower/OpenMP/delayed-privatization-character-array.f90
new file mode 100644
index 000000000000..9a9d0c01212c
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-character-array.f90
@@ -0,0 +1,67 @@
+! Test delayed privatization for the `CHARACTER` array type.
+
+! RUN: split-file %s %t
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %t/static_len.f90 2>&1 | FileCheck %s --check-prefix=STATIC_LEN
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %t/static_len.f90 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=STATIC_LEN
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %t/dyn_len.f90 2>&1 | FileCheck %s --check-prefix=DYN_LEN
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %t/dyn_len.f90 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=DYN_LEN
+
+!--- static_len.f90
+subroutine delayed_privatization_character_array_static_len(var1)
+  implicit none
+  character(len = 10)  :: var1(5)
+
+!$omp parallel firstprivate(var1)
+  var1(1) = "test"
+!$omp end parallel
+end subroutine
+
+! STATIC_LEN-LABEL: omp.private {type = firstprivate}
+! STATIC_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.array<5x!fir.char<1,10>>>]] alloc {
+
+! STATIC_LEN-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! STATIC_LEN-DAG:    %[[C5:.*]] = arith.constant 5 : index
+! STATIC_LEN-DAG:    %[[C10:.*]] = arith.constant 10 : index
+! STATIC_LEN-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca !fir.array<5x!fir.char<1,10>>
+! STATIC_LEN-NEXT:   %[[ARRAY_SHAPE:.*]] = fir.shape %[[C5]]
+! STATIC_LEN-NEXT:   %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]](%[[ARRAY_SHAPE]]) typeparams %[[C10]]
+! STATIC_LEN-NEXT:   omp.yield(%[[PRIV_DECL]]#0
+
+! STATIC_LEN-NEXT: } copy {
+! STATIC_LEN-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+! STATIC_LEN-NEXT:   hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]]
+
+! STATIC_LEN-NEXT:   omp.yield(%[[PRIV_PRIV_ARG]]
+! STATIC_LEN-NEXT: }
+
+!--- dyn_len.f90
+subroutine delayed_privatization_character_array_dynamic_len(var1, char_len, array_len)
+  implicit none
+  integer(8):: char_len
+  integer(8):: array_len
+  character(len = char_len)  :: var1(array_len)
+
+!$omp parallel private(var1)
+  var1(1) = "test"
+!$omp end parallel
+end subroutine
+
+! DYN_LEN-LABEL: omp.private {type = private}
+! DYN_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.box<!fir.array<\?x!fir.char<1,\?>>>]] alloc {
+
+! DYN_LEN-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! DYN_LEN:        %[[C0:.*]] = arith.constant 0 : index
+! DYN_LEN-NEXT:   %[[BOX_DIM:.*]]:3 = fir.box_dims %[[PRIV_ARG]], %[[C0]]
+! DYN_LEN:        %[[CHAR_LEN:.*]] = fir.box_elesize %[[PRIV_ARG]]
+! DYN_LEN-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca !fir.array<?x!fir.char<1,?>>(%[[CHAR_LEN]] : index)
+! DYN_LEN-NEXT:   %[[ARRAY_SHAPE:.*]] = fir.shape
+! DYN_LEN-NEXT:   %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]](%[[ARRAY_SHAPE]]) typeparams %[[CHAR_LEN]]
+
+! DYN_LEN-NEXT:   omp.yield(%[[PRIV_DECL]]#0
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-character.f90 b/flang/test/Lower/OpenMP/delayed-privatization-character.f90
new file mode 100644
index 000000000000..db678ab13bbe
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-character.f90
@@ -0,0 +1,59 @@
+! Test delayed privatization for the `CHARACTER` type.
+
+! RUN: split-file %s %t
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %t/dyn_len.f90 2>&1 | FileCheck %s --check-prefix=DYN_LEN
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %t/dyn_len.f90 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=DYN_LEN
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %t/static_len.f90 2>&1 | FileCheck %s --check-prefix=STATIC_LEN
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %t/static_len.f90 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=STATIC_LEN
+
+!--- dyn_len.f90
+subroutine delayed_privatization_character(var1, l)
+  implicit none
+  integer(8):: l
+  character(len = l)  :: var1
+
+!$omp parallel firstprivate(var1)
+  var1 = "test"
+!$omp end parallel
+end subroutine
+
+! DYN_LEN-LABEL: omp.private {type = firstprivate}
+! DYN_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.boxchar<1>]] alloc {
+
+! DYN_LEN-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! DYN_LEN-NEXT:   %[[UNBOX:.*]]:2 = fir.unboxchar %[[PRIV_ARG]]
+! DYN_LEN:        %[[PRIV_ALLOC:.*]] = fir.alloca !fir.char<1,?>(%[[UNBOX]]#1 : index)
+! DYN_LEN-NEXT:   %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] typeparams %[[UNBOX]]#1
+! DYN_LEN-NEXT:   omp.yield(%[[PRIV_DECL]]#0 : !fir.boxchar<1>)
+
+! DYN_LEN-NEXT: } copy {
+! DYN_LEN-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+
+! DYN_LEN-NEXT:   hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]]
+
+! DYN_LEN-NEXT:   omp.yield(%[[PRIV_PRIV_ARG]] : !fir.boxchar<1>)
+! DYN_LEN-NEXT: }
+
+!--- static_len.f90
+subroutine delayed_privatization_character_static_len(var1)
+  implicit none
+  character(len = 10)  :: var1
+
+!$omp parallel private(var1)
+  var1 = "test"
+!$omp end parallel
+end subroutine
+
+! STATIC_LEN-LABEL: omp.private {type = private}
+! STATIC_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.char<1,10>>]] alloc {
+
+! STATIC_LEN-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! STATIC_LEN-NEXT:   %[[C10:.*]] = arith.constant 10 : index
+! STATIC_LEN-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca !fir.char<1,10>
+! STATIC_LEN-NEXT:   %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] typeparams %[[C10]]
diff --git a/flang/test/Lower/OpenMP/derived-type-map.f90 b/flang/test/Lower/OpenMP/derived-type-map.f90
new file mode 100644
index 000000000000..6121b450f062
--- /dev/null
+++ b/flang/test/Lower/OpenMP/derived-type-map.f90
@@ -0,0 +1,238 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+
+!CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.type<_QFmaptype_derived_implicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}> {bindc_name = "scalar_arr", uniq_name = "_QFmaptype_derived_implicitEscalar_arr"}
+!CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFmaptype_derived_implicitEscalar_arr"} : (!fir.ref<!fir.type<_QFmaptype_derived_implicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>) -> (!fir.ref<!fir.type<_QFmaptype_derived_implicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.ref<!fir.type<_QFmaptype_derived_implicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>)
+!CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref<!fir.type<_QFmaptype_derived_implicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.type<_QFmaptype_derived_implicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>) map_clauses(implicit, tofrom) capture(ByRef) -> !fir.ref<!fir.type<_QFmaptype_derived_implicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>> {name = "scalar_arr"}
+!CHECK:     omp.target map_entries(%[[MAP]] -> %[[ARG0:.*]] : !fir.ref<!fir.type<_QFmaptype_derived_implicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>) {
+!CHECK:         ^bb0(%[[ARG0]]: !fir.ref<!fir.type<_QFmaptype_derived_implicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>):
+subroutine mapType_derived_implicit
+    type :: scalar_and_array
+      real(4) :: real
+      integer(4) :: array(10)
+      integer(4) :: int
+    end type scalar_and_array
+    type(scalar_and_array) :: scalar_arr 
+    
+    !$omp target
+       scalar_arr%int = 1
+    !$omp end target
+end subroutine mapType_derived_implicit
+
+!CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.type<_QFmaptype_derived_explicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}> {bindc_name = "scalar_arr", uniq_name = "_QFmaptype_derived_explicitEscalar_arr"}
+!CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFmaptype_derived_explicitEscalar_arr"} : (!fir.ref<!fir.type<_QFmaptype_derived_explicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>) -> (!fir.ref<!fir.type<_QFmaptype_derived_explicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.ref<!fir.type<_QFmaptype_derived_explicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>)
+!CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[DECLARE]]#0 : !fir.ref<!fir.type<_QFmaptype_derived_explicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.type<_QFmaptype_derived_explicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>) map_clauses(tofrom) capture(ByRef) -> !fir.ref<!fir.type<_QFmaptype_derived_explicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>> {name = "scalar_arr"}
+!CHECK:  omp.target map_entries(%[[MAP]] -> %[[ARG0:.*]] : !fir.ref<!fir.type<_QFmaptype_derived_explicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>) {
+!CHECK:    ^bb0(%[[ARG0]]: !fir.ref<!fir.type<_QFmaptype_derived_explicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>):
+subroutine mapType_derived_explicit
+    type :: scalar_and_array
+      real(4) :: real
+      integer(4) :: array(10)
+      integer(4) :: int
+    end type scalar_and_array
+    type(scalar_and_array) :: scalar_arr 
+    
+    !$omp target map(tofrom: scalar_arr)
+       scalar_arr%int = 1
+    !$omp end target
+end subroutine mapType_derived_explicit
+
+!CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.type<_QFmaptype_derived_explicit_single_memberTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}> {bindc_name = "scalar_arr", uniq_name = "_QFmaptype_derived_explicit_single_memberEscalar_arr"}
+!CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFmaptype_derived_explicit_single_memberEscalar_arr"} : (!fir.ref<!fir.type<_QFmaptype_derived_explicit_single_memberTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>) -> (!fir.ref<!fir.type<_QFmaptype_derived_explicit_single_memberTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.ref<!fir.type<_QFmaptype_derived_explicit_single_memberTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>)
+!CHECK: %[[MEMBER:.*]] = hlfir.designate %[[DECLARE]]#0{"array"}   shape %{{.*}} : (!fir.ref<!fir.type<_QFmaptype_derived_explicit_single_memberTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.shape<1>) -> !fir.ref<!fir.array<10xi32>>
+!CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%{{.*}} : index) upper_bound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) start_idx(%{{.*}} : index)
+!CHECK: %[[MEMBER_MAP:.*]] = omp.map.info var_ptr(%[[MEMBER]] : !fir.ref<!fir.array<10xi32>>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<10xi32>> {name = "scalar_arr%array"}
+!CHECK: %[[PARENT_MAP:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref<!fir.type<_QFmaptype_derived_explicit_single_memberTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.type<_QFmaptype_derived_explicit_single_memberTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>) map_clauses(tofrom) capture(ByRef) members(%[[MEMBER_MAP]] : [1] : !fir.ref<!fir.array<10xi32>>) -> !fir.ref<!fir.type<_QFmaptype_derived_explicit_single_memberTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>> {name = "scalar_arr", partial_map = true}
+!CHECK: omp.target map_entries(%[[MEMBER_MAP]] -> %[[ARG0:.*]], %[[PARENT_MAP]] -> %[[ARG1:.*]] : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.type<_QFmaptype_derived_explicit_single_memberTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>) {
+!CHECK:  ^bb0(%[[ARG0]]: !fir.ref<!fir.array<10xi32>>, %[[ARG1]]: !fir.ref<!fir.type<_QFmaptype_derived_explicit_single_memberTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>):
+subroutine mapType_derived_explicit_single_member
+    type :: scalar_and_array
+      real(4) :: real
+      integer(4) :: array(10)
+      integer(4) :: int
+    end type scalar_and_array
+    type(scalar_and_array) :: scalar_arr 
+    
+    !$omp target map(tofrom: scalar_arr%array)
+       scalar_arr%array(1) = 1
+    !$omp end target
+end subroutine mapType_derived_explicit_single_member
+
+!CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.type<_QFmaptype_derived_explicit_multiple_membersTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}> {bindc_name = "scalar_arr", uniq_name = "_QFmaptype_derived_explicit_multiple_membersEscalar_arr"}
+!CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFmaptype_derived_explicit_multiple_membersEscalar_arr"} : (!fir.ref<!fir.type<_QFmaptype_derived_explicit_multiple_membersTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>) -> (!fir.ref<!fir.type<_QFmaptype_derived_explicit_multiple_membersTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.ref<!fir.type<_QFmaptype_derived_explicit_multiple_membersTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>)
+!CHECK: %[[MEMBER1:.*]] = hlfir.designate %[[DECLARE]]#0{"int"}   : (!fir.ref<!fir.type<_QFmaptype_derived_explicit_multiple_membersTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>) -> !fir.ref<i32>
+!CHECK: %[[MEMBER_MAP_1:.*]] = omp.map.info var_ptr(%[[MEMBER1]] : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "scalar_arr%int"}
+!CHECK: %[[MEMBER2:.*]] = hlfir.designate %[[DECLARE]]#0{"real"}   : (!fir.ref<!fir.type<_QFmaptype_derived_explicit_multiple_membersTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>) -> !fir.ref<f32>
+!CHECK: %[[MEMBER_MAP_2:.*]] = omp.map.info var_ptr(%[[MEMBER2]] : !fir.ref<f32>, f32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<f32> {name = "scalar_arr%real"}
+!CHECK: %[[PARENT_MAP:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref<!fir.type<_QFmaptype_derived_explicit_multiple_membersTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.type<_QFmaptype_derived_explicit_multiple_membersTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>) map_clauses(tofrom) capture(ByRef) members(%[[MEMBER_MAP_1]], %[[MEMBER_MAP_2]] : [2], [0] : !fir.ref<i32>, !fir.ref<f32>) -> !fir.ref<!fir.type<_QFmaptype_derived_explicit_multiple_membersTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>> {name = "scalar_arr", partial_map = true}
+!CHECK: omp.target map_entries(%[[MEMBER_MAP_1]] -> %[[ARG0:.*]], %[[MEMBER_MAP_2]] -> %[[ARG1:.*]], %[[PARENT_MAP]] -> %[[ARG2:.*]] : !fir.ref<i32>, !fir.ref<f32>, !fir.ref<!fir.type<_QFmaptype_derived_explicit_multiple_membersTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>) {
+!CHECK:  ^bb0(%[[ARG0]]: !fir.ref<i32>, %[[ARG1]]: !fir.ref<f32>, %[[ARG2]]: !fir.ref<!fir.type<_QFmaptype_derived_explicit_multiple_membersTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>):
+subroutine mapType_derived_explicit_multiple_members
+    type :: scalar_and_array
+      real(4) :: real
+      integer(4) :: array(10)
+      integer(4) :: int
+    end type scalar_and_array
+    type(scalar_and_array) :: scalar_arr 
+    
+    !$omp target map(tofrom: scalar_arr%int, scalar_arr%real)
+       scalar_arr%int = 1
+    !$omp end target
+end subroutine mapType_derived_explicit_multiple_members
+  
+!CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.type<_QFmaptype_derived_explicit_member_with_boundsTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}> {bindc_name = "scalar_arr", uniq_name = "_QFmaptype_derived_explicit_member_with_boundsEscalar_arr"}
+!CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFmaptype_derived_explicit_member_with_boundsEscalar_arr"} : (!fir.ref<!fir.type<_QFmaptype_derived_explicit_member_with_boundsTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>) -> (!fir.ref<!fir.type<_QFmaptype_derived_explicit_member_with_boundsTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.ref<!fir.type<_QFmaptype_derived_explicit_member_with_boundsTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>)
+!CHECK: %[[MEMBER:.*]] = hlfir.designate %[[DECLARE]]#0{"array"}   shape %{{.*}} : (!fir.ref<!fir.type<_QFmaptype_derived_explicit_member_with_boundsTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.shape<1>) -> !fir.ref<!fir.array<10xi32>>
+!CHECK: %{{.*}} = arith.constant 1 : index
+!CHECK: %[[LB:.*]] = arith.constant 1 : index
+!CHECK: %[[UB:.*]] = arith.constant 4 : index
+!CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[LB]] : index) upper_bound(%[[UB]] : index) extent(%{{.*}} : index) stride(%{{.*}} : index) start_idx(%{{.*}} : index)
+!CHECK: %[[MEMBER_MAP:.*]] = omp.map.info var_ptr(%[[MEMBER]] : !fir.ref<!fir.array<10xi32>>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%20) -> !fir.ref<!fir.array<10xi32>> {name = "scalar_arr%array(2:5)"}
+!CHECK: %[[PARENT_MAP:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref<!fir.type<_QFmaptype_derived_explicit_member_with_boundsTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.type<_QFmaptype_derived_explicit_member_with_boundsTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>) map_clauses(tofrom) capture(ByRef) members(%[[MEMBER_MAP]] : [1] : !fir.ref<!fir.array<10xi32>>) -> !fir.ref<!fir.type<_QFmaptype_derived_explicit_member_with_boundsTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>> {name = "scalar_arr", partial_map = true}
+!CHECK: omp.target map_entries(%[[MEMBER_MAP]] -> %[[ARG0:.*]], %[[PARENT_MAP]] -> %[[ARG1:.*]] : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.type<_QFmaptype_derived_explicit_member_with_boundsTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>) {
+!CHECK: ^bb0(%[[ARG0]]: !fir.ref<!fir.array<10xi32>>, %[[ARG1]]: !fir.ref<!fir.type<_QFmaptype_derived_explicit_member_with_boundsTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>>):
+subroutine mapType_derived_explicit_member_with_bounds
+    type :: scalar_and_array
+      real(4) :: real
+      integer(4) :: array(10)
+      integer(4) :: int
+    end type scalar_and_array
+    type(scalar_and_array) :: scalar_arr 
+    
+    !$omp target map(tofrom: scalar_arr%array(2:5))
+       scalar_arr%array(3) = 3
+    !$omp end target
+end subroutine mapType_derived_explicit_member_with_bounds
+
+!CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.type<_QFmaptype_derived_nested_explicit_single_memberTscalar_and_array{real:f32,array:!fir.array<10xi32>,nest:!fir.type<_QFmaptype_derived_nested_explicit_single_memberTnested{int:i32,real:f32,array:!fir.array<10xi32>}>,int:i32}> {bindc_name = "scalar_arr", uniq_name = "_QFmaptype_derived_nested_explicit_single_memberEscalar_arr"}
+!CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFmaptype_derived_nested_explicit_single_memberEscalar_arr"} : (!fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_single_memberTscalar_and_array{real:f32,array:!fir.array<10xi32>,nest:!fir.type<_QFmaptype_derived_nested_explicit_single_memberTnested{int:i32,real:f32,array:!fir.array<10xi32>}>,int:i32}>>) -> {{.*}}
+!CHECK: %[[NEST:.*]] = hlfir.designate %[[DECLARE]]#0{"nest"}   : (!fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_single_memberTscalar_and_array{real:f32,array:!fir.array<10xi32>,nest:!fir.type<_QFmaptype_derived_nested_explicit_single_memberTnested{int:i32,real:f32,array:!fir.array<10xi32>}>,int:i32}>>) -> {{.*}}
+!CHECK: %[[NEST_MEMBER:.*]] = hlfir.designate %[[NEST]]{"array"}   shape %{{.*}} : (!fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_single_memberTnested{int:i32,real:f32,array:!fir.array<10xi32>}>>, !fir.shape<1>) -> !fir.ref<!fir.array<10xi32>>
+!CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%{{.*}} : index) upper_bound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) start_idx(%{{.*}} : index)
+!CHECK: %[[MEMBER_MAP:.*]] = omp.map.info var_ptr(%[[NEST_MEMBER]] : !fir.ref<!fir.array<10xi32>>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<10xi32>> {name = "scalar_arr%nest%array"}
+!CHECK: %[[PARENT_MAP:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : {{.*}}) map_clauses(tofrom) capture(ByRef) members(%35 : [2,2] : !fir.ref<!fir.array<10xi32>>) -> {{.*}} {name = "scalar_arr", partial_map = true}
+!CHECK: omp.target map_entries(%[[MEMBER_MAP]] -> %[[ARG0:.*]], %[[PARENT_MAP]] -> %[[ARG1:.*]] : {{.*}}, {{.*}}) {
+!CHECK:  ^bb0(%[[ARG0]]: {{.*}}, %[[ARG1]]: {{.*}}):
+subroutine mapType_derived_nested_explicit_single_member
+  type :: nested
+    integer(4) :: int
+    real(4) :: real
+    integer(4) :: array(10)
+  end type nested
+
+  type :: scalar_and_array
+    real(4) :: real
+    integer(4) :: array(10)
+    type(nested) :: nest
+    integer(4) :: int
+  end type scalar_and_array
+  
+  type(scalar_and_array) :: scalar_arr 
+
+  !$omp target map(tofrom: scalar_arr%nest%array)
+    scalar_arr%nest%array(1) = 1
+  !$omp end target
+end subroutine mapType_derived_nested_explicit_single_member
+
+!CHECK: %[[ALLOCA:.*]] = fir.alloca {{.*}} {bindc_name = "scalar_arr", uniq_name = "_QFmaptype_derived_nested_explicit_multiple_membersEscalar_arr"}
+!CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFmaptype_derived_nested_explicit_multiple_membersEscalar_arr"} : ({{.*}}) -> {{.*}}
+!CHECK: %[[NEST:.*]] = hlfir.designate %[[DECLARE]]#0{"nest"}   : ({{.*}}) -> {{.*}}
+!CHECK: %[[NEST_MEMBER1:.*]] = hlfir.designate %[[NEST]]{"int"}   : ({{.*}}) -> !fir.ref<i32>
+!CHECK: %[[MEMBER_MAP_1:.*]] = omp.map.info var_ptr(%[[NEST_MEMBER1]] : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "scalar_arr%nest%int"}
+!CHECK: %[[NEST:.*]] = hlfir.designate %[[DECLARE]]#0{"nest"}   : ({{.*}}) -> {{.*}}
+!CHECK: %[[NEST_MEMBER2:.*]] = hlfir.designate %[[NEST]]{"real"}   : ({{.*}}) -> !fir.ref<f32>
+!CHECK: %[[MEMBER_MAP_2:.*]] = omp.map.info var_ptr(%[[NEST_MEMBER2]] : !fir.ref<f32>, f32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<f32> {name = "scalar_arr%nest%real"}
+!CHECK: %[[PARENT_MAP:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : {{.*}}, {{.*}}) map_clauses(tofrom) capture(ByRef) members(%[[MEMBER_MAP_1]], %[[MEMBER_MAP_2]] : [2,0], [2,1] : !fir.ref<i32>, !fir.ref<f32>) -> {{.*}} {name = "scalar_arr", partial_map = true}
+!CHECK: omp.target map_entries(%[[MEMBER_MAP_1]] -> %[[ARG0:.*]], %[[MEMBER_MAP_2]] -> %[[ARG1:.*]], %[[PARENT_MAP]] -> %[[ARG2:.*]] : !fir.ref<i32>, !fir.ref<f32>, {{.*}}) {
+!CHECK: ^bb0(%[[ARG0]]: !fir.ref<i32>, %[[ARG1]]: !fir.ref<f32>, %[[ARG2]]: {{.*}}):
+subroutine mapType_derived_nested_explicit_multiple_members
+  type :: nested
+    integer(4) :: int
+    real(4) :: real
+    integer(4) :: array(10)
+  end type nested
+
+  type :: scalar_and_array
+    real(4) :: real
+    integer(4) :: array(10)
+    type(nested) :: nest
+    integer(4) :: int
+  end type scalar_and_array
+
+  type(scalar_and_array) :: scalar_arr 
+
+  !$omp target map(tofrom: scalar_arr%nest%int, scalar_arr%nest%real)
+    scalar_arr%nest%int = 1
+  !$omp end target
+end subroutine mapType_derived_nested_explicit_multiple_members
+
+!CHECK: %[[ALLOCA:.*]] = fir.alloca {{.*}} {bindc_name = "scalar_arr", uniq_name = "_QFmaptype_derived_nested_explicit_member_with_boundsEscalar_arr"}
+!CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFmaptype_derived_nested_explicit_member_with_boundsEscalar_arr"} : {{.*}} -> {{.*}}
+!CHECK: %[[NEST:.*]] = hlfir.designate %[[DECLARE]]#0{"nest"}   : {{.*}} -> {{.*}}
+!CHECK: %[[C10:.*]] = arith.constant 10 : index
+!CHECK: %[[NEST_MEMBER:.*]] = hlfir.designate %[[NEST]]{"array"}   {{.*}} : {{.*}} -> !fir.ref<!fir.array<10xi32>>
+!CHECK: %[[C1:.*]] = arith.constant 1 : index
+!CHECK: %[[C1_2:.*]] = arith.constant 1 : index
+!CHECK: %[[C4:.*]] = arith.constant 4 : index
+!CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[C1_2]] : index) upper_bound(%[[C4]] : index) extent(%[[C10]] : index) stride(%[[C1]] : index) start_idx(%[[C1]] : index)
+!CHECK: %[[MEMBER_MAP:.*]] = omp.map.info var_ptr(%[[NEST_MEMBER]] : !fir.ref<!fir.array<10xi32>>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<10xi32>> {name = "scalar_arr%nest%array(2:5)"}
+!CHECK: %[[PARENT_MAP:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : {{.*}}, {{.*}}) map_clauses(tofrom) capture(ByRef) members(%[[MEMBER_MAP]] : [2,2] : !fir.ref<!fir.array<10xi32>>) -> {{.*}} {name = "scalar_arr", partial_map = true}
+!CHECK: omp.target map_entries(%[[MEMBER_MAP]] -> %[[ARG0:.*]], %[[PARENT_MAP]] -> %[[ARG1:.*]] : !fir.ref<!fir.array<10xi32>>, {{.*}}) {
+!CHECK: ^bb0(%[[ARG0]]: !fir.ref<!fir.array<10xi32>>, %[[ARG1]]: {{.*}}):
+subroutine mapType_derived_nested_explicit_member_with_bounds
+  type :: nested
+    integer(4) :: int
+    real(4) :: real
+    integer(4) :: array(10)
+  end type nested
+
+  type :: scalar_and_array
+    real(4) :: real
+    integer(4) :: array(10)
+    type(nested) :: nest
+    integer(4) :: int
+  end type scalar_and_array
+  
+  type(scalar_and_array) :: scalar_arr 
+  
+  !$omp target map(tofrom: scalar_arr%nest%array(2:5))
+    scalar_arr%nest%array(3) = 3
+  !$omp end target
+end subroutine mapType_derived_nested_explicit_member_with_bounds
+
+!CHECK: %[[ALLOCA_1:.*]] = fir.alloca {{.*}} {bindc_name = "scalar_arr1", uniq_name = "_QFmaptype_multilpe_derived_nested_explicit_memberEscalar_arr1"}
+!CHECK: %[[DECLARE_1:.*]]:2 = hlfir.declare %[[ALLOCA_1]] {uniq_name = "_QFmaptype_multilpe_derived_nested_explicit_memberEscalar_arr1"} : {{.*}} -> {{.*}}
+!CHECK: %[[ALLOCA_2:.*]] = fir.alloca {{.*}} {bindc_name = "scalar_arr2", uniq_name = "_QFmaptype_multilpe_derived_nested_explicit_memberEscalar_arr2"}
+!CHECK: %[[DECLARE_2:.*]]:2 = hlfir.declare %[[ALLOCA_2]] {uniq_name = "_QFmaptype_multilpe_derived_nested_explicit_memberEscalar_arr2"} : {{.*}} -> {{.*}}
+!CHECK: %[[PARENT_1:.*]] = hlfir.designate %[[DECLARE_1]]#0{"nest"}   : {{.*}} -> {{.*}}
+!CHECK: %[[MEMBER_1:.*]] = hlfir.designate %[[PARENT_1]]{"int"}   : {{.*}} -> !fir.ref<i32>
+!CHECK: %[[MAP_MEMBER_1:.*]] = omp.map.info var_ptr(%[[MEMBER_1]] : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "scalar_arr1%nest%int"}
+!CHECK: %[[PARENT_2:.*]] = hlfir.designate %[[DECLARE_2]]#0{"nest"}   : {{.*}} -> {{.*}}
+!CHECK: %[[MEMBER_2:.*]] = hlfir.designate %[[PARENT_2]]{"int"}   : {{.*}} -> !fir.ref<i32>
+!CHECK: %[[MAP_MEMBER_2:.*]] = omp.map.info var_ptr(%[[MEMBER_2]] : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "scalar_arr2%nest%int"}
+!CHECK: %[[MAP_PARENT_1:.*]] = omp.map.info var_ptr(%[[DECLARE_1]]#1 : {{.*}}) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBER_1]] : [2,0] : !fir.ref<i32>) -> {{.*}} {name = "scalar_arr1", partial_map = true}
+!CHECK: %[[MAP_PARENT_2:.*]] = omp.map.info var_ptr(%[[DECLARE_2]]#1 : {{.*}}) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBER_2]] : [2,0] : !fir.ref<i32>) -> {{.*}} {name = "scalar_arr2", partial_map = true}
+!CHECK: omp.target map_entries(%[[MAP_MEMBER_1]] -> %[[ARG0:.*]], %[[MAP_PARENT_1]] -> %[[ARG1:.*]], %[[MAP_MEMBER_2]] -> %[[ARG2:.*]], %[[MAP_PARENT_2:.*]] -> %[[ARG3:.*]] : !fir.ref<i32>, {{.*}}, !fir.ref<i32>, {{.*}}) {
+!CHECK: ^bb0(%[[ARG0]]: !fir.ref<i32>, %[[ARG1]]: {{.*}}, %[[ARG2]]: !fir.ref<i32>, %[[ARG3]]: {{.*}}):
+subroutine mapType_multilpe_derived_nested_explicit_member
+  type :: nested
+    integer(4) :: int
+    real(4) :: real
+    integer(4) :: array(10)
+  end type nested
+
+  type :: scalar_and_array
+    real(4) :: real
+    integer(4) :: array(10)
+    type(nested) :: nest
+    integer(4) :: int
+  end type scalar_and_array
+  
+  type(scalar_and_array) :: scalar_arr1
+  type(scalar_and_array) :: scalar_arr2
+
+!$omp target map(tofrom:scalar_arr1%nest%int, scalar_arr2%nest%int)
+  scalar_arr1%nest%int = 3
+  scalar_arr2%nest%int = 2
+!$omp end target
+end subroutine mapType_multilpe_derived_nested_explicit_member
diff --git a/flang/test/Lower/OpenMP/flush.f90 b/flang/test/Lower/OpenMP/flush.f90
index ad2e3609ebef..8438fdba4ee4 100644
--- a/flang/test/Lower/OpenMP/flush.f90
+++ b/flang/test/Lower/OpenMP/flush.f90
@@ -7,9 +7,9 @@
 subroutine flush_standalone(a, b, c)
     integer, intent(inout) :: a, b, c
 
-!CHECK:    %[[A:.*]]:2 = hlfir.declare %[[ARG_A]] {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK:    %[[B:.*]]:2 = hlfir.declare %[[ARG_B]] {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK:    %[[C:.*]]:2 = hlfir.declare %[[ARG_C]] {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEc"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[A:.*]]:2 = hlfir.declare %[[ARG_A]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[B:.*]]:2 = hlfir.declare %[[ARG_B]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C:.*]]:2 = hlfir.declare %[[ARG_C]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:    omp.flush(%[[A]]#1, %[[B]]#1, %[[C]]#1 : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>)
 !CHECK:    omp.flush
 !$omp flush(a,b,c)
@@ -21,9 +21,9 @@ end subroutine flush_standalone
 !CHECK-SAME: %[[ARG_A:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, %[[ARG_B:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}, %[[ARG_C:.*]]: !fir.ref<i32> {fir.bindc_name = "c"})
 subroutine flush_parallel(a, b, c)
     integer, intent(inout) :: a, b, c
-!CHECK:    %[[A:.*]]:2 = hlfir.declare %[[ARG_A]] {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK:    %[[B:.*]]:2 = hlfir.declare %[[ARG_B]] {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK:    %[[C:.*]]:2 = hlfir.declare %[[ARG_C]] {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEc"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[A:.*]]:2 = hlfir.declare %[[ARG_A]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[B:.*]]:2 = hlfir.declare %[[ARG_B]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C:.*]]:2 = hlfir.declare %[[ARG_C]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 
 !$omp parallel
 !CHECK:    omp.parallel
@@ -34,7 +34,7 @@ subroutine flush_parallel(a, b, c)
 
 !CHECK:      %[[A_VAL:.*]] = fir.load %[[A]]#0 : !fir.ref<i32>
 !CHECK:      %[[B_VAL:.*]] = fir.load %[[B]]#0 : !fir.ref<i32>
-!CHECK:      %[[C_VAL:.*]] = arith.addi %3, %4 : i32
+!CHECK:      %[[C_VAL:.*]] = arith.addi %[[A_VAL]], %[[B_VAL]] : i32
 !CHECK:      hlfir.assign %[[C_VAL]] to %[[C]]#0 : i32, !fir.ref<i32>
     c = a + b
 
diff --git a/flang/test/Lower/OpenMP/implicit-dsa.f90 b/flang/test/Lower/OpenMP/implicit-dsa.f90
new file mode 100644
index 000000000000..0f67d5bfd194
--- /dev/null
+++ b/flang/test/Lower/OpenMP/implicit-dsa.f90
@@ -0,0 +1,275 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+! Checks lowering of OpenMP variables with implicitly determined DSAs.
+
+! Basic cases.
+!CHECK-LABEL: func @_QPimplicit_dsa_test1
+!CHECK:       %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_test1Ex"}
+!CHECK:       %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_test1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:       %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_test1Ey"}
+!CHECK:       %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFimplicit_dsa_test1Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:       %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFimplicit_dsa_test1Ez"}
+!CHECK:       %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFimplicit_dsa_test1Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:       omp.task {
+!CHECK-NEXT:    %[[PRIV_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test1Ey"}
+!CHECK-NEXT:    %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test1Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NEXT:    %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test1Ex"}
+!CHECK-NEXT:    %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NEXT:    %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK-NEXT:    hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK-NOT:     fir.alloca
+!CHECK:       }
+!CHECK:       omp.task {
+!CHECK-NOT:     fir.alloca
+!CHECK:       }
+subroutine implicit_dsa_test1
+  integer :: x, y, z
+
+  !$omp task private(y) shared(z)
+    x = y + z
+  !$omp end task
+
+  !$omp task default(shared)
+    x = y + z
+  !$omp end task
+end subroutine
+
+! Nested task with implicit firstprivate DSA variable.
+!CHECK-LABEL: func @_QPimplicit_dsa_test2
+!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_test2Ex"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_test2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.task {
+!CHECK:   omp.task {
+!CHECK:     %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test2Ex"}
+!CHECK:     %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:     %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK:     hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK:   }
+!CHECK: }
+subroutine implicit_dsa_test2
+  integer :: x
+
+  !$omp task
+    !$omp task
+      x = 1
+    !$omp end task
+  !$omp end task
+end subroutine
+
+! Nested tasks with implicit shared DSA variables.
+!CHECK-LABEL: func @_QPimplicit_dsa_test3
+!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_test3Ex"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_test3Ey"}
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFimplicit_dsa_test3Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFimplicit_dsa_test3Ez"}
+!CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFimplicit_dsa_test3Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.parallel {
+!CHECK:   omp.task {
+!CHECK:     %[[ONE:.*]] = arith.constant 1 : i32
+!CHECK:     hlfir.assign %[[ONE]] to %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:     %[[ONE:.*]] = arith.constant 1 : i32
+!CHECK:     hlfir.assign %[[ONE]] to %[[Y_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:   }
+!CHECK:   omp.task {
+!CHECK:     %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test3Ex"}
+!CHECK:     %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:     %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK:     hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK:     %[[ONE:.*]] = arith.constant 1 : i32
+!CHECK:     hlfir.assign %[[ONE]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:     %[[ONE:.*]] = arith.constant 1 : i32
+!CHECK:     hlfir.assign %[[ONE]] to %[[Z_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:   }
+!CHECK: }
+subroutine implicit_dsa_test3
+  integer :: x, y, z
+
+  !$omp parallel
+    !$omp task
+      x = 1
+      y = 1
+    !$omp end task
+
+    !$omp task firstprivate(x)
+      x = 1
+      z = 1
+    !$omp end task
+  !$omp end parallel
+end subroutine
+
+! Task with implicit firstprivate DSA variables, enclosed in private context.
+!CHECK-LABEL: func @_QPimplicit_dsa_test4
+!CHECK:       %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_test4Ex"}
+!CHECK:       %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_test4Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:       %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_test4Ey"}
+!CHECK:       %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFimplicit_dsa_test4Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:       %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFimplicit_dsa_test4Ez"}
+!CHECK:       %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFimplicit_dsa_test4Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:       omp.parallel {
+!CHECK:         %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test4Ex"}
+!CHECK:         %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test4Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:         %[[PRIV_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFimplicit_dsa_test4Ez"}
+!CHECK:         %[[PRIV_Z_DECL:.*]]:2 = hlfir.declare %[[PRIV_Z]] {uniq_name = "_QFimplicit_dsa_test4Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:         %[[PRIV_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test4Ey"}
+!CHECK:         %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test4Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:         omp.task {
+!CHECK-NEXT:      %[[PRIV2_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test4Ex"}
+!CHECK-NEXT:      %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test4Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NEXT:      %[[TEMP:.*]] = fir.load %[[PRIV_X_DECL]]#0 : !fir.ref<i32>
+!CHECK-NEXT:      hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK-NEXT:      %[[PRIV2_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFimplicit_dsa_test4Ez"}
+!CHECK-NEXT:      %[[PRIV2_Z_DECL:.*]]:2 = hlfir.declare %[[PRIV2_Z]] {uniq_name = "_QFimplicit_dsa_test4Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NEXT:      %[[TEMP2:.*]] = fir.load %[[PRIV_Z_DECL]]#0 : !fir.ref<i32>
+!CHECK-NEXT:      hlfir.assign %[[TEMP2]] to %[[PRIV2_Z_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK:           %[[ZERO:.*]] = arith.constant 0 : i32
+!CHECK-NEXT:      hlfir.assign %[[ZERO]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:           %[[ONE:.*]] = arith.constant 1 : i32
+!CHECK-NEXT:      hlfir.assign %[[ONE]] to %[[PRIV2_Z_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:         }
+!CHECK:         omp.task {
+!CHECK-NEXT:      %[[PRIV2_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test4Ex"}
+!CHECK-NEXT:      %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test4Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NEXT:      %[[TEMP:.*]] = fir.load %[[PRIV_X_DECL]]#0 : !fir.ref<i32>
+!CHECK-NEXT:      hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK-NEXT:      %[[PRIV2_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test4Ey"}
+!CHECK-NEXT:      %[[PRIV2_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV2_Y]] {uniq_name = "_QFimplicit_dsa_test4Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NEXT:      %[[TEMP2:.*]] = fir.load %[[PRIV_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK-NEXT:      hlfir.assign %[[TEMP2]] to %[[PRIV2_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK:           %[[ONE:.*]] = arith.constant 1 : i32
+!CHECK-NEXT:      hlfir.assign %[[ONE]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:           %[[ZERO:.*]] = arith.constant 0 : i32
+!CHECK-NEXT:      hlfir.assign %[[ZERO]] to %[[PRIV2_Z_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:         }
+!CHECK:       }
+subroutine implicit_dsa_test4
+  integer :: x, y, z
+
+  !$omp parallel default(private)
+    !$omp task
+      x = 0
+      z = 1
+    !$omp end task
+
+    !$omp task
+      x = 1
+      y = 0
+    !$omp end task
+  !$omp end parallel
+end subroutine
+
+! Inner parallel using implicit firstprivate symbol.
+!CHECK-LABEL: func @_QPimplicit_dsa_test5
+!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_test5Ex"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_test5Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.parallel {
+!CHECK:     %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test5Ex"}
+!CHECK:     %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test5Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:   omp.task {
+!CHECK:     %[[PRIV2_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test5Ex"}
+!CHECK:     %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test5Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:      %[[TEMP:.*]] = fir.load %[[PRIV_X_DECL]]#0 : !fir.ref<i32>
+!CHECK:      hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK:     omp.parallel {
+!CHECK:       %[[ONE:.*]] = arith.constant 1 : i32
+!CHECK:       hlfir.assign %[[ONE]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:     }
+!CHECK:   }
+!CHECK: }
+subroutine implicit_dsa_test5
+  integer :: x
+
+  !$omp parallel default(private)
+    !$omp task
+      !$omp parallel
+        x = 1
+      !$omp end parallel
+    !$omp end task
+  !$omp end parallel
+end subroutine
+
+! Constructs nested inside a task with implicit DSA variables.
+!CHECK-LABEL: func @_QPimplicit_dsa_test6
+!CHECK:       %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_test6Ex"}
+!CHECK:       %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_test6Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:       %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_test6Ey"}
+!CHECK:       %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFimplicit_dsa_test6Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:       %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFimplicit_dsa_test6Ez"}
+!CHECK:       %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFimplicit_dsa_test6Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:       omp.task {
+!CHECK-NEXT:    %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test6Ex"}
+!CHECK-NEXT:    %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test6Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NEXT:    %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK-NEXT:    hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK-NEXT:    %[[PRIV_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test6Ey"}
+!CHECK-NEXT:    %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test6Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NEXT:    %[[TEMP2:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
+!CHECK-NEXT:    hlfir.assign %[[TEMP2]] to %[[PRIV_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK-NEXT:    %[[PRIV_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFimplicit_dsa_test6Ez"}
+!CHECK-NEXT:    %[[PRIV_Z_DECL:.*]]:2 = hlfir.declare %[[PRIV_Z]] {uniq_name = "_QFimplicit_dsa_test6Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NEXT:    %[[TEMP3:.*]] = fir.load %[[Z_DECL]]#0 : !fir.ref<i32>
+!CHECK-NEXT:    hlfir.assign %[[TEMP3]] to %[[PRIV_Z_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK:         omp.parallel {
+!CHECK:           %[[PRIV2_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test6Ex"}
+!CHECK:           %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test6Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NOT:       hlfir.assign
+!CHECK:           %[[PRIV2_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test6Ey"}
+!CHECK:           %[[PRIV2_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV2_Y]] {uniq_name = "_QFimplicit_dsa_test6Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NOT:       hlfir.assign
+!CHECK:           hlfir.assign %{{.*}} to %[[PRIV2_X_DECL]]
+!CHECK:         }
+!CHECK:         omp.parallel {
+!CHECK-NEXT:      %[[PRIV3_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test6Ex"}
+!CHECK-NEXT:      %[[PRIV3_X_DECL:.*]]:2 = hlfir.declare %[[PRIV3_X]] {uniq_name = "_QFimplicit_dsa_test6Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NEXT:      %[[TEMP4:.*]] = fir.load %[[PRIV_X_DECL]]#0 : !fir.ref<i32>
+!CHECK-NEXT:      hlfir.assign %[[TEMP4]] to %[[PRIV3_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK-NEXT:      %[[PRIV3_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFimplicit_dsa_test6Ez"}
+!CHECK-NEXT:      %[[PRIV3_Z_DECL:.*]]:2 = hlfir.declare %[[PRIV3_Z]] {uniq_name = "_QFimplicit_dsa_test6Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NEXT:      %[[TEMP5:.*]] = fir.load %[[PRIV_Z_DECL]]#0 : !fir.ref<i32>
+!CHECK-NEXT:      hlfir.assign %[[TEMP5]] to %[[PRIV3_Z_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK:           hlfir.assign %{{.*}} to %[[PRIV_Y_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:         }
+!CHECK:       }
+subroutine implicit_dsa_test6
+  integer :: x, y, z
+
+  !$omp task
+    !$omp parallel default(private)
+      x = y
+    !$omp end parallel
+
+    !$omp parallel default(firstprivate) shared(y)
+      y = x + z
+    !$omp end parallel
+  !$omp end task
+end subroutine
+
+! Test taskgroup - it uses the same scope as task.
+!CHECK-LABEL: func @_QPimplicit_dsa_test7
+!CHECK:       %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_test7Ex"}
+!CHECK:       %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_test7Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:       %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_test7Ey"}
+!CHECK:       %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFimplicit_dsa_test7Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:       omp.task {
+!CHECK:         omp.taskgroup {
+!CHECK-NEXT:      %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test7Ex"}
+!CHECK-NEXT:      %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test7Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NEXT:      %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK-NEXT:      hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK-NEXT:      %[[PRIV_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test7Ey"}
+!CHECK-NEXT:      %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test7Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-NEXT:      %[[TEMP2:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
+!CHECK-NEXT:      hlfir.assign %[[TEMP2]] to %[[PRIV_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK:         }
+!CHECK:       }
+subroutine implicit_dsa_test7
+  integer :: x, y
+
+  !$omp task
+    !$omp taskgroup
+      x = y
+    !$omp end taskgroup
+  !$omp end task
+end subroutine
+
+! TODO Test taskloop
diff --git a/flang/test/Lower/OpenMP/map-component-ref.f90 b/flang/test/Lower/OpenMP/map-component-ref.f90
index 435bc8e6bd36..2c582667f38d 100644
--- a/flang/test/Lower/OpenMP/map-component-ref.f90
+++ b/flang/test/Lower/OpenMP/map-component-ref.f90
@@ -5,7 +5,7 @@
 ! CHECK: %[[V1:[0-9]+]]:2 = hlfir.declare %[[V0]] {uniq_name = "_QFfooEa"} : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) -> (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>, !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>)
 ! CHECK: %[[V2:[0-9]+]] = hlfir.designate %[[V1]]#0{"a1"}   : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) -> !fir.ref<i32>
 ! CHECK: %[[V3:[0-9]+]] = omp.map.info var_ptr(%[[V2]] : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "a%a1"}
-! CHECK: %[[V4:[0-9]+]] = omp.map.info var_ptr(%[[V1]]#1 : !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>, !fir.type<_QFfooTt0{a0:i32,a1:i32}>) map_clauses(implicit, tofrom) capture(ByRef) -> !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>> {name = "a"}
+! CHECK: %[[V4:[0-9]+]] = omp.map.info var_ptr(%[[V1]]#1 : !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>, !fir.type<_QFfooTt0{a0:i32,a1:i32}>) map_clauses(tofrom) capture(ByRef) members(%[[V3]] : [1] : !fir.ref<i32>) -> !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>> {name = "a", partial_map = true}
 ! CHECK: omp.target map_entries(%[[V3]] -> %arg0, %[[V4]] -> %arg1 : !fir.ref<i32>, !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) {
 ! CHECK: ^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>):
 ! CHECK:   %[[V5:[0-9]+]]:2 = hlfir.declare %arg1 {uniq_name = "_QFfooEa"} : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) -> (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>, !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>)
diff --git a/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90
index 6402f98a2add..93dcd4b74b00 100644
--- a/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90
+++ b/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90
@@ -4,8 +4,8 @@
 ! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s --check-prefix=CHECK
 
 !CHECK-DAG: func @_QPfirstprivate_complex(%[[ARG1:.*]]: !fir.ref<!fir.complex<4>>{{.*}}, %[[ARG2:.*]]: !fir.ref<!fir.complex<8>>{{.*}}) {
-!CHECK:    %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFfirstprivate_complexEarg1"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
-!CHECK:    %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFfirstprivate_complexEarg2"} : (!fir.ref<!fir.complex<8>>) -> (!fir.ref<!fir.complex<8>>, !fir.ref<!fir.complex<8>>)
+!CHECK:    %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_complexEarg1"} : (!fir.ref<!fir.complex<4>>, !fir.dscope) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
+!CHECK:    %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_complexEarg2"} : (!fir.ref<!fir.complex<8>>, !fir.dscope) -> (!fir.ref<!fir.complex<8>>, !fir.ref<!fir.complex<8>>)
 !CHECK:   omp.parallel {
 !CHECK:     %[[ARG1_PVT:.*]] = fir.alloca !fir.complex<4> {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_complexEarg1"}
 !CHECK:     %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_complexEarg1"} : (!fir.ref<!fir.complex<4>>) -> (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<4>>)
@@ -30,12 +30,12 @@ subroutine firstprivate_complex(arg1, arg2)
 end subroutine
 
 !CHECK-DAG: func @_QPfirstprivate_integer(%[[ARG1:.*]]: !fir.ref<i32>{{.*}}, %[[ARG2:.*]]: !fir.ref<i8>{{.*}}, %[[ARG3:.*]]: !fir.ref<i16>{{.*}}, %[[ARG4:.*]]: !fir.ref<i32>{{.*}}, %[[ARG5:.*]]: !fir.ref<i64>{{.*}}, %[[ARG6:.*]]: !fir.ref<i128>{{.*}}) {
-!CHECK:  %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFfirstprivate_integerEarg1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK:  %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFfirstprivate_integerEarg2"} : (!fir.ref<i8>) -> (!fir.ref<i8>, !fir.ref<i8>)
-!CHECK:  %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] {uniq_name = "_QFfirstprivate_integerEarg3"} : (!fir.ref<i16>) -> (!fir.ref<i16>, !fir.ref<i16>)
-!CHECK:  %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] {uniq_name = "_QFfirstprivate_integerEarg4"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK:  %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] {uniq_name = "_QFfirstprivate_integerEarg5"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
-!CHECK:  %[[ARG6_DECL:.*]]:2 = hlfir.declare %[[ARG6]] {uniq_name = "_QFfirstprivate_integerEarg6"} : (!fir.ref<i128>) -> (!fir.ref<i128>, !fir.ref<i128>)
+!CHECK:  %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg2"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>)
+!CHECK:  %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg3"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>)
+!CHECK:  %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg4"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg5"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+!CHECK:  %[[ARG6_DECL:.*]]:2 = hlfir.declare %[[ARG6]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg6"} : (!fir.ref<i128>, !fir.dscope) -> (!fir.ref<i128>, !fir.ref<i128>)
 !CHECK:  omp.parallel {
 !CHECK:    %[[ARG1_PVT:.*]] = fir.alloca i32 {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_integerEarg1"}
 !CHECK:    %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_integerEarg1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
@@ -81,11 +81,11 @@ subroutine firstprivate_integer(arg1, arg2, arg3, arg4, arg5, arg6)
 end subroutine
 
 !CHECK-DAG: func @_QPfirstprivate_logical(%[[ARG1:.*]]: !fir.ref<!fir.logical<4>>{{.*}}, %[[ARG2:.*]]: !fir.ref<!fir.logical<1>>{{.*}}, %[[ARG3:.*]]: !fir.ref<!fir.logical<2>>{{.*}}, %[[ARG4:.*]]: !fir.ref<!fir.logical<4>>{{.*}}, %[[ARG5:.*]]: !fir.ref<!fir.logical<8>>{{.*}}) {
-!CHECK:    %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFfirstprivate_logicalEarg1"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-!CHECK:    %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFfirstprivate_logicalEarg2"} : (!fir.ref<!fir.logical<1>>) -> (!fir.ref<!fir.logical<1>>, !fir.ref<!fir.logical<1>>)
-!CHECK:    %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] {uniq_name = "_QFfirstprivate_logicalEarg3"} : (!fir.ref<!fir.logical<2>>) -> (!fir.ref<!fir.logical<2>>, !fir.ref<!fir.logical<2>>)
-!CHECK:    %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] {uniq_name = "_QFfirstprivate_logicalEarg4"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-!CHECK:    %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] {uniq_name = "_QFfirstprivate_logicalEarg5"} : (!fir.ref<!fir.logical<8>>) -> (!fir.ref<!fir.logical<8>>, !fir.ref<!fir.logical<8>>)
+!CHECK:    %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg1"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:    %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg2"} : (!fir.ref<!fir.logical<1>>, !fir.dscope) -> (!fir.ref<!fir.logical<1>>, !fir.ref<!fir.logical<1>>)
+!CHECK:    %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg3"} : (!fir.ref<!fir.logical<2>>, !fir.dscope) -> (!fir.ref<!fir.logical<2>>, !fir.ref<!fir.logical<2>>)
+!CHECK:    %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg4"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+!CHECK:    %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg5"} : (!fir.ref<!fir.logical<8>>, !fir.dscope) -> (!fir.ref<!fir.logical<8>>, !fir.ref<!fir.logical<8>>)
 !CHECK:   omp.parallel {
 !CHECK:     %[[ARG1_PVT:.*]] = fir.alloca !fir.logical<4> {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_logicalEarg1"}
 !CHECK:     %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_logicalEarg1"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
@@ -125,12 +125,12 @@ subroutine firstprivate_logical(arg1, arg2, arg3, arg4, arg5)
 end subroutine
 
 !CHECK-DAG: func @_QPfirstprivate_real(%[[ARG1:.*]]: !fir.ref<f32>{{.*}}, %[[ARG2:.*]]: !fir.ref<f16>{{.*}}, %[[ARG3:.*]]: !fir.ref<f32>{{.*}}, %[[ARG4:.*]]: !fir.ref<f64>{{.*}}, %[[ARG5:.*]]: !fir.ref<f80>{{.*}}, %[[ARG6:.*]]: !fir.ref<f128>{{.*}}) {
-!CHECK:   %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFfirstprivate_realEarg1"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-!CHECK:   %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFfirstprivate_realEarg2"} : (!fir.ref<f16>) -> (!fir.ref<f16>, !fir.ref<f16>)
-!CHECK:   %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] {uniq_name = "_QFfirstprivate_realEarg3"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-!CHECK:   %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] {uniq_name = "_QFfirstprivate_realEarg4"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
-!CHECK:   %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] {uniq_name = "_QFfirstprivate_realEarg5"} : (!fir.ref<f80>) -> (!fir.ref<f80>, !fir.ref<f80>)
-!CHECK:   %[[ARG6_DECL:.*]]:2 = hlfir.declare %[[ARG6]] {uniq_name = "_QFfirstprivate_realEarg6"} : (!fir.ref<f128>) -> (!fir.ref<f128>, !fir.ref<f128>)
+!CHECK:   %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg1"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:   %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg2"} : (!fir.ref<f16>, !fir.dscope) -> (!fir.ref<f16>, !fir.ref<f16>)
+!CHECK:   %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg3"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:   %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg4"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>)
+!CHECK:   %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg5"} : (!fir.ref<f80>, !fir.dscope) -> (!fir.ref<f80>, !fir.ref<f80>)
+!CHECK:   %[[ARG6_DECL:.*]]:2 = hlfir.declare %[[ARG6]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg6"} : (!fir.ref<f128>, !fir.dscope) -> (!fir.ref<f128>, !fir.ref<f128>)
 !CHECK:   omp.parallel {
 !CHECK:     %[[ARG1_PVT:.*]] = fir.alloca f32 {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_realEarg1"}
 !CHECK:     %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_realEarg1"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
@@ -177,8 +177,8 @@ end subroutine
 !CHECK-LABEL:   func.func @_QPmultiple_firstprivate(
 !CHECK-SAME:                                        %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 !CHECK-SAME:                                        %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
-!CHECK:           %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] {uniq_name = "_QFmultiple_firstprivateEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK:           %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] {uniq_name = "_QFmultiple_firstprivateEb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:           %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:           %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_firstprivateEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:           omp.parallel   {
 !CHECK:             %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFmultiple_firstprivateEa"}
 !CHECK:             %[[A_PRIV_DECL:.*]]:2 = hlfir.declare %[[A_PRIV_ADDR]] {uniq_name = "_QFmultiple_firstprivateEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
diff --git a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90
index bb81e5eac62f..e6ee75c8a5be 100644
--- a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90
+++ b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90
@@ -7,7 +7,7 @@
 !CHECK-DAG: %[[ARG1_UNBOX:.*]]:2 = fir.unboxchar
 !CHECK-DAG: %[[FIVE:.*]] = arith.constant 5 : index
 !CHECK-DAG: %[[ARG1_REF:.*]] = fir.convert %[[ARG1_UNBOX]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,5>>
-!CHECK-DAG: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1_REF]] typeparams %[[FIVE]] {uniq_name = "_QFlastprivate_characterEarg1"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>)
+!CHECK-DAG: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1_REF]] typeparams %[[FIVE]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFlastprivate_characterEarg1"} : (!fir.ref<!fir.char<1,5>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>)
 
 !CHECK: omp.parallel {
 !CHECK-DAG: %[[ARG1_PVT:.*]] = fir.alloca !fir.char<1,5> {bindc_name = "arg1",
@@ -57,7 +57,7 @@ end do
 end subroutine
 
 !CHECK: func @_QPlastprivate_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}) {
-!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFlastprivate_intEarg1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFlastprivate_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK-DAG: omp.parallel  {
 !CHECK-DAG: %[[CLONE:.*]] = fir.alloca i32 {bindc_name = "arg1"
 !CHECK-DAG: %[[CLONE_DECL:.*]]:2 = hlfir.declare %[[CLONE]] {uniq_name = "_QFlastprivate_intEarg1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
@@ -97,8 +97,8 @@ print *, arg1
 end subroutine
 
 !CHECK: func.func @_QPmult_lastprivate_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) {
-!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFmult_lastprivate_intEarg1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFmult_lastprivate_intEarg2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_intEarg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel  {
 !CHECK-DAG: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
 !CHECK-DAG: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFmult_lastprivate_intEarg1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
@@ -142,13 +142,13 @@ print *, arg1, arg2
 end subroutine
 
 !CHECK: func.func @_QPmult_lastprivate_int2(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) {
-!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %arg0 {uniq_name = "_QFmult_lastprivate_int2Earg1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %arg1 {uniq_name = "_QFmult_lastprivate_int2Earg2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_int2Earg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_int2Earg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel  {
-!CHECK-DAG: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
-!CHECK-DAG: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFmult_lastprivate_int2Earg1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK-DAG: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2"
 !CHECK-DAG: %[[CLONE2_DECL:.*]]:2 = hlfir.declare %[[CLONE2]] {uniq_name = "_QFmult_lastprivate_int2Earg2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-DAG: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
+!CHECK-DAG: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFmult_lastprivate_int2Earg1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.wsloop {
 !CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} {
 
@@ -187,8 +187,8 @@ print *, arg1, arg2
 end subroutine
 
 !CHECK: func.func @_QPfirstpriv_lastpriv_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) {
-!CHECK:    %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFfirstpriv_lastpriv_intEarg1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK:    %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFfirstpriv_lastpriv_intEarg2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_intEarg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel  {
 ! Firstprivate update
 !CHECK: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
@@ -235,7 +235,7 @@ print *, arg1, arg2
 end subroutine
 
 !CHECK: func.func @_QPfirstpriv_lastpriv_int2(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}) {
-!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFfirstpriv_lastpriv_int2Earg1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_int2Earg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel  {
 ! Firstprivate update
 !CHECK: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
diff --git a/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90 b/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
index f8343338112c..d3843c8e241a 100644
--- a/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
+++ b/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
@@ -4,7 +4,7 @@
 
 ! CHECK-LABEL: multiple_private_fix
 ! CHECK-SAME:  %[[GAMA:.*]]: !fir.ref<i32> {fir.bindc_name = "gama"}
-! CHECK-DAG:         %[[GAMA_DECL:.*]]:2 = hlfir.declare %[[GAMA]] {uniq_name = "_QFmultiple_private_fixEgama"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-DAG:         %[[GAMA_DECL:.*]]:2 = hlfir.declare %[[GAMA]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_private_fixEgama"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK-DAG:         %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_private_fixEi"}
 ! CHECK-DAG:         %[[I_DECL:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_private_fixEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK-DAG:         %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFmultiple_private_fixEj"}
@@ -99,7 +99,7 @@ end subroutine
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "aaa"}) {
 ! CHECK:           %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
 ! CHECK:           %[[VAL_2:.*]] = fir.box_elesize %[[VAL_1]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> index
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub01Eaaa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, index) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub01Eaaa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
 ! CHECK:           omp.parallel {
 ! CHECK:             %[[VAL_4:.*]] = fir.alloca !fir.box<!fir.heap<!fir.char<1,?>>> {bindc_name = "aaa", pinned, uniq_name = "_QFsub01Eaaa"}
 ! CHECK:             %[[VAL_5:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
@@ -148,7 +148,7 @@ end subroutine
 
 ! CHECK-LABEL:   func.func @_QPsub02(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "bbb"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub02Ebbb"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub02Ebbb"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
 ! CHECK:           omp.parallel {
 ! CHECK:             %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<!fir.char<1,?>>> {bindc_name = "bbb", pinned, uniq_name = "_QFsub02Ebbb"}
 ! CHECK:             %[[VAL_3:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
diff --git a/flang/test/Lower/OpenMP/parallel-private-clause-str.f90 b/flang/test/Lower/OpenMP/parallel-private-clause-str.f90
index 025e51e06617..19ea37c5339b 100644
--- a/flang/test/Lower/OpenMP/parallel-private-clause-str.f90
+++ b/flang/test/Lower/OpenMP/parallel-private-clause-str.f90
@@ -30,8 +30,8 @@ subroutine test_allocatable_string(n)
   !$omp end parallel
 end subroutine
 
-!CHECK:  func.func @_QPtest_allocatable_string_array(%{{.*}}: !fir.ref<i32> {fir.bindc_name = "n"}) {
-!CHECK:    %0:2 = hlfir.declare %arg0 {uniq_name = "_QFtest_allocatable_string_arrayEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  func.func @_QPtest_allocatable_string_array(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
+!CHECK:    %{{.*}} = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_allocatable_string_arrayEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:    %[[C_BOX_REF:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>> {bindc_name = "c", uniq_name = "_QFtest_allocatable_string_arrayEc"}
 !CHECK:    %[[C_BOX:.*]] = fir.embox %{{.*}}(%{{.*}}) typeparams %{{.*}} : (!fir.heap<!fir.array<?x!fir.char<1,?>>>, !fir.shape<1>, i32) -> !fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>
 !CHECK:    fir.store %[[C_BOX]] to %[[C_BOX_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
diff --git a/flang/test/Lower/OpenMP/parallel-reduction3.f90 b/flang/test/Lower/OpenMP/parallel-reduction3.f90
index 2a4e338f255e..17d805c0d142 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction3.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction3.f90
@@ -52,7 +52,7 @@
 
 ! CHECK-LABEL:   func.func @_QPs(
 ! CHECK-SAME:                    %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) {
-! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsEi"}
 ! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90
index ac8b9f50f54e..c32eb2400a34 100644
--- a/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90
+++ b/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90
@@ -5,7 +5,7 @@
 
 ! CHECK: func @_QPomp_do_firstprivate(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}) 
 subroutine omp_do_firstprivate(a)
-  ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFomp_do_firstprivateEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   integer::a
   integer::n
   n = a+1
@@ -38,8 +38,8 @@ end subroutine omp_do_firstprivate
 
 ! CHECK: func @_QPomp_do_firstprivate2(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) 
 subroutine omp_do_firstprivate2(a, n)
-  ! CHECK:  %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFomp_do_firstprivate2Ea"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-  ! CHECK:  %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFomp_do_firstprivate2En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK:  %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_firstprivate2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK:  %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_firstprivate2En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   integer::a
   integer::n
   n = a+1
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop.f90 b/flang/test/Lower/OpenMP/parallel-wsloop.f90
index 602b3d1c05f0..5fa42da2269f 100644
--- a/flang/test/Lower/OpenMP/parallel-wsloop.f90
+++ b/flang/test/Lower/OpenMP/parallel-wsloop.f90
@@ -27,8 +27,8 @@ end subroutine
 ! CHECK-LABEL: func @_QPparallel_do_with_parallel_clauses
 ! CHECK-SAME: %[[COND_REF:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}, %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
 subroutine parallel_do_with_parallel_clauses(cond, nt)
-  ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_REF]] {uniq_name = "_QFparallel_do_with_parallel_clausesEcond"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-  ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] {uniq_name = "_QFparallel_do_with_parallel_clausesEnt"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_parallel_clausesEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+  ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_parallel_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   logical :: cond
   integer :: nt
   integer :: i
@@ -57,7 +57,7 @@ end subroutine
 ! CHECK-LABEL: func @_QPparallel_do_with_clauses
 ! CHECK-SAME: %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
 subroutine parallel_do_with_clauses(nt)
-  ! CHECK:  %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] {uniq_name = "_QFparallel_do_with_clausesEnt"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK:  %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   integer :: nt
   integer :: i
   ! CHECK:  %[[NT:.*]] = fir.load %[[NT_DECL]]#0 : !fir.ref<i32>
@@ -88,8 +88,8 @@ end subroutine
 ! CHECK-LABEL: func @_QPparallel_do_with_privatisation_clauses
 ! CHECK-SAME: %[[COND_REF:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}, %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
 subroutine parallel_do_with_privatisation_clauses(cond,nt)
-  ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_REF]] {uniq_name = "_QFparallel_do_with_privatisation_clausesEcond"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-  ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] {uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_privatisation_clausesEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+  ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   logical :: cond
   integer :: nt
   integer :: i
@@ -145,7 +145,7 @@ end subroutine parallel_private_do
 ! CHECK-LABEL:   func.func @_QPparallel_private_do(
 ! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"},
 ! CHECK-SAME:                                      %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}) {
-! CHECK:           %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFparallel_private_doEnt"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_private_doEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           omp.parallel   {
 ! CHECK:             %[[I_PRIV:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK:             %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV]] {uniq_name = "_QFparallel_private_doEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
@@ -191,8 +191,8 @@ end subroutine omp_parallel_multiple_firstprivate_do
 ! CHECK-LABEL:   func.func @_QPomp_parallel_multiple_firstprivate_do(
 ! CHECK-SAME:                                                        %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME:                                                        %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
-! CHECK:            %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:            %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:            %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:            %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           omp.parallel   {
 ! CHECK:             %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK:             %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
@@ -242,7 +242,7 @@ end subroutine parallel_do_private
 ! CHECK-LABEL:   func.func @_QPparallel_do_private(
 ! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"},
 ! CHECK-SAME:                                      %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}) {
-! CHECK:           %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFparallel_do_privateEnt"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_privateEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           omp.parallel   {
 ! CHECK:             %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK:             %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFparallel_do_privateEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
@@ -288,8 +288,8 @@ end subroutine omp_parallel_do_multiple_firstprivate
 ! CHECK-LABEL:   func.func @_QPomp_parallel_do_multiple_firstprivate(
 ! CHECK-SAME:                                                        %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME:                                                        %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
-! CHECK:           %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>
+! CHECK:           %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>
 ! CHECK:           omp.parallel {
 ! CHECK:             %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK:             %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
diff --git a/flang/test/Lower/OpenMP/sections.f90 b/flang/test/Lower/OpenMP/sections.f90
index 018848e63573..bd76bd53e5a0 100644
--- a/flang/test/Lower/OpenMP/sections.f90
+++ b/flang/test/Lower/OpenMP/sections.f90
@@ -10,12 +10,12 @@
 !CHECK:   %[[COUNT_DECL:.*]]:2 = hlfir.declare %[[COUNT]] {uniq_name = "_QFEcount"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:   %[[ETA:.*]] = fir.alloca f32 {bindc_name = "eta", uniq_name = "_QFEeta"}
 !CHECK:   %[[CONST_1:.*]] = arith.constant 4 : i64
+!CHECK:   %[[PRIVATE_ETA:.*]] = fir.alloca f32 {bindc_name = "eta", pinned, uniq_name = "_QFEeta"}
+!CHECK:   %[[PRIVATE_ETA_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_ETA]] {uniq_name = "_QFEeta"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:   %[[PRIVATE_DOUBLE_COUNT:.*]] = fir.alloca i32 {bindc_name = "double_count", pinned, uniq_name = "_QFEdouble_count"} 
+!CHECK:   %[[PRIVATE_DOUBLE_COUNT_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_DOUBLE_COUNT]] {uniq_name = "_QFEdouble_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:   omp.sections allocate(%[[CONST_1]] : i64 -> %[[COUNT_DECL]]#1 : !fir.ref<i32>)  {
 !CHECK:     omp.section {
-!CHECK:       %[[PRIVATE_ETA:.*]] = fir.alloca f32 {bindc_name = "eta", pinned, uniq_name = "_QFEeta"}
-!CHECK:       %[[PRIVATE_ETA_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_ETA]] {uniq_name = "_QFEeta"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-!CHECK:       %[[PRIVATE_DOUBLE_COUNT:.*]] = fir.alloca i32 {bindc_name = "double_count", pinned, uniq_name = "_QFEdouble_count"} 
-!CHECK:       %[[PRIVATE_DOUBLE_COUNT_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_DOUBLE_COUNT]] {uniq_name = "_QFEdouble_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:       %[[CONST5:.*]] = arith.constant 5 : i32
 !CHECK:       hlfir.assign %[[CONST5]] to %[[COUNT_DECL]]#0 : i32, !fir.ref<i32>
 !CHECK:       %[[TEMP_COUNT:.*]] = fir.load %[[COUNT_DECL]]#0 : !fir.ref<i32>
@@ -26,10 +26,6 @@
 !CHECK:       omp.terminator
 !CHECK:     }
 !CHECK:     omp.section {
-!CHECK:       %[[PRIVATE_ETA:.*]] = fir.alloca f32 {bindc_name = "eta", pinned, uniq_name = "_QFEeta"}
-!CHECK:       %[[PRIVATE_ETA_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_ETA]] {uniq_name = "_QFEeta"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-!CHECK:       %[[PRIVATE_DOUBLE_COUNT:.*]] = fir.alloca i32 {bindc_name = "double_count", pinned, uniq_name = "_QFEdouble_count"} 
-!CHECK:       %[[PRIVATE_DOUBLE_COUNT_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_DOUBLE_COUNT]] {uniq_name = "_QFEdouble_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:       %[[TEMP:.*]] = fir.load %[[PRIVATE_DOUBLE_COUNT_DECL]]#0 : !fir.ref<i32>
 !CHECK:       %[[CONST:.*]] = arith.constant 1 : i32
 !CHECK:       %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32
@@ -37,10 +33,6 @@
 !CHECK:       omp.terminator
 !CHECK:     }
 !CHECK:     omp.section {
-!CHECK:       %[[PRIVATE_ETA:.*]] = fir.alloca f32 {bindc_name = "eta", pinned, uniq_name = "_QFEeta"}
-!CHECK:       %[[PRIVATE_ETA_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_ETA]] {uniq_name = "_QFEeta"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-!CHECK:       %[[PRIVATE_DOUBLE_COUNT:.*]] = fir.alloca i32 {bindc_name = "double_count", pinned, uniq_name = "_QFEdouble_count"} 
-!CHECK:       %[[PRIVATE_DOUBLE_COUNT_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_DOUBLE_COUNT]] {uniq_name = "_QFEdouble_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:       %[[TEMP:.*]] = fir.load %[[PRIVATE_ETA_DECL]]#0 : !fir.ref<f32>
 !CHECK:       %[[CONST:.*]] = arith.constant 7.000000e+00 : f32
 !CHECK:       %[[RESULT:.*]] = arith.subf %[[TEMP]], %[[CONST]] {{.*}}: f32
@@ -87,13 +79,13 @@ program sample
 end program sample
 
 !CHECK: func @_QPfirstprivate(%[[ARG:.*]]: !fir.ref<f32> {fir.bindc_name = "alpha"}) {
-!CHECK:   %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]] {uniq_name = "_QFfirstprivateEalpha"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) 
+!CHECK:   %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivateEalpha"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) 
+!CHECK:   %[[PRIVATE_ALPHA:.*]] = fir.alloca f32 {bindc_name = "alpha", pinned, uniq_name = "_QFfirstprivateEalpha"}
+!CHECK:   %[[PRIVATE_ALPHA_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_ALPHA]] {uniq_name = "_QFfirstprivateEalpha"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:   %[[TEMP:.*]] = fir.load %[[ARG_DECL]]#0 : !fir.ref<f32>
+!CHECK:   hlfir.assign %[[TEMP]] to %[[PRIVATE_ALPHA_DECL]]#0 temporary_lhs : f32, !fir.ref<f32>
 !CHECK:   omp.sections {
 !CHECK:     omp.section  {
-!CHECK:         %[[PRIVATE_ALPHA:.*]] = fir.alloca f32 {bindc_name = "alpha", pinned, uniq_name = "_QFfirstprivateEalpha"}
-!CHECK:         %[[PRIVATE_ALPHA_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_ALPHA]] {uniq_name = "_QFfirstprivateEalpha"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-!CHECK:         %[[TEMP:.*]] = fir.load %[[ARG_DECL]]#0 : !fir.ref<f32>
-!CHECK:         hlfir.assign %[[TEMP]] to %[[PRIVATE_ALPHA_DECL]]#0 temporary_lhs : f32, !fir.ref<f32>
 !CHECK:       omp.terminator
 !CHECK:     }
 !CHECK:     omp.terminator
@@ -126,11 +118,11 @@ subroutine lastprivate()
         integer :: x
 !CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFlastprivateEx"}
 !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.sections   {
 	!$omp sections lastprivate(x)
 !CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[CONST10:.*]] = arith.constant 10 : i32
 !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
 !CHECK: %[[RESULT:.*]] = arith.muli %[[CONST10]], %[[TEMP]] : i32
@@ -141,17 +133,12 @@ subroutine lastprivate()
             x = x * 10
 
 !CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
 !CHECK: %[[CONST:.*]] = arith.constant 1 : i32
 !CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32
 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
-!CHECK: %[[TRUE:.*]] = arith.constant true
-!CHECK: fir.if %[[TRUE]] {
 !CHECK: %[[TEMP1:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP1]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
-!CHECK: }
 !CHECK: omp.terminator
 !CHECK: }
         !$omp section
@@ -160,14 +147,14 @@ subroutine lastprivate()
 !CHECK: }
     !$omp end sections
 
-!CHECK: omp.sections   {
-    !$omp sections firstprivate(x) lastprivate(x)
-!CHECK: omp.section {
 !CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
 !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
 !CHECK: omp.barrier
+!CHECK: omp.sections {
+    !$omp sections firstprivate(x) lastprivate(x)
+!CHECK: omp.section {
 !CHECK: %[[CONST:.*]] = arith.constant 10 : i32
 !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
 !CHECK: %[[RESULT:.*]] = arith.muli %[[CONST]], %[[TEMP]] : i32
@@ -177,20 +164,12 @@ subroutine lastprivate()
         !$omp section
             x = x * 10
 !CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
-!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
-!CHECK: omp.barrier
 !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
 !CHECK: %[[CONST:.*]] = arith.constant 1 : i32
 !CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32
 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
-!CHECK: %[[TRUE:.*]] = arith.constant true
-!CHECK: fir.if %[[TRUE]] {
 !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
-!CHECK: }
 !CHECK: omp.terminator
 !CHECK: }
         !$omp section
@@ -199,14 +178,14 @@ subroutine lastprivate()
 !CHECK: }
     !$omp end sections
 
-!CHECK: omp.sections nowait {
-    !$omp sections firstprivate(x) lastprivate(x)
-!CHECK: omp.section {
 !CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
 !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
 !CHECK: omp.barrier
+!CHECK: omp.sections nowait {
+    !$omp sections firstprivate(x) lastprivate(x)
+!CHECK: omp.section {
 !CHECK: %[[CONST:.*]] = arith.constant 10 : i32
 !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
 !CHECK: %[[RESULT:.*]] = arith.muli %[[CONST]], %[[TEMP]] : i32
@@ -216,33 +195,25 @@ subroutine lastprivate()
         !$omp section
             x = x * 10
 !CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
-!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
-!CHECK: omp.barrier
 !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
 !CHECK: %[[CONST:.*]] = arith.constant 1 : i32
 !CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32
 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
-!CHECK: %[[TRUE:.*]] = arith.constant true
-!CHECK: fir.if %[[TRUE]] {
 !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: }
 !CHECK: omp.terminator
 !CHECK: }
         !$omp section
             x = x + 1
 !CHECK: omp.terminator
 !CHECK: }
+!CHECK: omp.barrier
      !$omp end sections nowait
 
-!CHECK: omp.sections {
-!CHECK: omp.section {
 !CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
 !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.sections {
+!CHECK: omp.section {
 !CHECK: cf.br ^bb1
 !CHECK: ^bb1:  // pred: ^bb0
 !CHECK: %[[INNER_PRIVATE_X:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
@@ -257,7 +228,6 @@ subroutine lastprivate()
 !CHECK: }
 !CHECK: return
 !CHECK: }
-
     !$omp sections lastprivate(x)
         !$omp section
                 goto 30
@@ -265,14 +235,42 @@ subroutine lastprivate()
     !$omp end sections
 end subroutine
 
+!CHECK-LABEL: func @_QPlastprivate2
+!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFlastprivate2Ex"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFlastprivate2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFlastprivate2Ey"}
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFlastprivate2Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivate2Ex"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivate2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFlastprivate2Ey"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFlastprivate2Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.sections {
+!CHECK:   omp.section {
+!CHECK:     %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
+!CHECK:     hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK:     %[[TEMP2:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK:     hlfir.assign %[[TEMP2]] to %[[Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK:     omp.terminator
+!CHECK:   }
+!CHECK:   omp.terminator
+!CHECK: }
+subroutine lastprivate2()
+    integer :: x, y
+
+    !$omp sections lastprivate(x) lastprivate(y)
+        !$omp section
+          x = y + 1
+    !$omp end sections
+end subroutine
+
 !CHECK-LABEL: func @_QPunstructured_sections_privatization
 subroutine unstructured_sections_privatization()
 !CHECK: %[[X:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFunstructured_sections_privatizationEx"}
 !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFunstructured_sections_privatizationEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-!CHECK: omp.sections {
-!CHECK: omp.section {
 !CHECK: %[[PRIVATE_X:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFunstructured_sections_privatizationEx"}
 !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFunstructured_sections_privatizationEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK: omp.sections {
+!CHECK: omp.section {
 !CHECK: cf.br ^bb1
 !CHECK: ^bb1:  // pred: ^bb0
 !CHECK: %[[INNER_PRIVATE_X:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<f32>
@@ -288,12 +286,12 @@ subroutine unstructured_sections_privatization()
             goto 40
         40  x = x + 1
     !$omp end sections
-!CHECK: omp.sections {
-!CHECK: omp.section {
 !CHECK: %[[PRIVATE_X:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFunstructured_sections_privatizationEx"}
 !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFunstructured_sections_privatizationEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<f32>
 !CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : f32, !fir.ref<f32>
+!CHECK: omp.sections {
+!CHECK: omp.section {
 !CHECK: cf.br ^bb1
 !CHECK: ^bb1:
 !CHECK: %[[INNER_PRIVATE_X:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<f32>
diff --git a/flang/test/Lower/OpenMP/simd.f90 b/flang/test/Lower/OpenMP/simd.f90
index 8ec1a3cefb4a..223b248b7934 100644
--- a/flang/test/Lower/OpenMP/simd.f90
+++ b/flang/test/Lower/OpenMP/simd.f90
@@ -24,7 +24,7 @@ end subroutine
 
 !CHECK-LABEL: func @_QPsimd_with_if_clause
 subroutine simd_with_if_clause(n, threshold)
-  ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_if_clauseEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_if_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   integer :: i, n, threshold
   !$OMP SIMD IF( n .GE. threshold )
   ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
@@ -44,7 +44,7 @@ end subroutine
 
 !CHECK-LABEL: func @_QPsimd_with_simdlen_clause
 subroutine simd_with_simdlen_clause(n, threshold)
-  ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_simdlen_clauseEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   integer :: i, n, threshold
   !$OMP SIMD SIMDLEN(2)
   ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
@@ -63,7 +63,7 @@ end subroutine
 
 !CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_param
 subroutine simd_with_simdlen_clause_from_param(n, threshold)
-  ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_simdlen_clause_from_paramEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clause_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   integer :: i, n, threshold
   integer, parameter :: simdlen = 2;
   !$OMP SIMD SIMDLEN(simdlen)
@@ -83,7 +83,7 @@ end subroutine
 
 !CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_expr_from_param
 subroutine simd_with_simdlen_clause_from_expr_from_param(n, threshold)
-  ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_simdlen_clause_from_expr_from_paramEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clause_from_expr_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   integer :: i, n, threshold
   integer, parameter :: simdlen = 2;
   !$OMP SIMD SIMDLEN(simdlen*2 + 2)
@@ -103,7 +103,7 @@ end subroutine
 
 !CHECK-LABEL: func @_QPsimd_with_safelen_clause
 subroutine simd_with_safelen_clause(n, threshold)
-  ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_safelen_clauseEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_safelen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   integer :: i, n, threshold
   !$OMP SIMD SAFELEN(2)
   ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
@@ -122,7 +122,7 @@ end subroutine
 
 !CHECK-LABEL: func @_QPsimd_with_safelen_clause_from_expr_from_param
 subroutine simd_with_safelen_clause_from_expr_from_param(n, threshold)
-  ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_safelen_clause_from_expr_from_paramEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_safelen_clause_from_expr_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   integer :: i, n, threshold
   integer, parameter :: safelen = 2;
   !$OMP SIMD SAFELEN(safelen*2 + 2)
@@ -142,7 +142,7 @@ end subroutine
 
 !CHECK-LABEL: func @_QPsimd_with_simdlen_safelen_clause
 subroutine simd_with_simdlen_safelen_clause(n, threshold)
-  ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_simdlen_safelen_clauseEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_safelen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   integer :: i, n, threshold
   !$OMP SIMD SIMDLEN(1) SAFELEN(2)
   ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
diff --git a/flang/test/Lower/OpenMP/single.f90 b/flang/test/Lower/OpenMP/single.f90
index 10d537a0e18b..91f8a592909a 100644
--- a/flang/test/Lower/OpenMP/single.f90
+++ b/flang/test/Lower/OpenMP/single.f90
@@ -11,7 +11,7 @@
 !CHECK-SAME: (%[[X:.*]]: !fir.ref<i32> {fir.bindc_name = "x"})
 subroutine omp_single(x)
   integer, intent(inout) :: x
-  !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFomp_singleEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFomp_singleEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   !CHECK: omp.parallel
   !$omp parallel
   !CHECK: omp.single
@@ -34,7 +34,7 @@ end subroutine omp_single
 !CHECK-SAME: (%[[X:.*]]: !fir.ref<i32> {fir.bindc_name = "x"})
 subroutine omp_single_nowait(x)
   integer, intent(inout) :: x
-  !CHECK:   %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFomp_single_nowaitEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  !CHECK:   %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFomp_single_nowaitEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
   !CHECK: omp.parallel
   !$omp parallel
   !CHECK: omp.single nowait
@@ -76,8 +76,8 @@ end subroutine single_allocate
 ! CHECK-LABEL: func.func @_QPsingle_privatization(
 ! CHECK-SAME:                                     %[[X:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}, 
 ! CHECK-SAME:                                     %[[Y:.*]]: !fir.ref<f64> {fir.bindc_name = "y"}) {
-! CHECK:           %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFsingle_privatizationEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK:           %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFsingle_privatizationEy"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK:           %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsingle_privatizationEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsingle_privatizationEy"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>)
 ! CHECK:           omp.single   {
 ! CHECK:             %[[X_PVT:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFsingle_privatizationEx"}
 ! CHECK:             %[[X_PVT_DECL:.*]]:2 = hlfir.declare %[[X_PVT]] {uniq_name = "_QFsingle_privatizationEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
@@ -103,8 +103,8 @@ end subroutine
 ! CHECK-LABEL: func.func @_QPsingle_privatization2(
 ! CHECK-SAME:                                      %[[X:.*]]: !fir.ref<f32> {fir.bindc_name = "x"},
 ! CHECK-SAME:                                      %[[Y:.*]]: !fir.ref<f64> {fir.bindc_name = "y"}) {
-! CHECK:         %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFsingle_privatization2Ex"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK:         %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFsingle_privatization2Ey"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK:         %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsingle_privatization2Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:         %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsingle_privatization2Ey"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>)
 ! CHECK:         omp.parallel   {
 ! CHECK:           omp.single   {
 ! CHECK:             %[[X_PVT:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFsingle_privatization2Ex"}
diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90
index 44f77b5c3360..9bb855e44694 100644
--- a/flang/test/Lower/OpenMP/target.f90
+++ b/flang/test/Lower/OpenMP/target.f90
@@ -444,7 +444,7 @@ end subroutine omp_target_implicit_nested
 !CHECK: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
 subroutine omp_target_implicit_bounds(n)
    !CHECK: %[[VAL_COPY:.*]] = fir.alloca i32
-   !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFomp_target_implicit_boundsEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+   !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_target_implicit_boundsEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
    !CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32>
    !CHECK: fir.store %[[VAL_2]] to %[[VAL_COPY]] : !fir.ref<i32>
    !CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64
@@ -455,7 +455,7 @@ subroutine omp_target_implicit_bounds(n)
    !CHECK: %[[VAL_8:.*]] = fir.alloca !fir.array<?xi32>, %[[VAL_7]] {bindc_name = "a", uniq_name = "_QFomp_target_implicit_boundsEa"}
    !CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
    !CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]](%[[VAL_9]]) {uniq_name = "_QFomp_target_implicit_boundsEa"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>)
-   !CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %9#0, %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+   !CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %{{[0-9]+}}#0, %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
    !CHECK: %[[UB:.*]] = arith.subi %[[DIMS0]]#1, %c1{{.*}} : index
 
    integer :: n
@@ -532,7 +532,7 @@ end subroutine omp_target_device_ptr
    !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "a", uniq_name = "_QFomp_target_device_addrEa"}
    !CHECK: %[[VAL_0_DECL:.*]]:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
    !CHECK: %[[MAP_MEMBERS:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.ptr<i32>>>, i32) var_ptr_ptr({{.*}} : !fir.llvm_ptr<!fir.ref<i32>>) map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr<!fir.ref<i32>> {name = ""}
-   !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.box<!fir.ptr<i32>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBERS]] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.ptr<i32>>> {name = "a"}
+   !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.box<!fir.ptr<i32>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBERS]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.ptr<i32>>> {name = "a"}
    !CHECK: omp.target_data map_entries(%[[MAP_MEMBERS]], %[[MAP]] : {{.*}}) use_device_addr(%[[VAL_0_DECL]]#1 : !fir.ref<!fir.box<!fir.ptr<i32>>>) {
    !$omp target data map(tofrom: a) use_device_addr(a)
    !CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>):
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
index 6c9bc75b81d7..197800486c39 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
@@ -75,7 +75,7 @@ end program
 ! CHECK-SAME:                                  %[[VAL_0:.*]]: !fir.box<!fir.array<?xf64>> {fir.bindc_name = "r"}) attributes {{.*}} {
 ! CHECK:           %[[VAL_1:.*]] = fir.address_of(@_QFFreduceEi) : !fir.ref<i32>
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFFreduceEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = {{.*}}, uniq_name = "_QFFreduceEr"} : (!fir.box<!fir.array<?xf64>>) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = {{.*}}, uniq_name = "_QFFreduceEr"} : (!fir.box<!fir.array<?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>)
 ! CHECK:           omp.parallel {
 ! CHECK:             %[[VAL_4:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK:             %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFFreduceEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
index 40280c56dad6..df07a9065331 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
@@ -26,7 +26,7 @@
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iandEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
 ! CHECK:           omp.parallel {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90
index 986892d3584f..ae771c692b98 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90
@@ -20,7 +20,7 @@
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iandEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
 ! CHECK:           omp.parallel {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90
index ee33ce2f348d..50cec61b602b 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90
@@ -22,7 +22,7 @@
 !CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
 !CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"}
 !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_ieorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 
 
 !CHECK: omp.parallel
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90
index b362731b3371..d50f6b854f48 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90
@@ -13,7 +13,7 @@
 !CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
 !CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"}
 !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_ieorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 
 
 !CHECK: omp.parallel
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90
index 0052773bb5ad..d847bba89782 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90
@@ -24,7 +24,7 @@
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
 ! CHECK:           omp.parallel
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90
index f32be43b9b71..182f1eaeeeb7 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90
@@ -20,7 +20,7 @@
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
 ! CHECK:           omp.parallel
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90
index dfc018ed7c5a..69789e4c751e 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90
@@ -32,7 +32,7 @@
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -82,7 +82,7 @@ end subroutine simple_reduction
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -129,7 +129,7 @@ end subroutine
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
 ! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90
index c529bd4755b6..078a463919e9 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90
@@ -26,7 +26,7 @@
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -76,7 +76,7 @@ end subroutine simple_reduction
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -123,7 +123,7 @@ end subroutine
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
 ! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90
index a54795a4446f..54175994ecd8 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90
@@ -32,7 +32,7 @@
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -81,7 +81,7 @@ end subroutine
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -128,7 +128,7 @@ end subroutine
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
 ! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90
index 1021b5926b91..8204e88815f3 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90
@@ -26,7 +26,7 @@
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -75,7 +75,7 @@ end subroutine
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -122,7 +122,7 @@ end subroutine
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
 ! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90
index 854cb19ecd75..c0a82476c7b1 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90
@@ -32,7 +32,7 @@
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -82,7 +82,7 @@ end subroutine
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -131,7 +131,7 @@ end subroutine
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
 ! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90
index f5c84aaaf485..957de9b6741a 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90
@@ -26,7 +26,7 @@
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -76,7 +76,7 @@ end subroutine
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -125,7 +125,7 @@ end subroutine
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
 ! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90
index e268c6ff6cf5..0af9e0d5c9fd 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90
@@ -31,7 +31,7 @@
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -80,7 +80,7 @@ end subroutine
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -127,7 +127,7 @@ end subroutine
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
 ! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90
index 26dc0c327aad..d77566b109e5 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90
@@ -26,7 +26,7 @@
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -75,7 +75,7 @@ end subroutine
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_8:.*]] = arith.constant true
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
 ! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
@@ -122,7 +122,7 @@ end subroutine
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
 ! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
 ! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
 ! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90
index 95bdc98f18c2..11d039f9226c 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90
@@ -37,7 +37,7 @@
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
 ! CHECK:           omp.parallel {
@@ -68,7 +68,7 @@
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
 ! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32>
 ! CHECK:           omp.parallel {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90
index 352888bb94f5..a352cb195c25 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90
@@ -24,7 +24,7 @@
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
 ! CHECK:           omp.parallel {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir.f90
index f4caea5a269a..71631fb14592 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir.f90
@@ -20,7 +20,7 @@
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
 ! CHECK:           omp.parallel {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90
index ff005f32487e..d4e827f3b7e2 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90
@@ -31,7 +31,7 @@
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
 ! CHECK:           omp.parallel {
@@ -62,7 +62,7 @@
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
 ! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32>
 ! CHECK:           omp.parallel {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90
index 9787512ab078..d168b2a89295 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90
@@ -37,7 +37,7 @@
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
 ! CHECK:           omp.parallel {
@@ -68,7 +68,7 @@
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
 ! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32>
 ! CHECK:           omp.parallel {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90
index 801ef99480a2..80c056b5e8c5 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90
@@ -31,7 +31,7 @@
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
 ! CHECK:           omp.parallel {
@@ -62,7 +62,7 @@
 ! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"}
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK:           %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
 ! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32>
 ! CHECK:           omp.parallel {
diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90
index 10d7d957a257..e96945ef89e5 100644
--- a/flang/test/Lower/allocatable-polymorphic.f90
+++ b/flang/test/Lower/allocatable-polymorphic.f90
@@ -520,8 +520,8 @@ contains
 
 ! CHECK-LABEL: func.func @_QMpolyPtest_allocatable_up_from_up_mold(
 ! CHECK-SAME: %[[A:.*]]: !fir.ref<!fir.class<!fir.heap<none>>> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref<!fir.class<!fir.ptr<none>>> {fir.bindc_name = "b"}) {
-! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMpolyFtest_allocatable_up_from_up_moldEa"} : (!fir.ref<!fir.class<!fir.heap<none>>>) -> (!fir.ref<!fir.class<!fir.heap<none>>>, !fir.ref<!fir.class<!fir.heap<none>>>)
-! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMpolyFtest_allocatable_up_from_up_moldEb"} : (!fir.ref<!fir.class<!fir.ptr<none>>>) -> (!fir.ref<!fir.class<!fir.ptr<none>>>, !fir.ref<!fir.class<!fir.ptr<none>>>)
+! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMpolyFtest_allocatable_up_from_up_moldEa"} : (!fir.ref<!fir.class<!fir.heap<none>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<none>>>, !fir.ref<!fir.class<!fir.heap<none>>>)
+! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMpolyFtest_allocatable_up_from_up_moldEb"} : (!fir.ref<!fir.class<!fir.ptr<none>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<none>>>, !fir.ref<!fir.class<!fir.ptr<none>>>)
 ! CHECK: %[[LOAD_B:.*]] = fir.load %[[B_DECL]]#1 : !fir.ref<!fir.class<!fir.ptr<none>>>
 ! CHECK: %[[RANK:.*]] = arith.constant 0 : i32
 ! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A_DECL]]#1 : (!fir.ref<!fir.class<!fir.heap<none>>>) -> !fir.ref<!fir.box<none>>
@@ -539,7 +539,7 @@ contains
 ! CHECK-LABEL: func.func @_QMpolyPtest_allocatable_up_from_mold_rank(
 ! CHECK-SAME: %[[A:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>> {fir.bindc_name = "a"}) {
 ! CHECK: %[[VALUE_10:.*]] = fir.alloca i32
-! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMpolyFtest_allocatable_up_from_mold_rankEa"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>)
+! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMpolyFtest_allocatable_up_from_mold_rankEa"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>)
 ! CHECK: %[[C10:.*]] = arith.constant 10 : i32
 ! CHECK: fir.store %[[C10]] to %[[VALUE_10]] : !fir.ref<i32>
 ! CHECK: %[[EMBOX_10:.*]] = fir.embox %[[VALUE_10]] : (!fir.ref<i32>) -> !fir.box<i32>
diff --git a/flang/test/Lower/array-expression.f90 b/flang/test/Lower/array-expression.f90
index 75789cd6952a..bdfbe6dd3509 100644
--- a/flang/test/Lower/array-expression.f90
+++ b/flang/test/Lower/array-expression.f90
@@ -991,7 +991,7 @@ end subroutine test19f
 ! CHECK:         %[[VAL_24:.*]] = fir.array_load %[[VAL_5]](%[[VAL_22]]) {{\[}}%[[VAL_23]]] : (!fir.ref<!fir.array<140x!fir.char<2,13>>>, !fir.shape<1>, !fir.slice<1>) -> !fir.array<140x!fir.char<2,13>>
 ! CHECK:         %[[VAL_25:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
 ! CHECK:         %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i32) -> i64
-! CHECK:         %[[char_temp:.*]] = fir.alloca !fir.char<4,?>(%16 : i64) {bindc_name = ".chrtmp"}
+! CHECK:         %[[char_temp:.*]] = fir.alloca !fir.char<4,?>(%{{[0-9]+}} : i64) {bindc_name = ".chrtmp"}
 ! CHECK:         %[[VAL_27:.*]] = arith.constant 1 : index
 ! CHECK:         %[[VAL_28:.*]] = arith.constant 0 : index
 ! CHECK:         %[[VAL_29:.*]] = arith.subi %[[VAL_13]], %[[VAL_27]] : index
diff --git a/flang/test/Lower/branching-directive.f90 b/flang/test/Lower/branching-directive.f90
new file mode 100644
index 000000000000..a0a147f1053a
--- /dev/null
+++ b/flang/test/Lower/branching-directive.f90
@@ -0,0 +1,25 @@
+!RUN: flang-new -fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s
+
+!https://github.com/llvm/llvm-project/issues/91526
+
+!CHECK:   cf.cond_br %{{[0-9]+}}, ^bb[[THEN:[0-9]+]], ^bb[[ELSE:[0-9]+]]
+!CHECK: ^bb[[THEN]]:
+!CHECK:   cf.br ^bb[[EXIT:[0-9]+]]
+!CHECK: ^bb[[ELSE]]:
+!CHECK:   fir.call @_FortranAStopStatement
+!CHECK:   fir.unreachable
+!CHECK: ^bb[[EXIT]]:
+
+subroutine simple(y)
+  implicit none
+  logical, intent(in) :: y
+  integer :: i
+  if (y) then
+!$omp parallel
+    i = 1
+!$omp end parallel
+  else
+    stop 1
+  end if
+end subroutine simple
+
diff --git a/flang/test/Lower/character-substrings.f90 b/flang/test/Lower/character-substrings.f90
index 8e1a91a247d4..874f2944cff1 100644
--- a/flang/test/Lower/character-substrings.f90
+++ b/flang/test/Lower/character-substrings.f90
@@ -231,7 +231,7 @@ end subroutine array_substring_assignment
 ! CHECK:         %[[c0:.*]] = arith.constant 0 : index
 ! CHECK:         %[[sub:.*]] = arith.subi %[[VAL_1]], %[[VAL_4]] : index
 ! CHECK:         %[[add:.*]] = arith.addi %[[sub]], %[[VAL_4]] : index
-! CHECK:         %[[div:.*]] = arith.divsi %4, %[[VAL_4]] : index
+! CHECK:         %[[div:.*]] = arith.divsi %{{[0-9]+}}, %[[VAL_4]] : index
 ! CHECK:         %[[cmp:.*]] = arith.cmpi sgt, %[[div]], %[[c0]] : index
 ! CHECK:         %[[select:.*]] = arith.select %[[cmp]], %[[div]], %[[c0]] : index
 ! CHECK:         %[[VAL_6:.*]] = fir.array_load %[[VAL_0]](%[[VAL_3]]) {{\[}}%[[VAL_5]]] : (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment2Tt{ch:!fir.char<1,7>}>>>, !fir.shape<1>, !fir.slice<1>) -> !fir.array<8x!fir.char<1,7>>
@@ -323,7 +323,7 @@ end subroutine array_substring_assignment2
 ! CHECK:         %[[c0:.*]] = arith.constant 0 : index
 ! CHECK:         %[[sub:.*]] = arith.subi %[[VAL_2]], %[[VAL_6]] : index
 ! CHECK:         %[[add:.*]] = arith.addi %[[sub]], %[[VAL_6]] : index
-! CHECK:         %[[div:.*]] = arith.divsi %4, %[[VAL_6]] : index
+! CHECK:         %[[div:.*]] = arith.divsi %[[add]], %[[VAL_6]] : index
 ! CHECK:         %[[cmp:.*]] = arith.cmpi sgt, %[[div]], %[[c0]] : index
 ! CHECK:         %[[select:.*]] = arith.select %[[cmp]], %[[div]], %[[c0]] : index
 ! CHECK:         %[[VAL_8:.*]] = fir.array_load %[[VAL_0]](%[[VAL_5]]) {{\[}}%[[VAL_7]]] : (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.shape<1>, !fir.slice<1>) -> !fir.array<8x!fir.char<1,7>>
diff --git a/flang/test/Lower/charconvert.f90 b/flang/test/Lower/charconvert.f90
index c8ec254b6a54..e3f7f66b8476 100644
--- a/flang/test/Lower/charconvert.f90
+++ b/flang/test/Lower/charconvert.f90
@@ -14,17 +14,17 @@ end subroutine
                                                 
 ! CHECK: func.func @_QPtest_c1_to_c4(%[[ARG0:.*]]: !fir.boxchar<4> {fir.bindc_name = "c4"}, %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "c1"}) {
 ! CHECK:   %[[VAL_0:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:   %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 {uniq_name = "_QFtest_c1_to_c4Ec1"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:   %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c1_to_c4Ec1"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:   %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index)
-! CHECK:   %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 {uniq_name = "_QFtest_c1_to_c4Ec4"} : (!fir.ref<!fir.char<4,?>>, index) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>)
+! CHECK:   %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c1_to_c4Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>)
 ! CHECK:   %[[VAL_4:.*]] = fir.alloca !fir.char<4,?>(%[[VAL_0]]#1 : index)
 ! CHECK:   fir.char_convert %[[VAL_1]]#1 for %[[VAL_0]]#1 to %[[VAL_4:.*]] : !fir.ref<!fir.char<1,?>>, index, !fir.ref<!fir.char<4,?>>
 
 ! CHECK: func.func @_QPtest_c4_to_c1(%[[ARG0:.*]]: !fir.boxchar<4> {fir.bindc_name = "c4"}, %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "c1"}) {
 ! CHECK:   %[[VAL_0:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK:   %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 {uniq_name = "_QFtest_c4_to_c1Ec1"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK:   %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c4_to_c1Ec1"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK:   %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index)
-! CHECK:   %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 {uniq_name = "_QFtest_c4_to_c1Ec4"} : (!fir.ref<!fir.char<4,?>>, index) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>)
+! CHECK:   %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c4_to_c1Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>)
 ! CHECK:   %[[C4:.*]] = arith.constant 4 : index
 ! CHECK:   %[[VAL_4:.*]] = arith.muli %[[VAL_2]]#1, %[[C4]] : index
 ! CHECK:   %[[VAL_5:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_4]] : index)
diff --git a/flang/test/Lower/dispatch.f90 b/flang/test/Lower/dispatch.f90
index 60364076e633..02338065548d 100644
--- a/flang/test/Lower/dispatch.f90
+++ b/flang/test/Lower/dispatch.f90
@@ -151,7 +151,7 @@ module call_dispatch
 
 ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch(
 ! CHECK-SAME:  %[[P:.*]]: !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> {fir.bindc_name = "p"}) {
-! CHECK:       %[[P_DECL:.*]]:2 = hlfir.declare %[[P]] {uniq_name = "_QMcall_dispatchFcheck_dispatchEp"} : (!fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) -> (!fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>, !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>)
+! CHECK:       %[[P_DECL:.*]]:2 = hlfir.declare %[[P]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatchEp"} : (!fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>, !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>)
 ! CHECK:       fir.dispatch "tbp_nopass"(%[[P_DECL]]#1 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>){{$}}
 ! CHECK:       fir.dispatch "tbp_pass"(%[[P_DECL]]#0 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[P_DECL]]#0 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32}
 ! CHECK:       fir.dispatch "tbp_pass_arg0"(%[[P_DECL]]#0 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[P_DECL]]#0 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32}
@@ -176,8 +176,8 @@ module call_dispatch
 ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_deferred(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>> {fir.bindc_name = "a"}, 
 ! CHECK-SAME: %[[ARG1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
-! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QMcall_dispatchFcheck_dispatch_deferredEa"} : (!fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>) -> (!fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>, !fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>)
-! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QMcall_dispatchFcheck_dispatch_deferredEx"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_deferredEa"} : (!fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>, !fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>)
+! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_deferredEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK: fir.dispatch "nopassd"(%[[ARG0_DECL]]#1 : !fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>) (%[[ARG1_DECL]]#0 : !fir.box<!fir.array<?xf32>>)
 
     subroutine check_dispatch_scalar_allocatable(p)
@@ -187,7 +187,7 @@ module call_dispatch
 
 ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_scalar_allocatable(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>> {fir.bindc_name = "p"}) {
-! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_scalar_allocatableEp"} : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>)
+! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_scalar_allocatableEp"} : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>)
 ! CHECK: %[[LOAD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>
 ! CHECK: %[[REBOX:.*]] = fir.rebox %[[LOAD]] : (!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>
 ! CHECK: fir.dispatch "tbp_pass"(%[[REBOX]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[REBOX]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32}
@@ -199,7 +199,7 @@ module call_dispatch
 
 ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_scalar_pointer(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>> {fir.bindc_name = "p"}) {
-! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_scalar_pointerEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>) -> (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>)
+! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_scalar_pointerEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>)
 ! CHECK: %[[LOAD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>
 ! CHECK: %[[REBOX:.*]] = fir.rebox %[[LOAD]] : (!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>
 ! CHECK: fir.dispatch "tbp_pass"(%[[REBOX]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[REBOX]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32}
@@ -220,8 +220,8 @@ module call_dispatch
 ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_static_array(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "p"}, 
 ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "t"}) {
-! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QMcall_dispatchFcheck_dispatch_static_arrayEp"} : (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -> (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>)
-! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) {uniq_name = "_QMcall_dispatchFcheck_dispatch_static_arrayEt"} : (!fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>)
+! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_static_arrayEp"} : (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>)
+! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_static_arrayEt"} : (!fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>)
 ! CHECK: fir.do_loop {{.*}} {
 ! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[ARG0_DECL]]#0 (%{{.*}})  : (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, i64) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>
 ! CHECK: fir.dispatch "tbp_pass"(%[[DESIGNATE]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[DESIGNATE]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32}
@@ -248,8 +248,8 @@ module call_dispatch
 ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_dynamic_array(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "p"}, 
 ! CHECK-SAME: %[[ARG1:.*]]: !fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "t"}) {
-! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_arrayEp"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>)
-! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_arrayEt"} : (!fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -> (!fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>)
+! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_arrayEp"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>)
+! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_arrayEt"} : (!fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>)
 ! CHECK: %{{.*}} = fir.do_loop {{.*}} {
 ! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[ARG0_DECL]]#0 (%{{.*}})  : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, i64) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>
 ! CHECK: fir.dispatch "tbp_pass"(%[[DESIGNATE]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[DESIGNATE]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32}
@@ -276,8 +276,8 @@ module call_dispatch
 ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_allocatable_array(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> {fir.bindc_name = "p"}, 
 ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> {fir.bindc_name = "t"}) {
-! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_allocatable_arrayEp"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>)
-! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_allocatable_arrayEt"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>)
+! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_allocatable_arrayEp"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>)
+! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_allocatable_arrayEt"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>)
 ! CHECK: %{{.*}} = fir.do_loop {{.*}} {
 ! CHECK: %[[LOAD_ARG0:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>
 ! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[LOAD_ARG0]] (%{{.*}})  : (!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, i64) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>
@@ -306,8 +306,8 @@ module call_dispatch
 ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_pointer_array(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> {fir.bindc_name = "p"}, 
 ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> {fir.bindc_name = "t"}) {
-! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_pointer_arrayEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>)
-! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_pointer_arrayEt"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>)
+! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_pointer_arrayEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>)
+! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_pointer_arrayEt"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>)
 
 ! CHECK: %{{.*}} = fir.do_loop {{.*}} {
 ! CHECK: %[[LOAD_ARG0:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>
@@ -334,8 +334,8 @@ module call_dispatch
 ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_dynamic_array_copy(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "p"}, 
 ! CHECK-SAME: %[[ARG1:.*]]: !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "o"}) {
-! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_array_copyEo"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>)
-! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_array_copyEp"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>)
+! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_array_copyEo"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>)
+! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_array_copyEp"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>)
 
 ! CHECK: %{{.*}} = fir.do_loop {{.*}} {
 ! CHECK: %[[DESIGNATE0:.*]] = hlfir.designate %[[ARG0_DECL]]#0 (%{{.*}})  : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, i64) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>
diff --git a/flang/test/Lower/do_loop.f90 b/flang/test/Lower/do_loop.f90
index 4ace17342ade..d9c83658ee25 100644
--- a/flang/test/Lower/do_loop.f90
+++ b/flang/test/Lower/do_loop.f90
@@ -132,6 +132,7 @@ end subroutine
 ! CHECK-SAME: (%[[S_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "s"}, %[[E_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "e"}, %[[ST_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "st"}) {
 subroutine loop_with_variable_step(s,e,st)
   integer :: s, e, st
+  ! CHECK: %[[I_REF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFloop_with_variable_stepEi"}
   ! CHECK: %[[S:.*]] = fir.load %[[S_REF]] : !fir.ref<i32>
   ! CHECK: %[[S_CVT:.*]] = fir.convert %[[S]] : (i32) -> index
   ! CHECK: %[[E:.*]] = fir.load %[[E_REF]] : !fir.ref<i32>
diff --git a/flang/test/Lower/if-loc.f90 b/flang/test/Lower/if-loc.f90
new file mode 100644
index 000000000000..88d6609799a5
--- /dev/null
+++ b/flang/test/Lower/if-loc.f90
@@ -0,0 +1,34 @@
+! RUN: bbc -emit-hlfir -mlir-print-debuginfo -o - %s | FileCheck %s
+
+  integer :: n = 0, x = 1
+  if (x .ne. 1) goto 9
+  n = n + 1
+  if (x .gt. 1) goto 9
+  n = n + 1
+9 print *, 'n =', n
+end
+
+! CHECK-LABEL: c.func @_QQmain
+  ! CHECK:  %[[V_0:[0-9]+]] = fir.address_of(@_QFEn) : !fir.ref<i32> loc("{{.*}}if-loc.f90":3:
+  ! CHECK:  %[[V_1:[0-9]+]]:2 = hlfir.declare %[[V_0]] {uniq_name = "_QFEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) loc("{{.*}}if-loc.f90":3:
+  ! CHECK:  %[[V_2:[0-9]+]] = fir.address_of(@_QFEx) : !fir.ref<i32> loc("{{.*}}if-loc.f90":3:
+  ! CHECK:  %[[V_3:[0-9]+]]:2 = hlfir.declare %[[V_2]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) loc("{{.*}}if-loc.f90":3:
+  ! CHECK:  %[[V_4:[0-9]+]] = fir.load %[[V_3]]#0 : !fir.ref<i32> loc("{{.*}}if-loc.f90":4:
+  ! CHECK:  %[[V_5:[0-9]+]] = arith.cmpi ne, %[[V_4]], %c1{{.*}} : i32 loc("{{.*}}if-loc.f90":4:
+  ! CHECK:  %[[V_6:[0-9]+]] = arith.xori %[[V_5]], %true{{[_0-9]*}} : i1 loc("{{.*}}if-loc.f90":4:
+  ! CHECK:  fir.if %[[V_6]] {
+  ! CHECK:    %[[V_18:[0-9]+]] = fir.load %[[V_1]]#0 : !fir.ref<i32> loc("{{.*}}if-loc.f90":5:
+  ! CHECK:    %[[V_19:[0-9]+]] = arith.addi %[[V_18]], %c1{{.*}} : i32 loc("{{.*}}if-loc.f90":5:
+  ! CHECK:    hlfir.assign %[[V_19]] to %[[V_1]]#0 : i32, !fir.ref<i32> loc("{{.*}}if-loc.f90":5:
+  ! CHECK:    %[[V_20:[0-9]+]] = fir.load %[[V_3]]#0 : !fir.ref<i32> loc("{{.*}}if-loc.f90":6:
+  ! CHECK:    %[[V_21:[0-9]+]] = arith.cmpi sgt, %[[V_20]], %c1{{.*}} : i32 loc("{{.*}}if-loc.f90":6:
+  ! CHECK:    %[[V_22:[0-9]+]] = arith.xori %[[V_21]], %true{{[_0-9]*}} : i1 loc("{{.*}}if-loc.f90":6:
+  ! CHECK:    fir.if %[[V_22]] {
+  ! CHECK:      %[[V_23:[0-9]+]] = fir.load %[[V_1]]#0 : !fir.ref<i32> loc("{{.*}}if-loc.f90":7:
+  ! CHECK:      %[[V_24:[0-9]+]] = arith.addi %[[V_23]], %c1{{.*}} : i32 loc("{{.*}}if-loc.f90":7:
+  ! CHECK:      hlfir.assign %[[V_24]] to %[[V_1]]#0 : i32, !fir.ref<i32> loc("{{.*}}if-loc.f90":7:
+  ! CHECK:    }
+  ! CHECK:  }
+  ! CHECK:  %[[V_9:[0-9]+]] = fir.call @_FortranAioBeginExternalListOutput{{.*}} loc("{{.*}}if-loc.f90":8:
+  ! CHECK:  return loc("{{.*}}if-loc.f90":9:
+  ! CHECK:}
diff --git a/flang/test/Lower/pointer-references.f90 b/flang/test/Lower/pointer-references.f90
index ace64f9ec7ef..02394e7ec76b 100644
--- a/flang/test/Lower/pointer-references.f90
+++ b/flang/test/Lower/pointer-references.f90
@@ -34,7 +34,7 @@ subroutine char_ptr(p)
   ! CHECK: %[[count:.*]] = arith.muli %[[one]], %[[size]] : i64
   ! CHECK: %[[dst:.*]] = fir.convert %[[addr]] : (!fir.ptr<!fir.char<1,12>>) -> !fir.ref<i8>
   ! CHECK: %[[src:.*]] = fir.convert %[[str]] : (!fir.ref<!fir.char<1,12>>) -> !fir.ref<i8>
-  ! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[dst]], %[[src]], %5, %false) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
+  ! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[dst]], %[[src]], %{{[0-9]+}}, %false) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
   p = "hello world!"
 
   ! CHECK: %[[boxload2:.*]] = fir.load %[[arg0]]
diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90
index 70c1f768e389..14ec8a06a964 100644
--- a/flang/test/Lower/polymorphic.f90
+++ b/flang/test/Lower/polymorphic.f90
@@ -298,7 +298,7 @@ module polymorphic_test
 ! CHECK: %[[ZERO:.*]] = fir.zero_bits !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>
 ! CHECK: fir.store %[[ZERO]] to %[[PTR]] : !fir.ref<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>
 ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[ARG0]] : (!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>) -> !fir.ref<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>
-! CHECK: %[[CONVERT:.*]] = fir.convert %3 : (!fir.ref<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>) -> !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>
+! CHECK: %[[CONVERT:.*]] = fir.convert %{{[0-9]+}} : (!fir.ref<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>) -> !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>
 ! CHECK: fir.store %[[CONVERT]] to %[[PTR]] : !fir.ref<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>
 
   subroutine nullify_pointer_array(a)
diff --git a/flang/test/Lower/select-type.f90 b/flang/test/Lower/select-type.f90
index 3243a813e9d5..e4ff2fef0efd 100644
--- a/flang/test/Lower/select-type.f90
+++ b/flang/test/Lower/select-type.f90
@@ -498,7 +498,7 @@ contains
 ! CHECK:  fir.array_merge_store %[[ARRAY_LOAD]], %[[LOOP_RES]] to %[[BOX]] : !fir.array<?xf32>, !fir.array<?xf32>, !fir.box<!fir.array<?xf32>>
 ! CHECK:  cf.br ^{{.*}}
 ! CHECK: ^bb{{.*}}:
-! CHECK:  %[[BOX:.*]] = fir.convert %0 : (!fir.class<!fir.array<?xnone>>) -> !fir.box<!fir.array<?x!fir.char<1,?>>> 
+! CHECK:  %[[BOX:.*]] = fir.convert %{{[0-9]+}} : (!fir.class<!fir.array<?xnone>>) -> !fir.box<!fir.array<?x!fir.char<1,?>>> 
 ! CHECK:  cf.br ^bb{{.*}}
 ! CHECK: ^bb{{.*}}:
 ! CHECK:  %[[EXACT_BOX:.*]] = fir.convert %[[SELECTOR]] : (!fir.class<!fir.array<?xnone>>) -> !fir.box<!fir.array<?x!fir.type<_QMselect_type_lower_testTp1{a:i32,b:i32}>>>
diff --git a/flang/test/Lower/structure-constructors-alloc-comp.f90 b/flang/test/Lower/structure-constructors-alloc-comp.f90
index 5b56463303ba..5b1bca317c94 100644
--- a/flang/test/Lower/structure-constructors-alloc-comp.f90
+++ b/flang/test/Lower/structure-constructors-alloc-comp.f90
@@ -24,7 +24,7 @@ contains
 ! HLFIR-LABEL:  func.func @_QMm_struct_ctorPtest_alloc1(
 ! HLFIR-SAME:      %[[ARG_0:.*]]: !fir.ref<f32> {fir.bindc_name = "y"}) {
 ! HLFIR:    %[[VAL_0:.*]] = fir.alloca !fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>
-! HLFIR:    %[[VAL_12:.*]]:2 = hlfir.declare %[[ARG_0]] {uniq_name = "_QMm_struct_ctorFtest_alloc1Ey"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! HLFIR:    %[[VAL_12:.*]]:2 = hlfir.declare %[[ARG_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc1Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! HLFIR:    %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>)
 ! HLFIR:    %[[VAL_14:.*]] = fir.embox %[[VAL_13]]#0 : (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> !fir.box<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>
 ! HLFIR:    %[[VAL_15:.*]] = fir.address_of(@_QQ{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
@@ -49,8 +49,8 @@ contains
 ! HLFIR:    %[[VAL_0:.*]] = fir.alloca !fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>
 ! HLFIR:    %[[CONS_6:.*]] = arith.constant 5 : index
 ! HLFIR:    %[[VAL_12:.*]] = fir.shape %[[CONS_6]] : (index) -> !fir.shape<1>
-! HLFIR:    %[[VAL_13:.*]]:2 = hlfir.declare %[[ARG_1]](%[[VAL_12]]) {uniq_name = "_QMm_struct_ctorFtest_alloc2Eb"} : (!fir.ref<!fir.array<5xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>)
-! HLFIR:    %[[VAL_14:.*]]:2 = hlfir.declare %[[ARG_0]] {uniq_name = "_QMm_struct_ctorFtest_alloc2Ey"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! HLFIR:    %[[VAL_13:.*]]:2 = hlfir.declare %[[ARG_1]](%[[VAL_12]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc2Eb"} : (!fir.ref<!fir.array<5xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>)
+! HLFIR:    %[[VAL_14:.*]]:2 = hlfir.declare %[[ARG_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc2Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! HLFIR:    %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>)
 ! HLFIR:    %[[VAL_16:.*]] = fir.embox %[[VAL_15]]#0 : (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> !fir.box<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>
 ! HLFIR:    %[[VAL_17:.*]] = fir.address_of(@_QQ{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
diff --git a/flang/test/Parser/OpenMP/fail-construct1.f90 b/flang/test/Parser/OpenMP/fail-construct1.f90
new file mode 100644
index 000000000000..f0ee22125cee
--- /dev/null
+++ b/flang/test/Parser/OpenMP/fail-construct1.f90
@@ -0,0 +1,5 @@
+! RUN: not %flang_fc1 -fsyntax-only -fopenmp %s 2>&1 | FileCheck %s
+
+! CHECK: error: expected OpenMP construct
+!$omp  parallel
+end
diff --git a/flang/test/Parser/OpenMP/fail-construct2.f90 b/flang/test/Parser/OpenMP/fail-construct2.f90
new file mode 100644
index 000000000000..b7f5736d1329
--- /dev/null
+++ b/flang/test/Parser/OpenMP/fail-construct2.f90
@@ -0,0 +1,5 @@
+! RUN: not %flang_fc1 -fsyntax-only -fopenmp %s 2>&1 | FileCheck %s
+
+! CHECK: error: expected OpenMP construct
+!$omp dummy
+end
diff --git a/flang/test/Semantics/OpenMP/do08.f90 b/flang/test/Semantics/OpenMP/do08.f90
index 3ba63072a80b..5143dff0dd31 100644
--- a/flang/test/Semantics/OpenMP/do08.f90
+++ b/flang/test/Semantics/OpenMP/do08.f90
@@ -4,6 +4,8 @@
 
 program omp
   integer i, j, k
+  logical cond(10,10,10)
+  cond = .false.
 
   !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
   !$omp do  collapse(3)
@@ -135,4 +137,33 @@ program omp
   end do foo
   !$omp end do
 
+  !$omp do collapse(3)
+  loopk: do k=1,10
+    loopj: do j=1,10
+      loopi: do i=1,10
+        ifi : if (.true.) then
+          !ERROR: EXIT statement terminates associated loop of an OpenMP DO construct
+          if (cond(i,j,k)) exit
+          if (cond(i,j,k)) exit ifi
+          !ERROR: EXIT statement terminates associated loop of an OpenMP DO construct
+          if (cond(i,j,k)) exit loopi
+          !ERROR: EXIT statement terminates associated loop of an OpenMP DO construct
+          if (cond(i,j,k)) exit loopj
+        end if ifi
+      end do loopi
+    end do loopj
+  end do loopk
+  !$omp end do
+
+  !$omp do collapse(2)
+  loopk: do k=1,10
+    loopj: do j=1,10
+      do i=1,10
+      end do
+      !ERROR: EXIT statement terminates associated loop of an OpenMP DO construct
+      if (cond(i,j,k)) exit
+    end do loopj
+  end do loopk
+  !$omp end do
+
 end program omp
diff --git a/flang/test/Semantics/OpenMP/implicit-dsa.f90 b/flang/test/Semantics/OpenMP/implicit-dsa.f90
new file mode 100644
index 000000000000..92d2421d06f9
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/implicit-dsa.f90
@@ -0,0 +1,158 @@
+! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
+
+! Test symbols generated in block constructs that have implicitly
+! determined DSAs.
+
+! Basic cases.
+!DEF: /implicit_dsa_test1 (Subroutine) Subprogram
+subroutine implicit_dsa_test1
+  !DEF: /implicit_dsa_test1/i ObjectEntity INTEGER(4)
+  !DEF: /implicit_dsa_test1/x ObjectEntity INTEGER(4)
+  !DEF: /implicit_dsa_test1/y ObjectEntity INTEGER(4)
+  !DEF: /implicit_dsa_test1/z ObjectEntity INTEGER(4)
+  integer i, x, y, z
+
+  !$omp task private(y) shared(z)
+    !DEF: /implicit_dsa_test1/OtherConstruct1/x (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
+    !DEF: /implicit_dsa_test1/OtherConstruct1/y (OmpPrivate) HostAssoc INTEGER(4)
+    !REF: /implicit_dsa_test1/z
+    x = y + z
+  !$omp end task
+
+  !$omp task default(shared)
+    !REF: /implicit_dsa_test1/x
+    !REF: /implicit_dsa_test1/y
+    !REF: /implicit_dsa_test1/z
+    x = y + z
+  !$omp end task
+
+  !$omp taskloop
+    !DEF: /implicit_dsa_test1/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
+    do i = 0, 10
+      !DEF: /implicit_dsa_test1/OtherConstruct3/x (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
+      !DEF: /implicit_dsa_test1/OtherConstruct3/y (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
+      !REF: /implicit_dsa_test1/OtherConstruct3/i
+      x = y + i
+    end do
+  !$omp end taskloop
+end subroutine
+
+! Nested task with implicit firstprivate DSA variable.
+!DEF: /implicit_dsa_test2 (Subroutine) Subprogram
+subroutine implicit_dsa_test2
+  !DEF: /implicit_dsa_test2/x ObjectEntity INTEGER(4)
+  integer x
+
+  !$omp task
+    !$omp task
+      !DEF: /implicit_dsa_test2/OtherConstruct1/OtherConstruct1/x (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
+      x = 1
+    !$omp end task
+  !$omp end task
+end subroutine
+
+! Nested tasks with implicit shared DSA variables.
+!DEF: /implicit_dsa_test3 (Subroutine) Subprogram
+subroutine implicit_dsa_test3
+  !DEF: /implicit_dsa_test3/x ObjectEntity INTEGER(4)
+  !DEF: /implicit_dsa_test3/y ObjectEntity INTEGER(4)
+  !DEF: /implicit_dsa_test3/z ObjectEntity INTEGER(4)
+  integer x, y, z
+
+  !$omp parallel
+    !$omp task
+      !REF: /implicit_dsa_test3/x
+      x = 1
+      !REF: /implicit_dsa_test3/y
+      y = 1
+    !$omp end task
+
+    !$omp task firstprivate(x)
+      !DEF: /implicit_dsa_test3/OtherConstruct1/OtherConstruct2/x (OmpFirstPrivate) HostAssoc INTEGER(4)
+      x = 1
+      !REF: /implicit_dsa_test3/z
+      z = 1
+    !$omp end task
+  !$omp end parallel
+end subroutine
+
+! Task with implicit firstprivate DSA variables, enclosed in private context.
+!DEF: /implicit_dsa_test4 (Subroutine) Subprogram
+subroutine implicit_dsa_test4
+  !DEF: /implicit_dsa_test4/x ObjectEntity INTEGER(4)
+  !DEF: /implicit_dsa_test4/y ObjectEntity INTEGER(4)
+  !DEF: /implicit_dsa_test4/z ObjectEntity INTEGER(4)
+  integer x, y, z
+
+  !$omp parallel default(private)
+    !$omp task
+      !DEF: /implicit_dsa_test4/OtherConstruct1/OtherConstruct1/x (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
+      x = 0
+      !DEF: /implicit_dsa_test4/OtherConstruct1/OtherConstruct1/z (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
+      z = 1
+    !$omp end task
+
+    !$omp task
+      !DEF: /implicit_dsa_test4/OtherConstruct1/OtherConstruct2/x (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
+      x = 1
+      !DEF: /implicit_dsa_test4/OtherConstruct1/OtherConstruct2/y (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
+      y = 0
+    !$omp end task
+  !$omp end parallel
+end subroutine
+
+! Inner parallel using implicit firstprivate symbol.
+!DEF: /implicit_dsa_test5 (Subroutine) Subprogram
+subroutine implicit_dsa_test5
+  !DEF: /implicit_dsa_test5/x ObjectEntity INTEGER(4)
+  integer x
+
+  !$omp parallel default(private)
+    !$omp task
+      !$omp parallel
+        !DEF: /implicit_dsa_test5/OtherConstruct1/OtherConstruct1/OtherConstruct1/x HostAssoc INTEGER(4)
+        x = 1
+      !$omp end parallel
+    !$omp end task
+  !$omp end parallel
+end subroutine
+
+! Constructs nested inside a task with implicit DSA variables.
+!DEF: /implicit_dsa_test6 (Subroutine) Subprogram
+subroutine implicit_dsa_test6
+  !DEF: /implicit_dsa_test6/x ObjectEntity INTEGER(4)
+  !DEF: /implicit_dsa_test6/y ObjectEntity INTEGER(4)
+  !DEF: /implicit_dsa_test6/z ObjectEntity INTEGER(4)
+  integer x, y, z
+
+  !$omp task
+    !$omp parallel default(private)
+      !DEF: /implicit_dsa_test6/OtherConstruct1/OtherConstruct1/x (OmpPrivate) HostAssoc INTEGER(4)
+      !DEF: /implicit_dsa_test6/OtherConstruct1/OtherConstruct1/y (OmpPrivate) HostAssoc INTEGER(4)
+      x = y
+    !$omp end parallel
+
+    !$omp parallel default(firstprivate) shared(y)
+      !DEF: /implicit_dsa_test6/OtherConstruct1/OtherConstruct2/y HostAssoc INTEGER(4)
+      !DEF: /implicit_dsa_test6/OtherConstruct1/OtherConstruct2/x (OmpFirstPrivate) HostAssocINTEGER(4)
+      !DEF: /implicit_dsa_test6/OtherConstruct1/OtherConstruct2/z (OmpFirstPrivate) HostAssocINTEGER(4)
+      y = x + z
+    !$omp end parallel
+  !$omp end task
+end subroutine
+
+! Test taskgroup - it uses the same scope as task.
+!DEF: /implicit_dsa_test7 (Subroutine) Subprogram
+subroutine implicit_dsa_test7
+  !DEF: /implicit_dsa_test7/x ObjectEntity INTEGER(4)
+  !DEF: /implicit_dsa_test7/y ObjectEntity INTEGER(4)
+  integer x, y
+
+  !$omp task
+    !$omp taskgroup
+      !DEF: /implicit_dsa_test7/OtherConstruct1/x (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
+      !DEF: /implicit_dsa_test7/OtherConstruct1/y (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
+      x = y
+    !$omp end taskgroup
+  !$omp end task
+end subroutine
diff --git a/flang/test/Semantics/OpenMP/map-clause.f90 b/flang/test/Semantics/OpenMP/map-clause.f90
index b46b2550b04e..a7430c3edeb9 100644
--- a/flang/test/Semantics/OpenMP/map-clause.f90
+++ b/flang/test/Semantics/OpenMP/map-clause.f90
@@ -2,9 +2,12 @@
 ! Check OpenMP MAP clause validity. Section 5.8.3 OpenMP 5.2.
 
 subroutine sb(arr)
+  implicit none
   real(8) :: arr(*)
   real :: a
-
+  integer:: b, c, i
+  common /var/ b, c  
+  
   !ERROR: Assumed-size whole arrays may not appear on the MAP clause
   !$omp target map(arr)
   do i = 1, 100
@@ -12,6 +15,7 @@ subroutine sb(arr)
   enddo
   !$omp end target
 
+  !ERROR: Assumed-size array 'arr' must have explicit final subscript upper bound value
   !$omp target map(arr(:))
   do i = 1, 100
      a = 3.14
@@ -23,4 +27,9 @@ subroutine sb(arr)
      a = 3.14
   enddo
   !$omp end target
+
+ !$omp target map(tofrom: /var/)
+   b = 1
+   c = 2
+ !$omp end target
 end subroutine
diff --git a/flang/test/Semantics/OpenMP/parallel-critical-do.f90 b/flang/test/Semantics/OpenMP/parallel-critical-do.f90
new file mode 100644
index 000000000000..6e10b46dea9a
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/parallel-critical-do.f90
@@ -0,0 +1,18 @@
+! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
+
+! Check that loop iteration variables are private and predetermined, even when
+! nested inside parallel/critical constructs.
+
+!DEF: /test1 (Subroutine) Subprogram
+subroutine test1
+  !DEF: /test1/i ObjectEntity INTEGER(4)
+  integer i
+
+  !$omp parallel default(none)
+    !$omp critical
+      !DEF: /test1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
+      do i = 1, 10
+      end do
+    !$omp end critical
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Semantics/OpenMP/parallel-sections-do.f90 b/flang/test/Semantics/OpenMP/parallel-sections-do.f90
new file mode 100644
index 000000000000..39102175299b
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/parallel-sections-do.f90
@@ -0,0 +1,19 @@
+! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
+
+! Check that loop iteration variables are private and predetermined, even when
+! nested inside parallel/sections constructs.
+
+!DEF: /test1 (Subroutine) Subprogram
+subroutine test1
+  !DEF: /test1/i ObjectEntity INTEGER(4)
+  integer i
+
+  !$omp parallel default(none)
+    !$omp sections
+      !$omp section
+        !DEF: /test1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
+        do i = 1, 10
+        end do
+    !$omp end sections
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Semantics/OpenMP/symbol08.f90 b/flang/test/Semantics/OpenMP/symbol08.f90
index 50f34b736cdb..3af85af74ee9 100644
--- a/flang/test/Semantics/OpenMP/symbol08.f90
+++ b/flang/test/Semantics/OpenMP/symbol08.f90
@@ -94,7 +94,7 @@ subroutine test_taskloop
   !DEF: /test_taskloop/OtherConstruct1/j (OmpPrivate) HostAssoc INTEGER(4)
   !REF: /test_taskloop/OtherConstruct1/i
   do j=1,i
-   !REF: /test_taskloop/a
+   !DEF: /test_taskloop/OtherConstruct1/a (OmpFirstPrivate, OmpImplicit) HostAssoc REAL(4)
    !REF: /test_taskloop/OtherConstruct1/j
    !REF: /test_taskloop/OtherConstruct1/i
    a(j,i) = 3.14
diff --git a/flang/test/Semantics/array-constr-index01.f90 b/flang/test/Semantics/array-constr-index01.f90
new file mode 100644
index 000000000000..560b6be83139
--- /dev/null
+++ b/flang/test/Semantics/array-constr-index01.f90
@@ -0,0 +1,8 @@
+!RUN: %python %S/test_errors.py %s %flang_fc1
+subroutine s(i)
+  type(*) :: i
+  !ERROR: TYPE(*) dummy argument may only be used as an actual argument
+  !ERROR: Assumed-type entity 'i' must be a dummy argument
+  !ERROR: Must have INTEGER type, but is TYPE(*)
+  print *, [(i, i = 1,1)]
+end
diff --git a/flang/test/Semantics/bind-c03.f90 b/flang/test/Semantics/bind-c03.f90
index 65d52e964ca4..c37cb2bccb1f 100644
--- a/flang/test/Semantics/bind-c03.f90
+++ b/flang/test/Semantics/bind-c03.f90
@@ -1,4 +1,4 @@
-! RUN: %python %S/test_errors.py %s %flang_fc1
+! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic
 ! Check for C1521
 ! If proc-language-binding-spec (bind(c)) is specified, the proc-interface
 ! shall appear, it shall be an interface-name, and interface-name shall be
@@ -24,7 +24,10 @@ module m
   !ERROR: An interface name with BIND attribute must be specified if the BIND attribute is specified in a procedure declaration statement
   procedure(proc2), bind(c) :: pc2
 
-  !ERROR: An interface name with BIND attribute must be specified if the BIND attribute is specified in a procedure declaration statement
+  !WARNING: An interface name with BIND attribute should be specified if the BIND attribute is specified in a procedure declaration statement
   procedure(integer), bind(c) :: pc3
 
+  !WARNING: An interface name with BIND attribute should be specified if the BIND attribute is specified in a procedure declaration statement
+  procedure(), bind(c) :: pc5
+
 end
diff --git a/flang/test/Semantics/bind-c06.f90 b/flang/test/Semantics/bind-c06.f90
index 4c25722cb775..3ad3078c4b4a 100644
--- a/flang/test/Semantics/bind-c06.f90
+++ b/flang/test/Semantics/bind-c06.f90
@@ -16,19 +16,19 @@ program main
     integer :: i
   end type
 
-  ! ERROR: A derived type with the BIND attribute cannot have the SEQUENCE attribute
+  ! ERROR: An interoperable derived type cannot have the SEQUENCE attribute
   type, bind(c) :: t1
     sequence
     integer :: x
   end type
 
-  ! ERROR: A derived type with the BIND attribute has type parameter(s)
+  ! ERROR: An interoperable derived type cannot have a type parameter
   type, bind(c) :: t2(k)
     integer, KIND :: k
     integer :: x
   end type
 
-  ! ERROR: A derived type with the BIND attribute cannot extend from another derived type
+  ! ERROR: A derived type with the BIND attribute cannot be an extended derived type
   type, bind(c), extends(v) :: t3
     integer :: x
   end type
@@ -36,21 +36,21 @@ program main
   type, bind(c) :: t4
     integer :: x
    contains
-    ! ERROR: A derived type with the BIND attribute cannot have a type bound procedure
+    ! ERROR: An interoperable derived type cannot have a type bound procedure
     procedure, nopass :: b => s
   end type
 
-  ! WARNING: A derived type with the BIND attribute is empty
+  ! WARNING: A derived type with the BIND attribute should not be empty
   type, bind(c) :: t5
   end type
 
   type, bind(c) :: t6
-    ! ERROR: A derived type with the BIND attribute cannot have a pointer or allocatable component
+    ! ERROR: An interoperable derived type cannot have a pointer or allocatable component
     integer, pointer :: x
   end type
 
   type, bind(c) :: t7
-    ! ERROR: A derived type with the BIND attribute cannot have a pointer or allocatable component
+    ! ERROR: An interoperable derived type cannot have a pointer or allocatable component
     integer, allocatable :: y
   end type
 
@@ -58,14 +58,20 @@ program main
     integer :: x
   end type
 
+  type :: t8a
+    integer, pointer :: x
+  end type
+
   type, bind(c) :: t9
-    !ERROR: Component 'y' of an interoperable derived type must have the BIND attribute
-    type(t8) :: y
+    !WARNING: Derived type of component 'x' of an interoperable derived type should have the BIND attribute
+    type(t8) :: x
+    !ERROR: Component 'y' of an interoperable derived type must have an interoperable type but does not
+    type(t8a) :: y
     integer :: z
   end type
 
   type, bind(c) :: t10
-    !WARNING: A CHARACTER component of a BIND(C) type should have length 1
+    !WARNING: A CHARACTER component of an interoperable type should have length 1
     character(len=2) x
   end type
   type, bind(c) :: t11
@@ -73,7 +79,7 @@ program main
     character(kind=2) x
   end type
   type, bind(c) :: t12
-    !PORTABILITY: A LOGICAL component of a BIND(C) type should have the interoperable KIND=C_BOOL
+    !PORTABILITY: A LOGICAL component of an interoperable type should have the interoperable KIND=C_BOOL
     logical(kind=8) x
   end type
   type, bind(c) :: t13
diff --git a/flang/test/Semantics/bindings01.f90 b/flang/test/Semantics/bindings01.f90
index 7f119d4e55bf..7c2dc6448bb3 100644
--- a/flang/test/Semantics/bindings01.f90
+++ b/flang/test/Semantics/bindings01.f90
@@ -4,7 +4,7 @@
 
 module m
   !ERROR: An ABSTRACT derived type must be extensible
-  !PORTABILITY: A derived type with the BIND attribute is empty
+  !PORTABILITY: A derived type with the BIND attribute should not be empty
   type, abstract, bind(c) :: badAbstract1
   end type
   !ERROR: An ABSTRACT derived type must be extensible
@@ -45,7 +45,7 @@ module m
   end type
   type, extends(intermediate) :: concrete2  ! ensure no false missing binding error
   end type
-  !WARNING: A derived type with the BIND attribute is empty
+  !WARNING: A derived type with the BIND attribute should not be empty
   type, bind(c) :: inextensible1
   end type
   !ERROR: The parent type is not extensible
diff --git a/flang/test/Semantics/cuf02.cuf b/flang/test/Semantics/cuf02.cuf
index a4a229565a3e..58cb3cf49011 100644
--- a/flang/test/Semantics/cuf02.cuf
+++ b/flang/test/Semantics/cuf02.cuf
@@ -29,11 +29,23 @@ module m
   !ERROR: A function may not have ATTRIBUTES(GLOBAL) or ATTRIBUTES(GRID_GLOBAL)
   attributes(global) real function f1
   end
-  recursive attributes(global) subroutine s7 ! ok
+  !ERROR: A kernel subprogram may not be RECURSIVE, PURE, or ELEMENTAL
+  recursive attributes(global) subroutine s7
   end
-  pure attributes(global) subroutine s8 ! ok
+  !ERROR: A kernel subprogram may not be RECURSIVE, PURE, or ELEMENTAL
+  pure attributes(global) subroutine s8
   end
-  elemental attributes(global) subroutine s9 ! ok
+  !ERROR: A kernel subprogram may not be RECURSIVE, PURE, or ELEMENTAL
+  elemental attributes(global) subroutine s9
+  end
+  !ERROR: A kernel subprogram may not be RECURSIVE, PURE, or ELEMENTAL
+  recursive attributes(grid_global) subroutine s10
+  end
+  !ERROR: A kernel subprogram may not be RECURSIVE, PURE, or ELEMENTAL
+  pure attributes(grid_global) subroutine s11
+  end
+  !ERROR: A kernel subprogram may not be RECURSIVE, PURE, or ELEMENTAL
+  elemental attributes(grid_global) subroutine s12
   end
 end
 
diff --git a/flang/test/Semantics/cuf13.cuf b/flang/test/Semantics/cuf13.cuf
index 6db829002fae..dafcffa5e93b 100644
--- a/flang/test/Semantics/cuf13.cuf
+++ b/flang/test/Semantics/cuf13.cuf
@@ -1,13 +1,11 @@
-! RUN: %python %S/test_errors.py %s %flang_fc1
+! RUN: %flang_fc1 -x cuda -fdebug-unparse %s | FileCheck %s
 
 module matching
   interface sub
     module procedure sub_host
     module procedure sub_device
-  end interface
-
-  interface subman
-    module procedure sub_host
+    module procedure sub_managed
+    module procedure sub_unified
   end interface
 
 contains
@@ -19,6 +17,13 @@ contains
     integer, device :: a(:)
   end
 
+  subroutine sub_managed(a)
+    integer, managed :: a(:)
+  end
+
+  subroutine sub_unified(a)
+    integer, unified :: a(:)
+  end
 end module
 
 program m
@@ -26,12 +31,21 @@ program m
 
   integer, pinned, allocatable :: a(:)
   integer, managed, allocatable :: b(:)
+  integer, unified, allocatable :: u(:)
+  integer, device :: d(10)
   logical :: plog
   allocate(a(100), pinned = plog)
   allocate(b(200))
+  allocate(u(100))
 
-  call sub(a)
-
-  call subman(b)
+  call sub(a) ! Should resolve to sub_host
+  call sub(b) ! Should resolve to sub_managed
+  call sub(u) ! Should resolve to sub_unified
+  call sub(d) ! Should resolve to sub_device
 
 end
+
+! CHECK: CALL sub_host
+! CHECK: CALL sub_managed
+! CHECK: CALL sub_unified
+! CHECK: CALL sub_device
diff --git a/flang/test/Semantics/cuf14.cuf b/flang/test/Semantics/cuf14.cuf
new file mode 100644
index 000000000000..29c9ecf90677
--- /dev/null
+++ b/flang/test/Semantics/cuf14.cuf
@@ -0,0 +1,55 @@
+! RUN: bbc -emit-hlfir -fcuda -gpu=unified %s -o - | FileCheck %s
+
+module matching
+  interface host_and_device
+    module procedure sub_host
+    module procedure sub_device
+  end interface
+
+  interface all
+    module procedure sub_host
+    module procedure sub_device
+    module procedure sub_managed
+    module procedure sub_unified
+  end interface
+
+  interface all_without_unified
+    module procedure sub_host
+    module procedure sub_device
+    module procedure sub_managed
+  end interface
+
+contains
+  subroutine sub_host(a)
+    integer :: a(:)
+  end
+
+  subroutine sub_device(a)
+    integer, device :: a(:)
+  end
+
+  subroutine sub_managed(a)
+    integer, managed :: a(:)
+  end
+
+  subroutine sub_unified(a)
+    integer, unified :: a(:)
+  end
+end module
+
+program m
+  use matching
+
+  integer, allocatable :: actual_host(:)
+
+  allocate(actual_host(10))
+
+  call host_and_device(actual_host)     ! Should resolve to sub_device
+  call all(actual_host)                 ! Should resolved to unified
+  call all_without_unified(actual_host) ! Should resolved to managed
+end
+
+! CHECK: fir.call @_QMmatchingPsub_device
+! CHECK: fir.call @_QMmatchingPsub_unified
+! CHECK: fir.call @_QMmatchingPsub_managed
+
diff --git a/flang/test/Semantics/cuf15.cuf b/flang/test/Semantics/cuf15.cuf
new file mode 100644
index 000000000000..030dd6ff8ffe
--- /dev/null
+++ b/flang/test/Semantics/cuf15.cuf
@@ -0,0 +1,55 @@
+! RUN: bbc -emit-hlfir -fcuda -gpu=managed %s -o - | FileCheck %s
+
+module matching
+  interface host_and_device
+    module procedure sub_host
+    module procedure sub_device
+  end interface
+
+  interface all
+    module procedure sub_host
+    module procedure sub_device
+    module procedure sub_managed
+    module procedure sub_unified
+  end interface
+
+  interface all_without_managed
+    module procedure sub_host
+    module procedure sub_device
+    module procedure sub_unified
+  end interface
+
+contains
+  subroutine sub_host(a)
+    integer :: a(:)
+  end
+
+  subroutine sub_device(a)
+    integer, device :: a(:)
+  end
+
+  subroutine sub_managed(a)
+    integer, managed :: a(:)
+  end
+
+  subroutine sub_unified(a)
+    integer, unified :: a(:)
+  end
+end module
+
+program m
+  use matching
+
+  integer, allocatable :: actual_host(:)
+
+  allocate(actual_host(10))
+
+  call host_and_device(actual_host)     ! Should resolve to sub_device
+  call all(actual_host)                 ! Should resolved to unified
+  call all_without_managed(actual_host) ! Should resolved to managed
+end
+
+! CHECK: fir.call @_QMmatchingPsub_device
+! CHECK: fir.call @_QMmatchingPsub_managed
+! CHECK: fir.call @_QMmatchingPsub_unified
+
diff --git a/flang/test/Semantics/data01.f90 b/flang/test/Semantics/data01.f90
index 9046487fa176..fe2d16e95ee1 100644
--- a/flang/test/Semantics/data01.f90
+++ b/flang/test/Semantics/data01.f90
@@ -67,6 +67,6 @@ subroutine CheckValue
   !ERROR: DATA statement value 'b(1_8)' for 'z' is not a constant
   data z / b(1) /
   type(hasAlloc) ha
-  !ERROR: DATA statement value 'hasalloc(a=0_4)' for 'ha' is not a constant
+  !ERROR: DATA statement value 'hasalloc(a=0_4)' for 'ha%a' is not a constant
   data ha / hasAlloc(0) /
 end
diff --git a/flang/test/Semantics/data23.f90 b/flang/test/Semantics/data23.f90
new file mode 100644
index 000000000000..8210e9e62b81
--- /dev/null
+++ b/flang/test/Semantics/data23.f90
@@ -0,0 +1,18 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+program p
+  interface
+    subroutine s
+    end subroutine
+  end interface
+  !ERROR: DATA statement initializations affect 'p' more than once
+  procedure(s), pointer :: p
+  type t
+    procedure(s), pointer, nopass :: p
+  end type
+  !ERROR: DATA statement initializations affect 'x%p' more than once
+  type(t) x
+  data p /s/
+  data p /s/
+  data x%p /s/
+  data x%p /s/
+end
diff --git a/flang/test/Semantics/entry01.f90 b/flang/test/Semantics/entry01.f90
index 64bd954f8ae0..970cd109921a 100644
--- a/flang/test/Semantics/entry01.f90
+++ b/flang/test/Semantics/entry01.f90
@@ -86,11 +86,12 @@ function ifunc()
   entry ibad2()
   !ERROR: ENTRY in a function may not have an alternate return dummy argument
   entry ibadalt(*) ! C1573
-  !ERROR: RESULT(ifunc) may not have the same name as the function
+  !ERROR: ENTRY cannot have RESULT(ifunc) that is not a variable
   entry isameres() result(ifunc) ! C1574
   entry iok()
-  !ERROR: RESULT(iok) may not have the same name as an ENTRY in the function
+  !ERROR: Explicit RESULT('iok') of function 'isameres2' cannot have the same name as a distinct ENTRY into the same scope
   entry isameres2() result(iok) ! C1574
+  !ERROR: Explicit RESULT('iok2') of function 'isameres3' cannot have the same name as a distinct ENTRY into the same scope
   entry isameres3() result(iok2) ! C1574
   !ERROR: 'iok2' is already declared in this scoping unit
   entry iok2()
@@ -255,3 +256,13 @@ subroutine s7(q,q)
   !ERROR: 'z' appears more than once as a dummy argument name in this ENTRY statement
   entry baz(z,z)
 end
+
+!ERROR: Explicit RESULT('f8e1') of function 'f8' cannot have the same name as a distinct ENTRY into the same scope
+function f8() result(f8e1)
+  entry f8e1()
+  entry f8e2() result(f8e2) ! ok
+  !ERROR: Explicit RESULT('f8e1') of function 'f8e3' cannot have the same name as a distinct ENTRY into the same scope
+  entry f8e3() result(f8e1)
+  !ERROR: ENTRY cannot have RESULT(f8) that is not a variable
+  entry f8e4() result(f8)
+end
diff --git a/flang/test/Semantics/equivalence01.f90 b/flang/test/Semantics/equivalence01.f90
index 7ef47fb554b5..ec68e9066a29 100644
--- a/flang/test/Semantics/equivalence01.f90
+++ b/flang/test/Semantics/equivalence01.f90
@@ -244,3 +244,12 @@ module m18
   type(t1) x
   common x
 end
+
+subroutine s19
+  entry e19
+  !ERROR: 'e19' in equivalence set is not a data object
+  equivalence (e19, j)
+  !ERROR: 'e20' in equivalence set is not a data object
+  equivalence (e20, j)
+  entry e20
+end
diff --git a/flang/test/Semantics/modfile12.f90 b/flang/test/Semantics/modfile12.f90
index 17b6e95c4a56..41ab300e00f6 100644
--- a/flang/test/Semantics/modfile12.f90
+++ b/flang/test/Semantics/modfile12.f90
@@ -41,7 +41,7 @@ end
 !  real(4)::y(1_8:8_8)
 !  type::t(c,d)
 !    integer(4),kind::c=1_4
-!    integer(4),len::d=3_4
+!    integer(4),len::d=3_8
 !  end type
 !  type(t(c=4_4,d=:)),allocatable::z
 !  class(t(c=5_4,d=:)),allocatable::z2
diff --git a/flang/test/Semantics/modfile17.f90 b/flang/test/Semantics/modfile17.f90
index 189d8a83de8c..4ab5cc85db25 100644
--- a/flang/test/Semantics/modfile17.f90
+++ b/flang/test/Semantics/modfile17.f90
@@ -97,10 +97,10 @@ end module
 !integer(k8)::j8
 !end type
 !type::defaulted(n1,n2,n4,n8)
-!integer(1),kind::n1=1_1
-!integer(2),kind::n2=int(2_4*int(int(n1,kind=1),kind=4),kind=2)
+!integer(1),kind::n1=1_4
+!integer(2),kind::n2=2_4*int(int(n1,kind=1),kind=4)
 !integer(4),kind::n4=2_4*int(int(n2,kind=2),kind=4)
-!integer(8),kind::n8=int(12_4-int(n4,kind=4),kind=8)
+!integer(8),kind::n8=12_4-int(n4,kind=4)
 !type(capture(k1=int(n1,kind=1),k2=int(n2,kind=2),k4=int(n4,kind=4),k8=n8))::cap
 !end type
 !type,extends(defaulted)::extension(k5)
diff --git a/flang/test/Semantics/pdt03.f90 b/flang/test/Semantics/pdt03.f90
new file mode 100644
index 000000000000..2fb63d21540b
--- /dev/null
+++ b/flang/test/Semantics/pdt03.f90
@@ -0,0 +1,9 @@
+! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+type t(kp1,kp2)
+  integer, kind :: kp1
+  integer(kp1), kind :: kp2 = kp1
+end type
+type(t(kp1=8_8)) x
+!CHECK: 4_4, 8_4, 8_4, 8_8
+print *, kind(x%kp1), x%kp1, kind(x%kp2), x%kp2
+end
diff --git a/flang/test/Semantics/resolve81.f90 b/flang/test/Semantics/resolve81.f90
index 87901fd7d2ef..5f0b66669423 100644
--- a/flang/test/Semantics/resolve81.f90
+++ b/flang/test/Semantics/resolve81.f90
@@ -5,9 +5,9 @@
 ! R801 type-declaration-stmt ->
 !        declaration-type-spec [[, attr-spec]... ::] entity-decl-list
 !  attr-spec values are:
-!    PUBLIC, PRIVATE, ALLOCATABLE, ASYNCHRONOUS, CODIMENSION, CONTIGUOUS, 
-!    DIMENSION (array-spec), EXTERNAL, INTENT (intent-spec), INTRINSIC, 
-!    BIND(C), OPTIONAL, PARAMETER, POINTER, PROTECTED, SAVE, TARGET, VALUE, 
+!    PUBLIC, PRIVATE, ALLOCATABLE, ASYNCHRONOUS, CODIMENSION, CONTIGUOUS,
+!    DIMENSION (array-spec), EXTERNAL, INTENT (intent-spec), INTRINSIC,
+!    BIND(C), OPTIONAL, PARAMETER, POINTER, PROTECTED, SAVE, TARGET, VALUE,
 !    VOLATILE
 module m
 
@@ -28,7 +28,7 @@ module m
   !WARNING: Attribute 'EXTERNAL' cannot be used more than once
   real, external, external :: externFunc
   !WARNING: Attribute 'INTRINSIC' cannot be used more than once
-  !ERROR: An interface name with BIND attribute must be specified if the BIND attribute is specified in a procedure declaration statement
+  !ERROR: 'cos' may not have both the BIND(C) and INTRINSIC attributes
   real, intrinsic, bind(c), intrinsic :: cos
   !WARNING: Attribute 'BIND(C)' cannot be used more than once
   integer, bind(c), volatile, bind(c) :: bindVar
diff --git a/flang/test/Semantics/resolve85.f90 b/flang/test/Semantics/resolve85.f90
index f598456f9830..9b9358ecf477 100644
--- a/flang/test/Semantics/resolve85.f90
+++ b/flang/test/Semantics/resolve85.f90
@@ -24,7 +24,7 @@ module m
   end type derived4
 
   !WARNING: Attribute 'BIND(C)' cannot be used more than once
-  !WARNING: A derived type with the BIND attribute is empty
+  !WARNING: A derived type with the BIND attribute should not be empty
   type, bind(c), public, bind(c) :: derived5
   end type derived5
 
diff --git a/flang/test/Semantics/stmt-func01.f90 b/flang/test/Semantics/stmt-func01.f90
index 733a7a56dfdb..3c9ffa565900 100644
--- a/flang/test/Semantics/stmt-func01.f90
+++ b/flang/test/Semantics/stmt-func01.f90
@@ -83,3 +83,11 @@ subroutine s4
   !ERROR: VOLATILE attribute may apply only to a variable
   sf(x) = 1.
 end
+
+subroutine s5
+  !ERROR: Invalid specification expression: reference to impure function 'k'
+  real x(k())
+  !WARNING: Name 'k' from host scope should have a type declaration before its local statement function definition
+  !ERROR: 'k' is already declared in this scoping unit
+  k() = 0.0
+end
diff --git a/flang/test/Transforms/debug-90683.fir b/flang/test/Transforms/debug-90683.fir
new file mode 100644
index 000000000000..9da0e5347d3f
--- /dev/null
+++ b/flang/test/Transforms/debug-90683.fir
@@ -0,0 +1,25 @@
+// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s -o - | FileCheck %s
+
+// This test checks that debug information for fir.real type works ok.
+
+module attributes {} {
+  func.func @_QPfn1(%arg0: !fir.ref<!fir.complex<8>> {fir.bindc_name = "a"} ) {
+    %0 = fir.declare %arg0 {uniq_name = "_QFfn1Ea"} : (!fir.ref<!fir.complex<8>>) -> !fir.ref<!fir.complex<8>>
+    %1 = fir.alloca f32 {bindc_name = "abserror", uniq_name = "_QFfn1Eabserror"}
+    %2 = fir.declare %1 {uniq_name = "_QFfn1Eabserror"} : (!fir.ref<f32>) -> !fir.ref<f32>
+    %3 = fir.load %0 : !fir.ref<!fir.complex<8>>
+    %4 = fir.extract_value %3, [0 : i32] : (!fir.complex<8>) -> !fir.real<8>
+    %5 = fir.extract_value %3, [1 : i32] : (!fir.complex<8>) -> !fir.real<8>
+    %6 = fir.call @cabs(%4, %5) : (!fir.real<8>, !fir.real<8>) -> f64
+    %7 = fir.convert %6 : (f64) -> f32
+    fir.store %7 to %2 : !fir.ref<f32>
+    return
+  } loc(#loc1)
+  func.func private @cabs(!fir.real<8>, !fir.real<8>) -> f64 attributes {fir.bindc_name = "cabs", fir.runtime}
+} loc(#loc)
+#loc1 = loc("test.f90":5:1)
+#loc = loc("test.f90":0:0)
+
+// CHECK-DAG: #[[TY:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 64, encoding = DW_ATE_float>
+// CHECK-DAG: #[[TY1:.*]] = #llvm.di_subroutine_type<callingConvention = DW_CC_normal, types = #[[TY]], #[[TY]], #[[TY]]>
+// CHECK-DAG: #{{.*}} = #llvm.di_subprogram<scope = #{{.*}}, name = "cabs", linkageName = "cabs", file = #{{.*}}, line = {{.*}}, scopeLine = {{.*}}, type = #[[TY1]]>
diff --git a/flang/test/Transforms/omp-descriptor-map-info-gen.fir b/flang/test/Transforms/omp-descriptor-map-info-gen.fir
deleted file mode 100644
index 05d05b0ecb59..000000000000
--- a/flang/test/Transforms/omp-descriptor-map-info-gen.fir
+++ /dev/null
@@ -1,44 +0,0 @@
-// RUN: fir-opt --omp-descriptor-map-info-gen %s | FileCheck %s
-
-module attributes {omp.is_target_device = false} {
-  func.func @test_descriptor_expansion_pass(%arg0: !fir.box<!fir.array<?xi32>>) {
-    %0 = fir.alloca !fir.box<!fir.heap<i32>>
-    %1 = fir.zero_bits !fir.heap<i32>
-    %2:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "test"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-    %3 = fir.embox %1 : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
-    fir.store %3 to %0 : !fir.ref<!fir.box<!fir.heap<i32>>>
-    %4:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "test2"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
-    %5 = fir.allocmem i32 {fir.must_be_heap = true}
-    %6 = fir.embox %5 : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
-    fir.store %6 to %4#1 : !fir.ref<!fir.box<!fir.heap<i32>>>
-    %c0 = arith.constant 1 : index  
-    %c1 = arith.constant 0 : index
-    %c2 = arith.constant 10 : index
-    %dims:3 = fir.box_dims %2#1, %c1 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-    %bounds = omp.map.bounds lower_bound(%c1 : index) upper_bound(%c2 : index) extent(%dims#1 : index) stride(%dims#2 : index) start_idx(%c0 : index) {stride_in_bytes = true}
-    %7 = fir.box_addr %2#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
-    %8 = omp.map.info var_ptr(%4#1 : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.box<!fir.heap<i32>>) map_clauses(tofrom) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<i32>>>
-    %9 = omp.map.info var_ptr(%7 : !fir.ref<!fir.array<?xi32>>, !fir.array<?xi32>) map_clauses(from) capture(ByRef) bounds(%bounds) -> !fir.ref<!fir.array<?xi32>>
-    omp.target map_entries(%8 -> %arg1, %9 -> %arg2 : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.array<?xi32>>) {
-    ^bb0(%arg1: !fir.ref<!fir.box<!fir.heap<i32>>>, %arg2: !fir.ref<!fir.array<?xi32>>):
-      omp.terminator
-    }
-    return 
-  }
-}
- 
-// CHECK: func.func @test_descriptor_expansion_pass(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>) {
-// CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
-// CHECK: %[[ALLOCA2:.*]] = fir.alloca !fir.box<!fir.heap<i32>>
-// CHECK: %[[DECLARE1:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "test"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-// CHECK: %[[DECLARE2:.*]]:2 = hlfir.declare %[[ALLOCA2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "test2"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
-// CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%{{.*}} : index) upper_bound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) start_idx(%{{.*}} : index) {stride_in_bytes = true}
-// CHECK: %[[BASE_ADDR_OFF:.*]] = fir.box_offset %[[DECLARE2]]#1 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.llvm_ptr<!fir.ref<i32>>
-// CHECK: %[[DESC_MEMBER_MAP:.*]] = omp.map.info var_ptr(%[[DECLARE2]]#1 : !fir.ref<!fir.box<!fir.heap<i32>>>, i32) var_ptr_ptr(%[[BASE_ADDR_OFF]] : !fir.llvm_ptr<!fir.ref<i32>>) map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr<!fir.ref<i32>> {name = ""}
-// CHECK: %[[DESC_PARENT_MAP:.*]] = omp.map.info var_ptr(%[[DECLARE2]]#1 : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.box<!fir.heap<i32>>) map_clauses(tofrom) capture(ByRef) members(%[[DESC_MEMBER_MAP]] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>>
-// CHECK: fir.store %[[DECLARE1]]#1 to %[[ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
-// CHECK: %[[BASE_ADDR_OFF_2:.*]] = fir.box_offset %[[ALLOCA]] base_addr : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-// CHECK: %[[DESC_MEMBER_MAP_2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.array<?xi32>) var_ptr_ptr(%[[BASE_ADDR_OFF_2]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
-// CHECK: %[[DESC_PARENT_MAP_2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.box<!fir.array<?xi32>>) map_clauses(from) capture(ByRef) members(%15 : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.array<?xi32>>
-// CHECK: omp.target map_entries(%[[DESC_MEMBER_MAP]] -> %[[ARG1:.*]], %[[DESC_PARENT_MAP]] -> %[[ARG2:.*]], %[[DESC_MEMBER_MAP_2]] -> %[[ARG3:.*]], %[[DESC_PARENT_MAP_2]] -> %[[ARG4:.*]] : {{.*}}) {
-// CHECK: ^bb0(%[[ARG1]]: !fir.llvm_ptr<!fir.ref<i32>>, %[[ARG2]]: !fir.ref<!fir.box<!fir.heap<i32>>>, %[[ARG3]]: !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>, %[[ARG4]]: !fir.ref<!fir.array<?xi32>>):
diff --git a/flang/test/Transforms/omp-map-info-finalization.fir b/flang/test/Transforms/omp-map-info-finalization.fir
new file mode 100644
index 000000000000..9d776b674151
--- /dev/null
+++ b/flang/test/Transforms/omp-map-info-finalization.fir
@@ -0,0 +1,99 @@
+// RUN: fir-opt --split-input-file --omp-map-info-finalization %s | FileCheck %s
+
+module attributes {omp.is_target_device = false} {
+  func.func @test_descriptor_expansion_pass(%arg0: !fir.box<!fir.array<?xi32>>) {
+    %0 = fir.alloca !fir.box<!fir.heap<i32>>
+    %1 = fir.zero_bits !fir.heap<i32>
+    %2:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "test"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+    %3 = fir.embox %1 : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
+    fir.store %3 to %0 : !fir.ref<!fir.box<!fir.heap<i32>>>
+    %4:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "test2"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
+    %5 = fir.allocmem i32 {fir.must_be_heap = true}
+    %6 = fir.embox %5 : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
+    fir.store %6 to %4#1 : !fir.ref<!fir.box<!fir.heap<i32>>>
+    %c0 = arith.constant 1 : index  
+    %c1 = arith.constant 0 : index
+    %c2 = arith.constant 10 : index
+    %dims:3 = fir.box_dims %2#1, %c1 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+    %bounds = omp.map.bounds lower_bound(%c1 : index) upper_bound(%c2 : index) extent(%dims#1 : index) stride(%dims#2 : index) start_idx(%c0 : index) {stride_in_bytes = true}
+    %7 = fir.box_addr %2#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+    %8 = omp.map.info var_ptr(%4#1 : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.box<!fir.heap<i32>>) map_clauses(tofrom) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<i32>>>
+    %9 = omp.map.info var_ptr(%7 : !fir.ref<!fir.array<?xi32>>, !fir.array<?xi32>) map_clauses(from) capture(ByRef) bounds(%bounds) -> !fir.ref<!fir.array<?xi32>>
+    omp.target map_entries(%8 -> %arg1, %9 -> %arg2 : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.array<?xi32>>) {
+    ^bb0(%arg1: !fir.ref<!fir.box<!fir.heap<i32>>>, %arg2: !fir.ref<!fir.array<?xi32>>):
+      omp.terminator
+    }
+    return 
+  }
+}
+ 
+// CHECK: func.func @test_descriptor_expansion_pass(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>) {
+// CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
+// CHECK: %[[ALLOCA2:.*]] = fir.alloca !fir.box<!fir.heap<i32>>
+// CHECK: %[[DECLARE1:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "test"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+// CHECK: %[[DECLARE2:.*]]:2 = hlfir.declare %[[ALLOCA2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "test2"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
+// CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%{{.*}} : index) upper_bound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) start_idx(%{{.*}} : index) {stride_in_bytes = true}
+// CHECK: %[[BASE_ADDR_OFF:.*]] = fir.box_offset %[[DECLARE2]]#1 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.llvm_ptr<!fir.ref<i32>>
+// CHECK: %[[DESC_MEMBER_MAP:.*]] = omp.map.info var_ptr(%[[DECLARE2]]#1 : !fir.ref<!fir.box<!fir.heap<i32>>>, i32) var_ptr_ptr(%[[BASE_ADDR_OFF]] : !fir.llvm_ptr<!fir.ref<i32>>) map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr<!fir.ref<i32>> {name = ""}
+// CHECK: %[[DESC_PARENT_MAP:.*]] = omp.map.info var_ptr(%[[DECLARE2]]#1 : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.box<!fir.heap<i32>>) map_clauses(tofrom) capture(ByRef) members(%[[DESC_MEMBER_MAP]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>>
+// CHECK: fir.store %[[DECLARE1]]#1 to %[[ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+// CHECK: %[[BASE_ADDR_OFF_2:.*]] = fir.box_offset %[[ALLOCA]] base_addr : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
+// CHECK: %[[DESC_MEMBER_MAP_2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.array<?xi32>) var_ptr_ptr(%[[BASE_ADDR_OFF_2]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
+// CHECK: %[[DESC_PARENT_MAP_2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.box<!fir.array<?xi32>>) map_clauses(from) capture(ByRef) members(%[[DESC_MEMBER_MAP_2]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.array<?xi32>>
+// CHECK: omp.target map_entries(%[[DESC_MEMBER_MAP]] -> %[[ARG1:.*]], %[[DESC_PARENT_MAP]] -> %[[ARG2:.*]], %[[DESC_MEMBER_MAP_2]] -> %[[ARG3:.*]], %[[DESC_PARENT_MAP_2]] -> %[[ARG4:.*]] : {{.*}}) {
+// CHECK: ^bb0(%[[ARG1]]: !fir.llvm_ptr<!fir.ref<i32>>, %[[ARG2]]: !fir.ref<!fir.box<!fir.heap<i32>>>, %[[ARG3]]: !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>, %[[ARG4]]: !fir.ref<!fir.array<?xi32>>):
+
+// -----
+
+module attributes {omp.is_target_device = false} {
+  func.func @test_derived_type_map_operand_and_block_addition(%arg0: !fir.ref<!fir.type<_QFTdtype{ix:i32,rx:f32,zx:!fir.complex<4>,nested:!fir.box<!fir.heap<!fir.type<_QFTdtype>>>,ry:f32}>>) {
+    %0 = hlfir.designate %arg0{"rx"}   : (!fir.ref<!fir.type<_QFTdtype{ix:i32,rx:f32,zx:!fir.complex<4>,nested:!fir.box<!fir.heap<!fir.type<_QFTdtype>>>,ry:f32}>>) -> !fir.ref<f32>
+    %1 = omp.map.info var_ptr(%0 : !fir.ref<f32>, f32) map_clauses(from) capture(ByRef) -> !fir.ref<f32> {name = "scalar_struct%rx"}
+    %2 = hlfir.designate %arg0{"ry"}   : (!fir.ref<!fir.type<_QFTdtype{ix:i32,rx:f32,zx:!fir.complex<4>,nested:!fir.box<!fir.heap<!fir.type<_QFTdtype>>>,ry:f32}>>) -> !fir.ref<f32>
+    %3 = omp.map.info var_ptr(%2 : !fir.ref<f32>, f32) map_clauses(from) capture(ByRef) -> !fir.ref<f32> {name = "scalar_struct%ry"}
+    %4 = omp.map.info var_ptr(%arg0 : !fir.ref<!fir.type<_QFTdtype{ix:i32,rx:f32,zx:!fir.complex<4>,nested:!fir.box<!fir.heap<!fir.type<_QFTdtype>>>,ry:f32}>>, !fir.type<_QFTdtype{ix:i32,rx:f32,zx:!fir.complex<4>,nested:!fir.box<!fir.heap<!fir.type<_QFTdtype>>>,ry:f32}>) map_clauses(from) capture(ByRef) members(%1, %3 : [1], [4] : !fir.ref<f32>, !fir.ref<f32>) -> !fir.ref<!fir.type<_QFTdtype{ix:i32,rx:f32,zx:!fir.complex<4>,nested:!fir.box<!fir.heap<!fir.type<_QFTdtype>>>,ry:f32}>> {name = "scalar_struct", partial_map = true}
+    omp.target map_entries(%4 -> %arg1 : !fir.ref<!fir.type<_QFTdtype{ix:i32,rx:f32,zx:!fir.complex<4>,nested:!fir.box<!fir.heap<!fir.type<_QFTdtype>>>,ry:f32}>>) {
+    ^bb0(%arg1: !fir.ref<!fir.type<_QFTdtype{ix:i32,rx:f32,zx:!fir.complex<4>,nested:!fir.box<!fir.heap<!fir.type<_QFTdtype>>>,ry:f32}>>):
+      omp.terminator
+    }
+    return
+  }
+}
+
+// CHECK: func.func @test_derived_type_map_operand_and_block_addition(%{{.*}}: !fir.ref<!fir.type<_QFTdtype{ix:i32,rx:f32,zx:!fir.complex<4>,nested:!fir.box<!fir.heap<!fir.type<_QFTdtype>>>,ry:f32}>>) { 
+// CHECK:   %[[MAP_MEMBER_1:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<f32>, f32) map_clauses(from) capture(ByRef) -> !fir.ref<f32> {name = "scalar_struct%rx"}
+// CHECK:   %[[MAP_MEMBER_2:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<f32>, f32) map_clauses(from) capture(ByRef) -> !fir.ref<f32> {name = "scalar_struct%ry"}
+// CHECK:   %[[MAP_PARENT:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.type<_QFTdtype{ix:i32,rx:f32,zx:!fir.complex<4>,nested:!fir.box<!fir.heap<!fir.type<_QFTdtype>>>,ry:f32}>>, !fir.type<_QFTdtype{ix:i32,rx:f32,zx:!fir.complex<4>,nested:!fir.box<!fir.heap<!fir.type<_QFTdtype>>>,ry:f32}>) map_clauses(from) capture(ByRef) members(%[[MAP_MEMBER_1]], %[[MAP_MEMBER_2]] : [1], [4] : !fir.ref<f32>, !fir.ref<f32>) -> !fir.ref<!fir.type<_QFTdtype{ix:i32,rx:f32,zx:!fir.complex<4>,nested:!fir.box<!fir.heap<!fir.type<_QFTdtype>>>,ry:f32}>> {name = "scalar_struct", partial_map = true}
+// CHECK:   omp.target map_entries(%[[MAP_MEMBER_1]] -> %[[ARG1:.*]], %[[MAP_MEMBER_2]] -> %[[ARG2:.*]], %[[MAP_PARENT]] -> %[[ARG3:.*]] : !fir.ref<f32>, !fir.ref<f32>, !fir.ref<!fir.type<_QFTdtype{ix:i32,rx:f32,zx:!fir.complex<4>,nested:!fir.box<!fir.heap<!fir.type<_QFTdtype>>>,ry:f32}>>) {
+// CHECK:     ^bb0(%[[ARG1]]: !fir.ref<f32>, %[[ARG2]]: !fir.ref<f32>, %[[ARG3]]: !fir.ref<!fir.type<_QFTdtype{ix:i32,rx:f32,zx:!fir.complex<4>,nested:!fir.box<!fir.heap<!fir.type<_QFTdtype>>>,ry:f32}>>):
+
+// -----
+
+module attributes {omp.is_target_device = false} {
+func.func @test_nested_derived_type_map_operand_and_block_addition(%arg0: !fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTscalar_and_array{r:f32,n:!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>}>>) {        
+    %0 = fir.declare %arg0 {uniq_name = "_QFmaptype_derived_nested_explicit_multiple_membersEsa"} : (!fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTscalar_and_array{r:f32,n:!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>}>>) -> !fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTscalar_and_array{r:f32,n:!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>}>>
+    %1 = fir.field_index n, !fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTscalar_and_array{r:f32,n:!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>}>
+    %2 = fir.coordinate_of %0, %1 : (!fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTscalar_and_array{r:f32,n:!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>}>>, !fir.field) -> !fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>>
+    %3 = fir.field_index i, !fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>
+    %4 = fir.coordinate_of %2, %3 : (!fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>>, !fir.field) -> !fir.ref<i32>
+    %5 = omp.map.info var_ptr(%4 : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "sa%n%i"}
+    %6 = fir.field_index n, !fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTscalar_and_array{r:f32,n:!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>}>
+    %7 = fir.coordinate_of %0, %6 : (!fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTscalar_and_array{r:f32,n:!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>}>>, !fir.field) -> !fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>>
+    %8 = fir.field_index r, !fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>
+    %9 = fir.coordinate_of %7, %8 : (!fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>>, !fir.field) -> !fir.ref<f32>
+    %10 = omp.map.info var_ptr(%9 : !fir.ref<f32>, f32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<f32> {name = "sa%n%r"}
+    %11 = omp.map.info var_ptr(%0 : !fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTscalar_and_array{r:f32,n:!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>}>>, !fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTscalar_and_array{r:f32,n:!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>}>) map_clauses(tofrom) capture(ByRef) members(%5, %10 : [1,0], [1,1] : !fir.ref<i32>, !fir.ref<f32>) -> !fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTscalar_and_array{r:f32,n:!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>}>> {name = "sa", partial_map = true}
+    omp.target map_entries(%11 -> %arg1 : !fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTscalar_and_array{r:f32,n:!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>}>>) {
+    ^bb0(%arg1: !fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTscalar_and_array{r:f32,n:!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>}>>):
+      omp.terminator
+    }
+    return
+  }
+}
+
+// CHECK: func.func @test_nested_derived_type_map_operand_and_block_addition(%{{.*}}: !fir.ref<!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTscalar_and_array{r:f32,n:!fir.type<_QFmaptype_derived_nested_explicit_multiple_membersTnested{i:i32,r:f32}>}>>) { 
+// CHECK:   %[[MAP_MEMBER_1:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "sa%n%i"}
+// CHECK:   %[[MAP_MEMBER_2:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<f32>, f32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<f32> {name = "sa%n%r"}
+// CHECK:   %[[MAP_PARENT:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}}, {{.*}}) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBER_1]], %[[MAP_MEMBER_2]] : [1,0], [1,1] : !fir.ref<i32>, !fir.ref<f32>) -> {{.*}} {name = "sa", partial_map = true}
+// CHECK:   omp.target map_entries(%[[MAP_MEMBER_1]] -> %[[ARG1:.*]], %[[MAP_MEMBER_2]] -> %[[ARG2:.*]], %[[MAP_PARENT]] -> %[[ARG3:.*]] : !fir.ref<i32>, !fir.ref<f32>, {{.*}}) {
+// CHECK:     ^bb0(%[[ARG1]]: !fir.ref<i32>, %[[ARG2]]: !fir.ref<f32>, %[[ARG3]]: {{.*}}):
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index f9349d50055a..f7092d35eeb5 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -204,6 +204,10 @@ static llvm::cl::opt<bool> enableCUDA("fcuda",
                                       llvm::cl::desc("enable CUDA Fortran"),
                                       llvm::cl::init(false));
 
+static llvm::cl::opt<std::string>
+    enableGPUMode("gpu", llvm::cl::desc("Enable GPU Mode managed|unified"),
+                  llvm::cl::init(""));
+
 static llvm::cl::opt<bool> fixedForm("ffixed-form",
                                      llvm::cl::desc("enable fixed form"),
                                      llvm::cl::init(false));
@@ -427,8 +431,9 @@ static mlir::LogicalResult convertFortranSourceToMLIR(
     pm.addPass(std::make_unique<Fortran::lower::VerifierPass>());
 
     // Add O2 optimizer pass pipeline.
-    fir::createDefaultFIROptimizerPassPipeline(
-        pm, MLIRToLLVMPassPipelineConfig(llvm::OptimizationLevel::O2));
+    MLIRToLLVMPassPipelineConfig config(llvm::OptimizationLevel::O2);
+    fir::registerDefaultInlinerPass(config);
+    fir::createDefaultFIROptimizerPassPipeline(pm, config);
   }
 
   if (mlir::succeeded(pm.run(mlirModule))) {
@@ -494,6 +499,12 @@ int main(int argc, char **argv) {
     options.features.Enable(Fortran::common::LanguageFeature::CUDA);
   }
 
+  if (enableGPUMode == "managed") {
+    options.features.Enable(Fortran::common::LanguageFeature::CudaManaged);
+  } else if (enableGPUMode == "unified") {
+    options.features.Enable(Fortran::common::LanguageFeature::CudaUnified);
+  }
+
   if (fixedForm) {
     options.isFixedForm = fixedForm;
   }
diff --git a/flang/tools/tco/tco.cpp b/flang/tools/tco/tco.cpp
index 45284e74f584..399ea1362fda 100644
--- a/flang/tools/tco/tco.cpp
+++ b/flang/tools/tco/tco.cpp
@@ -140,6 +140,7 @@ compileFIR(const mlir::PassPipelineCLParser &passPipeline) {
       fir::createDefaultFIRCodeGenPassPipeline(pm, config);
     } else {
       // Run tco with O2 by default.
+      fir::registerDefaultInlinerPass(config);
       fir::createMLIRToLLVMPassPipeline(pm, config);
     }
     fir::addLLVMDialectToLLVMPass(pm, out.os());
diff --git a/libc/fuzzing/__support/CMakeLists.txt b/libc/fuzzing/__support/CMakeLists.txt
index d4f6db71fdd8..b088761f4586 100644
--- a/libc/fuzzing/__support/CMakeLists.txt
+++ b/libc/fuzzing/__support/CMakeLists.txt
@@ -5,3 +5,21 @@ add_libc_fuzzer(
   DEPENDS
     libc.src.__support.big_int
 )
+
+add_libc_fuzzer(
+  hashtable_fuzz
+  SRCS
+    hashtable_fuzz.cpp
+  DEPENDS
+    libc.src.__support.HashTable.table
+)
+
+add_libc_fuzzer(
+  hashtable_opt_fuzz
+  SRCS
+    hashtable_fuzz.cpp
+  DEPENDS
+    libc.src.__support.HashTable.table
+  COMPILE_OPTIONS
+    -D__LIBC_EXPLICIT_SIMD_OPT
+) 
diff --git a/libc/fuzzing/__support/hashtable_fuzz.cpp b/libc/fuzzing/__support/hashtable_fuzz.cpp
new file mode 100644
index 000000000000..07f105771411
--- /dev/null
+++ b/libc/fuzzing/__support/hashtable_fuzz.cpp
@@ -0,0 +1,182 @@
+//===-- hashtable_fuzz.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc hashtable implementations.
+///
+//===----------------------------------------------------------------------===//
+#include "include/llvm-libc-types/ENTRY.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/HashTable/table.h"
+
+namespace LIBC_NAMESPACE {
+
+// A fuzzing payload starts with
+// - uint16_t: initial capacity for table A
+// - uint64_t: seed for table A
+// - uint16_t: initial capacity for table B
+// - uint64_t: seed for table B
+// Followed by a sequence of actions:
+// - CrossCheck: only a single byte valued (4 mod 5)
+// - Find: a single byte valued (3 mod 5) followed by a null-terminated string
+// - Insert: a single byte valued (0,1,2 mod 5) followed by a null-terminated
+// string
+static constexpr size_t INITIAL_HEADER_SIZE =
+    2 * (sizeof(uint16_t) + sizeof(uint64_t));
+extern "C" size_t LLVMFuzzerMutate(uint8_t *data, size_t size, size_t max_size);
+extern "C" size_t LLVMFuzzerCustomMutator(uint8_t *data, size_t size,
+                                          size_t max_size, unsigned int seed) {
+  size = LLVMFuzzerMutate(data, size, max_size);
+  // not enough to read the initial capacities and seeds
+  if (size < INITIAL_HEADER_SIZE)
+    return 0;
+
+  // skip the initial capacities and seeds
+  size_t i = INITIAL_HEADER_SIZE;
+  while (i < size) {
+    // cross check
+    if (static_cast<uint8_t>(data[i]) % 5 == 4) {
+      // skip the cross check byte
+      ++i;
+      continue;
+    }
+
+    // find or insert
+    // check if there is enough space for the action byte and the
+    // null-terminator
+    if (i + 2 >= max_size)
+      return i;
+    // skip the action byte
+    ++i;
+    // skip the null-terminated string
+    while (i < max_size && data[i] != 0)
+      ++i;
+    // in the case the string is not null-terminated, null-terminate it
+    if (i == max_size && data[i - 1] != 0) {
+      data[i - 1] = 0;
+      return max_size;
+    }
+
+    // move to the next action
+    ++i;
+  }
+  // return the new size
+  return i;
+}
+
+// a tagged union
+struct Action {
+  enum class Tag { Find, Insert, CrossCheck } tag;
+  cpp::string_view key;
+};
+
+static struct {
+  size_t remaining;
+  const char *buffer;
+
+  template <typename T> T next() {
+    static_assert(cpp::is_integral<T>::value, "T must be an integral type");
+    union {
+      T result;
+      char data[sizeof(T)];
+    };
+    for (size_t i = 0; i < sizeof(result); i++)
+      data[i] = buffer[i];
+    buffer += sizeof(result);
+    remaining -= sizeof(result);
+    return result;
+  }
+
+  cpp::string_view next_string() {
+    cpp::string_view result(buffer);
+    buffer = result.end() + 1;
+    remaining -= result.size() + 1;
+    return result;
+  }
+
+  Action next_action() {
+    uint8_t byte = next<uint8_t>();
+    switch (byte % 5) {
+    case 4:
+      return {Action::Tag::CrossCheck, {}};
+    case 3:
+      return {Action::Tag::Find, next_string()};
+    default:
+      return {Action::Tag::Insert, next_string()};
+    }
+  }
+} global_status;
+
+class HashTable {
+  internal::HashTable *table;
+
+public:
+  HashTable(uint64_t size, uint64_t seed)
+      : table(internal::HashTable::allocate(size, seed)) {}
+  HashTable(internal::HashTable *table) : table(table) {}
+  ~HashTable() { internal::HashTable::deallocate(table); }
+  HashTable(HashTable &&other) : table(other.table) { other.table = nullptr; }
+  bool is_valid() const { return table != nullptr; }
+  ENTRY *find(const char *key) { return table->find(key); }
+  ENTRY *insert(const ENTRY &entry) {
+    return internal::HashTable::insert(this->table, entry);
+  }
+  using iterator = internal::HashTable::iterator;
+  iterator begin() const { return table->begin(); }
+  iterator end() const { return table->end(); }
+};
+
+HashTable next_hashtable() {
+  size_t size = global_status.next<uint16_t>();
+  uint64_t seed = global_status.next<uint64_t>();
+  return HashTable(size, seed);
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  global_status.buffer = reinterpret_cast<const char *>(data);
+  global_status.remaining = size;
+  if (global_status.remaining < INITIAL_HEADER_SIZE)
+    return 0;
+
+  HashTable table_a = next_hashtable();
+  HashTable table_b = next_hashtable();
+  for (;;) {
+    if (global_status.remaining == 0)
+      break;
+    Action action = global_status.next_action();
+    switch (action.tag) {
+    case Action::Tag::Find: {
+      if (static_cast<bool>(table_a.find(action.key.data())) !=
+          static_cast<bool>(table_b.find(action.key.data())))
+        __builtin_trap();
+      break;
+    }
+    case Action::Tag::Insert: {
+      char *ptr = const_cast<char *>(action.key.data());
+      ENTRY *a = table_a.insert(ENTRY{ptr, ptr});
+      ENTRY *b = table_b.insert(ENTRY{ptr, ptr});
+      if (a->data != b->data)
+        __builtin_trap();
+      break;
+    }
+    case Action::Tag::CrossCheck: {
+      for (ENTRY a : table_a)
+        if (const ENTRY *b = table_b.find(a.key); a.data != b->data)
+          __builtin_trap();
+
+      for (ENTRY b : table_b)
+        if (const ENTRY *a = table_a.find(b.key); a->data != b.data)
+          __builtin_trap();
+
+      break;
+    }
+    }
+  }
+  return 0;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/fuzzing/__support/uint_fuzz.cpp b/libc/fuzzing/__support/uint_fuzz.cpp
index 07149f511b83..109375f84da7 100644
--- a/libc/fuzzing/__support/uint_fuzz.cpp
+++ b/libc/fuzzing/__support/uint_fuzz.cpp
@@ -1,3 +1,14 @@
+//===-- uint_fuzz.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc unsigned integer utilities.
+///
+//===----------------------------------------------------------------------===//
 #include "src/__support/CPP/bit.h"
 #include "src/__support/big_int.h"
 #include "src/string/memory_utils/inline_memcpy.h"
diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index 179b05e6ee96..754934251430 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -68,4 +68,13 @@ add_proxy_header_library(
     libc.include.llvm-libc-macros.sys_epoll_macros
 )
 
+add_proxy_header_library(
+  time_macros
+  HDRS
+    time_macros.h
+  FULL_BUILD_DEPENDS
+    libc.include.time
+    libc.include.llvm-libc-macros.time_macros
+)
+
 add_subdirectory(types)
diff --git a/libc/hdr/fenv_macros.h b/libc/hdr/fenv_macros.h
index 1ad28cc278a9..a2e4462ef02d 100644
--- a/libc/hdr/fenv_macros.h
+++ b/libc/hdr/fenv_macros.h
@@ -17,6 +17,52 @@
 
 #include <fenv.h>
 
+// In some environment, FE_ALL_EXCEPT is set to 0 and the remaining exceptions
+// FE_* are missing.
+#if (FE_ALL_EXCEPT == 0)
+#ifndef FE_DIVBYZERO
+#define FE_DIVBYZERO 0
+#endif // FE_DIVBYZERO
+
+#ifndef FE_INEXACT
+#define FE_INEXACT 0
+#endif // FE_INEXACT
+
+#ifndef FE_INVALID
+#define FE_INVALID 0
+#endif // FE_INVALID
+
+#ifndef FE_OVERFLOW
+#define FE_OVERFLOW 0
+#endif // FE_OVERFLOW
+
+#ifndef FE_UNDERFLOW
+#define FE_UNDERFLOW 0
+#endif // FE_UNDERFLOW
+#else
+// If this is not provided by the system, define it for use internally.
+#ifndef __FE_DENORM
+#define __FE_DENORM (1 << 6)
+#endif
+#endif
+
+// Rounding mode macros might be missing.
+#ifndef FE_DOWNWARD
+#define FE_DOWNWARD 0x400
+#endif // FE_DOWNWARD
+
+#ifndef FE_TONEAREST
+#define FE_TONEAREST 0
+#endif // FE_TONEAREST
+
+#ifndef FE_TOWARDZERO
+#define FE_TOWARDZERO 0xC00
+#endif // FE_TOWARDZERO
+
+#ifndef FE_UPWARD
+#define FE_UPWARD 0x800
+#endif // FE_UPWARD
+
 #endif // LLVM_LIBC_FULL_BUILD
 
 #endif // LLVM_LIBC_HDR_FENV_MACROS_H
diff --git a/libc/hdr/time_macros.h b/libc/hdr/time_macros.h
new file mode 100644
index 000000000000..dc36fe66f7a8
--- /dev/null
+++ b/libc/hdr/time_macros.h
@@ -0,0 +1,22 @@
+//===-- Definition of macros from time.h ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_TIME_MACROS_H
+#define LLVM_LIBC_HDR_TIME_MACROS_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-macros/time-macros.h"
+
+#else // Overlay mode
+
+#include <time.h>
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TIME_MACROS_H
diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
index 46a66ec59020..3a1bb2f3c340 100644
--- a/libc/hdr/types/CMakeLists.txt
+++ b/libc/hdr/types/CMakeLists.txt
@@ -63,3 +63,48 @@ add_proxy_header_library(
     libc.include.llvm-libc-types.fexcept_t
     libc.include.fenv
 )
+
+add_proxy_header_library(
+  time_t
+  HDRS
+    time_t.h
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.time_t
+    libc.include.time
+)
+
+add_proxy_header_library(
+  clockid_t
+  HDRS
+    clockid_t.h
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.clockid_t
+    libc.include.sys_types
+)
+
+add_proxy_header_library(
+  clock_t
+  HDRS
+    clock_t.h
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.clock_t
+    libc.include.time
+)
+
+add_proxy_header_library(
+  suseconds_t
+  HDRS
+    suseconds_t.h
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.suseconds_t
+    libc.include.sys_time
+)
+
+add_proxy_header_library(
+  struct_timeval
+  HDRS
+    struct_timeval.h
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.struct_timeval
+    libc.include.sys_time
+)
diff --git a/libc/hdr/types/clock_t.h b/libc/hdr/types/clock_t.h
new file mode 100644
index 000000000000..b0b658e96c3d
--- /dev/null
+++ b/libc/hdr/types/clock_t.h
@@ -0,0 +1,22 @@
+//===-- Proxy for clock_t -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_TYPES_CLOCK_T_H
+#define LLVM_LIBC_HDR_TYPES_CLOCK_T_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/clock_t.h"
+
+#else // Overlay mode
+
+#include <sys/types.h>
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_CLOCK_T_H
diff --git a/libc/hdr/types/clockid_t.h b/libc/hdr/types/clockid_t.h
new file mode 100644
index 000000000000..333342072a2f
--- /dev/null
+++ b/libc/hdr/types/clockid_t.h
@@ -0,0 +1,22 @@
+//===-- Proxy for clockid_t -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_TYPES_CLOCKID_T_H
+#define LLVM_LIBC_HDR_TYPES_CLOCKID_T_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/clockid_t.h"
+
+#else // Overlay mode
+
+#include <sys/types.h>
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_CLOCKID_T_H
diff --git a/libc/hdr/types/struct_timeval.h b/libc/hdr/types/struct_timeval.h
new file mode 100644
index 000000000000..8fc321a52d71
--- /dev/null
+++ b/libc/hdr/types/struct_timeval.h
@@ -0,0 +1,21 @@
+//===-- Proxy for struct timeval  ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_HDR_TYPES_STRUCT_TIMEVAL_H
+#define LLVM_LIBC_HDR_TYPES_STRUCT_TIMEVAL_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/struct_timeval.h"
+
+#else
+
+#include <sys/time.h>
+
+#endif // LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_STRUCT_TIMEVAL_H
diff --git a/libc/hdr/types/suseconds_t.h b/libc/hdr/types/suseconds_t.h
new file mode 100644
index 000000000000..72e54a965f75
--- /dev/null
+++ b/libc/hdr/types/suseconds_t.h
@@ -0,0 +1,22 @@
+//===-- Proxy for suseconds_t ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_TIMES_SUSECONDS_T_H
+#define LLVM_LIBC_HDR_TIMES_SUSECONDS_T_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/suseconds_t.h"
+
+#else // Overlay mode
+
+#include <sys/types.h>
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // #ifndef LLVM_LIBC_HDR_TIMES_SUSECONDS_T_H
diff --git a/libc/hdr/types/time_t.h b/libc/hdr/types/time_t.h
new file mode 100644
index 000000000000..fc9a1506a2cd
--- /dev/null
+++ b/libc/hdr/types/time_t.h
@@ -0,0 +1,22 @@
+//===-- Proxy for time_t --------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_TYPES_TIME_T_H
+#define LLVM_LIBC_HDR_TYPES_TIME_T_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/time_t.h"
+
+#else // Overlay mode
+
+#include <time.h>
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_TIME_T_H
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index 6101ec136b26..be876c9090d3 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -142,6 +142,7 @@ add_gen_header(
   GEN_HDR assert.h
   DEPENDS
     .llvm_libc_common_h
+    .llvm-libc-macros.assert_macros
 )
 
 add_gen_header(
diff --git a/libc/include/llvm-libc-macros/fenv-macros.h b/libc/include/llvm-libc-macros/fenv-macros.h
index 72ac660cd98c..1826723f9349 100644
--- a/libc/include/llvm-libc-macros/fenv-macros.h
+++ b/libc/include/llvm-libc-macros/fenv-macros.h
@@ -9,11 +9,12 @@
 #ifndef LLVM_LIBC_MACROS_FENV_MACROS_H
 #define LLVM_LIBC_MACROS_FENV_MACROS_H
 
-#define FE_DIVBYZERO 1
-#define FE_INEXACT 2
-#define FE_INVALID 4
-#define FE_OVERFLOW 8
-#define FE_UNDERFLOW 16
+#define FE_DIVBYZERO 0x1
+#define FE_INEXACT 0x2
+#define FE_INVALID 0x4
+#define FE_OVERFLOW 0x8
+#define FE_UNDERFLOW 0x10
+#define __FE_DENORM 0x20
 #define FE_ALL_EXCEPT                                                          \
   (FE_DIVBYZERO | FE_INEXACT | FE_INVALID | FE_OVERFLOW | FE_UNDERFLOW)
 
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index dcae55e050bf..32d693ec6a26 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -281,3 +281,5 @@ add_subdirectory(File)
 add_subdirectory(HashTable)
 
 add_subdirectory(fixed_point)
+
+add_subdirectory(time)
diff --git a/libc/src/__support/CPP/CMakeLists.txt b/libc/src/__support/CPP/CMakeLists.txt
index 84d01fe04516..08661aba5b6b 100644
--- a/libc/src/__support/CPP/CMakeLists.txt
+++ b/libc/src/__support/CPP/CMakeLists.txt
@@ -52,6 +52,12 @@ add_header_library(
 )
 
 add_header_library(
+  mutex
+  HDRS
+    mutex.h
+)
+
+add_header_library(
   span
   HDRS
     span.h
diff --git a/libc/src/__support/CPP/atomic.h b/libc/src/__support/CPP/atomic.h
index 5e428940565b..e273d998c070 100644
--- a/libc/src/__support/CPP/atomic.h
+++ b/libc/src/__support/CPP/atomic.h
@@ -101,6 +101,36 @@ public:
                                        int(mem_ord), int(mem_ord));
   }
 
+  // Atomic compare exchange (separate success and failure memory orders)
+  bool compare_exchange_strong(
+      T &expected, T desired, MemoryOrder success_order,
+      MemoryOrder failure_order,
+      [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) {
+    return __atomic_compare_exchange_n(&val, &expected, desired, false,
+                                       static_cast<int>(success_order),
+                                       static_cast<int>(failure_order));
+  }
+
+  // Atomic compare exchange (weak version)
+  bool compare_exchange_weak(
+      T &expected, T desired, MemoryOrder mem_ord = MemoryOrder::SEQ_CST,
+      [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) {
+    return __atomic_compare_exchange_n(&val, &expected, desired, true,
+                                       static_cast<int>(mem_ord),
+                                       static_cast<int>(mem_ord));
+  }
+
+  // Atomic compare exchange (weak version with separate success and failure
+  // memory orders)
+  bool compare_exchange_weak(
+      T &expected, T desired, MemoryOrder success_order,
+      MemoryOrder failure_order,
+      [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) {
+    return __atomic_compare_exchange_n(&val, &expected, desired, true,
+                                       static_cast<int>(success_order),
+                                       static_cast<int>(failure_order));
+  }
+
   T exchange(T desired, MemoryOrder mem_ord = MemoryOrder::SEQ_CST,
              [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) {
 #if __has_builtin(__scoped_atomic_exchange_n)
diff --git a/libc/src/__support/CPP/mutex.h b/libc/src/__support/CPP/mutex.h
new file mode 100644
index 000000000000..ff9c9f43a43c
--- /dev/null
+++ b/libc/src/__support/CPP/mutex.h
@@ -0,0 +1,49 @@
+//===--- A self contained equivalent of std::mutex --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_CPP_MUTEX_H
+#define LLVM_LIBC_SRC___SUPPORT_CPP_MUTEX_H
+
+namespace LIBC_NAMESPACE {
+namespace cpp {
+
+// Assume the calling thread has already obtained mutex ownership.
+struct adopt_lock_t {
+  explicit adopt_lock_t() = default;
+};
+
+// Tag used to make a scoped lock take ownership of a locked mutex.
+constexpr adopt_lock_t adopt_lock{};
+
+// An RAII class for easy locking and unlocking of mutexes.
+template <typename MutexType> class lock_guard {
+  MutexType &mutex;
+
+public:
+  // Calls `m.lock()` upon resource acquisition.
+  explicit lock_guard(MutexType &m) : mutex(m) { mutex.lock(); }
+
+  // Acquires ownership of the mutex object `m` without attempting to lock
+  // it. The behavior is undefined if the current thread does not hold the
+  // lock on `m`. Does not call `m.lock()` upon resource acquisition.
+  lock_guard(MutexType &m, adopt_lock_t /* t */) : mutex(m) {}
+
+  ~lock_guard() { mutex.unlock(); }
+
+  // non-copyable
+  lock_guard &operator=(const lock_guard &) = delete;
+  lock_guard(const lock_guard &) = delete;
+};
+
+// Deduction guide for lock_guard to suppress CTAD warnings.
+template <typename T> lock_guard(T &) -> lock_guard<T>;
+
+} // namespace cpp
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC___SUPPORT_CPP_MUTEX_H
diff --git a/libc/src/__support/FPUtil/FMA.h b/libc/src/__support/FPUtil/FMA.h
index 0e1ede02d5cc..c277da49538b 100644
--- a/libc/src/__support/FPUtil/FMA.h
+++ b/libc/src/__support/FPUtil/FMA.h
@@ -9,25 +9,31 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_FMA_H
 #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_FMA_H
 
+#include "src/__support/CPP/type_traits.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
 
 #if defined(LIBC_TARGET_CPU_HAS_FMA)
 
-#if defined(LIBC_TARGET_ARCH_IS_X86_64)
-#include "x86_64/FMA.h"
-#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
-#include "aarch64/FMA.h"
-#elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
-#include "riscv/FMA.h"
-#elif defined(LIBC_TARGET_ARCH_IS_GPU)
-#include "gpu/FMA.h"
-#endif
+namespace LIBC_NAMESPACE {
+namespace fputil {
+
+template <typename T>
+LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, float>, T> fma(T x, T y, T z) {
+  return __builtin_fmaf(x, y, z);
+}
+
+template <typename T>
+LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, double>, T> fma(T x, T y, T z) {
+  return __builtin_fma(x, y, z);
+}
+
+} // namespace fputil
+} // namespace LIBC_NAMESPACE
 
 #else
 // FMA instructions are not available
 #include "generic/FMA.h"
-#include "src/__support/CPP/type_traits.h"
 
 namespace LIBC_NAMESPACE {
 namespace fputil {
diff --git a/libc/src/__support/FPUtil/aarch64/FEnvImpl.h b/libc/src/__support/FPUtil/aarch64/FEnvImpl.h
index d1d92169475d..cd8a5970edd6 100644
--- a/libc/src/__support/FPUtil/aarch64/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/aarch64/FEnvImpl.h
@@ -53,19 +53,19 @@ struct FEnv {
   static constexpr uint32_t ExceptionControlFlagsBitPosition = 8;
 
   LIBC_INLINE static uint32_t getStatusValueForExcept(int excepts) {
-    return (excepts & FE_INVALID ? INVALID : 0) |
-           (excepts & FE_DIVBYZERO ? DIVBYZERO : 0) |
-           (excepts & FE_OVERFLOW ? OVERFLOW : 0) |
-           (excepts & FE_UNDERFLOW ? UNDERFLOW : 0) |
-           (excepts & FE_INEXACT ? INEXACT : 0);
+    return ((excepts & FE_INVALID) ? INVALID : 0) |
+           ((excepts & FE_DIVBYZERO) ? DIVBYZERO : 0) |
+           ((excepts & FE_OVERFLOW) ? OVERFLOW : 0) |
+           ((excepts & FE_UNDERFLOW) ? UNDERFLOW : 0) |
+           ((excepts & FE_INEXACT) ? INEXACT : 0);
   }
 
   LIBC_INLINE static int exceptionStatusToMacro(uint32_t status) {
-    return (status & INVALID ? FE_INVALID : 0) |
-           (status & DIVBYZERO ? FE_DIVBYZERO : 0) |
-           (status & OVERFLOW ? FE_OVERFLOW : 0) |
-           (status & UNDERFLOW ? FE_UNDERFLOW : 0) |
-           (status & INEXACT ? FE_INEXACT : 0);
+    return ((status & INVALID) ? FE_INVALID : 0) |
+           ((status & DIVBYZERO) ? FE_DIVBYZERO : 0) |
+           ((status & OVERFLOW) ? FE_OVERFLOW : 0) |
+           ((status & UNDERFLOW) ? FE_UNDERFLOW : 0) |
+           ((status & INEXACT) ? FE_INEXACT : 0);
   }
 
   static uint32_t getControlWord() {
diff --git a/libc/src/__support/FPUtil/aarch64/FMA.h b/libc/src/__support/FPUtil/aarch64/FMA.h
deleted file mode 100644
index 6254a0673ff4..000000000000
--- a/libc/src/__support/FPUtil/aarch64/FMA.h
+++ /dev/null
@@ -1,50 +0,0 @@
-//===-- Aarch64 implementations of the fma function -------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_AARCH64_FMA_H
-#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_AARCH64_FMA_H
-
-#include "src/__support/macros/attributes.h" // LIBC_INLINE
-#include "src/__support/macros/properties/architectures.h"
-#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
-
-#if !defined(LIBC_TARGET_ARCH_IS_AARCH64)
-#error "Invalid include"
-#endif
-
-#if !defined(LIBC_TARGET_CPU_HAS_FMA)
-#error "FMA instructions are not supported"
-#endif
-
-#include "src/__support/CPP/type_traits.h"
-
-namespace LIBC_NAMESPACE {
-namespace fputil {
-
-template <typename T>
-LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, float>, T> fma(T x, T y, T z) {
-  float result;
-  LIBC_INLINE_ASM("fmadd %s0, %s1, %s2, %s3\n\t"
-                  : "=w"(result)
-                  : "w"(x), "w"(y), "w"(z));
-  return result;
-}
-
-template <typename T>
-LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, double>, T> fma(T x, T y, T z) {
-  double result;
-  LIBC_INLINE_ASM("fmadd %d0, %d1, %d2, %d3\n\t"
-                  : "=w"(result)
-                  : "w"(x), "w"(y), "w"(z));
-  return result;
-}
-
-} // namespace fputil
-} // namespace LIBC_NAMESPACE
-
-#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_AARCH64_FMA_H
diff --git a/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h b/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
index 5b59ba38d67b..feb48e3719bf 100644
--- a/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
+++ b/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
@@ -63,39 +63,39 @@ struct FEnv {
   // located in a different place from FE_FLUSHTOZERO status bit relative to
   // the other exceptions.
   LIBC_INLINE static uint32_t exception_value_from_status(int status) {
-    return (status & FE_INVALID ? EX_INVALID : 0) |
-           (status & FE_DIVBYZERO ? EX_DIVBYZERO : 0) |
-           (status & FE_OVERFLOW ? EX_OVERFLOW : 0) |
-           (status & FE_UNDERFLOW ? EX_UNDERFLOW : 0) |
-           (status & FE_INEXACT ? EX_INEXACT : 0) |
-           (status & FE_FLUSHTOZERO ? EX_FLUSHTOZERO : 0);
+    return ((status & FE_INVALID) ? EX_INVALID : 0) |
+           ((status & FE_DIVBYZERO) ? EX_DIVBYZERO : 0) |
+           ((status & FE_OVERFLOW) ? EX_OVERFLOW : 0) |
+           ((status & FE_UNDERFLOW) ? EX_UNDERFLOW : 0) |
+           ((status & FE_INEXACT) ? EX_INEXACT : 0) |
+           ((status & FE_FLUSHTOZERO) ? EX_FLUSHTOZERO : 0);
   }
 
   LIBC_INLINE static uint32_t exception_value_from_control(int control) {
-    return (control & __fpcr_trap_invalid ? EX_INVALID : 0) |
-           (control & __fpcr_trap_divbyzero ? EX_DIVBYZERO : 0) |
-           (control & __fpcr_trap_overflow ? EX_OVERFLOW : 0) |
-           (control & __fpcr_trap_underflow ? EX_UNDERFLOW : 0) |
-           (control & __fpcr_trap_inexact ? EX_INEXACT : 0) |
-           (control & __fpcr_flush_to_zero ? EX_FLUSHTOZERO : 0);
+    return ((control & __fpcr_trap_invalid) ? EX_INVALID : 0) |
+           ((control & __fpcr_trap_divbyzero) ? EX_DIVBYZERO : 0) |
+           ((control & __fpcr_trap_overflow) ? EX_OVERFLOW : 0) |
+           ((control & __fpcr_trap_underflow) ? EX_UNDERFLOW : 0) |
+           ((control & __fpcr_trap_inexact) ? EX_INEXACT : 0) |
+           ((control & __fpcr_flush_to_zero) ? EX_FLUSHTOZERO : 0);
   }
 
   LIBC_INLINE static int exception_value_to_status(uint32_t excepts) {
-    return (excepts & EX_INVALID ? FE_INVALID : 0) |
-           (excepts & EX_DIVBYZERO ? FE_DIVBYZERO : 0) |
-           (excepts & EX_OVERFLOW ? FE_OVERFLOW : 0) |
-           (excepts & EX_UNDERFLOW ? FE_UNDERFLOW : 0) |
-           (excepts & EX_INEXACT ? FE_INEXACT : 0) |
-           (excepts & EX_FLUSHTOZERO ? FE_FLUSHTOZERO : 0);
+    return ((excepts & EX_INVALID) ? FE_INVALID : 0) |
+           ((excepts & EX_DIVBYZERO) ? FE_DIVBYZERO : 0) |
+           ((excepts & EX_OVERFLOW) ? FE_OVERFLOW : 0) |
+           ((excepts & EX_UNDERFLOW) ? FE_UNDERFLOW : 0) |
+           ((excepts & EX_INEXACT) ? FE_INEXACT : 0) |
+           ((excepts & EX_FLUSHTOZERO) ? FE_FLUSHTOZERO : 0);
   }
 
   LIBC_INLINE static int exception_value_to_control(uint32_t excepts) {
-    return (excepts & EX_INVALID ? __fpcr_trap_invalid : 0) |
-           (excepts & EX_DIVBYZERO ? __fpcr_trap_divbyzero : 0) |
-           (excepts & EX_OVERFLOW ? __fpcr_trap_overflow : 0) |
-           (excepts & EX_UNDERFLOW ? __fpcr_trap_underflow : 0) |
-           (excepts & EX_INEXACT ? __fpcr_trap_inexact : 0) |
-           (excepts & EX_FLUSHTOZERO ? __fpcr_flush_to_zero : 0);
+    return ((excepts & EX_INVALID) ? __fpcr_trap_invalid : 0) |
+           ((excepts & EX_DIVBYZERO) ? __fpcr_trap_divbyzero : 0) |
+           ((excepts & EX_OVERFLOW) ? __fpcr_trap_overflow : 0) |
+           ((excepts & EX_UNDERFLOW) ? __fpcr_trap_underflow : 0) |
+           ((excepts & EX_INEXACT) ? __fpcr_trap_inexact : 0) |
+           ((excepts & EX_FLUSHTOZERO) ? __fpcr_flush_to_zero : 0);
   }
 
   LIBC_INLINE static uint32_t get_control_word() { return __arm_rsr("fpcr"); }
diff --git a/libc/src/__support/FPUtil/arm/FEnvImpl.h b/libc/src/__support/FPUtil/arm/FEnvImpl.h
index 78fbda4f7aff..cb8d31d683af 100644
--- a/libc/src/__support/FPUtil/arm/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/arm/FEnvImpl.h
@@ -50,35 +50,35 @@ struct FEnv {
   }
 
   LIBC_INLINE static int exception_enable_bits_to_macro(uint32_t status) {
-    return (status & INVALID_ENABLE ? FE_INVALID : 0) |
-           (status & DIVBYZERO_ENABLE ? FE_DIVBYZERO : 0) |
-           (status & OVERFLOW_ENABLE ? FE_OVERFLOW : 0) |
-           (status & UNDERFLOW_ENABLE ? FE_UNDERFLOW : 0) |
-           (status & INEXACT_ENABLE ? FE_INEXACT : 0);
+    return ((status & INVALID_ENABLE) ? FE_INVALID : 0) |
+           ((status & DIVBYZERO_ENABLE) ? FE_DIVBYZERO : 0) |
+           ((status & OVERFLOW_ENABLE) ? FE_OVERFLOW : 0) |
+           ((status & UNDERFLOW_ENABLE) ? FE_UNDERFLOW : 0) |
+           ((status & INEXACT_ENABLE) ? FE_INEXACT : 0);
   }
 
   LIBC_INLINE static uint32_t exception_macro_to_enable_bits(int except) {
-    return (except & FE_INVALID ? INVALID_ENABLE : 0) |
-           (except & FE_DIVBYZERO ? DIVBYZERO_ENABLE : 0) |
-           (except & FE_OVERFLOW ? OVERFLOW_ENABLE : 0) |
-           (except & FE_UNDERFLOW ? UNDERFLOW_ENABLE : 0) |
-           (except & FE_INEXACT ? INEXACT_ENABLE : 0);
+    return ((except & FE_INVALID) ? INVALID_ENABLE : 0) |
+           ((except & FE_DIVBYZERO) ? DIVBYZERO_ENABLE : 0) |
+           ((except & FE_OVERFLOW) ? OVERFLOW_ENABLE : 0) |
+           ((except & FE_UNDERFLOW) ? UNDERFLOW_ENABLE : 0) |
+           ((except & FE_INEXACT) ? INEXACT_ENABLE : 0);
   }
 
   LIBC_INLINE static uint32_t exception_macro_to_status_bits(int except) {
-    return (except & FE_INVALID ? INVALID_STATUS : 0) |
-           (except & FE_DIVBYZERO ? DIVBYZERO_STATUS : 0) |
-           (except & FE_OVERFLOW ? OVERFLOW_STATUS : 0) |
-           (except & FE_UNDERFLOW ? UNDERFLOW_STATUS : 0) |
-           (except & FE_INEXACT ? INEXACT_STATUS : 0);
+    return ((except & FE_INVALID) ? INVALID_STATUS : 0) |
+           ((except & FE_DIVBYZERO) ? DIVBYZERO_STATUS : 0) |
+           ((except & FE_OVERFLOW) ? OVERFLOW_STATUS : 0) |
+           ((except & FE_UNDERFLOW) ? UNDERFLOW_STATUS : 0) |
+           ((except & FE_INEXACT) ? INEXACT_STATUS : 0);
   }
 
   LIBC_INLINE static uint32_t exception_status_bits_to_macro(int status) {
-    return (status & INVALID_STATUS ? FE_INVALID : 0) |
-           (status & DIVBYZERO_STATUS ? FE_DIVBYZERO : 0) |
-           (status & OVERFLOW_STATUS ? FE_OVERFLOW : 0) |
-           (status & UNDERFLOW_STATUS ? FE_UNDERFLOW : 0) |
-           (status & INEXACT_STATUS ? FE_INEXACT : 0);
+    return ((status & INVALID_STATUS) ? FE_INVALID : 0) |
+           ((status & DIVBYZERO_STATUS) ? FE_DIVBYZERO : 0) |
+           ((status & OVERFLOW_STATUS) ? FE_OVERFLOW : 0) |
+           ((status & UNDERFLOW_STATUS) ? FE_UNDERFLOW : 0) |
+           ((status & INEXACT_STATUS) ? FE_INEXACT : 0);
   }
 };
 
diff --git a/libc/src/__support/FPUtil/gpu/FMA.h b/libc/src/__support/FPUtil/gpu/FMA.h
deleted file mode 100644
index ef1cd26a72dd..000000000000
--- a/libc/src/__support/FPUtil/gpu/FMA.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===-- GPU implementations of the fma function -----------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_GPU_FMA_H
-#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_GPU_FMA_H
-
-#include "src/__support/CPP/type_traits.h"
-
-// These intrinsics map to the FMA instructions in the target ISA for the GPU.
-// The default rounding mode generated from these will be to the nearest even.
-#if !__has_builtin(__builtin_fma) || !__has_builtin(__builtin_fmaf)
-#error "FMA builtins must be defined");
-#endif
-
-namespace LIBC_NAMESPACE {
-namespace fputil {
-
-template <typename T>
-LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, float>, T> fma(T x, T y, T z) {
-  return __builtin_fmaf(x, y, z);
-}
-
-template <typename T>
-LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, double>, T> fma(T x, T y, T z) {
-  return __builtin_fma(x, y, z);
-}
-
-} // namespace fputil
-} // namespace LIBC_NAMESPACE
-
-#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_GPU_FMA_H
diff --git a/libc/src/__support/FPUtil/riscv/FEnvImpl.h b/libc/src/__support/FPUtil/riscv/FEnvImpl.h
index e7aee3ba4b91..1de464a89de4 100644
--- a/libc/src/__support/FPUtil/riscv/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/riscv/FEnvImpl.h
@@ -65,19 +65,19 @@ struct FEnv {
   }
 
   LIBC_INLINE static int exception_bits_to_macro(uint32_t status) {
-    return (status & INVALID ? FE_INVALID : 0) |
-           (status & DIVBYZERO ? FE_DIVBYZERO : 0) |
-           (status & OVERFLOW ? FE_OVERFLOW : 0) |
-           (status & UNDERFLOW ? FE_UNDERFLOW : 0) |
-           (status & INEXACT ? FE_INEXACT : 0);
+    return ((status & INVALID) ? FE_INVALID : 0) |
+           ((status & DIVBYZERO) ? FE_DIVBYZERO : 0) |
+           ((status & OVERFLOW) ? FE_OVERFLOW : 0) |
+           ((status & UNDERFLOW) ? FE_UNDERFLOW : 0) |
+           ((status & INEXACT) ? FE_INEXACT : 0);
   }
 
   LIBC_INLINE static uint32_t exception_macro_to_bits(int except) {
-    return (except & FE_INVALID ? INVALID : 0) |
-           (except & FE_DIVBYZERO ? DIVBYZERO : 0) |
-           (except & FE_OVERFLOW ? OVERFLOW : 0) |
-           (except & FE_UNDERFLOW ? UNDERFLOW : 0) |
-           (except & FE_INEXACT ? INEXACT : 0);
+    return ((except & FE_INVALID) ? INVALID : 0) |
+           ((except & FE_DIVBYZERO) ? DIVBYZERO : 0) |
+           ((except & FE_OVERFLOW) ? OVERFLOW : 0) |
+           ((except & FE_UNDERFLOW) ? UNDERFLOW : 0) |
+           ((except & FE_INEXACT) ? INEXACT : 0);
   }
 };
 
diff --git a/libc/src/__support/FPUtil/riscv/FMA.h b/libc/src/__support/FPUtil/riscv/FMA.h
deleted file mode 100644
index f01962174f16..000000000000
--- a/libc/src/__support/FPUtil/riscv/FMA.h
+++ /dev/null
@@ -1,54 +0,0 @@
-//===-- RISCV implementations of the fma function ---------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_RISCV_FMA_H
-#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_RISCV_FMA_H
-
-#include "src/__support/macros/attributes.h" // LIBC_INLINE
-#include "src/__support/macros/properties/architectures.h"
-#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
-
-#if !defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
-#error "Invalid include"
-#endif
-
-#if !defined(LIBC_TARGET_CPU_HAS_FMA)
-#error "FMA instructions are not supported"
-#endif
-
-#include "src/__support/CPP/type_traits.h"
-
-namespace LIBC_NAMESPACE {
-namespace fputil {
-
-#ifdef __riscv_flen
-template <typename T>
-LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, float>, T> fma(T x, T y, T z) {
-  float result;
-  LIBC_INLINE_ASM("fmadd.s %0, %1, %2, %3\n\t"
-                  : "=f"(result)
-                  : "f"(x), "f"(y), "f"(z));
-  return result;
-}
-
-#if __riscv_flen >= 64
-template <typename T>
-LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, double>, T> fma(T x, T y, T z) {
-  double result;
-  LIBC_INLINE_ASM("fmadd.d %0, %1, %2, %3\n\t"
-                  : "=f"(result)
-                  : "f"(x), "f"(y), "f"(z));
-  return result;
-}
-#endif // __riscv_flen >= 64
-#endif // __riscv_flen
-
-} // namespace fputil
-} // namespace LIBC_NAMESPACE
-
-#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_RISCV_FMA_H
diff --git a/libc/src/__support/FPUtil/x86_64/FEnvImpl.h b/libc/src/__support/FPUtil/x86_64/FEnvImpl.h
index 0595658d7df3..a157b81aaaf3 100644
--- a/libc/src/__support/FPUtil/x86_64/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/x86_64/FEnvImpl.h
@@ -72,25 +72,25 @@ static constexpr uint16_t MXCSR_EXCEPTION_CONTOL_BIT_POISTION = 7;
 LIBC_INLINE uint16_t get_status_value_for_except(int excepts) {
   // We will make use of the fact that exception control bits are single
   // bit flags in the control registers.
-  return (excepts & FE_INVALID ? ExceptionFlags::INVALID_F : 0) |
+  return ((excepts & FE_INVALID) ? ExceptionFlags::INVALID_F : 0) |
 #ifdef __FE_DENORM
-         (excepts & __FE_DENORM ? ExceptionFlags::DENORMAL_F : 0) |
+         ((excepts & __FE_DENORM) ? ExceptionFlags::DENORMAL_F : 0) |
 #endif // __FE_DENORM
-         (excepts & FE_DIVBYZERO ? ExceptionFlags::DIV_BY_ZERO_F : 0) |
-         (excepts & FE_OVERFLOW ? ExceptionFlags::OVERFLOW_F : 0) |
-         (excepts & FE_UNDERFLOW ? ExceptionFlags::UNDERFLOW_F : 0) |
-         (excepts & FE_INEXACT ? ExceptionFlags::INEXACT_F : 0);
+         ((excepts & FE_DIVBYZERO) ? ExceptionFlags::DIV_BY_ZERO_F : 0) |
+         ((excepts & FE_OVERFLOW) ? ExceptionFlags::OVERFLOW_F : 0) |
+         ((excepts & FE_UNDERFLOW) ? ExceptionFlags::UNDERFLOW_F : 0) |
+         ((excepts & FE_INEXACT) ? ExceptionFlags::INEXACT_F : 0);
 }
 
 LIBC_INLINE int exception_status_to_macro(uint16_t status) {
-  return (status & ExceptionFlags::INVALID_F ? FE_INVALID : 0) |
+  return ((status & ExceptionFlags::INVALID_F) ? FE_INVALID : 0) |
 #ifdef __FE_DENORM
-         (status & ExceptionFlags::DENORMAL_F ? __FE_DENORM : 0) |
+         ((status & ExceptionFlags::DENORMAL_F) ? __FE_DENORM : 0) |
 #endif // __FE_DENORM
-         (status & ExceptionFlags::DIV_BY_ZERO_F ? FE_DIVBYZERO : 0) |
-         (status & ExceptionFlags::OVERFLOW_F ? FE_OVERFLOW : 0) |
-         (status & ExceptionFlags::UNDERFLOW_F ? FE_UNDERFLOW : 0) |
-         (status & ExceptionFlags::INEXACT_F ? FE_INEXACT : 0);
+         ((status & ExceptionFlags::DIV_BY_ZERO_F) ? FE_DIVBYZERO : 0) |
+         ((status & ExceptionFlags::OVERFLOW_F) ? FE_OVERFLOW : 0) |
+         ((status & ExceptionFlags::UNDERFLOW_F) ? FE_UNDERFLOW : 0) |
+         ((status & ExceptionFlags::INEXACT_F) ? FE_INEXACT : 0);
 }
 
 struct X87StateDescriptor {
diff --git a/libc/src/__support/FPUtil/x86_64/FMA.h b/libc/src/__support/FPUtil/x86_64/FMA.h
deleted file mode 100644
index 91ef7f96ff4d..000000000000
--- a/libc/src/__support/FPUtil/x86_64/FMA.h
+++ /dev/null
@@ -1,55 +0,0 @@
-//===-- x86_64 implementations of the fma function --------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_X86_64_FMA_H
-#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_X86_64_FMA_H
-
-#include "src/__support/macros/attributes.h" // LIBC_INLINE
-#include "src/__support/macros/properties/architectures.h"
-#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
-
-#if !defined(LIBC_TARGET_ARCH_IS_X86_64)
-#error "Invalid include"
-#endif
-
-#if !defined(LIBC_TARGET_CPU_HAS_FMA)
-#error "FMA instructions are not supported"
-#endif
-
-#include "src/__support/CPP/type_traits.h"
-#include <immintrin.h>
-
-namespace LIBC_NAMESPACE {
-namespace fputil {
-
-template <typename T>
-LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, float>, T> fma(T x, T y, T z) {
-  float result;
-  __m128 xmm = _mm_load_ss(&x);           // NOLINT
-  __m128 ymm = _mm_load_ss(&y);           // NOLINT
-  __m128 zmm = _mm_load_ss(&z);           // NOLINT
-  __m128 r = _mm_fmadd_ss(xmm, ymm, zmm); // NOLINT
-  _mm_store_ss(&result, r);               // NOLINT
-  return result;
-}
-
-template <typename T>
-LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, double>, T> fma(T x, T y, T z) {
-  double result;
-  __m128d xmm = _mm_load_sd(&x);           // NOLINT
-  __m128d ymm = _mm_load_sd(&y);           // NOLINT
-  __m128d zmm = _mm_load_sd(&z);           // NOLINT
-  __m128d r = _mm_fmadd_sd(xmm, ymm, zmm); // NOLINT
-  _mm_store_sd(&result, r);                // NOLINT
-  return result;
-}
-
-} // namespace fputil
-} // namespace LIBC_NAMESPACE
-
-#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_X86_64_FMA_H
diff --git a/libc/src/__support/File/CMakeLists.txt b/libc/src/__support/File/CMakeLists.txt
index b7c0612096aa..0416ac2cc902 100644
--- a/libc/src/__support/File/CMakeLists.txt
+++ b/libc/src/__support/File/CMakeLists.txt
@@ -25,6 +25,7 @@ add_object_library(
   HDRS
     dir.h
   DEPENDS
+    libc.src.__support.CPP.mutex
     libc.src.__support.CPP.new
     libc.src.__support.CPP.span
     libc.src.__support.threads.mutex
diff --git a/libc/src/__support/File/dir.cpp b/libc/src/__support/File/dir.cpp
index 9ff639a777e2..e0f7695b3932 100644
--- a/libc/src/__support/File/dir.cpp
+++ b/libc/src/__support/File/dir.cpp
@@ -8,6 +8,7 @@
 
 #include "dir.h"
 
+#include "src/__support/CPP/mutex.h" // lock_guard
 #include "src/__support/CPP/new.h"
 #include "src/__support/error_or.h"
 #include "src/errno/libc_errno.h" // For error macros
@@ -27,7 +28,7 @@ ErrorOr<Dir *> Dir::open(const char *path) {
 }
 
 ErrorOr<struct ::dirent *> Dir::read() {
-  MutexLock lock(&mutex);
+  cpp::lock_guard lock(mutex);
   if (readptr >= fillsize) {
     auto readsize = platform_fetch_dirents(fd, buffer);
     if (!readsize)
@@ -51,7 +52,7 @@ ErrorOr<struct ::dirent *> Dir::read() {
 
 int Dir::close() {
   {
-    MutexLock lock(&mutex);
+    cpp::lock_guard lock(mutex);
     int retval = platform_closedir(fd);
     if (retval != 0)
       return retval;
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index 9b520a6bcf38..5f8ad74f6aea 100644
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -140,6 +140,11 @@ LIBC_INLINE uint32_t get_lane_size() {
   __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
 }
 
+/// Waits for all pending memory operations to complete in program order.
+[[clang::convergent]] LIBC_INLINE void memory_fence() {
+  __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "");
+}
+
 /// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
 [[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {
   __builtin_amdgcn_wave_barrier();
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
index 3f19afb83648..88b8ee2e31d3 100644
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -118,9 +118,13 @@ LIBC_INLINE uint32_t get_lane_size() { return 32; }
   uint32_t mask = static_cast<uint32_t>(lane_mask);
   return __nvvm_vote_ballot_sync(mask, x);
 }
+
 /// Waits for all the threads in the block to converge and issues a fence.
 [[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }
 
+/// Waits for all pending memory operations to complete in program order.
+[[clang::convergent]] LIBC_INLINE void memory_fence() { __nvvm_membar_sys(); }
+
 /// Waits for all threads in the warp to reconverge for independent scheduling.
 [[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
   __nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
diff --git a/libc/src/__support/HashTable/generic/bitmask_impl.inc b/libc/src/__support/HashTable/generic/bitmask_impl.inc
index 56b540d568d0..d6c5ae075558 100644
--- a/libc/src/__support/HashTable/generic/bitmask_impl.inc
+++ b/libc/src/__support/HashTable/generic/bitmask_impl.inc
@@ -34,10 +34,11 @@ LIBC_INLINE constexpr bitmask_t repeat_byte(bitmask_t byte) {
   return byte;
 }
 
-using BitMask = BitMaskAdaptor<bitmask_t, 0x8ull>;
+using BitMask = BitMaskAdaptor<bitmask_t, 0x8ul>;
 using IteratableBitMask = IteratableBitMaskAdaptor<BitMask>;
 
 struct Group {
+  LIBC_INLINE_VAR static constexpr bitmask_t MASK = repeat_byte(0x80ul);
   bitmask_t data;
 
   // Load a group of control words from an arbitary address.
@@ -100,21 +101,23 @@ struct Group {
     //  - The check for key equality will catch these.
     //  - This only happens if there is at least 1 true match.
     //  - The chance of this happening is very low (< 1% chance per byte).
-    auto cmp = data ^ repeat_byte(byte);
-    auto result = LIBC_NAMESPACE::Endian::to_little_endian(
-        (cmp - repeat_byte(0x01)) & ~cmp & repeat_byte(0x80));
+    static constexpr bitmask_t ONES = repeat_byte(0x01ul);
+    auto cmp = data ^ repeat_byte(static_cast<bitmask_t>(byte) & 0xFFul);
+    auto result =
+        LIBC_NAMESPACE::Endian::to_little_endian((cmp - ONES) & ~cmp & MASK);
     return {BitMask{result}};
   }
 
   // Find out the lanes equal to EMPTY or DELETE (highest bit set) and
   // return the bitmask with corresponding bits set.
   LIBC_INLINE BitMask mask_available() const {
-    return {LIBC_NAMESPACE::Endian::to_little_endian(data) & repeat_byte(0x80)};
+    bitmask_t le_data = LIBC_NAMESPACE::Endian::to_little_endian(data);
+    return {le_data & MASK};
   }
 
   LIBC_INLINE IteratableBitMask occupied() const {
-    return {
-        {static_cast<bitmask_t>(mask_available().word ^ repeat_byte(0x80))}};
+    bitmask_t available = mask_available().word;
+    return {BitMask{available ^ MASK}};
   }
 };
 } // namespace internal
diff --git a/libc/src/__support/threads/CMakeLists.txt b/libc/src/__support/threads/CMakeLists.txt
index 731adf6f9c8e..34412be4dfed 100644
--- a/libc/src/__support/threads/CMakeLists.txt
+++ b/libc/src/__support/threads/CMakeLists.txt
@@ -31,6 +31,7 @@ if(TARGET libc.src.__support.threads.${LIBC_TARGET_OS}.mutex)
       fork_callbacks.h
     DEPENDS
       .mutex
+      libc.src.__support.CPP.mutex
   )
 endif()
 
@@ -57,6 +58,7 @@ if(TARGET libc.src.__support.threads.${LIBC_TARGET_OS}.thread)
       libc.src.__support.common
       libc.src.__support.fixedvector
       libc.src.__support.CPP.array
+      libc.src.__support.CPP.mutex
       libc.src.__support.CPP.optional
   )
 endif()
diff --git a/libc/src/__support/threads/fork_callbacks.cpp b/libc/src/__support/threads/fork_callbacks.cpp
index 54fda676f281..6efaf62f135a 100644
--- a/libc/src/__support/threads/fork_callbacks.cpp
+++ b/libc/src/__support/threads/fork_callbacks.cpp
@@ -8,6 +8,7 @@
 
 #include "fork_callbacks.h"
 
+#include "src/__support/CPP/mutex.h" // lock_guard
 #include "src/__support/threads/mutex.h"
 
 #include <stddef.h> // For size_t
@@ -35,7 +36,7 @@ public:
   constexpr AtForkCallbackManager() : mtx(false, false, false), next_index(0) {}
 
   bool register_triple(const ForkCallbackTriple &triple) {
-    MutexLock lock(&mtx);
+    cpp::lock_guard lock(mtx);
     if (next_index >= CALLBACK_SIZE)
       return false;
     list[next_index] = triple;
@@ -44,7 +45,7 @@ public:
   }
 
   void invoke_prepare() {
-    MutexLock lock(&mtx);
+    cpp::lock_guard lock(mtx);
     for (size_t i = 0; i < next_index; ++i) {
       auto prepare = list[i].prepare;
       if (prepare)
@@ -53,7 +54,7 @@ public:
   }
 
   void invoke_parent() {
-    MutexLock lock(&mtx);
+    cpp::lock_guard lock(mtx);
     for (size_t i = 0; i < next_index; ++i) {
       auto parent = list[i].parent;
       if (parent)
@@ -62,7 +63,7 @@ public:
   }
 
   void invoke_child() {
-    MutexLock lock(&mtx);
+    cpp::lock_guard lock(mtx);
     for (size_t i = 0; i < next_index; ++i) {
       auto child = list[i].child;
       if (child)
diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt
index 87a7a66ac6ea..9bee30206f1b 100644
--- a/libc/src/__support/threads/linux/CMakeLists.txt
+++ b/libc/src/__support/threads/linux/CMakeLists.txt
@@ -9,14 +9,25 @@ if(NOT TARGET libc.src.__support.OSUtil.osutil)
 endif()
 
 add_header_library(
-  mutex
+  futex_utils
   HDRS
-    mutex.h
+    futex_utils.h
   DEPENDS
     .futex_word_type
     libc.include.sys_syscall
-    libc.src.__support.CPP.atomic
     libc.src.__support.OSUtil.osutil
+    libc.src.__support.CPP.atomic
+    libc.src.__support.CPP.limits
+    libc.src.__support.CPP.optional
+    libc.hdr.types.struct_timespec
+)
+
+add_header_library(
+  mutex
+  HDRS
+    mutex.h
+  DEPENDS
+    .futex_utils
     libc.src.__support.threads.mutex_common
 )
 
@@ -25,7 +36,7 @@ add_object_library(
   SRCS
     thread.cpp
   DEPENDS
-    .futex_word_type
+    .futex_utils
     libc.config.linux.app_h
     libc.include.sys_syscall
     libc.src.errno.errno
@@ -50,8 +61,5 @@ add_object_library(
   HDRS
     ../callonce.h
   DEPENDS
-    libc.include.sys_syscall
-    libc.src.__support.CPP.atomic
-    libc.src.__support.CPP.limits
-    libc.src.__support.OSUtil.osutil
+    .futex_utils
 )
diff --git a/libc/src/__support/threads/linux/callonce.cpp b/libc/src/__support/threads/linux/callonce.cpp
index b6a5ab8c0d07..b48a514a4487 100644
--- a/libc/src/__support/threads/linux/callonce.cpp
+++ b/libc/src/__support/threads/linux/callonce.cpp
@@ -6,15 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "futex_word.h"
-
-#include "src/__support/CPP/atomic.h"
-#include "src/__support/CPP/limits.h"     // INT_MAX
-#include "src/__support/OSUtil/syscall.h" // For syscall functions.
 #include "src/__support/threads/callonce.h"
-
-#include <linux/futex.h>
-#include <sys/syscall.h> // For syscall numbers.
+#include "src/__support/macros/optimization.h"
+#include "src/__support/threads/linux/futex_utils.h"
 
 namespace LIBC_NAMESPACE {
 
@@ -24,31 +18,30 @@ static constexpr FutexWordType WAITING = 0x22;
 static constexpr FutexWordType FINISH = 0x33;
 
 int callonce(CallOnceFlag *flag, CallOnceCallback *func) {
-  auto *futex_word = reinterpret_cast<cpp::Atomic<FutexWordType> *>(flag);
+  auto *futex_word = reinterpret_cast<Futex *>(flag);
 
   FutexWordType not_called = NOT_CALLED;
 
+  // Avoid cmpxchg operation if the function has already been called.
+  // The destination operand of cmpxchg may receive a write cycle without
+  // regard to the result of the comparison
+  if (LIBC_LIKELY(futex_word->load(cpp::MemoryOrder::RELAXED) == FINISH))
+    return 0;
+
   // The call_once call can return only after the called function |func|
   // returns. So, we use futexes to synchronize calls with the same flag value.
   if (futex_word->compare_exchange_strong(not_called, START)) {
     func();
     auto status = futex_word->exchange(FINISH);
-    if (status == WAITING) {
-      LIBC_NAMESPACE::syscall_impl<long>(FUTEX_SYSCALL_ID, &futex_word->val,
-                                         FUTEX_WAKE_PRIVATE,
-                                         INT_MAX, // Wake all waiters.
-                                         0, 0, 0);
-    }
+    if (status == WAITING)
+      futex_word->notify_all();
     return 0;
   }
 
   FutexWordType status = START;
   if (futex_word->compare_exchange_strong(status, WAITING) ||
       status == WAITING) {
-    LIBC_NAMESPACE::syscall_impl<long>(
-        FUTEX_SYSCALL_ID, &futex_word->val, FUTEX_WAIT_PRIVATE,
-        WAITING, // Block only if status is still |WAITING|.
-        0, 0, 0);
+    futex_word->wait(WAITING);
   }
 
   return 0;
diff --git a/libc/src/__support/threads/linux/futex_utils.h b/libc/src/__support/threads/linux/futex_utils.h
new file mode 100644
index 000000000000..1fbce4f7bf43
--- /dev/null
+++ b/libc/src/__support/threads/linux/futex_utils.h
@@ -0,0 +1,90 @@
+//===--- Futex Wrapper ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_THREADS_LINUX_FUTEX_UTILS_H
+#define LLVM_LIBC_SRC___SUPPORT_THREADS_LINUX_FUTEX_UTILS_H
+
+#include "hdr/types/struct_timespec.h"
+#include "src/__support/CPP/atomic.h"
+#include "src/__support/CPP/limits.h"
+#include "src/__support/CPP/optional.h"
+#include "src/__support/OSUtil/syscall.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/threads/linux/futex_word.h"
+#include <linux/errno.h>
+#include <linux/futex.h>
+
+namespace LIBC_NAMESPACE {
+class Futex : public cpp::Atomic<FutexWordType> {
+public:
+  struct Timeout {
+    timespec abs_time;
+    bool is_realtime;
+  };
+  LIBC_INLINE constexpr Futex(FutexWordType value)
+      : cpp::Atomic<FutexWordType>(value) {}
+  LIBC_INLINE Futex &operator=(FutexWordType value) {
+    cpp::Atomic<FutexWordType>::store(value);
+    return *this;
+  }
+  LIBC_INLINE long wait(FutexWordType expected,
+                        cpp::optional<Timeout> timeout = cpp::nullopt,
+                        bool is_shared = false) {
+    // use bitset variants to enforce abs_time
+    uint32_t op = is_shared ? FUTEX_WAIT_BITSET : FUTEX_WAIT_BITSET_PRIVATE;
+    if (timeout && timeout->is_realtime) {
+      op |= FUTEX_CLOCK_REALTIME;
+    }
+    for (;;) {
+      if (this->load(cpp::MemoryOrder::RELAXED) != expected)
+        return 0;
+
+      long ret = syscall_impl<long>(
+          /* syscall number */ FUTEX_SYSCALL_ID,
+          /* futex address */ this,
+          /* futex operation  */ op,
+          /* expected value */ expected,
+          /* timeout */ timeout ? &timeout->abs_time : nullptr,
+          /* ignored */ nullptr,
+          /* bitset */ FUTEX_BITSET_MATCH_ANY);
+
+      // continue waiting if interrupted; otherwise return the result
+      // which should normally be 0 or -ETIMEOUT
+      if (ret == -EINTR)
+        continue;
+
+      return ret;
+    }
+  }
+  LIBC_INLINE long notify_one(bool is_shared = false) {
+    return syscall_impl<long>(
+        /* syscall number */ FUTEX_SYSCALL_ID,
+        /* futex address */ this,
+        /* futex operation  */ is_shared ? FUTEX_WAKE : FUTEX_WAKE_PRIVATE,
+        /* wake up limit */ 1,
+        /* ignored */ nullptr,
+        /* ignored */ nullptr,
+        /* ignored */ 0);
+  }
+  LIBC_INLINE long notify_all(bool is_shared = false) {
+    return syscall_impl<long>(
+        /* syscall number */ FUTEX_SYSCALL_ID,
+        /* futex address */ this,
+        /* futex operation  */ is_shared ? FUTEX_WAKE : FUTEX_WAKE_PRIVATE,
+        /* wake up limit */ cpp::numeric_limits<int>::max(),
+        /* ignored */ nullptr,
+        /* ignored */ nullptr,
+        /* ignored */ 0);
+  }
+};
+
+static_assert(__is_standard_layout(Futex),
+              "Futex must be a standard layout type.");
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC___SUPPORT_THREADS_LINUX_FUTEX_UTILS_H
diff --git a/libc/src/__support/threads/linux/futex_word.h b/libc/src/__support/threads/linux/futex_word.h
index 67159b81b561..acdd33bcdaaf 100644
--- a/libc/src/__support/threads/linux/futex_word.h
+++ b/libc/src/__support/threads/linux/futex_word.h
@@ -11,7 +11,6 @@
 
 #include <stdint.h>
 #include <sys/syscall.h>
-
 namespace LIBC_NAMESPACE {
 
 // Futexes are 32 bits in size on all platforms, including 64-bit platforms.
diff --git a/libc/src/__support/threads/linux/mutex.h b/libc/src/__support/threads/linux/mutex.h
index 618698db0d25..6702de465168 100644
--- a/libc/src/__support/threads/linux/mutex.h
+++ b/libc/src/__support/threads/linux/mutex.h
@@ -9,17 +9,10 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_THREADS_LINUX_MUTEX_H
 #define LLVM_LIBC_SRC___SUPPORT_THREADS_LINUX_MUTEX_H
 
-#include "src/__support/CPP/atomic.h"
-#include "src/__support/OSUtil/syscall.h" // For syscall functions.
-#include "src/__support/threads/linux/futex_word.h"
+#include "src/__support/threads/linux/futex_utils.h"
 #include "src/__support/threads/mutex_common.h"
 
-#include <linux/futex.h>
-#include <stdint.h>
-#include <sys/syscall.h> // For syscall numbers.
-
 namespace LIBC_NAMESPACE {
-
 struct Mutex {
   unsigned char timed;
   unsigned char recursive;
@@ -28,7 +21,7 @@ struct Mutex {
   void *owner;
   unsigned long long lock_count;
 
-  cpp::Atomic<FutexWordType> futex_word;
+  Futex futex_word;
 
   enum class LockState : FutexWordType {
     Free,
@@ -76,9 +69,7 @@ public:
         // futex syscall will block if the futex data is still
         // `LockState::Waiting` (the 4th argument to the syscall function
         // below.)
-        LIBC_NAMESPACE::syscall_impl<long>(
-            FUTEX_SYSCALL_ID, &futex_word.val, FUTEX_WAIT_PRIVATE,
-            FutexWordType(LockState::Waiting), 0, 0, 0);
+        futex_word.wait(FutexWordType(LockState::Waiting));
         was_waiting = true;
         // Once woken up/unblocked, try everything all over.
         continue;
@@ -91,9 +82,7 @@ public:
           // we will wait for the futex to be woken up. Note again that the
           // following syscall will block only if the futex data is still
           // `LockState::Waiting`.
-          LIBC_NAMESPACE::syscall_impl<long>(
-              FUTEX_SYSCALL_ID, &futex_word, FUTEX_WAIT_PRIVATE,
-              FutexWordType(LockState::Waiting), 0, 0, 0);
+          futex_word.wait(FutexWordType(LockState::Waiting));
           was_waiting = true;
         }
         continue;
@@ -110,8 +99,7 @@ public:
       if (futex_word.compare_exchange_strong(mutex_status,
                                              FutexWordType(LockState::Free))) {
         // If any thread is waiting to be woken up, then do it.
-        LIBC_NAMESPACE::syscall_impl<long>(FUTEX_SYSCALL_ID, &futex_word,
-                                           FUTEX_WAKE_PRIVATE, 1, 0, 0, 0);
+        futex_word.notify_one();
         return MutexError::NONE;
       }
 
diff --git a/libc/src/__support/threads/linux/thread.cpp b/libc/src/__support/threads/linux/thread.cpp
index fcf87cc587a5..1d986ff38cff 100644
--- a/libc/src/__support/threads/linux/thread.cpp
+++ b/libc/src/__support/threads/linux/thread.cpp
@@ -14,15 +14,14 @@
 #include "src/__support/OSUtil/syscall.h" // For syscall functions.
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
-#include "src/__support/threads/linux/futex_word.h" // For FutexWordType
-#include "src/errno/libc_errno.h"                   // For error macros
+#include "src/__support/threads/linux/futex_utils.h" // For FutexWordType
+#include "src/errno/libc_errno.h"                    // For error macros
 
 #ifdef LIBC_TARGET_ARCH_IS_AARCH64
 #include <arm_acle.h>
 #endif
 
 #include <fcntl.h>
-#include <linux/futex.h>
 #include <linux/param.h> // For EXEC_PAGESIZE.
 #include <linux/prctl.h> // For PR_SET_NAME
 #include <linux/sched.h> // For CLONE_* flags.
@@ -247,8 +246,7 @@ int Thread::run(ThreadStyle style, ThreadRunner runner, void *arg, void *stack,
   // stack memory.
 
   static constexpr size_t INTERNAL_STACK_DATA_SIZE =
-      sizeof(StartArgs) + sizeof(ThreadAttributes) +
-      sizeof(cpp::Atomic<FutexWordType>);
+      sizeof(StartArgs) + sizeof(ThreadAttributes) + sizeof(Futex);
 
   // This is pretty arbitrary, but at the moment we don't adjust user provided
   // stacksize (or default) to account for this data as its assumed minimal. If
@@ -288,9 +286,9 @@ int Thread::run(ThreadStyle style, ThreadRunner runner, void *arg, void *stack,
   start_args->runner = runner;
   start_args->arg = arg;
 
-  auto clear_tid = reinterpret_cast<cpp::Atomic<FutexWordType> *>(
+  auto clear_tid = reinterpret_cast<Futex *>(
       adjusted_stack + sizeof(StartArgs) + sizeof(ThreadAttributes));
-  clear_tid->val = CLEAR_TID_VALUE;
+  clear_tid->set(CLEAR_TID_VALUE);
   attrib->platform_data = clear_tid;
 
   // The clone syscall takes arguments in an architecture specific order.
@@ -374,14 +372,11 @@ void Thread::wait() {
   // The kernel should set the value at the clear tid address to zero.
   // If not, it is a spurious wake and we should continue to wait on
   // the futex.
-  auto *clear_tid =
-      reinterpret_cast<cpp::Atomic<FutexWordType> *>(attrib->platform_data);
-  while (clear_tid->load() != 0) {
-    // We cannot do a FUTEX_WAIT_PRIVATE here as the kernel does a
-    // FUTEX_WAKE and not a FUTEX_WAKE_PRIVATE.
-    LIBC_NAMESPACE::syscall_impl<long>(FUTEX_SYSCALL_ID, &clear_tid->val,
-                                       FUTEX_WAIT, CLEAR_TID_VALUE, nullptr);
-  }
+  auto *clear_tid = reinterpret_cast<Futex *>(attrib->platform_data);
+  // We cannot do a FUTEX_WAIT_PRIVATE here as the kernel does a
+  // FUTEX_WAKE and not a FUTEX_WAKE_PRIVATE.
+  while (clear_tid->load() != 0)
+    clear_tid->wait(CLEAR_TID_VALUE, cpp::nullopt, true);
 }
 
 bool Thread::operator==(const Thread &thread) const {
diff --git a/libc/src/__support/threads/mutex.h b/libc/src/__support/threads/mutex.h
index fa2bd64b6b51..9dded2e3f952 100644
--- a/libc/src/__support/threads/mutex.h
+++ b/libc/src/__support/threads/mutex.h
@@ -38,9 +38,9 @@
 // want the constructors of the Mutex classes to be constexprs.
 
 #if defined(__linux__)
-#include "linux/mutex.h"
+#include "src/__support/threads/linux/mutex.h"
 #elif defined(LIBC_TARGET_ARCH_IS_GPU)
-#include "gpu/mutex.h"
+#include "src/__support/threads/gpu/mutex.h"
 #endif // __linux__
 
 namespace LIBC_NAMESPACE {
diff --git a/libc/src/__support/threads/thread.cpp b/libc/src/__support/threads/thread.cpp
index 62aa86b7aef7..7b02f8246e24 100644
--- a/libc/src/__support/threads/thread.cpp
+++ b/libc/src/__support/threads/thread.cpp
@@ -6,10 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "thread.h"
-#include "mutex.h"
+#include "src/__support/threads/thread.h"
+#include "src/__support/threads/mutex.h"
 
 #include "src/__support/CPP/array.h"
+#include "src/__support/CPP/mutex.h" // lock_guard
 #include "src/__support/CPP/optional.h"
 #include "src/__support/fixedvector.h"
 #include "src/__support/macros/attributes.h"
@@ -56,7 +57,7 @@ public:
   constexpr TSSKeyMgr() : mtx(false, false, false) {}
 
   cpp::optional<unsigned int> new_key(TSSDtor *dtor) {
-    MutexLock lock(&mtx);
+    cpp::lock_guard lock(mtx);
     for (unsigned int i = 0; i < TSS_KEY_COUNT; ++i) {
       TSSKeyUnit &u = units[i];
       if (!u.active) {
@@ -70,20 +71,20 @@ public:
   TSSDtor *get_dtor(unsigned int key) {
     if (key >= TSS_KEY_COUNT)
       return nullptr;
-    MutexLock lock(&mtx);
+    cpp::lock_guard lock(mtx);
     return units[key].dtor;
   }
 
   bool remove_key(unsigned int key) {
     if (key >= TSS_KEY_COUNT)
       return false;
-    MutexLock lock(&mtx);
+    cpp::lock_guard lock(mtx);
     units[key].reset();
     return true;
   }
 
   bool is_valid_key(unsigned int key) {
-    MutexLock lock(&mtx);
+    cpp::lock_guard lock(mtx);
     return units[key].active;
   }
 };
@@ -113,7 +114,7 @@ public:
   constexpr ThreadAtExitCallbackMgr() : mtx(false, false, false) {}
 
   int add_callback(AtExitCallback *callback, void *obj) {
-    MutexLock lock(&mtx);
+    cpp::lock_guard lock(mtx);
     return callback_list.push_back({callback, obj});
   }
 
diff --git a/libc/src/__support/time/CMakeLists.txt b/libc/src/__support/time/CMakeLists.txt
new file mode 100644
index 000000000000..89ddffb09938
--- /dev/null
+++ b/libc/src/__support/time/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+  add_subdirectory(${LIBC_TARGET_OS})
+endif()
+
+add_header_library(
+  units
+  HDRS
+    units.h
+  DEPENDS
+    libc.src.__support.common
+    libc.hdr.types.time_t
+)
diff --git a/libc/src/__support/time/linux/CMakeLists.txt b/libc/src/__support/time/linux/CMakeLists.txt
new file mode 100644
index 000000000000..f04d550555e1
--- /dev/null
+++ b/libc/src/__support/time/linux/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_object_library(
+  clock_gettime
+  HDRS
+    clock_gettime.h
+  SRCS
+    clock_gettime.cpp
+  DEPENDS
+    libc.include.sys_syscall
+    libc.hdr.types.struct_timespec
+    libc.hdr.types.clockid_t
+    libc.src.__support.common
+    libc.src.__support.error_or
+    libc.src.__support.OSUtil.osutil
+)
diff --git a/libc/src/time/linux/clockGetTimeImpl.h b/libc/src/__support/time/linux/clock_gettime.cpp
index 8c8c9fcf845c..7f266b282a39 100644
--- a/libc/src/time/linux/clockGetTimeImpl.h
+++ b/libc/src/__support/time/linux/clock_gettime.cpp
@@ -1,4 +1,4 @@
-//===- Linux implementation of the POSIX clock_gettime function -*- C++ -*-===//
+//===--- clock_gettime linux implementation ---------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,23 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC_TIME_LINUX_CLOCKGETTIMEIMPL_H
-#define LLVM_LIBC_SRC_TIME_LINUX_CLOCKGETTIMEIMPL_H
-
-#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
-#include "src/__support/common.h"
-#include "src/__support/error_or.h"
-#include "src/errno/libc_errno.h"
-
-#include <stdint.h>      // For int64_t.
-#include <sys/syscall.h> // For syscall numbers.
-#include <time.h>
-
+#include "src/__support/time/linux/clock_gettime.h"
+#include "src/__support/OSUtil/syscall.h"
+#include <sys/syscall.h>
 namespace LIBC_NAMESPACE {
 namespace internal {
-
-LIBC_INLINE ErrorOr<int> clock_gettimeimpl(clockid_t clockid,
-                                           struct timespec *ts) {
+ErrorOr<int> clock_gettime(clockid_t clockid, timespec *ts) {
 #if SYS_clock_gettime
   int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_gettime,
                                               static_cast<long>(clockid),
@@ -44,5 +33,3 @@ LIBC_INLINE ErrorOr<int> clock_gettimeimpl(clockid_t clockid,
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE
-
-#endif // LLVM_LIBC_SRC_TIME_LINUX_CLOCKGETTIMEIMPL_H
diff --git a/libc/src/__support/time/linux/clock_gettime.h b/libc/src/__support/time/linux/clock_gettime.h
new file mode 100644
index 000000000000..b1572726f630
--- /dev/null
+++ b/libc/src/__support/time/linux/clock_gettime.h
@@ -0,0 +1,23 @@
+//===--- clock_gettime linux implementation ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_TIME_LINUX_CLOCK_GETTIME_H
+#define LLVM_LIBC_SRC___SUPPORT_TIME_LINUX_CLOCK_GETTIME_H
+#include "hdr/types/clockid_t.h"
+#include "hdr/types/struct_timespec.h"
+#include "src/__support/common.h"
+
+#include "src/__support/error_or.h"
+
+namespace LIBC_NAMESPACE {
+namespace internal {
+ErrorOr<int> clock_gettime(clockid_t clockid, timespec *ts);
+}
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC___SUPPORT_TIME_LINUX_CLOCK_GETTIME_H
diff --git a/libc/src/__support/time/units.h b/libc/src/__support/time/units.h
new file mode 100644
index 000000000000..f6bd19f9b139
--- /dev/null
+++ b/libc/src/__support/time/units.h
@@ -0,0 +1,38 @@
+//===--- Time units conversion ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_TIME_UNITS_H
+#define LLVM_LIBC_SRC___SUPPORT_TIME_UNITS_H
+
+#include "hdr/types/time_t.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+namespace time_units {
+LIBC_INLINE constexpr time_t operator""_s_ns(unsigned long long s) {
+  return s * 1'000'000'000;
+}
+LIBC_INLINE constexpr time_t operator""_s_us(unsigned long long s) {
+  return s * 1'000'000;
+}
+LIBC_INLINE constexpr time_t operator""_s_ms(unsigned long long s) {
+  return s * 1'000;
+}
+LIBC_INLINE constexpr time_t operator""_ms_ns(unsigned long long ms) {
+  return ms * 1'000'000;
+}
+LIBC_INLINE constexpr time_t operator""_ms_us(unsigned long long ms) {
+  return ms * 1'000;
+}
+LIBC_INLINE constexpr time_t operator""_us_ns(unsigned long long us) {
+  return us * 1'000;
+}
+} // namespace time_units
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC___SUPPORT_TIME_UNITS_H
diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp
index 0450ffd711ff..59efc3f424c7 100644
--- a/libc/src/math/generic/powf.cpp
+++ b/libc/src/math/generic/powf.cpp
@@ -528,7 +528,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) {
   // So if |y| > 151 * 2^24, and x is finite:
   //   |y * log2(x)| = 0 or > 151.
   // Hence x^y will either overflow or underflow if x is not zero.
-  if (LIBC_UNLIKELY((y_abs & 0x007f'ffff) == 0) || (y_abs > 0x4f170000)) {
+  if (LIBC_UNLIKELY((y_abs & 0x0007'ffff) == 0) || (y_abs > 0x4f170000)) {
     // Exceptional exponents.
     switch (y_abs) {
     case 0x0000'0000: { // y = +-0.0f
@@ -572,6 +572,26 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) {
         // case 0xbf00'0000:  // pow(x, -1/2) = rsqrt(x)
         //   return rsqrt(x);
       }
+      if (is_integer(y) && (y_u > 0x4000'0000) && (y_u <= 0x41c0'0000)) {
+        // Check for exact cases when 2 < y < 25 and y is an integer.
+        int msb =
+            (x_abs == 0) ? (FloatBits::TOTAL_LEN - 2) : cpp::countl_zero(x_abs);
+        msb = (msb > FloatBits::EXP_LEN) ? msb : FloatBits::EXP_LEN;
+        int lsb = (x_abs == 0) ? 0 : cpp::countr_zero(x_abs);
+        lsb = (lsb > FloatBits::FRACTION_LEN) ? FloatBits::FRACTION_LEN : lsb;
+        int extra_bits = FloatBits::TOTAL_LEN - 2 - lsb - msb;
+        int iter = static_cast<int>(y);
+
+        if (extra_bits * iter <= FloatBits::FRACTION_LEN + 2) {
+          // The result is either exact or exactly half-way.
+          // But it is exactly representable in double precision.
+          double x_d = static_cast<double>(x);
+          double result = x_d;
+          for (int i = 1; i < iter; ++i)
+            result *= x_d;
+          return static_cast<float>(result);
+        }
+      }
       if (y_abs > 0x4f17'0000) {
         if (y_abs > 0x7f80'0000) {
           // y is NaN
@@ -834,7 +854,6 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) {
   return static_cast<float>(
              powf_double_double(idx_x, dx, y6, lo6_hi, exp2_hi_mid_dd)) +
          0.0f;
-  // return static_cast<float>(r);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
index e526ba040bef..9b76a6a0f857 100644
--- a/libc/src/stdlib/CMakeLists.txt
+++ b/libc/src/stdlib/CMakeLists.txt
@@ -414,6 +414,7 @@ add_entrypoint_object(
   CXX_STANDARD
     20 # For constinit of the atexit callback list.
   DEPENDS
+    libc.src.__support.CPP.mutex
     libc.src.__support.CPP.new
     libc.src.__support.OSUtil.osutil
     libc.src.__support.blockstore
diff --git a/libc/src/stdlib/atexit.cpp b/libc/src/stdlib/atexit.cpp
index fa072b2fdf8d..4f0497444773 100644
--- a/libc/src/stdlib/atexit.cpp
+++ b/libc/src/stdlib/atexit.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/stdlib/atexit.h"
+#include "src/__support/CPP/mutex.h" // lock_guard
 #include "src/__support/blockstore.h"
 #include "src/__support/common.h"
 #include "src/__support/fixedvector.h"
@@ -68,7 +69,7 @@ void call_exit_callbacks() {
 }
 
 int add_atexit_unit(const AtExitUnit &unit) {
-  MutexLock lock(&handler_list_mtx);
+  cpp::lock_guard lock(handler_list_mtx);
   if (exit_callbacks.push_back(unit))
     return 0;
   return -1;
diff --git a/libc/src/threads/linux/CMakeLists.txt b/libc/src/threads/linux/CMakeLists.txt
index be5407031aad..68b7106c2052 100644
--- a/libc/src/threads/linux/CMakeLists.txt
+++ b/libc/src/threads/linux/CMakeLists.txt
@@ -7,9 +7,10 @@ add_header_library(
     libc.include.sys_syscall
     libc.include.threads
     libc.src.__support.CPP.atomic
+    libc.src.__support.CPP.mutex
     libc.src.__support.OSUtil.osutil
     libc.src.__support.threads.mutex
-    libc.src.__support.threads.linux.futex_word_type
+    libc.src.__support.threads.linux.futex_utils
 )
 
 add_entrypoint_object(
diff --git a/libc/src/threads/linux/CndVar.h b/libc/src/threads/linux/CndVar.h
index b4afdef9f9eb..c08ffa393856 100644
--- a/libc/src/threads/linux/CndVar.h
+++ b/libc/src/threads/linux/CndVar.h
@@ -10,8 +10,10 @@
 #define LLVM_LIBC_SRC_THREADS_LINUX_CNDVAR_H
 
 #include "src/__support/CPP/atomic.h"
+#include "src/__support/CPP/mutex.h" // lock_guard
+#include "src/__support/CPP/optional.h"
 #include "src/__support/OSUtil/syscall.h" // For syscall functions.
-#include "src/__support/threads/linux/futex_word.h"
+#include "src/__support/threads/linux/futex_utils.h"
 #include "src/__support/threads/mutex.h"
 
 #include <linux/futex.h> // For futex operations.
@@ -28,7 +30,7 @@ struct CndVar {
   };
 
   struct CndWaiter {
-    cpp::Atomic<uint32_t> futex_word = WS_Waiting;
+    Futex futex_word = WS_Waiting;
     CndWaiter *next = nullptr;
   };
 
@@ -58,7 +60,7 @@ struct CndVar {
 
     CndWaiter waiter;
     {
-      MutexLock ml(&qmtx);
+      cpp::lock_guard ml(qmtx);
       CndWaiter *old_back = nullptr;
       if (waitq_front == nullptr) {
         waitq_front = waitq_back = &waiter;
@@ -84,8 +86,7 @@ struct CndVar {
       }
     }
 
-    LIBC_NAMESPACE::syscall_impl<long>(FUTEX_SYSCALL_ID, &waiter.futex_word.val,
-                                       FUTEX_WAIT, WS_Waiting, 0, 0, 0);
+    waiter.futex_word.wait(WS_Waiting, cpp::nullopt, true);
 
     // At this point, if locking |m| fails, we can simply return as the
     // queued up waiter would have been removed from the queue.
@@ -109,6 +110,7 @@ struct CndVar {
 
     qmtx.futex_word = FutexWordType(Mutex::LockState::Free);
 
+    // this is a special WAKE_OP, so we use syscall directly
     LIBC_NAMESPACE::syscall_impl<long>(
         FUTEX_SYSCALL_ID, &qmtx.futex_word.val, FUTEX_WAKE_OP, 1, 1,
         &first->futex_word.val,
@@ -117,7 +119,7 @@ struct CndVar {
   }
 
   int broadcast() {
-    MutexLock ml(&qmtx);
+    cpp::lock_guard ml(qmtx);
     uint32_t dummy_futex_word;
     CndWaiter *waiter = waitq_front;
     waitq_front = waitq_back = nullptr;
diff --git a/libc/src/time/clock.h b/libc/src/time/clock.h
index d4af7656644a..f5d14d036e13 100644
--- a/libc/src/time/clock.h
+++ b/libc/src/time/clock.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_TIME_CLOCK_H
 #define LLVM_LIBC_SRC_TIME_CLOCK_H
 
-#include <time.h>
+#include "hdr/types/clock_t.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/time/clock_gettime.h b/libc/src/time/clock_gettime.h
index 72e2e1949feb..48e81a355429 100644
--- a/libc/src/time/clock_gettime.h
+++ b/libc/src/time/clock_gettime.h
@@ -9,11 +9,12 @@
 #ifndef LLVM_LIBC_SRC_TIME_CLOCK_GETTIME_H
 #define LLVM_LIBC_SRC_TIME_CLOCK_GETTIME_H
 
-#include <time.h>
+#include "hdr/types/clockid_t.h"
+#include "hdr/types/struct_timespec.h"
 
 namespace LIBC_NAMESPACE {
 
-int clock_gettime(clockid_t clockid, struct timespec *tp);
+int clock_gettime(clockid_t clockid, timespec *tp);
 
 } // namespace LIBC_NAMESPACE
 
diff --git a/libc/src/time/gettimeofday.h b/libc/src/time/gettimeofday.h
index 880b94cee731..62ee31edcad6 100644
--- a/libc/src/time/gettimeofday.h
+++ b/libc/src/time/gettimeofday.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_TIME_GETTIMEOFDAY_H
 #define LLVM_LIBC_SRC_TIME_GETTIMEOFDAY_H
 
-#include <time.h>
+#include "hdr/types/struct_timeval.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/time/gpu/CMakeLists.txt b/libc/src/time/gpu/CMakeLists.txt
index bb79d92399b3..088271d88191 100644
--- a/libc/src/time/gpu/CMakeLists.txt
+++ b/libc/src/time/gpu/CMakeLists.txt
@@ -4,6 +4,9 @@ add_object_library(
     time_utils.cpp
   HDRS
     time_utils.h
+  DEPENDS
+    libc.hdr.types.clock_t
+    libc.hdr.time_macros
 )
 
 add_entrypoint_object(
diff --git a/libc/src/time/gpu/clock.cpp b/libc/src/time/gpu/clock.cpp
index 86cc97e2a3bf..8ddfc27975bb 100644
--- a/libc/src/time/gpu/clock.cpp
+++ b/libc/src/time/gpu/clock.cpp
@@ -6,9 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "time_utils.h"
-
 #include "src/time/clock.h"
+#include "src/time/gpu/time_utils.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/time/gpu/time_utils.h b/libc/src/time/gpu/time_utils.h
index 8a9a5f0f65b8..3f1fd11c1791 100644
--- a/libc/src/time/gpu/time_utils.h
+++ b/libc/src/time/gpu/time_utils.h
@@ -9,8 +9,9 @@
 #ifndef LLVM_LIBC_SRC_TIME_GPU_TIME_UTILS_H
 #define LLVM_LIBC_SRC_TIME_GPU_TIME_UTILS_H
 
+#include "hdr/time_macros.h"
+#include "hdr/types/clock_t.h"
 #include "src/__support/GPU/utils.h"
-
 namespace LIBC_NAMESPACE {
 
 #if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
diff --git a/libc/src/time/linux/CMakeLists.txt b/libc/src/time/linux/CMakeLists.txt
index df79bf598626..c15fb44ad5d1 100644
--- a/libc/src/time/linux/CMakeLists.txt
+++ b/libc/src/time/linux/CMakeLists.txt
@@ -5,9 +5,9 @@ add_entrypoint_object(
   HDRS
     ../time_func.h
   DEPENDS
-    libc.include.time
-    libc.include.sys_syscall
-    libc.src.__support.OSUtil.osutil
+    libc.hdr.time_macros
+    libc.hdr.types.time_t
+    libc.src.__support.time.linux.clock_gettime
     libc.src.errno.errno
 )
 
@@ -18,10 +18,11 @@ add_entrypoint_object(
   HDRS
     ../clock.h
   DEPENDS
-    libc.include.time
-    libc.include.sys_syscall
+    libc.hdr.time_macros
+    libc.hdr.types.clock_t
+    libc.src.__support.time.units
+    libc.src.__support.time.linux.clock_gettime
     libc.src.__support.CPP.limits
-    libc.src.__support.OSUtil.osutil
     libc.src.errno.errno
 )
 
@@ -32,10 +33,10 @@ add_entrypoint_object(
   HDRS
     ../nanosleep.h
   DEPENDS
-    libc.include.time
+    libc.hdr.types.struct_timespec
     libc.include.sys_syscall
-    libc.src.__support.CPP.limits
     libc.src.__support.OSUtil.osutil
+    libc.src.__support.CPP.limits
     libc.src.errno.errno
 )
 
@@ -46,9 +47,9 @@ add_entrypoint_object(
   HDRS
     ../clock_gettime.h
   DEPENDS
-    libc.include.time
-    libc.include.sys_syscall
-    libc.src.__support.OSUtil.osutil
+    libc.hdr.types.clockid_t
+    libc.hdr.types.struct_timespec
+    libc.src.__support.time.linux.clock_gettime
     libc.src.errno.errno
 )
 
@@ -59,8 +60,9 @@ add_entrypoint_object(
   HDRS
     ../gettimeofday.h
   DEPENDS
-    libc.include.time
-    libc.include.sys_syscall
-    libc.src.__support.OSUtil.osutil
+    libc.hdr.time_macros
+    libc.hdr.types.suseconds_t
+    libc.src.__support.time.linux.clock_gettime
+    libc.src.__support.time.units
     libc.src.errno.errno
 )
diff --git a/libc/src/time/linux/clock.cpp b/libc/src/time/linux/clock.cpp
index 1e95f0526bc9..2c1eee8e5d60 100644
--- a/libc/src/time/linux/clock.cpp
+++ b/libc/src/time/linux/clock.cpp
@@ -7,21 +7,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/time/clock.h"
-
+#include "hdr/time_macros.h"
 #include "src/__support/CPP/limits.h"
-#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/time/linux/clock_gettime.h"
+#include "src/__support/time/units.h"
 #include "src/errno/libc_errno.h"
-#include "src/time/linux/clockGetTimeImpl.h"
-
-#include <sys/syscall.h> // For syscall numbers.
-#include <time.h>
 
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(clock_t, clock, ()) {
+  using namespace time_units;
   struct timespec ts;
-  auto result = internal::clock_gettimeimpl(CLOCK_PROCESS_CPUTIME_ID, &ts);
+  auto result = internal::clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
   if (!result.has_value()) {
     libc_errno = result.error();
     return -1;
@@ -34,15 +32,15 @@ LLVM_LIBC_FUNCTION(clock_t, clock, ()) {
       cpp::numeric_limits<clock_t>::max() / CLOCKS_PER_SEC;
   if (ts.tv_sec > CLOCK_SECS_MAX)
     return clock_t(-1);
-  if (ts.tv_nsec / 1000000000 > CLOCK_SECS_MAX - ts.tv_sec)
+  if (ts.tv_nsec / 1_s_ns > CLOCK_SECS_MAX - ts.tv_sec)
     return clock_t(-1);
 
   // For the integer computation converting tv_nsec to clocks to work
   // correctly, we want CLOCKS_PER_SEC to be less than 1000000000.
-  static_assert(1000000000 > CLOCKS_PER_SEC,
-                "Expected CLOCKS_PER_SEC to be less than 1000000000.");
+  static_assert(1_s_ns > CLOCKS_PER_SEC,
+                "Expected CLOCKS_PER_SEC to be less than 1'000'000'000.");
   return clock_t(ts.tv_sec * CLOCKS_PER_SEC +
-                 ts.tv_nsec / (1000000000 / CLOCKS_PER_SEC));
+                 ts.tv_nsec / (1_s_ns / CLOCKS_PER_SEC));
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/time/linux/clock_gettime.cpp b/libc/src/time/linux/clock_gettime.cpp
index 47e974a866c8..d7b8cfd245bc 100644
--- a/libc/src/time/linux/clock_gettime.cpp
+++ b/libc/src/time/linux/clock_gettime.cpp
@@ -7,21 +7,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/time/clock_gettime.h"
-
-#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/time/linux/clock_gettime.h"
 #include "src/errno/libc_errno.h"
-#include "src/time/linux/clockGetTimeImpl.h"
-
-#include <sys/syscall.h> // For syscall numbers.
-#include <time.h>
 
 namespace LIBC_NAMESPACE {
 
 // TODO(michaelrj): Move this into time/linux with the other syscalls.
 LLVM_LIBC_FUNCTION(int, clock_gettime,
                    (clockid_t clockid, struct timespec *ts)) {
-  auto result = internal::clock_gettimeimpl(clockid, ts);
+  auto result = internal::clock_gettime(clockid, ts);
 
   // A negative return value indicates an error with the magnitude of the
   // value being the error code.
diff --git a/libc/src/time/linux/gettimeofday.cpp b/libc/src/time/linux/gettimeofday.cpp
index 07ab4d579176..f868f5ff4d4b 100644
--- a/libc/src/time/linux/gettimeofday.cpp
+++ b/libc/src/time/linux/gettimeofday.cpp
@@ -7,24 +7,24 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/time/gettimeofday.h"
-
-#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "hdr/time_macros.h"
+#include "hdr/types/suseconds_t.h"
 #include "src/__support/common.h"
+#include "src/__support/time/linux/clock_gettime.h"
+#include "src/__support/time/units.h"
 #include "src/errno/libc_errno.h"
-#include "src/time/linux/clockGetTimeImpl.h"
-
-#include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE {
 
 // TODO(michaelrj): Move this into time/linux with the other syscalls.
 LLVM_LIBC_FUNCTION(int, gettimeofday,
                    (struct timeval * tv, [[maybe_unused]] void *unused)) {
+  using namespace time_units;
   if (tv == nullptr)
     return 0;
 
   struct timespec ts;
-  auto result = internal::clock_gettimeimpl(CLOCK_REALTIME, &ts);
+  auto result = internal::clock_gettime(CLOCK_REALTIME, &ts);
 
   // A negative return value indicates an error with the magnitude of the
   // value being the error code.
@@ -34,7 +34,7 @@ LLVM_LIBC_FUNCTION(int, gettimeofday,
   }
 
   tv->tv_sec = ts.tv_sec;
-  tv->tv_usec = static_cast<suseconds_t>(ts.tv_nsec / 1000);
+  tv->tv_usec = static_cast<suseconds_t>(ts.tv_nsec / 1_us_ns);
   return 0;
 }
 
diff --git a/libc/src/time/linux/time.cpp b/libc/src/time/linux/time.cpp
index e286fae095b2..32f531efb6d1 100644
--- a/libc/src/time/linux/time.cpp
+++ b/libc/src/time/linux/time.cpp
@@ -6,22 +6,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/time/time_func.h"
-
-#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "hdr/time_macros.h"
 #include "src/__support/common.h"
+#include "src/__support/time/linux/clock_gettime.h"
 #include "src/errno/libc_errno.h"
-#include "src/time/linux/clockGetTimeImpl.h"
-
-#include <sys/syscall.h> // For syscall numbers.
-#include <time.h>
+#include "src/time/time_func.h"
 
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(time_t, time, (time_t * tp)) {
   // TODO: Use the Linux VDSO to fetch the time and avoid the syscall.
   struct timespec ts;
-  auto result = internal::clock_gettimeimpl(CLOCK_REALTIME, &ts);
+  auto result = internal::clock_gettime(CLOCK_REALTIME, &ts);
   if (!result.has_value()) {
     libc_errno = result.error();
     return -1;
diff --git a/libc/src/time/nanosleep.h b/libc/src/time/nanosleep.h
index 757394232c07..2309666b2304 100644
--- a/libc/src/time/nanosleep.h
+++ b/libc/src/time/nanosleep.h
@@ -9,11 +9,11 @@
 #ifndef LLVM_LIBC_SRC_TIME_NANOSLEEP_H
 #define LLVM_LIBC_SRC_TIME_NANOSLEEP_H
 
-#include <time.h>
+#include "hdr/types/struct_timespec.h"
 
 namespace LIBC_NAMESPACE {
 
-int nanosleep(const struct timespec *req, struct timespec *rem);
+int nanosleep(const timespec *req, timespec *rem);
 
 } // namespace LIBC_NAMESPACE
 
diff --git a/libc/src/time/time_func.h b/libc/src/time/time_func.h
index beb02020b575..2a5239220942 100644
--- a/libc/src/time/time_func.h
+++ b/libc/src/time/time_func.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_TIME_TIME_FUNC_H
 #define LLVM_LIBC_SRC_TIME_TIME_FUNC_H
 
-#include <time.h>
+#include "hdr/types/time_t.h"
 
 // Note this header file is named time_func.h to avoid conflicts with the
 // public header file time.h.
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index c58c322c981e..26af5cec02b5 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -159,18 +159,18 @@ template <typename T> struct FPTest : public Test {
 #define EXPECT_FP_EXCEPTION(expected)                                          \
   do {                                                                         \
     if (math_errhandling & MATH_ERREXCEPT) {                                   \
-      EXPECT_GE(LIBC_NAMESPACE::fputil::test_except(FE_ALL_EXCEPT) &           \
-                    (expected),                                                \
-                expected);                                                     \
+      EXPECT_EQ(LIBC_NAMESPACE::fputil::test_except(FE_ALL_EXCEPT) &           \
+                    ((expected) ? (expected) : FE_ALL_EXCEPT),                 \
+                (expected));                                                   \
     }                                                                          \
   } while (0)
 
 #define ASSERT_FP_EXCEPTION(expected)                                          \
   do {                                                                         \
     if (math_errhandling & MATH_ERREXCEPT) {                                   \
-      ASSERT_GE(LIBC_NAMESPACE::fputil::test_except(FE_ALL_EXCEPT) &           \
-                    (expected),                                                \
-                expected);                                                     \
+      ASSERT_EQ(LIBC_NAMESPACE::fputil::test_except(FE_ALL_EXCEPT) &           \
+                    ((expected) ? (expected) : FE_ALL_EXCEPT),                 \
+                (expected));                                                   \
     }                                                                          \
   } while (0)
 
@@ -178,24 +178,14 @@ template <typename T> struct FPTest : public Test {
   do {                                                                         \
     LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);                       \
     EXPECT_FP_EQ(expected_val, actual_val);                                    \
-    if (math_errhandling & MATH_ERREXCEPT) {                                   \
-      EXPECT_GE(LIBC_NAMESPACE::fputil::test_except(FE_ALL_EXCEPT) &           \
-                    (expected_except),                                         \
-                expected_except);                                              \
-      LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);                     \
-    }                                                                          \
+    EXPECT_FP_EXCEPTION(expected_except);                                      \
   } while (0)
 
 #define EXPECT_FP_IS_NAN_WITH_EXCEPTION(actual_val, expected_except)           \
   do {                                                                         \
     LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);                       \
     EXPECT_FP_IS_NAN(actual_val);                                              \
-    if (math_errhandling & MATH_ERREXCEPT) {                                   \
-      EXPECT_GE(LIBC_NAMESPACE::fputil::test_except(FE_ALL_EXCEPT) &           \
-                    (expected_except),                                         \
-                expected_except);                                              \
-      LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);                     \
-    }                                                                          \
+    EXPECT_FP_EXCEPTION(expected_except);                                      \
   } while (0)
 
 #define EXPECT_FP_EQ_ALL_ROUNDING(expected, actual)                            \
diff --git a/libc/test/src/__support/CPP/CMakeLists.txt b/libc/test/src/__support/CPP/CMakeLists.txt
index 708548f812c6..cec13afc8dd1 100644
--- a/libc/test/src/__support/CPP/CMakeLists.txt
+++ b/libc/test/src/__support/CPP/CMakeLists.txt
@@ -65,6 +65,16 @@ add_libc_test(
 )
 
 add_libc_test(
+  mutex_test
+  SUITE
+    libc-cpp-utils-tests
+  SRCS
+    mutex_test.cpp
+  DEPENDS
+    libc.src.__support.CPP.mutex
+)
+
+add_libc_test(
   int_seq_test
   SUITE
     libc-cpp-utils-tests
diff --git a/libc/test/src/__support/CPP/mutex_test.cpp b/libc/test/src/__support/CPP/mutex_test.cpp
new file mode 100644
index 000000000000..a68c84cfc78a
--- /dev/null
+++ b/libc/test/src/__support/CPP/mutex_test.cpp
@@ -0,0 +1,79 @@
+//===-- Unittests for mutex -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/mutex.h"
+#include "test/UnitTest/Test.h"
+
+using LIBC_NAMESPACE::cpp::adopt_lock;
+using LIBC_NAMESPACE::cpp::lock_guard;
+
+// Simple struct for testing cpp::lock_guard. It defines methods 'lock' and
+// 'unlock' which are required for the cpp::lock_guard class template.
+struct Mutex {
+  // Flag to show whether this mutex is locked.
+  bool locked = false;
+
+  // Flag to show if this mutex has been double locked.
+  bool double_locked = false;
+
+  // Flag to show if this mutex has been double unlocked.
+  bool double_unlocked = false;
+
+  Mutex() {}
+
+  void lock() {
+    if (locked)
+      double_locked = true;
+
+    locked = true;
+  }
+
+  void unlock() {
+    if (!locked)
+      double_unlocked = true;
+
+    locked = false;
+  }
+};
+
+TEST(LlvmLibcMutexTest, Basic) {
+  Mutex m;
+  ASSERT_FALSE(m.locked);
+  ASSERT_FALSE(m.double_locked);
+  ASSERT_FALSE(m.double_unlocked);
+
+  {
+    lock_guard lg(m);
+    ASSERT_TRUE(m.locked);
+    ASSERT_FALSE(m.double_locked);
+  }
+
+  ASSERT_FALSE(m.locked);
+  ASSERT_FALSE(m.double_unlocked);
+}
+
+TEST(LlvmLibcMutexTest, AcquireLocked) {
+  Mutex m;
+  ASSERT_FALSE(m.locked);
+  ASSERT_FALSE(m.double_locked);
+  ASSERT_FALSE(m.double_unlocked);
+
+  // Lock the mutex before placing a lock guard on it.
+  m.lock();
+  ASSERT_TRUE(m.locked);
+  ASSERT_FALSE(m.double_locked);
+
+  {
+    lock_guard lg(m, adopt_lock);
+    ASSERT_TRUE(m.locked);
+    ASSERT_FALSE(m.double_locked);
+  }
+
+  ASSERT_FALSE(m.locked);
+  ASSERT_FALSE(m.double_unlocked);
+}
diff --git a/libc/test/src/math/FModTest.h b/libc/test/src/math/FModTest.h
index f1015d6497fc..32c009ab8828 100644
--- a/libc/test/src/math/FModTest.h
+++ b/libc/test/src/math/FModTest.h
@@ -18,10 +18,10 @@
 #include "hdr/math_macros.h"
 
 #define TEST_SPECIAL(x, y, expected, dom_err, expected_exception)              \
+  LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);                         \
   EXPECT_FP_EQ(expected, f(x, y));                                             \
   EXPECT_MATH_ERRNO((dom_err) ? EDOM : 0);                                     \
-  EXPECT_FP_EXCEPTION(expected_exception);                                     \
-  LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT)
+  EXPECT_FP_EXCEPTION(expected_exception)
 
 #define TEST_REGULAR(x, y, expected) TEST_SPECIAL(x, y, expected, false, 0)
 
diff --git a/libc/test/src/math/RoundToIntegerTest.h b/libc/test/src/math/RoundToIntegerTest.h
index 0f052ba42a46..d40e15080087 100644
--- a/libc/test/src/math/RoundToIntegerTest.h
+++ b/libc/test/src/math/RoundToIntegerTest.h
@@ -57,12 +57,13 @@ private:
 
     ASSERT_EQ(func(input), expected);
 
+    // TODO: Handle the !expectError case. It used to expect
+    // 0 for errno and exceptions, but this doesn't hold for
+    // all math functions using RoundToInteger test:
+    // https://github.com/llvm/llvm-project/pull/88816
     if (expectError) {
       ASSERT_FP_EXCEPTION(FE_INVALID);
       ASSERT_MATH_ERRNO(EDOM);
-    } else {
-      ASSERT_FP_EXCEPTION(0);
-      ASSERT_MATH_ERRNO(0);
     }
   }
 
diff --git a/libc/test/src/math/atanf_test.cpp b/libc/test/src/math/atanf_test.cpp
index 4fa7badaf736..376b4724b5a3 100644
--- a/libc/test/src/math/atanf_test.cpp
+++ b/libc/test/src/math/atanf_test.cpp
@@ -21,21 +21,29 @@ using LlvmLibcAtanfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
+// TODO: This test needs to have its checks for exceptions, errno
+// tightened
 TEST_F(LlvmLibcAtanfTest, SpecialNumbers) {
   LIBC_NAMESPACE::libc_errno = 0;
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanf(aNaN));
-  EXPECT_FP_EXCEPTION(0);
+  // TODO: Uncomment these checks later, RoundingMode affects running
+  // tests in this way https://github.com/llvm/llvm-project/issues/90653.
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(0.0f, LIBC_NAMESPACE::atanf(0.0f));
-  EXPECT_FP_EXCEPTION(0);
+  // TODO: Uncomment these checks later, RoundingMode affects running
+  // tests in this way https://github.com/llvm/llvm-project/issues/90653.
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(-0.0f, LIBC_NAMESPACE::atanf(-0.0f));
-  EXPECT_FP_EXCEPTION(0);
+  // TODO: Uncomment these checks later, RoundingMode affects running
+  // tests in this way https://github.com/llvm/llvm-project/issues/90653.
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 }
 
diff --git a/libc/test/src/math/atanhf_test.cpp b/libc/test/src/math/atanhf_test.cpp
index 7fc8c70d1386..b0505e4c1182 100644
--- a/libc/test/src/math/atanhf_test.cpp
+++ b/libc/test/src/math/atanhf_test.cpp
@@ -21,32 +21,40 @@ using LlvmLibcAtanhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
+// TODO: This test needs to have its checks for exceptions, errno
+// tightened https://github.com/llvm/llvm-project/issues/88819.
 TEST_F(LlvmLibcAtanhfTest, SpecialNumbers) {
 
   LIBC_NAMESPACE::libc_errno = 0;
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf(aNaN));
-  EXPECT_FP_EXCEPTION(0);
+  // TODO: Uncomment these checks later, RoundingMode affects running
+  // tests in this way https://github.com/llvm/llvm-project/issues/90653.
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(0.0f, LIBC_NAMESPACE::atanhf(0.0f));
-  EXPECT_FP_EXCEPTION(0);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(-0.0f, LIBC_NAMESPACE::atanhf(-0.0f));
-  EXPECT_FP_EXCEPTION(0);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::atanhf(1.0f));
-  EXPECT_FP_EXCEPTION(FE_DIVBYZERO);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(FE_DIVBYZERO);
   EXPECT_MATH_ERRNO(ERANGE);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, LIBC_NAMESPACE::atanhf(-1.0f));
-  EXPECT_FP_EXCEPTION(FE_DIVBYZERO);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(FE_DIVBYZERO);
   EXPECT_MATH_ERRNO(ERANGE);
 
   auto bt = FPBits(1.0f);
@@ -54,33 +62,37 @@ TEST_F(LlvmLibcAtanhfTest, SpecialNumbers) {
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf(bt.get_val()));
-  EXPECT_FP_EXCEPTION(FE_INVALID);
+  // EXPECT_FP_EXCEPTION(FE_INVALID);
   EXPECT_MATH_ERRNO(EDOM);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   bt.set_sign(Sign::NEG);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf(bt.get_val()));
-  EXPECT_FP_EXCEPTION(FE_INVALID);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(FE_INVALID);
   EXPECT_MATH_ERRNO(EDOM);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf(2.0f));
-  EXPECT_FP_EXCEPTION(FE_INVALID);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(FE_INVALID);
   EXPECT_MATH_ERRNO(EDOM);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf(-2.0f));
-  EXPECT_FP_EXCEPTION(FE_INVALID);
+  // EXPECT_FP_EXCEPTION(FE_INVALID);
   EXPECT_MATH_ERRNO(EDOM);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf(inf));
-  EXPECT_FP_EXCEPTION(FE_INVALID);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(FE_INVALID);
   EXPECT_MATH_ERRNO(EDOM);
 
   bt.set_sign(Sign::NEG);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf(neg_inf));
-  EXPECT_FP_EXCEPTION(FE_INVALID);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(FE_INVALID);
   EXPECT_MATH_ERRNO(EDOM);
 }
 
diff --git a/libc/test/src/math/powf_test.cpp b/libc/test/src/math/powf_test.cpp
index 69135593cd32..797913e5b7ee 100644
--- a/libc/test/src/math/powf_test.cpp
+++ b/libc/test/src/math/powf_test.cpp
@@ -22,14 +22,21 @@ using LIBC_NAMESPACE::testing::tlog;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcPowfTest, TrickyInputs) {
-  constexpr int N = 11;
+  constexpr int N = 13;
   constexpr mpfr::BinaryInput<float> INPUTS[N] = {
-      {0x1.290bbp-124f, 0x1.1e6d92p-25f}, {0x1.2e9fb6p+5f, -0x1.1b82b6p-18f},
-      {0x1.6877f6p+60f, -0x1.75f1c6p-4f}, {0x1.0936acp-63f, -0x1.55200ep-15f},
-      {0x1.d6d72ap+43f, -0x1.749ccap-5f}, {0x1.4afb2ap-40f, 0x1.063198p+0f},
-      {0x1.0124dep+0f, -0x1.fdb016p+9f},  {0x1.1058p+0f, 0x1.ap+64f},
-      {0x1.1058p+0f, -0x1.ap+64f},        {0x1.1058p+0f, 0x1.ap+64f},
+      {0x1.290bbp-124f, 0x1.1e6d92p-25f},
+      {0x1.2e9fb6p+5f, -0x1.1b82b6p-18f},
+      {0x1.6877f6p+60f, -0x1.75f1c6p-4f},
+      {0x1.0936acp-63f, -0x1.55200ep-15f},
+      {0x1.d6d72ap+43f, -0x1.749ccap-5f},
+      {0x1.4afb2ap-40f, 0x1.063198p+0f},
+      {0x1.0124dep+0f, -0x1.fdb016p+9f},
+      {0x1.1058p+0f, 0x1.ap+64f},
+      {0x1.1058p+0f, -0x1.ap+64f},
+      {0x1.1058p+0f, 0x1.ap+64f},
       {0x1.fa32d4p-1f, 0x1.67a62ep+12f},
+      {-0x1.8p-49, 0x1.8p+1},
+      {0x1.8p-48, 0x1.8p+1},
   };
 
   for (int i = 0; i < N; ++i) {
diff --git a/libc/test/src/math/smoke/NextAfterTest.h b/libc/test/src/math/smoke/NextAfterTest.h
index 65dba9338285..d65ccdf8e70c 100644
--- a/libc/test/src/math/smoke/NextAfterTest.h
+++ b/libc/test/src/math/smoke/NextAfterTest.h
@@ -18,6 +18,8 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
+// TODO: Strengthen errno,exception checks and remove these assert macros
+// after new matchers/test fixtures are added
 #define ASSERT_FP_EQ_WITH_EXCEPTION(result, expected, expected_exception)      \
   ASSERT_FP_EQ(result, expected);                                              \
   ASSERT_FP_EXCEPTION(expected_exception);                                     \
diff --git a/libc/test/src/math/smoke/NextTowardTest.h b/libc/test/src/math/smoke/NextTowardTest.h
index 1894d324b085..a24ec9ff6bd8 100644
--- a/libc/test/src/math/smoke/NextTowardTest.h
+++ b/libc/test/src/math/smoke/NextTowardTest.h
@@ -19,6 +19,8 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
+// TODO: Strengthen errno,exception checks and remove these assert macros
+// after new matchers/test fixtures are added
 #define ASSERT_FP_EQ_WITH_EXCEPTION(result, expected, expected_exception)      \
   ASSERT_FP_EQ(result, expected);                                              \
   ASSERT_FP_EXCEPTION(expected_exception);                                     \
diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h
index 50bcd4a6a76c..3ff311f46b05 100644
--- a/libc/test/src/math/smoke/RoundToIntegerTest.h
+++ b/libc/test/src/math/smoke/RoundToIntegerTest.h
@@ -28,14 +28,7 @@ public:
   typedef I (*RoundToIntegerFunc)(F);
 
 private:
-  using FPBits = LIBC_NAMESPACE::fputil::FPBits<F>;
-  using StorageType = typename FPBits::StorageType;
-
-  const F zero = FPBits::zero(Sign::POS).get_val();
-  const F neg_zero = FPBits::zero(Sign::NEG).get_val();
-  const F inf = FPBits::inf(Sign::POS).get_val();
-  const F neg_inf = FPBits::inf(Sign::NEG).get_val();
-  const F nan = FPBits::quiet_nan().get_val();
+  DECLARE_SPECIAL_CONSTANTS(F)
 
   static constexpr StorageType MAX_SUBNORMAL =
       FPBits::max_subnormal().uintval();
@@ -52,12 +45,13 @@ private:
 
     ASSERT_EQ(func(input), expected);
 
+    // TODO: Handle the !expectError case. It used to expect
+    // 0 for errno and exceptions, but this doesn't hold for
+    // all math functions using RoundToInteger test:
+    // https://github.com/llvm/llvm-project/pull/88816
     if (expectError) {
       ASSERT_FP_EXCEPTION(FE_INVALID);
       ASSERT_MATH_ERRNO(EDOM);
-    } else {
-      ASSERT_FP_EXCEPTION(0);
-      ASSERT_MATH_ERRNO(0);
     }
   }
 
@@ -81,7 +75,7 @@ public:
     // libc/CMakeLists.txt is not forwarded to C++.
 #if LIBC_COPT_IMPLEMENTATION_DEFINED_TEST_BEHAVIOR
     // Result is not well-defined, we always returns INTEGER_MAX
-    test_one_input(func, nan, INTEGER_MAX, true);
+    test_one_input(func, aNaN, INTEGER_MAX, true);
 #endif // LIBC_COPT_IMPLEMENTATION_DEFINED_TEST_BEHAVIOR
   }
 
diff --git a/libc/test/src/math/smoke/atan2f_test.cpp b/libc/test/src/math/smoke/atan2f_test.cpp
index f81d140fefc5..32a28cfdfeaa 100644
--- a/libc/test/src/math/smoke/atan2f_test.cpp
+++ b/libc/test/src/math/smoke/atan2f_test.cpp
@@ -18,33 +18,43 @@ using LlvmLibcAtan2fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 TEST_F(LlvmLibcAtan2fTest, SpecialNumbers) {
   LIBC_NAMESPACE::libc_errno = 0;
 
+  // TODO: Strengthen errno,exception checks and remove these assert macros
+  // after new matchers/test fixtures are added see:
+  // https://github.com/llvm/llvm-project/issues/90653.
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atan2f(aNaN, zero));
-  EXPECT_FP_EXCEPTION(0);
+  // TODO: Uncomment these checks later, RoundingMode affects running
+  // tests in this way https://github.com/llvm/llvm-project/issues/90653.
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atan2f(1.0f, aNaN));
-  EXPECT_FP_EXCEPTION(0);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(0.0f, LIBC_NAMESPACE::atan2f(zero, zero));
-  EXPECT_FP_EXCEPTION(0);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(-0.0f, LIBC_NAMESPACE::atan2f(-0.0f, zero));
-  EXPECT_FP_EXCEPTION(0);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(0.0f, LIBC_NAMESPACE::atan2f(1.0f, inf));
-  EXPECT_FP_EXCEPTION(0);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(-0.0f, LIBC_NAMESPACE::atan2f(-1.0f, inf));
-  EXPECT_FP_EXCEPTION(0);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 }
diff --git a/libc/test/src/math/smoke/atanf_test.cpp b/libc/test/src/math/smoke/atanf_test.cpp
index 3800c2334b92..56bf2f951b33 100644
--- a/libc/test/src/math/smoke/atanf_test.cpp
+++ b/libc/test/src/math/smoke/atanf_test.cpp
@@ -21,18 +21,25 @@ using LlvmLibcAtanfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 TEST_F(LlvmLibcAtanfTest, SpecialNumbers) {
   LIBC_NAMESPACE::libc_errno = 0;
 
+  // TODO: Strengthen errno,exception checks and remove these assert macros
+  // after new matchers/test fixtures are added
+  // https://github.com/llvm/llvm-project/issues/90653
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanf(aNaN));
-  EXPECT_FP_EXCEPTION(0);
+  // TODO: Uncomment these checks later, RoundingMode affects running
+  // tests in this way https://github.com/llvm/llvm-project/issues/90653.
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(0.0f, LIBC_NAMESPACE::atanf(0.0f));
-  EXPECT_FP_EXCEPTION(0);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(-0.0f, LIBC_NAMESPACE::atanf(-0.0f));
-  EXPECT_FP_EXCEPTION(0);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 }
diff --git a/libc/test/src/math/smoke/atanhf_test.cpp b/libc/test/src/math/smoke/atanhf_test.cpp
index fc3e2dd9bc54..2d2acfeeab4e 100644
--- a/libc/test/src/math/smoke/atanhf_test.cpp
+++ b/libc/test/src/math/smoke/atanhf_test.cpp
@@ -19,22 +19,28 @@
 using LlvmLibcAtanhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAtanhfTest, SpecialNumbers) {
-
   LIBC_NAMESPACE::libc_errno = 0;
 
+  // TODO: Strengthen errno,exception checks and remove these assert macros
+  // after new matchers/test fixtures are added, see:
+  // https://github.com/llvm/llvm-project/issues/90653
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf(aNaN));
-  EXPECT_FP_EXCEPTION(0);
+  // TODO: Uncomment these checks later, RoundingMode affects running
+  // tests in this way https://github.com/llvm/llvm-project/issues/90653.
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(0.0f, LIBC_NAMESPACE::atanhf(0.0f));
-  EXPECT_FP_EXCEPTION(0);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(-0.0f, LIBC_NAMESPACE::atanhf(-0.0f));
-  EXPECT_FP_EXCEPTION(0);
+  // See above TODO
+  // EXPECT_FP_EXCEPTION(0);
   EXPECT_MATH_ERRNO(0);
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::atanhf(1.0f), FE_DIVBYZERO);
diff --git a/libclc/generic/lib/math/log_base.h b/libclc/generic/lib/math/log_base.h
index f5b6f1cb4499..2558f016f60b 100644
--- a/libclc/generic/lib/math/log_base.h
+++ b/libclc/generic/lib/math/log_base.h
@@ -289,7 +289,7 @@ log(double x)
     double ret = is_near ? ret_near : ret_far;
 
     ret = isinf(x) ? as_double(PINFBITPATT_DP64) : ret;
-    ret = isnan(x) | (x < 0.0) ? as_double(QNANBITPATT_DP64) : ret;
+    ret = (isnan(x) | (x < 0.0)) ? as_double(QNANBITPATT_DP64) : ret;
     ret = x == 0.0 ? as_double(NINFBITPATT_DP64) : ret;
     return ret;
 }
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index 5dc3be0c367e..93b549a316e3 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -122,7 +122,7 @@ endif()
 add_library(           cxx-benchmarks-flags-libcxx INTERFACE)
 target_link_libraries( cxx-benchmarks-flags-libcxx INTERFACE cxx-benchmarks-flags)
 target_compile_options(cxx-benchmarks-flags-libcxx INTERFACE ${SANITIZER_FLAGS} -Wno-user-defined-literals -Wno-suggest-override)
-target_link_options(   cxx-benchmarks-flags-libcxx INTERFACE -nostdlib++ "-L${BENCHMARK_LIBCXX_INSTALL}/lib" "-L${BENCHMARK_LIBCXX_INSTALL}/lib64" ${SANITIZER_FLAGS})
+target_link_options(   cxx-benchmarks-flags-libcxx INTERFACE -lm -nostdlib++ "-L${BENCHMARK_LIBCXX_INSTALL}/lib" "-L${BENCHMARK_LIBCXX_INSTALL}/lib64" ${SANITIZER_FLAGS})
 
 set(libcxx_benchmark_targets)
 
@@ -220,6 +220,7 @@ set(BENCHMARK_TESTS
     lexicographical_compare_three_way.bench.cpp
     map.bench.cpp
     monotonic_buffer.bench.cpp
+    numeric/gcd.bench.cpp
     ordered_set.bench.cpp
     shared_mutex_vs_mutex.bench.cpp
     stop_token.bench.cpp
diff --git a/libcxx/benchmarks/numeric/gcd.bench.cpp b/libcxx/benchmarks/numeric/gcd.bench.cpp
new file mode 100644
index 000000000000..f8b6a856cd0d
--- /dev/null
+++ b/libcxx/benchmarks/numeric/gcd.bench.cpp
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <array>
+#include <benchmark/benchmark.h>
+#include <cstring>
+#include <numeric>
+#include <random>
+
+template <class T>
+static std::array<T, 1000> generate(std::uniform_int_distribution<T> distribution = std::uniform_int_distribution<T>{
+                                        std::numeric_limits<T>::min() + 1, std::numeric_limits<T>::max()}) {
+  std::mt19937 generator;
+  std::array<T, 1000> result;
+  std::generate_n(result.begin(), result.size(), [&] { return distribution(generator); });
+  return result;
+}
+
+static void bm_gcd_random(benchmark::State& state) {
+  std::array data = generate<int>();
+  while (state.KeepRunningBatch(data.size()))
+    for (auto v0 : data)
+      for (auto v1 : data)
+        benchmark::DoNotOptimize(std::gcd(v0, v1));
+}
+BENCHMARK(bm_gcd_random);
+
+static void bm_gcd_trivial(benchmark::State& state) {
+  int lhs = ~static_cast<int>(0), rhs = 1;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(lhs);
+    benchmark::DoNotOptimize(rhs);
+    benchmark::DoNotOptimize(std::gcd(lhs, rhs));
+  }
+}
+BENCHMARK(bm_gcd_trivial);
+
+static void bm_gcd_complex(benchmark::State& state) {
+  int lhs = 2971215073, rhs = 1836311903;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(lhs);
+    benchmark::DoNotOptimize(rhs);
+    benchmark::DoNotOptimize(std::gcd(lhs, rhs));
+  }
+}
+BENCHMARK(bm_gcd_complex);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index 3197d2cd1b27..17b3476d2c86 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -322,6 +322,8 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_constexpr_typeinfo``                           ``202106L``
     ---------------------------------------------------------- -----------------
+    ``__cpp_lib_containers_ranges``                            ``202202L``
+    ---------------------------------------------------------- -----------------
     ``__cpp_lib_expected``                                     ``202211L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_format_path``                                  *unimplemented*
@@ -444,7 +446,7 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_rcu``                                          *unimplemented*
     ---------------------------------------------------------- -----------------
-    ``__cpp_lib_reference_wrapper``                            *unimplemented*
+    ``__cpp_lib_reference_wrapper``                            ``202403L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_saturation_arithmetic``                        ``202311L``
     ---------------------------------------------------------- -----------------
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 5a07b11cbcd5..83fcd40bb80c 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -46,11 +46,13 @@ Implemented Papers
 - P2869R4 - Remove Deprecated ``shared_ptr`` Atomic Access APIs from C++26
 - P2872R3 - Remove ``wstring_convert`` From C++26
 - P3142R0 - Printing Blank Lines with ``println`` (as DR against C++23)
+- P2944R3 - Comparisons for ``reference_wrapper`` (comparison operators for ``reference_wrapper`` only)
 - P2302R4 - ``std::ranges::contains``
 - P1659R3 - ``std::ranges::starts_with`` and ``std::ranges::ends_with``
 - P3029R1 - Better ``mdspan``'s CTAD
 - P2387R3 - Pipe support for user-defined range adaptors
 - P2713R1 - Escaping improvements in ``std::format``
+- P2231R1 - Missing ``constexpr`` in ``std::optional`` and ``std::variant``
 
 Improvements and New Features
 -----------------------------
diff --git a/libcxx/docs/Status/Cxx20.rst b/libcxx/docs/Status/Cxx20.rst
index 23289dc6e596..b08b99394fbb 100644
--- a/libcxx/docs/Status/Cxx20.rst
+++ b/libcxx/docs/Status/Cxx20.rst
@@ -47,7 +47,6 @@ Paper Status
    .. [#note-P0619] P0619: Only sections D.8, D.9, D.10 and D.13 are implemented. Sections D.4, D.7, D.11, and D.12 remain undone.
    .. [#note-P0883.1] P0883: shared_ptr and floating-point changes weren't applied as they themselves aren't implemented yet.
    .. [#note-P0883.2] P0883: ``ATOMIC_FLAG_INIT`` was marked deprecated in version 14.0, but was undeprecated with the implementation of LWG3659 in version 15.0.
-   .. [#note-P2231] P2231: Optional is complete. The changes to variant haven't been implemented yet.
    .. [#note-P0660] P0660: The paper is implemented but the features are experimental and can be enabled via ``-fexperimental-library``.
    .. [#note-P0355] P0355: The implementation status is:
 
diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv
index d31720b7576d..955aa5f614af 100644
--- a/libcxx/docs/Status/Cxx20Papers.csv
+++ b/libcxx/docs/Status/Cxx20Papers.csv
@@ -192,7 +192,7 @@
 "`P2106R0 <https://wg21.link/P2106R0>`__","LWG","Alternative wording for GB315 and GB316","Prague","|Complete|","15.0","|ranges|"
 "`P2116R0 <https://wg21.link/P2116R0>`__","LWG","Remove tuple-like protocol support from fixed-extent span","Prague","|Complete|","11.0"
 "","","","","","",""
-"`P2231R1 <https://wg21.link/P2231R1>`__","LWG","Missing constexpr in std::optional and std::variant","June 2021","|Partial| [#note-P2231]_","13.0"
+"`P2231R1 <https://wg21.link/P2231R1>`__","LWG","Missing constexpr in std::optional and std::variant","June 2021","|Complete|","19.0"
 "`P2325R3 <https://wg21.link/P2325R3>`__","LWG","Views should not be required to be default constructible","June 2021","|Complete|","16.0","|ranges|"
 "`P2210R2 <https://wg21.link/P2210R2>`__","LWG","Superior String Splitting","June 2021","|Complete|","16.0","|ranges|"
 "`P2216R3 <https://wg21.link/P2216R3>`__","LWG","std::format improvements","June 2021","|Complete|","15.0"
diff --git a/libcxx/docs/Status/Cxx2c.rst b/libcxx/docs/Status/Cxx2c.rst
index e3d9cbb551ff..5f459b4b3e4e 100644
--- a/libcxx/docs/Status/Cxx2c.rst
+++ b/libcxx/docs/Status/Cxx2c.rst
@@ -41,6 +41,7 @@ Paper Status
 
    .. [#note-P2510R3] This paper is applied as DR against C++20. (MSVC STL and libstdc++ will do the same.)
    .. [#note-P3142R0] This paper is applied as DR against C++23. (MSVC STL and libstdc++ will do the same.)
+   .. [#note-P2944R3] Implemented comparisons for ``reference_wrapper`` only.
 
 .. _issues-status-cxx2c:
 
diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv
index 30a059f8a3df..76717e1d3448 100644
--- a/libcxx/docs/Status/Cxx2cIssues.csv
+++ b/libcxx/docs/Status/Cxx2cIssues.csv
@@ -64,4 +64,5 @@
 "","","","","",""
 "`3343 <https://wg21.link/LWG3343>`__","Ordering of calls to ``unlock()`` and ``notify_all()`` in Effects element of ``notify_all_at_thread_exit()`` should be reversed","Not Yet Adopted","|Complete|","16.0",""
 "XXXX","","The sys_info range should be affected by save","Not Yet Adopted","|Complete|","19.0"
+"`4071 <https://wg21.link/LWG4071>`__","","``reference_wrapper`` comparisons are not SFINAE-friendly","Not Yet Adopted","|Complete|","19.0"
 "","","","","",""
diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv
index 409278db1e87..30a601858b63 100644
--- a/libcxx/docs/Status/Cxx2cPapers.csv
+++ b/libcxx/docs/Status/Cxx2cPapers.csv
@@ -59,7 +59,7 @@
 "`P2248R8 <https://wg21.link/P2248R8>`__","LWG","Enabling list-initialization for algorithms","Tokyo March 2024","","",""
 "`P2810R4 <https://wg21.link/P2810R4>`__","LWG","``is_debugger_present`` ``is_replaceable``","Tokyo March 2024","","",""
 "`P1068R11 <https://wg21.link/P1068R11>`__","LWG","Vector API for random number generation","Tokyo March 2024","","",""
-"`P2944R3 <https://wg21.link/P2944R3>`__","LWG","Comparisons for ``reference_wrapper``","Tokyo March 2024","","",""
+"`P2944R3 <https://wg21.link/P2944R3>`__","LWG","Comparisons for ``reference_wrapper``","Tokyo March 2024","|Partial| [#note-P2944R3]_","19.0",""
 "`P2642R6 <https://wg21.link/P2642R6>`__","LWG","Padded ``mdspan`` layouts","Tokyo March 2024","","",""
 "`P3029R1 <https://wg21.link/P3029R1>`__","LWG","Better ``mdspan``'s CTAD","Tokyo March 2024","|Complete|","19.0",""
 "","","","","","",""
diff --git a/libcxx/docs/Status/SpaceshipProjects.csv b/libcxx/docs/Status/SpaceshipProjects.csv
index 3d14f487d9a9..128b23b0c2c7 100644
--- a/libcxx/docs/Status/SpaceshipProjects.csv
+++ b/libcxx/docs/Status/SpaceshipProjects.csv
@@ -171,10 +171,10 @@ Section,Description,Dependencies,Assignee,Complete
 | `month_weekday_last <https://reviews.llvm.org/D152699>`_
 | `year_month_weekday <https://reviews.llvm.org/D152699>`_
 | `year_month_weekday_last <https://reviews.llvm.org/D152699>`_",None,Hristo Hristov,|Complete|
-`[time.zone.nonmembers] <https://wg21.link/time.zone.nonmembers>`_,"`chrono::time_zone`",A ``<chrono>`` implementation,Mark de Wever,|Complete|
+`[time.zone.nonmembers] <https://wg21.link/time.zone.nonmembers>`_,"`chrono::time_zone`",,Mark de Wever,|Complete|
 `[time.zone.zonedtime.nonmembers] <https://wg21.link/time.zone.zonedtime.nonmembers>`_,"`chrono::zoned_time`",A ``<chrono>`` implementation,Mark de Wever,|In Progress|
-`[time.zone.leap.nonmembers] <https://wg21.link/time.zone.leap.nonmembers>`_,"`chrono::time_leap_seconds`",A ``<chrono>`` implementation,Mark de Wever,|Complete|
-`[time.zone.link.nonmembers] <https://wg21.link/time.zone.link.nonmembers>`_,"`chrono::time_zone_link`",A ``<chrono>`` implementation,Mark de Wever,|Complete|
+`[time.zone.leap.nonmembers] <https://wg21.link/time.zone.leap.nonmembers>`_,"`chrono::time_leap_seconds`",,Mark de Wever,|Complete|
+`[time.zone.link.nonmembers] <https://wg21.link/time.zone.link.nonmembers>`_,"`chrono::time_zone_link`",,Mark de Wever,|Complete|
 - `5.13 Clause 28: Localization library <https://wg21.link/p1614r2#clause-28-localization-library>`_,,,,
 "| `[locale] <https://wg21.link/locale>`_
 | `[locale.operators] <https://wg21.link/locale.operators>`_",| remove ops `locale <https://reviews.llvm.org/D152654>`_,None,Hristo Hristov,|Complete|
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 1296c536bc88..01e9c247560c 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -446,6 +446,7 @@ set(files
   __ios/fpos.h
   __iterator/access.h
   __iterator/advance.h
+  __iterator/aliasing_iterator.h
   __iterator/back_insert_iterator.h
   __iterator/bounded_iter.h
   __iterator/common_iterator.h
@@ -581,6 +582,8 @@ set(files
   __numeric/transform_exclusive_scan.h
   __numeric/transform_inclusive_scan.h
   __numeric/transform_reduce.h
+  __ostream/basic_ostream.h
+  __ostream/print.h
   __pstl/backends/libdispatch.h
   __pstl/backends/serial.h
   __pstl/backends/std_thread.h
@@ -731,7 +734,6 @@ set(files
   __type_traits/aligned_storage.h
   __type_traits/aligned_union.h
   __type_traits/alignment_of.h
-  __type_traits/apply_cv.h
   __type_traits/can_extract_key.h
   __type_traits/common_reference.h
   __type_traits/common_type.h
diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index c2b3f8938f71..632bec02406a 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -16,6 +16,7 @@
 #include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__functional/identity.h>
+#include <__iterator/aliasing_iterator.h>
 #include <__type_traits/desugars_to.h>
 #include <__type_traits/invoke.h>
 #include <__type_traits/is_constant_evaluated.h>
@@ -55,18 +56,13 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro
 
 #if _LIBCPP_VECTORIZE_ALGORITHMS
 
-template <class _Tp,
-          class _Pred,
-          class _Proj1,
-          class _Proj2,
-          __enable_if_t<is_integral<_Tp>::value && __desugars_to_v<__equal_tag, _Pred, _Tp, _Tp> &&
-                            __is_identity<_Proj1>::value && __is_identity<_Proj2>::value,
-                        int> = 0>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*>
-__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) {
+template <class _Iter>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter, _Iter>
+__mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
+  using __value_type              = __iter_value_type<_Iter>;
   constexpr size_t __unroll_count = 4;
-  constexpr size_t __vec_size     = __native_vector_size<_Tp>;
-  using __vec                     = __simd_vector<_Tp, __vec_size>;
+  constexpr size_t __vec_size     = __native_vector_size<__value_type>;
+  using __vec                     = __simd_vector<__value_type, __vec_size>;
 
   if (!__libcpp_is_constant_evaluated()) {
     auto __orig_first1 = __first1;
@@ -116,9 +112,41 @@ __mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __
     } // else loop over the elements individually
   }
 
-  return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2);
+  __equal_to __pred;
+  __identity __proj;
+  return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj, __proj);
+}
+
+template <class _Tp,
+          class _Pred,
+          class _Proj1,
+          class _Proj2,
+          __enable_if_t<is_integral<_Tp>::value && __desugars_to_v<__equal_tag, _Pred, _Tp, _Tp> &&
+                            __is_identity<_Proj1>::value && __is_identity<_Proj2>::value,
+                        int> = 0>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*>
+__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred&, _Proj1&, _Proj2&) {
+  return std::__mismatch_vectorized(__first1, __last1, __first2);
 }
 
+template <class _Tp,
+          class _Pred,
+          class _Proj1,
+          class _Proj2,
+          __enable_if_t<!is_integral<_Tp>::value && __desugars_to_v<__equal_tag, _Pred, _Tp, _Tp> &&
+                            __is_identity<_Proj1>::value && __is_identity<_Proj2>::value &&
+                            __can_map_to_integer_v<_Tp> && __libcpp_is_trivially_equality_comparable<_Tp, _Tp>::value,
+                        int> = 0>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*>
+__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) {
+  if (__libcpp_is_constant_evaluated()) {
+    return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2);
+  } else {
+    using _Iter = __aliasing_iterator<_Tp*, __get_as_integer_type_t<_Tp>>;
+    auto __ret  = std::__mismatch_vectorized(_Iter(__first1), _Iter(__last1), _Iter(__first2));
+    return {__ret.first.__base(), __ret.second.__base()};
+  }
+}
 #endif // _LIBCPP_VECTORIZE_ALGORITHMS
 
 template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h
index 8d540ae2cce8..71d65e8f4afb 100644
--- a/libcxx/include/__algorithm/simd_utils.h
+++ b/libcxx/include/__algorithm/simd_utils.h
@@ -43,6 +43,34 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+template <class _Tp>
+inline constexpr bool __can_map_to_integer_v =
+    sizeof(_Tp) == alignof(_Tp) && (sizeof(_Tp) == 1 || sizeof(_Tp) == 2 || sizeof(_Tp) == 4 || sizeof(_Tp) == 8);
+
+template <size_t _TypeSize>
+struct __get_as_integer_type_impl;
+
+template <>
+struct __get_as_integer_type_impl<1> {
+  using type = uint8_t;
+};
+
+template <>
+struct __get_as_integer_type_impl<2> {
+  using type = uint16_t;
+};
+template <>
+struct __get_as_integer_type_impl<4> {
+  using type = uint32_t;
+};
+template <>
+struct __get_as_integer_type_impl<8> {
+  using type = uint64_t;
+};
+
+template <class _Tp>
+using __get_as_integer_type_t = typename __get_as_integer_type_impl<sizeof(_Tp)>::type;
+
 // This isn't specialized for 64 byte vectors on purpose. They have the potential to significantly reduce performance
 // in mixed simd/non-simd workloads and don't provide any performance improvement for currently vectorized algorithms
 // as far as benchmarks are concerned.
@@ -80,10 +108,10 @@ template <class _VecT>
 using __simd_vector_underlying_type_t = decltype(std::__simd_vector_underlying_type_impl(_VecT{}));
 
 // This isn't inlined without always_inline when loading chars.
-template <class _VecT, class _Tp>
-_LIBCPP_NODISCARD _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __load_vector(const _Tp* __ptr) noexcept {
+template <class _VecT, class _Iter>
+_LIBCPP_NODISCARD _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __load_vector(_Iter __iter) noexcept {
   return [=]<size_t... _Indices>(index_sequence<_Indices...>) _LIBCPP_ALWAYS_INLINE noexcept {
-    return _VecT{__ptr[_Indices]...};
+    return _VecT{__iter[_Indices]...};
   }(make_index_sequence<__simd_vector_size_v<_VecT>>{});
 }
 
diff --git a/libcxx/include/__availability b/libcxx/include/__availability
index 7a02ae00846b..e44ac1962df3 100644
--- a/libcxx/include/__availability
+++ b/libcxx/include/__availability
@@ -87,43 +87,43 @@
 #if defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS)
 
 #  define _LIBCPP_INTRODUCED_IN_LLVM_4 1
-#  define _LIBCPP_INTRODUCED_IN_LLVM_4_MARKUP /* nothing */
+#  define _LIBCPP_INTRODUCED_IN_LLVM_4_ATTRIBUTE /* nothing */
 
 #  define _LIBCPP_INTRODUCED_IN_LLVM_9 1
-#  define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP      /* nothing */
-#  define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_PUSH /* nothing */
-#  define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_POP  /* nothing */
+#  define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE      /* nothing */
+#  define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_PUSH /* nothing */
+#  define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_POP  /* nothing */
 
 #  define _LIBCPP_INTRODUCED_IN_LLVM_10 1
-#  define _LIBCPP_INTRODUCED_IN_LLVM_10_MARKUP /* nothing */
+#  define _LIBCPP_INTRODUCED_IN_LLVM_10_ATTRIBUTE /* nothing */
 
 #  define _LIBCPP_INTRODUCED_IN_LLVM_12 1
-#  define _LIBCPP_INTRODUCED_IN_LLVM_12_MARKUP /* nothing */
+#  define _LIBCPP_INTRODUCED_IN_LLVM_12_ATTRIBUTE /* nothing */
 
 #  define _LIBCPP_INTRODUCED_IN_LLVM_14 1
-#  define _LIBCPP_INTRODUCED_IN_LLVM_14_MARKUP /* nothing */
+#  define _LIBCPP_INTRODUCED_IN_LLVM_14_ATTRIBUTE /* nothing */
 
 #  define _LIBCPP_INTRODUCED_IN_LLVM_15 1
-#  define _LIBCPP_INTRODUCED_IN_LLVM_15_MARKUP /* nothing */
+#  define _LIBCPP_INTRODUCED_IN_LLVM_15_ATTRIBUTE /* nothing */
 
 #  define _LIBCPP_INTRODUCED_IN_LLVM_16 1
-#  define _LIBCPP_INTRODUCED_IN_LLVM_16_MARKUP /* nothing */
+#  define _LIBCPP_INTRODUCED_IN_LLVM_16_ATTRIBUTE /* nothing */
 
 #  define _LIBCPP_INTRODUCED_IN_LLVM_18 1
-#  define _LIBCPP_INTRODUCED_IN_LLVM_18_MARKUP /* nothing */
+#  define _LIBCPP_INTRODUCED_IN_LLVM_18_ATTRIBUTE /* nothing */
 
 #  define _LIBCPP_INTRODUCED_IN_LLVM_19 1
-#  define _LIBCPP_INTRODUCED_IN_LLVM_19_MARKUP /* nothing */
+#  define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE /* nothing */
 
 #elif defined(__APPLE__)
 
 // LLVM 4
 #  if defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 50000
 #    define _LIBCPP_INTRODUCED_IN_LLVM_4 0
-#    define _LIBCPP_INTRODUCED_IN_LLVM_4_MARKUP __attribute__((availability(watchos, strict, introduced = 5.0)))
+#    define _LIBCPP_INTRODUCED_IN_LLVM_4_ATTRIBUTE __attribute__((availability(watchos, strict, introduced = 5.0)))
 #  else
 #    define _LIBCPP_INTRODUCED_IN_LLVM_4 1
-#    define _LIBCPP_INTRODUCED_IN_LLVM_4_MARKUP /* nothing */
+#    define _LIBCPP_INTRODUCED_IN_LLVM_4_ATTRIBUTE /* nothing */
 #  endif
 
 // LLVM 9
@@ -134,18 +134,18 @@
       (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 60000)
 // clang-format on
 #    define _LIBCPP_INTRODUCED_IN_LLVM_9 0
-#    define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP                                                                        \
+#    define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE                                                                     \
       __attribute__((availability(macos, strict, introduced = 10.15)))                                                 \
       __attribute__((availability(ios, strict, introduced = 13.0)))                                                    \
       __attribute__((availability(tvos, strict, introduced = 13.0)))                                                   \
       __attribute__((availability(watchos, strict, introduced = 6.0)))
 // clang-format off
-#    define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_PUSH                                                                               \
+#    define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_PUSH                                                                               \
       _Pragma("clang attribute push(__attribute__((availability(macos,strict,introduced=10.15))), apply_to=any(function,record))") \
       _Pragma("clang attribute push(__attribute__((availability(ios,strict,introduced=13.0))), apply_to=any(function,record))")    \
       _Pragma("clang attribute push(__attribute__((availability(tvos,strict,introduced=13.0))), apply_to=any(function,record))")   \
       _Pragma("clang attribute push(__attribute__((availability(watchos,strict,introduced=6.0))), apply_to=any(function,record))")
-#    define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_POP                                                                    \
+#    define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_POP                                                                    \
       _Pragma("clang attribute pop") \
       _Pragma("clang attribute pop") \
       _Pragma("clang attribute pop") \
@@ -153,9 +153,9 @@
 // clang-format on
 #  else
 #    define _LIBCPP_INTRODUCED_IN_LLVM_9 1
-#    define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP      /* nothing */
-#    define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_PUSH /* nothing */
-#    define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_POP  /* nothing */
+#    define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE      /* nothing */
+#    define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_PUSH /* nothing */
+#    define _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_POP  /* nothing */
 #  endif
 
 // LLVM 10
@@ -166,14 +166,14 @@
       (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 70000)
 // clang-format on
 #    define _LIBCPP_INTRODUCED_IN_LLVM_10 0
-#    define _LIBCPP_INTRODUCED_IN_LLVM_10_MARKUP                                                                       \
+#    define _LIBCPP_INTRODUCED_IN_LLVM_10_ATTRIBUTE                                                                    \
       __attribute__((availability(macos, strict, introduced = 11.0)))                                                  \
       __attribute__((availability(ios, strict, introduced = 14.0)))                                                    \
       __attribute__((availability(tvos, strict, introduced = 14.0)))                                                   \
       __attribute__((availability(watchos, strict, introduced = 7.0)))
 #  else
 #    define _LIBCPP_INTRODUCED_IN_LLVM_10 1
-#    define _LIBCPP_INTRODUCED_IN_LLVM_10_MARKUP /* nothing */
+#    define _LIBCPP_INTRODUCED_IN_LLVM_10_ATTRIBUTE /* nothing */
 #  endif
 
 // LLVM 12
@@ -184,14 +184,14 @@
       (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 80000)
 // clang-format on
 #    define _LIBCPP_INTRODUCED_IN_LLVM_12 0
-#    define _LIBCPP_INTRODUCED_IN_LLVM_12_MARKUP                                                                       \
+#    define _LIBCPP_INTRODUCED_IN_LLVM_12_ATTRIBUTE                                                                    \
       __attribute__((availability(macos, strict, introduced = 12.0)))                                                  \
       __attribute__((availability(ios, strict, introduced = 15.0)))                                                    \
       __attribute__((availability(tvos, strict, introduced = 15.0)))                                                   \
       __attribute__((availability(watchos, strict, introduced = 8.0)))
 #  else
 #    define _LIBCPP_INTRODUCED_IN_LLVM_12 1
-#    define _LIBCPP_INTRODUCED_IN_LLVM_12_MARKUP /* nothing */
+#    define _LIBCPP_INTRODUCED_IN_LLVM_12_ATTRIBUTE /* nothing */
 #  endif
 
 // LLVM 14
@@ -202,19 +202,19 @@
       (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 90500)
 // clang-format on
 #    define _LIBCPP_INTRODUCED_IN_LLVM_14 0
-#    define _LIBCPP_INTRODUCED_IN_LLVM_14_MARKUP                                                                       \
+#    define _LIBCPP_INTRODUCED_IN_LLVM_14_ATTRIBUTE                                                                    \
       __attribute__((availability(macos, strict, introduced = 13.4)))                                                  \
       __attribute__((availability(ios, strict, introduced = 16.5)))                                                    \
       __attribute__((availability(tvos, strict, introduced = 16.5)))                                                   \
       __attribute__((availability(watchos, strict, introduced = 9.5)))
 #  else
 #    define _LIBCPP_INTRODUCED_IN_LLVM_14 1
-#    define _LIBCPP_INTRODUCED_IN_LLVM_14_MARKUP /* nothing */
+#    define _LIBCPP_INTRODUCED_IN_LLVM_14_ATTRIBUTE /* nothing */
 #  endif
 
 // LLVM 15-16
 #  define _LIBCPP_INTRODUCED_IN_LLVM_15 _LIBCPP_INTRODUCED_IN_LLVM_16
-#  define _LIBCPP_INTRODUCED_IN_LLVM_15_MARKUP _LIBCPP_INTRODUCED_IN_LLVM_16_MARKUP
+#  define _LIBCPP_INTRODUCED_IN_LLVM_15_ATTRIBUTE _LIBCPP_INTRODUCED_IN_LLVM_16_ATTRIBUTE
 // clang-format off
 #  if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 140000) ||   \
       (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 170000) || \
@@ -222,34 +222,34 @@
       (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 100000)
 // clang-format on
 #    define _LIBCPP_INTRODUCED_IN_LLVM_16 0
-#    define _LIBCPP_INTRODUCED_IN_LLVM_16_MARKUP                                                                       \
+#    define _LIBCPP_INTRODUCED_IN_LLVM_16_ATTRIBUTE                                                                    \
       __attribute__((availability(macos, strict, introduced = 14.0)))                                                  \
       __attribute__((availability(ios, strict, introduced = 17.0)))                                                    \
       __attribute__((availability(tvos, strict, introduced = 17.0)))                                                   \
       __attribute__((availability(watchos, strict, introduced = 10.0)))
 #  else
 #    define _LIBCPP_INTRODUCED_IN_LLVM_16 1
-#    define _LIBCPP_INTRODUCED_IN_LLVM_16_MARKUP /* nothing */
+#    define _LIBCPP_INTRODUCED_IN_LLVM_16_ATTRIBUTE /* nothing */
 #  endif
 
 // LLVM 18
 // TODO: Fill this in
 #  if 1
 #    define _LIBCPP_INTRODUCED_IN_LLVM_18 0
-#    define _LIBCPP_INTRODUCED_IN_LLVM_18_MARKUP __attribute__((unavailable))
+#    define _LIBCPP_INTRODUCED_IN_LLVM_18_ATTRIBUTE __attribute__((unavailable))
 #  else
 #    define _LIBCPP_INTRODUCED_IN_LLVM_18 1
-#    define _LIBCPP_INTRODUCED_IN_LLVM_18_MARKUP /* nothing */
+#    define _LIBCPP_INTRODUCED_IN_LLVM_18_ATTRIBUTE /* nothing */
 #  endif
 
 // LLVM 19
 // TODO: Fill this in
 #  if 1
 #    define _LIBCPP_INTRODUCED_IN_LLVM_19 0
-#    define _LIBCPP_INTRODUCED_IN_LLVM_19_MARKUP __attribute__((unavailable))
+#    define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE __attribute__((unavailable))
 #  else
 #    define _LIBCPP_INTRODUCED_IN_LLVM_19 1
-#    define _LIBCPP_INTRODUCED_IN_LLVM_19_MARKUP /* nothing */
+#    define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE /* nothing */
 #  endif
 
 #else
@@ -270,27 +270,27 @@
 // these exceptions can be used even on older deployment targets, but those
 // methods will abort instead of throwing.
 #define _LIBCPP_AVAILABILITY_HAS_BAD_OPTIONAL_ACCESS _LIBCPP_INTRODUCED_IN_LLVM_4
-#define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS _LIBCPP_INTRODUCED_IN_LLVM_4_MARKUP
+#define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS _LIBCPP_INTRODUCED_IN_LLVM_4_ATTRIBUTE
 
 #define _LIBCPP_AVAILABILITY_HAS_BAD_VARIANT_ACCESS _LIBCPP_INTRODUCED_IN_LLVM_4
-#define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS _LIBCPP_INTRODUCED_IN_LLVM_4_MARKUP
+#define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS _LIBCPP_INTRODUCED_IN_LLVM_4_ATTRIBUTE
 
 #define _LIBCPP_AVAILABILITY_HAS_BAD_ANY_CAST _LIBCPP_INTRODUCED_IN_LLVM_4
-#define _LIBCPP_AVAILABILITY_BAD_ANY_CAST _LIBCPP_INTRODUCED_IN_LLVM_4_MARKUP
+#define _LIBCPP_AVAILABILITY_BAD_ANY_CAST _LIBCPP_INTRODUCED_IN_LLVM_4_ATTRIBUTE
 
 // These macros control the availability of all parts of <filesystem> that
 // depend on something in the dylib.
 #define _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY _LIBCPP_INTRODUCED_IN_LLVM_9
-#define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP
-#define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY_PUSH _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_PUSH
-#define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY_POP _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_POP
+#define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE
+#define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY_PUSH _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_PUSH
+#define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY_POP _LIBCPP_INTRODUCED_IN_LLVM_9_ATTRIBUTE_POP
 
 // This controls the availability of the C++20 synchronization library,
 // which requires shared library support for various operations
 // (see libcxx/src/atomic.cpp). This includes <barier>, <latch>,
 // <semaphore>, and notification functions on std::atomic.
 #define _LIBCPP_AVAILABILITY_HAS_SYNC _LIBCPP_INTRODUCED_IN_LLVM_10
-#define _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INTRODUCED_IN_LLVM_10_MARKUP
+#define _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INTRODUCED_IN_LLVM_10_ATTRIBUTE
 
 // Enable additional explicit instantiations of iostreams components. This
 // reduces the number of weak definitions generated in programs that use
@@ -308,13 +308,13 @@
 // This controls the availability of floating-point std::to_chars functions.
 // These overloads were added later than the integer overloads.
 #define _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT _LIBCPP_INTRODUCED_IN_LLVM_14
-#define _LIBCPP_AVAILABILITY_TO_CHARS_FLOATING_POINT _LIBCPP_INTRODUCED_IN_LLVM_14_MARKUP
+#define _LIBCPP_AVAILABILITY_TO_CHARS_FLOATING_POINT _LIBCPP_INTRODUCED_IN_LLVM_14_ATTRIBUTE
 
 // This controls whether the library claims to provide a default verbose
 // termination function, and consequently whether the headers will try
 // to use it when the mechanism isn't overriden at compile-time.
 #define _LIBCPP_AVAILABILITY_HAS_VERBOSE_ABORT _LIBCPP_INTRODUCED_IN_LLVM_15
-#define _LIBCPP_AVAILABILITY_VERBOSE_ABORT _LIBCPP_INTRODUCED_IN_LLVM_15_MARKUP
+#define _LIBCPP_AVAILABILITY_VERBOSE_ABORT _LIBCPP_INTRODUCED_IN_LLVM_15_ATTRIBUTE
 
 // This controls the availability of the C++17 std::pmr library,
 // which is implemented in large part in the built library.
@@ -330,27 +330,27 @@
 // in the built library, which std::make_exception_ptr might use
 // (see libcxx/include/__exception/exception_ptr.h).
 #define _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION _LIBCPP_INTRODUCED_IN_LLVM_18
-#define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION _LIBCPP_INTRODUCED_IN_LLVM_18_MARKUP
+#define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION _LIBCPP_INTRODUCED_IN_LLVM_18_ATTRIBUTE
 
 // This controls the availability of C++23 <print>, which
 // has a dependency on the built library (it needs access to
 // the underlying buffer types of std::cout, std::cerr, and std::clog.
 #define _LIBCPP_AVAILABILITY_HAS_PRINT _LIBCPP_INTRODUCED_IN_LLVM_18
-#define _LIBCPP_AVAILABILITY_PRINT _LIBCPP_INTRODUCED_IN_LLVM_18_MARKUP
+#define _LIBCPP_AVAILABILITY_PRINT _LIBCPP_INTRODUCED_IN_LLVM_18_ATTRIBUTE
 
 // This controls the availability of the C++20 time zone database.
 // The parser code is built in the library.
 #define _LIBCPP_AVAILABILITY_HAS_TZDB _LIBCPP_INTRODUCED_IN_LLVM_19
-#define _LIBCPP_AVAILABILITY_TZDB _LIBCPP_INTRODUCED_IN_LLVM_19_MARKUP
+#define _LIBCPP_AVAILABILITY_TZDB _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE
 
 // These macros determine whether we assume that std::bad_function_call and
 // std::bad_expected_access provide a key function in the dylib. This allows
 // centralizing their vtable and typeinfo instead of having all TUs provide
 // a weak definition that then gets deduplicated.
-#  define _LIBCPP_AVAILABILITY_HAS_BAD_FUNCTION_CALL_KEY_FUNCTION _LIBCPP_INTRODUCED_IN_LLVM_19
-#  define _LIBCPP_AVAILABILITY_BAD_FUNCTION_CALL_KEY_FUNCTION _LIBCPP_INTRODUCED_IN_LLVM_19_MARKUP
-#  define _LIBCPP_AVAILABILITY_HAS_BAD_EXPECTED_ACCESS_KEY_FUNCTION _LIBCPP_INTRODUCED_IN_LLVM_19
-#  define _LIBCPP_AVAILABILITY_BAD_EXPECTED_ACCESS_KEY_FUNCTION _LIBCPP_INTRODUCED_IN_LLVM_19_MARKUP
+#define _LIBCPP_AVAILABILITY_HAS_BAD_FUNCTION_CALL_KEY_FUNCTION _LIBCPP_INTRODUCED_IN_LLVM_19
+#define _LIBCPP_AVAILABILITY_BAD_FUNCTION_CALL_KEY_FUNCTION _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE
+#define _LIBCPP_AVAILABILITY_HAS_BAD_EXPECTED_ACCESS_KEY_FUNCTION _LIBCPP_INTRODUCED_IN_LLVM_19
+#define _LIBCPP_AVAILABILITY_BAD_EXPECTED_ACCESS_KEY_FUNCTION _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE
 
 // Define availability attributes that depend on _LIBCPP_HAS_NO_EXCEPTIONS.
 // Those are defined in terms of the availability attributes above, and
diff --git a/libcxx/include/__config b/libcxx/include/__config
index e4c5c685a456..104a244cc82c 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -334,26 +334,16 @@ _LIBCPP_HARDENING_MODE_DEBUG
 #    define _LIBCPP_CXX03_LANG
 #  endif
 
-#  ifndef __has_attribute
-#    define __has_attribute(__x) 0
-#  endif
-
-#  ifndef __has_builtin
-#    define __has_builtin(__x) 0
-#  endif
-
+// TODO: Remove once we switch to GCC 14
 #  ifndef __has_extension
 #    define __has_extension(__x) 0
 #  endif
 
+// TODO: Remove once we switch to GCC 14
 #  ifndef __has_feature
 #    define __has_feature(__x) 0
 #  endif
 
-#  ifndef __has_cpp_attribute
-#    define __has_cpp_attribute(__x) 0
-#  endif
-
 #  ifndef __has_constexpr_builtin
 #    define __has_constexpr_builtin(x) 0
 #  endif
@@ -375,10 +365,6 @@ _LIBCPP_HARDENING_MODE_DEBUG
 
 #  define __has_keyword(__x) !(__is_identifier(__x))
 
-#  ifndef __has_include
-#    define __has_include(...) 0
-#  endif
-
 #  ifndef __has_warning
 #    define __has_warning(...) 0
 #  endif
diff --git a/libcxx/include/__functional/is_transparent.h b/libcxx/include/__functional/is_transparent.h
index 13fc94f71c6b..b2d62f2e3ead 100644
--- a/libcxx/include/__functional/is_transparent.h
+++ b/libcxx/include/__functional/is_transparent.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___FUNCTIONAL_IS_TRANSPARENT
 
 #include <__config>
-#include <__type_traits/integral_constant.h>
 #include <__type_traits/void_t.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -23,10 +22,10 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if _LIBCPP_STD_VER >= 14
 
 template <class _Tp, class, class = void>
-struct __is_transparent : false_type {};
+inline const bool __is_transparent_v = false;
 
 template <class _Tp, class _Up>
-struct __is_transparent<_Tp, _Up, __void_t<typename _Tp::is_transparent> > : true_type {};
+inline const bool __is_transparent_v<_Tp, _Up, __void_t<typename _Tp::is_transparent> > = true;
 
 #endif
 
diff --git a/libcxx/include/__functional/reference_wrapper.h b/libcxx/include/__functional/reference_wrapper.h
index 94b39e3bc786..ab5d7c7cee11 100644
--- a/libcxx/include/__functional/reference_wrapper.h
+++ b/libcxx/include/__functional/reference_wrapper.h
@@ -10,11 +10,14 @@
 #ifndef _LIBCPP___FUNCTIONAL_REFERENCE_WRAPPER_H
 #define _LIBCPP___FUNCTIONAL_REFERENCE_WRAPPER_H
 
+#include <__compare/synth_three_way.h>
+#include <__concepts/boolean_testable.h>
 #include <__config>
 #include <__functional/invoke.h>
 #include <__functional/weak_result_type.h>
 #include <__memory/addressof.h>
 #include <__type_traits/enable_if.h>
+#include <__type_traits/is_const.h>
 #include <__type_traits/remove_cvref.h>
 #include <__type_traits/void_t.h>
 #include <__utility/declval.h>
@@ -64,6 +67,54 @@ public:
   {
     return std::__invoke(get(), std::forward<_ArgTypes>(__args)...);
   }
+
+#if _LIBCPP_STD_VER >= 26
+
+  // [refwrap.comparisons], comparisons
+
+  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(reference_wrapper __x, reference_wrapper __y)
+    requires requires {
+      { __x.get() == __y.get() } -> __boolean_testable;
+    }
+  {
+    return __x.get() == __y.get();
+  }
+
+  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(reference_wrapper __x, const _Tp& __y)
+    requires requires {
+      { __x.get() == __y } -> __boolean_testable;
+    }
+  {
+    return __x.get() == __y;
+  }
+
+  _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(reference_wrapper __x, reference_wrapper<const _Tp> __y)
+    requires(!is_const_v<_Tp>) && requires {
+      { __x.get() == __y.get() } -> __boolean_testable;
+    }
+  {
+    return __x.get() == __y.get();
+  }
+
+  _LIBCPP_HIDE_FROM_ABI friend constexpr auto operator<=>(reference_wrapper __x, reference_wrapper __y)
+    requires requires { std::__synth_three_way(__x.get(), __y.get()); }
+  {
+    return std::__synth_three_way(__x.get(), __y.get());
+  }
+
+  _LIBCPP_HIDE_FROM_ABI friend constexpr auto operator<=>(reference_wrapper __x, const _Tp& __y)
+    requires requires { std::__synth_three_way(__x.get(), __y); }
+  {
+    return std::__synth_three_way(__x.get(), __y);
+  }
+
+  _LIBCPP_HIDE_FROM_ABI friend constexpr auto operator<=>(reference_wrapper __x, reference_wrapper<const _Tp> __y)
+    requires(!is_const_v<_Tp>) && requires { std::__synth_three_way(__x.get(), __y.get()); }
+  {
+    return std::__synth_three_way(__x.get(), __y.get());
+  }
+
+#endif // _LIBCPP_STD_VER >= 26
 };
 
 #if _LIBCPP_STD_VER >= 17
diff --git a/libcxx/include/__iterator/aliasing_iterator.h b/libcxx/include/__iterator/aliasing_iterator.h
new file mode 100644
index 000000000000..94ba577078b5
--- /dev/null
+++ b/libcxx/include/__iterator/aliasing_iterator.h
@@ -0,0 +1,127 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ITERATOR_ALIASING_ITERATOR_H
+#define _LIBCPP___ITERATOR_ALIASING_ITERATOR_H
+
+#include <__config>
+#include <__iterator/iterator_traits.h>
+#include <__memory/pointer_traits.h>
+#include <__type_traits/is_trivial.h>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+// This iterator wrapper is used to type-pun an iterator to return a different type. This is done without UB by not
+// actually punning the type, but instead inspecting the object representation of the base type and copying that into
+// an instance of the alias type. For that reason the alias type has to be trivial. The alias is returned as a prvalue
+// when derferencing the iterator, since it is temporary storage. This wrapper is used to vectorize some algorithms.
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _BaseIter, class _Alias>
+struct __aliasing_iterator_wrapper {
+  class __iterator {
+    _BaseIter __base_ = nullptr;
+
+    using __iter_traits     = iterator_traits<_BaseIter>;
+    using __base_value_type = typename __iter_traits::value_type;
+
+    static_assert(__has_random_access_iterator_category<_BaseIter>::value,
+                  "The base iterator has to be a random access iterator!");
+
+  public:
+    using iterator_category = random_access_iterator_tag;
+    using value_type        = _Alias;
+    using difference_type   = ptrdiff_t;
+    using reference         = value_type&;
+    using pointer           = value_type*;
+
+    static_assert(is_trivial<value_type>::value);
+    static_assert(sizeof(__base_value_type) == sizeof(value_type));
+
+    _LIBCPP_HIDE_FROM_ABI __iterator() = default;
+    _LIBCPP_HIDE_FROM_ABI __iterator(_BaseIter __base) _NOEXCEPT : __base_(__base) {}
+
+    _LIBCPP_HIDE_FROM_ABI __iterator& operator++() _NOEXCEPT {
+      ++__base_;
+      return *this;
+    }
+
+    _LIBCPP_HIDE_FROM_ABI __iterator operator++(int) _NOEXCEPT {
+      __iterator __tmp(*this);
+      ++__base_;
+      return __tmp;
+    }
+
+    _LIBCPP_HIDE_FROM_ABI __iterator& operator--() _NOEXCEPT {
+      --__base_;
+      return *this;
+    }
+
+    _LIBCPP_HIDE_FROM_ABI __iterator operator--(int) _NOEXCEPT {
+      __iterator __tmp(*this);
+      --__base_;
+      return __tmp;
+    }
+
+    _LIBCPP_HIDE_FROM_ABI friend __iterator operator+(__iterator __iter, difference_type __n) _NOEXCEPT {
+      return __iterator(__iter.__base_ + __n);
+    }
+
+    _LIBCPP_HIDE_FROM_ABI friend __iterator operator+(difference_type __n, __iterator __iter) _NOEXCEPT {
+      return __iterator(__n + __iter.__base_);
+    }
+
+    _LIBCPP_HIDE_FROM_ABI __iterator& operator+=(difference_type __n) _NOEXCEPT {
+      __base_ += __n;
+      return *this;
+    }
+
+    _LIBCPP_HIDE_FROM_ABI friend __iterator operator-(__iterator __iter, difference_type __n) _NOEXCEPT {
+      return __iterator(__iter.__base_ - __n);
+    }
+
+    _LIBCPP_HIDE_FROM_ABI friend difference_type operator-(__iterator __lhs, __iterator __rhs) _NOEXCEPT {
+      return __lhs.__base_ - __rhs.__base_;
+    }
+
+    _LIBCPP_HIDE_FROM_ABI __iterator& operator-=(difference_type __n) _NOEXCEPT {
+      __base_ -= __n;
+      return *this;
+    }
+
+    _LIBCPP_HIDE_FROM_ABI _BaseIter __base() const _NOEXCEPT { return __base_; }
+
+    _LIBCPP_HIDE_FROM_ABI _Alias operator*() const _NOEXCEPT {
+      _Alias __val;
+      __builtin_memcpy(&__val, std::__to_address(__base_), sizeof(value_type));
+      return __val;
+    }
+
+    _LIBCPP_HIDE_FROM_ABI value_type operator[](difference_type __n) const _NOEXCEPT { return *(*this + __n); }
+
+    _LIBCPP_HIDE_FROM_ABI friend bool operator==(const __iterator& __lhs, const __iterator& __rhs) _NOEXCEPT {
+      return __lhs.__base_ == __rhs.__base_;
+    }
+
+    _LIBCPP_HIDE_FROM_ABI friend bool operator!=(const __iterator& __lhs, const __iterator& __rhs) _NOEXCEPT {
+      return __lhs.__base_ != __rhs.__base_;
+    }
+  };
+};
+
+// This is required to avoid ADL instantiations on _BaseT
+template <class _BaseT, class _Alias>
+using __aliasing_iterator = typename __aliasing_iterator_wrapper<_BaseT, _Alias>::__iterator;
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___ITERATOR_ALIASING_ITERATOR_H
diff --git a/libcxx/include/__numeric/gcd_lcm.h b/libcxx/include/__numeric/gcd_lcm.h
index 48df2338051e..5d735a51a47e 100644
--- a/libcxx/include/__numeric/gcd_lcm.h
+++ b/libcxx/include/__numeric/gcd_lcm.h
@@ -10,7 +10,9 @@
 #ifndef _LIBCPP___NUMERIC_GCD_LCM_H
 #define _LIBCPP___NUMERIC_GCD_LCM_H
 
+#include <__algorithm/min.h>
 #include <__assert>
+#include <__bit/countr.h>
 #include <__config>
 #include <__type_traits/common_type.h>
 #include <__type_traits/is_integral.h>
@@ -50,9 +52,47 @@ struct __ct_abs<_Result, _Source, false> {
 };
 
 template <class _Tp>
-_LIBCPP_CONSTEXPR _LIBCPP_HIDDEN _Tp __gcd(_Tp __m, _Tp __n) {
+_LIBCPP_CONSTEXPR _LIBCPP_HIDDEN _Tp __gcd(_Tp __a, _Tp __b) {
   static_assert((!is_signed<_Tp>::value), "");
-  return __n == 0 ? __m : std::__gcd<_Tp>(__n, __m % __n);
+
+  // From: https://lemire.me/blog/2013/12/26/fastest-way-to-compute-the-greatest-common-divisor
+  //
+  // If power of two divides both numbers, we can push it out.
+  // - gcd( 2^x * a, 2^x * b) = 2^x * gcd(a, b)
+  //
+  // If and only if exactly one number is even, we can divide that number by that power.
+  // - if a, b are odd, then gcd(2^x * a, b) = gcd(a, b)
+  //
+  // And standard gcd algorithm where instead of modulo, minus is used.
+
+  if (__a < __b) {
+    _Tp __tmp = __b;
+    __b       = __a;
+    __a       = __tmp;
+  }
+  if (__b == 0)
+    return __a;
+  __a %= __b; // Make both argument of the same size, and early result in the easy case.
+  if (__a == 0)
+    return __b;
+
+  int __az    = std::__countr_zero(__a);
+  int __bz    = std::__countr_zero(__b);
+  int __shift = std::min(__az, __bz);
+  __a >>= __az;
+  __b >>= __bz;
+  do {
+    _Tp __diff = __a - __b;
+    if (__a > __b) {
+      __a = __b;
+      __b = __diff;
+    } else {
+      __b = __b - __a;
+    }
+    if (__diff != 0)
+      __b >>= std::__countr_zero(__diff);
+  } while (__b != 0);
+  return __a << __shift;
 }
 
 template <class _Tp, class _Up>
diff --git a/libcxx/include/__ostream/basic_ostream.h b/libcxx/include/__ostream/basic_ostream.h
new file mode 100644
index 000000000000..697192bfb46b
--- /dev/null
+++ b/libcxx/include/__ostream/basic_ostream.h
@@ -0,0 +1,860 @@
+//===---------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___OSTREAM_BASIC_OSTREAM_H
+#define _LIBCPP___OSTREAM_BASIC_OSTREAM_H
+
+#include <__config>
+#include <__exception/operations.h>
+#include <__memory/shared_ptr.h>
+#include <__memory/unique_ptr.h>
+#include <__system_error/error_code.h>
+#include <__type_traits/conjunction.h>
+#include <__type_traits/enable_if.h>
+#include <__type_traits/is_base_of.h>
+#include <__type_traits/void_t.h>
+#include <__utility/declval.h>
+#include <bitset>
+#include <cstddef>
+#include <ios>
+#include <locale>
+#include <new> // for __throw_bad_alloc
+#include <streambuf>
+#include <string_view>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _CharT, class _Traits>
+class _LIBCPP_TEMPLATE_VIS basic_ostream : virtual public basic_ios<_CharT, _Traits> {
+public:
+  // types (inherited from basic_ios (27.5.4)):
+  typedef _CharT char_type;
+  typedef _Traits traits_type;
+  typedef typename traits_type::int_type int_type;
+  typedef typename traits_type::pos_type pos_type;
+  typedef typename traits_type::off_type off_type;
+
+  // 27.7.2.2 Constructor/destructor:
+  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 explicit basic_ostream(basic_streambuf<char_type, traits_type>* __sb) {
+    this->init(__sb);
+  }
+  ~basic_ostream() override;
+
+protected:
+  inline _LIBCPP_HIDE_FROM_ABI basic_ostream(basic_ostream&& __rhs);
+
+  // 27.7.2.3 Assign/swap
+  inline _LIBCPP_HIDE_FROM_ABI basic_ostream& operator=(basic_ostream&& __rhs);
+
+  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 void swap(basic_ostream& __rhs) {
+    basic_ios<char_type, traits_type>::swap(__rhs);
+  }
+
+  basic_ostream(const basic_ostream& __rhs)            = delete;
+  basic_ostream& operator=(const basic_ostream& __rhs) = delete;
+
+public:
+  // 27.7.2.4 Prefix/suffix:
+  class _LIBCPP_TEMPLATE_VIS sentry;
+
+  // 27.7.2.6 Formatted output:
+  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 basic_ostream& operator<<(basic_ostream& (*__pf)(basic_ostream&)) {
+    return __pf(*this);
+  }
+
+  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 basic_ostream&
+  operator<<(basic_ios<char_type, traits_type>& (*__pf)(basic_ios<char_type, traits_type>&)) {
+    __pf(*this);
+    return *this;
+  }
+
+  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 basic_ostream& operator<<(ios_base& (*__pf)(ios_base&)) {
+    __pf(*this);
+    return *this;
+  }
+
+  basic_ostream& operator<<(bool __n);
+  basic_ostream& operator<<(short __n);
+  basic_ostream& operator<<(unsigned short __n);
+  basic_ostream& operator<<(int __n);
+  basic_ostream& operator<<(unsigned int __n);
+  basic_ostream& operator<<(long __n);
+  basic_ostream& operator<<(unsigned long __n);
+  basic_ostream& operator<<(long long __n);
+  basic_ostream& operator<<(unsigned long long __n);
+  basic_ostream& operator<<(float __f);
+  basic_ostream& operator<<(double __f);
+  basic_ostream& operator<<(long double __f);
+  basic_ostream& operator<<(const void* __p);
+
+#if _LIBCPP_STD_VER >= 23
+  _LIBCPP_HIDE_FROM_ABI basic_ostream& operator<<(const volatile void* __p) {
+    return operator<<(const_cast<const void*>(__p));
+  }
+#endif
+
+  basic_ostream& operator<<(basic_streambuf<char_type, traits_type>* __sb);
+
+#if _LIBCPP_STD_VER >= 17
+  // LWG 2221 - nullptr. This is not backported to older standards modes.
+  // See https://reviews.llvm.org/D127033 for more info on the rationale.
+  _LIBCPP_HIDE_FROM_ABI basic_ostream& operator<<(nullptr_t) { return *this << "nullptr"; }
+#endif
+
+  // 27.7.2.7 Unformatted output:
+  basic_ostream& put(char_type __c);
+  basic_ostream& write(const char_type* __s, streamsize __n);
+  basic_ostream& flush();
+
+  // 27.7.2.5 seeks:
+  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 pos_type tellp();
+  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 basic_ostream& seekp(pos_type __pos);
+  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 basic_ostream& seekp(off_type __off, ios_base::seekdir __dir);
+
+protected:
+  _LIBCPP_HIDE_FROM_ABI basic_ostream() {} // extension, intentially does not initialize
+};
+
+template <class _CharT, class _Traits>
+class _LIBCPP_TEMPLATE_VIS basic_ostream<_CharT, _Traits>::sentry {
+  bool __ok_;
+  basic_ostream<_CharT, _Traits>& __os_;
+
+public:
+  explicit sentry(basic_ostream<_CharT, _Traits>& __os);
+  ~sentry();
+  sentry(const sentry&)            = delete;
+  sentry& operator=(const sentry&) = delete;
+
+  _LIBCPP_HIDE_FROM_ABI explicit operator bool() const { return __ok_; }
+};
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>::sentry::sentry(basic_ostream<_CharT, _Traits>& __os) : __ok_(false), __os_(__os) {
+  if (__os.good()) {
+    if (__os.tie())
+      __os.tie()->flush();
+    __ok_ = true;
+  }
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>::sentry::~sentry() {
+  if (__os_.rdbuf() && __os_.good() && (__os_.flags() & ios_base::unitbuf) && !uncaught_exception()) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+    try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+      if (__os_.rdbuf()->pubsync() == -1)
+        __os_.setstate(ios_base::badbit);
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+    } catch (...) {
+    }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  }
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>::basic_ostream(basic_ostream&& __rhs) {
+  this->move(__rhs);
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator=(basic_ostream&& __rhs) {
+  swap(__rhs);
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>::~basic_ostream() {}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>&
+basic_ostream<_CharT, _Traits>::operator<<(basic_streambuf<char_type, traits_type>* __sb) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      if (__sb) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+        try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+          typedef istreambuf_iterator<_CharT, _Traits> _Ip;
+          typedef ostreambuf_iterator<_CharT, _Traits> _Op;
+          _Ip __i(__sb);
+          _Ip __eof;
+          _Op __o(*this);
+          size_t __c = 0;
+          for (; __i != __eof; ++__i, ++__o, ++__c) {
+            *__o = *__i;
+            if (__o.failed())
+              break;
+          }
+          if (__c == 0)
+            this->setstate(ios_base::failbit);
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+        } catch (...) {
+          this->__set_failbit_and_consider_rethrow();
+        }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+      } else
+        this->setstate(ios_base::badbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(bool __n) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
+      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
+      if (__f.put(*this, *this, this->fill(), __n).failed())
+        this->setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(short __n) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      ios_base::fmtflags __flags = ios_base::flags() & ios_base::basefield;
+      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
+      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
+      if (__f.put(*this,
+                  *this,
+                  this->fill(),
+                  __flags == ios_base::oct || __flags == ios_base::hex
+                      ? static_cast<long>(static_cast<unsigned short>(__n))
+                      : static_cast<long>(__n))
+              .failed())
+        this->setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned short __n) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
+      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
+      if (__f.put(*this, *this, this->fill(), static_cast<unsigned long>(__n)).failed())
+        this->setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(int __n) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      ios_base::fmtflags __flags = ios_base::flags() & ios_base::basefield;
+      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
+      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
+      if (__f.put(*this,
+                  *this,
+                  this->fill(),
+                  __flags == ios_base::oct || __flags == ios_base::hex
+                      ? static_cast<long>(static_cast<unsigned int>(__n))
+                      : static_cast<long>(__n))
+              .failed())
+        this->setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned int __n) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
+      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
+      if (__f.put(*this, *this, this->fill(), static_cast<unsigned long>(__n)).failed())
+        this->setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long __n) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
+      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
+      if (__f.put(*this, *this, this->fill(), __n).failed())
+        this->setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned long __n) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
+      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
+      if (__f.put(*this, *this, this->fill(), __n).failed())
+        this->setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long long __n) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
+      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
+      if (__f.put(*this, *this, this->fill(), __n).failed())
+        this->setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned long long __n) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
+      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
+      if (__f.put(*this, *this, this->fill(), __n).failed())
+        this->setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(float __n) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
+      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
+      if (__f.put(*this, *this, this->fill(), static_cast<double>(__n)).failed())
+        this->setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(double __n) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
+      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
+      if (__f.put(*this, *this, this->fill(), __n).failed())
+        this->setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long double __n) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
+      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
+      if (__f.put(*this, *this, this->fill(), __n).failed())
+        this->setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(const void* __n) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
+      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
+      if (__f.put(*this, *this, this->fill(), __n).failed())
+        this->setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
+__put_character_sequence(basic_ostream<_CharT, _Traits>& __os, const _CharT* __str, size_t __len) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    typename basic_ostream<_CharT, _Traits>::sentry __s(__os);
+    if (__s) {
+      typedef ostreambuf_iterator<_CharT, _Traits> _Ip;
+      if (std::__pad_and_output(
+              _Ip(__os),
+              __str,
+              (__os.flags() & ios_base::adjustfield) == ios_base::left ? __str + __len : __str,
+              __str + __len,
+              __os,
+              __os.fill())
+              .failed())
+        __os.setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    __os.__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return __os;
+}
+
+template <class _CharT, class _Traits>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, _CharT __c) {
+  return std::__put_character_sequence(__os, &__c, 1);
+}
+
+template <class _CharT, class _Traits>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, char __cn) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    typename basic_ostream<_CharT, _Traits>::sentry __s(__os);
+    if (__s) {
+      _CharT __c = __os.widen(__cn);
+      typedef ostreambuf_iterator<_CharT, _Traits> _Ip;
+      if (std::__pad_and_output(
+              _Ip(__os),
+              &__c,
+              (__os.flags() & ios_base::adjustfield) == ios_base::left ? &__c + 1 : &__c,
+              &__c + 1,
+              __os,
+              __os.fill())
+              .failed())
+        __os.setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    __os.__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return __os;
+}
+
+template <class _Traits>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>& __os, char __c) {
+  return std::__put_character_sequence(__os, &__c, 1);
+}
+
+template <class _Traits>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>& __os, signed char __c) {
+  return std::__put_character_sequence(__os, (char*)&__c, 1);
+}
+
+template <class _Traits>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>& __os, unsigned char __c) {
+  return std::__put_character_sequence(__os, (char*)&__c, 1);
+}
+
+template <class _CharT, class _Traits>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
+operator<<(basic_ostream<_CharT, _Traits>& __os, const _CharT* __str) {
+  return std::__put_character_sequence(__os, __str, _Traits::length(__str));
+}
+
+template <class _CharT, class _Traits>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
+operator<<(basic_ostream<_CharT, _Traits>& __os, const char* __strn) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    typename basic_ostream<_CharT, _Traits>::sentry __s(__os);
+    if (__s) {
+      typedef ostreambuf_iterator<_CharT, _Traits> _Ip;
+      size_t __len   = char_traits<char>::length(__strn);
+      const int __bs = 100;
+      _CharT __wbb[__bs];
+      _CharT* __wb = __wbb;
+      unique_ptr<_CharT, void (*)(void*)> __h(0, free);
+      if (__len > __bs) {
+        __wb = (_CharT*)malloc(__len * sizeof(_CharT));
+        if (__wb == 0)
+          __throw_bad_alloc();
+        __h.reset(__wb);
+      }
+      for (_CharT* __p = __wb; *__strn != '\0'; ++__strn, ++__p)
+        *__p = __os.widen(*__strn);
+      if (std::__pad_and_output(
+              _Ip(__os),
+              __wb,
+              (__os.flags() & ios_base::adjustfield) == ios_base::left ? __wb + __len : __wb,
+              __wb + __len,
+              __os,
+              __os.fill())
+              .failed())
+        __os.setstate(ios_base::badbit | ios_base::failbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    __os.__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return __os;
+}
+
+template <class _Traits>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>& __os, const char* __str) {
+  return std::__put_character_sequence(__os, __str, _Traits::length(__str));
+}
+
+template <class _Traits>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<char, _Traits>&
+operator<<(basic_ostream<char, _Traits>& __os, const signed char* __str) {
+  const char* __s = (const char*)__str;
+  return std::__put_character_sequence(__os, __s, _Traits::length(__s));
+}
+
+template <class _Traits>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<char, _Traits>&
+operator<<(basic_ostream<char, _Traits>& __os, const unsigned char* __str) {
+  const char* __s = (const char*)__str;
+  return std::__put_character_sequence(__os, __s, _Traits::length(__s));
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::put(char_type __c) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __s(*this);
+    if (__s) {
+      typedef ostreambuf_iterator<_CharT, _Traits> _Op;
+      _Op __o(*this);
+      *__o = __c;
+      if (__o.failed())
+        this->setstate(ios_base::badbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::write(const char_type* __s, streamsize __n) {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    sentry __sen(*this);
+    if (__sen && __n) {
+      if (this->rdbuf()->sputn(__s, __n) != __n)
+        this->setstate(ios_base::badbit);
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::flush() {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    if (this->rdbuf()) {
+      sentry __s(*this);
+      if (__s) {
+        if (this->rdbuf()->pubsync() == -1)
+          this->setstate(ios_base::badbit);
+      }
+    }
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    this->__set_badbit_and_consider_rethrow();
+  }
+#endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+typename basic_ostream<_CharT, _Traits>::pos_type basic_ostream<_CharT, _Traits>::tellp() {
+  if (this->fail())
+    return pos_type(-1);
+  return this->rdbuf()->pubseekoff(0, ios_base::cur, ios_base::out);
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::seekp(pos_type __pos) {
+  sentry __s(*this);
+  if (!this->fail()) {
+    if (this->rdbuf()->pubseekpos(__pos, ios_base::out) == pos_type(-1))
+      this->setstate(ios_base::failbit);
+  }
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::seekp(off_type __off, ios_base::seekdir __dir) {
+  sentry __s(*this);
+  if (!this->fail()) {
+    if (this->rdbuf()->pubseekoff(__off, __dir, ios_base::out) == pos_type(-1))
+      this->setstate(ios_base::failbit);
+  }
+  return *this;
+}
+
+template <class _CharT, class _Traits>
+_LIBCPP_HIDE_FROM_ABI inline basic_ostream<_CharT, _Traits>& endl(basic_ostream<_CharT, _Traits>& __os) {
+  __os.put(__os.widen('\n'));
+  __os.flush();
+  return __os;
+}
+
+template <class _CharT, class _Traits>
+_LIBCPP_HIDE_FROM_ABI inline basic_ostream<_CharT, _Traits>& ends(basic_ostream<_CharT, _Traits>& __os) {
+  __os.put(_CharT());
+  return __os;
+}
+
+template <class _CharT, class _Traits>
+_LIBCPP_HIDE_FROM_ABI inline basic_ostream<_CharT, _Traits>& flush(basic_ostream<_CharT, _Traits>& __os) {
+  __os.flush();
+  return __os;
+}
+
+template <class _Stream, class _Tp, class = void>
+struct __is_ostreamable : false_type {};
+
+template <class _Stream, class _Tp>
+struct __is_ostreamable<_Stream, _Tp, decltype(std::declval<_Stream>() << std::declval<_Tp>(), void())> : true_type {};
+
+template <class _Stream,
+          class _Tp,
+          __enable_if_t<_And<is_base_of<ios_base, _Stream>, __is_ostreamable<_Stream&, const _Tp&> >::value, int> = 0>
+_LIBCPP_HIDE_FROM_ABI _Stream&& operator<<(_Stream&& __os, const _Tp& __x) {
+  __os << __x;
+  return std::move(__os);
+}
+
+template <class _CharT, class _Traits, class _Allocator>
+basic_ostream<_CharT, _Traits>&
+operator<<(basic_ostream<_CharT, _Traits>& __os, const basic_string<_CharT, _Traits, _Allocator>& __str) {
+  return std::__put_character_sequence(__os, __str.data(), __str.size());
+}
+
+template <class _CharT, class _Traits>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
+operator<<(basic_ostream<_CharT, _Traits>& __os, basic_string_view<_CharT, _Traits> __sv) {
+  return std::__put_character_sequence(__os, __sv.data(), __sv.size());
+}
+
+template <class _CharT, class _Traits>
+inline _LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
+operator<<(basic_ostream<_CharT, _Traits>& __os, const error_code& __ec) {
+  return __os << __ec.category().name() << ':' << __ec.value();
+}
+
+template <class _CharT, class _Traits, class _Yp>
+inline _LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
+operator<<(basic_ostream<_CharT, _Traits>& __os, shared_ptr<_Yp> const& __p) {
+  return __os << __p.get();
+}
+
+template <
+    class _CharT,
+    class _Traits,
+    class _Yp,
+    class _Dp,
+    __enable_if_t<is_same<void,
+                          __void_t<decltype((std::declval<basic_ostream<_CharT, _Traits>&>()
+                                             << std::declval<typename unique_ptr<_Yp, _Dp>::pointer>()))> >::value,
+                  int> = 0>
+inline _LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
+operator<<(basic_ostream<_CharT, _Traits>& __os, unique_ptr<_Yp, _Dp> const& __p) {
+  return __os << __p.get();
+}
+
+template <class _CharT, class _Traits, size_t _Size>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
+operator<<(basic_ostream<_CharT, _Traits>& __os, const bitset<_Size>& __x) {
+  return __os << __x.template to_string<_CharT, _Traits>(std::use_facet<ctype<_CharT> >(__os.getloc()).widen('0'),
+                                                         std::use_facet<ctype<_CharT> >(__os.getloc()).widen('1'));
+}
+
+#if _LIBCPP_STD_VER >= 20
+
+#  ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+template <class _Traits>
+basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, wchar_t) = delete;
+
+template <class _Traits>
+basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, const wchar_t*) = delete;
+
+template <class _Traits>
+basic_ostream<wchar_t, _Traits>& operator<<(basic_ostream<wchar_t, _Traits>&, char16_t) = delete;
+
+template <class _Traits>
+basic_ostream<wchar_t, _Traits>& operator<<(basic_ostream<wchar_t, _Traits>&, char32_t) = delete;
+
+template <class _Traits>
+basic_ostream<wchar_t, _Traits>& operator<<(basic_ostream<wchar_t, _Traits>&, const char16_t*) = delete;
+
+template <class _Traits>
+basic_ostream<wchar_t, _Traits>& operator<<(basic_ostream<wchar_t, _Traits>&, const char32_t*) = delete;
+
+#  endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
+
+#  ifndef _LIBCPP_HAS_NO_CHAR8_T
+template <class _Traits>
+basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, char8_t) = delete;
+
+template <class _Traits>
+basic_ostream<wchar_t, _Traits>& operator<<(basic_ostream<wchar_t, _Traits>&, char8_t) = delete;
+
+template <class _Traits>
+basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, const char8_t*) = delete;
+
+template <class _Traits>
+basic_ostream<wchar_t, _Traits>& operator<<(basic_ostream<wchar_t, _Traits>&, const char8_t*) = delete;
+#  endif
+
+template <class _Traits>
+basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, char16_t) = delete;
+
+template <class _Traits>
+basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, char32_t) = delete;
+
+template <class _Traits>
+basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, const char16_t*) = delete;
+
+template <class _Traits>
+basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, const char32_t*) = delete;
+
+#endif // _LIBCPP_STD_VER >= 20
+
+extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_ostream<char>;
+#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_ostream<wchar_t>;
+#endif
+
+_LIBCPP_END_NAMESPACE_STD
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___OSTREAM_BASIC_OSTREAM_H
diff --git a/libcxx/include/__ostream/print.h b/libcxx/include/__ostream/print.h
new file mode 100644
index 000000000000..97680cdab6da
--- /dev/null
+++ b/libcxx/include/__ostream/print.h
@@ -0,0 +1,180 @@
+//===---------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___OSTREAM_PRINT_H
+#define _LIBCPP___OSTREAM_PRINT_H
+
+#include <__availability>
+#include <__config>
+#include <__fwd/ostream.h>
+#include <__iterator/ostreambuf_iterator.h>
+#include <__ostream/basic_ostream.h>
+#include <format>
+#include <ios>
+#include <locale>
+#include <print>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if _LIBCPP_STD_VER >= 23
+
+template <class = void> // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563).
+_LIBCPP_HIDE_FROM_ABI inline void
+__vprint_nonunicode(ostream& __os, string_view __fmt, format_args __args, bool __write_nl) {
+  // [ostream.formatted.print]/3
+  // Effects: Behaves as a formatted output function
+  // ([ostream.formatted.reqmts]) of os, except that:
+  // - failure to generate output is reported as specified below, and
+  // - any exception thrown by the call to vformat is propagated without regard
+  //   to the value of os.exceptions() and without turning on ios_base::badbit
+  //   in the error state of os.
+  // After constructing a sentry object, the function initializes an automatic
+  // variable via
+  //   string out = vformat(os.getloc(), fmt, args);
+
+  ostream::sentry __s(__os);
+  if (__s) {
+    string __o = std::vformat(__os.getloc(), __fmt, __args);
+    if (__write_nl)
+      __o += '\n';
+
+    const char* __str = __o.data();
+    size_t __len      = __o.size();
+
+#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+    try {
+#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
+      typedef ostreambuf_iterator<char> _Ip;
+      if (std::__pad_and_output(
+              _Ip(__os),
+              __str,
+              (__os.flags() & ios_base::adjustfield) == ios_base::left ? __str + __len : __str,
+              __str + __len,
+              __os,
+              __os.fill())
+              .failed())
+        __os.setstate(ios_base::badbit | ios_base::failbit);
+
+#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+    } catch (...) {
+      __os.__set_badbit_and_consider_rethrow();
+    }
+#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
+  }
+}
+
+template <class = void> // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563).
+_LIBCPP_HIDE_FROM_ABI inline void vprint_nonunicode(ostream& __os, string_view __fmt, format_args __args) {
+  std::__vprint_nonunicode(__os, __fmt, __args, false);
+}
+
+// Returns the FILE* associated with the __os.
+// Returns a nullptr when no FILE* is associated with __os.
+// This function is in the dylib since the type of the buffer associated
+// with std::cout, std::cerr, and std::clog is only known in the dylib.
+//
+// This function implements part of the implementation-defined behavior
+// of [ostream.formatted.print]/3
+//   If the function is vprint_unicode and os is a stream that refers to
+//   a terminal capable of displaying Unicode which is determined in an
+//   implementation-defined manner, writes out to the terminal using the
+//   native Unicode API;
+// Whether the returned FILE* is "a terminal capable of displaying Unicode"
+// is determined in the same way as the print(FILE*, ...) overloads.
+_LIBCPP_EXPORTED_FROM_ABI FILE* __get_ostream_file(ostream& __os);
+
+#  ifndef _LIBCPP_HAS_NO_UNICODE
+template <class = void> // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563).
+_LIBCPP_HIDE_FROM_ABI void __vprint_unicode(ostream& __os, string_view __fmt, format_args __args, bool __write_nl) {
+#    if _LIBCPP_AVAILABILITY_HAS_PRINT == 0
+  return std::__vprint_nonunicode(__os, __fmt, __args, __write_nl);
+#    else
+  FILE* __file = std::__get_ostream_file(__os);
+  if (!__file || !__print::__is_terminal(__file))
+    return std::__vprint_nonunicode(__os, __fmt, __args, __write_nl);
+
+  // [ostream.formatted.print]/3
+  //    If the function is vprint_unicode and os is a stream that refers to a
+  //    terminal capable of displaying Unicode which is determined in an
+  //    implementation-defined manner, writes out to the terminal using the
+  //    native Unicode API; if out contains invalid code units, the behavior is
+  //    undefined and implementations are encouraged to diagnose it. If the
+  //    native Unicode API is used, the function flushes os before writing out.
+  //
+  // This is the path for the native API, start with flushing.
+  __os.flush();
+
+#      ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  try {
+#      endif // _LIBCPP_HAS_NO_EXCEPTIONS
+    ostream::sentry __s(__os);
+    if (__s) {
+#      ifndef _LIBCPP_WIN32API
+      __print::__vprint_unicode_posix(__file, __fmt, __args, __write_nl, true);
+#      elif !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)
+    __print::__vprint_unicode_windows(__file, __fmt, __args, __write_nl, true);
+#      else
+#        error "Windows builds with wchar_t disabled are not supported."
+#      endif
+    }
+
+#      ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  } catch (...) {
+    __os.__set_badbit_and_consider_rethrow();
+  }
+#      endif // _LIBCPP_HAS_NO_EXCEPTIONS
+#    endif   // _LIBCPP_AVAILABILITY_HAS_PRINT
+}
+
+template <class = void> // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563).
+_LIBCPP_HIDE_FROM_ABI inline void vprint_unicode(ostream& __os, string_view __fmt, format_args __args) {
+  std::__vprint_unicode(__os, __fmt, __args, false);
+}
+#  endif // _LIBCPP_HAS_NO_UNICODE
+
+template <class... _Args>
+_LIBCPP_HIDE_FROM_ABI void print(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) {
+#  ifndef _LIBCPP_HAS_NO_UNICODE
+  if constexpr (__print::__use_unicode_execution_charset)
+    std::__vprint_unicode(__os, __fmt.get(), std::make_format_args(__args...), false);
+  else
+    std::__vprint_nonunicode(__os, __fmt.get(), std::make_format_args(__args...), false);
+#  else  // _LIBCPP_HAS_NO_UNICODE
+  std::__vprint_nonunicode(__os, __fmt.get(), std::make_format_args(__args...), false);
+#  endif // _LIBCPP_HAS_NO_UNICODE
+}
+
+template <class... _Args>
+_LIBCPP_HIDE_FROM_ABI void println(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) {
+#  ifndef _LIBCPP_HAS_NO_UNICODE
+  // Note the wording in the Standard is inefficient. The output of
+  // std::format is a std::string which is then copied. This solution
+  // just appends a newline at the end of the output.
+  if constexpr (__print::__use_unicode_execution_charset)
+    std::__vprint_unicode(__os, __fmt.get(), std::make_format_args(__args...), true);
+  else
+    std::__vprint_nonunicode(__os, __fmt.get(), std::make_format_args(__args...), true);
+#  else  // _LIBCPP_HAS_NO_UNICODE
+  std::__vprint_nonunicode(__os, __fmt.get(), std::make_format_args(__args...), true);
+#  endif // _LIBCPP_HAS_NO_UNICODE
+}
+
+template <class = void> // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563).
+_LIBCPP_HIDE_FROM_ABI inline void println(ostream& __os) {
+  std::print(__os, "\n");
+}
+
+#endif // _LIBCPP_STD_VER >= 23
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___OSTREAM_PRINT_H
diff --git a/libcxx/include/__tuple/make_tuple_types.h b/libcxx/include/__tuple/make_tuple_types.h
index 43161b17cfa3..9e0fefae2f2f 100644
--- a/libcxx/include/__tuple/make_tuple_types.h
+++ b/libcxx/include/__tuple/make_tuple_types.h
@@ -16,7 +16,7 @@
 #include <__tuple/tuple_indices.h>
 #include <__tuple/tuple_size.h>
 #include <__tuple/tuple_types.h>
-#include <__type_traits/apply_cv.h>
+#include <__type_traits/copy_cvref.h>
 #include <__type_traits/remove_cv.h>
 #include <__type_traits/remove_reference.h>
 #include <cstddef>
@@ -41,7 +41,7 @@ template <template <class...> class _Tuple, class... _Types, size_t... _Idx>
 struct __make_tuple_types_flat<_Tuple<_Types...>, __tuple_indices<_Idx...>> {
   // Specialization for pair, tuple, and __tuple_types
   template <class _Tp>
-  using __apply_quals _LIBCPP_NODEBUG = __tuple_types<__apply_cv_t<_Tp, __type_pack_element<_Idx, _Types...>>...>;
+  using __apply_quals _LIBCPP_NODEBUG = __tuple_types<__copy_cvref_t<_Tp, __type_pack_element<_Idx, _Types...>>...>;
 };
 
 template <class _Vt, size_t _Np, size_t... _Idx>
@@ -49,7 +49,7 @@ struct __make_tuple_types_flat<array<_Vt, _Np>, __tuple_indices<_Idx...>> {
   template <size_t>
   using __value_type = _Vt;
   template <class _Tp>
-  using __apply_quals = __tuple_types<__apply_cv_t<_Tp, __value_type<_Idx>>...>;
+  using __apply_quals = __tuple_types<__copy_cvref_t<_Tp, __value_type<_Idx>>...>;
 };
 
 template <class _Tp,
diff --git a/libcxx/include/__type_traits/apply_cv.h b/libcxx/include/__type_traits/apply_cv.h
deleted file mode 100644
index 723af95b8d92..000000000000
--- a/libcxx/include/__type_traits/apply_cv.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_APPLY_CV_H
-#define _LIBCPP___TYPE_TRAITS_APPLY_CV_H
-
-#include <__config>
-#include <__type_traits/copy_cv.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct __apply_cv_impl {
-  template <class _Up>
-  using __apply _LIBCPP_NODEBUG = __copy_cv_t<_Tp, _Up>;
-};
-
-template <class _Tp>
-struct __apply_cv_impl<_Tp&> {
-  template <class _Up>
-  using __apply _LIBCPP_NODEBUG = __copy_cv_t<_Tp, _Up>&;
-};
-
-template <class _Tp, class _Up>
-using __apply_cv_t _LIBCPP_NODEBUG = typename __apply_cv_impl<_Tp>::template __apply<_Up>;
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_APPLY_CV_H
diff --git a/libcxx/include/__type_traits/is_equality_comparable.h b/libcxx/include/__type_traits/is_equality_comparable.h
index 00316ed63778..4397f743e5ee 100644
--- a/libcxx/include/__type_traits/is_equality_comparable.h
+++ b/libcxx/include/__type_traits/is_equality_comparable.h
@@ -17,7 +17,6 @@
 #include <__type_traits/is_signed.h>
 #include <__type_traits/is_void.h>
 #include <__type_traits/remove_cv.h>
-#include <__type_traits/remove_cvref.h>
 #include <__type_traits/void_t.h>
 #include <__utility/declval.h>
 
@@ -45,6 +44,8 @@ struct __is_equality_comparable<_Tp, _Up, __void_t<decltype(std::declval<_Tp>()
 // pointers that don't have the same type (ignoring cv-qualifiers): pointers to virtual bases are equality comparable,
 //   but don't have the same bit-pattern. An exception to this is comparing to a void-pointer. There the bit-pattern is
 //   always compared.
+// objects with padding bytes: since objects with padding bytes may compare equal, even though their object
+//   representation may not be equivalent.
 
 template <class _Tp, class _Up, class = void>
 struct __libcpp_is_trivially_equality_comparable_impl : false_type {};
diff --git a/libcxx/include/__type_traits/make_signed.h b/libcxx/include/__type_traits/make_signed.h
index 1a8a35f3859d..c1fc009d9ba2 100644
--- a/libcxx/include/__type_traits/make_signed.h
+++ b/libcxx/include/__type_traits/make_signed.h
@@ -10,7 +10,6 @@
 #define _LIBCPP___TYPE_TRAITS_MAKE_SIGNED_H
 
 #include <__config>
-#include <__type_traits/apply_cv.h>
 #include <__type_traits/is_enum.h>
 #include <__type_traits/is_integral.h>
 #include <__type_traits/nat.h>
@@ -70,7 +69,7 @@ template <> struct __make_signed<__uint128_t,        true> {typedef __int128_t t
 // clang-format on
 
 template <class _Tp>
-using __make_signed_t = __apply_cv_t<_Tp, typename __make_signed<__remove_cv_t<_Tp> >::type>;
+using __make_signed_t = __copy_cv_t<_Tp, typename __make_signed<__remove_cv_t<_Tp> >::type>;
 
 #endif // __has_builtin(__make_signed)
 
diff --git a/libcxx/include/__type_traits/make_unsigned.h b/libcxx/include/__type_traits/make_unsigned.h
index 98967371e773..282cd2d91131 100644
--- a/libcxx/include/__type_traits/make_unsigned.h
+++ b/libcxx/include/__type_traits/make_unsigned.h
@@ -10,8 +10,8 @@
 #define _LIBCPP___TYPE_TRAITS_MAKE_UNSIGNED_H
 
 #include <__config>
-#include <__type_traits/apply_cv.h>
 #include <__type_traits/conditional.h>
+#include <__type_traits/copy_cv.h>
 #include <__type_traits/is_enum.h>
 #include <__type_traits/is_integral.h>
 #include <__type_traits/is_unsigned.h>
@@ -72,7 +72,7 @@ template <> struct __make_unsigned<__uint128_t,        true> {typedef __uint128_
 // clang-format on
 
 template <class _Tp>
-using __make_unsigned_t = __apply_cv_t<_Tp, typename __make_unsigned<__remove_cv_t<_Tp> >::type>;
+using __make_unsigned_t = __copy_cv_t<_Tp, typename __make_unsigned<__remove_cv_t<_Tp> >::type>;
 
 #endif // __has_builtin(__make_unsigned)
 
diff --git a/libcxx/include/cwchar b/libcxx/include/cwchar
index 4cc6f56c389b..08cfac58c846 100644
--- a/libcxx/include/cwchar
+++ b/libcxx/include/cwchar
@@ -103,7 +103,7 @@ size_t wcsrtombs(char* restrict dst, const wchar_t** restrict src, size_t len,
 */
 
 #include <__config>
-#include <__type_traits/apply_cv.h>
+#include <__type_traits/copy_cv.h>
 #include <__type_traits/is_constant_evaluated.h>
 #include <__type_traits/is_equality_comparable.h>
 #include <__type_traits/is_same.h>
@@ -236,7 +236,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __constexpr_wmemchr(_Tp
     wchar_t __value_buffer = 0;
     __builtin_memcpy(&__value_buffer, &__value, sizeof(wchar_t));
     return reinterpret_cast<_Tp*>(
-        __builtin_wmemchr(reinterpret_cast<__apply_cv_t<_Tp, wchar_t>*>(__str), __value_buffer, __count));
+        __builtin_wmemchr(reinterpret_cast<__copy_cv_t<_Tp, wchar_t>*>(__str), __value_buffer, __count));
   }
 #  if _LIBCPP_STD_VER >= 17
   else if constexpr (is_same_v<remove_cv_t<_Tp>, wchar_t>)
diff --git a/libcxx/include/functional b/libcxx/include/functional
index a2476c93ad1b..27cf21e1a4c8 100644
--- a/libcxx/include/functional
+++ b/libcxx/include/functional
@@ -77,6 +77,15 @@ template <class T> struct unwrap_ref_decay : unwrap_reference<decay_t<T>> { };
 template <class T> using unwrap_reference_t = typename unwrap_reference<T>::type; // since C++20
 template <class T> using unwrap_ref_decay_t = typename unwrap_ref_decay<T>::type; // since C++20
 
+// [refwrap.comparisons], comparisons
+friend constexpr bool operator==(reference_wrapper, reference_wrapper);           // Since C++26
+friend constexpr bool operator==(reference_wrapper, const T&);                    // Since C++26
+friend constexpr bool operator==(reference_wrapper, reference_wrapper<const T>);  // Since C++26
+
+friend constexpr auto operator<=>(reference_wrapper, reference_wrapper);          // Since C++26
+friend constexpr auto operator<=>(reference_wrapper, const T&);                   // Since C++26
+friend constexpr auto operator<=>(reference_wrapper, reference_wrapper<const T>); // Since C++26
+
 template <class T> // <class T=void> in C++14
 struct plus {
     T operator()(const T& x, const T& y) const;
diff --git a/libcxx/include/istream b/libcxx/include/istream
index 3f20c355046c..21269c8a8b40 100644
--- a/libcxx/include/istream
+++ b/libcxx/include/istream
@@ -161,13 +161,15 @@ template <class Stream, class T>
 #include <__config>
 #include <__fwd/istream.h>
 #include <__iterator/istreambuf_iterator.h>
+#include <__ostream/basic_ostream.h>
 #include <__type_traits/conjunction.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/is_base_of.h>
 #include <__utility/declval.h>
 #include <__utility/forward.h>
 #include <bitset>
-#include <ostream>
+#include <ios>
+#include <locale>
 #include <version>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -1362,6 +1364,7 @@ _LIBCPP_END_NAMESPACE_STD
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <concepts>
 #  include <iosfwd>
+#  include <ostream>
 #  include <type_traits>
 #endif
 
diff --git a/libcxx/include/map b/libcxx/include/map
index 2276cc043709..1d1c062a0267 100644
--- a/libcxx/include/map
+++ b/libcxx/include/map
@@ -1367,11 +1367,11 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __k) { return __tree_.find(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __k) const { return __tree_.find(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator find(const _K2& __k) {
     return __tree_.find(__k);
   }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const _K2& __k) const {
     return __tree_.find(__k);
   }
@@ -1379,7 +1379,7 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __k) const { return __tree_.__count_unique(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI size_type count(const _K2& __k) const {
     return __tree_.__count_multi(__k);
   }
@@ -1387,7 +1387,7 @@ public:
 
 #if _LIBCPP_STD_VER >= 20
   _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __k) const { return find(__k) != end(); }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI bool contains(const _K2& __k) const {
     return find(__k) != end();
   }
@@ -1396,12 +1396,12 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __k) { return __tree_.lower_bound(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __k) const { return __tree_.lower_bound(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _K2& __k) {
     return __tree_.lower_bound(__k);
   }
 
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _K2& __k) const {
     return __tree_.lower_bound(__k);
   }
@@ -1410,11 +1410,11 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __k) { return __tree_.upper_bound(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __k) const { return __tree_.upper_bound(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _K2& __k) {
     return __tree_.upper_bound(__k);
   }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _K2& __k) const {
     return __tree_.upper_bound(__k);
   }
@@ -1427,11 +1427,11 @@ public:
     return __tree_.__equal_range_unique(__k);
   }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const _K2& __k) {
     return __tree_.__equal_range_multi(__k);
   }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const _K2& __k) const {
     return __tree_.__equal_range_multi(__k);
   }
@@ -1959,11 +1959,11 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __k) { return __tree_.find(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __k) const { return __tree_.find(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator find(const _K2& __k) {
     return __tree_.find(__k);
   }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const _K2& __k) const {
     return __tree_.find(__k);
   }
@@ -1971,7 +1971,7 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __k) const { return __tree_.__count_multi(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI size_type count(const _K2& __k) const {
     return __tree_.__count_multi(__k);
   }
@@ -1979,7 +1979,7 @@ public:
 
 #if _LIBCPP_STD_VER >= 20
   _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __k) const { return find(__k) != end(); }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI bool contains(const _K2& __k) const {
     return find(__k) != end();
   }
@@ -1988,12 +1988,12 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __k) { return __tree_.lower_bound(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __k) const { return __tree_.lower_bound(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _K2& __k) {
     return __tree_.lower_bound(__k);
   }
 
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _K2& __k) const {
     return __tree_.lower_bound(__k);
   }
@@ -2002,11 +2002,11 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __k) { return __tree_.upper_bound(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __k) const { return __tree_.upper_bound(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _K2& __k) {
     return __tree_.upper_bound(__k);
   }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _K2& __k) const {
     return __tree_.upper_bound(__k);
   }
@@ -2019,11 +2019,11 @@ public:
     return __tree_.__equal_range_multi(__k);
   }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const _K2& __k) {
     return __tree_.__equal_range_multi(__k);
   }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const _K2& __k) const {
     return __tree_.__equal_range_multi(__k);
   }
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 8727ab88f16c..70dac2f19846 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -700,6 +700,7 @@ module std_private_algorithm_minmax_element                              [system
 module std_private_algorithm_mismatch                                    [system] {
   header "__algorithm/mismatch.h"
   export std_private_algorithm_simd_utils
+  export std_private_iterator_aliasing_iterator
 }
 module std_private_algorithm_move                                        [system] { header "__algorithm/move.h" }
 module std_private_algorithm_move_backward                               [system] { header "__algorithm/move_backward.h" }
@@ -1390,6 +1391,7 @@ module std_private_iosfwd_streambuf_fwd [system] { header "__fwd/streambuf.h" }
 
 module std_private_iterator_access                  [system] { header "__iterator/access.h" }
 module std_private_iterator_advance                 [system] { header "__iterator/advance.h" }
+module std_private_iterator_aliasing_iterator       [system] { header "__iterator/aliasing_iterator.h" }
 module std_private_iterator_back_insert_iterator    [system] { header "__iterator/back_insert_iterator.h" }
 module std_private_iterator_bounded_iter            [system] { header "__iterator/bounded_iter.h" }
 module std_private_iterator_common_iterator         [system] { header "__iterator/common_iterator.h" }
@@ -1614,6 +1616,15 @@ module std_private_pstl_configuration              [system] {
 
 module std_private_queue_fwd [system] { header "__fwd/queue.h" }
 
+module std_private_ostream_basic_ostream [system] {
+  header "__ostream/basic_ostream.h"
+  export std_streambuf
+}
+module std_private_ostream_print         [system] {
+  header "__ostream/print.h"
+  export std_print
+}
+
 module std_private_random_bernoulli_distribution          [system] { header "__random/bernoulli_distribution.h" }
 module std_private_random_binomial_distribution           [system] { header "__random/binomial_distribution.h" }
 module std_private_random_cauchy_distribution             [system] { header "__random/cauchy_distribution.h" }
@@ -1851,11 +1862,6 @@ module std_private_type_traits_add_volatile                              [system
 module std_private_type_traits_aligned_storage                           [system] { header "__type_traits/aligned_storage.h" }
 module std_private_type_traits_aligned_union                             [system] { header "__type_traits/aligned_union.h" }
 module std_private_type_traits_alignment_of                              [system] { header "__type_traits/alignment_of.h" }
-module std_private_type_traits_apply_cv                                  [system] {
-  header "__type_traits/apply_cv.h"
-  export std_private_type_traits_is_const
-  export std_private_type_traits_is_volatile
-}
 module std_private_type_traits_can_extract_key                           [system] { header "__type_traits/can_extract_key.h" }
 module std_private_type_traits_common_reference                          [system] {
   header "__type_traits/common_reference.h"
diff --git a/libcxx/include/ostream b/libcxx/include/ostream
index d4fc1c58b8a9..f75110e7d73f 100644
--- a/libcxx/include/ostream
+++ b/libcxx/include/ostream
@@ -172,1010 +172,19 @@ void vprint_nonunicode(ostream& os, string_view fmt, format_args args);
 
 */
 
-#include <__availability>
 #include <__config>
-#include <__exception/operations.h>
-#include <__fwd/ostream.h>
-#include <__memory/shared_ptr.h>
-#include <__memory/unique_ptr.h>
-#include <__system_error/error_code.h>
-#include <__type_traits/conjunction.h>
-#include <__type_traits/enable_if.h>
-#include <__type_traits/is_base_of.h>
-#include <__type_traits/void_t.h>
-#include <__utility/declval.h>
-#include <bitset>
-#include <cstdio>
-#include <format>
-#include <ios>
-#include <locale>
-#include <new>
-#include <print>
-#include <streambuf>
-#include <string_view>
+#include <__ostream/basic_ostream.h>
+#include <__ostream/print.h>
 #include <version>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
-_LIBCPP_PUSH_MACROS
-#include <__undef_macros>
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _CharT, class _Traits>
-class _LIBCPP_TEMPLATE_VIS basic_ostream : virtual public basic_ios<_CharT, _Traits> {
-public:
-  // types (inherited from basic_ios (27.5.4)):
-  typedef _CharT char_type;
-  typedef _Traits traits_type;
-  typedef typename traits_type::int_type int_type;
-  typedef typename traits_type::pos_type pos_type;
-  typedef typename traits_type::off_type off_type;
-
-  // 27.7.2.2 Constructor/destructor:
-  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 explicit basic_ostream(basic_streambuf<char_type, traits_type>* __sb) {
-    this->init(__sb);
-  }
-  ~basic_ostream() override;
-
-protected:
-  inline _LIBCPP_HIDE_FROM_ABI basic_ostream(basic_ostream&& __rhs);
-
-  // 27.7.2.3 Assign/swap
-  inline _LIBCPP_HIDE_FROM_ABI basic_ostream& operator=(basic_ostream&& __rhs);
-
-  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 void swap(basic_ostream& __rhs) {
-    basic_ios<char_type, traits_type>::swap(__rhs);
-  }
-
-  basic_ostream(const basic_ostream& __rhs)            = delete;
-  basic_ostream& operator=(const basic_ostream& __rhs) = delete;
-
-public:
-  // 27.7.2.4 Prefix/suffix:
-  class _LIBCPP_TEMPLATE_VIS sentry;
-
-  // 27.7.2.6 Formatted output:
-  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 basic_ostream& operator<<(basic_ostream& (*__pf)(basic_ostream&)) {
-    return __pf(*this);
-  }
-
-  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 basic_ostream&
-  operator<<(basic_ios<char_type, traits_type>& (*__pf)(basic_ios<char_type, traits_type>&)) {
-    __pf(*this);
-    return *this;
-  }
-
-  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 basic_ostream& operator<<(ios_base& (*__pf)(ios_base&)) {
-    __pf(*this);
-    return *this;
-  }
-
-  basic_ostream& operator<<(bool __n);
-  basic_ostream& operator<<(short __n);
-  basic_ostream& operator<<(unsigned short __n);
-  basic_ostream& operator<<(int __n);
-  basic_ostream& operator<<(unsigned int __n);
-  basic_ostream& operator<<(long __n);
-  basic_ostream& operator<<(unsigned long __n);
-  basic_ostream& operator<<(long long __n);
-  basic_ostream& operator<<(unsigned long long __n);
-  basic_ostream& operator<<(float __f);
-  basic_ostream& operator<<(double __f);
-  basic_ostream& operator<<(long double __f);
-  basic_ostream& operator<<(const void* __p);
-
-#if _LIBCPP_STD_VER >= 23
-  _LIBCPP_HIDE_FROM_ABI basic_ostream& operator<<(const volatile void* __p) {
-    return operator<<(const_cast<const void*>(__p));
-  }
-#endif
-
-  basic_ostream& operator<<(basic_streambuf<char_type, traits_type>* __sb);
-
-#if _LIBCPP_STD_VER >= 17
-  // LWG 2221 - nullptr. This is not backported to older standards modes.
-  // See https://reviews.llvm.org/D127033 for more info on the rationale.
-  _LIBCPP_HIDE_FROM_ABI basic_ostream& operator<<(nullptr_t) { return *this << "nullptr"; }
-#endif
-
-  // 27.7.2.7 Unformatted output:
-  basic_ostream& put(char_type __c);
-  basic_ostream& write(const char_type* __s, streamsize __n);
-  basic_ostream& flush();
-
-  // 27.7.2.5 seeks:
-  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 pos_type tellp();
-  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 basic_ostream& seekp(pos_type __pos);
-  inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 basic_ostream& seekp(off_type __off, ios_base::seekdir __dir);
-
-protected:
-  _LIBCPP_HIDE_FROM_ABI basic_ostream() {} // extension, intentially does not initialize
-};
-
-template <class _CharT, class _Traits>
-class _LIBCPP_TEMPLATE_VIS basic_ostream<_CharT, _Traits>::sentry {
-  bool __ok_;
-  basic_ostream<_CharT, _Traits>& __os_;
-
-public:
-  explicit sentry(basic_ostream<_CharT, _Traits>& __os);
-  ~sentry();
-  sentry(const sentry&)            = delete;
-  sentry& operator=(const sentry&) = delete;
-
-  _LIBCPP_HIDE_FROM_ABI explicit operator bool() const { return __ok_; }
-};
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>::sentry::sentry(basic_ostream<_CharT, _Traits>& __os) : __ok_(false), __os_(__os) {
-  if (__os.good()) {
-    if (__os.tie())
-      __os.tie()->flush();
-    __ok_ = true;
-  }
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>::sentry::~sentry() {
-  if (__os_.rdbuf() && __os_.good() && (__os_.flags() & ios_base::unitbuf) && !uncaught_exception()) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-    try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-      if (__os_.rdbuf()->pubsync() == -1)
-        __os_.setstate(ios_base::badbit);
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-    } catch (...) {
-    }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  }
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>::basic_ostream(basic_ostream&& __rhs) {
-  this->move(__rhs);
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator=(basic_ostream&& __rhs) {
-  swap(__rhs);
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>::~basic_ostream() {}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>&
-basic_ostream<_CharT, _Traits>::operator<<(basic_streambuf<char_type, traits_type>* __sb) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      if (__sb) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-        try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-          typedef istreambuf_iterator<_CharT, _Traits> _Ip;
-          typedef ostreambuf_iterator<_CharT, _Traits> _Op;
-          _Ip __i(__sb);
-          _Ip __eof;
-          _Op __o(*this);
-          size_t __c = 0;
-          for (; __i != __eof; ++__i, ++__o, ++__c) {
-            *__o = *__i;
-            if (__o.failed())
-              break;
-          }
-          if (__c == 0)
-            this->setstate(ios_base::failbit);
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-        } catch (...) {
-          this->__set_failbit_and_consider_rethrow();
-        }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-      } else
-        this->setstate(ios_base::badbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(bool __n) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(short __n) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      ios_base::fmtflags __flags = ios_base::flags() & ios_base::basefield;
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this,
-                  *this,
-                  this->fill(),
-                  __flags == ios_base::oct || __flags == ios_base::hex
-                      ? static_cast<long>(static_cast<unsigned short>(__n))
-                      : static_cast<long>(__n))
-              .failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned short __n) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), static_cast<unsigned long>(__n)).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(int __n) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      ios_base::fmtflags __flags = ios_base::flags() & ios_base::basefield;
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this,
-                  *this,
-                  this->fill(),
-                  __flags == ios_base::oct || __flags == ios_base::hex
-                      ? static_cast<long>(static_cast<unsigned int>(__n))
-                      : static_cast<long>(__n))
-              .failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned int __n) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), static_cast<unsigned long>(__n)).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long __n) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned long __n) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long long __n) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned long long __n) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(float __n) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), static_cast<double>(__n)).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(double __n) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long double __n) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(const void* __n) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
-__put_character_sequence(basic_ostream<_CharT, _Traits>& __os, const _CharT* __str, size_t __len) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    typename basic_ostream<_CharT, _Traits>::sentry __s(__os);
-    if (__s) {
-      typedef ostreambuf_iterator<_CharT, _Traits> _Ip;
-      if (std::__pad_and_output(
-              _Ip(__os),
-              __str,
-              (__os.flags() & ios_base::adjustfield) == ios_base::left ? __str + __len : __str,
-              __str + __len,
-              __os,
-              __os.fill())
-              .failed())
-        __os.setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    __os.__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return __os;
-}
-
-template <class _CharT, class _Traits>
-_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, _CharT __c) {
-  return std::__put_character_sequence(__os, &__c, 1);
-}
-
-template <class _CharT, class _Traits>
-_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, char __cn) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    typename basic_ostream<_CharT, _Traits>::sentry __s(__os);
-    if (__s) {
-      _CharT __c = __os.widen(__cn);
-      typedef ostreambuf_iterator<_CharT, _Traits> _Ip;
-      if (std::__pad_and_output(
-              _Ip(__os),
-              &__c,
-              (__os.flags() & ios_base::adjustfield) == ios_base::left ? &__c + 1 : &__c,
-              &__c + 1,
-              __os,
-              __os.fill())
-              .failed())
-        __os.setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    __os.__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return __os;
-}
-
-template <class _Traits>
-_LIBCPP_HIDE_FROM_ABI basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>& __os, char __c) {
-  return std::__put_character_sequence(__os, &__c, 1);
-}
-
-template <class _Traits>
-_LIBCPP_HIDE_FROM_ABI basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>& __os, signed char __c) {
-  return std::__put_character_sequence(__os, (char*)&__c, 1);
-}
-
-template <class _Traits>
-_LIBCPP_HIDE_FROM_ABI basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>& __os, unsigned char __c) {
-  return std::__put_character_sequence(__os, (char*)&__c, 1);
-}
-
-template <class _CharT, class _Traits>
-_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
-operator<<(basic_ostream<_CharT, _Traits>& __os, const _CharT* __str) {
-  return std::__put_character_sequence(__os, __str, _Traits::length(__str));
-}
-
-template <class _CharT, class _Traits>
-_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
-operator<<(basic_ostream<_CharT, _Traits>& __os, const char* __strn) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    typename basic_ostream<_CharT, _Traits>::sentry __s(__os);
-    if (__s) {
-      typedef ostreambuf_iterator<_CharT, _Traits> _Ip;
-      size_t __len   = char_traits<char>::length(__strn);
-      const int __bs = 100;
-      _CharT __wbb[__bs];
-      _CharT* __wb = __wbb;
-      unique_ptr<_CharT, void (*)(void*)> __h(0, free);
-      if (__len > __bs) {
-        __wb = (_CharT*)malloc(__len * sizeof(_CharT));
-        if (__wb == 0)
-          __throw_bad_alloc();
-        __h.reset(__wb);
-      }
-      for (_CharT* __p = __wb; *__strn != '\0'; ++__strn, ++__p)
-        *__p = __os.widen(*__strn);
-      if (std::__pad_and_output(
-              _Ip(__os),
-              __wb,
-              (__os.flags() & ios_base::adjustfield) == ios_base::left ? __wb + __len : __wb,
-              __wb + __len,
-              __os,
-              __os.fill())
-              .failed())
-        __os.setstate(ios_base::badbit | ios_base::failbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    __os.__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return __os;
-}
-
-template <class _Traits>
-_LIBCPP_HIDE_FROM_ABI basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>& __os, const char* __str) {
-  return std::__put_character_sequence(__os, __str, _Traits::length(__str));
-}
-
-template <class _Traits>
-_LIBCPP_HIDE_FROM_ABI basic_ostream<char, _Traits>&
-operator<<(basic_ostream<char, _Traits>& __os, const signed char* __str) {
-  const char* __s = (const char*)__str;
-  return std::__put_character_sequence(__os, __s, _Traits::length(__s));
-}
-
-template <class _Traits>
-_LIBCPP_HIDE_FROM_ABI basic_ostream<char, _Traits>&
-operator<<(basic_ostream<char, _Traits>& __os, const unsigned char* __str) {
-  const char* __s = (const char*)__str;
-  return std::__put_character_sequence(__os, __s, _Traits::length(__s));
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::put(char_type __c) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef ostreambuf_iterator<_CharT, _Traits> _Op;
-      _Op __o(*this);
-      *__o = __c;
-      if (__o.failed())
-        this->setstate(ios_base::badbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::write(const char_type* __s, streamsize __n) {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    sentry __sen(*this);
-    if (__sen && __n) {
-      if (this->rdbuf()->sputn(__s, __n) != __n)
-        this->setstate(ios_base::badbit);
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::flush() {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    if (this->rdbuf()) {
-      sentry __s(*this);
-      if (__s) {
-        if (this->rdbuf()->pubsync() == -1)
-          this->setstate(ios_base::badbit);
-      }
-    }
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-typename basic_ostream<_CharT, _Traits>::pos_type basic_ostream<_CharT, _Traits>::tellp() {
-  if (this->fail())
-    return pos_type(-1);
-  return this->rdbuf()->pubseekoff(0, ios_base::cur, ios_base::out);
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::seekp(pos_type __pos) {
-  sentry __s(*this);
-  if (!this->fail()) {
-    if (this->rdbuf()->pubseekpos(__pos, ios_base::out) == pos_type(-1))
-      this->setstate(ios_base::failbit);
-  }
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::seekp(off_type __off, ios_base::seekdir __dir) {
-  sentry __s(*this);
-  if (!this->fail()) {
-    if (this->rdbuf()->pubseekoff(__off, __dir, ios_base::out) == pos_type(-1))
-      this->setstate(ios_base::failbit);
-  }
-  return *this;
-}
-
-template <class _CharT, class _Traits>
-_LIBCPP_HIDE_FROM_ABI inline basic_ostream<_CharT, _Traits>& endl(basic_ostream<_CharT, _Traits>& __os) {
-  __os.put(__os.widen('\n'));
-  __os.flush();
-  return __os;
-}
-
-template <class _CharT, class _Traits>
-_LIBCPP_HIDE_FROM_ABI inline basic_ostream<_CharT, _Traits>& ends(basic_ostream<_CharT, _Traits>& __os) {
-  __os.put(_CharT());
-  return __os;
-}
-
-template <class _CharT, class _Traits>
-_LIBCPP_HIDE_FROM_ABI inline basic_ostream<_CharT, _Traits>& flush(basic_ostream<_CharT, _Traits>& __os) {
-  __os.flush();
-  return __os;
-}
-
-template <class _Stream, class _Tp, class = void>
-struct __is_ostreamable : false_type {};
-
-template <class _Stream, class _Tp>
-struct __is_ostreamable<_Stream, _Tp, decltype(std::declval<_Stream>() << std::declval<_Tp>(), void())> : true_type {};
-
-template <class _Stream,
-          class _Tp,
-          __enable_if_t<_And<is_base_of<ios_base, _Stream>, __is_ostreamable<_Stream&, const _Tp&> >::value, int> = 0>
-_LIBCPP_HIDE_FROM_ABI _Stream&& operator<<(_Stream&& __os, const _Tp& __x) {
-  __os << __x;
-  return std::move(__os);
-}
-
-template <class _CharT, class _Traits, class _Allocator>
-basic_ostream<_CharT, _Traits>&
-operator<<(basic_ostream<_CharT, _Traits>& __os, const basic_string<_CharT, _Traits, _Allocator>& __str) {
-  return std::__put_character_sequence(__os, __str.data(), __str.size());
-}
-
-template <class _CharT, class _Traits>
-_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
-operator<<(basic_ostream<_CharT, _Traits>& __os, basic_string_view<_CharT, _Traits> __sv) {
-  return std::__put_character_sequence(__os, __sv.data(), __sv.size());
-}
-
-template <class _CharT, class _Traits>
-inline _LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
-operator<<(basic_ostream<_CharT, _Traits>& __os, const error_code& __ec) {
-  return __os << __ec.category().name() << ':' << __ec.value();
-}
-
-template <class _CharT, class _Traits, class _Yp>
-inline _LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
-operator<<(basic_ostream<_CharT, _Traits>& __os, shared_ptr<_Yp> const& __p) {
-  return __os << __p.get();
-}
-
-template <
-    class _CharT,
-    class _Traits,
-    class _Yp,
-    class _Dp,
-    __enable_if_t<is_same<void,
-                          __void_t<decltype((std::declval<basic_ostream<_CharT, _Traits>&>()
-                                             << std::declval<typename unique_ptr<_Yp, _Dp>::pointer>()))> >::value,
-                  int> = 0>
-inline _LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
-operator<<(basic_ostream<_CharT, _Traits>& __os, unique_ptr<_Yp, _Dp> const& __p) {
-  return __os << __p.get();
-}
-
-template <class _CharT, class _Traits, size_t _Size>
-_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
-operator<<(basic_ostream<_CharT, _Traits>& __os, const bitset<_Size>& __x) {
-  return __os << __x.template to_string<_CharT, _Traits>(std::use_facet<ctype<_CharT> >(__os.getloc()).widen('0'),
-                                                         std::use_facet<ctype<_CharT> >(__os.getloc()).widen('1'));
-}
-
-#if _LIBCPP_STD_VER >= 20
-
-#  ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
-template <class _Traits>
-basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, wchar_t) = delete;
-
-template <class _Traits>
-basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, const wchar_t*) = delete;
-
-template <class _Traits>
-basic_ostream<wchar_t, _Traits>& operator<<(basic_ostream<wchar_t, _Traits>&, char16_t) = delete;
-
-template <class _Traits>
-basic_ostream<wchar_t, _Traits>& operator<<(basic_ostream<wchar_t, _Traits>&, char32_t) = delete;
-
-template <class _Traits>
-basic_ostream<wchar_t, _Traits>& operator<<(basic_ostream<wchar_t, _Traits>&, const char16_t*) = delete;
-
-template <class _Traits>
-basic_ostream<wchar_t, _Traits>& operator<<(basic_ostream<wchar_t, _Traits>&, const char32_t*) = delete;
-
-#  endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
-
-#  ifndef _LIBCPP_HAS_NO_CHAR8_T
-template <class _Traits>
-basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, char8_t) = delete;
-
-template <class _Traits>
-basic_ostream<wchar_t, _Traits>& operator<<(basic_ostream<wchar_t, _Traits>&, char8_t) = delete;
-
-template <class _Traits>
-basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, const char8_t*) = delete;
-
-template <class _Traits>
-basic_ostream<wchar_t, _Traits>& operator<<(basic_ostream<wchar_t, _Traits>&, const char8_t*) = delete;
-#  endif
-
-template <class _Traits>
-basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, char16_t) = delete;
-
-template <class _Traits>
-basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, char32_t) = delete;
-
-template <class _Traits>
-basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, const char16_t*) = delete;
-
-template <class _Traits>
-basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>&, const char32_t*) = delete;
-
-#endif // _LIBCPP_STD_VER >= 20
-
-extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_ostream<char>;
-#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
-extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_ostream<wchar_t>;
-#endif
-
-#if _LIBCPP_STD_VER >= 23
-
-template <class = void> // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563).
-_LIBCPP_HIDE_FROM_ABI inline void
-__vprint_nonunicode(ostream& __os, string_view __fmt, format_args __args, bool __write_nl) {
-  // [ostream.formatted.print]/3
-  // Effects: Behaves as a formatted output function
-  // ([ostream.formatted.reqmts]) of os, except that:
-  // - failure to generate output is reported as specified below, and
-  // - any exception thrown by the call to vformat is propagated without regard
-  //   to the value of os.exceptions() and without turning on ios_base::badbit
-  //   in the error state of os.
-  // After constructing a sentry object, the function initializes an automatic
-  // variable via
-  //   string out = vformat(os.getloc(), fmt, args);
-
-  ostream::sentry __s(__os);
-  if (__s) {
-    string __o = std::vformat(__os.getloc(), __fmt, __args);
-    if (__write_nl)
-      __o += '\n';
-
-    const char* __str = __o.data();
-    size_t __len      = __o.size();
-
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-    try {
-#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
-      typedef ostreambuf_iterator<char> _Ip;
-      if (std::__pad_and_output(
-              _Ip(__os),
-              __str,
-              (__os.flags() & ios_base::adjustfield) == ios_base::left ? __str + __len : __str,
-              __str + __len,
-              __os,
-              __os.fill())
-              .failed())
-        __os.setstate(ios_base::badbit | ios_base::failbit);
-
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-    } catch (...) {
-      __os.__set_badbit_and_consider_rethrow();
-    }
-#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
-  }
-}
-
-template <class = void> // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563).
-_LIBCPP_HIDE_FROM_ABI inline void vprint_nonunicode(ostream& __os, string_view __fmt, format_args __args) {
-  std::__vprint_nonunicode(__os, __fmt, __args, false);
-}
-
-// Returns the FILE* associated with the __os.
-// Returns a nullptr when no FILE* is associated with __os.
-// This function is in the dylib since the type of the buffer associated
-// with std::cout, std::cerr, and std::clog is only known in the dylib.
-//
-// This function implements part of the implementation-defined behavior
-// of [ostream.formatted.print]/3
-//   If the function is vprint_unicode and os is a stream that refers to
-//   a terminal capable of displaying Unicode which is determined in an
-//   implementation-defined manner, writes out to the terminal using the
-//   native Unicode API;
-// Whether the returned FILE* is "a terminal capable of displaying Unicode"
-// is determined in the same way as the print(FILE*, ...) overloads.
-_LIBCPP_EXPORTED_FROM_ABI FILE* __get_ostream_file(ostream& __os);
-
-#  ifndef _LIBCPP_HAS_NO_UNICODE
-template <class = void> // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563).
-_LIBCPP_HIDE_FROM_ABI void __vprint_unicode(ostream& __os, string_view __fmt, format_args __args, bool __write_nl) {
-#    if _LIBCPP_AVAILABILITY_HAS_PRINT == 0
-  return std::__vprint_nonunicode(__os, __fmt, __args, __write_nl);
-#    else
-  FILE* __file = std::__get_ostream_file(__os);
-  if (!__file || !__print::__is_terminal(__file))
-    return std::__vprint_nonunicode(__os, __fmt, __args, __write_nl);
-
-  // [ostream.formatted.print]/3
-  //    If the function is vprint_unicode and os is a stream that refers to a
-  //    terminal capable of displaying Unicode which is determined in an
-  //    implementation-defined manner, writes out to the terminal using the
-  //    native Unicode API; if out contains invalid code units, the behavior is
-  //    undefined and implementations are encouraged to diagnose it. If the
-  //    native Unicode API is used, the function flushes os before writing out.
-  //
-  // This is the path for the native API, start with flushing.
-  __os.flush();
-
-#      ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  try {
-#      endif // _LIBCPP_HAS_NO_EXCEPTIONS
-    ostream::sentry __s(__os);
-    if (__s) {
-#      ifndef _LIBCPP_WIN32API
-      __print::__vprint_unicode_posix(__file, __fmt, __args, __write_nl, true);
-#      elif !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)
-    __print::__vprint_unicode_windows(__file, __fmt, __args, __write_nl, true);
-#      else
-#        error "Windows builds with wchar_t disabled are not supported."
-#      endif
-    }
-
-#      ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-  } catch (...) {
-    __os.__set_badbit_and_consider_rethrow();
-  }
-#      endif // _LIBCPP_HAS_NO_EXCEPTIONS
-#    endif   // _LIBCPP_AVAILABILITY_HAS_PRINT
-}
-
-template <class = void> // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563).
-_LIBCPP_HIDE_FROM_ABI inline void vprint_unicode(ostream& __os, string_view __fmt, format_args __args) {
-  std::__vprint_unicode(__os, __fmt, __args, false);
-}
-#  endif // _LIBCPP_HAS_NO_UNICODE
-
-template <class... _Args>
-_LIBCPP_HIDE_FROM_ABI void print(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) {
-#  ifndef _LIBCPP_HAS_NO_UNICODE
-  if constexpr (__print::__use_unicode_execution_charset)
-    std::__vprint_unicode(__os, __fmt.get(), std::make_format_args(__args...), false);
-  else
-    std::__vprint_nonunicode(__os, __fmt.get(), std::make_format_args(__args...), false);
-#  else  // _LIBCPP_HAS_NO_UNICODE
-  std::__vprint_nonunicode(__os, __fmt.get(), std::make_format_args(__args...), false);
-#  endif // _LIBCPP_HAS_NO_UNICODE
-}
-
-template <class... _Args>
-_LIBCPP_HIDE_FROM_ABI void println(ostream& __os, format_string<_Args...> __fmt, _Args&&... __args) {
-#  ifndef _LIBCPP_HAS_NO_UNICODE
-  // Note the wording in the Standard is inefficient. The output of
-  // std::format is a std::string which is then copied. This solution
-  // just appends a newline at the end of the output.
-  if constexpr (__print::__use_unicode_execution_charset)
-    std::__vprint_unicode(__os, __fmt.get(), std::make_format_args(__args...), true);
-  else
-    std::__vprint_nonunicode(__os, __fmt.get(), std::make_format_args(__args...), true);
-#  else  // _LIBCPP_HAS_NO_UNICODE
-  std::__vprint_nonunicode(__os, __fmt.get(), std::make_format_args(__args...), true);
-#  endif // _LIBCPP_HAS_NO_UNICODE
-}
-
-template <class = void> // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563).
-_LIBCPP_HIDE_FROM_ABI inline void println(ostream& __os) { std::print(__os, "\n"); }
-
-#endif // _LIBCPP_STD_VER >= 23
-
-_LIBCPP_END_NAMESPACE_STD
-
-_LIBCPP_POP_MACROS
-
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <atomic>
 #  include <concepts>
+#  include <cstdio>
 #  include <cstdlib>
 #  include <iosfwd>
 #  include <iterator>
diff --git a/libcxx/include/set b/libcxx/include/set
index 763c26cea01f..d9377ee6c332 100644
--- a/libcxx/include/set
+++ b/libcxx/include/set
@@ -825,11 +825,11 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __k) { return __tree_.find(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __k) const { return __tree_.find(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator find(const _K2& __k) {
     return __tree_.find(__k);
   }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const _K2& __k) const {
     return __tree_.find(__k);
   }
@@ -837,7 +837,7 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __k) const { return __tree_.__count_unique(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI size_type count(const _K2& __k) const {
     return __tree_.__count_multi(__k);
   }
@@ -845,7 +845,7 @@ public:
 
 #if _LIBCPP_STD_VER >= 20
   _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __k) const { return find(__k) != end(); }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI bool contains(const _K2& __k) const {
     return find(__k) != end();
   }
@@ -854,12 +854,12 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __k) { return __tree_.lower_bound(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __k) const { return __tree_.lower_bound(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _K2& __k) {
     return __tree_.lower_bound(__k);
   }
 
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _K2& __k) const {
     return __tree_.lower_bound(__k);
   }
@@ -868,11 +868,11 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __k) { return __tree_.upper_bound(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __k) const { return __tree_.upper_bound(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _K2& __k) {
     return __tree_.upper_bound(__k);
   }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _K2& __k) const {
     return __tree_.upper_bound(__k);
   }
@@ -885,11 +885,11 @@ public:
     return __tree_.__equal_range_unique(__k);
   }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const _K2& __k) {
     return __tree_.__equal_range_multi(__k);
   }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const _K2& __k) const {
     return __tree_.__equal_range_multi(__k);
   }
@@ -1283,11 +1283,11 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __k) { return __tree_.find(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __k) const { return __tree_.find(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator find(const _K2& __k) {
     return __tree_.find(__k);
   }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const _K2& __k) const {
     return __tree_.find(__k);
   }
@@ -1295,7 +1295,7 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __k) const { return __tree_.__count_multi(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI size_type count(const _K2& __k) const {
     return __tree_.__count_multi(__k);
   }
@@ -1303,7 +1303,7 @@ public:
 
 #if _LIBCPP_STD_VER >= 20
   _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __k) const { return find(__k) != end(); }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI bool contains(const _K2& __k) const {
     return find(__k) != end();
   }
@@ -1312,12 +1312,12 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __k) { return __tree_.lower_bound(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __k) const { return __tree_.lower_bound(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _K2& __k) {
     return __tree_.lower_bound(__k);
   }
 
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _K2& __k) const {
     return __tree_.lower_bound(__k);
   }
@@ -1326,11 +1326,11 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __k) { return __tree_.upper_bound(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __k) const { return __tree_.upper_bound(__k); }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _K2& __k) {
     return __tree_.upper_bound(__k);
   }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _K2& __k) const {
     return __tree_.upper_bound(__k);
   }
@@ -1343,11 +1343,11 @@ public:
     return __tree_.__equal_range_multi(__k);
   }
 #if _LIBCPP_STD_VER >= 14
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const _K2& __k) {
     return __tree_.__equal_range_multi(__k);
   }
-  template <typename _K2, enable_if_t<__is_transparent<_Compare, _K2>::value, int> = 0>
+  template <typename _K2, enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const _K2& __k) const {
     return __tree_.__equal_range_multi(__k);
   }
diff --git a/libcxx/include/sstream b/libcxx/include/sstream
index 003c802b2647..5009fe5c0057 100644
--- a/libcxx/include/sstream
+++ b/libcxx/include/sstream
@@ -205,7 +205,7 @@ public:
       basic_ostringstream(const T& t, const Allocator& a);                                // Since C++26
     template<class T>
       basic_ostringstream(const T& t, ios_base::openmode which, const Allocator& a);      // Since C++26
-    basic_ostringstream(const basic_ostringstream&) = delete;                             
+    basic_ostringstream(const basic_ostringstream&) = delete;
     basic_ostringstream(basic_ostringstream&& rhs);
 
     // [ostringstream.assign] Assign and swap:
@@ -315,10 +315,10 @@ typedef basic_stringstream<wchar_t> wstringstream;
 #include <__availability>
 #include <__config>
 #include <__fwd/sstream.h>
+#include <__ostream/basic_ostream.h>
 #include <__type_traits/is_convertible.h>
 #include <__utility/swap.h>
 #include <istream>
-#include <ostream>
 #include <string>
 #include <string_view>
 #include <version>
@@ -1270,6 +1270,7 @@ _LIBCPP_END_NAMESPACE_STD
 _LIBCPP_POP_MACROS
 
 #if _LIBCPP_STD_VER <= 20 && !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES)
+#  include <ostream>
 #  include <type_traits>
 #endif
 
diff --git a/libcxx/include/string b/libcxx/include/string
index 883bc1d7e5dc..1db803e822d7 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -662,7 +662,6 @@ _LIBCPP_PUSH_MACROS
 #else
 #  define _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS
 #endif
-#define _LIBCPP_SHORT_STRING_ANNOTATIONS_ALLOWED false
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
@@ -736,10 +735,44 @@ public:
   //
   // This string implementation doesn't contain any references into itself. It only contains a bit that says whether
   // it is in small or large string mode, so the entire structure is trivially relocatable if its members are.
+#if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
+  // When compiling with AddressSanitizer (ASan), basic_string cannot be trivially
+  // relocatable. Because the object's memory might be poisoned when its content
+  // is kept inside objects memory (short string optimization), instead of in allocated
+  // external memory. In such cases, the destructor is responsible for unpoisoning
+  // the memory to avoid triggering false positives.
+  // Therefore it's crucial to ensure the destructor is called.
+  using __trivially_relocatable = void;
+#else
   using __trivially_relocatable = __conditional_t<
       __libcpp_is_trivially_relocatable<allocator_type>::value && __libcpp_is_trivially_relocatable<pointer>::value,
       basic_string,
       void>;
+#endif
+#if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+  pointer __asan_volatile_wrapper(pointer const &__ptr) const {
+    if (__libcpp_is_constant_evaluated())
+      return __ptr;
+
+    pointer volatile __copy_ptr = __ptr;
+
+    return const_cast<pointer &>(__copy_ptr);
+  }
+
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+  const_pointer __asan_volatile_wrapper(const_pointer const &__ptr) const {
+    if (__libcpp_is_constant_evaluated())
+      return __ptr;
+
+    const_pointer volatile __copy_ptr = __ptr;
+
+    return const_cast<const_pointer &>(__copy_ptr);
+  }
+#define _LIBCPP_ASAN_VOLATILE_WRAPPER(PTR) __asan_volatile_wrapper(PTR)
+#else
+#define _LIBCPP_ASAN_VOLATILE_WRAPPER(PTR) PTR
+#endif
 
   static_assert((!is_array<value_type>::value), "Character type of basic_string must not be an array");
   static_assert((is_standard_layout<value_type>::value), "Character type of basic_string must be standard-layout");
@@ -1886,16 +1919,16 @@ private:
     __r_.first().__l.__data_ = __p;
   }
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pointer __get_long_pointer() _NOEXCEPT {
-    return __r_.first().__l.__data_;
+    return _LIBCPP_ASAN_VOLATILE_WRAPPER(__r_.first().__l.__data_);
   }
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_pointer __get_long_pointer() const _NOEXCEPT {
-    return __r_.first().__l.__data_;
+    return _LIBCPP_ASAN_VOLATILE_WRAPPER(__r_.first().__l.__data_);
   }
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pointer __get_short_pointer() _NOEXCEPT {
-    return pointer_traits<pointer>::pointer_to(__r_.first().__s.__data_[0]);
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS pointer __get_short_pointer() _NOEXCEPT {
+    return _LIBCPP_ASAN_VOLATILE_WRAPPER(pointer_traits<pointer>::pointer_to(__r_.first().__s.__data_[0]));
   }
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_pointer __get_short_pointer() const _NOEXCEPT {
-    return pointer_traits<const_pointer>::pointer_to(__r_.first().__s.__data_[0]);
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS const_pointer __get_short_pointer() const _NOEXCEPT {
+    return _LIBCPP_ASAN_VOLATILE_WRAPPER(pointer_traits<const_pointer>::pointer_to(__r_.first().__s.__data_[0]));
   }
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pointer __get_pointer() _NOEXCEPT {
     return __is_long() ? __get_long_pointer() : __get_short_pointer();
@@ -1914,22 +1947,17 @@ private:
 #endif
   }
 
-  // ASan: short string is poisoned if and only if this function returns true.
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __asan_short_string_is_annotated() const _NOEXCEPT {
-    return _LIBCPP_SHORT_STRING_ANNOTATIONS_ALLOWED && !__libcpp_is_constant_evaluated();
-  }
-
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_new(size_type __current_size) const _NOEXCEPT {
     (void)__current_size;
 #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
-    if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long()))
+    if (!__libcpp_is_constant_evaluated())
       __annotate_contiguous_container(data() + capacity() + 1, data() + __current_size + 1);
 #endif
   }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_delete() const _NOEXCEPT {
 #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
-    if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long()))
+    if (!__libcpp_is_constant_evaluated())
       __annotate_contiguous_container(data() + size() + 1, data() + capacity() + 1);
 #endif
   }
@@ -1937,7 +1965,7 @@ private:
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_increase(size_type __n) const _NOEXCEPT {
     (void)__n;
 #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
-    if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long()))
+    if (!__libcpp_is_constant_evaluated())
       __annotate_contiguous_container(data() + size() + 1, data() + size() + 1 + __n);
 #endif
   }
@@ -1945,7 +1973,7 @@ private:
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_shrink(size_type __old_size) const _NOEXCEPT {
     (void)__old_size;
 #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
-    if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long()))
+    if (!__libcpp_is_constant_evaluated())
       __annotate_contiguous_container(data() + __old_size + 1, data() + size() + 1);
 #endif
   }
@@ -1959,10 +1987,10 @@ private:
     if (__s < __min_cap) {
       return static_cast<size_type>(__min_cap) - 1;
     }
-    const size_type __boundary = sizeof(value_type) < __alignment ? __alignment / sizeof(value_type) : 1;
+    const size_type __boundary = sizeof(value_type) < __alignment ? __alignment / sizeof(value_type) : __endian_factor;
     size_type __guess          = __align_it<__boundary>(__s + 1) - 1;
     if (__guess == __min_cap)
-      ++__guess;
+      __guess += __endian_factor;
     return __guess;
   }
 
diff --git a/libcxx/include/syncstream b/libcxx/include/syncstream
index c54e8ce9f54c..e6f35b6f428e 100644
--- a/libcxx/include/syncstream
+++ b/libcxx/include/syncstream
@@ -117,7 +117,9 @@ namespace std {
 
 #include <__config>
 #include <__utility/move.h>
+#include <ios>
 #include <iosfwd> // required for declaration of default arguments
+#include <streambuf>
 #include <string>
 
 #ifndef _LIBCPP_HAS_NO_THREADS
diff --git a/libcxx/include/tuple b/libcxx/include/tuple
index c7fc5509a0f8..e7b43af7d13c 100644
--- a/libcxx/include/tuple
+++ b/libcxx/include/tuple
@@ -222,7 +222,6 @@ template <class... Types>
 #include <__tuple/tuple_like_ext.h>
 #include <__tuple/tuple_size.h>
 #include <__tuple/tuple_types.h>
-#include <__type_traits/apply_cv.h>
 #include <__type_traits/common_reference.h>
 #include <__type_traits/common_type.h>
 #include <__type_traits/conditional.h>
@@ -304,7 +303,7 @@ class __tuple_leaf {
 #  endif
   }
 
-  _LIBCPP_CONSTEXPR_SINCE_CXX14 __tuple_leaf& operator=(const __tuple_leaf&);
+  _LIBCPP_CONSTEXPR_SINCE_CXX14 __tuple_leaf& operator=(const __tuple_leaf&) = delete;
 
 public:
   _LIBCPP_HIDE_FROM_ABI constexpr __tuple_leaf() _NOEXCEPT_(is_nothrow_default_constructible<_Hp>::value) : __value_() {
@@ -380,7 +379,7 @@ public:
 
 template <size_t _Ip, class _Hp>
 class __tuple_leaf<_Ip, _Hp, true> : private _Hp {
-  _LIBCPP_CONSTEXPR_SINCE_CXX14 __tuple_leaf& operator=(const __tuple_leaf&);
+  _LIBCPP_CONSTEXPR_SINCE_CXX14 __tuple_leaf& operator=(const __tuple_leaf&) = delete;
 
 public:
   _LIBCPP_HIDE_FROM_ABI constexpr __tuple_leaf() _NOEXCEPT_(is_nothrow_default_constructible<_Hp>::value) {}
@@ -1286,14 +1285,14 @@ struct __tuple_cat_return_ref_imp;
 template <class... _Types, size_t... _I0, class _Tuple0>
 struct __tuple_cat_return_ref_imp<tuple<_Types...>, __tuple_indices<_I0...>, _Tuple0> {
   typedef _LIBCPP_NODEBUG __libcpp_remove_reference_t<_Tuple0> _T0;
-  typedef tuple<_Types..., __apply_cv_t<_Tuple0, typename tuple_element<_I0, _T0>::type>&&...> type;
+  typedef tuple<_Types..., __copy_cvref_t<_Tuple0, typename tuple_element<_I0, _T0>::type>&&...> type;
 };
 
 template <class... _Types, size_t... _I0, class _Tuple0, class _Tuple1, class... _Tuples>
 struct __tuple_cat_return_ref_imp<tuple<_Types...>, __tuple_indices<_I0...>, _Tuple0, _Tuple1, _Tuples...>
     : public __tuple_cat_return_ref_imp<
           tuple<_Types...,
-                __apply_cv_t<_Tuple0, typename tuple_element<_I0, __libcpp_remove_reference_t<_Tuple0>>::type>&&...>,
+                __copy_cvref_t<_Tuple0, typename tuple_element<_I0, __libcpp_remove_reference_t<_Tuple0>>::type>&&...>,
           typename __make_tuple_indices<tuple_size<__libcpp_remove_reference_t<_Tuple1> >::value>::type,
           _Tuple1,
           _Tuples...> {};
@@ -1327,7 +1326,7 @@ struct __tuple_cat<tuple<_Types...>, __tuple_indices<_I0...>, __tuple_indices<_J
     (void)__t; // avoid unused parameter warning on GCC when _I0 is empty
     typedef _LIBCPP_NODEBUG __libcpp_remove_reference_t<_Tuple0> _T0;
     typedef _LIBCPP_NODEBUG __libcpp_remove_reference_t<_Tuple1> _T1;
-    return __tuple_cat<tuple<_Types..., __apply_cv_t<_Tuple0, typename tuple_element<_J0, _T0>::type>&&...>,
+    return __tuple_cat<tuple<_Types..., __copy_cvref_t<_Tuple0, typename tuple_element<_J0, _T0>::type>&&...>,
                        typename __make_tuple_indices<sizeof...(_Types) + tuple_size<_T0>::value>::type,
                        typename __make_tuple_indices<tuple_size<_T1>::value>::type>()(
         std::forward_as_tuple(
@@ -1375,22 +1374,22 @@ inline _LIBCPP_HIDE_FROM_ABI constexpr _Tp __make_from_tuple_impl(_Tuple&& __t,
 }
 #else
 template <class _Tp, class _Tuple, size_t... _Idx>
-inline _LIBCPP_HIDE_FROM_ABI constexpr _Tp __make_from_tuple_impl(_Tuple&& __t, __tuple_indices<_Idx...>, 
+inline _LIBCPP_HIDE_FROM_ABI constexpr _Tp __make_from_tuple_impl(_Tuple&& __t, __tuple_indices<_Idx...>,
     enable_if_t<is_constructible_v<_Tp, decltype(std::get<_Idx>(std::forward<_Tuple>(__t)))...>> * = nullptr)
     _LIBCPP_NOEXCEPT_RETURN(_Tp(std::get<_Idx>(std::forward<_Tuple>(__t))...))
 #endif // _LIBCPP_STD_VER >= 20
 
-template <class _Tp, class _Tuple, 
+template <class _Tp, class _Tuple,
           class _Seq = typename __make_tuple_indices<tuple_size_v<remove_reference_t<_Tuple>>>::type, class = void>
 inline constexpr bool __can_make_from_tuple = false;
 
 template <class _Tp, class _Tuple, size_t... _Idx>
-inline constexpr bool __can_make_from_tuple<_Tp, _Tuple, __tuple_indices<_Idx...>, 
+inline constexpr bool __can_make_from_tuple<_Tp, _Tuple, __tuple_indices<_Idx...>,
     enable_if_t<is_constructible_v<_Tp, decltype(std::get<_Idx>(std::declval<_Tuple>()))...>>> = true;
 
-// Based on LWG3528(https://wg21.link/LWG3528) and http://eel.is/c++draft/description#structure.requirements-9, 
-// the standard allows to impose requirements, we constraint std::make_from_tuple to make std::make_from_tuple 
-// SFINAE friendly and also avoid worse diagnostic messages. We still keep the constraints of std::__make_from_tuple_impl 
+// Based on LWG3528(https://wg21.link/LWG3528) and http://eel.is/c++draft/description#structure.requirements-9,
+// the standard allows to impose requirements, we constraint std::make_from_tuple to make std::make_from_tuple
+// SFINAE friendly and also avoid worse diagnostic messages. We still keep the constraints of std::__make_from_tuple_impl
 // so that std::__make_from_tuple_impl will have the same advantages when used alone.
 #if _LIBCPP_STD_VER >= 20
 template <class _Tp, class _Tuple>
diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits
index 10f9b881c0e4..aee9fcf4137f 100644
--- a/libcxx/include/type_traits
+++ b/libcxx/include/type_traits
@@ -428,7 +428,6 @@ namespace std
 #include <__type_traits/aligned_storage.h>
 #include <__type_traits/aligned_union.h>
 #include <__type_traits/alignment_of.h>
-#include <__type_traits/apply_cv.h>
 #include <__type_traits/can_extract_key.h>
 #include <__type_traits/common_reference.h>
 #include <__type_traits/common_type.h>
diff --git a/libcxx/include/unordered_map b/libcxx/include/unordered_map
index 8c21d703a5c0..c838cd96b112 100644
--- a/libcxx/include/unordered_map
+++ b/libcxx/include/unordered_map
@@ -1384,13 +1384,11 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __k) { return __table_.find(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __k) const { return __table_.find(__k); }
 #if _LIBCPP_STD_VER >= 20
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI iterator find(const _K2& __k) {
     return __table_.find(__k);
   }
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const _K2& __k) const {
     return __table_.find(__k);
   }
@@ -1398,8 +1396,7 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __k) const { return __table_.__count_unique(__k); }
 #if _LIBCPP_STD_VER >= 20
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI size_type count(const _K2& __k) const {
     return __table_.__count_unique(__k);
   }
@@ -1408,8 +1405,7 @@ public:
 #if _LIBCPP_STD_VER >= 20
   _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __k) const { return find(__k) != end(); }
 
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI bool contains(const _K2& __k) const {
     return find(__k) != end();
   }
@@ -1423,12 +1419,12 @@ public:
   }
 #if _LIBCPP_STD_VER >= 20
   template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+            enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const _K2& __k) {
     return __table_.__equal_range_unique(__k);
   }
   template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+            enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const _K2& __k) const {
     return __table_.__equal_range_unique(__k);
   }
@@ -2135,13 +2131,11 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __k) { return __table_.find(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __k) const { return __table_.find(__k); }
 #if _LIBCPP_STD_VER >= 20
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI iterator find(const _K2& __k) {
     return __table_.find(__k);
   }
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const _K2& __k) const {
     return __table_.find(__k);
   }
@@ -2149,8 +2143,7 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __k) const { return __table_.__count_multi(__k); }
 #if _LIBCPP_STD_VER >= 20
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI size_type count(const _K2& __k) const {
     return __table_.__count_multi(__k);
   }
@@ -2159,8 +2152,7 @@ public:
 #if _LIBCPP_STD_VER >= 20
   _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __k) const { return find(__k) != end(); }
 
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI bool contains(const _K2& __k) const {
     return find(__k) != end();
   }
@@ -2174,12 +2166,12 @@ public:
   }
 #if _LIBCPP_STD_VER >= 20
   template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+            enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const _K2& __k) {
     return __table_.__equal_range_multi(__k);
   }
   template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+            enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const _K2& __k) const {
     return __table_.__equal_range_multi(__k);
   }
diff --git a/libcxx/include/unordered_set b/libcxx/include/unordered_set
index 69fe6b768788..5de1458beb1e 100644
--- a/libcxx/include/unordered_set
+++ b/libcxx/include/unordered_set
@@ -839,13 +839,11 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __k) { return __table_.find(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __k) const { return __table_.find(__k); }
 #if _LIBCPP_STD_VER >= 20
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI iterator find(const _K2& __k) {
     return __table_.find(__k);
   }
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const _K2& __k) const {
     return __table_.find(__k);
   }
@@ -853,8 +851,7 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __k) const { return __table_.__count_unique(__k); }
 #if _LIBCPP_STD_VER >= 20
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI size_type count(const _K2& __k) const {
     return __table_.__count_unique(__k);
   }
@@ -863,8 +860,7 @@ public:
 #if _LIBCPP_STD_VER >= 20
   _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __k) const { return find(__k) != end(); }
 
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI bool contains(const _K2& __k) const {
     return find(__k) != end();
   }
@@ -877,13 +873,11 @@ public:
     return __table_.__equal_range_unique(__k);
   }
 #if _LIBCPP_STD_VER >= 20
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const _K2& __k) {
     return __table_.__equal_range_unique(__k);
   }
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const _K2& __k) const {
     return __table_.__equal_range_unique(__k);
   }
@@ -1442,13 +1436,11 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __k) { return __table_.find(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __k) const { return __table_.find(__k); }
 #if _LIBCPP_STD_VER >= 20
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI iterator find(const _K2& __k) {
     return __table_.find(__k);
   }
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const _K2& __k) const {
     return __table_.find(__k);
   }
@@ -1456,8 +1448,7 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __k) const { return __table_.__count_multi(__k); }
 #if _LIBCPP_STD_VER >= 20
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI size_type count(const _K2& __k) const {
     return __table_.__count_multi(__k);
   }
@@ -1466,8 +1457,7 @@ public:
 #if _LIBCPP_STD_VER >= 20
   _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __k) const { return find(__k) != end(); }
 
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI bool contains(const _K2& __k) const {
     return find(__k) != end();
   }
@@ -1480,13 +1470,11 @@ public:
     return __table_.__equal_range_multi(__k);
   }
 #if _LIBCPP_STD_VER >= 20
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const _K2& __k) {
     return __table_.__equal_range_multi(__k);
   }
-  template <class _K2,
-            enable_if_t<__is_transparent<hasher, _K2>::value && __is_transparent<key_equal, _K2>::value>* = nullptr>
+  template <class _K2, enable_if_t<__is_transparent_v<hasher, _K2> && __is_transparent_v<key_equal, _K2>>* = nullptr>
   _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const _K2& __k) const {
     return __table_.__equal_range_multi(__k);
   }
diff --git a/libcxx/include/variant b/libcxx/include/variant
index 34150bd45284..631ffceab5f6 100644
--- a/libcxx/include/variant
+++ b/libcxx/include/variant
@@ -42,26 +42,28 @@ namespace std {
         in_place_index_t<I>, initializer_list<U>, Args&&...);
 
     // 20.7.2.2, destructor
-    ~variant();
+    constexpr ~variant();                                             // constexpr since c++20
 
     // 20.7.2.3, assignment
     constexpr variant& operator=(const variant&);
     constexpr variant& operator=(variant&&) noexcept(see below);
 
-    template <class T> variant& operator=(T&&) noexcept(see below);
+    template <class T>
+    constexpr variant& operator=(T&&) noexcept(see below);            // constexpr since c++20
 
     // 20.7.2.4, modifiers
     template <class T, class... Args>
-    T& emplace(Args&&...);
+    constexpr T& emplace(Args&&...);                                  // constexpr since c++20
 
     template <class T, class U, class... Args>
-    T& emplace(initializer_list<U>, Args&&...);
+    constexpr T& emplace(initializer_list<U>, Args&&...);             // constexpr since c++20
 
     template <size_t I, class... Args>
-    variant_alternative_t<I, variant>& emplace(Args&&...);
+    constexpr variant_alternative_t<I, variant>& emplace(Args&&...);  // constexpr since c++20
 
     template <size_t I, class U, class...  Args>
-    variant_alternative_t<I, variant>& emplace(initializer_list<U>, Args&&...);
+    constexpr variant_alternative_t<I, variant>&
+        emplace(initializer_list<U>, Args&&...);                      // constexpr since c++20
 
     // 20.7.2.5, value status
     constexpr bool valueless_by_exception() const noexcept;
@@ -221,6 +223,7 @@ namespace std {
 #include <__functional/operations.h>
 #include <__functional/unary_function.h>
 #include <__memory/addressof.h>
+#include <__memory/construct_at.h>
 #include <__tuple/find_index.h>
 #include <__tuple/sfinae_helpers.h>
 #include <__type_traits/add_const.h>
@@ -663,7 +666,8 @@ private:
 
 template <size_t _Index, class _Tp>
 struct _LIBCPP_TEMPLATE_VIS __alt {
-  using __value_type = _Tp;
+  using __value_type              = _Tp;
+  static constexpr size_t __index = _Index;
 
   template <class... _Args>
   _LIBCPP_HIDE_FROM_ABI explicit constexpr __alt(in_place_t, _Args&&... __args)
@@ -678,7 +682,7 @@ union _LIBCPP_TEMPLATE_VIS __union;
 template <_Trait _DestructibleTrait, size_t _Index>
 union _LIBCPP_TEMPLATE_VIS __union<_DestructibleTrait, _Index> {};
 
-#  define _LIBCPP_VARIANT_UNION(destructible_trait, destructor)                                                        \
+#  define _LIBCPP_VARIANT_UNION(destructible_trait, destructor_definition)                                             \
     template <size_t _Index, class _Tp, class... _Types>                                                               \
     union _LIBCPP_TEMPLATE_VIS __union<destructible_trait, _Index, _Tp, _Types...> {                                   \
     public:                                                                                                            \
@@ -692,13 +696,11 @@ union _LIBCPP_TEMPLATE_VIS __union<_DestructibleTrait, _Index> {};
       _LIBCPP_HIDE_FROM_ABI explicit constexpr __union(in_place_index_t<_Ip>, _Args&&... __args)                       \
           : __tail(in_place_index<_Ip - 1>, std::forward<_Args>(__args)...) {}                                         \
                                                                                                                        \
-      __union(const __union&) = default;                                                                               \
-      __union(__union&&)      = default;                                                                               \
-                                                                                                                       \
-      destructor;                                                                                                      \
-                                                                                                                       \
-      __union& operator=(const __union&) = default;                                                                    \
-      __union& operator=(__union&&)      = default;                                                                    \
+      _LIBCPP_HIDE_FROM_ABI __union(const __union&)            = default;                                              \
+      _LIBCPP_HIDE_FROM_ABI __union(__union&&)                 = default;                                              \
+      _LIBCPP_HIDE_FROM_ABI __union& operator=(const __union&) = default;                                              \
+      _LIBCPP_HIDE_FROM_ABI __union& operator=(__union&&)      = default;                                              \
+      destructor_definition;                                                                                           \
                                                                                                                        \
     private:                                                                                                           \
       char __dummy;                                                                                                    \
@@ -708,10 +710,11 @@ union _LIBCPP_TEMPLATE_VIS __union<_DestructibleTrait, _Index> {};
       friend struct __access::__union;                                                                                 \
     }
 
-_LIBCPP_VARIANT_UNION(_Trait::_TriviallyAvailable, ~__union() = default);
+_LIBCPP_VARIANT_UNION(_Trait::_TriviallyAvailable,
+                      _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 ~__union() = default);
 _LIBCPP_VARIANT_UNION(
-    _Trait::_Available, _LIBCPP_HIDE_FROM_ABI ~__union() {} _LIBCPP_EAT_SEMICOLON);
-_LIBCPP_VARIANT_UNION(_Trait::_Unavailable, ~__union() = delete);
+    _Trait::_Available, _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 ~__union() {} _LIBCPP_EAT_SEMICOLON);
+_LIBCPP_VARIANT_UNION(_Trait::_Unavailable, _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 ~__union() = delete);
 
 #  undef _LIBCPP_VARIANT_UNION
 
@@ -754,7 +757,7 @@ protected:
 template <class _Traits, _Trait = _Traits::__destructible_trait>
 class _LIBCPP_TEMPLATE_VIS __dtor;
 
-#  define _LIBCPP_VARIANT_DESTRUCTOR(destructible_trait, destructor, destroy)                                          \
+#  define _LIBCPP_VARIANT_DESTRUCTOR(destructible_trait, destructor_definition, destroy)                               \
     template <class... _Types>                                                                                         \
     class _LIBCPP_TEMPLATE_VIS __dtor<__traits<_Types...>, destructible_trait>                                         \
         : public __base<destructible_trait, _Types...> {                                                               \
@@ -764,28 +767,27 @@ class _LIBCPP_TEMPLATE_VIS __dtor;
     public:                                                                                                            \
       using __base_type::__base_type;                                                                                  \
       using __base_type::operator=;                                                                                    \
-                                                                                                                       \
-      __dtor(const __dtor&)            = default;                                                                      \
-      __dtor(__dtor&&)                 = default;                                                                      \
-      __dtor& operator=(const __dtor&) = default;                                                                      \
-      __dtor& operator=(__dtor&&)      = default;                                                                      \
-      destructor;                                                                                                      \
+      _LIBCPP_HIDE_FROM_ABI __dtor(const __dtor&)            = default;                                                \
+      _LIBCPP_HIDE_FROM_ABI __dtor(__dtor&&)                 = default;                                                \
+      _LIBCPP_HIDE_FROM_ABI __dtor& operator=(const __dtor&) = default;                                                \
+      _LIBCPP_HIDE_FROM_ABI __dtor& operator=(__dtor&&)      = default;                                                \
+      destructor_definition;                                                                                           \
                                                                                                                        \
     protected:                                                                                                         \
-      inline _LIBCPP_HIDE_FROM_ABI destroy;                                                                            \
+      destroy;                                                                                                         \
     }
 
 _LIBCPP_VARIANT_DESTRUCTOR(
     _Trait::_TriviallyAvailable,
-    ~__dtor() = default, //
-    _LIBCPP_HIDE_FROM_ABI void __destroy() noexcept {
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 ~__dtor() = default,
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __destroy() noexcept {
       this->__index = __variant_npos<__index_t>;
     } _LIBCPP_EAT_SEMICOLON);
 
 _LIBCPP_VARIANT_DESTRUCTOR(
     _Trait::_Available,
-    _LIBCPP_HIDE_FROM_ABI ~__dtor() { __destroy(); } _LIBCPP_EAT_SEMICOLON,
-    _LIBCPP_HIDE_FROM_ABI void __destroy() noexcept {
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 ~__dtor() { __destroy(); } _LIBCPP_EAT_SEMICOLON,
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __destroy() noexcept {
       if (!this->valueless_by_exception()) {
         __visitation::__base::__visit_alt(
             [](auto& __alt) noexcept {
@@ -797,7 +799,9 @@ _LIBCPP_VARIANT_DESTRUCTOR(
       this->__index = __variant_npos<__index_t>;
     } _LIBCPP_EAT_SEMICOLON);
 
-_LIBCPP_VARIANT_DESTRUCTOR(_Trait::_Unavailable, ~__dtor() = delete, void __destroy() noexcept = delete);
+_LIBCPP_VARIANT_DESTRUCTOR(_Trait::_Unavailable,
+                           _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 ~__dtor()                 = delete,
+                           _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __destroy() noexcept = delete);
 
 #  undef _LIBCPP_VARIANT_DESTRUCTOR
 
@@ -810,23 +814,18 @@ public:
   using __base_type::operator=;
 
 protected:
-  template <size_t _Ip, class _Tp, class... _Args>
-  _LIBCPP_HIDE_FROM_ABI static _Tp& __construct_alt(__alt<_Ip, _Tp>& __a, _Args&&... __args) {
-    ::new ((void*)std::addressof(__a)) __alt<_Ip, _Tp>(in_place, std::forward<_Args>(__args)...);
-    return __a.__value;
-  }
-
   template <class _Rhs>
-  _LIBCPP_HIDE_FROM_ABI static void __generic_construct(__ctor& __lhs, _Rhs&& __rhs) {
+  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX20 void __generic_construct(__ctor& __lhs, _Rhs&& __rhs) {
     __lhs.__destroy();
     if (!__rhs.valueless_by_exception()) {
       auto __rhs_index = __rhs.index();
       __visitation::__base::__visit_alt_at(
           __rhs_index,
-          [](auto& __lhs_alt, auto&& __rhs_alt) {
-            __construct_alt(__lhs_alt, std::forward<decltype(__rhs_alt)>(__rhs_alt).__value);
+          [&__lhs](auto&& __rhs_alt) {
+            std::__construct_at(std::addressof(__lhs.__data),
+                                in_place_index<__decay_t<decltype(__rhs_alt)>::__index>,
+                                std::forward<decltype(__rhs_alt)>(__rhs_alt).__value);
           },
-          __lhs,
           std::forward<_Rhs>(__rhs));
       __lhs.__index = __rhs_index;
     }
@@ -836,7 +835,7 @@ protected:
 template <class _Traits, _Trait = _Traits::__move_constructible_trait>
 class _LIBCPP_TEMPLATE_VIS __move_constructor;
 
-#  define _LIBCPP_VARIANT_MOVE_CONSTRUCTOR(move_constructible_trait, move_constructor)                                 \
+#  define _LIBCPP_VARIANT_MOVE_CONSTRUCTOR(move_constructible_trait, move_constructor_definition)                      \
     template <class... _Types>                                                                                         \
     class _LIBCPP_TEMPLATE_VIS __move_constructor<__traits<_Types...>, move_constructible_trait>                       \
         : public __ctor<__traits<_Types...>> {                                                                         \
@@ -846,32 +845,35 @@ class _LIBCPP_TEMPLATE_VIS __move_constructor;
       using __base_type::__base_type;                                                                                  \
       using __base_type::operator=;                                                                                    \
                                                                                                                        \
-      __move_constructor(const __move_constructor&)            = default;                                              \
-      ~__move_constructor()                                    = default;                                              \
-      __move_constructor& operator=(const __move_constructor&) = default;                                              \
-      __move_constructor& operator=(__move_constructor&&)      = default;                                              \
-      move_constructor;                                                                                                \
+      _LIBCPP_HIDE_FROM_ABI __move_constructor(const __move_constructor&)            = default;                        \
+      _LIBCPP_HIDE_FROM_ABI ~__move_constructor()                                    = default;                        \
+      _LIBCPP_HIDE_FROM_ABI __move_constructor& operator=(const __move_constructor&) = default;                        \
+      _LIBCPP_HIDE_FROM_ABI __move_constructor& operator=(__move_constructor&&)      = default;                        \
+      move_constructor_definition;                                                                                     \
     }
 
-_LIBCPP_VARIANT_MOVE_CONSTRUCTOR(_Trait::_TriviallyAvailable,
-                                 __move_constructor(__move_constructor&& __that) = default);
+_LIBCPP_VARIANT_MOVE_CONSTRUCTOR(
+    _Trait::_TriviallyAvailable,
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __move_constructor(__move_constructor&& __that) = default);
 
 _LIBCPP_VARIANT_MOVE_CONSTRUCTOR(
     _Trait::_Available,
-    _LIBCPP_HIDE_FROM_ABI __move_constructor(__move_constructor&& __that) noexcept(
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __move_constructor(__move_constructor&& __that) noexcept(
         __all<is_nothrow_move_constructible_v<_Types>...>::value)
     : __move_constructor(__valueless_t{}) {
       this->__generic_construct(*this, std::move(__that));
     } _LIBCPP_EAT_SEMICOLON);
 
-_LIBCPP_VARIANT_MOVE_CONSTRUCTOR(_Trait::_Unavailable, __move_constructor(__move_constructor&&) = delete);
+_LIBCPP_VARIANT_MOVE_CONSTRUCTOR(
+    _Trait::_Unavailable,
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __move_constructor(__move_constructor&&) = delete);
 
 #  undef _LIBCPP_VARIANT_MOVE_CONSTRUCTOR
 
 template <class _Traits, _Trait = _Traits::__copy_constructible_trait>
 class _LIBCPP_TEMPLATE_VIS __copy_constructor;
 
-#  define _LIBCPP_VARIANT_COPY_CONSTRUCTOR(copy_constructible_trait, copy_constructor)                                 \
+#  define _LIBCPP_VARIANT_COPY_CONSTRUCTOR(copy_constructible_trait, copy_constructor_definition)                      \
     template <class... _Types>                                                                                         \
     class _LIBCPP_TEMPLATE_VIS __copy_constructor<__traits<_Types...>, copy_constructible_trait>                       \
         : public __move_constructor<__traits<_Types...>> {                                                             \
@@ -881,21 +883,25 @@ class _LIBCPP_TEMPLATE_VIS __copy_constructor;
       using __base_type::__base_type;                                                                                  \
       using __base_type::operator=;                                                                                    \
                                                                                                                        \
-      __copy_constructor(__copy_constructor&&)                 = default;                                              \
-      ~__copy_constructor()                                    = default;                                              \
-      __copy_constructor& operator=(const __copy_constructor&) = default;                                              \
-      __copy_constructor& operator=(__copy_constructor&&)      = default;                                              \
-      copy_constructor;                                                                                                \
-    } // namespace __variant_detail
+      _LIBCPP_HIDE_FROM_ABI __copy_constructor(__copy_constructor&&)                 = default;                        \
+      _LIBCPP_HIDE_FROM_ABI ~__copy_constructor()                                    = default;                        \
+      _LIBCPP_HIDE_FROM_ABI __copy_constructor& operator=(const __copy_constructor&) = default;                        \
+      _LIBCPP_HIDE_FROM_ABI __copy_constructor& operator=(__copy_constructor&&)      = default;                        \
+      copy_constructor_definition;                                                                                     \
+    }
 
-_LIBCPP_VARIANT_COPY_CONSTRUCTOR(_Trait::_TriviallyAvailable,
-                                 __copy_constructor(const __copy_constructor& __that) = default);
+_LIBCPP_VARIANT_COPY_CONSTRUCTOR(
+    _Trait::_TriviallyAvailable,
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __copy_constructor(const __copy_constructor& __that) = default);
 
 _LIBCPP_VARIANT_COPY_CONSTRUCTOR(
-    _Trait::_Available, _LIBCPP_HIDE_FROM_ABI __copy_constructor(const __copy_constructor& __that)
+    _Trait::_Available,
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __copy_constructor(const __copy_constructor& __that)
     : __copy_constructor(__valueless_t{}) { this->__generic_construct(*this, __that); } _LIBCPP_EAT_SEMICOLON);
 
-_LIBCPP_VARIANT_COPY_CONSTRUCTOR(_Trait::_Unavailable, __copy_constructor(const __copy_constructor&) = delete);
+_LIBCPP_VARIANT_COPY_CONSTRUCTOR(
+    _Trait::_Unavailable,
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __copy_constructor(const __copy_constructor&) = delete);
 
 #  undef _LIBCPP_VARIANT_COPY_CONSTRUCTOR
 
@@ -908,22 +914,24 @@ public:
   using __base_type::operator=;
 
   template <size_t _Ip, class... _Args>
-  _LIBCPP_HIDE_FROM_ABI auto& __emplace(_Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 auto& __emplace(_Args&&... __args) {
     this->__destroy();
-    auto& __res   = this->__construct_alt(__access::__base::__get_alt<_Ip>(*this), std::forward<_Args>(__args)...);
+    std::__construct_at(std::addressof(this->__data), in_place_index<_Ip>, std::forward<_Args>(__args)...);
     this->__index = _Ip;
-    return __res;
+    return __access::__base::__get_alt<_Ip>(*this).__value;
   }
 
 protected:
   template <size_t _Ip, class _Tp, class _Arg>
-  _LIBCPP_HIDE_FROM_ABI void __assign_alt(__alt<_Ip, _Tp>& __a, _Arg&& __arg) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __assign_alt(__alt<_Ip, _Tp>& __a, _Arg&& __arg) {
     if (this->index() == _Ip) {
       __a.__value = std::forward<_Arg>(__arg);
     } else {
       struct {
-        _LIBCPP_HIDDEN void operator()(true_type) const { __this->__emplace<_Ip>(std::forward<_Arg>(__arg)); }
-        _LIBCPP_HIDDEN void operator()(false_type) const {
+        _LIBCPP_HIDDEN _LIBCPP_CONSTEXPR_SINCE_CXX20 void operator()(true_type) const {
+          __this->__emplace<_Ip>(std::forward<_Arg>(__arg));
+        }
+        _LIBCPP_HIDDEN _LIBCPP_CONSTEXPR_SINCE_CXX20 void operator()(false_type) const {
           __this->__emplace<_Ip>(_Tp(std::forward<_Arg>(__arg)));
         }
         __assignment* __this;
@@ -934,7 +942,7 @@ protected:
   }
 
   template <class _That>
-  _LIBCPP_HIDE_FROM_ABI void __generic_assign(_That&& __that) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __generic_assign(_That&& __that) {
     if (this->valueless_by_exception() && __that.valueless_by_exception()) {
       // do nothing.
     } else if (__that.valueless_by_exception()) {
@@ -954,7 +962,7 @@ protected:
 template <class _Traits, _Trait = _Traits::__move_assignable_trait>
 class _LIBCPP_TEMPLATE_VIS __move_assignment;
 
-#  define _LIBCPP_VARIANT_MOVE_ASSIGNMENT(move_assignable_trait, move_assignment)                                      \
+#  define _LIBCPP_VARIANT_MOVE_ASSIGNMENT(move_assignable_trait, move_assignment_definition)                           \
     template <class... _Types>                                                                                         \
     class _LIBCPP_TEMPLATE_VIS __move_assignment<__traits<_Types...>, move_assignable_trait>                           \
         : public __assignment<__traits<_Types...>> {                                                                   \
@@ -964,33 +972,36 @@ class _LIBCPP_TEMPLATE_VIS __move_assignment;
       using __base_type::__base_type;                                                                                  \
       using __base_type::operator=;                                                                                    \
                                                                                                                        \
-      __move_assignment(const __move_assignment&)            = default;                                                \
-      __move_assignment(__move_assignment&&)                 = default;                                                \
-      ~__move_assignment()                                   = default;                                                \
-      __move_assignment& operator=(const __move_assignment&) = default;                                                \
-      move_assignment;                                                                                                 \
+      _LIBCPP_HIDE_FROM_ABI __move_assignment(const __move_assignment&)            = default;                          \
+      _LIBCPP_HIDE_FROM_ABI __move_assignment(__move_assignment&&)                 = default;                          \
+      _LIBCPP_HIDE_FROM_ABI ~__move_assignment()                                   = default;                          \
+      _LIBCPP_HIDE_FROM_ABI __move_assignment& operator=(const __move_assignment&) = default;                          \
+      move_assignment_definition;                                                                                      \
     }
 
 _LIBCPP_VARIANT_MOVE_ASSIGNMENT(_Trait::_TriviallyAvailable,
-                                __move_assignment& operator=(__move_assignment&& __that) = default);
+                                _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __move_assignment& operator=(
+                                    __move_assignment&& __that) = default);
 
 _LIBCPP_VARIANT_MOVE_ASSIGNMENT(
     _Trait::_Available,
-    _LIBCPP_HIDE_FROM_ABI __move_assignment&
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __move_assignment&
     operator=(__move_assignment&& __that) noexcept(
         __all<(is_nothrow_move_constructible_v<_Types> && is_nothrow_move_assignable_v<_Types>)...>::value) {
       this->__generic_assign(std::move(__that));
       return *this;
     } _LIBCPP_EAT_SEMICOLON);
 
-_LIBCPP_VARIANT_MOVE_ASSIGNMENT(_Trait::_Unavailable, __move_assignment& operator=(__move_assignment&&) = delete);
+_LIBCPP_VARIANT_MOVE_ASSIGNMENT(
+    _Trait::_Unavailable,
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __move_assignment& operator=(__move_assignment&&) = delete);
 
 #  undef _LIBCPP_VARIANT_MOVE_ASSIGNMENT
 
 template <class _Traits, _Trait = _Traits::__copy_assignable_trait>
 class _LIBCPP_TEMPLATE_VIS __copy_assignment;
 
-#  define _LIBCPP_VARIANT_COPY_ASSIGNMENT(copy_assignable_trait, copy_assignment)                                      \
+#  define _LIBCPP_VARIANT_COPY_ASSIGNMENT(copy_assignable_trait, copy_assignment_definition)                           \
     template <class... _Types>                                                                                         \
     class _LIBCPP_TEMPLATE_VIS __copy_assignment<__traits<_Types...>, copy_assignable_trait>                           \
         : public __move_assignment<__traits<_Types...>> {                                                              \
@@ -1000,23 +1011,28 @@ class _LIBCPP_TEMPLATE_VIS __copy_assignment;
       using __base_type::__base_type;                                                                                  \
       using __base_type::operator=;                                                                                    \
                                                                                                                        \
-      __copy_assignment(const __copy_assignment&)       = default;                                                     \
-      __copy_assignment(__copy_assignment&&)            = default;                                                     \
-      ~__copy_assignment()                              = default;                                                     \
-      __copy_assignment& operator=(__copy_assignment&&) = default;                                                     \
-      copy_assignment;                                                                                                 \
+      _LIBCPP_HIDE_FROM_ABI __copy_assignment(const __copy_assignment&)       = default;                               \
+      _LIBCPP_HIDE_FROM_ABI __copy_assignment(__copy_assignment&&)            = default;                               \
+      _LIBCPP_HIDE_FROM_ABI ~__copy_assignment()                              = default;                               \
+      _LIBCPP_HIDE_FROM_ABI __copy_assignment& operator=(__copy_assignment&&) = default;                               \
+      copy_assignment_definition;                                                                                      \
     }
 
 _LIBCPP_VARIANT_COPY_ASSIGNMENT(_Trait::_TriviallyAvailable,
-                                __copy_assignment& operator=(const __copy_assignment& __that) = default);
+                                _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __copy_assignment& operator=(
+                                    const __copy_assignment& __that) = default);
 
 _LIBCPP_VARIANT_COPY_ASSIGNMENT(
-    _Trait::_Available, _LIBCPP_HIDE_FROM_ABI __copy_assignment& operator=(const __copy_assignment& __that) {
+    _Trait::_Available,
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __copy_assignment&
+    operator=(const __copy_assignment& __that) {
       this->__generic_assign(__that);
       return *this;
     } _LIBCPP_EAT_SEMICOLON);
 
-_LIBCPP_VARIANT_COPY_ASSIGNMENT(_Trait::_Unavailable, __copy_assignment& operator=(const __copy_assignment&) = delete);
+_LIBCPP_VARIANT_COPY_ASSIGNMENT(_Trait::_Unavailable,
+                                _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __copy_assignment& operator=(
+                                    const __copy_assignment&) = delete);
 
 #  undef _LIBCPP_VARIANT_COPY_ASSIGNMENT
 
@@ -1032,11 +1048,11 @@ public:
   _LIBCPP_HIDE_FROM_ABI __impl& operator=(__impl&&)      = default;
 
   template <size_t _Ip, class _Arg>
-  _LIBCPP_HIDE_FROM_ABI void __assign(_Arg&& __arg) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __assign(_Arg&& __arg) {
     this->__assign_alt(__access::__base::__get_alt<_Ip>(*this), std::forward<_Arg>(__arg));
   }
 
-  inline _LIBCPP_HIDE_FROM_ABI void __swap(__impl& __that) {
+  inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __swap(__impl& __that) {
     if (this->valueless_by_exception() && __that.valueless_by_exception()) {
       // do nothing.
     } else if (this->index() == __that.index()) {
@@ -1081,7 +1097,7 @@ public:
   }
 
 private:
-  inline _LIBCPP_HIDE_FROM_ABI bool __move_nothrow() const {
+  constexpr inline _LIBCPP_HIDE_FROM_ABI bool __move_nothrow() const {
     constexpr bool __results[] = {is_nothrow_move_constructible_v<_Types>...};
     return this->valueless_by_exception() || __results[this->index()];
   }
@@ -1223,7 +1239,7 @@ public:
       _Args&&... __args) noexcept(is_nothrow_constructible_v<_Tp, initializer_list< _Up>&, _Args...>)
       : __impl_(in_place_index<_Ip>, __il, std::forward<_Args>(__args)...) {}
 
-  _LIBCPP_HIDE_FROM_ABI ~variant() = default;
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 ~variant() = default;
 
   _LIBCPP_HIDE_FROM_ABI constexpr variant& operator=(const variant&) = default;
   _LIBCPP_HIDE_FROM_ABI constexpr variant& operator=(variant&&)      = default;
@@ -1233,7 +1249,7 @@ public:
              class _Tp  = __variant_detail::__best_match_t<_Arg, _Types...>,
              size_t _Ip = __find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
              enable_if_t<is_assignable_v<_Tp&, _Arg> && is_constructible_v<_Tp, _Arg>, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI variant&
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 variant&
   operator=(_Arg&& __arg) noexcept(is_nothrow_assignable_v<_Tp&, _Arg> && is_nothrow_constructible_v<_Tp, _Arg>) {
     __impl_.template __assign<_Ip>(std::forward<_Arg>(__arg));
     return *this;
@@ -1244,7 +1260,7 @@ public:
              enable_if_t<(_Ip < sizeof...(_Types)), int>         = 0,
              class _Tp                                           = variant_alternative_t<_Ip, variant<_Types...>>,
              enable_if_t<is_constructible_v<_Tp, _Args...>, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI _Tp& emplace(_Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& emplace(_Args&&... __args) {
     return __impl_.template __emplace<_Ip>(std::forward<_Args>(__args)...);
   }
 
@@ -1254,7 +1270,7 @@ public:
              enable_if_t<(_Ip < sizeof...(_Types)), int> = 0,
              class _Tp                                   = variant_alternative_t<_Ip, variant<_Types...>>,
              enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...>, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI _Tp& emplace(initializer_list<_Up> __il, _Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& emplace(initializer_list<_Up> __il, _Args&&... __args) {
     return __impl_.template __emplace<_Ip>(__il, std::forward<_Args>(__args)...);
   }
 
@@ -1262,7 +1278,7 @@ public:
              class... _Args,
              size_t _Ip = __find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
              enable_if_t<is_constructible_v<_Tp, _Args...>, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI _Tp& emplace(_Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& emplace(_Args&&... __args) {
     return __impl_.template __emplace<_Ip>(std::forward<_Args>(__args)...);
   }
 
@@ -1271,7 +1287,7 @@ public:
              class... _Args,
              size_t _Ip = __find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
              enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...>, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI _Tp& emplace(initializer_list<_Up> __il, _Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& emplace(initializer_list<_Up> __il, _Args&&... __args) {
     return __impl_.template __emplace<_Ip>(__il, std::forward<_Args>(__args)...);
   }
 
@@ -1285,7 +1301,7 @@ public:
              enable_if_t< __all<(__dependent_type<is_move_constructible<_Types>, _Dummy>::value &&
                                  __dependent_type<is_swappable<_Types>, _Dummy>::value)...>::value,
                           int> = 0>
-  _LIBCPP_HIDE_FROM_ABI void swap(variant& __that) noexcept(
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(variant& __that) noexcept(
       __all<(is_nothrow_move_constructible_v<_Types> && is_nothrow_swappable_v<_Types>)...>::value) {
     __impl_.__swap(__that.__impl_);
   }
@@ -1568,7 +1584,7 @@ visit(_Visitor&& __visitor, _Vs&&... __vs) {
 #  endif
 
 template <class... _Types>
-_LIBCPP_HIDE_FROM_ABI auto
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 auto
 swap(variant<_Types...>& __lhs, variant<_Types...>& __rhs) noexcept(noexcept(__lhs.swap(__rhs)))
     -> decltype(__lhs.swap(__rhs)) {
   return __lhs.swap(__rhs);
diff --git a/libcxx/include/version b/libcxx/include/version
index 0ed77345baa7..ba116957b033 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -79,6 +79,10 @@ __cpp_lib_constexpr_utility                             201811L <utility>
 __cpp_lib_constexpr_vector                              201907L <vector>
 __cpp_lib_constrained_equality                          202403L <optional> <tuple> <utility>
                                                                 <variant>
+__cpp_lib_containers_ranges                             202202L <deque> <forward_list> <list>
+                                                                <map> <queue> <set>
+                                                                <stack> <string> <unordered_map>
+                                                                <unordered_set> <vector>
 __cpp_lib_copyable_function                             202306L <functional>
 __cpp_lib_coroutine                                     201902L <coroutine>
 __cpp_lib_debugging                                     202311L <debugging>
@@ -184,10 +188,7 @@ __cpp_lib_ranges_join_with                              202202L <ranges>
 __cpp_lib_ranges_repeat                                 202207L <ranges>
 __cpp_lib_ranges_slide                                  202202L <ranges>
 __cpp_lib_ranges_starts_ends_with                       202106L <algorithm>
-__cpp_lib_ranges_to_container                           202202L <deque> <forward_list> <list>
-                                                                <map> <queue> <ranges>
-                                                                <set> <stack> <string>
-                                                                <unordered_map> <unordered_set> <vector>
+__cpp_lib_ranges_to_container                           202202L <ranges>
 __cpp_lib_ranges_zip                                    202110L <ranges> <tuple> <utility>
 __cpp_lib_ratio                                         202306L <ratio>
 __cpp_lib_raw_memory_algorithms                         201606L <memory>
@@ -456,6 +457,7 @@ __cpp_lib_within_lifetime                               202306L <type_traits>
 # undef  __cpp_lib_constexpr_memory
 # define __cpp_lib_constexpr_memory                     202202L
 # define __cpp_lib_constexpr_typeinfo                   202106L
+# define __cpp_lib_containers_ranges                    202202L
 # define __cpp_lib_expected                             202211L
 // # define __cpp_lib_format_path                          202403L
 # define __cpp_lib_format_ranges                        202207L
@@ -524,7 +526,7 @@ __cpp_lib_within_lifetime                               202306L <type_traits>
 // # define __cpp_lib_ranges_concat                        202403L
 # define __cpp_lib_ratio                                202306L
 // # define __cpp_lib_rcu                                  202306L
-// # define __cpp_lib_reference_wrapper                    202403L
+# define __cpp_lib_reference_wrapper                    202403L
 # define __cpp_lib_saturation_arithmetic                202311L
 // # define __cpp_lib_smart_ptr_owner_equality             202306L
 # define __cpp_lib_span_at                              202311L
diff --git a/libcxx/modules/std.compat/cctype.inc b/libcxx/modules/std.compat/cctype.inc
index 56fb45a374a5..5cde12ddb38d 100644
--- a/libcxx/modules/std.compat/cctype.inc
+++ b/libcxx/modules/std.compat/cctype.inc
@@ -8,18 +8,18 @@
 //===----------------------------------------------------------------------===//
 
 export {
-  using ::isalnum;
-  using ::isalpha;
-  using ::isblank;
-  using ::iscntrl;
-  using ::isdigit;
-  using ::isgraph;
-  using ::islower;
-  using ::isprint;
-  using ::ispunct;
-  using ::isspace;
-  using ::isupper;
-  using ::isxdigit;
-  using ::tolower;
-  using ::toupper;
+  using ::isalnum _LIBCPP_USING_IF_EXISTS;
+  using ::isalpha _LIBCPP_USING_IF_EXISTS;
+  using ::isblank _LIBCPP_USING_IF_EXISTS;
+  using ::iscntrl _LIBCPP_USING_IF_EXISTS;
+  using ::isdigit _LIBCPP_USING_IF_EXISTS;
+  using ::isgraph _LIBCPP_USING_IF_EXISTS;
+  using ::islower _LIBCPP_USING_IF_EXISTS;
+  using ::isprint _LIBCPP_USING_IF_EXISTS;
+  using ::ispunct _LIBCPP_USING_IF_EXISTS;
+  using ::isspace _LIBCPP_USING_IF_EXISTS;
+  using ::isupper _LIBCPP_USING_IF_EXISTS;
+  using ::isxdigit _LIBCPP_USING_IF_EXISTS;
+  using ::tolower _LIBCPP_USING_IF_EXISTS;
+  using ::toupper _LIBCPP_USING_IF_EXISTS;
 } // export
diff --git a/libcxx/modules/std.compat/cfenv.inc b/libcxx/modules/std.compat/cfenv.inc
index 50128463d6a9..5a373f646971 100644
--- a/libcxx/modules/std.compat/cfenv.inc
+++ b/libcxx/modules/std.compat/cfenv.inc
@@ -9,21 +9,21 @@
 
 export {
   // types
-  using ::fenv_t;
-  using ::fexcept_t;
+  using ::fenv_t _LIBCPP_USING_IF_EXISTS;
+  using ::fexcept_t _LIBCPP_USING_IF_EXISTS;
 
   // functions
-  using ::feclearexcept;
-  using ::fegetexceptflag;
-  using ::feraiseexcept;
-  using ::fesetexceptflag;
-  using ::fetestexcept;
+  using ::feclearexcept _LIBCPP_USING_IF_EXISTS;
+  using ::fegetexceptflag _LIBCPP_USING_IF_EXISTS;
+  using ::feraiseexcept _LIBCPP_USING_IF_EXISTS;
+  using ::fesetexceptflag _LIBCPP_USING_IF_EXISTS;
+  using ::fetestexcept _LIBCPP_USING_IF_EXISTS;
 
-  using ::fegetround;
-  using ::fesetround;
+  using ::fegetround _LIBCPP_USING_IF_EXISTS;
+  using ::fesetround _LIBCPP_USING_IF_EXISTS;
 
-  using ::fegetenv;
-  using ::feholdexcept;
-  using ::fesetenv;
-  using ::feupdateenv;
+  using ::fegetenv _LIBCPP_USING_IF_EXISTS;
+  using ::feholdexcept _LIBCPP_USING_IF_EXISTS;
+  using ::fesetenv _LIBCPP_USING_IF_EXISTS;
+  using ::feupdateenv _LIBCPP_USING_IF_EXISTS;
 } // export
diff --git a/libcxx/modules/std.compat/cinttypes.inc b/libcxx/modules/std.compat/cinttypes.inc
index a64c088d0d6f..4789ec331020 100644
--- a/libcxx/modules/std.compat/cinttypes.inc
+++ b/libcxx/modules/std.compat/cinttypes.inc
@@ -8,14 +8,14 @@
 //===----------------------------------------------------------------------===//
 
 export {
-  using ::imaxdiv_t;
+  using ::imaxdiv_t _LIBCPP_USING_IF_EXISTS;
 
-  using ::imaxabs;
-  using ::imaxdiv;
-  using ::strtoimax;
-  using ::strtoumax;
-  using ::wcstoimax;
-  using ::wcstoumax;
+  using ::imaxabs _LIBCPP_USING_IF_EXISTS;
+  using ::imaxdiv _LIBCPP_USING_IF_EXISTS;
+  using ::strtoimax _LIBCPP_USING_IF_EXISTS;
+  using ::strtoumax _LIBCPP_USING_IF_EXISTS;
+  using ::wcstoimax _LIBCPP_USING_IF_EXISTS;
+  using ::wcstoumax _LIBCPP_USING_IF_EXISTS;
 
   // abs is conditionally here, but always present in cmath.cppm. To avoid
   // conflicing declarations omit the using here.
diff --git a/libcxx/modules/std.compat/clocale.inc b/libcxx/modules/std.compat/clocale.inc
index d9785a737943..1a975c560a49 100644
--- a/libcxx/modules/std.compat/clocale.inc
+++ b/libcxx/modules/std.compat/clocale.inc
@@ -9,9 +9,9 @@
 
 export {
 #ifndef _LIBCPP_HAS_NO_LOCALIZATION
-  using ::lconv;
+  using ::lconv _LIBCPP_USING_IF_EXISTS;
 
-  using ::localeconv;
-  using ::setlocale;
+  using ::localeconv _LIBCPP_USING_IF_EXISTS;
+  using ::setlocale _LIBCPP_USING_IF_EXISTS;
 #endif // _LIBCPP_HAS_NO_LOCALIZATION
 } // export
diff --git a/libcxx/modules/std.compat/cmath.inc b/libcxx/modules/std.compat/cmath.inc
index de5379275c5f..6c86d0df5740 100644
--- a/libcxx/modules/std.compat/cmath.inc
+++ b/libcxx/modules/std.compat/cmath.inc
@@ -8,241 +8,241 @@
 //===----------------------------------------------------------------------===//
 
 export {
-  using ::double_t;
-  using ::float_t;
+  using ::double_t _LIBCPP_USING_IF_EXISTS;
+  using ::float_t _LIBCPP_USING_IF_EXISTS;
 
-  using ::acos;
-  using ::acosf;
-  using ::acosl;
+  using ::acos _LIBCPP_USING_IF_EXISTS;
+  using ::acosf _LIBCPP_USING_IF_EXISTS;
+  using ::acosl _LIBCPP_USING_IF_EXISTS;
 
-  using ::asin;
-  using ::asinf;
-  using ::asinl;
+  using ::asin _LIBCPP_USING_IF_EXISTS;
+  using ::asinf _LIBCPP_USING_IF_EXISTS;
+  using ::asinl _LIBCPP_USING_IF_EXISTS;
 
-  using ::atan;
-  using ::atanf;
-  using ::atanl;
+  using ::atan _LIBCPP_USING_IF_EXISTS;
+  using ::atanf _LIBCPP_USING_IF_EXISTS;
+  using ::atanl _LIBCPP_USING_IF_EXISTS;
 
-  using ::atan2;
-  using ::atan2f;
-  using ::atan2l;
+  using ::atan2 _LIBCPP_USING_IF_EXISTS;
+  using ::atan2f _LIBCPP_USING_IF_EXISTS;
+  using ::atan2l _LIBCPP_USING_IF_EXISTS;
 
-  using ::cos;
-  using ::cosf;
-  using ::cosl;
+  using ::cos _LIBCPP_USING_IF_EXISTS;
+  using ::cosf _LIBCPP_USING_IF_EXISTS;
+  using ::cosl _LIBCPP_USING_IF_EXISTS;
 
-  using ::sin;
-  using ::sinf;
-  using ::sinl;
+  using ::sin _LIBCPP_USING_IF_EXISTS;
+  using ::sinf _LIBCPP_USING_IF_EXISTS;
+  using ::sinl _LIBCPP_USING_IF_EXISTS;
 
-  using ::tan;
-  using ::tanf;
-  using ::tanl;
+  using ::tan _LIBCPP_USING_IF_EXISTS;
+  using ::tanf _LIBCPP_USING_IF_EXISTS;
+  using ::tanl _LIBCPP_USING_IF_EXISTS;
 
-  using ::acosh;
-  using ::acoshf;
-  using ::acoshl;
+  using ::acosh _LIBCPP_USING_IF_EXISTS;
+  using ::acoshf _LIBCPP_USING_IF_EXISTS;
+  using ::acoshl _LIBCPP_USING_IF_EXISTS;
 
-  using ::asinh;
-  using ::asinhf;
-  using ::asinhl;
+  using ::asinh _LIBCPP_USING_IF_EXISTS;
+  using ::asinhf _LIBCPP_USING_IF_EXISTS;
+  using ::asinhl _LIBCPP_USING_IF_EXISTS;
 
-  using ::atanh;
-  using ::atanhf;
-  using ::atanhl;
+  using ::atanh _LIBCPP_USING_IF_EXISTS;
+  using ::atanhf _LIBCPP_USING_IF_EXISTS;
+  using ::atanhl _LIBCPP_USING_IF_EXISTS;
 
-  using ::cosh;
-  using ::coshf;
-  using ::coshl;
+  using ::cosh _LIBCPP_USING_IF_EXISTS;
+  using ::coshf _LIBCPP_USING_IF_EXISTS;
+  using ::coshl _LIBCPP_USING_IF_EXISTS;
 
-  using ::sinh;
-  using ::sinhf;
-  using ::sinhl;
+  using ::sinh _LIBCPP_USING_IF_EXISTS;
+  using ::sinhf _LIBCPP_USING_IF_EXISTS;
+  using ::sinhl _LIBCPP_USING_IF_EXISTS;
 
-  using ::tanh;
-  using ::tanhf;
-  using ::tanhl;
+  using ::tanh _LIBCPP_USING_IF_EXISTS;
+  using ::tanhf _LIBCPP_USING_IF_EXISTS;
+  using ::tanhl _LIBCPP_USING_IF_EXISTS;
 
-  using ::exp;
-  using ::expf;
-  using ::expl;
+  using ::exp _LIBCPP_USING_IF_EXISTS;
+  using ::expf _LIBCPP_USING_IF_EXISTS;
+  using ::expl _LIBCPP_USING_IF_EXISTS;
 
-  using ::exp2;
-  using ::exp2f;
-  using ::exp2l;
+  using ::exp2 _LIBCPP_USING_IF_EXISTS;
+  using ::exp2f _LIBCPP_USING_IF_EXISTS;
+  using ::exp2l _LIBCPP_USING_IF_EXISTS;
 
-  using ::expm1;
-  using ::expm1f;
-  using ::expm1l;
+  using ::expm1 _LIBCPP_USING_IF_EXISTS;
+  using ::expm1f _LIBCPP_USING_IF_EXISTS;
+  using ::expm1l _LIBCPP_USING_IF_EXISTS;
 
-  using ::frexp;
-  using ::frexpf;
-  using ::frexpl;
+  using ::frexp _LIBCPP_USING_IF_EXISTS;
+  using ::frexpf _LIBCPP_USING_IF_EXISTS;
+  using ::frexpl _LIBCPP_USING_IF_EXISTS;
 
-  using ::ilogb;
-  using ::ilogbf;
-  using ::ilogbl;
+  using ::ilogb _LIBCPP_USING_IF_EXISTS;
+  using ::ilogbf _LIBCPP_USING_IF_EXISTS;
+  using ::ilogbl _LIBCPP_USING_IF_EXISTS;
 
-  using ::ldexp;
-  using ::ldexpf;
-  using ::ldexpl;
+  using ::ldexp _LIBCPP_USING_IF_EXISTS;
+  using ::ldexpf _LIBCPP_USING_IF_EXISTS;
+  using ::ldexpl _LIBCPP_USING_IF_EXISTS;
 
-  using ::log;
-  using ::logf;
-  using ::logl;
+  using ::log _LIBCPP_USING_IF_EXISTS;
+  using ::logf _LIBCPP_USING_IF_EXISTS;
+  using ::logl _LIBCPP_USING_IF_EXISTS;
 
-  using ::log10;
-  using ::log10f;
-  using ::log10l;
+  using ::log10 _LIBCPP_USING_IF_EXISTS;
+  using ::log10f _LIBCPP_USING_IF_EXISTS;
+  using ::log10l _LIBCPP_USING_IF_EXISTS;
 
-  using ::log1p;
-  using ::log1pf;
-  using ::log1pl;
+  using ::log1p _LIBCPP_USING_IF_EXISTS;
+  using ::log1pf _LIBCPP_USING_IF_EXISTS;
+  using ::log1pl _LIBCPP_USING_IF_EXISTS;
 
-  using ::log2;
-  using ::log2f;
-  using ::log2l;
+  using ::log2 _LIBCPP_USING_IF_EXISTS;
+  using ::log2f _LIBCPP_USING_IF_EXISTS;
+  using ::log2l _LIBCPP_USING_IF_EXISTS;
 
-  using ::logb;
-  using ::logbf;
-  using ::logbl;
+  using ::logb _LIBCPP_USING_IF_EXISTS;
+  using ::logbf _LIBCPP_USING_IF_EXISTS;
+  using ::logbl _LIBCPP_USING_IF_EXISTS;
 
-  using ::modf;
-  using ::modff;
-  using ::modfl;
+  using ::modf _LIBCPP_USING_IF_EXISTS;
+  using ::modff _LIBCPP_USING_IF_EXISTS;
+  using ::modfl _LIBCPP_USING_IF_EXISTS;
 
-  using ::scalbn;
-  using ::scalbnf;
-  using ::scalbnl;
+  using ::scalbn _LIBCPP_USING_IF_EXISTS;
+  using ::scalbnf _LIBCPP_USING_IF_EXISTS;
+  using ::scalbnl _LIBCPP_USING_IF_EXISTS;
 
-  using ::scalbln;
-  using ::scalblnf;
-  using ::scalblnl;
+  using ::scalbln _LIBCPP_USING_IF_EXISTS;
+  using ::scalblnf _LIBCPP_USING_IF_EXISTS;
+  using ::scalblnl _LIBCPP_USING_IF_EXISTS;
 
-  using ::cbrt;
-  using ::cbrtf;
-  using ::cbrtl;
+  using ::cbrt _LIBCPP_USING_IF_EXISTS;
+  using ::cbrtf _LIBCPP_USING_IF_EXISTS;
+  using ::cbrtl _LIBCPP_USING_IF_EXISTS;
 
   // [c.math.abs], absolute values
-  using ::abs;
+  using ::abs _LIBCPP_USING_IF_EXISTS;
 
-  using ::fabs;
-  using ::fabsf;
-  using ::fabsl;
+  using ::fabs _LIBCPP_USING_IF_EXISTS;
+  using ::fabsf _LIBCPP_USING_IF_EXISTS;
+  using ::fabsl _LIBCPP_USING_IF_EXISTS;
 
-  using ::hypot;
-  using ::hypotf;
-  using ::hypotl;
+  using ::hypot _LIBCPP_USING_IF_EXISTS;
+  using ::hypotf _LIBCPP_USING_IF_EXISTS;
+  using ::hypotl _LIBCPP_USING_IF_EXISTS;
 
   // [c.math.hypot3], three-dimensional hypotenuse
 
-  using ::pow;
-  using ::powf;
-  using ::powl;
+  using ::pow _LIBCPP_USING_IF_EXISTS;
+  using ::powf _LIBCPP_USING_IF_EXISTS;
+  using ::powl _LIBCPP_USING_IF_EXISTS;
 
-  using ::sqrt;
-  using ::sqrtf;
-  using ::sqrtl;
+  using ::sqrt _LIBCPP_USING_IF_EXISTS;
+  using ::sqrtf _LIBCPP_USING_IF_EXISTS;
+  using ::sqrtl _LIBCPP_USING_IF_EXISTS;
 
-  using ::erf;
-  using ::erff;
-  using ::erfl;
+  using ::erf _LIBCPP_USING_IF_EXISTS;
+  using ::erff _LIBCPP_USING_IF_EXISTS;
+  using ::erfl _LIBCPP_USING_IF_EXISTS;
 
-  using ::erfc;
-  using ::erfcf;
-  using ::erfcl;
+  using ::erfc _LIBCPP_USING_IF_EXISTS;
+  using ::erfcf _LIBCPP_USING_IF_EXISTS;
+  using ::erfcl _LIBCPP_USING_IF_EXISTS;
 
-  using ::lgamma;
-  using ::lgammaf;
-  using ::lgammal;
+  using ::lgamma _LIBCPP_USING_IF_EXISTS;
+  using ::lgammaf _LIBCPP_USING_IF_EXISTS;
+  using ::lgammal _LIBCPP_USING_IF_EXISTS;
 
-  using ::tgamma;
-  using ::tgammaf;
-  using ::tgammal;
+  using ::tgamma _LIBCPP_USING_IF_EXISTS;
+  using ::tgammaf _LIBCPP_USING_IF_EXISTS;
+  using ::tgammal _LIBCPP_USING_IF_EXISTS;
 
-  using ::ceil;
-  using ::ceilf;
-  using ::ceill;
+  using ::ceil _LIBCPP_USING_IF_EXISTS;
+  using ::ceilf _LIBCPP_USING_IF_EXISTS;
+  using ::ceill _LIBCPP_USING_IF_EXISTS;
 
-  using ::floor;
-  using ::floorf;
-  using ::floorl;
+  using ::floor _LIBCPP_USING_IF_EXISTS;
+  using ::floorf _LIBCPP_USING_IF_EXISTS;
+  using ::floorl _LIBCPP_USING_IF_EXISTS;
 
-  using ::nearbyint;
-  using ::nearbyintf;
-  using ::nearbyintl;
+  using ::nearbyint _LIBCPP_USING_IF_EXISTS;
+  using ::nearbyintf _LIBCPP_USING_IF_EXISTS;
+  using ::nearbyintl _LIBCPP_USING_IF_EXISTS;
 
-  using ::rint;
-  using ::rintf;
-  using ::rintl;
+  using ::rint _LIBCPP_USING_IF_EXISTS;
+  using ::rintf _LIBCPP_USING_IF_EXISTS;
+  using ::rintl _LIBCPP_USING_IF_EXISTS;
 
-  using ::lrint;
-  using ::lrintf;
-  using ::lrintl;
+  using ::lrint _LIBCPP_USING_IF_EXISTS;
+  using ::lrintf _LIBCPP_USING_IF_EXISTS;
+  using ::lrintl _LIBCPP_USING_IF_EXISTS;
 
-  using ::llrint;
-  using ::llrintf;
-  using ::llrintl;
+  using ::llrint _LIBCPP_USING_IF_EXISTS;
+  using ::llrintf _LIBCPP_USING_IF_EXISTS;
+  using ::llrintl _LIBCPP_USING_IF_EXISTS;
 
-  using ::round;
-  using ::roundf;
-  using ::roundl;
+  using ::round _LIBCPP_USING_IF_EXISTS;
+  using ::roundf _LIBCPP_USING_IF_EXISTS;
+  using ::roundl _LIBCPP_USING_IF_EXISTS;
 
-  using ::lround;
-  using ::lroundf;
-  using ::lroundl;
+  using ::lround _LIBCPP_USING_IF_EXISTS;
+  using ::lroundf _LIBCPP_USING_IF_EXISTS;
+  using ::lroundl _LIBCPP_USING_IF_EXISTS;
 
-  using ::llround;
-  using ::llroundf;
-  using ::llroundl;
+  using ::llround _LIBCPP_USING_IF_EXISTS;
+  using ::llroundf _LIBCPP_USING_IF_EXISTS;
+  using ::llroundl _LIBCPP_USING_IF_EXISTS;
 
-  using ::trunc;
-  using ::truncf;
-  using ::truncl;
+  using ::trunc _LIBCPP_USING_IF_EXISTS;
+  using ::truncf _LIBCPP_USING_IF_EXISTS;
+  using ::truncl _LIBCPP_USING_IF_EXISTS;
 
-  using ::fmod;
-  using ::fmodf;
-  using ::fmodl;
+  using ::fmod _LIBCPP_USING_IF_EXISTS;
+  using ::fmodf _LIBCPP_USING_IF_EXISTS;
+  using ::fmodl _LIBCPP_USING_IF_EXISTS;
 
-  using ::remainder;
-  using ::remainderf;
-  using ::remainderl;
+  using ::remainder _LIBCPP_USING_IF_EXISTS;
+  using ::remainderf _LIBCPP_USING_IF_EXISTS;
+  using ::remainderl _LIBCPP_USING_IF_EXISTS;
 
-  using ::remquo;
-  using ::remquof;
-  using ::remquol;
+  using ::remquo _LIBCPP_USING_IF_EXISTS;
+  using ::remquof _LIBCPP_USING_IF_EXISTS;
+  using ::remquol _LIBCPP_USING_IF_EXISTS;
 
-  using ::copysign;
-  using ::copysignf;
-  using ::copysignl;
+  using ::copysign _LIBCPP_USING_IF_EXISTS;
+  using ::copysignf _LIBCPP_USING_IF_EXISTS;
+  using ::copysignl _LIBCPP_USING_IF_EXISTS;
 
-  using ::nan;
-  using ::nanf;
-  using ::nanl;
+  using ::nan _LIBCPP_USING_IF_EXISTS;
+  using ::nanf _LIBCPP_USING_IF_EXISTS;
+  using ::nanl _LIBCPP_USING_IF_EXISTS;
 
-  using ::nextafter;
-  using ::nextafterf;
-  using ::nextafterl;
+  using ::nextafter _LIBCPP_USING_IF_EXISTS;
+  using ::nextafterf _LIBCPP_USING_IF_EXISTS;
+  using ::nextafterl _LIBCPP_USING_IF_EXISTS;
 
-  using ::nexttoward;
-  using ::nexttowardf;
-  using ::nexttowardl;
+  using ::nexttoward _LIBCPP_USING_IF_EXISTS;
+  using ::nexttowardf _LIBCPP_USING_IF_EXISTS;
+  using ::nexttowardl _LIBCPP_USING_IF_EXISTS;
 
-  using ::fdim;
-  using ::fdimf;
-  using ::fdiml;
+  using ::fdim _LIBCPP_USING_IF_EXISTS;
+  using ::fdimf _LIBCPP_USING_IF_EXISTS;
+  using ::fdiml _LIBCPP_USING_IF_EXISTS;
 
-  using ::fmax;
-  using ::fmaxf;
-  using ::fmaxl;
+  using ::fmax _LIBCPP_USING_IF_EXISTS;
+  using ::fmaxf _LIBCPP_USING_IF_EXISTS;
+  using ::fmaxl _LIBCPP_USING_IF_EXISTS;
 
-  using ::fmin;
-  using ::fminf;
-  using ::fminl;
+  using ::fmin _LIBCPP_USING_IF_EXISTS;
+  using ::fminf _LIBCPP_USING_IF_EXISTS;
+  using ::fminl _LIBCPP_USING_IF_EXISTS;
 
-  using ::fma;
-  using ::fmaf;
-  using ::fmal;
+  using ::fma _LIBCPP_USING_IF_EXISTS;
+  using ::fmaf _LIBCPP_USING_IF_EXISTS;
+  using ::fmal _LIBCPP_USING_IF_EXISTS;
 
   // [c.math.lerp], linear interpolation
   // [support.c.headers.other]/1
@@ -251,18 +251,18 @@ export {
   // ...
 
   // [c.math.fpclass], classification / comparison functions
-  using ::fpclassify;
-  using ::isfinite;
-  using ::isgreater;
-  using ::isgreaterequal;
-  using ::isinf;
-  using ::isless;
-  using ::islessequal;
-  using ::islessgreater;
-  using ::isnan;
-  using ::isnormal;
-  using ::isunordered;
-  using ::signbit;
+  using ::fpclassify _LIBCPP_USING_IF_EXISTS;
+  using ::isfinite _LIBCPP_USING_IF_EXISTS;
+  using ::isgreater _LIBCPP_USING_IF_EXISTS;
+  using ::isgreaterequal _LIBCPP_USING_IF_EXISTS;
+  using ::isinf _LIBCPP_USING_IF_EXISTS;
+  using ::isless _LIBCPP_USING_IF_EXISTS;
+  using ::islessequal _LIBCPP_USING_IF_EXISTS;
+  using ::islessgreater _LIBCPP_USING_IF_EXISTS;
+  using ::isnan _LIBCPP_USING_IF_EXISTS;
+  using ::isnormal _LIBCPP_USING_IF_EXISTS;
+  using ::isunordered _LIBCPP_USING_IF_EXISTS;
+  using ::signbit _LIBCPP_USING_IF_EXISTS;
 
   // [sf.cmath], mathematical special functions
 } // export
diff --git a/libcxx/modules/std.compat/csetjmp.inc b/libcxx/modules/std.compat/csetjmp.inc
index 1fc42ea3ee03..53e1421a2fbd 100644
--- a/libcxx/modules/std.compat/csetjmp.inc
+++ b/libcxx/modules/std.compat/csetjmp.inc
@@ -8,6 +8,6 @@
 //===----------------------------------------------------------------------===//
 
 export {
-  using ::jmp_buf;
-  using ::longjmp;
+  using ::jmp_buf _LIBCPP_USING_IF_EXISTS;
+  using ::longjmp _LIBCPP_USING_IF_EXISTS;
 } // export
diff --git a/libcxx/modules/std.compat/csignal.inc b/libcxx/modules/std.compat/csignal.inc
index 33af6a9f2b73..bf72459df165 100644
--- a/libcxx/modules/std.compat/csignal.inc
+++ b/libcxx/modules/std.compat/csignal.inc
@@ -8,10 +8,10 @@
 //===----------------------------------------------------------------------===//
 
 export {
-  using ::sig_atomic_t;
+  using ::sig_atomic_t _LIBCPP_USING_IF_EXISTS;
 
   // [support.signal], signal handlers
-  using ::signal;
+  using ::signal _LIBCPP_USING_IF_EXISTS;
 
-  using ::raise;
+  using ::raise _LIBCPP_USING_IF_EXISTS;
 } // export
diff --git a/libcxx/modules/std.compat/cstdarg.inc b/libcxx/modules/std.compat/cstdarg.inc
index 3efb34617a8b..79b5df4ec99c 100644
--- a/libcxx/modules/std.compat/cstdarg.inc
+++ b/libcxx/modules/std.compat/cstdarg.inc
@@ -7,4 +7,4 @@
 //
 //===----------------------------------------------------------------------===//
 
-export { using ::va_list; } // export
+export { using ::va_list _LIBCPP_USING_IF_EXISTS; } // export
diff --git a/libcxx/modules/std.compat/cstddef.inc b/libcxx/modules/std.compat/cstddef.inc
index 94ad036fd8f4..8704a31e01f3 100644
--- a/libcxx/modules/std.compat/cstddef.inc
+++ b/libcxx/modules/std.compat/cstddef.inc
@@ -8,10 +8,10 @@
 //===----------------------------------------------------------------------===//
 
 export {
-  using ::max_align_t;
+  using ::max_align_t _LIBCPP_USING_IF_EXISTS;
   using ::nullptr_t;
-  using ::ptrdiff_t;
-  using ::size_t;
+  using ::ptrdiff_t _LIBCPP_USING_IF_EXISTS;
+  using ::size_t _LIBCPP_USING_IF_EXISTS;
 
   // [support.c.headers]/1
   // ...  placed within the global namespace scope, except for ... the
diff --git a/libcxx/modules/std.compat/cstdint.inc b/libcxx/modules/std.compat/cstdint.inc
index 1a74efc70cea..a8dd6898cb26 100644
--- a/libcxx/modules/std.compat/cstdint.inc
+++ b/libcxx/modules/std.compat/cstdint.inc
@@ -14,17 +14,17 @@ export {
   using ::int32_t _LIBCPP_USING_IF_EXISTS;
   using ::int64_t _LIBCPP_USING_IF_EXISTS;
 
-  using ::int_fast16_t;
-  using ::int_fast32_t;
-  using ::int_fast64_t;
-  using ::int_fast8_t;
+  using ::int_fast16_t _LIBCPP_USING_IF_EXISTS;
+  using ::int_fast32_t _LIBCPP_USING_IF_EXISTS;
+  using ::int_fast64_t _LIBCPP_USING_IF_EXISTS;
+  using ::int_fast8_t _LIBCPP_USING_IF_EXISTS;
 
-  using ::int_least16_t;
-  using ::int_least32_t;
-  using ::int_least64_t;
-  using ::int_least8_t;
+  using ::int_least16_t _LIBCPP_USING_IF_EXISTS;
+  using ::int_least32_t _LIBCPP_USING_IF_EXISTS;
+  using ::int_least64_t _LIBCPP_USING_IF_EXISTS;
+  using ::int_least8_t _LIBCPP_USING_IF_EXISTS;
 
-  using ::intmax_t;
+  using ::intmax_t _LIBCPP_USING_IF_EXISTS;
 
   using ::intptr_t _LIBCPP_USING_IF_EXISTS;
 
@@ -34,17 +34,17 @@ export {
   using ::uint32_t _LIBCPP_USING_IF_EXISTS;
   using ::uint64_t _LIBCPP_USING_IF_EXISTS;
 
-  using ::uint_fast16_t;
-  using ::uint_fast32_t;
-  using ::uint_fast64_t;
-  using ::uint_fast8_t;
+  using ::uint_fast16_t _LIBCPP_USING_IF_EXISTS;
+  using ::uint_fast32_t _LIBCPP_USING_IF_EXISTS;
+  using ::uint_fast64_t _LIBCPP_USING_IF_EXISTS;
+  using ::uint_fast8_t _LIBCPP_USING_IF_EXISTS;
 
-  using ::uint_least16_t;
-  using ::uint_least32_t;
-  using ::uint_least64_t;
-  using ::uint_least8_t;
+  using ::uint_least16_t _LIBCPP_USING_IF_EXISTS;
+  using ::uint_least32_t _LIBCPP_USING_IF_EXISTS;
+  using ::uint_least64_t _LIBCPP_USING_IF_EXISTS;
+  using ::uint_least8_t _LIBCPP_USING_IF_EXISTS;
 
-  using ::uintmax_t;
+  using ::uintmax_t _LIBCPP_USING_IF_EXISTS;
 
   using ::uintptr_t _LIBCPP_USING_IF_EXISTS;
 } // export
diff --git a/libcxx/modules/std.compat/cstdio.inc b/libcxx/modules/std.compat/cstdio.inc
index 1ec3015c9e2a..33dd2cbfb0d7 100644
--- a/libcxx/modules/std.compat/cstdio.inc
+++ b/libcxx/modules/std.compat/cstdio.inc
@@ -8,54 +8,54 @@
 //===----------------------------------------------------------------------===//
 
 export {
-  using ::FILE;
-  using ::fpos_t;
-  using ::size_t;
+  using ::FILE _LIBCPP_USING_IF_EXISTS;
+  using ::fpos_t _LIBCPP_USING_IF_EXISTS;
+  using ::size_t _LIBCPP_USING_IF_EXISTS;
 
-  using ::clearerr;
-  using ::fclose;
-  using ::feof;
-  using ::ferror;
-  using ::fflush;
-  using ::fgetc;
-  using ::fgetpos;
-  using ::fgets;
-  using ::fopen;
-  using ::fprintf;
-  using ::fputc;
-  using ::fputs;
-  using ::fread;
-  using ::freopen;
-  using ::fscanf;
-  using ::fseek;
-  using ::fsetpos;
-  using ::ftell;
-  using ::fwrite;
-  using ::getc;
-  using ::getchar;
-  using ::perror;
-  using ::printf;
-  using ::putc;
-  using ::putchar;
-  using ::puts;
-  using ::remove;
-  using ::rename;
-  using ::rewind;
-  using ::scanf;
-  using ::setbuf;
-  using ::setvbuf;
-  using ::snprintf;
-  using ::sprintf;
-  using ::sscanf;
-  using ::tmpfile;
-  using ::tmpnam;
-  using ::ungetc;
-  using ::vfprintf;
-  using ::vfscanf;
-  using ::vprintf;
-  using ::vscanf;
-  using ::vsnprintf;
-  using ::vsprintf;
-  using ::vsscanf;
+  using ::clearerr _LIBCPP_USING_IF_EXISTS;
+  using ::fclose _LIBCPP_USING_IF_EXISTS;
+  using ::feof _LIBCPP_USING_IF_EXISTS;
+  using ::ferror _LIBCPP_USING_IF_EXISTS;
+  using ::fflush _LIBCPP_USING_IF_EXISTS;
+  using ::fgetc _LIBCPP_USING_IF_EXISTS;
+  using ::fgetpos _LIBCPP_USING_IF_EXISTS;
+  using ::fgets _LIBCPP_USING_IF_EXISTS;
+  using ::fopen _LIBCPP_USING_IF_EXISTS;
+  using ::fprintf _LIBCPP_USING_IF_EXISTS;
+  using ::fputc _LIBCPP_USING_IF_EXISTS;
+  using ::fputs _LIBCPP_USING_IF_EXISTS;
+  using ::fread _LIBCPP_USING_IF_EXISTS;
+  using ::freopen _LIBCPP_USING_IF_EXISTS;
+  using ::fscanf _LIBCPP_USING_IF_EXISTS;
+  using ::fseek _LIBCPP_USING_IF_EXISTS;
+  using ::fsetpos _LIBCPP_USING_IF_EXISTS;
+  using ::ftell _LIBCPP_USING_IF_EXISTS;
+  using ::fwrite _LIBCPP_USING_IF_EXISTS;
+  using ::getc _LIBCPP_USING_IF_EXISTS;
+  using ::getchar _LIBCPP_USING_IF_EXISTS;
+  using ::perror _LIBCPP_USING_IF_EXISTS;
+  using ::printf _LIBCPP_USING_IF_EXISTS;
+  using ::putc _LIBCPP_USING_IF_EXISTS;
+  using ::putchar _LIBCPP_USING_IF_EXISTS;
+  using ::puts _LIBCPP_USING_IF_EXISTS;
+  using ::remove _LIBCPP_USING_IF_EXISTS;
+  using ::rename _LIBCPP_USING_IF_EXISTS;
+  using ::rewind _LIBCPP_USING_IF_EXISTS;
+  using ::scanf _LIBCPP_USING_IF_EXISTS;
+  using ::setbuf _LIBCPP_USING_IF_EXISTS;
+  using ::setvbuf _LIBCPP_USING_IF_EXISTS;
+  using ::snprintf _LIBCPP_USING_IF_EXISTS;
+  using ::sprintf _LIBCPP_USING_IF_EXISTS;
+  using ::sscanf _LIBCPP_USING_IF_EXISTS;
+  using ::tmpfile _LIBCPP_USING_IF_EXISTS;
+  using ::tmpnam _LIBCPP_USING_IF_EXISTS;
+  using ::ungetc _LIBCPP_USING_IF_EXISTS;
+  using ::vfprintf _LIBCPP_USING_IF_EXISTS;
+  using ::vfscanf _LIBCPP_USING_IF_EXISTS;
+  using ::vprintf _LIBCPP_USING_IF_EXISTS;
+  using ::vscanf _LIBCPP_USING_IF_EXISTS;
+  using ::vsnprintf _LIBCPP_USING_IF_EXISTS;
+  using ::vsprintf _LIBCPP_USING_IF_EXISTS;
+  using ::vsscanf _LIBCPP_USING_IF_EXISTS;
 
 } // export
diff --git a/libcxx/modules/std.compat/cstdlib.inc b/libcxx/modules/std.compat/cstdlib.inc
index 4783cbf51623..94f5e7e8d7f4 100644
--- a/libcxx/modules/std.compat/cstdlib.inc
+++ b/libcxx/modules/std.compat/cstdlib.inc
@@ -8,65 +8,65 @@
 //===----------------------------------------------------------------------===//
 
 export {
-  using ::div_t;
-  using ::ldiv_t;
-  using ::lldiv_t;
-  using ::size_t;
+  using ::div_t _LIBCPP_USING_IF_EXISTS;
+  using ::ldiv_t _LIBCPP_USING_IF_EXISTS;
+  using ::lldiv_t _LIBCPP_USING_IF_EXISTS;
+  using ::size_t _LIBCPP_USING_IF_EXISTS;
 
   // [support.start.term], start and termination
-  using ::_Exit;
-  using ::abort;
+  using ::_Exit _LIBCPP_USING_IF_EXISTS;
+  using ::abort _LIBCPP_USING_IF_EXISTS;
   using ::at_quick_exit _LIBCPP_USING_IF_EXISTS;
-  using ::atexit;
-  using ::exit;
+  using ::atexit _LIBCPP_USING_IF_EXISTS;
+  using ::exit _LIBCPP_USING_IF_EXISTS;
   using ::quick_exit _LIBCPP_USING_IF_EXISTS;
 
-  using ::getenv;
-  using ::system;
+  using ::getenv _LIBCPP_USING_IF_EXISTS;
+  using ::system _LIBCPP_USING_IF_EXISTS;
 
   // [c.malloc], C library memory allocation
   using ::aligned_alloc _LIBCPP_USING_IF_EXISTS;
-  using ::calloc;
-  using ::free;
-  using ::malloc;
-  using ::realloc;
+  using ::calloc _LIBCPP_USING_IF_EXISTS;
+  using ::free _LIBCPP_USING_IF_EXISTS;
+  using ::malloc _LIBCPP_USING_IF_EXISTS;
+  using ::realloc _LIBCPP_USING_IF_EXISTS;
 
-  using ::atof;
-  using ::atoi;
-  using ::atol;
-  using ::atoll;
-  using ::strtod;
-  using ::strtof;
-  using ::strtol;
-  using ::strtold;
-  using ::strtoll;
-  using ::strtoul;
-  using ::strtoull;
+  using ::atof _LIBCPP_USING_IF_EXISTS;
+  using ::atoi _LIBCPP_USING_IF_EXISTS;
+  using ::atol _LIBCPP_USING_IF_EXISTS;
+  using ::atoll _LIBCPP_USING_IF_EXISTS;
+  using ::strtod _LIBCPP_USING_IF_EXISTS;
+  using ::strtof _LIBCPP_USING_IF_EXISTS;
+  using ::strtol _LIBCPP_USING_IF_EXISTS;
+  using ::strtold _LIBCPP_USING_IF_EXISTS;
+  using ::strtoll _LIBCPP_USING_IF_EXISTS;
+  using ::strtoul _LIBCPP_USING_IF_EXISTS;
+  using ::strtoull _LIBCPP_USING_IF_EXISTS;
 
   // [c.mb.wcs], multibyte / wide string and character conversion functions
-  using ::mblen;
+  using ::mblen _LIBCPP_USING_IF_EXISTS;
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
-  using ::mbstowcs;
-  using ::mbtowc;
-  using ::wcstombs;
-  using ::wctomb;
+  using ::mbstowcs _LIBCPP_USING_IF_EXISTS;
+  using ::mbtowc _LIBCPP_USING_IF_EXISTS;
+  using ::wcstombs _LIBCPP_USING_IF_EXISTS;
+  using ::wctomb _LIBCPP_USING_IF_EXISTS;
 #endif
   // [alg.c.library], C standard library algorithms
-  using ::bsearch;
-  using ::qsort;
+  using ::bsearch _LIBCPP_USING_IF_EXISTS;
+  using ::qsort _LIBCPP_USING_IF_EXISTS;
 
   // [c.math.rand], low-quality random number generation
-  using ::rand;
-  using ::srand;
+  using ::rand _LIBCPP_USING_IF_EXISTS;
+  using ::srand _LIBCPP_USING_IF_EXISTS;
 
   // [c.math.abs], absolute values
-  using ::abs;
+  using ::abs _LIBCPP_USING_IF_EXISTS;
 
-  using ::labs;
-  using ::llabs;
+  using ::labs _LIBCPP_USING_IF_EXISTS;
+  using ::llabs _LIBCPP_USING_IF_EXISTS;
 
-  using ::div;
-  using ::ldiv;
-  using ::lldiv;
+  using ::div _LIBCPP_USING_IF_EXISTS;
+  using ::ldiv _LIBCPP_USING_IF_EXISTS;
+  using ::lldiv _LIBCPP_USING_IF_EXISTS;
 
 } // export
diff --git a/libcxx/modules/std.compat/cstring.inc b/libcxx/modules/std.compat/cstring.inc
index 090350ae8147..5029a7674bb2 100644
--- a/libcxx/modules/std.compat/cstring.inc
+++ b/libcxx/modules/std.compat/cstring.inc
@@ -8,29 +8,29 @@
 //===----------------------------------------------------------------------===//
 
 export {
-  using ::size_t;
+  using ::size_t _LIBCPP_USING_IF_EXISTS;
 
-  using ::memchr;
-  using ::memcmp;
-  using ::memcpy;
-  using ::memmove;
-  using ::memset;
-  using ::strcat;
-  using ::strchr;
-  using ::strcmp;
-  using ::strcoll;
-  using ::strcpy;
-  using ::strcspn;
-  using ::strerror;
-  using ::strlen;
-  using ::strncat;
-  using ::strncmp;
-  using ::strncpy;
-  using ::strpbrk;
-  using ::strrchr;
-  using ::strspn;
-  using ::strstr;
-  using ::strtok;
-  using ::strxfrm;
+  using ::memchr _LIBCPP_USING_IF_EXISTS;
+  using ::memcmp _LIBCPP_USING_IF_EXISTS;
+  using ::memcpy _LIBCPP_USING_IF_EXISTS;
+  using ::memmove _LIBCPP_USING_IF_EXISTS;
+  using ::memset _LIBCPP_USING_IF_EXISTS;
+  using ::strcat _LIBCPP_USING_IF_EXISTS;
+  using ::strchr _LIBCPP_USING_IF_EXISTS;
+  using ::strcmp _LIBCPP_USING_IF_EXISTS;
+  using ::strcoll _LIBCPP_USING_IF_EXISTS;
+  using ::strcpy _LIBCPP_USING_IF_EXISTS;
+  using ::strcspn _LIBCPP_USING_IF_EXISTS;
+  using ::strerror _LIBCPP_USING_IF_EXISTS;
+  using ::strlen _LIBCPP_USING_IF_EXISTS;
+  using ::strncat _LIBCPP_USING_IF_EXISTS;
+  using ::strncmp _LIBCPP_USING_IF_EXISTS;
+  using ::strncpy _LIBCPP_USING_IF_EXISTS;
+  using ::strpbrk _LIBCPP_USING_IF_EXISTS;
+  using ::strrchr _LIBCPP_USING_IF_EXISTS;
+  using ::strspn _LIBCPP_USING_IF_EXISTS;
+  using ::strstr _LIBCPP_USING_IF_EXISTS;
+  using ::strtok _LIBCPP_USING_IF_EXISTS;
+  using ::strxfrm _LIBCPP_USING_IF_EXISTS;
 
 } // export
diff --git a/libcxx/modules/std.compat/ctime.inc b/libcxx/modules/std.compat/ctime.inc
index 6e621f494348..eba8234a0896 100644
--- a/libcxx/modules/std.compat/ctime.inc
+++ b/libcxx/modules/std.compat/ctime.inc
@@ -8,21 +8,21 @@
 //===----------------------------------------------------------------------===//
 
 export {
-  using ::clock_t;
-  using ::size_t;
-  using ::time_t;
+  using ::clock_t _LIBCPP_USING_IF_EXISTS;
+  using ::size_t _LIBCPP_USING_IF_EXISTS;
+  using ::time_t _LIBCPP_USING_IF_EXISTS;
 
-  using ::timespec;
-  using ::tm;
+  using ::timespec _LIBCPP_USING_IF_EXISTS;
+  using ::tm _LIBCPP_USING_IF_EXISTS;
 
-  using ::asctime;
-  using ::clock;
-  using ::ctime;
-  using ::difftime;
-  using ::gmtime;
-  using ::localtime;
-  using ::mktime;
-  using ::strftime;
-  using ::time;
+  using ::asctime _LIBCPP_USING_IF_EXISTS;
+  using ::clock _LIBCPP_USING_IF_EXISTS;
+  using ::ctime _LIBCPP_USING_IF_EXISTS;
+  using ::difftime _LIBCPP_USING_IF_EXISTS;
+  using ::gmtime _LIBCPP_USING_IF_EXISTS;
+  using ::localtime _LIBCPP_USING_IF_EXISTS;
+  using ::mktime _LIBCPP_USING_IF_EXISTS;
+  using ::strftime _LIBCPP_USING_IF_EXISTS;
+  using ::time _LIBCPP_USING_IF_EXISTS;
   using ::timespec_get _LIBCPP_USING_IF_EXISTS;
 } // export
diff --git a/libcxx/modules/std.compat/cwchar.inc b/libcxx/modules/std.compat/cwchar.inc
index 8905aecbdfec..4cad9281fb49 100644
--- a/libcxx/modules/std.compat/cwchar.inc
+++ b/libcxx/modules/std.compat/cwchar.inc
@@ -9,72 +9,72 @@
 
 export {
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
-  using ::mbstate_t;
-  using ::size_t;
-  using ::wint_t;
+  using ::mbstate_t _LIBCPP_USING_IF_EXISTS;
+  using ::size_t _LIBCPP_USING_IF_EXISTS;
+  using ::wint_t _LIBCPP_USING_IF_EXISTS;
 
-  using ::tm;
+  using ::tm _LIBCPP_USING_IF_EXISTS;
 
-  using ::btowc;
-  using ::fgetwc;
-  using ::fgetws;
-  using ::fputwc;
-  using ::fputws;
-  using ::fwide;
-  using ::fwprintf;
-  using ::fwscanf;
-  using ::getwc;
-  using ::getwchar;
-  using ::putwc;
-  using ::putwchar;
-  using ::swprintf;
-  using ::swscanf;
-  using ::ungetwc;
-  using ::vfwprintf;
-  using ::vfwscanf;
-  using ::vswprintf;
-  using ::vswscanf;
-  using ::vwprintf;
-  using ::vwscanf;
-  using ::wcscat;
-  using ::wcschr;
-  using ::wcscmp;
-  using ::wcscoll;
-  using ::wcscpy;
-  using ::wcscspn;
-  using ::wcsftime;
-  using ::wcslen;
-  using ::wcsncat;
-  using ::wcsncmp;
-  using ::wcsncpy;
-  using ::wcspbrk;
-  using ::wcsrchr;
-  using ::wcsspn;
-  using ::wcsstr;
-  using ::wcstod;
-  using ::wcstof;
-  using ::wcstok;
-  using ::wcstol;
-  using ::wcstold;
-  using ::wcstoll;
-  using ::wcstoul;
-  using ::wcstoull;
-  using ::wcsxfrm;
-  using ::wctob;
-  using ::wmemchr;
-  using ::wmemcmp;
-  using ::wmemcpy;
-  using ::wmemmove;
-  using ::wmemset;
-  using ::wprintf;
-  using ::wscanf;
+  using ::btowc _LIBCPP_USING_IF_EXISTS;
+  using ::fgetwc _LIBCPP_USING_IF_EXISTS;
+  using ::fgetws _LIBCPP_USING_IF_EXISTS;
+  using ::fputwc _LIBCPP_USING_IF_EXISTS;
+  using ::fputws _LIBCPP_USING_IF_EXISTS;
+  using ::fwide _LIBCPP_USING_IF_EXISTS;
+  using ::fwprintf _LIBCPP_USING_IF_EXISTS;
+  using ::fwscanf _LIBCPP_USING_IF_EXISTS;
+  using ::getwc _LIBCPP_USING_IF_EXISTS;
+  using ::getwchar _LIBCPP_USING_IF_EXISTS;
+  using ::putwc _LIBCPP_USING_IF_EXISTS;
+  using ::putwchar _LIBCPP_USING_IF_EXISTS;
+  using ::swprintf _LIBCPP_USING_IF_EXISTS;
+  using ::swscanf _LIBCPP_USING_IF_EXISTS;
+  using ::ungetwc _LIBCPP_USING_IF_EXISTS;
+  using ::vfwprintf _LIBCPP_USING_IF_EXISTS;
+  using ::vfwscanf _LIBCPP_USING_IF_EXISTS;
+  using ::vswprintf _LIBCPP_USING_IF_EXISTS;
+  using ::vswscanf _LIBCPP_USING_IF_EXISTS;
+  using ::vwprintf _LIBCPP_USING_IF_EXISTS;
+  using ::vwscanf _LIBCPP_USING_IF_EXISTS;
+  using ::wcscat _LIBCPP_USING_IF_EXISTS;
+  using ::wcschr _LIBCPP_USING_IF_EXISTS;
+  using ::wcscmp _LIBCPP_USING_IF_EXISTS;
+  using ::wcscoll _LIBCPP_USING_IF_EXISTS;
+  using ::wcscpy _LIBCPP_USING_IF_EXISTS;
+  using ::wcscspn _LIBCPP_USING_IF_EXISTS;
+  using ::wcsftime _LIBCPP_USING_IF_EXISTS;
+  using ::wcslen _LIBCPP_USING_IF_EXISTS;
+  using ::wcsncat _LIBCPP_USING_IF_EXISTS;
+  using ::wcsncmp _LIBCPP_USING_IF_EXISTS;
+  using ::wcsncpy _LIBCPP_USING_IF_EXISTS;
+  using ::wcspbrk _LIBCPP_USING_IF_EXISTS;
+  using ::wcsrchr _LIBCPP_USING_IF_EXISTS;
+  using ::wcsspn _LIBCPP_USING_IF_EXISTS;
+  using ::wcsstr _LIBCPP_USING_IF_EXISTS;
+  using ::wcstod _LIBCPP_USING_IF_EXISTS;
+  using ::wcstof _LIBCPP_USING_IF_EXISTS;
+  using ::wcstok _LIBCPP_USING_IF_EXISTS;
+  using ::wcstol _LIBCPP_USING_IF_EXISTS;
+  using ::wcstold _LIBCPP_USING_IF_EXISTS;
+  using ::wcstoll _LIBCPP_USING_IF_EXISTS;
+  using ::wcstoul _LIBCPP_USING_IF_EXISTS;
+  using ::wcstoull _LIBCPP_USING_IF_EXISTS;
+  using ::wcsxfrm _LIBCPP_USING_IF_EXISTS;
+  using ::wctob _LIBCPP_USING_IF_EXISTS;
+  using ::wmemchr _LIBCPP_USING_IF_EXISTS;
+  using ::wmemcmp _LIBCPP_USING_IF_EXISTS;
+  using ::wmemcpy _LIBCPP_USING_IF_EXISTS;
+  using ::wmemmove _LIBCPP_USING_IF_EXISTS;
+  using ::wmemset _LIBCPP_USING_IF_EXISTS;
+  using ::wprintf _LIBCPP_USING_IF_EXISTS;
+  using ::wscanf _LIBCPP_USING_IF_EXISTS;
 
   // [c.mb.wcs], multibyte / wide string and character conversion functions
-  using ::mbrlen;
-  using ::mbrtowc;
-  using ::mbsinit;
-  using ::mbsrtowcs;
-  using ::wcrtomb;
-  using ::wcsrtombs;
+  using ::mbrlen _LIBCPP_USING_IF_EXISTS;
+  using ::mbrtowc _LIBCPP_USING_IF_EXISTS;
+  using ::mbsinit _LIBCPP_USING_IF_EXISTS;
+  using ::mbsrtowcs _LIBCPP_USING_IF_EXISTS;
+  using ::wcrtomb _LIBCPP_USING_IF_EXISTS;
+  using ::wcsrtombs _LIBCPP_USING_IF_EXISTS;
 #endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
 } // export
diff --git a/libcxx/modules/std.compat/cwctype.inc b/libcxx/modules/std.compat/cwctype.inc
index 13aa2b7f3fb7..8d06eaa379ea 100644
--- a/libcxx/modules/std.compat/cwctype.inc
+++ b/libcxx/modules/std.compat/cwctype.inc
@@ -9,27 +9,27 @@
 
 export {
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
-  using ::wctrans_t;
-  using ::wctype_t;
-  using ::wint_t;
+  using ::wctrans_t _LIBCPP_USING_IF_EXISTS;
+  using ::wctype_t _LIBCPP_USING_IF_EXISTS;
+  using ::wint_t _LIBCPP_USING_IF_EXISTS;
 
-  using ::iswalnum;
-  using ::iswalpha;
-  using ::iswblank;
-  using ::iswcntrl;
-  using ::iswctype;
-  using ::iswdigit;
-  using ::iswgraph;
-  using ::iswlower;
-  using ::iswprint;
-  using ::iswpunct;
-  using ::iswspace;
-  using ::iswupper;
-  using ::iswxdigit;
-  using ::towctrans;
-  using ::towlower;
-  using ::towupper;
-  using ::wctrans;
-  using ::wctype;
+  using ::iswalnum _LIBCPP_USING_IF_EXISTS;
+  using ::iswalpha _LIBCPP_USING_IF_EXISTS;
+  using ::iswblank _LIBCPP_USING_IF_EXISTS;
+  using ::iswcntrl _LIBCPP_USING_IF_EXISTS;
+  using ::iswctype _LIBCPP_USING_IF_EXISTS;
+  using ::iswdigit _LIBCPP_USING_IF_EXISTS;
+  using ::iswgraph _LIBCPP_USING_IF_EXISTS;
+  using ::iswlower _LIBCPP_USING_IF_EXISTS;
+  using ::iswprint _LIBCPP_USING_IF_EXISTS;
+  using ::iswpunct _LIBCPP_USING_IF_EXISTS;
+  using ::iswspace _LIBCPP_USING_IF_EXISTS;
+  using ::iswupper _LIBCPP_USING_IF_EXISTS;
+  using ::iswxdigit _LIBCPP_USING_IF_EXISTS;
+  using ::towctrans _LIBCPP_USING_IF_EXISTS;
+  using ::towlower _LIBCPP_USING_IF_EXISTS;
+  using ::towupper _LIBCPP_USING_IF_EXISTS;
+  using ::wctrans _LIBCPP_USING_IF_EXISTS;
+  using ::wctype _LIBCPP_USING_IF_EXISTS;
 #endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
 } // export
diff --git a/libcxx/modules/std/atomic.inc b/libcxx/modules/std/atomic.inc
index 2b54cef863e5..d77d7a5bb0fb 100644
--- a/libcxx/modules/std/atomic.inc
+++ b/libcxx/modules/std/atomic.inc
@@ -10,132 +10,132 @@
 export namespace std {
 
   // [atomics.order], order and consistency
-  using std::memory_order;
-  using std::memory_order_acq_rel;
-  using std::memory_order_acquire;
-  using std::memory_order_consume;
-  using std::memory_order_relaxed;
-  using std::memory_order_release;
-  using std::memory_order_seq_cst;
+  using std::memory_order _LIBCPP_USING_IF_EXISTS;
+  using std::memory_order_acq_rel _LIBCPP_USING_IF_EXISTS;
+  using std::memory_order_acquire _LIBCPP_USING_IF_EXISTS;
+  using std::memory_order_consume _LIBCPP_USING_IF_EXISTS;
+  using std::memory_order_relaxed _LIBCPP_USING_IF_EXISTS;
+  using std::memory_order_release _LIBCPP_USING_IF_EXISTS;
+  using std::memory_order_seq_cst _LIBCPP_USING_IF_EXISTS;
 
-  using std::kill_dependency;
+  using std::kill_dependency _LIBCPP_USING_IF_EXISTS;
 
   // [atomics.ref.generic], class template atomic_ref
   // [atomics.ref.pointer], partial specialization for pointers
-  // using std::atomic_ref;
+  // using std::atomic_ref _LIBCPP_USING_IF_EXISTS;
 
   // [atomics.types.generic], class template atomic
-  using std::atomic;
+  using std::atomic _LIBCPP_USING_IF_EXISTS;
 
   // [atomics.nonmembers], non-member functions
-  using std::atomic_compare_exchange_strong;
-  using std::atomic_compare_exchange_strong_explicit;
-  using std::atomic_compare_exchange_weak;
-  using std::atomic_compare_exchange_weak_explicit;
-  using std::atomic_exchange;
-  using std::atomic_exchange_explicit;
-  using std::atomic_is_lock_free;
-  using std::atomic_load;
-  using std::atomic_load_explicit;
-  using std::atomic_store;
-  using std::atomic_store_explicit;
-
-  using std::atomic_fetch_add;
-  using std::atomic_fetch_add_explicit;
-  using std::atomic_fetch_and;
-  using std::atomic_fetch_and_explicit;
-  using std::atomic_fetch_or;
-  using std::atomic_fetch_or_explicit;
-  using std::atomic_fetch_sub;
-  using std::atomic_fetch_sub_explicit;
-  using std::atomic_fetch_xor;
-  using std::atomic_fetch_xor_explicit;
-  using std::atomic_notify_all;
-  using std::atomic_notify_one;
-  using std::atomic_wait;
-  using std::atomic_wait_explicit;
+  using std::atomic_compare_exchange_strong _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_compare_exchange_strong_explicit _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_compare_exchange_weak _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_compare_exchange_weak_explicit _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_exchange _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_exchange_explicit _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_is_lock_free _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_load _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_load_explicit _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_store _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_store_explicit _LIBCPP_USING_IF_EXISTS;
+
+  using std::atomic_fetch_add _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_fetch_add_explicit _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_fetch_and _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_fetch_and_explicit _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_fetch_or _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_fetch_or_explicit _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_fetch_sub _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_fetch_sub_explicit _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_fetch_xor _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_fetch_xor_explicit _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_notify_all _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_notify_one _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_wait _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_wait_explicit _LIBCPP_USING_IF_EXISTS;
 
   // [atomics.alias], type aliases
-  using std::atomic_bool;
-  using std::atomic_char;
-  using std::atomic_char16_t;
-  using std::atomic_char32_t;
+  using std::atomic_bool _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_char _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_char16_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_char32_t _LIBCPP_USING_IF_EXISTS;
 #ifndef _LIBCPP_HAS_NO_CHAR8_T
-  using std::atomic_char8_t;
+  using std::atomic_char8_t _LIBCPP_USING_IF_EXISTS;
 #endif
-  using std::atomic_int;
-  using std::atomic_llong;
-  using std::atomic_long;
-  using std::atomic_schar;
-  using std::atomic_short;
-  using std::atomic_uchar;
-  using std::atomic_uint;
-  using std::atomic_ullong;
-  using std::atomic_ulong;
-  using std::atomic_ushort;
+  using std::atomic_int _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_llong _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_long _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_schar _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_short _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uchar _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uint _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_ullong _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_ulong _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_ushort _LIBCPP_USING_IF_EXISTS;
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
-  using std::atomic_wchar_t;
+  using std::atomic_wchar_t _LIBCPP_USING_IF_EXISTS;
 #endif
 
-  using std::atomic_int16_t;
-  using std::atomic_int32_t;
-  using std::atomic_int64_t;
-  using std::atomic_int8_t;
-  using std::atomic_uint16_t;
-  using std::atomic_uint32_t;
-  using std::atomic_uint64_t;
-  using std::atomic_uint8_t;
-
-  using std::atomic_int_least16_t;
-  using std::atomic_int_least32_t;
-  using std::atomic_int_least64_t;
-  using std::atomic_int_least8_t;
-  using std::atomic_uint_least16_t;
-  using std::atomic_uint_least32_t;
-  using std::atomic_uint_least64_t;
-  using std::atomic_uint_least8_t;
-
-  using std::atomic_int_fast16_t;
-  using std::atomic_int_fast32_t;
-  using std::atomic_int_fast64_t;
-  using std::atomic_int_fast8_t;
-  using std::atomic_uint_fast16_t;
-  using std::atomic_uint_fast32_t;
-  using std::atomic_uint_fast64_t;
-  using std::atomic_uint_fast8_t;
-
-  using std::atomic_intmax_t;
-  using std::atomic_intptr_t;
-  using std::atomic_ptrdiff_t;
-  using std::atomic_size_t;
-  using std::atomic_uintmax_t;
-  using std::atomic_uintptr_t;
+  using std::atomic_int16_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_int32_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_int64_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_int8_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uint16_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uint32_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uint64_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uint8_t _LIBCPP_USING_IF_EXISTS;
+
+  using std::atomic_int_least16_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_int_least32_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_int_least64_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_int_least8_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uint_least16_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uint_least32_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uint_least64_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uint_least8_t _LIBCPP_USING_IF_EXISTS;
+
+  using std::atomic_int_fast16_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_int_fast32_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_int_fast64_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_int_fast8_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uint_fast16_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uint_fast32_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uint_fast64_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uint_fast8_t _LIBCPP_USING_IF_EXISTS;
+
+  using std::atomic_intmax_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_intptr_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_ptrdiff_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_size_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uintmax_t _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_uintptr_t _LIBCPP_USING_IF_EXISTS;
 
 #ifndef _LIBCPP_NO_LOCK_FREE_TYPES
-  using std::atomic_signed_lock_free;
-  using std::atomic_unsigned_lock_free;
+  using std::atomic_signed_lock_free _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_unsigned_lock_free _LIBCPP_USING_IF_EXISTS;
 #endif
 
   // [atomics.flag], flag type and operations
-  using std::atomic_flag;
+  using std::atomic_flag _LIBCPP_USING_IF_EXISTS;
 
-  using std::atomic_flag_clear;
-  using std::atomic_flag_clear_explicit;
-  using std::atomic_flag_test;
-  using std::atomic_flag_test_and_set;
-  using std::atomic_flag_test_and_set_explicit;
-  using std::atomic_flag_test_explicit;
+  using std::atomic_flag_clear _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_flag_clear_explicit _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_flag_test _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_flag_test_and_set _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_flag_test_and_set_explicit _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_flag_test_explicit _LIBCPP_USING_IF_EXISTS;
 
-  using std::atomic_flag_notify_all;
-  using std::atomic_flag_notify_one;
-  using std::atomic_flag_wait;
-  using std::atomic_flag_wait_explicit;
+  using std::atomic_flag_notify_all _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_flag_notify_one _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_flag_wait _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_flag_wait_explicit _LIBCPP_USING_IF_EXISTS;
 
   // [atomics.fences], fences
-  using std::atomic_signal_fence;
-  using std::atomic_thread_fence;
+  using std::atomic_signal_fence _LIBCPP_USING_IF_EXISTS;
+  using std::atomic_thread_fence _LIBCPP_USING_IF_EXISTS;
 
   // [depr.atomics.nonmembers]
-  using std::atomic_init;
+  using std::atomic_init _LIBCPP_USING_IF_EXISTS;
 
 } // namespace std
diff --git a/libcxx/modules/std/cctype.inc b/libcxx/modules/std/cctype.inc
index 075dbbe78198..43417aa15962 100644
--- a/libcxx/modules/std/cctype.inc
+++ b/libcxx/modules/std/cctype.inc
@@ -8,18 +8,18 @@
 //===----------------------------------------------------------------------===//
 
 export namespace std {
-  using std::isalnum;
-  using std::isalpha;
-  using std::isblank;
-  using std::iscntrl;
-  using std::isdigit;
-  using std::isgraph;
-  using std::islower;
-  using std::isprint;
-  using std::ispunct;
-  using std::isspace;
-  using std::isupper;
-  using std::isxdigit;
-  using std::tolower;
-  using std::toupper;
+  using std::isalnum _LIBCPP_USING_IF_EXISTS;
+  using std::isalpha _LIBCPP_USING_IF_EXISTS;
+  using std::isblank _LIBCPP_USING_IF_EXISTS;
+  using std::iscntrl _LIBCPP_USING_IF_EXISTS;
+  using std::isdigit _LIBCPP_USING_IF_EXISTS;
+  using std::isgraph _LIBCPP_USING_IF_EXISTS;
+  using std::islower _LIBCPP_USING_IF_EXISTS;
+  using std::isprint _LIBCPP_USING_IF_EXISTS;
+  using std::ispunct _LIBCPP_USING_IF_EXISTS;
+  using std::isspace _LIBCPP_USING_IF_EXISTS;
+  using std::isupper _LIBCPP_USING_IF_EXISTS;
+  using std::isxdigit _LIBCPP_USING_IF_EXISTS;
+  using std::tolower _LIBCPP_USING_IF_EXISTS;
+  using std::toupper _LIBCPP_USING_IF_EXISTS;
 } // namespace std
diff --git a/libcxx/modules/std/cfenv.inc b/libcxx/modules/std/cfenv.inc
index 34a8a7afa846..831c1fed8ebe 100644
--- a/libcxx/modules/std/cfenv.inc
+++ b/libcxx/modules/std/cfenv.inc
@@ -9,22 +9,22 @@
 
 export namespace std {
   // types
-  using std::fenv_t;
-  using std::fexcept_t;
+  using std::fenv_t _LIBCPP_USING_IF_EXISTS;
+  using std::fexcept_t _LIBCPP_USING_IF_EXISTS;
 
   // functions
-  using std::feclearexcept;
-  using std::fegetexceptflag;
-  using std::feraiseexcept;
-  using std::fesetexceptflag;
-  using std::fetestexcept;
+  using std::feclearexcept _LIBCPP_USING_IF_EXISTS;
+  using std::fegetexceptflag _LIBCPP_USING_IF_EXISTS;
+  using std::feraiseexcept _LIBCPP_USING_IF_EXISTS;
+  using std::fesetexceptflag _LIBCPP_USING_IF_EXISTS;
+  using std::fetestexcept _LIBCPP_USING_IF_EXISTS;
 
-  using std::fegetround;
-  using std::fesetround;
+  using std::fegetround _LIBCPP_USING_IF_EXISTS;
+  using std::fesetround _LIBCPP_USING_IF_EXISTS;
 
-  using std::fegetenv;
-  using std::feholdexcept;
-  using std::fesetenv;
-  using std::feupdateenv;
+  using std::fegetenv _LIBCPP_USING_IF_EXISTS;
+  using std::feholdexcept _LIBCPP_USING_IF_EXISTS;
+  using std::fesetenv _LIBCPP_USING_IF_EXISTS;
+  using std::feupdateenv _LIBCPP_USING_IF_EXISTS;
 
 } // namespace std
diff --git a/libcxx/modules/std/chrono.inc b/libcxx/modules/std/chrono.inc
index 1265e21dc54e..813322a1797f 100644
--- a/libcxx/modules/std/chrono.inc
+++ b/libcxx/modules/std/chrono.inc
@@ -190,10 +190,11 @@ export namespace std {
     using std::chrono::make12;
     using std::chrono::make24;
 
-#if !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&                              \
-    !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#ifdef _LIBCPP_ENABLE_EXPERIMENTAL
+
+#  if !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&                            \
+      !defined(_LIBCPP_HAS_NO_LOCALIZATION)
 
-#  ifdef _LIBCPP_ENABLE_EXPERIMENTAL
     // [time.zone.db], time zone database
     using std::chrono::tzdb;
     using std::chrono::tzdb_list;
@@ -213,11 +214,16 @@ export namespace std {
     using std::chrono::ambiguous_local_time;
     using std::chrono::nonexistent_local_time;
 #    endif // if 0
+#  endif   //  !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&
+           //  !defined(_LIBCPP_HAS_NO_LOCALIZATION)
 
     // [time.zone.info], information classes
     using std::chrono::local_info;
     using std::chrono::sys_info;
 
+#  if !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&                            \
+      !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+
 #    if 0
     // [time.zone.timezone], class time_zone
     using std::chrono::choose;
@@ -246,9 +252,9 @@ export namespace std {
     // [time.format], formatting
     using std::chrono::local_time_format;
 #    endif
-#  endif // _LIBCPP_ENABLE_EXPERIMENTAL
-#endif   //  !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&
-         //    !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#  endif //  !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) &&
+         //  !defined(_LIBCPP_HAS_NO_LOCALIZATION)
+#endif   // _LIBCPP_ENABLE_EXPERIMENTAL
 
   } // namespace chrono
 
diff --git a/libcxx/modules/std/cinttypes.inc b/libcxx/modules/std/cinttypes.inc
index 2e1359265986..b2b98c973e98 100644
--- a/libcxx/modules/std/cinttypes.inc
+++ b/libcxx/modules/std/cinttypes.inc
@@ -8,14 +8,14 @@
 //===----------------------------------------------------------------------===//
 
 export namespace std {
-  using std::imaxdiv_t;
+  using std::imaxdiv_t _LIBCPP_USING_IF_EXISTS;
 
-  using std::imaxabs;
-  using std::imaxdiv;
-  using std::strtoimax;
-  using std::strtoumax;
-  using std::wcstoimax;
-  using std::wcstoumax;
+  using std::imaxabs _LIBCPP_USING_IF_EXISTS;
+  using std::imaxdiv _LIBCPP_USING_IF_EXISTS;
+  using std::strtoimax _LIBCPP_USING_IF_EXISTS;
+  using std::strtoumax _LIBCPP_USING_IF_EXISTS;
+  using std::wcstoimax _LIBCPP_USING_IF_EXISTS;
+  using std::wcstoumax _LIBCPP_USING_IF_EXISTS;
 
   // abs is conditionally here, but always present in cmath.cppm. To avoid
   // conflicing declarations omit the using here.
diff --git a/libcxx/modules/std/clocale.inc b/libcxx/modules/std/clocale.inc
index 3efe1595dc05..359868a61eb9 100644
--- a/libcxx/modules/std/clocale.inc
+++ b/libcxx/modules/std/clocale.inc
@@ -9,9 +9,9 @@
 
 export namespace std {
 #ifndef _LIBCPP_HAS_NO_LOCALIZATION
-  using std::lconv;
+  using std::lconv _LIBCPP_USING_IF_EXISTS;
 
-  using std::localeconv;
-  using std::setlocale;
+  using std::localeconv _LIBCPP_USING_IF_EXISTS;
+  using std::setlocale _LIBCPP_USING_IF_EXISTS;
 #endif // _LIBCPP_HAS_NO_LOCALIZATION
 } // namespace std
diff --git a/libcxx/modules/std/cmath.inc b/libcxx/modules/std/cmath.inc
index 0fe887447ad8..a463c1e3ccf8 100644
--- a/libcxx/modules/std/cmath.inc
+++ b/libcxx/modules/std/cmath.inc
@@ -9,258 +9,258 @@
 
 export namespace std {
 
-  using std::double_t;
-  using std::float_t;
+  using std::double_t _LIBCPP_USING_IF_EXISTS;
+  using std::float_t _LIBCPP_USING_IF_EXISTS;
 
-  using std::acos;
-  using std::acosf;
-  using std::acosl;
+  using std::acos _LIBCPP_USING_IF_EXISTS;
+  using std::acosf _LIBCPP_USING_IF_EXISTS;
+  using std::acosl _LIBCPP_USING_IF_EXISTS;
 
-  using std::asin;
-  using std::asinf;
-  using std::asinl;
+  using std::asin _LIBCPP_USING_IF_EXISTS;
+  using std::asinf _LIBCPP_USING_IF_EXISTS;
+  using std::asinl _LIBCPP_USING_IF_EXISTS;
 
-  using std::atan;
-  using std::atanf;
-  using std::atanl;
+  using std::atan _LIBCPP_USING_IF_EXISTS;
+  using std::atanf _LIBCPP_USING_IF_EXISTS;
+  using std::atanl _LIBCPP_USING_IF_EXISTS;
 
-  using std::atan2;
-  using std::atan2f;
-  using std::atan2l;
+  using std::atan2 _LIBCPP_USING_IF_EXISTS;
+  using std::atan2f _LIBCPP_USING_IF_EXISTS;
+  using std::atan2l _LIBCPP_USING_IF_EXISTS;
 
-  using std::cos;
-  using std::cosf;
-  using std::cosl;
+  using std::cos _LIBCPP_USING_IF_EXISTS;
+  using std::cosf _LIBCPP_USING_IF_EXISTS;
+  using std::cosl _LIBCPP_USING_IF_EXISTS;
 
-  using std::sin;
-  using std::sinf;
-  using std::sinl;
+  using std::sin _LIBCPP_USING_IF_EXISTS;
+  using std::sinf _LIBCPP_USING_IF_EXISTS;
+  using std::sinl _LIBCPP_USING_IF_EXISTS;
 
-  using std::tan;
-  using std::tanf;
-  using std::tanl;
+  using std::tan _LIBCPP_USING_IF_EXISTS;
+  using std::tanf _LIBCPP_USING_IF_EXISTS;
+  using std::tanl _LIBCPP_USING_IF_EXISTS;
 
-  using std::acosh;
-  using std::acoshf;
-  using std::acoshl;
+  using std::acosh _LIBCPP_USING_IF_EXISTS;
+  using std::acoshf _LIBCPP_USING_IF_EXISTS;
+  using std::acoshl _LIBCPP_USING_IF_EXISTS;
 
-  using std::asinh;
-  using std::asinhf;
-  using std::asinhl;
+  using std::asinh _LIBCPP_USING_IF_EXISTS;
+  using std::asinhf _LIBCPP_USING_IF_EXISTS;
+  using std::asinhl _LIBCPP_USING_IF_EXISTS;
 
-  using std::atanh;
-  using std::atanhf;
-  using std::atanhl;
+  using std::atanh _LIBCPP_USING_IF_EXISTS;
+  using std::atanhf _LIBCPP_USING_IF_EXISTS;
+  using std::atanhl _LIBCPP_USING_IF_EXISTS;
 
-  using std::cosh;
-  using std::coshf;
-  using std::coshl;
+  using std::cosh _LIBCPP_USING_IF_EXISTS;
+  using std::coshf _LIBCPP_USING_IF_EXISTS;
+  using std::coshl _LIBCPP_USING_IF_EXISTS;
 
-  using std::sinh;
-  using std::sinhf;
-  using std::sinhl;
+  using std::sinh _LIBCPP_USING_IF_EXISTS;
+  using std::sinhf _LIBCPP_USING_IF_EXISTS;
+  using std::sinhl _LIBCPP_USING_IF_EXISTS;
 
-  using std::tanh;
-  using std::tanhf;
-  using std::tanhl;
+  using std::tanh _LIBCPP_USING_IF_EXISTS;
+  using std::tanhf _LIBCPP_USING_IF_EXISTS;
+  using std::tanhl _LIBCPP_USING_IF_EXISTS;
 
-  using std::exp;
-  using std::expf;
-  using std::expl;
+  using std::exp _LIBCPP_USING_IF_EXISTS;
+  using std::expf _LIBCPP_USING_IF_EXISTS;
+  using std::expl _LIBCPP_USING_IF_EXISTS;
 
-  using std::exp2;
-  using std::exp2f;
-  using std::exp2l;
+  using std::exp2 _LIBCPP_USING_IF_EXISTS;
+  using std::exp2f _LIBCPP_USING_IF_EXISTS;
+  using std::exp2l _LIBCPP_USING_IF_EXISTS;
 
-  using std::expm1;
-  using std::expm1f;
-  using std::expm1l;
+  using std::expm1 _LIBCPP_USING_IF_EXISTS;
+  using std::expm1f _LIBCPP_USING_IF_EXISTS;
+  using std::expm1l _LIBCPP_USING_IF_EXISTS;
 
-  using std::frexp;
-  using std::frexpf;
-  using std::frexpl;
+  using std::frexp _LIBCPP_USING_IF_EXISTS;
+  using std::frexpf _LIBCPP_USING_IF_EXISTS;
+  using std::frexpl _LIBCPP_USING_IF_EXISTS;
 
-  using std::ilogb;
-  using std::ilogbf;
-  using std::ilogbl;
+  using std::ilogb _LIBCPP_USING_IF_EXISTS;
+  using std::ilogbf _LIBCPP_USING_IF_EXISTS;
+  using std::ilogbl _LIBCPP_USING_IF_EXISTS;
 
-  using std::ldexp;
-  using std::ldexpf;
-  using std::ldexpl;
+  using std::ldexp _LIBCPP_USING_IF_EXISTS;
+  using std::ldexpf _LIBCPP_USING_IF_EXISTS;
+  using std::ldexpl _LIBCPP_USING_IF_EXISTS;
 
-  using std::log;
-  using std::logf;
-  using std::logl;
+  using std::log _LIBCPP_USING_IF_EXISTS;
+  using std::logf _LIBCPP_USING_IF_EXISTS;
+  using std::logl _LIBCPP_USING_IF_EXISTS;
 
-  using std::log10;
-  using std::log10f;
-  using std::log10l;
+  using std::log10 _LIBCPP_USING_IF_EXISTS;
+  using std::log10f _LIBCPP_USING_IF_EXISTS;
+  using std::log10l _LIBCPP_USING_IF_EXISTS;
 
-  using std::log1p;
-  using std::log1pf;
-  using std::log1pl;
+  using std::log1p _LIBCPP_USING_IF_EXISTS;
+  using std::log1pf _LIBCPP_USING_IF_EXISTS;
+  using std::log1pl _LIBCPP_USING_IF_EXISTS;
 
-  using std::log2;
-  using std::log2f;
-  using std::log2l;
+  using std::log2 _LIBCPP_USING_IF_EXISTS;
+  using std::log2f _LIBCPP_USING_IF_EXISTS;
+  using std::log2l _LIBCPP_USING_IF_EXISTS;
 
-  using std::logb;
-  using std::logbf;
-  using std::logbl;
+  using std::logb _LIBCPP_USING_IF_EXISTS;
+  using std::logbf _LIBCPP_USING_IF_EXISTS;
+  using std::logbl _LIBCPP_USING_IF_EXISTS;
 
-  using std::modf;
-  using std::modff;
-  using std::modfl;
+  using std::modf _LIBCPP_USING_IF_EXISTS;
+  using std::modff _LIBCPP_USING_IF_EXISTS;
+  using std::modfl _LIBCPP_USING_IF_EXISTS;
 
-  using std::scalbn;
-  using std::scalbnf;
-  using std::scalbnl;
+  using std::scalbn _LIBCPP_USING_IF_EXISTS;
+  using std::scalbnf _LIBCPP_USING_IF_EXISTS;
+  using std::scalbnl _LIBCPP_USING_IF_EXISTS;
 
-  using std::scalbln;
-  using std::scalblnf;
-  using std::scalblnl;
+  using std::scalbln _LIBCPP_USING_IF_EXISTS;
+  using std::scalblnf _LIBCPP_USING_IF_EXISTS;
+  using std::scalblnl _LIBCPP_USING_IF_EXISTS;
 
-  using std::cbrt;
-  using std::cbrtf;
-  using std::cbrtl;
+  using std::cbrt _LIBCPP_USING_IF_EXISTS;
+  using std::cbrtf _LIBCPP_USING_IF_EXISTS;
+  using std::cbrtl _LIBCPP_USING_IF_EXISTS;
 
   // [c.math.abs], absolute values
-  using std::abs;
+  using std::abs _LIBCPP_USING_IF_EXISTS;
 
-  using std::fabs;
-  using std::fabsf;
-  using std::fabsl;
+  using std::fabs _LIBCPP_USING_IF_EXISTS;
+  using std::fabsf _LIBCPP_USING_IF_EXISTS;
+  using std::fabsl _LIBCPP_USING_IF_EXISTS;
 
-  using std::hypot;
-  using std::hypotf;
-  using std::hypotl;
+  using std::hypot _LIBCPP_USING_IF_EXISTS;
+  using std::hypotf _LIBCPP_USING_IF_EXISTS;
+  using std::hypotl _LIBCPP_USING_IF_EXISTS;
 
   // [c.math.hypot3], three-dimensional hypotenuse
 
-  using std::pow;
-  using std::powf;
-  using std::powl;
+  using std::pow _LIBCPP_USING_IF_EXISTS;
+  using std::powf _LIBCPP_USING_IF_EXISTS;
+  using std::powl _LIBCPP_USING_IF_EXISTS;
 
-  using std::sqrt;
-  using std::sqrtf;
-  using std::sqrtl;
+  using std::sqrt _LIBCPP_USING_IF_EXISTS;
+  using std::sqrtf _LIBCPP_USING_IF_EXISTS;
+  using std::sqrtl _LIBCPP_USING_IF_EXISTS;
 
-  using std::erf;
-  using std::erff;
-  using std::erfl;
+  using std::erf _LIBCPP_USING_IF_EXISTS;
+  using std::erff _LIBCPP_USING_IF_EXISTS;
+  using std::erfl _LIBCPP_USING_IF_EXISTS;
 
-  using std::erfc;
-  using std::erfcf;
-  using std::erfcl;
+  using std::erfc _LIBCPP_USING_IF_EXISTS;
+  using std::erfcf _LIBCPP_USING_IF_EXISTS;
+  using std::erfcl _LIBCPP_USING_IF_EXISTS;
 
-  using std::lgamma;
-  using std::lgammaf;
-  using std::lgammal;
+  using std::lgamma _LIBCPP_USING_IF_EXISTS;
+  using std::lgammaf _LIBCPP_USING_IF_EXISTS;
+  using std::lgammal _LIBCPP_USING_IF_EXISTS;
 
-  using std::tgamma;
-  using std::tgammaf;
-  using std::tgammal;
+  using std::tgamma _LIBCPP_USING_IF_EXISTS;
+  using std::tgammaf _LIBCPP_USING_IF_EXISTS;
+  using std::tgammal _LIBCPP_USING_IF_EXISTS;
 
-  using std::ceil;
-  using std::ceilf;
-  using std::ceill;
+  using std::ceil _LIBCPP_USING_IF_EXISTS;
+  using std::ceilf _LIBCPP_USING_IF_EXISTS;
+  using std::ceill _LIBCPP_USING_IF_EXISTS;
 
-  using std::floor;
-  using std::floorf;
-  using std::floorl;
+  using std::floor _LIBCPP_USING_IF_EXISTS;
+  using std::floorf _LIBCPP_USING_IF_EXISTS;
+  using std::floorl _LIBCPP_USING_IF_EXISTS;
 
-  using std::nearbyint;
-  using std::nearbyintf;
-  using std::nearbyintl;
+  using std::nearbyint _LIBCPP_USING_IF_EXISTS;
+  using std::nearbyintf _LIBCPP_USING_IF_EXISTS;
+  using std::nearbyintl _LIBCPP_USING_IF_EXISTS;
 
-  using std::rint;
-  using std::rintf;
-  using std::rintl;
+  using std::rint _LIBCPP_USING_IF_EXISTS;
+  using std::rintf _LIBCPP_USING_IF_EXISTS;
+  using std::rintl _LIBCPP_USING_IF_EXISTS;
 
-  using std::lrint;
-  using std::lrintf;
-  using std::lrintl;
+  using std::lrint _LIBCPP_USING_IF_EXISTS;
+  using std::lrintf _LIBCPP_USING_IF_EXISTS;
+  using std::lrintl _LIBCPP_USING_IF_EXISTS;
 
-  using std::llrint;
-  using std::llrintf;
-  using std::llrintl;
+  using std::llrint _LIBCPP_USING_IF_EXISTS;
+  using std::llrintf _LIBCPP_USING_IF_EXISTS;
+  using std::llrintl _LIBCPP_USING_IF_EXISTS;
 
-  using std::round;
-  using std::roundf;
-  using std::roundl;
+  using std::round _LIBCPP_USING_IF_EXISTS;
+  using std::roundf _LIBCPP_USING_IF_EXISTS;
+  using std::roundl _LIBCPP_USING_IF_EXISTS;
 
-  using std::lround;
-  using std::lroundf;
-  using std::lroundl;
+  using std::lround _LIBCPP_USING_IF_EXISTS;
+  using std::lroundf _LIBCPP_USING_IF_EXISTS;
+  using std::lroundl _LIBCPP_USING_IF_EXISTS;
 
-  using std::llround;
-  using std::llroundf;
-  using std::llroundl;
+  using std::llround _LIBCPP_USING_IF_EXISTS;
+  using std::llroundf _LIBCPP_USING_IF_EXISTS;
+  using std::llroundl _LIBCPP_USING_IF_EXISTS;
 
-  using std::trunc;
-  using std::truncf;
-  using std::truncl;
+  using std::trunc _LIBCPP_USING_IF_EXISTS;
+  using std::truncf _LIBCPP_USING_IF_EXISTS;
+  using std::truncl _LIBCPP_USING_IF_EXISTS;
 
-  using std::fmod;
-  using std::fmodf;
-  using std::fmodl;
+  using std::fmod _LIBCPP_USING_IF_EXISTS;
+  using std::fmodf _LIBCPP_USING_IF_EXISTS;
+  using std::fmodl _LIBCPP_USING_IF_EXISTS;
 
-  using std::remainder;
-  using std::remainderf;
-  using std::remainderl;
+  using std::remainder _LIBCPP_USING_IF_EXISTS;
+  using std::remainderf _LIBCPP_USING_IF_EXISTS;
+  using std::remainderl _LIBCPP_USING_IF_EXISTS;
 
-  using std::remquo;
-  using std::remquof;
-  using std::remquol;
+  using std::remquo _LIBCPP_USING_IF_EXISTS;
+  using std::remquof _LIBCPP_USING_IF_EXISTS;
+  using std::remquol _LIBCPP_USING_IF_EXISTS;
 
-  using std::copysign;
-  using std::copysignf;
-  using std::copysignl;
+  using std::copysign _LIBCPP_USING_IF_EXISTS;
+  using std::copysignf _LIBCPP_USING_IF_EXISTS;
+  using std::copysignl _LIBCPP_USING_IF_EXISTS;
 
-  using std::nan;
-  using std::nanf;
-  using std::nanl;
+  using std::nan _LIBCPP_USING_IF_EXISTS;
+  using std::nanf _LIBCPP_USING_IF_EXISTS;
+  using std::nanl _LIBCPP_USING_IF_EXISTS;
 
-  using std::nextafter;
-  using std::nextafterf;
-  using std::nextafterl;
+  using std::nextafter _LIBCPP_USING_IF_EXISTS;
+  using std::nextafterf _LIBCPP_USING_IF_EXISTS;
+  using std::nextafterl _LIBCPP_USING_IF_EXISTS;
 
-  using std::nexttoward;
-  using std::nexttowardf;
-  using std::nexttowardl;
+  using std::nexttoward _LIBCPP_USING_IF_EXISTS;
+  using std::nexttowardf _LIBCPP_USING_IF_EXISTS;
+  using std::nexttowardl _LIBCPP_USING_IF_EXISTS;
 
-  using std::fdim;
-  using std::fdimf;
-  using std::fdiml;
+  using std::fdim _LIBCPP_USING_IF_EXISTS;
+  using std::fdimf _LIBCPP_USING_IF_EXISTS;
+  using std::fdiml _LIBCPP_USING_IF_EXISTS;
 
-  using std::fmax;
-  using std::fmaxf;
-  using std::fmaxl;
+  using std::fmax _LIBCPP_USING_IF_EXISTS;
+  using std::fmaxf _LIBCPP_USING_IF_EXISTS;
+  using std::fmaxl _LIBCPP_USING_IF_EXISTS;
 
-  using std::fmin;
-  using std::fminf;
-  using std::fminl;
+  using std::fmin _LIBCPP_USING_IF_EXISTS;
+  using std::fminf _LIBCPP_USING_IF_EXISTS;
+  using std::fminl _LIBCPP_USING_IF_EXISTS;
 
-  using std::fma;
-  using std::fmaf;
-  using std::fmal;
+  using std::fma _LIBCPP_USING_IF_EXISTS;
+  using std::fmaf _LIBCPP_USING_IF_EXISTS;
+  using std::fmal _LIBCPP_USING_IF_EXISTS;
 
   // [c.math.lerp], linear interpolation
-  using std::lerp;
+  using std::lerp _LIBCPP_USING_IF_EXISTS;
 
   // [c.math.fpclass], classification / comparison functions
-  using std::fpclassify;
-  using std::isfinite;
-  using std::isgreater;
-  using std::isgreaterequal;
-  using std::isinf;
-  using std::isless;
-  using std::islessequal;
-  using std::islessgreater;
-  using std::isnan;
-  using std::isnormal;
-  using std::isunordered;
-  using std::signbit;
+  using std::fpclassify _LIBCPP_USING_IF_EXISTS;
+  using std::isfinite _LIBCPP_USING_IF_EXISTS;
+  using std::isgreater _LIBCPP_USING_IF_EXISTS;
+  using std::isgreaterequal _LIBCPP_USING_IF_EXISTS;
+  using std::isinf _LIBCPP_USING_IF_EXISTS;
+  using std::isless _LIBCPP_USING_IF_EXISTS;
+  using std::islessequal _LIBCPP_USING_IF_EXISTS;
+  using std::islessgreater _LIBCPP_USING_IF_EXISTS;
+  using std::isnan _LIBCPP_USING_IF_EXISTS;
+  using std::isnormal _LIBCPP_USING_IF_EXISTS;
+  using std::isunordered _LIBCPP_USING_IF_EXISTS;
+  using std::signbit _LIBCPP_USING_IF_EXISTS;
 
   // [sf.cmath], mathematical special functions
 #if 0
diff --git a/libcxx/modules/std/csetjmp.inc b/libcxx/modules/std/csetjmp.inc
index 68e226c8b7f1..8aa2e2329291 100644
--- a/libcxx/modules/std/csetjmp.inc
+++ b/libcxx/modules/std/csetjmp.inc
@@ -8,6 +8,6 @@
 //===----------------------------------------------------------------------===//
 
 export namespace std {
-  using std::jmp_buf;
-  using std::longjmp;
+  using std::jmp_buf _LIBCPP_USING_IF_EXISTS;
+  using std::longjmp _LIBCPP_USING_IF_EXISTS;
 } // namespace std
diff --git a/libcxx/modules/std/csignal.inc b/libcxx/modules/std/csignal.inc
index b57e8edc6c09..05f3986866c7 100644
--- a/libcxx/modules/std/csignal.inc
+++ b/libcxx/modules/std/csignal.inc
@@ -8,11 +8,11 @@
 //===----------------------------------------------------------------------===//
 
 export namespace std {
-  using std::sig_atomic_t;
+  using std::sig_atomic_t _LIBCPP_USING_IF_EXISTS;
 
   // [support.signal], signal handlers
-  using std::signal;
+  using std::signal _LIBCPP_USING_IF_EXISTS;
 
-  using std::raise;
+  using std::raise _LIBCPP_USING_IF_EXISTS;
 
 } // namespace std
diff --git a/libcxx/modules/std/cstdarg.inc b/libcxx/modules/std/cstdarg.inc
index 2b7309c94ed2..5947bc2452b7 100644
--- a/libcxx/modules/std/cstdarg.inc
+++ b/libcxx/modules/std/cstdarg.inc
@@ -8,5 +8,5 @@
 //===----------------------------------------------------------------------===//
 
 export namespace std {
-  using std::va_list;
+  using std::va_list _LIBCPP_USING_IF_EXISTS;
 } // namespace std
diff --git a/libcxx/modules/std/cstddef.inc b/libcxx/modules/std/cstddef.inc
index 2b9ab0c47074..6443de892382 100644
--- a/libcxx/modules/std/cstddef.inc
+++ b/libcxx/modules/std/cstddef.inc
@@ -8,10 +8,10 @@
 //===----------------------------------------------------------------------===//
 
 export namespace std {
-  using std::max_align_t;
+  using std::max_align_t _LIBCPP_USING_IF_EXISTS;
   using std::nullptr_t;
-  using std::ptrdiff_t;
-  using std::size_t;
+  using std::ptrdiff_t _LIBCPP_USING_IF_EXISTS;
+  using std::size_t _LIBCPP_USING_IF_EXISTS;
 
   using std::byte;
 
diff --git a/libcxx/modules/std/cstdint.inc b/libcxx/modules/std/cstdint.inc
index f6de4472218d..f23b52a94526 100644
--- a/libcxx/modules/std/cstdint.inc
+++ b/libcxx/modules/std/cstdint.inc
@@ -14,17 +14,17 @@ export namespace std {
   using std::int32_t _LIBCPP_USING_IF_EXISTS;
   using std::int64_t _LIBCPP_USING_IF_EXISTS;
 
-  using std::int_fast16_t;
-  using std::int_fast32_t;
-  using std::int_fast64_t;
-  using std::int_fast8_t;
+  using std::int_fast16_t _LIBCPP_USING_IF_EXISTS;
+  using std::int_fast32_t _LIBCPP_USING_IF_EXISTS;
+  using std::int_fast64_t _LIBCPP_USING_IF_EXISTS;
+  using std::int_fast8_t _LIBCPP_USING_IF_EXISTS;
 
-  using std::int_least16_t;
-  using std::int_least32_t;
-  using std::int_least64_t;
-  using std::int_least8_t;
+  using std::int_least16_t _LIBCPP_USING_IF_EXISTS;
+  using std::int_least32_t _LIBCPP_USING_IF_EXISTS;
+  using std::int_least64_t _LIBCPP_USING_IF_EXISTS;
+  using std::int_least8_t _LIBCPP_USING_IF_EXISTS;
 
-  using std::intmax_t;
+  using std::intmax_t _LIBCPP_USING_IF_EXISTS;
 
   using std::intptr_t _LIBCPP_USING_IF_EXISTS;
 
@@ -34,17 +34,17 @@ export namespace std {
   using std::uint32_t _LIBCPP_USING_IF_EXISTS;
   using std::uint64_t _LIBCPP_USING_IF_EXISTS;
 
-  using std::uint_fast16_t;
-  using std::uint_fast32_t;
-  using std::uint_fast64_t;
-  using std::uint_fast8_t;
+  using std::uint_fast16_t _LIBCPP_USING_IF_EXISTS;
+  using std::uint_fast32_t _LIBCPP_USING_IF_EXISTS;
+  using std::uint_fast64_t _LIBCPP_USING_IF_EXISTS;
+  using std::uint_fast8_t _LIBCPP_USING_IF_EXISTS;
 
-  using std::uint_least16_t;
-  using std::uint_least32_t;
-  using std::uint_least64_t;
-  using std::uint_least8_t;
+  using std::uint_least16_t _LIBCPP_USING_IF_EXISTS;
+  using std::uint_least32_t _LIBCPP_USING_IF_EXISTS;
+  using std::uint_least64_t _LIBCPP_USING_IF_EXISTS;
+  using std::uint_least8_t _LIBCPP_USING_IF_EXISTS;
 
-  using std::uintmax_t;
+  using std::uintmax_t _LIBCPP_USING_IF_EXISTS;
 
   using std::uintptr_t _LIBCPP_USING_IF_EXISTS;
 } // namespace std
diff --git a/libcxx/modules/std/cstdio.inc b/libcxx/modules/std/cstdio.inc
index eec8170e0791..62fa2f566a5f 100644
--- a/libcxx/modules/std/cstdio.inc
+++ b/libcxx/modules/std/cstdio.inc
@@ -8,53 +8,53 @@
 //===----------------------------------------------------------------------===//
 
 export namespace std {
-  using std::FILE;
-  using std::fpos_t;
-  using std::size_t;
+  using std::FILE _LIBCPP_USING_IF_EXISTS;
+  using std::fpos_t _LIBCPP_USING_IF_EXISTS;
+  using std::size_t _LIBCPP_USING_IF_EXISTS;
 
-  using std::clearerr;
-  using std::fclose;
-  using std::feof;
-  using std::ferror;
-  using std::fflush;
-  using std::fgetc;
-  using std::fgetpos;
-  using std::fgets;
-  using std::fopen;
-  using std::fprintf;
-  using std::fputc;
-  using std::fputs;
-  using std::fread;
-  using std::freopen;
-  using std::fscanf;
-  using std::fseek;
-  using std::fsetpos;
-  using std::ftell;
-  using std::fwrite;
-  using std::getc;
-  using std::getchar;
-  using std::perror;
-  using std::printf;
-  using std::putc;
-  using std::putchar;
-  using std::puts;
-  using std::remove;
-  using std::rename;
-  using std::rewind;
-  using std::scanf;
-  using std::setbuf;
-  using std::setvbuf;
-  using std::snprintf;
-  using std::sprintf;
-  using std::sscanf;
-  using std::tmpfile;
-  using std::tmpnam;
-  using std::ungetc;
-  using std::vfprintf;
-  using std::vfscanf;
-  using std::vprintf;
-  using std::vscanf;
-  using std::vsnprintf;
-  using std::vsprintf;
-  using std::vsscanf;
+  using std::clearerr _LIBCPP_USING_IF_EXISTS;
+  using std::fclose _LIBCPP_USING_IF_EXISTS;
+  using std::feof _LIBCPP_USING_IF_EXISTS;
+  using std::ferror _LIBCPP_USING_IF_EXISTS;
+  using std::fflush _LIBCPP_USING_IF_EXISTS;
+  using std::fgetc _LIBCPP_USING_IF_EXISTS;
+  using std::fgetpos _LIBCPP_USING_IF_EXISTS;
+  using std::fgets _LIBCPP_USING_IF_EXISTS;
+  using std::fopen _LIBCPP_USING_IF_EXISTS;
+  using std::fprintf _LIBCPP_USING_IF_EXISTS;
+  using std::fputc _LIBCPP_USING_IF_EXISTS;
+  using std::fputs _LIBCPP_USING_IF_EXISTS;
+  using std::fread _LIBCPP_USING_IF_EXISTS;
+  using std::freopen _LIBCPP_USING_IF_EXISTS;
+  using std::fscanf _LIBCPP_USING_IF_EXISTS;
+  using std::fseek _LIBCPP_USING_IF_EXISTS;
+  using std::fsetpos _LIBCPP_USING_IF_EXISTS;
+  using std::ftell _LIBCPP_USING_IF_EXISTS;
+  using std::fwrite _LIBCPP_USING_IF_EXISTS;
+  using std::getc _LIBCPP_USING_IF_EXISTS;
+  using std::getchar _LIBCPP_USING_IF_EXISTS;
+  using std::perror _LIBCPP_USING_IF_EXISTS;
+  using std::printf _LIBCPP_USING_IF_EXISTS;
+  using std::putc _LIBCPP_USING_IF_EXISTS;
+  using std::putchar _LIBCPP_USING_IF_EXISTS;
+  using std::puts _LIBCPP_USING_IF_EXISTS;
+  using std::remove _LIBCPP_USING_IF_EXISTS;
+  using std::rename _LIBCPP_USING_IF_EXISTS;
+  using std::rewind _LIBCPP_USING_IF_EXISTS;
+  using std::scanf _LIBCPP_USING_IF_EXISTS;
+  using std::setbuf _LIBCPP_USING_IF_EXISTS;
+  using std::setvbuf _LIBCPP_USING_IF_EXISTS;
+  using std::snprintf _LIBCPP_USING_IF_EXISTS;
+  using std::sprintf _LIBCPP_USING_IF_EXISTS;
+  using std::sscanf _LIBCPP_USING_IF_EXISTS;
+  using std::tmpfile _LIBCPP_USING_IF_EXISTS;
+  using std::tmpnam _LIBCPP_USING_IF_EXISTS;
+  using std::ungetc _LIBCPP_USING_IF_EXISTS;
+  using std::vfprintf _LIBCPP_USING_IF_EXISTS;
+  using std::vfscanf _LIBCPP_USING_IF_EXISTS;
+  using std::vprintf _LIBCPP_USING_IF_EXISTS;
+  using std::vscanf _LIBCPP_USING_IF_EXISTS;
+  using std::vsnprintf _LIBCPP_USING_IF_EXISTS;
+  using std::vsprintf _LIBCPP_USING_IF_EXISTS;
+  using std::vsscanf _LIBCPP_USING_IF_EXISTS;
 } // namespace std
diff --git a/libcxx/modules/std/cstdlib.inc b/libcxx/modules/std/cstdlib.inc
index 8840c61367c3..617cf3ff3ef6 100644
--- a/libcxx/modules/std/cstdlib.inc
+++ b/libcxx/modules/std/cstdlib.inc
@@ -8,64 +8,64 @@
 //===----------------------------------------------------------------------===//
 
 export namespace std {
-  using std::div_t;
-  using std::ldiv_t;
-  using std::lldiv_t;
-  using std::size_t;
+  using std::div_t _LIBCPP_USING_IF_EXISTS;
+  using std::ldiv_t _LIBCPP_USING_IF_EXISTS;
+  using std::lldiv_t _LIBCPP_USING_IF_EXISTS;
+  using std::size_t _LIBCPP_USING_IF_EXISTS;
 
   // [support.start.term], start and termination
-  using std::_Exit;
-  using std::abort;
-  using std::at_quick_exit;
-  using std::atexit;
-  using std::exit;
-  using std::quick_exit;
+  using std::_Exit _LIBCPP_USING_IF_EXISTS;
+  using std::abort _LIBCPP_USING_IF_EXISTS;
+  using std::at_quick_exit _LIBCPP_USING_IF_EXISTS;
+  using std::atexit _LIBCPP_USING_IF_EXISTS;
+  using std::exit _LIBCPP_USING_IF_EXISTS;
+  using std::quick_exit _LIBCPP_USING_IF_EXISTS;
 
-  using std::getenv;
-  using std::system;
+  using std::getenv _LIBCPP_USING_IF_EXISTS;
+  using std::system _LIBCPP_USING_IF_EXISTS;
 
   // [c.malloc], C library memory allocation
-  using std::aligned_alloc;
-  using std::calloc;
-  using std::free;
-  using std::malloc;
-  using std::realloc;
+  using std::aligned_alloc _LIBCPP_USING_IF_EXISTS;
+  using std::calloc _LIBCPP_USING_IF_EXISTS;
+  using std::free _LIBCPP_USING_IF_EXISTS;
+  using std::malloc _LIBCPP_USING_IF_EXISTS;
+  using std::realloc _LIBCPP_USING_IF_EXISTS;
 
-  using std::atof;
-  using std::atoi;
-  using std::atol;
-  using std::atoll;
-  using std::strtod;
-  using std::strtof;
-  using std::strtol;
-  using std::strtold;
-  using std::strtoll;
-  using std::strtoul;
-  using std::strtoull;
+  using std::atof _LIBCPP_USING_IF_EXISTS;
+  using std::atoi _LIBCPP_USING_IF_EXISTS;
+  using std::atol _LIBCPP_USING_IF_EXISTS;
+  using std::atoll _LIBCPP_USING_IF_EXISTS;
+  using std::strtod _LIBCPP_USING_IF_EXISTS;
+  using std::strtof _LIBCPP_USING_IF_EXISTS;
+  using std::strtol _LIBCPP_USING_IF_EXISTS;
+  using std::strtold _LIBCPP_USING_IF_EXISTS;
+  using std::strtoll _LIBCPP_USING_IF_EXISTS;
+  using std::strtoul _LIBCPP_USING_IF_EXISTS;
+  using std::strtoull _LIBCPP_USING_IF_EXISTS;
 
   // [c.mb.wcs], multibyte / wide string and character conversion functions
-  using std::mblen;
+  using std::mblen _LIBCPP_USING_IF_EXISTS;
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
-  using std::mbstowcs;
-  using std::mbtowc;
-  using std::wcstombs;
-  using std::wctomb;
+  using std::mbstowcs _LIBCPP_USING_IF_EXISTS;
+  using std::mbtowc _LIBCPP_USING_IF_EXISTS;
+  using std::wcstombs _LIBCPP_USING_IF_EXISTS;
+  using std::wctomb _LIBCPP_USING_IF_EXISTS;
 #endif
   // [alg.c.library], C standard library algorithms
-  using std::bsearch;
-  using std::qsort;
+  using std::bsearch _LIBCPP_USING_IF_EXISTS;
+  using std::qsort _LIBCPP_USING_IF_EXISTS;
 
   // [c.math.rand], low-quality random number generation
-  using std::rand;
-  using std::srand;
+  using std::rand _LIBCPP_USING_IF_EXISTS;
+  using std::srand _LIBCPP_USING_IF_EXISTS;
 
   // [c.math.abs], absolute values
-  using std::abs;
+  using std::abs _LIBCPP_USING_IF_EXISTS;
 
-  using std::labs;
-  using std::llabs;
+  using std::labs _LIBCPP_USING_IF_EXISTS;
+  using std::llabs _LIBCPP_USING_IF_EXISTS;
 
-  using std::div;
-  using std::ldiv;
-  using std::lldiv;
+  using std::div _LIBCPP_USING_IF_EXISTS;
+  using std::ldiv _LIBCPP_USING_IF_EXISTS;
+  using std::lldiv _LIBCPP_USING_IF_EXISTS;
 } // namespace std
diff --git a/libcxx/modules/std/cstring.inc b/libcxx/modules/std/cstring.inc
index d21714b787c8..9ad33b982b32 100644
--- a/libcxx/modules/std/cstring.inc
+++ b/libcxx/modules/std/cstring.inc
@@ -8,28 +8,28 @@
 //===----------------------------------------------------------------------===//
 
 export namespace std {
-  using std::size_t;
+  using std::size_t _LIBCPP_USING_IF_EXISTS;
 
-  using std::memchr;
-  using std::memcmp;
-  using std::memcpy;
-  using std::memmove;
-  using std::memset;
-  using std::strcat;
-  using std::strchr;
-  using std::strcmp;
-  using std::strcoll;
-  using std::strcpy;
-  using std::strcspn;
-  using std::strerror;
-  using std::strlen;
-  using std::strncat;
-  using std::strncmp;
-  using std::strncpy;
-  using std::strpbrk;
-  using std::strrchr;
-  using std::strspn;
-  using std::strstr;
-  using std::strtok;
-  using std::strxfrm;
+  using std::memchr _LIBCPP_USING_IF_EXISTS;
+  using std::memcmp _LIBCPP_USING_IF_EXISTS;
+  using std::memcpy _LIBCPP_USING_IF_EXISTS;
+  using std::memmove _LIBCPP_USING_IF_EXISTS;
+  using std::memset _LIBCPP_USING_IF_EXISTS;
+  using std::strcat _LIBCPP_USING_IF_EXISTS;
+  using std::strchr _LIBCPP_USING_IF_EXISTS;
+  using std::strcmp _LIBCPP_USING_IF_EXISTS;
+  using std::strcoll _LIBCPP_USING_IF_EXISTS;
+  using std::strcpy _LIBCPP_USING_IF_EXISTS;
+  using std::strcspn _LIBCPP_USING_IF_EXISTS;
+  using std::strerror _LIBCPP_USING_IF_EXISTS;
+  using std::strlen _LIBCPP_USING_IF_EXISTS;
+  using std::strncat _LIBCPP_USING_IF_EXISTS;
+  using std::strncmp _LIBCPP_USING_IF_EXISTS;
+  using std::strncpy _LIBCPP_USING_IF_EXISTS;
+  using std::strpbrk _LIBCPP_USING_IF_EXISTS;
+  using std::strrchr _LIBCPP_USING_IF_EXISTS;
+  using std::strspn _LIBCPP_USING_IF_EXISTS;
+  using std::strstr _LIBCPP_USING_IF_EXISTS;
+  using std::strtok _LIBCPP_USING_IF_EXISTS;
+  using std::strxfrm _LIBCPP_USING_IF_EXISTS;
 } // namespace std
diff --git a/libcxx/modules/std/ctime.inc b/libcxx/modules/std/ctime.inc
index c98cb28e649b..5bfa61917e5f 100644
--- a/libcxx/modules/std/ctime.inc
+++ b/libcxx/modules/std/ctime.inc
@@ -8,21 +8,21 @@
 //===----------------------------------------------------------------------===//
 
 export namespace std {
-  using std::clock_t;
-  using std::size_t;
-  using std::time_t;
+  using std::clock_t _LIBCPP_USING_IF_EXISTS;
+  using std::size_t _LIBCPP_USING_IF_EXISTS;
+  using std::time_t _LIBCPP_USING_IF_EXISTS;
 
-  using std::timespec;
-  using std::tm;
+  using std::timespec _LIBCPP_USING_IF_EXISTS;
+  using std::tm _LIBCPP_USING_IF_EXISTS;
 
-  using std::asctime;
-  using std::clock;
-  using std::ctime;
-  using std::difftime;
-  using std::gmtime;
-  using std::localtime;
-  using std::mktime;
-  using std::strftime;
-  using std::time;
+  using std::asctime _LIBCPP_USING_IF_EXISTS;
+  using std::clock _LIBCPP_USING_IF_EXISTS;
+  using std::ctime _LIBCPP_USING_IF_EXISTS;
+  using std::difftime _LIBCPP_USING_IF_EXISTS;
+  using std::gmtime _LIBCPP_USING_IF_EXISTS;
+  using std::localtime _LIBCPP_USING_IF_EXISTS;
+  using std::mktime _LIBCPP_USING_IF_EXISTS;
+  using std::strftime _LIBCPP_USING_IF_EXISTS;
+  using std::time _LIBCPP_USING_IF_EXISTS;
   using std::timespec_get _LIBCPP_USING_IF_EXISTS;
 } // namespace std
diff --git a/libcxx/modules/std/cwchar.inc b/libcxx/modules/std/cwchar.inc
index 6818c46b48ef..02b1713359b6 100644
--- a/libcxx/modules/std/cwchar.inc
+++ b/libcxx/modules/std/cwchar.inc
@@ -9,72 +9,72 @@
 
 export namespace std {
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
-  using std::mbstate_t;
-  using std::size_t;
-  using std::wint_t;
+  using std::mbstate_t _LIBCPP_USING_IF_EXISTS;
+  using std::size_t _LIBCPP_USING_IF_EXISTS;
+  using std::wint_t _LIBCPP_USING_IF_EXISTS;
 
-  using std::tm;
+  using std::tm _LIBCPP_USING_IF_EXISTS;
 
-  using std::btowc;
-  using std::fgetwc;
-  using std::fgetws;
-  using std::fputwc;
-  using std::fputws;
-  using std::fwide;
-  using std::fwprintf;
-  using std::fwscanf;
-  using std::getwc;
-  using std::getwchar;
-  using std::putwc;
-  using std::putwchar;
-  using std::swprintf;
-  using std::swscanf;
-  using std::ungetwc;
-  using std::vfwprintf;
-  using std::vfwscanf;
-  using std::vswprintf;
-  using std::vswscanf;
-  using std::vwprintf;
-  using std::vwscanf;
-  using std::wcscat;
-  using std::wcschr;
-  using std::wcscmp;
-  using std::wcscoll;
-  using std::wcscpy;
-  using std::wcscspn;
-  using std::wcsftime;
-  using std::wcslen;
-  using std::wcsncat;
-  using std::wcsncmp;
-  using std::wcsncpy;
-  using std::wcspbrk;
-  using std::wcsrchr;
-  using std::wcsspn;
-  using std::wcsstr;
-  using std::wcstod;
-  using std::wcstof;
-  using std::wcstok;
-  using std::wcstol;
-  using std::wcstold;
-  using std::wcstoll;
-  using std::wcstoul;
-  using std::wcstoull;
-  using std::wcsxfrm;
-  using std::wctob;
-  using std::wmemchr;
-  using std::wmemcmp;
-  using std::wmemcpy;
-  using std::wmemmove;
-  using std::wmemset;
-  using std::wprintf;
-  using std::wscanf;
+  using std::btowc _LIBCPP_USING_IF_EXISTS;
+  using std::fgetwc _LIBCPP_USING_IF_EXISTS;
+  using std::fgetws _LIBCPP_USING_IF_EXISTS;
+  using std::fputwc _LIBCPP_USING_IF_EXISTS;
+  using std::fputws _LIBCPP_USING_IF_EXISTS;
+  using std::fwide _LIBCPP_USING_IF_EXISTS;
+  using std::fwprintf _LIBCPP_USING_IF_EXISTS;
+  using std::fwscanf _LIBCPP_USING_IF_EXISTS;
+  using std::getwc _LIBCPP_USING_IF_EXISTS;
+  using std::getwchar _LIBCPP_USING_IF_EXISTS;
+  using std::putwc _LIBCPP_USING_IF_EXISTS;
+  using std::putwchar _LIBCPP_USING_IF_EXISTS;
+  using std::swprintf _LIBCPP_USING_IF_EXISTS;
+  using std::swscanf _LIBCPP_USING_IF_EXISTS;
+  using std::ungetwc _LIBCPP_USING_IF_EXISTS;
+  using std::vfwprintf _LIBCPP_USING_IF_EXISTS;
+  using std::vfwscanf _LIBCPP_USING_IF_EXISTS;
+  using std::vswprintf _LIBCPP_USING_IF_EXISTS;
+  using std::vswscanf _LIBCPP_USING_IF_EXISTS;
+  using std::vwprintf _LIBCPP_USING_IF_EXISTS;
+  using std::vwscanf _LIBCPP_USING_IF_EXISTS;
+  using std::wcscat _LIBCPP_USING_IF_EXISTS;
+  using std::wcschr _LIBCPP_USING_IF_EXISTS;
+  using std::wcscmp _LIBCPP_USING_IF_EXISTS;
+  using std::wcscoll _LIBCPP_USING_IF_EXISTS;
+  using std::wcscpy _LIBCPP_USING_IF_EXISTS;
+  using std::wcscspn _LIBCPP_USING_IF_EXISTS;
+  using std::wcsftime _LIBCPP_USING_IF_EXISTS;
+  using std::wcslen _LIBCPP_USING_IF_EXISTS;
+  using std::wcsncat _LIBCPP_USING_IF_EXISTS;
+  using std::wcsncmp _LIBCPP_USING_IF_EXISTS;
+  using std::wcsncpy _LIBCPP_USING_IF_EXISTS;
+  using std::wcspbrk _LIBCPP_USING_IF_EXISTS;
+  using std::wcsrchr _LIBCPP_USING_IF_EXISTS;
+  using std::wcsspn _LIBCPP_USING_IF_EXISTS;
+  using std::wcsstr _LIBCPP_USING_IF_EXISTS;
+  using std::wcstod _LIBCPP_USING_IF_EXISTS;
+  using std::wcstof _LIBCPP_USING_IF_EXISTS;
+  using std::wcstok _LIBCPP_USING_IF_EXISTS;
+  using std::wcstol _LIBCPP_USING_IF_EXISTS;
+  using std::wcstold _LIBCPP_USING_IF_EXISTS;
+  using std::wcstoll _LIBCPP_USING_IF_EXISTS;
+  using std::wcstoul _LIBCPP_USING_IF_EXISTS;
+  using std::wcstoull _LIBCPP_USING_IF_EXISTS;
+  using std::wcsxfrm _LIBCPP_USING_IF_EXISTS;
+  using std::wctob _LIBCPP_USING_IF_EXISTS;
+  using std::wmemchr _LIBCPP_USING_IF_EXISTS;
+  using std::wmemcmp _LIBCPP_USING_IF_EXISTS;
+  using std::wmemcpy _LIBCPP_USING_IF_EXISTS;
+  using std::wmemmove _LIBCPP_USING_IF_EXISTS;
+  using std::wmemset _LIBCPP_USING_IF_EXISTS;
+  using std::wprintf _LIBCPP_USING_IF_EXISTS;
+  using std::wscanf _LIBCPP_USING_IF_EXISTS;
 
   // [c.mb.wcs], multibyte / wide string and character conversion functions
-  using std::mbrlen;
-  using std::mbrtowc;
-  using std::mbsinit;
-  using std::mbsrtowcs;
-  using std::wcrtomb;
-  using std::wcsrtombs;
+  using std::mbrlen _LIBCPP_USING_IF_EXISTS;
+  using std::mbrtowc _LIBCPP_USING_IF_EXISTS;
+  using std::mbsinit _LIBCPP_USING_IF_EXISTS;
+  using std::mbsrtowcs _LIBCPP_USING_IF_EXISTS;
+  using std::wcrtomb _LIBCPP_USING_IF_EXISTS;
+  using std::wcsrtombs _LIBCPP_USING_IF_EXISTS;
 #endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
 } // namespace std
diff --git a/libcxx/modules/std/cwctype.inc b/libcxx/modules/std/cwctype.inc
index 70e6cf3f1133..30e526aae0af 100644
--- a/libcxx/modules/std/cwctype.inc
+++ b/libcxx/modules/std/cwctype.inc
@@ -9,27 +9,27 @@
 
 export namespace std {
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
-  using std::wctrans_t;
-  using std::wctype_t;
-  using std::wint_t;
+  using std::wctrans_t _LIBCPP_USING_IF_EXISTS;
+  using std::wctype_t _LIBCPP_USING_IF_EXISTS;
+  using std::wint_t _LIBCPP_USING_IF_EXISTS;
 
-  using std::iswalnum;
-  using std::iswalpha;
-  using std::iswblank;
-  using std::iswcntrl;
-  using std::iswctype;
-  using std::iswdigit;
-  using std::iswgraph;
-  using std::iswlower;
-  using std::iswprint;
-  using std::iswpunct;
-  using std::iswspace;
-  using std::iswupper;
-  using std::iswxdigit;
-  using std::towctrans;
-  using std::towlower;
-  using std::towupper;
-  using std::wctrans;
-  using std::wctype;
+  using std::iswalnum _LIBCPP_USING_IF_EXISTS;
+  using std::iswalpha _LIBCPP_USING_IF_EXISTS;
+  using std::iswblank _LIBCPP_USING_IF_EXISTS;
+  using std::iswcntrl _LIBCPP_USING_IF_EXISTS;
+  using std::iswctype _LIBCPP_USING_IF_EXISTS;
+  using std::iswdigit _LIBCPP_USING_IF_EXISTS;
+  using std::iswgraph _LIBCPP_USING_IF_EXISTS;
+  using std::iswlower _LIBCPP_USING_IF_EXISTS;
+  using std::iswprint _LIBCPP_USING_IF_EXISTS;
+  using std::iswpunct _LIBCPP_USING_IF_EXISTS;
+  using std::iswspace _LIBCPP_USING_IF_EXISTS;
+  using std::iswupper _LIBCPP_USING_IF_EXISTS;
+  using std::iswxdigit _LIBCPP_USING_IF_EXISTS;
+  using std::towctrans _LIBCPP_USING_IF_EXISTS;
+  using std::towlower _LIBCPP_USING_IF_EXISTS;
+  using std::towupper _LIBCPP_USING_IF_EXISTS;
+  using std::wctrans _LIBCPP_USING_IF_EXISTS;
+  using std::wctype _LIBCPP_USING_IF_EXISTS;
 #endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
 } // namespace std
diff --git a/libcxx/modules/std/ranges.inc b/libcxx/modules/std/ranges.inc
index 80f31c79a1a4..f71efe948ede 100644
--- a/libcxx/modules/std/ranges.inc
+++ b/libcxx/modules/std/ranges.inc
@@ -138,9 +138,6 @@ export namespace std {
     }
 #endif // _LIBCPP_HAS_NO_LOCALIZATION
 
-#if _LIBCPP_STD_VER >= 23
-    // [range.adaptor.object], range adaptor objects
-    using std::ranges::range_adaptor_closure;
     // Note: This declaration not in the synopsis or explicitly in the wording.
     // However it is needed for the range adaptors.
     // [range.adaptor.object]/3
@@ -151,7 +148,11 @@ export namespace std {
     //   involving an object of type cv D as an operand to the | operator is
     //   undefined if overload resolution selects a program-defined operator|
     //   function.
+    // This is used internally in C++20 mode.
     using std::ranges::operator|;
+#if _LIBCPP_STD_VER >= 23
+    // [range.adaptor.object], range adaptor objects
+    using std::ranges::range_adaptor_closure;
 #endif
 
     // [range.all], all view
diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index e0d3a0dbc400..fd57aa9fe8b3 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -1,5 +1,11 @@
 include(HandleLitArguments)
 add_subdirectory(tools)
+# When the tools add clang-tidy support, the dependencies need to be updated.
+# This cannot be done in the tools CMakeLists.txt since that does not update
+# the status in this (a parent) directory.
+if(TARGET cxx-tidy)
+  list(APPEND LIBCXX_TEST_DEPS cxx-tidy)
+endif()
 
 # By default, libcxx and libcxxabi share a library directory.
 if (NOT LIBCXX_CXX_ABI_LIBRARY_PATH)
diff --git a/libcxx/test/libcxx/clang_modules_include.gen.py b/libcxx/test/libcxx/clang_modules_include.gen.py
index 61a925823764..7ba4bf032624 100644
--- a/libcxx/test/libcxx/clang_modules_include.gen.py
+++ b/libcxx/test/libcxx/clang_modules_include.gen.py
@@ -12,35 +12,33 @@
 
 # RUN: %{python} %s %{libcxx-dir}/utils
 
+# block Lit from interpreting a RUN/XFAIL/etc inside the generation script
+# END.
+
 import sys
 sys.path.append(sys.argv[1])
 from libcxx.header_information import lit_header_restrictions, public_headers
 
-BLOCKLIT = '' # block Lit from interpreting a RUN/XFAIL/etc inside the generation script
-
 for header in public_headers:
   print(f"""\
 //--- {header}.compile.pass.cpp
-// RUN{BLOCKLIT}: %{{cxx}} %s %{{flags}} %{{compile_flags}} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only
+// RUN: %{{cxx}} %s %{{flags}} %{{compile_flags}} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only
 
 // GCC doesn't support -fcxx-modules
-// UNSUPPORTED{BLOCKLIT}: gcc
+// UNSUPPORTED: gcc
 
 // The Windows headers don't appear to be compatible with modules
-// UNSUPPORTED{BLOCKLIT}: windows
-// UNSUPPORTED{BLOCKLIT}: buildhost=windows
-
-// The AIX headers don't appear to be compatible with modules
-// UNSUPPORTED{BLOCKLIT}: LIBCXX-AIX-FIXME
+// UNSUPPORTED: windows
+// UNSUPPORTED: buildhost=windows
 
 // The Android headers don't appear to be compatible with modules yet
-// UNSUPPORTED{BLOCKLIT}: LIBCXX-ANDROID-FIXME
+// UNSUPPORTED: LIBCXX-ANDROID-FIXME
 
 // TODO: Investigate this failure
-// UNSUPPORTED{BLOCKLIT}: LIBCXX-FREEBSD-FIXME
+// UNSUPPORTED: LIBCXX-FREEBSD-FIXME
 
 // TODO: Investigate this failure
-// UNSUPPORTED{BLOCKLIT}: LIBCXX-PICOLIBC-FIXME
+// UNSUPPORTED: LIBCXX-PICOLIBC-FIXME
 
 {lit_header_restrictions.get(header, '')}
 
@@ -49,25 +47,22 @@ for header in public_headers:
 
 print(f"""\
 //--- __std_clang_module.compile.pass.mm
-// RUN{BLOCKLIT}: %{{cxx}} %s %{{flags}} %{{compile_flags}} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only
+// RUN: %{{cxx}} %s %{{flags}} %{{compile_flags}} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only
 
-// REQUIRES{BLOCKLIT}: clang-modules-build
+// REQUIRES: clang-modules-build
 
 // GCC doesn't support -fcxx-modules
-// UNSUPPORTED{BLOCKLIT}: gcc
+// UNSUPPORTED: gcc
 
 // The Windows headers don't appear to be compatible with modules
-// UNSUPPORTED{BLOCKLIT}: windows
-// UNSUPPORTED{BLOCKLIT}: buildhost=windows
-
-// The AIX headers don't appear to be compatible with modules
-// UNSUPPORTED{BLOCKLIT}: LIBCXX-AIX-FIXME
+// UNSUPPORTED: windows
+// UNSUPPORTED: buildhost=windows
 
 // The Android headers don't appear to be compatible with modules yet
-// UNSUPPORTED{BLOCKLIT}: LIBCXX-ANDROID-FIXME
+// UNSUPPORTED: LIBCXX-ANDROID-FIXME
 
 // TODO: Investigate this failure
-// UNSUPPORTED{BLOCKLIT}: LIBCXX-FREEBSD-FIXME
+// UNSUPPORTED: LIBCXX-FREEBSD-FIXME
 
 @import std;
 
diff --git a/libcxx/test/libcxx/clang_tidy.gen.py b/libcxx/test/libcxx/clang_tidy.gen.py
index 19b6a999df60..76b9db2d5cb8 100644
--- a/libcxx/test/libcxx/clang_tidy.gen.py
+++ b/libcxx/test/libcxx/clang_tidy.gen.py
@@ -10,25 +10,30 @@
 
 # RUN: %{python} %s %{libcxx-dir}/utils
 
+# block Lit from interpreting a RUN/XFAIL/etc inside the generation script
+# END.
+
 import sys
 sys.path.append(sys.argv[1])
 from libcxx.header_information import lit_header_restrictions, public_headers
 
 for header in public_headers:
-  BLOCKLIT = '' # block Lit from interpreting a RUN/XFAIL/etc inside the generation script
   print(f"""\
 //--- {header}.sh.cpp
 
-// REQUIRES{BLOCKLIT}: has-clang-tidy
+// REQUIRES: has-clang-tidy
 
 // The GCC compiler flags are not always compatible with clang-tidy.
-// UNSUPPORTED{BLOCKLIT}: gcc
+// UNSUPPORTED: gcc
+
+// Clang 17 has false positives.
+// UNSUPPORTED: clang-17
 
 {lit_header_restrictions.get(header, '')}
 
 // TODO: run clang-tidy with modules enabled once they are supported
-// RUN{BLOCKLIT}: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --checks='-*,libcpp-*' --load=%{{test-tools-dir}}/clang_tidy_checks/libcxx-tidy.plugin -- %{{compile_flags}} -fno-modules
-// RUN{BLOCKLIT}: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx-dir}}/.clang-tidy -- -Wweak-vtables %{{compile_flags}} -fno-modules
+// RUN: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --checks='-*,libcpp-*' --load=%{{test-tools-dir}}/clang_tidy_checks/libcxx-tidy.plugin -- %{{compile_flags}} -fno-modules
+// RUN: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx-dir}}/.clang-tidy -- -Wweak-vtables %{{compile_flags}} -fno-modules
 
 #include <{header}>
 """)
diff --git a/libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp b/libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp
new file mode 100644
index 000000000000..1205190b3a6e
--- /dev/null
+++ b/libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp
@@ -0,0 +1,182 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: asan
+// UNSUPPORTED: c++03
+
+#include <cassert>
+#include <string>
+#include <array>
+#include <deque>
+#include "test_macros.h"
+#include "asan_testing.h"
+#include "min_allocator.h"
+
+// This tests exists to check if strings work well with deque, as those
+// may be partialy annotated, we cannot simply call
+// is_double_ended_contiguous_container_asan_correct, as it assumes that
+// object memory inside is not annotated, so we check everything in a more careful way.
+
+template <typename D>
+void verify_inside(D const& d) {
+  for (size_t i = 0; i < d.size(); ++i) {
+    assert(is_string_asan_correct(d[i]));
+  }
+}
+
+template <typename S, size_t N>
+S get_s(char c) {
+  S s;
+  for (size_t i = 0; i < N; ++i)
+    s.push_back(c);
+
+  return s;
+}
+
+template <class C, class S>
+void test_string() {
+  size_t const N = sizeof(S) < 256 ? (4096 / sizeof(S)) : 16;
+
+  {
+    C d1a(1), d1b(N), d1c(N + 1), d1d(5 * N);
+    verify_inside(d1a);
+    verify_inside(d1b);
+    verify_inside(d1c);
+    verify_inside(d1d);
+  }
+  {
+    C d2;
+    for (size_t i = 0; i < 3 * N + 2; ++i) {
+      d2.push_back(get_s<S, 1>(i % 10 + 'a'));
+      verify_inside(d2);
+      d2.push_back(get_s<S, 22>(i % 10 + 'b'));
+      verify_inside(d2);
+
+      d2.pop_front();
+      verify_inside(d2);
+    }
+  }
+  {
+    C d3;
+    for (size_t i = 0; i < 3 * N + 2; ++i) {
+      d3.push_front(get_s<S, 1>(i % 10 + 'a'));
+      verify_inside(d3);
+      d3.push_front(get_s<S, 28>(i % 10 + 'b'));
+      verify_inside(d3);
+
+      d3.pop_back();
+      verify_inside(d3);
+    }
+  }
+  {
+    C d4;
+    for (size_t i = 0; i < 3 * N + 2; ++i) {
+      // When there is no SSO, all elements inside should not be poisoned,
+      // so we can verify deque poisoning.
+      d4.push_front(get_s<S, 33>(i % 10 + 'a'));
+      verify_inside(d4);
+      assert(is_double_ended_contiguous_container_asan_correct(d4));
+      d4.push_back(get_s<S, 28>(i % 10 + 'b'));
+      verify_inside(d4);
+      assert(is_double_ended_contiguous_container_asan_correct(d4));
+    }
+  }
+  {
+    C d5;
+    for (size_t i = 0; i < 3 * N + 2; ++i) {
+      // In d4 we never had poisoned memory inside deque.
+      // Here we start with SSO, so part of the inside of the container,
+      // will be poisoned.
+      d5.push_front(S());
+      verify_inside(d5);
+    }
+    for (size_t i = 0; i < d5.size(); ++i) {
+      // We change the size to have long string.
+      // Memory owne by deque should not be poisoned by string.
+      d5[i].resize(100);
+      verify_inside(d5);
+    }
+
+    assert(is_double_ended_contiguous_container_asan_correct(d5));
+
+    d5.erase(d5.begin() + 2);
+    verify_inside(d5);
+
+    d5.erase(d5.end() - 2);
+    verify_inside(d5);
+
+    assert(is_double_ended_contiguous_container_asan_correct(d5));
+  }
+  {
+    C d6a;
+    assert(is_double_ended_contiguous_container_asan_correct(d6a));
+
+    C d6b(N + 2, get_s<S, 100>('a'));
+    d6b.push_front(get_s<S, 101>('b'));
+    while (!d6b.empty()) {
+      d6b.pop_back();
+      assert(is_double_ended_contiguous_container_asan_correct(d6b));
+    }
+
+    C d6c(N + 2, get_s<S, 102>('c'));
+    while (!d6c.empty()) {
+      d6c.pop_back();
+      assert(is_double_ended_contiguous_container_asan_correct(d6c));
+    }
+  }
+  {
+    C d7(9 * N + 2);
+
+    d7.insert(d7.begin() + 1, S());
+    verify_inside(d7);
+
+    d7.insert(d7.end() - 3, S());
+    verify_inside(d7);
+
+    d7.insert(d7.begin() + 2 * N, get_s<S, 1>('a'));
+    verify_inside(d7);
+
+    d7.insert(d7.end() - 2 * N, get_s<S, 1>('b'));
+    verify_inside(d7);
+
+    d7.insert(d7.begin() + 2 * N, 3 * N, get_s<S, 1>('c'));
+    verify_inside(d7);
+
+    // It may not be short for big element types, but it will be checked correctly:
+    d7.insert(d7.end() - 2 * N, 3 * N, get_s<S, 2>('d'));
+    verify_inside(d7);
+
+    d7.erase(d7.begin() + 2);
+    verify_inside(d7);
+
+    d7.erase(d7.end() - 2);
+    verify_inside(d7);
+  }
+}
+
+template <class S>
+void test_container() {
+  test_string<std::deque<S, std::allocator<S>>, S>();
+  test_string<std::deque<S, min_allocator<S>>, S>();
+  test_string<std::deque<S, safe_allocator<S>>, S>();
+}
+
+int main(int, char**) {
+  // Those tests support only types based on std::basic_string.
+  test_container<std::string>();
+  test_container<std::wstring>();
+#if TEST_STD_VER >= 11
+  test_container<std::u16string>();
+  test_container<std::u32string>();
+#endif
+#if TEST_STD_VER >= 20
+  test_container<std::u8string>();
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp b/libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp
new file mode 100644
index 000000000000..53c70bed189b
--- /dev/null
+++ b/libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: asan
+// UNSUPPORTED: c++03
+
+// <string>
+
+// Basic test if ASan annotations work for short strings.
+
+#include <string>
+#include <cassert>
+#include <cstdlib>
+
+#include "asan_testing.h"
+#include "min_allocator.h"
+#include "test_iterators.h"
+#include "test_macros.h"
+
+extern "C" void __sanitizer_set_death_callback(void (*callback)(void));
+
+void do_exit() { exit(0); }
+
+int main(int, char**) {
+  {
+    typedef cpp17_input_iterator<char*> MyInputIter;
+    // Should not trigger ASan.
+    std::basic_string<char, std::char_traits<char>, safe_allocator<char>> v;
+    char i[] = {'a', 'b', 'c', 'd'};
+
+    v.insert(v.begin(), MyInputIter(i), MyInputIter(i + 4));
+    assert(v[0] == 'a');
+    assert(is_string_asan_correct(v));
+  }
+
+  __sanitizer_set_death_callback(do_exit);
+  {
+    using T     = char;
+    using C     = std::basic_string<T, std::char_traits<T>, safe_allocator<T>>;
+    const T t[] = {'a', 'b', 'c', 'd', 'e', 'f', 'g'};
+    C c(std::begin(t), std::end(t));
+    assert(is_string_asan_correct(c));
+    assert(__sanitizer_verify_contiguous_container(c.data(), c.data() + c.size() + 1, c.data() + c.capacity() + 1) !=
+           0);
+    volatile T foo = c[c.size() + 1]; // should trigger ASAN. Use volatile to prevent being optimized away.
+    assert(false);                    // if we got here, ASAN didn't trigger
+    ((void)foo);
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp b/libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp
new file mode 100644
index 000000000000..b7d95b706908
--- /dev/null
+++ b/libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp
@@ -0,0 +1,182 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: asan
+// UNSUPPORTED: c++03
+
+#include <cassert>
+#include <string>
+#include <vector>
+#include <array>
+#include "test_macros.h"
+#include "asan_testing.h"
+#include "min_allocator.h"
+
+// This tests exists to check if strings work well with vector, as those
+// may be partialy annotated, we cannot simply call
+// is_contiguous_container_asan_correct, as it assumes that
+// object memory inside is not annotated, so we check everything in a more careful way.
+
+template <typename D>
+void verify_inside(D const& d) {
+  for (size_t i = 0; i < d.size(); ++i) {
+    assert(is_string_asan_correct(d[i]));
+  }
+}
+
+template <typename S, size_t N>
+S get_s(char c) {
+  S s;
+  for (size_t i = 0; i < N; ++i)
+    s.push_back(c);
+
+  return s;
+}
+
+template <class C, class S>
+void test_string() {
+  size_t const N = sizeof(S) < 256 ? (4096 / sizeof(S)) : 16;
+
+  {
+    C d1a(1), d1b(N), d1c(N + 1), d1d(5 * N);
+    verify_inside(d1a);
+    verify_inside(d1b);
+    verify_inside(d1c);
+    verify_inside(d1d);
+  }
+  {
+    C d2;
+    for (size_t i = 0; i < 3 * N + 2; ++i) {
+      d2.push_back(get_s<S, 1>(i % 10 + 'a'));
+      verify_inside(d2);
+      d2.push_back(get_s<S, 28>(i % 10 + 'b'));
+      verify_inside(d2);
+
+      d2.erase(d2.cbegin());
+      verify_inside(d2);
+    }
+  }
+  {
+    C d3;
+    for (size_t i = 0; i < 3 * N + 2; ++i) {
+      d3.push_back(get_s<S, 1>(i % 10 + 'a'));
+      verify_inside(d3);
+      d3.push_back(get_s<S, 28>(i % 10 + 'b'));
+      verify_inside(d3);
+
+      d3.pop_back();
+      verify_inside(d3);
+    }
+  }
+  {
+    C d4;
+    for (size_t i = 0; i < 3 * N + 2; ++i) {
+      // When there is no SSO, all elements inside should not be poisoned,
+      // so we can verify vector poisoning.
+      d4.push_back(get_s<S, 33>(i % 10 + 'a'));
+      verify_inside(d4);
+      assert(is_contiguous_container_asan_correct(d4));
+      d4.push_back(get_s<S, 28>(i % 10 + 'b'));
+      verify_inside(d4);
+      assert(is_contiguous_container_asan_correct(d4));
+    }
+  }
+  {
+    C d5;
+    for (size_t i = 0; i < 3 * N + 2; ++i) {
+      // In d4 we never had poisoned memory inside vector.
+      // Here we start with SSO, so part of the inside of the container,
+      // will be poisoned.
+      d5.push_back(S());
+      verify_inside(d5);
+    }
+    for (size_t i = 0; i < d5.size(); ++i) {
+      // We change the size to have long string.
+      // Memory owne by vector should not be poisoned by string.
+      d5[i].resize(100);
+      verify_inside(d5);
+    }
+
+    assert(is_contiguous_container_asan_correct(d5));
+
+    d5.erase(d5.begin() + 2);
+    verify_inside(d5);
+
+    d5.erase(d5.end() - 2);
+    verify_inside(d5);
+
+    assert(is_contiguous_container_asan_correct(d5));
+  }
+  {
+    C d6a;
+    assert(is_contiguous_container_asan_correct(d6a));
+
+    C d6b(N + 2, get_s<S, 100>('a'));
+    d6b.push_back(get_s<S, 101>('b'));
+    while (!d6b.empty()) {
+      d6b.pop_back();
+      assert(is_contiguous_container_asan_correct(d6b));
+    }
+
+    C d6c(N + 2, get_s<S, 102>('c'));
+    while (!d6c.empty()) {
+      d6c.pop_back();
+      assert(is_contiguous_container_asan_correct(d6c));
+    }
+  }
+  {
+    C d7(9 * N + 2);
+
+    d7.insert(d7.begin() + 1, S());
+    verify_inside(d7);
+
+    d7.insert(d7.end() - 3, S());
+    verify_inside(d7);
+
+    d7.insert(d7.begin() + 2 * N, get_s<S, 1>('a'));
+    verify_inside(d7);
+
+    d7.insert(d7.end() - 2 * N, get_s<S, 1>('b'));
+    verify_inside(d7);
+
+    d7.insert(d7.begin() + 2 * N, 3 * N, get_s<S, 1>('c'));
+    verify_inside(d7);
+
+    // It may not be short for big element types, but it will be checked correctly:
+    d7.insert(d7.end() - 2 * N, 3 * N, get_s<S, 2>('d'));
+    verify_inside(d7);
+
+    d7.erase(d7.begin() + 2);
+    verify_inside(d7);
+
+    d7.erase(d7.end() - 2);
+    verify_inside(d7);
+  }
+}
+
+template <class S>
+void test_container() {
+  test_string<std::vector<S, std::allocator<S>>, S>();
+  test_string<std::vector<S, min_allocator<S>>, S>();
+  test_string<std::vector<S, safe_allocator<S>>, S>();
+}
+
+int main(int, char**) {
+  // Those tests support only types based on std::basic_string.
+  test_container<std::string>();
+  test_container<std::wstring>();
+#if TEST_STD_VER >= 11
+  test_container<std::u16string>();
+  test_container<std::u32string>();
+#endif
+#if TEST_STD_VER >= 20
+  test_container<std::u8string>();
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/double_include.gen.py b/libcxx/test/libcxx/double_include.gen.py
index 2fcfa50db693..c7cb38b8f359 100644
--- a/libcxx/test/libcxx/double_include.gen.py
+++ b/libcxx/test/libcxx/double_include.gen.py
@@ -10,20 +10,22 @@
 
 # RUN: %{python} %s %{libcxx-dir}/utils
 
+# Block Lit from interpreting a RUN/XFAIL/etc inside the generation script.
+# END.
+
 import sys
 sys.path.append(sys.argv[1])
 from libcxx.header_information import lit_header_restrictions, public_headers
 
 for header in public_headers:
-  BLOCKLIT = '' # block Lit from interpreting a RUN/XFAIL/etc inside the generation script
   print(f"""\
 //--- {header}.sh.cpp
 {lit_header_restrictions.get(header, '')}
 
-// RUN{BLOCKLIT}: %{{cxx}} -c %s -o %t.first.o %{{flags}} %{{compile_flags}}
-// RUN{BLOCKLIT}: %{{cxx}} -c %s -o %t.second.o -DWITH_MAIN %{{flags}} %{{compile_flags}}
-// RUN{BLOCKLIT}: %{{cxx}} -o %t.exe %t.first.o %t.second.o %{{flags}} %{{link_flags}}
-// RUN{BLOCKLIT}: %{{run}}
+// RUN: %{{cxx}} -c %s -o %t.first.o %{{flags}} %{{compile_flags}}
+// RUN: %{{cxx}} -c %s -o %t.second.o -DWITH_MAIN %{{flags}} %{{compile_flags}}
+// RUN: %{{cxx}} -o %t.exe %t.first.o %t.second.o %{{flags}} %{{link_flags}}
+// RUN: %{{run}}
 
 #include <{header}>
 
diff --git a/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp b/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp
index 82136ee02557..b77f6a020131 100644
--- a/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp
+++ b/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp
@@ -21,19 +21,9 @@ std::basic_fstream<char, std::char_traits<wchar_t> > f;
 // expected-error-re@ios:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
 // expected-error-re@streambuf:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
 
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@istream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@istream:* {{only virtual member functions can be marked 'override'}}
+// expected-error@*:* 11 {{only virtual member functions can be marked 'override'}}
 
 // FIXME: As of commit r324062 Clang incorrectly generates a diagnostic about mismatching
 // exception specifications for types which are already invalid for one reason or another.
 // For now we tolerate this diagnostic.
-// expected-error@ostream:* 0-1 {{exception specification of overriding function is more lax than base version}}
+// expected-error@*:* 0-1 {{exception specification of overriding function is more lax than base version}}
diff --git a/libcxx/test/libcxx/input.output/iostream.format/output.streams/traits_mismatch.verify.cpp b/libcxx/test/libcxx/input.output/iostream.format/output.streams/traits_mismatch.verify.cpp
index 5b4f7c4694e6..f947507b23a1 100644
--- a/libcxx/test/libcxx/input.output/iostream.format/output.streams/traits_mismatch.verify.cpp
+++ b/libcxx/test/libcxx/input.output/iostream.format/output.streams/traits_mismatch.verify.cpp
@@ -22,4 +22,4 @@ struct test_ostream
     : public std::basic_ostream<char, std::char_traits<wchar_t> > {};
 
 // expected-error-re@ios:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
-// expected-error@ostream:* {{only virtual member functions can be marked 'override'}}
+// expected-error@*:* {{only virtual member functions can be marked 'override'}}
diff --git a/libcxx/test/libcxx/iterators/aliasing_iterator.pass.cpp b/libcxx/test/libcxx/iterators/aliasing_iterator.pass.cpp
new file mode 100644
index 000000000000..60587d5bfe5d
--- /dev/null
+++ b/libcxx/test/libcxx/iterators/aliasing_iterator.pass.cpp
@@ -0,0 +1,45 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// ADDITIONAL_COMPILE_FLAGS(clang): -Wprivate-header
+
+#include <__iterator/aliasing_iterator.h>
+#include <cassert>
+
+struct NonTrivial {
+  int i_;
+
+  NonTrivial(int i) : i_(i) {}
+  NonTrivial(const NonTrivial& other) : i_(other.i_) {}
+
+  NonTrivial& operator=(const NonTrivial& other) {
+    i_ = other.i_;
+    return *this;
+  }
+
+  ~NonTrivial() {}
+};
+
+int main(int, char**) {
+  {
+    NonTrivial arr[] = {1, 2, 3, 4};
+    std::__aliasing_iterator<NonTrivial*, int> iter(arr);
+
+    assert(*iter == 1);
+    assert(iter[0] == 1);
+    assert(iter[1] == 2);
+    ++iter;
+    assert(*iter == 2);
+    assert(iter[-1] == 1);
+    assert(iter.__base() == arr + 1);
+    assert(iter == iter);
+    assert(iter != (iter + 1));
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/transitive_includes.gen.py b/libcxx/test/libcxx/transitive_includes.gen.py
index e4e1d3f232c1..a67cab693b6e 100644
--- a/libcxx/test/libcxx/transitive_includes.gen.py
+++ b/libcxx/test/libcxx/transitive_includes.gen.py
@@ -18,6 +18,9 @@
 
 # RUN: %{python} %s %{libcxx-dir}/utils
 
+# block Lit from interpreting a RUN/XFAIL/etc inside the generation script
+# END.
+
 import sys
 sys.path.append(sys.argv[1])
 from libcxx.header_information import lit_header_restrictions, public_headers
@@ -29,11 +32,10 @@ import re
 # for std in c++03 c++11 c++14 c++17 c++20 c++23 c++26; do <build>/bin/llvm-lit --param std=$std libcxx/test/libcxx/transitive_includes.gen.py; done
 regenerate_expected_results = False
 
-BLOCKLIT = '' # block Lit from interpreting a RUN/XFAIL/etc inside the generation script
 if regenerate_expected_results:
   print(f"""\
 //--- generate-transitive-includes.sh.cpp
-// RUN{BLOCKLIT}: mkdir %t
+// RUN: mkdir %t
 """)
 
   all_traces = []
@@ -43,12 +45,12 @@ if regenerate_expected_results:
 
     normalized_header = re.sub('/', '_', header)
     print(f"""\
-// RUN{BLOCKLIT}: echo "#include <{header}>" | %{{cxx}} -xc++ - %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.{normalized_header}.txt
+// RUN: echo "#include <{header}>" | %{{cxx}} -xc++ - %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.{normalized_header}.txt
 """)
     all_traces.append(f'%t/trace-includes.{normalized_header}.txt')
 
   print(f"""\
-// RUN{BLOCKLIT}: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py {' '.join(all_traces)} > %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv
+// RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py {' '.join(all_traces)} > %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv
 """)
 
 else:
@@ -64,27 +66,27 @@ else:
 {lit_header_restrictions.get(header, '')}
 
 // TODO: Fix this test to make it work with localization or wide characters disabled
-// UNSUPPORTED{BLOCKLIT}: no-localization, no-wide-characters, no-threads, no-filesystem, libcpp-has-no-experimental-tzdb, no-tzdb
+// UNSUPPORTED: no-localization, no-wide-characters, no-threads, no-filesystem, libcpp-has-no-experimental-tzdb, no-tzdb
 
 // When built with modules, this test doesn't work because --trace-includes doesn't
 // report the stack of includes correctly.
-// UNSUPPORTED{BLOCKLIT}: clang-modules-build
+// UNSUPPORTED: clang-modules-build
 
 // This test uses --trace-includes, which is not supported by GCC.
-// UNSUPPORTED{BLOCKLIT}: gcc
+// UNSUPPORTED: gcc
 
 // This test is not supported when we remove the transitive includes provided for backwards
 // compatibility. When we bulk-remove them, we'll adjust the includes that are expected by
 // this test instead.
-// UNSUPPORTED{BLOCKLIT}: transitive-includes-disabled
+// UNSUPPORTED: transitive-includes-disabled
 
 // TODO: Figure out why <stdatomic.h> doesn't work on FreeBSD
-// UNSUPPORTED{BLOCKLIT}: LIBCXX-FREEBSD-FIXME
+// UNSUPPORTED: LIBCXX-FREEBSD-FIXME
 
-// RUN{BLOCKLIT}: mkdir %t
-// RUN{BLOCKLIT}: %{{cxx}} %s %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.txt
-// RUN{BLOCKLIT}: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv
-// RUN{BLOCKLIT}: cat %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv | awk '/^{escaped_header} / {{ print }}' > %t/expected_transitive_includes.csv
-// RUN{BLOCKLIT}: diff -w %t/expected_transitive_includes.csv %t/actual_transitive_includes.csv
+// RUN: mkdir %t
+// RUN: %{{cxx}} %s %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.txt
+// RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv
+// RUN: cat %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv | awk '/^{escaped_header} / {{ print }}' > %t/expected_transitive_includes.csv
+// RUN: diff -w %t/expected_transitive_includes.csv %t/actual_transitive_includes.csv
 #include <{header}>
 """)
diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv
index c2250899a800..92601fab5b77 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx03.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv
@@ -401,11 +401,23 @@ iostream ostream
 iostream streambuf
 iostream version
 istream bitset
+istream cerrno
 istream concepts
 istream cstddef
+istream cstdint
+istream cstring
+istream initializer_list
+istream ios
 istream iosfwd
+istream limits
+istream locale
+istream new
 istream ostream
+istream streambuf
+istream string
+istream string_view
 istream type_traits
+istream typeinfo
 istream version
 iterator compare
 iterator concepts
@@ -558,6 +570,7 @@ numeric cstddef
 numeric cstdint
 numeric execution
 numeric functional
+numeric initializer_list
 numeric iterator
 numeric limits
 numeric new
@@ -746,12 +759,23 @@ span limits
 span stdexcept
 span type_traits
 span version
+sstream bitset
+sstream cerrno
 sstream cstddef
+sstream cstdint
+sstream cstring
+sstream initializer_list
+sstream ios
 sstream istream
+sstream limits
+sstream locale
+sstream new
 sstream ostream
+sstream streambuf
 sstream string
 sstream string_view
 sstream type_traits
+sstream typeinfo
 sstream version
 stack compare
 stack concepts
@@ -835,11 +859,13 @@ strstream istream
 strstream ostream
 strstream version
 syncstream cstddef
+syncstream ios
 syncstream iosfwd
 syncstream map
 syncstream mutex
 syncstream ostream
 syncstream shared_mutex
+syncstream streambuf
 syncstream string
 system_error cerrno
 system_error compare
diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv
index 3e929e8f9409..c05eb42deb9a 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx11.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv
@@ -404,11 +404,23 @@ iostream ostream
 iostream streambuf
 iostream version
 istream bitset
+istream cerrno
 istream concepts
 istream cstddef
+istream cstdint
+istream cstring
+istream initializer_list
+istream ios
 istream iosfwd
+istream limits
+istream locale
+istream new
 istream ostream
+istream streambuf
+istream string
+istream string_view
 istream type_traits
+istream typeinfo
 istream version
 iterator compare
 iterator concepts
@@ -563,6 +575,7 @@ numeric cstddef
 numeric cstdint
 numeric execution
 numeric functional
+numeric initializer_list
 numeric iterator
 numeric limits
 numeric new
@@ -752,12 +765,23 @@ span limits
 span stdexcept
 span type_traits
 span version
+sstream bitset
+sstream cerrno
 sstream cstddef
+sstream cstdint
+sstream cstring
+sstream initializer_list
+sstream ios
 sstream istream
+sstream limits
+sstream locale
+sstream new
 sstream ostream
+sstream streambuf
 sstream string
 sstream string_view
 sstream type_traits
+sstream typeinfo
 sstream version
 stack compare
 stack concepts
@@ -842,11 +866,13 @@ strstream istream
 strstream ostream
 strstream version
 syncstream cstddef
+syncstream ios
 syncstream iosfwd
 syncstream map
 syncstream mutex
 syncstream ostream
 syncstream shared_mutex
+syncstream streambuf
 syncstream string
 system_error cerrno
 system_error compare
diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv
index 422db19b6bb8..09252b7b7d2d 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx14.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv
@@ -407,11 +407,23 @@ iostream ostream
 iostream streambuf
 iostream version
 istream bitset
+istream cerrno
 istream concepts
 istream cstddef
+istream cstdint
+istream cstring
+istream initializer_list
+istream ios
 istream iosfwd
+istream limits
+istream locale
+istream new
 istream ostream
+istream streambuf
+istream string
+istream string_view
 istream type_traits
+istream typeinfo
 istream version
 iterator compare
 iterator concepts
@@ -566,6 +578,7 @@ numeric cstddef
 numeric cstdint
 numeric execution
 numeric functional
+numeric initializer_list
 numeric iterator
 numeric limits
 numeric new
@@ -755,12 +768,23 @@ span limits
 span stdexcept
 span type_traits
 span version
+sstream bitset
+sstream cerrno
 sstream cstddef
+sstream cstdint
+sstream cstring
+sstream initializer_list
+sstream ios
 sstream istream
+sstream limits
+sstream locale
+sstream new
 sstream ostream
+sstream streambuf
 sstream string
 sstream string_view
 sstream type_traits
+sstream typeinfo
 sstream version
 stack compare
 stack concepts
@@ -845,11 +869,13 @@ strstream istream
 strstream ostream
 strstream version
 syncstream cstddef
+syncstream ios
 syncstream iosfwd
 syncstream map
 syncstream mutex
 syncstream ostream
 syncstream shared_mutex
+syncstream streambuf
 syncstream string
 system_error cerrno
 system_error compare
diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv
index 422db19b6bb8..09252b7b7d2d 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx17.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv
@@ -407,11 +407,23 @@ iostream ostream
 iostream streambuf
 iostream version
 istream bitset
+istream cerrno
 istream concepts
 istream cstddef
+istream cstdint
+istream cstring
+istream initializer_list
+istream ios
 istream iosfwd
+istream limits
+istream locale
+istream new
 istream ostream
+istream streambuf
+istream string
+istream string_view
 istream type_traits
+istream typeinfo
 istream version
 iterator compare
 iterator concepts
@@ -566,6 +578,7 @@ numeric cstddef
 numeric cstdint
 numeric execution
 numeric functional
+numeric initializer_list
 numeric iterator
 numeric limits
 numeric new
@@ -755,12 +768,23 @@ span limits
 span stdexcept
 span type_traits
 span version
+sstream bitset
+sstream cerrno
 sstream cstddef
+sstream cstdint
+sstream cstring
+sstream initializer_list
+sstream ios
 sstream istream
+sstream limits
+sstream locale
+sstream new
 sstream ostream
+sstream streambuf
 sstream string
 sstream string_view
 sstream type_traits
+sstream typeinfo
 sstream version
 stack compare
 stack concepts
@@ -845,11 +869,13 @@ strstream istream
 strstream ostream
 strstream version
 syncstream cstddef
+syncstream ios
 syncstream iosfwd
 syncstream map
 syncstream mutex
 syncstream ostream
 syncstream shared_mutex
+syncstream streambuf
 syncstream string
 system_error cerrno
 system_error compare
diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv
index 7d31ba160ee1..ce4ccc3d1161 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx20.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv
@@ -418,11 +418,23 @@ iostream ostream
 iostream streambuf
 iostream version
 istream bitset
+istream cerrno
 istream concepts
 istream cstddef
+istream cstdint
+istream cstring
+istream initializer_list
+istream ios
 istream iosfwd
+istream limits
+istream locale
+istream new
 istream ostream
+istream streambuf
+istream string
+istream string_view
 istream type_traits
+istream typeinfo
 istream version
 iterator compare
 iterator concepts
@@ -577,6 +589,7 @@ numeric cstddef
 numeric cstdint
 numeric execution
 numeric functional
+numeric initializer_list
 numeric iterator
 numeric limits
 numeric new
@@ -766,12 +779,23 @@ span limits
 span stdexcept
 span type_traits
 span version
+sstream bitset
+sstream cerrno
 sstream cstddef
+sstream cstdint
+sstream cstring
+sstream initializer_list
+sstream ios
 sstream istream
+sstream limits
+sstream locale
+sstream new
 sstream ostream
+sstream streambuf
 sstream string
 sstream string_view
 sstream type_traits
+sstream typeinfo
 sstream version
 stack compare
 stack concepts
@@ -856,11 +880,13 @@ strstream istream
 strstream ostream
 strstream version
 syncstream cstddef
+syncstream ios
 syncstream iosfwd
 syncstream map
 syncstream mutex
 syncstream ostream
 syncstream shared_mutex
+syncstream streambuf
 syncstream string
 system_error cerrno
 system_error compare
diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv
index ea01e4134585..62d931c0eeba 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx23.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv
@@ -288,8 +288,19 @@ iostream ostream
 iostream streambuf
 iostream version
 istream bitset
+istream cerrno
 istream cstddef
-istream ostream
+istream cstdint
+istream cstring
+istream initializer_list
+istream ios
+istream limits
+istream locale
+istream new
+istream streambuf
+istream string
+istream string_view
+istream typeinfo
 istream version
 iterator compare
 iterator concepts
@@ -408,7 +419,6 @@ ostream bitset
 ostream cerrno
 ostream cstddef
 ostream cstdint
-ostream cstdio
 ostream cstring
 ostream format
 ostream initializer_list
@@ -519,11 +529,21 @@ span initializer_list
 span limits
 span stdexcept
 span version
+sstream bitset
+sstream cerrno
 sstream cstddef
+sstream cstdint
+sstream cstring
+sstream initializer_list
+sstream ios
 sstream istream
-sstream ostream
+sstream limits
+sstream locale
+sstream new
+sstream streambuf
 sstream string
 sstream string_view
+sstream typeinfo
 sstream version
 stack compare
 stack cstddef
@@ -589,11 +609,13 @@ strstream istream
 strstream ostream
 strstream version
 syncstream cstddef
+syncstream ios
 syncstream iosfwd
 syncstream map
 syncstream mutex
 syncstream ostream
 syncstream shared_mutex
+syncstream streambuf
 syncstream string
 system_error cerrno
 system_error compare
diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv
index ea01e4134585..f68249aeec78 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx26.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv
@@ -176,6 +176,29 @@ experimental/simd limits
 experimental/type_traits initializer_list
 experimental/type_traits type_traits
 experimental/utility utility
+experimental/vector experimental/memory_resource
+experimental/vector vector
+ext/hash_map algorithm
+ext/hash_map cmath
+ext/hash_map cstddef
+ext/hash_map cstdint
+ext/hash_map cstring
+ext/hash_map functional
+ext/hash_map initializer_list
+ext/hash_map limits
+ext/hash_map new
+ext/hash_map stdexcept
+ext/hash_map string
+ext/hash_set algorithm
+ext/hash_set cmath
+ext/hash_set cstddef
+ext/hash_set cstdint
+ext/hash_set cstring
+ext/hash_set functional
+ext/hash_set initializer_list
+ext/hash_set limits
+ext/hash_set new
+ext/hash_set string
 filesystem compare
 filesystem cstddef
 filesystem cstdint
@@ -288,8 +311,19 @@ iostream ostream
 iostream streambuf
 iostream version
 istream bitset
+istream cerrno
 istream cstddef
-istream ostream
+istream cstdint
+istream cstring
+istream initializer_list
+istream ios
+istream limits
+istream locale
+istream new
+istream streambuf
+istream string
+istream string_view
+istream typeinfo
 istream version
 iterator compare
 iterator concepts
@@ -408,7 +442,6 @@ ostream bitset
 ostream cerrno
 ostream cstddef
 ostream cstdint
-ostream cstdio
 ostream cstring
 ostream format
 ostream initializer_list
@@ -519,11 +552,21 @@ span initializer_list
 span limits
 span stdexcept
 span version
+sstream bitset
+sstream cerrno
 sstream cstddef
+sstream cstdint
+sstream cstring
+sstream initializer_list
+sstream ios
 sstream istream
-sstream ostream
+sstream limits
+sstream locale
+sstream new
+sstream streambuf
 sstream string
 sstream string_view
+sstream typeinfo
 sstream version
 stack compare
 stack cstddef
@@ -589,11 +632,13 @@ strstream istream
 strstream ostream
 strstream version
 syncstream cstddef
+syncstream ios
 syncstream iosfwd
 syncstream map
 syncstream mutex
 syncstream ostream
 syncstream shared_mutex
+syncstream streambuf
 syncstream string
 system_error cerrno
 system_error compare
diff --git a/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp b/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp
index 389816bb23aa..4d1a8ad9e229 100644
--- a/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp
@@ -48,6 +48,7 @@ static_assert(!std::__libcpp_is_trivially_relocatable<MoveOnlyTriviallyCopyable>
 // ----------------------
 
 // basic_string
+#if defined(_LIBCPP_HAS_NO_ASAN) || !defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
 struct MyChar {
   char c;
 };
@@ -78,7 +79,7 @@ static_assert(
     !std::__libcpp_is_trivially_relocatable<
         std::basic_string<MyChar, NotTriviallyRelocatableCharTraits<MyChar>, test_allocator<MyChar> > >::value,
     "");
-
+#endif
 // unique_ptr
 struct NotTriviallyRelocatableDeleter {
   NotTriviallyRelocatableDeleter(const NotTriviallyRelocatableDeleter&);
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp
index eb5f7cacdde3..dd37555ffcce 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp
@@ -66,14 +66,27 @@ TEST_CONSTEXPR_CXX20 void check(Container1 lhs, Container2 rhs, size_t offset) {
 #endif
 }
 
-struct NonTrivial {
+// Compares modulo 4 to make sure we only forward to the vectorized version if we are trivially equality comparable
+struct NonTrivialMod4Comp {
   int i_;
 
-  TEST_CONSTEXPR_CXX20 NonTrivial(int i) : i_(i) {}
-  TEST_CONSTEXPR_CXX20 NonTrivial(NonTrivial&& other) : i_(other.i_) { other.i_ = 0; }
+  TEST_CONSTEXPR_CXX20 NonTrivialMod4Comp(int i) : i_(i) {}
+  TEST_CONSTEXPR_CXX20 NonTrivialMod4Comp(NonTrivialMod4Comp&& other) : i_(other.i_) { other.i_ = 0; }
 
-  TEST_CONSTEXPR_CXX20 friend bool operator==(const NonTrivial& lhs, const NonTrivial& rhs) { return lhs.i_ == rhs.i_; }
+  TEST_CONSTEXPR_CXX20 friend bool operator==(const NonTrivialMod4Comp& lhs, const NonTrivialMod4Comp& rhs) {
+    return lhs.i_ % 4 == rhs.i_ % 4;
+  }
+};
+
+#if TEST_STD_VER >= 20
+struct TriviallyEqualityComparable {
+  int i_;
+
+  TEST_CONSTEXPR_CXX20 TriviallyEqualityComparable(int i) : i_(i) {}
+
+  TEST_CONSTEXPR_CXX20 friend bool operator==(TriviallyEqualityComparable, TriviallyEqualityComparable) = default;
 };
+#endif // TEST_STD_VER >= 20
 
 struct ModTwoComp {
   TEST_CONSTEXPR_CXX20 bool operator()(int lhs, int rhs) { return lhs % 2 == rhs % 2; }
@@ -136,16 +149,30 @@ TEST_CONSTEXPR_CXX20 bool test() {
   types::for_each(types::cpp17_input_iterator_list<int*>(), Test());
 
   { // use a non-integer type to also test the general case - all elements match
-    std::array<NonTrivial, 8> lhs = {1, 2, 3, 4, 5, 6, 7, 8};
-    std::array<NonTrivial, 8> rhs = {1, 2, 3, 4, 5, 6, 7, 8};
-    check<NonTrivial*>(std::move(lhs), std::move(rhs), 8);
+    std::array<NonTrivialMod4Comp, 8> lhs = {1, 2, 3, 4, 5, 6, 7, 8};
+    std::array<NonTrivialMod4Comp, 8> rhs = {1, 2, 3, 4, 1, 6, 7, 8};
+    check<NonTrivialMod4Comp*>(std::move(lhs), std::move(rhs), 8);
   }
 
   { // use a non-integer type to also test the general case - not all elements match
-    std::array<NonTrivial, 8> lhs = {1, 2, 3, 4, 7, 6, 7, 8};
-    std::array<NonTrivial, 8> rhs = {1, 2, 3, 4, 5, 6, 7, 8};
-    check<NonTrivial*>(std::move(lhs), std::move(rhs), 4);
+    std::array<NonTrivialMod4Comp, 8> lhs = {1, 2, 3, 4, 7, 6, 7, 8};
+    std::array<NonTrivialMod4Comp, 8> rhs = {1, 2, 3, 4, 5, 6, 7, 8};
+    check<NonTrivialMod4Comp*>(std::move(lhs), std::move(rhs), 4);
+  }
+
+#if TEST_STD_VER >= 20
+  { // trivially equality comparable class type to test forwarding to the vectorized version - all elements match
+    std::array<TriviallyEqualityComparable, 8> lhs = {1, 2, 3, 4, 5, 6, 7, 8};
+    std::array<TriviallyEqualityComparable, 8> rhs = {1, 2, 3, 4, 5, 6, 7, 8};
+    check<TriviallyEqualityComparable*>(std::move(lhs), std::move(rhs), 8);
+  }
+
+  { // trivially equality comparable class type to test forwarding to the vectorized version - not all elements match
+    std::array<TriviallyEqualityComparable, 8> lhs = {1, 2, 3, 4, 7, 6, 7, 8};
+    std::array<TriviallyEqualityComparable, 8> rhs = {1, 2, 3, 4, 5, 6, 7, 8};
+    check<TriviallyEqualityComparable*>(std::move(lhs), std::move(rhs), 4);
   }
+#endif // TEST_STD_VER >= 20
 
   return true;
 }
diff --git a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_add.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_add.pass.cpp
index 4119c39772e5..7350c1ddf0e9 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_add.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_add.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 // UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: LIBCXX-AIX-FIXME
 // XFAIL: !has-64-bit-atomics
 
 // https://github.com/llvm/llvm-project/issues/72893
diff --git a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_sub.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_sub.pass.cpp
index 2460765a3c86..84dcde5f2784 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_sub.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_sub.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 // UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: LIBCXX-AIX-FIXME
 // XFAIL: !has-64-bit-atomics
 
 // https://github.com/llvm/llvm-project/issues/72893
diff --git a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.minus_equals.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.minus_equals.pass.cpp
index 4bd303022c0d..386a393e3550 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.minus_equals.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.minus_equals.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 // UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: LIBCXX-AIX-FIXME
 // XFAIL: !has-64-bit-atomics
 
 // floating-point-type operator-=(floating-point-type) volatile noexcept;
diff --git a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.plus_equals.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.plus_equals.pass.cpp
index 69abb9ae63c3..afd06d537c7a 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.plus_equals.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.plus_equals.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 // UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: LIBCXX-AIX-FIXME
 // XFAIL: !has-64-bit-atomics
 
 // floating-point-type operator+=(floating-point-type) volatile noexcept;
diff --git a/libcxx/test/std/input.output/syncstream/syncbuf/helpers.h b/libcxx/test/std/input.output/syncstream/syncbuf/helpers.h
index 79fcaafa8095..523f0da6cb64 100644
--- a/libcxx/test/std/input.output/syncstream/syncbuf/helpers.h
+++ b/libcxx/test/std/input.output/syncstream/syncbuf/helpers.h
@@ -9,6 +9,7 @@
 #ifndef TEST_STD_INPUT_OUTPUT_SYNCSTREAM_SYNCBUF_SYNCSTREAM_SYNCBUF_MEMBERS_H
 #define TEST_STD_INPUT_OUTPUT_SYNCSTREAM_SYNCBUF_SYNCSTREAM_SYNCBUF_MEMBERS_H
 
+#include <streambuf>
 #include <syncstream>
 
 template <class T>
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/deque.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/deque.version.compile.pass.cpp
index 720557f33e2a..d0e4ac130c60 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/deque.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/deque.version.compile.pass.cpp
@@ -17,10 +17,10 @@
 
 /*  Constant                                                Value
     __cpp_lib_allocator_traits_is_always_equal              201411L [C++17]
+    __cpp_lib_containers_ranges                             202202L [C++23]
     __cpp_lib_default_template_type_for_algorithm_values    202403L [C++26]
     __cpp_lib_erase_if                                      202002L [C++20]
     __cpp_lib_nonmember_container_access                    201411L [C++17]
-    __cpp_lib_ranges_to_container                           202202L [C++23]
 */
 
 #include <deque>
@@ -32,6 +32,10 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -44,16 +48,16 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 14
 
 # ifdef __cpp_lib_allocator_traits_is_always_equal
 #   error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -66,10 +70,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 17
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -79,6 +79,10 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++17"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -94,10 +98,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 20
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -107,6 +107,10 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++20"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -125,10 +129,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++20"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 23
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -138,6 +138,13 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++23"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++23"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -156,13 +163,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++23"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++23"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++23"
-# endif
-
 #elif TEST_STD_VER > 23
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -172,6 +172,13 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++26"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++26"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++26"
+# endif
+
 # if !defined(_LIBCPP_VERSION)
 #   ifndef __cpp_lib_default_template_type_for_algorithm_values
 #     error "__cpp_lib_default_template_type_for_algorithm_values should be defined in c++26"
@@ -199,12 +206,5 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++26"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++26"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++26"
-# endif
-
 #endif // TEST_STD_VER > 23
 
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp
index 9305cf0c54b9..bcb606451b27 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp
@@ -17,12 +17,12 @@
 
 /*  Constant                                                Value
     __cpp_lib_allocator_traits_is_always_equal              201411L [C++17]
+    __cpp_lib_containers_ranges                             202202L [C++23]
     __cpp_lib_default_template_type_for_algorithm_values    202403L [C++26]
     __cpp_lib_erase_if                                      202002L [C++20]
     __cpp_lib_incomplete_container_elements                 201505L [C++17]
     __cpp_lib_list_remove_return_type                       201806L [C++20]
     __cpp_lib_nonmember_container_access                    201411L [C++17]
-    __cpp_lib_ranges_to_container                           202202L [C++23]
 */
 
 #include <forward_list>
@@ -34,6 +34,10 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -54,16 +58,16 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 14
 
 # ifdef __cpp_lib_allocator_traits_is_always_equal
 #   error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -84,10 +88,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 17
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -97,6 +97,10 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++17"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -123,10 +127,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 20
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -136,6 +136,10 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++20"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -168,10 +172,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++20"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 23
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -181,6 +181,13 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++23"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++23"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -213,13 +220,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++23"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++23"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++23"
-# endif
-
 #elif TEST_STD_VER > 23
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -229,6 +229,13 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++26"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++26"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++26"
+# endif
+
 # if !defined(_LIBCPP_VERSION)
 #   ifndef __cpp_lib_default_template_type_for_algorithm_values
 #     error "__cpp_lib_default_template_type_for_algorithm_values should be defined in c++26"
@@ -270,12 +277,5 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++26"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++26"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++26"
-# endif
-
 #endif // TEST_STD_VER > 23
 
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp
index aeb09a30b425..27e76e5b2b05 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp
@@ -535,17 +535,11 @@
 #   error "__cpp_lib_ranges should have the value 202207L in c++26"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_reference_wrapper
-#     error "__cpp_lib_reference_wrapper should be defined in c++26"
-#   endif
-#   if __cpp_lib_reference_wrapper != 202403L
-#     error "__cpp_lib_reference_wrapper should have the value 202403L in c++26"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_reference_wrapper
-#     error "__cpp_lib_reference_wrapper should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_reference_wrapper
+#   error "__cpp_lib_reference_wrapper should be defined in c++26"
+# endif
+# if __cpp_lib_reference_wrapper != 202403L
+#   error "__cpp_lib_reference_wrapper should have the value 202403L in c++26"
 # endif
 
 # ifndef __cpp_lib_result_of_sfinae
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp
index 12225612b80d..9cd3c3bc941c 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp
@@ -17,12 +17,12 @@
 
 /*  Constant                                                Value
     __cpp_lib_allocator_traits_is_always_equal              201411L [C++17]
+    __cpp_lib_containers_ranges                             202202L [C++23]
     __cpp_lib_default_template_type_for_algorithm_values    202403L [C++26]
     __cpp_lib_erase_if                                      202002L [C++20]
     __cpp_lib_incomplete_container_elements                 201505L [C++17]
     __cpp_lib_list_remove_return_type                       201806L [C++20]
     __cpp_lib_nonmember_container_access                    201411L [C++17]
-    __cpp_lib_ranges_to_container                           202202L [C++23]
 */
 
 #include <list>
@@ -34,6 +34,10 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -54,16 +58,16 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 14
 
 # ifdef __cpp_lib_allocator_traits_is_always_equal
 #   error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -84,10 +88,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 17
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -97,6 +97,10 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++17"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -123,10 +127,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 20
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -136,6 +136,10 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++20"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -168,10 +172,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++20"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 23
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -181,6 +181,13 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++23"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++23"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -213,13 +220,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++23"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++23"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++23"
-# endif
-
 #elif TEST_STD_VER > 23
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -229,6 +229,13 @@
 #   error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++26"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++26"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++26"
+# endif
+
 # if !defined(_LIBCPP_VERSION)
 #   ifndef __cpp_lib_default_template_type_for_algorithm_values
 #     error "__cpp_lib_default_template_type_for_algorithm_values should be defined in c++26"
@@ -270,12 +277,5 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++26"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++26"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++26"
-# endif
-
 #endif // TEST_STD_VER > 23
 
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/map.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/map.version.compile.pass.cpp
index 23a2df95dca9..4ffb72d1442e 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/map.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/map.version.compile.pass.cpp
@@ -19,12 +19,12 @@
     __cpp_lib_allocator_traits_is_always_equal       201411L [C++17]
     __cpp_lib_associative_heterogeneous_erasure      202110L [C++23]
     __cpp_lib_associative_heterogeneous_insertion    202306L [C++26]
+    __cpp_lib_containers_ranges                      202202L [C++23]
     __cpp_lib_erase_if                               202002L [C++20]
     __cpp_lib_generic_associative_lookup             201304L [C++14]
     __cpp_lib_map_try_emplace                        201411L [C++17]
     __cpp_lib_node_extract                           201606L [C++17]
     __cpp_lib_nonmember_container_access             201411L [C++17]
-    __cpp_lib_ranges_to_container                    202202L [C++23]
     __cpp_lib_tuple_like                             202207L [C++23]
                                                      202311L [C++26]
 */
@@ -46,6 +46,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should not be defined before c++20"
 # endif
@@ -66,10 +70,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 # ifdef __cpp_lib_tuple_like
 #   error "__cpp_lib_tuple_like should not be defined before c++23"
 # endif
@@ -88,6 +88,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should not be defined before c++20"
 # endif
@@ -111,10 +115,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 # ifdef __cpp_lib_tuple_like
 #   error "__cpp_lib_tuple_like should not be defined before c++23"
 # endif
@@ -136,6 +136,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should not be defined before c++20"
 # endif
@@ -168,10 +172,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 # ifdef __cpp_lib_tuple_like
 #   error "__cpp_lib_tuple_like should not be defined before c++23"
 # endif
@@ -193,6 +193,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifndef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should be defined in c++20"
 # endif
@@ -228,10 +232,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++20"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 # ifdef __cpp_lib_tuple_like
 #   error "__cpp_lib_tuple_like should not be defined before c++23"
 # endif
@@ -262,6 +262,13 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++23"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++23"
+# endif
+
 # ifndef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should be defined in c++23"
 # endif
@@ -297,13 +304,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++23"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++23"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++23"
-# endif
-
 # if !defined(_LIBCPP_VERSION)
 #   ifndef __cpp_lib_tuple_like
 #     error "__cpp_lib_tuple_like should be defined in c++23"
@@ -352,6 +352,13 @@
 #   endif
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++26"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++26"
+# endif
+
 # ifndef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should be defined in c++26"
 # endif
@@ -387,13 +394,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++26"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++26"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++26"
-# endif
-
 # if !defined(_LIBCPP_VERSION)
 #   ifndef __cpp_lib_tuple_like
 #     error "__cpp_lib_tuple_like should be defined in c++26"
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/queue.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/queue.version.compile.pass.cpp
index fdedd27bd46b..5a5739ff5e7c 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/queue.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/queue.version.compile.pass.cpp
@@ -17,7 +17,7 @@
 
 /*  Constant                                       Value
     __cpp_lib_adaptor_iterator_pair_constructor    202106L [C++23]
-    __cpp_lib_ranges_to_container                  202202L [C++23]
+    __cpp_lib_containers_ranges                    202202L [C++23]
 */
 
 #include <queue>
@@ -29,8 +29,8 @@
 #   error "__cpp_lib_adaptor_iterator_pair_constructor should not be defined before c++23"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
 # endif
 
 #elif TEST_STD_VER == 14
@@ -39,8 +39,8 @@
 #   error "__cpp_lib_adaptor_iterator_pair_constructor should not be defined before c++23"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
 # endif
 
 #elif TEST_STD_VER == 17
@@ -49,8 +49,8 @@
 #   error "__cpp_lib_adaptor_iterator_pair_constructor should not be defined before c++23"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
 # endif
 
 #elif TEST_STD_VER == 20
@@ -59,8 +59,8 @@
 #   error "__cpp_lib_adaptor_iterator_pair_constructor should not be defined before c++23"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
 # endif
 
 #elif TEST_STD_VER == 23
@@ -72,11 +72,11 @@
 #   error "__cpp_lib_adaptor_iterator_pair_constructor should have the value 202106L in c++23"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++23"
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++23"
 # endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++23"
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++23"
 # endif
 
 #elif TEST_STD_VER > 23
@@ -88,11 +88,11 @@
 #   error "__cpp_lib_adaptor_iterator_pair_constructor should have the value 202106L in c++26"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++26"
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++26"
 # endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++26"
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++26"
 # endif
 
 #endif // TEST_STD_VER > 23
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/set.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/set.version.compile.pass.cpp
index 271df95b8149..a733f17c3153 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/set.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/set.version.compile.pass.cpp
@@ -19,11 +19,11 @@
     __cpp_lib_allocator_traits_is_always_equal       201411L [C++17]
     __cpp_lib_associative_heterogeneous_erasure      202110L [C++23]
     __cpp_lib_associative_heterogeneous_insertion    202306L [C++26]
+    __cpp_lib_containers_ranges                      202202L [C++23]
     __cpp_lib_erase_if                               202002L [C++20]
     __cpp_lib_generic_associative_lookup             201304L [C++14]
     __cpp_lib_node_extract                           201606L [C++17]
     __cpp_lib_nonmember_container_access             201411L [C++17]
-    __cpp_lib_ranges_to_container                    202202L [C++23]
 */
 
 #include <set>
@@ -43,6 +43,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should not be defined before c++20"
 # endif
@@ -59,10 +63,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 14
 
 # ifdef __cpp_lib_allocator_traits_is_always_equal
@@ -77,6 +77,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should not be defined before c++20"
 # endif
@@ -96,10 +100,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 17
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -117,6 +117,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should not be defined before c++20"
 # endif
@@ -142,10 +146,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 20
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -163,6 +163,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifndef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should be defined in c++20"
 # endif
@@ -191,10 +195,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++20"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 23
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -221,6 +221,13 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++23"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++23"
+# endif
+
 # ifndef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should be defined in c++23"
 # endif
@@ -249,13 +256,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++23"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++23"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++23"
-# endif
-
 #elif TEST_STD_VER > 23
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -291,6 +291,13 @@
 #   endif
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++26"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++26"
+# endif
+
 # ifndef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should be defined in c++26"
 # endif
@@ -319,12 +326,5 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++26"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++26"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++26"
-# endif
-
 #endif // TEST_STD_VER > 23
 
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/stack.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/stack.version.compile.pass.cpp
index cc5af8a4df60..db51c4782b6c 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/stack.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/stack.version.compile.pass.cpp
@@ -17,7 +17,7 @@
 
 /*  Constant                                       Value
     __cpp_lib_adaptor_iterator_pair_constructor    202106L [C++23]
-    __cpp_lib_ranges_to_container                  202202L [C++23]
+    __cpp_lib_containers_ranges                    202202L [C++23]
 */
 
 #include <stack>
@@ -29,8 +29,8 @@
 #   error "__cpp_lib_adaptor_iterator_pair_constructor should not be defined before c++23"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
 # endif
 
 #elif TEST_STD_VER == 14
@@ -39,8 +39,8 @@
 #   error "__cpp_lib_adaptor_iterator_pair_constructor should not be defined before c++23"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
 # endif
 
 #elif TEST_STD_VER == 17
@@ -49,8 +49,8 @@
 #   error "__cpp_lib_adaptor_iterator_pair_constructor should not be defined before c++23"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
 # endif
 
 #elif TEST_STD_VER == 20
@@ -59,8 +59,8 @@
 #   error "__cpp_lib_adaptor_iterator_pair_constructor should not be defined before c++23"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
 # endif
 
 #elif TEST_STD_VER == 23
@@ -72,11 +72,11 @@
 #   error "__cpp_lib_adaptor_iterator_pair_constructor should have the value 202106L in c++23"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++23"
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++23"
 # endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++23"
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++23"
 # endif
 
 #elif TEST_STD_VER > 23
@@ -88,11 +88,11 @@
 #   error "__cpp_lib_adaptor_iterator_pair_constructor should have the value 202106L in c++26"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++26"
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++26"
 # endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++26"
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++26"
 # endif
 
 #endif // TEST_STD_VER > 23
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp
index 8d944a194faf..16a9a0a28de6 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp
@@ -19,10 +19,10 @@
     __cpp_lib_allocator_traits_is_always_equal              201411L [C++17]
     __cpp_lib_char8_t                                       201907L [C++20]
     __cpp_lib_constexpr_string                              201907L [C++20]
+    __cpp_lib_containers_ranges                             202202L [C++23]
     __cpp_lib_default_template_type_for_algorithm_values    202403L [C++26]
     __cpp_lib_erase_if                                      202002L [C++20]
     __cpp_lib_nonmember_container_access                    201411L [C++17]
-    __cpp_lib_ranges_to_container                           202202L [C++23]
     __cpp_lib_starts_ends_with                              201711L [C++20]
     __cpp_lib_string_contains                               202011L [C++23]
     __cpp_lib_string_resize_and_overwrite                   202110L [C++23]
@@ -49,6 +49,10 @@
 #   error "__cpp_lib_constexpr_string should not be defined before c++20"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -61,10 +65,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 # ifdef __cpp_lib_starts_ends_with
 #   error "__cpp_lib_starts_ends_with should not be defined before c++20"
 # endif
@@ -103,6 +103,10 @@
 #   error "__cpp_lib_constexpr_string should not be defined before c++20"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -115,10 +119,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 # ifdef __cpp_lib_starts_ends_with
 #   error "__cpp_lib_starts_ends_with should not be defined before c++20"
 # endif
@@ -163,6 +163,10 @@
 #   error "__cpp_lib_constexpr_string should not be defined before c++20"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -178,10 +182,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 # ifdef __cpp_lib_starts_ends_with
 #   error "__cpp_lib_starts_ends_with should not be defined before c++20"
 # endif
@@ -241,6 +241,10 @@
 #   error "__cpp_lib_constexpr_string should have the value 201907L in c++20"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -259,10 +263,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++20"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 # ifndef __cpp_lib_starts_ends_with
 #   error "__cpp_lib_starts_ends_with should be defined in c++20"
 # endif
@@ -325,6 +325,13 @@
 #   error "__cpp_lib_constexpr_string should have the value 201907L in c++23"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++23"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -343,13 +350,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++23"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++23"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++23"
-# endif
-
 # ifndef __cpp_lib_starts_ends_with
 #   error "__cpp_lib_starts_ends_with should be defined in c++23"
 # endif
@@ -427,6 +427,13 @@
 #   error "__cpp_lib_constexpr_string should have the value 201907L in c++26"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++26"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++26"
+# endif
+
 # if !defined(_LIBCPP_VERSION)
 #   ifndef __cpp_lib_default_template_type_for_algorithm_values
 #     error "__cpp_lib_default_template_type_for_algorithm_values should be defined in c++26"
@@ -454,13 +461,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++26"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++26"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++26"
-# endif
-
 # ifndef __cpp_lib_starts_ends_with
 #   error "__cpp_lib_starts_ends_with should be defined in c++26"
 # endif
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_map.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_map.version.compile.pass.cpp
index 5f7f1805c509..83c12730a671 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_map.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_map.version.compile.pass.cpp
@@ -19,11 +19,11 @@
     __cpp_lib_allocator_traits_is_always_equal       201411L [C++17]
     __cpp_lib_associative_heterogeneous_erasure      202110L [C++23]
     __cpp_lib_associative_heterogeneous_insertion    202306L [C++26]
+    __cpp_lib_containers_ranges                      202202L [C++23]
     __cpp_lib_erase_if                               202002L [C++20]
     __cpp_lib_generic_unordered_lookup               201811L [C++20]
     __cpp_lib_node_extract                           201606L [C++17]
     __cpp_lib_nonmember_container_access             201411L [C++17]
-    __cpp_lib_ranges_to_container                    202202L [C++23]
     __cpp_lib_tuple_like                             202207L [C++23]
                                                      202311L [C++26]
     __cpp_lib_unordered_map_try_emplace              201411L [C++17]
@@ -46,6 +46,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should not be defined before c++20"
 # endif
@@ -62,10 +66,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 # ifdef __cpp_lib_tuple_like
 #   error "__cpp_lib_tuple_like should not be defined before c++23"
 # endif
@@ -88,6 +88,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should not be defined before c++20"
 # endif
@@ -104,10 +108,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 # ifdef __cpp_lib_tuple_like
 #   error "__cpp_lib_tuple_like should not be defined before c++23"
 # endif
@@ -133,6 +133,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should not be defined before c++20"
 # endif
@@ -155,10 +159,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 # ifdef __cpp_lib_tuple_like
 #   error "__cpp_lib_tuple_like should not be defined before c++23"
 # endif
@@ -187,6 +187,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifndef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should be defined in c++20"
 # endif
@@ -215,10 +219,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++20"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 # ifdef __cpp_lib_tuple_like
 #   error "__cpp_lib_tuple_like should not be defined before c++23"
 # endif
@@ -256,6 +256,13 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++23"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++23"
+# endif
+
 # ifndef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should be defined in c++23"
 # endif
@@ -284,13 +291,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++23"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++23"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++23"
-# endif
-
 # if !defined(_LIBCPP_VERSION)
 #   ifndef __cpp_lib_tuple_like
 #     error "__cpp_lib_tuple_like should be defined in c++23"
@@ -346,6 +346,13 @@
 #   endif
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++26"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++26"
+# endif
+
 # ifndef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should be defined in c++26"
 # endif
@@ -374,13 +381,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++26"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++26"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++26"
-# endif
-
 # if !defined(_LIBCPP_VERSION)
 #   ifndef __cpp_lib_tuple_like
 #     error "__cpp_lib_tuple_like should be defined in c++26"
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_set.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_set.version.compile.pass.cpp
index a0947e995a28..4da49a45698d 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_set.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_set.version.compile.pass.cpp
@@ -19,11 +19,11 @@
     __cpp_lib_allocator_traits_is_always_equal       201411L [C++17]
     __cpp_lib_associative_heterogeneous_erasure      202110L [C++23]
     __cpp_lib_associative_heterogeneous_insertion    202306L [C++26]
+    __cpp_lib_containers_ranges                      202202L [C++23]
     __cpp_lib_erase_if                               202002L [C++20]
     __cpp_lib_generic_unordered_lookup               201811L [C++20]
     __cpp_lib_node_extract                           201606L [C++17]
     __cpp_lib_nonmember_container_access             201411L [C++17]
-    __cpp_lib_ranges_to_container                    202202L [C++23]
 */
 
 #include <unordered_set>
@@ -43,6 +43,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should not be defined before c++20"
 # endif
@@ -59,10 +63,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 14
 
 # ifdef __cpp_lib_allocator_traits_is_always_equal
@@ -77,6 +77,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should not be defined before c++20"
 # endif
@@ -93,10 +97,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 17
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -114,6 +114,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should not be defined before c++20"
 # endif
@@ -136,10 +140,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 20
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -157,6 +157,10 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifndef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should be defined in c++20"
 # endif
@@ -185,10 +189,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++20"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 23
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -215,6 +215,13 @@
 #   error "__cpp_lib_associative_heterogeneous_insertion should not be defined before c++26"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++23"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++23"
+# endif
+
 # ifndef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should be defined in c++23"
 # endif
@@ -243,13 +250,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++23"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++23"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++23"
-# endif
-
 #elif TEST_STD_VER > 23
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -285,6 +285,13 @@
 #   endif
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++26"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++26"
+# endif
+
 # ifndef __cpp_lib_erase_if
 #   error "__cpp_lib_erase_if should be defined in c++26"
 # endif
@@ -313,12 +320,5 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++26"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++26"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++26"
-# endif
-
 #endif // TEST_STD_VER > 23
 
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/vector.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/vector.version.compile.pass.cpp
index 3d0a956e6c8e..0eef1e99221e 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/vector.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/vector.version.compile.pass.cpp
@@ -18,11 +18,11 @@
 /*  Constant                                                Value
     __cpp_lib_allocator_traits_is_always_equal              201411L [C++17]
     __cpp_lib_constexpr_vector                              201907L [C++20]
+    __cpp_lib_containers_ranges                             202202L [C++23]
     __cpp_lib_default_template_type_for_algorithm_values    202403L [C++26]
     __cpp_lib_erase_if                                      202002L [C++20]
     __cpp_lib_incomplete_container_elements                 201505L [C++17]
     __cpp_lib_nonmember_container_access                    201411L [C++17]
-    __cpp_lib_ranges_to_container                           202202L [C++23]
 */
 
 #include <vector>
@@ -38,6 +38,10 @@
 #   error "__cpp_lib_constexpr_vector should not be defined before c++20"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -54,10 +58,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 14
 
 # ifdef __cpp_lib_allocator_traits_is_always_equal
@@ -68,6 +68,10 @@
 #   error "__cpp_lib_constexpr_vector should not be defined before c++20"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -84,10 +88,6 @@
 #   error "__cpp_lib_nonmember_container_access should not be defined before c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 17
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -101,6 +101,10 @@
 #   error "__cpp_lib_constexpr_vector should not be defined before c++20"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -123,10 +127,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++17"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 20
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -143,6 +143,10 @@
 #   error "__cpp_lib_constexpr_vector should have the value 201907L in c++20"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -168,10 +172,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++20"
 # endif
 
-# ifdef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should not be defined before c++23"
-# endif
-
 #elif TEST_STD_VER == 23
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -188,6 +188,13 @@
 #   error "__cpp_lib_constexpr_vector should have the value 201907L in c++23"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++23"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++23"
+# endif
+
 # ifdef __cpp_lib_default_template_type_for_algorithm_values
 #   error "__cpp_lib_default_template_type_for_algorithm_values should not be defined before c++26"
 # endif
@@ -213,13 +220,6 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++23"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++23"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++23"
-# endif
-
 #elif TEST_STD_VER > 23
 
 # ifndef __cpp_lib_allocator_traits_is_always_equal
@@ -236,6 +236,13 @@
 #   error "__cpp_lib_constexpr_vector should have the value 201907L in c++26"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++26"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++26"
+# endif
+
 # if !defined(_LIBCPP_VERSION)
 #   ifndef __cpp_lib_default_template_type_for_algorithm_values
 #     error "__cpp_lib_default_template_type_for_algorithm_values should be defined in c++26"
@@ -270,12 +277,5 @@
 #   error "__cpp_lib_nonmember_container_access should have the value 201411L in c++26"
 # endif
 
-# ifndef __cpp_lib_ranges_to_container
-#   error "__cpp_lib_ranges_to_container should be defined in c++26"
-# endif
-# if __cpp_lib_ranges_to_container != 202202L
-#   error "__cpp_lib_ranges_to_container should have the value 202202L in c++26"
-# endif
-
 #endif // TEST_STD_VER > 23
 
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index 3ec548f56cea..d7035d7e5e3a 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -73,6 +73,7 @@
     __cpp_lib_constexpr_utility                             201811L [C++20]
     __cpp_lib_constexpr_vector                              201907L [C++20]
     __cpp_lib_constrained_equality                          202403L [C++26]
+    __cpp_lib_containers_ranges                             202202L [C++23]
     __cpp_lib_copyable_function                             202306L [C++26]
     __cpp_lib_coroutine                                     201902L [C++20]
     __cpp_lib_debugging                                     202311L [C++26]
@@ -455,6 +456,10 @@
 #   error "__cpp_lib_constrained_equality should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_copyable_function
 #   error "__cpp_lib_copyable_function should not be defined before c++26"
 # endif
@@ -1283,6 +1288,10 @@
 #   error "__cpp_lib_constrained_equality should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_copyable_function
 #   error "__cpp_lib_copyable_function should not be defined before c++26"
 # endif
@@ -2213,6 +2222,10 @@
 #   error "__cpp_lib_constrained_equality should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_copyable_function
 #   error "__cpp_lib_copyable_function should not be defined before c++26"
 # endif
@@ -3422,6 +3435,10 @@
 #   error "__cpp_lib_constrained_equality should not be defined before c++26"
 # endif
 
+# ifdef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should not be defined before c++23"
+# endif
+
 # ifdef __cpp_lib_copyable_function
 #   error "__cpp_lib_copyable_function should not be defined before c++26"
 # endif
@@ -4841,6 +4858,13 @@
 #   error "__cpp_lib_constrained_equality should not be defined before c++26"
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++23"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++23"
+# endif
+
 # ifdef __cpp_lib_copyable_function
 #   error "__cpp_lib_copyable_function should not be defined before c++26"
 # endif
@@ -6482,6 +6506,13 @@
 #   endif
 # endif
 
+# ifndef __cpp_lib_containers_ranges
+#   error "__cpp_lib_containers_ranges should be defined in c++26"
+# endif
+# if __cpp_lib_containers_ranges != 202202L
+#   error "__cpp_lib_containers_ranges should have the value 202202L in c++26"
+# endif
+
 # if !defined(_LIBCPP_VERSION)
 #   ifndef __cpp_lib_copyable_function
 #     error "__cpp_lib_copyable_function should be defined in c++26"
@@ -7433,17 +7464,11 @@
 #   endif
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_reference_wrapper
-#     error "__cpp_lib_reference_wrapper should be defined in c++26"
-#   endif
-#   if __cpp_lib_reference_wrapper != 202403L
-#     error "__cpp_lib_reference_wrapper should have the value 202403L in c++26"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_reference_wrapper
-#     error "__cpp_lib_reference_wrapper should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_reference_wrapper
+#   error "__cpp_lib_reference_wrapper should be defined in c++26"
+# endif
+# if __cpp_lib_reference_wrapper != 202403L
+#   error "__cpp_lib_reference_wrapper should have the value 202403L in c++26"
 # endif
 
 # ifndef __cpp_lib_remove_cvref
diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/widen_1.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/widen_1.pass.cpp
index bafdfcea0460..959a4be9e1de 100644
--- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/widen_1.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/widen_1.pass.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 // REQUIRES: locale.en_US.UTF-8
-// XFAIL: LIBCXX-AIX-FIXME
 // XFAIL: no-wide-characters
 
 // <locale>
@@ -57,7 +56,7 @@ int main(int, char**)
             assert(f.widen('.') == L'.');
             assert(f.widen('a') == L'a');
             assert(f.widen('1') == L'1');
-#if defined(__APPLE__) || defined(__FreeBSD__) || defined(_WIN32)
+#if defined(__APPLE__) || defined(__FreeBSD__) || defined(_WIN32) || defined(_AIX)
             assert(f.widen(char(-5)) == L'\u00fb');
 #else
             assert(f.widen(char(-5)) == wchar_t(-1));
diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/widen_many.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/widen_many.pass.cpp
index 552eab1f2ab4..078b4a6fefb7 100644
--- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/widen_many.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/widen_many.pass.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 // REQUIRES: locale.en_US.UTF-8
-// XFAIL: LIBCXX-AIX-FIXME
 // XFAIL: no-wide-characters
 
 // <locale>
@@ -63,7 +62,7 @@ int main(int, char**)
             assert(v[3] == L'.');
             assert(v[4] == L'a');
             assert(v[5] == L'1');
-#if defined(__APPLE__) || defined(__FreeBSD__) || defined(_WIN32)
+#if defined(__APPLE__) || defined(__FreeBSD__) || defined(_WIN32) || defined(_AIX)
             assert(v[6] == L'\xfb');
 #else
             assert(v[6] == wchar_t(-1));
diff --git a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp
index 2f2bc2fd8832..86c447d400aa 100644
--- a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp
@@ -9,7 +9,6 @@
 // NetBSD does not support LC_NUMERIC at the moment
 // XFAIL: netbsd
 
-// XFAIL: LIBCXX-AIX-FIXME
 // XFAIL: LIBCXX-FREEBSD-FIXME
 
 // REQUIRES: locale.en_US.UTF-8
@@ -49,7 +48,7 @@ int main(int, char**)
         {
             typedef char C;
             const std::numpunct<C>& np = std::use_facet<std::numpunct<C> >(l);
-#ifdef _WIN32
+#if defined(_WIN32) || defined(_AIX)
             assert(np.grouping() == "\3");
 #else
             assert(np.grouping() == "\3\3");
@@ -59,17 +58,17 @@ int main(int, char**)
         {
             typedef wchar_t C;
             const std::numpunct<C>& np = std::use_facet<std::numpunct<C> >(l);
-#ifdef _WIN32
+#  if defined(_WIN32) || defined(_AIX)
             assert(np.grouping() == "\3");
-#else
+#  else
             assert(np.grouping() == "\3\3");
-#endif
+#  endif
         }
 #endif
     }
     {
         std::locale l(LOCALE_fr_FR_UTF_8);
-#if defined(TEST_HAS_GLIBC) || defined(_WIN32)
+#if defined(TEST_HAS_GLIBC) || defined(_WIN32) || defined(_AIX)
         const char* const group = "\3";
 #else
         const char* const group = "\x7f";
diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
index 831c226f9c8e..212804356a05 100644
--- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
+++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
@@ -17,6 +17,7 @@
 #include <cassert>
 #include <climits>
 #include <cstdint>
+#include <random>
 #include <type_traits>
 
 #include "test_macros.h"
@@ -48,6 +49,74 @@ constexpr bool test0(int in1, int in2, int out)
     return true;
 }
 
+template <typename T>
+T basic_gcd_(T m, T n) {
+  return n == 0 ? m : basic_gcd_<T>(n, m % n);
+}
+
+template <typename T>
+T basic_gcd(T m, T n) {
+  using Tp = std::make_unsigned_t<T>;
+  if (m < 0 && m != std::numeric_limits<T>::min())
+    m = -m;
+  if (n < 0 && n != std::numeric_limits<T>::min())
+    n = -n;
+  return basic_gcd_(static_cast<Tp>(m), static_cast<Tp>(n));
+}
+
+template <typename Input>
+void do_fuzzy_tests() {
+  std::mt19937 gen(1938);
+  std::uniform_int_distribution<Input> distrib;
+
+  constexpr int nb_rounds = 10000;
+  for (int i = 0; i < nb_rounds; ++i) {
+    Input n = distrib(gen);
+    Input m = distrib(gen);
+    assert(std::gcd(n, m) == basic_gcd(n, m));
+  }
+}
+
+template <typename Input>
+void do_limit_tests() {
+  Input inputs[] = {
+      // The behavior of std::gcd is undefined if the absolute value of one of its
+      // operand is not representable in the result type.
+      std::numeric_limits<Input>::min() + (std::is_signed<Input>::value ? 3 : 0),
+      std::numeric_limits<Input>::min() + 1,
+      std::numeric_limits<Input>::min() + 2,
+      std::numeric_limits<Input>::max(),
+      std::numeric_limits<Input>::max() - 1,
+      std::numeric_limits<Input>::max() - 2,
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      (Input)-1,
+      (Input)-2,
+      (Input)-3,
+      (Input)-4,
+      (Input)-5,
+      (Input)-6,
+      (Input)-7,
+      (Input)-8,
+      (Input)-9,
+      (Input)-10,
+  };
+
+  for (auto n : inputs) {
+    for (auto m : inputs) {
+      assert(std::gcd(n, m) == basic_gcd(n, m));
+    }
+  }
+}
 
 template <typename Input1, typename Input2 = Input1>
 constexpr bool do_test(int = 0)
@@ -143,5 +212,23 @@ int main(int argc, char**)
     assert(res == 2);
     }
 
-  return 0;
+    do_fuzzy_tests<std::int8_t>();
+    do_fuzzy_tests<std::int16_t>();
+    do_fuzzy_tests<std::int32_t>();
+    do_fuzzy_tests<std::int64_t>();
+    do_fuzzy_tests<std::uint8_t>();
+    do_fuzzy_tests<std::uint16_t>();
+    do_fuzzy_tests<std::uint32_t>();
+    do_fuzzy_tests<std::uint64_t>();
+
+    do_limit_tests<std::int8_t>();
+    do_limit_tests<std::int16_t>();
+    do_limit_tests<std::int32_t>();
+    do_limit_tests<std::int64_t>();
+    do_limit_tests<std::uint8_t>();
+    do_limit_tests<std::uint16_t>();
+    do_limit_tests<std::uint32_t>();
+    do_limit_tests<std::uint64_t>();
+
+    return 0;
 }
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bernoulli/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bernoulli/eval.pass.cpp
index 9a7af92931dd..568bf34f1ea4 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bernoulli/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bernoulli/eval.pass.cpp
@@ -63,8 +63,8 @@ int main(int, char**)
         double x_kurtosis = (6 * sqr(d.p()) - 6 * d.p() + 1)/x_var;
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.02);
+        assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.05);
     }
     {
         typedef std::bernoulli_distribution D;
@@ -99,8 +99,8 @@ int main(int, char**)
         double x_kurtosis = (6 * sqr(d.p()) - 6 * d.p() + 1)/x_var;
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.02);
+        assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.05);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bernoulli/eval_param.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bernoulli/eval_param.pass.cpp
index 5584a9d16448..dfaa9f1c89f9 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bernoulli/eval_param.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bernoulli/eval_param.pass.cpp
@@ -65,8 +65,8 @@ int main(int, char**)
         double x_kurtosis = (6 * sqr(p.p()) - 6 * p.p() + 1)/x_var;
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.02);
+        assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.05);
     }
     {
         typedef std::bernoulli_distribution D;
@@ -103,8 +103,8 @@ int main(int, char**)
         double x_kurtosis = (6 * sqr(p.p()) - 6 * p.p() + 1)/x_var;
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.02);
+        assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.05);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.PR44847.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.PR44847.pass.cpp
index 572a59b7a192..549f3cee4d4a 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.PR44847.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.PR44847.pass.cpp
@@ -163,7 +163,7 @@ int main(int, char**) {
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.04);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.06);
 
     return 0;
 }
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.pass.cpp
index dbdd09724119..d8852cc3bd09 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.pass.cpp
@@ -68,7 +68,7 @@ void test1() {
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.04);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.08);
 }
 
 template <class T>
@@ -109,8 +109,8 @@ void test2() {
     double x_kurtosis = (1-6*d.p()*(1-d.p())) / x_var;
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
-    assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+    assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.08);
 }
 
 template <class T>
@@ -151,8 +151,8 @@ void test3() {
     double x_kurtosis = (1-6*d.p()*(1-d.p())) / x_var;
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
-    assert(std::abs((skew - x_skew) / x_skew) < 0.03);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.3);
+    assert(std::abs((skew - x_skew) / x_skew) < 0.07);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 2.0);
 }
 
 template <class T>
@@ -292,7 +292,7 @@ void test6() {
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs(skew - x_skew) < 0.02);
-    assert(std::abs(kurtosis - x_kurtosis) < 0.01);
+    assert(std::abs(kurtosis - x_kurtosis) < 0.03);
 }
 
 template <class T>
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval_param.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval_param.pass.cpp
index 78d6aedde73a..adbcb78d10f4 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval_param.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval_param.pass.cpp
@@ -72,7 +72,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.04);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.08);
     }
     {
         typedef std::binomial_distribution<> D;
@@ -113,8 +113,8 @@ int main(int, char**)
         double x_kurtosis = (1-6*p.p()*(1-p.p())) / x_var;
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.08);
     }
     {
         typedef std::binomial_distribution<> D;
@@ -155,8 +155,8 @@ int main(int, char**)
         double x_kurtosis = (1-6*p.p()*(1-p.p())) / x_var;
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs((skew - x_skew) / x_skew) < 0.04);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.3);
+        assert(std::abs((skew - x_skew) / x_skew) < 0.07);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 2.0);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.geo/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.geo/eval.pass.cpp
index 440334ed3488..0cdb7fa6312c 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.geo/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.geo/eval.pass.cpp
@@ -77,7 +77,7 @@ void test1() {
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
 }
 
 template <class T>
@@ -161,7 +161,7 @@ void test3() {
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.02);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
 }
 
 template <class T>
@@ -203,7 +203,7 @@ void test4() {
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.02);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
 }
 
 template <class T>
@@ -245,7 +245,7 @@ void test5() {
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.02);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
 }
 
 template <class T>
@@ -284,8 +284,8 @@ void test6() {
     double x_var = x_mean / d.p();
     double x_skew = (2 - d.p()) / std::sqrt((1 - d.p()));
     double x_kurtosis = 6 + sqr(d.p()) / (1 - d.p());
-    assert(std::abs((mean - x_mean) / x_mean) < 0.01);
-    assert(std::abs((var - x_var) / x_var) < 0.01);
+    assert(std::abs((mean - x_mean) / x_mean) < 0.02);
+    assert(std::abs((var - x_var) / x_var) < 0.02);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
     assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.02);
 }
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.geo/eval_param.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.geo/eval_param.pass.cpp
index 16cb7fb0a45f..16a5bd4c7a33 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.geo/eval_param.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.geo/eval_param.pass.cpp
@@ -72,7 +72,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
     {
         typedef std::geometric_distribution<> D;
@@ -156,7 +156,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.02);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.negbin/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.negbin/eval.pass.cpp
index d0f6fbf0a120..be08361fc27e 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.negbin/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.negbin/eval.pass.cpp
@@ -74,7 +74,7 @@ void test1() {
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.02);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
 }
 
 template <class T>
@@ -115,8 +115,8 @@ void test2() {
     double x_kurtosis = 6. / d.k() + sqr(d.p()) / (d.k() * (1 - d.p()));
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
-    assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+    assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.1);
 }
 
 template <class T>
@@ -157,8 +157,8 @@ void test3() {
     double x_kurtosis = 6. / d.k() + sqr(d.p()) / (d.k() * (1 - d.p()));
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
-    assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
+    assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.08);
 }
 
 template <class T>
@@ -243,8 +243,8 @@ void test5() {
     double x_kurtosis = 6. / d.k() + sqr(d.p()) / (d.k() * (1 - d.p()));
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
-    assert(std::abs((skew - x_skew) / x_skew) < 0.04);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.05);
+    assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.3);
 }
 
 template <class T>
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.negbin/eval_param.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.negbin/eval_param.pass.cpp
index 0b03982a737e..26bc83382f68 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.negbin/eval_param.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.negbin/eval_param.pass.cpp
@@ -72,7 +72,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
     {
         typedef std::negative_binomial_distribution<> D;
@@ -113,8 +113,8 @@ int main(int, char**)
         double x_kurtosis = 6. / p.k() + sqr(p.p()) / (p.k() * (1 - p.p()));
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.1);
     }
     {
         typedef std::negative_binomial_distribution<> D;
@@ -155,8 +155,8 @@ int main(int, char**)
         double x_kurtosis = 6. / p.k() + sqr(p.p()) / (p.k() * (1 - p.p()));
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
+        assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.08);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.cauchy/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.cauchy/eval.pass.cpp
index abc0cc531a11..83e64046f0dc 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.cauchy/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.cauchy/eval.pass.cpp
@@ -45,7 +45,7 @@ int main(int, char**)
             u.push_back(d(g));
         std::sort(u.begin(), u.end());
         for (int i = 0; i < N; ++i)
-            assert(std::abs(f(u[i], a, b) - double(i)/N) < .001);
+          assert(std::abs(f(u[i], a, b) - double(i) / N) < .0013);
     }
     {
         typedef std::cauchy_distribution<> D;
@@ -60,7 +60,7 @@ int main(int, char**)
             u.push_back(d(g));
         std::sort(u.begin(), u.end());
         for (int i = 0; i < N; ++i)
-            assert(std::abs(f(u[i], a, b) - double(i)/N) < .001);
+          assert(std::abs(f(u[i], a, b) - double(i) / N) < .0013);
     }
     {
         typedef std::cauchy_distribution<> D;
@@ -75,7 +75,7 @@ int main(int, char**)
             u.push_back(d(g));
         std::sort(u.begin(), u.end());
         for (int i = 0; i < N; ++i)
-            assert(std::abs(f(u[i], a, b) - double(i)/N) < .001);
+          assert(std::abs(f(u[i], a, b) - double(i) / N) < .0013);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.chisq/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.chisq/eval.pass.cpp
index 2a8dfd31aa08..559034b2a0ec 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.chisq/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.chisq/eval.pass.cpp
@@ -70,7 +70,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.04);
     }
     {
         typedef std::chi_squared_distribution<> D;
@@ -109,7 +109,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
     {
         typedef std::chi_squared_distribution<> D;
@@ -148,7 +148,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.chisq/eval_param.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.chisq/eval_param.pass.cpp
index 52864739c9b3..74454f296267 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.chisq/eval_param.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.chisq/eval_param.pass.cpp
@@ -72,7 +72,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
     {
         typedef std::chi_squared_distribution<> D;
@@ -113,7 +113,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
     {
         typedef std::chi_squared_distribution<> D;
@@ -154,7 +154,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.04);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.lognormal/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.lognormal/eval.pass.cpp
index 9f25cea6540e..745103d13d28 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.lognormal/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.lognormal/eval.pass.cpp
@@ -70,8 +70,8 @@ test1()
                       3*std::exp(2*sqr(d.s())) - 6;
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
-    assert(std::abs((skew - x_skew) / x_skew) < 0.05);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.25);
+    assert(std::abs((skew - x_skew) / x_skew) < 0.1);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 1.9);
 }
 
 void
@@ -115,7 +115,7 @@ test2()
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.04);
 }
 
 void
@@ -159,7 +159,7 @@ test3()
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.02);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.05);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.06);
 }
 
 void
@@ -202,8 +202,8 @@ test4()
                       3*std::exp(2*sqr(d.s())) - 6;
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.02);
-    assert(std::abs((skew - x_skew) / x_skew) < 0.08);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.4);
+    assert(std::abs((skew - x_skew) / x_skew) < 0.1);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.5);
 }
 
 void
@@ -245,9 +245,9 @@ test5()
     double x_kurtosis = std::exp(4*sqr(d.s())) + 2*std::exp(3*sqr(d.s())) +
                       3*std::exp(2*sqr(d.s())) - 6;
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
-    assert(std::abs((var - x_var) / x_var) < 0.04);
-    assert(std::abs((skew - x_skew) / x_skew) < 0.2);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.7);
+    assert(std::abs((var - x_var) / x_var) < 0.05);
+    assert(std::abs((skew - x_skew) / x_skew) < 0.3);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 1.0);
 }
 
 int main(int, char**)
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.lognormal/eval_param.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.lognormal/eval_param.pass.cpp
index 9c9eb858e5bf..f9ea0bbd201c 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.lognormal/eval_param.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.lognormal/eval_param.pass.cpp
@@ -72,8 +72,8 @@ test1()
                       3*std::exp(2*sqr(p.s())) - 6;
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
-    assert(std::abs((skew - x_skew) / x_skew) < 0.05);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.25);
+    assert(std::abs((skew - x_skew) / x_skew) < 0.1);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 1.9);
 }
 
 void
@@ -119,7 +119,7 @@ test2()
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.04);
 }
 
 void
@@ -165,7 +165,7 @@ test3()
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.02);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.05);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.06);
 }
 
 void
@@ -210,8 +210,8 @@ test4()
                       3*std::exp(2*sqr(p.s())) - 6;
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.02);
-    assert(std::abs((skew - x_skew) / x_skew) < 0.08);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.4);
+    assert(std::abs((skew - x_skew) / x_skew) < 0.1);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.5);
 }
 
 void
@@ -255,9 +255,9 @@ test5()
     double x_kurtosis = std::exp(4*sqr(p.s())) + 2*std::exp(3*sqr(p.s())) +
                       3*std::exp(2*sqr(p.s())) - 6;
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
-    assert(std::abs((var - x_var) / x_var) < 0.04);
-    assert(std::abs((skew - x_skew) / x_skew) < 0.2);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.7);
+    assert(std::abs((var - x_var) / x_var) < 0.05);
+    assert(std::abs((skew - x_skew) / x_skew) < 0.3);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 1.0);
 }
 
 int main(int, char**)
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.t/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.t/eval.pass.cpp
index a73e06bbf03e..80ed7bab2650 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.t/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.t/eval.pass.cpp
@@ -64,7 +64,7 @@ int main(int, char**)
         double x_kurtosis = 6 / (d.n() - 4);
         assert(std::abs(mean - x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs(skew - x_skew) < 0.01);
+        assert(std::abs(skew - x_skew) < 0.05);
         assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.2);
     }
     {
@@ -99,7 +99,7 @@ int main(int, char**)
         double x_kurtosis = 6 / (d.n() - 4);
         assert(std::abs(mean - x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs(skew - x_skew) < 0.01);
+        assert(std::abs(skew - x_skew) < 0.05);
         assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.04);
     }
     {
@@ -134,8 +134,8 @@ int main(int, char**)
         double x_kurtosis = 6 / (d.n() - 4);
         assert(std::abs(mean - x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs(skew - x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.02);
+        assert(std::abs(skew - x_skew) < 0.005);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.2);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.t/eval_param.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.t/eval_param.pass.cpp
index 01ce61a680af..fe3140fb6f55 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.t/eval_param.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.norm/rand.dist.norm.t/eval_param.pass.cpp
@@ -66,7 +66,7 @@ int main(int, char**)
         double x_kurtosis = 6 / (p.n() - 4);
         assert(std::abs(mean - x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs(skew - x_skew) < 0.01);
+        assert(std::abs(skew - x_skew) < 0.05);
         assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.2);
     }
     {
@@ -103,7 +103,7 @@ int main(int, char**)
         double x_kurtosis = 6 / (p.n() - 4);
         assert(std::abs(mean - x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs(skew - x_skew) < 0.01);
+        assert(std::abs(skew - x_skew) < 0.05);
         assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.04);
     }
     {
@@ -140,8 +140,8 @@ int main(int, char**)
         double x_kurtosis = 6 / (p.n() - 4);
         assert(std::abs(mean - x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs(skew - x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.02);
+        assert(std::abs(skew - x_skew) < 0.005);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.2);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.exp/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.exp/eval.pass.cpp
index 8bceb918ecd5..aee573e5f246 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.exp/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.exp/eval.pass.cpp
@@ -70,7 +70,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
     {
         typedef std::exponential_distribution<> D;
@@ -109,7 +109,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
     {
         typedef std::exponential_distribution<> D;
@@ -148,7 +148,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.exp/eval_param.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.exp/eval_param.pass.cpp
index 016ba27587cf..dfbedd81e0b9 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.exp/eval_param.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.exp/eval_param.pass.cpp
@@ -72,7 +72,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.extreme/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.extreme/eval.pass.cpp
index 9aaf6ce0ab3a..3cf0feef18ec 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.extreme/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.extreme/eval.pass.cpp
@@ -68,7 +68,7 @@ test1()
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
 }
 
 void
@@ -109,7 +109,7 @@ test2()
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
 }
 
 void
@@ -150,7 +150,7 @@ test3()
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
 }
 
 void
@@ -191,7 +191,7 @@ test4()
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
 }
 
 int main(int, char**)
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.extreme/eval_param.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.extreme/eval_param.pass.cpp
index cb390c0452ad..43a22511589b 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.extreme/eval_param.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.extreme/eval_param.pass.cpp
@@ -70,7 +70,7 @@ test1()
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
 }
 
 void
@@ -113,7 +113,7 @@ test2()
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
 }
 
 void
@@ -156,7 +156,7 @@ test3()
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
 }
 
 void
@@ -199,7 +199,7 @@ test4()
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
     assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
 }
 
 int main(int, char**)
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.gamma/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.gamma/eval.pass.cpp
index 7af50cdb8b1a..48cc4df02396 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.gamma/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.gamma/eval.pass.cpp
@@ -69,7 +69,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.04);
     }
     {
         typedef std::gamma_distribution<> D;
@@ -108,7 +108,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
     {
         typedef std::gamma_distribution<> D;
@@ -147,7 +147,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.gamma/eval_param.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.gamma/eval_param.pass.cpp
index a782770153ac..c437983c6bd7 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.gamma/eval_param.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.gamma/eval_param.pass.cpp
@@ -71,7 +71,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
     {
         typedef std::gamma_distribution<> D;
@@ -112,7 +112,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
     }
     {
         typedef std::gamma_distribution<> D;
@@ -153,7 +153,7 @@ int main(int, char**)
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
         assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.04);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.poisson/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.poisson/eval.pass.cpp
index 5feb2e580229..564f2e99ccea 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.poisson/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.poisson/eval.pass.cpp
@@ -129,8 +129,8 @@ void tests() {
     double x_kurtosis = 1 / x_var;
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
-    assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
+    assert(std::abs((skew - x_skew) / x_skew) < 0.03);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.2);
   }
   {
     typedef std::poisson_distribution<T> D;
@@ -167,9 +167,9 @@ void tests() {
     double x_skew = 1 / std::sqrt(x_var);
     double x_kurtosis = 1 / x_var;
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
-    assert(std::abs((var - x_var) / x_var) < 0.01);
-    assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.04);
+    assert(std::abs((var - x_var) / x_var) < 0.02);
+    assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.09);
   }
   {
     typedef std::poisson_distribution<T> D;
@@ -207,8 +207,8 @@ void tests() {
     double x_kurtosis = 1 / x_var;
     assert(std::abs((mean - x_mean) / x_mean) < 0.01);
     assert(std::abs((var - x_var) / x_var) < 0.01);
-    assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+    assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+    assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.3);
   }
 }
 
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.poisson/eval_param.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.poisson/eval_param.pass.cpp
index 8d60be4e656e..e5cabc086581 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.poisson/eval_param.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.pois/rand.dist.pois.poisson/eval_param.pass.cpp
@@ -70,8 +70,8 @@ int main(int, char**)
         double x_kurtosis = 1 / x_var;
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.03);
+        assert(std::abs((skew - x_skew) / x_skew) < 0.03);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.2);
     }
     {
         typedef std::poisson_distribution<> D;
@@ -110,9 +110,9 @@ int main(int, char**)
         double x_skew = 1 / std::sqrt(x_var);
         double x_kurtosis = 1 / x_var;
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
-        assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.04);
+        assert(std::abs((var - x_var) / x_var) < 0.02);
+        assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.09);
     }
     {
         typedef std::poisson_distribution<> D;
@@ -152,8 +152,8 @@ int main(int, char**)
         double x_kurtosis = 1 / x_var;
         assert(std::abs((mean - x_mean) / x_mean) < 0.01);
         assert(std::abs((var - x_var) / x_var) < 0.01);
-        assert(std::abs((skew - x_skew) / x_skew) < 0.01);
-        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
+        assert(std::abs((skew - x_skew) / x_skew) < 0.02);
+        assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.3);
     }
 
   return 0;
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/eval.pass.cpp
index da089b02a964..9c6365e7e297 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/eval.pass.cpp
@@ -81,7 +81,7 @@ void tests() {
         }
         std::vector<double> prob = d.probabilities();
         for (unsigned i = 0; i < u.size(); ++i)
-            assert(std::abs((double)u[i]/N - prob[i]) / prob[i] < 0.001);
+          assert(std::abs((double)u[i] / N - prob[i]) / prob[i] < 0.0013);
     }
     {
         typedef std::discrete_distribution<T> D;
@@ -158,7 +158,7 @@ void tests() {
         std::vector<double> prob = d.probabilities();
         for (unsigned i = 0; i < u.size(); ++i)
             if (prob[i] != 0)
-                assert(std::abs((double)u[i]/N - prob[i]) / prob[i] < 0.001);
+              assert(std::abs((double)u[i] / N - prob[i]) / prob[i] < 0.0013);
             else
                 assert(u[i] == 0);
     }
@@ -202,7 +202,7 @@ void tests() {
         std::vector<double> prob = d.probabilities();
         for (unsigned i = 0; i < u.size(); ++i)
             if (prob[i] != 0)
-                assert(std::abs((double)u[i]/N - prob[i]) / prob[i] < 0.001);
+              assert(std::abs((double)u[i] / N - prob[i]) / prob[i] < 0.0013);
             else
                 assert(u[i] == 0);
     }
@@ -290,7 +290,7 @@ void tests() {
         std::vector<double> prob = d.probabilities();
         for (unsigned i = 0; i < u.size(); ++i)
             if (prob[i] != 0)
-                assert(std::abs((double)u[i]/N - prob[i]) / prob[i] < 0.001);
+              assert(std::abs((double)u[i] / N - prob[i]) / prob[i] < 0.0013);
             else
                 assert(u[i] == 0);
     }
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/eval.pass.cpp
index e11b235b7646..8ab560b15bb1 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/eval.pass.cpp
@@ -529,8 +529,8 @@ test8()
             double x_skew = 0;
             double x_kurtosis = -6./5;
             assert(std::abs((mean - x_mean) / x_mean) < 0.01);
-            assert(std::abs((var - x_var) / x_var) < 0.01);
-            assert(std::abs(skew - x_skew) < 0.01);
+            assert(std::abs((var - x_var) / x_var) < 0.02);
+            assert(std::abs(skew - x_skew) < 0.02);
             assert(std::abs((kurtosis - x_kurtosis) / x_kurtosis) < 0.01);
         }
     }
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval.pass.cpp
index e551c8c2bb38..8aad0b8e4a85 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval.pass.cpp
@@ -58,7 +58,7 @@ test1()
         u.push_back(v);
     }
     std::sort(u.begin(), u.end());
-    int kp = -1;
+    std::ptrdiff_t kp = -1;
     double a = std::numeric_limits<double>::quiet_NaN();
     double m = std::numeric_limits<double>::quiet_NaN();
     double bk = std::numeric_limits<double>::quiet_NaN();
@@ -76,18 +76,17 @@ test1()
         p[i] /= S;
     for (std::size_t i = 0; i < N; ++i)
     {
-        int k = std::lower_bound(b, b+Np+1, u[i]) - b - 1;
-        if (k != kp)
-        {
-            a = 0;
-            for (int j = 0; j < k; ++j)
-                a += areas[j];
-            m = (p[k+1] - p[k]) / (b[k+1] - b[k]);
-            bk = b[k];
-            c = (b[k+1]*p[k] - b[k]*p[k+1]) / (b[k+1] - b[k]);
-            kp = k;
+      std::ptrdiff_t k = std::lower_bound(b, b + Np + 1, u[i]) - b - 1;
+      if (k != kp) {
+        a = 0;
+        for (int j = 0; j < k; ++j)
+          a += areas[j];
+        m  = (p[k + 1] - p[k]) / (b[k + 1] - b[k]);
+        bk = b[k];
+        c  = (b[k + 1] * p[k] - b[k] * p[k + 1]) / (b[k + 1] - b[k]);
+        kp = k;
         }
-        assert(std::abs(f(u[i], a, m, bk, c) - double(i)/N) < .001);
+      assert(std::abs(f(u[i], a, m, bk, c) - double(i) / N) < .0013);
     }
 }
 
@@ -110,7 +109,7 @@ test2()
         u.push_back(v);
     }
     std::sort(u.begin(), u.end());
-    int kp = -1;
+    std::ptrdiff_t kp = -1;
     double a = std::numeric_limits<double>::quiet_NaN();
     double m = std::numeric_limits<double>::quiet_NaN();
     double bk = std::numeric_limits<double>::quiet_NaN();
@@ -128,18 +127,17 @@ test2()
         p[i] /= S;
     for (std::size_t i = 0; i < N; ++i)
     {
-        int k = std::lower_bound(b, b+Np+1, u[i]) - b - 1;
-        if (k != kp)
-        {
-            a = 0;
-            for (int j = 0; j < k; ++j)
-                a += areas[j];
-            m = (p[k+1] - p[k]) / (b[k+1] - b[k]);
-            bk = b[k];
-            c = (b[k+1]*p[k] - b[k]*p[k+1]) / (b[k+1] - b[k]);
-            kp = k;
+      std::ptrdiff_t k = std::lower_bound(b, b + Np + 1, u[i]) - b - 1;
+      if (k != kp) {
+        a = 0;
+        for (int j = 0; j < k; ++j)
+          a += areas[j];
+        m  = (p[k + 1] - p[k]) / (b[k + 1] - b[k]);
+        bk = b[k];
+        c  = (b[k + 1] * p[k] - b[k] * p[k + 1]) / (b[k + 1] - b[k]);
+        kp = k;
         }
-        assert(std::abs(f(u[i], a, m, bk, c) - double(i)/N) < .001);
+      assert(std::abs(f(u[i], a, m, bk, c) - double(i) / N) < .0013);
     }
 }
 
@@ -162,7 +160,7 @@ test3()
         u.push_back(v);
     }
     std::sort(u.begin(), u.end());
-    int kp = -1;
+    std::ptrdiff_t kp = -1;
     double a = std::numeric_limits<double>::quiet_NaN();
     double m = std::numeric_limits<double>::quiet_NaN();
     double bk = std::numeric_limits<double>::quiet_NaN();
@@ -180,18 +178,17 @@ test3()
         p[i] /= S;
     for (std::size_t i = 0; i < N; ++i)
     {
-        int k = std::lower_bound(b, b+Np+1, u[i]) - b - 1;
-        if (k != kp)
-        {
-            a = 0;
-            for (int j = 0; j < k; ++j)
-                a += areas[j];
-            m = (p[k+1] - p[k]) / (b[k+1] - b[k]);
-            bk = b[k];
-            c = (b[k+1]*p[k] - b[k]*p[k+1]) / (b[k+1] - b[k]);
-            kp = k;
+      std::ptrdiff_t k = std::lower_bound(b, b + Np + 1, u[i]) - b - 1;
+      if (k != kp) {
+        a = 0;
+        for (int j = 0; j < k; ++j)
+          a += areas[j];
+        m  = (p[k + 1] - p[k]) / (b[k + 1] - b[k]);
+        bk = b[k];
+        c  = (b[k + 1] * p[k] - b[k] * p[k + 1]) / (b[k + 1] - b[k]);
+        kp = k;
         }
-        assert(std::abs(f(u[i], a, m, bk, c) - double(i)/N) < .001);
+      assert(std::abs(f(u[i], a, m, bk, c) - double(i) / N) < .0013);
     }
 }
 
@@ -214,7 +211,7 @@ test4()
         u.push_back(v);
     }
     std::sort(u.begin(), u.end());
-    int kp = -1;
+    std::ptrdiff_t kp = -1;
     double a = std::numeric_limits<double>::quiet_NaN();
     double m = std::numeric_limits<double>::quiet_NaN();
     double bk = std::numeric_limits<double>::quiet_NaN();
@@ -232,19 +229,18 @@ test4()
         p[i] /= S;
     for (std::size_t i = 0; i < N; ++i)
     {
-        int k = std::lower_bound(b, b+Np+1, u[i]) - b - 1;
-        if (k != kp)
-        {
-            a = 0;
-            for (int j = 0; j < k; ++j)
-                a += areas[j];
-            assert(k < static_cast<int>(Np));
-            m = (p[k+1] - p[k]) / (b[k+1] - b[k]);
-            bk = b[k];
-            c = (b[k+1]*p[k] - b[k]*p[k+1]) / (b[k+1] - b[k]);
-            kp = k;
+      std::ptrdiff_t k = std::lower_bound(b, b + Np + 1, u[i]) - b - 1;
+      if (k != kp) {
+        a = 0;
+        for (int j = 0; j < k; ++j)
+          a += areas[j];
+        assert(k < static_cast<int>(Np));
+        m  = (p[k + 1] - p[k]) / (b[k + 1] - b[k]);
+        bk = b[k];
+        c  = (b[k + 1] * p[k] - b[k] * p[k + 1]) / (b[k + 1] - b[k]);
+        kp = k;
         }
-        assert(std::abs(f(u[i], a, m, bk, c) - double(i)/N) < .001);
+      assert(std::abs(f(u[i], a, m, bk, c) - double(i) / N) < .0013);
     }
 }
 
@@ -267,7 +263,7 @@ test5()
         u.push_back(v);
     }
     std::sort(u.begin(), u.end());
-    int kp = -1;
+    std::ptrdiff_t kp = -1;
     double a = std::numeric_limits<double>::quiet_NaN();
     double m = std::numeric_limits<double>::quiet_NaN();
     double bk = std::numeric_limits<double>::quiet_NaN();
@@ -286,19 +282,18 @@ test5()
         p[i] /= S;
     for (std::size_t i = 0; i < N; ++i)
     {
-        int k = std::lower_bound(b, b+Np+1, u[i]) - b - 1;
-        if (k != kp)
-        {
-            a = 0;
-            for (int j = 0; j < k; ++j)
-                a += areas[j];
-            assert(k < static_cast<int>(Np));
-            m = (p[k+1] - p[k]) / (b[k+1] - b[k]);
-            bk = b[k];
-            c = (b[k+1]*p[k] - b[k]*p[k+1]) / (b[k+1] - b[k]);
-            kp = k;
+      std::ptrdiff_t k = std::lower_bound(b, b + Np + 1, u[i]) - b - 1;
+      if (k != kp) {
+        a = 0;
+        for (int j = 0; j < k; ++j)
+          a += areas[j];
+        assert(k < static_cast<int>(Np));
+        m  = (p[k + 1] - p[k]) / (b[k + 1] - b[k]);
+        bk = b[k];
+        c  = (b[k + 1] * p[k] - b[k] * p[k + 1]) / (b[k + 1] - b[k]);
+        kp = k;
         }
-        assert(std::abs(f(u[i], a, m, bk, c) - double(i)/N) < .001);
+      assert(std::abs(f(u[i], a, m, bk, c) - double(i) / N) < .0013);
     }
 }
 
@@ -321,7 +316,7 @@ test6()
         u.push_back(v);
     }
     std::sort(u.begin(), u.end());
-    int kp = -1;
+    std::ptrdiff_t kp = -1;
     double a = std::numeric_limits<double>::quiet_NaN();
     double m = std::numeric_limits<double>::quiet_NaN();
     double bk = std::numeric_limits<double>::quiet_NaN();
@@ -339,18 +334,17 @@ test6()
         p[i] /= S;
     for (std::size_t i = 0; i < N; ++i)
     {
-        int k = std::lower_bound(b, b+Np+1, u[i]) - b - 1;
-        if (k != kp)
-        {
-            a = 0;
-            for (int j = 0; j < k; ++j)
-                a += areas[j];
-            m = (p[k+1] - p[k]) / (b[k+1] - b[k]);
-            bk = b[k];
-            c = (b[k+1]*p[k] - b[k]*p[k+1]) / (b[k+1] - b[k]);
-            kp = k;
+      std::ptrdiff_t k = std::lower_bound(b, b + Np + 1, u[i]) - b - 1;
+      if (k != kp) {
+        a = 0;
+        for (int j = 0; j < k; ++j)
+          a += areas[j];
+        m  = (p[k + 1] - p[k]) / (b[k + 1] - b[k]);
+        bk = b[k];
+        c  = (b[k + 1] * p[k] - b[k] * p[k + 1]) / (b[k + 1] - b[k]);
+        kp = k;
         }
-        assert(std::abs(f(u[i], a, m, bk, c) - double(i)/N) < .001);
+      assert(std::abs(f(u[i], a, m, bk, c) - double(i) / N) < .0013);
     }
 }
 
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval_param.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval_param.pass.cpp
index 605b11942a0e..4601c3601930 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval_param.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval_param.pass.cpp
@@ -60,7 +60,7 @@ int main(int, char**)
             u.push_back(v);
         }
         std::sort(u.begin(), u.end());
-        int kp = -1;
+        std::ptrdiff_t kp = -1;
         double a = std::numeric_limits<double>::quiet_NaN();
         double m = std::numeric_limits<double>::quiet_NaN();
         double bk = std::numeric_limits<double>::quiet_NaN();
@@ -78,18 +78,17 @@ int main(int, char**)
             p[i] /= S;
         for (std::size_t i = 0; i < N; ++i)
         {
-            int k = std::lower_bound(b, b+Np+1, u[i]) - b - 1;
-            if (k != kp)
-            {
-                a = 0;
-                for (int j = 0; j < k; ++j)
-                    a += areas[j];
-                m = (p[k+1] - p[k]) / (b[k+1] - b[k]);
-                bk = b[k];
-                c = (b[k+1]*p[k] - b[k]*p[k+1]) / (b[k+1] - b[k]);
-                kp = k;
+          std::ptrdiff_t k = std::lower_bound(b, b + Np + 1, u[i]) - b - 1;
+          if (k != kp) {
+            a = 0;
+            for (int j = 0; j < k; ++j)
+              a += areas[j];
+            m  = (p[k + 1] - p[k]) / (b[k + 1] - b[k]);
+            bk = b[k];
+            c  = (b[k + 1] * p[k] - b[k] * p[k + 1]) / (b[k + 1] - b[k]);
+            kp = k;
             }
-            assert(std::abs(f(u[i], a, m, bk, c) - double(i)/N) < .001);
+          assert(std::abs(f(u[i], a, m, bk, c) - double(i) / N) < .0013);
         }
     }
 
diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp
index 2e75606832b4..b478893434e8 100644
--- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp
+++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp
@@ -24,6 +24,7 @@
 
 #include <cassert>
 #include <concepts>
+#include <format>
 #include <memory>
 #include <thread>
 
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.const_ref.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.const_ref.pass.cpp
new file mode 100644
index 000000000000..85106c18ec35
--- /dev/null
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.const_ref.pass.cpp
@@ -0,0 +1,89 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+
+// <functional>
+
+// class reference_wrapper
+
+// [refwrap.comparisons], comparisons
+
+// friend constexpr auto operator<=>(reference_wrapper, const T&);                   // Since C++26
+
+#include <cassert>
+#include <concepts>
+#include <functional>
+
+#include "test_comparisons.h"
+#include "test_macros.h"
+
+#include "helper_concepts.h"
+#include "helper_types.h"
+
+// Test SFINAE.
+
+static_assert(HasSpaceshipOperatorWithInt<std::reference_wrapper<StrongOrder>>);
+static_assert(HasSpaceshipOperatorWithInt<std::reference_wrapper<WeakOrder>>);
+static_assert(HasSpaceshipOperatorWithInt<std::reference_wrapper<PartialOrder>>);
+
+static_assert(!HasSpaceshipOperatorWithInt<std::reference_wrapper<NonComparable>>);
+
+// Test comparisons.
+
+template <typename T, typename Order>
+constexpr void test() {
+  T t{47};
+
+  T bigger{94};
+  T smaller{82};
+
+  T unordered{std::numeric_limits<int>::min()};
+
+  // Identical contents
+  {
+    std::reference_wrapper<T> rw1{t};
+    assert(testOrder(rw1, t, Order::equivalent));
+  }
+  // Less
+  {
+    std::reference_wrapper<T> rw1{smaller};
+    assert(testOrder(rw1, bigger, Order::less));
+  }
+  // Greater
+  {
+    std::reference_wrapper<T> rw1{bigger};
+    assert(testOrder(rw1, smaller, Order::greater));
+  }
+  // Unordered
+  if constexpr (std::same_as<T, PartialOrder>) {
+    std::reference_wrapper<T> rw1{bigger};
+    assert(testOrder(rw1, unordered, Order::unordered));
+  }
+}
+
+constexpr bool test() {
+  test<int, std::strong_ordering>();
+  test<StrongOrder, std::strong_ordering>();
+  test<int, std::weak_ordering>();
+  test<WeakOrder, std::weak_ordering>();
+  test<int, std::partial_ordering>();
+  test<PartialOrder, std::partial_ordering>();
+
+  // `LessAndEqComp` does not have `operator<=>`. Ordering is synthesized based on `operator<`
+  test<LessAndEqComp, std::weak_ordering>();
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap.pass.cpp
new file mode 100644
index 000000000000..794fac00de8a
--- /dev/null
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap.pass.cpp
@@ -0,0 +1,93 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+
+// <functional>
+
+// class reference_wrapper
+
+// [refwrap.comparisons], comparisons
+
+// friend constexpr auto operator<=>(reference_wrapper, reference_wrapper);          // Since C++26
+
+#include <cassert>
+#include <concepts>
+#include <functional>
+
+#include "test_comparisons.h"
+#include "test_macros.h"
+
+#include "helper_concepts.h"
+#include "helper_types.h"
+
+// Test SFINAE.
+
+static_assert(std::three_way_comparable<std::reference_wrapper<StrongOrder>>);
+static_assert(std::three_way_comparable<std::reference_wrapper<WeakOrder>>);
+static_assert(std::three_way_comparable<std::reference_wrapper<PartialOrder>>);
+
+static_assert(!std::three_way_comparable<std::reference_wrapper<NonComparable>>);
+
+// Test comparisons.
+
+template <typename T, typename Order>
+constexpr void test() {
+  T t{47};
+
+  T bigger{94};
+  T smaller{82};
+
+  T unordered{std::numeric_limits<int>::min()};
+
+  // Identical contents
+  {
+    std::reference_wrapper<T> rw1{t};
+    std::reference_wrapper<T> rw2{t};
+    assert(testOrder(rw1, rw2, Order::equivalent));
+  }
+  // Less
+  {
+    std::reference_wrapper<T> rw1{smaller};
+    std::reference_wrapper<T> rw2{bigger};
+    assert(testOrder(rw1, rw2, Order::less));
+  }
+  // Greater
+  {
+    std::reference_wrapper<T> rw1{bigger};
+    std::reference_wrapper<T> rw2{smaller};
+    assert(testOrder(rw1, rw2, Order::greater));
+  }
+  // Unordered
+  if constexpr (std::same_as<T, PartialOrder>) {
+    std::reference_wrapper<T> rw1{bigger};
+    std::reference_wrapper<T> rw2{unordered};
+    assert(testOrder(rw1, rw2, Order::unordered));
+  }
+}
+
+constexpr bool test() {
+  test<int, std::strong_ordering>();
+  test<StrongOrder, std::strong_ordering>();
+  test<int, std::weak_ordering>();
+  test<WeakOrder, std::weak_ordering>();
+  test<int, std::partial_ordering>();
+  test<PartialOrder, std::partial_ordering>();
+
+  // `LessAndEqComp` does not have `operator<=>`. Ordering is synthesized based on `operator<`
+  test<LessAndEqComp, std::weak_ordering>();
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap_const.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap_const.pass.cpp
new file mode 100644
index 000000000000..9b1302affa85
--- /dev/null
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap_const.pass.cpp
@@ -0,0 +1,95 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+
+// <functional>
+
+// class reference_wrapper
+
+// [refwrap.comparisons], comparisons
+
+// friend constexpr auto operator<=>(reference_wrapper, reference_wrapper<const T>); // Since C++26
+
+#include <cassert>
+#include <concepts>
+#include <functional>
+
+#include "test_comparisons.h"
+#include "test_macros.h"
+
+#include "helper_concepts.h"
+#include "helper_types.h"
+
+// Test SFINAE.
+
+static_assert(std::three_way_comparable_with<std::reference_wrapper<StrongOrder>, const StrongOrder>);
+static_assert(std::three_way_comparable_with<std::reference_wrapper<WeakOrder>, const WeakOrder>);
+static_assert(std::three_way_comparable_with<std::reference_wrapper<PartialOrder>, const PartialOrder>);
+
+static_assert(!std::three_way_comparable_with<std::reference_wrapper<StrongOrder>, const NonComparable>);
+static_assert(!std::three_way_comparable_with<std::reference_wrapper<WeakOrder>, const NonComparable>);
+static_assert(!std::three_way_comparable_with<std::reference_wrapper<PartialOrder>, const NonComparable>);
+
+// Test comparisons.
+
+template <typename T, typename Order>
+constexpr void test() {
+  T t{47};
+
+  T bigger{94};
+  T smaller{82};
+
+  T unordered{std::numeric_limits<int>::min()};
+
+  // Identical contents
+  {
+    std::reference_wrapper<T> rw1{t};
+    std::reference_wrapper<const T> rw2{t};
+    assert(testOrder(rw1, rw2, Order::equivalent));
+  }
+  // Less
+  {
+    std::reference_wrapper<T> rw1{smaller};
+    std::reference_wrapper<const T> rw2{bigger};
+    assert(testOrder(rw1, rw2, Order::less));
+  }
+  // Greater
+  {
+    std::reference_wrapper<T> rw1{bigger};
+    std::reference_wrapper<const T> rw2{smaller};
+    assert(testOrder(rw1, rw2, Order::greater));
+  }
+  // Unordered
+  if constexpr (std::same_as<T, PartialOrder>) {
+    std::reference_wrapper<T> rw1{bigger};
+    std::reference_wrapper<const T> rw2{unordered};
+    assert(testOrder(rw1, rw2, Order::unordered));
+  }
+}
+
+constexpr bool test() {
+  test<int, std::strong_ordering>();
+  test<StrongOrder, std::strong_ordering>();
+  test<int, std::weak_ordering>();
+  test<WeakOrder, std::weak_ordering>();
+  test<int, std::partial_ordering>();
+  test<PartialOrder, std::partial_ordering>();
+
+  // `LessAndEqComp` does not have `operator<=>`. Ordering is synthesized based on `operator<`
+  test<LessAndEqComp, std::weak_ordering>();
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.const_ref.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.const_ref.pass.cpp
new file mode 100644
index 000000000000..465326818f17
--- /dev/null
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.const_ref.pass.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+
+// <functional>
+
+// class reference_wrapper
+
+// [refwrap.comparisons], comparisons
+
+// friend constexpr bool operator==(reference_wrapper, const T&);                                         // Since C++26
+
+#include <cassert>
+#include <concepts>
+#include <functional>
+
+#include "test_comparisons.h"
+#include "test_macros.h"
+
+#include "helper_concepts.h"
+#include "helper_types.h"
+
+// Test SFINAE.
+
+static_assert(HasEqualityOperatorWithInt<std::reference_wrapper<EqualityComparable>>);
+
+static_assert(!HasEqualityOperatorWithInt<std::reference_wrapper<NonComparable>>);
+
+// Test equality.
+
+template <typename T>
+constexpr void test() {
+  T i{92};
+  T j{84};
+
+  std::reference_wrapper<T> rw1{i};
+
+  // refwrap, const&
+  AssertEqualityReturnBool<decltype(rw1), decltype(i)>();
+  assert(testEquality(rw1, i, true));
+  assert(testEquality(rw1, j, false));
+}
+
+constexpr bool test() {
+  test<int>();
+  test<EqualityComparable>();
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap.pass.cpp
new file mode 100644
index 000000000000..a50b530bbc6e
--- /dev/null
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap.pass.cpp
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+
+// <functional>
+
+// class reference_wrapper
+
+// [refwrap.comparisons], comparisons
+// friend constexpr bool operator==(reference_wrapper, reference_wrapper);                                // Since C++26
+
+#include <cassert>
+#include <concepts>
+#include <functional>
+
+#include "test_comparisons.h"
+#include "test_macros.h"
+
+#include "helper_concepts.h"
+#include "helper_types.h"
+
+// Test SFINAE.
+
+static_assert(std::equality_comparable<std::reference_wrapper<EqualityComparable>>);
+
+static_assert(!std::equality_comparable<std::reference_wrapper<NonComparable>>);
+
+// Test equality.
+
+template <typename T>
+constexpr void test() {
+  T i{92};
+  T j{84};
+
+  std::reference_wrapper<T> rw1{i};
+  std::reference_wrapper<T> rw2 = rw1;
+  std::reference_wrapper<T> rw3{j};
+  std::reference_wrapper<const T> crw1{i};
+  std::reference_wrapper<const T> crw3{j};
+
+  AssertEqualityReturnBool<decltype(rw1), decltype(rw2)>();
+  assert(testEquality(rw1, rw2, true));
+  assert(testEquality(rw1, rw3, false));
+}
+
+constexpr bool test() {
+  test<int>();
+  test<EqualityComparable>();
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap_const.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap_const.pass.cpp
new file mode 100644
index 000000000000..10f017742a87
--- /dev/null
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap_const.pass.cpp
@@ -0,0 +1,67 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+
+// <functional>
+
+// class reference_wrapper
+
+// [refwrap.comparisons], comparisons
+
+// friend constexpr bool operator==(reference_wrapper, reference_wrapper<const T>);                       // Since C++26
+
+#include <cassert>
+#include <concepts>
+#include <functional>
+
+#include "test_comparisons.h"
+#include "test_macros.h"
+
+#include "helper_concepts.h"
+#include "helper_types.h"
+
+// Test SFINAE.
+
+static_assert(std::equality_comparable_with<std::reference_wrapper<EqualityComparable>,
+                                            std::reference_wrapper<const EqualityComparable>>);
+
+static_assert(!std::equality_comparable_with<std::reference_wrapper<EqualityComparable>,
+                                             std::reference_wrapper<const NonComparable>>);
+
+// Test equality.
+
+template <typename T>
+constexpr void test() {
+  T i{92};
+  T j{84};
+
+  std::reference_wrapper<T> rw1{i};
+
+  std::reference_wrapper<T> rw3{j};
+  std::reference_wrapper<const T> crw1{i};
+  std::reference_wrapper<const T> crw3{j};
+
+  AssertEqualityReturnBool<decltype(rw1), decltype(crw1)>();
+  assert(testEquality(rw1, crw1, true));
+  assert(testEquality(rw1, crw3, false));
+}
+
+constexpr bool test() {
+  test<int>();
+  test<EqualityComparable>();
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_concepts.h b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_concepts.h
new file mode 100644
index 000000000000..2dbb304f8af6
--- /dev/null
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_concepts.h
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_CONCEPTS_H
+#define TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_CONCEPTS_H
+
+#include <concepts>
+#include <utility>
+
+// Equality
+
+template <typename T>
+concept HasEqualityOperatorWithInt = requires(T t, int i) {
+  { t.get() == i } -> std::convertible_to<bool>;
+};
+
+// Spaceship
+
+template <class T>
+concept BooleanTestableImpl = std::convertible_to<T, bool>;
+
+template <class T>
+concept BooleanTestable = BooleanTestableImpl<T> && requires(T&& t) {
+  { !std::forward<T>(t) } -> BooleanTestableImpl;
+};
+
+template <typename T>
+concept HasSpaceshipOperatorWithInt = requires(T t, int i) {
+  { t < i } -> BooleanTestable;
+  { i < t } -> BooleanTestable;
+};
+
+#endif // TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_CONCEPTS_H
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_types.h b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_types.h
new file mode 100644
index 000000000000..cf5e568dbf93
--- /dev/null
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_types.h
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_TYPES_H
+#define TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_TYPES_H
+
+#include <concepts>
+
+struct EqualityComparable {
+  constexpr EqualityComparable(int value) : value_{value} {};
+
+  friend constexpr bool operator==(const EqualityComparable&, const EqualityComparable&) noexcept = default;
+
+  int value_;
+};
+
+static_assert(std::equality_comparable<EqualityComparable>);
+static_assert(EqualityComparable{94} == EqualityComparable{94});
+static_assert(EqualityComparable{94} != EqualityComparable{82});
+
+struct NonComparable {};
+
+static_assert(!std::three_way_comparable<NonComparable>);
+
+#endif // TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_TYPES_H
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.assign/T.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.assign/T.pass.cpp
index 4b9eaba2d2ba..98faf84fa52d 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.assign/T.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.assign/T.pass.cpp
@@ -33,17 +33,17 @@ struct Dummy {
 
 struct ThrowsCtorT {
   ThrowsCtorT(int) noexcept(false) {}
-  ThrowsCtorT &operator=(int) noexcept { return *this; }
+  ThrowsCtorT& operator=(int) noexcept { return *this; }
 };
 
 struct ThrowsAssignT {
   ThrowsAssignT(int) noexcept {}
-  ThrowsAssignT &operator=(int) noexcept(false) { return *this; }
+  ThrowsAssignT& operator=(int) noexcept(false) { return *this; }
 };
 
 struct NoThrowT {
   NoThrowT(int) noexcept {}
-  NoThrowT &operator=(int) noexcept { return *this; }
+  NoThrowT& operator=(int) noexcept { return *this; }
 };
 
 } // namespace MetaHelpers
@@ -55,7 +55,7 @@ struct ThrowsCtorT {
   int value;
   ThrowsCtorT() : value(0) {}
   ThrowsCtorT(int) noexcept(false) { throw 42; }
-  ThrowsCtorT &operator=(int v) noexcept {
+  ThrowsCtorT& operator=(int v) noexcept {
     value = v;
     return *this;
   }
@@ -64,9 +64,12 @@ struct ThrowsCtorT {
 struct MoveCrashes {
   int value;
   MoveCrashes(int v = 0) noexcept : value{v} {}
-  MoveCrashes(MoveCrashes &&) noexcept { assert(false); }
-  MoveCrashes &operator=(MoveCrashes &&) noexcept { assert(false); return *this; }
-  MoveCrashes &operator=(int v) noexcept {
+  MoveCrashes(MoveCrashes&&) noexcept { assert(false); }
+  MoveCrashes& operator=(MoveCrashes&&) noexcept {
+    assert(false);
+    return *this;
+  }
+  MoveCrashes& operator=(int v) noexcept {
     value = v;
     return *this;
   }
@@ -76,8 +79,8 @@ struct ThrowsCtorTandMove {
   int value;
   ThrowsCtorTandMove() : value(0) {}
   ThrowsCtorTandMove(int) noexcept(false) { throw 42; }
-  ThrowsCtorTandMove(ThrowsCtorTandMove &&) noexcept(false) { assert(false); }
-  ThrowsCtorTandMove &operator=(int v) noexcept {
+  ThrowsCtorTandMove(ThrowsCtorTandMove&&) noexcept(false) { assert(false); }
+  ThrowsCtorTandMove& operator=(int v) noexcept {
     value = v;
     return *this;
   }
@@ -87,14 +90,14 @@ struct ThrowsAssignT {
   int value;
   ThrowsAssignT() : value(0) {}
   ThrowsAssignT(int v) noexcept : value(v) {}
-  ThrowsAssignT &operator=(int) noexcept(false) { throw 42; }
+  ThrowsAssignT& operator=(int) noexcept(false) { throw 42; }
 };
 
 struct NoThrowT {
   int value;
   NoThrowT() : value(0) {}
   NoThrowT(int v) noexcept : value(v) {}
-  NoThrowT &operator=(int v) noexcept {
+  NoThrowT& operator=(int v) noexcept {
     value = v;
     return *this;
   }
@@ -103,7 +106,7 @@ struct NoThrowT {
 #endif // !defined(TEST_HAS_NO_EXCEPTIONS)
 } // namespace RuntimeHelpers
 
-void test_T_assignment_noexcept() {
+constexpr void test_T_assignment_noexcept() {
   using namespace MetaHelpers;
   {
     using V = std::variant<Dummy, NoThrowT>;
@@ -119,17 +122,17 @@ void test_T_assignment_noexcept() {
   }
 }
 
-void test_T_assignment_sfinae() {
+constexpr void test_T_assignment_sfinae() {
   {
     using V = std::variant<long, long long>;
     static_assert(!std::is_assignable<V, int>::value, "ambiguous");
   }
   {
     using V = std::variant<std::string, std::string>;
-    static_assert(!std::is_assignable<V, const char *>::value, "ambiguous");
+    static_assert(!std::is_assignable<V, const char*>::value, "ambiguous");
   }
   {
-    using V = std::variant<std::string, void *>;
+    using V = std::variant<std::string, void*>;
     static_assert(!std::is_assignable<V, int>::value, "no matching operator=");
   }
   {
@@ -138,8 +141,7 @@ void test_T_assignment_sfinae() {
   }
   {
     using V = std::variant<std::unique_ptr<int>, bool>;
-    static_assert(!std::is_assignable<V, std::unique_ptr<char>>::value,
-                  "no explicit bool in operator=");
+    static_assert(!std::is_assignable<V, std::unique_ptr<char>>::value, "no explicit bool in operator=");
     struct X {
       operator void*();
     };
@@ -152,12 +154,11 @@ void test_T_assignment_sfinae() {
       operator X();
     };
     using V = std::variant<X>;
-    static_assert(std::is_assignable<V, Y>::value,
-                  "regression on user-defined conversions in operator=");
+    static_assert(std::is_assignable<V, Y>::value, "regression on user-defined conversions in operator=");
   }
 }
 
-void test_T_assignment_basic() {
+TEST_CONSTEXPR_CXX20 void test_T_assignment_basic() {
   {
     std::variant<int> v(43);
     v = 42;
@@ -184,19 +185,146 @@ void test_T_assignment_basic() {
   }
   {
     std::variant<std::string, bool> v = true;
-    v = "bar";
+    v                                 = "bar";
     assert(v.index() == 0);
     assert(std::get<0>(v) == "bar");
   }
+}
+
+void test_T_assignment_basic_no_constexpr() {
+  std::variant<bool, std::unique_ptr<int>> v;
+  v = nullptr;
+  assert(v.index() == 1);
+  assert(std::get<1>(v) == nullptr);
+}
+
+struct TraceStat {
+  int construct      = 0;
+  int copy_construct = 0;
+  int copy_assign    = 0;
+  int move_construct = 0;
+  int move_assign    = 0;
+  int T_copy_assign  = 0;
+  int T_move_assign  = 0;
+  int destroy        = 0;
+};
+
+template <bool CtorNoexcept, bool MoveCtorNoexcept>
+struct Trace {
+  struct T {};
+
+  constexpr Trace(TraceStat* s) noexcept(CtorNoexcept) : stat(s) { ++s->construct; }
+  constexpr Trace(T) noexcept(CtorNoexcept) : stat(nullptr) {}
+  constexpr Trace(const Trace& o) : stat(o.stat) { ++stat->copy_construct; }
+  constexpr Trace(Trace&& o) noexcept(MoveCtorNoexcept) : stat(o.stat) { ++stat->move_construct; }
+  constexpr Trace& operator=(const Trace&) {
+    ++stat->copy_assign;
+    return *this;
+  }
+  constexpr Trace& operator=(Trace&&) noexcept {
+    ++stat->move_assign;
+    return *this;
+  }
+
+  constexpr Trace& operator=(const T&) {
+    ++stat->T_copy_assign;
+    return *this;
+  }
+  constexpr Trace& operator=(T&&) noexcept {
+    ++stat->T_move_assign;
+    return *this;
+  }
+  TEST_CONSTEXPR_CXX20 ~Trace() { ++stat->destroy; }
+
+  TraceStat* stat;
+};
+
+TEST_CONSTEXPR_CXX20 void test_T_assignment_performs_construction() {
   {
-    std::variant<bool, std::unique_ptr<int>> v;
-    v = nullptr;
-    assert(v.index() == 1);
-    assert(std::get<1>(v) == nullptr);
+    using V = std::variant<int, Trace<false, false>>;
+    TraceStat stat;
+    V v{1};
+    v = &stat;
+    assert(stat.construct == 1);
+    assert(stat.copy_construct == 0);
+    assert(stat.move_construct == 0);
+    assert(stat.copy_assign == 0);
+    assert(stat.move_assign == 0);
+    assert(stat.destroy == 0);
+  }
+  {
+    using V = std::variant<int, Trace<false, true>>;
+    TraceStat stat;
+    V v{1};
+    v = &stat;
+    assert(stat.construct == 1);
+    assert(stat.copy_construct == 0);
+    assert(stat.move_construct == 1);
+    assert(stat.copy_assign == 0);
+    assert(stat.move_assign == 0);
+    assert(stat.destroy == 1);
+  }
+
+  {
+    using V = std::variant<int, Trace<true, false>>;
+    TraceStat stat;
+    V v{1};
+    v = &stat;
+    assert(stat.construct == 1);
+    assert(stat.copy_construct == 0);
+    assert(stat.move_construct == 0);
+    assert(stat.copy_assign == 0);
+    assert(stat.move_assign == 0);
+    assert(stat.destroy == 0);
+  }
+
+  {
+    using V = std::variant<int, Trace<true, true>>;
+    TraceStat stat;
+    V v{1};
+    v = &stat;
+    assert(stat.construct == 1);
+    assert(stat.copy_construct == 0);
+    assert(stat.move_construct == 0);
+    assert(stat.copy_assign == 0);
+    assert(stat.move_assign == 0);
+    assert(stat.destroy == 0);
   }
 }
 
-void test_T_assignment_performs_construction() {
+TEST_CONSTEXPR_CXX20 void test_T_assignment_performs_assignment() {
+  {
+    using V = std::variant<int, Trace<false, false>>;
+    TraceStat stat;
+    V v{&stat};
+    v = Trace<false, false>::T{};
+    assert(stat.construct == 1);
+    assert(stat.copy_construct == 0);
+    assert(stat.move_construct == 0);
+    assert(stat.copy_assign == 0);
+    assert(stat.move_assign == 0);
+    assert(stat.T_copy_assign == 0);
+    assert(stat.T_move_assign == 1);
+    assert(stat.destroy == 0);
+  }
+  {
+    using V = std::variant<int, Trace<false, false>>;
+    TraceStat stat;
+    V v{&stat};
+    Trace<false, false>::T t;
+    v = t;
+    assert(stat.construct == 1);
+    assert(stat.copy_construct == 0);
+    assert(stat.move_construct == 0);
+    assert(stat.copy_assign == 0);
+    assert(stat.move_assign == 0);
+    assert(stat.T_copy_assign == 1);
+    assert(stat.T_move_assign == 0);
+    assert(stat.destroy == 0);
+  }
+}
+
+void test_T_assignment_performs_construction_throw() {
   using namespace RuntimeHelpers;
 #ifndef TEST_HAS_NO_EXCEPTIONS
   {
@@ -220,7 +348,7 @@ void test_T_assignment_performs_construction() {
 #endif // TEST_HAS_NO_EXCEPTIONS
 }
 
-void test_T_assignment_performs_assignment() {
+void test_T_assignment_performs_assignment_throw() {
   using namespace RuntimeHelpers;
 #ifndef TEST_HAS_NO_EXCEPTIONS
   {
@@ -262,7 +390,7 @@ void test_T_assignment_performs_assignment() {
 #endif // TEST_HAS_NO_EXCEPTIONS
 }
 
-void test_T_assignment_vector_bool() {
+TEST_CONSTEXPR_CXX20 void test_T_assignment_vector_bool() {
   std::vector<bool> vec = {true};
   std::variant<bool, int> v;
   v = vec[0];
@@ -270,7 +398,13 @@ void test_T_assignment_vector_bool() {
   assert(std::get<0>(v) == true);
 }
 
-int main(int, char**) {
+void non_constexpr_test() {
+  test_T_assignment_basic_no_constexpr();
+  test_T_assignment_performs_construction_throw();
+  test_T_assignment_performs_assignment_throw();
+}
+
+TEST_CONSTEXPR_CXX20 bool test() {
   test_T_assignment_basic();
   test_T_assignment_performs_construction();
   test_T_assignment_performs_assignment();
@@ -278,5 +412,15 @@ int main(int, char**) {
   test_T_assignment_sfinae();
   test_T_assignment_vector_bool();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  non_constexpr_test();
+
+#if TEST_STD_VER >= 20
+  static_assert(test());
+#endif
   return 0;
 }
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.assign/copy.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.assign/copy.pass.cpp
index 096d365d2d75..a6d3f34114eb 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.assign/copy.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.assign/copy.pass.cpp
@@ -22,88 +22,108 @@
 #include "test_macros.h"
 
 struct NoCopy {
-  NoCopy(const NoCopy &) = delete;
-  NoCopy &operator=(const NoCopy &) = default;
+  NoCopy(const NoCopy&)            = delete;
+  NoCopy& operator=(const NoCopy&) = default;
 };
 
 struct CopyOnly {
-  CopyOnly(const CopyOnly &) = default;
-  CopyOnly(CopyOnly &&) = delete;
-  CopyOnly &operator=(const CopyOnly &) = default;
-  CopyOnly &operator=(CopyOnly &&) = delete;
+  CopyOnly(const CopyOnly&)            = default;
+  CopyOnly(CopyOnly&&)                 = delete;
+  CopyOnly& operator=(const CopyOnly&) = default;
+  CopyOnly& operator=(CopyOnly&&)      = delete;
 };
 
 struct MoveOnly {
-  MoveOnly(const MoveOnly &) = delete;
-  MoveOnly(MoveOnly &&) = default;
-  MoveOnly &operator=(const MoveOnly &) = default;
+  MoveOnly(const MoveOnly&)            = delete;
+  MoveOnly(MoveOnly&&)                 = default;
+  MoveOnly& operator=(const MoveOnly&) = default;
 };
 
 struct MoveOnlyNT {
-  MoveOnlyNT(const MoveOnlyNT &) = delete;
-  MoveOnlyNT(MoveOnlyNT &&) {}
-  MoveOnlyNT &operator=(const MoveOnlyNT &) = default;
+  MoveOnlyNT(const MoveOnlyNT&) = delete;
+  MoveOnlyNT(MoveOnlyNT&&) {}
+  MoveOnlyNT& operator=(const MoveOnlyNT&) = default;
 };
 
 struct CopyAssign {
-  static int alive;
-  static int copy_construct;
-  static int copy_assign;
-  static int move_construct;
-  static int move_assign;
-  static void reset() {
-    copy_construct = copy_assign = move_construct = move_assign = alive = 0;
-  }
-  CopyAssign(int v) : value(v) { ++alive; }
-  CopyAssign(const CopyAssign &o) : value(o.value) {
-    ++alive;
-    ++copy_construct;
-  }
-  CopyAssign(CopyAssign &&o) noexcept : value(o.value) {
+  constexpr CopyAssign(int v, int* alv, int* cpy_ctr, int* cpy_assi, int* move_ctr, int* move_assi)
+      : value(v),
+        alive(alv),
+        copy_construct(cpy_ctr),
+        copy_assign(cpy_assi),
+        move_construct(move_ctr),
+        move_assign(move_assi) {
+    ++*alive;
+  }
+  constexpr CopyAssign(const CopyAssign& o)
+      : value(o.value),
+        alive(o.alive),
+        copy_construct(o.copy_construct),
+        copy_assign(o.copy_assign),
+        move_construct(o.move_construct),
+        move_assign(o.move_assign) {
+    ++*alive;
+    ++*copy_construct;
+  }
+  constexpr CopyAssign(CopyAssign&& o) noexcept
+      : value(o.value),
+        alive(o.alive),
+        copy_construct(o.copy_construct),
+        copy_assign(o.copy_assign),
+        move_construct(o.move_construct),
+        move_assign(o.move_assign) {
     o.value = -1;
-    ++alive;
-    ++move_construct;
-  }
-  CopyAssign &operator=(const CopyAssign &o) {
-    value = o.value;
-    ++copy_assign;
+    ++*alive;
+    ++*move_construct;
+  }
+  constexpr CopyAssign& operator=(const CopyAssign& o) {
+    value          = o.value;
+    alive          = o.alive;
+    copy_construct = o.copy_construct;
+    copy_assign    = o.copy_assign;
+    move_construct = o.move_construct;
+    move_assign    = o.move_assign;
+    ++*copy_assign;
     return *this;
   }
-  CopyAssign &operator=(CopyAssign &&o) noexcept {
-    value = o.value;
-    o.value = -1;
-    ++move_assign;
+  constexpr CopyAssign& operator=(CopyAssign&& o) noexcept {
+    value          = o.value;
+    alive          = o.alive;
+    copy_construct = o.copy_construct;
+    copy_assign    = o.copy_assign;
+    move_construct = o.move_construct;
+    move_assign    = o.move_assign;
+    o.value        = -1;
+    ++*move_assign;
     return *this;
   }
-  ~CopyAssign() { --alive; }
+  TEST_CONSTEXPR_CXX20 ~CopyAssign() { --*alive; }
   int value;
+  int* alive;
+  int* copy_construct;
+  int* copy_assign;
+  int* move_construct;
+  int* move_assign;
 };
 
-int CopyAssign::alive = 0;
-int CopyAssign::copy_construct = 0;
-int CopyAssign::copy_assign = 0;
-int CopyAssign::move_construct = 0;
-int CopyAssign::move_assign = 0;
-
 struct CopyMaybeThrows {
-  CopyMaybeThrows(const CopyMaybeThrows &);
-  CopyMaybeThrows &operator=(const CopyMaybeThrows &);
+  CopyMaybeThrows(const CopyMaybeThrows&);
+  CopyMaybeThrows& operator=(const CopyMaybeThrows&);
 };
 struct CopyDoesThrow {
-  CopyDoesThrow(const CopyDoesThrow &) noexcept(false);
-  CopyDoesThrow &operator=(const CopyDoesThrow &) noexcept(false);
+  CopyDoesThrow(const CopyDoesThrow&) noexcept(false);
+  CopyDoesThrow& operator=(const CopyDoesThrow&) noexcept(false);
 };
 
-
 struct NTCopyAssign {
   constexpr NTCopyAssign(int v) : value(v) {}
-  NTCopyAssign(const NTCopyAssign &) = default;
-  NTCopyAssign(NTCopyAssign &&) = default;
-  NTCopyAssign &operator=(const NTCopyAssign &that) {
+  NTCopyAssign(const NTCopyAssign&) = default;
+  NTCopyAssign(NTCopyAssign&&)      = default;
+  NTCopyAssign& operator=(const NTCopyAssign& that) {
     value = that.value;
     return *this;
   };
-  NTCopyAssign &operator=(NTCopyAssign &&) = delete;
+  NTCopyAssign& operator=(NTCopyAssign&&) = delete;
   int value;
 };
 
@@ -112,10 +132,10 @@ static_assert(std::is_copy_assignable<NTCopyAssign>::value, "");
 
 struct TCopyAssign {
   constexpr TCopyAssign(int v) : value(v) {}
-  TCopyAssign(const TCopyAssign &) = default;
-  TCopyAssign(TCopyAssign &&) = default;
-  TCopyAssign &operator=(const TCopyAssign &) = default;
-  TCopyAssign &operator=(TCopyAssign &&) = delete;
+  TCopyAssign(const TCopyAssign&)            = default;
+  TCopyAssign(TCopyAssign&&)                 = default;
+  TCopyAssign& operator=(const TCopyAssign&) = default;
+  TCopyAssign& operator=(TCopyAssign&&)      = delete;
   int value;
 };
 
@@ -123,11 +143,11 @@ static_assert(std::is_trivially_copy_assignable<TCopyAssign>::value, "");
 
 struct TCopyAssignNTMoveAssign {
   constexpr TCopyAssignNTMoveAssign(int v) : value(v) {}
-  TCopyAssignNTMoveAssign(const TCopyAssignNTMoveAssign &) = default;
-  TCopyAssignNTMoveAssign(TCopyAssignNTMoveAssign &&) = default;
-  TCopyAssignNTMoveAssign &operator=(const TCopyAssignNTMoveAssign &) = default;
-  TCopyAssignNTMoveAssign &operator=(TCopyAssignNTMoveAssign &&that) {
-    value = that.value;
+  TCopyAssignNTMoveAssign(const TCopyAssignNTMoveAssign&)            = default;
+  TCopyAssignNTMoveAssign(TCopyAssignNTMoveAssign&&)                 = default;
+  TCopyAssignNTMoveAssign& operator=(const TCopyAssignNTMoveAssign&) = default;
+  TCopyAssignNTMoveAssign& operator=(TCopyAssignNTMoveAssign&& that) {
+    value      = that.value;
     that.value = -1;
     return *this;
   }
@@ -139,17 +159,20 @@ static_assert(std::is_trivially_copy_assignable_v<TCopyAssignNTMoveAssign>, "");
 #ifndef TEST_HAS_NO_EXCEPTIONS
 struct CopyThrows {
   CopyThrows() = default;
-  CopyThrows(const CopyThrows &) { throw 42; }
-  CopyThrows &operator=(const CopyThrows &) { throw 42; }
+  CopyThrows(const CopyThrows&) { throw 42; }
+  CopyThrows& operator=(const CopyThrows&) { throw 42; }
 };
 
 struct CopyCannotThrow {
   static int alive;
   CopyCannotThrow() { ++alive; }
-  CopyCannotThrow(const CopyCannotThrow &) noexcept { ++alive; }
-  CopyCannotThrow(CopyCannotThrow &&) noexcept { assert(false); }
-  CopyCannotThrow &operator=(const CopyCannotThrow &) noexcept = default;
-  CopyCannotThrow &operator=(CopyCannotThrow &&) noexcept { assert(false); return *this; }
+  CopyCannotThrow(const CopyCannotThrow&) noexcept { ++alive; }
+  CopyCannotThrow(CopyCannotThrow&&) noexcept { assert(false); }
+  CopyCannotThrow& operator=(const CopyCannotThrow&) noexcept = default;
+  CopyCannotThrow& operator=(CopyCannotThrow&&) noexcept {
+    assert(false);
+    return *this;
+  }
 };
 
 int CopyCannotThrow::alive = 0;
@@ -157,10 +180,10 @@ int CopyCannotThrow::alive = 0;
 struct MoveThrows {
   static int alive;
   MoveThrows() { ++alive; }
-  MoveThrows(const MoveThrows &) { ++alive; }
-  MoveThrows(MoveThrows &&) { throw 42; }
-  MoveThrows &operator=(const MoveThrows &) { return *this; }
-  MoveThrows &operator=(MoveThrows &&) { throw 42; }
+  MoveThrows(const MoveThrows&) { ++alive; }
+  MoveThrows(MoveThrows&&) { throw 42; }
+  MoveThrows& operator=(const MoveThrows&) { return *this; }
+  MoveThrows& operator=(MoveThrows&&) { throw 42; }
   ~MoveThrows() { --alive; }
 };
 
@@ -169,20 +192,21 @@ int MoveThrows::alive = 0;
 struct MakeEmptyT {
   static int alive;
   MakeEmptyT() { ++alive; }
-  MakeEmptyT(const MakeEmptyT &) {
+  MakeEmptyT(const MakeEmptyT&) {
     ++alive;
     // Don't throw from the copy constructor since variant's assignment
     // operator performs a copy before committing to the assignment.
   }
-  MakeEmptyT(MakeEmptyT &&) { throw 42; }
-  MakeEmptyT &operator=(const MakeEmptyT &) { throw 42; }
-  MakeEmptyT &operator=(MakeEmptyT &&) { throw 42; }
+  MakeEmptyT(MakeEmptyT&&) { throw 42; }
+  MakeEmptyT& operator=(const MakeEmptyT&) { throw 42; }
+  MakeEmptyT& operator=(MakeEmptyT&&) { throw 42; }
   ~MakeEmptyT() { --alive; }
 };
 
 int MakeEmptyT::alive = 0;
 
-template <class Variant> void makeEmpty(Variant &v) {
+template <class Variant>
+void makeEmpty(Variant& v) {
   Variant v2(std::in_place_type<MakeEmptyT>);
   try {
     v = std::move(v2);
@@ -193,7 +217,7 @@ template <class Variant> void makeEmpty(Variant &v) {
 }
 #endif // TEST_HAS_NO_EXCEPTIONS
 
-void test_copy_assignment_not_noexcept() {
+constexpr void test_copy_assignment_not_noexcept() {
   {
     using V = std::variant<CopyMaybeThrows>;
     static_assert(!std::is_nothrow_copy_assignable<V>::value, "");
@@ -204,7 +228,7 @@ void test_copy_assignment_not_noexcept() {
   }
 }
 
-void test_copy_assignment_sfinae() {
+constexpr void test_copy_assignment_sfinae() {
   {
     using V = std::variant<int, long>;
     static_assert(std::is_copy_assignable<V>::value, "");
@@ -259,7 +283,7 @@ void test_copy_assignment_empty_empty() {
     makeEmpty(v1);
     V v2(std::in_place_index<0>);
     makeEmpty(v2);
-    V &vref = (v1 = v2);
+    V& vref = (v1 = v2);
     assert(&vref == &v1);
     assert(v1.valueless_by_exception());
     assert(v1.index() == std::variant_npos);
@@ -275,7 +299,7 @@ void test_copy_assignment_non_empty_empty() {
     V v1(std::in_place_index<0>, 42);
     V v2(std::in_place_index<0>);
     makeEmpty(v2);
-    V &vref = (v1 = v2);
+    V& vref = (v1 = v2);
     assert(&vref == &v1);
     assert(v1.valueless_by_exception());
     assert(v1.index() == std::variant_npos);
@@ -285,7 +309,7 @@ void test_copy_assignment_non_empty_empty() {
     V v1(std::in_place_index<2>, "hello");
     V v2(std::in_place_index<0>);
     makeEmpty(v2);
-    V &vref = (v1 = v2);
+    V& vref = (v1 = v2);
     assert(&vref == &v1);
     assert(v1.valueless_by_exception());
     assert(v1.index() == std::variant_npos);
@@ -301,7 +325,7 @@ void test_copy_assignment_empty_non_empty() {
     V v1(std::in_place_index<0>);
     makeEmpty(v1);
     V v2(std::in_place_index<0>, 42);
-    V &vref = (v1 = v2);
+    V& vref = (v1 = v2);
     assert(&vref == &v1);
     assert(v1.index() == 0);
     assert(std::get<0>(v1) == 42);
@@ -311,7 +335,7 @@ void test_copy_assignment_empty_non_empty() {
     V v1(std::in_place_index<0>);
     makeEmpty(v1);
     V v2(std::in_place_type<std::string>, "hello");
-    V &vref = (v1 = v2);
+    V& vref = (v1 = v2);
     assert(&vref == &v1);
     assert(v1.index() == 2);
     assert(std::get<2>(v1) == "hello");
@@ -319,14 +343,18 @@ void test_copy_assignment_empty_non_empty() {
 #endif // TEST_HAS_NO_EXCEPTIONS
 }
 
-template <typename T> struct Result { std::size_t index; T value; };
+template <typename T>
+struct Result {
+  std::size_t index;
+  T value;
+};
 
-void test_copy_assignment_same_index() {
+TEST_CONSTEXPR_CXX20 void test_copy_assignment_same_index() {
   {
     using V = std::variant<int>;
     V v1(43);
     V v2(42);
-    V &vref = (v1 = v2);
+    V& vref = (v1 = v2);
     assert(&vref == &v1);
     assert(v1.index() == 0);
     assert(std::get<0>(v1) == 42);
@@ -335,40 +363,28 @@ void test_copy_assignment_same_index() {
     using V = std::variant<int, long, unsigned>;
     V v1(43l);
     V v2(42l);
-    V &vref = (v1 = v2);
+    V& vref = (v1 = v2);
     assert(&vref == &v1);
     assert(v1.index() == 1);
     assert(std::get<1>(v1) == 42);
   }
   {
-    using V = std::variant<int, CopyAssign, unsigned>;
-    V v1(std::in_place_type<CopyAssign>, 43);
-    V v2(std::in_place_type<CopyAssign>, 42);
-    CopyAssign::reset();
-    V &vref = (v1 = v2);
+    using V            = std::variant<int, CopyAssign, unsigned>;
+    int alive          = 0;
+    int copy_construct = 0;
+    int copy_assign    = 0;
+    int move_construct = 0;
+    int move_assign    = 0;
+    V v1(std::in_place_type<CopyAssign>, 43, &alive, &copy_construct, &copy_assign, &move_construct, &move_assign);
+    V v2(std::in_place_type<CopyAssign>, 42, &alive, &copy_construct, &copy_assign, &move_construct, &move_assign);
+    V& vref = (v1 = v2);
     assert(&vref == &v1);
     assert(v1.index() == 1);
     assert(std::get<1>(v1).value == 42);
-    assert(CopyAssign::copy_construct == 0);
-    assert(CopyAssign::move_construct == 0);
-    assert(CopyAssign::copy_assign == 1);
+    assert(copy_construct == 0);
+    assert(move_construct == 0);
+    assert(copy_assign == 1);
   }
-#ifndef TEST_HAS_NO_EXCEPTIONS
-  using MET = MakeEmptyT;
-  {
-    using V = std::variant<int, MET, std::string>;
-    V v1(std::in_place_type<MET>);
-    MET &mref = std::get<1>(v1);
-    V v2(std::in_place_type<MET>);
-    try {
-      v1 = v2;
-      assert(false);
-    } catch (...) {
-    }
-    assert(v1.index() == 1);
-    assert(&std::get<1>(v1) == &mref);
-  }
-#endif // TEST_HAS_NO_EXCEPTIONS
 
   // Make sure we properly propagate triviality, which implies constexpr-ness (see P0602R4).
   {
@@ -429,34 +445,88 @@ void test_copy_assignment_same_index() {
   }
 }
 
-void test_copy_assignment_different_index() {
+TEST_CONSTEXPR_CXX20 void test_copy_assignment_different_index() {
   {
     using V = std::variant<int, long, unsigned>;
     V v1(43);
     V v2(42l);
-    V &vref = (v1 = v2);
+    V& vref = (v1 = v2);
     assert(&vref == &v1);
     assert(v1.index() == 1);
     assert(std::get<1>(v1) == 42);
   }
   {
-    using V = std::variant<int, CopyAssign, unsigned>;
-    CopyAssign::reset();
+    using V            = std::variant<int, CopyAssign, unsigned>;
+    int alive          = 0;
+    int copy_construct = 0;
+    int copy_assign    = 0;
+    int move_construct = 0;
+    int move_assign    = 0;
     V v1(std::in_place_type<unsigned>, 43u);
-    V v2(std::in_place_type<CopyAssign>, 42);
-    assert(CopyAssign::copy_construct == 0);
-    assert(CopyAssign::move_construct == 0);
-    assert(CopyAssign::alive == 1);
-    V &vref = (v1 = v2);
+    V v2(std::in_place_type<CopyAssign>, 42, &alive, &copy_construct, &copy_assign, &move_construct, &move_assign);
+    assert(copy_construct == 0);
+    assert(move_construct == 0);
+    assert(alive == 1);
+    V& vref = (v1 = v2);
     assert(&vref == &v1);
     assert(v1.index() == 1);
     assert(std::get<1>(v1).value == 42);
-    assert(CopyAssign::alive == 2);
-    assert(CopyAssign::copy_construct == 1);
-    assert(CopyAssign::move_construct == 1);
-    assert(CopyAssign::copy_assign == 0);
+    assert(alive == 2);
+    assert(copy_construct == 1);
+    assert(move_construct == 1);
+    assert(copy_assign == 0);
+  }
+
+  // Make sure we properly propagate triviality, which implies constexpr-ness (see P0602R4).
+  {
+    struct {
+      constexpr Result<long> operator()() const {
+        using V = std::variant<int, long, unsigned>;
+        V v(43);
+        V v2(42l);
+        v = v2;
+        return {v.index(), std::get<1>(v)};
+      }
+    } test;
+    constexpr auto result = test();
+    static_assert(result.index == 1, "");
+    static_assert(result.value == 42l, "");
   }
+  {
+    struct {
+      constexpr Result<int> operator()() const {
+        using V = std::variant<int, TCopyAssign, unsigned>;
+        V v(std::in_place_type<unsigned>, 43u);
+        V v2(std::in_place_type<TCopyAssign>, 42);
+        v = v2;
+        return {v.index(), std::get<1>(v).value};
+      }
+    } test;
+    constexpr auto result = test();
+    static_assert(result.index == 1, "");
+    static_assert(result.value == 42, "");
+  }
+}
+
+void test_assignment_throw() {
 #ifndef TEST_HAS_NO_EXCEPTIONS
+  using MET = MakeEmptyT;
+  // same index
+  {
+    using V = std::variant<int, MET, std::string>;
+    V v1(std::in_place_type<MET>);
+    MET& mref = std::get<1>(v1);
+    V v2(std::in_place_type<MET>);
+    try {
+      v1 = v2;
+      assert(false);
+    } catch (...) {
+    }
+    assert(v1.index() == 1);
+    assert(&std::get<1>(v1) == &mref);
+  }
+
+  // difference indices
   {
     using V = std::variant<int, CopyThrows, std::string>;
     V v1(std::in_place_type<std::string>, "hello");
@@ -496,7 +566,7 @@ void test_copy_assignment_different_index() {
     using V = std::variant<int, CopyThrows, std::string>;
     V v1(std::in_place_type<CopyThrows>);
     V v2(std::in_place_type<std::string>, "hello");
-    V &vref = (v1 = v2);
+    V& vref = (v1 = v2);
     assert(&vref == &v1);
     assert(v1.index() == 2);
     assert(std::get<2>(v1) == "hello");
@@ -507,7 +577,7 @@ void test_copy_assignment_different_index() {
     using V = std::variant<int, MoveThrows, std::string>;
     V v1(std::in_place_type<MoveThrows>);
     V v2(std::in_place_type<std::string>, "hello");
-    V &vref = (v1 = v2);
+    V& vref = (v1 = v2);
     assert(&vref == &v1);
     assert(v1.index() == 2);
     assert(std::get<2>(v1) == "hello");
@@ -515,69 +585,83 @@ void test_copy_assignment_different_index() {
     assert(std::get<2>(v2) == "hello");
   }
 #endif // TEST_HAS_NO_EXCEPTIONS
-
-  // Make sure we properly propagate triviality, which implies constexpr-ness (see P0602R4).
-  {
-    struct {
-      constexpr Result<long> operator()() const {
-        using V = std::variant<int, long, unsigned>;
-        V v(43);
-        V v2(42l);
-        v = v2;
-        return {v.index(), std::get<1>(v)};
-      }
-    } test;
-    constexpr auto result = test();
-    static_assert(result.index == 1, "");
-    static_assert(result.value == 42l, "");
-  }
-  {
-    struct {
-      constexpr Result<int> operator()() const {
-        using V = std::variant<int, TCopyAssign, unsigned>;
-        V v(std::in_place_type<unsigned>, 43u);
-        V v2(std::in_place_type<TCopyAssign>, 42);
-        v = v2;
-        return {v.index(), std::get<1>(v).value};
-      }
-    } test;
-    constexpr auto result = test();
-    static_assert(result.index == 1, "");
-    static_assert(result.value == 42, "");
-  }
 }
 
-template <std::size_t NewIdx, class ValueType>
-constexpr bool test_constexpr_assign_imp(
-    std::variant<long, void*, int>&& v, ValueType&& new_value)
-{
-  const std::variant<long, void*, int> cp(
-      std::forward<ValueType>(new_value));
+template <std::size_t NewIdx, class T, class ValueType>
+constexpr void test_constexpr_assign_imp(T&& v, ValueType&& new_value) {
+  using Variant = std::decay_t<T>;
+  const Variant cp(std::forward<ValueType>(new_value));
   v = cp;
-  return v.index() == NewIdx &&
-        std::get<NewIdx>(v) == std::get<NewIdx>(cp);
+  assert(v.index() == NewIdx);
+  assert(std::get<NewIdx>(v) == std::get<NewIdx>(cp));
 }
 
-void test_constexpr_copy_assignment() {
+constexpr void test_constexpr_copy_assignment_trivial() {
   // Make sure we properly propagate triviality, which implies constexpr-ness (see P0602R4).
   using V = std::variant<long, void*, int>;
   static_assert(std::is_trivially_copyable<V>::value, "");
   static_assert(std::is_trivially_copy_assignable<V>::value, "");
-  static_assert(test_constexpr_assign_imp<0>(V(42l), 101l), "");
-  static_assert(test_constexpr_assign_imp<0>(V(nullptr), 101l), "");
-  static_assert(test_constexpr_assign_imp<1>(V(42l), nullptr), "");
-  static_assert(test_constexpr_assign_imp<2>(V(42l), 101), "");
+  test_constexpr_assign_imp<0>(V(42l), 101l);
+  test_constexpr_assign_imp<0>(V(nullptr), 101l);
+  test_constexpr_assign_imp<1>(V(42l), nullptr);
+  test_constexpr_assign_imp<2>(V(42l), 101);
 }
 
-int main(int, char**) {
+struct NonTrivialCopyAssign {
+  int i = 0;
+  constexpr NonTrivialCopyAssign(int ii) : i(ii) {}
+  constexpr NonTrivialCopyAssign(const NonTrivialCopyAssign& other) : i(other.i) {}
+  constexpr NonTrivialCopyAssign& operator=(const NonTrivialCopyAssign& o) {
+    i = o.i;
+    return *this;
+  }
+  TEST_CONSTEXPR_CXX20 ~NonTrivialCopyAssign() = default;
+  friend constexpr bool operator==(const NonTrivialCopyAssign& x, const NonTrivialCopyAssign& y) { return x.i == y.i; }
+};
+
+constexpr void test_constexpr_copy_assignment_non_trivial() {
+  // Make sure we properly propagate triviality, which implies constexpr-ness (see P0602R4).
+  using V = std::variant<long, void*, NonTrivialCopyAssign>;
+  static_assert(!std::is_trivially_copyable<V>::value, "");
+  static_assert(!std::is_trivially_copy_assignable<V>::value, "");
+  test_constexpr_assign_imp<0>(V(42l), 101l);
+  test_constexpr_assign_imp<0>(V(nullptr), 101l);
+  test_constexpr_assign_imp<1>(V(42l), nullptr);
+  test_constexpr_assign_imp<2>(V(42l), NonTrivialCopyAssign(5));
+  test_constexpr_assign_imp<2>(V(NonTrivialCopyAssign(3)), NonTrivialCopyAssign(5));
+}
+
+void non_constexpr_test() {
   test_copy_assignment_empty_empty();
   test_copy_assignment_non_empty_empty();
   test_copy_assignment_empty_non_empty();
-  test_copy_assignment_same_index();
-  test_copy_assignment_different_index();
+  test_assignment_throw();
+}
+
+constexpr bool cxx17_constexpr_test() {
   test_copy_assignment_sfinae();
   test_copy_assignment_not_noexcept();
-  test_constexpr_copy_assignment();
+  test_constexpr_copy_assignment_trivial();
 
+  return true;
+}
+
+TEST_CONSTEXPR_CXX20 bool cxx20_constexpr_test() {
+  test_copy_assignment_same_index();
+  test_copy_assignment_different_index();
+  test_constexpr_copy_assignment_non_trivial();
+
+  return true;
+}
+
+int main(int, char**) {
+  non_constexpr_test();
+  cxx17_constexpr_test();
+  cxx20_constexpr_test();
+
+  static_assert(cxx17_constexpr_test());
+#if TEST_STD_VER >= 20
+  static_assert(cxx20_constexpr_test());
+#endif
   return 0;
 }
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.assign/move.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.assign/move.pass.cpp
index 84094347aed3..157ff68f3748 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.assign/move.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.assign/move.pass.cpp
@@ -24,71 +24,70 @@
 #include "variant_test_helpers.h"
 
 struct NoCopy {
-  NoCopy(const NoCopy &) = delete;
-  NoCopy &operator=(const NoCopy &) = default;
+  NoCopy(const NoCopy&)            = delete;
+  NoCopy& operator=(const NoCopy&) = default;
 };
 
 struct CopyOnly {
-  CopyOnly(const CopyOnly &) = default;
-  CopyOnly(CopyOnly &&) = delete;
-  CopyOnly &operator=(const CopyOnly &) = default;
-  CopyOnly &operator=(CopyOnly &&) = delete;
+  CopyOnly(const CopyOnly&)            = default;
+  CopyOnly(CopyOnly&&)                 = delete;
+  CopyOnly& operator=(const CopyOnly&) = default;
+  CopyOnly& operator=(CopyOnly&&)      = delete;
 };
 
 struct MoveOnly {
-  MoveOnly(const MoveOnly &) = delete;
-  MoveOnly(MoveOnly &&) = default;
-  MoveOnly &operator=(const MoveOnly &) = delete;
-  MoveOnly &operator=(MoveOnly &&) = default;
+  MoveOnly(const MoveOnly&)            = delete;
+  MoveOnly(MoveOnly&&)                 = default;
+  MoveOnly& operator=(const MoveOnly&) = delete;
+  MoveOnly& operator=(MoveOnly&&)      = default;
 };
 
 struct MoveOnlyNT {
-  MoveOnlyNT(const MoveOnlyNT &) = delete;
-  MoveOnlyNT(MoveOnlyNT &&) {}
-  MoveOnlyNT &operator=(const MoveOnlyNT &) = delete;
-  MoveOnlyNT &operator=(MoveOnlyNT &&) = default;
+  MoveOnlyNT(const MoveOnlyNT&) = delete;
+  MoveOnlyNT(MoveOnlyNT&&) {}
+  MoveOnlyNT& operator=(const MoveOnlyNT&) = delete;
+  MoveOnlyNT& operator=(MoveOnlyNT&&)      = default;
 };
 
 struct MoveOnlyOddNothrow {
-  MoveOnlyOddNothrow(MoveOnlyOddNothrow &&) noexcept(false) {}
-  MoveOnlyOddNothrow(const MoveOnlyOddNothrow &) = delete;
-  MoveOnlyOddNothrow &operator=(MoveOnlyOddNothrow &&) noexcept = default;
-  MoveOnlyOddNothrow &operator=(const MoveOnlyOddNothrow &) = delete;
+  MoveOnlyOddNothrow(MoveOnlyOddNothrow&&) noexcept(false) {}
+  MoveOnlyOddNothrow(const MoveOnlyOddNothrow&)                = delete;
+  MoveOnlyOddNothrow& operator=(MoveOnlyOddNothrow&&) noexcept = default;
+  MoveOnlyOddNothrow& operator=(const MoveOnlyOddNothrow&)     = delete;
 };
 
 struct MoveAssignOnly {
-  MoveAssignOnly(MoveAssignOnly &&) = delete;
-  MoveAssignOnly &operator=(MoveAssignOnly &&) = default;
+  MoveAssignOnly(MoveAssignOnly&&)            = delete;
+  MoveAssignOnly& operator=(MoveAssignOnly&&) = default;
 };
 
 struct MoveAssign {
-  static int move_construct;
-  static int move_assign;
-  static void reset() { move_construct = move_assign = 0; }
-  MoveAssign(int v) : value(v) {}
-  MoveAssign(MoveAssign &&o) : value(o.value) {
-    ++move_construct;
+  constexpr MoveAssign(int v, int* move_ctor, int* move_assi)
+      : value(v), move_construct(move_ctor), move_assign(move_assi) {}
+  constexpr MoveAssign(MoveAssign&& o) : value(o.value), move_construct(o.move_construct), move_assign(o.move_assign) {
+    ++*move_construct;
     o.value = -1;
   }
-  MoveAssign &operator=(MoveAssign &&o) {
-    value = o.value;
-    ++move_assign;
+  constexpr MoveAssign& operator=(MoveAssign&& o) {
+    value          = o.value;
+    move_construct = o.move_construct;
+    move_assign    = o.move_assign;
+    ++*move_assign;
     o.value = -1;
     return *this;
   }
   int value;
+  int* move_construct;
+  int* move_assign;
 };
 
-int MoveAssign::move_construct = 0;
-int MoveAssign::move_assign = 0;
-
 struct NTMoveAssign {
   constexpr NTMoveAssign(int v) : value(v) {}
-  NTMoveAssign(const NTMoveAssign &) = default;
-  NTMoveAssign(NTMoveAssign &&) = default;
-  NTMoveAssign &operator=(const NTMoveAssign &that) = default;
-  NTMoveAssign &operator=(NTMoveAssign &&that) {
-    value = that.value;
+  NTMoveAssign(const NTMoveAssign&)                 = default;
+  NTMoveAssign(NTMoveAssign&&)                      = default;
+  NTMoveAssign& operator=(const NTMoveAssign& that) = default;
+  NTMoveAssign& operator=(NTMoveAssign&& that) {
+    value      = that.value;
     that.value = -1;
     return *this;
   };
@@ -100,10 +99,10 @@ static_assert(std::is_move_assignable<NTMoveAssign>::value, "");
 
 struct TMoveAssign {
   constexpr TMoveAssign(int v) : value(v) {}
-  TMoveAssign(const TMoveAssign &) = delete;
-  TMoveAssign(TMoveAssign &&) = default;
-  TMoveAssign &operator=(const TMoveAssign &) = delete;
-  TMoveAssign &operator=(TMoveAssign &&) = default;
+  TMoveAssign(const TMoveAssign&)            = delete;
+  TMoveAssign(TMoveAssign&&)                 = default;
+  TMoveAssign& operator=(const TMoveAssign&) = delete;
+  TMoveAssign& operator=(TMoveAssign&&)      = default;
   int value;
 };
 
@@ -111,13 +110,13 @@ static_assert(std::is_trivially_move_assignable<TMoveAssign>::value, "");
 
 struct TMoveAssignNTCopyAssign {
   constexpr TMoveAssignNTCopyAssign(int v) : value(v) {}
-  TMoveAssignNTCopyAssign(const TMoveAssignNTCopyAssign &) = default;
-  TMoveAssignNTCopyAssign(TMoveAssignNTCopyAssign &&) = default;
-  TMoveAssignNTCopyAssign &operator=(const TMoveAssignNTCopyAssign &that) {
+  TMoveAssignNTCopyAssign(const TMoveAssignNTCopyAssign&) = default;
+  TMoveAssignNTCopyAssign(TMoveAssignNTCopyAssign&&)      = default;
+  TMoveAssignNTCopyAssign& operator=(const TMoveAssignNTCopyAssign& that) {
     value = that.value;
     return *this;
   }
-  TMoveAssignNTCopyAssign &operator=(TMoveAssignNTCopyAssign &&) = default;
+  TMoveAssignNTCopyAssign& operator=(TMoveAssignNTCopyAssign&&) = default;
   int value;
 };
 
@@ -127,16 +126,13 @@ struct TrivialCopyNontrivialMove {
   TrivialCopyNontrivialMove(TrivialCopyNontrivialMove const&) = default;
   TrivialCopyNontrivialMove(TrivialCopyNontrivialMove&&) noexcept {}
   TrivialCopyNontrivialMove& operator=(TrivialCopyNontrivialMove const&) = default;
-  TrivialCopyNontrivialMove& operator=(TrivialCopyNontrivialMove&&) noexcept {
-    return *this;
-  }
+  TrivialCopyNontrivialMove& operator=(TrivialCopyNontrivialMove&&) noexcept { return *this; }
 };
 
 static_assert(std::is_trivially_copy_assignable_v<TrivialCopyNontrivialMove>, "");
 static_assert(!std::is_trivially_move_assignable_v<TrivialCopyNontrivialMove>, "");
 
-
-void test_move_assignment_noexcept() {
+constexpr void test_move_assignment_noexcept() {
   {
     using V = std::variant<int>;
     static_assert(std::is_nothrow_move_assignable<V>::value, "");
@@ -163,7 +159,7 @@ void test_move_assignment_noexcept() {
   }
 }
 
-void test_move_assignment_sfinae() {
+constexpr void test_move_assignment_sfinae() {
   {
     using V = std::variant<int, long>;
     static_assert(std::is_move_assignable<V>::value, "");
@@ -228,7 +224,7 @@ void test_move_assignment_empty_empty() {
     makeEmpty(v1);
     V v2(std::in_place_index<0>);
     makeEmpty(v2);
-    V &vref = (v1 = std::move(v2));
+    V& vref = (v1 = std::move(v2));
     assert(&vref == &v1);
     assert(v1.valueless_by_exception());
     assert(v1.index() == std::variant_npos);
@@ -244,7 +240,7 @@ void test_move_assignment_non_empty_empty() {
     V v1(std::in_place_index<0>, 42);
     V v2(std::in_place_index<0>);
     makeEmpty(v2);
-    V &vref = (v1 = std::move(v2));
+    V& vref = (v1 = std::move(v2));
     assert(&vref == &v1);
     assert(v1.valueless_by_exception());
     assert(v1.index() == std::variant_npos);
@@ -254,7 +250,7 @@ void test_move_assignment_non_empty_empty() {
     V v1(std::in_place_index<2>, "hello");
     V v2(std::in_place_index<0>);
     makeEmpty(v2);
-    V &vref = (v1 = std::move(v2));
+    V& vref = (v1 = std::move(v2));
     assert(&vref == &v1);
     assert(v1.valueless_by_exception());
     assert(v1.index() == std::variant_npos);
@@ -270,7 +266,7 @@ void test_move_assignment_empty_non_empty() {
     V v1(std::in_place_index<0>);
     makeEmpty(v1);
     V v2(std::in_place_index<0>, 42);
-    V &vref = (v1 = std::move(v2));
+    V& vref = (v1 = std::move(v2));
     assert(&vref == &v1);
     assert(v1.index() == 0);
     assert(std::get<0>(v1) == 42);
@@ -280,7 +276,7 @@ void test_move_assignment_empty_non_empty() {
     V v1(std::in_place_index<0>);
     makeEmpty(v1);
     V v2(std::in_place_type<std::string>, "hello");
-    V &vref = (v1 = std::move(v2));
+    V& vref = (v1 = std::move(v2));
     assert(&vref == &v1);
     assert(v1.index() == 2);
     assert(std::get<2>(v1) == "hello");
@@ -288,14 +284,18 @@ void test_move_assignment_empty_non_empty() {
 #endif // TEST_HAS_NO_EXCEPTIONS
 }
 
-template <typename T> struct Result { std::size_t index; T value; };
+template <typename T>
+struct Result {
+  std::size_t index;
+  T value;
+};
 
-void test_move_assignment_same_index() {
+TEST_CONSTEXPR_CXX20 void test_move_assignment_same_index() {
   {
     using V = std::variant<int>;
     V v1(43);
     V v2(42);
-    V &vref = (v1 = std::move(v2));
+    V& vref = (v1 = std::move(v2));
     assert(&vref == &v1);
     assert(v1.index() == 0);
     assert(std::get<0>(v1) == 42);
@@ -304,39 +304,24 @@ void test_move_assignment_same_index() {
     using V = std::variant<int, long, unsigned>;
     V v1(43l);
     V v2(42l);
-    V &vref = (v1 = std::move(v2));
+    V& vref = (v1 = std::move(v2));
     assert(&vref == &v1);
     assert(v1.index() == 1);
     assert(std::get<1>(v1) == 42);
   }
   {
-    using V = std::variant<int, MoveAssign, unsigned>;
-    V v1(std::in_place_type<MoveAssign>, 43);
-    V v2(std::in_place_type<MoveAssign>, 42);
-    MoveAssign::reset();
-    V &vref = (v1 = std::move(v2));
+    using V            = std::variant<int, MoveAssign, unsigned>;
+    int move_construct = 0;
+    int move_assign    = 0;
+    V v1(std::in_place_type<MoveAssign>, 43, &move_construct, &move_assign);
+    V v2(std::in_place_type<MoveAssign>, 42, &move_construct, &move_assign);
+    V& vref = (v1 = std::move(v2));
     assert(&vref == &v1);
     assert(v1.index() == 1);
     assert(std::get<1>(v1).value == 42);
-    assert(MoveAssign::move_construct == 0);
-    assert(MoveAssign::move_assign == 1);
-  }
-#ifndef TEST_HAS_NO_EXCEPTIONS
-  using MET = MakeEmptyT;
-  {
-    using V = std::variant<int, MET, std::string>;
-    V v1(std::in_place_type<MET>);
-    MET &mref = std::get<1>(v1);
-    V v2(std::in_place_type<MET>);
-    try {
-      v1 = std::move(v2);
-      assert(false);
-    } catch (...) {
-    }
-    assert(v1.index() == 1);
-    assert(&std::get<1>(v1) == &mref);
+    assert(move_construct == 0);
+    assert(move_assign == 1);
   }
-#endif // TEST_HAS_NO_EXCEPTIONS
 
   // Make sure we properly propagate triviality, which implies constexpr-ness (see P0602R4).
   {
@@ -383,52 +368,29 @@ void test_move_assignment_same_index() {
   }
 }
 
-void test_move_assignment_different_index() {
+TEST_CONSTEXPR_CXX20 void test_move_assignment_different_index() {
   {
     using V = std::variant<int, long, unsigned>;
     V v1(43);
     V v2(42l);
-    V &vref = (v1 = std::move(v2));
+    V& vref = (v1 = std::move(v2));
     assert(&vref == &v1);
     assert(v1.index() == 1);
     assert(std::get<1>(v1) == 42);
   }
   {
-    using V = std::variant<int, MoveAssign, unsigned>;
+    using V            = std::variant<int, MoveAssign, unsigned>;
+    int move_construct = 0;
+    int move_assign    = 0;
     V v1(std::in_place_type<unsigned>, 43u);
-    V v2(std::in_place_type<MoveAssign>, 42);
-    MoveAssign::reset();
-    V &vref = (v1 = std::move(v2));
+    V v2(std::in_place_type<MoveAssign>, 42, &move_construct, &move_assign);
+    V& vref = (v1 = std::move(v2));
     assert(&vref == &v1);
     assert(v1.index() == 1);
     assert(std::get<1>(v1).value == 42);
-    assert(MoveAssign::move_construct == 1);
-    assert(MoveAssign::move_assign == 0);
+    assert(move_construct == 1);
+    assert(move_assign == 0);
   }
-#ifndef TEST_HAS_NO_EXCEPTIONS
-  using MET = MakeEmptyT;
-  {
-    using V = std::variant<int, MET, std::string>;
-    V v1(std::in_place_type<int>);
-    V v2(std::in_place_type<MET>);
-    try {
-      v1 = std::move(v2);
-      assert(false);
-    } catch (...) {
-    }
-    assert(v1.valueless_by_exception());
-    assert(v1.index() == std::variant_npos);
-  }
-  {
-    using V = std::variant<int, MET, std::string>;
-    V v1(std::in_place_type<MET>);
-    V v2(std::in_place_type<std::string>, "hello");
-    V &vref = (v1 = std::move(v2));
-    assert(&vref == &v1);
-    assert(v1.index() == 2);
-    assert(std::get<2>(v1) == "hello");
-  }
-#endif // TEST_HAS_NO_EXCEPTIONS
 
   // Make sure we properly propagate triviality, which implies constexpr-ness (see P0602R4).
   {
@@ -461,38 +423,126 @@ void test_move_assignment_different_index() {
   }
 }
 
-template <std::size_t NewIdx, class ValueType>
-constexpr bool test_constexpr_assign_imp(
-    std::variant<long, void*, int>&& v, ValueType&& new_value)
-{
-  std::variant<long, void*, int> v2(
-      std::forward<ValueType>(new_value));
+void test_assignment_throw() {
+#ifndef TEST_HAS_NO_EXCEPTIONS
+  using MET = MakeEmptyT;
+  // same index
+  {
+    using V = std::variant<int, MET, std::string>;
+    V v1(std::in_place_type<MET>);
+    MET& mref = std::get<1>(v1);
+    V v2(std::in_place_type<MET>);
+    try {
+      v1 = std::move(v2);
+      assert(false);
+    } catch (...) {
+    }
+    assert(v1.index() == 1);
+    assert(&std::get<1>(v1) == &mref);
+  }
+
+  // different indices
+  {
+    using V = std::variant<int, MET, std::string>;
+    V v1(std::in_place_type<int>);
+    V v2(std::in_place_type<MET>);
+    try {
+      v1 = std::move(v2);
+      assert(false);
+    } catch (...) {
+    }
+    assert(v1.valueless_by_exception());
+    assert(v1.index() == std::variant_npos);
+  }
+  {
+    using V = std::variant<int, MET, std::string>;
+    V v1(std::in_place_type<MET>);
+    V v2(std::in_place_type<std::string>, "hello");
+    V& vref = (v1 = std::move(v2));
+    assert(&vref == &v1);
+    assert(v1.index() == 2);
+    assert(std::get<2>(v1) == "hello");
+  }
+#endif // TEST_HAS_NO_EXCEPTIONS
+}
+
+template <std::size_t NewIdx, class T, class ValueType>
+constexpr void test_constexpr_assign_imp(T&& v, ValueType&& new_value) {
+  using Variant = std::decay_t<T>;
+  Variant v2(std::forward<ValueType>(new_value));
   const auto cp = v2;
-  v = std::move(v2);
-  return v.index() == NewIdx &&
-        std::get<NewIdx>(v) == std::get<NewIdx>(cp);
+  v             = std::move(v2);
+  assert(v.index() == NewIdx);
+  assert(std::get<NewIdx>(v) == std::get<NewIdx>(cp));
 }
 
-void test_constexpr_move_assignment() {
+constexpr void test_constexpr_move_assignment_trivial() {
   // Make sure we properly propagate triviality, which implies constexpr-ness (see P0602R4).
   using V = std::variant<long, void*, int>;
   static_assert(std::is_trivially_copyable<V>::value, "");
   static_assert(std::is_trivially_move_assignable<V>::value, "");
-  static_assert(test_constexpr_assign_imp<0>(V(42l), 101l), "");
-  static_assert(test_constexpr_assign_imp<0>(V(nullptr), 101l), "");
-  static_assert(test_constexpr_assign_imp<1>(V(42l), nullptr), "");
-  static_assert(test_constexpr_assign_imp<2>(V(42l), 101), "");
+  test_constexpr_assign_imp<0>(V(42l), 101l);
+  test_constexpr_assign_imp<0>(V(nullptr), 101l);
+  test_constexpr_assign_imp<1>(V(42l), nullptr);
+  test_constexpr_assign_imp<2>(V(42l), 101);
 }
 
-int main(int, char**) {
+struct NonTrivialMoveAssign {
+  int i = 0;
+  constexpr NonTrivialMoveAssign(int ii) : i(ii) {}
+  constexpr NonTrivialMoveAssign(const NonTrivialMoveAssign& other) = default;
+  constexpr NonTrivialMoveAssign(NonTrivialMoveAssign&& other) : i(other.i) {}
+  constexpr NonTrivialMoveAssign& operator=(const NonTrivialMoveAssign&) = default;
+  constexpr NonTrivialMoveAssign& operator=(NonTrivialMoveAssign&& o) {
+    i = o.i;
+    return *this;
+  }
+  TEST_CONSTEXPR_CXX20 ~NonTrivialMoveAssign() = default;
+  friend constexpr bool operator==(const NonTrivialMoveAssign& x, const NonTrivialMoveAssign& y) { return x.i == y.i; }
+};
+
+TEST_CONSTEXPR_CXX20 void test_constexpr_move_assignment_non_trivial() {
+  using V = std::variant<long, void*, NonTrivialMoveAssign>;
+  static_assert(!std::is_trivially_copyable<V>::value);
+  static_assert(!std::is_trivially_move_assignable<V>::value);
+  test_constexpr_assign_imp<0>(V(42l), 101l);
+  test_constexpr_assign_imp<0>(V(nullptr), 101l);
+  test_constexpr_assign_imp<1>(V(42l), nullptr);
+  test_constexpr_assign_imp<2>(V(42l), NonTrivialMoveAssign(5));
+  test_constexpr_assign_imp<2>(V(NonTrivialMoveAssign(3)), NonTrivialMoveAssign(5));
+}
+
+void non_constexpr_test() {
   test_move_assignment_empty_empty();
   test_move_assignment_non_empty_empty();
   test_move_assignment_empty_non_empty();
-  test_move_assignment_same_index();
-  test_move_assignment_different_index();
+  test_assignment_throw();
+}
+
+constexpr bool cxx17_constexpr_test() {
   test_move_assignment_sfinae();
   test_move_assignment_noexcept();
-  test_constexpr_move_assignment();
+  test_constexpr_move_assignment_trivial();
+
+  return true;
+}
 
+TEST_CONSTEXPR_CXX20 bool cxx20_constexpr_test() {
+  test_move_assignment_same_index();
+  test_move_assignment_different_index();
+  test_constexpr_move_assignment_non_trivial();
+
+  return true;
+}
+
+int main(int, char**) {
+  non_constexpr_test();
+  cxx17_constexpr_test();
+  cxx20_constexpr_test();
+
+  static_assert(cxx17_constexpr_test());
+#if TEST_STD_VER >= 20
+  static_assert(cxx20_constexpr_test());
+#endif
   return 0;
 }
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/copy.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/copy.pass.cpp
index d1e5768f58d2..820ff9e0d1a9 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/copy.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/copy.pass.cpp
@@ -22,30 +22,30 @@
 #include "test_workarounds.h"
 
 struct NonT {
-  NonT(int v) : value(v) {}
-  NonT(const NonT &o) : value(o.value) {}
+  constexpr NonT(int v) : value(v) {}
+  constexpr NonT(const NonT& o) : value(o.value) {}
   int value;
 };
 static_assert(!std::is_trivially_copy_constructible<NonT>::value, "");
 
 struct NoCopy {
-  NoCopy(const NoCopy &) = delete;
+  NoCopy(const NoCopy&) = delete;
 };
 
 struct MoveOnly {
-  MoveOnly(const MoveOnly &) = delete;
-  MoveOnly(MoveOnly &&) = default;
+  MoveOnly(const MoveOnly&) = delete;
+  MoveOnly(MoveOnly&&)      = default;
 };
 
 struct MoveOnlyNT {
-  MoveOnlyNT(const MoveOnlyNT &) = delete;
-  MoveOnlyNT(MoveOnlyNT &&) {}
+  MoveOnlyNT(const MoveOnlyNT&) = delete;
+  MoveOnlyNT(MoveOnlyNT&&) {}
 };
 
 struct NTCopy {
   constexpr NTCopy(int v) : value(v) {}
-  NTCopy(const NTCopy &that) : value(that.value) {}
-  NTCopy(NTCopy &&) = delete;
+  NTCopy(const NTCopy& that) : value(that.value) {}
+  NTCopy(NTCopy&&) = delete;
   int value;
 };
 
@@ -54,8 +54,8 @@ static_assert(std::is_copy_constructible<NTCopy>::value, "");
 
 struct TCopy {
   constexpr TCopy(int v) : value(v) {}
-  TCopy(TCopy const &) = default;
-  TCopy(TCopy &&) = delete;
+  TCopy(TCopy const&) = default;
+  TCopy(TCopy&&)      = delete;
   int value;
 };
 
@@ -74,20 +74,21 @@ static_assert(std::is_trivially_copy_constructible<TCopyNTMove>::value, "");
 struct MakeEmptyT {
   static int alive;
   MakeEmptyT() { ++alive; }
-  MakeEmptyT(const MakeEmptyT &) {
+  MakeEmptyT(const MakeEmptyT&) {
     ++alive;
     // Don't throw from the copy constructor since variant's assignment
     // operator performs a copy before committing to the assignment.
   }
-  MakeEmptyT(MakeEmptyT &&) { throw 42; }
-  MakeEmptyT &operator=(const MakeEmptyT &) { throw 42; }
-  MakeEmptyT &operator=(MakeEmptyT &&) { throw 42; }
+  MakeEmptyT(MakeEmptyT&&) { throw 42; }
+  MakeEmptyT& operator=(const MakeEmptyT&) { throw 42; }
+  MakeEmptyT& operator=(MakeEmptyT&&) { throw 42; }
   ~MakeEmptyT() { --alive; }
 };
 
 int MakeEmptyT::alive = 0;
 
-template <class Variant> void makeEmpty(Variant &v) {
+template <class Variant>
+void makeEmpty(Variant& v) {
   Variant v2(std::in_place_type<MakeEmptyT>);
   try {
     v = std::move(v2);
@@ -98,7 +99,7 @@ template <class Variant> void makeEmpty(Variant &v) {
 }
 #endif // TEST_HAS_NO_EXCEPTIONS
 
-void test_copy_ctor_sfinae() {
+constexpr void test_copy_ctor_sfinae() {
   {
     using V = std::variant<int, long>;
     static_assert(std::is_copy_constructible<V>::value, "");
@@ -136,7 +137,7 @@ void test_copy_ctor_sfinae() {
   }
 }
 
-void test_copy_ctor_basic() {
+TEST_CONSTEXPR_CXX20 void test_copy_ctor_basic() {
   {
     std::variant<int> v(std::in_place_index<0>, 42);
     std::variant<int> v2 = v;
@@ -214,21 +215,21 @@ void test_copy_ctor_valueless_by_exception() {
   using V = std::variant<int, MakeEmptyT>;
   V v1;
   makeEmpty(v1);
-  const V &cv1 = v1;
+  const V& cv1 = v1;
   V v(cv1);
   assert(v.valueless_by_exception());
 #endif // TEST_HAS_NO_EXCEPTIONS
 }
 
-template <std::size_t Idx>
-constexpr bool test_constexpr_copy_ctor_imp(std::variant<long, void*, const int> const& v) {
+template <std::size_t Idx, class T>
+constexpr void test_constexpr_copy_ctor_imp(const T& v) {
   auto v2 = v;
-  return v2.index() == v.index() &&
-         v2.index() == Idx &&
-         std::get<Idx>(v2) == std::get<Idx>(v);
+  assert(v2.index() == v.index());
+  assert(v2.index() == Idx);
+  assert(std::get<Idx>(v2) == std::get<Idx>(v));
 }
 
-void test_constexpr_copy_ctor() {
+constexpr void test_constexpr_copy_ctor_trivial() {
   // Make sure we properly propagate triviality, which implies constexpr-ness (see P0602R4).
   using V = std::variant<long, void*, const int>;
 #ifdef TEST_WORKAROUND_MSVC_BROKEN_IS_TRIVIALLY_COPYABLE
@@ -237,18 +238,57 @@ void test_constexpr_copy_ctor() {
   static_assert(std::is_trivially_move_constructible<V>::value, "");
   static_assert(!std::is_copy_assignable<V>::value, "");
   static_assert(!std::is_move_assignable<V>::value, "");
-#else // TEST_WORKAROUND_MSVC_BROKEN_IS_TRIVIALLY_COPYABLE
+#else  // TEST_WORKAROUND_MSVC_BROKEN_IS_TRIVIALLY_COPYABLE
   static_assert(std::is_trivially_copyable<V>::value, "");
 #endif // TEST_WORKAROUND_MSVC_BROKEN_IS_TRIVIALLY_COPYABLE
-  static_assert(test_constexpr_copy_ctor_imp<0>(V(42l)), "");
-  static_assert(test_constexpr_copy_ctor_imp<1>(V(nullptr)), "");
-  static_assert(test_constexpr_copy_ctor_imp<2>(V(101)), "");
+  static_assert(std::is_trivially_copy_constructible<V>::value, "");
+  test_constexpr_copy_ctor_imp<0>(V(42l));
+  test_constexpr_copy_ctor_imp<1>(V(nullptr));
+  test_constexpr_copy_ctor_imp<2>(V(101));
 }
 
-int main(int, char**) {
-  test_copy_ctor_basic();
-  test_copy_ctor_valueless_by_exception();
+struct NonTrivialCopyCtor {
+  int i = 0;
+  constexpr NonTrivialCopyCtor(int ii) : i(ii) {}
+  constexpr NonTrivialCopyCtor(const NonTrivialCopyCtor& other) : i(other.i) {}
+  constexpr NonTrivialCopyCtor(NonTrivialCopyCtor&& other) = default;
+  TEST_CONSTEXPR_CXX20 ~NonTrivialCopyCtor()               = default;
+  friend constexpr bool operator==(const NonTrivialCopyCtor& x, const NonTrivialCopyCtor& y) { return x.i == y.i; }
+};
+
+TEST_CONSTEXPR_CXX20 void test_constexpr_copy_ctor_non_trivial() {
+  // Test !is_trivially_move_constructible
+  using V = std::variant<long, NonTrivialCopyCtor, void*>;
+  static_assert(!std::is_trivially_copy_constructible<V>::value, "");
+  test_constexpr_copy_ctor_imp<0>(V(42l));
+  test_constexpr_copy_ctor_imp<1>(V(NonTrivialCopyCtor(5)));
+  test_constexpr_copy_ctor_imp<2>(V(nullptr));
+}
+
+void non_constexpr_test() { test_copy_ctor_valueless_by_exception(); }
+
+constexpr bool cxx17_constexpr_test() {
   test_copy_ctor_sfinae();
-  test_constexpr_copy_ctor();
+  test_constexpr_copy_ctor_trivial();
+
+  return true;
+}
+
+TEST_CONSTEXPR_CXX20 bool cxx20_constexpr_test() {
+  test_copy_ctor_basic();
+  test_constexpr_copy_ctor_non_trivial();
+
+  return true;
+}
+
+int main(int, char**) {
+  non_constexpr_test();
+  cxx17_constexpr_test();
+  cxx20_constexpr_test();
+
+  static_assert(cxx17_constexpr_test());
+#if TEST_STD_VER >= 20
+  static_assert(cxx20_constexpr_test());
+#endif
   return 0;
 }
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/default.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/default.pass.cpp
index 40db038a0033..9abf4d758d84 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/default.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/default.pass.cpp
@@ -17,6 +17,7 @@
 #include <cassert>
 #include <type_traits>
 #include <variant>
+#include <string>
 
 #include "test_macros.h"
 #include "variant_test_helpers.h"
@@ -35,7 +36,7 @@ struct DefaultCtorThrows {
 };
 #endif
 
-void test_default_ctor_sfinae() {
+constexpr void test_default_ctor_sfinae() {
   {
     using V = std::variant<std::monostate, int>;
     static_assert(std::is_default_constructible<V>::value, "");
@@ -46,7 +47,7 @@ void test_default_ctor_sfinae() {
   }
 }
 
-void test_default_ctor_noexcept() {
+constexpr void test_default_ctor_noexcept() {
   {
     using V = std::variant<int>;
     static_assert(std::is_nothrow_default_constructible<V>::value, "");
@@ -63,7 +64,7 @@ void test_default_ctor_throws() {
   try {
     V v;
     assert(false);
-  } catch (const int &ex) {
+  } catch (const int& ex) {
     assert(ex == 42);
   } catch (...) {
     assert(false);
@@ -71,7 +72,7 @@ void test_default_ctor_throws() {
 #endif
 }
 
-void test_default_ctor_basic() {
+constexpr void test_default_ctor_basic() {
   {
     std::variant<int> v;
     assert(v.index() == 0);
@@ -107,11 +108,24 @@ void test_default_ctor_basic() {
   }
 }
 
-int main(int, char**) {
+constexpr void issue_86686() {
+#if TEST_STD_VER >= 20
+  static_assert(std::variant<std::string>{}.index() == 0);
+#endif
+}
+
+constexpr bool test() {
   test_default_ctor_basic();
   test_default_ctor_sfinae();
   test_default_ctor_noexcept();
-  test_default_ctor_throws();
+  issue_86686();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  test_default_ctor_throws();
+  static_assert(test());
   return 0;
 }
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/move.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/move.pass.cpp
index e2518fe29caf..4e8453c23cf5 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/move.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/move.pass.cpp
@@ -23,31 +23,31 @@
 #include "test_workarounds.h"
 
 struct ThrowsMove {
-  ThrowsMove(ThrowsMove &&) noexcept(false) {}
+  ThrowsMove(ThrowsMove&&) noexcept(false) {}
 };
 
 struct NoCopy {
-  NoCopy(const NoCopy &) = delete;
+  NoCopy(const NoCopy&) = delete;
 };
 
 struct MoveOnly {
   int value;
-  MoveOnly(int v) : value(v) {}
-  MoveOnly(const MoveOnly &) = delete;
-  MoveOnly(MoveOnly &&) = default;
+  constexpr MoveOnly(int v) : value(v) {}
+  MoveOnly(const MoveOnly&) = delete;
+  MoveOnly(MoveOnly&&)      = default;
 };
 
 struct MoveOnlyNT {
   int value;
-  MoveOnlyNT(int v) : value(v) {}
-  MoveOnlyNT(const MoveOnlyNT &) = delete;
-  MoveOnlyNT(MoveOnlyNT &&other) : value(other.value) { other.value = -1; }
+  constexpr MoveOnlyNT(int v) : value(v) {}
+  MoveOnlyNT(const MoveOnlyNT&) = delete;
+  constexpr MoveOnlyNT(MoveOnlyNT&& other) : value(other.value) { other.value = -1; }
 };
 
 struct NTMove {
   constexpr NTMove(int v) : value(v) {}
-  NTMove(const NTMove &) = delete;
-  NTMove(NTMove &&that) : value(that.value) { that.value = -1; }
+  NTMove(const NTMove&) = delete;
+  NTMove(NTMove&& that) : value(that.value) { that.value = -1; }
   int value;
 };
 
@@ -56,8 +56,8 @@ static_assert(std::is_move_constructible<NTMove>::value, "");
 
 struct TMove {
   constexpr TMove(int v) : value(v) {}
-  TMove(const TMove &) = delete;
-  TMove(TMove &&) = default;
+  TMove(const TMove&) = delete;
+  TMove(TMove&&)      = default;
   int value;
 };
 
@@ -76,20 +76,21 @@ static_assert(std::is_trivially_move_constructible<TMoveNTCopy>::value, "");
 struct MakeEmptyT {
   static int alive;
   MakeEmptyT() { ++alive; }
-  MakeEmptyT(const MakeEmptyT &) {
+  MakeEmptyT(const MakeEmptyT&) {
     ++alive;
     // Don't throw from the copy constructor since variant's assignment
     // operator performs a copy before committing to the assignment.
   }
-  MakeEmptyT(MakeEmptyT &&) { throw 42; }
-  MakeEmptyT &operator=(const MakeEmptyT &) { throw 42; }
-  MakeEmptyT &operator=(MakeEmptyT &&) { throw 42; }
+  MakeEmptyT(MakeEmptyT&&) { throw 42; }
+  MakeEmptyT& operator=(const MakeEmptyT&) { throw 42; }
+  MakeEmptyT& operator=(MakeEmptyT&&) { throw 42; }
   ~MakeEmptyT() { --alive; }
 };
 
 int MakeEmptyT::alive = 0;
 
-template <class Variant> void makeEmpty(Variant &v) {
+template <class Variant>
+void makeEmpty(Variant& v) {
   Variant v2(std::in_place_type<MakeEmptyT>);
   try {
     v = std::move(v2);
@@ -100,7 +101,7 @@ template <class Variant> void makeEmpty(Variant &v) {
 }
 #endif // TEST_HAS_NO_EXCEPTIONS
 
-void test_move_noexcept() {
+constexpr void test_move_noexcept() {
   {
     using V = std::variant<int, long>;
     static_assert(std::is_nothrow_move_constructible<V>::value, "");
@@ -119,7 +120,7 @@ void test_move_noexcept() {
   }
 }
 
-void test_move_ctor_sfinae() {
+constexpr void test_move_ctor_sfinae() {
   {
     using V = std::variant<int, long>;
     static_assert(std::is_move_constructible<V>::value, "");
@@ -158,9 +159,12 @@ void test_move_ctor_sfinae() {
 }
 
 template <typename T>
-struct Result { std::size_t index; T value; };
+struct Result {
+  std::size_t index;
+  T value;
+};
 
-void test_move_ctor_basic() {
+TEST_CONSTEXPR_CXX20 void test_move_ctor_basic() {
   {
     std::variant<int> v(std::in_place_index<0>, 42);
     std::variant<int> v2 = std::move(v);
@@ -289,16 +293,16 @@ void test_move_ctor_valueless_by_exception() {
 #endif // TEST_HAS_NO_EXCEPTIONS
 }
 
-template <std::size_t Idx>
-constexpr bool test_constexpr_ctor_imp(std::variant<long, void*, const int> const& v) {
+template <std::size_t Idx, class T>
+constexpr void test_constexpr_ctor_imp(const T& v) {
   auto copy = v;
-  auto v2 = std::move(copy);
-  return v2.index() == v.index() &&
-         v2.index() == Idx &&
-        std::get<Idx>(v2) == std::get<Idx>(v);
+  auto v2   = std::move(copy);
+  assert(v2.index() == v.index());
+  assert(v2.index() == Idx);
+  assert(std::get<Idx>(v2) == std::get<Idx>(v));
 }
 
-void test_constexpr_move_ctor() {
+constexpr void test_constexpr_move_ctor_trivial() {
   // Make sure we properly propagate triviality, which implies constexpr-ness (see P0602R4).
   using V = std::variant<long, void*, const int>;
 #ifdef TEST_WORKAROUND_MSVC_BROKEN_IS_TRIVIALLY_COPYABLE
@@ -307,21 +311,58 @@ void test_constexpr_move_ctor() {
   static_assert(std::is_trivially_move_constructible<V>::value, "");
   static_assert(!std::is_copy_assignable<V>::value, "");
   static_assert(!std::is_move_assignable<V>::value, "");
-#else // TEST_WORKAROUND_MSVC_BROKEN_IS_TRIVIALLY_COPYABLE
+#else  // TEST_WORKAROUND_MSVC_BROKEN_IS_TRIVIALLY_COPYABLE
   static_assert(std::is_trivially_copyable<V>::value, "");
 #endif // TEST_WORKAROUND_MSVC_BROKEN_IS_TRIVIALLY_COPYABLE
   static_assert(std::is_trivially_move_constructible<V>::value, "");
-  static_assert(test_constexpr_ctor_imp<0>(V(42l)), "");
-  static_assert(test_constexpr_ctor_imp<1>(V(nullptr)), "");
-  static_assert(test_constexpr_ctor_imp<2>(V(101)), "");
+  test_constexpr_ctor_imp<0>(V(42l));
+  test_constexpr_ctor_imp<1>(V(nullptr));
+  test_constexpr_ctor_imp<2>(V(101));
 }
 
-int main(int, char**) {
-  test_move_ctor_basic();
-  test_move_ctor_valueless_by_exception();
+struct NonTrivialMoveCtor {
+  int i = 0;
+  constexpr NonTrivialMoveCtor(int ii) : i(ii) {}
+  constexpr NonTrivialMoveCtor(const NonTrivialMoveCtor& other) = default;
+  constexpr NonTrivialMoveCtor(NonTrivialMoveCtor&& other) : i(other.i) {}
+  TEST_CONSTEXPR_CXX20 ~NonTrivialMoveCtor() = default;
+  friend constexpr bool operator==(const NonTrivialMoveCtor& x, const NonTrivialMoveCtor& y) { return x.i == y.i; }
+};
+
+TEST_CONSTEXPR_CXX20 void test_constexpr_move_ctor_non_trivial() {
+  using V = std::variant<long, NonTrivialMoveCtor, void*>;
+  static_assert(!std::is_trivially_move_constructible<V>::value, "");
+  test_constexpr_ctor_imp<0>(V(42l));
+  test_constexpr_ctor_imp<1>(V(NonTrivialMoveCtor(5)));
+  test_constexpr_ctor_imp<2>(V(nullptr));
+}
+
+void non_constexpr_test() { test_move_ctor_valueless_by_exception(); }
+
+constexpr bool cxx17_constexpr_test() {
   test_move_noexcept();
   test_move_ctor_sfinae();
-  test_constexpr_move_ctor();
+  test_constexpr_move_ctor_trivial();
+
+  return true;
+}
+
+TEST_CONSTEXPR_CXX20 bool cxx20_constexpr_test() {
+  test_move_ctor_basic();
+  test_constexpr_move_ctor_non_trivial();
+
+  return true;
+}
+
+int main(int, char**) {
+  non_constexpr_test();
+  cxx17_constexpr_test();
+  cxx20_constexpr_test();
+
+  static_assert(cxx17_constexpr_test());
+#if TEST_STD_VER >= 20
+  static_assert(cxx20_constexpr_test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.dtor/dtor.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.dtor/dtor.pass.cpp
index 2e026038c97a..53c5283b2edc 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.dtor/dtor.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.dtor/dtor.pass.cpp
@@ -21,55 +21,76 @@
 #include "test_macros.h"
 
 struct NonTDtor {
-  static int count;
-  NonTDtor() = default;
-  ~NonTDtor() { ++count; }
+  int* count;
+  constexpr NonTDtor(int* a, int*) : count(a) {}
+  TEST_CONSTEXPR_CXX20 ~NonTDtor() { ++*count; }
 };
-int NonTDtor::count = 0;
 static_assert(!std::is_trivially_destructible<NonTDtor>::value, "");
 
 struct NonTDtor1 {
-  static int count;
-  NonTDtor1() = default;
-  ~NonTDtor1() { ++count; }
+  int* count;
+  constexpr NonTDtor1(int*, int* b) : count(b) {}
+  TEST_CONSTEXPR_CXX20 ~NonTDtor1() { ++*count; }
 };
-int NonTDtor1::count = 0;
 static_assert(!std::is_trivially_destructible<NonTDtor1>::value, "");
 
 struct TDtor {
-  TDtor(const TDtor &) {} // non-trivial copy
-  ~TDtor() = default;
+  constexpr TDtor() = default;
+  constexpr TDtor(const TDtor&) {} // non-trivial copy
+  TEST_CONSTEXPR_CXX20 ~TDtor() = default;
 };
 static_assert(!std::is_trivially_copy_constructible<TDtor>::value, "");
 static_assert(std::is_trivially_destructible<TDtor>::value, "");
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX20 bool test() {
   {
     using V = std::variant<int, long, TDtor>;
     static_assert(std::is_trivially_destructible<V>::value, "");
+    [[maybe_unused]] V v(std::in_place_index<2>);
   }
   {
     using V = std::variant<NonTDtor, int, NonTDtor1>;
     static_assert(!std::is_trivially_destructible<V>::value, "");
     {
-      V v(std::in_place_index<0>);
-      assert(NonTDtor::count == 0);
-      assert(NonTDtor1::count == 0);
+      int count0 = 0;
+      int count1 = 0;
+      {
+        V v(std::in_place_index<0>, &count0, &count1);
+        assert(count0 == 0);
+        assert(count1 == 0);
+      }
+      assert(count0 == 1);
+      assert(count1 == 0);
+    }
+    {
+      int count0 = 0;
+      int count1 = 0;
+      { V v(std::in_place_index<1>); }
+      assert(count0 == 0);
+      assert(count1 == 0);
     }
-    assert(NonTDtor::count == 1);
-    assert(NonTDtor1::count == 0);
-    NonTDtor::count = 0;
-    { V v(std::in_place_index<1>); }
-    assert(NonTDtor::count == 0);
-    assert(NonTDtor1::count == 0);
     {
-      V v(std::in_place_index<2>);
-      assert(NonTDtor::count == 0);
-      assert(NonTDtor1::count == 0);
+      int count0 = 0;
+      int count1 = 0;
+      {
+        V v(std::in_place_index<2>, &count0, &count1);
+        assert(count0 == 0);
+        assert(count1 == 0);
+      }
+      assert(count0 == 0);
+      assert(count1 == 1);
     }
-    assert(NonTDtor::count == 0);
-    assert(NonTDtor1::count == 1);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+
+#if TEST_STD_VER >= 20
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_index_args.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_index_args.pass.cpp
index 2fe9033dd816..f98d968f0eae 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_index_args.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_index_args.pass.cpp
@@ -26,8 +26,8 @@
 #include "variant_test_helpers.h"
 
 template <class Var, std::size_t I, class... Args>
-constexpr auto test_emplace_exists_imp(int) -> decltype(
-    std::declval<Var>().template emplace<I>(std::declval<Args>()...), true) {
+constexpr auto test_emplace_exists_imp(int)
+    -> decltype(std::declval<Var>().template emplace<I>(std::declval<Args>()...), true) {
   return true;
 }
 
@@ -36,28 +36,32 @@ constexpr auto test_emplace_exists_imp(long) -> bool {
   return false;
 }
 
-template <class Var, std::size_t I, class... Args> constexpr bool emplace_exists() {
+template <class Var, std::size_t I, class... Args>
+constexpr bool emplace_exists() {
   return test_emplace_exists_imp<Var, I, Args...>(0);
 }
 
-void test_emplace_sfinae() {
+constexpr void test_emplace_sfinae() {
   {
-    using V = std::variant<int, void *, const void *, TestTypes::NoCtors>;
+    using V = std::variant<int, void*, const void*, TestTypes::NoCtors>;
     static_assert(emplace_exists<V, 0>(), "");
     static_assert(emplace_exists<V, 0, int>(), "");
-    static_assert(!emplace_exists<V, 0, decltype(nullptr)>(),
-                  "cannot construct");
+    static_assert(!emplace_exists<V, 0, decltype(nullptr)>(), "cannot construct");
     static_assert(emplace_exists<V, 1, decltype(nullptr)>(), "");
-    static_assert(emplace_exists<V, 1, int *>(), "");
-    static_assert(!emplace_exists<V, 1, const int *>(), "");
+    static_assert(emplace_exists<V, 1, int*>(), "");
+    static_assert(!emplace_exists<V, 1, const int*>(), "");
     static_assert(!emplace_exists<V, 1, int>(), "cannot construct");
-    static_assert(emplace_exists<V, 2, const int *>(), "");
-    static_assert(emplace_exists<V, 2, int *>(), "");
+    static_assert(emplace_exists<V, 2, const int*>(), "");
+    static_assert(emplace_exists<V, 2, int*>(), "");
     static_assert(!emplace_exists<V, 3>(), "cannot construct");
   }
 }
 
-void test_basic() {
+struct NoCtor {
+  NoCtor() = delete;
+};
+
+TEST_CONSTEXPR_CXX20 void test_basic() {
   {
     using V = std::variant<int>;
     V v(42);
@@ -70,9 +74,9 @@ void test_basic() {
     assert(std::get<0>(v) == 42);
     assert(&ref2 == &std::get<0>(v));
   }
+
   {
-    using V =
-        std::variant<int, long, const void *, TestTypes::NoCtors, std::string>;
+    using V     = std::variant<int, long, const void*, NoCtor, std::string>;
     const int x = 100;
     V v(std::in_place_index<0>, -1);
     // default emplace a value
@@ -92,9 +96,19 @@ void test_basic() {
   }
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX20 bool test() {
   test_basic();
   test_emplace_sfinae();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+
+#if TEST_STD_VER >= 20
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_index_init_list_args.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_index_init_list_args.pass.cpp
index 9068aacc4359..4c635570bd56 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_index_init_list_args.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_index_init_list_args.pass.cpp
@@ -32,13 +32,12 @@ struct InitList {
 struct InitListArg {
   std::size_t size;
   int value;
-  constexpr InitListArg(std::initializer_list<int> il, int v)
-      : size(il.size()), value(v) {}
+  constexpr InitListArg(std::initializer_list<int> il, int v) : size(il.size()), value(v) {}
 };
 
 template <class Var, std::size_t I, class... Args>
-constexpr auto test_emplace_exists_imp(int) -> decltype(
-    std::declval<Var>().template emplace<I>(std::declval<Args>()...), true) {
+constexpr auto test_emplace_exists_imp(int)
+    -> decltype(std::declval<Var>().template emplace<I>(std::declval<Args>()...), true) {
   return true;
 }
 
@@ -47,13 +46,13 @@ constexpr auto test_emplace_exists_imp(long) -> bool {
   return false;
 }
 
-template <class Var, std::size_t I, class... Args> constexpr bool emplace_exists() {
+template <class Var, std::size_t I, class... Args>
+constexpr bool emplace_exists() {
   return test_emplace_exists_imp<Var, I, Args...>(0);
 }
 
-void test_emplace_sfinae() {
-  using V =
-      std::variant<int, TestTypes::NoCtors, InitList, InitListArg, long, long>;
+constexpr void test_emplace_sfinae() {
+  using V  = std::variant<int, TestTypes::NoCtors, InitList, InitListArg, long, long>;
   using IL = std::initializer_list<int>;
   static_assert(!emplace_exists<V, 1, IL>(), "no such constructor");
   static_assert(emplace_exists<V, 2, IL>(), "");
@@ -65,8 +64,12 @@ void test_emplace_sfinae() {
   static_assert(!emplace_exists<V, 3, IL, int, int>(), "too many args");
 }
 
-void test_basic() {
-  using V = std::variant<int, InitList, InitListArg, TestTypes::NoCtors>;
+struct NoCtor {
+  NoCtor() = delete;
+};
+
+TEST_CONSTEXPR_CXX20 void test_basic() {
+  using V = std::variant<int, InitList, InitListArg, NoCtor>;
   V v;
   auto& ref1 = v.emplace<1>({1, 2, 3});
   static_assert(std::is_same_v<InitList&, decltype(ref1)>, "");
@@ -83,9 +86,19 @@ void test_basic() {
   assert(&ref3 == &std::get<1>(v));
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX20 bool test() {
   test_basic();
   test_emplace_sfinae();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+
+#if TEST_STD_VER >= 20
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_type_args.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_type_args.pass.cpp
index 4e9f67775d10..c2ed54d8a625 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_type_args.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_type_args.pass.cpp
@@ -25,8 +25,8 @@
 #include "variant_test_helpers.h"
 
 template <class Var, class T, class... Args>
-constexpr auto test_emplace_exists_imp(int) -> decltype(
-    std::declval<Var>().template emplace<T>(std::declval<Args>()...), true) {
+constexpr auto test_emplace_exists_imp(int)
+    -> decltype(std::declval<Var>().template emplace<T>(std::declval<Args>()...), true) {
   return true;
 }
 
@@ -35,28 +35,32 @@ constexpr auto test_emplace_exists_imp(long) -> bool {
   return false;
 }
 
-template <class... Args> constexpr bool emplace_exists() {
+template <class... Args>
+constexpr bool emplace_exists() {
   return test_emplace_exists_imp<Args...>(0);
 }
 
-void test_emplace_sfinae() {
+constexpr void test_emplace_sfinae() {
   {
-    using V = std::variant<int, void *, const void *, TestTypes::NoCtors>;
+    using V = std::variant<int, void*, const void*, TestTypes::NoCtors>;
     static_assert(emplace_exists<V, int>(), "");
     static_assert(emplace_exists<V, int, int>(), "");
-    static_assert(!emplace_exists<V, int, decltype(nullptr)>(),
-                  "cannot construct");
-    static_assert(emplace_exists<V, void *, decltype(nullptr)>(), "");
-    static_assert(!emplace_exists<V, void *, int>(), "cannot construct");
-    static_assert(emplace_exists<V, void *, int *>(), "");
-    static_assert(!emplace_exists<V, void *, const int *>(), "");
-    static_assert(emplace_exists<V, const void *, const int *>(), "");
-    static_assert(emplace_exists<V, const void *, int *>(), "");
+    static_assert(!emplace_exists<V, int, decltype(nullptr)>(), "cannot construct");
+    static_assert(emplace_exists<V, void*, decltype(nullptr)>(), "");
+    static_assert(!emplace_exists<V, void*, int>(), "cannot construct");
+    static_assert(emplace_exists<V, void*, int*>(), "");
+    static_assert(!emplace_exists<V, void*, const int*>(), "");
+    static_assert(emplace_exists<V, const void*, const int*>(), "");
+    static_assert(emplace_exists<V, const void*, int*>(), "");
     static_assert(!emplace_exists<V, TestTypes::NoCtors>(), "cannot construct");
   }
 }
 
-void test_basic() {
+struct NoCtor {
+  NoCtor() = delete;
+};
+
+TEST_CONSTEXPR_CXX20 void test_basic() {
   {
     using V = std::variant<int>;
     V v(42);
@@ -70,8 +74,7 @@ void test_basic() {
     assert(&ref2 == &std::get<0>(v));
   }
   {
-    using V =
-        std::variant<int, long, const void *, TestTypes::NoCtors, std::string>;
+    using V     = std::variant<int, long, const void*, NoCtor, std::string>;
     const int x = 100;
     V v(std::in_place_type<int>, -1);
     // default emplace a value
@@ -79,8 +82,8 @@ void test_basic() {
     static_assert(std::is_same_v<long&, decltype(ref1)>, "");
     assert(std::get<1>(v) == 0);
     assert(&ref1 == &std::get<1>(v));
-    auto& ref2 = v.emplace<const void *>(&x);
-    static_assert(std::is_same_v<const void *&, decltype(ref2)>, "");
+    auto& ref2 = v.emplace<const void*>(&x);
+    static_assert(std::is_same_v<const void*&, decltype(ref2)>, "");
     assert(std::get<2>(v) == &x);
     assert(&ref2 == &std::get<2>(v));
     // emplace with multiple args
@@ -91,9 +94,19 @@ void test_basic() {
   }
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX20 bool test() {
   test_basic();
   test_emplace_sfinae();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+
+#if TEST_STD_VER >= 20
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_type_init_list_args.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_type_init_list_args.pass.cpp
index 74d834b9b345..644f2418b925 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_type_init_list_args.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.mod/emplace_type_init_list_args.pass.cpp
@@ -32,13 +32,12 @@ struct InitList {
 struct InitListArg {
   std::size_t size;
   int value;
-  constexpr InitListArg(std::initializer_list<int> il, int v)
-      : size(il.size()), value(v) {}
+  constexpr InitListArg(std::initializer_list<int> il, int v) : size(il.size()), value(v) {}
 };
 
 template <class Var, class T, class... Args>
-constexpr auto test_emplace_exists_imp(int) -> decltype(
-    std::declval<Var>().template emplace<T>(std::declval<Args>()...), true) {
+constexpr auto test_emplace_exists_imp(int)
+    -> decltype(std::declval<Var>().template emplace<T>(std::declval<Args>()...), true) {
   return true;
 }
 
@@ -47,13 +46,13 @@ constexpr auto test_emplace_exists_imp(long) -> bool {
   return false;
 }
 
-template <class... Args> constexpr bool emplace_exists() {
+template <class... Args>
+constexpr bool emplace_exists() {
   return test_emplace_exists_imp<Args...>(0);
 }
 
-void test_emplace_sfinae() {
-  using V =
-      std::variant<int, TestTypes::NoCtors, InitList, InitListArg, long, long>;
+constexpr void test_emplace_sfinae() {
+  using V  = std::variant<int, TestTypes::NoCtors, InitList, InitListArg, long, long>;
   using IL = std::initializer_list<int>;
   static_assert(emplace_exists<V, InitList, IL>(), "");
   static_assert(!emplace_exists<V, InitList, int>(), "args don't match");
@@ -61,31 +60,44 @@ void test_emplace_sfinae() {
   static_assert(emplace_exists<V, InitListArg, IL, int>(), "");
   static_assert(!emplace_exists<V, InitListArg, int>(), "args don't match");
   static_assert(!emplace_exists<V, InitListArg, IL>(), "too few args");
-  static_assert(!emplace_exists<V, InitListArg, IL, int, int>(),
-                "too many args");
+  static_assert(!emplace_exists<V, InitListArg, IL, int, int>(), "too many args");
 }
 
-void test_basic() {
-  using V = std::variant<int, InitList, InitListArg, TestTypes::NoCtors>;
+struct NoCtor {
+  NoCtor() = delete;
+};
+
+TEST_CONSTEXPR_CXX20 void test_basic() {
+  using V = std::variant<int, InitList, InitListArg, NoCtor>;
   V v;
   auto& ref1 = v.emplace<InitList>({1, 2, 3});
-  static_assert(std::is_same_v<InitList&,decltype(ref1)>, "");
+  static_assert(std::is_same_v<InitList&, decltype(ref1)>, "");
   assert(std::get<InitList>(v).size == 3);
   assert(&ref1 == &std::get<InitList>(v));
   auto& ref2 = v.emplace<InitListArg>({1, 2, 3, 4}, 42);
-  static_assert(std::is_same_v<InitListArg&,decltype(ref2)>, "");
+  static_assert(std::is_same_v<InitListArg&, decltype(ref2)>, "");
   assert(std::get<InitListArg>(v).size == 4);
   assert(std::get<InitListArg>(v).value == 42);
   assert(&ref2 == &std::get<InitListArg>(v));
   auto& ref3 = v.emplace<InitList>({1});
-  static_assert(std::is_same_v<InitList&,decltype(ref3)>, "");
+  static_assert(std::is_same_v<InitList&, decltype(ref3)>, "");
   assert(std::get<InitList>(v).size == 1);
   assert(&ref3 == &std::get<InitList>(v));
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX20 bool test() {
   test_basic();
   test_emplace_sfinae();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+
+#if TEST_STD_VER >= 20
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp
index 1802bc4670bb..db05691c5581 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp
@@ -25,37 +25,39 @@
 #include "variant_test_helpers.h"
 
 struct NotSwappable {};
-void swap(NotSwappable &, NotSwappable &) = delete;
+void swap(NotSwappable&, NotSwappable&) = delete;
 
 struct NotCopyable {
-  NotCopyable() = default;
-  NotCopyable(const NotCopyable &) = delete;
-  NotCopyable &operator=(const NotCopyable &) = delete;
+  NotCopyable()                              = default;
+  NotCopyable(const NotCopyable&)            = delete;
+  NotCopyable& operator=(const NotCopyable&) = delete;
 };
 
 struct NotCopyableWithSwap {
-  NotCopyableWithSwap() = default;
-  NotCopyableWithSwap(const NotCopyableWithSwap &) = delete;
-  NotCopyableWithSwap &operator=(const NotCopyableWithSwap &) = delete;
+  NotCopyableWithSwap()                                      = default;
+  NotCopyableWithSwap(const NotCopyableWithSwap&)            = delete;
+  NotCopyableWithSwap& operator=(const NotCopyableWithSwap&) = delete;
 };
-void swap(NotCopyableWithSwap &, NotCopyableWithSwap) {}
+constexpr void swap(NotCopyableWithSwap&, NotCopyableWithSwap) {}
 
 struct NotMoveAssignable {
-  NotMoveAssignable() = default;
-  NotMoveAssignable(NotMoveAssignable &&) = default;
-  NotMoveAssignable &operator=(NotMoveAssignable &&) = delete;
+  NotMoveAssignable()                               = default;
+  NotMoveAssignable(NotMoveAssignable&&)            = default;
+  NotMoveAssignable& operator=(NotMoveAssignable&&) = delete;
 };
 
 struct NotMoveAssignableWithSwap {
-  NotMoveAssignableWithSwap() = default;
-  NotMoveAssignableWithSwap(NotMoveAssignableWithSwap &&) = default;
-  NotMoveAssignableWithSwap &operator=(NotMoveAssignableWithSwap &&) = delete;
+  NotMoveAssignableWithSwap()                                       = default;
+  NotMoveAssignableWithSwap(NotMoveAssignableWithSwap&&)            = default;
+  NotMoveAssignableWithSwap& operator=(NotMoveAssignableWithSwap&&) = delete;
 };
-void swap(NotMoveAssignableWithSwap &, NotMoveAssignableWithSwap &) noexcept {}
+constexpr void swap(NotMoveAssignableWithSwap&, NotMoveAssignableWithSwap&) noexcept {}
 
-template <bool Throws> void do_throw() {}
+template <bool Throws>
+constexpr void do_throw() {}
 
-template <> void do_throw<true>() {
+template <>
+void do_throw<true>() {
 #ifndef TEST_HAS_NO_EXCEPTIONS
   throw 42;
 #else
@@ -63,60 +65,49 @@ template <> void do_throw<true>() {
 #endif
 }
 
-template <bool NT_Copy, bool NT_Move, bool NT_CopyAssign, bool NT_MoveAssign,
-          bool NT_Swap, bool EnableSwap = true>
+template <bool NT_Copy, bool NT_Move, bool NT_CopyAssign, bool NT_MoveAssign, bool NT_Swap, bool EnableSwap = true>
 struct NothrowTypeImp {
-  static int move_called;
-  static int move_assign_called;
-  static int swap_called;
-  static void reset() { move_called = move_assign_called = swap_called = 0; }
-  NothrowTypeImp() = default;
-  explicit NothrowTypeImp(int v) : value(v) {}
-  NothrowTypeImp(const NothrowTypeImp &o) noexcept(NT_Copy) : value(o.value) {
-    assert(false);
-  } // never called by test
-  NothrowTypeImp(NothrowTypeImp &&o) noexcept(NT_Move) : value(o.value) {
-    ++move_called;
+  int value;
+  int* move_called;
+  int* move_assign_called;
+  int* swap_called;
+
+  constexpr NothrowTypeImp(int v, int* mv_ctr, int* mv_assign, int* swap)
+      : value(v), move_called(mv_ctr), move_assign_called(mv_assign), swap_called(swap) {}
+
+  NothrowTypeImp(const NothrowTypeImp& o) noexcept(NT_Copy) : value(o.value) { assert(false); } // never called by test
+
+  constexpr NothrowTypeImp(NothrowTypeImp&& o) noexcept(NT_Move)
+      : value(o.value),
+        move_called(o.move_called),
+        move_assign_called(o.move_assign_called),
+        swap_called(o.swap_called) {
+    ++*move_called;
     do_throw<!NT_Move>();
     o.value = -1;
   }
-  NothrowTypeImp &operator=(const NothrowTypeImp &) noexcept(NT_CopyAssign) {
+
+  NothrowTypeImp& operator=(const NothrowTypeImp&) noexcept(NT_CopyAssign) {
     assert(false);
     return *this;
   } // never called by the tests
-  NothrowTypeImp &operator=(NothrowTypeImp &&o) noexcept(NT_MoveAssign) {
-    ++move_assign_called;
+
+  constexpr NothrowTypeImp& operator=(NothrowTypeImp&& o) noexcept(NT_MoveAssign) {
+    ++*move_assign_called;
     do_throw<!NT_MoveAssign>();
-    value = o.value;
+    value   = o.value;
     o.value = -1;
     return *this;
   }
-  int value;
 };
-template <bool NT_Copy, bool NT_Move, bool NT_CopyAssign, bool NT_MoveAssign,
-          bool NT_Swap, bool EnableSwap>
-int NothrowTypeImp<NT_Copy, NT_Move, NT_CopyAssign, NT_MoveAssign, NT_Swap,
-                   EnableSwap>::move_called = 0;
-template <bool NT_Copy, bool NT_Move, bool NT_CopyAssign, bool NT_MoveAssign,
-          bool NT_Swap, bool EnableSwap>
-int NothrowTypeImp<NT_Copy, NT_Move, NT_CopyAssign, NT_MoveAssign, NT_Swap,
-                   EnableSwap>::move_assign_called = 0;
-template <bool NT_Copy, bool NT_Move, bool NT_CopyAssign, bool NT_MoveAssign,
-          bool NT_Swap, bool EnableSwap>
-int NothrowTypeImp<NT_Copy, NT_Move, NT_CopyAssign, NT_MoveAssign, NT_Swap,
-                   EnableSwap>::swap_called = 0;
-
-template <bool NT_Copy, bool NT_Move, bool NT_CopyAssign, bool NT_MoveAssign,
-          bool NT_Swap>
-void swap(NothrowTypeImp<NT_Copy, NT_Move, NT_CopyAssign, NT_MoveAssign,
-                         NT_Swap, true> &lhs,
-          NothrowTypeImp<NT_Copy, NT_Move, NT_CopyAssign, NT_MoveAssign,
-                         NT_Swap, true> &rhs) noexcept(NT_Swap) {
-  lhs.swap_called++;
+
+template <bool NT_Copy, bool NT_Move, bool NT_CopyAssign, bool NT_MoveAssign, bool NT_Swap>
+constexpr void
+swap(NothrowTypeImp<NT_Copy, NT_Move, NT_CopyAssign, NT_MoveAssign, NT_Swap, true>& lhs,
+     NothrowTypeImp<NT_Copy, NT_Move, NT_CopyAssign, NT_MoveAssign, NT_Swap, true>& rhs) noexcept(NT_Swap) {
+  ++*lhs.swap_called;
   do_throw<!NT_Swap>();
-  int tmp = lhs.value;
-  lhs.value = rhs.value;
-  rhs.value = tmp;
+  std::swap(lhs.value, rhs.value);
 }
 
 // throwing copy, nothrow move ctor/assign, no swap provided
@@ -124,53 +115,42 @@ using NothrowMoveable = NothrowTypeImp<false, true, false, true, false, false>;
 // throwing copy and move assign, nothrow move ctor, no swap provided
 using NothrowMoveCtor = NothrowTypeImp<false, true, false, false, false, false>;
 // nothrow move ctor, throwing move assignment, swap provided
-using NothrowMoveCtorWithThrowingSwap =
-    NothrowTypeImp<false, true, false, false, false, true>;
+using NothrowMoveCtorWithThrowingSwap = NothrowTypeImp<false, true, false, false, false, true>;
 // throwing move ctor, nothrow move assignment, no swap provided
-using ThrowingMoveCtor =
-    NothrowTypeImp<false, false, false, true, false, false>;
+using ThrowingMoveCtor = NothrowTypeImp<false, false, false, true, false, false>;
 // throwing special members, nothrowing swap
-using ThrowingTypeWithNothrowSwap =
-    NothrowTypeImp<false, false, false, false, true, true>;
-using NothrowTypeWithThrowingSwap =
-    NothrowTypeImp<true, true, true, true, false, true>;
+using ThrowingTypeWithNothrowSwap = NothrowTypeImp<false, false, false, false, true, true>;
+using NothrowTypeWithThrowingSwap = NothrowTypeImp<true, true, true, true, false, true>;
 // throwing move assign with nothrow move and nothrow swap
-using ThrowingMoveAssignNothrowMoveCtorWithSwap =
-    NothrowTypeImp<false, true, false, false, true, true>;
+using ThrowingMoveAssignNothrowMoveCtorWithSwap = NothrowTypeImp<false, true, false, false, true, true>;
 // throwing move assign with nothrow move but no swap.
-using ThrowingMoveAssignNothrowMoveCtor =
-    NothrowTypeImp<false, true, false, false, false, false>;
+using ThrowingMoveAssignNothrowMoveCtor = NothrowTypeImp<false, true, false, false, false, false>;
 
 struct NonThrowingNonNoexceptType {
-  static int move_called;
-  static void reset() { move_called = 0; }
-  NonThrowingNonNoexceptType() = default;
-  NonThrowingNonNoexceptType(int v) : value(v) {}
-  NonThrowingNonNoexceptType(NonThrowingNonNoexceptType &&o) noexcept(false)
-      : value(o.value) {
-    ++move_called;
+  int value;
+  int* move_called;
+  constexpr NonThrowingNonNoexceptType(int v, int* mv_called) : value(v), move_called(mv_called) {}
+  constexpr NonThrowingNonNoexceptType(NonThrowingNonNoexceptType&& o) noexcept(false)
+      : value(o.value), move_called(o.move_called) {
+    ++*move_called;
     o.value = -1;
   }
-  NonThrowingNonNoexceptType &
-  operator=(NonThrowingNonNoexceptType &&) noexcept(false) {
+  NonThrowingNonNoexceptType& operator=(NonThrowingNonNoexceptType&&) noexcept(false) {
     assert(false); // never called by the tests.
     return *this;
   }
-  int value;
 };
-int NonThrowingNonNoexceptType::move_called = 0;
 
 struct ThrowsOnSecondMove {
   int value;
   int move_count;
   ThrowsOnSecondMove(int v) : value(v), move_count(0) {}
-  ThrowsOnSecondMove(ThrowsOnSecondMove &&o) noexcept(false)
-      : value(o.value), move_count(o.move_count + 1) {
+  ThrowsOnSecondMove(ThrowsOnSecondMove&& o) noexcept(false) : value(o.value), move_count(o.move_count + 1) {
     if (move_count == 2)
       do_throw<true>();
     o.value = -1;
   }
-  ThrowsOnSecondMove &operator=(ThrowsOnSecondMove &&) {
+  ThrowsOnSecondMove& operator=(ThrowsOnSecondMove&&) {
     assert(false); // not called by test
     return *this;
   }
@@ -224,265 +204,293 @@ void test_swap_valueless_by_exception() {
 #endif
 }
 
-void test_swap_same_alternative() {
+TEST_CONSTEXPR_CXX20 void test_swap_same_alternative() {
   {
-    using T = ThrowingTypeWithNothrowSwap;
-    using V = std::variant<T, int>;
-    T::reset();
-    V v1(std::in_place_index<0>, 42);
-    V v2(std::in_place_index<0>, 100);
+    using V                = std::variant<ThrowingTypeWithNothrowSwap, int>;
+    int move_called        = 0;
+    int move_assign_called = 0;
+    int swap_called        = 0;
+    V v1(std::in_place_index<0>, 42, &move_called, &move_assign_called, &swap_called);
+    V v2(std::in_place_index<0>, 100, &move_called, &move_assign_called, &swap_called);
     v1.swap(v2);
-    assert(T::swap_called == 1);
+    assert(swap_called == 1);
     assert(std::get<0>(v1).value == 100);
     assert(std::get<0>(v2).value == 42);
     swap(v1, v2);
-    assert(T::swap_called == 2);
+    assert(swap_called == 2);
     assert(std::get<0>(v1).value == 42);
     assert(std::get<0>(v2).value == 100);
+
+    assert(move_called == 0);
+    assert(move_assign_called == 0);
   }
   {
-    using T = NothrowMoveable;
-    using V = std::variant<T, int>;
-    T::reset();
-    V v1(std::in_place_index<0>, 42);
-    V v2(std::in_place_index<0>, 100);
+    using V                = std::variant<NothrowMoveable, int>;
+    int move_called        = 0;
+    int move_assign_called = 0;
+    int swap_called        = 0;
+    V v1(std::in_place_index<0>, 42, &move_called, &move_assign_called, &swap_called);
+    V v2(std::in_place_index<0>, 100, &move_called, &move_assign_called, &swap_called);
     v1.swap(v2);
-    assert(T::swap_called == 0);
-    assert(T::move_called == 1);
-    assert(T::move_assign_called == 2);
+    assert(swap_called == 0);
+    assert(move_called == 1);
+    assert(move_assign_called == 2);
     assert(std::get<0>(v1).value == 100);
     assert(std::get<0>(v2).value == 42);
-    T::reset();
+
+    move_called        = 0;
+    move_assign_called = 0;
+    swap_called        = 0;
+
     swap(v1, v2);
-    assert(T::swap_called == 0);
-    assert(T::move_called == 1);
-    assert(T::move_assign_called == 2);
+    assert(swap_called == 0);
+    assert(move_called == 1);
+    assert(move_assign_called == 2);
     assert(std::get<0>(v1).value == 42);
     assert(std::get<0>(v2).value == 100);
   }
+}
+
+void test_swap_same_alternative_throws(){
 #ifndef TEST_HAS_NO_EXCEPTIONS
-  {
-    using T = NothrowTypeWithThrowingSwap;
-    using V = std::variant<T, int>;
-    T::reset();
-    V v1(std::in_place_index<0>, 42);
-    V v2(std::in_place_index<0>, 100);
-    try {
-      v1.swap(v2);
-      assert(false);
-    } catch (int) {
-    }
-    assert(T::swap_called == 1);
-    assert(T::move_called == 0);
-    assert(T::move_assign_called == 0);
-    assert(std::get<0>(v1).value == 42);
-    assert(std::get<0>(v2).value == 100);
-  }
-  {
-    using T = ThrowingMoveCtor;
-    using V = std::variant<T, int>;
-    T::reset();
-    V v1(std::in_place_index<0>, 42);
-    V v2(std::in_place_index<0>, 100);
-    try {
-      v1.swap(v2);
-      assert(false);
-    } catch (int) {
-    }
-    assert(T::move_called == 1); // call threw
-    assert(T::move_assign_called == 0);
-    assert(std::get<0>(v1).value ==
-           42); // throw happened before v1 was moved from
-    assert(std::get<0>(v2).value == 100);
+    {using V = std::variant<NothrowTypeWithThrowingSwap, int>;
+int move_called        = 0;
+int move_assign_called = 0;
+int swap_called        = 0;
+V v1(std::in_place_index<0>, 42, &move_called, &move_assign_called, &swap_called);
+V v2(std::in_place_index<0>, 100, &move_called, &move_assign_called, &swap_called);
+try {
+  v1.swap(v2);
+  assert(false);
+} catch (int) {
+}
+assert(swap_called == 1);
+assert(move_called == 0);
+assert(move_assign_called == 0);
+assert(std::get<0>(v1).value == 42);
+assert(std::get<0>(v2).value == 100);
+}
+
+{
+  using V                = std::variant<ThrowingMoveCtor, int>;
+  int move_called        = 0;
+  int move_assign_called = 0;
+  int swap_called        = 0;
+  V v1(std::in_place_index<0>, 42, &move_called, &move_assign_called, &swap_called);
+  V v2(std::in_place_index<0>, 100, &move_called, &move_assign_called, &swap_called);
+  try {
+    v1.swap(v2);
+    assert(false);
+  } catch (int) {
   }
-  {
-    using T = ThrowingMoveAssignNothrowMoveCtor;
-    using V = std::variant<T, int>;
-    T::reset();
-    V v1(std::in_place_index<0>, 42);
-    V v2(std::in_place_index<0>, 100);
-    try {
-      v1.swap(v2);
-      assert(false);
-    } catch (int) {
-    }
-    assert(T::move_called == 1);
-    assert(T::move_assign_called == 1);  // call threw and didn't complete
-    assert(std::get<0>(v1).value == -1); // v1 was moved from
-    assert(std::get<0>(v2).value == 100);
+  assert(move_called == 1); // call threw
+  assert(move_assign_called == 0);
+  assert(swap_called == 0);
+  assert(std::get<0>(v1).value == 42); // throw happened before v1 was moved from
+  assert(std::get<0>(v2).value == 100);
+}
+{
+  using V                = std::variant<ThrowingMoveAssignNothrowMoveCtor, int>;
+  int move_called        = 0;
+  int move_assign_called = 0;
+  int swap_called        = 0;
+  V v1(std::in_place_index<0>, 42, &move_called, &move_assign_called, &swap_called);
+  V v2(std::in_place_index<0>, 100, &move_called, &move_assign_called, &swap_called);
+  try {
+    v1.swap(v2);
+    assert(false);
+  } catch (int) {
   }
+  assert(move_called == 1);
+  assert(move_assign_called == 1); // call threw and didn't complete
+  assert(swap_called == 0);
+  assert(std::get<0>(v1).value == -1); // v1 was moved from
+  assert(std::get<0>(v2).value == 100);
+}
 #endif
 }
 
-void test_swap_different_alternatives() {
+TEST_CONSTEXPR_CXX20 void test_swap_different_alternatives() {
   {
-    using T = NothrowMoveCtorWithThrowingSwap;
-    using V = std::variant<T, int>;
-    T::reset();
-    V v1(std::in_place_index<0>, 42);
+    using V                = std::variant<NothrowMoveCtorWithThrowingSwap, int>;
+    int move_called        = 0;
+    int move_assign_called = 0;
+    int swap_called        = 0;
+    V v1(std::in_place_index<0>, 42, &move_called, &move_assign_called, &swap_called);
     V v2(std::in_place_index<1>, 100);
     v1.swap(v2);
-    assert(T::swap_called == 0);
+    assert(swap_called == 0);
     // The libc++ implementation double copies the argument, and not
     // the variant swap is called on.
-    LIBCPP_ASSERT(T::move_called == 1);
-    assert(T::move_called <= 2);
-    assert(T::move_assign_called == 0);
+    LIBCPP_ASSERT(move_called == 1);
+    assert(move_called <= 2);
+    assert(move_assign_called == 0);
     assert(std::get<1>(v1) == 100);
     assert(std::get<0>(v2).value == 42);
-    T::reset();
+
+    move_called        = 0;
+    move_assign_called = 0;
+    swap_called        = 0;
+
     swap(v1, v2);
-    assert(T::swap_called == 0);
-    LIBCPP_ASSERT(T::move_called == 2);
-    assert(T::move_called <= 2);
-    assert(T::move_assign_called == 0);
+    assert(swap_called == 0);
+    LIBCPP_ASSERT(move_called == 2);
+    assert(move_called <= 2);
+    assert(move_assign_called == 0);
     assert(std::get<0>(v1).value == 42);
     assert(std::get<1>(v2) == 100);
   }
+}
+
+void test_swap_different_alternatives_throws() {
 #ifndef TEST_HAS_NO_EXCEPTIONS
   {
-    using T1 = ThrowingTypeWithNothrowSwap;
-    using T2 = NonThrowingNonNoexceptType;
-    using V = std::variant<T1, T2>;
-    T1::reset();
-    T2::reset();
-    V v1(std::in_place_index<0>, 42);
-    V v2(std::in_place_index<1>, 100);
+    using V                 = std::variant<ThrowingTypeWithNothrowSwap, NonThrowingNonNoexceptType>;
+    int move_called1        = 0;
+    int move_assign_called1 = 0;
+    int swap_called1        = 0;
+    int move_called2        = 0;
+    V v1(std::in_place_index<0>, 42, &move_called1, &move_assign_called1, &swap_called1);
+    V v2(std::in_place_index<1>, 100, &move_called2);
     try {
       v1.swap(v2);
       assert(false);
     } catch (int) {
     }
-    assert(T1::swap_called == 0);
-    assert(T1::move_called == 1); // throws
-    assert(T1::move_assign_called == 0);
+    assert(swap_called1 == 0);
+    assert(move_called1 == 1); // throws
+    assert(move_assign_called1 == 0);
     // FIXME: libc++ shouldn't move from T2 here.
-    LIBCPP_ASSERT(T2::move_called == 1);
-    assert(T2::move_called <= 1);
+    LIBCPP_ASSERT(move_called2 == 1);
+    assert(move_called2 <= 1);
     assert(std::get<0>(v1).value == 42);
-    if (T2::move_called != 0)
+    if (move_called2 != 0)
       assert(v2.valueless_by_exception());
     else
       assert(std::get<1>(v2).value == 100);
   }
   {
-    using T1 = NonThrowingNonNoexceptType;
-    using T2 = ThrowingTypeWithNothrowSwap;
-    using V = std::variant<T1, T2>;
-    T1::reset();
-    T2::reset();
-    V v1(std::in_place_index<0>, 42);
-    V v2(std::in_place_index<1>, 100);
+    using V                 = std::variant<NonThrowingNonNoexceptType, ThrowingTypeWithNothrowSwap>;
+    int move_called1        = 0;
+    int move_called2        = 0;
+    int move_assign_called2 = 0;
+    int swap_called2        = 0;
+    V v1(std::in_place_index<0>, 42, &move_called1);
+    V v2(std::in_place_index<1>, 100, &move_called2, &move_assign_called2, &swap_called2);
     try {
       v1.swap(v2);
       assert(false);
     } catch (int) {
     }
-    LIBCPP_ASSERT(T1::move_called == 0);
-    assert(T1::move_called <= 1);
-    assert(T2::swap_called == 0);
-    assert(T2::move_called == 1); // throws
-    assert(T2::move_assign_called == 0);
-    if (T1::move_called != 0)
+    LIBCPP_ASSERT(move_called1 == 0);
+    assert(move_called1 <= 1);
+    assert(swap_called2 == 0);
+    assert(move_called2 == 1); // throws
+    assert(move_assign_called2 == 0);
+    if (move_called1 != 0)
       assert(v1.valueless_by_exception());
     else
       assert(std::get<0>(v1).value == 42);
     assert(std::get<1>(v2).value == 100);
   }
 // FIXME: The tests below are just very libc++ specific
-#ifdef _LIBCPP_VERSION
+#  ifdef _LIBCPP_VERSION
   {
-    using T1 = ThrowsOnSecondMove;
-    using T2 = NonThrowingNonNoexceptType;
-    using V = std::variant<T1, T2>;
-    T2::reset();
+    using V         = std::variant<ThrowsOnSecondMove, NonThrowingNonNoexceptType>;
+    int move_called = 0;
     V v1(std::in_place_index<0>, 42);
-    V v2(std::in_place_index<1>, 100);
+    V v2(std::in_place_index<1>, 100, &move_called);
     v1.swap(v2);
-    assert(T2::move_called == 2);
+    assert(move_called == 2);
     assert(std::get<1>(v1).value == 100);
     assert(std::get<0>(v2).value == 42);
     assert(std::get<0>(v2).move_count == 1);
   }
   {
-    using T1 = NonThrowingNonNoexceptType;
-    using T2 = ThrowsOnSecondMove;
-    using V = std::variant<T1, T2>;
-    T1::reset();
-    V v1(std::in_place_index<0>, 42);
+    using V         = std::variant<NonThrowingNonNoexceptType, ThrowsOnSecondMove>;
+    int move_called = 0;
+    V v1(std::in_place_index<0>, 42, &move_called);
     V v2(std::in_place_index<1>, 100);
     try {
       v1.swap(v2);
       assert(false);
     } catch (int) {
     }
-    assert(T1::move_called == 1);
+    assert(move_called == 1);
     assert(v1.valueless_by_exception());
     assert(std::get<0>(v2).value == 42);
   }
-#endif
-// testing libc++ extension. If either variant stores a nothrow move
-// constructible type v1.swap(v2) provides the strong exception safety
-// guarantee.
-#ifdef _LIBCPP_VERSION
+#  endif
+  // testing libc++ extension. If either variant stores a nothrow move
+  // constructible type v1.swap(v2) provides the strong exception safety
+  // guarantee.
+#  ifdef _LIBCPP_VERSION
   {
-
-    using T1 = ThrowingTypeWithNothrowSwap;
-    using T2 = NothrowMoveable;
-    using V = std::variant<T1, T2>;
-    T1::reset();
-    T2::reset();
-    V v1(std::in_place_index<0>, 42);
-    V v2(std::in_place_index<1>, 100);
+    using V                 = std::variant<ThrowingTypeWithNothrowSwap, NothrowMoveable>;
+    int move_called1        = 0;
+    int move_assign_called1 = 0;
+    int swap_called1        = 0;
+    int move_called2        = 0;
+    int move_assign_called2 = 0;
+    int swap_called2        = 0;
+    V v1(std::in_place_index<0>, 42, &move_called1, &move_assign_called1, &swap_called1);
+    V v2(std::in_place_index<1>, 100, &move_called2, &move_assign_called2, &swap_called2);
     try {
       v1.swap(v2);
       assert(false);
     } catch (int) {
     }
-    assert(T1::swap_called == 0);
-    assert(T1::move_called == 1);
-    assert(T1::move_assign_called == 0);
-    assert(T2::swap_called == 0);
-    assert(T2::move_called == 2);
-    assert(T2::move_assign_called == 0);
+    assert(swap_called1 == 0);
+    assert(move_called1 == 1);
+    assert(move_assign_called1 == 0);
+    assert(swap_called2 == 0);
+    assert(move_called2 == 2);
+    assert(move_assign_called2 == 0);
     assert(std::get<0>(v1).value == 42);
     assert(std::get<1>(v2).value == 100);
     // swap again, but call v2's swap.
-    T1::reset();
-    T2::reset();
+
+    move_called1        = 0;
+    move_assign_called1 = 0;
+    swap_called1        = 0;
+    move_called2        = 0;
+    move_assign_called2 = 0;
+    swap_called2        = 0;
+
     try {
       v2.swap(v1);
       assert(false);
     } catch (int) {
     }
-    assert(T1::swap_called == 0);
-    assert(T1::move_called == 1);
-    assert(T1::move_assign_called == 0);
-    assert(T2::swap_called == 0);
-    assert(T2::move_called == 2);
-    assert(T2::move_assign_called == 0);
+    assert(swap_called1 == 0);
+    assert(move_called1 == 1);
+    assert(move_assign_called1 == 0);
+    assert(swap_called2 == 0);
+    assert(move_called2 == 2);
+    assert(move_assign_called2 == 0);
     assert(std::get<0>(v1).value == 42);
     assert(std::get<1>(v2).value == 100);
   }
-#endif // _LIBCPP_VERSION
+#  endif // _LIBCPP_VERSION
 #endif
 }
 
 template <class Var>
-constexpr auto has_swap_member_imp(int)
-    -> decltype(std::declval<Var &>().swap(std::declval<Var &>()), true) {
+constexpr auto has_swap_member_imp(int) -> decltype(std::declval<Var&>().swap(std::declval<Var&>()), true) {
   return true;
 }
 
-template <class Var> constexpr auto has_swap_member_imp(long) -> bool {
+template <class Var>
+constexpr auto has_swap_member_imp(long) -> bool {
   return false;
 }
 
-template <class Var> constexpr bool has_swap_member() {
+template <class Var>
+constexpr bool has_swap_member() {
   return has_swap_member_imp<Var>(0);
 }
 
-void test_swap_sfinae() {
+constexpr void test_swap_sfinae() {
   {
     // This variant type does not provide either a member or non-member swap
     // but is still swappable via the generic swap algorithm, since the
@@ -508,7 +516,7 @@ void test_swap_sfinae() {
   }
 }
 
-void test_swap_noexcept() {
+_LIBCPP_CONSTEXPR_SINCE_CXX20 void test_swap_noexcept() {
   {
     using V = std::variant<int, NothrowMoveable>;
     static_assert(std::is_swappable_v<V> && has_swap_member<V>(), "");
@@ -581,12 +589,28 @@ void test_swap_noexcept() {
 template class std::variant<int, NotSwappable>;
 #endif
 
-int main(int, char**) {
+void non_constexpr_test() {
   test_swap_valueless_by_exception();
+  test_swap_same_alternative_throws();
+  test_swap_different_alternatives_throws();
+}
+
+TEST_CONSTEXPR_CXX20 bool test() {
   test_swap_same_alternative();
   test_swap_different_alternatives();
   test_swap_sfinae();
   test_swap_noexcept();
 
+  return true;
+}
+
+int main(int, char**) {
+  non_constexpr_test();
+  test();
+
+#if TEST_STD_VER >= 20
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/support/asan_testing.h b/libcxx/test/support/asan_testing.h
index 6bfc8280a4ea..3785c1f9c20d 100644
--- a/libcxx/test/support/asan_testing.h
+++ b/libcxx/test/support/asan_testing.h
@@ -56,35 +56,16 @@ TEST_CONSTEXPR bool is_double_ended_contiguous_container_asan_correct(const std:
 #endif
 
 #if TEST_HAS_FEATURE(address_sanitizer)
-template <typename S>
-bool is_string_short(S const& s) {
-  // We do not have access to __is_long(), but we can check if strings
-  // buffer is inside strings memory. If strings memory contains its content,
-  // SSO is in use. To check it, we can just confirm that the beginning is in
-  // the string object memory block.
-  // &s    - beginning of objects memory
-  // &s[0] - beginning of the buffer
-  // (&s+1) - end of objects memory
-  return (void*)std::addressof(s) <= (void*)std::addressof(s[0]) &&
-         (void*)std::addressof(s[0]) < (void*)(std::addressof(s) + 1);
-}
-
 template <typename ChrT, typename TraitsT, typename Alloc>
 TEST_CONSTEXPR bool is_string_asan_correct(const std::basic_string<ChrT, TraitsT, Alloc>& c) {
   if (TEST_IS_CONSTANT_EVALUATED)
     return true;
 
-  if (!is_string_short(c) || _LIBCPP_SHORT_STRING_ANNOTATIONS_ALLOWED) {
-    if (std::__asan_annotate_container_with_allocator<Alloc>::value)
-      return __sanitizer_verify_contiguous_container(c.data(), c.data() + c.size() + 1, c.data() + c.capacity() + 1) !=
-             0;
-    else
-      return __sanitizer_verify_contiguous_container(
-                 c.data(), c.data() + c.capacity() + 1, c.data() + c.capacity() + 1) != 0;
-  } else {
-    return __sanitizer_verify_contiguous_container(std::addressof(c), std::addressof(c) + 1, std::addressof(c) + 1) !=
-           0;
-  }
+  if (std::__asan_annotate_container_with_allocator<Alloc>::value)
+    return __sanitizer_verify_contiguous_container(c.data(), c.data() + c.size() + 1, c.data() + c.capacity() + 1) != 0;
+  else
+    return __sanitizer_verify_contiguous_container(
+               c.data(), c.data() + c.capacity() + 1, c.data() + c.capacity() + 1) != 0;
 }
 #else
 #  include <string>
diff --git a/libcxx/test/support/atomic_helpers.h b/libcxx/test/support/atomic_helpers.h
index 9a32b1ffe85e..0266a0961067 100644
--- a/libcxx/test/support/atomic_helpers.h
+++ b/libcxx/test/support/atomic_helpers.h
@@ -116,6 +116,7 @@ template <template <class TestArg> class TestFunctor>
 struct TestEachAtomicType {
   void operator()() const {
     TestEachIntegralType<TestFunctor>()();
+    TestEachPointerType<TestFunctor>()();
     TestFunctor<UserAtomicType>()();
     /*
             Note: These aren't going to be lock-free,
@@ -128,8 +129,6 @@ struct TestEachAtomicType {
         TestFunctor<PaddedUserAtomicType>()();
         TestFunctor<WeirdUserAtomicType>()();
 */
-    TestFunctor<int*>()();
-    TestFunctor<const int*>()();
     TestFunctor<float>()();
     TestFunctor<double>()();
   }
diff --git a/libcxx/test/support/constexpr_char_traits.h b/libcxx/test/support/constexpr_char_traits.h
index 75380d5a7ffb..7c487c504af1 100644
--- a/libcxx/test/support/constexpr_char_traits.h
+++ b/libcxx/test/support/constexpr_char_traits.h
@@ -16,6 +16,31 @@
 
 #include "test_macros.h"
 
+// Tests whether the pointer p is in the range [first, last).
+//
+// Precondition: The range [first, last) is a valid range.
+//
+// Typically the pointers are compared with less than. This is not allowed when
+// the pointers belong to different ranges, which is UB. Typically, this is
+// benign at run-time, however since UB is not allowed during constant
+// evaluation this does not compile. This function does the validation without
+// UB.
+//
+// When p is in the range [first, last) the data can be copied from the
+// beginning to the end. Otherwise it needs to be copied from the end to the
+// beginning.
+template <class CharT>
+TEST_CONSTEXPR_CXX14 bool is_pointer_in_range(const CharT* first, const CharT* last, const CharT* p) {
+  if (first == p) // Needed when n == 0
+    return true;
+
+  for (; first != last; ++first)
+    if (first == p)
+      return true;
+
+  return false;
+}
+
 template <class CharT>
 struct constexpr_char_traits
 {
@@ -98,23 +123,21 @@ constexpr_char_traits<CharT>::find(const char_type* s, std::size_t n, const char
 }
 
 template <class CharT>
-TEST_CONSTEXPR_CXX14 CharT*
-constexpr_char_traits<CharT>::move(char_type* s1, const char_type* s2, std::size_t n)
-{
-    char_type* r = s1;
-    if (s1 < s2)
-    {
-        for (; n; --n, ++s1, ++s2)
-            assign(*s1, *s2);
-    }
-    else if (s2 < s1)
-    {
-        s1 += n;
-        s2 += n;
-        for (; n; --n)
-            assign(*--s1, *--s2);
-    }
-    return r;
+TEST_CONSTEXPR_CXX14 CharT* constexpr_char_traits<CharT>::move(char_type* s1, const char_type* s2, std::size_t n) {
+  if (s1 == s2)
+    return s1;
+
+  char_type* r = s1;
+  if (is_pointer_in_range(s1, s1 + n, s2)) {
+    for (; n; --n)
+      assign(*s1++, *s2++);
+  } else {
+    s1 += n;
+    s2 += n;
+    for (; n; --n)
+      assign(*--s1, *--s2);
+  }
+  return r;
 }
 
 template <class CharT>
diff --git a/libcxx/test/support/nasty_string.h b/libcxx/test/support/nasty_string.h
index 672c3cb4ed9e..ea9d83ccf282 100644
--- a/libcxx/test/support/nasty_string.h
+++ b/libcxx/test/support/nasty_string.h
@@ -16,6 +16,7 @@
 
 #include "make_string.h"
 #include "test_macros.h"
+#include "constexpr_char_traits.h" // is_pointer_in_range
 
 // This defines a nasty_string similar to nasty_containers. This string's
 // value_type does operator hijacking, which allows us to ensure that the
@@ -118,11 +119,14 @@ constexpr const nasty_char* nasty_char_traits::find(const nasty_char* s, std::si
 }
 
 constexpr nasty_char* nasty_char_traits::move(nasty_char* s1, const nasty_char* s2, std::size_t n) {
+  if (s1 == s2)
+    return s1;
+
   nasty_char* r = s1;
-  if (s1 < s2) {
-    for (; n; --n, ++s1, ++s2)
-      assign(*s1, *s2);
-  } else if (s2 < s1) {
+  if (is_pointer_in_range(s1, s1 + n, s2)) {
+    for (; n; --n)
+      assign(*s1++, *s2++);
+  } else {
     s1 += n;
     s2 += n;
     for (; n; --n)
diff --git a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
index 28eed6144583..28c1dbf8aca3 100644
--- a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
+++ b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
@@ -64,6 +64,28 @@ if(NOT HAS_CLANG_TIDY_HEADERS)
                  "clang-tidy headers are not present.")
   return()
 endif()
+
+# The clangTidy plugin uses C++20, so ensure that we support C++20 when using libstdc++.
+# This is required because some versions of libstdc++ used as a system library on build platforms
+# we support do not support C++20 yet.
+# Note it has not been tested whether version 11 works.
+file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/test.cpp" "
+#include <version>
+#if defined(_GLIBCXX_RELEASE) && _GLIBCXX_RELEASE < 12
+  # error The libstdc++ version is too old.
+#endif
+int main(){}
+")
+try_compile(HAS_NEWER_STANDARD_LIBRARY
+  "${CMAKE_CURRENT_BINARY_DIR}"
+  "${CMAKE_CURRENT_BINARY_DIR}/test.cpp"
+   LINK_LIBRARIES clangTidy)
+
+if(NOT HAS_NEWER_STANDARD_LIBRARY)
+  message(STATUS "Clang-tidy tests are disabled due to using "
+                 "stdlibc++ older than version 12")
+  return()
+endif()
 message(STATUS "Clang-tidy tests are enabled.")
 
 set(SOURCES
@@ -88,5 +110,3 @@ set_target_properties(cxx-tidy PROPERTIES
 
 set_target_properties(cxx-tidy PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set(CMAKE_SHARED_MODULE_SUFFIX_CXX .plugin) # Use a portable suffix to simplify how we can find it from Lit
-
-list(APPEND LIBCXX_TEST_DEPS cxx-tidy)
diff --git a/libcxx/utils/ci/docker-compose.yml b/libcxx/utils/ci/docker-compose.yml
index af9a48481e8b..63aa43990b29 100644
--- a/libcxx/utils/ci/docker-compose.yml
+++ b/libcxx/utils/ci/docker-compose.yml
@@ -21,7 +21,7 @@ services:
       dockerfile: Dockerfile
       target: actions-builder
       args:
-        BASE_IMAGE: ghcr.io/actions/actions-runner:2.314.1
+        BASE_IMAGE: ghcr.io/actions/actions-runner:2.316.1
         <<: *compiler_versions
   android-buildkite-builder:
     image: ghcr.io/libcxx/android-buildkite-builder:${TAG:-latest}
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index f2b8d55c0e11..d0171c84acbc 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -398,6 +398,23 @@ feature_test_macros = [
             "unimplemented": True,
         },
         {
+            "name": "__cpp_lib_containers_ranges",
+            "values": {"c++23": 202202},
+            "headers": [
+                "deque",
+                "forward_list",
+                "list",
+                "map",
+                "queue",
+                "set",
+                "stack",
+                "string",
+                "unordered_map",
+                "unordered_set",
+                "vector",
+            ],
+        },
+        {
             "name": "__cpp_lib_copyable_function",
             "values": {"c++26": 202306},  # P2548R6 copyable_function
             "headers": ["functional"],
@@ -991,20 +1008,7 @@ feature_test_macros = [
         {
             "name": "__cpp_lib_ranges_to_container",
             "values": {"c++23": 202202},
-            "headers": [
-                "deque",
-                "forward_list",
-                "list",
-                "map",
-                "queue",
-                "ranges",
-                "set",
-                "stack",
-                "string",
-                "unordered_map",
-                "unordered_set",
-                "vector",
-            ],
+            "headers": ["ranges"],
         },
         {
             "name": "__cpp_lib_ranges_zip",
@@ -1040,7 +1044,6 @@ feature_test_macros = [
             "name": "__cpp_lib_reference_wrapper",
             "values": {"c++26": 202403}, # P2944R3: Comparisons for reference_wrapper
             "headers": ["functional"],
-            "unimplemented": True,
         },
         {
             "name": "__cpp_lib_remove_cvref",
diff --git a/libcxxabi/src/aix_state_tab_eh.inc b/libcxxabi/src/aix_state_tab_eh.inc
index 9f46001b0209..285c9ac71d60 100644
--- a/libcxxabi/src/aix_state_tab_eh.inc
+++ b/libcxxabi/src/aix_state_tab_eh.inc
@@ -102,8 +102,6 @@ static bool state_tab_dbg() {
 
 namespace __state_table_eh {
 
-using destruct_f = void (*)(void*);
-
 // Definition of flags for the state table entry field 'action flag'.
 enum FSMEntryCount : intptr_t { beginCatch = -1, endCatch = -2, deleteObject = -3, cleanupLabel = -4, terminate = -5 };
 
@@ -145,8 +143,10 @@ struct FSMEntry {
     intptr_t nextStatePtr;
   };
   union {
-    // Address of the destructor function.
-    void (*destructor)(void*, size_t);
+    // Address of the destructor function with 1 argument.
+    void (*destructor)(void*);
+    // Address of the destructor function with 2 arguments.
+    void (*xlCDestructor)(void*, size_t);
     // The address of the catch block or cleanup code.
     void* landingPad;
   };
@@ -191,17 +191,12 @@ static void invoke_destructor(FSMEntry* fsmEntry, void* addr) {
   try {
     if (fsmEntry->elementCount == 1) {
       _LIBCXXABI_TRACE_STATETAB0("calling scalar destructor\n");
-      (*fsmEntry->destructor)(addr, dtorArgument);
+      (*fsmEntry->xlCDestructor)(addr, dtorArgument);
       _LIBCXXABI_TRACE_STATETAB0("returned from scalar destructor\n");
     } else {
       _LIBCXXABI_TRACE_STATETAB0("calling vector destructor\n");
-      // TODO: in the legacy ABI, destructors had a second argument. We don't expect to encounter
-      // destructors of this type in the itanium-based ABI, so this should be safe, but this could use some cleanup.
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wcast-function-type"
       __cxa_vec_cleanup(addr, reinterpret_cast<size_t>(fsmEntry->elementCount), fsmEntry->elemSize,
-                        reinterpret_cast<destruct_f>(fsmEntry->destructor));
-#pragma GCC diagnostic pop
+                        fsmEntry->destructor);
       _LIBCXXABI_TRACE_STATETAB0("returned from vector destructor\n");
     }
   } catch (...) {
@@ -218,7 +213,7 @@ static void invoke_delete(FSMEntry* fsmEntry, void* addr) {
   try {
     _LIBCXXABI_TRACE_STATETAB0("..calling delete()\n");
     // 'destructor' holds a function pointer to delete().
-    (*fsmEntry->destructor)(objectAddress, fsmEntry->elemSize);
+    (*fsmEntry->xlCDestructor)(objectAddress, fsmEntry->elemSize);
     _LIBCXXABI_TRACE_STATETAB0("..returned from delete()\n");
   } catch (...) {
     _LIBCXXABI_TRACE_STATETAB0("Uncaught exception in delete(), terminating\n");
diff --git a/libcxxabi/src/cxa_personality.cpp b/libcxxabi/src/cxa_personality.cpp
index 4b6c4edbc266..d95d78131940 100644
--- a/libcxxabi/src/cxa_personality.cpp
+++ b/libcxxabi/src/cxa_personality.cpp
@@ -717,9 +717,7 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
             if (actionEntry == 0)
             {
                 // Found a cleanup
-                results.reason = actions & _UA_SEARCH_PHASE
-                                     ? _URC_CONTINUE_UNWIND
-                                     : _URC_HANDLER_FOUND;
+                results.reason = (actions & _UA_SEARCH_PHASE) ? _URC_CONTINUE_UNWIND : _URC_HANDLER_FOUND;
                 return;
             }
             // Convert 1-based byte offset into
diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h
index 4a0444d407ea..36bf45463636 100644
--- a/libcxxabi/src/demangle/ItaniumDemangle.h
+++ b/libcxxabi/src/demangle/ItaniumDemangle.h
@@ -5715,6 +5715,7 @@ Node *AbstractManglingParser<Derived, Alloc>::parseTemplateParam() {
 }
 
 // <template-param-decl> ::= Ty                          # type parameter
+//                       ::= Tk <concept name> [<template-args>] # constrained type parameter
 //                       ::= Tn <type>                   # non-type parameter
 //                       ::= Tt <template-param-decl>* E # template parameter
 //                       ::= Tp <template-param-decl>    # parameter pack
@@ -5846,7 +5847,7 @@ Node *AbstractManglingParser<Derived, Alloc>::parseTemplateArg() {
   }
 }
 
-// <template-args> ::= I <template-arg>* E
+// <template-args> ::= I <template-arg>* [Q <requires-clause expr>] E
 //     extension, the abi says <template-arg>+
 template <typename Derived, typename Alloc>
 Node *
diff --git a/lld/COFF/DebugTypes.cpp b/lld/COFF/DebugTypes.cpp
index a4c808e4c9a0..7689ad163a65 100644
--- a/lld/COFF/DebugTypes.cpp
+++ b/lld/COFF/DebugTypes.cpp
@@ -465,7 +465,7 @@ static bool equalsPath(StringRef path1, StringRef path2) {
 #if defined(_WIN32)
   return path1.equals_insensitive(path2);
 #else
-  return path1.equals(path2);
+  return path1 == path2;
 #endif
 }
 
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index b29e1e1a67f1..dd33f4bd772f 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -151,7 +151,7 @@ bool link(ArrayRef<const char *> args, llvm::raw_ostream &stdoutOS,
                                  "--error-limit=0 to see all errors)";
 
   config = ConfigWrapper();
-  script = std::make_unique<LinkerScript>();
+  script = ScriptWrapper();
 
   symAux.emplace_back();
 
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index fa48552b8f7a..fa81611e7c9e 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -1128,7 +1128,7 @@ void InputSectionBase::adjustSplitStackFunctionPrologues(uint8_t *buf,
   for (Relocation &rel : relocs()) {
     // Ignore calls into the split-stack api.
     if (rel.sym->getName().starts_with("__morestack")) {
-      if (rel.sym->getName().equals("__morestack"))
+      if (rel.sym->getName() == "__morestack")
         morestackCalls.push_back(&rel);
       continue;
     }
diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp
index f815b3ac6fee..c0a5014817b9 100644
--- a/lld/ELF/LinkerScript.cpp
+++ b/lld/ELF/LinkerScript.cpp
@@ -44,7 +44,7 @@ using namespace llvm::support::endian;
 using namespace lld;
 using namespace lld::elf;
 
-std::unique_ptr<LinkerScript> elf::script;
+ScriptWrapper elf::script;
 
 static bool isSectionPrefix(StringRef prefix, StringRef name) {
   return name.consume_front(prefix) && (name.empty() || name[0] == '.');
@@ -801,7 +801,7 @@ static OutputDesc *addInputSec(StringMap<TinyPtrVector<OutputSection *>> &map,
       auto *firstIsec = cast<InputSectionBase>(
           cast<InputSectionDescription>(sec->commands[0])->sectionBases[0]);
       OutputSection *firstIsecOut =
-          firstIsec->flags & SHF_LINK_ORDER
+          (firstIsec->flags & SHF_LINK_ORDER)
               ? firstIsec->getLinkOrderDep()->getOutputSection()
               : nullptr;
       if (firstIsecOut != isec->getLinkOrderDep()->getOutputSection())
diff --git a/lld/ELF/LinkerScript.h b/lld/ELF/LinkerScript.h
index fa7c6eb9c0d8..b09cd12c46f9 100644
--- a/lld/ELF/LinkerScript.h
+++ b/lld/ELF/LinkerScript.h
@@ -402,7 +402,12 @@ public:
   llvm::MapVector<StringRef, SmallVector<StringRef, 0>> provideMap;
 };
 
-LLVM_LIBRARY_VISIBILITY extern std::unique_ptr<LinkerScript> script;
+struct ScriptWrapper {
+  LinkerScript s;
+  LinkerScript *operator->() { return &s; }
+};
+
+LLVM_LIBRARY_VISIBILITY extern ScriptWrapper script;
 
 } // end namespace lld::elf
 
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index 73a4f9662a56..b9e05a4b1fd5 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -68,7 +68,7 @@ defm compress_debug_sections:
   MetaVarName<"[none,zlib,zstd]">;
 
 defm compress_sections: EEq<"compress-sections",
-  "Compress output sections that match the glob and do not have the SHF_ALLOC flag."
+  "Compress output sections that match the glob and do not have the SHF_ALLOC flag. "
   "The compression level is <level> (if specified) or a default speed-focused level">,
   MetaVarName<"<section-glob>={none,zlib,zstd}[:level]">;
 
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index 2dbbff06a890..9c667241360f 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -438,10 +438,10 @@ template <class ELFT> void OutputSection::maybeCompress() {
     compressed.type = ELFCOMPRESS_ZLIB;
     compressed.checksum = checksum;
   }
+#endif
 
   compressed.shards = std::move(shardsOut);
   flags |= SHF_COMPRESSED;
-#endif
 }
 
 static void writeInt(uint8_t *buf, uint64_t data, uint64_t size) {
diff --git a/lld/ELF/Target.cpp b/lld/ELF/Target.cpp
index d879a427e9c0..3e221646ce24 100644
--- a/lld/ELF/Target.cpp
+++ b/lld/ELF/Target.cpp
@@ -141,7 +141,7 @@ bool TargetInfo::needsThunk(RelExpr expr, RelType type, const InputFile *file,
 
 bool TargetInfo::adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
                                                   uint8_t stOther) const {
-  llvm_unreachable("Target doesn't support split stacks.");
+  fatal("target doesn't support split stacks");
 }
 
 bool TargetInfo::inBranchRange(RelType type, uint64_t src, uint64_t dst) const {
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 240c16a4d8f6..e400ed2ae945 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -577,7 +577,7 @@ static bool isRelroSection(const OutputSection *sec) {
   // for accessing .got as well, .got and .toc need to be close enough in the
   // virtual address space. Usually, .toc comes just after .got. Since we place
   // .got into RELRO, .toc needs to be placed into RELRO too.
-  if (sec->name.equals(".toc"))
+  if (sec->name == ".toc")
     return true;
 
   // .got.plt contains pointers to external function symbols. They are
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 65de531db04b..d4d8d53d69ee 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -1507,7 +1507,7 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
       StringRef sep = sys::path::get_separator();
       // real_path removes trailing slashes as part of the normalization, but
       // these are meaningful for our text based stripping
-      if (config->osoPrefix.equals(".") || config->osoPrefix.ends_with(sep))
+      if (config->osoPrefix == "." || config->osoPrefix.ends_with(sep))
         expanded += sep;
       config->osoPrefix = saver().save(expanded.str());
     }
diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp
index 95fe0c9374f1..9d1612beae87 100644
--- a/lld/MachO/ObjC.cpp
+++ b/lld/MachO/ObjC.cpp
@@ -186,13 +186,26 @@ ObjcCategoryChecker::ObjcCategoryChecker()
       roClassLayout(target->wordSize), listHeaderLayout(target->wordSize),
       methodLayout(target->wordSize) {}
 
-// \p r must point to an offset within a cstring section.
+// \p r must point to an offset within a CStringInputSection or a
+// ConcatInputSection
 static StringRef getReferentString(const Reloc &r) {
   if (auto *isec = r.referent.dyn_cast<InputSection *>())
     return cast<CStringInputSection>(isec)->getStringRefAtOffset(r.addend);
+
   auto *sym = cast<Defined>(r.referent.get<Symbol *>());
-  return cast<CStringInputSection>(sym->isec())
-      ->getStringRefAtOffset(sym->value + r.addend);
+  auto *symIsec = sym->isec();
+  auto symOffset = sym->value + r.addend;
+
+  if (auto *s = dyn_cast_or_null<CStringInputSection>(symIsec))
+    return s->getStringRefAtOffset(symOffset);
+
+  if (isa<ConcatInputSection>(symIsec)) {
+    auto strData = symIsec->data.slice(symOffset);
+    const char *pszData = reinterpret_cast<const char *>(strData.data());
+    return StringRef(pszData, strnlen(pszData, strData.size()));
+  }
+
+  llvm_unreachable("unknown reference section in getReferentString");
 }
 
 void ObjcCategoryChecker::parseMethods(const ConcatInputSection *methodsIsec,
@@ -351,30 +364,28 @@ class ObjcCategoryMerger {
   // alignment as already used in existing (input) categories. To do this we
   // have InfoCategoryWriter which contains the various sections that the
   // generated categories will be written to.
-  template <typename T> struct InfoWriteSection {
+  struct InfoWriteSection {
     bool valid = false; // Data has been successfully collected from input
     uint32_t align = 0;
     Section *inputSection;
     Reloc relocTemplate;
-    T *outputSection;
+    OutputSection *outputSection;
   };
 
   struct InfoCategoryWriter {
-    InfoWriteSection<ConcatOutputSection> catListInfo;
-    InfoWriteSection<ConcatOutputSection> catBodyInfo;
-    InfoWriteSection<CStringSection> catNameInfo;
-    InfoWriteSection<ConcatOutputSection> catPtrListInfo;
+    InfoWriteSection catListInfo;
+    InfoWriteSection catBodyInfo;
+    InfoWriteSection catNameInfo;
+    InfoWriteSection catPtrListInfo;
   };
 
   // Information about a pointer list in the original categories (method lists,
   // protocol lists, etc)
   struct PointerListInfo {
-    PointerListInfo(const char *_categoryPrefix, uint32_t _categoryOffset,
-                    uint32_t _pointersPerStruct)
-        : categoryPrefix(_categoryPrefix), categoryOffset(_categoryOffset),
+    PointerListInfo(const char *_categoryPrefix, uint32_t _pointersPerStruct)
+        : categoryPrefix(_categoryPrefix),
           pointersPerStruct(_pointersPerStruct) {}
     const char *categoryPrefix;
-    uint32_t categoryOffset = 0;
 
     uint32_t pointersPerStruct = 0;
 
@@ -399,25 +410,16 @@ class ObjcCategoryMerger {
     // In case we generate new data, mark the new data as belonging to this file
     ObjFile *objFileForMergeData = nullptr;
 
-    PointerListInfo instanceMethods = {
-        objc::symbol_names::categoryInstanceMethods,
-        /*_categoryOffset=*/catLayout.instanceMethodsOffset,
-        /*pointersPerStruct=*/3};
-    PointerListInfo classMethods = {
-        objc::symbol_names::categoryClassMethods,
-        /*_categoryOffset=*/catLayout.classMethodsOffset,
-        /*pointersPerStruct=*/3};
+    PointerListInfo instanceMethods = {objc::symbol_names::instanceMethods,
+                                       /*pointersPerStruct=*/3};
+    PointerListInfo classMethods = {objc::symbol_names::categoryClassMethods,
+                                    /*pointersPerStruct=*/3};
     PointerListInfo protocols = {objc::symbol_names::categoryProtocols,
-                                 /*_categoryOffset=*/catLayout.protocolsOffset,
                                  /*pointersPerStruct=*/0};
-    PointerListInfo instanceProps = {
-        objc::symbol_names::listProprieties,
-        /*_categoryOffset=*/catLayout.instancePropsOffset,
-        /*pointersPerStruct=*/2};
-    PointerListInfo classProps = {
-        objc::symbol_names::klassPropList,
-        /*_categoryOffset=*/catLayout.classPropsOffset,
-        /*pointersPerStruct=*/2};
+    PointerListInfo instanceProps = {objc::symbol_names::listProprieties,
+                                     /*pointersPerStruct=*/2};
+    PointerListInfo classProps = {objc::symbol_names::klassPropList,
+                                  /*pointersPerStruct=*/2};
   };
 
 public:
@@ -426,19 +428,20 @@ public:
   static void doCleanup();
 
 private:
+  DenseSet<const Symbol *> collectNlCategories();
   void collectAndValidateCategoriesData();
   void
   mergeCategoriesIntoSingleCategory(std::vector<InfoInputCategory> &categories);
 
   void eraseISec(ConcatInputSection *isec);
+  void removeRefsToErasedIsecs();
   void eraseMergedCategories();
 
   void generateCatListForNonErasedCategories(
-      std::map<ConcatInputSection *, std::set<uint64_t>>
+      MapVector<ConcatInputSection *, std::set<uint64_t>>
           catListToErasedOffsets);
-  template <typename T>
   void collectSectionWriteInfoFromIsec(const InputSection *isec,
-                                       InfoWriteSection<T> &catWriteInfo);
+                                       InfoWriteSection &catWriteInfo);
   void collectCategoryWriterInfoFromCategory(const InfoInputCategory &catInfo);
   void parseCatInfoToExtInfo(const InfoInputCategory &catInfo,
                              ClassExtensionInfo &extInfo);
@@ -489,7 +492,9 @@ private:
   InfoCategoryWriter infoCategoryWriter;
   std::vector<ConcatInputSection *> &allInputSections;
   // Map of base class Symbol to list of InfoInputCategory's for it
-  DenseMap<const Symbol *, std::vector<InfoInputCategory>> categoryMap;
+  MapVector<const Symbol *, std::vector<InfoInputCategory>> categoryMap;
+  // Set for tracking InputSection erased via eraseISec
+  DenseSet<InputSection *> erasedIsecs;
 
   // Normally, the binary data comes from the input files, but since we're
   // generating binary data ourselves, we use the below array to store it in.
@@ -511,15 +516,12 @@ ObjcCategoryMerger::ObjcCategoryMerger(
       protocolListHeaderLayout(target->wordSize),
       allInputSections(_allInputSections) {}
 
-// This is a template so that it can be used both for CStringSection and
-// ConcatOutputSection
-template <typename T>
 void ObjcCategoryMerger::collectSectionWriteInfoFromIsec(
-    const InputSection *isec, InfoWriteSection<T> &catWriteInfo) {
+    const InputSection *isec, InfoWriteSection &catWriteInfo) {
 
   catWriteInfo.inputSection = const_cast<Section *>(&isec->section);
   catWriteInfo.align = isec->align;
-  catWriteInfo.outputSection = dyn_cast_or_null<T>(isec->parent);
+  catWriteInfo.outputSection = isec->parent;
 
   assert(catWriteInfo.outputSection &&
          "outputSection may not be null in collectSectionWriteInfoFromIsec.");
@@ -533,6 +535,8 @@ void ObjcCategoryMerger::collectSectionWriteInfoFromIsec(
 Symbol *
 ObjcCategoryMerger::tryGetSymbolAtIsecOffset(const ConcatInputSection *isec,
                                              uint32_t offset) {
+  if (!isec)
+    return nullptr;
   const Reloc *reloc = isec->getRelocAt(offset);
 
   if (!reloc)
@@ -576,19 +580,19 @@ void ObjcCategoryMerger::collectCategoryWriterInfoFromCategory(
     const InfoInputCategory &catInfo) {
 
   if (!infoCategoryWriter.catListInfo.valid)
-    collectSectionWriteInfoFromIsec<ConcatOutputSection>(
-        catInfo.catListIsec, infoCategoryWriter.catListInfo);
+    collectSectionWriteInfoFromIsec(catInfo.catListIsec,
+                                    infoCategoryWriter.catListInfo);
   if (!infoCategoryWriter.catBodyInfo.valid)
-    collectSectionWriteInfoFromIsec<ConcatOutputSection>(
-        catInfo.catBodyIsec, infoCategoryWriter.catBodyInfo);
+    collectSectionWriteInfoFromIsec(catInfo.catBodyIsec,
+                                    infoCategoryWriter.catBodyInfo);
 
   if (!infoCategoryWriter.catNameInfo.valid) {
     lld::macho::Defined *catNameSym =
         tryGetDefinedAtIsecOffset(catInfo.catBodyIsec, catLayout.nameOffset);
     assert(catNameSym && "Category does not have a valid name Symbol");
 
-    collectSectionWriteInfoFromIsec<CStringSection>(
-        catNameSym->isec(), infoCategoryWriter.catNameInfo);
+    collectSectionWriteInfoFromIsec(catNameSym->isec(),
+                                    infoCategoryWriter.catNameInfo);
   }
 
   // Collect writer info from all the category lists (we're assuming they all
@@ -598,8 +602,8 @@ void ObjcCategoryMerger::collectCategoryWriterInfoFromCategory(
          off <= catLayout.classPropsOffset; off += target->wordSize) {
       if (Defined *ptrList =
               tryGetDefinedAtIsecOffset(catInfo.catBodyIsec, off)) {
-        collectSectionWriteInfoFromIsec<ConcatOutputSection>(
-            ptrList->isec(), infoCategoryWriter.catPtrListInfo);
+        collectSectionWriteInfoFromIsec(ptrList->isec(),
+                                        infoCategoryWriter.catPtrListInfo);
         // we've successfully collected data, so we can break
         break;
       }
@@ -795,7 +799,7 @@ void ObjcCategoryMerger::emitAndLinkProtocolList(
   listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
 
   std::string symName = ptrList.categoryPrefix;
-  symName += extInfo.baseClassName + "_$_(" + extInfo.mergedContainerName + ")";
+  symName += extInfo.baseClassName + "(" + extInfo.mergedContainerName + ")";
 
   Defined *ptrListSym = make<Defined>(
       newStringData(symName.c_str()), /*file=*/parentSym->getObjectFile(),
@@ -853,7 +857,7 @@ void ObjcCategoryMerger::emitAndLinkPointerList(
   listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
 
   std::string symName = ptrList.categoryPrefix;
-  symName += extInfo.baseClassName + "_$_" + extInfo.mergedContainerName;
+  symName += extInfo.baseClassName + "(" + extInfo.mergedContainerName + ")";
 
   Defined *ptrListSym = make<Defined>(
       newStringData(symName.c_str()), /*file=*/parentSym->getObjectFile(),
@@ -930,7 +934,7 @@ Defined *ObjcCategoryMerger::emitCategoryBody(const std::string &name,
   addInputSection(newBodySec);
 
   std::string symName =
-      objc::symbol_names::category + baseClassName + "_$_(" + name + ")";
+      objc::symbol_names::category + baseClassName + "(" + name + ")";
   Defined *catBodySym = make<Defined>(
       newStringData(symName.c_str()), /*file=*/objFile, newBodySec,
       /*value=*/0, bodyData.size(), /*isWeakDef=*/false, /*isExternal=*/false,
@@ -1057,7 +1061,27 @@ void ObjcCategoryMerger::createSymbolReference(Defined *refFrom,
   refFrom->isec()->relocs.push_back(r);
 }
 
+// Get the list of categories in the '__objc_nlcatlist' section. We can't
+// optimize these as they have a '+load' method that has to be called at
+// runtime.
+DenseSet<const Symbol *> ObjcCategoryMerger::collectNlCategories() {
+  DenseSet<const Symbol *> nlCategories;
+
+  for (InputSection *sec : allInputSections) {
+    if (sec->getName() != section_names::objcNonLazyCatList)
+      continue;
+
+    for (auto &r : sec->relocs) {
+      const Symbol *sym = r.referent.dyn_cast<Symbol *>();
+      nlCategories.insert(sym);
+    }
+  }
+  return nlCategories;
+}
+
 void ObjcCategoryMerger::collectAndValidateCategoriesData() {
+  auto nlCategories = collectNlCategories();
+
   for (InputSection *sec : allInputSections) {
     if (sec->getName() != section_names::objcCatList)
       continue;
@@ -1071,6 +1095,9 @@ void ObjcCategoryMerger::collectAndValidateCategoriesData() {
       assert(categorySym &&
              "Failed to get a valid category at __objc_catlit offset");
 
+      if (nlCategories.count(categorySym))
+        continue;
+
       // We only support ObjC categories (no swift + @objc)
       // TODO: Support swift + @objc categories also
       if (!categorySym->getName().starts_with(objc::symbol_names::category))
@@ -1101,7 +1128,7 @@ void ObjcCategoryMerger::collectAndValidateCategoriesData() {
 // (not erased). For these not erased categories, we generate new __objc_catlist
 // entries since the parent __objc_catlist entry will be erased
 void ObjcCategoryMerger::generateCatListForNonErasedCategories(
-    const std::map<ConcatInputSection *, std::set<uint64_t>>
+    const MapVector<ConcatInputSection *, std::set<uint64_t>>
         catListToErasedOffsets) {
 
   // Go through all offsets of all __objc_catlist's that we process and if there
@@ -1121,7 +1148,7 @@ void ObjcCategoryMerger::generateCatListForNonErasedCategories(
       assert(nonErasedCatBody && "Failed to relocate non-deleted category");
 
       // Allocate data for the new __objc_catlist slot
-      auto bodyData = newSectionData(target->wordSize);
+      llvm::ArrayRef<uint8_t> bodyData = newSectionData(target->wordSize);
 
       // We mark the __objc_catlist slot as belonging to the same file as the
       // category
@@ -1156,6 +1183,8 @@ void ObjcCategoryMerger::generateCatListForNonErasedCategories(
 }
 
 void ObjcCategoryMerger::eraseISec(ConcatInputSection *isec) {
+  erasedIsecs.insert(isec);
+
   isec->live = false;
   for (auto &sym : isec->symbols)
     sym->used = false;
@@ -1166,7 +1195,7 @@ void ObjcCategoryMerger::eraseISec(ConcatInputSection *isec) {
 // them.
 void ObjcCategoryMerger::eraseMergedCategories() {
   // Map of InputSection to a set of offsets of the categories that were merged
-  std::map<ConcatInputSection *, std::set<uint64_t>> catListToErasedOffsets;
+  MapVector<ConcatInputSection *, std::set<uint64_t>> catListToErasedOffsets;
 
   for (auto &mapEntry : categoryMap) {
     for (InfoInputCategory &catInfo : mapEntry.second) {
@@ -1190,6 +1219,7 @@ void ObjcCategoryMerger::eraseMergedCategories() {
         continue;
 
       eraseISec(catInfo.catBodyIsec);
+
       tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec, catLayout.nameOffset);
       tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec,
                                   catLayout.instanceMethodsOffset);
@@ -1203,6 +1233,33 @@ void ObjcCategoryMerger::eraseMergedCategories() {
                                   catLayout.instancePropsOffset);
     }
   }
+
+  removeRefsToErasedIsecs();
+}
+
+// The compiler may generate references to categories inside the addrsig
+// section. This function will erase these references.
+void ObjcCategoryMerger::removeRefsToErasedIsecs() {
+  for (InputSection *isec : inputSections) {
+    if (isec->getName() != section_names::addrSig)
+      continue;
+
+    auto removeRelocs = [this](Reloc &r) {
+      auto *isec = dyn_cast_or_null<ConcatInputSection>(
+          r.referent.dyn_cast<InputSection *>());
+      if (!isec) {
+        Defined *sym =
+            dyn_cast_or_null<Defined>(r.referent.dyn_cast<Symbol *>());
+        if (sym)
+          isec = dyn_cast<ConcatInputSection>(sym->isec());
+      }
+      if (!isec)
+        return false;
+      return erasedIsecs.count(isec) > 0;
+    };
+
+    llvm::erase_if(isec->relocs, removeRelocs);
+  }
 }
 
 void ObjcCategoryMerger::doMerge() {
@@ -1222,7 +1279,7 @@ void ObjcCategoryMerger::doCleanup() { generatedSectionData.clear(); }
 StringRef ObjcCategoryMerger::newStringData(const char *str) {
   uint32_t len = strlen(str);
   uint32_t bufSize = len + 1;
-  auto &data = newSectionData(bufSize);
+  SmallVector<uint8_t> &data = newSectionData(bufSize);
   char *strData = reinterpret_cast<char *>(data.data());
   // Copy the string chars and null-terminator
   memcpy(strData, str, bufSize);
diff --git a/lld/test/COFF/Inputs/combined-resources.rc b/lld/test/COFF/Inputs/combined-resources.rc
index 08bfb94c44ae..1caf0b356b40 100644
--- a/lld/test/COFF/Inputs/combined-resources.rc
+++ b/lld/test/COFF/Inputs/combined-resources.rc
@@ -1,50 +1,50 @@
-#include "windows.h"
-
-LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
-
-myaccelerators ACCELERATORS
-{
-	"^C", 999, VIRTKEY, ALT
-	"D", 1100, VIRTKEY, CONTROL, SHIFT
-	"^R", 444, ASCII, NOINVERT
-}
-
-cursor BITMAP "combined-resources-cursor.bmp"
-okay BITMAP "combined-resources-okay.bmp"
-
-14432 MENU
-LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED
-{
-	MENUITEM "yu", 100
-	MENUITEM "shala", 101
-	MENUITEM "kaoya", 102
-}
-
-testdialog DIALOG 10, 10, 200, 300
-STYLE WS_POPUP | WS_BORDER
-CAPTION "Test"
-{
-	CTEXT "Continue:", 1, 10, 10, 230, 14
-	PUSHBUTTON "&OK", 2, 66, 134, 161, 13
-}
-
-12 ACCELERATORS
-{
-	"X", 164, VIRTKEY, ALT
-	"H", 5678, VIRTKEY, CONTROL, SHIFT
-	"^R", 444, ASCII, NOINVERT
-}
-
-"eat" MENU
-LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS
-{
-	MENUITEM "fish", 100
-	MENUITEM "salad", 101
-	MENUITEM "duck", 102
-}
-
-
-myresource stringarray {
-	"this is a user defined resource\0",
-	"it contains many strings\0",
-}
+#include "windows.h"
+
+LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
+
+myaccelerators ACCELERATORS
+{
+	"^C", 999, VIRTKEY, ALT
+	"D", 1100, VIRTKEY, CONTROL, SHIFT
+	"^R", 444, ASCII, NOINVERT
+}
+
+cursor BITMAP "combined-resources-cursor.bmp"
+okay BITMAP "combined-resources-okay.bmp"
+
+14432 MENU
+LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED
+{
+	MENUITEM "yu", 100
+	MENUITEM "shala", 101
+	MENUITEM "kaoya", 102
+}
+
+testdialog DIALOG 10, 10, 200, 300
+STYLE WS_POPUP | WS_BORDER
+CAPTION "Test"
+{
+	CTEXT "Continue:", 1, 10, 10, 230, 14
+	PUSHBUTTON "&OK", 2, 66, 134, 161, 13
+}
+
+12 ACCELERATORS
+{
+	"X", 164, VIRTKEY, ALT
+	"H", 5678, VIRTKEY, CONTROL, SHIFT
+	"^R", 444, ASCII, NOINVERT
+}
+
+"eat" MENU
+LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS
+{
+	MENUITEM "fish", 100
+	MENUITEM "salad", 101
+	MENUITEM "duck", 102
+}
+
+
+myresource stringarray {
+	"this is a user defined resource\0",
+	"it contains many strings\0",
+}
diff --git a/lld/test/COFF/pdb-type-server-invalid-signature.yaml b/lld/test/COFF/pdb-type-server-invalid-signature.yaml
index 87c436768484..8f1528ff1a89 100644
--- a/lld/test/COFF/pdb-type-server-invalid-signature.yaml
+++ b/lld/test/COFF/pdb-type-server-invalid-signature.yaml
@@ -23,8 +23,8 @@
 # RUN: cp %S/Inputs/pdb-diff-cl.pdb %T
 # RUN: lld-link %t3.obj -out:%t3.exe -debug -pdb:%t3.pdb -nodefaultlib -entry:main 2>&1 | FileCheck -DMSG=%errc_ENOENT %s -check-prefix=INVALID-PATH -allow-empty
 
-# INVALID-PATH-NOT: warning: Cannot use debug info for '{{.*}}3.obj' [LNK4099]
-# INVALID-PATH-NOT: failed to load reference 'c:\some_invalid_path_AABB98765\pdb-diff-cl.pdb': [[MSG]]
+# INVALID-PATH-NOT: warning: Cannot use debug info for '{{.*}}3.obj' [LNK4099]
+# INVALID-PATH-NOT: failed to load reference 'c:\some_invalid_path_AABB98765\pdb-diff-cl.pdb': [[MSG]]
 
 --- !COFF
 header:
diff --git a/lld/test/COFF/pdb_char8_t.ll b/lld/test/COFF/pdb_char8_t.ll
index 0cb71b641e8c..0d160f0e50c7 100644
--- a/lld/test/COFF/pdb_char8_t.ll
+++ b/lld/test/COFF/pdb_char8_t.ll
@@ -1,46 +1,46 @@
-; REQUIRES: x86
-; RUN: llc -mtriple x86_64-windows-msvc -filetype obj -o %t.obj %s
-; RUN: lld-link /nodefaultlib /noentry /dll /debug /out:%t.exe /pdb:%t.pdb %t.obj
-; RUN: llvm-pdbutil dump -type-index=0x7c %t.pdb
-
-; CHECK: 0x007C (char8_t) | char8_t
-
-define dso_local i32 @main() #0 !dbg !9 {
-  %1 = alloca i32, align 4
-  %2 = alloca i8, align 1
-  store i32 0, ptr %1, align 4
-  call void @llvm.dbg.declare(metadata ptr %2, metadata !13, metadata !DIExpression()), !dbg !15
-  store i8 0, ptr %2, align 1, !dbg !15
-  %3 = load i8, ptr %2, align 1, !dbg !16
-  %4 = zext i8 %3 to i32, !dbg !16
-  ret i32 %4, !dbg !16
-}
-
-; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { mustprogress noinline norecurse nounwind optnone uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
-attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
-
-!llvm.dbg.cu = !{!0}
-!llvm.linker.options = !{}
-!llvm.module.flags = !{!3, !4, !5, !6, !7}
-!llvm.ident = !{!8}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 13.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
-!1 = !DIFile(filename: "pdb_char8_t.cpp", directory: "C:\\src", checksumkind: CSK_MD5, checksum: "a00748d29f4e59003184945cd3e17ee3")
-!2 = !{}
-!3 = !{i32 2, !"CodeView", i32 1}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 2}
-!6 = !{i32 7, !"PIC Level", i32 2}
-!7 = !{i32 7, !"uwtable", i32 1}
-!8 = !{!"clang version 13.0.0"}
-!9 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !10, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
-!10 = !DISubroutineType(types: !11)
-!11 = !{!12}
-!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-!13 = !DILocalVariable(name: "local", scope: !9, file: !1, line: 3, type: !14)
-!14 = !DIBasicType(name: "char8_t", size: 8, encoding: DW_ATE_UTF)
-!15 = !DILocation(line: 3, scope: !9)
-!16 = !DILocation(line: 4, scope: !9)
+; REQUIRES: x86
+; RUN: llc -mtriple x86_64-windows-msvc -filetype obj -o %t.obj %s
+; RUN: lld-link /nodefaultlib /noentry /dll /debug /out:%t.exe /pdb:%t.pdb %t.obj
+; RUN: llvm-pdbutil dump -type-index=0x7c %t.pdb
+
+; CHECK: 0x007C (char8_t) | char8_t
+
+define dso_local i32 @main() #0 !dbg !9 {
+  %1 = alloca i32, align 4
+  %2 = alloca i8, align 1
+  store i32 0, ptr %1, align 4
+  call void @llvm.dbg.declare(metadata ptr %2, metadata !13, metadata !DIExpression()), !dbg !15
+  store i8 0, ptr %2, align 1, !dbg !15
+  %3 = load i8, ptr %2, align 1, !dbg !16
+  %4 = zext i8 %3 to i32, !dbg !16
+  ret i32 %4, !dbg !16
+}
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { mustprogress noinline norecurse nounwind optnone uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
+
+!llvm.dbg.cu = !{!0}
+!llvm.linker.options = !{}
+!llvm.module.flags = !{!3, !4, !5, !6, !7}
+!llvm.ident = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 13.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "pdb_char8_t.cpp", directory: "C:\\src", checksumkind: CSK_MD5, checksum: "a00748d29f4e59003184945cd3e17ee3")
+!2 = !{}
+!3 = !{i32 2, !"CodeView", i32 1}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 1}
+!8 = !{!"clang version 13.0.0"}
+!9 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !10, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!10 = !DISubroutineType(types: !11)
+!11 = !{!12}
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !DILocalVariable(name: "local", scope: !9, file: !1, line: 3, type: !14)
+!14 = !DIBasicType(name: "char8_t", size: 8, encoding: DW_ATE_UTF)
+!15 = !DILocation(line: 3, scope: !9)
+!16 = !DILocation(line: 4, scope: !9)
diff --git a/lld/test/COFF/thinlto-index-only.ll b/lld/test/COFF/thinlto-index-only.ll
index f99134143e4d..8ef981d6090f 100644
--- a/lld/test/COFF/thinlto-index-only.ll
+++ b/lld/test/COFF/thinlto-index-only.ll
@@ -22,8 +22,8 @@
 ; BACKEND1: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND1: <VERSION
 ; BACKEND1: <FLAGS
-; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3684000822 op2=3884832250}}
-; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3684000822 op2=3884832250}}
+; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-5300342847281564238|-2624081020897602054}}
+; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-5300342847281564238|-2624081020897602054}}
 ; BACKEND1: <COMBINED
 ; BACKEND1: <COMBINED
 ; BACKEND1: </GLOBALVAL_SUMMARY_BLOCK
@@ -37,7 +37,7 @@
 ; BACKEND2-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND2-NEXT: <VERSION
 ; BACKEND2-NEXT: <FLAGS
-; BACKEND2-NEXT: <VALUE_GUID {{.*}} op0=1 op1=3060885059 op2=1207956914
+; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
diff --git a/lld/test/ELF/aarch64-thunk-reuse2.s b/lld/test/ELF/aarch64-thunk-reuse2.s
index e9dd385605ad..c2cfee6f876c 100644
--- a/lld/test/ELF/aarch64-thunk-reuse2.s
+++ b/lld/test/ELF/aarch64-thunk-reuse2.s
@@ -14,7 +14,7 @@
 # CHECK:       <__AArch64ADRPThunk_>:
 # CHECK-NEXT:   8010708:       adrp    x16, 0x10000
 # CHECK-NEXT:                  add     x16, x16, #1792
-# CHECk-NEXT:                  br      x16
+# CHECK-NEXT:                  br      x16
 # CHECK-LABEL: <high>:
 # CHECK-NEXT:   8010714:       bl      0x8010708 <__AArch64ADRPThunk_>
 # CHECK-NEXT:                  b       0x8010708 <__AArch64ADRPThunk_>
diff --git a/lld/test/ELF/dynamic-list-cpp.s b/lld/test/ELF/dynamic-list-cpp.s
index b0efb8d16bcb..05f11e0079d3 100644
--- a/lld/test/ELF/dynamic-list-cpp.s
+++ b/lld/test/ELF/dynamic-list-cpp.s
@@ -1,18 +1,18 @@
-# REQUIRES: x86
-
-## Confirm both mangled and unmangled names may appear in
-## the --dynamic-list file.
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
-
-# RUN: echo '{ _Z1fv; extern "C++" { "g()"; }; };' > %t.list
-# RUN: ld.lld -pie --dynamic-list %t.list %t.o -o %t
-# RUN: llvm-readelf --dyn-syms %t | FileCheck %s
-
-# CHECK:      Symbol table '.dynsym' contains 3 entries:
-# CHECK:      _Z1fv
-# CHECK-NEXT: _Z1gv
-
-.globl _Z1fv, _Z1gv
-_Z1fv:
-_Z1gv:
+# REQUIRES: x86
+
+## Confirm both mangled and unmangled names may appear in
+## the --dynamic-list file.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+
+# RUN: echo '{ _Z1fv; extern "C++" { "g()"; }; };' > %t.list
+# RUN: ld.lld -pie --dynamic-list %t.list %t.o -o %t
+# RUN: llvm-readelf --dyn-syms %t | FileCheck %s
+
+# CHECK:      Symbol table '.dynsym' contains 3 entries:
+# CHECK:      _Z1fv
+# CHECK-NEXT: _Z1gv
+
+.globl _Z1fv, _Z1gv
+_Z1fv:
+_Z1gv:
diff --git a/lld/test/ELF/linkerscript/orphan-phdrs2.test b/lld/test/ELF/linkerscript/orphan-phdrs2.test
index c302e0e70b2b..d75c76da87e8 100644
--- a/lld/test/ELF/linkerscript/orphan-phdrs2.test
+++ b/lld/test/ELF/linkerscript/orphan-phdrs2.test
@@ -12,7 +12,7 @@
 # CHECK-NEXT:   Type {{.*}} Flg Align
 # CHECK-NEXT:   LOAD {{.*}} R E 0x
 # CHECK-NEXT:   LOAD {{.*}} RW  0x
-# CHECK-MEXT:   LOAD {{.*}} R   0x
+# CHECK-NEXT:   LOAD {{.*}} R   0x
 
 # CHECK:      Segment Sections...
 # CHECK-NEXT:   00 .text {{$}}
diff --git a/lld/test/ELF/lto/thinlto-emit-index.ll b/lld/test/ELF/lto/thinlto-emit-index.ll
index 03dfbc0200e9..ba2ac2ceb689 100644
--- a/lld/test/ELF/lto/thinlto-emit-index.ll
+++ b/lld/test/ELF/lto/thinlto-emit-index.ll
@@ -76,8 +76,8 @@
 ; BACKEND1: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND1: <VERSION
 ; BACKEND1: <FLAGS
-; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
-; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
+; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
+; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
 ; BACKEND1: <COMBINED
 ; BACKEND1: <COMBINED
 ; BACKEND1: </GLOBALVAL_SUMMARY_BLOCK
@@ -90,7 +90,7 @@
 ; BACKEND2-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND2-NEXT: <VERSION
 ; BACKEND2-NEXT: <FLAGS
-; BACKEND2-NEXT: <VALUE_GUID {{.*}} op0=1 op1=3060885059 op2=1207956914
+; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
diff --git a/lld/test/ELF/lto/thinlto-index-only.ll b/lld/test/ELF/lto/thinlto-index-only.ll
index da60af80a004..abf58ce5ea41 100644
--- a/lld/test/ELF/lto/thinlto-index-only.ll
+++ b/lld/test/ELF/lto/thinlto-index-only.ll
@@ -103,8 +103,8 @@
 ; BACKEND1: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND1: <VERSION
 ; BACKEND1: <FLAGS
-; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
-; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
+; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
+; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
 ; BACKEND1: <COMBINED
 ; BACKEND1: <COMBINED
 ; BACKEND1: </GLOBALVAL_SUMMARY_BLOCK
@@ -117,7 +117,7 @@
 ; BACKEND2-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND2-NEXT: <VERSION
 ; BACKEND2-NEXT: <FLAGS
-; BACKEND2-NEXT: <VALUE_GUID {{.*}} op0=1 op1=3060885059 op2=1207956914
+; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
diff --git a/lld/test/ELF/mips-eh_frame-pic.s b/lld/test/ELF/mips-eh_frame-pic.s
index c04dbdf57b08..79076e74a7e3 100644
--- a/lld/test/ELF/mips-eh_frame-pic.s
+++ b/lld/test/ELF/mips-eh_frame-pic.s
@@ -16,7 +16,7 @@
 # RUN: llvm-mc -filetype=obj -triple=mips64-unknown-linux --position-independent %s -o %t-pic.o
 # RUN: llvm-readobj -r %t-pic.o | FileCheck %s --check-prefixes=RELOCS,PIC64-RELOCS
 # RUN: ld.lld -shared %t-pic.o -o %t-pic.so
-# RUN: llvm-dwarfdump --eh-frame %t-pic.so | FileCheck %s --check-prefix=PIC-EH-FRAME
+# RUN: llvm-dwarfdump --eh-frame %t-pic.so | FileCheck %s --check-prefix=PIC64-EH-FRAME
 
 ## Also check MIPS32:
 # RUN: llvm-mc -filetype=obj -triple=mips-unknown-linux %s -o %t-nopic32.o
@@ -31,7 +31,7 @@
 # RUN: llvm-mc -filetype=obj -triple=mips-unknown-linux --position-independent %s -o %t-pic32.o
 # RUN: llvm-readobj -r %t-pic32.o | FileCheck %s --check-prefixes=RELOCS,PIC32-RELOCS
 # RUN: ld.lld -shared %t-pic32.o -o %t-pic32.so
-# RUN: llvm-dwarfdump --eh-frame %t-pic32.so | FileCheck %s --check-prefix=PIC-EH-FRAME
+# RUN: llvm-dwarfdump --eh-frame %t-pic32.so | FileCheck %s --check-prefix=PIC32-EH-FRAME
 
 # RELOCS:            .rel{{a?}}.eh_frame {
 # ABS32-RELOCS-NEXT:   0x1C R_MIPS_32 .text
@@ -44,7 +44,9 @@
 ##                                   ^^ fde pointer encoding: DW_EH_PE_sdata8
 # ABS32-EH-FRAME: Augmentation data: 0B
 ##                                   ^^ fde pointer encoding: DW_EH_PE_sdata4
-# PIC-EH-FRAME: Augmentation data: 1B
+# PIC32-EH-FRAME: Augmentation data: 1B
+##                                 ^^ fde pointer encoding: DW_EH_PE_pcrel | DW_EH_PE_sdata4
+# PIC64-EH-FRAME: Augmentation data: 1B
 ##                                 ^^ fde pointer encoding: DW_EH_PE_pcrel | DW_EH_PE_sdata4
 ## Note: ld.bfd converts the R_MIPS_64 relocs to DW_EH_PE_pcrel | DW_EH_PE_sdata8
 ## for N64 ABI (and DW_EH_PE_pcrel | DW_EH_PE_sdata4 for MIPS32)
diff --git a/lld/test/ELF/mips-got-page-script.s b/lld/test/ELF/mips-got-page-script.s
index 4055fb6dabef..3caf5cc03afc 100644
--- a/lld/test/ELF/mips-got-page-script.s
+++ b/lld/test/ELF/mips-got-page-script.s
@@ -19,32 +19,32 @@
 # CHECK-NEXT: Value: 0x40000
 
 # CHECK:      Local entries [
-# CHECK-BEXT:    Entry {
-# CHECK-BEXT:      Address:
-# CHECK-BEXT:      Access:
-# CHECK-BEXT:      Initial: 0x10000
-# CHECK-BEXT:    }
-# CHECK-BEXT:    Entry {
-# CHECK-BEXT:      Address:
-# CHECK-BEXT:      Access:
-# CHECK-BEXT:      Initial: 0x20000
-# CHECK-BEXT:    }
-# CHECK-BEXT:    Entry {
-# CHECK-BEXT:      Address:
-# CHECK-BEXT:      Access:
-# CHECK-BEXT:      Initial: 0x30000
-# CHECK-BEXT:    }
-# CHECK-BEXT:    Entry {
-# CHECK-BEXT:      Address:
-# CHECK-BEXT:      Access:
-# CHECK-BEXT:      Initial: 0x40000
-# CHECK-BEXT:    }
-# CHECK-BEXT:    Entry {
-# CHECK-BEXT:      Address:
-# CHECK-BEXT:      Access:
-# CHECK-BEXT:      Initial: 0x50000
-# CHECK-BEXT:    }
-# CHECK-BEXT:  ]
+# CHECK-NEXT:    Entry {
+# CHECK-NEXT:      Address:
+# CHECK-NEXT:      Access:
+# CHECK-NEXT:      Initial: 0x10000
+# CHECK-NEXT:    }
+# CHECK-NEXT:    Entry {
+# CHECK-NEXT:      Address:
+# CHECK-NEXT:      Access:
+# CHECK-NEXT:      Initial: 0x20000
+# CHECK-NEXT:    }
+# CHECK-NEXT:    Entry {
+# CHECK-NEXT:      Address:
+# CHECK-NEXT:      Access:
+# CHECK-NEXT:      Initial: 0x30000
+# CHECK-NEXT:    }
+# CHECK-NEXT:    Entry {
+# CHECK-NEXT:      Address:
+# CHECK-NEXT:      Access:
+# CHECK-NEXT:      Initial: 0x40000
+# CHECK-NEXT:    }
+# CHECK-NEXT:    Entry {
+# CHECK-NEXT:      Address:
+# CHECK-NEXT:      Access:
+# CHECK-NEXT:      Initial: 0x50000
+# CHECK-NEXT:    }
+# CHECK-NEXT:  ]
 
   .option pic2
   .text
diff --git a/lld/test/ELF/riscv-split-stack.s b/lld/test/ELF/riscv-split-stack.s
new file mode 100644
index 000000000000..e6ebf9f3cb5a
--- /dev/null
+++ b/lld/test/ELF/riscv-split-stack.s
@@ -0,0 +1,24 @@
+# REQUIRES: riscv
+
+# RUN: split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=riscv64 %t/main.s -o %t.64.o
+# RUN: llvm-mc -filetype=obj -triple=riscv64 %t/callee.s -o %t.64.2.o
+# RUN: not ld.lld %t.64.o %t.64.2.o -o %t.64 2>&1 | FileCheck %s
+# CHECK: error: target doesn't support split stacks
+
+#--- main.s
+        .globl  _start
+        .type   _start,@function
+_start:
+        call    test
+	ret
+end:
+        .size   _start, end-_start
+        .section        ".note.GNU-split-stack","",@progbits
+
+
+#--- callee.s
+        .globl  test
+        .type   test,@function
+test:
+	ret
diff --git a/lld/test/ELF/ttext-tdata-tbss.s b/lld/test/ELF/ttext-tdata-tbss.s
index fb9c4d513174..c8254d696929 100644
--- a/lld/test/ELF/ttext-tdata-tbss.s
+++ b/lld/test/ELF/ttext-tdata-tbss.s
@@ -42,13 +42,13 @@
 # USER2-NEXT: LOAD 0x001000 0x0000000000001000
 
 ## With .text well above 200000 we don't need to change the image base
-# RUN: ld.lld -Ttext 0x201000 %t.o -o %t4
+# RUN: ld.lld -Ttext 0x201000 -z separate-loadable-segments %t.o -o %t4
 # RUN: llvm-readelf -S -l %t4 | FileCheck %s --check-prefix=USER3
-# USER3:     .text   PROGBITS 0000000000201000 001000 000001
-# USER3-NEX: .rodata PROGBITS 0000000000202000 002000 000008
-# USER3-NEX: .aw     PROGBITS 0000000000203000 003000 000008
-# USER3-NEX: .data   PROGBITS 0000000000203008 003008 000008
-# USER3-NEX: .bss    NOBITS   0000000000203010 003010 000008
+# USER3:      .text   PROGBITS 0000000000201000 001000 000001
+# USER3-NEXT: .rodata PROGBITS 0000000000202000 002000 000008
+# USER3-NEXT: .aw     PROGBITS 0000000000203000 003000 000008
+# USER3-NEXT: .data   PROGBITS 0000000000203008 003008 000008
+# USER3-NEXT: .bss    NOBITS   0000000000203010 003010 000008
 # USER3:      Type
 # USER3-NEXT: PHDR 0x000040 0x0000000000200040
 # USER3-NEXT: LOAD 0x000000 0x0000000000200000
diff --git a/lld/test/MachO/objc-category-merging-complete-test.s b/lld/test/MachO/objc-category-merging-complete-test.s
index 3bc3ca26b6ae..74400177b550 100644
--- a/lld/test/MachO/objc-category-merging-complete-test.s
+++ b/lld/test/MachO/objc-category-merging-complete-test.s
@@ -13,7 +13,7 @@
 # RUN: llvm-objdump --objc-meta-data --macho a64_file2_merge.exe | FileCheck %s --check-prefixes=MERGE_CATS
 
 
-MERGE_CATS:     __OBJC_$_CATEGORY_MyBaseClass_$_(Category02|Category03)
+MERGE_CATS:     __OBJC_$_CATEGORY_MyBaseClass(Category02|Category03)
 MERGE_CATS-NEXT:              name {{.*}} Category02|Category03
 MERGE_CATS:           instanceMethods
 MERGE_CATS-NEXT:           entsize 24
@@ -88,9 +88,10 @@ MERGE_CATS-NEXT:                 name {{.*}} MyProtocol02Prop
 MERGE_CATS-NEXT:            attributes {{.*}} Ti,R,D
 MERGE_CATS-NEXT:                 name {{.*}} MyProtocol03Prop
 MERGE_CATS-NEXT:            attributes {{.*}} Ti,R,D
+MERGE_CATS:        __OBJC_$_CATEGORY_MyBaseClass_$_Category04
 
 
-NO_MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_(Category02|Category03)
+NO_MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass(Category02|Category03)
 NO_MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_Category02
 NO_MERGE_CATS: instanceMethods
 NO_MERGE_CATS-NEXT: 24
@@ -431,6 +432,15 @@ L_OBJC_IMAGE_INFO:
 ## @dynamic MyProtocol03Prop;
 ## @end
 ##
+## // This category shouldn't be merged
+## @interface MyBaseClass(Category04)
+## + (void)load;
+## @end
+##
+## @implementation MyBaseClass(Category04)
+## + (void)load {}
+## @end
+##
 ## int main() {
 ##     return 0;
 ## }
@@ -493,6 +503,12 @@ L_OBJC_IMAGE_INFO:
 	b	_OUTLINED_FUNCTION_0
 	.cfi_endproc
                                         ; -- End function
+	.p2align	2
+"+[MyBaseClass(Category04) load]":
+	.cfi_startproc
+; %bb.0:
+	ret
+	.cfi_endproc
 	.globl	_main                           ; -- Begin function main
 	.p2align	2
 _main:                                  ; @main
@@ -746,11 +762,42 @@ __OBJC_$_CATEGORY_MyBaseClass_$_Category03:
 	.quad	0
 	.long	64                              ; 0x40
 	.space	4
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_.15:
+	.asciz	"Category04"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.16:
+	.asciz	"load"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0
+__OBJC_$_CATEGORY_CLASS_METHODS_MyBaseClass_$_Category04:
+	.long	24
+	.long	1
+	.quad	l_OBJC_METH_VAR_NAME_.16
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"+[MyBaseClass(Category04) load]"
+	.p2align	3, 0x0
+__OBJC_$_CATEGORY_MyBaseClass_$_Category04:
+	.quad	l_OBJC_CLASS_NAME_.15
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.quad	0
+	.quad	__OBJC_$_CATEGORY_CLASS_METHODS_MyBaseClass_$_Category04
+	.quad	0
+	.quad	0
+	.quad	0
+	.long	64
+	.space	4
 	.section	__DATA,__objc_catlist,regular,no_dead_strip
 	.p2align	3, 0x0                          ; @"OBJC_LABEL_CATEGORY_$"
 l_OBJC_LABEL_CATEGORY_$:
 	.quad	__OBJC_$_CATEGORY_MyBaseClass_$_Category02
 	.quad	__OBJC_$_CATEGORY_MyBaseClass_$_Category03
+	.quad	__OBJC_$_CATEGORY_MyBaseClass_$_Category04
+	.section	__DATA,__objc_nlcatlist,regular,no_dead_strip
+	.p2align	3, 0x0
+l_OBJC_LABEL_NONLAZY_CATEGORY_$:
+	.quad	__OBJC_$_CATEGORY_MyBaseClass_$_Category04
+
 	.no_dead_strip	__OBJC_LABEL_PROTOCOL_$_MyProtocol02
 	.no_dead_strip	__OBJC_LABEL_PROTOCOL_$_MyProtocol03
 	.no_dead_strip	__OBJC_PROTOCOL_$_MyProtocol02
diff --git a/lld/test/MachO/objc-category-merging-extern-class-minimal.s b/lld/test/MachO/objc-category-merging-extern-class-minimal.s
index ede7ef5d9c32..5dd8924df5ad 100644
--- a/lld/test/MachO/objc-category-merging-extern-class-minimal.s
+++ b/lld/test/MachO/objc-category-merging-extern-class-minimal.s
@@ -20,7 +20,7 @@ MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category01
 MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category02
 
 # Check that the merged cateogry is there, in the correct format
-MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_(Category01|Category02)
+MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass(Category01|Category02)
 MERGE_CATS-NEXT:   name {{.*}} Category01|Category02
 MERGE_CATS:       instanceMethods
 MERGE_CATS-NEXT:  24
@@ -37,7 +37,7 @@ MERGE_CATS-NEXT:   instanceProperties 0x0
 
 #### Check merge categories disabled ###
 # Check that the merged category is not there
-NO_MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_(Category01|Category02)
+NO_MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass(Category01|Category02)
 
 # Check that the original categories are there
 NO_MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_Category01
@@ -118,7 +118,7 @@ __OBJC_$_CATEGORY_MyBaseClass_$_Category01:
 	.quad	0
 	.long	64                              ; 0x40
 	.space	4
-	.section	__TEXT,__objc_classname,cstring_literals
+	.section	__DATA,__objc_const
 l_OBJC_CLASS_NAME_.1:                   ; @OBJC_CLASS_NAME_.1
 	.asciz	"Category02"
 	.section	__TEXT,__objc_methname,cstring_literals
@@ -153,3 +153,6 @@ L_OBJC_IMAGE_INFO:
 	.long	0
 	.long	96
 .subsections_via_symbols
+
+.addrsig
+.addrsig_sym __OBJC_$_CATEGORY_MyBaseClass_$_Category01
diff --git a/lld/test/MachO/thinlto-emit-index.ll b/lld/test/MachO/thinlto-emit-index.ll
index 7a3332ab8c93..6f8d552f1435 100644
--- a/lld/test/MachO/thinlto-emit-index.ll
+++ b/lld/test/MachO/thinlto-emit-index.ll
@@ -76,8 +76,8 @@
 ; BACKEND1: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND1: <VERSION
 ; BACKEND1: <FLAGS
-; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
-; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
+; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
+; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
 ; BACKEND1: <COMBINED
 ; BACKEND1: <COMBINED
 ; BACKEND1: </GLOBALVAL_SUMMARY_BLOCK
@@ -90,7 +90,7 @@
 ; BACKEND2-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND2-NEXT: <VERSION
 ; BACKEND2-NEXT: <FLAGS
-; BACKEND2-NEXT: <VALUE_GUID {{.*}} op0=1 op1=3060885059 op2=1207956914
+; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
diff --git a/lld/test/MachO/thinlto-index-only.ll b/lld/test/MachO/thinlto-index-only.ll
index 4844e715f492..a97cd126ad5b 100644
--- a/lld/test/MachO/thinlto-index-only.ll
+++ b/lld/test/MachO/thinlto-index-only.ll
@@ -75,8 +75,8 @@
 ; BACKEND1: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND1: <VERSION
 ; BACKEND1: <FLAGS
-; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
-; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
+; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
+; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
 ; BACKEND1: <COMBINED
 ; BACKEND1: <COMBINED
 ; BACKEND1: </GLOBALVAL_SUMMARY_BLOCK
@@ -89,7 +89,7 @@
 ; BACKEND2-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND2-NEXT: <VERSION
 ; BACKEND2-NEXT: <FLAGS
-; BACKEND2-NEXT: <VALUE_GUID {{.*}} op0=1 op1=3060885059 op2=1207956914
+; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
diff --git a/lld/wasm/InputChunks.cpp b/lld/wasm/InputChunks.cpp
index 2074dd59c1dd..975225974aff 100644
--- a/lld/wasm/InputChunks.cpp
+++ b/lld/wasm/InputChunks.cpp
@@ -519,8 +519,8 @@ uint64_t InputSection::getTombstoneForSection(StringRef name) {
   // If they occur in DWARF debug symbols, we want to change the pc of the
   // function to -1 to avoid overlapping with a valid range. However for the
   // debug_ranges and debug_loc sections that would conflict with the existing
-  // meaning of -1 so we use -2.  
-  if (name.equals(".debug_ranges") || name.equals(".debug_loc"))
+  // meaning of -1 so we use -2.
+  if (name == ".debug_ranges" || name == ".debug_loc")
     return UINT64_C(-2);
   if (name.starts_with(".debug_"))
     return UINT64_C(-1);
diff --git a/lldb/cmake/modules/LLDBFramework.cmake b/lldb/cmake/modules/LLDBFramework.cmake
index df2f8ddf54a3..dd8c36bba0e9 100644
--- a/lldb/cmake/modules/LLDBFramework.cmake
+++ b/lldb/cmake/modules/LLDBFramework.cmake
@@ -105,7 +105,7 @@ foreach(header
 endforeach()
 
 # Wrap output in a target, so lldb-framework can depend on it.
-add_custom_target(liblldb-resource-headers DEPENDS ${lldb_staged_headers})
+add_custom_target(liblldb-resource-headers DEPENDS lldb-sbapi-dwarf-enums ${lldb_staged_headers})
 set_target_properties(liblldb-resource-headers PROPERTIES FOLDER "lldb misc")
 add_dependencies(liblldb liblldb-resource-headers)
 
diff --git a/lldb/docs/lldb-for-gdb-users.txt b/lldb/docs/lldb-for-gdb-users.txt
deleted file mode 100644
index e5eae376bb48..000000000000
--- a/lldb/docs/lldb-for-gdb-users.txt
+++ /dev/null
@@ -1,488 +0,0 @@
-Here's a short precis of how to run lldb if you are familiar with the
-gdb command set:
-
-
-1) LLDB Command Structure:
-
-First some details on lldb command structure to help orient you...
-
-Unlike gdb's command set, which is rather free-form, we tried to make
-the lldb command syntax fairly structured.  The commands are all of the
-form
-
-<noun> <verb> [-options [option-value]] [argument [argument...]]
-
-The command line parsing is done before command execution, so it is
-uniform across all the commands.  The command syntax is very simple,
-basically arguments, options and option values are all white-space
-separated.  If you need to put a backslash or double-quote character
-in an argument you back-slash it in the argument.  That makes the
-command syntax more regular, but it also means you may have to
-quote some arguments in lldb that you wouldn't in gdb.
-
-Options can be placed anywhere on the command line, but if the arguments
-begin with a "-" then you have to tell lldb that you're done with options
-using the "--" option.  So for instance, the "process launch" command takes
-the "-s" option to mean "stop the process at the first instruction".  It's 
-arguments are the arguments you are passing to the program.  So if you wanted
-to pass an argument that contained a "-" you would have to do:
-
-(lldb) process launch -- -program_arg value
-
-We also tried to reduce the number of special purpose argument
-parsers, which sometimes forces the user to be a little more explicit
-about stating their intentions.  The first instance you'll note of
-this is the breakpoint command.  In gdb, to set a breakpoint, you
-would just say:
-
-(gdb) break foo.c:12
-
-or
-
-(gdb) break foo
-
-if foo is a function.  As time went on, the parser that tells foo.c:12
-from foo from foo.c::foo (which means the function foo in the file
-foo.c) got more and more complex and bizarre, and especially in C++
-there are times where there's really no way to specify the function
-you want to break on.  The lldb commands are more verbose but also precise.  
-So you say:
-
-(lldb) breakpoint set -f foo.c -l 12
-
-to set a file & line breakpoint.  To set a breakpoint on a function
-by name, you do:
-
-(lldb) breakpoint set -n foo
-
-This can allow us to be more expressive, so you can say:
-
-(lldb) breakpoint set -M foo
-
-to break on all C++ methods named foo, or:
-
-(lldb) breakpoint set -S alignLeftEdges:
-
-to set a breakpoint on all ObjC selectors called alignLeftEdges:.  It
-also makes it easy to compose specifications, like:
-
-(lldb) breakpoint set -s foo.dylib -n foo
-
-for all functions called foo in the shared library foo.dylib.  Suggestions
-on more interesting primitives of this sort are also very welcome.
-
-So for instance:
-
-(lldb) breakpoint set -n "-[SKTGraphicView alignLeftEdges:]"
-
-Just like gdb, the lldb command interpreter does a shortest unique
-string match on command names, so the previous command can also be
-typed:
-
-(lldb) b s -n "-[SKTGraphicView alignLeftEdges:]"
-
-lldb also supports command completion for source file names, symbol
-names, file names, etc. Completion is initiated by a hitting a <TAB>.
-Individual options in a command can have different completers, so for
-instance the -f option in "breakpoint" completes to source files, the
--s option to currently loaded shared libraries, etc...  We can even do 
-things like if you specify -s, and are completing on -f, we will only
-list source files in the shared library specified by -s...
-
-The individual commands are pretty extensively documented, using
-the "help" command.  And there is an "apropos" command that will
-search the help for a particular word and dump a summary help string
-for each matching command.
-
-Finally, there is a mechanism to construct aliases for commonly used
-commands.  So for instance if you get annoyed typing
-
-(lldb) b s -f foo.c -l 12
-
-you can do:
-
-(lldb) command alias bfl breakpoint set -f %1 -l %2
-(lldb) bfl foo.c 12
-
-We have added a few aliases for commonly used commands (e.g. "step",
-"next" and "continue") but we haven't tried to be exhaustive because
-in our experience it is more convenient to make the basic commands
-unique down to a letter or two, and then learn these sequences than
-fill the namespace with lots of aliases, and then have to type them
-all the way out.
-
-However, users are free to customize lldb's command set however they
-like, and since lldb reads the file ~/.lldbinit at startup, you can
-store all your aliases there and they will be generally available to
-you.  Your aliases are also documented in the help command so you can
-remind yourself of what you've set up.
-
-lldb also has a built-in Python interpreter, which is accessible by
-the "script" command.  All the functionality of the debugger is
-available as classes in the Python interpreter, so the more complex
-commands that in gdb you would introduce with the "define" command can
-be done by writing Python functions using the lldb-Python library,
-then loading the scripts into your running session and accessing them
-with the "script" command.  
-
-
-
-2) A typical session:
-
-
-a) Setting the program to debug:
-
-
-As with gdb, you can start lldb and specify the file you wish to debug
-on the command line:
-
-$ lldb /Projects/Sketch/build/Debug/Sketch.app
-Current executable set to '/Projects/Sketch/build/Debug/Sketch.app' (x86_64).
-
-or you can specify it after the fact with the "file" command:
-
-(lldb) file /Projects/Sketch/build/Debug/Sketch.app
-Current executable set to '/Projects/Sketch/build/Debug/Sketch.app' (x86_64).
-
-
-b) Setting breakpoints:
-
-
-We've discussed how to set breakpoints above.  You can use "help break set" 
-to see all the options for breakpoint setting.  For instance, we might do:
-
-(lldb) b s -S alignLeftEdges:
-Breakpoint created: 1: name = 'alignLeftEdges:', locations = 1, resolved = 1
-
-You can find out about the breakpoints you've set with:
-
-(lldb) break list
-Current breakpoints:
-1: name = 'alignLeftEdges:', locations = 1, resolved = 1
-  1.1: where = Sketch`-[SKTGraphicView alignLeftEdges:] + 33 at /Projects/Sketch/SKTGraphicView.m:1405, address = 0x0000000100010d5b, resolved, hit count = 0 
-
-Note that each "logical" breakpoint can have multiple "locations".
-The logical breakpoint has an integer id, and its locations have an
-id within their parent breakpoint (the two are joined by a ".",
-e.g. 1.1 in the example above.)  
-
-Also the breakpoints remain "live" so that if another shared library
-were to be loaded that had another implementation of the
-"alignLeftEdges:" selector, the new location would be added to
-breakpoint 1 (e.g. a "1.2" breakpoint would be set on the newly loaded
-selector).
-
-The other piece of information in the breakpoint listing is whether the
-breakpoint location was "resolved" or not.  A location gets resolved when
-the file address it corresponds to gets loaded into the program you are
-debugging.  For instance if you set a breakpoint in a shared library that 
-then gets unloaded, that breakpoint location will remain, but it will no 
-longer be "resolved".
-
-One other thing to note for gdb users is that lldb acts like gdb with:
-
-(gdb) set breakpoint pending on
-
-That is, lldb should always make a breakpoint from your specification, even
-if it couldn't find any locations that match the specification.  You can tell
-whether the expression was resolved or not by checking the locations field
-in "breakpoint list", and we report the breakpoint as "pending" when you
-set it so you can tell you've made a typo more easily, if that was indeed 
-the reason no locations were found:
-
-(lldb) b s -f no_such_file.c -l 10000000
-Breakpoint created: 1: file ='no_such_file.c', line = 10000000, locations = 0 (pending)
-
-You can delete, disable, set conditions and ignore counts either on all the
-locations generated by your logical breakpoint, or on particular locations
-your specification resolved to.  For instance if we wanted to add a command
-to print a backtrace when we hit this breakpoint we could do:
-
-(lldb) b command add -c 1.1
-Enter your debugger command(s).  Type 'DONE' to end.
-> bt
-> DONE
-
-The "-c" option specifies that the breakpoint command is a set of lldb
-command interpreter commands.  Use "-s" if you want to implement your
-breakpoint command using the Python interface instead.
-
-
-c) Running the program:
-
-Then you can either launch the process with the command:
-
-(lldb) process launch
-
-or its alias:
-
-(lldb) r
-
-Or you can attach to a process by name with:
-
-(lldb) process attach -n Sketch
-
-The "attach by name"  also supports the "-w" option which waits for the
-next process of that name to show up, and attaches to that.  You can also
-attach by PID:
-
-(lldb) process attach -p 12345
-Process 46915 Attaching
-(lldb) Process 46915 Stopped
-1 of 3 threads stopped with reasons:
-* thread #1: tid = 0x2c03, 0x00007fff85cac76a, where = libSystem.B.dylib`__getdirentries64 + 10, stop reason = signal = SIGSTOP, queue = com.apple.main-thread
-
-Note that we tell you that "1 of 3 threads stopped with reasons" and
-then list those threads.  In a multi-threaded environment it is very
-common for more than one thread to hit your breakpoint(s) before the
-kernel actually returns control to the debugger.  In that case, you
-will see all the threads that stopped for some interesting reason
-listed in the stop message.
-
-
-d) Controlling execution:
-
-
-After launching, we can continue until we hit our breakpoint.  The primitive
-commands for process control all exist under the "thread" command:
-
-(lldb) thread continue
-Resuming thread 0x2c03 in process 46915
-Resuming process 46915
-(lldb)
-
-At present you can only operate on one thread at a time, but the
-design will ultimately support saying "step over the function in
-Thread 1, and step into the function in Thread 2, and continue Thread
-3" etc.  When we eventually support keeping some threads running while
-others are stopped this will be particularly important.  For
-convenience, however, all the stepping commands have easy aliases.  
-So "thread continue" is just "c", etc.
-
-The other program stepping commands are pretty much the same as in gdb.  
-You've got:
-
-  1. (lldb) thread step-in
-     The same as gdb's "step" -- there is also the alias "s" in lldb
-
-  2. (lldb) thread step-over
-     The same as gdb's "next" -- there is also the alias "n" in lldb
-
-  3. (lldb) thread step-out
-     The same as gdb's "finish" -- there is also the alias "f" in lldb
-
-And the "by instruction" versions:
-
-(lldb) thread step-inst
-(lldb) thread step-over-inst
-
-Finally, there's:
-
-(lldb) thread until 100
-
-Which runs the thread in the current frame till it reaches line 100 in
-this frame or stops if it leaves the current frame.  This is a pretty 
-close equivalent to gdb's "until" command.
-
-
-One thing here that might be a little disconcerting to gdb users here is that
-when you resume process execution, you immediately get a prompt back.  That's
-because the lldb interpreter remains live when you are running the target.
-This allows you to set a breakpoint, etc without having to explicitly interrupt
-the program you are debugging.  We're still working out all the operations
-that it is safe to do while running.  But this way of operation will set us
-up for "no stop" debugging when we get to implementing that.
-
-If you want to interrupt a running program do:
-
-(lldb) process interrupt
-
-To find out the state of the program, use:
-
-(lldb) process status
-Process 47958 is running.
-
-This is very convenient, but it does have the down-side that debugging
-programs that use stdin is no longer as straightforward.  For now, you
-have to specify another tty to use as the program stdout & stdin using
-the appropriate options to "process launch", or start your program in
-another terminal and catch it with "process attach -w".  We will come
-up with some more convenient way to juggle the terminal back & forth
-over time.
-
-
-e) Examining program state:
-
-Once you've stopped, lldb will choose a current thread, usually the
-one that stopped "for a reason", and a current frame in that thread.
-Many the commands for inspecting state work on this current
-thread/frame.
-
-To inspect the current state of your process, you can start with the
-threads:
-
-(lldb) thread list
-Process 46915 state is Stopped
-* thread #1: tid = 0x2c03, 0x00007fff85cac76a, where = libSystem.B.dylib`__getdirentries64 + 10, stop reason = signal = SIGSTOP, queue = com.apple.main-thread
-  thread #2: tid = 0x2e03, 0x00007fff85cbb08a, where = libSystem.B.dylib`kevent + 10, queue = com.apple.libdispatch-manager
-  thread #3: tid = 0x2f03, 0x00007fff85cbbeaa, where = libSystem.B.dylib`__workq_kernreturn + 10
-
-The * indicates that Thread 1 is the current thread.  To get a
-backtrace for that thread, do:
-
-(lldb) thread backtrace
-thread #1: tid = 0x2c03, stop reason = breakpoint 1.1, queue = com.apple.main-thread
-  frame #0: 0x0000000100010d5b, where = Sketch`-[SKTGraphicView alignLeftEdges:] + 33 at /Projects/Sketch/SKTGraphicView.m:1405
-  frame #1: 0x00007fff8602d152, where = AppKit`-[NSApplication sendAction:to:from:] + 95
-  frame #2: 0x00007fff860516be, where = AppKit`-[NSMenuItem _corePerformAction] + 365
-  frame #3: 0x00007fff86051428, where = AppKit`-[NSCarbonMenuImpl performActionWithHighlightingForItemAtIndex:] + 121
-  frame #4: 0x00007fff860370c1, where = AppKit`-[NSMenu performKeyEquivalent:] + 272
-  frame #5: 0x00007fff86035e69, where = AppKit`-[NSApplication _handleKeyEquivalent:] + 559
-  frame #6: 0x00007fff85f06aa1, where = AppKit`-[NSApplication sendEvent:] + 3630
-  frame #7: 0x00007fff85e9d922, where = AppKit`-[NSApplication run] + 474
-  frame #8: 0x00007fff85e965f8, where = AppKit`NSApplicationMain + 364
-  frame #9: 0x0000000100015ae3, where = Sketch`main + 33 at /Projects/Sketch/SKTMain.m:11
-  frame #10: 0x0000000100000f20, where = Sketch`start + 52
-
-You can also provide a list of threads to backtrace, or the keyword
-"all" to see all threads:
-
-(lldb) thread backtrace all
-
-Next task is inspecting data:
-
-The most convenient way to inspect a frame's arguments and local variables is:
-
-(lldb) frame variable 
-self = (SKTGraphicView *) 0x0000000100208b40
-_cmd = (struct objc_selector *) 0x000000010001bae1
-sender = (id) 0x00000001001264e0
-selection = (NSArray *) 0x00000001001264e0
-i = (NSUInteger) 0x00000001001264e0
-c = (NSUInteger) 0x00000001001253b0
-
-You can also choose particular variables to view:
-
-(lldb) frame variable self
-(SKTGraphicView *) self = 0x0000000100208b40
-
-The frame variable command is not a full expression parser but it
-does support some common operations like dereferencing:
-
-(lldb) fr v *self
-(SKTGraphicView *) self = 0x0000000100208b40
-  (NSView) NSView = {
-    (NSResponder) NSResponder = {
-...
-
-and structure element references:
-
-(lldb) frame variable self.isa
-(struct objc_class *) self.isa = 0x0000000100023730
-
-The frame variable command will also perform "object printing" operations on
-variables (currently we only support NSPrintForDebugger) with:
-
-(lldb) fr v -o self
-(SKTGraphicView *) self = 0x0000000100208b40
-<SKTGraphicView: 0x100208b40>
-
-You can select another frame to view with:
-
-(lldb) frame select 9
-frame #9: 0x0000000100015ae3, where = Sketch`main + 33 at /Projects/Sketch/SKTMain.m:11
-   8      
-   9      
-  10      int main(int argc, const char *argv[]) {
-  11 ->       return NSApplicationMain(argc, argv);
-  12          }
-  13          
-  14          
-
-Another neat trick that the variable list does is array references, so:
-
-(lldb) fr v argv[0]
-(char const *) argv[0] = 0x00007fff5fbffaf8 "/Projects/Sketch/build/Debug/Sketch.app/Contents/MacOS/Sketch"
-
-If you need to view more complex data or change program data, you can
-use the general "expression" command.  It takes an expression and
-evaluates it in the scope of the currently selected frame.  For instance:
-
-(lldb) expr self
-$0 = (SKTGraphicView *) 0x0000000100135430
-(lldb) expr self = 0x00
-$1 = (SKTGraphicView *) 0x0000000000000000
-(lldb) frame var self
-(SKTGraphicView *) self = 0x0000000000000000
-
-You can also call functions:
-
-(lldb) expr (int) printf ("I have a pointer 0x%llx.\n", self)
-$2 = (int) 22
-I have a pointer 0x0.
-
-One thing to note from this example is that lldb commands can be defined to
-take "raw" input.  "expression" is one of these.  So in the expression command,
-you don't have to quote your whole expression, nor backslash protect quotes,
-etc...
-
-Finally, the results of the expressions are stored in persistent variables
-(of the form $[0-9]+) that you can use in further expressions, like:
-
-(lldb) expr self = $0
-$4 = (SKTGraphicView *) 0x0000000100135430
-
-f) Customization:
-
-You can use the embedded Python interpreter to add the following 'pwd' and 'cd' commands
-for your lldb session:
-
-(lldb) script import os
-(lldb) command alias pwd script print os.getcwd()
-(lldb) command regex cd "s/^(.*)$/script os.chdir(os.path.expanduser('%1'))/"
-
-...
-
-(lldb) cd /tmp
-script os.chdir(os.path.expanduser('/tmp'))
-(lldb) pwd
-/private/tmp
-(lldb) 
-
-Or for a more capable 'cd' command, create ~/utils.py like this:
-
-import os
-
-def chdir(debugger, args, result, dict):
-    """Change the working directory, or cd to ${HOME}."""
-    dir = args.strip()
-    if dir:
-        os.chdir(args)
-    else:
-        os.chdir(os.path.expanduser('~'))
-    print "Current working directory: %s" % os.getcwd()
-
-and, have the following in your ~/.lldbinit file:
-
-script import os, sys
-script sys.path.append(os.path.expanduser('~'))
-script import utils
-command alias pwd script print os.getcwd()
-command script add -f utils.chdir cd
-
-and, then in your lldb session, you can have:
-
-(lldb) help cd
-
-Change the working directory, or cd to ${HOME}.
-Syntax: cd
-(lldb) cd
-Current working directory: /Volumes/data/Users/johnny
-(lldb) cd /tmp
-Current working directory: /private/tmp
-(lldb) pwd
-/private/tmp
-(lldb) 
-
-For more examples of customization, look under the ToT/examples/customization
-directory.
diff --git a/lldb/docs/use/qemu-testing.rst b/lldb/docs/use/qemu-testing.rst
index 6e282141864c..51a30b11717a 100644
--- a/lldb/docs/use/qemu-testing.rst
+++ b/lldb/docs/use/qemu-testing.rst
@@ -172,6 +172,7 @@ forwarded for this to work.
 
 .. note::
   These options are used to create a "port map" within ``lldb-server``.
-  Unfortunately this map is not shared across all the processes it may create,
+  Unfortunately this map is not cleaned up on Windows on connection close,
   and across a few uses you may run out of valid ports. To work around this,
   restart the platform every so often, especially after running a set of tests.
+  This is tracked here: https://github.com/llvm/llvm-project/issues/90923
diff --git a/lldb/docs/use/tutorial.rst b/lldb/docs/use/tutorial.rst
index c7f89976156c..22354c6720e1 100644
--- a/lldb/docs/use/tutorial.rst
+++ b/lldb/docs/use/tutorial.rst
@@ -1,14 +1,14 @@
 Tutorial
 ========
 
-This document describes how to use lldb if you are already familiar with
-gdb's command set. We will start with some details on lldb command structure and
+This document describes how to use LLDB if you are already familiar with
+GDB's command set. We will start with some details on LLDB command structure and
 syntax.
 
 Command Structure
 -----------------
 
-Unlike gdb's quite free-form commands, lldb's are more structured. All commands
+Unlike GDB's quite free-form commands, LLDB's are more structured. All commands
 are of the form:
 
 ::
@@ -24,11 +24,11 @@ all commands. The command syntax for basic commands is very simple.
 * Escape backslashes and double quotes within arguments should be escaped
   with a backslash ``\``.
 
-This makes lldb's commands more regular, but it also means you may have to quote
-some arguments in lldb that you would not in gdb.
+This makes LLDB's commands more regular, but it also means you may have to quote
+some arguments in LLDB that you would not in GDB.
 
-There is one other special quote character in lldb - the backtick `````.
-If you put backticks around an argument or option value, lldb will run the text
+There is one other special quote character in LLDB - the backtick `````.
+If you put backticks around an argument or option value, LLDB will run the text
 of the value through the expression parser, and the result of the expression
 will be passed to the command.  So for instance, if ``len`` is a local
 ``int`` variable with the value ``5``, then the command:
@@ -40,7 +40,7 @@ will be passed to the command.  So for instance, if ``len`` is a local
 Will receive the value ``5`` for the count option, rather than the string ``len``.
 
 Options can be placed anywhere on the command line, but if the arguments begin
-with a ``-`` then you have to tell lldb that you are done with options for the
+with a ``-`` then you have to tell LLDB that you are done with options for the
 current command by adding an option termination: ``--``.
 
 So for instance, if you want to launch a process and give the ``process launch``
@@ -53,7 +53,7 @@ to launch to be launched with the arguments ``-program_arg value``, you would ty
 
 We also tried to reduce the number of special purpose argument parsers, which
 sometimes forces the user to be explicit about their intentions. The first
-instance you willl see of this is the breakpoint command. In gdb, to set a
+instance you willl see of this is the breakpoint command. In GDB, to set a
 breakpoint, you might enter:
 
 ::
@@ -71,17 +71,17 @@ from ``foo`` from ``foo.c::foo`` (which means the function ``foo`` in the file `
 got more and more complex. Especially in C++ there are times where there is
 really no way to specify the function you want to break on.
 
-The lldb commands are more verbose but also more precise and allow for
+The LLDB commands are more verbose but also more precise and allow for
 intelligent auto completion.
 
-To set the same file and line breakpoint in lldb you can enter either of:
+To set the same file and line breakpoint in LLDB you can enter either of:
 
 ::
 
    (lldb) breakpoint set --file foo.c --line 12
    (lldb) breakpoint set -f foo.c -l 12
 
-To set a breakpoint on a function named ``foo`` in lldb you can enter either of:
+To set a breakpoint on a function named ``foo`` in LLDB you can enter either of:
 
 ::
 
@@ -96,7 +96,7 @@ conditions or commands without having to specify them multiple times:
 
    (lldb) breakpoint set --name foo --name bar
 
-Setting breakpoints by name is even more specialized in lldb as you can specify
+Setting breakpoints by name is even more specialized in LLDB as you can specify
 that you want to set a breakpoint at a function by method name. To set a
 breakpoint on all C++ methods named ``foo`` you can enter either of:
 
@@ -125,7 +125,7 @@ The ``--shlib`` option can also be repeated to specify several shared libraries.
 
 Suggestions on more interesting primitives of this sort are also very welcome.
 
-Just like gdb, the lldb command interpreter does a shortest unique string match
+Just like GDB, the LLDB command interpreter does a shortest unique string match
 on command names, so the following two commands will both execute the same
 command:
 
@@ -134,12 +134,12 @@ command:
    (lldb) breakpoint set -n "-[SKTGraphicView alignLeftEdges:]"
    (lldb) br s -n "-[SKTGraphicView alignLeftEdges:]"
 
-lldb also supports command completion for source file names, symbol names, file
+LLDB also supports command completion for source file names, symbol names, file
 names, etc. Completion is initiated by hitting TAB. Individual options in a
 command can have different completers, so for instance, the ``--file <path>``
 option in ``breakpoint`` completes to source files, the ``--shlib <path>`` option
 to currently loaded shared libraries, etc. You can even do things like if you
-specify ``--shlib <path>``, and are completing on ``--file <path>``, lldb will only
+specify ``--shlib <path>``, and are completing on ``--file <path>``, LLDB will only
 list source files in the shared library specified by ``--shlib <path>``.
 
 The individual commands are pretty extensively documented. You can use the ``help``
@@ -162,23 +162,23 @@ You can do:
    (lldb) command alias bfl breakpoint set -f %1 -l %2
    (lldb) bfl foo.c 12
 
-lldb has a few aliases for commonly used commands (e.g. ``step``, ``next`` and
+LLDB has a few aliases for commonly used commands (e.g. ``step``, ``next`` and
 ``continue``) but it does not try to be exhaustive because in our experience it
 is more convenient to make the basic commands unique down to a letter or two,
 and then learn these sequences than to fill the namespace with lots of aliases,
 and then have to type them all the way out.
 
-However, users are free to customize lldb's command set however they like, and
-since lldb reads the file ``~/.lldbinit`` at startup, you can store all your
+However, users are free to customize LLDB's command set however they like, and
+since LLDB reads the file ``~/.lldbinit`` at startup, you can store all your
 aliases there and they will be generally available to you. Your aliases are
 also documented in the ``help`` command so you can remind yourself of what you have
 set up.
 
-One alias of note that lldb does include by popular demand is a weak emulator of
-gdb's ``break`` command. It does not try to do everything that gdb's break command
+One alias of note that LLDB does include by popular demand is a weak emulator of
+GDB's ``break`` command. It does not try to do everything that GDB's break command
 does (for instance, it does not handle ``foo.c::bar``). But it mostly works, and
 makes the transition easier. Also, by popular demand, it is aliased to ``b``. If you
-actually want to learn the lldb command set natively, that means it will get in
+actually want to learn the LLDB command set natively, that means it will get in
 the way of the rest of the breakpoint commands. Fortunately, if you do not like
 one of our aliases, you can easily get rid of it by running, for example:
 
@@ -192,9 +192,9 @@ You can also do:
 
    (lldb) command alias b breakpoint
 
-So you can run the native lldb breakpoint command with just ``b``.
+So you can run the native LLDB breakpoint command with just ``b``.
 
-The lldb command parser also supports "raw" commands, where, after command
+The LLDB command parser also supports "raw" commands, where, after command
 options are stripped off, the rest of the command string is passed
 uninterpreted to the command. This is convenient for commands whose arguments
 might be some complex expression that would be painful to backslash protect.
@@ -205,17 +205,17 @@ commands still can have options, if your command string has dashes in it,
 you will have to indicate these are not option markers by putting ``--`` after the
 command name, but before your command string.
 
-lldb also has a built-in Python interpreter, which is accessible by the
+LLDB also has a built-in Python interpreter, which is accessible by the
 ``"script`` command. All the functionality of the debugger is available as classes
-in the Python interpreter, so the more complex commands that in gdb you would
+in the Python interpreter, so the more complex commands that in GDB you would
 introduce with the ``define`` command can be done by writing Python functions
-using the lldb-Python library, then loading the scripts into your running
+using the LLDB Python library, then loading the scripts into your running
 session and accessing them with the ``script`` command.
 
-Loading a Program Into lldb
+Loading a Program Into LLDB
 ---------------------------
 
-First you need to set the program to debug. As with gdb, you can start lldb and
+First you need to set the program to debug. As with GDB, you can start LLDB and
 specify the file you wish to debug on the command line:
 
 ::
@@ -273,16 +273,16 @@ address it corresponds to gets loaded into the program you are debugging. For
 instance if you set a breakpoint in a shared library that then gets unloaded,
 that breakpoint location will remain, but it will no longer be resolved.
 
-One other thing to note for gdb users is that lldb acts like gdb with:
+One other thing to note for GDB users is that LLDB acts like GDB with:
 
 ::
 
    (gdb) set breakpoint pending on
 
-Which means that lldb will always make a breakpoint from your specification, even if it
+Which means that LLDB will always make a breakpoint from your specification, even if it
 could not find any locations that match the specification. You can tell whether
 the expression was resolved or not by checking the locations field in
-``breakpoint list``, and lldb reports the breakpoint as ``pending`` when you set it so
+``breakpoint list``, and LLDB reports the breakpoint as ``pending`` when you set it so
 you can tell you have made a typo more easily, if that was indeed the reason no
 locations were found:
 
@@ -304,12 +304,12 @@ command to print a backtrace when you hit this breakpoint you could do:
    > bt
    > DONE
 
-By default, the breakpoint command add command takes lldb command line
+By default, the breakpoint command add command takes LLDB command line
 commands. You can also specify this explicitly by passing the ``--command``
 option. Use ``--script`` if you want to implement your breakpoint command using
 the Python script instead.
 
-This is a convenient point to bring up another feature of the lldb command
+This is a convenient point to bring up another feature of the LLDB command
 ``help``. Do:
 
 ::
@@ -447,7 +447,7 @@ a variable called ``global`` for write operation, but only stop if the condition
 Starting or Attaching to Your Program
 -------------------------------------
 
-To launch a program in lldb you will use the ``process launch`` command or one of
+To launch a program in LLDB you will use the ``process launch`` command or one of
 its built in aliases:
 
 ::
@@ -457,7 +457,7 @@ its built in aliases:
    (lldb) r
 
 You can also attach to a process by process ID or process name. When attaching
-to a process by name, lldb also supports the ``--waitfor`` option which waits for
+to a process by name, LLDB also supports the ``--waitfor`` option which waits for
 the next process that has that name to show up, and attaches to it
 
 ::
@@ -497,32 +497,32 @@ for process control all exist under the "thread" command:
 
 At present you can only operate on one thread at a time, but the design will
 ultimately support saying "step over the function in Thread 1, and step into the
-function in Thread 2, and continue Thread 3" etc. When lldb eventually supports
+function in Thread 2, and continue Thread 3" etc. When LLDB eventually supports
 keeping some threads running while others are stopped this will be particularly
 important. For convenience, however, all the stepping commands have easy aliases.
 So ``thread continue`` is just ``c``, etc.
 
-The other program stepping commands are pretty much the same as in gdb. You have got:
+The other program stepping commands are pretty much the same as in GDB. You have got:
 
 ::
 
-   (lldb) thread step-in    // The same as gdb's "step" or "s"
-   (lldb) thread step-over  // The same as gdb's "next" or "n"
-   (lldb) thread step-out   // The same as gdb's "finish" or "f"
+   (lldb) thread step-in    // The same as GDB's "step" or "s"
+   (lldb) thread step-over  // The same as GDB's "next" or "n"
+   (lldb) thread step-out   // The same as GDB's "finish" or "f"
 
-By default, lldb does defined aliases to all common gdb process control commands
-(``s``, ``step``, ``n``, ``next``, ``finish``). If lldb is missing any, please add
+By default, LLDB does defined aliases to all common GDB process control commands
+(``s``, ``step``, ``n``, ``next``, ``finish``). If LLDB is missing any, please add
 them to your ``~/.lldbinit`` file using the ``command alias`` command.
 
-lldb also supports the step by instruction versions:
+LLDB also supports the step by instruction versions:
 
 ::
 
 
-   (lldb) thread step-inst       // The same as gdb's "stepi" / "si"
-   (lldb) thread step-over-inst  // The same as gdb's "nexti" / "ni"
+   (lldb) thread step-inst       // The same as GDB's "stepi" / "si"
+   (lldb) thread step-over-inst  // The same as GDB's "nexti" / "ni"
 
-Finally, lldb has a run until line or frame exit stepping mode:
+Finally, LLDB has a run until line or frame exit stepping mode:
 
 ::
 
@@ -530,16 +530,16 @@ Finally, lldb has a run until line or frame exit stepping mode:
 
 This command will run the thread in the current frame until it reaches line 100
 in this frame or stops if it leaves the current frame. This is a pretty close
-equivalent to gdb's ``until`` command.
+equivalent to GDB's ``until`` command.
 
-A process, by default, will share the lldb terminal with the inferior process.
-When in this mode, much like when debugging with gdb, when the process is
+A process, by default, will share the LLDB terminal with the inferior process.
+When in this mode, much like when debugging with GDB, when the process is
 running anything you type will go to the ``STDIN`` of the inferior process. To
 interrupt your inferior program, type ``CTRL+C``.
 
 If you attach to a process, or launch a process with the ``--no-stdin`` option,
 the command interpreter is always available to enter commands. It might be a
-little disconcerting to gdb users to always have an ``(lldb)`` prompt. This allows
+little disconcerting to GDB users to always have an ``(lldb)`` prompt. This allows
 you to set a breakpoint, or use any other command without having to explicitly
 interrupt the program you are debugging:
 
@@ -563,16 +563,16 @@ and memory reading and writing (``memory [read|write] ...``).
 The question of disabling stdio when running brings up a good opportunity to
 show how to set debugger properties. If you always want to run in
 the ``--no-stdin`` mode, you can set this as a generic process property using the
-lldb ``settings`` command, which is equivalent to gdb's ``set`` command.
+LLDB ``settings`` command, which is equivalent to GDB's ``set`` command.
 In this case you would say:
 
 ::
 
    (lldb) settings set target.process.disable-stdio true
 
-Over time, gdb's ``set`` command became a wilderness of disordered options, so
-that there were useful options that even experienced gdb users did not know
-about because they were too hard to find. lldb instead organizes the settings
+Over time, GDB's ``set`` command became a wilderness of disordered options, so
+that there were useful options that even experienced GDB users did not know
+about because they were too hard to find. LLDB instead organizes the settings
 hierarchically using the structure of the basic entities in the debugger. For
 the most part anywhere you can specify a setting on a generic entity (threads,
 for example) you can also apply the option to a particular instance. You can
@@ -582,7 +582,7 @@ on the settings command explaining how it works more generally.
 Examining Thread State
 ----------------------
 
-Once you have stopped, lldb will choose a current thread, usually the one that
+Once you have stopped, LLDB will choose a current thread, usually the one that
 stopped "for a reason", and a current frame in that thread (on stop this is
 always the bottom-most frame). Many the commands for inspecting state work on
 this current thread/frame.
@@ -685,7 +685,7 @@ pointers as arrays:
    (char const *) argv[0] = 0x00007fff5fbffaf8 "/Projects/Sketch/build/Debug/Sketch.app/Contents/MacOS/Sketch"
 
 The frame variable command will also perform "object printing" operations on
-variables (currently lldb only supports ObjC printing, using the object's
+variables (currently LLDB only supports ObjC printing, using the object's
 ``description`` method. Turn this on by passing the ``-o`` flag to frame variable:
 
 ::
@@ -697,4 +697,4 @@ variables (currently lldb only supports ObjC printing, using the object's
    frame #9: 0x0000000100015ae3, where = Sketch`function1 + 33 at /Projects/Sketch/SKTFunctions.m:11
 
 You can also move up and down the stack by passing the ``--relative`` (``-r``) option.
-We also have built-in aliases ``u`` and ``d`` which behave like their gdb equivalents.
+We also have built-in aliases ``u`` and ``d`` which behave like their GDB equivalents.
diff --git a/lldb/examples/python/crashlog.py b/lldb/examples/python/crashlog.py
index c992348b24be..641b2e64d53b 100755
--- a/lldb/examples/python/crashlog.py
+++ b/lldb/examples/python/crashlog.py
@@ -252,7 +252,7 @@ class CrashLog(symbolication.Symbolicator):
                 self.idents.append(ident)
 
         def did_crash(self):
-            return self.reason is not None
+            return self.crashed
 
         def __str__(self):
             if self.app_specific_backtrace:
@@ -418,9 +418,20 @@ class CrashLog(symbolication.Symbolicator):
                         with print_lock:
                             print('falling back to binary inside "%s"' % dsym)
                         self.symfile = dsym
-                        for filename in os.listdir(dwarf_dir):
-                            self.path = os.path.join(dwarf_dir, filename)
-                            if self.find_matching_slice():
+                        # Look for the executable next to the dSYM bundle.
+                        parent_dir = os.path.dirname(dsym)
+                        executables = []
+                        for root, _, files in os.walk(parent_dir):
+                            for file in files:
+                                abs_path = os.path.join(root, file)
+                                if os.path.isfile(abs_path) and os.access(
+                                    abs_path, os.X_OK
+                                ):
+                                    executables.append(abs_path)
+                        for binary in executables:
+                            basename = os.path.basename(binary)
+                            if basename == self.identifier:
+                                self.path = binary
                                 found_matching_slice = True
                                 break
                         if found_matching_slice:
@@ -526,6 +537,49 @@ class CrashLog(symbolication.Symbolicator):
     def get_target(self):
         return self.target
 
+    def load_images(self, options, loaded_images=None):
+        if not loaded_images:
+            loaded_images = []
+        images_to_load = self.images
+        if options.load_all_images:
+            for image in self.images:
+                image.resolve = True
+        elif options.crashed_only:
+            for thread in self.threads:
+                if thread.did_crash():
+                    images_to_load = []
+                    for ident in thread.idents:
+                        for image in self.find_images_with_identifier(ident):
+                            image.resolve = True
+                            images_to_load.append(image)
+
+        futures = []
+        with tempfile.TemporaryDirectory() as obj_dir:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+
+                def add_module(image, target, obj_dir):
+                    return image, image.add_module(target, obj_dir)
+
+                for image in images_to_load:
+                    if image not in loaded_images:
+                        if image.uuid == uuid.UUID(int=0):
+                            continue
+                        futures.append(
+                            executor.submit(
+                                add_module,
+                                image=image,
+                                target=self.target,
+                                obj_dir=obj_dir,
+                            )
+                        )
+
+                for future in concurrent.futures.as_completed(futures):
+                    image, err = future.result()
+                    if err:
+                        print(err)
+                    else:
+                        loaded_images.append(image)
+
 
 class CrashLogFormatException(Exception):
     pass
@@ -1408,36 +1462,7 @@ def SymbolicateCrashLog(crash_log, options):
     if not target:
         return
 
-    if options.load_all_images:
-        for image in crash_log.images:
-            image.resolve = True
-    elif options.crashed_only:
-        for thread in crash_log.threads:
-            if thread.did_crash():
-                for ident in thread.idents:
-                    for image in crash_log.find_images_with_identifier(ident):
-                        image.resolve = True
-
-    futures = []
-    loaded_images = []
-    with tempfile.TemporaryDirectory() as obj_dir:
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-
-            def add_module(image, target, obj_dir):
-                return image, image.add_module(target, obj_dir)
-
-            for image in crash_log.images:
-                futures.append(
-                    executor.submit(
-                        add_module, image=image, target=target, obj_dir=obj_dir
-                    )
-                )
-            for future in concurrent.futures.as_completed(futures):
-                image, err = future.result()
-                if err:
-                    print(err)
-                else:
-                    loaded_images.append(image)
+    crash_log.load_images(options)
 
     if crash_log.backtraces:
         for thread in crash_log.backtraces:
@@ -1469,6 +1494,7 @@ def load_crashlog_in_scripted_process(debugger, crashlog_path, options, result):
             raise InteractiveCrashLogException(
                 "couldn't create target provided by the user (%s)" % options.target_path
             )
+        crashlog.target = target
 
     # 2. If the user didn't provide a target, try to create a target using the symbolicator
     if not target or not target.IsValid():
@@ -1498,7 +1524,11 @@ def load_crashlog_in_scripted_process(debugger, crashlog_path, options, result):
     structured_data = lldb.SBStructuredData()
     structured_data.SetFromJSON(
         json.dumps(
-            {"file_path": crashlog_path, "load_all_images": options.load_all_images}
+            {
+                "file_path": crashlog_path,
+                "load_all_images": options.load_all_images,
+                "crashed_only": options.crashed_only,
+            }
         )
     )
     launch_info = lldb.SBLaunchInfo(None)
@@ -1631,7 +1661,8 @@ def CreateSymbolicateCrashLogOptions(
         "--no-crashed-only",
         action="store_false",
         dest="crashed_only",
-        help="do not symbolicate the crashed thread",
+        help="in batch mode, symbolicate all threads, not only the crashed one",
+        default=False,
     )
     arg_parser.add_argument(
         "--disasm-depth",
diff --git a/lldb/examples/python/crashlog_scripted_process.py b/lldb/examples/python/crashlog_scripted_process.py
index c69985b1a072..26c5c37b7371 100644
--- a/lldb/examples/python/crashlog_scripted_process.py
+++ b/lldb/examples/python/crashlog_scripted_process.py
@@ -29,27 +29,7 @@ class CrashLogScriptedProcess(ScriptedProcess):
         if hasattr(self.crashlog, "asb"):
             self.extended_thread_info = self.crashlog.asb
 
-        if self.load_all_images:
-            for image in self.crashlog.images:
-                image.resolve = True
-        else:
-            for thread in self.crashlog.threads:
-                if thread.did_crash():
-                    for ident in thread.idents:
-                        for image in self.crashlog.find_images_with_identifier(ident):
-                            image.resolve = True
-
-        with tempfile.TemporaryDirectory() as obj_dir:
-            for image in self.crashlog.images:
-                if image not in self.loaded_images:
-                    if image.uuid == uuid.UUID(int=0):
-                        continue
-                    err = image.add_module(self.target, obj_dir)
-                    if err:
-                        # Append to SBCommandReturnObject
-                        print(err)
-                    else:
-                        self.loaded_images.append(image)
+        crashlog.load_images(self.options, self.loaded_images)
 
         for thread in self.crashlog.threads:
             if (
@@ -70,6 +50,10 @@ class CrashLogScriptedProcess(ScriptedProcess):
                 self.app_specific_thread, self.addr_mask, self.target
             )
 
+    class CrashLogOptions:
+        load_all_images = False
+        crashed_only = True
+
     def __init__(self, exe_ctx: lldb.SBExecutionContext, args: lldb.SBStructuredData):
         super().__init__(exe_ctx, args)
 
@@ -88,13 +72,17 @@ class CrashLogScriptedProcess(ScriptedProcess):
             # Return error
             return
 
+        self.options = self.CrashLogOptions()
+
         load_all_images = args.GetValueForKey("load_all_images")
         if load_all_images and load_all_images.IsValid():
             if load_all_images.GetType() == lldb.eStructuredDataTypeBoolean:
-                self.load_all_images = load_all_images.GetBooleanValue()
+                self.options.load_all_images = load_all_images.GetBooleanValue()
 
-        if not self.load_all_images:
-            self.load_all_images = False
+        crashed_only = args.GetValueForKey("crashed_only")
+        if crashed_only and crashed_only.IsValid():
+            if crashed_only.GetType() == lldb.eStructuredDataTypeBoolean:
+                self.options.crashed_only = crashed_only.GetBooleanValue()
 
         self.pid = super().get_process_id()
         self.crashed_thread_idx = 0
@@ -159,7 +147,7 @@ class CrashLogScriptedThread(ScriptedThread):
         return frames
 
     def create_stackframes(self):
-        if not (self.originating_process.load_all_images or self.has_crashed):
+        if not (self.originating_process.options.load_all_images or self.has_crashed):
             return None
 
         if not self.backing_thread or not len(self.backing_thread.frames):
diff --git a/lldb/include/lldb/API/SBType.h b/lldb/include/lldb/API/SBType.h
index 5b9ff2170b2b..63ba91082d57 100644
--- a/lldb/include/lldb/API/SBType.h
+++ b/lldb/include/lldb/API/SBType.h
@@ -150,6 +150,8 @@ public:
 
   uint64_t GetByteSize();
 
+  uint64_t GetByteAlign();
+
   bool IsPointerType();
 
   bool IsReferenceType();
diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h
index 49ff0737acef..ea994bf8c28d 100644
--- a/lldb/include/lldb/Core/Debugger.h
+++ b/lldb/include/lldb/Core/Debugger.h
@@ -78,15 +78,6 @@ class Debugger : public std::enable_shared_from_this<Debugger>,
                  public UserID,
                  public Properties {
 public:
-  /// Broadcaster event bits definitions.
-  enum {
-    eBroadcastBitProgress = (1 << 0),
-    eBroadcastBitWarning = (1 << 1),
-    eBroadcastBitError = (1 << 2),
-    eBroadcastSymbolChange = (1 << 3),
-    eBroadcastBitProgressCategory = (1 << 4),
-  };
-
   using DebuggerList = std::vector<lldb::DebuggerSP>;
 
   static llvm::StringRef GetStaticBroadcasterClass();
@@ -628,10 +619,9 @@ protected:
   ReportProgress(uint64_t progress_id, std::string title, std::string details,
                  uint64_t completed, uint64_t total,
                  std::optional<lldb::user_id_t> debugger_id,
-                 uint32_t progress_category_bit = eBroadcastBitProgress);
+                 uint32_t progress_category_bit = lldb::eBroadcastBitProgress);
 
-  static void ReportDiagnosticImpl(DiagnosticEventData::Type type,
-                                   std::string message,
+  static void ReportDiagnosticImpl(lldb::Severity severity, std::string message,
                                    std::optional<lldb::user_id_t> debugger_id,
                                    std::once_flag *once);
 
diff --git a/lldb/include/lldb/Core/DebuggerEvents.h b/lldb/include/lldb/Core/DebuggerEvents.h
index 74bb05e6e6bf..49a4ecf8e537 100644
--- a/lldb/include/lldb/Core/DebuggerEvents.h
+++ b/lldb/include/lldb/Core/DebuggerEvents.h
@@ -76,19 +76,15 @@ private:
 
 class DiagnosticEventData : public EventData {
 public:
-  enum class Type {
-    Info,
-    Warning,
-    Error,
-  };
-  DiagnosticEventData(Type type, std::string message, bool debugger_specific)
-      : m_message(std::move(message)), m_type(type),
+  DiagnosticEventData(lldb::Severity severity, std::string message,
+                      bool debugger_specific)
+      : m_message(std::move(message)), m_severity(severity),
         m_debugger_specific(debugger_specific) {}
   ~DiagnosticEventData() override = default;
 
   const std::string &GetMessage() const { return m_message; }
   bool IsDebuggerSpecific() const { return m_debugger_specific; }
-  Type GetType() const { return m_type; }
+  lldb::Severity GetSeverity() const { return m_severity; }
 
   llvm::StringRef GetPrefix() const;
 
@@ -105,7 +101,7 @@ public:
 
 protected:
   std::string m_message;
-  Type m_type;
+  lldb::Severity m_severity;
   const bool m_debugger_specific;
 
   DiagnosticEventData(const DiagnosticEventData &) = delete;
diff --git a/lldb/include/lldb/Expression/DiagnosticManager.h b/lldb/include/lldb/Expression/DiagnosticManager.h
index 06bf1d115f15..d49b7c99b114 100644
--- a/lldb/include/lldb/Expression/DiagnosticManager.h
+++ b/lldb/include/lldb/Expression/DiagnosticManager.h
@@ -28,12 +28,6 @@ enum DiagnosticOrigin {
   eDiagnosticOriginLLVM
 };
 
-enum DiagnosticSeverity {
-  eDiagnosticSeverityError,
-  eDiagnosticSeverityWarning,
-  eDiagnosticSeverityRemark
-};
-
 const uint32_t LLDB_INVALID_COMPILER_ID = UINT32_MAX;
 
 class Diagnostic {
@@ -55,7 +49,7 @@ public:
     }
   }
 
-  Diagnostic(llvm::StringRef message, DiagnosticSeverity severity,
+  Diagnostic(llvm::StringRef message, lldb::Severity severity,
              DiagnosticOrigin origin, uint32_t compiler_id)
       : m_message(message), m_severity(severity), m_origin(origin),
         m_compiler_id(compiler_id) {}
@@ -68,7 +62,7 @@ public:
 
   virtual bool HasFixIts() const { return false; }
 
-  DiagnosticSeverity GetSeverity() const { return m_severity; }
+  lldb::Severity GetSeverity() const { return m_severity; }
 
   uint32_t GetCompilerID() const { return m_compiler_id; }
 
@@ -83,7 +77,7 @@ public:
 
 protected:
   std::string m_message;
-  DiagnosticSeverity m_severity;
+  lldb::Severity m_severity;
   DiagnosticOrigin m_origin;
   uint32_t m_compiler_id; // Compiler-specific diagnostic ID
 };
@@ -106,7 +100,7 @@ public:
                         });
   }
 
-  void AddDiagnostic(llvm::StringRef message, DiagnosticSeverity severity,
+  void AddDiagnostic(llvm::StringRef message, lldb::Severity severity,
                      DiagnosticOrigin origin,
                      uint32_t compiler_id = LLDB_INVALID_COMPILER_ID) {
     m_diagnostics.emplace_back(
@@ -127,9 +121,9 @@ public:
     other.Clear();
   }
 
-  size_t Printf(DiagnosticSeverity severity, const char *format, ...)
+  size_t Printf(lldb::Severity severity, const char *format, ...)
       __attribute__((format(printf, 3, 4)));
-  void PutString(DiagnosticSeverity severity, llvm::StringRef str);
+  void PutString(lldb::Severity severity, llvm::StringRef str);
 
   void AppendMessageToDiagnostic(llvm::StringRef str) {
     if (!m_diagnostics.empty())
diff --git a/lldb/include/lldb/Host/Host.h b/lldb/include/lldb/Host/Host.h
index 30549cd78914..9d0994978402 100644
--- a/lldb/include/lldb/Host/Host.h
+++ b/lldb/include/lldb/Host/Host.h
@@ -88,7 +88,7 @@ public:
                               lldb::pid_t pid);
 
   /// Emit the given message to the operating system log.
-  static void SystemLog(llvm::StringRef message);
+  static void SystemLog(lldb::Severity severity, llvm::StringRef message);
 
   /// Get the process ID for the calling process.
   ///
diff --git a/lldb/include/lldb/Symbol/SymbolContext.h b/lldb/include/lldb/Symbol/SymbolContext.h
index bd33a71b46ca..0bc707070f85 100644
--- a/lldb/include/lldb/Symbol/SymbolContext.h
+++ b/lldb/include/lldb/Symbol/SymbolContext.h
@@ -158,6 +158,7 @@ public:
       Stream *s, ExecutionContextScope *exe_scope, const Address &so_addr,
       bool show_fullpaths, bool show_module, bool show_inlined_frames,
       bool show_function_arguments, bool show_function_name,
+      bool show_function_display_name = false,
       std::optional<Stream::HighlightSettings> settings = std::nullopt) const;
 
   /// Get the address range contained within a symbol context.
diff --git a/lldb/include/lldb/Symbol/Type.h b/lldb/include/lldb/Symbol/Type.h
index 1c4f7b5601b0..7aa0852676e4 100644
--- a/lldb/include/lldb/Symbol/Type.h
+++ b/lldb/include/lldb/Symbol/Type.h
@@ -21,6 +21,8 @@
 
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLForwardCompat.h"
+#include "llvm/Support/raw_ostream.h"
 
 #include <optional>
 #include <set>
@@ -492,12 +494,37 @@ public:
 
   static int Compare(const Type &a, const Type &b);
 
+  // Represents a parsed type name coming out of GetTypeScopeAndBasename. The
+  // structure holds StringRefs pointing to portions of the original name, and
+  // so must not be used after the name is destroyed.
+  struct ParsedName {
+    lldb::TypeClass type_class = lldb::eTypeClassAny;
+
+    // Scopes of the type, starting with the outermost. Absolute type references
+    // have a "::" as the first scope.
+    llvm::SmallVector<llvm::StringRef> scope;
+
+    llvm::StringRef basename;
+
+    friend bool operator==(const ParsedName &lhs, const ParsedName &rhs) {
+      return lhs.type_class == rhs.type_class && lhs.scope == rhs.scope &&
+             lhs.basename == rhs.basename;
+    }
+
+    friend llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
+                                         const ParsedName &name) {
+      return os << llvm::formatv(
+                 "Type::ParsedName({0:x}, [{1}], {2})",
+                 llvm::to_underlying(name.type_class),
+                 llvm::make_range(name.scope.begin(), name.scope.end()),
+                 name.basename);
+    }
+  };
   // From a fully qualified typename, split the type into the type basename and
   // the remaining type scope (namespaces/classes).
-  static bool GetTypeScopeAndBasename(llvm::StringRef name,
-                                      llvm::StringRef &scope,
-                                      llvm::StringRef &basename,
-                                      lldb::TypeClass &type_class);
+  static std::optional<ParsedName>
+  GetTypeScopeAndBasename(llvm::StringRef name);
+
   void SetEncodingType(Type *encoding_type) { m_encoding_type = encoding_type; }
 
   uint32_t GetEncodingMask();
diff --git a/lldb/include/lldb/Symbol/TypeList.h b/lldb/include/lldb/Symbol/TypeList.h
index 403469c989f5..d58772ad5b62 100644
--- a/lldb/include/lldb/Symbol/TypeList.h
+++ b/lldb/include/lldb/Symbol/TypeList.h
@@ -49,15 +49,6 @@ public:
 
   void ForEach(std::function<bool(lldb::TypeSP &type_sp)> const &callback);
 
-  void RemoveMismatchedTypes(llvm::StringRef qualified_typename,
-                             bool exact_match);
-
-  void RemoveMismatchedTypes(llvm::StringRef type_scope,
-                             llvm::StringRef type_basename,
-                             lldb::TypeClass type_class, bool exact_match);
-
-  void RemoveMismatchedTypes(lldb::TypeClass type_class);
-
 private:
   typedef collection::iterator iterator;
   typedef collection::const_iterator const_iterator;
diff --git a/lldb/include/lldb/Symbol/TypeMap.h b/lldb/include/lldb/Symbol/TypeMap.h
index 433711875e55..89011efab5c3 100644
--- a/lldb/include/lldb/Symbol/TypeMap.h
+++ b/lldb/include/lldb/Symbol/TypeMap.h
@@ -55,10 +55,6 @@ public:
 
   bool Remove(const lldb::TypeSP &type_sp);
 
-  void RemoveMismatchedTypes(llvm::StringRef type_scope,
-                             llvm::StringRef type_basename,
-                             lldb::TypeClass type_class, bool exact_match);
-
 private:
   typedef collection::iterator iterator;
   typedef collection::const_iterator const_iterator;
diff --git a/lldb/include/lldb/Target/Language.h b/lldb/include/lldb/Target/Language.h
index 67714e6fdf94..ff7c60bf68bf 100644
--- a/lldb/include/lldb/Target/Language.h
+++ b/lldb/include/lldb/Target/Language.h
@@ -281,6 +281,10 @@ public:
     return mangled.GetMangledName();
   }
 
+  virtual ConstString GetDisplayDemangledName(Mangled mangled) const {
+    return mangled.GetDemangledName();
+  }
+
   virtual void GetExceptionResolverDescription(bool catch_on, bool throw_on,
                                                Stream &s);
 
diff --git a/lldb/include/lldb/Target/Platform.h b/lldb/include/lldb/Target/Platform.h
index ad9c9dcbe684..e05c79cb501b 100644
--- a/lldb/include/lldb/Target/Platform.h
+++ b/lldb/include/lldb/Target/Platform.h
@@ -649,8 +649,8 @@ public:
 
   virtual std::string GetPlatformSpecificConnectionInformation() { return ""; }
 
-  virtual bool CalculateMD5(const FileSpec &file_spec, uint64_t &low,
-                            uint64_t &high);
+  virtual llvm::ErrorOr<llvm::MD5::MD5Result>
+  CalculateMD5(const FileSpec &file_spec);
 
   virtual uint32_t GetResumeCountForLaunchInfo(ProcessLaunchInfo &launch_info) {
     return 1;
diff --git a/lldb/include/lldb/Target/RemoteAwarePlatform.h b/lldb/include/lldb/Target/RemoteAwarePlatform.h
index d183815e1c8b..0b9d79f9ff03 100644
--- a/lldb/include/lldb/Target/RemoteAwarePlatform.h
+++ b/lldb/include/lldb/Target/RemoteAwarePlatform.h
@@ -58,8 +58,8 @@ public:
   Status SetFilePermissions(const FileSpec &file_spec,
                             uint32_t file_permissions) override;
 
-  bool CalculateMD5(const FileSpec &file_spec, uint64_t &low,
-                    uint64_t &high) override;
+  llvm::ErrorOr<llvm::MD5::MD5Result>
+  CalculateMD5(const FileSpec &file_spec) override;
 
   Status GetFileWithUUID(const FileSpec &platform_file, const UUID *uuid,
                          FileSpec &local_file) override;
diff --git a/lldb/include/lldb/Utility/Log.h b/lldb/include/lldb/Utility/Log.h
index 01876ad732d4..27707c17f9b8 100644
--- a/lldb/include/lldb/Utility/Log.h
+++ b/lldb/include/lldb/Utility/Log.h
@@ -112,6 +112,23 @@ private:
   static char ID;
 };
 
+/// A T-style log handler that multiplexes messages to two log handlers.
+class TeeLogHandler : public LogHandler {
+public:
+  TeeLogHandler(std::shared_ptr<LogHandler> first_log_handler,
+                std::shared_ptr<LogHandler> second_log_handler);
+
+  void Emit(llvm::StringRef message) override;
+
+  bool isA(const void *ClassID) const override { return ClassID == &ID; }
+  static bool classof(const LogHandler *obj) { return obj->isA(&ID); }
+
+private:
+  std::shared_ptr<LogHandler> m_first_log_handler;
+  std::shared_ptr<LogHandler> m_second_log_handler;
+  static char ID;
+};
+
 class Log final {
 public:
   /// The underlying type of all log channel enums. Declare them as:
diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h
index 15e458571860..8e05f6ba9c87 100644
--- a/lldb/include/lldb/lldb-enumerations.h
+++ b/lldb/include/lldb/lldb-enumerations.h
@@ -1344,7 +1344,15 @@ enum DebuggerBroadcastBit {
   eBroadcastBitProgress = (1 << 0),
   eBroadcastBitWarning = (1 << 1),
   eBroadcastBitError = (1 << 2),
-  eBroadcastBitProgressCategory = (1 << 3),
+  eBroadcastSymbolChange = (1 << 3),
+  eBroadcastBitProgressCategory = (1 << 4),
+};
+
+/// Used for expressing severity in logs and diagnostics.
+enum Severity {
+  eSeverityError,
+  eSeverityWarning,
+  eSeverityInfo, // Equivalent to Remark used in clang.
 };
 
 } // namespace lldb
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index 5838281bcb1a..e2126d67a5fe 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -448,7 +448,7 @@ class DebugCommunication(object):
         response = self.request_completions(text, frameId)
         return response["body"]["targets"]
 
-    def get_scope_variables(self, scope_name, frameIndex=0, threadId=None):
+    def get_scope_variables(self, scope_name, frameIndex=0, threadId=None, is_hex=None):
         stackFrame = self.get_stackFrame(frameIndex=frameIndex, threadId=threadId)
         if stackFrame is None:
             return []
@@ -462,7 +462,7 @@ class DebugCommunication(object):
         for scope in frame_scopes:
             if scope["name"] == scope_name:
                 varRef = scope["variablesReference"]
-                variables_response = self.request_variables(varRef)
+                variables_response = self.request_variables(varRef, is_hex=is_hex)
                 if variables_response:
                     if "body" in variables_response:
                         body = variables_response["body"]
@@ -476,9 +476,9 @@ class DebugCommunication(object):
             "Globals", frameIndex=frameIndex, threadId=threadId
         )
 
-    def get_local_variables(self, frameIndex=0, threadId=None):
+    def get_local_variables(self, frameIndex=0, threadId=None, is_hex=None):
         return self.get_scope_variables(
-            "Locals", frameIndex=frameIndex, threadId=threadId
+            "Locals", frameIndex=frameIndex, threadId=threadId, is_hex=is_hex
         )
 
     def get_registers(self, frameIndex=0, threadId=None):
@@ -486,28 +486,32 @@ class DebugCommunication(object):
             "Registers", frameIndex=frameIndex, threadId=threadId
         )
 
-    def get_local_variable(self, name, frameIndex=0, threadId=None):
-        locals = self.get_local_variables(frameIndex=frameIndex, threadId=threadId)
+    def get_local_variable(self, name, frameIndex=0, threadId=None, is_hex=None):
+        locals = self.get_local_variables(
+            frameIndex=frameIndex, threadId=threadId, is_hex=is_hex
+        )
         for local in locals:
             if "name" in local and local["name"] == name:
                 return local
         return None
 
-    def get_local_variable_value(self, name, frameIndex=0, threadId=None):
+    def get_local_variable_value(self, name, frameIndex=0, threadId=None, is_hex=None):
         variable = self.get_local_variable(
-            name, frameIndex=frameIndex, threadId=threadId
+            name, frameIndex=frameIndex, threadId=threadId, is_hex=is_hex
         )
         if variable and "value" in variable:
             return variable["value"]
         return None
 
-    def get_local_variable_child(self, name, child_name, frameIndex=0, threadId=None):
+    def get_local_variable_child(
+        self, name, child_name, frameIndex=0, threadId=None, is_hex=None
+    ):
         local = self.get_local_variable(name, frameIndex, threadId)
         if local["variablesReference"] == 0:
             return None
-        children = self.request_variables(local["variablesReference"])["body"][
-            "variables"
-        ]
+        children = self.request_variables(local["variablesReference"], is_hex=is_hex)[
+            "body"
+        ]["variables"]
         for child in children:
             if child["name"] == child_name:
                 return child
@@ -1035,12 +1039,16 @@ class DebugCommunication(object):
             self.threads = None
         return response
 
-    def request_variables(self, variablesReference, start=None, count=None):
+    def request_variables(
+        self, variablesReference, start=None, count=None, is_hex=None
+    ):
         args_dict = {"variablesReference": variablesReference}
         if start is not None:
             args_dict["start"] = start
         if count is not None:
             args_dict["count"] = count
+        if is_hex is not None:
+            args_dict["format"] = {"hex": is_hex}
         command_dict = {
             "command": "variables",
             "type": "request",
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
index 75522158b322..8c8e4abed0b4 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
@@ -130,9 +130,9 @@ class GdbRemoteTestCaseBase(Base, metaclass=GdbRemoteTestCaseFactory):
         self.stub_sends_two_stop_notifications_on_kill = False
         if configuration.lldb_platform_url:
             if configuration.lldb_platform_url.startswith("unix-"):
-                url_pattern = "(.+)://\[?(.+?)\]?/.*"
+                url_pattern = r"(.+)://\[?(.+?)\]?/.*"
             else:
-                url_pattern = "(.+)://(.+):\d+"
+                url_pattern = r"(.+)://(.+):\d+"
             scheme, host = re.match(
                 url_pattern, configuration.lldb_platform_url
             ).groups()
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/lldbgdbserverutils.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/lldbgdbserverutils.py
index 61c5c3a7c865..d1a4119bac78 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/lldbgdbserverutils.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/lldbgdbserverutils.py
@@ -50,7 +50,7 @@ def get_debugserver_exe():
 
 
 _LOG_LINE_REGEX = re.compile(
-    r"^(lldb-server|debugserver)\s+<\s*(\d+)>" + "\s+(read|send)\s+packet:\s+(.+)$"
+    r"^(lldb-server|debugserver)\s+<\s*(\d+)>\s+(read|send)\s+packet:\s+(.+)$"
 )
 
 
diff --git a/lldb/scripts/generate-sbapi-dwarf-enum.py b/lldb/scripts/generate-sbapi-dwarf-enum.py
index 464eb2afff7d..7fd603798631 100755
--- a/lldb/scripts/generate-sbapi-dwarf-enum.py
+++ b/lldb/scripts/generate-sbapi-dwarf-enum.py
@@ -2,6 +2,7 @@
 
 import argparse
 import re
+import os
 
 HEADER = """\
 //===-- SBLanguages.h -----------------------------------------*- C++ -*-===//
@@ -14,6 +15,8 @@ HEADER = """\
 
 #ifndef LLDB_API_SBLANGUAGE_H
 #define LLDB_API_SBLANGUAGE_H
+
+namespace lldb {
 /// Used by \\ref SBExpressionOptions.
 /// These enumerations use the same language enumerations as the DWARF
 /// specification for ease of use and consistency.
@@ -23,6 +26,8 @@ enum SBSourceLanguageName : uint16_t {
 FOOTER = """\
 };
 
+} // namespace lldb
+
 #endif
 """
 
@@ -37,6 +42,9 @@ def emit_enum(input, output):
     with open(input, "r") as f:
         lines = f.readlines()
 
+    # Create output folder if it does not exist
+    os.makedirs(os.path.dirname(output), exist_ok=True)
+
     # Write the output.
     with open(output, "w") as f:
         # Emit the header.
diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt
index a64c0d4a3334..aa31caddfde3 100644
--- a/lldb/source/API/CMakeLists.txt
+++ b/lldb/source/API/CMakeLists.txt
@@ -20,19 +20,21 @@ if(LLDB_ENABLE_LUA)
   set(lldb_lua_wrapper ${lua_bindings_dir}/LLDBWrapLua.cpp)
 endif()
 
-# Target to generate SBLanguages.h from Dwarf.def.
+# Generate SBLanguages.h from Dwarf.def.
 set(sb_languages_file
   ${CMAKE_CURRENT_BINARY_DIR}/../../include/lldb/API/SBLanguages.h)
-add_custom_target(
-  lldb-sbapi-dwarf-enums
-  "${Python3_EXECUTABLE}"
+add_custom_command(
+  COMMENT "Generating SBLanguages.h from Dwarf.def"
+  COMMAND "${Python3_EXECUTABLE}"
       ${LLDB_SOURCE_DIR}/scripts/generate-sbapi-dwarf-enum.py
       ${LLVM_MAIN_INCLUDE_DIR}/llvm/BinaryFormat/Dwarf.def
       -o ${sb_languages_file}
-  BYPRODUCTS ${sb_languages_file}
+  OUTPUT ${sb_languages_file}
   DEPENDS ${LLVM_MAIN_INCLUDE_DIR}/llvm/BinaryFormat/Dwarf.def
   WORKING_DIRECTORY ${LLVM_LIBRARY_OUTPUT_INTDIR}
 )
+add_custom_target(lldb-sbapi-dwarf-enums
+  DEPENDS ${sb_languages_file})
 
 add_lldb_library(liblldb SHARED ${option_framework}
   SBAddress.cpp
diff --git a/lldb/source/API/SBType.cpp b/lldb/source/API/SBType.cpp
index 6cecb5c9ea81..8a063e5ad61d 100644
--- a/lldb/source/API/SBType.cpp
+++ b/lldb/source/API/SBType.cpp
@@ -25,6 +25,7 @@
 #include "lldb/Utility/Stream.h"
 
 #include "llvm/ADT/APSInt.h"
+#include "llvm/Support/MathExtras.h"
 
 #include <memory>
 #include <optional>
@@ -132,6 +133,18 @@ uint64_t SBType::GetByteSize() {
   return 0;
 }
 
+uint64_t SBType::GetByteAlign() {
+  LLDB_INSTRUMENT_VA(this);
+
+  if (!IsValid())
+    return 0;
+
+  std::optional<uint64_t> bit_align =
+      m_opaque_sp->GetCompilerType(/*prefer_dynamic=*/false)
+          .GetTypeBitAlign(nullptr);
+  return llvm::divideCeil(bit_align.value_or(0), 8);
+}
+
 bool SBType::IsPointerType() {
   LLDB_INSTRUMENT_VA(this);
 
diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp
index b48ec1398d63..41911fad41c6 100644
--- a/lldb/source/Breakpoint/BreakpointLocation.cpp
+++ b/lldb/source/Breakpoint/BreakpointLocation.cpp
@@ -507,7 +507,7 @@ void BreakpointLocation::GetDescription(Stream *s,
       else
         s->PutCString("where = ");
       sc.DumpStopContext(s, m_owner.GetTarget().GetProcessSP().get(), m_address,
-                         false, true, false, true, true);
+                         false, true, false, true, true, true);
     } else {
       if (sc.module_sp) {
         s->EOL();
diff --git a/lldb/source/Core/Address.cpp b/lldb/source/Core/Address.cpp
index b23398883fa5..5a4751bd5256 100644
--- a/lldb/source/Core/Address.cpp
+++ b/lldb/source/Core/Address.cpp
@@ -645,7 +645,8 @@ bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style,
                     pointer_sc.symbol != nullptr) {
                   s->PutCString(": ");
                   pointer_sc.DumpStopContext(s, exe_scope, so_addr, true, false,
-                                             false, true, true, settings);
+                                             false, true, true, false,
+                                             settings);
                 }
               }
             }
@@ -685,7 +686,7 @@ bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style,
               sc.DumpStopContext(s, exe_scope, *this, show_fullpaths,
                                  show_module, show_inlined_frames,
                                  show_function_arguments, show_function_name,
-                                 settings);
+                                 false, settings);
             } else {
               // We found a symbol but it was in a different section so it
               // isn't the symbol we should be showing, just show the section
diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp
index cac4642873b7..9951fbcd3e7c 100644
--- a/lldb/source/Core/Debugger.cpp
+++ b/lldb/source/Core/Debugger.cpp
@@ -1476,20 +1476,19 @@ void Debugger::ReportProgress(uint64_t progress_id, std::string title,
   }
 }
 
-static void PrivateReportDiagnostic(Debugger &debugger,
-                                    DiagnosticEventData::Type type,
+static void PrivateReportDiagnostic(Debugger &debugger, Severity severity,
                                     std::string message,
                                     bool debugger_specific) {
   uint32_t event_type = 0;
-  switch (type) {
-  case DiagnosticEventData::Type::Info:
-    assert(false && "DiagnosticEventData::Type::Info should not be broadcast");
+  switch (severity) {
+  case eSeverityInfo:
+    assert(false && "eSeverityInfo should not be broadcast");
     return;
-  case DiagnosticEventData::Type::Warning:
-    event_type = Debugger::eBroadcastBitWarning;
+  case eSeverityWarning:
+    event_type = lldb::eBroadcastBitWarning;
     break;
-  case DiagnosticEventData::Type::Error:
-    event_type = Debugger::eBroadcastBitError;
+  case eSeverityError:
+    event_type = lldb::eBroadcastBitError;
     break;
   }
 
@@ -1497,29 +1496,32 @@ static void PrivateReportDiagnostic(Debugger &debugger,
   if (!broadcaster.EventTypeHasListeners(event_type)) {
     // Diagnostics are too important to drop. If nobody is listening, print the
     // diagnostic directly to the debugger's error stream.
-    DiagnosticEventData event_data(type, std::move(message), debugger_specific);
+    DiagnosticEventData event_data(severity, std::move(message),
+                                   debugger_specific);
     StreamSP stream = debugger.GetAsyncErrorStream();
     event_data.Dump(stream.get());
     return;
   }
   EventSP event_sp = std::make_shared<Event>(
       event_type,
-      new DiagnosticEventData(type, std::move(message), debugger_specific));
+      new DiagnosticEventData(severity, std::move(message), debugger_specific));
   broadcaster.BroadcastEvent(event_sp);
 }
 
-void Debugger::ReportDiagnosticImpl(DiagnosticEventData::Type type,
-                                    std::string message,
+void Debugger::ReportDiagnosticImpl(Severity severity, std::string message,
                                     std::optional<lldb::user_id_t> debugger_id,
                                     std::once_flag *once) {
   auto ReportDiagnosticLambda = [&]() {
+    // Always log diagnostics to the system log.
+    Host::SystemLog(severity, message);
+
     // The diagnostic subsystem is optional but we still want to broadcast
     // events when it's disabled.
     if (Diagnostics::Enabled())
       Diagnostics::Instance().Report(message);
 
     // We don't broadcast info events.
-    if (type == DiagnosticEventData::Type::Info)
+    if (severity == lldb::eSeverityInfo)
       return;
 
     // Check if this diagnostic is for a specific debugger.
@@ -1528,7 +1530,8 @@ void Debugger::ReportDiagnosticImpl(DiagnosticEventData::Type type,
       // still exists.
       DebuggerSP debugger_sp = FindDebuggerWithID(*debugger_id);
       if (debugger_sp)
-        PrivateReportDiagnostic(*debugger_sp, type, std::move(message), true);
+        PrivateReportDiagnostic(*debugger_sp, severity, std::move(message),
+                                true);
       return;
     }
     // The diagnostic event is not debugger specific, iterate over all debuggers
@@ -1536,7 +1539,7 @@ void Debugger::ReportDiagnosticImpl(DiagnosticEventData::Type type,
     if (g_debugger_list_ptr && g_debugger_list_mutex_ptr) {
       std::lock_guard<std::recursive_mutex> guard(*g_debugger_list_mutex_ptr);
       for (const auto &debugger : *g_debugger_list_ptr)
-        PrivateReportDiagnostic(*debugger, type, message, false);
+        PrivateReportDiagnostic(*debugger, severity, message, false);
     }
   };
 
@@ -1549,22 +1552,19 @@ void Debugger::ReportDiagnosticImpl(DiagnosticEventData::Type type,
 void Debugger::ReportWarning(std::string message,
                              std::optional<lldb::user_id_t> debugger_id,
                              std::once_flag *once) {
-  ReportDiagnosticImpl(DiagnosticEventData::Type::Warning, std::move(message),
-                       debugger_id, once);
+  ReportDiagnosticImpl(eSeverityWarning, std::move(message), debugger_id, once);
 }
 
 void Debugger::ReportError(std::string message,
                            std::optional<lldb::user_id_t> debugger_id,
                            std::once_flag *once) {
-  ReportDiagnosticImpl(DiagnosticEventData::Type::Error, std::move(message),
-                       debugger_id, once);
+  ReportDiagnosticImpl(eSeverityError, std::move(message), debugger_id, once);
 }
 
 void Debugger::ReportInfo(std::string message,
                           std::optional<lldb::user_id_t> debugger_id,
                           std::once_flag *once) {
-  ReportDiagnosticImpl(DiagnosticEventData::Type::Info, std::move(message),
-                       debugger_id, once);
+  ReportDiagnosticImpl(eSeverityInfo, std::move(message), debugger_id, once);
 }
 
 void Debugger::ReportSymbolChange(const ModuleSpec &module_spec) {
@@ -1572,7 +1572,7 @@ void Debugger::ReportSymbolChange(const ModuleSpec &module_spec) {
     std::lock_guard<std::recursive_mutex> guard(*g_debugger_list_mutex_ptr);
     for (DebuggerSP debugger_sp : *g_debugger_list_ptr) {
       EventSP event_sp = std::make_shared<Event>(
-          Debugger::eBroadcastSymbolChange,
+          lldb::eBroadcastSymbolChange,
           new SymbolChangeEventData(debugger_sp, module_spec));
       debugger_sp->GetBroadcaster().BroadcastEvent(event_sp);
     }
@@ -1879,8 +1879,9 @@ lldb::thread_result_t Debugger::DefaultEventHandler() {
           CommandInterpreter::eBroadcastBitAsynchronousErrorData);
 
   listener_sp->StartListeningForEvents(
-      &m_broadcaster, eBroadcastBitProgress | eBroadcastBitWarning |
-                          eBroadcastBitError | eBroadcastSymbolChange);
+      &m_broadcaster, lldb::eBroadcastBitProgress | lldb::eBroadcastBitWarning |
+                          lldb::eBroadcastBitError |
+                          lldb::eBroadcastSymbolChange);
 
   // Let the thread that spawned us know that we have started up and that we
   // are now listening to all required events so no events get missed
@@ -1932,11 +1933,11 @@ lldb::thread_result_t Debugger::DefaultEventHandler() {
               }
             }
           } else if (broadcaster == &m_broadcaster) {
-            if (event_type & Debugger::eBroadcastBitProgress)
+            if (event_type & lldb::eBroadcastBitProgress)
               HandleProgressEvent(event_sp);
-            else if (event_type & Debugger::eBroadcastBitWarning)
+            else if (event_type & lldb::eBroadcastBitWarning)
               HandleDiagnosticEvent(event_sp);
-            else if (event_type & Debugger::eBroadcastBitError)
+            else if (event_type & lldb::eBroadcastBitError)
               HandleDiagnosticEvent(event_sp);
           }
         }
diff --git a/lldb/source/Core/DebuggerEvents.cpp b/lldb/source/Core/DebuggerEvents.cpp
index 65aed0eba9c4..2fa6efd155af 100644
--- a/lldb/source/Core/DebuggerEvents.cpp
+++ b/lldb/source/Core/DebuggerEvents.cpp
@@ -73,19 +73,19 @@ ProgressEventData::GetAsStructuredData(const Event *event_ptr) {
 }
 
 llvm::StringRef DiagnosticEventData::GetPrefix() const {
-  switch (m_type) {
-  case Type::Info:
+  switch (m_severity) {
+  case Severity::eSeverityInfo:
     return "info";
-  case Type::Warning:
+  case Severity::eSeverityWarning:
     return "warning";
-  case Type::Error:
+  case Severity::eSeverityError:
     return "error";
   }
   llvm_unreachable("Fully covered switch above!");
 }
 
 void DiagnosticEventData::Dump(Stream *s) const {
-  llvm::HighlightColor color = m_type == Type::Warning
+  llvm::HighlightColor color = m_severity == lldb::eSeverityWarning
                                    ? llvm::HighlightColor::Warning
                                    : llvm::HighlightColor::Error;
   llvm::WithColor(s->AsRawOstream(), color, llvm::ColorMode::Enable)
diff --git a/lldb/source/Core/Mangled.cpp b/lldb/source/Core/Mangled.cpp
index b167c51fdce2..8efc4c639cca 100644
--- a/lldb/source/Core/Mangled.cpp
+++ b/lldb/source/Core/Mangled.cpp
@@ -310,6 +310,8 @@ ConstString Mangled::GetDemangledName() const {
 }
 
 ConstString Mangled::GetDisplayDemangledName() const {
+  if (Language *lang = Language::FindPlugin(GuessLanguage()))
+    return lang->GetDisplayDemangledName(*this);
   return GetDemangledName();
 }
 
diff --git a/lldb/source/Core/Progress.cpp b/lldb/source/Core/Progress.cpp
index 161038284e21..1a779e2ddf92 100644
--- a/lldb/source/Core/Progress.cpp
+++ b/lldb/source/Core/Progress.cpp
@@ -172,7 +172,7 @@ void ProgressManager::ReportProgress(
   Debugger::ReportProgress(progress_data.progress_id, progress_data.title, "",
                            completed, Progress::kNonDeterministicTotal,
                            progress_data.debugger_id,
-                           Debugger::eBroadcastBitProgressCategory);
+                           lldb::eBroadcastBitProgressCategory);
 }
 
 void ProgressManager::Expire(llvm::StringRef key) {
diff --git a/lldb/source/Expression/DiagnosticManager.cpp b/lldb/source/Expression/DiagnosticManager.cpp
index 9a1100df78db..a8330138f3d5 100644
--- a/lldb/source/Expression/DiagnosticManager.cpp
+++ b/lldb/source/Expression/DiagnosticManager.cpp
@@ -31,17 +31,17 @@ void DiagnosticManager::Dump(Log *log) {
   log->PutCString(str.c_str());
 }
 
-static const char *StringForSeverity(DiagnosticSeverity severity) {
+static const char *StringForSeverity(lldb::Severity severity) {
   switch (severity) {
   // this should be exhaustive
-  case lldb_private::eDiagnosticSeverityError:
+  case lldb::eSeverityError:
     return "error: ";
-  case lldb_private::eDiagnosticSeverityWarning:
+  case lldb::eSeverityWarning:
     return "warning: ";
-  case lldb_private::eDiagnosticSeverityRemark:
+  case lldb::eSeverityInfo:
     return "";
   }
-  llvm_unreachable("switch needs another case for DiagnosticSeverity enum");
+  llvm_unreachable("switch needs another case for lldb::Severity enum");
 }
 
 std::string DiagnosticManager::GetString(char separator) {
@@ -65,8 +65,8 @@ std::string DiagnosticManager::GetString(char separator) {
   return ret;
 }
 
-size_t DiagnosticManager::Printf(DiagnosticSeverity severity,
-                                 const char *format, ...) {
+size_t DiagnosticManager::Printf(lldb::Severity severity, const char *format,
+                                 ...) {
   StreamString ss;
 
   va_list args;
@@ -79,7 +79,7 @@ size_t DiagnosticManager::Printf(DiagnosticSeverity severity,
   return result;
 }
 
-void DiagnosticManager::PutString(DiagnosticSeverity severity,
+void DiagnosticManager::PutString(lldb::Severity severity,
                                   llvm::StringRef str) {
   if (str.empty())
     return;
diff --git a/lldb/source/Expression/FunctionCaller.cpp b/lldb/source/Expression/FunctionCaller.cpp
index ffadbf9b32ec..5ac2b0681ebb 100644
--- a/lldb/source/Expression/FunctionCaller.cpp
+++ b/lldb/source/Expression/FunctionCaller.cpp
@@ -67,27 +67,25 @@ bool FunctionCaller::WriteFunctionWrapper(
   Process *process = exe_ctx.GetProcessPtr();
 
   if (!process) {
-    diagnostic_manager.Printf(eDiagnosticSeverityError, "no process.");
+    diagnostic_manager.Printf(lldb::eSeverityError, "no process.");
     return false;
   }
   
   lldb::ProcessSP jit_process_sp(m_jit_process_wp.lock());
 
   if (process != jit_process_sp.get()) {
-    diagnostic_manager.Printf(eDiagnosticSeverityError,
-                             "process does not match the stored process.");
+    diagnostic_manager.Printf(lldb::eSeverityError,
+                              "process does not match the stored process.");
     return false;
   }
     
   if (process->GetState() != lldb::eStateStopped) {
-    diagnostic_manager.Printf(eDiagnosticSeverityError, 
-                              "process is not stopped");
+    diagnostic_manager.Printf(lldb::eSeverityError, "process is not stopped");
     return false;
   }
 
   if (!m_compiled) {
-    diagnostic_manager.Printf(eDiagnosticSeverityError, 
-                              "function not compiled");
+    diagnostic_manager.Printf(lldb::eSeverityError, "function not compiled");
     return false;
   }
   
@@ -101,7 +99,7 @@ bool FunctionCaller::WriteFunctionWrapper(
       can_interpret, eExecutionPolicyAlways));
 
   if (!jit_error.Success()) {
-    diagnostic_manager.Printf(eDiagnosticSeverityError,
+    diagnostic_manager.Printf(lldb::eSeverityError,
                               "Error in PrepareForExecution: %s.",
                               jit_error.AsCString());
     return false;
@@ -144,7 +142,7 @@ bool FunctionCaller::WriteFunctionArguments(
   // All the information to reconstruct the struct is provided by the
   // StructExtractor.
   if (!m_struct_valid) {
-    diagnostic_manager.PutString(eDiagnosticSeverityError,
+    diagnostic_manager.PutString(lldb::eSeverityError,
                                  "Argument information was not correctly "
                                  "parsed, so the function cannot be called.");
     return false;
@@ -192,7 +190,7 @@ bool FunctionCaller::WriteFunctionArguments(
   size_t num_args = arg_values.GetSize();
   if (num_args != m_arg_values.GetSize()) {
     diagnostic_manager.Printf(
-        eDiagnosticSeverityError,
+        lldb::eSeverityError,
         "Wrong number of arguments - was: %" PRIu64 " should be: %" PRIu64 "",
         (uint64_t)num_args, (uint64_t)m_arg_values.GetSize());
     return false;
@@ -231,11 +229,11 @@ bool FunctionCaller::InsertFunction(ExecutionContext &exe_ctx,
   // the caller, we need to be stopped.
   Process *process = exe_ctx.GetProcessPtr();
   if (!process) {
-    diagnostic_manager.PutString(eDiagnosticSeverityError, "no process");
+    diagnostic_manager.PutString(lldb::eSeverityError, "no process");
     return false;
   }
   if (process->GetState() != lldb::eStateStopped) {
-    diagnostic_manager.PutString(eDiagnosticSeverityError, "process running");
+    diagnostic_manager.PutString(lldb::eSeverityError, "process running");
     return false;
   }
   if (CompileFunction(exe_ctx.GetThreadSP(), diagnostic_manager) != 0)
@@ -267,8 +265,7 @@ lldb::ThreadPlanSP FunctionCaller::GetThreadPlanToCallFunction(
   Thread *thread = exe_ctx.GetThreadPtr();
   if (thread == nullptr) {
     diagnostic_manager.PutString(
-        eDiagnosticSeverityError,
-        "Can't call a function without a valid thread.");
+        lldb::eSeverityError, "Can't call a function without a valid thread.");
     return nullptr;
   }
 
diff --git a/lldb/source/Expression/LLVMUserExpression.cpp b/lldb/source/Expression/LLVMUserExpression.cpp
index 1434011c80ad..b4fdfc4d1fa8 100644
--- a/lldb/source/Expression/LLVMUserExpression.cpp
+++ b/lldb/source/Expression/LLVMUserExpression.cpp
@@ -73,7 +73,7 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager,
 
   if (m_jit_start_addr == LLDB_INVALID_ADDRESS && !m_can_interpret) {
     diagnostic_manager.PutString(
-        eDiagnosticSeverityError,
+        lldb::eSeverityError,
         "Expression can't be run, because there is no JIT compiled function");
     return lldb::eExpressionSetupError;
   }
@@ -83,7 +83,7 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager,
   if (!PrepareToExecuteJITExpression(diagnostic_manager, exe_ctx,
                                      struct_address)) {
     diagnostic_manager.Printf(
-        eDiagnosticSeverityError,
+        lldb::eSeverityError,
         "errored out in %s, couldn't PrepareToExecuteJITExpression",
         __FUNCTION__);
     return lldb::eExpressionSetupError;
@@ -98,8 +98,7 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager,
 
     if (!module || !function) {
       diagnostic_manager.PutString(
-          eDiagnosticSeverityError,
-          "supposed to interpret, but nothing is there");
+          lldb::eSeverityError, "supposed to interpret, but nothing is there");
       return lldb::eExpressionSetupError;
     }
 
@@ -108,7 +107,7 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager,
     std::vector<lldb::addr_t> args;
 
     if (!AddArguments(exe_ctx, args, struct_address, diagnostic_manager)) {
-      diagnostic_manager.Printf(eDiagnosticSeverityError,
+      diagnostic_manager.Printf(lldb::eSeverityError,
                                 "errored out in %s, couldn't AddArguments",
                                 __FUNCTION__);
       return lldb::eExpressionSetupError;
@@ -122,14 +121,14 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager,
                              function_stack_top, exe_ctx, options.GetTimeout());
 
     if (!interpreter_error.Success()) {
-      diagnostic_manager.Printf(eDiagnosticSeverityError,
+      diagnostic_manager.Printf(lldb::eSeverityError,
                                 "supposed to interpret, but failed: %s",
                                 interpreter_error.AsCString());
       return lldb::eExpressionDiscarded;
     }
   } else {
     if (!exe_ctx.HasThreadScope()) {
-      diagnostic_manager.Printf(eDiagnosticSeverityError,
+      diagnostic_manager.Printf(lldb::eSeverityError,
                                 "%s called with no thread selected",
                                 __FUNCTION__);
       return lldb::eExpressionSetupError;
@@ -144,7 +143,7 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager,
     std::vector<lldb::addr_t> args;
 
     if (!AddArguments(exe_ctx, args, struct_address, diagnostic_manager)) {
-      diagnostic_manager.Printf(eDiagnosticSeverityError,
+      diagnostic_manager.Printf(lldb::eSeverityError,
                                 "errored out in %s, couldn't AddArguments",
                                 __FUNCTION__);
       return lldb::eExpressionSetupError;
@@ -156,7 +155,7 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager,
 
     StreamString ss;
     if (!call_plan_sp || !call_plan_sp->ValidatePlan(&ss)) {
-      diagnostic_manager.PutString(eDiagnosticSeverityError, ss.GetString());
+      diagnostic_manager.PutString(lldb::eSeverityError, ss.GetString());
       return lldb::eExpressionSetupError;
     }
 
@@ -194,11 +193,11 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager,
           error_desc = real_stop_info_sp->GetDescription();
       }
       if (error_desc)
-        diagnostic_manager.Printf(eDiagnosticSeverityError,
+        diagnostic_manager.Printf(lldb::eSeverityError,
                                   "Execution was interrupted, reason: %s.",
                                   error_desc);
       else
-        diagnostic_manager.PutString(eDiagnosticSeverityError,
+        diagnostic_manager.PutString(lldb::eSeverityError,
                                      "Execution was interrupted.");
 
       if ((execution_result == lldb::eExpressionInterrupted &&
@@ -221,7 +220,7 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager,
       return execution_result;
     } else if (execution_result == lldb::eExpressionStoppedForDebug) {
       diagnostic_manager.PutString(
-          eDiagnosticSeverityRemark,
+          lldb::eSeverityInfo,
           "Execution was halted at the first instruction of the expression "
           "function because \"debug\" was requested.\n"
           "Use \"thread return -x\" to return to the state before expression "
@@ -229,7 +228,7 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager,
       return execution_result;
     } else if (execution_result == lldb::eExpressionThreadVanished) {
       diagnostic_manager.Printf(
-          eDiagnosticSeverityError,
+          lldb::eSeverityError,
           "Couldn't complete execution; the thread "
           "on which the expression was being run: 0x%" PRIx64
           " exited during its execution.",
@@ -237,7 +236,7 @@ LLVMUserExpression::DoExecute(DiagnosticManager &diagnostic_manager,
       return execution_result;
     } else if (execution_result != lldb::eExpressionCompleted) {
       diagnostic_manager.Printf(
-          eDiagnosticSeverityError, "Couldn't execute function; result was %s",
+          lldb::eSeverityError, "Couldn't execute function; result was %s",
           Process::ExecutionResultAsCString(execution_result));
       return execution_result;
     }
@@ -261,7 +260,7 @@ bool LLVMUserExpression::FinalizeJITExecution(
                  "after execution --");
 
   if (!m_dematerializer_sp) {
-    diagnostic_manager.Printf(eDiagnosticSeverityError,
+    diagnostic_manager.Printf(lldb::eSeverityError,
                               "Couldn't apply expression side effects : no "
                               "dematerializer is present");
     return false;
@@ -273,7 +272,7 @@ bool LLVMUserExpression::FinalizeJITExecution(
                                      function_stack_top);
 
   if (!dematerialize_error.Success()) {
-    diagnostic_manager.Printf(eDiagnosticSeverityError,
+    diagnostic_manager.Printf(lldb::eSeverityError,
                               "Couldn't apply expression side effects : %s",
                               dematerialize_error.AsCString("unknown error"));
     return false;
@@ -299,7 +298,7 @@ bool LLVMUserExpression::PrepareToExecuteJITExpression(
 
   if (!LockAndCheckContext(exe_ctx, target, process, frame)) {
     diagnostic_manager.PutString(
-        eDiagnosticSeverityError,
+        lldb::eSeverityError,
         "The context has changed before we could JIT the expression!");
     return false;
   }
@@ -322,7 +321,7 @@ bool LLVMUserExpression::PrepareToExecuteJITExpression(
 
       if (!alloc_error.Success()) {
         diagnostic_manager.Printf(
-            eDiagnosticSeverityError,
+            lldb::eSeverityError,
             "Couldn't allocate space for materialized struct: %s",
             alloc_error.AsCString());
         return false;
@@ -354,7 +353,7 @@ bool LLVMUserExpression::PrepareToExecuteJITExpression(
 
       if (!alloc_error.Success()) {
         diagnostic_manager.Printf(
-            eDiagnosticSeverityError,
+            lldb::eSeverityError,
             "Couldn't allocate space for the stack frame: %s",
             alloc_error.AsCString());
         return false;
@@ -367,7 +366,7 @@ bool LLVMUserExpression::PrepareToExecuteJITExpression(
         frame, *m_execution_unit_sp, struct_address, materialize_error);
 
     if (!materialize_error.Success()) {
-      diagnostic_manager.Printf(eDiagnosticSeverityError,
+      diagnostic_manager.Printf(lldb::eSeverityError,
                                 "Couldn't materialize: %s",
                                 materialize_error.AsCString());
       return false;
diff --git a/lldb/source/Expression/UserExpression.cpp b/lldb/source/Expression/UserExpression.cpp
index 5658426c8891..b78f43995767 100644
--- a/lldb/source/Expression/UserExpression.cpp
+++ b/lldb/source/Expression/UserExpression.cpp
@@ -300,6 +300,8 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx,
             target->GetUserExpressionForLanguage(
                 fixed_expression->c_str(), full_prefix, language, desired_type,
                 options, ctx_obj, error));
+        if (!fixed_expression_sp)
+          break;
         DiagnosticManager fixed_diagnostic_manager;
         parse_success = fixed_expression_sp->Parse(
             fixed_diagnostic_manager, exe_ctx, execution_policy,
@@ -308,17 +310,16 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx,
           diagnostic_manager.Clear();
           user_expression_sp = fixed_expression_sp;
           break;
+        }
+        // The fixed expression also didn't parse. Let's check for any new
+        // fixits we could try.
+        if (!fixed_expression_sp->GetFixedText().empty()) {
+          *fixed_expression = fixed_expression_sp->GetFixedText().str();
         } else {
-          // The fixed expression also didn't parse. Let's check for any new
-          // Fix-Its we could try.
-          if (!fixed_expression_sp->GetFixedText().empty()) {
-            *fixed_expression = fixed_expression_sp->GetFixedText().str();
-          } else {
-            // Fixed expression didn't compile without a fixit, don't retry and
-            // don't tell the user about it.
-            fixed_expression->clear();
-            break;
-          }
+          // Fixed expression didn't compile without a fixit, don't retry and
+          // don't tell the user about it.
+          fixed_expression->clear();
+          break;
         }
       }
     }
diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp
index 565138ba1703..06ccc0e2b342 100644
--- a/lldb/source/Host/common/Host.cpp
+++ b/lldb/source/Host/common/Host.cpp
@@ -91,15 +91,37 @@ using namespace lldb_private;
 #if !defined(__APPLE__)
 #if !defined(_WIN32)
 #include <syslog.h>
-void Host::SystemLog(llvm::StringRef message) {
+void Host::SystemLog(Severity severity, llvm::StringRef message) {
   static llvm::once_flag g_openlog_once;
   llvm::call_once(g_openlog_once, [] {
     openlog("lldb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_USER);
   });
-  syslog(LOG_INFO, "%s", message.data());
+  int level = LOG_DEBUG;
+  switch (severity) {
+  case lldb::eSeverityInfo:
+    level = LOG_INFO;
+    break;
+  case lldb::eSeverityWarning:
+    level = LOG_WARNING;
+    break;
+  case lldb::eSeverityError:
+    level = LOG_ERR;
+    break;
+  }
+  syslog(level, "%s", message.data());
 }
 #else
-void Host::SystemLog(llvm::StringRef message) { llvm::errs() << message; }
+void Host::SystemLog(Severity severity, llvm::StringRef message) {
+  switch (severity) {
+  case lldb::eSeverityInfo:
+  case lldb::eSeverityWarning:
+    llvm::outs() << message;
+    break;
+  case lldb::eSeverityError:
+    llvm::errs() << message;
+    break;
+  }
+}
 #endif
 #endif
 
@@ -629,5 +651,5 @@ char SystemLogHandler::ID;
 SystemLogHandler::SystemLogHandler() {}
 
 void SystemLogHandler::Emit(llvm::StringRef message) {
-  Host::SystemLog(message);
+  Host::SystemLog(lldb::eSeverityInfo, message);
 }
diff --git a/lldb/source/Host/macosx/objcxx/Host.mm b/lldb/source/Host/macosx/objcxx/Host.mm
index 070a49208639..4fba5550ba10 100644
--- a/lldb/source/Host/macosx/objcxx/Host.mm
+++ b/lldb/source/Host/macosx/objcxx/Host.mm
@@ -102,12 +102,20 @@ using namespace lldb_private;
 static os_log_t g_os_log;
 static std::once_flag g_os_log_once;
 
-void Host::SystemLog(llvm::StringRef message) {
+void Host::SystemLog(Severity severity, llvm::StringRef message) {
   if (__builtin_available(macos 10.12, iOS 10, tvOS 10, watchOS 3, *)) {
     std::call_once(g_os_log_once, []() {
       g_os_log = os_log_create("com.apple.dt.lldb", "lldb");
     });
-    os_log(g_os_log, "%{public}s", message.str().c_str());
+    switch (severity) {
+    case lldb::eSeverityInfo:
+    case lldb::eSeverityWarning:
+      os_log(g_os_log, "%{public}s", message.str().c_str());
+      break;
+    case lldb::eSeverityError:
+      os_log_error(g_os_log, "%{public}s", message.str().c_str());
+      break;
+    }
   } else {
     llvm::errs() << message;
   }
diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp
index 9fa245fc41d4..51e4b3e6728f 100644
--- a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp
+++ b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp
@@ -506,6 +506,19 @@ DynamicLoaderPOSIXDYLD::GetStepThroughTrampolinePlan(Thread &thread,
   Target &target = thread.GetProcess()->GetTarget();
   const ModuleList &images = target.GetImages();
 
+  llvm::StringRef target_name = sym_name.GetStringRef();
+  // On AArch64, the trampoline name has a prefix (__AArch64ADRPThunk_ or
+  // __AArch64AbsLongThunk_) added to the function name. If we detect a
+  // trampoline with the prefix, we need to remove the prefix to find the
+  // function symbol.
+  if (target_name.consume_front("__AArch64ADRPThunk_") ||
+      target_name.consume_front("__AArch64AbsLongThunk_")) {
+    // An empty target name can happen for trampolines generated for
+    // section-referencing relocations.
+    if (!target_name.empty()) {
+      sym_name = ConstString(target_name);
+    }
+  }
   images.FindSymbolsWithNameAndType(sym_name, eSymbolTypeCode, target_symbols);
   if (!target_symbols.GetSize())
     return thread_plan_sp;
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangDiagnostic.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangDiagnostic.h
index 7459b715dbe2..21abd71cc34e 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangDiagnostic.h
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangDiagnostic.h
@@ -29,7 +29,7 @@ public:
     return diag->getKind() == eDiagnosticOriginClang;
   }
 
-  ClangDiagnostic(llvm::StringRef message, DiagnosticSeverity severity,
+  ClangDiagnostic(llvm::StringRef message, lldb::Severity severity,
                   uint32_t compiler_id)
       : Diagnostic(message, severity, eDiagnosticOriginClang, compiler_id) {}
 
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp
index 31f6447d66f6..f994d0250433 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp
@@ -228,8 +228,7 @@ bool ClangExpressionDeclMap::AddPersistentVariable(const NamedDecl *decl,
     std::string msg = llvm::formatv("redefinition of persistent variable '{0}'",
                                     name).str();
     m_parser_vars->m_diagnostics->AddDiagnostic(
-        msg, DiagnosticSeverity::eDiagnosticSeverityError,
-        DiagnosticOrigin::eDiagnosticOriginLLDB);
+        msg, lldb::eSeverityError, DiagnosticOrigin::eDiagnosticOriginLLDB);
     return false;
   }
 
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
index 72c7cda13ecb..1dd98567f8d6 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
@@ -207,20 +207,20 @@ public:
     m_passthrough->HandleDiagnostic(DiagLevel, Info);
     m_os->flush();
 
-    lldb_private::DiagnosticSeverity severity;
+    lldb::Severity severity;
     bool make_new_diagnostic = true;
 
     switch (DiagLevel) {
     case DiagnosticsEngine::Level::Fatal:
     case DiagnosticsEngine::Level::Error:
-      severity = eDiagnosticSeverityError;
+      severity = lldb::eSeverityError;
       break;
     case DiagnosticsEngine::Level::Warning:
-      severity = eDiagnosticSeverityWarning;
+      severity = lldb::eSeverityWarning;
       break;
     case DiagnosticsEngine::Level::Remark:
     case DiagnosticsEngine::Level::Ignored:
-      severity = eDiagnosticSeverityRemark;
+      severity = lldb::eSeverityInfo;
       break;
     case DiagnosticsEngine::Level::Note:
       m_manager->AppendMessageToDiagnostic(m_output);
@@ -238,7 +238,7 @@ public:
       if (!clang_diag || clang_diag->HasFixIts())
         break;
       // Ignore all Fix-Its that are not associated with an error.
-      if (clang_diag->GetSeverity() != eDiagnosticSeverityError)
+      if (clang_diag->GetSeverity() != lldb::eSeverityError)
         break;
       AddAllFixIts(clang_diag, Info);
       break;
@@ -256,7 +256,7 @@ public:
       // enough context in an expression for the warning to be useful.
       // FIXME: Should we try to filter out FixIts that apply to our generated
       // code, and not the user's expression?
-      if (severity == eDiagnosticSeverityError)
+      if (severity == lldb::eSeverityError)
         AddAllFixIts(new_diagnostic.get(), Info);
 
       m_manager->AddDiagnostic(std::move(new_diagnostic));
@@ -1164,7 +1164,7 @@ ClangExpressionParser::ParseInternal(DiagnosticManager &diagnostic_manager,
 
   if (m_pp_callbacks && m_pp_callbacks->hasErrors()) {
     num_errors++;
-    diagnostic_manager.PutString(eDiagnosticSeverityError,
+    diagnostic_manager.PutString(lldb::eSeverityError,
                                  "while importing modules:");
     diagnostic_manager.AppendMessageToDiagnostic(
         m_pp_callbacks->getErrorString());
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangFunctionCaller.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangFunctionCaller.cpp
index 5235cd2a1461..59321e375bdc 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangFunctionCaller.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangFunctionCaller.cpp
@@ -138,7 +138,7 @@ ClangFunctionCaller::CompileFunction(lldb::ThreadSP thread_to_use_sp,
         type_name = clang_qual_type.GetTypeName().AsCString("");
       } else {
         diagnostic_manager.Printf(
-            eDiagnosticSeverityError,
+            lldb::eSeverityError,
             "Could not determine type of input value %" PRIu64 ".",
             (uint64_t)i);
         return 1;
@@ -194,7 +194,7 @@ ClangFunctionCaller::CompileFunction(lldb::ThreadSP thread_to_use_sp,
     num_errors = clang_parser->Parse(diagnostic_manager);
     m_parser.reset(clang_parser);
   } else {
-    diagnostic_manager.PutString(eDiagnosticSeverityError,
+    diagnostic_manager.PutString(lldb::eSeverityError,
                                  "no process - unable to inject function");
     num_errors = 1;
   }
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
index 5ea7bc02a6e4..c7e98d12590d 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
@@ -331,12 +331,11 @@ bool ClangUserExpression::SetupPersistentState(DiagnosticManager &diagnostic_man
       m_result_delegate.RegisterPersistentState(persistent_state);
     } else {
       diagnostic_manager.PutString(
-          eDiagnosticSeverityError,
-          "couldn't start parsing (no persistent data)");
+          lldb::eSeverityError, "couldn't start parsing (no persistent data)");
       return false;
     }
   } else {
-    diagnostic_manager.PutString(eDiagnosticSeverityError,
+    diagnostic_manager.PutString(lldb::eSeverityError,
                                  "error: couldn't start parsing (no target)");
     return false;
   }
@@ -384,12 +383,11 @@ static void SetupDeclVendor(ExecutionContext &exe_ctx, Target *target,
     // The error stream already contains several Clang diagnostics that might
     // be either errors or warnings, so just print them all as one remark
     // diagnostic to prevent that the message starts with "error: error:".
-    diagnostic_manager.PutString(eDiagnosticSeverityRemark,
-                                 error_stream.GetString());
+    diagnostic_manager.PutString(lldb::eSeverityInfo, error_stream.GetString());
     return;
   }
 
-  diagnostic_manager.PutString(eDiagnosticSeverityError,
+  diagnostic_manager.PutString(lldb::eSeverityError,
                                "Unknown error while loading modules needed for "
                                "current compilation unit.");
 }
@@ -424,7 +422,7 @@ void ClangUserExpression::CreateSourceCode(
 
     if (!m_source_code->GetText(m_transformed_text, exe_ctx, !m_ctx_obj,
                                 for_completion, modules_to_import)) {
-      diagnostic_manager.PutString(eDiagnosticSeverityError,
+      diagnostic_manager.PutString(lldb::eSeverityError,
                                    "couldn't construct expression body");
       return;
     }
@@ -531,7 +529,7 @@ bool ClangUserExpression::PrepareForParsing(
   ScanContext(exe_ctx, err);
 
   if (!err.Success()) {
-    diagnostic_manager.PutString(eDiagnosticSeverityWarning, err.AsCString());
+    diagnostic_manager.PutString(lldb::eSeverityWarning, err.AsCString());
   }
 
   ////////////////////////////////////
@@ -564,7 +562,7 @@ bool ClangUserExpression::TryParse(
 
   if (!DeclMap()->WillParse(exe_ctx, GetMaterializer())) {
     diagnostic_manager.PutString(
-        eDiagnosticSeverityError,
+        lldb::eSeverityError,
         "current process state is unsuitable for expression parsing");
     return false;
   }
@@ -611,9 +609,9 @@ bool ClangUserExpression::TryParse(
     if (!jit_error.Success()) {
       const char *error_cstr = jit_error.AsCString();
       if (error_cstr && error_cstr[0])
-        diagnostic_manager.PutString(eDiagnosticSeverityError, error_cstr);
+        diagnostic_manager.PutString(lldb::eSeverityError, error_cstr);
       else
-        diagnostic_manager.PutString(eDiagnosticSeverityError,
+        diagnostic_manager.PutString(lldb::eSeverityError,
                                      "expression can't be interpreted or run");
       return false;
     }
@@ -663,7 +661,7 @@ bool ClangUserExpression::Parse(DiagnosticManager &diagnostic_manager,
   Target *target = exe_ctx.GetTargetPtr();
 
   if (!target) {
-    diagnostic_manager.PutString(eDiagnosticSeverityError, "invalid target");
+    diagnostic_manager.PutString(lldb::eSeverityError, "invalid target");
     return false;
   }
 
@@ -707,11 +705,9 @@ bool ClangUserExpression::Parse(DiagnosticManager &diagnostic_manager,
     if (!static_init_error.Success()) {
       const char *error_cstr = static_init_error.AsCString();
       if (error_cstr && error_cstr[0])
-        diagnostic_manager.Printf(eDiagnosticSeverityError,
-                                  "%s\n",
-                                  error_cstr);
+        diagnostic_manager.Printf(lldb::eSeverityError, "%s\n", error_cstr);
       else
-        diagnostic_manager.PutString(eDiagnosticSeverityError,
+        diagnostic_manager.PutString(lldb::eSeverityError,
                                      "couldn't run static initializers\n");
       return false;
     }
@@ -825,7 +821,7 @@ bool ClangUserExpression::Complete(ExecutionContext &exe_ctx,
 
   if (!DeclMap()->WillParse(exe_ctx, GetMaterializer())) {
     diagnostic_manager.PutString(
-        eDiagnosticSeverityError,
+        lldb::eSeverityError,
         "current process state is unsuitable for expression parsing");
 
     return false;
@@ -902,7 +898,7 @@ bool ClangUserExpression::AddArguments(ExecutionContext &exe_ctx,
 
     if (!m_in_cplusplus_method && !m_in_objectivec_method) {
       diagnostic_manager.PutString(
-          eDiagnosticSeverityError,
+          lldb::eSeverityError,
           "need object pointer but don't know the language");
       return false;
     }
@@ -944,7 +940,7 @@ bool ClangUserExpression::AddArguments(ExecutionContext &exe_ctx,
 
       if (!object_ptr_error.Success()) {
         diagnostic_manager.Printf(
-            eDiagnosticSeverityWarning,
+            lldb::eSeverityWarning,
             "couldn't get cmd pointer (substituting NULL): %s",
             object_ptr_error.AsCString());
         cmd_ptr = 0;
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUtilityFunction.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangUtilityFunction.cpp
index 56d6cf19ee4c..1f44200c4cff 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUtilityFunction.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUtilityFunction.cpp
@@ -75,8 +75,7 @@ ClangUtilityFunction::~ClangUtilityFunction() = default;
 bool ClangUtilityFunction::Install(DiagnosticManager &diagnostic_manager,
                                    ExecutionContext &exe_ctx) {
   if (m_jit_start_addr != LLDB_INVALID_ADDRESS) {
-    diagnostic_manager.PutString(eDiagnosticSeverityWarning,
-                                 "already installed");
+    diagnostic_manager.PutString(lldb::eSeverityWarning, "already installed");
     return false;
   }
 
@@ -87,21 +86,21 @@ bool ClangUtilityFunction::Install(DiagnosticManager &diagnostic_manager,
   Target *target = exe_ctx.GetTargetPtr();
 
   if (!target) {
-    diagnostic_manager.PutString(eDiagnosticSeverityError, "invalid target");
+    diagnostic_manager.PutString(lldb::eSeverityError, "invalid target");
     return false;
   }
 
   Process *process = exe_ctx.GetProcessPtr();
 
   if (!process) {
-    diagnostic_manager.PutString(eDiagnosticSeverityError, "invalid process");
+    diagnostic_manager.PutString(lldb::eSeverityError, "invalid process");
     return false;
   }
 
   // Since we might need to call allocate memory and maybe call code to make
   // the caller, we need to be stopped.
   if (process->GetState() != lldb::eStateStopped) {
-    diagnostic_manager.PutString(eDiagnosticSeverityError, "process running");
+    diagnostic_manager.PutString(lldb::eSeverityError, "process running");
     return false;
   }
   //////////////////////////
@@ -114,7 +113,7 @@ bool ClangUtilityFunction::Install(DiagnosticManager &diagnostic_manager,
 
   if (!DeclMap()->WillParse(exe_ctx, nullptr)) {
     diagnostic_manager.PutString(
-        eDiagnosticSeverityError,
+        lldb::eSeverityError,
         "current process state is unsuitable for expression parsing");
     return false;
   }
@@ -166,9 +165,9 @@ bool ClangUtilityFunction::Install(DiagnosticManager &diagnostic_manager,
   } else {
     const char *error_cstr = jit_error.AsCString();
     if (error_cstr && error_cstr[0]) {
-      diagnostic_manager.Printf(eDiagnosticSeverityError, "%s", error_cstr);
+      diagnostic_manager.Printf(lldb::eSeverityError, "%s", error_cstr);
     } else {
-      diagnostic_manager.PutString(eDiagnosticSeverityError,
+      diagnostic_manager.PutString(lldb::eSeverityError,
                                    "expression can't be interpreted or run");
     }
     return false;
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
index 16f6d2e884b5..d88f2d083019 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
@@ -2060,13 +2060,17 @@ static char FindArmAarch64MappingSymbol(const char *symbol_name) {
 #define IS_MICROMIPS(ST_OTHER) (((ST_OTHER)&STO_MIPS_ISA) == STO_MICROMIPS)
 
 // private
-unsigned ObjectFileELF::ParseSymbols(Symtab *symtab, user_id_t start_id,
-                                     SectionList *section_list,
-                                     const size_t num_symbols,
-                                     const DataExtractor &symtab_data,
-                                     const DataExtractor &strtab_data) {
+std::pair<unsigned, ObjectFileELF::FileAddressToAddressClassMap>
+ObjectFileELF::ParseSymbols(Symtab *symtab, user_id_t start_id,
+                            SectionList *section_list, const size_t num_symbols,
+                            const DataExtractor &symtab_data,
+                            const DataExtractor &strtab_data) {
   ELFSymbol symbol;
   lldb::offset_t offset = 0;
+  // The changes these symbols would make to the class map. We will also update
+  // m_address_class_map but need to tell the caller what changed because the
+  // caller may be another object file.
+  FileAddressToAddressClassMap address_class_map;
 
   static ConstString text_section_name(".text");
   static ConstString init_section_name(".init");
@@ -2213,18 +2217,18 @@ unsigned ObjectFileELF::ParseSymbols(Symtab *symtab, user_id_t start_id,
             switch (mapping_symbol) {
             case 'a':
               // $a[.<any>]* - marks an ARM instruction sequence
-              m_address_class_map[symbol.st_value] = AddressClass::eCode;
+              address_class_map[symbol.st_value] = AddressClass::eCode;
               break;
             case 'b':
             case 't':
               // $b[.<any>]* - marks a THUMB BL instruction sequence
               // $t[.<any>]* - marks a THUMB instruction sequence
-              m_address_class_map[symbol.st_value] =
+              address_class_map[symbol.st_value] =
                   AddressClass::eCodeAlternateISA;
               break;
             case 'd':
               // $d[.<any>]* - marks a data item sequence (e.g. lit pool)
-              m_address_class_map[symbol.st_value] = AddressClass::eData;
+              address_class_map[symbol.st_value] = AddressClass::eData;
               break;
             }
           }
@@ -2238,11 +2242,11 @@ unsigned ObjectFileELF::ParseSymbols(Symtab *symtab, user_id_t start_id,
             switch (mapping_symbol) {
             case 'x':
               // $x[.<any>]* - marks an A64 instruction sequence
-              m_address_class_map[symbol.st_value] = AddressClass::eCode;
+              address_class_map[symbol.st_value] = AddressClass::eCode;
               break;
             case 'd':
               // $d[.<any>]* - marks a data item sequence (e.g. lit pool)
-              m_address_class_map[symbol.st_value] = AddressClass::eData;
+              address_class_map[symbol.st_value] = AddressClass::eData;
               break;
             }
           }
@@ -2260,11 +2264,11 @@ unsigned ObjectFileELF::ParseSymbols(Symtab *symtab, user_id_t start_id,
             // conjunction with symbol.st_value to produce the final
             // symbol_value that we store in the symtab.
             symbol_value_offset = -1;
-            m_address_class_map[symbol.st_value ^ 1] =
+            address_class_map[symbol.st_value ^ 1] =
                 AddressClass::eCodeAlternateISA;
           } else {
             // This address is ARM
-            m_address_class_map[symbol.st_value] = AddressClass::eCode;
+            address_class_map[symbol.st_value] = AddressClass::eCode;
           }
         }
       }
@@ -2285,17 +2289,17 @@ unsigned ObjectFileELF::ParseSymbols(Symtab *symtab, user_id_t start_id,
       */
       if (arch.IsMIPS()) {
         if (IS_MICROMIPS(symbol.st_other))
-          m_address_class_map[symbol.st_value] = AddressClass::eCodeAlternateISA;
+          address_class_map[symbol.st_value] = AddressClass::eCodeAlternateISA;
         else if ((symbol.st_value & 1) && (symbol_type == eSymbolTypeCode)) {
           symbol.st_value = symbol.st_value & (~1ull);
-          m_address_class_map[symbol.st_value] = AddressClass::eCodeAlternateISA;
+          address_class_map[symbol.st_value] = AddressClass::eCodeAlternateISA;
         } else {
           if (symbol_type == eSymbolTypeCode)
-            m_address_class_map[symbol.st_value] = AddressClass::eCode;
+            address_class_map[symbol.st_value] = AddressClass::eCode;
           else if (symbol_type == eSymbolTypeData)
-            m_address_class_map[symbol.st_value] = AddressClass::eData;
+            address_class_map[symbol.st_value] = AddressClass::eData;
           else
-            m_address_class_map[symbol.st_value] = AddressClass::eUnknown;
+            address_class_map[symbol.st_value] = AddressClass::eUnknown;
         }
       }
     }
@@ -2356,13 +2360,30 @@ unsigned ObjectFileELF::ParseSymbols(Symtab *symtab, user_id_t start_id,
     bool symbol_size_valid =
         symbol.st_size != 0 || symbol.getType() != STT_FUNC;
 
+    bool is_trampoline = false;
+    if (arch.IsValid() && (arch.GetMachine() == llvm::Triple::aarch64)) {
+      // On AArch64, trampolines are registered as code.
+      // If we detect a trampoline (which starts with __AArch64ADRPThunk_ or
+      // __AArch64AbsLongThunk_) we register the symbol as a trampoline. This
+      // way we will be able to detect the trampoline when we step in a function
+      // and step through the trampoline.
+      if (symbol_type == eSymbolTypeCode) {
+        llvm::StringRef trampoline_name = mangled.GetName().GetStringRef();
+        if (trampoline_name.starts_with("__AArch64ADRPThunk_") ||
+            trampoline_name.starts_with("__AArch64AbsLongThunk_")) {
+          symbol_type = eSymbolTypeTrampoline;
+          is_trampoline = true;
+        }
+      }
+    }
+
     Symbol dc_symbol(
         i + start_id, // ID is the original symbol table index.
         mangled,
         symbol_type,                    // Type of this symbol
         is_global,                      // Is this globally visible?
         false,                          // Is this symbol debug info?
-        false,                          // Is this symbol a trampoline?
+        is_trampoline,                  // Is this symbol a trampoline?
         false,                          // Is this symbol artificial?
         AddressRange(symbol_section_sp, // Section in which this symbol is
                                         // defined or null.
@@ -2375,24 +2396,33 @@ unsigned ObjectFileELF::ParseSymbols(Symtab *symtab, user_id_t start_id,
       dc_symbol.SetIsWeak(true);
     symtab->AddSymbol(dc_symbol);
   }
-  return i;
+
+  m_address_class_map.merge(address_class_map);
+  return {i, address_class_map};
 }
 
-unsigned ObjectFileELF::ParseSymbolTable(Symtab *symbol_table,
-                                         user_id_t start_id,
-                                         lldb_private::Section *symtab) {
+std::pair<unsigned, ObjectFileELF::FileAddressToAddressClassMap>
+ObjectFileELF::ParseSymbolTable(Symtab *symbol_table, user_id_t start_id,
+                                lldb_private::Section *symtab) {
   if (symtab->GetObjectFile() != this) {
     // If the symbol table section is owned by a different object file, have it
     // do the parsing.
     ObjectFileELF *obj_file_elf =
         static_cast<ObjectFileELF *>(symtab->GetObjectFile());
-    return obj_file_elf->ParseSymbolTable(symbol_table, start_id, symtab);
+    auto [num_symbols, address_class_map] =
+        obj_file_elf->ParseSymbolTable(symbol_table, start_id, symtab);
+
+    // The other object file returned the changes it made to its address
+    // class map, make the same changes to ours.
+    m_address_class_map.merge(address_class_map);
+
+    return {num_symbols, address_class_map};
   }
 
   // Get section list for this object file.
   SectionList *section_list = m_sections_up.get();
   if (!section_list)
-    return 0;
+    return {};
 
   user_id_t symtab_id = symtab->GetID();
   const ELFSectionHeaderInfo *symtab_hdr = GetSectionHeaderByIndex(symtab_id);
@@ -2418,7 +2448,7 @@ unsigned ObjectFileELF::ParseSymbolTable(Symtab *symbol_table,
     }
   }
 
-  return 0;
+  return {0, {}};
 }
 
 size_t ObjectFileELF::ParseDynamicSymbols() {
@@ -2955,8 +2985,12 @@ void ObjectFileELF::ParseSymtab(Symtab &lldb_symtab) {
   // while the reverse is not necessarily true.
   Section *symtab =
       section_list->FindSectionByType(eSectionTypeELFSymbolTable, true).get();
-  if (symtab)
-    symbol_id += ParseSymbolTable(&lldb_symtab, symbol_id, symtab);
+  if (symtab) {
+    auto [num_symbols, address_class_map] =
+        ParseSymbolTable(&lldb_symtab, symbol_id, symtab);
+    m_address_class_map.merge(address_class_map);
+    symbol_id += num_symbols;
+  }
 
   // The symtab section is non-allocable and can be stripped, while the
   // .dynsym section which should always be always be there. To support the
@@ -2969,8 +3003,12 @@ void ObjectFileELF::ParseSymtab(Symtab &lldb_symtab) {
     Section *dynsym =
         section_list->FindSectionByType(eSectionTypeELFDynamicSymbols, true)
             .get();
-    if (dynsym)
-      symbol_id += ParseSymbolTable(&lldb_symtab, symbol_id, dynsym);
+    if (dynsym) {
+      auto [num_symbols, address_class_map] =
+          ParseSymbolTable(&lldb_symtab, symbol_id, dynsym);
+      symbol_id += num_symbols;
+      m_address_class_map.merge(address_class_map);
+    }
   }
 
   // DT_JMPREL
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h
index bc8e34981a9d..844e981b1d89 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h
@@ -187,6 +187,9 @@ private:
   typedef DynamicSymbolColl::iterator DynamicSymbolCollIter;
   typedef DynamicSymbolColl::const_iterator DynamicSymbolCollConstIter;
 
+  /// An ordered map of file address to address class. Used on architectures
+  /// like Arm where there is an alternative ISA mode like Thumb. The container
+  /// is ordered so that it can be binary searched.
   typedef std::map<lldb::addr_t, lldb_private::AddressClass>
       FileAddressToAddressClassMap;
 
@@ -285,18 +288,19 @@ private:
 
   /// Populates the symbol table with all non-dynamic linker symbols.  This
   /// method will parse the symbols only once.  Returns the number of symbols
-  /// parsed.
-  unsigned ParseSymbolTable(lldb_private::Symtab *symbol_table,
-                            lldb::user_id_t start_id,
-                            lldb_private::Section *symtab);
+  /// parsed and a map of address types (used by targets like Arm that have
+  /// an alternative ISA mode like Thumb).
+  std::pair<unsigned, FileAddressToAddressClassMap>
+  ParseSymbolTable(lldb_private::Symtab *symbol_table, lldb::user_id_t start_id,
+                   lldb_private::Section *symtab);
 
   /// Helper routine for ParseSymbolTable().
-  unsigned ParseSymbols(lldb_private::Symtab *symbol_table,
-                        lldb::user_id_t start_id,
-                        lldb_private::SectionList *section_list,
-                        const size_t num_symbols,
-                        const lldb_private::DataExtractor &symtab_data,
-                        const lldb_private::DataExtractor &strtab_data);
+  std::pair<unsigned, FileAddressToAddressClassMap>
+  ParseSymbols(lldb_private::Symtab *symbol_table, lldb::user_id_t start_id,
+               lldb_private::SectionList *section_list,
+               const size_t num_symbols,
+               const lldb_private::DataExtractor &symtab_data,
+               const lldb_private::DataExtractor &strtab_data);
 
   /// Scans the relocation entries and adds a set of artificial symbols to the
   /// given symbol table for each PLT slot.  Returns the number of symbols
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index 1caf93659956..4dd23bb1e4db 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -5140,8 +5140,17 @@ uint32_t ObjectFileMachO::GetDependentModules(FileSpecList &files) {
       case LC_LOADFVMLIB:
       case LC_LOAD_UPWARD_DYLIB: {
         uint32_t name_offset = cmd_offset + m_data.GetU32(&offset);
+        bool is_delayed_init = false;
+        uint32_t use_command_marker = m_data.GetU32(&offset);
+        if (use_command_marker == 0x1a741800 /* DYLIB_USE_MARKER */) {
+          offset += 4; /* uint32_t current_version */
+          offset += 4; /* uint32_t compat_version */
+          uint32_t flags = m_data.GetU32(&offset);
+          if (flags & 0x08 /* DYLIB_USE_DELAYED_INIT */)
+            is_delayed_init = true;
+        }
         const char *path = m_data.PeekCStr(name_offset);
-        if (path) {
+        if (path && !is_delayed_init) {
           if (load_cmd.cmd == LC_RPATH)
             rpath_paths.push_back(path);
           else {
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp
index 52777909a1f8..82156aca8cf1 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp
@@ -405,17 +405,21 @@ lldb_private::Status PlatformDarwinDevice::GetSharedModuleWithLocalCache(
           // when going over the *slow* GDB remote transfer mechanism we first
           // check the hashes of the files - and only do the actual transfer if
           // they differ
-          uint64_t high_local, high_remote, low_local, low_remote;
           auto MD5 = llvm::sys::fs::md5_contents(module_cache_spec.GetPath());
           if (!MD5)
             return Status(MD5.getError());
-          std::tie(high_local, low_local) = MD5->words();
 
-          m_remote_platform_sp->CalculateMD5(module_spec.GetFileSpec(),
-                                             low_remote, high_remote);
-          if (low_local != low_remote || high_local != high_remote) {
+          Log *log = GetLog(LLDBLog::Platform);
+          bool requires_transfer = true;
+          llvm::ErrorOr<llvm::MD5::MD5Result> remote_md5 =
+              m_remote_platform_sp->CalculateMD5(module_spec.GetFileSpec());
+          if (std::error_code ec = remote_md5.getError())
+            LLDB_LOG(log, "couldn't get md5 sum from remote: {0}",
+                     ec.message());
+          else
+            requires_transfer = *MD5 != *remote_md5;
+          if (requires_transfer) {
             // bring in the remote file
-            Log *log = GetLog(LLDBLog::Platform);
             LLDB_LOGF(log,
                       "[%s] module %s/%s needs to be replaced from remote copy",
                       (IsHost() ? "host" : "remote"),
diff --git a/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp b/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp
index 0dce5add2e37..4684947ede20 100644
--- a/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp
+++ b/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp
@@ -684,12 +684,12 @@ Status PlatformRemoteGDBServer::RunShellCommand(
                                           signo_ptr, command_output, timeout);
 }
 
-bool PlatformRemoteGDBServer::CalculateMD5(const FileSpec &file_spec,
-                                           uint64_t &low, uint64_t &high) {
+llvm::ErrorOr<llvm::MD5::MD5Result>
+PlatformRemoteGDBServer::CalculateMD5(const FileSpec &file_spec) {
   if (!IsConnected())
-    return false;
+    return std::make_error_code(std::errc::not_connected);
 
-  return m_gdb_client_up->CalculateMD5(file_spec, low, high);
+  return m_gdb_client_up->CalculateMD5(file_spec);
 }
 
 void PlatformRemoteGDBServer::CalculateTrapHandlerSymbolNames() {
diff --git a/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.h b/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.h
index d83fc386f594..0ae1f3cb4199 100644
--- a/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.h
+++ b/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.h
@@ -146,8 +146,8 @@ public:
 
   void CalculateTrapHandlerSymbolNames() override;
 
-  bool CalculateMD5(const FileSpec &file_spec, uint64_t &low,
-                    uint64_t &high) override;
+  llvm::ErrorOr<llvm::MD5::MD5Result>
+  CalculateMD5(const FileSpec &file_spec) override;
 
   const lldb::UnixSignalsSP &GetRemoteUnixSignals() override;
 
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
index 7498a070c260..db9fb37a9a3c 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
@@ -3418,8 +3418,8 @@ bool GDBRemoteCommunicationClient::GetFileExists(
   return true;
 }
 
-bool GDBRemoteCommunicationClient::CalculateMD5(
-    const lldb_private::FileSpec &file_spec, uint64_t &low, uint64_t &high) {
+llvm::ErrorOr<llvm::MD5::MD5Result> GDBRemoteCommunicationClient::CalculateMD5(
+    const lldb_private::FileSpec &file_spec) {
   std::string path(file_spec.GetPath(false));
   lldb_private::StreamString stream;
   stream.PutCString("vFile:MD5:");
@@ -3428,11 +3428,11 @@ bool GDBRemoteCommunicationClient::CalculateMD5(
   if (SendPacketAndWaitForResponse(stream.GetString(), response) ==
       PacketResult::Success) {
     if (response.GetChar() != 'F')
-      return false;
+      return std::make_error_code(std::errc::illegal_byte_sequence);
     if (response.GetChar() != ',')
-      return false;
+      return std::make_error_code(std::errc::illegal_byte_sequence);
     if (response.Peek() && *response.Peek() == 'x')
-      return false;
+      return std::make_error_code(std::errc::no_such_file_or_directory);
 
     // GDBRemoteCommunicationServerCommon::Handle_vFile_MD5 concatenates low and
     // high hex strings. We can't use response.GetHexMaxU64 because that can't
@@ -3455,25 +3455,33 @@ bool GDBRemoteCommunicationClient::CalculateMD5(
     auto part =
         response.GetStringRef().substr(response.GetFilePos(), MD5_HALF_LENGTH);
     if (part.size() != MD5_HALF_LENGTH)
-      return false;
+      return std::make_error_code(std::errc::illegal_byte_sequence);
     response.SetFilePos(response.GetFilePos() + part.size());
 
+    uint64_t low;
     if (part.getAsInteger(/*radix=*/16, low))
-      return false;
+      return std::make_error_code(std::errc::illegal_byte_sequence);
 
     // Get high part
     part =
         response.GetStringRef().substr(response.GetFilePos(), MD5_HALF_LENGTH);
     if (part.size() != MD5_HALF_LENGTH)
-      return false;
+      return std::make_error_code(std::errc::illegal_byte_sequence);
     response.SetFilePos(response.GetFilePos() + part.size());
 
+    uint64_t high;
     if (part.getAsInteger(/*radix=*/16, high))
-      return false;
+      return std::make_error_code(std::errc::illegal_byte_sequence);
 
-    return true;
+    llvm::MD5::MD5Result result;
+    llvm::support::endian::write<uint64_t, llvm::endianness::little>(
+        result.data(), low);
+    llvm::support::endian::write<uint64_t, llvm::endianness::little>(
+        result.data() + 8, high);
+
+    return result;
   }
-  return false;
+  return std::make_error_code(std::errc::operation_canceled);
 }
 
 bool GDBRemoteCommunicationClient::AvoidGPackets(ProcessGDBRemote *process) {
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h
index 4be7eb00f42b..898d176abc34 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h
@@ -392,7 +392,7 @@ public:
           *command_output, // Pass nullptr if you don't want the command output
       const Timeout<std::micro> &timeout);
 
-  bool CalculateMD5(const FileSpec &file_spec, uint64_t &low, uint64_t &high);
+  llvm::ErrorOr<llvm::MD5::MD5Result> CalculateMD5(const FileSpec &file_spec);
 
   lldb::DataBufferSP ReadRegister(
       lldb::tid_t tid,
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h
index 66db396279e0..e144cf0f9bd9 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h
@@ -60,6 +60,8 @@ public:
 
   virtual ConstString GetDIEClassTemplateParams(const DWARFDIE &die) = 0;
 
+  virtual lldb_private::Type *FindDefinitionTypeForDIE(const DWARFDIE &die) = 0;
+
   static std::optional<SymbolFile::ArrayInfo>
   ParseChildArrayInfo(const DWARFDIE &parent_die,
                       const ExecutionContext *exe_ctx = nullptr);
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index bea11e0e3840..2a46be921612 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -154,6 +154,26 @@ static bool TagIsRecordType(dw_tag_t tag) {
   }
 }
 
+static bool IsForwardDeclaration(const DWARFDIE &die,
+                                 const ParsedDWARFTypeAttributes &attrs,
+                                 LanguageType cu_language) {
+  if (attrs.is_forward_declaration)
+    return true;
+
+  // Work around an issue with clang at the moment where forward
+  // declarations for objective C classes are emitted as:
+  //  DW_TAG_structure_type [2]
+  //  DW_AT_name( "ForwardObjcClass" )
+  //  DW_AT_byte_size( 0x00 )
+  //  DW_AT_decl_file( "..." )
+  //  DW_AT_decl_line( 1 )
+  //
+  // Note that there is no DW_AT_declaration and there are no children,
+  // and the byte size is zero.
+  return attrs.byte_size && *attrs.byte_size == 0 && attrs.name &&
+         !die.HasChildren() && cu_language == eLanguageTypeObjC;
+}
+
 TypeSP DWARFASTParserClang::ParseTypeFromClangModule(const SymbolContext &sc,
                                                      const DWARFDIE &die,
                                                      Log *log) {
@@ -249,11 +269,9 @@ static void ForcefullyCompleteType(CompilerType type) {
 /// This function serves a similar purpose as RequireCompleteType above, but it
 /// avoids completing the type if it is not immediately necessary. It only
 /// ensures we _can_ complete the type later.
-static void PrepareContextToReceiveMembers(TypeSystemClang &ast,
-                                           ClangASTImporter &ast_importer,
-                                           clang::DeclContext *decl_ctx,
-                                           DWARFDIE die,
-                                           const char *type_name_cstr) {
+void DWARFASTParserClang::PrepareContextToReceiveMembers(
+    clang::DeclContext *decl_ctx, const DWARFDIE &decl_ctx_die,
+    const DWARFDIE &die, const char *type_name_cstr) {
   auto *tag_decl_ctx = clang::dyn_cast<clang::TagDecl>(decl_ctx);
   if (!tag_decl_ctx)
     return; // Non-tag context are always ready.
@@ -268,7 +286,8 @@ static void PrepareContextToReceiveMembers(TypeSystemClang &ast,
   // gmodules case), we can complete the type by doing a full import.
 
   // If this type was not imported from an external AST, there's nothing to do.
-  CompilerType type = ast.GetTypeForDecl(tag_decl_ctx);
+  CompilerType type = m_ast.GetTypeForDecl(tag_decl_ctx);
+  ClangASTImporter &ast_importer = GetClangASTImporter();
   if (type && ast_importer.CanImport(type)) {
     auto qual_type = ClangUtil::GetQualType(type);
     if (ast_importer.RequireCompleteType(qual_type))
@@ -279,6 +298,13 @@ static void PrepareContextToReceiveMembers(TypeSystemClang &ast,
         type_name_cstr ? type_name_cstr : "", die.GetOffset());
   }
 
+  // By searching for the definition DIE of the decl_ctx type, we will either:
+  // 1. Found the the definition DIE and start its definition with
+  // TypeSystemClang::StartTagDeclarationDefinition.
+  // 2. Unable to find it, then need to forcefully complete it.
+  FindDefinitionTypeForDIE(decl_ctx_die);
+  if (tag_decl_ctx->isCompleteDefinition() || tag_decl_ctx->isBeingDefined())
+    return;
   // We don't have a type definition and/or the import failed. We must
   // forcefully complete the type to avoid crashes.
   ForcefullyCompleteType(type);
@@ -452,9 +478,9 @@ TypeSP DWARFASTParserClang::ParseTypeFromDWARF(const SymbolContext &sc,
         log,
         "DWARFASTParserClang::ParseTypeFromDWARF "
         "(die = {0:x16}, decl_ctx = {1:p} (die "
-        "{2:x16})) {3} name = '{4}')",
+        "{2:x16})) {3} ({4}) name = '{5}')",
         die.GetOffset(), static_cast<void *>(context), context_die.GetOffset(),
-        die.GetTagAsCString(), die.GetName());
+        DW_TAG_value_to_name(die.Tag()), die.Tag(), die.GetName());
   }
 
   Type *type_ptr = dwarf->GetDIEToType().lookup(die.GetDIE());
@@ -620,10 +646,11 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc,
   if (tag == DW_TAG_typedef) {
     // DeclContext will be populated when the clang type is materialized in
     // Type::ResolveCompilerType.
-    PrepareContextToReceiveMembers(
-        m_ast, GetClangASTImporter(),
-        GetClangDeclContextContainingDIE(die, nullptr), die,
-        attrs.name.GetCString());
+    DWARFDIE decl_ctx_die;
+    clang::DeclContext *decl_ctx =
+        GetClangDeclContextContainingDIE(die, &decl_ctx_die);
+    PrepareContextToReceiveMembers(decl_ctx, decl_ctx_die, die,
+                                   attrs.name.GetCString());
 
     if (attrs.type.IsValid()) {
       // Try to parse a typedef from the (DWARF embedded in the) Clang
@@ -765,9 +792,10 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc,
           if (log)
             dwarf->GetObjectFile()->GetModule()->LogMessage(
                 log,
-                "SymbolFileDWARF::ParseType (die = {0:x16}) {1} '{2}' "
+                "SymbolFileDWARF::ParseType (die = {0:x16}) {1} ({2}) '{3}' "
                 "is Objective-C 'id' built-in type.",
-                die.GetOffset(), die.GetTagAsCString(), die.GetName());
+                die.GetOffset(), DW_TAG_value_to_name(die.Tag()), die.Tag(),
+                die.GetName());
           clang_type = m_ast.GetBasicType(eBasicTypeObjCID);
           encoding_data_type = Type::eEncodingIsUID;
           attrs.type.Clear();
@@ -776,9 +804,10 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc,
           if (log)
             dwarf->GetObjectFile()->GetModule()->LogMessage(
                 log,
-                "SymbolFileDWARF::ParseType (die = {0:x16}) {1} '{2}' "
+                "SymbolFileDWARF::ParseType (die = {0:x16}) {1} ({2}) '{3}' "
                 "is Objective-C 'Class' built-in type.",
-                die.GetOffset(), die.GetTagAsCString(), die.GetName());
+                die.GetOffset(), DW_TAG_value_to_name(die.Tag()), die.Tag(),
+                die.GetName());
           clang_type = m_ast.GetBasicType(eBasicTypeObjCClass);
           encoding_data_type = Type::eEncodingIsUID;
           attrs.type.Clear();
@@ -787,9 +816,10 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc,
           if (log)
             dwarf->GetObjectFile()->GetModule()->LogMessage(
                 log,
-                "SymbolFileDWARF::ParseType (die = {0:x16}) {1} '{2}' "
+                "SymbolFileDWARF::ParseType (die = {0:x16}) {1} ({2}) '{3}' "
                 "is Objective-C 'selector' built-in type.",
-                die.GetOffset(), die.GetTagAsCString(), die.GetName());
+                die.GetOffset(), DW_TAG_value_to_name(die.Tag()), die.Tag(),
+                die.GetName());
           clang_type = m_ast.GetBasicType(eBasicTypeObjCSel);
           encoding_data_type = Type::eEncodingIsUID;
           attrs.type.Clear();
@@ -808,10 +838,10 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc,
             if (log)
               dwarf->GetObjectFile()->GetModule()->LogMessage(
                   log,
-                  "SymbolFileDWARF::ParseType (die = {0:x16}) {1} "
-                  "'{2}' is 'objc_object*', which we overrode to "
-                  "'id'.",
-                  die.GetOffset(), die.GetTagAsCString(), die.GetName());
+                  "SymbolFileDWARF::ParseType (die = {0:x16}) {1} ({2}) '{3}' "
+                  "is 'objc_object*', which we overrode to 'id'.",
+                  die.GetOffset(), DW_TAG_value_to_name(die.Tag()), die.Tag(),
+                  die.GetName());
             clang_type = m_ast.GetBasicType(eBasicTypeObjCID);
             encoding_data_type = Type::eEncodingIsUID;
             attrs.type.Clear();
@@ -870,10 +900,10 @@ TypeSP DWARFASTParserClang::ParseEnum(const SymbolContext &sc,
       if (log) {
         dwarf->GetObjectFile()->GetModule()->LogMessage(
             log,
-            "SymbolFileDWARF({0:p}) - {1:x16}}: {2} type \"{3}\" is a "
-            "forward declaration, complete type is {4:x8}",
+            "SymbolFileDWARF({0:p}) - {1:x16}}: {2} ({3}) type \"{4}\" is a "
+            "forward declaration, complete type is {5:x8}",
             static_cast<void *>(this), die.GetOffset(),
-            DW_TAG_value_to_name(tag), attrs.name.GetCString(),
+            DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(),
             type_sp->GetID());
       }
 
@@ -1100,32 +1130,6 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die,
         // struct and see if this is actually a C++ method
         Type *class_type = dwarf->ResolveType(decl_ctx_die);
         if (class_type) {
-          if (class_type->GetID() != decl_ctx_die.GetID() ||
-              IsClangModuleFwdDecl(decl_ctx_die)) {
-
-            // We uniqued the parent class of this function to another
-            // class so we now need to associate all dies under
-            // "decl_ctx_die" to DIEs in the DIE for "class_type"...
-            DWARFDIE class_type_die = dwarf->GetDIE(class_type->GetID());
-
-            if (class_type_die) {
-              std::vector<DWARFDIE> failures;
-
-              CopyUniqueClassMethodTypes(decl_ctx_die, class_type_die,
-                                         class_type, failures);
-
-              // FIXME do something with these failures that's
-              // smarter than just dropping them on the ground.
-              // Unfortunately classes don't like having stuff added
-              // to them after their definitions are complete...
-
-              Type *type_ptr = dwarf->GetDIEToType()[die.GetDIE()];
-              if (type_ptr && type_ptr != DIE_IS_BEING_PARSED) {
-                return type_ptr->shared_from_this();
-              }
-            }
-          }
-
           if (attrs.specification.IsValid()) {
             // We have a specification which we are going to base our
             // function prototype off of, so we need this type to be
@@ -1260,6 +1264,39 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die,
               }
             }
           }
+          // By here, we should have already completed the c++ class_type
+          // because if either specification or abstract_origin is present, we
+          // call GetClangDeclContextForDIE to resolve the DW_TAG_subprogram
+          // refered by this one until we reached the DW_TAG_subprogram without
+          // specification or abstract_origin (the else branch above). Then the
+          // above GetFullCompilerType() will complete the class_type if it's
+          // not completed yet. After that, we will have the mapping from DIEs
+          // in class_type_die to DeclContexts in m_die_to_decl_ctx.
+          if (class_type->GetID() != decl_ctx_die.GetID() ||
+              IsClangModuleFwdDecl(decl_ctx_die)) {
+
+            // We uniqued the parent class of this function to another
+            // class so we now need to associate all dies under
+            // "decl_ctx_die" to DIEs in the DIE for "class_type"...
+            DWARFDIE class_type_die = dwarf->GetDIE(class_type->GetID());
+
+            if (class_type_die) {
+              std::vector<DWARFDIE> failures;
+
+              CopyUniqueClassMethodTypes(decl_ctx_die, class_type_die,
+                                         class_type, failures);
+
+              // FIXME do something with these failures that's
+              // smarter than just dropping them on the ground.
+              // Unfortunately classes don't like having stuff added
+              // to them after their definitions are complete...
+
+              Type *type_ptr = dwarf->GetDIEToType()[die.GetDIE()];
+              if (type_ptr && type_ptr != DIE_IS_BEING_PARSED) {
+                return type_ptr->shared_from_this();
+              }
+            }
+          }
         }
       }
     }
@@ -1632,6 +1669,93 @@ DWARFASTParserClang::GetCPlusPlusQualifiedName(const DWARFDIE &die) {
   return qualified_name;
 }
 
+lldb_private::Type *
+DWARFASTParserClang::FindDefinitionTypeForDIE(const DWARFDIE &die) {
+  SymbolFileDWARF *dwarf = die.GetDWARF();
+  ParsedDWARFTypeAttributes attrs(die);
+  bool is_forward_declaration = IsForwardDeclaration(
+      die, attrs, SymbolFileDWARF::GetLanguage(*die.GetCU()));
+  if (!is_forward_declaration)
+    return dwarf->GetDIEToType()[die.GetDIE()];
+
+  const dw_tag_t tag = die.Tag();
+  TypeSP type_sp;
+  Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups);
+  if (log) {
+    dwarf->GetObjectFile()->GetModule()->LogMessage(
+        log,
+        "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a "
+        "forward declaration DIE, trying to find definition DIE",
+        static_cast<void *>(this), die.GetOffset(), DW_TAG_value_to_name(tag),
+        attrs.name.GetCString());
+  }
+  // We haven't parse definition die for this type, starting to search for it.
+  // After we found the definition die, the GetDeclarationDIEToDefinitionDIE()
+  // map will have the new mapping from this declaration die to definition die.
+  if (attrs.class_language == eLanguageTypeObjC ||
+      attrs.class_language == eLanguageTypeObjC_plus_plus) {
+    if (!attrs.is_complete_objc_class &&
+        die.Supports_DW_AT_APPLE_objc_complete_type()) {
+      // We have a valid eSymbolTypeObjCClass class symbol whose name
+      // matches the current objective C class that we are trying to find
+      // and this DIE isn't the complete definition (we checked
+      // is_complete_objc_class above and know it is false), so the real
+      // definition is in here somewhere
+      type_sp =
+          dwarf->FindCompleteObjCDefinitionTypeForDIE(die, attrs.name, true);
+
+      if (!type_sp) {
+        SymbolFileDWARFDebugMap *debug_map_symfile =
+            dwarf->GetDebugMapSymfile();
+        if (debug_map_symfile) {
+          // We weren't able to find a full declaration in this DWARF,
+          // see if we have a declaration anywhere else...
+          type_sp = debug_map_symfile->FindCompleteObjCDefinitionTypeForDIE(
+              die, attrs.name, true);
+        }
+      }
+
+      if (type_sp && log) {
+        dwarf->GetObjectFile()->GetModule()->LogMessage(
+            log,
+            "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is an "
+            "incomplete objc type, complete type is {5:x8}",
+            static_cast<void *>(this), die.GetOffset(),
+            DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(),
+            type_sp->GetID());
+      }
+    }
+  }
+
+  type_sp = dwarf->FindDefinitionTypeForDWARFDeclContext(die);
+  if (!type_sp) {
+    SymbolFileDWARFDebugMap *debug_map_symfile = dwarf->GetDebugMapSymfile();
+    if (debug_map_symfile) {
+      // We weren't able to find a full declaration in this DWARF, see
+      // if we have a declaration anywhere else...
+      type_sp = debug_map_symfile->FindDefinitionTypeForDWARFDeclContext(die);
+    }
+    if (type_sp && log) {
+      dwarf->GetObjectFile()->GetModule()->LogMessage(
+          log,
+          "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a "
+          "forward declaration, complete type is {4:x8}",
+          static_cast<void *>(this), die.GetOffset(), DW_TAG_value_to_name(tag),
+          attrs.name.GetCString(), type_sp->GetID());
+    }
+  }
+
+  if (!type_sp && log) {
+    dwarf->GetObjectFile()->GetModule()->LogMessage(
+        log,
+        "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a "
+        "forward declaration, unable to find definition DIE for it",
+        static_cast<void *>(this), die.GetOffset(), DW_TAG_value_to_name(tag),
+        attrs.name.GetCString());
+  }
+  return type_sp.get();
+}
+
 TypeSP
 DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
                                            const DWARFDIE &die,
@@ -1643,14 +1767,10 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
   LanguageType cu_language = SymbolFileDWARF::GetLanguage(*die.GetCU());
   Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups);
 
-  // UniqueDWARFASTType is large, so don't create a local variables on the
-  // stack, put it on the heap. This function is often called recursively and
-  // clang isn't good at sharing the stack space for variables in different
-  // blocks.
-  auto unique_ast_entry_up = std::make_unique<UniqueDWARFASTType>();
-
   ConstString unique_typename(attrs.name);
   Declaration unique_decl(attrs.decl);
+  uint64_t byte_size = attrs.byte_size.value_or(0);
+  attrs.is_forward_declaration = IsForwardDeclaration(die, attrs, cu_language);
 
   if (attrs.name) {
     if (Language::LanguageIsCPlusPlus(cu_language)) {
@@ -1663,14 +1783,42 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
       unique_decl.Clear();
     }
 
-    if (dwarf->GetUniqueDWARFASTTypeMap().Find(
-            unique_typename, die, unique_decl, attrs.byte_size.value_or(-1),
-            *unique_ast_entry_up)) {
-      type_sp = unique_ast_entry_up->m_type_sp;
+    if (UniqueDWARFASTType *unique_ast_entry_type =
+            dwarf->GetUniqueDWARFASTTypeMap().Find(
+                unique_typename, die, unique_decl, byte_size,
+                attrs.is_forward_declaration)) {
+      type_sp = unique_ast_entry_type->m_type_sp;
       if (type_sp) {
         dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get();
         LinkDeclContextToDIE(
-            GetCachedClangDeclContextForDIE(unique_ast_entry_up->m_die), die);
+            GetCachedClangDeclContextForDIE(unique_ast_entry_type->m_die), die);
+        if (!attrs.is_forward_declaration) {
+          // If the DIE being parsed in this function is a definition and the
+          // entry in the map is a declaration, then we need to update the entry
+          // to point to the definition DIE.
+          if (unique_ast_entry_type->m_is_forward_declaration) {
+            unique_ast_entry_type->m_die = die;
+            unique_ast_entry_type->m_byte_size = byte_size;
+            unique_ast_entry_type->m_declaration = unique_decl;
+            unique_ast_entry_type->m_is_forward_declaration = false;
+            // Need to update Type ID to refer to the definition DIE. because
+            // it's used in ParseSubroutine to determine if we need to copy cxx
+            // method types from a declaration DIE to this definition DIE.
+            type_sp->SetID(die.GetID());
+            clang_type = type_sp->GetForwardCompilerType();
+            if (attrs.class_language != eLanguageTypeObjC &&
+                attrs.class_language != eLanguageTypeObjC_plus_plus)
+              TypeSystemClang::StartTagDeclarationDefinition(clang_type);
+
+            CompilerType compiler_type_no_qualifiers =
+                ClangUtil::RemoveFastQualifiers(clang_type);
+            auto result = dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace(
+                compiler_type_no_qualifiers.GetOpaqueQualType(),
+                *die.GetDIERef());
+            if (!result.second)
+              result.first->second = *die.GetDIERef();
+          }
+        }
         return type_sp;
       }
     }
@@ -1692,126 +1840,21 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
     default_accessibility = eAccessPrivate;
   }
 
-  if (attrs.byte_size && *attrs.byte_size == 0 && attrs.name &&
-      !die.HasChildren() && cu_language == eLanguageTypeObjC) {
-    // Work around an issue with clang at the moment where forward
-    // declarations for objective C classes are emitted as:
-    //  DW_TAG_structure_type [2]
-    //  DW_AT_name( "ForwardObjcClass" )
-    //  DW_AT_byte_size( 0x00 )
-    //  DW_AT_decl_file( "..." )
-    //  DW_AT_decl_line( 1 )
-    //
-    // Note that there is no DW_AT_declaration and there are no children,
-    // and the byte size is zero.
-    attrs.is_forward_declaration = true;
-  }
-
-  if (attrs.class_language == eLanguageTypeObjC ||
-      attrs.class_language == eLanguageTypeObjC_plus_plus) {
-    if (!attrs.is_complete_objc_class &&
-        die.Supports_DW_AT_APPLE_objc_complete_type()) {
-      // We have a valid eSymbolTypeObjCClass class symbol whose name
-      // matches the current objective C class that we are trying to find
-      // and this DIE isn't the complete definition (we checked
-      // is_complete_objc_class above and know it is false), so the real
-      // definition is in here somewhere
-      type_sp =
-          dwarf->FindCompleteObjCDefinitionTypeForDIE(die, attrs.name, true);
-
-      if (!type_sp) {
-        SymbolFileDWARFDebugMap *debug_map_symfile =
-            dwarf->GetDebugMapSymfile();
-        if (debug_map_symfile) {
-          // We weren't able to find a full declaration in this DWARF,
-          // see if we have a declaration anywhere else...
-          type_sp = debug_map_symfile->FindCompleteObjCDefinitionTypeForDIE(
-              die, attrs.name, true);
-        }
-      }
-
-      if (type_sp) {
-        if (log) {
-          dwarf->GetObjectFile()->GetModule()->LogMessage(
-              log,
-              "SymbolFileDWARF({0:p}) - {1:x16}: {2} type "
-              "\"{3}\" is an "
-              "incomplete objc type, complete type is {4:x8}",
-              static_cast<void *>(this), die.GetOffset(),
-              DW_TAG_value_to_name(tag), attrs.name.GetCString(),
-              type_sp->GetID());
-        }
-
-        // We found a real definition for this type elsewhere so lets use
-        // it and cache the fact that we found a complete type for this
-        // die
-        dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get();
-        return type_sp;
-      }
-    }
-  }
-
   if (attrs.is_forward_declaration) {
-    // We have a forward declaration to a type and we need to try and
-    // find a full declaration. We look in the current type index just in
-    // case we have a forward declaration followed by an actual
-    // declarations in the DWARF. If this fails, we need to look
-    // elsewhere...
-    if (log) {
-      dwarf->GetObjectFile()->GetModule()->LogMessage(
-          log,
-          "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a "
-          "forward declaration, trying to find complete type",
-          static_cast<void *>(this), die.GetOffset(), DW_TAG_value_to_name(tag),
-          attrs.name.GetCString());
-    }
-
     // See if the type comes from a Clang module and if so, track down
     // that type.
     type_sp = ParseTypeFromClangModule(sc, die, log);
     if (type_sp)
       return type_sp;
-
-    // type_sp = FindDefinitionTypeForDIE (dwarf_cu, die,
-    // type_name_const_str);
-    type_sp = dwarf->FindDefinitionTypeForDWARFDeclContext(die);
-
-    if (!type_sp) {
-      SymbolFileDWARFDebugMap *debug_map_symfile = dwarf->GetDebugMapSymfile();
-      if (debug_map_symfile) {
-        // We weren't able to find a full declaration in this DWARF, see
-        // if we have a declaration anywhere else...
-        type_sp = debug_map_symfile->FindDefinitionTypeForDWARFDeclContext(die);
-      }
-    }
-
-    if (type_sp) {
-      if (log) {
-        dwarf->GetObjectFile()->GetModule()->LogMessage(
-            log,
-            "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a "
-            "forward declaration, complete type is {4:x8}",
-            static_cast<void *>(this), die.GetOffset(),
-            DW_TAG_value_to_name(tag), attrs.name.GetCString(),
-            type_sp->GetID());
-      }
-
-      // We found a real definition for this type elsewhere so lets use
-      // it and cache the fact that we found a complete type for this die
-      dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get();
-      clang::DeclContext *defn_decl_ctx =
-          GetCachedClangDeclContextForDIE(dwarf->GetDIE(type_sp->GetID()));
-      if (defn_decl_ctx)
-        LinkDeclContextToDIE(defn_decl_ctx, die);
-      return type_sp;
-    }
   }
+
   assert(tag_decl_kind != -1);
   UNUSED_IF_ASSERT_DISABLED(tag_decl_kind);
-  bool clang_type_was_created = false;
-  clang::DeclContext *decl_ctx = GetClangDeclContextContainingDIE(die, nullptr);
+  DWARFDIE decl_ctx_die;
+  clang::DeclContext *decl_ctx =
+      GetClangDeclContextContainingDIE(die, &decl_ctx_die);
 
-  PrepareContextToReceiveMembers(m_ast, GetClangASTImporter(), decl_ctx, die,
+  PrepareContextToReceiveMembers(decl_ctx, decl_ctx_die, die,
                                  attrs.name.GetCString());
 
   if (attrs.accessibility == eAccessNone && decl_ctx) {
@@ -1836,10 +1879,10 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
       if (log) {
         dwarf->GetObjectFile()->GetModule()->LogMessage(
             log,
-            "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" "
+            "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" "
             "clang::ClassTemplateDecl failed to return a decl.",
             static_cast<void *>(this), die.GetOffset(),
-            DW_TAG_value_to_name(tag), attrs.name.GetCString());
+            DW_TAG_value_to_name(tag), tag, attrs.name.GetCString());
       }
       return TypeSP();
     }
@@ -1850,20 +1893,17 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
             tag_decl_kind, template_param_infos);
     clang_type =
         m_ast.CreateClassTemplateSpecializationType(class_specialization_decl);
-    clang_type_was_created = true;
 
     m_ast.SetMetadata(class_template_decl, metadata);
     m_ast.SetMetadata(class_specialization_decl, metadata);
   }
 
-  if (!clang_type_was_created) {
-    clang_type_was_created = true;
+  if (!clang_type) {
     clang_type = m_ast.CreateRecordType(
         decl_ctx, GetOwningClangModule(die), attrs.accessibility,
         attrs.name.GetCString(), tag_decl_kind, attrs.class_language, &metadata,
         attrs.exports_symbols);
   }
-
   // Store a forward declaration to this class type in case any
   // parameters in any class methods need it for the clang types for
   // function prototypes.
@@ -1874,13 +1914,19 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
       Type::ResolveState::Forward,
       TypePayloadClang(OptionalClangModuleID(), attrs.is_complete_objc_class));
 
+  // UniqueDWARFASTType is large, so don't create a local variables on the
+  // stack, put it on the heap. This function is often called recursively and
+  // clang isn't good at sharing the stack space for variables in different
+  // blocks.
+  auto unique_ast_entry_up = std::make_unique<UniqueDWARFASTType>();
   // Add our type to the unique type map so we don't end up creating many
   // copies of the same type over and over in the ASTContext for our
   // module
   unique_ast_entry_up->m_type_sp = type_sp;
   unique_ast_entry_up->m_die = die;
   unique_ast_entry_up->m_declaration = unique_decl;
-  unique_ast_entry_up->m_byte_size = attrs.byte_size.value_or(0);
+  unique_ast_entry_up->m_byte_size = byte_size;
+  unique_ast_entry_up->m_is_forward_declaration = attrs.is_forward_declaration;
   dwarf->GetUniqueDWARFASTTypeMap().Insert(unique_typename,
                                            *unique_ast_entry_up);
 
@@ -1921,7 +1967,7 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
           GetClangASTImporter().SetRecordLayout(record_decl, layout);
         }
       }
-    } else if (clang_type_was_created) {
+    } else {
       // Start the definition if the class is not objective C since the
       // underlying decls respond to isCompleteDefinition(). Objective
       // C decls don't respond to isCompleteDefinition() so we can't
@@ -1933,26 +1979,21 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
       if (attrs.class_language != eLanguageTypeObjC &&
           attrs.class_language != eLanguageTypeObjC_plus_plus)
         TypeSystemClang::StartTagDeclarationDefinition(clang_type);
-
-      // Leave this as a forward declaration until we need to know the
-      // details of the type. lldb_private::Type will automatically call
-      // the SymbolFile virtual function
-      // "SymbolFileDWARF::CompleteType(Type *)" When the definition
-      // needs to be defined.
-      assert(!dwarf->GetForwardDeclCompilerTypeToDIE().count(
-                 ClangUtil::RemoveFastQualifiers(clang_type)
-                     .GetOpaqueQualType()) &&
-             "Type already in the forward declaration map!");
-      // Can't assume m_ast.GetSymbolFile() is actually a
-      // SymbolFileDWARF, it can be a SymbolFileDWARFDebugMap for Apple
-      // binaries.
-      dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace(
-          ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType(),
-          *die.GetDIERef());
-      m_ast.SetHasExternalStorage(clang_type.GetOpaqueQualType(), true);
     }
   }
 
+  // If this is a declaration DIE, leave this as a forward declaration until we
+  // need to know the details of the type. lldb_private::Type will automatically
+  // call the SymbolFile virtual function "SymbolFileDWARF::CompleteType(Type
+  // *)" When the definition needs to be defined.
+  assert(!dwarf->GetForwardDeclCompilerTypeToDIE().count(
+             ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType()) &&
+         "Type already in the forward declaration map!");
+  dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace(
+      ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType(),
+      *die.GetDIERef());
+  m_ast.SetHasExternalStorage(clang_type.GetOpaqueQualType(), true);
+
   // If we made a clang type, set the trivial abi if applicable: We only
   // do this for pass by value - which implies the Trivial ABI. There
   // isn't a way to assert that something that would normally be pass by
@@ -2304,6 +2345,11 @@ bool DWARFASTParserClang::CompleteTypeFromDWARF(const DWARFDIE &die,
 
   if (!die)
     return false;
+  ParsedDWARFTypeAttributes attrs(die);
+  bool is_forward_declaration = IsForwardDeclaration(
+      die, attrs, SymbolFileDWARF::GetLanguage(*die.GetCU()));
+  if (is_forward_declaration)
+    return false;
 
   const dw_tag_t tag = die.Tag();
 
@@ -3018,11 +3064,11 @@ void DWARFASTParserClang::ParseSingleMember(
               this_field_info.bit_offset)))) {
       ObjectFile *objfile = die.GetDWARF()->GetObjectFile();
       objfile->GetModule()->ReportWarning(
-          "{0:x16}: {1} bitfield named \"{2}\" has invalid "
-          "bit offset ({3:x8}) member will be ignored. Please file a bug "
+          "{0:x16}: {1} ({2}) bitfield named \"{3}\" has invalid "
+          "bit offset ({4:x8}) member will be ignored. Please file a bug "
           "against the "
-          "compiler and include the preprocessed output for {4}\n",
-          die.GetID(), DW_TAG_value_to_name(tag), attrs.name,
+          "compiler and include the preprocessed output for {5}\n",
+          die.GetID(), DW_TAG_value_to_name(tag), tag, attrs.name,
           this_field_info.bit_offset, GetUnitName(parent_die).c_str());
       return;
     }
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
index 8d4af203bb28..853b8ccc3036 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
@@ -42,40 +42,40 @@ struct ParsedDWARFTypeAttributes;
 
 class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser {
 public:
+  typedef lldb_private::plugin::dwarf::DWARFDIE DWARFDIE;
+
   DWARFASTParserClang(lldb_private::TypeSystemClang &ast);
 
   ~DWARFASTParserClang() override;
 
   // DWARFASTParser interface.
-  lldb::TypeSP
-  ParseTypeFromDWARF(const lldb_private::SymbolContext &sc,
-                     const lldb_private::plugin::dwarf::DWARFDIE &die,
-                     bool *type_is_new_ptr) override;
+  lldb::TypeSP ParseTypeFromDWARF(const lldb_private::SymbolContext &sc,
+                                  const DWARFDIE &die,
+                                  bool *type_is_new_ptr) override;
 
-  lldb_private::ConstString ConstructDemangledNameFromDWARF(
-      const lldb_private::plugin::dwarf::DWARFDIE &die) override;
+  lldb_private::ConstString
+  ConstructDemangledNameFromDWARF(const DWARFDIE &die) override;
 
   lldb_private::Function *
   ParseFunctionFromDWARF(lldb_private::CompileUnit &comp_unit,
-                         const lldb_private::plugin::dwarf::DWARFDIE &die,
+                         const DWARFDIE &die,
                          const lldb_private::AddressRange &func_range) override;
 
   bool
-  CompleteTypeFromDWARF(const lldb_private::plugin::dwarf::DWARFDIE &die,
-                        lldb_private::Type *type,
+  CompleteTypeFromDWARF(const DWARFDIE &die, lldb_private::Type *type,
                         lldb_private::CompilerType &compiler_type) override;
 
-  lldb_private::CompilerDecl GetDeclForUIDFromDWARF(
-      const lldb_private::plugin::dwarf::DWARFDIE &die) override;
+  lldb_private::CompilerDecl
+  GetDeclForUIDFromDWARF(const DWARFDIE &die) override;
 
   void EnsureAllDIEsInDeclContextHaveBeenParsed(
       lldb_private::CompilerDeclContext decl_context) override;
 
-  lldb_private::CompilerDeclContext GetDeclContextForUIDFromDWARF(
-      const lldb_private::plugin::dwarf::DWARFDIE &die) override;
+  lldb_private::CompilerDeclContext
+  GetDeclContextForUIDFromDWARF(const DWARFDIE &die) override;
 
-  lldb_private::CompilerDeclContext GetDeclContextContainingUIDFromDWARF(
-      const lldb_private::plugin::dwarf::DWARFDIE &die) override;
+  lldb_private::CompilerDeclContext
+  GetDeclContextContainingUIDFromDWARF(const DWARFDIE &die) override;
 
   lldb_private::ClangASTImporter &GetClangASTImporter();
 
@@ -105,8 +105,13 @@ public:
   /// \return A string, including surrounding '<>', of the template parameters.
   /// If the DIE's name already has '<>', returns an empty ConstString because
   /// it's assumed that the caller is using the DIE name anyway.
-  lldb_private::ConstString GetDIEClassTemplateParams(
-      const lldb_private::plugin::dwarf::DWARFDIE &die) override;
+  lldb_private::ConstString
+  GetDIEClassTemplateParams(const DWARFDIE &die) override;
+
+  // Searching for definition DIE for the given DIE and return the type
+  // associated with the definition DIE, or nullptr if definition DIE is not
+  // found.
+  lldb_private::Type *FindDefinitionTypeForDIE(const DWARFDIE &die) override;
 
 protected:
   /// Protected typedefs and members.
@@ -118,8 +123,7 @@ protected:
       const lldb_private::plugin::dwarf::DWARFDebugInfoEntry *,
       clang::DeclContext *>
       DIEToDeclContextMap;
-  typedef std::multimap<const clang::DeclContext *,
-                        const lldb_private::plugin::dwarf::DWARFDIE>
+  typedef std::multimap<const clang::DeclContext *, const DWARFDIE>
       DeclContextToDIEMap;
   typedef llvm::DenseMap<
       const lldb_private::plugin::dwarf::DWARFDebugInfoEntry *,
@@ -137,14 +141,11 @@ protected:
   std::unique_ptr<lldb_private::ClangASTImporter> m_clang_ast_importer_up;
   /// @}
 
-  clang::DeclContext *
-  GetDeclContextForBlock(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::DeclContext *GetDeclContextForBlock(const DWARFDIE &die);
 
-  clang::BlockDecl *
-  ResolveBlockDIE(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::BlockDecl *ResolveBlockDIE(const DWARFDIE &die);
 
-  clang::NamespaceDecl *
-  ResolveNamespaceDIE(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::NamespaceDecl *ResolveNamespaceDIE(const DWARFDIE &die);
 
   /// Returns the namespace decl that a DW_TAG_imported_declaration imports.
   ///
@@ -155,96 +156,86 @@ protected:
   ///          'die' imports. If the imported entity is not a namespace
   ///          or another import declaration, returns nullptr. If an error
   ///          occurs, returns nullptr.
-  clang::NamespaceDecl *ResolveImportedDeclarationDIE(
-      const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::NamespaceDecl *ResolveImportedDeclarationDIE(const DWARFDIE &die);
 
-  bool ParseTemplateDIE(const lldb_private::plugin::dwarf::DWARFDIE &die,
+  bool ParseTemplateDIE(const DWARFDIE &die,
                         lldb_private::TypeSystemClang::TemplateParameterInfos
                             &template_param_infos);
 
   bool ParseTemplateParameterInfos(
-      const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
+      const DWARFDIE &parent_die,
       lldb_private::TypeSystemClang::TemplateParameterInfos
           &template_param_infos);
 
-  std::string
-  GetCPlusPlusQualifiedName(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  std::string GetCPlusPlusQualifiedName(const DWARFDIE &die);
 
   bool ParseChildMembers(
-      const lldb_private::plugin::dwarf::DWARFDIE &die,
-      lldb_private::CompilerType &class_compiler_type,
+      const DWARFDIE &die, lldb_private::CompilerType &class_compiler_type,
       std::vector<std::unique_ptr<clang::CXXBaseSpecifier>> &base_classes,
-      std::vector<lldb_private::plugin::dwarf::DWARFDIE> &member_function_dies,
-      std::vector<lldb_private::plugin::dwarf::DWARFDIE> &contained_type_dies,
+      std::vector<DWARFDIE> &member_function_dies,
+      std::vector<DWARFDIE> &contained_type_dies,
       DelayedPropertyList &delayed_properties,
       const lldb::AccessType default_accessibility,
       lldb_private::ClangASTImporter::LayoutInfo &layout_info);
 
   size_t
   ParseChildParameters(clang::DeclContext *containing_decl_ctx,
-                       const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
-                       bool skip_artificial, bool &is_static, bool &is_variadic,
+                       const DWARFDIE &parent_die, bool skip_artificial,
+                       bool &is_static, bool &is_variadic,
                        bool &has_template_params,
                        std::vector<lldb_private::CompilerType> &function_args,
                        std::vector<clang::ParmVarDecl *> &function_param_decls,
                        unsigned &type_quals);
 
-  size_t ParseChildEnumerators(
-      lldb_private::CompilerType &compiler_type, bool is_signed,
-      uint32_t enumerator_byte_size,
-      const lldb_private::plugin::dwarf::DWARFDIE &parent_die);
+  size_t ParseChildEnumerators(lldb_private::CompilerType &compiler_type,
+                               bool is_signed, uint32_t enumerator_byte_size,
+                               const DWARFDIE &parent_die);
 
   /// Parse a structure, class, or union type DIE.
-  lldb::TypeSP
-  ParseStructureLikeDIE(const lldb_private::SymbolContext &sc,
-                        const lldb_private::plugin::dwarf::DWARFDIE &die,
-                        ParsedDWARFTypeAttributes &attrs);
+  lldb::TypeSP ParseStructureLikeDIE(const lldb_private::SymbolContext &sc,
+                                     const DWARFDIE &die,
+                                     ParsedDWARFTypeAttributes &attrs);
 
-  clang::Decl *
-  GetClangDeclForDIE(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::Decl *GetClangDeclForDIE(const DWARFDIE &die);
 
-  clang::DeclContext *
-  GetClangDeclContextForDIE(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::DeclContext *GetClangDeclContextForDIE(const DWARFDIE &die);
 
-  clang::DeclContext *GetClangDeclContextContainingDIE(
-      const lldb_private::plugin::dwarf::DWARFDIE &die,
-      lldb_private::plugin::dwarf::DWARFDIE *decl_ctx_die);
-  lldb_private::OptionalClangModuleID
-  GetOwningClangModule(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::DeclContext *GetClangDeclContextContainingDIE(const DWARFDIE &die,
+                                                       DWARFDIE *decl_ctx_die);
+  lldb_private::OptionalClangModuleID GetOwningClangModule(const DWARFDIE &die);
 
-  bool CopyUniqueClassMethodTypes(
-      const lldb_private::plugin::dwarf::DWARFDIE &src_class_die,
-      const lldb_private::plugin::dwarf::DWARFDIE &dst_class_die,
-      lldb_private::Type *class_type,
-      std::vector<lldb_private::plugin::dwarf::DWARFDIE> &failures);
+  bool CopyUniqueClassMethodTypes(const DWARFDIE &src_class_die,
+                                  const DWARFDIE &dst_class_die,
+                                  lldb_private::Type *class_type,
+                                  std::vector<DWARFDIE> &failures);
 
-  clang::DeclContext *GetCachedClangDeclContextForDIE(
-      const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::DeclContext *GetCachedClangDeclContextForDIE(const DWARFDIE &die);
 
-  void LinkDeclContextToDIE(clang::DeclContext *decl_ctx,
-                            const lldb_private::plugin::dwarf::DWARFDIE &die);
+  void LinkDeclContextToDIE(clang::DeclContext *decl_ctx, const DWARFDIE &die);
 
-  void LinkDeclToDIE(clang::Decl *decl,
-                     const lldb_private::plugin::dwarf::DWARFDIE &die);
+  void LinkDeclToDIE(clang::Decl *decl, const DWARFDIE &die);
 
   /// If \p type_sp is valid, calculate and set its symbol context scope, and
   /// update the type list for its backing symbol file.
   ///
   /// Returns \p type_sp.
-  lldb::TypeSP UpdateSymbolContextScopeForType(
-      const lldb_private::SymbolContext &sc,
-      const lldb_private::plugin::dwarf::DWARFDIE &die, lldb::TypeSP type_sp);
+  lldb::TypeSP
+  UpdateSymbolContextScopeForType(const lldb_private::SymbolContext &sc,
+                                  const DWARFDIE &die, lldb::TypeSP type_sp);
 
   /// Follow Clang Module Skeleton CU references to find a type definition.
-  lldb::TypeSP
-  ParseTypeFromClangModule(const lldb_private::SymbolContext &sc,
-                           const lldb_private::plugin::dwarf::DWARFDIE &die,
-                           lldb_private::Log *log);
+  lldb::TypeSP ParseTypeFromClangModule(const lldb_private::SymbolContext &sc,
+                                        const DWARFDIE &die,
+                                        lldb_private::Log *log);
 
   // Return true if this type is a declaration to a type in an external
   // module.
-  lldb::ModuleSP
-  GetModuleForType(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  lldb::ModuleSP GetModuleForType(const DWARFDIE &die);
+
+  void PrepareContextToReceiveMembers(clang::DeclContext *decl_ctx,
+                                      const DWARFDIE &decl_ctx_die,
+                                      const DWARFDIE &die,
+                                      const char *type_name_cstr);
 
   static bool classof(const DWARFASTParser *Parser) {
     return Parser->GetKind() == Kind::DWARFASTParserClang;
@@ -274,10 +265,8 @@ private:
 
   /// Parsed form of all attributes that are relevant for parsing type members.
   struct MemberAttributes {
-    explicit MemberAttributes(
-        const lldb_private::plugin::dwarf::DWARFDIE &die,
-        const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
-        lldb::ModuleSP module_sp);
+    explicit MemberAttributes(const DWARFDIE &die, const DWARFDIE &parent_die,
+                              lldb::ModuleSP module_sp);
     const char *name = nullptr;
     /// Indicates how many bits into the word (according to the host endianness)
     /// the low-order bit of the field starts. Can be negative.
@@ -324,15 +313,12 @@ private:
   /// created property.
   /// \param delayed_properties The list of delayed properties that the result
   /// will be appended to.
-  void
-  ParseObjCProperty(const lldb_private::plugin::dwarf::DWARFDIE &die,
-                    const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
-                    const lldb_private::CompilerType &class_clang_type,
-                    DelayedPropertyList &delayed_properties);
+  void ParseObjCProperty(const DWARFDIE &die, const DWARFDIE &parent_die,
+                         const lldb_private::CompilerType &class_clang_type,
+                         DelayedPropertyList &delayed_properties);
 
   void
-  ParseSingleMember(const lldb_private::plugin::dwarf::DWARFDIE &die,
-                    const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
+  ParseSingleMember(const DWARFDIE &die, const DWARFDIE &parent_die,
                     const lldb_private::CompilerType &class_clang_type,
                     lldb::AccessType default_accessibility,
                     lldb_private::ClangASTImporter::LayoutInfo &layout_info,
@@ -350,31 +336,25 @@ private:
   /// \param[in] class_clang_type The parent RecordType of the static
   ///                             member this function will create.
   void CreateStaticMemberVariable(
-      const lldb_private::plugin::dwarf::DWARFDIE &die,
-      const MemberAttributes &attrs,
+      const DWARFDIE &die, const MemberAttributes &attrs,
       const lldb_private::CompilerType &class_clang_type);
 
-  bool CompleteRecordType(const lldb_private::plugin::dwarf::DWARFDIE &die,
-                          lldb_private::Type *type,
+  bool CompleteRecordType(const DWARFDIE &die, lldb_private::Type *type,
                           lldb_private::CompilerType &clang_type);
-  bool CompleteEnumType(const lldb_private::plugin::dwarf::DWARFDIE &die,
-                        lldb_private::Type *type,
+  bool CompleteEnumType(const DWARFDIE &die, lldb_private::Type *type,
                         lldb_private::CompilerType &clang_type);
 
-  lldb::TypeSP
-  ParseTypeModifier(const lldb_private::SymbolContext &sc,
-                    const lldb_private::plugin::dwarf::DWARFDIE &die,
-                    ParsedDWARFTypeAttributes &attrs);
+  lldb::TypeSP ParseTypeModifier(const lldb_private::SymbolContext &sc,
+                                 const DWARFDIE &die,
+                                 ParsedDWARFTypeAttributes &attrs);
   lldb::TypeSP ParseEnum(const lldb_private::SymbolContext &sc,
-                         const lldb_private::plugin::dwarf::DWARFDIE &die,
-                         ParsedDWARFTypeAttributes &attrs);
-  lldb::TypeSP ParseSubroutine(const lldb_private::plugin::dwarf::DWARFDIE &die,
+                         const DWARFDIE &die, ParsedDWARFTypeAttributes &attrs);
+  lldb::TypeSP ParseSubroutine(const DWARFDIE &die,
                                const ParsedDWARFTypeAttributes &attrs);
-  lldb::TypeSP ParseArrayType(const lldb_private::plugin::dwarf::DWARFDIE &die,
+  lldb::TypeSP ParseArrayType(const DWARFDIE &die,
                               const ParsedDWARFTypeAttributes &attrs);
-  lldb::TypeSP
-  ParsePointerToMemberType(const lldb_private::plugin::dwarf::DWARFDIE &die,
-                           const ParsedDWARFTypeAttributes &attrs);
+  lldb::TypeSP ParsePointerToMemberType(const DWARFDIE &die,
+                                        const ParsedDWARFTypeAttributes &attrs);
 
   /// Parses a DW_TAG_inheritance DIE into a base/super class.
   ///
@@ -391,8 +371,7 @@ private:
   /// \param layout_info The layout information that will be updated for C++
   /// base classes with the base offset.
   void ParseInheritance(
-      const lldb_private::plugin::dwarf::DWARFDIE &die,
-      const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
+      const DWARFDIE &die, const DWARFDIE &parent_die,
       const lldb_private::CompilerType class_clang_type,
       const lldb::AccessType default_accessibility,
       const lldb::ModuleSP &module_sp,
@@ -409,8 +388,7 @@ private:
   /// \param layout_info The layout information that will be updated for
   //   base classes with the base offset
   void
-  ParseRustVariantPart(lldb_private::plugin::dwarf::DWARFDIE &die,
-                       const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
+  ParseRustVariantPart(DWARFDIE &die, const DWARFDIE &parent_die,
                        lldb_private::CompilerType &class_clang_type,
                        const lldb::AccessType default_accesibility,
                        lldb_private::ClangASTImporter::LayoutInfo &layout_info);
@@ -420,8 +398,9 @@ private:
 /// Some attributes are relevant for all kinds of types (declaration), while
 /// others are only meaningful to a specific type (is_virtual)
 struct ParsedDWARFTypeAttributes {
-  explicit ParsedDWARFTypeAttributes(
-      const lldb_private::plugin::dwarf::DWARFDIE &die);
+  typedef lldb_private::plugin::dwarf::DWARFDIE DWARFDIE;
+
+  explicit ParsedDWARFTypeAttributes(const DWARFDIE &die);
 
   lldb::AccessType accessibility = lldb::eAccessNone;
   bool is_artificial = false;
@@ -438,7 +417,7 @@ struct ParsedDWARFTypeAttributes {
   const char *mangled_name = nullptr;
   lldb_private::ConstString name;
   lldb_private::Declaration decl;
-  lldb_private::plugin::dwarf::DWARFDIE object_pointer;
+  DWARFDIE object_pointer;
   lldb_private::plugin::dwarf::DWARFFormValue abstract_origin;
   lldb_private::plugin::dwarf::DWARFFormValue containing_type;
   lldb_private::plugin::dwarf::DWARFFormValue signature;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.cpp
index 3a3b05acd26d..c2ebeed4c860 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.cpp
@@ -35,10 +35,6 @@ dw_tag_t DWARFBaseDIE::Tag() const {
     return llvm::dwarf::DW_TAG_null;
 }
 
-const char *DWARFBaseDIE::GetTagAsCString() const {
-  return DW_TAG_value_to_name(Tag());
-}
-
 const char *DWARFBaseDIE::GetAttributeValueAsString(const dw_attr_t attr,
                                                 const char *fail_value) const {
   if (IsValid())
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h
index 75c822703cd8..235343d22712 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFBaseDIE.h
@@ -85,8 +85,6 @@ public:
   // Accessing information about a DIE
   dw_tag_t Tag() const;
 
-  const char *GetTagAsCString() const;
-
   dw_offset_t GetOffset() const;
 
   // Get the LLDB user ID for this DIE. This is often just the DIE offset,
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.cpp
index 9a88aed85e97..2fb0c224bf8e 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.cpp
@@ -15,38 +15,12 @@
 namespace lldb_private::plugin {
 namespace dwarf {
 
-const char *DW_TAG_value_to_name(uint32_t val) {
-  static char invalid[100];
-
-  if (val == 0)
-    return "NULL";
+llvm::StringRef DW_TAG_value_to_name(dw_tag_t tag) {
+  static constexpr llvm::StringLiteral s_unknown_tag_name("<unknown DW_TAG>");
+  if (llvm::StringRef tag_name = llvm::dwarf::TagString(tag); !tag_name.empty())
+    return tag_name;
 
-  llvm::StringRef llvmstr = llvm::dwarf::TagString(val);
-  if (llvmstr.empty()) {
-    snprintf(invalid, sizeof(invalid), "Unknown DW_TAG constant: 0x%x", val);
-    return invalid;
-  }
-  return llvmstr.data();
-}
-
-const char *DW_AT_value_to_name(uint32_t val) {
-  static char invalid[100];
-  llvm::StringRef llvmstr = llvm::dwarf::AttributeString(val);
-  if (llvmstr.empty()) {
-    snprintf(invalid, sizeof(invalid), "Unknown DW_AT constant: 0x%x", val);
-    return invalid;
-  }
-  return llvmstr.data();
-}
-
-const char *DW_FORM_value_to_name(uint32_t val) {
-  static char invalid[100];
-  llvm::StringRef llvmstr = llvm::dwarf::FormEncodingString(val);
-  if (llvmstr.empty()) {
-    snprintf(invalid, sizeof(invalid), "Unknown DW_FORM constant: 0x%x", val);
-    return invalid;
-  }
-  return llvmstr.data();
+  return s_unknown_tag_name;
 }
 
 const char *DW_OP_value_to_name(uint32_t val) {
@@ -59,35 +33,5 @@ const char *DW_OP_value_to_name(uint32_t val) {
   return llvmstr.data();
 }
 
-const char *DW_ATE_value_to_name(uint32_t val) {
-  static char invalid[100];
-  llvm::StringRef llvmstr = llvm::dwarf::AttributeEncodingString(val);
-  if (llvmstr.empty()) {
-    snprintf(invalid, sizeof(invalid), "Unknown DW_ATE constant: 0x%x", val);
-    return invalid;
-  }
-  return llvmstr.data();
-}
-
-const char *DW_LANG_value_to_name(uint32_t val) {
-  static char invalid[100];
-  llvm::StringRef llvmstr = llvm::dwarf::LanguageString(val);
-  if (llvmstr.empty()) {
-    snprintf(invalid, sizeof(invalid), "Unknown DW_LANG constant: 0x%x", val);
-    return invalid;
-  }
-  return llvmstr.data();
-}
-
-const char *DW_LNS_value_to_name(uint32_t val) {
-  static char invalid[100];
-  llvm::StringRef llvmstr = llvm::dwarf::LNStandardString(val);
-  if (llvmstr.empty()) {
-    snprintf(invalid, sizeof(invalid), "Unknown DW_LNS constant: 0x%x", val);
-    return invalid;
-  }
-  return llvmstr.data();
-}
-
 } // namespace dwarf
 } // namespace lldb_private::plugin
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.h
index 3ed92cc203bf..be81cb0f5df1 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.h
@@ -15,22 +15,10 @@
 namespace lldb_private::plugin {
 namespace dwarf {
 
-typedef uint32_t DRC_class; // Holds DRC_* class bitfields
-
-const char *DW_TAG_value_to_name(uint32_t val);
-
-const char *DW_AT_value_to_name(uint32_t val);
-
-const char *DW_FORM_value_to_name(uint32_t val);
+llvm::StringRef DW_TAG_value_to_name(dw_tag_t tag);
 
 const char *DW_OP_value_to_name(uint32_t val);
 
-const char *DW_ATE_value_to_name(uint32_t val);
-
-const char *DW_LANG_value_to_name(uint32_t val);
-
-const char *DW_LNS_value_to_name(uint32_t val);
-
 } // namespace dwarf
 } // namespace lldb_private::plugin
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
index dabc595427df..3a57ec970b07 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
@@ -1062,6 +1062,7 @@ DWARFUnit::FindRnglistFromOffset(dw_offset_t offset) {
     ranges.Append(DWARFRangeList::Entry(llvm_range.LowPC,
                                         llvm_range.HighPC - llvm_range.LowPC));
   }
+  ranges.Sort();
   return ranges;
 }
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
index 49f13d2c89e3..5a07fd30fbf7 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
@@ -1555,8 +1555,10 @@ Type *SymbolFileDWARF::ResolveTypeUID(const DWARFDIE &die,
     Log *log = GetLog(DWARFLog::DebugInfo);
     if (log)
       GetObjectFile()->GetModule()->LogMessage(
-          log, "SymbolFileDWARF::ResolveTypeUID (die = {0:x16}) {1} '{2}'",
-          die.GetOffset(), die.GetTagAsCString(), die.GetName());
+          log,
+          "SymbolFileDWARF::ResolveTypeUID (die = {0:x16}) {1} ({2}) '{3}'",
+          die.GetOffset(), DW_TAG_value_to_name(die.Tag()), die.Tag(),
+          die.GetName());
 
     // We might be coming in in the middle of a type tree (a class within a
     // class, an enum within a class), so parse any needed parent DIEs before
@@ -1572,11 +1574,10 @@ Type *SymbolFileDWARF::ResolveTypeUID(const DWARFDIE &die,
           if (log)
             GetObjectFile()->GetModule()->LogMessage(
                 log,
-                "SymbolFileDWARF::ResolveTypeUID (die = {0:x16}) "
-                "{1} '{2}' "
-                "resolve parent forward type for {3:x16})",
-                die.GetOffset(), die.GetTagAsCString(), die.GetName(),
-                decl_ctx_die.GetOffset());
+                "SymbolFileDWARF::ResolveTypeUID (die = {0:x16}) {1} ({2}) "
+                "'{3}' resolve parent forward type for {4:x16})",
+                die.GetOffset(), DW_TAG_value_to_name(die.Tag()), die.Tag(),
+                die.GetName(), decl_ctx_die.GetOffset());
         } break;
 
         default:
@@ -1631,27 +1632,33 @@ bool SymbolFileDWARF::CompleteType(CompilerType &compiler_type) {
     return true;
   }
 
-  DWARFDIE dwarf_die = GetDIE(die_it->getSecond());
-  if (dwarf_die) {
-    // Once we start resolving this type, remove it from the forward
-    // declaration map in case anyone child members or other types require this
-    // type to get resolved. The type will get resolved when all of the calls
-    // to SymbolFileDWARF::ResolveClangOpaqueTypeDefinition are done.
-    GetForwardDeclCompilerTypeToDIE().erase(die_it);
-
-    Type *type = GetDIEToType().lookup(dwarf_die.GetDIE());
+  // Once we start resolving this type, remove it from the forward
+  // declaration map in case anyone's child members or other types require this
+  // type to get resolved.
+  DWARFDIE dwarf_die = GetDIE(die_it->second);
+  GetForwardDeclCompilerTypeToDIE().erase(die_it);
+  Type *type = nullptr;
+  if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU()))
+    type = dwarf_ast->FindDefinitionTypeForDIE(dwarf_die);
+  if (!type)
+    return false;
 
-    Log *log = GetLog(DWARFLog::DebugInfo | DWARFLog::TypeCompletion);
-    if (log)
-      GetObjectFile()->GetModule()->LogMessageVerboseBacktrace(
-          log, "{0:x8}: {1} '{2}' resolving forward declaration...",
-          dwarf_die.GetID(), dwarf_die.GetTagAsCString(),
-          type->GetName().AsCString());
-    assert(compiler_type);
-    if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU()))
-      return dwarf_ast->CompleteTypeFromDWARF(dwarf_die, type, compiler_type);
+  die_it = GetForwardDeclCompilerTypeToDIE().find(
+      compiler_type_no_qualifiers.GetOpaqueQualType());
+  if (die_it != GetForwardDeclCompilerTypeToDIE().end()) {
+    dwarf_die = GetDIE(die_it->getSecond());
+    GetForwardDeclCompilerTypeToDIE().erase(die_it);
   }
-  return false;
+
+  if (Log *log = GetLog(DWARFLog::DebugInfo | DWARFLog::TypeCompletion))
+    GetObjectFile()->GetModule()->LogMessageVerboseBacktrace(
+        log, "{0:x8}: {1} ({2}) '{3}' resolving forward declaration...",
+        dwarf_die.GetID(), DW_TAG_value_to_name(dwarf_die.Tag()),
+        dwarf_die.Tag(), type->GetName().AsCString());
+  assert(compiler_type);
+  if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU()))
+    return dwarf_ast->CompleteTypeFromDWARF(dwarf_die, type, compiler_type);
+  return true;
 }
 
 Type *SymbolFileDWARF::ResolveType(const DWARFDIE &die,
@@ -1665,8 +1672,9 @@ Type *SymbolFileDWARF::ResolveType(const DWARFDIE &die,
         return type;
 
       GetObjectFile()->GetModule()->ReportError(
-          "Parsing a die that is being parsed die: {0:x16}: {1} {2}",
-          die.GetOffset(), die.GetTagAsCString(), die.GetName());
+          "Parsing a die that is being parsed die: {0:x16}: {1} ({2}) {3}",
+          die.GetOffset(), DW_TAG_value_to_name(die.Tag()), die.Tag(),
+          die.GetName());
 
     } else
       return type;
@@ -3134,9 +3142,9 @@ SymbolFileDWARF::FindDefinitionTypeForDWARFDeclContext(const DWARFDIE &die) {
     if (log) {
       GetObjectFile()->GetModule()->LogMessage(
           log,
-          "SymbolFileDWARF::FindDefinitionTypeForDWARFDeclContext(tag={0}, "
-          "name='{1}')",
-          DW_TAG_value_to_name(tag), die.GetName());
+          "SymbolFileDWARF::FindDefinitionTypeForDWARFDeclContext(tag={0} "
+          "({1}), name='{2}')",
+          DW_TAG_value_to_name(tag), tag, die.GetName());
     }
 
     // Get the type system that we are looking to find a type for. We will
@@ -3184,10 +3192,10 @@ SymbolFileDWARF::FindDefinitionTypeForDWARFDeclContext(const DWARFDIE &die) {
           GetObjectFile()->GetModule()->LogMessage(
               log,
               "SymbolFileDWARF::"
-              "FindDefinitionTypeForDWARFDeclContext(tag={0}, "
-              "name='{1}') ignoring die={2:x16} ({3})",
-              DW_TAG_value_to_name(tag), die.GetName(), type_die.GetOffset(),
-              type_die.GetName());
+              "FindDefinitionTypeForDWARFDeclContext(tag={0} ({1}), "
+              "name='{2}') ignoring die={3:x16} ({4})",
+              DW_TAG_value_to_name(tag), tag, die.GetName(),
+              type_die.GetOffset(), type_die.GetName());
         }
         return true;
       }
@@ -3197,9 +3205,9 @@ SymbolFileDWARF::FindDefinitionTypeForDWARFDeclContext(const DWARFDIE &die) {
         GetObjectFile()->GetModule()->LogMessage(
             log,
             "SymbolFileDWARF::"
-            "FindDefinitionTypeForDWARFDeclContext(tag={0}, "
-            "name='{1}') trying die={2:x16} ({3})",
-            DW_TAG_value_to_name(tag), die.GetName(), type_die.GetOffset(),
+            "FindDefinitionTypeForDWARFDeclContext(tag={0} ({1}), name='{2}') "
+            "trying die={3:x16} ({4})",
+            DW_TAG_value_to_name(tag), tag, die.GetName(), type_die.GetOffset(),
             type_dwarf_decl_ctx.GetQualifiedName());
       }
 
@@ -3650,8 +3658,8 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc,
         StreamString strm;
         location->DumpLocation(&strm, eDescriptionLevelFull, nullptr);
         GetObjectFile()->GetModule()->ReportError(
-            "{0:x16}: {1} has an invalid location: {2}", die.GetOffset(),
-            die.GetTagAsCString(), strm.GetData());
+            "{0:x16}: {1} ({2}) has an invalid location: {3}", die.GetOffset(),
+            DW_TAG_value_to_name(die.Tag()), die.Tag(), strm.GetData());
       }
       if (location_DW_OP_addr != LLDB_INVALID_ADDRESS)
         is_static_lifetime = true;
@@ -3839,10 +3847,11 @@ void SymbolFileDWARF::ParseAndAppendGlobalVariable(
       variable_list_sp = sc.comp_unit->GetVariableList(false);
     } else {
       GetObjectFile()->GetModule()->ReportError(
-          "parent {0:x8} {1} with no valid compile unit in "
-          "symbol context for {2:x8} {3}.\n",
-          sc_parent_die.GetID(), sc_parent_die.GetTagAsCString(), die.GetID(),
-          die.GetTagAsCString());
+          "parent {0:x8} {1} ({2}) with no valid compile unit in "
+          "symbol context for {3:x8} {4} ({5}).\n",
+          sc_parent_die.GetID(), DW_TAG_value_to_name(sc_parent_die.Tag()),
+          sc_parent_die.Tag(), die.GetID(), DW_TAG_value_to_name(die.Tag()),
+          die.Tag());
       return;
     }
     break;
@@ -3850,8 +3859,8 @@ void SymbolFileDWARF::ParseAndAppendGlobalVariable(
   default:
     GetObjectFile()->GetModule()->ReportError(
         "didn't find appropriate parent DIE for variable list for {0:x8} "
-        "{1}.\n",
-        die.GetID(), die.GetTagAsCString());
+        "{1} ({2}).\n",
+        die.GetID(), DW_TAG_value_to_name(die.Tag()), die.Tag());
     return;
   }
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h
index 7282c08c6857..94aa810680c5 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h
@@ -533,8 +533,12 @@ protected:
   NameToOffsetMap m_function_scope_qualified_name_map;
   std::unique_ptr<DWARFDebugRanges> m_ranges;
   UniqueDWARFASTTypeMap m_unique_ast_type_map;
+  // A map from DIE to lldb_private::Type. For record type, the key might be
+  // either declaration DIE or definition DIE.
   DIEToTypePtr m_die_to_type;
   DIEToVariableSP m_die_to_variable_sp;
+  // A map from CompilerType to the struct/class/union/enum DIE (might be a
+  // declaration or a definition) that is used to construct it.
   CompilerTypeToDIE m_forward_decl_compiler_type_to_die;
   llvm::DenseMap<dw_offset_t, std::unique_ptr<SupportFileList>>
       m_type_unit_support_files;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp
index 223518f0ae82..4762356034ca 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp
@@ -13,66 +13,67 @@
 using namespace lldb_private::dwarf;
 using namespace lldb_private::plugin::dwarf;
 
-bool UniqueDWARFASTTypeList::Find(const DWARFDIE &die,
-                                  const lldb_private::Declaration &decl,
-                                  const int32_t byte_size,
-                                  UniqueDWARFASTType &entry) const {
-  for (const UniqueDWARFASTType &udt : m_collection) {
+UniqueDWARFASTType *UniqueDWARFASTTypeList::Find(
+    const DWARFDIE &die, const lldb_private::Declaration &decl,
+    const int32_t byte_size, bool is_forward_declaration) {
+  for (UniqueDWARFASTType &udt : m_collection) {
     // Make sure the tags match
     if (udt.m_die.Tag() == die.Tag()) {
-      // Validate byte sizes of both types only if both are valid.
-      if (udt.m_byte_size < 0 || byte_size < 0 ||
-          udt.m_byte_size == byte_size) {
-        // Make sure the file and line match
-        if (udt.m_declaration == decl) {
-          // The type has the same name, and was defined on the same file and
-          // line. Now verify all of the parent DIEs match.
-          DWARFDIE parent_arg_die = die.GetParent();
-          DWARFDIE parent_pos_die = udt.m_die.GetParent();
-          bool match = true;
-          bool done = false;
-          while (!done && match && parent_arg_die && parent_pos_die) {
-            const dw_tag_t parent_arg_tag = parent_arg_die.Tag();
-            const dw_tag_t parent_pos_tag = parent_pos_die.Tag();
-            if (parent_arg_tag == parent_pos_tag) {
-              switch (parent_arg_tag) {
-              case DW_TAG_class_type:
-              case DW_TAG_structure_type:
-              case DW_TAG_union_type:
-              case DW_TAG_namespace: {
-                const char *parent_arg_die_name = parent_arg_die.GetName();
-                if (parent_arg_die_name ==
-                    nullptr) // Anonymous (i.e. no-name) struct
-                {
-                  match = false;
-                } else {
-                  const char *parent_pos_die_name = parent_pos_die.GetName();
-                  if (parent_pos_die_name == nullptr ||
-                      ((parent_arg_die_name != parent_pos_die_name) &&
-                       strcmp(parent_arg_die_name, parent_pos_die_name)))
-                    match = false;
-                }
-              } break;
-
-              case DW_TAG_compile_unit:
-              case DW_TAG_partial_unit:
-                done = true;
-                break;
-              default:
-                break;
-              }
+      // If they are not both definition DIEs or both declaration DIEs, then
+      // don't check for byte size and declaration location, because declaration
+      // DIEs usually don't have those info.
+      bool matching_size_declaration =
+          udt.m_is_forward_declaration != is_forward_declaration
+              ? true
+              : (udt.m_byte_size < 0 || byte_size < 0 ||
+                 udt.m_byte_size == byte_size) &&
+                    udt.m_declaration == decl;
+      if (!matching_size_declaration)
+        continue;
+      // The type has the same name, and was defined on the same file and
+      // line. Now verify all of the parent DIEs match.
+      DWARFDIE parent_arg_die = die.GetParent();
+      DWARFDIE parent_pos_die = udt.m_die.GetParent();
+      bool match = true;
+      bool done = false;
+      while (!done && match && parent_arg_die && parent_pos_die) {
+        const dw_tag_t parent_arg_tag = parent_arg_die.Tag();
+        const dw_tag_t parent_pos_tag = parent_pos_die.Tag();
+        if (parent_arg_tag == parent_pos_tag) {
+          switch (parent_arg_tag) {
+          case DW_TAG_class_type:
+          case DW_TAG_structure_type:
+          case DW_TAG_union_type:
+          case DW_TAG_namespace: {
+            const char *parent_arg_die_name = parent_arg_die.GetName();
+            if (parent_arg_die_name == nullptr) {
+              // Anonymous (i.e. no-name) struct
+              match = false;
+            } else {
+              const char *parent_pos_die_name = parent_pos_die.GetName();
+              if (parent_pos_die_name == nullptr ||
+                  ((parent_arg_die_name != parent_pos_die_name) &&
+                   strcmp(parent_arg_die_name, parent_pos_die_name)))
+                match = false;
             }
-            parent_arg_die = parent_arg_die.GetParent();
-            parent_pos_die = parent_pos_die.GetParent();
-          }
+          } break;
 
-          if (match) {
-            entry = udt;
-            return true;
+          case DW_TAG_compile_unit:
+          case DW_TAG_partial_unit:
+            done = true;
+            break;
+          default:
+            break;
           }
         }
+        parent_arg_die = parent_arg_die.GetParent();
+        parent_pos_die = parent_pos_die.GetParent();
+      }
+
+      if (match) {
+        return &udt;
       }
     }
   }
-  return false;
+  return nullptr;
 }
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h
index bf3cbae55e5c..29e5c02dcbe1 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h
@@ -23,31 +23,19 @@ public:
   // Constructors and Destructors
   UniqueDWARFASTType() : m_type_sp(), m_die(), m_declaration() {}
 
-  UniqueDWARFASTType(lldb::TypeSP &type_sp, const DWARFDIE &die,
-                     const Declaration &decl, int32_t byte_size)
-      : m_type_sp(type_sp), m_die(die), m_declaration(decl),
-        m_byte_size(byte_size) {}
-
   UniqueDWARFASTType(const UniqueDWARFASTType &rhs)
       : m_type_sp(rhs.m_type_sp), m_die(rhs.m_die),
-        m_declaration(rhs.m_declaration), m_byte_size(rhs.m_byte_size) {}
+        m_declaration(rhs.m_declaration), m_byte_size(rhs.m_byte_size),
+        m_is_forward_declaration(rhs.m_is_forward_declaration) {}
 
   ~UniqueDWARFASTType() = default;
 
-  UniqueDWARFASTType &operator=(const UniqueDWARFASTType &rhs) {
-    if (this != &rhs) {
-      m_type_sp = rhs.m_type_sp;
-      m_die = rhs.m_die;
-      m_declaration = rhs.m_declaration;
-      m_byte_size = rhs.m_byte_size;
-    }
-    return *this;
-  }
-
   lldb::TypeSP m_type_sp;
   DWARFDIE m_die;
   Declaration m_declaration;
   int32_t m_byte_size = -1;
+  // True if the m_die is a forward declaration DIE.
+  bool m_is_forward_declaration = true;
 };
 
 class UniqueDWARFASTTypeList {
@@ -62,8 +50,9 @@ public:
     m_collection.push_back(entry);
   }
 
-  bool Find(const DWARFDIE &die, const Declaration &decl,
-            const int32_t byte_size, UniqueDWARFASTType &entry) const;
+  UniqueDWARFASTType *Find(const DWARFDIE &die, const Declaration &decl,
+                           const int32_t byte_size,
+                           bool is_forward_declaration);
 
 protected:
   typedef std::vector<UniqueDWARFASTType> collection;
@@ -80,14 +69,15 @@ public:
     m_collection[name.GetCString()].Append(entry);
   }
 
-  bool Find(ConstString name, const DWARFDIE &die, const Declaration &decl,
-            const int32_t byte_size, UniqueDWARFASTType &entry) const {
+  UniqueDWARFASTType *Find(ConstString name, const DWARFDIE &die,
+                           const Declaration &decl, const int32_t byte_size,
+                           bool is_forward_declaration) {
     const char *unique_name_cstr = name.GetCString();
-    collection::const_iterator pos = m_collection.find(unique_name_cstr);
+    collection::iterator pos = m_collection.find(unique_name_cstr);
     if (pos != m_collection.end()) {
-      return pos->second.Find(die, decl, byte_size, entry);
+      return pos->second.Find(die, decl, byte_size, is_forward_declaration);
     }
-    return false;
+    return nullptr;
   }
 
 protected:
diff --git a/lldb/source/Plugins/SymbolLocator/Default/SymbolLocatorDefault.cpp b/lldb/source/Plugins/SymbolLocator/Default/SymbolLocatorDefault.cpp
index edb1d59cf42f..919f26ba7012 100644
--- a/lldb/source/Plugins/SymbolLocator/Default/SymbolLocatorDefault.cpp
+++ b/lldb/source/Plugins/SymbolLocator/Default/SymbolLocatorDefault.cpp
@@ -157,7 +157,7 @@ std::optional<FileSpec> SymbolLocatorDefault::LocateExecutableSymbolFile(
       mib[1] = USER_LOCALBASE;
       if (::sysctl(mib, 2, buf, &len, NULL, 0) == 0) {
         FileSpec file_spec("/lib/debug");
-        file_spec.PrependPathComponent(StringRef(buf));
+        file_spec.PrependPathComponent(llvm::StringRef(buf));
         FileSystem::Instance().Resolve(file_spec);
         debug_file_search_paths.AppendIfUnique(file_spec);
       }
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 3bdb288e97dd..d0033fcd9cdf 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -4996,6 +4996,9 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type,
 
     case clang::BuiltinType::IncompleteMatrixIdx:
       break;
+
+    case clang::BuiltinType::UnresolvedTemplate:
+      break;
     }
     break;
   // All pointer types are represented as unsigned integer encodings. We may
@@ -7103,6 +7106,8 @@ TypeSystemClang::GetDirectNestedTypeWithName(lldb::opaque_compiler_type_t type,
     for (NamedDecl *decl : record_decl->lookup(decl_name)) {
       if (auto *tag_decl = dyn_cast<clang::TagDecl>(decl))
         return GetType(getASTContext().getTagDeclType(tag_decl));
+      if (auto *typedef_decl = dyn_cast<clang::TypedefNameDecl>(decl))
+        return GetType(getASTContext().getTypedefType(typedef_decl));
     }
     break;
   }
diff --git a/lldb/source/Symbol/SymbolContext.cpp b/lldb/source/Symbol/SymbolContext.cpp
index f368896fbad4..8f26e41d1920 100644
--- a/lldb/source/Symbol/SymbolContext.cpp
+++ b/lldb/source/Symbol/SymbolContext.cpp
@@ -73,6 +73,7 @@ bool SymbolContext::DumpStopContext(
     Stream *s, ExecutionContextScope *exe_scope, const Address &addr,
     bool show_fullpaths, bool show_module, bool show_inlined_frames,
     bool show_function_arguments, bool show_function_name,
+    bool show_function_display_name,
     std::optional<Stream::HighlightSettings> settings) const {
   bool dumped_something = false;
   if (show_module && module_sp) {
@@ -93,6 +94,8 @@ bool SymbolContext::DumpStopContext(
       ConstString name;
       if (!show_function_arguments)
         name = function->GetNameNoArguments();
+      if (!name && show_function_display_name)
+        name = function->GetDisplayName();
       if (!name)
         name = function->GetName();
       if (name)
@@ -146,7 +149,8 @@ bool SymbolContext::DumpStopContext(
         const bool show_function_name = true;
         return inline_parent_sc.DumpStopContext(
             s, exe_scope, inline_parent_addr, show_fullpaths, show_module,
-            show_inlined_frames, show_function_arguments, show_function_name);
+            show_inlined_frames, show_function_arguments, show_function_name,
+            show_function_display_name);
       }
     } else {
       if (line_entry.IsValid()) {
@@ -164,7 +168,12 @@ bool SymbolContext::DumpStopContext(
       dumped_something = true;
       if (symbol->GetType() == eSymbolTypeTrampoline)
         s->PutCString("symbol stub for: ");
-      s->PutCStringColorHighlighted(symbol->GetName().GetStringRef(), settings);
+      ConstString name;
+      if (show_function_display_name)
+        name = symbol->GetDisplayName();
+      if (!name)
+        name = symbol->GetName();
+      s->PutCStringColorHighlighted(name.GetStringRef(), settings);
     }
 
     if (addr.IsValid() && symbol->ValueIsAddress()) {
diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp
index b85c38097ebe..6bf69c2ded28 100644
--- a/lldb/source/Symbol/Type.cpp
+++ b/lldb/source/Symbol/Type.cpp
@@ -29,6 +29,7 @@
 #include "lldb/Target/ExecutionContext.h"
 #include "lldb/Target/Process.h"
 #include "lldb/Target/Target.h"
+#include "lldb/lldb-enumerations.h"
 
 #include "llvm/ADT/StringRef.h"
 
@@ -85,27 +86,23 @@ static CompilerContextKind ConvertTypeClass(lldb::TypeClass type_class) {
 
 TypeQuery::TypeQuery(llvm::StringRef name, TypeQueryOptions options)
     : m_options(options) {
-  llvm::StringRef scope, basename;
-  lldb::TypeClass type_class = lldb::eTypeClassAny;
-  if (Type::GetTypeScopeAndBasename(name, scope, basename, type_class)) {
-    if (scope.consume_front("::"))
-      m_options |= e_exact_match;
+  if (std::optional<Type::ParsedName> parsed_name =
+          Type::GetTypeScopeAndBasename(name)) {
+    llvm::ArrayRef scope = parsed_name->scope;
     if (!scope.empty()) {
-      std::pair<llvm::StringRef, llvm::StringRef> scope_pair =
-          scope.split("::");
-      while (!scope_pair.second.empty()) {
-        m_context.push_back({CompilerContextKind::AnyDeclContext,
-                             ConstString(scope_pair.first.str())});
-        scope_pair = scope_pair.second.split("::");
+      if (scope[0] == "::") {
+        m_options |= e_exact_match;
+        scope = scope.drop_front();
+      }
+      for (llvm::StringRef s : scope) {
+        m_context.push_back(
+            {CompilerContextKind::AnyDeclContext, ConstString(s)});
       }
-      m_context.push_back({CompilerContextKind::AnyDeclContext,
-                           ConstString(scope_pair.first.str())});
     }
-    m_context.push_back(
-        {ConvertTypeClass(type_class), ConstString(basename.str())});
+    m_context.push_back({ConvertTypeClass(parsed_name->type_class),
+                         ConstString(parsed_name->basename)});
   } else {
-    m_context.push_back(
-        {CompilerContextKind::AnyType, ConstString(name.str())});
+    m_context.push_back({CompilerContextKind::AnyType, ConstString(name)});
   }
 }
 
@@ -773,65 +770,56 @@ ConstString Type::GetQualifiedName() {
   return GetForwardCompilerType().GetTypeName();
 }
 
-bool Type::GetTypeScopeAndBasename(llvm::StringRef name,
-                                   llvm::StringRef &scope,
-                                   llvm::StringRef &basename,
-                                   TypeClass &type_class) {
-  type_class = eTypeClassAny;
+std::optional<Type::ParsedName>
+Type::GetTypeScopeAndBasename(llvm::StringRef name) {
+  ParsedName result;
 
   if (name.empty())
-    return false;
-
-  // Clear the scope in case we have just a type class and a basename.
-  scope = llvm::StringRef();
-  basename = name;
-  if (basename.consume_front("struct "))
-    type_class = eTypeClassStruct;
-  else if (basename.consume_front("class "))
-    type_class = eTypeClassClass;
-  else if (basename.consume_front("union "))
-    type_class = eTypeClassUnion;
-  else if (basename.consume_front("enum "))
-    type_class = eTypeClassEnumeration;
-  else if (basename.consume_front("typedef "))
-    type_class = eTypeClassTypedef;
-
-  size_t namespace_separator = basename.find("::");
-  if (namespace_separator == llvm::StringRef::npos) {
-    // If "name" started a type class we need to return true with no scope.
-    return type_class != eTypeClassAny;
-  }
-
-  size_t template_begin = basename.find('<');
-  while (namespace_separator != llvm::StringRef::npos) {
-    if (template_begin != llvm::StringRef::npos &&
-        namespace_separator > template_begin) {
-      size_t template_depth = 1;
-      llvm::StringRef template_arg =
-          basename.drop_front(template_begin + 1);
-      while (template_depth > 0 && !template_arg.empty()) {
-        if (template_arg.front() == '<')
-          template_depth++;
-        else if (template_arg.front() == '>')
-          template_depth--;
-        template_arg = template_arg.drop_front(1);
+    return std::nullopt;
+
+  if (name.consume_front("struct "))
+    result.type_class = eTypeClassStruct;
+  else if (name.consume_front("class "))
+    result.type_class = eTypeClassClass;
+  else if (name.consume_front("union "))
+    result.type_class = eTypeClassUnion;
+  else if (name.consume_front("enum "))
+    result.type_class = eTypeClassEnumeration;
+  else if (name.consume_front("typedef "))
+    result.type_class = eTypeClassTypedef;
+
+  if (name.consume_front("::"))
+    result.scope.push_back("::");
+
+  bool prev_is_colon = false;
+  size_t template_depth = 0;
+  size_t name_begin = 0;
+  for (const auto &pos : llvm::enumerate(name)) {
+    switch (pos.value()) {
+    case ':':
+      if (prev_is_colon && template_depth == 0) {
+        result.scope.push_back(name.slice(name_begin, pos.index() - 1));
+        name_begin = pos.index() + 1;
       }
-      if (template_depth != 0)
-        return false; // We have an invalid type name. Bail out.
-      if (template_arg.empty())
-        break; // The template ends at the end of the full name.
-      basename = template_arg;
-    } else {
-      basename = basename.drop_front(namespace_separator + 2);
+      break;
+    case '<':
+      ++template_depth;
+      break;
+    case '>':
+      if (template_depth == 0)
+        return std::nullopt; // Invalid name.
+      --template_depth;
+      break;
     }
-    template_begin = basename.find('<');
-    namespace_separator = basename.find("::");
-  }
-  if (basename.size() < name.size()) {
-    scope = name.take_front(name.size() - basename.size());
-    return true;
+    prev_is_colon = pos.value() == ':';
   }
-  return false;
+
+  if (name_begin < name.size() && template_depth == 0)
+    result.basename = name.substr(name_begin);
+  else
+    return std::nullopt;
+
+  return result;
 }
 
 ModuleSP Type::GetModule() {
diff --git a/lldb/source/Symbol/TypeList.cpp b/lldb/source/Symbol/TypeList.cpp
index 2e101e0a8f57..574887189315 100644
--- a/lldb/source/Symbol/TypeList.cpp
+++ b/lldb/source/Symbol/TypeList.cpp
@@ -96,112 +96,3 @@ void TypeList::Dump(Stream *s, bool show_context) {
     if (Type *t = pos->get())
       t->Dump(s, show_context);
 }
-
-void TypeList::RemoveMismatchedTypes(llvm::StringRef qualified_typename,
-                                     bool exact_match) {
-  llvm::StringRef type_scope;
-  llvm::StringRef type_basename;
-  TypeClass type_class = eTypeClassAny;
-  if (!Type::GetTypeScopeAndBasename(qualified_typename, type_scope,
-                                     type_basename, type_class)) {
-    type_basename = qualified_typename;
-    type_scope = "";
-  }
-  return RemoveMismatchedTypes(type_scope, type_basename, type_class,
-                               exact_match);
-}
-
-void TypeList::RemoveMismatchedTypes(llvm::StringRef type_scope,
-                                     llvm::StringRef type_basename,
-                                     TypeClass type_class, bool exact_match) {
-  // Our "collection" type currently is a std::map which doesn't have any good
-  // way to iterate and remove items from the map so we currently just make a
-  // new list and add all of the matching types to it, and then swap it into
-  // m_types at the end
-  collection matching_types;
-
-  iterator pos, end = m_types.end();
-
-  for (pos = m_types.begin(); pos != end; ++pos) {
-    Type *the_type = pos->get();
-    bool keep_match = false;
-    TypeClass match_type_class = eTypeClassAny;
-
-    if (type_class != eTypeClassAny) {
-      match_type_class = the_type->GetForwardCompilerType().GetTypeClass();
-      if ((match_type_class & type_class) == 0)
-        continue;
-    }
-
-    ConstString match_type_name_const_str(the_type->GetQualifiedName());
-    if (match_type_name_const_str) {
-      const char *match_type_name = match_type_name_const_str.GetCString();
-      llvm::StringRef match_type_scope;
-      llvm::StringRef match_type_basename;
-      if (Type::GetTypeScopeAndBasename(match_type_name, match_type_scope,
-                                        match_type_basename,
-                                        match_type_class)) {
-        if (match_type_basename == type_basename) {
-          const size_t type_scope_size = type_scope.size();
-          const size_t match_type_scope_size = match_type_scope.size();
-          if (exact_match || (type_scope_size == match_type_scope_size)) {
-            keep_match = match_type_scope == type_scope;
-          } else {
-            if (match_type_scope_size > type_scope_size) {
-              const size_t type_scope_pos = match_type_scope.rfind(type_scope);
-              if (type_scope_pos == match_type_scope_size - type_scope_size) {
-                if (type_scope_pos >= 2) {
-                  // Our match scope ends with the type scope we were looking
-                  // for, but we need to make sure what comes before the
-                  // matching type scope is a namespace boundary in case we are
-                  // trying to match: type_basename = "d" type_scope = "b::c::"
-                  // We want to match:
-                  //  match_type_scope "a::b::c::"
-                  // But not:
-                  //  match_type_scope "a::bb::c::"
-                  // So below we make sure what comes before "b::c::" in
-                  // match_type_scope is "::", or the namespace boundary
-                  if (match_type_scope[type_scope_pos - 1] == ':' &&
-                      match_type_scope[type_scope_pos - 2] == ':') {
-                    keep_match = true;
-                  }
-                }
-              }
-            }
-          }
-        }
-      } else {
-        // The type we are currently looking at doesn't exists in a namespace
-        // or class, so it only matches if there is no type scope...
-        keep_match = type_scope.empty() && type_basename == match_type_name;
-      }
-    }
-
-    if (keep_match) {
-      matching_types.push_back(*pos);
-    }
-  }
-  m_types.swap(matching_types);
-}
-
-void TypeList::RemoveMismatchedTypes(TypeClass type_class) {
-  if (type_class == eTypeClassAny)
-    return;
-
-  // Our "collection" type currently is a std::map which doesn't have any good
-  // way to iterate and remove items from the map so we currently just make a
-  // new list and add all of the matching types to it, and then swap it into
-  // m_types at the end
-  collection matching_types;
-
-  iterator pos, end = m_types.end();
-
-  for (pos = m_types.begin(); pos != end; ++pos) {
-    Type *the_type = pos->get();
-    TypeClass match_type_class =
-        the_type->GetForwardCompilerType().GetTypeClass();
-    if (match_type_class & type_class)
-      matching_types.push_back(*pos);
-  }
-  m_types.swap(matching_types);
-}
diff --git a/lldb/source/Symbol/TypeMap.cpp b/lldb/source/Symbol/TypeMap.cpp
index 8933de53749c..9d7c05f318a1 100644
--- a/lldb/source/Symbol/TypeMap.cpp
+++ b/lldb/source/Symbol/TypeMap.cpp
@@ -132,76 +132,3 @@ void TypeMap::Dump(Stream *s, bool show_context,
   for (const auto &pair : m_types)
     pair.second->Dump(s, show_context, level);
 }
-
-void TypeMap::RemoveMismatchedTypes(llvm::StringRef type_scope,
-                                    llvm::StringRef type_basename,
-                                    TypeClass type_class, bool exact_match) {
-  // Our "collection" type currently is a std::map which doesn't have any good
-  // way to iterate and remove items from the map so we currently just make a
-  // new list and add all of the matching types to it, and then swap it into
-  // m_types at the end
-  collection matching_types;
-
-  iterator pos, end = m_types.end();
-
-  for (pos = m_types.begin(); pos != end; ++pos) {
-    Type *the_type = pos->second.get();
-    bool keep_match = false;
-    TypeClass match_type_class = eTypeClassAny;
-
-    if (type_class != eTypeClassAny) {
-      match_type_class = the_type->GetForwardCompilerType().GetTypeClass();
-      if ((match_type_class & type_class) == 0)
-        continue;
-    }
-
-    ConstString match_type_name_const_str(the_type->GetQualifiedName());
-    if (match_type_name_const_str) {
-      const char *match_type_name = match_type_name_const_str.GetCString();
-      llvm::StringRef match_type_scope;
-      llvm::StringRef match_type_basename;
-      if (Type::GetTypeScopeAndBasename(match_type_name, match_type_scope,
-                                        match_type_basename,
-                                        match_type_class)) {
-        if (match_type_basename == type_basename) {
-          const size_t type_scope_size = type_scope.size();
-          const size_t match_type_scope_size = match_type_scope.size();
-          if (exact_match || (type_scope_size == match_type_scope_size)) {
-            keep_match = match_type_scope == type_scope;
-          } else {
-            if (match_type_scope_size > type_scope_size) {
-              const size_t type_scope_pos = match_type_scope.rfind(type_scope);
-              if (type_scope_pos == match_type_scope_size - type_scope_size) {
-                if (type_scope_pos >= 2) {
-                  // Our match scope ends with the type scope we were looking
-                  // for, but we need to make sure what comes before the
-                  // matching type scope is a namespace boundary in case we are
-                  // trying to match: type_basename = "d" type_scope = "b::c::"
-                  // We want to match:
-                  //  match_type_scope "a::b::c::"
-                  // But not:
-                  //  match_type_scope "a::bb::c::"
-                  // So below we make sure what comes before "b::c::" in
-                  // match_type_scope is "::", or the namespace boundary
-                  if (match_type_scope[type_scope_pos - 1] == ':' &&
-                      match_type_scope[type_scope_pos - 2] == ':') {
-                    keep_match = true;
-                  }
-                }
-              }
-            }
-          }
-        }
-      } else {
-        // The type we are currently looking at doesn't exists in a namespace
-        // or class, so it only matches if there is no type scope...
-        keep_match = type_scope.empty() && type_basename == match_type_name;
-      }
-    }
-
-    if (keep_match) {
-      matching_types.insert(*pos);
-    }
-  }
-  m_types.swap(matching_types);
-}
diff --git a/lldb/source/Target/Platform.cpp b/lldb/source/Target/Platform.cpp
index 91483ba008f4..4af4aa68ccd0 100644
--- a/lldb/source/Target/Platform.cpp
+++ b/lldb/source/Target/Platform.cpp
@@ -1199,22 +1199,22 @@ Status Platform::PutFile(const FileSpec &source, const FileSpec &destination,
   Status error;
 
   bool requires_upload = true;
-  uint64_t dest_md5_low, dest_md5_high;
-  bool success = CalculateMD5(destination, dest_md5_low, dest_md5_high);
-  if (!success) {
-    LLDB_LOGF(log, "[PutFile] couldn't get md5 sum of destination");
+  llvm::ErrorOr<llvm::MD5::MD5Result> remote_md5 = CalculateMD5(destination);
+  if (std::error_code ec = remote_md5.getError()) {
+    LLDB_LOG(log, "[PutFile] couldn't get md5 sum of destination: {0}",
+             ec.message());
   } else {
-    auto local_md5 = llvm::sys::fs::md5_contents(source.GetPath());
-    if (!local_md5) {
-      LLDB_LOGF(log, "[PutFile] couldn't get md5 sum of source");
+    llvm::ErrorOr<llvm::MD5::MD5Result> local_md5 =
+        llvm::sys::fs::md5_contents(source.GetPath());
+    if (std::error_code ec = local_md5.getError()) {
+      LLDB_LOG(log, "[PutFile] couldn't get md5 sum of source: {0}",
+               ec.message());
     } else {
-      const auto [local_md5_high, local_md5_low] = local_md5->words();
       LLDB_LOGF(log, "[PutFile] destination md5: %016" PRIx64 "%016" PRIx64,
-                dest_md5_high, dest_md5_low);
+                remote_md5->high(), remote_md5->low());
       LLDB_LOGF(log, "[PutFile]       local md5: %016" PRIx64 "%016" PRIx64,
-                local_md5_high, local_md5_low);
-      requires_upload =
-          local_md5_high != dest_md5_high || local_md5_low != dest_md5_low;
+                local_md5->high(), local_md5->low());
+      requires_upload = *remote_md5 != *local_md5;
     }
   }
 
@@ -1339,15 +1339,11 @@ lldb_private::Status Platform::RunShellCommand(
   return Status("unable to run a remote command without a platform");
 }
 
-bool Platform::CalculateMD5(const FileSpec &file_spec, uint64_t &low,
-                            uint64_t &high) {
+llvm::ErrorOr<llvm::MD5::MD5Result>
+Platform::CalculateMD5(const FileSpec &file_spec) {
   if (!IsHost())
-    return false;
-  auto Result = llvm::sys::fs::md5_contents(file_spec.GetPath());
-  if (!Result)
-    return false;
-  std::tie(high, low) = Result->words();
-  return true;
+    return std::make_error_code(std::errc::not_supported);
+  return llvm::sys::fs::md5_contents(file_spec.GetPath());
 }
 
 void Platform::SetLocalCacheDirectory(const char *local) {
diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp
index 30c240b064b5..25afade9a827 100644
--- a/lldb/source/Target/Process.cpp
+++ b/lldb/source/Target/Process.cpp
@@ -4738,27 +4738,26 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx,
 
   if (!thread_plan_sp) {
     diagnostic_manager.PutString(
-        eDiagnosticSeverityError,
-        "RunThreadPlan called with empty thread plan.");
+        lldb::eSeverityError, "RunThreadPlan called with empty thread plan.");
     return eExpressionSetupError;
   }
 
   if (!thread_plan_sp->ValidatePlan(nullptr)) {
     diagnostic_manager.PutString(
-        eDiagnosticSeverityError,
+        lldb::eSeverityError,
         "RunThreadPlan called with an invalid thread plan.");
     return eExpressionSetupError;
   }
 
   if (exe_ctx.GetProcessPtr() != this) {
-    diagnostic_manager.PutString(eDiagnosticSeverityError,
+    diagnostic_manager.PutString(lldb::eSeverityError,
                                  "RunThreadPlan called on wrong process.");
     return eExpressionSetupError;
   }
 
   Thread *thread = exe_ctx.GetThreadPtr();
   if (thread == nullptr) {
-    diagnostic_manager.PutString(eDiagnosticSeverityError,
+    diagnostic_manager.PutString(lldb::eSeverityError,
                                  "RunThreadPlan called with invalid thread.");
     return eExpressionSetupError;
   }
@@ -4793,7 +4792,7 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx,
 
   if (m_private_state.GetValue() != eStateStopped) {
     diagnostic_manager.PutString(
-        eDiagnosticSeverityError,
+        lldb::eSeverityError,
         "RunThreadPlan called while the private state was not stopped.");
     return eExpressionSetupError;
   }
@@ -4807,7 +4806,7 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx,
     selected_frame_sp = thread->GetSelectedFrame(DoNoSelectMostRelevantFrame);
     if (!selected_frame_sp) {
       diagnostic_manager.Printf(
-          eDiagnosticSeverityError,
+          lldb::eSeverityError,
           "RunThreadPlan called without a selected frame on thread %d",
           thread_idx_id);
       return eExpressionSetupError;
@@ -4818,7 +4817,7 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx,
   // be smaller than the overall timeout.
   if (options.GetOneThreadTimeout() && options.GetTimeout() &&
       *options.GetTimeout() < *options.GetOneThreadTimeout()) {
-    diagnostic_manager.PutString(eDiagnosticSeverityError,
+    diagnostic_manager.PutString(lldb::eSeverityError,
                                  "RunThreadPlan called with one thread "
                                  "timeout greater than total timeout");
     return eExpressionSetupError;
@@ -4946,7 +4945,7 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx,
     Event *other_events = listener_sp->PeekAtNextEvent();
     if (other_events != nullptr) {
       diagnostic_manager.PutString(
-          eDiagnosticSeverityError,
+          lldb::eSeverityError,
           "RunThreadPlan called with pending events on the queue.");
       return eExpressionSetupError;
     }
@@ -4989,7 +4988,7 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx,
           Status resume_error = PrivateResume();
           if (!resume_error.Success()) {
             diagnostic_manager.Printf(
-                eDiagnosticSeverityError,
+                lldb::eSeverityError,
                 "couldn't resume inferior the %d time: \"%s\".", num_resumes,
                 resume_error.AsCString());
             return_value = eExpressionSetupError;
@@ -5005,7 +5004,7 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx,
                     "resume %" PRIu32 ", exiting.",
                     num_resumes);
 
-          diagnostic_manager.Printf(eDiagnosticSeverityError,
+          diagnostic_manager.Printf(lldb::eSeverityError,
                                     "didn't get any event after resume %" PRIu32
                                     ", exiting.",
                                     num_resumes);
@@ -5041,7 +5040,7 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx,
           }
 
           diagnostic_manager.Printf(
-              eDiagnosticSeverityError,
+              lldb::eSeverityError,
               "didn't get running event after initial resume, got %s instead.",
               StateAsCString(stop_state));
           return_value = eExpressionSetupError;
@@ -5099,7 +5098,7 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx,
             const bool use_run_lock = false;
             Halt(clear_thread_plans, use_run_lock);
             return_value = eExpressionInterrupted;
-            diagnostic_manager.PutString(eDiagnosticSeverityRemark,
+            diagnostic_manager.PutString(lldb::eSeverityInfo,
                                          "execution halted by user interrupt.");
             LLDB_LOGF(log, "Process::RunThreadPlan(): Got  interrupted by "
                            "eBroadcastBitInterrupted, exiting.");
@@ -5152,7 +5151,7 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx,
                 event_to_broadcast_sp = event_sp;
 
               diagnostic_manager.PutString(
-                  eDiagnosticSeverityError,
+                  lldb::eSeverityError,
                   "execution stopped with unexpected state.");
               return_value = eExpressionInterrupted;
               break;
diff --git a/lldb/source/Target/RemoteAwarePlatform.cpp b/lldb/source/Target/RemoteAwarePlatform.cpp
index 0bd6c9251c85..9a41a423cadd 100644
--- a/lldb/source/Target/RemoteAwarePlatform.cpp
+++ b/lldb/source/Target/RemoteAwarePlatform.cpp
@@ -266,11 +266,11 @@ Status RemoteAwarePlatform::Unlink(const FileSpec &file_spec) {
   return Platform::Unlink(file_spec);
 }
 
-bool RemoteAwarePlatform::CalculateMD5(const FileSpec &file_spec, uint64_t &low,
-                                       uint64_t &high) {
+llvm::ErrorOr<llvm::MD5::MD5Result>
+RemoteAwarePlatform::CalculateMD5(const FileSpec &file_spec) {
   if (m_remote_platform_sp)
-    return m_remote_platform_sp->CalculateMD5(file_spec, low, high);
-  return Platform::CalculateMD5(file_spec, low, high);
+    return m_remote_platform_sp->CalculateMD5(file_spec);
+  return Platform::CalculateMD5(file_spec);
 }
 
 FileSpec RemoteAwarePlatform::GetRemoteWorkingDirectory() {
diff --git a/lldb/source/Utility/Log.cpp b/lldb/source/Utility/Log.cpp
index 3a45a0285d3e..6713a5bd7582 100644
--- a/lldb/source/Utility/Log.cpp
+++ b/lldb/source/Utility/Log.cpp
@@ -39,6 +39,7 @@ char LogHandler::ID;
 char StreamLogHandler::ID;
 char CallbackLogHandler::ID;
 char RotatingLogHandler::ID;
+char TeeLogHandler::ID;
 
 llvm::ManagedStatic<Log::ChannelMap> Log::g_channel_map;
 
@@ -438,3 +439,16 @@ void RotatingLogHandler::Dump(llvm::raw_ostream &stream) const {
   }
   stream.flush();
 }
+
+TeeLogHandler::TeeLogHandler(std::shared_ptr<LogHandler> first_log_handler,
+                             std::shared_ptr<LogHandler> second_log_handler)
+    : m_first_log_handler(first_log_handler),
+      m_second_log_handler(second_log_handler) {
+  assert(m_first_log_handler && "first log handler must be valid");
+  assert(m_second_log_handler && "second log handler must be valid");
+}
+
+void TeeLogHandler::Emit(llvm::StringRef message) {
+  m_first_log_handler->Emit(message);
+  m_second_log_handler->Emit(message);
+}
diff --git a/lldb/source/Utility/Scalar.cpp b/lldb/source/Utility/Scalar.cpp
index e94fd4596236..c70c5e107991 100644
--- a/lldb/source/Utility/Scalar.cpp
+++ b/lldb/source/Utility/Scalar.cpp
@@ -134,9 +134,9 @@ size_t Scalar::GetByteSize() const {
   case e_void:
     break;
   case e_int:
-    return (m_integer.getBitWidth() / 8);
+    return (m_integer.getBitWidth() + 7) / 8;
   case e_float:
-    return m_float.bitcastToAPInt().getBitWidth() / 8;
+    return (m_float.bitcastToAPInt().getBitWidth() + 7) / 8;
   }
   return 0;
 }
diff --git a/lldb/test/API/commands/platform/process/launch/TestPlatformProcessLaunch.py b/lldb/test/API/commands/platform/process/launch/TestPlatformProcessLaunch.py
index 3fb7d00c93d2..7cbad03eeeea 100644
--- a/lldb/test/API/commands/platform/process/launch/TestPlatformProcessLaunch.py
+++ b/lldb/test/API/commands/platform/process/launch/TestPlatformProcessLaunch.py
@@ -3,6 +3,7 @@ Test platform process launch.
 """
 
 from textwrap import dedent
+from lldbsuite.test import lldbutil
 from lldbsuite.test.lldbtest import TestBase
 
 
@@ -11,9 +12,10 @@ class ProcessLaunchTestCase(TestBase):
 
     def setup(self):
         self.build()
-        exe = self.getBuildArtifact("a.out")
-        self.runCmd("file " + exe)
-        return (exe, self.getBuildArtifact("stdio.log"))
+        self.runCmd("file " + self.getBuildArtifact("a.out"))
+        exe = lldbutil.append_to_process_working_directory(self, "a.out")
+        outfile = lldbutil.append_to_process_working_directory(self, "stdio.log")
+        return (exe, outfile)
 
     def test_process_launch_no_args(self):
         # When there are no extra arguments we just have 0, the program name.
@@ -21,18 +23,18 @@ class ProcessLaunchTestCase(TestBase):
         self.runCmd("platform process launch --stdout {} -s".format(outfile))
         self.runCmd("continue")
 
-        with open(outfile) as f:
-            self.assertEqual(
-                dedent(
-                    """\
-                Got 1 argument(s).
-                [0]: {}
-                """.format(
-                        exe
-                    )
-                ),
-                f.read(),
-            )
+        stdio_log = lldbutil.read_file_on_target(self, outfile)
+        self.assertEqual(
+            dedent(
+                """\
+            Got 1 argument(s).
+            [0]: {}
+            """.format(
+                    exe
+                )
+            ),
+            stdio_log,
+        )
 
     def test_process_launch_command_args(self):
         exe, outfile = self.setup()
@@ -41,21 +43,21 @@ class ProcessLaunchTestCase(TestBase):
         self.runCmd("platform process launch --stdout {} -s -- A B C".format(outfile))
         self.runCmd("continue")
 
-        with open(outfile) as f:
-            self.assertEqual(
-                dedent(
-                    """\
-                Got 4 argument(s).
-                [0]: {}
-                [1]: A
-                [2]: B
-                [3]: C
-                """.format(
-                        exe
-                    )
-                ),
-                f.read(),
-            )
+        stdio_log = lldbutil.read_file_on_target(self, outfile)
+        self.assertEqual(
+            dedent(
+                """\
+            Got 4 argument(s).
+            [0]: {}
+            [1]: A
+            [2]: B
+            [3]: C
+            """.format(
+                    exe
+                )
+            ),
+            stdio_log,
+        )
 
     def test_process_launch_target_args(self):
         exe, outfile = self.setup()
@@ -64,17 +66,17 @@ class ProcessLaunchTestCase(TestBase):
         self.runCmd("platform process launch --stdout {} -s".format(outfile))
         self.runCmd("continue")
 
-        with open(outfile) as f:
-            self.assertEqual(
-                dedent(
-                    """\
-                Got 3 argument(s).
-                [0]: {}
-                [1]: D
-                [2]: E
-                """.format(
-                        exe
-                    )
-                ),
-                f.read(),
-            )
+        stdio_log = lldbutil.read_file_on_target(self, outfile)
+        self.assertEqual(
+            dedent(
+                """\
+            Got 3 argument(s).
+            [0]: {}
+            [1]: D
+            [2]: E
+            """.format(
+                    exe
+                )
+            ),
+            stdio_log,
+        )
diff --git a/lldb/test/API/commands/settings/TestSettings.py b/lldb/test/API/commands/settings/TestSettings.py
index 104a9f09788c..385acceb7a8b 100644
--- a/lldb/test/API/commands/settings/TestSettings.py
+++ b/lldb/test/API/commands/settings/TestSettings.py
@@ -953,7 +953,7 @@ class SettingsCommandTestCase(TestBase):
 
         # Test OptionValueFileSpec
         self.verify_setting_value_json(
-            "platform.module-cache-directory", self.get_process_working_directory()
+            "platform.module-cache-directory", self.getBuildDir()
         )
 
         # Test OptionValueArray
diff --git a/lldb/test/API/commands/settings/quoting/TestQuoting.py b/lldb/test/API/commands/settings/quoting/TestQuoting.py
index 393f4be3c824..60eeeead4e0a 100644
--- a/lldb/test/API/commands/settings/quoting/TestQuoting.py
+++ b/lldb/test/API/commands/settings/quoting/TestQuoting.py
@@ -51,9 +51,7 @@ class SettingsCommandTestCase(TestBase):
         outfile = self.getBuildArtifact(filename)
 
         if lldb.remote_platform:
-            outfile_arg = os.path.join(
-                lldb.remote_platform.GetWorkingDirectory(), filename
-            )
+            outfile_arg = lldbutil.append_to_process_working_directory(self, filename)
         else:
             outfile_arg = outfile
 
diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteLoad.py b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteLoad.py
index c39cb4cd59aa..f0a5429e6c1c 100644
--- a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteLoad.py
+++ b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteLoad.py
@@ -6,9 +6,6 @@ from lldbsuite.test.lldbgdbclient import GDBRemoteTestBase
 
 
 class TestGDBRemoteLoad(GDBRemoteTestBase):
-    @expectedFailureAll(
-        archs=["aarch64"], oslist=["freebsd"], bugnumber="llvm.org/pr49414"
-    )
     def test_module_load_address(self):
         """Test that setting the load address of a module uses virtual addresses"""
         target = self.createTarget("a.yaml")
@@ -20,9 +17,6 @@ class TestGDBRemoteLoad(GDBRemoteTestBase):
         self.assertTrue(address.IsValid())
         self.assertEqual(".data", address.GetSection().GetName())
 
-    @expectedFailureAll(
-        archs=["aarch64"], oslist=["freebsd"], bugnumber="llvm.org/pr49414"
-    )
     def test_ram_load(self):
         """Test loading an object file to a target's ram"""
         target = self.createTarget("a.yaml")
@@ -31,9 +25,6 @@ class TestGDBRemoteLoad(GDBRemoteTestBase):
         self.assertPacketLogContains(["M1000,4:c3c3c3c3", "M1004,2:3232"])
 
     @skipIfXmlSupportMissing
-    @expectedFailureAll(
-        archs=["aarch64"], oslist=["freebsd"], bugnumber="llvm.org/pr49414"
-    )
     def test_flash_load(self):
         """Test loading an object file to a target's flash memory"""
 
diff --git a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py
index 7ec5e0d7c830..8ec0cbdd0fdd 100644
--- a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py
+++ b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py
@@ -292,9 +292,7 @@ class LinuxCoreTestCase(TestBase):
         self.dbg.DeleteTarget(target)
 
     @skipIfLLVMTargetMissing("AArch64")
-    @expectedFailureAll(
-        archs=["aarch64"], oslist=["freebsd"], bugnumber="llvm.org/pr49415"
-    )
+    # This test fails on FreeBSD 12 and earlier, see llvm.org/pr49415 for details.
     def test_aarch64_regs(self):
         # check 64 bit ARM core files
         target = self.dbg.CreateTarget(None)
@@ -377,9 +375,7 @@ class LinuxCoreTestCase(TestBase):
         self.expect("register read --all")
 
     @skipIfLLVMTargetMissing("AArch64")
-    @expectedFailureAll(
-        archs=["aarch64"], oslist=["freebsd"], bugnumber="llvm.org/pr49415"
-    )
+    # This test fails on FreeBSD 12 and earlier, see llvm.org/pr49415 for details.
     def test_aarch64_sve_regs_fpsimd(self):
         # check 64 bit ARM core files
         target = self.dbg.CreateTarget(None)
diff --git a/lldb/test/API/functionalities/scripted_process/TestScriptedProcess.py b/lldb/test/API/functionalities/scripted_process/TestScriptedProcess.py
index 5aaf68575623..9519c576689d 100644
--- a/lldb/test/API/functionalities/scripted_process/TestScriptedProcess.py
+++ b/lldb/test/API/functionalities/scripted_process/TestScriptedProcess.py
@@ -187,6 +187,10 @@ class ScriptedProcesTestCase(TestBase):
             + os.path.join(self.getSourceDir(), scripted_process_example_relpath)
         )
 
+        self.runCmd(
+            "target stop-hook add -k first -v 1 -k second -v 2 -P dummy_scripted_process.DummyStopHook"
+        )
+
         launch_info = lldb.SBLaunchInfo(None)
         launch_info.SetProcessPluginName("ScriptedProcess")
         launch_info.SetScriptedProcessClassName(
@@ -207,6 +211,9 @@ class ScriptedProcesTestCase(TestBase):
         self.assertTrue(hasattr(py_impl, "my_super_secret_member"))
         self.assertEqual(py_impl.my_super_secret_method(), 42)
 
+        self.assertTrue(hasattr(py_impl, "handled_stop"))
+        self.assertTrue(py_impl.handled_stop)
+
         # Try reading from target #0 process ...
         addr = 0x500000000
         message = "Hello, target 0"
diff --git a/lldb/test/API/functionalities/scripted_process/dummy_scripted_process.py b/lldb/test/API/functionalities/scripted_process/dummy_scripted_process.py
index 5aff3aa4bb55..cb07bf32c508 100644
--- a/lldb/test/API/functionalities/scripted_process/dummy_scripted_process.py
+++ b/lldb/test/API/functionalities/scripted_process/dummy_scripted_process.py
@@ -7,6 +7,16 @@ from lldb.plugins.scripted_process import ScriptedProcess
 from lldb.plugins.scripted_process import ScriptedThread
 
 
+class DummyStopHook:
+    def __init__(self, target, args, internal_dict):
+        self.target = target
+        self.args = args
+
+    def handle_stop(self, exe_ctx, stream):
+        print("My DummyStopHook triggered. Printing args: \n%s" % self.args)
+        sp = exe_ctx.process.GetScriptedImplementation()
+        sp.handled_stop = True
+
 class DummyScriptedProcess(ScriptedProcess):
     memory = None
 
@@ -18,6 +28,7 @@ class DummyScriptedProcess(ScriptedProcess):
         debugger = self.target.GetDebugger()
         index = debugger.GetIndexOfTarget(self.target)
         self.memory[addr] = "Hello, target " + str(index)
+        self.handled_stop = False
 
     def read_memory_at_address(
         self, addr: int, size: int, error: lldb.SBError
@@ -99,8 +110,14 @@ class DummyScriptedThread(ScriptedThread):
 
 
 def __lldb_init_module(debugger, dict):
+    # This is used when loading the script in an interactive debug session to
+    # automatically, register the stop-hook and launch the scripted process.
     if not "SKIP_SCRIPTED_PROCESS_LAUNCH" in os.environ:
         debugger.HandleCommand(
+            "target stop-hook add -k first -v 1 -k second -v 2 -P %s.%s"
+            % (__name__, DummyStopHook.__name__)
+        )
+        debugger.HandleCommand(
             "process launch -C %s.%s" % (__name__, DummyScriptedProcess.__name__)
         )
     else:
@@ -108,3 +125,7 @@ def __lldb_init_module(debugger, dict):
             "Name of the class that will manage the scripted process: '%s.%s'"
             % (__name__, DummyScriptedProcess.__name__)
         )
+        print(
+            "Name of the class that will manage the stop-hook: '%s.%s'"
+            % (__name__, DummyStopHook.__name__)
+        )
diff --git a/lldb/test/API/lang/c/calling-conventions/TestCCallingConventions.py b/lldb/test/API/lang/c/calling-conventions/TestCCallingConventions.py
index 9483dfcd0401..0304482e899b 100644
--- a/lldb/test/API/lang/c/calling-conventions/TestCCallingConventions.py
+++ b/lldb/test/API/lang/c/calling-conventions/TestCCallingConventions.py
@@ -62,7 +62,10 @@ class TestCase(TestBase):
             return
         self.expect_expr("func(1, 2, 3, 4)", result_type="int", result_value="10")
 
+    # Fails on x86, passes elsewhere because clang doesn't support vectorcall on
+    # any other architectures.
     @expectedFailureAll(
+        triple=re.compile("^(x86|i386)"),
         oslist=["freebsd"], bugnumber="github.com/llvm/llvm-project/issues/56084"
     )
     def test_vectorcall(self):
diff --git a/lldb/test/API/macosx/rosetta/TestRosetta.py b/lldb/test/API/macosx/rosetta/TestRosetta.py
index ce40de475ef1..a812f558a8fc 100644
--- a/lldb/test/API/macosx/rosetta/TestRosetta.py
+++ b/lldb/test/API/macosx/rosetta/TestRosetta.py
@@ -40,6 +40,7 @@ class TestRosetta(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
     @skipUnlessAppleSilicon
+    @skipIfLLVMTargetMissing("X86")
     @skipIfDarwinEmbedded
     def test_rosetta(self):
         """There can be many tests in a test case - describe this test here."""
diff --git a/lldb/test/API/macosx/universal64/TestUniversal64.py b/lldb/test/API/macosx/universal64/TestUniversal64.py
index 98661443086e..893ff14d8113 100644
--- a/lldb/test/API/macosx/universal64/TestUniversal64.py
+++ b/lldb/test/API/macosx/universal64/TestUniversal64.py
@@ -17,6 +17,7 @@ class Universal64TestCase(TestBase):
         # actually launch them here.
 
     # The Makefile manually invokes clang.
+    @skipIfLLVMTargetMissing("X86")
     @skipIfAsan
     @skipUnlessDarwin
     @skipIfDarwinEmbedded
@@ -26,6 +27,7 @@ class Universal64TestCase(TestBase):
         self.do_test()
 
     # The Makefile manually invokes clang.
+    @skipIfLLVMTargetMissing("X86")
     @skipIfAsan
     @skipUnlessDarwin
     @skipIfDarwinEmbedded
diff --git a/lldb/test/API/python_api/sbmodule/FindTypes/Makefile b/lldb/test/API/python_api/sbmodule/FindTypes/Makefile
new file mode 100644
index 000000000000..99998b20bcb0
--- /dev/null
+++ b/lldb/test/API/python_api/sbmodule/FindTypes/Makefile
@@ -0,0 +1,3 @@
+CXX_SOURCES := main.cpp
+
+include Makefile.rules
diff --git a/lldb/test/API/python_api/sbmodule/FindTypes/TestSBModuleFindTypes.py b/lldb/test/API/python_api/sbmodule/FindTypes/TestSBModuleFindTypes.py
new file mode 100644
index 000000000000..5c3d2b4187dd
--- /dev/null
+++ b/lldb/test/API/python_api/sbmodule/FindTypes/TestSBModuleFindTypes.py
@@ -0,0 +1,40 @@
+"""Test the SBModule::FindTypes."""
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class TestSBModuleFindTypes(TestBase):
+    def test_lookup_in_template_scopes(self):
+        self.build()
+        spec = lldb.SBModuleSpec()
+        spec.SetFileSpec(lldb.SBFileSpec(self.getBuildArtifact()))
+        module = lldb.SBModule(spec)
+
+        self.assertEqual(
+            set([t.GetName() for t in module.FindTypes("LookMeUp")]),
+            set(
+                [
+                    "ns1::Foo<void>::LookMeUp",
+                    "ns2::Bar<void>::LookMeUp",
+                    "ns1::Foo<ns2::Bar<void> >::LookMeUp",
+                ]
+            ),
+        )
+
+        self.assertEqual(
+            set([t.GetName() for t in module.FindTypes("ns1::Foo<void>::LookMeUp")]),
+            set(["ns1::Foo<void>::LookMeUp"]),
+        )
+
+        self.assertEqual(
+            set(
+                [
+                    t.GetName()
+                    for t in module.FindTypes("ns1::Foo<ns2::Bar<void> >::LookMeUp")
+                ]
+            ),
+            set(["ns1::Foo<ns2::Bar<void> >::LookMeUp"]),
+        )
diff --git a/lldb/test/API/python_api/sbmodule/FindTypes/main.cpp b/lldb/test/API/python_api/sbmodule/FindTypes/main.cpp
new file mode 100644
index 000000000000..cb2646ce312a
--- /dev/null
+++ b/lldb/test/API/python_api/sbmodule/FindTypes/main.cpp
@@ -0,0 +1,17 @@
+namespace ns1 {
+template <typename T> struct Foo {
+  struct LookMeUp {};
+};
+} // namespace ns1
+
+namespace ns2 {
+template <typename T> struct Bar {
+  struct LookMeUp {};
+};
+} // namespace ns2
+
+ns1::Foo<void>::LookMeUp l1;
+ns2::Bar<void>::LookMeUp l2;
+ns1::Foo<ns2::Bar<void>>::LookMeUp l3;
+
+int main() {}
diff --git a/lldb/test/API/python_api/type/TestTypeList.py b/lldb/test/API/python_api/type/TestTypeList.py
index 81c44f7a39d6..b028929eea44 100644
--- a/lldb/test/API/python_api/type/TestTypeList.py
+++ b/lldb/test/API/python_api/type/TestTypeList.py
@@ -52,6 +52,19 @@ class TypeAndTypeListTestCase(TestBase):
         self.DebugSBValue(value)
         self.assertEqual(value.GetValueAsSigned(), 47)
 
+        static_constexpr_bool_field = task_type.GetStaticFieldWithName(
+            "static_constexpr_bool_field"
+        )
+        self.assertTrue(static_constexpr_bool_field)
+        self.assertEqual(
+            static_constexpr_bool_field.GetName(), "static_constexpr_bool_field"
+        )
+        self.assertEqual(static_constexpr_bool_field.GetType().GetName(), "const bool")
+
+        value = static_constexpr_bool_field.GetConstantValue(self.target())
+        self.DebugSBValue(value)
+        self.assertEqual(value.GetValueAsUnsigned(), 1)
+
         static_mutable_field = task_type.GetStaticFieldWithName("static_mutable_field")
         self.assertTrue(static_mutable_field)
         self.assertEqual(static_mutable_field.GetName(), "static_mutable_field")
@@ -259,3 +272,40 @@ class TypeAndTypeListTestCase(TestBase):
             self.assertTrue(int_enum_uchar)
             self.DebugSBType(int_enum_uchar)
             self.assertEqual(int_enum_uchar.GetName(), "unsigned char")
+
+    def test_nested_typedef(self):
+        """Exercise FindDirectNestedType for typedefs."""
+        self.build()
+        target = self.dbg.CreateTarget(self.getBuildArtifact())
+        self.assertTrue(target)
+
+        with_nested_typedef = target.FindFirstType("WithNestedTypedef")
+        self.assertTrue(with_nested_typedef)
+
+        # This is necessary to work around #91186
+        self.assertTrue(target.FindFirstGlobalVariable("typedefed_value").GetType())
+
+        the_typedef = with_nested_typedef.FindDirectNestedType("TheTypedef")
+        self.assertTrue(the_typedef)
+        self.assertEqual(the_typedef.GetTypedefedType().GetName(), "int")
+
+    def test_GetByteAlign(self):
+        """Exercise SBType::GetByteAlign"""
+        self.build()
+        spec = lldb.SBModuleSpec()
+        spec.SetFileSpec(lldb.SBFileSpec(self.getBuildArtifact()))
+        module = lldb.SBModule(spec)
+        self.assertTrue(module)
+
+        # Invalid types should not crash.
+        self.assertEqual(lldb.SBType().GetByteAlign(), 0)
+
+        # Try a type with natural alignment.
+        void_ptr = module.GetBasicType(lldb.eBasicTypeVoid).GetPointerType()
+        self.assertTrue(void_ptr)
+        # Not exactly guaranteed by the spec, but should be true everywhere we
+        # care about.
+        self.assertEqual(void_ptr.GetByteSize(), void_ptr.GetByteAlign())
+
+        # And an over-aligned type.
+        self.assertEqual(module.FindFirstType("OverAlignedStruct").GetByteAlign(), 128)
diff --git a/lldb/test/API/python_api/type/main.cpp b/lldb/test/API/python_api/type/main.cpp
index c86644d91827..6acde5bb666a 100644
--- a/lldb/test/API/python_api/type/main.cpp
+++ b/lldb/test/API/python_api/type/main.cpp
@@ -28,6 +28,7 @@ public:
     union U {
     } u;
     static constexpr long static_constexpr_field = 47;
+    static constexpr bool static_constexpr_bool_field = true;
     static int static_mutable_field;
     Task(int i, Task *n):
         id(i),
@@ -49,6 +50,14 @@ enum EnumType {};
 enum class ScopedEnumType {};
 enum class EnumUChar : unsigned char {};
 
+struct alignas(128) OverAlignedStruct {};
+OverAlignedStruct over_aligned_struct;
+
+struct WithNestedTypedef {
+  typedef int TheTypedef;
+};
+WithNestedTypedef::TheTypedef typedefed_value;
+
 int main (int argc, char const *argv[])
 {
     Task *task_head = new Task(-1, NULL);
diff --git a/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py b/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
index 8f456aaf890c..8769f39633e6 100644
--- a/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
+++ b/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
@@ -4,17 +4,15 @@ Test lldb-dap setBreakpoints request
 
 import dap_server
 import lldbdap_testcase
-import psutil
-from collections import deque
 from lldbsuite.test import lldbutil
 from lldbsuite.test.decorators import *
 from lldbsuite.test.lldbtest import *
 
 
-def get_subprocess(process_name):
-    queue = deque([psutil.Process(os.getpid())])
+def get_subprocess(root_process, process_name):
+    queue = [root_process]
     while queue:
-        process = queue.popleft()
+        process = queue.pop()
         if process.name() == process_name:
             return process
         queue.extend(process.children())
@@ -131,7 +129,17 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
         process_name = (
             "debugserver" if platform.system() in ["Darwin"] else "lldb-server"
         )
-        process = get_subprocess(process_name)
+
+        try:
+            import psutil
+        except ImportError:
+            print(
+                "psutil not installed, please install using 'pip install psutil'. "
+                "Skipping test_exit_status_message_sigterm test.",
+                file=sys.stderr,
+            )
+            return
+        process = get_subprocess(psutil.Process(os.getpid()), process_name)
         process.terminate()
         process.wait()
 
diff --git a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
index d886d0776ce5..ab7dfb5216ae 100644
--- a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
+++ b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
@@ -754,3 +754,43 @@ class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase):
         """
         initCommands = ["settings set symbols.load-on-demand true"]
         self.darwin_dwarf_missing_obj(initCommands)
+
+    @no_debug_info_test
+    @skipIfWindows
+    @skipIfRemote
+    def test_value_format(self):
+        """
+        Test that toggle variables value format between decimal and hexical works.
+        """
+        program = self.getBuildArtifact("a.out")
+        self.build_and_launch(program)
+        source = "main.cpp"
+        breakpoint1_line = line_number(source, "// breakpoint 1")
+        lines = [breakpoint1_line]
+
+        breakpoint_ids = self.set_source_breakpoints(source, lines)
+        self.assertEqual(
+            len(breakpoint_ids), len(lines), "expect correct number of breakpoints"
+        )
+        self.continue_to_breakpoints(breakpoint_ids)
+
+        # Verify locals value format decimal
+        is_hex = False
+        var_pt_x = self.dap_server.get_local_variable_child("pt", "x", is_hex=is_hex)
+        self.assertEquals(var_pt_x["value"], "11")
+        var_pt_y = self.dap_server.get_local_variable_child("pt", "y", is_hex=is_hex)
+        self.assertEquals(var_pt_y["value"], "22")
+
+        # Verify locals value format hexical
+        is_hex = True
+        var_pt_x = self.dap_server.get_local_variable_child("pt", "x", is_hex=is_hex)
+        self.assertEquals(var_pt_x["value"], "0x0000000b")
+        var_pt_y = self.dap_server.get_local_variable_child("pt", "y", is_hex=is_hex)
+        self.assertEquals(var_pt_y["value"], "0x00000016")
+
+        # Toggle and verify locals value format decimal again
+        is_hex = False
+        var_pt_x = self.dap_server.get_local_variable_child("pt", "x", is_hex=is_hex)
+        self.assertEquals(var_pt_x["value"], "11")
+        var_pt_y = self.dap_server.get_local_variable_child("pt", "y", is_hex=is_hex)
+        self.assertEquals(var_pt_y["value"], "22")
diff --git a/lldb/test/Shell/Commands/command-disassemble-aarch64-extensions.s b/lldb/test/Shell/Commands/command-disassemble-aarch64-extensions.s
index e154f544e7cc..685d0a84ec28 100644
--- a/lldb/test/Shell/Commands/command-disassemble-aarch64-extensions.s
+++ b/lldb/test/Shell/Commands/command-disassemble-aarch64-extensions.s
@@ -59,7 +59,7 @@ fn:
   bdep z0.b, z1.b, z31.b                // AEK_SVE2BITPERM
   rax1 z0.d, z0.d, z0.d                 // AEK_SVE2SHA3
   sm4e z0.s, z0.s, z0.s                 // AEK_SVE2SM4
-  addqv   v0.8h, p0, z0.h               // AEK_SVE2p1 / AEK_SME2p1
+  addqv   v0.8h, p0, z0.h               // AEK_SVE2P1 / AEK_SME2P1
   rcwswp x0, x1, [x2]                   // AEK_THE
   tcommit                               // AEK_TME
 lbl:
diff --git a/lldb/test/Shell/ExecControl/StepIn/Inputs/aarch64_thunk.cc b/lldb/test/Shell/ExecControl/StepIn/Inputs/aarch64_thunk.cc
new file mode 100644
index 000000000000..02f3bef32a59
--- /dev/null
+++ b/lldb/test/Shell/ExecControl/StepIn/Inputs/aarch64_thunk.cc
@@ -0,0 +1,15 @@
+extern "C" int __attribute__((naked)) __AArch64ADRPThunk_step_here() {
+    asm (
+      "adrp x16, step_here\n"
+      "add x16, x16, :lo12:step_here\n"
+      "br x16"
+    );
+}
+
+extern "C" __attribute__((used)) int step_here() {
+    return 47;
+}
+
+int main() {
+  return __AArch64ADRPThunk_step_here();
+}
diff --git a/lldb/test/Shell/ExecControl/StepIn/step_through-aarch64-thunk.test b/lldb/test/Shell/ExecControl/StepIn/step_through-aarch64-thunk.test
new file mode 100644
index 000000000000..336a746fa3a4
--- /dev/null
+++ b/lldb/test/Shell/ExecControl/StepIn/step_through-aarch64-thunk.test
@@ -0,0 +1,17 @@
+# REQUIRES: native && target-aarch64
+
+# This test is specific to elf platforms.
+# UNSUPPORTED: system-windows, system-darwin
+
+# RUN: %clangxx_host %p/Inputs/aarch64_thunk.cc -g -o %t.out
+# RUN: %lldb %t.out -s %s | FileCheck %s
+
+b main
+# CHECK: Breakpoint 1: where = step_through-aarch64-thunk.test.tmp.out`main
+
+r
+# CHECK: stop reason = breakpoint 1.1
+
+s
+# CHECK: stop reason = step in
+# CHECK:     frame #0: {{.*}} step_through-aarch64-thunk.test.tmp.out`::step_here()
diff --git a/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test b/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test
new file mode 100644
index 000000000000..836fcd7b587b
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test
@@ -0,0 +1,34 @@
+# Test definition DIE searching is delayed until complete type is required.
+
+# RUN: split-file %s %t
+# RUN: %clangxx_host %t/main.cpp %t/t1_def.cpp -gdwarf -o %t.out
+# RUN: %lldb -b %t.out -s %t/lldb.cmd | FileCheck %s
+
+# CHECK: (lldb) p v1
+# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't2<t1>'
+# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't1'
+# CHECK: DW_TAG_structure_type (DW_TAG_structure_type) 't2<t1>' resolving forward declaration...
+# CHECK: (t2<t1>)  {}
+# CHECK: (lldb) p v2
+# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't1'
+# CHECK: DW_TAG_structure_type (DW_TAG_structure_type) 't1' resolving forward declaration...
+
+#--- lldb.cmd
+log enable dwarf comp
+p v1
+p v2
+
+#--- main.cpp
+template<typename T>
+struct t2 {
+};
+struct t1;
+t2<t1> v1; // this CU doesn't have definition DIE for t1, but only declaration DIE for it.
+int main() {
+}
+
+#--- t1_def.cpp
+struct t1 { // this CU contains definition DIE for t1.
+  int x;
+};
+t1 v2;
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug_rnglists.s b/lldb/test/Shell/SymbolFile/DWARF/x86/debug_rnglists.s
index 89b5d94c68c3..af8a1796f3ab 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug_rnglists.s
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug_rnglists.s
@@ -125,11 +125,11 @@ lookup_rnglists2:
         .long   .Ldebug_ranges0-.Lrnglists_table_base0
 .Ldebug_ranges0:
         .byte   4                       # DW_RLE_offset_pair
-        .uleb128 .Lblock1_begin-rnglists  #   starting offset
-        .uleb128 .Lblock1_end-rnglists    #   ending offset
-        .byte   4                       # DW_RLE_offset_pair
         .uleb128 .Lblock2_begin-rnglists  #   starting offset
         .uleb128 .Lblock2_end-rnglists    #   ending offset
+        .byte   4                       # DW_RLE_offset_pair
+        .uleb128 .Lblock1_begin-rnglists  #   starting offset
+        .uleb128 .Lblock1_end-rnglists    #   ending offset
         .byte   0                       # DW_RLE_end_of_list
 .Ldebug_rnglist_table_end0:
 
diff --git a/lldb/tools/debugserver/source/MacOSX/MachException.cpp b/lldb/tools/debugserver/source/MacOSX/MachException.cpp
index eab4cdfc8b77..659fb2ff8186 100644
--- a/lldb/tools/debugserver/source/MacOSX/MachException.cpp
+++ b/lldb/tools/debugserver/source/MacOSX/MachException.cpp
@@ -247,7 +247,7 @@ kern_return_t MachException::Message::Receive(mach_port_t port,
   DNBError err;
   const bool log_exceptions = DNBLogCheckLogBit(LOG_EXCEPTIONS);
   mach_msg_timeout_t mach_msg_timeout =
-      options & MACH_RCV_TIMEOUT ? timeout : 0;
+      (options & MACH_RCV_TIMEOUT) ? timeout : 0;
   if (log_exceptions && ((options & MACH_RCV_TIMEOUT) == 0)) {
     // Dump this log message if we have no timeout in case it never returns
     DNBLogThreaded("::mach_msg ( msg->{bits = %#x, size = %u remote_port = "
diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm
index 70b4564a027b..cbe3c5459e91 100644
--- a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm
+++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm
@@ -4070,10 +4070,10 @@ pid_t MachProcess::BoardServiceLaunchForDebug(
       m_flags |= eMachProcessFlagsAttached;
       DNBLog("[LaunchAttach] successfully attached to pid %d", m_pid);
     } else {
-      launch_err.SetErrorString(
-          "Failed to attach to pid %d, BoardServiceLaunchForDebug() unable to "
-          "ptrace(PT_ATTACHEXC)",
-          m_pid);
+      std::string errmsg = "Failed to attach to pid ";
+      errmsg += std::to_string(m_pid);
+      errmsg += ", BoardServiceLaunchForDebug() unable to ptrace(PT_ATTACHEXC)";
+      launch_err.SetErrorString(errmsg.c_str());
       SetState(eStateExited);
       DNBLog("[LaunchAttach] END (%d) error: failed to attach to pid %d",
              getpid(), m_pid);
diff --git a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp
index 57dd2dce6bf5..b6f52cb5cf49 100644
--- a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp
+++ b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp
@@ -26,8 +26,12 @@
 #include <cinttypes>
 #include <sys/sysctl.h>
 
+#undef DEBUGSERVER_IS_ARM64E
 #if __has_feature(ptrauth_calls)
 #include <ptrauth.h>
+#if defined(__LP64__)
+#define DEBUGSERVER_IS_ARM64E 1
+#endif
 #endif
 
 // Break only in privileged or user mode
@@ -115,7 +119,7 @@ static uint64_t clear_pac_bits(uint64_t value) {
 uint64_t DNBArchMachARM64::GetPC(uint64_t failValue) {
   // Get program counter
   if (GetGPRState(false) == KERN_SUCCESS)
-#if __has_feature(ptrauth_calls) && defined(__LP64__)
+#if defined(DEBUGSERVER_IS_ARM64E)
     return clear_pac_bits(
         reinterpret_cast<uint64_t>(m_state.context.gpr.__opaque_pc));
 #else
@@ -147,7 +151,7 @@ kern_return_t DNBArchMachARM64::SetPC(uint64_t value) {
 uint64_t DNBArchMachARM64::GetSP(uint64_t failValue) {
   // Get stack pointer
   if (GetGPRState(false) == KERN_SUCCESS)
-#if __has_feature(ptrauth_calls) && defined(__LP64__)
+#if defined(DEBUGSERVER_IS_ARM64E)
     return clear_pac_bits(
         reinterpret_cast<uint64_t>(m_state.context.gpr.__opaque_sp));
 #else
@@ -169,25 +173,24 @@ kern_return_t DNBArchMachARM64::GetGPRState(bool force) {
                          (thread_state_t)&m_state.context.gpr, &count);
   if (DNBLogEnabledForAny(LOG_THREAD)) {
     uint64_t *x = &m_state.context.gpr.__x[0];
-    DNBLogThreaded("thread_get_state signed regs "
-                   "\n   fp=%16.16llx"
-                   "\n   lr=%16.16llx"
-                   "\n   sp=%16.16llx"
-                   "\n   pc=%16.16llx",
-#if __has_feature(ptrauth_calls) && defined(__LP64__)
+
+    const char *log_str = "thread_get_state signed regs "
+                          "\n   fp=%16.16llx"
+                          "\n   lr=%16.16llx"
+                          "\n   sp=%16.16llx"
+                          "\n   pc=%16.16llx";
+#if defined(DEBUGSERVER_IS_ARM64E)
+    DNBLogThreaded(log_str,
                    reinterpret_cast<uint64_t>(m_state.context.gpr.__opaque_fp),
                    reinterpret_cast<uint64_t>(m_state.context.gpr.__opaque_lr),
                    reinterpret_cast<uint64_t>(m_state.context.gpr.__opaque_sp),
-                   reinterpret_cast<uint64_t>(m_state.context.gpr.__opaque_pc)
+                   reinterpret_cast<uint64_t>(m_state.context.gpr.__opaque_pc));
 #else
-                   m_state.context.gpr.__fp,
-                   m_state.context.gpr.__lr, 
-                   m_state.context.gpr.__sp,
-                   m_state.context.gpr.__pc
+    DNBLogThreaded(log_str, m_state.context.gpr.__fp, m_state.context.gpr.__lr,
+                   m_state.context.gpr.__sp, m_state.context.gpr.__pc);
 #endif
-    );
 
-#if __has_feature(ptrauth_calls) && defined(__LP64__)
+#if defined(DEBUGSERVER_IS_ARM64E)
     uint64_t log_fp = clear_pac_bits(
         reinterpret_cast<uint64_t>(m_state.context.gpr.__opaque_fp));
     uint64_t log_lr = clear_pac_bits(
@@ -661,7 +664,7 @@ kern_return_t DNBArchMachARM64::EnableHardwareSingleStep(bool enable) {
     return err.Status();
   }
 
-#if __has_feature(ptrauth_calls) && defined(__LP64__)
+#if defined(DEBUGSERVER_IS_ARM64E)
   uint64_t pc = clear_pac_bits(
       reinterpret_cast<uint64_t>(m_state.context.gpr.__opaque_pc));
 #else
@@ -2187,7 +2190,7 @@ bool DNBArchMachARM64::GetRegisterValue(uint32_t set, uint32_t reg,
     case e_regSetGPR:
       if (reg <= gpr_pc) {
         switch (reg) {
-#if __has_feature(ptrauth_calls) && defined(__LP64__)
+#if defined(DEBUGSERVER_IS_ARM64E)
         case gpr_pc:
           value->value.uint64 = clear_pac_bits(
               reinterpret_cast<uint64_t>(m_state.context.gpr.__opaque_pc));
diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp
index a821699c5e2e..14371da64f2f 100644
--- a/lldb/tools/driver/Driver.cpp
+++ b/lldb/tools/driver/Driver.cpp
@@ -733,8 +733,14 @@ int main(int argc, char const *argv[]) {
   // Setup LLVM signal handlers and make sure we call llvm_shutdown() on
   // destruction.
   llvm::InitLLVM IL(argc, argv, /*InstallPipeSignalExitHandler=*/false);
+#if !defined(__APPLE__)
   llvm::setBugReportMsg("PLEASE submit a bug report to " LLDB_BUG_REPORT_URL
                         " and include the crash backtrace.\n");
+#else
+  llvm::setBugReportMsg("PLEASE submit a bug report to " LLDB_BUG_REPORT_URL
+                        " and include the crash report from "
+                        "~/Library/Logs/DiagnosticReports/.\n");
+#endif
 
   // Parse arguments.
   LLDBOptTable T;
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index b254ddfef0d5..55ff1493c101 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -39,8 +39,7 @@ DAP::DAP()
            {"objc_throw", "Objective-C Throw", lldb::eLanguageTypeObjC},
            {"swift_catch", "Swift Catch", lldb::eLanguageTypeSwift},
            {"swift_throw", "Swift Throw", lldb::eLanguageTypeSwift}}),
-      focus_tid(LLDB_INVALID_THREAD_ID), sent_terminated_event(false),
-      stop_at_entry(false), is_attach(false),
+      focus_tid(LLDB_INVALID_THREAD_ID), stop_at_entry(false), is_attach(false),
       enable_auto_variable_summaries(false),
       enable_synthetic_child_debugging(false),
       restarting_process_id(LLDB_INVALID_PROCESS_ID),
@@ -623,7 +622,7 @@ bool DAP::HandleObject(const llvm::json::Object &object) {
 }
 
 llvm::Error DAP::Loop() {
-  while (!sent_terminated_event) {
+  while (!disconnecting) {
     llvm::json::Object object;
     lldb_dap::PacketStatus status = GetNextObject(object);
 
diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h
index 5c70a056fea4..bbd9d46ba3a0 100644
--- a/lldb/tools/lldb-dap/DAP.h
+++ b/lldb/tools/lldb-dap/DAP.h
@@ -168,7 +168,7 @@ struct DAP {
   // arguments if we get a RestartRequest.
   std::optional<llvm::json::Object> last_launch_or_attach_request;
   lldb::tid_t focus_tid;
-  std::atomic<bool> sent_terminated_event;
+  bool disconnecting = false;
   bool stop_at_entry;
   bool is_attach;
   bool enable_auto_variable_summaries;
diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp
index b4a2718bbb09..bec277332bcf 100644
--- a/lldb/tools/lldb-dap/JSONUtils.cpp
+++ b/lldb/tools/lldb-dap/JSONUtils.cpp
@@ -137,8 +137,7 @@ std::vector<std::string> GetStrings(const llvm::json::Object *obj,
 
 static bool IsClassStructOrUnionType(lldb::SBType t) {
   return (t.GetTypeClass() & (lldb::eTypeClassUnion | lldb::eTypeClassStruct |
-                              lldb::eTypeClassUnion | lldb::eTypeClassArray)) !=
-         0;
+                              lldb::eTypeClassArray)) != 0;
 }
 
 /// Create a short summary for a container that contains the summary of its
@@ -755,7 +754,6 @@ llvm::json::Value CreateStackFrame(lldb::SBFrame &frame) {
   } else {
     object.try_emplace("line", 0);
     object.try_emplace("column", 0);
-    object.try_emplace("presentationHint", "subtle");
   }
 
   const auto pc = frame.GetPC();
@@ -988,8 +986,14 @@ VariableDescription::VariableDescription(lldb::SBValue v, bool format_hex,
   display_type_name =
       !raw_display_type_name.empty() ? raw_display_type_name : NO_TYPENAME;
 
-  if (format_hex)
-    v.SetFormat(lldb::eFormatHex);
+  // Only format hex/default if there is no existing special format.
+  if (v.GetFormat() == lldb::eFormatDefault ||
+      v.GetFormat() == lldb::eFormatHex) {
+    if (format_hex)
+      v.SetFormat(lldb::eFormatHex);
+    else
+      v.SetFormat(lldb::eFormatDefault);
+  }
 
   llvm::raw_string_ostream os_display_value(display_value);
 
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index 8000d68dea7e..96da458be21d 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -226,26 +226,14 @@ void SendContinuedEvent() {
 // Send a "terminated" event to indicate the process is done being
 // debugged.
 void SendTerminatedEvent() {
-  // If an inferior exits prior to the processing of a disconnect request, then
-  // the threads executing EventThreadFunction and request_discontinue
-  // respectively may call SendTerminatedEvent simultaneously. Without any
-  // synchronization, the thread executing EventThreadFunction may set
-  // g_dap.sent_terminated_event before the thread executing
-  // request_discontinue has had a chance to test it, in which case the latter
-  // would move ahead to issue a response to the disconnect request. Said
-  // response may get dispatched ahead of the terminated event compelling the
-  // client to terminate the debug session without consuming any console output
-  // that might've been generated by the execution of terminateCommands. So,
-  // synchronize simultaneous calls to SendTerminatedEvent.
+  // Prevent races if the process exits while we're being asked to disconnect.
   static std::mutex mutex;
   std::lock_guard<std::mutex> locker(mutex);
-  if (!g_dap.sent_terminated_event) {
-    g_dap.sent_terminated_event = true;
-    g_dap.RunTerminateCommands();
-    // Send a "terminated" event
-    llvm::json::Object event(CreateTerminatedEventObject());
-    g_dap.SendJSON(llvm::json::Value(std::move(event)));
-  }
+
+  g_dap.RunTerminateCommands();
+  // Send a "terminated" event
+  llvm::json::Object event(CreateTerminatedEventObject());
+  g_dap.SendJSON(llvm::json::Value(std::move(event)));
 }
 
 // Send a thread stopped event for all threads as long as the process
@@ -1003,6 +991,7 @@ void request_disconnect(const llvm::json::Object &request) {
     g_dap.broadcaster.BroadcastEventByType(eBroadcastBitStopProgressThread);
     g_dap.progress_event_thread.join();
   }
+  g_dap.disconnecting = true;
 }
 
 void request_exceptionInfo(const llvm::json::Object &request) {
@@ -2774,32 +2763,28 @@ void request_dataBreakpointInfo(const llvm::json::Object &request) {
                                           : "evaluation failed");
     } else {
       uint64_t load_addr = value.GetValueAsUnsigned();
-      addr = llvm::utohexstr(load_addr);
-      lldb::SBMemoryRegionInfo region;
-      lldb::SBError err =
-          g_dap.target.GetProcess().GetMemoryRegionInfo(load_addr, region);
-      if (err.Success()) {
-        if (!(region.IsReadable() || region.IsWritable())) {
-          body.try_emplace("dataId", nullptr);
-          body.try_emplace("description",
-                           "memory region for address " + addr +
-                               " has no read or write permissions");
-        } else {
-          lldb::SBData data = value.GetPointeeData();
-          if (data.IsValid())
-            size = llvm::utostr(data.GetByteSize());
-          else {
+      lldb::SBData data = value.GetPointeeData();
+      if (data.IsValid()) {
+        size = llvm::utostr(data.GetByteSize());
+        addr = llvm::utohexstr(load_addr);
+        lldb::SBMemoryRegionInfo region;
+        lldb::SBError err =
+            g_dap.target.GetProcess().GetMemoryRegionInfo(load_addr, region);
+        // Only lldb-server supports "qMemoryRegionInfo". So, don't fail this
+        // request if SBProcess::GetMemoryRegionInfo returns error.
+        if (err.Success()) {
+          if (!(region.IsReadable() || region.IsWritable())) {
             body.try_emplace("dataId", nullptr);
             body.try_emplace("description",
-                             "unable to get byte size for expression: " +
-                                 name.str());
+                             "memory region for address " + addr +
+                                 " has no read or write permissions");
           }
         }
       } else {
         body.try_emplace("dataId", nullptr);
         body.try_emplace("description",
-                         "unable to get memory region info for address " +
-                             addr);
+                         "unable to get byte size for expression: " +
+                             name.str());
       }
     }
   } else {
@@ -4196,8 +4181,14 @@ int SetupStdoutStderrRedirection() {
 
 int main(int argc, char *argv[]) {
   llvm::InitLLVM IL(argc, argv, /*InstallPipeSignalExitHandler=*/false);
+#if !defined(__APPLE__)
   llvm::setBugReportMsg("PLEASE submit a bug report to " LLDB_BUG_REPORT_URL
                         " and include the crash backtrace.\n");
+#else
+  llvm::setBugReportMsg("PLEASE submit a bug report to " LLDB_BUG_REPORT_URL
+                        " and include the crash report from "
+                        "~/Library/Logs/DiagnosticReports/.\n");
+#endif
 
   llvm::SmallString<256> program_path(argv[0]);
   llvm::sys::fs::make_absolute(program_path);
diff --git a/lldb/tools/lldb-server/lldb-platform.cpp b/lldb/tools/lldb-server/lldb-platform.cpp
index 3e126584eb25..cfd0a3797d81 100644
--- a/lldb/tools/lldb-server/lldb-platform.cpp
+++ b/lldb/tools/lldb-server/lldb-platform.cpp
@@ -282,17 +282,12 @@ int main_platform(int argc, char *argv[]) {
     }
   }
 
-  do {
-    GDBRemoteCommunicationServerPlatform platform(
-        acceptor_up->GetSocketProtocol(), acceptor_up->GetSocketScheme());
-
-    if (port_offset > 0)
-      platform.SetPortOffset(port_offset);
-
-    if (!gdbserver_portmap.empty()) {
-      platform.SetPortMap(std::move(gdbserver_portmap));
-    }
+  GDBRemoteCommunicationServerPlatform platform(
+      acceptor_up->GetSocketProtocol(), acceptor_up->GetSocketScheme());
+  if (port_offset > 0)
+    platform.SetPortOffset(port_offset);
 
+  do {
     const bool children_inherit_accept_socket = true;
     Connection *conn = nullptr;
     error = acceptor_up->Accept(children_inherit_accept_socket, conn);
@@ -301,13 +296,37 @@ int main_platform(int argc, char *argv[]) {
       exit(socket_error);
     }
     printf("Connection established.\n");
+
     if (g_server) {
       // Collect child zombie processes.
 #if !defined(_WIN32)
-      while (waitpid(-1, nullptr, WNOHANG) > 0)
-        ;
+      ::pid_t waitResult;
+      while ((waitResult = waitpid(-1, nullptr, WNOHANG)) > 0) {
+        // waitResult is the child pid
+        gdbserver_portmap.FreePortForProcess(waitResult);
+      }
 #endif
-      if (fork()) {
+      // TODO: Clean up portmap for Windows when children die
+      // See https://github.com/llvm/llvm-project/issues/90923
+
+      // After collecting zombie ports, get the next available
+      GDBRemoteCommunicationServerPlatform::PortMap portmap_for_child;
+      llvm::Expected<uint16_t> available_port =
+          gdbserver_portmap.GetNextAvailablePort();
+      if (available_port)
+        portmap_for_child.AllowPort(*available_port);
+      else {
+        llvm::consumeError(available_port.takeError());
+        fprintf(stderr,
+                "no available gdbserver port for connection - dropping...\n");
+        delete conn;
+        continue;
+      }
+      platform.SetPortMap(std::move(portmap_for_child));
+
+      auto childPid = fork();
+      if (childPid) {
+        gdbserver_portmap.AssociatePortWithProcess(*available_port, childPid);
         // Parent doesn't need a connection to the lldb client
         delete conn;
 
@@ -323,7 +342,11 @@ int main_platform(int argc, char *argv[]) {
       // If not running as a server, this process will not accept
       // connections while a connection is active.
       acceptor_up.reset();
+
+      // When not running in server mode, use all available ports
+      platform.SetPortMap(std::move(gdbserver_portmap));
     }
+
     platform.SetConnection(std::unique_ptr<Connection>(conn));
 
     if (platform.IsConnected()) {
diff --git a/lldb/unittests/Core/DiagnosticEventTest.cpp b/lldb/unittests/Core/DiagnosticEventTest.cpp
index d06f164e87e7..1423f76b8b52 100644
--- a/lldb/unittests/Core/DiagnosticEventTest.cpp
+++ b/lldb/unittests/Core/DiagnosticEventTest.cpp
@@ -55,9 +55,8 @@ TEST_F(DiagnosticEventTest, Warning) {
   ListenerSP listener_sp = Listener::MakeListener("test-listener");
 
   listener_sp->StartListeningForEvents(&broadcaster,
-                                       Debugger::eBroadcastBitWarning);
-  EXPECT_TRUE(
-      broadcaster.EventTypeHasListeners(Debugger::eBroadcastBitWarning));
+                                       lldb::eBroadcastBitWarning);
+  EXPECT_TRUE(broadcaster.EventTypeHasListeners(lldb::eBroadcastBitWarning));
 
   Debugger::ReportWarning("foo", debugger_sp->GetID());
 
@@ -80,9 +79,8 @@ TEST_F(DiagnosticEventTest, Error) {
   Broadcaster &broadcaster = debugger_sp->GetBroadcaster();
   ListenerSP listener_sp = Listener::MakeListener("test-listener");
 
-  listener_sp->StartListeningForEvents(&broadcaster,
-                                       Debugger::eBroadcastBitError);
-  EXPECT_TRUE(broadcaster.EventTypeHasListeners(Debugger::eBroadcastBitError));
+  listener_sp->StartListeningForEvents(&broadcaster, lldb::eBroadcastBitError);
+  EXPECT_TRUE(broadcaster.EventTypeHasListeners(lldb::eBroadcastBitError));
 
   Debugger::ReportError("bar", debugger_sp->GetID());
 
@@ -111,7 +109,7 @@ TEST_F(DiagnosticEventTest, MultipleDebuggers) {
     listeners.push_back(listener);
 
     listener->StartListeningForEvents(&debugger->GetBroadcaster(),
-                                      Debugger::eBroadcastBitError);
+                                      lldb::eBroadcastBitError);
   }
 
   Debugger::ReportError("baz");
@@ -140,9 +138,8 @@ TEST_F(DiagnosticEventTest, WarningOnce) {
   ListenerSP listener_sp = Listener::MakeListener("test-listener");
 
   listener_sp->StartListeningForEvents(&broadcaster,
-                                       Debugger::eBroadcastBitWarning);
-  EXPECT_TRUE(
-      broadcaster.EventTypeHasListeners(Debugger::eBroadcastBitWarning));
+                                       lldb::eBroadcastBitWarning);
+  EXPECT_TRUE(broadcaster.EventTypeHasListeners(lldb::eBroadcastBitWarning));
 
   std::once_flag once;
   Debugger::ReportWarning("foo", debugger_sp->GetID(), &once);
diff --git a/lldb/unittests/Core/ProgressReportTest.cpp b/lldb/unittests/Core/ProgressReportTest.cpp
index f0d253be9bf6..141244feb1f0 100644
--- a/lldb/unittests/Core/ProgressReportTest.cpp
+++ b/lldb/unittests/Core/ProgressReportTest.cpp
@@ -61,7 +61,7 @@ protected:
 };
 
 TEST_F(ProgressReportTest, TestReportCreation) {
-  ListenerSP listener_sp = CreateListenerFor(Debugger::eBroadcastBitProgress);
+  ListenerSP listener_sp = CreateListenerFor(lldb::eBroadcastBitProgress);
   EventSP event_sp;
   const ProgressEventData *data;
 
@@ -135,7 +135,7 @@ TEST_F(ProgressReportTest, TestReportCreation) {
 
 TEST_F(ProgressReportTest, TestProgressManager) {
   ListenerSP listener_sp =
-      CreateListenerFor(Debugger::eBroadcastBitProgressCategory);
+      CreateListenerFor(lldb::eBroadcastBitProgressCategory);
   EventSP event_sp;
   const ProgressEventData *data;
 
@@ -173,7 +173,7 @@ TEST_F(ProgressReportTest, TestProgressManager) {
 
 TEST_F(ProgressReportTest, TestOverlappingEvents) {
   ListenerSP listener_sp =
-      CreateListenerFor(Debugger::eBroadcastBitProgressCategory);
+      CreateListenerFor(lldb::eBroadcastBitProgressCategory);
   EventSP event_sp;
   const ProgressEventData *data;
 
@@ -214,7 +214,7 @@ TEST_F(ProgressReportTest, TestOverlappingEvents) {
 
 TEST_F(ProgressReportTest, TestProgressManagerDisjointReports) {
   ListenerSP listener_sp =
-      CreateListenerFor(Debugger::eBroadcastBitProgressCategory);
+      CreateListenerFor(lldb::eBroadcastBitProgressCategory);
   EventSP event_sp;
   const ProgressEventData *data;
   uint64_t expected_progress_id;
diff --git a/lldb/unittests/Expression/DiagnosticManagerTest.cpp b/lldb/unittests/Expression/DiagnosticManagerTest.cpp
index cab26debedb1..05fe7c164d68 100644
--- a/lldb/unittests/Expression/DiagnosticManagerTest.cpp
+++ b/lldb/unittests/Expression/DiagnosticManagerTest.cpp
@@ -19,7 +19,7 @@ class FixItDiag : public Diagnostic {
 
 public:
   FixItDiag(llvm::StringRef msg, bool has_fixits)
-      : Diagnostic(msg, DiagnosticSeverity::eDiagnosticSeverityError,
+      : Diagnostic(msg, lldb::eSeverityError,
                    DiagnosticOrigin::eDiagnosticOriginLLDB, custom_diag_id),
         m_has_fixits(has_fixits) {}
   bool HasFixIts() const override { return m_has_fixits; }
@@ -29,7 +29,7 @@ public:
 namespace {
 class TextDiag : public Diagnostic {
 public:
-  TextDiag(llvm::StringRef msg, DiagnosticSeverity severity)
+  TextDiag(llvm::StringRef msg, lldb::Severity severity)
       : Diagnostic(msg, severity, DiagnosticOrigin::eDiagnosticOriginLLDB,
                    custom_diag_id) {}
 };
@@ -40,7 +40,7 @@ TEST(DiagnosticManagerTest, AddDiagnostic) {
   EXPECT_EQ(0U, mgr.Diagnostics().size());
 
   std::string msg = "foo bar has happened";
-  DiagnosticSeverity severity = DiagnosticSeverity::eDiagnosticSeverityError;
+  lldb::Severity severity = lldb::eSeverityError;
   DiagnosticOrigin origin = DiagnosticOrigin::eDiagnosticOriginLLDB;
   auto diag =
       std::make_unique<Diagnostic>(msg, severity, origin, custom_diag_id);
@@ -82,8 +82,7 @@ TEST(DiagnosticManagerTest, GetStringNoDiags) {
 
 TEST(DiagnosticManagerTest, GetStringBasic) {
   DiagnosticManager mgr;
-  mgr.AddDiagnostic(
-      std::make_unique<TextDiag>("abc", eDiagnosticSeverityError));
+  mgr.AddDiagnostic(std::make_unique<TextDiag>("abc", lldb::eSeverityError));
   EXPECT_EQ("error: abc\n", mgr.GetString());
 }
 
@@ -91,18 +90,15 @@ TEST(DiagnosticManagerTest, GetStringMultiline) {
   DiagnosticManager mgr;
 
   // Multiline diagnostics should only get one severity label.
-  mgr.AddDiagnostic(
-      std::make_unique<TextDiag>("b\nc", eDiagnosticSeverityError));
+  mgr.AddDiagnostic(std::make_unique<TextDiag>("b\nc", lldb::eSeverityError));
   EXPECT_EQ("error: b\nc\n", mgr.GetString());
 }
 
 TEST(DiagnosticManagerTest, GetStringMultipleDiags) {
   DiagnosticManager mgr;
-  mgr.AddDiagnostic(
-      std::make_unique<TextDiag>("abc", eDiagnosticSeverityError));
+  mgr.AddDiagnostic(std::make_unique<TextDiag>("abc", lldb::eSeverityError));
   EXPECT_EQ("error: abc\n", mgr.GetString());
-  mgr.AddDiagnostic(
-      std::make_unique<TextDiag>("def", eDiagnosticSeverityError));
+  mgr.AddDiagnostic(std::make_unique<TextDiag>("def", lldb::eSeverityError));
   EXPECT_EQ("error: abc\nerror: def\n", mgr.GetString());
 }
 
@@ -110,13 +106,10 @@ TEST(DiagnosticManagerTest, GetStringSeverityLabels) {
   DiagnosticManager mgr;
 
   // Different severities should cause different labels.
-  mgr.AddDiagnostic(
-      std::make_unique<TextDiag>("foo", eDiagnosticSeverityError));
-  mgr.AddDiagnostic(
-      std::make_unique<TextDiag>("bar", eDiagnosticSeverityWarning));
+  mgr.AddDiagnostic(std::make_unique<TextDiag>("foo", lldb::eSeverityError));
+  mgr.AddDiagnostic(std::make_unique<TextDiag>("bar", lldb::eSeverityWarning));
   // Remarks have no labels.
-  mgr.AddDiagnostic(
-      std::make_unique<TextDiag>("baz", eDiagnosticSeverityRemark));
+  mgr.AddDiagnostic(std::make_unique<TextDiag>("baz", lldb::eSeverityInfo));
   EXPECT_EQ("error: foo\nwarning: bar\nbaz\n", mgr.GetString());
 }
 
@@ -124,12 +117,9 @@ TEST(DiagnosticManagerTest, GetStringPreserveOrder) {
   DiagnosticManager mgr;
 
   // Make sure we preserve the diagnostic order and do not sort them in any way.
-  mgr.AddDiagnostic(
-      std::make_unique<TextDiag>("baz", eDiagnosticSeverityRemark));
-  mgr.AddDiagnostic(
-      std::make_unique<TextDiag>("bar", eDiagnosticSeverityWarning));
-  mgr.AddDiagnostic(
-      std::make_unique<TextDiag>("foo", eDiagnosticSeverityError));
+  mgr.AddDiagnostic(std::make_unique<TextDiag>("baz", lldb::eSeverityInfo));
+  mgr.AddDiagnostic(std::make_unique<TextDiag>("bar", lldb::eSeverityWarning));
+  mgr.AddDiagnostic(std::make_unique<TextDiag>("foo", lldb::eSeverityError));
   EXPECT_EQ("baz\nwarning: bar\nerror: foo\n", mgr.GetString());
 }
 
@@ -144,10 +134,8 @@ TEST(DiagnosticManagerTest, AppendMessageNoDiag) {
 TEST(DiagnosticManagerTest, AppendMessageAttachToLastDiag) {
   DiagnosticManager mgr;
 
-  mgr.AddDiagnostic(
-      std::make_unique<TextDiag>("foo", eDiagnosticSeverityError));
-  mgr.AddDiagnostic(
-      std::make_unique<TextDiag>("bar", eDiagnosticSeverityError));
+  mgr.AddDiagnostic(std::make_unique<TextDiag>("foo", lldb::eSeverityError));
+  mgr.AddDiagnostic(std::make_unique<TextDiag>("bar", lldb::eSeverityError));
   // This should append to 'bar' and not to 'foo'.
   mgr.AppendMessageToDiagnostic("message text");
 
@@ -157,12 +145,10 @@ TEST(DiagnosticManagerTest, AppendMessageAttachToLastDiag) {
 TEST(DiagnosticManagerTest, AppendMessageSubsequentDiags) {
   DiagnosticManager mgr;
 
-  mgr.AddDiagnostic(
-      std::make_unique<TextDiag>("bar", eDiagnosticSeverityError));
+  mgr.AddDiagnostic(std::make_unique<TextDiag>("bar", lldb::eSeverityError));
   mgr.AppendMessageToDiagnostic("message text");
   // Pushing another diag after the message should work fine.
-  mgr.AddDiagnostic(
-      std::make_unique<TextDiag>("foo", eDiagnosticSeverityError));
+  mgr.AddDiagnostic(std::make_unique<TextDiag>("foo", lldb::eSeverityError));
 
   EXPECT_EQ("error: bar\nmessage text\nerror: foo\n", mgr.GetString());
 }
@@ -170,7 +156,7 @@ TEST(DiagnosticManagerTest, AppendMessageSubsequentDiags) {
 TEST(DiagnosticManagerTest, PutString) {
   DiagnosticManager mgr;
 
-  mgr.PutString(eDiagnosticSeverityError, "foo");
+  mgr.PutString(lldb::eSeverityError, "foo");
   EXPECT_EQ(1U, mgr.Diagnostics().size());
   EXPECT_EQ(eDiagnosticOriginLLDB, mgr.Diagnostics().front()->getKind());
   EXPECT_EQ("error: foo\n", mgr.GetString());
@@ -180,8 +166,8 @@ TEST(DiagnosticManagerTest, PutStringMultiple) {
   DiagnosticManager mgr;
 
   // Multiple PutString should behave like multiple diagnostics.
-  mgr.PutString(eDiagnosticSeverityError, "foo");
-  mgr.PutString(eDiagnosticSeverityError, "bar");
+  mgr.PutString(lldb::eSeverityError, "foo");
+  mgr.PutString(lldb::eSeverityError, "bar");
   EXPECT_EQ(2U, mgr.Diagnostics().size());
   EXPECT_EQ("error: foo\nerror: bar\n", mgr.GetString());
 }
@@ -191,8 +177,8 @@ TEST(DiagnosticManagerTest, PutStringSeverities) {
 
   // Multiple PutString with different severities should behave like we
   // created multiple diagnostics.
-  mgr.PutString(eDiagnosticSeverityError, "foo");
-  mgr.PutString(eDiagnosticSeverityWarning, "bar");
+  mgr.PutString(lldb::eSeverityError, "foo");
+  mgr.PutString(lldb::eSeverityWarning, "bar");
   EXPECT_EQ(2U, mgr.Diagnostics().size());
   EXPECT_EQ("error: foo\nwarning: bar\n", mgr.GetString());
 }
diff --git a/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp b/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp
index 6b11ec43a65d..24111396b0ac 100644
--- a/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp
+++ b/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp
@@ -595,10 +595,8 @@ TEST_F(GDBRemoteCommunicationClientTest, WriteMemoryTags) {
 
 TEST_F(GDBRemoteCommunicationClientTest, CalculateMD5) {
   FileSpec file_spec("/foo/bar", FileSpec::Style::posix);
-  uint64_t low, high;
-  std::future<bool> async_result = std::async(std::launch::async, [&] {
-    return client.CalculateMD5(file_spec, low, high);
-  });
+  std::future<ErrorOr<MD5::MD5Result>> async_result = std::async(
+      std::launch::async, [&] { return client.CalculateMD5(file_spec); });
 
   lldb_private::StreamString stream;
   stream.PutCString("vFile:MD5:");
@@ -607,11 +605,12 @@ TEST_F(GDBRemoteCommunicationClientTest, CalculateMD5) {
                "F,"
                "deadbeef01020304"
                "05060708deadbeef");
-  ASSERT_TRUE(async_result.get());
+  auto result = async_result.get();
 
   // Server and client puts/parses low, and then high
   const uint64_t expected_low = 0xdeadbeef01020304;
   const uint64_t expected_high = 0x05060708deadbeef;
-  EXPECT_EQ(expected_low, low);
-  EXPECT_EQ(expected_high, high);
+  ASSERT_TRUE(result);
+  EXPECT_EQ(expected_low, result->low());
+  EXPECT_EQ(expected_high, result->high());
 }
diff --git a/lldb/unittests/Symbol/TestType.cpp b/lldb/unittests/Symbol/TestType.cpp
index 73f5811434fd..da849d804e4d 100644
--- a/lldb/unittests/Symbol/TestType.cpp
+++ b/lldb/unittests/Symbol/TestType.cpp
@@ -10,43 +10,41 @@
 #include "gtest/gtest.h"
 
 #include "lldb/Symbol/Type.h"
+#include "lldb/lldb-enumerations.h"
 
 using namespace lldb;
 using namespace lldb_private;
 
-namespace {
-void TestGetTypeScopeAndBasenameHelper(const char *full_type,
-                                       bool expected_is_scoped,
-                                       const char *expected_scope,
-                                       const char *expected_name) {
-  llvm::StringRef scope, name;
-  lldb::TypeClass type_class;
-  bool is_scoped =
-      Type::GetTypeScopeAndBasename(full_type, scope, name, type_class);
-  EXPECT_EQ(is_scoped, expected_is_scoped);
-  if (expected_is_scoped) {
-    EXPECT_EQ(scope, expected_scope);
-    EXPECT_EQ(name, expected_name);
-  }
-}
-}
-
 TEST(Type, GetTypeScopeAndBasename) {
-  TestGetTypeScopeAndBasenameHelper("int", false, "", "");
-  TestGetTypeScopeAndBasenameHelper("std::string", true, "std::", "string");
-  TestGetTypeScopeAndBasenameHelper("std::set<int>", true, "std::", "set<int>");
-  TestGetTypeScopeAndBasenameHelper("std::set<int, std::less<int>>", true,
-                                    "std::", "set<int, std::less<int>>");
-  TestGetTypeScopeAndBasenameHelper("std::string::iterator", true,
-                                    "std::string::", "iterator");
-  TestGetTypeScopeAndBasenameHelper("std::set<int>::iterator", true,
-                                    "std::set<int>::", "iterator");
-  TestGetTypeScopeAndBasenameHelper(
-      "std::set<int, std::less<int>>::iterator", true,
-      "std::set<int, std::less<int>>::", "iterator");
-  TestGetTypeScopeAndBasenameHelper(
-      "std::set<int, std::less<int>>::iterator<bool>", true,
-      "std::set<int, std::less<int>>::", "iterator<bool>");
+  EXPECT_EQ(Type::GetTypeScopeAndBasename("int"),
+            (Type::ParsedName{eTypeClassAny, {}, "int"}));
+  EXPECT_EQ(Type::GetTypeScopeAndBasename("std::string"),
+            (Type::ParsedName{eTypeClassAny, {"std"}, "string"}));
+  EXPECT_EQ(Type::GetTypeScopeAndBasename("::std::string"),
+            (Type::ParsedName{eTypeClassAny, {"::", "std"}, "string"}));
+  EXPECT_EQ(Type::GetTypeScopeAndBasename("struct std::string"),
+            (Type::ParsedName{eTypeClassStruct, {"std"}, "string"}));
+  EXPECT_EQ(Type::GetTypeScopeAndBasename("std::set<int>"),
+            (Type::ParsedName{eTypeClassAny, {"std"}, "set<int>"}));
+  EXPECT_EQ(
+      Type::GetTypeScopeAndBasename("std::set<int, std::less<int>>"),
+      (Type::ParsedName{eTypeClassAny, {"std"}, "set<int, std::less<int>>"}));
+  EXPECT_EQ(Type::GetTypeScopeAndBasename("std::string::iterator"),
+            (Type::ParsedName{eTypeClassAny, {"std", "string"}, "iterator"}));
+  EXPECT_EQ(Type::GetTypeScopeAndBasename("std::set<int>::iterator"),
+            (Type::ParsedName{eTypeClassAny, {"std", "set<int>"}, "iterator"}));
+  EXPECT_EQ(
+      Type::GetTypeScopeAndBasename("std::set<int, std::less<int>>::iterator"),
+      (Type::ParsedName{
+          eTypeClassAny, {"std", "set<int, std::less<int>>"}, "iterator"}));
+  EXPECT_EQ(Type::GetTypeScopeAndBasename(
+                "std::set<int, std::less<int>>::iterator<bool>"),
+            (Type::ParsedName{eTypeClassAny,
+                              {"std", "set<int, std::less<int>>"},
+                              "iterator<bool>"}));
+
+  EXPECT_EQ(Type::GetTypeScopeAndBasename("std::"), std::nullopt);
+  EXPECT_EQ(Type::GetTypeScopeAndBasename("foo<::bar"), std::nullopt);
 }
 
 TEST(Type, CompilerContextPattern) {
diff --git a/lldb/unittests/Utility/LogTest.cpp b/lldb/unittests/Utility/LogTest.cpp
index 1dac19486a8f..b9b0af4133da 100644
--- a/lldb/unittests/Utility/LogTest.cpp
+++ b/lldb/unittests/Utility/LogTest.cpp
@@ -200,6 +200,18 @@ TEST(LogHandlerTest, RotatingLogHandler) {
   EXPECT_EQ(GetDumpAsString(handler), "bazquxquux");
 }
 
+TEST(LogHandlerTest, TeeLogHandler) {
+  auto handler1 = std::make_shared<RotatingLogHandler>(2);
+  auto handler2 = std::make_shared<RotatingLogHandler>(2);
+  TeeLogHandler handler(handler1, handler2);
+
+  handler.Emit("foo");
+  handler.Emit("bar");
+
+  EXPECT_EQ(GetDumpAsString(*handler1), "foobar");
+  EXPECT_EQ(GetDumpAsString(*handler2), "foobar");
+}
+
 TEST_F(LogChannelTest, Enable) {
   EXPECT_EQ(nullptr, GetLog(TestChannel::FOO));
   std::string message;
diff --git a/lldb/unittests/Utility/ScalarTest.cpp b/lldb/unittests/Utility/ScalarTest.cpp
index 8d957d16593e..500cb8bb2286 100644
--- a/lldb/unittests/Utility/ScalarTest.cpp
+++ b/lldb/unittests/Utility/ScalarTest.cpp
@@ -13,8 +13,11 @@
 #include "lldb/Utility/Scalar.h"
 #include "lldb/Utility/Status.h"
 #include "lldb/Utility/StreamString.h"
+#include "lldb/lldb-enumerations.h"
+#include "llvm/ADT/APSInt.h"
 #include "llvm/Testing/Support/Error.h"
 
+#include <algorithm>
 #include <cmath>
 
 using namespace lldb_private;
@@ -163,6 +166,33 @@ TEST(ScalarTest, GetBytes) {
   ASSERT_EQ(0, memcmp(f, Storage, sizeof(f)));
 }
 
+TEST(ScalarTest, GetData) {
+  auto get_data = [](llvm::APSInt v) {
+    DataExtractor data;
+    Scalar(v).GetData(data);
+    return data.GetData().vec();
+  };
+
+  auto vec = [](std::initializer_list<uint8_t> l) {
+    std::vector<uint8_t> v(l.begin(), l.end());
+    if (endian::InlHostByteOrder() == lldb::eByteOrderLittle)
+      std::reverse(v.begin(), v.end());
+    return v;
+  };
+
+  EXPECT_THAT(
+      get_data(llvm::APSInt::getMaxValue(/*numBits=*/1, /*Unsigned=*/true)),
+      vec({0x01}));
+
+  EXPECT_THAT(
+      get_data(llvm::APSInt::getMaxValue(/*numBits=*/8, /*Unsigned=*/true)),
+      vec({0xff}));
+
+  EXPECT_THAT(
+      get_data(llvm::APSInt::getMaxValue(/*numBits=*/9, /*Unsigned=*/true)),
+      vec({0x01, 0xff}));
+}
+
 TEST(ScalarTest, SetValueFromData) {
   uint8_t a[] = {1, 2, 3, 4};
   Scalar s;
diff --git a/llvm/cmake/modules/LLVMExternalProjectUtils.cmake b/llvm/cmake/modules/LLVMExternalProjectUtils.cmake
index 2672f90f579b..c8016f20a819 100644
--- a/llvm/cmake/modules/LLVMExternalProjectUtils.cmake
+++ b/llvm/cmake/modules/LLVMExternalProjectUtils.cmake
@@ -261,7 +261,7 @@ function(llvm_ExternalProject_Add name source_dir)
     set(sysroot_arg -DCMAKE_SYSROOT=${CMAKE_SYSROOT})
   endif()
 
-  if(CMAKE_CROSSCOMPILING OR _cmake_system_name STREQUAL AIX)
+  if(CMAKE_CROSSCOMPILING)
     set(compiler_args -DCMAKE_ASM_COMPILER=${CMAKE_ASM_COMPILER}
                       -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                       -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -273,8 +273,6 @@ function(llvm_ExternalProject_Add name source_dir)
                       -DCMAKE_OBJDUMP=${CMAKE_OBJDUMP}
                       -DCMAKE_STRIP=${CMAKE_STRIP}
                       -DCMAKE_READELF=${CMAKE_READELF})
-  endif()
-  if(CMAKE_CROSSCOMPILING)
     set(llvm_config_path ${LLVM_CONFIG_PATH})
 
     if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 029db00134c0..51969be85648 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1157,6 +1157,12 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                    register do not exactly match the FLT_ROUNDS values,
                                                    so a conversion is performed.
 
+  :ref:`llvm.set.rounding<int_set_rounding>`       Input value expected to be one of the valid results
+                                                   from '``llvm.get.rounding``'. Rounding mode is
+                                                   undefined if not passed a valid input. This should be
+                                                   a wave uniform value. In case of a divergent input
+                                                   value, the first active lane's value will be used.
+
   :ref:`llvm.get.fpenv<int_get_fpenv>`             Returns the current value of the AMDGPU floating point environment.
                                                    This stores information related to the current rounding mode,
                                                    denormalization mode, enabled traps, and floating point exceptions.
diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst
index e3f2f33224b0..6026e2f534ed 100644
--- a/llvm/docs/CommandGuide/dsymutil.rst
+++ b/llvm/docs/CommandGuide/dsymutil.rst
@@ -115,6 +115,10 @@ OPTIONS
  Specifies an alternate ``path`` to place the dSYM bundle. The default dSYM
  bundle path is created by appending ``.dSYM`` to the executable name.
 
+.. option:: -q, --quiet
+
+ Enable quiet mode and limit output.
+
 .. option:: --remarks-drop-without-debug
 
  Drop remarks without valid debug locations. Without this flags, all remarks are kept.
diff --git a/llvm/docs/CommandGuide/llvm-mca.rst b/llvm/docs/CommandGuide/llvm-mca.rst
index eae5e1406b89..f610ea2f2168 100644
--- a/llvm/docs/CommandGuide/llvm-mca.rst
+++ b/llvm/docs/CommandGuide/llvm-mca.rst
@@ -234,6 +234,16 @@ option specifies "``-``", then the output will also be sent to standard output.
   no extra information, and InstrumentManager never overrides the default
   schedule class for a given instruction.
 
+.. option:: -skip-unsupported-instructions=<reason>
+
+  Force :program:`llvm-mca` to continue in the presence of instructions which do
+  not parse or lack key scheduling information. Note that the resulting analysis
+  is impacted since those unsupported instructions are ignored as-if they are
+  not supplied as a part of the input.
+
+  The choice of `<reason>` controls the when mca will report an error.
+  `<reason>` may be `none` (default), `lack-sched`, `parse-failure`, `any`.
+
 EXIT STATUS
 -----------
 
diff --git a/llvm/docs/CommandGuide/llvm-pdbutil.rst b/llvm/docs/CommandGuide/llvm-pdbutil.rst
index 955353187112..74e1444794df 100644
--- a/llvm/docs/CommandGuide/llvm-pdbutil.rst
+++ b/llvm/docs/CommandGuide/llvm-pdbutil.rst
@@ -27,18 +27,18 @@ Subcommands
 a different purpose.  A brief summary of each command follows, with more detail
 in the sections that follow.
 
-  * :ref:`pretty_subcommand` - Dump symbol and type information in a format that
+  * :ref:`pretty_subcommand` - Dump symbol and type information in a format that
     tries to look as much like the original source code as possible.
-  * :ref:`dump_subcommand` - Dump low level types and structures from the PDB
+  * :ref:`dump_subcommand` - Dump low level types and structures from the PDB
     file, including CodeView records, hash tables, PDB streams, etc.
-  * :ref:`bytes_subcommand` - Dump data from the PDB file's streams, records,
+  * :ref:`bytes_subcommand` - Dump data from the PDB file's streams, records,
     types, symbols, etc as raw bytes.
-  * :ref:`yaml2pdb_subcommand` - Given a yaml description of a PDB file, produce
+  * :ref:`yaml2pdb_subcommand` - Given a yaml description of a PDB file, produce
     a valid PDB file that matches that description.
-  * :ref:`pdb2yaml_subcommand` - For a given PDB file, produce a YAML
-    description of some or all of the file in a way that the PDB can be
+  * :ref:`pdb2yaml_subcommand` - For a given PDB file, produce a YAML
+    description of some or all of the file in a way that the PDB can be
     reconstructed.
-  * :ref:`merge_subcommand` - Given two PDBs, produce a third PDB that is the
+  * :ref:`merge_subcommand` - Given two PDBs, produce a third PDB that is the
     result of merging the two input PDBs.
 
 .. _pretty_subcommand:
@@ -49,7 +49,7 @@ pretty
 .. program:: llvm-pdbutil pretty
 
 .. important::
-   The **pretty** subcommand is built on the Windows DIA SDK, and as such is not
+   The **pretty** subcommand is built on the Windows DIA SDK, and as such is not
    supported on non-Windows platforms.
 
 USAGE: :program:`llvm-pdbutil` pretty [*options*] <input PDB file>
@@ -57,10 +57,10 @@ USAGE: :program:`llvm-pdbutil` pretty [*options*] <input PDB file>
 Summary
 ^^^^^^^^^^^
 
-The *pretty* subcommand displays a very high level representation of your
-program's debug info.  Since it is built on the Windows DIA SDK which is the
-standard API that Windows tools and debuggers query debug information, it
-presents a more authoritative view of how a debugger is going to interpret your
+The *pretty* subcommand displays a very high level representation of your
+program's debug info.  Since it is built on the Windows DIA SDK which is the
+standard API that Windows tools and debuggers query debug information, it
+presents a more authoritative view of how a debugger is going to interpret your
 debug information than a mode which displays low-level CodeView records.
 
 Options
@@ -70,55 +70,55 @@ Filtering and Sorting Options
 +++++++++++++++++++++++++++++
 
 .. note::
-   *exclude* filters take priority over *include* filters.  So if a filter
+   *exclude* filters take priority over *include* filters.  So if a filter
    matches both an include and an exclude rule, then it is excluded.
 
 .. option:: -exclude-compilands=<string>
 
- When dumping compilands, compiland source-file contributions, or per-compiland
- symbols, this option instructs **llvm-pdbutil** to omit any compilands that
+ When dumping compilands, compiland source-file contributions, or per-compiland
+ symbols, this option instructs **llvm-pdbutil** to omit any compilands that
  match the specified regular expression.
 
 .. option:: -exclude-symbols=<string>
 
- When dumping global, public, or per-compiland symbols, this option instructs
- **llvm-pdbutil** to omit any symbols that match the specified regular
+ When dumping global, public, or per-compiland symbols, this option instructs
+ **llvm-pdbutil** to omit any symbols that match the specified regular
  expression.
 
 .. option:: -exclude-types=<string>
 
- When dumping types, this option instructs **llvm-pdbutil** to omit any types
+ When dumping types, this option instructs **llvm-pdbutil** to omit any types
  that match the specified regular expression.
 
 .. option:: -include-compilands=<string>
 
- When dumping compilands, compiland source-file contributions, or per-compiland
- symbols, limit the initial search to only those compilands that match the
+ When dumping compilands, compiland source-file contributions, or per-compiland
+ symbols, limit the initial search to only those compilands that match the
  specified regular expression.
 
 .. option:: -include-symbols=<string>
 
- When dumping global, public, or per-compiland symbols, limit the initial
+ When dumping global, public, or per-compiland symbols, limit the initial
  search to only those symbols that match the specified regular expression.
 
 .. option:: -include-types=<string>
 
- When dumping types, limit the initial search to only those types that match
+ When dumping types, limit the initial search to only those types that match
  the specified regular expression.
 
 .. option:: -min-class-padding=<uint>
 
- Only display types that have at least the specified amount of alignment
+ Only display types that have at least the specified amount of alignment
  padding, accounting for padding in base classes and aggregate field members.
 
 .. option:: -min-class-padding-imm=<uint>
 
- Only display types that have at least the specified amount of alignment
+ Only display types that have at least the specified amount of alignment
  padding, ignoring padding in base classes and aggregate field members.
 
 .. option:: -min-type-size=<uint>
 
- Only display types T where sizeof(T) is greater than or equal to the specified
+ Only display types T where sizeof(T) is greater than or equal to the specified
  amount.
 
 .. option:: -no-compiler-generated
@@ -127,7 +127,7 @@ Filtering and Sorting Options
 
 .. option:: -no-enum-definitions
 
- When dumping an enum, don't show the full enum (e.g. the individual enumerator
+ When dumping an enum, don't show the full enum (e.g. the individual enumerator
  values).
 
 .. option:: -no-system-libs
@@ -233,12 +233,12 @@ Other Options
 
 .. option:: -color-output
 
- Force color output on or off.  By default, color if used if outputting to a
+ Force color output on or off.  By default, color if used if outputting to a
  terminal.
 
 .. option:: -load-address=<uint>
 
- When displaying relative virtual addresses, assume the process is loaded at the
+ When displaying relative virtual addresses, assume the process is loaded at the
  given address and display what would be the absolute address.
 
 .. _dump_subcommand:
@@ -253,14 +253,14 @@ USAGE: :program:`llvm-pdbutil` dump [*options*] <input PDB file>
 Summary
 ^^^^^^^^^^^
 
-The **dump** subcommand displays low level information about the structure of a
-PDB file.  It is used heavily by LLVM's testing infrastructure, but can also be
-used for PDB forensics.  It serves a role similar to that of Microsoft's
-`cvdump` tool.
-
-.. note::
-   The **dump** subcommand exposes internal details of the file format.  As
-   such, the reader should be familiar with :doc:`/PDB/index` before using this
+The **dump** subcommand displays low level information about the structure of a
+PDB file.  It is used heavily by LLVM's testing infrastructure, but can also be
+used for PDB forensics.  It serves a role similar to that of Microsoft's
+`cvdump` tool.
+
+.. note::
+   The **dump** subcommand exposes internal details of the file format.  As
+   such, the reader should be familiar with :doc:`/PDB/index` before using this
    command.
 
 Options
@@ -388,8 +388,8 @@ Type Record Options
  When used in conjunction with :option:`-type-index` or :option:`-id-index`,
  dumps the entire dependency graph for the specified index instead of just the
  single record with the specified index.  For example, if type index 0x4000 is
- a function whose return type has index 0x3000, and you specify
- `-dependents=0x4000`, then this would dump both records (as well as any other
+ a function whose return type has index 0x3000, and you specify
+ `-dependents=0x4000`, then this would dump both records (as well as any other
  dependents in the tree).
 
 Miscellaneous Options
diff --git a/llvm/docs/DirectX/DXContainer.rst b/llvm/docs/DirectX/DXContainer.rst
new file mode 100644
index 000000000000..36e670a1c164
--- /dev/null
+++ b/llvm/docs/DirectX/DXContainer.rst
@@ -0,0 +1,402 @@
+=================
+DirectX Container
+=================
+
+.. contents::
+   :local:
+
+.. toctree::
+   :hidden:
+
+Overview
+========
+
+The DirectX Container (DXContainer) file format is the binary file format for
+compiled shaders targeting the DirectX runtime. The file format is also called
+the DXIL Container or DXBC file format. Because the file format can be used to
+include either DXIL or DXBC compiled shaders, the nomenclature in LLVM is simply
+DirectX Container.
+
+DirectX Container files are read by the compiler and associated tools as well as
+the DirectX runtime, profiling tools and other users. This document serves as a
+companion to the implementation in LLVM to more completely document the file
+format for its many users.
+
+Basic Structure
+===============
+
+A DXContainer file begins with a header, and is then followed by a sequence of
+"parts", which are analogous to object file sections. Each part contains a part
+header, and some number of bytes of data after the header in a defined format.
+
+DX Container data structures are encoded little-endian in the binary file.
+
+The LLVM versions of all data structures described and/or referenced in this
+file are defined in
+`llvm/include/llvm/BinaryFormat/DXContainer.h
+<https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/BinaryFormat/DXContainer.h>`_.
+Some pseudo code is provided in blocks below to ease understanding of this
+document, but reading it with the header available will provide the most
+clarity.
+
+File Header
+-----------
+
+.. code-block:: c
+
+  struct Header {
+    uint8_t Magic[4];
+    uint8_t Digest[16];
+    uint16_t MajorVersion;
+    uint16_t MinorVersion;
+    uint32_t FileSize;
+    uint32_t PartCount;
+  };
+
+The DXContainer header matches the pseudo-definition above. It begins with a
+four character code (magic number) with the value ``DXBC`` to denote the file
+format.
+
+The ``Digest`` is a 128bit hash digest computed with a proprietary algorithm and
+encoded in the binary by the bytecode validator.
+
+The ``MajorVersion`` and ``MinorVersion`` encode the file format version
+``1.0``.
+
+The remaining fields encode 32-bit unsigned integers for the file size and
+number of parts.
+
+Following the part header is an array of ``PartCount`` 32-bit unsigned integers
+specifying the offsets of each part header.
+
+Part Data
+---------
+
+.. code-block:: c
+
+  struct PartHeader {
+    uint8_t Name[4];
+    uint32_t Size;
+  }
+
+Each part begins with a part header. A part header includes the 4-character part
+name, and a 32-bit unsigned integer specifying the size of the part data. The
+part header is followed by ``Size`` bytes of data comprising the part. The
+format does not explicitly require 32-bit alignment of parts, although LLVM does
+implement this restriction in the writer code (because it's a good idea). The
+LLVM object reader code does not assume inputs are correctly aligned to avoid
+undefined behavior caused by misaligned inputs generated by other compilers.
+
+Part Formats
+============
+
+The part name indicates the format of the part data. There are 24 part headers
+used by DXC and FXC. Not all compiled shaders contain all parts. In the list
+below parts generated only by DXC are marked with †, and parts generated only by
+FXC are marked with \*.
+
+#. `DXIL`_† - Stores the DXIL bytecode.
+#. `HASH`_† - Stores the shader MD5 hash.
+#. ILDB† - Stores the DXIL bytecode with LLVM Debug Information embedded in the module.
+#. ILDN† - Stores shader debug name for external debug information.
+#. `ISG1`_ - Stores the input signature for Shader Model 5.1+.
+#. ISGN\* - Stores the input signature for Shader Model 4 and earlier.
+#. `OSG1`_ - Stores the output signature for Shader Model 5.1+.
+#. OSG5\* - Stores the output signature for Shader Model 5.
+#. OSGN\* - Stores the output signature for Shader Model 4 and earlier.
+#. PCSG\* - Stores the patch constant signature for Shader Model 5.1 and earlier.
+#. PDBI† - Stores PDB information.
+#. PRIV - Stores arbitrary private data (Not encoded by either FXC or DXC).
+#. `PSG1`_ - Stores the patch constant signature for Shader Model 6+.
+#. `PSV0`_ - Stores Pipeline State Validation data.
+#. RDAT† - Stores Runtime Data.
+#. RDEF\* - Stores resource definitions.
+#. RTS0 - Stores compiled root signature.
+#. `SFI0`_ - Stores shader feature flags.
+#. SHDR\* - Stores compiled DXBC bytecode.
+#. SHEX\* - Stores compiled DXBC bytecode.
+#. DXBC\* - Stores compiled DXBC bytecode.
+#. SRCI† - Stores shader source information.
+#. STAT† - Stores shader statistics.
+#. VERS† - Stores shader compiler version information.
+
+DXIL Part
+---------
+.. _DXIL:
+
+The DXIL part is comprised of three data structures: the ``ProgramHeader``, the
+``BitcodeHeader`` and the bitcode serialized LLVM 3.7 IR Module.
+
+The ``ProgramHeader`` contains the shader model version and pipeline stage
+enumeration value. This identifies the target profile of the contained shader
+bitcode.
+
+The ``BitcodeHeader`` contains the DXIL version information and refers to the
+start of the bitcode data.
+
+HASH Part
+---------
+.. _HASH:
+
+The HASH part contains a 32-bit unsigned integer with the shader hash flags, and
+a 128-bit MD5 hash digest. The flags field can either have the value ``0`` to
+indicate no flags, or ``1`` to indicate that the file hash was computed
+including the source code that produced the binary.
+
+Program Signature (SG1) Parts
+-----------------------------
+.. _ISG1:
+.. _OSG1:
+.. _PSG1:
+
+.. code-block:: c
+
+  struct ProgramSignatureHeader {
+    uint32_t ParamCount;
+    uint32_t FirstParamOffset;
+  }
+
+The program signature parts (ISG1, OSG1, & PSG1) all use the same data
+structures to encode inputs, outputs and patch information. The
+``ProgramSignatureHeader`` includes two 32-bit unsigned integers to specify the
+number of signature parameters and the offset of the first parameter.
+
+Beginning at ``FirstParamOffset`` bytes from the start of the
+``ProgramSignatureHeader``, ``ParamCount`` ``ProgramSignatureElement``
+structures are written. Following the ``ProgramSignatureElements`` is a string
+table of null terminated strings padded to 32-byte alignment. This string table
+matches the DWARF string table format as implemented by LLVM.
+
+Each ``ProgramSignatureElement`` encodes a ``NameOffset`` value which specifies
+the offset into the string table. A value of ``0`` denotes no name. The offsets
+encoded here are from the beginning of the ``ProgramSignatureHeader`` not the
+beginning of the string table.
+
+The ``ProgramSignatureElement`` contains several enumeration fields which are
+defined in `llvm/include/llvm/BinaryFormat/DXContainerConstants.def <https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/BinaryFormat/DXContainerConstants.def>`_.
+These fields encode the D3D system value, the type of data and its precision
+requirements.
+
+PSV0 Part
+---------
+.. _PSV0:
+
+The Pipeline State Validation data encodes versioned runtime information
+structures. These structures use a scheme where in lieu of encoding a version
+number, they encode the size of the structure and each new version of the
+structure is additive. This allows readers to infer the version of the structure
+by comparing the encoded size with the size of known structures. If the encoded
+size is larger than any known structure, the largest known structure can validly
+parse the data represented in the known structure.
+
+In LLVM we represent the versions of the associated data structures with
+versioned namespaces under the ``llvm::dxbc::PSV`` namespace (e.g. ``v0``,
+``v1``). Each structure in the ``v0`` namespace is the base version, the
+structures in the ``v1`` namespace inherit from the ``v0`` namespace, and the
+``v2`` structures inherit from the ``v1`` structures, and so on.
+
+The high-level structure of the PSV data is:
+
+#. ``RuntimeInfo`` structure
+#. Resource bindings
+#. Signature elements
+#. Mask Vectors (Output, Input, InputPatch, PatchOutput)
+
+Immediately following the part header for the PSV0 part is a 32-bit unsigned
+integer specifying the size of the ``RuntimeInfo`` structure that follows.
+
+Immediately following the ``RuntimeInfo`` structure is a 32-bit unsigned integer
+specifying the number of resource bindings. If the number of resources is
+greater than zero, another unsigned 32-bit integer follows to specify the size
+of the ``ResourceBindInfo`` structure. This is followed by the specified number
+of structures of the specified size (which infers the version of the structure).
+
+For version 0 of the data this ends the part data.
+
+PSV0 Signature Elements
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The signature elements are conceptually a single concept but the data is encoded
+in three different blocks. The first block is a string table, the second block
+is an index table, and the third block is the elements themselves, which in turn
+are separeated by input, output and patch constant or primitive elements.
+
+Signature elements capture much of the same data captured in the :ref:`SG1
+<ISG1>` parts. The use of an index table allows de-duplciation of data for a more
+compact final representation.
+
+The string table begins with a 32-bit unsigned integer specifying the table
+size. This string table uses the DXContainer format as implemented in LLVM. This
+format prefixes the string table with a null byte so that offset ``0`` is a null
+string, and pads to 32-byte alignment.
+
+The index table begins with a 32-bit unsigned integer specifying the size of the
+table, and is followed by that many 32-bit unsigned integers representing the
+table. The index table may or may not deduplicate repeated sequences (both DXC
+and Clang do). The indices signify the indices in the flattened aggregate
+representation which the signature element describes. A single semantic may have
+more than one entry in this table to denote the different attributes of its
+members.
+
+For example given the following code:
+
+.. code-block:: c
+
+  struct VSOut_1
+  {
+      float4 f3 : VOUT2;
+      float3 f4 : VOUT3;
+  };
+
+
+  struct VSOut
+  {
+      float4 f1 : VOUT0;
+      float2 f2[4] : VOUT1;
+      VSOut_1 s;
+      int4 f5 : VOUT4;
+  };
+
+  void main(out VSOut o1 : A) {
+  }
+
+The semantic ``A`` gets expanded into 5 output signature elements. Those
+elements are:
+
+.. note::
+
+  In the example below, it is a coincidence that the rows match the indices, in
+  more complicated examples with multiple semantics this is not the case.
+
+#. Index 0 starts at row 0, contains 4 columns, and is float32. This represents
+   ``f1`` in the source.
+#. Index 1, 2, 3, and 4 starts at row 1, contains two columns and is float32.
+   This represents ``f2`` in the source, and it spreads across rows 1 - 4.
+#. Index 5 starts at row 5, contains 4 columns, and is float32. This represents
+   ``f3`` in the source.
+#. Index 6 starts at row 6, contains 3 columns, and is float32. This represents
+   ``f4``.
+#. Index 7 starts at row 7, contains 4 columns, and is signed 32-bit integer.
+   This represents ``f5`` in the source.
+
+The LLVM ``obj2yaml`` tool can parse this data out of the PSV and present it in
+human readable YAML. For the example above it produces the output:
+
+.. code-block:: YAML
+
+  SigOutputElements:
+    - Name:            A
+      Indices:         [ 0 ]
+      StartRow:        0
+      Cols:            4
+      StartCol:        0
+      Allocated:       true
+      Kind:            Arbitrary
+      ComponentType:   Float32
+      Interpolation:   Linear
+      DynamicMask:     0x0
+      Stream:          0
+    - Name:            A
+      Indices:         [ 1, 2, 3, 4 ]
+      StartRow:        1
+      Cols:            2
+      StartCol:        0
+      Allocated:       true
+      Kind:            Arbitrary
+      ComponentType:   Float32
+      Interpolation:   Linear
+      DynamicMask:     0x0
+      Stream:          0
+    - Name:            A
+      Indices:         [ 5 ]
+      StartRow:        5
+      Cols:            4
+      StartCol:        0
+      Allocated:       true
+      Kind:            Arbitrary
+      ComponentType:   Float32
+      Interpolation:   Linear
+      DynamicMask:     0x0
+      Stream:          0
+    - Name:            A
+      Indices:         [ 6 ]
+      StartRow:        6
+      Cols:            3
+      StartCol:        0
+      Allocated:       true
+      Kind:            Arbitrary
+      ComponentType:   Float32
+      Interpolation:   Linear
+      DynamicMask:     0x0
+      Stream:          0
+    - Name:            A
+      Indices:         [ 7 ]
+      StartRow:        7
+      Cols:            4
+      StartCol:        0
+      Allocated:       true
+      Kind:            Arbitrary
+      ComponentType:   SInt32
+      Interpolation:   Constant
+      DynamicMask:     0x0
+      Stream:          0
+
+The number of signature elements of each type is encoded in the
+``llvm::dxbc::PSV::v1::RuntimeInfo`` structure. If any of the element count
+values are non-zero, the size of the ``ProgramSignatureElement`` structure is
+encoded next to allow versioning of that structure. Today there is only one
+version. Following the size field is the specified number of signature elements
+in the order input, output, then patch constant or primitive.
+
+Following the signature elements is a sequence of mask vectors encoded as a
+series of 32-bit integers. Each 32-bit integer in the mask encodes values for 8
+input/output/patch or primitive elements. The mask vector is filled from least
+significant bit to most significant bit with each added element shifting the
+previous elements left. A reader needs to consult the total number of vectors
+encoded in the ``RuntimeInfo`` structure to know how to read the mask vector.
+
+If the shader has ``UsesViewID`` enabled in the ``RuntimeInfo`` an output mask
+vector will be included. The output mask vector is four arrays of 32-bit
+unsigned integers. Each of the four arrays corresponds to an output stream.
+Geometry shaders have a maximum of four output streams, all other shader stages
+only support one output stream. Each bit in the mask vector identifies one
+column of an output from the output signature depends on the ViewID.
+
+If the shader has ``UsesViewID`` enabled, it is a hull shader, and it has patch
+constant or primitive vector elements, a patch constant or primitive vector mask
+will be included. It is identical in structure to the output mask vector. Each
+bit in the mask vector identifies one column of a patch constant output which
+depends on the ViewID.
+
+The next series of mask vectors are similar in structure to the output mask
+vector, but they contain an extra dimension.
+
+The output/input map is encoded next if the shader has inputs and outputs. The
+output/input mask encodes which outputs are impacted by each column of each
+input. The size for each mask vector is the size of the output max vector * the
+number of inputs * 4 (for each component). Each bit in the mask vector
+identifies one column of an output and a column of an input. A value of 1 means
+the output is impacted by the input.
+
+If the shader is a hull shader, and it has inputs and patch outputs, an input to
+patch map will be included next. This is identical in structure to the
+output/input map. The dimensions are defined by the size of the patch constant
+or primitive vector mask * the number of inputs * 4 (for each component). Each
+bit in the mask vector identifies one column of a patch constant output and a
+column of an input. A value of 1 means the output is impacted by the input.
+
+If the shader is a domain shader, and it has outputs and patch outputs, an
+output patch map will be included next. This is identical in structure to the
+output/input map. The dimensions are defined by the size of the patch constant
+or primitive vector mask * the number of outputs * 4 (for each component). Each
+bit in the mask vector identifies one column of a patch constant input and a
+column of an output. A value of 1 means the output is impacted by the primitive
+input.
+
+SFI0 Part
+---------
+.. _SFI0:
+
+The SFI0 part encodes a 64-bit unsigned integer bitmask of the feature flags.
+This denotes which optional features the shader requires. The flag values are
+defined in `llvm/include/llvm/BinaryFormat/DXContainerConstants.def <https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/BinaryFormat/DXContainerConstants.def>`_.
diff --git a/llvm/docs/DirectXUsage.rst b/llvm/docs/DirectXUsage.rst
index 79543e19bd34..dd7796e1fdc9 100644
--- a/llvm/docs/DirectXUsage.rst
+++ b/llvm/docs/DirectXUsage.rst
@@ -14,6 +14,7 @@ User Guide for the DirectX Target
    :hidden:
 
    DirectX/DXILArchitecture
+   DirectX/DXContainer
 
 Introduction
 ============
@@ -81,6 +82,8 @@ code generation targets in LLVM, the LLVM codebase uses a more neutral name,
 The ``DXContainer`` format is sparsely documented in the functional
 specification, but a reference implementation exists in the
 `DirectXShaderCompiler. <https://github.com/microsoft/DirectXShaderCompiler>`_.
+The format is documented in the LLVM project docs as well (see
+:doc:`DirectX/DXContainer`).
 
 Support for generating ``DXContainer`` files in LLVM, is being added to the LLVM
 MC layer for object streamers and writers, and to the Object and ObjectYAML
diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst
index a45d73a9a3d4..89df6cbe31c4 100644
--- a/llvm/docs/GettingInvolved.rst
+++ b/llvm/docs/GettingInvolved.rst
@@ -1,511 +1,511 @@
-Getting Involved
-================
-
-LLVM welcomes contributions of all kinds. To get started, please review the following topics:
-
-.. contents::
-   :local:
-
-.. toctree::
-   :hidden:
-
-   Contributing
-   DeveloperPolicy
-   CodeReview
-   SupportPolicy
-   SphinxQuickstartTemplate
-   HowToSubmitABug
-   BugLifeCycle
-   CodingStandards
-   GitHub
-   GitBisecting
-   GitRepositoryPolicy
-
-:doc:`Contributing`
-   An overview on how to contribute to LLVM.
-
-:doc:`DeveloperPolicy`
-   The LLVM project's policy towards developers and their contributions.
-
-:doc:`CodeReview`
-   The LLVM project's code-review process.
-
-:doc:`SupportPolicy`
-   The LLVM support policy for core and non-core components.
-
-:doc:`SphinxQuickstartTemplate`
-  A template + tutorial for writing new Sphinx documentation. It is meant
-  to be read in source form.
-
-:doc:`HowToSubmitABug`
-   Instructions for properly submitting information about any bugs you run into
-   in the LLVM system.
-
-:doc:`BugLifeCycle`
-   Describes how bugs are reported, triaged and closed.
-
-:doc:`CodingStandards`
-  Details the LLVM coding standards and provides useful information on writing
-  efficient C++ code.
-
-:doc:`GitHub`
-  Describes how to use the llvm-project repository and code reviews on GitHub.
-
-:doc:`GitBisecting`
-  Describes how to use ``git bisect`` on LLVM's repository.
-
-:doc:`GitRepositoryPolicy`
-   Collection of policies around the git repositories.
-
-.. _development-process:
-
-Development Process
--------------------
-
-Information about LLVM's development process.
-
-.. toctree::
-   :hidden:
-
-   Projects
-   HowToReleaseLLVM
-   ReleaseProcess
-   HowToAddABuilder
-   ReleaseNotes
-
-:doc:`Projects`
-  How-to guide and templates for new projects that *use* the LLVM
-  infrastructure.  The templates (directory organization, Makefiles, and test
-  tree) allow the project code to be located outside (or inside) the ``llvm/``
-  tree, while using LLVM header files and libraries.
-
-:doc:`HowToReleaseLLVM`
-  This is a guide to preparing LLVM releases. Most developers can ignore it.
-
-:doc:`ReleaseProcess`
-  This is a guide to validate a new release, during the release process. Most developers can ignore it.
-
-:doc:`HowToAddABuilder`
-   Instructions for adding new builder to LLVM buildbot master.
-
-:doc:`Release notes for the current release <ReleaseNotes>`
-   This describes new features, known bugs, and other limitations.
-
-.. _lists-forums:
-
-Forums & Mailing Lists
-----------------------
-
-If you can't find what you need in these docs, try consulting the
-Discourse forums. There are also commit mailing lists for all commits to the LLVM Project.
-The :doc:`CodeOfConduct` applies to all these forums and mailing lists.
-
-`LLVM Discourse`__
-  The forums for all things LLVM and related sub-projects. There are categories and subcategories for a wide variety of areas within LLVM. You can also view tags or search for a specific topic.
-
-  .. __: https://discourse.llvm.org/
-
-`Commits Archive (llvm-commits)`__
-  This list contains all commit messages that are made when LLVM developers
-  commit code changes to the repository. It also serves as a forum for
-  patch review (i.e. send patches here). It is useful for those who want to
-  stay on the bleeding edge of LLVM development. This list is very high
-  volume.
-
-  .. __: http://lists.llvm.org/pipermail/llvm-commits/
-
-`Bugs & Patches Archive (llvm-bugs)`__
-  This list gets emailed every time a bug is opened and closed. It is
-  higher volume than the LLVM-dev list.
-
-  .. __: http://lists.llvm.org/pipermail/llvm-bugs/
-
-`LLVM Announcements`__
-  If you just want project wide announcements such as releases, developers meetings, or blog posts, then you should check out the Announcement category on LLVM Discourse.
-
-  .. __: https://discourse.llvm.org/c/announce/46
-
-.. _online-sync-ups:
-
-Online Sync-Ups
----------------
-
-A number of regular calls are organized on specific topics. It should be
-expected that the range of topics will change over time. At the time of
-writing, the following sync-ups are organized.
-The :doc:`CodeOfConduct` applies to all online sync-ups.
-
-If you'd like to organize a new sync-up, please add the info in the table
-below. Please also create a calendar event for it and invite calendar@llvm.org
-to the event, so that it'll show up on the :ref:`llvm-community-calendar`.
-Please see :ref:`llvm-community-calendar-host-guidance` for more guidance on
-what to add to your calendar invite.
-
-.. list-table:: LLVM regular sync-up calls
-   :widths: 25 25 25 25
-   :header-rows: 1
-
-   * - Topic
-     - Frequency
-     - Calendar link
-     - Minutes/docs link
-   * - Loop Optimization Working Group
-     - Every 2 weeks on Wednesday
-     - `ics <./_static/LoopOptWG_invite.ics>`__
-     - `Minutes/docs <https://docs.google.com/document/d/1sdzoyB11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit>`__
-   * - RISC-V
-     - Every 2 weeks on Thursday
-     - `ics <https://calendar.google.com/calendar/ical/lowrisc.org_0n5pkesfjcnp0bh5hps1p0bd80%40group.calendar.google.com/public/basic.ics>`__
-       `gcal <https://calendar.google.com/calendar/b/1?cid=bG93cmlzYy5vcmdfMG41cGtlc2ZqY25wMGJoNWhwczFwMGJkODBAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ>`__
-     - `Minutes/docs <https://docs.google.com/document/d/1G3ocHm2zE6AYTS2N3_3w2UxFnSEyKkcF57siLWe-NVs>`__
-   * - ML Guided Compiler Optimizations
-     - Monthly
-     -
-     - `Minutes/docs <https://docs.google.com/document/d/1JecbplF09l3swTjze-UVeLh4L48svJxGVy4mz_e9Rhs/edit?usp=gmail#heading=h.ts9cmcjbir1j>`__
-   * - `LLVM security group <https://llvm.org/docs/Security.html>`__
-     - Monthly, every 3rd Tuesday
-     - `ics <https://calendar.google.com/calendar/ical/eoh3m9k1l6vqbd1fkp94fv5q74%40group.calendar.google.com/public/basic.ics>`__
-       `gcal <https://calendar.google.com/calendar/embed?src=eoh3m9k1l6vqbd1fkp94fv5q74%40group.calendar.google.com>`__
-     - `Minutes/docs <https://discourse.llvm.org/t/llvm-security-group-public-sync-ups/62735>`__
-   * - `CIRCT <https://github.com/llvm/circt>`__
-     - Weekly, on Wednesday
-     -
-     - `Minutes/docs <https://docs.google.com/document/d/1fOSRdyZR2w75D87yU2Ma9h2-_lEPL4NxvhJGJd-s5pk/edit#heading=h.mulvhjtr8dk9>`__
-   * - flang
-     - Multiple meeting series, `documented here <https://github.com/llvm/llvm-project/blob/main/flang/docs/GettingInvolved.md#calls>`__
-     -
-     -
-   * - OpenMP
-     - Multiple meeting series, `documented here <https://openmp.llvm.org/docs/SupportAndFAQ.html>`__
-     -
-     -
-   * - LLVM Alias Analysis
-     - Every 4 weeks on Tuesdays
-     - `ics <http://lists.llvm.org/pipermail/llvm-dev/attachments/20201103/a3499a67/attachment-0001.ics>`__
-     - `Minutes/docs <https://docs.google.com/document/d/17U-WvX8qyKc3S36YUKr3xfF-GHunWyYowXbxEdpHscw>`__
-   * - LLVM Pointer Authentication
-     - Every month on Mondays
-     - `ics <https://calendar.google.com/calendar/ical/fr1qtmrmt2s9odufjvurkb6j70%40group.calendar.google.com/public/basic.ics>`__
-     - `Minutes/docs <https://discourse.llvm.org/t/llvm-pointer-authentication-sync-ups/62661>`__
-   * - LLVM Embedded Toolchains
-     - Every 4 weeks on Thursdays
-     - `ics <https://drive.google.com/file/d/1uNa-PFYkhAfT83kR2Nc4Fi706TAQFBEL/view?usp=sharing>`__
-       `gcal <https://calendar.google.com/calendar/u/0?cid=ZDQyc3ZlajJmbjIzNG1jaTUybjFsdjA2dWNAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ>`__
-     - `Minutes/docs <https://docs.google.com/document/d/1GahxppHJ7o1O_fn1Mbidu1DHEg7V2aOr92LXCtNV1_o/edit?usp=sharing>`__
-   * - Clang C and C++ Language Working Group
-     - 1st and 3rd Wednesday of the month
-     - `gcal <https://calendar.google.com/calendar/u/0?cid=cW1lZGg0ZXNpMnIyZDN2aTVydGVrdWF1YzRAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ>`__
-     - `Minutes/docs <https://docs.google.com/document/d/1x5-RbOC6-jnI_NcJ9Dp4pSmGhhNe7lUevuWUIB46TeM/edit?usp=sharing>`__
-   * - LLVM SPIR-V Backend Working Group
-     - Every week on Monday
-     -
-     - `Meeting details/agenda <https://docs.google.com/document/d/1UjX-LAwPjJ75Nmb8a5jz-Qrm-pPtKtQw0k1S1Lop9jU/edit?usp=sharing>`__
-   * - SYCL Upstream Working Group
-     - Every 2 weeks on Mondays
-     - `gcal <https://calendar.google.com/calendar/u/0?cid=c3ljbC5sbHZtLndnQGdtYWlsLmNvbQ>`__
-     - `Meeting details/agenda <https://docs.google.com/document/d/1ivYDSn_5ChTeiZ7TiO64WC_jYJnGwAUiT9Ngi9cAdFU/edit?usp=sharing>`__
-   * - Floating Point Working Group
-     - Every 3rd Wednesday of the month
-     - `ics <https://calendar.google.com/calendar/ical/02582507bac79d186900712566ec3fc69b33ac24d7de0a8c76c7b19976f190c0%40group.calendar.google.com/private-6e35506dbfe13812e92e9aa8cd5d761d/basic.ics>`__
-       `gcal <https://calendar.google.com/calendar/u/0?cid=MDI1ODI1MDdiYWM3OWQxODY5MDA3MTI1NjZlYzNmYzY5YjMzYWMyNGQ3ZGUwYThjNzZjN2IxOTk3NmYxOTBjMEBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__
-     - `Meeting details/agenda: <https://docs.google.com/document/d/1QcmUlWftPlBi-Wz6b6PipqJfvjpJ-OuRMRnN9Dm2t0c>`__
-
-Past online sync-ups
-^^^^^^^^^^^^^^^^^^^^
-
-Some online sync-ups are no longer happening. We keep pointing to them here to
-keep track of the meeting notes and in case anyone would want to revive them in
-the future.
-
-.. list-table:: LLVM no-longer-happening sync-up calls
-   :widths: 25 25 25 25
-   :header-rows: 1
-
-   * - Topic
-     - Frequency
-     - Calendar link
-     - Minutes/docs link
-   * - Scalable Vectors and Arm SVE
-     - Monthly, every 3rd Tuesday
-     - `ics <https://calendar.google.com/calendar/ical/bjms39pe6k6bo5egtsp7don414%40group.calendar.google.com/public/basic.ics>`__
-       `gcal <https://calendar.google.com/calendar/u/0/embed?src=bjms39pe6k6bo5egtsp7don414@group.calendar.google.com>`__
-     - `Minutes/docs <https://docs.google.com/document/d/1UPH2Hzou5RgGT8XfO39OmVXKEibWPfdYLELSaHr3xzo/edit>`__
-   * - MemorySSA in LLVM
-     - Every 8 weeks on Mondays
-     - `ics <https://calendar.google.com/calendar/ical/c_1mincouiltpa24ac14of14lhi4%40group.calendar.google.com/public/basic.ics>`__
-       `gcal <https://calendar.google.com/calendar/embed?src=c_1mincouiltpa24ac14of14lhi4%40group.calendar.google.com>`__
-     - `Minutes/docs <https://docs.google.com/document/d/1-uEEZfmRdPThZlctOq9eXlmUaSSAAi8oKxhrPY_lpjk/edit#>`__
-   * - GlobalISel
-     - Every 2nd Tuesday of the month
-     - `gcal <https://calendar.google.com/calendar/u/0?cid=ZDcyMjc0ZjZiZjNhMzFlYmE3NTNkMWM2MGM2NjM5ZWU3ZDE2MjM4MGFlZDc2ZjViY2UyYzMwNzVhZjk4MzQ4ZEBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__
-     - `Meeting details/agenda <https://docs.google.com/document/d/1Ry8O4-Tm5BFj9AMjr8qTQFU80z-ptiNQ62687NaIvLs/edit?usp=sharing>`__
-   * - Vector Predication
-     - Every 2 weeks on Tuesdays, 3pm UTC
-     -
-     - `Minutes/docs <https://docs.google.com/document/d/1q26ToudQjnqN5x31zk8zgq_s0lem1-BF8pQmciLa4k8/edit?usp=sharing>`__
-   * - `MLIR <https://mlir.llvm.org>`__ design meetings
-     - Weekly, on Thursdays
-     -
-     - `Minutes/docs <https://docs.google.com/document/d/1y_9f1AbfgcoVdJh4_aM6-BaSHvrHl8zuA5G4jv_94K8/edit#heading=h.cite1kolful9>`__
-
-.. _office-hours:
-
-Office hours
-------------
-
-A number of experienced LLVM contributors make themselves available for a chat
-on a regular schedule, to anyone who is looking for some guidance. Please find
-the list of who is available when, through which medium, and what their area of
-expertise is. Don't be too shy to dial in!
-
-Office hours are also listed on the :ref:`llvm-community-calendar`. Of course,
-people take time off from time to time, so if you dial in and you don't find
-anyone present, chances are they happen to be off that day.
-
-The :doc:`CodeOfConduct` applies to all office hours.
-
-.. list-table:: LLVM office hours
-  :widths: 15 40 15 15 15
-  :header-rows: 1
-
-  * - Name
-    - In-scope topics
-    - When?
-    - Where?
-    - Languages
-  * - Kristof Beyls
-    - General questions on how to contribute to LLVM; organizing meetups;
-      submitting talks; and other general LLVM-related topics. Arm/AArch64
-      codegen. LLVM security group. LLVM Office Hours.
-    - Every 2nd and 4th Wednesday of the month at 9.30am CET, for 30 minutes.
-      `ics <https://calendar.google.com/calendar/ical/co0h4ndpvtfe64opn7eraiq3ac%40group.calendar.google.com/public/basic.ics>`__
-    - `Jitsi <https://meet.jit.si/KristofBeylsLLVMOfficeHour>`__
-    - English, Flemish, Dutch
-  * - Alina Sbirlea
-    - General questions on how to contribute to LLVM; women in compilers;
-      MemorySSA, BatchAA, various loop passes, new pass manager.
-    - Monthly, 2nd Tuesdays, 10.00am PT/7:00pm CET, for 30 minutes.
-      `ics <https://calendar.google.com/calendar/ical/c_pm6e7160iq7n5fcm1s6m3rjhh4%40group.calendar.google.com/public/basic.ics>`__
-      `gcal <https://calendar.google.com/calendar/embed?src=c_pm6e7160iq7n5fcm1s6m3rjhh4%40group.calendar.google.com>`__
-    - `GoogleMeet <https://meet.google.com/hhk-xpdj-gvx>`__
-    - English, Romanian
-  * - Aaron Ballman (he/him)
-    - Clang internals; frontend attributes; clang-tidy; clang-query; AST matchers
-    - Monthly, 2nd Monday and 3rd Friday of the month at 10:00am Eastern and again at 2:00pm Eastern, for 60 minutes.
-      `ics <https://calendar.google.com/calendar/ical/npgke5dug0uliud0qapptmps58%40group.calendar.google.com/public/basic.ics>`__
-      `gcal <https://calendar.google.com/calendar/embed?src=npgke5dug0uliud0qapptmps58%40group.calendar.google.com>`__
-    - `GoogleMeet <https://meet.google.com/xok-iqne-gmi>`__
-    - English, Norwegian (not fluently)
-  * - Johannes Doerfert (he/him)
-    - OpenMP, LLVM-IR, interprocedural optimizations, Attributor, workshops, research, ...
-    - Every week, Wednesdays 9:30am (Pacific Time), for 1 hour.
-      `ics <https://drive.google.com/file/d/1E_QkRvirmdJzlXf2EKBUX-v8Xj7-eW3v/view?usp=sharing>`__
-    - `MS Teams <https://teams.microsoft.com/l/meetup-join/19%3ameeting_MTMxNzU4MWYtYzViNS00OTM2LWJmNWQtMjg5ZWFhNGVjNzgw%40thread.v2/0?context=%7b%22Tid%22%3a%22a722dec9-ae4e-4ae3-9d75-fd66e2680a63%22%2c%22Oid%22%3a%22885bda30-ce8e-46db-aa7e-15de0474831a%22%7d>`__
-    - English, German
-  * - Tobias Grosser
-    - General questions on how to contribute to LLVM/MLIR, Polly, Loop Optimization, FPL, Research in LLVM, PhD in CS, Summer of Code.
-    - Monthly, last Monday of the month at 18:00 London time (typically 9am PT), for 30 minutes.
-    - `Video Call <https://meet.grosser.science/LLVMOfficeHours>`__
-    - English, German, Spanish, French
-  * - Anastasia Stulova
-    - Clang internals for C/C++ language extensions and dialects, OpenCL, GPU, SPIR-V, how to contribute, women in compilers.
-    - Monthly, 1st Tuesday of the month at 17:00 BST - London time (9:00am PT except for 2 weeks in spring), 30 mins slot.
-    - `GoogleMeet <https://meet.google.com/kdy-fdbv-nuk>`__
-    - English, Russian, German (not fluently)
-  * - Alexey Bader
-    - SYCL compiler, offload tools, OpenCL and SPIR-V, how to contribute.
-    - Monthly, 2nd Monday of the month at 9:30am PT, for 30 minutes.
-    - `GoogleMeet <https://meet.google.com/pdz-xhns-uus>`__
-    - English, Russian
-  * - Maksim Panchenko
-    - BOLT internals, IR, new passes, proposals, etc.
-    - Monthly, 2nd Wednesday of the month at 11:00am PT, for 30 minutes.
-    - `Zoom <https://fb.zoom.us/j/97065697120?pwd=NTFaUWJjZW9uVkJuaVlPTE9qclE3dz09>`__
-    - English, Russian
-  * - Quentin Colombet (he/him)
-    - LLVM/MLIR; Codegen (Instruction selection (GlobalISel/SDISel), Machine IR,
-      Register allocation, etc.); Optimizations; MCA
-    - Monthly, 1st Wednesday of the month at 8.00am PT, for 30 minutes.
-      `ics <https://calendar.google.com/calendar/ical/48c4ad60290a4df218e51e1ceec1106fe317b0ebc76938d9273592053f38204e%40group.calendar.google.com/public/basic.ics>`__
-      `gcal <https://calendar.google.com/calendar/embed?src=48c4ad60290a4df218e51e1ceec1106fe317b0ebc76938d9273592053f38204e%40group.calendar.google.com>`__
-    - `Google meet <https://meet.google.com/cbz-grrp-obs>`__
-    - English, French
-  * - Phoebe Wang (she/her)
-    - X86 backend, General questions to X86, women in compilers.
-    - Monthly, 3rd Wednesday of the month at 8:30am Beijing time, for 30 minutes.
-    - `MS Teams <https://teams.microsoft.com/l/meetup-join/19%3ameeting_NWQ0MjU0NjYtZjUyMi00YTU3LThmM2EtY2Y2YTE4NGM3NmFi%40thread.v2/0?context=%7b%22Tid%22%3a%2246c98d88-e344-4ed4-8496-4ed7712e255d%22%2c%22Oid%22%3a%227b309d9c-a9bb-44c8-a940-ab97eef42d4d%22%7d>`__
-    - English, Chinese
-  * - Amara Emerson
-    - GlobalISel questions.
-    - Monthly, 4th Wednesday of the month at 9:30am PT, for 30 minutes.
-    - `Google meet <https://meet.google.com/pdd-dibg-cwv>`__
-    - English
-  * - Maksim Levental and Jeremy Kun
-    - MLIR newcomers and general discussion (`livestreamed <https://www.youtube.com/playlist?list=PLhxO86S3jsX2k7kOhZaV-qKWm8tNsUdAE>`__)
-    - Every two weeks, Fridays at 3:00pm US Pacific, for 90 minutes.
-    - Livestream chat or `Google meet <https://meet.google.com/wit-tvzc-dwc>`__
-    - English
-  * - Rotating hosts
-    - Getting Started, beginner questions, new contributors.
-    - Every Tuesday at 2 PM ET (11 AM PT), for 30 minutes.
-    - `Google meet <https://meet.google.com/nga-uhpf-bbb>`__
-    - English
-
-
-Guidance for office hours hosts
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-* If you're interested in becoming an office hours host, please add your
-  information to the list above. Please create a calendar event for it and
-  invite calendar@llvm.org to the event so that it'll show up on the
-  :ref:`llvm-community-calendar`.
-  Please see :ref:`llvm-community-calendar-host-guidance` for more guidance on
-  what to add to your calendar invite.
-* When starting an office hours session, consider typing something like "*Hi,
-  I'm available for chats in the next half hour at* video chat URL. *I'm
-  looking forward to having conversations on the video chat or here.*" on the
-  LLVM chat channels that you are already on. These could include:
-
-    * the `#office-hours Discord channel
-      <https://discord.com/channels/636084430946959380/976196303681896538>`__.
-    * :ref:`IRC`
-
-  Doing this can help:
-    * overcome potential anxiety to call in for a first time,
-    * people who prefer to first exchange a few messages through text chat
-      before dialing in, and
-    * remind the wider community that office hours do exist.
-* If you decide to no longer host office hours, please do remove your entry
-  from the list above.
-
-
-.. _IRC:
-
-IRC
----
-
-Users and developers of the LLVM project (including subprojects such as Clang)
-can be found in #llvm on `irc.oftc.net <irc://irc.oftc.net/llvm>`_. The channel
-is actively moderated.
-
-The #llvm-build channel has a bot for
-`LLVM buildbot <http://lab.llvm.org/buildbot/#/console>`_ status changes. The
-bot will post a message with a link to a build bot and a blamelist when a build
-goes from passing to failing and again (without the blamelist) when the build
-goes from failing back to passing. It is a good channel for actively monitoring
-build statuses, but it is a noisy channel due to the automated messages. The
-channel is not actively moderated.
-
-In addition to the traditional IRC there is a
-`Discord <https://discord.com/channels/636084430946959380/636725486533345280>`_
-chat server available. To sign up, please use this
-`invitation link <https://discord.com/invite/xS7Z362>`_.
-
-
-.. _meetups-social-events:
-
-Meetups and social events
--------------------------
-
-.. toctree::
-   :hidden:
-
-   MeetupGuidelines
-
-Besides developer `meetings and conferences <https://llvm.org/devmtg/>`_,
-there are several user groups called
-`LLVM Socials <https://www.meetup.com/pro/llvm/>`_. We greatly encourage you to
-join one in your city. Or start a new one if there is none:
-
-:doc:`MeetupGuidelines`
-
-.. _community-proposals:
-
-Community wide proposals
-------------------------
-
-Proposals for massive changes in how the community behaves and how the work flow
-can be better.
-
-.. toctree::
-   :hidden:
-
-   Proposals/GitHubMove
-   BugpointRedesign
-   Proposals/TestSuite
-   Proposals/VariableNames
-   Proposals/VectorPredication
-
-:doc:`Proposals/GitHubMove`
-   Proposal to move from SVN/Git to GitHub.
-
-:doc:`BugpointRedesign`
-   Design doc for a redesign of the Bugpoint tool.
-
-:doc:`Proposals/TestSuite`
-   Proposals for additional benchmarks/programs for llvm's test-suite.
-
-:doc:`Proposals/VariableNames`
-   Proposal to change the variable names coding standard.
-
-:doc:`Proposals/VectorPredication`
-   Proposal for predicated vector instructions in LLVM.
-
-.. _llvm-community-calendar:
-
-LLVM community calendar
------------------------
-
-We aim to maintain a public calendar view of all events happening in the LLVM
-community such as :ref:`online-sync-ups` and :ref:`office-hours`. The calendar
-can be found at
-https://calendar.google.com/calendar/u/0/embed?src=calendar@llvm.org and can
-also be seen inline below:
-
-.. raw:: html
-
-    <iframe src="https://calendar.google.com/calendar/embed?height=600&wkst=1&bgcolor=%23ffffff&ctz=UTC&showCalendars=0&showDate=1&showNav=1&src=Y2FsZW5kYXJAbGx2bS5vcmc&color=%23039BE5" style="border:solid 1px #777" width="800" height="600" frameborder="0" scrolling="no"></iframe>
-
-Note that the web view of the LLVM community calendar shows events in
-Coordinated Universal Time (UTC). If you use Google Calendar, consider
-subscribing to it with the + button in the bottom-right corner to view all
-events in your local timezone alongside your other calendars.
-
-.. _llvm-community-calendar-host-guidance:
-
-Guidance on what to put into LLVM community calendar invites
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-To add your event, create a calendar event for it and invite calendar@llvm.org
-on it. Your event should then show up on the community calendar.
-
-Please put the following pieces of information in your calendar invite:
-
-* Write a single paragraph describing what the event is about. Include things
-  such as who the event is for and what sort of topics are discussed.
-* State explicitly that the :doc:`CodeOfConduct` applies to this event.
-* Make it clear who:
-
-  * the organizer is.
-
-  * the person to contact is in case of any code-of-conduct issues.  Typically,
-    this would be the organizer.
-
-* If you have meeting minutes for your event, add a pointer to where those live.
-  A good place for meeting minutes could be as a post on LLVM Discourse.
-
-An example invite looks as follows
-
-.. code-block:: none
-
-  This event is a meetup for all developers of LLDB. Meeting agendas are posted
-  on discourse before the event.
-
-  Attendees are required to adhere to the LLVM Code of Conduct
-  (https://llvm.org/docs/CodeOfConduct.html). For any Code of Conduct reports,
-  please contact the organizers, and also email conduct@llvm.org.
-
-  Agenda/Meeting Minutes: Link to minutes
-
-  Organizer(s): First Surname (name@email.com)
-
+Getting Involved
+================
+
+LLVM welcomes contributions of all kinds. To get started, please review the following topics:
+
+.. contents::
+   :local:
+
+.. toctree::
+   :hidden:
+
+   Contributing
+   DeveloperPolicy
+   CodeReview
+   SupportPolicy
+   SphinxQuickstartTemplate
+   HowToSubmitABug
+   BugLifeCycle
+   CodingStandards
+   GitHub
+   GitBisecting
+   GitRepositoryPolicy
+
+:doc:`Contributing`
+   An overview on how to contribute to LLVM.
+
+:doc:`DeveloperPolicy`
+   The LLVM project's policy towards developers and their contributions.
+
+:doc:`CodeReview`
+   The LLVM project's code-review process.
+
+:doc:`SupportPolicy`
+   The LLVM support policy for core and non-core components.
+
+:doc:`SphinxQuickstartTemplate`
+  A template + tutorial for writing new Sphinx documentation. It is meant
+  to be read in source form.
+
+:doc:`HowToSubmitABug`
+   Instructions for properly submitting information about any bugs you run into
+   in the LLVM system.
+
+:doc:`BugLifeCycle`
+   Describes how bugs are reported, triaged and closed.
+
+:doc:`CodingStandards`
+  Details the LLVM coding standards and provides useful information on writing
+  efficient C++ code.
+
+:doc:`GitHub`
+  Describes how to use the llvm-project repository and code reviews on GitHub.
+
+:doc:`GitBisecting`
+  Describes how to use ``git bisect`` on LLVM's repository.
+
+:doc:`GitRepositoryPolicy`
+   Collection of policies around the git repositories.
+
+.. _development-process:
+
+Development Process
+-------------------
+
+Information about LLVM's development process.
+
+.. toctree::
+   :hidden:
+
+   Projects
+   HowToReleaseLLVM
+   ReleaseProcess
+   HowToAddABuilder
+   ReleaseNotes
+
+:doc:`Projects`
+  How-to guide and templates for new projects that *use* the LLVM
+  infrastructure.  The templates (directory organization, Makefiles, and test
+  tree) allow the project code to be located outside (or inside) the ``llvm/``
+  tree, while using LLVM header files and libraries.
+
+:doc:`HowToReleaseLLVM`
+  This is a guide to preparing LLVM releases. Most developers can ignore it.
+
+:doc:`ReleaseProcess`
+  This is a guide to validate a new release, during the release process. Most developers can ignore it.
+
+:doc:`HowToAddABuilder`
+   Instructions for adding new builder to LLVM buildbot master.
+
+:doc:`Release notes for the current release <ReleaseNotes>`
+   This describes new features, known bugs, and other limitations.
+
+.. _lists-forums:
+
+Forums & Mailing Lists
+----------------------
+
+If you can't find what you need in these docs, try consulting the
+Discourse forums. There are also commit mailing lists for all commits to the LLVM Project.
+The :doc:`CodeOfConduct` applies to all these forums and mailing lists.
+
+`LLVM Discourse`__
+  The forums for all things LLVM and related sub-projects. There are categories and subcategories for a wide variety of areas within LLVM. You can also view tags or search for a specific topic.
+
+  .. __: https://discourse.llvm.org/
+
+`Commits Archive (llvm-commits)`__
+  This list contains all commit messages that are made when LLVM developers
+  commit code changes to the repository. It also serves as a forum for
+  patch review (i.e. send patches here). It is useful for those who want to
+  stay on the bleeding edge of LLVM development. This list is very high
+  volume.
+
+  .. __: http://lists.llvm.org/pipermail/llvm-commits/
+
+`Bugs & Patches Archive (llvm-bugs)`__
+  This list gets emailed every time a bug is opened and closed. It is
+  higher volume than the LLVM-dev list.
+
+  .. __: http://lists.llvm.org/pipermail/llvm-bugs/
+
+`LLVM Announcements`__
+  If you just want project wide announcements such as releases, developers meetings, or blog posts, then you should check out the Announcement category on LLVM Discourse.
+
+  .. __: https://discourse.llvm.org/c/announce/46
+
+.. _online-sync-ups:
+
+Online Sync-Ups
+---------------
+
+A number of regular calls are organized on specific topics. It should be
+expected that the range of topics will change over time. At the time of
+writing, the following sync-ups are organized.
+The :doc:`CodeOfConduct` applies to all online sync-ups.
+
+If you'd like to organize a new sync-up, please add the info in the table
+below. Please also create a calendar event for it and invite calendar@llvm.org
+to the event, so that it'll show up on the :ref:`llvm-community-calendar`.
+Please see :ref:`llvm-community-calendar-host-guidance` for more guidance on
+what to add to your calendar invite.
+
+.. list-table:: LLVM regular sync-up calls
+   :widths: 25 25 25 25
+   :header-rows: 1
+
+   * - Topic
+     - Frequency
+     - Calendar link
+     - Minutes/docs link
+   * - Loop Optimization Working Group
+     - Every 2 weeks on Wednesday
+     - `ics <./_static/LoopOptWG_invite.ics>`__
+     - `Minutes/docs <https://docs.google.com/document/d/1sdzoyB11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit>`__
+   * - RISC-V
+     - Every 2 weeks on Thursday
+     - `ics <https://calendar.google.com/calendar/ical/lowrisc.org_0n5pkesfjcnp0bh5hps1p0bd80%40group.calendar.google.com/public/basic.ics>`__
+       `gcal <https://calendar.google.com/calendar/b/1?cid=bG93cmlzYy5vcmdfMG41cGtlc2ZqY25wMGJoNWhwczFwMGJkODBAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ>`__
+     - `Minutes/docs <https://docs.google.com/document/d/1G3ocHm2zE6AYTS2N3_3w2UxFnSEyKkcF57siLWe-NVs>`__
+   * - ML Guided Compiler Optimizations
+     - Monthly
+     -
+     - `Minutes/docs <https://docs.google.com/document/d/1JecbplF09l3swTjze-UVeLh4L48svJxGVy4mz_e9Rhs/edit?usp=gmail#heading=h.ts9cmcjbir1j>`__
+   * - `LLVM security group <https://llvm.org/docs/Security.html>`__
+     - Monthly, every 3rd Tuesday
+     - `ics <https://calendar.google.com/calendar/ical/eoh3m9k1l6vqbd1fkp94fv5q74%40group.calendar.google.com/public/basic.ics>`__
+       `gcal <https://calendar.google.com/calendar/embed?src=eoh3m9k1l6vqbd1fkp94fv5q74%40group.calendar.google.com>`__
+     - `Minutes/docs <https://discourse.llvm.org/t/llvm-security-group-public-sync-ups/62735>`__
+   * - `CIRCT <https://github.com/llvm/circt>`__
+     - Weekly, on Wednesday
+     -
+     - `Minutes/docs <https://docs.google.com/document/d/1fOSRdyZR2w75D87yU2Ma9h2-_lEPL4NxvhJGJd-s5pk/edit#heading=h.mulvhjtr8dk9>`__
+   * - flang
+     - Multiple meeting series, `documented here <https://github.com/llvm/llvm-project/blob/main/flang/docs/GettingInvolved.md#calls>`__
+     -
+     -
+   * - OpenMP
+     - Multiple meeting series, `documented here <https://openmp.llvm.org/docs/SupportAndFAQ.html>`__
+     -
+     -
+   * - LLVM Alias Analysis
+     - Every 4 weeks on Tuesdays
+     - `ics <http://lists.llvm.org/pipermail/llvm-dev/attachments/20201103/a3499a67/attachment-0001.ics>`__
+     - `Minutes/docs <https://docs.google.com/document/d/17U-WvX8qyKc3S36YUKr3xfF-GHunWyYowXbxEdpHscw>`__
+   * - LLVM Pointer Authentication
+     - Every month on Mondays
+     - `ics <https://calendar.google.com/calendar/ical/fr1qtmrmt2s9odufjvurkb6j70%40group.calendar.google.com/public/basic.ics>`__
+     - `Minutes/docs <https://discourse.llvm.org/t/llvm-pointer-authentication-sync-ups/62661>`__
+   * - LLVM Embedded Toolchains
+     - Every 4 weeks on Thursdays
+     - `ics <https://drive.google.com/file/d/1uNa-PFYkhAfT83kR2Nc4Fi706TAQFBEL/view?usp=sharing>`__
+       `gcal <https://calendar.google.com/calendar/u/0?cid=ZDQyc3ZlajJmbjIzNG1jaTUybjFsdjA2dWNAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ>`__
+     - `Minutes/docs <https://docs.google.com/document/d/1GahxppHJ7o1O_fn1Mbidu1DHEg7V2aOr92LXCtNV1_o/edit?usp=sharing>`__
+   * - Clang C and C++ Language Working Group
+     - 1st and 3rd Wednesday of the month
+     - `gcal <https://calendar.google.com/calendar/u/0?cid=cW1lZGg0ZXNpMnIyZDN2aTVydGVrdWF1YzRAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ>`__
+     - `Minutes/docs <https://docs.google.com/document/d/1x5-RbOC6-jnI_NcJ9Dp4pSmGhhNe7lUevuWUIB46TeM/edit?usp=sharing>`__
+   * - LLVM SPIR-V Backend Working Group
+     - Every week on Monday
+     -
+     - `Meeting details/agenda <https://docs.google.com/document/d/1UjX-LAwPjJ75Nmb8a5jz-Qrm-pPtKtQw0k1S1Lop9jU/edit?usp=sharing>`__
+   * - SYCL Upstream Working Group
+     - Every 2 weeks on Mondays
+     - `gcal <https://calendar.google.com/calendar/u/0?cid=c3ljbC5sbHZtLndnQGdtYWlsLmNvbQ>`__
+     - `Meeting details/agenda <https://docs.google.com/document/d/1ivYDSn_5ChTeiZ7TiO64WC_jYJnGwAUiT9Ngi9cAdFU/edit?usp=sharing>`__
+   * - Floating Point Working Group
+     - Every 3rd Wednesday of the month
+     - `ics <https://calendar.google.com/calendar/ical/02582507bac79d186900712566ec3fc69b33ac24d7de0a8c76c7b19976f190c0%40group.calendar.google.com/private-6e35506dbfe13812e92e9aa8cd5d761d/basic.ics>`__
+       `gcal <https://calendar.google.com/calendar/u/0?cid=MDI1ODI1MDdiYWM3OWQxODY5MDA3MTI1NjZlYzNmYzY5YjMzYWMyNGQ3ZGUwYThjNzZjN2IxOTk3NmYxOTBjMEBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__
+     - `Meeting details/agenda: <https://docs.google.com/document/d/1QcmUlWftPlBi-Wz6b6PipqJfvjpJ-OuRMRnN9Dm2t0c>`__
+
+Past online sync-ups
+^^^^^^^^^^^^^^^^^^^^
+
+Some online sync-ups are no longer happening. We keep pointing to them here to
+keep track of the meeting notes and in case anyone would want to revive them in
+the future.
+
+.. list-table:: LLVM no-longer-happening sync-up calls
+   :widths: 25 25 25 25
+   :header-rows: 1
+
+   * - Topic
+     - Frequency
+     - Calendar link
+     - Minutes/docs link
+   * - Scalable Vectors and Arm SVE
+     - Monthly, every 3rd Tuesday
+     - `ics <https://calendar.google.com/calendar/ical/bjms39pe6k6bo5egtsp7don414%40group.calendar.google.com/public/basic.ics>`__
+       `gcal <https://calendar.google.com/calendar/u/0/embed?src=bjms39pe6k6bo5egtsp7don414@group.calendar.google.com>`__
+     - `Minutes/docs <https://docs.google.com/document/d/1UPH2Hzou5RgGT8XfO39OmVXKEibWPfdYLELSaHr3xzo/edit>`__
+   * - MemorySSA in LLVM
+     - Every 8 weeks on Mondays
+     - `ics <https://calendar.google.com/calendar/ical/c_1mincouiltpa24ac14of14lhi4%40group.calendar.google.com/public/basic.ics>`__
+       `gcal <https://calendar.google.com/calendar/embed?src=c_1mincouiltpa24ac14of14lhi4%40group.calendar.google.com>`__
+     - `Minutes/docs <https://docs.google.com/document/d/1-uEEZfmRdPThZlctOq9eXlmUaSSAAi8oKxhrPY_lpjk/edit#>`__
+   * - GlobalISel
+     - Every 2nd Tuesday of the month
+     - `gcal <https://calendar.google.com/calendar/u/0?cid=ZDcyMjc0ZjZiZjNhMzFlYmE3NTNkMWM2MGM2NjM5ZWU3ZDE2MjM4MGFlZDc2ZjViY2UyYzMwNzVhZjk4MzQ4ZEBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__
+     - `Meeting details/agenda <https://docs.google.com/document/d/1Ry8O4-Tm5BFj9AMjr8qTQFU80z-ptiNQ62687NaIvLs/edit?usp=sharing>`__
+   * - Vector Predication
+     - Every 2 weeks on Tuesdays, 3pm UTC
+     -
+     - `Minutes/docs <https://docs.google.com/document/d/1q26ToudQjnqN5x31zk8zgq_s0lem1-BF8pQmciLa4k8/edit?usp=sharing>`__
+   * - `MLIR <https://mlir.llvm.org>`__ design meetings
+     - Weekly, on Thursdays
+     -
+     - `Minutes/docs <https://docs.google.com/document/d/1y_9f1AbfgcoVdJh4_aM6-BaSHvrHl8zuA5G4jv_94K8/edit#heading=h.cite1kolful9>`__
+
+.. _office-hours:
+
+Office hours
+------------
+
+A number of experienced LLVM contributors make themselves available for a chat
+on a regular schedule, to anyone who is looking for some guidance. Please find
+the list of who is available when, through which medium, and what their area of
+expertise is. Don't be too shy to dial in!
+
+Office hours are also listed on the :ref:`llvm-community-calendar`. Of course,
+people take time off from time to time, so if you dial in and you don't find
+anyone present, chances are they happen to be off that day.
+
+The :doc:`CodeOfConduct` applies to all office hours.
+
+.. list-table:: LLVM office hours
+  :widths: 15 40 15 15 15
+  :header-rows: 1
+
+  * - Name
+    - In-scope topics
+    - When?
+    - Where?
+    - Languages
+  * - Kristof Beyls
+    - General questions on how to contribute to LLVM; organizing meetups;
+      submitting talks; and other general LLVM-related topics. Arm/AArch64
+      codegen. LLVM security group. LLVM Office Hours.
+    - Every 2nd and 4th Wednesday of the month at 9.30am CET, for 30 minutes.
+      `ics <https://calendar.google.com/calendar/ical/co0h4ndpvtfe64opn7eraiq3ac%40group.calendar.google.com/public/basic.ics>`__
+    - `Jitsi <https://meet.jit.si/KristofBeylsLLVMOfficeHour>`__
+    - English, Flemish, Dutch
+  * - Alina Sbirlea
+    - General questions on how to contribute to LLVM; women in compilers;
+      MemorySSA, BatchAA, various loop passes, new pass manager.
+    - Monthly, 2nd Tuesdays, 10.00am PT/7:00pm CET, for 30 minutes.
+      `ics <https://calendar.google.com/calendar/ical/c_pm6e7160iq7n5fcm1s6m3rjhh4%40group.calendar.google.com/public/basic.ics>`__
+      `gcal <https://calendar.google.com/calendar/embed?src=c_pm6e7160iq7n5fcm1s6m3rjhh4%40group.calendar.google.com>`__
+    - `GoogleMeet <https://meet.google.com/hhk-xpdj-gvx>`__
+    - English, Romanian
+  * - Aaron Ballman (he/him)
+    - Clang internals; frontend attributes; clang-tidy; clang-query; AST matchers
+    - Monthly, 2nd Monday and 3rd Friday of the month at 10:00am Eastern and again at 2:00pm Eastern, for 60 minutes.
+      `ics <https://calendar.google.com/calendar/ical/npgke5dug0uliud0qapptmps58%40group.calendar.google.com/public/basic.ics>`__
+      `gcal <https://calendar.google.com/calendar/embed?src=npgke5dug0uliud0qapptmps58%40group.calendar.google.com>`__
+    - `GoogleMeet <https://meet.google.com/xok-iqne-gmi>`__
+    - English, Norwegian (not fluently)
+  * - Johannes Doerfert (he/him)
+    - OpenMP, LLVM-IR, interprocedural optimizations, Attributor, workshops, research, ...
+    - Every week, Wednesdays 9:30am (Pacific Time), for 1 hour.
+      `ics <https://drive.google.com/file/d/1E_QkRvirmdJzlXf2EKBUX-v8Xj7-eW3v/view?usp=sharing>`__
+    - `MS Teams <https://teams.microsoft.com/l/meetup-join/19%3ameeting_MTMxNzU4MWYtYzViNS00OTM2LWJmNWQtMjg5ZWFhNGVjNzgw%40thread.v2/0?context=%7b%22Tid%22%3a%22a722dec9-ae4e-4ae3-9d75-fd66e2680a63%22%2c%22Oid%22%3a%22885bda30-ce8e-46db-aa7e-15de0474831a%22%7d>`__
+    - English, German
+  * - Tobias Grosser
+    - General questions on how to contribute to LLVM/MLIR, Polly, Loop Optimization, FPL, Research in LLVM, PhD in CS, Summer of Code.
+    - Monthly, last Monday of the month at 18:00 London time (typically 9am PT), for 30 minutes.
+    - `Video Call <https://meet.grosser.science/LLVMOfficeHours>`__
+    - English, German, Spanish, French
+  * - Anastasia Stulova
+    - Clang internals for C/C++ language extensions and dialects, OpenCL, GPU, SPIR-V, how to contribute, women in compilers.
+    - Monthly, 1st Tuesday of the month at 17:00 BST - London time (9:00am PT except for 2 weeks in spring), 30 mins slot.
+    - `GoogleMeet <https://meet.google.com/kdy-fdbv-nuk>`__
+    - English, Russian, German (not fluently)
+  * - Alexey Bader
+    - SYCL compiler, offload tools, OpenCL and SPIR-V, how to contribute.
+    - Monthly, 2nd Monday of the month at 9:30am PT, for 30 minutes.
+    - `GoogleMeet <https://meet.google.com/pdz-xhns-uus>`__
+    - English, Russian
+  * - Maksim Panchenko
+    - BOLT internals, IR, new passes, proposals, etc.
+    - Monthly, 2nd Wednesday of the month at 11:00am PT, for 30 minutes.
+    - `Zoom <https://fb.zoom.us/j/97065697120?pwd=NTFaUWJjZW9uVkJuaVlPTE9qclE3dz09>`__
+    - English, Russian
+  * - Quentin Colombet (he/him)
+    - LLVM/MLIR; Codegen (Instruction selection (GlobalISel/SDISel), Machine IR,
+      Register allocation, etc.); Optimizations; MCA
+    - Monthly, 1st Wednesday of the month at 8.00am PT, for 30 minutes.
+      `ics <https://calendar.google.com/calendar/ical/48c4ad60290a4df218e51e1ceec1106fe317b0ebc76938d9273592053f38204e%40group.calendar.google.com/public/basic.ics>`__
+      `gcal <https://calendar.google.com/calendar/embed?src=48c4ad60290a4df218e51e1ceec1106fe317b0ebc76938d9273592053f38204e%40group.calendar.google.com>`__
+    - `Google meet <https://meet.google.com/cbz-grrp-obs>`__
+    - English, French
+  * - Phoebe Wang (she/her)
+    - X86 backend, General questions to X86, women in compilers.
+    - Monthly, 3rd Wednesday of the month at 8:30am Beijing time, for 30 minutes.
+    - `MS Teams <https://teams.microsoft.com/l/meetup-join/19%3ameeting_NWQ0MjU0NjYtZjUyMi00YTU3LThmM2EtY2Y2YTE4NGM3NmFi%40thread.v2/0?context=%7b%22Tid%22%3a%2246c98d88-e344-4ed4-8496-4ed7712e255d%22%2c%22Oid%22%3a%227b309d9c-a9bb-44c8-a940-ab97eef42d4d%22%7d>`__
+    - English, Chinese
+  * - Amara Emerson
+    - GlobalISel questions.
+    - Monthly, 4th Wednesday of the month at 9:30am PT, for 30 minutes.
+    - `Google meet <https://meet.google.com/pdd-dibg-cwv>`__
+    - English
+  * - Maksim Levental and Jeremy Kun
+    - MLIR newcomers and general discussion (`livestreamed <https://www.youtube.com/playlist?list=PLhxO86S3jsX2k7kOhZaV-qKWm8tNsUdAE>`__)
+    - Every two weeks, Wednesdays at 2:00pm US Pacific, for 90 minutes.
+    - Livestream chat or `Google meet <https://meet.google.com/wit-tvzc-dwc>`__
+    - English
+  * - Rotating hosts
+    - Getting Started, beginner questions, new contributors.
+    - Every Tuesday at 2 PM ET (11 AM PT), for 30 minutes.
+    - `Google meet <https://meet.google.com/nga-uhpf-bbb>`__
+    - English
+
+
+Guidance for office hours hosts
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* If you're interested in becoming an office hours host, please add your
+  information to the list above. Please create a calendar event for it and
+  invite calendar@llvm.org to the event so that it'll show up on the
+  :ref:`llvm-community-calendar`.
+  Please see :ref:`llvm-community-calendar-host-guidance` for more guidance on
+  what to add to your calendar invite.
+* When starting an office hours session, consider typing something like "*Hi,
+  I'm available for chats in the next half hour at* video chat URL. *I'm
+  looking forward to having conversations on the video chat or here.*" on the
+  LLVM chat channels that you are already on. These could include:
+
+    * the `#office-hours Discord channel
+      <https://discord.com/channels/636084430946959380/976196303681896538>`__.
+    * :ref:`IRC`
+
+  Doing this can help:
+    * overcome potential anxiety to call in for a first time,
+    * people who prefer to first exchange a few messages through text chat
+      before dialing in, and
+    * remind the wider community that office hours do exist.
+* If you decide to no longer host office hours, please do remove your entry
+  from the list above.
+
+
+.. _IRC:
+
+IRC
+---
+
+Users and developers of the LLVM project (including subprojects such as Clang)
+can be found in #llvm on `irc.oftc.net <irc://irc.oftc.net/llvm>`_. The channel
+is actively moderated.
+
+The #llvm-build channel has a bot for
+`LLVM buildbot <http://lab.llvm.org/buildbot/#/console>`_ status changes. The
+bot will post a message with a link to a build bot and a blamelist when a build
+goes from passing to failing and again (without the blamelist) when the build
+goes from failing back to passing. It is a good channel for actively monitoring
+build statuses, but it is a noisy channel due to the automated messages. The
+channel is not actively moderated.
+
+In addition to the traditional IRC there is a
+`Discord <https://discord.com/channels/636084430946959380/636725486533345280>`_
+chat server available. To sign up, please use this
+`invitation link <https://discord.com/invite/xS7Z362>`_.
+
+
+.. _meetups-social-events:
+
+Meetups and social events
+-------------------------
+
+.. toctree::
+   :hidden:
+
+   MeetupGuidelines
+
+Besides developer `meetings and conferences <https://llvm.org/devmtg/>`_,
+there are several user groups called
+`LLVM Socials <https://www.meetup.com/pro/llvm/>`_. We greatly encourage you to
+join one in your city. Or start a new one if there is none:
+
+:doc:`MeetupGuidelines`
+
+.. _community-proposals:
+
+Community wide proposals
+------------------------
+
+Proposals for massive changes in how the community behaves and how the work flow
+can be better.
+
+.. toctree::
+   :hidden:
+
+   Proposals/GitHubMove
+   BugpointRedesign
+   Proposals/TestSuite
+   Proposals/VariableNames
+   Proposals/VectorPredication
+
+:doc:`Proposals/GitHubMove`
+   Proposal to move from SVN/Git to GitHub.
+
+:doc:`BugpointRedesign`
+   Design doc for a redesign of the Bugpoint tool.
+
+:doc:`Proposals/TestSuite`
+   Proposals for additional benchmarks/programs for llvm's test-suite.
+
+:doc:`Proposals/VariableNames`
+   Proposal to change the variable names coding standard.
+
+:doc:`Proposals/VectorPredication`
+   Proposal for predicated vector instructions in LLVM.
+
+.. _llvm-community-calendar:
+
+LLVM community calendar
+-----------------------
+
+We aim to maintain a public calendar view of all events happening in the LLVM
+community such as :ref:`online-sync-ups` and :ref:`office-hours`. The calendar
+can be found at
+https://calendar.google.com/calendar/u/0/embed?src=calendar@llvm.org and can
+also be seen inline below:
+
+.. raw:: html
+
+    <iframe src="https://calendar.google.com/calendar/embed?height=600&wkst=1&bgcolor=%23ffffff&ctz=UTC&showCalendars=0&showDate=1&showNav=1&src=Y2FsZW5kYXJAbGx2bS5vcmc&color=%23039BE5" style="border:solid 1px #777" width="800" height="600" frameborder="0" scrolling="no"></iframe>
+
+Note that the web view of the LLVM community calendar shows events in
+Coordinated Universal Time (UTC). If you use Google Calendar, consider
+subscribing to it with the + button in the bottom-right corner to view all
+events in your local timezone alongside your other calendars.
+
+.. _llvm-community-calendar-host-guidance:
+
+Guidance on what to put into LLVM community calendar invites
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To add your event, create a calendar event for it and invite calendar@llvm.org
+on it. Your event should then show up on the community calendar.
+
+Please put the following pieces of information in your calendar invite:
+
+* Write a single paragraph describing what the event is about. Include things
+  such as who the event is for and what sort of topics are discussed.
+* State explicitly that the :doc:`CodeOfConduct` applies to this event.
+* Make it clear who:
+
+  * the organizer is.
+
+  * the person to contact is in case of any code-of-conduct issues.  Typically,
+    this would be the organizer.
+
+* If you have meeting minutes for your event, add a pointer to where those live.
+  A good place for meeting minutes could be as a post on LLVM Discourse.
+
+An example invite looks as follows
+
+.. code-block:: none
+
+  This event is a meetup for all developers of LLDB. Meeting agendas are posted
+  on discourse before the event.
+
+  Attendees are required to adhere to the LLVM Code of Conduct
+  (https://llvm.org/docs/CodeOfConduct.html). For any Code of Conduct reports,
+  please contact the organizers, and also email conduct@llvm.org.
+
+  Agenda/Meeting Minutes: Link to minutes
+
+  Organizer(s): First Surname (name@email.com)
+
diff --git a/llvm/docs/GettingStartedTutorials.rst b/llvm/docs/GettingStartedTutorials.rst
index a6541acf47ad..55060343ba36 100644
--- a/llvm/docs/GettingStartedTutorials.rst
+++ b/llvm/docs/GettingStartedTutorials.rst
@@ -1,43 +1,43 @@
-Getting Started/Tutorials
-=========================
-
-For those new to the LLVM system.
-
-.. toctree::
-   :hidden:
-
-   CompilerWriterInfo
-   Frontend/PerformanceTips
-   GettingStarted
-   GettingStartedVS
-   ProgrammersManual
-   tutorial/index
-   MyFirstTypoFix
-
-:doc:`GettingStarted`
-   Discusses how to get up and running quickly with the LLVM infrastructure.
-   Everything from unpacking and compilation of the distribution to execution
-   of some tools.
-
-:doc:`tutorial/index`
-   Tutorials about using LLVM. Includes a tutorial about making a custom
-   language with LLVM.
-
-:doc:`ProgrammersManual`
-  Introduction to the general layout of the LLVM sourcebase, important classes
-  and APIs, and some tips & tricks.
-
-:doc:`Frontend/PerformanceTips`
-   A collection of tips for frontend authors on how to generate IR
-   which LLVM is able to effectively optimize.
-
-:doc:`GettingStartedVS`
-   An addendum to the main Getting Started guide for those using Visual Studio
-   on Windows.
-
-:doc:`CompilerWriterInfo`
-  A list of helpful links for compiler writers.
-
-:doc:`MyFirstTypoFix`
-   This tutorial will guide you through the process of making a change to
-   LLVM, and contributing it back to the LLVM project.
+Getting Started/Tutorials
+=========================
+
+For those new to the LLVM system.
+
+.. toctree::
+   :hidden:
+
+   CompilerWriterInfo
+   Frontend/PerformanceTips
+   GettingStarted
+   GettingStartedVS
+   ProgrammersManual
+   tutorial/index
+   MyFirstTypoFix
+
+:doc:`GettingStarted`
+   Discusses how to get up and running quickly with the LLVM infrastructure.
+   Everything from unpacking and compilation of the distribution to execution
+   of some tools.
+
+:doc:`tutorial/index`
+   Tutorials about using LLVM. Includes a tutorial about making a custom
+   language with LLVM.
+
+:doc:`ProgrammersManual`
+  Introduction to the general layout of the LLVM sourcebase, important classes
+  and APIs, and some tips & tricks.
+
+:doc:`Frontend/PerformanceTips`
+   A collection of tips for frontend authors on how to generate IR
+   which LLVM is able to effectively optimize.
+
+:doc:`GettingStartedVS`
+   An addendum to the main Getting Started guide for those using Visual Studio
+   on Windows.
+
+:doc:`CompilerWriterInfo`
+  A list of helpful links for compiler writers.
+
+:doc:`MyFirstTypoFix`
+   This tutorial will guide you through the process of making a change to
+   LLVM, and contributing it back to the LLVM project.
diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst
index 492d30280f47..52dc039df777 100644
--- a/llvm/docs/GlobalISel/GenericOpcode.rst
+++ b/llvm/docs/GlobalISel/GenericOpcode.rst
@@ -592,8 +592,8 @@ G_FLOG, G_FLOG2, G_FLOG10
 
 Calculate the base-e, base-2, or base-10 respectively.
 
-G_FCEIL, G_FCOS, G_FSIN, G_FSQRT, G_FFLOOR, G_FRINT, G_FNEARBYINT
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+G_FCEIL, G_FCOS, G_FSIN, G_FTAN, G_FSQRT, G_FFLOOR, G_FRINT, G_FNEARBYINT
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 These correspond to the standard C functions of the same name.
 
diff --git a/llvm/docs/InstCombineContributorGuide.md b/llvm/docs/InstCombineContributorGuide.md
index 2416fd0920f6..ce5f958058c5 100644
--- a/llvm/docs/InstCombineContributorGuide.md
+++ b/llvm/docs/InstCombineContributorGuide.md
@@ -554,3 +554,11 @@ guidelines.
    use of ValueTracking queries. Whether this makes sense depends on the case,
    but it's usually a good idea to only handle the constant pattern first, and
    then generalize later if it seems useful.
+
+## Guidelines for reviewers
+
+ * Do not ask new contributors to implement non-splat vector support in code
+   reviews. If you think non-splat vector support for a fold is both
+   worthwhile and policy-compliant (that is, the handling would not result in
+   any appreciable increase in code complexity), implement it yourself in a
+   follow-up patch.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 6291a4e57919..06809f8bf445 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -9113,8 +9113,8 @@ instruction in most regards. The primary difference is that it
 establishes an association with additional labels to define where control
 flow goes after the call.
 
-The output values of a '``callbr``' instruction are available only to
-the '``fallthrough``' block, not to any '``indirect``' blocks(s).
+The output values of a '``callbr``' instruction are available both in the
+the '``fallthrough``' block, and any '``indirect``' blocks(s).
 
 The only use of this today is to implement the "goto" feature of gcc inline
 assembly where additional labels can be provided as locations for the inline
@@ -14111,6 +14111,25 @@ structures and the code to increment the appropriate value, in a
 format that can be written out by a compiler runtime and consumed via
 the ``llvm-profdata`` tool.
 
+.. FIXME: write complete doc on contextual instrumentation and link from here
+.. and from llvm.instrprof.callsite.
+
+The intrinsic is lowered differently for contextual profiling by the
+``-ctx-instr-lower`` pass. Here:
+
+* the entry basic block increment counter is lowered as a call to compiler-rt,
+  to either ``__llvm_ctx_profile_start_context`` or
+  ``__llvm_ctx_profile_get_context``. Either returns a pointer to a context object
+  which contains a buffer into which counter increments can happen. Note that the
+  pointer value returned by compiler-rt may have its LSB set - counter increments
+  happen offset from the address with the LSB cleared.
+
+* all the other lowerings of ``llvm.instrprof.increment[.step]`` happen within
+  that context.
+
+* the context is assumed to be a local value to the function, and no concurrency
+  concerns need to be handled by LLVM.
+
 '``llvm.instrprof.increment.step``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -14156,10 +14175,10 @@ Syntax:
 Overview:
 """""""""
 
-.. FIXME: detail when it's emitted once the support is added
-
 The '``llvm.instrprof.callsite``' intrinsic should be emitted before a callsite
-that's not to a "fake" callee (like another intrinsic or asm).
+that's not to a "fake" callee (like another intrinsic or asm). It is used by
+contextual profiling and has side-effects. Its lowering happens in IR, and
+target-specific backends should never encounter it.
 
 Arguments:
 """"""""""
@@ -14172,9 +14191,28 @@ The last argument is the called value of the callsite this intrinsic precedes.
 
 Semantics:
 """"""""""
-.. FIXME: detail how when the lowering pass is added.
 
-This is lowered by contextual profiling.
+This is lowered by contextual profiling. In contextual profiling, functions get,
+from compiler-rt, a pointer to a context object. The context object consists of
+a buffer LLVM can use to perform counter increments (i.e. the lowering of
+``llvm.instrprof.increment[.step]``. The address range following the counter
+buffer, ``<num-counters>`` x ``sizeof(ptr)`` - sized, is expected to contain
+pointers to contexts of functions called from this function ("subcontexts").
+LLVM does not dereference into that memory region, just calculates GEPs. 
+
+The lowering of ``llvm.instrprof.callsite`` consists of:
+
+* write to ``__llvm_ctx_profile_expected_callee`` the ``<callsite>`` value;
+
+* write to ``__llvm_ctx_profile_callsite`` the address into this function's
+  context of the ``<index>`` position into the subcontexts region.
+
+
+``__llvm_ctx_profile_{expected_callee|callsite}`` are initialized by compiler-rt
+and are TLS. They are both vectors of pointers of size 2. The index into each is
+determined when the current function obtains the pointer to its context from
+compiler-rt. The pointer's LSB gives the index.
+
 
 '``llvm.instrprof.timestamp``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -15272,6 +15310,43 @@ trapping or setting ``errno``.
 When specified with the fast-math-flag 'afn', the result may be approximated
 using a less accurate calculation.
 
+'``llvm.tan.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.tan`` on any
+floating-point or vector of floating-point type. Not all targets support
+all types however.
+
+::
+
+      declare float     @llvm.tan.f32(float  %Val)
+      declare double    @llvm.tan.f64(double %Val)
+      declare x86_fp80  @llvm.tan.f80(x86_fp80  %Val)
+      declare fp128     @llvm.tan.f128(fp128 %Val)
+      declare ppc_fp128 @llvm.tan.ppcf128(ppc_fp128  %Val)
+
+Overview:
+"""""""""
+
+The '``llvm.tan.*``' intrinsics return the tangent of the operand.
+
+Arguments:
+""""""""""
+
+The argument and return value are floating-point numbers of the same type.
+
+Semantics:
+""""""""""
+
+Return the same value as a corresponding libm '``tan``' function but without
+trapping or setting ``errno``.
+
+When specified with the fast-math-flag 'afn', the result may be approximated
+using a less accurate calculation.
+
 '``llvm.pow.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -19068,6 +19143,60 @@ will be on any later loop iteration.
 This intrinsic will only return 0 if the input count is also 0. A non-zero input
 count will produce a non-zero result.
 
+'``llvm.experimental.vector.histogram.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+These intrinsics are overloaded.
+
+These intrinsics represent histogram-like operations; that is, updating values
+in memory that may not be contiguous, and where multiple elements within a
+single vector may be updating the same value in memory.
+
+The update operation must be specified as part of the intrinsic name. For a
+simple histogram like the following the ``add`` operation would be used.
+
+.. code-block:: c
+
+    void simple_histogram(int *restrict buckets, unsigned *indices, int N, int inc) {
+      for (int i = 0; i < N; ++i)
+        buckets[indices[i]] += inc;
+    }
+
+More update operation types may be added in the future.
+
+::
+
+    declare <8 x i32> @llvm.experimental.vector.histogram.add.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask)
+    declare <vscale x 2 x i64> @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %ptrs, i64 %inc, <vscale x 2 x i1> %mask)
+
+Arguments:
+""""""""""
+
+The first argument is a vector of pointers to the memory locations to be
+updated. The second argument is a scalar used to update the value from
+memory; it must match the type of value to be updated. The final argument
+is a mask value to exclude locations from being modified.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.vector.histogram.*``' intrinsics are used to perform
+updates on potentially overlapping values in memory. The intrinsics represent
+the follow sequence of operations:
+
+1. Gather load from the ``ptrs`` operand, with element type matching that of
+   the ``inc`` operand.
+2. Update of the values loaded from memory. In the case of the ``add``
+   update operation, this means:
+
+   1. Perform a cross-vector histogram operation on the ``ptrs`` operand.
+   2. Multiply the result by the ``inc`` operand.
+   3. Add the result to the values loaded from memory
+3. Scatter the result of the update operation to the memory locations from
+   the ``ptrs`` operand.
+
+The ``mask`` operand will apply to at least the gather and scatter operations.
+
 Matrix Intrinsics
 -----------------
 
@@ -22107,6 +22236,146 @@ Examples:
       %also.r = call float @llvm.minnum.f32(float %reduction, float %start)
 
 
+.. _int_vp_reduce_fmaximum:
+
+'``llvm.vp.reduce.fmaximum.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare float @llvm.vp.reduce.fmaximum.v4f32(float <start_value>, <4 x float> <val>, <4 x i1> <mask>, float <vector_length>)
+      declare double @llvm.vp.reduce.fmaximum.nxv8f64(double <start_value>, <vscale x 8 x double> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point ``MAX`` reduction of a vector and a scalar starting
+value, returning the result as a scalar.
+
+
+Arguments:
+""""""""""
+
+The first operand is the start value of the reduction, which must be a scalar
+floating-point type equal to the result type. The second operand is the vector
+on which the reduction is performed and must be a vector of floating-point
+values whose element type is the result/start type. The third operand is the
+vector mask and is a vector of boolean values with the same number of elements
+as the vector operand. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.reduce.fmaximum``' intrinsic performs the floating-point ``MAX``
+reduction (:ref:`llvm.vector.reduce.fmaximum <int_vector_reduce_fmaximum>`) of
+the vector operand ``val`` on each enabled lane, taking the maximum of that and
+the scalar ``start_value``. Disabled lanes are treated as containing the
+neutral value (i.e. having no effect on the reduction operation). If the vector
+length is zero, the result is the start value.
+
+The neutral value is dependent on the :ref:`fast-math flags <fastmath>`. If no
+flags are set or only the ``nnan`` is set, the neutral value is ``-Infinity``.
+If ``ninf`` is set, then the neutral value is the smallest floating-point value
+for the result type.
+
+This instruction has the same comparison semantics as the
+:ref:`llvm.vector.reduce.fmaximum <int_vector_reduce_fmaximum>` intrinsic (and
+thus the '``llvm.maximum.*``' intrinsic). That is, the result will always be a
+number unless any of the elements in the vector or the starting value is
+``NaN``. Namely, this intrinsic propagates ``NaN``. Also, -0.0 is considered
+less than +0.0.
+
+To ignore the start value, the neutral value can be used.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call float @llvm.vp.reduce.fmaximum.v4f32(float %float, <4 x float> %a, <4 x i1> %mask, i32 %evl)
+      ; %r is equivalent to %also.r, where lanes greater than or equal to %evl
+      ; are treated as though %mask were false for those lanes.
+
+      %masked.a = select <4 x i1> %mask, <4 x float> %a, <4 x float> <float -infinity, float -infinity, float -infinity, float -infinity>
+      %reduction = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %masked.a)
+      %also.r = call float @llvm.maximum.f32(float %reduction, float %start)
+
+
+.. _int_vp_reduce_fminimum:
+
+'``llvm.vp.reduce.fminimum.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare float @llvm.vp.reduce.fminimum.v4f32(float <start_value>, <4 x float> <val>, <4 x i1> <mask>, float <vector_length>)
+      declare double @llvm.vp.reduce.fminimum.nxv8f64(double <start_value>, <vscale x 8 x double> <val>, <vscale x 8 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point ``MIN`` reduction of a vector and a scalar starting
+value, returning the result as a scalar.
+
+
+Arguments:
+""""""""""
+
+The first operand is the start value of the reduction, which must be a scalar
+floating-point type equal to the result type. The second operand is the vector
+on which the reduction is performed and must be a vector of floating-point
+values whose element type is the result/start type. The third operand is the
+vector mask and is a vector of boolean values with the same number of elements
+as the vector operand. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.reduce.fminimum``' intrinsic performs the floating-point ``MIN``
+reduction (:ref:`llvm.vector.reduce.fminimum <int_vector_reduce_fminimum>`) of
+the vector operand ``val`` on each enabled lane, taking the minimum of that and
+the scalar ``start_value``. Disabled lanes are treated as containing the neutral
+value (i.e. having no effect on the reduction operation). If the vector length
+is zero, the result is the start value.
+
+The neutral value is dependent on the :ref:`fast-math flags <fastmath>`. If no
+flags are set or only the ``nnan`` is set, the neutral value is ``+Infinity``.
+If ``ninf`` is set, then the neutral value is the largest floating-point value
+for the result type.
+
+This instruction has the same comparison semantics as the
+:ref:`llvm.vector.reduce.fminimum <int_vector_reduce_fminimum>` intrinsic (and
+thus the '``llvm.minimum.*``' intrinsic). That is, the result will always be a
+number unless any of the elements in the vector or the starting value is
+``NaN``. Namely, this intrinsic propagates ``NaN``. Also, -0.0 is considered
+less than +0.0.
+
+To ignore the start value, the neutral value can be used.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call float @llvm.vp.reduce.fminimum.v4f32(float %start, <4 x float> %a, <4 x i1> %mask, i32 %evl)
+      ; %r is equivalent to %also.r, where lanes greater than or equal to %evl
+      ; are treated as though %mask were false for those lanes.
+
+      %masked.a = select <4 x i1> %mask, <4 x float> %a, <4 x float> <float infinity, float infinity, float infinity, float infinity>
+      %reduction = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %masked.a)
+      %also.r = call float @llvm.minimum.f32(float %reduction, float %start)
+
+
 .. _int_get_active_lane_mask:
 
 '``llvm.get.active.lane.mask.*``' Intrinsics
@@ -26739,6 +27008,8 @@ specified by C standard:
 Other values may be used to represent additional rounding modes, supported by a
 target. These values are target-specific.
 
+.. _int_set_rounding:
+
 '``llvm.set.rounding``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/docs/MIRLangRef.rst b/llvm/docs/MIRLangRef.rst
index 52ff24daa7fb..e248a14636a8 100644
--- a/llvm/docs/MIRLangRef.rst
+++ b/llvm/docs/MIRLangRef.rst
@@ -168,11 +168,11 @@ Here is an example of a YAML document that contains an LLVM module:
 
 .. code-block:: llvm
 
-       define i32 @inc(i32* %x) {
+       define i32 @inc(ptr %x) {
        entry:
-         %0 = load i32, i32* %x
+         %0 = load i32, ptr %x
          %1 = add i32 %0, 1
-         store i32 %1, i32* %x
+         store i32 %1, ptr %x
          ret i32 %1
        }
 
diff --git a/llvm/docs/PDB/CodeViewSymbols.rst b/llvm/docs/PDB/CodeViewSymbols.rst
index b056b804e5ad..0f218db412f3 100644
--- a/llvm/docs/PDB/CodeViewSymbols.rst
+++ b/llvm/docs/PDB/CodeViewSymbols.rst
@@ -93,9 +93,9 @@ the compiler decided to emit is impractical.  This differs from DWARF, where eve
 though we don't necessarily have O(1) lookup by basename within a given scope (including
 O(1) scope, we at least have O(n) access within a given scope).
 
-.. important::
-   Program-wide lookup of names by anything other than an exact textually matching fully
-   qualified name is not possible.
+.. important::
+   Program-wide lookup of names by anything other than an exact textually matching fully
+   qualified name is not possible.
 
 
 S_GDATA32
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index a4cf17a8398a..ff08c9d345d5 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -99,6 +99,7 @@ on support follow.
      ``Shvstvecd``     Assembly Support (`See note <#riscv-profiles-extensions-note>`__)
      ``Smaia``         Supported
      ``Smepmp``        Supported
+     ``Smstateen``     Assembly Support
      ``Ssaia``         Supported
      ``Ssccptr``       Assembly Support (`See note <#riscv-profiles-extensions-note>`__)
      ``Sscofpmf``      Assembly Support
diff --git a/llvm/docs/Reference.rst b/llvm/docs/Reference.rst
index 1661c8c533db..df61628b06c7 100644
--- a/llvm/docs/Reference.rst
+++ b/llvm/docs/Reference.rst
@@ -1,234 +1,234 @@
-Reference
-=========
-
-LLVM and API reference documentation.
-
-.. contents::
-   :local:
-
-.. toctree::
-   :hidden:
-
-   Atomics
-   BitCodeFormat
-   BlockFrequencyTerminology
-   BranchWeightMetadata
-   Bugpoint
-   CommandGuide/index
-   ConvergenceAndUniformity
-   ConvergentOperations
-   Coroutines
-   DependenceGraphs/index
-   ExceptionHandling
-   Extensions
-   FaultMaps
-   FuzzingLLVM
-   GarbageCollection
-   GetElementPtr
-   GlobalISel/index
-   GwpAsan
-   HowToSetUpLLVMStyleRTTI
-   HowToUseAttributes
-   InAlloca
-   LangRef
-   LibFuzzer
-   MarkedUpDisassembly
-   MIRLangRef
-   OptBisect
-   PCSectionsMetadata
-   PDB/index
-   PointerAuth
-   ScudoHardenedAllocator
-   MemoryModelRelaxationAnnotations
-   MemTagSanitizer
-   Security
-   SecurityTransparencyReports
-   SegmentedStacks
-   StackMaps
-   SpeculativeLoadHardening
-   Statepoints
-   SymbolizerMarkupFormat
-   SystemLibrary
-   TestingGuide
-   TransformMetadata
-   TypeMetadata
-   XRay
-   XRayExample
-   XRayFDRFormat
-   YamlIO
-
-API Reference
--------------
-
-`Doxygen generated documentation <https://llvm.org/doxygen/>`_
-  (`classes <https://llvm.org/doxygen/inherits.html>`_)
-
-:doc:`HowToUseAttributes`
-  Answers some questions about the new Attributes infrastructure.
-
-LLVM Reference
---------------
-
-======================
-Command Line Utilities
-======================
-
-:doc:`LLVM Command Guide <CommandGuide/index>`
-   A reference manual for the LLVM command line utilities ("man" pages for LLVM
-   tools).
-
-:doc:`Bugpoint`
-   Automatic bug finder and test-case reducer description and usage
-   information.
-
-:doc:`OptBisect`
-  A command line option for debugging optimization-induced failures.
-
-:doc:`SymbolizerMarkupFormat`
-  A reference for the log symbolizer markup accepted by ``llvm-symbolizer``.
-
-:doc:`The Microsoft PDB File Format <PDB/index>`
-  A detailed description of the Microsoft PDB (Program Database) file format.
-
-==================
-Garbage Collection
-==================
-
-:doc:`GarbageCollection`
-   The interfaces source-language compilers should use for compiling GC'd
-   programs.
-
-:doc:`Statepoints`
-  This describes a set of experimental extensions for garbage
-  collection support.
-
-=========
-LibFuzzer
-=========
-
-:doc:`LibFuzzer`
-  A library for writing in-process guided fuzzers.
-
-:doc:`FuzzingLLVM`
-  Information on writing and using Fuzzers to find bugs in LLVM.
-
-========
-LLVM IR
-========
-
-:doc:`LLVM Language Reference Manual <LangRef>`
-  Defines the LLVM intermediate representation and the assembly form of the
-  different nodes.
-
-:doc:`InAlloca`
-  Description of the ``inalloca`` argument attribute.
-
-:doc:`BitCodeFormat`
-   This describes the file format and encoding used for LLVM "bc" files.
-
-:doc:`Machine IR (MIR) Format Reference Manual <MIRLangRef>`
-   A reference manual for the MIR serialization format, which is used to test
-   LLVM's code generation passes.
-
-:doc:`GlobalISel/index`
-  This describes the prototype instruction selection replacement, GlobalISel.
-
-:doc:`ConvergentOperations`
-  Description of ``convergent`` operation semantics and related intrinsics.
-
-=====================
-Testing and Debugging
-=====================
-
-:doc:`LLVM Testing Infrastructure Guide <TestingGuide>`
-   A reference manual for using the LLVM testing infrastructure.
-
-:doc:`TestSuiteGuide`
-  Describes how to compile and run the test-suite benchmarks.
-
-
-:doc:`GwpAsan`
-  A sampled heap memory error detection toolkit designed for production use.
-
-====
-XRay
-====
-
-:doc:`XRay`
-  High-level documentation of how to use XRay in LLVM.
-
-:doc:`XRayExample`
-  An example of how to debug an application with XRay.
-
-=================
-Additional Topics
-=================
-
-:doc:`FaultMaps`
-  LLVM support for folding control flow into faulting machine instructions.
-
-:doc:`Atomics`
-  Information about LLVM's concurrency model.
-
-:doc:`ExceptionHandling`
-   This document describes the design and implementation of exception handling
-   in LLVM.
-
-:doc:`Extensions`
-  LLVM-specific extensions to tools and formats LLVM seeks compatibility with.
-
-:doc:`HowToSetUpLLVMStyleRTTI`
-  How to make ``isa<>``, ``dyn_cast<>``, etc. available for clients of your
-  class hierarchy.
-
-:doc:`BlockFrequencyTerminology`
-   Provides information about terminology used in the ``BlockFrequencyInfo``
-   analysis pass.
-
-:doc:`BranchWeightMetadata`
-   Provides information about Branch Prediction Information.
-
-:doc:`GetElementPtr`
-  Answers to some very frequent questions about LLVM's most frequently
-  misunderstood instruction.
-
-:doc:`ScudoHardenedAllocator`
-  A library that implements a security-hardened `malloc()`.
-
-:doc:`MemoryModelRelaxationAnnotations`
-  Target-defined relaxation to LLVM's concurrency model.
-
-:doc:`MemTagSanitizer`
-  Security hardening for production code aiming to mitigate memory
-  related vulnerabilities. Based on the Armv8.5-A Memory Tagging Extension.
-
-:doc:`Dependence Graphs <DependenceGraphs/index>`
-  A description of the design of the various dependence graphs such as
-  the DDG (Data Dependence Graph).
-
-:doc:`SpeculativeLoadHardening`
-  A description of the Speculative Load Hardening mitigation for Spectre v1.
-
-:doc:`SegmentedStacks`
-   This document describes segmented stacks and how they are used in LLVM.
-
-:doc:`MarkedUpDisassembly`
-   This document describes the optional rich disassembly output syntax.
-
-:doc:`StackMaps`
-  LLVM support for mapping instruction addresses to the location of
-  values and allowing code to be patched.
-
-:doc:`Coroutines`
-  LLVM support for coroutines.
-
-:doc:`PointerAuth`
-  A description of pointer authentication, its LLVM IR representation, and its
-  support in the backend.
-
-:doc:`YamlIO`
-   A reference guide for using LLVM's YAML I/O library.
-
-:doc:`ConvergenceAndUniformity`
-   A description of uniformity analysis in the presence of irreducible
-   control flow, and its implementation.
+Reference
+=========
+
+LLVM and API reference documentation.
+
+.. contents::
+   :local:
+
+.. toctree::
+   :hidden:
+
+   Atomics
+   BitCodeFormat
+   BlockFrequencyTerminology
+   BranchWeightMetadata
+   Bugpoint
+   CommandGuide/index
+   ConvergenceAndUniformity
+   ConvergentOperations
+   Coroutines
+   DependenceGraphs/index
+   ExceptionHandling
+   Extensions
+   FaultMaps
+   FuzzingLLVM
+   GarbageCollection
+   GetElementPtr
+   GlobalISel/index
+   GwpAsan
+   HowToSetUpLLVMStyleRTTI
+   HowToUseAttributes
+   InAlloca
+   LangRef
+   LibFuzzer
+   MarkedUpDisassembly
+   MIRLangRef
+   OptBisect
+   PCSectionsMetadata
+   PDB/index
+   PointerAuth
+   ScudoHardenedAllocator
+   MemoryModelRelaxationAnnotations
+   MemTagSanitizer
+   Security
+   SecurityTransparencyReports
+   SegmentedStacks
+   StackMaps
+   SpeculativeLoadHardening
+   Statepoints
+   SymbolizerMarkupFormat
+   SystemLibrary
+   TestingGuide
+   TransformMetadata
+   TypeMetadata
+   XRay
+   XRayExample
+   XRayFDRFormat
+   YamlIO
+
+API Reference
+-------------
+
+`Doxygen generated documentation <https://llvm.org/doxygen/>`_
+  (`classes <https://llvm.org/doxygen/inherits.html>`_)
+
+:doc:`HowToUseAttributes`
+  Answers some questions about the new Attributes infrastructure.
+
+LLVM Reference
+--------------
+
+======================
+Command Line Utilities
+======================
+
+:doc:`LLVM Command Guide <CommandGuide/index>`
+   A reference manual for the LLVM command line utilities ("man" pages for LLVM
+   tools).
+
+:doc:`Bugpoint`
+   Automatic bug finder and test-case reducer description and usage
+   information.
+
+:doc:`OptBisect`
+  A command line option for debugging optimization-induced failures.
+
+:doc:`SymbolizerMarkupFormat`
+  A reference for the log symbolizer markup accepted by ``llvm-symbolizer``.
+
+:doc:`The Microsoft PDB File Format <PDB/index>`
+  A detailed description of the Microsoft PDB (Program Database) file format.
+
+==================
+Garbage Collection
+==================
+
+:doc:`GarbageCollection`
+   The interfaces source-language compilers should use for compiling GC'd
+   programs.
+
+:doc:`Statepoints`
+  This describes a set of experimental extensions for garbage
+  collection support.
+
+=========
+LibFuzzer
+=========
+
+:doc:`LibFuzzer`
+  A library for writing in-process guided fuzzers.
+
+:doc:`FuzzingLLVM`
+  Information on writing and using Fuzzers to find bugs in LLVM.
+
+========
+LLVM IR
+========
+
+:doc:`LLVM Language Reference Manual <LangRef>`
+  Defines the LLVM intermediate representation and the assembly form of the
+  different nodes.
+
+:doc:`InAlloca`
+  Description of the ``inalloca`` argument attribute.
+
+:doc:`BitCodeFormat`
+   This describes the file format and encoding used for LLVM "bc" files.
+
+:doc:`Machine IR (MIR) Format Reference Manual <MIRLangRef>`
+   A reference manual for the MIR serialization format, which is used to test
+   LLVM's code generation passes.
+
+:doc:`GlobalISel/index`
+  This describes the prototype instruction selection replacement, GlobalISel.
+
+:doc:`ConvergentOperations`
+  Description of ``convergent`` operation semantics and related intrinsics.
+
+=====================
+Testing and Debugging
+=====================
+
+:doc:`LLVM Testing Infrastructure Guide <TestingGuide>`
+   A reference manual for using the LLVM testing infrastructure.
+
+:doc:`TestSuiteGuide`
+  Describes how to compile and run the test-suite benchmarks.
+
+
+:doc:`GwpAsan`
+  A sampled heap memory error detection toolkit designed for production use.
+
+====
+XRay
+====
+
+:doc:`XRay`
+  High-level documentation of how to use XRay in LLVM.
+
+:doc:`XRayExample`
+  An example of how to debug an application with XRay.
+
+=================
+Additional Topics
+=================
+
+:doc:`FaultMaps`
+  LLVM support for folding control flow into faulting machine instructions.
+
+:doc:`Atomics`
+  Information about LLVM's concurrency model.
+
+:doc:`ExceptionHandling`
+   This document describes the design and implementation of exception handling
+   in LLVM.
+
+:doc:`Extensions`
+  LLVM-specific extensions to tools and formats LLVM seeks compatibility with.
+
+:doc:`HowToSetUpLLVMStyleRTTI`
+  How to make ``isa<>``, ``dyn_cast<>``, etc. available for clients of your
+  class hierarchy.
+
+:doc:`BlockFrequencyTerminology`
+   Provides information about terminology used in the ``BlockFrequencyInfo``
+   analysis pass.
+
+:doc:`BranchWeightMetadata`
+   Provides information about Branch Prediction Information.
+
+:doc:`GetElementPtr`
+  Answers to some very frequent questions about LLVM's most frequently
+  misunderstood instruction.
+
+:doc:`ScudoHardenedAllocator`
+  A library that implements a security-hardened `malloc()`.
+
+:doc:`MemoryModelRelaxationAnnotations`
+  Target-defined relaxation to LLVM's concurrency model.
+
+:doc:`MemTagSanitizer`
+  Security hardening for production code aiming to mitigate memory
+  related vulnerabilities. Based on the Armv8.5-A Memory Tagging Extension.
+
+:doc:`Dependence Graphs <DependenceGraphs/index>`
+  A description of the design of the various dependence graphs such as
+  the DDG (Data Dependence Graph).
+
+:doc:`SpeculativeLoadHardening`
+  A description of the Speculative Load Hardening mitigation for Spectre v1.
+
+:doc:`SegmentedStacks`
+   This document describes segmented stacks and how they are used in LLVM.
+
+:doc:`MarkedUpDisassembly`
+   This document describes the optional rich disassembly output syntax.
+
+:doc:`StackMaps`
+  LLVM support for mapping instruction addresses to the location of
+  values and allowing code to be patched.
+
+:doc:`Coroutines`
+  LLVM support for coroutines.
+
+:doc:`PointerAuth`
+  A description of pointer authentication, its LLVM IR representation, and its
+  support in the backend.
+
+:doc:`YamlIO`
+   A reference guide for using LLVM's YAML I/O library.
+
+:doc:`ConvergenceAndUniformity`
+   A description of uniformity analysis in the presence of irreducible
+   control flow, and its implementation.
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index d8cc667723f5..f2577e1684f5 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -76,14 +76,23 @@ Changes to the AArch64 Backend
 * Added support for Cortex-A78AE, Cortex-A520AE, Cortex-A720AE,
   Cortex-R82AE, Neoverse-N3, Neoverse-V3 and Neoverse-V3AE CPUs.
 
+* ``-mbranch-protection=standard`` now enables FEAT_PAuth_LR by
+  default when the feature is enabled. The new behaviour results 
+  in ``standard`` being equal to ``bti+pac-ret+pc`` when ``+pauth-lr``
+  is passed as part of ``-mcpu=`` options.
+
 Changes to the AMDGPU Backend
 -----------------------------
 
 * Implemented the ``llvm.get.fpenv`` and ``llvm.set.fpenv`` intrinsics.
 
+* Implemented :ref:`llvm.get.rounding <int_get_rounding>` and :ref:`llvm.set.rounding <int_set_rounding>`
+
 Changes to the ARM Backend
 --------------------------
+
 * FEAT_F32MM is no longer activated by default when using `+sve` on v8.6-A or greater. The feature is still available and can be used by adding `+f32mm` to the command line options.
+* armv8-r now implies only fp-armv8d16sp, rather than neon and full fp-armv8. These features are still included by default for cortex-r52. The default cpu for armv8-r is now "generic", for compatibility with variants that do not include neon, fp64, and d32.
 
 Changes to the AVR Backend
 --------------------------
@@ -119,6 +128,7 @@ Changes to the RISC-V Backend
 * Added the CSR names from the Resumable Non-Maskable Interrupts (Smrnmi) extension.
 * llvm-objdump now prints disassembled opcode bytes in groups of 2 or 4 bytes to
   match GNU objdump. The bytes within the groups are in big endian order.
+* Added smstateen extension to -march. CSR names for smstateen were already supported.
 
 Changes to the WebAssembly Backend
 ----------------------------------
@@ -161,6 +171,15 @@ Changes to the C API
 * Added ``LLVMAtomicRMWBinOpUIncWrap`` and ``LLVMAtomicRMWBinOpUDecWrap`` to
   ``LLVMAtomicRMWBinOp`` enum for AtomicRMW instructions.
 
+* Added ``LLVMCreateConstantRangeAttribute`` function for creating ConstantRange Attributes.
+
+* Added the following functions for creating and accessing data for CallBr instructions:
+
+  * ``LLVMBuildCallBr``
+  * ``LLVMGetCallBrDefaultDest``
+  * ``LLVMGetCallBrNumIndirectDests``
+  * ``LLVMGetCallBrIndirectDest``
+
 Changes to the CodeGen infrastructure
 -------------------------------------
 
@@ -170,6 +189,13 @@ Changes to the Metadata Info
 Changes to the Debug Info
 ---------------------------------
 
+* LLVM has switched from using debug intrinsics internally to using debug
+  records by default. This should happen transparently when using the DIBuilder
+  to construct debug variable information, but will require changes for any code
+  that interacts with debug intrinsics directly. Debug intrinsics will only be
+  supported on a best-effort basis from here onwards; for more information, see
+  the `migration docs <https://llvm.org/docs/RemoveDIsDebugInfo.html>`_.
+
 Changes to the LLVM tools
 ---------------------------------
 * llvm-nm and llvm-objdump can now print symbol information from linked
@@ -211,6 +237,14 @@ Changes to the LLVM tools
   (`#89162 <https://github.com/llvm/llvm-project/pull/89162>`_)
   ``--raw-relr`` has been removed.
 
+* llvm-mca now aborts by default if it is given bad input where previously it
+  would continue. Additionally, it can now continue when it encounters
+  instructions which lack scheduling information. The behaviour can be
+  controlled by the newly introduced
+  `--skip-unsupported-instructions=<none|lack-sched|parse-failure|any>`, as
+  documented in `--help` output and the command guide. (`#90474
+  <https://github.com/llvm/llvm-project/pull/90474>`)
+
 Changes to LLDB
 ---------------------------------
 
diff --git a/llvm/docs/TestingGuide.rst b/llvm/docs/TestingGuide.rst
index e32e4d1e535a..e24feb3bf5fa 100644
--- a/llvm/docs/TestingGuide.rst
+++ b/llvm/docs/TestingGuide.rst
@@ -433,6 +433,87 @@ actually participate in the test besides holding the ``RUN:`` lines.
   putting the extra files in an ``Inputs/`` directory. This pattern is
   deprecated.
 
+Elaborated tests
+----------------
+
+Generally, IR and assembly test files benefit from being cleaned to remove
+unnecessary details. However, for tests requiring elaborate IR or assembly
+files where cleanup is less practical (e.g., large amount of debug information
+output from Clang), you can include generation instructions within
+``split-file`` part called ``gen``. Then, run
+``llvm/utils/update_test_body.py`` on the test file to generate the needed
+content.
+
+.. code-block:: none
+
+    ; RUN: rm -rf %t && split-file %s %t && cd %t
+    ; RUN: opt -S a.ll ... | FileCheck %s
+
+    ; CHECK: hello
+
+    ;--- a.cc
+    int va;
+    ;--- gen
+    clang --target=x86_64-linux -S -emit-llvm -g a.cc -o -
+
+    ;--- a.ll
+    # content generated by the script 'gen'
+
+.. code-block:: bash
+
+   PATH=/path/to/clang_build/bin:$PATH llvm/utils/update_test_body.py path/to/test.ll
+
+The script will prepare extra files with ``split-file``, invoke ``gen``, and
+then rewrite the part after ``gen`` with its stdout.
+
+For convenience, if the test needs one single assembly file, you can also wrap
+``gen`` and its required files with ``.ifdef`` and ``.endif``. Then you can
+skip ``split-file`` in RUN lines.
+
+.. code-block:: none
+
+    # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o a.o
+    # RUN: ... | FileCheck %s
+
+    # CHECK: hello
+
+    .ifdef GEN
+    #--- a.cc
+    int va;
+    #--- gen
+    clang --target=x86_64-linux -S -g a.cc -o -
+    .endif
+    # content generated by the script 'gen'
+
+.. note::
+
+  Consider specifying an explicit target triple to avoid differences when
+  regeneration is needed on another machine.
+
+  ``gen`` is invoked with ``PWD`` set to ``/proc/self/cwd``. Clang commands
+  don't need ``-fdebug-compilation-dir=`` since its default value is ``PWD``.
+
+  Check prefixes should be placed before ``.endif`` since the part after
+  ``.endif`` is replaced.
+
+If the test body contains multiple files, you can print ``---`` separators and
+utilize ``split-file`` in ``RUN`` lines.
+
+.. code-block:: none
+
+    # RUN: rm -rf %t && split-file %s %t && cd %t
+    ...
+
+    #--- a.cc
+    int va;
+    #--- b.cc
+    int vb;
+    #--- gen
+    clang --target=x86_64-linux -S -O1 -g a.cc -o -
+    echo '#--- b.s'
+    clang --target=x86_64-linux -S -O1 -g b.cc -o -
+    #--- a.s
+
 Fragile tests
 -------------
 
diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst
index f40a04d414a2..18d273a51daf 100644
--- a/llvm/docs/UserGuides.rst
+++ b/llvm/docs/UserGuides.rst
@@ -1,286 +1,286 @@
-User Guides
-===========
-
-NOTE: If you are a user who is only interested in using an LLVM-based compiler,
-you should look into `Clang <https://clang.llvm.org>`_ instead. The
-documentation here is intended for users who have a need to work with the
-intermediate LLVM representation.
-
-.. contents::
-   :local:
-
-.. toctree::
-   :hidden:
-
-   AArch64SME
-   AddingConstrainedIntrinsics
-   AdvancedBuilds
-   AliasAnalysis
-   AMDGPUUsage
-   Benchmarking
-   BigEndianNEON
-   BuildingADistribution
-   CFIVerify
-   CMake
-   CMakePrimer
-   CodeGenerator
-   CodeOfConduct
-   CommandLine
-   CompileCudaWithLLVM
-   CoverageMappingFormat
-   CycleTerminology
-   DebuggingJITedCode
-   DirectXUsage
-   Docker
-   FatLTO
-   ExtendingLLVM
-   GitHub
-   GoldPlugin
-   GlobalISel/MIRPatterns
-   HowToBuildOnARM
-   HowToBuildWithPGO
-   HowToBuildWindowsItaniumPrograms
-   HowToCrossCompileBuiltinsOnArm
-   HowToCrossCompileLLVM
-   HowToUpdateDebugInfo
-   InstCombineContributorGuide
-   InstrProfileFormat
-   InstrRefDebugInfo
-   LinkTimeOptimization
-   LoopTerminology
-   MarkdownQuickstartTemplate
-   MemorySSA
-   MergeFunctions
-   MCJITDesignAndImplementation
-   MisExpect
-   ORCv2
-   OpaquePointers
-   JITLink
-   NewPassManager
-   NVPTXUsage
-   Passes
-   ReportingGuide
-   ResponseGuide
-   Remarks
-   RemoveDIsDebugInfo
-   RISCVUsage
-   SourceLevelDebugging
-   SPIRVUsage
-   StackSafetyAnalysis
-   SupportLibrary
-   TableGen/index
-   TableGenFundamentals
-   Vectorizers
-   WritingAnLLVMPass
-   WritingAnLLVMNewPMPass
-   WritingAnLLVMBackend
-   yaml2obj
-
-Clang
------
-
-:doc:`HowToBuildOnARM`
-   Notes on building and testing LLVM/Clang on ARM.
-
-:doc:`HowToBuildWithPGO`
-    Notes on building LLVM/Clang with PGO.
-
-:doc:`HowToCrossCompileLLVM`
-   Notes on cross-building and testing LLVM/Clang.
-
-`How to build the C, C++, ObjC, and ObjC++ front end`__
-   Instructions for building the clang front-end from source.
-
-   .. __: https://clang.llvm.org/get_started.html
-
-:doc:`CoverageMappingFormat`
-  This describes the format and encoding used for LLVM’s code coverage mapping.
-
-:doc:`CFIVerify`
-  A description of the verification tool for Control Flow Integrity.
-
-LLVM Builds and Distributions
------------------------------
-
-:doc:`BuildingADistribution`
-  A best-practices guide for using LLVM's CMake build system to package and
-  distribute LLVM-based tools.
-
-:doc:`CMake`
-   An addendum to the main Getting Started guide for those using the `CMake
-   build system <http://www.cmake.org>`_.
-
-:doc:`Docker`
-   A reference for using Dockerfiles provided with LLVM.
-
-:doc:`Support Library <SupportLibrary>`
-   This document describes the LLVM Support Library (``lib/Support``) and
-   how to keep LLVM source code portable.
-
-:doc:`AdvancedBuilds`
-   This document describes more advanced build configurations.
-
-Optimizations
--------------
-
-:doc:`WritingAnLLVMNewPMPass`
-   Information on how to write LLVM transformations under the new pass
-   manager.
-
-:doc:`WritingAnLLVMPass`
-   Information on how to write LLVM transformations and analyses under the
-   legacy pass manager.
-
-:doc:`Passes`
-   A list of optimizations and analyses implemented in LLVM.
-
-:doc:`StackSafetyAnalysis`
-  This document describes the design of the stack safety analysis of local
-  variables.
-
-:doc:`MergeFunctions`
-  Describes functions merging optimization.
-
-:doc:`AliasAnalysis`
-   Information on how to write a new alias analysis implementation or how to
-   use existing analyses.
-
-:doc:`MemorySSA`
-   Information about the MemorySSA utility in LLVM, as well as how to use it.
-
-:doc:`LoopTerminology`
-  A document describing Loops and associated terms as used in LLVM.
-
-:doc:`CycleTerminology`
-  A document describing cycles as a generalization of loops.
-
-:doc:`Vectorizers`
-   This document describes the current status of vectorization in LLVM.
-
-:doc:`LinkTimeOptimization`
-   This document describes the interface between LLVM intermodular optimizer
-   and the linker and its design
-
-:doc:`GoldPlugin`
-   How to build your programs with link-time optimization on Linux.
-
-:doc:`Remarks`
-   A reference on the implementation of remarks in LLVM.
-
-:doc:`Source Level Debugging with LLVM <SourceLevelDebugging>`
-   This document describes the design and philosophy behind the LLVM
-   source-level debugger.
-
-:doc:`How to Update Debug Info <HowToUpdateDebugInfo>`
-   This document specifies how to correctly update debug info in various kinds
-   of code transformations.
-
-:doc:`InstrRefDebugInfo`
-   This document explains how LLVM uses value tracking, or instruction
-   referencing, to determine variable locations for debug info in the final
-   stages of compilation.
-
-:doc:`RemoveDIsDebugInfo`
-   This is a migration guide describing how to move from debug info using
-   intrinsics such as dbg.value to using the non-instruction DbgRecord object.
-
-:doc:`InstrProfileFormat`
-   This document explains two binary formats of instrumentation-based profiles.
-
-:doc:`InstCombineContributorGuide`
-   This document specifies guidelines for contributions for InstCombine and
-   related passes.
-
-Code Generation
----------------
-
-:doc:`WritingAnLLVMBackend`
-   Information on how to write LLVM backends for machine targets.
-
-:doc:`CodeGenerator`
-   The design and implementation of the LLVM code generator.  Useful if you are
-   working on retargetting LLVM to a new architecture, designing a new codegen
-   pass, or enhancing existing components.
-
-:doc:`TableGen <TableGen/index>`
-   Describes the TableGen tool, which is used heavily by the LLVM code
-   generator.
-
-==========
-GlobalISel
-==========
-
-:doc:`MIRPatterns <GlobalISel/MIRPatterns>`
-   Describes the design of MIR Patterns and how to use them.
-
-===
-JIT
-===
-
-:doc:`MCJITDesignAndImplementation`
-   Describes the inner workings of MCJIT execution engine.
-
-:doc:`ORCv2`
-   Describes the design and implementation of the ORC APIs, including some
-   usage examples, and a guide for users transitioning from ORCv1 to ORCv2.
-
-:doc:`JITLink`
-   Describes the design and APIs for the JITLink library, ORC's new JIT
-   linker.
-
-:doc:`DebuggingJITedCode`
-   How to debug JITed code with GDB.
-
-Additional Topics
------------------
-
-:doc:`CommandLine`
-  Provides information on using the command line parsing library.
-
-:doc:`ExtendingLLVM`
-  Look here to see how to add instructions and intrinsics to LLVM.
-
-:doc:`AddingConstrainedIntrinsics`
-   Gives the steps necessary when adding a new constrained math intrinsic
-   to LLVM.
-
-:doc:`HowToBuildWindowsItaniumPrograms`
-   Notes on assembling a Windows Itanium environment.
-
-:doc:`HowToCrossCompileBuiltinsOnArm`
-   Notes on cross-building and testing the compiler-rt builtins for Arm.
-
-:doc:`BigEndianNEON`
-  LLVM's support for generating NEON instructions on big endian ARM targets is
-  somewhat nonintuitive. This document explains the implementation and rationale.
-
-:doc:`AArch64SME`
-  LLVM's support for AArch64 SME ACLE and ABI.
-
-:doc:`CompileCudaWithLLVM`
-  LLVM support for CUDA.
-
-:doc:`NVPTXUsage`
-   This document describes using the NVPTX backend to compile GPU kernels.
-
-:doc:`AMDGPUUsage`
-   This document describes using the AMDGPU backend to compile GPU kernels.
-
-:doc:`AMDGPUDwarfExtensionsForHeterogeneousDebugging`
-   This document describes DWARF extensions to support heterogeneous debugging
-   for targets such as the AMDGPU backend.
-
-:doc:`AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack`
-   This document describes a DWARF extension to allow location descriptions on
-   the DWARF expression stack. It is part of
-   :doc:`AMDGPUDwarfExtensionsForHeterogeneousDebugging`.
-
-:doc:`SPIRVUsage`
-   This document describes using the SPIR-V target to compile GPU kernels.
-
-:doc:`DirectXUsage`
-   This document describes using the DirectX target to compile GPU code for the
-   DirectX runtime.
-
-:doc:`RISCVUsage`
-   This document describes using the RISCV-V target.
+User Guides
+===========
+
+NOTE: If you are a user who is only interested in using an LLVM-based compiler,
+you should look into `Clang <https://clang.llvm.org>`_ instead. The
+documentation here is intended for users who have a need to work with the
+intermediate LLVM representation.
+
+.. contents::
+   :local:
+
+.. toctree::
+   :hidden:
+
+   AArch64SME
+   AddingConstrainedIntrinsics
+   AdvancedBuilds
+   AliasAnalysis
+   AMDGPUUsage
+   Benchmarking
+   BigEndianNEON
+   BuildingADistribution
+   CFIVerify
+   CMake
+   CMakePrimer
+   CodeGenerator
+   CodeOfConduct
+   CommandLine
+   CompileCudaWithLLVM
+   CoverageMappingFormat
+   CycleTerminology
+   DebuggingJITedCode
+   DirectXUsage
+   Docker
+   FatLTO
+   ExtendingLLVM
+   GitHub
+   GoldPlugin
+   GlobalISel/MIRPatterns
+   HowToBuildOnARM
+   HowToBuildWithPGO
+   HowToBuildWindowsItaniumPrograms
+   HowToCrossCompileBuiltinsOnArm
+   HowToCrossCompileLLVM
+   HowToUpdateDebugInfo
+   InstCombineContributorGuide
+   InstrProfileFormat
+   InstrRefDebugInfo
+   LinkTimeOptimization
+   LoopTerminology
+   MarkdownQuickstartTemplate
+   MemorySSA
+   MergeFunctions
+   MCJITDesignAndImplementation
+   MisExpect
+   ORCv2
+   OpaquePointers
+   JITLink
+   NewPassManager
+   NVPTXUsage
+   Passes
+   ReportingGuide
+   ResponseGuide
+   Remarks
+   RemoveDIsDebugInfo
+   RISCVUsage
+   SourceLevelDebugging
+   SPIRVUsage
+   StackSafetyAnalysis
+   SupportLibrary
+   TableGen/index
+   TableGenFundamentals
+   Vectorizers
+   WritingAnLLVMPass
+   WritingAnLLVMNewPMPass
+   WritingAnLLVMBackend
+   yaml2obj
+
+Clang
+-----
+
+:doc:`HowToBuildOnARM`
+   Notes on building and testing LLVM/Clang on ARM.
+
+:doc:`HowToBuildWithPGO`
+    Notes on building LLVM/Clang with PGO.
+
+:doc:`HowToCrossCompileLLVM`
+   Notes on cross-building and testing LLVM/Clang.
+
+`How to build the C, C++, ObjC, and ObjC++ front end`__
+   Instructions for building the clang front-end from source.
+
+   .. __: https://clang.llvm.org/get_started.html
+
+:doc:`CoverageMappingFormat`
+  This describes the format and encoding used for LLVM’s code coverage mapping.
+
+:doc:`CFIVerify`
+  A description of the verification tool for Control Flow Integrity.
+
+LLVM Builds and Distributions
+-----------------------------
+
+:doc:`BuildingADistribution`
+  A best-practices guide for using LLVM's CMake build system to package and
+  distribute LLVM-based tools.
+
+:doc:`CMake`
+   An addendum to the main Getting Started guide for those using the `CMake
+   build system <http://www.cmake.org>`_.
+
+:doc:`Docker`
+   A reference for using Dockerfiles provided with LLVM.
+
+:doc:`Support Library <SupportLibrary>`
+   This document describes the LLVM Support Library (``lib/Support``) and
+   how to keep LLVM source code portable.
+
+:doc:`AdvancedBuilds`
+   This document describes more advanced build configurations.
+
+Optimizations
+-------------
+
+:doc:`WritingAnLLVMNewPMPass`
+   Information on how to write LLVM transformations under the new pass
+   manager.
+
+:doc:`WritingAnLLVMPass`
+   Information on how to write LLVM transformations and analyses under the
+   legacy pass manager.
+
+:doc:`Passes`
+   A list of optimizations and analyses implemented in LLVM.
+
+:doc:`StackSafetyAnalysis`
+  This document describes the design of the stack safety analysis of local
+  variables.
+
+:doc:`MergeFunctions`
+  Describes functions merging optimization.
+
+:doc:`AliasAnalysis`
+   Information on how to write a new alias analysis implementation or how to
+   use existing analyses.
+
+:doc:`MemorySSA`
+   Information about the MemorySSA utility in LLVM, as well as how to use it.
+
+:doc:`LoopTerminology`
+  A document describing Loops and associated terms as used in LLVM.
+
+:doc:`CycleTerminology`
+  A document describing cycles as a generalization of loops.
+
+:doc:`Vectorizers`
+   This document describes the current status of vectorization in LLVM.
+
+:doc:`LinkTimeOptimization`
+   This document describes the interface between LLVM intermodular optimizer
+   and the linker and its design
+
+:doc:`GoldPlugin`
+   How to build your programs with link-time optimization on Linux.
+
+:doc:`Remarks`
+   A reference on the implementation of remarks in LLVM.
+
+:doc:`Source Level Debugging with LLVM <SourceLevelDebugging>`
+   This document describes the design and philosophy behind the LLVM
+   source-level debugger.
+
+:doc:`How to Update Debug Info <HowToUpdateDebugInfo>`
+   This document specifies how to correctly update debug info in various kinds
+   of code transformations.
+
+:doc:`InstrRefDebugInfo`
+   This document explains how LLVM uses value tracking, or instruction
+   referencing, to determine variable locations for debug info in the final
+   stages of compilation.
+
+:doc:`RemoveDIsDebugInfo`
+   This is a migration guide describing how to move from debug info using
+   intrinsics such as dbg.value to using the non-instruction DbgRecord object.
+
+:doc:`InstrProfileFormat`
+   This document explains two binary formats of instrumentation-based profiles.
+
+:doc:`InstCombineContributorGuide`
+   This document specifies guidelines for contributions for InstCombine and
+   related passes.
+
+Code Generation
+---------------
+
+:doc:`WritingAnLLVMBackend`
+   Information on how to write LLVM backends for machine targets.
+
+:doc:`CodeGenerator`
+   The design and implementation of the LLVM code generator.  Useful if you are
+   working on retargetting LLVM to a new architecture, designing a new codegen
+   pass, or enhancing existing components.
+
+:doc:`TableGen <TableGen/index>`
+   Describes the TableGen tool, which is used heavily by the LLVM code
+   generator.
+
+==========
+GlobalISel
+==========
+
+:doc:`MIRPatterns <GlobalISel/MIRPatterns>`
+   Describes the design of MIR Patterns and how to use them.
+
+===
+JIT
+===
+
+:doc:`MCJITDesignAndImplementation`
+   Describes the inner workings of MCJIT execution engine.
+
+:doc:`ORCv2`
+   Describes the design and implementation of the ORC APIs, including some
+   usage examples, and a guide for users transitioning from ORCv1 to ORCv2.
+
+:doc:`JITLink`
+   Describes the design and APIs for the JITLink library, ORC's new JIT
+   linker.
+
+:doc:`DebuggingJITedCode`
+   How to debug JITed code with GDB.
+
+Additional Topics
+-----------------
+
+:doc:`CommandLine`
+  Provides information on using the command line parsing library.
+
+:doc:`ExtendingLLVM`
+  Look here to see how to add instructions and intrinsics to LLVM.
+
+:doc:`AddingConstrainedIntrinsics`
+   Gives the steps necessary when adding a new constrained math intrinsic
+   to LLVM.
+
+:doc:`HowToBuildWindowsItaniumPrograms`
+   Notes on assembling a Windows Itanium environment.
+
+:doc:`HowToCrossCompileBuiltinsOnArm`
+   Notes on cross-building and testing the compiler-rt builtins for Arm.
+
+:doc:`BigEndianNEON`
+  LLVM's support for generating NEON instructions on big endian ARM targets is
+  somewhat nonintuitive. This document explains the implementation and rationale.
+
+:doc:`AArch64SME`
+  LLVM's support for AArch64 SME ACLE and ABI.
+
+:doc:`CompileCudaWithLLVM`
+  LLVM support for CUDA.
+
+:doc:`NVPTXUsage`
+   This document describes using the NVPTX backend to compile GPU kernels.
+
+:doc:`AMDGPUUsage`
+   This document describes using the AMDGPU backend to compile GPU kernels.
+
+:doc:`AMDGPUDwarfExtensionsForHeterogeneousDebugging`
+   This document describes DWARF extensions to support heterogeneous debugging
+   for targets such as the AMDGPU backend.
+
+:doc:`AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack/AMDGPUDwarfExtensionAllowLocationDescriptionOnTheDwarfExpressionStack`
+   This document describes a DWARF extension to allow location descriptions on
+   the DWARF expression stack. It is part of
+   :doc:`AMDGPUDwarfExtensionsForHeterogeneousDebugging`.
+
+:doc:`SPIRVUsage`
+   This document describes using the SPIR-V target to compile GPU kernels.
+
+:doc:`DirectXUsage`
+   This document describes using the DirectX target to compile GPU code for the
+   DirectX runtime.
+
+:doc:`RISCVUsage`
+   This document describes using the RISCV-V target.
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 0b03f3b36fcd..9d09546513f0 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -674,6 +674,18 @@ LLVMAttributeRef LLVMCreateTypeAttribute(LLVMContextRef C, unsigned KindID,
 LLVMTypeRef LLVMGetTypeAttributeValue(LLVMAttributeRef A);
 
 /**
+ * Create a ConstantRange attribute.
+ *
+ * LowerWords and UpperWords need to be NumBits divided by 64 rounded up
+ * elements long.
+ */
+LLVMAttributeRef LLVMCreateConstantRangeAttribute(LLVMContextRef C,
+                                                  unsigned KindID,
+                                                  unsigned NumBits,
+                                                  const uint64_t LowerWords[],
+                                                  const uint64_t UpperWords[]);
+
+/**
  * Create a string attribute.
  */
 LLVMAttributeRef LLVMCreateStringAttribute(LLVMContextRef C,
@@ -3726,6 +3738,28 @@ void LLVMSetNormalDest(LLVMValueRef InvokeInst, LLVMBasicBlockRef B);
 void LLVMSetUnwindDest(LLVMValueRef InvokeInst, LLVMBasicBlockRef B);
 
 /**
+ * Get the default destination of a CallBr instruction.
+ *
+ * @see llvm::CallBrInst::getDefaultDest()
+ */
+LLVMBasicBlockRef LLVMGetCallBrDefaultDest(LLVMValueRef CallBr);
+
+/**
+ * Get the number of indirect destinations of a CallBr instruction.
+ *
+ * @see llvm::CallBrInst::getNumIndirectDests()
+
+ */
+unsigned LLVMGetCallBrNumIndirectDests(LLVMValueRef CallBr);
+
+/**
+ * Get the indirect destination of a CallBr instruction at the given index.
+ *
+ * @see llvm::CallBrInst::getIndirectDest()
+ */
+LLVMBasicBlockRef LLVMGetCallBrIndirectDest(LLVMValueRef CallBr, unsigned Idx);
+
+/**
  * @}
  */
 
@@ -4011,6 +4045,12 @@ LLVMValueRef LLVMBuildSwitch(LLVMBuilderRef, LLVMValueRef V,
                              LLVMBasicBlockRef Else, unsigned NumCases);
 LLVMValueRef LLVMBuildIndirectBr(LLVMBuilderRef B, LLVMValueRef Addr,
                                  unsigned NumDests);
+LLVMValueRef LLVMBuildCallBr(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Fn,
+                             LLVMBasicBlockRef DefaultDest,
+                             LLVMBasicBlockRef *IndirectDests,
+                             unsigned NumIndirectDests, LLVMValueRef *Args,
+                             unsigned NumArgs, LLVMOperandBundleRef *Bundles,
+                             unsigned NumBundles, const char *Name);
 LLVMValueRef LLVMBuildInvoke2(LLVMBuilderRef, LLVMTypeRef Ty, LLVMValueRef Fn,
                               LLVMValueRef *Args, unsigned NumArgs,
                               LLVMBasicBlockRef Then, LLVMBasicBlockRef Catch,
diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
index 8d3c029b2e7e..2fd8b7ea636c 100644
--- a/llvm/include/llvm/ADT/APInt.h
+++ b/llvm/include/llvm/ADT/APInt.h
@@ -1398,6 +1398,13 @@ public:
     *this &= Keep;
   }
 
+  /// Set top hiBits bits to 0.
+  void clearHighBits(unsigned hiBits) {
+    assert(hiBits <= BitWidth && "More bits than bitwidth");
+    APInt Keep = getLowBitsSet(BitWidth, BitWidth - hiBits);
+    *this &= Keep;
+  }
+
   /// Set the sign bit to 0.
   void clearSignBit() { clearBit(BitWidth - 1); }
 
diff --git a/llvm/include/llvm/ADT/GenericCycleImpl.h b/llvm/include/llvm/ADT/GenericCycleImpl.h
index 74faff98b903..ab9c421a4469 100644
--- a/llvm/include/llvm/ADT/GenericCycleImpl.h
+++ b/llvm/include/llvm/ADT/GenericCycleImpl.h
@@ -67,6 +67,21 @@ void GenericCycle<ContextT>::getExitBlocks(
 }
 
 template <typename ContextT>
+void GenericCycle<ContextT>::getExitingBlocks(
+    SmallVectorImpl<BlockT *> &TmpStorage) const {
+  TmpStorage.clear();
+
+  for (BlockT *Block : blocks()) {
+    for (BlockT *Succ : successors(Block)) {
+      if (!contains(Succ)) {
+        TmpStorage.push_back(Block);
+        break;
+      }
+    }
+  }
+}
+
+template <typename ContextT>
 auto GenericCycle<ContextT>::getCyclePreheader() const -> BlockT * {
   BlockT *Predecessor = getCyclePredecessor();
   if (!Predecessor)
diff --git a/llvm/include/llvm/ADT/GenericCycleInfo.h b/llvm/include/llvm/ADT/GenericCycleInfo.h
index 83c4c2759d46..b601fc9bae38 100644
--- a/llvm/include/llvm/ADT/GenericCycleInfo.h
+++ b/llvm/include/llvm/ADT/GenericCycleInfo.h
@@ -126,6 +126,10 @@ public:
   /// branched to.
   void getExitBlocks(SmallVectorImpl<BlockT *> &TmpStorage) const;
 
+  /// Return all blocks of this cycle that have successor outside of this cycle.
+  /// These blocks have cycle exit branch.
+  void getExitingBlocks(SmallVectorImpl<BlockT *> &TmpStorage) const;
+
   /// Return the preheader block for this cycle. Pre-header is well-defined for
   /// reducible cycle in docs/LoopTerminology.rst as: the only one entering
   /// block and its only edge is to the entry block. Return null for irreducible
diff --git a/llvm/include/llvm/ADT/LazyAtomicPointer.h b/llvm/include/llvm/ADT/LazyAtomicPointer.h
index 890584746220..c4fd38963449 100644
--- a/llvm/include/llvm/ADT/LazyAtomicPointer.h
+++ b/llvm/include/llvm/ADT/LazyAtomicPointer.h
@@ -33,7 +33,7 @@ namespace llvm {
 /// std::atomic<T>::notify_all() in \a loadOrGenerate().
 template <class T> class LazyAtomicPointer {
   static constexpr uintptr_t getNull() { return 0; }
-  static constexpr uintptr_t getBusy() { return -1ULL; }
+  static constexpr uintptr_t getBusy() { return UINTPTR_MAX; }
 
   static T *makePointer(uintptr_t Value) {
     assert(Value != getBusy());
diff --git a/llvm/include/llvm/ADT/SmallString.h b/llvm/include/llvm/ADT/SmallString.h
index a5b9eec50c82..be3193c6ef9b 100644
--- a/llvm/include/llvm/ADT/SmallString.h
+++ b/llvm/include/llvm/ADT/SmallString.h
@@ -89,7 +89,7 @@ public:
 
   /// Check for string equality.  This is more efficient than compare() when
   /// the relative ordering of inequal strings isn't needed.
-  [[nodiscard]] bool equals(StringRef RHS) const { return str().equals(RHS); }
+  [[nodiscard]] bool equals(StringRef RHS) const { return str() == RHS; }
 
   /// Check for string equality, ignoring case.
   [[nodiscard]] bool equals_insensitive(StringRef RHS) const {
diff --git a/llvm/include/llvm/ADT/StringMap.h b/llvm/include/llvm/ADT/StringMap.h
index 453d91349e35..daaf82654e09 100644
--- a/llvm/include/llvm/ADT/StringMap.h
+++ b/llvm/include/llvm/ADT/StringMap.h
@@ -291,8 +291,10 @@ public:
       if (FindInRHS == RHS.end())
         return false;
 
-      if (!(KeyValue.getValue() == FindInRHS->getValue()))
-        return false;
+      if constexpr (!std::is_same_v<ValueTy, std::nullopt_t>) {
+        if (!(KeyValue.getValue() == FindInRHS->getValue()))
+          return false;
+      }
     }
 
     return true;
diff --git a/llvm/include/llvm/ADT/StringRef.h b/llvm/include/llvm/ADT/StringRef.h
index 04496c76e072..8ed8e424cfe1 100644
--- a/llvm/include/llvm/ADT/StringRef.h
+++ b/llvm/include/llvm/ADT/StringRef.h
@@ -871,7 +871,11 @@ namespace llvm {
   /// @{
 
   inline bool operator==(StringRef LHS, StringRef RHS) {
-    return LHS.equals(RHS);
+    if (LHS.size() != RHS.size())
+      return false;
+    if (LHS.empty())
+      return true;
+    return ::memcmp(LHS.data(), RHS.data(), LHS.size()) == 0;
   }
 
   inline bool operator!=(StringRef LHS, StringRef RHS) { return !(LHS == RHS); }
diff --git a/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h b/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h
index da72fb511f82..7aea7a3b0f6d 100644
--- a/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h
+++ b/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h
@@ -87,13 +87,12 @@ private:
 };
 
 static inline void shortenFileName(std::string &FN, unsigned char len = 250) {
-
-  FN = FN.substr(0, len);
-
+  if (FN.length() > len)
+    FN.resize(len);
   auto strLen = FN.length();
   while (strLen > 0) {
-    if (auto it = nameObj.find(FN); it != nameObj.end()) {
-      FN = FN.substr(0, --len);
+    if (nameObj.find(FN) != nameObj.end()) {
+      FN.resize(--len);
     } else {
       nameObj.insert(FN);
       break;
diff --git a/llvm/include/llvm/Analysis/IndirectCallVisitor.h b/llvm/include/llvm/Analysis/IndirectCallVisitor.h
index 50815f4e3e83..66c972572b06 100644
--- a/llvm/include/llvm/Analysis/IndirectCallVisitor.h
+++ b/llvm/include/llvm/Analysis/IndirectCallVisitor.h
@@ -27,31 +27,21 @@ struct PGOIndirectCallVisitor : public InstVisitor<PGOIndirectCallVisitor> {
   std::vector<Instruction *> ProfiledAddresses;
   PGOIndirectCallVisitor(InstructionType Type) : Type(Type) {}
 
-  void visitCallBase(CallBase &Call) {
-    if (!Call.isIndirectCall())
-      return;
-
-    if (Type == InstructionType::kIndirectCall) {
-      IndirectCalls.push_back(&Call);
-      return;
-    }
-
-    assert(Type == InstructionType::kVTableVal && "Control flow guaranteed");
+  // Given an indirect call instruction, try to find the the following pattern
+  //
+  // %vtable = load ptr, ptr %obj
+  // %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
+  // %2 = load ptr, ptr %vfn
+  // $call = tail call i32 %2
+  //
+  // A heuristic is used to find the address feeding instructions.
+  static Instruction *tryGetVTableInstruction(CallBase *CB) {
+    assert(CB != nullptr && "Caller guaranteed");
+    LoadInst *LI = dyn_cast<LoadInst>(CB->getCalledOperand());
 
-    LoadInst *LI = dyn_cast<LoadInst>(Call.getCalledOperand());
-    // The code pattern to look for
-    //
-    // %vtable = load ptr, ptr %b
-    // %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
-    // %2 = load ptr, ptr %vfn
-    // %call = tail call i32 %2(ptr %b)
-    //
-    // %vtable is the vtable address value to profile, and
-    // %2 is the indirect call target address to profile.
     if (LI != nullptr) {
-      Value *Ptr = LI->getPointerOperand();
-      Value *VTablePtr = Ptr->stripInBoundsConstantOffsets();
-      // This is a heuristic to find address feeding instructions.
+      Value *FuncPtr = LI->getPointerOperand(); // GEP (or bitcast)
+      Value *VTablePtr = FuncPtr->stripInBoundsConstantOffsets();
       // FIXME: Add support in the frontend so LLVM type intrinsics are
       // emitted without LTO. This way, added intrinsics could filter
       // non-vtable instructions and reduce instrumentation overhead.
@@ -63,7 +53,22 @@ struct PGOIndirectCallVisitor : public InstVisitor<PGOIndirectCallVisitor> {
       // address is negligible if exists at all. Comparing loaded address
       // with symbol address guarantees correctness.
       if (VTablePtr != nullptr && isa<Instruction>(VTablePtr))
-        ProfiledAddresses.push_back(cast<Instruction>(VTablePtr));
+        return cast<Instruction>(VTablePtr);
+    }
+    return nullptr;
+  }
+
+  void visitCallBase(CallBase &Call) {
+    if (Call.isIndirectCall()) {
+      IndirectCalls.push_back(&Call);
+
+      if (Type != InstructionType::kVTableVal)
+        return;
+
+      Instruction *VPtr =
+          PGOIndirectCallVisitor::tryGetVTableInstruction(&Call);
+      if (VPtr)
+        ProfiledAddresses.push_back(VPtr);
     }
   }
 
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index e39c371b41ec..6ebd0fb8477a 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -160,9 +160,9 @@ public:
         : Source(Source), Destination(Destination), Type(Type) {}
 
     /// Return the source instruction of the dependence.
-    Instruction *getSource(const LoopAccessInfo &LAI) const;
+    Instruction *getSource(const MemoryDepChecker &DepChecker) const;
     /// Return the destination instruction of the dependence.
-    Instruction *getDestination(const LoopAccessInfo &LAI) const;
+    Instruction *getDestination(const MemoryDepChecker &DepChecker) const;
 
     /// Dependence types that don't prevent vectorization.
     static VectorizationSafetyStatus isSafeForVectorization(DepType Type);
@@ -181,8 +181,10 @@ public:
                const SmallVectorImpl<Instruction *> &Instrs) const;
   };
 
-  MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L)
-      : PSE(PSE), InnermostLoop(L) {}
+  MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L,
+                   unsigned MaxTargetVectorWidthInBits)
+      : PSE(PSE), InnermostLoop(L),
+        MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {}
 
   /// Register the location (instructions are given increasing numbers)
   /// of a write access.
@@ -314,6 +316,12 @@ private:
   /// RecordDependences is true.
   SmallVector<Dependence, 8> Dependences;
 
+  /// The maximum width of a target's vector registers multiplied by 2 to also
+  /// roughly account for additional interleaving. Is used to decide if a
+  /// backwards dependence with non-constant stride should be classified as
+  /// backwards-vectorizable or unknown (triggering a runtime check).
+  unsigned MaxTargetVectorWidthInBits = 0;
+
   /// Check whether there is a plausible dependence between the two
   /// accesses.
   ///
@@ -575,11 +583,16 @@ private:
 /// PSE must be emitted in order for the results of this analysis to be valid.
 class LoopAccessInfo {
 public:
-  LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI,
-                 AAResults *AA, DominatorTree *DT, LoopInfo *LI);
+  LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetTransformInfo *TTI,
+                 const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT,
+                 LoopInfo *LI);
 
   /// Return true we can analyze the memory accesses in the loop and there are
-  /// no memory dependence cycles.
+  /// no memory dependence cycles. Note that for dependences between loads &
+  /// stores with uniform addresses,
+  /// hasStoreStoreDependenceInvolvingLoopInvariantAddress and
+  /// hasLoadStoreDependenceInvolvingLoopInvariantAddress also need to be
+  /// checked.
   bool canVectorizeMemory() const { return CanVecMem; }
 
   /// Return true if there is a convergent operation in the loop. There may
@@ -632,10 +645,16 @@ public:
   /// Print the information about the memory accesses in the loop.
   void print(raw_ostream &OS, unsigned Depth = 0) const;
 
-  /// If the loop has memory dependence involving an invariant address, i.e. two
-  /// stores or a store and a load, then return true, else return false.
-  bool hasDependenceInvolvingLoopInvariantAddress() const {
-    return HasDependenceInvolvingLoopInvariantAddress;
+  /// Return true if the loop has memory dependence involving two stores to an
+  /// invariant address, else return false.
+  bool hasStoreStoreDependenceInvolvingLoopInvariantAddress() const {
+    return HasStoreStoreDependenceInvolvingLoopInvariantAddress;
+  }
+
+  /// Return true if the loop has memory dependence involving a load and a store
+  /// to an invariant address, else return false.
+  bool hasLoadStoreDependenceInvolvingLoopInvariantAddress() const {
+    return HasLoadStoreDependenceInvolvingLoopInvariantAddress;
   }
 
   /// Return the list of stores to invariant addresses.
@@ -697,8 +716,12 @@ private:
   bool CanVecMem = false;
   bool HasConvergentOp = false;
 
-  /// Indicator that there are non vectorizable stores to a uniform address.
-  bool HasDependenceInvolvingLoopInvariantAddress = false;
+  /// Indicator that there are two non vectorizable stores to the same uniform
+  /// address.
+  bool HasStoreStoreDependenceInvolvingLoopInvariantAddress = false;
+  /// Indicator that there is non vectorizable load and store to the same
+  /// uniform address.
+  bool HasLoadStoreDependenceInvolvingLoopInvariantAddress = false;
 
   /// List of stores to invariant addresses.
   SmallVector<StoreInst *> StoresToInvariantAddresses;
@@ -785,12 +808,14 @@ class LoopAccessInfoManager {
   AAResults &AA;
   DominatorTree &DT;
   LoopInfo &LI;
+  TargetTransformInfo *TTI;
   const TargetLibraryInfo *TLI = nullptr;
 
 public:
   LoopAccessInfoManager(ScalarEvolution &SE, AAResults &AA, DominatorTree &DT,
-                        LoopInfo &LI, const TargetLibraryInfo *TLI)
-      : SE(SE), AA(AA), DT(DT), LI(LI), TLI(TLI) {}
+                        LoopInfo &LI, TargetTransformInfo *TTI,
+                        const TargetLibraryInfo *TLI)
+      : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI) {}
 
   const LoopAccessInfo &getInfo(Loop &L);
 
@@ -819,13 +844,13 @@ public:
 };
 
 inline Instruction *MemoryDepChecker::Dependence::getSource(
-    const LoopAccessInfo &LAI) const {
-  return LAI.getDepChecker().getMemoryInstructions()[Source];
+    const MemoryDepChecker &DepChecker) const {
+  return DepChecker.getMemoryInstructions()[Source];
 }
 
 inline Instruction *MemoryDepChecker::Dependence::getDestination(
-    const LoopAccessInfo &LAI) const {
-  return LAI.getDepChecker().getMemoryInstructions()[Destination];
+    const MemoryDepChecker &DepChecker) const {
+  return DepChecker.getMemoryInstructions()[Destination];
 }
 
 } // End llvm namespace
diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h
index caf0e31fd37d..2ca5c281166c 100644
--- a/llvm/include/llvm/Analysis/MemorySSA.h
+++ b/llvm/include/llvm/Analysis/MemorySSA.h
@@ -110,6 +110,7 @@ namespace llvm {
 template <class GraphType> struct GraphTraits;
 class BasicBlock;
 class Function;
+class Loop;
 class Instruction;
 class LLVMContext;
 class MemoryAccess;
@@ -700,6 +701,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryPhi, MemoryAccess)
 class MemorySSA {
 public:
   MemorySSA(Function &, AliasAnalysis *, DominatorTree *);
+  MemorySSA(Loop &, AliasAnalysis *, DominatorTree *);
 
   // MemorySSA must remain where it's constructed; Walkers it creates store
   // pointers to it.
@@ -800,10 +802,11 @@ protected:
   // Used by Memory SSA dumpers and wrapper pass
   friend class MemorySSAUpdater;
 
+  template <typename IterT>
   void verifyOrderingDominationAndDefUses(
-      Function &F, VerificationLevel = VerificationLevel::Fast) const;
-  void verifyDominationNumbers(const Function &F) const;
-  void verifyPrevDefInPhis(Function &F) const;
+      IterT Blocks, VerificationLevel = VerificationLevel::Fast) const;
+  template <typename IterT> void verifyDominationNumbers(IterT Blocks) const;
+  template <typename IterT> void verifyPrevDefInPhis(IterT Blocks) const;
 
   // This is used by the use optimizer and updater.
   AccessList *getWritableBlockAccesses(const BasicBlock *BB) const {
@@ -847,7 +850,8 @@ private:
   class OptimizeUses;
 
   CachingWalker *getWalkerImpl();
-  void buildMemorySSA(BatchAAResults &BAA);
+  template <typename IterT>
+  void buildMemorySSA(BatchAAResults &BAA, IterT Blocks);
 
   void prepareForMoveTo(MemoryAccess *, BasicBlock *);
   void verifyUseInDefs(MemoryAccess *, MemoryAccess *) const;
@@ -871,7 +875,8 @@ private:
   void renumberBlock(const BasicBlock *) const;
   AliasAnalysis *AA = nullptr;
   DominatorTree *DT;
-  Function &F;
+  Function *F = nullptr;
+  Loop *L = nullptr;
 
   // Memory SSA mappings
   DenseMap<const Value *, MemoryAccess *> ValueToMemoryAccess;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1c76821fe5e4..0c3a6b3742c7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -797,6 +797,9 @@ public:
   /// Return true if the target supports strided load.
   bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const;
 
+  // Return true if the target supports masked vector histograms.
+  bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const;
+
   /// Return true if this is an alternating opcode pattern that can be lowered
   /// to a single instruction on the target. In X86 this is for the addsub
   /// instruction which corrsponds to a Shuffle + Fadd + FSub pattern in IR.
@@ -834,7 +837,7 @@ public:
   /// If the AM is not supported, it returns a negative value.
   /// TODO: Handle pre/postinc as well.
   InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                                       int64_t BaseOffset, bool HasBaseReg,
+                                       StackOffset BaseOffset, bool HasBaseReg,
                                        int64_t Scale,
                                        unsigned AddrSpace = 0) const;
 
@@ -1883,6 +1886,7 @@ public:
   virtual bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalStridedLoadStore(Type *DataType, Align Alignment) = 0;
+  virtual bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) = 0;
   virtual bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
                                unsigned Opcode1,
                                const SmallBitVector &OpcodeMask) const = 0;
@@ -1891,7 +1895,7 @@ public:
   virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
   virtual bool prefersVectorizedAddressing() = 0;
   virtual InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                                               int64_t BaseOffset,
+                                               StackOffset BaseOffset,
                                                bool HasBaseReg, int64_t Scale,
                                                unsigned AddrSpace) = 0;
   virtual bool LSRWithInstrQueries() = 0;
@@ -2386,6 +2390,9 @@ public:
   bool isLegalStridedLoadStore(Type *DataType, Align Alignment) override {
     return Impl.isLegalStridedLoadStore(DataType, Alignment);
   }
+  bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) override {
+    return Impl.isLegalMaskedVectorHistogram(AddrType, DataType);
+  }
   bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
                        const SmallBitVector &OpcodeMask) const override {
     return Impl.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask);
@@ -2403,7 +2410,7 @@ public:
     return Impl.prefersVectorizedAddressing();
   }
   InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                                       int64_t BaseOffset, bool HasBaseReg,
+                                       StackOffset BaseOffset, bool HasBaseReg,
                                        int64_t Scale,
                                        unsigned AddrSpace) override {
     return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4d5cd963e092..9a57331d281d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -32,6 +32,7 @@ class Function;
 /// Base class for use as a mix-in that aids implementing
 /// a TargetTransformInfo-compatible class.
 class TargetTransformInfoImplBase {
+
 protected:
   typedef TargetTransformInfo TTI;
 
@@ -315,6 +316,10 @@ public:
     return false;
   }
 
+  bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const {
+    return false;
+  }
+
   bool enableOrderedReductions() const { return false; }
 
   bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; }
@@ -326,12 +331,13 @@ public:
   bool prefersVectorizedAddressing() const { return true; }
 
   InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                                       int64_t BaseOffset, bool HasBaseReg,
+                                       StackOffset BaseOffset, bool HasBaseReg,
                                        int64_t Scale,
                                        unsigned AddrSpace) const {
     // Guess that all legal addressing mode are free.
-    if (isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
-                              AddrSpace))
+    if (isLegalAddressingMode(Ty, BaseGV, BaseOffset.getFixed(), HasBaseReg,
+                              Scale, AddrSpace, /*I=*/nullptr,
+                              BaseOffset.getScalable()))
       return 0;
     return -1;
   }
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index afd18e7e56ba..0584b7e29f67 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -117,6 +117,8 @@ bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL,
                             const DominatorTree *DT = nullptr,
                             bool UseInstrInfo = true);
 
+bool isOnlyUsedInZeroComparison(const Instruction *CxtI);
+
 bool isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI);
 
 /// Return true if the given value is known to be non-zero when defined. For
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 424b73e375b5..521dac08792f 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -795,9 +795,11 @@ private:
   void collectDependences() {
     if (!areDependencesValid())
       return;
-    auto *Deps = LAI->getDepChecker().getDependences();
+    const auto &DepChecker = LAI->getDepChecker();
+    auto *Deps = DepChecker.getDependences();
     for (auto Dep : *Deps)
-      Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
+      Dependences[Dep.getSource(DepChecker)].insert(
+          Dep.getDestination(DepChecker));
   }
 };
 
diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h
index b2dcdfad0a04..e687254f6c4c 100644
--- a/llvm/include/llvm/AsmParser/LLParser.h
+++ b/llvm/include/llvm/AsmParser/LLParser.h
@@ -337,7 +337,6 @@ namespace llvm {
 
     // Top-Level Entities
     bool parseTopLevelEntities();
-    bool finalizeDebugInfoFormat(Module *M);
     void dropUnknownMetadataReferences();
     bool validateEndOfModule(bool UpgradeDebugInfo);
     bool validateEndOfIndex();
diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h
index e8d03f806715..847d8103e681 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainer.h
+++ b/llvm/include/llvm/BinaryFormat/DXContainer.h
@@ -119,8 +119,7 @@ struct BitcodeHeader {
 };
 
 struct ProgramHeader {
-  uint8_t MinorVersion : 4;
-  uint8_t MajorVersion : 4;
+  uint8_t Version;
   uint8_t Unused;
   uint16_t ShaderKind;
   uint32_t Size; // Size in uint32_t words including this header.
@@ -131,6 +130,11 @@ struct ProgramHeader {
     sys::swapByteOrder(Size);
     Bitcode.swapBytes();
   }
+  uint8_t getMajorVersion() { return Version >> 4; }
+  uint8_t getMinorVersion() { return Version & 0xF; }
+  static uint8_t getVersion(uint8_t Major, uint8_t Minor) {
+    return (Major << 4) | Minor;
+  }
 };
 
 static_assert(sizeof(ProgramHeader) == 24, "ProgramHeader Size incorrect!");
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 92b51438b4cb..2091432d4fe2 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -25,6 +25,7 @@
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/TargetTransformInfoImpl.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -232,8 +233,8 @@ private:
 
     // The cost of the scalar loads/stores.
     InstructionCost MemoryOpCost =
-        VF * getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
-                             AddressSpace, CostKind);
+        VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
+                                      AddressSpace, CostKind);
 
     // Next, compute the cost of packing the result in a vector.
     InstructionCost PackingCost =
@@ -252,8 +253,8 @@ private:
           getScalarizationOverhead(
               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF),
               /*Insert=*/false, /*Extract=*/true, CostKind) +
-          VF * (getCFInstrCost(Instruction::Br, CostKind) +
-                getCFInstrCost(Instruction::PHI, CostKind));
+          VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) +
+                thisT()->getCFInstrCost(Instruction::PHI, CostKind));
     }
 
     return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
@@ -403,13 +404,14 @@ public:
   }
 
   InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                                       int64_t BaseOffset, bool HasBaseReg,
+                                       StackOffset BaseOffset, bool HasBaseReg,
                                        int64_t Scale, unsigned AddrSpace) {
     TargetLoweringBase::AddrMode AM;
     AM.BaseGV = BaseGV;
-    AM.BaseOffs = BaseOffset;
+    AM.BaseOffs = BaseOffset.getFixed();
     AM.HasBaseReg = HasBaseReg;
     AM.Scale = Scale;
+    AM.ScalableOffset = BaseOffset.getScalable();
     if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
       return 0;
     return -1;
@@ -426,7 +428,7 @@ public:
   bool useAA() const { return getST()->useAA(); }
 
   bool isTypeLegal(Type *Ty) {
-    EVT VT = getTLI()->getValueType(DL, Ty);
+    EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
     return getTLI()->isTypeLegal(VT);
   }
 
@@ -1758,6 +1760,53 @@ public:
                                           CmpInst::ICMP_ULT, CostKind);
       return Cost;
     }
+    case Intrinsic::experimental_cttz_elts: {
+      EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
+
+      // If we're not expanding the intrinsic then we assume this is cheap
+      // to implement.
+      if (!getTLI()->shouldExpandCttzElements(ArgType))
+        return getTypeLegalizationCost(RetTy).first;
+
+      // TODO: The costs below reflect the expansion code in
+      // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
+      // favour of compile time.
+
+      // Find the smallest "sensible" element type to use for the expansion.
+      bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
+      ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
+      if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
+        VScaleRange = getVScaleRange(I->getCaller(), 64);
+
+      unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
+          RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
+      Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
+
+      // Create the new vector type & get the vector length
+      Type *NewVecTy = VectorType::get(
+          NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());
+
+      IntrinsicCostAttributes StepVecAttrs(Intrinsic::experimental_stepvector,
+                                           NewVecTy, {}, FMF);
+      InstructionCost Cost =
+          thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
+
+      Cost +=
+          thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
+      Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
+                                        Args[0]->getType(),
+                                        TTI::CastContextHint::None, CostKind);
+      Cost +=
+          thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);
+
+      IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
+                                         NewEltTy, NewVecTy, FMF, I, 1);
+      Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
+      Cost +=
+          thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);
+
+      return Cost;
+    }
     }
 
     // VP Intrinsics should have the same cost as their non-vp counterpart.
diff --git a/llvm/include/llvm/CodeGen/CommandFlags.h b/llvm/include/llvm/CodeGen/CommandFlags.h
index 244dabd38cf6..d5448d781363 100644
--- a/llvm/include/llvm/CodeGen/CommandFlags.h
+++ b/llvm/include/llvm/CodeGen/CommandFlags.h
@@ -122,6 +122,8 @@ bool getUniqueSectionNames();
 
 bool getUniqueBasicBlockSectionNames();
 
+bool getSeparateNamedSections();
+
 llvm::EABI getEABIVersion();
 
 llvm::DebuggerKind getDebuggerTuningOpt();
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 76e8d1166ae0..ecaece8b6834 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -599,10 +599,6 @@ public:
   /// This variant does not erase \p MI after calling the build function.
   void applyBuildFnNoErase(MachineInstr &MI, BuildFnTy &MatchInfo);
 
-  /// Use a function which takes in a MachineIRBuilder to perform a combine.
-  /// By default, it erases the instruction \p MI from the function.
-  void applyBuildFnMO(const MachineOperand &MO, BuildFnTy &MatchInfo);
-
   bool matchOrShiftToFunnelShift(MachineInstr &MI, BuildFnTy &MatchInfo);
   bool matchFunnelShiftToRotate(MachineInstr &MI);
   void applyFunnelShiftToRotate(MachineInstr &MI);
@@ -814,6 +810,12 @@ public:
   /// Match constant LHS ops that should be commuted.
   bool matchCommuteConstantToRHS(MachineInstr &MI);
 
+  /// Combine sext of trunc.
+  bool matchSextOfTrunc(const MachineOperand &MO, BuildFnTy &MatchInfo);
+
+  /// Combine zext of trunc.
+  bool matchZextOfTrunc(const MachineOperand &MO, BuildFnTy &MatchInfo);
+
   /// Match constant LHS FP ops that should be commuted.
   bool matchCommuteFPConstantToRHS(MachineInstr &MI);
 
@@ -848,10 +850,18 @@ public:
   bool matchExtractVectorElementWithBuildVectorTrunc(const MachineOperand &MO,
                                                      BuildFnTy &MatchInfo);
 
+  /// Combine extract vector element with a shuffle vector on the vector
+  /// register.
+  bool matchExtractVectorElementWithShuffleVector(const MachineOperand &MO,
+                                                  BuildFnTy &MatchInfo);
+
   /// Combine extract vector element with a insert vector element on the vector
   /// register and different indices.
   bool matchExtractVectorElementWithDifferentIndices(const MachineOperand &MO,
                                                      BuildFnTy &MatchInfo);
+  /// Use a function which takes in a MachineIRBuilder to perform a combine.
+  /// By default, it erases the instruction def'd on \p MO from the function.
+  void applyBuildFnMO(const MachineOperand &MO, BuildFnTy &MatchInfo);
 
   /// Combine insert vector element OOB.
   bool matchInsertVectorElementOOB(MachineInstr &MI, BuildFnTy &MatchInfo);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h
index 8eddc6a6a531..72c63ecba529 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h
@@ -168,7 +168,7 @@ enum {
   /// operand.
   /// - InsnID(ULEB128) - Instruction ID
   /// - MMOIdx(ULEB128) - MMO index
-  /// - NumAddrSpace(ULEB128) - Number of valid address spaces
+  /// - NumAddrSpace(1) - Number of valid address spaces
   /// - AddrSpaceN(ULEB128) - An allowed space of the memory access
   /// - AddrSpaceN+1 ...
   GIM_CheckMemoryAddressSpace,
@@ -177,7 +177,7 @@ enum {
   /// memory operand.
   /// - InsnID(ULEB128) - Instruction ID
   /// - MMOIdx(ULEB128) - MMO index
-  /// - MinAlign(ULEB128) - Minimum acceptable alignment
+  /// - MinAlign(1) - Minimum acceptable alignment
   GIM_CheckMemoryAlignment,
 
   /// Check the size of the memory access for the given machine memory operand
@@ -713,6 +713,28 @@ protected:
     memcpy(&Ret, MatchTable, sizeof(Ret));
     return Ret;
   }
+
+public:
+  // Faster ULEB128 decoder tailored for the Match Table Executor.
+  //
+  // - Arguments are fixed to avoid mid-function checks.
+  // - Unchecked execution, assumes no error.
+  // - Fast common case handling (1 byte values).
+  LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64_t
+  fastDecodeULEB128(const uint8_t *LLVM_ATTRIBUTE_RESTRICT MatchTable,
+                    uint64_t &CurrentIdx) {
+    uint64_t Value = MatchTable[CurrentIdx++];
+    if (LLVM_UNLIKELY(Value >= 128)) {
+      Value &= 0x7f;
+      unsigned Shift = 7;
+      do {
+        uint64_t Slice = MatchTable[CurrentIdx] & 0x7f;
+        Value += Slice << Shift;
+        Shift += 7;
+      } while (MatchTable[CurrentIdx++] >= 128);
+    }
+    return Value;
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
index dec2d97bb1fa..4d147bf20c26 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
@@ -100,10 +100,7 @@ bool GIMatchTableExecutor::executeMatchTable(
   };
 
   const auto readULEB = [&]() {
-    unsigned N = 0;
-    uint64_t Val = decodeULEB128(MatchTable + CurrentIdx, &N);
-    CurrentIdx += N;
-    return Val;
+    return fastDecodeULEB128(MatchTable, CurrentIdx);
   };
 
   // Convenience function to return a signed value. This avoids
@@ -476,7 +473,7 @@ bool GIMatchTableExecutor::executeMatchTable(
     }
     case GIM_CheckAtomicOrdering: {
       uint64_t InsnID = readULEB();
-      auto Ordering = (AtomicOrdering)readULEB();
+      auto Ordering = (AtomicOrdering)MatchTable[CurrentIdx++];
       DEBUG_WITH_TYPE(TgtExecutor::getName(),
                       dbgs() << CurrentIdx << ": GIM_CheckAtomicOrdering(MIs["
                              << InsnID << "], " << (uint64_t)Ordering << ")\n");
@@ -493,7 +490,7 @@ bool GIMatchTableExecutor::executeMatchTable(
     }
     case GIM_CheckAtomicOrderingOrStrongerThan: {
       uint64_t InsnID = readULEB();
-      auto Ordering = (AtomicOrdering)readULEB();
+      auto Ordering = (AtomicOrdering)MatchTable[CurrentIdx++];
       DEBUG_WITH_TYPE(TgtExecutor::getName(),
                       dbgs() << CurrentIdx
                              << ": GIM_CheckAtomicOrderingOrStrongerThan(MIs["
@@ -511,7 +508,7 @@ bool GIMatchTableExecutor::executeMatchTable(
     }
     case GIM_CheckAtomicOrderingWeakerThan: {
       uint64_t InsnID = readULEB();
-      auto Ordering = (AtomicOrdering)readULEB();
+      auto Ordering = (AtomicOrdering)MatchTable[CurrentIdx++];
       DEBUG_WITH_TYPE(TgtExecutor::getName(),
                       dbgs() << CurrentIdx
                              << ": GIM_CheckAtomicOrderingWeakerThan(MIs["
@@ -531,7 +528,7 @@ bool GIMatchTableExecutor::executeMatchTable(
       uint64_t InsnID = readULEB();
       uint64_t MMOIdx = readULEB();
       // This accepts a list of possible address spaces.
-      const uint64_t NumAddrSpace = readULEB();
+      const uint64_t NumAddrSpace = MatchTable[CurrentIdx++];
 
       if (State.MIs[InsnID]->getNumMemOperands() <= MMOIdx) {
         if (handleReject() == RejectAndGiveUp)
@@ -568,7 +565,7 @@ bool GIMatchTableExecutor::executeMatchTable(
     case GIM_CheckMemoryAlignment: {
       uint64_t InsnID = readULEB();
       uint64_t MMOIdx = readULEB();
-      uint64_t MinAlign = readULEB();
+      uint64_t MinAlign = MatchTable[CurrentIdx++];
 
       assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 25e47114e4a3..2a3145b635e6 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -294,6 +294,18 @@ public:
   }
 };
 
+/// Represents a G_SHUFFLE_VECTOR.
+class GShuffleVector : public GenericMachineInstr {
+public:
+  Register getSrc1Reg() const { return getOperand(1).getReg(); }
+  Register getSrc2Reg() const { return getOperand(2).getReg(); }
+  ArrayRef<int> getMask() const { return getOperand(3).getShuffleMask(); }
+
+  static bool classof(const MachineInstr *MI) {
+    return MI->getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR;
+  }
+};
+
 /// Represents a G_PTR_ADD.
 class GPtrAdd : public GenericMachineInstr {
 public:
@@ -780,6 +792,59 @@ public:
   }
 };
 
+/// Represents a cast operation.
+/// It models the llvm::CastInst concept.
+/// The exception is bitcast.
+class GCastOp : public GenericMachineInstr {
+public:
+  Register getSrcReg() const { return getOperand(1).getReg(); }
+
+  static bool classof(const MachineInstr *MI) {
+    switch (MI->getOpcode()) {
+    case TargetOpcode::G_ADDRSPACE_CAST:
+    case TargetOpcode::G_FPEXT:
+    case TargetOpcode::G_FPTOSI:
+    case TargetOpcode::G_FPTOUI:
+    case TargetOpcode::G_FPTRUNC:
+    case TargetOpcode::G_INTTOPTR:
+    case TargetOpcode::G_PTRTOINT:
+    case TargetOpcode::G_SEXT:
+    case TargetOpcode::G_SITOFP:
+    case TargetOpcode::G_TRUNC:
+    case TargetOpcode::G_UITOFP:
+    case TargetOpcode::G_ZEXT:
+    case TargetOpcode::G_ANYEXT:
+      return true;
+    default:
+      return false;
+    }
+  };
+};
+
+/// Represents a sext.
+class GSext : public GCastOp {
+public:
+  static bool classof(const MachineInstr *MI) {
+    return MI->getOpcode() == TargetOpcode::G_SEXT;
+  };
+};
+
+/// Represents a zext.
+class GZext : public GCastOp {
+public:
+  static bool classof(const MachineInstr *MI) {
+    return MI->getOpcode() == TargetOpcode::G_ZEXT;
+  };
+};
+
+/// Represents a trunc.
+class GTrunc : public GCastOp {
+public:
+  static bool classof(const MachineInstr *MI) {
+    return MI->getOpcode() == TargetOpcode::G_TRUNC;
+  };
+};
+
 } // namespace llvm
 
 #endif // LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index e15f7a7172e1..92e05ee858a7 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -746,7 +746,8 @@ public:
   /// \pre \p Op must be smaller than \p Res
   ///
   /// \return The newly created instruction.
-  MachineInstrBuilder buildZExt(const DstOp &Res, const SrcOp &Op);
+  MachineInstrBuilder buildZExt(const DstOp &Res, const SrcOp &Op,
+                                std::optional<unsigned> Flags = std::nullopt);
 
   /// Build and insert \p Res = G_SEXT \p Op, \p Res = G_TRUNC \p Op, or
   /// \p Res = COPY \p Op depending on the differing sizes of \p Res and \p Op.
@@ -1231,7 +1232,8 @@ public:
   /// \pre \p Res must be smaller than \p Op
   ///
   /// \return The newly created instruction.
-  MachineInstrBuilder buildTrunc(const DstOp &Res, const SrcOp &Op);
+  MachineInstrBuilder buildTrunc(const DstOp &Res, const SrcOp &Op,
+                                 std::optional<unsigned> Flags = std::nullopt);
 
   /// Build and insert a \p Res = G_ICMP \p Pred, \p Op0, \p Op1
   ///
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 6429947958ee..d8af97957e48 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1402,6 +1402,11 @@ enum NodeType {
   // which is later translated to an implicit use in the MIR.
   CONVERGENCECTRL_GLUE,
 
+  // Experimental vector histogram intrinsic
+  // Operands: Input Chain, Inc, Mask, Base, Index, Scale, ID
+  // Output: Output Chain
+  EXPERIMENTAL_VECTOR_HISTOGRAM,
+
   /// BUILTIN_OP_END - This must be the last enum value in this list.
   /// The target-specific pre-isel opcode values start here.
   BUILTIN_OP_END
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index c581eb7a60aa..7970441c8371 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -358,6 +358,24 @@ struct Or<Pred, Preds...> : Or<Preds...> {
   }
 };
 
+template <typename Pred> struct Not {
+  Pred P;
+
+  explicit Not(const Pred &P) : P(P) {}
+
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue N) {
+    return !P.match(Ctx, N);
+  }
+};
+// Explicit deduction guide.
+template <typename Pred> Not(const Pred &P) -> Not<Pred>;
+
+/// Match if the inner pattern does NOT match.
+template <typename Pred> inline Not<Pred> m_Unless(const Pred &P) {
+  return Not{P};
+}
+
 template <typename... Preds> And<Preds...> m_AllOf(Preds &&...preds) {
   return And<Preds...>(std::forward<Preds>(preds)...);
 }
@@ -366,6 +384,10 @@ template <typename... Preds> Or<Preds...> m_AnyOf(Preds &&...preds) {
   return Or<Preds...>(std::forward<Preds>(preds)...);
 }
 
+template <typename... Preds> auto m_NoneOf(Preds &&...preds) {
+  return m_Unless(m_AnyOf(std::forward<Preds>(preds)...));
+}
+
 // === Generic node matching ===
 template <unsigned OpIdx, typename... OpndPreds> struct Operands_match {
   template <typename MatchContext>
@@ -616,12 +638,19 @@ inline UnaryOpc_match<Opnd, true> m_ChainedUnaryOp(unsigned Opc,
   return UnaryOpc_match<Opnd, true>(Opc, Op);
 }
 
+template <typename Opnd>
+inline UnaryOpc_match<Opnd> m_BitReverse(const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(ISD::BITREVERSE, Op);
+}
+
 template <typename Opnd> inline UnaryOpc_match<Opnd> m_ZExt(const Opnd &Op) {
   return UnaryOpc_match<Opnd>(ISD::ZERO_EXTEND, Op);
 }
 
-template <typename Opnd> inline UnaryOpc_match<Opnd> m_SExt(const Opnd &Op) {
-  return UnaryOpc_match<Opnd>(ISD::SIGN_EXTEND, Op);
+template <typename Opnd> inline auto m_SExt(Opnd &&Op) {
+  return m_AnyOf(
+      UnaryOpc_match<Opnd>(ISD::SIGN_EXTEND, Op),
+      m_Node(ISD::SIGN_EXTEND_INREG, std::forward<Opnd>(Op), m_Value()));
 }
 
 template <typename Opnd> inline UnaryOpc_match<Opnd> m_AnyExt(const Opnd &Op) {
@@ -634,18 +663,14 @@ template <typename Opnd> inline UnaryOpc_match<Opnd> m_Trunc(const Opnd &Op) {
 
 /// Match a zext or identity
 /// Allows to peek through optional extensions
-template <typename Opnd>
-inline Or<UnaryOpc_match<Opnd>, Opnd> m_ZExtOrSelf(Opnd &&Op) {
-  return Or<UnaryOpc_match<Opnd>, Opnd>(m_ZExt(std::forward<Opnd>(Op)),
-                                        std::forward<Opnd>(Op));
+template <typename Opnd> inline auto m_ZExtOrSelf(Opnd &&Op) {
+  return m_AnyOf(m_ZExt(std::forward<Opnd>(Op)), std::forward<Opnd>(Op));
 }
 
 /// Match a sext or identity
 /// Allows to peek through optional extensions
-template <typename Opnd>
-inline Or<UnaryOpc_match<Opnd>, Opnd> m_SExtOrSelf(Opnd &&Op) {
-  return Or<UnaryOpc_match<Opnd>, Opnd>(m_SExt(std::forward<Opnd>(Op)),
-                                        std::forward<Opnd>(Op));
+template <typename Opnd> inline auto m_SExtOrSelf(Opnd &&Op) {
+  return m_AnyOf(m_SExt(std::forward<Opnd>(Op)), std::forward<Opnd>(Op));
 }
 
 /// Match a aext or identity
@@ -768,6 +793,39 @@ inline auto m_False() {
       m_Value()};
 }
 
+struct CondCode_match {
+  std::optional<ISD::CondCode> CCToMatch;
+  ISD::CondCode *BindCC = nullptr;
+
+  explicit CondCode_match(ISD::CondCode CC) : CCToMatch(CC) {}
+
+  explicit CondCode_match(ISD::CondCode *CC) : BindCC(CC) {}
+
+  template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+    if (auto *CC = dyn_cast<CondCodeSDNode>(N.getNode())) {
+      if (CCToMatch && *CCToMatch != CC->get())
+        return false;
+
+      if (BindCC)
+        *BindCC = CC->get();
+      return true;
+    }
+
+    return false;
+  }
+};
+
+/// Match any conditional code SDNode.
+inline CondCode_match m_CondCode() { return CondCode_match(nullptr); }
+/// Match any conditional code SDNode and return its ISD::CondCode value.
+inline CondCode_match m_CondCode(ISD::CondCode &CC) {
+  return CondCode_match(&CC);
+}
+/// Match a conditional code SDNode with a specific ISD::CondCode.
+inline CondCode_match m_SpecificCondCode(ISD::CondCode CC) {
+  return CondCode_match(CC);
+}
+
 /// Match a negate as a sub(0, v)
 template <typename ValTy>
 inline BinaryOpc_match<SpecificInt_match, ValTy> m_Neg(const ValTy &V) {
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 4b1b58d4af0b..979ef8033eb5 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1526,6 +1526,9 @@ public:
                            ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
                            ISD::MemIndexType IndexType,
                            bool IsTruncating = false);
+  SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl,
+                             ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
+                             ISD::MemIndexType IndexType);
 
   SDValue getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr, EVT MemVT,
                       MachineMemOperand *MMO);
@@ -1996,6 +1999,10 @@ public:
   /// is set.
   bool isKnownToBeAPowerOfTwo(SDValue Val, unsigned Depth = 0) const;
 
+  /// Test if the given _fp_ value is known to be an integer power-of-2, either
+  /// positive or negative.
+  bool isKnownToBeAPowerOfTwoFP(SDValue Val, unsigned Depth = 0) const;
+
   /// Return the number of times the sign bit of the register is replicated into
   /// the other bits. We know that at least 1 bit is always equal to the sign
   /// bit (itself), but other cases can give us information. For example,
@@ -2111,6 +2118,10 @@ public:
   /// Test whether the given SDValue is known to contain non-zero value(s).
   bool isKnownNeverZero(SDValue Op, unsigned Depth = 0) const;
 
+  /// Test whether the given float value is known to be positive. +0.0, +inf and
+  /// +nan are considered positive, -0.0, -inf and -nan are not.
+  bool cannotBeOrderedNegativeFP(SDValue Op) const;
+
   /// Test whether two SDValues are known to compare equal. This
   /// is true if they are the same value, or if one is negative zero and the
   /// other positive zero.
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index e7c710414545..ac94c6099d08 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -542,6 +542,7 @@ BEGIN_TWO_BYTE_PACK()
     friend class MaskedLoadStoreSDNode;
     friend class MaskedGatherScatterSDNode;
     friend class VPGatherScatterSDNode;
+    friend class MaskedHistogramSDNode;
 
     uint16_t : NumMemSDNodeBits;
 
@@ -552,6 +553,7 @@ BEGIN_TWO_BYTE_PACK()
     //   MaskedLoadStoreBaseSDNode => enum ISD::MemIndexedMode
     //   VPGatherScatterSDNode => enum ISD::MemIndexType
     //   MaskedGatherScatterSDNode => enum ISD::MemIndexType
+    //   MaskedHistogramSDNode => enum ISD::MemIndexType
     uint16_t AddressingMode : 3;
   };
   enum { NumLSBaseSDNodeBits = NumMemSDNodeBits + 3 };
@@ -564,6 +566,7 @@ BEGIN_TWO_BYTE_PACK()
     friend class MaskedLoadSDNode;
     friend class MaskedGatherSDNode;
     friend class VPGatherSDNode;
+    friend class MaskedHistogramSDNode;
 
     uint16_t : NumLSBaseSDNodeBits;
 
@@ -1420,6 +1423,7 @@ public:
       return getOperand(2);
     case ISD::MGATHER:
     case ISD::MSCATTER:
+    case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
       return getOperand(3);
     default:
       return getOperand(1);
@@ -1468,6 +1472,7 @@ public:
     case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
     case ISD::GET_FPENV_MEM:
     case ISD::SET_FPENV_MEM:
+    case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
       return true;
     default:
       return N->isMemIntrinsic() || N->isTargetMemoryOpcode();
@@ -2953,6 +2958,34 @@ public:
   }
 };
 
+class MaskedHistogramSDNode : public MemSDNode {
+public:
+  friend class SelectionDAG;
+
+  MaskedHistogramSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs,
+                        EVT MemVT, MachineMemOperand *MMO,
+                        ISD::MemIndexType IndexType)
+      : MemSDNode(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, Order, DL, VTs, MemVT,
+                  MMO) {
+    LSBaseSDNodeBits.AddressingMode = IndexType;
+  }
+
+  ISD::MemIndexType getIndexType() const {
+    return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);
+  }
+
+  const SDValue &getBasePtr() const { return getOperand(3); }
+  const SDValue &getIndex() const { return getOperand(4); }
+  const SDValue &getMask() const { return getOperand(2); }
+  const SDValue &getScale() const { return getOperand(5); }
+  const SDValue &getInc() const { return getOperand(1); }
+  const SDValue &getIntID() const { return getOperand(6); }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::EXPERIMENTAL_VECTOR_HISTOGRAM;
+  }
+};
+
 class FPStateAccessSDNode : public MemSDNode {
 public:
   friend class SelectionDAG;
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 7ed08cfa8a20..50a8c7eb75af 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -470,6 +470,12 @@ public:
   /// expanded using generic code in SelectionDAGBuilder.
   virtual bool shouldExpandCttzElements(EVT VT) const { return true; }
 
+  /// Return the minimum number of bits required to hold the maximum possible
+  /// number of trailing zero vector elements.
+  unsigned getBitWidthForCttzElements(Type *RetTy, ElementCount EC,
+                                      bool ZeroIsPoison,
+                                      const ConstantRange *VScaleRange) const;
+
   // Return true if op(vecreduce(x), vecreduce(y)) should be reassociated to
   // vecreduce(op(x, y)) for the reduction opcode RedOpc.
   virtual bool shouldReassociateReduction(unsigned RedOpc, EVT VT) const {
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVStringPool.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVStringPool.h
index 4c596b5b1dde..8ce751a56c59 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVStringPool.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVStringPool.h
@@ -60,7 +60,7 @@ public:
     if (isValidIndex(Index))
       return Index;
     size_t Value = Entries.size();
-    ValueType *Entry = ValueType::create(Key, Allocator, std::move(Value));
+    ValueType *Entry = ValueType::create(Key, Allocator, Value);
     StringTable.insert(Entry);
     Entries.push_back(Entry);
     return Value;
diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h
index d33af157543f..e7c008be32f9 100644
--- a/llvm/include/llvm/Demangle/ItaniumDemangle.h
+++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h
@@ -5714,6 +5714,7 @@ Node *AbstractManglingParser<Derived, Alloc>::parseTemplateParam() {
 }
 
 // <template-param-decl> ::= Ty                          # type parameter
+//                       ::= Tk <concept name> [<template-args>] # constrained type parameter
 //                       ::= Tn <type>                   # non-type parameter
 //                       ::= Tt <template-param-decl>* E # template parameter
 //                       ::= Tp <template-param-decl>    # parameter pack
@@ -5845,7 +5846,7 @@ Node *AbstractManglingParser<Derived, Alloc>::parseTemplateArg() {
   }
 }
 
-// <template-args> ::= I <template-arg>* E
+// <template-args> ::= I <template-arg>* [Q <requires-clause expr>] E
 //     extension, the abi says <template-arg>+
 template <typename Derived, typename Alloc>
 Node *
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h
index e452b90598a0..689daba8ae73 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h
@@ -121,13 +121,26 @@ public:
     this->ReturnObjectBuffer = std::move(ReturnObjectBuffer);
   }
 
-  /// Add a pass-config modifier.
+  /// Add a plugin.
   ObjectLinkingLayer &addPlugin(std::shared_ptr<Plugin> P) {
     std::lock_guard<std::mutex> Lock(LayerMutex);
     Plugins.push_back(std::move(P));
     return *this;
   }
 
+  /// Remove a plugin. This remove applies only to subsequent links (links
+  /// already underway will continue to use the plugin), and does not of itself
+  /// destroy the plugin -- destruction will happen once all shared pointers
+  /// (including those held by in-progress links) are destroyed.
+  void removePlugin(Plugin &P) {
+    std::lock_guard<std::mutex> Lock(LayerMutex);
+    auto I = llvm::find_if(Plugins, [&](const std::shared_ptr<Plugin> &Elem) {
+      return Elem.get() == &P;
+    });
+    assert(I != Plugins.end() && "Plugin not present");
+    Plugins.erase(I);
+  }
+
   /// Add a LinkGraph to the JITDylib targeted by the given tracker.
   Error add(ResourceTrackerSP, std::unique_ptr<jitlink::LinkGraph> G);
 
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index daef02bcfc9a..13a37265762a 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -178,6 +178,12 @@ template <typename T> using ListT = llvm::SmallVector<T, 0>;
 // provide their own specialization that conforms to the above requirements.
 template <typename IdType, typename ExprType> struct ObjectT;
 
+// By default, object equality is only determined by its identity.
+template <typename I, typename E>
+bool operator==(const ObjectT<I, E> &o1, const ObjectT<I, E> &o2) {
+  return o1.id() == o2.id();
+}
+
 template <typename I, typename E> using ObjectListT = ListT<ObjectT<I, E>>;
 
 using DirectiveName = llvm::omp::Directive;
@@ -264,6 +270,32 @@ struct ReductionIdentifierT {
 
 template <typename T, typename I, typename E> //
 using IteratorT = ListT<IteratorSpecifierT<T, I, E>>;
+
+template <typename T>
+std::enable_if_t<T::EmptyTrait::value, bool> operator==(const T &a,
+                                                        const T &b) {
+  return true;
+}
+template <typename T>
+std::enable_if_t<T::IncompleteTrait::value, bool> operator==(const T &a,
+                                                             const T &b) {
+  return true;
+}
+template <typename T>
+std::enable_if_t<T::WrapperTrait::value, bool> operator==(const T &a,
+                                                          const T &b) {
+  return a.v == b.v;
+}
+template <typename T>
+std::enable_if_t<T::TupleTrait::value, bool> operator==(const T &a,
+                                                        const T &b) {
+  return a.t == b.t;
+}
+template <typename T>
+std::enable_if_t<T::UnionTrait::value, bool> operator==(const T &a,
+                                                        const T &b) {
+  return a.u == b.u;
+}
 } // namespace type
 
 template <typename T> using ListT = type::ListT<T>;
@@ -285,6 +317,8 @@ ListT<ResultTy> makeList(ContainerTy &&container, FunctionTy &&func) {
 }
 
 namespace clause {
+using type::operator==;
+
 // V5.2: [8.3.1] `assumption` clauses
 template <typename T, typename I, typename E> //
 struct AbsentT {
@@ -726,7 +760,7 @@ struct LinearT {
   ENUM(LinearModifier, Ref, Val, Uval);
 
   using TupleTrait = std::true_type;
-  // Step == nullptr means 1.
+  // Step == nullopt means 1.
   std::tuple<OPT(StepSimpleModifier), OPT(StepComplexModifier),
              OPT(LinearModifier), List>
       t;
@@ -1142,9 +1176,11 @@ struct UsesAllocatorsT {
   using MemSpace = E;
   using TraitsArray = ObjectT<I, E>;
   using Allocator = E;
-  using AllocatorSpec =
-      std::tuple<OPT(MemSpace), OPT(TraitsArray), Allocator>; // Not a spec name
-  using Allocators = ListT<AllocatorSpec>;                    // Not a spec name
+  struct AllocatorSpec { // Not a spec name
+    using TupleTrait = std::true_type;
+    std::tuple<OPT(MemSpace), OPT(TraitsArray), Allocator> t;
+  };
+  using Allocators = ListT<AllocatorSpec>; // Not a spec name
   using WrapperTrait = std::true_type;
   Allocators v;
 };
@@ -1232,9 +1268,10 @@ using UnionOfAllClausesT = typename type::Union< //
     UnionClausesT<T, I, E>,                      //
     WrapperClausesT<T, I, E>                     //
     >::type;
-
 } // namespace clause
 
+using type::operator==;
+
 // The variant wrapper that encapsulates all possible specific clauses.
 // The `Extras` arguments are additional types representing local extensions
 // to the clause set, e.g.
@@ -1244,6 +1281,9 @@ using UnionOfAllClausesT = typename type::Union< //
 //
 // The member Clause::u will be a variant containing all specific clauses
 // defined above, plus MyClause1 and MyClause2.
+//
+// Note: Any derived class must be constructible from the base class
+// ClauseT<...>.
 template <typename TypeType, typename IdType, typename ExprType,
           typename... Extras>
 struct ClauseT {
@@ -1251,6 +1291,9 @@ struct ClauseT {
   using IdTy = IdType;
   using ExprTy = ExprType;
 
+  // Type of "self" to specify this type given a derived class type.
+  using BaseT = ClauseT<TypeType, IdType, ExprType, Extras...>;
+
   using VariantTy = typename type::Union<
       clause::UnionOfAllClausesT<TypeType, IdType, ExprType>,
       std::variant<Extras...>>::type;
@@ -1260,6 +1303,11 @@ struct ClauseT {
   VariantTy u;
 };
 
+template <typename ClauseType> struct DirectiveWithClauses {
+  llvm::omp::Directive id = llvm::omp::Directive::OMPD_unknown;
+  tomp::type::ListT<ClauseType> clauses;
+};
+
 } // namespace tomp
 
 #undef OPT
diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h
new file mode 100644
index 000000000000..9dcb115a0c51
--- /dev/null
+++ b/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h
@@ -0,0 +1,403 @@
+//===- ConstructCompositionT.h -- Composing compound constructs -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Given a list of leaf construct, each with a set of clauses, generate the
+// compound construct whose leaf constructs are the given list, and whose clause
+// list is the merged lists of individual leaf clauses.
+//
+// *** At the moment it assumes that the individual constructs and their clauses
+// *** are a subset of those created by splitting a valid compound construct.
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_FRONTEND_OPENMP_CONSTRUCTCOMPOSITIONT_H
+#define LLVM_FRONTEND_OPENMP_CONSTRUCTCOMPOSITIONT_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Frontend/OpenMP/ClauseT.h"
+#include "llvm/Frontend/OpenMP/OMP.h"
+
+#include <iterator>
+#include <optional>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+namespace tomp {
+template <typename ClauseType> struct ConstructCompositionT {
+  using ClauseTy = ClauseType;
+
+  using TypeTy = typename ClauseTy::TypeTy;
+  using IdTy = typename ClauseTy::IdTy;
+  using ExprTy = typename ClauseTy::ExprTy;
+
+  ConstructCompositionT(uint32_t version,
+                        llvm::ArrayRef<DirectiveWithClauses<ClauseTy>> leafs);
+
+  DirectiveWithClauses<ClauseTy> merged;
+
+private:
+  // Use an ordered container, since we beed to maintain the order in which
+  // clauses are added to it. This is to avoid non-deterministic output.
+  using ClauseSet = ListT<ClauseTy>;
+
+  enum class Presence {
+    All,  // Clause is preesnt on all leaf constructs that allow it.
+    Some, // Clause is present on some, but not on all constructs.
+    None, // Clause is absent on all constructs.
+  };
+
+  template <typename S>
+  ClauseTy makeClause(llvm::omp::Clause clauseId, S &&specific) {
+    return typename ClauseTy::BaseT{clauseId, std::move(specific)};
+  }
+
+  llvm::omp::Directive
+  makeCompound(llvm::ArrayRef<DirectiveWithClauses<ClauseTy>> parts);
+
+  Presence checkPresence(llvm::omp::Clause clauseId);
+
+  // There are clauses that need special handling:
+  // 1. "if": the "directive-name-modifier" on the merged clause may need
+  // to be set appropriately.
+  // 2. "reduction": implies "privateness" of all objects (incompatible
+  // with "shared"); there are rules for merging modifiers
+  void mergeIf();
+  void mergeReduction();
+  void mergeDSA();
+
+  uint32_t version;
+  llvm::ArrayRef<DirectiveWithClauses<ClauseTy>> leafs;
+
+  // clause id -> set of leaf constructs that contain it
+  std::unordered_map<llvm::omp::Clause, llvm::BitVector> clausePresence;
+  // clause id -> set of instances of that clause
+  std::unordered_map<llvm::omp::Clause, ClauseSet> clauseSets;
+};
+
+template <typename C>
+ConstructCompositionT<C>::ConstructCompositionT(
+    uint32_t version, llvm::ArrayRef<DirectiveWithClauses<C>> leafs)
+    : version(version), leafs(leafs) {
+  // Merge the list of constructs with clauses into a compound construct
+  // with a single list of clauses.
+  // The intended use of this function is in splitting compound constructs,
+  // while preserving composite constituent constructs:
+  // Step 1: split compound construct into leaf constructs.
+  // Step 2: identify composite sub-construct, and merge the constituent leafs.
+  //
+  // *** At the moment it assumes that the individual constructs and their
+  // *** clauses are a subset of those created by splitting a valid compound
+  // *** construct.
+  //
+  // 1. Deduplicate clauses
+  //    - exact duplicates: e.g. shared(x) shared(x) -> shared(x)
+  //    - special cases of clauses differing in modifier:
+  //      (a) reduction: inscan + (none|default) = inscan
+  //      (b) reduction: task + (none|default) = task
+  //      (c) combine repeated "if" clauses if possible
+  // 2. Merge DSA clauses: e.g. private(x) private(y) -> private(x, y).
+  // 3. Resolve potential DSA conflicts (typically due to implied clauses).
+
+  if (leafs.empty())
+    return;
+
+  merged.id = makeCompound(leafs);
+
+  // Populate the two maps:
+  for (const auto &[index, leaf] : llvm::enumerate(leafs)) {
+    for (const auto &clause : leaf.clauses) {
+      // Update clausePresence.
+      auto &pset = clausePresence[clause.id];
+      if (pset.size() < leafs.size())
+        pset.resize(leafs.size());
+      pset.set(index);
+      // Update clauseSets.
+      ClauseSet &cset = clauseSets[clause.id];
+      if (!llvm::is_contained(cset, clause))
+        cset.push_back(clause);
+    }
+  }
+
+  mergeIf();
+  mergeReduction();
+  mergeDSA();
+
+  // Fir the rest of the clauses, just copy them.
+  for (auto &[id, clauses] : clauseSets) {
+    // Skip clauses we've already dealt with.
+    switch (id) {
+    case llvm::omp::Clause::OMPC_if:
+    case llvm::omp::Clause::OMPC_reduction:
+    case llvm::omp::Clause::OMPC_shared:
+    case llvm::omp::Clause::OMPC_private:
+    case llvm::omp::Clause::OMPC_firstprivate:
+    case llvm::omp::Clause::OMPC_lastprivate:
+      continue;
+    default:
+      break;
+    }
+    llvm::append_range(merged.clauses, clauses);
+  }
+}
+
+template <typename C>
+llvm::omp::Directive ConstructCompositionT<C>::makeCompound(
+    llvm::ArrayRef<DirectiveWithClauses<ClauseTy>> parts) {
+  llvm::SmallVector<llvm::omp::Directive> dirIds;
+  llvm::transform(parts, std::back_inserter(dirIds),
+                  [](auto &&dwc) { return dwc.id; });
+
+  return llvm::omp::getCompoundConstruct(dirIds);
+}
+
+template <typename C>
+auto ConstructCompositionT<C>::checkPresence(llvm::omp::Clause clauseId)
+    -> Presence {
+  auto found = clausePresence.find(clauseId);
+  if (found == clausePresence.end())
+    return Presence::None;
+
+  bool OnAll = true, OnNone = true;
+  for (const auto &[index, leaf] : llvm::enumerate(leafs)) {
+    if (!llvm::omp::isAllowedClauseForDirective(leaf.id, clauseId, version))
+      continue;
+
+    if (found->second.test(index))
+      OnNone = false;
+    else
+      OnAll = false;
+  }
+
+  if (OnNone)
+    return Presence::None;
+  if (OnAll)
+    return Presence::All;
+  return Presence::Some;
+}
+
+template <typename C> void ConstructCompositionT<C>::mergeIf() {
+  using IfTy = tomp::clause::IfT<TypeTy, IdTy, ExprTy>;
+  // Deal with the "if" clauses. If it's on all leafs that allow it, then it
+  // will apply to the compound construct. Otherwise it will apply to the
+  // single (assumed) leaf construct.
+  // This assumes that the "if" clauses have the same expression.
+  Presence presence = checkPresence(llvm::omp::Clause::OMPC_if);
+  if (presence == Presence::None)
+    return;
+
+  const ClauseTy &some = *clauseSets[llvm::omp::Clause::OMPC_if].begin();
+  const auto &someIf = std::get<IfTy>(some.u);
+
+  if (presence == Presence::All) {
+    // Create "if" without "directive-name-modifier".
+    merged.clauses.emplace_back(
+        makeClause(llvm::omp::Clause::OMPC_if,
+                   IfTy{{/*DirectiveNameModifier=*/std::nullopt,
+                         /*IfExpression=*/std::get<typename IfTy::IfExpression>(
+                             someIf.t)}}));
+  } else {
+    // Find out where it's present and create "if" with the corresponding
+    // "directive-name-modifier".
+    int Idx = clausePresence[llvm::omp::Clause::OMPC_if].find_first();
+    assert(Idx >= 0);
+    merged.clauses.emplace_back(
+        makeClause(llvm::omp::Clause::OMPC_if,
+                   IfTy{{/*DirectiveNameModifier=*/leafs[Idx].id,
+                         /*IfExpression=*/std::get<typename IfTy::IfExpression>(
+                             someIf.t)}}));
+  }
+}
+
+template <typename C> void ConstructCompositionT<C>::mergeReduction() {
+  Presence presence = checkPresence(llvm::omp::Clause::OMPC_reduction);
+  if (presence == Presence::None)
+    return;
+
+  using ReductionTy = tomp::clause::ReductionT<TypeTy, IdTy, ExprTy>;
+  using ModifierTy = typename ReductionTy::ReductionModifier;
+  using IdentifiersTy = typename ReductionTy::ReductionIdentifiers;
+  using ListTy = typename ReductionTy::List;
+  // There are exceptions on which constructs "reduction" may appear
+  // (specifically "parallel", and "teams"). Assume that if "reduction"
+  // is present, it can be applied to the compound construct.
+
+  // What's left is to see if there are any modifiers present. Again,
+  // assume that there are no conflicting modifiers.
+  // There can be, however, multiple reductions on different objects.
+  auto equal = [](const ClauseTy &red1, const ClauseTy &red2) {
+    // Extract actual reductions.
+    const auto r1 = std::get<ReductionTy>(red1.u);
+    const auto r2 = std::get<ReductionTy>(red2.u);
+    // Compare everything except modifiers.
+    if (std::get<IdentifiersTy>(r1.t) != std::get<IdentifiersTy>(r2.t))
+      return false;
+    if (std::get<ListTy>(r1.t) != std::get<ListTy>(r2.t))
+      return false;
+    return true;
+  };
+
+  auto getModifier = [](const ClauseTy &clause) {
+    const ReductionTy &red = std::get<ReductionTy>(clause.u);
+    return std::get<std::optional<ModifierTy>>(red.t);
+  };
+
+  const ClauseSet &reductions = clauseSets[llvm::omp::Clause::OMPC_reduction];
+  std::unordered_set<const ClauseTy *> visited;
+  while (reductions.size() != visited.size()) {
+    typename ClauseSet::const_iterator first;
+
+    // Find first non-visited reduction.
+    for (first = reductions.begin(); first != reductions.end(); ++first) {
+      if (visited.count(&*first))
+        continue;
+      visited.insert(&*first);
+      break;
+    }
+
+    std::optional<ModifierTy> modifier = getModifier(*first);
+
+    // Visit all other reductions that are "equal" (with respect to the
+    // definition above) to "first". Collect modifiers.
+    for (auto iter = std::next(first); iter != reductions.end(); ++iter) {
+      if (!equal(*first, *iter))
+        continue;
+      visited.insert(&*iter);
+      if (!modifier || *modifier == ModifierTy::Default)
+        modifier = getModifier(*iter);
+    }
+
+    const auto &firstRed = std::get<ReductionTy>(first->u);
+    merged.clauses.emplace_back(makeClause(
+        llvm::omp::Clause::OMPC_reduction,
+        ReductionTy{
+            {/*ReductionModifier=*/modifier,
+             /*ReductionIdentifiers=*/std::get<IdentifiersTy>(firstRed.t),
+             /*List=*/std::get<ListTy>(firstRed.t)}}));
+  }
+}
+
+template <typename C> void ConstructCompositionT<C>::mergeDSA() {
+  using ObjectTy = tomp::type::ObjectT<IdTy, ExprTy>;
+
+  // Resolve data-sharing attributes.
+  enum DSA : int {
+    None = 0,
+    Shared = 1 << 0,
+    Private = 1 << 1,
+    FirstPrivate = 1 << 2,
+    LastPrivate = 1 << 3,
+    LastPrivateConditional = 1 << 4,
+  };
+
+  // Use ordered containers to avoid non-deterministic output.
+  llvm::SmallVector<std::pair<ObjectTy, int>, 8> objectDsa;
+
+  auto getDsa = [&](const ObjectTy &object) -> std::pair<ObjectTy, int> & {
+    auto found = llvm::find_if(objectDsa, [&](std::pair<ObjectTy, int> &p) {
+      return p.first.id() == object.id();
+    });
+    if (found != objectDsa.end())
+      return *found;
+    return objectDsa.emplace_back(object, DSA::None);
+  };
+
+  using SharedTy = tomp::clause::SharedT<TypeTy, IdTy, ExprTy>;
+  using PrivateTy = tomp::clause::PrivateT<TypeTy, IdTy, ExprTy>;
+  using FirstprivateTy = tomp::clause::FirstprivateT<TypeTy, IdTy, ExprTy>;
+  using LastprivateTy = tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy>;
+
+  // Visit clauses that affect DSA.
+  for (auto &clause : clauseSets[llvm::omp::Clause::OMPC_shared]) {
+    for (auto &object : std::get<SharedTy>(clause.u).v)
+      getDsa(object).second |= DSA::Shared;
+  }
+
+  for (auto &clause : clauseSets[llvm::omp::Clause::OMPC_private]) {
+    for (auto &object : std::get<PrivateTy>(clause.u).v)
+      getDsa(object).second |= DSA::Private;
+  }
+
+  for (auto &clause : clauseSets[llvm::omp::Clause::OMPC_firstprivate]) {
+    for (auto &object : std::get<FirstprivateTy>(clause.u).v)
+      getDsa(object).second |= DSA::FirstPrivate;
+  }
+
+  for (auto &clause : clauseSets[llvm::omp::Clause::OMPC_lastprivate]) {
+    using ModifierTy = typename LastprivateTy::LastprivateModifier;
+    using ListTy = typename LastprivateTy::List;
+    const auto &lastp = std::get<LastprivateTy>(clause.u);
+    for (auto &object : std::get<ListTy>(lastp.t)) {
+      auto &mod = std::get<std::optional<ModifierTy>>(lastp.t);
+      if (mod && *mod == ModifierTy::Conditional) {
+        getDsa(object).second |= DSA::LastPrivateConditional;
+      } else {
+        getDsa(object).second |= DSA::LastPrivate;
+      }
+    }
+  }
+
+  // Check reductions as well, clear "shared" if set.
+  for (auto &clause : clauseSets[llvm::omp::Clause::OMPC_reduction]) {
+    using ReductionTy = tomp::clause::ReductionT<TypeTy, IdTy, ExprTy>;
+    using ListTy = typename ReductionTy::List;
+    for (auto &object : std::get<ListTy>(std::get<ReductionTy>(clause.u).t))
+      getDsa(object).second &= ~DSA::Shared;
+  }
+
+  tomp::ListT<ObjectTy> privateObj, sharedObj, firstpObj, lastpObj, lastpcObj;
+  for (auto &[object, dsa] : objectDsa) {
+    if (dsa &
+        (DSA::FirstPrivate | DSA::LastPrivate | DSA::LastPrivateConditional)) {
+      if (dsa & DSA::FirstPrivate)
+        firstpObj.push_back(object); // no else
+      if (dsa & DSA::LastPrivateConditional)
+        lastpcObj.push_back(object);
+      else if (dsa & DSA::LastPrivate)
+        lastpObj.push_back(object);
+    } else if (dsa & DSA::Private) {
+      privateObj.push_back(object);
+    } else if (dsa & DSA::Shared) {
+      sharedObj.push_back(object);
+    }
+  }
+
+  // Materialize each clause.
+  if (!privateObj.empty()) {
+    merged.clauses.emplace_back(
+        makeClause(llvm::omp::Clause::OMPC_private,
+                   PrivateTy{/*List=*/std::move(privateObj)}));
+  }
+  if (!sharedObj.empty()) {
+    merged.clauses.emplace_back(
+        makeClause(llvm::omp::Clause::OMPC_shared,
+                   SharedTy{/*List=*/std::move(sharedObj)}));
+  }
+  if (!firstpObj.empty()) {
+    merged.clauses.emplace_back(
+        makeClause(llvm::omp::Clause::OMPC_firstprivate,
+                   FirstprivateTy{/*List=*/std::move(firstpObj)}));
+  }
+  if (!lastpObj.empty()) {
+    merged.clauses.emplace_back(
+        makeClause(llvm::omp::Clause::OMPC_lastprivate,
+                   LastprivateTy{{/*LastprivateModifier=*/std::nullopt,
+                                  /*List=*/std::move(lastpObj)}}));
+  }
+  if (!lastpcObj.empty()) {
+    auto conditional = LastprivateTy::LastprivateModifier::Conditional;
+    merged.clauses.emplace_back(
+        makeClause(llvm::omp::Clause::OMPC_lastprivate,
+                   LastprivateTy{{/*LastprivateModifier=*/conditional,
+                                  /*List=*/std::move(lastpcObj)}}));
+  }
+}
+} // namespace tomp
+
+#endif // LLVM_FRONTEND_OPENMP_CONSTRUCTCOMPOSITIONT_H
diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
new file mode 100644
index 000000000000..5f12c62b832f
--- /dev/null
+++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
@@ -0,0 +1,1160 @@
+//===- ConstructDecompositionT.h -- Decomposing compound constructs -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Given a compound construct with a set of clauses, generate the list of
+// constituent leaf constructs, each with a list of clauses that apply to it.
+//
+// Note: Clauses that are not originally present, but that are implied by the
+// OpenMP spec are materialized, and are present in the output.
+//
+// Note: Composite constructs will also be broken up into leaf constructs.
+// If composite constructs require processing as a whole, the lists of clauses
+// for each leaf constituent should be merged.
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_FRONTEND_OPENMP_CONSTRUCTDECOMPOSITIONT_H
+#define LLVM_FRONTEND_OPENMP_CONSTRUCTDECOMPOSITIONT_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Frontend/OpenMP/ClauseT.h"
+#include "llvm/Frontend/OpenMP/OMP.h"
+
+#include <iterator>
+#include <list>
+#include <optional>
+#include <tuple>
+#include <type_traits>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <variant>
+
+static inline llvm::ArrayRef<llvm::omp::Directive> getWorksharing() {
+  static llvm::omp::Directive worksharing[] = {
+      llvm::omp::Directive::OMPD_do,     llvm::omp::Directive::OMPD_for,
+      llvm::omp::Directive::OMPD_scope,  llvm::omp::Directive::OMPD_sections,
+      llvm::omp::Directive::OMPD_single, llvm::omp::Directive::OMPD_workshare,
+  };
+  return worksharing;
+}
+
+static inline llvm::ArrayRef<llvm::omp::Directive> getWorksharingLoop() {
+  static llvm::omp::Directive worksharingLoop[] = {
+      llvm::omp::Directive::OMPD_do,
+      llvm::omp::Directive::OMPD_for,
+  };
+  return worksharingLoop;
+}
+
+namespace detail {
+template <typename Container, typename Predicate>
+typename std::remove_reference_t<Container>::iterator
+find_unique(Container &&container, Predicate &&pred) {
+  auto first = std::find_if(container.begin(), container.end(), pred);
+  if (first == container.end())
+    return first;
+  auto second = std::find_if(std::next(first), container.end(), pred);
+  if (second == container.end())
+    return first;
+  return container.end();
+}
+} // namespace detail
+
+namespace tomp {
+
+// ClauseType - Either instance of ClauseT, or a type derived from ClauseT.
+//
+// This is the clause representation in the code using this infrastructure.
+//
+// HelperType - A class that implements two member functions:
+//
+//   // Return the base object of the given object, if any.
+//   std::optional<Object> getBaseObject(const Object &object) const
+//   // Return the iteration variable of the outermost loop associated
+//   // with the construct being worked on, if any.
+//   std::optional<Object> getLoopIterVar() const
+template <typename ClauseType, typename HelperType>
+struct ConstructDecompositionT {
+  using ClauseTy = ClauseType;
+
+  using TypeTy = typename ClauseTy::TypeTy;
+  using IdTy = typename ClauseTy::IdTy;
+  using ExprTy = typename ClauseTy::ExprTy;
+  using HelperTy = HelperType;
+  using ObjectTy = tomp::ObjectT<IdTy, ExprTy>;
+
+  using ClauseSet = std::unordered_set<const ClauseTy *>;
+
+  ConstructDecompositionT(uint32_t ver, HelperType &helper,
+                          llvm::omp::Directive dir,
+                          llvm::ArrayRef<ClauseTy> clauses)
+      : version(ver), construct(dir), helper(helper) {
+    for (const ClauseTy &clause : clauses)
+      nodes.push_back(&clause);
+
+    bool success = split();
+    if (!success)
+      return;
+
+    // Copy the individual leaf directives with their clauses to the
+    // output list. Copy by value, since we don't own the storage
+    // with the input clauses, and the internal representation uses
+    // clause addresses.
+    for (auto &leaf : leafs) {
+      output.push_back({leaf.id, {}});
+      auto &out = output.back();
+      for (const ClauseTy *c : leaf.clauses)
+        out.clauses.push_back(*c);
+    }
+  }
+
+  tomp::ListT<DirectiveWithClauses<ClauseType>> output;
+
+private:
+  bool split();
+
+  struct LeafReprInternal {
+    llvm::omp::Directive id = llvm::omp::Directive::OMPD_unknown;
+    tomp::type::ListT<const ClauseTy *> clauses;
+  };
+
+  LeafReprInternal *findDirective(llvm::omp::Directive dirId) {
+    auto found = llvm::find_if(
+        leafs, [&](const LeafReprInternal &leaf) { return leaf.id == dirId; });
+    return found != leafs.end() ? &*found : nullptr;
+  }
+
+  ClauseSet *findClausesWith(const ObjectTy &object) {
+    if (auto found = syms.find(object.id()); found != syms.end())
+      return &found->second;
+    return nullptr;
+  }
+
+  template <typename S>
+  ClauseTy *makeClause(llvm::omp::Clause clauseId, S &&specific) {
+    implicit.push_back(typename ClauseTy::BaseT{clauseId, std::move(specific)});
+    return &implicit.back();
+  }
+
+  void addClauseSymsToMap(const ObjectTy &object, const ClauseTy *);
+  void addClauseSymsToMap(const tomp::ObjectListT<IdTy, ExprTy> &objects,
+                          const ClauseTy *);
+  void addClauseSymsToMap(const TypeTy &item, const ClauseTy *);
+  void addClauseSymsToMap(const ExprTy &item, const ClauseTy *);
+  void addClauseSymsToMap(const tomp::clause::MapT<TypeTy, IdTy, ExprTy> &item,
+                          const ClauseTy *);
+
+  template <typename U>
+  void addClauseSymsToMap(const std::optional<U> &item, const ClauseTy *);
+  template <typename U>
+  void addClauseSymsToMap(const tomp::ListT<U> &item, const ClauseTy *);
+  template <typename... U, size_t... Is>
+  void addClauseSymsToMap(const std::tuple<U...> &item, const ClauseTy *,
+                          std::index_sequence<Is...> = {});
+  template <typename U>
+  std::enable_if_t<std::is_enum_v<llvm::remove_cvref_t<U>>, void>
+  addClauseSymsToMap(U &&item, const ClauseTy *);
+
+  template <typename U>
+  std::enable_if_t<llvm::remove_cvref_t<U>::EmptyTrait::value, void>
+  addClauseSymsToMap(U &&item, const ClauseTy *);
+
+  template <typename U>
+  std::enable_if_t<llvm::remove_cvref_t<U>::IncompleteTrait::value, void>
+  addClauseSymsToMap(U &&item, const ClauseTy *);
+
+  template <typename U>
+  std::enable_if_t<llvm::remove_cvref_t<U>::WrapperTrait::value, void>
+  addClauseSymsToMap(U &&item, const ClauseTy *);
+
+  template <typename U>
+  std::enable_if_t<llvm::remove_cvref_t<U>::TupleTrait::value, void>
+  addClauseSymsToMap(U &&item, const ClauseTy *);
+
+  template <typename U>
+  std::enable_if_t<llvm::remove_cvref_t<U>::UnionTrait::value, void>
+  addClauseSymsToMap(U &&item, const ClauseTy *);
+
+  // Apply a clause to the only directive that allows it. If there are no
+  // directives that allow it, or if there is more that one, do not apply
+  // anything and return false, otherwise return true.
+  bool applyToUnique(const ClauseTy *node);
+
+  // Apply a clause to the first directive in given range that allows it.
+  // If such a directive does not exist, return false, otherwise return true.
+  template <typename Iterator>
+  bool applyToFirst(const ClauseTy *node, llvm::iterator_range<Iterator> range);
+
+  // Apply a clause to the innermost directive that allows it. If such a
+  // directive does not exist, return false, otherwise return true.
+  bool applyToInnermost(const ClauseTy *node);
+
+  // Apply a clause to the outermost directive that allows it. If such a
+  // directive does not exist, return false, otherwise return true.
+  bool applyToOutermost(const ClauseTy *node);
+
+  template <typename Predicate>
+  bool applyIf(const ClauseTy *node, Predicate shouldApply);
+
+  bool applyToAll(const ClauseTy *node);
+
+  template <typename Clause>
+  bool applyClause(Clause &&clause, const ClauseTy *node);
+
+  bool applyClause(const tomp::clause::CollapseT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
+  bool applyClause(const tomp::clause::PrivateT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
+  bool
+  applyClause(const tomp::clause::FirstprivateT<TypeTy, IdTy, ExprTy> &clause,
+              const ClauseTy *);
+  bool
+  applyClause(const tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy> &clause,
+              const ClauseTy *);
+  bool applyClause(const tomp::clause::SharedT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
+  bool applyClause(const tomp::clause::DefaultT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
+  bool
+  applyClause(const tomp::clause::ThreadLimitT<TypeTy, IdTy, ExprTy> &clause,
+              const ClauseTy *);
+  bool applyClause(const tomp::clause::OrderT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
+  bool applyClause(const tomp::clause::AllocateT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
+  bool applyClause(const tomp::clause::ReductionT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
+  bool applyClause(const tomp::clause::IfT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
+  bool applyClause(const tomp::clause::LinearT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
+  bool applyClause(const tomp::clause::NowaitT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
+
+  uint32_t version;
+  llvm::omp::Directive construct;
+  HelperType &helper;
+  ListT<LeafReprInternal> leafs;
+  tomp::ListT<const ClauseTy *> nodes;
+  std::list<ClauseTy> implicit; // Container for materialized implicit clauses.
+                                // Inserting must preserve element addresses.
+  std::unordered_map<IdTy, ClauseSet> syms;
+  std::unordered_set<IdTy> mapBases;
+};
+
+// Deduction guide
+template <typename ClauseType, typename HelperType>
+ConstructDecompositionT(uint32_t, HelperType &, llvm::omp::Directive,
+                        llvm::ArrayRef<ClauseType>)
+    -> ConstructDecompositionT<ClauseType, HelperType>;
+
+template <typename C, typename H>
+void ConstructDecompositionT<C, H>::addClauseSymsToMap(const ObjectTy &object,
+                                                       const ClauseTy *node) {
+  syms[object.id()].insert(node);
+}
+
+template <typename C, typename H>
+void ConstructDecompositionT<C, H>::addClauseSymsToMap(
+    const tomp::ObjectListT<IdTy, ExprTy> &objects, const ClauseTy *node) {
+  for (auto &object : objects)
+    syms[object.id()].insert(node);
+}
+
+template <typename C, typename H>
+void ConstructDecompositionT<C, H>::addClauseSymsToMap(const TypeTy &item,
+                                                       const ClauseTy *node) {
+  // Nothing to do for types.
+}
+
+template <typename C, typename H>
+void ConstructDecompositionT<C, H>::addClauseSymsToMap(const ExprTy &item,
+                                                       const ClauseTy *node) {
+  // Nothing to do for expressions.
+}
+
+template <typename C, typename H>
+void ConstructDecompositionT<C, H>::addClauseSymsToMap(
+    const tomp::clause::MapT<TypeTy, IdTy, ExprTy> &item,
+    const ClauseTy *node) {
+  auto &objects = std::get<tomp::ObjectListT<IdTy, ExprTy>>(item.t);
+  addClauseSymsToMap(objects, node);
+  for (auto &object : objects) {
+    if (auto base = helper.getBaseObject(object))
+      mapBases.insert(base->id());
+  }
+}
+
+template <typename C, typename H>
+template <typename U>
+void ConstructDecompositionT<C, H>::addClauseSymsToMap(
+    const std::optional<U> &item, const ClauseTy *node) {
+  if (item)
+    addClauseSymsToMap(*item, node);
+}
+
+template <typename C, typename H>
+template <typename U>
+void ConstructDecompositionT<C, H>::addClauseSymsToMap(
+    const tomp::ListT<U> &item, const ClauseTy *node) {
+  for (auto &s : item)
+    addClauseSymsToMap(s, node);
+}
+
+template <typename C, typename H>
+template <typename... U, size_t... Is>
+void ConstructDecompositionT<C, H>::addClauseSymsToMap(
+    const std::tuple<U...> &item, const ClauseTy *node,
+    std::index_sequence<Is...>) {
+  (void)node; // Silence strange warning from GCC.
+  (addClauseSymsToMap(std::get<Is>(item), node), ...);
+}
+
+template <typename C, typename H>
+template <typename U>
+std::enable_if_t<std::is_enum_v<llvm::remove_cvref_t<U>>, void>
+ConstructDecompositionT<C, H>::addClauseSymsToMap(U &&item,
+                                                  const ClauseTy *node) {
+  // Nothing to do for enums.
+}
+
+template <typename C, typename H>
+template <typename U>
+std::enable_if_t<llvm::remove_cvref_t<U>::EmptyTrait::value, void>
+ConstructDecompositionT<C, H>::addClauseSymsToMap(U &&item,
+                                                  const ClauseTy *node) {
+  // Nothing to do for an empty class.
+}
+
+template <typename C, typename H>
+template <typename U>
+std::enable_if_t<llvm::remove_cvref_t<U>::IncompleteTrait::value, void>
+ConstructDecompositionT<C, H>::addClauseSymsToMap(U &&item,
+                                                  const ClauseTy *node) {
+  // Nothing to do for an incomplete class (they're empty).
+}
+
+template <typename C, typename H>
+template <typename U>
+std::enable_if_t<llvm::remove_cvref_t<U>::WrapperTrait::value, void>
+ConstructDecompositionT<C, H>::addClauseSymsToMap(U &&item,
+                                                  const ClauseTy *node) {
+  addClauseSymsToMap(item.v, node);
+}
+
+template <typename C, typename H>
+template <typename U>
+std::enable_if_t<llvm::remove_cvref_t<U>::TupleTrait::value, void>
+ConstructDecompositionT<C, H>::addClauseSymsToMap(U &&item,
+                                                  const ClauseTy *node) {
+  constexpr size_t tuple_size =
+      std::tuple_size_v<llvm::remove_cvref_t<decltype(item.t)>>;
+  addClauseSymsToMap(item.t, node, std::make_index_sequence<tuple_size>{});
+}
+
+template <typename C, typename H>
+template <typename U>
+std::enable_if_t<llvm::remove_cvref_t<U>::UnionTrait::value, void>
+ConstructDecompositionT<C, H>::addClauseSymsToMap(U &&item,
+                                                  const ClauseTy *node) {
+  std::visit([&](auto &&s) { addClauseSymsToMap(s, node); }, item.u);
+}
+
+// Apply a clause to the only directive that allows it. If there are no
+// directives that allow it, or if there is more that one, do not apply
+// anything and return false, otherwise return true.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyToUnique(const ClauseTy *node) {
+  auto unique = detail::find_unique(leafs, [=](const auto &dirInfo) {
+    return llvm::omp::isAllowedClauseForDirective(dirInfo.id, node->id,
+                                                  version);
+  });
+
+  if (unique != leafs.end()) {
+    unique->clauses.push_back(node);
+    return true;
+  }
+  return false;
+}
+
+// Apply a clause to the first directive in given range that allows it.
+// If such a directive does not exist, return false, otherwise return true.
+template <typename C, typename H>
+template <typename Iterator>
+bool ConstructDecompositionT<C, H>::applyToFirst(
+    const ClauseTy *node, llvm::iterator_range<Iterator> range) {
+  if (range.empty())
+    return false;
+
+  for (auto &leaf : range) {
+    if (!llvm::omp::isAllowedClauseForDirective(leaf.id, node->id, version))
+      continue;
+    leaf.clauses.push_back(node);
+    return true;
+  }
+  return false;
+}
+
+// Apply a clause to the innermost directive that allows it. If such a
+// directive does not exist, return false, otherwise return true.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyToInnermost(const ClauseTy *node) {
+  return applyToFirst(node, llvm::reverse(leafs));
+}
+
+// Apply a clause to the outermost directive that allows it. If such a
+// directive does not exist, return false, otherwise return true.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyToOutermost(const ClauseTy *node) {
+  return applyToFirst(node, llvm::iterator_range(leafs));
+}
+
+template <typename C, typename H>
+template <typename Predicate>
+bool ConstructDecompositionT<C, H>::applyIf(const ClauseTy *node,
+                                            Predicate shouldApply) {
+  bool applied = false;
+  for (auto &leaf : leafs) {
+    if (!llvm::omp::isAllowedClauseForDirective(leaf.id, node->id, version))
+      continue;
+    if (!shouldApply(leaf))
+      continue;
+    leaf.clauses.push_back(node);
+    applied = true;
+  }
+
+  return applied;
+}
+
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyToAll(const ClauseTy *node) {
+  return applyIf(node, [](auto) { return true; });
+}
+
+template <typename C, typename H>
+template <typename Clause>
+bool ConstructDecompositionT<C, H>::applyClause(Clause &&clause,
+                                                const ClauseTy *node) {
+  // The default behavior is to find the unique directive to which the
+  // given clause may be applied. If there are no such directives, or
+  // if there are multiple ones, flag an error.
+  // From "OpenMP Application Programming Interface", Version 5.2:
+  // S Some clauses are permitted only on a single leaf construct of the
+  // S combined or composite construct, in which case the effect is as if
+  // S the clause is applied to that specific construct. (p339, 31-33)
+  if (applyToUnique(node))
+    return true;
+
+  return false;
+}
+
+// COLLAPSE
+// [5.2:93:20-21]
+// Directives: distribute, do, for, loop, simd, taskloop
+//
+// [5.2:339:35]
+// (35) The collapse clause is applied once to the combined or composite
+// construct.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::CollapseT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  // Apply "collapse" to the innermost directive. If it's not one that
+  // allows it flag an error.
+  if (!leafs.empty()) {
+    auto &last = leafs.back();
+
+    if (llvm::omp::isAllowedClauseForDirective(last.id, node->id, version)) {
+      last.clauses.push_back(node);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// PRIVATE
+// [5.2:111:5-7]
+// Directives: distribute, do, for, loop, parallel, scope, sections, simd,
+// single, target, task, taskloop, teams
+//
+// [5.2:340:1-2]
+// (1) The effect of the 1 private clause is as if it is applied only to the
+// innermost leaf construct that permits it.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::PrivateT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  return applyToInnermost(node);
+}
+
+// FIRSTPRIVATE
+// [5.2:112:5-7]
+// Directives: distribute, do, for, parallel, scope, sections, single, target,
+// task, taskloop, teams
+//
+// [5.2:340:3-20]
+// (3) The effect of the firstprivate clause is as if it is applied to one or
+// more leaf constructs as follows:
+//  (5) To the distribute construct if it is among the constituent constructs;
+//  (6) To the teams construct if it is among the constituent constructs and the
+//      distribute construct is not;
+//  (8) To a worksharing construct that accepts the clause if one is among the
+//      constituent constructs;
+//  (9) To the taskloop construct if it is among the constituent constructs;
+// (10) To the parallel construct if it is among the constituent constructs and
+//      neither a taskloop construct nor a worksharing construct that accepts
+//      the clause is among them;
+// (12) To the target construct if it is among the constituent constructs and
+//      the same list item neither appears in a lastprivate clause nor is the
+//      base variable or base pointer of a list item that appears in a map
+//      clause.
+//
+// (15) If the parallel construct is among the constituent constructs and the
+// effect is not as if the firstprivate clause is applied to it by the above
+// rules, then the effect is as if the shared clause with the same list item is
+// applied to the parallel construct.
+// (17) If the teams construct is among the constituent constructs and the
+// effect is not as if the firstprivate clause is applied to it by the above
+// rules, then the effect is as if the shared clause with the same list item is
+// applied to the teams construct.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::FirstprivateT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  bool applied = false;
+
+  // [5.2:340:3-6]
+  auto dirDistribute = findDirective(llvm::omp::OMPD_distribute);
+  auto dirTeams = findDirective(llvm::omp::OMPD_teams);
+  if (dirDistribute != nullptr) {
+    dirDistribute->clauses.push_back(node);
+    applied = true;
+    // [5.2:340:17]
+    if (dirTeams != nullptr) {
+      auto *shared = makeClause(
+          llvm::omp::Clause::OMPC_shared,
+          tomp::clause::SharedT<TypeTy, IdTy, ExprTy>{/*List=*/clause.v});
+      dirTeams->clauses.push_back(shared);
+    }
+  } else if (dirTeams != nullptr) {
+    dirTeams->clauses.push_back(node);
+    applied = true;
+  }
+
+  // [5.2:340:8]
+  auto findWorksharing = [&]() {
+    auto worksharing = getWorksharing();
+    for (auto &leaf : leafs) {
+      auto found = llvm::find(worksharing, leaf.id);
+      if (found != std::end(worksharing))
+        return &leaf;
+    }
+    return static_cast<typename decltype(leafs)::value_type *>(nullptr);
+  };
+
+  auto dirWorksharing = findWorksharing();
+  if (dirWorksharing != nullptr) {
+    dirWorksharing->clauses.push_back(node);
+    applied = true;
+  }
+
+  // [5.2:340:9]
+  auto dirTaskloop = findDirective(llvm::omp::OMPD_taskloop);
+  if (dirTaskloop != nullptr) {
+    dirTaskloop->clauses.push_back(node);
+    applied = true;
+  }
+
+  // [5.2:340:10]
+  auto dirParallel = findDirective(llvm::omp::OMPD_parallel);
+  if (dirParallel != nullptr) {
+    if (dirTaskloop == nullptr && dirWorksharing == nullptr) {
+      dirParallel->clauses.push_back(node);
+      applied = true;
+    } else {
+      // [5.2:340:15]
+      auto *shared = makeClause(
+          llvm::omp::Clause::OMPC_shared,
+          tomp::clause::SharedT<TypeTy, IdTy, ExprTy>{/*List=*/clause.v});
+      dirParallel->clauses.push_back(shared);
+    }
+  }
+
+  // [5.2:340:12]
+  auto inLastprivate = [&](const ObjectTy &object) {
+    if (ClauseSet *set = findClausesWith(object)) {
+      return llvm::find_if(*set, [](const ClauseTy *c) {
+               return c->id == llvm::omp::Clause::OMPC_lastprivate;
+             }) != set->end();
+    }
+    return false;
+  };
+
+  auto dirTarget = findDirective(llvm::omp::OMPD_target);
+  if (dirTarget != nullptr) {
+    tomp::ObjectListT<IdTy, ExprTy> objects;
+    llvm::copy_if(
+        clause.v, std::back_inserter(objects), [&](const ObjectTy &object) {
+          return !inLastprivate(object) && !mapBases.count(object.id());
+        });
+    if (!objects.empty()) {
+      auto *firstp = makeClause(
+          llvm::omp::Clause::OMPC_firstprivate,
+          tomp::clause::FirstprivateT<TypeTy, IdTy, ExprTy>{/*List=*/objects});
+      dirTarget->clauses.push_back(firstp);
+      applied = true;
+    }
+  }
+
+  // "task" is not handled by any of the cases above.
+  if (auto dirTask = findDirective(llvm::omp::OMPD_task)) {
+    dirTask->clauses.push_back(node);
+    applied = true;
+  }
+
+  return applied;
+}
+
+// LASTPRIVATE
+// [5.2:115:7-8]
+// Directives: distribute, do, for, loop, sections, simd, taskloop
+//
+// [5.2:340:21-30]
+// (21) The effect of the lastprivate clause is as if it is applied to all leaf
+// constructs that permit the clause.
+// (22) If the parallel construct is among the constituent constructs and the
+// list item is not also specified in the firstprivate clause, then the effect
+// of the lastprivate clause is as if the shared clause with the same list item
+// is applied to the parallel construct.
+// (24) If the teams construct is among the constituent constructs and the list
+// item is not also specified in the firstprivate clause, then the effect of the
+// lastprivate clause is as if the shared clause with the same list item is
+// applied to the teams construct.
+// (27) If the target construct is among the constituent constructs and the list
+// item is not the base variable or base pointer of a list item that appears in
+// a map clause, the effect of the lastprivate clause is as if the same list
+// item appears in a map clause with a map-type of tofrom.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  bool applied = false;
+
+  // [5.2:340:21]
+  applied = applyToAll(node);
+  if (!applied)
+    return false;
+
+  auto inFirstprivate = [&](const ObjectTy &object) {
+    if (ClauseSet *set = findClausesWith(object)) {
+      return llvm::find_if(*set, [](const ClauseTy *c) {
+               return c->id == llvm::omp::Clause::OMPC_firstprivate;
+             }) != set->end();
+    }
+    return false;
+  };
+
+  auto &objects = std::get<tomp::ObjectListT<IdTy, ExprTy>>(clause.t);
+
+  // Prepare list of objects that could end up in a "shared" clause.
+  tomp::ObjectListT<IdTy, ExprTy> sharedObjects;
+  llvm::copy_if(
+      objects, std::back_inserter(sharedObjects),
+      [&](const ObjectTy &object) { return !inFirstprivate(object); });
+
+  if (!sharedObjects.empty()) {
+    // [5.2:340:22]
+    if (auto dirParallel = findDirective(llvm::omp::OMPD_parallel)) {
+      auto *shared = makeClause(
+          llvm::omp::Clause::OMPC_shared,
+          tomp::clause::SharedT<TypeTy, IdTy, ExprTy>{/*List=*/sharedObjects});
+      dirParallel->clauses.push_back(shared);
+      applied = true;
+    }
+
+    // [5.2:340:24]
+    if (auto dirTeams = findDirective(llvm::omp::OMPD_teams)) {
+      auto *shared = makeClause(
+          llvm::omp::Clause::OMPC_shared,
+          tomp::clause::SharedT<TypeTy, IdTy, ExprTy>{/*List=*/sharedObjects});
+      dirTeams->clauses.push_back(shared);
+      applied = true;
+    }
+  }
+
+  // [5.2:340:27]
+  if (auto dirTarget = findDirective(llvm::omp::OMPD_target)) {
+    tomp::ObjectListT<IdTy, ExprTy> tofrom;
+    llvm::copy_if(
+        objects, std::back_inserter(tofrom),
+        [&](const ObjectTy &object) { return !mapBases.count(object.id()); });
+
+    if (!tofrom.empty()) {
+      using MapType =
+          typename tomp::clause::MapT<TypeTy, IdTy, ExprTy>::MapType;
+      auto *map =
+          makeClause(llvm::omp::Clause::OMPC_map,
+                     tomp::clause::MapT<TypeTy, IdTy, ExprTy>{
+                         {/*MapType=*/MapType::Tofrom,
+                          /*MapTypeModifier=*/std::nullopt,
+                          /*Mapper=*/std::nullopt, /*Iterator=*/std::nullopt,
+                          /*LocatorList=*/std::move(tofrom)}});
+      dirTarget->clauses.push_back(map);
+      applied = true;
+    }
+  }
+
+  return applied;
+}
+
+// SHARED
+// [5.2:110:5-6]
+// Directives: parallel, task, taskloop, teams
+//
+// [5.2:340:31-32]
+// (31) The effect of the shared, default, thread_limit, or order clause is as
+// if it is applied to all leaf constructs that permit the clause.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::SharedT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  // [5.2:340:31]
+  return applyToAll(node);
+}
+
+// DEFAULT
+// [5.2:109:5-6]
+// Directives: parallel, task, taskloop, teams
+//
+// [5.2:340:31-32]
+// (31) The effect of the shared, default, thread_limit, or order clause is as
+// if it is applied to all leaf constructs that permit the clause.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::DefaultT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  // [5.2:340:31]
+  return applyToAll(node);
+}
+
+// THREAD_LIMIT
+// [5.2:277:14-15]
+// Directives: target, teams
+//
+// [5.2:340:31-32]
+// (31) The effect of the shared, default, thread_limit, or order clause is as
+// if it is applied to all leaf constructs that permit the clause.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::ThreadLimitT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  // [5.2:340:31]
+  return applyToAll(node);
+}
+
+// ORDER
+// [5.2:234:3-4]
+// Directives: distribute, do, for, loop, simd
+//
+// [5.2:340:31-32]
+// (31) The effect of the shared, default, thread_limit, or order clause is as
+// if it is applied to all leaf constructs that permit the clause.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::OrderT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  // [5.2:340:31]
+  return applyToAll(node);
+}
+
+// ALLOCATE
+// [5.2:178:7-9]
+// Directives: allocators, distribute, do, for, parallel, scope, sections,
+// single, target, task, taskgroup, taskloop, teams
+//
+// [5.2:340:33-35]
+// (33) The effect of the allocate clause is as if it is applied to all leaf
+// constructs that permit the clause and to which a data-sharing attribute
+// clause that may create a private copy of the same list item is applied.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::AllocateT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  // This one needs to be applied at the end, once we know which clauses are
+  // assigned to which leaf constructs.
+
+  // [5.2:340:33]
+  auto canMakePrivateCopy = [](llvm::omp::Clause id) {
+    switch (id) {
+    case llvm::omp::Clause::OMPC_firstprivate:
+    case llvm::omp::Clause::OMPC_lastprivate:
+    case llvm::omp::Clause::OMPC_private:
+      return true;
+    default:
+      return false;
+    }
+  };
+
+  bool applied = applyIf(node, [&](const auto &leaf) {
+    return llvm::any_of(leaf.clauses, [&](const ClauseTy *n) {
+      return canMakePrivateCopy(n->id);
+    });
+  });
+
+  return applied;
+}
+
+// REDUCTION
+// [5.2:134:17-18]
+// Directives: do, for, loop, parallel, scope, sections, simd, taskloop, teams
+//
+// [5.2:340:36-37], [5.2:341:1-13]
+// (36) The effect of the reduction clause is as if it is applied to all leaf
+// constructs that permit the clause, except for the following constructs:
+//  (1) The parallel construct, when combined with the sections,
+//      worksharing-loop, loop, or taskloop construct; and
+//  (3) The teams construct, when combined with the loop construct.
+// (4) For the parallel and teams constructs above, the effect of the reduction
+// clause instead is as if each list item or, for any list item that is an array
+// item, its corresponding base array or base pointer appears in a shared clause
+// for the construct.
+// (6) If the task reduction-modifier is specified, the effect is as if it only
+// modifies the behavior of the reduction clause on the innermost leaf construct
+// that accepts the modifier (see Section 5.5.8).
+// (8) If the inscan reduction-modifier is specified, the effect is as if it
+// modifies the behavior of the reduction clause on all constructs of the
+// combined construct to which the clause is applied and that accept the
+// modifier.
+// (10) If a list item in a reduction clause on a combined target construct does
+// not have the same base variable or base pointer as a list item in a map
+// clause on the construct, then the effect is as if the list item in the
+// reduction clause appears as a list item in a map clause with a map-type of
+// tofrom.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::ReductionT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  using ReductionTy = tomp::clause::ReductionT<TypeTy, IdTy, ExprTy>;
+
+  // [5.2:340:36], [5.2:341:1], [5.2:341:3]
+  bool applyToParallel = true, applyToTeams = true;
+
+  auto dirParallel = findDirective(llvm::omp::Directive::OMPD_parallel);
+  if (dirParallel) {
+    auto exclusions = llvm::concat<const llvm::omp::Directive>(
+        getWorksharingLoop(), tomp::ListT<llvm::omp::Directive>{
+                                  llvm::omp::Directive::OMPD_loop,
+                                  llvm::omp::Directive::OMPD_sections,
+                                  llvm::omp::Directive::OMPD_taskloop,
+                              });
+    auto present = [&](llvm::omp::Directive id) {
+      return findDirective(id) != nullptr;
+    };
+
+    if (llvm::any_of(exclusions, present))
+      applyToParallel = false;
+  }
+
+  auto dirTeams = findDirective(llvm::omp::Directive::OMPD_teams);
+  if (dirTeams) {
+    // The only exclusion is OMPD_loop.
+    if (findDirective(llvm::omp::Directive::OMPD_loop))
+      applyToTeams = false;
+  }
+
+  using ReductionModifier = typename ReductionTy::ReductionModifier;
+  using ReductionIdentifiers = typename ReductionTy::ReductionIdentifiers;
+
+  auto &objects = std::get<tomp::ObjectListT<IdTy, ExprTy>>(clause.t);
+  auto &modifier = std::get<std::optional<ReductionModifier>>(clause.t);
+
+  // Apply the reduction clause first to all directives according to the spec.
+  // If the reduction was applied at least once, proceed with the data sharing
+  // side-effects.
+  bool applied = false;
+
+  // [5.2:341:6], [5.2:341:8]
+  auto isValidModifier = [](llvm::omp::Directive dir, ReductionModifier mod,
+                            bool alreadyApplied) {
+    switch (mod) {
+    case ReductionModifier::Inscan:
+      // According to [5.2:135:11-13], "inscan" only applies to
+      // worksharing-loop, worksharing-loop-simd, or "simd" constructs.
+      return dir == llvm::omp::Directive::OMPD_simd ||
+             llvm::is_contained(getWorksharingLoop(), dir);
+    case ReductionModifier::Task:
+      if (alreadyApplied)
+        return false;
+      // According to [5.2:135:16-18], "task" only applies to "parallel" and
+      // worksharing constructs.
+      return dir == llvm::omp::Directive::OMPD_parallel ||
+             llvm::is_contained(getWorksharing(), dir);
+    case ReductionModifier::Default:
+      return true;
+    }
+    llvm_unreachable("Unexpected modifier");
+  };
+
+  auto *unmodified = makeClause(
+      llvm::omp::Clause::OMPC_reduction,
+      ReductionTy{
+          {/*ReductionModifier=*/std::nullopt,
+           /*ReductionIdentifiers=*/std::get<ReductionIdentifiers>(clause.t),
+           /*List=*/objects}});
+
+  ReductionModifier effective =
+      modifier.has_value() ? *modifier : ReductionModifier::Default;
+  bool effectiveApplied = false;
+  // Walk over the leaf constructs starting from the innermost, and apply
+  // the clause as required by the spec.
+  for (auto &leaf : llvm::reverse(leafs)) {
+    if (!llvm::omp::isAllowedClauseForDirective(leaf.id, node->id, version))
+      continue;
+    if (!applyToParallel && &leaf == dirParallel)
+      continue;
+    if (!applyToTeams && &leaf == dirTeams)
+      continue;
+    // Some form of the clause will be applied past this point.
+    if (isValidModifier(leaf.id, effective, effectiveApplied)) {
+      // Apply clause with modifier.
+      leaf.clauses.push_back(node);
+      effectiveApplied = true;
+    } else {
+      // Apply clause without modifier.
+      leaf.clauses.push_back(unmodified);
+    }
+    applied = true;
+  }
+
+  if (!applied)
+    return false;
+
+  tomp::ObjectListT<IdTy, ExprTy> sharedObjects;
+  llvm::transform(objects, std::back_inserter(sharedObjects),
+                  [&](const ObjectTy &object) {
+                    auto maybeBase = helper.getBaseObject(object);
+                    return maybeBase ? *maybeBase : object;
+                  });
+
+  // [5.2:341:4]
+  if (!sharedObjects.empty()) {
+    if (dirParallel && !applyToParallel) {
+      auto *shared = makeClause(
+          llvm::omp::Clause::OMPC_shared,
+          tomp::clause::SharedT<TypeTy, IdTy, ExprTy>{/*List=*/sharedObjects});
+      dirParallel->clauses.push_back(shared);
+    }
+    if (dirTeams && !applyToTeams) {
+      auto *shared = makeClause(
+          llvm::omp::Clause::OMPC_shared,
+          tomp::clause::SharedT<TypeTy, IdTy, ExprTy>{/*List=*/sharedObjects});
+      dirTeams->clauses.push_back(shared);
+    }
+  }
+
+  // [5.2:341:10]
+  auto dirTarget = findDirective(llvm::omp::Directive::OMPD_target);
+  if (dirTarget && leafs.size() > 1) {
+    tomp::ObjectListT<IdTy, ExprTy> tofrom;
+    llvm::copy_if(objects, std::back_inserter(tofrom),
+                  [&](const ObjectTy &object) {
+                    if (auto maybeBase = helper.getBaseObject(object))
+                      return !mapBases.count(maybeBase->id());
+                    return !mapBases.count(object.id()); // XXX is this ok?
+                  });
+    if (!tofrom.empty()) {
+      using MapType =
+          typename tomp::clause::MapT<TypeTy, IdTy, ExprTy>::MapType;
+      auto *map = makeClause(
+          llvm::omp::Clause::OMPC_map,
+          tomp::clause::MapT<TypeTy, IdTy, ExprTy>{
+              {/*MapType=*/MapType::Tofrom, /*MapTypeModifier=*/std::nullopt,
+               /*Mapper=*/std::nullopt, /*Iterator=*/std::nullopt,
+               /*LocatorList=*/std::move(tofrom)}});
+
+      dirTarget->clauses.push_back(map);
+      applied = true;
+    }
+  }
+
+  return applied;
+}
+
+// IF
+// [5.2:72:7-9]
+// Directives: cancel, parallel, simd, target, target data, target enter data,
+// target exit data, target update, task, taskloop
+//
+// [5.2:72:15-18]
+// (15) For combined or composite constructs, the if clause only applies to the
+// semantics of the construct named in the directive-name-modifier.
+// (16) For a combined or composite construct, if no directive-name-modifier is
+// specified then the if clause applies to all constituent constructs to which
+// an if clause can apply.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::IfT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  using DirectiveNameModifier =
+      typename clause::IfT<TypeTy, IdTy, ExprTy>::DirectiveNameModifier;
+  using IfExpression = typename clause::IfT<TypeTy, IdTy, ExprTy>::IfExpression;
+  auto &modifier = std::get<std::optional<DirectiveNameModifier>>(clause.t);
+
+  if (modifier) {
+    llvm::omp::Directive dirId = *modifier;
+    auto *unmodified =
+        makeClause(llvm::omp::Clause::OMPC_if,
+                   tomp::clause::IfT<TypeTy, IdTy, ExprTy>{
+                       {/*DirectiveNameModifier=*/std::nullopt,
+                        /*IfExpression=*/std::get<IfExpression>(clause.t)}});
+
+    if (auto *hasDir = findDirective(dirId)) {
+      hasDir->clauses.push_back(unmodified);
+      return true;
+    }
+    return false;
+  }
+
+  return applyToAll(node);
+}
+
+// LINEAR
+// [5.2:118:1-2]
+// Directives: declare simd, do, for, simd
+//
+// [5.2:341:15-22]
+// (15.1) The effect of the linear clause is as if it is applied to the
+// innermost leaf construct.
+// (15.2) Additionally, if the list item is not the iteration variable of a simd
+// or worksharing-loop SIMD construct, the effect on the outer leaf constructs
+// is as if the list item was specified in firstprivate and lastprivate clauses
+// on the combined or composite construct, with the rules specified above
+// applied.
+// (19) If a list item of the linear clause is the iteration variable of a simd
+// or worksharing-loop SIMD construct and it is not declared in the construct,
+// the effect on the outer leaf constructs is as if the list item was specified
+// in a lastprivate clause on the combined or composite construct with the rules
+// specified above applied.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::LinearT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  // [5.2:341:15.1]
+  if (!applyToInnermost(node))
+    return false;
+
+  // [5.2:341:15.2], [5.2:341:19]
+  auto dirSimd = findDirective(llvm::omp::Directive::OMPD_simd);
+  std::optional<ObjectTy> iterVar = helper.getLoopIterVar();
+  const auto &objects = std::get<tomp::ObjectListT<IdTy, ExprTy>>(clause.t);
+
+  // Lists of objects that will be used to construct "firstprivate" and
+  // "lastprivate" clauses.
+  tomp::ObjectListT<IdTy, ExprTy> first, last;
+
+  for (const ObjectTy &object : objects) {
+    last.push_back(object);
+    if (!dirSimd || !iterVar || object.id() != iterVar->id())
+      first.push_back(object);
+  }
+
+  if (!first.empty()) {
+    auto *firstp = makeClause(
+        llvm::omp::Clause::OMPC_firstprivate,
+        tomp::clause::FirstprivateT<TypeTy, IdTy, ExprTy>{/*List=*/first});
+    nodes.push_back(firstp); // Appending to the main clause list.
+  }
+  if (!last.empty()) {
+    auto *lastp =
+        makeClause(llvm::omp::Clause::OMPC_lastprivate,
+                   tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy>{
+                       {/*LastprivateModifier=*/std::nullopt, /*List=*/last}});
+    nodes.push_back(lastp); // Appending to the main clause list.
+  }
+  return true;
+}
+
+// NOWAIT
+// [5.2:308:11-13]
+// Directives: dispatch, do, for, interop, scope, sections, single, target,
+// target enter data, target exit data, target update, taskwait, workshare
+//
+// [5.2:341:23]
+// (23) The effect of the nowait clause is as if it is applied to the outermost
+// leaf construct that permits it.
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::NowaitT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  return applyToOutermost(node);
+}
+
+template <typename C, typename H> bool ConstructDecompositionT<C, H>::split() {
+  bool success = true;
+
+  for (llvm::omp::Directive leaf :
+       llvm::omp::getLeafConstructsOrSelf(construct))
+    leafs.push_back(LeafReprInternal{leaf, /*clauses=*/{}});
+
+  for (const ClauseTy *node : nodes)
+    addClauseSymsToMap(*node, node);
+
+  // First we need to apply LINEAR, because it can generate additional
+  // "firstprivate" and "lastprivate" clauses that apply to the combined/
+  // composite construct.
+  // Collect them separately, because they may modify the clause list.
+  llvm::SmallVector<const ClauseTy *> linears;
+  for (const ClauseTy *node : nodes) {
+    if (node->id == llvm::omp::Clause::OMPC_linear)
+      linears.push_back(node);
+  }
+  for (const auto *node : linears) {
+    success = success &&
+              applyClause(std::get<tomp::clause::LinearT<TypeTy, IdTy, ExprTy>>(
+                              node->u),
+                          node);
+  }
+
+  // "allocate" clauses need to be applied last since they need to see
+  // which directives have data-privatizing clauses.
+  auto skip = [](const ClauseTy *node) {
+    switch (node->id) {
+    case llvm::omp::Clause::OMPC_allocate:
+    case llvm::omp::Clause::OMPC_linear:
+      return true;
+    default:
+      return false;
+    }
+  };
+
+  // Apply (almost) all clauses.
+  for (const ClauseTy *node : nodes) {
+    if (skip(node))
+      continue;
+    success =
+        success &&
+        std::visit([&](auto &&s) { return applyClause(s, node); }, node->u);
+  }
+
+  // Apply "allocate".
+  for (const ClauseTy *node : nodes) {
+    if (node->id != llvm::omp::Clause::OMPC_allocate)
+      continue;
+    success =
+        success &&
+        std::visit([&](auto &&s) { return applyClause(s, node); }, node->u);
+  }
+
+  return success;
+}
+
+} // namespace tomp
+
+#endif // LLVM_FRONTEND_OPENMP_CONSTRUCTDECOMPOSITIONT_H
diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h
index 5e3ba1f32e6a..dd1195571489 100644
--- a/llvm/include/llvm/IR/Attributes.h
+++ b/llvm/include/llvm/IR/Attributes.h
@@ -224,7 +224,7 @@ public:
 
   /// Return the attribute's value as a ConstantRange. This requires the
   /// attribute to be a ConstantRange attribute.
-  ConstantRange getValueAsConstantRange() const;
+  const ConstantRange &getValueAsConstantRange() const;
 
   /// Returns the alignment field of an attribute as a byte alignment
   /// value.
@@ -265,7 +265,7 @@ public:
   FPClassTest getNoFPClass() const;
 
   /// Returns the value of the range attribute.
-  ConstantRange getRange() const;
+  const ConstantRange &getRange() const;
 
   /// The Attribute is converted to a string of equivalent mnemonic. This
   /// is, presumably, for writing out the mnemonics for the assembly writer.
diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h
index 53cede5409e2..5b80218d6c5c 100644
--- a/llvm/include/llvm/IR/DebugInfo.h
+++ b/llvm/include/llvm/IR/DebugInfo.h
@@ -268,6 +268,10 @@ bool calculateFragmentIntersect(
     uint64_t SliceSizeInBits, const DbgVariableRecord *DVRAssign,
     std::optional<DIExpression::FragmentInfo> &Result);
 
+/// Replace DIAssignID uses and attachments with IDs from \p Map.
+/// If an ID is unmapped a new ID is generated and added to \p Map.
+void remapAssignID(DenseMap<DIAssignID *, DIAssignID *> &Map, Instruction &I);
+
 /// Helper struct for trackAssignments, below. We don't use the similar
 /// DebugVariable class because trackAssignments doesn't (yet?) understand
 /// partial variables (fragment info) as input and want to make that clear and
diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h
index 9f4987493739..ed8081a3cad1 100644
--- a/llvm/include/llvm/IR/DebugProgramInstruction.h
+++ b/llvm/include/llvm/IR/DebugProgramInstruction.h
@@ -32,9 +32,8 @@
 //       ; }
 //       ;; There is a debug-info record in front of the %bar instruction,
 //       ;; thus it points at a DbgMarker object. That DbgMarker contains a
-//       ;; DbgVariableRecord in it's ilist, storing the equivalent information
-//       to the
-//       ;; dbg.value above: the Value, DILocalVariable, etc.
+//       ;; DbgVariableRecord in its ilist, storing the equivalent information
+//       ;; to the dbg.value above: the Value, DILocalVariable, etc.
 //
 // This structure separates the two concerns of the position of the debug-info
 // in the function, and the Value that it refers to. It also creates a new
@@ -121,7 +120,7 @@ public:
 /// within IR. Features various methods copied across from the Instruction
 /// class to aid ease-of-use. DbgRecords should always be linked into a
 /// DbgMarker's StoredDbgRecords list. The marker connects a DbgRecord back to
-/// it's position in the BasicBlock.
+/// its position in the BasicBlock.
 ///
 /// We need a discriminator for dyn/isa casts. In order to avoid paying for a
 /// vtable for "virtual" functions too, subclasses must add a new discriminator
@@ -272,9 +271,8 @@ public:
     Any, ///< To indicate all LocationTypes in searches.
   };
   /// Classification of the debug-info record that this DbgVariableRecord
-  /// represents. Essentially, "is this a dbg.value or dbg.declare?".
-  /// dbg.declares are not currently supported, but it would be trivial to do
-  /// so.
+  /// represents. Essentially, "does this correspond to a dbg.value,
+  /// dbg.declare, or dbg.assign?".
   /// FIXME: We could use spare padding bits from DbgRecord for this.
   LocationType Type;
 
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index b9af3a6ca42c..9dd1bb455a71 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -1997,13 +1997,19 @@ public:
   /// Get the attribute of a given kind from a given arg
   Attribute getParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const {
     assert(ArgNo < arg_size() && "Out of bounds");
-    return getAttributes().getParamAttr(ArgNo, Kind);
+    Attribute A = getAttributes().getParamAttr(ArgNo, Kind);
+    if (A.isValid())
+      return A;
+    return getParamAttrOnCalledFunction(ArgNo, Kind);
   }
 
   /// Get the attribute of a given kind from a given arg
   Attribute getParamAttr(unsigned ArgNo, StringRef Kind) const {
     assert(ArgNo < arg_size() && "Out of bounds");
-    return getAttributes().getParamAttr(ArgNo, Kind);
+    Attribute A = getAttributes().getParamAttr(ArgNo, Kind);
+    if (A.isValid())
+      return A;
+    return getParamAttrOnCalledFunction(ArgNo, Kind);
   }
 
   /// Return true if the data operand at index \p i has the attribute \p
@@ -2614,6 +2620,11 @@ public:
   op_iterator populateBundleOperandInfos(ArrayRef<OperandBundleDef> Bundles,
                                          const unsigned BeginIndex);
 
+  /// Return true if the call has deopt state bundle.
+  bool hasDeoptState() const {
+    return getOperandBundle(LLVMContext::OB_deopt).has_value();
+  }
+
 public:
   /// Return the BundleOpInfo for the operand at index OpIdx.
   ///
@@ -2647,6 +2658,8 @@ private:
     return hasFnAttrOnCalledFunction(Kind);
   }
   template <typename AK> Attribute getFnAttrOnCalledFunction(AK Kind) const;
+  template <typename AK>
+  Attribute getParamAttrOnCalledFunction(unsigned ArgNo, AK Kind) const;
 
   /// Determine whether the return value has the given attribute. Supports
   /// Attribute::AttrKind and StringRef as \p AttrKind types.
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index d7ec3c16bec2..0f7b215b80fd 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -4370,6 +4370,9 @@ public:
 
   unsigned getNumSuccessors() const { return 2; }
 
+  /// Updates profile metadata by scaling it by \p S / \p T.
+  void updateProfWeight(uint64_t S, uint64_t T);
+
   // Methods for support type inquiry through isa, cast, and dyn_cast:
   static bool classof(const Instruction *I) {
     return (I->getOpcode() == Instruction::Invoke);
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 2e99c9e2ee3e..fcd3a1025ac1 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -707,8 +707,7 @@ public:
 /// This is the common base class for constrained floating point intrinsics.
 class ConstrainedFPIntrinsic : public IntrinsicInst {
 public:
-  bool isUnaryOp() const;
-  bool isTernaryOp() const;
+  unsigned getNonMetadataArgCount() const;
   std::optional<RoundingMode> getRoundingMode() const;
   std::optional<fp::ExceptionBehavior> getExceptionBehavior() const;
   bool isDefaultFPEnvironment() const;
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 340c1c326d06..f79df522dc80 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -109,6 +109,10 @@ namespace Intrinsic {
   /// Floating-Point Intrinsics".
   bool isConstrainedFPIntrinsic(ID QID);
 
+  /// Returns true if the intrinsic ID is for one of the "Constrained
+  /// Floating-Point Intrinsics" that take rounding mode metadata.
+  bool hasConstrainedFPRoundingModeOperand(ID QID);
+
   /// This is a type descriptor which explains the type requirements of an
   /// intrinsic. This is returned by getIntrinsicInfoTableEntries.
   struct IITDescriptor {
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 28116e5316c9..f1c7d950f927 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1025,6 +1025,7 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in {
   def int_powi : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_anyint_ty]>;
   def int_sin  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_cos  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_tan  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_pow  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
                            [LLVMMatchType<0>, LLVMMatchType<0>]>;
   def int_log  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
@@ -1855,6 +1856,13 @@ def int_experimental_vp_strided_load  : DefaultAttrsIntrinsic<[llvm_anyvector_ty
                                llvm_i32_ty],
                              [ NoCapture<ArgIndex<0>>, IntrNoSync, IntrReadMem, IntrWillReturn, IntrArgMemOnly ]>;
 
+// Experimental histogram
+def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
+                             [ llvm_anyvector_ty, // Vector of pointers
+                               llvm_anyint_ty,    // Increment
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+                             [ IntrArgMemOnly ]>;
+
 // Operators
 let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
   // Integer arithmetic
@@ -2242,6 +2250,16 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
                                llvm_anyvector_ty,
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                llvm_i32_ty]>;
+  def int_vp_reduce_fmaximum : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                                 [ LLVMVectorElementType<0>,
+                                   llvm_anyvector_ty,
+                                   LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                   llvm_i32_ty]>;
+  def int_vp_reduce_fminimum : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+                                 [ LLVMVectorElementType<0>,
+                                   llvm_anyvector_ty,
+                                   LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                   llvm_i32_ty]>;
 }
 
 let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<1>>] in {
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index e31e00a9c76f..e0630a6649dd 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3649,3 +3649,6 @@ def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic;
 def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic;
    
 def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
+
+def int_aarch64_sme_mopa_nonwide : SME_OuterProduct_Intrinsic;
+def int_aarch64_sme_mops_nonwide : SME_OuterProduct_Intrinsic;
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ee9a5d7a3439..be8048ca2459 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3143,25 +3143,37 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
 // Special Intrinsics for backend internal use only. No frontend
 // should emit calls to these.
 // ===----------------------------------------------------------------------===//
+//
+// Control-flow intrinsics in LLVM IR are convergent because they represent the
+// wave CFG, i.e., sets of threads that are "converged" or "execute in
+// lock-step". But they exist during a small window in the lowering process,
+// inserted after the structurizer and then translated to equivalent MIR
+// pseudos. So rather than create convergence tokens for these builtins, we
+// simply mark them as not convergent.
+//
+// This is really a workaround to allow control flow lowering in the presence of
+// convergence control tokens. The corresponding MIR pseudos are marked as
+// having side effects, which is sufficient to prevent optimizations without
+// having to mark them as convergent.
 def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
-  [llvm_i1_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
+  [llvm_i1_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
 def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
-  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
+  [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
 def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty],
   [llvm_i1_ty, LLVMMatchType<0>],
-  [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
+  [IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
 def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
-  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
+  [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
 def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
-  [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+  [IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
 // Represent unreachable in a divergent region.
 def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index b93a5e7be1b5..572d334ac955 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -321,6 +321,23 @@ def int_wasm_relaxed_dot_bf16x8_add_f32:
                         [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4f32_ty],
                         [IntrNoMem, IntrSpeculatable]>;
 
+//===----------------------------------------------------------------------===//
+// Half-precision intrinsics (experimental)
+//===----------------------------------------------------------------------===//
+
+// TODO: Replace these intrinsic with normal ISel patterns once the XXX
+// instructions are merged to the proposal.
+def int_wasm_loadf16_f32:
+  Intrinsic<[llvm_float_ty],
+            [llvm_ptr_ty],
+            [IntrReadMem, IntrArgMemOnly],
+             "", [SDNPMemOperand]>;
+def int_wasm_storef16_f32:
+  Intrinsic<[],
+            [llvm_float_ty, llvm_ptr_ty],
+            [IntrWriteMem, IntrArgMemOnly],
+             "", [SDNPMemOperand]>;
+
 
 //===----------------------------------------------------------------------===//
 // Thread-local storage intrinsics
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index fa2a7b42c9aa..5d137d4b3553 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -1423,7 +1423,7 @@ public:
   // in the way some record are interpreted, like flags for instance.
   // Note that incrementing this may require changes in both BitcodeReader.cpp
   // and BitcodeWriter.cpp.
-  static constexpr uint64_t BitcodeSummaryVersion = 10;
+  static constexpr uint64_t BitcodeSummaryVersion = 9;
 
   // Regular LTO module name for ASM writer
   static constexpr const char *getRegularLTOModuleName() {
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 739208e74dcb..171ddab977de 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -354,7 +354,8 @@ template <int64_t Val> inline constantint_match<Val> m_ConstantInt() {
 /// is true.
 template <typename Predicate, typename ConstantVal, bool AllowPoison>
 struct cstval_pred_ty : public Predicate {
-  template <typename ITy> bool match(ITy *V) {
+  const Constant **Res = nullptr;
+  template <typename ITy> bool match_impl(ITy *V) {
     if (const auto *CV = dyn_cast<ConstantVal>(V))
       return this->isValue(CV->getValue());
     if (const auto *VTy = dyn_cast<VectorType>(V->getType())) {
@@ -387,6 +388,15 @@ struct cstval_pred_ty : public Predicate {
     }
     return false;
   }
+
+  template <typename ITy> bool match(ITy *V) {
+    if (this->match_impl(V)) {
+      if (Res)
+        *Res = cast<Constant>(V);
+      return true;
+    }
+    return false;
+  }
 };
 
 /// specialization of cstval_pred_ty for ConstantInt
@@ -460,6 +470,35 @@ template <typename Predicate> struct apf_pred_ty : public Predicate {
 //
 ///////////////////////////////////////////////////////////////////////////////
 
+template <typename APTy> struct custom_checkfn {
+  function_ref<bool(const APTy &)> CheckFn;
+  bool isValue(const APTy &C) { return CheckFn(C); }
+};
+
+/// Match an integer or vector where CheckFn(ele) for each element is true.
+/// For vectors, poison elements are assumed to match.
+inline cst_pred_ty<custom_checkfn<APInt>>
+m_CheckedInt(function_ref<bool(const APInt &)> CheckFn) {
+  return cst_pred_ty<custom_checkfn<APInt>>{{CheckFn}};
+}
+
+inline cst_pred_ty<custom_checkfn<APInt>>
+m_CheckedInt(const Constant *&V, function_ref<bool(const APInt &)> CheckFn) {
+  return cst_pred_ty<custom_checkfn<APInt>>{{CheckFn}, &V};
+}
+
+/// Match a float or vector where CheckFn(ele) for each element is true.
+/// For vectors, poison elements are assumed to match.
+inline cstfp_pred_ty<custom_checkfn<APFloat>>
+m_CheckedFp(function_ref<bool(const APFloat &)> CheckFn) {
+  return cstfp_pred_ty<custom_checkfn<APFloat>>{{CheckFn}};
+}
+
+inline cstfp_pred_ty<custom_checkfn<APFloat>>
+m_CheckedFp(const Constant *&V, function_ref<bool(const APFloat &)> CheckFn) {
+  return cstfp_pred_ty<custom_checkfn<APFloat>>{{CheckFn}, &V};
+}
+
 struct is_any_apint {
   bool isValue(const APInt &C) { return true; }
 };
@@ -1839,6 +1878,19 @@ template <typename Op_t> struct NNegZExt_match {
   }
 };
 
+template <typename Op_t, unsigned WrapFlags = 0> struct NoWrapTrunc_match {
+  Op_t Op;
+
+  NoWrapTrunc_match(const Op_t &OpMatch) : Op(OpMatch) {}
+
+  template <typename OpTy> bool match(OpTy *V) {
+    if (auto *I = dyn_cast<TruncInst>(V))
+      return (I->getNoWrapKind() & WrapFlags) == WrapFlags &&
+             Op.match(I->getOperand(0));
+    return false;
+  }
+};
+
 /// Matches BitCast.
 template <typename OpTy>
 inline CastOperator_match<OpTy, Instruction::BitCast>
@@ -1900,6 +1952,20 @@ inline CastOperator_match<OpTy, Instruction::Trunc> m_Trunc(const OpTy &Op) {
   return CastOperator_match<OpTy, Instruction::Trunc>(Op);
 }
 
+/// Matches trunc nuw.
+template <typename OpTy>
+inline NoWrapTrunc_match<OpTy, TruncInst::NoUnsignedWrap>
+m_NUWTrunc(const OpTy &Op) {
+  return NoWrapTrunc_match<OpTy, TruncInst::NoUnsignedWrap>(Op);
+}
+
+/// Matches trunc nsw.
+template <typename OpTy>
+inline NoWrapTrunc_match<OpTy, TruncInst::NoSignedWrap>
+m_NSWTrunc(const OpTy &Op) {
+  return NoWrapTrunc_match<OpTy, TruncInst::NoSignedWrap>(Op);
+}
+
 template <typename OpTy>
 inline match_combine_or<CastOperator_match<OpTy, Instruction::Trunc>, OpTy>
 m_TruncOrSelf(const OpTy &Op) {
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index f1cc8bcae467..20f5bb2b531d 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -701,6 +701,14 @@ HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmax, VP_REDUCE_FMAX,
 HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmin, VP_REDUCE_FMIN,
                              vector_reduce_fmin)
 
+// llvm.vp.reduce.fmaximum(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmaximum, VP_REDUCE_FMAXIMUM,
+                             vector_reduce_fmaximum)
+
+// llvm.vp.reduce.fminimum(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_VP(vp_reduce_fminimum, VP_REDUCE_FMINIMUM,
+                             vector_reduce_fminimum)
+
 #undef HELPER_REGISTER_REDUCTION_VP
 
 // Specialized helper macro for VP reductions as above but with two forms:
diff --git a/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h b/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h
index 06f1396c06fe..d8e41fe92258 100644
--- a/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h
+++ b/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h
@@ -97,13 +97,15 @@ struct LTOCodeGenerator {
   void setFileType(CodeGenFileType FT) { Config.CGFileType = FT; }
 
   void setCpu(StringRef MCpu) { Config.CPU = std::string(MCpu); }
-  void setAttrs(std::vector<std::string> MAttrs) { Config.MAttrs = MAttrs; }
+  void setAttrs(std::vector<std::string> MAttrs) {
+    Config.MAttrs = std::move(MAttrs);
+  }
   void setOptLevel(unsigned OptLevel);
 
   void setShouldInternalize(bool Value) { ShouldInternalize = Value; }
   void setShouldEmbedUselists(bool Value) { ShouldEmbedUselists = Value; }
   void setSaveIRBeforeOptPath(std::string Value) {
-    SaveIRBeforeOptPath = Value;
+    SaveIRBeforeOptPath = std::move(Value);
   }
 
   /// Restore linkage of globals
diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h
index c648ef20fa84..af5be9186108 100644
--- a/llvm/include/llvm/MC/MCRegisterInfo.h
+++ b/llvm/include/llvm/MC/MCRegisterInfo.h
@@ -126,6 +126,9 @@ struct MCRegisterDesc {
   /// Index into list with lane mask sequences. The sequence contains a lanemask
   /// for every register unit.
   uint16_t RegUnitLaneMasks;
+
+  // Is true for constant registers.
+  bool IsConstant;
 };
 
 /// MCRegisterInfo base class - We assume that the target defines a static
@@ -382,6 +385,9 @@ public:
     return RegStrings + get(RegNo).Name;
   }
 
+  /// Returns true if the given register is constant.
+  bool isConstant(MCRegister RegNo) const { return get(RegNo).IsConstant; }
+
   /// Return the number of registers this target has (useful for
   /// sizing arrays holding per register information)
   unsigned getNumRegs() const {
diff --git a/llvm/include/llvm/MC/MCSubtargetInfo.h b/llvm/include/llvm/MC/MCSubtargetInfo.h
index f172a799aa33..ff76435d6084 100644
--- a/llvm/include/llvm/MC/MCSubtargetInfo.h
+++ b/llvm/include/llvm/MC/MCSubtargetInfo.h
@@ -240,7 +240,32 @@ public:
     return ProcFeatures;
   }
 
-  virtual unsigned getHwMode() const { return 0; }
+  /// HwMode IDs are stored and accessed in a bit set format, enabling
+  /// users to efficiently retrieve specific IDs, such as the RegInfo
+  /// HwMode ID, from the set as required. Using this approach, various
+  /// types of HwMode IDs can be added to a subtarget to manage different
+  /// attributes within that subtarget, significantly enhancing the
+  /// scalability and usability of HwMode. Moreover, to ensure compatibility,
+  /// this method also supports controlling multiple attributes with a single
+  /// HwMode ID, just as was done previously.
+  enum HwModeType {
+    HwMode_Default,   // Return the smallest HwMode ID of current subtarget.
+    HwMode_ValueType, // Return the HwMode ID that controls the ValueType.
+    HwMode_RegInfo,   // Return the HwMode ID that controls the RegSizeInfo and
+                      // SubRegRange.
+    HwMode_EncodingInfo // Return the HwMode ID that controls the EncodingInfo.
+  };
+
+  /// Return a bit set containing all HwMode IDs of the current subtarget.
+  virtual unsigned getHwModeSet() const { return 0; }
+
+  /// HwMode ID corresponding to the 'type' parameter is retrieved from the
+  /// HwMode bit set of the current subtarget. It’s important to note that if
+  /// the current subtarget possesses two HwMode IDs and both control a single
+  /// attribute (such as RegInfo), this interface will result in an error.
+  virtual unsigned getHwMode(enum HwModeType type = HwMode_Default) const {
+    return 0;
+  }
 
   /// Return the cache size in bytes for the given level of cache.
   /// Level is zero-based, so a value of zero means the first level of
diff --git a/llvm/include/llvm/Object/MachO.h b/llvm/include/llvm/Object/MachO.h
index 24f9954584ed..35350df78f8d 100644
--- a/llvm/include/llvm/Object/MachO.h
+++ b/llvm/include/llvm/Object/MachO.h
@@ -134,9 +134,9 @@ public:
   BindRebaseSegInfo(const MachOObjectFile *Obj);
 
   // Used to check a Mach-O Bind or Rebase entry for errors when iterating.
-  const char* checkSegAndOffsets(int32_t SegIndex, uint64_t SegOffset,
-                                 uint8_t PointerSize, uint32_t Count=1,
-                                 uint32_t Skip=0);
+  const char *checkSegAndOffsets(int32_t SegIndex, uint64_t SegOffset,
+                                 uint8_t PointerSize, uint64_t Count = 1,
+                                 uint64_t Skip = 0);
   // Used with valid SegIndex/SegOffset values from checked entries.
   StringRef segmentName(int32_t SegIndex);
   StringRef sectionName(int32_t SegIndex, uint64_t SegOffset);
@@ -576,8 +576,9 @@ public:
   //
   // This is used by MachOBindEntry::moveNext() to validate a MachOBindEntry.
   const char *BindEntryCheckSegAndOffsets(int32_t SegIndex, uint64_t SegOffset,
-                                         uint8_t PointerSize, uint32_t Count=1,
-                                          uint32_t Skip=0) const {
+                                          uint8_t PointerSize,
+                                          uint64_t Count = 1,
+                                          uint64_t Skip = 0) const {
     return BindRebaseSectionTable->checkSegAndOffsets(SegIndex, SegOffset,
                                                      PointerSize, Count, Skip);
   }
@@ -591,8 +592,8 @@ public:
   const char *RebaseEntryCheckSegAndOffsets(int32_t SegIndex,
                                             uint64_t SegOffset,
                                             uint8_t PointerSize,
-                                            uint32_t Count=1,
-                                            uint32_t Skip=0) const {
+                                            uint64_t Count = 1,
+                                            uint64_t Skip = 0) const {
     return BindRebaseSectionTable->checkSegAndOffsets(SegIndex, SegOffset,
                                                       PointerSize, Count, Skip);
   }
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 5a14d619ea07..380bffe75b30 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -128,6 +128,7 @@ MACHINE_FUNCTION_PASS("no-op-machine-function", NoOpMachineFunctionPass())
 MACHINE_FUNCTION_PASS("print", PrintMIRPass())
 MACHINE_FUNCTION_PASS("require-all-machine-function-properties",
                       RequireAllMachineFunctionPropertiesPass())
+MACHINE_FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
 #undef MACHINE_FUNCTION_PASS
 
 // After a pass is converted to new pass manager, its entry should be moved from
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index 7a8b6639f297..da0310404524 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -462,10 +462,7 @@ public:
 
   CounterMappingRegion getDecisionRegion() const { return Region; }
   unsigned getNumConditions() const {
-    unsigned NumConditions = Region.getDecisionParams().NumConditions;
-    assert(NumConditions != 0 &&
-           "In MC/DC, NumConditions should never be zero!");
-    return NumConditions;
+    return Region.getDecisionParams().NumConditions;
   }
   unsigned getNumTestVectors() const { return TV.size(); }
   bool isCondFolded(unsigned Condition) const { return Folded[Condition]; }
diff --git a/llvm/include/llvm/ProfileData/Coverage/MCDCTypes.h b/llvm/include/llvm/ProfileData/Coverage/MCDCTypes.h
index 8c78bed4dec5..191e4ead95ea 100644
--- a/llvm/include/llvm/ProfileData/Coverage/MCDCTypes.h
+++ b/llvm/include/llvm/ProfileData/Coverage/MCDCTypes.h
@@ -33,7 +33,9 @@ struct DecisionParameters {
 
   DecisionParameters() = delete;
   DecisionParameters(unsigned BitmapIdx, unsigned NumConditions)
-      : BitmapIdx(BitmapIdx), NumConditions(NumConditions) {}
+      : BitmapIdx(BitmapIdx), NumConditions(NumConditions) {
+    assert(NumConditions > 0);
+  }
 };
 
 struct BranchParameters {
@@ -44,7 +46,9 @@ struct BranchParameters {
 
   BranchParameters() = delete;
   BranchParameters(ConditionID ID, const ConditionIDs &Conds)
-      : ID(ID), Conds(Conds) {}
+      : ID(ID), Conds(Conds) {
+    assert(ID >= 0);
+  }
 };
 
 /// The type of MC/DC-specific parameters.
diff --git a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
new file mode 100644
index 000000000000..a916f197aa14
--- /dev/null
+++ b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
@@ -0,0 +1,116 @@
+//===--- CtxInstrContextNode.h - Contextual Profile Node --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//==============================================================================
+//
+// NOTE!
+// llvm/lib/ProfileData/CtxInstrContextNode.h and
+//   compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
+// must be exact copies of eachother
+//
+// compiler-rt creates these objects as part of the instrumentation runtime for
+// contextual profiling. LLVM only consumes them to convert a contextual tree
+// to a bitstream.
+//
+//==============================================================================
+
+/// The contextual profile is a directed tree where each node has one parent. A
+/// node (ContextNode) corresponds to a function activation. The root of the
+/// tree is at a function that was marked as entrypoint to the compiler. A node
+/// stores counter values for edges and a vector of subcontexts. These are the
+/// contexts of callees. The index in the subcontext vector corresponds to the
+/// index of the callsite (as was instrumented via llvm.instrprof.callsite). At
+/// that index we find a linked list, potentially empty, of ContextNodes. Direct
+/// calls will have 0 or 1 values in the linked list, but indirect callsites may
+/// have more.
+///
+/// The ContextNode has a fixed sized header describing it - the GUID of the
+/// function, the size of the counter and callsite vectors. It is also an
+/// (intrusive) linked list for the purposes of the indirect call case above.
+///
+/// Allocation is expected to happen on an Arena. The allocation lays out inline
+/// the counter and subcontexts vectors. The class offers APIs to correctly
+/// reference the latter.
+///
+/// The layout is as follows:
+///
+/// [[declared fields][counters vector][vector of ptrs to subcontexts]]
+///
+/// See also documentation on the counters and subContexts members below.
+///
+/// The structure of the ContextNode is known to LLVM, because LLVM needs to:
+///   (1) increment counts, and
+///   (2) form a GEP for the position in the subcontext list of a callsite
+/// This means changes to LLVM contextual profile lowering and changes here
+/// must be coupled.
+/// Note: the header content isn't interesting to LLVM (other than its size)
+///
+/// Part of contextual collection is the notion of "scratch contexts". These are
+/// buffers that are "large enough" to allow for memory-safe acceses during
+/// counter increments - meaning the counter increment code in LLVM doesn't need
+/// to be concerned with memory safety. Their subcontexts never get populated,
+/// though. The runtime code here produces and recognizes them.
+
+#ifndef LLVM_PROFILEDATA_CTXINSTRCONTEXTNODE_H
+#define LLVM_PROFILEDATA_CTXINSTRCONTEXTNODE_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+namespace llvm {
+namespace ctx_profile {
+using GUID = uint64_t;
+
+class ContextNode final {
+  const GUID Guid;
+  ContextNode *const Next;
+  const uint32_t NrCounters;
+  const uint32_t NrCallsites;
+
+public:
+  ContextNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites,
+              ContextNode *Next = nullptr)
+      : Guid(Guid), Next(Next), NrCounters(NrCounters),
+        NrCallsites(NrCallsites) {}
+
+  static inline size_t getAllocSize(uint32_t NrCounters, uint32_t NrCallsites) {
+    return sizeof(ContextNode) + sizeof(uint64_t) * NrCounters +
+           sizeof(ContextNode *) * NrCallsites;
+  }
+
+  // The counters vector starts right after the static header.
+  uint64_t *counters() {
+    ContextNode *addr_after = &(this[1]);
+    return reinterpret_cast<uint64_t *>(addr_after);
+  }
+
+  uint32_t counters_size() const { return NrCounters; }
+  uint32_t callsites_size() const { return NrCallsites; }
+
+  const uint64_t *counters() const {
+    return const_cast<ContextNode *>(this)->counters();
+  }
+
+  // The subcontexts vector starts right after the end of the counters vector.
+  ContextNode **subContexts() {
+    return reinterpret_cast<ContextNode **>(&(counters()[NrCounters]));
+  }
+
+  ContextNode *const *subContexts() const {
+    return const_cast<ContextNode *>(this)->subContexts();
+  }
+
+  GUID guid() const { return Guid; }
+  ContextNode *next() const { return Next; }
+
+  size_t size() const { return getAllocSize(NrCounters, NrCallsites); }
+
+  uint64_t entrycount() const { return counters()[0]; }
+};
+} // namespace ctx_profile
+} // namespace llvm
+#endif
+\ No newline at end of file
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index d5c1ba62911f..88c7fe425b5a 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -17,6 +17,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/IntervalMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
@@ -470,6 +471,12 @@ private:
   // A map from MD5 keys to function define. We only populate this map
   // when build the Symtab from a Module.
   std::vector<std::pair<uint64_t, Function *>> MD5FuncMap;
+  // A map from MD5 to the global variable. This map is only populated when
+  // building the symtab from a module. Use separate container instances for
+  // `MD5FuncMap` and `MD5VTableMap`.
+  // TODO: Unify the container type and the lambda function 'mapName' inside
+  // add{Func,VTable}WithName.
+  DenseMap<uint64_t, GlobalVariable *> MD5VTableMap;
   // A map from function runtime address to function name MD5 hash.
   // This map is only populated and used by raw instr profile reader.
   AddrHashMap AddrToMD5Map;
@@ -488,12 +495,18 @@ private:
 
   // Add the function into the symbol table, by creating the following
   // map entries:
-  // name-set = {PGOFuncName} + {getCanonicalName(PGOFuncName)} if the canonical
-  // name is different from pgo name
+  // name-set = {PGOFuncName} union {getCanonicalName(PGOFuncName)}
   // - In MD5NameMap: <MD5Hash(name), name> for name in name-set
   // - In MD5FuncMap: <MD5Hash(name), &F> for name in name-set
   Error addFuncWithName(Function &F, StringRef PGOFuncName);
 
+  // Add the vtable into the symbol table, by creating the following
+  // map entries:
+  // name-set = {PGOName} union {getCanonicalName(PGOName)}
+  // - In MD5NameMap:  <MD5Hash(name), name> for name in name-set
+  // - In MD5VTableMap: <MD5Hash(name), name> for name in name-set
+  Error addVTableWithName(GlobalVariable &V, StringRef PGOVTableName);
+
   // If the symtab is created by a series of calls to \c addFuncName, \c
   // finalizeSymtab needs to be called before looking up function names.
   // This is required because the underlying map is a vector (for space
@@ -555,6 +568,7 @@ public:
   Error create(const FuncNameIterRange &FuncIterRange,
                const VTableNameIterRange &VTableIterRange);
 
+  // Map the MD5 of the symbol name to the name.
   Error addSymbolName(StringRef SymbolName) {
     if (SymbolName.empty())
       return make_error<InstrProfError>(instrprof_error::malformed,
@@ -630,6 +644,10 @@ public:
   /// Return function from the name's md5 hash. Return nullptr if not found.
   inline Function *getFunction(uint64_t FuncMD5Hash);
 
+  /// Return the global variable corresponding to md5 hash. Return nullptr if
+  /// not found.
+  inline GlobalVariable *getGlobalVariable(uint64_t MD5Hash);
+
   /// Return the name section data.
   inline StringRef getNameData() const { return Data; }
 
@@ -709,6 +727,12 @@ Function* InstrProfSymtab::getFunction(uint64_t FuncMD5Hash) {
   return nullptr;
 }
 
+GlobalVariable *InstrProfSymtab::getGlobalVariable(uint64_t MD5Hash) {
+  if (auto Iter = MD5VTableMap.find(MD5Hash); Iter != MD5VTableMap.end())
+    return Iter->second;
+  return nullptr;
+}
+
 // To store the sums of profile count values, or the percentage of
 // the sums of the total count values.
 struct CountSumOrPercent {
diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index 4274f2a6849b..3ef6ca8586fb 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -26,7 +26,7 @@ enum IndexedVersion : uint64_t {
   Version0 = 0,
   // Version 1: Added a version field to the header.
   Version1 = 1,
-  // Version 2: Added a call stack table.  Under development.
+  // Version 2: Added a call stack table.
   Version2 = 2,
 };
 
diff --git a/llvm/include/llvm/Support/Compiler.h b/llvm/include/llvm/Support/Compiler.h
index 8c315d255bb7..d8e3794babc7 100644
--- a/llvm/include/llvm/Support/Compiler.h
+++ b/llvm/include/llvm/Support/Compiler.h
@@ -278,6 +278,14 @@
 #define LLVM_ATTRIBUTE_RETURNS_NONNULL
 #endif
 
+/// LLVM_ATTRIBUTE_RESTRICT - Annotates a pointer to tell the compiler that
+/// it is not aliased in the current scope.
+#if defined(__clang__) || defined(__GNUC__) || defined(_MSC_VER)
+#define LLVM_ATTRIBUTE_RESTRICT __restrict
+#else
+#define LLVM_ATTRIBUTE_RESTRICT
+#endif
+
 /// \macro LLVM_ATTRIBUTE_RETURNS_NOALIAS Used to mark a function as returning a
 /// pointer that does not alias any other valid pointer.
 #ifdef __GNUC__
diff --git a/llvm/include/llvm/Support/LEB128.h b/llvm/include/llvm/Support/LEB128.h
index c4e741549f3f..a15b73bc14dc 100644
--- a/llvm/include/llvm/Support/LEB128.h
+++ b/llvm/include/llvm/Support/LEB128.h
@@ -200,20 +200,26 @@ inline int64_t decodeSLEB128(const uint8_t *p, unsigned *n = nullptr,
   return Value;
 }
 
-inline uint64_t decodeULEB128AndInc(const uint8_t *&p) {
+inline uint64_t decodeULEB128AndInc(const uint8_t *&p, const uint8_t *end,
+                                    const char **error = nullptr) {
   unsigned n;
-  auto ret = decodeULEB128(p, &n);
+  auto ret = decodeULEB128(p, &n, end, error);
   p += n;
   return ret;
 }
 
-inline int64_t decodeSLEB128AndInc(const uint8_t *&p) {
+inline int64_t decodeSLEB128AndInc(const uint8_t *&p, const uint8_t *end,
+                                   const char **error = nullptr) {
   unsigned n;
-  auto ret = decodeSLEB128(p, &n);
+  auto ret = decodeSLEB128(p, &n, end, error);
   p += n;
   return ret;
 }
 
+inline uint64_t decodeULEB128AndIncUnsafe(const uint8_t *&p) {
+  return decodeULEB128AndInc(p, nullptr);
+}
+
 /// Utility function to get the size of the ULEB128-encoded value.
 extern unsigned getULEB128Size(uint64_t Value);
 
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index cb98f96af522..559a588c2514 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -781,6 +781,9 @@ HANDLE_TARGET_OPCODE(G_FCOS)
 /// Floating point sine.
 HANDLE_TARGET_OPCODE(G_FSIN)
 
+/// Floating point Tangent.
+HANDLE_TARGET_OPCODE(G_FTAN)
+
 /// Floating point square root.
 HANDLE_TARGET_OPCODE(G_FSQRT)
 
diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h
index 0113d6b7da25..a1e38de74dfc 100644
--- a/llvm/include/llvm/Support/VirtualFileSystem.h
+++ b/llvm/include/llvm/Support/VirtualFileSystem.h
@@ -320,6 +320,10 @@ public:
   ///          platform-specific error_code.
   virtual std::error_code makeAbsolute(SmallVectorImpl<char> &Path) const;
 
+  /// \returns true if \p A and \p B represent the same file, or an error or
+  /// false if they do not.
+  llvm::ErrorOr<bool> equivalent(const Twine &A, const Twine &B);
+
   enum class PrintType { Summary, Contents, RecursiveContents };
   void print(raw_ostream &OS, PrintType Type = PrintType::Contents,
              unsigned IndentLevel = 0) const {
@@ -961,7 +965,7 @@ private:
   // that, other than the root, path components should not contain slashes or
   // backslashes.
   bool pathComponentMatches(llvm::StringRef lhs, llvm::StringRef rhs) const {
-    if ((CaseSensitive ? lhs.equals(rhs) : lhs.equals_insensitive(rhs)))
+    if ((CaseSensitive ? lhs == rhs : lhs.equals_insensitive(rhs)))
       return true;
     return (lhs == "/" && rhs == "\\") || (lhs == "\\" && rhs == "/");
   }
diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h
index 33aeb039320d..a234e00c7608 100644
--- a/llvm/include/llvm/Support/YAMLTraits.h
+++ b/llvm/include/llvm/Support/YAMLTraits.h
@@ -567,10 +567,10 @@ inline bool isNumeric(StringRef S) {
 
   // Make S.front() and S.drop_front().front() (if S.front() is [+-]) calls
   // safe.
-  if (S.empty() || S.equals("+") || S.equals("-"))
+  if (S.empty() || S == "+" || S == "-")
     return false;
 
-  if (S.equals(".nan") || S.equals(".NaN") || S.equals(".NAN"))
+  if (S == ".nan" || S == ".NaN" || S == ".NAN")
     return true;
 
   // Infinity and decimal numbers can be prefixed with sign.
@@ -578,7 +578,7 @@ inline bool isNumeric(StringRef S) {
 
   // Check for infinity first, because checking for hex and oct numbers is more
   // expensive.
-  if (Tail.equals(".inf") || Tail.equals(".Inf") || Tail.equals(".INF"))
+  if (Tail == ".inf" || Tail == ".Inf" || Tail == ".INF")
     return true;
 
   // Section 10.3.2 Tag Resolution
@@ -599,7 +599,7 @@ inline bool isNumeric(StringRef S) {
   // digit after dot (as opposed by number which has digits before the dot), but
   // doesn't have one.
   if (S.starts_with(".") &&
-      (S.equals(".") ||
+      (S == "." ||
        (S.size() > 1 && std::strchr("0123456789", S[1]) == nullptr)))
     return false;
 
@@ -656,14 +656,13 @@ inline bool isNumeric(StringRef S) {
 }
 
 inline bool isNull(StringRef S) {
-  return S.equals("null") || S.equals("Null") || S.equals("NULL") ||
-         S.equals("~");
+  return S == "null" || S == "Null" || S == "NULL" || S == "~";
 }
 
 inline bool isBool(StringRef S) {
   // FIXME: using parseBool is causing multiple tests to fail.
-  return S.equals("true") || S.equals("True") || S.equals("TRUE") ||
-         S.equals("false") || S.equals("False") || S.equals("FALSE");
+  return S == "true" || S == "True" || S == "TRUE" || S == "false" ||
+         S == "False" || S == "FALSE";
 }
 
 // 5.1. Character Set
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index 8380d2738d16..c40498e55421 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -988,6 +988,13 @@ def G_FSIN : GenericInstruction {
   let hasSideEffects = false;
 }
 
+// Floating point tangent of a value.
+def G_FTAN : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1);
+  let hasSideEffects = false;
+}
+
 // Floating point square root of a value.
 // This returns NaN for negative nonzero values.
 // NOTE: Unlike libm sqrt(), this never sets errno. In all other respects it's
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index dbbb3abaa830..98d266c8c0b4 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -180,6 +180,8 @@ def FmContract  : MIFlagEnum<"FmContract">;
 def FmAfn       : MIFlagEnum<"FmAfn">;
 def FmReassoc   : MIFlagEnum<"FmReassoc">;
 def IsExact     : MIFlagEnum<"IsExact">;
+def NoSWrap     : MIFlagEnum<"NoSWrap">;
+def NoUWrap     : MIFlagEnum<"NoUWrap">;
 
 def MIFlags;
 // def not; -> Already defined as a SDNode
@@ -309,9 +311,8 @@ def shift_of_shifted_logic_chain : GICombineRule<
          [{ return Helper.matchShiftOfShiftedLogic(*${d}, ${matchinfo}); }]),
   (apply [{ Helper.applyShiftOfShiftedLogic(*${d}, ${matchinfo}); }])>;
 
-def mul_to_shl_matchdata : GIDefMatchData<"unsigned">;
 def mul_to_shl : GICombineRule<
-  (defs root:$d, mul_to_shl_matchdata:$matchinfo),
+  (defs root:$d, unsigned_matchinfo:$matchinfo),
   (match (G_MUL $d, $op1, $op2):$mi,
          [{ return Helper.matchCombineMulToShl(*${mi}, ${matchinfo}); }]),
   (apply [{ Helper.applyCombineMulToShl(*${mi}, ${matchinfo}); }])>;
@@ -324,6 +325,28 @@ def reduce_shl_of_extend : GICombineRule<
          [{ return Helper.matchCombineShlOfExtend(*${mi}, ${matchinfo}); }]),
   (apply [{ Helper.applyCombineShlOfExtend(*${mi}, ${matchinfo}); }])>;
 
+// Combine bitreverse(shl (bitreverse x), y)) -> (lshr x, y)
+def bitreverse_shl : GICombineRule<
+  (defs root:$d),
+  (match (G_BITREVERSE $rev, $val),
+         (G_SHL $src, $rev, $amt):$mi,
+         (G_BITREVERSE $d, $src),
+         [{ return Helper.isLegalOrBeforeLegalizer({TargetOpcode::G_LSHR,
+                                                   {MRI.getType(${val}.getReg()),
+                                                    MRI.getType(${amt}.getReg())}}); }]),
+  (apply (G_LSHR $d, $val, $amt))>;
+
+// Combine bitreverse(lshr (bitreverse x), y)) -> (shl x, y)
+def bitreverse_lshr : GICombineRule<
+  (defs root:$d, build_fn_matchinfo:$matchinfo),
+  (match (G_BITREVERSE $rev, $val),
+         (G_LSHR $src, $rev, $amt):$mi,
+         (G_BITREVERSE $d, $src),
+         [{ return Helper.isLegalOrBeforeLegalizer({TargetOpcode::G_SHL,
+                                                   {MRI.getType(${val}.getReg()),
+                                                    MRI.getType(${amt}.getReg())}}); }]),
+  (apply (G_SHL $d, $val, $amt))>;
+
 // Combine (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
 // Combine (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
 def commute_shift : GICombineRule<
@@ -430,9 +453,8 @@ def select_undef_cmp: GICombineRule<
 
 // Fold (true ? x : y) -> x
 // Fold (false ? x : y) -> y
-def select_constant_cmp_matchdata : GIDefMatchData<"unsigned">;
 def select_constant_cmp: GICombineRule<
-  (defs root:$root, select_constant_cmp_matchdata:$matchinfo),
+  (defs root:$root, unsigned_matchinfo:$matchinfo),
   (match (wip_match_opcode G_SELECT):$root,
     [{ return Helper.matchConstantSelectCmp(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.replaceSingleDefInstWithOperand(*${root}, ${matchinfo}); }])
@@ -651,9 +673,8 @@ def add_p2i_to_ptradd : GICombineRule<
 >;
 
 // Fold (ptr_add (int2ptr C1), C2) -> C1 + C2
-def const_ptradd_to_i2p_matchinfo : GIDefMatchData<"APInt">;
 def const_ptradd_to_i2p: GICombineRule<
-  (defs root:$root, const_ptradd_to_i2p_matchinfo:$info),
+  (defs root:$root, apint_matchinfo:$info),
   (match (wip_match_opcode G_PTR_ADD):$root,
     [{ return Helper.matchCombineConstPtrAddToI2P(*${root}, ${info}); }]),
   (apply [{ Helper.applyCombineConstPtrAddToI2P(*${root}, ${info}); }])
@@ -721,9 +742,8 @@ def anyext_trunc_fold: GICombineRule <
 
 // Fold (zext (trunc x)) -> x if the source type is same as the destination type
 // and truncated bits are known to be zero.
-def zext_trunc_fold_matchinfo : GIDefMatchData<"Register">;
 def zext_trunc_fold: GICombineRule <
-  (defs root:$root, zext_trunc_fold_matchinfo:$matchinfo),
+  (defs root:$root, register_matchinfo:$matchinfo),
   (match (wip_match_opcode G_ZEXT):$root,
          [{ return Helper.matchCombineZextTrunc(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.replaceSingleDefInstWithReg(*${root}, ${matchinfo}); }])
@@ -1505,6 +1525,27 @@ def extract_vector_element_freeze : GICombineRule<
    [{ return Helper.matchExtractVectorElementWithFreeze(${root}, ${matchinfo}); }]),
    (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
 
+def sext_trunc : GICombineRule<
+   (defs root:$root, build_fn_matchinfo:$matchinfo),
+   (match (G_TRUNC $src, $x, (MIFlags NoSWrap)),
+          (G_SEXT $root, $src),
+   [{ return Helper.matchSextOfTrunc(${root}, ${matchinfo}); }]),
+   (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
+
+def zext_trunc : GICombineRule<
+   (defs root:$root, build_fn_matchinfo:$matchinfo),
+   (match (G_TRUNC $src, $x, (MIFlags NoUWrap)),
+          (G_ZEXT $root, $src),
+   [{ return Helper.matchZextOfTrunc(${root}, ${matchinfo}); }]),
+   (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
+
+def extract_vector_element_shuffle_vector : GICombineRule<
+   (defs root:$root, build_fn_matchinfo:$matchinfo),
+   (match (G_SHUFFLE_VECTOR $src, $src1, $src2, $mask),
+          (G_EXTRACT_VECTOR_ELT $root, $src, $idx),
+   [{ return Helper.matchExtractVectorElementWithShuffleVector(${root}, ${matchinfo}); }]),
+   (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
+
 // Combines concat operations
 def concat_matchinfo : GIDefMatchData<"SmallVector<Register>">;
 def combine_concat_vector : GICombineRule<
@@ -1582,6 +1623,7 @@ extract_vector_element_build_vector_trunc6,
 extract_vector_element_build_vector_trunc7,
 extract_vector_element_build_vector_trunc8,
 extract_vector_element_freeze,
+extract_vector_element_shuffle_vector,
 insert_vector_element_extract_vector_element
 ]>;
 
@@ -1625,6 +1667,8 @@ def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend,
 
 def phi_combines : GICombineGroup<[extend_through_phis]>;
 
+def bitreverse_shift : GICombineGroup<[bitreverse_shl, bitreverse_lshr]>;
+
 def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp,
                                       match_selects]>;
 
@@ -1654,7 +1698,7 @@ def all_combines : GICombineGroup<[trivial_combines, vector_ops_combines,
     unmerge_zext_to_zext, merge_unmerge, trunc_ext_fold, trunc_shift,
     const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
     shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
-    div_rem_to_divrem, funnel_shift_combines, commute_shift,
+    div_rem_to_divrem, funnel_shift_combines, bitreverse_shift, commute_shift,
     form_bitfield_extract, constant_fold_binops, constant_fold_fma,
     constant_fold_cast_op, fabs_fneg_fold,
     intdiv_combines, mulh_combines, redundant_neg_operands,
@@ -1662,7 +1706,7 @@ def all_combines : GICombineGroup<[trivial_combines, vector_ops_combines,
     sub_add_reg, select_to_minmax, redundant_binop_in_equality,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
     combine_concat_vector, double_icmp_zero_and_or_combine, match_addos,
-    combine_shuffle_concat]>;
+    sext_trunc, zext_trunc, combine_shuffle_concat]>;
 
 // A combine group used to for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index 48ea3cfe0277..1ba99730ca70 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -288,6 +288,10 @@ public:
     return Options.UniqueBasicBlockSectionNames;
   }
 
+  bool getSeparateNamedSections() const {
+    return Options.SeparateNamedSections;
+  }
+
   /// Return true if data objects should be emitted into their own section,
   /// corresponds to -fdata-sections.
   bool getDataSections() const {
diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h
index d37e9d9576ba..98a8b7ba337c 100644
--- a/llvm/include/llvm/Target/TargetOptions.h
+++ b/llvm/include/llvm/Target/TargetOptions.h
@@ -144,15 +144,15 @@ namespace llvm {
           DisableIntegratedAS(false), FunctionSections(false),
           DataSections(false), IgnoreXCOFFVisibility(false),
           XCOFFTracebackTable(true), UniqueSectionNames(true),
-          UniqueBasicBlockSectionNames(false), TrapUnreachable(false),
-          NoTrapAfterNoreturn(false), TLSSize(0), EmulatedTLS(false),
-          EnableTLSDESC(false), EnableIPRA(false), EmitStackSizeSection(false),
-          EnableMachineOutliner(false), EnableMachineFunctionSplitter(false),
-          SupportsDefaultOutlining(false), EmitAddrsig(false), BBAddrMap(false),
-          EmitCallSiteInfo(false), SupportsDebugEntryValues(false),
-          EnableDebugEntryValues(false), ValueTrackingVariableLocations(false),
-          ForceDwarfFrameSection(false), XRayFunctionIndex(true),
-          DebugStrictDwarf(false), Hotpatch(false),
+          UniqueBasicBlockSectionNames(false), SeparateNamedSections(false),
+          TrapUnreachable(false), NoTrapAfterNoreturn(false), TLSSize(0),
+          EmulatedTLS(false), EnableTLSDESC(false), EnableIPRA(false),
+          EmitStackSizeSection(false), EnableMachineOutliner(false),
+          EnableMachineFunctionSplitter(false), SupportsDefaultOutlining(false),
+          EmitAddrsig(false), BBAddrMap(false), EmitCallSiteInfo(false),
+          SupportsDebugEntryValues(false), EnableDebugEntryValues(false),
+          ValueTrackingVariableLocations(false), ForceDwarfFrameSection(false),
+          XRayFunctionIndex(true), DebugStrictDwarf(false), Hotpatch(false),
           PPCGenScalarMASSEntries(false), JMCInstrument(false),
           EnableCFIFixup(false), MisExpect(false), XCOFFReadOnlyPointers(false),
           FPDenormalMode(DenormalMode::IEEE, DenormalMode::IEEE) {}
@@ -277,6 +277,9 @@ namespace llvm {
     /// Use unique names for basic block sections.
     unsigned UniqueBasicBlockSectionNames : 1;
 
+    /// Emit named sections with the same name into different sections.
+    unsigned SeparateNamedSections : 1;
+
     /// Emit target-specific trap instruction for 'unreachable' IR instructions.
     unsigned TrapUnreachable : 1;
 
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 04fbaf07adfb..20c3f95173c2 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -104,29 +104,9 @@ static_assert(FEAT_MAX < 62,
               "Number of features in CPUFeatures are limited to 62 entries");
 
 // Each ArchExtKind correponds directly to a possible -target-feature.
-enum ArchExtKind : unsigned {
-  AEK_NONE = 1,
-#define ARM_EXTENSION(NAME, ENUM) ENUM,
+#define EMIT_ARCHEXTKIND_ENUM
 #include "llvm/TargetParser/AArch64TargetParserDef.inc"
-  AEK_NUM_EXTENSIONS,
-
-  // FIXME temporary fixes for inconsistent naming.
-  AEK_F32MM = AEK_MATMULFP32,
-  AEK_F64MM = AEK_MATMULFP64,
-  AEK_FCMA = AEK_COMPLXNUM,
-  AEK_FP = AEK_FPARMV8,
-  AEK_FP16 = AEK_FULLFP16,
-  AEK_I8MM = AEK_MATMULINT8,
-  AEK_JSCVT = AEK_JS,
-  AEK_PROFILE = AEK_SPE,
-  AEK_RASv2 = AEK_RASV2,
-  AEK_RAND = AEK_RANDGEN,
-  AEK_SIMD = AEK_NEON,
-  AEK_SME2p1 = AEK_SME2P1,
-  AEK_SVE2p1 = AEK_SVE2P1,
-  AEK_SME_LUTv2 = AEK_SME_LUTV2,
 
-};
 using ExtensionBitset = Bitset<AEK_NUM_EXTENSIONS>;
 
 // Represents an extension that can be enabled with -march=<arch>+<extension>.
@@ -148,111 +128,8 @@ struct ExtensionInfo {
       1000; // Maximum priority for FMV feature
 };
 
-// NOTE: If adding a new extension here, consider adding it to ExtensionMap
-// in AArch64AsmParser too, if supported as an extension name by binutils.
-// clang-format off
-inline constexpr ExtensionInfo Extensions[] = {
-    {"aes", AArch64::AEK_AES, "+aes", "-aes", FEAT_AES, "+fp-armv8,+neon", 150},
-    {"b16b16", AArch64::AEK_B16B16, "+b16b16", "-b16b16", FEAT_INIT, "", 0},
-    {"bf16", AArch64::AEK_BF16, "+bf16", "-bf16", FEAT_BF16, "+bf16", 280},
-    {"brbe", AArch64::AEK_BRBE, "+brbe", "-brbe", FEAT_INIT, "", 0},
-    {"bti", AArch64::AEK_NONE, {}, {}, FEAT_BTI, "+bti", 510},
-    {"crc", AArch64::AEK_CRC, "+crc", "-crc", FEAT_CRC, "+crc", 110},
-    {"crypto", AArch64::AEK_CRYPTO, "+crypto", "-crypto", FEAT_INIT, "+aes,+sha2", 0},
-    {"cssc", AArch64::AEK_CSSC, "+cssc", "-cssc", FEAT_INIT, "", 0},
-    {"d128", AArch64::AEK_D128, "+d128", "-d128", FEAT_INIT, "", 0},
-    {"dgh", AArch64::AEK_NONE, {}, {}, FEAT_DGH, "", 260},
-    {"dit", AArch64::AEK_NONE, {}, {}, FEAT_DIT, "+dit", 180},
-    {"dotprod", AArch64::AEK_DOTPROD, "+dotprod", "-dotprod", FEAT_DOTPROD, "+dotprod,+fp-armv8,+neon", 104},
-    {"dpb", AArch64::AEK_NONE, {}, {}, FEAT_DPB, "+ccpp", 190},
-    {"dpb2", AArch64::AEK_NONE, {}, {}, FEAT_DPB2, "+ccpp,+ccdp", 200},
-    {"ebf16", AArch64::AEK_NONE, {}, {}, FEAT_EBF16, "+bf16", 290},
-    {"f32mm", AArch64::AEK_F32MM, "+f32mm", "-f32mm", FEAT_SVE_F32MM, "+sve,+f32mm,+fullfp16,+fp-armv8,+neon", 350},
-    {"f64mm", AArch64::AEK_F64MM, "+f64mm", "-f64mm", FEAT_SVE_F64MM, "+sve,+f64mm,+fullfp16,+fp-armv8,+neon", 360},
-    {"fcma", AArch64::AEK_FCMA, "+complxnum", "-complxnum", FEAT_FCMA, "+fp-armv8,+neon,+complxnum", 220},
-    {"flagm", AArch64::AEK_FLAGM, "+flagm", "-flagm", FEAT_FLAGM, "+flagm", 20},
-    {"flagm2", AArch64::AEK_NONE, {}, {}, FEAT_FLAGM2, "+flagm,+altnzcv", 30},
-    {"fp", AArch64::AEK_FP, "+fp-armv8", "-fp-armv8", FEAT_FP, "+fp-armv8,+neon", 90},
-    {"fp16", AArch64::AEK_FP16, "+fullfp16", "-fullfp16", FEAT_FP16, "+fullfp16,+fp-armv8,+neon", 170},
-    {"fp16fml", AArch64::AEK_FP16FML, "+fp16fml", "-fp16fml", FEAT_FP16FML, "+fp16fml,+fullfp16,+fp-armv8,+neon", 175},
-    {"frintts", AArch64::AEK_NONE, {}, {}, FEAT_FRINTTS, "+fptoint", 250},
-    {"hbc", AArch64::AEK_HBC, "+hbc", "-hbc", FEAT_INIT, "", 0},
-    {"i8mm", AArch64::AEK_I8MM, "+i8mm", "-i8mm", FEAT_I8MM, "+i8mm", 270},
-    {"ite", AArch64::AEK_ITE, "+ite", "-ite", FEAT_INIT, "", 0},
-    {"jscvt", AArch64::AEK_JSCVT, "+jsconv", "-jsconv", FEAT_JSCVT, "+fp-armv8,+neon,+jsconv", 210},
-    {"ls64_accdata", AArch64::AEK_NONE, {}, {}, FEAT_LS64_ACCDATA, "+ls64", 540},
-    {"ls64_v", AArch64::AEK_NONE, {}, {}, FEAT_LS64_V, "", 530},
-    {"ls64", AArch64::AEK_LS64, "+ls64", "-ls64", FEAT_LS64, "", 520},
-    {"lse", AArch64::AEK_LSE, "+lse", "-lse", FEAT_LSE, "+lse", 80},
-    {"lse128", AArch64::AEK_LSE128, "+lse128", "-lse128", FEAT_INIT, "", 0},
-    {"memtag", AArch64::AEK_MTE, "+mte", "-mte", FEAT_MEMTAG, "", 440},
-    {"memtag2", AArch64::AEK_NONE, {}, {}, FEAT_MEMTAG2, "+mte", 450},
-    {"memtag3", AArch64::AEK_NONE, {}, {}, FEAT_MEMTAG3, "+mte", 460},
-    {"mops", AArch64::AEK_MOPS, "+mops", "-mops", FEAT_MOPS, "+mops", 650},
-    {"pauth", AArch64::AEK_PAUTH, "+pauth", "-pauth", FEAT_INIT, "", 0},
-    {"pmull", AArch64::AEK_NONE, {}, {}, FEAT_PMULL, "+aes,+fp-armv8,+neon", 160},
-    {"pmuv3", AArch64::AEK_PERFMON, "+perfmon", "-perfmon", FEAT_INIT, "", 0},
-    {"predres", AArch64::AEK_PREDRES, "+predres", "-predres", FEAT_PREDRES, "+predres", 480},
-    {"predres2", AArch64::AEK_SPECRES2, "+specres2", "-specres2", FEAT_INIT, "", 0},
-    {"profile", AArch64::AEK_PROFILE, "+spe", "-spe", FEAT_INIT, "", 0},
-    {"ras", AArch64::AEK_RAS, "+ras", "-ras", FEAT_INIT, "", 0},
-    {"rasv2", AArch64::AEK_RASv2, "+rasv2", "-rasv2", FEAT_INIT, "", 0},
-    {"rcpc", AArch64::AEK_RCPC, "+rcpc", "-rcpc", FEAT_RCPC, "+rcpc", 230},
-    {"rcpc2", AArch64::AEK_NONE, {}, {}, FEAT_RCPC2, "+rcpc", 240},
-    {"rcpc3", AArch64::AEK_RCPC3, "+rcpc3", "-rcpc3", FEAT_RCPC3, "+rcpc,+rcpc3", 241},
-    {"rdm", AArch64::AEK_RDM, "+rdm", "-rdm", FEAT_RDM, "+rdm,+fp-armv8,+neon", 108},
-    {"rng", AArch64::AEK_RAND, "+rand", "-rand", FEAT_RNG, "+rand", 10},
-    {"rpres", AArch64::AEK_NONE, {}, {}, FEAT_RPRES, "", 300},
-    {"sb", AArch64::AEK_SB, "+sb", "-sb", FEAT_SB, "+sb", 470},
-    {"sha1", AArch64::AEK_NONE, {}, {}, FEAT_SHA1, "+fp-armv8,+neon", 120},
-    {"sha2", AArch64::AEK_SHA2, "+sha2", "-sha2", FEAT_SHA2, "+sha2,+fp-armv8,+neon", 130},
-    {"sha3", AArch64::AEK_SHA3, "+sha3", "-sha3", FEAT_SHA3, "+sha3,+sha2,+fp-armv8,+neon", 140},
-    {"simd", AArch64::AEK_SIMD, "+neon", "-neon", FEAT_SIMD, "+fp-armv8,+neon", 100},
-    {"sm4", AArch64::AEK_SM4, "+sm4", "-sm4", FEAT_SM4, "+sm4,+fp-armv8,+neon", 106},
-    {"sme-f16f16", AArch64::AEK_SMEF16F16, "+sme-f16f16", "-sme-f16f16", FEAT_INIT, "+sme2,+sme-f16f16", 0},
-    {"sme-f64f64", AArch64::AEK_SMEF64F64, "+sme-f64f64", "-sme-f64f64", FEAT_SME_F64, "+sme,+sme-f64f64,+bf16", 560},
-    {"sme-i16i64", AArch64::AEK_SMEI16I64, "+sme-i16i64", "-sme-i16i64", FEAT_SME_I64, "+sme,+sme-i16i64,+bf16", 570},
-    {"sme", AArch64::AEK_SME, "+sme", "-sme", FEAT_SME, "+sme,+bf16", 430},
-    {"sme2", AArch64::AEK_SME2, "+sme2", "-sme2", FEAT_SME2, "+sme2,+sme,+bf16", 580},
-    {"sme2p1", AArch64::AEK_SME2p1, "+sme2p1", "-sme2p1", FEAT_INIT, "+sme2p1,+sme2,+sme,+bf16", 0},
-    {"ssbs", AArch64::AEK_SSBS, "+ssbs", "-ssbs", FEAT_SSBS, "", 490},
-    {"ssbs2", AArch64::AEK_NONE, {}, {}, FEAT_SSBS2, "+ssbs", 500},
-    {"sve-bf16", AArch64::AEK_NONE, {}, {}, FEAT_SVE_BF16, "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 320},
-    {"sve-ebf16", AArch64::AEK_NONE, {}, {}, FEAT_SVE_EBF16, "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 330},
-    {"sve-i8mm", AArch64::AEK_NONE, {}, {}, FEAT_SVE_I8MM, "+sve,+i8mm,+fullfp16,+fp-armv8,+neon", 340},
-    {"sve", AArch64::AEK_SVE, "+sve", "-sve", FEAT_SVE, "+sve,+fullfp16,+fp-armv8,+neon", 310},
-    {"sve2-aes", AArch64::AEK_SVE2AES, "+sve2-aes", "-sve2-aes", FEAT_SVE_AES, "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 380},
-    {"sve2-bitperm", AArch64::AEK_SVE2BITPERM, "+sve2-bitperm", "-sve2-bitperm", FEAT_SVE_BITPERM, "+sve2,+sve,+sve2-bitperm,+fullfp16,+fp-armv8,+neon", 400},
-    {"sve2-pmull128", AArch64::AEK_NONE, {}, {}, FEAT_SVE_PMULL128, "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 390},
-    {"sve2-sha3", AArch64::AEK_SVE2SHA3, "+sve2-sha3", "-sve2-sha3", FEAT_SVE_SHA3, "+sve2,+sve,+sve2-sha3,+fullfp16,+fp-armv8,+neon", 410},
-    {"sve2-sm4", AArch64::AEK_SVE2SM4, "+sve2-sm4", "-sve2-sm4", FEAT_SVE_SM4, "+sve2,+sve,+sve2-sm4,+fullfp16,+fp-armv8,+neon", 420},
-    {"sve2", AArch64::AEK_SVE2, "+sve2", "-sve2", FEAT_SVE2, "+sve2,+sve,+fullfp16,+fp-armv8,+neon", 370},
-    {"sve2p1", AArch64::AEK_SVE2p1, "+sve2p1", "-sve2p1", FEAT_INIT, "+sve2p1,+sve2,+sve,+fullfp16,+fp-armv8,+neon", 0},
-    {"the", AArch64::AEK_THE, "+the", "-the", FEAT_INIT, "", 0},
-    {"tme", AArch64::AEK_TME, "+tme", "-tme", FEAT_INIT, "", 0},
-    {"wfxt", AArch64::AEK_NONE, {}, {}, FEAT_WFXT, "+wfxt", 550},
-    {"gcs", AArch64::AEK_GCS, "+gcs", "-gcs", FEAT_INIT, "", 0},
-    {"fpmr", AArch64::AEK_FPMR, "+fpmr", "-fpmr", FEAT_INIT, "", 0},
-    {"fp8", AArch64::AEK_FP8, "+fp8", "-fp8", FEAT_INIT, "+fpmr", 0},
-    {"faminmax", AArch64::AEK_FAMINMAX, "+faminmax", "-faminmax", FEAT_INIT, "", 0},
-    {"fp8fma", AArch64::AEK_FP8FMA, "+fp8fma", "-fp8fma", FEAT_INIT, "+fpmr", 0},
-    {"ssve-fp8fma", AArch64::AEK_SSVE_FP8FMA, "+ssve-fp8fma", "-ssve-fp8fma", FEAT_INIT, "+sme2", 0},
-    {"fp8dot2", AArch64::AEK_FP8DOT2, "+fp8dot2", "-fp8dot2", FEAT_INIT, "", 0},
-    {"ssve-fp8dot2", AArch64::AEK_SSVE_FP8DOT2, "+ssve-fp8dot2", "-ssve-fp8dot2", FEAT_INIT, "+sme2", 0},
-    {"fp8dot4", AArch64::AEK_FP8DOT4, "+fp8dot4", "-fp8dot4", FEAT_INIT, "", 0},
-    {"ssve-fp8dot4", AArch64::AEK_SSVE_FP8DOT4, "+ssve-fp8dot4", "-ssve-fp8dot4", FEAT_INIT, "+sme2", 0},
-    {"lut", AArch64::AEK_LUT, "+lut", "-lut", FEAT_INIT, "", 0},
-    {"sme-lutv2", AArch64::AEK_SME_LUTv2, "+sme-lutv2", "-sme-lutv2", FEAT_INIT, "", 0},
-    {"sme-f8f16", AArch64::AEK_SMEF8F16, "+sme-f8f16", "-sme-f8f16", FEAT_INIT, "+fp8,+sme2", 0},
-    {"sme-f8f32", AArch64::AEK_SMEF8F32, "+sme-f8f32", "-sme-f8f32", FEAT_INIT, "+sme2,+fp8", 0},
-    {"sme-fa64",  AArch64::AEK_SMEFA64,  "+sme-fa64", "-sme-fa64",  FEAT_INIT, "", 0},
-    {"cpa", AArch64::AEK_CPA, "+cpa", "-cpa", FEAT_INIT, "", 0},
-    {"pauth-lr", AArch64::AEK_PAUTHLR, "+pauth-lr", "-pauth-lr", FEAT_INIT, "", 0},
-    {"tlbiw", AArch64::AEK_TLBIW, "+tlbiw", "-tlbiw", FEAT_INIT, "", 0},
-    // Special cases
-    {"none", AArch64::AEK_NONE, {}, {}, FEAT_INIT, "", ExtensionInfo::MaxFMVPriority},
-};
-// clang-format on
+#define EMIT_EXTENSIONS
+#include "llvm/TargetParser/AArch64TargetParserDef.inc"
 
 struct ExtensionSet {
   // Set of extensions which are currently enabled.
@@ -328,7 +205,7 @@ inline constexpr ExtensionDependency ExtensionDependencies[] = {
   {AEK_SVE, AEK_SVE2},
   {AEK_SVE, AEK_F32MM},
   {AEK_SVE, AEK_F64MM},
-  {AEK_SVE2, AEK_SVE2p1},
+  {AEK_SVE2, AEK_SVE2P1},
   {AEK_SVE2, AEK_SVE2BITPERM},
   {AEK_SVE2, AEK_SVE2AES},
   {AEK_SVE2, AEK_SVE2SHA3},
@@ -340,7 +217,7 @@ inline constexpr ExtensionDependency ExtensionDependencies[] = {
   {AEK_SME, AEK_SMEF64F64},
   {AEK_SME, AEK_SMEI16I64},
   {AEK_SME, AEK_SMEFA64},
-  {AEK_SME2, AEK_SME2p1},
+  {AEK_SME2, AEK_SME2P1},
   {AEK_SME2, AEK_SSVE_FP8FMA},
   {AEK_SME2, AEK_SSVE_FP8DOT2},
   {AEK_SME2, AEK_SSVE_FP8DOT4},
@@ -350,7 +227,7 @@ inline constexpr ExtensionDependency ExtensionDependencies[] = {
   {AEK_FP8, AEK_SMEF8F32},
   {AEK_LSE, AEK_LSE128},
   {AEK_PREDRES, AEK_SPECRES2},
-  {AEK_RAS, AEK_RASv2},
+  {AEK_RAS, AEK_RASV2},
   {AEK_RCPC, AEK_RCPC3},
 };
 // clang-format on
@@ -429,7 +306,7 @@ inline constexpr ArchInfo ARMV8_7A  = { VersionTuple{8, 7}, AProfile, "armv8.7-a
 inline constexpr ArchInfo ARMV8_8A  = { VersionTuple{8, 8}, AProfile, "armv8.8-a", "+v8.8a", (ARMV8_7A.DefaultExts |
                                         AArch64::ExtensionBitset({AArch64::AEK_MOPS, AArch64::AEK_HBC}))};
 inline constexpr ArchInfo ARMV8_9A  = { VersionTuple{8, 9}, AProfile, "armv8.9-a", "+v8.9a", (ARMV8_8A.DefaultExts |
-                                        AArch64::ExtensionBitset({AArch64::AEK_SPECRES2, AArch64::AEK_CSSC, AArch64::AEK_RASv2}))};
+                                        AArch64::ExtensionBitset({AArch64::AEK_SPECRES2, AArch64::AEK_CSSC, AArch64::AEK_RASV2}))};
 inline constexpr ArchInfo ARMV9A    = { VersionTuple{9, 0}, AProfile, "armv9-a", "+v9a", (ARMV8_5A.DefaultExts |
                                         AArch64::ExtensionBitset({AArch64::AEK_FP16, AArch64::AEK_SVE, AArch64::AEK_SVE2}))};
 inline constexpr ArchInfo ARMV9_1A  = { VersionTuple{9, 1}, AProfile, "armv9.1-a", "+v9.1a", (ARMV9A.DefaultExts |
@@ -438,7 +315,7 @@ inline constexpr ArchInfo ARMV9_2A  = { VersionTuple{9, 2}, AProfile, "armv9.2-a
 inline constexpr ArchInfo ARMV9_3A  = { VersionTuple{9, 3}, AProfile, "armv9.3-a", "+v9.3a", (ARMV9_2A.DefaultExts |
                                         AArch64::ExtensionBitset({AArch64::AEK_MOPS, AArch64::AEK_HBC}))};
 inline constexpr ArchInfo ARMV9_4A  = { VersionTuple{9, 4}, AProfile, "armv9.4-a", "+v9.4a", (ARMV9_3A.DefaultExts |
-                                        AArch64::ExtensionBitset({AArch64::AEK_SPECRES2, AArch64::AEK_CSSC, AArch64::AEK_RASv2}))};
+                                        AArch64::ExtensionBitset({AArch64::AEK_SPECRES2, AArch64::AEK_CSSC, AArch64::AEK_RASV2}))};
 inline constexpr ArchInfo ARMV9_5A  = { VersionTuple{9, 5}, AProfile, "armv9.5-a", "+v9.5a", (ARMV9_4A.DefaultExts |
                                         AArch64::ExtensionBitset({AArch64::AEK_CPA}))};
 // For v8-R, we do not enable crypto and align with GCC that enables a more minimal set of optional architecture extensions.
@@ -799,6 +676,8 @@ inline constexpr Alias CpuAliases[] = {{"cobalt-100", "neoverse-n2"},
 
 inline constexpr Alias ExtAliases[] = {{"rdma", "rdm"}};
 
+const ExtensionInfo &getExtensionByID(ArchExtKind(ExtID));
+
 bool getExtensionFeatures(
     const AArch64::ExtensionBitset &Extensions,
     std::vector<StringRef> &Features);
diff --git a/llvm/include/llvm/TargetParser/ARMTargetParser.def b/llvm/include/llvm/TargetParser/ARMTargetParser.def
index b821d224d7a8..d7b77a6ef5b6 100644
--- a/llvm/include/llvm/TargetParser/ARMTargetParser.def
+++ b/llvm/include/llvm/TargetParser/ARMTargetParser.def
@@ -183,7 +183,7 @@ ARM_ARCH("armv9.5-a", ARMV9_5A, "9.5-A", "+v9.5a", ARMBuildAttrs::CPUArch::v9_A,
           ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS |
           ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_I8MM))
 ARM_ARCH("armv8-r", ARMV8R, "8-R", "+v8r", ARMBuildAttrs::CPUArch::v8_R,
-         FK_NEON_FP_ARMV8,
+         FK_FPV5_SP_D16,
          (ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
           ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC))
 ARM_ARCH("armv8-m.base", ARMV8MBaseline, "8-M.Baseline", "+v8m.base",
@@ -329,7 +329,7 @@ ARM_CPU_NAME("cortex-r7", ARMV7R, FK_VFPV3_D16_FP16, false,
              (ARM::AEK_MP | ARM::AEK_HWDIVARM))
 ARM_CPU_NAME("cortex-r8", ARMV7R, FK_VFPV3_D16_FP16, false,
              (ARM::AEK_MP | ARM::AEK_HWDIVARM))
-ARM_CPU_NAME("cortex-r52", ARMV8R, FK_NEON_FP_ARMV8, true, ARM::AEK_NONE)
+ARM_CPU_NAME("cortex-r52", ARMV8R, FK_NEON_FP_ARMV8, false, ARM::AEK_NONE)
 ARM_CPU_NAME("sc300", ARMV7M, FK_NONE, false, ARM::AEK_NONE)
 ARM_CPU_NAME("cortex-m3", ARMV7M, FK_NONE, true, ARM::AEK_NONE)
 ARM_CPU_NAME("cortex-m4", ARMV7EM, FK_FPV4_SP_D16, true, ARM::AEK_NONE)
diff --git a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h
index 8ae553ca80dd..f6115718e9f5 100644
--- a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h
+++ b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h
@@ -46,7 +46,7 @@ struct ParsedBranchProtection {
 };
 
 bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP,
-                           StringRef &Err);
+                           StringRef &Err, bool EnablePAuthLR = false);
 
 } // namespace ARM
 } // namespace llvm
diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.def b/llvm/include/llvm/TargetParser/PPCTargetParser.def
index 88c7304659c4..d462c9c9ffe6 100644
--- a/llvm/include/llvm/TargetParser/PPCTargetParser.def
+++ b/llvm/include/llvm/TargetParser/PPCTargetParser.def
@@ -132,6 +132,14 @@ PPC_LNX_CPU("power10",47)
 #ifndef AIX_POWERPC_USE_SYS_CONF
   #define AIX_POWERPC_USE_SYS_CONF
   #define AIX_SYSCON_IMPL_IDX 1
+  #define AIX_SYSCON_CACHE_IDX 5
+  #define AIX_SYSCON_SMT_IDX 44
+  #define AIX_SYSCON_VMX_IDX 46
+  #define AIX_SYSCON_DFP_IDX  53
+
+  #define SYS_CALL_TM_VER   59
+  #define SYS_CALL_MMA_VER  62
+
   #define AIX_PPC7_VALUE 0x00008000
   #define AIX_PPC8_VALUE 0x00010000
   #define AIX_PPC9_VALUE 0x00020000
@@ -141,46 +149,97 @@ PPC_LNX_CPU("power10",47)
   #define AIX_BUILTIN_PPC_TRUE 1
   #define AIX_BUILTIN_PPC_FALSE 0
   #define USE_SYS_CONF 2
-
-  // Supported COMPARE_OP values.
-  #define COMP_EQ  0
-
+  #define SYS_CALL 3
 #endif
 
-// The value of SUPPORT_METHOD can be AIX_BUILTIN_PPC_TRUE,
-// AIX_BUILTIN_PPC_FALSE, or USE_SYS_CONF.
-// When the value of SUPPORT_METHOD is USE_SYS_CONF, the return value
-// depends on the result of comparing the data member of
-// _system_configuration specified by INDEX with a certain value.
+// The value of SUPPORT_METHOD can be:
+//   AIX_BUILTIN_PPC_TRUE : feature supported
+//   AIX_BUILTIN_PPC_FALSE : feature not supported
+//   USE_SYS_CONF : return value depends on comparing VALUE with the specified
+//                  data member of _system_configuration at INDEX, where the
+//                  data member is masked by Mask.
+//   SYS_CALL : return value depends on comparing a VALUE with the return value
+//              of calling `getsystemcfg` with the parameter INDEX, which is
+//              then masked by Mask.
 
 #ifndef PPC_AIX_CPU
   #define PPC_AIX_CPU(NAME, SUPPORT_METHOD, INDEX, COMPARE_OP, VALUE)
 #endif
 
-// __builtin_cpu_is() is supported only on Power7 and up.
-PPC_AIX_CPU("power4",AIX_BUILTIN_PPC_FALSE,0,0,0)
-PPC_AIX_CPU("ppc970",AIX_BUILTIN_PPC_FALSE,0,0,0)
-PPC_AIX_CPU("power5",AIX_BUILTIN_PPC_FALSE,0,0,0)
-PPC_AIX_CPU("power5+",AIX_BUILTIN_PPC_FALSE,0,0,0)
-PPC_AIX_CPU("power6",AIX_BUILTIN_PPC_FALSE,0,0,0)
-PPC_AIX_CPU("ppc-cell-be",AIX_BUILTIN_PPC_FALSE,0,0,0)
-PPC_AIX_CPU("power6x",AIX_BUILTIN_PPC_FALSE,0,0,0)
-PPC_AIX_CPU("ppca2",AIX_BUILTIN_PPC_FALSE,0,0,0)
-PPC_AIX_CPU("ppc405",AIX_BUILTIN_PPC_FALSE,0,0,0)
-PPC_AIX_CPU("ppc440",AIX_BUILTIN_PPC_FALSE,0,0,0)
-PPC_AIX_CPU("ppc464",AIX_BUILTIN_PPC_FALSE,0,0,0)
-PPC_AIX_CPU("ppc476",AIX_BUILTIN_PPC_FALSE,0,0,0)
-PPC_AIX_CPU("power7",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC7_VALUE)
-PPC_AIX_CPU("power8",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC8_VALUE)
-PPC_AIX_CPU("power9",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC9_VALUE)
-PPC_AIX_CPU("power10",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC10_VALUE)
+// __builtin_cpu_is() and __builtin_cpu_supports() are supported only on Power7 and up.
+PPC_AIX_CPU("power4",AIX_BUILTIN_PPC_FALSE,0,CmpInst::Predicate(),0)
+PPC_AIX_CPU("ppc970",AIX_BUILTIN_PPC_FALSE,0,CmpInst::Predicate(),0)
+PPC_AIX_CPU("power5",AIX_BUILTIN_PPC_FALSE,0,CmpInst::Predicate(),0)
+PPC_AIX_CPU("power5+",AIX_BUILTIN_PPC_FALSE,0,CmpInst::Predicate(),0)
+PPC_AIX_CPU("power6",AIX_BUILTIN_PPC_FALSE,0,CmpInst::Predicate(),0)
+PPC_AIX_CPU("ppc-cell-be",AIX_BUILTIN_PPC_FALSE,0,CmpInst::Predicate(),0)
+PPC_AIX_CPU("power6x",AIX_BUILTIN_PPC_FALSE,0,CmpInst::Predicate(),0)
+PPC_AIX_CPU("ppca2",AIX_BUILTIN_PPC_FALSE,0,CmpInst::Predicate(),0)
+PPC_AIX_CPU("ppc405",AIX_BUILTIN_PPC_FALSE,0,CmpInst::Predicate(),0)
+PPC_AIX_CPU("ppc440",AIX_BUILTIN_PPC_FALSE,0,CmpInst::Predicate(),0)
+PPC_AIX_CPU("ppc464",AIX_BUILTIN_PPC_FALSE,0,CmpInst::Predicate(),0)
+PPC_AIX_CPU("ppc476",AIX_BUILTIN_PPC_FALSE,0,CmpInst::Predicate(),0)
+PPC_AIX_CPU("power7",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,ICmpInst::ICMP_EQ,AIX_PPC7_VALUE)
+PPC_AIX_CPU("power8",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,ICmpInst::ICMP_EQ,AIX_PPC8_VALUE)
+PPC_AIX_CPU("power9",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,ICmpInst::ICMP_EQ,AIX_PPC9_VALUE)
+PPC_AIX_CPU("power10",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,ICmpInst::ICMP_EQ,AIX_PPC10_VALUE)
 #undef PPC_AIX_CPU
 
+#ifndef PPC_AIX_FEATURE
+#define PPC_AIX_FEATURE(NAME,DESC,SUPPORT_METHOD,INDEX,MASK,COMPARE_OP,VALUE)
+#endif
+
+PPC_AIX_FEATURE("4xxmac","4xx CPU has a Multiply Accumulator",AIX_BUILTIN_PPC_FALSE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("altivec","CPU has a SIMD/Vector Unit",USE_SYS_CONF,AIX_SYSCON_VMX_IDX,0,ICmpInst::ICMP_UGT,0)
+PPC_AIX_FEATURE("arch_2_05","CPU supports ISA 205 (eg, POWER6)",AIX_BUILTIN_PPC_TRUE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("arch_2_06","CPU supports ISA 206 (eg, POWER7)",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,0,ICmpInst::ICMP_UGE,AIX_PPC7_VALUE)
+PPC_AIX_FEATURE("arch_2_07","CPU supports ISA 207 (eg, POWER8)",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,0,ICmpInst::ICMP_UGE,AIX_PPC8_VALUE)
+PPC_AIX_FEATURE("arch_3_00","CPU supports ISA 30 (eg, POWER9)", USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,0,ICmpInst::ICMP_UGE,AIX_PPC9_VALUE)
+PPC_AIX_FEATURE("arch_3_1","CPU supports ISA 31 (eg, POWER10)", USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,0,ICmpInst::ICMP_UGE,AIX_PPC10_VALUE)
+PPC_AIX_FEATURE("booke","CPU supports the Embedded ISA category",AIX_BUILTIN_PPC_FALSE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("cellbe","CPU has a CELL broadband engine",AIX_BUILTIN_PPC_FALSE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("darn","CPU supports the darn (deliver a random number) instruction",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,0,ICmpInst::ICMP_UGE,AIX_PPC9_VALUE)
+PPC_AIX_FEATURE("dfp","CPU has a decimal floating point unit",USE_SYS_CONF,AIX_SYSCON_DFP_IDX,0,ICmpInst::ICMP_NE,0)
+PPC_AIX_FEATURE("dscr","CPU supports the data stream control register",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,0,ICmpInst::ICMP_UGE,AIX_PPC8_VALUE)
+PPC_AIX_FEATURE("ebb","CPU supports event base branching",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,0,ICmpInst::ICMP_UGE,AIX_PPC8_VALUE)
+PPC_AIX_FEATURE("efpsingle","CPU has a SPE single precision floating point unit",AIX_BUILTIN_PPC_FALSE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("efpdouble","CPU has a SPE double precision floating point unit",AIX_BUILTIN_PPC_FALSE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("fpu","CPU has a floating point unit",AIX_BUILTIN_PPC_TRUE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("htm","CPU has hardware transaction memory instructions",SYS_CALL,SYS_CALL_TM_VER,0,ICmpInst::ICMP_UGT,0)
+PPC_AIX_FEATURE("isel","CPU supports the integer select instruction",AIX_BUILTIN_PPC_TRUE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("mma","CPU supports the matrix-multiply assist instructions",SYS_CALL,SYS_CALL_MMA_VER,0,ICmpInst::ICMP_UGT,0)
+PPC_AIX_FEATURE("mmu","CPU has a memory management unit",AIX_BUILTIN_PPC_TRUE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("pa6t","CPU supports the PA Semi 6T CORE ISA",AIX_BUILTIN_PPC_FALSE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("power4","CPU supports ISA 200 (eg, POWER4)",AIX_BUILTIN_PPC_TRUE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("power5","CPU supports ISA 202 (eg, POWER5)",AIX_BUILTIN_PPC_TRUE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("power5+","CPU supports ISA 203 (eg, POWER5+)",AIX_BUILTIN_PPC_TRUE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("power6x","CPU supports ISA 205 (eg, POWER6)",AIX_BUILTIN_PPC_FALSE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("ppc32","CPU supports 32-bit mode execution",AIX_BUILTIN_PPC_TRUE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("ppc601","CPU supports the old POWER ISA (eg, 601)",AIX_BUILTIN_PPC_FALSE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("ppc64","CPU supports 64-bit mode execution",AIX_BUILTIN_PPC_TRUE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("ppcle","CPU supports a little-endian mode that uses address swizzling",AIX_BUILTIN_PPC_FALSE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("smt","CPU supports simultaneous multi-threading",USE_SYS_CONF,AIX_SYSCON_SMT_IDX,0x3,ICmpInst::ICMP_EQ,0x3)
+PPC_AIX_FEATURE("spe","CPU has a signal processing extension unit",AIX_BUILTIN_PPC_FALSE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("tar","CPU supports the target address register",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,0,ICmpInst::ICMP_UGE,AIX_PPC8_VALUE)
+PPC_AIX_FEATURE("true_le","CPU supports true little-endian mode",AIX_BUILTIN_PPC_TRUE,0,0,CmpInst::Predicate(),0)
+PPC_AIX_FEATURE("ucache","CPU has unified I/D cache",USE_SYS_CONF,AIX_SYSCON_CACHE_IDX,0x00000002,ICmpInst::ICMP_EQ,0x00000002)
+PPC_AIX_FEATURE("vsx","CPU supports the vector-scalar extension",USE_SYS_CONF,AIX_SYSCON_VMX_IDX,0,ICmpInst::ICMP_UGT,1)
+#undef PPC_AIX_FEATURE
+
 // PPC_SYSTEMCONFIG_TYPE defines the IR data structure of kernel variable
 // `_system_configuration`, that is found in the AIX OS header file: </usr/include/sys/systemcfg.h>.
 #ifndef PPC_SYSTEMCONFIG_TYPE
 #define PPC_SYSTEMCONFIG_TYPE \
-Int32Ty, Int32Ty, Int32Ty
+Int32Ty, Int32Ty, Int32Ty, Int32Ty, Int32Ty, Int32Ty, \
+Int32Ty, Int32Ty, Int32Ty, Int32Ty, Int32Ty, Int32Ty, \
+Int32Ty, Int32Ty, Int32Ty, Int32Ty, Int32Ty, Int32Ty, \
+Int32Ty, Int32Ty, Int32Ty, Int32Ty, Int32Ty, Int32Ty, \
+Int32Ty, Int32Ty, Int32Ty, Int32Ty, Int32Ty, Int32Ty, \
+Int32Ty, Int32Ty, Int64Ty, Int32Ty, Int32Ty, Int32Ty, \
+Int32Ty, Int64Ty, Int64Ty, Int64Ty, Int64Ty, Int32Ty, \
+Int32Ty, Int32Ty, Int32Ty, Int32Ty, Int32Ty, Int64Ty, \
+Int32Ty, Int8Ty, Int8Ty, Int8Ty, Int8Ty, Int32Ty, \
+Int32Ty, Int16Ty, Int16Ty, llvm::ArrayType::get(Int32Ty,3), Int32Ty
 #endif
 
 #endif // !PPC_TGT_PARSER_UNDEF_MACROS
diff --git a/llvm/include/llvm/TargetParser/RISCVISAInfo.h b/llvm/include/llvm/TargetParser/RISCVISAInfo.h
index 36617a9b6259..12f6b46fb3ce 100644
--- a/llvm/include/llvm/TargetParser/RISCVISAInfo.h
+++ b/llvm/include/llvm/TargetParser/RISCVISAInfo.h
@@ -87,7 +87,7 @@ private:
 
   RISCVISAUtils::OrderedExtensionMap Exts;
 
-  void addExtension(StringRef ExtName, RISCVISAUtils::ExtensionVersion Version);
+  bool addExtension(StringRef ExtName, RISCVISAUtils::ExtensionVersion Version);
 
   Error checkDependency();
 
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index 7da30e6cf96f..8f9d99816931 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -176,6 +176,7 @@ public:
     DXILSubArch_v1_6,
     DXILSubArch_v1_7,
     DXILSubArch_v1_8,
+    LatestDXILSubArch = DXILSubArch_v1_8,
   };
   enum VendorType {
     UnknownVendor,
@@ -428,6 +429,10 @@ public:
   /// (SubArch).  This should only be called with Vulkan SPIR-V triples.
   VersionTuple getVulkanVersion() const;
 
+  /// Parse the DXIL version number from the DXIL version
+  /// (SubArch).  This should only be called with DXIL triples.
+  VersionTuple getDXILVersion() const;
+
   /// @}
   /// @name Direct Component Access
   /// @{
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index a9ed56fcd470..5670767ff7ed 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -253,6 +253,7 @@ X86_FEATURE_COMPAT(USERMSR,         "usermsr",                0)
 X86_FEATURE_COMPAT(AVX10_1,         "avx10.1-256",            0)
 X86_FEATURE_COMPAT(AVX10_1_512,     "avx10.1-512",            0)
 X86_FEATURE       (EVEX512,         "evex512")
+X86_FEATURE       (NF,              "nf")
 X86_FEATURE       (CF,              "cf")
 // These features aren't really CPU features, but the frontend can set them.
 X86_FEATURE       (RETPOLINE_EXTERNAL_THUNK,    "retpoline-external-thunk")
diff --git a/llvm/include/llvm/TextAPI/Utils.h b/llvm/include/llvm/TextAPI/Utils.h
index 87550851f091..00dfd63e14f9 100644
--- a/llvm/include/llvm/TextAPI/Utils.h
+++ b/llvm/include/llvm/TextAPI/Utils.h
@@ -32,6 +32,8 @@
 namespace llvm::MachO {
 
 using PathSeq = std::vector<std::string>;
+using PathToPlatform = std::pair<std::string, std::optional<PlatformType>>;
+using PathToPlatformSeq = std::vector<PathToPlatform>;
 
 // Defines simple struct for storing symbolic links.
 struct SymLink {
@@ -87,5 +89,12 @@ using AliasMap = std::map<AliasEntry, AliasEntry>;
 /// \return Lookup table of alias to their base symbol.
 Expected<AliasMap> parseAliasList(std::unique_ptr<llvm::MemoryBuffer> &Buffer);
 
+/// Pickup active paths for a given platform.
+///
+/// \param Paths File or search paths to pick up.
+/// \param Platform Platform to collect paths for.
+PathSeq getPathsForPlatform(const PathToPlatformSeq &Paths,
+                            PlatformType Platform);
+
 } // namespace llvm::MachO
 #endif // LLVM_TEXTAPI_UTILS_H
diff --git a/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h b/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h
index 2d76546316fa..3568417510f1 100644
--- a/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h
+++ b/llvm/include/llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h
@@ -8,7 +8,7 @@
 /// \file
 ///
 /// AggressiveInstCombiner - Combine expression patterns to form expressions
-/// with fewer, simple instructions. This pass does not modify the CFG.
+/// with fewer, simple instructions.
 ///
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h b/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h
index 38afa0c6fd32..5256aff56205 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h
@@ -12,13 +12,16 @@
 #ifndef LLVM_TRANSFORMS_INSTRUMENTATION_PGOCTXPROFLOWERING_H
 #define LLVM_TRANSFORMS_INSTRUMENTATION_PGOCTXPROFLOWERING_H
 
+#include "llvm/IR/PassManager.h"
 namespace llvm {
 class Type;
 
-class PGOCtxProfLoweringPass {
+class PGOCtxProfLoweringPass : public PassInfoMixin<PGOCtxProfLoweringPass> {
 public:
   explicit PGOCtxProfLoweringPass() = default;
   static bool isContextualIRPGOEnabled();
+
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
 };
 } // namespace llvm
 #endif
diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
index f7358ac9b1ee..65d43775bdc1 100644
--- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
+++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
@@ -142,7 +142,7 @@ public:
   }
 
   Constant *evaluateOnPredecessorEdge(BasicBlock *BB, BasicBlock *PredPredBB,
-                                      Value *cond);
+                                      Value *cond, const DataLayout &DL);
   bool maybethreadThroughTwoBasicBlocks(BasicBlock *BB, Value *Cond);
   void threadThroughTwoBasicBlocks(BasicBlock *PredPredBB, BasicBlock *PredBB,
                                    BasicBlock *BB, BasicBlock *SuccBB);
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 187ace3a0cbe..345e09dce0b2 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -372,15 +372,6 @@ RecurKind getMinMaxReductionRecurKind(Intrinsic::ID RdxID);
 /// Returns the comparison predicate used when expanding a min/max reduction.
 CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK);
 
-/// See RecurrenceDescriptor::isAnyOfPattern for a description of the pattern we
-/// are trying to match. In this pattern, we are only ever selecting between two
-/// values: 1) an initial start value \p StartVal of the reduction PHI, and 2) a
-/// loop invariant value. If any of lane value in \p Left, \p Right is not equal
-/// to \p StartVal, select the loop invariant value. This is done by selecting
-/// \p Right iff \p Left is equal to \p StartVal.
-Value *createAnyOfOp(IRBuilderBase &Builder, Value *StartVal, RecurKind RK,
-                     Value *Left, Value *Right);
-
 /// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
 /// The Builder's fast-math-flags must be set to propagate the expected values.
 Value *createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
index e8b03f81b348..bd804dc11266 100644
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -22,6 +22,7 @@
 namespace llvm {
 
 class AssumptionCache;
+class AAResults;
 class BasicBlock;
 class BlockFrequencyInfo;
 class DependenceInfo;
@@ -79,7 +80,8 @@ LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
                             AssumptionCache *AC,
                             const llvm::TargetTransformInfo *TTI,
                             OptimizationRemarkEmitter *ORE, bool PreserveLCSSA,
-                            Loop **RemainderLoop = nullptr);
+                            Loop **RemainderLoop = nullptr,
+                            AAResults *AA = nullptr);
 
 bool UnrollRuntimeLoopRemainder(
     Loop *L, unsigned Count, bool AllowExpensiveTripCount,
@@ -102,7 +104,8 @@ bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
 void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
                              ScalarEvolution *SE, DominatorTree *DT,
                              AssumptionCache *AC,
-                             const TargetTransformInfo *TTI);
+                             const TargetTransformInfo *TTI,
+                             AAResults *AA = nullptr);
 
 MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
 
diff --git a/llvm/include/llvm/Transforms/Utils/ValueMapper.h b/llvm/include/llvm/Transforms/Utils/ValueMapper.h
index 54e3e62dc3af..743cfeb7ef3f 100644
--- a/llvm/include/llvm/Transforms/Utils/ValueMapper.h
+++ b/llvm/include/llvm/Transforms/Utils/ValueMapper.h
@@ -180,9 +180,8 @@ public:
   Constant *mapConstant(const Constant &C);
 
   void remapInstruction(Instruction &I);
-  void remapDbgVariableRecord(Module *M, DbgVariableRecord &V);
-  void remapDbgVariableRecordRange(Module *M,
-                                   iterator_range<DbgRecordIterator> Range);
+  void remapDbgRecord(Module *M, DbgRecord &V);
+  void remapDbgRecordRange(Module *M, iterator_range<DbgRecordIterator> Range);
   void remapFunction(Function &F);
   void remapGlobalObjectMetadata(GlobalObject &GO);
 
@@ -268,26 +267,25 @@ inline void RemapInstruction(Instruction *I, ValueToValueMapTy &VM,
   ValueMapper(VM, Flags, TypeMapper, Materializer).remapInstruction(*I);
 }
 
-/// Remap the Values used in the DbgVariableRecord \a V using the value map \a
+/// Remap the Values used in the DbgRecord \a DR using the value map \a
 /// VM.
-inline void RemapDbgVariableRecord(Module *M, DbgVariableRecord *V,
-                                   ValueToValueMapTy &VM,
-                                   RemapFlags Flags = RF_None,
-                                   ValueMapTypeRemapper *TypeMapper = nullptr,
-                                   ValueMaterializer *Materializer = nullptr) {
-  ValueMapper(VM, Flags, TypeMapper, Materializer)
-      .remapDbgVariableRecord(M, *V);
+inline void RemapDbgRecord(Module *M, DbgRecord *DR, ValueToValueMapTy &VM,
+                           RemapFlags Flags = RF_None,
+                           ValueMapTypeRemapper *TypeMapper = nullptr,
+                           ValueMaterializer *Materializer = nullptr) {
+  ValueMapper(VM, Flags, TypeMapper, Materializer).remapDbgRecord(M, *DR);
 }
 
-/// Remap the Values used in the DbgVariableRecord \a V using the value map \a
+/// Remap the Values used in the DbgRecords \a Range using the value map \a
 /// VM.
-inline void
-RemapDbgVariableRecordRange(Module *M, iterator_range<DbgRecordIterator> Range,
-                            ValueToValueMapTy &VM, RemapFlags Flags = RF_None,
-                            ValueMapTypeRemapper *TypeMapper = nullptr,
-                            ValueMaterializer *Materializer = nullptr) {
+inline void RemapDbgRecordRange(Module *M,
+                                iterator_range<DbgRecordIterator> Range,
+                                ValueToValueMapTy &VM,
+                                RemapFlags Flags = RF_None,
+                                ValueMapTypeRemapper *TypeMapper = nullptr,
+                                ValueMaterializer *Materializer = nullptr) {
   ValueMapper(VM, Flags, TypeMapper, Materializer)
-      .remapDbgVariableRecordRange(M, Range);
+      .remapDbgRecordRange(M, Range);
 }
 
 /// Remap the operands, metadata, arguments, and instructions of a function.
diff --git a/llvm/lib/Analysis/BlockFrequencyInfo.cpp b/llvm/lib/Analysis/BlockFrequencyInfo.cpp
index ebad8388cbe4..d1b21e8c83f2 100644
--- a/llvm/lib/Analysis/BlockFrequencyInfo.cpp
+++ b/llvm/lib/Analysis/BlockFrequencyInfo.cpp
@@ -188,12 +188,11 @@ void BlockFrequencyInfo::calculate(const Function &F,
     BFI.reset(new ImplType);
   BFI->calculate(F, BPI, LI);
   if (ViewBlockFreqPropagationDAG != GVDT_None &&
-      (ViewBlockFreqFuncName.empty() ||
-       F.getName().equals(ViewBlockFreqFuncName))) {
+      (ViewBlockFreqFuncName.empty() || F.getName() == ViewBlockFreqFuncName)) {
     view();
   }
   if (PrintBFI &&
-      (PrintBFIFuncName.empty() || F.getName().equals(PrintBFIFuncName))) {
+      (PrintBFIFuncName.empty() || F.getName() == PrintBFIFuncName)) {
     print(dbgs());
   }
 }
diff --git a/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
index 6448ed66dc51..cd3e3a499132 100644
--- a/llvm/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
@@ -630,8 +630,8 @@ computeUnlikelySuccessors(const BasicBlock *BB, Loop *L,
       if (!CmpLHSConst)
         continue;
       // Now constant-evaluate the compare
-      Constant *Result = ConstantExpr::getCompare(CI->getPredicate(),
-                                                  CmpLHSConst, CmpConst, true);
+      Constant *Result = ConstantFoldCompareInstOperands(
+          CI->getPredicate(), CmpLHSConst, CmpConst, DL);
       // If the result means we don't branch to the block then that block is
       // unlikely.
       if (Result &&
@@ -1273,9 +1273,8 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LoopI,
   EstimatedBlockWeight.clear();
   SccI.reset();
 
-  if (PrintBranchProb &&
-      (PrintBranchProbFuncName.empty() ||
-       F.getName().equals(PrintBranchProbFuncName))) {
+  if (PrintBranchProb && (PrintBranchProbFuncName.empty() ||
+                          F.getName() == PrintBranchProbFuncName)) {
     print(dbgs());
   }
 }
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 749374a3aa48..046a76945380 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1268,10 +1268,10 @@ Constant *llvm::ConstantFoldCompareInstOperands(
       Value *Stripped1 =
           Ops1->stripAndAccumulateInBoundsConstantOffsets(DL, Offset1);
       if (Stripped0 == Stripped1)
-        return ConstantExpr::getCompare(
-            ICmpInst::getSignedPredicate(Predicate),
-            ConstantInt::get(CE0->getContext(), Offset0),
-            ConstantInt::get(CE0->getContext(), Offset1));
+        return ConstantInt::getBool(
+            Ops0->getContext(),
+            ICmpInst::compare(Offset0, Offset1,
+                              ICmpInst::getSignedPredicate(Predicate)));
     }
   } else if (isa<ConstantExpr>(Ops1)) {
     // If RHS is a constant expression, but the left side isn't, swap the
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index c75460f44c1d..f5b17dca4973 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -701,21 +701,26 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
 
   void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                         bool DefaultDestUndefined) override {
-    if (!DefaultDestUndefined)
-      addCost(2 * InstrCost);
     // If suitable for a jump table, consider the cost for the table size and
     // branch to destination.
     // Maximum valid cost increased in this function.
     if (JumpTableSize) {
+      // Suppose a default branch includes one compare and one conditional
+      // branch if it's reachable.
+      if (!DefaultDestUndefined)
+        addCost(2 * InstrCost);
+      // Suppose a jump table requires one load and one jump instruction.
       int64_t JTCost =
-          static_cast<int64_t>(JumpTableSize) * InstrCost + 4 * InstrCost;
+          static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost;
       addCost(JTCost);
       return;
     }
 
     if (NumCaseCluster <= 3) {
       // Suppose a comparison includes one compare and one conditional branch.
-      addCost(NumCaseCluster * 2 * InstrCost);
+      // We can reduce a set of instructions if the default branch is
+      // undefined.
+      addCost((NumCaseCluster - DefaultDestUndefined) * 2 * InstrCost);
       return;
     }
 
@@ -1152,7 +1157,7 @@ private:
   // FIXME: These constants are taken from the heuristic-based cost visitor.
   // These should be removed entirely in a later revision to avoid reliance on
   // heuristics in the ML inliner.
-  static constexpr int JTCostMultiplier = 4;
+  static constexpr int JTCostMultiplier = 2;
   static constexpr int CaseClusterCostMultiplier = 2;
   static constexpr int SwitchDefaultDestCostMultiplier = 2;
   static constexpr int SwitchCostMultiplier = 2;
@@ -1235,11 +1240,10 @@ private:
 
   void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                         bool DefaultDestUndefined) override {
-    if (!DefaultDestUndefined)
-      increment(InlineCostFeatureIndex::switch_default_dest_penalty,
-                SwitchDefaultDestCostMultiplier * InstrCost);
-
     if (JumpTableSize) {
+      if (!DefaultDestUndefined)
+        increment(InlineCostFeatureIndex::switch_default_dest_penalty,
+                  SwitchDefaultDestCostMultiplier * InstrCost);
       int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
                        JTCostMultiplier * InstrCost;
       increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
@@ -1248,7 +1252,8 @@ private:
 
     if (NumCaseCluster <= 3) {
       increment(InlineCostFeatureIndex::case_cluster_penalty,
-                NumCaseCluster * CaseClusterCostMultiplier * InstrCost);
+                (NumCaseCluster - DefaultDestUndefined) *
+                    CaseClusterCostMultiplier * InstrCost);
       return;
     }
 
@@ -2046,13 +2051,11 @@ bool CallAnalyzer::visitCmpInst(CmpInst &I) {
     if (RHSBase && LHSBase == RHSBase) {
       // We have common bases, fold the icmp to a constant based on the
       // offsets.
-      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
-      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
-      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
-        SimplifiedValues[&I] = C;
-        ++NumConstantPtrCmps;
-        return true;
-      }
+      SimplifiedValues[&I] = ConstantInt::getBool(
+          I.getType(),
+          ICmpInst::compare(LHSOffset, RHSOffset, I.getPredicate()));
+      ++NumConstantPtrCmps;
+      return true;
     }
   }
 
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 4061dae83c10..37a7259a5cd0 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -4312,6 +4312,10 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
   if (match(I, m_Intrinsic<Intrinsic::is_constant>()))
     return nullptr;
 
+  // Don't simplify freeze.
+  if (isa<FreezeInst>(I))
+    return nullptr;
+
   // Replace Op with RepOp in instruction operands.
   SmallVector<Value *, 8> NewOps;
   bool AnyReplaced = false;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index b0d29e2409f7..d071e5332440 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -31,6 +31,7 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/BasicBlock.h"
@@ -2005,6 +2006,9 @@ getDependenceDistanceStrideAndSize(
     return MemoryDepChecker::Dependence::Unknown;
   }
 
+  if (!isa<SCEVConstant, SCEVCouldNotCompute>(Dist))
+    Dist = SE.applyLoopGuards(Dist, InnermostLoop);
+
   uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
   bool HasSameSize =
       DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
@@ -2119,17 +2123,24 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
     return Dependence::Forward;
   }
 
-  if (!C) {
-    // TODO: FoundNonConstantDistanceDependence is used as a necessary condition
-    // to consider retrying with runtime checks. Historically, we did not set it
-    // when strides were different but there is no inherent reason to.
+  int64_t MinDistance = SE.getSignedRangeMin(Dist).getSExtValue();
+  // Below we only handle strictly positive distances.
+  if (MinDistance <= 0) {
     FoundNonConstantDistanceDependence |= CommonStride.has_value();
-    LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
     return Dependence::Unknown;
   }
 
-  if (!SE.isKnownPositive(Dist))
-    return Dependence::Unknown;
+  if (!isa<SCEVConstant>(Dist)) {
+    // Previously this case would be treated as Unknown, possibly setting
+    // FoundNonConstantDistanceDependence to force re-trying with runtime
+    // checks. Until the TODO below is addressed, set it here to preserve
+    // original behavior w.r.t. re-trying with runtime checks.
+    // TODO: FoundNonConstantDistanceDependence is used as a necessary
+    // condition to consider retrying with runtime checks. Historically, we
+    // did not set it when strides were different but there is no inherent
+    // reason to.
+    FoundNonConstantDistanceDependence |= CommonStride.has_value();
+  }
 
   if (!HasSameSize) {
     LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
@@ -2137,14 +2148,9 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
     return Dependence::Unknown;
   }
 
-  // The logic below currently only supports StrideA ==  StrideB, i.e. there's a
-  // common stride.
   if (!CommonStride)
     return Dependence::Unknown;
 
-  const APInt &Val = C->getAPInt();
-  int64_t Distance = Val.getSExtValue();
-
   // Bail out early if passed-in parameters make vectorization not feasible.
   unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
                            VectorizerParams::VectorizationFactor : 1);
@@ -2169,8 +2175,8 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
   //     | A[0] |      | A[2] |      | A[4] |      | A[6] |      |
   //                              | B[0] |      | B[2] |      | B[4] |
   //
-  // Distance needs for vectorizing iterations except the last iteration:
-  // 4 * 2 * (MinNumIter - 1). Distance needs for the last iteration: 4.
+  // MinDistance needs for vectorizing iterations except the last iteration:
+  // 4 * 2 * (MinNumIter - 1). MinDistance needs for the last iteration: 4.
   // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
   //
   // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
@@ -2179,11 +2185,22 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
   // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
   // the minimum distance needed is 28, which is greater than distance. It is
   // not safe to do vectorization.
+
+  // We know that Dist is positive, but it may not be constant. Use the signed
+  // minimum for computations below, as this ensures we compute the closest
+  // possible dependence distance.
   uint64_t MinDistanceNeeded =
-      TypeByteSize * (*CommonStride) * (MinNumIter - 1) + TypeByteSize;
-  if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
-    LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
-                      << Distance << '\n');
+      TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize;
+  if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
+    if (!isa<SCEVConstant>(Dist)) {
+      // For non-constant distances, we checked the lower bound of the
+      // dependence distance and the distance may be larger at runtime (and safe
+      // for vectorization). Classify it as Unknown, so we re-try with runtime
+      // checks.
+      return Dependence::Unknown;
+    }
+    LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance "
+                      << MinDistance << '\n');
     return Dependence::Backward;
   }
 
@@ -2212,12 +2229,13 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
   // is 8, which is less than 2 and forbidden vectorization, But actually
   // both A and B could be vectorized by 2 iterations.
   MinDepDistBytes =
-      std::min(static_cast<uint64_t>(Distance), MinDepDistBytes);
+      std::min(static_cast<uint64_t>(MinDistance), MinDepDistBytes);
 
   bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
   uint64_t MinDepDistBytesOld = MinDepDistBytes;
   if (IsTrueDataDependence && EnableForwardingConflictDetection &&
-      couldPreventStoreLoadForward(Distance, TypeByteSize)) {
+      isa<SCEVConstant>(Dist) &&
+      couldPreventStoreLoadForward(MinDistance, TypeByteSize)) {
     // Sanity check that we didn't update MinDepDistBytes when calling
     // couldPreventStoreLoadForward
     assert(MinDepDistBytes == MinDepDistBytesOld &&
@@ -2229,10 +2247,18 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
 
   // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
   // since there is a backwards dependency.
-  uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * (*CommonStride));
-  LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
+  uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride);
+  LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
                     << " with max VF = " << MaxVF << '\n');
+
   uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
+  if (!isa<SCEVConstant>(Dist) && MaxVFInBits < MaxTargetVectorWidthInBits) {
+    // For non-constant distances, we checked the lower bound of the dependence
+    // distance and the distance may be larger at runtime (and safe for
+    // vectorization). Classify it as Unknown, so we re-try with runtime checks.
+    return Dependence::Unknown;
+  }
+
   MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
   return Dependence::BackwardVectorizable;
 }
@@ -2537,7 +2563,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
     if (isInvariant(Ptr)) {
       // Record store instructions to loop invariant addresses
       StoresToInvariantAddresses.push_back(ST);
-      HasDependenceInvolvingLoopInvariantAddress |=
+      HasStoreStoreDependenceInvolvingLoopInvariantAddress |=
           !UniformStores.insert(Ptr).second;
     }
 
@@ -2593,7 +2619,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
     if (UniformStores.count(Ptr)) {
       LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
                            "load and uniform store to the same address!\n");
-      HasDependenceInvolvingLoopInvariantAddress = true;
+      HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
     }
 
     MemoryLocation Loc = MemoryLocation::get(LD);
@@ -2726,7 +2752,7 @@ void LoopAccessInfo::emitUnsafeDependenceRemark() {
             "to attempt to isolate the offending operations into a separate "
             "loop";
   OptimizationRemarkAnalysis &R =
-      recordAnalysis("UnsafeDep", Dep.getDestination(*this)) << Info;
+      recordAnalysis("UnsafeDep", Dep.getDestination(getDepChecker())) << Info;
 
   switch (Dep.Type) {
   case MemoryDepChecker::Dependence::NoDep:
@@ -2752,7 +2778,7 @@ void LoopAccessInfo::emitUnsafeDependenceRemark() {
     break;
   }
 
-  if (Instruction *I = Dep.getSource(*this)) {
+  if (Instruction *I = Dep.getSource(getDepChecker())) {
     DebugLoc SourceLoc = I->getDebugLoc();
     if (auto *DD = dyn_cast_or_null<Instruction>(getPointerOperand(I)))
       SourceLoc = DD->getDebugLoc();
@@ -3015,11 +3041,28 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
 }
 
 LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
+                               const TargetTransformInfo *TTI,
                                const TargetLibraryInfo *TLI, AAResults *AA,
                                DominatorTree *DT, LoopInfo *LI)
     : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
-      PtrRtChecking(nullptr),
-      DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
+      PtrRtChecking(nullptr), TheLoop(L) {
+  unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
+  if (TTI) {
+    TypeSize FixedWidth =
+        TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
+    if (FixedWidth.isNonZero()) {
+      // Scale the vector width by 2 as rough estimate to also consider
+      // interleaving.
+      MaxTargetVectorWidthInBits = FixedWidth.getFixedValue() * 2;
+    }
+
+    TypeSize ScalableWidth =
+        TTI->getRegisterBitWidth(TargetTransformInfo::RGK_ScalableVector);
+    if (ScalableWidth.isNonZero())
+      MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
+  }
+  DepChecker =
+      std::make_unique<MemoryDepChecker>(*PSE, L, MaxTargetVectorWidthInBits);
   PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
   if (canAnalyzeLoop()) {
     analyzeLoop(AA, LI, TLI, DT);
@@ -3057,9 +3100,13 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
   PtrRtChecking->print(OS, Depth);
   OS << "\n";
 
-  OS.indent(Depth) << "Non vectorizable stores to invariant address were "
-                   << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
-                   << "found in loop.\n";
+  OS.indent(Depth)
+      << "Non vectorizable stores to invariant address were "
+      << (HasStoreStoreDependenceInvolvingLoopInvariantAddress ||
+                  HasLoadStoreDependenceInvolvingLoopInvariantAddress
+              ? ""
+              : "not ")
+      << "found in loop.\n";
 
   OS.indent(Depth) << "SCEV assumptions:\n";
   PSE->getPredicate().print(OS, Depth);
@@ -3075,7 +3122,7 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
 
   if (I.second)
     I.first->second =
-        std::make_unique<LoopAccessInfo>(&L, &SE, TLI, &AA, &DT, &LI);
+        std::make_unique<LoopAccessInfo>(&L, &SE, TTI, TLI, &AA, &DT, &LI);
 
   return *I.first->second;
 }
@@ -3104,8 +3151,9 @@ LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
   auto &AA = FAM.getResult<AAManager>(F);
   auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
   auto &LI = FAM.getResult<LoopAnalysis>(F);
+  auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
   auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
-  return LoopAccessInfoManager(SE, AA, DT, LI, &TLI);
+  return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI);
 }
 
 AnalysisKey LoopAccessAnalysis::Key;
diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index 3075e5190f8e..369ab087ffc0 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -1032,7 +1032,7 @@ MDNode *llvm::findOptionMDForLoopID(MDNode *LoopID, StringRef Name) {
     if (!S)
       continue;
     // Return the operand node if MDString holds expected metadata.
-    if (Name.equals(S->getString()))
+    if (Name == S->getString())
       return MD;
   }
 
diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp
index 46a7a921d86d..8ca15434833d 100644
--- a/llvm/lib/Analysis/MemoryBuiltins.cpp
+++ b/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -1138,8 +1138,8 @@ SizeOffsetValue ObjectSizeOffsetEvaluator::visitAllocaInst(AllocaInst &I) {
   if (!I.getAllocatedType()->isSized())
     return ObjectSizeOffsetEvaluator::unknown();
 
-  // must be a VLA
-  assert(I.isArrayAllocation());
+  // must be a VLA or vscale.
+  assert(I.isArrayAllocation() || I.getAllocatedType()->isScalableTy());
 
   // If needed, adjust the alloca's operand size to match the pointer indexing
   // size. Subsequent math operations expect the types to match.
@@ -1149,8 +1149,8 @@ SizeOffsetValue ObjectSizeOffsetEvaluator::visitAllocaInst(AllocaInst &I) {
   assert(ArraySize->getType() == Zero->getType() &&
          "Expected zero constant to have pointer index type");
 
-  Value *Size = ConstantInt::get(ArraySize->getType(),
-                                 DL.getTypeAllocSize(I.getAllocatedType()));
+  Value *Size = Builder.CreateTypeSize(
+      ArraySize->getType(), DL.getTypeAllocSize(I.getAllocatedType()));
   Size = Builder.CreateMul(Size, ArraySize);
   return SizeOffsetValue(Size, Zero);
 }
diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
index 8f5bc24747b1..5c09ba946271 100644
--- a/llvm/lib/Analysis/MemoryProfileInfo.cpp
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -86,9 +86,9 @@ AllocationType llvm::memprof::getMIBAllocType(const MDNode *MIB) {
   // types that can be applied based on the allocation profile data.
   auto *MDS = dyn_cast<MDString>(MIB->getOperand(1));
   assert(MDS);
-  if (MDS->getString().equals("cold")) {
+  if (MDS->getString() == "cold") {
     return AllocationType::Cold;
-  } else if (MDS->getString().equals("hot")) {
+  } else if (MDS->getString() == "hot") {
     return AllocationType::Hot;
   }
   return AllocationType::NotCold;
diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp
index 49450d85d742..48ef73e59045 100644
--- a/llvm/lib/Analysis/MemorySSA.cpp
+++ b/llvm/lib/Analysis/MemorySSA.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CFGPrinter.h"
 #include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/AssemblyAnnotationWriter.h"
@@ -1230,7 +1231,7 @@ void MemorySSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) {
 }
 
 MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT)
-    : DT(DT), F(Func), LiveOnEntryDef(nullptr), Walker(nullptr),
+    : DT(DT), F(&Func), LiveOnEntryDef(nullptr), Walker(nullptr),
       SkipWalker(nullptr) {
   // Build MemorySSA using a batch alias analysis. This reuses the internal
   // state that AA collects during an alias()/getModRefInfo() call. This is
@@ -1239,8 +1240,29 @@ MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT)
   // make queries about all the instructions in the Function.
   assert(AA && "No alias analysis?");
   BatchAAResults BatchAA(*AA);
-  buildMemorySSA(BatchAA);
-  // Intentionally leave AA to nullptr while building so we don't accidently
+  buildMemorySSA(BatchAA, iterator_range(F->begin(), F->end()));
+  // Intentionally leave AA to nullptr while building so we don't accidentally
+  // use non-batch AliasAnalysis.
+  this->AA = AA;
+  // Also create the walker here.
+  getWalker();
+}
+
+MemorySSA::MemorySSA(Loop &L, AliasAnalysis *AA, DominatorTree *DT)
+    : DT(DT), L(&L), LiveOnEntryDef(nullptr), Walker(nullptr),
+      SkipWalker(nullptr) {
+  // Build MemorySSA using a batch alias analysis. This reuses the internal
+  // state that AA collects during an alias()/getModRefInfo() call. This is
+  // safe because there are no CFG changes while building MemorySSA and can
+  // significantly reduce the time spent by the compiler in AA, because we will
+  // make queries about all the instructions in the Function.
+  assert(AA && "No alias analysis?");
+  BatchAAResults BatchAA(*AA);
+  buildMemorySSA(
+      BatchAA, map_range(L.blocks(), [](const BasicBlock *BB) -> BasicBlock & {
+        return *const_cast<BasicBlock *>(BB);
+      }));
+  // Intentionally leave AA to nullptr while building so we don't accidentally
   // use non-batch AliasAnalysis.
   this->AA = AA;
   // Also create the walker here.
@@ -1493,16 +1515,17 @@ void MemorySSA::placePHINodes(
     createMemoryPhi(BB);
 }
 
-void MemorySSA::buildMemorySSA(BatchAAResults &BAA) {
+template <typename IterT>
+void MemorySSA::buildMemorySSA(BatchAAResults &BAA, IterT Blocks) {
   // We create an access to represent "live on entry", for things like
   // arguments or users of globals, where the memory they use is defined before
   // the beginning of the function. We do not actually insert it into the IR.
   // We do not define a live on exit for the immediate uses, and thus our
   // semantics do *not* imply that something with no immediate uses can simply
   // be removed.
-  BasicBlock &StartingPoint = F.getEntryBlock();
-  LiveOnEntryDef.reset(new MemoryDef(F.getContext(), nullptr, nullptr,
-                                     &StartingPoint, NextID++));
+  BasicBlock &StartingPoint = *Blocks.begin();
+  LiveOnEntryDef.reset(new MemoryDef(StartingPoint.getContext(), nullptr,
+                                     nullptr, &StartingPoint, NextID++));
 
   // We maintain lists of memory accesses per-block, trading memory for time. We
   // could just look up the memory access for every possible instruction in the
@@ -1510,7 +1533,7 @@ void MemorySSA::buildMemorySSA(BatchAAResults &BAA) {
   SmallPtrSet<BasicBlock *, 32> DefiningBlocks;
   // Go through each block, figure out where defs occur, and chain together all
   // the accesses.
-  for (BasicBlock &B : F) {
+  for (BasicBlock &B : Blocks) {
     bool InsertIntoDef = false;
     AccessList *Accesses = nullptr;
     DefsList *Defs = nullptr;
@@ -1537,11 +1560,29 @@ void MemorySSA::buildMemorySSA(BatchAAResults &BAA) {
   // Now do regular SSA renaming on the MemoryDef/MemoryUse. Visited will get
   // filled in with all blocks.
   SmallPtrSet<BasicBlock *, 16> Visited;
-  renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited);
+  if (L) {
+    // Only building MemorySSA for a single loop. placePHINodes may have
+    // inserted a MemoryPhi in the loop's preheader. As this is outside the
+    // scope of the loop, set them to LiveOnEntry.
+    if (auto *P = getMemoryAccess(L->getLoopPreheader())) {
+      for (Use &U : make_early_inc_range(P->uses()))
+        U.set(LiveOnEntryDef.get());
+      removeFromLists(P);
+    }
+    // Now rename accesses in the loop. Populate Visited with the exit blocks of
+    // the loop, to limit the scope of the renaming.
+    SmallVector<BasicBlock *> ExitBlocks;
+    L->getExitBlocks(ExitBlocks);
+    Visited.insert(ExitBlocks.begin(), ExitBlocks.end());
+    renamePass(DT->getNode(L->getLoopPreheader()), LiveOnEntryDef.get(),
+               Visited);
+  } else {
+    renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited);
+  }
 
   // Mark the uses in unreachable blocks as live on entry, so that they go
   // somewhere.
-  for (auto &BB : F)
+  for (auto &BB : Blocks)
     if (!Visited.count(&BB))
       markUnreachableAsLiveOnEntry(&BB);
 }
@@ -1851,7 +1892,10 @@ void MemorySSA::removeFromLists(MemoryAccess *MA, bool ShouldDelete) {
 
 void MemorySSA::print(raw_ostream &OS) const {
   MemorySSAAnnotatedWriter Writer(this);
-  F.print(OS, &Writer);
+  Function *F = this->F;
+  if (L)
+    F = L->getHeader()->getParent();
+  F->print(OS, &Writer);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1864,10 +1908,23 @@ void MemorySSA::verifyMemorySSA(VerificationLevel VL) const {
 #endif
 
 #ifndef NDEBUG
-  verifyOrderingDominationAndDefUses(F, VL);
-  verifyDominationNumbers(F);
-  if (VL == VerificationLevel::Full)
-    verifyPrevDefInPhis(F);
+  if (F) {
+    auto Blocks = iterator_range(F->begin(), F->end());
+    verifyOrderingDominationAndDefUses(Blocks, VL);
+    verifyDominationNumbers(Blocks);
+    if (VL == VerificationLevel::Full)
+      verifyPrevDefInPhis(Blocks);
+  } else {
+    assert(L && "must either have loop or function");
+    auto Blocks =
+        map_range(L->blocks(), [](const BasicBlock *BB) -> BasicBlock & {
+          return *const_cast<BasicBlock *>(BB);
+        });
+    verifyOrderingDominationAndDefUses(Blocks, VL);
+    verifyDominationNumbers(Blocks);
+    if (VL == VerificationLevel::Full)
+      verifyPrevDefInPhis(Blocks);
+  }
 #endif
   // Previously, the verification used to also verify that the clobberingAccess
   // cached by MemorySSA is the same as the clobberingAccess found at a later
@@ -1881,8 +1938,9 @@ void MemorySSA::verifyMemorySSA(VerificationLevel VL) const {
   // example, see test4 added in D51960.
 }
 
-void MemorySSA::verifyPrevDefInPhis(Function &F) const {
-  for (const BasicBlock &BB : F) {
+template <typename IterT>
+void MemorySSA::verifyPrevDefInPhis(IterT Blocks) const {
+  for (const BasicBlock &BB : Blocks) {
     if (MemoryPhi *Phi = getMemoryAccess(&BB)) {
       for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
         auto *Pred = Phi->getIncomingBlock(I);
@@ -1917,12 +1975,13 @@ void MemorySSA::verifyPrevDefInPhis(Function &F) const {
 
 /// Verify that all of the blocks we believe to have valid domination numbers
 /// actually have valid domination numbers.
-void MemorySSA::verifyDominationNumbers(const Function &F) const {
+template <typename IterT>
+void MemorySSA::verifyDominationNumbers(IterT Blocks) const {
   if (BlockNumberingValid.empty())
     return;
 
   SmallPtrSet<const BasicBlock *, 16> ValidBlocks = BlockNumberingValid;
-  for (const BasicBlock &BB : F) {
+  for (const BasicBlock &BB : Blocks) {
     if (!ValidBlocks.count(&BB))
       continue;
 
@@ -1958,14 +2017,15 @@ void MemorySSA::verifyDominationNumbers(const Function &F) const {
 /// Verify def-uses: the immediate use information - walk all the memory
 /// accesses and verifying that, for each use, it appears in the appropriate
 /// def's use list
-void MemorySSA::verifyOrderingDominationAndDefUses(Function &F,
+template <typename IterT>
+void MemorySSA::verifyOrderingDominationAndDefUses(IterT Blocks,
                                                    VerificationLevel VL) const {
   // Walk all the blocks, comparing what the lookups think and what the access
   // lists think, as well as the order in the blocks vs the order in the access
   // lists.
   SmallVector<MemoryAccess *, 32> ActualAccesses;
   SmallVector<MemoryAccess *, 32> ActualDefs;
-  for (BasicBlock &B : F) {
+  for (BasicBlock &B : Blocks) {
     const AccessList *AL = getBlockAccesses(&B);
     const auto *DL = getBlockDefs(&B);
     MemoryPhi *Phi = getMemoryAccess(&B);
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 93f885c5d5ad..254d79183a1e 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -10615,9 +10615,7 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
   if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS)) {
     // Check for both operands constant.
     if (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS)) {
-      if (ConstantExpr::getICmp(Pred,
-                                LHSC->getValue(),
-                                RHSC->getValue())->isNullValue())
+      if (!ICmpInst::compare(LHSC->getAPInt(), RHSC->getAPInt(), Pred))
         return TrivialCase(false);
       return TrivialCase(true);
     }
@@ -11276,7 +11274,7 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred,
     [[fallthrough]];
   case ICmpInst::ICMP_ULE:
     // (X + C1)<nuw> u<= (X + C2)<nuw> for C1 u<= C2.
-    if (MatchBinaryAddToConst(RHS, LHS, C2, C1, SCEV::FlagNUW) && C1.ule(C2))
+    if (MatchBinaryAddToConst(LHS, RHS, C1, C2, SCEV::FlagNUW) && C1.ule(C2))
       return true;
 
     break;
@@ -11286,7 +11284,7 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred,
     [[fallthrough]];
   case ICmpInst::ICMP_ULT:
     // (X + C1)<nuw> u< (X + C2)<nuw> if C1 u< C2.
-    if (MatchBinaryAddToConst(RHS, LHS, C2, C1, SCEV::FlagNUW) && C1.ult(C2))
+    if (MatchBinaryAddToConst(LHS, RHS, C1, C2, SCEV::FlagNUW) && C1.ult(C2))
       return true;
     break;
   }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 33c899fe8899..f6a458f7ded4 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -513,6 +513,11 @@ bool TargetTransformInfo::isLegalStridedLoadStore(Type *DataType,
   return TTIImpl->isLegalStridedLoadStore(DataType, Alignment);
 }
 
+bool TargetTransformInfo::isLegalMaskedVectorHistogram(Type *AddrType,
+                                                       Type *DataType) const {
+  return TTIImpl->isLegalMaskedVectorHistogram(AddrType, DataType);
+}
+
 bool TargetTransformInfo::enableOrderedReductions() const {
   return TTIImpl->enableOrderedReductions();
 }
@@ -531,7 +536,7 @@ bool TargetTransformInfo::prefersVectorizedAddressing() const {
 }
 
 InstructionCost TargetTransformInfo::getScalingFactorCost(
-    Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg,
+    Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg,
     int64_t Scale, unsigned AddrSpace) const {
   InstructionCost Cost = TTIImpl->getScalingFactorCost(
       Ty, BaseGV, BaseOffset, HasBaseReg, Scale, AddrSpace);
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index fed2061aae3a..375385aca7a3 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -252,6 +252,13 @@ bool llvm::haveNoCommonBitsSet(const WithCache<const Value *> &LHSCache,
                                         RHSCache.getKnownBits(SQ));
 }
 
+bool llvm::isOnlyUsedInZeroComparison(const Instruction *I) {
+  return !I->user_empty() && all_of(I->users(), [](const User *U) {
+    ICmpInst::Predicate P;
+    return match(U, m_ICmp(P, m_Value(), m_Zero()));
+  });
+}
+
 bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *I) {
   return !I->user_empty() && all_of(I->users(), [](const User *U) {
     ICmpInst::Predicate P;
@@ -2166,6 +2173,11 @@ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
         if (OrZero || RHSBits.One.getBoolValue() || LHSBits.One.getBoolValue())
           return true;
     }
+
+    // LShr(UINT_MAX, Y) + 1 is a power of two (if add is nuw) or zero.
+    if (OrZero || Q.IIQ.hasNoUnsignedWrap(VOBO))
+      if (match(I, m_Add(m_LShr(m_AllOnes(), m_Value()), m_One())))
+        return true;
     return false;
   }
   case Instruction::Select:
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 2902bd9fe17c..34053a5ca9c8 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -74,23 +74,6 @@ static std::string getTypeString(Type *T) {
   return Tmp.str();
 }
 
-// Whatever debug info format we parsed, we should convert to the expected debug
-// info format immediately afterwards.
-bool LLParser::finalizeDebugInfoFormat(Module *M) {
-  // We should have already returned an error if we observed both intrinsics and
-  // records in this IR.
-  assert(!(SeenNewDbgInfoFormat && SeenOldDbgInfoFormat) &&
-         "Mixed debug intrinsics/records seen without a parsing error?");
-  if (PreserveInputDbgFormat == cl::boolOrDefault::BOU_TRUE) {
-    UseNewDbgInfoFormat = SeenNewDbgInfoFormat;
-    WriteNewDbgInfoFormatToBitcode = SeenNewDbgInfoFormat;
-    WriteNewDbgInfoFormat = SeenNewDbgInfoFormat;
-  } else if (M) {
-    M->setIsNewDbgInfoFormat(false);
-  }
-  return false;
-}
-
 /// Run: module ::= toplevelentity*
 bool LLParser::Run(bool UpgradeDebugInfo,
                    DataLayoutCallbackTy DataLayoutCallback) {
@@ -108,7 +91,7 @@ bool LLParser::Run(bool UpgradeDebugInfo,
   }
 
   return parseTopLevelEntities() || validateEndOfModule(UpgradeDebugInfo) ||
-         validateEndOfIndex() || finalizeDebugInfoFormat(M);
+         validateEndOfIndex();
 }
 
 bool LLParser::parseStandaloneConstantValue(Constant *&C,
@@ -207,6 +190,18 @@ void LLParser::dropUnknownMetadataReferences() {
 bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
   if (!M)
     return false;
+
+  // We should have already returned an error if we observed both intrinsics and
+  // records in this IR.
+  assert(!(SeenNewDbgInfoFormat && SeenOldDbgInfoFormat) &&
+         "Mixed debug intrinsics/records seen without a parsing error?");
+  if (PreserveInputDbgFormat == cl::boolOrDefault::BOU_TRUE) {
+    UseNewDbgInfoFormat = SeenNewDbgInfoFormat;
+    WriteNewDbgInfoFormatToBitcode = SeenNewDbgInfoFormat;
+    WriteNewDbgInfoFormat = SeenNewDbgInfoFormat;
+    M->setNewDbgInfoFormatFlag(SeenNewDbgInfoFormat);
+  }
+
   // Handle any function attribute group forward references.
   for (const auto &RAG : ForwardRefAttrGroups) {
     Value *V = RAG.first;
@@ -439,6 +434,9 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
   UpgradeModuleFlags(*M);
   UpgradeSectionAttributes(*M);
 
+  if (PreserveInputDbgFormat != cl::boolOrDefault::BOU_TRUE)
+    M->setIsNewDbgInfoFormat(UseNewDbgInfoFormat);
+
   if (!Slots)
     return false;
   // Initialize the slot mapping.
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index a0779f955cf2..be2381cd7d77 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -4319,7 +4319,7 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
   if (PreserveInputDbgFormat != cl::boolOrDefault::BOU_TRUE) {
     TheModule->IsNewDbgInfoFormat =
         UseNewDbgInfoFormat &&
-        LoadBitcodeIntoNewDbgInfoFormat == cl::boolOrDefault::BOU_TRUE;
+        LoadBitcodeIntoNewDbgInfoFormat != cl::boolOrDefault::BOU_FALSE;
   }
 
   this->ValueTypeCallback = std::move(Callbacks.ValueType);
@@ -6896,7 +6896,7 @@ Error BitcodeReader::materialize(GlobalValue *GV) {
         MDString *MDS = cast<MDString>(MD->getOperand(0));
         StringRef ProfName = MDS->getString();
         // Check consistency of !prof branch_weights metadata.
-        if (!ProfName.equals("branch_weights"))
+        if (ProfName != "branch_weights")
           continue;
         unsigned ExpectedNumOperands = 0;
         if (BranchInst *BI = dyn_cast<BranchInst>(&I))
@@ -7513,14 +7513,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       TheIndex.setFlags(Record[0]);
       break;
     }
-    case bitc::FS_VALUE_GUID: { // [valueid, refguid_upper32, refguid_lower32]
+    case bitc::FS_VALUE_GUID: { // [valueid, refguid]
       uint64_t ValueID = Record[0];
-      GlobalValue::GUID RefGUID;
-      if (Version >= 10) {
-        RefGUID = Record[1] << 32 | Record[2];
-      } else {
-        RefGUID = Record[1];
-      }
+      GlobalValue::GUID RefGUID = Record[1];
       ValueIdToValueInfoMap[ValueID] = std::make_tuple(
           TheIndex.getOrInsertValueInfo(RefGUID), RefGUID, RefGUID);
       break;
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 1aaf160e91ca..6d01e3b4d821 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -4299,20 +4299,9 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
     return;
   }
 
-  auto Abbv = std::make_shared<BitCodeAbbrev>();
-  Abbv->Add(BitCodeAbbrevOp(bitc::FS_VALUE_GUID));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
-  // GUIDS often use up most of 64-bits, so encode as two Fixed 32.
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
-  unsigned ValueGuidAbbrev = Stream.EmitAbbrev(std::move(Abbv));
-
   for (const auto &GVI : valueIds()) {
     Stream.EmitRecord(bitc::FS_VALUE_GUID,
-                      ArrayRef<uint32_t>{GVI.second,
-                                         static_cast<uint32_t>(GVI.first >> 32),
-                                         static_cast<uint32_t>(GVI.first)},
-                      ValueGuidAbbrev);
+                      ArrayRef<uint64_t>{GVI.second, GVI.first});
   }
 
   if (!Index->stackIds().empty()) {
@@ -4326,7 +4315,7 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
   }
 
   // Abbrev for FS_PERMODULE_PROFILE.
-  Abbv = std::make_shared<BitCodeAbbrev>();
+  auto Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // flags
@@ -4478,20 +4467,9 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
   // Write the index flags.
   Stream.EmitRecord(bitc::FS_FLAGS, ArrayRef<uint64_t>{Index.getFlags()});
 
-  auto Abbv = std::make_shared<BitCodeAbbrev>();
-  Abbv->Add(BitCodeAbbrevOp(bitc::FS_VALUE_GUID));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
-  // GUIDS often use up most of 64-bits, so encode as two Fixed 32.
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
-  unsigned ValueGuidAbbrev = Stream.EmitAbbrev(std::move(Abbv));
-
   for (const auto &GVI : valueIds()) {
     Stream.EmitRecord(bitc::FS_VALUE_GUID,
-                      ArrayRef<uint32_t>{GVI.second,
-                                         static_cast<uint32_t>(GVI.first >> 32),
-                                         static_cast<uint32_t>(GVI.first)},
-                      ValueGuidAbbrev);
+                      ArrayRef<uint64_t>{GVI.second, GVI.first});
   }
 
   if (!StackIdIndices.empty()) {
@@ -4510,7 +4488,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
   }
 
   // Abbrev for FS_COMBINED_PROFILE.
-  Abbv = std::make_shared<BitCodeAbbrev>();
+  auto Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_PROFILE));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // modid
diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp
index af7643d93591..e693cdbd0ccc 100644
--- a/llvm/lib/CodeGen/Analysis.cpp
+++ b/llvm/lib/CodeGen/Analysis.cpp
@@ -593,9 +593,10 @@ bool llvm::attributesPermitTailCall(const Function *F, const Instruction *I,
 
   // Following attributes are completely benign as far as calling convention
   // goes, they shouldn't affect whether the call is a tail call.
-  for (const auto &Attr : {Attribute::Alignment, Attribute::Dereferenceable,
-                           Attribute::DereferenceableOrNull, Attribute::NoAlias,
-                           Attribute::NonNull, Attribute::NoUndef}) {
+  for (const auto &Attr :
+       {Attribute::Alignment, Attribute::Dereferenceable,
+        Attribute::DereferenceableOrNull, Attribute::NoAlias,
+        Attribute::NonNull, Attribute::NoUndef, Attribute::Range}) {
     CallerAttrs.removeAttribute(Attr);
     CalleeAttrs.removeAttribute(Attr);
   }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 56c288ee95b4..1e33c2729e5d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -30,6 +30,7 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include <cassert>
 #include <cstdint>
+#include <limits>
 #include <string>
 #include <utility>
 
@@ -1649,7 +1650,9 @@ DIE &DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) {
         addUInt(MemberDie, dwarf::DW_AT_byte_size, std::nullopt, FieldSize / 8);
       addUInt(MemberDie, dwarf::DW_AT_bit_size, std::nullopt, Size);
 
-      uint64_t Offset = DT->getOffsetInBits();
+      assert(DT->getOffsetInBits() <=
+             (uint64_t)std::numeric_limits<int64_t>::max());
+      int64_t Offset = DT->getOffsetInBits();
       // We can't use DT->getAlignInBits() here: AlignInBits for member type
       // is non-zero if and only if alignment was forced (e.g. _Alignas()),
       // which can't be done with bitfields. Thus we use FieldSize here.
@@ -1669,7 +1672,12 @@ DIE &DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) {
         if (Asm->getDataLayout().isLittleEndian())
           Offset = FieldSize - (Offset + Size);
 
-        addUInt(MemberDie, dwarf::DW_AT_bit_offset, std::nullopt, Offset);
+        if (Offset < 0)
+          addSInt(MemberDie, dwarf::DW_AT_bit_offset, dwarf::DW_FORM_sdata,
+                  Offset);
+        else
+          addUInt(MemberDie, dwarf::DW_AT_bit_offset, std::nullopt,
+                  (uint64_t)Offset);
         OffsetInBytes = FieldOffset >> 3;
       } else {
         addUInt(MemberDie, dwarf::DW_AT_data_bit_offset, std::nullopt, Offset);
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index f3b8097396e2..ee44e9353d04 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -909,9 +909,10 @@ void AtomicExpandImpl::expandPartwordAtomicRMW(
   Value *ValOperand_Shifted = nullptr;
   if (Op == AtomicRMWInst::Xchg || Op == AtomicRMWInst::Add ||
       Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Nand) {
+    Value *ValOp = Builder.CreateBitCast(AI->getValOperand(), PMV.IntValueType);
     ValOperand_Shifted =
-        Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
-                          PMV.ShiftAmt, "ValOperand_Shifted");
+        Builder.CreateShl(Builder.CreateZExt(ValOp, PMV.WordType), PMV.ShiftAmt,
+                          "ValOperand_Shifted");
   }
 
   auto PerformPartwordOp = [&](IRBuilderBase &Builder, Value *Loaded) {
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
index 14ac4b2102c2..677460a2d8e4 100644
--- a/llvm/lib/CodeGen/CommandFlags.cpp
+++ b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -96,6 +96,7 @@ CGOPT_EXP(bool, EmulatedTLS)
 CGOPT_EXP(bool, EnableTLSDESC)
 CGOPT(bool, UniqueSectionNames)
 CGOPT(bool, UniqueBasicBlockSectionNames)
+CGOPT(bool, SeparateNamedSections)
 CGOPT(EABI, EABIVersion)
 CGOPT(DebuggerKind, DebuggerTuningOpt)
 CGOPT(bool, EnableStackSizeSection)
@@ -419,6 +420,12 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
       cl::init(false));
   CGBINDOPT(UniqueBasicBlockSectionNames);
 
+  static cl::opt<bool> SeparateNamedSections(
+      "separate-named-sections",
+      cl::desc("Use separate unique sections for named sections"),
+      cl::init(false));
+  CGBINDOPT(SeparateNamedSections);
+
   static cl::opt<EABI> EABIVersion(
       "meabi", cl::desc("Set EABI type (default depends on triple):"),
       cl::init(EABI::Default),
@@ -569,6 +576,7 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) {
   Options.BBSections = getBBSectionsMode(Options);
   Options.UniqueSectionNames = getUniqueSectionNames();
   Options.UniqueBasicBlockSectionNames = getUniqueBasicBlockSectionNames();
+  Options.SeparateNamedSections = getSeparateNamedSections();
   Options.TLSSize = getTLSSize();
   Options.EmulatedTLS =
       getExplicitEmulatedTLS().value_or(TheTriple.hasDefaultEmulatedTLS());
diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index 8e623c85b737..dc35f33a3a05 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -367,7 +367,8 @@ static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI,
                                          Type *EltTy) {
   bool Negative = false;
   unsigned EltBits = EltTy->getScalarSizeInBits();
-  switch (VPI.getIntrinsicID()) {
+  Intrinsic::ID VID = VPI.getIntrinsicID();
+  switch (VID) {
   default:
     llvm_unreachable("Expecting a VP reduction intrinsic");
   case Intrinsic::vp_reduce_add:
@@ -387,12 +388,17 @@ static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI,
     return ConstantInt::get(EltTy->getContext(),
                             APInt::getSignedMinValue(EltBits));
   case Intrinsic::vp_reduce_fmax:
+  case Intrinsic::vp_reduce_fmaximum:
     Negative = true;
     [[fallthrough]];
-  case Intrinsic::vp_reduce_fmin: {
+  case Intrinsic::vp_reduce_fmin:
+  case Intrinsic::vp_reduce_fminimum: {
+    bool PropagatesNaN = VID == Intrinsic::vp_reduce_fminimum ||
+                         VID == Intrinsic::vp_reduce_fmaximum;
     FastMathFlags Flags = VPI.getFastMathFlags();
     const fltSemantics &Semantics = EltTy->getFltSemantics();
-    return !Flags.noNaNs() ? ConstantFP::getQNaN(EltTy, Negative)
+    return (!Flags.noNaNs() && !PropagatesNaN)
+               ? ConstantFP::getQNaN(EltTy, Negative)
            : !Flags.noInfs()
                ? ConstantFP::getInfinity(EltTy, Negative)
                : ConstantFP::get(EltTy,
@@ -480,6 +486,18 @@ CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder,
     Reduction =
         Builder.CreateBinaryIntrinsic(Intrinsic::minnum, Reduction, Start);
     break;
+  case Intrinsic::vp_reduce_fmaximum:
+    Reduction = Builder.CreateFPMaximumReduce(RedOp);
+    transferDecorations(*Reduction, VPI);
+    Reduction =
+        Builder.CreateBinaryIntrinsic(Intrinsic::maximum, Reduction, Start);
+    break;
+  case Intrinsic::vp_reduce_fminimum:
+    Reduction = Builder.CreateFPMinimumReduce(RedOp);
+    transferDecorations(*Reduction, VPI);
+    Reduction =
+        Builder.CreateBinaryIntrinsic(Intrinsic::minimum, Reduction, Start);
+    break;
   case Intrinsic::vp_reduce_fadd:
     Reduction = Builder.CreateFAddReduce(Start, RedOp);
     break;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 653e7689b577..9999776b9826 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4137,14 +4137,6 @@ void CombinerHelper::applyBuildFn(
   MI.eraseFromParent();
 }
 
-void CombinerHelper::applyBuildFnMO(const MachineOperand &MO,
-                                    BuildFnTy &MatchInfo) {
-  MachineInstr *Root = getDefIgnoringCopies(MO.getReg(), MRI);
-  Builder.setInstrAndDebugLoc(*Root);
-  MatchInfo(Builder);
-  Root->eraseFromParent();
-}
-
 void CombinerHelper::applyBuildFnNoErase(
     MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
   MatchInfo(Builder);
@@ -7252,3 +7244,78 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
 
   return false;
 }
+
+void CombinerHelper::applyBuildFnMO(const MachineOperand &MO,
+                                    BuildFnTy &MatchInfo) {
+  MachineInstr *Root = getDefIgnoringCopies(MO.getReg(), MRI);
+  MatchInfo(Builder);
+  Root->eraseFromParent();
+}
+
+bool CombinerHelper::matchSextOfTrunc(const MachineOperand &MO,
+                                      BuildFnTy &MatchInfo) {
+  GSext *Sext = cast<GSext>(getDefIgnoringCopies(MO.getReg(), MRI));
+  GTrunc *Trunc = cast<GTrunc>(getDefIgnoringCopies(Sext->getSrcReg(), MRI));
+
+  Register Dst = Sext->getReg(0);
+  Register Src = Trunc->getSrcReg();
+
+  LLT DstTy = MRI.getType(Dst);
+  LLT SrcTy = MRI.getType(Src);
+
+  if (DstTy == SrcTy) {
+    MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
+    return true;
+  }
+
+  if (DstTy.getScalarSizeInBits() < SrcTy.getScalarSizeInBits() &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}})) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildTrunc(Dst, Src, MachineInstr::MIFlag::NoSWrap);
+    };
+    return true;
+  }
+
+  if (DstTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits() &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_SEXT, {DstTy, SrcTy}})) {
+    MatchInfo = [=](MachineIRBuilder &B) { B.buildSExt(Dst, Src); };
+    return true;
+  }
+
+  return false;
+}
+
+bool CombinerHelper::matchZextOfTrunc(const MachineOperand &MO,
+                                      BuildFnTy &MatchInfo) {
+  GZext *Zext = cast<GZext>(getDefIgnoringCopies(MO.getReg(), MRI));
+  GTrunc *Trunc = cast<GTrunc>(getDefIgnoringCopies(Zext->getSrcReg(), MRI));
+
+  Register Dst = Zext->getReg(0);
+  Register Src = Trunc->getSrcReg();
+
+  LLT DstTy = MRI.getType(Dst);
+  LLT SrcTy = MRI.getType(Src);
+
+  if (DstTy == SrcTy) {
+    MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
+    return true;
+  }
+
+  if (DstTy.getScalarSizeInBits() < SrcTy.getScalarSizeInBits() &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}})) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildTrunc(Dst, Src, MachineInstr::MIFlag::NoUWrap);
+    };
+    return true;
+  }
+
+  if (DstTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits() &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}})) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildZExt(Dst, Src, MachineInstr::MIFlag::NonNeg);
+    };
+    return true;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
index fb33801a3a33..21b1eb262817 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
@@ -325,6 +325,112 @@ bool CombinerHelper::matchExtractVectorElementWithBuildVectorTrunc(
   return true;
 }
 
+bool CombinerHelper::matchExtractVectorElementWithShuffleVector(
+    const MachineOperand &MO, BuildFnTy &MatchInfo) {
+  GExtractVectorElement *Extract =
+      cast<GExtractVectorElement>(getDefIgnoringCopies(MO.getReg(), MRI));
+
+  //
+  //  %zero:_(s64) = G_CONSTANT i64 0
+  //  %sv:_(<4 x s32>) = G_SHUFFLE_SHUFFLE %arg1(<4 x s32>), %arg2(<4 x s32>),
+  //                     shufflemask(0, 0, 0, 0)
+  //  %extract:_(s32) = G_EXTRACT_VECTOR_ELT %sv(<4 x s32>), %zero(s64)
+  //
+  //  -->
+  //
+  //  %zero1:_(s64) = G_CONSTANT i64 0
+  //  %extract:_(s32) = G_EXTRACT_VECTOR_ELT %arg1(<4 x s32>), %zero1(s64)
+  //
+  //
+  //
+  //
+  //  %three:_(s64) = G_CONSTANT i64 3
+  //  %sv:_(<4 x s32>) = G_SHUFFLE_SHUFFLE %arg1(<4 x s32>), %arg2(<4 x s32>),
+  //                     shufflemask(0, 0, 0, -1)
+  //  %extract:_(s32) = G_EXTRACT_VECTOR_ELT %sv(<4 x s32>), %three(s64)
+  //
+  //  -->
+  //
+  //  %extract:_(s32) = G_IMPLICIT_DEF
+  //
+  //
+  //
+  //
+  //
+  //  %sv:_(<4 x s32>) = G_SHUFFLE_SHUFFLE %arg1(<4 x s32>), %arg2(<4 x s32>),
+  //                     shufflemask(0, 0, 0, -1)
+  //  %extract:_(s32) = G_EXTRACT_VECTOR_ELT %sv(<4 x s32>), %opaque(s64)
+  //
+  //  -->
+  //
+  //  %sv:_(<4 x s32>) = G_SHUFFLE_SHUFFLE %arg1(<4 x s32>), %arg2(<4 x s32>),
+  //                     shufflemask(0, 0, 0, -1)
+  //  %extract:_(s32) = G_EXTRACT_VECTOR_ELT %sv(<4 x s32>), %opaque(s64)
+  //
+
+  // We try to get the value of the Index register.
+  std::optional<ValueAndVReg> MaybeIndex =
+      getIConstantVRegValWithLookThrough(Extract->getIndexReg(), MRI);
+  if (!MaybeIndex)
+    return false;
+
+  GShuffleVector *Shuffle =
+      cast<GShuffleVector>(getDefIgnoringCopies(Extract->getVectorReg(), MRI));
+
+  ArrayRef<int> Mask = Shuffle->getMask();
+
+  unsigned Offset = MaybeIndex->Value.getZExtValue();
+  int SrcIdx = Mask[Offset];
+
+  LLT Src1Type = MRI.getType(Shuffle->getSrc1Reg());
+  // At the IR level a <1 x ty> shuffle  vector is valid, but we want to extract
+  // from a vector.
+  assert(Src1Type.isVector() && "expected to extract from a vector");
+  unsigned LHSWidth = Src1Type.isVector() ? Src1Type.getNumElements() : 1;
+
+  // Note that there is no one use check.
+  Register Dst = Extract->getReg(0);
+  LLT DstTy = MRI.getType(Dst);
+
+  if (SrcIdx < 0 &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_IMPLICIT_DEF, {DstTy}})) {
+    MatchInfo = [=](MachineIRBuilder &B) { B.buildUndef(Dst); };
+    return true;
+  }
+
+  // If the legality check failed, then we still have to abort.
+  if (SrcIdx < 0)
+    return false;
+
+  Register NewVector;
+
+  // We check in which vector and at what offset to look through.
+  if (SrcIdx < (int)LHSWidth) {
+    NewVector = Shuffle->getSrc1Reg();
+    // SrcIdx unchanged
+  } else { // SrcIdx >= LHSWidth
+    NewVector = Shuffle->getSrc2Reg();
+    SrcIdx -= LHSWidth;
+  }
+
+  LLT IdxTy = MRI.getType(Extract->getIndexReg());
+  LLT NewVectorTy = MRI.getType(NewVector);
+
+  // We check the legality of the look through.
+  if (!isLegalOrBeforeLegalizer(
+          {TargetOpcode::G_EXTRACT_VECTOR_ELT, {DstTy, NewVectorTy, IdxTy}}) ||
+      !isConstantLegalOrBeforeLegalizer({IdxTy}))
+    return false;
+
+  // We look through the shuffle vector.
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto Idx = B.buildConstant(IdxTy, SrcIdx);
+    B.buildExtractVectorElement(Dst, NewVector, Idx);
+  };
+
+  return true;
+}
+
 bool CombinerHelper::matchInsertVectorElementOOB(MachineInstr &MI,
                                                  BuildFnTy &MatchInfo) {
   GInsertVectorElement *Insert = cast<GInsertVectorElement>(&MI);
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index 529e50c8ebe0..32d607cfd71a 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -64,8 +64,11 @@ KnownBits GISelKnownBits::getKnownBits(MachineInstr &MI) {
 
 KnownBits GISelKnownBits::getKnownBits(Register R) {
   const LLT Ty = MRI.getType(R);
+  // Since the number of lanes in a scalable vector is unknown at compile time,
+  // we track one bit which is implicitly broadcast to all lanes.  This means
+  // that all lanes in a scalable vector are considered demanded.
   APInt DemandedElts =
-      Ty.isVector() ? APInt::getAllOnes(Ty.getNumElements()) : APInt(1, 1);
+      Ty.isFixedVector() ? APInt::getAllOnes(Ty.getNumElements()) : APInt(1, 1);
   return getKnownBits(R, DemandedElts);
 }
 
@@ -253,10 +256,7 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
     break;
   }
   case TargetOpcode::G_CONSTANT: {
-    auto CstVal = getIConstantVRegVal(R, MRI);
-    if (!CstVal)
-      break;
-    Known = KnownBits::makeConstant(*CstVal);
+    Known = KnownBits::makeConstant(MI.getOperand(1).getCImm()->getValue());
     break;
   }
   case TargetOpcode::G_FRAME_INDEX: {
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index e26c6ca3d616..5289b993476d 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1945,6 +1945,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) {
       return TargetOpcode::G_FSIN;
     case Intrinsic::sqrt:
       return TargetOpcode::G_FSQRT;
+    case Intrinsic::tan:
+      return TargetOpcode::G_FTAN;
     case Intrinsic::trunc:
       return TargetOpcode::G_INTRINSIC_TRUNC;
     case Intrinsic::readcyclecounter:
@@ -2053,11 +2055,8 @@ bool IRTranslator::translateConstrainedFPIntrinsic(
     Flags |= MachineInstr::NoFPExcept;
 
   SmallVector<llvm::SrcOp, 4> VRegs;
-  VRegs.push_back(getOrCreateVReg(*FPI.getArgOperand(0)));
-  if (!FPI.isUnaryOp())
-    VRegs.push_back(getOrCreateVReg(*FPI.getArgOperand(1)));
-  if (FPI.isTernaryOp())
-    VRegs.push_back(getOrCreateVReg(*FPI.getArgOperand(2)));
+  for (unsigned I = 0, E = FPI.getNonMetadataArgCount(); I != E; ++I)
+    VRegs.push_back(getOrCreateVReg(*FPI.getArgOperand(I)));
 
   MIRBuilder.buildInstr(Opcode, {getOrCreateVReg(FPI)}, VRegs, Flags);
   return true;
@@ -2852,7 +2851,7 @@ bool IRTranslator::translateInvoke(const User &U,
     return false;
 
   // FIXME: support whatever these are.
-  if (I.countOperandBundlesOfType(LLVMContext::OB_deopt))
+  if (I.hasDeoptState())
     return false;
 
   // FIXME: support control flow guard targets.
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 2e8407813ba6..afe270356940 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -490,8 +490,9 @@ MachineInstrBuilder MachineIRBuilder::buildSExt(const DstOp &Res,
 }
 
 MachineInstrBuilder MachineIRBuilder::buildZExt(const DstOp &Res,
-                                                const SrcOp &Op) {
-  return buildInstr(TargetOpcode::G_ZEXT, Res, Op);
+                                                const SrcOp &Op,
+                                                std::optional<unsigned> Flags) {
+  return buildInstr(TargetOpcode::G_ZEXT, Res, Op, Flags);
 }
 
 unsigned MachineIRBuilder::getBoolExtOp(bool IsVec, bool IsFP) const {
@@ -869,9 +870,10 @@ MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID,
   return buildIntrinsic(ID, Results, HasSideEffects, isConvergent);
 }
 
-MachineInstrBuilder MachineIRBuilder::buildTrunc(const DstOp &Res,
-                                                 const SrcOp &Op) {
-  return buildInstr(TargetOpcode::G_TRUNC, Res, Op);
+MachineInstrBuilder
+MachineIRBuilder::buildTrunc(const DstOp &Res, const SrcOp &Op,
+                             std::optional<unsigned> Flags) {
+  return buildInstr(TargetOpcode::G_TRUNC, Res, Op, Flags);
 }
 
 MachineInstrBuilder
diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
index e5f164b18272..a9b59e738c00 100644
--- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
+++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -877,6 +877,9 @@ public:
     if (LI->isAtomic())
       return false;
 
+    if (!DL.typeSizeEqualsStoreSize(Result.VTy->getElementType()))
+      return false;
+
     // Get the base polynomial
     computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL);
 
diff --git a/llvm/lib/CodeGen/MIRSampleProfile.cpp b/llvm/lib/CodeGen/MIRSampleProfile.cpp
index 42d0aba4b166..6faa1ad1a779 100644
--- a/llvm/lib/CodeGen/MIRSampleProfile.cpp
+++ b/llvm/lib/CodeGen/MIRSampleProfile.cpp
@@ -372,7 +372,7 @@ bool MIRProfileLoaderPass::runOnMachineFunction(MachineFunction &MF) {
   MF.RenumberBlocks();
   if (ViewBFIBefore && ViewBlockLayoutWithBFI != GVDT_None &&
       (ViewBlockFreqFuncName.empty() ||
-       MF.getFunction().getName().equals(ViewBlockFreqFuncName))) {
+       MF.getFunction().getName() == ViewBlockFreqFuncName)) {
     MBFI->view("MIR_Prof_loader_b." + MF.getName(), false);
   }
 
@@ -382,7 +382,7 @@ bool MIRProfileLoaderPass::runOnMachineFunction(MachineFunction &MF) {
 
   if (ViewBFIAfter && ViewBlockLayoutWithBFI != GVDT_None &&
       (ViewBlockFreqFuncName.empty() ||
-       MF.getFunction().getName().equals(ViewBlockFreqFuncName))) {
+       MF.getFunction().getName() == ViewBlockFreqFuncName)) {
     MBFI->view("MIR_prof_loader_a." + MF.getName(), false);
   }
 
diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index cbebdd87398e..7ebecc6beb17 100644
--- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -198,12 +198,11 @@ void MachineBlockFrequencyInfo::calculate(
     MBFI.reset(new ImplType);
   MBFI->calculate(F, MBPI, MLI);
   if (ViewMachineBlockFreqPropagationDAG != GVDT_None &&
-      (ViewBlockFreqFuncName.empty() ||
-       F.getName().equals(ViewBlockFreqFuncName))) {
+      (ViewBlockFreqFuncName.empty() || F.getName() == ViewBlockFreqFuncName)) {
     view("MachineBlockFrequencyDAGS." + F.getName());
   }
   if (PrintMachineBlockFreq &&
-      (PrintBFIFuncName.empty() || F.getName().equals(PrintBFIFuncName))) {
+      (PrintBFIFuncName.empty() || F.getName() == PrintBFIFuncName)) {
     MBFI->print(dbgs());
   }
 }
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index ef34e920aed5..c0cdeab25f1c 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -3500,7 +3500,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   }
   if (ViewBlockLayoutWithBFI != GVDT_None &&
       (ViewBlockFreqFuncName.empty() ||
-       F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
+       F->getFunction().getName() == ViewBlockFreqFuncName)) {
     if (RenumberBlocksBeforeView)
       MF.RenumberBlocks();
     MBFI->view("MBP." + MF.getName(), false);
diff --git a/llvm/lib/CodeGen/RDFGraph.cpp b/llvm/lib/CodeGen/RDFGraph.cpp
index 6276a4722e3f..ff0fd61078c0 100644
--- a/llvm/lib/CodeGen/RDFGraph.cpp
+++ b/llvm/lib/CodeGen/RDFGraph.cpp
@@ -264,7 +264,7 @@ raw_ostream &operator<<(raw_ostream &OS, const Print<Block> &P) {
   MachineBasicBlock *BB = P.Obj.Addr->getCode();
   unsigned NP = BB->pred_size();
   std::vector<int> Ns;
-  auto PrintBBs = [&OS](std::vector<int> Ns) -> void {
+  auto PrintBBs = [&OS](const std::vector<int> &Ns) -> void {
     unsigned N = Ns.size();
     for (int I : Ns) {
       OS << "%bb." << I;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c0bbea16a642..a044b6dc4838 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1083,7 +1083,44 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
   // (load/store (add, (add, x, y), offset2)) ->
   // (load/store (add, (add, x, offset2), y)).
 
-  if (Opc != ISD::ADD || N0.getOpcode() != ISD::ADD)
+  if (N0.getOpcode() != ISD::ADD)
+    return false;
+
+  // Check for vscale addressing modes.
+  // (load/store (add/sub (add x, y), vscale))
+  // (load/store (add/sub (add x, y), (lsl vscale, C)))
+  // (load/store (add/sub (add x, y), (mul vscale, C)))
+  if ((N1.getOpcode() == ISD::VSCALE ||
+       ((N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::MUL) &&
+        N1.getOperand(0).getOpcode() == ISD::VSCALE &&
+        isa<ConstantSDNode>(N1.getOperand(1)))) &&
+      N1.getValueType().getFixedSizeInBits() <= 64) {
+    int64_t ScalableOffset = N1.getOpcode() == ISD::VSCALE
+                                 ? N1.getConstantOperandVal(0)
+                                 : (N1.getOperand(0).getConstantOperandVal(0) *
+                                    (N1.getOpcode() == ISD::SHL
+                                         ? (1LL << N1.getConstantOperandVal(1))
+                                         : N1.getConstantOperandVal(1)));
+    if (Opc == ISD::SUB)
+      ScalableOffset = -ScalableOffset;
+    if (all_of(N->uses(), [&](SDNode *Node) {
+          if (auto *LoadStore = dyn_cast<MemSDNode>(Node);
+              LoadStore && LoadStore->getBasePtr().getNode() == N) {
+            TargetLoweringBase::AddrMode AM;
+            AM.HasBaseReg = true;
+            AM.ScalableOffset = ScalableOffset;
+            EVT VT = LoadStore->getMemoryVT();
+            unsigned AS = LoadStore->getAddressSpace();
+            Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
+            return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy,
+                                             AS);
+          }
+          return false;
+        }))
+      return true;
+  }
+
+  if (Opc != ISD::ADD)
     return false;
 
   auto *C2 = dyn_cast<ConstantSDNode>(N1);
@@ -2838,6 +2875,66 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
     return DAG.getNode(ISD::ADD, DL, VT, Not, N0.getOperand(0));
   }
 
+  // Fold add(mul(add(A, CA), CM), CB) -> add(mul(A, CM), CM*CA+CB).
+  // This can help if the inner add has multiple uses.
+  APInt CM, CA;
+  if (ConstantSDNode *CB = dyn_cast<ConstantSDNode>(N1)) {
+    if (VT.getScalarSizeInBits() <= 64) {
+      if (sd_match(N0, m_OneUse(m_Mul(m_Add(m_Value(A), m_ConstInt(CA)),
+                                      m_ConstInt(CM)))) &&
+          TLI.isLegalAddImmediate(
+              (CA * CM + CB->getAPIntValue()).getSExtValue())) {
+        SDNodeFlags Flags;
+        // If all the inputs are nuw, the outputs can be nuw. If all the input
+        // are _also_ nsw the outputs can be too.
+        if (N->getFlags().hasNoUnsignedWrap() &&
+            N0->getFlags().hasNoUnsignedWrap() &&
+            N0.getOperand(0)->getFlags().hasNoUnsignedWrap()) {
+          Flags.setNoUnsignedWrap(true);
+          if (N->getFlags().hasNoSignedWrap() &&
+              N0->getFlags().hasNoSignedWrap() &&
+              N0.getOperand(0)->getFlags().hasNoSignedWrap())
+            Flags.setNoSignedWrap(true);
+        }
+        SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N1), VT, A,
+                                  DAG.getConstant(CM, DL, VT), Flags);
+        return DAG.getNode(
+            ISD::ADD, DL, VT, Mul,
+            DAG.getConstant(CA * CM + CB->getAPIntValue(), DL, VT), Flags);
+      }
+      // Also look in case there is an intermediate add.
+      if (sd_match(N0, m_OneUse(m_Add(
+                           m_OneUse(m_Mul(m_Add(m_Value(A), m_ConstInt(CA)),
+                                          m_ConstInt(CM))),
+                           m_Value(B)))) &&
+          TLI.isLegalAddImmediate(
+              (CA * CM + CB->getAPIntValue()).getSExtValue())) {
+        SDNodeFlags Flags;
+        // If all the inputs are nuw, the outputs can be nuw. If all the input
+        // are _also_ nsw the outputs can be too.
+        SDValue OMul =
+            N0.getOperand(0) == B ? N0.getOperand(1) : N0.getOperand(0);
+        if (N->getFlags().hasNoUnsignedWrap() &&
+            N0->getFlags().hasNoUnsignedWrap() &&
+            OMul->getFlags().hasNoUnsignedWrap() &&
+            OMul.getOperand(0)->getFlags().hasNoUnsignedWrap()) {
+          Flags.setNoUnsignedWrap(true);
+          if (N->getFlags().hasNoSignedWrap() &&
+              N0->getFlags().hasNoSignedWrap() &&
+              OMul->getFlags().hasNoSignedWrap() &&
+              OMul.getOperand(0)->getFlags().hasNoSignedWrap())
+            Flags.setNoSignedWrap(true);
+        }
+        SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N1), VT, A,
+                                  DAG.getConstant(CM, DL, VT), Flags);
+        SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N1), VT, Mul, B, Flags);
+        return DAG.getNode(
+            ISD::ADD, DL, VT, Add,
+            DAG.getConstant(CA * CM + CB->getAPIntValue(), DL, VT), Flags);
+      }
+    }
+  }
+
   if (SDValue Combined = visitADDLikeCommutative(N0, N1, N))
     return Combined;
 
@@ -3911,7 +4008,8 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
 
   // Hoist one-use addition by non-opaque constant:
   //   (x + C) - y  ->  (x - y) + C
-  if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() &&
+  if (!reassociationCanBreakAddressingModePattern(ISD::SUB, DL, N, N0, N1) &&
+      N0.getOpcode() == ISD::ADD && N0.hasOneUse() &&
       isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
     SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
     return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(1));
@@ -8728,15 +8826,16 @@ static std::optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets,
   return BigEndian;
 }
 
+// Look through one layer of truncate or extend.
 static SDValue stripTruncAndExt(SDValue Value) {
   switch (Value.getOpcode()) {
   case ISD::TRUNCATE:
   case ISD::ZERO_EXTEND:
   case ISD::SIGN_EXTEND:
   case ISD::ANY_EXTEND:
-    return stripTruncAndExt(Value.getOperand(0));
+    return Value.getOperand(0);
   }
-  return Value;
+  return SDValue();
 }
 
 /// Match a pattern where a wide type scalar value is stored by several narrow
@@ -8840,21 +8939,27 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
       if (ShiftAmtC % NarrowNumBits != 0)
         return SDValue();
 
+      // Make sure we aren't reading bits that are shifted in.
+      if (ShiftAmtC > WideVal.getScalarValueSizeInBits() - NarrowNumBits)
+        return SDValue();
+
       Offset = ShiftAmtC / NarrowNumBits;
       WideVal = WideVal.getOperand(0);
     }
 
     // Stores must share the same source value with different offsets.
-    // Truncate and extends should be stripped to get the single source value.
     if (!SourceValue)
       SourceValue = WideVal;
-    else if (stripTruncAndExt(SourceValue) != stripTruncAndExt(WideVal))
-      return SDValue();
-    else if (SourceValue.getValueType() != WideVT) {
-      if (WideVal.getValueType() == WideVT ||
-          WideVal.getScalarValueSizeInBits() >
-              SourceValue.getScalarValueSizeInBits())
+    else if (SourceValue != WideVal) {
+      // Truncate and extends can be stripped to see if the values are related.
+      if (stripTruncAndExt(SourceValue) != WideVal &&
+          stripTruncAndExt(WideVal) != SourceValue)
+        return SDValue();
+
+      if (WideVal.getScalarValueSizeInBits() >
+          SourceValue.getScalarValueSizeInBits())
         SourceValue = WideVal;
+
       // Give up if the source value type is smaller than the store size.
       if (SourceValue.getScalarValueSizeInBits() < WideVT.getScalarSizeInBits())
         return SDValue();
@@ -10956,9 +11061,23 @@ SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
   // fold (bitreverse c1) -> c2
   if (SDValue C = DAG.FoldConstantArithmetic(ISD::BITREVERSE, DL, VT, {N0}))
     return C;
+
   // fold (bitreverse (bitreverse x)) -> x
   if (N0.getOpcode() == ISD::BITREVERSE)
     return N0.getOperand(0);
+
+  SDValue X, Y;
+
+  // fold (bitreverse (lshr (bitreverse x), y)) -> (shl x, y)
+  if ((!LegalOperations || TLI.isOperationLegal(ISD::SHL, VT)) &&
+      sd_match(N, m_BitReverse(m_Srl(m_BitReverse(m_Value(X)), m_Value(Y)))))
+    return DAG.getNode(ISD::SHL, DL, VT, X, Y);
+
+  // fold (bitreverse (shl (bitreverse x), y)) -> (lshr x, y)
+  if ((!LegalOperations || TLI.isOperationLegal(ISD::SRL, VT)) &&
+      sd_match(N, m_BitReverse(m_Shl(m_BitReverse(m_Value(X)), m_Value(Y)))))
+    return DAG.getNode(ISD::SRL, DL, VT, X, Y);
+
   return SDValue();
 }
 
@@ -15458,9 +15577,11 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
       N0->getNumValues() != 1 || !N0->hasOneUse())
     return SDValue();
 
-  bool AllowMultipleMaybePoisonOperands = N0.getOpcode() == ISD::BUILD_VECTOR ||
-                                          N0.getOpcode() == ISD::BUILD_PAIR ||
-                                          N0.getOpcode() == ISD::CONCAT_VECTORS;
+  bool AllowMultipleMaybePoisonOperands =
+      N0.getOpcode() == ISD::BUILD_VECTOR ||
+      N0.getOpcode() == ISD::BUILD_PAIR ||
+      N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
+      N0.getOpcode() == ISD::CONCAT_VECTORS;
 
   // Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
   // ones" or "constant" into something that depends on FrozenUndef. We can
@@ -15533,8 +15654,16 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
     if (Op.getOpcode() == ISD::UNDEF)
       Op = DAG.getFreeze(Op);
   }
-  // NOTE: this strips poison generating flags.
-  SDValue R = DAG.getNode(N0.getOpcode(), SDLoc(N0), N0->getVTList(), Ops);
+
+  SDValue R;
+  if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N0)) {
+    // Special case handling for ShuffleVectorSDNode nodes.
+    R = DAG.getVectorShuffle(N0.getValueType(), SDLoc(N0), Ops[0], Ops[1],
+                             SVN->getMask());
+  } else {
+    // NOTE: this strips poison generating flags.
+    R = DAG.getNode(N0.getOpcode(), SDLoc(N0), N0->getVTList(), Ops);
+  }
   assert(DAG.isGuaranteedNotToBeUndefOrPoison(R, /*PoisonOnly*/ false) &&
          "Can't create node that may be undef/poison!");
   return R;
@@ -17236,17 +17365,35 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
   EVT VT = N->getValueType(0);
   SDNodeFlags Flags = N->getFlags();
   SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+  SDLoc DL(N);
 
   if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
     return R;
 
   // fold (frem c1, c2) -> fmod(c1,c2)
-  if (SDValue C = DAG.FoldConstantArithmetic(ISD::FREM, SDLoc(N), VT, {N0, N1}))
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::FREM, DL, VT, {N0, N1}))
     return C;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
+  // Lower frem N0, N1 => x - trunc(N0 / N1) * N1, providing N1 is an integer
+  // power of 2.
+  if (!TLI.isOperationLegal(ISD::FREM, VT) &&
+      TLI.isOperationLegalOrCustom(ISD::FMUL, VT) &&
+      TLI.isOperationLegalOrCustom(ISD::FDIV, VT) &&
+      TLI.isOperationLegalOrCustom(ISD::FTRUNC, VT) &&
+      DAG.isKnownToBeAPowerOfTwoFP(N1) &&
+      (Flags.hasNoSignedZeros() || DAG.cannotBeOrderedNegativeFP(N0))) {
+    SDValue Div = DAG.getNode(ISD::FDIV, DL, VT, N0, N1);
+    SDValue Rnd = DAG.getNode(ISD::FTRUNC, DL, VT, Div);
+    if (TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT))
+      return DAG.getNode(ISD::FMA, DL, VT, DAG.getNode(ISD::FNEG, DL, VT, Rnd),
+                         N1, N0);
+    SDValue Mul = DAG.getNode(ISD::FMUL, DL, VT, Rnd, N1);
+    return DAG.getNode(ISD::FSUB, DL, VT, N0, Mul);
+  }
+
   return SDValue();
 }
 
@@ -22079,7 +22226,7 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
 /// Transform a vector binary operation into a scalar binary operation by moving
 /// the math/logic after an extract element of a vector.
 static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
-                                       bool LegalOperations) {
+                                       const SDLoc &DL, bool LegalOperations) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Vec = ExtElt->getOperand(0);
   SDValue Index = ExtElt->getOperand(1);
@@ -22104,7 +22251,6 @@ static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
       ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) {
     // extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
     // extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
-    SDLoc DL(ExtElt);
     EVT VT = ExtElt->getValueType(0);
     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
@@ -22343,7 +22489,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     }
   }
 
-  if (SDValue BO = scalarizeExtractedBinop(N, DAG, LegalOperations))
+  if (SDValue BO = scalarizeExtractedBinop(N, DAG, DL, LegalOperations))
     return BO;
 
   if (VecVT.isScalableVector())
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index bfc3e08c1632..fd97a1283b65 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1222,6 +1222,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
   case ISD::VP_REDUCE_UMIN:
   case ISD::VP_REDUCE_FMAX:
   case ISD::VP_REDUCE_FMIN:
+  case ISD::VP_REDUCE_FMAXIMUM:
+  case ISD::VP_REDUCE_FMINIMUM:
   case ISD::VP_REDUCE_SEQ_FADD:
   case ISD::VP_REDUCE_SEQ_FMUL:
     Action = TLI.getOperationAction(
@@ -5006,7 +5008,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
       Node->getOpcode() == ISD::INSERT_VECTOR_ELT) {
     OVT = Node->getOperand(0).getSimpleValueType();
   }
-  if (Node->getOpcode() == ISD::STRICT_UINT_TO_FP ||
+  if (Node->getOpcode() == ISD::ATOMIC_STORE ||
+      Node->getOpcode() == ISD::STRICT_UINT_TO_FP ||
       Node->getOpcode() == ISD::STRICT_SINT_TO_FP ||
       Node->getOpcode() == ISD::STRICT_FSETCC ||
       Node->getOpcode() == ISD::STRICT_FSETCCS ||
@@ -5014,6 +5017,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
       Node->getOpcode() == ISD::VP_REDUCE_FMUL ||
       Node->getOpcode() == ISD::VP_REDUCE_FMAX ||
       Node->getOpcode() == ISD::VP_REDUCE_FMIN ||
+      Node->getOpcode() == ISD::VP_REDUCE_FMAXIMUM ||
+      Node->getOpcode() == ISD::VP_REDUCE_FMINIMUM ||
       Node->getOpcode() == ISD::VP_REDUCE_SEQ_FADD)
     OVT = Node->getOperand(1).getSimpleValueType();
   if (Node->getOpcode() == ISD::BR_CC ||
@@ -5622,7 +5627,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     Results.push_back(CvtVec);
     break;
   }
-  case ISD::ATOMIC_SWAP: {
+  case ISD::ATOMIC_SWAP:
+  case ISD::ATOMIC_STORE: {
     AtomicSDNode *AM = cast<AtomicSDNode>(Node);
     SDLoc SL(Node);
     SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NVT, AM->getVal());
@@ -5631,13 +5637,22 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     assert(AM->getMemoryVT().getSizeInBits() == NVT.getSizeInBits() &&
            "unexpected atomic_swap with illegal type");
 
-    SDValue NewAtomic
-      = DAG.getAtomic(ISD::ATOMIC_SWAP, SL, NVT,
-                      DAG.getVTList(NVT, MVT::Other),
-                      { AM->getChain(), AM->getBasePtr(), CastVal },
-                      AM->getMemOperand());
-    Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewAtomic));
-    Results.push_back(NewAtomic.getValue(1));
+    SDValue Op0 = AM->getBasePtr();
+    SDValue Op1 = CastVal;
+
+    // ATOMIC_STORE uses a swapped operand order from every other AtomicSDNode,
+    // but really it should merge with ISD::STORE.
+    if (AM->getOpcode() == ISD::ATOMIC_STORE)
+      std::swap(Op0, Op1);
+
+    SDValue NewAtomic = DAG.getAtomic(AM->getOpcode(), SL, NVT, AM->getChain(),
+                                      Op0, Op1, AM->getMemOperand());
+
+    if (AM->getOpcode() != ISD::ATOMIC_STORE) {
+      Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewAtomic));
+      Results.push_back(NewAtomic.getValue(1));
+    } else
+      Results.push_back(NewAtomic);
     break;
   }
   case ISD::ATOMIC_LOAD: {
@@ -5676,6 +5691,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
   case ISD::VP_REDUCE_FMUL:
   case ISD::VP_REDUCE_FMAX:
   case ISD::VP_REDUCE_FMIN:
+  case ISD::VP_REDUCE_FMAXIMUM:
+  case ISD::VP_REDUCE_FMINIMUM:
   case ISD::VP_REDUCE_SEQ_FADD:
     Results.push_back(PromoteReduction(Node));
     break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index abe5be763825..fc96ecdc6628 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -134,6 +134,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::STRICT_FTRUNC:
     case ISD::FTRUNC:      R = SoftenFloatRes_FTRUNC(N); break;
     case ISD::LOAD:        R = SoftenFloatRes_LOAD(N); break;
+    case ISD::ATOMIC_LOAD: R = SoftenFloatRes_ATOMIC_LOAD(N); break;
     case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
     case ISD::SELECT:      R = SoftenFloatRes_SELECT(N); break;
     case ISD::SELECT_CC:   R = SoftenFloatRes_SELECT_CC(N); break;
@@ -815,6 +816,26 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
   return BitConvertToInteger(ExtendNode);
 }
 
+SDValue DAGTypeLegalizer::SoftenFloatRes_ATOMIC_LOAD(SDNode *N) {
+  AtomicSDNode *L = cast<AtomicSDNode>(N);
+  EVT VT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  SDLoc dl(N);
+
+  if (L->getExtensionType() == ISD::NON_EXTLOAD) {
+    SDValue NewL =
+        DAG.getAtomic(ISD::ATOMIC_LOAD, dl, NVT, DAG.getVTList(NVT, MVT::Other),
+                      {L->getChain(), L->getBasePtr()}, L->getMemOperand());
+
+    // Legalized the chain result - switch anything that used the old chain to
+    // use the new one.
+    ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+    return NewL;
+  }
+
+  report_fatal_error("softening fp extending atomic load not handled");
+}
+
 SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) {
   SDValue LHS = GetSoftenedFloat(N->getOperand(1));
   SDValue RHS = GetSoftenedFloat(N->getOperand(2));
@@ -946,6 +967,9 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
   case ISD::STRICT_FSETCCS:
   case ISD::SETCC:       Res = SoftenFloatOp_SETCC(N); break;
   case ISD::STORE:       Res = SoftenFloatOp_STORE(N, OpNo); break;
+  case ISD::ATOMIC_STORE:
+    Res = SoftenFloatOp_ATOMIC_STORE(N, OpNo);
+    break;
   case ISD::FCOPYSIGN:   Res = SoftenFloatOp_FCOPYSIGN(N); break;
   }
 
@@ -1172,6 +1196,20 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) {
                       ST->getMemOperand());
 }
 
+SDValue DAGTypeLegalizer::SoftenFloatOp_ATOMIC_STORE(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 1 && "Can only soften the stored value!");
+  AtomicSDNode *ST = cast<AtomicSDNode>(N);
+  SDValue Val = ST->getVal();
+  EVT VT = Val.getValueType();
+  SDLoc dl(N);
+
+  assert(ST->getMemoryVT() == VT && "truncating atomic store not handled");
+
+  SDValue NewVal = GetSoftenedFloat(Val);
+  return DAG.getAtomic(ISD::ATOMIC_STORE, dl, VT, ST->getChain(), NewVal,
+                       ST->getBasePtr(), ST->getMemOperand());
+}
+
 SDValue DAGTypeLegalizer::SoftenFloatOp_FCOPYSIGN(SDNode *N) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = BitConvertToInteger(N->getOperand(1));
@@ -2249,6 +2287,7 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) {
     case ISD::SELECT_CC:  R = PromoteFloatOp_SELECT_CC(N, OpNo); break;
     case ISD::SETCC:      R = PromoteFloatOp_SETCC(N, OpNo); break;
     case ISD::STORE:      R = PromoteFloatOp_STORE(N, OpNo); break;
+    case ISD::ATOMIC_STORE: R = PromoteFloatOp_ATOMIC_STORE(N, OpNo); break;
   }
   // clang-format on
 
@@ -2371,6 +2410,23 @@ SDValue DAGTypeLegalizer::PromoteFloatOp_STORE(SDNode *N, unsigned OpNo) {
                       ST->getMemOperand());
 }
 
+SDValue DAGTypeLegalizer::PromoteFloatOp_ATOMIC_STORE(SDNode *N,
+                                                      unsigned OpNo) {
+  AtomicSDNode *ST = cast<AtomicSDNode>(N);
+  SDValue Val = ST->getVal();
+  SDLoc DL(N);
+
+  SDValue Promoted = GetPromotedFloat(Val);
+  EVT VT = ST->getOperand(1).getValueType();
+  EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+
+  SDValue NewVal = DAG.getNode(GetPromotionOpcode(Promoted.getValueType(), VT),
+                               DL, IVT, Promoted);
+
+  return DAG.getAtomic(ISD::ATOMIC_STORE, DL, IVT, ST->getChain(), NewVal,
+                       ST->getBasePtr(), ST->getMemOperand());
+}
+
 //===----------------------------------------------------------------------===//
 //  Float Result Promotion
 //===----------------------------------------------------------------------===//
@@ -2825,6 +2881,8 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
     report_fatal_error("Do not know how to soft promote this operator's "
                        "result!");
 
+  case ISD::ARITH_FENCE:
+    R = SoftPromoteHalfRes_ARITH_FENCE(N); break;
   case ISD::BITCAST:    R = SoftPromoteHalfRes_BITCAST(N); break;
   case ISD::ConstantFP: R = SoftPromoteHalfRes_ConstantFP(N); break;
   case ISD::EXTRACT_VECTOR_ELT:
@@ -2904,6 +2962,11 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
     SetSoftPromotedHalf(SDValue(N, ResNo), R);
 }
 
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_ARITH_FENCE(SDNode *N) {
+  return DAG.getNode(ISD::ARITH_FENCE, SDLoc(N), MVT::i16,
+                     BitConvertToInteger(N->getOperand(0)));
+}
+
 SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BITCAST(SDNode *N) {
   return BitConvertToInteger(N->getOperand(0));
 }
@@ -3193,6 +3256,9 @@ bool DAGTypeLegalizer::SoftPromoteHalfOperand(SDNode *N, unsigned OpNo) {
   case ISD::SELECT_CC:  Res = SoftPromoteHalfOp_SELECT_CC(N, OpNo); break;
   case ISD::SETCC:      Res = SoftPromoteHalfOp_SETCC(N); break;
   case ISD::STORE:      Res = SoftPromoteHalfOp_STORE(N, OpNo); break;
+  case ISD::ATOMIC_STORE:
+    Res = SoftPromoteHalfOp_ATOMIC_STORE(N, OpNo);
+    break;
   case ISD::STACKMAP:
     Res = SoftPromoteHalfOp_STACKMAP(N, OpNo);
     break;
@@ -3346,6 +3412,19 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfOp_STORE(SDNode *N, unsigned OpNo) {
                       ST->getMemOperand());
 }
 
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_ATOMIC_STORE(SDNode *N,
+                                                         unsigned OpNo) {
+  assert(OpNo == 1 && "Can only soften the stored value!");
+  AtomicSDNode *ST = cast<AtomicSDNode>(N);
+  SDValue Val = ST->getVal();
+  SDLoc dl(N);
+
+  SDValue Promoted = GetSoftPromotedHalf(Val);
+  return DAG.getAtomic(ISD::ATOMIC_STORE, dl, Promoted.getValueType(),
+                       ST->getChain(), Promoted, ST->getBasePtr(),
+                       ST->getMemOperand());
+}
+
 SDValue DAGTypeLegalizer::SoftPromoteHalfOp_STACKMAP(SDNode *N, unsigned OpNo) {
   assert(OpNo > 1); // Because the first two arguments are guaranteed legal.
   SmallVector<SDValue> NewOps(N->ops().begin(), N->ops().end());
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 49be824deb51..d925089d5689 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -569,6 +569,7 @@ private:
   SDValue SoftenFloatRes_FSUB(SDNode *N);
   SDValue SoftenFloatRes_FTRUNC(SDNode *N);
   SDValue SoftenFloatRes_LOAD(SDNode *N);
+  SDValue SoftenFloatRes_ATOMIC_LOAD(SDNode *N);
   SDValue SoftenFloatRes_SELECT(SDNode *N);
   SDValue SoftenFloatRes_SELECT_CC(SDNode *N);
   SDValue SoftenFloatRes_UNDEF(SDNode *N);
@@ -592,6 +593,7 @@ private:
   SDValue SoftenFloatOp_SELECT_CC(SDNode *N);
   SDValue SoftenFloatOp_SETCC(SDNode *N);
   SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo);
+  SDValue SoftenFloatOp_ATOMIC_STORE(SDNode *N, unsigned OpNo);
   SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N);
 
   //===--------------------------------------------------------------------===//
@@ -710,6 +712,7 @@ private:
   SDValue PromoteFloatOp_UnaryOp(SDNode *N, unsigned OpNo);
   SDValue PromoteFloatOp_FP_TO_XINT_SAT(SDNode *N, unsigned OpNo);
   SDValue PromoteFloatOp_STORE(SDNode *N, unsigned OpNo);
+  SDValue PromoteFloatOp_ATOMIC_STORE(SDNode *N, unsigned OpNo);
   SDValue PromoteFloatOp_SELECT_CC(SDNode *N, unsigned OpNo);
   SDValue PromoteFloatOp_SETCC(SDNode *N, unsigned OpNo);
 
@@ -726,6 +729,7 @@ private:
   void SetSoftPromotedHalf(SDValue Op, SDValue Result);
 
   void SoftPromoteHalfResult(SDNode *N, unsigned ResNo);
+  SDValue SoftPromoteHalfRes_ARITH_FENCE(SDNode *N);
   SDValue SoftPromoteHalfRes_BinOp(SDNode *N);
   SDValue SoftPromoteHalfRes_BITCAST(SDNode *N);
   SDValue SoftPromoteHalfRes_ConstantFP(SDNode *N);
@@ -754,6 +758,7 @@ private:
   SDValue SoftPromoteHalfOp_SETCC(SDNode *N);
   SDValue SoftPromoteHalfOp_SELECT_CC(SDNode *N, unsigned OpNo);
   SDValue SoftPromoteHalfOp_STORE(SDNode *N, unsigned OpNo);
+  SDValue SoftPromoteHalfOp_ATOMIC_STORE(SDNode *N, unsigned OpNo);
   SDValue SoftPromoteHalfOp_STACKMAP(SDNode *N, unsigned OpNo);
   SDValue SoftPromoteHalfOp_PATCHPOINT(SDNode *N, unsigned OpNo);
 
@@ -783,6 +788,7 @@ private:
   SDValue ScalarizeVecRes_InregOp(SDNode *N);
   SDValue ScalarizeVecRes_VecInregOp(SDNode *N);
 
+  SDValue ScalarizeVecRes_ADDRSPACECAST(SDNode *N);
   SDValue ScalarizeVecRes_BITCAST(SDNode *N);
   SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N);
   SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N);
@@ -850,6 +856,7 @@ private:
   void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_ADDRSPACECAST(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_FFREXP(SDNode *N, unsigned ResNo, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -953,6 +960,7 @@ private:
   // Widen Vector Result Promotion.
   void WidenVectorResult(SDNode *N, unsigned ResNo);
   SDValue WidenVecRes_MERGE_VALUES(SDNode* N, unsigned ResNo);
+  SDValue WidenVecRes_ADDRSPACECAST(SDNode *N);
   SDValue WidenVecRes_AssertZext(SDNode* N);
   SDValue WidenVecRes_BITCAST(SDNode* N);
   SDValue WidenVecRes_BUILD_VECTOR(SDNode* N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index cab4dc5f3c15..cd858003cf03 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TypeSize.h"
@@ -116,6 +117,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FCANONICALIZE:
     R = ScalarizeVecRes_UnaryOp(N);
     break;
+  case ISD::ADDRSPACECAST:
+    R = ScalarizeVecRes_ADDRSPACECAST(N);
+    break;
   case ISD::FFREXP:
     R = ScalarizeVecRes_FFREXP(N, ResNo);
     break;
@@ -475,6 +479,31 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) {
   llvm_unreachable("Illegal extend_vector_inreg opcode");
 }
 
+SDValue DAGTypeLegalizer::ScalarizeVecRes_ADDRSPACECAST(SDNode *N) {
+  EVT DestVT = N->getValueType(0).getVectorElementType();
+  SDValue Op = N->getOperand(0);
+  EVT OpVT = Op.getValueType();
+  SDLoc DL(N);
+  // The result needs scalarizing, but it's not a given that the source does.
+  // This is a workaround for targets where it's impossible to scalarize the
+  // result of a conversion, because the source type is legal.
+  // For instance, this happens on AArch64: v1i1 is illegal but v1i{8,16,32}
+  // are widened to v8i8, v4i16, and v2i32, which is legal, because v1i64 is
+  // legal and was not scalarized.
+  // See the similar logic in ScalarizeVecRes_SETCC
+  if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
+    Op = GetScalarizedVector(Op);
+  } else {
+    EVT VT = OpVT.getVectorElementType();
+    Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
+                     DAG.getVectorIdxConstant(0, DL));
+  }
+  auto *AddrSpaceCastN = cast<AddrSpaceCastSDNode>(N);
+  unsigned SrcAS = AddrSpaceCastN->getSrcAddressSpace();
+  unsigned DestAS = AddrSpaceCastN->getDestAddressSpace();
+  return DAG.getAddrSpaceCast(DL, DestVT, Op, SrcAS, DestAS);
+}
+
 SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
   // If the operand is wider than the vector element type then it is implicitly
   // truncated.  Make that explicit here.
@@ -1122,6 +1151,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FCANONICALIZE:
     SplitVecRes_UnaryOp(N, Lo, Hi);
     break;
+  case ISD::ADDRSPACECAST:
+    SplitVecRes_ADDRSPACECAST(N, Lo, Hi);
+    break;
   case ISD::FFREXP:
     SplitVecRes_FFREXP(N, ResNo, Lo, Hi);
     break;
@@ -2353,6 +2385,26 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
   Hi = DAG.getNode(Opcode, dl, HiVT, {Hi, MaskHi, EVLHi}, Flags);
 }
 
+void DAGTypeLegalizer::SplitVecRes_ADDRSPACECAST(SDNode *N, SDValue &Lo,
+                                                 SDValue &Hi) {
+  SDLoc dl(N);
+  auto [LoVT, HiVT] = DAG.GetSplitDestVTs(N->getValueType(0));
+
+  // If the input also splits, handle it directly for a compile time speedup.
+  // Otherwise split it by hand.
+  EVT InVT = N->getOperand(0).getValueType();
+  if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
+    GetSplitVector(N->getOperand(0), Lo, Hi);
+  else
+    std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+
+  auto *AddrSpaceCastN = cast<AddrSpaceCastSDNode>(N);
+  unsigned SrcAS = AddrSpaceCastN->getSrcAddressSpace();
+  unsigned DestAS = AddrSpaceCastN->getDestAddressSpace();
+  Lo = DAG.getAddrSpaceCast(dl, LoVT, Lo, SrcAS, DestAS);
+  Hi = DAG.getAddrSpaceCast(dl, HiVT, Hi, SrcAS, DestAS);
+}
+
 void DAGTypeLegalizer::SplitVecRes_FFREXP(SDNode *N, unsigned ResNo,
                                           SDValue &Lo, SDValue &Hi) {
   SDLoc dl(N);
@@ -3096,6 +3148,8 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::VP_REDUCE_UMIN:
   case ISD::VP_REDUCE_FMAX:
   case ISD::VP_REDUCE_FMIN:
+  case ISD::VP_REDUCE_FMAXIMUM:
+  case ISD::VP_REDUCE_FMINIMUM:
     Res = SplitVecOp_VP_REDUCE(N, OpNo);
     break;
   case ISD::VP_CTTZ_ELTS:
@@ -4121,6 +4175,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
     report_fatal_error("Do not know how to widen the result of this operator!");
 
   case ISD::MERGE_VALUES:      Res = WidenVecRes_MERGE_VALUES(N, ResNo); break;
+  case ISD::ADDRSPACECAST:
+    Res = WidenVecRes_ADDRSPACECAST(N);
+    break;
   case ISD::AssertZext:        Res = WidenVecRes_AssertZext(N); break;
   case ISD::BITCAST:           Res = WidenVecRes_BITCAST(N); break;
   case ISD::BUILD_VECTOR:      Res = WidenVecRes_BUILD_VECTOR(N); break;
@@ -5086,6 +5143,16 @@ SDValue DAGTypeLegalizer::WidenVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) {
   return GetWidenedVector(WidenVec);
 }
 
+SDValue DAGTypeLegalizer::WidenVecRes_ADDRSPACECAST(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  auto *AddrSpaceCastN = cast<AddrSpaceCastSDNode>(N);
+
+  return DAG.getAddrSpaceCast(SDLoc(N), WidenVT, InOp,
+                              AddrSpaceCastN->getSrcAddressSpace(),
+                              AddrSpaceCastN->getDestAddressSpace());
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
   SDValue InOp = N->getOperand(0);
   EVT InVT = InOp.getValueType();
@@ -6186,6 +6253,8 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::VP_REDUCE_UMIN:
   case ISD::VP_REDUCE_FMAX:
   case ISD::VP_REDUCE_FMIN:
+  case ISD::VP_REDUCE_FMAXIMUM:
+  case ISD::VP_REDUCE_FMINIMUM:
     Res = WidenVecOp_VP_REDUCE(N);
     break;
   case ISD::VP_CTTZ_ELTS:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index d92976bc2f30..247f52370e4c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -470,8 +470,10 @@ ISD::NodeType ISD::getVecReduceBaseOpcode(unsigned VecReduceOpcode) {
   case ISD::VP_REDUCE_FMIN:
     return ISD::FMINNUM;
   case ISD::VECREDUCE_FMAXIMUM:
+  case ISD::VP_REDUCE_FMAXIMUM:
     return ISD::FMAXIMUM;
   case ISD::VECREDUCE_FMINIMUM:
+  case ISD::VP_REDUCE_FMINIMUM:
     return ISD::FMINIMUM;
   }
 }
@@ -3527,16 +3529,23 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
       Known.Zero.setBitsFrom(1);
     break;
   }
-  case ISD::SHL:
+  case ISD::SHL: {
     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
-    Known = KnownBits::shl(Known, Known2);
+
+    bool NUW = Op->getFlags().hasNoUnsignedWrap();
+    bool NSW = Op->getFlags().hasNoSignedWrap();
+
+    bool ShAmtNonZero = Known2.isNonZero();
+
+    Known = KnownBits::shl(Known, Known2, NUW, NSW, ShAmtNonZero);
 
     // Minimum shift low bits are known zero.
     if (const APInt *ShMinAmt =
             getValidMinimumShiftAmountConstant(Op, DemandedElts))
       Known.Zero.setLowBits(ShMinAmt->getZExtValue());
     break;
+  }
   case ISD::SRL:
     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -4366,6 +4375,16 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val, unsigned Depth) const {
   return false;
 }
 
+bool SelectionDAG::isKnownToBeAPowerOfTwoFP(SDValue Val, unsigned Depth) const {
+  if (ConstantFPSDNode *C1 = isConstOrConstSplatFP(Val, true))
+    return C1->getValueAPF().getExactLog2Abs() >= 0;
+
+  if (Val.getOpcode() == ISD::UINT_TO_FP || Val.getOpcode() == ISD::SINT_TO_FP)
+    return isKnownToBeAPowerOfTwo(Val.getOperand(0), Depth + 1);
+
+  return false;
+}
+
 unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
   EVT VT = Op.getValueType();
 
@@ -5081,6 +5100,24 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
     }
     return true;
 
+  case ISD::VECTOR_SHUFFLE: {
+    APInt DemandedLHS, DemandedRHS;
+    auto *SVN = cast<ShuffleVectorSDNode>(Op);
+    if (!getShuffleDemandedElts(DemandedElts.getBitWidth(), SVN->getMask(),
+                                DemandedElts, DemandedLHS, DemandedRHS,
+                                /*AllowUndefElts=*/false))
+      return false;
+    if (!DemandedLHS.isZero() &&
+        !isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedLHS,
+                                          PoisonOnly, Depth + 1))
+      return false;
+    if (!DemandedRHS.isZero() &&
+        !isGuaranteedNotToBeUndefOrPoison(Op.getOperand(1), DemandedRHS,
+                                          PoisonOnly, Depth + 1))
+      return false;
+    return true;
+  }
+
     // TODO: Search for noundef attributes from library functions.
 
     // TODO: Pointers dereferenced by ISD::LOAD/STORE ops are noundef.
@@ -5218,6 +5255,15 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
     return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
   }
 
+  case ISD::VECTOR_SHUFFLE: {
+    // Check for any demanded shuffle element that is undef.
+    auto *SVN = cast<ShuffleVectorSDNode>(Op);
+    for (auto [Idx, Elt] : enumerate(SVN->getMask()))
+      if (Elt < 0 && DemandedElts[Idx])
+        return true;
+    return false;
+  }
+
   default:
     // Allow the target to implement this method for its nodes.
     if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN ||
@@ -5521,6 +5567,13 @@ bool SelectionDAG::isKnownNeverZero(SDValue Op, unsigned Depth) const {
   return computeKnownBits(Op, Depth).isNonZero();
 }
 
+bool SelectionDAG::cannotBeOrderedNegativeFP(SDValue Op) const {
+  if (ConstantFPSDNode *C1 = isConstOrConstSplatFP(Op, true))
+    return !C1->isNegative();
+
+  return Op.getOpcode() == ISD::FABS;
+}
+
 bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
   // Check the obvious case.
   if (A == B) return true;
@@ -9580,6 +9633,44 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl,
   return V;
 }
 
+SDValue SelectionDAG::getMaskedHistogram(SDVTList VTs, EVT MemVT,
+                                         const SDLoc &dl, ArrayRef<SDValue> Ops,
+                                         MachineMemOperand *MMO,
+                                         ISD::MemIndexType IndexType) {
+  assert(Ops.size() == 7 && "Incompatible number of operands");
+
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, VTs, Ops);
+  ID.AddInteger(MemVT.getRawBits());
+  ID.AddInteger(getSyntheticNodeSubclassData<MaskedHistogramSDNode>(
+      dl.getIROrder(), VTs, MemVT, MMO, IndexType));
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  ID.AddInteger(MMO->getFlags());
+  void *IP = nullptr;
+  if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+    cast<MaskedGatherSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+
+  auto *N = newSDNode<MaskedHistogramSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+                                             VTs, MemVT, MMO, IndexType);
+  createOperands(N, Ops);
+
+  assert(N->getMask().getValueType().getVectorElementCount() ==
+             N->getIndex().getValueType().getVectorElementCount() &&
+         "Vector width mismatch between mask and data");
+  assert(isa<ConstantSDNode>(N->getScale()) &&
+         N->getScale()->getAsAPIntVal().isPowerOf2() &&
+         "Scale should be a constant power of 2");
+  assert(N->getInc().getValueType().isInteger() && "Non integer update value");
+
+  CSEMap.InsertNode(N, IP);
+  InsertNode(N);
+  SDValue V(N, 0);
+  NewSDValueDbgMsg(V, "Creating new node: ", this);
+  return V;
+}
+
 SDValue SelectionDAG::getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr,
                                   EVT MemVT, MachineMemOperand *MMO) {
   assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index cfd82a342433..ca352da5d36e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3357,7 +3357,7 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
       break;
     }
     }
-  } else if (I.countOperandBundlesOfType(LLVMContext::OB_deopt)) {
+  } else if (I.hasDeoptState()) {
     // Currently we do not lower any intrinsic calls with deopt operand bundles.
     // Eventually we will support lowering the @llvm.experimental.deoptimize
     // intrinsic, and right now there are no plans to support other intrinsics
@@ -6281,6 +6281,64 @@ void SelectionDAGBuilder::visitConvergenceControl(const CallInst &I,
   }
 }
 
+void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I,
+                                               unsigned IntrinsicID) {
+  // For now, we're only lowering an 'add' histogram.
+  // We can add others later, e.g. saturating adds, min/max.
+  assert(IntrinsicID == Intrinsic::experimental_vector_histogram_add &&
+         "Tried to lower unsupported histogram type");
+  SDLoc sdl = getCurSDLoc();
+  Value *Ptr = I.getOperand(0);
+  SDValue Inc = getValue(I.getOperand(1));
+  SDValue Mask = getValue(I.getOperand(2));
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  DataLayout TargetDL = DAG.getDataLayout();
+  EVT VT = Inc.getValueType();
+  Align Alignment = DAG.getEVTAlign(VT);
+
+  const MDNode *Ranges = getRangeMetadata(I);
+
+  SDValue Root = DAG.getRoot();
+  SDValue Base;
+  SDValue Index;
+  ISD::MemIndexType IndexType;
+  SDValue Scale;
+  bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
+                                    I.getParent(), VT.getScalarStoreSize());
+
+  unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
+
+  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+      MachinePointerInfo(AS),
+      MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
+      MemoryLocation::UnknownSize, Alignment, I.getAAMetadata(), Ranges);
+
+  if (!UniformBase) {
+    Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+    Index = getValue(Ptr);
+    IndexType = ISD::SIGNED_SCALED;
+    Scale =
+        DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+  }
+
+  EVT IdxVT = Index.getValueType();
+  EVT EltTy = IdxVT.getVectorElementType();
+  if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
+    EVT NewIdxVT = IdxVT.changeVectorElementType(EltTy);
+    Index = DAG.getNode(ISD::SIGN_EXTEND, sdl, NewIdxVT, Index);
+  }
+
+  SDValue ID = DAG.getTargetConstant(IntrinsicID, sdl, MVT::i32);
+
+  SDValue Ops[] = {Root, Inc, Mask, Base, Index, Scale, ID};
+  SDValue Histogram = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), VT, sdl,
+                                             Ops, MMO, IndexType);
+
+  setValue(&I, Histogram);
+  DAG.setRoot(Histogram);
+}
+
 /// Lower the call to the specified intrinsic function.
 void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
                                              unsigned Intrinsic) {
@@ -6700,22 +6758,24 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::roundeven:
   case Intrinsic::canonicalize: {
     unsigned Opcode;
+    // clang-format off
     switch (Intrinsic) {
     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::sqrt:      Opcode = ISD::FSQRT;      break;
-    case Intrinsic::fabs:      Opcode = ISD::FABS;       break;
-    case Intrinsic::sin:       Opcode = ISD::FSIN;       break;
-    case Intrinsic::cos:       Opcode = ISD::FCOS;       break;
-    case Intrinsic::exp10:     Opcode = ISD::FEXP10;     break;
-    case Intrinsic::floor:     Opcode = ISD::FFLOOR;     break;
-    case Intrinsic::ceil:      Opcode = ISD::FCEIL;      break;
-    case Intrinsic::trunc:     Opcode = ISD::FTRUNC;     break;
-    case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
-    case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
-    case Intrinsic::round:     Opcode = ISD::FROUND;     break;
-    case Intrinsic::roundeven: Opcode = ISD::FROUNDEVEN; break;
+    case Intrinsic::sqrt:         Opcode = ISD::FSQRT;         break;
+    case Intrinsic::fabs:         Opcode = ISD::FABS;          break;
+    case Intrinsic::sin:          Opcode = ISD::FSIN;          break;
+    case Intrinsic::cos:          Opcode = ISD::FCOS;          break;
+    case Intrinsic::exp10:        Opcode = ISD::FEXP10;        break;
+    case Intrinsic::floor:        Opcode = ISD::FFLOOR;        break;
+    case Intrinsic::ceil:         Opcode = ISD::FCEIL;         break;
+    case Intrinsic::trunc:        Opcode = ISD::FTRUNC;        break;
+    case Intrinsic::rint:         Opcode = ISD::FRINT;         break;
+    case Intrinsic::nearbyint:    Opcode = ISD::FNEARBYINT;    break;
+    case Intrinsic::round:        Opcode = ISD::FROUND;        break;
+    case Intrinsic::roundeven:    Opcode = ISD::FROUNDEVEN;    break;
     case Intrinsic::canonicalize: Opcode = ISD::FCANONICALIZE; break;
     }
+    // clang-format on
 
     setValue(&I, DAG.getNode(Opcode, sdl,
                              getValue(I.getArgOperand(0)).getValueType(),
@@ -6727,6 +6787,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::lrint:
   case Intrinsic::llrint: {
     unsigned Opcode;
+    // clang-format off
     switch (Intrinsic) {
     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
     case Intrinsic::lround:  Opcode = ISD::LROUND;  break;
@@ -6734,6 +6795,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     case Intrinsic::lrint:   Opcode = ISD::LRINT;   break;
     case Intrinsic::llrint:  Opcode = ISD::LLRINT;  break;
     }
+    // clang-format on
 
     EVT RetVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
     setValue(&I, DAG.getNode(Opcode, sdl, RetVT,
@@ -7861,20 +7923,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
       Op = DAG.getSetCC(DL, OpVT, Op, AllZero, ISD::SETNE);
     }
 
-    // Find the smallest "sensible" element type to use for the expansion.
-    ConstantRange CR(
-        APInt(64, OpVT.getVectorElementCount().getKnownMinValue()));
-    if (OpVT.isScalableVT())
-      CR = CR.umul_sat(getVScaleRange(I.getCaller(), 64));
-
     // If the zero-is-poison flag is set, we can assume the upper limit
     // of the result is VF-1.
-    if (!cast<ConstantSDNode>(getValue(I.getOperand(1)))->isZero())
-      CR = CR.subtract(APInt(64, 1));
-
-    unsigned EltWidth = I.getType()->getScalarSizeInBits();
-    EltWidth = std::min(EltWidth, (unsigned)CR.getActiveBits());
-    EltWidth = std::max(llvm::bit_ceil(EltWidth), (unsigned)8);
+    bool ZeroIsPoison =
+        !cast<ConstantSDNode>(getValue(I.getOperand(1)))->isZero();
+    ConstantRange VScaleRange(1, true); // Dummy value.
+    if (isa<ScalableVectorType>(I.getOperand(0)->getType()))
+      VScaleRange = getVScaleRange(I.getCaller(), 64);
+    unsigned EltWidth = TLI.getBitWidthForCttzElements(
+        I.getType(), OpVT.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
 
     MVT NewEltTy = MVT::getIntegerVT(EltWidth);
 
@@ -7949,6 +8006,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::experimental_convergence_entry:
   case Intrinsic::experimental_convergence_loop:
     visitConvergenceControl(I, Intrinsic);
+    return;
+  case Intrinsic::experimental_vector_histogram_add: {
+    visitVectorHistogram(I, Intrinsic);
+    return;
+  }
   }
 }
 
@@ -7962,16 +8024,8 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
   SDValue Chain = DAG.getRoot();
   SmallVector<SDValue, 4> Opers;
   Opers.push_back(Chain);
-  if (FPI.isUnaryOp()) {
-    Opers.push_back(getValue(FPI.getArgOperand(0)));
-  } else if (FPI.isTernaryOp()) {
-    Opers.push_back(getValue(FPI.getArgOperand(0)));
-    Opers.push_back(getValue(FPI.getArgOperand(1)));
-    Opers.push_back(getValue(FPI.getArgOperand(2)));
-  } else {
-    Opers.push_back(getValue(FPI.getArgOperand(0)));
-    Opers.push_back(getValue(FPI.getArgOperand(1)));
-  }
+  for (unsigned I = 0, E = FPI.getNonMetadataArgCount(); I != E; ++I)
+    Opers.push_back(getValue(FPI.getArgOperand(I)));
 
   auto pushOutChain = [this](SDValue Result, fp::ExceptionBehavior EB) {
     assert(Result.getNode()->getNumValues() == 2);
@@ -9205,7 +9259,7 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
 
   SDValue Callee = getValue(I.getCalledOperand());
 
-  if (I.countOperandBundlesOfType(LLVMContext::OB_deopt))
+  if (I.hasDeoptState())
     LowerCallSiteWithDeoptBundle(&I, Callee, nullptr);
   else
     // Check if we can potentially perform a tail call. More detailed checking
@@ -12249,9 +12303,8 @@ void SelectionDAGBuilder::visitVectorSplice(const CallInst &I) {
 
   // VECTOR_SHUFFLE doesn't support a scalable mask so use a dedicated node.
   if (VT.isScalableVector()) {
-    MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
     setValue(&I, DAG.getNode(ISD::VECTOR_SPLICE, DL, VT, V1, V2,
-                             DAG.getConstant(Imm, DL, IdxVT)));
+                             DAG.getVectorIdxConstant(Imm, DL)));
     return;
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 211e1653de56..ae361f8c500a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -624,6 +624,7 @@ private:
   void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
   void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
   void visitConvergenceControl(const CallInst &I, unsigned Intrinsic);
+  void visitVectorHistogram(const CallInst &I, unsigned IntrinsicID);
   void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT,
                    const SmallVectorImpl<SDValue> &OpValues);
   void visitVPStore(const VPIntrinsic &VPIntrin,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 4ad4a938ca97..59742e90c679 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -529,6 +529,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::PATCHPOINT:
     return "patchpoint";
 
+  case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
+    return "histogram";
+
     // Vector Predication
 #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...)                    \
   case ISD::SDID:                                                              \
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 336d89fbcf63..7beaeb9b7a17 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -62,9 +62,10 @@ bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
   // the return. Ignore following attributes because they don't affect the
   // call sequence.
   AttrBuilder CallerAttrs(F.getContext(), F.getAttributes().getRetAttrs());
-  for (const auto &Attr : {Attribute::Alignment, Attribute::Dereferenceable,
-                           Attribute::DereferenceableOrNull, Attribute::NoAlias,
-                           Attribute::NonNull, Attribute::NoUndef})
+  for (const auto &Attr :
+       {Attribute::Alignment, Attribute::Dereferenceable,
+        Attribute::DereferenceableOrNull, Attribute::NoAlias,
+        Attribute::NonNull, Attribute::NoUndef, Attribute::Range})
     CallerAttrs.removeAttribute(Attr);
 
   if (CallerAttrs.hasAttributes())
@@ -8400,8 +8401,14 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
   SDValue MinMax;
   unsigned CompOpcIeee = IsMax ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
   unsigned CompOpc = IsMax ? ISD::FMAXNUM : ISD::FMINNUM;
+
+  // FIXME: We should probably define fminnum/fmaxnum variants with correct
+  // signed zero behavior.
+  bool MinMaxMustRespectOrderedZero = false;
+
   if (isOperationLegalOrCustom(CompOpcIeee, VT)) {
     MinMax = DAG.getNode(CompOpcIeee, DL, VT, LHS, RHS);
+    MinMaxMustRespectOrderedZero = true;
   } else if (isOperationLegalOrCustom(CompOpc, VT)) {
     MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS);
   } else {
@@ -8421,8 +8428,8 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
   }
 
   // fminimum/fmaximum requires -0.0 less than +0.0
-  if (!N->getFlags().hasNoSignedZeros() && !DAG.isKnownNeverZeroFloat(RHS) &&
-      !DAG.isKnownNeverZeroFloat(LHS)) {
+  if (!MinMaxMustRespectOrderedZero && !N->getFlags().hasNoSignedZeros() &&
+      !DAG.isKnownNeverZeroFloat(RHS) && !DAG.isKnownNeverZeroFloat(LHS)) {
     SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
                                   DAG.getConstantFP(0.0, DL, VT), ISD::SETEQ);
     SDValue TestZero =
diff --git a/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp b/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
index 778ac1f5701c..687acd90b405 100644
--- a/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
+++ b/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -126,8 +126,7 @@ bool StackMapLiveness::calculateLiveness(MachineFunction &MF) {
   for (auto &MBB : MF) {
     LLVM_DEBUG(dbgs() << "****** BB " << MBB.getName() << " ******\n");
     LiveRegs.init(*TRI);
-    // FIXME: This should probably be addLiveOuts().
-    LiveRegs.addLiveOutsNoPristines(MBB);
+    LiveRegs.addLiveOuts(MBB);
     bool HasStackMap = false;
     // Reverse iterate over all instructions and add the current live register
     // set to an instruction if we encounter a patchpoint instruction.
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 6e7b67ded23c..09b70cfb7227 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1048,6 +1048,24 @@ bool TargetLoweringBase::isFreeAddrSpaceCast(unsigned SrcAS,
   return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
 }
 
+unsigned TargetLoweringBase::getBitWidthForCttzElements(
+    Type *RetTy, ElementCount EC, bool ZeroIsPoison,
+    const ConstantRange *VScaleRange) const {
+  // Find the smallest "sensible" element type to use for the expansion.
+  ConstantRange CR(APInt(64, EC.getKnownMinValue()));
+  if (EC.isScalable())
+    CR = CR.umul_sat(*VScaleRange);
+
+  if (ZeroIsPoison)
+    CR = CR.subtract(APInt(64, 1));
+
+  unsigned EltWidth = RetTy->getScalarSizeInBits();
+  EltWidth = std::min(EltWidth, (unsigned)CR.getActiveBits());
+  EltWidth = std::max(llvm::bit_ceil(EltWidth), (unsigned)8);
+
+  return EltWidth;
+}
+
 void TargetLoweringBase::setJumpIsExpensive(bool isExpensive) {
   // If the command-line option was specified, ignore this request.
   if (!JumpIsExpensiveOverride.getNumOccurrences())
@@ -2249,7 +2267,7 @@ static int getOpEnabled(bool IsSqrt, EVT VT, StringRef Override) {
     if (IsDisabled)
       RecipType = RecipType.substr(1);
 
-    if (RecipType.equals(VTName) || RecipType.equals(VTNameNoSize))
+    if (RecipType == VTName || RecipType == VTNameNoSize)
       return IsDisabled ? TargetLoweringBase::ReciprocalEstimate::Disabled
                         : TargetLoweringBase::ReciprocalEstimate::Enabled;
   }
@@ -2299,7 +2317,7 @@ static int getOpRefinementSteps(bool IsSqrt, EVT VT, StringRef Override) {
       continue;
 
     RecipType = RecipType.substr(0, RefPos);
-    if (RecipType.equals(VTName) || RecipType.equals(VTNameNoSize))
+    if (RecipType == VTName || RecipType == VTNameNoSize)
       return RefSteps;
   }
 
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 2a77a683a901..3e1897ce670a 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -212,13 +212,11 @@ void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx,
     //        identify N64 from just a triple.
     TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
                     dwarf::DW_EH_PE_sdata4;
-    // We don't support PC-relative LSDA references in GAS so we use the default
-    // DW_EH_PE_absptr for those.
 
     // FreeBSD must be explicit about the data size and using pcrel since it's
     // assembler/linker won't do the automatic conversion that the Linux tools
     // do.
-    if (TgtM.getTargetTriple().isOSFreeBSD()) {
+    if (isPositionIndependent() || TgtM.getTargetTriple().isOSFreeBSD()) {
       PersonalityEncoding |= dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
       LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
     }
@@ -733,15 +731,20 @@ calcUniqueIDUpdateFlagsAndSize(const GlobalObject *GO, StringRef SectionName,
       Ctx.isELFGenericMergeableSection(SectionName);
   // If this is the first ocurrence of this section name, treat it as the
   // generic section
-  if (!SymbolMergeable && !SeenSectionNameBefore)
-    return MCContext::GenericSectionID;
+  if (!SymbolMergeable && !SeenSectionNameBefore) {
+    if (TM.getSeparateNamedSections())
+      return NextUniqueID++;
+    else
+      return MCContext::GenericSectionID;
+  }
 
   // Symbols must be placed into sections with compatible entry sizes. Generate
   // unique sections for symbols that have not been assigned to compatible
   // sections.
   const auto PreviousID =
       Ctx.getELFUniqueIDForEntsize(SectionName, Flags, EntrySize);
-  if (PreviousID)
+  if (PreviousID && (!TM.getSeparateNamedSections() ||
+                     *PreviousID == MCContext::GenericSectionID))
     return *PreviousID;
 
   // If the user has specified the same section name as would be created
@@ -1031,7 +1034,7 @@ MCSection *TargetLoweringObjectFileELF::getSectionForMachineBasicBlock(
   // name, or a unique ID for the section.
   SmallString<128> Name;
   StringRef FunctionSectionName = MBB.getParent()->getSection()->getName();
-  if (FunctionSectionName.equals(".text") ||
+  if (FunctionSectionName == ".text" ||
       FunctionSectionName.starts_with(".text.")) {
     // Function is in a regular .text section.
     StringRef FunctionName = MBB.getParent()->getName();
diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp
index 265237ee21dc..c8789cb959fb 100644
--- a/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp
@@ -512,7 +512,7 @@ bool LVPatterns::matchPattern(StringRef Input, const LVMatchInfo &MatchInfo) {
   for (const LVMatch &Match : MatchInfo) {
     switch (Match.Mode) {
     case LVMatchMode::Match:
-      Matched = Input.equals(Match.Pattern);
+      Matched = Input == Match.Pattern;
       break;
     case LVMatchMode::NoCase:
       Matched = Input.equals_insensitive(Match.Pattern);
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
index 2d46414a6986..c45f0e91c435 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
@@ -184,9 +184,8 @@ void LVBinaryReader::mapVirtualAddress(const object::ObjectFile &Obj) {
       consumeError(SectionNameOrErr.takeError());
       continue;
     }
-    if ((*SectionNameOrErr).equals(".text") ||
-        (*SectionNameOrErr).equals("CODE") ||
-        (*SectionNameOrErr).equals(".code")) {
+    if (*SectionNameOrErr == ".text" || *SectionNameOrErr == "CODE" ||
+        *SectionNameOrErr == ".code") {
       DotTextSectionIndex = Section.getIndex();
       // If the object is WebAssembly, update the address offset that
       // will be added to DWARF DW_AT_* attributes.
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewVisitor.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewVisitor.cpp
index 1d0178532882..e89664d360a9 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewVisitor.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewVisitor.cpp
@@ -834,7 +834,7 @@ Error LVSymbolVisitor::visitKnownRecord(CVSymbol &Record,
     // Symbol was created as 'variable'; determine its real kind.
     Symbol->resetIsVariable();
 
-    if (Local.Name.equals("this")) {
+    if (Local.Name == "this") {
       Symbol->setIsParameter();
       Symbol->setIsArtificial();
     } else {
@@ -885,7 +885,7 @@ Error LVSymbolVisitor::visitKnownRecord(CVSymbol &Record,
     Symbol->resetIsVariable();
 
     // Check for the 'this' symbol.
-    if (Local.Name.equals("this")) {
+    if (Local.Name == "this") {
       Symbol->setIsArtificial();
       Symbol->setIsParameter();
     } else {
@@ -1429,7 +1429,7 @@ Error LVSymbolVisitor::visitKnownRecord(CVSymbol &Record, LocalSym &Local) {
 
     // Be sure the 'this' symbol is marked as 'compiler generated'.
     if (bool(Local.Flags & LocalSymFlags::IsCompilerGenerated) ||
-        Local.Name.equals("this")) {
+        Local.Name == "this") {
       Symbol->setIsArtificial();
       Symbol->setIsParameter();
     } else {
@@ -1669,7 +1669,7 @@ Error LVSymbolVisitor::visitKnownRecord(CVSymbol &Record, UDTSym &UDT) {
       Type->resetIncludeInPrint();
     else {
       StringRef RecordName = getRecordName(Types, UDT.Type);
-      if (UDT.Name.equals(RecordName))
+      if (UDT.Name == RecordName)
         Type->resetIncludeInPrint();
       Type->setType(LogicalVisitor->getElement(StreamTPI, UDT.Type));
     }
@@ -2740,7 +2740,7 @@ Error LVLogicalVisitor::visitKnownMember(CVMemberRecord &Record,
             getInnerComponent(NestedTypeName);
         // We have an already created nested type. Add it to the current scope
         // and update all its children if any.
-        if (OuterComponent.size() && OuterComponent.equals(RecordName)) {
+        if (OuterComponent.size() && OuterComponent == RecordName) {
           if (!NestedType->getIsScopedAlready()) {
             Scope->addElement(NestedType);
             NestedType->setIsScopedAlready();
diff --git a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
index d4fc48e146f6..02a9555858e4 100644
--- a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
@@ -355,7 +355,7 @@ std::vector<object::SectionedAddress>
 SymbolizableObjectFile::findSymbol(StringRef Symbol, uint64_t Offset) const {
   std::vector<object::SectionedAddress> Result;
   for (const SymbolDesc &Sym : Symbols) {
-    if (Sym.Name.equals(Symbol)) {
+    if (Sym.Name == Symbol) {
       uint64_t Addr = Sym.Addr;
       if (Offset < Sym.Size)
         Addr += Offset;
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp
index 6a7cab4a5510..298bde46ab75 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp
@@ -92,6 +92,7 @@ void EPCGenericDylibManager::lookupAsync(tpctypes::DylibHandle H,
         if (SerializationErr) {
           cantFail(Result.takeError());
           Complete(std::move(SerializationErr));
+          return;
         }
         Complete(std::move(Result));
       },
@@ -109,6 +110,7 @@ void EPCGenericDylibManager::lookupAsync(tpctypes::DylibHandle H,
         if (SerializationErr) {
           cantFail(Result.takeError());
           Complete(std::move(SerializationErr));
+          return;
         }
         Complete(std::move(Result));
       },
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index edeb563076fd..eaf8c35142de 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -659,7 +659,7 @@ void RuntimeDyldELF::setMipsABI(const ObjectFile &Obj) {
     IsMipsO32ABI = AbiVariant & ELF::EF_MIPS_ABI_O32;
     IsMipsN32ABI = AbiVariant & ELF::EF_MIPS_ABI2;
   }
-  IsMipsN64ABI = Obj.getFileFormatName().equals("elf64-mips");
+  IsMipsN64ABI = Obj.getFileFormatName() == "elf64-mips";
 }
 
 // Return the .TOC. section and offset.
diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp
index 8f80a69c4abd..1719f8ef2b43 100644
--- a/llvm/lib/FileCheck/FileCheck.cpp
+++ b/llvm/lib/FileCheck/FileCheck.cpp
@@ -374,7 +374,7 @@ Expected<NumericVariable *> Pattern::parseNumericVariableDefinition(
 Expected<std::unique_ptr<NumericVariableUse>> Pattern::parseNumericVariableUse(
     StringRef Name, bool IsPseudo, std::optional<size_t> LineNumber,
     FileCheckPatternContext *Context, const SourceMgr &SM) {
-  if (IsPseudo && !Name.equals("@LINE"))
+  if (IsPseudo && Name != "@LINE")
     return ErrorDiagnostic::get(
         SM, Name, "invalid pseudo numeric variable '" + Name + "'");
 
diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
index 7241d15ed1c6..8b6f9ea1f4cc 100644
--- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
+++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
@@ -232,12 +232,13 @@ void createRegisterFunction(Module &M, GlobalVariable *BinDesc,
   // Construct function body
   IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
 
+  Builder.CreateCall(RegFuncC, BinDesc);
+
   // Register the destructors with 'atexit'. This is expected by the CUDA
   // runtime and ensures that we clean up before dynamic objects are destroyed.
-  // This needs to be done before the runtime is called and registers its own.
+  // This needs to be done after plugin initialization to ensure that it is
+  // called before the plugin runtime is destroyed.
   Builder.CreateCall(AtExit, UnregFunc);
-
-  Builder.CreateCall(RegFuncC, BinDesc);
   Builder.CreateRetVoid();
 
   // Add this function to constructors.
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 4d2d352f7520..391a4947877a 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1500,19 +1500,6 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
     };
   }
 
-  // Adjust the finalization stack, verify the adjustment, and call the
-  // finalize function a last time to finalize values between the pre-fini
-  // block and the exit block if we left the parallel "the normal way".
-  auto FiniInfo = FinalizationStack.pop_back_val();
-  (void)FiniInfo;
-  assert(FiniInfo.DK == OMPD_parallel &&
-         "Unexpected finalization stack state!");
-
-  Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
-
-  InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
-  FiniCB(PreFiniIP);
-
   OI.OuterAllocaBB = OuterAllocaBlock;
   OI.EntryBB = PRegEntryBB;
   OI.ExitBB = PRegExitBB;
@@ -1637,6 +1624,19 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
       dbgs() << " PBR: " << BB->getName() << "\n";
   });
 
+  // Adjust the finalization stack, verify the adjustment, and call the
+  // finalize function a last time to finalize values between the pre-fini
+  // block and the exit block if we left the parallel "the normal way".
+  auto FiniInfo = FinalizationStack.pop_back_val();
+  (void)FiniInfo;
+  assert(FiniInfo.DK == OMPD_parallel &&
+         "Unexpected finalization stack state!");
+
+  Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
+
+  InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
+  FiniCB(PreFiniIP);
+
   // Register the outlined info.
   addOutlineInfo(std::move(OI));
 
@@ -1870,6 +1870,9 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
     //    call @__kmpc_omp_task(...)
     //    br label %exit
     //  else:
+    //    ;; Wait for resolution of dependencies, if any, before
+    //    ;; beginning the task
+    //    call @__kmpc_omp_wait_deps(...)
     //    call @__kmpc_omp_task_begin_if0(...)
     //    call @outlined_fn(...)
     //    call @__kmpc_omp_task_complete_if0(...)
@@ -1887,6 +1890,16 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
       SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
                                     &ElseTI);
       Builder.SetInsertPoint(ElseTI);
+
+      if (Dependencies.size()) {
+        Function *TaskWaitFn =
+            getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
+        Builder.CreateCall(
+            TaskWaitFn,
+            {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
+             ConstantInt::get(Builder.getInt32Ty(), 0),
+             ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
+      }
       Function *TaskBeginFn =
           getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
       Function *TaskCompleteFn =
@@ -4401,7 +4414,7 @@ CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
                                           Value *Size, Value *Allocator,
                                           std::string Name) {
   IRBuilder<>::InsertPointGuard IPG(Builder);
-  Builder.restoreIP(Loc.IP);
+  updateToLocation(Loc);
 
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
@@ -4418,7 +4431,7 @@ CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
                                          Value *Addr, Value *Allocator,
                                          std::string Name) {
   IRBuilder<>::InsertPointGuard IPG(Builder);
-  Builder.restoreIP(Loc.IP);
+  updateToLocation(Loc);
 
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
@@ -4434,7 +4447,7 @@ CallInst *OpenMPIRBuilder::createOMPInteropInit(
     omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
     Value *DependenceAddress, bool HaveNowaitClause) {
   IRBuilder<>::InsertPointGuard IPG(Builder);
-  Builder.restoreIP(Loc.IP);
+  updateToLocation(Loc);
 
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
@@ -4462,7 +4475,7 @@ CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
     const LocationDescription &Loc, Value *InteropVar, Value *Device,
     Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
   IRBuilder<>::InsertPointGuard IPG(Builder);
-  Builder.restoreIP(Loc.IP);
+  updateToLocation(Loc);
 
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
@@ -4491,7 +4504,7 @@ CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
                                                Value *DependenceAddress,
                                                bool HaveNowaitClause) {
   IRBuilder<>::InsertPointGuard IPG(Builder);
-  Builder.restoreIP(Loc.IP);
+  updateToLocation(Loc);
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
@@ -4517,7 +4530,7 @@ CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
     const LocationDescription &Loc, llvm::Value *Pointer,
     llvm::ConstantInt *Size, const llvm::Twine &Name) {
   IRBuilder<>::InsertPointGuard IPG(Builder);
-  Builder.restoreIP(Loc.IP);
+  updateToLocation(Loc);
 
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
diff --git a/llvm/lib/FuzzMutate/FuzzerCLI.cpp b/llvm/lib/FuzzMutate/FuzzerCLI.cpp
index 58e4b74f4b22..504532865440 100644
--- a/llvm/lib/FuzzMutate/FuzzerCLI.cpp
+++ b/llvm/lib/FuzzMutate/FuzzerCLI.cpp
@@ -21,7 +21,7 @@ void llvm::parseFuzzerCLOpts(int ArgC, char *ArgV[]) {
 
   int I = 1;
   while (I < ArgC)
-    if (StringRef(ArgV[I++]).equals("-ignore_remaining_args=1"))
+    if (StringRef(ArgV[I++]) == "-ignore_remaining_args=1")
       break;
   while (I < ArgC)
     CLArgs.push_back(ArgV[I++]);
@@ -39,7 +39,7 @@ void llvm::handleExecNameEncodedBEOpts(StringRef ExecName) {
   SmallVector<StringRef, 4> Opts;
   NameAndArgs.second.split(Opts, '-');
   for (StringRef Opt : Opts) {
-    if (Opt.equals("gisel")) {
+    if (Opt == "gisel") {
       Args.push_back("-global-isel");
       // For now we default GlobalISel to -O0
       Args.push_back("-O0");
@@ -151,7 +151,7 @@ int llvm::runFuzzerOnInputs(int ArgC, char *ArgV[], FuzzerTestFun TestOne,
   for (int I = 1; I < ArgC; ++I) {
     StringRef Arg(ArgV[I]);
     if (Arg.starts_with("-")) {
-      if (Arg.equals("-ignore_remaining_args=1"))
+      if (Arg == "-ignore_remaining_args=1")
         break;
       continue;
     }
diff --git a/llvm/lib/IR/AttributeImpl.h b/llvm/lib/IR/AttributeImpl.h
index 58dc14588f41..dc5b80b6da68 100644
--- a/llvm/lib/IR/AttributeImpl.h
+++ b/llvm/lib/IR/AttributeImpl.h
@@ -77,7 +77,7 @@ public:
 
   Type *getValueAsType() const;
 
-  ConstantRange getValueAsConstantRange() const;
+  const ConstantRange &getValueAsConstantRange() const;
 
   /// Used when sorting the attributes.
   bool operator<(const AttributeImpl &AI) const;
@@ -219,7 +219,7 @@ public:
   ConstantRangeAttributeImpl(Attribute::AttrKind Kind, const ConstantRange &CR)
       : EnumAttributeImpl(ConstantRangeAttrEntry, Kind), CR(CR) {}
 
-  ConstantRange getConstantRangeValue() const { return CR; }
+  const ConstantRange &getConstantRangeValue() const { return CR; }
 };
 
 class AttributeBitSet {
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index c6e511b46e51..c8d6bdd42387 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -360,7 +360,7 @@ Type *Attribute::getValueAsType() const {
   return pImpl->getValueAsType();
 }
 
-ConstantRange Attribute::getValueAsConstantRange() const {
+const ConstantRange &Attribute::getValueAsConstantRange() const {
   assert(isConstantRangeAttribute() &&
          "Invalid attribute type to get the value as a ConstantRange!");
   return pImpl->getValueAsConstantRange();
@@ -444,7 +444,7 @@ FPClassTest Attribute::getNoFPClass() const {
   return static_cast<FPClassTest>(pImpl->getValueAsInt());
 }
 
-ConstantRange Attribute::getRange() const {
+const ConstantRange &Attribute::getRange() const {
   assert(hasAttribute(Attribute::Range) &&
          "Trying to get range args from non-range attribute");
   return pImpl->getValueAsConstantRange();
@@ -607,7 +607,7 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
   if (hasAttribute(Attribute::Range)) {
     std::string Result;
     raw_string_ostream OS(Result);
-    ConstantRange CR = getValueAsConstantRange();
+    const ConstantRange &CR = getValueAsConstantRange();
     OS << "range(";
     OS << "i" << CR.getBitWidth() << " ";
     OS << CR.getLower() << ", " << CR.getUpper();
@@ -735,7 +735,7 @@ Type *AttributeImpl::getValueAsType() const {
   return static_cast<const TypeAttributeImpl *>(this)->getTypeValue();
 }
 
-ConstantRange AttributeImpl::getValueAsConstantRange() const {
+const ConstantRange &AttributeImpl::getValueAsConstantRange() const {
   assert(isConstantRangeAttribute());
   return static_cast<const ConstantRangeAttributeImpl *>(this)
       ->getConstantRangeValue();
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 5c65efba9e50..a7ed2de6e8a5 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -5406,6 +5406,14 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
     return Res;
   }
 
+  // AArch64 data layout upgrades.
+  if (T.isAArch64()) {
+    // Add "-Fn32"
+    if (!DL.empty() && !DL.contains("-Fn32"))
+      Res.append("-Fn32");
+    return Res;
+  }
+
   if (!T.isX86())
     return Res;
 
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 29f2cbf611fa..aea9425ebeba 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -181,7 +181,7 @@ template class llvm::SymbolTableListTraits<Instruction,
 BasicBlock::BasicBlock(LLVMContext &C, const Twine &Name, Function *NewParent,
                        BasicBlock *InsertBefore)
     : Value(Type::getLabelTy(C), Value::BasicBlockVal),
-      IsNewDbgInfoFormat(false), Parent(nullptr) {
+      IsNewDbgInfoFormat(UseNewDbgInfoFormat), Parent(nullptr) {
 
   if (NewParent)
     insertInto(NewParent, InsertBefore);
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index 5268eccf7014..db442c54125a 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -315,8 +315,8 @@ bool Constant::isElementWiseEqual(Value *Y) const {
   Type *IntTy = VectorType::getInteger(VTy);
   Constant *C0 = ConstantExpr::getBitCast(const_cast<Constant *>(this), IntTy);
   Constant *C1 = ConstantExpr::getBitCast(cast<Constant>(Y), IntTy);
-  Constant *CmpEq = ConstantExpr::getICmp(ICmpInst::ICMP_EQ, C0, C1);
-  return isa<PoisonValue>(CmpEq) || match(CmpEq, m_One());
+  Constant *CmpEq = ConstantFoldCompareInstruction(ICmpInst::ICMP_EQ, C0, C1);
+  return CmpEq && (isa<PoisonValue>(CmpEq) || match(CmpEq, m_One()));
 }
 
 static bool
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 6aff94f39d9c..df90b8834112 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -14,6 +14,7 @@
 #include "llvm-c/Core.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -33,6 +34,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Threading.h"
 #include "llvm/Support/raw_ostream.h"
@@ -45,6 +47,10 @@ using namespace llvm;
 
 DEFINE_SIMPLE_CONVERSION_FUNCTIONS(OperandBundleDef, LLVMOperandBundleRef)
 
+inline BasicBlock **unwrap(LLVMBasicBlockRef *BBs) {
+  return reinterpret_cast<BasicBlock **>(BBs);
+}
+
 #define DEBUG_TYPE "ir"
 
 void llvm::initializeCore(PassRegistry &Registry) {
@@ -178,6 +184,20 @@ LLVMTypeRef LLVMGetTypeAttributeValue(LLVMAttributeRef A) {
   return wrap(Attr.getValueAsType());
 }
 
+LLVMAttributeRef LLVMCreateConstantRangeAttribute(LLVMContextRef C,
+                                                  unsigned KindID,
+                                                  unsigned NumBits,
+                                                  const uint64_t LowerWords[],
+                                                  const uint64_t UpperWords[]) {
+  auto &Ctx = *unwrap(C);
+  auto AttrKind = (Attribute::AttrKind)KindID;
+  unsigned NumWords = divideCeil(NumBits, 64);
+  return wrap(Attribute::get(
+      Ctx, AttrKind,
+      ConstantRange(APInt(NumBits, ArrayRef(LowerWords, NumWords)),
+                    APInt(NumBits, ArrayRef(UpperWords, NumWords)))));
+}
+
 LLVMAttributeRef LLVMCreateStringAttribute(LLVMContextRef C,
                                            const char *K, unsigned KLength,
                                            const char *V, unsigned VLength) {
@@ -3015,6 +3035,18 @@ void LLVMSetUnwindDest(LLVMValueRef Invoke, LLVMBasicBlockRef B) {
   unwrap<InvokeInst>(Invoke)->setUnwindDest(unwrap(B));
 }
 
+LLVMBasicBlockRef LLVMGetCallBrDefaultDest(LLVMValueRef CallBr) {
+  return wrap(unwrap<CallBrInst>(CallBr)->getDefaultDest());
+}
+
+unsigned LLVMGetCallBrNumIndirectDests(LLVMValueRef CallBr) {
+  return unwrap<CallBrInst>(CallBr)->getNumIndirectDests();
+}
+
+LLVMBasicBlockRef LLVMGetCallBrIndirectDest(LLVMValueRef CallBr, unsigned Idx) {
+  return wrap(unwrap<CallBrInst>(CallBr)->getIndirectDest(Idx));
+}
+
 /*--.. Operations on terminators ...........................................--*/
 
 unsigned LLVMGetNumSuccessors(LLVMValueRef Term) {
@@ -3242,6 +3274,25 @@ LLVMValueRef LLVMBuildIndirectBr(LLVMBuilderRef B, LLVMValueRef Addr,
   return wrap(unwrap(B)->CreateIndirectBr(unwrap(Addr), NumDests));
 }
 
+LLVMValueRef LLVMBuildCallBr(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Fn,
+                             LLVMBasicBlockRef DefaultDest,
+                             LLVMBasicBlockRef *IndirectDests,
+                             unsigned NumIndirectDests, LLVMValueRef *Args,
+                             unsigned NumArgs, LLVMOperandBundleRef *Bundles,
+                             unsigned NumBundles, const char *Name) {
+
+  SmallVector<OperandBundleDef, 8> OBs;
+  for (auto *Bundle : ArrayRef(Bundles, NumBundles)) {
+    OperandBundleDef *OB = unwrap(Bundle);
+    OBs.push_back(*OB);
+  }
+
+  return wrap(unwrap(B)->CreateCallBr(
+      unwrap<FunctionType>(Ty), unwrap(Fn), unwrap(DefaultDest),
+      ArrayRef(unwrap(IndirectDests), NumIndirectDests),
+      ArrayRef<Value *>(unwrap(Args), NumArgs), OBs, Name));
+}
+
 LLVMValueRef LLVMBuildInvoke2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Fn,
                               LLVMValueRef *Args, unsigned NumArgs,
                               LLVMBasicBlockRef Then, LLVMBasicBlockRef Catch,
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 7976904b1fe9..4c3f37ceaaa4 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -2130,6 +2130,30 @@ bool at::calculateFragmentIntersect(
                                         SliceSizeInBits, DVRAssign, Result);
 }
 
+/// Update inlined instructions' DIAssignID metadata. We need to do this
+/// otherwise a function inlined more than once into the same function
+/// will cause DIAssignID to be shared by many instructions.
+void at::remapAssignID(DenseMap<DIAssignID *, DIAssignID *> &Map,
+                       Instruction &I) {
+  auto GetNewID = [&Map](Metadata *Old) {
+    DIAssignID *OldID = cast<DIAssignID>(Old);
+    if (DIAssignID *NewID = Map.lookup(OldID))
+      return NewID;
+    DIAssignID *NewID = DIAssignID::getDistinct(OldID->getContext());
+    Map[OldID] = NewID;
+    return NewID;
+  };
+  // If we find a DIAssignID attachment or use, replace it with a new version.
+  for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+    if (DVR.isDbgAssign())
+      DVR.setAssignId(GetNewID(DVR.getAssignID()));
+  }
+  if (auto *ID = I.getMetadata(LLVMContext::MD_DIAssignID))
+    I.setMetadata(LLVMContext::MD_DIAssignID, GetNewID(ID));
+  else if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I))
+    DAI->setAssignId(GetNewID(DAI->getAssignID()));
+}
+
 /// Collect constant properies (base, size, offset) of \p StoreDest.
 /// Return std::nullopt if any properties are not constants or the
 /// offset from the base pointer is negative.
diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp
index fbca7cdfcf3f..9a4926c81dca 100644
--- a/llvm/lib/IR/DebugProgramInstruction.cpp
+++ b/llvm/lib/IR/DebugProgramInstruction.cpp
@@ -366,8 +366,8 @@ void DbgVariableRecord::setKillLocation() {
 }
 
 bool DbgVariableRecord::isKillLocation() const {
-  return (getNumVariableLocationOps() == 0 &&
-          !getExpression()->isComplex()) ||
+  return (!hasArgList() && isa<MDNode>(getRawLocation())) ||
+         (getNumVariableLocationOps() == 0 && !getExpression()->isComplex()) ||
          any_of(location_ops(), [](Value *V) { return isa<UndefValue>(V); });
 }
 
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 690184080657..7f1e832f8597 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -83,6 +83,8 @@ static cl::opt<unsigned> NonGlobalValueMaxNameSize(
     "non-global-value-max-name-size", cl::Hidden, cl::init(1024),
     cl::desc("Maximum size for the name of non-global values."));
 
+extern cl::opt<bool> UseNewDbgInfoFormat;
+
 void Function::convertToNewDbgValues() {
   IsNewDbgInfoFormat = true;
   for (auto &BB : *this) {
@@ -438,7 +440,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace,
     : GlobalObject(Ty, Value::FunctionVal,
                    OperandTraits<Function>::op_begin(this), 0, Linkage, name,
                    computeAddrSpace(AddrSpace, ParentModule)),
-      NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(false) {
+      NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(UseNewDbgInfoFormat) {
   assert(FunctionType::isValidReturnType(getReturnType()) &&
          "invalid return type");
   setGlobalObjectSubClassData(0);
@@ -1491,7 +1493,19 @@ bool Intrinsic::isConstrainedFPIntrinsic(ID QID) {
 #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)                         \
   case Intrinsic::INTRINSIC:
 #include "llvm/IR/ConstrainedOps.def"
+#undef INSTRUCTION
     return true;
+  default:
+    return false;
+  }
+}
+
+bool Intrinsic::hasConstrainedFPRoundingModeOperand(Intrinsic::ID QID) {
+  switch (QID) {
+#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)                         \
+  case Intrinsic::INTRINSIC:                                                   \
+    return ROUND_MODE == 1;
+#include "llvm/IR/ConstrainedOps.def"
 #undef INSTRUCTION
   default:
     return false;
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 9ec5a7deeec6..c6f20af0f1df 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -1029,17 +1029,7 @@ CallInst *IRBuilderBase::CreateConstrainedFPCast(
     UseFMF = FMFSource->getFastMathFlags();
 
   CallInst *C;
-  bool HasRoundingMD = false;
-  switch (ID) {
-  default:
-    break;
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)        \
-  case Intrinsic::INTRINSIC:                                \
-    HasRoundingMD = ROUND_MODE;                             \
-    break;
-#include "llvm/IR/ConstrainedOps.def"
-  }
-  if (HasRoundingMD) {
+  if (Intrinsic::hasConstrainedFPRoundingModeOperand(ID)) {
     Value *RoundingV = getConstrainedFPRounding(Rounding);
     C = CreateIntrinsic(ID, {DestTy, V->getType()}, {V, RoundingV, ExceptV},
                         nullptr, Name);
@@ -1088,17 +1078,8 @@ CallInst *IRBuilderBase::CreateConstrainedFPCall(
   llvm::SmallVector<Value *, 6> UseArgs;
 
   append_range(UseArgs, Args);
-  bool HasRoundingMD = false;
-  switch (Callee->getIntrinsicID()) {
-  default:
-    break;
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)        \
-  case Intrinsic::INTRINSIC:                                \
-    HasRoundingMD = ROUND_MODE;                             \
-    break;
-#include "llvm/IR/ConstrainedOps.def"
-  }
-  if (HasRoundingMD)
+
+  if (Intrinsic::hasConstrainedFPRoundingModeOperand(Callee->getIntrinsicID()))
     UseArgs.push_back(getConstrainedFPRounding(Rounding));
   UseArgs.push_back(getConstrainedFPExcept(Except));
 
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index c71a1926258a..ef7b679242f6 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1266,12 +1266,14 @@ Instruction *Instruction::cloneImpl() const {
 
 void Instruction::swapProfMetadata() {
   MDNode *ProfileData = getBranchWeightMDNode(*this);
-  if (!isBranchWeightMD(ProfileData))
+  if (!ProfileData)
     return;
-
-  SmallVector<Metadata *, 4> Ops;
   unsigned FirstIdx = getBranchWeightOffset(ProfileData);
+  if (ProfileData->getNumOperands() != 2 + FirstIdx)
+    return;
+
   unsigned SecondIdx = FirstIdx + 1;
+  SmallVector<Metadata *, 4> Ops;
   // If there are more weights past the second, we can't swap them
   if (ProfileData->getNumOperands() > SecondIdx + 1)
     return;
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 2173b609ba3b..5aae6c36e284 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -454,24 +454,14 @@ bool CallBase::paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const {
 }
 
 bool CallBase::hasFnAttrOnCalledFunction(Attribute::AttrKind Kind) const {
-  Value *V = getCalledOperand();
-  if (auto *CE = dyn_cast<ConstantExpr>(V))
-    if (CE->getOpcode() == BitCast)
-      V = CE->getOperand(0);
-
-  if (auto *F = dyn_cast<Function>(V))
+  if (auto *F = dyn_cast<Function>(getCalledOperand()))
     return F->getAttributes().hasFnAttr(Kind);
 
   return false;
 }
 
 bool CallBase::hasFnAttrOnCalledFunction(StringRef Kind) const {
-  Value *V = getCalledOperand();
-  if (auto *CE = dyn_cast<ConstantExpr>(V))
-    if (CE->getOpcode() == BitCast)
-      V = CE->getOperand(0);
-
-  if (auto *F = dyn_cast<Function>(V))
+  if (auto *F = dyn_cast<Function>(getCalledOperand()))
     return F->getAttributes().hasFnAttr(Kind);
 
   return false;
@@ -485,12 +475,7 @@ Attribute CallBase::getFnAttrOnCalledFunction(AK Kind) const {
     assert(Kind != Attribute::Memory && "Use getMemoryEffects() instead");
   }
 
-  Value *V = getCalledOperand();
-  if (auto *CE = dyn_cast<ConstantExpr>(V))
-    if (CE->getOpcode() == BitCast)
-      V = CE->getOperand(0);
-
-  if (auto *F = dyn_cast<Function>(V))
+  if (auto *F = dyn_cast<Function>(getCalledOperand()))
     return F->getAttributes().getFnAttr(Kind);
 
   return Attribute();
@@ -500,6 +485,22 @@ template Attribute
 CallBase::getFnAttrOnCalledFunction(Attribute::AttrKind Kind) const;
 template Attribute CallBase::getFnAttrOnCalledFunction(StringRef Kind) const;
 
+template <typename AK>
+Attribute CallBase::getParamAttrOnCalledFunction(unsigned ArgNo,
+                                                 AK Kind) const {
+  Value *V = getCalledOperand();
+
+  if (auto *F = dyn_cast<Function>(V))
+    return F->getAttributes().getParamAttr(ArgNo, Kind);
+
+  return Attribute();
+}
+template Attribute
+CallBase::getParamAttrOnCalledFunction(unsigned ArgNo,
+                                       Attribute::AttrKind Kind) const;
+template Attribute CallBase::getParamAttrOnCalledFunction(unsigned ArgNo,
+                                                          StringRef Kind) const;
+
 void CallBase::getOperandBundlesAsDefs(
     SmallVectorImpl<OperandBundleDef> &Defs) const {
   for (unsigned i = 0, e = getNumOperandBundles(); i != e; ++i)
@@ -926,6 +927,18 @@ LandingPadInst *InvokeInst::getLandingPadInst() const {
   return cast<LandingPadInst>(getUnwindDest()->getFirstNonPHI());
 }
 
+void InvokeInst::updateProfWeight(uint64_t S, uint64_t T) {
+  if (T == 0) {
+    LLVM_DEBUG(dbgs() << "Attempting to update profile weights will result in "
+                         "div by 0. Ignoring. Likely the function "
+                      << getParent()->getParent()->getName()
+                      << " has 0 entry count, and contains call instructions "
+                         "with non-zero prof info.");
+    return;
+  }
+  scaleProfData(*this, S, T);
+}
+
 //===----------------------------------------------------------------------===//
 //                        CallBrInst Implementation
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index 6743b315c74a..e17755c8ad57 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -365,37 +365,23 @@ FCmpInst::Predicate ConstrainedFPCmpIntrinsic::getPredicate() const {
   return getFPPredicateFromMD(getArgOperand(2));
 }
 
-bool ConstrainedFPIntrinsic::isUnaryOp() const {
-  switch (getIntrinsicID()) {
-  default:
-    return false;
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)                         \
-  case Intrinsic::INTRINSIC:                                                   \
-    return NARG == 1;
-#include "llvm/IR/ConstrainedOps.def"
-  }
-}
+unsigned ConstrainedFPIntrinsic::getNonMetadataArgCount() const {
+  // All constrained fp intrinsics have "fpexcept" metadata.
+  unsigned NumArgs = arg_size() - 1;
 
-bool ConstrainedFPIntrinsic::isTernaryOp() const {
-  switch (getIntrinsicID()) {
-  default:
-    return false;
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)                         \
-  case Intrinsic::INTRINSIC:                                                   \
-    return NARG == 3;
-#include "llvm/IR/ConstrainedOps.def"
-  }
+  // Some intrinsics have "round" metadata.
+  if (Intrinsic::hasConstrainedFPRoundingModeOperand(getIntrinsicID()))
+    NumArgs -= 1;
+
+  // Compare intrinsics take their predicate as metadata.
+  if (isa<ConstrainedFPCmpIntrinsic>(this))
+    NumArgs -= 1;
+
+  return NumArgs;
 }
 
 bool ConstrainedFPIntrinsic::classof(const IntrinsicInst *I) {
-  switch (I->getIntrinsicID()) {
-#define INSTRUCTION(NAME, NARGS, ROUND_MODE, INTRINSIC)                        \
-  case Intrinsic::INTRINSIC:
-#include "llvm/IR/ConstrainedOps.def"
-    return true;
-  default:
-    return false;
-  }
+  return Intrinsic::isConstrainedFPIntrinsic(I->getIntrinsicID());
 }
 
 ElementCount VPIntrinsic::getStaticVectorLength() const {
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 2713015c266c..399fe0dad26c 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -825,19 +825,26 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
   bool isDefinition() const { return SPFlags & DISubprogram::SPFlagDefinition; }
 
   unsigned getHashValue() const {
+    // Use the Scope's linkage name instead of using the scope directly, as the
+    // scope may be a temporary one which can replaced, which would produce a
+    // different hash for the same DISubprogram.
+    llvm::StringRef ScopeLinkageName;
+    if (auto *CT = dyn_cast_or_null<DICompositeType>(Scope))
+      if (auto *ID = CT->getRawIdentifier())
+        ScopeLinkageName = ID->getString();
+
     // If this is a declaration inside an ODR type, only hash the type and the
     // name.  Otherwise the hash will be stronger than
     // MDNodeSubsetEqualImpl::isDeclarationOfODRMember().
-    if (!isDefinition() && LinkageName)
-      if (auto *CT = dyn_cast_or_null<DICompositeType>(Scope))
-        if (CT->getRawIdentifier())
-          return hash_combine(LinkageName, Scope);
+    if (!isDefinition() && LinkageName &&
+        isa_and_nonnull<DICompositeType>(Scope))
+      return hash_combine(LinkageName, ScopeLinkageName);
 
     // Intentionally computes the hash on a subset of the operands for
     // performance reason. The subset has to be significant enough to avoid
     // collision "most of the time". There is no correctness issue in case of
     // collision because of the full check above.
-    return hash_combine(Name, Scope, File, Type, Line);
+    return hash_combine(Name, ScopeLinkageName, File, Type, Line);
   }
 };
 
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index a8696ed9e3ce..915fa5097383 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -54,6 +54,8 @@
 
 using namespace llvm;
 
+extern cl::opt<bool> UseNewDbgInfoFormat;
+
 //===----------------------------------------------------------------------===//
 // Methods to implement the globals and functions lists.
 //
@@ -72,7 +74,7 @@ template class llvm::SymbolTableListTraits<GlobalIFunc>;
 Module::Module(StringRef MID, LLVMContext &C)
     : Context(C), ValSymTab(std::make_unique<ValueSymbolTable>(-1)),
       ModuleID(std::string(MID)), SourceFileName(std::string(MID)), DL(""),
-      IsNewDbgInfoFormat(false) {
+      IsNewDbgInfoFormat(UseNewDbgInfoFormat) {
   Context.addModule(this);
 }
 
diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp
index f1543d401fd4..f738d76937c2 100644
--- a/llvm/lib/IR/ProfDataUtils.cpp
+++ b/llvm/lib/IR/ProfDataUtils.cpp
@@ -43,6 +43,9 @@ namespace {
 // the minimum number of operands for MD_prof nodes with branch weights
 constexpr unsigned MinBWOps = 3;
 
+// the minimum number of operands for MD_prof nodes with value profiles
+constexpr unsigned MinVPOps = 5;
+
 // We may want to add support for other MD_prof types, so provide an abstraction
 // for checking the metadata type.
 bool isTargetMD(const MDNode *ProfData, const char *Name, unsigned MinOps) {
@@ -95,11 +98,25 @@ bool isBranchWeightMD(const MDNode *ProfileData) {
   return isTargetMD(ProfileData, "branch_weights", MinBWOps);
 }
 
+bool isValueProfileMD(const MDNode *ProfileData) {
+  return isTargetMD(ProfileData, "VP", MinVPOps);
+}
+
 bool hasBranchWeightMD(const Instruction &I) {
   auto *ProfileData = I.getMetadata(LLVMContext::MD_prof);
   return isBranchWeightMD(ProfileData);
 }
 
+bool hasCountTypeMD(const Instruction &I) {
+  auto *ProfileData = I.getMetadata(LLVMContext::MD_prof);
+  // Value profiles record count-type information.
+  if (isValueProfileMD(ProfileData))
+    return true;
+  // Conservatively assume non CallBase instruction only get taken/not-taken
+  // branch probability, so not interpret them as count.
+  return isa<CallBase>(I) && !isBranchWeightMD(ProfileData);
+}
+
 bool hasValidBranchWeightMD(const Instruction &I) {
   return getValidBranchWeightMDNode(I);
 }
@@ -233,6 +250,9 @@ void scaleProfData(Instruction &I, uint64_t S, uint64_t T) {
                         ProfDataName->getString() != "VP"))
     return;
 
+  if (!hasCountTypeMD(I))
+    return;
+
   LLVMContext &C = I.getContext();
 
   MDBuilder MDB(C);
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index b9b32eccb31c..71a1800c8167 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -153,8 +153,8 @@ struct VerifierSupport {
   bool TreatBrokenDebugInfoAsError = true;
 
   explicit VerifierSupport(raw_ostream *OS, const Module &M)
-      : OS(OS), M(M), MST(&M), TT(M.getTargetTriple()), DL(M.getDataLayout()),
-        Context(M.getContext()) {}
+      : OS(OS), M(M), MST(&M), TT(Triple::normalize(M.getTargetTriple())),
+        DL(M.getDataLayout()), Context(M.getContext()) {}
 
 private:
   void Write(const Module *M) {
@@ -2066,7 +2066,8 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty,
           "Invalid value for 'nofpclass' test mask", V);
   }
   if (Attrs.hasAttribute(Attribute::Range)) {
-    auto CR = Attrs.getAttribute(Attribute::Range).getValueAsConstantRange();
+    const ConstantRange &CR =
+        Attrs.getAttribute(Attribute::Range).getValueAsConstantRange();
     Check(Ty->isIntOrIntVectorTy(CR.getBitWidth()),
           "Range bit width must match type bit width!", V);
   }
@@ -5386,11 +5387,13 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
   }
 #define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID:
 #include "llvm/IR/VPIntrinsics.def"
+#undef BEGIN_REGISTER_VP_INTRINSIC
     visitVPIntrinsic(cast<VPIntrinsic>(Call));
     break;
 #define INSTRUCTION(NAME, NARGS, ROUND_MODE, INTRINSIC)                        \
   case Intrinsic::INTRINSIC:
 #include "llvm/IR/ConstrainedOps.def"
+#undef INSTRUCTION
     visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(Call));
     break;
   case Intrinsic::dbg_declare: // llvm.dbg.declare
@@ -6529,19 +6532,13 @@ void Verifier::visitVPIntrinsic(VPIntrinsic &VPI) {
 }
 
 void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
-  unsigned NumOperands;
-  bool HasRoundingMD;
-  switch (FPI.getIntrinsicID()) {
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)                         \
-  case Intrinsic::INTRINSIC:                                                   \
-    NumOperands = NARG;                                                        \
-    HasRoundingMD = ROUND_MODE;                                                \
-    break;
-#include "llvm/IR/ConstrainedOps.def"
-  default:
-    llvm_unreachable("Invalid constrained FP intrinsic!");
-  }
+  unsigned NumOperands = FPI.getNonMetadataArgCount();
+  bool HasRoundingMD =
+      Intrinsic::hasConstrainedFPRoundingModeOperand(FPI.getIntrinsicID());
+
+  // Add the expected number of metadata operands.
   NumOperands += (1 + HasRoundingMD);
+
   // Compare intrinsics carry an extra predicate metadata operand.
   if (isa<ConstrainedFPCmpIntrinsic>(FPI))
     NumOperands += 1;
@@ -6555,8 +6552,8 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
     Type *ResultTy = FPI.getType();
     Check(!ValTy->isVectorTy() && !ResultTy->isVectorTy(),
           "Intrinsic does not support vectors", &FPI);
-  }
     break;
+  }
 
   case Intrinsic::experimental_constrained_lround:
   case Intrinsic::experimental_constrained_llround: {
@@ -6595,8 +6592,8 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
             "Intrinsic first argument and result vector lengths must be equal",
             &FPI);
     }
-  }
     break;
+  }
 
   case Intrinsic::experimental_constrained_sitofp:
   case Intrinsic::experimental_constrained_uitofp: {
@@ -6618,7 +6615,8 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
             "Intrinsic first argument and result vector lengths must be equal",
             &FPI);
     }
-  } break;
+    break;
+  }
 
   case Intrinsic::experimental_constrained_fptrunc:
   case Intrinsic::experimental_constrained_fpext: {
@@ -6647,8 +6645,8 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
             "Intrinsic first argument's type must be smaller than result type",
             &FPI);
     }
-  }
     break;
+  }
 
   default:
     break;
diff --git a/llvm/lib/LTO/LTOModule.cpp b/llvm/lib/LTO/LTOModule.cpp
index f839fe944e18..eac78069f4d2 100644
--- a/llvm/lib/LTO/LTOModule.cpp
+++ b/llvm/lib/LTO/LTOModule.cpp
@@ -694,7 +694,7 @@ bool LTOModule::hasCtorDtor() const {
     if (auto *GV = dyn_cast_if_present<GlobalValue *>(Sym)) {
       StringRef Name = GV->getName();
       if (Name.consume_front("llvm.global_")) {
-        if (Name.equals("ctors") || Name.equals("dtors"))
+        if (Name == "ctors" || Name == "dtors")
           return true;
       }
     }
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 3dc70a401589..f257d0d9e83f 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -468,7 +468,7 @@ void MCAsmStreamer::emitRawComment(const Twine &T, bool TabPrefix) {
 
 void MCAsmStreamer::addExplicitComment(const Twine &T) {
   StringRef c = T.getSingleStringRef();
-  if (c.equals(StringRef(MAI->getSeparatorString())))
+  if (c == MAI->getSeparatorString())
     return;
   if (c.starts_with(StringRef("//"))) {
     ExplicitCommentToEmit.append("\t");
diff --git a/llvm/lib/MC/MCDXContainerWriter.cpp b/llvm/lib/MC/MCDXContainerWriter.cpp
index 0580dc7e4282..ff64c6e538ac 100644
--- a/llvm/lib/MC/MCDXContainerWriter.cpp
+++ b/llvm/lib/MC/MCDXContainerWriter.cpp
@@ -117,9 +117,11 @@ uint64_t DXContainerObjectWriter::writeObject(MCAssembler &Asm,
 
       const Triple &TT = Asm.getContext().getTargetTriple();
       VersionTuple Version = TT.getOSVersion();
-      Header.MajorVersion = static_cast<uint8_t>(Version.getMajor());
-      if (Version.getMinor())
-        Header.MinorVersion = static_cast<uint8_t>(*Version.getMinor());
+      uint8_t MajorVersion = static_cast<uint8_t>(Version.getMajor());
+      uint8_t MinorVersion =
+          static_cast<uint8_t>(Version.getMinor().value_or(0));
+      Header.Version =
+          dxbc::ProgramHeader::getVersion(MajorVersion, MinorVersion);
       if (TT.hasEnvironment())
         Header.ShaderKind =
             static_cast<uint16_t>(TT.getEnvironment() - Triple::Pixel);
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index 1f8f8ec55727..045b566aae78 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -343,7 +343,9 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
   case Triple::mips64el:
     // We cannot use DW_EH_PE_sdata8 for the large PositionIndependent case
     // since there is no R_MIPS_PC64 relocation (only a 32-bit version).
-    if (PositionIndependent && !Large)
+    // In fact DW_EH_PE_sdata4 is enough for us now, and GNU ld doesn't
+    // support pcrel|sdata8 well. Let's use sdata4 for now.
+    if (PositionIndependent)
       FDECFIEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
     else
       FDECFIEncoding = Ctx->getAsmInfo()->getCodePointerSize() == 4
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 76a3e501f459..8d9acd54e879 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -4543,7 +4543,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
 
     // Emit an error if two (or more) named parameters share the same name
     for (const MCAsmMacroParameter& CurrParam : Parameters)
-      if (CurrParam.Name.equals(Parameter.Name))
+      if (CurrParam.Name == Parameter.Name)
         return TokError("macro '" + Name + "' has multiple parameters"
                         " named '" + Parameter.Name + "'");
 
diff --git a/llvm/lib/MC/MCParser/DarwinAsmParser.cpp b/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
index 3cd44e7195be..a97b72997ae3 100644
--- a/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -705,7 +705,7 @@ bool DarwinAsmParser::parseDirectiveSection(StringRef, SMLoc) {
                                    .Case("__datacoal_nt", "__data")
                                    .Default(Section);
 
-    if (!Section.equals(NonCoalSection)) {
+    if (Section != NonCoalSection) {
       StringRef SectionVal(Loc.getPointer());
       size_t B = SectionVal.find(',') + 1, E = SectionVal.find(',', B);
       SMLoc BLoc = SMLoc::getFromPointer(SectionVal.data() + B);
diff --git a/llvm/lib/MC/MCSymbolXCOFF.cpp b/llvm/lib/MC/MCSymbolXCOFF.cpp
index b4c96a1ffa23..599a3946a1ed 100644
--- a/llvm/lib/MC/MCSymbolXCOFF.cpp
+++ b/llvm/lib/MC/MCSymbolXCOFF.cpp
@@ -13,7 +13,7 @@ using namespace llvm;
 MCSectionXCOFF *MCSymbolXCOFF::getRepresentedCsect() const {
   assert(RepresentedCsect &&
          "Trying to get csect representation of this symbol but none was set.");
-  assert(getSymbolTableName().equals(RepresentedCsect->getSymbolTableName()) &&
+  assert(getSymbolTableName() == RepresentedCsect->getSymbolTableName() &&
          "SymbolTableNames need to be the same for this symbol and its csect "
          "representation.");
   return RepresentedCsect;
@@ -24,7 +24,7 @@ void MCSymbolXCOFF::setRepresentedCsect(MCSectionXCOFF *C) {
   assert((!RepresentedCsect || RepresentedCsect == C) &&
          "Trying to set a csect that doesn't match the one that this symbol is "
          "already mapped to.");
-  assert(getSymbolTableName().equals(C->getSymbolTableName()) &&
+  assert(getSymbolTableName() == C->getSymbolTableName() &&
          "SymbolTableNames need to be the same for this symbol and its csect "
          "representation.");
   RepresentedCsect = C;
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
index 2e3ebe3d9073..bcf065c56691 100644
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -320,9 +320,9 @@ void InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
 
   unsigned NumVariadicOps = MCI.getNumOperands() - MCDesc.getNumOperands();
   ID.Writes.resize(TotalDefs + NumVariadicOps);
-  // Iterate over the operands list, and skip non-register operands.
-  // The first NumExplicitDefs register operands are expected to be register
-  // definitions.
+  // Iterate over the operands list, and skip non-register or constant register
+  // operands. The first NumExplicitDefs register operands are expected to be
+  // register definitions.
   unsigned CurrentDef = 0;
   unsigned OptionalDefIdx = MCDesc.getNumOperands() - 1;
   unsigned i = 0;
@@ -335,6 +335,10 @@ void InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
       OptionalDefIdx = CurrentDef++;
       continue;
     }
+    if (MRI.isConstant(Op.getReg())) {
+      CurrentDef++;
+      continue;
+    }
 
     WriteDescriptor &Write = ID.Writes[CurrentDef];
     Write.OpIndex = i;
@@ -413,6 +417,8 @@ void InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
     const MCOperand &Op = MCI.getOperand(OpIndex);
     if (!Op.isReg())
       continue;
+    if (MRI.isConstant(Op.getReg()))
+      continue;
 
     WriteDescriptor &Write = ID.Writes[CurrentDef];
     Write.OpIndex = OpIndex;
@@ -448,6 +454,8 @@ void InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI,
     const MCOperand &Op = MCI.getOperand(OpIndex);
     if (!Op.isReg())
       continue;
+    if (MRI.isConstant(Op.getReg()))
+      continue;
 
     ReadDescriptor &Read = ID.Reads[CurrentUse];
     Read.OpIndex = OpIndex;
@@ -465,6 +473,8 @@ void InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI,
     Read.OpIndex = ~I;
     Read.UseIndex = NumExplicitUses + I;
     Read.RegisterID = MCDesc.implicit_uses()[I];
+    if (MRI.isConstant(Read.RegisterID))
+      continue;
     Read.SchedClassID = SchedClassID;
     LLVM_DEBUG(dbgs() << "\t\t[Use][I] OpIdx=" << ~Read.OpIndex
                       << ", UseIndex=" << Read.UseIndex << ", RegisterID="
@@ -747,8 +757,9 @@ InstrBuilder::createInstruction(const MCInst &MCI,
   for (const WriteDescriptor &WD : D.Writes) {
     RegID = WD.isImplicitWrite() ? WD.RegisterID
                                  : MCI.getOperand(WD.OpIndex).getReg();
-    // Check if this is a optional definition that references NoReg.
-    if (WD.IsOptionalDef && !RegID) {
+    // Check if this is a optional definition that references NoReg or a write
+    // to a constant register.
+    if ((WD.IsOptionalDef && !RegID) || MRI.isConstant(RegID)) {
       ++WriteIndex;
       continue;
     }
diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp
index 6139d9996bda..e798bbdd16f1 100644
--- a/llvm/lib/Object/Archive.cpp
+++ b/llvm/lib/Object/Archive.cpp
@@ -269,11 +269,11 @@ Expected<StringRef> ArchiveMemberHeader::getName(uint64_t Size) const {
       return Name;
     // System libraries from the Windows SDK for Windows 11 contain this symbol.
     // It looks like a CFG guard: we just skip it for now.
-    if (Name.equals("/<XFGHASHMAP>/"))
+    if (Name == "/<XFGHASHMAP>/")
       return Name;
     // Some libraries (e.g., arm64rt.lib) from the Windows WDK
     // (version 10.0.22000.0) contain this undocumented special member.
-    if (Name.equals("/<ECSYMBOLS>/"))
+    if (Name == "/<ECSYMBOLS>/")
       return Name;
     // It's a long name.
     // Get the string table offset.
diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp
index 1cfd0a069463..ef390ceca218 100644
--- a/llvm/lib/Object/MachOObjectFile.cpp
+++ b/llvm/lib/Object/MachOObjectFile.cpp
@@ -399,7 +399,7 @@ static Error parseSegmentLoadCommand(
       return malformedError("load command " + Twine(LoadCommandIndex) +
                             " filesize field in " + CmdName +
                             " greater than vmsize field");
-    IsPageZeroSegment |= StringRef("__PAGEZERO").equals(S.segname);
+    IsPageZeroSegment |= StringRef("__PAGEZERO") == S.segname;
   } else
     return SegOrErr.takeError();
 
@@ -3515,7 +3515,7 @@ void MachORebaseEntry::moveNext() {
     uint8_t Byte = *Ptr++;
     uint8_t ImmValue = Byte & MachO::REBASE_IMMEDIATE_MASK;
     uint8_t Opcode = Byte & MachO::REBASE_OPCODE_MASK;
-    uint32_t Count, Skip;
+    uint64_t Count, Skip;
     const char *error = nullptr;
     switch (Opcode) {
     case MachO::REBASE_OPCODE_DONE:
@@ -3854,7 +3854,7 @@ void MachOBindEntry::moveNext() {
     uint8_t Opcode = Byte & MachO::BIND_OPCODE_MASK;
     int8_t SignExtended;
     const uint8_t *SymStart;
-    uint32_t Count, Skip;
+    uint64_t Count, Skip;
     const char *error = nullptr;
     switch (Opcode) {
     case MachO::BIND_OPCODE_DONE:
@@ -4364,7 +4364,7 @@ BindRebaseSegInfo::BindRebaseSegInfo(const object::MachOObjectFile *Obj) {
     Info.Size = Section.getSize();
     Info.SegmentName =
         Obj->getSectionFinalSegmentName(Section.getRawDataRefImpl());
-    if (!Info.SegmentName.equals(CurSegName)) {
+    if (Info.SegmentName != CurSegName) {
       ++CurSegIndex;
       CurSegName = Info.SegmentName;
       CurSegAddress = Info.Address;
@@ -4384,18 +4384,18 @@ BindRebaseSegInfo::BindRebaseSegInfo(const object::MachOObjectFile *Obj) {
 // that fully contains a pointer at that location. Multiple fixups in a bind
 // (such as with the BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB opcode) can
 // be tested via the Count and Skip parameters.
-const char * BindRebaseSegInfo::checkSegAndOffsets(int32_t SegIndex,
-                                                   uint64_t SegOffset,
-                                                   uint8_t PointerSize,
-                                                   uint32_t Count,
-                                                   uint32_t Skip) {
+const char *BindRebaseSegInfo::checkSegAndOffsets(int32_t SegIndex,
+                                                  uint64_t SegOffset,
+                                                  uint8_t PointerSize,
+                                                  uint64_t Count,
+                                                  uint64_t Skip) {
   if (SegIndex == -1)
     return "missing preceding *_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB";
   if (SegIndex >= MaxSegIndex)
     return "bad segIndex (too large)";
-  for (uint32_t i = 0; i < Count; ++i) {
-    uint32_t Start = SegOffset + i * (PointerSize + Skip);
-    uint32_t End = Start + PointerSize;
+  for (uint64_t i = 0; i < Count; ++i) {
+    uint64_t Start = SegOffset + i * (PointerSize + Skip);
+    uint64_t End = Start + PointerSize;
     bool Found = false;
     for (const SectionInfo &SI : Sections) {
       if (SI.SegmentIndex != SegIndex)
diff --git a/llvm/lib/Object/OffloadBinary.cpp b/llvm/lib/Object/OffloadBinary.cpp
index 6e9f8bed513c..89dc12551494 100644
--- a/llvm/lib/Object/OffloadBinary.cpp
+++ b/llvm/lib/Object/OffloadBinary.cpp
@@ -359,7 +359,7 @@ bool object::areTargetsCompatible(const OffloadFile::TargetID &LHS,
     return false;
 
   // If the architecture is "all" we assume it is always compatible.
-  if (LHS.second.equals("generic") || RHS.second.equals("generic"))
+  if (LHS.second == "generic" || RHS.second == "generic")
     return true;
 
   // Only The AMDGPU target requires additional checks.
diff --git a/llvm/lib/ObjectYAML/COFFEmitter.cpp b/llvm/lib/ObjectYAML/COFFEmitter.cpp
index 7088223b9b67..bb46de4c6f57 100644
--- a/llvm/lib/ObjectYAML/COFFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/COFFEmitter.cpp
@@ -359,9 +359,9 @@ static uint32_t initializeOptionalHeader(COFFParser &CP, uint16_t Magic,
       SizeOfInitializedData += S.Header.SizeOfRawData;
     if (S.Header.Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
       SizeOfUninitializedData += S.Header.SizeOfRawData;
-    if (S.Name.equals(".text"))
+    if (S.Name == ".text")
       Header->BaseOfCode = S.Header.VirtualAddress; // RVA
-    else if (S.Name.equals(".data"))
+    else if (S.Name == ".data")
       BaseOfData = S.Header.VirtualAddress; // RVA
     if (S.Header.VirtualAddress)
       SizeOfImage += alignTo(S.Header.VirtualSize, Header->SectionAlignment);
diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp
index f3a518df3175..175f1a12f931 100644
--- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp
+++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp
@@ -131,8 +131,8 @@ void DXContainerWriter::writeParts(raw_ostream &OS) {
       if (!P.Program)
         continue;
       dxbc::ProgramHeader Header;
-      Header.MajorVersion = P.Program->MajorVersion;
-      Header.MinorVersion = P.Program->MinorVersion;
+      Header.Version = dxbc::ProgramHeader::getVersion(P.Program->MajorVersion,
+                                                       P.Program->MinorVersion);
       Header.Unused = 0;
       Header.ShaderKind = P.Program->ShaderKind;
       memcpy(Header.Bitcode.Magic, "DXIL", 4);
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 30d3e7a1ec05..e4131706aba0 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -93,6 +93,8 @@
 #include "llvm/CodeGen/MIRPrinter.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/MachinePassManager.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
 #include "llvm/CodeGen/SafeStack.h"
 #include "llvm/CodeGen/SelectOptimize.h"
 #include "llvm/CodeGen/ShadowStackGCLowering.h"
@@ -175,6 +177,7 @@
 #include "llvm/Transforms/Instrumentation/LowerAllowCheckPass.h"
 #include "llvm/Transforms/Instrumentation/MemProfiler.h"
 #include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
+#include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h"
 #include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h"
 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
 #include "llvm/Transforms/Instrumentation/PoisonChecking.h"
@@ -362,6 +365,14 @@ public:
     return PreservedAnalyses::none();
   }
 
+  PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &) {
+    // Intentionally create a virtual register and set NoVRegs property.
+    auto &MRI = MF.getRegInfo();
+    MRI.createGenericVirtualRegister(LLT::scalar(8));
+    MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+    return PreservedAnalyses::all();
+  }
+
   static StringRef name() { return "TriggerVerifierErrorPass"; }
 };
 
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 100889c0845b..72f273972f2f 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -74,6 +74,7 @@
 #include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
 #include "llvm/Transforms/Instrumentation/MemProfiler.h"
+#include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h"
 #include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h"
 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
 #include "llvm/Transforms/Scalar/ADCE.h"
@@ -834,6 +835,10 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
         PTO.EagerlyInvalidateAnalyses));
   }
 
+  if (PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) {
+    MPM.addPass(PGOCtxProfLoweringPass());
+    return;
+  }
   // Add the profile lowering pass.
   InstrProfOptions Options;
   if (!ProfileFile.empty())
@@ -963,8 +968,7 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level,
   MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
       RequireAnalysisPass<ShouldNotRunFunctionPassesAnalysis, Function>()));
 
-  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink)
-    MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
+  MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
 
   // Make sure we don't affect potential future NoRerun CGSCC adaptors.
   MIWP.addLateModulePass(createModuleToFunctionPassAdaptor(
@@ -1006,9 +1010,8 @@ PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level,
       buildFunctionSimplificationPipeline(Level, Phase),
       PTO.EagerlyInvalidateAnalyses));
 
-  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink)
-    MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
-        CoroSplitPass(Level != OptimizationLevel::O0)));
+  MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+      CoroSplitPass(Level != OptimizationLevel::O0)));
 
   return MPM;
 }
@@ -1185,8 +1188,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   // and argument promotion.
   MPM.addPass(DeadArgumentEliminationPass());
 
-  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink)
-    MPM.addPass(CoroCleanupPass());
+  MPM.addPass(CoroCleanupPass());
 
   // Optimize globals now that functions are fully simplified.
   MPM.addPass(GlobalOptPass());
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 9b670e4e3a44..e5ce6cb7da64 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -77,6 +77,7 @@ MODULE_PASS("inliner-wrapper-no-mandatory-first",
 MODULE_PASS("insert-gcov-profiling", GCOVProfilerPass())
 MODULE_PASS("instrorderfile", InstrOrderFilePass())
 MODULE_PASS("instrprof", InstrProfilingLoweringPass())
+MODULE_PASS("ctx-instr-lower", PGOCtxProfLoweringPass())
 MODULE_PASS("internalize", InternalizePass())
 MODULE_PASS("invalidate<all>", InvalidateAllAnalysesPass())
 MODULE_PASS("iroutliner", IROutlinerPass())
@@ -102,6 +103,7 @@ MODULE_PASS("pgo-icall-prom", PGOIndirectCallPromotion())
 MODULE_PASS("pgo-instr-gen", PGOInstrumentationGen())
 MODULE_PASS("pgo-instr-use", PGOInstrumentationUse())
 MODULE_PASS("poison-checking", PoisonCheckingPass())
+MODULE_PASS("pre-isel-intrinsic-lowering", PreISelIntrinsicLoweringPass(*TM))
 MODULE_PASS("print", PrintModulePass(dbgs()))
 MODULE_PASS("print-callgraph", CallGraphPrinterPass(dbgs()))
 MODULE_PASS("print-callgraph-sccs", CallGraphSCCsPrinterPass(dbgs()))
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index 63490c83e85f..c7adc7668b9a 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -245,7 +245,8 @@ std::string getIRName(Any IR) {
     return C->getName();
 
   if (const auto *L = unwrapIR<Loop>(IR))
-    return L->getName().str();
+    return "loop %" + L->getName().str() + " in function " +
+           L->getHeader()->getParent()->getName().str();
 
   if (const auto *MF = unwrapIR<MachineFunction>(IR))
     return MF->getName().str();
@@ -821,8 +822,7 @@ PrintIRInstrumentation::PassRunDescriptor
 PrintIRInstrumentation::popPassRunDescriptor(StringRef PassID) {
   assert(!PassRunDescriptorStack.empty() && "empty PassRunDescriptorStack");
   PassRunDescriptor Descriptor = PassRunDescriptorStack.pop_back_val();
-  assert(Descriptor.PassID.equals(PassID) &&
-         "malformed PassRunDescriptorStack");
+  assert(Descriptor.PassID == PassID && "malformed PassRunDescriptorStack");
   return Descriptor;
 }
 
@@ -1486,6 +1486,17 @@ void VerifyInstrumentation::registerCallbacks(
                                          "\"{0}\", compilation aborted!",
                                          P));
           }
+
+          // TODO: Use complete MachineVerifierPass.
+          if (auto *MF = unwrapIR<MachineFunction>(IR)) {
+            if (DebugLogging)
+              dbgs() << "Verifying machine function " << MF->getName() << '\n';
+            verifyMachineFunction(
+                formatv("Broken machine function found after pass "
+                        "\"{0}\", compilation aborted!",
+                        P),
+                *MF);
+          }
         }
       });
 }
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index 6c77ce017c03..8c81bbe8e9c4 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -338,7 +338,6 @@ public:
 #endif
     for (const auto *Branch : Branches) {
       const auto &BranchParams = Branch->getBranchParams();
-      assert(BranchParams.ID >= 0 && "CondID isn't set");
       assert(SeenIDs.insert(BranchParams.ID).second && "Duplicate CondID");
       NextIDs[BranchParams.ID] = BranchParams.Conds;
     }
@@ -694,7 +693,6 @@ private:
       assert(Branch.Kind == CounterMappingRegion::MCDCBranchRegion);
 
       auto ConditionID = Branch.getBranchParams().ID;
-      assert(ConditionID >= 0 && "ConditionID should be positive");
 
       if (ConditionIDs.contains(ConditionID) ||
           ConditionID >= DecisionParams.NumConditions)
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
index 5036bde5aca7..adfd22804356 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
@@ -256,7 +256,6 @@ void CoverageMappingWriter::write(raw_ostream &OS) {
         // They are written as internal values plus 1.
         const auto &BranchParams = I->getBranchParams();
         ParamsShouldBeNull = false;
-        assert(BranchParams.ID >= 0);
         unsigned ID1 = BranchParams.ID + 1;
         unsigned TID1 = BranchParams.Conds[true] + 1;
         unsigned FID1 = BranchParams.Conds[false] + 1;
diff --git a/llvm/lib/ProfileData/GCOV.cpp b/llvm/lib/ProfileData/GCOV.cpp
index ee61784abade..ecb12c045b5b 100644
--- a/llvm/lib/ProfileData/GCOV.cpp
+++ b/llvm/lib/ProfileData/GCOV.cpp
@@ -678,7 +678,7 @@ std::string Context::getCoveragePath(StringRef filename,
     return std::string(filename);
 
   std::string CoveragePath;
-  if (options.LongFileNames && !filename.equals(mainFilename))
+  if (options.LongFileNames && filename != mainFilename)
     CoveragePath =
         mangleCoveragePath(mainFilename, options.PreservePaths) + "##";
   CoveragePath += mangleCoveragePath(filename, options.PreservePaths);
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index f9ba80bd99c8..806d01de1ada 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -476,11 +476,43 @@ Error InstrProfSymtab::create(Module &M, bool InLTO) {
       return E;
   }
 
+  SmallVector<MDNode *, 2> Types;
+  for (GlobalVariable &G : M.globals()) {
+    if (!G.hasName() || !G.hasMetadata(LLVMContext::MD_type))
+      continue;
+    if (Error E = addVTableWithName(
+            G, getIRPGOObjectName(G, InLTO, /* PGONameMetadata */ nullptr)))
+      return E;
+  }
+
   Sorted = false;
   finalizeSymtab();
   return Error::success();
 }
 
+Error InstrProfSymtab::addVTableWithName(GlobalVariable &VTable,
+                                         StringRef VTablePGOName) {
+  auto mapName = [&](StringRef Name) -> Error {
+    if (Error E = addSymbolName(Name))
+      return E;
+
+    bool Inserted = true;
+    std::tie(std::ignore, Inserted) =
+        MD5VTableMap.try_emplace(GlobalValue::getGUID(Name), &VTable);
+    if (!Inserted)
+      LLVM_DEBUG(dbgs() << "GUID conflict within one module");
+    return Error::success();
+  };
+  if (Error E = mapName(VTablePGOName))
+    return E;
+
+  StringRef CanonicalName = getCanonicalName(VTablePGOName);
+  if (CanonicalName != VTablePGOName)
+    return mapName(CanonicalName);
+
+  return Error::success();
+}
+
 /// \c NameStrings is a string composed of one of more possibly encoded
 /// sub-strings. The substrings are separated by 0 or more zero bytes. This
 /// method decodes the string and calls `NameCallback` for each substring.
@@ -1283,7 +1315,7 @@ MDNode *mayHaveValueProfileOfKind(const Instruction &Inst,
     return nullptr;
 
   MDString *Tag = cast<MDString>(MD->getOperand(0));
-  if (!Tag || !Tag->getString().equals("VP"))
+  if (!Tag || Tag->getString() != "VP")
     return nullptr;
 
   // Now check kind:
diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp
index b4d2c6f043f6..c25babac844a 100644
--- a/llvm/lib/ProfileData/MemProfReader.cpp
+++ b/llvm/lib/ProfileData/MemProfReader.cpp
@@ -164,9 +164,9 @@ bool isRuntimePath(const StringRef Path) {
   const StringRef Filename = llvm::sys::path::filename(Path);
   // This list should be updated in case new files with additional interceptors
   // are added to the memprof runtime.
-  return Filename.equals("memprof_malloc_linux.cpp") ||
-         Filename.equals("memprof_interceptors.cpp") ||
-         Filename.equals("memprof_new_delete.cpp");
+  return Filename == "memprof_malloc_linux.cpp" ||
+         Filename == "memprof_interceptors.cpp" ||
+         Filename == "memprof_new_delete.cpp";
 }
 
 std::string getBuildIdString(const SegmentEntry &Entry) {
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 0a4f5ac01553..64a7e0c7223f 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -3123,7 +3123,7 @@ bool IEEEFloat::convertFromStringSpecials(StringRef str) {
   if (str.size() < MIN_NAME_SIZE)
     return false;
 
-  if (str.equals("inf") || str.equals("INFINITY") || str.equals("+Inf")) {
+  if (str == "inf" || str == "INFINITY" || str == "+Inf") {
     makeInf(false);
     return true;
   }
@@ -3134,7 +3134,7 @@ bool IEEEFloat::convertFromStringSpecials(StringRef str) {
     if (str.size() < MIN_NAME_SIZE)
       return false;
 
-    if (str.equals("inf") || str.equals("INFINITY") || str.equals("Inf")) {
+    if (str == "inf" || str == "INFINITY" || str == "Inf") {
       makeInf(true);
       return true;
     }
diff --git a/llvm/lib/Support/CodeGenCoverage.cpp b/llvm/lib/Support/CodeGenCoverage.cpp
index 0df45b4ff2ba..4d41c42e527e 100644
--- a/llvm/lib/Support/CodeGenCoverage.cpp
+++ b/llvm/lib/Support/CodeGenCoverage.cpp
@@ -53,7 +53,7 @@ bool CodeGenCoverage::parse(MemoryBuffer &Buffer, StringRef BackendName) {
     if (CurPtr == Buffer.getBufferEnd())
       return false; // Data is invalid, expected rule id's to follow.
 
-    bool IsForThisBackend = BackendName.equals(LexedBackendName);
+    bool IsForThisBackend = BackendName == LexedBackendName;
     while (CurPtr != Buffer.getBufferEnd()) {
       if (std::distance(CurPtr, Buffer.getBufferEnd()) < 8)
         return false; // Data is invalid. Not enough bytes for another rule id.
diff --git a/llvm/lib/Support/FileCollector.cpp b/llvm/lib/Support/FileCollector.cpp
index be0b06b0aff8..29436f85c2f2 100644
--- a/llvm/lib/Support/FileCollector.cpp
+++ b/llvm/lib/Support/FileCollector.cpp
@@ -44,7 +44,7 @@ static bool isCaseSensitivePath(StringRef Path) {
   // sensitive in the absence of real_path, since this is the YAMLVFSWriter
   // default.
   UpperDest = Path.upper();
-  if (!sys::fs::real_path(UpperDest, RealDest) && Path.equals(RealDest))
+  if (!sys::fs::real_path(UpperDest, RealDest) && Path == RealDest)
     return false;
   return true;
 }
diff --git a/llvm/lib/Support/GraphWriter.cpp b/llvm/lib/Support/GraphWriter.cpp
index 0c7aacb2fe21..5583ca18ab20 100644
--- a/llvm/lib/Support/GraphWriter.cpp
+++ b/llvm/lib/Support/GraphWriter.cpp
@@ -115,7 +115,8 @@ std::string llvm::createGraphFilename(const Twine &Name, int &FD) {
 
   // Windows can't always handle long paths, so limit the length of the name.
   std::string N = Name.str();
-  N = N.substr(0, std::min<std::size_t>(N.size(), 140));
+  if (N.size() > 140)
+    N.resize(140);
 
   // Replace illegal characters in graph Filename with '_' if needed
   std::string CleansedName = replaceIllegalFilenameChars(N, '_');
diff --git a/llvm/lib/Support/JSON.cpp b/llvm/lib/Support/JSON.cpp
index c672a43b033e..2f9c1df3d20b 100644
--- a/llvm/lib/Support/JSON.cpp
+++ b/llvm/lib/Support/JSON.cpp
@@ -336,7 +336,7 @@ void Path::Root::printErrorContext(const Value &R, raw_ostream &OS) const {
       JOS.object([&] {
         for (const auto *KV : sortedElements(*O)) {
           JOS.attributeBegin(KV->first);
-          if (FieldName.equals(KV->first))
+          if (FieldName == StringRef(KV->first))
             Recurse(KV->second, Path.drop_back(), Recurse);
           else
             abbreviate(KV->second, JOS);
diff --git a/llvm/lib/Support/RISCVISAUtils.cpp b/llvm/lib/Support/RISCVISAUtils.cpp
index ca7518f71907..d6b002e66e7a 100644
--- a/llvm/lib/Support/RISCVISAUtils.cpp
+++ b/llvm/lib/Support/RISCVISAUtils.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/RISCVISAUtils.h"
+#include "llvm/ADT/StringExtras.h"
 #include <cassert>
 
 using namespace llvm;
@@ -23,19 +24,21 @@ using namespace llvm;
 // -Multi-letter extensions starting with 's' in alphabetical order.
 // -(TODO) Multi-letter extensions starting with 'zxm' in alphabetical order.
 // -X extensions in alphabetical order.
+// -Unknown multi-letter extensions in alphabetical order.
 // These flags are used to indicate the category. The first 6 bits store the
 // single letter extension rank for single letter and multi-letter extensions
 // starting with 'z'.
 enum RankFlags {
   RF_Z_EXTENSION = 1 << 6,
-  RF_S_EXTENSION = 1 << 7,
-  RF_X_EXTENSION = 1 << 8,
+  RF_S_EXTENSION = 2 << 6,
+  RF_X_EXTENSION = 3 << 6,
+  RF_UNKNOWN_MULTILETTER_EXTENSION = 4 << 6,
 };
 
 // Get the rank for single-letter extension, lower value meaning higher
 // priority.
 static unsigned singleLetterExtensionRank(char Ext) {
-  assert(Ext >= 'a' && Ext <= 'z');
+  assert(isLower(Ext));
   switch (Ext) {
   case 'i':
     return 0;
@@ -67,8 +70,9 @@ static unsigned getExtensionRank(const std::string &ExtName) {
   case 'x':
     return RF_X_EXTENSION;
   default:
-    assert(ExtName.size() == 1);
-    return singleLetterExtensionRank(ExtName[0]);
+    if (ExtName.size() == 1)
+      return singleLetterExtensionRank(ExtName[0]);
+    return RF_UNKNOWN_MULTILETTER_EXTENSION;
   }
 }
 
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index 921af30bfcde..fcefdef992be 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -151,13 +151,23 @@ bool FileSystem::exists(const Twine &Path) {
   return Status && Status->exists();
 }
 
+llvm::ErrorOr<bool> FileSystem::equivalent(const Twine &A, const Twine &B) {
+  auto StatusA = status(A);
+  if (!StatusA)
+    return StatusA.getError();
+  auto StatusB = status(B);
+  if (!StatusB)
+    return StatusB.getError();
+  return StatusA->equivalent(*StatusB);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void FileSystem::dump() const { print(dbgs(), PrintType::RecursiveContents); }
 #endif
 
 #ifndef NDEBUG
 static bool isTraversalComponent(StringRef Component) {
-  return Component.equals("..") || Component.equals(".");
+  return Component == ".." || Component == ".";
 }
 
 static bool pathHasTraversal(StringRef Path) {
@@ -1715,7 +1725,7 @@ public:
                       RedirectingFileSystem::Entry *ParentEntry = nullptr) {
     if (!ParentEntry) { // Look for a existent root
       for (const auto &Root : FS->Roots) {
-        if (Name.equals(Root->getName())) {
+        if (Name == Root->getName()) {
           ParentEntry = Root.get();
           return ParentEntry;
         }
@@ -1726,7 +1736,7 @@ public:
            llvm::make_range(DE->contents_begin(), DE->contents_end())) {
         auto *DirContent =
             dyn_cast<RedirectingFileSystem::DirectoryEntry>(Content.get());
-        if (DirContent && Name.equals(Content->getName()))
+        if (DirContent && Name == Content->getName())
           return DirContent;
       }
     }
diff --git a/llvm/lib/Support/YAMLTraits.cpp b/llvm/lib/Support/YAMLTraits.cpp
index 7bb60894b335..56b557646100 100644
--- a/llvm/lib/Support/YAMLTraits.cpp
+++ b/llvm/lib/Support/YAMLTraits.cpp
@@ -120,7 +120,7 @@ bool Input::mapTag(StringRef Tag, bool Default) {
     return Default;
   }
   // Return true iff found tag matches supplied tag.
-  return Tag.equals(foundTag);
+  return Tag == foundTag;
 }
 
 void Input::beginMapping() {
@@ -271,7 +271,7 @@ bool Input::matchEnumScalar(const char *Str, bool) {
   if (ScalarMatchFound)
     return false;
   if (ScalarHNode *SN = dyn_cast<ScalarHNode>(CurrentNode)) {
-    if (SN->value().equals(Str)) {
+    if (SN->value() == Str) {
       ScalarMatchFound = true;
       return true;
     }
@@ -310,7 +310,7 @@ bool Input::bitSetMatch(const char *Str, bool) {
     unsigned Index = 0;
     for (auto &N : SQ->Entries) {
       if (ScalarHNode *SN = dyn_cast<ScalarHNode>(N)) {
-        if (SN->value().equals(Str)) {
+        if (SN->value() == Str) {
           BitValuesUsed[Index] = true;
           return true;
         }
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index b6c8e5f16089..920ca7f4fbfc 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -11,42 +11,121 @@
 
 // A SubtargetFeature that can be toggled from the command line, and therefore
 // has an AEK_* entry in ArmExtKind.
+//
+// If Function MultiVersioning (FMV) properties are left at their defaults
+// (FEAT_INIT, no dependencies, priority 0) it indiates that this extension is
+// not an FMV feature, but can be enabled via the command line (-march, -mcpu,
+// etc).
+//
+// Conversely if the ArchExtKindSpelling is set to AEK_NONE, this indicates
+// that a feature is FMV-only, and can not be selected on the command line.
+// Such extensions should be added via FMVOnlyExtension.
 class Extension<
-  string TargetFeatureName,            // String used for -target-feature.
+  string TargetFeatureName,            // String used for -target-feature and -march, unless overridden.
   string Spelling,                     // The XYZ in HasXYZ and AEK_XYZ.
   string Desc,                         // Description.
-  list<SubtargetFeature> Implies = []  // List of dependent features.
+  list<SubtargetFeature> Implies = [], // List of dependent features.
+  // FMV properties
+  string _FMVBit = "FEAT_INIT",        // FEAT_INIT is repurposed to indicate "not an FMV feature"
+  string _FMVDependencies = "",
+  int _FMVPriority = 0
 > : SubtargetFeature<TargetFeatureName, "Has" # Spelling, "true", Desc, Implies>
 {
     string ArchExtKindSpelling = "AEK_" # Spelling; // ArchExtKind enum name.
+
+    // In general, the name written on the command line should match the name
+    // used for -target-feature. However, there are exceptions. Therefore we
+    // add a separate field for this, to allow overriding it. Strongly prefer
+    // not doing so.
+    string MArchName = TargetFeatureName;
+
+    // Function MultiVersioning (FMV) properties
+
+    // A C++ expression giving the number of the bit in the FMV ABI.
+    // Currently this is given as a value from the enum "CPUFeatures".
+    // If this is not set, it indicates that this is not an FMV extension.
+    string FMVBit = _FMVBit;
+
+    // List of features that this feature depends on.
+    // FIXME generate this from Implies.
+    string FMVDependencies = _FMVDependencies;
+
+    // The FMV priority
+    int FMVPriority = _FMVPriority;
+}
+
+// Some extensions are available for FMV but can not be controlled via the
+// command line. These entries:
+//  - are SubtargetFeatures, so they have (unused) FieldNames on the subtarget
+//    e.g. HasFMVOnlyFEAT_XYZ
+//  - have incorrect (empty) Implies fields, because the code that handles FMV
+//    ignores these dependencies and looks only at FMVDependencies.
+//  - have no description.
+// 
+// In the generated data structures for extensions (ExtensionInfo), AEK_NONE is
+// used to indicate that a feature is FMV only. Therefore ArchExtKindSpelling is
+// manually overridden here.
+class FMVOnlyExtension<string FMVBit, string Name, string Deps, int Priority>
+  : Extension<Name, "FMVOnly"#FMVBit, "", [], FMVBit, Deps, Priority> {
+    let ArchExtKindSpelling = "AEK_NONE"; // AEK_NONE indicates FMV-only feature
 }
 
+def : FMVOnlyExtension<"FEAT_DGH", "dgh", "", 260>;
+def : FMVOnlyExtension<"FEAT_DPB", "dpb", "+ccpp", 190>;
+def : FMVOnlyExtension<"FEAT_DPB2", "dpb2", "+ccpp,+ccdp", 200>;
+def : FMVOnlyExtension<"FEAT_EBF16", "ebf16", "+bf16", 290>;
+def : FMVOnlyExtension<"FEAT_FLAGM2", "flagm2", "+flagm,+altnzcv", 30>;
+def : FMVOnlyExtension<"FEAT_FRINTTS", "frintts", "+fptoint", 250>;
+def : FMVOnlyExtension<"FEAT_LS64_ACCDATA", "ls64_accdata", "+ls64", 540>;
+def : FMVOnlyExtension<"FEAT_LS64_V", "ls64_v", "", 530>;
+def : FMVOnlyExtension<"FEAT_MEMTAG2", "memtag2", "+mte", 450>;
+def : FMVOnlyExtension<"FEAT_MEMTAG3", "memtag3", "+mte", 460>;
+def : FMVOnlyExtension<"FEAT_PMULL", "pmull", "+aes,+fp-armv8,+neon", 160>;
+def : FMVOnlyExtension<"FEAT_RCPC2", "rcpc2", "+rcpc", 240>;
+def : FMVOnlyExtension<"FEAT_RPRES", "rpres", "", 300>;
+def : FMVOnlyExtension<"FEAT_SHA1", "sha1", "+fp-armv8,+neon", 120>;
+def : FMVOnlyExtension<"FEAT_SSBS2", "ssbs2", "+ssbs", 500>;
+def : FMVOnlyExtension<"FEAT_SVE_BF16", "sve-bf16", "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 320>;
+def : FMVOnlyExtension<"FEAT_SVE_EBF16", "sve-ebf16", "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 330>;
+def : FMVOnlyExtension<"FEAT_SVE_I8MM", "sve-i8mm", "+sve,+i8mm,+fullfp16,+fp-armv8,+neon", 340>;
+def : FMVOnlyExtension<"FEAT_SVE_PMULL128", "sve2-pmull128", "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 390>;
+
+
 // Each SubtargetFeature which corresponds to an Arm Architecture feature should
 // be annotated with the respective FEAT_ feature name from the Architecture
 // Reference Manual. If a SubtargetFeature enables instructions from multiple
 // Arm Architecture Features, it should list all the relevant features. Not all
 // FEAT_ features have a corresponding SubtargetFeature.
 
-def FeatureFPARMv8 : Extension<"fp-armv8", "FPARMv8", "Enable ARMv8 (FEAT_FP)">;
+let ArchExtKindSpelling = "AEK_FP", MArchName = "fp" in
+def FeatureFPARMv8 : Extension<"fp-armv8", "FPARMv8",
+  "Enable ARMv8 (FEAT_FP)", [],
+  "FEAT_FP", "+fp-armv8,+neon", 90>;
 
+let ArchExtKindSpelling = "AEK_SIMD", MArchName = "simd" in
 def FeatureNEON : Extension<"neon", "NEON",
-  "Enable Advanced SIMD instructions (FEAT_AdvSIMD)", [FeatureFPARMv8]>;
+  "Enable Advanced SIMD instructions (FEAT_AdvSIMD)", [FeatureFPARMv8],
+  "FEAT_SIMD", "+fp-armv8,+neon", 100>;
 
 def FeatureSM4 : Extension<
     "sm4", "SM4",
-    "Enable SM3 and SM4 support (FEAT_SM4, FEAT_SM3)", [FeatureNEON]>;
+    "Enable SM3 and SM4 support (FEAT_SM4, FEAT_SM3)", [FeatureNEON],
+    "FEAT_SM4", "+sm4,+fp-armv8,+neon", 106>;
 
 def FeatureSHA2 : Extension<
     "sha2", "SHA2",
-    "Enable SHA1 and SHA256 support (FEAT_SHA1, FEAT_SHA256)", [FeatureNEON]>;
+    "Enable SHA1 and SHA256 support (FEAT_SHA1, FEAT_SHA256)", [FeatureNEON],
+    "FEAT_SHA2", "+sha2,+fp-armv8,+neon", 130>;
 
 def FeatureSHA3 : Extension<
     "sha3", "SHA3",
-    "Enable SHA512 and SHA3 support (FEAT_SHA3, FEAT_SHA512)", [FeatureNEON, FeatureSHA2]>;
+    "Enable SHA512 and SHA3 support (FEAT_SHA3, FEAT_SHA512)", [FeatureNEON, FeatureSHA2],
+    "FEAT_SHA3", "+sha3,+sha2,+fp-armv8,+neon", 140>;
 
 def FeatureAES : Extension<
     "aes", "AES",
-    "Enable AES support (FEAT_AES, FEAT_PMULL)", [FeatureNEON]>;
+    "Enable AES support (FEAT_AES, FEAT_PMULL)", [FeatureNEON],
+    "FEAT_AES", "+fp-armv8,+neon", 150>;
 
 // Crypto has been split up and any combination is now valid (see the
 // crypto definitions above). Also, crypto is now context sensitive:
@@ -56,11 +135,13 @@ def FeatureAES : Extension<
 // meaning anymore. We kept the Crypto definition here for backward
 // compatibility, and now imply features SHA2 and AES, which was the
 // "traditional" meaning of Crypto.
+let FMVDependencies = "+aes,+sha2" in
 def FeatureCrypto : Extension<"crypto", "Crypto",
   "Enable cryptographic instructions", [FeatureNEON, FeatureSHA2, FeatureAES]>;
 
 def FeatureCRC : Extension<"crc", "CRC",
-  "Enable ARMv8 CRC-32 checksum instructions (FEAT_CRC32)">;
+  "Enable ARMv8 CRC-32 checksum instructions (FEAT_CRC32)", [],
+  "FEAT_CRC", "+crc", 110>;
 
 def FeatureRAS : Extension<"ras", "RAS",
   "Enable ARMv8 Reliability, Availability and Serviceability Extensions (FEAT_RAS, FEAT_RASv1p1)">;
@@ -70,7 +151,8 @@ def FeatureRASv2 : Extension<"rasv2", "RASv2",
   [FeatureRAS]>;
 
 def FeatureLSE : Extension<"lse", "LSE",
-  "Enable ARMv8.1 Large System Extension (LSE) atomic instructions (FEAT_LSE)">;
+  "Enable ARMv8.1 Large System Extension (LSE) atomic instructions (FEAT_LSE)", [],
+  "FEAT_LSE", "+lse", 80>;
 
 def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true",
   "Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules (FEAT_LSE2)">;
@@ -83,7 +165,8 @@ def FeatureFMV : SubtargetFeature<"fmv", "HasFMV", "true",
 
 def FeatureRDM : Extension<"rdm", "RDM",
   "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions (FEAT_RDM)",
-  [FeatureNEON]>;
+  [FeatureNEON],
+  "FEAT_RDM", "+rdm,+fp-armv8,+neon", 108>;
 
 def FeaturePAN : SubtargetFeature<
     "pan", "HasPAN", "true",
@@ -102,15 +185,20 @@ def FeatureVH : SubtargetFeature<"vh", "HasVH", "true",
 // This SubtargetFeature is special. It controls only whether codegen will turn
 // `llvm.readcyclecounter()` into an access to a PMUv3 System Register. The
 // `FEAT_PMUv3*` system registers are always available for assembly/disassembly.
+let MArchName = "pmuv3" in
 def FeaturePerfMon : Extension<"perfmon", "PerfMon",
   "Enable Code Generation for ARMv8 PMUv3 Performance Monitors extension (FEAT_PMUv3)">;
 
+let ArchExtKindSpelling = "AEK_FP16", MArchName = "fp16" in
 def FeatureFullFP16 : Extension<"fullfp16", "FullFP16",
-  "Full FP16 (FEAT_FP16)", [FeatureFPARMv8]>;
+  "Full FP16 (FEAT_FP16)", [FeatureFPARMv8],
+  "FEAT_FP16", "+fullfp16,+fp-armv8,+neon", 170>;
 
 def FeatureFP16FML : Extension<"fp16fml", "FP16FML",
-  "Enable FP16 FML instructions (FEAT_FHM)", [FeatureFullFP16]>;
+  "Enable FP16 FML instructions (FEAT_FHM)", [FeatureFullFP16],
+  "FEAT_FP16FML", "+fp16fml,+fullfp16,+fp-armv8,+neon", 175>;
 
+let ArchExtKindSpelling = "AEK_PROFILE", MArchName = "profile" in
 def FeatureSPE : Extension<"spe", "SPE",
   "Enable Statistical Profiling extension (FEAT_SPE)">;
 
@@ -127,11 +215,13 @@ def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP",
     "true", "Enable v8.2 data Cache Clean to Point of Persistence (FEAT_DPB)" >;
 
 def FeatureSVE : Extension<"sve", "SVE",
-  "Enable Scalable Vector Extension (SVE) instructions (FEAT_SVE)", [FeatureFullFP16]>;
+  "Enable Scalable Vector Extension (SVE) instructions (FEAT_SVE)", [FeatureFullFP16],
+  "FEAT_SVE", "+sve,+fullfp16,+fp-armv8,+neon", 310>;
 
 def FeatureFPMR : Extension<"fpmr", "FPMR",
   "Enable FPMR Register (FEAT_FPMR)">;
 
+let FMVDependencies = "+fpmr" in
 def FeatureFP8 : Extension<"fp8", "FP8",
   "Enable FP8 instructions (FEAT_FP8)">;
 
@@ -157,28 +247,35 @@ def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl",
   "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">;
 
 def FeatureBF16 : Extension<"bf16", "BF16",
-    "Enable BFloat16 Extension (FEAT_BF16)" >;
+    "Enable BFloat16 Extension (FEAT_BF16)", [],
+    "FEAT_BF16", "+bf16", 280>;
 
 def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r",
   "NoSVEFPLD1R", "true", "Avoid using LD1RX instructions for FP">;
 
 def FeatureSVE2 : Extension<"sve2", "SVE2",
   "Enable Scalable Vector Extension 2 (SVE2) instructions (FEAT_SVE2)",
-  [FeatureSVE, FeatureUseScalarIncVL]>;
+  [FeatureSVE, FeatureUseScalarIncVL],
+  "FEAT_SVE2", "+sve2,+sve,+fullfp16,+fp-armv8,+neon", 370>;
 
 def FeatureSVE2AES : Extension<"sve2-aes", "SVE2AES",
   "Enable AES SVE2 instructions (FEAT_SVE_AES, FEAT_SVE_PMULL128)",
-  [FeatureSVE2, FeatureAES]>;
+  [FeatureSVE2, FeatureAES],
+  "FEAT_SVE_AES", "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 380>;
 
 def FeatureSVE2SM4 : Extension<"sve2-sm4", "SVE2SM4",
-  "Enable SM4 SVE2 instructions (FEAT_SVE_SM4)", [FeatureSVE2, FeatureSM4]>;
+  "Enable SM4 SVE2 instructions (FEAT_SVE_SM4)", [FeatureSVE2, FeatureSM4],
+  "FEAT_SVE_SM4", "+sve2,+sve,+sve2-sm4,+fullfp16,+fp-armv8,+neon", 420>;
 
 def FeatureSVE2SHA3 : Extension<"sve2-sha3", "SVE2SHA3",
-  "Enable SHA3 SVE2 instructions (FEAT_SVE_SHA3)", [FeatureSVE2, FeatureSHA3]>;
+  "Enable SHA3 SVE2 instructions (FEAT_SVE_SHA3)", [FeatureSVE2, FeatureSHA3],
+  "FEAT_SVE_SHA3", "+sve2,+sve,+sve2-sha3,+fullfp16,+fp-armv8,+neon", 410>;
 
 def FeatureSVE2BitPerm : Extension<"sve2-bitperm", "SVE2BitPerm",
-  "Enable bit permutation SVE2 instructions (FEAT_SVE_BitPerm)", [FeatureSVE2]>;
+  "Enable bit permutation SVE2 instructions (FEAT_SVE_BitPerm)", [FeatureSVE2],
+  "FEAT_SVE_BITPERM", "+sve2,+sve,+sve2-bitperm,+fullfp16,+fp-armv8,+neon", 400>;
 
+let FMVDependencies = "+sve2p1,+sve2,+sve,+fullfp16,+fp-armv8,+neon" in
 def FeatureSVE2p1: Extension<"sve2p1", "SVE2p1",
   "Enable Scalable Vector Extension 2.1 instructions", [FeatureSVE2]>;
 
@@ -315,7 +412,8 @@ def FeatureForce32BitJumpTables
                       "Force jump table entries to be 32-bits wide except at MinSize">;
 
 def FeatureRCPC : Extension<"rcpc", "RCPC",
-    "Enable support for RCPC extension (FEAT_LRCPC)">;
+    "Enable support for RCPC extension (FEAT_LRCPC)", [],
+    "FEAT_RCPC", "+rcpc", 230>;
 
 def FeatureUseRSqrt : SubtargetFeature<
     "use-reciprocal-square-root", "UseRSqrt", "true",
@@ -323,25 +421,30 @@ def FeatureUseRSqrt : SubtargetFeature<
 
 def FeatureDotProd : Extension<
     "dotprod", "DotProd",
-    "Enable dot product support (FEAT_DotProd)", [FeatureNEON]>;
+    "Enable dot product support (FEAT_DotProd)", [FeatureNEON],
+    "FEAT_DOTPROD", "+dotprod,+fp-armv8,+neon", 104>;
 
 def FeaturePAuth : Extension<
     "pauth", "PAuth",
     "Enable v8.3-A Pointer Authentication extension (FEAT_PAuth)">;
 
+let ArchExtKindSpelling = "AEK_JSCVT", MArchName = "jscvt" in
 def FeatureJS : Extension<
     "jsconv", "JS",
     "Enable v8.3-A JavaScript FP conversion instructions (FEAT_JSCVT)",
-    [FeatureFPARMv8]>;
+    [FeatureFPARMv8],
+    "FEAT_JSCVT", "+fp-armv8,+neon,+jsconv", 210>;
 
 def FeatureCCIDX : SubtargetFeature<
     "ccidx", "HasCCIDX", "true",
     "Enable v8.3-A Extend of the CCSIDR number of sets (FEAT_CCIDX)">;
 
+let ArchExtKindSpelling = "AEK_FCMA", MArchName = "fcma" in
 def FeatureComplxNum : Extension<
     "complxnum", "ComplxNum",
     "Enable v8.3-A Floating-point complex number support (FEAT_FCMA)",
-    [FeatureNEON]>;
+    [FeatureNEON],
+    "FEAT_FCMA", "+fp-armv8,+neon,+complxnum", 220>;
 
 def FeatureNV : SubtargetFeature<
     "nv", "HasNV", "true",
@@ -351,9 +454,10 @@ def FeatureMPAM : SubtargetFeature<
     "mpam", "HasMPAM", "true",
     "Enable v8.4-A Memory system Partitioning and Monitoring extension (FEAT_MPAM)">;
 
-def FeatureDIT : SubtargetFeature<
-    "dit", "HasDIT", "true",
-    "Enable v8.4-A Data Independent Timing instructions (FEAT_DIT)">;
+def FeatureDIT : Extension<
+    "dit", "DIT",
+    "Enable v8.4-A Data Independent Timing instructions (FEAT_DIT)", [],
+    "FEAT_DIT", "+dit", 180>;
 
 def FeatureTRACEV8_4 : SubtargetFeature<
     "tracev8.4", "HasTRACEV8_4", "true",
@@ -378,7 +482,8 @@ def FeatureTLB_RMI : SubtargetFeature<
 
 def FeatureFlagM : Extension<
     "flagm", "FlagM",
-    "Enable v8.4-A Flag Manipulation Instructions (FEAT_FlagM)">;
+    "Enable v8.4-A Flag Manipulation Instructions (FEAT_FlagM)", [],
+    "FEAT_FLAGM", "+flagm", 20>;
 
 // 8.4 RCPC enchancements: LDAPR & STLR instructions with Immediate Offset
 def FeatureRCPC_IMMO : SubtargetFeature<"rcpc-immo", "HasRCPC_IMMO", "true",
@@ -426,30 +531,41 @@ def FeatureSpecRestrict : SubtargetFeature<"specrestrict", "HasSpecRestrict",
   "true", "Enable architectural speculation restriction (FEAT_CSV2_2)">;
 
 def FeatureSB : Extension<"sb", "SB",
-  "Enable v8.5 Speculation Barrier (FEAT_SB)" >;
+  "Enable v8.5 Speculation Barrier (FEAT_SB)", [],
+  "FEAT_SB", "+sb", 470>;
 
 def FeatureSSBS : Extension<"ssbs", "SSBS",
-  "Enable Speculative Store Bypass Safe bit (FEAT_SSBS, FEAT_SSBS2)" >;
+  "Enable Speculative Store Bypass Safe bit (FEAT_SSBS, FEAT_SSBS2)", [],
+  "FEAT_SSBS", "", 490>;
 
 def FeaturePredRes : Extension<"predres", "PredRes",
-  "Enable v8.5a execution and data prediction invalidation instructions (FEAT_SPECRES)" >;
+  "Enable v8.5a execution and data prediction invalidation instructions (FEAT_SPECRES)", [],
+  "FEAT_PREDRES", "+predres", 480>;
 
-def FeatureCacheDeepPersist : Extension<"ccdp", "CCDP",
+def FeatureCacheDeepPersist : SubtargetFeature<"ccdp", "CCDP", "true",
     "Enable v8.5 Cache Clean to Point of Deep Persistence (FEAT_DPB2)" >;
 
+let ArchExtKindSpelling = "AEK_NONE" in
 def FeatureBranchTargetId : Extension<"bti", "BTI",
-    "Enable Branch Target Identification (FEAT_BTI)" >;
+    "Enable Branch Target Identification (FEAT_BTI)", [],
+    "FEAT_BTI", "+bti", 510>;
 
+let ArchExtKindSpelling = "AEK_RAND", MArchName = "rng" in
 def FeatureRandGen : Extension<"rand", "RandGen",
-    "Enable Random Number generation instructions (FEAT_RNG)" >;
+    "Enable Random Number generation instructions (FEAT_RNG)", [],
+    "FEAT_RNG", "+rand", 10>;
 
+// NOTE: "memtag" means FEAT_MTE + FEAT_MTE2 for -march or
+// __attribute((target(...))), but only FEAT_MTE for FMV.
+let MArchName = "memtag" in
 def FeatureMTE : Extension<"mte", "MTE",
-    "Enable Memory Tagging Extension (FEAT_MTE, FEAT_MTE2)" >;
+    "Enable Memory Tagging Extension (FEAT_MTE, FEAT_MTE2)", [],
+    "FEAT_MEMTAG", "", 440>;
 
-def FeatureTRBE : Extension<"trbe", "TRBE",
+def FeatureTRBE : SubtargetFeature<"trbe", "TRBE", "true",
     "Enable Trace Buffer Extension (FEAT_TRBE)">;
 
-def FeatureETE : Extension<"ete", "ETE",
+def FeatureETE : SubtargetFeature<"ete", "ETE", "true",
     "Enable Embedded Trace Extension (FEAT_ETE)",
     [FeatureTRBE]>;
 
@@ -461,32 +577,41 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
     "true", "Use an instruction sequence for taking the address of a global "
     "that allows a memory tag in the upper address bits">;
 
+let ArchExtKindSpelling = "AEK_I8MM" in
 def FeatureMatMulInt8 : Extension<"i8mm", "MatMulInt8",
-    "Enable Matrix Multiply Int8 Extension (FEAT_I8MM)">;
+    "Enable Matrix Multiply Int8 Extension (FEAT_I8MM)", [],
+    "FEAT_I8MM", "+i8mm", 270>;
 
+let ArchExtKindSpelling = "AEK_F32MM" in
 def FeatureMatMulFP32 : Extension<"f32mm", "MatMulFP32",
-    "Enable Matrix Multiply FP32 Extension (FEAT_F32MM)", [FeatureSVE]>;
+    "Enable Matrix Multiply FP32 Extension (FEAT_F32MM)", [FeatureSVE],
+    "FEAT_SVE_F32MM", "+sve,+f32mm,+fullfp16,+fp-armv8,+neon", 350>;
 
+let ArchExtKindSpelling = "AEK_F64MM" in
 def FeatureMatMulFP64 : Extension<"f64mm", "MatMulFP64",
-    "Enable Matrix Multiply FP64 Extension (FEAT_F64MM)", [FeatureSVE]>;
+    "Enable Matrix Multiply FP64 Extension (FEAT_F64MM)", [FeatureSVE],
+    "FEAT_SVE_F64MM", "+sve,+f64mm,+fullfp16,+fp-armv8,+neon", 360>;
 
 def FeatureXS : SubtargetFeature<"xs", "HasXS",
     "true", "Enable Armv8.7-A limited-TLB-maintenance instruction (FEAT_XS)">;
 
-def FeatureWFxT : SubtargetFeature<"wfxt", "HasWFxT",
-    "true", "Enable Armv8.7-A WFET and WFIT instruction (FEAT_WFxT)">;
+def FeatureWFxT : Extension<"wfxt", "WFxT",
+    "Enable Armv8.7-A WFET and WFIT instruction (FEAT_WFxT)", [],
+    "FEAT_WFXT", "+wfxt", 550>;
 
 def FeatureHCX : SubtargetFeature<
     "hcx", "HasHCX", "true", "Enable Armv8.7-A HCRX_EL2 system register (FEAT_HCX)">;
 
 def FeatureLS64 : Extension<"ls64", "LS64",
-    "Enable Armv8.7-A LD64B/ST64B Accelerator Extension (FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA)">;
+    "Enable Armv8.7-A LD64B/ST64B Accelerator Extension (FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA)", [],
+    "FEAT_LS64", "", 520>;
 
 def FeatureHBC : Extension<"hbc", "HBC",
     "Enable Armv8.8-A Hinted Conditional Branches Extension (FEAT_HBC)">;
 
 def FeatureMOPS : Extension<"mops", "MOPS",
-    "Enable Armv8.8-A memcpy and memset acceleration instructions (FEAT_MOPS)">;
+    "Enable Armv8.8-A memcpy and memset acceleration instructions (FEAT_MOPS)", [],
+    "FEAT_MOPS", "+mops", 650>;
 
 def FeatureNMI : SubtargetFeature<"nmi", "HasNMI",
     "true", "Enable Armv8.8-A Non-maskable Interrupts (FEAT_NMI, FEAT_GICv3_NMI)">;
@@ -508,44 +633,54 @@ def FeatureRME : SubtargetFeature<"rme", "HasRME",
     "true", "Enable Realm Management Extension (FEAT_RME)">;
 
 def FeatureSME : Extension<"sme", "SME",
-  "Enable Scalable Matrix Extension (SME) (FEAT_SME)", [FeatureBF16, FeatureUseScalarIncVL]>;
+  "Enable Scalable Matrix Extension (SME) (FEAT_SME)", [FeatureBF16, FeatureUseScalarIncVL],
+  "FEAT_SME", "+sme,+bf16", 430>;
 
 def FeatureSMEF64F64 : Extension<"sme-f64f64", "SMEF64F64",
-  "Enable Scalable Matrix Extension (SME) F64F64 instructions (FEAT_SME_F64F64)", [FeatureSME]>;
+  "Enable Scalable Matrix Extension (SME) F64F64 instructions (FEAT_SME_F64F64)", [FeatureSME],
+  "FEAT_SME_F64", "+sme,+sme-f64f64,+bf16", 560>;
 
 def FeatureSMEI16I64 : Extension<"sme-i16i64", "SMEI16I64",
-  "Enable Scalable Matrix Extension (SME) I16I64 instructions (FEAT_SME_I16I64)", [FeatureSME]>;
+  "Enable Scalable Matrix Extension (SME) I16I64 instructions (FEAT_SME_I16I64)", [FeatureSME],
+  "FEAT_SME_I64", "+sme,+sme-i16i64,+bf16", 570>;
 
 def FeatureSMEFA64 : Extension<"sme-fa64", "SMEFA64",
   "Enable the full A64 instruction set in streaming SVE mode (FEAT_SME_FA64)", [FeatureSME, FeatureSVE2]>;
 
 def FeatureSME2 : Extension<"sme2", "SME2",
-  "Enable Scalable Matrix Extension 2 (SME2) instructions", [FeatureSME]>;
+  "Enable Scalable Matrix Extension 2 (SME2) instructions", [FeatureSME],
+  "FEAT_SME2", "+sme2,+sme,+bf16", 580>;
 
+let FMVDependencies = "+sme2,+sme-f16f16" in
 def FeatureSMEF16F16 : Extension<"sme-f16f16", "SMEF16F16",
   "Enable SME non-widening Float16 instructions (FEAT_SME_F16F16)", [FeatureSME2]>;
 
+let FMVDependencies = "+sme2p1,+sme2,+sme,+bf16" in
 def FeatureSME2p1 : Extension<"sme2p1", "SME2p1",
   "Enable Scalable Matrix Extension 2.1 (FEAT_SME2p1) instructions", [FeatureSME2]>;
 
 def FeatureFAMINMAX: Extension<"faminmax", "FAMINMAX",
    "Enable FAMIN and FAMAX instructions (FEAT_FAMINMAX)">;
 
+let FMVDependencies = "+fpmr" in
 def FeatureFP8FMA : Extension<"fp8fma", "FP8FMA",
   "Enable fp8 multiply-add instructions (FEAT_FP8FMA)">;
 
+let FMVDependencies = "+sme2" in
 def FeatureSSVE_FP8FMA : Extension<"ssve-fp8fma", "SSVE_FP8FMA",
   "Enable SVE2 fp8 multiply-add instructions (FEAT_SSVE_FP8FMA)", [FeatureSME2]>;
 
 def FeatureFP8DOT2: Extension<"fp8dot2", "FP8DOT2",
    "Enable fp8 2-way dot instructions (FEAT_FP8DOT2)">;
 
+let FMVDependencies = "+sme2" in
 def FeatureSSVE_FP8DOT2 : Extension<"ssve-fp8dot2", "SSVE_FP8DOT2",
   "Enable SVE2 fp8 2-way dot product instructions (FEAT_SSVE_FP8DOT2)", [FeatureSME2]>;
 
 def FeatureFP8DOT4: Extension<"fp8dot4", "FP8DOT4",
    "Enable fp8 4-way dot instructions (FEAT_FP8DOT4)">;
 
+let FMVDependencies = "+sme2" in
 def FeatureSSVE_FP8DOT4 : Extension<"ssve-fp8dot4", "SSVE_FP8DOT4",
   "Enable SVE2 fp8 4-way dot product instructions (FEAT_SSVE_FP8DOT4)", [FeatureSME2]>;
 def FeatureLUT: Extension<"lut", "LUT",
@@ -554,9 +689,11 @@ def FeatureLUT: Extension<"lut", "LUT",
 def FeatureSME_LUTv2 : Extension<"sme-lutv2", "SME_LUTv2",
   "Enable Scalable Matrix Extension (SME) LUTv2 instructions (FEAT_SME_LUTv2)">;
 
+let FMVDependencies = "+fp8,+sme2" in
 def FeatureSMEF8F16 : Extension<"sme-f8f16", "SMEF8F16",
   "Enable Scalable Matrix Extension (SME) F8F16 instructions(FEAT_SME_F8F16)", [FeatureSME2, FeatureFP8]>;
 
+let FMVDependencies = "+sme2,+fp8" in
 def FeatureSMEF8F32 : Extension<"sme-f8f32", "SMEF8F32",
   "Enable Scalable Matrix Extension (SME) F8F32 instructions (FEAT_SME_F8F32)", [FeatureSME2, FeatureFP8]>;
 
@@ -592,6 +729,7 @@ def FeatureCLRBHB : SubtargetFeature<"clrbhb", "HasCLRBHB",
 def FeaturePRFM_SLC : SubtargetFeature<"prfm-slc-target", "HasPRFM_SLC",
     "true", "Enable SLC target for PRFM instruction">;
 
+let MArchName = "predres2" in
 def FeatureSPECRES2 : Extension<"specres2", "SPECRES2",
     "Enable Speculation Restriction Instruction (FEAT_SPECRES2)",
     [FeaturePredRes]>;
@@ -605,7 +743,8 @@ def FeatureITE : Extension<"ite", "ITE",
 
 def FeatureRCPC3 : Extension<"rcpc3", "RCPC3",
     "Enable Armv8.9-A RCPC instructions for A64 and Advanced SIMD and floating-point instruction set (FEAT_LRCPC3)",
-    [FeatureRCPC_IMMO]>;
+    [FeatureRCPC_IMMO],
+    "FEAT_RCPC3", "+rcpc,+rcpc3", 241>;
 
 def FeatureTHE : Extension<"the", "THE",
     "Enable Armv8.9-A Translation Hardening Extension (FEAT_THE)">;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2af679e0755b..2aa328e0a127 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1048,9 +1048,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
 
   setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
-                       ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
-                       ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
-                       ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
+                       ISD::SIGN_EXTEND_INREG, ISD::CONCAT_VECTORS,
+                       ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR,
+                       ISD::STORE, ISD::BUILD_VECTOR});
   setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::LOAD);
 
@@ -1304,6 +1304,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
           setOperationAction(Op, Ty, Legal);
     }
 
+    // LRINT and LLRINT.
+    for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
+      for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
+        setOperationAction(Op, Ty, Custom);
+      if (Subtarget->hasFullFP16())
+        for (MVT Ty : {MVT::v4f16, MVT::v8f16})
+          setOperationAction(Op, Ty, Custom);
+    }
+
     setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
 
     setOperationAction(ISD::BITCAST, MVT::i2, Custom);
@@ -1525,6 +1534,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FFLOOR, VT, Custom);
       setOperationAction(ISD::FNEARBYINT, VT, Custom);
       setOperationAction(ISD::FRINT, VT, Custom);
+      setOperationAction(ISD::LRINT, VT, Custom);
+      setOperationAction(ISD::LLRINT, VT, Custom);
       setOperationAction(ISD::FROUND, VT, Custom);
       setOperationAction(ISD::FROUNDEVEN, VT, Custom);
       setOperationAction(ISD::FTRUNC, VT, Custom);
@@ -1580,6 +1591,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::MLOAD, VT, Custom);
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
+      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
 
       if (!Subtarget->isLittleEndian())
         setOperationAction(ISD::BITCAST, VT, Expand);
@@ -1606,6 +1618,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
         setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
     }
 
+    // Histcnt is SVE2 only
+    if (Subtarget->hasSVE2() && Subtarget->isSVEAvailable())
+      setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::Other,
+                         Custom);
+
     // NOTE: Currently this has to happen after computeRegisterProperties rather
     // than the preferred option of combining it with the addRegisterClass call.
     if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -1665,7 +1682,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
         setOperationAction(ISD::MULHU, VT, Custom);
       }
 
-
       // Use SVE for vectors with more than 2 elements.
       for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
         setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
@@ -1939,6 +1955,8 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::FP_TO_SINT, VT, Default);
   setOperationAction(ISD::FP_TO_UINT, VT, Default);
   setOperationAction(ISD::FRINT, VT, Default);
+  setOperationAction(ISD::LRINT, VT, Default);
+  setOperationAction(ISD::LLRINT, VT, Default);
   setOperationAction(ISD::FROUND, VT, Default);
   setOperationAction(ISD::FROUNDEVEN, VT, Default);
   setOperationAction(ISD::FSQRT, VT, Default);
@@ -4273,6 +4291,15 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
     return SDValue();
 
   SDLoc DL(Op);
+  // Expand to f64 if we are saturating to i64, to help produce keep the lanes
+  // the same width and produce a fcvtzu.
+  if (SatWidth == 64 && SrcElementWidth < 64) {
+    MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
+    SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
+    SrcVT = F64VT;
+    SrcElementVT = MVT::f64;
+    SrcElementWidth = 64;
+  }
   // Cases that we can emit directly.
   if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
     return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
@@ -4362,6 +4389,26 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
   return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
 }
 
+SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDValue Src = Op.getOperand(0);
+  SDLoc DL(Op);
+
+  assert(VT.isVector() && "Expected vector type");
+
+  EVT CastVT =
+      VT.changeVectorElementType(Src.getValueType().getVectorElementType());
+
+  // Round the floating-point value into a floating-point register with the
+  // current rounding mode.
+  SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
+
+  // Truncate the rounded floating point to an integer.
+  return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
+                     DAG.getValueType(VT.getVectorElementType()));
+}
+
 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
                                                     SelectionDAG &DAG) const {
   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
@@ -6685,10 +6732,13 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerVECTOR_DEINTERLEAVE(Op, DAG);
   case ISD::VECTOR_INTERLEAVE:
     return LowerVECTOR_INTERLEAVE(Op, DAG);
-  case ISD::LROUND:
-  case ISD::LLROUND:
   case ISD::LRINT:
-  case ISD::LLRINT: {
+  case ISD::LLRINT:
+    if (Op.getValueType().isVector())
+      return LowerVectorXRINT(Op, DAG);
+    [[fallthrough]];
+  case ISD::LROUND:
+  case ISD::LLROUND: {
     assert((Op.getOperand(0).getValueType() == MVT::f16 ||
             Op.getOperand(0).getValueType() == MVT::bf16) &&
            "Expected custom lowering of rounding operations only for f16");
@@ -6730,6 +6780,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerFunnelShift(Op, DAG);
   case ISD::FLDEXP:
     return LowerFLDEXP(Op, DAG);
+  case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
+    return LowerVECTOR_HISTOGRAM(Op, DAG);
   }
 }
 
@@ -10102,10 +10154,9 @@ SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
                        Op.getOperand(1));
   }
 
-  // This will select to an EXT instruction, which has a maximum immediate
-  // value of 255, hence 2048-bits is the maximum value we can lower.
-  if (IdxVal >= 0 &&
-      IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
+  // We can select to an EXT instruction when indexing the first 256 bytes.
+  unsigned BlockSize = AArch64::SVEBitsPerBlock / Ty.getVectorMinNumElements();
+  if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
     return Op;
 
   return SDValue();
@@ -11880,47 +11931,6 @@ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
   return true;
 }
 
-/// isREVMask - Check if a vector shuffle corresponds to a REV
-/// instruction with the specified blocksize.  (The order of the elements
-/// within each block of the vector is reversed.)
-static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
-  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
-          BlockSize == 128) &&
-         "Only possible block sizes for REV are: 16, 32, 64, 128");
-
-  unsigned EltSz = VT.getScalarSizeInBits();
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned BlockElts = M[0] + 1;
-  // If the first shuffle index is UNDEF, be optimistic.
-  if (M[0] < 0)
-    BlockElts = BlockSize / EltSz;
-
-  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
-    return false;
-
-  for (unsigned i = 0; i < NumElts; ++i) {
-    if (M[i] < 0)
-      continue; // ignore UNDEF indices
-    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
-      return false;
-  }
-
-  return true;
-}
-
-static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
-  unsigned NumElts = VT.getVectorNumElements();
-  if (NumElts % 2 != 0)
-    return false;
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned i = 0; i < NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
-        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
-      return false;
-  }
-  return true;
-}
-
 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
@@ -12585,15 +12595,16 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     }
   }
 
-  if (isREVMask(ShuffleMask, VT, 64))
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned EltSize = VT.getScalarSizeInBits();
+  if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
     return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
-  if (isREVMask(ShuffleMask, VT, 32))
+  if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
     return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
-  if (isREVMask(ShuffleMask, VT, 16))
+  if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
     return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
 
-  if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
-       (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
+  if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
       ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
     SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
     return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
@@ -12615,15 +12626,15 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   }
 
   unsigned WhichResult;
-  if (isZIPMask(ShuffleMask, VT, WhichResult)) {
+  if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
   }
-  if (isUZPMask(ShuffleMask, VT, WhichResult)) {
+  if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
   }
-  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
+  if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
   }
@@ -12655,7 +12666,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     int SrcLane = ShuffleMask[Anomaly];
     if (SrcLane >= NumInputElements) {
       SrcVec = V2;
-      SrcLane -= VT.getVectorNumElements();
+      SrcLane -= NumElts;
     }
     SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
 
@@ -12675,7 +12686,6 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
 
   // If the shuffle is not directly supported and it has 4 elements, use
   // the PerfectShuffle-generated table to synthesize it from other shuffles.
-  unsigned NumElts = VT.getVectorNumElements();
   if (NumElts == 4) {
     unsigned PFIndexes[4];
     for (unsigned i = 0; i != 4; ++i) {
@@ -14126,16 +14136,20 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
   int DummyInt;
   unsigned DummyUnsigned;
 
-  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
-          isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
+  unsigned EltSize = VT.getScalarSizeInBits();
+  unsigned NumElts = VT.getVectorNumElements();
+  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+          isREVMask(M, EltSize, NumElts, 64) ||
+          isREVMask(M, EltSize, NumElts, 32) ||
+          isREVMask(M, EltSize, NumElts, 16) ||
           isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
-          // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
-          isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
-          isZIPMask(M, VT, DummyUnsigned) ||
+          isTRNMask(M, NumElts, DummyUnsigned) ||
+          isUZPMask(M, NumElts, DummyUnsigned) ||
+          isZIPMask(M, NumElts, DummyUnsigned) ||
           isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
           isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
           isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
-          isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
+          isINSMask(M, NumElts, DummyBool, DummyInt) ||
           isConcatMask(M, VT, VT.getSizeInBits() == 128));
 }
 
@@ -15979,7 +15993,8 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType(
 
   UseScalable = false;
 
-  if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
+  if (!VecTy->isScalableTy() && !Subtarget->isNeonAvailable() &&
+      !Subtarget->useSVEForFixedLengthVectors())
     return false;
 
   if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
@@ -16003,18 +16018,20 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType(
   }
 
   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
-  if (!Subtarget->isNeonAvailable() ||
-      (Subtarget->useSVEForFixedLengthVectors() &&
-       (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
-        (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
-         isPowerOf2_32(MinElts) && VecSize > 128)))) {
-    UseScalable = true;
-    return true;
+  if (Subtarget->useSVEForFixedLengthVectors()) {
+    unsigned MinSVEVectorSize =
+        std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
+    if (VecSize % MinSVEVectorSize == 0 ||
+        (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
+         (!Subtarget->isNeonAvailable() || VecSize > 128))) {
+      UseScalable = true;
+      return true;
+    }
   }
 
   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
   // 128 will be split into multiple interleaved accesses.
-  return VecSize == 64 || VecSize % 128 == 0;
+  return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
 }
 
 static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
@@ -16105,8 +16122,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
   // "legalize" wide vector types into multiple interleaved accesses as long as
   // the vector types are divisible by 128.
   bool UseScalable;
-  if (!Subtarget->hasNEON() ||
-      !isLegalInterleavedAccessType(VTy, DL, UseScalable))
+  if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
     return false;
 
   unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
@@ -16283,8 +16299,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
   // Skip if we do not have NEON and skip illegal vector types. We can
   // "legalize" wide vector types into multiple interleaved accesses as long as
   // the vector types are divisible by 128.
-  if (!Subtarget->hasNEON() ||
-      !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
+  if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
     return false;
 
   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
@@ -16649,13 +16664,13 @@ bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
 
   // inch|dech
   if (Imm % 8 == 0)
-    return std::labs(Imm / 8) <= 16;
+    return std::abs(Imm / 8) <= 16;
   // incw|decw
   if (Imm % 4 == 0)
-    return std::labs(Imm / 4) <= 16;
+    return std::abs(Imm / 4) <= 16;
   // incd|decd
   if (Imm % 2 == 0)
-    return std::labs(Imm / 2) <= 16;
+    return std::abs(Imm / 2) <= 16;
 
   return false;
 }
@@ -17690,6 +17705,23 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
     return false;
   };
 
+  // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
+  // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
+  auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
+    APInt CVMinus1 = C - 1;
+    if (CVMinus1.isNegative())
+      return false;
+    unsigned TrailingZeroes = CVMinus1.countr_zero();
+    APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
+    if (CVPlus1.isPowerOf2()) {
+      unsigned BitWidth = CVPlus1.getBitWidth();
+      M = APInt(BitWidth, CVPlus1.logBase2());
+      N = APInt(BitWidth, TrailingZeroes);
+      return true;
+    }
+    return false;
+  };
+
   if (ConstValue.isNonNegative()) {
     // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
     // (mul x, 2^N - 1) => (sub (shl x, N), x)
@@ -17698,6 +17730,8 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
     //     => MV = (add (shl x, M), x); (add (shl MV, N), MV)
     // (mul x, (2^M + 1) * 2^N + 1))
     //     =>  MV = add (shl x, M), x); add (shl MV, N), x)
+    // (mul x, 1 - (1 - 2^M) * 2^N))
+    //     =>  MV = sub (x - (shl x, M)); sub (x - (shl MV, N))
     APInt SCVMinus1 = ShiftedConstValue - 1;
     APInt SCVPlus1 = ShiftedConstValue + 1;
     APInt CVPlus1 = ConstValue + 1;
@@ -17734,6 +17768,17 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
         return Add(Shl(MVal, CVN.getZExtValue()), N0);
       }
     }
+
+    if (Subtarget->hasALULSLFast() &&
+        isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
+      unsigned ShiftM = CVM.getZExtValue();
+      unsigned ShiftN = CVN.getZExtValue();
+      // ALULSLFast implicate that Shifts <= 4 places are fast
+      if (ShiftM <= 4 && ShiftN <= 4) {
+        SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
+        return Sub(N0, Shl(MVal, CVN.getZExtValue()));
+      }
+    }
   } else {
     // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
     // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
@@ -20541,6 +20586,66 @@ static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
   return SDValue();
 }
 
+static SDValue tryCombineWhileLo(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const AArch64Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  if (!Subtarget->hasSVE2p1())
+    return SDValue();
+
+  if (!N->hasNUsesOfValue(2, 0))
+    return SDValue();
+
+  const uint64_t HalfSize = N->getValueType(0).getVectorMinNumElements() / 2;
+  if (HalfSize < 2)
+    return SDValue();
+
+  auto It = N->use_begin();
+  SDNode *Lo = *It++;
+  SDNode *Hi = *It;
+
+  if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+      Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return SDValue();
+
+  uint64_t OffLo = Lo->getConstantOperandVal(1);
+  uint64_t OffHi = Hi->getConstantOperandVal(1);
+
+  if (OffLo > OffHi) {
+    std::swap(Lo, Hi);
+    std::swap(OffLo, OffHi);
+  }
+
+  if (OffLo != 0 || OffHi != HalfSize)
+    return SDValue();
+
+  EVT HalfVec = Lo->getValueType(0);
+  if (HalfVec != Hi->getValueType(0) ||
+      HalfVec.getVectorElementCount() != ElementCount::getScalable(HalfSize))
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+  SDValue ID =
+      DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
+  SDValue Idx = N->getOperand(1);
+  SDValue TC = N->getOperand(2);
+  if (Idx.getValueType() != MVT::i64) {
+    Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64);
+    TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64);
+  }
+  auto R =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
+                  {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC});
+
+  DCI.CombineTo(Lo, R.getValue(0));
+  DCI.CombineTo(Hi, R.getValue(1));
+
+  return SDValue(N, 0);
+}
+
 static SDValue performIntrinsicCombine(SDNode *N,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const AArch64Subtarget *Subtarget) {
@@ -20838,6 +20943,8 @@ static SDValue performIntrinsicCombine(SDNode *N,
   case Intrinsic::aarch64_sve_ptest_last:
     return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
                     AArch64CC::LAST_ACTIVE);
+  case Intrinsic::aarch64_sve_whilelo:
+    return tryCombineWhileLo(N, DCI, Subtarget);
   }
   return SDValue();
 }
@@ -21454,6 +21561,29 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
   SDValue Op1 = N->getOperand(1);
   EVT ResVT = N->getValueType(0);
 
+  // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
+  if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      Op0.getOperand(0) == Op1.getOperand(0)) {
+
+    SDValue SourceVec = Op0.getOperand(0);
+    uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
+    uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
+    uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
+    if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
+      EVT OpVT = Op0.getOperand(1).getValueType();
+      EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
+      SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
+                                DAG.getUNDEF(WidenedResVT));
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
+                         DAG.getConstant(0, DL, OpVT));
+    }
+  }
+
+  // Following optimizations only work with uzp1.
+  if (N->getOpcode() == AArch64ISD::UZP2)
+    return SDValue();
+
   // uzp1(x, undef) -> concat(truncate(x), undef)
   if (Op1.getOpcode() == ISD::UNDEF) {
     EVT BCVT = MVT::Other, HalfVT = MVT::Other;
@@ -22832,7 +22962,8 @@ SDValue performCONDCombine(SDNode *N,
   SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
   unsigned CondOpcode = SubsNode->getOpcode();
 
-  if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0))
+  if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
+      !SubsNode->hasOneUse())
     return SDValue();
 
   // There is a SUBS feeding this condition. Is it fed by a mask we can
@@ -24237,28 +24368,6 @@ performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   return performPostLD1Combine(N, DCI, true);
 }
 
-static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
-  EVT Ty = N->getValueType(0);
-  if (Ty.isInteger())
-    return SDValue();
-
-  EVT IntTy = Ty.changeVectorElementTypeToInteger();
-  EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
-  if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
-      IntTy.getVectorElementType().getScalarSizeInBits())
-    return SDValue();
-
-  SDLoc DL(N);
-  SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
-                                     DL, ExtIntTy);
-  SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
-                                     DL, ExtIntTy);
-  SDValue Idx = N->getOperand(2);
-  SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
-  SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
-  return DAG.getBitcast(Ty, Trunc);
-}
-
 static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const AArch64Subtarget *Subtarget) {
@@ -24643,8 +24752,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::MGATHER:
   case ISD::MSCATTER:
     return performMaskedGatherScatterCombine(N, DCI, DAG);
-  case ISD::VECTOR_SPLICE:
-    return performSVESpliceCombine(N, DAG);
   case ISD::FP_EXTEND:
     return performFPExtendCombine(N, DAG, DCI, Subtarget);
   case AArch64ISD::BRCOND:
@@ -24670,6 +24777,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case AArch64ISD::UUNPKHI:
     return performUnpackCombine(N, DAG, Subtarget);
   case AArch64ISD::UZP1:
+  case AArch64ISD::UZP2:
     return performUzpCombine(N, DAG, Subtarget);
   case AArch64ISD::SETCC_MERGE_ZERO:
     return performSetccMergeZeroCombine(N, DCI);
@@ -27254,6 +27362,62 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
   return DAG.getMergeValues({Lo, Hi}, DL);
 }
 
+SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  // FIXME: Maybe share some code with LowerMGather/Scatter?
+  MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
+  SDLoc DL(HG);
+  SDValue Chain = HG->getChain();
+  SDValue Inc = HG->getInc();
+  SDValue Mask = HG->getMask();
+  SDValue Ptr = HG->getBasePtr();
+  SDValue Index = HG->getIndex();
+  SDValue Scale = HG->getScale();
+  SDValue IntID = HG->getIntID();
+
+  // The Intrinsic ID determines the type of update operation.
+  [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
+  // Right now, we only support 'add' as an update.
+  assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
+         "Unexpected histogram update operation");
+
+  EVT IncVT = Inc.getValueType();
+  EVT IndexVT = Index.getValueType();
+  EVT MemVT = EVT::getVectorVT(*DAG.getContext(), IncVT,
+                               IndexVT.getVectorElementCount());
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  SDValue PassThru = DAG.getSplatVector(MemVT, DL, Zero);
+  SDValue IncSplat = DAG.getSplatVector(MemVT, DL, Inc);
+  SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
+
+  // Set the MMO to load only, rather than load|store.
+  MachineMemOperand *GMMO = HG->getMemOperand();
+  GMMO->setFlags(MachineMemOperand::MOLoad);
+  ISD::MemIndexType IndexType = HG->getIndexType();
+  SDValue Gather =
+      DAG.getMaskedGather(DAG.getVTList(MemVT, MVT::Other), MemVT, DL, Ops,
+                          GMMO, IndexType, ISD::NON_EXTLOAD);
+
+  SDValue GChain = Gather.getValue(1);
+
+  // Perform the histcnt, multiply by inc, add to bucket data.
+  SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncVT);
+  SDValue HistCnt =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
+  SDValue Mul = DAG.getNode(ISD::MUL, DL, MemVT, HistCnt, IncSplat);
+  SDValue Add = DAG.getNode(ISD::ADD, DL, MemVT, Gather, Mul);
+
+  // Create a new MMO for the scatter.
+  MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
+      GMMO->getPointerInfo(), MachineMemOperand::MOStore, GMMO->getSize(),
+      GMMO->getAlign(), GMMO->getAAInfo());
+
+  SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
+  SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
+                                         ScatterOps, SMMO, IndexType, false);
+  return Scatter;
+}
+
 SDValue
 AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
                                                     SelectionDAG &DAG) const {
@@ -27454,15 +27618,15 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
     return convertFromScalableVector(DAG, VT, Op);
   }
 
+  unsigned EltSize = VT.getScalarSizeInBits();
   for (unsigned LaneSize : {64U, 32U, 16U}) {
-    if (isREVMask(ShuffleMask, VT, LaneSize)) {
+    if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), LaneSize)) {
       EVT NewVT =
           getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
       unsigned RevOp;
-      unsigned EltSz = VT.getScalarSizeInBits();
-      if (EltSz == 8)
+      if (EltSize == 8)
         RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
-      else if (EltSz == 16)
+      else if (EltSize == 16)
         RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
       else
         RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
@@ -27474,8 +27638,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
     }
   }
 
-  if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
-      isREVMask(ShuffleMask, VT, 128)) {
+  if (Subtarget->hasSVE2p1() && EltSize == 64 &&
+      isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
     if (!VT.isFloatingPoint())
       return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
 
@@ -27487,11 +27651,12 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
   }
 
   unsigned WhichResult;
-  if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
+  if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
+      WhichResult == 0)
     return convertFromScalableVector(
         DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
 
-  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
+  if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
     return convertFromScalableVector(
         DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
@@ -27534,11 +27699,12 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
       return convertFromScalableVector(DAG, VT, Op);
     }
 
-    if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
+    if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
+        WhichResult != 0)
       return convertFromScalableVector(
           DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
 
-    if (isUZPMask(ShuffleMask, VT, WhichResult)) {
+    if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
       unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
       return convertFromScalableVector(
           DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index fbdc4de5617f..a44a3d35d2f9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1149,6 +1149,7 @@ private:
   SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVECTOR_HISTOGRAM(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
@@ -1165,6 +1166,7 @@ private:
   SDValue LowerVectorFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVectorXRINT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 17d96370c04a..bb32280fe51f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5403,6 +5403,52 @@ def : Pat<(AArch64bsp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
 def : Pat<(AArch64bsp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
           (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
 
+// The following SetCC patterns are used for GlobalISel only
+multiclass SelectSetCC<PatFrags InFrag, string INST> {
+  def : Pat<(v8i8 (InFrag (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+            (v8i8 (!cast<Instruction>(INST # v8i8) (v8i8 V64:$Rn), (v8i8 V64:$Rm)))>;
+  def : Pat<(v16i8 (InFrag (v16i8 V128:$Rn), (v16i8 V128:$Rm))),
+            (v16i8 (!cast<Instruction>(INST # v16i8) (v16i8 V128:$Rn), (v16i8 V128:$Rm)))>;
+  def : Pat<(v4i16 (InFrag (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+            (v4i16 (!cast<Instruction>(INST # v4i16) (v4i16 V64:$Rn), (v4i16 V64:$Rm)))>;
+  def : Pat<(v8i16 (InFrag (v8i16 V128:$Rn), (v8i16 V128:$Rm))),
+            (v8i16 (!cast<Instruction>(INST # v8i16) (v8i16 V128:$Rn), (v8i16 V128:$Rm)))>;
+  def : Pat<(v2i32 (InFrag (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+            (v2i32 (!cast<Instruction>(INST # v2i32) (v2i32 V64:$Rn), (v2i32 V64:$Rm)))>;
+  def : Pat<(v4i32 (InFrag (v4i32 V128:$Rn), (v4i32 V128:$Rm))),
+            (v4i32 (!cast<Instruction>(INST # v4i32) (v4i32 V128:$Rn), (v4i32 V128:$Rm)))>;
+  def : Pat<(v2i64 (InFrag (v2i64 V128:$Rn), (v2i64 V128:$Rm))),
+            (v2i64 (!cast<Instruction>(INST # v2i64) (v2i64 V128:$Rn), (v2i64 V128:$Rm)))>;
+}
+
+defm : SelectSetCC<seteq, "CMEQ">;
+defm : SelectSetCC<setgt, "CMGT">;
+defm : SelectSetCC<setge, "CMGE">;
+defm : SelectSetCC<setugt, "CMHI">;
+defm : SelectSetCC<setuge, "CMHS">;
+
+multiclass SelectSetCCSwapOperands<PatFrags InFrag, string INST> {
+  def : Pat<(v8i8 (InFrag (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+            (v8i8 (!cast<Instruction>(INST # v8i8) (v8i8 V64:$Rm), (v8i8 V64:$Rn)))>;
+  def : Pat<(v16i8 (InFrag (v16i8 V128:$Rn), (v16i8 V128:$Rm))),
+            (v16i8 (!cast<Instruction>(INST # v16i8) (v16i8 V128:$Rm), (v16i8 V128:$Rn)))>;
+  def : Pat<(v4i16 (InFrag (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+            (v4i16 (!cast<Instruction>(INST # v4i16) (v4i16 V64:$Rm), (v4i16 V64:$Rn)))>;
+  def : Pat<(v8i16 (InFrag (v8i16 V128:$Rn), (v8i16 V128:$Rm))),
+            (v8i16 (!cast<Instruction>(INST # v8i16) (v8i16 V128:$Rm), (v8i16 V128:$Rn)))>;
+  def : Pat<(v2i32 (InFrag (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+            (v2i32 (!cast<Instruction>(INST # v2i32) (v2i32 V64:$Rm), (v2i32 V64:$Rn)))>;
+  def : Pat<(v4i32 (InFrag (v4i32 V128:$Rn), (v4i32 V128:$Rm))),
+            (v4i32 (!cast<Instruction>(INST # v4i32) (v4i32 V128:$Rm), (v4i32 V128:$Rn)))>;
+  def : Pat<(v2i64 (InFrag (v2i64 V128:$Rn), (v2i64 V128:$Rm))),
+            (v2i64 (!cast<Instruction>(INST # v2i64) (v2i64 V128:$Rm), (v2i64 V128:$Rn)))>;
+}
+
+defm : SelectSetCCSwapOperands<setlt, "CMGT">;
+defm : SelectSetCCSwapOperands<setle, "CMGE">;
+defm : SelectSetCCSwapOperands<setult, "CMHI">;
+defm : SelectSetCCSwapOperands<setule, "CMHS">;
+
 let Predicates = [HasNEON] in {
 def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
                 (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index 1a8c71888a85..c3d64f5a0a96 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -56,13 +56,13 @@ static std::pair<bool, bool> GetSignReturnAddress(const Function &F) {
   }
 
   StringRef Scope = F.getFnAttribute("sign-return-address").getValueAsString();
-  if (Scope.equals("none"))
+  if (Scope == "none")
     return {false, false};
 
-  if (Scope.equals("all"))
+  if (Scope == "all")
     return {true, true};
 
-  assert(Scope.equals("non-leaf"));
+  assert(Scope == "non-leaf");
   return {true, false};
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index a143243a8d3b..7b044cf7c238 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6588,7 +6588,7 @@ static const unsigned PerfectShuffleTable[6561 + 1] = {
     835584U,     // <u,u,u,u>: Cost 0 copy LHS
     0};
 
-static unsigned getPerfectShuffleCost(llvm::ArrayRef<int> M) {
+inline unsigned getPerfectShuffleCost(llvm::ArrayRef<int> M) {
   assert(M.size() == 4 && "Expected a 4 entry perfect shuffle");
 
   // Special case zero-cost nop copies, from either LHS or RHS.
@@ -6623,8 +6623,8 @@ static unsigned getPerfectShuffleCost(llvm::ArrayRef<int> M) {
 /// Return true for zip1 or zip2 masks of the form:
 ///  <0,  8, 1,  9, 2, 10, 3, 11> or
 ///  <4, 12, 5, 13, 6, 14, 7, 15>
-inline bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResultOut) {
-  unsigned NumElts = VT.getVectorNumElements();
+inline bool isZIPMask(ArrayRef<int> M, unsigned NumElts,
+                      unsigned &WhichResultOut) {
   if (NumElts % 2 != 0)
     return false;
   // Check the first non-undef element for which half to use.
@@ -6656,8 +6656,8 @@ inline bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResultOut) {
 /// Return true for uzp1 or uzp2 masks of the form:
 ///  <0, 2, 4, 6, 8, 10, 12, 14> or
 ///  <1, 3, 5, 7, 9, 11, 13, 15>
-inline bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResultOut) {
-  unsigned NumElts = VT.getVectorNumElements();
+inline bool isUZPMask(ArrayRef<int> M, unsigned NumElts,
+                      unsigned &WhichResultOut) {
   // Check the first non-undef element for which half to use.
   unsigned WhichResult = 2;
   for (unsigned i = 0; i != NumElts; i++) {
@@ -6680,6 +6680,49 @@ inline bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResultOut) {
   return true;
 }
 
+/// Return true for trn1 or trn2 masks of the form:
+///  <0, 8, 2, 10, 4, 12, 6, 14> or
+///  <1, 9, 3, 11, 5, 13, 7, 15>
+inline bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
+                      unsigned &WhichResult) {
+  if (NumElts % 2 != 0)
+    return false;
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
+      return false;
+  }
+  return true;
+}
+
+/// isREVMask - Check if a vector shuffle corresponds to a REV
+/// instruction with the specified blocksize.  (The order of the elements
+/// within each block of the vector is reversed.)
+inline bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
+                      unsigned BlockSize) {
+  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
+          BlockSize == 128) &&
+         "Only possible block sizes for REV are: 16, 32, 64, 128");
+
+  unsigned BlockElts = M[0] + 1;
+  // If the first shuffle index is UNDEF, be optimistic.
+  if (M[0] < 0)
+    BlockElts = BlockSize / EltSize;
+
+  if (BlockSize <= EltSize || BlockSize != BlockElts * EltSize)
+    return false;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if (M[i] < 0)
+      continue; // ignore UNDEF indices
+    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+      return false;
+  }
+
+  return true;
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 574178c8d524..c5cbdce476ca 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -797,28 +797,26 @@ defm FADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0100, MatrixOp16
 defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
 defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
 defm FSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0101, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
-}
 
-let Predicates = [HasSMEF16F16] in {
-defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00, 0b100, ZZ_h_mul_r, ZPR4b16>;
-defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b000, ZZZZ_h_mul_r, ZPR4b16>;
-defm FMLA_VG2_M2ZZ_H :  sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16>;
-defm FMLA_VG4_M4ZZ_H :  sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0111100, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b0100001, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
-defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b0100001, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
-
-defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b00, 0b101, ZZ_h_mul_r, ZPR4b16>;
-defm FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b001, ZZZZ_h_mul_r, ZPR4b16>;
-defm FMLS_VG2_M2ZZ_H :  sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0011101, MatrixOp16, ZZ_h, ZPR4b16>;
-defm FMLS_VG4_M4ZZ_H :  sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0111101, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b0100011, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
-defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0100011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
+defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00, 0b100, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_lane_vg1x2>;
+defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b000, ZZZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_lane_vg1x4>;
+defm FMLA_VG2_M2ZZ_H :  sme2_dot_mla_add_sub_array_vg2_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_single_vg1x2>;
+defm FMLA_VG4_M4ZZ_H :  sme2_dot_mla_add_sub_array_vg4_single<"fmla", 0b0111100, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_single_vg1x4>;
+defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b0100001, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmla_vg1x2>;
+defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b0100001, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmla_vg1x4>;
+
+defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b00, 0b101, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_lane_vg1x2>;
+defm FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b001, ZZZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_lane_vg1x4>;
+defm FMLS_VG2_M2ZZ_H :  sme2_dot_mla_add_sub_array_vg2_single<"fmls", 0b0011101, MatrixOp16, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_single_vg1x2>;
+defm FMLS_VG4_M4ZZ_H :  sme2_dot_mla_add_sub_array_vg4_single<"fmls", 0b0111101, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_single_vg1x4>;
+defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b0100011, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmls_vg1x2>;
+defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0100011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmls_vg1x4>;
 
 defm FCVT_2ZZ_H  : sme2p1_fp_cvt_vector_vg2_single<"fcvt", 0b0>;
 defm FCVTL_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvtl", 0b1>;
 
-defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0, 0b11, ZPR16>;
-defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, 0b11, ZPR16>;
+defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0, nxv8f16, int_aarch64_sme_mopa>;
+defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, nxv8f16, int_aarch64_sme_mops>;
 }
 
 let Predicates = [HasSME2, HasB16B16] in {
@@ -827,20 +825,19 @@ defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp
 defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r,  nxv8bf16, null_frag>;
 defm BFSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfsub", 0b1101, MatrixOp16, ZZZZ_h_mul_r,  nxv8bf16, null_frag>;
 
-defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b00, 0b110, ZZ_h_mul_r, ZPR4b16>;
-defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b010, ZZZZ_h_mul_r, ZPR4b16>;
-defm BFMLA_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1011100, MatrixOp16, ZZ_h, ZPR4b16>;
-defm BFMLA_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1111100, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b1100001, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;
-defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b1100001, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;
-
-defm BFMLS_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmls", 0b00, 0b111, ZZ_h_mul_r, ZPR4b16>;
-defm BFMLS_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmls", 0b011, ZZZZ_h_mul_r, ZPR4b16>;
-defm BFMLS_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1011101, MatrixOp16, ZZ_h, ZPR4b16>;
-defm BFMLS_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1111101, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b1100011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;
-defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b1100011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;
+defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b00, 0b110, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_lane_vg1x2>;
+defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b010, ZZZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_lane_vg1x4>;
+defm BFMLA_VG2_M2ZZ  : sme2_dot_mla_add_sub_array_vg2_single<"bfmla", 0b1011100, MatrixOp16, ZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_single_vg1x2>;
+defm BFMLA_VG4_M4ZZ  : sme2_dot_mla_add_sub_array_vg4_single<"bfmla", 0b1111100, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_single_vg1x4>;
+defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b1100001, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmla_vg1x2>;
+defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b1100001, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmla_vg1x4>;
 
+defm BFMLS_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmls", 0b00, 0b111, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmls_lane_vg1x2>;
+defm BFMLS_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmls", 0b011, ZZZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmls_lane_vg1x4>;
+defm BFMLS_VG2_M2ZZ  : sme2_dot_mla_add_sub_array_vg2_single<"bfmls", 0b1011101, MatrixOp16, ZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmls_single_vg1x2>;
+defm BFMLS_VG4_M4ZZ  : sme2_dot_mla_add_sub_array_vg4_single<"bfmls", 0b1111101, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmls_single_vg1x4>;
+defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b1100011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmls_vg1x2>;
+defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b1100011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmls_vg1x4>;
 
 defm BFMAX_VG2_2ZZ  : sme2p1_bf_max_min_vector_vg2_single<"bfmax", 0b0010000>;
 defm BFMAX_VG4_4ZZ  : sme2p1_bf_max_min_vector_vg4_single<"bfmax", 0b0010000>;
@@ -865,8 +862,8 @@ defm BFMINNM_VG4_4Z2Z : sme2p1_bf_max_min_vector_vg4_multi<"bfminnm",  0b0010011
 defm BFCLAMP_VG2_2ZZZ: sme2p1_bfclamp_vector_vg2_multi<"bfclamp">;
 defm BFCLAMP_VG4_4ZZZ: sme2p1_bfclamp_vector_vg4_multi<"bfclamp">;
 
-defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0, 0b11, ZPR16>;
-defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1, 0b11, ZPR16>;
+defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0, nxv8bf16, int_aarch64_sme_mopa>;
+defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1, nxv8bf16, int_aarch64_sme_mops>;
 }
 
 let Predicates = [HasSME2, HasFP8] in {
@@ -909,9 +906,9 @@ def LUTI4_S_4ZZT2Z  : sme2_luti4_vector_vg4_strided<0b00, 0b00, "luti4">;
 } //[HasSME2p1, HasSME_LUTv2]
 
 let Predicates = [HasSMEF8F16] in {
-defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_16b<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>;
-defm FDOT_VG2_M2ZZI_BtoH  : sme2p1_multi_vec_array_vg2_index_16b<"fdot",    0b11, 0b010, ZZ_b_mul_r, ZPR4b8>;
-defm FDOT_VG4_M4ZZI_BtoH  : sme2p1_multi_vec_array_vg4_index_16b<"fdot",    0b100, ZZZZ_b_mul_r, ZPR4b8>;
+defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>;
+defm FDOT_VG2_M2ZZI_BtoH  : sme2p1_multi_vec_array_vg2_index_f8f16<"fdot",  0b11, 0b010, ZZ_b_mul_r, ZPR4b8>;
+defm FDOT_VG4_M4ZZI_BtoH  : sme2p1_multi_vec_array_vg4_index_f8f16<"fdot",    0b100, ZZZZ_b_mul_r, ZPR4b8>;
 defm FDOT_VG2_M2ZZ_BtoH   :  sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010001, MatrixOp16, ZZ_b, ZPR4b8>;
 defm FDOT_VG4_M4ZZ_BtoH   :  sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110001, MatrixOp16, ZZZZ_b, ZPR4b8>;
 // TODO: Replace nxv16i8 by nxv16f8
@@ -928,7 +925,7 @@ defm FMLAL_VG4_M4ZZ_BtoH  :  sme2_fp_mla_long_array_vg4_single<"fmlal", 0b001, M
 defm FMLAL_VG2_M2Z2Z_BtoH : sme2_fp_mla_long_array_vg2_multi<"fmlal",   0b100, MatrixOp16, ZZ_b_mul_r, nxv16i8, null_frag>;
 defm FMLAL_VG4_M4Z4Z_BtoH : sme2_fp_mla_long_array_vg4_multi<"fmlal",   0b100, MatrixOp16, ZZZZ_b_mul_r, nxv16i8, null_frag>;
 
-defm FMOPA_MPPZZ_BtoH     : sme2p1_fmop_tile_fp16<"fmopa", 0b1, 0b0, 0b01, ZPR8>;
+defm FMOPA_MPPZZ_BtoH     : sme2p1_fmop_tile_f8f16<"fmopa", 0b1, 0b0, 0b01>;
 
 } //[HasSMEF8F16]
 
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 62e68de1359f..d4405a230613 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1269,33 +1269,33 @@ let Predicates = [HasSVE] in {
 
   multiclass sve_masked_gather_x2_scaled<ValueType Ty, SDPatternOperator Load, string Inst> {
     // base + vector of scaled offsets
-    def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs))),
+    def : Pat<(Ty (Load (SVEDup0Undef), nxv2i1:$gp, GPR64:$base, nxv2i64:$offs)),
               (!cast<Instruction>(Inst # _SCALED) PPR:$gp, GPR64:$base, ZPR:$offs)>;
     // base + vector of signed 32bit scaled offsets
-    def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32))),
+    def : Pat<(Ty (Load (SVEDup0Undef), nxv2i1:$gp, GPR64:$base, (sext_inreg nxv2i64:$offs, nxv2i32))),
               (!cast<Instruction>(Inst # _SXTW_SCALED) PPR:$gp, GPR64:$base, ZPR:$offs)>;
     // base + vector of unsigned 32bit scaled offsets
-    def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))))),
+    def : Pat<(Ty (Load (SVEDup0Undef), nxv2i1:$gp, GPR64:$base, (and nxv2i64:$offs, (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))))),
               (!cast<Instruction>(Inst # _UXTW_SCALED) PPR:$gp, GPR64:$base, ZPR:$offs)>;
   }
 
   multiclass sve_masked_gather_x2_unscaled<ValueType Ty, SDPatternOperator Load, string Inst, Operand ImmTy> {
     // vector of pointers + immediate offset (includes zero)
-    def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), (i64 ImmTy:$imm), (nxv2i64 ZPR:$ptrs))),
+    def : Pat<(Ty (Load (SVEDup0Undef), nxv2i1:$gp, (i64 ImmTy:$imm), nxv2i64:$ptrs)),
               (!cast<Instruction>(Inst # _IMM) PPR:$gp, ZPR:$ptrs, ImmTy:$imm)>;
     // base + vector of offsets
-    def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs))),
+    def : Pat<(Ty (Load (SVEDup0Undef), nxv2i1:$gp, GPR64:$base, nxv2i64:$offs)),
               (!cast<Instruction>(Inst) PPR:$gp, GPR64:$base, ZPR:$offs)>;
     // base + vector of signed 32bit offsets
-    def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32))),
+    def : Pat<(Ty (Load (SVEDup0Undef), nxv2i1:$gp, GPR64:$base, (sext_inreg nxv2i64:$offs, nxv2i32))),
               (!cast<Instruction>(Inst # _SXTW) PPR:$gp, GPR64:$base, ZPR:$offs)>;
     // base + vector of unsigned 32bit offsets
-    def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))))),
+    def : Pat<(Ty (Load (SVEDup0Undef), nxv2i1:$gp, GPR64:$base, (and nxv2i64:$offs, (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))))),
               (!cast<Instruction>(Inst # _UXTW) PPR:$gp, GPR64:$base, ZPR:$offs)>;
   }
 
   multiclass sve_masked_gather_x4<ValueType Ty, SDPatternOperator Load, Instruction Inst> {
-    def : Pat<(Ty (Load (SVEDup0Undef), (nxv4i1 PPR:$gp), GPR64:$base, (nxv4i32 ZPR:$offs))),
+    def : Pat<(Ty (Load (SVEDup0Undef), nxv4i1:$gp, GPR64:$base, nxv4i32:$offs)),
               (Inst PPR:$gp, GPR64:$base, ZPR:$offs)>;
   }
 
@@ -1503,33 +1503,33 @@ let Predicates = [HasSVE] in {
 
   multiclass sve_masked_scatter_x2_scaled<ValueType Ty, SDPatternOperator Store, string Inst> {
     // base + vector of scaled offsets
-    def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs)),
+    def : Pat<(Store Ty:$data, nxv2i1:$gp, GPR64:$base, nxv2i64:$offs),
               (!cast<Instruction>(Inst # _SCALED) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>;
     // base + vector of signed 32bit scaled offsets
-    def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32)),
+    def : Pat<(Store Ty:$data, nxv2i1:$gp, GPR64:$base, (sext_inreg nxv2i64:$offs, nxv2i32)),
               (!cast<Instruction>(Inst # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>;
     // base + vector of unsigned 32bit scaled offsets
-    def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF))))),
+    def : Pat<(Store Ty:$data, nxv2i1:$gp, GPR64:$base, (and nxv2i64:$offs, (nxv2i64 (splat_vector (i64 0xFFFFFFFF))))),
               (!cast<Instruction>(Inst # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>;
   }
 
   multiclass sve_masked_scatter_x2_unscaled<ValueType Ty, SDPatternOperator Store, string Inst, Operand ImmTy> {
     // vector of pointers + immediate offset (includes zero)
-    def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), (i64 ImmTy:$imm), (nxv2i64 ZPR:$ptrs)),
+    def : Pat<(Store Ty:$data, nxv2i1:$gp, (i64 ImmTy:$imm), nxv2i64:$ptrs),
               (!cast<Instruction>(Inst # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, ImmTy:$imm)>;
     // base + vector of offsets
-    def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs)),
+    def : Pat<(Store Ty:$data, nxv2i1:$gp, GPR64:$base, nxv2i64:$offs),
               (!cast<Instruction>(Inst) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>;
     // base + vector of signed 32bit offsets
-    def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32)),
+    def : Pat<(Store Ty:$data, nxv2i1:$gp, GPR64:$base, (sext_inreg nxv2i64:$offs, nxv2i32)),
               (!cast<Instruction>(Inst # _SXTW) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>;
     // base + vector of unsigned 32bit offsets
-    def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF))))),
+    def : Pat<(Store Ty:$data, nxv2i1:$gp, GPR64:$base, (and nxv2i64:$offs, (nxv2i64 (splat_vector (i64 0xFFFFFFFF))))),
               (!cast<Instruction>(Inst # _UXTW) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>;
   }
 
   multiclass sve_masked_scatter_x4<ValueType Ty, SDPatternOperator Store, Instruction Inst> {
-    def : Pat<(Store (Ty ZPR:$data), (nxv4i1 PPR:$gp), GPR64:$base, (nxv4i32 ZPR:$offs)),
+    def : Pat<(Store Ty:$data, nxv4i1:$gp, GPR64:$base, nxv4i32:$offs),
               (Inst ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>;
   }
 
@@ -1791,159 +1791,159 @@ let Predicates = [HasSVEorSME] in {
   defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2, int_aarch64_sve_trn2_b16, int_aarch64_sve_trn2_b32, int_aarch64_sve_trn2_b64>;
 
   // Extract lo/hi halves of legal predicate types.
-  def : Pat<(nxv1i1 (extract_subvector (nxv2i1 PPR:$Ps), (i64 0))),
+  def : Pat<(nxv1i1 (extract_subvector nxv2i1:$Ps, (i64 0))),
             (PUNPKLO_PP PPR:$Ps)>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv2i1 PPR:$Ps), (i64 1))),
+  def : Pat<(nxv1i1 (extract_subvector nxv2i1:$Ps, (i64 1))),
             (PUNPKHI_PP PPR:$Ps)>;
-  def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))),
+  def : Pat<(nxv2i1 (extract_subvector nxv4i1:$Ps, (i64 0))),
             (PUNPKLO_PP PPR:$Ps)>;
-  def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))),
+  def : Pat<(nxv2i1 (extract_subvector nxv4i1:$Ps, (i64 2))),
             (PUNPKHI_PP PPR:$Ps)>;
-  def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))),
+  def : Pat<(nxv4i1 (extract_subvector nxv8i1:$Ps, (i64 0))),
             (PUNPKLO_PP PPR:$Ps)>;
-  def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))),
+  def : Pat<(nxv4i1 (extract_subvector nxv8i1:$Ps, (i64 4))),
             (PUNPKHI_PP PPR:$Ps)>;
-  def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
+  def : Pat<(nxv8i1 (extract_subvector nxv16i1:$Ps, (i64 0))),
             (PUNPKLO_PP PPR:$Ps)>;
-  def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
+  def : Pat<(nxv8i1 (extract_subvector nxv16i1:$Ps, (i64 8))),
             (PUNPKHI_PP PPR:$Ps)>;
 
-  def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))),
+  def : Pat<(nxv1i1 (extract_subvector nxv4i1:$Ps, (i64 0))),
             (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 1))),
+  def : Pat<(nxv1i1 (extract_subvector nxv4i1:$Ps, (i64 1))),
             (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))),
+  def : Pat<(nxv1i1 (extract_subvector nxv4i1:$Ps, (i64 2))),
             (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 3))),
+  def : Pat<(nxv1i1 (extract_subvector nxv4i1:$Ps, (i64 3))),
             (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>;
-  def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))),
+  def : Pat<(nxv2i1 (extract_subvector nxv8i1:$Ps, (i64 0))),
             (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>;
-  def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 2))),
+  def : Pat<(nxv2i1 (extract_subvector nxv8i1:$Ps, (i64 2))),
             (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))>;
-  def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))),
+  def : Pat<(nxv2i1 (extract_subvector nxv8i1:$Ps, (i64 4))),
             (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>;
-  def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 6))),
+  def : Pat<(nxv2i1 (extract_subvector nxv8i1:$Ps, (i64 6))),
             (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>;
-  def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
+  def : Pat<(nxv4i1 (extract_subvector nxv16i1:$Ps, (i64 0))),
             (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>;
-  def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))),
+  def : Pat<(nxv4i1 (extract_subvector nxv16i1:$Ps, (i64 4))),
             (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))>;
-  def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
+  def : Pat<(nxv4i1 (extract_subvector nxv16i1:$Ps, (i64 8))),
             (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>;
-  def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))),
+  def : Pat<(nxv4i1 (extract_subvector nxv16i1:$Ps, (i64 12))),
             (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>;
 
 
-  def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))),
+  def : Pat<(nxv1i1 (extract_subvector nxv8i1:$Ps, (i64 0))),
             (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 1))),
+  def : Pat<(nxv1i1 (extract_subvector nxv8i1:$Ps, (i64 1))),
             (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 2))),
+  def : Pat<(nxv1i1 (extract_subvector nxv8i1:$Ps, (i64 2))),
             (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 3))),
+  def : Pat<(nxv1i1 (extract_subvector nxv8i1:$Ps, (i64 3))),
             (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))),
+  def : Pat<(nxv1i1 (extract_subvector nxv8i1:$Ps, (i64 4))),
             (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 5))),
+  def : Pat<(nxv1i1 (extract_subvector nxv8i1:$Ps, (i64 5))),
             (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 6))),
+  def : Pat<(nxv1i1 (extract_subvector nxv8i1:$Ps, (i64 6))),
             (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 7))),
+  def : Pat<(nxv1i1 (extract_subvector nxv8i1:$Ps, (i64 7))),
             (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>;
-  def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
+  def : Pat<(nxv2i1 (extract_subvector nxv16i1:$Ps, (i64 0))),
             (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>;
-  def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 2))),
+  def : Pat<(nxv2i1 (extract_subvector nxv16i1:$Ps, (i64 2))),
             (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>;
-  def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))),
+  def : Pat<(nxv2i1 (extract_subvector nxv16i1:$Ps, (i64 4))),
             (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>;
-  def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 6))),
+  def : Pat<(nxv2i1 (extract_subvector nxv16i1:$Ps, (i64 6))),
             (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>;
-  def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
+  def : Pat<(nxv2i1 (extract_subvector nxv16i1:$Ps, (i64 8))),
             (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>;
-  def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 10))),
+  def : Pat<(nxv2i1 (extract_subvector nxv16i1:$Ps, (i64 10))),
             (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>;
-  def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))),
+  def : Pat<(nxv2i1 (extract_subvector nxv16i1:$Ps, (i64 12))),
             (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>;
-  def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 14))),
+  def : Pat<(nxv2i1 (extract_subvector nxv16i1:$Ps, (i64 14))),
             (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>;
 
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 0))),
             (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP  (PUNPKLO_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 1))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 1))),
             (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP  (PUNPKLO_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 2))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 2))),
             (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP  (PUNPKLO_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 3))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 3))),
             (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP  (PUNPKLO_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 4))),
             (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP  (PUNPKLO_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 5))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 5))),
             (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP  (PUNPKLO_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 6))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 6))),
             (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP  (PUNPKLO_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 7))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 7))),
             (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP  (PUNPKLO_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 8))),
             (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP  (PUNPKHI_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 9))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 9))),
             (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP  (PUNPKHI_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 10))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 10))),
             (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP  (PUNPKHI_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 11))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 11))),
             (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP  (PUNPKHI_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 12))),
             (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP  (PUNPKHI_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 13))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 13))),
             (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP  (PUNPKHI_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 14))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 14))),
             (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP  (PUNPKHI_PP PPR:$Ps))))>;
-  def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 15))),
+  def : Pat<(nxv1i1 (extract_subvector nxv16i1:$Ps, (i64 15))),
             (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP  (PUNPKHI_PP PPR:$Ps))))>;
 
   // Extract subvectors from FP SVE vectors
-  def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 0))),
+  def : Pat<(nxv2f16 (extract_subvector nxv4f16:$Zs, (i64 0))),
             (UUNPKLO_ZZ_D ZPR:$Zs)>;
-  def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 2))),
+  def : Pat<(nxv2f16 (extract_subvector nxv4f16:$Zs, (i64 2))),
             (UUNPKHI_ZZ_D ZPR:$Zs)>;
-  def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 0))),
+  def : Pat<(nxv4f16 (extract_subvector nxv8f16:$Zs, (i64 0))),
             (UUNPKLO_ZZ_S ZPR:$Zs)>;
-  def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 4))),
+  def : Pat<(nxv4f16 (extract_subvector nxv8f16:$Zs, (i64 4))),
             (UUNPKHI_ZZ_S ZPR:$Zs)>;
-  def : Pat<(nxv2f32 (extract_subvector (nxv4f32 ZPR:$Zs), (i64 0))),
+  def : Pat<(nxv2f32 (extract_subvector nxv4f32:$Zs, (i64 0))),
             (UUNPKLO_ZZ_D ZPR:$Zs)>;
-  def : Pat<(nxv2f32 (extract_subvector (nxv4f32 ZPR:$Zs), (i64 2))),
+  def : Pat<(nxv2f32 (extract_subvector nxv4f32:$Zs, (i64 2))),
             (UUNPKHI_ZZ_D ZPR:$Zs)>;
 
-  def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 0))),
+  def : Pat<(nxv2bf16 (extract_subvector nxv4bf16:$Zs, (i64 0))),
             (UUNPKLO_ZZ_D ZPR:$Zs)>;
-  def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 2))),
+  def : Pat<(nxv2bf16 (extract_subvector nxv4bf16:$Zs, (i64 2))),
             (UUNPKHI_ZZ_D ZPR:$Zs)>;
-  def : Pat<(nxv4bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 0))),
+  def : Pat<(nxv4bf16 (extract_subvector nxv8bf16:$Zs, (i64 0))),
             (UUNPKLO_ZZ_S ZPR:$Zs)>;
-  def : Pat<(nxv4bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 4))),
+  def : Pat<(nxv4bf16 (extract_subvector nxv8bf16:$Zs, (i64 4))),
             (UUNPKHI_ZZ_S ZPR:$Zs)>;
 
-  def : Pat<(nxv2f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 0))),
+  def : Pat<(nxv2f16 (extract_subvector nxv8f16:$Zs, (i64 0))),
             (UUNPKLO_ZZ_D (UUNPKLO_ZZ_S ZPR:$Zs))>;
-  def : Pat<(nxv2f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 2))),
+  def : Pat<(nxv2f16 (extract_subvector nxv8f16:$Zs, (i64 2))),
             (UUNPKHI_ZZ_D (UUNPKLO_ZZ_S ZPR:$Zs))>;
-  def : Pat<(nxv2f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 4))),
+  def : Pat<(nxv2f16 (extract_subvector nxv8f16:$Zs, (i64 4))),
             (UUNPKLO_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
-  def : Pat<(nxv2f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 6))),
+  def : Pat<(nxv2f16 (extract_subvector nxv8f16:$Zs, (i64 6))),
             (UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
 
-  def : Pat<(nxv2bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 0))),
+  def : Pat<(nxv2bf16 (extract_subvector nxv8bf16:$Zs, (i64 0))),
             (UUNPKLO_ZZ_D (UUNPKLO_ZZ_S ZPR:$Zs))>;
-  def : Pat<(nxv2bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 2))),
+  def : Pat<(nxv2bf16 (extract_subvector nxv8bf16:$Zs, (i64 2))),
             (UUNPKHI_ZZ_D (UUNPKLO_ZZ_S ZPR:$Zs))>;
-  def : Pat<(nxv2bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 4))),
+  def : Pat<(nxv2bf16 (extract_subvector nxv8bf16:$Zs, (i64 4))),
             (UUNPKLO_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
-  def : Pat<(nxv2bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 6))),
+  def : Pat<(nxv2bf16 (extract_subvector nxv8bf16:$Zs, (i64 6))),
             (UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
 
   // extract/insert 64-bit fixed length vector from/into a scalable vector
   foreach VT = [v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64, v4bf16] in {
-    def : Pat<(VT (vector_extract_subvec (SVEContainerVT<VT>.Value ZPR:$Zs), (i64 0))),
+    def : Pat<(VT (vector_extract_subvec SVEContainerVT<VT>.Value:$Zs, (i64 0))),
               (EXTRACT_SUBREG ZPR:$Zs, dsub)>;
     def : Pat<(SVEContainerVT<VT>.Value (vector_insert_subvec undef, (VT V64:$src), (i64 0))),
               (INSERT_SUBREG (IMPLICIT_DEF), $src, dsub)>;
@@ -1951,7 +1951,7 @@ let Predicates = [HasSVEorSME] in {
 
   // extract/insert 128-bit fixed length vector from/into a scalable vector
   foreach VT = [v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64, v8bf16] in {
-    def : Pat<(VT (vector_extract_subvec (SVEContainerVT<VT>.Value ZPR:$Zs), (i64 0))),
+    def : Pat<(VT (vector_extract_subvec SVEContainerVT<VT>.Value:$Zs, (i64 0))),
               (EXTRACT_SUBREG ZPR:$Zs, zsub)>;
     def : Pat<(SVEContainerVT<VT>.Value (vector_insert_subvec undef, (VT V128:$src), (i64 0))),
               (INSERT_SUBREG (IMPLICIT_DEF), $src, zsub)>;
@@ -1980,28 +1980,35 @@ let Predicates = [HasSVEorSME] in {
             (UZP1_ZZZ_H $v1, $v2)>;
 
   // Splice with lane equal to -1
-  def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 -1))),
+  def : Pat<(nxv16i8 (vector_splice nxv16i8:$Z1, nxv16i8:$Z2, (i64 -1))),
             (INSR_ZV_B ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
             (LASTB_VPZ_B (PTRUE_B 31), ZPR:$Z1), bsub))>;
-  def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 -1))),
+  def : Pat<(nxv8i16 (vector_splice nxv8i16:$Z1, nxv8i16:$Z2, (i64 -1))),
             (INSR_ZV_H ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
             (LASTB_VPZ_H (PTRUE_H 31), ZPR:$Z1), hsub))>;
-  def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 -1))),
+  def : Pat<(nxv4i32 (vector_splice nxv4i32:$Z1, nxv4i32:$Z2, (i64 -1))),
             (INSR_ZV_S ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
             (LASTB_VPZ_S (PTRUE_S 31), ZPR:$Z1), ssub))>;
-  def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 -1))),
+  def : Pat<(nxv2i64 (vector_splice nxv2i64:$Z1, nxv2i64:$Z2, (i64 -1))),
             (INSR_ZV_D ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
             (LASTB_VPZ_D (PTRUE_D 31), ZPR:$Z1), dsub))>;
 
   // Splice with lane bigger or equal to 0
-  def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 (sve_ext_imm_0_255 i32:$index)))),
-            (EXT_ZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
-  def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 (sve_ext_imm_0_127 i32:$index)))),
-            (EXT_ZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
-  def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 (sve_ext_imm_0_63 i32:$index)))),
-            (EXT_ZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
-  def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 (sve_ext_imm_0_31 i32:$index)))),
-            (EXT_ZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+  foreach VT = [nxv16i8] in
+    def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_255 i32:$index)))),
+              (EXT_ZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+
+  foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in
+    def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_127 i32:$index)))),
+              (EXT_ZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+
+  foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in
+    def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_63 i32:$index)))),
+              (EXT_ZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+
+  foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in
+    def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_31 i32:$index)))),
+              (EXT_ZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
 
   defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
   defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
@@ -2256,59 +2263,59 @@ let Predicates = [HasSVEorSME] in {
   defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111111, "fcvtzu", ZPR64, ZPR64, null_frag,                     AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
 
   //These patterns exist to improve the code quality of conversions on unpacked types.
-  def : Pat<(nxv2f32 (AArch64fcvte_mt (nxv2i1 (SVEAllActive):$Pg), (nxv2f16 ZPR:$Zs), (nxv2f32 ZPR:$Zd))),
+  def : Pat<(nxv2f32 (AArch64fcvte_mt (nxv2i1 (SVEAllActive:$Pg)), nxv2f16:$Zs, nxv2f32:$Zd)),
             (FCVT_ZPmZ_HtoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
 
   // FP_ROUND has an additional 'precise' flag which indicates the type of rounding.
   // This is ignored by the pattern below where it is matched by (i64 timm0_1)
-  def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 (SVEAllActive):$Pg), (nxv2f32 ZPR:$Zs), (i64 timm0_1), (nxv2f16 ZPR:$Zd))),
+  def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 (SVEAllActive:$Pg)), nxv2f32:$Zs, (i64 timm0_1), nxv2f16:$Zd)),
             (FCVT_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
 
   // Signed integer -> Floating-point 
   def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg),
-                      (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (nxv2f16 ZPR:$Zd))),
+                      (sext_inreg nxv2i64:$Zs, nxv2i16), nxv2f16:$Zd)),
             (SCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
 
   def : Pat<(nxv4f16 (AArch64scvtf_mt (nxv4i1 (SVEAllActive):$Pg),
-                      (sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (nxv4f16 ZPR:$Zd))),
+                      (sext_inreg nxv4i32:$Zs, nxv4i16), nxv4f16:$Zd)),
             (SCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
 
   def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg),
-                      (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f16 ZPR:$Zd))),
+                      (sext_inreg nxv2i64:$Zs, nxv2i32), nxv2f16:$Zd)),
             (SCVTF_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
 
   def : Pat<(nxv2f32 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg),
-                      (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f32 ZPR:$Zd))),
+                      (sext_inreg nxv2i64:$Zs, nxv2i32), nxv2f32:$Zd)),
             (SCVTF_ZPmZ_StoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
 
   def : Pat<(nxv2f64 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg),
-                      (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f64 ZPR:$Zd))),
+                      (sext_inreg nxv2i64:$Zs, nxv2i32), nxv2f64:$Zd)),
             (SCVTF_ZPmZ_StoD_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
 
   // Unsigned integer -> Floating-point
-  def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg),
-                      (and (nxv2i64 ZPR:$Zs),
-                       (nxv2i64 (splat_vector (i64 0xFFFF)))), (nxv2f16 ZPR:$Zd))),
+  def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive:$Pg)),
+                      (and nxv2i64:$Zs,
+                       (nxv2i64 (splat_vector (i64 0xFFFF)))), nxv2f16:$Zd)),
             (UCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
 
-  def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg),
-                      (and (nxv2i64 ZPR:$Zs),
-                       (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), (nxv2f16 ZPR:$Zd))),
+  def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive:$Pg)),
+                      (and nxv2i64:$Zs,
+                       (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), nxv2f16:$Zd)),
             (UCVTF_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
 
-  def : Pat<(nxv4f16 (AArch64ucvtf_mt (nxv4i1 (SVEAllActive):$Pg),
-                      (and (nxv4i32 ZPR:$Zs),
-                       (nxv4i32 (splat_vector (i32 0xFFFF)))), (nxv4f16 ZPR:$Zd))),
+  def : Pat<(nxv4f16 (AArch64ucvtf_mt (nxv4i1 (SVEAllActive:$Pg)),
+                      (and nxv4i32:$Zs,
+                       (nxv4i32 (splat_vector (i32 0xFFFF)))), nxv4f16:$Zd)),
             (UCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
 
-  def : Pat<(nxv2f32 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg),
-                      (and (nxv2i64 ZPR:$Zs),
-                       (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), (nxv2f32 ZPR:$Zd))),
+  def : Pat<(nxv2f32 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive:$Pg)),
+                      (and nxv2i64:$Zs,
+                       (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), nxv2f32:$Zd)),
             (UCVTF_ZPmZ_StoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
 
-  def : Pat<(nxv2f64 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg),
-                      (and (nxv2i64 ZPR:$Zs),
-                       (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), (nxv2f64 ZPR:$Zd))),
+  def : Pat<(nxv2f64 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive:$Pg)),
+                      (and nxv2i64:$Zs,
+                       (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), nxv2f64:$Zd)),
             (UCVTF_ZPmZ_StoD_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
 
   defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", AArch64frintn_mt>;
@@ -2503,12 +2510,12 @@ let Predicates = [HasSVEorSME] in {
   defm : ld1rq_pat<nxv4i32, AArch64ld1rq_z, LD1RQ_W, am_sve_regreg_lsl2>;
   defm : ld1rq_pat<nxv2i64, AArch64ld1rq_z, LD1RQ_D, am_sve_regreg_lsl3>;
 
-  def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_D_UNDEF (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
-  def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_D_UNDEF (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
-  def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8),  (SXTB_ZPmZ_D_UNDEF (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
-  def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (SXTH_ZPmZ_S_UNDEF (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
-  def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i8),  (SXTB_ZPmZ_S_UNDEF (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
-  def : Pat<(sext_inreg (nxv8i16 ZPR:$Zs), nxv8i8),  (SXTB_ZPmZ_H_UNDEF (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>;
+  def : Pat<(sext_inreg nxv2i64:$Zs, nxv2i32), (SXTW_ZPmZ_D_UNDEF (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
+  def : Pat<(sext_inreg nxv2i64:$Zs, nxv2i16), (SXTH_ZPmZ_D_UNDEF (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
+  def : Pat<(sext_inreg nxv2i64:$Zs, nxv2i8),  (SXTB_ZPmZ_D_UNDEF (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
+  def : Pat<(sext_inreg nxv4i32:$Zs, nxv4i16), (SXTH_ZPmZ_S_UNDEF (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
+  def : Pat<(sext_inreg nxv4i32:$Zs, nxv4i8),  (SXTB_ZPmZ_S_UNDEF (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
+  def : Pat<(sext_inreg nxv8i16:$Zs, nxv8i8),  (SXTB_ZPmZ_H_UNDEF (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>;
 
   // General case that we ideally never want to match.
   def : Pat<(vscale GPR64:$scale), (MADDXrrr (UBFMXri (RDVLI_XI 1), 4, 63), $scale, XZR)>;
@@ -2614,109 +2621,109 @@ let Predicates = [HasSVEorSME] in {
   // constraint that none of the bits change when stored to memory as one
   // type, and reloaded as another type.
   let Predicates = [IsLE] in {
-    def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
-    def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
-    def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
-    def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
-    def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
-    def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
-
-    def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>;
-    def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
-    def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
-    def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
-    def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
-    def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
-
-    def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>;
-    def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
-    def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
-    def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
-    def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>;
-    def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
-
-    def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>;
-    def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
-    def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
-    def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
-    def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
-    def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>;
-
-    def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>;
-    def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
-    def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
-    def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
-    def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
-    def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
-
-    def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>;
-    def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
-    def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>;
-    def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
-    def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
-    def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
-
-    def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>;
-    def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
-    def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
-    def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>;
-    def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
-    def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
-
-    def : Pat<(nxv8bf16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
-    def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
-    def : Pat<(nxv8bf16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
-    def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
-    def : Pat<(nxv8bf16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
-    def : Pat<(nxv8bf16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
-    def : Pat<(nxv8bf16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
-
-    def : Pat<(nxv16i8 (bitconvert (nxv8bf16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
-    def : Pat<(nxv8i16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
-    def : Pat<(nxv4i32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
-    def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
-    def : Pat<(nxv8f16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
-    def : Pat<(nxv4f32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
-    def : Pat<(nxv2f64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
-
-    def : Pat<(nxv16i1 (bitconvert (aarch64svcount PNR:$src))), (nxv16i1 PPR:$src)>;
-    def : Pat<(aarch64svcount (bitconvert (nxv16i1 PPR:$src))), (aarch64svcount PNR:$src)>;
+    def : Pat<(nxv16i8 (bitconvert nxv8i16:$src)), (nxv16i8 ZPR:$src)>;
+    def : Pat<(nxv16i8 (bitconvert nxv4i32:$src)), (nxv16i8 ZPR:$src)>;
+    def : Pat<(nxv16i8 (bitconvert nxv2i64:$src)), (nxv16i8 ZPR:$src)>;
+    def : Pat<(nxv16i8 (bitconvert nxv8f16:$src)), (nxv16i8 ZPR:$src)>;
+    def : Pat<(nxv16i8 (bitconvert nxv4f32:$src)), (nxv16i8 ZPR:$src)>;
+    def : Pat<(nxv16i8 (bitconvert nxv2f64:$src)), (nxv16i8 ZPR:$src)>;
+
+    def : Pat<(nxv8i16 (bitconvert nxv16i8:$src)), (nxv8i16 ZPR:$src)>;
+    def : Pat<(nxv8i16 (bitconvert nxv4i32:$src)), (nxv8i16 ZPR:$src)>;
+    def : Pat<(nxv8i16 (bitconvert nxv2i64:$src)), (nxv8i16 ZPR:$src)>;
+    def : Pat<(nxv8i16 (bitconvert nxv8f16:$src)), (nxv8i16 ZPR:$src)>;
+    def : Pat<(nxv8i16 (bitconvert nxv4f32:$src)), (nxv8i16 ZPR:$src)>;
+    def : Pat<(nxv8i16 (bitconvert nxv2f64:$src)), (nxv8i16 ZPR:$src)>;
+
+    def : Pat<(nxv4i32 (bitconvert nxv16i8:$src)), (nxv4i32 ZPR:$src)>;
+    def : Pat<(nxv4i32 (bitconvert nxv8i16:$src)), (nxv4i32 ZPR:$src)>;
+    def : Pat<(nxv4i32 (bitconvert nxv2i64:$src)), (nxv4i32 ZPR:$src)>;
+    def : Pat<(nxv4i32 (bitconvert nxv8f16:$src)), (nxv4i32 ZPR:$src)>;
+    def : Pat<(nxv4i32 (bitconvert nxv4f32:$src)), (nxv4i32 ZPR:$src)>;
+    def : Pat<(nxv4i32 (bitconvert nxv2f64:$src)), (nxv4i32 ZPR:$src)>;
+
+    def : Pat<(nxv2i64 (bitconvert nxv16i8:$src)), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv2i64 (bitconvert nxv8i16:$src)), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv2i64 (bitconvert nxv4i32:$src)), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv2i64 (bitconvert nxv8f16:$src)), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv2i64 (bitconvert nxv4f32:$src)), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv2i64 (bitconvert nxv2f64:$src)), (nxv2i64 ZPR:$src)>;
+
+    def : Pat<(nxv8f16 (bitconvert nxv16i8:$src)), (nxv8f16 ZPR:$src)>;
+    def : Pat<(nxv8f16 (bitconvert nxv8i16:$src)), (nxv8f16 ZPR:$src)>;
+    def : Pat<(nxv8f16 (bitconvert nxv4i32:$src)), (nxv8f16 ZPR:$src)>;
+    def : Pat<(nxv8f16 (bitconvert nxv2i64:$src)), (nxv8f16 ZPR:$src)>;
+    def : Pat<(nxv8f16 (bitconvert nxv4f32:$src)), (nxv8f16 ZPR:$src)>;
+    def : Pat<(nxv8f16 (bitconvert nxv2f64:$src)), (nxv8f16 ZPR:$src)>;
+
+    def : Pat<(nxv4f32 (bitconvert nxv16i8:$src)), (nxv4f32 ZPR:$src)>;
+    def : Pat<(nxv4f32 (bitconvert nxv8i16:$src)), (nxv4f32 ZPR:$src)>;
+    def : Pat<(nxv4f32 (bitconvert nxv4i32:$src)), (nxv4f32 ZPR:$src)>;
+    def : Pat<(nxv4f32 (bitconvert nxv2i64:$src)), (nxv4f32 ZPR:$src)>;
+    def : Pat<(nxv4f32 (bitconvert nxv8f16:$src)), (nxv4f32 ZPR:$src)>;
+    def : Pat<(nxv4f32 (bitconvert nxv2f64:$src)), (nxv4f32 ZPR:$src)>;
+
+    def : Pat<(nxv2f64 (bitconvert nxv16i8:$src)), (nxv2f64 ZPR:$src)>;
+    def : Pat<(nxv2f64 (bitconvert nxv8i16:$src)), (nxv2f64 ZPR:$src)>;
+    def : Pat<(nxv2f64 (bitconvert nxv4i32:$src)), (nxv2f64 ZPR:$src)>;
+    def : Pat<(nxv2f64 (bitconvert nxv2i64:$src)), (nxv2f64 ZPR:$src)>;
+    def : Pat<(nxv2f64 (bitconvert nxv8f16:$src)), (nxv2f64 ZPR:$src)>;
+    def : Pat<(nxv2f64 (bitconvert nxv4f32:$src)), (nxv2f64 ZPR:$src)>;
+
+    def : Pat<(nxv8bf16 (bitconvert nxv16i8:$src)), (nxv8bf16 ZPR:$src)>;
+    def : Pat<(nxv8bf16 (bitconvert nxv8i16:$src)), (nxv8bf16 ZPR:$src)>;
+    def : Pat<(nxv8bf16 (bitconvert nxv4i32:$src)), (nxv8bf16 ZPR:$src)>;
+    def : Pat<(nxv8bf16 (bitconvert nxv2i64:$src)), (nxv8bf16 ZPR:$src)>;
+    def : Pat<(nxv8bf16 (bitconvert nxv8f16:$src)), (nxv8bf16 ZPR:$src)>;
+    def : Pat<(nxv8bf16 (bitconvert nxv4f32:$src)), (nxv8bf16 ZPR:$src)>;
+    def : Pat<(nxv8bf16 (bitconvert nxv2f64:$src)), (nxv8bf16 ZPR:$src)>;
+
+    def : Pat<(nxv16i8 (bitconvert nxv8bf16:$src)), (nxv16i8 ZPR:$src)>;
+    def : Pat<(nxv8i16 (bitconvert nxv8bf16:$src)), (nxv8i16 ZPR:$src)>;
+    def : Pat<(nxv4i32 (bitconvert nxv8bf16:$src)), (nxv4i32 ZPR:$src)>;
+    def : Pat<(nxv2i64 (bitconvert nxv8bf16:$src)), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv8f16 (bitconvert nxv8bf16:$src)), (nxv8f16 ZPR:$src)>;
+    def : Pat<(nxv4f32 (bitconvert nxv8bf16:$src)), (nxv4f32 ZPR:$src)>;
+    def : Pat<(nxv2f64 (bitconvert nxv8bf16:$src)), (nxv2f64 ZPR:$src)>;
+
+    def : Pat<(nxv16i1 (bitconvert aarch64svcount:$src)), (nxv16i1 PPR:$src)>;
+    def : Pat<(aarch64svcount (bitconvert nxv16i1:$src)), (aarch64svcount PNR:$src)>;
   }
 
   // These allow casting from/to unpacked predicate types.
-  def : Pat<(nxv16i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv16i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv16i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv16i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv16i1 (reinterpret_cast (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv8i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv8i1 (reinterpret_cast  (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv8i1 (reinterpret_cast  (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv8i1 (reinterpret_cast  (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv4i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv4i1 (reinterpret_cast  (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv4i1 (reinterpret_cast  (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv4i1 (reinterpret_cast  (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv2i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv2i1 (reinterpret_cast  (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv2i1 (reinterpret_cast  (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv2i1 (reinterpret_cast  (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv1i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv1i1 (reinterpret_cast  (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv1i1 (reinterpret_cast  (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
-  def : Pat<(nxv1i1 (reinterpret_cast  (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv16i1 (reinterpret_cast nxv16i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv16i1 (reinterpret_cast nxv8i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv16i1 (reinterpret_cast nxv4i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv16i1 (reinterpret_cast nxv2i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv16i1 (reinterpret_cast nxv1i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv8i1 (reinterpret_cast nxv16i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv8i1 (reinterpret_cast  nxv4i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv8i1 (reinterpret_cast  nxv2i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv8i1 (reinterpret_cast  nxv1i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv4i1 (reinterpret_cast nxv16i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv4i1 (reinterpret_cast  nxv8i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv4i1 (reinterpret_cast  nxv2i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv4i1 (reinterpret_cast  nxv1i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv2i1 (reinterpret_cast nxv16i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv2i1 (reinterpret_cast  nxv8i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv2i1 (reinterpret_cast  nxv4i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv2i1 (reinterpret_cast  nxv1i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv1i1 (reinterpret_cast nxv16i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv1i1 (reinterpret_cast  nxv8i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv1i1 (reinterpret_cast  nxv4i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+  def : Pat<(nxv1i1 (reinterpret_cast  nxv2i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>;
 
   // These allow casting from/to unpacked floating-point types.
-  def : Pat<(nxv2f16 (reinterpret_cast (nxv8f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
-  def : Pat<(nxv8f16 (reinterpret_cast (nxv2f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
-  def : Pat<(nxv4f16 (reinterpret_cast (nxv8f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
-  def : Pat<(nxv8f16 (reinterpret_cast (nxv4f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
-  def : Pat<(nxv2f32 (reinterpret_cast (nxv4f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
-  def : Pat<(nxv4f32 (reinterpret_cast (nxv2f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
-  def : Pat<(nxv2bf16 (reinterpret_cast (nxv8bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
-  def : Pat<(nxv8bf16 (reinterpret_cast (nxv2bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
-  def : Pat<(nxv4bf16 (reinterpret_cast (nxv8bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
-  def : Pat<(nxv8bf16 (reinterpret_cast (nxv4bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  def : Pat<(nxv2f16 (reinterpret_cast nxv8f16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  def : Pat<(nxv8f16 (reinterpret_cast nxv2f16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  def : Pat<(nxv4f16 (reinterpret_cast nxv8f16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  def : Pat<(nxv8f16 (reinterpret_cast nxv4f16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  def : Pat<(nxv2f32 (reinterpret_cast nxv4f32:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  def : Pat<(nxv4f32 (reinterpret_cast nxv2f32:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  def : Pat<(nxv2bf16 (reinterpret_cast nxv8bf16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  def : Pat<(nxv8bf16 (reinterpret_cast nxv2bf16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  def : Pat<(nxv4bf16 (reinterpret_cast nxv8bf16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  def : Pat<(nxv8bf16 (reinterpret_cast nxv4bf16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
 
   def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)),
             (AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>;
@@ -2781,14 +2788,14 @@ let Predicates = [HasSVEorSME] in {
   multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store,
                         Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
     let AddedComplexity = 1 in {
-      def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)),
+      def _reg_reg : Pat<(Store Ty:$vec, (AddrCP GPR64:$base, GPR64:$offset), PredTy:$gp),
                          (RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>;
     }
     let AddedComplexity = 2 in {
-      def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)),
+      def _reg_imm : Pat<(Store Ty:$vec, (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), PredTy:$gp),
                          (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>;
     }
-    def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)),
+    def _default : Pat<(Store Ty:$vec, GPR64:$base, PredTy:$gp),
                        (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
   }
 
@@ -2833,15 +2840,15 @@ let Predicates = [HasSVEorSME] in {
                           Instruction RegImmInst, Instruction PTrue,
                           ComplexPattern AddrCP> {
     let AddedComplexity = 1 in {
-      def _reg : Pat<(Store (Ty ZPR:$val), (AddrCP GPR64sp:$base, GPR64:$offset)),
+      def _reg : Pat<(Store Ty:$val, (AddrCP GPR64sp:$base, GPR64:$offset)),
                      (RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
     }
     let AddedComplexity = 2 in {
-      def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
+      def _imm : Pat<(Store Ty:$val, (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
                      (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
     }
 
-    def : Pat<(Store (Ty ZPR:$val), GPR64:$base),
+    def : Pat<(Store Ty:$val, GPR64:$base),
               (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
   }
 
@@ -2920,7 +2927,7 @@ let Predicates = [HasSVEorSME] in {
     let Predicates = [IsLE] in {
       def : Pat<(Ty (load (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset))),
                 (LD1B (PTRUE_B 31), GPR64sp:$base, GPR64:$offset)>;
-      def : Pat<(store (Ty ZPR:$val), (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset)),
+      def : Pat<(store Ty:$val, (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset)),
                 (ST1B ZPR:$val, (PTRUE_B 31), GPR64sp:$base, GPR64:$offset)>;
     }
   }
@@ -3088,18 +3095,18 @@ let Predicates = [HasSVEorSME] in {
                  SDPatternOperator Store, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
     // reg + reg
     let AddedComplexity = 1 in {
-      def : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), MemVT),
+      def : Pat<(Store Ty:$vec, (AddrCP GPR64:$base, GPR64:$offset), PredTy:$gp, MemVT),
                 (RegRegInst ZPR:$vec, PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
     }
 
     // scalar + immediate (mul vl)
     let AddedComplexity = 2 in {
-      def : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), MemVT),
+      def : Pat<(Store Ty:$vec, (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), PredTy:$gp, MemVT),
                 (RegImmInst ZPR:$vec, PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
     }
 
     // base
-    def : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp), MemVT),
+    def : Pat<(Store Ty:$vec, GPR64:$base, (PredTy PPR:$gp), MemVT),
               (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
   }
 
@@ -3151,44 +3158,44 @@ let Predicates = [HasSVEorSME] in {
             (INSERT_SUBREG (nxv2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
 
   // Insert scalar into vector[0]
-  def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), (i32 GPR32:$src), 0)),
+  def : Pat<(nxv16i8 (vector_insert nxv16i8:$vec, (i32 GPR32:$src), 0)),
             (CPY_ZPmR_B ZPR:$vec, (PTRUE_B 1), GPR32:$src)>;
-  def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), (i32 GPR32:$src), 0)),
+  def : Pat<(nxv8i16 (vector_insert nxv8i16:$vec, (i32 GPR32:$src), 0)),
             (CPY_ZPmR_H ZPR:$vec, (PTRUE_H 1), GPR32:$src)>;
-  def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), (i32 GPR32:$src), 0)),
+  def : Pat<(nxv4i32 (vector_insert nxv4i32:$vec, (i32 GPR32:$src), 0)),
             (CPY_ZPmR_S ZPR:$vec, (PTRUE_S 1), GPR32:$src)>;
-  def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), (i64 GPR64:$src), 0)),
+  def : Pat<(nxv2i64 (vector_insert nxv2i64:$vec, (i64 GPR64:$src), 0)),
             (CPY_ZPmR_D ZPR:$vec, (PTRUE_D 1), GPR64:$src)>;
 
-  def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), 0)),
+  def : Pat<(nxv8f16 (vector_insert nxv8f16:$vec, (f16 FPR16:$src), 0)),
             (SEL_ZPZZ_H (PTRUE_H 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), ZPR:$vec)>;
-  def : Pat<(nxv8bf16 (vector_insert (nxv8bf16 ZPR:$vec), (bf16 FPR16:$src), 0)),
+  def : Pat<(nxv8bf16 (vector_insert nxv8bf16:$vec, (bf16 FPR16:$src), 0)),
             (SEL_ZPZZ_H (PTRUE_H 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), ZPR:$vec)>;
-  def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), 0)),
+  def : Pat<(nxv4f32 (vector_insert nxv4f32:$vec, (f32 FPR32:$src), 0)),
             (SEL_ZPZZ_S (PTRUE_S 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), ZPR:$vec)>;
-  def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), 0)),
+  def : Pat<(nxv2f64 (vector_insert nxv2f64:$vec, (f64 FPR64:$src), 0)),
             (SEL_ZPZZ_D (PTRUE_D 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), ZPR:$vec)>;
 
   // Insert scalar into vector with scalar index
-  def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), GPR32:$src, GPR64:$index)),
+  def : Pat<(nxv16i8 (vector_insert nxv16i8:$vec, GPR32:$src, GPR64:$index)),
             (CPY_ZPmR_B ZPR:$vec,
                         (CMPEQ_PPzZZ_B (PTRUE_B 31),
                                        (INDEX_II_B 0, 1),
                                        (DUP_ZR_B (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                         GPR32:$src)>;
-  def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), GPR32:$src, GPR64:$index)),
+  def : Pat<(nxv8i16 (vector_insert nxv8i16:$vec, GPR32:$src, GPR64:$index)),
             (CPY_ZPmR_H ZPR:$vec,
                         (CMPEQ_PPzZZ_H (PTRUE_H 31),
                                        (INDEX_II_H 0, 1),
                                        (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                         GPR32:$src)>;
-  def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), GPR32:$src, GPR64:$index)),
+  def : Pat<(nxv4i32 (vector_insert nxv4i32:$vec, GPR32:$src, GPR64:$index)),
             (CPY_ZPmR_S ZPR:$vec,
                         (CMPEQ_PPzZZ_S (PTRUE_S 31),
                                        (INDEX_II_S 0, 1),
                                        (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                         GPR32:$src)>;
-  def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), GPR64:$src, GPR64:$index)),
+  def : Pat<(nxv2i64 (vector_insert nxv2i64:$vec, GPR64:$src, GPR64:$index)),
             (CPY_ZPmR_D ZPR:$vec,
                         (CMPEQ_PPzZZ_D (PTRUE_D 31),
                                        (INDEX_II_D 0, 1),
@@ -3196,55 +3203,55 @@ let Predicates = [HasSVEorSME] in {
                         GPR64:$src)>;
 
   // Insert FP scalar into vector with scalar index
-  def : Pat<(nxv2f16 (vector_insert (nxv2f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)),
+  def : Pat<(nxv2f16 (vector_insert nxv2f16:$vec, (f16 FPR16:$src), GPR64:$index)),
             (CPY_ZPmV_H ZPR:$vec,
                         (CMPEQ_PPzZZ_D (PTRUE_D 31),
                                        (INDEX_II_D 0, 1),
                                        (DUP_ZR_D GPR64:$index)),
                         $src)>;
-  def : Pat<(nxv4f16 (vector_insert (nxv4f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)),
+  def : Pat<(nxv4f16 (vector_insert nxv4f16:$vec, (f16 FPR16:$src), GPR64:$index)),
             (CPY_ZPmV_H ZPR:$vec,
                         (CMPEQ_PPzZZ_S (PTRUE_S 31),
                                        (INDEX_II_S 0, 1),
                                        (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                         $src)>;
-  def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)),
+  def : Pat<(nxv8f16 (vector_insert nxv8f16:$vec, (f16 FPR16:$src), GPR64:$index)),
             (CPY_ZPmV_H ZPR:$vec,
                         (CMPEQ_PPzZZ_H (PTRUE_H 31),
                                        (INDEX_II_H 0, 1),
                                        (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                         $src)>;
-  def : Pat<(nxv2bf16 (vector_insert (nxv2bf16 ZPR:$vec), (bf16 FPR16:$src), GPR64:$index)),
+  def : Pat<(nxv2bf16 (vector_insert nxv2bf16:$vec, (bf16 FPR16:$src), GPR64:$index)),
             (CPY_ZPmV_H ZPR:$vec,
                         (CMPEQ_PPzZZ_D (PTRUE_D 31),
                                        (INDEX_II_D 0, 1),
                                        (DUP_ZR_D GPR64:$index)),
                         $src)>;
-  def : Pat<(nxv4bf16 (vector_insert (nxv4bf16 ZPR:$vec), (bf16 FPR16:$src), GPR64:$index)),
+  def : Pat<(nxv4bf16 (vector_insert nxv4bf16:$vec, (bf16 FPR16:$src), GPR64:$index)),
             (CPY_ZPmV_H ZPR:$vec,
                         (CMPEQ_PPzZZ_S (PTRUE_S 31),
                                        (INDEX_II_S 0, 1),
                                        (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                         $src)>;
-  def : Pat<(nxv8bf16 (vector_insert (nxv8bf16 ZPR:$vec), (bf16 FPR16:$src), GPR64:$index)),
+  def : Pat<(nxv8bf16 (vector_insert nxv8bf16:$vec, (bf16 FPR16:$src), GPR64:$index)),
             (CPY_ZPmV_H ZPR:$vec,
                         (CMPEQ_PPzZZ_H (PTRUE_H 31),
                                        (INDEX_II_H 0, 1),
                                        (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                         $src)>;
-  def : Pat<(nxv2f32 (vector_insert (nxv2f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)),
+  def : Pat<(nxv2f32 (vector_insert nxv2f32:$vec, (f32 FPR32:$src), GPR64:$index)),
             (CPY_ZPmV_S ZPR:$vec,
                         (CMPEQ_PPzZZ_D (PTRUE_D 31),
                                        (INDEX_II_D 0, 1),
                                        (DUP_ZR_D GPR64:$index)),
                         $src) >;
-  def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)),
+  def : Pat<(nxv4f32 (vector_insert nxv4f32:$vec, (f32 FPR32:$src), GPR64:$index)),
             (CPY_ZPmV_S ZPR:$vec,
                         (CMPEQ_PPzZZ_S (PTRUE_S 31),
                                        (INDEX_II_S 0, 1),
                                        (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
                         $src)>;
-  def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), GPR64:$index)),
+  def : Pat<(nxv2f64 (vector_insert nxv2f64:$vec, (f64 FPR64:$src), GPR64:$index)),
             (CPY_ZPmV_D ZPR:$vec,
                         (CMPEQ_PPzZZ_D (PTRUE_D 31),
                                        (INDEX_II_D 0, 1),
@@ -3252,139 +3259,139 @@ let Predicates = [HasSVEorSME] in {
                         $src)>;
 
   // Extract element from vector with scalar index
-  def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
+  def : Pat<(i32 (vector_extract nxv16i8:$vec, GPR64:$index)),
             (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index), ZPR:$vec)>;
-  def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
+  def : Pat<(i32 (vector_extract nxv8i16:$vec, GPR64:$index)),
             (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>;
-  def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
+  def : Pat<(i32 (vector_extract nxv4i32:$vec, GPR64:$index)),
             (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>;
-  def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
+  def : Pat<(i64 (vector_extract nxv2i64:$vec, GPR64:$index)),
             (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
-  def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
+  def : Pat<(f16 (vector_extract nxv8f16:$vec, GPR64:$index)),
             (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>;
-  def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), GPR64:$index)),
+  def : Pat<(f16 (vector_extract nxv4f16:$vec, GPR64:$index)),
             (LASTB_VPZ_H (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>;
-  def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), GPR64:$index)),
+  def : Pat<(f16 (vector_extract nxv2f16:$vec, GPR64:$index)),
             (LASTB_VPZ_H (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
-  def : Pat<(bf16 (vector_extract (nxv8bf16 ZPR:$vec), GPR64:$index)),
+  def : Pat<(bf16 (vector_extract nxv8bf16:$vec, GPR64:$index)),
             (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>;
-  def : Pat<(bf16 (vector_extract (nxv4bf16 ZPR:$vec), GPR64:$index)),
+  def : Pat<(bf16 (vector_extract nxv4bf16:$vec, GPR64:$index)),
             (LASTB_VPZ_H (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>;
-  def : Pat<(bf16 (vector_extract (nxv2bf16 ZPR:$vec), GPR64:$index)),
+  def : Pat<(bf16 (vector_extract nxv2bf16:$vec, GPR64:$index)),
             (LASTB_VPZ_H (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
-  def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
+  def : Pat<(f32 (vector_extract nxv4f32:$vec, GPR64:$index)),
             (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>;
-  def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), GPR64:$index)),
+  def : Pat<(f32 (vector_extract nxv2f32:$vec, GPR64:$index)),
             (LASTB_VPZ_S (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
-  def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
+  def : Pat<(f64 (vector_extract nxv2f64:$vec, GPR64:$index)),
             (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
 
   // Extract element from vector with immediate index
-  def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)),
+  def : Pat<(i32 (vector_extract nxv16i8:$vec, sve_elm_idx_extdup_b:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>;
-  def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+  def : Pat<(i32 (vector_extract nxv8i16:$vec, sve_elm_idx_extdup_h:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), ssub)>;
-  def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+  def : Pat<(i32 (vector_extract nxv4i32:$vec, sve_elm_idx_extdup_s:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
-  def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+  def : Pat<(i64 (vector_extract nxv2i64:$vec, sve_elm_idx_extdup_d:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
-  def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+  def : Pat<(f16 (vector_extract nxv8f16:$vec, sve_elm_idx_extdup_h:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
-  def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+  def : Pat<(f16 (vector_extract nxv4f16:$vec, sve_elm_idx_extdup_s:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), hsub)>;
-  def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+  def : Pat<(f16 (vector_extract nxv2f16:$vec, sve_elm_idx_extdup_d:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), hsub)>;
-  def : Pat<(bf16 (vector_extract (nxv8bf16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+  def : Pat<(bf16 (vector_extract nxv8bf16:$vec, sve_elm_idx_extdup_h:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
-  def : Pat<(bf16 (vector_extract (nxv4bf16 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+  def : Pat<(bf16 (vector_extract nxv4bf16:$vec, sve_elm_idx_extdup_s:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), hsub)>;
-  def : Pat<(bf16 (vector_extract (nxv2bf16 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+  def : Pat<(bf16 (vector_extract nxv2bf16:$vec, sve_elm_idx_extdup_d:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), hsub)>;
-  def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+  def : Pat<(f32 (vector_extract nxv4f32:$vec, sve_elm_idx_extdup_s:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
-  def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+  def : Pat<(f32 (vector_extract nxv2f32:$vec, sve_elm_idx_extdup_d:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), ssub)>;
-  def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+  def : Pat<(f64 (vector_extract nxv2f64:$vec, sve_elm_idx_extdup_d:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
 
   // Extract element from vector with immediate index that's within the bottom 128-bits.
   let Predicates = [IsNeonAvailable], AddedComplexity = 1 in {
-  def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)),
+  def : Pat<(i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index)),
             (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
-  def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)),
+  def : Pat<(i32 (vector_extract nxv8i16:$vec, VectorIndexH:$index)),
             (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)>;
-  def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)),
+  def : Pat<(i32 (vector_extract nxv4i32:$vec, VectorIndexS:$index)),
             (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
-  def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), VectorIndexD:$index)),
+  def : Pat<(i64 (vector_extract nxv2i64:$vec, VectorIndexD:$index)),
             (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index)>;
   } // End IsNeonAvailable
 
   let Predicates = [IsNeonAvailable] in {
-  def : Pat<(sext_inreg (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index), i8),
+  def : Pat<(sext_inreg (vector_extract nxv16i8:$vec, VectorIndexB:$index), i8),
             (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
-  def : Pat<(sext_inreg (anyext (i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index))), i8),
+  def : Pat<(sext_inreg (anyext (i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index))), i8),
             (SMOVvi8to64 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
 
-  def : Pat<(sext_inreg (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index), i16),
+  def : Pat<(sext_inreg (vector_extract nxv8i16:$vec, VectorIndexH:$index), i16),
             (SMOVvi16to32 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)>;
-  def : Pat<(sext_inreg (anyext (i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index))), i16),
+  def : Pat<(sext_inreg (anyext (i32 (vector_extract nxv8i16:$vec, VectorIndexH:$index))), i16),
             (SMOVvi16to64 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)>;
 
-  def : Pat<(sext (i32 (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index))),
+  def : Pat<(sext (i32 (vector_extract nxv4i32:$vec, VectorIndexS:$index))),
             (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
   } // End IsNeonAvailable
 
   // Extract first element from vector.
   let AddedComplexity = 2 in {
-  def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$Zs), (i64 0))),
+  def : Pat<(i32 (vector_extract nxv16i8:$Zs, (i64 0))),
             (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
-  def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$Zs), (i64 0))),
+  def : Pat<(i32 (vector_extract nxv8i16:$Zs, (i64 0))),
             (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
-  def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$Zs), (i64 0))),
+  def : Pat<(i32 (vector_extract nxv4i32:$Zs, (i64 0))),
             (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
-  def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$Zs), (i64 0))),
+  def : Pat<(i64 (vector_extract nxv2i64:$Zs, (i64 0))),
             (EXTRACT_SUBREG ZPR:$Zs, dsub)>;
-  def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$Zs), (i64 0))),
+  def : Pat<(f16 (vector_extract nxv8f16:$Zs, (i64 0))),
             (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
-  def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$Zs), (i64 0))),
+  def : Pat<(f16 (vector_extract nxv4f16:$Zs, (i64 0))),
             (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
-  def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$Zs), (i64 0))),
+  def : Pat<(f16 (vector_extract nxv2f16:$Zs, (i64 0))),
             (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
-  def : Pat<(bf16 (vector_extract (nxv8bf16 ZPR:$Zs), (i64 0))),
+  def : Pat<(bf16 (vector_extract nxv8bf16:$Zs, (i64 0))),
             (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
-  def : Pat<(bf16 (vector_extract (nxv4bf16 ZPR:$Zs), (i64 0))),
+  def : Pat<(bf16 (vector_extract nxv4bf16:$Zs, (i64 0))),
             (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
-  def : Pat<(bf16 (vector_extract (nxv2bf16 ZPR:$Zs), (i64 0))),
+  def : Pat<(bf16 (vector_extract nxv2bf16:$Zs, (i64 0))),
             (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
-  def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$Zs), (i64 0))),
+  def : Pat<(f32 (vector_extract nxv4f32:$Zs, (i64 0))),
             (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
-  def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$Zs), (i64 0))),
+  def : Pat<(f32 (vector_extract nxv2f32:$Zs, (i64 0))),
             (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
-  def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$Zs), (i64 0))),
+  def : Pat<(f64 (vector_extract nxv2f64:$Zs, (i64 0))),
             (EXTRACT_SUBREG ZPR:$Zs, dsub)>;
   }
 
   multiclass sve_predicated_add<SDNode extend, int value> {
-    def : Pat<(nxv16i8 (add ZPR:$op, (extend (nxv16i1 PPR:$pred)))),
+    def : Pat<(nxv16i8 (add ZPR:$op, (extend nxv16i1:$pred))),
               (ADD_ZPmZ_B PPR:$pred, ZPR:$op, (DUP_ZI_B value, 0))>;
-    def : Pat<(nxv8i16 (add ZPR:$op, (extend (nxv8i1 PPR:$pred)))),
+    def : Pat<(nxv8i16 (add ZPR:$op, (extend nxv8i1:$pred))),
               (ADD_ZPmZ_H PPR:$pred, ZPR:$op, (DUP_ZI_H value, 0))>;
-    def : Pat<(nxv4i32 (add ZPR:$op, (extend (nxv4i1 PPR:$pred)))),
+    def : Pat<(nxv4i32 (add ZPR:$op, (extend nxv4i1:$pred))),
               (ADD_ZPmZ_S PPR:$pred, ZPR:$op, (DUP_ZI_S value, 0))>;
-    def : Pat<(nxv2i64 (add ZPR:$op, (extend (nxv2i1 PPR:$pred)))),
+    def : Pat<(nxv2i64 (add ZPR:$op, (extend nxv2i1:$pred))),
               (ADD_ZPmZ_D PPR:$pred, ZPR:$op, (DUP_ZI_D value, 0))>;
   }
 
   defm : sve_predicated_add<zext, 1>;
   defm : sve_predicated_add<sext, 255>;
 
-  def : Pat<(nxv16i8 (sub ZPR:$op, (sext (nxv16i1 PPR:$pred)))),
+  def : Pat<(nxv16i8 (sub ZPR:$op, (sext nxv16i1:$pred))),
             (SUB_ZPmZ_B PPR:$pred, ZPR:$op, (DUP_ZI_B 255, 0))>;
-  def : Pat<(nxv8i16 (sub ZPR:$op, (sext (nxv8i1 PPR:$pred)))),
+  def : Pat<(nxv8i16 (sub ZPR:$op, (sext nxv8i1:$pred))),
             (SUB_ZPmZ_H PPR:$pred, ZPR:$op, (DUP_ZI_H 255, 0))>;
-  def : Pat<(nxv4i32 (sub ZPR:$op, (sext (nxv4i1 PPR:$pred)))),
+  def : Pat<(nxv4i32 (sub ZPR:$op, (sext nxv4i1:$pred))),
             (SUB_ZPmZ_S PPR:$pred, ZPR:$op, (DUP_ZI_S 255, 0))>;
-  def : Pat<(nxv2i64 (sub ZPR:$op, (sext (nxv2i1 PPR:$pred)))),
+  def : Pat<(nxv2i64 (sub ZPR:$op, (sext nxv2i1:$pred))),
             (SUB_ZPmZ_D PPR:$pred, ZPR:$op, (DUP_ZI_D 255, 0))>;
 } // End HasSVEorSME
 
@@ -3988,8 +3995,7 @@ defm STNT1D_4Z_IMM : sve2p1_mem_cst_si_4z<"stnt1d", 0b11, 0b1, ZZZZ_d_mul_r>;
 
 multiclass store_pn_x2<ValueType Ty, SDPatternOperator Store,
                         Instruction RegImmInst> {
-  def : Pat<(Store (Ty ZPR:$vec0), (Ty ZPR:$vec1),
-                   (aarch64svcount PNR:$PNg), GPR64:$base),
+  def : Pat<(Store Ty:$vec0, Ty:$vec1, aarch64svcount:$PNg, GPR64:$base),
             (RegImmInst (REG_SEQUENCE ZPR2Mul2, Ty:$vec0, zsub0, Ty:$vec1, zsub1),
                          PNR:$PNg, GPR64:$base, (i64 0))>;
 }
@@ -4014,8 +4020,7 @@ defm : store_pn_x2<nxv2f64, int_aarch64_sve_stnt1_pn_x2, STNT1D_2Z_IMM>;
 
 multiclass store_pn_x4<ValueType Ty, SDPatternOperator Store,
                         Instruction RegImmInst> {
-  def : Pat<(Store (Ty ZPR:$vec0), (Ty ZPR:$vec1), (Ty ZPR:$vec2), (Ty ZPR:$vec3),
-                   (aarch64svcount PNR:$PNg), GPR64:$base),
+  def : Pat<(Store Ty:$vec0, Ty:$vec1, Ty:$vec2, Ty:$vec3, aarch64svcount:$PNg, GPR64:$base),
             (RegImmInst (REG_SEQUENCE ZPR4Mul4, Ty:$vec0, zsub0, Ty:$vec1, zsub1,
                                                 Ty:$vec2, zsub2, Ty:$vec3, zsub3),
                         PNR:$PNg, GPR64:$base, (i64 0))>;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 7ef78cbba352..df802cf42526 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -275,15 +275,15 @@ static std::string computeDataLayout(const Triple &TT,
                                      bool LittleEndian) {
   if (TT.isOSBinFormatMachO()) {
     if (TT.getArch() == Triple::aarch64_32)
-      return "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128";
-    return "e-m:o-i64:64-i128:128-n32:64-S128";
+      return "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128-Fn32";
+    return "e-m:o-i64:64-i128:128-n32:64-S128-Fn32";
   }
   if (TT.isOSBinFormatCOFF())
-    return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128";
+    return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32";
   std::string Endian = LittleEndian ? "e" : "E";
   std::string Ptr32 = TT.getEnvironment() == Triple::GNUILP32 ? "-p:32:32" : "";
   return Endian + "-m:e" + Ptr32 +
-         "-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
+         "-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32";
 }
 
 static StringRef computeDefaultCPU(const Triple &TT, StringRef CPU) {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index af3a94a0faec..f49c73dc7951 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3968,8 +3968,8 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
   if (LT.second.isFixedLengthVector() &&
       LT.second.getVectorNumElements() == Mask.size() &&
       (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
-      (isZIPMask(Mask, LT.second, Unused) ||
-       isUZPMask(Mask, LT.second, Unused) ||
+      (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
+       isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
        // Check for non-zero lane splats
        all_of(drop_begin(Mask),
               [&Mask](int M) { return M < 0 || M == Mask[0]; })))
@@ -4183,7 +4183,7 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
 
 InstructionCost
 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                                     int64_t BaseOffset, bool HasBaseReg,
+                                     StackOffset BaseOffset, bool HasBaseReg,
                                      int64_t Scale, unsigned AddrSpace) const {
   // Scaling factors are not free at all.
   // Operands                     | Rt Latency
@@ -4194,9 +4194,10 @@ AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
   // Rt, [Xn, Wm, <extend> #imm]  |
   TargetLoweringBase::AddrMode AM;
   AM.BaseGV = BaseGV;
-  AM.BaseOffs = BaseOffset;
+  AM.BaseOffs = BaseOffset.getFixed();
   AM.HasBaseReg = HasBaseReg;
   AM.Scale = Scale;
+  AM.ScalableOffset = BaseOffset.getScalable();
   if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
     // Scale represents reg2 * scale, thus account for 1 if
     // it is not equal to 0 or 1.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 678c132e6a80..2f44aaa3e26a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -407,7 +407,7 @@ public:
   /// If the AM is supported, the return value must be >= 0.
   /// If the AM is not supported, it returns a negative value.
   InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                                       int64_t BaseOffset, bool HasBaseReg,
+                                       StackOffset BaseOffset, bool HasBaseReg,
                                        int64_t Scale, unsigned AddrSpace) const;
   /// @}
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 61f5bc2464ee..1b65ae7b4782 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -221,7 +221,6 @@ private:
   bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                       MachineRegisterInfo &MRI);
   bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
-  bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
@@ -3403,7 +3402,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
   }
   case TargetOpcode::G_ICMP: {
     if (Ty.isVector())
-      return selectVectorICmp(I, MRI);
+      return false;
 
     if (Ty != LLT::scalar(32)) {
       LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
@@ -3652,177 +3651,6 @@ bool AArch64InstructionSelector::selectTLSGlobalValue(
   return true;
 }
 
-bool AArch64InstructionSelector::selectVectorICmp(
-    MachineInstr &I, MachineRegisterInfo &MRI) {
-  Register DstReg = I.getOperand(0).getReg();
-  LLT DstTy = MRI.getType(DstReg);
-  Register SrcReg = I.getOperand(2).getReg();
-  Register Src2Reg = I.getOperand(3).getReg();
-  LLT SrcTy = MRI.getType(SrcReg);
-
-  unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
-  unsigned NumElts = DstTy.getNumElements();
-
-  // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
-  // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
-  // Third index is cc opcode:
-  // 0 == eq
-  // 1 == ugt
-  // 2 == uge
-  // 3 == ult
-  // 4 == ule
-  // 5 == sgt
-  // 6 == sge
-  // 7 == slt
-  // 8 == sle
-  // ne is done by negating 'eq' result.
-
-  // This table below assumes that for some comparisons the operands will be
-  // commuted.
-  // ult op == commute + ugt op
-  // ule op == commute + uge op
-  // slt op == commute + sgt op
-  // sle op == commute + sge op
-  unsigned PredIdx = 0;
-  bool SwapOperands = false;
-  CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
-  switch (Pred) {
-  case CmpInst::ICMP_NE:
-  case CmpInst::ICMP_EQ:
-    PredIdx = 0;
-    break;
-  case CmpInst::ICMP_UGT:
-    PredIdx = 1;
-    break;
-  case CmpInst::ICMP_UGE:
-    PredIdx = 2;
-    break;
-  case CmpInst::ICMP_ULT:
-    PredIdx = 3;
-    SwapOperands = true;
-    break;
-  case CmpInst::ICMP_ULE:
-    PredIdx = 4;
-    SwapOperands = true;
-    break;
-  case CmpInst::ICMP_SGT:
-    PredIdx = 5;
-    break;
-  case CmpInst::ICMP_SGE:
-    PredIdx = 6;
-    break;
-  case CmpInst::ICMP_SLT:
-    PredIdx = 7;
-    SwapOperands = true;
-    break;
-  case CmpInst::ICMP_SLE:
-    PredIdx = 8;
-    SwapOperands = true;
-    break;
-  default:
-    llvm_unreachable("Unhandled icmp predicate");
-    return false;
-  }
-
-  // This table obviously should be tablegen'd when we have our GISel native
-  // tablegen selector.
-
-  static const unsigned OpcTable[4][4][9] = {
-      {
-          {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */},
-          {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */},
-          {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
-           AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
-           AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
-          {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
-           AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
-           AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
-      },
-      {
-          {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */},
-          {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
-           AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
-           AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
-          {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
-           AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
-           AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
-          {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */}
-      },
-      {
-          {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
-           AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
-           AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
-          {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
-           AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
-           AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
-          {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */},
-          {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */}
-      },
-      {
-          {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
-           AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
-           AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
-          {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */},
-          {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */},
-          {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
-           0 /* invalid */}
-      },
-  };
-  unsigned EltIdx = Log2_32(SrcEltSize / 8);
-  unsigned NumEltsIdx = Log2_32(NumElts / 2);
-  unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
-  if (!Opc) {
-    LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
-    return false;
-  }
-
-  const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
-  const TargetRegisterClass *SrcRC =
-      getRegClassForTypeOnBank(SrcTy, VecRB, true);
-  if (!SrcRC) {
-    LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
-    return false;
-  }
-
-  unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
-  if (SrcTy.getSizeInBits() == 128)
-    NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
-
-  if (SwapOperands)
-    std::swap(SrcReg, Src2Reg);
-
-  auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
-  constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
-
-  // Invert if we had a 'ne' cc.
-  if (NotOpc) {
-    Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
-    constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
-  } else {
-    MIB.buildCopy(DstReg, Cmp.getReg(0));
-  }
-  RBI.constrainGenericRegister(DstReg, *SrcRC, MRI);
-  I.eraseFromParent();
-  return true;
-}
-
 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
     unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
     MachineIRBuilder &MIRBuilder) const {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 243891249668..b8274f0f872c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -177,7 +177,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
 
   getActionDefinitionsBuilder(G_PTR_ADD)
       .legalFor({{p0, s64}, {v2p0, v2s64}})
-      .clampScalar(1, s64, s64);
+      .clampScalar(1, s64, s64)
+      .clampNumElements(0, v2p0, v2p0)
+      .clampNumElements(1, v2s64, v2s64);
 
   getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
 
@@ -493,17 +495,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
 
   // FIXME: fix moreElementsToNextPow2
   getActionDefinitionsBuilder(G_ICMP)
-      .legalFor({{s32, s32},
-                 {s32, s64},
-                 {s32, p0},
-                 {v4s32, v4s32},
-                 {v2s32, v2s32},
-                 {v2s64, v2s64},
-                 {v2s64, v2p0},
-                 {v4s16, v4s16},
-                 {v8s16, v8s16},
-                 {v8s8, v8s8},
-                 {v16s8, v16s8}})
+      .legalFor({{s32, s32}, {s32, s64}, {s32, p0}})
       .widenScalarOrEltToNextPow2(1)
       .clampScalar(1, s32, s64)
       .clampScalar(0, s32, s32)
@@ -525,7 +517,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampNumElements(1, v8s8, v16s8)
       .clampNumElements(1, v4s16, v8s16)
       .clampNumElements(1, v2s32, v4s32)
-      .clampNumElements(1, v2s64, v2s64);
+      .clampNumElements(1, v2s64, v2s64)
+      .customIf(isVector(0));
 
   getActionDefinitionsBuilder(G_FCMP)
       .legalFor({{s32, MinFPScalar},
@@ -1264,6 +1257,8 @@ bool AArch64LegalizerInfo::legalizeCustom(
     return legalizePrefetch(MI, Helper);
   case TargetOpcode::G_ABS:
     return Helper.lowerAbsToCNeg(MI);
+  case TargetOpcode::G_ICMP:
+    return legalizeICMP(MI, MRI, MIRBuilder);
   }
 
   llvm_unreachable("expected switch to return");
@@ -1322,6 +1317,36 @@ bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
   return true;
 }
 
+bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI,
+                                        MachineRegisterInfo &MRI,
+                                        MachineIRBuilder &MIRBuilder) const {
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg1 = MI.getOperand(2).getReg();
+  Register SrcReg2 = MI.getOperand(3).getReg();
+  LLT DstTy = MRI.getType(DstReg);
+  LLT SrcTy = MRI.getType(SrcReg1);
+
+  // Check the vector types are legal
+  if (DstTy.getScalarSizeInBits() != SrcTy.getScalarSizeInBits() ||
+      DstTy.getNumElements() != SrcTy.getNumElements() ||
+      (DstTy.getSizeInBits() != 64 && DstTy.getSizeInBits() != 128))
+    return false;
+
+  // Lowers G_ICMP NE => G_ICMP EQ to allow better pattern matching for
+  // following passes
+  CmpInst::Predicate Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
+  if (Pred != CmpInst::ICMP_NE)
+    return true;
+  Register CmpReg =
+      MIRBuilder
+          .buildICmp(CmpInst::ICMP_EQ, MRI.getType(DstReg), SrcReg1, SrcReg2)
+          .getReg(0);
+  MIRBuilder.buildNot(DstReg, CmpReg);
+
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           LegalizerHelper &Helper) const {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index b69d9b015bd2..00d85a36e4b2 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -50,6 +50,8 @@ private:
                                LegalizerHelper &Helper) const;
   bool legalizeRotate(MachineInstr &MI, MachineRegisterInfo &MRI,
                       LegalizerHelper &Helper) const;
+  bool legalizeICMP(MachineInstr &MI, MachineRegisterInfo &MRI,
+                    MachineIRBuilder &MIRBuilder) const;
   bool legalizeFunnelShift(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &MIRBuilder,
                            GISelChangeObserver &Observer,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index b571f56bf9e1..77b8cbe5793c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -20,6 +20,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64GlobalISelUtils.h"
+#include "AArch64PerfectShuffle.h"
 #include "AArch64Subtarget.h"
 #include "AArch64TargetMachine.h"
 #include "GISel/AArch64LegalizerInfo.h"
@@ -77,50 +78,6 @@ struct ShuffleVectorPseudo {
   ShuffleVectorPseudo() = default;
 };
 
-/// Check if a vector shuffle corresponds to a REV instruction with the
-/// specified blocksize.
-bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
-               unsigned BlockSize) {
-  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
-         "Only possible block sizes for REV are: 16, 32, 64");
-  assert(EltSize != 64 && "EltSize cannot be 64 for REV mask.");
-
-  unsigned BlockElts = M[0] + 1;
-
-  // If the first shuffle index is UNDEF, be optimistic.
-  if (M[0] < 0)
-    BlockElts = BlockSize / EltSize;
-
-  if (BlockSize <= EltSize || BlockSize != BlockElts * EltSize)
-    return false;
-
-  for (unsigned i = 0; i < NumElts; ++i) {
-    // Ignore undef indices.
-    if (M[i] < 0)
-      continue;
-    if (static_cast<unsigned>(M[i]) !=
-        (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
-      return false;
-  }
-
-  return true;
-}
-
-/// Determines if \p M is a shuffle vector mask for a TRN of \p NumElts.
-/// Whether or not G_TRN1 or G_TRN2 should be used is stored in \p WhichResult.
-bool isTRNMask(ArrayRef<int> M, unsigned NumElts, unsigned &WhichResult) {
-  if (NumElts % 2 != 0)
-    return false;
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned i = 0; i < NumElts; i += 2) {
-    if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != i + WhichResult) ||
-        (M[i + 1] >= 0 &&
-         static_cast<unsigned>(M[i + 1]) != i + NumElts + WhichResult))
-      return false;
-  }
-  return true;
-}
-
 /// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector
 /// sources of the shuffle are different.
 std::optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M,
@@ -163,38 +120,6 @@ std::optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M,
   return std::make_pair(ReverseExt, Imm);
 }
 
-/// Determines if \p M is a shuffle vector mask for a UZP of \p NumElts.
-/// Whether or not G_UZP1 or G_UZP2 should be used is stored in \p WhichResult.
-bool isUZPMask(ArrayRef<int> M, unsigned NumElts, unsigned &WhichResult) {
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned i = 0; i != NumElts; ++i) {
-    // Skip undef indices.
-    if (M[i] < 0)
-      continue;
-    if (static_cast<unsigned>(M[i]) != 2 * i + WhichResult)
-      return false;
-  }
-  return true;
-}
-
-/// \return true if \p M is a zip mask for a shuffle vector of \p NumElts.
-/// Whether or not G_ZIP1 or G_ZIP2 should be used is stored in \p WhichResult.
-bool isZipMask(ArrayRef<int> M, unsigned NumElts, unsigned &WhichResult) {
-  if (NumElts % 2 != 0)
-    return false;
-
-  // 0 means use ZIP1, 1 means use ZIP2.
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  unsigned Idx = WhichResult * NumElts / 2;
-  for (unsigned i = 0; i != NumElts; i += 2) {
-    if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != Idx) ||
-        (M[i + 1] >= 0 && static_cast<unsigned>(M[i + 1]) != Idx + NumElts))
-      return false;
-    Idx += 1;
-  }
-  return true;
-}
-
 /// Helper function for matchINS.
 ///
 /// \returns a value when \p M is an ins mask for \p NumInputElements.
@@ -308,7 +233,7 @@ bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI,
   ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
   Register Dst = MI.getOperand(0).getReg();
   unsigned NumElts = MRI.getType(Dst).getNumElements();
-  if (!isZipMask(ShuffleMask, NumElts, WhichResult))
+  if (!isZIPMask(ShuffleMask, NumElts, WhichResult))
     return false;
   unsigned Opc = (WhichResult == 0) ? AArch64::G_ZIP1 : AArch64::G_ZIP2;
   Register V1 = MI.getOperand(1).getReg();
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 3363aab4b093..50ee37b0dfeb 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -286,14 +286,26 @@ multiclass sme_outer_product_fp64<bit S, string mnemonic, SDPatternOperator op>
   def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, op, timm32_0_7, nxv2i1, nxv2f64>;
 }
 
-multiclass sme2p1_fmop_tile_fp16<string mnemonic, bit bf, bit s, bits<2> op, ZPRRegOp zpr_ty>{
-  def NAME : sme_fp_outer_product_inst<s, {0,bf}, op, TileOp16, zpr_ty, mnemonic> {
+multiclass sme2p1_fmop_tile_f8f16<string mnemonic, bit bf, bit s, bits<2> op> {
+  def NAME : sme_fp_outer_product_inst<s, {0,bf}, op, TileOp16, ZPR8, mnemonic> {
     bits<1> ZAda;
     let Inst{2-1} = 0b00;
     let Inst{0}   = ZAda;
   }
 }
 
+multiclass sme2p1_fmop_tile_fp16<string mnemonic, bit bf, bit s, ValueType vt, SDPatternOperator intrinsic = null_frag> {
+  def NAME : sme_fp_outer_product_inst<s, {0,bf}, 0b11, TileOp16, ZPR16, mnemonic>, SMEPseudo2Instr<NAME, 1> {
+    bits<1> ZAda;
+    let Inst{2-1} = 0b00;
+    let Inst{0}   = ZAda;
+  }
+
+  def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16, SMEMatrixTileH>, SMEPseudo2Instr<NAME, 0>;
+
+  def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, intrinsic, timm32_0_1, nxv8i1, vt>;
+}
+
 class sme_int_outer_product_inst<bits<3> opc, bit sz, bit sme2,
                                  MatrixTileOperand za_ty, ZPRRegOp zpr_ty,
                                  string mnemonic>
@@ -2448,9 +2460,29 @@ multiclass sme2_multi_vec_array_vg2_index_32b<string mnemonic, bits<2> sz, bits<
 }
 
 // SME2.1 multi-vec ternary indexed two registers 16-bit
-// SME2 multi-vec indexed FP8 two-way dot product to FP16 two registers
 multiclass sme2p1_multi_vec_array_vg2_index_16b<string mnemonic, bits<2> sz, bits<3> op,
-                                                RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty> {
+                                                RegisterOperand multi_vector_ty, ZPRRegOp vector_ty,
+                                                ValueType vt, SDPatternOperator intrinsic> {
+  def NAME : sme2_multi_vec_array_vg2_index<sz, {op{2},?,?,op{1-0},?}, MatrixOp16,
+                                            multi_vector_ty, vector_ty,
+                                            VectorIndexH, mnemonic>, SMEPseudo2Instr<NAME, 1> {
+    bits<3> i;
+    let Inst{11-10} = i{2-1};
+    let Inst{3}     = i{0};
+  }
+
+  def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexH32b, SMEMatrixArray>;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexH32b_timm, tileslice16>;
+
+  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
+        (!cast<Instruction>(NAME) MatrixOp16:$ZAda,  MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
+        multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexH:$i), 0>;
+}
+
+// SME2 multi-vec indexed FP8 two-way dot product to FP16 two registers
+multiclass sme2p1_multi_vec_array_vg2_index_f8f16<string mnemonic, bits<2> sz, bits<3> op,
+                                                  RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty> {
   def NAME : sme2_multi_vec_array_vg2_index<sz, {op{2},?,?,op{1-0},?}, MatrixOp16,
                                             multi_vector_ty, zpr_ty,
                                             VectorIndexH, mnemonic> {
@@ -2569,10 +2601,10 @@ multiclass sme2_multi_vec_array_vg4_index_32b<string mnemonic, bits<4> op,
         multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexS32b_timm:$i), 0>;
 }
 
-// SME2.1 multi-vec ternary indexed four registers 16-bit
-multiclass sme2p1_multi_vec_array_vg4_index_16b<string mnemonic, bits<3> op,
-                                                RegisterOperand multi_vector_ty,
-                                                ZPRRegOp zpr_ty> {
+// SME2.1 multi-vec ternary indexed four registers 16-bit (FP8)
+multiclass sme2p1_multi_vec_array_vg4_index_f8f16<string mnemonic, bits<3> op,
+                                                  RegisterOperand multi_vector_ty,
+                                                  ZPRRegOp zpr_ty> {
   def NAME : sme2_multi_vec_array_vg4_index<0b0,{0b1,?,?,op,?}, MatrixOp16,
                                             multi_vector_ty, zpr_ty,
                                             VectorIndexH, mnemonic>{
@@ -2586,6 +2618,28 @@ multiclass sme2p1_multi_vec_array_vg4_index_16b<string mnemonic, bits<3> op,
         sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm, VectorIndexH:$i), 0>;
 }
 
+// SME2.1 multi-vec ternary indexed four registers 16-bit
+multiclass sme2p1_multi_vec_array_vg4_index_16b<string mnemonic, bits<3> op,
+                                                RegisterOperand multi_vector_ty,
+                                                ZPRRegOp vector_ty, ValueType vt,
+                                                SDPatternOperator intrinsic> {
+  def NAME : sme2_multi_vec_array_vg4_index<0b0,{0b1,?,?,op,?}, MatrixOp16,
+                                            multi_vector_ty, vector_ty,
+                                            VectorIndexH, mnemonic>, SMEPseudo2Instr<NAME, 1> {
+    bits<3> i;
+    let Inst{11-10} = i{2-1};
+    let Inst{3}     = i{0};
+  }
+
+  def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexH32b_timm, SMEMatrixArray>;
+
+  def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexH32b_timm, tileslice16>;
+
+  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
+        (!cast<Instruction>(NAME) MatrixOp16:$ZAda,  MatrixIndexGPR32Op8_11:$Rv,
+        sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexH:$i), 0>;
+}
+
 // SME2 multi-vec ternary indexed four registers 64-bit
 class sme2_multi_vec_array_vg4_index_64b<bits<3> op,
                                          RegisterOperand multi_vector_ty,
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 69c3238c7d61..fc7d3cdda4ac 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -7060,16 +7060,17 @@ multiclass sve_int_perm_splice<string asm, SDPatternOperator op> {
   def _S : sve_int_perm_splice<0b10, asm, ZPR32>;
   def _D : sve_int_perm_splice<0b11, asm, ZPR64>;
 
-  def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
-  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1,  nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
-  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1,  nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
-  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1,  nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+ foreach VT = [nxv16i8] in
+   def : SVE_3_Op_Pat<VT, op, nxv16i1, VT, VT, !cast<Instruction>(NAME # _B)>;
 
-  def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1,  nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
-  def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1,  nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
-  def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1,  nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in
+   def : SVE_3_Op_Pat<VT, op, nxv8i1, VT, VT, !cast<Instruction>(NAME # _H)>;
 
-  def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>;
+ foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in
+   def : SVE_3_Op_Pat<VT, op, nxv4i1, VT, VT, !cast<Instruction>(NAME # _S)>;
+
+ foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in
+   def : SVE_3_Op_Pat<VT, op, nxv2i1, VT, VT, !cast<Instruction>(NAME # _D)>;
 }
 
 class sve2_int_perm_splice_cons<bits<2> sz8_64, string asm,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 8abe9920c02c..35b0cb439bfa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1918,7 +1918,8 @@ def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
 
 def HasTrue16BitInsts : Predicate<"Subtarget->hasTrue16BitInsts()">,
   AssemblerPredicate<(all_of FeatureTrue16BitInsts)>;
-def NotHasTrue16BitInsts : True16PredicateClass<"!Subtarget->hasTrue16BitInsts()">;
+def NotHasTrue16BitInsts : True16PredicateClass<"!Subtarget->hasTrue16BitInsts()">,
+  AssemblerPredicate<(all_of (not FeatureTrue16BitInsts))>;
 
 // Control use of True16 instructions. The real True16 instructions are
 // True16 instructions as they are defined in the ISA. Fake True16
@@ -1927,7 +1928,10 @@ def NotHasTrue16BitInsts : True16PredicateClass<"!Subtarget->hasTrue16BitInsts()
 def UseRealTrue16Insts : True16PredicateClass<"Subtarget->useRealTrue16Insts()">,
   AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts)>;
 def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() && "
-                                              "!Subtarget->useRealTrue16Insts()">;
+                                              "!Subtarget->useRealTrue16Insts()">,
+  AssemblerPredicate<(all_of FeatureTrue16BitInsts)>;
+  // FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
+  // AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
 
 def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
   AssemblerPredicate<(all_of FeatureVOP3P)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index b53def912ab6..f55f656ff922 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -42,7 +42,7 @@ public:
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
- }
+  }
 };
 
 } // End anonymous namespace
@@ -89,6 +89,7 @@ recursivelyVisitUsers(GlobalValue &GV,
 static bool alwaysInlineImpl(Module &M, bool GlobalOpt) {
   std::vector<GlobalAlias*> AliasesToRemove;
 
+  bool Changed = false;
   SmallPtrSet<Function *, 8> FuncsToAlwaysInline;
   SmallPtrSet<Function *, 8> FuncsToNoInline;
   Triple TT(M.getTargetTriple());
@@ -98,6 +99,7 @@ static bool alwaysInlineImpl(Module &M, bool GlobalOpt) {
       if (TT.getArch() == Triple::amdgcn &&
           A.getLinkage() != GlobalValue::InternalLinkage)
         continue;
+      Changed = true;
       A.replaceAllUsesWith(F);
       AliasesToRemove.push_back(&A);
     }
@@ -153,7 +155,7 @@ static bool alwaysInlineImpl(Module &M, bool GlobalOpt) {
   for (Function *F : FuncsToNoInline)
     F->addFnAttr(Attribute::NoInline);
 
-  return !FuncsToAlwaysInline.empty() || !FuncsToNoInline.empty();
+  return Changed || !FuncsToAlwaysInline.empty() || !FuncsToNoInline.empty();
 }
 
 bool AMDGPUAlwaysInline::runOnModule(Module &M) {
@@ -166,6 +168,6 @@ ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) {
 
 PreservedAnalyses AMDGPUAlwaysInlinePass::run(Module &M,
                                               ModuleAnalysisManager &AM) {
-  alwaysInlineImpl(M, GlobalOpt);
-  return PreservedAnalyses::all();
+  const bool Changed = alwaysInlineImpl(M, GlobalOpt);
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 052b231d62a3..b7388ed9e85a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -22,6 +22,7 @@
 #include "AMDKernelCodeT.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUMCExpr.h"
 #include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
 #include "R600AsmPrinter.h"
@@ -134,6 +135,15 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
     getTargetStreamer()->getPALMetadata()->readFromIR(M);
 }
 
+uint64_t AMDGPUAsmPrinter::getMCExprValue(const MCExpr *Value, MCContext &Ctx) {
+  int64_t Val;
+  if (!Value->evaluateAsAbsolute(Val)) {
+    Ctx.reportError(SMLoc(), "could not resolve expression when required.");
+    return 0;
+  }
+  return static_cast<uint64_t>(Val);
+}
+
 void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
   // Init target streamer if it has not yet happened
   if (!IsTargetStreamerInitialized)
@@ -237,12 +247,14 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
   getNameWithPrefix(KernelName, &MF->getFunction());
   getTargetStreamer()->EmitAmdhsaKernelDescriptor(
       STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
-      CurrentProgramInfo.NumVGPRsForWavesPerEU,
-      CurrentProgramInfo.NumSGPRsForWavesPerEU -
+      getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Context),
+      getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Context) -
           IsaInfo::getNumExtraSGPRs(
-              &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
+              &STM, getMCExprValue(CurrentProgramInfo.VCCUsed, Context),
+              getMCExprValue(CurrentProgramInfo.FlatUsed, Context),
               getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
-      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
+      getMCExprValue(CurrentProgramInfo.VCCUsed, Context),
+      getMCExprValue(CurrentProgramInfo.FlatUsed, Context));
 
   Streamer.popSection();
 }
@@ -422,7 +434,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
   }
 
-  if (CurrentProgramInfo.DynamicCallStack &&
+  if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, MF.getContext()) &&
       CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
     KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
 
@@ -439,29 +451,22 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
 
   MCKernelDescriptor KernelDescriptor;
 
-  assert(isUInt<32>(PI.ScratchSize));
-  assert(isUInt<32>(PI.getComputePGMRSrc1(STM)));
-  assert(isUInt<32>(PI.getComputePGMRSrc2()));
-
   KernelDescriptor.group_segment_fixed_size =
       MCConstantExpr::create(PI.LDSSize, Ctx);
-  KernelDescriptor.private_segment_fixed_size =
-      MCConstantExpr::create(PI.ScratchSize, Ctx);
+  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
 
   Align MaxKernArgAlign;
   KernelDescriptor.kernarg_size = MCConstantExpr::create(
       STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
 
-  KernelDescriptor.compute_pgm_rsrc1 =
-      MCConstantExpr::create(PI.getComputePGMRSrc1(STM), Ctx);
-  KernelDescriptor.compute_pgm_rsrc2 =
-      MCConstantExpr::create(PI.getComputePGMRSrc2(), Ctx);
+  KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
+  KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
   KernelDescriptor.kernel_code_properties =
       MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx);
 
-  assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
-  KernelDescriptor.compute_pgm_rsrc3 = MCConstantExpr::create(
-      STM.hasGFX90AInsts() ? CurrentProgramInfo.ComputePGMRSrc3GFX90A : 0, Ctx);
+  assert(STM.hasGFX90AInsts() ||
+         getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0);
+  KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
 
   KernelDescriptor.kernarg_preload = MCConstantExpr::create(
       AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
@@ -477,9 +482,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
     initTargetStreamer(*MF.getFunction().getParent());
 
   ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
-  CurrentProgramInfo = SIProgramInfo();
+  CurrentProgramInfo.reset(MF);
 
   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+  MCContext &Ctx = MF.getContext();
 
   // The starting address of all shader programs must be 256 bytes aligned.
   // Regular functions just need the basic required instruction alignment.
@@ -550,11 +556,13 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 
     OutStreamer->emitRawComment(" Kernel info:", false);
     emitCommonFunctionComments(
-        CurrentProgramInfo.NumArchVGPR,
-        STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
+        getMCExprValue(CurrentProgramInfo.NumArchVGPR, Ctx),
+        STM.hasMAIInsts() ? getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx)
                           : std::optional<uint32_t>(),
-        CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
-        CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
+        getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx),
+        getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx),
+        getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx),
+        getFunctionCodeSize(MF), MFI);
 
     OutStreamer->emitRawComment(
       " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -565,32 +573,44 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
       " bytes/workgroup (compile time only)", false);
 
     OutStreamer->emitRawComment(
-      " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
+        " SGPRBlocks: " +
+            Twine(getMCExprValue(CurrentProgramInfo.SGPRBlocks, Ctx)),
+        false);
     OutStreamer->emitRawComment(
-      " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
+        " VGPRBlocks: " +
+            Twine(getMCExprValue(CurrentProgramInfo.VGPRBlocks, Ctx)),
+        false);
 
     OutStreamer->emitRawComment(
-      " NumSGPRsForWavesPerEU: " +
-      Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
+        " NumSGPRsForWavesPerEU: " +
+            Twine(
+                getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)),
+        false);
     OutStreamer->emitRawComment(
-      " NumVGPRsForWavesPerEU: " +
-      Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
+        " NumVGPRsForWavesPerEU: " +
+            Twine(
+                getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)),
+        false);
 
     if (STM.hasGFX90AInsts())
       OutStreamer->emitRawComment(
-        " AccumOffset: " +
-        Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
+          " AccumOffset: " +
+              Twine((getMCExprValue(CurrentProgramInfo.AccumOffset, Ctx) + 1) *
+                    4),
+          false);
 
     OutStreamer->emitRawComment(
-      " Occupancy: " +
-      Twine(CurrentProgramInfo.Occupancy), false);
+        " Occupancy: " +
+            Twine(getMCExprValue(CurrentProgramInfo.Occupancy, Ctx)),
+        false);
 
     OutStreamer->emitRawComment(
       " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
 
-    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
-                                    Twine(CurrentProgramInfo.ScratchEnable),
-                                false);
+    OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
+            Twine(getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx)),
+        false);
     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                     Twine(CurrentProgramInfo.UserSGPR),
                                 false);
@@ -611,18 +631,20 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
                                 false);
 
     assert(STM.hasGFX90AInsts() ||
-           CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
+           getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0);
     if (STM.hasGFX90AInsts()) {
       OutStreamer->emitRawComment(
-        " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
-        Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
-                               amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
-                               false);
+          " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
+              Twine((AMDHSA_BITS_GET(
+                  getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx),
+                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
+          false);
       OutStreamer->emitRawComment(
-        " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
-        Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
-                               amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
-                               false);
+          " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
+              Twine((AMDHSA_BITS_GET(
+                  getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx),
+                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
+          false);
     }
   }
 
@@ -702,23 +724,40 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
       ResourceUsage->getResourceInfo(&MF.getFunction());
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+  MCContext &Ctx = MF.getContext();
 
-  ProgInfo.NumArchVGPR = Info.NumVGPR;
-  ProgInfo.NumAccVGPR = Info.NumAGPR;
-  ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
-  ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
+  auto CreateExpr = [&Ctx](int64_t Value) {
+    return MCConstantExpr::create(Value, Ctx);
+  };
+
+  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
+    int64_t Val;
+    if (Value->evaluateAsAbsolute(Val)) {
+      Res = Val;
+      return true;
+    }
+    return false;
+  };
+
+  ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
+  ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);
+  ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));
+  ProgInfo.AccumOffset =
+      CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);
   ProgInfo.TgSplit = STM.isTgSplitEnabled();
-  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
-  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
-  ProgInfo.VCCUsed = Info.UsesVCC;
-  ProgInfo.FlatUsed = Info.UsesFlatScratch;
-  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
+  ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);
+  ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);
+  ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);
+  ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);
+  ProgInfo.DynamicCallStack =
+      CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);
 
   const uint64_t MaxScratchPerWorkitem =
       STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
-  if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
-    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
-                                          ProgInfo.ScratchSize,
+  uint64_t ScratchSize;
+  if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) &&
+      ScratchSize > MaxScratchPerWorkitem) {
+    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize,
                                           MaxScratchPerWorkitem, DS_Error);
     MF.getFunction().getContext().diagnose(DiagStackSize);
   }
@@ -728,27 +767,29 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // The calculations related to SGPR/VGPR blocks are
   // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
   // unified.
-  unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
-      &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
-      getTargetStreamer()->getTargetID()->isXnackOnOrAny());
+  const MCExpr *ExtraSGPRs = AMDGPUVariadicMCExpr::createExtraSGPRs(
+      ProgInfo.VCCUsed, ProgInfo.FlatUsed,
+      getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
 
   // Check the addressable register limit before we add ExtraSGPRs.
   if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
       !STM.hasSGPRInitBug()) {
     unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
-    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
+    uint64_t NumSgpr;
+    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
+        NumSgpr > MaxAddressableNumSGPRs) {
       // This can happen due to a compiler bug or when using inline asm.
       LLVMContext &Ctx = MF.getFunction().getContext();
       DiagnosticInfoResourceLimit Diag(
-          MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
+          MF.getFunction(), "addressable scalar registers", NumSgpr,
           MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
       Ctx.diagnose(Diag);
-      ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
+      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
     }
   }
 
   // Account for extra SGPRs and VGPRs reserved for debugger use.
-  ProgInfo.NumSGPR += ExtraSGPRs;
+  ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
 
   const Function &F = MF.getFunction();
 
@@ -819,40 +860,51 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
         }
       }
     }
-    ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
-    ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
-    ProgInfo.NumVGPR =
-        Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
+    ProgInfo.NumSGPR = AMDGPUVariadicMCExpr::createMax(
+        {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
+
+    ProgInfo.NumArchVGPR = AMDGPUVariadicMCExpr::createMax(
+        {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
+
+    ProgInfo.NumVGPR = AMDGPUVariadicMCExpr::createTotalNumVGPR(
+        ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
   }
 
   // Adjust number of registers used to meet default/requested minimum/maximum
   // number of waves per execution unit request.
-  ProgInfo.NumSGPRsForWavesPerEU = std::max(
-    std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
-  ProgInfo.NumVGPRsForWavesPerEU = std::max(
-    std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
+  unsigned MaxWaves = MFI->getMaxWavesPerEU();
+  ProgInfo.NumSGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax(
+      {ProgInfo.NumSGPR, CreateExpr(1ul),
+       CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
+      Ctx);
+  ProgInfo.NumVGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax(
+      {ProgInfo.NumVGPR, CreateExpr(1ul),
+       CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
+      Ctx);
 
   if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
       STM.hasSGPRInitBug()) {
     unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
-    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
+    uint64_t NumSgpr;
+    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
+        NumSgpr > MaxAddressableNumSGPRs) {
       // This can happen due to a compiler bug or when using inline asm to use
       // the registers which are usually reserved for vcc etc.
       LLVMContext &Ctx = MF.getFunction().getContext();
       DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
-                                       ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
+                                       NumSgpr, MaxAddressableNumSGPRs,
                                        DS_Error, DK_ResourceLimit);
       Ctx.diagnose(Diag);
-      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
-      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
+      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
+      ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
     }
   }
 
   if (STM.hasSGPRInitBug()) {
     ProgInfo.NumSGPR =
-        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
     ProgInfo.NumSGPRsForWavesPerEU =
-        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
   }
 
   if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
@@ -871,11 +923,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
         STM.getAddressableLocalMemorySize(), DS_Error);
     Ctx.diagnose(Diag);
   }
+  // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
+  // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
+  auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
+                                             unsigned Granule) {
+    const MCExpr *OneConst = CreateExpr(1ul);
+    const MCExpr *GranuleConst = CreateExpr(Granule);
+    const MCExpr *MaxNumGPR =
+        AMDGPUVariadicMCExpr::createMax({NumGPR, OneConst}, Ctx);
+    const MCExpr *AlignToGPR =
+        AMDGPUVariadicMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
+    const MCExpr *DivGPR =
+        MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
+    const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
+    return SubGPR;
+  };
 
-  ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
-      &STM, ProgInfo.NumSGPRsForWavesPerEU);
-  ProgInfo.VGPRBlocks =
-      IsaInfo::getEncodedNumVGPRBlocks(&STM, ProgInfo.NumVGPRsForWavesPerEU);
+  ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
+                                        IsaInfo::getSGPREncodingGranule(&STM));
+  ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
+                                        IsaInfo::getVGPREncodingGranule(&STM));
 
   const SIModeRegisterDefaults Mode = MFI->getMode();
 
@@ -904,14 +971,23 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.LDSBlocks =
       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
 
+  // The MCExpr equivalent of divideCeil.
+  auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
+    const MCExpr *Ceil =
+        AMDGPUVariadicMCExpr::createAlignTo(Numerator, Denominator, Ctx);
+    return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
+  };
+
   // Scratch is allocated in 64-dword or 256-dword blocks.
   unsigned ScratchAlignShift =
       STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
   // We need to program the hardware with the amount of scratch memory that
   // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
   // scratch memory used per thread.
-  ProgInfo.ScratchBlocks = divideCeil(
-      ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
+  ProgInfo.ScratchBlocks = DivideCeil(
+      MCBinaryExpr::createMul(ProgInfo.ScratchSize,
+                              CreateExpr(STM.getWavefrontSize()), Ctx),
+      CreateExpr(1ULL << ScratchAlignShift));
 
   if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
     ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
@@ -930,8 +1006,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // anything to disable it if we know the stack isn't used here. We may still
   // have emitted code reading it to initialize scratch, but if that's unused
   // reading garbage should be OK.
-  ProgInfo.ScratchEnable =
-      ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
+  ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
+      MCBinaryExpr::createGT(ProgInfo.ScratchBlocks,
+                             MCConstantExpr::create(0, Ctx), Ctx),
+      ProgInfo.DynamicCallStack, Ctx);
+
   ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
   // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
   ProgInfo.TrapHandlerEnable =
@@ -947,26 +1026,41 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.EXCPEnable = 0;
 
   if (STM.hasGFX90AInsts()) {
-    AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
-                    amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
-                    ProgInfo.AccumOffset);
-    AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
-                    amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
-                    ProgInfo.TgSplit);
+    // return ((Dst & ~Mask) | (Value << Shift))
+    auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
+                          uint32_t Shift) {
+      auto Shft = MCConstantExpr::create(Shift, Ctx);
+      auto Msk = MCConstantExpr::create(Mask, Ctx);
+      Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
+      Dst = MCBinaryExpr::createOr(
+          Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
+      return Dst;
+    };
+
+    ProgInfo.ComputePGMRSrc3GFX90A =
+        SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
+                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
+                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
+    ProgInfo.ComputePGMRSrc3GFX90A =
+        SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
+                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
+                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
   }
 
-  ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
-                                            ProgInfo.NumSGPRsForWavesPerEU,
-                                            ProgInfo.NumVGPRsForWavesPerEU);
+  ProgInfo.Occupancy = AMDGPUVariadicMCExpr::createOccupancy(
+      STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
+      ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
+
   const auto [MinWEU, MaxWEU] =
       AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
-  if (ProgInfo.Occupancy < MinWEU) {
+  uint64_t Occupancy;
+  if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
     DiagnosticInfoOptimizationFailure Diag(
         F, F.getSubprogram(),
         "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
         "'" +
             F.getName() + "': desired occupancy was " + Twine(MinWEU) +
-            ", final occupancy is " + Twine(ProgInfo.Occupancy));
+            ", final occupancy is " + Twine(Occupancy));
     F.getContext().diagnose(Diag);
   }
 }
@@ -989,36 +1083,78 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
+  MCContext &Ctx = MF.getContext();
+
+  // (((Value) & Mask) << Shift)
+  auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
+    const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
+    const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
+    return MCBinaryExpr::createShl(MCBinaryExpr::createAnd(Value, msk, Ctx),
+                                   shft, Ctx);
+  };
+
+  auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
+    int64_t Val;
+    if (Value->evaluateAsAbsolute(Val))
+      OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
+    else
+      OutStreamer->emitValue(Value, Size);
+  };
 
   if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
     OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
 
-    OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1(STM));
+    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
+                       /*Size=*/4);
 
     OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
-    OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
+    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
 
     OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
-    OutStreamer->emitInt32(
-        STM.getGeneration() >= AMDGPUSubtarget::GFX12
-            ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
-        : STM.getGeneration() == AMDGPUSubtarget::GFX11
-            ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
-            : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
+
+    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
+    // appropriate generation.
+    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
+      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
+                         /*Size=*/4);
+    } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
+      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+                                 /*Mask=*/0x7FFF, /*Shift=*/12),
+                         /*Size=*/4);
+    } else {
+      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+                                 /*Mask=*/0x1FFF, /*Shift=*/12),
+                         /*Size=*/4);
+    }
 
     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
     // 0" comment but I don't see a corresponding field in the register spec.
   } else {
     OutStreamer->emitInt32(RsrcReg);
-    OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
-                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
+
+    const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
+        SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
+        SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
+        MF.getContext());
+    EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
     OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
-    OutStreamer->emitInt32(
-        STM.getGeneration() >= AMDGPUSubtarget::GFX12
-            ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
-        : STM.getGeneration() == AMDGPUSubtarget::GFX11
-            ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
-            : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
+
+    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
+    // appropriate generation.
+    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
+      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
+                         /*Size=*/4);
+    } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
+      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+                                 /*Mask=*/0x7FFF, /*Shift=*/12),
+                         /*Size=*/4);
+    } else {
+      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+                                 /*Mask=*/0x1FFF, /*Shift=*/12),
+                         /*Size=*/4);
+    }
   }
 
   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
@@ -1053,11 +1189,11 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
     MD->setHwStage(CC, ".trap_present",
                    (bool)CurrentProgramInfo.TrapHandlerEnable);
     MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
-
-    MD->setHwStage(CC, ".lds_size",
-                   (unsigned)(CurrentProgramInfo.LdsSize *
-                              getLdsDwGranularity(ST) * sizeof(uint32_t)));
   }
+
+  MD->setHwStage(CC, ".lds_size",
+                 (unsigned)(CurrentProgramInfo.LdsSize *
+                            getLdsDwGranularity(ST) * sizeof(uint32_t)));
 }
 
 // This is the equivalent of EmitProgramInfoSI above, but for when the OS type
@@ -1070,33 +1206,38 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   auto CC = MF.getFunction().getCallingConv();
   auto MD = getTargetStreamer()->getPALMetadata();
+  auto &Ctx = MF.getContext();
 
   MD->setEntryPoint(CC, MF.getFunction().getName());
-  MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+  MD->setNumUsedVgprs(
+      CC, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx));
 
   // Only set AGPRs for supported devices
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   if (STM.hasMAIInsts()) {
-    MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
+    MD->setNumUsedAgprs(CC, getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx));
   }
 
-  MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
+  MD->setNumUsedSgprs(
+      CC, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx));
   if (MD->getPALMajorVersion() < 3) {
     MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM));
     if (AMDGPU::isCompute(CC)) {
       MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
     } else {
-      if (CurrentProgramInfo.ScratchBlocks > 0)
+      if (getMCExprValue(CurrentProgramInfo.ScratchBlocks, Ctx) > 0)
         MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
     }
   } else {
     MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
-    MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
+    MD->setHwStage(CC, ".scratch_en",
+                   (bool)getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx));
     EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
   }
 
   // ScratchSize is in bytes, 16 aligned.
-  MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
+  MD->setScratchSize(
+      CC, alignTo(getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx), 16));
   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
     unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
@@ -1145,6 +1286,7 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
   StringRef FnName = MF.getFunction().getName();
   MD->setFunctionScratchSize(FnName, MFI.getStackSize());
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  MCContext &Ctx = MF.getContext();
 
   if (MD->getPALMajorVersion() < 3) {
     // Set compute registers
@@ -1158,8 +1300,10 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
 
   // Set optional info
   MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
-  MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
-  MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
+  MD->setFunctionNumUsedVgprs(
+      FnName, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx));
+  MD->setFunctionNumUsedSgprs(
+      FnName, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx));
 }
 
 // This is supposed to be log2(Size)
@@ -1185,6 +1329,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+  MCContext &Ctx = MF.getContext();
 
   AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
 
@@ -1193,7 +1338,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
       (CurrentProgramInfo.getComputePGMRSrc2() << 32);
   Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
 
-  if (CurrentProgramInfo.DynamicCallStack)
+  if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, Ctx))
     Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
 
   AMD_HSA_BITS_SET(Out.code_properties,
@@ -1229,9 +1374,10 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
 
   Align MaxKernArgAlign;
   Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
-  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
-  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
-  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
+  Out.wavefront_sgpr_count = getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx);
+  Out.workitem_vgpr_count = getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx);
+  Out.workitem_private_segment_byte_size =
+      getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx);
   Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
 
   // kernarg_segment_alignment is specified as log of the alignment.
@@ -1306,7 +1452,7 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
     // makes it easier to tell which resource usage go with which kernel since
     // the kernel name will always be displayed first.
     std::string LabelStr = RemarkLabel.str() + ": ";
-    if (!RemarkName.equals("FunctionName"))
+    if (RemarkName != "FunctionName")
       LabelStr = Indent + LabelStr;
 
     ORE->emit([&]() {
@@ -1322,19 +1468,28 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
   // remarks to simulate newlines. If and when clang does accept newlines, this
   // formatting should be aggregated into one remark with newlines to avoid
   // printing multiple diagnostic location and diag opts.
+  MCContext &MCCtx = MF.getContext();
   EmitResourceUsageRemark("FunctionName", "Function Name",
                           MF.getFunction().getName());
-  EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
-  EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
-  if (hasMAIInsts)
-    EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
-  EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
-                          CurrentProgramInfo.ScratchSize);
+  EmitResourceUsageRemark("NumSGPR", "SGPRs",
+                          getMCExprValue(CurrentProgramInfo.NumSGPR, MCCtx));
+  EmitResourceUsageRemark(
+      "NumVGPR", "VGPRs",
+      getMCExprValue(CurrentProgramInfo.NumArchVGPR, MCCtx));
+  if (hasMAIInsts) {
+    EmitResourceUsageRemark(
+        "NumAGPR", "AGPRs",
+        getMCExprValue(CurrentProgramInfo.NumAccVGPR, MCCtx));
+  }
+  EmitResourceUsageRemark(
+      "ScratchSize", "ScratchSize [bytes/lane]",
+      getMCExprValue(CurrentProgramInfo.ScratchSize, MCCtx));
   StringRef DynamicStackStr =
-      CurrentProgramInfo.DynamicCallStack ? "True" : "False";
+      getMCExprValue(CurrentProgramInfo.DynamicCallStack, MCCtx) ? "True"
+                                                                 : "False";
   EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
   EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
-                          CurrentProgramInfo.Occupancy);
+                          getMCExprValue(CurrentProgramInfo.Occupancy, MCCtx));
   EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
                           CurrentProgramInfo.SGPRSpill);
   EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index b8b2718d293e..16d8952a533e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -78,6 +78,8 @@ private:
 
   void initTargetStreamer(Module &M);
 
+  static uint64_t getMCExprValue(const MCExpr *Value, MCContext &Ctx);
+
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
                             std::unique_ptr<MCStreamer> Streamer);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index ad98f4f743ae..1d645002b1fe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -493,8 +493,8 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
     if (!ST->isWave32()) {
       // Combine lane 31 into lanes 32..63.
       V = B.CreateBitCast(V, IntNTy);
-      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
-                                              {V, B.getInt32(31)});
+      Value *const Lane31 = B.CreateIntrinsic(
+          V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
 
       Value *UpdateDPPCall = B.CreateCall(
           UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
@@ -598,8 +598,8 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
 
   // Get the value required for atomic operation
   V = B.CreateBitCast(V, IntNTy);
-  Value *LaneValue =
-      B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});
+  Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
+                                       {V, LaneIdxInt});
   LaneValue = B.CreateBitCast(LaneValue, Ty);
 
   // Perform writelane if intermediate scan results are required later in the
@@ -607,7 +607,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
   Value *OldValue = nullptr;
   if (NeedResult) {
     OldValue =
-        B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
+        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
                           {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
                            B.CreateBitCast(OldValuePhi, IntNTy)});
     OldValue = B.CreateBitCast(OldValue, Ty);
@@ -789,7 +789,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
         Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
         assert(TyBitWidth == 32);
         NewV = B.CreateBitCast(NewV, IntNTy);
-        NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+        NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
                                  {NewV, LastLaneIdx});
         NewV = B.CreateBitCast(NewV, Ty);
       }
@@ -936,10 +936,10 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
       Value *const ExtractHi =
           B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
-      CallInst *const ReadFirstLaneLo =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
-      CallInst *const ReadFirstLaneHi =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
+      CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
+          Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
+      CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
+          Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
       Value *const PartialInsert = B.CreateInsertElement(
           PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
       Value *const Insert =
@@ -948,7 +948,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
     } else if (TyBitWidth == 32) {
       Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
       BroadcastI =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, CastedPhi);
+          B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
       BroadcastI = B.CreateBitCast(BroadcastI, Ty);
 
     } else {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 9bd30458bc0a..43bfd0f13f87 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -332,9 +332,9 @@ struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
 
     bool InitialValue = false;
     if (F->hasFnAttribute("uniform-work-group-size"))
-      InitialValue = F->getFnAttribute("uniform-work-group-size")
-                         .getValueAsString()
-                         .equals("true");
+      InitialValue =
+          F->getFnAttribute("uniform-work-group-size").getValueAsString() ==
+          "true";
 
     if (InitialValue)
       indicateOptimisticFixpoint();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 9218760538dc..b2a3f9392157 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -15,9 +15,10 @@ def fmin_fmax_legacy_matchdata : GIDefMatchData<"FMinFMaxLegacyInfo">;
 let Predicates = [HasFminFmaxLegacy] in
 def fcmp_select_to_fmin_fmax_legacy : GICombineRule<
   (defs root:$select, fmin_fmax_legacy_matchdata:$matchinfo),
-  (match (wip_match_opcode G_SELECT):$select,
-         [{ return matchFMinFMaxLegacy(*${select}, ${matchinfo}); }]),
-  (apply [{ applySelectFCmpToFMinToFMaxLegacy(*${select}, ${matchinfo}); }])>;
+  (match (G_FCMP $cond, $pred, $lhs, $rhs):$fcmp,
+         (G_SELECT f32:$dst, $cond, $true, $false):$select,
+         [{ return matchFMinFMaxLegacy(*${select}, *${fcmp}, ${matchinfo}); }]),
+  (apply [{ applySelectFCmpToFMinFMaxLegacy(*${select}, ${matchinfo}); }])>;
 
 
 def uchar_to_float : GICombineRule<
@@ -94,10 +95,8 @@ def fmed3_intrinsic_to_clamp : GICombineRule<
          [{ return matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]),
   (apply [{ applyClamp(*${fmed3}, ${matchinfo}); }])>;
 
-def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">;
-
 def remove_fcanonicalize : GICombineRule<
-  (defs root:$fcanonicalize, remove_fcanonicalize_matchinfo:$matchinfo),
+  (defs root:$fcanonicalize, register_matchinfo:$matchinfo),
   (match (wip_match_opcode G_FCANONICALIZE):$fcanonicalize,
          [{ return matchRemoveFcanonicalize(*${fcanonicalize}, ${matchinfo}); }]),
   (apply [{ Helper.replaceSingleDefInstWithReg(*${fcanonicalize}, ${matchinfo}); }])>;
@@ -115,7 +114,7 @@ def smulu64 : GICombineRule<
   (defs root:$smul, unsigned_matchinfo:$matchinfo),
   (match (wip_match_opcode G_MUL):$smul,
          [{ return matchCombine_s_mul_u64(*${smul}, ${matchinfo}); }]),
-  (apply [{ applyCombine_s_mul_u64(*${smul}, ${matchinfo}); }])>;
+  (apply [{ Helper.replaceOpcodeWith(*${smul}, ${matchinfo}); }])>;
 
 def sign_exension_in_reg_matchdata : GIDefMatchData<"std::pair<MachineInstr *, unsigned>">;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
index 69dc78d33c83..2f567ecb121f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -432,8 +432,6 @@ void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
                                                        Register Src0,
                                                        Register Src1,
                                                        Register Src2) {
-  Builder.setInstrAndDebugLoc(MI);
-
   // We expect fptrunc (fpext x) to fold out, and to constant fold any constant
   // sources.
   Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 9e288ab50e17..7ab9ba285133 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -19,6 +19,8 @@
 #include "SIMachineFunctionInfo.h"
 #include "SIProgramInfo.h"
 #include "llvm/IR/Module.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
 using namespace llvm;
 
 static std::pair<Type *, Align> getArgumentTypeAlign(const Argument &Arg,
@@ -462,6 +464,16 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
   const Function &F = MF.getFunction();
 
+  auto GetMCExprValue = [&MF](const MCExpr *Value) {
+    int64_t Val;
+    if (!Value->evaluateAsAbsolute(Val)) {
+      MCContext &Ctx = MF.getContext();
+      Ctx.reportError(SMLoc(), "could not resolve expression when required.");
+      Val = 0;
+    }
+    return static_cast<uint64_t>(Val);
+  };
+
   auto Kern = HSAMetadataDoc->getMapNode();
 
   Align MaxKernArgAlign;
@@ -470,10 +482,11 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
   Kern[".group_segment_fixed_size"] =
       Kern.getDocument()->getNode(ProgramInfo.LDSSize);
   Kern[".private_segment_fixed_size"] =
-      Kern.getDocument()->getNode(ProgramInfo.ScratchSize);
-  if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
-    Kern[".uses_dynamic_stack"] =
-        Kern.getDocument()->getNode(ProgramInfo.DynamicCallStack);
+      Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.ScratchSize));
+  if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5) {
+    Kern[".uses_dynamic_stack"] = Kern.getDocument()->getNode(
+        static_cast<bool>(GetMCExprValue(ProgramInfo.DynamicCallStack)));
+  }
 
   if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5 && STM.supportsWGP())
     Kern[".workgroup_processor_mode"] =
@@ -484,12 +497,15 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
       Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value());
   Kern[".wavefront_size"] =
       Kern.getDocument()->getNode(STM.getWavefrontSize());
-  Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR);
-  Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR);
+  Kern[".sgpr_count"] =
+      Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumSGPR));
+  Kern[".vgpr_count"] =
+      Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumVGPR));
 
   // Only add AGPR count to metadata for supported devices
   if (STM.hasMAIInsts()) {
-    Kern[".agpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumAccVGPR);
+    Kern[".agpr_count"] =
+        Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumAccVGPR));
   }
 
   Kern[".max_flat_workgroup_size"] =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 26229af638f2..0e3bc63919f0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -18,6 +18,7 @@
 #include "llvm/BinaryFormat/MsgPackDocument.h"
 #include "llvm/Support/AMDGPUMetadata.h"
 #include "llvm/Support/Alignment.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm {
 
@@ -61,7 +62,8 @@ protected:
                                msgpack::MapDocNode Kern) = 0;
 };
 
-class MetadataStreamerMsgPackV4 : public MetadataStreamer {
+class LLVM_EXTERNAL_VISIBILITY MetadataStreamerMsgPackV4
+    : public MetadataStreamer {
 protected:
   std::unique_ptr<msgpack::Document> HSAMetadataDoc =
       std::make_unique<msgpack::Document>();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index bba7682cd7a0..c11c7a57e059 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -132,6 +132,7 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
   }
 #endif
   Subtarget = &MF.getSubtarget<GCNSubtarget>();
+  Subtarget->checkSubtargetFeatures(MF.getFunction());
   Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
   return SelectionDAGISel::runOnMachineFunction(MF);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3124fb23fb0b..d35a022ad680 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -161,6 +161,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
   AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
 
+  setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
+  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
+
+  setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
+  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
+
+  setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
+  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
+
+  setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
+  AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
+
   // There are no 64-bit extloads. These should be done as a 32-bit extload and
   // an extension to 64-bit.
   for (MVT VT : MVT::integer_valuetypes())
@@ -934,14 +946,14 @@ bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
 
   // Packed operations do not have a fabs modifier.
   return VT == MVT::f32 || VT == MVT::f64 ||
-         (Subtarget->has16BitInsts() && VT == MVT::f16);
+         (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
 }
 
 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
   assert(VT.isFloatingPoint());
   // Report this based on the end legalized type.
   VT = VT.getScalarType();
-  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
+  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
 }
 
 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
@@ -1460,7 +1472,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
     if (!MFI->isModuleEntryFunction() &&
-        !GV->getName().equals("llvm.amdgcn.module.lds")) {
+        GV->getName() != "llvm.amdgcn.module.lds") {
       SDLoc DL(Op);
       const Function &Fn = DAG.getMachineFunction().getFunction();
       DiagnosticInfoUnsupported BadLDSDecl(
@@ -5988,6 +6000,13 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   case AtomicRMWInst::FMax:
   case AtomicRMWInst::FMin:
     return AtomicExpansionKind::CmpXChg;
+  case AtomicRMWInst::Xchg: {
+    const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
+    unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
+    if (ValSize == 32 || ValSize == 64)
+      return AtomicExpansionKind::None;
+    return AtomicExpansionKind::CmpXChg;
+  }
   default: {
     if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
       unsigned Size = IntTy->getBitWidth();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 269c414521db..3814b56a4d56 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -236,6 +236,14 @@ public:
     return AtomicExpansionKind::None;
   }
 
+  AtomicExpansionKind shouldCastAtomicStoreInIR(StoreInst *SI) const override {
+    return AtomicExpansionKind::None;
+  }
+
+  AtomicExpansionKind shouldCastAtomicRMWIInIR(AtomicRMWInst *) const override {
+    return AtomicExpansionKind::None;
+  }
+
   static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
   static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
index 9415bd3695f0..b78952ca3a62 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -16,10 +16,11 @@
 
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -28,10 +29,11 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/Register.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/MC/MCRegister.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Pass.h"
+#include <array>
 
 using namespace llvm;
 
@@ -41,17 +43,110 @@ namespace {
 class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
 private:
   const SIInstrInfo *SII;
+  class SingleUseInstruction {
+  private:
+    static const unsigned MaxSkipRange = 0b111;
+    static const unsigned MaxNumberOfSkipRegions = 2;
+
+    unsigned LastEncodedPositionEnd;
+    MachineInstr *ProducerInstr;
+
+    std::array<unsigned, MaxNumberOfSkipRegions + 1> SingleUseRegions;
+    SmallVector<unsigned, MaxNumberOfSkipRegions> SkipRegions;
+
+    // Adds a skip region into the instruction.
+    void skip(const unsigned ProducerPosition) {
+      while (LastEncodedPositionEnd + MaxSkipRange < ProducerPosition) {
+        SkipRegions.push_back(MaxSkipRange);
+        LastEncodedPositionEnd += MaxSkipRange;
+      }
+      SkipRegions.push_back(ProducerPosition - LastEncodedPositionEnd);
+      LastEncodedPositionEnd = ProducerPosition;
+    }
+
+    bool currentRegionHasSpace() {
+      const auto Region = SkipRegions.size();
+      // The first region has an extra bit of encoding space.
+      return SingleUseRegions[Region] <
+             ((Region == MaxNumberOfSkipRegions) ? 0b1111U : 0b111U);
+    }
+
+    unsigned encodeImm() {
+      // Handle the first Single Use Region separately as it has an extra bit
+      // of encoding space.
+      unsigned Imm = SingleUseRegions[SkipRegions.size()];
+      unsigned ShiftAmount = 4;
+      for (unsigned i = SkipRegions.size(); i > 0; i--) {
+        Imm |= SkipRegions[i - 1] << ShiftAmount;
+        ShiftAmount += 3;
+        Imm |= SingleUseRegions[i - 1] << ShiftAmount;
+        ShiftAmount += 3;
+      }
+      return Imm;
+    }
+
+  public:
+    SingleUseInstruction(const unsigned ProducerPosition,
+                         MachineInstr *Producer)
+        : LastEncodedPositionEnd(ProducerPosition + 1), ProducerInstr(Producer),
+          SingleUseRegions({1, 0, 0}) {}
+
+    // Returns false if adding a new single use producer failed. This happens
+    // because it could not be encoded, either because there is no room to
+    // encode another single use producer region or that this single use
+    // producer is too far away to encode the amount of instructions to skip.
+    bool tryAddProducer(const unsigned ProducerPosition, MachineInstr *MI) {
+      // Producer is too far away to encode into this instruction or another
+      // skip region is needed and SkipRegions.size() = 2 so there's no room for
+      // another skip region, therefore a new instruction is needed.
+      if (LastEncodedPositionEnd +
+              (MaxSkipRange * (MaxNumberOfSkipRegions - SkipRegions.size())) <
+          ProducerPosition)
+        return false;
+
+      // If a skip region is needed.
+      if (LastEncodedPositionEnd != ProducerPosition ||
+          !currentRegionHasSpace()) {
+        // If the current region is out of space therefore a skip region would
+        // be needed, but there is no room for another skip region.
+        if (SkipRegions.size() == MaxNumberOfSkipRegions)
+          return false;
+        skip(ProducerPosition);
+      }
+
+      SingleUseRegions[SkipRegions.size()]++;
+      LastEncodedPositionEnd = ProducerPosition + 1;
+      ProducerInstr = MI;
+      return true;
+    }
+
+    auto emit(const SIInstrInfo *SII) {
+      return BuildMI(*ProducerInstr->getParent(), ProducerInstr, DebugLoc(),
+                     SII->get(AMDGPU::S_SINGLEUSE_VDST))
+          .addImm(encodeImm());
+    }
+  };
 
 public:
   static char ID;
 
   AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
 
-  void emitSingleUseVDST(MachineInstr &MI) const {
-    // Mark the following instruction as a single-use producer:
-    //   s_singleuse_vdst { supr0: 1 }
-    BuildMI(*MI.getParent(), MI, DebugLoc(), SII->get(AMDGPU::S_SINGLEUSE_VDST))
-        .addImm(0x1);
+  void insertSingleUseInstructions(
+      ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const {
+    SmallVector<SingleUseInstruction> Instructions;
+
+    for (auto &[Position, MI] : SingleUseProducers) {
+      // Encode this position into the last single use instruction if possible.
+      if (Instructions.empty() ||
+          !Instructions.back().tryAddProducer(Position, MI)) {
+        // If not, add a new instruction.
+        Instructions.push_back(SingleUseInstruction(Position, MI));
+      }
+    }
+
+    for (auto &Instruction : Instructions)
+      Instruction.emit(SII);
   }
 
   bool runOnMachineFunction(MachineFunction &MF) override {
@@ -78,6 +173,10 @@ public:
         }
       }
 
+      SmallVector<std::pair<unsigned, MachineInstr *>>
+          SingleUseProducerPositions;
+
+      unsigned VALUInstrCount = 0;
       for (MachineInstr &MI : reverse(MBB.instrs())) {
         // All registers in all operands need to be single use for an
         // instruction to be marked as a single use producer.
@@ -119,13 +218,16 @@ public:
           for (auto &UsedReg : RegisterUseCount)
             UsedReg.second = 2;
         }
-        if (AllProducerOperandsAreSingleUse && SIInstrInfo::isVALU(MI)) {
-          // TODO: Replace with candidate logging for instruction grouping
-          // later.
-          emitSingleUseVDST(MI);
+
+        if (!SIInstrInfo::isVALU(MI))
+          continue;
+        if (AllProducerOperandsAreSingleUse) {
+          SingleUseProducerPositions.push_back({VALUInstrCount, &MI});
           InstructionEmitted = true;
         }
+        VALUInstrCount++;
       }
+      insertSingleUseInstructions(SingleUseProducerPositions);
     }
     return InstructionEmitted;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 5b7fa13f2e83..160a17584ca3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -854,8 +854,9 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
     if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
       if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
-        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
-        if (CCmp->isNullValue()) {
+        Constant *CCmp = ConstantFoldCompareInstOperands(
+            (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
+        if (CCmp && CCmp->isNullValue()) {
           return IC.replaceInstUsesWith(
               II, IC.Builder.CreateSExt(CCmp, II.getType()));
         }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index e13c13913d4e..b48a09489653 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -63,6 +63,7 @@ void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
                                         BlockFrequencyInfo *BFI) {
   MRI = &MF.getRegInfo();
   Subtarget = &MF.getSubtarget<GCNSubtarget>();
+  Subtarget->checkSubtargetFeatures(MF.getFunction());
   InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 6cd93abff1a4..bd7bf78c4c0b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2919,7 +2919,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
 
   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
     if (!MFI->isModuleEntryFunction() &&
-        !GV->getName().equals("llvm.amdgcn.module.lds")) {
+        GV->getName() != "llvm.amdgcn.module.lds") {
       const Function &Fn = MF.getFunction();
       DiagnosticInfoUnsupported BadLDSDecl(
         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index c8bf9dd39e38..2c7163a77537 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -212,6 +212,7 @@
 #define DEBUG_TYPE "amdgpu-lower-module-lds"
 
 using namespace llvm;
+using namespace AMDGPU;
 
 namespace {
 
@@ -234,17 +235,6 @@ cl::opt<LoweringKind> LoweringKindLoc(
         clEnumValN(LoweringKind::hybrid, "hybrid",
                    "Lower via mixture of above strategies")));
 
-bool isKernelLDS(const Function *F) {
-  // Some weirdness here. AMDGPU::isKernelCC does not call into
-  // AMDGPU::isKernel with the calling conv, it instead calls into
-  // isModuleEntryFunction which returns true for more calling conventions
-  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
-  // There's also a test that checks that the LDS lowering does not hit on
-  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
-  // Putting LDS in the name of the function to draw attention to this.
-  return AMDGPU::isKernel(F->getCallingConv());
-}
-
 template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
   llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) {
     return L->getName() < R->getName();
@@ -305,183 +295,9 @@ class AMDGPULowerModuleLDS {
         Decl, {}, {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
   }
 
-  static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
-    // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
-    // global may have uses from multiple different functions as a result.
-    // This pass specialises LDS variables with respect to the kernel that
-    // allocates them.
-
-    // This is semantically equivalent to (the unimplemented as slow):
-    // for (auto &F : M.functions())
-    //   for (auto &BB : F)
-    //     for (auto &I : BB)
-    //       for (Use &Op : I.operands())
-    //         if (constantExprUsesLDS(Op))
-    //           replaceConstantExprInFunction(I, Op);
-
-    SmallVector<Constant *> LDSGlobals;
-    for (auto &GV : M.globals())
-      if (AMDGPU::isLDSVariableToLower(GV))
-        LDSGlobals.push_back(&GV);
-
-    return convertUsersOfConstantsToInstructions(LDSGlobals);
-  }
-
 public:
   AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
 
-  using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
-
-  using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;
-
-  static void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
-                                     FunctionVariableMap &kernels,
-                                     FunctionVariableMap &functions) {
-
-    // Get uses from the current function, excluding uses by called functions
-    // Two output variables to avoid walking the globals list twice
-    for (auto &GV : M.globals()) {
-      if (!AMDGPU::isLDSVariableToLower(GV)) {
-        continue;
-      }
-
-      for (User *V : GV.users()) {
-        if (auto *I = dyn_cast<Instruction>(V)) {
-          Function *F = I->getFunction();
-          if (isKernelLDS(F)) {
-            kernels[F].insert(&GV);
-          } else {
-            functions[F].insert(&GV);
-          }
-        }
-      }
-    }
-  }
-
-  struct LDSUsesInfoTy {
-    FunctionVariableMap direct_access;
-    FunctionVariableMap indirect_access;
-  };
-
-  static LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {
-
-    FunctionVariableMap direct_map_kernel;
-    FunctionVariableMap direct_map_function;
-    getUsesOfLDSByFunction(CG, M, direct_map_kernel, direct_map_function);
-
-    // Collect variables that are used by functions whose address has escaped
-    DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
-    for (Function &F : M.functions()) {
-      if (!isKernelLDS(&F))
-        if (F.hasAddressTaken(nullptr,
-                              /* IgnoreCallbackUses */ false,
-                              /* IgnoreAssumeLikeCalls */ false,
-                              /* IgnoreLLVMUsed */ true,
-                              /* IgnoreArcAttachedCall */ false)) {
-          set_union(VariablesReachableThroughFunctionPointer,
-                    direct_map_function[&F]);
-        }
-    }
-
-    auto functionMakesUnknownCall = [&](const Function *F) -> bool {
-      assert(!F->isDeclaration());
-      for (const CallGraphNode::CallRecord &R : *CG[F]) {
-        if (!R.second->getFunction()) {
-          return true;
-        }
-      }
-      return false;
-    };
-
-    // Work out which variables are reachable through function calls
-    FunctionVariableMap transitive_map_function = direct_map_function;
-
-    // If the function makes any unknown call, assume the worst case that it can
-    // access all variables accessed by functions whose address escaped
-    for (Function &F : M.functions()) {
-      if (!F.isDeclaration() && functionMakesUnknownCall(&F)) {
-        if (!isKernelLDS(&F)) {
-          set_union(transitive_map_function[&F],
-                    VariablesReachableThroughFunctionPointer);
-        }
-      }
-    }
-
-    // Direct implementation of collecting all variables reachable from each
-    // function
-    for (Function &Func : M.functions()) {
-      if (Func.isDeclaration() || isKernelLDS(&Func))
-        continue;
-
-      DenseSet<Function *> seen; // catches cycles
-      SmallVector<Function *, 4> wip{&Func};
-
-      while (!wip.empty()) {
-        Function *F = wip.pop_back_val();
-
-        // Can accelerate this by referring to transitive map for functions that
-        // have already been computed, with more care than this
-        set_union(transitive_map_function[&Func], direct_map_function[F]);
-
-        for (const CallGraphNode::CallRecord &R : *CG[F]) {
-          Function *ith = R.second->getFunction();
-          if (ith) {
-            if (!seen.contains(ith)) {
-              seen.insert(ith);
-              wip.push_back(ith);
-            }
-          }
-        }
-      }
-    }
-
-    // direct_map_kernel lists which variables are used by the kernel
-    // find the variables which are used through a function call
-    FunctionVariableMap indirect_map_kernel;
-
-    for (Function &Func : M.functions()) {
-      if (Func.isDeclaration() || !isKernelLDS(&Func))
-        continue;
-
-      for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
-        Function *ith = R.second->getFunction();
-        if (ith) {
-          set_union(indirect_map_kernel[&Func], transitive_map_function[ith]);
-        } else {
-          set_union(indirect_map_kernel[&Func],
-                    VariablesReachableThroughFunctionPointer);
-        }
-      }
-    }
-
-    // Verify that we fall into one of 2 cases:
-    //    - All variables are absolute: this is a re-run of the pass
-    //      so we don't have anything to do.
-    //    - No variables are absolute.
-    std::optional<bool> HasAbsoluteGVs;
-    for (auto &Map : {direct_map_kernel, indirect_map_kernel}) {
-      for (auto &[Fn, GVs] : Map) {
-        for (auto *GV : GVs) {
-          bool IsAbsolute = GV->isAbsoluteSymbolRef();
-          if (HasAbsoluteGVs.has_value()) {
-            if (*HasAbsoluteGVs != IsAbsolute) {
-              report_fatal_error(
-                  "Module cannot mix absolute and non-absolute LDS GVs");
-            }
-          } else
-            HasAbsoluteGVs = IsAbsolute;
-        }
-      }
-    }
-
-    // If we only had absolute GVs, we have nothing to do, return an empty
-    // result.
-    if (HasAbsoluteGVs && *HasAbsoluteGVs)
-      return {FunctionVariableMap(), FunctionVariableMap()};
-
-    return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
-  }
-
   struct LDSVariableReplacement {
     GlobalVariable *SGV = nullptr;
     DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 82e17ddad851..f36374b08b34 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -66,15 +66,14 @@ public:
   struct FMinFMaxLegacyInfo {
     Register LHS;
     Register RHS;
-    Register True;
-    Register False;
     CmpInst::Predicate Pred;
   };
 
   // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
-  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info) const;
-  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
-                                         const FMinFMaxLegacyInfo &Info) const;
+  bool matchFMinFMaxLegacy(MachineInstr &MI, MachineInstr &FCmp,
+                           FMinFMaxLegacyInfo &Info) const;
+  void applySelectFCmpToFMinFMaxLegacy(MachineInstr &MI,
+                                       const FMinFMaxLegacyInfo &Info) const;
 
   bool matchUCharToFloat(MachineInstr &MI) const;
   void applyUCharToFloat(MachineInstr &MI) const;
@@ -109,11 +108,10 @@ public:
 
   // Find the s_mul_u64 instructions where the higher bits are either
   // zero-extended or sign-extended.
-  bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;
   // Replace the s_mul_u64 instructions with S_MUL_I64_I32_PSEUDO if the higher
   // 33 bits are sign extended and with S_MUL_U64_U32_PSEUDO if the higher 32
   // bits are zero extended.
-  void applyCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;
+  bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;
 
 private:
 #define GET_GICOMBINER_CLASS_MEMBERS
@@ -161,87 +159,48 @@ bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
 }
 
 bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
-    MachineInstr &MI, FMinFMaxLegacyInfo &Info) const {
-  // FIXME: Type predicate on pattern
-  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
-    return false;
-
-  Register Cond = MI.getOperand(1).getReg();
-  if (!MRI.hasOneNonDBGUse(Cond) ||
-      !mi_match(Cond, MRI,
-                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
+    MachineInstr &MI, MachineInstr &FCmp, FMinFMaxLegacyInfo &Info) const {
+  if (!MRI.hasOneNonDBGUse(FCmp.getOperand(0).getReg()))
     return false;
 
-  Info.True = MI.getOperand(2).getReg();
-  Info.False = MI.getOperand(3).getReg();
+  Info.Pred =
+      static_cast<CmpInst::Predicate>(FCmp.getOperand(1).getPredicate());
+  Info.LHS = FCmp.getOperand(2).getReg();
+  Info.RHS = FCmp.getOperand(3).getReg();
+  Register True = MI.getOperand(2).getReg();
+  Register False = MI.getOperand(3).getReg();
 
   // TODO: Handle case where the the selected value is an fneg and the compared
   // constant is the negation of the selected value.
-  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
-      !(Info.LHS == Info.False && Info.RHS == Info.True))
+  if ((Info.LHS != True || Info.RHS != False) &&
+      (Info.LHS != False || Info.RHS != True))
     return false;
 
-  switch (Info.Pred) {
-  case CmpInst::FCMP_FALSE:
-  case CmpInst::FCMP_OEQ:
-  case CmpInst::FCMP_ONE:
-  case CmpInst::FCMP_ORD:
-  case CmpInst::FCMP_UNO:
-  case CmpInst::FCMP_UEQ:
-  case CmpInst::FCMP_UNE:
-  case CmpInst::FCMP_TRUE:
-    return false;
-  default:
-    return true;
-  }
+  // Invert the predicate if necessary so that the apply function can assume
+  // that the select operands are the same as the fcmp operands.
+  // (select (fcmp P, L, R), R, L) -> (select (fcmp !P, L, R), L, R)
+  if (Info.LHS != True)
+    Info.Pred = CmpInst::getInversePredicate(Info.Pred);
+
+  // Only match </<=/>=/> not ==/!= etc.
+  return Info.Pred != CmpInst::getSwappedPredicate(Info.Pred);
 }
 
-void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinToFMaxLegacy(
+void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinFMaxLegacy(
     MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const {
-  B.setInstrAndDebugLoc(MI);
-  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
-    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
-  };
-
-  switch (Info.Pred) {
-  case CmpInst::FCMP_ULT:
-  case CmpInst::FCMP_ULE:
-    if (Info.LHS == Info.True)
-      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
-    else
-      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
-    break;
-  case CmpInst::FCMP_OLE:
-  case CmpInst::FCMP_OLT: {
+  unsigned Opc = (Info.Pred & CmpInst::FCMP_OGT) ? AMDGPU::G_AMDGPU_FMAX_LEGACY
+                                                 : AMDGPU::G_AMDGPU_FMIN_LEGACY;
+  Register X = Info.LHS;
+  Register Y = Info.RHS;
+  if (Info.Pred == CmpInst::getUnorderedPredicate(Info.Pred)) {
     // We need to permute the operands to get the correct NaN behavior. The
     // selected operand is the second one based on the failing compare with NaN,
     // so permute it based on the compare type the hardware uses.
-    if (Info.LHS == Info.True)
-      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
-    else
-      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
-    break;
-  }
-  case CmpInst::FCMP_UGE:
-  case CmpInst::FCMP_UGT: {
-    if (Info.LHS == Info.True)
-      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
-    else
-      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
-    break;
-  }
-  case CmpInst::FCMP_OGT:
-  case CmpInst::FCMP_OGE: {
-    if (Info.LHS == Info.True)
-      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
-    else
-      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
-    break;
-  }
-  default:
-    llvm_unreachable("predicate should not have matched");
+    std::swap(X, Y);
   }
 
+  B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
+
   MI.eraseFromParent();
 }
 
@@ -267,8 +226,6 @@ bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
 
 void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
     MachineInstr &MI) const {
-  B.setInstrAndDebugLoc(MI);
-
   const LLT S32 = LLT::scalar(32);
 
   Register DstReg = MI.getOperand(0).getReg();
@@ -387,7 +344,6 @@ bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
 
 void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
     MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const {
-  B.setInstrAndDebugLoc(MI);
   unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
 
   const LLT S32 = LLT::scalar(32);
@@ -479,11 +435,6 @@ bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
   return false;
 }
 
-void AMDGPUPostLegalizerCombinerImpl::applyCombine_s_mul_u64(
-    MachineInstr &MI, unsigned &NewOpcode) const {
-  Helper.replaceOpcodeWith(MI, NewOpcode);
-}
-
 // Pass boilerplate
 // ================
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index f14d970f1e5d..3f01a328afaf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -182,8 +182,6 @@ void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16(
          LLT::scalar(64));
   const LLT S32 = LLT::scalar(32);
 
-  B.setInstrAndDebugLoc(MI);
-
   auto Unmerge = B.buildUnmerge(S32, Src);
 
   assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 20e1aaa5419a..35abd6eddde8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -350,7 +350,6 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
 
 void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
                                            Register &Reg) const {
-  B.setInstrAndDebugLoc(MI);
   B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg},
                MI.getFlags());
   MI.eraseFromParent();
@@ -358,7 +357,6 @@ void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
 
 void AMDGPURegBankCombinerImpl::applyMed3(MachineInstr &MI,
                                           Med3MatchInfo &MatchInfo) const {
-  B.setInstrAndDebugLoc(MI);
   B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)},
                {getAsVgpr(MatchInfo.Val0), getAsVgpr(MatchInfo.Val1),
                 getAsVgpr(MatchInfo.Val2)},
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 36e453f04426..94ee4ac78142 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -25,6 +25,7 @@
 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/MDBuilder.h"
@@ -165,6 +166,15 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
   return *this;
 }
 
+void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
+  LLVMContext &Ctx = F.getContext();
+  if (hasFeature(AMDGPU::FeatureWavefrontSize32) ==
+      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
+    Ctx.diagnose(DiagnosticInfoUnsupported(
+        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
+  }
+}
+
 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}
 
 bool AMDGPUSubtarget::useRealTrue16Insts() const {
@@ -664,29 +674,8 @@ bool GCNSubtarget::useVGPRIndexMode() const {
 bool GCNSubtarget::useAA() const { return UseAA; }
 
 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
-  if (getGeneration() >= AMDGPUSubtarget::GFX10)
-    return getMaxWavesPerEU();
-
-  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    if (SGPRs <= 80)
-      return 10;
-    if (SGPRs <= 88)
-      return 9;
-    if (SGPRs <= 100)
-      return 8;
-    return 7;
-  }
-  if (SGPRs <= 48)
-    return 10;
-  if (SGPRs <= 56)
-    return 9;
-  if (SGPRs <= 64)
-    return 8;
-  if (SGPRs <= 72)
-    return 7;
-  if (SGPRs <= 80)
-    return 6;
-  return 5;
+  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
+                                                   getGeneration());
 }
 
 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 4d036fdea63b..d47a5f8ebb81 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -3910,8 +3910,8 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst,
   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
       AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
   int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
-  int RSrcOpName = Desc.TSFlags & SIInstrFlags::MIMG ? AMDGPU::OpName::srsrc
-                                                     : AMDGPU::OpName::rsrc;
+  int RSrcOpName = (Desc.TSFlags & SIInstrFlags::MIMG) ? AMDGPU::OpName::srsrc
+                                                       : AMDGPU::OpName::rsrc;
   int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RSrcOpName);
   int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim);
   int A16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::a16);
@@ -7436,7 +7436,8 @@ AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg,
     Op.IsDefined = true;
     Op.Loc = getLoc();
     if (isToken(AsmToken::Identifier) &&
-        (Op.Val = getMsgOpId(Msg.Val, getTokenStr())) >= 0) {
+        (Op.Val = getMsgOpId(Msg.Val, getTokenStr(), getSTI())) !=
+            OPR_ID_UNKNOWN) {
       lex(); // skip operation name
     } else if (!parseExpr(Op.Val, "an operation name")) {
       return false;
@@ -7484,7 +7485,10 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
     return false;
   }
   if (!isValidMsgOp(Msg.Val, Op.Val, getSTI(), Strict)) {
-    Error(Op.Loc, "invalid operation id");
+    if (Op.Val == OPR_ID_UNSUPPORTED)
+      Error(Op.Loc, "specified operation id is not supported on this GPU");
+    else
+      Error(Op.Loc, "invalid operation id");
     return false;
   }
   if (Strict && !msgSupportsStream(Msg.Val, Op.Val, getSTI()) &&
@@ -8395,12 +8399,16 @@ bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
     AGVK VK = StringSwitch<AGVK>(TokenId)
                   .Case("max", AGVK::AGVK_Max)
                   .Case("or", AGVK::AGVK_Or)
+                  .Case("extrasgprs", AGVK::AGVK_ExtraSGPRs)
+                  .Case("totalnumvgprs", AGVK::AGVK_TotalNumVGPRs)
+                  .Case("alignto", AGVK::AGVK_AlignTo)
+                  .Case("occupancy", AGVK::AGVK_Occupancy)
                   .Default(AGVK::AGVK_None);
 
     if (VK != AGVK::AGVK_None && peekToken().is(AsmToken::LParen)) {
       SmallVector<const MCExpr *, 4> Exprs;
       uint64_t CommaCount = 0;
-      lex(); // Eat 'max'/'or'
+      lex(); // Eat Arg ('or', 'max', 'occupancy', etc.)
       lex(); // Eat '('
       while (true) {
         if (trySkipToken(AsmToken::RParen)) {
@@ -8634,8 +8642,8 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
   }
 
   if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
-    assert(AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in));
-    Inst.addOperand(Inst.getOperand(0));
+    if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in))
+      Inst.addOperand(Inst.getOperand(0));
     addOptionalImmOperand(Inst, Operands, OptionalIdx,
                           AMDGPUOperand::ImmTyByteSel);
   }
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 8053d89aeb0a..8eaa113ac181 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -2456,13 +2456,19 @@ class get_BUF_ps<string name> {
 
 // gfx11 instruction that accept both old and new assembler name.
 class Mnem_gfx11_gfx12 <string mnemonic, string real_name> :
-  MnemonicAlias<mnemonic, real_name>, Requires<[isGFX11Plus]>;
+    AMDGPUMnemonicAlias<mnemonic, real_name> {
+  let AssemblerPredicate = isGFX11Plus;
+}
 
 class Mnem_gfx11 <string mnemonic, string real_name> :
-  MnemonicAlias<mnemonic, real_name>, Requires<[isGFX11Only]>;
+    AMDGPUMnemonicAlias<mnemonic, real_name> {
+  let AssemblerPredicate = isGFX11Only;
+}
 
 class Mnem_gfx12 <string mnemonic, string real_name> :
-  MnemonicAlias<mnemonic, real_name>, Requires<[isGFX12Plus]>;
+    AMDGPUMnemonicAlias<mnemonic, real_name> {
+  let AssemblerPredicate = isGFX12Plus;
+}
 
 multiclass MUBUF_Real_AllAddr_gfx11_Impl2<bits<8> op, string real_name> {
   defm _BOTHEN : MUBUF_Real_gfx11<op, real_name>;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d63f04ab6d4c..f2825c48fcec 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1213,12 +1213,14 @@ class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
 
 multiclass DS_Real_gfx12<bits<8> op, string name = !tolower(NAME), bit needAlias = true> {
   defvar ps = !cast<DS_Pseudo>(NAME);
-  let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in
-    def _gfx12 :
-      Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, ps, SIEncodingFamily.GFX12,
+  let AssemblerPredicate = isGFX12Plus in {
+    let DecoderNamespace = "GFX12" in
+      def _gfx12 :
+        Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, ps, SIEncodingFamily.GFX12,
                                                name, /*hasGDS=*/false>;
-  if !and(needAlias, !ne(ps.Mnemonic, name)) then
-    def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
+    if !and(needAlias, !ne(ps.Mnemonic, name)) then
+      def : AMDGPUMnemonicAlias<ps.Mnemonic, name>;
+  } // End AssemblerPredicate
 }
 
 defm DS_MIN_F32           : DS_Real_gfx12<0x012, "ds_min_num_f32">;
@@ -1239,10 +1241,12 @@ defm DS_PK_ADD_BF16       : DS_Real_gfx12<0x09b>;
 defm DS_PK_ADD_RTN_BF16   : DS_Real_gfx12<0x0ab>;
 
 // New aliases added in GFX12 without renaming the instructions.
-def : MnemonicAlias<"ds_subrev_u32", "ds_rsub_u32">, Requires<[isGFX12Plus]>;
-def : MnemonicAlias<"ds_subrev_rtn_u32", "ds_rsub_rtn_u32">, Requires<[isGFX12Plus]>;
-def : MnemonicAlias<"ds_subrev_u64", "ds_rsub_u64">, Requires<[isGFX12Plus]>;
-def : MnemonicAlias<"ds_subrev_rtn_u64", "ds_rsub_rtn_u64">, Requires<[isGFX12Plus]>;
+let AssemblerPredicate = isGFX12Plus in {
+  def : AMDGPUMnemonicAlias<"ds_subrev_u32", "ds_rsub_u32">;
+  def : AMDGPUMnemonicAlias<"ds_subrev_rtn_u32", "ds_rsub_rtn_u32">;
+  def : AMDGPUMnemonicAlias<"ds_subrev_u64", "ds_rsub_u64">;
+  def : AMDGPUMnemonicAlias<"ds_subrev_rtn_u64", "ds_rsub_rtn_u64">;
+}
 
 //===----------------------------------------------------------------------===//
 // GFX11.
@@ -1250,12 +1254,14 @@ def : MnemonicAlias<"ds_subrev_rtn_u64", "ds_rsub_rtn_u64">, Requires<[isGFX12Pl
 
 multiclass DS_Real_gfx11<bits<8> op, string name = !tolower(NAME)> {
   defvar ps = !cast<DS_Pseudo>(NAME);
-  let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in
-    def _gfx11 :
-      Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, ps, SIEncodingFamily.GFX11,
+  let AssemblerPredicate = isGFX11Only in {
+    let DecoderNamespace = "GFX11" in
+      def _gfx11 :
+        Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, ps, SIEncodingFamily.GFX11,
                                                name>;
-  if !ne(ps.Mnemonic, name) then
-    def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
+    if !ne(ps.Mnemonic, name) then
+      def : AMDGPUMnemonicAlias<ps.Mnemonic, name>;
+  } // End AssemblerPredicate
 }
 
 multiclass DS_Real_gfx11_gfx12<bits<8> op, string name = !tolower(NAME)>
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 8fd36b84a00c..05063c6c321a 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -921,8 +921,8 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
                                             AMDGPU::OpName::vdata);
   int VAddr0Idx =
       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
-  int RsrcOpName = TSFlags & SIInstrFlags::MIMG ? AMDGPU::OpName::srsrc
-                                                : AMDGPU::OpName::rsrc;
+  int RsrcOpName = (TSFlags & SIInstrFlags::MIMG) ? AMDGPU::OpName::srsrc
+                                                  : AMDGPU::OpName::rsrc;
   int RsrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), RsrcOpName);
   int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                             AMDGPU::OpName::dmask);
diff --git a/llvm/lib/Target/AMDGPU/EXPInstructions.td b/llvm/lib/Target/AMDGPU/EXPInstructions.td
index b73b83031af0..5e426b5acd50 100644
--- a/llvm/lib/Target/AMDGPU/EXPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EXPInstructions.td
@@ -117,12 +117,15 @@ multiclass EXP_Real_gfx11 {
 multiclass VEXPORT_Real_gfx12 {
   defvar ps = !cast<EXP_Pseudo>(NAME);
   def _gfx12 : EXP_Real_Row<ps, SIEncodingFamily.GFX12, "export">,
-    EXPe_Row, MnemonicAlias<"exp", "export">, Requires<[isGFX12Plus, HasExportInsts]> {
+    EXPe_Row {
     let AssemblerPredicate = isGFX12Only;
     let DecoderNamespace = "GFX12";
     let row = ps.row;
     let done = ps.done;
   }
+  def : AMDGPUMnemonicAlias<"exp", "export"> {
+    let AssemblerPredicate = isGFX12Plus;
+  }
 }
 
 defm EXP          : EXP_Real_gfx11, VEXPORT_Real_gfx12;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 27d5616565f2..377d48a48e9b 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -2357,7 +2357,9 @@ multiclass FLAT_Real_gfx11 <bits<7> op,
 multiclass FLAT_Aliases_gfx11<string name> {
   defvar ps = get_FLAT_ps<NAME>;
   if !ne(ps.Mnemonic, name) then
-    def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
+    def : AMDGPUMnemonicAlias<ps.Mnemonic, name> {
+      let AssemblerPredicate = isGFX11Only;
+    }
 }
 
 multiclass FLAT_Real_Base_gfx11<bits<7> op,
@@ -2544,10 +2546,12 @@ multiclass VFLAT_Real_gfx12 <bits<8> op, string name = get_FLAT_ps<NAME>.Mnemoni
 
 multiclass VFLAT_Aliases_gfx12<string name, string alias = name> {
   defvar ps = get_FLAT_ps<NAME>;
-  if !ne(ps.Mnemonic, name) then
-    def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Only]>;
-  if !ne(alias, name) then
-    def : MnemonicAlias<alias, name>, Requires<[isGFX12Only]>;
+  let AssemblerPredicate = isGFX12Only in {
+    if !ne(ps.Mnemonic, name) then
+      def : AMDGPUMnemonicAlias<ps.Mnemonic, name>;
+    if !ne(alias, name) then
+      def : AMDGPUMnemonicAlias<alias, name>;
+  }
 }
 
 multiclass VFLAT_Real_Base_gfx12<bits<8> op,
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 91733c2933b4..0ac079c69e60 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -140,7 +140,8 @@ bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
   if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
       !hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
       !hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) ||
-      !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0)) {
+      !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0) ||
+      !hasNoImmOrEqual(MI, AMDGPU::OpName::byte_sel, 0)) {
     LLVM_DEBUG(dbgs() << "  Inst has non-default modifiers\n");
     return false;
   }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index be337e0b2192..b7548671f2c5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -250,6 +250,10 @@ public:
   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
                                                    StringRef GPU, StringRef FS);
 
+  /// Diagnose inconsistent subtarget features before attempting to codegen
+  /// function \p F.
+  void checkSubtargetFeatures(const Function &F) const;
+
   const SIInstrInfo *getInstrInfo() const override {
     return &InstrInfo;
   }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index 4578c33d92dc..159664faf983 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -7,6 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUMCExpr.h"
+#include "GCNSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/IR/Function.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
@@ -16,6 +19,7 @@
 #include <optional>
 
 using namespace llvm;
+using namespace llvm::AMDGPU;
 
 AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind,
                                            ArrayRef<const MCExpr *> Args,
@@ -61,6 +65,18 @@ void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS,
   case AGVK_Max:
     OS << "max(";
     break;
+  case AGVK_ExtraSGPRs:
+    OS << "extrasgprs(";
+    break;
+  case AGVK_TotalNumVGPRs:
+    OS << "totalnumvgprs(";
+    break;
+  case AGVK_AlignTo:
+    OS << "alignto(";
+    break;
+  case AGVK_Occupancy:
+    OS << "occupancy(";
+    break;
   }
   for (auto It = Args.begin(); It != Args.end(); ++It) {
     (*It)->print(OS, MAI, /*InParens=*/false);
@@ -82,10 +98,151 @@ static int64_t op(AMDGPUVariadicMCExpr::VariadicKind Kind, int64_t Arg1,
   }
 }
 
+bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res,
+                                              const MCAsmLayout *Layout,
+                                              const MCFixup *Fixup) const {
+  auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
+    MCValue MCVal;
+    if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
+        !MCVal.isAbsolute())
+      return false;
+
+    ConstantValue = MCVal.getConstant();
+    return true;
+  };
+
+  assert(Args.size() == 3 &&
+         "AMDGPUVariadic Argument count incorrect for ExtraSGPRs");
+  const MCSubtargetInfo *STI = Ctx.getSubtargetInfo();
+  uint64_t VCCUsed = 0, FlatScrUsed = 0, XNACKUsed = 0;
+
+  bool Success = TryGetMCExprValue(Args[2], XNACKUsed);
+
+  assert(Success && "Arguments 3 for ExtraSGPRs should be a known constant");
+  if (!Success || !TryGetMCExprValue(Args[0], VCCUsed) ||
+      !TryGetMCExprValue(Args[1], FlatScrUsed))
+    return false;
+
+  uint64_t ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
+      STI, (bool)VCCUsed, (bool)FlatScrUsed, (bool)XNACKUsed);
+  Res = MCValue::get(ExtraSGPRs);
+  return true;
+}
+
+bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res,
+                                                const MCAsmLayout *Layout,
+                                                const MCFixup *Fixup) const {
+  auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
+    MCValue MCVal;
+    if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
+        !MCVal.isAbsolute())
+      return false;
+
+    ConstantValue = MCVal.getConstant();
+    return true;
+  };
+  assert(Args.size() == 2 &&
+         "AMDGPUVariadic Argument count incorrect for TotalNumVGPRs");
+  const MCSubtargetInfo *STI = Ctx.getSubtargetInfo();
+  uint64_t NumAGPR = 0, NumVGPR = 0;
+
+  bool Has90AInsts = AMDGPU::isGFX90A(*STI);
+
+  if (!TryGetMCExprValue(Args[0], NumAGPR) ||
+      !TryGetMCExprValue(Args[1], NumVGPR))
+    return false;
+
+  uint64_t TotalNum = Has90AInsts && NumAGPR ? alignTo(NumVGPR, 4) + NumAGPR
+                                             : std::max(NumVGPR, NumAGPR);
+  Res = MCValue::get(TotalNum);
+  return true;
+}
+
+bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res,
+                                           const MCAsmLayout *Layout,
+                                           const MCFixup *Fixup) const {
+  auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
+    MCValue MCVal;
+    if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
+        !MCVal.isAbsolute())
+      return false;
+
+    ConstantValue = MCVal.getConstant();
+    return true;
+  };
+
+  assert(Args.size() == 2 &&
+         "AMDGPUVariadic Argument count incorrect for AlignTo");
+  uint64_t Value = 0, Align = 0;
+  if (!TryGetMCExprValue(Args[0], Value) || !TryGetMCExprValue(Args[1], Align))
+    return false;
+
+  Res = MCValue::get(alignTo(Value, Align));
+  return true;
+}
+
+bool AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res,
+                                             const MCAsmLayout *Layout,
+                                             const MCFixup *Fixup) const {
+  auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
+    MCValue MCVal;
+    if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
+        !MCVal.isAbsolute())
+      return false;
+
+    ConstantValue = MCVal.getConstant();
+    return true;
+  };
+  assert(Args.size() == 7 &&
+         "AMDGPUVariadic Argument count incorrect for Occupancy");
+  uint64_t InitOccupancy, MaxWaves, Granule, TargetTotalNumVGPRs, Generation,
+      NumSGPRs, NumVGPRs;
+
+  bool Success = true;
+  Success &= TryGetMCExprValue(Args[0], MaxWaves);
+  Success &= TryGetMCExprValue(Args[1], Granule);
+  Success &= TryGetMCExprValue(Args[2], TargetTotalNumVGPRs);
+  Success &= TryGetMCExprValue(Args[3], Generation);
+  Success &= TryGetMCExprValue(Args[4], InitOccupancy);
+
+  assert(Success && "Arguments 1 to 5 for Occupancy should be known constants");
+
+  if (!Success || !TryGetMCExprValue(Args[5], NumSGPRs) ||
+      !TryGetMCExprValue(Args[6], NumVGPRs))
+    return false;
+
+  unsigned Occupancy = InitOccupancy;
+  if (NumSGPRs)
+    Occupancy = std::min(
+        Occupancy, IsaInfo::getOccupancyWithNumSGPRs(
+                       NumSGPRs, MaxWaves,
+                       static_cast<AMDGPUSubtarget::Generation>(Generation)));
+  if (NumVGPRs)
+    Occupancy = std::min(Occupancy,
+                         IsaInfo::getNumWavesPerEUWithNumVGPRs(
+                             NumVGPRs, Granule, MaxWaves, TargetTotalNumVGPRs));
+
+  Res = MCValue::get(Occupancy);
+  return true;
+}
+
 bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl(
     MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const {
   std::optional<int64_t> Total;
 
+  switch (Kind) {
+  default:
+    break;
+  case AGVK_ExtraSGPRs:
+    return evaluateExtraSGPRs(Res, Layout, Fixup);
+  case AGVK_AlignTo:
+    return evaluateAlignTo(Res, Layout, Fixup);
+  case AGVK_TotalNumVGPRs:
+    return evaluateTotalNumVGPR(Res, Layout, Fixup);
+  case AGVK_Occupancy:
+    return evaluateOccupancy(Res, Layout, Fixup);
+  }
+
   for (const MCExpr *Arg : Args) {
     MCValue ArgRes;
     if (!Arg->evaluateAsRelocatable(ArgRes, Layout, Fixup) ||
@@ -113,3 +270,47 @@ MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const {
   }
   return nullptr;
 }
+
+/// Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed
+/// are unresolvable but needed for further MCExprs). Derived from
+/// implementation of IsaInfo::getNumExtraSGPRs in AMDGPUBaseInfo.cpp.
+///
+const AMDGPUVariadicMCExpr *
+AMDGPUVariadicMCExpr::createExtraSGPRs(const MCExpr *VCCUsed,
+                                       const MCExpr *FlatScrUsed,
+                                       bool XNACKUsed, MCContext &Ctx) {
+
+  return create(AGVK_ExtraSGPRs,
+                {VCCUsed, FlatScrUsed, MCConstantExpr::create(XNACKUsed, Ctx)},
+                Ctx);
+}
+
+const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createTotalNumVGPR(
+    const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx) {
+  return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx);
+}
+
+/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
+///
+/// Remove dependency on GCNSubtarget and depend only only the necessary values
+/// for said occupancy computation. Should match computeOccupancy implementation
+/// without passing \p STM on.
+const AMDGPUVariadicMCExpr *
+AMDGPUVariadicMCExpr::createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
+                                      const MCExpr *NumVGPRs,
+                                      const GCNSubtarget &STM, MCContext &Ctx) {
+  unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
+  unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM);
+  unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
+  unsigned Generation = STM.getGeneration();
+
+  auto CreateExpr = [&Ctx](unsigned Value) {
+    return MCConstantExpr::create(Value, Ctx);
+  };
+
+  return create(AGVK_Occupancy,
+                {CreateExpr(MaxWaves), CreateExpr(Granule),
+                 CreateExpr(TargetTotalNumVGPRs), CreateExpr(Generation),
+                 CreateExpr(InitOcc), NumSGPRs, NumVGPRs},
+                Ctx);
+}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index 238e0dea791b..f92350b59235 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -14,6 +14,9 @@
 
 namespace llvm {
 
+class Function;
+class GCNSubtarget;
+
 /// AMDGPU target specific variadic MCExpr operations.
 ///
 /// Takes in a minimum of 1 argument to be used with an operation. The supported
@@ -26,7 +29,15 @@ namespace llvm {
 ///
 class AMDGPUVariadicMCExpr : public MCTargetExpr {
 public:
-  enum VariadicKind { AGVK_None, AGVK_Or, AGVK_Max };
+  enum VariadicKind {
+    AGVK_None,
+    AGVK_Or,
+    AGVK_Max,
+    AGVK_ExtraSGPRs,
+    AGVK_TotalNumVGPRs,
+    AGVK_AlignTo,
+    AGVK_Occupancy
+  };
 
 private:
   VariadicKind Kind;
@@ -38,6 +49,15 @@ private:
                        MCContext &Ctx);
   ~AMDGPUVariadicMCExpr();
 
+  bool evaluateExtraSGPRs(MCValue &Res, const MCAsmLayout *Layout,
+                          const MCFixup *Fixup) const;
+  bool evaluateTotalNumVGPR(MCValue &Res, const MCAsmLayout *Layout,
+                            const MCFixup *Fixup) const;
+  bool evaluateAlignTo(MCValue &Res, const MCAsmLayout *Layout,
+                       const MCFixup *Fixup) const;
+  bool evaluateOccupancy(MCValue &Res, const MCAsmLayout *Layout,
+                         const MCFixup *Fixup) const;
+
 public:
   static const AMDGPUVariadicMCExpr *
   create(VariadicKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
@@ -52,6 +72,26 @@ public:
     return create(VariadicKind::AGVK_Max, Args, Ctx);
   }
 
+  static const AMDGPUVariadicMCExpr *createExtraSGPRs(const MCExpr *VCCUsed,
+                                                      const MCExpr *FlatScrUsed,
+                                                      bool XNACKUsed,
+                                                      MCContext &Ctx);
+
+  static const AMDGPUVariadicMCExpr *createTotalNumVGPR(const MCExpr *NumAGPR,
+                                                        const MCExpr *NumVGPR,
+                                                        MCContext &Ctx);
+
+  static const AMDGPUVariadicMCExpr *
+  createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx) {
+    return create(VariadicKind::AGVK_AlignTo, {Value, Align}, Ctx);
+  }
+
+  static const AMDGPUVariadicMCExpr *createOccupancy(unsigned InitOcc,
+                                                     const MCExpr *NumSGPRs,
+                                                     const MCExpr *NumVGPRs,
+                                                     const GCNSubtarget &STM,
+                                                     MCContext &Ctx);
+
   VariadicKind getKind() const { return Kind; }
   const MCExpr *getSubExpr(size_t Index) const;
 
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 23e8be0d5e45..351263d07976 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -963,11 +963,10 @@ class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterClass DataRC,
   let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe";
 }
 
-class VIMAGE_Atomic_gfx12_Renamed<mimgopc op, string opcode, string renamed,
+class VIMAGE_Atomic_gfx12_Renamed<mimgopc op, string renamed,
                                   RegisterClass DataRC, int num_addrs,
                                   bit enableDisasm = 0>
-   : VIMAGE_Atomic_gfx12<op, renamed, DataRC, num_addrs, enableDisasm>,
-     MnemonicAlias<opcode, renamed>, Requires<[isGFX12Plus, HasImageInsts]>;
+  : VIMAGE_Atomic_gfx12<op, renamed, DataRC, num_addrs, enableDisasm>;
 
 multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
                                       RegisterClass data_rc,
@@ -998,7 +997,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
         if !empty(renamed) then
           def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, enableDasm>;
         else
-          def _V1_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, asm, renamed, data_rc, 1, enableDasm>;
+          def _V1_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, renamed, data_rc, 1, enableDasm>;
       }
     }
     let VAddrDwords = 2 in {
@@ -1023,7 +1022,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
         if !empty(renamed) then
           def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, 0>;
         else
-          def _V2_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, asm, renamed, data_rc, 2, 0>;
+          def _V2_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, renamed, data_rc, 2, 0>;
       }
     }
     let VAddrDwords = 3 in {
@@ -1048,7 +1047,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
         if !empty(renamed) then
           def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, 0>;
         else
-          def _V3_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, asm, renamed, data_rc, 3, 0>;
+          def _V3_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, renamed, data_rc, 3, 0>;
       }
     }
     let VAddrDwords = 4 in {
@@ -1073,10 +1072,17 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
         if !empty(renamed) then
           def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, enableDasm>;
         else
-          def _V4_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, asm, renamed, data_rc, 4, enableDasm>;
+          def _V4_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, renamed, data_rc, 4, enableDasm>;
       }
     }
   }
+  if !and(op.HAS_GFX12, !not(!empty(renamed))) then
+    def : AMDGPUMnemonicAlias<asm, renamed> {
+      let AssemblerPredicate = isGFX12Plus;
+      bit IsAtomicRet; // Unused
+      MIMGBaseOpcode BaseOpcode; // Unused
+      int VDataDwords; // Unused
+    }
 }
 
 multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 58214f30bb8d..08e1d6b87b0d 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -336,8 +336,12 @@ bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
       // Split edge to make Def dominate Use
       FirstInsertionPt = SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
     }
-    IRBuilder<>(FirstInsertionPt->getParent(), FirstInsertionPt)
-        .CreateCall(EndCf, {Exec});
+    IRBuilder<> IRB(FirstInsertionPt->getParent(), FirstInsertionPt);
+    // TODO: StructurizeCFG 'Flow' blocks have debug locations from the
+    // condition, for now just avoid copying these DebugLocs so that stepping
+    // out of the then/else block in a debugger doesn't step to the condition.
+    IRB.SetCurrentDebugLocation(DebugLoc());
+    IRB.CreateCall(EndCf, {Exec});
   }
 
   return true;
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 1f0207ddb0eb..6d0e0b3f4de2 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -468,7 +468,6 @@ enum Id { // Message ID, width(4) [3:0].
 };
 
 enum Op { // Both GS and SYS operation IDs.
-  OP_UNKNOWN_ = -1,
   OP_SHIFT_ = 4,
   OP_NONE_ = 0,
   // Bits used for operation encoding
@@ -479,14 +478,12 @@ enum Op { // Both GS and SYS operation IDs.
   OP_GS_CUT = 1,
   OP_GS_EMIT = 2,
   OP_GS_EMIT_CUT = 3,
-  OP_GS_LAST_,
   OP_GS_FIRST_ = OP_GS_NOP,
   // SYS operations are encoded in bits 6:4
   OP_SYS_ECC_ERR_INTERRUPT = 1,
   OP_SYS_REG_RD = 2,
   OP_SYS_HOST_TRAP_ACK = 3,
   OP_SYS_TTRACE_PC = 4,
-  OP_SYS_LAST_,
   OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT,
 };
 
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index cb448aaafa4c..5c411a095587 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -2106,6 +2106,8 @@ bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
 
     for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
       MachineOperand &PhiMO = MI.getOperand(K);
+      if (!PhiMO.getSubReg())
+        continue;
       RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cb4efdc7cf65..0a3a56e9b3a0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -225,10 +225,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::SELECT, MVT::bf16, Promote);
     AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
 
-    // TODO: Could make these legal
-    setOperationAction(ISD::FABS, MVT::bf16, Expand);
-    setOperationAction(ISD::FNEG, MVT::bf16, Expand);
-    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
+    setOperationAction(ISD::FABS, MVT::bf16, Legal);
+    setOperationAction(ISD::FNEG, MVT::bf16, Legal);
+    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Legal);
 
     // We only need to custom lower because we can't specify an action for bf16
     // sources.
@@ -854,9 +853,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   if (Subtarget->hasPrefetch())
     setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
-  if (Subtarget->hasIEEEMinMax())
+  if (Subtarget->hasIEEEMinMax()) {
     setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
                        {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
+    setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
+                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
+                       Custom);
+  }
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN,
                      {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
@@ -877,6 +880,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
   setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
+  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
   setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
   setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
 
@@ -4059,6 +4063,91 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
   return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
 }
 
+SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+
+  SDValue NewMode = Op.getOperand(1);
+  assert(NewMode.getValueType() == MVT::i32);
+
+  // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
+  // hardware MODE.fp_round values.
+  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
+    uint32_t ClampedVal = std::min(
+        static_cast<uint32_t>(ConstMode->getZExtValue()),
+        static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
+    NewMode = DAG.getConstant(
+        AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
+  } else {
+    // If we know the input can only be one of the supported standard modes in
+    // the range 0-3, we can use a simplified mapping to hardware values.
+    KnownBits KB = DAG.computeKnownBits(NewMode);
+    const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
+    // The supported standard values are 0-3. The extended values start at 8. We
+    // need to offset by 4 if the value is in the extended range.
+
+    if (UseReducedTable) {
+      // Truncate to the low 32-bits.
+      SDValue BitTable = DAG.getConstant(
+          AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
+
+      SDValue Two = DAG.getConstant(2, SL, MVT::i32);
+      SDValue RoundModeTimesNumBits =
+          DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
+
+      NewMode =
+          DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
+
+      // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
+      // the table extracted bits into inline immediates.
+    } else {
+      // table_index = umin(value, value - 4)
+      // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
+      SDValue BitTable =
+          DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
+
+      SDValue Four = DAG.getConstant(4, SL, MVT::i32);
+      SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
+      SDValue IndexVal =
+          DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
+
+      SDValue Two = DAG.getConstant(2, SL, MVT::i32);
+      SDValue RoundModeTimesNumBits =
+          DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
+
+      SDValue TableValue =
+          DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
+      SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
+
+      // No need to mask out the high bits since the setreg will ignore them
+      // anyway.
+      NewMode = TruncTable;
+    }
+
+    // Insert a readfirstlane in case the value is a VGPR. We could do this
+    // earlier and keep more operations scalar, but that interferes with
+    // combining the source.
+    SDValue ReadFirstLaneID =
+        DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
+    NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
+                          ReadFirstLaneID, NewMode);
+  }
+
+  // N.B. The setreg will be later folded into s_round_mode on supported
+  // targets.
+  SDValue IntrinID =
+      DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
+  uint32_t BothRoundHwReg =
+      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
+  SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
+
+  SDValue SetReg =
+      DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
+                  IntrinID, RoundBothImm, NewMode);
+
+  return SetReg;
+}
+
 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
   if (Op->isDivergent())
     return SDValue();
@@ -5735,6 +5824,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FMUL:
   case ISD::FMINNUM_IEEE:
   case ISD::FMAXNUM_IEEE:
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:
   case ISD::UADDSAT:
   case ISD::USUBSAT:
   case ISD::SADDSAT:
@@ -5754,6 +5845,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerSTACKSAVE(Op, DAG);
   case ISD::GET_ROUNDING:
     return lowerGET_ROUNDING(Op, DAG);
+  case ISD::SET_ROUNDING:
+    return lowerSET_ROUNDING(Op, DAG);
   case ISD::PREFETCH:
     return lowerPREFETCH(Op, DAG);
   case ISD::FP_EXTEND:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 9856a2923d38..08aa2a599163 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -422,6 +422,7 @@ public:
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 76b90042d65f..08351c49b223 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -213,15 +213,13 @@ bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
       // Check if there is a FromCycle that contains SgprDef's basic block but
       // does not contain SuccToSinkTo and also has divergent exit condition.
       while (FromCycle && !FromCycle->contains(ToCycle)) {
-        // After structurize-cfg, there should be exactly one cycle exit.
-        SmallVector<MachineBasicBlock *, 1> ExitBlocks;
-        FromCycle->getExitBlocks(ExitBlocks);
-        assert(ExitBlocks.size() == 1);
-        assert(ExitBlocks[0]->getSinglePredecessor());
+        SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
+        FromCycle->getExitingBlocks(ExitingBlocks);
 
         // FromCycle has divergent exit condition.
-        if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) {
-          return false;
+        for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
+          if (hasDivergentBranch(ExitingBlock))
+            return false;
         }
 
         FromCycle = FromCycle->getParentCycle();
@@ -4459,7 +4457,8 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
 
   // Check output modifiers
   return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
-         !hasModifiersSet(MI, AMDGPU::OpName::clamp);
+         !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
+         !hasModifiersSet(MI, AMDGPU::OpName::byte_sel);
 }
 
 // Set VCC operand with all flags from \p Orig, except for setting it as
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 7a8b6c98fc36..0ed2f60ea66a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -16,6 +16,9 @@ class GCNPredicateControl : PredicateControl {
   Predicate VIAssemblerPredicate = isGFX8GFX9;
 }
 
+class AMDGPUMnemonicAlias<string From, string To, string VariantName = "">
+    : MnemonicAlias<From, To, VariantName>, GCNPredicateControl;
+
 // Except for the NONE field, this must be kept in sync with the
 // SIEncodingFamily enum in SIInstrInfo.cpp and the columns of the
 // getMCOpcodeGen table.
@@ -2306,8 +2309,9 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
   field bit IsWMMA = 0;
   field bit IsSWMMAC = 0;
 
-  field bit IsFP8 = 0;
+  field bit IsFP8SrcByteSel = 0;
   field bit IsFP8DstByteSel = 0;
+  field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);
 
   field bit HasDst = !ne(DstVT.Value, untyped.Value);
   field bit HasDst32 = HasDst;
@@ -2427,7 +2431,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
   field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
   field string AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
    HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers,
-   HasModifiers, DstVT, IsFP8DstByteSel>.ret;
+   HasModifiers, DstVT, IsFP8ByteSel>.ret;
   field string Asm64 = AsmVOP3Base;
   field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp, HasOpSel>.ret;
   field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cca8d96f29c0..f9e811f54d05 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1908,20 +1908,22 @@ def : GCNPat <
   (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000)))
 >;
 
+foreach fp16vt = [f16, bf16] in {
 def : GCNPat <
-  (UniformUnaryFrag<fneg> (f16 SReg_32:$src)),
+  (UniformUnaryFrag<fneg> (fp16vt SReg_32:$src)),
   (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000)))
 >;
 
 def : GCNPat <
-  (UniformUnaryFrag<fabs> (f16 SReg_32:$src)),
+  (UniformUnaryFrag<fabs> (fp16vt SReg_32:$src)),
   (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff)))
 >;
 
 def : GCNPat <
-  (UniformUnaryFrag<fneg> (fabs (f16 SReg_32:$src))),
+  (UniformUnaryFrag<fneg> (fabs (fp16vt SReg_32:$src))),
   (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
 >;
+} // End foreach fp16vt = ...
 
 def : GCNPat <
   (UniformUnaryFrag<fneg> (v2f16 SReg_32:$src)),
@@ -2030,20 +2032,22 @@ def : GCNPat <
   (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src)
 >;
 
+foreach fp16vt = [f16, bf16] in {
 def : GCNPat <
-  (fabs (f16 VGPR_32:$src)),
+  (fabs (fp16vt VGPR_32:$src)),
   (V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
 >;
 
 def : GCNPat <
-  (fneg (f16 VGPR_32:$src)),
+  (fneg (fp16vt VGPR_32:$src)),
   (V_XOR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src)
 >;
 
 def : GCNPat <
-  (fneg (fabs (f16 VGPR_32:$src))),
+  (fneg (fabs (fp16vt VGPR_32:$src))),
   (V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
 >;
+} // End foreach fp16vt = ...
 
 def : GCNPat <
   (fneg (v2f16 VGPR_32:$src)),
@@ -3162,50 +3166,34 @@ def : GCNPat <
   (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
 >;
 
-def : GCNPat <
-  (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 undef))),
-  (COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
->;
+foreach vecTy = [v2i16, v2f16] in {
 
-def : GCNPat <
-  (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 undef))),
-  (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
->;
+defvar Ty = vecTy.ElementType;
 
 def : GCNPat <
-  (v2f16 (build_vector f16:$src0, (f16 undef))),
-  (COPY $src0)
->;
-
-def : GCNPat <
-  (v2i16 (UniformBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))),
-  (S_LSHL_B32 SReg_32:$src1, (i32 16))
+  (vecTy (UniformBinFrag<build_vector> (Ty SReg_32:$src0), (Ty undef))),
+  (COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
 >;
 
 def : GCNPat <
-  (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 VGPR_32:$src1))),
-  (v2i16 (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1))
+  (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$src0), (Ty undef))),
+  (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
 >;
 
-
 def : GCNPat <
-  (v2f16 (UniformBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))),
+  (vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))),
   (S_LSHL_B32 SReg_32:$src1, (i32 16))
 >;
 
 def : GCNPat <
-  (v2f16 (DivergentBinFrag<build_vector> (f16 undef), (f16 VGPR_32:$src1))),
-  (v2f16 (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1))
+  (vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))),
+  (vecTy (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1))
 >;
+} // End foreach Ty = ...
 }
 
 let SubtargetPredicate = HasVOP3PInsts in {
 def : GCNPat <
-  (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))),
-  (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
->;
-
-def : GCNPat <
   (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))),
   (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
 >;
@@ -3223,18 +3211,17 @@ def : GCNPat <
   (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1)
 >;
 
-def : GCNPat <
-  (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))),
-  (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
->;
 
+foreach vecTy = [v2i16, v2f16] in {
 
-
-foreach Ty = [i16, f16] in {
-
-defvar vecTy = !if(!eq(Ty, i16), v2i16, v2f16);
+defvar Ty = vecTy.ElementType;
 defvar immzeroTy = !if(!eq(Ty, i16), immzero, fpimmzero);
 
+def : GCNPat <
+  (vecTy (UniformBinFrag<build_vector> (Ty SReg_32:$src0), (Ty SReg_32:$src1))),
+  (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
+>;
+
 // Take the lower 16 bits from each VGPR_32 and concat them
 def : GCNPat <
   (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
@@ -3338,13 +3325,15 @@ def : GCNPat <
                         (as_i1timm $bound_ctrl))
 >;
 
+foreach vt = Reg64Types.types in {
 def : GCNPat <
-  (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
+  (vt (int_amdgcn_update_dpp vt:$old, vt:$src, timm:$dpp_ctrl, timm:$row_mask,
                               timm:$bank_mask, timm:$bound_ctrl)),
   (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$old, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl),
                         (as_i32timm $row_mask), (as_i32timm $bank_mask),
                         (as_i1timm $bound_ctrl))
 >;
+}
 
 //===----------------------------------------------------------------------===//
 // Fract Patterns
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index ba01b8513dca..83f922fb09ae 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -219,12 +219,20 @@ private:
   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                      const CombineInfo &Paired);
-  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
-                                                    const CombineInfo &Paired);
+  const TargetRegisterClass *
+  getTargetRegisterClass(const CombineInfo &CI,
+                         const CombineInfo &Paired) const;
   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
 
   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
 
+  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
+                      MachineBasicBlock::iterator InsertBefore, int OpName,
+                      Register DestReg) const;
+  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
+                           MachineBasicBlock::iterator InsertBefore,
+                           int OpName) const;
+
   unsigned read2Opcode(unsigned EltSize) const;
   unsigned read2ST64Opcode(unsigned EltSize) const;
   MachineBasicBlock::iterator
@@ -1191,6 +1199,57 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
   return Where;
 }
 
+// Copy the merged load result from DestReg to the original dest regs of CI and
+// Paired.
+void SILoadStoreOptimizer::copyToDestRegs(
+    CombineInfo &CI, CombineInfo &Paired,
+    MachineBasicBlock::iterator InsertBefore, int OpName,
+    Register DestReg) const {
+  MachineBasicBlock *MBB = CI.I->getParent();
+  DebugLoc DL = CI.I->getDebugLoc();
+
+  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
+
+  // Copy to the old destination registers.
+  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+  const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
+  const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+
+  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+      .add(*Dest0) // Copy to same destination including flags and sub reg.
+      .addReg(DestReg, 0, SubRegIdx0);
+  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
+      .add(*Dest1)
+      .addReg(DestReg, RegState::Kill, SubRegIdx1);
+}
+
+// Return a register for the source of the merged store after copying the
+// original source regs of CI and Paired into it.
+Register
+SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
+                                      MachineBasicBlock::iterator InsertBefore,
+                                      int OpName) const {
+  MachineBasicBlock *MBB = CI.I->getParent();
+  DebugLoc DL = CI.I->getDebugLoc();
+
+  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
+
+  // Copy to the new source register.
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
+  Register SrcReg = MRI->createVirtualRegister(SuperRC);
+
+  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
+  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
+
+  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+      .add(*Src0)
+      .addImm(SubRegIdx0)
+      .add(*Src1)
+      .addImm(SubRegIdx1);
+
+  return SrcReg;
+}
+
 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
   if (STM->ldsRequiresM0Init())
     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
@@ -1214,23 +1273,11 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
   // cases, like vectors of pointers.
   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
 
-  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
-  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
-
-  unsigned NewOffset0 = CI.Offset;
-  unsigned NewOffset1 = Paired.Offset;
+  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
+  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
   unsigned Opc =
       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
 
-  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
-  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
-
-  if (NewOffset0 > NewOffset1) {
-    // Canonicalize the merged instruction so the smaller offset comes first.
-    std::swap(NewOffset0, NewOffset1);
-    std::swap(SubRegIdx0, SubRegIdx1);
-  }
-
   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
 
@@ -1267,17 +1314,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
           .addImm(0)                                 // gds
           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
 
-  (void)Read2;
-
-  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
-
-  // Copy to the old destination registers.
-  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
-      .add(*Dest0) // Copy to same destination including flags and sub reg.
-      .addReg(DestReg, 0, SubRegIdx0);
-  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
-      .add(*Dest1)
-      .addReg(DestReg, RegState::Kill, SubRegIdx1);
+  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1397,20 +1434,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
 
   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
-  unsigned SubRegIdx0, SubRegIdx1;
-  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
-
-  // Copy to the old destination registers.
-  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
-  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
-  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
-
-  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
-      .add(*Dest0) // Copy to same destination including flags and sub reg.
-      .addReg(DestReg, 0, SubRegIdx0);
-  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
-      .add(*Dest1)
-      .addReg(DestReg, RegState::Kill, SubRegIdx1);
+  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1442,21 +1466,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
   New.addImm(MergedOffset);
   New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
-  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
-  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
-  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
-
-  // Copy to the old destination registers.
-  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
-  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
-  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
-
-  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
-      .add(*Dest0) // Copy to same destination including flags and sub reg.
-      .addReg(DestReg, 0, SubRegIdx0);
-  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
-      .add(*Dest1)
-      .addReg(DestReg, RegState::Kill, SubRegIdx1);
+  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1497,21 +1507,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
         .addImm(0)            // swz
         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
-  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
-  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
-  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
-
-  // Copy to the old destination registers.
-  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
-  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
-  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
-
-  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
-      .add(*Dest0) // Copy to same destination including flags and sub reg.
-      .addReg(DestReg, 0, SubRegIdx0);
-  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
-      .add(*Dest1)
-      .addReg(DestReg, RegState::Kill, SubRegIdx1);
+  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1556,21 +1552,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
           .addImm(0)            // swz
           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
-  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
-  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
-  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
-
-  // Copy to the old destination registers.
-  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
-  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
-  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
-
-  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
-      .add(*Dest0) // Copy to same destination including flags and sub reg.
-      .addReg(DestReg, 0, SubRegIdx0);
-  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
-      .add(*Dest1)
-      .addReg(DestReg, RegState::Kill, SubRegIdx1);
+  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1585,22 +1567,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
 
   const unsigned Opcode = getNewOpcode(CI, Paired);
 
-  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
-  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
-  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
-
-  // Copy to the new source register.
-  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
-  Register SrcReg = MRI->createVirtualRegister(SuperRC);
-
-  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
-  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
-
-  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
-      .add(*Src0)
-      .addImm(SubRegIdx0)
-      .add(*Src1)
-      .addImm(SubRegIdx1);
+  Register SrcReg =
+      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
 
   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                  .addReg(SrcReg, RegState::Kill);
@@ -1654,21 +1622,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
        .addImm(CI.CPol)
        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
-  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
-  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
-  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
-
-  // Copy to the old destination registers.
-  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
-  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
-  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
-
-  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
-      .add(*Dest0) // Copy to same destination including flags and sub reg.
-      .addReg(DestReg, 0, SubRegIdx0);
-  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
-      .add(*Dest1)
-      .addReg(DestReg, RegState::Kill, SubRegIdx1);
+  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1683,22 +1637,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
 
   const unsigned Opcode = getNewOpcode(CI, Paired);
 
-  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
-  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
-  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
-
-  // Copy to the new source register.
-  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
-  Register SrcReg = MRI->createVirtualRegister(SuperRC);
-
-  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
-  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
-
-  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
-      .add(*Src0)
-      .addImm(SubRegIdx0)
-      .add(*Src1)
-      .addImm(SubRegIdx1);
+  Register SrcReg =
+      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
 
   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
@@ -1876,12 +1816,12 @@ SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
     Idx1 = Idxs[CI.Width][Paired.Width - 1];
   }
 
-  return std::pair(Idx0, Idx1);
+  return {Idx0, Idx1};
 }
 
 const TargetRegisterClass *
 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
-                                             const CombineInfo &Paired) {
+                                             const CombineInfo &Paired) const {
   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
     switch (CI.Width + Paired.Width) {
@@ -1914,22 +1854,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
 
   const unsigned Opcode = getNewOpcode(CI, Paired);
 
-  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
-  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
-  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
-
-  // Copy to the new source register.
-  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
-  Register SrcReg = MRI->createVirtualRegister(SuperRC);
-
-  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
-  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
-
-  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
-      .add(*Src0)
-      .addImm(SubRegIdx0)
-      .add(*Src1)
-      .addImm(SubRegIdx1);
+  Register SrcReg =
+      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
 
   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                  .addReg(SrcReg, RegState::Kill);
@@ -2225,7 +2151,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
       continue;
 
-    InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));
+    InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
 
     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
     TargetLoweringBase::AddrMode AM;
@@ -2252,16 +2178,16 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
 
-    for (auto P : InstsWCommonBase) {
+    for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
       TargetLoweringBase::AddrMode AM;
       AM.HasBaseReg = true;
-      AM.BaseOffs = P.second - AnchorAddr.Offset;
+      AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
 
       if (TLI->isLegalGlobalAddressingMode(AM)) {
-        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
-                   dbgs() << ")"; P.first->dump());
-        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
-        LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
+        LLVM_DEBUG(dbgs() << "  Promote Offset(" << OtherOffset; dbgs() << ")";
+                   OtherMI->dump());
+        updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
+        LLVM_DEBUG(dbgs() << "     After promotion: "; OtherMI->dump());
       }
     }
     AnchorList.insert(AnchorInst);
@@ -2375,7 +2301,7 @@ SILoadStoreOptimizer::collectMergeableInsts(
     ++I;
   }
 
-  return std::pair(BlockI, Modified);
+  return {BlockI, Modified};
 }
 
 // Scan through looking for adjacent LDS operations with constant offsets from
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 7d0c1ba8448e..8c014832f5e4 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -522,13 +522,13 @@ private:
   // the serialization easier.
   ReservedRegSet WWMReservedRegs;
 
-  using PrologEpilogSGPRSpillsMap =
-      DenseMap<Register, PrologEpilogSGPRSaveRestoreInfo>;
+  using PrologEpilogSGPRSpill =
+      std::pair<Register, PrologEpilogSGPRSaveRestoreInfo>;
   // To track the SGPR spill method used for a CSR SGPR register during
   // frame lowering. Even though the SGPR spills are handled during
   // SILowerSGPRSpills pass, some special handling needed later during the
   // PrologEpilogInserter.
-  PrologEpilogSGPRSpillsMap PrologEpilogSGPRSpills;
+  SmallVector<PrologEpilogSGPRSpill, 3> PrologEpilogSGPRSpills;
 
   // To save/restore EXEC MASK around WWM spills and copies.
   Register SGPRForEXECCopy;
@@ -596,7 +596,11 @@ public:
   const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
   const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }
 
-  const PrologEpilogSGPRSpillsMap &getPrologEpilogSGPRSpills() const {
+  ArrayRef<PrologEpilogSGPRSpill> getPrologEpilogSGPRSpills() const {
+    assert(
+        is_sorted(PrologEpilogSGPRSpills, [](const auto &LHS, const auto &RHS) {
+          return LHS.first < RHS.first;
+        }));
     return PrologEpilogSGPRSpills;
   }
 
@@ -606,18 +610,29 @@ public:
 
   void addToPrologEpilogSGPRSpills(Register Reg,
                                    PrologEpilogSGPRSaveRestoreInfo SI) {
-    PrologEpilogSGPRSpills.insert(std::make_pair(Reg, SI));
+    assert(!hasPrologEpilogSGPRSpillEntry(Reg));
+
+    // Insert a new entry in the right place to keep the vector in sorted order.
+    // This should be cheap since the vector is expected to be very short.
+    PrologEpilogSGPRSpills.insert(
+        upper_bound(
+            PrologEpilogSGPRSpills, Reg,
+            [](const auto &LHS, const auto &RHS) { return LHS < RHS.first; }),
+        std::make_pair(Reg, SI));
   }
 
   // Check if an entry created for \p Reg in PrologEpilogSGPRSpills. Return true
   // on success and false otherwise.
   bool hasPrologEpilogSGPRSpillEntry(Register Reg) const {
-    return PrologEpilogSGPRSpills.contains(Reg);
+    auto I = find_if(PrologEpilogSGPRSpills,
+                     [&Reg](const auto &Spill) { return Spill.first == Reg; });
+    return I != PrologEpilogSGPRSpills.end();
   }
 
   // Get the scratch SGPR if allocated to save/restore \p Reg.
   Register getScratchSGPRCopyDstReg(Register Reg) const {
-    auto I = PrologEpilogSGPRSpills.find(Reg);
+    auto I = find_if(PrologEpilogSGPRSpills,
+                     [&Reg](const auto &Spill) { return Spill.first == Reg; });
     if (I != PrologEpilogSGPRSpills.end() &&
         I->second.getKind() == SGPRSaveKind::COPY_TO_SCRATCH_SGPR)
       return I->second.getReg();
@@ -646,7 +661,8 @@ public:
 
   const PrologEpilogSGPRSaveRestoreInfo &
   getPrologEpilogSGPRSaveRestoreInfo(Register Reg) const {
-    auto I = PrologEpilogSGPRSpills.find(Reg);
+    auto I = find_if(PrologEpilogSGPRSpills,
+                     [&Reg](const auto &Spill) { return Spill.first == Reg; });
     assert(I != PrologEpilogSGPRSpills.end());
 
     return I->second;
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index c01b1266a553..e7f448233ca3 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -430,6 +430,14 @@ void SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB,
 }
 
 bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
+  // Constrained FP intrinsics are used to support non-default rounding modes.
+  // strictfp attribute is required to mark functions with strict FP semantics
+  // having constrained FP intrinsics. This pass fixes up operations that uses
+  // a non-default rounding mode for non-strictfp functions. But it should not
+  // assume or modify any default rounding modes in case of strictfp functions.
+  const Function &F = MF.getFunction();
+  if (F.hasFnAttribute(llvm::Attribute::StrictFP))
+    return Changed;
   BlockInfo.resize(MF.getNumBlockIDs());
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
index 2684a1e3c335..f9efee6d3934 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
@@ -174,3 +174,122 @@ static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
 static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
                   HWTowardNegative, HWTowardPositive)) ==
               TowardNegativeF32_TowardPositiveF64);
+
+// Decode FLT_ROUNDS into the hardware value where the two rounding modes are
+// the same and use a standard value
+static constexpr uint64_t encodeFltRoundsToHWTableSame(uint32_t HWVal,
+                                                       uint32_t FltRoundsVal) {
+  if (FltRoundsVal > TowardNegative)
+    FltRoundsVal -= ExtendedFltRoundOffset;
+
+  return static_cast<uint64_t>(getModeRegisterRoundMode(HWVal, HWVal))
+         << (FltRoundsVal << 2);
+}
+
+/// Decode FLT_ROUNDS into the hardware value where the two rounding modes
+/// different and use an extended value.
+static constexpr uint64_t encodeFltRoundsToHWTable(uint32_t HWF32Val,
+                                                   uint32_t HWF64Val,
+                                                   uint32_t FltRoundsVal) {
+  if (FltRoundsVal > TowardNegative)
+    FltRoundsVal -= ExtendedFltRoundOffset;
+  return static_cast<uint64_t>(getModeRegisterRoundMode(HWF32Val, HWF64Val))
+         << (FltRoundsVal << 2);
+}
+
+const uint64_t AMDGPU::FltRoundToHWConversionTable =
+    encodeFltRoundsToHWTableSame(HWTowardZero, TowardZeroF32_TowardZeroF64) |
+    encodeFltRoundsToHWTableSame(HWNearestTiesToEven,
+                                 NearestTiesToEvenF32_NearestTiesToEvenF64) |
+    encodeFltRoundsToHWTableSame(HWTowardPositive,
+                                 TowardPositiveF32_TowardPositiveF64) |
+    encodeFltRoundsToHWTableSame(HWTowardNegative,
+                                 TowardNegativeF32_TowardNegativeF64) |
+
+    encodeFltRoundsToHWTable(HWTowardZero, HWNearestTiesToEven,
+                             TowardZeroF32_NearestTiesToEvenF64) |
+    encodeFltRoundsToHWTable(HWTowardZero, HWTowardPositive,
+                             TowardZeroF32_TowardPositiveF64) |
+    encodeFltRoundsToHWTable(HWTowardZero, HWTowardNegative,
+                             TowardZeroF32_TowardNegativeF64) |
+
+    encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardZero,
+                             NearestTiesToEvenF32_TowardZeroF64) |
+    encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardPositive,
+                             NearestTiesToEvenF32_TowardPositiveF64) |
+    encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardNegative,
+                             NearestTiesToEvenF32_TowardNegativeF64) |
+
+    encodeFltRoundsToHWTable(HWTowardPositive, HWTowardZero,
+                             TowardPositiveF32_TowardZeroF64) |
+    encodeFltRoundsToHWTable(HWTowardPositive, HWNearestTiesToEven,
+                             TowardPositiveF32_NearestTiesToEvenF64) |
+    encodeFltRoundsToHWTable(HWTowardPositive, HWTowardNegative,
+                             TowardPositiveF32_TowardNegativeF64) |
+
+    encodeFltRoundsToHWTable(HWTowardNegative, HWTowardZero,
+                             TowardNegativeF32_TowardZeroF64) |
+    encodeFltRoundsToHWTable(HWTowardNegative, HWNearestTiesToEven,
+                             TowardNegativeF32_NearestTiesToEvenF64) |
+    encodeFltRoundsToHWTable(HWTowardNegative, HWTowardPositive,
+                             TowardNegativeF32_TowardPositiveF64);
+
+/// Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
+static constexpr uint32_t
+decodeFltRoundToHWConversionTable(uint64_t FltRoundToHWConversionTable,
+                                  uint32_t FltRounds) {
+  uint32_t IndexVal = FltRounds;
+  if (IndexVal > TowardNegative)
+    IndexVal -= ExtendedFltRoundOffset;
+  return (FltRoundToHWConversionTable >> (IndexVal << 2)) & 0xf;
+}
+
+uint32_t AMDGPU::decodeFltRoundToHWConversionTable(uint32_t FltRounds) {
+  return ::decodeFltRoundToHWConversionTable(FltRoundToHWConversionTable,
+                                             FltRounds);
+}
+
+static constexpr uint32_t decodeFltRoundToHW(uint32_t FltRounds) {
+  return ::decodeFltRoundToHWConversionTable(FltRoundToHWConversionTable,
+                                             FltRounds);
+}
+
+// Verify evaluation of FltRoundToHWConversionTable
+
+static_assert(decodeFltRoundToHW(AMDGPUFltRounds::TowardZero) ==
+              getModeRegisterRoundMode(HWTowardZero, HWTowardZero));
+static_assert(decodeFltRoundToHW(AMDGPUFltRounds::NearestTiesToEven) ==
+              getModeRegisterRoundMode(HWNearestTiesToEven,
+                                       HWNearestTiesToEven));
+static_assert(decodeFltRoundToHW(AMDGPUFltRounds::TowardPositive) ==
+              getModeRegisterRoundMode(HWTowardPositive, HWTowardPositive));
+static_assert(decodeFltRoundToHW(AMDGPUFltRounds::TowardNegative) ==
+              getModeRegisterRoundMode(HWTowardNegative, HWTowardNegative));
+
+static_assert(decodeFltRoundToHW(NearestTiesToEvenF32_TowardPositiveF64) ==
+              getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardPositive));
+static_assert(decodeFltRoundToHW(NearestTiesToEvenF32_TowardNegativeF64) ==
+              getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardNegative));
+static_assert(decodeFltRoundToHW(NearestTiesToEvenF32_TowardZeroF64) ==
+              getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardZero));
+
+static_assert(decodeFltRoundToHW(TowardPositiveF32_NearestTiesToEvenF64) ==
+              getModeRegisterRoundMode(HWTowardPositive, HWNearestTiesToEven));
+static_assert(decodeFltRoundToHW(TowardPositiveF32_TowardNegativeF64) ==
+              getModeRegisterRoundMode(HWTowardPositive, HWTowardNegative));
+static_assert(decodeFltRoundToHW(TowardPositiveF32_TowardZeroF64) ==
+              getModeRegisterRoundMode(HWTowardPositive, HWTowardZero));
+
+static_assert(decodeFltRoundToHW(TowardNegativeF32_NearestTiesToEvenF64) ==
+              getModeRegisterRoundMode(HWTowardNegative, HWNearestTiesToEven));
+static_assert(decodeFltRoundToHW(TowardNegativeF32_TowardPositiveF64) ==
+              getModeRegisterRoundMode(HWTowardNegative, HWTowardPositive));
+static_assert(decodeFltRoundToHW(TowardNegativeF32_TowardZeroF64) ==
+              getModeRegisterRoundMode(HWTowardNegative, HWTowardZero));
+
+static_assert(decodeFltRoundToHW(TowardZeroF32_NearestTiesToEvenF64) ==
+              getModeRegisterRoundMode(HWTowardZero, HWNearestTiesToEven));
+static_assert(decodeFltRoundToHW(TowardZeroF32_TowardPositiveF64) ==
+              getModeRegisterRoundMode(HWTowardZero, HWTowardPositive));
+static_assert(decodeFltRoundToHW(TowardZeroF32_TowardNegativeF64) ==
+              getModeRegisterRoundMode(HWTowardZero, HWTowardNegative));
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
index 9fbd74c3eede..c86678a73253 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
+++ b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
@@ -144,6 +144,13 @@ static constexpr uint32_t F64FltRoundOffset = 2;
 // values.
 extern const uint64_t FltRoundConversionTable;
 
+// Bit indexed table to convert from FLT_ROUNDS values to hardware rounding mode
+// values
+extern const uint64_t FltRoundToHWConversionTable;
+
+/// Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
+uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds);
+
 } // end namespace AMDGPU
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 9ed7aacc0538..0d40816cdd4b 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -18,57 +18,114 @@
 #include "GCNSubtarget.h"
 #include "SIDefines.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/MC/MCExpr.h"
 
 using namespace llvm;
 
-uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const {
-  uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) |
-                 S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) |
-                 S_00B848_PRIV(Priv) | S_00B848_DEBUG_MODE(DebugMode) |
-                 S_00B848_WGP_MODE(WgpMode) | S_00B848_MEM_ORDERED(MemOrdered);
+void SIProgramInfo::reset(const MachineFunction &MF) {
+  MCContext &Ctx = MF.getContext();
+
+  const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx);
+
+  VGPRBlocks = ZeroExpr;
+  SGPRBlocks = ZeroExpr;
+  Priority = 0;
+  FloatMode = 0;
+  Priv = 0;
+  DX10Clamp = 0;
+  DebugMode = 0;
+  IEEEMode = 0;
+  WgpMode = 0;
+  MemOrdered = 0;
+  RrWgMode = 0;
+  ScratchSize = ZeroExpr;
+
+  LDSBlocks = 0;
+  ScratchBlocks = ZeroExpr;
+
+  ScratchEnable = ZeroExpr;
+  UserSGPR = 0;
+  TrapHandlerEnable = 0;
+  TGIdXEnable = 0;
+  TGIdYEnable = 0;
+  TGIdZEnable = 0;
+  TGSizeEnable = 0;
+  TIdIGCompCount = 0;
+  EXCPEnMSB = 0;
+  LdsSize = 0;
+  EXCPEnable = 0;
+
+  ComputePGMRSrc3GFX90A = ZeroExpr;
+
+  NumVGPR = ZeroExpr;
+  NumArchVGPR = ZeroExpr;
+  NumAccVGPR = ZeroExpr;
+  AccumOffset = ZeroExpr;
+  TgSplit = 0;
+  NumSGPR = ZeroExpr;
+  SGPRSpill = 0;
+  VGPRSpill = 0;
+  LDSSize = 0;
+  FlatUsed = ZeroExpr;
+
+  NumSGPRsForWavesPerEU = ZeroExpr;
+  NumVGPRsForWavesPerEU = ZeroExpr;
+  Occupancy = ZeroExpr;
+  DynamicCallStack = ZeroExpr;
+  VCCUsed = ZeroExpr;
+}
+
+static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo,
+                                      const GCNSubtarget &ST) {
+  uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) |
+                 S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
+                 S_00B848_PRIV(ProgInfo.Priv) |
+                 S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
+                 S_00B848_WGP_MODE(ProgInfo.WgpMode) |
+                 S_00B848_MEM_ORDERED(ProgInfo.MemOrdered);
 
   if (ST.hasDX10ClampMode())
-    Reg |= S_00B848_DX10_CLAMP(DX10Clamp);
+    Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);
 
   if (ST.hasIEEEMode())
-    Reg |= S_00B848_IEEE_MODE(IEEEMode);
+    Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
 
   if (ST.hasRrWGMode())
-    Reg |= S_00B848_RR_WG_MODE(RrWgMode);
+    Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);
 
   return Reg;
 }
 
-uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
-                                    const GCNSubtarget &ST) const {
-  if (AMDGPU::isCompute(CC)) {
-    return getComputePGMRSrc1(ST);
-  }
-  uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) |
-                 S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) |
-                 S_00B848_PRIV(Priv) | S_00B848_DEBUG_MODE(DebugMode);
+static uint64_t getPGMRSrc1Reg(const SIProgramInfo &ProgInfo,
+                               CallingConv::ID CC, const GCNSubtarget &ST) {
+  uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) |
+                 S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
+                 S_00B848_PRIV(ProgInfo.Priv) |
+                 S_00B848_DEBUG_MODE(ProgInfo.DebugMode);
 
   if (ST.hasDX10ClampMode())
-    Reg |= S_00B848_DX10_CLAMP(DX10Clamp);
+    Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);
 
   if (ST.hasIEEEMode())
-    Reg |= S_00B848_IEEE_MODE(IEEEMode);
+    Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
 
   if (ST.hasRrWGMode())
-    Reg |= S_00B848_RR_WG_MODE(RrWgMode);
+    Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);
 
   switch (CC) {
   case CallingConv::AMDGPU_PS:
-    Reg |= S_00B028_MEM_ORDERED(MemOrdered);
+    Reg |= S_00B028_MEM_ORDERED(ProgInfo.MemOrdered);
     break;
   case CallingConv::AMDGPU_VS:
-    Reg |= S_00B128_MEM_ORDERED(MemOrdered);
+    Reg |= S_00B128_MEM_ORDERED(ProgInfo.MemOrdered);
     break;
   case CallingConv::AMDGPU_GS:
-    Reg |= S_00B228_WGP_MODE(WgpMode) | S_00B228_MEM_ORDERED(MemOrdered);
+    Reg |= S_00B228_WGP_MODE(ProgInfo.WgpMode) |
+           S_00B228_MEM_ORDERED(ProgInfo.MemOrdered);
     break;
   case CallingConv::AMDGPU_HS:
-    Reg |= S_00B428_WGP_MODE(WgpMode) | S_00B428_MEM_ORDERED(MemOrdered);
+    Reg |= S_00B428_WGP_MODE(ProgInfo.WgpMode) |
+           S_00B428_MEM_ORDERED(ProgInfo.MemOrdered);
     break;
   default:
     break;
@@ -76,22 +133,108 @@ uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
   return Reg;
 }
 
-uint64_t SIProgramInfo::getComputePGMRSrc2() const {
-  uint64_t Reg =
-      S_00B84C_SCRATCH_EN(ScratchEnable) | S_00B84C_USER_SGPR(UserSGPR) |
-      S_00B84C_TRAP_HANDLER(TrapHandlerEnable) |
-      S_00B84C_TGID_X_EN(TGIdXEnable) | S_00B84C_TGID_Y_EN(TGIdYEnable) |
-      S_00B84C_TGID_Z_EN(TGIdZEnable) | S_00B84C_TG_SIZE_EN(TGSizeEnable) |
-      S_00B84C_TIDIG_COMP_CNT(TIdIGCompCount) |
-      S_00B84C_EXCP_EN_MSB(EXCPEnMSB) | S_00B84C_LDS_SIZE(LdsSize) |
-      S_00B84C_EXCP_EN(EXCPEnable);
+static uint64_t getComputePGMRSrc2Reg(const SIProgramInfo &ProgInfo) {
+  uint64_t Reg = S_00B84C_USER_SGPR(ProgInfo.UserSGPR) |
+                 S_00B84C_TRAP_HANDLER(ProgInfo.TrapHandlerEnable) |
+                 S_00B84C_TGID_X_EN(ProgInfo.TGIdXEnable) |
+                 S_00B84C_TGID_Y_EN(ProgInfo.TGIdYEnable) |
+                 S_00B84C_TGID_Z_EN(ProgInfo.TGIdZEnable) |
+                 S_00B84C_TG_SIZE_EN(ProgInfo.TGSizeEnable) |
+                 S_00B84C_TIDIG_COMP_CNT(ProgInfo.TIdIGCompCount) |
+                 S_00B84C_EXCP_EN_MSB(ProgInfo.EXCPEnMSB) |
+                 S_00B84C_LDS_SIZE(ProgInfo.LdsSize) |
+                 S_00B84C_EXCP_EN(ProgInfo.EXCPEnable);
+
+  return Reg;
+}
+
+static const MCExpr *MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift,
+                               MCContext &Ctx) {
+  if (Mask) {
+    const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
+    Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
+  }
+  if (Shift) {
+    const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
+    Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx);
+  }
+  return Val;
+}
+
+uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const {
+  int64_t VBlocks, SBlocks;
+  VGPRBlocks->evaluateAsAbsolute(VBlocks);
+  SGPRBlocks->evaluateAsAbsolute(SBlocks);
+
+  uint64_t Reg = S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) |
+                 S_00B848_SGPRS(static_cast<uint64_t>(SBlocks)) |
+                 getComputePGMRSrc1Reg(*this, ST);
 
   return Reg;
 }
 
+uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
+                                    const GCNSubtarget &ST) const {
+  if (AMDGPU::isCompute(CC)) {
+    return getComputePGMRSrc1(ST);
+  }
+  int64_t VBlocks, SBlocks;
+  VGPRBlocks->evaluateAsAbsolute(VBlocks);
+  SGPRBlocks->evaluateAsAbsolute(SBlocks);
+
+  return getPGMRSrc1Reg(*this, CC, ST) |
+         S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) |
+         S_00B848_SGPRS(static_cast<uint64_t>(SBlocks));
+}
+
+uint64_t SIProgramInfo::getComputePGMRSrc2() const {
+  int64_t ScratchEn;
+  ScratchEnable->evaluateAsAbsolute(ScratchEn);
+  return ScratchEn | getComputePGMRSrc2Reg(*this);
+}
+
 uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const {
   if (AMDGPU::isCompute(CC))
     return getComputePGMRSrc2();
 
   return 0;
 }
+
+const MCExpr *SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST,
+                                                MCContext &Ctx) const {
+  uint64_t Reg = getComputePGMRSrc1Reg(*this, ST);
+  const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
+  const MCExpr *Res = MCBinaryExpr::createOr(
+      MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx),
+      MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx);
+  return MCBinaryExpr::createOr(RegExpr, Res, Ctx);
+}
+
+const MCExpr *SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
+                                         const GCNSubtarget &ST,
+                                         MCContext &Ctx) const {
+  if (AMDGPU::isCompute(CC)) {
+    return getComputePGMRSrc1(ST, Ctx);
+  }
+
+  uint64_t Reg = getPGMRSrc1Reg(*this, CC, ST);
+  const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
+  const MCExpr *Res = MCBinaryExpr::createOr(
+      MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx),
+      MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx);
+  return MCBinaryExpr::createOr(RegExpr, Res, Ctx);
+}
+
+const MCExpr *SIProgramInfo::getComputePGMRSrc2(MCContext &Ctx) const {
+  uint64_t Reg = getComputePGMRSrc2Reg(*this);
+  const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
+  return MCBinaryExpr::createOr(ScratchEnable, RegExpr, Ctx);
+}
+
+const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
+                                         MCContext &Ctx) const {
+  if (AMDGPU::isCompute(CC))
+    return getComputePGMRSrc2(Ctx);
+
+  return MCConstantExpr::create(0, Ctx);
+}
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 8c26789f936c..e66e5a194c8b 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -17,17 +17,21 @@
 #define LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H
 
 #include "llvm/IR/CallingConv.h"
+#include "llvm/Support/Compiler.h"
 #include <cstdint>
 
 namespace llvm {
 
 class GCNSubtarget;
+class MCContext;
+class MCExpr;
+class MachineFunction;
 
 /// Track resource usage for kernels / entry functions.
-struct SIProgramInfo {
+struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
     // Fields set in PGM_RSRC1 pm4 packet.
-    uint32_t VGPRBlocks = 0;
-    uint32_t SGPRBlocks = 0;
+    const MCExpr *VGPRBlocks = nullptr;
+    const MCExpr *SGPRBlocks = nullptr;
     uint32_t Priority = 0;
     uint32_t FloatMode = 0;
     uint32_t Priv = 0;
@@ -37,14 +41,14 @@ struct SIProgramInfo {
     uint32_t WgpMode = 0; // GFX10+
     uint32_t MemOrdered = 0; // GFX10+
     uint32_t RrWgMode = 0;   // GFX12+
-    uint64_t ScratchSize = 0;
+    const MCExpr *ScratchSize = nullptr;
 
     // State used to calculate fields set in PGM_RSRC2 pm4 packet.
     uint32_t LDSBlocks = 0;
-    uint32_t ScratchBlocks = 0;
+    const MCExpr *ScratchBlocks = nullptr;
 
     // Fields set in PGM_RSRC2 pm4 packet
-    uint32_t ScratchEnable = 0;
+    const MCExpr *ScratchEnable = nullptr;
     uint32_t UserSGPR = 0;
     uint32_t TrapHandlerEnable = 0;
     uint32_t TGIdXEnable = 0;
@@ -56,44 +60,56 @@ struct SIProgramInfo {
     uint32_t LdsSize = 0;
     uint32_t EXCPEnable = 0;
 
-    uint64_t ComputePGMRSrc3GFX90A = 0;
+    const MCExpr *ComputePGMRSrc3GFX90A = nullptr;
 
-    uint32_t NumVGPR = 0;
-    uint32_t NumArchVGPR = 0;
-    uint32_t NumAccVGPR = 0;
-    uint32_t AccumOffset = 0;
+    const MCExpr *NumVGPR = nullptr;
+    const MCExpr *NumArchVGPR = nullptr;
+    const MCExpr *NumAccVGPR = nullptr;
+    const MCExpr *AccumOffset = nullptr;
     uint32_t TgSplit = 0;
-    uint32_t NumSGPR = 0;
+    const MCExpr *NumSGPR = nullptr;
     unsigned SGPRSpill = 0;
     unsigned VGPRSpill = 0;
     uint32_t LDSSize = 0;
-    bool FlatUsed = false;
+    const MCExpr *FlatUsed = nullptr;
 
     // Number of SGPRs that meets number of waves per execution unit request.
-    uint32_t NumSGPRsForWavesPerEU = 0;
+    const MCExpr *NumSGPRsForWavesPerEU = nullptr;
 
     // Number of VGPRs that meets number of waves per execution unit request.
-    uint32_t NumVGPRsForWavesPerEU = 0;
+    const MCExpr *NumVGPRsForWavesPerEU = nullptr;
 
     // Final occupancy.
-    uint32_t Occupancy = 0;
+    const MCExpr *Occupancy = nullptr;
 
     // Whether there is recursion, dynamic allocas, indirect calls or some other
     // reason there may be statically unknown stack usage.
-    bool DynamicCallStack = false;
+    const MCExpr *DynamicCallStack = nullptr;
 
     // Bonus information for debugging.
-    bool VCCUsed = false;
+    const MCExpr *VCCUsed = nullptr;
 
     SIProgramInfo() = default;
 
+    // The constructor sets the values for each member as shown in the struct.
+    // However, setting the MCExpr members to their zero value equivalent
+    // happens in reset together with (duplicated) value re-set for the
+    // non-MCExpr members.
+    void reset(const MachineFunction &MF);
+
     /// Compute the value of the ComputePGMRsrc1 register.
     uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const;
     uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const;
+    const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
+                                     MCContext &Ctx) const;
+    const MCExpr *getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST,
+                              MCContext &Ctx) const;
 
     /// Compute the value of the ComputePGMRsrc2 register.
     uint64_t getComputePGMRSrc2() const;
     uint64_t getPGMRSrc2(CallingConv::ID CC) const;
+    const MCExpr *getComputePGMRSrc2(MCContext &Ctx) const;
+    const MCExpr *getPGMRSrc2(CallingConv::ID CC, MCContext &Ctx) const;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 01ed565bb756..caac7126068e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -586,6 +586,7 @@ class RegisterTypes<list<ValueType> reg_types> {
 
 def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
 def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
+def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0]>;
 
 let HasVGPR = 1 in {
 // VOP3 and VINTERP can access 256 lo and 256 hi registers.
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index afc9da07bc96..40ba47f88771 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -1332,8 +1332,9 @@ multiclass SM_Real_Loads_gfx11<bits<8> op, string ps> {
   def _IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_IMM", opName>;
   def _SGPR_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_SGPR", opName>;
   def _SGPR_IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_SGPR_IMM", opName>;
-  def : MnemonicAlias<!cast<SM_Pseudo>(ps#"_IMM").Mnemonic, opName>,
-                      Requires<[isGFX11Plus]>;
+  def : AMDGPUMnemonicAlias<!cast<SM_Pseudo>(ps#"_IMM").Mnemonic, opName> {
+    let AssemblerPredicate = isGFX11Plus;
+  }
 }
 
 defm S_LOAD_B32  : SM_Real_Loads_gfx11<0x000, "S_LOAD_DWORD">;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 93b7e86b5f29..394a5ed991bc 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1110,14 +1110,15 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo <
 
 // This is hasSideEffects to allow its use in readcyclecounter selection.
 // FIXME: Need to truncate immediate to 16-bits.
-// FIXME: Missing mode register use. Should have separate pseudos for
-// known may read MODE and only read MODE.
+// FIXME: Should have separate pseudos for known may read MODE and
+// only read MODE.
 def S_GETREG_B32 : SOPK_Pseudo <
   "s_getreg_b32",
   (outs SReg_32:$sdst), (ins hwreg:$simm16),
   "$sdst, $simm16",
   [(set i32:$sdst, (int_amdgcn_s_getreg (i32 timm:$simm16)))]> {
   let hasSideEffects = 1;
+  let Uses = [MODE];
 }
 
 let Defs = [MODE], Uses = [MODE] in {
@@ -1974,7 +1975,9 @@ multiclass SOP1_Real_gfx11<bits<8> op, string name = !tolower(NAME)> {
   def _gfx11 : SOP1_Real<op, ps, name>,
                Select_gfx11<ps.PseudoInstr>;
   if !ne(ps.Mnemonic, name) then
-    def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
+    def : AMDGPUMnemonicAlias<ps.Mnemonic, name> {
+      let AssemblerPredicate = isGFX11Only;
+    }
 }
 
 multiclass SOP1_Real_gfx12<bits<8> op, string name = !tolower(NAME)> {
@@ -1982,7 +1985,9 @@ multiclass SOP1_Real_gfx12<bits<8> op, string name = !tolower(NAME)> {
   def _gfx12 : SOP1_Real<op, ps, name>,
                Select_gfx12<ps.PseudoInstr>;
   if !ne(ps.Mnemonic, name) then
-    def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
+    def : AMDGPUMnemonicAlias<ps.Mnemonic, name> {
+      let AssemblerPredicate = isGFX12Plus;
+    }
 }
 
 multiclass SOP1_M0_Real_gfx12<bits<8> op> {
@@ -2207,7 +2212,9 @@ multiclass SOP2_Real_gfx12<bits<7> op, string name = !tolower(NAME)> {
   def _gfx12 : SOP2_Real32<op, ps, name>,
                Select_gfx12<ps.PseudoInstr>;
   if !ne(ps.Mnemonic, name) then
-    def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
+    def : AMDGPUMnemonicAlias<ps.Mnemonic, name> {
+      let AssemblerPredicate = isGFX12Plus;
+    }
 }
 
 defm S_MINIMUM_F32 : SOP2_Real_gfx12<0x04f>;
@@ -2224,7 +2231,9 @@ multiclass SOP2_Real_gfx11<bits<7> op, string name = !tolower(NAME)> {
   def _gfx11 : SOP2_Real32<op, ps, name>,
                Select_gfx11<ps.PseudoInstr>;
   if !ne(ps.Mnemonic, name) then
-    def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
+    def : AMDGPUMnemonicAlias<ps.Mnemonic, name> {
+      let AssemblerPredicate = isGFX11Only;
+    }
 }
 
 multiclass SOP2_Real_gfx11_gfx12<bits<7> op, string name = !tolower(NAME)> :
@@ -2412,7 +2421,9 @@ multiclass SOPK_Real32_gfx12<bits<5> op, string name = !tolower(NAME)> {
   def _gfx12 : SOPK_Real32<op, ps, name>,
                Select_gfx12<ps.PseudoInstr>;
   if !ne(ps.Mnemonic, name) then
-    def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
+    def : AMDGPUMnemonicAlias<ps.Mnemonic, name> {
+      let AssemblerPredicate = isGFX12Plus;
+    }
 }
 
 multiclass SOPK_Real32_gfx11<bits<5> op> {
@@ -2541,7 +2552,9 @@ multiclass SOPP_Real_32_gfx12<bits<7> op, string name = !tolower(NAME)> {
   def _gfx12 : SOPP_Real_32<op, ps, name>,
                Select_gfx12<ps.PseudoInstr>;
   if !ne(ps.Mnemonic, name) then
-    def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
+    def : AMDGPUMnemonicAlias<ps.Mnemonic, name> {
+      let AssemblerPredicate = isGFX12Plus;
+    }
 }
 
 defm S_BARRIER_WAIT         : SOPP_Real_32_gfx12<0x014>;
@@ -2567,7 +2580,9 @@ multiclass SOPP_Real_32_gfx11<bits<7> op, string name = !tolower(NAME)> {
                Select_gfx11<ps.PseudoInstr>,
                SOPPRelaxTable<0, ps.KeyName, "_gfx11">;
   if !ne(ps.Mnemonic, name) then
-    def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
+    def : AMDGPUMnemonicAlias<ps.Mnemonic, name> {
+      let AssemblerPredicate = isGFX11Only;
+    }
 }
 
 multiclass SOPP_Real_64_gfx12<bits<7> op> {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index d468b14d54d3..2e1db1665b9c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -12,6 +12,60 @@
 namespace llvm {
 namespace AMDGPU {
 
+//===----------------------------------------------------------------------===//
+// Custom Operands.
+//
+// A table of custom operands shall describe "primary" operand names first
+// followed by aliases if any. It is not required but recommended to arrange
+// operands so that operand encoding match operand position in the table. This
+// will make getNameFromOperandTable() a bit more efficient. Unused slots in the
+// table shall have an empty name.
+//
+//===----------------------------------------------------------------------===//
+
+/// Map from the encoding of a sendmsg/hwreg asm operand to it's name.
+template <size_t N>
+static StringRef getNameFromOperandTable(const CustomOperand (&Table)[N],
+                                         unsigned Encoding,
+                                         const MCSubtargetInfo &STI) {
+  auto isValidIndexForEncoding = [&](size_t Idx) {
+    return Idx < N && Table[Idx].Encoding == Encoding &&
+           !Table[Idx].Name.empty() &&
+           (!Table[Idx].Cond || Table[Idx].Cond(STI));
+  };
+
+  // This is an optimization that should work in most cases. As a side effect,
+  // it may cause selection of an alias instead of a primary operand name in
+  // case of sparse tables.
+  if (isValidIndexForEncoding(Encoding))
+    return Table[Encoding].Name;
+
+  for (size_t Idx = 0; Idx != N; ++Idx)
+    if (isValidIndexForEncoding(Idx))
+      return Table[Idx].Name;
+
+  return "";
+}
+
+/// Map from a symbolic name for a sendmsg/hwreg asm operand to it's encoding.
+template <size_t N>
+static int64_t getEncodingFromOperandTable(const CustomOperand (&Table)[N],
+                                           StringRef Name,
+                                           const MCSubtargetInfo &STI) {
+  int64_t InvalidEncoding = OPR_ID_UNKNOWN;
+  for (const CustomOperand &Entry : Table) {
+    if (Entry.Name != Name)
+      continue;
+
+    if (!Entry.Cond || Entry.Cond(STI))
+      return Entry.Encoding;
+
+    InvalidEncoding = OPR_ID_UNSUPPORTED;
+  }
+
+  return InvalidEncoding;
+}
+
 namespace DepCtr {
 
 // NOLINTBEGIN
@@ -34,10 +88,11 @@ const int DEP_CTR_SIZE =
 
 namespace SendMsg {
 
-// Disable lint checking for this block since it makes the table unreadable.
+// Disable lint checking here since it makes these tables unreadable.
 // NOLINTBEGIN
 // clang-format off
-const CustomOperand<const MCSubtargetInfo &> Msg[] = {
+
+static constexpr CustomOperand MsgOperands[] = {
   {{""}},
   {{"MSG_INTERRUPT"},           ID_INTERRUPT},
   {{"MSG_GS"},                  ID_GS_PreGFX11,             isNotGFX11Plus},
@@ -63,27 +118,47 @@ const CustomOperand<const MCSubtargetInfo &> Msg[] = {
   {{"MSG_RTN_GET_TBA_TO_PC"},   ID_RTN_GET_TBA_TO_PC,       isGFX11Plus},
   {{"MSG_RTN_GET_SE_AID_ID"},   ID_RTN_GET_SE_AID_ID,       isGFX12Plus},
 };
+
+static constexpr CustomOperand SysMsgOperands[] = {
+  {{""}},
+  {{"SYSMSG_OP_ECC_ERR_INTERRUPT"},  OP_SYS_ECC_ERR_INTERRUPT},
+  {{"SYSMSG_OP_REG_RD"},             OP_SYS_REG_RD},
+  {{"SYSMSG_OP_HOST_TRAP_ACK"},      OP_SYS_HOST_TRAP_ACK,      isNotGFX9Plus},
+  {{"SYSMSG_OP_TTRACE_PC"},          OP_SYS_TTRACE_PC},
+};
+
+static constexpr CustomOperand StreamMsgOperands[] = {
+  {{"GS_OP_NOP"},       OP_GS_NOP},
+  {{"GS_OP_CUT"},       OP_GS_CUT},
+  {{"GS_OP_EMIT"},      OP_GS_EMIT},
+  {{"GS_OP_EMIT_CUT"},  OP_GS_EMIT_CUT},
+};
+
 // clang-format on
 // NOLINTEND
 
-const int MSG_SIZE = static_cast<int>(
-    sizeof(Msg) / sizeof(CustomOperand<const MCSubtargetInfo &>));
+int64_t getMsgId(StringRef Name, const MCSubtargetInfo &STI) {
+  return getEncodingFromOperandTable(MsgOperands, Name, STI);
+}
 
-// These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h.
-const char *const OpSysSymbolic[OP_SYS_LAST_] = {
-  nullptr,
-  "SYSMSG_OP_ECC_ERR_INTERRUPT",
-  "SYSMSG_OP_REG_RD",
-  "SYSMSG_OP_HOST_TRAP_ACK",
-  "SYSMSG_OP_TTRACE_PC"
-};
+StringRef getMsgName(uint64_t Encoding, const MCSubtargetInfo &STI) {
+  return getNameFromOperandTable(MsgOperands, Encoding, STI);
+}
 
-const char *const OpGsSymbolic[OP_GS_LAST_] = {
-  "GS_OP_NOP",
-  "GS_OP_CUT",
-  "GS_OP_EMIT",
-  "GS_OP_EMIT_CUT"
-};
+int64_t getMsgOpId(int64_t MsgId, StringRef Name, const MCSubtargetInfo &STI) {
+  if (MsgId == ID_SYSMSG)
+    return getEncodingFromOperandTable(SysMsgOperands, Name, STI);
+  return getEncodingFromOperandTable(StreamMsgOperands, Name, STI);
+}
+
+StringRef getMsgOpName(int64_t MsgId, uint64_t Encoding,
+                       const MCSubtargetInfo &STI) {
+  assert(msgRequiresOp(MsgId, STI) && "must have an operand");
+
+  if (MsgId == ID_SYSMSG)
+    return getNameFromOperandTable(SysMsgOperands, Encoding, STI);
+  return getNameFromOperandTable(StreamMsgOperands, Encoding, STI);
+}
 
 } // namespace SendMsg
 
@@ -92,7 +167,7 @@ namespace Hwreg {
 // Disable lint checking for this block since it makes the table unreadable.
 // NOLINTBEGIN
 // clang-format off
-const CustomOperand<const MCSubtargetInfo &> Opr[] = {
+static constexpr CustomOperand Operands[] = {
   {{""}},
   {{"HW_REG_MODE"},          ID_MODE},
   {{"HW_REG_STATUS"},        ID_STATUS},
@@ -155,8 +230,13 @@ const CustomOperand<const MCSubtargetInfo &> Opr[] = {
 // clang-format on
 // NOLINTEND
 
-const int OPR_SIZE = static_cast<int>(
-    sizeof(Opr) / sizeof(CustomOperand<const MCSubtargetInfo &>));
+int64_t getHwregId(StringRef Name, const MCSubtargetInfo &STI) {
+  return getEncodingFromOperandTable(Operands, Name, STI);
+}
+
+StringRef getHwreg(uint64_t Encoding, const MCSubtargetInfo &STI) {
+  return getNameFromOperandTable(Operands, Encoding, STI);
+}
 
 } // namespace Hwreg
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index 054e35e90f2f..069134a7ae7f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -25,10 +25,10 @@ const int OPR_ID_UNSUPPORTED = -2;
 const int OPR_ID_DUPLICATE = -3;
 const int OPR_VAL_INVALID = -4;
 
-template <class T> struct CustomOperand {
+struct CustomOperand {
   StringLiteral Name;
-  int Encoding = 0;
-  bool (*Cond)(T Context) = nullptr;
+  unsigned Encoding = 0;
+  bool (*Cond)(const MCSubtargetInfo &STI) = nullptr;
 };
 
 struct CustomOperandVal {
@@ -60,20 +60,34 @@ extern const int DEP_CTR_SIZE;
 
 } // namespace DepCtr
 
-namespace SendMsg { // Symbolic names for the sendmsg(...) syntax.
+// Symbolic names for the sendmsg(msg_id, operation, stream) syntax.
+namespace SendMsg {
+
+/// Map from a symbolic name for a msg_id to the message portion of the
+/// immediate encoding. A negative return value indicates that the Name was
+/// unknown or unsupported on this target.
+int64_t getMsgId(StringRef Name, const MCSubtargetInfo &STI);
+
+/// Map from an encoding to the symbolic name for a msg_id immediate. This is
+/// doing opposite of getMsgId().
+StringRef getMsgName(uint64_t Encoding, const MCSubtargetInfo &STI);
 
-extern const CustomOperand<const MCSubtargetInfo &> Msg[];
-extern const int MSG_SIZE;
+/// Map from a symbolic name for a sendmsg operation to the operation portion of
+/// the immediate encoding. A negative return value indicates that the Name was
+/// unknown or unsupported on this target.
+int64_t getMsgOpId(int64_t MsgId, StringRef Name, const MCSubtargetInfo &STI);
 
-extern const char *const OpSysSymbolic[OP_SYS_LAST_];
-extern const char *const OpGsSymbolic[OP_GS_LAST_];
+/// Map from an encoding to the symbolic name for a sendmsg operation. This is
+/// doing opposite of getMsgOpId().
+StringRef getMsgOpName(int64_t MsgId, uint64_t Encoding,
+                       const MCSubtargetInfo &STI);
 
 } // namespace SendMsg
 
 namespace Hwreg { // Symbolic names for the hwreg(...) syntax.
 
-extern const CustomOperand<const MCSubtargetInfo &> Opr[];
-extern const int OPR_SIZE;
+int64_t getHwregId(StringRef Name, const MCSubtargetInfo &STI);
+StringRef getHwreg(uint64_t Encoding, const MCSubtargetInfo &STI);
 
 } // namespace Hwreg
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4e0074451aa5..2beaf903542b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1129,12 +1129,45 @@ unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
 
 unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
                                       unsigned NumVGPRs) {
-  unsigned MaxWaves = getMaxWavesPerEU(STI);
-  unsigned Granule = getVGPRAllocGranule(STI);
+  return getNumWavesPerEUWithNumVGPRs(NumVGPRs, getVGPRAllocGranule(STI),
+                                      getMaxWavesPerEU(STI),
+                                      getTotalNumVGPRs(STI));
+}
+
+unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
+                                      unsigned MaxWaves,
+                                      unsigned TotalNumVGPRs) {
   if (NumVGPRs < Granule)
     return MaxWaves;
   unsigned RoundedRegs = alignTo(NumVGPRs, Granule);
-  return std::min(std::max(getTotalNumVGPRs(STI) / RoundedRegs, 1u), MaxWaves);
+  return std::min(std::max(TotalNumVGPRs / RoundedRegs, 1u), MaxWaves);
+}
+
+unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
+                                  AMDGPUSubtarget::Generation Gen) {
+  if (Gen >= AMDGPUSubtarget::GFX10)
+    return MaxWaves;
+
+  if (Gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    if (SGPRs <= 80)
+      return 10;
+    if (SGPRs <= 88)
+      return 9;
+    if (SGPRs <= 100)
+      return 8;
+    return 7;
+  }
+  if (SGPRs <= 48)
+    return 10;
+  if (SGPRs <= 56)
+    return 9;
+  if (SGPRs <= 64)
+    return 8;
+  if (SGPRs <= 72)
+    return 7;
+  if (SGPRs <= 80)
+    return 6;
+  return 5;
 }
 
 unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
@@ -1496,62 +1529,6 @@ unsigned encodeStorecntDscnt(const IsaVersion &Version,
 }
 
 //===----------------------------------------------------------------------===//
-// Custom Operands.
-//
-// A table of custom operands shall describe "primary" operand names
-// first followed by aliases if any. It is not required but recommended
-// to arrange operands so that operand encoding match operand position
-// in the table. This will make disassembly a bit more efficient.
-// Unused slots in the table shall have an empty name.
-//
-//===----------------------------------------------------------------------===//
-
-template <class T>
-static bool isValidOpr(int Idx, const CustomOperand<T> OpInfo[], int OpInfoSize,
-                       T Context) {
-  return 0 <= Idx && Idx < OpInfoSize && !OpInfo[Idx].Name.empty() &&
-         (!OpInfo[Idx].Cond || OpInfo[Idx].Cond(Context));
-}
-
-template <class T>
-static int getOprIdx(std::function<bool(const CustomOperand<T> &)> Test,
-                     const CustomOperand<T> OpInfo[], int OpInfoSize,
-                     T Context) {
-  int InvalidIdx = OPR_ID_UNKNOWN;
-  for (int Idx = 0; Idx < OpInfoSize; ++Idx) {
-    if (Test(OpInfo[Idx])) {
-      if (!OpInfo[Idx].Cond || OpInfo[Idx].Cond(Context))
-        return Idx;
-      InvalidIdx = OPR_ID_UNSUPPORTED;
-    }
-  }
-  return InvalidIdx;
-}
-
-template <class T>
-static int getOprIdx(const StringRef Name, const CustomOperand<T> OpInfo[],
-                     int OpInfoSize, T Context) {
-  auto Test = [=](const CustomOperand<T> &Op) { return Op.Name == Name; };
-  return getOprIdx<T>(Test, OpInfo, OpInfoSize, Context);
-}
-
-template <class T>
-static int getOprIdx(int Id, const CustomOperand<T> OpInfo[], int OpInfoSize,
-                     T Context, bool QuickCheck = true) {
-  auto Test = [=](const CustomOperand<T> &Op) {
-    return Op.Encoding == Id && !Op.Name.empty();
-  };
-  // This is an optimization that should work in most cases.
-  // As a side effect, it may cause selection of an alias
-  // instead of a primary operand name in case of sparse tables.
-  if (QuickCheck && isValidOpr<T>(Id, OpInfo, OpInfoSize, Context) &&
-      OpInfo[Id].Encoding == Id) {
-    return Id;
-  }
-  return getOprIdx<T>(Test, OpInfo, OpInfoSize, Context);
-}
-
-//===----------------------------------------------------------------------===//
 // Custom Operand Values
 //===----------------------------------------------------------------------===//
 
@@ -1702,24 +1679,6 @@ unsigned encodeFieldSaSdst(unsigned SaSdst) {
 } // namespace DepCtr
 
 //===----------------------------------------------------------------------===//
-// hwreg
-//===----------------------------------------------------------------------===//
-
-namespace Hwreg {
-
-int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI) {
-  int Idx = getOprIdx<const MCSubtargetInfo &>(Name, Opr, OPR_SIZE, STI);
-  return (Idx < 0) ? Idx : Opr[Idx].Encoding;
-}
-
-StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) {
-  int Idx = getOprIdx<const MCSubtargetInfo &>(Id, Opr, OPR_SIZE, STI);
-  return (Idx < 0) ? "" : Opr[Idx].Name;
-}
-
-} // namespace Hwreg
-
-//===----------------------------------------------------------------------===//
 // exp tgt
 //===----------------------------------------------------------------------===//
 
@@ -1919,32 +1878,10 @@ static uint64_t getMsgIdMask(const MCSubtargetInfo &STI) {
   return isGFX11Plus(STI) ? ID_MASK_GFX11Plus_ : ID_MASK_PreGFX11_;
 }
 
-int64_t getMsgId(const StringRef Name, const MCSubtargetInfo &STI) {
-  int Idx = getOprIdx<const MCSubtargetInfo &>(Name, Msg, MSG_SIZE, STI);
-  return (Idx < 0) ? Idx : Msg[Idx].Encoding;
-}
-
 bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI) {
   return (MsgId & ~(getMsgIdMask(STI))) == 0;
 }
 
-StringRef getMsgName(int64_t MsgId, const MCSubtargetInfo &STI) {
-  int Idx = getOprIdx<const MCSubtargetInfo &>(MsgId, Msg, MSG_SIZE, STI);
-  return (Idx < 0) ? "" : Msg[Idx].Name;
-}
-
-int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
-  const char* const *S = (MsgId == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic;
-  const int F = (MsgId == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_;
-  const int L = (MsgId == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_;
-  for (int i = F; i < L; ++i) {
-    if (Name == S[i]) {
-      return i;
-    }
-  }
-  return OP_UNKNOWN_;
-}
-
 bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
                   bool Strict) {
   assert(isValidMsgId(MsgId, STI));
@@ -1952,23 +1889,14 @@ bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
   if (!Strict)
     return 0 <= OpId && isUInt<OP_WIDTH_>(OpId);
 
-  if (MsgId == ID_SYSMSG)
-    return OP_SYS_FIRST_ <= OpId && OpId < OP_SYS_LAST_;
-  if (!isGFX11Plus(STI)) {
-    switch (MsgId) {
-    case ID_GS_PreGFX11:
-      return (OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_) && OpId != OP_GS_NOP;
-    case ID_GS_DONE_PreGFX11:
-      return OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_;
-    }
+  if (msgRequiresOp(MsgId, STI)) {
+    if (MsgId == ID_GS_PreGFX11 && OpId == OP_GS_NOP)
+      return false;
+
+    return !getMsgOpName(MsgId, OpId, STI).empty();
   }
-  return OpId == OP_NONE_;
-}
 
-StringRef getMsgOpName(int64_t MsgId, int64_t OpId,
-                       const MCSubtargetInfo &STI) {
-  assert(msgRequiresOp(MsgId, STI));
-  return (MsgId == ID_SYSMSG)? OpSysSymbolic[OpId] : OpGsSymbolic[OpId];
+  return OpId == OP_NONE_;
 }
 
 bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId,
@@ -2186,6 +2114,8 @@ bool isGFX9Plus(const MCSubtargetInfo &STI) {
   return isGFX9(STI) || isGFX10Plus(STI);
 }
 
+bool isNotGFX9Plus(const MCSubtargetInfo &STI) { return !isGFX9Plus(STI); }
+
 bool isGFX10(const MCSubtargetInfo &STI) {
   return STI.hasFeature(AMDGPU::FeatureGFX10);
 }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 943588fe701c..fc4147df76e3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
 #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
 
+#include "AMDGPUSubtarget.h"
 #include "SIDefines.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/InstrTypes.h"
@@ -311,6 +312,17 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
                                       unsigned NumVGPRs);
 
+/// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
+/// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
+unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
+                                      unsigned MaxWaves,
+                                      unsigned TotalNumVGPRs);
+
+/// \returns Occupancy for a given \p SGPRs usage, \p MaxWaves possible, and \p
+/// Gen.
+unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
+                                  AMDGPUSubtarget::Generation Gen);
+
 /// \returns Number of VGPR blocks needed for given subtarget \p STI when
 /// \p NumVGPRs are used. We actually return the number of blocks -1, since
 /// that's what we encode.
@@ -1078,12 +1090,6 @@ struct HwregSize : EncodingField<15, 11, 32> {
 
 using HwregEncoding = EncodingFields<HwregId, HwregOffset, HwregSize>;
 
-LLVM_READONLY
-int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI);
-
-LLVM_READNONE
-StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI);
-
 } // namespace Hwreg
 
 namespace DepCtr {
@@ -1173,18 +1179,6 @@ unsigned getDefaultFormatEncoding(const MCSubtargetInfo &STI);
 
 namespace SendMsg {
 
-LLVM_READONLY
-int64_t getMsgId(const StringRef Name, const MCSubtargetInfo &STI);
-
-LLVM_READONLY
-int64_t getMsgOpId(int64_t MsgId, const StringRef Name);
-
-LLVM_READNONE
-StringRef getMsgName(int64_t MsgId, const MCSubtargetInfo &STI);
-
-LLVM_READNONE
-StringRef getMsgOpName(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI);
-
 LLVM_READNONE
 bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI);
 
@@ -1276,6 +1270,7 @@ bool isGFX9_GFX10_GFX11(const MCSubtargetInfo &STI);
 bool isGFX8_GFX9_GFX10(const MCSubtargetInfo &STI);
 bool isGFX8Plus(const MCSubtargetInfo &STI);
 bool isGFX9Plus(const MCSubtargetInfo &STI);
+bool isNotGFX9Plus(const MCSubtargetInfo &STI);
 bool isGFX10(const MCSubtargetInfo &STI);
 bool isGFX10_GFX11(const MCSubtargetInfo &STI);
 bool isGFX10Plus(const MCSubtargetInfo &STI);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
index 79c359a57554..239e0ee70572 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -9,13 +9,16 @@
 #include "AMDGPUMemoryUtils.h"
 #include "AMDGPU.h"
 #include "AMDGPUBaseInfo.h"
+#include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/Operator.h"
 #include "llvm/IR/ReplaceConstant.h"
 
 #define DEBUG_TYPE "amdgpu-memory-utils"
@@ -26,7 +29,7 @@ namespace llvm {
 
 namespace AMDGPU {
 
-Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
+Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
   return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                        GV->getValueType());
 }
@@ -61,6 +64,216 @@ bool isLDSVariableToLower(const GlobalVariable &GV) {
   return true;
 }
 
+bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
+  // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
+  // global may have uses from multiple different functions as a result.
+  // This pass specialises LDS variables with respect to the kernel that
+  // allocates them.
+
+  // This is semantically equivalent to (the unimplemented as slow):
+  // for (auto &F : M.functions())
+  //   for (auto &BB : F)
+  //     for (auto &I : BB)
+  //       for (Use &Op : I.operands())
+  //         if (constantExprUsesLDS(Op))
+  //           replaceConstantExprInFunction(I, Op);
+
+  SmallVector<Constant *> LDSGlobals;
+  for (auto &GV : M.globals())
+    if (AMDGPU::isLDSVariableToLower(GV))
+      LDSGlobals.push_back(&GV);
+  return convertUsersOfConstantsToInstructions(LDSGlobals);
+}
+
+void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
+                            FunctionVariableMap &kernels,
+                            FunctionVariableMap &Functions) {
+  // Get uses from the current function, excluding uses by called Functions
+  // Two output variables to avoid walking the globals list twice
+  for (auto &GV : M.globals()) {
+    if (!AMDGPU::isLDSVariableToLower(GV))
+      continue;
+    for (User *V : GV.users()) {
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        Function *F = I->getFunction();
+        if (isKernelLDS(F))
+          kernels[F].insert(&GV);
+        else
+          Functions[F].insert(&GV);
+      }
+    }
+  }
+}
+
+bool isKernelLDS(const Function *F) {
+  // Some weirdness here. AMDGPU::isKernelCC does not call into
+  // AMDGPU::isKernel with the calling conv, it instead calls into
+  // isModuleEntryFunction which returns true for more calling conventions
+  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
+  // There's also a test that checks that the LDS lowering does not hit on
+  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
+  // Putting LDS in the name of the function to draw attention to this.
+  return AMDGPU::isKernel(F->getCallingConv());
+}
+
+LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
+
+  FunctionVariableMap DirectMapKernel;
+  FunctionVariableMap DirectMapFunction;
+  getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);
+
+  // Collect variables that are used by functions whose address has escaped
+  DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
+  for (Function &F : M.functions()) {
+    if (!isKernelLDS(&F))
+      if (F.hasAddressTaken(nullptr,
+                            /* IgnoreCallbackUses */ false,
+                            /* IgnoreAssumeLikeCalls */ false,
+                            /* IgnoreLLVMUsed */ true,
+                            /* IgnoreArcAttachedCall */ false)) {
+        set_union(VariablesReachableThroughFunctionPointer,
+                  DirectMapFunction[&F]);
+      }
+  }
+
+  auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
+    assert(!F->isDeclaration());
+    for (const CallGraphNode::CallRecord &R : *CG[F]) {
+      if (!R.second->getFunction())
+        return true;
+    }
+    return false;
+  };
+
+  // Work out which variables are reachable through function calls
+  FunctionVariableMap TransitiveMapFunction = DirectMapFunction;
+
+  // If the function makes any unknown call, assume the worst case that it can
+  // access all variables accessed by functions whose address escaped
+  for (Function &F : M.functions()) {
+    if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
+      if (!isKernelLDS(&F)) {
+        set_union(TransitiveMapFunction[&F],
+                  VariablesReachableThroughFunctionPointer);
+      }
+    }
+  }
+
+  // Direct implementation of collecting all variables reachable from each
+  // function
+  for (Function &Func : M.functions()) {
+    if (Func.isDeclaration() || isKernelLDS(&Func))
+      continue;
+
+    DenseSet<Function *> seen; // catches cycles
+    SmallVector<Function *, 4> wip = {&Func};
+
+    while (!wip.empty()) {
+      Function *F = wip.pop_back_val();
+
+      // Can accelerate this by referring to transitive map for functions that
+      // have already been computed, with more care than this
+      set_union(TransitiveMapFunction[&Func], DirectMapFunction[F]);
+
+      for (const CallGraphNode::CallRecord &R : *CG[F]) {
+        Function *Ith = R.second->getFunction();
+        if (Ith) {
+          if (!seen.contains(Ith)) {
+            seen.insert(Ith);
+            wip.push_back(Ith);
+          }
+        }
+      }
+    }
+  }
+
+  // DirectMapKernel lists which variables are used by the kernel
+  // find the variables which are used through a function call
+  FunctionVariableMap IndirectMapKernel;
+
+  for (Function &Func : M.functions()) {
+    if (Func.isDeclaration() || !isKernelLDS(&Func))
+      continue;
+
+    for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
+      Function *Ith = R.second->getFunction();
+      if (Ith) {
+        set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
+      } else {
+        set_union(IndirectMapKernel[&Func],
+                  VariablesReachableThroughFunctionPointer);
+      }
+    }
+  }
+
+  // Verify that we fall into one of 2 cases:
+  //    - All variables are absolute: this is a re-run of the pass
+  //      so we don't have anything to do.
+  //    - No variables are absolute.
+  std::optional<bool> HasAbsoluteGVs;
+  for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
+    for (auto &[Fn, GVs] : Map) {
+      for (auto *GV : GVs) {
+        bool IsAbsolute = GV->isAbsoluteSymbolRef();
+        if (HasAbsoluteGVs.has_value()) {
+          if (*HasAbsoluteGVs != IsAbsolute) {
+            report_fatal_error(
+                "Module cannot mix absolute and non-absolute LDS GVs");
+          }
+        } else
+          HasAbsoluteGVs = IsAbsolute;
+      }
+    }
+  }
+
+  // If we only had absolute GVs, we have nothing to do, return an empty
+  // result.
+  if (HasAbsoluteGVs && *HasAbsoluteGVs)
+    return {FunctionVariableMap(), FunctionVariableMap()};
+
+  return {std::move(DirectMapKernel), std::move(IndirectMapKernel)};
+}
+
+void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
+                               StringRef FnAttr) {
+  KernelRoot->removeFnAttr(FnAttr);
+
+  SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
+  SmallPtrSet<Function *, 8> Visited;
+  bool SeenUnknownCall = false;
+
+  while (!WorkList.empty()) {
+    Function *F = WorkList.pop_back_val();
+
+    for (auto &CallRecord : *CG[F]) {
+      if (!CallRecord.second)
+        continue;
+
+      Function *Callee = CallRecord.second->getFunction();
+      if (!Callee) {
+        if (!SeenUnknownCall) {
+          SeenUnknownCall = true;
+
+          // If we see any indirect calls, assume nothing about potential
+          // targets.
+          // TODO: This could be refined to possible LDS global users.
+          for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
+            Function *PotentialCallee =
+                ExternalCallRecord.second->getFunction();
+            assert(PotentialCallee);
+            if (!isKernelLDS(PotentialCallee))
+              PotentialCallee->removeFnAttr(FnAttr);
+          }
+        }
+      } else {
+        Callee->removeFnAttr(FnAttr);
+        if (Visited.insert(Callee).second)
+          WorkList.push_back(Callee);
+      }
+    }
+  }
+}
+
 bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
   Instruction *DefInst = Def->getMemoryInst();
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
index e42b27f8e09e..4d3ad328e131 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -9,6 +9,9 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
 #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
 
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+
 namespace llvm {
 
 struct Align;
@@ -19,14 +22,40 @@ class LoadInst;
 class MemoryDef;
 class MemorySSA;
 class Value;
+class Function;
+class CallGraph;
+class Module;
 
 namespace AMDGPU {
 
-Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
+using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
+using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;
+
+Align getAlign(const DataLayout &DL, const GlobalVariable *GV);
 
 bool isDynamicLDS(const GlobalVariable &GV);
 bool isLDSVariableToLower(const GlobalVariable &GV);
 
+struct LDSUsesInfoTy {
+  FunctionVariableMap direct_access;
+  FunctionVariableMap indirect_access;
+};
+
+bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M);
+
+void getUsesOfLDSByFunction(const CallGraph &CG, Module &M,
+                            FunctionVariableMap &kernels,
+                            FunctionVariableMap &functions);
+
+bool isKernelLDS(const Function *F);
+
+LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M);
+
+/// Strip FnAttr attribute from any functions where we may have
+/// introduced its use.
+void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot,
+                               StringRef FnAttr);
+
 /// Given a \p Def clobbering a load from \p Ptr according to the MSSA check
 /// if this is actually a memory update or an artificial clobber to facilitate
 /// ordering constraints.
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 2341e0d9d32b..4a56fad0cd60 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -143,14 +143,14 @@ multiclass VOP1Inst <string opName, VOPProfile P,
       def _e64_dpp  : VOP3_DPP_Pseudo <opName, P>;
   } // End SubtargetPredicate = isGFX11Plus
 
-  def : MnemonicAlias<opName#"_e32", opName>, LetDummies;
-  def : MnemonicAlias<opName#"_e64", opName>, LetDummies;
+  def : LetDummies, AMDGPUMnemonicAlias<opName#"_e32", opName>;
+  def : LetDummies, AMDGPUMnemonicAlias<opName#"_e64", opName>;
 
   if P.HasExtSDWA then
-    def : MnemonicAlias<opName#"_sdwa", opName>, LetDummies;
+    def : LetDummies, AMDGPUMnemonicAlias<opName#"_sdwa", opName>;
 
   if P.HasExtDPP then
-    def : MnemonicAlias<opName#"_dpp", opName, AMDGPUAsmVariants.DPP>, LetDummies;
+    def : LetDummies, AMDGPUMnemonicAlias<opName#"_dpp", opName, AMDGPUAsmVariants.DPP>;
 }
 
 multiclass VOP1Inst_t16<string opName,
@@ -625,42 +625,44 @@ def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F <v2f32, i32> {
   let HasExtVOP3DPP = 0;
 }
 
-def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, untyped, untyped]> {
-  let HasOpSel = 1;
+class VOPProfile_Base_CVT_F_F8_ByteSel<ValueType DstVT> : VOPProfile<[DstVT, i32, untyped, untyped]> {
+  let IsFP8SrcByteSel = 1;
+  let HasOpSel = 0;
   let HasExtDPP = 1;
   let HasExtVOP3DPP = 1;
-  let IsFP8 = 1;
+  let HasExtSDWA = 0;
   let HasClamp = 0;
   let HasOMod = 0;
-  let HasModifiers = 1;
-  let Src1VOP3DPP = Src1RC64;
+  let HasModifiers = 0;
+
+  defvar bytesel = (ins ByteSel:$byte_sel);
+  let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+                            HasClamp, HasModifiers, HasSrc2Mods,
+                            HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
+                   bytesel);
+  let InsVOP3Base = !con(getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP, Src2VOP3DPP,
+                                        NumSrcArgs, HasClamp, HasModifiers, HasSrc2Mods,
+                                        HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
+                                        Src2ModVOP3DPP, HasOpSel>.ret,
+                         bytesel);
 }
 
 let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
     mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
-  defm V_CVT_F32_FP8_OP_SEL    : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>;
-  defm V_CVT_F32_BF8_OP_SEL    : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>;
+  defm V_CVT_F32_FP8_OP_SEL    : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
+  defm V_CVT_F32_BF8_OP_SEL    : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
   defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
   defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
 }
 
-class Cvt_F32_F8_Pat_OpSel<SDPatternOperator node, bits<2> index,
-    VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat<
-    (f32 (node i32:$src, index)),
-    !if (index,
-         (inst_e64 !or(!if(index{0}, SRCMODS.OP_SEL_1, 0),
-                       !if(index{1}, SRCMODS.OP_SEL_0, 0)),
-                    $src, 0),
-         (inst_e32 $src))
+class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat<
+  (node i32:$src0, timm:$byte_sel),
+  (inst $src0, (as_i32timm $byte_sel))
 >;
 
 let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in {
-  foreach Index = [0, 1, 2, 3] in {
-    def : Cvt_F32_F8_Pat_OpSel<int_amdgcn_cvt_f32_fp8, Index,
-                               V_CVT_F32_FP8_e32, V_CVT_F32_FP8_OP_SEL_e64>;
-    def : Cvt_F32_F8_Pat_OpSel<int_amdgcn_cvt_f32_bf8, Index,
-                               V_CVT_F32_BF8_e32, V_CVT_F32_BF8_OP_SEL_e64>;
-  }
+  def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>;
+  def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>;
 }
 
 class Cvt_PK_F32_F8_Pat_OpSel<SDPatternOperator node, int index,
@@ -858,8 +860,9 @@ multiclass VOP1_Real_NO_VOP3_with_name_gfx11<bits<9> op, string opName,
               VOP1_Real_dpp_with_name<GFX11Gen, op, opName, asmName>,
               VOP1_Real_dpp8_with_name<GFX11Gen, op, opName, asmName>;
   defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
-  def gfx11_alias : MnemonicAlias<ps.Mnemonic, asmName>,
-                    Requires<[isGFX11Plus]>;
+  def gfx11_alias : AMDGPUMnemonicAlias<ps.Mnemonic, asmName> {
+    let AssemblerPredicate = isGFX11Plus;
+  }
 }
 
 multiclass VOP1_Real_NO_VOP3_with_name_gfx12<bits<9> op, string opName,
@@ -901,14 +904,11 @@ multiclass VOP1_Real_NO_DPP_OP_SEL_with_name<GFXGen Gen, bits<9> op,
   VOP3_Real_with_name<Gen, {0, 1, 1, op{6-0}}, opName, asmName>;
 
 
+defm V_CVT_F32_FP8      : VOP1_Real_FULL_with_name<GFX12Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
+defm V_CVT_F32_BF8      : VOP1_Real_FULL_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;
+
 // Define VOP1 instructions using the pseudo instruction with its old profile and
 // VOP3 using the OpSel profile for the pseudo instruction.
-defm V_CVT_F32_FP8      : VOP1_Real_NO_VOP3_with_name_gfx12<0x06c, "V_CVT_F32_FP8", "v_cvt_f32_fp8">;
-defm V_CVT_F32_FP8      : VOP1_Realtriple_e64_with_name<GFX12Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
-
-defm V_CVT_F32_BF8      : VOP1_Real_NO_VOP3_with_name_gfx12<0x06d, "V_CVT_F32_BF8", "v_cvt_f32_bf8">;
-defm V_CVT_F32_BF8      : VOP1_Realtriple_e64_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;
-
 defm V_CVT_PK_F32_FP8   : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8", "v_cvt_pk_f32_fp8">;
 defm V_CVT_PK_F32_FP8   : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_OP_SEL", "v_cvt_pk_f32_fp8">;
 
@@ -1341,7 +1341,8 @@ def : GCNPat <
                        (as_i1timm $bound_ctrl))
 >;
 
-class UpdateDPPPat<ValueType vt> : GCNPat <
+foreach vt = Reg32Types.types in {
+def : GCNPat <
   (vt (int_amdgcn_update_dpp vt:$old, vt:$src, timm:$dpp_ctrl,
                               timm:$row_mask, timm:$bank_mask,
                               timm:$bound_ctrl)),
@@ -1349,11 +1350,7 @@ class UpdateDPPPat<ValueType vt> : GCNPat <
                  (as_i32timm $row_mask), (as_i32timm $bank_mask),
                  (as_i1timm $bound_ctrl))
 >;
-
-def : UpdateDPPPat<i32>;
-def : UpdateDPPPat<f32>;
-def : UpdateDPPPat<v2i16>;
-def : UpdateDPPPat<v2f16>;
+}
 
 } // End OtherPredicates = [isGFX8Plus]
 
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index c001c5de81e0..d2af1753d550 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1510,7 +1510,9 @@ multiclass VOP2_Real_NO_VOP3_with_name<GFXGen Gen, bits<6> op, string opName,
               VOP2_Real_dpp_with_name<Gen, op, opName, asmName>,
               VOP2_Real_dpp8_with_name<Gen, op, opName, asmName>;
   defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
-  def Gen.Suffix#"_alias" : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[Gen.AssemblerPredicate]>;
+  def Gen.Suffix#"_alias" : AMDGPUMnemonicAlias<ps.Mnemonic, asmName> {
+    let AssemblerPredicate = Gen.AssemblerPredicate;
+  }
 }
 
 multiclass VOP2_Real_FULL_with_name<GFXGen Gen, bits<6> op, string opName,
@@ -1523,13 +1525,17 @@ multiclass VOP2_Real_NO_DPP_with_name<GFXGen Gen, bits<6> op, string opName,
   defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName>,
               VOP2_Real_e64_with_name<Gen, op, opName, asmName>;
   defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
-  def Gen.Suffix#"_alias" : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[Gen.AssemblerPredicate]>;
+  def Gen.Suffix#"_alias" : AMDGPUMnemonicAlias<ps.Mnemonic, asmName> {
+    let AssemblerPredicate = Gen.AssemblerPredicate;
+  }
 }
 
 multiclass VOP2_Real_NO_DPP_with_alias<GFXGen Gen, bits<6> op, string alias> {
   defm NAME : VOP2_Real_e32<Gen, op>,
               VOP2_Real_e64<Gen, op>;
-  def Gen.Suffix#"_alias" : MnemonicAlias<alias, NAME>, Requires<[Gen.AssemblerPredicate]>;
+  def Gen.Suffix#"_alias" : AMDGPUMnemonicAlias<alias, NAME> {
+    let AssemblerPredicate = Gen.AssemblerPredicate;
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -1550,7 +1556,9 @@ multiclass VOP2_Real_FULL_with_name_gfx12<bits<6> op, string opName,
 multiclass VOP2_Real_FULL_t16_with_name_gfx12<bits<6> op, string opName,
                                               string asmName, string alias> {
   defm NAME : VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
-  def _gfx12_2nd_alias : MnemonicAlias<alias, asmName>, Requires<[isGFX12Only]>;
+  def _gfx12_2nd_alias : AMDGPUMnemonicAlias<alias, asmName> {
+    let AssemblerPredicate = isGFX12Only;
+  }
 }
 
 multiclass VOP2_Real_NO_DPP_with_name_gfx12<bits<6> op, string opName,
@@ -1609,7 +1617,9 @@ multiclass VOP2_Real_NO_VOP3_with_name_gfx11<bits<6> op, string opName,
               VOP2_Real_dpp_with_name<GFX11Gen, op, opName, asmName>,
               VOP2_Real_dpp8_with_name<GFX11Gen, op, opName, asmName>;
   defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
-  def _gfx11_alias : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Only]>;
+  def _gfx11_alias : AMDGPUMnemonicAlias<ps.Mnemonic, asmName> {
+    let AssemblerPredicate = isGFX11Only;
+  }
 }
 
 multiclass VOP2_Real_NO_DPP_with_name_gfx11<bits<6> op, string opName,
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index a7d63fdb2e04..c3bdbbfc3846 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -110,8 +110,8 @@ defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I1
 defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
 
 let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
-defm V_PK_MAXIMUM_F16 : VOP3PInst<"v_pk_maximum_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fmaximum>;
-defm V_PK_MINIMUM_F16 : VOP3PInst<"v_pk_minimum_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fminimum>;
+defm V_PK_MAXIMUM_F16 : VOP3PInst<"v_pk_maximum_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16, VOP3_PACKED>, fmaximum>;
+defm V_PK_MINIMUM_F16 : VOP3PInst<"v_pk_minimum_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16, VOP3_PACKED>, fminimum>;
 } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
 }
 
@@ -814,8 +814,8 @@ let isCommutable = 1, isReMaterializable = 1 in {
   defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
 } // End isCommutable = 1, isReMaterializable = 1
 
-def : MnemonicAlias<"v_accvgpr_read",  "v_accvgpr_read_b32">;
-def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
+def : AMDGPUMnemonicAlias<"v_accvgpr_read",  "v_accvgpr_read_b32">;
+def : AMDGPUMnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
 
 class VOPProfileWMMA<VOPProfile P, string Suffix, RegisterOperand _Src01RC64, bit _HasClamp, bit _HasOpSel> : VOP3P_Profile<P> {
   let DstRC = !if(!eq(Suffix, "_w32"), VDst_256, VDst_128);
@@ -1481,8 +1481,11 @@ multiclass VOP3P_Real_with_name<GFXGen Gen, bits<7> op,
   let AsmString = asmName # ps.AsmOperands in
     def Gen.Suffix :
       VOP3P_Real_Gen<!cast<VOP3P_Pseudo>(backing_ps_name), Gen, asmName>,
-      VOP3Pe_gfx11_gfx12<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>,
-      MnemonicAlias<ps.Mnemonic, asmName>, Requires<[Gen.AssemblerPredicate]>;
+      VOP3Pe_gfx11_gfx12<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>;
+
+  def : AMDGPUMnemonicAlias<ps.Mnemonic, asmName> {
+    let AssemblerPredicate = Gen.AssemblerPredicate;
+  }
 }
 
 multiclass VOP3P_Real_dpp<GFXGen Gen, bits<7> op, string backing_ps_name = NAME,
@@ -1661,7 +1664,9 @@ multiclass VOP3P_Real_SMFMAC<bits<7> op, string alias> {
     let AssemblerPredicate = isGFX940Plus;
     let DecoderNamespace = "GFX8";
   }
-  def : MnemonicAlias<alias, !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic>;
+  def : AMDGPUMnemonicAlias<alias, !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> {
+    let AssemblerPredicate = isGFX940Plus;
+  }
 }
 
 let SubtargetPredicate = isGFX8GFX9 in {
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index a0d666b39b2b..ddd6d8b074aa 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1389,17 +1389,12 @@ multiclass VOPC_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
   defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_e32");
   defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_e64");
   let AssemblerPredicate = Gen.AssemblerPredicate in {
-    // MnemonicAlias and GCNPredicateControl both define the field Predicates,
-    // so GCNPredicateControl must come after MnemonicAlias because it contains
-    // the predicates we actually want.
-    def : MnemonicAlias<!if(!empty(pseudo_mnemonic), ps32.Mnemonic,
+    def : AMDGPUMnemonicAlias<!if(!empty(pseudo_mnemonic), ps32.Mnemonic,
                                                      pseudo_mnemonic),
-                        asm_name, ps32.AsmVariantName>,
-          GCNPredicateControl;
-    def : MnemonicAlias<!if(!empty(pseudo_mnemonic), ps64.Mnemonic,
+                              asm_name, ps32.AsmVariantName>;
+    def : AMDGPUMnemonicAlias<!if(!empty(pseudo_mnemonic), ps64.Mnemonic,
                                                      pseudo_mnemonic),
-                        asm_name, ps64.AsmVariantName>,
-          GCNPredicateControl;
+                              asm_name, ps64.AsmVariantName>;
 
     let DecoderNamespace = Gen.DecoderNamespace in {
       def _e32#Gen.Suffix :
@@ -1523,17 +1518,12 @@ multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
   defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_nosdst_e32");
   defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_nosdst_e64");
   let AssemblerPredicate = Gen.AssemblerPredicate in {
-    // MnemonicAlias and GCNPredicateControl both define the field Predicates,
-    // so GCNPredicateControl must come after MnemonicAlias because it contains
-    // the predicates we actually want.
-    def : MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps32.Mnemonic),
+    def : AMDGPUMnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps32.Mnemonic),
                                                      pseudo_mnemonic),
-                        asm_name, ps32.AsmVariantName>,
-          GCNPredicateControl;
-    def : MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps64.Mnemonic),
+                              asm_name, ps32.AsmVariantName>;
+    def : AMDGPUMnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps64.Mnemonic),
                                                      pseudo_mnemonic),
-                        asm_name, ps64.AsmVariantName>,
-          GCNPredicateControl;
+                              asm_name, ps64.AsmVariantName>;
 
     let DecoderNamespace = Gen.DecoderNamespace in {
       def _e32#Gen.Suffix
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 7cdb5cbfe297..f45ab9bf46db 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -306,9 +306,10 @@ class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
 
 class VOP3OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>;
 
-class VOP3FP8OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
-  let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0);
-  let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0);
+class VOP3FP8OpSel_src_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
+  bits<2> byte_sel;
+  let Inst{11-12} = byte_sel; // NB: bit order is intentionally reversed!
+  let Inst{14-13} = 0;  // op_sel2/3
 }
 
  class VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
@@ -755,10 +756,14 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
   let Inst{9}     = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
   let Inst{10}    = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
   // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
-  let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?);
-  let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?);
-  let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),!if(P.IsFP8DstByteSel, byte_sel{0}, ?));
-  let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),!if(P.IsFP8DstByteSel, byte_sel{1}, ?));
+  let Inst{11} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{2}, 0),
+                                 !if(P.IsFP8SrcByteSel, byte_sel{1}, ?));
+  let Inst{12} = !if(P.HasOpSel, !if(P.HasSrc1Mods, src1_modifiers{2}, 0),
+                                 !if(P.IsFP8SrcByteSel, byte_sel{0}, ?));
+  let Inst{13} = !if(P.HasOpSel, !if(P.HasSrc2Mods, src2_modifiers{2}, 0),
+                                 !if(P.IsFP8DstByteSel, byte_sel{0}, ?));
+  let Inst{14} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{3}, 0),
+                                 !if(P.IsFP8DstByteSel, byte_sel{1}, ?));
   let Inst{15}    = !if(P.HasClamp, clamp, 0);
   let Inst{25-16} = op;
   let Inst{31-26} = 0x35;
@@ -1397,11 +1402,15 @@ multiclass VOP3_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
                           bit isSingle = 0> {
   defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
   let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
-    if ps.Pfl.IsFP8DstByteSel then {
+    if ps.Pfl.IsFP8SrcByteSel then {
+      def _e64#Gen.Suffix :
+        VOP3_Real_Gen<ps, Gen>,
+        VOP3FP8OpSel_src_bytesel_gfx11_gfx12<op, ps.Pfl>;
+    } else if ps.Pfl.IsFP8DstByteSel then {
       def _e64#Gen.Suffix :
         VOP3_Real_Gen<ps, Gen>,
         VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
-    } if ps.Pfl.HasOpSel then {
+    } else if ps.Pfl.HasOpSel then {
       def _e64#Gen.Suffix :
         VOP3_Real_Gen<ps, Gen>,
         VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
@@ -1428,10 +1437,10 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
   defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
   let AsmString = asmName # ps.AsmOperands,
       IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
-    if ps.Pfl.IsFP8 then {
+    if ps.Pfl.IsFP8SrcByteSel then {
       def _e64#Gen.Suffix :
         VOP3_Real_Gen<ps, Gen>,
-        VOP3FP8OpSel_gfx11_gfx12<op, ps.Pfl>;
+        VOP3FP8OpSel_src_bytesel_gfx11_gfx12<op, ps.Pfl>;
     } else if ps.Pfl.IsFP8DstByteSel then {
       def _e64#Gen.Suffix :
         VOP3_Real_Gen<ps, Gen>,
@@ -1446,7 +1455,9 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
         VOP3e_gfx11_gfx12<op, ps.Pfl>;
     }
   }
-  def Gen.Suffix#"_VOP3_alias" : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[Gen.AssemblerPredicate]>, LetDummies;
+  def Gen.Suffix#"_VOP3_alias" : LetDummies, AMDGPUMnemonicAlias<ps.Mnemonic, asmName> {
+    let AssemblerPredicate = Gen.AssemblerPredicate;
+  }
 }
 
 // for READLANE/WRITELANE
@@ -1619,8 +1630,10 @@ multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,
       IsSingle = !or(isSingle, ps.Pfl.IsSingle) in
     def _e64_gfx12 :
       VOP3_Real_Gen<ps, GFX12Gen, asmName>,
-      VOP3be_gfx11_gfx12<op, ps.Pfl>,
-      MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX12Only]>;
+      VOP3be_gfx11_gfx12<op, ps.Pfl>;
+  def : AMDGPUMnemonicAlias<ps.Mnemonic, asmName> {
+    let AssemblerPredicate = GFX12Gen.AssemblerPredicate;
+  }
 }
 
 multiclass VOP3_Realtriple_with_name_gfx12<bits<10> op, string opName,
diff --git a/llvm/lib/Target/ARM/ARMArchitectures.td b/llvm/lib/Target/ARM/ARMArchitectures.td
index daf54f457b3b..e1e90cdae188 100644
--- a/llvm/lib/Target/ARM/ARMArchitectures.td
+++ b/llvm/lib/Target/ARM/ARMArchitectures.td
@@ -293,9 +293,8 @@ def ARMv8r    : Architecture<"armv8-r",   "ARMv8r",   [HasV8Ops,
                                                        FeatureDSP,
                                                        FeatureCRC,
                                                        FeatureMP,
-                                                       FeatureVirtualization,
-                                                       FeatureFPARMv8,
-                                                       FeatureNEON]>;
+                                                       FeatureFPARMv8_D16_SP,
+                                                       FeatureVirtualization]>;
 
 def ARMv8mBaseline : Architecture<"armv8-m.base", "ARMv8mBaseline",
                                                       [HasV8MBaselineOps,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index f67a68acbf23..73f8bda9a021 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2366,6 +2366,12 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   bool PreferIndirect = false;
   bool GuardWithBTI = false;
 
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
+
   // Lower 'returns_twice' calls to a pseudo-instruction.
   if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
       !Subtarget->noBTIAtReturnTwice())
@@ -2401,10 +2407,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   }
   if (isTailCall) {
     // Check if it's really possible to do a tail call.
-    isTailCall = IsEligibleForTailCallOptimization(
-        Callee, CallConv, isVarArg, isStructRet,
-        MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
-        PreferIndirect);
+    isTailCall =
+        IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
 
     if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
         CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
@@ -2419,11 +2423,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
     report_fatal_error("failed to perform tail call elimination on a call "
                        "site marked musttail");
-  // Analyze operands of the call, assigning locations to each operand.
-  SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
-                 *DAG.getContext());
-  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getStackSize();
@@ -2985,14 +2984,19 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
 
 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
 /// for tail call optimization. Targets which want to do tail call
-/// optimization should implement this function.
+/// optimization should implement this function. Note that this function also
+/// processes musttail calls, so when this function returns false on a valid
+/// musttail call, a fatal backend error occurs.
 bool ARMTargetLowering::IsEligibleForTailCallOptimization(
-    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
-    bool isCalleeStructRet, bool isCallerStructRet,
-    const SmallVectorImpl<ISD::OutputArg> &Outs,
-    const SmallVectorImpl<SDValue> &OutVals,
-    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
-    const bool isIndirect) const {
+    TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
+    SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
+  CallingConv::ID CalleeCC = CLI.CallConv;
+  SDValue Callee = CLI.Callee;
+  bool isVarArg = CLI.IsVarArg;
+  const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+  const SelectionDAG &DAG = CLI.DAG;
   MachineFunction &MF = DAG.getMachineFunction();
   const Function &CallerF = MF.getFunction();
   CallingConv::ID CallerCC = CallerF.getCallingConv();
@@ -3028,6 +3032,8 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
 
   // Also avoid sibcall optimization if either caller or callee uses struct
   // return semantics.
+  bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
+  bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
   if (isCalleeStructRet || isCallerStructRet)
     return false;
 
@@ -3073,11 +3079,6 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
   // If the callee takes no arguments then go on to check the results of the
   // call.
   if (!Outs.empty()) {
-    // Check if stack adjustment is needed. For now, do not do this if any
-    // argument is passed on the stack.
-    SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
-    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
     if (CCInfo.getStackSize()) {
       // Check if the arguments are already laid out in the right way as
       // the caller's fixed stack objects.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 26ef295e3d3f..ed4df7edd16e 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -943,12 +943,8 @@ class VectorType;
     /// for tail call optimization. Targets which want to do tail call
     /// optimization should implement this function.
     bool IsEligibleForTailCallOptimization(
-        SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
-        bool isCalleeStructRet, bool isCallerStructRet,
-        const SmallVectorImpl<ISD::OutputArg> &Outs,
-        const SmallVectorImpl<SDValue> &OutVals,
-        const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
-        const bool isIndirect) const;
+        TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
+        SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const;
 
     bool CanLowerReturn(CallingConv::ID CallConv,
                         MachineFunction &MF, bool isVarArg,
diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
index a364992fab3e..54207562dbae 100644
--- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -61,13 +61,13 @@ static std::pair<bool, bool> GetSignReturnAddress(const Function &F) {
   }
 
   StringRef Scope = F.getFnAttribute("sign-return-address").getValueAsString();
-  if (Scope.equals("none"))
+  if (Scope == "none")
     return {false, false};
 
-  if (Scope.equals("all"))
+  if (Scope == "all")
     return {true, true};
 
-  assert(Scope.equals("non-leaf"));
+  assert(Scope == "non-leaf");
   return {true, false};
 }
 
diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
index 2c5594976400..eb5ed41ae8a1 100644
--- a/llvm/lib/Target/ARM/ARMProcessors.td
+++ b/llvm/lib/Target/ARM/ARMProcessors.td
@@ -573,5 +573,7 @@ def : ProcNoItin<"kryo",                                [ARMv8a, ProcKryo,
                                                          FeatureCRC]>;
 
 def : ProcessorModel<"cortex-r52", CortexR52Model,      [ARMv8r, ProcR52,
+                                                         FeatureFPARMv8,
+                                                         FeatureNEON,
                                                          FeatureUseMISched,
                                                          FeatureFPAO]>;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index ee87f7f0e555..7db2e8ee7e6f 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2571,14 +2571,15 @@ bool ARMTTIImpl::preferPredicatedReductionSelect(
 }
 
 InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                                                 int64_t BaseOffset,
+                                                 StackOffset BaseOffset,
                                                  bool HasBaseReg, int64_t Scale,
                                                  unsigned AddrSpace) const {
   TargetLoweringBase::AddrMode AM;
   AM.BaseGV = BaseGV;
-  AM.BaseOffs = BaseOffset;
+  AM.BaseOffs = BaseOffset.getFixed();
   AM.HasBaseReg = HasBaseReg;
   AM.Scale = Scale;
+  AM.ScalableOffset = BaseOffset.getScalable();
   if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
     if (ST->hasFPAO())
       return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 58eab45b9641..8c4b92b85688 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -303,7 +303,7 @@ public:
   /// If the AM is supported, the return value must be >= 0.
   /// If the AM is not supported, the return value must be negative.
   InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                                       int64_t BaseOffset, bool HasBaseReg,
+                                       StackOffset BaseOffset, bool HasBaseReg,
                                        int64_t Scale, unsigned AddrSpace) const;
 
   bool maybeLoweredToCall(Instruction &I);
diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index 1c8213b668f7..aaec545fc1fe 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -134,8 +134,8 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
     Reg = MI->getOperand(OpNum + RegIdx).getReg();
 
     if (BytesPerReg == 2) {
-      Reg = TRI.getSubReg(Reg,
-                          ByteNumber % BytesPerReg ? AVR::sub_hi : AVR::sub_lo);
+      Reg = TRI.getSubReg(Reg, (ByteNumber % BytesPerReg) ? AVR::sub_hi
+                                                          : AVR::sub_lo);
     }
 
     O << AVRInstPrinter::getPrettyRegisterName(Reg, MRI);
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td
index 38ebfab64c61..88b1989ef917 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -536,208 +536,95 @@ let Constraints = "$src = $rd", Defs = [SREG] in {
   // Register-Register logic instructions (which have the
   // property of commutativity).
   let isCommutable = 1 in {
-    def ANDRdRr
-        : FRdRr<0b0010, 0b00,
-                (outs GPR8
-                 : $rd),
-                (ins GPR8
-                 : $src, GPR8
-                 : $rr),
-                "and\t$rd, $rr",
-                [(set i8
-                  : $rd, (and i8
-                          : $src, i8
-                          : $rr)),
-                 (implicit SREG)]>;
+    def ANDRdRr : FRdRr<0b0010, 0b00, (outs GPR8:$rd),
+                        (ins GPR8:$src, GPR8:$rr), "and\t$rd, $rr",
+                        [(set i8:$rd, (and i8:$src, i8:$rr)), (implicit SREG)]>;
 
     // ANDW Rd+1:Rd, Rr+1:Rr
     //
     // Expands to:
     // and Rd,   Rr
     // and Rd+1, Rr+1
-    def ANDWRdRr : Pseudo<(outs DREGS
-                           : $rd),
-                          (ins DREGS
-                           : $src, DREGS
-                           : $rr),
-                          "andw\t$rd, $rr", [
-                            (set i16
-                             : $rd, (and i16
-                                     : $src, i16
-                                     : $rr)),
-                            (implicit SREG)
-                          ]>;
-
-    def ORRdRr
-        : FRdRr<0b0010, 0b10,
-                (outs GPR8
-                 : $rd),
-                (ins GPR8
-                 : $src, GPR8
-                 : $rr),
-                "or\t$rd, $rr",
-                [(set i8
-                  : $rd, (or i8
-                          : $src, i8
-                          : $rr)),
-                 (implicit SREG)]>;
+    def ANDWRdRr : Pseudo<(outs DREGS:$rd), (ins DREGS:$src, DREGS:$rr),
+                          "andw\t$rd, $rr",
+                          [(set i16:$rd, (and i16:$src, i16:$rr)),
+                           (implicit SREG)]>;
+
+    def ORRdRr : FRdRr<0b0010, 0b10, (outs GPR8:$rd), (ins GPR8:$src, GPR8:$rr),
+                       "or\t$rd, $rr",
+                       [(set i8:$rd, (or i8:$src, i8:$rr)), (implicit SREG)]>;
 
     // ORW Rd+1:Rd, Rr+1:Rr
     //
     // Expands to:
     // or Rd,   Rr
     // or Rd+1, Rr+1
-    def ORWRdRr : Pseudo<(outs DREGS
-                          : $rd),
-                         (ins DREGS
-                          : $src, DREGS
-                          : $rr),
-                         "orw\t$rd, $rr", [
-                           (set i16
-                            : $rd, (or i16
-                                    : $src, i16
-                                    : $rr)),
-                           (implicit SREG)
-                         ]>;
-
-    def EORRdRr
-        : FRdRr<0b0010, 0b01,
-                (outs GPR8
-                 : $rd),
-                (ins GPR8
-                 : $src, GPR8
-                 : $rr),
-                "eor\t$rd, $rr",
-                [(set i8
-                  : $rd, (xor i8
-                          : $src, i8
-                          : $rr)),
-                 (implicit SREG)]>;
+    def ORWRdRr : Pseudo<(outs DREGS:$rd), (ins DREGS:$src, DREGS:$rr),
+                         "orw\t$rd, $rr",
+                         [(set i16:$rd, (or i16:$src, i16:$rr)),
+                          (implicit SREG)]>;
+
+    def EORRdRr : FRdRr<0b0010, 0b01, (outs GPR8:$rd),
+                        (ins GPR8:$src, GPR8:$rr), "eor\t$rd, $rr",
+                        [(set i8:$rd, (xor i8:$src, i8:$rr)), (implicit SREG)]>;
 
     // EORW Rd+1:Rd, Rr+1:Rr
     //
     // Expands to:
     // eor Rd,   Rr
     // eor Rd+1, Rr+1
-    def EORWRdRr : Pseudo<(outs DREGS
-                           : $rd),
-                          (ins DREGS
-                           : $src, DREGS
-                           : $rr),
-                          "eorw\t$rd, $rr", [
-                            (set i16
-                             : $rd, (xor i16
-                                     : $src, i16
-                                     : $rr)),
-                            (implicit SREG)
-                          ]>;
+    def EORWRdRr : Pseudo<(outs DREGS:$rd), (ins DREGS:$src, DREGS:$rr),
+                          "eorw\t$rd, $rr",
+                          [(set i16:$rd, (xor i16:$src, i16:$rr)),
+                           (implicit SREG)]>;
   }
 
-  def ANDIRdK
-      : FRdK<0b0111,
-             (outs LD8
-              : $rd),
-             (ins LD8
-              : $src, imm_ldi8
-              : $k),
-             "andi\t$rd, $k",
-             [(set i8
-               : $rd, (and i8
-                       : $src, imm
-                       : $k)),
-              (implicit SREG)]>;
+  def ANDIRdK : FRdK<0b0111, (outs LD8:$rd), (ins LD8:$src, imm_ldi8:$k),
+                     "andi\t$rd, $k",
+                     [(set i8:$rd, (and i8:$src, imm:$k)), (implicit SREG)]>;
 
   // ANDI Rd+1:Rd, K+1:K
   //
   // Expands to:
   // andi Rd,   K
   // andi Rd+1, K+1
-  def ANDIWRdK
-      : Pseudo<(outs DLDREGS
-                : $rd),
-               (ins DLDREGS
-                : $src, i16imm
-                : $k),
-               "andiw\t$rd, $k",
-               [(set i16
-                 : $rd, (and i16
-                         : $src, imm
-                         : $k)),
-                (implicit SREG)]>;
-
-  def ORIRdK
-      : FRdK<0b0110,
-             (outs LD8
-              : $rd),
-             (ins LD8
-              : $src, imm_ldi8
-              : $k),
-             "ori\t$rd, $k",
-             [(set i8
-               : $rd, (or i8
-                       : $src, imm
-                       : $k)),
-              (implicit SREG)]>;
+  def ANDIWRdK : Pseudo<(outs DLDREGS:$rd), (ins DLDREGS:$src, i16imm:$k),
+                        "andiw\t$rd, $k",
+                        [(set i16:$rd, (and i16:$src, imm:$k)),
+                         (implicit SREG)]>;
+
+  def ORIRdK : FRdK<0b0110, (outs LD8:$rd), (ins LD8:$src, imm_ldi8:$k),
+                    "ori\t$rd, $k",
+                    [(set i8:$rd, (or i8:$src, imm:$k)), (implicit SREG)]>;
 
   // ORIW Rd+1:Rd, K+1,K
   //
   // Expands to:
   // ori Rd,   K
   // ori Rd+1, K+1
-  def ORIWRdK
-      : Pseudo<(outs DLDREGS
-                : $rd),
-               (ins DLDREGS
-                : $src, i16imm
-                : $rr),
-               "oriw\t$rd, $rr",
-               [(set i16
-                 : $rd, (or i16
-                         : $src, imm
-                         : $rr)),
-                (implicit SREG)]>;
+  def ORIWRdK : Pseudo<(outs DLDREGS:$rd), (ins DLDREGS:$src, i16imm:$rr),
+                       "oriw\t$rd, $rr",
+                       [(set i16:$rd, (or i16:$src, imm:$rr)),
+                        (implicit SREG)]>;
 }
 
 //===----------------------------------------------------------------------===//
 // One's/Two's Complement
 //===----------------------------------------------------------------------===//
 let Constraints = "$src = $rd", Defs = [SREG] in {
-  def COMRd
-      : FRd<0b1001, 0b0100000,
-            (outs GPR8
-             : $rd),
-            (ins GPR8
-             : $src),
-            "com\t$rd", [(set i8
-                          : $rd, (not i8
-                                  : $src)),
-                         (implicit SREG)]>;
+  def COMRd : FRd<0b1001, 0b0100000, (outs GPR8:$rd), (ins GPR8:$src),
+                  "com\t$rd", [(set i8:$rd, (not i8:$src)), (implicit SREG)]>;
 
   // COMW Rd+1:Rd
   //
   // Expands to:
   // com Rd
   // com Rd+1
-  def COMWRd : Pseudo<(outs DREGS
-                       : $rd),
-                      (ins DREGS
-                       : $src),
-                      "comw\t$rd",
-                      [(set i16
-                        : $rd, (not i16
-                                : $src)),
-                       (implicit SREG)]>;
+  def COMWRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "comw\t$rd",
+                      [(set i16:$rd, (not i16:$src)), (implicit SREG)]>;
 
-  def NEGRd
-      : FRd<0b1001, 0b0100001,
-            (outs GPR8
-             : $rd),
-            (ins GPR8
-             : $src),
-            "neg\t$rd", [(set i8
-                          : $rd, (ineg i8
-                                  : $src)),
-                         (implicit SREG)]>;
+  def NEGRd : FRd<0b1001, 0b0100001, (outs GPR8:$rd), (ins GPR8:$src),
+                  "neg\t$rd", [(set i8:$rd, (ineg i8:$src)), (implicit SREG)]>;
 
   // NEGW Rd+1:Rd
   //
@@ -746,51 +633,37 @@ let Constraints = "$src = $rd", Defs = [SREG] in {
   // neg Rd
   // sbc Rd+1, r1
   let hasSideEffects=0 in
-  def NEGWRd : Pseudo<(outs DREGS:$rd),
-                      (ins DREGS:$src, GPR8:$zero),
-                      "negw\t$rd",
-                      []>;
+  def NEGWRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src, GPR8:$zero),
+                      "negw\t$rd", []>;
 }
 
 // TST Rd
 // Test for zero of minus.
 // This operation is identical to a `Rd AND Rd`.
-def : InstAlias<"tst\t$rd", (ANDRdRr GPR8 : $rd, GPR8 : $rd)>;
+def : InstAlias<"tst\t$rd", (ANDRdRr GPR8:$rd, GPR8:$rd)>;
 
 // SBR Rd, K
 //
 // Mnemonic alias to 'ORI Rd, K'. Same bit pattern, same operands,
 // same everything.
-def : InstAlias<"sbr\t$rd, $k",
-                (ORIRdK LD8
-                 : $rd, imm_ldi8
-                 : $k),
+def : InstAlias<"sbr\t$rd, $k", (ORIRdK LD8:$rd, imm_ldi8:$k),
                 /* Disable display, so we don't override ORI */ 0>;
 
 //===----------------------------------------------------------------------===//
 // Jump instructions
 //===----------------------------------------------------------------------===//
 let isBarrier = 1, isBranch = 1, isTerminator = 1 in {
-  def RJMPk : FBRk<0, (outs),
-                   (ins brtarget_13
-                    : $k),
-                   "rjmp\t$k", [(br bb
-                                 : $k)]>;
-
-  let isIndirectBranch = 1,
-      Uses = [R31R30] in def IJMP
-      : F16<0b1001010000001001, (outs), (ins), "ijmp", []>,
-      Requires<[HasIJMPCALL]>;
-
-  let isIndirectBranch = 1,
-      Uses = [R31R30] in def EIJMP
-      : F16<0b1001010000011001, (outs), (ins), "eijmp", []>,
-      Requires<[HasEIJMPCALL]>;
+  def RJMPk : FBRk<0, (outs), (ins brtarget_13:$k), "rjmp\t$k", [(br bb:$k)]>;
 
-  def JMPk : F32BRk<0b110, (outs),
-                    (ins call_target
-                     : $k),
-                    "jmp\t$k", []>,
+  let isIndirectBranch = 1, Uses = [R31R30] in
+  def IJMP : F16<0b1001010000001001, (outs), (ins), "ijmp", []>,
+             Requires<[HasIJMPCALL]>;
+
+  let isIndirectBranch = 1, Uses = [R31R30] in
+  def EIJMP : F16<0b1001010000011001, (outs), (ins), "eijmp", []>,
+              Requires<[HasEIJMPCALL]>;
+
+  def JMPk : F32BRk<0b110, (outs), (ins call_target:$k), "jmp\t$k", []>,
              Requires<[HasJMPCALL]>;
 }
 
@@ -800,19 +673,21 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1 in {
 let isCall = 1 in {
   // SP is marked as a use to prevent stack-pointer assignments that appear
   // immediately before calls from potentially appearing dead.
-  let Uses = [SP] in def RCALLk : FBRk<1, (outs), (ins rcalltarget_13:$k),
-                                       "rcall\t$k", [(AVRcall imm:$k)]>;
+  let Uses = [SP] in
+  def RCALLk : FBRk<1, (outs), (ins rcalltarget_13:$k), "rcall\t$k",
+                    [(AVRcall imm:$k)]>;
 
   // SP is marked as a use to prevent stack-pointer assignments that appear
   // immediately before calls from potentially appearing dead.
-  let Uses = [SP, R31R30] in def ICALL
-      : F16<0b1001010100001001, (outs), (ins variable_ops), "icall", []>,
-      Requires<[HasIJMPCALL]>;
+  let Uses = [SP, R31R30] in
+  def ICALL : F16<0b1001010100001001, (outs), (ins variable_ops), "icall", []>,
+              Requires<[HasIJMPCALL]>;
 
   // SP is marked as a use to prevent stack-pointer assignments that appear
   // immediately before calls from potentially appearing dead.
-  let Uses = [SP, R31R30] in def EICALL
-      : F16<0b1001010100011001, (outs), (ins variable_ops), "eicall", []>,
+  let Uses = [SP, R31R30] in
+  def EICALL : F16<0b1001010100011001, (outs), (ins variable_ops), "eicall",
+                   []>,
       Requires<[HasEIJMPCALL]>;
 
   // SP is marked as a use to prevent stack-pointer assignments that appear
@@ -820,9 +695,10 @@ let isCall = 1 in {
   //
   // TODO: the imm field can be either 16 or 22 bits in devices with more
   // than 64k of ROM, fix it once we support the largest devices.
-  let Uses = [SP] in def CALLk : F32BRk<0b111, (outs), (ins call_target:$k),
-                                        "call\t$k", [(AVRcall imm:$k)]>,
-      Requires<[HasJMPCALL]>;
+  let Uses = [SP] in
+  def CALLk : F32BRk<0b111, (outs), (ins call_target:$k), "call\t$k",
+                     [(AVRcall imm:$k)]>,
+              Requires<[HasJMPCALL]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -840,75 +716,42 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1 in {
 let Defs = [SREG] in {
   // CPSE Rd, Rr
   // Compare Rd and Rr, skipping the next instruction if they are equal.
-  let isBarrier = 1, isBranch = 1,
-      isTerminator = 1 in def CPSE : FRdRr<0b0001, 0b00, (outs),
-                                           (ins GPR8
-                                            : $rd, GPR8
-                                            : $rr),
-                                           "cpse\t$rd, $rr", []>;
-
-  def CPRdRr
-      : FRdRr<0b0001, 0b01, (outs),
-              (ins GPR8
-               : $rd, GPR8
-               : $rr),
-              "cp\t$rd, $rr", [(AVRcmp i8
-                                : $rd, i8
-                                : $rr),
-                               (implicit SREG)]>;
+  let isBarrier = 1, isBranch = 1, isTerminator = 1 in
+  def CPSE : FRdRr<0b0001, 0b00, (outs), (ins GPR8:$rd, GPR8:$rr),
+                   "cpse\t$rd, $rr", []>;
+
+  def CPRdRr : FRdRr<0b0001, 0b01, (outs), (ins GPR8:$rd, GPR8:$rr),
+                     "cp\t$rd, $rr",
+                     [(AVRcmp i8:$rd, i8:$rr), (implicit SREG)]>;
 
   // CPW Rd+1:Rd, Rr+1:Rr
   //
   // Expands to:
   // cp  Rd,   Rr
   // cpc Rd+1, Rr+1
-  def CPWRdRr : Pseudo<(outs),
-                       (ins DREGS
-                        : $src, DREGS
-                        : $src2),
+  def CPWRdRr : Pseudo<(outs), (ins DREGS:$src, DREGS:$src2),
                        "cpw\t$src, $src2",
-                       [(AVRcmp i16
-                         : $src, i16
-                         : $src2),
-                        (implicit SREG)]>;
+                       [(AVRcmp i16:$src, i16:$src2), (implicit SREG)]>;
 
-  let Uses = [SREG] in def CPCRdRr
-      : FRdRr<0b0000, 0b01, (outs),
-              (ins GPR8
-               : $rd, GPR8
-               : $rr),
-              "cpc\t$rd, $rr", [(AVRcmpc i8
-                                 : $rd, i8
-                                 : $rr),
-                                (implicit SREG)]>;
+  let Uses = [SREG] in
+  def CPCRdRr : FRdRr<0b0000, 0b01, (outs), (ins GPR8:$rd, GPR8:$rr),
+                      "cpc\t$rd, $rr",
+                      [(AVRcmpc i8:$rd, i8:$rr), (implicit SREG)]>;
 
   // CPCW Rd+1:Rd. Rr+1:Rr
   //
   // Expands to:
   // cpc Rd,   Rr
   // cpc Rd+1, Rr+1
-  let Uses = [SREG] in def CPCWRdRr
-      : Pseudo<(outs),
-               (ins DREGS
-                : $src, DREGS
-                : $src2),
-               "cpcw\t$src, $src2",
-               [(AVRcmpc i16
-                 : $src, i16
-                 : $src2),
-                (implicit SREG)]>;
+  let Uses = [SREG] in
+  def CPCWRdRr : Pseudo<(outs), (ins DREGS:$src, DREGS:$src2),
+                        "cpcw\t$src, $src2",
+                        [(AVRcmpc i16:$src, i16:$src2), (implicit SREG)]>;
 
   // CPI Rd, K
   // Compares a register with an 8 bit immediate.
-  def CPIRdK
-      : FRdK<0b0011, (outs),
-             (ins LD8
-              : $rd, imm_ldi8
-              : $k),
-             "cpi\t$rd, $k", [(AVRcmp i8
-                               : $rd, imm
-                               : $k),
-                              (implicit SREG)]>;
+  def CPIRdK : FRdK<0b0011, (outs), (ins LD8:$rd, imm_ldi8:$k), "cpi\t$rd, $k",
+                    [(AVRcmp i8:$rd, imm:$k), (implicit SREG)]>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index 8c9f5c4dc554..b6d3b460005c 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -588,7 +588,7 @@ void BTFDebug::processDeclAnnotations(DINodeArray Annotations,
   for (const Metadata *Annotation : Annotations->operands()) {
     const MDNode *MD = cast<MDNode>(Annotation);
     const MDString *Name = cast<MDString>(MD->getOperand(0));
-    if (!Name->getString().equals("btf_decl_tag"))
+    if (Name->getString() != "btf_decl_tag")
       continue;
 
     const MDString *Value = cast<MDString>(MD->getOperand(1));
@@ -627,7 +627,7 @@ int BTFDebug::genBTFTypeTags(const DIDerivedType *DTy, int BaseTypeId) {
     for (const Metadata *Annotations : Annots->operands()) {
       const MDNode *MD = cast<MDNode>(Annotations);
       const MDString *Name = cast<MDString>(MD->getOperand(0));
-      if (!Name->getString().equals("btf_type_tag"))
+      if (Name->getString() != "btf_type_tag")
         continue;
       MDStrs.push_back(cast<MDString>(MD->getOperand(1)));
     }
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index cd388ed3e319..24a0c8524230 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -266,6 +266,9 @@ def Cos  : DXILOpMapping<12, unary, int_cos,
 def Sin  : DXILOpMapping<13, unary, int_sin,
                          "Returns sine(theta) for theta in radians.",
                          [llvm_halforfloat_ty, LLVMMatchType<0>]>;
+def Tan  : DXILOpMapping<14, unary, int_tan,
+                         "Returns tangent(theta) for theta in radians.",
+                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
 def Exp2 : DXILOpMapping<21, unary, int_exp2,
                          "Returns the base 2 exponential, or 2**x, of the specified value."
                          "exp2(x) = 2**x.",
diff --git a/llvm/lib/Target/DirectX/DXILMetadata.cpp b/llvm/lib/Target/DirectX/DXILMetadata.cpp
index 2d94490a7f24..ed0434ac98a1 100644
--- a/llvm/lib/Target/DirectX/DXILMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILMetadata.cpp
@@ -40,6 +40,15 @@ void ValidatorVersionMD::update(VersionTuple ValidatorVer) {
 
 bool ValidatorVersionMD::isEmpty() { return Entry->getNumOperands() == 0; }
 
+VersionTuple ValidatorVersionMD::getAsVersionTuple() {
+  if (isEmpty())
+    return VersionTuple(1, 0);
+  auto *ValVerMD = cast<MDNode>(Entry->getOperand(0));
+  auto *MajorMD = mdconst::extract<ConstantInt>(ValVerMD->getOperand(0));
+  auto *MinorMD = mdconst::extract<ConstantInt>(ValVerMD->getOperand(1));
+  return VersionTuple(MajorMD->getZExtValue(), MinorMD->getZExtValue());
+}
+
 static StringRef getShortShaderStage(Triple::EnvironmentType Env) {
   switch (Env) {
   case Triple::Pixel:
@@ -81,6 +90,18 @@ void dxil::createShaderModelMD(Module &M) {
   Entry->addOperand(MDNode::get(Ctx, Vals));
 }
 
+void dxil::createDXILVersionMD(Module &M) {
+  Triple TT(Triple::normalize(M.getTargetTriple()));
+  VersionTuple Ver = TT.getDXILVersion();
+  LLVMContext &Ctx = M.getContext();
+  IRBuilder<> B(Ctx);
+  NamedMDNode *Entry = M.getOrInsertNamedMetadata("dx.version");
+  Metadata *Vals[2];
+  Vals[0] = ConstantAsMetadata::get(B.getInt32(Ver.getMajor()));
+  Vals[1] = ConstantAsMetadata::get(B.getInt32(Ver.getMinor().value_or(0)));
+  Entry->addOperand(MDNode::get(Ctx, Vals));
+}
+
 static uint32_t getShaderStage(Triple::EnvironmentType Env) {
   return (uint32_t)Env - (uint32_t)llvm::Triple::Pixel;
 }
diff --git a/llvm/lib/Target/DirectX/DXILMetadata.h b/llvm/lib/Target/DirectX/DXILMetadata.h
index 2f5d7d9fe768..e05db8d5370d 100644
--- a/llvm/lib/Target/DirectX/DXILMetadata.h
+++ b/llvm/lib/Target/DirectX/DXILMetadata.h
@@ -30,9 +30,11 @@ public:
   void update(VersionTuple ValidatorVer);
 
   bool isEmpty();
+  VersionTuple getAsVersionTuple();
 };
 
 void createShaderModelMD(Module &M);
+void createDXILVersionMD(Module &M);
 void createEntryMD(Module &M, const uint64_t ShaderFlags);
 
 } // namespace dxil
diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp
index 026911946b47..24be644d9fc0 100644
--- a/llvm/lib/Target/DirectX/DXILPrepare.cpp
+++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp
@@ -11,10 +11,14 @@
 /// Language (DXIL).
 //===----------------------------------------------------------------------===//
 
+#include "DXILMetadata.h"
+#include "DXILResourceAnalysis.h"
+#include "DXILShaderFlags.h"
 #include "DirectX.h"
 #include "DirectXIRPasses/PointerTypeAnalysis.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/AttributeMask.h"
 #include "llvm/IR/IRBuilder.h"
@@ -23,6 +27,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/VersionTuple.h"
 
 #define DEBUG_TYPE "dxil-prepare"
 
@@ -80,6 +85,37 @@ constexpr bool isValidForDXIL(Attribute::AttrKind Attr) {
                       Attr);
 }
 
+static void collectDeadStringAttrs(AttributeMask &DeadAttrs, AttributeSet &&AS,
+                                   const StringSet<> &LiveKeys,
+                                   bool AllowExperimental) {
+  for (auto &Attr : AS) {
+    if (!Attr.isStringAttribute())
+      continue;
+    StringRef Key = Attr.getKindAsString();
+    if (LiveKeys.contains(Key))
+      continue;
+    if (AllowExperimental && Key.starts_with("exp-"))
+      continue;
+    DeadAttrs.addAttribute(Key);
+  }
+}
+
+static void removeStringFunctionAttributes(Function &F,
+                                           bool AllowExperimental) {
+  AttributeList Attrs = F.getAttributes();
+  const StringSet<> LiveKeys = {"waveops-include-helper-lanes",
+                                "fp32-denorm-mode"};
+  // Collect DeadKeys in FnAttrs.
+  AttributeMask DeadAttrs;
+  collectDeadStringAttrs(DeadAttrs, Attrs.getFnAttrs(), LiveKeys,
+                         AllowExperimental);
+  collectDeadStringAttrs(DeadAttrs, Attrs.getRetAttrs(), LiveKeys,
+                         AllowExperimental);
+
+  F.removeFnAttrs(DeadAttrs);
+  F.removeRetAttrs(DeadAttrs);
+}
+
 class DXILPrepareModule : public ModulePass {
 
   static Value *maybeGenerateBitcast(IRBuilder<> &Builder,
@@ -110,9 +146,18 @@ public:
       if (!isValidForDXIL(I))
         AttrMask.addAttribute(I);
     }
+
+    dxil::ValidatorVersionMD ValVerMD(M);
+    VersionTuple ValVer = ValVerMD.getAsVersionTuple();
+    bool SkipValidation = ValVer.getMajor() == 0 && ValVer.getMinor() == 0;
+
     for (auto &F : M.functions()) {
       F.removeFnAttrs(AttrMask);
       F.removeRetAttrs(AttrMask);
+      // Only remove string attributes if we are not skipping validation.
+      // This will reserve the experimental attributes when validation version
+      // is 0.0 for experiment mode.
+      removeStringFunctionAttributes(F, SkipValidation);
       for (size_t Idx = 0, End = F.arg_size(); Idx < End; ++Idx)
         F.removeParamAttrs(Idx, AttrMask);
 
@@ -172,7 +217,10 @@ public:
   }
 
   DXILPrepareModule() : ModulePass(ID) {}
-
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<ShaderFlagsAnalysisWrapper>();
+    AU.addPreserved<DXILResourceWrapper>();
+  }
   static char ID; // Pass identification.
 };
 char DXILPrepareModule::ID = 0;
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index 80d94bf0c9d4..ae6d6f96904c 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -48,6 +48,7 @@ bool DXILTranslateMetadata::runOnModule(Module &M) {
   if (ValVerMD.isEmpty())
     ValVerMD.update(VersionTuple(1, 0));
   dxil::createShaderModelMD(M);
+  dxil::createDXILVersionMD(M);
 
   const dxil::Resources &Res =
       getAnalysis<DXILResourceWrapper>().getDXILResource();
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index bebca0675522..c853393e4282 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -79,8 +79,8 @@ public:
   void addCodeGenPrepare() override {
     addPass(createDXILIntrinsicExpansionLegacyPass());
     addPass(createDXILOpLoweringLegacyPass());
-    addPass(createDXILPrepareModulePass());
     addPass(createDXILTranslateMetadataPass());
+    addPass(createDXILPrepareModulePass());
   }
 };
 
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index a7ac24e25a5f..35188ff2e97f 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -86,7 +86,7 @@ static cl::opt<bool>
 static bool isSmallDataSection(StringRef Sec) {
   // sectionName is either ".sdata" or ".sbss". Looking for an exact match
   // obviates the need for checks for section names such as ".sdatafoo".
-  if (Sec.equals(".sdata") || Sec.equals(".sbss") || Sec.equals(".scommon"))
+  if (Sec == ".sdata" || Sec == ".sbss" || Sec == ".scommon")
     return true;
   // If either ".sdata." or ".sbss." is a substring of the section name
   // then put the symbol in small data.
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 0a948402fb89..eab7647e633b 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -158,7 +158,7 @@ StringRef Hexagon_MC::selectHexagonCPU(StringRef CPU) {
     // non-tiny subtarget.  See: addArchSubtarget
     std::pair<StringRef, StringRef> ArchP = ArchV.split('t');
     std::pair<StringRef, StringRef> CPUP = CPU.split('t');
-    if (!ArchP.first.equals(CPUP.first))
+    if (ArchP.first != CPUP.first)
       report_fatal_error("conflicting architectures specified.");
     return CPU;
   }
@@ -578,7 +578,7 @@ MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT,
   if (X != nullptr && (CPUName == "hexagonv67t" || CPUName == "hexagon71t"))
     addArchSubtarget(X, ArchFS);
 
-  if (CPU.equals("help"))
+  if (CPU == "help")
     exit(0);
 
   if (!isCPUValid(CPUName.str())) {
diff --git a/llvm/lib/Target/LoongArch/CMakeLists.txt b/llvm/lib/Target/LoongArch/CMakeLists.txt
index 5fb8b60be6c6..5085e23f82a7 100644
--- a/llvm/lib/Target/LoongArch/CMakeLists.txt
+++ b/llvm/lib/Target/LoongArch/CMakeLists.txt
@@ -23,6 +23,7 @@ add_llvm_target(LoongArchCodeGen
   LoongArchISelDAGToDAG.cpp
   LoongArchISelLowering.cpp
   LoongArchMCInstLower.cpp
+  LoongArchOptWInstrs.cpp
   LoongArchRegisterInfo.cpp
   LoongArchSubtarget.cpp
   LoongArchTargetMachine.cpp
diff --git a/llvm/lib/Target/LoongArch/LoongArch.h b/llvm/lib/Target/LoongArch/LoongArch.h
index 09ca089c9115..2109176d4998 100644
--- a/llvm/lib/Target/LoongArch/LoongArch.h
+++ b/llvm/lib/Target/LoongArch/LoongArch.h
@@ -35,10 +35,12 @@ bool lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO,
 
 FunctionPass *createLoongArchExpandAtomicPseudoPass();
 FunctionPass *createLoongArchISelDag(LoongArchTargetMachine &TM);
+FunctionPass *createLoongArchOptWInstrsPass();
 FunctionPass *createLoongArchPreRAExpandPseudoPass();
 FunctionPass *createLoongArchExpandPseudoPass();
 void initializeLoongArchDAGToDAGISelPass(PassRegistry &);
 void initializeLoongArchExpandAtomicPseudoPass(PassRegistry &);
+void initializeLoongArchOptWInstrsPass(PassRegistry &);
 void initializeLoongArchPreRAExpandPseudoPass(PassRegistry &);
 void initializeLoongArchExpandPseudoPass(PassRegistry &);
 } // end namespace llvm
diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
index c2a669931d78..8a628157c601 100644
--- a/llvm/lib/Target/LoongArch/LoongArch.td
+++ b/llvm/lib/Target/LoongArch/LoongArch.td
@@ -117,6 +117,9 @@ def FeatureFrecipe
                        "Support frecipe.{s/d} and frsqrte.{s/d} instructions.">;
 def HasFrecipe : Predicate<"Subtarget->hasFrecipe()">;
 
+def TunePreferWInst
+    : SubtargetFeature<"prefer-w-inst", "PreferWInst", "true",
+                       "Prefer instructions with W suffix">;
 
 //===----------------------------------------------------------------------===//
 // Registers, instruction descriptions ...
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index d83fd2b4d25f..21d520656091 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1671,10 +1671,9 @@ static LoongArchISD::NodeType getLoongArchWOpcode(unsigned Opcode) {
     return LoongArchISD::SRA_W;
   case ISD::SRL:
     return LoongArchISD::SRL_W;
+  case ISD::ROTL:
   case ISD::ROTR:
     return LoongArchISD::ROTR_W;
-  case ISD::ROTL:
-    return LoongArchISD::ROTL_W;
   case ISD::CTTZ:
     return LoongArchISD::CTZ_W;
   case ISD::CTLZ:
@@ -1704,6 +1703,10 @@ static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG, int NumOp,
   case 2: {
     NewOp0 = DAG.getNode(ExtOpc, DL, MVT::i64, N->getOperand(0));
     SDValue NewOp1 = DAG.getNode(ExtOpc, DL, MVT::i64, N->getOperand(1));
+    if (N->getOpcode() == ISD::ROTL) {
+      SDValue TmpOp = DAG.getConstant(32, DL, MVT::i64);
+      NewOp1 = DAG.getNode(ISD::SUB, DL, MVT::i64, TmpOp, NewOp1);
+    }
     NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, NewOp1);
     break;
   }
@@ -1841,7 +1844,6 @@ void LoongArchTargetLowering::ReplaceNodeResults(
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:
-  case ISD::ROTR:
     assert(VT == MVT::i32 && Subtarget.is64Bit() &&
            "Unexpected custom legalisation");
     if (N->getOperand(1).getOpcode() != ISD::Constant) {
@@ -1850,11 +1852,10 @@ void LoongArchTargetLowering::ReplaceNodeResults(
     }
     break;
   case ISD::ROTL:
-    ConstantSDNode *CN;
-    if ((CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))) {
-      Results.push_back(customLegalizeToWOp(N, DAG, 2));
-      break;
-    }
+  case ISD::ROTR:
+    assert(VT == MVT::i32 && Subtarget.is64Bit() &&
+           "Unexpected custom legalisation");
+    Results.push_back(customLegalizeToWOp(N, DAG, 2));
     break;
   case ISD::FP_TO_SINT: {
     assert(VT == MVT::i32 && Subtarget.is64Bit() &&
@@ -3768,6 +3769,7 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val,
 
 static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain,
                                 const CCValAssign &VA, const SDLoc &DL,
+                                const ISD::InputArg &In,
                                 const LoongArchTargetLowering &TLI) {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineRegisterInfo &RegInfo = MF.getRegInfo();
@@ -3778,6 +3780,21 @@ static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain,
   RegInfo.addLiveIn(VA.getLocReg(), VReg);
   Val = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
 
+  // If input is sign extended from 32 bits, note it for the OptW pass.
+  if (In.isOrigArg()) {
+    Argument *OrigArg = MF.getFunction().getArg(In.getOrigArgIndex());
+    if (OrigArg->getType()->isIntegerTy()) {
+      unsigned BitWidth = OrigArg->getType()->getIntegerBitWidth();
+      // An input zero extended from i31 can also be considered sign extended.
+      if ((BitWidth <= 32 && In.Flags.isSExt()) ||
+          (BitWidth < 32 && In.Flags.isZExt())) {
+        LoongArchMachineFunctionInfo *LAFI =
+            MF.getInfo<LoongArchMachineFunctionInfo>();
+        LAFI->addSExt32Register(VReg);
+      }
+    }
+  }
+
   return convertLocVTToValVT(DAG, Val, VA, DL);
 }
 
@@ -3909,7 +3926,7 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
     CCValAssign &VA = ArgLocs[i];
     SDValue ArgValue;
     if (VA.isRegLoc())
-      ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL, *this);
+      ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL, Ins[i], *this);
     else
       ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL);
     if (VA.getLocInfo() == CCValAssign::Indirect) {
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index babb6632471b..6b75634f5b2e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -540,3 +540,9 @@ LoongArchInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
       {MO_GD_PC_HI, "loongarch-gd-pc-hi"}};
   return ArrayRef(TargetFlags);
 }
+
+// Returns true if this is the sext.w pattern, addi.w rd, rs, 0.
+bool LoongArch::isSEXT_W(const MachineInstr &MI) {
+  return MI.getOpcode() == LoongArch::ADDI_W && MI.getOperand(1).isReg() &&
+         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0;
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
index 4b145d0baa41..3b80f55bc84f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
@@ -90,6 +90,9 @@ protected:
 
 namespace LoongArch {
 
+// Returns true if this is the sext.w pattern, addi.w rd, rs, 0.
+bool isSEXT_W(const MachineInstr &MI);
+
 // Mask assignments for floating-point.
 static constexpr unsigned FClassMaskSignalingNaN = 0x001;
 static constexpr unsigned FClassMaskQuietNaN = 0x002;
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index a7f6eb9a79eb..f56f8f7e1179 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -85,7 +85,6 @@ def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;
 def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;
 def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>;
 def loongarch_rotr_w : SDNode<"LoongArchISD::ROTR_W", SDT_LoongArchIntBinOpW>;
-def loongarch_rotl_w : SDNode<"LoongArchISD::ROTL_W", SDT_LoongArchIntBinOpW>;
 def loongarch_crc_w_b_w
     : SDNode<"LoongArchISD::CRC_W_B_W", SDT_LoongArchIntBinOpW, [SDNPHasChain]>;
 def loongarch_crc_w_h_w
@@ -1116,12 +1115,10 @@ def : PatGprGpr<srem, MOD_D>;
 def : PatGprGpr<urem, MOD_DU>;
 def : PatGprGpr<rotr, ROTR_D>;
 def : PatGprGpr<loongarch_rotr_w, ROTR_W>;
+def : PatGprGpr_32<rotr, ROTR_W>;
 def : PatGprImm<rotr, ROTRI_D, uimm6>;
 def : PatGprImm_32<rotr, ROTRI_W, uimm5>;
-def : Pat<(loongarch_rotl_w GPR:$rj, uimm5:$imm),
-          (ROTRI_W GPR:$rj, (ImmSubFrom32 uimm5:$imm))>;
-def : Pat<(sext_inreg (loongarch_rotl_w GPR:$rj, uimm5:$imm), i32),
-          (ROTRI_W GPR:$rj, (ImmSubFrom32 uimm5:$imm))>;
+def : PatGprImm<loongarch_rotr_w, ROTRI_W, uimm5>;
 // TODO: Select "_W[U]" instructions for i32xi32 if only lower 32 bits of the
 // product are used.
 def : PatGprGpr<mul, MUL_D>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
index 0d819154a89c..a7366a5dba04 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
@@ -36,6 +36,9 @@ private:
   /// insertIndirectBranch.
   int BranchRelaxationSpillFrameIndex = -1;
 
+  /// Registers that have been sign extended from i32.
+  SmallVector<Register, 8> SExt32Registers;
+
 public:
   LoongArchMachineFunctionInfo(const Function &F,
                                const TargetSubtargetInfo *STI) {}
@@ -62,6 +65,12 @@ public:
   void setBranchRelaxationSpillFrameIndex(int Index) {
     BranchRelaxationSpillFrameIndex = Index;
   }
+
+  void addSExt32Register(Register Reg) { SExt32Registers.push_back(Reg); }
+
+  bool isSExt32Register(Register Reg) const {
+    return is_contained(SExt32Registers, Reg);
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/LoongArch/LoongArchOptWInstrs.cpp b/llvm/lib/Target/LoongArch/LoongArchOptWInstrs.cpp
new file mode 100644
index 000000000000..abac69054f3b
--- /dev/null
+++ b/llvm/lib/Target/LoongArch/LoongArchOptWInstrs.cpp
@@ -0,0 +1,819 @@
+//===- LoongArchOptWInstrs.cpp - MI W instruction optimizations ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass does some optimizations for *W instructions at the MI level.
+//
+// First it removes unneeded sext(addi.w rd, rs, 0) instructions. Either
+// because the sign extended bits aren't consumed or because the input was
+// already sign extended by an earlier instruction.
+//
+// Then:
+// 1. Unless explicit disabled or the target prefers instructions with W suffix,
+//    it removes the -w suffix from opw instructions whenever all users are
+//    dependent only on the lower word of the result of the instruction.
+//    The cases handled are:
+//    * addi.w because it helps reduce test differences between LA32 and LA64
+//      w/o being a pessimization.
+//
+// 2. Or if explicit enabled or the target prefers instructions with W suffix,
+//    it adds the W suffix to the instruction whenever all users are dependent
+//    only on the lower word of the result of the instruction.
+//    The cases handled are:
+//    * add.d/addi.d/sub.d/mul.d.
+//    * slli.d with imm < 32.
+//    * ld.d/ld.wu.
+//===---------------------------------------------------------------------===//
+
+#include "LoongArch.h"
+#include "LoongArchMachineFunctionInfo.h"
+#include "LoongArchSubtarget.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loongarch-opt-w-instrs"
+#define LOONGARCH_OPT_W_INSTRS_NAME "LoongArch Optimize W Instructions"
+
+STATISTIC(NumRemovedSExtW, "Number of removed sign-extensions");
+STATISTIC(NumTransformedToWInstrs,
+          "Number of instructions transformed to W-ops");
+
+static cl::opt<bool>
+    DisableSExtWRemoval("loongarch-disable-sextw-removal",
+                        cl::desc("Disable removal of sign-extend insn"),
+                        cl::init(false), cl::Hidden);
+static cl::opt<bool>
+    DisableCvtToDSuffix("loongarch-disable-cvt-to-d-suffix",
+                        cl::desc("Disable convert to D suffix"),
+                        cl::init(false), cl::Hidden);
+
+namespace {
+
+class LoongArchOptWInstrs : public MachineFunctionPass {
+public:
+  static char ID;
+
+  LoongArchOptWInstrs() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  bool removeSExtWInstrs(MachineFunction &MF, const LoongArchInstrInfo &TII,
+                         const LoongArchSubtarget &ST,
+                         MachineRegisterInfo &MRI);
+  bool convertToDSuffixes(MachineFunction &MF, const LoongArchInstrInfo &TII,
+                          const LoongArchSubtarget &ST,
+                          MachineRegisterInfo &MRI);
+  bool convertToWSuffixes(MachineFunction &MF, const LoongArchInstrInfo &TII,
+                          const LoongArchSubtarget &ST,
+                          MachineRegisterInfo &MRI);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return LOONGARCH_OPT_W_INSTRS_NAME; }
+};
+
+} // end anonymous namespace
+
+char LoongArchOptWInstrs::ID = 0;
+INITIALIZE_PASS(LoongArchOptWInstrs, DEBUG_TYPE, LOONGARCH_OPT_W_INSTRS_NAME,
+                false, false)
+
+FunctionPass *llvm::createLoongArchOptWInstrsPass() {
+  return new LoongArchOptWInstrs();
+}
+
+// Checks if all users only demand the lower \p OrigBits of the original
+// instruction's result.
+// TODO: handle multiple interdependent transformations
+static bool hasAllNBitUsers(const MachineInstr &OrigMI,
+                            const LoongArchSubtarget &ST,
+                            const MachineRegisterInfo &MRI, unsigned OrigBits) {
+
+  SmallSet<std::pair<const MachineInstr *, unsigned>, 4> Visited;
+  SmallVector<std::pair<const MachineInstr *, unsigned>, 4> Worklist;
+
+  Worklist.push_back(std::make_pair(&OrigMI, OrigBits));
+
+  while (!Worklist.empty()) {
+    auto P = Worklist.pop_back_val();
+    const MachineInstr *MI = P.first;
+    unsigned Bits = P.second;
+
+    if (!Visited.insert(P).second)
+      continue;
+
+    // Only handle instructions with one def.
+    if (MI->getNumExplicitDefs() != 1)
+      return false;
+
+    Register DestReg = MI->getOperand(0).getReg();
+    if (!DestReg.isVirtual())
+      return false;
+
+    for (auto &UserOp : MRI.use_nodbg_operands(DestReg)) {
+      const MachineInstr *UserMI = UserOp.getParent();
+      unsigned OpIdx = UserOp.getOperandNo();
+
+      switch (UserMI->getOpcode()) {
+      default:
+        // TODO: Add vector
+        return false;
+
+      case LoongArch::ADD_W:
+      case LoongArch::ADDI_W:
+      case LoongArch::SUB_W:
+      case LoongArch::ALSL_W:
+      case LoongArch::ALSL_WU:
+      case LoongArch::MUL_W:
+      case LoongArch::MULH_W:
+      case LoongArch::MULH_WU:
+      case LoongArch::MULW_D_W:
+      case LoongArch::MULW_D_WU:
+      // TODO: {DIV,MOD}.{W,WU} consumes the upper 32 bits before LA664+.
+      // case LoongArch::DIV_W:
+      // case LoongArch::DIV_WU:
+      // case LoongArch::MOD_W:
+      // case LoongArch::MOD_WU:
+      case LoongArch::SLL_W:
+      case LoongArch::SLLI_W:
+      case LoongArch::SRL_W:
+      case LoongArch::SRLI_W:
+      case LoongArch::SRA_W:
+      case LoongArch::SRAI_W:
+      case LoongArch::ROTR_W:
+      case LoongArch::ROTRI_W:
+      case LoongArch::CLO_W:
+      case LoongArch::CLZ_W:
+      case LoongArch::CTO_W:
+      case LoongArch::CTZ_W:
+      case LoongArch::BYTEPICK_W:
+      case LoongArch::REVB_2H:
+      case LoongArch::BITREV_4B:
+      case LoongArch::BITREV_W:
+      case LoongArch::BSTRINS_W:
+      case LoongArch::BSTRPICK_W:
+      case LoongArch::CRC_W_W_W:
+      case LoongArch::CRCC_W_W_W:
+      case LoongArch::MOVGR2FCSR:
+      case LoongArch::MOVGR2FRH_W:
+      case LoongArch::MOVGR2FR_W_64:
+        if (Bits >= 32)
+          break;
+        return false;
+      case LoongArch::MOVGR2CF:
+        if (Bits >= 1)
+          break;
+        return false;
+      case LoongArch::EXT_W_B:
+        if (Bits >= 8)
+          break;
+        return false;
+      case LoongArch::EXT_W_H:
+        if (Bits >= 16)
+          break;
+        return false;
+
+      case LoongArch::SRLI_D: {
+        // If we are shifting right by less than Bits, and users don't demand
+        // any bits that were shifted into [Bits-1:0], then we can consider this
+        // as an N-Bit user.
+        unsigned ShAmt = UserMI->getOperand(2).getImm();
+        if (Bits > ShAmt) {
+          Worklist.push_back(std::make_pair(UserMI, Bits - ShAmt));
+          break;
+        }
+        return false;
+      }
+
+      // these overwrite higher input bits, otherwise the lower word of output
+      // depends only on the lower word of input. So check their uses read W.
+      case LoongArch::SLLI_D:
+        if (Bits >= (ST.getGRLen() - UserMI->getOperand(2).getImm()))
+          break;
+        Worklist.push_back(std::make_pair(UserMI, Bits));
+        break;
+      case LoongArch::ANDI: {
+        uint64_t Imm = UserMI->getOperand(2).getImm();
+        if (Bits >= (unsigned)llvm::bit_width(Imm))
+          break;
+        Worklist.push_back(std::make_pair(UserMI, Bits));
+        break;
+      }
+      case LoongArch::ORI: {
+        uint64_t Imm = UserMI->getOperand(2).getImm();
+        if (Bits >= (unsigned)llvm::bit_width<uint64_t>(~Imm))
+          break;
+        Worklist.push_back(std::make_pair(UserMI, Bits));
+        break;
+      }
+
+      case LoongArch::SLL_D:
+        // Operand 2 is the shift amount which uses log2(grlen) bits.
+        if (OpIdx == 2) {
+          if (Bits >= Log2_32(ST.getGRLen()))
+            break;
+          return false;
+        }
+        Worklist.push_back(std::make_pair(UserMI, Bits));
+        break;
+
+      case LoongArch::SRA_D:
+      case LoongArch::SRL_D:
+      case LoongArch::ROTR_D:
+        // Operand 2 is the shift amount which uses 6 bits.
+        if (OpIdx == 2 && Bits >= Log2_32(ST.getGRLen()))
+          break;
+        return false;
+
+      case LoongArch::ST_B:
+      case LoongArch::STX_B:
+      case LoongArch::STGT_B:
+      case LoongArch::STLE_B:
+      case LoongArch::IOCSRWR_B:
+        // The first argument is the value to store.
+        if (OpIdx == 0 && Bits >= 8)
+          break;
+        return false;
+      case LoongArch::ST_H:
+      case LoongArch::STX_H:
+      case LoongArch::STGT_H:
+      case LoongArch::STLE_H:
+      case LoongArch::IOCSRWR_H:
+        // The first argument is the value to store.
+        if (OpIdx == 0 && Bits >= 16)
+          break;
+        return false;
+      case LoongArch::ST_W:
+      case LoongArch::STX_W:
+      case LoongArch::SCREL_W:
+      case LoongArch::STPTR_W:
+      case LoongArch::STGT_W:
+      case LoongArch::STLE_W:
+      case LoongArch::IOCSRWR_W:
+        // The first argument is the value to store.
+        if (OpIdx == 0 && Bits >= 32)
+          break;
+        return false;
+
+      case LoongArch::CRC_W_B_W:
+      case LoongArch::CRCC_W_B_W:
+        if ((OpIdx == 1 && Bits >= 8) || (OpIdx == 2 && Bits >= 32))
+          break;
+        return false;
+      case LoongArch::CRC_W_H_W:
+      case LoongArch::CRCC_W_H_W:
+        if ((OpIdx == 1 && Bits >= 16) || (OpIdx == 2 && Bits >= 32))
+          break;
+        return false;
+      case LoongArch::CRC_W_D_W:
+      case LoongArch::CRCC_W_D_W:
+        if (OpIdx == 2 && Bits >= 32)
+          break;
+        return false;
+
+      // For these, lower word of output in these operations, depends only on
+      // the lower word of input. So, we check all uses only read lower word.
+      case LoongArch::COPY:
+      case LoongArch::PHI:
+      case LoongArch::ADD_D:
+      case LoongArch::ADDI_D:
+      case LoongArch::SUB_D:
+      case LoongArch::MUL_D:
+      case LoongArch::AND:
+      case LoongArch::OR:
+      case LoongArch::NOR:
+      case LoongArch::XOR:
+      case LoongArch::XORI:
+      case LoongArch::ANDN:
+      case LoongArch::ORN:
+        Worklist.push_back(std::make_pair(UserMI, Bits));
+        break;
+
+      case LoongArch::MASKNEZ:
+      case LoongArch::MASKEQZ:
+        if (OpIdx != 1)
+          return false;
+        Worklist.push_back(std::make_pair(UserMI, Bits));
+        break;
+      }
+    }
+  }
+
+  return true;
+}
+
+static bool hasAllWUsers(const MachineInstr &OrigMI,
+                         const LoongArchSubtarget &ST,
+                         const MachineRegisterInfo &MRI) {
+  return hasAllNBitUsers(OrigMI, ST, MRI, 32);
+}
+
+// This function returns true if the machine instruction always outputs a value
+// where bits 63:32 match bit 31.
+static bool isSignExtendingOpW(const MachineInstr &MI,
+                               const MachineRegisterInfo &MRI, unsigned OpNo) {
+  switch (MI.getOpcode()) {
+  // Normal cases
+  case LoongArch::ADD_W:
+  case LoongArch::SUB_W:
+  case LoongArch::ADDI_W:
+  case LoongArch::ALSL_W:
+  case LoongArch::LU12I_W:
+  case LoongArch::SLT:
+  case LoongArch::SLTU:
+  case LoongArch::SLTI:
+  case LoongArch::SLTUI:
+  case LoongArch::ANDI:
+  case LoongArch::MUL_W:
+  case LoongArch::MULH_W:
+  case LoongArch::MULH_WU:
+  case LoongArch::DIV_W:
+  case LoongArch::MOD_W:
+  case LoongArch::DIV_WU:
+  case LoongArch::MOD_WU:
+  case LoongArch::SLL_W:
+  case LoongArch::SRL_W:
+  case LoongArch::SRA_W:
+  case LoongArch::ROTR_W:
+  case LoongArch::SLLI_W:
+  case LoongArch::SRLI_W:
+  case LoongArch::SRAI_W:
+  case LoongArch::ROTRI_W:
+  case LoongArch::EXT_W_B:
+  case LoongArch::EXT_W_H:
+  case LoongArch::CLO_W:
+  case LoongArch::CLZ_W:
+  case LoongArch::CTO_W:
+  case LoongArch::CTZ_W:
+  case LoongArch::BYTEPICK_W:
+  case LoongArch::REVB_2H:
+  case LoongArch::BITREV_4B:
+  case LoongArch::BITREV_W:
+  case LoongArch::BSTRINS_W:
+  case LoongArch::BSTRPICK_W:
+  case LoongArch::LD_B:
+  case LoongArch::LD_H:
+  case LoongArch::LD_W:
+  case LoongArch::LD_BU:
+  case LoongArch::LD_HU:
+  case LoongArch::LL_W:
+  case LoongArch::LLACQ_W:
+  case LoongArch::RDTIMEL_W:
+  case LoongArch::RDTIMEH_W:
+  case LoongArch::CPUCFG:
+  case LoongArch::LDX_B:
+  case LoongArch::LDX_H:
+  case LoongArch::LDX_W:
+  case LoongArch::LDX_BU:
+  case LoongArch::LDX_HU:
+  case LoongArch::LDPTR_W:
+  case LoongArch::LDGT_B:
+  case LoongArch::LDGT_H:
+  case LoongArch::LDGT_W:
+  case LoongArch::LDLE_B:
+  case LoongArch::LDLE_H:
+  case LoongArch::LDLE_W:
+  case LoongArch::AMSWAP_B:
+  case LoongArch::AMSWAP_H:
+  case LoongArch::AMSWAP_W:
+  case LoongArch::AMADD_B:
+  case LoongArch::AMADD_H:
+  case LoongArch::AMADD_W:
+  case LoongArch::AMAND_W:
+  case LoongArch::AMOR_W:
+  case LoongArch::AMXOR_W:
+  case LoongArch::AMMAX_W:
+  case LoongArch::AMMIN_W:
+  case LoongArch::AMMAX_WU:
+  case LoongArch::AMMIN_WU:
+  case LoongArch::AMSWAP__DB_B:
+  case LoongArch::AMSWAP__DB_H:
+  case LoongArch::AMSWAP__DB_W:
+  case LoongArch::AMADD__DB_B:
+  case LoongArch::AMADD__DB_H:
+  case LoongArch::AMADD__DB_W:
+  case LoongArch::AMAND__DB_W:
+  case LoongArch::AMOR__DB_W:
+  case LoongArch::AMXOR__DB_W:
+  case LoongArch::AMMAX__DB_W:
+  case LoongArch::AMMIN__DB_W:
+  case LoongArch::AMMAX__DB_WU:
+  case LoongArch::AMMIN__DB_WU:
+  case LoongArch::AMCAS_B:
+  case LoongArch::AMCAS_H:
+  case LoongArch::AMCAS_W:
+  case LoongArch::AMCAS__DB_B:
+  case LoongArch::AMCAS__DB_H:
+  case LoongArch::AMCAS__DB_W:
+  case LoongArch::CRC_W_B_W:
+  case LoongArch::CRC_W_H_W:
+  case LoongArch::CRC_W_W_W:
+  case LoongArch::CRC_W_D_W:
+  case LoongArch::CRCC_W_B_W:
+  case LoongArch::CRCC_W_H_W:
+  case LoongArch::CRCC_W_W_W:
+  case LoongArch::CRCC_W_D_W:
+  case LoongArch::IOCSRRD_B:
+  case LoongArch::IOCSRRD_H:
+  case LoongArch::IOCSRRD_W:
+  case LoongArch::MOVFR2GR_S:
+  case LoongArch::MOVFCSR2GR:
+  case LoongArch::MOVCF2GR:
+  case LoongArch::MOVFRH2GR_S:
+  case LoongArch::MOVFR2GR_S_64:
+    // TODO: Add vector
+    return true;
+  // Special cases that require checking operands.
+  // shifting right sufficiently makes the value 32-bit sign-extended
+  case LoongArch::SRAI_D:
+    return MI.getOperand(2).getImm() >= 32;
+  case LoongArch::SRLI_D:
+    return MI.getOperand(2).getImm() > 32;
+  // The LI pattern ADDI rd, R0, imm and ORI rd, R0, imm are sign extended.
+  case LoongArch::ADDI_D:
+  case LoongArch::ORI:
+    return MI.getOperand(1).isReg() &&
+           MI.getOperand(1).getReg() == LoongArch::R0;
+  // A bits extract is sign extended if the msb is less than 31.
+  case LoongArch::BSTRPICK_D:
+    return MI.getOperand(2).getImm() < 31;
+  // Copying from R0 produces zero.
+  case LoongArch::COPY:
+    return MI.getOperand(1).getReg() == LoongArch::R0;
+  // Ignore the scratch register destination.
+  case LoongArch::PseudoMaskedAtomicSwap32:
+  case LoongArch::PseudoAtomicSwap32:
+  case LoongArch::PseudoMaskedAtomicLoadAdd32:
+  case LoongArch::PseudoMaskedAtomicLoadSub32:
+  case LoongArch::PseudoAtomicLoadNand32:
+  case LoongArch::PseudoMaskedAtomicLoadNand32:
+  case LoongArch::PseudoAtomicLoadAdd32:
+  case LoongArch::PseudoAtomicLoadSub32:
+  case LoongArch::PseudoAtomicLoadAnd32:
+  case LoongArch::PseudoAtomicLoadOr32:
+  case LoongArch::PseudoAtomicLoadXor32:
+  case LoongArch::PseudoMaskedAtomicLoadUMax32:
+  case LoongArch::PseudoMaskedAtomicLoadUMin32:
+  case LoongArch::PseudoCmpXchg32:
+  case LoongArch::PseudoMaskedCmpXchg32:
+  case LoongArch::PseudoMaskedAtomicLoadMax32:
+  case LoongArch::PseudoMaskedAtomicLoadMin32:
+    return OpNo == 0;
+  }
+
+  return false;
+}
+
+static bool isSignExtendedW(Register SrcReg, const LoongArchSubtarget &ST,
+                            const MachineRegisterInfo &MRI,
+                            SmallPtrSetImpl<MachineInstr *> &FixableDef) {
+  SmallSet<Register, 4> Visited;
+  SmallVector<Register, 4> Worklist;
+
+  auto AddRegToWorkList = [&](Register SrcReg) {
+    if (!SrcReg.isVirtual())
+      return false;
+    Worklist.push_back(SrcReg);
+    return true;
+  };
+
+  if (!AddRegToWorkList(SrcReg))
+    return false;
+
+  while (!Worklist.empty()) {
+    Register Reg = Worklist.pop_back_val();
+
+    // If we already visited this register, we don't need to check it again.
+    if (!Visited.insert(Reg).second)
+      continue;
+
+    MachineInstr *MI = MRI.getVRegDef(Reg);
+    if (!MI)
+      continue;
+
+    int OpNo = MI->findRegisterDefOperandIdx(Reg, /*TRI=*/nullptr);
+    assert(OpNo != -1 && "Couldn't find register");
+
+    // If this is a sign extending operation we don't need to look any further.
+    if (isSignExtendingOpW(*MI, MRI, OpNo))
+      continue;
+
+    // Is this an instruction that propagates sign extend?
+    switch (MI->getOpcode()) {
+    default:
+      // Unknown opcode, give up.
+      return false;
+    case LoongArch::COPY: {
+      const MachineFunction *MF = MI->getMF();
+      const LoongArchMachineFunctionInfo *LAFI =
+          MF->getInfo<LoongArchMachineFunctionInfo>();
+
+      // If this is the entry block and the register is livein, see if we know
+      // it is sign extended.
+      if (MI->getParent() == &MF->front()) {
+        Register VReg = MI->getOperand(0).getReg();
+        if (MF->getRegInfo().isLiveIn(VReg) && LAFI->isSExt32Register(VReg))
+          continue;
+      }
+
+      Register CopySrcReg = MI->getOperand(1).getReg();
+      if (CopySrcReg == LoongArch::R4) {
+        // For a method return value, we check the ZExt/SExt flags in attribute.
+        // We assume the following code sequence for method call.
+        // PseudoCALL @bar, ...
+        // ADJCALLSTACKUP 0, 0, implicit-def dead $r3, implicit $r3
+        // %0:gpr = COPY $r4
+        //
+        // We use the PseudoCall to look up the IR function being called to find
+        // its return attributes.
+        const MachineBasicBlock *MBB = MI->getParent();
+        auto II = MI->getIterator();
+        if (II == MBB->instr_begin() ||
+            (--II)->getOpcode() != LoongArch::ADJCALLSTACKUP)
+          return false;
+
+        const MachineInstr &CallMI = *(--II);
+        if (!CallMI.isCall() || !CallMI.getOperand(0).isGlobal())
+          return false;
+
+        auto *CalleeFn =
+            dyn_cast_if_present<Function>(CallMI.getOperand(0).getGlobal());
+        if (!CalleeFn)
+          return false;
+
+        auto *IntTy = dyn_cast<IntegerType>(CalleeFn->getReturnType());
+        if (!IntTy)
+          return false;
+
+        const AttributeSet &Attrs = CalleeFn->getAttributes().getRetAttrs();
+        unsigned BitWidth = IntTy->getBitWidth();
+        if ((BitWidth <= 32 && Attrs.hasAttribute(Attribute::SExt)) ||
+            (BitWidth < 32 && Attrs.hasAttribute(Attribute::ZExt)))
+          continue;
+      }
+
+      if (!AddRegToWorkList(CopySrcReg))
+        return false;
+
+      break;
+    }
+
+    // For these, we just need to check if the 1st operand is sign extended.
+    case LoongArch::MOD_D:
+    case LoongArch::ANDI:
+    case LoongArch::ORI:
+    case LoongArch::XORI:
+      // |Remainder| is always <= |Dividend|. If D is 32-bit, then so is R.
+      // DIV doesn't work because of the edge case 0xf..f 8000 0000 / (long)-1
+      // Logical operations use a sign extended 12-bit immediate.
+      if (!AddRegToWorkList(MI->getOperand(1).getReg()))
+        return false;
+
+      break;
+    case LoongArch::MOD_DU:
+    case LoongArch::AND:
+    case LoongArch::OR:
+    case LoongArch::XOR:
+    case LoongArch::ANDN:
+    case LoongArch::ORN:
+    case LoongArch::PHI: {
+      // If all incoming values are sign-extended, the output of AND, OR, XOR,
+      // or PHI is also sign-extended.
+
+      // The input registers for PHI are operand 1, 3, ...
+      // The input registers for others are operand 1 and 2.
+      unsigned B = 1, E = 3, D = 1;
+      switch (MI->getOpcode()) {
+      case LoongArch::PHI:
+        E = MI->getNumOperands();
+        D = 2;
+        break;
+      }
+
+      for (unsigned I = B; I != E; I += D) {
+        if (!MI->getOperand(I).isReg())
+          return false;
+
+        if (!AddRegToWorkList(MI->getOperand(I).getReg()))
+          return false;
+      }
+
+      break;
+    }
+
+    case LoongArch::MASKEQZ:
+    case LoongArch::MASKNEZ:
+      // Instructions return zero or operand 1. Result is sign extended if
+      // operand 1 is sign extended.
+      if (!AddRegToWorkList(MI->getOperand(1).getReg()))
+        return false;
+      break;
+
+    // With these opcode, we can "fix" them with the W-version
+    // if we know all users of the result only rely on bits 31:0
+    case LoongArch::SLLI_D:
+      // SLLI_W reads the lowest 5 bits, while SLLI_D reads lowest 6 bits
+      if (MI->getOperand(2).getImm() >= 32)
+        return false;
+      [[fallthrough]];
+    case LoongArch::ADDI_D:
+    case LoongArch::ADD_D:
+    case LoongArch::LD_D:
+    case LoongArch::LD_WU:
+    case LoongArch::MUL_D:
+    case LoongArch::SUB_D:
+      if (hasAllWUsers(*MI, ST, MRI)) {
+        FixableDef.insert(MI);
+        break;
+      }
+      return false;
+    }
+  }
+
+  // If we get here, then every node we visited produces a sign extended value
+  // or propagated sign extended values. So the result must be sign extended.
+  return true;
+}
+
+static unsigned getWOp(unsigned Opcode) {
+  switch (Opcode) {
+  case LoongArch::ADDI_D:
+    return LoongArch::ADDI_W;
+  case LoongArch::ADD_D:
+    return LoongArch::ADD_W;
+  case LoongArch::LD_D:
+  case LoongArch::LD_WU:
+    return LoongArch::LD_W;
+  case LoongArch::MUL_D:
+    return LoongArch::MUL_W;
+  case LoongArch::SLLI_D:
+    return LoongArch::SLLI_W;
+  case LoongArch::SUB_D:
+    return LoongArch::SUB_W;
+  default:
+    llvm_unreachable("Unexpected opcode for replacement with W variant");
+  }
+}
+
+bool LoongArchOptWInstrs::removeSExtWInstrs(MachineFunction &MF,
+                                            const LoongArchInstrInfo &TII,
+                                            const LoongArchSubtarget &ST,
+                                            MachineRegisterInfo &MRI) {
+  if (DisableSExtWRemoval)
+    return false;
+
+  bool MadeChange = false;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+      // We're looking for the sext.w pattern ADDI.W rd, rs, 0.
+      if (!LoongArch::isSEXT_W(MI))
+        continue;
+
+      Register SrcReg = MI.getOperand(1).getReg();
+
+      SmallPtrSet<MachineInstr *, 4> FixableDefs;
+
+      // If all users only use the lower bits, this sext.w is redundant.
+      // Or if all definitions reaching MI sign-extend their output,
+      // then sext.w is redundant.
+      if (!hasAllWUsers(MI, ST, MRI) &&
+          !isSignExtendedW(SrcReg, ST, MRI, FixableDefs))
+        continue;
+
+      Register DstReg = MI.getOperand(0).getReg();
+      if (!MRI.constrainRegClass(SrcReg, MRI.getRegClass(DstReg)))
+        continue;
+
+      // Convert Fixable instructions to their W versions.
+      for (MachineInstr *Fixable : FixableDefs) {
+        LLVM_DEBUG(dbgs() << "Replacing " << *Fixable);
+        Fixable->setDesc(TII.get(getWOp(Fixable->getOpcode())));
+        Fixable->clearFlag(MachineInstr::MIFlag::NoSWrap);
+        Fixable->clearFlag(MachineInstr::MIFlag::NoUWrap);
+        Fixable->clearFlag(MachineInstr::MIFlag::IsExact);
+        LLVM_DEBUG(dbgs() << "     with " << *Fixable);
+        ++NumTransformedToWInstrs;
+      }
+
+      LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n");
+      MRI.replaceRegWith(DstReg, SrcReg);
+      MRI.clearKillFlags(SrcReg);
+      MI.eraseFromParent();
+      ++NumRemovedSExtW;
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+bool LoongArchOptWInstrs::convertToDSuffixes(MachineFunction &MF,
+                                             const LoongArchInstrInfo &TII,
+                                             const LoongArchSubtarget &ST,
+                                             MachineRegisterInfo &MRI) {
+  bool MadeChange = false;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      unsigned Opc;
+      switch (MI.getOpcode()) {
+      default:
+        continue;
+      case LoongArch::ADDI_W:
+        Opc = LoongArch::ADDI_D;
+        break;
+      }
+
+      if (hasAllWUsers(MI, ST, MRI)) {
+        MI.setDesc(TII.get(Opc));
+        MadeChange = true;
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
+bool LoongArchOptWInstrs::convertToWSuffixes(MachineFunction &MF,
+                                             const LoongArchInstrInfo &TII,
+                                             const LoongArchSubtarget &ST,
+                                             MachineRegisterInfo &MRI) {
+  bool MadeChange = false;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      unsigned WOpc;
+      // TODO: Add more?
+      switch (MI.getOpcode()) {
+      default:
+        continue;
+      case LoongArch::ADD_D:
+        WOpc = LoongArch::ADD_W;
+        break;
+      case LoongArch::ADDI_D:
+        WOpc = LoongArch::ADDI_W;
+        break;
+      case LoongArch::SUB_D:
+        WOpc = LoongArch::SUB_W;
+        break;
+      case LoongArch::MUL_D:
+        WOpc = LoongArch::MUL_W;
+        break;
+      case LoongArch::SLLI_D:
+        // SLLI.W reads the lowest 5 bits, while SLLI.D reads lowest 6 bits
+        if (MI.getOperand(2).getImm() >= 32)
+          continue;
+        WOpc = LoongArch::SLLI_W;
+        break;
+      case LoongArch::LD_D:
+      case LoongArch::LD_WU:
+        WOpc = LoongArch::LD_W;
+        break;
+      }
+
+      if (hasAllWUsers(MI, ST, MRI)) {
+        LLVM_DEBUG(dbgs() << "Replacing " << MI);
+        MI.setDesc(TII.get(WOpc));
+        MI.clearFlag(MachineInstr::MIFlag::NoSWrap);
+        MI.clearFlag(MachineInstr::MIFlag::NoUWrap);
+        MI.clearFlag(MachineInstr::MIFlag::IsExact);
+        LLVM_DEBUG(dbgs() << "     with " << MI);
+        ++NumTransformedToWInstrs;
+        MadeChange = true;
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
+bool LoongArchOptWInstrs::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const LoongArchSubtarget &ST = MF.getSubtarget<LoongArchSubtarget>();
+  const LoongArchInstrInfo &TII = *ST.getInstrInfo();
+
+  if (!ST.is64Bit())
+    return false;
+
+  bool MadeChange = false;
+  MadeChange |= removeSExtWInstrs(MF, TII, ST, MRI);
+
+  if (!(DisableCvtToDSuffix || ST.preferWInst()))
+    MadeChange |= convertToDSuffixes(MF, TII, ST, MRI);
+
+  if (ST.preferWInst())
+    MadeChange |= convertToWSuffixes(MF, TII, ST, MRI);
+
+  return MadeChange;
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index e5494488e113..2b2d4e478cc8 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -34,6 +34,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTarget() {
   RegisterTargetMachine<LoongArchTargetMachine> X(getTheLoongArch32Target());
   RegisterTargetMachine<LoongArchTargetMachine> Y(getTheLoongArch64Target());
   auto *PR = PassRegistry::getPassRegistry();
+  initializeLoongArchOptWInstrsPass(*PR);
   initializeLoongArchPreRAExpandPseudoPass(*PR);
   initializeLoongArchDAGToDAGISelPass(*PR);
 }
@@ -145,6 +146,7 @@ public:
   bool addInstSelector() override;
   void addPreEmitPass() override;
   void addPreEmitPass2() override;
+  void addMachineSSAOptimization() override;
   void addPreRegAlloc() override;
 };
 } // end namespace
@@ -187,6 +189,14 @@ void LoongArchPassConfig::addPreEmitPass2() {
   addPass(createLoongArchExpandAtomicPseudoPass());
 }
 
+void LoongArchPassConfig::addMachineSSAOptimization() {
+  TargetPassConfig::addMachineSSAOptimization();
+
+  if (TM->getTargetTriple().isLoongArch64()) {
+    addPass(createLoongArchOptWInstrsPass());
+  }
+}
+
 void LoongArchPassConfig::addPreRegAlloc() {
   addPass(createLoongArchPreRAExpandPseudoPass());
 }
diff --git a/llvm/lib/Target/M68k/M68kSubtarget.cpp b/llvm/lib/Target/M68k/M68kSubtarget.cpp
index 3af1e994c01c..cacdbf559faa 100644
--- a/llvm/lib/Target/M68k/M68kSubtarget.cpp
+++ b/llvm/lib/Target/M68k/M68kSubtarget.cpp
@@ -251,6 +251,6 @@ M68kSubtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
     return M68kII::MO_GOTPCREL;
   }
 
-  // otherwise linker will figure this out
-  return M68kII::MO_PLT;
+  // Ensure that we don't emit PLT relocations when in non-pic modes.
+  return isPositionIndependent() ? M68kII::MO_PLT : M68kII::MO_ABSOLUTE_ADDRESS;
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 6aa98543e5e2..05457c71cd39 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -41,7 +41,7 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
 def SM90a: FeatureSM<"90a", 901>;
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
-                   70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83] in
+                   70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84] in
   def PTX#version: FeaturePTX<version>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 44b61a937d64..b03803f52b78 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6125,6 +6125,9 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
       if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
           STI.getPTXVersion() >= 63)
         return AtomicExpansionKind::None;
+      if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
+          STI.getPTXVersion() >= 78)
+        return AtomicExpansionKind::None;
       if (Ty->isFloatTy())
         return AtomicExpansionKind::None;
       if (Ty->isDoubleTy() && STI.hasAtomAddF64())
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 142dd64ddea9..393fa29ff051 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2497,10 +2497,6 @@ defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
 defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
 defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
 
-// FIXME: What is this doing here?  Can it be deleted?
-// def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
-//                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-
 def SDTDeclareParamProfile :
   SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
 def SDTDeclareScalarParamProfile :
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 5f6e28283c5d..440af085cb8e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1545,7 +1545,7 @@ multiclass F_ATOMIC_2_imp<ValueType ptrT, NVPTXRegClass ptrclass,
   def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b),
     !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;", ""),
     [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), IMM:$b))]>,
-  Requires<!if(!eq(TypeStr, ".f16"), [Predicate<"false">], Pred)>;
+  Requires<!if(!or(!eq(TypeStr, ".f16"), !eq(TypeStr, ".bf16")), [Predicate<"false">], Pred)>;
 }
 multiclass F_ATOMIC_2<ValueType regT, NVPTXRegClass regclass, string SpaceStr, string TypeStr,
   string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM,
@@ -1662,6 +1662,13 @@ defm INT_PTX_ATOM_ADD_S_F16 : F_ATOMIC_2<f16, Int16Regs, ".shared", ".f16", ".ad
 defm INT_PTX_ATOM_ADD_GEN_F16 : F_ATOMIC_2<f16, Int16Regs, "", ".f16", ".add.noftz",
   atomic_load_add_gen, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
 
+defm INT_PTX_ATOM_ADD_G_BF16 : F_ATOMIC_2<bf16, Int16Regs, ".global", ".bf16", ".add.noftz",
+  atomic_load_add_g, bf16imm, fpimm, [hasSM<90>, hasPTX<78>]>;
+defm INT_PTX_ATOM_ADD_S_BF16 : F_ATOMIC_2<bf16, Int16Regs, ".shared", ".bf16", ".add.noftz",
+  atomic_load_add_s, bf16imm, fpimm, [hasSM<90>, hasPTX<78>]>;
+defm INT_PTX_ATOM_ADD_GEN_BF16 : F_ATOMIC_2<bf16, Int16Regs, "", ".bf16", ".add.noftz",
+  atomic_load_add_gen, bf16imm, fpimm, [hasSM<90>, hasPTX<78>]>;
+
 defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<f32, Float32Regs, ".global", ".f32", ".add",
   atomic_load_add_g, f32imm, fpimm>;
 defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<f32, Float32Regs, ".shared", ".f32", ".add",
@@ -2174,6 +2181,8 @@ multiclass ATOM2_add_impl<string OpStr> {
    defm _s32  : ATOM2S_impl<OpStr, "i", "s32", i32, Int32Regs, i32imm, imm, i32, []>;
    defm _u32  : ATOM2S_impl<OpStr, "i", "u32", i32, Int32Regs, i32imm, imm, i32, []>;
    defm _u64  : ATOM2S_impl<OpStr, "i", "u64", i64, Int64Regs, i64imm, imm, i64, []>;
+   defm _bf16  : ATOM2S_impl<OpStr, "f", "bf16", bf16, Int16Regs, bf16imm, fpimm, bf16,
+                            [hasSM<90>, hasPTX<78>]>;
    defm _f16  : ATOM2S_impl<OpStr, "f", "f16", f16, Int16Regs, f16imm, fpimm, f16,
                             [hasSM<70>, hasPTX<63>]>;
    defm _f32  : ATOM2S_impl<OpStr, "f", "f32", f32, Float32Regs, f32imm, fpimm, f32,
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 241078b03873..9106e6ab6397 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -63,45 +63,45 @@ const char *PPC::stripRegisterPrefix(const char *RegName) {
   switch (RegName[0]) {
     case 'a':
       if (RegName[1] == 'c' && RegName[2] == 'c')
-	return RegName + 3;
+        return RegName + 3;
       break;
     case 'f':
       if (RegName[1] == 'p')
-	return RegName + 2;
+        return RegName + 2;
       [[fallthrough]];
     case 'r':
     case 'v':
       if (RegName[1] == 's') {
-	if (RegName[2] == 'p')
-	  return RegName + 3;
-	return RegName + 2;
+        if (RegName[2] == 'p')
+          return RegName + 3;
+        return RegName + 2;
       }
       return RegName + 1;
     case 'c':
       if (RegName[1] == 'r')
-	return RegName + 2;
+        return RegName + 2;
       break;
     case 'w':
       // For wacc and wacc_hi
       if (RegName[1] == 'a' && RegName[2] == 'c' && RegName[3] == 'c') {
-	if (RegName[4] == '_')
-	  return RegName + 7;
-	else
-	  return RegName + 4;
+        if (RegName[4] == '_')
+          return RegName + 7;
+        else
+          return RegName + 4;
       }
       break;
     case 'd':
       // For dmr, dmrp, dmrrow, dmrrowp
       if (RegName[1] == 'm' && RegName[2] == 'r') {
-	if (RegName[3] == 'r' && RegName[4] == 'o' && RegName[5] == 'w' &&
-	    RegName[6] == 'p')
-	  return RegName + 7;
-	else if (RegName[3] == 'r' && RegName[4] == 'o' && RegName[5] == 'w')
-	  return RegName + 6;
-	else if (RegName[3] == 'p')
-	  return RegName + 4;
-	else
-	  return RegName + 3;
+        if (RegName[3] == 'r' && RegName[4] == 'o' && RegName[5] == 'w' &&
+            RegName[6] == 'p')
+          return RegName + 7;
+        else if (RegName[3] == 'r' && RegName[4] == 'o' && RegName[5] == 'w')
+          return RegName + 6;
+        else if (RegName[3] == 'p')
+          return RegName + 4;
+        else
+          return RegName + 3;
       }
       break;
   }
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index b962ed28d720..639771ab9eab 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -338,6 +338,12 @@ def FeatureAIXLocalDynamicTLS :
                    "true", "Produce a faster local-dynamic TLS sequence for this "
                    "function for 64-bit AIX">;
 
+def FeatureAIXSharedLibTLSModelOpt :
+  SubtargetFeature<"aix-shared-lib-tls-model-opt",
+                   "HasAIXShLibTLSModelOpt", "true",
+                   "Tune TLS model at function level in shared library loaded "
+                   "with the main program (for 64-bit AIX only)">;
+
 def FeaturePredictableSelectIsExpensive :
   SubtargetFeature<"predictable-select-expensive",
                    "PredictableSelectIsExpensive",
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 51b79dc2b04b..ac48dc5af9d5 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -205,8 +205,8 @@ public:
   void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI);
   void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK);
   void EmitAIXTlsCallHelper(const MachineInstr *MI);
-  const MCExpr *getAdjustedLocalExecExpr(const MachineOperand &MO,
-                                         int64_t Offset);
+  const MCExpr *getAdjustedFasterLocalExpr(const MachineOperand &MO,
+                                           int64_t Offset);
   bool runOnMachineFunction(MachineFunction &MF) override {
     Subtarget = &MF.getSubtarget<PPCSubtarget>();
     bool Changed = AsmPrinter::runOnMachineFunction(MF);
@@ -878,6 +878,15 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
         return MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSLE;
       if (Model == TLSModel::InitialExec)
         return MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSIE;
+      // On AIX, TLS model opt may have turned local-dynamic accesses into
+      // initial-exec accesses.
+      PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
+      if (Model == TLSModel::LocalDynamic &&
+          FuncInfo->isAIXFuncUseTLSIEForLD()) {
+        LLVM_DEBUG(
+            dbgs() << "Current function uses IE access for default LD vars.\n");
+        return MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSIE;
+      }
       llvm_unreachable("Only expecting local-exec or initial-exec accesses!");
     }
     // For GD TLS access on AIX, we have two TOC entries for the symbol (one for
@@ -1598,7 +1607,8 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // machine operand (which is a TargetGlobalTLSAddress) is expected to be
     // the same operand for both loads and stores.
     for (const MachineOperand &TempMO : MI->operands()) {
-      if (((TempMO.getTargetFlags() == PPCII::MO_TPREL_FLAG)) &&
+      if (((TempMO.getTargetFlags() == PPCII::MO_TPREL_FLAG ||
+            TempMO.getTargetFlags() == PPCII::MO_TLSLD_FLAG)) &&
           TempMO.getOperandNo() == 1)
         OpNum = 1;
     }
@@ -1634,8 +1644,8 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   case PPC::ADDI8: {
     // A faster non-TOC-based local-[exec|dynamic] sequence is represented by
     // `addi` or a load/store instruction (that directly loads or stores off of
-    // the thread pointer) with an immediate operand having the MO_TPREL_FLAG.
-    // Such instructions do not otherwise arise.
+    // the thread pointer) with an immediate operand having the
+    // [MO_TPREL_FLAG|MO_TLSLD_FLAG]. Such instructions do not otherwise arise.
     if (!HasAIXSmallLocalTLS)
       break;
     bool IsMIADDI8 = MI->getOpcode() == PPC::ADDI8;
@@ -1647,7 +1657,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
         Flag == PPCII::MO_TPREL_PCREL_FLAG || Flag == PPCII::MO_TLSLD_FLAG) {
       LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
 
-      const MCExpr *Expr = getAdjustedLocalExecExpr(MO, MO.getOffset());
+      const MCExpr *Expr = getAdjustedFasterLocalExpr(MO, MO.getOffset());
       if (Expr)
         TmpInst.getOperand(OpNum) = MCOperand::createExpr(Expr);
 
@@ -1677,14 +1687,15 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   EmitToStreamer(*OutStreamer, TmpInst);
 }
 
-// For non-TOC-based local-exec variables that have a non-zero offset,
+// For non-TOC-based local-[exec|dynamic] variables that have a non-zero offset,
 // we need to create a new MCExpr that adds the non-zero offset to the address
-// of the local-exec variable that will be used in either an addi, load or
-// store. However, the final displacement for these instructions must be
+// of the local-[exec|dynamic] variable that will be used in either an addi,
+// load or store. However, the final displacement for these instructions must be
 // between [-32768, 32768), so if the TLS address + its non-zero offset is
 // greater than 32KB, a new MCExpr is produced to accommodate this situation.
-const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
-                                                      int64_t Offset) {
+const MCExpr *
+PPCAsmPrinter::getAdjustedFasterLocalExpr(const MachineOperand &MO,
+                                          int64_t Offset) {
   // Non-zero offsets (for loads, stores or `addi`) require additional handling.
   // When the offset is zero, there is no need to create an adjusted MCExpr.
   if (!Offset)
@@ -1692,13 +1703,9 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
 
   assert(MO.isGlobal() && "Only expecting a global MachineOperand here!");
   const GlobalValue *GValue = MO.getGlobal();
-  // TODO: Handle the aix-small-local-dynamic-tls non-zero offset case.
   TLSModel::Model Model = TM.getTLSModel(GValue);
-  if (Model == TLSModel::LocalDynamic) {
-    return nullptr;
-  }
-  assert(Model == TLSModel::LocalExec &&
-         "Only local-exec accesses are handled!");
+  assert((Model == TLSModel::LocalExec || Model == TLSModel::LocalDynamic) &&
+         "Only local-[exec|dynamic] accesses are handled!");
 
   bool IsGlobalADeclaration = GValue->isDeclarationForLinker();
   // Find the GlobalVariable that corresponds to the particular TLS variable
@@ -1719,7 +1726,10 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
   // For when TLS variables are extern, this is safe to do because we can
   // assume that the address of extern TLS variables are zero.
   const MCExpr *Expr = MCSymbolRefExpr::create(
-      getSymbol(GValue), MCSymbolRefExpr::VK_PPC_AIX_TLSLE, OutContext);
+      getSymbol(GValue),
+      Model == TLSModel::LocalExec ? MCSymbolRefExpr::VK_PPC_AIX_TLSLE
+                                   : MCSymbolRefExpr::VK_PPC_AIX_TLSLD,
+      OutContext);
   Expr = MCBinaryExpr::createAdd(
       Expr, MCConstantExpr::create(Offset, OutContext), OutContext);
   if (FinalAddress >= 32768) {
@@ -1732,10 +1742,10 @@ const MCExpr *PPCAsmPrinter::getAdjustedLocalExecExpr(const MachineOperand &MO,
     ptrdiff_t Delta = ((FinalAddress + 32768) & ~0xFFFF);
     // Check that the total instruction displacement fits within [-32768,32768).
     [[maybe_unused]] ptrdiff_t InstDisp = TLSVarAddress + Offset - Delta;
-    assert(((InstDisp < 32768) &&
-            (InstDisp >= -32768)) &&
-               "Expecting the instruction displacement for local-exec TLS "
-               "variables to be between [-32768, 32768)!");
+    assert(
+        ((InstDisp < 32768) && (InstDisp >= -32768)) &&
+        "Expecting the instruction displacement for local-[exec|dynamic] TLS "
+        "variables to be between [-32768, 32768)!");
     Expr = MCBinaryExpr::createAdd(
         Expr, MCConstantExpr::create(-Delta, OutContext), OutContext);
   }
@@ -2896,7 +2906,7 @@ void PPCAIXAsmPrinter::emitPGORefs(Module &M) {
   bool HasNonZeroLengthPrfCntsSection = false;
   const DataLayout &DL = M.getDataLayout();
   for (GlobalVariable &GV : M.globals())
-    if (GV.hasSection() && GV.getSection().equals("__llvm_prf_cnts") &&
+    if (GV.hasSection() && GV.getSection() == "__llvm_prf_cnts" &&
         DL.getTypeAllocSize(GV.getValueType()) > 0) {
       HasNonZeroLengthPrfCntsSection = true;
       break;
@@ -2949,7 +2959,11 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) {
     // Setup the csect for the current TC entry. If the variant kind is
     // VK_PPC_AIX_TLSGDM the entry represents the region handle, we create a
     // new symbol to prefix the name with a dot.
-    if (I.first.second == MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSGDM) {
+    // If TLS model opt is turned on, create a new symbol to prefix the name
+    // with a dot.
+    if (I.first.second == MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSGDM ||
+        (Subtarget->hasAIXShLibTLSModelOpt() &&
+         I.first.second == MCSymbolRefExpr::VariantKind::VK_PPC_AIX_TLSLD)) {
       SmallString<128> Name;
       StringRef Prefix = ".";
       Name += Prefix;
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 04e9f9e2366e..8444266459c4 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -1966,6 +1966,8 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                             BitVector &SavedRegs,
                                             RegScavenger *RS) const {
   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+  if (Subtarget.isAIXABI())
+    updateCalleeSaves(MF, SavedRegs);
 
   const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
 
@@ -2725,6 +2727,63 @@ bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
   return !MF.getSubtarget<PPCSubtarget>().is32BitELFABI();
 }
 
+void PPCFrameLowering::updateCalleeSaves(const MachineFunction &MF,
+                                         BitVector &SavedRegs) const {
+  // The AIX ABI uses traceback tables for EH which require that if callee-saved
+  // register N is used, all registers N-31 must be saved/restored.
+  // NOTE: The check for AIX is not actually what is relevant. Traceback tables
+  // on Linux have the same requirements. It is just that AIX is the only ABI
+  // for which we actually use traceback tables. If another ABI needs to be
+  // supported that also uses them, we can add a check such as
+  // Subtarget.usesTraceBackTables().
+  assert(Subtarget.isAIXABI() &&
+         "Function updateCalleeSaves should only be called for AIX.");
+
+  // If there are no callee saves then there is nothing to do.
+  if (SavedRegs.none())
+    return;
+
+  const MCPhysReg *CSRegs =
+      Subtarget.getRegisterInfo()->getCalleeSavedRegs(&MF);
+  MCPhysReg LowestGPR = PPC::R31;
+  MCPhysReg LowestG8R = PPC::X31;
+  MCPhysReg LowestFPR = PPC::F31;
+  MCPhysReg LowestVR = PPC::V31;
+
+  // Traverse the CSRs twice so as not to rely on ascending ordering of
+  // registers in the array. The first pass finds the lowest numbered
+  // register and the second pass marks all higher numbered registers
+  // for spilling.
+  for (int i = 0; CSRegs[i]; i++) {
+    // Get the lowest numbered register for each class that actually needs
+    // to be saved.
+    MCPhysReg Cand = CSRegs[i];
+    if (!SavedRegs.test(Cand))
+      continue;
+    if (PPC::GPRCRegClass.contains(Cand) && Cand < LowestGPR)
+      LowestGPR = Cand;
+    else if (PPC::G8RCRegClass.contains(Cand) && Cand < LowestG8R)
+      LowestG8R = Cand;
+    else if ((PPC::F4RCRegClass.contains(Cand) ||
+              PPC::F8RCRegClass.contains(Cand)) &&
+             Cand < LowestFPR)
+      LowestFPR = Cand;
+    else if (PPC::VRRCRegClass.contains(Cand) && Cand < LowestVR)
+      LowestVR = Cand;
+  }
+
+  for (int i = 0; CSRegs[i]; i++) {
+    MCPhysReg Cand = CSRegs[i];
+    if ((PPC::GPRCRegClass.contains(Cand) && Cand > LowestGPR) ||
+        (PPC::G8RCRegClass.contains(Cand) && Cand > LowestG8R) ||
+        ((PPC::F4RCRegClass.contains(Cand) ||
+          PPC::F8RCRegClass.contains(Cand)) &&
+         Cand > LowestFPR) ||
+        (PPC::VRRCRegClass.contains(Cand) && Cand > LowestVR))
+      SavedRegs.set(Cand);
+  }
+}
+
 uint64_t PPCFrameLowering::getStackThreshold() const {
   // On PPC64, we use `stux r1, r1, <scratch_reg>` to extend the stack;
   // use `add r1, r1, <scratch_reg>` to release the stack frame.
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/llvm/lib/Target/PowerPC/PPCFrameLowering.h
index e19087ce0e18..d74c87428326 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.h
@@ -173,6 +173,7 @@ public:
   /// function prologue/epilogue.
   bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
   bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+  void updateCalleeSaves(const MachineFunction &MF, BitVector &SavedRegs) const;
 
   uint64_t getStackThreshold() const override;
 };
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 2f647daa4bcb..68621558e3fa 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -5302,9 +5302,10 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
       SDValue MDV = N->getOperand(MDIndex);
       const MDNode *MD = cast<MDNodeSDNode>(MDV)->getMD();
       assert(MD->getNumOperands() != 0 && "Empty MDNode in operands!");
-      assert((isa<MDString>(MD->getOperand(0)) && cast<MDString>(
-           MD->getOperand(0))->getString().equals("ppc-trap-reason")) 
-           && "Unsupported annotation data type!");
+      assert((isa<MDString>(MD->getOperand(0)) &&
+              cast<MDString>(MD->getOperand(0))->getString() ==
+                  "ppc-trap-reason") &&
+             "Unsupported annotation data type!");
       for (unsigned i = 1; i < MD->getNumOperands(); i++) {
         assert(isa<MDString>(MD->getOperand(i)) && 
                "Invalid data type for annotation ppc-trap-reason!");
@@ -6141,12 +6142,12 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     assert((isPPC64 || (isAIXABI && !isPPC64)) && "We are dealing with 64-bit"
            " ELF/AIX or 32-bit AIX in the following.");
 
-    // Transforms the ISD::TOC_ENTRY node for 32-bit AIX large code model mode
-    // or 64-bit medium (ELF-only) or large (ELF and AIX) code model code non
-    // toc-data symbols.
+    // Transforms the ISD::TOC_ENTRY node for 32-bit AIX large code model mode,
+    // or 64-bit medium (ELF-only), or large (ELF and AIX) code model code that
+    // does not conain TOC data symbols.
     // We generate two instructions as described below. The first source
-    // operand is a symbol reference. If it must be toc-referenced according to
-    // Subtarget, we generate:
+    // operand is a symbol reference. If it must be referenced via the toc
+    // according to Subtarget, we generate:
     // [32-bit AIX]
     //   LWZtocL(@sym, ADDIStocHA(%r2, @sym))
     // [64-bit ELF/AIX]
@@ -6154,7 +6155,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     // Otherwise we generate:
     //   ADDItocL8(ADDIStocHA8(%x2, @sym), @sym)
 
-    // For large code model toc-data symbols we generate:
+    // For large code model with TOC data symbols we generate:
     // [32-bit AIX]
     //   ADDItocL(ADDIStocHA(%x2, @sym), @sym)
     // [64-bit AIX]
@@ -6167,9 +6168,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     SDNode *Tmp = CurDAG->getMachineNode(
         isPPC64 ? PPC::ADDIStocHA8 : PPC::ADDIStocHA, dl, VT, TOCbase, GA);
 
-    // On AIX if the symbol has the toc-data attribute it will be defined
-    // in the TOC entry, so we use an ADDItocL similar to the medium code
-    // model ELF abi.
+    // On AIX, if the symbol has the toc-data attribute it will be defined
+    // in the TOC entry, so we use an ADDItocL/ADDItocL8.
     if (isAIXABI && hasTocDataAttr(GA)) {
       if (isPPC64)
         report_fatal_error(
@@ -7587,29 +7587,23 @@ static bool hasAIXSmallTLSAttr(SDValue Val) {
   return false;
 }
 
-// Is an ADDI eligible for folding for non-TOC-based local-exec accesses?
-static bool isEligibleToFoldADDIForLocalExecAccesses(SelectionDAG *DAG,
-                                                     SDValue ADDIToFold) {
+// Is an ADDI eligible for folding for non-TOC-based local-[exec|dynamic]
+// accesses?
+static bool isEligibleToFoldADDIForFasterLocalAccesses(SelectionDAG *DAG,
+                                                       SDValue ADDIToFold) {
   // Check if ADDIToFold (the ADDI that we want to fold into local-exec
   // accesses), is truly an ADDI.
   if (!ADDIToFold.isMachineOpcode() ||
       (ADDIToFold.getMachineOpcode() != PPC::ADDI8))
     return false;
 
-  // Folding is only allowed for the AIX small-local-exec TLS target attribute
-  // or when the 'aix-small-tls' global variable attribute is present.
+  // Folding is only allowed for the AIX small-local-[exec|dynamic] TLS target
+  // attribute or when the 'aix-small-tls' global variable attribute is present.
   const PPCSubtarget &Subtarget =
       DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
   SDValue TLSVarNode = ADDIToFold.getOperand(1);
-  if (!(Subtarget.hasAIXSmallLocalExecTLS() || hasAIXSmallTLSAttr(TLSVarNode)))
-    return false;
-
-  // The first operand of the ADDIToFold should be the thread pointer.
-  // This transformation is only performed if the first operand of the
-  // addi is the thread pointer.
-  SDValue TPRegNode = ADDIToFold.getOperand(0);
-  RegisterSDNode *TPReg = dyn_cast<RegisterSDNode>(TPRegNode.getNode());
-  if (!TPReg || (TPReg->getReg() != Subtarget.getThreadPointerRegister()))
+  if (!(Subtarget.hasAIXSmallLocalDynamicTLS() ||
+        Subtarget.hasAIXSmallLocalExecTLS() || hasAIXSmallTLSAttr(TLSVarNode)))
     return false;
 
   // The second operand of the ADDIToFold should be the global TLS address
@@ -7619,24 +7613,36 @@ static bool isEligibleToFoldADDIForLocalExecAccesses(SelectionDAG *DAG,
   if (!GA)
     return false;
 
-  // The local-exec TLS variable should only have the MO_TPREL_FLAG target flag,
-  // so this optimization is not performed otherwise if the flag is not set.
+  if (DAG->getTarget().getTLSModel(GA->getGlobal()) == TLSModel::LocalExec) {
+    // The first operand of the ADDIToFold should be the thread pointer.
+    // This transformation is only performed if the first operand of the
+    // addi is the thread pointer.
+    SDValue TPRegNode = ADDIToFold.getOperand(0);
+    RegisterSDNode *TPReg = dyn_cast<RegisterSDNode>(TPRegNode.getNode());
+    if (!TPReg || (TPReg->getReg() != Subtarget.getThreadPointerRegister()))
+      return false;
+  }
+
+  // The local-[exec|dynamic] TLS variable should only have the
+  // [MO_TPREL_FLAG|MO_TLSLD_FLAG] target flags, so this optimization is not
+  // performed otherwise if the flag is not set.
   unsigned TargetFlags = GA->getTargetFlags();
-  if (TargetFlags != PPCII::MO_TPREL_FLAG)
+  if (!(TargetFlags == PPCII::MO_TPREL_FLAG ||
+        TargetFlags == PPCII::MO_TLSLD_FLAG))
     return false;
 
   // If all conditions are satisfied, the ADDI is valid for folding.
   return true;
 }
 
-// For non-TOC-based local-exec access where an addi is feeding into another
-// addi, fold this sequence into a single addi if possible.
-// Before this optimization, the sequence appears as:
-//    addi rN, r13, sym@le
+// For non-TOC-based local-[exec|dynamic] access where an addi is feeding into
+// another addi, fold this sequence into a single addi if possible. Before this
+// optimization, the sequence appears as:
+//    addi rN, r13, sym@[le|ld]
 //    addi rM, rN, imm
 // After this optimization, we can fold the two addi into a single one:
-//    addi rM, r13, sym@le + imm
-static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
+//    addi rM, r13, sym@[le|ld] + imm
+static void foldADDIForFasterLocalAccesses(SDNode *N, SelectionDAG *DAG) {
   if (N->getMachineOpcode() != PPC::ADDI8)
     return;
 
@@ -7644,27 +7650,17 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
   // we want optimized out.
   SDValue InitialADDI = N->getOperand(0);
 
-  if (!isEligibleToFoldADDIForLocalExecAccesses(DAG, InitialADDI))
+  if (!isEligibleToFoldADDIForFasterLocalAccesses(DAG, InitialADDI))
     return;
 
-  // At this point, InitialADDI can be folded into a non-TOC-based local-exec
-  // access. The first operand of InitialADDI should be the thread pointer,
-  // which has been checked in isEligibleToFoldADDIForLocalExecAccesses().
-  SDValue TPRegNode = InitialADDI.getOperand(0);
-  [[maybe_unused]] RegisterSDNode *TPReg = dyn_cast<RegisterSDNode>(TPRegNode.getNode());
-  [[maybe_unused]] const PPCSubtarget &Subtarget =
-      DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
-  assert((TPReg && (TPReg->getReg() == Subtarget.getThreadPointerRegister())) &&
-         "Expecting the first operand to be a thread pointer for folding addi "
-         "in local-exec accesses!");
-
   // The second operand of the InitialADDI should be the global TLS address
-  // (the local-exec TLS variable), with the MO_TPREL_FLAG target flag.
-  // This has been checked in isEligibleToFoldADDIForLocalExecAccesses().
+  // (the local-[exec|dynamic] TLS variable), with the
+  // [MO_TPREL_FLAG|MO_TLSLD_FLAG] target flag. This has been checked in
+  // isEligibleToFoldADDIForFasterLocalAccesses().
   SDValue TLSVarNode = InitialADDI.getOperand(1);
   GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(TLSVarNode);
   assert(GA && "Expecting a valid GlobalAddressSDNode when folding addi into "
-               "local-exec accesses!");
+               "local-[exec|dynamic] accesses!");
   unsigned TargetFlags = GA->getTargetFlags();
 
   // The second operand of the addi that we want to preserve will be an
@@ -7676,7 +7672,7 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
   TLSVarNode = DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), MVT::i64,
                                            Offset, TargetFlags);
 
-  (void)DAG->UpdateNodeOperands(N, TPRegNode, TLSVarNode);
+  (void)DAG->UpdateNodeOperands(N, InitialADDI.getOperand(0), TLSVarNode);
   if (InitialADDI.getNode()->use_empty())
     DAG->RemoveDeadNode(InitialADDI.getNode());
 }
@@ -7693,8 +7689,9 @@ void PPCDAGToDAGISel::PeepholePPC64() {
     if (isVSXSwap(SDValue(N, 0)))
       reduceVSXSwap(N, CurDAG);
 
-    // This optimization is performed for non-TOC-based local-exec accesses.
-    foldADDIForLocalExecAccesses(N, CurDAG);
+    // This optimization is performed for non-TOC-based local-[exec|dynamic]
+    // accesses.
+    foldADDIForFasterLocalAccesses(N, CurDAG);
 
     unsigned FirstOp;
     unsigned StorageOpcode = N->getMachineOpcode();
@@ -7852,13 +7849,15 @@ void PPCDAGToDAGISel::PeepholePPC64() {
         ImmOpnd = CurDAG->getTargetConstant(Offset, SDLoc(ImmOpnd),
                                             ImmOpnd.getValueType());
       } else if (Offset != 0) {
-        // This optimization is performed for non-TOC-based local-exec accesses.
-        if (isEligibleToFoldADDIForLocalExecAccesses(CurDAG, Base)) {
+        // This optimization is performed for non-TOC-based local-[exec|dynamic]
+        // accesses.
+        if (isEligibleToFoldADDIForFasterLocalAccesses(CurDAG, Base)) {
           // Add the non-zero offset information into the load or store
-          // instruction to be used for non-TOC-based local-exec accesses.
+          // instruction to be used for non-TOC-based local-[exec|dynamic]
+          // accesses.
           GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd);
           assert(GA && "Expecting a valid GlobalAddressSDNode when folding "
-                       "addi into local-exec accesses!");
+                       "addi into local-[exec|dynamic] accesses!");
           ImmOpnd = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA),
                                                    MVT::i64, Offset,
                                                    GA->getTargetFlags());
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d27932f2915f..0a7483fc45b2 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -141,6 +141,11 @@ static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth(
     "ppc-gather-alias-max-depth", cl::init(18), cl::Hidden,
     cl::desc("max depth when checking alias info in GatherAllAliases()"));
 
+static cl::opt<unsigned> PPCAIXTLSModelOptUseIEForLDLimit(
+    "ppc-aix-shared-lib-tls-model-opt-limit", cl::init(1), cl::Hidden,
+    cl::desc("Set inclusive limit count of TLS local-dynamic access(es) in a "
+             "function to use initial-exec"));
+
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumSiblingCalls, "Number of sibling calls");
 STATISTIC(ShufflesHandledWithVPERM,
@@ -3362,6 +3367,54 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
   return LowerGlobalTLSAddressLinux(Op, DAG);
 }
 
+/// updateForAIXShLibTLSModelOpt - Helper to initialize TLS model opt settings,
+/// and then apply the update.
+static void updateForAIXShLibTLSModelOpt(TLSModel::Model &Model,
+                                         SelectionDAG &DAG,
+                                         const TargetMachine &TM) {
+  // Initialize TLS model opt setting lazily:
+  // (1) Use initial-exec for single TLS var references within current function.
+  // (2) Use local-dynamic for multiple TLS var references within current
+  // function.
+  PPCFunctionInfo *FuncInfo =
+      DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
+  if (!FuncInfo->isAIXFuncTLSModelOptInitDone()) {
+    SmallPtrSet<const GlobalValue *, 8> TLSGV;
+    // Iterate over all instructions within current function, collect all TLS
+    // global variables (global variables taken as the first parameter to
+    // Intrinsic::threadlocal_address).
+    const Function &Func = DAG.getMachineFunction().getFunction();
+    for (Function::const_iterator BI = Func.begin(), BE = Func.end(); BI != BE;
+         ++BI)
+      for (BasicBlock::const_iterator II = BI->begin(), IE = BI->end();
+           II != IE; ++II)
+        if (II->getOpcode() == Instruction::Call)
+          if (const CallInst *CI = dyn_cast<const CallInst>(&*II))
+            if (Function *CF = CI->getCalledFunction())
+              if (CF->isDeclaration() &&
+                  CF->getIntrinsicID() == Intrinsic::threadlocal_address)
+                if (const GlobalValue *GV =
+                        dyn_cast<GlobalValue>(II->getOperand(0))) {
+                  TLSModel::Model GVModel = TM.getTLSModel(GV);
+                  if (GVModel == TLSModel::LocalDynamic)
+                    TLSGV.insert(GV);
+                }
+
+    unsigned TLSGVCnt = TLSGV.size();
+    LLVM_DEBUG(dbgs() << format("LocalDynamic TLSGV count:%d\n", TLSGVCnt));
+    if (TLSGVCnt <= PPCAIXTLSModelOptUseIEForLDLimit)
+      FuncInfo->setAIXFuncUseTLSIEForLD();
+    FuncInfo->setAIXFuncTLSModelOptInitDone();
+  }
+
+  if (FuncInfo->isAIXFuncUseTLSIEForLD()) {
+    LLVM_DEBUG(
+        dbgs() << DAG.getMachineFunction().getName()
+               << " function is using the TLS-IE model for TLS-LD access.\n");
+    Model = TLSModel::InitialExec;
+  }
+}
+
 SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
                                                     SelectionDAG &DAG) const {
   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
@@ -3374,6 +3427,11 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   bool Is64Bit = Subtarget.isPPC64();
   TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
+
+  // Apply update to the TLS model.
+  if (Subtarget.hasAIXShLibTLSModelOpt())
+    updateForAIXShLibTLSModelOpt(Model, DAG, getTargetMachine());
+
   bool IsTLSLocalExecModel = Model == TLSModel::LocalExec;
 
   if (IsTLSLocalExecModel || Model == TLSModel::InitialExec) {
diff --git a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
index c05bb37e58bf..31a261482358 100644
--- a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -13,6 +13,7 @@
 
 #include "MCTargetDesc/PPCMCExpr.h"
 #include "PPC.h"
+#include "PPCMachineFunctionInfo.h"
 #include "PPCSubtarget.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
@@ -81,6 +82,8 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
   }
 
   const TargetMachine &TM = Printer.TM;
+  const MachineInstr *MI = MO.getParent();
+  const MachineFunction *MF = MI->getMF();
 
   if (MO.getTargetFlags() == PPCII::MO_PLT)
     RefKind = MCSymbolRefExpr::VK_PLT;
@@ -100,18 +103,22 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
            MO.getTargetFlags() == PPCII::MO_TLSLD_FLAG) {
     assert(MO.isGlobal() && "Only expecting a global MachineOperand here!");
     TLSModel::Model Model = TM.getTLSModel(MO.getGlobal());
+    const PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
     // For the local-[exec|dynamic] TLS model, we may generate the offset from
     // the TLS base as an immediate operand (instead of using a TOC entry). Set
     // the relocation type in case the result is used for purposes other than a
     // TOC reference. In TOC reference cases, this result is discarded.
     if (Model == TLSModel::LocalExec)
       RefKind = MCSymbolRefExpr::VK_PPC_AIX_TLSLE;
+    else if (Model == TLSModel::LocalDynamic &&
+             FuncInfo->isAIXFuncUseTLSIEForLD())
+      // On AIX, TLS model opt may have turned local-dynamic accesses into
+      // initial-exec accesses.
+      RefKind = MCSymbolRefExpr::VK_PPC_AIX_TLSIE;
     else if (Model == TLSModel::LocalDynamic)
       RefKind = MCSymbolRefExpr::VK_PPC_AIX_TLSLD;
   }
 
-  const MachineInstr *MI = MO.getParent();
-  const MachineFunction *MF = MI->getMF();
   const Module *M = MF->getFunction().getParent();
   const PPCSubtarget *Subtarget = &(MF->getSubtarget<PPCSubtarget>());
 
diff --git a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index df655a3be951..b7d14da05ee2 100644
--- a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -150,6 +150,11 @@ private:
   /// to use SExt/ZExt flags in later optimization.
   std::vector<std::pair<Register, ISD::ArgFlagsTy>> LiveInAttrs;
 
+  /// Flags for aix-shared-lib-tls-model-opt, will be lazily initialized for
+  /// each function.
+  bool AIXFuncUseTLSIEForLD = false;
+  bool AIXFuncTLSModelOptInitDone = false;
+
 public:
   explicit PPCFunctionInfo(const Function &F, const TargetSubtargetInfo *STI);
 
@@ -221,6 +226,13 @@ public:
   void setHasFastCall() { HasFastCall = true; }
   bool hasFastCall() const { return HasFastCall;}
 
+  void setAIXFuncTLSModelOptInitDone() { AIXFuncTLSModelOptInitDone = true; }
+  bool isAIXFuncTLSModelOptInitDone() const {
+    return AIXFuncTLSModelOptInitDone;
+  }
+  void setAIXFuncUseTLSIEForLD() { AIXFuncUseTLSIEForLD = true; }
+  bool isAIXFuncUseTLSIEForLD() const { return AIXFuncUseTLSIEForLD; }
+
   int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
   void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
 
diff --git a/llvm/lib/Target/PowerPC/PPCMergeStringPool.cpp b/llvm/lib/Target/PowerPC/PPCMergeStringPool.cpp
index 76d60c28f1e4..abc5353e4a5e 100644
--- a/llvm/lib/Target/PowerPC/PPCMergeStringPool.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMergeStringPool.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/Pass.h"
@@ -117,9 +118,20 @@ private:
 // sure that they can be replaced.
 static bool hasReplaceableUsers(GlobalVariable &GV) {
   for (User *CurrentUser : GV.users()) {
-    // Instruction users are always valid.
-    if (isa<Instruction>(CurrentUser))
+    if (auto *I = dyn_cast<Instruction>(CurrentUser)) {
+      // Do not merge globals in exception pads.
+      if (I->isEHPad())
+        return false;
+
+      if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+        // Some intrinsics require a plain global.
+        if (II->getIntrinsicID() == Intrinsic::eh_typeid_for)
+          return false;
+      }
+
+      // Other instruction users are always valid.
       continue;
+    }
 
     // We cannot replace GlobalValue users because they are not just nodes
     // in IR. To replace a user like this we would need to create a new
@@ -314,14 +326,6 @@ void PPCMergeStringPool::replaceUsesWithGEP(GlobalVariable *GlobalToReplace,
     Users.push_back(CurrentUser);
 
   for (User *CurrentUser : Users) {
-    Instruction *UserInstruction = dyn_cast<Instruction>(CurrentUser);
-    Constant *UserConstant = dyn_cast<Constant>(CurrentUser);
-
-    // At this point we expect that the user is either an instruction or a
-    // constant.
-    assert((UserConstant || UserInstruction) &&
-           "Expected the user to be an instruction or a constant.");
-
     // The user was not found so it must have been replaced earlier.
     if (!userHasOperand(CurrentUser, GlobalToReplace))
       continue;
@@ -330,38 +334,13 @@ void PPCMergeStringPool::replaceUsesWithGEP(GlobalVariable *GlobalToReplace,
     if (isa<GlobalValue>(CurrentUser))
       continue;
 
-    if (!UserInstruction) {
-      // User is a constant type.
-      Constant *ConstGEP = ConstantExpr::getInBoundsGetElementPtr(
-          PooledStructType, GPool, Indices);
-      UserConstant->handleOperandChange(GlobalToReplace, ConstGEP);
-      continue;
-    }
-
-    if (PHINode *UserPHI = dyn_cast<PHINode>(UserInstruction)) {
-      // GEP instructions cannot be added before PHI nodes.
-      // With getInBoundsGetElementPtr we create the GEP and then replace it
-      // inline into the PHI.
-      Constant *ConstGEP = ConstantExpr::getInBoundsGetElementPtr(
-          PooledStructType, GPool, Indices);
-      UserPHI->replaceUsesOfWith(GlobalToReplace, ConstGEP);
-      continue;
-    }
-    // The user is a valid instruction that is not a PHINode.
-    GetElementPtrInst *GEPInst =
-        GetElementPtrInst::Create(PooledStructType, GPool, Indices);
-    GEPInst->insertBefore(UserInstruction);
-
-    LLVM_DEBUG(dbgs() << "Inserting GEP before:\n");
-    LLVM_DEBUG(UserInstruction->dump());
-
+    Constant *ConstGEP = ConstantExpr::getInBoundsGetElementPtr(
+        PooledStructType, GPool, Indices);
     LLVM_DEBUG(dbgs() << "Replacing this global:\n");
     LLVM_DEBUG(GlobalToReplace->dump());
     LLVM_DEBUG(dbgs() << "with this:\n");
-    LLVM_DEBUG(GEPInst->dump());
-
-    // After the GEP is inserted the GV can be replaced.
-    CurrentUser->replaceUsesOfWith(GlobalToReplace, GEPInst);
+    LLVM_DEBUG(ConstGEP->dump());
+    GlobalToReplace->replaceAllUsesWith(ConstGEP);
   }
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
index a182be3ea712..d45edd74ab85 100644
--- a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -415,7 +415,7 @@ static bool hasPCRelativeForm(MachineInstr &Use) {
     bool runOnMachineFunction(MachineFunction &MF) override {
       // If the user wants to set the DSCR using command-line options,
       // load in the specified value at the start of main.
-      if (DSCRValue.getNumOccurrences() > 0 && MF.getName().equals("main") &&
+      if (DSCRValue.getNumOccurrences() > 0 && MF.getName() == "main" &&
           MF.getFunction().hasExternalLinkage()) {
         DSCRValue = (uint32_t)(DSCRValue & 0x01FFFFFF); // 25-bit DSCR mask
         RegScavenger RS;
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index d1722555f1fc..0628fbb26245 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -141,6 +141,11 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
                          "-data-sections.\n",
                          false);
   }
+
+  if (HasAIXShLibTLSModelOpt && (!TargetTriple.isOSAIX() || !IsPPC64))
+    report_fatal_error("The aix-shared-lib-tls-model-opt attribute "
+                       "is only supported on AIX in 64-bit mode.\n",
+                       false);
 }
 
 bool PPCSubtarget::enableMachineScheduler() const { return true; }
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 8ac79ddce595..6af1d5010d3a 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -84,6 +84,9 @@ class RISCVAsmParser : public MCTargetAsmParser {
   SMLoc getLoc() const { return getParser().getTok().getLoc(); }
   bool isRV64() const { return getSTI().hasFeature(RISCV::Feature64Bit); }
   bool isRVE() const { return getSTI().hasFeature(RISCV::FeatureStdExtE); }
+  bool enableExperimentalExtension() const {
+    return getSTI().hasFeature(RISCV::Experimental);
+  }
 
   RISCVTargetStreamer &getTargetStreamer() {
     assert(getParser().getStreamer().getTargetStreamer() &&
@@ -2824,17 +2827,19 @@ bool RISCVAsmParser::parseDirectiveOption() {
         break;
       }
 
-      auto Ext = llvm::lower_bound(RISCVFeatureKV, Arch);
-      if (Ext == std::end(RISCVFeatureKV) || StringRef(Ext->Key) != Arch ||
-          !RISCVISAInfo::isSupportedExtension(Arch)) {
-        if (isDigit(Arch.back()))
-          return Error(
-              Loc,
-              "Extension version number parsing not currently implemented");
+      if (isDigit(Arch.back()))
+        return Error(
+            Loc, "Extension version number parsing not currently implemented");
+
+      std::string Feature = RISCVISAInfo::getTargetFeatureForExtension(Arch);
+      if (!enableExperimentalExtension() &&
+          StringRef(Feature).starts_with("experimental-"))
+        return Error(Loc, "Unexpected experimental extensions.");
+      auto Ext = llvm::lower_bound(RISCVFeatureKV, Feature);
+      if (Ext == std::end(RISCVFeatureKV) || StringRef(Ext->Key) != Feature)
         return Error(Loc, "unknown extension feature");
-      }
 
-      Args.emplace_back(Type, Ext->Key);
+      Args.emplace_back(Type, Arch.str());
 
       if (Type == RISCVOptionArchArgType::Plus) {
         FeatureBitset OldFeatureBits = STI->getFeatureBits();
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index 3103992a86c0..791d364655e5 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -177,6 +177,20 @@ RISCVInstructionSelector::selectShiftMask(MachineOperand &Root) const {
 
   APInt AndMask;
   Register AndSrcReg;
+  // Try to combine the following pattern (applicable to other shift
+  // instructions as well as 32-bit ones):
+  //
+  //   %4:gprb(s64) = G_AND %3, %2
+  //   %5:gprb(s64) = G_LSHR %1, %4(s64)
+  //
+  // According to RISC-V's ISA manual, SLL, SRL, and SRA ignore other bits than
+  // the lowest log2(XLEN) bits of register rs2. As for the above pattern, if
+  // the lowest log2(XLEN) bits of register rd and rs2 of G_AND are the same,
+  // then it can be eliminated. Given register rs1 or rs2 holding a constant
+  // (the and mask), there are two cases G_AND can be erased:
+  //
+  // 1. the lowest log2(XLEN) bits of the and mask are all set
+  // 2. the bits of the register being masked are already unset (zero set)
   if (mi_match(ShAmtReg, MRI, m_GAnd(m_Reg(AndSrcReg), m_ICst(AndMask)))) {
     APInt ShMask(AndMask.getBitWidth(), ShiftWidth - 1);
     if (ShMask.isSubsetOf(AndMask)) {
@@ -184,7 +198,7 @@ RISCVInstructionSelector::selectShiftMask(MachineOperand &Root) const {
     } else {
       // SimplifyDemandedBits may have optimized the mask so try restoring any
       // bits that are known zero.
-      KnownBits Known = KB->getKnownBits(ShAmtReg);
+      KnownBits Known = KB->getKnownBits(AndSrcReg);
       if (ShMask.isSubsetOf(AndMask | Known.Zero))
         ShAmtReg = AndSrcReg;
     }
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
index cc534f29685f..686c8d89a732 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
@@ -117,7 +117,9 @@ RISCVRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
   case RISCV::GPRNoX0RegClassID:
   case RISCV::GPRNoX0X2RegClassID:
   case RISCV::GPRJALRRegClassID:
+  case RISCV::GPRJALRNonX7RegClassID:
   case RISCV::GPRTCRegClassID:
+  case RISCV::GPRTCNonX7RegClassID:
   case RISCV::GPRC_and_GPRTCRegClassID:
   case RISCV::GPRCRegClassID:
   case RISCV::GPRC_and_SR07RegClassID:
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 5ea386c3c32a..0863345b0c6d 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -126,6 +126,10 @@ void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI,
   if (MI.getOpcode() == RISCV::PseudoTAIL) {
     Func = MI.getOperand(0);
     Ra = RISCV::X6;
+    // For Zicfilp, PseudoTAIL should be expanded to a software guarded branch.
+    // It means to use t2(x7) as rs1 of JALR to expand PseudoTAIL.
+    if (STI.hasFeature(RISCV::FeatureStdExtZicfilp))
+      Ra = RISCV::X7;
   } else if (MI.getOpcode() == RISCV::PseudoCALLReg) {
     Func = MI.getOperand(1);
     Ra = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
index c3bae152993e..0a304d4cb7d9 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
@@ -310,56 +310,45 @@ InstSeq generateInstSeq(int64_t Val, const MCSubtargetInfo &STI) {
     }
   }
 
-  // Perform optimization with BCLRI/BSETI in the Zbs extension.
+  // Perform optimization with BSETI in the Zbs extension.
   if (Res.size() > 2 && STI.hasFeature(RISCV::FeatureStdExtZbs)) {
-    // 1. For values in range 0xffffffff 7fffffff ~ 0xffffffff 00000000,
-    //    call generateInstSeqImpl with Val|0x80000000 (which is expected be
-    //    an int32), then emit (BCLRI r, 31).
-    // 2. For values in range 0x80000000 ~ 0xffffffff, call generateInstSeqImpl
-    //    with Val&~0x80000000 (which is expected to be an int32), then
-    //    emit (BSETI r, 31).
-    int64_t NewVal;
-    unsigned Opc;
-    if (Val < 0) {
-      Opc = RISCV::BCLRI;
-      NewVal = Val | 0x80000000ll;
-    } else {
-      Opc = RISCV::BSETI;
-      NewVal = Val & ~0x80000000ll;
-    }
-    if (isInt<32>(NewVal)) {
-      RISCVMatInt::InstSeq TmpSeq;
-      generateInstSeqImpl(NewVal, STI, TmpSeq);
-      if ((TmpSeq.size() + 1) < Res.size()) {
-        TmpSeq.emplace_back(Opc, 31);
-        Res = TmpSeq;
-      }
+    // Create a simm32 value for LUI+ADDIW by forcing the upper 33 bits to zero.
+    // Xor that with original value to get which bits should be set by BSETI.
+    uint64_t Lo = Val & 0x7fffffff;
+    uint64_t Hi = Val ^ Lo;
+    assert(Hi != 0);
+    RISCVMatInt::InstSeq TmpSeq;
+
+    if (Lo != 0)
+      generateInstSeqImpl(Lo, STI, TmpSeq);
+
+    if (TmpSeq.size() + llvm::popcount(Hi) < Res.size()) {
+      do {
+        TmpSeq.emplace_back(RISCV::BSETI, llvm::countr_zero(Hi));
+        Hi &= (Hi - 1); // Clear lowest set bit.
+      } while (Hi != 0);
+      Res = TmpSeq;
     }
+  }
+
+  // Perform optimization with BCLRI in the Zbs extension.
+  if (Res.size() > 2 && STI.hasFeature(RISCV::FeatureStdExtZbs)) {
+    // Create a simm32 value for LUI+ADDIW by forcing the upper 33 bits to one.
+    // Xor that with original value to get which bits should be cleared by
+    // BCLRI.
+    uint64_t Lo = Val | 0xffffffff80000000;
+    uint64_t Hi = Val ^ Lo;
+    assert(Hi != 0);
 
-    // Try to use BCLRI for upper 32 bits if the original lower 32 bits are
-    // negative int32, or use BSETI for upper 32 bits if the original lower
-    // 32 bits are positive int32.
-    int32_t Lo = Lo_32(Val);
-    uint32_t Hi = Hi_32(Val);
-    Opc = 0;
     RISCVMatInt::InstSeq TmpSeq;
     generateInstSeqImpl(Lo, STI, TmpSeq);
-    // Check if it is profitable to use BCLRI/BSETI.
-    if (Lo > 0 && TmpSeq.size() + llvm::popcount(Hi) < Res.size()) {
-      Opc = RISCV::BSETI;
-    } else if (Lo < 0 && TmpSeq.size() + llvm::popcount(~Hi) < Res.size()) {
-      Opc = RISCV::BCLRI;
-      Hi = ~Hi;
-    }
-    // Search for each bit and build corresponding BCLRI/BSETI.
-    if (Opc > 0) {
-      while (Hi != 0) {
-        unsigned Bit = llvm::countr_zero(Hi);
-        TmpSeq.emplace_back(Opc, Bit + 32);
+
+    if (TmpSeq.size() + llvm::popcount(Hi) < Res.size()) {
+      do {
+        TmpSeq.emplace_back(RISCV::BCLRI, llvm::countr_zero(Hi));
         Hi &= (Hi - 1); // Clear lowest set bit.
-      }
-      if (TmpSeq.size() < Res.size())
-        Res = TmpSeq;
+      } while (Hi != 0);
+      Res = TmpSeq;
     }
   }
 
diff --git a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp
index df607236f7d5..5e6b7891449f 100644
--- a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp
+++ b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp
@@ -14,6 +14,9 @@
 #include "RISCVInstrInfo.h"
 #include "RISCVSubtarget.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveDebugVariables.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveStacks.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
@@ -32,6 +35,12 @@ public:
   bool runOnMachineFunction(MachineFunction &MF) override;
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveDebugVariables>();
+    AU.addPreserved<LiveStacks>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -51,9 +60,9 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
 
-  const MachineRegisterInfo *MRI = &MF.getRegInfo();
   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
   LLVM_DEBUG(dbgs() << "***** RISCVDeadRegisterDefinitions *****\n");
 
   bool MadeChange = false;
@@ -77,10 +86,8 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
           LLVM_DEBUG(dbgs() << "    Ignoring, def is tied operand.\n");
           continue;
         }
-        // We should not have any relevant physreg defs that are replacable by
-        // zero before register allocation. So we just check for dead vreg defs.
         Register Reg = MO.getReg();
-        if (!Reg.isVirtual() || (!MO.isDead() && !MRI->use_nodbg_empty(Reg)))
+        if (!Reg.isVirtual() || !MO.isDead())
           continue;
         LLVM_DEBUG(dbgs() << "    Dead def operand #" << I << " in:\n      ";
                    MI.print(dbgs()));
@@ -89,8 +96,9 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
           LLVM_DEBUG(dbgs() << "    Ignoring, register is not a GPR.\n");
           continue;
         }
+        assert(LIS.hasInterval(Reg));
+        LIS.removeInterval(Reg);
         MO.setReg(RISCV::X0);
-        MO.setIsDead();
         LLVM_DEBUG(dbgs() << "    Replacing with zero register. New:\n      ";
                    MI.print(dbgs()));
         ++NumDeadDefsReplaced;
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index c3dc4ea53697..89e1214f469d 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -156,6 +156,8 @@ def FeatureStdExtZicfilp
 def HasStdExtZicfilp : Predicate<"Subtarget->hasStdExtZicfilp()">,
                        AssemblerPredicate<(all_of FeatureStdExtZicfilp),
                                           "'Zicfilp' (Landing pad)">;
+def NoStdExtZicfilp : Predicate<"!Subtarget->hasStdExtZicfilp()">,
+                                 AssemblerPredicate<(all_of (not FeatureStdExtZicfilp))>;
 
 def FeatureStdExtZicfiss
     : RISCVExperimentalExtension<"zicfiss", 0, 4,
@@ -865,6 +867,9 @@ def FeatureStdExtSscounterenw
                      "'Sscounterenw' (Support writeable scounteren enable "
                      "bit for any hpmcounter that is not read-only zero)">;
 
+def FeatureStdExtSmstateen
+    : RISCVExtension<"smstateen", 1, 0,
+                     "'Smstateen' (Machine-mode view of the state-enable extension)">;
 def FeatureStdExtSsstateen
     : RISCVExtension<"ssstateen", 1, 0,
                      "'Ssstateen' (Supervisor-mode view of the state-enable extension)">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index dc3ad5ac5908..3c4646b95715 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1416,6 +1416,19 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
           ReplaceNode(Node, SLLI);
           return;
         }
+
+        // If we have 32 bits in the mask, we can use SLLI_UW instead of SLLI.
+        if (C2 < Trailing && Leading + Trailing == 32 && OneUseOrZExtW &&
+            Subtarget->hasStdExtZba()) {
+          SDNode *SRLI = CurDAG->getMachineNode(
+              RISCV::SRLI, DL, VT, X,
+              CurDAG->getTargetConstant(Trailing - C2, DL, VT));
+          SDNode *SLLI_UW = CurDAG->getMachineNode(
+              RISCV::SLLI_UW, DL, VT, SDValue(SRLI, 0),
+              CurDAG->getTargetConstant(Trailing, DL, VT));
+          ReplaceNode(Node, SLLI_UW);
+          return;
+        }
       }
     }
 
@@ -3478,8 +3491,15 @@ static bool usesAllOnesMask(SDNode *N, unsigned MaskOpIdx) {
 }
 
 static bool isImplicitDef(SDValue V) {
-  return V.isMachineOpcode() &&
-         V.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF;
+  if (!V.isMachineOpcode())
+    return false;
+  if (V.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
+    for (unsigned I = 1; I < V.getNumOperands(); I += 2)
+      if (!isImplicitDef(V.getOperand(I)))
+        return false;
+    return true;
+  }
+  return V.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF;
 }
 
 // Optimize masked RVV pseudo instructions with a known all-ones mask to their
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 19ef1f2f18ec..d0f62b1d5414 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -713,7 +713,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         ISD::VP_FRINT,       ISD::VP_FNEARBYINT,  ISD::VP_IS_FPCLASS,
         ISD::VP_FMINIMUM,    ISD::VP_FMAXIMUM,    ISD::VP_LRINT,
         ISD::VP_LLRINT,      ISD::EXPERIMENTAL_VP_REVERSE,
-        ISD::EXPERIMENTAL_VP_SPLICE};
+        ISD::EXPERIMENTAL_VP_SPLICE, ISD::VP_REDUCE_FMINIMUM,
+        ISD::VP_REDUCE_FMAXIMUM};
 
     static const unsigned IntegerVecReduceOps[] = {
         ISD::VECREDUCE_ADD,  ISD::VECREDUCE_AND,  ISD::VECREDUCE_OR,
@@ -958,7 +959,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         ISD::VP_FFLOOR,      ISD::VP_FROUND,       ISD::VP_FROUNDEVEN,
         ISD::VP_FCOPYSIGN,   ISD::VP_FROUNDTOZERO, ISD::VP_FRINT,
         ISD::VP_FNEARBYINT,  ISD::VP_SETCC,        ISD::VP_FMINIMUM,
-        ISD::VP_FMAXIMUM};
+        ISD::VP_FMAXIMUM,    ISD::VP_REDUCE_FMINIMUM, ISD::VP_REDUCE_FMAXIMUM};
 
     // Sets common operation actions on RVV floating-point vector types.
     const auto SetCommonVFPActions = [&](MVT VT) {
@@ -1087,6 +1088,23 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       }
     }
 
+    // TODO: Could we merge some code with zvfhmin?
+    if (Subtarget.hasVInstructionsBF16()) {
+      for (MVT VT : BF16VecVTs) {
+        if (!isTypeLegal(VT))
+          continue;
+        setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom);
+        setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
+        setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT,
+                           Custom);
+        setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR,
+                            ISD::EXTRACT_SUBVECTOR},
+                           VT, Custom);
+        setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
+        // TODO: Promote to fp32.
+      }
+    }
+
     if (Subtarget.hasVInstructionsF32()) {
       for (MVT VT : F32VecVTs) {
         if (!isTypeLegal(VT))
@@ -1302,6 +1320,19 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
           continue;
         }
 
+        if (VT.getVectorElementType() == MVT::bf16) {
+          setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom);
+          setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
+          setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT,
+                             Custom);
+          setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR,
+                              ISD::EXTRACT_SUBVECTOR},
+                             VT, Custom);
+          setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
+          // TODO: Promote to fp32.
+          continue;
+        }
+
         // We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed.
         setOperationAction({ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR}, VT,
                            Custom);
@@ -1989,6 +2020,7 @@ bool RISCVTargetLowering::canSplatOperand(unsigned Opcode, int Operand) const {
   case Instruction::SDiv:
   case Instruction::URem:
   case Instruction::SRem:
+  case Instruction::Select:
     return Operand == 1;
   default:
     return false;
@@ -2561,6 +2593,10 @@ static bool useRVVForFixedLengthVectorVT(MVT VT,
     if (!Subtarget.hasVInstructionsF16Minimal())
       return false;
     break;
+  case MVT::bf16:
+    if (!Subtarget.hasVInstructionsBF16())
+      return false;
+    break;
   case MVT::f32:
     if (!Subtarget.hasVInstructionsF32())
       return false;
@@ -2612,6 +2648,7 @@ static MVT getContainerForFixedLengthVector(const TargetLowering &TLI, MVT VT,
   case MVT::i16:
   case MVT::i32:
   case MVT::i64:
+  case MVT::bf16:
   case MVT::f16:
   case MVT::f32:
   case MVT::f64: {
@@ -6625,6 +6662,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
   case ISD::VP_REDUCE_SEQ_FADD:
   case ISD::VP_REDUCE_FMIN:
   case ISD::VP_REDUCE_FMAX:
+  case ISD::VP_REDUCE_FMINIMUM:
+  case ISD::VP_REDUCE_FMAXIMUM:
     if (Op.getOperand(1).getValueType() == MVT::nxv32f16 &&
         (Subtarget.hasVInstructionsF16Minimal() &&
          !Subtarget.hasVInstructionsF16()))
@@ -8101,8 +8140,10 @@ RISCVTargetLowering::lowerStrictFPExtendOrRoundLike(SDValue Op,
 
   // RVV can only widen/truncate fp to types double/half the size as the source.
   if ((VT.getVectorElementType() == MVT::f64 &&
-       SrcVT.getVectorElementType() == MVT::f16) ||
-      (VT.getVectorElementType() == MVT::f16 &&
+       (SrcVT.getVectorElementType() == MVT::f16 ||
+        SrcVT.getVectorElementType() == MVT::bf16)) ||
+      ((VT.getVectorElementType() == MVT::f16 ||
+        VT.getVectorElementType() == MVT::bf16) &&
        SrcVT.getVectorElementType() == MVT::f64)) {
     // For double rounding, the intermediate rounding should be round-to-odd.
     unsigned InterConvOpc = Op.getOpcode() == ISD::STRICT_FP_EXTEND
@@ -8146,9 +8187,12 @@ RISCVTargetLowering::lowerVectorFPExtendOrRoundLike(SDValue Op,
   SDValue Src = Op.getOperand(0);
   MVT SrcVT = Src.getSimpleValueType();
 
-  bool IsDirectExtend = IsExtend && (VT.getVectorElementType() != MVT::f64 ||
-                                     SrcVT.getVectorElementType() != MVT::f16);
-  bool IsDirectTrunc = !IsExtend && (VT.getVectorElementType() != MVT::f16 ||
+  bool IsDirectExtend =
+      IsExtend && (VT.getVectorElementType() != MVT::f64 ||
+                   (SrcVT.getVectorElementType() != MVT::f16 &&
+                    SrcVT.getVectorElementType() != MVT::bf16));
+  bool IsDirectTrunc = !IsExtend && ((VT.getVectorElementType() != MVT::f16 &&
+                                      VT.getVectorElementType() != MVT::bf16) ||
                                      SrcVT.getVectorElementType() != MVT::f64);
 
   bool IsDirectConv = IsDirectExtend || IsDirectTrunc;
@@ -9485,8 +9529,10 @@ static unsigned getRVVReductionOp(unsigned ISDOpcode) {
   case ISD::VP_REDUCE_SEQ_FADD:
     return RISCVISD::VECREDUCE_SEQ_FADD_VL;
   case ISD::VP_REDUCE_FMAX:
+  case ISD::VP_REDUCE_FMAXIMUM:
     return RISCVISD::VECREDUCE_FMAX_VL;
   case ISD::VP_REDUCE_FMIN:
+  case ISD::VP_REDUCE_FMINIMUM:
     return RISCVISD::VECREDUCE_FMIN_VL;
   }
 
@@ -9745,8 +9791,11 @@ SDValue RISCVTargetLowering::lowerFPVECREDUCE(SDValue Op,
 SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDLoc DL(Op);
+  unsigned Opc = Op.getOpcode();
+  SDValue Start = Op.getOperand(0);
   SDValue Vec = Op.getOperand(1);
   EVT VecEVT = Vec.getValueType();
+  MVT XLenVT = Subtarget.getXLenVT();
 
   // TODO: The type may need to be widened rather than split. Or widened before
   // it can be split.
@@ -9754,7 +9803,7 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
     return SDValue();
 
   MVT VecVT = VecEVT.getSimpleVT();
-  unsigned RVVOpcode = getRVVReductionOp(Op.getOpcode());
+  unsigned RVVOpcode = getRVVReductionOp(Opc);
 
   if (VecVT.isFixedLengthVector()) {
     auto ContainerVT = getContainerForFixedLengthVector(VecVT);
@@ -9763,8 +9812,30 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
 
   SDValue VL = Op.getOperand(3);
   SDValue Mask = Op.getOperand(2);
-  return lowerReductionSeq(RVVOpcode, Op.getSimpleValueType(), Op.getOperand(0),
-                           Vec, Mask, VL, DL, DAG, Subtarget);
+  SDValue Res =
+      lowerReductionSeq(RVVOpcode, Op.getSimpleValueType(), Op.getOperand(0),
+                        Vec, Mask, VL, DL, DAG, Subtarget);
+  if ((Opc != ISD::VP_REDUCE_FMINIMUM && Opc != ISD::VP_REDUCE_FMAXIMUM) ||
+      Op->getFlags().hasNoNaNs())
+    return Res;
+
+  // Propagate NaNs.
+  MVT PredVT = getMaskTypeFor(Vec.getSimpleValueType());
+  // Check if any of the elements in Vec is NaN.
+  SDValue IsNaN = DAG.getNode(
+      RISCVISD::SETCC_VL, DL, PredVT,
+      {Vec, Vec, DAG.getCondCode(ISD::SETNE), DAG.getUNDEF(PredVT), Mask, VL});
+  SDValue VCPop = DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, IsNaN, Mask, VL);
+  // Check if the start value is NaN.
+  SDValue StartIsNaN = DAG.getSetCC(DL, XLenVT, Start, Start, ISD::SETUO);
+  VCPop = DAG.getNode(ISD::OR, DL, XLenVT, VCPop, StartIsNaN);
+  SDValue NoNaNs = DAG.getSetCC(DL, XLenVT, VCPop,
+                                DAG.getConstant(0, DL, XLenVT), ISD::SETEQ);
+  MVT ResVT = Res.getSimpleValueType();
+  return DAG.getSelect(
+      DL, ResVT, NoNaNs, Res,
+      DAG.getConstantFP(APFloat::getNaN(DAG.EVTToAPFloatSemantics(ResVT)), DL,
+                        ResVT));
 }
 
 SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
@@ -13525,10 +13596,27 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
     if (MulAmt % Divisor != 0)
       continue;
     uint64_t MulAmt2 = MulAmt / Divisor;
-    // 3/5/9 * 2^N -> shXadd (sll X, C), (sll X, C)
-    // Matched in tablegen, avoid perturbing patterns.
-    if (isPowerOf2_64(MulAmt2))
-      return SDValue();
+    // 3/5/9 * 2^N ->  shl (shXadd X, X), N
+    if (isPowerOf2_64(MulAmt2)) {
+      SDLoc DL(N);
+      SDValue X = N->getOperand(0);
+      // Put the shift first if we can fold a zext into the
+      // shift forming a slli.uw.
+      if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) &&
+          X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) {
+        SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X,
+                                  DAG.getConstant(Log2_64(MulAmt2), DL, VT));
+        return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl,
+                           DAG.getConstant(Log2_64(Divisor - 1), DL, VT), Shl);
+      }
+      // Otherwise, put rhe shl second so that it can fold with following
+      // instructions (e.g. sext or add).
+      SDValue Mul359 =
+          DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+                      DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
+      return DAG.getNode(ISD::SHL, DL, VT, Mul359,
+                         DAG.getConstant(Log2_64(MulAmt2), DL, VT));
+    }
 
     // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
     if (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9) {
@@ -17623,6 +17711,7 @@ static bool isSelectPseudo(MachineInstr &MI) {
   default:
     return false;
   case RISCV::Select_GPR_Using_CC_GPR:
+  case RISCV::Select_GPR_Using_CC_Imm:
   case RISCV::Select_FPR16_Using_CC_GPR:
   case RISCV::Select_FPR16INX_Using_CC_GPR:
   case RISCV::Select_FPR32_Using_CC_GPR:
@@ -17806,7 +17895,9 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
   // is checked here and handled by a separate function -
   // EmitLoweredCascadedSelect.
   Register LHS = MI.getOperand(1).getReg();
-  Register RHS = MI.getOperand(2).getReg();
+  Register RHS;
+  if (MI.getOperand(2).isReg())
+    RHS = MI.getOperand(2).getReg();
   auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
 
   SmallVector<MachineInstr *, 4> SelectDebugValues;
@@ -17815,8 +17906,9 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
 
   MachineInstr *LastSelectPseudo = &MI;
   auto Next = next_nodbg(MI.getIterator(), BB->instr_end());
-  if (MI.getOpcode() != RISCV::Select_GPR_Using_CC_GPR && Next != BB->end() &&
-      Next->getOpcode() == MI.getOpcode() &&
+  if ((MI.getOpcode() != RISCV::Select_GPR_Using_CC_GPR &&
+       MI.getOpcode() != RISCV::Select_GPR_Using_CC_Imm) &&
+      Next != BB->end() && Next->getOpcode() == MI.getOpcode() &&
       Next->getOperand(5).getReg() == MI.getOperand(0).getReg() &&
       Next->getOperand(5).isKill()) {
     return EmitLoweredCascadedSelect(MI, *Next, BB, Subtarget);
@@ -17828,6 +17920,7 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
       continue;
     if (isSelectPseudo(*SequenceMBBI)) {
       if (SequenceMBBI->getOperand(1).getReg() != LHS ||
+          !SequenceMBBI->getOperand(2).isReg() ||
           SequenceMBBI->getOperand(2).getReg() != RHS ||
           SequenceMBBI->getOperand(3).getImm() != CC ||
           SelectDests.count(SequenceMBBI->getOperand(4).getReg()) ||
@@ -17877,10 +17970,16 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
   HeadMBB->addSuccessor(TailMBB);
 
   // Insert appropriate branch.
-  BuildMI(HeadMBB, DL, TII.getBrCond(CC))
-    .addReg(LHS)
-    .addReg(RHS)
-    .addMBB(TailMBB);
+  if (MI.getOperand(2).isImm())
+    BuildMI(HeadMBB, DL, TII.getBrCond(CC, MI.getOperand(2).isImm()))
+        .addReg(LHS)
+        .addImm(MI.getOperand(2).getImm())
+        .addMBB(TailMBB);
+  else
+    BuildMI(HeadMBB, DL, TII.getBrCond(CC))
+        .addReg(LHS)
+        .addReg(RHS)
+        .addMBB(TailMBB);
 
   // IfFalseMBB just falls through to TailMBB.
   IfFalseMBB->addSuccessor(TailMBB);
@@ -18126,6 +18225,7 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
            "ReadCounterWide is only to be used on riscv32");
     return emitReadCounterWidePseudo(MI, BB);
   case RISCV::Select_GPR_Using_CC_GPR:
+  case RISCV::Select_GPR_Using_CC_Imm:
   case RISCV::Select_FPR16_Using_CC_GPR:
   case RISCV::Select_FPR16INX_Using_CC_GPR:
   case RISCV::Select_FPR32_Using_CC_GPR:
@@ -20952,6 +21052,12 @@ Value *RISCVTargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
   if (Subtarget.isTargetFuchsia())
     return useTpOffset(IRB, -0x10);
 
+  // Android provides a fixed TLS slot for the stack cookie. See the definition
+  // of TLS_SLOT_STACK_GUARD in
+  // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
+  if (Subtarget.isTargetAndroid())
+    return useTpOffset(IRB, -0x18);
+
   return TargetLowering::getIRStackGuard(IRB);
 }
 
@@ -20979,6 +21085,11 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType(
       return false;
 
     ContainerVT = getContainerForFixedLengthVector(VT.getSimpleVT());
+  } else {
+    // The intrinsics for scalable vectors are not overloaded on pointer type
+    // and can only handle the default address space.
+    if (AddrSpace)
+      return false;
   }
 
   // Need to make sure that EMUL * NFIELDS ≤ 8
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 216dc7808520..7a8ff84995ea 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -173,8 +173,7 @@ static bool isMaskRegOp(const MachineInstr &MI) {
 /// Note that this is different from "agnostic" as defined by the vector
 /// specification.  Agnostic requires each lane to either be undisturbed, or
 /// take the value -1; no other value is allowed.
-static bool hasUndefinedMergeOp(const MachineInstr &MI,
-                                const MachineRegisterInfo &MRI) {
+static bool hasUndefinedMergeOp(const MachineInstr &MI) {
 
   unsigned UseOpIdx;
   if (!MI.isRegTiedToUseOperand(0, &UseOpIdx))
@@ -182,32 +181,10 @@ static bool hasUndefinedMergeOp(const MachineInstr &MI,
     // lanes are undefined.
     return true;
 
-  // If the tied operand is NoReg, an IMPLICIT_DEF, or a REG_SEQEUENCE whose
-  // operands are solely IMPLICIT_DEFS, then the pass through lanes are
-  // undefined.
+  // All undefined passthrus should be $noreg: see
+  // RISCVDAGToDAGISel::doPeepholeNoRegPassThru
   const MachineOperand &UseMO = MI.getOperand(UseOpIdx);
-  if (UseMO.getReg() == RISCV::NoRegister)
-    return true;
-
-  if (UseMO.isUndef())
-    return true;
-  if (UseMO.getReg().isPhysical())
-    return false;
-
-  if (MachineInstr *UseMI = MRI.getVRegDef(UseMO.getReg())) {
-    if (UseMI->isImplicitDef())
-      return true;
-
-    if (UseMI->isRegSequence()) {
-      for (unsigned i = 1, e = UseMI->getNumOperands(); i < e; i += 2) {
-        MachineInstr *SourceMI = MRI.getVRegDef(UseMI->getOperand(i).getReg());
-        if (!SourceMI || !SourceMI->isImplicitDef())
-          return false;
-      }
-      return true;
-    }
-  }
-  return false;
+  return UseMO.getReg() == RISCV::NoRegister || UseMO.isUndef();
 }
 
 /// Which subfields of VL or VTYPE have values we need to preserve?
@@ -360,9 +337,7 @@ static bool areCompatibleVTYPEs(uint64_t CurVType, uint64_t NewVType,
 }
 
 /// Return the fields and properties demanded by the provided instruction.
-DemandedFields getDemanded(const MachineInstr &MI,
-                           const MachineRegisterInfo *MRI,
-                           const RISCVSubtarget *ST) {
+DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
   // Warning: This function has to work on both the lowered (i.e. post
   // emitVSETVLIs) and pre-lowering forms.  The main implication of this is
   // that it can't use the value of a SEW, VL, or Policy operand as they might
@@ -426,7 +401,7 @@ DemandedFields getDemanded(const MachineInstr &MI,
     // this for any tail agnostic operation, but we can't as TA requires
     // tail lanes to either be the original value or -1.  We are writing
     // unknown bits to the lanes here.
-    if (hasUndefinedMergeOp(MI, *MRI)) {
+    if (hasUndefinedMergeOp(MI)) {
       if (isFloatScalarMoveOrScalarSplatInstr(MI) && !ST->hasVInstructionsF64())
         Res.SEW = DemandedFields::SEWGreaterThanOrEqualAndLessThan64;
       else
@@ -886,7 +861,7 @@ static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI,
     if (AVLReg == RISCV::X0)
       NewInfo.setAVLVLMAX();
     else
-      NewInfo.setAVLRegDef(MRI.getVRegDef(AVLReg), AVLReg);
+      NewInfo.setAVLRegDef(MRI.getUniqueVRegDef(AVLReg), AVLReg);
   }
   NewInfo.setVTYPE(MI.getOperand(2).getImm());
 
@@ -910,7 +885,7 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
 
   bool TailAgnostic = true;
   bool MaskAgnostic = true;
-  if (!hasUndefinedMergeOp(MI, *MRI)) {
+  if (!hasUndefinedMergeOp(MI)) {
     // Start with undisturbed.
     TailAgnostic = false;
     MaskAgnostic = false;
@@ -958,7 +933,8 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
       else
         InstrInfo.setAVLImm(Imm);
     } else {
-      InstrInfo.setAVLRegDef(MRI->getVRegDef(VLOp.getReg()), VLOp.getReg());
+      InstrInfo.setAVLRegDef(MRI->getUniqueVRegDef(VLOp.getReg()),
+                             VLOp.getReg());
     }
   } else {
     assert(isScalarExtractInstr(MI));
@@ -1094,7 +1070,7 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
   if (!CurInfo.isValid() || CurInfo.isUnknown() || CurInfo.hasSEWLMULRatioOnly())
     return true;
 
-  DemandedFields Used = getDemanded(MI, MRI, ST);
+  DemandedFields Used = getDemanded(MI, ST);
 
   // A slidedown/slideup with an *undefined* merge op can freely clobber
   // elements not copied from the source vector (e.g. masked off, tail, or
@@ -1105,7 +1081,7 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
   // * The LMUL1 restriction is for machines whose latency may depend on VL.
   // * As above, this is only legal for tail "undefined" not "agnostic".
   if (isVSlideInstr(MI) && Require.hasAVLImm() && Require.getAVLImm() == 1 &&
-      isLMUL1OrSmaller(CurInfo.getVLMUL()) && hasUndefinedMergeOp(MI, *MRI)) {
+      isLMUL1OrSmaller(CurInfo.getVLMUL()) && hasUndefinedMergeOp(MI)) {
     Used.VLAny = false;
     Used.VLZeroness = true;
     Used.LMUL = false;
@@ -1117,8 +1093,9 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
   // immediate form of vmv.s.x, and thus frequently use vmv.v.i in it's place.
   // Since a splat is non-constant time in LMUL, we do need to be careful to not
   // increase the number of active vector registers (unlike for vmv.s.x.)
-  if (isScalarSplatInstr(MI) && Require.hasAVLImm() && Require.getAVLImm() == 1 &&
-      isLMUL1OrSmaller(CurInfo.getVLMUL()) && hasUndefinedMergeOp(MI, *MRI)) {
+  if (isScalarSplatInstr(MI) && Require.hasAVLImm() &&
+      Require.getAVLImm() == 1 && isLMUL1OrSmaller(CurInfo.getVLMUL()) &&
+      hasUndefinedMergeOp(MI)) {
     Used.LMUL = false;
     Used.SEWLMULRatio = false;
     Used.VLAny = false;
@@ -1184,7 +1161,7 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
   if (!Info.isValid() || Info.isUnknown())
     Info = NewInfo;
 
-  DemandedFields Demanded = getDemanded(MI, MRI, ST);
+  DemandedFields Demanded = getDemanded(MI, ST);
   const VSETVLIInfo IncomingInfo = adjustIncoming(PrevInfo, NewInfo, Demanded);
 
   // If MI only demands that VL has the same zeroness, we only need to set the
@@ -1231,7 +1208,7 @@ void RISCVInsertVSETVLI::transferAfter(VSETVLIInfo &Info,
 
   if (RISCV::isFaultFirstLoad(MI)) {
     // Update AVL to vl-output of the fault first load.
-    Info.setAVLRegDef(MRI->getVRegDef(MI.getOperand(1).getReg()),
+    Info.setAVLRegDef(MRI->getUniqueVRegDef(MI.getOperand(1).getReg()),
                       MI.getOperand(1).getReg());
     return;
   }
@@ -1338,8 +1315,9 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
     const VSETVLIInfo &PBBExit = BlockInfo[PBB->getNumber()].Exit;
 
     // We need the PHI input to the be the output of a VSET(I)VLI.
-    MachineInstr *DefMI = MRI->getVRegDef(InReg);
-    if (!DefMI || !isVectorConfigInstr(*DefMI))
+    MachineInstr *DefMI = MRI->getUniqueVRegDef(InReg);
+    assert(DefMI);
+    if (!isVectorConfigInstr(*DefMI))
       return true;
 
     // We found a VSET(I)VLI make sure it matches the output of the
@@ -1399,7 +1377,8 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
         MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI));
         if (VLOp.isReg()) {
           Register Reg = VLOp.getReg();
-          MachineInstr *VLOpDef = MRI->getVRegDef(Reg);
+          MachineInstr *VLOpDef = MRI->getUniqueVRegDef(Reg);
+          assert(VLOpDef);
 
           // Erase the AVL operand from the instruction.
           VLOp.setReg(RISCV::NoRegister);
@@ -1409,8 +1388,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
           // as an ADDI. However, the ADDI might not have been used in the
           // vsetvli, or a vsetvli might not have been emitted, so it may be
           // dead now.
-          if (VLOpDef && TII->isAddImmediate(*VLOpDef, Reg) &&
-              MRI->use_nodbg_empty(Reg))
+          if (TII->isAddImmediate(*VLOpDef, Reg) && MRI->use_nodbg_empty(Reg))
             VLOpDef->eraseFromParent();
         }
         MI.addOperand(MachineOperand::CreateReg(RISCV::VL, /*isDef*/ false,
@@ -1592,7 +1570,7 @@ bool RISCVCoalesceVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) {
   for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
 
     if (!isVectorConfigInstr(MI)) {
-      Used.doUnion(getDemanded(MI, MRI, ST));
+      Used.doUnion(getDemanded(MI, ST));
       if (MI.isCall() || MI.isInlineAsm() ||
           MI.modifiesRegister(RISCV::VL, /*TRI=*/nullptr) ||
           MI.modifiesRegister(RISCV::VTYPE, /*TRI=*/nullptr))
@@ -1600,9 +1578,7 @@ bool RISCVCoalesceVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) {
       continue;
     }
 
-    Register RegDef = MI.getOperand(0).getReg();
-    assert(RegDef == RISCV::X0 || RegDef.isVirtual());
-    if (RegDef != RISCV::X0 && !MRI->use_nodbg_empty(RegDef))
+    if (!MI.getOperand(0).isDead())
       Used.demandVL();
 
     if (NextMI) {
@@ -1667,7 +1643,7 @@ bool RISCVCoalesceVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) {
       }
     }
     NextMI = &MI;
-    Used = getDemanded(MI, MRI, ST);
+    Used = getDemanded(MI, ST);
   }
 
   NumCoalescedVSETVL += ToDelete.size();
@@ -1684,6 +1660,7 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
     MachineInstr &MI = *I++;
     if (RISCV::isFaultFirstLoad(MI)) {
       Register VLOutput = MI.getOperand(1).getReg();
+      assert(VLOutput.isVirtual());
       if (!MRI->use_nodbg_empty(VLOutput))
         BuildMI(MBB, I, MI.getDebugLoc(), TII->get(RISCV::PseudoReadVL),
                 VLOutput);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 8cb9a40a98bc..444b9076005c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -833,6 +833,10 @@ static RISCVCC::CondCode getCondFromBranchOpc(unsigned Opc) {
   switch (Opc) {
   default:
     return RISCVCC::COND_INVALID;
+  case RISCV::CV_BEQIMM:
+    return RISCVCC::COND_EQ;
+  case RISCV::CV_BNEIMM:
+    return RISCVCC::COND_NE;
   case RISCV::BEQ:
     return RISCVCC::COND_EQ;
   case RISCV::BNE:
@@ -863,14 +867,14 @@ static void parseCondBranch(MachineInstr &LastInst, MachineBasicBlock *&Target,
   Cond.push_back(LastInst.getOperand(1));
 }
 
-unsigned RISCVCC::getBrCond(RISCVCC::CondCode CC) {
+unsigned RISCVCC::getBrCond(RISCVCC::CondCode CC, bool Imm) {
   switch (CC) {
   default:
     llvm_unreachable("Unknown condition code!");
   case RISCVCC::COND_EQ:
-    return RISCV::BEQ;
+    return Imm ? RISCV::CV_BEQIMM : RISCV::BEQ;
   case RISCVCC::COND_NE:
-    return RISCV::BNE;
+    return Imm ? RISCV::CV_BNEIMM : RISCV::BNE;
   case RISCVCC::COND_LT:
     return RISCV::BLT;
   case RISCVCC::COND_GE:
@@ -882,8 +886,9 @@ unsigned RISCVCC::getBrCond(RISCVCC::CondCode CC) {
   }
 }
 
-const MCInstrDesc &RISCVInstrInfo::getBrCond(RISCVCC::CondCode CC) const {
-  return get(RISCVCC::getBrCond(CC));
+const MCInstrDesc &RISCVInstrInfo::getBrCond(RISCVCC::CondCode CC,
+                                             bool Imm) const {
+  return get(RISCVCC::getBrCond(CC, Imm));
 }
 
 RISCVCC::CondCode RISCVCC::getOppositeBranchCondition(RISCVCC::CondCode CC) {
@@ -1032,8 +1037,10 @@ unsigned RISCVInstrInfo::insertBranch(
 
   // Either a one or two-way conditional branch.
   auto CC = static_cast<RISCVCC::CondCode>(Cond[0].getImm());
-  MachineInstr &CondMI =
-      *BuildMI(&MBB, DL, getBrCond(CC)).add(Cond[1]).add(Cond[2]).addMBB(TBB);
+  MachineInstr &CondMI = *BuildMI(&MBB, DL, getBrCond(CC, Cond[2].isImm()))
+                              .add(Cond[1])
+                              .add(Cond[2])
+                              .addMBB(TBB);
   if (BytesAdded)
     *BytesAdded += getInstSizeInBytes(CondMI);
 
@@ -1257,6 +1264,8 @@ bool RISCVInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
   case RISCV::BGE:
   case RISCV::BLTU:
   case RISCV::BGEU:
+  case RISCV::CV_BEQIMM:
+  case RISCV::CV_BNEIMM:
     return isIntN(13, BrOffset);
   case RISCV::JAL:
   case RISCV::PseudoBR:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 170f813eb10d..e069717aaef2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -45,7 +45,7 @@ enum CondCode {
 };
 
 CondCode getOppositeBranchCondition(CondCode);
-unsigned getBrCond(CondCode CC);
+unsigned getBrCond(CondCode CC, bool Imm = false);
 
 } // end of namespace RISCVCC
 
@@ -65,7 +65,7 @@ public:
   explicit RISCVInstrInfo(RISCVSubtarget &STI);
 
   MCInst getNop() const override;
-  const MCInstrDesc &getBrCond(RISCVCC::CondCode CC) const;
+  const MCInstrDesc &getBrCond(RISCVCC::CondCode CC, bool Imm = false) const;
 
   Register isLoadFromStackSlot(const MachineInstr &MI,
                                int &FrameIndex) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index da4020758eb6..b867eccf4266 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1448,13 +1448,29 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1 in
 def PseudoBR : Pseudo<(outs), (ins simm21_lsb0_jal:$imm20), [(br bb:$imm20)]>,
                PseudoInstExpansion<(JAL X0, simm21_lsb0_jal:$imm20)>;
 
-let isBarrier = 1, isBranch = 1, isIndirectBranch = 1, isTerminator = 1 in
+let Predicates = [NoStdExtZicfilp],
+    isBarrier = 1, isBranch = 1, isIndirectBranch = 1, isTerminator = 1 in
 def PseudoBRIND : Pseudo<(outs), (ins GPRJALR:$rs1, simm12:$imm12), []>,
                   PseudoInstExpansion<(JALR X0, GPR:$rs1, simm12:$imm12)>;
 
+let Predicates = [HasStdExtZicfilp],
+    isBarrier = 1, isBranch = 1, isIndirectBranch = 1, isTerminator = 1 in
+def PseudoBRINDNonX7 : Pseudo<(outs), (ins GPRJALRNonX7:$rs1, simm12:$imm12), []>,
+                       PseudoInstExpansion<(JALR X0, GPR:$rs1, simm12:$imm12)>;
+
+// For Zicfilp, need to avoid using X7/T2 for indirect branches which need
+// landing pad.
+let Predicates = [HasStdExtZicfilp] in {
+def : Pat<(brind GPRJALRNonX7:$rs1), (PseudoBRINDNonX7 GPRJALRNonX7:$rs1, 0)>;
+def : Pat<(brind (add GPRJALRNonX7:$rs1, simm12:$imm12)),
+          (PseudoBRINDNonX7 GPRJALRNonX7:$rs1, simm12:$imm12)>;
+}
+
+let Predicates = [NoStdExtZicfilp] in {
 def : Pat<(brind GPRJALR:$rs1), (PseudoBRIND GPRJALR:$rs1, 0)>;
 def : Pat<(brind (add GPRJALR:$rs1, simm12:$imm12)),
           (PseudoBRIND GPRJALR:$rs1, simm12:$imm12)>;
+}
 
 // PseudoCALLReg is a generic pseudo instruction for calls which will eventually
 // expand to auipc and jalr while encoding, with any given register used as the
@@ -1484,10 +1500,16 @@ def : Pat<(riscv_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
 def : Pat<(riscv_sret_glue), (SRET (XLenVT X0), (XLenVT X0))>;
 def : Pat<(riscv_mret_glue), (MRET (XLenVT X0), (XLenVT X0))>;
 
-let isCall = 1, Defs = [X1] in
+let isCall = 1, Defs = [X1] in {
+let Predicates = [NoStdExtZicfilp] in
 def PseudoCALLIndirect : Pseudo<(outs), (ins GPRJALR:$rs1),
                                 [(riscv_call GPRJALR:$rs1)]>,
                          PseudoInstExpansion<(JALR X1, GPR:$rs1, 0)>;
+let Predicates = [HasStdExtZicfilp] in
+def PseudoCALLIndirectNonX7 : Pseudo<(outs), (ins GPRJALRNonX7:$rs1),
+                                     [(riscv_call GPRJALRNonX7:$rs1)]>,
+                              PseudoInstExpansion<(JALR X1, GPR:$rs1, 0)>;
+}
 
 let isBarrier = 1, isReturn = 1, isTerminator = 1 in
 def PseudoRET : Pseudo<(outs), (ins), [(riscv_ret_glue)]>,
@@ -1502,10 +1524,16 @@ def PseudoTAIL : Pseudo<(outs), (ins call_symbol:$dst), [],
                         "tail", "$dst">,
                  Sched<[WriteIALU, WriteJalr, ReadJalr]>;
 
-let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2] in
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2] in {
+let Predicates = [NoStdExtZicfilp] in
 def PseudoTAILIndirect : Pseudo<(outs), (ins GPRTC:$rs1),
                                 [(riscv_tail GPRTC:$rs1)]>,
                          PseudoInstExpansion<(JALR X0, GPR:$rs1, 0)>;
+let Predicates = [HasStdExtZicfilp] in
+def PseudoTAILIndirectNonX7 : Pseudo<(outs), (ins GPRTCNonX7:$rs1),
+                                     [(riscv_tail GPRTCNonX7:$rs1)]>,
+                              PseudoInstExpansion<(JALR X0, GPR:$rs1, 0)>;
+}
 
 def : Pat<(riscv_tail (iPTR tglobaladdr:$dst)),
           (PseudoTAIL tglobaladdr:$dst)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 22e548861784..4adc26f62891 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -355,24 +355,24 @@ defset list<VTypeInfo> AllVectors = {
                                  V_M8, f64, FPR64>;
     }
   }
-}
 
-defset list<VTypeInfo> AllBFloatVectors = {
-  defset list<VTypeInfo> NoGroupBFloatVectors = {
-    defset list<VTypeInfo> FractionalGroupBFloatVectors = {
-      def VBF16MF4: VTypeInfo<vbfloat16mf4_t, vbool64_t, 16, V_MF4, bf16, FPR16>;
-      def VBF16MF2: VTypeInfo<vbfloat16mf2_t, vbool32_t, 16, V_MF2, bf16, FPR16>;
+  defset list<VTypeInfo> AllBFloatVectors = {
+    defset list<VTypeInfo> NoGroupBFloatVectors = {
+      defset list<VTypeInfo> FractionalGroupBFloatVectors = {
+        def VBF16MF4: VTypeInfo<vbfloat16mf4_t, vbool64_t, 16, V_MF4, bf16, FPR16>;
+        def VBF16MF2: VTypeInfo<vbfloat16mf2_t, vbool32_t, 16, V_MF2, bf16, FPR16>;
+      }
+      def VBF16M1:  VTypeInfo<vbfloat16m1_t, vbool16_t, 16, V_M1, bf16, FPR16>;
+    }
+  
+    defset list<GroupVTypeInfo> GroupBFloatVectors = {
+      def VBF16M2: GroupVTypeInfo<vbfloat16m2_t, vbfloat16m1_t, vbool8_t, 16,
+                                  V_M2, bf16, FPR16>;
+      def VBF16M4: GroupVTypeInfo<vbfloat16m4_t, vbfloat16m1_t, vbool4_t, 16,
+                                  V_M4, bf16, FPR16>;
+      def VBF16M8: GroupVTypeInfo<vbfloat16m8_t, vbfloat16m1_t, vbool2_t, 16,
+                                  V_M8, bf16, FPR16>;
     }
-    def VBF16M1:  VTypeInfo<vbfloat16m1_t, vbool16_t, 16, V_M1, bf16, FPR16>;
-  }
-
-  defset list<GroupVTypeInfo> GroupBFloatVectors = {
-    def VBF16M2: GroupVTypeInfo<vbfloat16m2_t, vbfloat16m1_t, vbool8_t, 16,
-                                V_M2, bf16, FPR16>;
-    def VBF16M4: GroupVTypeInfo<vbfloat16m4_t, vbfloat16m1_t, vbool4_t, 16,
-                                V_M4, bf16, FPR16>;
-    def VBF16M8: GroupVTypeInfo<vbfloat16m8_t, vbfloat16m1_t, vbool2_t, 16,
-                                V_M8, bf16, FPR16>;
   }
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index b4af83a3cbf6..714f8cff7b63 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -1495,6 +1495,20 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
                 fvti.AVL, fvti.Log2SEW, TA_MA)>;
 }
 
+foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
+  defvar fvti = fvtiToFWti.Vti;
+  defvar fwti = fvtiToFWti.Wti;
+  let Predicates = [HasVInstructionsBF16] in
+  def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))),
+            (!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW)
+                (fvti.Vector (IMPLICIT_DEF)),
+                fwti.RegClass:$rs1,
+                // Value to indicate no rounding mode change in
+                // RISCVInsertReadWriteCSR
+                FRM_DYN,
+                fvti.AVL, fvti.Log2SEW, TA_MA)>;
+}
+
 //===----------------------------------------------------------------------===//
 // Vector Splats
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 6c6ecb604fd0..e10b8bf2767b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -2670,6 +2670,20 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
                 GPR:$vl, fvti.Log2SEW, TA_MA)>;
 }
 
+foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
+  defvar fvti = fvtiToFWti.Vti;
+  defvar fwti = fvtiToFWti.Wti;
+  let Predicates = [HasVInstructionsBF16] in
+  def : Pat<(fwti.Vector (any_riscv_fpextend_vl
+                             (fvti.Vector fvti.RegClass:$rs1),
+                             (fvti.Mask V0),
+                             VLOpFrag)),
+            (!cast<Instruction>("PseudoVFWCVTBF16_F_F_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK")
+                (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1,
+                (fvti.Mask V0),
+                GPR:$vl, fvti.Log2SEW, TA_MA)>;
+}
+
 // 13.19 Narrowing Floating-Point/Integer Type-Convert Instructions
 defm : VPatNConvertFP2IVL_W_RM<riscv_vfcvt_xu_f_vl, "PseudoVFNCVT_XU_F_W">;
 defm : VPatNConvertFP2IVL_W_RM<riscv_vfcvt_x_f_vl, "PseudoVFNCVT_X_F_W">;
@@ -2714,6 +2728,22 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
   }
 }
 
+foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
+  defvar fvti = fvtiToFWti.Vti;
+  defvar fwti = fvtiToFWti.Wti;
+  let Predicates = [HasVInstructionsBF16] in
+    def : Pat<(fvti.Vector (any_riscv_fpround_vl
+                               (fwti.Vector fwti.RegClass:$rs1),
+                               (fwti.Mask V0), VLOpFrag)),
+              (!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK")
+                  (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
+                  (fwti.Mask V0),
+                  // Value to indicate no rounding mode change in
+                  // RISCVInsertReadWriteCSR
+                  FRM_DYN,
+                  GPR:$vl, fvti.Log2SEW, TA_MA)>;
+}
+
 // 14. Vector Reduction Operations
 
 // 14.1. Vector Single-Width Integer Reduction Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td
index 924e91e15c34..6dae8ca8f7a8 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td
@@ -704,3 +704,29 @@ let Predicates = [HasVendorXCVbitmanip, IsRV32] in {
             (CV_BITREV GPR:$rs1, cv_tuimm2:$radix, cv_tuimm5:$pts)>;
   def : Pat<(bitreverse (XLenVT GPR:$rs)), (CV_BITREV GPR:$rs, 0, 0)>;
 }
+
+//===----------------------------------------------------------------------===//
+// Patterns for immediate branching operations 
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasVendorXCVbi, IsRV32], AddedComplexity = 2 in {
+  def : Pat<(riscv_brcc GPR:$rs1, simm5:$imm5, SETEQ, bb:$imm12),
+            (CV_BEQIMM GPR:$rs1, simm5:$imm5, simm13_lsb0:$imm12)>;
+  def : Pat<(riscv_brcc GPR:$rs1, simm5:$imm5, SETNE, bb:$imm12),
+            (CV_BNEIMM GPR:$rs1, simm5:$imm5, simm13_lsb0:$imm12)>;
+
+  let usesCustomInserter = 1 in
+  def Select_GPR_Using_CC_Imm : Pseudo<(outs GPR:$dst),
+                             (ins GPR:$lhs, simm5:$imm5, ixlenimm:$cc,
+                              GPR:$truev, GPR:$falsev), []>;
+
+
+  class Selectbi<CondCode Cond>
+      : Pat<(riscv_selectcc_frag:$cc (i32 GPR:$lhs), simm5:$Constant, Cond,
+                                     (i32 GPR:$truev), GPR:$falsev),
+            (Select_GPR_Using_CC_Imm GPR:$lhs, simm5:$Constant,
+             (IntCCtoRISCVCC $cc), GPR:$truev, GPR:$falsev)>;
+
+  def : Selectbi<SETEQ>;
+  def : Selectbi<SETNE>;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
index b398c5e7fec2..bc14f165d962 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
@@ -549,40 +549,11 @@ def : Pat<(add_non_imm12 sh2add_op:$rs1, (XLenVT GPR:$rs2)),
 def : Pat<(add_non_imm12 sh3add_op:$rs1, (XLenVT GPR:$rs2)),
           (TH_ADDSL GPR:$rs2, sh3add_op:$rs1, 3)>;
 
-def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 6)), GPR:$rs2),
-          (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 1)), 1)>;
-def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 10)), GPR:$rs2),
-          (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 2)), 1)>;
-def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 18)), GPR:$rs2),
-          (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 3)), 1)>;
-def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 12)), GPR:$rs2),
-          (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 1)), 2)>;
-def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 20)), GPR:$rs2),
-          (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 2)), 2)>;
-def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 36)), GPR:$rs2),
-          (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 3)), 2)>;
-def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 24)), GPR:$rs2),
-          (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 1)), 3)>;
-def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 40)), GPR:$rs2),
-          (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 2)), 3)>;
-def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 72)), GPR:$rs2),
-          (TH_ADDSL GPR:$rs2, (XLenVT (TH_ADDSL GPR:$rs1, GPR:$rs1, 3)), 3)>;
-
 def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy4:$i),
           (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i))), 2)>;
 def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy8:$i),
           (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i))), 3)>;
 
-def : Pat<(mul (XLenVT GPR:$r), C3LeftShift:$i),
-          (SLLI (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 1)),
-                (TrailingZeros C3LeftShift:$i))>;
-def : Pat<(mul (XLenVT GPR:$r), C5LeftShift:$i),
-          (SLLI (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)),
-                (TrailingZeros C5LeftShift:$i))>;
-def : Pat<(mul (XLenVT GPR:$r), C9LeftShift:$i),
-          (SLLI (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 3)),
-                (TrailingZeros C9LeftShift:$i))>;
-
 def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 200)),
           (SLLI (XLenVT (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)),
                                   (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 2)), 3)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index ffe2b7e27120..8a0bbf6abd33 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -173,42 +173,6 @@ def BCLRIANDIMaskLow : SDNodeXForm<imm, [{
                                    SDLoc(N), N->getValueType(0));
 }]>;
 
-def C3LeftShift : PatLeaf<(imm), [{
-  uint64_t C = N->getZExtValue();
-  return C > 3 && (C >> llvm::countr_zero(C)) == 3;
-}]>;
-
-def C5LeftShift : PatLeaf<(imm), [{
-  uint64_t C = N->getZExtValue();
-  return C > 5 && (C >> llvm::countr_zero(C)) == 5;
-}]>;
-
-def C9LeftShift : PatLeaf<(imm), [{
-  uint64_t C = N->getZExtValue();
-  return C > 9 && (C >> llvm::countr_zero(C)) == 9;
-}]>;
-
-// Constant of the form (3 << C) where C is less than 32.
-def C3LeftShiftUW : PatLeaf<(imm), [{
-  uint64_t C = N->getZExtValue();
-  unsigned Shift = llvm::countr_zero(C);
-  return 1 <= Shift && Shift < 32 && (C >> Shift) == 3;
-}]>;
-
-// Constant of the form (5 << C) where C is less than 32.
-def C5LeftShiftUW : PatLeaf<(imm), [{
-  uint64_t C = N->getZExtValue();
-  unsigned Shift = llvm::countr_zero(C);
-  return 1 <= Shift && Shift < 32 && (C >> Shift) == 5;
-}]>;
-
-// Constant of the form (9 << C) where C is less than 32.
-def C9LeftShiftUW : PatLeaf<(imm), [{
-  uint64_t C = N->getZExtValue();
-  unsigned Shift = llvm::countr_zero(C);
-  return 1 <= Shift && Shift < 32 && (C >> Shift) == 9;
-}]>;
-
 def CSImm12MulBy4 : PatLeaf<(imm), [{
   if (!N->hasOneUse())
     return false;
@@ -693,25 +657,6 @@ foreach i = {1,2,3} in {
             (shxadd pat:$rs1, GPR:$rs2)>;
 }
 
-def : Pat<(add_like (mul_oneuse GPR:$rs1, (XLenVT 6)), GPR:$rs2),
-          (SH1ADD (XLenVT (SH1ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
-def : Pat<(add_like (mul_oneuse GPR:$rs1, (XLenVT 10)), GPR:$rs2),
-          (SH1ADD (XLenVT (SH2ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
-def : Pat<(add_like (mul_oneuse GPR:$rs1, (XLenVT 18)), GPR:$rs2),
-          (SH1ADD (XLenVT (SH3ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
-def : Pat<(add_like (mul_oneuse GPR:$rs1, (XLenVT 12)), GPR:$rs2),
-          (SH2ADD (XLenVT (SH1ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
-def : Pat<(add_like (mul_oneuse GPR:$rs1, (XLenVT 20)), GPR:$rs2),
-          (SH2ADD (XLenVT (SH2ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
-def : Pat<(add_like (mul_oneuse GPR:$rs1, (XLenVT 36)), GPR:$rs2),
-          (SH2ADD (XLenVT (SH3ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
-def : Pat<(add_like (mul_oneuse GPR:$rs1, (XLenVT 24)), GPR:$rs2),
-          (SH3ADD (XLenVT (SH1ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
-def : Pat<(add_like (mul_oneuse GPR:$rs1, (XLenVT 40)), GPR:$rs2),
-          (SH3ADD (XLenVT (SH2ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
-def : Pat<(add_like (mul_oneuse GPR:$rs1, (XLenVT 72)), GPR:$rs2),
-          (SH3ADD (XLenVT (SH3ADD GPR:$rs1, GPR:$rs1)), GPR:$rs2)>;
-
 def : Pat<(add_like (XLenVT GPR:$r), CSImm12MulBy4:$i),
           (SH2ADD (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i))),
                   GPR:$r)>;
@@ -719,16 +664,6 @@ def : Pat<(add_like (XLenVT GPR:$r), CSImm12MulBy8:$i),
           (SH3ADD (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i))),
                   GPR:$r)>;
 
-def : Pat<(mul (XLenVT GPR:$r), C3LeftShift:$i),
-          (SLLI (XLenVT (SH1ADD GPR:$r, GPR:$r)),
-                (TrailingZeros C3LeftShift:$i))>;
-def : Pat<(mul (XLenVT GPR:$r), C5LeftShift:$i),
-          (SLLI (XLenVT (SH2ADD GPR:$r, GPR:$r)),
-                (TrailingZeros C5LeftShift:$i))>;
-def : Pat<(mul (XLenVT GPR:$r), C9LeftShift:$i),
-          (SLLI (XLenVT (SH3ADD GPR:$r, GPR:$r)),
-                (TrailingZeros C9LeftShift:$i))>;
-
 } // Predicates = [HasStdExtZba]
 
 let Predicates = [HasStdExtZba, IsRV64] in {
@@ -780,15 +715,6 @@ def : Pat<(i64 (add_like_non_imm12 (and GPR:$rs1, 0x3FFFFFFFC), (XLenVT GPR:$rs2
 def : Pat<(i64 (add_like_non_imm12 (and GPR:$rs1, 0x7FFFFFFF8), (XLenVT GPR:$rs2))),
           (SH3ADD_UW (XLenVT (SRLI GPR:$rs1, 3)), GPR:$rs2)>;
 
-def : Pat<(i64 (mul (and_oneuse GPR:$r, 0xFFFFFFFF), C3LeftShiftUW:$i)),
-          (SH1ADD (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C3LeftShiftUW:$i))),
-                  (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C3LeftShiftUW:$i))))>;
-def : Pat<(i64 (mul (and_oneuse GPR:$r, 0xFFFFFFFF), C5LeftShiftUW:$i)),
-          (SH2ADD (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C5LeftShiftUW:$i))),
-                  (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C5LeftShiftUW:$i))))>;
-def : Pat<(i64 (mul (and_oneuse GPR:$r, 0xFFFFFFFF), C9LeftShiftUW:$i)),
-          (SH3ADD (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C9LeftShiftUW:$i))),
-                  (XLenVT (SLLI_UW GPR:$r, (TrailingZeros C9LeftShiftUW:$i))))>;
 } // Predicates = [HasStdExtZba, IsRV64]
 
 let Predicates = [HasStdExtZbcOrZbkc] in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index aaf9c019aedf..d091077f729b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -24,11 +24,9 @@ def tuimm5 : RISCVOp, TImmLeaf<XLenVT, [{return isUInt<5>(Imm);}]>;
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
 multiclass VCLMUL_MV_V_X<string opcodestr, bits<6> funct6> {
   def V  : VALUVV<funct6, OPMVV, opcodestr # "." # "vv">,
-           Sched<[WriteVIALUV_WorstCase, ReadVIALUV_WorstCase,
-                  ReadVIALUV_WorstCase, ReadVMask]>;
+           SchedBinaryMC<"WriteVCLMULV", "ReadVCLMULV", "ReadVCLMULV">;
   def X  : VALUVX<funct6, OPMVX, opcodestr # "." # "vx">,
-           Sched<[WriteVIALUX_WorstCase, ReadVIALUV_WorstCase,
-                  ReadVIALUX_WorstCase, ReadVMask]>;
+           SchedBinaryMC<"WriteVCLMULX", "ReadVCLMULV", "ReadVCLMULX">;
 }
 
 class RVInstIVI_VROR<bits<6> funct6, dag outs, dag ins, string opcodestr,
@@ -57,8 +55,7 @@ multiclass VROR_IV_V_X_I<string opcodestr, bits<6> funct6>
   def I : RVInstIVI_VROR<funct6, (outs VR:$vd),
               (ins VR:$vs2, uimm6:$imm, VMaskOp:$vm),
               opcodestr # ".vi", "$vd, $vs2, $imm$vm">,
-         Sched<[WriteVIALUI_WorstCase, ReadVIALUV_WorstCase,
-                ReadVMask]>;
+          SchedUnaryMC<"WriteVRotI", "ReadVRotV">;
 }
 
 // op vd, vs2, vs1
@@ -109,9 +106,11 @@ class PALUVs2NoVmBinary<bits<6> funct6, bits<5> vs1, RISCVVFormat opv,
 multiclass VAES_MV_V_S<bits<6> funct6_vv, bits<6> funct6_vs, bits<5> vs1,
                          RISCVVFormat opv, string opcodestr> {
   let RVVConstraint = NoConstraint in
-  def NAME # _VV : PALUVs2NoVmBinary<funct6_vv, vs1, opv, opcodestr # ".vv">;
+  def NAME # _VV : PALUVs2NoVmBinary<funct6_vv, vs1, opv, opcodestr # ".vv">,
+                   SchedBinaryMC<"WriteVAESMVV", "ReadVAESMVV", "ReadVAESMVV">;
   let RVVConstraint = VS2Constraint in
-  def NAME # _VS : PALUVs2NoVmBinary<funct6_vs, vs1, opv, opcodestr # ".vs">;
+  def NAME # _VS : PALUVs2NoVmBinary<funct6_vs, vs1, opv, opcodestr # ".vs">,
+                   SchedBinaryMC<"WriteVAESMVV", "ReadVAESMVV", "ReadVAESMVV">;
 }
 } // hasSideEffects = 0, mayLoad = 0, mayStore = 0
 
@@ -142,14 +141,23 @@ let Predicates = [HasStdExtZvkb] in {
 } // Predicates = [HasStdExtZvkb]
 
 let Predicates = [HasStdExtZvkg], RVVConstraint = NoConstraint in {
-  def VGHSH_VV : PALUVVNoVmTernary<0b101100, OPMVV, "vghsh.vv">;
-  def VGMUL_VV : PALUVs2NoVmBinary<0b101000, 0b10001, OPMVV, "vgmul.vv">;
+  def VGHSH_VV : PALUVVNoVmTernary<0b101100, OPMVV, "vghsh.vv">,
+                 SchedTernaryMC<"WriteVGHSHV", "ReadVGHSHV", "ReadVGHSHV",
+                                "ReadVGHSHV">;
+  def VGMUL_VV : PALUVs2NoVmBinary<0b101000, 0b10001, OPMVV, "vgmul.vv">,
+                 SchedBinaryMC<"WriteVGMULV", "ReadVGMULV", "ReadVGMULV">;
 } // Predicates = [HasStdExtZvkg]
 
 let Predicates = [HasStdExtZvknhaOrZvknhb], RVVConstraint = Sha2Constraint in {
-  def VSHA2CH_VV : PALUVVNoVmTernary<0b101110, OPMVV, "vsha2ch.vv">;
-  def VSHA2CL_VV : PALUVVNoVmTernary<0b101111, OPMVV, "vsha2cl.vv">;
-  def VSHA2MS_VV : PALUVVNoVmTernary<0b101101, OPMVV, "vsha2ms.vv">;
+  def VSHA2CH_VV : PALUVVNoVmTernary<0b101110, OPMVV, "vsha2ch.vv">,
+                   SchedTernaryMC<"WriteVSHA2CHV", "ReadVSHA2CHV", "ReadVSHA2CHV",
+                                  "ReadVSHA2CHV">;
+  def VSHA2CL_VV : PALUVVNoVmTernary<0b101111, OPMVV, "vsha2cl.vv">,
+                   SchedTernaryMC<"WriteVSHA2CLV", "ReadVSHA2CLV", "ReadVSHA2CLV",
+                                  "ReadVSHA2CLV">;
+  def VSHA2MS_VV : PALUVVNoVmTernary<0b101101, OPMVV, "vsha2ms.vv">,
+                   SchedTernaryMC<"WriteVSHA2MSV", "ReadVSHA2MSV", "ReadVSHA2MSV",
+                                  "ReadVSHA2MSV">;
 } // Predicates = [HasStdExtZvknhaOrZvknhb]
 
 let Predicates = [HasStdExtZvkned] in {
@@ -157,21 +165,27 @@ let Predicates = [HasStdExtZvkned] in {
   defm VAESDM     : VAES_MV_V_S<0b101000, 0b101001, 0b00000, OPMVV, "vaesdm">;
   defm VAESEF     : VAES_MV_V_S<0b101000, 0b101001, 0b00011, OPMVV, "vaesef">;
   defm VAESEM     : VAES_MV_V_S<0b101000, 0b101001, 0b00010, OPMVV, "vaesem">;
-  def  VAESKF1_VI : PALUVINoVm<0b100010, "vaeskf1.vi", uimm5>;
-  def  VAESKF2_VI : PALUVINoVmBinary<0b101010, "vaeskf2.vi", uimm5>;
+  def  VAESKF1_VI : PALUVINoVm<0b100010, "vaeskf1.vi", uimm5>,
+                    SchedUnaryMC<"WriteVAESKF1V", "ReadVAESKF1V">;
+  def  VAESKF2_VI : PALUVINoVmBinary<0b101010, "vaeskf2.vi", uimm5>,
+                    SchedBinaryMC<"WriteVAESKF2V", "ReadVAESKF2V", "ReadVAESKF2V">;
   let RVVConstraint = VS2Constraint in
-  def  VAESZ_VS   : PALUVs2NoVmBinary<0b101001, 0b00111, OPMVV, "vaesz.vs">;
+  def  VAESZ_VS   : PALUVs2NoVmBinary<0b101001, 0b00111, OPMVV, "vaesz.vs">,
+                    SchedBinaryMC<"WriteVAESZV", "ReadVAESZV", "ReadVAESZV">;
 } // Predicates = [HasStdExtZvkned]
 
 let Predicates = [HasStdExtZvksed] in {
   let RVVConstraint = NoConstraint in
-  def  VSM4K_VI : PALUVINoVm<0b100001, "vsm4k.vi", uimm5>;
+  def  VSM4K_VI : PALUVINoVm<0b100001, "vsm4k.vi", uimm5>,
+                  SchedUnaryMC<"WriteVSM4KV", "ReadVSM4KV">;
   defm VSM4R    : VAES_MV_V_S<0b101000, 0b101001, 0b10000, OPMVV, "vsm4r">;
 } // Predicates = [HasStdExtZvksed]
 
 let Predicates = [HasStdExtZvksh], RVVConstraint = VS2Constraint in {
-  def VSM3C_VI  : PALUVINoVmBinary<0b101011, "vsm3c.vi", uimm5>;
-  def VSM3ME_VV : PALUVVNoVm<0b100000, OPMVV, "vsm3me.vv">;
+  def VSM3C_VI  : PALUVINoVmBinary<0b101011, "vsm3c.vi", uimm5>,
+                  SchedBinaryMC<"WriteVSM3CV", "ReadVSM3CV", "ReadVSM3CV">;
+  def VSM3ME_VV : PALUVVNoVm<0b100000, OPMVV, "vsm3me.vv">,
+                  SchedUnaryMC<"WriteVSM3MEV", "ReadVSM3MEV">;
 } // Predicates = [HasStdExtZvksh]
 
 //===----------------------------------------------------------------------===//
@@ -266,55 +280,121 @@ multiclass VPseudoBinaryV_S_NoMask_Zvk<LMULInfo m> {
       def "_VS_" # m.MX # "_" # vs2_lmul.MX : VPseudoBinaryNoMask_Zvk<m.vrclass, vs2_lmul.vrclass>;
 }
 
-multiclass VPseudoVALU_V_NoMask_Zvk {
+multiclass VPseudoVGMUL {
   foreach m = MxListVF4 in {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_V_NoMask_Zvk<m>,
-              SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", mx>;
+              SchedBinary<"WriteVGMULV", "ReadVGMULV", "ReadVGMULV", mx>;
   }
 }
 
-multiclass VPseudoVALU_S_NoMask_Zvk {
+multiclass VPseudoVAESMV {
   foreach m = MxListVF4 in {
     defvar mx = m.MX;
+    defm "" : VPseudoBinaryV_V_NoMask_Zvk<m>,
+              SchedBinary<"WriteVAESMVV", "ReadVAESMVV", "ReadVAESMVV", mx>;
     defm "" : VPseudoBinaryV_S_NoMask_Zvk<m>,
-              SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", mx>;
+              SchedBinary<"WriteVAESMVV", "ReadVAESMVV", "ReadVAESMVV", mx>;
+
+  }
+}
+
+multiclass VPseudoVSM4R {
+  foreach m = MxListVF4 in {
+    defvar mx = m.MX;
+    defm "" : VPseudoBinaryV_V_NoMask_Zvk<m>,
+              SchedBinary<"WriteVSM4RV", "ReadVSM4RV", "ReadVSM4RV", mx>;
+    defm "" : VPseudoBinaryV_S_NoMask_Zvk<m>,
+              SchedBinary<"WriteVSM4RV", "ReadVSM4RV", "ReadVSM4RV", mx>;
+
+  }
+}
+
+multiclass VPseudoVGHSH {
+  foreach m = MxListVF4 in {
+    defvar mx = m.MX;
+    defm _VV : VPseudoTernaryNoMask_Zvk<m.vrclass, m.vrclass, m.vrclass, m>,
+               SchedTernary<"WriteVGHSHV", "ReadVGHSHV", "ReadVGHSHV",
+                            "ReadVGHSHV", mx>;
   }
 }
 
-multiclass VPseudoVALU_V_S_NoMask_Zvk
-  : VPseudoVALU_V_NoMask_Zvk, VPseudoVALU_S_NoMask_Zvk;
+multiclass VPseudoVSHA2CH {
+  foreach m = MxListVF4 in {
+    defvar mx = m.MX;
+    defm _VV : VPseudoTernaryNoMask_Zvk<m.vrclass, m.vrclass, m.vrclass, m>,
+               SchedTernary<"WriteVSHA2CHV", "ReadVSHA2CHV", "ReadVSHA2CHV",
+                            "ReadVSHA2CHV", mx>;
+  }
+}
+
+multiclass VPseudoVSHA2CL {
+  foreach m = MxListVF4 in {
+    defvar mx = m.MX;
+    defm _VV : VPseudoTernaryNoMask_Zvk<m.vrclass, m.vrclass, m.vrclass, m>,
+               SchedTernary<"WriteVSHA2CLV", "ReadVSHA2CLV", "ReadVSHA2CLV",
+                            "ReadVSHA2CLV", mx>;
+  }
+}
 
-multiclass VPseudoVALU_VV_NoMask_Zvk {
+multiclass VPseudoVSHA2MS {
   foreach m = MxListVF4 in {
     defvar mx = m.MX;
     defm _VV : VPseudoTernaryNoMask_Zvk<m.vrclass, m.vrclass, m.vrclass, m>,
-               SchedTernary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", "ReadVIALUV", mx>;
+               SchedTernary<"WriteVSHA2MSV", "ReadVSHA2MSV", "ReadVSHA2MSV",
+                            "ReadVSHA2MSV", mx>;
+  }
+}
+
+multiclass VPseudoVAESKF1 {
+  foreach m = MxListVF4 in {
+    defvar mx = m.MX;
+    defm _VI : VPseudoBinaryNoMaskTU_Zvk<m.vrclass, m.vrclass, uimm5, m>,
+               SchedBinary<"WriteVAESKF1V", "ReadVAESKF1V", "ReadVAESKF1V", mx,
+                           forceMergeOpRead=true>;
+  }
+}
+
+multiclass VPseudoVAESKF2 {
+  foreach m = MxListVF4 in {
+    defvar mx = m.MX;
+    defm _VI : VPseudoTernaryNoMask_Zvk<m.vrclass, m.vrclass, uimm5, m>,
+               SchedTernary<"WriteVAESKF2V", "ReadVAESKF2V", "ReadVAESKF2V",
+                            "ReadVAESKF2V", mx>;
+  }
+}
+
+multiclass VPseudoVAESZ {
+  foreach m = MxListVF4 in {
+    defvar mx = m.MX;
+    defm "" : VPseudoBinaryV_S_NoMask_Zvk<m>,
+              SchedBinary<"WriteVAESZV", "ReadVAESZV", "ReadVAESZV", mx>;
   }
 }
 
-multiclass VPseudoVALU_VI_NoMask_Zvk {
+multiclass VPseudoVSM3C {
   foreach m = MxListVF4 in {
     defvar mx = m.MX;
     defm _VI : VPseudoTernaryNoMask_Zvk<m.vrclass, m.vrclass, uimm5, m>,
-               SchedTernary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", "ReadVIALUV", mx>;
+               SchedTernary<"WriteVSM3CV", "ReadVSM3CV", "ReadVSM3CV",
+                            "ReadVSM3CV", mx>;
   }
 }
 
-multiclass VPseudoVALU_VI_NoMaskTU_Zvk {
+multiclass VPseudoVSM4K {
   foreach m = MxListVF4 in {
     defvar mx = m.MX;
     defm _VI : VPseudoBinaryNoMaskTU_Zvk<m.vrclass, m.vrclass, uimm5, m>,
-               SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", mx,
+               SchedBinary<"WriteVSM4KV", "ReadVSM4KV", "ReadVSM4KV", mx,
                            forceMergeOpRead=true>;
   }
 }
 
-multiclass VPseudoVALU_VV_NoMaskTU_Zvk {
+multiclass VPseudoVSM3ME {
   foreach m = MxListVF4 in {
     defvar mx = m.MX;
     defm _VV : VPseudoBinaryNoMaskTU_Zvk<m.vrclass, m.vrclass, m.vrclass, m>,
-               SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", mx,
+               SchedBinary<"WriteVSM3MEV", "ReadVSM3MEV", "ReadVSM3MEV", mx,
                            forceMergeOpRead=true>;
   }
 }
@@ -323,10 +403,10 @@ multiclass VPseudoVCLMUL_VV_VX {
   foreach m = MxList in {
     defvar mx = m.MX;
     defm "" : VPseudoBinaryV_VV<m>,
-              SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", mx,
+              SchedBinary<"WriteVCLMULV", "ReadVCLMULV", "ReadVCLMULV", mx,
                           forceMergeOpRead=true>;
     defm "" : VPseudoBinaryV_VX<m>,
-              SchedBinary<"WriteVIALUX", "ReadVIALUV", "ReadVIALUX", mx,
+              SchedBinary<"WriteVCLMULX", "ReadVCLMULV", "ReadVCLMULX", mx,
                           forceMergeOpRead=true>;
   }
 }
@@ -340,28 +420,111 @@ multiclass VPseudoUnaryV_V<LMULInfo m> {
   }
 }
 
-multiclass VPseudoVALU_V {
+multiclass VPseudoVBREV {
   foreach m = MxList in {
     defvar mx = m.MX;
     defm "" : VPseudoUnaryV_V<m>,
-              SchedUnary<"WriteVIALUV", "ReadVIALUV", mx,
-                         forceMergeOpRead=true>;
+              SchedUnary<"WriteVBREVV", "ReadVBREVV", mx, forceMergeOpRead=true>;
   }
 }
 
-multiclass VPseudoVWALU_VV_VX_VI<Operand ImmType> : VPseudoVWALU_VV_VX {
+multiclass VPseudoVCLZ {
+  foreach m = MxList in {
+    defvar mx = m.MX;
+    defm "" : VPseudoUnaryV_V<m>,
+              SchedUnary<"WriteVCLZV", "ReadVCLZV", mx, forceMergeOpRead=true>;
+  }
+}
+
+multiclass VPseudoVCTZ {
+  foreach m = MxList in {
+    defvar mx = m.MX;
+    defm "" : VPseudoUnaryV_V<m>,
+              SchedUnary<"WriteVCTZV", "ReadVCTZV", mx, forceMergeOpRead=true>;
+  }
+}
+
+multiclass VPseudoVCPOP {
+  foreach m = MxList in {
+    defvar mx = m.MX;
+    defm "" : VPseudoUnaryV_V<m>,
+              SchedUnary<"WriteVCPOPV", "ReadVCPOPV", mx, forceMergeOpRead=true>;
+  }
+}
+
+multiclass VPseudoVWALU_VV_VX_VI<Operand ImmType> {
   foreach m = MxListW in {
+    defvar mx = m.MX;
+    defm "" : VPseudoBinaryW_VV<m>,
+              SchedBinary<"WriteVWSLLV", "ReadVWSLLV", "ReadVWSLLV", mx,
+                          forceMergeOpRead=true>;
+    defm "" : VPseudoBinaryW_VX<m>, 
+              SchedBinary<"WriteVWSLLX", "ReadVWSLLV", "ReadVWSLLX", mx,
+                          forceMergeOpRead=true>;
     defm "" : VPseudoBinaryW_VI<ImmType, m>,
-              SchedUnary<"WriteVIWALUV", "ReadVIWALUV", m.MX,
+              SchedUnary<"WriteVWSLLI", "ReadVWSLLV", mx,
                          forceMergeOpRead=true>;
   }
 }
 
+multiclass VPseudoVANDN {
+ foreach m = MxList in {
+    defm "" : VPseudoBinaryV_VV<m>,
+              SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", m.MX,
+                          forceMergeOpRead=true>;
+    defm "" : VPseudoBinaryV_VX<m>,
+              SchedBinary<"WriteVIALUX", "ReadVIALUV", "ReadVIALUX", m.MX,
+                          forceMergeOpRead=true>;
+  }
+}
+
+multiclass VPseudoVBREV8 {
+  foreach m = MxList in {
+    defvar mx = m.MX;
+    defm "" : VPseudoUnaryV_V<m>,
+              SchedUnary<"WriteVBREV8V", "ReadVBREV8V", mx, forceMergeOpRead=true>;
+  }
+}
+
+multiclass VPseudoVREV8 {
+  foreach m = MxList in {
+    defvar mx = m.MX;
+    defm "" : VPseudoUnaryV_V<m>,
+              SchedUnary<"WriteVREV8V", "ReadVREV8V", mx, forceMergeOpRead=true>;
+  }
+}
+
+multiclass VPseudoVROL {
+ foreach m = MxList in {
+    defm "" : VPseudoBinaryV_VV<m>,
+              SchedBinary<"WriteVRotV", "ReadVRotV", "ReadVRotV", m.MX,
+                          forceMergeOpRead=true>;
+    defm "" : VPseudoBinaryV_VX<m>,
+              SchedBinary<"WriteVRotX", "ReadVRotV", "ReadVRotX", m.MX,
+                          forceMergeOpRead=true>;
+  }
+}
+
+multiclass VPseudoVROR<Operand ImmType> {
+  defvar Constraint = "";
+  foreach m = MxList in {
+    defvar mx = m.MX;
+    defm "" : VPseudoBinaryV_VV<m>,
+              SchedBinary<"WriteVRotV", "ReadVRotV", "ReadVRotV", mx,
+                          forceMergeOpRead=true>;
+    defm "" : VPseudoBinaryV_VX<m>,
+              SchedBinary<"WriteVRotX", "ReadVRotV", "ReadVRotX", mx,
+                          forceMergeOpRead=true>;
+    defm "" : VPseudoBinaryV_VI<ImmType, m>,
+              SchedUnary<"WriteVRotI", "ReadVRotV", mx, forceMergeOpRead=true>;
+  }
+}
+
 let Predicates = [HasStdExtZvbb] in {
-  defm PseudoVBREV  : VPseudoVALU_V;
-  defm PseudoVCLZ   : VPseudoVALU_V;
-  defm PseudoVCTZ   : VPseudoVALU_V;
-  defm PseudoVCPOP  : VPseudoVALU_V;
+  defm PseudoVBREV  : VPseudoVBREV;
+  defm PseudoVCLZ   : VPseudoVCLZ;
+  defm PseudoVCTZ   : VPseudoVCTZ;
+  defm PseudoVCPOP  : VPseudoVCPOP;
   defm PseudoVWSLL : VPseudoVWALU_VV_VX_VI<uimm5>;
 } // Predicates = [HasStdExtZvbb]
 
@@ -371,42 +534,42 @@ let Predicates = [HasStdExtZvbc] in {
 } // Predicates = [HasStdExtZvbc]
 
 let Predicates = [HasStdExtZvkb] in {
-  defm PseudoVANDN  : VPseudoVALU_VV_VX;
-  defm PseudoVBREV8 : VPseudoVALU_V;
-  defm PseudoVREV8  : VPseudoVALU_V;
-  defm PseudoVROL   : VPseudoVALU_VV_VX;
-  defm PseudoVROR   : VPseudoVALU_VV_VX_VI<uimm6>;
+  defm PseudoVANDN  : VPseudoVANDN;
+  defm PseudoVBREV8 : VPseudoVBREV8;
+  defm PseudoVREV8  : VPseudoVREV8;
+  defm PseudoVROL   : VPseudoVROL;
+  defm PseudoVROR   : VPseudoVROR<uimm6>;
 } // Predicates = [HasStdExtZvkb]
 
 let Predicates = [HasStdExtZvkg] in {
-  defm PseudoVGHSH : VPseudoVALU_VV_NoMask_Zvk;
-  defm PseudoVGMUL : VPseudoVALU_V_NoMask_Zvk;
+  defm PseudoVGHSH : VPseudoVGHSH;
+  defm PseudoVGMUL : VPseudoVGMUL;
 } // Predicates = [HasStdExtZvkg]
 
 let Predicates = [HasStdExtZvkned] in {
-  defm PseudoVAESDF  : VPseudoVALU_V_S_NoMask_Zvk;
-  defm PseudoVAESDM  : VPseudoVALU_V_S_NoMask_Zvk;
-  defm PseudoVAESEF  : VPseudoVALU_V_S_NoMask_Zvk;
-  defm PseudoVAESEM  : VPseudoVALU_V_S_NoMask_Zvk;
-  defm PseudoVAESKF1 : VPseudoVALU_VI_NoMaskTU_Zvk;
-  defm PseudoVAESKF2 : VPseudoVALU_VI_NoMask_Zvk;
-  defm PseudoVAESZ   : VPseudoVALU_S_NoMask_Zvk;
+  defm PseudoVAESDF  : VPseudoVAESMV;
+  defm PseudoVAESDM  : VPseudoVAESMV;
+  defm PseudoVAESEF  : VPseudoVAESMV;
+  defm PseudoVAESEM  : VPseudoVAESMV;
+  defm PseudoVAESKF1 : VPseudoVAESKF1;
+  defm PseudoVAESKF2 : VPseudoVAESKF2;
+  defm PseudoVAESZ   : VPseudoVAESZ;
 } // Predicates = [HasStdExtZvkned]
 
 let Predicates = [HasStdExtZvknhaOrZvknhb] in {
-  defm PseudoVSHA2CH : VPseudoVALU_VV_NoMask_Zvk;
-  defm PseudoVSHA2CL : VPseudoVALU_VV_NoMask_Zvk;
-  defm PseudoVSHA2MS : VPseudoVALU_VV_NoMask_Zvk;
+  defm PseudoVSHA2CH : VPseudoVSHA2CH;
+  defm PseudoVSHA2CL : VPseudoVSHA2CL;
+  defm PseudoVSHA2MS : VPseudoVSHA2MS;
 } // Predicates = [HasStdExtZvknhaOrZvknhb]
 
 let Predicates = [HasStdExtZvksed] in {
-  defm PseudoVSM4K : VPseudoVALU_VI_NoMaskTU_Zvk;
-  defm PseudoVSM4R : VPseudoVALU_V_S_NoMask_Zvk;
+  defm PseudoVSM4K : VPseudoVSM4K;
+  defm PseudoVSM4R : VPseudoVSM4R;
 } // Predicates = [HasStdExtZvksed]
 
 let Predicates = [HasStdExtZvksh] in {
-  defm PseudoVSM3C  : VPseudoVALU_VI_NoMask_Zvk;
-  defm PseudoVSM3ME : VPseudoVALU_VV_NoMaskTU_Zvk;
+  defm PseudoVSM3C  : VPseudoVSM3C;
+  defm PseudoVSM3ME : VPseudoVSM3ME;
 } // Predicates = [HasStdExtZvksh]
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
index 57b473645ae7..52f2ce27164d 100644
--- a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
@@ -31,9 +31,7 @@ public:
   const RISCVInstrInfo *TII;
   static char ID;
 
-  RISCVPostRAExpandPseudo() : MachineFunctionPass(ID) {
-    initializeRISCVPostRAExpandPseudoPass(*PassRegistry::getPassRegistry());
-  }
+  RISCVPostRAExpandPseudo() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
diff --git a/llvm/lib/Target/RISCV/RISCVRedundantCopyElimination.cpp b/llvm/lib/Target/RISCV/RISCVRedundantCopyElimination.cpp
index 61d605fda3f5..65ff67b42479 100644
--- a/llvm/lib/Target/RISCV/RISCVRedundantCopyElimination.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRedundantCopyElimination.cpp
@@ -77,9 +77,11 @@ guaranteesZeroRegInBlock(MachineBasicBlock &MBB,
   assert(Cond.size() == 3 && "Unexpected number of operands");
   assert(TBB != nullptr && "Expected branch target basic block");
   auto CC = static_cast<RISCVCC::CondCode>(Cond[0].getImm());
-  if (CC == RISCVCC::COND_EQ && Cond[2].getReg() == RISCV::X0 && TBB == &MBB)
+  if (CC == RISCVCC::COND_EQ && Cond[2].isReg() &&
+      Cond[2].getReg() == RISCV::X0 && TBB == &MBB)
     return true;
-  if (CC == RISCVCC::COND_NE && Cond[2].getReg() == RISCV::X0 && TBB != &MBB)
+  if (CC == RISCVCC::COND_NE && Cond[2].isReg() &&
+      Cond[2].getReg() == RISCV::X0 && TBB != &MBB)
     return true;
   return false;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 316daf2763ca..90e62dc39e6a 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -173,6 +173,8 @@ def GPRNoX0X2 : GPRRegisterClass<(sub GPR, X0, X2)>;
 // by tablegen.
 def GPRJALR : GPRRegisterClass<(sub GPR, (sequence "X%u", 0, 5))>;
 
+def GPRJALRNonX7 : GPRRegisterClass<(sub GPRJALR, X7)>;
+
 def GPRC : GPRRegisterClass<(add (sequence "X%u", 10, 15),
                                  (sequence "X%u", 8, 9))>;
 
@@ -183,6 +185,7 @@ def GPRC : GPRRegisterClass<(add (sequence "X%u", 10, 15),
 def GPRTC : GPRRegisterClass<(add (sequence "X%u", 6, 7),
                                   (sequence "X%u", 10, 17),
                                   (sequence "X%u", 28, 31))>;
+def GPRTCNonX7 : GPRRegisterClass<(sub GPRTC, X7)>;
 
 def SP : GPRRegisterClass<(add X2)>;
 
diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket.td b/llvm/lib/Target/RISCV/RISCVSchedRocket.td
index 65494e73758d..9ddc4281092d 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedRocket.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedRocket.td
@@ -262,4 +262,5 @@ defm : UnsupportedSchedZfa;
 defm : UnsupportedSchedZfh;
 defm : UnsupportedSchedSFB;
 defm : UnsupportedSchedXsfvcp;
+defm : UnsupportedSchedZvk;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index a532066b3a1c..e67da839bdb8 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -1298,4 +1298,5 @@ defm : UnsupportedSchedZbc;
 defm : UnsupportedSchedZbkb;
 defm : UnsupportedSchedZbkx;
 defm : UnsupportedSchedZfa;
+defm : UnsupportedSchedZvk;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
index fccdd7e4f3ec..a37958826e02 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
@@ -367,4 +367,5 @@ defm : UnsupportedSchedSFB;
 defm : UnsupportedSchedZfa;
 defm : UnsupportedSchedV;
 defm : UnsupportedSchedXsfvcp;
+defm : UnsupportedSchedZvk;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
index 6e4fb19361f5..6ba299385f07 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
@@ -748,6 +748,62 @@ foreach mx = SchedMxList in {
   }
 }
 
+// Vector Crypto
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  // Zvbb
+  let Latency = 2, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVBREVV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVCLZV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVCPOPV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVCTZV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVWSLLV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVWSLLX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVWSLLI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+  // Zvbc
+  let Latency = 2, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVCLMULV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVCLMULX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+  // Zvkb
+  // VANDN uses WriteVIALU[V|X|I]
+  let Latency = 2, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVBREV8V",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVREV8V",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVRotV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVRotX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVRotI",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+  // Zvkg
+  let Latency = 2, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVGHSHV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVGMULV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+  // ZvknhaOrZvknhb
+  let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVSHA2CHV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSHA2CLV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSHA2MSV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+  // Zvkned
+  let Latency = 2, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVAESMVV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVAESKF1V", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVAESKF2V", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+  let Latency = 1, ReleaseAtCycles = [LMulLat] in
+  defm "" : LMULWriteResMX<"WriteVAESZV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+  // Zvksed
+  let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVSM4KV",   [SiFiveP600VEXQ0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSM4RV",   [SiFiveP600VEXQ0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSM3CV",   [SiFiveP600VEXQ0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSM3MEV",  [SiFiveP600VEXQ0], mx, IsWorstCase>;
+  }
+}
+
 // Others
 def : WriteRes<WriteCSR, [SiFiveP600SYS]>;
 def : WriteRes<WriteNop, []>;
@@ -1032,6 +1088,42 @@ foreach mx = SchedMxList in {
     def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx  # "_E" # sew), 0>;
 }
 
+// Vector Crypto Extensions
+// Zvbb
+defm "" : LMULReadAdvance<"ReadVBREVV", 0>;
+defm "" : LMULReadAdvance<"ReadVCLZV", 0>;
+defm "" : LMULReadAdvance<"ReadVCPOPV", 0>;
+defm "" : LMULReadAdvance<"ReadVCTZV", 0>;
+defm "" : LMULReadAdvance<"ReadVWSLLV", 0>;
+defm "" : LMULReadAdvance<"ReadVWSLLX", 0>;
+// Zvbc
+defm "" : LMULReadAdvance<"ReadVCLMULV", 0>;
+defm "" : LMULReadAdvance<"ReadVCLMULX", 0>;
+// Zvkb
+// VANDN uses ReadVIALU[V|X|I]
+defm "" : LMULReadAdvance<"ReadVBREV8V", 0>;
+defm "" : LMULReadAdvance<"ReadVREV8V", 0>;
+defm "" : LMULReadAdvance<"ReadVRotV", 0>;
+defm "" : LMULReadAdvance<"ReadVRotX", 0>;
+// Zvkg
+defm "" : LMULReadAdvance<"ReadVGHSHV", 0>;
+defm "" : LMULReadAdvance<"ReadVGMULV", 0>;
+// Zvknha or Zvknhb
+defm "" : LMULReadAdvance<"ReadVSHA2CHV", 0>;
+defm "" : LMULReadAdvance<"ReadVSHA2CLV", 0>;
+defm "" : LMULReadAdvance<"ReadVSHA2MSV", 0>;
+// Zvkned
+defm "" : LMULReadAdvance<"ReadVAESMVV", 0>;
+defm "" : LMULReadAdvance<"ReadVAESKF1V", 0>;
+defm "" : LMULReadAdvance<"ReadVAESKF2V", 0>;
+defm "" : LMULReadAdvance<"ReadVAESZV", 0>;
+// Zvksed
+defm "" : LMULReadAdvance<"ReadVSM4KV", 0>;
+defm "" : LMULReadAdvance<"ReadVSM4RV", 0>;
+// Zbksh
+defm "" : LMULReadAdvance<"ReadVSM3CV", 0>;
+defm "" : LMULReadAdvance<"ReadVSM3MEV", 0>;
+
 //===----------------------------------------------------------------------===//
 // Unsupported extensions
 defm : UnsupportedSchedZabha;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td
index 0885e325f24e..31112d140cde 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td
@@ -213,4 +213,5 @@ defm : UnsupportedSchedZbkx;
 defm : UnsupportedSchedZfa;
 defm : UnsupportedSchedZfh;
 defm : UnsupportedSchedXsfvcp;
+defm : UnsupportedSchedZvk;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td
index e0f1fab1d6b4..dcd1a938a914 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td
@@ -312,4 +312,5 @@ defm : UnsupportedSchedZfh;
 defm : UnsupportedSchedSFB;
 defm : UnsupportedSchedZabha;
 defm : UnsupportedSchedXsfvcp;
+defm : UnsupportedSchedZvk;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td
index 0086557a41fe..d9a2e38c0e9d 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedule.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedule.td
@@ -297,3 +297,4 @@ def : ReadAdvance<ReadAtomicHD, 0>;
 include "RISCVScheduleZb.td"
 include "RISCVScheduleV.td"
 include "RISCVScheduleXSf.td"
+include "RISCVScheduleZvk.td"
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleZvk.td b/llvm/lib/Target/RISCV/RISCVScheduleZvk.td
new file mode 100644
index 000000000000..640c456322f0
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVScheduleZvk.td
@@ -0,0 +1,208 @@
+//=== RISCVScheduleZvk.td - RISC-V Scheduling Definitions Zvk -*- tablegen ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+/// Define scheduler resources associated with def operands.
+
+/// Zvbb extension
+defm "" : LMULSchedWrites<"WriteVBREVV">;
+defm "" : LMULSchedWrites<"WriteVCLZV">;
+defm "" : LMULSchedWrites<"WriteVCPOPV">;
+defm "" : LMULSchedWrites<"WriteVCTZV">;
+defm "" : LMULSchedWrites<"WriteVWSLLV">;
+defm "" : LMULSchedWrites<"WriteVWSLLX">;
+defm "" : LMULSchedWrites<"WriteVWSLLI">;
+
+/// Zvbc extension
+defm "" : LMULSchedWrites<"WriteVCLMULV">;
+defm "" : LMULSchedWrites<"WriteVCLMULX">;
+
+/// Zvkb extension
+// VANDN uses WriteVIALU[V|X|I]
+defm "" : LMULSchedWrites<"WriteVBREV8V">;
+defm "" : LMULSchedWrites<"WriteVREV8V">;
+defm "" : LMULSchedWrites<"WriteVRotV">;
+defm "" : LMULSchedWrites<"WriteVRotX">;
+defm "" : LMULSchedWrites<"WriteVRotI">;
+
+/// Zvkg extension
+defm "" : LMULSchedWrites<"WriteVGHSHV">;
+defm "" : LMULSchedWrites<"WriteVGMULV">;
+
+/// Zvknha or Zvknhb extensions
+defm "" : LMULSchedWrites<"WriteVSHA2CHV">;
+defm "" : LMULSchedWrites<"WriteVSHA2CLV">;
+defm "" : LMULSchedWrites<"WriteVSHA2MSV">;
+
+/// Zvkned extension
+defm "" : LMULSchedWrites<"WriteVAESMVV">;
+defm "" : LMULSchedWrites<"WriteVAESKF1V">;
+defm "" : LMULSchedWrites<"WriteVAESKF2V">;
+defm "" : LMULSchedWrites<"WriteVAESZV">;
+
+/// Zvksed extension
+defm "" : LMULSchedWrites<"WriteVSM4KV">;
+defm "" : LMULSchedWrites<"WriteVSM4RV">;
+
+/// Zvksh extension
+defm "" : LMULSchedWrites<"WriteVSM3CV">;
+defm "" : LMULSchedWrites<"WriteVSM3MEV">;
+
+/// Define scheduler resources associated with use operands.
+/// Zvbb extension
+defm "" : LMULSchedReads<"ReadVBREVV">;
+defm "" : LMULSchedReads<"ReadVCLZV">;
+defm "" : LMULSchedReads<"ReadVCPOPV">;
+defm "" : LMULSchedReads<"ReadVCTZV">;
+defm "" : LMULSchedReads<"ReadVWSLLV">;
+defm "" : LMULSchedReads<"ReadVWSLLX">;
+
+/// Zvbc extension
+defm "" : LMULSchedReads<"ReadVCLMULV">;
+defm "" : LMULSchedReads<"ReadVCLMULX">;
+
+/// Zvkb extension
+// VANDN uses ReadVIALU[V|X|I]
+defm "" : LMULSchedReads<"ReadVBREV8V">;
+defm "" : LMULSchedReads<"ReadVREV8V">;
+defm "" : LMULSchedReads<"ReadVRotV">;
+defm "" : LMULSchedReads<"ReadVRotX">;
+
+/// Zvkg extension
+defm "" : LMULSchedReads<"ReadVGHSHV">;
+defm "" : LMULSchedReads<"ReadVGMULV">;
+
+/// Zvknha or Zvknhb extensions
+defm "" : LMULSchedReads<"ReadVSHA2CHV">;
+defm "" : LMULSchedReads<"ReadVSHA2CLV">;
+defm "" : LMULSchedReads<"ReadVSHA2MSV">;
+
+/// Zvkned extension
+defm "" : LMULSchedReads<"ReadVAESMVV">;
+defm "" : LMULSchedReads<"ReadVAESKF1V">;
+defm "" : LMULSchedReads<"ReadVAESKF2V">;
+defm "" : LMULSchedReads<"ReadVAESZV">;
+
+/// Zvksed extension
+defm "" : LMULSchedReads<"ReadVSM4KV">;
+defm "" : LMULSchedReads<"ReadVSM4RV">;
+
+/// Zvksh extension
+defm "" : LMULSchedReads<"ReadVSM3CV">;
+defm "" : LMULSchedReads<"ReadVSM3MEV">;
+
+multiclass UnsupportedSchedZvbb {
+let Unsupported = true in {
+defm "" : LMULWriteRes<"WriteVBREVV", []>;
+defm "" : LMULWriteRes<"WriteVCLZV", []>;
+defm "" : LMULWriteRes<"WriteVCPOPV", []>;
+defm "" : LMULWriteRes<"WriteVCTZV", []>;
+defm "" : LMULWriteRes<"WriteVWSLLV", []>;
+defm "" : LMULWriteRes<"WriteVWSLLX", []>;
+defm "" : LMULWriteRes<"WriteVWSLLI", []>;
+
+defm "" : LMULReadAdvance<"ReadVBREVV", 0>;
+defm "" : LMULReadAdvance<"ReadVCLZV", 0>;
+defm "" : LMULReadAdvance<"ReadVCPOPV", 0>;
+defm "" : LMULReadAdvance<"ReadVCTZV", 0>;
+defm "" : LMULReadAdvance<"ReadVWSLLV", 0>;
+defm "" : LMULReadAdvance<"ReadVWSLLX", 0>;
+}
+}
+
+multiclass UnsupportedSchedZvbc {
+let Unsupported = true in {
+defm "" : LMULWriteRes<"WriteVCLMULV", []>;
+defm "" : LMULWriteRes<"WriteVCLMULX", []>;
+
+defm "" : LMULReadAdvance<"ReadVCLMULV", 0>; 
+defm "" : LMULReadAdvance<"ReadVCLMULX", 0>;
+}
+}
+
+multiclass UnsupportedSchedZvkb {
+let Unsupported = true in {
+defm "" : LMULWriteRes<"WriteVBREV8V", []>;
+defm "" : LMULWriteRes<"WriteVREV8V", []>;
+defm "" : LMULWriteRes<"WriteVRotV", []>;
+defm "" : LMULWriteRes<"WriteVRotX", []>;
+defm "" : LMULWriteRes<"WriteVRotI", []>;
+
+defm "" : LMULReadAdvance<"ReadVBREV8V", 0>;
+defm "" : LMULReadAdvance<"ReadVREV8V", 0>;
+defm "" : LMULReadAdvance<"ReadVRotV", 0>;
+defm "" : LMULReadAdvance<"ReadVRotX", 0>;
+}
+}
+
+multiclass UnsupportedSchedZvkg {
+let Unsupported = true in {
+defm "" : LMULWriteRes<"WriteVGHSHV", []>;
+defm "" : LMULWriteRes<"WriteVGMULV", []>;
+
+defm "" : LMULReadAdvance<"ReadVGHSHV", 0>;
+defm "" : LMULReadAdvance<"ReadVGMULV", 0>;
+}
+}
+
+multiclass UnsupportedSchedZvknhaOrZvknhb {
+let Unsupported = true in {
+defm "" : LMULWriteRes<"WriteVSHA2CHV", []>;
+defm "" : LMULWriteRes<"WriteVSHA2CLV", []>;
+defm "" : LMULWriteRes<"WriteVSHA2MSV", []>;
+
+defm "" : LMULReadAdvance<"ReadVSHA2CHV", 0>;
+defm "" : LMULReadAdvance<"ReadVSHA2CLV", 0>;
+defm "" : LMULReadAdvance<"ReadVSHA2MSV", 0>;
+}
+}
+
+multiclass UnsupportedSchedZvkned {
+let Unsupported = true in {
+defm "" : LMULWriteRes<"WriteVAESMVV", []>;
+defm "" : LMULWriteRes<"WriteVAESKF1V", []>;
+defm "" : LMULWriteRes<"WriteVAESKF2V", []>;
+defm "" : LMULWriteRes<"WriteVAESZV", []>;
+
+defm "" : LMULReadAdvance<"ReadVAESMVV", 0>;
+defm "" : LMULReadAdvance<"ReadVAESKF1V", 0>;
+defm "" : LMULReadAdvance<"ReadVAESKF2V", 0>;
+defm "" : LMULReadAdvance<"ReadVAESZV", 0>;
+}
+}
+
+multiclass UnsupportedSchedZvksed {
+let Unsupported = true in {
+defm "" : LMULWriteRes<"WriteVSM4KV", []>;
+defm "" : LMULWriteRes<"WriteVSM4RV", []>;
+
+defm "" : LMULReadAdvance<"ReadVSM4KV", 0>;
+defm "" : LMULReadAdvance<"ReadVSM4RV", 0>;
+}
+}
+
+multiclass UnsupportedSchedZvksh {
+let Unsupported = true in {
+defm "" : LMULWriteRes<"WriteVSM3CV", []>;
+defm "" : LMULWriteRes<"WriteVSM3MEV", []>;
+
+defm "" : LMULReadAdvance<"ReadVSM3CV", 0>;
+defm "" : LMULReadAdvance<"ReadVSM3MEV", 0>;
+}
+}
+
+// Helper class to define all RISC-V Vector Crypto extensions as unsupported
+multiclass UnsupportedSchedZvk {
+defm "" : UnsupportedSchedZvbb;
+defm "" : UnsupportedSchedZvbc;
+defm "" : UnsupportedSchedZvkb;
+defm "" : UnsupportedSchedZvkg;
+defm "" : UnsupportedSchedZvknhaOrZvknhb;
+defm "" : UnsupportedSchedZvkned;
+defm "" : UnsupportedSchedZvksed;
+defm "" : UnsupportedSchedZvksh;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 0876f46728a1..7b2dcadc4191 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -390,6 +390,9 @@ FunctionPass *RISCVPassConfig::createRVVRegAllocPass(bool Optimized) {
 bool RISCVPassConfig::addRegAssignAndRewriteFast() {
   addPass(createRVVRegAllocPass(false));
   addPass(createRISCVCoalesceVSETVLIPass());
+  if (TM->getOptLevel() != CodeGenOptLevel::None &&
+      EnableRISCVDeadRegisterElimination)
+    addPass(createRISCVDeadRegisterDefinitionsPass());
   return TargetPassConfig::addRegAssignAndRewriteFast();
 }
 
@@ -397,6 +400,9 @@ bool RISCVPassConfig::addRegAssignAndRewriteOptimized() {
   addPass(createRVVRegAllocPass(true));
   addPass(createVirtRegRewriter(false));
   addPass(createRISCVCoalesceVSETVLIPass());
+  if (TM->getOptLevel() != CodeGenOptLevel::None &&
+      EnableRISCVDeadRegisterElimination)
+    addPass(createRISCVDeadRegisterDefinitionsPass());
   return TargetPassConfig::addRegAssignAndRewriteOptimized();
 }
 
@@ -535,12 +541,9 @@ void RISCVPassConfig::addPreRegAlloc() {
   addPass(createRISCVPreRAExpandPseudoPass());
   if (TM->getOptLevel() != CodeGenOptLevel::None)
     addPass(createRISCVMergeBaseOffsetOptPass());
-  addPass(createRISCVInsertVSETVLIPass());
-  if (TM->getOptLevel() != CodeGenOptLevel::None &&
-      EnableRISCVDeadRegisterElimination)
-    addPass(createRISCVDeadRegisterDefinitionsPass());
   addPass(createRISCVInsertReadWriteCSRPass());
   addPass(createRISCVInsertWriteVXRMPass());
+  addPass(createRISCVInsertVSETVLIPass());
 }
 
 void RISCVPassConfig::addFastRegAlloc() {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index ce26e61880fd..d94dff5f2b1f 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -599,12 +599,8 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
     bool UseMaskForCond, bool UseMaskForGaps) {
-  if (isa<ScalableVectorType>(VecTy))
+  if (isa<ScalableVectorType>(VecTy) && Factor != 2)
     return InstructionCost::getInvalid();
-  auto *FVTy = cast<FixedVectorType>(VecTy);
-  InstructionCost MemCost =
-      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
-  unsigned VF = FVTy->getNumElements() / Factor;
 
   // The interleaved memory access pass will lower interleaved memory ops (i.e
   // a load and store followed by a specific shuffle) to vlseg/vsseg
@@ -612,24 +608,35 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
   // memory op
   if (!UseMaskForCond && !UseMaskForGaps &&
       Factor <= TLI->getMaxSupportedInterleaveFactor()) {
-    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
+    auto *VTy = cast<VectorType>(VecTy);
+    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
     // Need to make sure type has't been scalarized
-    if (LT.second.isFixedLengthVector()) {
-      auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
-                                             LT.second.getVectorNumElements());
+    if (LT.second.isVector()) {
+      auto *LegalVTy = VectorType::get(VTy->getElementType(),
+                                       LT.second.getVectorElementCount());
       // FIXME: We use the memory op cost of the *legalized* type here, becuase
       // it's getMemoryOpCost returns a really expensive cost for types like
       // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
       // Should the memory op cost of these be cheaper?
-      if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
+      if (TLI->isLegalInterleavedAccessType(LegalVTy, Factor, Alignment,
                                             AddressSpace, DL)) {
         InstructionCost LegalMemCost = getMemoryOpCost(
-            Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
+            Opcode, LegalVTy, Alignment, AddressSpace, CostKind);
         return LT.first + LegalMemCost;
       }
     }
   }
 
+  // TODO: Return the cost of interleaved accesses for scalable vector when
+  // unable to convert to segment accesses instructions.
+  if (isa<ScalableVectorType>(VecTy))
+    return InstructionCost::getInvalid();
+
+  auto *FVTy = cast<FixedVectorType>(VecTy);
+  InstructionCost MemCost =
+      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
+  unsigned VF = FVTy->getNumElements() / Factor;
+
   // An interleaved load will look like this for Factor=3:
   // %wide.vec = load <12 x i32>, ptr %3, align 4
   // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
@@ -1612,29 +1619,59 @@ InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
   if (Op2Info.isConstant())
     ConstantMatCost += getConstantMatCost(1, Op2Info);
 
+  unsigned Op;
   switch (TLI->InstructionOpcodeToISD(Opcode)) {
   case ISD::ADD:
   case ISD::SUB:
-  case ISD::AND:
-  case ISD::OR:
-  case ISD::XOR:
+    Op = RISCV::VADD_VV;
+    break;
   case ISD::SHL:
   case ISD::SRL:
   case ISD::SRA:
+    Op = RISCV::VSLL_VV;
+    break;
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
+    break;
   case ISD::MUL:
   case ISD::MULHS:
   case ISD::MULHU:
+    Op = RISCV::VMUL_VV;
+    break;
+  case ISD::SDIV:
+  case ISD::UDIV:
+    Op = RISCV::VDIV_VV;
+    break;
+  case ISD::SREM:
+  case ISD::UREM:
+    Op = RISCV::VREM_VV;
+    break;
   case ISD::FADD:
   case ISD::FSUB:
+    // TODO: Address FP16 with VFHMIN
+    Op = RISCV::VFADD_VV;
+    break;
   case ISD::FMUL:
-  case ISD::FNEG: {
-    return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
-  }
+    // TODO: Address FP16 with VFHMIN
+    Op = RISCV::VFMUL_VV;
+    break;
+  case ISD::FDIV:
+    Op = RISCV::VFDIV_VV;
+    break;
+  case ISD::FNEG:
+    Op = RISCV::VFSGNJN_VV;
+    break;
   default:
-    return ConstantMatCost +
-           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
-                                         Args, CxtI);
+    // Assuming all other instructions have the same cost until a need arises to
+    // differentiate them.
+    return ConstantMatCost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
+                                                           Op1Info, Op2Info,
+                                                           Args, CxtI);
   }
+  return ConstantMatCost +
+         LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
 }
 
 // TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 7439d0fefa98..32de8b9587b4 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -1763,7 +1763,7 @@ static bool buildNDRange(const SPIRV::IncomingCall *Call,
       if (!MRI->getRegClassOrNull(GWSPtr))
         MRI->setRegClass(GWSPtr, &SPIRV::IDRegClass);
       // TODO: Maybe simplify generation of the type of the fields.
-      unsigned Size = Call->Builtin->Name.equals("ndrange_3D") ? 3 : 2;
+      unsigned Size = Call->Builtin->Name == "ndrange_3D" ? 3 : 2;
       unsigned BitWidth = GR->getPointerSize() == 64 ? 64 : 32;
       Type *BaseTy = IntegerType::get(MF.getFunction().getContext(), BitWidth);
       Type *FieldTy = ArrayType::get(BaseTy, Size);
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 9994a966c82c..2051cdc7e01f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -467,6 +467,8 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg,
     return selectExtInst(ResVReg, ResType, I, CL::cos, GL::Cos);
   case TargetOpcode::G_FSIN:
     return selectExtInst(ResVReg, ResType, I, CL::sin, GL::Sin);
+  case TargetOpcode::G_FTAN:
+    return selectExtInst(ResVReg, ResType, I, CL::tan, GL::Tan);
 
   case TargetOpcode::G_FSQRT:
     return selectExtInst(ResVReg, ResType, I, CL::sqrt, GL::Sqrt);
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index 4b871bdd5d07..e7b35555293a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -277,6 +277,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
                                G_FCEIL,
                                G_FCOS,
                                G_FSIN,
+                               G_FTAN,
                                G_FSQRT,
                                G_FFLOOR,
                                G_FRINT,
diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 67e2b9d7c997..185b2fe90c6c 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -1377,7 +1377,7 @@ MCRegister SparcAsmParser::matchRegisterName(const AsmToken &Tok,
     return IntRegs[RegNo];
   }
 
-  if (Name.equals("xcc")) {
+  if (Name == "xcc") {
     // FIXME:: check 64bit.
     RegKind = SparcOperand::rk_Special;
     return SP::ICC;
@@ -1385,36 +1385,36 @@ MCRegister SparcAsmParser::matchRegisterName(const AsmToken &Tok,
 
   // JPS1 extension - aliases for ASRs
   // Section A.51 - Read State Register
-  if (Name.equals("pcr")) {
+  if (Name == "pcr") {
     RegKind = SparcOperand::rk_Special;
     return SP::ASR16;
   }
 
-  if (Name.equals("pic")) {
+  if (Name == "pic") {
     RegKind = SparcOperand::rk_Special;
     return SP::ASR17;
   }
-  if (Name.equals("dcr")) {
+  if (Name == "dcr") {
     RegKind = SparcOperand::rk_Special;
     return SP::ASR18;
   }
-  if (Name.equals("gsr")) {
+  if (Name == "gsr") {
     RegKind = SparcOperand::rk_Special;
     return SP::ASR19;
   }
-  if (Name.equals("softint")) {
+  if (Name == "softint") {
     RegKind = SparcOperand::rk_Special;
     return SP::ASR22;
   }
-  if (Name.equals("tick_cmpr")) {
+  if (Name == "tick_cmpr") {
     RegKind = SparcOperand::rk_Special;
     return SP::ASR23;
   }
-  if (Name.equals("stick") || Name.equals("sys_tick")) {
+  if (Name == "stick" || Name == "sys_tick") {
     RegKind = SparcOperand::rk_Special;
     return SP::ASR24;
   }
-  if (Name.equals("stick_cmpr") || Name.equals("sys_tick_cmpr")) {
+  if (Name == "stick_cmpr" || Name == "sys_tick_cmpr") {
     RegKind = SparcOperand::rk_Special;
     return SP::ASR25;
   }
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 2da4431cf077..2920c1f02a31 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -294,6 +294,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   // the atomic operations in order to exploit SystemZ instructions.
   setOperationAction(ISD::ATOMIC_LOAD,     MVT::i128, Custom);
   setOperationAction(ISD::ATOMIC_STORE,    MVT::i128, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD, MVT::f128, Custom);
+  setOperationAction(ISD::ATOMIC_STORE, MVT::f128, Custom);
 
   // Mark sign/zero extending atomic loads as legal, which will make
   // DAGCombiner fold extensions into atomic loads if possible.
@@ -935,17 +937,11 @@ bool SystemZTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const
 
 TargetLowering::AtomicExpansionKind
 SystemZTargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
-  // Lower fp128 the same way as i128.
-  if (LI->getType()->isFP128Ty())
-    return AtomicExpansionKind::CastToInteger;
   return AtomicExpansionKind::None;
 }
 
 TargetLowering::AtomicExpansionKind
 SystemZTargetLowering::shouldCastAtomicStoreInIR(StoreInst *SI) const {
-  // Lower fp128 the same way as i128.
-  if (SI->getValueOperand()->getType()->isFP128Ty())
-    return AtomicExpansionKind::CastToInteger;
   return AtomicExpansionKind::None;
 }
 
@@ -1555,6 +1551,8 @@ static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) {
     std::tie(Lo, Hi) = DAG.SplitScalar(In, DL, MVT::i64, MVT::i64);
   }
 
+  // FIXME: If v2i64 were a legal type, we could use it instead of
+  // Untyped here.  This might enable improved folding.
   SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, DL,
                                     MVT::Untyped, Hi, Lo);
   return SDValue(Pair, 0);
@@ -4550,7 +4548,9 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
 SDValue SystemZTargetLowering::lowerATOMIC_LDST_I128(SDValue Op,
                                                      SelectionDAG &DAG) const {
   auto *Node = cast<AtomicSDNode>(Op.getNode());
-  assert(Node->getMemoryVT() == MVT::i128 && "Only custom lowering i128.");
+  assert(
+      (Node->getMemoryVT() == MVT::i128 || Node->getMemoryVT() == MVT::f128) &&
+      "Only custom lowering i128 or f128.");
   // Use same code to handle both legal and non-legal i128 types.
   SmallVector<SDValue, 2> Results;
   LowerOperationWrapper(Node, Results, DAG);
@@ -6249,6 +6249,50 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
   }
 }
 
+static SDValue expandBitCastI128ToF128(SelectionDAG &DAG, SDValue Src,
+                                       const SDLoc &SL) {
+  // If i128 is legal, just use a normal bitcast.
+  if (DAG.getTargetLoweringInfo().isTypeLegal(MVT::i128))
+    return DAG.getBitcast(MVT::f128, Src);
+
+  // Otherwise, f128 must live in FP128, so do a partwise move.
+  assert(DAG.getTargetLoweringInfo().getRepRegClassFor(MVT::f128) ==
+         &SystemZ::FP128BitRegClass);
+
+  SDValue Hi, Lo;
+  std::tie(Lo, Hi) = DAG.SplitScalar(Src, SL, MVT::i64, MVT::i64);
+
+  Hi = DAG.getBitcast(MVT::f64, Hi);
+  Lo = DAG.getBitcast(MVT::f64, Lo);
+
+  SDNode *Pair = DAG.getMachineNode(
+      SystemZ::REG_SEQUENCE, SL, MVT::f128,
+      {DAG.getTargetConstant(SystemZ::FP128BitRegClassID, SL, MVT::i32), Lo,
+       DAG.getTargetConstant(SystemZ::subreg_l64, SL, MVT::i32), Hi,
+       DAG.getTargetConstant(SystemZ::subreg_h64, SL, MVT::i32)});
+  return SDValue(Pair, 0);
+}
+
+static SDValue expandBitCastF128ToI128(SelectionDAG &DAG, SDValue Src,
+                                       const SDLoc &SL) {
+  // If i128 is legal, just use a normal bitcast.
+  if (DAG.getTargetLoweringInfo().isTypeLegal(MVT::i128))
+    return DAG.getBitcast(MVT::i128, Src);
+
+  // Otherwise, f128 must live in FP128, so do a partwise move.
+  assert(DAG.getTargetLoweringInfo().getRepRegClassFor(MVT::f128) ==
+         &SystemZ::FP128BitRegClass);
+
+  SDValue LoFP =
+      DAG.getTargetExtractSubreg(SystemZ::subreg_l64, SL, MVT::f64, Src);
+  SDValue HiFP =
+      DAG.getTargetExtractSubreg(SystemZ::subreg_h64, SL, MVT::f64, Src);
+  SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i64, LoFP);
+  SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i64, HiFP);
+
+  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i128, Lo, Hi);
+}
+
 // Lower operations with invalid operand or result types (currently used
 // only for 128-bit integer types).
 void
@@ -6263,15 +6307,23 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
     SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_LOAD_128,
                                           DL, Tys, Ops, MVT::i128, MMO);
-    Results.push_back(lowerGR128ToI128(DAG, Res));
+
+    SDValue Lowered = lowerGR128ToI128(DAG, Res);
+    if (N->getValueType(0) == MVT::f128)
+      Lowered = expandBitCastI128ToF128(DAG, Lowered, DL);
+    Results.push_back(Lowered);
     Results.push_back(Res.getValue(1));
     break;
   }
   case ISD::ATOMIC_STORE: {
     SDLoc DL(N);
     SDVTList Tys = DAG.getVTList(MVT::Other);
-    SDValue Ops[] = {N->getOperand(0), lowerI128ToGR128(DAG, N->getOperand(1)),
-                     N->getOperand(2)};
+    SDValue Val = N->getOperand(1);
+    if (Val.getValueType() == MVT::f128)
+      Val = expandBitCastF128ToI128(DAG, Val, DL);
+    Val = lowerI128ToGR128(DAG, Val);
+
+    SDValue Ops[] = {N->getOperand(0), Val, N->getOperand(2)};
     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
     SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_STORE_128,
                                           DL, Tys, Ops, MVT::i128, MMO);
@@ -6306,24 +6358,7 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
     if (N->getValueType(0) == MVT::i128 && Src.getValueType() == MVT::f128 &&
         !useSoftFloat()) {
       SDLoc DL(N);
-      SDValue Lo, Hi;
-      if (getRepRegClassFor(MVT::f128) == &SystemZ::VR128BitRegClass) {
-        SDValue VecBC = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Src);
-        Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, VecBC,
-                         DAG.getConstant(1, DL, MVT::i32));
-        Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, VecBC,
-                         DAG.getConstant(0, DL, MVT::i32));
-      } else {
-        assert(getRepRegClassFor(MVT::f128) == &SystemZ::FP128BitRegClass &&
-               "Unrecognized register class for f128.");
-        SDValue LoFP = DAG.getTargetExtractSubreg(SystemZ::subreg_l64,
-                                                  DL, MVT::f64, Src);
-        SDValue HiFP = DAG.getTargetExtractSubreg(SystemZ::subreg_h64,
-                                                  DL, MVT::f64, Src);
-        Lo = DAG.getNode(ISD::BITCAST, DL, MVT::i64, LoFP);
-        Hi = DAG.getNode(ISD::BITCAST, DL, MVT::i64, HiFP);
-      }
-      Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi));
+      Results.push_back(expandBitCastF128ToI128(DAG, Src, DL));
     }
     break;
   }
@@ -6768,72 +6803,118 @@ SDValue SystemZTargetLowering::combineMERGE(
   return SDValue();
 }
 
+static bool isI128MovedToParts(LoadSDNode *LD, SDNode *&LoPart,
+                               SDNode *&HiPart) {
+  LoPart = HiPart = nullptr;
+
+  // Scan through all users.
+  for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
+       UI != UIEnd; ++UI) {
+    // Skip the uses of the chain.
+    if (UI.getUse().getResNo() != 0)
+      continue;
+
+    // Verify every user is a TRUNCATE to i64 of the low or high half.
+    SDNode *User = *UI;
+    bool IsLoPart = true;
+    if (User->getOpcode() == ISD::SRL &&
+        User->getOperand(1).getOpcode() == ISD::Constant &&
+        User->getConstantOperandVal(1) == 64 && User->hasOneUse()) {
+      User = *User->use_begin();
+      IsLoPart = false;
+    }
+    if (User->getOpcode() != ISD::TRUNCATE || User->getValueType(0) != MVT::i64)
+      return false;
+
+    if (IsLoPart) {
+      if (LoPart)
+        return false;
+      LoPart = User;
+    } else {
+      if (HiPart)
+        return false;
+      HiPart = User;
+    }
+  }
+  return true;
+}
+
+static bool isF128MovedToParts(LoadSDNode *LD, SDNode *&LoPart,
+                               SDNode *&HiPart) {
+  LoPart = HiPart = nullptr;
+
+  // Scan through all users.
+  for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
+       UI != UIEnd; ++UI) {
+    // Skip the uses of the chain.
+    if (UI.getUse().getResNo() != 0)
+      continue;
+
+    // Verify every user is an EXTRACT_SUBREG of the low or high half.
+    SDNode *User = *UI;
+    if (!User->hasOneUse() || !User->isMachineOpcode() ||
+        User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
+      return false;
+
+    switch (User->getConstantOperandVal(1)) {
+    case SystemZ::subreg_l64:
+      if (LoPart)
+        return false;
+      LoPart = User;
+      break;
+    case SystemZ::subreg_h64:
+      if (HiPart)
+        return false;
+      HiPart = User;
+      break;
+    default:
+      return false;
+    }
+  }
+  return true;
+}
+
 SDValue SystemZTargetLowering::combineLOAD(
     SDNode *N, DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   EVT LdVT = N->getValueType(0);
   SDLoc DL(N);
 
-  // Replace an i128 load that is used solely to move its value into GPRs
+  // Replace a 128-bit load that is used solely to move its value into GPRs
   // by separate loads of both halves.
-  if (LdVT == MVT::i128) {
-    LoadSDNode *LD = cast<LoadSDNode>(N);
-    if (!LD->isSimple() || !ISD::isNormalLoad(LD))
-      return SDValue();
-
-    // Scan through all users.
-    SmallVector<std::pair<SDNode *, int>, 2> Users;
-    int UsedElements = 0;
-    for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
-         UI != UIEnd; ++UI) {
-      // Skip the uses of the chain.
-      if (UI.getUse().getResNo() != 0)
-        continue;
-
-      // Verify every user is a TRUNCATE to i64 of the low or high half ...
-      SDNode *User = *UI;
-      int Index = 1;
-      if (User->getOpcode() == ISD::SRL &&
-          User->getOperand(1).getOpcode() == ISD::Constant &&
-          User->getConstantOperandVal(1) == 64 && User->hasOneUse()) {
-        User = *User->use_begin();
-        Index = 0;
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (LD->isSimple() && ISD::isNormalLoad(LD)) {
+    SDNode *LoPart, *HiPart;
+    if ((LdVT == MVT::i128 && isI128MovedToParts(LD, LoPart, HiPart)) ||
+        (LdVT == MVT::f128 && isF128MovedToParts(LD, LoPart, HiPart))) {
+      // Rewrite each extraction as an independent load.
+      SmallVector<SDValue, 2> ArgChains;
+      if (HiPart) {
+        SDValue EltLoad = DAG.getLoad(
+            HiPart->getValueType(0), DL, LD->getChain(), LD->getBasePtr(),
+            LD->getPointerInfo(), LD->getOriginalAlign(),
+            LD->getMemOperand()->getFlags(), LD->getAAInfo());
+
+        DCI.CombineTo(HiPart, EltLoad, true);
+        ArgChains.push_back(EltLoad.getValue(1));
+      }
+      if (LoPart) {
+        SDValue EltLoad = DAG.getLoad(
+            LoPart->getValueType(0), DL, LD->getChain(),
+            DAG.getObjectPtrOffset(DL, LD->getBasePtr(), TypeSize::getFixed(8)),
+            LD->getPointerInfo().getWithOffset(8), LD->getOriginalAlign(),
+            LD->getMemOperand()->getFlags(), LD->getAAInfo());
+
+        DCI.CombineTo(LoPart, EltLoad, true);
+        ArgChains.push_back(EltLoad.getValue(1));
       }
-      if (User->getOpcode() != ISD::TRUNCATE ||
-          User->getValueType(0) != MVT::i64)
-        return SDValue();
-
-      // ... and no half is extracted twice.
-      if (UsedElements & (1 << Index))
-        return SDValue();
-
-      UsedElements |= 1 << Index;
-      Users.push_back(std::make_pair(User, Index));
-    }
-
-    // Rewrite each extraction as an independent load.
-    SmallVector<SDValue, 2> ArgChains;
-    for (auto UserAndIndex : Users) {
-      SDNode *User = UserAndIndex.first;
-      unsigned Offset = User->getValueType(0).getStoreSize() * UserAndIndex.second;
-      SDValue Ptr =
-        DAG.getMemBasePlusOffset(LD->getBasePtr(), TypeSize::getFixed(Offset), DL);
-      SDValue EltLoad =
-        DAG.getLoad(User->getValueType(0), DL, LD->getChain(), Ptr,
-                    LD->getPointerInfo().getWithOffset(Offset),
-                    LD->getOriginalAlign(), LD->getMemOperand()->getFlags(),
-                    LD->getAAInfo());
 
-      DCI.CombineTo(User, EltLoad, true);
-      ArgChains.push_back(EltLoad.getValue(1));
+      // Collect all chains via TokenFactor.
+      SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, ArgChains);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
+      DCI.AddToWorklist(Chain.getNode());
+      return SDValue(N, 0);
     }
-
-    // Collect all chains via TokenFactor.
-    SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                                ArgChains);
-    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
-    DCI.AddToWorklist(Chain.getNode());
-    return SDValue(N, 0);
   }
 
   if (LdVT.isVector() || LdVT.isInteger())
@@ -6913,7 +6994,8 @@ static bool isOnlyUsedByStores(SDValue StoredVal, SelectionDAG &DAG) {
   return true;
 }
 
-static bool isMovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) {
+static bool isI128MovedFromParts(SDValue Val, SDValue &LoPart,
+                                 SDValue &HiPart) {
   if (Val.getOpcode() != ISD::OR || !Val.getNode()->hasOneUse())
     return false;
 
@@ -6940,6 +7022,23 @@ static bool isMovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) {
   return true;
 }
 
+static bool isF128MovedFromParts(SDValue Val, SDValue &LoPart,
+                                 SDValue &HiPart) {
+  if (!Val.getNode()->hasOneUse() || !Val.isMachineOpcode() ||
+      Val.getMachineOpcode() != TargetOpcode::REG_SEQUENCE)
+    return false;
+
+  if (Val->getNumOperands() != 5 ||
+      Val->getOperand(0)->getAsZExtVal() != SystemZ::FP128BitRegClassID ||
+      Val->getOperand(2)->getAsZExtVal() != SystemZ::subreg_l64 ||
+      Val->getOperand(4)->getAsZExtVal() != SystemZ::subreg_h64)
+    return false;
+
+  LoPart = Val->getOperand(1);
+  HiPart = Val->getOperand(3);
+  return true;
+}
+
 SDValue SystemZTargetLowering::combineSTORE(
     SDNode *N, DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -7009,10 +7108,11 @@ SDValue SystemZTargetLowering::combineSTORE(
                                      Ops, MemVT, SN->getMemOperand());
   }
 
-  // Transform a store of an i128 moved from GPRs into two separate stores.
-  if (MemVT == MVT::i128 && SN->isSimple() && ISD::isNormalStore(SN)) {
+  // Transform a store of a 128-bit value moved from parts into two stores.
+  if (SN->isSimple() && ISD::isNormalStore(SN)) {
     SDValue LoPart, HiPart;
-    if (isMovedFromParts(Op1, LoPart, HiPart)) {
+    if ((MemVT == MVT::i128 && isI128MovedFromParts(Op1, LoPart, HiPart)) ||
+        (MemVT == MVT::f128 && isF128MovedFromParts(Op1, LoPart, HiPart))) {
       SDLoc DL(SN);
       SDValue Chain0 =
         DAG.getStore(SN->getChain(), DL, HiPart, SN->getBasePtr(),
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index b3517fb0ea77..0a29b4f79c7d 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -640,6 +640,48 @@ bool SystemZInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                      Register Reg,
                                      MachineRegisterInfo *MRI) const {
   unsigned DefOpc = DefMI.getOpcode();
+
+  if (DefOpc == SystemZ::VGBM) {
+    int64_t ImmVal = DefMI.getOperand(1).getImm();
+    if (ImmVal != 0) // TODO: Handle other values
+      return false;
+
+    // Fold gr128 = COPY (vr128 VGBM imm)
+    //
+    // %tmp:gr64 = LGHI 0
+    // to  gr128 = REG_SEQUENCE %tmp, %tmp
+    assert(DefMI.getOperand(0).getReg() == Reg);
+
+    if (!UseMI.isCopy())
+      return false;
+
+    Register CopyDstReg = UseMI.getOperand(0).getReg();
+    if (CopyDstReg.isVirtual() &&
+        MRI->getRegClass(CopyDstReg) == &SystemZ::GR128BitRegClass &&
+        MRI->hasOneNonDBGUse(Reg)) {
+      // TODO: Handle physical registers
+      // TODO: Handle gr64 uses with subregister indexes
+      // TODO: Should this multi-use cases?
+      Register TmpReg = MRI->createVirtualRegister(&SystemZ::GR64BitRegClass);
+      MachineBasicBlock &MBB = *UseMI.getParent();
+
+      loadImmediate(MBB, UseMI.getIterator(), TmpReg, ImmVal);
+
+      UseMI.setDesc(get(SystemZ::REG_SEQUENCE));
+      UseMI.getOperand(1).setReg(TmpReg);
+      MachineInstrBuilder(*MBB.getParent(), &UseMI)
+          .addImm(SystemZ::subreg_h64)
+          .addReg(TmpReg)
+          .addImm(SystemZ::subreg_l64);
+
+      if (MRI->use_nodbg_empty(Reg))
+        DefMI.eraseFromParent();
+      return true;
+    }
+
+    return false;
+  }
+
   if (DefOpc != SystemZ::LHIMux && DefOpc != SystemZ::LHI &&
       DefOpc != SystemZ::LGHI)
     return false;
@@ -856,6 +898,22 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     return;
   }
 
+  if (SystemZ::FP128BitRegClass.contains(DestReg) &&
+      SystemZ::GR128BitRegClass.contains(SrcReg)) {
+    MCRegister DestRegHi = RI.getSubReg(DestReg, SystemZ::subreg_h64);
+    MCRegister DestRegLo = RI.getSubReg(DestReg, SystemZ::subreg_l64);
+    MCRegister SrcRegHi = RI.getSubReg(SrcReg, SystemZ::subreg_h64);
+    MCRegister SrcRegLo = RI.getSubReg(SrcReg, SystemZ::subreg_l64);
+
+    BuildMI(MBB, MBBI, DL, get(SystemZ::LDGR), DestRegHi)
+        .addReg(SrcRegHi)
+        .addReg(DestReg, RegState::ImplicitDefine);
+
+    BuildMI(MBB, MBBI, DL, get(SystemZ::LDGR), DestRegLo)
+        .addReg(SrcRegLo, getKillRegState(KillSrc));
+    return;
+  }
+
   // Move CC value from a GR32.
   if (DestReg == SystemZ::CC) {
     unsigned Opcode =
@@ -2085,8 +2143,8 @@ prepareCompareSwapOperands(MachineBasicBlock::iterator const MBBI) const {
 
 unsigned SystemZ::reverseCCMask(unsigned CCMask) {
   return ((CCMask & SystemZ::CCMASK_CMP_EQ) |
-          (CCMask & SystemZ::CCMASK_CMP_GT ? SystemZ::CCMASK_CMP_LT : 0) |
-          (CCMask & SystemZ::CCMASK_CMP_LT ? SystemZ::CCMASK_CMP_GT : 0) |
+          ((CCMask & SystemZ::CCMASK_CMP_GT) ? SystemZ::CCMASK_CMP_LT : 0) |
+          ((CCMask & SystemZ::CCMASK_CMP_LT) ? SystemZ::CCMASK_CMP_GT : 0) |
           (CCMask & SystemZ::CCMASK_CMP_UO));
 }
 
@@ -2221,3 +2279,16 @@ areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
 
   return false;
 }
+
+bool SystemZInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
+                                               const Register Reg,
+                                               int64_t &ImmVal) const {
+
+  if (MI.getOpcode() == SystemZ::VGBM && Reg == MI.getOperand(0).getReg()) {
+    ImmVal = MI.getOperand(1).getImm();
+    // TODO: Handle non-0 values
+    return ImmVal == 0;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index aa10fb564962..61338b081615 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -383,6 +383,9 @@ public:
   bool
   areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
                                   const MachineInstr &MIb) const override;
+
+  bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
+                               int64_t &ImmVal) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index c29c54a6cb79..c09f48891c13 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -1692,6 +1692,7 @@ let Predicates = [FeatureVector] in
 // Conversions
 //===----------------------------------------------------------------------===//
 
+let Predicates = [FeatureVector] in {
 def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
@@ -1755,6 +1756,7 @@ def : Pat<(i128  (bitconvert (v2i64 VR128:$src))), (i128  VR128:$src)>;
 def : Pat<(i128  (bitconvert (v4f32 VR128:$src))), (i128  VR128:$src)>;
 def : Pat<(i128  (bitconvert (v2f64 VR128:$src))), (i128  VR128:$src)>;
 def : Pat<(i128  (bitconvert (f128  VR128:$src))), (i128  VR128:$src)>;
+} // End Predicates = [FeatureVector]
 
 //===----------------------------------------------------------------------===//
 // Replicating scalars
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 15aeaaeb8c4a..d4e9fb057c44 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -206,6 +206,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
   WASM_LOAD_STORE(LOAD16_SPLAT)
   WASM_LOAD_STORE(LOAD_LANE_I16x8)
   WASM_LOAD_STORE(STORE_LANE_I16x8)
+  WASM_LOAD_STORE(LOAD_F16_F32)
+  WASM_LOAD_STORE(STORE_F16_F32)
   return 1;
   WASM_LOAD_STORE(LOAD_I32)
   WASM_LOAD_STORE(LOAD_F32)
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td
index f00974531209..0dd674426e9e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.td
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -22,40 +22,28 @@ include "llvm/Target/Target.td"
 // WebAssembly Subtarget features.
 //===----------------------------------------------------------------------===//
 
-def FeatureSIMD128 : SubtargetFeature<"simd128", "SIMDLevel", "SIMD128",
-                                      "Enable 128-bit SIMD">;
-
-def FeatureRelaxedSIMD : SubtargetFeature<"relaxed-simd", "SIMDLevel", "RelaxedSIMD",
-                                      "Enable relaxed-simd instructions">;
-
-def FeatureHalfPrecision : SubtargetFeature<"half-precision", "HasHalfPrecision", "true",
-                                            "Enable half precision instructions">;
-
 def FeatureAtomics : SubtargetFeature<"atomics", "HasAtomics", "true",
                                       "Enable Atomics">;
 
-def FeatureNontrappingFPToInt :
-      SubtargetFeature<"nontrapping-fptoint",
-                       "HasNontrappingFPToInt", "true",
-                       "Enable non-trapping float-to-int conversion operators">;
-
-def FeatureSignExt :
-      SubtargetFeature<"sign-ext",
-                       "HasSignExt", "true",
-                       "Enable sign extension operators">;
-
-def FeatureTailCall :
-      SubtargetFeature<"tail-call",
-                       "HasTailCall", "true",
-                       "Enable tail call instructions">;
+def FeatureBulkMemory :
+      SubtargetFeature<"bulk-memory", "HasBulkMemory", "true",
+                       "Enable bulk memory operations">;
 
 def FeatureExceptionHandling :
       SubtargetFeature<"exception-handling", "HasExceptionHandling", "true",
                        "Enable Wasm exception handling">;
 
-def FeatureBulkMemory :
-      SubtargetFeature<"bulk-memory", "HasBulkMemory", "true",
-                       "Enable bulk memory operations">;
+def FeatureExtendedConst :
+      SubtargetFeature<"extended-const", "HasExtendedConst", "true",
+                       "Enable extended const expressions">;
+
+def FeatureHalfPrecision :
+      SubtargetFeature<"half-precision", "HasHalfPrecision", "true",
+                       "Enable half precision instructions">;
+
+def FeatureMultiMemory :
+      SubtargetFeature<"multimemory", "HasMultiMemory", "true",
+                       "Enable multiple memories">;
 
 def FeatureMultivalue :
       SubtargetFeature<"multivalue",
@@ -66,17 +54,29 @@ def FeatureMutableGlobals :
       SubtargetFeature<"mutable-globals", "HasMutableGlobals", "true",
                        "Enable mutable globals">;
 
+def FeatureNontrappingFPToInt :
+      SubtargetFeature<"nontrapping-fptoint",
+                       "HasNontrappingFPToInt", "true",
+                       "Enable non-trapping float-to-int conversion operators">;
+
 def FeatureReferenceTypes :
       SubtargetFeature<"reference-types", "HasReferenceTypes", "true",
                        "Enable reference types">;
 
-def FeatureExtendedConst :
-      SubtargetFeature<"extended-const", "HasExtendedConst", "true",
-                       "Enable extended const expressions">;
+def FeatureRelaxedSIMD :
+      SubtargetFeature<"relaxed-simd", "SIMDLevel", "RelaxedSIMD",
+                       "Enable relaxed-simd instructions">;
 
-def FeatureMultiMemory :
-      SubtargetFeature<"multimemory", "HasMultiMemory", "true",
-                       "Enable multiple memories">;
+def FeatureSignExt :
+      SubtargetFeature<"sign-ext", "HasSignExt", "true",
+                       "Enable sign extension operators">;
+
+def FeatureSIMD128 : SubtargetFeature<"simd128", "SIMDLevel", "SIMD128",
+                                      "Enable 128-bit SIMD">;
+
+def FeatureTailCall :
+      SubtargetFeature<"tail-call", "HasTailCall", "true",
+                       "Enable tail call instructions">;
 
 //===----------------------------------------------------------------------===//
 // Architectures.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 64bcadf3f567..527bb4c9fbea 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -906,6 +906,22 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.align = Align(8);
     Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad;
     return true;
+  case Intrinsic::wasm_loadf16_f32:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::f16;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = Align(2);
+    Info.flags = MachineMemOperand::MOLoad;
+    return true;
+  case Intrinsic::wasm_storef16_f32:
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = MVT::f16;
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = 0;
+    Info.align = Align(2);
+    Info.flags = MachineMemOperand::MOStore;
+    return true;
   default:
     return false;
   }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index fb2ca532d252..c1a5a45395e8 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -22,66 +22,68 @@ def HasAddr32 : Predicate<"!Subtarget->hasAddr64()">;
 
 def HasAddr64 : Predicate<"Subtarget->hasAddr64()">;
 
-def HasSIMD128 :
-    Predicate<"Subtarget->hasSIMD128()">,
-    AssemblerPredicate<(all_of FeatureSIMD128), "simd128">;
+def HasAtomics :
+    Predicate<"Subtarget->hasAtomics()">,
+    AssemblerPredicate<(all_of FeatureAtomics), "atomics">;
 
-def HasRelaxedSIMD :
-    Predicate<"Subtarget->hasRelaxedSIMD()">,
-    AssemblerPredicate<(all_of FeatureRelaxedSIMD), "relaxed-simd">;
+def HasBulkMemory :
+    Predicate<"Subtarget->hasBulkMemory()">,
+    AssemblerPredicate<(all_of FeatureBulkMemory), "bulk-memory">;
+
+def HasExceptionHandling :
+    Predicate<"Subtarget->hasExceptionHandling()">,
+    AssemblerPredicate<(all_of FeatureExceptionHandling), "exception-handling">;
+
+def HasExtendedConst :
+    Predicate<"Subtarget->hasExtendedConst()">,
+    AssemblerPredicate<(all_of FeatureExtendedConst), "extended-const">;
 
 def HasHalfPrecision :
     Predicate<"Subtarget->hasHalfPrecision()">,
     AssemblerPredicate<(all_of FeatureHalfPrecision), "half-precision">;
 
-def HasAtomics :
-    Predicate<"Subtarget->hasAtomics()">,
-    AssemblerPredicate<(all_of FeatureAtomics), "atomics">;
+def HasMultiMemory :
+    Predicate<"Subtarget->hasMultiMemory()">,
+    AssemblerPredicate<(all_of FeatureMultiMemory), "multimemory">;
 
 def HasMultivalue :
     Predicate<"Subtarget->hasMultivalue()">,
     AssemblerPredicate<(all_of FeatureMultivalue), "multivalue">;
 
+def HasMutableGlobals:
+    Predicate<"Subtarget->hasMutableGlobals()">,
+    AssemblerPredicate<(all_of FeatureMutableGlobals), "mutable-globals">;
+
 def HasNontrappingFPToInt :
     Predicate<"Subtarget->hasNontrappingFPToInt()">,
-    AssemblerPredicate<(all_of FeatureNontrappingFPToInt), "nontrapping-fptoint">;
+    AssemblerPredicate<(all_of FeatureNontrappingFPToInt),
+                       "nontrapping-fptoint">;
 
 def NotHasNontrappingFPToInt :
     Predicate<"!Subtarget->hasNontrappingFPToInt()">,
-    AssemblerPredicate<(all_of (not FeatureNontrappingFPToInt)), "nontrapping-fptoint">;
+    AssemblerPredicate<(all_of (not FeatureNontrappingFPToInt)),
+                       "nontrapping-fptoint">;
+
+def HasReferenceTypes :
+    Predicate<"Subtarget->hasReferenceTypes()">,
+    AssemblerPredicate<(all_of FeatureReferenceTypes), "reference-types">;
+
+def HasRelaxedSIMD :
+    Predicate<"Subtarget->hasRelaxedSIMD()">,
+    AssemblerPredicate<(all_of FeatureRelaxedSIMD), "relaxed-simd">;
 
 def HasSignExt :
     Predicate<"Subtarget->hasSignExt()">,
     AssemblerPredicate<(all_of FeatureSignExt), "sign-ext">;
 
+def HasSIMD128 :
+    Predicate<"Subtarget->hasSIMD128()">,
+    AssemblerPredicate<(all_of FeatureSIMD128), "simd128">;
+
 def HasTailCall :
     Predicate<"Subtarget->hasTailCall()">,
     AssemblerPredicate<(all_of FeatureTailCall), "tail-call">;
 
-def HasExceptionHandling :
-    Predicate<"Subtarget->hasExceptionHandling()">,
-    AssemblerPredicate<(all_of FeatureExceptionHandling), "exception-handling">;
-
-def HasBulkMemory :
-    Predicate<"Subtarget->hasBulkMemory()">,
-    AssemblerPredicate<(all_of FeatureBulkMemory), "bulk-memory">;
-
-def HasReferenceTypes :
-    Predicate<"Subtarget->hasReferenceTypes()">,
-    AssemblerPredicate<(all_of FeatureReferenceTypes), "reference-types">;
-
-def HasExtendedConst :
-    Predicate<"Subtarget->hasExtendedConst()">,
-    AssemblerPredicate<(all_of FeatureExtendedConst), "extended-const">;
-
-def HasMultiMemory :
-    Predicate<"Subtarget->hasMultiMemory()">,
-    AssemblerPredicate<(all_of FeatureMultiMemory), "multimemory">;
-
-def HasMutableGlobals:
-    Predicate<"Subtarget->hasMutableGlobals()">,
-    AssemblerPredicate<(all_of FeatureMutableGlobals), "mutable-globals">;
-
 //===----------------------------------------------------------------------===//
 // WebAssembly-specific DAG Node Types.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index 01c0909af72e..9d452879bbf8 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -72,6 +72,10 @@ defm LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.load16_u", 0x33, []>;
 defm LOAD32_S_I64 : WebAssemblyLoad<I64, "i64.load32_s", 0x34, []>;
 defm LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.load32_u", 0x35, []>;
 
+// Half-precision load.
+defm LOAD_F16_F32 :
+  WebAssemblyLoad<F32, "f32.load_f16", 0xfc30, [HasHalfPrecision]>;
+
 // Pattern matching
 
 multiclass LoadPat<ValueType ty, SDPatternOperator kind, string Name> {
@@ -111,6 +115,8 @@ defm : LoadPat<i64, extloadi8, "LOAD8_U_I64">;
 defm : LoadPat<i64, extloadi16, "LOAD16_U_I64">;
 defm : LoadPat<i64, extloadi32, "LOAD32_U_I64">;
 
+defm : LoadPat<f32, int_wasm_loadf16_f32, "LOAD_F16_F32">;
+
 // Defines atomic and non-atomic stores, regular and truncating
 multiclass WebAssemblyStore<WebAssemblyRegClass rc, string Name, int Opcode,
                             list<Predicate> reqs = []> {
@@ -166,12 +172,18 @@ defm STORE8_I64 : WebAssemblyStore<I64, "i64.store8", 0x3c>;
 defm STORE16_I64 : WebAssemblyStore<I64, "i64.store16", 0x3d>;
 defm STORE32_I64 : WebAssemblyStore<I64, "i64.store32", 0x3e>;
 
+// Half-precision store.
+defm STORE_F16_F32 :
+  WebAssemblyStore<F32, "f32.store_f16", 0xfc31, [HasHalfPrecision]>;
+
 defm : StorePat<i32, truncstorei8, "STORE8_I32">;
 defm : StorePat<i32, truncstorei16, "STORE16_I32">;
 defm : StorePat<i64, truncstorei8, "STORE8_I64">;
 defm : StorePat<i64, truncstorei16, "STORE16_I64">;
 defm : StorePat<i64, truncstorei32, "STORE32_I64">;
 
+defm : StorePat<f32, int_wasm_storef16_f32, "STORE_F16_F32">;
+
 multiclass MemoryOps<WebAssemblyRegClass rc, string B> {
 // Current memory size.
 defm MEMORY_SIZE_A#B : I<(outs rc:$dst), (ins i32imm:$flags),
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
index cb4589961867..540da4b51cca 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -40,17 +40,17 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
   } SIMDLevel = NoSIMD;
 
   bool HasAtomics = false;
-  bool HasNontrappingFPToInt = false;
-  bool HasSignExt = false;
-  bool HasExceptionHandling = false;
   bool HasBulkMemory = false;
+  bool HasExceptionHandling = false;
+  bool HasExtendedConst = false;
+  bool HasHalfPrecision = false;
+  bool HasMultiMemory = false;
   bool HasMultivalue = false;
   bool HasMutableGlobals = false;
-  bool HasTailCall = false;
+  bool HasNontrappingFPToInt = false;
   bool HasReferenceTypes = false;
-  bool HasExtendedConst = false;
-  bool HasMultiMemory = false;
-  bool HasHalfPrecision = false;
+  bool HasSignExt = false;
+  bool HasTailCall = false;
 
   /// What processor and OS we're targeting.
   Triple TargetTriple;
@@ -92,20 +92,20 @@ public:
 
   // Predicates used by WebAssemblyInstrInfo.td.
   bool hasAddr64() const { return TargetTriple.isArch64Bit(); }
-  bool hasSIMD128() const { return SIMDLevel >= SIMD128; }
-  bool hasRelaxedSIMD() const { return SIMDLevel >= RelaxedSIMD; }
-  bool hasHalfPrecision() const { return HasHalfPrecision; }
   bool hasAtomics() const { return HasAtomics; }
-  bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; }
-  bool hasSignExt() const { return HasSignExt; }
-  bool hasExceptionHandling() const { return HasExceptionHandling; }
   bool hasBulkMemory() const { return HasBulkMemory; }
+  bool hasExceptionHandling() const { return HasExceptionHandling; }
+  bool hasExtendedConst() const { return HasExtendedConst; }
+  bool hasHalfPrecision() const { return HasHalfPrecision; }
+  bool hasMultiMemory() const { return HasMultiMemory; }
   bool hasMultivalue() const { return HasMultivalue; }
   bool hasMutableGlobals() const { return HasMutableGlobals; }
-  bool hasTailCall() const { return HasTailCall; }
+  bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; }
   bool hasReferenceTypes() const { return HasReferenceTypes; }
-  bool hasMultiMemory() const { return HasMultiMemory; }
-  bool hasExtendedConst() const { return HasExtendedConst; }
+  bool hasRelaxedSIMD() const { return SIMDLevel >= RelaxedSIMD; }
+  bool hasSignExt() const { return HasSignExt; }
+  bool hasSIMD128() const { return SIMDLevel >= SIMD128; }
+  bool hasTailCall() const { return HasTailCall; }
 
   /// Parses features string setting specified subtarget options. Definition of
   /// function is auto generated by tblgen.
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b05a036fb2f0..62b4a9278954 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -2296,7 +2296,7 @@ bool X86AsmParser::ParseRoundingModeOp(SMLoc Start, OperandVector &Operands) {
     Operands.push_back(X86Operand::CreateImm(RndModeOp, Start, End));
     return false;
   }
-  if(Tok.getIdentifier().equals("sae")){
+  if (Tok.getIdentifier() == "sae") {
     Parser.Lex();  // Eat the sae
     if (!getLexer().is(AsmToken::RCurly))
       return Error(Tok.getLoc(), "Expected } at this point");
@@ -2567,7 +2567,7 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
     .Default(0);
   if (Size) {
     const AsmToken &Tok = Lex(); // Eat operand size (e.g., byte, word).
-    if (!(Tok.getString().equals("PTR") || Tok.getString().equals("ptr")))
+    if (!(Tok.getString() == "PTR" || Tok.getString() == "ptr"))
       return Error(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!");
     Lex(); // Eat ptr.
   }
@@ -3802,7 +3802,7 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
     //    VFMULCPHZrr   Dest, Src1, Src2
     //    VFMULCPHZrrk  Dest, Dest, Mask, Src1, Src2
     //    VFMULCPHZrrkz Dest, Mask, Src1, Src2
-    for (unsigned i = TSFlags & X86II::EVEX_K ? 2 : 1;
+    for (unsigned i = ((TSFlags & X86II::EVEX_K) ? 2 : 1);
          i < Inst.getNumOperands(); i++)
       if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg())
         return Warning(Ops[0]->getStartLoc(), "Destination register should be "
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index a5859f98bae0..b4633b91bee3 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -980,7 +980,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
     break;
   case X86II::VEX:
     // VEX can be 2 byte or 3 byte, not determined yet if not explicit
-    Prefix.setLowerBound(MI.getFlags() & X86::IP_USE_VEX3 ? VEX3 : VEX2);
+    Prefix.setLowerBound((MI.getFlags() & X86::IP_USE_VEX3) ? VEX3 : VEX2);
     break;
   case X86II::EVEX:
     Prefix.setLowerBound(EVEX);
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 78bc043911f2..54642ecde18c 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -351,6 +351,8 @@ def FeatureNDD : SubtargetFeature<"ndd", "HasNDD", "true",
                                   "Support non-destructive destination">;
 def FeatureCCMP : SubtargetFeature<"ccmp", "HasCCMP", "true",
                                    "Support conditional cmp & test instructions">;
+def FeatureNF : SubtargetFeature<"nf", "HasNF", "true",
+                                 "Support status flags update suppression">;
 def FeatureCF : SubtargetFeature<"cf", "HasCF", "true",
                                  "Support conditional faulting">;
 
@@ -739,6 +741,10 @@ def TuningFastMOVBE
     : SubtargetFeature<"fast-movbe", "HasFastMOVBE", "true",
     "Prefer a movbe over a single-use load + bswap / single-use bswap + store">;
 
+def TuningFastImm16
+    : SubtargetFeature<"fast-imm16", "HasFastImm16", "true",
+    "Prefer a i16 instruction with i16 immediate over extension to i32">;
+
 def TuningUseSLMArithCosts
     : SubtargetFeature<"use-slm-arith-costs", "UseSLMArithCosts", "true",
         "Use Silvermont specific arithmetic costs">;
@@ -873,6 +879,7 @@ def ProcessorFeatures {
   // Nehalem
   list<SubtargetFeature> NHMFeatures = X86_64V2Features;
   list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
+                                      TuningSlowDivide64,
                                       TuningInsertVZEROUPPER,
                                       TuningNoDomainDelayMov];
 
@@ -1145,6 +1152,7 @@ def ProcessorFeatures {
                                        TuningSlowDivide32,
                                        TuningSlowDivide64,
                                        TuningSlowTwoMemOps,
+                                       TuningFastImm16,
                                        TuningLEAUsesAG,
                                        TuningPadShortFunctions,
                                        TuningInsertVZEROUPPER,
@@ -1165,6 +1173,7 @@ def ProcessorFeatures {
                                       TuningSlowPMULLD,
                                       TuningFast7ByteNOP,
                                       TuningFastMOVBE,
+                                      TuningFastImm16,
                                       TuningPOPCNTFalseDeps,
                                       TuningInsertVZEROUPPER,
                                       TuningNoDomainDelay];
@@ -1186,6 +1195,7 @@ def ProcessorFeatures {
                                       TuningSlowLEA,
                                       TuningSlowIncDec,
                                       TuningFastMOVBE,
+                                      TuningFastImm16,
                                       TuningPOPCNTFalseDeps,
                                       TuningInsertVZEROUPPER,
                                       TuningNoDomainDelay];
@@ -1200,6 +1210,7 @@ def ProcessorFeatures {
                                       TuningSlowLEA,
                                       TuningSlowIncDec,
                                       TuningFastMOVBE,
+                                      TuningFastImm16,
                                       TuningInsertVZEROUPPER,
                                       TuningNoDomainDelay];
   list<SubtargetFeature> GLPFeatures =
@@ -1320,6 +1331,7 @@ def ProcessorFeatures {
                                       TuningPreferMaskRegisters,
                                       TuningFastGather,
                                       TuningFastMOVBE,
+                                      TuningFastImm16,
                                       TuningSlowPMADDWD];
   // TODO Add AVX5124FMAPS/AVX5124VNNIW features
   list<SubtargetFeature> KNMFeatures =
@@ -1340,6 +1352,7 @@ def ProcessorFeatures {
                                               FeatureCMOV,
                                               FeatureX86_64];
   list<SubtargetFeature> BarcelonaTuning = [TuningFastScalarShiftMasks,
+                                            TuningSlowDivide64,
                                             TuningSlowSHLD,
                                             TuningSBBDepBreaking,
                                             TuningInsertVZEROUPPER];
@@ -1362,7 +1375,9 @@ def ProcessorFeatures {
   list<SubtargetFeature> BtVer1Tuning = [TuningFast15ByteNOP,
                                          TuningFastScalarShiftMasks,
                                          TuningFastVectorShiftMasks,
+                                         TuningSlowDivide64,
                                          TuningSlowSHLD,
+                                         TuningFastImm16,
                                          TuningSBBDepBreaking,
                                          TuningInsertVZEROUPPER];
 
@@ -1383,7 +1398,9 @@ def ProcessorFeatures {
                                          TuningFastScalarShiftMasks,
                                          TuningFastVectorShiftMasks,
                                          TuningFastMOVBE,
+                                         TuningFastImm16,
                                          TuningSBBDepBreaking,
+                                         TuningSlowDivide64,
                                          TuningSlowSHLD];
   list<SubtargetFeature> BtVer2Features =
     !listconcat(BtVer1Features, BtVer2AdditionalFeatures);
@@ -1408,6 +1425,7 @@ def ProcessorFeatures {
                                            FeatureLWP,
                                            FeatureLAHFSAHF64];
   list<SubtargetFeature> BdVer1Tuning = [TuningSlowSHLD,
+                                         TuningSlowDivide64,
                                          TuningFast11ByteNOP,
                                          TuningFastScalarShiftMasks,
                                          TuningBranchFusion,
@@ -1487,6 +1505,8 @@ def ProcessorFeatures {
                                      TuningFastScalarShiftMasks,
                                      TuningFastVariablePerLaneShuffle,
                                      TuningFastMOVBE,
+                                     TuningFastImm16,
+                                     TuningSlowDivide64,
                                      TuningSlowSHLD,
                                      TuningSBBDepBreaking,
                                      TuningInsertVZEROUPPER,
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 12178bcaf042..9ec68bfb8e0f 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -1063,11 +1063,13 @@ def CC_X86_64_Preserve_None : CallingConv<[
   //   - R10        'nest' parameter
   //   - RBX        base pointer
   //   - R16 - R31  these are not available everywhere
-  CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D,
-	                         R11D, R12D, R13D, R14D, R15D, EAX]>>,
+  // Use non-volatile registers first, so functions using this convention can
+  // call "normal" functions without saving and restoring incoming values:
+  CCIfType<[i32], CCAssignToReg<[R12D, R13D, R14D, R15D, EDI, ESI,
+                                 EDX, ECX, R8D, R9D, R11D, EAX]>>,
 
-  CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8, R9,
-                                 R11, R12, R13, R14, R15, RAX]>>,
+  CCIfType<[i64], CCAssignToReg<[R12, R13, R14, R15, RDI, RSI,
+                                 RDX, RCX, R8, R9, R11, RAX]>>,
 
   // Otherwise it's the same as the regular C calling convention.
   CCDelegateTo<CC_X86_64_C>
diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index bf8588ad6dee..db1d21b59a7b 100644
--- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -248,7 +248,7 @@ Register FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI) const {
   //   Predecessors according to CFG: %bb.2 %bb.1
   //   %ax = KILL %ax, implicit killed %eax
   //   RET 0, %ax
-  unsigned Opc = OrigMI->getOpcode(); (void)Opc;
+  unsigned Opc = OrigMI->getOpcode();
   // These are the opcodes currently known to work with the code below, if
   // something // else will be added we need to ensure that new opcode has the
   // same properties.
@@ -261,8 +261,6 @@ Register FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI) const {
     if (!MO.isReg())
       continue;
 
-    assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!");
-
     if (MO.isDef() && TRI->isSuperRegisterEq(OrigDestReg, MO.getReg()))
       IsDefined = true;
 
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 14c62893766a..ea3b84d0ca9e 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1560,7 +1560,9 @@ void X86DAGToDAGISel::PostprocessISelDAG() {
       SDValue And = N->getOperand(0);
       unsigned N0Opc = And.getMachineOpcode();
       if ((N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
-           N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) &&
+           N0Opc == X86::AND32rr || N0Opc == X86::AND64rr ||
+           N0Opc == X86::AND8rr_ND || N0Opc == X86::AND16rr_ND ||
+           N0Opc == X86::AND32rr_ND || N0Opc == X86::AND64rr_ND) &&
           !And->hasAnyUseOfValue(1)) {
         MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
                                                      MVT::i32,
@@ -1571,15 +1573,25 @@ void X86DAGToDAGISel::PostprocessISelDAG() {
         continue;
       }
       if ((N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
-           N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) &&
+           N0Opc == X86::AND32rm || N0Opc == X86::AND64rm ||
+           N0Opc == X86::AND8rm_ND || N0Opc == X86::AND16rm_ND ||
+           N0Opc == X86::AND32rm_ND || N0Opc == X86::AND64rm_ND) &&
           !And->hasAnyUseOfValue(1)) {
         unsigned NewOpc;
+#define CASE_ND(OP)                                                            \
+  case X86::OP:                                                                \
+  case X86::OP##_ND:
+#define FROM_TO(A, B)                                                          \
+  CASE_ND(A) NewOpc = X86::B;                                                  \
+  break;
         switch (N0Opc) {
-        case X86::AND8rm:  NewOpc = X86::TEST8mr; break;
-        case X86::AND16rm: NewOpc = X86::TEST16mr; break;
-        case X86::AND32rm: NewOpc = X86::TEST32mr; break;
-        case X86::AND64rm: NewOpc = X86::TEST64mr; break;
+          FROM_TO(AND8rm, TEST8mr);
+          FROM_TO(AND16rm, TEST16mr);
+          FROM_TO(AND32rm, TEST32mr);
+          FROM_TO(AND64rm, TEST64mr);
         }
+#undef FROM_TO
+#undef CASE_ND
 
         // Need to swap the memory and register operand.
         SDValue Ops[] = { And.getOperand(1),
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6a5fc3c53146..ecc5b3b3bf84 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1092,6 +1092,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
     setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);
 
+    setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
+
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
       setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
       setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
@@ -1431,6 +1433,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FMINIMUM,          VT, Custom);
     }
 
+    setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
+    setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
+
     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
     // even though v8i16 is a legal type.
     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i16, MVT::v8i32);
@@ -1731,6 +1736,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   }
+  if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
+    for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+      setOperationAction(ISD::LRINT, VT, Legal);
+      setOperationAction(ISD::LLRINT, VT, Legal);
+    }
+  }
 
   // This block controls legalization for 512-bit operations with 8/16/32/64 bit
   // elements. 512-bits can be disabled based on prefer-vector-width and
@@ -1765,6 +1776,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::STRICT_FMA, VT, Legal);
       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
     }
+    setOperationAction(ISD::LRINT, MVT::v16f32,
+                       Subtarget.hasDQI() ? Legal : Custom);
+    setOperationAction(ISD::LRINT, MVT::v8f64,
+                       Subtarget.hasDQI() ? Legal : Custom);
+    if (Subtarget.hasDQI())
+      setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
 
     for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
       setOperationPromotedToType(ISD::FP_TO_SINT       , VT, MVT::v16i32);
@@ -2488,6 +2505,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::FMAXNUM,
                        ISD::SUB,
                        ISD::LOAD,
+                       ISD::LRINT,
+                       ISD::LLRINT,
                        ISD::MLOAD,
                        ISD::STORE,
                        ISD::MSTORE,
@@ -3558,6 +3577,16 @@ static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
       Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
 }
 
+/// Return true if every element in Mask, is an in-place blend/select mask or is
+/// undef.
+LLVM_ATTRIBUTE_UNUSED static bool isBlendOrUndef(ArrayRef<int> Mask) {
+  unsigned NumElts = Mask.size();
+  for (auto [I, M] : enumerate(Mask))
+    if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
+      return false;
+  return true;
+}
+
 /// Return true if every element in Mask, beginning
 /// from position Pos and ending in Pos + Size, falls within the specified
 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
@@ -3709,6 +3738,11 @@ static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
   return false;
 }
 
+static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
+  SmallVector<int, 32> ScaledMask;
+  return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
+}
+
 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
   return isNullConstant(Elt) || isNullFPConstant(Elt);
@@ -7395,7 +7429,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
     if (ScalarSize == 32 ||
         (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
-        CVT == MVT::f16 ||
+        (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
         (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
       const Constant *C = nullptr;
       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
@@ -18525,7 +18559,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
 SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
                                                  bool ForCall) const {
   // Unpack the global address or external symbol.
-  const SDLoc &dl = SDLoc(Op);
+  SDLoc dl(Op);
   const GlobalValue *GV = nullptr;
   int64_t Offset = 0;
   const char *ExternalSym = nullptr;
@@ -20079,7 +20113,7 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                          DAG.getVTList(MVT::Other),
                                          Ops, DstTy, MMO);
 
-  SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
+  SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
   Chain = Res.getValue(1);
 
   // If we need an unsigned fixup, XOR the result with adjust.
@@ -20557,14 +20591,12 @@ static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
   return SDValue();
 }
 
-static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
+static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
+                                  SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
-
-  SDLoc DL(Op);
   MVT VT = Op.getSimpleValueType();
   SDValue In = Op.getOperand(0);
   MVT InVT = In.getSimpleValueType();
-
   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
 
   // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
@@ -20683,7 +20715,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   if (VT.getVectorElementType() == MVT::i1)
-    return LowerTruncateVecI1(Op, DAG, Subtarget);
+    return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
 
   // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
   // concat from subvectors to use VPTRUNC etc.
@@ -21139,8 +21171,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
       LC = RTLIB::getFPTOUINT(SrcVT, VT);
 
     MakeLibCallOptions CallOptions;
-    std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
-                                                  SDLoc(Op), Chain);
+    std::pair<SDValue, SDValue> Tmp =
+        makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
 
     if (IsStrict)
       return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
@@ -21161,8 +21193,12 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
 SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
                                              SelectionDAG &DAG) const {
   SDValue Src = Op.getOperand(0);
+  EVT DstVT = Op.getSimpleValueType();
   MVT SrcVT = Src.getSimpleValueType();
 
+  if (SrcVT.isVector())
+    return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
+
   if (SrcVT == MVT::f16)
     return SDValue();
 
@@ -22657,7 +22693,7 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
 
   // Only promote the compare up to I32 if it is a 16 bit operation
   // with an immediate.  16 bit immediates are to be avoided.
-  if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
+  if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
       !DAG.getMachineFunction().getFunction().hasMinSize()) {
     ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
     ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
@@ -22685,10 +22721,10 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
   }
 
   // Try to shrink i64 compares if the input has enough zero bits.
-  // FIXME: Do this for non-constant compares for constant on LHS?
-  if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
+  // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)?
+  if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
       Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
-      Op1->getAsAPIntVal().getActiveBits() <= 32 &&
+      DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
       DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
     CmpVT = MVT::i32;
     Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
@@ -23068,14 +23104,12 @@ static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
                      DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
 }
 
-static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
-
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
+                                     SelectionDAG &DAG) {
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
   SDValue CC = Op.getOperand(2);
   MVT VT = Op.getSimpleValueType();
-  SDLoc dl(Op);
-
   assert(VT.getVectorElementType() == MVT::i1 &&
          "Cannot set masked compare for this operation");
 
@@ -23351,7 +23385,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
     // But there is no compare instruction for i8 and i16 elements in KNL.
     assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
            "Unexpected operand type");
-    return LowerIntVSETCC_AVX512(Op, DAG);
+    return LowerIntVSETCC_AVX512(Op, dl, DAG);
   }
 
   // Lower using XOP integer comparisons.
@@ -28989,6 +29023,29 @@ SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
   return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
 }
 
+// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
+uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
+  assert((Amt < 8) && "Shift/Rotation amount out of range");
+  switch (Opcode) {
+  case ISD::BITREVERSE:
+    return 0x8040201008040201ULL;
+  case ISD::SHL:
+    return ((0x0102040810204080ULL >> (Amt)) &
+            (0x0101010101010101ULL * (0xFF >> (Amt))));
+  case ISD::SRL:
+    return ((0x0102040810204080ULL << (Amt)) &
+            (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
+  case ISD::SRA:
+    return (getGFNICtrlImm(ISD::SRL, Amt) |
+            (0x8080808080808080ULL >> (64 - (8 * Amt))));
+  case ISD::ROTL:
+    return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
+  case ISD::ROTR:
+    return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
+  }
+  llvm_unreachable("Unsupported GFNI opcode");
+}
+
 // Return true if the required (according to Opcode) shift-imm form is natively
 // supported by the Subtarget
 static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
@@ -29176,6 +29233,14 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
     if (VT == MVT::v16i8 && Subtarget.hasXOP())
       return SDValue();
 
+    if (Subtarget.hasGFNI()) {
+      uint64_t ShiftMask = getGFNICtrlImm(Op.getOpcode(), ShiftAmt);
+      MVT MaskVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
+      SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(ShiftMask, dl, MaskVT));
+      return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
+                         DAG.getTargetConstant(0, dl, MVT::i8));
+    }
+
     if (Op.getOpcode() == ISD::SHL) {
       // Make a large shift.
       SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
@@ -29859,13 +29924,15 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
       uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
       uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
       assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
+      MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
 
-      if (EltSizeInBits == 8 && ShXAmt > 1 &&
-          (Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT))) {
+      if (EltSizeInBits == 8 &&
+          (Subtarget.hasXOP() ||
+           (useVPTERNLOG(Subtarget, VT) &&
+            supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
         // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
         // bit-select - lower using vXi16 shifts and then perform the bitmask at
         // the original vector width to handle cases where we split.
-        MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
         APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
         APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
         SDValue ShX =
@@ -30035,7 +30102,9 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
     return R;
 
   // AVX512 implicitly uses modulo rotation amounts.
-  if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
+  if ((Subtarget.hasVLX() ||
+       (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
+      32 <= EltSizeInBits) {
     // Attempt to rotate by immediate.
     if (IsCstSplat) {
       unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
@@ -30068,6 +30137,17 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
                          DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
   }
 
+  // Attempt to use GFNI gf2p8affine to rotate vXi8 by an uniform constant.
+  if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
+      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+    uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
+    uint64_t RotMask = getGFNICtrlImm(Opcode, RotAmt);
+    MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+    SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(RotMask, DL, MaskVT));
+    return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
+                       DAG.getTargetConstant(0, DL, MVT::i8));
+  }
+
   // Split 256-bit integers on XOP/pre-AVX2 targets.
   if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
     return splitVectorIntBinary(Op, DAG, DL);
@@ -31391,7 +31471,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
   if (Subtarget.hasGFNI()) {
     MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
-    SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
+    SDValue Matrix =
+        DAG.getConstant(getGFNICtrlImm(ISD::BITREVERSE), DL, MatrixVT);
     Matrix = DAG.getBitcast(VT, Matrix);
     return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
                        DAG.getTargetConstant(0, DL, MVT::i8));
@@ -39996,6 +40077,101 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
   return SDValue();
 }
 
+// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
+// iff we don't demand the same element index for both X and Y.
+static SDValue
+combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
+                       const APInt &DemandedElts, SelectionDAG &DAG,
+                       const X86Subtarget &Subtarget, const SDLoc &DL) {
+  assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
+  if (!N0.hasOneUse() || !N1.hasOneUse())
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  SDValue BC0 = peekThroughOneUseBitcasts(N0);
+  SDValue BC1 = peekThroughOneUseBitcasts(N1);
+
+  // See if both operands are shuffles, and that we can scale the shuffle masks
+  // to the same width as the blend mask.
+  // TODO: Support SM_SentinelZero?
+  SmallVector<SDValue, 2> Ops0, Ops1;
+  SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
+  if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
+      !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
+      !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
+      !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
+    return SDValue();
+
+  // Determine the demanded elts from both permutes.
+  APInt Demanded0, DemandedLHS0, DemandedRHS0;
+  APInt Demanded1, DemandedLHS1, DemandedRHS1;
+  if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
+                              Demanded1,
+                              /*AllowUndefElts=*/true) ||
+      !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
+                              DemandedRHS0, /*AllowUndefElts=*/true) ||
+      !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
+                              DemandedRHS1, /*AllowUndefElts=*/true))
+    return SDValue();
+
+  // Confirm that we only use a single operand from both permutes and that we
+  // don't demand the same index from both.
+  if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
+      DemandedLHS0.intersects(DemandedLHS1))
+    return SDValue();
+
+  // Use the permute demanded elts masks as the new blend mask.
+  // Create the new permute mask as a blend of the 2 original permute masks.
+  SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
+  SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
+  for (unsigned I = 0; I != NumElts; ++I) {
+    if (Demanded0[I]) {
+      int M = ScaledMask0[I];
+      if (0 <= M) {
+        assert(isUndefOrEqual(NewBlendMask[M], M) &&
+               "BlendMask demands LHS AND RHS");
+        NewBlendMask[M] = M;
+        NewPermuteMask[I] = M;
+      }
+    } else if (Demanded1[I]) {
+      int M = ScaledMask1[I];
+      if (0 <= M) {
+        assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
+               "BlendMask demands LHS AND RHS");
+        NewBlendMask[M] = M + NumElts;
+        NewPermuteMask[I] = M;
+      }
+    }
+  }
+  assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
+  assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
+
+  // v16i16 shuffles can explode in complexity very easily, only accept them if
+  // the blend mask is the same in the 128-bit subvectors (or can widen to
+  // v8i32) and the permute can be widened as well.
+  if (VT == MVT::v16i16) {
+    if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
+        !canWidenShuffleElements(NewBlendMask))
+      return SDValue();
+    if (!canWidenShuffleElements(NewPermuteMask))
+      return SDValue();
+  }
+
+  // Don't introduce lane-crossing permutes without AVX2, unless it can be
+  // widened to a lane permute (vperm2f128).
+  if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
+      isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
+                                NewPermuteMask) &&
+      !canScaleShuffleElements(NewPermuteMask, 2))
+    return SDValue();
+
+  SDValue NewBlend =
+      DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
+                           DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
+  return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
+                              NewPermuteMask);
+}
+
 // TODO - move this to TLI like isBinOp?
 static bool isUnaryOp(unsigned Opcode) {
   switch (Opcode) {
@@ -41748,6 +41924,15 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     KnownUndef = SrcUndef.zextOrTrunc(NumElts);
     break;
   }
+  case X86ISD::BLENDI: {
+    SmallVector<int, 16> BlendMask;
+    DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
+    if (SDValue R = combineBlendOfPermutes(
+            VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
+            DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
+      return TLO.CombineTo(Op, R);
+    break;
+  }
   case X86ISD::BLENDV: {
     APInt SelUndef, SelZero;
     if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
@@ -42813,6 +42998,8 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
     [[fallthrough]];
   case ISD::SETCC:
     return Src.getOperand(0).getValueSizeInBits() == Size;
+  case ISD::FREEZE:
+    return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate);
   case ISD::AND:
   case ISD::XOR:
   case ISD::OR:
@@ -42882,6 +43069,7 @@ static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
                                           SDValue Src, const SDLoc &DL) {
   switch (Src.getOpcode()) {
   case ISD::SETCC:
+  case ISD::FREEZE:
   case ISD::TRUNCATE:
   case ISD::BUILD_VECTOR:
     return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
@@ -46500,14 +46688,14 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
   // To address this, we check that we can scale the shuffle mask to MOVMSK
   // element width (this will ensure "high" elements match). Its slightly overly
   // conservative, but fine for an edge case fold.
-  SmallVector<int, 32> ShuffleMask, ScaledMaskUnused;
+  SmallVector<int, 32> ShuffleMask;
   SmallVector<SDValue, 2> ShuffleInputs;
   if (NumElts <= CmpBits &&
       getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
                              ShuffleMask, DAG) &&
       ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
       ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
-      scaleShuffleElements(ShuffleMask, NumElts, ScaledMaskUnused)) {
+      canScaleShuffleElements(ShuffleMask, NumElts)) {
     SDLoc DL(EFLAGS);
     SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
     Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
@@ -51542,6 +51730,22 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
+                                   const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  SDLoc DL(N);
+
+  if (!Subtarget.hasDQI() || !Subtarget.hasVLX() || VT != MVT::v2i64 ||
+      SrcVT != MVT::v2f32)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
+                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
+                                 DAG.getUNDEF(SrcVT)));
+}
+
 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
 /// the codegen.
 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
@@ -51888,6 +52092,11 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
   }
 
+  // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
+  if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
+      Src.hasOneUse())
+    return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
+
   return SDValue();
 }
 
@@ -56834,6 +57043,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::UINT_TO_FP:
   case ISD::STRICT_UINT_TO_FP:
     return combineUIntToFP(N, DAG, Subtarget);
+  case ISD::LRINT:
+  case ISD::LLRINT:         return combineLRINT_LLRINT(N, DAG, Subtarget);
   case ISD::FADD:
   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
   case X86ISD::VFCMULC:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e348ba6e8ac0..ade54f73bff0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1632,10 +1632,8 @@ namespace llvm {
     /// Check whether the call is eligible for tail call optimization. Targets
     /// that want to do tail call optimization should implement this function.
     bool IsEligibleForTailCallOptimization(
-        SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleeStackStructRet,
-        bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
-        const SmallVectorImpl<SDValue> &OutVals,
-        const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+        TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
+        SmallVectorImpl<CCValAssign> &ArgLocs, bool IsCalleePopSRet) const;
     SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
                                     SDValue Chain, bool IsTailCall,
                                     bool Is64Bit, int FPDiff,
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 1f76f7451033..b107d56f8cf9 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2021,6 +2021,22 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   if (CallConv == CallingConv::X86_INTR)
     report_fatal_error("X86 interrupts may not be called directly");
 
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+
+  // Allocate shadow area for Win64.
+  if (IsWin64)
+    CCInfo.AllocateStack(32, Align(8));
+
+  CCInfo.AnalyzeArguments(Outs, CC_X86);
+
+  // In vectorcall calling convention a second pass is required for the HVA
+  // types.
+  if (CallingConv::X86_VectorCall == CallConv) {
+    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
+  }
+
   bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
   if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
     // If we are using a GOT, disable tail calls to external symbols with
@@ -2036,9 +2052,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   if (isTailCall && !IsMustTail) {
     // Check if it's really possible to do a tail call.
-    isTailCall = IsEligibleForTailCallOptimization(
-        Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
-        Ins, DAG);
+    isTailCall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
+                                                   IsCalleePopSRet);
 
     // Sibcalls are automatically detected tailcalls which do not require
     // ABI changes.
@@ -2056,22 +2071,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
          "Var args not supported with calling convention fastcc, ghc or hipe");
 
-  // Analyze operands of the call, assigning locations to each operand.
-  SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
-
-  // Allocate shadow area for Win64.
-  if (IsWin64)
-    CCInfo.AllocateStack(32, Align(8));
-
-  CCInfo.AnalyzeArguments(Outs, CC_X86);
-
-  // In vectorcall calling convention a second pass is required for the HVA
-  // types.
-  if (CallingConv::X86_VectorCall == CallConv) {
-    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
-  }
-
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
   if (IsSibcall)
@@ -2723,11 +2722,20 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
 
 /// Check whether the call is eligible for tail call optimization. Targets
 /// that want to do tail call optimization should implement this function.
+/// Note that the x86 backend does not check musttail calls for eligibility! The
+/// rest of x86 tail call lowering must be prepared to forward arguments of any
+/// type.
 bool X86TargetLowering::IsEligibleForTailCallOptimization(
-    SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
-    bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
-    const SmallVectorImpl<SDValue> &OutVals,
-    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+    TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
+    SmallVectorImpl<CCValAssign> &ArgLocs, bool IsCalleePopSRet) const {
+  SelectionDAG &DAG = CLI.DAG;
+  const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+  SDValue Callee = CLI.Callee;
+  CallingConv::ID CalleeCC = CLI.CallConv;
+  bool isVarArg = CLI.IsVarArg;
+
   if (!mayTailCallThisCC(CalleeCC))
     return false;
 
@@ -2738,7 +2746,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
   // If the function return type is x86_fp80 and the callee return type is not,
   // then the FP_EXTEND of the call result is not a nop. It's not safe to
   // perform a tailcall optimization here.
-  if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
+  if (CallerF.getReturnType()->isX86_FP80Ty() && !CLI.RetTy->isX86_FP80Ty())
     return false;
 
   CallingConv::ID CallerCC = CallerF.getCallingConv();
@@ -2791,9 +2799,6 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
     if (IsCalleeWin64 || IsCallerWin64)
       return false;
 
-    SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
-    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
     for (const auto &VA : ArgLocs)
       if (!VA.isRegLoc())
         return false;
@@ -2811,8 +2816,8 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
   }
   if (Unused) {
     SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
-    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+    CCState RVCCInfo(CalleeCC, false, MF, RVLocs, C);
+    RVCCInfo.AnalyzeCallResult(Ins, RetCC_X86);
     for (const auto &VA : RVLocs) {
       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
         return false;
@@ -2832,24 +2837,12 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
       return false;
   }
 
-  unsigned StackArgsSize = 0;
+  unsigned StackArgsSize = CCInfo.getStackSize();
 
   // If the callee takes no arguments then go on to check the results of the
   // call.
   if (!Outs.empty()) {
-    // Check if stack adjustment is needed. For now, do not do this if any
-    // argument is passed on the stack.
-    SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
-
-    // Allocate shadow area for Win64
-    if (IsCalleeWin64)
-      CCInfo.AllocateStack(32, Align(8));
-
-    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
-    StackArgsSize = CCInfo.getStackSize();
-
-    if (CCInfo.getStackSize()) {
+    if (StackArgsSize > 0) {
       // Check if the arguments are already laid out in the right way as
       // the caller's fixed stack objects.
       MachineFrameInfo &MFI = MF.getFrameInfo();
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index e46fc034cc26..8e75e185f0f6 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -26,20 +26,21 @@ using namespace llvm;
 
 /// Return a constant boolean vector that has true elements in all positions
 /// where the input constant data vector has an element with the sign bit set.
-static Constant *getNegativeIsTrueBoolVec(Constant *V) {
+static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
   VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
   V = ConstantExpr::getBitCast(V, IntTy);
-  V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
-                            V);
+  V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,
+                                      Constant::getNullValue(IntTy), V, DL);
+  assert(V && "Vector must be foldable");
   return V;
 }
 
 /// Convert the x86 XMM integer vector mask to a vector of bools based on
 /// each element's most significant bit (the sign bit).
-static Value *getBoolVecFromMask(Value *Mask) {
+static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
   // Fold Constant Mask.
   if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
-    return getNegativeIsTrueBoolVec(ConstantMask);
+    return getNegativeIsTrueBoolVec(ConstantMask, DL);
 
   // Mask was extended from a boolean vector.
   Value *ExtMask;
@@ -65,7 +66,7 @@ static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
 
   // The mask is constant or extended from a bool vector. Convert this x86
   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
-  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
+  if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
     // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
     // the LLVM intrinsic definition for the pointer argument.
     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
@@ -102,7 +103,7 @@ static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
 
   // The mask is constant or extended from a bool vector. Convert this x86
   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
-  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
+  if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
     PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
@@ -2688,7 +2689,8 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
     // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
     if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
-      Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
+      Constant *NewSelector =
+          getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
     }
 
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 43a40f5e691e..0723328d40e3 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -826,7 +826,7 @@ defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v32bf16_info, v16bf16x_info,
 
 // A 128-bit extract from bits [255:128] of a 512-bit vector should use a
 // smaller extract to enable EVEX->VEX.
-let Predicates = [NoVLX] in {
+let Predicates = [NoVLX, HasEVEX512] in {
 def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))),
           (v2i64 (VEXTRACTI128rr
                   (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)),
@@ -3080,7 +3080,7 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
            addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>;
 }
 
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>;
   defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>;
 
@@ -3111,7 +3111,7 @@ let Predicates = [HasAVX512, NoVLX] in {
   defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v2f64x_info, v8f64_info>;
 }
 
-let Predicates = [HasBWI, NoVLX] in {
+let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
   defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>;
   defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>;
 
@@ -3505,7 +3505,7 @@ multiclass mask_move_lowering<string InstrStr, X86VectorVTInfo Narrow,
 
 // Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't
 // available. Use a 512-bit operation and extract.
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   defm : mask_move_lowering<"VMOVAPSZ", v4f32x_info, v16f32_info>;
   defm : mask_move_lowering<"VMOVDQA32Z", v4i32x_info, v16i32_info>;
   defm : mask_move_lowering<"VMOVAPSZ", v8f32x_info, v16f32_info>;
@@ -3517,7 +3517,7 @@ let Predicates = [HasAVX512, NoVLX] in {
   defm : mask_move_lowering<"VMOVDQA64Z", v4i64x_info, v8i64_info>;
 }
 
-let Predicates = [HasBWI, NoVLX] in {
+let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
   defm : mask_move_lowering<"VMOVDQU8Z", v16i8x_info, v64i8_info>;
   defm : mask_move_lowering<"VMOVDQU8Z", v32i8x_info, v64i8_info>;
 
@@ -5010,8 +5010,8 @@ defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin,
 defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin,
                                     SchedWriteVecALU, HasAVX512, 1>, T8;
 
-// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX.
-let Predicates = [HasDQI, NoVLX] in {
+// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX, HasEVEX512.
+let Predicates = [HasDQI, NoVLX, HasEVEX512] in {
   def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
             (EXTRACT_SUBREG
                 (VPMULLQZrr
@@ -5067,7 +5067,7 @@ multiclass avx512_min_max_lowering<string Instr, SDNode OpNode> {
              sub_xmm)>;
 }
 
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   defm : avx512_min_max_lowering<"VPMAXUQZ", umax>;
   defm : avx512_min_max_lowering<"VPMINUQZ", umin>;
   defm : avx512_min_max_lowering<"VPMAXSQZ", smax>;
@@ -6044,7 +6044,7 @@ defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl,
                                 SchedWriteVecShift>;
 
 // Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))),
             (EXTRACT_SUBREG (v8i64
               (VPSRAQZrr
@@ -6173,14 +6173,14 @@ defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", X86vsrlv, SchedWriteVarVecS
 defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>;
 defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>;
 
-defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX, HasEVEX512]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX, HasEVEX512]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX, HasEVEX512]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX, HasEVEX512]>;
 
 
 // Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
             (EXTRACT_SUBREG (v8i64
               (VPROLVQZrr
@@ -6231,7 +6231,7 @@ let Predicates = [HasAVX512, NoVLX] in {
 }
 
 // Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
             (EXTRACT_SUBREG (v8i64
               (VPRORVQZrr
@@ -8811,7 +8811,18 @@ let Predicates = [HasVLX] in {
   def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)),
                           v4i32x_info.ImmAllZerosV, VK2WM:$mask),
             (VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>;
+
+  def : Pat<(v4i32 (lrint VR128X:$src)), (VCVTPS2DQZ128rr VR128X:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (VCVTPS2DQZ128rm addr:$src)>;
+  def : Pat<(v8i32 (lrint VR256X:$src)), (VCVTPS2DQZ256rr VR256X:$src)>;
+  def : Pat<(v8i32 (lrint (loadv8f32 addr:$src))), (VCVTPS2DQZ256rm addr:$src)>;
+  def : Pat<(v4i32 (lrint VR256X:$src)), (VCVTPD2DQZ256rr VR256X:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQZ256rm addr:$src)>;
 }
+def : Pat<(v16i32 (lrint VR512:$src)), (VCVTPS2DQZrr VR512:$src)>;
+def : Pat<(v16i32 (lrint (loadv16f32 addr:$src))), (VCVTPS2DQZrm addr:$src)>;
+def : Pat<(v8i32 (lrint VR512:$src)), (VCVTPD2DQZrr VR512:$src)>;
+def : Pat<(v8i32 (lrint (loadv8f64 addr:$src))), (VCVTPD2DQZrm addr:$src)>;
 
 let Predicates = [HasDQI, HasVLX] in {
   def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
@@ -8857,6 +8868,30 @@ let Predicates = [HasDQI, HasVLX] in {
                                  (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
                                  v2i64x_info.ImmAllZerosV)),
             (VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+  def : Pat<(v4i64 (lrint VR128X:$src)), (VCVTPS2QQZ256rr VR128X:$src)>;
+  def : Pat<(v4i64 (lrint (loadv4f32 addr:$src))), (VCVTPS2QQZ256rm addr:$src)>;
+  def : Pat<(v4i64 (llrint VR128X:$src)), (VCVTPS2QQZ256rr VR128X:$src)>;
+  def : Pat<(v4i64 (llrint (loadv4f32 addr:$src))), (VCVTPS2QQZ256rm addr:$src)>;
+  def : Pat<(v2i64 (lrint VR128X:$src)), (VCVTPD2QQZ128rr VR128X:$src)>;
+  def : Pat<(v2i64 (lrint (loadv2f64 addr:$src))), (VCVTPD2QQZ128rm addr:$src)>;
+  def : Pat<(v4i64 (lrint VR256X:$src)), (VCVTPD2QQZ256rr VR256X:$src)>;
+  def : Pat<(v4i64 (lrint (loadv4f64 addr:$src))), (VCVTPD2QQZ256rm addr:$src)>;
+  def : Pat<(v2i64 (llrint VR128X:$src)), (VCVTPD2QQZ128rr VR128X:$src)>;
+  def : Pat<(v2i64 (llrint (loadv2f64 addr:$src))), (VCVTPD2QQZ128rm addr:$src)>;
+  def : Pat<(v4i64 (llrint VR256X:$src)), (VCVTPD2QQZ256rr VR256X:$src)>;
+  def : Pat<(v4i64 (llrint (loadv4f64 addr:$src))), (VCVTPD2QQZ256rm addr:$src)>;
+}
+
+let Predicates = [HasDQI] in {
+  def : Pat<(v8i64 (lrint VR256X:$src)), (VCVTPS2QQZrr VR256X:$src)>;
+  def : Pat<(v8i64 (lrint (loadv8f32 addr:$src))), (VCVTPS2QQZrm addr:$src)>;
+  def : Pat<(v8i64 (llrint VR256X:$src)), (VCVTPS2QQZrr VR256X:$src)>;
+  def : Pat<(v8i64 (llrint (loadv8f32 addr:$src))), (VCVTPS2QQZrm addr:$src)>;
+  def : Pat<(v8i64 (lrint VR512:$src)), (VCVTPD2QQZrr VR512:$src)>;
+  def : Pat<(v8i64 (lrint (loadv8f64 addr:$src))), (VCVTPD2QQZrm addr:$src)>;
+  def : Pat<(v8i64 (llrint VR512:$src)), (VCVTPD2QQZrr VR512:$src)>;
+  def : Pat<(v8i64 (llrint (loadv8f64 addr:$src))), (VCVTPD2QQZrm addr:$src)>;
 }
 
 let Predicates = [HasVLX] in {
@@ -9828,7 +9863,7 @@ defm VPMOVUSWB  : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus,
                                   truncstore_us_vi8, masked_truncstore_us_vi8,
                                   X86vtruncus, X86vmtruncus>;
 
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
 def: Pat<(v8i16 (trunc (v8i32 VR256X:$src))),
          (v8i16 (EXTRACT_SUBREG
                  (v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
@@ -9839,7 +9874,7 @@ def: Pat<(v4i32 (trunc (v4i64 VR256X:$src))),
                                            VR256X:$src, sub_ymm)))), sub_xmm))>;
 }
 
-let Predicates = [HasBWI, NoVLX] in {
+let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
 def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
          (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF),
                                             VR256X:$src, sub_ymm))), sub_xmm))>;
@@ -10382,7 +10417,7 @@ multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
     defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>,
                                                EVEX_V128;
   }
-  let Predicates = [prd, NoVLX] in {
+  let Predicates = [prd, NoVLX, HasEVEX512] in {
     defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256, NAME>;
     defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128, NAME>;
   }
@@ -11169,7 +11204,7 @@ defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs,
                                     SchedWriteVecALU>;
 
 // VPABS: Use 512bit version to implement 128/256 bit in case NoVLX.
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   def : Pat<(v4i64 (abs VR256X:$src)),
             (EXTRACT_SUBREG
                 (VPABSQZrr
@@ -11185,7 +11220,7 @@ let Predicates = [HasAVX512, NoVLX] in {
 // Use 512bit version to implement 128/256 bit.
 multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
                                  AVX512VLVectorVTInfo _, Predicate prd> {
-  let Predicates = [prd, NoVLX] in {
+  let Predicates = [prd, NoVLX, HasEVEX512] in {
     def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1))),
               (EXTRACT_SUBREG
                 (!cast<Instruction>(InstrStr # "Zrr")
@@ -11804,7 +11839,7 @@ let Predicates = [HasAVX512] in {
             (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
 }
 
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   def : Pat<(v16i8 (vnot VR128X:$src)),
             (EXTRACT_SUBREG
              (VPTERNLOGQZrri
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 3d80c43b571f..0e5e52d4d88e 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3570,7 +3570,7 @@ bool X86InstrInfo::canMakeTailCallConditional(
     if (Target.isSymbol()) {
       StringRef Symbol(Target.getSymbolName());
       // this is currently only relevant to r11/kernel indirect thunk.
-      if (Symbol.equals("__x86_indirect_thunk_r11"))
+      if (Symbol == "__x86_indirect_thunk_r11")
         return false;
     }
   }
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 063b572761e7..bc15085f6c7b 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -1554,7 +1554,6 @@ def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
                      Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
 
-
 // Convert Packed Double FP to Packed DW Integers
 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
 // The assembler can recognize rr 256-bit instructions by seeing a ymm
@@ -1586,6 +1585,20 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, WIG;
 }
 
+let Predicates = [HasAVX] in {
+  def : Pat<(v4i32 (lrint VR128:$src)), (VCVTPS2DQrr VR128:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (VCVTPS2DQrm addr:$src)>;
+  def : Pat<(v8i32 (lrint VR256:$src)), (VCVTPS2DQYrr VR256:$src)>;
+  def : Pat<(v8i32 (lrint (loadv8f32 addr:$src))), (VCVTPS2DQYrm addr:$src)>;
+  def : Pat<(v4i32 (lrint VR256:$src)), (VCVTPD2DQYrr VR256:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+  def : Pat<(v4i32 (lrint VR128:$src)), (CVTPS2DQrr VR128:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (CVTPS2DQrm addr:$src)>;
+}
+
 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index aac355713f90..ac66144aeaae 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -99,6 +99,7 @@ struct CostKindCosts {
   }
 };
 using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
+using TypeConversionCostKindTblEntry = TypeConversionCostTblEntryT<CostKindCosts>;
 
 TargetTransformInfo::PopcntSupportKind
 X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
@@ -345,6 +346,24 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
   }
 
+  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
+    { ISD::SHL,  MVT::v16i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
+    { ISD::SRL,  MVT::v16i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
+    { ISD::SRA,  MVT::v16i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
+    { ISD::SHL,  MVT::v32i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
+    { ISD::SRL,  MVT::v32i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
+    { ISD::SRA,  MVT::v32i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
+    { ISD::SHL,  MVT::v64i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
+    { ISD::SRL,  MVT::v64i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
+    { ISD::SRA,  MVT::v64i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
+  };
+
+  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
+    if (const auto *Entry =
+            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
+      if (auto KindCost = Entry->Cost[CostKind])
+        return LT.first * *KindCost;
+
   static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
     { ISD::SHL,  MVT::v16i8,  { 1, 7, 2, 3 } }, // psllw + pand.
     { ISD::SRL,  MVT::v16i8,  { 1, 7, 2, 3 } }, // psrlw + pand.
@@ -2120,810 +2139,803 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
-  // TODO: Allow non-throughput costs that aren't binary.
-  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
-    if (CostKind != TTI::TCK_RecipThroughput)
-      return Cost == 0 ? 0 : 1;
-    return Cost;
-  };
-
   // The cost tables include both specific, custom (non-legal) src/dst type
   // conversions and generic, legalized types. We test for customs first, before
   // falling back to legalization.
   // FIXME: Need a better design of the cost table to handle non-simple types of
   // potential massive combinations (elem_num x src_type x dst_type).
-  static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
-    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
-    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
+  static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
+    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8,  { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8,  { 1, 1, 1, 1 } },
 
     // Mask sign extend has an instruction.
-    { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v2i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v2i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v4i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v4i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v8i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  1 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
-    { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1,  1 },
-    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1,  1 },
-    { ISD::SIGN_EXTEND, MVT::v64i8,  MVT::v64i1,  1 },
-    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1,  1 },
+    { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v2i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v2i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v4i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v4i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v8i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v64i8,  MVT::v64i1,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1,  { 1, 1, 1, 1 } },
 
     // Mask zero extend is a sext + shift.
-    { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v2i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v2i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v4i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v4i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v8i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  2 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  2 },
-    { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1,  2 },
-    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1,  2 },
-    { ISD::ZERO_EXTEND, MVT::v64i8,  MVT::v64i1,  2 },
-    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1,  2 },
-
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   2 },
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v16i8,  2 },
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 },
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v8i16,  2 },
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 },
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v16i8,  2 },
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  2 },
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v8i16,  2 },
-    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   2 },
-    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v16i8,  2 },
-    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  2 },
-    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  2 },
-    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, 2 },
-    { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i8,  2 },
-    { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i16, 2 },
-    { ISD::TRUNCATE,    MVT::v64i1,  MVT::v64i8,  2 },
-    { ISD::TRUNCATE,    MVT::v64i1,  MVT::v32i16, 2 },
-
-    { ISD::TRUNCATE,    MVT::v32i8,  MVT::v32i16, 2 },
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 }, // widen to zmm
-    { ISD::TRUNCATE,    MVT::v2i8,   MVT::v2i16,  2 }, // vpmovwb
-    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  2 }, // vpmovwb
-    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  2 }, // vpmovwb
+    { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v2i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v2i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v4i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v4i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v8i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v64i8,  MVT::v64i1,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1,  { 2, 1, 1, 1 } },
+
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i8,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i16, { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v64i1,  MVT::v64i8,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v64i1,  MVT::v32i16, { 2, 1, 1, 1 } },
+
+    { ISD::TRUNCATE,    MVT::v32i8,  MVT::v32i16, { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
+    { ISD::TRUNCATE,    MVT::v2i8,   MVT::v2i16,  { 2, 1, 1, 1 } }, // vpmovwb
+    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  { 2, 1, 1, 1 } }, // vpmovwb
+    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  { 2, 1, 1, 1 } }, // vpmovwb
   };
 
-  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
+  static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
     // Mask sign extend has an instruction.
-    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v2i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v16i1,  1 },
-    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  1 },
+    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v2i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v16i1,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  { 1, 1, 1, 1 } },
 
     // Mask zero extend is a sext + shift.
-    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v2i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v16i1,  2 },
-    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
-
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i64,  2 },
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v4i32,  2 },
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i32,  2 },
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  2 },
-    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  2 },
-    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  2 },
-    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i32, 2 },
-    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v8i64,  2 },
-
-    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
-    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
-
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
-    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
-
-    { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f32,  1 },
-    { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f64,  1 },
-
-    { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f32,  1 },
-    { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f64,  1 },
+    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   { 2, 1, 1, 1, } },
+    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v2i1,   { 2, 1, 1, 1, } },
+    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   { 2, 1, 1, 1, } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   { 2, 1, 1, 1, } },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   { 2, 1, 1, 1, } },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v16i1,  { 2, 1, 1, 1, } },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i1,   { 2, 1, 1, 1, } },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  { 2, 1, 1, 1, } },
+
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i64,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v4i32,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i32,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i32, { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v8i64,  { 2, 1, 1, 1 } },
+
+    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  { 1, 1, 1, 1 } },
+
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  { 1, 1, 1, 1 } },
+
+    { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f64,  { 1, 1, 1, 1 } },
+
+    { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f64,  { 1, 1, 1, 1 } },
   };
 
   // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
   // 256-bit wide vectors.
 
-  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
-    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,  1 },
-    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32, 3 },
-    { ISD::FP_EXTEND, MVT::v16f64,  MVT::v16f32, 4 }, // 2*vcvtps2pd+vextractf64x4
-    { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,  1 },
-
-    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,   3 }, // sext+vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,   3 }, // sext+vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i8,   3 }, // sext+vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i8,  3 }, // sext+vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i16,  3 }, // sext+vpsllq+vptestmq
-    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i16,  3 }, // sext+vpsllq+vptestmq
-    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i16,  3 }, // sext+vpsllq+vptestmq
-    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i16, 3 }, // sext+vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i32,  2 }, // zmm vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i32,  2 }, // zmm vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i32,  2 }, // zmm vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i32, 2 }, // vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i64,  2 }, // zmm vpsllq+vptestmq
-    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i64,  2 }, // zmm vpsllq+vptestmq
-    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i64,  2 }, // vpsllq+vptestmq
-    { ISD::TRUNCATE,  MVT::v2i8,    MVT::v2i32,  2 }, // vpmovdb
-    { ISD::TRUNCATE,  MVT::v4i8,    MVT::v4i32,  2 }, // vpmovdb
-    { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32, 2 }, // vpmovdb
-    { ISD::TRUNCATE,  MVT::v32i8,   MVT::v16i32, 2 }, // vpmovdb
-    { ISD::TRUNCATE,  MVT::v64i8,   MVT::v16i32, 2 }, // vpmovdb
-    { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32, 2 }, // vpmovdw
-    { ISD::TRUNCATE,  MVT::v32i16,  MVT::v16i32, 2 }, // vpmovdw
-    { ISD::TRUNCATE,  MVT::v2i8,    MVT::v2i64,  2 }, // vpmovqb
-    { ISD::TRUNCATE,  MVT::v2i16,   MVT::v2i64,  1 }, // vpshufb
-    { ISD::TRUNCATE,  MVT::v8i8,    MVT::v8i64,  2 }, // vpmovqb
-    { ISD::TRUNCATE,  MVT::v16i8,   MVT::v8i64,  2 }, // vpmovqb
-    { ISD::TRUNCATE,  MVT::v32i8,   MVT::v8i64,  2 }, // vpmovqb
-    { ISD::TRUNCATE,  MVT::v64i8,   MVT::v8i64,  2 }, // vpmovqb
-    { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,  2 }, // vpmovqw
-    { ISD::TRUNCATE,  MVT::v16i16,  MVT::v8i64,  2 }, // vpmovqw
-    { ISD::TRUNCATE,  MVT::v32i16,  MVT::v8i64,  2 }, // vpmovqw
-    { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,  1 }, // vpmovqd
-    { ISD::TRUNCATE,  MVT::v4i32,   MVT::v4i64,  1 }, // zmm vpmovqd
-    { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
-
-    { ISD::TRUNCATE,  MVT::v16i8,  MVT::v16i16,  3 }, // extend to v16i32
-    { ISD::TRUNCATE,  MVT::v32i8,  MVT::v32i16,  8 },
-    { ISD::TRUNCATE,  MVT::v64i8,  MVT::v32i16,  8 },
+  static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
+    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,   { 1, 1, 1, 1 } },
+    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32,  { 3, 1, 1, 1 } },
+    { ISD::FP_EXTEND, MVT::v16f64,  MVT::v16f32,  { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
+    { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,   { 1, 1, 1, 1 } },
+
+    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,    { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,    { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i8,    { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i8,   { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i16,   { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
+    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i16,   { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
+    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i16,   { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
+    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i16,  { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i32,   { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i32,   { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i32,   { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i32,  { 2, 1, 1, 1 } }, // vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i64,   { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
+    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i64,   { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
+    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i64,   { 2, 1, 1, 1 } }, // vpsllq+vptestmq
+    { ISD::TRUNCATE,  MVT::v2i8,    MVT::v2i32,   { 2, 1, 1, 1 } }, // vpmovdb
+    { ISD::TRUNCATE,  MVT::v4i8,    MVT::v4i32,   { 2, 1, 1, 1 } }, // vpmovdb
+    { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32,  { 2, 1, 1, 1 } }, // vpmovdb
+    { ISD::TRUNCATE,  MVT::v32i8,   MVT::v16i32,  { 2, 1, 1, 1 } }, // vpmovdb
+    { ISD::TRUNCATE,  MVT::v64i8,   MVT::v16i32,  { 2, 1, 1, 1 } }, // vpmovdb
+    { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32,  { 2, 1, 1, 1 } }, // vpmovdw
+    { ISD::TRUNCATE,  MVT::v32i16,  MVT::v16i32,  { 2, 1, 1, 1 } }, // vpmovdw
+    { ISD::TRUNCATE,  MVT::v2i8,    MVT::v2i64,   { 2, 1, 1, 1 } }, // vpmovqb
+    { ISD::TRUNCATE,  MVT::v2i16,   MVT::v2i64,   { 1, 1, 1, 1 } }, // vpshufb
+    { ISD::TRUNCATE,  MVT::v8i8,    MVT::v8i64,   { 2, 1, 1, 1 } }, // vpmovqb
+    { ISD::TRUNCATE,  MVT::v16i8,   MVT::v8i64,   { 2, 1, 1, 1 } }, // vpmovqb
+    { ISD::TRUNCATE,  MVT::v32i8,   MVT::v8i64,   { 2, 1, 1, 1 } }, // vpmovqb
+    { ISD::TRUNCATE,  MVT::v64i8,   MVT::v8i64,   { 2, 1, 1, 1 } }, // vpmovqb
+    { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,   { 2, 1, 1, 1 } }, // vpmovqw
+    { ISD::TRUNCATE,  MVT::v16i16,  MVT::v8i64,   { 2, 1, 1, 1 } }, // vpmovqw
+    { ISD::TRUNCATE,  MVT::v32i16,  MVT::v8i64,   { 2, 1, 1, 1 } }, // vpmovqw
+    { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,   { 1, 1, 1, 1 } }, // vpmovqd
+    { ISD::TRUNCATE,  MVT::v4i32,   MVT::v4i64,   { 1, 1, 1, 1 } }, // zmm vpmovqd
+    { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i64,  { 5, 1, 1, 1 } },// 2*vpmovqd+concat+vpmovdb
+
+    { ISD::TRUNCATE,  MVT::v16i8,  MVT::v16i16,   { 3, 1, 1, 1 } }, // extend to v16i32
+    { ISD::TRUNCATE,  MVT::v32i8,  MVT::v32i16,   { 8, 1, 1, 1 } },
+    { ISD::TRUNCATE,  MVT::v64i8,  MVT::v32i16,   { 8, 1, 1, 1 } },
 
     // Sign extend is zmm vpternlogd+vptruncdb.
     // Zero extend is zmm broadcast load+vptruncdw.
-    { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   3 },
-    { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   4 },
-    { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   3 },
-    { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   4 },
-    { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   3 },
-    { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   4 },
-    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  3 },
-    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  4 },
+    { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   { 4, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   { 4, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   { 4, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  { 4, 1, 1, 1 } },
 
     // Sign extend is zmm vpternlogd+vptruncdw.
     // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
-    { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   3 },
-    { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   4 },
-    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   3 },
-    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   4 },
-    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   3 },
-    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   4 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  3 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  4 },
-
-    { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   1 }, // zmm vpternlogd
-    { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   2 }, // zmm vpternlogd+psrld
-    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 }, // zmm vpternlogd
-    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 }, // zmm vpternlogd+psrld
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 }, // zmm vpternlogd
-    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 }, // zmm vpternlogd+psrld
-    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 }, // zmm vpternlogq
-    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 }, // zmm vpternlogq+psrlq
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 }, // zmm vpternlogq
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 }, // zmm vpternlogq+psrlq
-
-    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  1 }, // vpternlogd
-    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 }, // vpternlogd+psrld
-    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i1,   1 }, // vpternlogq
-    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i1,   2 }, // vpternlogq+psrlq
-
-    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
-    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
-    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
-    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
-    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,   1 },
-    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,   1 },
-    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
-    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
-    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
-    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
-
-    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8,  3 }, // FIXME: May not be right
-    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8,  3 }, // FIXME: May not be right
-
-    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
-    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
-    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v16i8,  2 },
-    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  1 },
-    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
-    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 1 },
-    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
-    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
-
-    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
-    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
-    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v16i8,  2 },
-    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i8,  1 },
-    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
-    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 1 },
-    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
-    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64, 26 },
-    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  5 },
-
-    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, 2 },
-    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f64, 7 },
-    { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v32f64,15 },
-    { ISD::FP_TO_SINT,  MVT::v64i8,  MVT::v64f32,11 },
-    { ISD::FP_TO_SINT,  MVT::v64i8,  MVT::v64f64,31 },
-    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f64,  3 },
-    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f64, 7 },
-    { ISD::FP_TO_SINT,  MVT::v32i16, MVT::v32f32, 5 },
-    { ISD::FP_TO_SINT,  MVT::v32i16, MVT::v32f64,15 },
-    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  1 },
-    { ISD::FP_TO_SINT,  MVT::v16i32, MVT::v16f64, 3 },
-
-    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  1 },
-    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f64,  3 },
-    { ISD::FP_TO_UINT,  MVT::v8i8,   MVT::v8f64,  3 },
-    { ISD::FP_TO_UINT,  MVT::v16i32, MVT::v16f32, 1 },
-    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 3 },
-    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v16f32, 3 },
+    { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   { 4, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   { 4, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   { 4, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  { 4, 1, 1, 1 } },
+
+    { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   { 1, 1, 1, 1 } }, // zmm vpternlogd
+    { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
+    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   { 1, 1, 1, 1 } }, // zmm vpternlogd
+    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   { 1, 1, 1, 1 } }, // zmm vpternlogd
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
+    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   { 1, 1, 1, 1 } }, // zmm vpternlogq
+    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   { 1, 1, 1, 1 } }, // zmm vpternlogq
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
+
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  { 1, 1, 1, 1 } }, // vpternlogd
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  { 2, 1, 1, 1 } }, // vpternlogd+psrld
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i1,   { 1, 1, 1, 1 } }, // vpternlogq
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i1,   { 2, 1, 1, 1 } }, // vpternlogq+psrlq
+
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,   { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i32,  { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i32,  { 1, 1, 1, 1 } },
+
+    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8,  { 3, 1, 1, 1 } }, // FIXME: May not be right
+    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8,  { 3, 1, 1, 1 } }, // FIXME: May not be right
+
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   { 4, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  { 3, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
+
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   { 4, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  { 3, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  {26, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  { 5, 1, 1, 1 } },
+
+    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f64, { 7, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v32f64, {15, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v64i8,  MVT::v64f32, {11, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v64i8,  MVT::v64f64, {31, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f64,  { 3, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
+
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f64,  { 3, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i8,   MVT::v8f64,  { 3, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v16f32, { 3, 1, 1, 1 } },
   };
 
-  static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
+  static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] {
     // Mask sign extend has an instruction.
-    { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v2i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v2i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v4i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v4i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v8i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  1 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
-    { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1,  1 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1,  1 },
-    { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v64i1,  1 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1,  1 },
+    { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v2i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v2i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v4i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v4i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v8i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v32i1,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v32i8,  MVT::v64i1,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1,  { 1, 1, 1, 1 } },
 
     // Mask zero extend is a sext + shift.
-    { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v2i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v2i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v4i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v4i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v8i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  2 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  2 },
-    { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1,  2 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1,  2 },
-    { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v64i1,  2 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1,  2 },
-
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   2 },
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v16i8,  2 },
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 },
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v8i16,  2 },
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 },
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v16i8,  2 },
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  2 },
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v8i16,  2 },
-    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   2 },
-    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v16i8,  2 },
-    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  2 },
-    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  2 },
-    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, 2 },
-    { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i8,  2 },
-    { ISD::TRUNCATE,    MVT::v32i1,  MVT::v16i16, 2 },
-    { ISD::TRUNCATE,    MVT::v64i1,  MVT::v32i8,  2 },
-    { ISD::TRUNCATE,    MVT::v64i1,  MVT::v16i16, 2 },
-
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 },
+    { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v2i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v2i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v4i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v4i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v8i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v32i1,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v32i8,  MVT::v64i1,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1,  { 2, 1, 1, 1 } },
+
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v32i1,  MVT::v32i8,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v32i1,  MVT::v16i16, { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v64i1,  MVT::v32i8,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v64i1,  MVT::v16i16, { 2, 1, 1, 1 } },
+
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, { 2, 1, 1, 1 } },
   };
 
-  static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
+  static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
     // Mask sign extend has an instruction.
-    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v2i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i1,  1 },
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i1,   1 },
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1,  1 },
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 },
+    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v2i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i1,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i1,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   { 1, 1, 1, 1 } },
 
     // Mask zero extend is a sext + shift.
-    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v2i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i1,  2 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i1,   2 },
-    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1,  2 },
-    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 },
-
-    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v4i64,  2 },
-    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v8i32,  2 },
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i64,  2 },
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v4i32,  2 },
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i32,  2 },
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  2 },
-    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v4i64,  2 },
-    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  2 },
-
-    { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
-    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
-    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
-    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
-
-    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
-    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
-
-    { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v4f32,  1 },
-    { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f32,  1 },
-    { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f64,  1 },
-    { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f64,  1 },
-
-    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v4f32,  1 },
-    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f32,  1 },
-    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f64,  1 },
-    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f64,  1 },
+    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v2i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i1,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i1,   { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   { 2, 1, 1, 1 } },
+
+    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v4i64,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v8i32,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i64,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v4i32,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i32,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v4i64,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  { 2, 1, 1, 1 } },
+
+    { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  { 1, 1, 1, 1 } },
+
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  { 1, 1, 1, 1 } },
+
+    { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v4f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f64,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f64,  { 1, 1, 1, 1 } },
+
+    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v4f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f64,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f64,  { 1, 1, 1, 1 } },
   };
 
-  static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
-    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,   3 }, // sext+vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,   3 }, // sext+vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i8,   3 }, // sext+vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i8,  8 }, // split+2*v8i8
-    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i16,  3 }, // sext+vpsllq+vptestmq
-    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i16,  3 }, // sext+vpsllq+vptestmq
-    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i16,  3 }, // sext+vpsllq+vptestmq
-    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i16, 8 }, // split+2*v8i16
-    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i32,  2 }, // vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i32,  2 }, // vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i32,  2 }, // vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v8i32,  2 }, // vpslld+vptestmd
-    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i64,  2 }, // vpsllq+vptestmq
-    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i64,  2 }, // vpsllq+vptestmq
-    { ISD::TRUNCATE,  MVT::v4i32,   MVT::v4i64,  1 }, // vpmovqd
-    { ISD::TRUNCATE,  MVT::v4i8,    MVT::v4i64,  2 }, // vpmovqb
-    { ISD::TRUNCATE,  MVT::v4i16,   MVT::v4i64,  2 }, // vpmovqw
-    { ISD::TRUNCATE,  MVT::v8i8,    MVT::v8i32,  2 }, // vpmovwb
+  static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
+    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,    { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,    { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i8,    { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i8,   { 8, 1, 1, 1 } }, // split+2*v8i8
+    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i16,   { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
+    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i16,   { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
+    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i16,   { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
+    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v16i16,  { 8, 1, 1, 1 } }, // split+2*v8i16
+    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i32,   { 2, 1, 1, 1 } }, // vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i32,   { 2, 1, 1, 1 } }, // vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v8i1,    MVT::v8i32,   { 2, 1, 1, 1 } }, // vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v16i1,   MVT::v8i32,   { 2, 1, 1, 1 } }, // vpslld+vptestmd
+    { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i64,   { 2, 1, 1, 1 } }, // vpsllq+vptestmq
+    { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i64,   { 2, 1, 1, 1 } }, // vpsllq+vptestmq
+    { ISD::TRUNCATE,  MVT::v4i32,   MVT::v4i64,   { 1, 1, 1, 1 } }, // vpmovqd
+    { ISD::TRUNCATE,  MVT::v4i8,    MVT::v4i64,   { 2, 1, 1, 1 } }, // vpmovqb
+    { ISD::TRUNCATE,  MVT::v4i16,   MVT::v4i64,   { 2, 1, 1, 1 } }, // vpmovqw
+    { ISD::TRUNCATE,  MVT::v8i8,    MVT::v8i32,   { 2, 1, 1, 1 } }, // vpmovwb
 
     // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
     // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
-    { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   5 },
-    { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   6 },
-    { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   5 },
-    { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   6 },
-    { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   5 },
-    { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   6 },
-    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1, 10 },
-    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1, 12 },
+    { ISD::SIGN_EXTEND, MVT::v2i8,   MVT::v2i1,   { 5, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v2i8,   MVT::v2i1,   { 6, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i8,   MVT::v4i1,   { 5, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i8,   MVT::v4i1,   { 6, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i8,   MVT::v8i1,   { 5, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i8,   MVT::v8i1,   { 6, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i8,  MVT::v16i1,  {10, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i8,  MVT::v16i1,  {12, 1, 1, 1 } },
 
     // sign extend is vpcmpeq+maskedmove+vpmovdw
     // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
-    { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   4 },
-    { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   5 },
-    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   4 },
-    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   5 },
-    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   4 },
-    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   5 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
-
-    { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   1 }, // vpternlogd
-    { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   2 }, // vpternlogd+psrld
-    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   1 }, // vpternlogd
-    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   2 }, // vpternlogd+psrld
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   1 }, // vpternlogd
-    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   2 }, // vpternlogd+psrld
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1,  1 }, // vpternlogd
-    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1,  2 }, // vpternlogd+psrld
-
-    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   1 }, // vpternlogq
-    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   2 }, // vpternlogq+psrlq
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   1 }, // vpternlogq
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   2 }, // vpternlogq+psrlq
-
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  1 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  1 },
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  1 },
-    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  1 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  1 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  1 },
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
-    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
-
-    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
-    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  1 },
-    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
-    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  1 },
-
-    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    1 },
-    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    1 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  1 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  1 },
-    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  1 },
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
-    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
-    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  5 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
-    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  5 },
-
-    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  2 },
-    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, 2 },
-    { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v32f32, 5 },
-
-    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    1 },
-    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    1 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  1 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  1 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  1 },
-    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  1 },
-    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  1 },
+    { ISD::SIGN_EXTEND, MVT::v2i16,  MVT::v2i1,   { 4, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v2i16,  MVT::v2i1,   { 5, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i1,   { 4, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i1,   { 5, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i1,   { 4, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i1,   { 5, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  {10, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  {12, 1, 1, 1 } },
+
+    { ISD::SIGN_EXTEND, MVT::v2i32,  MVT::v2i1,   { 1, 1, 1, 1 } }, // vpternlogd
+    { ISD::ZERO_EXTEND, MVT::v2i32,  MVT::v2i1,   { 2, 1, 1, 1 } }, // vpternlogd+psrld
+    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i1,   { 1, 1, 1, 1 } }, // vpternlogd
+    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i1,   { 2, 1, 1, 1 } }, // vpternlogd+psrld
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   { 1, 1, 1, 1 } }, // vpternlogd
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   { 2, 1, 1, 1 } }, // vpternlogd+psrld
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i1,  { 1, 1, 1, 1 } }, // vpternlogd
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i1,  { 2, 1, 1, 1 } }, // vpternlogd+psrld
+
+    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v2i1,   { 1, 1, 1, 1 } }, // vpternlogq
+    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v2i1,   { 2, 1, 1, 1 } }, // vpternlogq+psrlq
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   { 1, 1, 1, 1 } }, // vpternlogq
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   { 2, 1, 1, 1 } }, // vpternlogq+psrlq
+
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  { 1, 1, 1, 1 } },
+
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 1, 1, 1, 1 } },
+
+    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  { 5, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  { 5, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  { 5, 1, 1, 1 } },
+
+    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v32f32, { 5, 1, 1, 1 } },
+
+    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  { 1, 1, 1, 1 } },
   };
 
-  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
-    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  1 },
-
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  2 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  2 },
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  2 },
-    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  2 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  2 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  2 },
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
-    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
-    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
-    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  2 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  2 },
-
-    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  2 },
-
-    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 4 },
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 4 },
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  1 },
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  1 },
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  1 },
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  4 },
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  4 },
-    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  1 },
-    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  1 },
-    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  5 },
-    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  1 },
-    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  2 },
-
-    { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  3 },
-    { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  3 },
-
-    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  1 },
-    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  1 },
-    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  1 },
-    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  3 },
-
-    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    3 },
-    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    3 },
-    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  1 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  3 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  4 },
-    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  3 },
-    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  4 },
-
-    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  2 },
-    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  2 },
-    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  2 },
-    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
-    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
-    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
-    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  3 },
-
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  2 },
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  2 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  2 },
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
-    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  2 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  1 },
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  2 },
-    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  2 },
-    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  4 },
+  static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   { 3, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   { 3, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  { 1, 1, 1, 1 } },
+
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  { 2, 1, 1, 1 } },
+
+    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  { 2, 1, 1, 1 } },
+
+    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, { 4, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  { 1, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  { 1, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  { 4, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  { 4, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  { 1, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  { 1, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  { 5, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  { 1, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  { 2, 1, 1, 1 } },
+
+    { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  { 3, 1, 1, 1 } },
+    { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  { 3, 1, 1, 1 } },
+
+    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  { 3, 1, 1, 1 } },
+
+    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    { 3, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    { 3, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  { 3, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  { 4, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  { 4, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  { 3, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  { 4, 1, 1, 1 } },
+
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  { 3, 1, 1, 1 } },
+
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  { 2, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 2, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 2, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  { 2, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  { 4, 1, 1, 1 } },
   };
 
-  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   4 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   4 },
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   4 },
-    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   4 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  4 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  4 },
-
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  3 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  3 },
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  3 },
-    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  3 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  3 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  3 },
-    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
-    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
-    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
-
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  4 },
-    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  5 },
-    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, 4 },
-    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  9 },
-    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i64, 11 },
-
-    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 6 },
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 }, // and+extract+packuswb
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  5 },
-    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  5 },
-    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  3 }, // and+extract+2*packusdw
-    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  2 },
-
-    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   3 },
-    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   3 },
-    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   8 },
-    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  4 },
-    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  2 },
-    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  4 },
-    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  2 },
-    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
-    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  2 },
-    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  4 },
-    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  5 },
-    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  8 },
-
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   7 },
-    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   7 },
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   6 },
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  4 },
-    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  2 },
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  4 },
-    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  2 },
-    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  4 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  4 },
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  5 },
-    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  6 },
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  8 },
-    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32, 10 },
-    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64, 10 },
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64, 18 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
-    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64, 10 },
-
-    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  2 },
-    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f64,  2 },
-    { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v8f32,  2 },
-    { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v4f64,  2 },
-    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f32,  2 },
-    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f64,  2 },
-    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  2 },
-    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v4f64,  2 },
-    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  2 },
-    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  2 },
-    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  5 },
-
-    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v8f32,  2 },
-    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f64,  2 },
-    { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v8f32,  2 },
-    { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v4f64,  2 },
-    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f32,  2 },
-    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f64,  2 },
-    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  2 },
-    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v4f64,  2 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  3 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  6 },
-    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  7 },
-    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  7 },
-
-    { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  1 },
-    { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  1 },
+  static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   { 4, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   { 4, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   { 4, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   { 4, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1,  { 4, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1,  { 4, 1, 1, 1 } },
+
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v16i8,  { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v16i8,  { 3, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v16i8,  { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v16i8,  { 3, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  { 3, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v8i16,  { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v8i16,  { 3, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  { 3, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  { 3, 1, 1, 1 } },
+
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i64,  { 4, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  { 5, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i16, { 4, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  { 9, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i64, {11, 1, 1, 1 } },
+
+    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, { 6, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  { 5, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  { 5, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i64,  { 5, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i64,  { 3, 1, 1, 1 } }, // and+extract+2*packusdw
+    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  { 2, 1, 1, 1 } },
+
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   { 3, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   { 3, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   { 8, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 4, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 4, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 2, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  { 2, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  { 4, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  { 5, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  { 8, 1, 1, 1 } },
+
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i1,   { 7, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i1,   { 7, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i1,   { 6, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v16i8,  { 4, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  { 4, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  { 4, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  { 4, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 5, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 6, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  { 8, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  {10, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  {10, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  {18, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  { 5, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  {10, 1, 1, 1 } },
+
+    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v8f32,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f64,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v8f32,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v32i8,  MVT::v4f64,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f32,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f64,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v8f32,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v4f64,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  { 5, 1, 1, 1 } },
+
+    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v8f32,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f64,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v8f32,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v32i8,  MVT::v4f64,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f32,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f64,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v8f32,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v4f64,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  { 3, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  { 4, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  { 6, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  { 7, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v4f64,  { 7, 1, 1, 1 } },
+
+    { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  { 1, 1, 1, 1 } },
   };
 
-  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
-    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8,   1 },
-    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8,   1 },
-    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8,   1 },
-    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8,   1 },
-    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8,   1 },
-    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8,   1 },
-    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16,   1 },
-    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16,   1 },
-    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16,   1 },
-    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16,   1 },
-    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32,   1 },
-    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32,   1 },
+  static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
+    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8,   { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8,   { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8,   { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16,   { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16,   { 1, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32,   { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32,   { 1, 1, 1, 1 } },
 
     // These truncates end up widening elements.
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   1 }, // PMOVXZBQ
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  1 }, // PMOVXZWQ
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   1 }, // PMOVXZBD
-
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  2 },
-    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  2 },
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  2 },
-
-    { ISD::SINT_TO_FP,  MVT::f32,    MVT::i32,    1 },
-    { ISD::SINT_TO_FP,  MVT::f64,    MVT::i32,    1 },
-    { ISD::SINT_TO_FP,  MVT::f32,    MVT::i64,    1 },
-    { ISD::SINT_TO_FP,  MVT::f64,    MVT::i64,    1 },
-    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  1 },
-    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
-    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  1 },
-    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
-    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
-    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  1 },
-    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
-
-    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i32,    1 },
-    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i32,    1 },
-    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    4 },
-    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    4 },
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  1 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  1 },
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  1 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  1 },
-    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  3 },
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  3 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  2 },
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64, 12 },
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64, 22 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  4 },
-
-    { ISD::FP_TO_SINT,  MVT::i32,    MVT::f32,    1 },
-    { ISD::FP_TO_SINT,  MVT::i64,    MVT::f32,    1 },
-    { ISD::FP_TO_SINT,  MVT::i32,    MVT::f64,    1 },
-    { ISD::FP_TO_SINT,  MVT::i64,    MVT::f64,    1 },
-    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f32,  2 },
-    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v2f64,  2 },
-    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f32,  1 },
-    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v2f64,  1 },
-    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f32,  1 },
-    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  1 },
-
-    { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    1 },
-    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    4 },
-    { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    1 },
-    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    4 },
-    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  2 },
-    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  2 },
-    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  1 },
-    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  1 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  4 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  4 },
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   { 1, 1, 1, 1 } }, // PMOVXZBQ
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  { 1, 1, 1, 1 } }, // PMOVXZWQ
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   { 1, 1, 1, 1 } }, // PMOVXZBD
+
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  { 2, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  { 2, 1, 1, 1 } },
+
+    { ISD::SINT_TO_FP,  MVT::f32,    MVT::i32,    { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::f64,    MVT::i32,    { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::f32,    MVT::i64,    { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::f64,    MVT::i64,    { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  { 1, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  { 2, 1, 1, 1 } },
+
+    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i32,    { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i32,    { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    { 4, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    { 4, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  { 3, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 3, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  { 2, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  {12, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  {22, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  { 4, 1, 1, 1 } },
+
+    { ISD::FP_TO_SINT,  MVT::i32,    MVT::f32,    { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::i64,    MVT::f32,    { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::i32,    MVT::f64,    { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::i64,    MVT::f64,    { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f32,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v2f64,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v2f64,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  { 1, 1, 1, 1 } },
+
+    { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    { 4, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    { 4, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  { 2, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  { 1, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  { 4, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  { 4, 1, 1, 1 } },
   };
 
-  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
+  static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
     // These are somewhat magic numbers justified by comparing the
     // output of llvm-mca for our various supported scheduler models
     // and basing it off the worst case scenario.
-    { ISD::SINT_TO_FP,  MVT::f32,    MVT::i32,    3 },
-    { ISD::SINT_TO_FP,  MVT::f64,    MVT::i32,    3 },
-    { ISD::SINT_TO_FP,  MVT::f32,    MVT::i64,    3 },
-    { ISD::SINT_TO_FP,  MVT::f64,    MVT::i64,    3 },
-    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  3 },
-    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  4 },
-    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  3 },
-    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  4 },
-    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  3 },
-    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  4 },
-    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  8 },
-    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  8 },
-
-    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i32,    3 },
-    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i32,    3 },
-    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    8 },
-    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    9 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  4 },
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  4 },
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  4 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  4 },
-    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  7 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  7 },
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  5 },
-    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64, 15 },
-    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64, 18 },
-
-    { ISD::FP_TO_SINT,  MVT::i32,    MVT::f32,    4 },
-    { ISD::FP_TO_SINT,  MVT::i64,    MVT::f32,    4 },
-    { ISD::FP_TO_SINT,  MVT::i32,    MVT::f64,    4 },
-    { ISD::FP_TO_SINT,  MVT::i64,    MVT::f64,    4 },
-    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f32,  6 },
-    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v2f64,  6 },
-    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f32,  5 },
-    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v2f64,  5 },
-    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f32,  4 },
-    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  4 },
-
-    { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    4 },
-    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    4 },
-    { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    4 },
-    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,   15 },
-    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  6 },
-    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  6 },
-    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  5 },
-    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  5 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  8 },
-    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  8 },
-
-    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v16i8,  4 },
-    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v16i8,  4 },
-    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v16i8,  2 },
-    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v16i8,  3 },
-    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v16i8,  1 },
-    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v16i8,  2 },
-    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v8i16,  2 },
-    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v8i16,  3 },
-    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v8i16,  1 },
-    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v8i16,  2 },
-    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v4i32,  1 },
-    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v4i32,  2 },
+    { ISD::SINT_TO_FP,  MVT::f32,    MVT::i32,    { 3, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::f64,    MVT::i32,    { 3, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::f32,    MVT::i64,    { 3, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::f64,    MVT::i64,    { 3, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  { 3, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 4, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  { 3, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 4, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 3, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  { 4, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  { 8, 1, 1, 1 } },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  { 8, 1, 1, 1 } },
+
+    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i32,    { 3, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i32,    { 3, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    { 8, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    { 9, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  { 4, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  { 4, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  { 4, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  { 4, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  { 7, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  { 7, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  { 5, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  {15, 1, 1, 1 } },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  {18, 1, 1, 1 } },
+
+    { ISD::FP_TO_SINT,  MVT::i32,    MVT::f32,    { 4, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::i64,    MVT::f32,    { 4, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::i32,    MVT::f64,    { 4, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::i64,    MVT::f64,    { 4, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v4f32,  { 6, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v2f64,  { 6, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v4f32,  { 5, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v2f64,  { 5, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f32,  { 4, 1, 1, 1 } },
+    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v2f64,  { 4, 1, 1, 1 } },
+
+    { ISD::FP_TO_UINT,  MVT::i32,    MVT::f32,    { 4, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    { 4, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::i32,    MVT::f64,    { 4, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    {15, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v4f32,  { 6, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v16i8,  MVT::v2f64,  { 6, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v4f32,  { 5, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v2f64,  { 5, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  { 8, 1, 1, 1 } },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v2f64,  { 8, 1, 1, 1 } },
+
+    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v16i8,  { 4, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v16i8,  { 4, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v16i8,  { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v16i8,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v16i8,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v8i16,  { 3, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v8i16,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v8i16,  { 2, 1, 1, 1 } },
+    { ISD::ZERO_EXTEND, MVT::v2i64,  MVT::v4i32,  { 1, 1, 1, 1 } },
+    { ISD::SIGN_EXTEND, MVT::v2i64,  MVT::v4i32,  { 2, 1, 1, 1 } },
 
     // These truncates are really widening elements.
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i32,  1 }, // PSHUFD
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  2 }, // PUNPCKLWD+DQ
-    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   3 }, // PUNPCKLBW+WD+PSHUFD
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  1 }, // PUNPCKLWD
-    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   2 }, // PUNPCKLBW+WD
-    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   1 }, // PUNPCKLBW
-
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  2 }, // PAND+PACKUSWB
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 3 },
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  3 }, // PAND+2*PACKUSWB
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 7 },
-    { ISD::TRUNCATE,    MVT::v2i16,  MVT::v2i32,  1 },
-    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  3 },
-    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
-    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32,10 },
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  4 }, // PAND+3*PACKUSWB
-    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  2 }, // PSHUFD+PSHUFLW
-    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v2i64,  1 }, // PSHUFD
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i32,  { 1, 1, 1, 1 } }, // PSHUFD
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i16,  { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
+    { ISD::TRUNCATE,    MVT::v2i1,   MVT::v2i8,   { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i16,  { 1, 1, 1, 1 } }, // PUNPCKLWD
+    { ISD::TRUNCATE,    MVT::v4i1,   MVT::v4i8,   { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
+    { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i8,   { 1, 1, 1, 1 } }, // PUNPCKLBW
+
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  { 2, 1, 1, 1 } }, // PAND+PACKUSWB
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, { 3, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, { 7, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v2i16,  MVT::v2i32,  { 1, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v4i32,  { 3, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  { 5, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v2i64,  { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
+    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v2i64,  { 1, 1, 1, 1 } }, // PSHUFD
   };
 
   // Attempt to map directly to (simple) MVT types to let us match custom entries.
@@ -2939,56 +2951,66 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
       if (ST->hasBWI())
         if (const auto *Entry = ConvertCostTableLookup(
                 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
-          return AdjustCost(Entry->Cost);
+          if (auto KindCost = Entry->Cost[CostKind])
+            return *KindCost;
 
       if (ST->hasDQI())
         if (const auto *Entry = ConvertCostTableLookup(
                 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
-          return AdjustCost(Entry->Cost);
+          if (auto KindCost = Entry->Cost[CostKind])
+            return *KindCost;
 
       if (ST->hasAVX512())
         if (const auto *Entry = ConvertCostTableLookup(
                 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
-          return AdjustCost(Entry->Cost);
+          if (auto KindCost = Entry->Cost[CostKind])
+            return *KindCost;
     }
 
     if (ST->hasBWI())
       if (const auto *Entry = ConvertCostTableLookup(
               AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
-        return AdjustCost(Entry->Cost);
+        if (auto KindCost = Entry->Cost[CostKind])
+          return *KindCost;
 
     if (ST->hasDQI())
       if (const auto *Entry = ConvertCostTableLookup(
               AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
-        return AdjustCost(Entry->Cost);
+        if (auto KindCost = Entry->Cost[CostKind])
+          return *KindCost;
 
     if (ST->hasAVX512())
       if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
                                                      SimpleDstTy, SimpleSrcTy))
-        return AdjustCost(Entry->Cost);
+        if (auto KindCost = Entry->Cost[CostKind])
+          return *KindCost;
 
     if (ST->hasAVX2()) {
       if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
                                                      SimpleDstTy, SimpleSrcTy))
-        return AdjustCost(Entry->Cost);
+        if (auto KindCost = Entry->Cost[CostKind])
+          return *KindCost;
     }
 
     if (ST->hasAVX()) {
       if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
                                                      SimpleDstTy, SimpleSrcTy))
-        return AdjustCost(Entry->Cost);
+        if (auto KindCost = Entry->Cost[CostKind])
+          return *KindCost;
     }
 
     if (ST->hasSSE41()) {
       if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
                                                      SimpleDstTy, SimpleSrcTy))
-        return AdjustCost(Entry->Cost);
+        if (auto KindCost = Entry->Cost[CostKind])
+          return *KindCost;
     }
 
     if (ST->hasSSE2()) {
       if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
                                                      SimpleDstTy, SimpleSrcTy))
-        return AdjustCost(Entry->Cost);
+        if (auto KindCost = Entry->Cost[CostKind])
+          return *KindCost;
     }
   }
 
@@ -3004,53 +3026,63 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     if (ST->hasBWI())
       if (const auto *Entry = ConvertCostTableLookup(
               AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
-        return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+        if (auto KindCost = Entry->Cost[CostKind])
+          return std::max(LTSrc.first, LTDest.first) * *KindCost;
 
     if (ST->hasDQI())
       if (const auto *Entry = ConvertCostTableLookup(
               AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
-        return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+        if (auto KindCost = Entry->Cost[CostKind])
+          return std::max(LTSrc.first, LTDest.first) * *KindCost;
 
     if (ST->hasAVX512())
       if (const auto *Entry = ConvertCostTableLookup(
               AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
-        return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+        if (auto KindCost = Entry->Cost[CostKind])
+          return std::max(LTSrc.first, LTDest.first) * *KindCost;
   }
 
   if (ST->hasBWI())
     if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
                                                    LTDest.second, LTSrc.second))
-      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+      if (auto KindCost = Entry->Cost[CostKind])
+        return std::max(LTSrc.first, LTDest.first) * *KindCost;
 
   if (ST->hasDQI())
     if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
                                                    LTDest.second, LTSrc.second))
-      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+      if (auto KindCost = Entry->Cost[CostKind])
+        return std::max(LTSrc.first, LTDest.first) * *KindCost;
 
   if (ST->hasAVX512())
     if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
                                                    LTDest.second, LTSrc.second))
-      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+      if (auto KindCost = Entry->Cost[CostKind])
+        return std::max(LTSrc.first, LTDest.first) * *KindCost;
 
   if (ST->hasAVX2())
     if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
                                                    LTDest.second, LTSrc.second))
-      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+      if (auto KindCost = Entry->Cost[CostKind])
+        return std::max(LTSrc.first, LTDest.first) * *KindCost;
 
   if (ST->hasAVX())
     if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
                                                    LTDest.second, LTSrc.second))
-      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+      if (auto KindCost = Entry->Cost[CostKind])
+        return std::max(LTSrc.first, LTDest.first) * *KindCost;
 
   if (ST->hasSSE41())
     if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
                                                    LTDest.second, LTSrc.second))
-      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+      if (auto KindCost = Entry->Cost[CostKind])
+        return std::max(LTSrc.first, LTDest.first) * *KindCost;
 
   if (ST->hasSSE2())
     if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
                                                    LTDest.second, LTSrc.second))
-      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+      if (auto KindCost = Entry->Cost[CostKind])
+        return std::max(LTSrc.first, LTDest.first) * *KindCost;
 
   // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for
   // sitofp.
@@ -3079,6 +3111,13 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                             TTI::CastContextHint::None, CostKind);
   }
 
+  // TODO: Allow non-throughput costs that aren't binary.
+  auto AdjustCost = [&CostKind](InstructionCost Cost,
+                                InstructionCost N = 1) -> InstructionCost {
+    if (CostKind != TTI::TCK_RecipThroughput)
+      return Cost == 0 ? 0 : N;
+    return Cost * N;
+  };
   return AdjustCost(
       BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
 }
@@ -3868,6 +3907,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::BITREVERSE, MVT::v2i64,   {  1,  8,  2,  4 } }, // gf2p8affineqb
     { ISD::BITREVERSE, MVT::v4i64,   {  1,  9,  2,  4 } }, // gf2p8affineqb
     { ISD::BITREVERSE, MVT::v8i64,   {  1,  9,  2,  4 } }, // gf2p8affineqb
+    { X86ISD::VROTLI,  MVT::v16i8,   {  1,  6,  1,  2 } }, // gf2p8affineqb
+    { X86ISD::VROTLI,  MVT::v32i8,   {  1,  6,  1,  2 } }, // gf2p8affineqb
+    { X86ISD::VROTLI,  MVT::v64i8,   {  1,  6,  1,  2 } }, // gf2p8affineqb
   };
   static const CostKindTblEntry GLMCostTbl[] = {
     { ISD::FSQRT,      MVT::f32,     { 19, 20, 1, 1 } }, // sqrtss
@@ -4152,6 +4194,16 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
       }
     }
     break;
+  case Intrinsic::lrint:
+  case Intrinsic::llrint:
+    // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
+    // have the same costs as the CVTTP2SI (fptosi) instructions
+    if (!ICA.isTypeBasedOnly()) {
+      const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
+      return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
+                              TTI::CastContextHint::None, CostKind);
+    }
+    break;
   case Intrinsic::maxnum:
   case Intrinsic::minnum:
     // FMINNUM has same costs so don't duplicate.
@@ -5737,7 +5789,6 @@ int X86TTIImpl::getScatterOverhead() const {
 }
 
 // Return an average cost of Gather / Scatter instruction, maybe improved later.
-// FIXME: Add TargetCostKind support.
 InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
                                             TTI::TargetCostKind CostKind,
                                             Type *SrcVTy, const Value *Ptr,
@@ -5796,63 +5847,17 @@ InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
                                          Alignment, AddressSpace);
   }
 
+  // If we didn't split, this will be a single gather/scatter instruction.
+  if (CostKind == TTI::TCK_CodeSize)
+    return 1;
+
   // The gather / scatter cost is given by Intel architects. It is a rough
   // number since we are looking at one instruction in a time.
-  const int GSOverhead = (Opcode == Instruction::Load)
-                             ? getGatherOverhead()
-                             : getScatterOverhead();
+  const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
+                                                       : getScatterOverhead();
   return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                            MaybeAlign(Alignment), AddressSpace,
-                                           TTI::TCK_RecipThroughput);
-}
-
-/// Return the cost of full scalarization of gather / scatter operation.
-///
-/// Opcode - Load or Store instruction.
-/// SrcVTy - The type of the data vector that should be gathered or scattered.
-/// VariableMask - The mask is non-constant at compile time.
-/// Alignment - Alignment for one element.
-/// AddressSpace - pointer[s] address space.
-/// TODO: Remove this and use getCommonMaskedMemoryOpCost directly.
-InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode,
-                                            TTI::TargetCostKind CostKind,
-                                            Type *SrcVTy, bool VariableMask,
-                                            Align Alignment,
-                                            unsigned AddressSpace) {
-  Type *ScalarTy = SrcVTy->getScalarType();
-  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
-  APInt DemandedElts = APInt::getAllOnes(VF);
-
-  InstructionCost MaskUnpackCost = 0;
-  if (VariableMask) {
-    auto *MaskTy =
-        FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
-    MaskUnpackCost = getScalarizationOverhead(
-        MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
-    InstructionCost ScalarCompareCost = getCmpSelInstrCost(
-        Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
-        CmpInst::BAD_ICMP_PREDICATE, CostKind);
-    InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
-    MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
-  }
-
-  InstructionCost AddressUnpackCost = getScalarizationOverhead(
-      FixedVectorType::get(PointerType::getUnqual(ScalarTy->getContext()), VF),
-      DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
-
-  // The cost of the scalar loads/stores.
-  InstructionCost MemoryOpCost =
-      VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
-                           AddressSpace, CostKind);
-
-  // The cost of forming the vector from loaded scalars/
-  // scalarizing the vector to perform scalar stores.
-  InstructionCost InsertExtractCost = getScalarizationOverhead(
-      cast<FixedVectorType>(SrcVTy), DemandedElts,
-      /*Insert=*/Opcode == Instruction::Load,
-      /*Extract=*/Opcode == Instruction::Store, CostKind);
-
-  return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
+                                           CostKind);
 }
 
 /// Calculate the cost of Gather / Scatter operation
@@ -5860,19 +5865,16 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost(
     unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
     Align Alignment, TTI::TargetCostKind CostKind,
     const Instruction *I = nullptr) {
-  if (CostKind != TTI::TCK_RecipThroughput) {
-    if ((Opcode == Instruction::Load &&
-         isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
-         !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
-                                     Align(Alignment))) ||
-        (Opcode == Instruction::Store &&
-         isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
-         !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
-                                      Align(Alignment))))
-      return 1;
+  if (((Opcode == Instruction::Load &&
+        (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
+         forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
+                                    Align(Alignment)))) ||
+       (Opcode == Instruction::Store &&
+        (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
+         forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
+                                     Align(Alignment))))))
     return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
                                          Alignment, CostKind, I);
-  }
 
   assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
   PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
@@ -5881,18 +5883,6 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost(
         cast<VectorType>(Ptr->getType())->getElementType());
   assert(PtrTy && "Unexpected type for Ptr argument");
   unsigned AddressSpace = PtrTy->getAddressSpace();
-
-  if ((Opcode == Instruction::Load &&
-       (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
-        forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
-                                   Align(Alignment)))) ||
-      (Opcode == Instruction::Store &&
-       (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
-        forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
-                                    Align(Alignment)))))
-    return getGSScalarCost(Opcode, CostKind, SrcVTy, VariableMask, Alignment,
-                           AddressSpace);
-
   return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
                          AddressSpace);
 }
@@ -6708,7 +6698,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
 }
 
 InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                                                 int64_t BaseOffset,
+                                                 StackOffset BaseOffset,
                                                  bool HasBaseReg, int64_t Scale,
                                                  unsigned AddrSpace) const {
   // Scaling factors are not free at all.
@@ -6731,9 +6721,10 @@ InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
   TargetLoweringBase::AddrMode AM;
   AM.BaseGV = BaseGV;
-  AM.BaseOffs = BaseOffset;
+  AM.BaseOffs = BaseOffset.getFixed();
   AM.HasBaseReg = HasBaseReg;
   AM.Scale = Scale;
+  AM.ScalableOffset = BaseOffset.getScalable();
   if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
     // Scale represents reg2 * scale, thus account for 1
     // as soon as we use a second register.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index b50193074573..e14dc9fc0905 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -253,7 +253,7 @@ public:
   /// If the AM is supported, the return value must be >= 0.
   /// If the AM is not supported, it returns a negative value.
   InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                                       int64_t BaseOffset, bool HasBaseReg,
+                                       StackOffset BaseOffset, bool HasBaseReg,
                                        int64_t Scale, unsigned AddrSpace) const;
 
   bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
@@ -295,9 +295,6 @@ public:
 
 private:
   bool supportsGather() const;
-  InstructionCost getGSScalarCost(unsigned Opcode, TTI::TargetCostKind CostKind,
-                                  Type *DataTy, bool VariableMask,
-                                  Align Alignment, unsigned AddressSpace);
   InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                   Type *DataTy, const Value *Ptr,
                                   Align Alignment, unsigned AddressSpace);
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index 71099462d5ec..026214e7e2ea 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -280,3 +280,8 @@ bool AArch64::ExtensionSet::parseModifier(StringRef Modifier) {
   }
   return false;
 }
+
+const AArch64::ExtensionInfo &
+AArch64::getExtensionByID(AArch64::ArchExtKind ExtID) {
+  return lookupExtensionByID(ExtID);
+}
diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp
index 67f937ebc33f..9d9917d86a36 100644
--- a/llvm/lib/TargetParser/ARMTargetParser.cpp
+++ b/llvm/lib/TargetParser/ARMTargetParser.cpp
@@ -610,7 +610,7 @@ StringRef ARM::getARMCPUForArch(const llvm::Triple &Triple, StringRef MArch) {
     return StringRef();
 
   StringRef CPU = llvm::ARM::getDefaultCPU(MArch);
-  if (!CPU.empty() && !CPU.equals("invalid"))
+  if (!CPU.empty() && CPU != "invalid")
     return CPU;
 
   // If no specific architecture version is requested, return the minimum CPU
diff --git a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp
index 45d04f9bcbfb..d6ce6581bb1a 100644
--- a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp
+++ b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp
@@ -139,7 +139,7 @@ ARM::EndianKind ARM::parseArchEndian(StringRef Arch) {
 // returned in `PBP`. Returns false in error, with `Err` containing
 // an erroneous part of the spec.
 bool ARM::parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP,
-                                StringRef &Err) {
+                                StringRef &Err, bool EnablePAuthLR) {
   PBP = {"none", "a_key", false, false, false};
   if (Spec == "none")
     return true; // defaults are ok
@@ -148,6 +148,7 @@ bool ARM::parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP,
     PBP.Scope = "non-leaf";
     PBP.BranchTargetEnforcement = true;
     PBP.GuardedControlStack = true;
+    PBP.BranchProtectionPAuthLR = EnablePAuthLR;
     return true;
   }
 
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 834f4536f93a..c5156c6cb802 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1802,7 +1802,8 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
   Features["rtm"]        = HasLeaf7 && ((EBX >> 11) & 1);
   // AVX512 is only supported if the OS supports the context save for it.
   Features["avx512f"]    = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save;
-  Features["evex512"]    = Features["avx512f"];
+  if (Features["avx512f"])
+    Features["evex512"]  = true;
   Features["avx512dq"]   = HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save;
   Features["rdseed"]     = HasLeaf7 && ((EBX >> 18) & 1);
   Features["adx"]        = HasLeaf7 && ((EBX >> 19) & 1);
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index e8172ebb2597..e22dd6032cb0 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -98,6 +98,10 @@ void llvm::riscvExtensionsHelp(StringMap<StringRef> DescMap) {
     PrintExtension(E.first, Version, DescMap["experimental-" + E.first]);
   }
 
+  outs() << "\nSupported Profiles\n";
+  for (const auto &P : SupportedProfiles)
+    outs().indent(4) << P.Name << "\n";
+
   outs() << "\nUse -march to specify the target's extension.\n"
             "For example, clang -march=rv32i_v1p0\n";
 }
@@ -155,9 +159,9 @@ findDefaultVersion(StringRef ExtName) {
   return std::nullopt;
 }
 
-void RISCVISAInfo::addExtension(StringRef ExtName,
+bool RISCVISAInfo::addExtension(StringRef ExtName,
                                 RISCVISAUtils::ExtensionVersion Version) {
-  Exts[ExtName.str()] = Version;
+  return Exts.emplace(ExtName, Version).second;
 }
 
 static StringRef getExtensionTypeDesc(StringRef Ext) {
@@ -425,9 +429,11 @@ RISCVISAInfo::parseFeatures(unsigned XLen,
 
 llvm::Expected<std::unique_ptr<RISCVISAInfo>>
 RISCVISAInfo::parseNormalizedArchString(StringRef Arch) {
-  if (llvm::any_of(Arch, isupper))
+  // RISC-V ISA strings must be [a-z0-9_]
+  if (!llvm::all_of(
+          Arch, [](char C) { return isDigit(C) || isLower(C) || C == '_'; }))
     return createStringError(errc::invalid_argument,
-                             "string must be lowercase");
+                             "string may only contain [a-z0-9_]");
 
   // Must start with a valid base ISA name.
   unsigned XLen = 0;
@@ -445,9 +451,18 @@ RISCVISAInfo::parseNormalizedArchString(StringRef Arch) {
   // Each extension is of the form ${name}${major_version}p${minor_version}
   // and separated by _. Split by _ and then extract the name and version
   // information for each extension.
-  SmallVector<StringRef, 8> Split;
-  Arch.split(Split, '_');
-  for (StringRef Ext : Split) {
+  while (!Arch.empty()) {
+    if (Arch[0] == '_') {
+      if (Arch.size() == 1 || Arch[1] == '_')
+        return createStringError(errc::invalid_argument,
+                                 "extension name missing after separator '_'");
+      Arch = Arch.drop_front();
+    }
+
+    size_t Idx = Arch.find('_');
+    StringRef Ext = Arch.slice(0, Idx);
+    Arch = Arch.slice(Idx, StringRef::npos);
+
     StringRef Prefix, MinorVersionStr;
     std::tie(Prefix, MinorVersionStr) = Ext.rsplit('p');
     if (MinorVersionStr.empty())
@@ -470,33 +485,28 @@ RISCVISAInfo::parseNormalizedArchString(StringRef Arch) {
       return createStringError(errc::invalid_argument,
                                "extension lacks version in expected format");
 
+    if (VersionStart == 0)
+      return createStringError(errc::invalid_argument,
+                               "missing extension name");
+
     StringRef ExtName = Prefix.slice(0, VersionStart);
     StringRef MajorVersionStr = Prefix.slice(VersionStart, StringRef::npos);
     if (MajorVersionStr.getAsInteger(10, MajorVersion))
       return createStringError(errc::invalid_argument,
                                "failed to parse major version number");
-    ISAInfo->addExtension(ExtName, {MajorVersion, MinorVersion});
-  }
-  ISAInfo->updateImpliedLengths();
-  return std::move(ISAInfo);
-}
-
-static Error splitExtsByUnderscore(StringRef Exts,
-                                   std::vector<std::string> &SplitExts) {
-  SmallVector<StringRef, 8> Split;
-  if (Exts.empty())
-    return Error::success();
-
-  Exts.split(Split, "_");
 
-  for (auto Ext : Split) {
-    if (Ext.empty())
+    if ((ExtName[0] == 'z' || ExtName[0] == 's' || ExtName[0] == 'x') &&
+        (ExtName.size() == 1 || isDigit(ExtName[1])))
       return createStringError(errc::invalid_argument,
-                               "extension name missing after separator '_'");
+                               "'" + Twine(ExtName[0]) +
+                                   "' must be followed by a letter");
 
-    SplitExts.push_back(Ext.str());
+    if (!ISAInfo->addExtension(ExtName, {MajorVersion, MinorVersion}))
+      return createStringError(errc::invalid_argument,
+                               "duplicate extension '" + ExtName + "'");
   }
-  return Error::success();
+  ISAInfo->updateImpliedLengths();
+  return std::move(ISAInfo);
 }
 
 static Error processMultiLetterExtension(
@@ -584,10 +594,11 @@ llvm::Expected<std::unique_ptr<RISCVISAInfo>>
 RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
                               bool ExperimentalExtensionVersionCheck,
                               bool IgnoreUnknown) {
-  // RISC-V ISA strings must be lowercase.
-  if (llvm::any_of(Arch, isupper))
+  // RISC-V ISA strings must be [a-z0-9_]
+  if (!llvm::all_of(
+          Arch, [](char C) { return isDigit(C) || isLower(C) || C == '_'; }))
     return createStringError(errc::invalid_argument,
-                             "string must be lowercase");
+                             "string may only contain [a-z0-9_]");
 
   // ISA string must begin with rv32, rv64, or a profile.
   unsigned XLen = 0;
@@ -649,10 +660,6 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
     break;
   }
 
-  if (Arch.back() == '_')
-    return createStringError(errc::invalid_argument,
-                             "extension name missing after separator '_'");
-
   // Skip baseline.
   StringRef Exts = Arch.drop_front(1);
 
@@ -692,22 +699,27 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
   // Consume the base ISA version number and any '_' between rvxxx and the
   // first extension
   Exts = Exts.drop_front(ConsumeLength);
-  Exts.consume_front("_");
 
-  std::vector<std::string> SplitExts;
-  if (auto E = splitExtsByUnderscore(Exts, SplitExts))
-    return std::move(E);
+  while (!Exts.empty()) {
+    if (Exts.front() == '_') {
+      if (Exts.size() == 1 || Exts[1] == '_')
+        return createStringError(errc::invalid_argument,
+                                 "extension name missing after separator '_'");
+      Exts = Exts.drop_front();
+    }
+
+    size_t Idx = Exts.find('_');
+    StringRef Ext = Exts.slice(0, Idx);
+    Exts = Exts.slice(Idx, StringRef::npos);
 
-  for (auto &Ext : SplitExts) {
-    StringRef CurrExt = Ext;
-    while (!CurrExt.empty()) {
-      if (RISCVISAUtils::AllStdExts.contains(CurrExt.front())) {
+    do {
+      if (RISCVISAUtils::AllStdExts.contains(Ext.front())) {
         if (auto E = processSingleLetterExtension(
-                CurrExt, SeenExtMap, IgnoreUnknown, EnableExperimentalExtension,
+                Ext, SeenExtMap, IgnoreUnknown, EnableExperimentalExtension,
                 ExperimentalExtensionVersionCheck))
           return std::move(E);
-      } else if (CurrExt.front() == 'z' || CurrExt.front() == 's' ||
-                 CurrExt.front() == 'x') {
+      } else if (Ext.front() == 'z' || Ext.front() == 's' ||
+                 Ext.front() == 'x') {
         // Handle other types of extensions other than the standard
         // general purpose and standard user-level extensions.
         // Parse the ISA string containing non-standard user-level
@@ -717,7 +729,7 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
         // version number (major, minor) and are separated by a single
         // underscore '_'. We do not enforce a canonical order for them.
         if (auto E = processMultiLetterExtension(
-                CurrExt, SeenExtMap, IgnoreUnknown, EnableExperimentalExtension,
+                Ext, SeenExtMap, IgnoreUnknown, EnableExperimentalExtension,
                 ExperimentalExtensionVersionCheck))
           return std::move(E);
         // Multi-letter extension must be seperate following extension with
@@ -727,9 +739,9 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
         // FIXME: Could it be ignored by IgnoreUnknown?
         return createStringError(errc::invalid_argument,
                                  "invalid standard user-level extension '" +
-                                     Twine(CurrExt.front()) + "'");
+                                     Twine(Ext.front()) + "'");
       }
-    }
+    } while (!Ext.empty());
   }
 
   // Check all Extensions are supported.
@@ -883,7 +895,7 @@ void RISCVISAInfo::updateCombination() {
   do {
     MadeChange = false;
     for (StringRef CombineExt : CombineIntoExts) {
-      if (hasExtension(CombineExt))
+      if (Exts.count(CombineExt.str()))
         continue;
 
       // Look up the extension in the ImpliesExt table to find everything it
@@ -892,7 +904,7 @@ void RISCVISAInfo::updateCombination() {
                                     std::end(ImpliedExts), CombineExt);
       bool HasAllRequiredFeatures = std::all_of(
           Range.first, Range.second, [&](const ImpliedExtsEntry &Implied) {
-            return hasExtension(Implied.ImpliedExt);
+            return Exts.count(Implied.ImpliedExt);
           });
       if (HasAllRequiredFeatures) {
         auto Version = findDefaultVersion(CombineExt);
@@ -982,19 +994,19 @@ RISCVISAInfo::postProcessAndChecking(std::unique_ptr<RISCVISAInfo> &&ISAInfo) {
 
 StringRef RISCVISAInfo::computeDefaultABI() const {
   if (XLen == 32) {
-    if (hasExtension("e"))
+    if (Exts.count("e"))
       return "ilp32e";
-    if (hasExtension("d"))
+    if (Exts.count("d"))
       return "ilp32d";
-    if (hasExtension("f"))
+    if (Exts.count("f"))
       return "ilp32f";
     return "ilp32";
   } else if (XLen == 64) {
-    if (hasExtension("e"))
+    if (Exts.count("e"))
       return "lp64e";
-    if (hasExtension("d"))
+    if (Exts.count("d"))
       return "lp64d";
-    if (hasExtension("f"))
+    if (Exts.count("f"))
       return "lp64f";
     return "lp64";
   }
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index f3f244c814e7..f8269a51dc0b 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -115,6 +115,31 @@ StringRef Triple::getArchName(ArchType Kind, SubArchType SubArch) {
     if (SubArch == AArch64SubArch_arm64e)
       return "arm64e";
     break;
+  case Triple::dxil:
+    switch (SubArch) {
+    case Triple::NoSubArch:
+    case Triple::DXILSubArch_v1_0:
+      return "dxilv1.0";
+    case Triple::DXILSubArch_v1_1:
+      return "dxilv1.1";
+    case Triple::DXILSubArch_v1_2:
+      return "dxilv1.2";
+    case Triple::DXILSubArch_v1_3:
+      return "dxilv1.3";
+    case Triple::DXILSubArch_v1_4:
+      return "dxilv1.4";
+    case Triple::DXILSubArch_v1_5:
+      return "dxilv1.5";
+    case Triple::DXILSubArch_v1_6:
+      return "dxilv1.6";
+    case Triple::DXILSubArch_v1_7:
+      return "dxilv1.7";
+    case Triple::DXILSubArch_v1_8:
+      return "dxilv1.8";
+    default:
+      break;
+    }
+    break;
   default:
     break;
   }
@@ -348,14 +373,14 @@ StringRef Triple::getObjectFormatTypeName(ObjectFormatType Kind) {
 }
 
 static Triple::ArchType parseBPFArch(StringRef ArchName) {
-  if (ArchName.equals("bpf")) {
+  if (ArchName == "bpf") {
     if (sys::IsLittleEndianHost)
       return Triple::bpfel;
     else
       return Triple::bpfeb;
-  } else if (ArchName.equals("bpf_be") || ArchName.equals("bpfeb")) {
+  } else if (ArchName == "bpf_be" || ArchName == "bpfeb") {
     return Triple::bpfeb;
-  } else if (ArchName.equals("bpf_le") || ArchName.equals("bpfel")) {
+  } else if (ArchName == "bpf_le" || ArchName == "bpfel") {
     return Triple::bpfel;
   } else {
     return Triple::UnknownArch;
@@ -1014,6 +1039,53 @@ Triple::Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr,
     ObjectFormat = getDefaultFormat(*this);
 }
 
+static VersionTuple parseVersionFromName(StringRef Name);
+
+static StringRef getDXILArchNameFromShaderModel(StringRef ShaderModelStr) {
+  VersionTuple Ver =
+      parseVersionFromName(ShaderModelStr.drop_front(strlen("shadermodel")));
+  // Default DXIL minor version when Shader Model version is anything other
+  // than 6.[0...8] or 6.x (which translates to latest current SM version)
+  const unsigned SMMajor = 6;
+  if (!Ver.empty()) {
+    if (Ver.getMajor() == SMMajor) {
+      if (std::optional<unsigned> SMMinor = Ver.getMinor()) {
+        switch (*SMMinor) {
+        case 0:
+          return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_0);
+        case 1:
+          return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_1);
+        case 2:
+          return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_2);
+        case 3:
+          return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_3);
+        case 4:
+          return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_4);
+        case 5:
+          return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_5);
+        case 6:
+          return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_6);
+        case 7:
+          return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_7);
+        case 8:
+          return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_8);
+        default:
+          report_fatal_error("Unsupported Shader Model version", false);
+        }
+      }
+    }
+  } else {
+    // Special case: DXIL minor version is set to LatestCurrentDXILMinor for
+    // shadermodel6.x is
+    if (ShaderModelStr == "shadermodel6.x") {
+      return Triple::getArchName(Triple::dxil, Triple::LatestDXILSubArch);
+    }
+  }
+  // DXIL version corresponding to Shader Model version other than 6.Minor
+  // is 1.0
+  return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_0);
+}
+
 std::string Triple::normalize(StringRef Str) {
   bool IsMinGW32 = false;
   bool IsCygwin = false;
@@ -1206,6 +1278,20 @@ std::string Triple::normalize(StringRef Str) {
     }
   }
 
+  // Normalize DXIL triple if it does not include DXIL version number.
+  // Determine DXIL version number using the minor version number of Shader
+  // Model version specified in target triple, if any. Prior to decoupling DXIL
+  // version numbering from that of Shader Model DXIL version 1.Y corresponds to
+  // SM 6.Y. E.g., dxilv1.Y-unknown-shadermodelX.Y-hull
+  if (Components[0] == "dxil") {
+    if (Components.size() > 4) {
+      Components.resize(4);
+    }
+    // Add DXIL version only if shadermodel is specified in the triple
+    if (OS == Triple::ShaderModel) {
+      Components[0] = getDXILArchNameFromShaderModel(Components[2]);
+    }
+  }
   // Stick the corrected components back together to form the normalized string.
   return join(Components, "-");
 }
@@ -1420,6 +1506,17 @@ VersionTuple Triple::getVulkanVersion() const {
   return VersionTuple(0);
 }
 
+VersionTuple Triple::getDXILVersion() const {
+  if (getArch() != dxil || getOS() != ShaderModel)
+    llvm_unreachable("invalid DXIL triple");
+  StringRef Arch = getArchName();
+  Arch.consume_front("dxilv");
+  VersionTuple DXILVersion = parseVersionFromName(Arch);
+  // FIXME: validate DXIL version against Shader Model version.
+  // Tracked by https://github.com/llvm/llvm-project/issues/91388
+  return DXILVersion;
+}
+
 void Triple::setTriple(const Twine &Str) {
   *this = Triple(Str);
 }
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index 21f46f576490..efe392b94545 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -632,6 +632,7 @@ constexpr FeatureBitset ImpliedFeaturesPush2Pop2 = {};
 constexpr FeatureBitset ImpliedFeaturesPPX = {};
 constexpr FeatureBitset ImpliedFeaturesNDD = {};
 constexpr FeatureBitset ImpliedFeaturesCCMP = {};
+constexpr FeatureBitset ImpliedFeaturesNF = {};
 constexpr FeatureBitset ImpliedFeaturesCF = {};
 
 constexpr FeatureInfo FeatureInfos[X86::CPU_FEATURE_MAX] = {
diff --git a/llvm/lib/TextAPI/Utils.cpp b/llvm/lib/TextAPI/Utils.cpp
index 3b5e11e29de4..08f14f65177e 100644
--- a/llvm/lib/TextAPI/Utils.cpp
+++ b/llvm/lib/TextAPI/Utils.cpp
@@ -232,3 +232,13 @@ llvm::MachO::parseAliasList(std::unique_ptr<llvm::MemoryBuffer> &Buffer) {
 
   return Aliases;
 }
+
+PathSeq llvm::MachO::getPathsForPlatform(const PathToPlatformSeq &Paths,
+                                         PlatformType Platform) {
+  PathSeq Result;
+  for (const auto &[Path, CurrP] : Paths) {
+    if (!CurrP.has_value() || CurrP.value() == Platform)
+      Result.push_back(Path);
+  }
+  return Result;
+}
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index e586e9eda132..c7e84a009221 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -28,6 +29,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/Local.h"
 
@@ -47,6 +49,11 @@ static cl::opt<unsigned> MaxInstrsToScan(
     "aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden,
     cl::desc("Max number of instructions to scan for aggressive instcombine."));
 
+static cl::opt<unsigned> StrNCmpInlineThreshold(
+    "strncmp-inline-threshold", cl::init(3), cl::Hidden,
+    cl::desc("The maximum length of a constant string for a builtin string cmp "
+             "call eligible for inlining. The default value is 3."));
+
 /// Match a pattern for a bitwise funnel/rotate operation that partially guards
 /// against undefined behavior by branching around the funnel-shift/rotation
 /// when the shift amount is 0.
@@ -73,7 +80,7 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
                      m_Shl(m_Value(ShVal0), m_Value(ShAmt)),
                      m_LShr(m_Value(ShVal1),
                             m_Sub(m_SpecificInt(Width), m_Deferred(ShAmt))))))) {
-        return Intrinsic::fshl;
+      return Intrinsic::fshl;
     }
 
     // fshr(ShVal0, ShVal1, ShAmt)
@@ -82,7 +89,7 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
               m_OneUse(m_c_Or(m_Shl(m_Value(ShVal0), m_Sub(m_SpecificInt(Width),
                                                            m_Value(ShAmt))),
                               m_LShr(m_Value(ShVal1), m_Deferred(ShAmt)))))) {
-        return Intrinsic::fshr;
+      return Intrinsic::fshr;
     }
 
     return Intrinsic::not_intrinsic;
@@ -399,21 +406,11 @@ static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) {
 /// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids
 /// pessimistic codegen that has to account for setting errno and can enable
 /// vectorization.
-static bool foldSqrt(Instruction &I, TargetTransformInfo &TTI,
+static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
                      TargetLibraryInfo &TLI, AssumptionCache &AC,
                      DominatorTree &DT) {
-  // Match a call to sqrt mathlib function.
-  auto *Call = dyn_cast<CallInst>(&I);
-  if (!Call)
-    return false;
 
   Module *M = Call->getModule();
-  LibFunc Func;
-  if (!TLI.getLibFunc(*Call, Func) || !isLibFuncEmittable(M, &TLI, Func))
-    return false;
-
-  if (Func != LibFunc_sqrt && Func != LibFunc_sqrtf && Func != LibFunc_sqrtl)
-    return false;
 
   // If (1) this is a sqrt libcall, (2) we can assume that NAN is not created
   // (because NNAN or the operand arg must not be less than -0.0) and (2) we
@@ -426,18 +423,18 @@ static bool foldSqrt(Instruction &I, TargetTransformInfo &TTI,
   if (TTI.haveFastSqrt(Ty) &&
       (Call->hasNoNaNs() ||
        cannotBeOrderedLessThanZero(
-           Arg, 0, SimplifyQuery(M->getDataLayout(), &TLI, &DT, &AC, &I)))) {
-    IRBuilder<> Builder(&I);
+           Arg, 0, SimplifyQuery(M->getDataLayout(), &TLI, &DT, &AC, Call)))) {
+    IRBuilder<> Builder(Call);
     IRBuilderBase::FastMathFlagGuard Guard(Builder);
     Builder.setFastMathFlags(Call->getFastMathFlags());
 
     Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, Ty);
     Value *NewSqrt = Builder.CreateCall(Sqrt, Arg, "sqrt");
-    I.replaceAllUsesWith(NewSqrt);
+    Call->replaceAllUsesWith(NewSqrt);
 
     // Explicitly erase the old call because a call with side effects is not
     // trivially dead.
-    I.eraseFromParent();
+    Call->eraseFromParent();
     return true;
   }
 
@@ -922,13 +919,233 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
   return true;
 }
 
+namespace {
+class StrNCmpInliner {
+public:
+  StrNCmpInliner(CallInst *CI, LibFunc Func, DomTreeUpdater *DTU,
+                 const DataLayout &DL)
+      : CI(CI), Func(Func), DTU(DTU), DL(DL) {}
+
+  bool optimizeStrNCmp();
+
+private:
+  void inlineCompare(Value *LHS, StringRef RHS, uint64_t N, bool Swapped);
+
+  CallInst *CI;
+  LibFunc Func;
+  DomTreeUpdater *DTU;
+  const DataLayout &DL;
+};
+
+} // namespace
+
+/// First we normalize calls to strncmp/strcmp to the form of
+/// compare(s1, s2, N), which means comparing first N bytes of s1 and s2
+/// (without considering '\0').
+///
+/// Examples:
+///
+/// \code
+///   strncmp(s, "a", 3) -> compare(s, "a", 2)
+///   strncmp(s, "abc", 3) -> compare(s, "abc", 3)
+///   strncmp(s, "a\0b", 3) -> compare(s, "a\0b", 2)
+///   strcmp(s, "a") -> compare(s, "a", 2)
+///
+///   char s2[] = {'a'}
+///   strncmp(s, s2, 3) -> compare(s, s2, 3)
+///
+///   char s2[] = {'a', 'b', 'c', 'd'}
+///   strncmp(s, s2, 3) -> compare(s, s2, 3)
+/// \endcode
+///
+/// We only handle cases where N and exactly one of s1 and s2 are constant.
+/// Cases that s1 and s2 are both constant are already handled by the
+/// instcombine pass.
+///
+/// We do not handle cases where N > StrNCmpInlineThreshold.
+///
+/// We also do not handles cases where N < 2, which are already
+/// handled by the instcombine pass.
+///
+bool StrNCmpInliner::optimizeStrNCmp() {
+  if (StrNCmpInlineThreshold < 2)
+    return false;
+
+  if (!isOnlyUsedInZeroComparison(CI))
+    return false;
+
+  Value *Str1P = CI->getArgOperand(0);
+  Value *Str2P = CI->getArgOperand(1);
+  // Should be handled elsewhere.
+  if (Str1P == Str2P)
+    return false;
+
+  StringRef Str1, Str2;
+  bool HasStr1 = getConstantStringInfo(Str1P, Str1, /*TrimAtNul=*/false);
+  bool HasStr2 = getConstantStringInfo(Str2P, Str2, /*TrimAtNul=*/false);
+  if (HasStr1 == HasStr2)
+    return false;
+
+  // Note that '\0' and characters after it are not trimmed.
+  StringRef Str = HasStr1 ? Str1 : Str2;
+  Value *StrP = HasStr1 ? Str2P : Str1P;
+
+  size_t Idx = Str.find('\0');
+  uint64_t N = Idx == StringRef::npos ? UINT64_MAX : Idx + 1;
+  if (Func == LibFunc_strncmp) {
+    if (auto *ConstInt = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
+      N = std::min(N, ConstInt->getZExtValue());
+    else
+      return false;
+  }
+  // Now N means how many bytes we need to compare at most.
+  if (N > Str.size() || N < 2 || N > StrNCmpInlineThreshold)
+    return false;
+
+  // Cases where StrP has two or more dereferenceable bytes might be better
+  // optimized elsewhere.
+  bool CanBeNull = false, CanBeFreed = false;
+  if (StrP->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed) > 1)
+    return false;
+  inlineCompare(StrP, Str, N, HasStr1);
+  return true;
+}
+
+/// Convert
+///
+/// \code
+///   ret = compare(s1, s2, N)
+/// \endcode
+///
+/// into
+///
+/// \code
+///   ret = (int)s1[0] - (int)s2[0]
+///   if (ret != 0)
+///     goto NE
+///   ...
+///   ret = (int)s1[N-2] - (int)s2[N-2]
+///   if (ret != 0)
+///     goto NE
+///   ret = (int)s1[N-1] - (int)s2[N-1]
+///   NE:
+/// \endcode
+///
+/// CFG before and after the transformation:
+///
+/// (before)
+/// BBCI
+///
+/// (after)
+/// BBCI -> BBSubs[0] (sub,icmp) --NE-> BBNE -> BBTail
+///                 |                    ^
+///                 E                    |
+///                 |                    |
+///        BBSubs[1] (sub,icmp) --NE-----+
+///                ...                   |
+///        BBSubs[N-1]    (sub) ---------+
+///
+void StrNCmpInliner::inlineCompare(Value *LHS, StringRef RHS, uint64_t N,
+                                   bool Swapped) {
+  auto &Ctx = CI->getContext();
+  IRBuilder<> B(Ctx);
+
+  BasicBlock *BBCI = CI->getParent();
+  BasicBlock *BBTail =
+      SplitBlock(BBCI, CI, DTU, nullptr, nullptr, BBCI->getName() + ".tail");
+
+  SmallVector<BasicBlock *> BBSubs;
+  for (uint64_t I = 0; I < N; ++I)
+    BBSubs.push_back(
+        BasicBlock::Create(Ctx, "sub_" + Twine(I), BBCI->getParent(), BBTail));
+  BasicBlock *BBNE = BasicBlock::Create(Ctx, "ne", BBCI->getParent(), BBTail);
+
+  cast<BranchInst>(BBCI->getTerminator())->setSuccessor(0, BBSubs[0]);
+
+  B.SetInsertPoint(BBNE);
+  PHINode *Phi = B.CreatePHI(CI->getType(), N);
+  B.CreateBr(BBTail);
+
+  Value *Base = LHS;
+  for (uint64_t i = 0; i < N; ++i) {
+    B.SetInsertPoint(BBSubs[i]);
+    Value *VL =
+        B.CreateZExt(B.CreateLoad(B.getInt8Ty(),
+                                  B.CreateInBoundsPtrAdd(Base, B.getInt64(i))),
+                     CI->getType());
+    Value *VR =
+        ConstantInt::get(CI->getType(), static_cast<unsigned char>(RHS[i]));
+    Value *Sub = Swapped ? B.CreateSub(VR, VL) : B.CreateSub(VL, VR);
+    if (i < N - 1)
+      B.CreateCondBr(B.CreateICmpNE(Sub, ConstantInt::get(CI->getType(), 0)),
+                     BBNE, BBSubs[i + 1]);
+    else
+      B.CreateBr(BBNE);
+
+    Phi->addIncoming(Sub, BBSubs[i]);
+  }
+
+  CI->replaceAllUsesWith(Phi);
+  CI->eraseFromParent();
+
+  if (DTU) {
+    SmallVector<DominatorTree::UpdateType, 8> Updates;
+    Updates.push_back({DominatorTree::Insert, BBCI, BBSubs[0]});
+    for (uint64_t i = 0; i < N; ++i) {
+      if (i < N - 1)
+        Updates.push_back({DominatorTree::Insert, BBSubs[i], BBSubs[i + 1]});
+      Updates.push_back({DominatorTree::Insert, BBSubs[i], BBNE});
+    }
+    Updates.push_back({DominatorTree::Insert, BBNE, BBTail});
+    Updates.push_back({DominatorTree::Delete, BBCI, BBTail});
+    DTU->applyUpdates(Updates);
+  }
+}
+
+static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI,
+                         TargetLibraryInfo &TLI, AssumptionCache &AC,
+                         DominatorTree &DT, const DataLayout &DL,
+                         bool &MadeCFGChange) {
+
+  auto *CI = dyn_cast<CallInst>(&I);
+  if (!CI || CI->isNoBuiltin())
+    return false;
+
+  Function *CalledFunc = CI->getCalledFunction();
+  if (!CalledFunc)
+    return false;
+
+  LibFunc LF;
+  if (!TLI.getLibFunc(*CalledFunc, LF) ||
+      !isLibFuncEmittable(CI->getModule(), &TLI, LF))
+    return false;
+
+  DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+  switch (LF) {
+  case LibFunc_sqrt:
+  case LibFunc_sqrtf:
+  case LibFunc_sqrtl:
+    return foldSqrt(CI, LF, TTI, TLI, AC, DT);
+  case LibFunc_strcmp:
+  case LibFunc_strncmp:
+    if (StrNCmpInliner(CI, LF, &DTU, DL).optimizeStrNCmp()) {
+      MadeCFGChange = true;
+      return true;
+    }
+    break;
+  default:;
+  }
+  return false;
+}
+
 /// This is the entry point for folds that could be implemented in regular
 /// InstCombine, but they are separated because they are not expected to
 /// occur frequently and/or have more than a constant-length pattern match.
 static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
                                 TargetTransformInfo &TTI,
                                 TargetLibraryInfo &TLI, AliasAnalysis &AA,
-                                AssumptionCache &AC) {
+                                AssumptionCache &AC, bool &MadeCFGChange) {
   bool MadeChange = false;
   for (BasicBlock &BB : F) {
     // Ignore unreachable basic blocks.
@@ -953,7 +1170,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
       // NOTE: This function introduces erasing of the instruction `I`, so it
       // needs to be called at the end of this sequence, otherwise we may make
       // bugs.
-      MadeChange |= foldSqrt(I, TTI, TLI, AC, DT);
+      MadeChange |= foldLibCalls(I, TTI, TLI, AC, DT, DL, MadeCFGChange);
     }
   }
 
@@ -969,12 +1186,12 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
 /// handled in the callers of this function.
 static bool runImpl(Function &F, AssumptionCache &AC, TargetTransformInfo &TTI,
                     TargetLibraryInfo &TLI, DominatorTree &DT,
-                    AliasAnalysis &AA) {
+                    AliasAnalysis &AA, bool &MadeCFGChange) {
   bool MadeChange = false;
   const DataLayout &DL = F.getParent()->getDataLayout();
   TruncInstCombine TIC(AC, TLI, DL, DT);
   MadeChange |= TIC.run(F);
-  MadeChange |= foldUnusualPatterns(F, DT, TTI, TLI, AA, AC);
+  MadeChange |= foldUnusualPatterns(F, DT, TTI, TLI, AA, AC, MadeCFGChange);
   return MadeChange;
 }
 
@@ -985,12 +1202,16 @@ PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
   auto &AA = AM.getResult<AAManager>(F);
-  if (!runImpl(F, AC, TTI, TLI, DT, AA)) {
+  bool MadeCFGChange = false;
+  if (!runImpl(F, AC, TTI, TLI, DT, AA, MadeCFGChange)) {
     // No changes, all analyses are preserved.
     return PreservedAnalyses::all();
   }
   // Mark all the analyses that instcombine updates as preserved.
   PreservedAnalyses PA;
-  PA.preserveSet<CFGAnalyses>();
+  if (MadeCFGChange)
+    PA.preserve<DominatorTreeAnalysis>();
+  else
+    PA.preserveSet<CFGAnalyses>();
   return PA;
 }
diff --git a/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/llvm/lib/Transforms/Coroutines/CoroElide.cpp
index d356a6d2e575..bb244489e4c2 100644
--- a/llvm/lib/Transforms/Coroutines/CoroElide.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroElide.cpp
@@ -33,24 +33,47 @@ static cl::opt<std::string> CoroElideInfoOutputFilename(
 
 namespace {
 // Created on demand if the coro-elide pass has work to do.
-struct Lowerer : coro::LowererBase {
+class FunctionElideInfo {
+public:
+  FunctionElideInfo(Function *F) : ContainingFunction(F) {
+    this->collectPostSplitCoroIds();
+  }
+
+  bool hasCoroIds() const { return !CoroIds.empty(); }
+
+  const SmallVectorImpl<CoroIdInst *> &getCoroIds() const { return CoroIds; }
+
+private:
+  Function *ContainingFunction;
   SmallVector<CoroIdInst *, 4> CoroIds;
+  // Used in canCoroBeginEscape to distinguish coro.suspend switchs.
+  SmallPtrSet<const SwitchInst *, 4> CoroSuspendSwitches;
+
+  void collectPostSplitCoroIds();
+  friend class CoroIdElider;
+};
+
+class CoroIdElider {
+public:
+  CoroIdElider(CoroIdInst *CoroId, FunctionElideInfo &FEI, AAResults &AA,
+               DominatorTree &DT, OptimizationRemarkEmitter &ORE);
+  void elideHeapAllocations(uint64_t FrameSize, Align FrameAlign);
+  bool lifetimeEligibleForElide() const;
+  bool attemptElide();
+  bool canCoroBeginEscape(const CoroBeginInst *,
+                          const SmallPtrSetImpl<BasicBlock *> &) const;
+
+private:
+  CoroIdInst *CoroId;
+  FunctionElideInfo &FEI;
+  AAResults &AA;
+  DominatorTree &DT;
+  OptimizationRemarkEmitter &ORE;
+
   SmallVector<CoroBeginInst *, 1> CoroBegins;
   SmallVector<CoroAllocInst *, 1> CoroAllocs;
   SmallVector<CoroSubFnInst *, 4> ResumeAddr;
   DenseMap<CoroBeginInst *, SmallVector<CoroSubFnInst *, 4>> DestroyAddr;
-  SmallPtrSet<const SwitchInst *, 4> CoroSuspendSwitches;
-
-  Lowerer(Module &M) : LowererBase(M) {}
-
-  void elideHeapAllocations(Function *F, uint64_t FrameSize, Align FrameAlign,
-                            AAResults &AA);
-  bool shouldElide(Function *F, DominatorTree &DT) const;
-  void collectPostSplitCoroIds(Function *F);
-  bool processCoroId(CoroIdInst *, AAResults &AA, DominatorTree &DT,
-                     OptimizationRemarkEmitter &ORE);
-  bool hasEscapePath(const CoroBeginInst *,
-                     const SmallPtrSetImpl<BasicBlock *> &) const;
 };
 } // end anonymous namespace
 
@@ -136,14 +159,66 @@ static std::unique_ptr<raw_fd_ostream> getOrCreateLogFile() {
 }
 #endif
 
+void FunctionElideInfo::collectPostSplitCoroIds() {
+  for (auto &I : instructions(this->ContainingFunction)) {
+    if (auto *CII = dyn_cast<CoroIdInst>(&I))
+      if (CII->getInfo().isPostSplit())
+        // If it is the coroutine itself, don't touch it.
+        if (CII->getCoroutine() != CII->getFunction())
+          CoroIds.push_back(CII);
+
+    // Consider case like:
+    // %0 = call i8 @llvm.coro.suspend(...)
+    // switch i8 %0, label %suspend [i8 0, label %resume
+    //                              i8 1, label %cleanup]
+    // and collect the SwitchInsts which are used by escape analysis later.
+    if (auto *CSI = dyn_cast<CoroSuspendInst>(&I))
+      if (CSI->hasOneUse() && isa<SwitchInst>(CSI->use_begin()->getUser())) {
+        SwitchInst *SWI = cast<SwitchInst>(CSI->use_begin()->getUser());
+        if (SWI->getNumCases() == 2)
+          CoroSuspendSwitches.insert(SWI);
+      }
+  }
+}
+
+CoroIdElider::CoroIdElider(CoroIdInst *CoroId, FunctionElideInfo &FEI,
+                           AAResults &AA, DominatorTree &DT,
+                           OptimizationRemarkEmitter &ORE)
+    : CoroId(CoroId), FEI(FEI), AA(AA), DT(DT), ORE(ORE) {
+  // Collect all coro.begin and coro.allocs associated with this coro.id.
+  for (User *U : CoroId->users()) {
+    if (auto *CB = dyn_cast<CoroBeginInst>(U))
+      CoroBegins.push_back(CB);
+    else if (auto *CA = dyn_cast<CoroAllocInst>(U))
+      CoroAllocs.push_back(CA);
+  }
+
+  // Collect all coro.subfn.addrs associated with coro.begin.
+  // Note, we only devirtualize the calls if their coro.subfn.addr refers to
+  // coro.begin directly. If we run into cases where this check is too
+  // conservative, we can consider relaxing the check.
+  for (CoroBeginInst *CB : CoroBegins) {
+    for (User *U : CB->users())
+      if (auto *II = dyn_cast<CoroSubFnInst>(U))
+        switch (II->getIndex()) {
+        case CoroSubFnInst::ResumeIndex:
+          ResumeAddr.push_back(II);
+          break;
+        case CoroSubFnInst::DestroyIndex:
+          DestroyAddr[CB].push_back(II);
+          break;
+        default:
+          llvm_unreachable("unexpected coro.subfn.addr constant");
+        }
+  }
+}
+
 // To elide heap allocations we need to suppress code blocks guarded by
 // llvm.coro.alloc and llvm.coro.free instructions.
-void Lowerer::elideHeapAllocations(Function *F, uint64_t FrameSize,
-                                   Align FrameAlign, AAResults &AA) {
-  LLVMContext &C = F->getContext();
+void CoroIdElider::elideHeapAllocations(uint64_t FrameSize, Align FrameAlign) {
+  LLVMContext &C = FEI.ContainingFunction->getContext();
   BasicBlock::iterator InsertPt =
-      getFirstNonAllocaInTheEntryBlock(CoroIds.front()->getFunction())
-          ->getIterator();
+      getFirstNonAllocaInTheEntryBlock(FEI.ContainingFunction)->getIterator();
 
   // Replacing llvm.coro.alloc with false will suppress dynamic
   // allocation as it is expected for the frontend to generate the code that
@@ -161,7 +236,7 @@ void Lowerer::elideHeapAllocations(Function *F, uint64_t FrameSize,
   // is spilled into the coroutine frame and recreate the alignment information
   // here. Possibly we will need to do a mini SROA here and break the coroutine
   // frame into individual AllocaInst recreating the original alignment.
-  const DataLayout &DL = F->getParent()->getDataLayout();
+  const DataLayout &DL = FEI.ContainingFunction->getParent()->getDataLayout();
   auto FrameTy = ArrayType::get(Type::getInt8Ty(C), FrameSize);
   auto *Frame = new AllocaInst(FrameTy, DL.getAllocaAddrSpace(), "", InsertPt);
   Frame->setAlignment(FrameAlign);
@@ -178,8 +253,8 @@ void Lowerer::elideHeapAllocations(Function *F, uint64_t FrameSize,
   removeTailCallAttribute(Frame, AA);
 }
 
-bool Lowerer::hasEscapePath(const CoroBeginInst *CB,
-                            const SmallPtrSetImpl<BasicBlock *> &TIs) const {
+bool CoroIdElider::canCoroBeginEscape(
+    const CoroBeginInst *CB, const SmallPtrSetImpl<BasicBlock *> &TIs) const {
   const auto &It = DestroyAddr.find(CB);
   assert(It != DestroyAddr.end());
 
@@ -248,7 +323,7 @@ bool Lowerer::hasEscapePath(const CoroBeginInst *CB,
     // which means a escape path to normal terminator, it is reasonable to skip
     // it since coroutine frame doesn't change outside the coroutine body.
     if (isa<SwitchInst>(TI) &&
-        CoroSuspendSwitches.count(cast<SwitchInst>(TI))) {
+        FEI.CoroSuspendSwitches.count(cast<SwitchInst>(TI))) {
       Worklist.push_back(cast<SwitchInst>(TI)->getSuccessor(1));
       Worklist.push_back(cast<SwitchInst>(TI)->getSuccessor(2));
     } else
@@ -261,7 +336,7 @@ bool Lowerer::hasEscapePath(const CoroBeginInst *CB,
   return false;
 }
 
-bool Lowerer::shouldElide(Function *F, DominatorTree &DT) const {
+bool CoroIdElider::lifetimeEligibleForElide() const {
   // If no CoroAllocs, we cannot suppress allocation, so elision is not
   // possible.
   if (CoroAllocs.empty())
@@ -270,6 +345,7 @@ bool Lowerer::shouldElide(Function *F, DominatorTree &DT) const {
   // Check that for every coro.begin there is at least one coro.destroy directly
   // referencing the SSA value of that coro.begin along each
   // non-exceptional path.
+  //
   // If the value escaped, then coro.destroy would have been referencing a
   // memory location storing that value and not the virtual register.
 
@@ -277,7 +353,7 @@ bool Lowerer::shouldElide(Function *F, DominatorTree &DT) const {
   // First gather all of the terminators for the function.
   // Consider the final coro.suspend as the real terminator when the current
   // function is a coroutine.
-  for (BasicBlock &B : *F) {
+  for (BasicBlock &B : *FEI.ContainingFunction) {
     auto *TI = B.getTerminator();
 
     if (TI->getNumSuccessors() != 0 || isa<UnreachableInst>(TI))
@@ -287,91 +363,43 @@ bool Lowerer::shouldElide(Function *F, DominatorTree &DT) const {
   }
 
   // Filter out the coro.destroy that lie along exceptional paths.
-  SmallPtrSet<CoroBeginInst *, 8> ReferencedCoroBegins;
-  for (const auto &It : DestroyAddr) {
+  for (const auto *CB : CoroBegins) {
+    auto It = DestroyAddr.find(CB);
+
+    // FIXME: If we have not found any destroys for this coro.begin, we
+    // disqualify this elide.
+    if (It == DestroyAddr.end())
+      return false;
+
+    const auto &CorrespondingDestroyAddrs = It->second;
+
     // If every terminators is dominated by coro.destroy, we could know the
     // corresponding coro.begin wouldn't escape.
-    //
-    // Otherwise hasEscapePath would decide whether there is any paths from
+    auto DominatesTerminator = [&](auto *TI) {
+      return llvm::any_of(CorrespondingDestroyAddrs, [&](auto *Destroy) {
+        return DT.dominates(Destroy, TI->getTerminator());
+      });
+    };
+
+    if (llvm::all_of(Terminators, DominatesTerminator))
+      continue;
+
+    // Otherwise canCoroBeginEscape would decide whether there is any paths from
     // coro.begin to Terminators which not pass through any of the
-    // coro.destroys.
+    // coro.destroys. This is a slower analysis.
     //
-    // hasEscapePath is relatively slow, so we avoid to run it as much as
+    // canCoroBeginEscape is relatively slow, so we avoid to run it as much as
     // possible.
-    if (llvm::all_of(Terminators,
-                     [&](auto *TI) {
-                       return llvm::any_of(It.second, [&](auto *DA) {
-                         return DT.dominates(DA, TI->getTerminator());
-                       });
-                     }) ||
-        !hasEscapePath(It.first, Terminators))
-      ReferencedCoroBegins.insert(It.first);
+    if (canCoroBeginEscape(CB, Terminators))
+      return false;
   }
 
-  // If size of the set is the same as total number of coro.begin, that means we
-  // found a coro.free or coro.destroy referencing each coro.begin, so we can
-  // perform heap elision.
-  return ReferencedCoroBegins.size() == CoroBegins.size();
-}
-
-void Lowerer::collectPostSplitCoroIds(Function *F) {
-  CoroIds.clear();
-  CoroSuspendSwitches.clear();
-  for (auto &I : instructions(F)) {
-    if (auto *CII = dyn_cast<CoroIdInst>(&I))
-      if (CII->getInfo().isPostSplit())
-        // If it is the coroutine itself, don't touch it.
-        if (CII->getCoroutine() != CII->getFunction())
-          CoroIds.push_back(CII);
-
-    // Consider case like:
-    // %0 = call i8 @llvm.coro.suspend(...)
-    // switch i8 %0, label %suspend [i8 0, label %resume
-    //                              i8 1, label %cleanup]
-    // and collect the SwitchInsts which are used by escape analysis later.
-    if (auto *CSI = dyn_cast<CoroSuspendInst>(&I))
-      if (CSI->hasOneUse() && isa<SwitchInst>(CSI->use_begin()->getUser())) {
-        SwitchInst *SWI = cast<SwitchInst>(CSI->use_begin()->getUser());
-        if (SWI->getNumCases() == 2)
-          CoroSuspendSwitches.insert(SWI);
-      }
-  }
+  // We have checked all CoroBegins and their paths to the terminators without
+  // finding disqualifying code patterns, so we can perform heap allocations.
+  return true;
 }
 
-bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA,
-                            DominatorTree &DT, OptimizationRemarkEmitter &ORE) {
-  CoroBegins.clear();
-  CoroAllocs.clear();
-  ResumeAddr.clear();
-  DestroyAddr.clear();
-
-  // Collect all coro.begin and coro.allocs associated with this coro.id.
-  for (User *U : CoroId->users()) {
-    if (auto *CB = dyn_cast<CoroBeginInst>(U))
-      CoroBegins.push_back(CB);
-    else if (auto *CA = dyn_cast<CoroAllocInst>(U))
-      CoroAllocs.push_back(CA);
-  }
-
-  // Collect all coro.subfn.addrs associated with coro.begin.
-  // Note, we only devirtualize the calls if their coro.subfn.addr refers to
-  // coro.begin directly. If we run into cases where this check is too
-  // conservative, we can consider relaxing the check.
-  for (CoroBeginInst *CB : CoroBegins) {
-    for (User *U : CB->users())
-      if (auto *II = dyn_cast<CoroSubFnInst>(U))
-        switch (II->getIndex()) {
-        case CoroSubFnInst::ResumeIndex:
-          ResumeAddr.push_back(II);
-          break;
-        case CoroSubFnInst::DestroyIndex:
-          DestroyAddr[CB].push_back(II);
-          break;
-        default:
-          llvm_unreachable("unexpected coro.subfn.addr constant");
-        }
-  }
-
+bool CoroIdElider::attemptElide() {
   // PostSplit coro.id refers to an array of subfunctions in its Info
   // argument.
   ConstantArray *Resumers = CoroId->getInfo().Resumers;
@@ -382,63 +410,55 @@ bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA,
 
   replaceWithConstant(ResumeAddrConstant, ResumeAddr);
 
-  bool ShouldElide = shouldElide(CoroId->getFunction(), DT);
-  if (!ShouldElide)
-    ORE.emit([&]() {
-      if (auto FrameSizeAndAlign =
-              getFrameLayout(cast<Function>(ResumeAddrConstant)))
-        return OptimizationRemarkMissed(DEBUG_TYPE, "CoroElide", CoroId)
-               << "'" << ore::NV("callee", CoroId->getCoroutine()->getName())
-               << "' not elided in '"
-               << ore::NV("caller", CoroId->getFunction()->getName())
-               << "' (frame_size="
-               << ore::NV("frame_size", FrameSizeAndAlign->first) << ", align="
-               << ore::NV("align", FrameSizeAndAlign->second.value()) << ")";
-      else
-        return OptimizationRemarkMissed(DEBUG_TYPE, "CoroElide", CoroId)
-               << "'" << ore::NV("callee", CoroId->getCoroutine()->getName())
-               << "' not elided in '"
-               << ore::NV("caller", CoroId->getFunction()->getName())
-               << "' (frame_size=unknown, align=unknown)";
-    });
+  bool EligibleForElide = lifetimeEligibleForElide();
 
   auto *DestroyAddrConstant = Resumers->getAggregateElement(
-      ShouldElide ? CoroSubFnInst::CleanupIndex : CoroSubFnInst::DestroyIndex);
+      EligibleForElide ? CoroSubFnInst::CleanupIndex
+                       : CoroSubFnInst::DestroyIndex);
 
   for (auto &It : DestroyAddr)
     replaceWithConstant(DestroyAddrConstant, It.second);
 
-  if (ShouldElide) {
-    if (auto FrameSizeAndAlign =
-            getFrameLayout(cast<Function>(ResumeAddrConstant))) {
-      elideHeapAllocations(CoroId->getFunction(), FrameSizeAndAlign->first,
-                           FrameSizeAndAlign->second, AA);
-      coro::replaceCoroFree(CoroId, /*Elide=*/true);
-      NumOfCoroElided++;
+  auto FrameSizeAndAlign = getFrameLayout(cast<Function>(ResumeAddrConstant));
+
+  auto CallerFunctionName = FEI.ContainingFunction->getName();
+  auto CalleeCoroutineName = CoroId->getCoroutine()->getName();
+
+  if (EligibleForElide && FrameSizeAndAlign) {
+    elideHeapAllocations(FrameSizeAndAlign->first, FrameSizeAndAlign->second);
+    coro::replaceCoroFree(CoroId, /*Elide=*/true);
+    NumOfCoroElided++;
+
 #ifndef NDEBUG
       if (!CoroElideInfoOutputFilename.empty())
-        *getOrCreateLogFile()
-            << "Elide " << CoroId->getCoroutine()->getName() << " in "
-            << CoroId->getFunction()->getName() << "\n";
+        *getOrCreateLogFile() << "Elide " << CalleeCoroutineName << " in "
+                              << FEI.ContainingFunction->getName() << "\n";
 #endif
+
       ORE.emit([&]() {
         return OptimizationRemark(DEBUG_TYPE, "CoroElide", CoroId)
-               << "'" << ore::NV("callee", CoroId->getCoroutine()->getName())
-               << "' elided in '"
-               << ore::NV("caller", CoroId->getFunction()->getName())
+               << "'" << ore::NV("callee", CalleeCoroutineName)
+               << "' elided in '" << ore::NV("caller", CallerFunctionName)
                << "' (frame_size="
                << ore::NV("frame_size", FrameSizeAndAlign->first) << ", align="
                << ore::NV("align", FrameSizeAndAlign->second.value()) << ")";
       });
-    } else {
-      ORE.emit([&]() {
-        return OptimizationRemarkMissed(DEBUG_TYPE, "CoroElide", CoroId)
-               << "'" << ore::NV("callee", CoroId->getCoroutine()->getName())
-               << "' not elided in '"
-               << ore::NV("caller", CoroId->getFunction()->getName())
-               << "' (frame_size=unknown, align=unknown)";
-      });
-    }
+  } else {
+    ORE.emit([&]() {
+      auto Remark = OptimizationRemarkMissed(DEBUG_TYPE, "CoroElide", CoroId)
+                    << "'" << ore::NV("callee", CalleeCoroutineName)
+                    << "' not elided in '"
+                    << ore::NV("caller", CallerFunctionName);
+
+      if (FrameSizeAndAlign)
+        return Remark << "' (frame_size="
+                      << ore::NV("frame_size", FrameSizeAndAlign->first)
+                      << ", align="
+                      << ore::NV("align", FrameSizeAndAlign->second.value())
+                      << ")";
+      else
+        return Remark << "' (frame_size=unknown, align=unknown)";
+    });
   }
 
   return true;
@@ -453,11 +473,9 @@ PreservedAnalyses CoroElidePass::run(Function &F, FunctionAnalysisManager &AM) {
   if (!declaresCoroElideIntrinsics(M))
     return PreservedAnalyses::all();
 
-  Lowerer L(M);
-  L.CoroIds.clear();
-  L.collectPostSplitCoroIds(&F);
-  // If we did not find any coro.id, there is nothing to do.
-  if (L.CoroIds.empty())
+  FunctionElideInfo FEI{&F};
+  // Elide is not necessary if there's no coro.id within the function.
+  if (!FEI.hasCoroIds())
     return PreservedAnalyses::all();
 
   AAResults &AA = AM.getResult<AAManager>(F);
@@ -465,8 +483,10 @@ PreservedAnalyses CoroElidePass::run(Function &F, FunctionAnalysisManager &AM) {
   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
 
   bool Changed = false;
-  for (auto *CII : L.CoroIds)
-    Changed |= L.processCoroId(CII, AA, DT, ORE);
+  for (auto *CII : FEI.getCoroIds()) {
+    CoroIdElider CIE(CII, FEI, AA, DT, ORE);
+    Changed |= CIE.attemptElide();
+  }
 
   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 3a43b1edcaba..4eb6e75d09fa 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -1846,11 +1846,10 @@ static void splitAsyncCoroutine(Function &F, coro::Shape &Shape,
     auto ProjectionFunctionName =
         Suspend->getAsyncContextProjectionFunction()->getName();
     bool UseSwiftMangling = false;
-    if (ProjectionFunctionName.equals("__swift_async_resume_project_context")) {
+    if (ProjectionFunctionName == "__swift_async_resume_project_context") {
       ResumeNameSuffix = "TQ";
       UseSwiftMangling = true;
-    } else if (ProjectionFunctionName.equals(
-                   "__swift_async_resume_get_context")) {
+    } else if (ProjectionFunctionName == "__swift_async_resume_get_context") {
       ResumeNameSuffix = "TY";
       UseSwiftMangling = true;
     }
diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
index fb7cba9edbdb..1a8096f647d8 100644
--- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
+++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
@@ -133,6 +133,7 @@ static inline void maybeHandleGlobals(Module &M) {
       continue;
 
     G.setLinkage(GlobalVariable::ExternalWeakLinkage);
+    G.setInitializer(nullptr);
     G.setExternallyInitialized(true);
   }
 }
diff --git a/llvm/lib/Transforms/IPO/BlockExtractor.cpp b/llvm/lib/Transforms/IPO/BlockExtractor.cpp
index 0c406aa9822e..ec1be35a3316 100644
--- a/llvm/lib/Transforms/IPO/BlockExtractor.cpp
+++ b/llvm/lib/Transforms/IPO/BlockExtractor.cpp
@@ -142,9 +142,8 @@ bool BlockExtractor::runOnModule(Module &M) {
       report_fatal_error("Invalid function name specified in the input file",
                          /*GenCrashDiag=*/false);
     for (const auto &BBInfo : BInfo.second) {
-      auto Res = llvm::find_if(*F, [&](const BasicBlock &BB) {
-        return BB.getName().equals(BBInfo);
-      });
+      auto Res = llvm::find_if(
+          *F, [&](const BasicBlock &BB) { return BB.getName() == BBInfo; });
       if (Res == F->end())
         report_fatal_error("Invalid block name specified in the input file",
                            /*GenCrashDiag=*/false);
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 8e11cbf1cee4..26a4508aa151 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -1186,10 +1186,15 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
     switch (RVI->getOpcode()) {
     // Extend the analysis by looking upwards.
     case Instruction::BitCast:
-    case Instruction::GetElementPtr:
     case Instruction::AddrSpaceCast:
       FlowsToReturn.insert(RVI->getOperand(0));
       continue;
+    case Instruction::GetElementPtr:
+      if (cast<GEPOperator>(RVI)->isInBounds()) {
+        FlowsToReturn.insert(RVI->getOperand(0));
+        continue;
+      }
+      return false;
     case Instruction::Select: {
       SelectInst *SI = cast<SelectInst>(RVI);
       FlowsToReturn.insert(SI->getTrueValue());
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 403d6cb111ba..da8f81cfeed3 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -1716,13 +1716,15 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
       // if needed. Sample counts in profiles are 64-bit unsigned values,
       // but internally branch weights are expressed as 32-bit values.
       if (Weight > std::numeric_limits<uint32_t>::max()) {
-        LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
+        LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)\n");
         Weight = std::numeric_limits<uint32_t>::max();
       }
       if (!SampleProfileUseProfi) {
         // Weight is added by one to avoid propagation errors introduced by
         // 0 weights.
-        Weights.push_back(static_cast<uint32_t>(Weight + 1));
+        Weights.push_back(static_cast<uint32_t>(
+            Weight == std::numeric_limits<uint32_t>::max() ? Weight
+                                                           : Weight + 1));
       } else {
         // Profi creates proper weights that do not require "+1" adjustments but
         // we evenly split the weight among branches with the same destination.
diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
index 1ca89e0091da..142660bcc58e 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
@@ -37,7 +37,8 @@ void SampleProfileMatcher::findIRAnchors(
       DIL = DIL->getInlinedAt();
     } while (DIL->getInlinedAt());
 
-    LineLocation Callsite = FunctionSamples::getCallSiteIdentifier(DIL);
+    LineLocation Callsite = FunctionSamples::getCallSiteIdentifier(
+        DIL, FunctionSamples::ProfileIsFS);
     StringRef CalleeName = PrevDIL->getSubprogramLinkageName();
     return std::make_pair(Callsite, CalleeName);
   };
@@ -82,7 +83,8 @@ void SampleProfileMatcher::findIRAnchors(
         if (DIL->getInlinedAt()) {
           IRAnchors.emplace(FindTopLevelInlinedCallsite(DIL));
         } else {
-          LineLocation Callsite = FunctionSamples::getCallSiteIdentifier(DIL);
+          LineLocation Callsite = FunctionSamples::getCallSiteIdentifier(
+              DIL, FunctionSamples::ProfileIsFS);
           StringRef CalleeName = GetCanonicalCalleeName(dyn_cast<CallBase>(&I));
           IRAnchors.emplace(Callsite, CalleeName);
         }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 51ac77348ed9..bff09f567668 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1014,7 +1014,7 @@ static bool matchesSquareSum(BinaryOperator &I, Mul2Rhs M2Rhs, Value *&A,
   // (a * a) + (((a * 2) + b) * b)
   if (match(&I, m_c_BinOp(
                     AddOp, m_OneUse(m_BinOp(MulOp, m_Value(A), m_Deferred(A))),
-                    m_OneUse(m_BinOp(
+                    m_OneUse(m_c_BinOp(
                         MulOp,
                         m_c_BinOp(AddOp, m_BinOp(Mul2Op, m_Deferred(A), M2Rhs),
                                   m_Value(B)),
@@ -1025,16 +1025,16 @@ static bool matchesSquareSum(BinaryOperator &I, Mul2Rhs M2Rhs, Value *&A,
   // +
   // (a * a + b * b) or (b * b + a * a)
   return match(
-      &I,
-      m_c_BinOp(AddOp,
-                m_CombineOr(
-                    m_OneUse(m_BinOp(
-                        Mul2Op, m_BinOp(MulOp, m_Value(A), m_Value(B)), M2Rhs)),
-                    m_OneUse(m_BinOp(MulOp, m_BinOp(Mul2Op, m_Value(A), M2Rhs),
+      &I, m_c_BinOp(
+              AddOp,
+              m_CombineOr(
+                  m_OneUse(m_BinOp(
+                      Mul2Op, m_BinOp(MulOp, m_Value(A), m_Value(B)), M2Rhs)),
+                  m_OneUse(m_c_BinOp(MulOp, m_BinOp(Mul2Op, m_Value(A), M2Rhs),
                                      m_Value(B)))),
-                m_OneUse(m_c_BinOp(
-                    AddOp, m_BinOp(MulOp, m_Deferred(A), m_Deferred(A)),
-                    m_BinOp(MulOp, m_Deferred(B), m_Deferred(B))))));
+              m_OneUse(
+                  m_c_BinOp(AddOp, m_BinOp(MulOp, m_Deferred(A), m_Deferred(A)),
+                            m_BinOp(MulOp, m_Deferred(B), m_Deferred(B))))));
 }
 
 // Fold integer variations of a^2 + 2*a*b + b^2 -> (a + b)^2
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index ed9a89b14efc..8695e9e69df2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2504,8 +2504,8 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
           match(C1, m_Power2())) {
         Constant *Log2C1 = ConstantExpr::getExactLogBase2(C1);
         Constant *Cmp =
-            ConstantExpr::getCompare(ICmpInst::ICMP_ULT, Log2C3, C2);
-        if (Cmp->isZeroValue()) {
+            ConstantFoldCompareInstOperands(ICmpInst::ICMP_ULT, Log2C3, C2, DL);
+        if (Cmp && Cmp->isZeroValue()) {
           // iff C1,C3 is pow2 and Log2(C3) >= C2:
           // ((C1 >> X) << C2) & C3 -> X == (cttz(C1)+C2-cttz(C3)) ? C3 : 0
           Constant *ShlC = ConstantExpr::getAdd(C2, Log2C1);
@@ -3599,12 +3599,16 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
 
   // (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C
   if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
-    if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
+    if (match(Op1,
+              m_c_Xor(m_c_Xor(m_Specific(B), m_Value(C)), m_Specific(A))) ||
+        match(Op1, m_c_Xor(m_c_Xor(m_Specific(A), m_Value(C)), m_Specific(B))))
       return BinaryOperator::CreateOr(Op0, C);
 
-  // ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C
-  if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
-    if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
+  // ((B ^ C) ^ A) | (A ^ B) -> (A ^ B) | C
+  if (match(Op1, m_Xor(m_Value(A), m_Value(B))))
+    if (match(Op0,
+              m_c_Xor(m_c_Xor(m_Specific(B), m_Value(C)), m_Specific(A))) ||
+        match(Op0, m_c_Xor(m_c_Xor(m_Specific(A), m_Value(C)), m_Specific(B))))
       return BinaryOperator::CreateOr(Op1, C);
 
   if (Instruction *DeMorgan = matchDeMorgansLaws(I, *this))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 1913ef92c16c..77534e0d3613 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -562,6 +562,13 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
           IC.Builder.CreateBinaryIntrinsic(Intrinsic::cttz, C, Op1);
       return BinaryOperator::CreateSub(ConstCttz, X);
     }
+
+    // cttz(add(lshr(UINT_MAX, %val), 1)) --> sub(width, %val)
+    if (match(Op0, m_Add(m_LShr(m_AllOnes(), m_Value(X)), m_One()))) {
+      Value *Width =
+          ConstantInt::get(II.getType(), II.getType()->getScalarSizeInBits());
+      return BinaryOperator::CreateSub(Width, X);
+    }
   } else {
     // ctlz(lshr(%const, %val), 1) --> add(ctlz(%const, 1), %val)
     if (match(Op0, m_LShr(m_ImmConstant(C), m_Value(X))) &&
@@ -1975,7 +1982,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       if (ModuloC != ShAmtC)
         return replaceOperand(*II, 2, ModuloC);
 
-      assert(match(ConstantExpr::getICmp(ICmpInst::ICMP_UGT, WidthC, ShAmtC),
+      assert(match(ConstantFoldCompareInstOperands(ICmpInst::ICMP_UGT, WidthC,
+                                                   ShAmtC, DL),
                    m_One()) &&
              "Shift amount expected to be modulo bitwidth");
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index f66883de8dd5..9883d02c87a3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1479,19 +1479,29 @@ Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp,
   return nullptr;
 }
 
-/// Fold icmp (trunc X), (trunc Y).
-/// Fold icmp (trunc X), (zext Y).
+/// Fold icmp (trunc nuw/nsw X), (trunc nuw/nsw Y).
+/// Fold icmp (trunc nuw/nsw X), (zext/sext Y).
 Instruction *
 InstCombinerImpl::foldICmpTruncWithTruncOrExt(ICmpInst &Cmp,
                                               const SimplifyQuery &Q) {
-  if (Cmp.isSigned())
-    return nullptr;
-
   Value *X, *Y;
   ICmpInst::Predicate Pred;
-  bool YIsZext = false;
+  bool YIsSExt = false;
   // Try to match icmp (trunc X), (trunc Y)
   if (match(&Cmp, m_ICmp(Pred, m_Trunc(m_Value(X)), m_Trunc(m_Value(Y))))) {
+    unsigned NoWrapFlags = cast<TruncInst>(Cmp.getOperand(0))->getNoWrapKind() &
+                           cast<TruncInst>(Cmp.getOperand(1))->getNoWrapKind();
+    if (Cmp.isSigned()) {
+      // For signed comparisons, both truncs must be nsw.
+      if (!(NoWrapFlags & TruncInst::NoSignedWrap))
+        return nullptr;
+    } else {
+      // For unsigned and equality comparisons, either both must be nuw or
+      // both must be nsw, we don't care which.
+      if (!NoWrapFlags)
+        return nullptr;
+    }
+
     if (X->getType() != Y->getType() &&
         (!Cmp.getOperand(0)->hasOneUse() || !Cmp.getOperand(1)->hasOneUse()))
       return nullptr;
@@ -1500,13 +1510,21 @@ InstCombinerImpl::foldICmpTruncWithTruncOrExt(ICmpInst &Cmp,
       std::swap(X, Y);
       Pred = Cmp.getSwappedPredicate(Pred);
     }
+    YIsSExt = !(NoWrapFlags & TruncInst::NoUnsignedWrap);
   }
-  // Try to match icmp (trunc X), (zext Y)
-  else if (match(&Cmp, m_c_ICmp(Pred, m_Trunc(m_Value(X)),
-                                m_OneUse(m_ZExt(m_Value(Y))))))
-
-    YIsZext = true;
-  else
+  // Try to match icmp (trunc nuw X), (zext Y)
+  else if (!Cmp.isSigned() &&
+           match(&Cmp, m_c_ICmp(Pred, m_NUWTrunc(m_Value(X)),
+                                m_OneUse(m_ZExt(m_Value(Y)))))) {
+    // Can fold trunc nuw + zext for unsigned and equality predicates.
+  }
+  // Try to match icmp (trunc nsw X), (sext Y)
+  else if (match(&Cmp, m_c_ICmp(Pred, m_NSWTrunc(m_Value(X)),
+                                m_OneUse(m_ZExtOrSExt(m_Value(Y)))))) {
+    // Can fold trunc nsw + zext/sext for all predicates.
+    YIsSExt =
+        isa<SExtInst>(Cmp.getOperand(0)) || isa<SExtInst>(Cmp.getOperand(1));
+  } else
     return nullptr;
 
   Type *TruncTy = Cmp.getOperand(0)->getType();
@@ -1518,19 +1536,7 @@ InstCombinerImpl::foldICmpTruncWithTruncOrExt(ICmpInst &Cmp,
       !isDesirableIntType(X->getType()->getScalarSizeInBits()))
     return nullptr;
 
-  // Check if the trunc is unneeded.
-  KnownBits KnownX = llvm::computeKnownBits(X, /*Depth*/ 0, Q);
-  if (KnownX.countMaxActiveBits() > TruncBits)
-    return nullptr;
-
-  if (!YIsZext) {
-    // If Y is also a trunc, make sure it is unneeded.
-    KnownBits KnownY = llvm::computeKnownBits(Y, /*Depth*/ 0, Q);
-    if (KnownY.countMaxActiveBits() > TruncBits)
-      return nullptr;
-  }
-
-  Value *NewY = Builder.CreateZExtOrTrunc(Y, X->getType());
+  Value *NewY = Builder.CreateIntCast(Y, X->getType(), YIsSExt);
   return new ICmpInst(Pred, X, NewY);
 }
 
@@ -2473,6 +2479,16 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
   // those conditions rather than checking them. This is difficult because of
   // undef/poison (PR34838).
   if (IsAShr && Shr->hasOneUse()) {
+    if (IsExact && (Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_ULT) &&
+        (C - 1).isPowerOf2() && C.countLeadingZeros() > ShAmtVal) {
+      // When C - 1 is a power of two and the transform can be legally
+      // performed, prefer this form so the produced constant is close to a
+      // power of two.
+      // icmp slt/ult (ashr exact X, ShAmtC), C
+      // --> icmp slt/ult X, (C - 1) << ShAmtC) + 1
+      APInt ShiftedC = (C - 1).shl(ShAmtVal) + 1;
+      return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
+    }
     if (IsExact || Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_ULT) {
       // When ShAmtC can be shifted losslessly:
       // icmp PRED (ashr exact X, ShAmtC), C --> icmp PRED X, (C << ShAmtC)
@@ -3170,15 +3186,12 @@ Instruction *InstCombinerImpl::foldICmpSelectConstant(ICmpInst &Cmp,
                               C3GreaterThan)) {
     assert(C1LessThan && C2Equal && C3GreaterThan);
 
-    bool TrueWhenLessThan =
-        ConstantExpr::getCompare(Cmp.getPredicate(), C1LessThan, C)
-            ->isAllOnesValue();
-    bool TrueWhenEqual =
-        ConstantExpr::getCompare(Cmp.getPredicate(), C2Equal, C)
-            ->isAllOnesValue();
-    bool TrueWhenGreaterThan =
-        ConstantExpr::getCompare(Cmp.getPredicate(), C3GreaterThan, C)
-            ->isAllOnesValue();
+    bool TrueWhenLessThan = ICmpInst::compare(
+        C1LessThan->getValue(), C->getValue(), Cmp.getPredicate());
+    bool TrueWhenEqual = ICmpInst::compare(C2Equal->getValue(), C->getValue(),
+                                           Cmp.getPredicate());
+    bool TrueWhenGreaterThan = ICmpInst::compare(
+        C3GreaterThan->getValue(), C->getValue(), Cmp.getPredicate());
 
     // This generates the new instruction that will replace the original Cmp
     // Instruction. Instead of enumerating the various combinations when
@@ -7123,34 +7136,30 @@ Instruction *InstCombinerImpl::foldICmpCommutative(ICmpInst::Predicate Pred,
     return replaceInstUsesWith(CxtI, V);
 
   // Folding (X / Y) pred X => X swap(pred) 0 for constant Y other than 0 or 1
+  auto CheckUGT1 = [](const APInt &Divisor) { return Divisor.ugt(1); };
   {
-    const APInt *Divisor;
-    if (match(Op0, m_UDiv(m_Specific(Op1), m_APInt(Divisor))) &&
-        Divisor->ugt(1)) {
+    if (match(Op0, m_UDiv(m_Specific(Op1), m_CheckedInt(CheckUGT1)))) {
       return new ICmpInst(ICmpInst::getSwappedPredicate(Pred), Op1,
                           Constant::getNullValue(Op1->getType()));
     }
 
     if (!ICmpInst::isUnsigned(Pred) &&
-        match(Op0, m_SDiv(m_Specific(Op1), m_APInt(Divisor))) &&
-        Divisor->ugt(1)) {
+        match(Op0, m_SDiv(m_Specific(Op1), m_CheckedInt(CheckUGT1)))) {
       return new ICmpInst(ICmpInst::getSwappedPredicate(Pred), Op1,
                           Constant::getNullValue(Op1->getType()));
     }
   }
 
   // Another case of this fold is (X >> Y) pred X => X swap(pred) 0 if Y != 0
+  auto CheckNE0 = [](const APInt &Shift) { return !Shift.isZero(); };
   {
-    const APInt *Shift;
-    if (match(Op0, m_LShr(m_Specific(Op1), m_APInt(Shift))) &&
-        !Shift->isZero()) {
+    if (match(Op0, m_LShr(m_Specific(Op1), m_CheckedInt(CheckNE0)))) {
       return new ICmpInst(ICmpInst::getSwappedPredicate(Pred), Op1,
                           Constant::getNullValue(Op1->getType()));
     }
 
     if ((Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_SGE) &&
-        match(Op0, m_AShr(m_Specific(Op1), m_APInt(Shift))) &&
-        !Shift->isZero()) {
+        match(Op0, m_AShr(m_Specific(Op1), m_CheckedInt(CheckNE0)))) {
       return new ICmpInst(ICmpInst::getSwappedPredicate(Pred), Op1,
                           Constant::getNullValue(Op1->getType()));
     }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index c70872c12917..537890d9025f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -382,12 +382,12 @@ void PointerReplacer::replace(Instruction *I) {
   } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
     auto *V = getReplacement(GEP->getPointerOperand());
     assert(V && "Operand not replaced");
-    SmallVector<Value *, 8> Indices;
-    Indices.append(GEP->idx_begin(), GEP->idx_end());
+    SmallVector<Value *, 8> Indices(GEP->indices());
     auto *NewI =
         GetElementPtrInst::Create(GEP->getSourceElementType(), V, Indices);
     IC.InsertNewInstWith(NewI, GEP->getIterator());
     NewI->takeName(GEP);
+    NewI->setIsInBounds(GEP->isInBounds());
     WorkMap[GEP] = NewI;
   } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
     auto *V = getReplacement(BC->getOperand(0));
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 8818369e7945..a3ddb402bf66 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1365,7 +1365,8 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
 // Also ULT predicate can also be UGT iff C0 != -1 (+invert result)
 //      SLT predicate can also be SGT iff C2 != INT_MAX (+invert res.)
 static Value *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
-                                    InstCombiner::BuilderTy &Builder) {
+                                    InstCombiner::BuilderTy &Builder,
+                                    InstCombiner &IC) {
   Value *X = Sel0.getTrueValue();
   Value *Sel1 = Sel0.getFalseValue();
 
@@ -1493,14 +1494,14 @@ static Value *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
     std::swap(ThresholdLowIncl, ThresholdHighExcl);
 
   // The fold has a precondition 1: C2 s>= ThresholdLow
-  auto *Precond1 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SGE, C2,
-                                         ThresholdLowIncl);
-  if (!match(Precond1, m_One()))
+  auto *Precond1 = ConstantFoldCompareInstOperands(
+      ICmpInst::Predicate::ICMP_SGE, C2, ThresholdLowIncl, IC.getDataLayout());
+  if (!Precond1 || !match(Precond1, m_One()))
     return nullptr;
   // The fold has a precondition 2: C2 s<= ThresholdHigh
-  auto *Precond2 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SLE, C2,
-                                         ThresholdHighExcl);
-  if (!match(Precond2, m_One()))
+  auto *Precond2 = ConstantFoldCompareInstOperands(
+      ICmpInst::Predicate::ICMP_SLE, C2, ThresholdHighExcl, IC.getDataLayout());
+  if (!Precond2 || !match(Precond2, m_One()))
     return nullptr;
 
   // If we are matching from a truncated input, we need to sext the
@@ -1803,7 +1804,7 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
   if (Value *V = foldSelectInstWithICmpConst(SI, ICI, Builder))
     return replaceInstUsesWith(SI, V);
 
-  if (Value *V = canonicalizeClampLike(SI, *ICI, Builder))
+  if (Value *V = canonicalizeClampLike(SI, *ICI, Builder, *this))
     return replaceInstUsesWith(SI, V);
 
   if (Instruction *NewSel =
@@ -3342,7 +3343,8 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {
 // pattern.
 static bool isSafeToRemoveBitCeilSelect(ICmpInst::Predicate Pred, Value *Cond0,
                                         const APInt *Cond1, Value *CtlzOp,
-                                        unsigned BitWidth) {
+                                        unsigned BitWidth,
+                                        bool &ShouldDropNUW) {
   // The challenge in recognizing std::bit_ceil(X) is that the operand is used
   // for the CTLZ proper and select condition, each possibly with some
   // operation like add and sub.
@@ -3365,6 +3367,8 @@ static bool isSafeToRemoveBitCeilSelect(ICmpInst::Predicate Pred, Value *Cond0,
   ConstantRange CR = ConstantRange::makeExactICmpRegion(
       CmpInst::getInversePredicate(Pred), *Cond1);
 
+  ShouldDropNUW = false;
+
   // Match the operation that's used to compute CtlzOp from CommonAncestor.  If
   // CtlzOp == CommonAncestor, return true as no operation is needed.  If a
   // match is found, execute the operation on CR, update CR, and return true.
@@ -3378,6 +3382,7 @@ static bool isSafeToRemoveBitCeilSelect(ICmpInst::Predicate Pred, Value *Cond0,
       return true;
     }
     if (match(CtlzOp, m_Sub(m_APInt(C), m_Specific(CommonAncestor)))) {
+      ShouldDropNUW = true;
       CR = ConstantRange(*C).sub(CR);
       return true;
     }
@@ -3447,14 +3452,20 @@ static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder) {
     Pred = CmpInst::getInversePredicate(Pred);
   }
 
+  bool ShouldDropNUW;
+
   if (!match(FalseVal, m_One()) ||
       !match(TrueVal,
              m_OneUse(m_Shl(m_One(), m_OneUse(m_Sub(m_SpecificInt(BitWidth),
                                                     m_Value(Ctlz)))))) ||
       !match(Ctlz, m_Intrinsic<Intrinsic::ctlz>(m_Value(CtlzOp), m_Zero())) ||
-      !isSafeToRemoveBitCeilSelect(Pred, Cond0, Cond1, CtlzOp, BitWidth))
+      !isSafeToRemoveBitCeilSelect(Pred, Cond0, Cond1, CtlzOp, BitWidth,
+                                   ShouldDropNUW))
     return nullptr;
 
+  if (ShouldDropNUW)
+    cast<Instruction>(CtlzOp)->setHasNoUnsignedWrap(false);
+
   // Build 1 << (-CTLZ & (BitWidth-1)).  The negation likely corresponds to a
   // single hardware instruction as opposed to BitWidth - CTLZ, where BitWidth
   // is an integer constant.  Masking with BitWidth-1 comes free on some
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 1cb21a1d81af..ba297111d945 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -1259,6 +1259,54 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
       match(Op1, m_SpecificIntAllowPoison(BitWidth - 1)))
     return new ZExtInst(Builder.CreateIsNotNeg(X, "isnotneg"), Ty);
 
+  // ((X << nuw Z) sub nuw Y) >>u exact Z --> X sub nuw (Y >>u exact Z)
+  Value *Y;
+  if (I.isExact() &&
+      match(Op0, m_OneUse(m_NUWSub(m_NUWShl(m_Value(X), m_Specific(Op1)),
+                                   m_Value(Y))))) {
+    Value *NewLshr = Builder.CreateLShr(Y, Op1, "", /*isExact=*/true);
+    auto *NewSub = BinaryOperator::CreateNUWSub(X, NewLshr);
+    NewSub->setHasNoSignedWrap(
+        cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap());
+    return NewSub;
+  }
+
+  auto isSuitableBinOpcode = [](Instruction::BinaryOps BinOpcode) {
+    switch (BinOpcode) {
+    default:
+      return false;
+    case Instruction::Add:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+      // Sub is handled separately.
+      return true;
+    }
+  };
+
+  // If both the binop and the shift are nuw, then:
+  // ((X << nuw Z) binop nuw Y) >>u Z --> X binop nuw (Y >>u Z)
+  if (match(Op0, m_OneUse(m_c_BinOp(m_NUWShl(m_Value(X), m_Specific(Op1)),
+                                    m_Value(Y))))) {
+    BinaryOperator *Op0OB = cast<BinaryOperator>(Op0);
+    if (isSuitableBinOpcode(Op0OB->getOpcode())) {
+      if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(Op0);
+          !OBO || OBO->hasNoUnsignedWrap()) {
+        Value *NewLshr = Builder.CreateLShr(
+            Y, Op1, "", I.isExact() && Op0OB->getOpcode() != Instruction::And);
+        auto *NewBinOp = BinaryOperator::Create(Op0OB->getOpcode(), NewLshr, X);
+        if (OBO) {
+          NewBinOp->setHasNoUnsignedWrap(true);
+          NewBinOp->setHasNoSignedWrap(OBO->hasNoSignedWrap());
+        } else if (auto *Disjoint = dyn_cast<PossiblyDisjointInst>(Op0)) {
+          cast<PossiblyDisjointInst>(NewBinOp)->setIsDisjoint(
+              Disjoint->isDisjoint());
+        }
+        return NewBinOp;
+      }
+    }
+  }
+
   if (match(Op1, m_APInt(C))) {
     unsigned ShAmtC = C->getZExtValue();
     auto *II = dyn_cast<IntrinsicInst>(Op0);
@@ -1275,7 +1323,6 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
       return new ZExtInst(Cmp, Ty);
     }
 
-    Value *X;
     const APInt *C1;
     if (match(Op0, m_Shl(m_Value(X), m_APInt(C1))) && C1->ult(BitWidth)) {
       if (C1->ult(ShAmtC)) {
@@ -1320,7 +1367,6 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
     // ((X << C) + Y) >>u C --> (X + (Y >>u C)) & (-1 >>u C)
     // TODO: Consolidate with the more general transform that starts from shl
     //       (the shifts are in the opposite order).
-    Value *Y;
     if (match(Op0,
               m_OneUse(m_c_Add(m_OneUse(m_Shl(m_Value(X), m_Specific(Op1))),
                                m_Value(Y))))) {
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index b6f8b24f43b8..6c25ff215c37 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -808,9 +808,12 @@ Instruction *InstCombinerImpl::tryFoldInstWithCtpopWithNot(Instruction *I) {
   Constant *BitWidthC = ConstantInt::get(Ty, Ty->getScalarSizeInBits());
   // Need extra check for icmp. Note if this check is true, it generally means
   // the icmp will simplify to true/false.
-  if (Opc == Instruction::ICmp && !cast<ICmpInst>(I)->isEquality() &&
-      !ConstantExpr::getICmp(ICmpInst::ICMP_UGT, C, BitWidthC)->isZeroValue())
-    return nullptr;
+  if (Opc == Instruction::ICmp && !cast<ICmpInst>(I)->isEquality()) {
+    Constant *Cmp =
+        ConstantFoldCompareInstOperands(ICmpInst::ICMP_UGT, C, BitWidthC, DL);
+    if (!Cmp || !Cmp->isZeroValue())
+      return nullptr;
+  }
 
   // Check we can invert `(not x)` for free.
   bool Consumes = false;
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 54b51b520369..851edb4ce829 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -357,7 +357,7 @@ public:
 /// useful for updating calls of the old function to the new type.
 struct TransformedFunction {
   TransformedFunction(FunctionType *OriginalType, FunctionType *TransformedType,
-                      std::vector<unsigned> ArgumentIndexMapping)
+                      const std::vector<unsigned> &ArgumentIndexMapping)
       : OriginalType(OriginalType), TransformedType(TransformedType),
         ArgumentIndexMapping(ArgumentIndexMapping) {}
 
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index fa661b17c13a..fca1824165e7 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1271,6 +1271,9 @@ Value *HWAddressSanitizer::getFrameRecordInfo(IRBuilder<> &IRB) {
   // FP is 0xfffffffffffFFFF0  (4 lower bits are zero)
   // We only really need ~20 lower non-zero bits (FFFF), so we mix like this:
   //       0xFFFFPPPPPPPPPPPP
+  //
+  // FP works because in AArch64FrameLowering::getFrameIndexReference, we
+  // prefer FP-relative offsets for functions compiled with HWASan.
   FP = IRB.CreateShl(FP, 44);
   return IRB.CreateOr(PC, FP);
 }
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index 2445ba7f0dda..c0a3bf8464d2 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -140,6 +140,15 @@ static cl::opt<int> ClDebugMin("memprof-debug-min", cl::desc("Debug min inst"),
 static cl::opt<int> ClDebugMax("memprof-debug-max", cl::desc("Debug max inst"),
                                cl::Hidden, cl::init(-1));
 
+// By default disable matching of allocation profiles onto operator new that
+// already explicitly pass a hot/cold hint, since we don't currently
+// override these hints anyway.
+static cl::opt<bool> ClMemProfMatchHotColdNew(
+    "memprof-match-hot-cold-new",
+    cl::desc(
+        "Match allocation profiles onto existing hot/cold operator new calls"),
+    cl::Hidden, cl::init(false));
+
 STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
 STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
 STATISTIC(NumSkippedStackReads, "Number of non-instrumented stack reads");
@@ -661,6 +670,37 @@ stackFrameIncludesInlinedCallStack(ArrayRef<Frame> ProfileCallStack,
   return InlCallStackIter == InlinedCallStack.end();
 }
 
+static bool isNewWithHotColdVariant(Function *Callee,
+                                    const TargetLibraryInfo &TLI) {
+  if (!Callee)
+    return false;
+  LibFunc Func;
+  if (!TLI.getLibFunc(*Callee, Func))
+    return false;
+  switch (Func) {
+  case LibFunc_Znwm:
+  case LibFunc_ZnwmRKSt9nothrow_t:
+  case LibFunc_ZnwmSt11align_val_t:
+  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t:
+  case LibFunc_Znam:
+  case LibFunc_ZnamRKSt9nothrow_t:
+  case LibFunc_ZnamSt11align_val_t:
+  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t:
+    return true;
+  case LibFunc_Znwm12__hot_cold_t:
+  case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t:
+  case LibFunc_ZnwmSt11align_val_t12__hot_cold_t:
+  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
+  case LibFunc_Znam12__hot_cold_t:
+  case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t:
+  case LibFunc_ZnamSt11align_val_t12__hot_cold_t:
+  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
+    return ClMemProfMatchHotColdNew;
+  default:
+    return false;
+  }
+}
+
 static void readMemprof(Module &M, Function &F,
                         IndexedInstrProfReader *MemProfReader,
                         const TargetLibraryInfo &TLI) {
@@ -812,7 +852,7 @@ static void readMemprof(Module &M, Function &F,
       if (AllocInfoIter != LocHashToAllocInfo.end()) {
         // Only consider allocations via new, to reduce unnecessary metadata,
         // since those are the only allocations that will be targeted initially.
-        if (!isNewLikeFn(CI, &TLI))
+        if (!isNewWithHotColdVariant(CI->getCalledFunction(), TLI))
           continue;
         // We may match this instruction's location list to multiple MIB
         // contexts. Add them to a Trie specialized for trimming the contexts to
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 2b504b893ddb..b352558a1c0d 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -1237,9 +1237,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     // Note: The loop based formation works for fixed length vectors too,
     // however we prefer to unroll and specialize alignment below.
     if (TS.isScalable()) {
-      Value *Size = IRB.CreateTypeSize(IRB.getInt32Ty(), TS);
-      Value *RoundUp = IRB.CreateAdd(Size, IRB.getInt32(kOriginSize - 1));
-      Value *End = IRB.CreateUDiv(RoundUp, IRB.getInt32(kOriginSize));
+      Value *Size = IRB.CreateTypeSize(MS.IntptrTy, TS);
+      Value *RoundUp =
+          IRB.CreateAdd(Size, ConstantInt::get(MS.IntptrTy, kOriginSize - 1));
+      Value *End =
+          IRB.CreateUDiv(RoundUp, ConstantInt::get(MS.IntptrTy, kOriginSize));
       auto [InsertPt, Index] =
         SplitBlockAndInsertSimpleForLoop(End, &*IRB.GetInsertPoint());
       IRB.SetInsertPoint(InsertPt);
@@ -4458,8 +4460,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       InsPoint = &I;
     NextNodeIRBuilder IRB(InsPoint);
     const DataLayout &DL = F.getParent()->getDataLayout();
-    uint64_t TypeSize = DL.getTypeAllocSize(I.getAllocatedType());
-    Value *Len = ConstantInt::get(MS.IntptrTy, TypeSize);
+    TypeSize TS = DL.getTypeAllocSize(I.getAllocatedType());
+    Value *Len = IRB.CreateTypeSize(MS.IntptrTy, TS);
     if (I.isArrayAllocation())
       Len = IRB.CreateMul(Len,
                           IRB.CreateZExtOrTrunc(I.getArraySize(), MS.IntptrTy));
diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
index 9d6dd5ccb38b..76afa2f22461 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
@@ -8,10 +8,20 @@
 //
 
 #include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/Analysis.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/Support/CommandLine.h"
+#include <utility>
 
 using namespace llvm;
 
+#define DEBUG_TYPE "ctx-instr-lower"
+
 static cl::list<std::string> ContextRoots(
     "profile-context-root", cl::Hidden,
     cl::desc(
@@ -22,3 +32,319 @@ static cl::list<std::string> ContextRoots(
 bool PGOCtxProfLoweringPass::isContextualIRPGOEnabled() {
   return !ContextRoots.empty();
 }
+
+// the names of symbols we expect in compiler-rt. Using a namespace for
+// readability.
+namespace CompilerRtAPINames {
+static auto StartCtx = "__llvm_ctx_profile_start_context";
+static auto ReleaseCtx = "__llvm_ctx_profile_release_context";
+static auto GetCtx = "__llvm_ctx_profile_get_context";
+static auto ExpectedCalleeTLS = "__llvm_ctx_profile_expected_callee";
+static auto CallsiteTLS = "__llvm_ctx_profile_callsite";
+} // namespace CompilerRtAPINames
+
+namespace {
+// The lowering logic and state.
+class CtxInstrumentationLowerer final {
+  Module &M;
+  ModuleAnalysisManager &MAM;
+  Type *ContextNodeTy = nullptr;
+  Type *ContextRootTy = nullptr;
+
+  DenseMap<const Function *, Constant *> ContextRootMap;
+  Function *StartCtx = nullptr;
+  Function *GetCtx = nullptr;
+  Function *ReleaseCtx = nullptr;
+  GlobalVariable *ExpectedCalleeTLS = nullptr;
+  GlobalVariable *CallsiteInfoTLS = nullptr;
+
+public:
+  CtxInstrumentationLowerer(Module &M, ModuleAnalysisManager &MAM);
+  // return true if lowering happened (i.e. a change was made)
+  bool lowerFunction(Function &F);
+};
+
+// llvm.instrprof.increment[.step] captures the total number of counters as one
+// of its parameters, and llvm.instrprof.callsite captures the total number of
+// callsites. Those values are the same for instances of those intrinsics in
+// this function. Find the first instance of each and return them.
+std::pair<uint32_t, uint32_t> getNrCountersAndCallsites(const Function &F) {
+  uint32_t NrCounters = 0;
+  uint32_t NrCallsites = 0;
+  for (const auto &BB : F) {
+    for (const auto &I : BB) {
+      if (const auto *Incr = dyn_cast<InstrProfIncrementInst>(&I)) {
+        uint32_t V =
+            static_cast<uint32_t>(Incr->getNumCounters()->getZExtValue());
+        assert((!NrCounters || V == NrCounters) &&
+               "expected all llvm.instrprof.increment[.step] intrinsics to "
+               "have the same total nr of counters parameter");
+        NrCounters = V;
+      } else if (const auto *CSIntr = dyn_cast<InstrProfCallsite>(&I)) {
+        uint32_t V =
+            static_cast<uint32_t>(CSIntr->getNumCounters()->getZExtValue());
+        assert((!NrCallsites || V == NrCallsites) &&
+               "expected all llvm.instrprof.callsite intrinsics to have the "
+               "same total nr of callsites parameter");
+        NrCallsites = V;
+      }
+#if NDEBUG
+      if (NrCounters && NrCallsites)
+        return std::make_pair(NrCounters, NrCallsites);
+#endif
+    }
+  }
+  return {NrCounters, NrCallsites};
+}
+} // namespace
+
+// set up tie-in with compiler-rt.
+// NOTE!!!
+// These have to match compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M,
+                                                     ModuleAnalysisManager &MAM)
+    : M(M), MAM(MAM) {
+  auto *PointerTy = PointerType::get(M.getContext(), 0);
+  auto *SanitizerMutexType = Type::getInt8Ty(M.getContext());
+  auto *I32Ty = Type::getInt32Ty(M.getContext());
+  auto *I64Ty = Type::getInt64Ty(M.getContext());
+
+  // The ContextRoot type
+  ContextRootTy =
+      StructType::get(M.getContext(), {
+                                          PointerTy,          /*FirstNode*/
+                                          PointerTy,          /*FirstMemBlock*/
+                                          PointerTy,          /*CurrentMem*/
+                                          SanitizerMutexType, /*Taken*/
+                                      });
+  // The Context header.
+  ContextNodeTy = StructType::get(M.getContext(), {
+                                                      I64Ty,     /*Guid*/
+                                                      PointerTy, /*Next*/
+                                                      I32Ty,     /*NrCounters*/
+                                                      I32Ty,     /*NrCallsites*/
+                                                  });
+
+  // Define a global for each entrypoint. We'll reuse the entrypoint's name as
+  // prefix. We assume the entrypoint names to be unique.
+  for (const auto &Fname : ContextRoots) {
+    if (const auto *F = M.getFunction(Fname)) {
+      if (F->isDeclaration())
+        continue;
+      auto *G = M.getOrInsertGlobal(Fname + "_ctx_root", ContextRootTy);
+      cast<GlobalVariable>(G)->setInitializer(
+          Constant::getNullValue(ContextRootTy));
+      ContextRootMap.insert(std::make_pair(F, G));
+      for (const auto &BB : *F)
+        for (const auto &I : BB)
+          if (const auto *CB = dyn_cast<CallBase>(&I))
+            if (CB->isMustTailCall()) {
+              M.getContext().emitError(
+                  "The function " + Fname +
+                  " was indicated as a context root, but it features musttail "
+                  "calls, which is not supported.");
+            }
+    }
+  }
+
+  // Declare the functions we will call.
+  StartCtx = cast<Function>(
+      M.getOrInsertFunction(
+           CompilerRtAPINames::StartCtx,
+           FunctionType::get(ContextNodeTy->getPointerTo(),
+                             {ContextRootTy->getPointerTo(), /*ContextRoot*/
+                              I64Ty, /*Guid*/ I32Ty,
+                              /*NrCounters*/ I32Ty /*NrCallsites*/},
+                             false))
+          .getCallee());
+  GetCtx = cast<Function>(
+      M.getOrInsertFunction(CompilerRtAPINames::GetCtx,
+                            FunctionType::get(ContextNodeTy->getPointerTo(),
+                                              {PointerTy, /*Callee*/
+                                               I64Ty,     /*Guid*/
+                                               I32Ty,     /*NrCounters*/
+                                               I32Ty},    /*NrCallsites*/
+                                              false))
+          .getCallee());
+  ReleaseCtx = cast<Function>(
+      M.getOrInsertFunction(
+           CompilerRtAPINames::ReleaseCtx,
+           FunctionType::get(Type::getVoidTy(M.getContext()),
+                             {
+                                 ContextRootTy->getPointerTo(), /*ContextRoot*/
+                             },
+                             false))
+          .getCallee());
+
+  // Declare the TLSes we will need to use.
+  CallsiteInfoTLS =
+      new GlobalVariable(M, PointerTy, false, GlobalValue::ExternalLinkage,
+                         nullptr, CompilerRtAPINames::CallsiteTLS);
+  CallsiteInfoTLS->setThreadLocal(true);
+  CallsiteInfoTLS->setVisibility(llvm::GlobalValue::HiddenVisibility);
+  ExpectedCalleeTLS =
+      new GlobalVariable(M, PointerTy, false, GlobalValue::ExternalLinkage,
+                         nullptr, CompilerRtAPINames::ExpectedCalleeTLS);
+  ExpectedCalleeTLS->setThreadLocal(true);
+  ExpectedCalleeTLS->setVisibility(llvm::GlobalValue::HiddenVisibility);
+}
+
+PreservedAnalyses PGOCtxProfLoweringPass::run(Module &M,
+                                              ModuleAnalysisManager &MAM) {
+  CtxInstrumentationLowerer Lowerer(M, MAM);
+  bool Changed = false;
+  for (auto &F : M)
+    Changed |= Lowerer.lowerFunction(F);
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
+  if (F.isDeclaration())
+    return false;
+  auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+  Value *Guid = nullptr;
+  auto [NrCounters, NrCallsites] = getNrCountersAndCallsites(F);
+
+  Value *Context = nullptr;
+  Value *RealContext = nullptr;
+
+  StructType *ThisContextType = nullptr;
+  Value *TheRootContext = nullptr;
+  Value *ExpectedCalleeTLSAddr = nullptr;
+  Value *CallsiteInfoTLSAddr = nullptr;
+
+  auto &Head = F.getEntryBlock();
+  for (auto &I : Head) {
+    // Find the increment intrinsic in the entry basic block.
+    if (auto *Mark = dyn_cast<InstrProfIncrementInst>(&I)) {
+      assert(Mark->getIndex()->isZero());
+
+      IRBuilder<> Builder(Mark);
+      // FIXME(mtrofin): use InstrProfSymtab::getCanonicalName
+      Guid = Builder.getInt64(F.getGUID());
+      // The type of the context of this function is now knowable since we have
+      // NrCallsites and NrCounters. We delcare it here because it's more
+      // convenient - we have the Builder.
+      ThisContextType = StructType::get(
+          F.getContext(),
+          {ContextNodeTy, ArrayType::get(Builder.getInt64Ty(), NrCounters),
+           ArrayType::get(Builder.getPtrTy(), NrCallsites)});
+      // Figure out which way we obtain the context object for this function -
+      // if it's an entrypoint, then we call StartCtx, otherwise GetCtx. In the
+      // former case, we also set TheRootContext since we need to release it
+      // at the end (plus it can be used to know if we have an entrypoint or a
+      // regular function)
+      auto Iter = ContextRootMap.find(&F);
+      if (Iter != ContextRootMap.end()) {
+        TheRootContext = Iter->second;
+        Context = Builder.CreateCall(StartCtx, {TheRootContext, Guid,
+                                                Builder.getInt32(NrCounters),
+                                                Builder.getInt32(NrCallsites)});
+        ORE.emit(
+            [&] { return OptimizationRemark(DEBUG_TYPE, "Entrypoint", &F); });
+      } else {
+        Context =
+            Builder.CreateCall(GetCtx, {&F, Guid, Builder.getInt32(NrCounters),
+                                        Builder.getInt32(NrCallsites)});
+        ORE.emit([&] {
+          return OptimizationRemark(DEBUG_TYPE, "RegularFunction", &F);
+        });
+      }
+      // The context could be scratch.
+      auto *CtxAsInt = Builder.CreatePtrToInt(Context, Builder.getInt64Ty());
+      if (NrCallsites > 0) {
+        // Figure out which index of the TLS 2-element buffers to use.
+        // Scratch context => we use index == 1. Real contexts => index == 0.
+        auto *Index = Builder.CreateAnd(CtxAsInt, Builder.getInt64(1));
+        // The GEPs corresponding to that index, in the respective TLS.
+        ExpectedCalleeTLSAddr = Builder.CreateGEP(
+            Builder.getInt8Ty()->getPointerTo(),
+            Builder.CreateThreadLocalAddress(ExpectedCalleeTLS), {Index});
+        CallsiteInfoTLSAddr = Builder.CreateGEP(
+            Builder.getInt32Ty(),
+            Builder.CreateThreadLocalAddress(CallsiteInfoTLS), {Index});
+      }
+      // Because the context pointer may have LSB set (to indicate scratch),
+      // clear it for the value we use as base address for the counter vector.
+      // This way, if later we want to have "real" (not clobbered) buffers
+      // acting as scratch, the lowering (at least this part of it that deals
+      // with counters) stays the same.
+      RealContext = Builder.CreateIntToPtr(
+          Builder.CreateAnd(CtxAsInt, Builder.getInt64(-2)),
+          ThisContextType->getPointerTo());
+      I.eraseFromParent();
+      break;
+    }
+  }
+  if (!Context) {
+    ORE.emit([&] {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "Skip", &F)
+             << "Function doesn't have instrumentation, skipping";
+    });
+    return false;
+  }
+
+  bool ContextWasReleased = false;
+  for (auto &BB : F) {
+    for (auto &I : llvm::make_early_inc_range(BB)) {
+      if (auto *Instr = dyn_cast<InstrProfCntrInstBase>(&I)) {
+        IRBuilder<> Builder(Instr);
+        switch (Instr->getIntrinsicID()) {
+        case llvm::Intrinsic::instrprof_increment:
+        case llvm::Intrinsic::instrprof_increment_step: {
+          // Increments (or increment-steps) are just a typical load - increment
+          // - store in the RealContext.
+          auto *AsStep = cast<InstrProfIncrementInst>(Instr);
+          auto *GEP = Builder.CreateGEP(
+              ThisContextType, RealContext,
+              {Builder.getInt32(0), Builder.getInt32(1), AsStep->getIndex()});
+          Builder.CreateStore(
+              Builder.CreateAdd(Builder.CreateLoad(Builder.getInt64Ty(), GEP),
+                                AsStep->getStep()),
+              GEP);
+        } break;
+        case llvm::Intrinsic::instrprof_callsite:
+          // callsite lowering: write the called value in the expected callee
+          // TLS we treat the TLS as volatile because of signal handlers and to
+          // avoid these being moved away from the callsite they decorate.
+          auto *CSIntrinsic = dyn_cast<InstrProfCallsite>(Instr);
+          Builder.CreateStore(CSIntrinsic->getCallee(), ExpectedCalleeTLSAddr,
+                              true);
+          // write the GEP of the slot in the sub-contexts portion of the
+          // context in TLS. Now, here, we use the actual Context value - as
+          // returned from compiler-rt - which may have the LSB set if the
+          // Context was scratch. Since the header of the context object and
+          // then the values are all 8-aligned (or, really, insofar as we care,
+          // they are even) - if the context is scratch (meaning, an odd value),
+          // so will the GEP. This is important because this is then visible to
+          // compiler-rt which will produce scratch contexts for callers that
+          // have a scratch context.
+          Builder.CreateStore(
+              Builder.CreateGEP(ThisContextType, Context,
+                                {Builder.getInt32(0), Builder.getInt32(2),
+                                 CSIntrinsic->getIndex()}),
+              CallsiteInfoTLSAddr, true);
+          break;
+        }
+        I.eraseFromParent();
+      } else if (TheRootContext && isa<ReturnInst>(I)) {
+        // Remember to release the context if we are an entrypoint.
+        IRBuilder<> Builder(&I);
+        Builder.CreateCall(ReleaseCtx, {TheRootContext});
+        ContextWasReleased = true;
+      }
+    }
+  }
+  // FIXME: This would happen if the entrypoint tailcalls. A way to fix would be
+  // to disallow this, (so this then stays as an error), another is to detect
+  // that and then do a wrapper or disallow the tail call. This only affects
+  // instrumentation, when we want to detect the call graph.
+  if (TheRootContext && !ContextWasReleased)
+    F.getContext().emitError(
+        "[ctx_prof] An entrypoint was instrumented but it has no `ret` "
+        "instructions above which to release the context: " +
+        F.getName());
+  return true;
+}
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index dbf00871543f..ac6d3348b3db 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -2126,7 +2126,7 @@ static bool annotateAllFunctions(
       HotFunctions.push_back(&F);
     if (PGOViewCounts != PGOVCT_None &&
         (ViewBlockFreqFuncName.empty() ||
-         F.getName().equals(ViewBlockFreqFuncName))) {
+         F.getName() == ViewBlockFreqFuncName)) {
       LoopInfo LI{DominatorTree(F)};
       std::unique_ptr<BranchProbabilityInfo> NewBPI =
           std::make_unique<BranchProbabilityInfo>(F, LI);
@@ -2141,7 +2141,7 @@ static bool annotateAllFunctions(
     }
     if (PGOViewRawCounts != PGOVCT_None &&
         (ViewBlockFreqFuncName.empty() ||
-         F.getName().equals(ViewBlockFreqFuncName))) {
+         F.getName() == ViewBlockFreqFuncName)) {
       if (PGOViewRawCounts == PGOVCT_Graph)
         if (ViewBlockFreqFuncName.empty())
           WriteGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 956ebe8fc8b9..2f3de199f51d 100644
--- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -803,6 +803,10 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
 int ThreadSanitizer::getMemoryAccessFuncIndex(Type *OrigTy, Value *Addr,
                                               const DataLayout &DL) {
   assert(OrigTy->isSized());
+  if (OrigTy->isScalableTy()) {
+    // FIXME: support vscale.
+    return -1;
+  }
   uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
   if (TypeSize != 8  && TypeSize != 16 &&
       TypeSize != 32 && TypeSize != 64 && TypeSize != 128) {
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp
index 96ecd7f368a0..5f0a9b22c3ee 100644
--- a/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -350,7 +350,7 @@ bool AggressiveDeadCodeElimination::isInstrumentsConstant(Instruction &I) {
   // TODO -- move this test into llvm::isInstructionTriviallyDead
   if (CallInst *CI = dyn_cast<CallInst>(&I))
     if (Function *Callee = CI->getCalledFunction())
-      if (Callee->getName().equals(getInstrProfValueProfFuncName()))
+      if (Callee->getName() == getInstrProfValueProfFuncName())
         if (isa<Constant>(CI->getArgOperand(0)))
           return true;
   return false;
diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 715cdaff9727..50b5fdb56720 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -62,6 +62,7 @@ STATISTIC(NumAShrsConverted, "Number of ashr converted to lshr");
 STATISTIC(NumAShrsRemoved, "Number of ashr removed");
 STATISTIC(NumSRems,     "Number of srem converted to urem");
 STATISTIC(NumSExt,      "Number of sext converted to zext");
+STATISTIC(NumSIToFP,    "Number of sitofp converted to uitofp");
 STATISTIC(NumSICmps,    "Number of signed icmp preds simplified to unsigned");
 STATISTIC(NumAnd,       "Number of ands removed");
 STATISTIC(NumNW,        "Number of no-wrap deductions");
@@ -89,7 +90,7 @@ STATISTIC(NumSMinMax,
           "Number of llvm.s{min,max} intrinsics simplified to unsigned");
 STATISTIC(NumUDivURemsNarrowedExpanded,
           "Number of bound udiv's/urem's expanded");
-STATISTIC(NumZExt, "Number of non-negative deductions");
+STATISTIC(NumNNeg, "Number of zext/uitofp non-negative deductions");
 
 static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
   if (Constant *C = LVI->getConstant(V, At))
@@ -1075,20 +1076,49 @@ static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) {
   return true;
 }
 
-static bool processZExt(ZExtInst *ZExt, LazyValueInfo *LVI) {
-  if (ZExt->getType()->isVectorTy())
+static bool processPossibleNonNeg(PossiblyNonNegInst *I, LazyValueInfo *LVI) {
+  if (I->getType()->isVectorTy())
     return false;
 
-  if (ZExt->hasNonNeg())
+  if (I->hasNonNeg())
     return false;
 
-  const Use &Base = ZExt->getOperandUse(0);
+  const Use &Base = I->getOperandUse(0);
   if (!LVI->getConstantRangeAtUse(Base, /*UndefAllowed*/ false)
            .isAllNonNegative())
     return false;
 
-  ++NumZExt;
-  ZExt->setNonNeg();
+  ++NumNNeg;
+  I->setNonNeg();
+
+  return true;
+}
+
+static bool processZExt(ZExtInst *ZExt, LazyValueInfo *LVI) {
+  return processPossibleNonNeg(cast<PossiblyNonNegInst>(ZExt), LVI);
+}
+
+static bool processUIToFP(UIToFPInst *UIToFP, LazyValueInfo *LVI) {
+  return processPossibleNonNeg(cast<PossiblyNonNegInst>(UIToFP), LVI);
+}
+
+static bool processSIToFP(SIToFPInst *SIToFP, LazyValueInfo *LVI) {
+  if (SIToFP->getType()->isVectorTy())
+    return false;
+
+  const Use &Base = SIToFP->getOperandUse(0);
+  if (!LVI->getConstantRangeAtUse(Base, /*UndefAllowed*/ false)
+           .isAllNonNegative())
+    return false;
+
+  ++NumSIToFP;
+  auto *UIToFP = CastInst::Create(Instruction::UIToFP, Base, SIToFP->getType(),
+                                  "", SIToFP->getIterator());
+  UIToFP->takeName(SIToFP);
+  UIToFP->setDebugLoc(SIToFP->getDebugLoc());
+  UIToFP->setNonNeg();
+  SIToFP->replaceAllUsesWith(UIToFP);
+  SIToFP->eraseFromParent();
 
   return true;
 }
@@ -1197,6 +1227,12 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
       case Instruction::ZExt:
         BBChanged |= processZExt(cast<ZExtInst>(&II), LVI);
         break;
+      case Instruction::UIToFP:
+        BBChanged |= processUIToFP(cast<UIToFPInst>(&II), LVI);
+        break;
+      case Instruction::SIToFP:
+        BBChanged |= processSIToFP(cast<SIToFPInst>(&II), LVI);
+        break;
       case Instruction::Add:
       case Instruction::Sub:
       case Instruction::Mul:
diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp
index d4907326eb0a..95a4c644a91a 100644
--- a/llvm/lib/Transforms/Scalar/GVNSink.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -226,12 +226,22 @@ class ModelledPHI {
 public:
   ModelledPHI() = default;
 
-  ModelledPHI(const PHINode *PN) {
-    // BasicBlock comes first so we sort by basic block pointer order, then by value pointer order.
-    SmallVector<std::pair<BasicBlock *, Value *>, 4> Ops;
+  ModelledPHI(const PHINode *PN,
+              const DenseMap<const BasicBlock *, unsigned> &BlockOrder) {
+    // BasicBlock comes first so we sort by basic block pointer order,
+    // then by value pointer order. No need to call `verifyModelledPHI`
+    // As the Values and Blocks are populated in a deterministic order.
+    using OpsType = std::pair<BasicBlock *, Value *>;
+    SmallVector<OpsType, 4> Ops;
     for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I)
       Ops.push_back({PN->getIncomingBlock(I), PN->getIncomingValue(I)});
-    llvm::sort(Ops);
+
+    auto ComesBefore = [BlockOrder](OpsType O1, OpsType O2) {
+      return BlockOrder.lookup(O1.first) < BlockOrder.lookup(O2.first);
+    };
+    // Sort in a deterministic order.
+    llvm::sort(Ops, ComesBefore);
+
     for (auto &P : Ops) {
       Blocks.push_back(P.first);
       Values.push_back(P.second);
@@ -247,16 +257,38 @@ public:
     return M;
   }
 
+  void
+  verifyModelledPHI(const DenseMap<const BasicBlock *, unsigned> &BlockOrder) {
+    assert(Values.size() > 1 && Blocks.size() > 1 &&
+           "Modelling PHI with less than 2 values");
+    auto ComesBefore = [BlockOrder](const BasicBlock *BB1,
+                                    const BasicBlock *BB2) {
+      return BlockOrder.lookup(BB1) < BlockOrder.lookup(BB2);
+    };
+    assert(llvm::is_sorted(Blocks, ComesBefore));
+    int C = 0;
+    for (const Value *V : Values) {
+      if (!isa<UndefValue>(V)) {
+        assert(cast<Instruction>(V)->getParent() == Blocks[C]);
+        (void)C;
+      }
+      C++;
+    }
+  }
   /// Create a PHI from an array of incoming values and incoming blocks.
-  template <typename VArray, typename BArray>
-  ModelledPHI(const VArray &V, const BArray &B) {
+  ModelledPHI(SmallVectorImpl<Instruction *> &V,
+              SmallSetVector<BasicBlock *, 4> &B,
+              const DenseMap<const BasicBlock *, unsigned> &BlockOrder) {
+    // The order of Values and Blocks are already ordered by the caller.
     llvm::copy(V, std::back_inserter(Values));
     llvm::copy(B, std::back_inserter(Blocks));
+    verifyModelledPHI(BlockOrder);
   }
 
   /// Create a PHI from [I[OpNum] for I in Insts].
-  template <typename BArray>
-  ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, const BArray &B) {
+  /// TODO: Figure out a way to verifyModelledPHI in this constructor.
+  ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum,
+              SmallSetVector<BasicBlock *, 4> &B) {
     llvm::copy(B, std::back_inserter(Blocks));
     for (auto *I : Insts)
       Values.push_back(I->getOperand(OpNum));
@@ -297,7 +329,8 @@ public:
 
   // Hash functor
   unsigned hash() const {
-      return (unsigned)hash_combine_range(Values.begin(), Values.end());
+    // Is deterministic because Values are saved in a specific order.
+    return (unsigned)hash_combine_range(Values.begin(), Values.end());
   }
 
   bool operator==(const ModelledPHI &Other) const {
@@ -566,7 +599,7 @@ public:
 
 class GVNSink {
 public:
-  GVNSink() = default;
+  GVNSink() {}
 
   bool run(Function &F) {
     LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName()
@@ -575,6 +608,16 @@ public:
     unsigned NumSunk = 0;
     ReversePostOrderTraversal<Function*> RPOT(&F);
     VN.setReachableBBs(BasicBlocksSet(RPOT.begin(), RPOT.end()));
+    // Populate reverse post-order to order basic blocks in deterministic
+    // order. Any arbitrary ordering will work in this case as long as they are
+    // deterministic. The node ordering of newly created basic blocks
+    // are irrelevant because RPOT(for computing sinkable candidates) is also
+    // obtained ahead of time and only their order are relevant for this pass.
+    unsigned NodeOrdering = 0;
+    RPOTOrder[*RPOT.begin()] = ++NodeOrdering;
+    for (auto *BB : RPOT)
+      if (!pred_empty(BB))
+        RPOTOrder[BB] = ++NodeOrdering;
     for (auto *N : RPOT)
       NumSunk += sinkBB(N);
 
@@ -583,6 +626,7 @@ public:
 
 private:
   ValueTable VN;
+  DenseMap<const BasicBlock *, unsigned> RPOTOrder;
 
   bool shouldAvoidSinkingInstruction(Instruction *I) {
     // These instructions may change or break semantics if moved.
@@ -603,7 +647,7 @@ private:
   void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs,
                           SmallPtrSetImpl<Value *> &PHIContents) {
     for (PHINode &PN : BB->phis()) {
-      auto MPHI = ModelledPHI(&PN);
+      auto MPHI = ModelledPHI(&PN, RPOTOrder);
       PHIs.insert(MPHI);
       for (auto *V : MPHI.getValues())
         PHIContents.insert(V);
@@ -691,7 +735,7 @@ GVNSink::analyzeInstructionForSinking(LockstepReverseIterator &LRI,
   }
 
   // The sunk instruction's results.
-  ModelledPHI NewPHI(NewInsts, ActivePreds);
+  ModelledPHI NewPHI(NewInsts, ActivePreds, RPOTOrder);
 
   // Does sinking this instruction render previous PHIs redundant?
   if (NeededPHIs.erase(NewPHI))
@@ -766,6 +810,9 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
              BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
   SmallVector<BasicBlock *, 4> Preds;
   for (auto *B : predecessors(BBEnd)) {
+    // Bailout on basic blocks without predecessor(PR42346).
+    if (!RPOTOrder.count(B))
+      return 0;
     auto *T = B->getTerminator();
     if (isa<BranchInst>(T) || isa<SwitchInst>(T))
       Preds.push_back(B);
@@ -774,7 +821,11 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
   }
   if (Preds.size() < 2)
     return 0;
-  llvm::sort(Preds);
+  auto ComesBefore = [this](const BasicBlock *BB1, const BasicBlock *BB2) {
+    return RPOTOrder.lookup(BB1) < RPOTOrder.lookup(BB2);
+  };
+  // Sort in a deterministic order.
+  llvm::sort(Preds, ComesBefore);
 
   unsigned NumOrigPreds = Preds.size();
   // We can only sink instructions through unconditional branches.
@@ -889,5 +940,6 @@ PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) {
   GVNSink G;
   if (!G.run(F))
     return PreservedAnalyses::all();
+
   return PreservedAnalyses::none();
 }
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index ba392e187b8b..dd7c89034ca0 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -359,15 +359,18 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
       PHINode::Create(Int32Ty, 2, PN->getName() + ".int", PN->getIterator());
   NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue),
                       PN->getIncomingBlock(IncomingEdge));
+  NewPHI->setDebugLoc(PN->getDebugLoc());
 
-  Value *NewAdd =
+  Instruction *NewAdd =
       BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue),
                                 Incr->getName() + ".int", Incr->getIterator());
+  NewAdd->setDebugLoc(Incr->getDebugLoc());
   NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge));
 
   ICmpInst *NewCompare =
       new ICmpInst(TheBr->getIterator(), NewPred, NewAdd,
                    ConstantInt::get(Int32Ty, ExitValue), Compare->getName());
+  NewCompare->setDebugLoc(Compare->getDebugLoc());
 
   // In the following deletions, PN may become dead and may be deleted.
   // Use a WeakTrackingVH to observe whether this happens.
@@ -391,8 +394,9 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
   // We give preference to sitofp over uitofp because it is faster on most
   // platforms.
   if (WeakPH) {
-    Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
-                                 PN->getParent()->getFirstInsertionPt());
+    Instruction *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
+                                       PN->getParent()->getFirstInsertionPt());
+    Conv->setDebugLoc(PN->getDebugLoc());
     PN->replaceAllUsesWith(Conv);
     RecursivelyDeleteTriviallyDeadInstructions(PN, TLI, MSSAU.get());
   }
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index d552b8aabb99..88307b8b074e 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -868,7 +868,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
 
       for (const auto &LHSVal : LHSVals) {
         Constant *V = LHSVal.first;
-        Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst);
+        Constant *Folded =
+            ConstantFoldCompareInstOperands(Pred, V, CmpConst, DL);
         if (Constant *KC = getKnownConstant(Folded, WantInteger))
           Result.emplace_back(KC, LHSVal.second);
       }
@@ -1278,9 +1279,11 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
     // only happen in dead loops.
     if (AvailableVal == LoadI)
       AvailableVal = PoisonValue::get(LoadI->getType());
-    if (AvailableVal->getType() != LoadI->getType())
+    if (AvailableVal->getType() != LoadI->getType()) {
       AvailableVal = CastInst::CreateBitOrPointerCast(
           AvailableVal, LoadI->getType(), "", LoadI->getIterator());
+      cast<Instruction>(AvailableVal)->setDebugLoc(LoadI->getDebugLoc());
+    }
     LoadI->replaceAllUsesWith(AvailableVal);
     LoadI->eraseFromParent();
     return true;
@@ -1509,7 +1512,8 @@ findMostPopularDest(BasicBlock *BB,
 // BB->getSinglePredecessor() and then on to BB.
 Constant *JumpThreadingPass::evaluateOnPredecessorEdge(BasicBlock *BB,
                                                        BasicBlock *PredPredBB,
-                                                       Value *V) {
+                                                       Value *V,
+                                                       const DataLayout &DL) {
   BasicBlock *PredBB = BB->getSinglePredecessor();
   assert(PredBB && "Expected a single predecessor");
 
@@ -1534,11 +1538,12 @@ Constant *JumpThreadingPass::evaluateOnPredecessorEdge(BasicBlock *BB,
   if (CmpInst *CondCmp = dyn_cast<CmpInst>(V)) {
     if (CondCmp->getParent() == BB) {
       Constant *Op0 =
-          evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0));
+          evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0), DL);
       Constant *Op1 =
-          evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1));
+          evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1), DL);
       if (Op0 && Op1) {
-        return ConstantExpr::getCompare(CondCmp->getPredicate(), Op0, Op1);
+        return ConstantFoldCompareInstOperands(CondCmp->getPredicate(), Op0,
+                                               Op1, DL);
       }
     }
     return nullptr;
@@ -2191,12 +2196,13 @@ bool JumpThreadingPass::maybethreadThroughTwoBasicBlocks(BasicBlock *BB,
   unsigned OneCount = 0;
   BasicBlock *ZeroPred = nullptr;
   BasicBlock *OnePred = nullptr;
+  const DataLayout &DL = BB->getModule()->getDataLayout();
   for (BasicBlock *P : predecessors(PredBB)) {
     // If PredPred ends with IndirectBrInst, we can't handle it.
     if (isa<IndirectBrInst>(P->getTerminator()))
       continue;
     if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(
-            evaluateOnPredecessorEdge(BB, P, Cond))) {
+            evaluateOnPredecessorEdge(BB, P, Cond, DL))) {
       if (CI->isZero()) {
         ZeroCount++;
         ZeroPred = P;
@@ -2983,6 +2989,7 @@ bool JumpThreadingPass::tryToUnfoldSelectInCurrBB(BasicBlock *BB) {
     PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI->getIterator());
     NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
     NewPN->addIncoming(SI->getFalseValue(), BB);
+    NewPN->setDebugLoc(SI->getDebugLoc());
     SI->replaceAllUsesWith(NewPN);
     SI->eraseFromParent();
     // NewBB and SplitBB are newly created blocks which require insertion.
@@ -3120,6 +3127,7 @@ bool JumpThreadingPass::threadGuard(BasicBlock *BB, IntrinsicInst *Guard,
       PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
       NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
       NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
+      NewPN->setDebugLoc(Inst->getDebugLoc());
       NewPN->insertBefore(InsertionPoint);
       Inst->replaceAllUsesWith(NewPN);
     }
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index e50413de46b1..6aa4188d1cc4 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -933,12 +933,14 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
         ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
         SafetyInfo->insertInstructionTo(ReciprocalDivisor, I.getParent());
         ReciprocalDivisor->insertBefore(&I);
+        ReciprocalDivisor->setDebugLoc(I.getDebugLoc());
 
         auto Product =
             BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
         Product->setFastMathFlags(I.getFastMathFlags());
         SafetyInfo->insertInstructionTo(Product, I.getParent());
         Product->insertAfter(&I);
+        Product->setDebugLoc(I.getDebugLoc());
         I.replaceAllUsesWith(Product);
         eraseInstruction(I, *SafetyInfo, MSSAU);
 
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index 0e9cf328f149..a7f8a22ece27 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -1005,7 +1005,7 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM,
   // in simplified form, and also needs LCSSA. Running
   // this pass will simplify all loops that contain inner loops,
   // regardless of whether anything ends up being flattened.
-  LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, nullptr);
+  LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr);
   for (Loop *InnerLoop : LN.getLoops()) {
     auto *OuterLoop = InnerLoop->getParentLoop();
     if (!OuterLoop)
diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 059900f357e6..f611ef6b2fa2 100644
--- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -183,7 +183,8 @@ public:
   findStoreToLoadDependences(const LoopAccessInfo &LAI) {
     std::forward_list<StoreToLoadForwardingCandidate> Candidates;
 
-    const auto *Deps = LAI.getDepChecker().getDependences();
+    const auto &DepChecker = LAI.getDepChecker();
+    const auto *Deps = DepChecker.getDependences();
     if (!Deps)
       return Candidates;
 
@@ -194,8 +195,8 @@ public:
     SmallPtrSet<Instruction *, 4> LoadsWithUnknownDepedence;
 
     for (const auto &Dep : *Deps) {
-      Instruction *Source = Dep.getSource(LAI);
-      Instruction *Destination = Dep.getDestination(LAI);
+      Instruction *Source = Dep.getSource(DepChecker);
+      Instruction *Destination = Dep.getDestination(DepChecker);
 
       if (Dep.Type == MemoryDepChecker::Dependence::Unknown ||
           Dep.Type == MemoryDepChecker::Dependence::IndirectUnsafe) {
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index ec42e2d6e193..eb1904ccaff3 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1817,10 +1817,12 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
   case LSRUse::Address: {
     // Check the scaling factor cost with both the min and max offsets.
     InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
-        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
+        LU.AccessTy.MemTy, F.BaseGV,
+        StackOffset::getFixed(F.BaseOffset + LU.MinOffset), F.HasBaseReg,
         F.Scale, LU.AccessTy.AddrSpace);
     InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
-        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
+        LU.AccessTy.MemTy, F.BaseGV,
+        StackOffset::getFixed(F.BaseOffset + LU.MaxOffset), F.HasBaseReg,
         F.Scale, LU.AccessTy.AddrSpace);
 
     assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 75fb8765061e..10fc9e9303e8 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopedHashTable.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -27,6 +28,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/LoopUnrollAnalyzer.h"
+#include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@@ -1140,7 +1142,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
                 std::optional<bool> ProvidedUpperBound,
                 std::optional<bool> ProvidedAllowPeeling,
                 std::optional<bool> ProvidedAllowProfileBasedPeeling,
-                std::optional<unsigned> ProvidedFullUnrollMaxCount) {
+                std::optional<unsigned> ProvidedFullUnrollMaxCount,
+                AAResults *AA = nullptr) {
 
   LLVM_DEBUG(dbgs() << "Loop Unroll: F["
                     << L->getHeader()->getParent()->getName() << "] Loop %"
@@ -1292,7 +1295,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
 
     ValueToValueMapTy VMap;
     if (peelLoop(L, PP.PeelCount, LI, &SE, DT, &AC, PreserveLCSSA, VMap)) {
-      simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI);
+      simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI, nullptr);
       // If the loop was peeled, we already "used up" the profile information
       // we had, so we don't want to unroll or peel again.
       if (PP.PeelProfiledIterations)
@@ -1325,7 +1328,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
       L,
       {UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
        UP.UnrollRemainder, ForgetAllSCEV},
-      LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
+      LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
   if (UnrollResult == LoopUnrollResult::Unmodified)
     return LoopUnrollResult::Unmodified;
 
@@ -1572,6 +1575,7 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
   auto &AC = AM.getResult<AssumptionAnalysis>(F);
   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  AAResults &AA = AM.getResult<AAManager>(F);
 
   LoopAnalysisManager *LAM = nullptr;
   if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
@@ -1627,7 +1631,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
         /*Count*/ std::nullopt,
         /*Threshold*/ std::nullopt, UnrollOpts.AllowPartial,
         UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling,
-        UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
+        UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount,
+        &AA);
     Changed |= Result != LoopUnrollResult::Unmodified;
 
     // The parent must not be damaged by unrolling!
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index f39c24484840..663715948241 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -582,7 +582,7 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
   const Function *F = L.getHeader()->getParent();
   OptimizationRemarkEmitter ORE(F);
 
-  LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr);
+  LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr);
   if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT))
     return PreservedAnalyses::all();
   return getLoopPassPreservedAnalyses();
diff --git a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
index f5c9aaa4f20b..77d155d7e78e 100644
--- a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -591,7 +591,7 @@ static Instruction *findLocationForEntrySafepoint(Function &F,
 const char GCSafepointPollName[] = "gc.safepoint_poll";
 
 static bool isGCSafepointPoll(Function &F) {
-  return F.getName().equals(GCSafepointPollName);
+  return F.getName() == GCSafepointPollName;
 }
 
 /// Returns true if this function should be rewritten to include safepoint
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 330b464667ee..858e54c4a9bc 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1685,10 +1685,10 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
 
   // Pass through the requested lowering if any.  The default is live-through.
   StringRef DeoptLowering = getDeoptLowering(Call);
-  if (DeoptLowering.equals("live-in"))
+  if (DeoptLowering == "live-in")
     Flags |= uint32_t(StatepointFlags::DeoptLiveIn);
   else {
-    assert(DeoptLowering.equals("live-through") && "Unsupported value!");
+    assert(DeoptLowering == "live-through" && "Unsupported value!");
   }
 
   FunctionCallee CallTarget(Call->getFunctionType(), Call->getCalledOperand());
@@ -3046,8 +3046,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
       // which doesn't know how to produce a proper deopt state. So if we see a
       // non-leaf memcpy/memmove without deopt state just treat it as a leaf
       // copy and don't produce a statepoint.
-      if (!AllowStatepointWithNoDeoptInfo &&
-          !Call->getOperandBundle(LLVMContext::OB_deopt)) {
+      if (!AllowStatepointWithNoDeoptInfo && !Call->hasDeoptState()) {
         assert((isa<AtomicMemCpyInst>(Call) || isa<AtomicMemMoveInst>(Call)) &&
                "Don't expect any other calls here!");
         return false;
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index a4111fad5d9f..de80fa2c0502 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -862,6 +862,69 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI,
   ModifiedDT = true;
 }
 
+static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI,
+                                           DomTreeUpdater *DTU,
+                                           bool &ModifiedDT) {
+  // If we extend histogram to return a result someday (like the updated vector)
+  // then we'll need to support it here.
+  assert(CI->getType()->isVoidTy() && "Histogram with non-void return.");
+  Value *Ptrs = CI->getArgOperand(0);
+  Value *Inc = CI->getArgOperand(1);
+  Value *Mask = CI->getArgOperand(2);
+
+  auto *AddrType = cast<FixedVectorType>(Ptrs->getType());
+  Type *EltTy = Inc->getType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  Builder.SetInsertPoint(InsertPt);
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // FIXME: Do we need to add an alignment parameter to the intrinsic?
+  unsigned VectorWidth = AddrType->getNumElements();
+
+  // Shorten the way if the mask is a vector of constants.
+  if (isConstantIntVector(Mask)) {
+    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+      if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
+        continue;
+      Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx));
+      LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx));
+      Value *Add = Builder.CreateAdd(Load, Inc);
+      Builder.CreateStore(Add, Ptr);
+    }
+    CI->eraseFromParent();
+    return;
+  }
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+    Value *Predicate =
+        Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+
+    Instruction *ThenTerm =
+        SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
+                                  /*BranchWeights=*/nullptr, DTU);
+
+    BasicBlock *CondBlock = ThenTerm->getParent();
+    CondBlock->setName("cond.histogram.update");
+
+    Builder.SetInsertPoint(CondBlock->getTerminator());
+    Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx));
+    LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx));
+    Value *Add = Builder.CreateAdd(Load, Inc);
+    Builder.CreateStore(Add, Ptr);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = ThenTerm->getSuccessor(0);
+    NewIfBlock->setName("else");
+    Builder.SetInsertPoint(NewIfBlock, NewIfBlock->begin());
+  }
+
+  CI->eraseFromParent();
+  ModifiedDT = true;
+}
+
 static bool runImpl(Function &F, const TargetTransformInfo &TTI,
                     DominatorTree *DT) {
   std::optional<DomTreeUpdater> DTU;
@@ -938,6 +1001,12 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
     switch (II->getIntrinsicID()) {
     default:
       break;
+    case Intrinsic::experimental_vector_histogram_add:
+      if (TTI.isLegalMaskedVectorHistogram(CI->getArgOperand(0)->getType(),
+                                           CI->getArgOperand(1)->getType()))
+        return false;
+      scalarizeMaskedVectorHistogram(DL, CI, DTU, ModifiedDT);
+      break;
     case Intrinsic::masked_load:
       // Scalarize unsupported vector masked load
       if (TTI.isLegalMaskedLoad(
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index c54a956fc7e2..9f85396cde25 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -972,22 +972,13 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
 
 bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
                                             TargetTransformInfo &TTI) {
-  Type *GEPType = GEP->getResultElementType();
-  // TODO: support reordering for non-trivial GEP chains
-  if (GEPType->isAggregateType() || GEP->getNumIndices() != 1)
+  if (GEP->getNumIndices() != 1)
     return false;
 
   auto PtrGEP = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand());
   if (!PtrGEP)
     return false;
-  Type *PtrGEPType = PtrGEP->getResultElementType();
-  // TODO: support reordering for non-trivial GEP chains
-  if (PtrGEPType->isAggregateType() || PtrGEP->getNumIndices() != 1)
-    return false;
-
-  // TODO: support reordering for non-trivial GEP chains
-  if (PtrGEPType != GEPType ||
-      PtrGEP->getSourceElementType() != GEP->getSourceElementType())
+  if (PtrGEP->getNumIndices() != 1)
     return false;
 
   bool NestedNeedsExtraction;
@@ -1002,8 +993,6 @@ bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
                                  /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
     return false;
 
-  IRBuilder<> Builder(GEP);
-  Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
   bool GEPInBounds = GEP->isInBounds();
   bool PtrGEPInBounds = PtrGEP->isInBounds();
   bool IsChainInBounds = GEPInBounds && PtrGEPInBounds;
@@ -1018,13 +1007,14 @@ bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
     }
   }
 
+  IRBuilder<> Builder(GEP);
   // For trivial GEP chains, we can swap the indicies.
-  auto NewSrc = Builder.CreateGEP(PtrGEPType, PtrGEP->getPointerOperand(),
-                                  SmallVector<Value *, 4>(GEP->indices()));
-  cast<GetElementPtrInst>(NewSrc)->setIsInBounds(IsChainInBounds);
-  auto NewGEP = Builder.CreateGEP(GEPType, NewSrc,
-                                  SmallVector<Value *, 4>(PtrGEP->indices()));
-  cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
+  Value *NewSrc = Builder.CreateGEP(
+      GEP->getSourceElementType(), PtrGEP->getPointerOperand(),
+      SmallVector<Value *, 4>(GEP->indices()), "", IsChainInBounds);
+  Value *NewGEP = Builder.CreateGEP(PtrGEP->getSourceElementType(), NewSrc,
+                                    SmallVector<Value *, 4>(PtrGEP->indices()),
+                                    "", IsChainInBounds);
   GEP->replaceAllUsesWith(NewGEP);
   RecursivelyDeleteTriviallyDeadInstructions(GEP);
   return true;
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index d763b1ee0aa1..002ed381a4fd 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -1261,9 +1261,8 @@ static BasicBlock *buildClonedLoopBlocks(
   Module *M = ClonedPH->getParent()->getParent();
   for (auto *ClonedBB : NewBlocks)
     for (Instruction &I : *ClonedBB) {
-      RemapDbgVariableRecordRange(M, I.getDbgRecordRange(), VMap,
-                                  RF_NoModuleLevelChanges |
-                                      RF_IgnoreMissingLocals);
+      RemapDbgRecordRange(M, I.getDbgRecordRange(), VMap,
+                          RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
       RemapInstruction(&I, VMap,
                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
       if (auto *II = dyn_cast<AssumeInst>(&I))
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 51fc28ef90ef..462283c0bfe0 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -333,10 +333,6 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
   // Finally, erase the old block and update dominator info.
   DeleteDeadBlock(BB, DTU);
 
-  // Remove redundant "llvm.dbg" instrunctions after blocks have been merged.
-  if (PredBB->getParent()->getSubprogram())
-    RemoveRedundantDbgInstrs(PredBB);
-
   return true;
 }
 
@@ -1145,6 +1141,7 @@ BasicBlock *llvm::splitBlockBefore(BasicBlock *Old, BasicBlock::iterator SplitPt
 }
 
 /// Update DominatorTree, LoopInfo, and LCCSA analysis information.
+/// Invalidates DFS Numbering when DTU or DT is provided.
 static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
                                       ArrayRef<BasicBlock *> Preds,
                                       DomTreeUpdater *DTU, DominatorTree *DT,
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 303a09805a9d..981183682b8b 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -14,9 +14,11 @@
 
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/AttributeMask.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfo.h"
@@ -276,8 +278,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
     // attached debug-info records.
     for (Instruction &II : *BB) {
       RemapInstruction(&II, VMap, RemapFlag, TypeMapper, Materializer);
-      RemapDbgVariableRecordRange(II.getModule(), II.getDbgRecordRange(), VMap,
-                                  RemapFlag, TypeMapper, Materializer);
+      RemapDbgRecordRange(II.getModule(), II.getDbgRecordRange(), VMap,
+                          RemapFlag, TypeMapper, Materializer);
     }
 
   // Only update !llvm.dbg.cu for DifferentModule (not CloneModule). In the
@@ -384,18 +386,6 @@ public:
 };
 } // namespace
 
-static bool hasRoundingModeOperand(Intrinsic::ID CIID) {
-  switch (CIID) {
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)                         \
-  case Intrinsic::INTRINSIC:                                                   \
-    return ROUND_MODE == 1;
-#define FUNCTION INSTRUCTION
-#include "llvm/IR/ConstrainedOps.def"
-  default:
-    llvm_unreachable("Unexpected constrained intrinsic id");
-  }
-}
-
 Instruction *
 PruningFunctionCloner::cloneInstruction(BasicBlock::const_iterator II) {
   const Instruction &OldInst = *II;
@@ -453,7 +443,7 @@ PruningFunctionCloner::cloneInstruction(BasicBlock::const_iterator II) {
       // The last arguments of a constrained intrinsic are metadata that
       // represent rounding mode (absents in some intrinsics) and exception
       // behavior. The inlined function uses default settings.
-      if (hasRoundingModeOperand(CIID))
+      if (Intrinsic::hasConstrainedFPRoundingModeOperand(CIID))
         Args.push_back(
             MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.tonearest")));
       Args.push_back(
@@ -540,18 +530,13 @@ void PruningFunctionCloner::CloneBlock(
       RemapInstruction(NewInst, VMap,
                        ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
 
-      // If we can simplify this instruction to some other value, simply add
-      // a mapping to that value rather than inserting a new instruction into
-      // the basic block.
-      if (Value *V =
-              simplifyInstruction(NewInst, BB->getModule()->getDataLayout())) {
-        // On the off-chance that this simplifies to an instruction in the old
-        // function, map it back into the new function.
-        if (NewFunc != OldFunc)
-          if (Value *MappedV = VMap.lookup(V))
-            V = MappedV;
-
-        if (!NewInst->mayHaveSideEffects()) {
+      // Eagerly constant fold the newly cloned instruction. If successful, add
+      // a mapping to the new value. Non-constant operands may be incomplete at
+      // this stage, thus instruction simplification is performed after
+      // processing phi-nodes.
+      if (Value *V = ConstantFoldInstruction(
+              NewInst, BB->getModule()->getDataLayout())) {
+        if (isInstructionTriviallyDead(NewInst)) {
           VMap[&*II] = V;
           NewInst->eraseFromParent();
           continue;
@@ -823,54 +808,47 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
     }
   }
 
-  // Make a second pass over the PHINodes now that all of them have been
-  // remapped into the new function, simplifying the PHINode and performing any
-  // recursive simplifications exposed. This will transparently update the
-  // WeakTrackingVH in the VMap. Notably, we rely on that so that if we coalesce
-  // two PHINodes, the iteration over the old PHIs remains valid, and the
-  // mapping will just map us to the new node (which may not even be a PHI
-  // node).
-  const DataLayout &DL = NewFunc->getParent()->getDataLayout();
-  SmallSetVector<const Value *, 8> Worklist;
-  for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx)
-    if (isa<PHINode>(VMap[PHIToResolve[Idx]]))
-      Worklist.insert(PHIToResolve[Idx]);
-
-  // Note that we must test the size on each iteration, the worklist can grow.
-  for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
-    const Value *OrigV = Worklist[Idx];
-    auto *I = dyn_cast_or_null<Instruction>(VMap.lookup(OrigV));
-    if (!I)
-      continue;
-
-    // Skip over non-intrinsic callsites, we don't want to remove any nodes from
-    // the CGSCC.
-    CallBase *CB = dyn_cast<CallBase>(I);
-    if (CB && CB->getCalledFunction() &&
-        !CB->getCalledFunction()->isIntrinsic())
-      continue;
-
-    // See if this instruction simplifies.
-    Value *SimpleV = simplifyInstruction(I, DL);
-    if (!SimpleV)
-      continue;
-
-    // Stash away all the uses of the old instruction so we can check them for
-    // recursive simplifications after a RAUW. This is cheaper than checking all
-    // uses of To on the recursive step in most cases.
-    for (const User *U : OrigV->users())
-      Worklist.insert(cast<Instruction>(U));
+  // Drop all incompatible return attributes that cannot be applied to NewFunc
+  // during cloning, so as to allow instruction simplification to reason on the
+  // old state of the function. The original attributes are restored later.
+  AttributeMask IncompatibleAttrs =
+      AttributeFuncs::typeIncompatible(OldFunc->getReturnType());
+  AttributeList Attrs = NewFunc->getAttributes();
+  NewFunc->removeRetAttrs(IncompatibleAttrs);
 
-    // Replace the instruction with its simplified value.
-    I->replaceAllUsesWith(SimpleV);
-
-    // If the original instruction had no side effects, remove it.
-    if (isInstructionTriviallyDead(I))
-      I->eraseFromParent();
-    else
-      VMap[OrigV] = I;
+  // As phi-nodes have been now remapped, allow incremental simplification of
+  // newly-cloned instructions.
+  const DataLayout &DL = NewFunc->getParent()->getDataLayout();
+  for (const auto &BB : *OldFunc) {
+    for (const auto &I : BB) {
+      auto *NewI = dyn_cast_or_null<Instruction>(VMap.lookup(&I));
+      if (!NewI)
+        continue;
+
+      // Skip over non-intrinsic callsites, we don't want to remove any nodes
+      // from the CGSCC.
+      CallBase *CB = dyn_cast<CallBase>(NewI);
+      if (CB && CB->getCalledFunction() &&
+          !CB->getCalledFunction()->isIntrinsic())
+        continue;
+
+      if (Value *V = simplifyInstruction(NewI, DL)) {
+        NewI->replaceAllUsesWith(V);
+
+        if (isInstructionTriviallyDead(NewI)) {
+          NewI->eraseFromParent();
+        } else {
+          // Did not erase it? Restore the new instruction into VMap previously
+          // dropped by `ValueIsRAUWd`.
+          VMap[&I] = NewI;
+        }
+      }
+    }
   }
 
+  // Restore attributes.
+  NewFunc->setAttributes(Attrs);
+
   // Remap debug intrinsic operands now that all values have been mapped.
   // Doing this now (late) preserves use-before-defs in debug intrinsics. If
   // we didn't do this, ValueAsMetadata(use-before-def) operands would be
@@ -889,10 +867,10 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
   Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB])->getIterator();
   for (BasicBlock &BB : make_range(Begin, NewFunc->end())) {
     for (Instruction &I : BB) {
-      RemapDbgVariableRecordRange(I.getModule(), I.getDbgRecordRange(), VMap,
-                                  ModuleLevelChanges ? RF_None
-                                                     : RF_NoModuleLevelChanges,
-                                  TypeMapper, Materializer);
+      RemapDbgRecordRange(I.getModule(), I.getDbgRecordRange(), VMap,
+                          ModuleLevelChanges ? RF_None
+                                             : RF_NoModuleLevelChanges,
+                          TypeMapper, Materializer);
     }
   }
 
@@ -991,9 +969,8 @@ void llvm::remapInstructionsInBlocks(ArrayRef<BasicBlock *> Blocks,
   // Rewrite the code to refer to itself.
   for (auto *BB : Blocks) {
     for (auto &Inst : *BB) {
-      RemapDbgVariableRecordRange(
-          Inst.getModule(), Inst.getDbgRecordRange(), VMap,
-          RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+      RemapDbgRecordRange(Inst.getModule(), Inst.getDbgRecordRange(), VMap,
+                          RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
       RemapInstruction(&Inst, VMap,
                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
     }
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 6988292ac715..f2672b8e9118 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1678,8 +1678,9 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
     DVR->getMarker()->MarkedInstr->dropOneDbgRecord(DVR);
   DIB.finalizeSubprogram(NewSP);
 
-  // Fix up the scope information attached to the line locations in the new
-  // function.
+  // Fix up the scope information attached to the line locations and the
+  // debug assignment metadata in the new function.
+  DenseMap<DIAssignID *, DIAssignID *> AssignmentIDMap;
   for (Instruction &I : instructions(NewFunc)) {
     if (const DebugLoc &DL = I.getDebugLoc())
       I.setDebugLoc(
@@ -1695,6 +1696,7 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
       return MD;
     };
     updateLoopMetadataDebugLocations(I, updateLoopInfoLoc);
+    at::remapAssignID(AssignmentIDMap, I);
   }
   if (!TheCall.getDebugLoc())
     TheCall.setDebugLoc(DILocation::get(Ctx, 0, 0, OldSP));
diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
index 67aeba7048f8..d95248c84b86 100644
--- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -148,8 +148,8 @@ int FunctionComparator::cmpAttrs(const AttributeList L,
         if (LA.getKindAsEnum() != RA.getKindAsEnum())
           return cmpNumbers(LA.getKindAsEnum(), RA.getKindAsEnum());
 
-        ConstantRange LCR = LA.getRange();
-        ConstantRange RCR = RA.getRange();
+        const ConstantRange &LCR = LA.getRange();
+        const ConstantRange &RCR = RA.getRange();
         if (int Res = cmpAPInts(LCR.getLower(), RCR.getLower()))
           return Res;
         if (int Res = cmpAPInts(LCR.getUpper(), RCR.getUpper()))
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 833dcbec228b..82daaedaa0e8 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1344,6 +1344,79 @@ static bool MayContainThrowingOrExitingCallAfterCB(CallBase *Begin,
       ++BeginIt, End->getIterator(), InlinerAttributeWindow + 1);
 }
 
+// Add attributes from CB params and Fn attributes that can always be propagated
+// to the corresponding argument / inner callbases.
+static void AddParamAndFnBasicAttributes(const CallBase &CB,
+                                         ValueToValueMapTy &VMap) {
+  auto *CalledFunction = CB.getCalledFunction();
+  auto &Context = CalledFunction->getContext();
+
+  // Collect valid attributes for all params.
+  SmallVector<AttrBuilder> ValidParamAttrs;
+  bool HasAttrToPropagate = false;
+
+  for (unsigned I = 0, E = CB.arg_size(); I < E; ++I) {
+    ValidParamAttrs.emplace_back(AttrBuilder{CB.getContext()});
+    // Access attributes can be propagated to any param with the same underlying
+    // object as the argument.
+    if (CB.paramHasAttr(I, Attribute::ReadNone))
+      ValidParamAttrs.back().addAttribute(Attribute::ReadNone);
+    if (CB.paramHasAttr(I, Attribute::ReadOnly))
+      ValidParamAttrs.back().addAttribute(Attribute::ReadOnly);
+    if (CB.paramHasAttr(I, Attribute::WriteOnly))
+      ValidParamAttrs.back().addAttribute(Attribute::WriteOnly);
+    HasAttrToPropagate |= ValidParamAttrs.back().hasAttributes();
+  }
+
+  // Won't be able to propagate anything.
+  if (!HasAttrToPropagate)
+    return;
+
+  for (BasicBlock &BB : *CalledFunction) {
+    for (Instruction &Ins : BB) {
+      const auto *InnerCB = dyn_cast<CallBase>(&Ins);
+      if (!InnerCB)
+        continue;
+      auto *NewInnerCB = dyn_cast_or_null<CallBase>(VMap.lookup(InnerCB));
+      if (!NewInnerCB)
+        continue;
+      AttributeList AL = NewInnerCB->getAttributes();
+      for (unsigned I = 0, E = InnerCB->arg_size(); I < E; ++I) {
+        // Check if the underlying value for the parameter is an argument.
+        const Value *UnderlyingV =
+            getUnderlyingObject(InnerCB->getArgOperand(I));
+        const Argument *Arg = dyn_cast<Argument>(UnderlyingV);
+        if (!Arg)
+          continue;
+
+        unsigned ArgNo = Arg->getArgNo();
+        // If so, propagate its access attributes.
+        AL = AL.addParamAttributes(Context, I, ValidParamAttrs[ArgNo]);
+        // We can have conflicting attributes from the inner callsite and
+        // to-be-inlined callsite. In that case, choose the most
+        // restrictive.
+
+        // readonly + writeonly means we can never deref so make readnone.
+        if (AL.hasParamAttr(I, Attribute::ReadOnly) &&
+            AL.hasParamAttr(I, Attribute::WriteOnly))
+          AL = AL.addParamAttribute(Context, I, Attribute::ReadNone);
+
+        // If have readnone, need to clear readonly/writeonly
+        if (AL.hasParamAttr(I, Attribute::ReadNone)) {
+          AL = AL.removeParamAttribute(Context, I, Attribute::ReadOnly);
+          AL = AL.removeParamAttribute(Context, I, Attribute::WriteOnly);
+        }
+
+        // Writable cannot exist in conjunction w/ readonly/readnone
+        if (AL.hasParamAttr(I, Attribute::ReadOnly) ||
+            AL.hasParamAttr(I, Attribute::ReadNone))
+          AL = AL.removeParamAttribute(Context, I, Attribute::Writable);
+      }
+      NewInnerCB->setAttributes(AL);
+    }
+  }
+}
+
 // Only allow these white listed attributes to be propagated back to the
 // callee. This is because other attributes may only be valid on the call
 // itself, i.e. attributes such as signext and zeroext.
@@ -1815,29 +1888,12 @@ static void trackInlinedStores(Function::iterator Start, Function::iterator End,
 /// otherwise a function inlined more than once into the same function
 /// will cause DIAssignID to be shared by many instructions.
 static void fixupAssignments(Function::iterator Start, Function::iterator End) {
-  // Map {Old, New} metadata. Not used directly - use GetNewID.
   DenseMap<DIAssignID *, DIAssignID *> Map;
-  auto GetNewID = [&Map](Metadata *Old) {
-    DIAssignID *OldID = cast<DIAssignID>(Old);
-    if (DIAssignID *NewID = Map.lookup(OldID))
-      return NewID;
-    DIAssignID *NewID = DIAssignID::getDistinct(OldID->getContext());
-    Map[OldID] = NewID;
-    return NewID;
-  };
   // Loop over all the inlined instructions. If we find a DIAssignID
   // attachment or use, replace it with a new version.
   for (auto BBI = Start; BBI != End; ++BBI) {
-    for (Instruction &I : *BBI) {
-      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
-        if (DVR.isDbgAssign())
-          DVR.setAssignId(GetNewID(DVR.getAssignID()));
-      }
-      if (auto *ID = I.getMetadata(LLVMContext::MD_DIAssignID))
-        I.setMetadata(LLVMContext::MD_DIAssignID, GetNewID(ID));
-      else if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I))
-        DAI->setAssignId(GetNewID(DAI->getAssignID()));
-    }
+    for (Instruction &I : *BBI)
+      at::remapAssignID(Map, I);
   }
 }
 #undef DEBUG_TYPE
@@ -1909,10 +1965,14 @@ void llvm::updateProfileCallee(
   // During inlining ?
   if (VMap) {
     uint64_t CloneEntryCount = PriorEntryCount - NewEntryCount;
-    for (auto Entry : *VMap)
+    for (auto Entry : *VMap) {
       if (isa<CallInst>(Entry.first))
         if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second))
           CI->updateProfWeight(CloneEntryCount, PriorEntryCount);
+      if (isa<InvokeInst>(Entry.first))
+        if (auto *II = dyn_cast_or_null<InvokeInst>(Entry.second))
+          II->updateProfWeight(CloneEntryCount, PriorEntryCount);
+    }
   }
 
   if (EntryDelta) {
@@ -1921,9 +1981,12 @@ void llvm::updateProfileCallee(
     for (BasicBlock &BB : *Callee)
       // No need to update the callsite if it is pruned during inlining.
       if (!VMap || VMap->count(&BB))
-        for (Instruction &I : BB)
+        for (Instruction &I : BB) {
           if (CallInst *CI = dyn_cast<CallInst>(&I))
             CI->updateProfWeight(NewEntryCount, PriorEntryCount);
+          if (InvokeInst *II = dyn_cast<InvokeInst>(&I))
+            II->updateProfWeight(NewEntryCount, PriorEntryCount);
+        }
   }
 }
 
@@ -2363,6 +2426,10 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
     // function which feed into its return value.
     AddReturnAttributes(CB, VMap);
 
+    // Clone attributes on the params of the callsite to calls within the
+    // inlined function which use the same param.
+    AddParamAndFnBasicAttributes(CB, VMap);
+
     propagateMemProfMetadata(CalledFunc, CB,
                              InlinedFunctionInfo.ContainsMemProfMetadata, VMap);
 
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 143677f1d14b..a02ea1c06147 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -639,9 +639,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
             !NextDbgInsts.empty()) {
           auto DbgValueRange =
               LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
-          RemapDbgVariableRecordRange(M, DbgValueRange, ValueMap,
-                                      RF_NoModuleLevelChanges |
-                                          RF_IgnoreMissingLocals);
+          RemapDbgRecordRange(M, DbgValueRange, ValueMap,
+                              RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
           // Erase anything we've seen before.
           for (DbgVariableRecord &DVR :
                make_early_inc_range(filterDbgVars(DbgValueRange)))
@@ -666,9 +665,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
       if (LoopEntryBranch->getParent()->IsNewDbgInfoFormat &&
           !NextDbgInsts.empty()) {
         auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
-        RemapDbgVariableRecordRange(M, Range, ValueMap,
-                                    RF_NoModuleLevelChanges |
-                                        RF_IgnoreMissingLocals);
+        RemapDbgRecordRange(M, Range, ValueMap,
+                            RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
         NextDbgInsts = DbgMarker::getEmptyDbgRecordRange();
         // Erase anything we've seen before.
         for (DbgVariableRecord &DVR :
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 6f0d00081572..1216538195fb 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -18,17 +18,20 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopedHashTable.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/ADT/ilist_iterator.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/IR/BasicBlock.h"
@@ -209,13 +212,140 @@ static bool isEpilogProfitable(Loop *L) {
   return false;
 }
 
+struct LoadValue {
+  Instruction *DefI = nullptr;
+  unsigned Generation = 0;
+  LoadValue() = default;
+  LoadValue(Instruction *Inst, unsigned Generation)
+      : DefI(Inst), Generation(Generation) {}
+};
+
+class StackNode {
+  ScopedHashTable<const SCEV *, LoadValue>::ScopeTy LoadScope;
+  unsigned CurrentGeneration;
+  unsigned ChildGeneration;
+  DomTreeNode *Node;
+  DomTreeNode::const_iterator ChildIter;
+  DomTreeNode::const_iterator EndIter;
+  bool Processed = false;
+
+public:
+  StackNode(ScopedHashTable<const SCEV *, LoadValue> &AvailableLoads,
+            unsigned cg, DomTreeNode *N, DomTreeNode::const_iterator Child,
+            DomTreeNode::const_iterator End)
+      : LoadScope(AvailableLoads), CurrentGeneration(cg), ChildGeneration(cg),
+        Node(N), ChildIter(Child), EndIter(End) {}
+  // Accessors.
+  unsigned currentGeneration() const { return CurrentGeneration; }
+  unsigned childGeneration() const { return ChildGeneration; }
+  void childGeneration(unsigned generation) { ChildGeneration = generation; }
+  DomTreeNode *node() { return Node; }
+  DomTreeNode::const_iterator childIter() const { return ChildIter; }
+
+  DomTreeNode *nextChild() {
+    DomTreeNode *Child = *ChildIter;
+    ++ChildIter;
+    return Child;
+  }
+
+  DomTreeNode::const_iterator end() const { return EndIter; }
+  bool isProcessed() const { return Processed; }
+  void process() { Processed = true; }
+};
+
+Value *getMatchingValue(LoadValue LV, LoadInst *LI, unsigned CurrentGeneration,
+                        BatchAAResults &BAA,
+                        function_ref<MemorySSA *()> GetMSSA) {
+  if (!LV.DefI)
+    return nullptr;
+  if (LV.DefI->getType() != LI->getType())
+    return nullptr;
+  if (LV.Generation != CurrentGeneration) {
+    MemorySSA *MSSA = GetMSSA();
+    if (!MSSA)
+      return nullptr;
+    auto *EarlierMA = MSSA->getMemoryAccess(LV.DefI);
+    MemoryAccess *LaterDef =
+        MSSA->getWalker()->getClobberingMemoryAccess(LI, BAA);
+    if (!MSSA->dominates(LaterDef, EarlierMA))
+      return nullptr;
+  }
+  return LV.DefI;
+}
+
+void loadCSE(Loop *L, DominatorTree &DT, ScalarEvolution &SE, LoopInfo &LI,
+             BatchAAResults &BAA, function_ref<MemorySSA *()> GetMSSA) {
+  ScopedHashTable<const SCEV *, LoadValue> AvailableLoads;
+  SmallVector<std::unique_ptr<StackNode>> NodesToProcess;
+  DomTreeNode *HeaderD = DT.getNode(L->getHeader());
+  NodesToProcess.emplace_back(new StackNode(AvailableLoads, 0, HeaderD,
+                                            HeaderD->begin(), HeaderD->end()));
+
+  unsigned CurrentGeneration = 0;
+  while (!NodesToProcess.empty()) {
+    StackNode *NodeToProcess = &*NodesToProcess.back();
+
+    CurrentGeneration = NodeToProcess->currentGeneration();
+
+    if (!NodeToProcess->isProcessed()) {
+      // Process the node.
+
+      // If this block has a single predecessor, then the predecessor is the
+      // parent
+      // of the domtree node and all of the live out memory values are still
+      // current in this block.  If this block has multiple predecessors, then
+      // they could have invalidated the live-out memory values of our parent
+      // value.  For now, just be conservative and invalidate memory if this
+      // block has multiple predecessors.
+      if (!NodeToProcess->node()->getBlock()->getSinglePredecessor())
+        ++CurrentGeneration;
+      for (auto &I : make_early_inc_range(*NodeToProcess->node()->getBlock())) {
+
+        auto *Load = dyn_cast<LoadInst>(&I);
+        if (!Load || !Load->isSimple()) {
+          if (I.mayWriteToMemory())
+            CurrentGeneration++;
+          continue;
+        }
+
+        const SCEV *PtrSCEV = SE.getSCEV(Load->getPointerOperand());
+        LoadValue LV = AvailableLoads.lookup(PtrSCEV);
+        if (Value *M =
+                getMatchingValue(LV, Load, CurrentGeneration, BAA, GetMSSA)) {
+          if (LI.replacementPreservesLCSSAForm(Load, M)) {
+            Load->replaceAllUsesWith(M);
+            Load->eraseFromParent();
+          }
+        } else {
+          AvailableLoads.insert(PtrSCEV, LoadValue(Load, CurrentGeneration));
+        }
+      }
+      NodeToProcess->childGeneration(CurrentGeneration);
+      NodeToProcess->process();
+    } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
+      // Push the next child onto the stack.
+      DomTreeNode *Child = NodeToProcess->nextChild();
+      if (!L->contains(Child->getBlock()))
+        continue;
+      NodesToProcess.emplace_back(
+          new StackNode(AvailableLoads, NodeToProcess->childGeneration(), Child,
+                        Child->begin(), Child->end()));
+    } else {
+      // It has been processed, and there are no more children to process,
+      // so delete it and pop it off the stack.
+      NodesToProcess.pop_back();
+    }
+  }
+}
+
 /// Perform some cleanup and simplifications on loops after unrolling. It is
 /// useful to simplify the IV's in the new loop, as well as do a quick
 /// simplify/dce pass of the instructions.
 void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
                                    ScalarEvolution *SE, DominatorTree *DT,
                                    AssumptionCache *AC,
-                                   const TargetTransformInfo *TTI) {
+                                   const TargetTransformInfo *TTI,
+                                   AAResults *AA) {
   using namespace llvm::PatternMatch;
 
   // Simplify any new induction variables in the partially unrolled loop.
@@ -230,6 +360,16 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
       if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
         RecursivelyDeleteTriviallyDeadInstructions(Inst);
     }
+
+    if (AA) {
+      std::unique_ptr<MemorySSA> MSSA = nullptr;
+      BatchAAResults BAA(*AA);
+      loadCSE(L, *DT, *SE, *LI, BAA, [L, AA, DT, &MSSA]() -> MemorySSA * {
+        if (!MSSA)
+          MSSA.reset(new MemorySSA(*L, AA, DT));
+        return &*MSSA;
+      });
+    }
   }
 
   // At this point, the code is well formed.  Perform constprop, instsimplify,
@@ -237,6 +377,10 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
   SmallVector<WeakTrackingVH, 16> DeadInsts;
   for (BasicBlock *BB : L->getBlocks()) {
+    // Remove repeated debug instructions after loop unrolling.
+    if (BB->getParent()->getSubprogram())
+      RemoveRedundantDbgInstrs(BB);
+
     for (Instruction &Inst : llvm::make_early_inc_range(*BB)) {
       if (Value *V = simplifyInstruction(&Inst, {DL, nullptr, DT, AC}))
         if (LI->replacementPreservesLCSSAForm(&Inst, V))
@@ -292,12 +436,11 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
 ///
 /// If RemainderLoop is non-null, it will receive the remainder loop (if
 /// required and not fully unrolled).
-LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
-                                  ScalarEvolution *SE, DominatorTree *DT,
-                                  AssumptionCache *AC,
-                                  const TargetTransformInfo *TTI,
-                                  OptimizationRemarkEmitter *ORE,
-                                  bool PreserveLCSSA, Loop **RemainderLoop) {
+LoopUnrollResult
+llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
+                 ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+                 const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
+                 bool PreserveLCSSA, Loop **RemainderLoop, AAResults *AA) {
   assert(DT && "DomTree is required");
 
   if (!L->getLoopPreheader()) {
@@ -852,7 +995,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
   // At this point, the code is well formed.  We now simplify the unrolled loop,
   // doing constant propagation and dead code elimination as we go.
   simplifyLoopAfterUnroll(L, !CompletelyUnroll && ULO.Count > 1, LI, SE, DT, AC,
-                          TTI);
+                          TTI, AA);
 
   NumCompletelyUnrolled += CompletelyUnroll;
   ++NumUnrolled;
@@ -938,7 +1081,7 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
     if (!S)
       continue;
 
-    if (Name.equals(S->getString()))
+    if (Name == S->getString())
       return MD;
   }
   return nullptr;
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 2d5b5f967ffb..e1af02829c1d 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -917,9 +917,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
     for (Instruction &I : *BB) {
       RemapInstruction(&I, VMap,
                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-      RemapDbgVariableRecordRange(M, I.getDbgRecordRange(), VMap,
-                                  RF_NoModuleLevelChanges |
-                                      RF_IgnoreMissingLocals);
+      RemapDbgRecordRange(M, I.getDbgRecordRange(), VMap,
+                          RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
     }
   }
 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index e3e09d11ba8c..cc883a7dc292 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -222,7 +222,7 @@ void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *StringMD,
       // If it is of form key = value, try to parse it.
       if (Node->getNumOperands() == 2) {
         MDString *S = dyn_cast<MDString>(Node->getOperand(0));
-        if (S && S->getString().equals(StringMD)) {
+        if (S && S->getString() == StringMD) {
           ConstantInt *IntMD =
               mdconst::extract_or_null<ConstantInt>(Node->getOperand(1));
           if (IntMD && IntMD->getSExtValue() == V)
@@ -1034,15 +1034,6 @@ CmpInst::Predicate llvm::getMinMaxReductionPredicate(RecurKind RK) {
   }
 }
 
-Value *llvm::createAnyOfOp(IRBuilderBase &Builder, Value *StartVal,
-                           RecurKind RK, Value *Left, Value *Right) {
-  if (auto VTy = dyn_cast<VectorType>(Left->getType()))
-    StartVal = Builder.CreateVectorSplat(VTy->getElementCount(), StartVal);
-  Value *Cmp =
-      Builder.CreateCmp(CmpInst::ICMP_NE, Left, StartVal, "rdx.select.cmp");
-  return Builder.CreateSelect(Cmp, Left, Right, "rdx.select");
-}
-
 Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
                             Value *Right) {
   Type *Ty = Left->getType();
@@ -1151,16 +1142,13 @@ Value *llvm::createAnyOfTargetReduction(IRBuilderBase &Builder, Value *Src,
     NewVal = SI->getTrueValue();
   }
 
-  // Create a splat vector with the new value and compare this to the vector
-  // we want to reduce.
-  ElementCount EC = cast<VectorType>(Src->getType())->getElementCount();
-  Value *Right = Builder.CreateVectorSplat(EC, InitVal);
-  Value *Cmp =
-      Builder.CreateCmp(CmpInst::ICMP_NE, Src, Right, "rdx.select.cmp");
-
   // If any predicate is true it means that we want to select the new value.
-  Cmp = Builder.CreateOrReduce(Cmp);
-  return Builder.CreateSelect(Cmp, NewVal, InitVal, "rdx.select");
+  Value *AnyOf =
+      Src->getType()->isVectorTy() ? Builder.CreateOrReduce(Src) : Src;
+  // The compares in the loop may yield poison, which propagates through the
+  // bitwise ORs. Freeze it here before the condition is used.
+  AnyOf = Builder.CreateFreeze(AnyOf);
+  return Builder.CreateSelect(AnyOf, NewVal, InitVal, "rdx.select");
 }
 
 Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src,
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index 0464ba5e1811..77b2c50b4413 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -180,6 +180,8 @@ void StackInfoBuilder::visit(Instruction &Inst) {
 
 bool StackInfoBuilder::isInterestingAlloca(const AllocaInst &AI) {
   return (AI.getAllocatedType()->isSized() &&
+          // FIXME: support vscale.
+          !AI.getAllocatedType()->isScalableTy() &&
           // FIXME: instrument dynamic allocas, too
           AI.isStaticAlloca() &&
           // alloca() may be called with 0 size, ignore it.
diff --git a/llvm/lib/Transforms/Utils/MisExpect.cpp b/llvm/lib/Transforms/Utils/MisExpect.cpp
index 9cb7c54e0c8c..59e13795f0f2 100644
--- a/llvm/lib/Transforms/Utils/MisExpect.cpp
+++ b/llvm/lib/Transforms/Utils/MisExpect.cpp
@@ -59,9 +59,10 @@ static cl::opt<bool> PGOWarnMisExpect(
     cl::desc("Use this option to turn on/off "
              "warnings about incorrect usage of llvm.expect intrinsics."));
 
+// Command line option for setting the diagnostic tolerance threshold
 static cl::opt<uint32_t> MisExpectTolerance(
     "misexpect-tolerance", cl::init(0),
-    cl::desc("Prevents emiting diagnostics when profile counts are "
+    cl::desc("Prevents emitting diagnostics when profile counts are "
              "within N% of the threshold.."));
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index c6029b428ed3..ce40e8b31b76 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -143,7 +143,7 @@ static bool refineInstruction(SCCPSolver &Solver,
         Changed = true;
       }
     }
-  } else if (isa<ZExtInst>(Inst) && !Inst.hasNonNeg()) {
+  } else if (isa<PossiblyNonNegInst>(Inst) && !Inst.hasNonNeg()) {
     auto Range = GetRange(Inst.getOperand(0));
     if (Range.isAllNonNegative()) {
       Inst.setNonNeg();
@@ -191,14 +191,16 @@ static bool replaceSignedInst(SCCPSolver &Solver,
 
   Instruction *NewInst = nullptr;
   switch (Inst.getOpcode()) {
-  // Note: We do not fold sitofp -> uitofp here because that could be more
-  // expensive in codegen and may not be reversible in the backend.
+  case Instruction::SIToFP:
   case Instruction::SExt: {
-    // If the source value is not negative, this is a zext.
+    // If the source value is not negative, this is a zext/uitofp.
     Value *Op0 = Inst.getOperand(0);
     if (InsertedValues.count(Op0) || !isNonNegative(Op0))
       return false;
-    NewInst = new ZExtInst(Op0, Inst.getType(), "", Inst.getIterator());
+    NewInst = CastInst::Create(Inst.getOpcode() == Instruction::SExt
+                                   ? Instruction::ZExt
+                                   : Instruction::UIToFP,
+                               Op0, Inst.getType(), "", Inst.getIterator());
     NewInst->setNonNeg();
     break;
   }
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index b891eea0f341..689b4e75f52e 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1126,9 +1126,8 @@ static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
 
     NewBonusInst->insertInto(PredBlock, PTI->getIterator());
     auto Range = NewBonusInst->cloneDebugInfoFrom(&BonusInst);
-    RemapDbgVariableRecordRange(NewBonusInst->getModule(), Range, VMap,
-                                RF_NoModuleLevelChanges |
-                                    RF_IgnoreMissingLocals);
+    RemapDbgRecordRange(NewBonusInst->getModule(), Range, VMap,
+                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
 
     if (isa<DbgInfoIntrinsic>(BonusInst))
       continue;
@@ -3862,8 +3861,8 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI,
     PredBlock->getTerminator()->cloneDebugInfoFrom(BB->getTerminator());
     for (DbgVariableRecord &DVR :
          filterDbgVars(PredBlock->getTerminator()->getDbgRecordRange())) {
-      RemapDbgVariableRecord(M, &DVR, VMap,
-                             RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+      RemapDbgRecord(M, &DVR, VMap,
+                     RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
     }
   }
 
@@ -6584,16 +6583,17 @@ static void reuseTableCompare(
   Constant *FalseConst = ConstantInt::getFalse(RangeCmp->getType());
 
   // Check if the compare with the default value is constant true or false.
-  Constant *DefaultConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
-                                                 DefaultValue, CmpOp1, true);
+  const DataLayout &DL = PhiBlock->getModule()->getDataLayout();
+  Constant *DefaultConst = ConstantFoldCompareInstOperands(
+      CmpInst->getPredicate(), DefaultValue, CmpOp1, DL);
   if (DefaultConst != TrueConst && DefaultConst != FalseConst)
     return;
 
   // Check if the compare with the case values is distinct from the default
   // compare result.
   for (auto ValuePair : Values) {
-    Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
-                                                ValuePair.second, CmpOp1, true);
+    Constant *CaseConst = ConstantFoldCompareInstOperands(
+        CmpInst->getPredicate(), ValuePair.second, CmpOp1, DL);
     if (!CaseConst || CaseConst == DefaultConst ||
         (CaseConst != TrueConst && CaseConst != FalseConst))
       return;
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 2e68a9c01898..174cc7a3c778 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -52,6 +52,10 @@ static cl::opt<bool>
 static cl::opt<bool>
     OptimizeHotColdNew("optimize-hot-cold-new", cl::Hidden, cl::init(false),
                        cl::desc("Enable hot/cold operator new library calls"));
+static cl::opt<bool> OptimizeExistingHotColdNew(
+    "optimize-existing-hot-cold-new", cl::Hidden, cl::init(false),
+    cl::desc(
+        "Enable optimization of existing hot/cold operator new library calls"));
 
 namespace {
 
@@ -81,6 +85,10 @@ struct HotColdHintParser : public cl::parser<unsigned> {
 static cl::opt<unsigned, false, HotColdHintParser> ColdNewHintValue(
     "cold-new-hint-value", cl::Hidden, cl::init(1),
     cl::desc("Value to pass to hot/cold operator new for cold allocation"));
+static cl::opt<unsigned, false, HotColdHintParser>
+    NotColdNewHintValue("notcold-new-hint-value", cl::Hidden, cl::init(128),
+                        cl::desc("Value to pass to hot/cold operator new for "
+                                 "notcold (warm) allocation"));
 static cl::opt<unsigned, false, HotColdHintParser> HotNewHintValue(
     "hot-new-hint-value", cl::Hidden, cl::init(254),
     cl::desc("Value to pass to hot/cold operator new for hot allocation"));
@@ -1722,45 +1730,122 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
   uint8_t HotCold;
   if (CI->getAttributes().getFnAttr("memprof").getValueAsString() == "cold")
     HotCold = ColdNewHintValue;
+  else if (CI->getAttributes().getFnAttr("memprof").getValueAsString() ==
+           "notcold")
+    HotCold = NotColdNewHintValue;
   else if (CI->getAttributes().getFnAttr("memprof").getValueAsString() == "hot")
     HotCold = HotNewHintValue;
   else
     return nullptr;
 
+  // For calls that already pass a hot/cold hint, only update the hint if
+  // directed by OptimizeExistingHotColdNew. For other calls to new, add a hint
+  // if cold or hot, and leave as-is for default handling if "notcold" aka warm.
+  // Note that in cases where we decide it is "notcold", it might be slightly
+  // better to replace the hinted call with a non hinted call, to avoid the
+  // extra paramter and the if condition check of the hint value in the
+  // allocator. This can be considered in the future.
   switch (Func) {
+  case LibFunc_Znwm12__hot_cold_t:
+    if (OptimizeExistingHotColdNew)
+      return emitHotColdNew(CI->getArgOperand(0), B, TLI,
+                            LibFunc_Znwm12__hot_cold_t, HotCold);
+    break;
   case LibFunc_Znwm:
-    return emitHotColdNew(CI->getArgOperand(0), B, TLI,
-                          LibFunc_Znwm12__hot_cold_t, HotCold);
+    if (HotCold != NotColdNewHintValue)
+      return emitHotColdNew(CI->getArgOperand(0), B, TLI,
+                            LibFunc_Znwm12__hot_cold_t, HotCold);
+    break;
+  case LibFunc_Znam12__hot_cold_t:
+    if (OptimizeExistingHotColdNew)
+      return emitHotColdNew(CI->getArgOperand(0), B, TLI,
+                            LibFunc_Znam12__hot_cold_t, HotCold);
+    break;
   case LibFunc_Znam:
-    return emitHotColdNew(CI->getArgOperand(0), B, TLI,
-                          LibFunc_Znam12__hot_cold_t, HotCold);
+    if (HotCold != NotColdNewHintValue)
+      return emitHotColdNew(CI->getArgOperand(0), B, TLI,
+                            LibFunc_Znam12__hot_cold_t, HotCold);
+    break;
+  case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t:
+    if (OptimizeExistingHotColdNew)
+      return emitHotColdNewNoThrow(
+          CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
+          LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t, HotCold);
+    break;
   case LibFunc_ZnwmRKSt9nothrow_t:
-    return emitHotColdNewNoThrow(CI->getArgOperand(0), CI->getArgOperand(1), B,
-                                 TLI, LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t,
-                                 HotCold);
+    if (HotCold != NotColdNewHintValue)
+      return emitHotColdNewNoThrow(
+          CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
+          LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t, HotCold);
+    break;
+  case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t:
+    if (OptimizeExistingHotColdNew)
+      return emitHotColdNewNoThrow(
+          CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
+          LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t, HotCold);
+    break;
   case LibFunc_ZnamRKSt9nothrow_t:
-    return emitHotColdNewNoThrow(CI->getArgOperand(0), CI->getArgOperand(1), B,
-                                 TLI, LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t,
-                                 HotCold);
+    if (HotCold != NotColdNewHintValue)
+      return emitHotColdNewNoThrow(
+          CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
+          LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t, HotCold);
+    break;
+  case LibFunc_ZnwmSt11align_val_t12__hot_cold_t:
+    if (OptimizeExistingHotColdNew)
+      return emitHotColdNewAligned(
+          CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
+          LibFunc_ZnwmSt11align_val_t12__hot_cold_t, HotCold);
+    break;
   case LibFunc_ZnwmSt11align_val_t:
-    return emitHotColdNewAligned(CI->getArgOperand(0), CI->getArgOperand(1), B,
-                                 TLI, LibFunc_ZnwmSt11align_val_t12__hot_cold_t,
-                                 HotCold);
+    if (HotCold != NotColdNewHintValue)
+      return emitHotColdNewAligned(
+          CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
+          LibFunc_ZnwmSt11align_val_t12__hot_cold_t, HotCold);
+    break;
+  case LibFunc_ZnamSt11align_val_t12__hot_cold_t:
+    if (OptimizeExistingHotColdNew)
+      return emitHotColdNewAligned(
+          CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
+          LibFunc_ZnamSt11align_val_t12__hot_cold_t, HotCold);
+    break;
   case LibFunc_ZnamSt11align_val_t:
-    return emitHotColdNewAligned(CI->getArgOperand(0), CI->getArgOperand(1), B,
-                                 TLI, LibFunc_ZnamSt11align_val_t12__hot_cold_t,
-                                 HotCold);
+    if (HotCold != NotColdNewHintValue)
+      return emitHotColdNewAligned(
+          CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
+          LibFunc_ZnamSt11align_val_t12__hot_cold_t, HotCold);
+    break;
+  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
+    if (OptimizeExistingHotColdNew)
+      return emitHotColdNewAlignedNoThrow(
+          CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B,
+          TLI, LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t,
+          HotCold);
+    break;
   case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t:
-    return emitHotColdNewAlignedNoThrow(
-        CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B,
-        TLI, LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t, HotCold);
+    if (HotCold != NotColdNewHintValue)
+      return emitHotColdNewAlignedNoThrow(
+          CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B,
+          TLI, LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t,
+          HotCold);
+    break;
+  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
+    if (OptimizeExistingHotColdNew)
+      return emitHotColdNewAlignedNoThrow(
+          CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B,
+          TLI, LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t,
+          HotCold);
+    break;
   case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t:
-    return emitHotColdNewAlignedNoThrow(
-        CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B,
-        TLI, LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t, HotCold);
+    if (HotCold != NotColdNewHintValue)
+      return emitHotColdNewAlignedNoThrow(
+          CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B,
+          TLI, LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t,
+          HotCold);
+    break;
   default:
     return nullptr;
   }
+  return nullptr;
 }
 
 //===----------------------------------------------------------------------===//
@@ -3675,6 +3760,14 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
     case LibFunc_ZnamRKSt9nothrow_t:
     case LibFunc_ZnamSt11align_val_t:
     case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t:
+    case LibFunc_Znwm12__hot_cold_t:
+    case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t:
+    case LibFunc_ZnwmSt11align_val_t12__hot_cold_t:
+    case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
+    case LibFunc_Znam12__hot_cold_t:
+    case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t:
+    case LibFunc_ZnamSt11align_val_t12__hot_cold_t:
+    case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
       return optimizeNew(CI, Builder, Func);
     default:
       break;
diff --git a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
index 8b4f34209e85..d52d52a9b7d3 100644
--- a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
+++ b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -308,11 +308,11 @@ bool RewriteMapParser::parseEntry(yaml::Stream &YS, yaml::KeyValueNode &Entry,
   }
 
   RewriteType = Key->getValue(KeyStorage);
-  if (RewriteType.equals("function"))
+  if (RewriteType == "function")
     return parseRewriteFunctionDescriptor(YS, Key, Value, DL);
-  else if (RewriteType.equals("global variable"))
+  else if (RewriteType == "global variable")
     return parseRewriteGlobalVariableDescriptor(YS, Key, Value, DL);
-  else if (RewriteType.equals("global alias"))
+  else if (RewriteType == "global alias")
     return parseRewriteGlobalAliasDescriptor(YS, Key, Value, DL);
 
   YS.printError(Entry.getKey(), "unknown rewrite type");
@@ -348,7 +348,7 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
     }
 
     KeyValue = Key->getValue(KeyStorage);
-    if (KeyValue.equals("source")) {
+    if (KeyValue == "source") {
       std::string Error;
 
       Source = std::string(Value->getValue(ValueStorage));
@@ -356,11 +356,11 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
         YS.printError(Field.getKey(), "invalid regex: " + Error);
         return false;
       }
-    } else if (KeyValue.equals("target")) {
+    } else if (KeyValue == "target") {
       Target = std::string(Value->getValue(ValueStorage));
-    } else if (KeyValue.equals("transform")) {
+    } else if (KeyValue == "transform") {
       Transform = std::string(Value->getValue(ValueStorage));
-    } else if (KeyValue.equals("naked")) {
+    } else if (KeyValue == "naked") {
       std::string Undecorated;
 
       Undecorated = std::string(Value->getValue(ValueStorage));
@@ -417,7 +417,7 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
     }
 
     KeyValue = Key->getValue(KeyStorage);
-    if (KeyValue.equals("source")) {
+    if (KeyValue == "source") {
       std::string Error;
 
       Source = std::string(Value->getValue(ValueStorage));
@@ -425,9 +425,9 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
         YS.printError(Field.getKey(), "invalid regex: " + Error);
         return false;
       }
-    } else if (KeyValue.equals("target")) {
+    } else if (KeyValue == "target") {
       Target = std::string(Value->getValue(ValueStorage));
-    } else if (KeyValue.equals("transform")) {
+    } else if (KeyValue == "transform") {
       Transform = std::string(Value->getValue(ValueStorage));
     } else {
       YS.printError(Field.getKey(), "unknown Key for Global Variable");
@@ -480,7 +480,7 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
     }
 
     KeyValue = Key->getValue(KeyStorage);
-    if (KeyValue.equals("source")) {
+    if (KeyValue == "source") {
       std::string Error;
 
       Source = std::string(Value->getValue(ValueStorage));
@@ -488,9 +488,9 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
         YS.printError(Field.getKey(), "invalid regex: " + Error);
         return false;
       }
-    } else if (KeyValue.equals("target")) {
+    } else if (KeyValue == "target") {
       Target = std::string(Value->getValue(ValueStorage));
-    } else if (KeyValue.equals("transform")) {
+    } else if (KeyValue == "transform") {
       Transform = std::string(Value->getValue(ValueStorage));
     } else {
       YS.printError(Field.getKey(), "unknown key for Global Alias");
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index 6ebdd85d37b4..1696e9c72673 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -538,17 +538,20 @@ Value *Mapper::mapValue(const Value *V) {
 }
 
 void Mapper::remapDbgRecord(DbgRecord &DR) {
+  // Remap DILocations.
+  auto *MappedDILoc = mapMetadata(DR.getDebugLoc());
+  DR.setDebugLoc(DebugLoc(cast<DILocation>(MappedDILoc)));
+
   if (DbgLabelRecord *DLR = dyn_cast<DbgLabelRecord>(&DR)) {
+    // Remap labels.
     DLR->setLabel(cast<DILabel>(mapMetadata(DLR->getLabel())));
     return;
   }
 
   DbgVariableRecord &V = cast<DbgVariableRecord>(DR);
-  // Remap variables and DILocations.
+  // Remap variables.
   auto *MappedVar = mapMetadata(V.getVariable());
-  auto *MappedDILoc = mapMetadata(V.getDebugLoc());
   V.setVariable(cast<DILocalVariable>(MappedVar));
-  V.setDebugLoc(DebugLoc(cast<DILocation>(MappedDILoc)));
 
   bool IgnoreMissingLocals = Flags & RF_IgnoreMissingLocals;
 
@@ -1233,14 +1236,14 @@ void ValueMapper::remapInstruction(Instruction &I) {
   FlushingMapper(pImpl)->remapInstruction(&I);
 }
 
-void ValueMapper::remapDbgVariableRecord(Module *M, DbgVariableRecord &V) {
-  FlushingMapper(pImpl)->remapDbgRecord(V);
+void ValueMapper::remapDbgRecord(Module *M, DbgRecord &DR) {
+  FlushingMapper(pImpl)->remapDbgRecord(DR);
 }
 
-void ValueMapper::remapDbgVariableRecordRange(
+void ValueMapper::remapDbgRecordRange(
     Module *M, iterator_range<DbgRecord::self_iterator> Range) {
-  for (DbgVariableRecord &DVR : filterDbgVars(Range)) {
-    remapDbgVariableRecord(M, DVR);
+  for (DbgRecord &DR : Range) {
+    remapDbgRecord(M, DR);
   }
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index d33743e74cbe..9de49d1bcfea 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1067,6 +1067,15 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   if (!LAI->canVectorizeMemory())
     return false;
 
+  if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
+    reportVectorizationFailure("We don't allow storing to uniform addresses",
+                               "write to a loop invariant address could not "
+                               "be vectorized",
+                               "CantVectorizeStoreToLoopInvariantAddress", ORE,
+                               TheLoop);
+    return false;
+  }
+
   // We can vectorize stores to invariant address when final reduction value is
   // guaranteed to be stored at the end of the loop. Also, if decision to
   // vectorize loop is made, runtime checks are added so as to make sure that
@@ -1102,13 +1111,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       }
     }
 
-    if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
+    if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
       // For each invariant address, check its last stored value is the result
       // of one of our reductions.
       //
-      // We do not check if dependence with loads exists because they are
-      // currently rejected earlier in LoopAccessInfo::analyzeLoop. In case this
-      // behaviour changes we have to modify this code.
+      // We do not check if dependence with loads exists because that is already
+      // checked via hasLoadStoreDependenceInvolvingLoopInvariantAddress.
       ScalarEvolution *SE = PSE.getSE();
       SmallVector<StoreInst *, 4> UnhandledStores;
       for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index ebca2d855a46..ece2a34f180c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -68,9 +68,7 @@ class VPBuilder {
 public:
   VPBuilder() = default;
   VPBuilder(VPBasicBlock *InsertBB) { setInsertPoint(InsertBB); }
-  VPBuilder(VPRecipeBase *InsertPt) {
-    setInsertPoint(InsertPt->getParent(), InsertPt->getIterator());
-  }
+  VPBuilder(VPRecipeBase *InsertPt) { setInsertPoint(InsertPt); }
 
   /// Clear the insertion point: created instructions will not be inserted into
   /// a block.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 59012170a386..988f45c3f08d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1571,13 +1571,7 @@ public:
   /// Returns true if VP intrinsics with explicit vector length support should
   /// be generated in the tail folded loop.
   bool foldTailWithEVL() const {
-    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL &&
-           // FIXME: remove this once vp_reverse is supported.
-           none_of(
-               WideningDecisions,
-               [](const std::pair<std::pair<Instruction *, ElementCount>,
-                                  std::pair<InstWidening, InstructionCost>>
-                      &Data) { return Data.second.first == CM_Widen_Reverse; });
+    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
   }
 
   /// Returns true if the Phi is part of an inloop reduction.
@@ -3042,9 +3036,8 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
   }
 
   // Create phi nodes to merge from the  backedge-taken check block.
-  PHINode *BCResumeVal =
-      PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
-                      LoopScalarPreHeader->getTerminator()->getIterator());
+  PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
+                                         LoopScalarPreHeader->getFirstNonPHI());
   // Copy original phi DL over to the new one.
   BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
 
@@ -7450,7 +7443,8 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
 static void createAndCollectMergePhiForReduction(
     VPInstruction *RedResult,
     DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
-    VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
+    VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
+    bool VectorizingEpilogue) {
   if (!RedResult ||
       RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
     return;
@@ -7458,11 +7452,20 @@ static void createAndCollectMergePhiForReduction(
   auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
 
-  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
   Value *FinalValue =
       State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
   auto *ResumePhi =
       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+  if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
+                                 RdxDesc.getRecurrenceKind())) {
+    auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
+    assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
+    assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
+    ResumePhi = cast<PHINode>(Cmp->getOperand(0));
+  }
+  assert((!VectorizingEpilogue || ResumePhi) &&
+         "when vectorizing the epilogue loop, we need a resume phi from main "
+         "vector loop");
 
   // TODO: bc.merge.rdx should not be created here, instead it should be
   // modeled in VPlan.
@@ -7483,7 +7486,7 @@ static void createAndCollectMergePhiForReduction(
       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
                               Incoming);
     else
-      BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
+      BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
   }
 
   auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
@@ -7588,9 +7591,9 @@ LoopVectorizationPlanner::executePlan(
   auto *ExitVPBB =
       cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
   for (VPRecipeBase &R : *ExitVPBB) {
-    createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
-                                         ReductionResumeValues, State, OrigLoop,
-                                         State.CFG.VPBB2IRBB[ExitVPBB]);
+    createAndCollectMergePhiForReduction(
+        dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
+        State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
   }
 
   // 2.6. Maintain Loop Hints
@@ -7776,11 +7779,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
 
   // Now, compare the remaining count and if there aren't enough iterations to
   // execute the vectorized epilogue skip to the scalar part.
-  BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
-  VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
-  LoopVectorPreHeader =
-      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
-                 LI, nullptr, "vec.epilog.ph");
+  LoopVectorPreHeader->setName("vec.epilog.ph");
+  BasicBlock *VecEpilogueIterationCountCheck =
+      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
+                 nullptr, "vec.epilog.iter.check", true);
   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
                                           VecEpilogueIterationCountCheck);
 
@@ -8839,8 +8841,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       VPValue *StrideVPV = Plan->getLiveIn(U);
       if (!StrideVPV)
         continue;
-      VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(
-          U->getType(), ScevStride->getAPInt().getSExtValue()));
+      unsigned BW = U->getType()->getScalarSizeInBits();
+      APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
+                                 : ScevStride->getAPInt().zext(BW);
+      VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
       StrideVPV->replaceAllUsesWith(CI);
     }
   }
@@ -8915,6 +8919,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
 // A ComputeReductionResult recipe is added to the middle block, also for
 // in-loop reductions which compute their result in-loop, because generating
 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
+//
+// Adjust AnyOf reductions; replace the reduction phi for the selected value
+// with a boolean reduction phi node to check if the condition is true in any
+// iteration. The final value is selected by the final ComputeReductionResult.
 void LoopVectorizationPlanner::adjustRecipesForReductions(
     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
     ElementCount MinVF) {
@@ -9089,6 +9097,41 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       continue;
 
     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+    // Adjust AnyOf reductions; replace the reduction phi for the selected value
+    // with a boolean reduction phi node to check if the condition is true in
+    // any iteration. The final value is selected by the final
+    // ComputeReductionResult.
+    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
+            RdxDesc.getRecurrenceKind())) {
+      auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
+        return isa<VPWidenSelectRecipe>(U) ||
+               (isa<VPReplicateRecipe>(U) &&
+                cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
+                    Instruction::Select);
+      }));
+      VPValue *Cmp = Select->getOperand(0);
+      // If the compare is checking the reduction PHI node, adjust it to check
+      // the start value.
+      if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
+        for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
+          if (CmpR->getOperand(I) == PhiR)
+            CmpR->setOperand(I, PhiR->getStartValue());
+      }
+      VPBuilder::InsertPointGuard Guard(Builder);
+      Builder.setInsertPoint(Select);
+
+      // If the true value of the select is the reduction phi, the new value is
+      // selected if the negated condition is true in any iteration.
+      if (Select->getOperand(1) == PhiR)
+        Cmp = Builder.createNot(Cmp);
+      VPValue *Or = Builder.createOr(PhiR, Cmp);
+      Select->getVPSingleValue()->replaceAllUsesWith(Or);
+
+      // Convert the reduction phi to operate on bools.
+      PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
+                              OrigLoop->getHeader()->getContext())));
+    }
+
     // If tail is folded by masking, introduce selects between the phi
     // and the live-out instruction of each reduction, at the beginning of the
     // dedicated latch block.
@@ -9121,7 +9164,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     // then extend the loop exit value to enable InstCombine to evaluate the
     // entire expression in the smaller type.
     Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
-    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
+    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
+        !RecurrenceDescriptor::isAnyOfRecurrenceKind(
+            RdxDesc.getRecurrenceKind())) {
       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
       Type *RdxTy = RdxDesc.getRecurrenceType();
       auto *Trunc =
@@ -9384,12 +9429,20 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
   }
 }
 
+/// Use all-true mask for reverse rather than actual mask, as it avoids a
+/// dependence w/o affecting the result.
+static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
+                                     Value *EVL, const Twine &Name) {
+  VectorType *ValTy = cast<VectorType>(Operand->getType());
+  Value *AllTrueMask =
+      Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
+  return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
+                                 {Operand, AllTrueMask, EVL}, nullptr, Name);
+}
+
 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
                           "explicit vector length.");
-  // FIXME: Support reverse loading after vp_reverse is added.
-  assert(!isReverse() && "Reverse loads are not implemented yet.");
-
   auto *LI = cast<LoadInst>(&Ingredient);
 
   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
@@ -9402,9 +9455,15 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   CallInst *NewLI;
   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
   Value *Addr = State.get(getAddr(), 0, !CreateGather);
-  Value *Mask = getMask()
-                    ? State.get(getMask(), 0)
-                    : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  Value *Mask = nullptr;
+  if (VPValue *VPMask = getMask()) {
+    Mask = State.get(VPMask, 0);
+    if (isReverse())
+      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
+  } else {
+    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  }
+
   if (CreateGather) {
     NewLI =
         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
@@ -9418,7 +9477,10 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   NewLI->addParamAttr(
       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
   State.addMetadata(NewLI, LI);
-  State.set(this, NewLI, 0);
+  Instruction *Res = NewLI;
+  if (isReverse())
+    Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
+  State.set(this, Res, 0);
 }
 
 void VPWidenStoreRecipe::execute(VPTransformState &State) {
@@ -9464,9 +9526,6 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
                           "explicit vector length.");
-  // FIXME: Support reverse loading after vp_reverse is added.
-  assert(!isReverse() && "Reverse store are not implemented yet.");
-
   auto *SI = cast<StoreInst>(&Ingredient);
 
   VPValue *StoredValue = getStoredValue();
@@ -9479,10 +9538,16 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   CallInst *NewSI = nullptr;
   Value *StoredVal = State.get(StoredValue, 0);
   Value *EVL = State.get(getEVL(), VPIteration(0, 0));
-  // FIXME: Support reverse store after vp_reverse is added.
-  Value *Mask = getMask()
-                    ? State.get(getMask(), 0)
-                    : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  if (isReverse())
+    StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
+  Value *Mask = nullptr;
+  if (VPValue *VPMask = getMask()) {
+    Mask = State.get(VPMask, 0);
+    if (isReverse())
+      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
+  } else {
+    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  }
   Value *Addr = State.get(getAddr(), 0, !CreateScatter);
   if (CreateScatter) {
     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
@@ -10142,9 +10207,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
           Value *ResumeV = nullptr;
           // TODO: Move setting of resume values to prepareToExecute.
           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
-            ResumeV = ReductionResumeValues
-                          .find(&ReductionPhi->getRecurrenceDescriptor())
-                          ->second;
+            const RecurrenceDescriptor &RdxDesc =
+                ReductionPhi->getRecurrenceDescriptor();
+            RecurKind RK = RdxDesc.getRecurrenceKind();
+            ResumeV = ReductionResumeValues.find(&RdxDesc)->second;
+            if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
+              // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
+              // start value; compare the final value from the main vector loop
+              // to the start value.
+              IRBuilder<> Builder(
+                  cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
+              ResumeV = Builder.CreateICmpNE(ResumeV,
+                                             RdxDesc.getRecurrenceStartValue());
+            }
           } else {
             // Create induction resume values for both widened pointer and
             // integer/fp inductions and update the start value of the induction
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index bc553c5009ed..2e0a39c4b4fd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11419,8 +11419,16 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
     if (Scalar->getType() != Ty) {
       assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
              "Expected integer types only.");
+      Value *V = Scalar;
+      if (auto *CI = dyn_cast<CastInst>(Scalar);
+          isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
+        Value *Op = CI->getOperand(0);
+        if (auto *IOp = dyn_cast<Instruction>(Op);
+            !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
+          V = Op;
+      }
       Scalar = Builder.CreateIntCast(
-          Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
+          V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
     }
 
     Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
@@ -11640,13 +11648,14 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
 
   /// Cast value \p V to the vector type with the same number of elements, but
   /// the base type \p ScalarTy.
-  Value *castToScalarTyElem(Value *V) {
+  Value *castToScalarTyElem(Value *V,
+                            std::optional<bool> IsSigned = std::nullopt) {
     auto *VecTy = cast<VectorType>(V->getType());
     if (VecTy->getElementType() == ScalarTy)
       return V;
     return Builder.CreateIntCast(
         V, VectorType::get(ScalarTy, VecTy->getElementCount()),
-        !isKnownNonNegative(V, SimplifyQuery(*R.DL)));
+        IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
   }
 
 public:
@@ -11795,12 +11804,30 @@ public:
   /// Adds 2 input vectors (in form of tree entries) and the mask for their
   /// shuffling.
   void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
-    add(E1.VectorizedValue, E2.VectorizedValue, Mask);
+    Value *V1 = E1.VectorizedValue;
+    if (V1->getType()->isIntOrIntVectorTy())
+      V1 = castToScalarTyElem(V1, all_of(E1.Scalars, [&](Value *V) {
+                                return !isKnownNonNegative(
+                                    V, SimplifyQuery(*R.DL));
+                              }));
+    Value *V2 = E2.VectorizedValue;
+    if (V2->getType()->isIntOrIntVectorTy())
+      V2 = castToScalarTyElem(V2, all_of(E2.Scalars, [&](Value *V) {
+                                return !isKnownNonNegative(
+                                    V, SimplifyQuery(*R.DL));
+                              }));
+    add(V1, V2, Mask);
   }
   /// Adds single input vector (in form of tree entry) and the mask for its
   /// shuffling.
   void add(const TreeEntry &E1, ArrayRef<int> Mask) {
-    add(E1.VectorizedValue, Mask);
+    Value *V1 = E1.VectorizedValue;
+    if (V1->getType()->isIntOrIntVectorTy())
+      V1 = castToScalarTyElem(V1, all_of(E1.Scalars, [&](Value *V) {
+                                return !isKnownNonNegative(
+                                    V, SimplifyQuery(*R.DL));
+                              }));
+    add(V1, Mask);
   }
   /// Adds 2 input vectors and the mask for their shuffling.
   void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
@@ -13150,6 +13177,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       } else {
         assert(E->State == TreeEntry::StridedVectorize &&
                "Expected either strided or conseutive stores.");
+        if (!E->ReorderIndices.empty()) {
+          SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
+          Ptr = SI->getPointerOperand();
+        }
         Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
         Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
         auto *Inst = Builder.CreateIntrinsic(
@@ -13747,7 +13778,10 @@ Value *BoUpSLP::vectorizeTree(
             auto VecIt = VectorCasts.find(Key);
             if (VecIt == VectorCasts.end()) {
               IRBuilderBase::InsertPointGuard Guard(Builder);
-              if (auto *IVec = dyn_cast<Instruction>(Vec))
+              if (auto *IVec = dyn_cast<PHINode>(Vec))
+                Builder.SetInsertPoint(
+                    IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
+              else if (auto *IVec = dyn_cast<Instruction>(Vec))
                 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
               Vec = Builder.CreateIntCast(
                   Vec,
@@ -15141,14 +15175,18 @@ bool BoUpSLP::collectValuesToDemote(
                "Expected min/max intrinsics only.");
         unsigned SignBits = OrigBitWidth - BitWidth;
         APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
-        return SignBits <= ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
-                                              nullptr, DT) &&
-               (!isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL)) ||
+        unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
+                                              nullptr, DT);
+        unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
+                                              nullptr, DT);
+        return SignBits <= Op0SignBits &&
+               ((SignBits != Op0SignBits &&
+                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                 MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL))) &&
-               SignBits <= ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
-                                              nullptr, DT) &&
-               (!isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL)) ||
+               SignBits <= Op1SignBits &&
+               ((SignBits != Op1SignBits &&
+                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
       });
     };
@@ -15457,8 +15495,7 @@ void BoUpSLP::computeMinimumValueSizes() {
       TreeEntry *TE = VectorizableTree[Idx].get();
       if (MinBWs.contains(TE))
         continue;
-      bool IsSigned = TE->getOpcode() == Instruction::SExt ||
-                      any_of(TE->Scalars, [&](Value *R) {
+      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
                         return !isKnownNonNegative(R, SimplifyQuery(*DL));
                       });
       MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 71594be2b965..0784665efd14 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -841,6 +841,7 @@ public:
   static inline bool classof(const VPRecipeBase *R) {
     switch (R->getVPDefID()) {
     case VPRecipeBase::VPDerivedIVSC:
+    case VPRecipeBase::VPEVLBasedIVPHISC:
     case VPRecipeBase::VPExpandSCEVSC:
     case VPRecipeBase::VPInstructionSC:
     case VPRecipeBase::VPReductionSC:
@@ -1757,7 +1758,8 @@ public:
   const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
 
   /// Returns true if the induction is canonical, i.e. starting at 0 and
-  /// incremented by UF * VF (= the original IV is incremented by 1).
+  /// incremented by UF * VF (= the original IV is incremented by 1) and has the
+  /// same type as the canonical induction.
   bool isCanonical() const;
 
   /// Returns the scalar type of the induction.
@@ -2413,8 +2415,8 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
 struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   VPWidenLoadEVLRecipe(VPWidenLoadRecipe *L, VPValue *EVL, VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L->getIngredient(),
-                            {L->getAddr(), EVL}, L->isConsecutive(), false,
-                            L->getDebugLoc()),
+                            {L->getAddr(), EVL}, L->isConsecutive(),
+                            L->isReverse(), L->getDebugLoc()),
         VPValue(this, &getIngredient()) {
     setMask(Mask);
   }
@@ -2490,7 +2492,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
   VPWidenStoreEVLRecipe(VPWidenStoreRecipe *S, VPValue *EVL, VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S->getIngredient(),
                             {S->getAddr(), S->getStoredValue(), EVL},
-                            S->isConsecutive(), false, S->getDebugLoc()) {
+                            S->isConsecutive(), S->isReverse(),
+                            S->getDebugLoc()) {
     setMask(Mask);
   }
 
@@ -2708,12 +2711,6 @@ public:
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
 #endif
-
-  /// Returns the scalar type of the induction.
-  const Type *getScalarType() const {
-    return cast<VPCanonicalIVPHIRecipe>(getOperand(0)->getDefiningRecipe())
-        ->getScalarType();
-  }
 };
 
 /// A recipe for converting the input value \p IV value to the corresponding
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 29ed001ccd2c..140516e08e79 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -513,6 +513,8 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     // Reduce all of the unrolled parts into a single vector.
     Value *ReducedPartRdx = RdxParts[0];
     unsigned Op = RecurrenceDescriptor::getOpcode(RK);
+    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
+      Op = Instruction::Or;
 
     if (PhiR->isOrdered()) {
       ReducedPartRdx = RdxParts[State.UF - 1];
@@ -525,19 +527,16 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
           ReducedPartRdx = Builder.CreateBinOp(
               (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
-        else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
-          TrackingVH<Value> ReductionStartValue =
-              RdxDesc.getRecurrenceStartValue();
-          ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
-                                         ReducedPartRdx, RdxPart);
-        } else
+        else
           ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
       }
     }
 
     // Create the reduction after the loop. Note that inloop reductions create
     // the target reduction in the loop using a Reduction recipe.
-    if (State.VF.isVector() && !PhiR->isInLoop()) {
+    if ((State.VF.isVector() ||
+         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
+        !PhiR->isInLoop()) {
       ReducedPartRdx =
           createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
       // If the reduction can be performed in a smaller type, we need to extend
@@ -1222,7 +1221,9 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
     return false;
   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
-  return StartC && StartC->isZero() && StepC && StepC->isOne();
+  auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
+  return StartC && StartC->isZero() && StepC && StepC->isOne() &&
+         getScalarType() == CanIV->getScalarType();
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 017b00c042f4..c0eb6d710ad3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -452,8 +452,7 @@ static void removeRedundantCanonicalIVs(VPlan &Plan) {
   for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
     auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
 
-    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical() ||
-        WidenOriginalIV->getScalarType() != WidenNewIV->getScalarType())
+    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
       continue;
 
     // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
@@ -1341,8 +1340,6 @@ void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
       auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U);
       if (!MemR)
         continue;
-      assert(!MemR->isReverse() &&
-             "Reversed memory operations not supported yet.");
       VPValue *OrigMask = MemR->getMask();
       assert(OrigMask && "Unmasked widen memory recipe when folding tail");
       VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask;
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index da3c780550a0..9d43fb4ab607 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -114,6 +114,7 @@ private:
   bool foldShuffleOfBinops(Instruction &I);
   bool foldShuffleOfCastops(Instruction &I);
   bool foldShuffleOfShuffles(Instruction &I);
+  bool foldShuffleToIdentity(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
   bool foldTruncFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
@@ -1667,6 +1668,181 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
   return true;
 }
 
+// Starting from a shuffle, look up through operands tracking the shuffled index
+// of each lane. If we can simplify away the shuffles to identities then
+// do so.
+bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
+  auto *Ty = dyn_cast<FixedVectorType>(I.getType());
+  if (!Ty || !isa<Instruction>(I.getOperand(0)) ||
+      !isa<Instruction>(I.getOperand(1)))
+    return false;
+
+  using InstLane = std::pair<Value *, int>;
+
+  auto LookThroughShuffles = [](Value *V, int Lane) -> InstLane {
+    while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
+      unsigned NumElts =
+          cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
+      int M = SV->getMaskValue(Lane);
+      if (M < 0)
+        return {nullptr, PoisonMaskElem};
+      else if (M < (int)NumElts) {
+        V = SV->getOperand(0);
+        Lane = M;
+      } else {
+        V = SV->getOperand(1);
+        Lane = M - NumElts;
+      }
+    }
+    return InstLane{V, Lane};
+  };
+
+  auto GenerateInstLaneVectorFromOperand =
+      [&LookThroughShuffles](ArrayRef<InstLane> Item, int Op) {
+        SmallVector<InstLane> NItem;
+        for (InstLane V : Item) {
+          NItem.emplace_back(
+              !V.first
+                  ? InstLane{nullptr, PoisonMaskElem}
+                  : LookThroughShuffles(
+                        cast<Instruction>(V.first)->getOperand(Op), V.second));
+        }
+        return NItem;
+      };
+
+  SmallVector<InstLane> Start(Ty->getNumElements());
+  for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
+    Start[M] = LookThroughShuffles(&I, M);
+
+  SmallVector<SmallVector<InstLane>> Worklist;
+  Worklist.push_back(Start);
+  SmallPtrSet<Value *, 4> IdentityLeafs, SplatLeafs;
+  unsigned NumVisited = 0;
+
+  while (!Worklist.empty()) {
+    SmallVector<InstLane> Item = Worklist.pop_back_val();
+    if (++NumVisited > MaxInstrsToScan)
+      return false;
+
+    // If we found an undef first lane then bail out to keep things simple.
+    if (!Item[0].first)
+      return false;
+
+    // Look for an identity value.
+    if (Item[0].second == 0 &&
+        cast<FixedVectorType>(Item[0].first->getType())->getNumElements() ==
+            Ty->getNumElements() &&
+        all_of(drop_begin(enumerate(Item)), [&](const auto &E) {
+          return !E.value().first || (E.value().first == Item[0].first &&
+                                      E.value().second == (int)E.index());
+        })) {
+      IdentityLeafs.insert(Item[0].first);
+      continue;
+    }
+    // Look for a splat value.
+    if (all_of(drop_begin(Item), [&](InstLane &IL) {
+          return !IL.first ||
+                 (IL.first == Item[0].first && IL.second == Item[0].second);
+        })) {
+      SplatLeafs.insert(Item[0].first);
+      continue;
+    }
+
+    // We need each element to be the same type of value, and check that each
+    // element has a single use.
+    if (!all_of(drop_begin(Item), [&](InstLane IL) {
+          if (!IL.first)
+            return true;
+          if (auto *I = dyn_cast<Instruction>(IL.first); I && !I->hasOneUse())
+            return false;
+          if (IL.first->getValueID() != Item[0].first->getValueID())
+            return false;
+          if (isa<CallInst>(IL.first) && !isa<IntrinsicInst>(IL.first))
+            return false;
+          auto *II = dyn_cast<IntrinsicInst>(IL.first);
+          return !II ||
+                 (isa<IntrinsicInst>(Item[0].first) &&
+                  II->getIntrinsicID() ==
+                      cast<IntrinsicInst>(Item[0].first)->getIntrinsicID());
+        }))
+      return false;
+
+    // Check the operator is one that we support. We exclude div/rem in case
+    // they hit UB from poison lanes.
+    if (isa<BinaryOperator>(Item[0].first) &&
+        !cast<BinaryOperator>(Item[0].first)->isIntDivRem()) {
+      Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, 0));
+      Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, 1));
+    } else if (isa<UnaryOperator>(Item[0].first)) {
+      Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, 0));
+    } else if (auto *II = dyn_cast<IntrinsicInst>(Item[0].first);
+               II && isTriviallyVectorizable(II->getIntrinsicID())) {
+      for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
+        if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) {
+          if (!all_of(drop_begin(Item), [&](InstLane &IL) {
+                return !IL.first ||
+                       (cast<Instruction>(IL.first)->getOperand(Op) ==
+                        cast<Instruction>(Item[0].first)->getOperand(Op));
+              }))
+            return false;
+          continue;
+        }
+        Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, Op));
+      }
+    } else {
+      return false;
+    }
+  }
+
+  // If we got this far, we know the shuffles are superfluous and can be
+  // removed. Scan through again and generate the new tree of instructions.
+  std::function<Value *(ArrayRef<InstLane>)> Generate =
+      [&](ArrayRef<InstLane> Item) -> Value * {
+    if (IdentityLeafs.contains(Item[0].first) &&
+        all_of(drop_begin(enumerate(Item)), [&](const auto &E) {
+          return !E.value().first || (E.value().first == Item[0].first &&
+                                      E.value().second == (int)E.index());
+        })) {
+      return Item[0].first;
+    }
+    if (SplatLeafs.contains(Item[0].first)) {
+      if (auto ILI = dyn_cast<Instruction>(Item[0].first))
+        Builder.SetInsertPoint(*ILI->getInsertionPointAfterDef());
+      else if (isa<Argument>(Item[0].first))
+        Builder.SetInsertPointPastAllocas(I.getParent()->getParent());
+      SmallVector<int, 16> Mask(Ty->getNumElements(), Item[0].second);
+      return Builder.CreateShuffleVector(Item[0].first, Mask);
+    }
+
+    auto *I = cast<Instruction>(Item[0].first);
+    auto *II = dyn_cast<IntrinsicInst>(I);
+    unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
+    SmallVector<Value *> Ops(NumOps);
+    for (unsigned Idx = 0; Idx < NumOps; Idx++) {
+      if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx)) {
+        Ops[Idx] = II->getOperand(Idx);
+        continue;
+      }
+      Ops[Idx] = Generate(GenerateInstLaneVectorFromOperand(Item, Idx));
+    }
+    Builder.SetInsertPoint(I);
+    Type *DstTy = FixedVectorType::get(I->getType()->getScalarType(),
+                                       Ty->getNumElements());
+    if (auto BI = dyn_cast<BinaryOperator>(I))
+      return Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
+                                 Ops[0], Ops[1]);
+    if (II)
+      return Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
+    assert(isa<UnaryInstruction>(I) &&
+           "Unexpected instruction type in Generate");
+    return Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
+  };
+
+  Value *V = Generate(Start);
+  replaceValue(I, *V);
+  return true;
+}
+
 /// Given a commutative reduction, the order of the input lanes does not alter
 /// the results. We can use this to remove certain shuffles feeding the
 /// reduction, removing the need to shuffle at all.
@@ -1812,17 +1988,17 @@ bool VectorCombine::foldTruncFromReductions(Instruction &I) {
   if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(TruncSrc)))))
     return false;
 
-  auto *Trunc = cast<CastInst>(ReductionSrc);
   auto *TruncSrcTy = cast<VectorType>(TruncSrc->getType());
   auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
   Type *ResultTy = I.getType();
 
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  InstructionCost OldCost =
-      TTI.getCastInstrCost(Instruction::Trunc, ReductionSrcTy, TruncSrcTy,
-                           TTI::CastContextHint::None, CostKind, Trunc) +
-      TTI.getArithmeticReductionCost(ReductionOpc, ReductionSrcTy, std::nullopt,
-                                     CostKind);
+  InstructionCost OldCost = TTI.getArithmeticReductionCost(
+      ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
+  if (auto *Trunc = dyn_cast<CastInst>(ReductionSrc))
+    OldCost +=
+        TTI.getCastInstrCost(Instruction::Trunc, ReductionSrcTy, TruncSrcTy,
+                             TTI::CastContextHint::None, CostKind, Trunc);
   InstructionCost NewCost =
       TTI.getArithmeticReductionCost(ReductionOpc, TruncSrcTy, std::nullopt,
                                      CostKind) +
@@ -2224,6 +2400,7 @@ bool VectorCombine::run() {
         MadeChange |= foldShuffleOfCastops(I);
         MadeChange |= foldShuffleOfShuffles(I);
         MadeChange |= foldSelectShuffle(I);
+        MadeChange |= foldShuffleToIdentity(I);
         break;
       case Instruction::BitCast:
         MadeChange |= foldBitcastShuffle(I);
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index 8159d7f8a0a1..8a3ec1e3300d 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -431,8 +431,10 @@ if(runtimes)
       set(LIBOMP_MODULES_INSTALL_PATH "${CMAKE_INSTALL_INCLUDEDIR}/flang")
       # TODO: This is a workaround until flang becomes a first-class project
       # in llvm/CMakeList.txt.  Until then, this line ensures that flang-new is
-      # built before "openmp" is built as a runtime project.
-      list(APPEND extra_deps "flang-new")
+      # built before "openmp" is built as a runtime project.  Besides "flang-new"
+      # to build the compiler, we also need to add "module_files" to make sure
+      # that all .mod files are also properly build.
+      list(APPEND extra_deps "flang-new" "module_files")
     endif()
     foreach(dep opt llvm-link llvm-extract clang clang-offload-packager)
       if(TARGET ${dep})
@@ -559,6 +561,13 @@ if(runtimes)
     # We need to add the runtimes as a dependency because compiler-rt can be
     # built as part of runtimes and we need the profile runtime for PGO
     add_dependencies(clang-bootstrap-deps runtimes)
+    # The bootstrap build will attempt to configure the offload runtime
+    # before the openmp project which will error out due to failing to
+    # find libomp.so. We must add omp as a dependency before runtimes
+    # are configured.
+    if("openmp" IN_LIST LLVM_ENABLE_PROJECTS AND "offload" IN_LIST LLVM_ENABLE_RUNTIMES)
+      add_dependencies(clang-bootstrap-deps omp)
+    endif()
   endif()
 
   if(LLVM_INCLUDE_TESTS)
diff --git a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
new file mode 100644
index 000000000000..01dc086d9385
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s
+
+define void @foo_no_vscale_range() {
+; CHECK-LABEL: 'foo_no_vscale_range'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+  %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+  %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+  %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv161(<vscale x 16 x i1> undef, i1 true)
+  %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+  %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+  %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+  %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+  %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv161(<vscale x 16 x i1> undef, i1 true)
+  %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+
+  %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
+  %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
+  %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
+  %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v161(<16 x i1> undef, i1 true)
+  %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
+  %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
+  %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
+  %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
+  %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v161(<16 x i1> undef, i1 true)
+  %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
+
+  %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+  %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+  %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+  %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv161(<vscale x 16 x i1> undef, i1 false)
+  %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+  %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+  %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+  %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+  %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv161(<vscale x 16 x i1> undef, i1 false)
+  %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+
+  %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
+  %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
+  %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
+  %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v161(<16 x i1> undef, i1 false)
+  %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
+  %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
+  %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
+  %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
+  %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v161(<16 x i1> undef, i1 false)
+  %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
+
+  ret void
+}
+
+
+define void @foo_vscale_range_1_16() vscale_range(1,16) {
+; CHECK-LABEL: 'foo_vscale_range_1_16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+  %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+  %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+  %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv161(<vscale x 16 x i1> undef, i1 true)
+  %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+  %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+  %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+  %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+  %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv161(<vscale x 16 x i1> undef, i1 true)
+  %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+
+  %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+  %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+  %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+  %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv161(<vscale x 16 x i1> undef, i1 false)
+  %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+  %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+  %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+  %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+  %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv161(<vscale x 16 x i1> undef, i1 false)
+  %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+
+  ret void
+}
+
+define void @foo_vscale_range_1_16384() vscale_range(1,16384) {
+; CHECK-LABEL: 'foo_vscale_range_1_16384'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+  %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+  %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+  %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv161(<vscale x 16 x i1> undef, i1 true)
+  %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+  %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+  %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+  %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+  %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv161(<vscale x 16 x i1> undef, i1 true)
+  %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+
+  %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+  %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+  %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+  %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv161(<vscale x 16 x i1> undef, i1 false)
+  %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+  %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+  %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+  %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+  %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv161(<vscale x 16 x i1> undef, i1 false)
+  %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+
+  ret void
+}
+
+declare i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1>, i1)
+
+declare i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1>, i1)
diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
index f5ca6a22b60a..af41ed92319c 100644
--- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
@@ -5,24 +5,24 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @fixed() {
 ; CHECK-LABEL: 'fixed'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 entry:
diff --git a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
index 521a0900c844..6f9f64a26851 100644
--- a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
@@ -212,11 +212,11 @@ define <4 x i8> @gather_load_4xi8_constant_mask(<4 x ptr> %ptrs) {
 define <4 x i8> @gather_load_4xi8_variable_mask(<4 x ptr> %ptrs, <4 x i1> %cond) {
 ; CHECK:         gather_load_4xi8_variable_mask
 ; CHECK-NEON-LABEL: 'gather_load_4xi8_variable_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv
 ;
 ; CHECK-SVE-128-LABEL: 'gather_load_4xi8_variable_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv
 ;
 ; CHECK-SVE-256-LABEL: 'gather_load_4xi8_variable_mask'
@@ -257,11 +257,11 @@ define void @scatter_store_4xi8_constant_mask(<4 x i8> %val, <4 x ptr> %ptrs) {
 define void @scatter_store_4xi8_variable_mask(<4 x i8> %val, <4 x ptr> %ptrs, <4 x i1> %cond) {
 ; CHECK:         scatter_store_4xi8_variable_mask
 ; CHECK-NEON-LABEL: 'scatter_store_4xi8_variable_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'scatter_store_4xi8_variable_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'scatter_store_4xi8_variable_mask'
@@ -302,11 +302,11 @@ define <4 x i32> @gather_load_4xi32_constant_mask(<4 x ptr> %ptrs) {
 define <4 x i32> @gather_load_4xi32_variable_mask(<4 x ptr> %ptrs, <4 x i1> %cond) {
 ; CHECK:         gather_load_4xi32_variable_mask
 ; CHECK-NEON-LABEL: 'gather_load_4xi32_variable_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv
 ;
 ; CHECK-SVE-128-LABEL: 'gather_load_4xi32_variable_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv
 ;
 ; CHECK-SVE-256-LABEL: 'gather_load_4xi32_variable_mask'
@@ -347,11 +347,11 @@ define void @scatter_store_4xi32_constant_mask(<4 x i32> %val, <4 x ptr> %ptrs)
 define void @scatter_store_4xi32_variable_mask(<4 x i32> %val, <4 x ptr> %ptrs, <4 x i1> %cond) {
 ; CHECK:         scatter_store_4xi32_variable_mask
 ; CHECK-NEON-LABEL: 'scatter_store_4xi32_variable_mask'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'scatter_store_4xi32_variable_mask'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'scatter_store_4xi32_variable_mask'
@@ -370,11 +370,11 @@ declare <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr>, i32, <256 x
 define void @sve_gather_vls(<256 x i1> %v256i1mask) {
 ; CHECK-LABEL: 'sve_scatter_vls'
 ; CHECK-NEON-LABEL: 'sve_gather_vls'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2304 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'sve_gather_vls'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 2304 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'sve_gather_vls'
@@ -394,11 +394,11 @@ declare <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr>, i32, <256
 define void @sve_gather_vls_float(<256 x i1> %v256i1mask) {
 ; CHECK-LABEL: 'sve_gather_vls_float'
 ; CHECK-NEON-LABEL: 'sve_gather_vls_float'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2176 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1664 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'sve_gather_vls_float'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 2176 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 1664 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'sve_gather_vls_float'
@@ -418,11 +418,11 @@ declare void @llvm.masked.scatter.v256i8.v256p0(<256 x i8>, <256 x ptr>, i32, <2
 define void @sve_scatter_vls(<256 x i1> %v256i1mask){
 ; CHECK-LABEL: 'sve_scatter_vls'
 ; CHECK-NEON-LABEL: 'sve_scatter_vls'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2304 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'sve_scatter_vls'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 2304 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'sve_scatter_vls'
@@ -442,11 +442,11 @@ declare void @llvm.masked.scatter.v512f16.v512p0(<512 x half>, <512 x ptr>, i32,
 define void @sve_scatter_vls_float(<512 x i1> %v512i1mask){
 ; CHECK-LABEL: 'sve_scatter_vls_float'
 ; CHECK-NEON-LABEL: 'sve_scatter_vls_float'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4480 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask)
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3456 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask)
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-128-LABEL: 'sve_scatter_vls_float'
-; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 4480 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask)
+; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 3456 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask)
 ; CHECK-SVE-128-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SVE-256-LABEL: 'sve_scatter_vls_float'
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll
index a9c18e20c1f5..c05339d89d35 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll
@@ -107,15 +107,15 @@ define void @masked_gathers_no_vscale_range() #2 {
 
 define <2 x i128> @masked_gather_v1i128(<2 x ptr> %ld, <2 x i1> %masks, <2 x i128> %passthru) #3 {
 ; CHECK-LABEL: 'masked_gather_v1i128'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res
 ;
 ; CHECK-VSCALE-2-LABEL: 'masked_gather_v1i128'
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res
 ;
 ; CHECK-VSCALE-1-LABEL: 'masked_gather_v1i128'
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res
 ;
   %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru)
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 15c278b060c9..1ff280d75b4e 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -829,7 +829,7 @@ define <vscale x 8 x i32> @masked_gather_nxv8i32(<vscale x 8 x ptr> %ld, <vscale
 
 define <4 x i32> @masked_gather_v4i32(<4 x ptr> %ld, <4 x i1> %masks, <4 x i32> %passthru) {
 ; CHECK-LABEL: 'masked_gather_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; TYPE_BASED_ONLY-LABEL: 'masked_gather_v4i32'
@@ -842,7 +842,7 @@ define <4 x i32> @masked_gather_v4i32(<4 x ptr> %ld, <4 x i1> %masks, <4 x i32>
 
 define <1 x i128> @masked_gather_v1i128(<1 x ptr> %ld, <1 x i1> %masks, <1 x i128> %passthru) {
 ; CHECK-LABEL: 'masked_gather_v1i128'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> %ld, i32 0, <1 x i1> %masks, <1 x i128> %passthru)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> %ld, i32 0, <1 x i1> %masks, <1 x i128> %passthru)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <1 x i128> %res
 ;
 ; TYPE_BASED_ONLY-LABEL: 'masked_gather_v1i128'
@@ -883,7 +883,7 @@ define void @masked_scatter_nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x ptr>
 
 define void @masked_scatter_v4i32(<4 x i32> %data, <4 x ptr> %ptrs, <4 x i1> %masks) {
 ; CHECK-LABEL: 'masked_scatter_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'masked_scatter_v4i32'
@@ -897,7 +897,7 @@ define void @masked_scatter_v4i32(<4 x i32> %data, <4 x ptr> %ptrs, <4 x i1> %ma
 
 define void @masked_scatter_v1i128(<1 x i128> %data, <1 x ptr> %ptrs, <1 x i1> %masks) {
 ; CHECK-LABEL: 'masked_scatter_v1i128'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> %ptrs, i32 0, <1 x i1> %masks)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> %ptrs, i32 0, <1 x i1> %masks)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'masked_scatter_v1i128'
diff --git a/llvm/test/Analysis/CostModel/ARM/arith.ll b/llvm/test/Analysis/CostModel/ARM/arith.ll
index 3a137a5af366..8f173596c3b9 100644
--- a/llvm/test/Analysis/CostModel/ARM/arith.ll
+++ b/llvm/test/Analysis/CostModel/ARM/arith.ll
@@ -4,7 +4,7 @@
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve,+mve4beat < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE4
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=thumbv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
diff --git a/llvm/test/Analysis/CostModel/ARM/cast.ll b/llvm/test/Analysis/CostModel/ARM/cast.ll
index 60addd3077ed..ae0d2347ec8b 100644
--- a/llvm/test/Analysis/CostModel/ARM/cast.ll
+++ b/llvm/test/Analysis/CostModel/ARM/cast.ll
@@ -3,11 +3,11 @@
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-RECIP
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-RECIP
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-RECIP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-SIZE
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-SIZE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
diff --git a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
index db700eb3baee..4a2f9a25dc15 100644
--- a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
+++ b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
@@ -3,11 +3,11 @@
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-RECIP
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-RECIP
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-RECIP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-SIZE
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-SIZE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
diff --git a/llvm/test/Analysis/CostModel/ARM/cmps.ll b/llvm/test/Analysis/CostModel/ARM/cmps.ll
index 7f89f521e77c..184b7076d02b 100644
--- a/llvm/test/Analysis/CostModel/ARM/cmps.ll
+++ b/llvm/test/Analysis/CostModel/ARM/cmps.ll
@@ -2,11 +2,11 @@
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-RECIP
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-RECIP
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-RECIP
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-SIZE
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-SIZE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
diff --git a/llvm/test/Analysis/CostModel/ARM/divrem.ll b/llvm/test/Analysis/CostModel/ARM/divrem.ll
index b582a61c2a0f..36c258503232 100644
--- a/llvm/test/Analysis/CostModel/ARM/divrem.ll
+++ b/llvm/test/Analysis/CostModel/ARM/divrem.ll
@@ -3,7 +3,7 @@
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
index 1dde88f366a3..d1e8bb015491 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
@@ -248,36 +248,36 @@ define i32 @fdiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F16 = fdiv half undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = fdiv float undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = fdiv double undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F16 = fdiv <1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = fdiv <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F16 = fdiv <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F16 = fdiv <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fdiv <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fdiv <1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fdiv <2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fdiv <4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fdiv <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fdiv <16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fdiv <32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fdiv <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fdiv <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fdiv <vscale x 4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fdiv <vscale x 1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fdiv <vscale x 2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fdiv <vscale x 4 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fdiv <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV16F16 = fdiv <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV32F16 = fdiv <vscale x 32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F32 = fdiv <1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = fdiv <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = fdiv <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fdiv <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fdiv <vscale x 16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fdiv <vscale x 32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fdiv <1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fdiv <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fdiv <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fdiv <8 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fdiv <16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fdiv <vscale x 1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fdiv <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fdiv <vscale x 1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fdiv <vscale x 2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fdiv <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8F32 = fdiv <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV16F32 = fdiv <vscale x 16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = fdiv <1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = fdiv <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fdiv <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fdiv <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fdiv <vscale x 16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fdiv <1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fdiv <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fdiv <4 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fdiv <8 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fdiv <vscale x 1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fdiv <vscale x 1 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fdiv <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F64 = fdiv <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8F64 = fdiv <vscale x 8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fdiv <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fdiv <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F16 = fdiv half undef, undef
diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-int.ll b/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
index b4afbb513166..c976f483fdfe 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
@@ -705,72 +705,72 @@ define i32 @udiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I16 = udiv <1 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = udiv <2 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = udiv <4 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = udiv <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I16 = udiv <1 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = udiv <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = udiv <4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = udiv <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = udiv <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = udiv <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = udiv <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = udiv <vscale x 1 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = udiv <vscale x 2 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = udiv <vscale x 4 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I16 = udiv <vscale x 8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I16 = udiv <vscale x 16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV32I16 = udiv <vscale x 32 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = udiv <1 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = udiv <2 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = udiv <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8I16 = udiv <vscale x 8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16I16 = udiv <vscale x 16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32I16 = udiv <vscale x 32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I32 = udiv <1 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = udiv <2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = udiv <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = udiv <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = udiv <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = udiv <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = udiv <vscale x 1 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = udiv <vscale x 2 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = udiv <vscale x 4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I32 = udiv <vscale x 8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = udiv <vscale x 16 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = udiv <1 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = udiv <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4I32 = udiv <vscale x 4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8I32 = udiv <vscale x 8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16I32 = udiv <vscale x 16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = udiv <1 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = udiv <2 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = udiv <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = udiv <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = udiv <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = udiv <vscale x 1 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = udiv <vscale x 2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = udiv <vscale x 4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I64 = udiv <vscale x 8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2I64 = udiv <vscale x 2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4I64 = udiv <vscale x 4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8I64 = udiv <vscale x 8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SIFIVE-X280-LABEL: 'udiv'
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I16 = udiv <1 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = udiv <2 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = udiv <4 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = udiv <8 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = udiv <16 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I16 = udiv <1 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = udiv <2 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = udiv <4 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = udiv <8 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = udiv <16 x i16> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = udiv <32 x i16> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = udiv <vscale x 1 x i16> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = udiv <vscale x 2 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = udiv <vscale x 4 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I16 = udiv <vscale x 8 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I16 = udiv <vscale x 16 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV32I16 = udiv <vscale x 32 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = udiv <1 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = udiv <2 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = udiv <4 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = udiv <8 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4I16 = udiv <vscale x 4 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8I16 = udiv <vscale x 8 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16I16 = udiv <vscale x 16 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV32I16 = udiv <vscale x 32 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I32 = udiv <1 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = udiv <2 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = udiv <4 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = udiv <8 x i32> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = udiv <16 x i32> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = udiv <vscale x 1 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = udiv <vscale x 2 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = udiv <vscale x 4 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I32 = udiv <vscale x 8 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = udiv <vscale x 16 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = udiv <1 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = udiv <2 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = udiv <4 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2I32 = udiv <vscale x 2 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4I32 = udiv <vscale x 4 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8I32 = udiv <vscale x 8 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV16I32 = udiv <vscale x 16 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = udiv <1 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = udiv <2 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = udiv <4 x i64> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = udiv <8 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = udiv <vscale x 1 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = udiv <vscale x 2 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = udiv <vscale x 4 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I64 = udiv <vscale x 8 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1I64 = udiv <vscale x 1 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV2I64 = udiv <vscale x 2 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV4I64 = udiv <vscale x 4 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV8I64 = udiv <vscale x 8 x i64> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I16 = udiv i16 undef, undef
@@ -821,72 +821,72 @@ define i32 @urem() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I16 = urem <1 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = urem <2 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = urem <4 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = urem <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I16 = urem <1 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = urem <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = urem <4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = urem <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = urem <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = urem <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = urem <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = urem <vscale x 1 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = urem <vscale x 2 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = urem <vscale x 4 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I16 = urem <vscale x 8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I16 = urem <vscale x 16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV32I16 = urem <vscale x 32 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = urem <1 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = urem <2 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = urem <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8I16 = urem <vscale x 8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16I16 = urem <vscale x 16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32I16 = urem <vscale x 32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I32 = urem <1 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = urem <2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = urem <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = urem <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = urem <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = urem <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = urem <vscale x 1 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = urem <vscale x 2 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = urem <vscale x 4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I32 = urem <vscale x 8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = urem <vscale x 16 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = urem <1 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = urem <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4I32 = urem <vscale x 4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8I32 = urem <vscale x 8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16I32 = urem <vscale x 16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = urem <1 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = urem <2 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = urem <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = urem <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = urem <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = urem <vscale x 1 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = urem <vscale x 2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = urem <vscale x 4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I64 = urem <vscale x 8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2I64 = urem <vscale x 2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4I64 = urem <vscale x 4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8I64 = urem <vscale x 8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SIFIVE-X280-LABEL: 'urem'
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I16 = urem <1 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = urem <2 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = urem <4 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = urem <8 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = urem <16 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I16 = urem <1 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = urem <2 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = urem <4 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = urem <8 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = urem <16 x i16> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = urem <32 x i16> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = urem <vscale x 1 x i16> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = urem <vscale x 2 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = urem <vscale x 4 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I16 = urem <vscale x 8 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I16 = urem <vscale x 16 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV32I16 = urem <vscale x 32 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = urem <1 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = urem <2 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = urem <4 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = urem <8 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4I16 = urem <vscale x 4 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8I16 = urem <vscale x 8 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16I16 = urem <vscale x 16 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV32I16 = urem <vscale x 32 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I32 = urem <1 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = urem <2 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = urem <4 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = urem <8 x i32> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = urem <16 x i32> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = urem <vscale x 1 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = urem <vscale x 2 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = urem <vscale x 4 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I32 = urem <vscale x 8 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = urem <vscale x 16 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = urem <1 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = urem <2 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = urem <4 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2I32 = urem <vscale x 2 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4I32 = urem <vscale x 4 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8I32 = urem <vscale x 8 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV16I32 = urem <vscale x 16 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = urem <1 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = urem <2 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = urem <4 x i64> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = urem <8 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = urem <vscale x 1 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = urem <vscale x 2 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = urem <vscale x 4 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I64 = urem <vscale x 8 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1I64 = urem <vscale x 1 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV2I64 = urem <vscale x 2 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV4I64 = urem <vscale x 4 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV8I64 = urem <vscale x 8 x i64> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I16 = urem i16 undef, undef
@@ -937,72 +937,72 @@ define i32 @sdiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I16 = sdiv <1 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = sdiv <2 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = sdiv <4 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = sdiv <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I16 = sdiv <1 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = sdiv <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = sdiv <4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = sdiv <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = sdiv <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = sdiv <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = sdiv <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = sdiv <vscale x 1 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = sdiv <vscale x 2 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = sdiv <vscale x 4 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I16 = sdiv <vscale x 8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I16 = sdiv <vscale x 16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV32I16 = sdiv <vscale x 32 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = sdiv <1 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = sdiv <2 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = sdiv <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8I16 = sdiv <vscale x 8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16I16 = sdiv <vscale x 16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32I16 = sdiv <vscale x 32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I32 = sdiv <1 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = sdiv <2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = sdiv <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = sdiv <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = sdiv <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = sdiv <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = sdiv <vscale x 1 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = sdiv <vscale x 2 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = sdiv <vscale x 4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I32 = sdiv <vscale x 8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = sdiv <vscale x 16 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = sdiv <1 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = sdiv <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4I32 = sdiv <vscale x 4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8I32 = sdiv <vscale x 8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16I32 = sdiv <vscale x 16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = sdiv <1 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = sdiv <2 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = sdiv <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = sdiv <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = sdiv <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = sdiv <vscale x 1 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = sdiv <vscale x 2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = sdiv <vscale x 4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I64 = sdiv <vscale x 8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2I64 = sdiv <vscale x 2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4I64 = sdiv <vscale x 4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8I64 = sdiv <vscale x 8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SIFIVE-X280-LABEL: 'sdiv'
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I16 = sdiv <1 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = sdiv <2 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = sdiv <4 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = sdiv <8 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = sdiv <16 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I16 = sdiv <1 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = sdiv <2 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = sdiv <4 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = sdiv <8 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = sdiv <16 x i16> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = sdiv <32 x i16> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = sdiv <vscale x 1 x i16> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = sdiv <vscale x 2 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = sdiv <vscale x 4 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I16 = sdiv <vscale x 8 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I16 = sdiv <vscale x 16 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV32I16 = sdiv <vscale x 32 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = sdiv <1 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = sdiv <2 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = sdiv <4 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = sdiv <8 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4I16 = sdiv <vscale x 4 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8I16 = sdiv <vscale x 8 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16I16 = sdiv <vscale x 16 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV32I16 = sdiv <vscale x 32 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I32 = sdiv <1 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = sdiv <2 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = sdiv <4 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = sdiv <8 x i32> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = sdiv <16 x i32> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = sdiv <vscale x 1 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = sdiv <vscale x 2 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = sdiv <vscale x 4 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I32 = sdiv <vscale x 8 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = sdiv <vscale x 16 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = sdiv <1 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = sdiv <2 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = sdiv <4 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2I32 = sdiv <vscale x 2 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4I32 = sdiv <vscale x 4 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8I32 = sdiv <vscale x 8 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV16I32 = sdiv <vscale x 16 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = sdiv <1 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = sdiv <2 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = sdiv <4 x i64> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = sdiv <8 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = sdiv <vscale x 1 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = sdiv <vscale x 2 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = sdiv <vscale x 4 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I64 = sdiv <vscale x 8 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1I64 = sdiv <vscale x 1 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV2I64 = sdiv <vscale x 2 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV4I64 = sdiv <vscale x 4 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV8I64 = sdiv <vscale x 8 x i64> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I16 = sdiv i16 undef, undef
@@ -1053,72 +1053,72 @@ define i32 @srem() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I16 = srem <1 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = srem <2 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = srem <4 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = srem <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I16 = srem <1 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = srem <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = srem <4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = srem <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = srem <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = srem <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = srem <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = srem <vscale x 1 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = srem <vscale x 2 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = srem <vscale x 4 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I16 = srem <vscale x 8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I16 = srem <vscale x 16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV32I16 = srem <vscale x 32 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = srem <1 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = srem <2 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = srem <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8I16 = srem <vscale x 8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16I16 = srem <vscale x 16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32I16 = srem <vscale x 32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I32 = srem <1 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = srem <2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = srem <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = srem <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = srem <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = srem <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = srem <vscale x 1 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = srem <vscale x 2 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = srem <vscale x 4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I32 = srem <vscale x 8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = srem <vscale x 16 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = srem <1 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = srem <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4I32 = srem <vscale x 4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8I32 = srem <vscale x 8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16I32 = srem <vscale x 16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = srem <1 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = srem <2 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = srem <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = srem <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = srem <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = srem <vscale x 1 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = srem <vscale x 2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = srem <vscale x 4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I64 = srem <vscale x 8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2I64 = srem <vscale x 2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4I64 = srem <vscale x 4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8I64 = srem <vscale x 8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SIFIVE-X280-LABEL: 'srem'
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I16 = srem <1 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = srem <2 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = srem <4 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = srem <8 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = srem <16 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I16 = srem <1 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = srem <2 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = srem <4 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = srem <8 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = srem <16 x i16> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = srem <32 x i16> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = srem <vscale x 1 x i16> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = srem <vscale x 2 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = srem <vscale x 4 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I16 = srem <vscale x 8 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I16 = srem <vscale x 16 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV32I16 = srem <vscale x 32 x i16> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = srem <1 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = srem <2 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = srem <4 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = srem <8 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4I16 = srem <vscale x 4 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8I16 = srem <vscale x 8 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16I16 = srem <vscale x 16 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV32I16 = srem <vscale x 32 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I32 = srem <1 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = srem <2 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = srem <4 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = srem <8 x i32> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = srem <16 x i32> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = srem <vscale x 1 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = srem <vscale x 2 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = srem <vscale x 4 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I32 = srem <vscale x 8 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = srem <vscale x 16 x i32> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = srem <1 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = srem <2 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = srem <4 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2I32 = srem <vscale x 2 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4I32 = srem <vscale x 4 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8I32 = srem <vscale x 8 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV16I32 = srem <vscale x 16 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = srem <1 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = srem <2 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = srem <4 x i64> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = srem <8 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = srem <vscale x 1 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = srem <vscale x 2 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = srem <vscale x 4 x i64> undef, undef
-; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV8I64 = srem <vscale x 8 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1I64 = srem <vscale x 1 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV2I64 = srem <vscale x 2 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV4I64 = srem <vscale x 4 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV8I64 = srem <vscale x 8 x i64> undef, undef
 ; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I16 = srem i16 undef, undef
@@ -1232,3 +1232,471 @@ define void @add_of_constant() {
 
   ret void
 }
+
+define i32 @and() {
+; CHECK-LABEL: 'and'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = and i1 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = and i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = and i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = and i64 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = and <1 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = and <2 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = and <4 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = and <8 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = and <16 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = and <32 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = and <1 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = and <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = and <4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = and <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %15 = and <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %16 = and <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = and <1 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = and <2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = and <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = and <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %21 = and <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = and <1 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = and <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %24 = and <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %25 = and <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = and <vscale x 1 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = and <vscale x 2 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = and <vscale x 4 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %29 = and <vscale x 8 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %30 = and <vscale x 16 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = and <vscale x 32 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %32 = and <vscale x 1 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %33 = and <vscale x 2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %34 = and <vscale x 4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %35 = and <vscale x 8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %36 = and <vscale x 16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %37 = and <vscale x 32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %38 = and <vscale x 1 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %39 = and <vscale x 2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %40 = and <vscale x 4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %41 = and <vscale x 8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %42 = and <vscale x 16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %43 = and <vscale x 1 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %44 = and <vscale x 2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %45 = and <vscale x 4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %46 = and <vscale x 8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SIFIVE-X280-LABEL: 'and'
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = and i1 undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = and i16 undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = and i32 undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = and i64 undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = and <1 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = and <2 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = and <4 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = and <8 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = and <16 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = and <32 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = and <1 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = and <2 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = and <4 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = and <8 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = and <16 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %16 = and <32 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = and <1 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = and <2 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = and <4 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %20 = and <8 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %21 = and <16 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = and <1 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = and <2 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %24 = and <4 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %25 = and <8 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = and <vscale x 1 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = and <vscale x 2 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = and <vscale x 4 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %29 = and <vscale x 8 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %30 = and <vscale x 16 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = and <vscale x 32 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %32 = and <vscale x 1 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %33 = and <vscale x 2 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %34 = and <vscale x 4 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %35 = and <vscale x 8 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %36 = and <vscale x 16 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %37 = and <vscale x 32 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %38 = and <vscale x 1 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %39 = and <vscale x 2 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %40 = and <vscale x 4 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %41 = and <vscale x 8 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %42 = and <vscale x 16 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %43 = and <vscale x 1 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %44 = and <vscale x 2 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %45 = and <vscale x 4 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %46 = and <vscale x 8 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  and i1 undef, undef
+  and i16 undef, undef
+  and i32 undef, undef
+  and i64 undef, undef
+
+  and <1 x i1> undef, undef
+  and <2 x i1> undef, undef
+  and <4 x i1> undef, undef
+  and <8 x i1> undef, undef
+  and <16 x i1> undef, undef
+  and <32 x i1> undef, undef
+
+  and <1 x i16> undef, undef
+  and <2 x i16> undef, undef
+  and <4 x i16> undef, undef
+  and <8 x i16> undef, undef
+  and <16 x i16> undef, undef
+  and <32 x i16> undef, undef
+
+  and <1 x i32> undef, undef
+  and <2 x i32> undef, undef
+  and <4 x i32> undef, undef
+  and <8 x i32> undef, undef
+  and <16 x i32> undef, undef
+
+  and <1 x i64> undef, undef
+  and <2 x i64> undef, undef
+  and <4 x i64> undef, undef
+  and <8 x i64> undef, undef
+
+  and <vscale x 1 x i1> undef, undef
+  and <vscale x 2 x i1> undef, undef
+  and <vscale x 4 x i1> undef, undef
+  and <vscale x 8 x i1> undef, undef
+  and <vscale x 16 x i1> undef, undef
+  and <vscale x 32 x i1> undef, undef
+
+  and <vscale x 1 x i16> undef, undef
+  and <vscale x 2 x i16> undef, undef
+  and <vscale x 4 x i16> undef, undef
+  and <vscale x 8 x i16> undef, undef
+  and <vscale x 16 x i16> undef, undef
+  and <vscale x 32 x i16> undef, undef
+
+  and <vscale x 1 x i32> undef, undef
+  and <vscale x 2 x i32> undef, undef
+  and <vscale x 4 x i32> undef, undef
+  and <vscale x 8 x i32> undef, undef
+  and <vscale x 16 x i32> undef, undef
+
+  and <vscale x 1 x i64> undef, undef
+  and <vscale x 2 x i64> undef, undef
+  and <vscale x 4 x i64> undef, undef
+  and <vscale x 8 x i64> undef, undef
+  ret i32 undef
+}
+
+define i32 @or() {
+; CHECK-LABEL: 'or'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = or i1 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = or i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = or i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = or i64 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = or <1 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = or <2 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = or <4 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = or <8 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = or <16 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = or <32 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = or <1 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = or <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = or <4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = or <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %15 = or <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %16 = or <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = or <1 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = or <2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = or <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = or <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %21 = or <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = or <1 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = or <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %24 = or <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %25 = or <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = or <vscale x 1 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = or <vscale x 2 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = or <vscale x 4 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %29 = or <vscale x 8 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %30 = or <vscale x 16 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = or <vscale x 32 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %32 = or <vscale x 1 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %33 = or <vscale x 2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %34 = or <vscale x 4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %35 = or <vscale x 8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %36 = or <vscale x 16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %37 = or <vscale x 32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %38 = or <vscale x 1 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %39 = or <vscale x 2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %40 = or <vscale x 4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %41 = or <vscale x 8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %42 = or <vscale x 16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %43 = or <vscale x 1 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %44 = or <vscale x 2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %45 = or <vscale x 4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %46 = or <vscale x 8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SIFIVE-X280-LABEL: 'or'
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = or i1 undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = or i16 undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = or i32 undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = or i64 undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = or <1 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = or <2 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = or <4 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = or <8 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = or <16 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = or <32 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = or <1 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = or <2 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = or <4 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = or <8 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = or <16 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %16 = or <32 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = or <1 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = or <2 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = or <4 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %20 = or <8 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %21 = or <16 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = or <1 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = or <2 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %24 = or <4 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %25 = or <8 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = or <vscale x 1 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = or <vscale x 2 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = or <vscale x 4 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %29 = or <vscale x 8 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %30 = or <vscale x 16 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = or <vscale x 32 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %32 = or <vscale x 1 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %33 = or <vscale x 2 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %34 = or <vscale x 4 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %35 = or <vscale x 8 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %36 = or <vscale x 16 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %37 = or <vscale x 32 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %38 = or <vscale x 1 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %39 = or <vscale x 2 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %40 = or <vscale x 4 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %41 = or <vscale x 8 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %42 = or <vscale x 16 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %43 = or <vscale x 1 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %44 = or <vscale x 2 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %45 = or <vscale x 4 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %46 = or <vscale x 8 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  or i1 undef, undef
+  or i16 undef, undef
+  or i32 undef, undef
+  or i64 undef, undef
+
+  or <1 x i1> undef, undef
+  or <2 x i1> undef, undef
+  or <4 x i1> undef, undef
+  or <8 x i1> undef, undef
+  or <16 x i1> undef, undef
+  or <32 x i1> undef, undef
+
+  or <1 x i16> undef, undef
+  or <2 x i16> undef, undef
+  or <4 x i16> undef, undef
+  or <8 x i16> undef, undef
+  or <16 x i16> undef, undef
+  or <32 x i16> undef, undef
+
+  or <1 x i32> undef, undef
+  or <2 x i32> undef, undef
+  or <4 x i32> undef, undef
+  or <8 x i32> undef, undef
+  or <16 x i32> undef, undef
+
+  or <1 x i64> undef, undef
+  or <2 x i64> undef, undef
+  or <4 x i64> undef, undef
+  or <8 x i64> undef, undef
+
+  or <vscale x 1 x i1> undef, undef
+  or <vscale x 2 x i1> undef, undef
+  or <vscale x 4 x i1> undef, undef
+  or <vscale x 8 x i1> undef, undef
+  or <vscale x 16 x i1> undef, undef
+  or <vscale x 32 x i1> undef, undef
+
+  or <vscale x 1 x i16> undef, undef
+  or <vscale x 2 x i16> undef, undef
+  or <vscale x 4 x i16> undef, undef
+  or <vscale x 8 x i16> undef, undef
+  or <vscale x 16 x i16> undef, undef
+  or <vscale x 32 x i16> undef, undef
+
+  or <vscale x 1 x i32> undef, undef
+  or <vscale x 2 x i32> undef, undef
+  or <vscale x 4 x i32> undef, undef
+  or <vscale x 8 x i32> undef, undef
+  or <vscale x 16 x i32> undef, undef
+
+  or <vscale x 1 x i64> undef, undef
+  or <vscale x 2 x i64> undef, undef
+  or <vscale x 4 x i64> undef, undef
+  or <vscale x 8 x i64> undef, undef
+  ret i32 undef
+}
+
+define i32 @xor() {
+; CHECK-LABEL: 'xor'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = xor i1 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = xor i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = xor i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = xor i64 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = xor <1 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = xor <2 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = xor <4 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = xor <8 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = xor <16 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = xor <32 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = xor <1 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = xor <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = xor <4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = xor <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %15 = xor <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %16 = xor <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = xor <1 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = xor <2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = xor <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = xor <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %21 = xor <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = xor <1 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = xor <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %24 = xor <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %25 = xor <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = xor <vscale x 1 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = xor <vscale x 2 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = xor <vscale x 4 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %29 = xor <vscale x 8 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %30 = xor <vscale x 16 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = xor <vscale x 32 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %32 = xor <vscale x 1 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %33 = xor <vscale x 2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %34 = xor <vscale x 4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %35 = xor <vscale x 8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %36 = xor <vscale x 16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %37 = xor <vscale x 32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %38 = xor <vscale x 1 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %39 = xor <vscale x 2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %40 = xor <vscale x 4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %41 = xor <vscale x 8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %42 = xor <vscale x 16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %43 = xor <vscale x 1 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %44 = xor <vscale x 2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %45 = xor <vscale x 4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %46 = xor <vscale x 8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SIFIVE-X280-LABEL: 'xor'
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = xor i1 undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = xor i16 undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = xor i32 undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = xor i64 undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = xor <1 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = xor <2 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = xor <4 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = xor <8 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = xor <16 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = xor <32 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = xor <1 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = xor <2 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = xor <4 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = xor <8 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = xor <16 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %16 = xor <32 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = xor <1 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = xor <2 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = xor <4 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %20 = xor <8 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %21 = xor <16 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = xor <1 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = xor <2 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %24 = xor <4 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %25 = xor <8 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = xor <vscale x 1 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = xor <vscale x 2 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = xor <vscale x 4 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %29 = xor <vscale x 8 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %30 = xor <vscale x 16 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = xor <vscale x 32 x i1> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %32 = xor <vscale x 1 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %33 = xor <vscale x 2 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %34 = xor <vscale x 4 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %35 = xor <vscale x 8 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %36 = xor <vscale x 16 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %37 = xor <vscale x 32 x i16> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %38 = xor <vscale x 1 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %39 = xor <vscale x 2 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %40 = xor <vscale x 4 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %41 = xor <vscale x 8 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %42 = xor <vscale x 16 x i32> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %43 = xor <vscale x 1 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %44 = xor <vscale x 2 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %45 = xor <vscale x 4 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %46 = xor <vscale x 8 x i64> undef, undef
+; SIFIVE-X280-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  xor i1 undef, undef
+  xor i16 undef, undef
+  xor i32 undef, undef
+  xor i64 undef, undef
+
+  xor <1 x i1> undef, undef
+  xor <2 x i1> undef, undef
+  xor <4 x i1> undef, undef
+  xor <8 x i1> undef, undef
+  xor <16 x i1> undef, undef
+  xor <32 x i1> undef, undef
+
+  xor <1 x i16> undef, undef
+  xor <2 x i16> undef, undef
+  xor <4 x i16> undef, undef
+  xor <8 x i16> undef, undef
+  xor <16 x i16> undef, undef
+  xor <32 x i16> undef, undef
+
+  xor <1 x i32> undef, undef
+  xor <2 x i32> undef, undef
+  xor <4 x i32> undef, undef
+  xor <8 x i32> undef, undef
+  xor <16 x i32> undef, undef
+
+  xor <1 x i64> undef, undef
+  xor <2 x i64> undef, undef
+  xor <4 x i64> undef, undef
+  xor <8 x i64> undef, undef
+
+  xor <vscale x 1 x i1> undef, undef
+  xor <vscale x 2 x i1> undef, undef
+  xor <vscale x 4 x i1> undef, undef
+  xor <vscale x 8 x i1> undef, undef
+  xor <vscale x 16 x i1> undef, undef
+  xor <vscale x 32 x i1> undef, undef
+
+  xor <vscale x 1 x i16> undef, undef
+  xor <vscale x 2 x i16> undef, undef
+  xor <vscale x 4 x i16> undef, undef
+  xor <vscale x 8 x i16> undef, undef
+  xor <vscale x 16 x i16> undef, undef
+  xor <vscale x 32 x i16> undef, undef
+
+  xor <vscale x 1 x i32> undef, undef
+  xor <vscale x 2 x i32> undef, undef
+  xor <vscale x 4 x i32> undef, undef
+  xor <vscale x 8 x i32> undef, undef
+  xor <vscale x 16 x i32> undef, undef
+
+  xor <vscale x 1 x i64> undef, undef
+  xor <vscale x 2 x i64> undef, undef
+  xor <vscale x 4 x i64> undef, undef
+  xor <vscale x 8 x i64> undef, undef
+  ret i32 undef
+}
diff --git a/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll b/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll
new file mode 100644
index 000000000000..ca09d027b547
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/cttz_elts.ll
@@ -0,0 +1,149 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v | FileCheck %s
+
+define void @foo_no_vscale_range() {
+; CHECK-LABEL: 'foo_no_vscale_range'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv64i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 781 for instruction: %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv64i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 390 for instruction: %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv64i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 781 for instruction: %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv64i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 390 for instruction: %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+  %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+  %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+  %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv161(<vscale x 16 x i1> undef, i1 true)
+  %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+  %res.i64.nxv64i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 true)
+  %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+  %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+  %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+  %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+  %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv161(<vscale x 16 x i1> undef, i1 true)
+  %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+  %res.i32.nxv64i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 true)
+  %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+
+  %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+  %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+  %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+  %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv161(<vscale x 16 x i1> undef, i1 false)
+  %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+  %res.i64.nxv64i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 false)
+  %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+  %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+  %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+  %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+  %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv161(<vscale x 16 x i1> undef, i1 false)
+  %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+  %res.i32.nxv64i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 false)
+  %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+
+  ret void
+}
+
+
+define void @foo_vscale_range_2_16() vscale_range(2,16) {
+; CHECK-LABEL: 'foo_vscale_range_2_16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv64i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv64i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv64i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv64i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+  %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+  %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+  %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv161(<vscale x 16 x i1> undef, i1 true)
+  %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+  %res.i64.nxv64i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 true)
+  %res.i64.nxv128i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+  %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+  %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+  %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+  %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv161(<vscale x 16 x i1> undef, i1 true)
+  %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+  %res.i32.nxv64i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 true)
+  %res.i32.nxv128i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 true)
+
+  %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+  %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+  %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+  %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv161(<vscale x 16 x i1> undef, i1 false)
+  %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+  %res.i64.nxv64i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1> undef, i1 false)
+  %res.i64.nxv128i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+  %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+  %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+  %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+  %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv161(<vscale x 16 x i1> undef, i1 false)
+  %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+  %res.i32.nxv64i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> undef, i1 false)
+  %res.i32.nxv128i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1> undef, i1 false)
+
+  ret void
+}
+
+declare i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.nxv64i1(<vscale x 64 x i1>, i1)
+declare i64 @llvm.experimental.cttz.elts.i64.nxv128i1(<vscale x 128 x i1>, i1)
+
+declare i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv64i1(<vscale x 64 x i1>, i1)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv128i1(<vscale x 128 x i1>, i1)
diff --git a/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll b/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll
index 31bbc8b02a19..e6f53d4429c7 100644
--- a/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll
@@ -13,14 +13,14 @@ define void @fixed() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 287 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 223 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 entry:
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp-codesize.ll b/llvm/test/Analysis/CostModel/X86/arith-fp-codesize.ll
index 749e239c33ad..b965a726262e 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fp-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp-codesize.ll
@@ -616,6 +616,136 @@ define i32 @fma(i32 %arg) {
   ret i32 undef
 }
 
+define i32 @rint(i32 %arg) {
+; SSE1-LABEL: 'rint'
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'rint'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'rint'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %F32 = call float @llvm.rint.f32.f32(float undef)
+  %V4F32 = call <4 x float> @llvm.rint.v4f32.v4f32(<4 x float> undef)
+  %V8F32 = call <8 x float> @llvm.rint.v8f32.v8f32(<8 x float> undef)
+  %V16F32 = call <16 x float> @llvm.rint.v16f32.v16f32(<16 x float> undef)
+
+  %F64 = call double @llvm.rint.f64.f64(double undef)
+  %V2F64 = call <2 x double> @llvm.rint.v2f64.v2f64(<2 x double> undef)
+  %V4F64 = call <4 x double> @llvm.rint.v4f64.v4f64(<4 x double> undef)
+  %V8F64 = call <8 x double> @llvm.rint.v8f64.v8f64(<8 x double> undef)
+
+  ret i32 undef
+}
+
+define i32 @lrint(i32 %arg) {
+; SSE1-LABEL: 'lrint'
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE2-LABEL: 'lrint'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'lrint'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'lrint'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+  %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+  %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+  %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+
+  %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+  %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+  %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+  %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+
+  ret i32 undef
+}
+
+define i32 @llrint(i32 %arg) {
+; CHECK-LABEL: 'llrint'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+  %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+  %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+  %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+
+  %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+  %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+  %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+  %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+
+  ret i32 undef
+}
+
 declare float @llvm.sqrt.f32(float)
 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp-latency.ll b/llvm/test/Analysis/CostModel/X86/arith-fp-latency.ll
index 0f4bf2896559..c147bd2eef6e 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fp-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp-latency.ll
@@ -1012,6 +1012,213 @@ define i32 @fma(i32 %arg) {
   ret i32 undef
 }
 
+define i32 @rint(i32 %arg) {
+; SSE1-LABEL: 'rint'
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE2-LABEL: 'rint'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'rint'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'rint'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'rint'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'rint'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'rint'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %F32 = call float @llvm.rint.f32.f32(float undef)
+  %V4F32 = call <4 x float> @llvm.rint.v4f32.v4f32(<4 x float> undef)
+  %V8F32 = call <8 x float> @llvm.rint.v8f32.v8f32(<8 x float> undef)
+  %V16F32 = call <16 x float> @llvm.rint.v16f32.v16f32(<16 x float> undef)
+
+  %F64 = call double @llvm.rint.f64.f64(double undef)
+  %V2F64 = call <2 x double> @llvm.rint.v2f64.v2f64(<2 x double> undef)
+  %V4F64 = call <4 x double> @llvm.rint.v4f64.v4f64(<4 x double> undef)
+  %V8F64 = call <8 x double> @llvm.rint.v8f64.v8f64(<8 x double> undef)
+
+  ret i32 undef
+}
+
+define i32 @lrint(i32 %arg) {
+; SSE1-LABEL: 'lrint'
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE2-LABEL: 'lrint'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'lrint'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'lrint'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'lrint'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'lrint'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'lrint'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+  %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+  %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+  %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+
+  %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+  %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+  %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+  %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+
+  ret i32 undef
+}
+
+define i32 @llrint(i32 %arg) {
+; CHECK-LABEL: 'llrint'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+  %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+  %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+  %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+
+  %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+  %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+  %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+  %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+
+  ret i32 undef
+}
+
 declare float @llvm.sqrt.f32(float)
 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/arith-fp-sizelatency.ll
index 90b7feb77cf8..d9312ac05601 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fp-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp-sizelatency.ll
@@ -946,6 +946,213 @@ define i32 @fma(i32 %arg) {
   ret i32 undef
 }
 
+define i32 @rint(i32 %arg) {
+; SSE1-LABEL: 'rint'
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE2-LABEL: 'rint'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'rint'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'rint'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'rint'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'rint'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'rint'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %F32 = call float @llvm.rint.f32.f32(float undef)
+  %V4F32 = call <4 x float> @llvm.rint.v4f32.v4f32(<4 x float> undef)
+  %V8F32 = call <8 x float> @llvm.rint.v8f32.v8f32(<8 x float> undef)
+  %V16F32 = call <16 x float> @llvm.rint.v16f32.v16f32(<16 x float> undef)
+
+  %F64 = call double @llvm.rint.f64.f64(double undef)
+  %V2F64 = call <2 x double> @llvm.rint.v2f64.v2f64(<2 x double> undef)
+  %V4F64 = call <4 x double> @llvm.rint.v4f64.v4f64(<4 x double> undef)
+  %V8F64 = call <8 x double> @llvm.rint.v8f64.v8f64(<8 x double> undef)
+
+  ret i32 undef
+}
+
+define i32 @lrint(i32 %arg) {
+; SSE1-LABEL: 'lrint'
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE2-LABEL: 'lrint'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'lrint'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'lrint'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'lrint'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'lrint'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'lrint'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+  %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+  %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+  %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+
+  %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+  %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+  %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+  %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+
+  ret i32 undef
+}
+
+define i32 @llrint(i32 %arg) {
+; CHECK-LABEL: 'llrint'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+  %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+  %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+  %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+
+  %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+  %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+  %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+  %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+
+  ret i32 undef
+}
+
 declare float @llvm.sqrt.f32(float)
 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp.ll b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
index ae3b5d7cddb2..90871e3a3831 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
@@ -1089,6 +1089,290 @@ define i32 @fma(i32 %arg) {
   ret i32 undef
 }
 
+define i32 @rint(i32 %arg) {
+; SSE1-LABEL: 'rint'
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE2-LABEL: 'rint'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'rint'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'rint'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'rint'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'rint'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'rint'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.rint.f32(float undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.rint.f64(double undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.rint.v8f64(<8 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %F32 = call float @llvm.rint.f32.f32(float undef)
+  %V4F32 = call <4 x float> @llvm.rint.v4f32.v4f32(<4 x float> undef)
+  %V8F32 = call <8 x float> @llvm.rint.v8f32.v8f32(<8 x float> undef)
+  %V16F32 = call <16 x float> @llvm.rint.v16f32.v16f32(<16 x float> undef)
+
+  %F64 = call double @llvm.rint.f64.f64(double undef)
+  %V2F64 = call <2 x double> @llvm.rint.v2f64.v2f64(<2 x double> undef)
+  %V4F64 = call <4 x double> @llvm.rint.v4f64.v4f64(<4 x double> undef)
+  %V8F64 = call <8 x double> @llvm.rint.v8f64.v8f64(<8 x double> undef)
+
+  ret i32 undef
+}
+
+define i32 @lrint(i32 %arg) {
+; SSE1-LABEL: 'lrint'
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE2-LABEL: 'lrint'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'lrint'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'lrint'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'lrint'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'lrint'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'lrint'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'lrint'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %F32 = call i32 @llvm.lrint.i32.f32(float undef)
+  %V4F32 = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> undef)
+  %V8F32 = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> undef)
+  %V16F32 = call <16 x i32> @llvm.lrint.v16i32.v16f32(<16 x float> undef)
+
+  %F64 = call i32 @llvm.lrint.i32.f64(double undef)
+  %V2F64 = call <2 x i32> @llvm.lrint.v2i32.v2f64(<2 x double> undef)
+  %V4F64 = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> undef)
+  %V8F64 = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> undef)
+
+  ret i32 undef
+}
+
+define i32 @llrint(i32 %arg) {
+; SSE1-LABEL: 'llrint'
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE2-LABEL: 'llrint'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'llrint'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'llrint'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'llrint'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'llrint'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'llrint'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %F32 = call i64 @llvm.llrint.i64.f32(float undef)
+  %V4F32 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef)
+  %V8F32 = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef)
+  %V16F32 = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef)
+
+  %F64 = call i64 @llvm.llrint.i64.f64(double undef)
+  %V2F64 = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef)
+  %V4F64 = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef)
+  %V8F64 = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef)
+
+  ret i32 undef
+}
+
 declare float @llvm.sqrt.f32(float)
 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
diff --git a/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll b/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll
index a7585a4d9f39..71927002b599 100644
--- a/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll
@@ -1597,9 +1597,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %I8    = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
@@ -2871,9 +2871,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %I8    = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
diff --git a/llvm/test/Analysis/CostModel/X86/fshl-latency.ll b/llvm/test/Analysis/CostModel/X86/fshl-latency.ll
index 7105f713fdc3..c40394ba9a72 100644
--- a/llvm/test/Analysis/CostModel/X86/fshl-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshl-latency.ll
@@ -1549,9 +1549,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %I8    = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
@@ -2823,9 +2823,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %I8    = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
diff --git a/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll
index 5d7361e29317..7b0daf504855 100644
--- a/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll
@@ -1597,9 +1597,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %I8    = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
@@ -3111,9 +3111,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %I8    = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
diff --git a/llvm/test/Analysis/CostModel/X86/fshl.ll b/llvm/test/Analysis/CostModel/X86/fshl.ll
index 1cbdab09acd9..127dec0a1a6f 100644
--- a/llvm/test/Analysis/CostModel/X86/fshl.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshl.ll
@@ -2811,7 +2811,7 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %I8    = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
diff --git a/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll b/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll
index ecc861dd7f8e..92a20b938142 100644
--- a/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll
@@ -1597,9 +1597,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %I8    = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
@@ -2871,9 +2871,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %I8    = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
diff --git a/llvm/test/Analysis/CostModel/X86/fshr-latency.ll b/llvm/test/Analysis/CostModel/X86/fshr-latency.ll
index 0142ad77849c..33fadef536bf 100644
--- a/llvm/test/Analysis/CostModel/X86/fshr-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshr-latency.ll
@@ -1549,9 +1549,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %I8    = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
@@ -2823,9 +2823,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %I8    = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
diff --git a/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll
index 6dafb20a0aee..ef831328c480 100644
--- a/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll
@@ -1597,9 +1597,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %I8    = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
@@ -3111,9 +3111,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %I8    = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
diff --git a/llvm/test/Analysis/CostModel/X86/fshr.ll b/llvm/test/Analysis/CostModel/X86/fshr.ll
index ada1b9c5bdc4..3c233b51053d 100644
--- a/llvm/test/Analysis/CostModel/X86/fshr.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshr.ll
@@ -2811,7 +2811,7 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %I8    = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index cda9744a8d6b..06429a510711 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -56,17 +56,17 @@ define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
 ;
 ; LATE-LABEL: 'umul'
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'umul'
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'umul'
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %s = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
@@ -310,7 +310,7 @@ define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x
 
 define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) {
 ; THRU-LABEL: 'maskedgather'
-; THRU-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
+; THRU-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'maskedgather'
@@ -331,7 +331,7 @@ define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) {
 
 define void @maskedscatter(<16 x float> %va, <16 x ptr> %vb, <16 x i1> %vc) {
 ; THRU-LABEL: 'maskedscatter'
-; THRU-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc)
+; THRU-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc)
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'maskedscatter'
diff --git a/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
index 80b350254e94..b62d3fb25091 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
@@ -49,8 +49,8 @@ define void @test() {
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i32, ptr %inB, align 4
-; AVX512:  LV: Found an estimated cost of 10 for VF 2 For instruction: %valB.loaded = load i32, ptr %inB, align 4
-; AVX512:  LV: Found an estimated cost of 21 for VF 4 For instruction: %valB.loaded = load i32, ptr %inB, align 4
+; AVX512:  LV: Found an estimated cost of 8 for VF 2 For instruction: %valB.loaded = load i32, ptr %inB, align 4
+; AVX512:  LV: Found an estimated cost of 17 for VF 4 For instruction: %valB.loaded = load i32, ptr %inB, align 4
 ; AVX512:  LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i32, ptr %inB, align 4
 ; AVX512:  LV: Found an estimated cost of 18 for VF 16 For instruction: %valB.loaded = load i32, ptr %inB, align 4
 ; AVX512:  LV: Found an estimated cost of 36 for VF 32 For instruction: %valB.loaded = load i32, ptr %inB, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
index 9146ca498237..1d3e45765e51 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
@@ -49,8 +49,8 @@ define void @test() {
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i64, ptr %inB, align 8
-; AVX512:  LV: Found an estimated cost of 10 for VF 2 For instruction: %valB.loaded = load i64, ptr %inB, align 8
-; AVX512:  LV: Found an estimated cost of 22 for VF 4 For instruction: %valB.loaded = load i64, ptr %inB, align 8
+; AVX512:  LV: Found an estimated cost of 8 for VF 2 For instruction: %valB.loaded = load i64, ptr %inB, align 8
+; AVX512:  LV: Found an estimated cost of 18 for VF 4 For instruction: %valB.loaded = load i64, ptr %inB, align 8
 ; AVX512:  LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i64, ptr %inB, align 8
 ; AVX512:  LV: Found an estimated cost of 20 for VF 16 For instruction: %valB.loaded = load i64, ptr %inB, align 8
 ; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %valB.loaded = load i64, ptr %inB, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
index 827e503fe7b1..1e5c02afc2b3 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
@@ -840,20 +840,20 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_gather'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
@@ -871,7 +871,7 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
@@ -879,7 +879,7 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
@@ -898,7 +898,7 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
@@ -906,7 +906,7 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
@@ -1094,7 +1094,7 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
@@ -1102,7 +1102,7 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
@@ -1121,7 +1121,7 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
@@ -1129,7 +1129,7 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
@@ -1883,13 +1883,13 @@ define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0)
 
 define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
 ; SSE2-LABEL: 'test_gather_16f32_const_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_const_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -1909,7 +1909,7 @@ define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
 ; SKL-LABEL: 'test_gather_16f32_const_mask'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; AVX512-LABEL: 'test_gather_16f32_const_mask'
@@ -1927,13 +1927,13 @@ define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
 
 define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1>%mask) {
 ; SSE2-LABEL: 'test_gather_16f32_var_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_var_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -1953,7 +1953,7 @@ define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16
 ; SKL-LABEL: 'test_gather_16f32_var_mask'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; AVX512-LABEL: 'test_gather_16f32_var_mask'
@@ -1971,13 +1971,13 @@ define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16
 
 define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1>%mask) {
 ; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -1997,13 +1997,13 @@ define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32>
 ; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; AVX512-LABEL: 'test_gather_16f32_ra_var_mask'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
@@ -2017,7 +2017,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
 ; SSE2-LABEL: 'test_gather_16f32_const_mask2'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -2025,7 +2025,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
 ; SSE42-LABEL: 'test_gather_16f32_const_mask2'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -2051,7 +2051,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; AVX512-LABEL: 'test_gather_16f32_const_mask2'
@@ -2178,13 +2178,13 @@ define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
 
 define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
 ; SSE2-LABEL: 'test_gather_4f32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_4f32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
@@ -2228,13 +2228,13 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
 
 define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
 ; SSE2-LABEL: 'test_gather_4f32_const_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_4f32_const_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
index b0f9f6d32a56..25d12da306ab 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
@@ -732,118 +732,118 @@ define i32 @masked_store(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4,
 
 define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_gather'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_gather'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_gather'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 278 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 214 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_gather'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_gather'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
@@ -851,73 +851,73 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_gather'
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 283 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_gather'
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 283 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
   %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
@@ -955,192 +955,192 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 
 define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_scatter'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_scatter'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_scatter'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 278 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 214 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_scatter'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_scatter'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_scatter'
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 283 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_scatter'
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 283 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
   call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
@@ -1788,19 +1788,19 @@ define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) {
 
 define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0)  {
 ; SSE2-LABEL: 'test_gather_2f64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
 ; SSE42-LABEL: 'test_gather_2f64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
 ; AVX1-LABEL: 'test_gather_2f64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
 ; AVX2-LABEL: 'test_gather_2f64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
 ; SKL-LABEL: 'test_gather_2f64'
@@ -1808,7 +1808,7 @@ define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x doub
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
 ; AVX512-LABEL: 'test_gather_2f64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
   %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
@@ -1817,19 +1817,19 @@ define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x doub
 
 define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0)  {
 ; SSE2-LABEL: 'test_gather_4i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; SSE42-LABEL: 'test_gather_4i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; AVX1-LABEL: 'test_gather_4i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; AVX2-LABEL: 'test_gather_4i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; SKL-LABEL: 'test_gather_4i32'
@@ -1837,7 +1837,7 @@ define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %s
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; KNL-LABEL: 'test_gather_4i32'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; SKX-LABEL: 'test_gather_4i32'
@@ -1929,25 +1929,25 @@ define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16
 ; SSE2-LABEL: 'test_gather_16f32_var_mask'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_var_mask'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX1-LABEL: 'test_gather_16f32_var_mask'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_var_mask'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_var_mask'
@@ -1973,25 +1973,25 @@ define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32>
 ; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX1-LABEL: 'test_gather_16f32_ra_var_mask'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
@@ -2078,7 +2078,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_scatter_16i32'
@@ -2086,7 +2086,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_scatter_16i32'
@@ -2094,7 +2094,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_scatter_16i32'
@@ -2102,7 +2102,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SKL-LABEL: 'test_scatter_16i32'
@@ -2110,7 +2110,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; SKL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_scatter_16i32'
@@ -2132,15 +2132,15 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
 
 define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) {
 ; SSE2-LABEL: 'test_scatter_8i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_scatter_8i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_scatter_8i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_scatter_8i32'
@@ -2153,19 +2153,19 @@ define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) {
 
 define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
 ; SSE2-LABEL: 'test_scatter_4i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_scatter_4i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_scatter_4i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; KNL-LABEL: 'test_scatter_4i32'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SKX-LABEL: 'test_scatter_4i32'
@@ -2180,25 +2180,25 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
 ; SSE2-LABEL: 'test_gather_4f32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_4f32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
 ;
 ; AVX1-LABEL: 'test_gather_4f32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_4f32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_4f32'
@@ -2210,7 +2210,7 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
 ; KNL-LABEL: 'test_gather_4f32'
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
 ;
 ; SKX-LABEL: 'test_gather_4f32'
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
index 46123e9f6057..332d90ac4191 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -732,118 +732,118 @@ define i32 @masked_store(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4,
 
 define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_gather'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_gather'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_gather'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 278 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 214 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_gather'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_gather'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
@@ -851,73 +851,73 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_gather'
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 283 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_gather'
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 283 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
   %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
@@ -955,192 +955,192 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 
 define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_scatter'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_scatter'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_scatter'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 278 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 214 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_scatter'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_scatter'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_scatter'
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 283 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_scatter'
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 283 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
   call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
@@ -1788,19 +1788,19 @@ define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) {
 
 define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0)  {
 ; SSE2-LABEL: 'test_gather_2f64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
 ; SSE42-LABEL: 'test_gather_2f64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
 ; AVX1-LABEL: 'test_gather_2f64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
 ; AVX2-LABEL: 'test_gather_2f64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
 ; SKL-LABEL: 'test_gather_2f64'
@@ -1808,7 +1808,7 @@ define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x doub
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
 ; AVX512-LABEL: 'test_gather_2f64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
   %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
@@ -1817,19 +1817,19 @@ define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x doub
 
 define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0)  {
 ; SSE2-LABEL: 'test_gather_4i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; SSE42-LABEL: 'test_gather_4i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; AVX1-LABEL: 'test_gather_4i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; AVX2-LABEL: 'test_gather_4i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; SKL-LABEL: 'test_gather_4i32'
@@ -1837,7 +1837,7 @@ define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %s
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; KNL-LABEL: 'test_gather_4i32'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; SKX-LABEL: 'test_gather_4i32'
@@ -1929,25 +1929,25 @@ define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16
 ; SSE2-LABEL: 'test_gather_16f32_var_mask'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_var_mask'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX1-LABEL: 'test_gather_16f32_var_mask'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_var_mask'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_var_mask'
@@ -1973,25 +1973,25 @@ define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32>
 ; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX1-LABEL: 'test_gather_16f32_ra_var_mask'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
@@ -2078,7 +2078,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_scatter_16i32'
@@ -2086,7 +2086,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_scatter_16i32'
@@ -2094,7 +2094,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_scatter_16i32'
@@ -2102,7 +2102,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SKL-LABEL: 'test_scatter_16i32'
@@ -2110,7 +2110,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; SKL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_scatter_16i32'
@@ -2132,15 +2132,15 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
 
 define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) {
 ; SSE2-LABEL: 'test_scatter_8i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_scatter_8i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_scatter_8i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_scatter_8i32'
@@ -2153,19 +2153,19 @@ define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) {
 
 define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
 ; SSE2-LABEL: 'test_scatter_4i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_scatter_4i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_scatter_4i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; KNL-LABEL: 'test_scatter_4i32'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SKX-LABEL: 'test_scatter_4i32'
@@ -2180,25 +2180,25 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
 ; SSE2-LABEL: 'test_gather_4f32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_4f32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
 ;
 ; AVX1-LABEL: 'test_gather_4f32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_4f32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_4f32'
@@ -2210,7 +2210,7 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
 ; KNL-LABEL: 'test_gather_4f32'
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
 ;
 ; SKX-LABEL: 'test_gather_4f32'
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
index 8ca572ada8b7..14dc561edc34 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
@@ -840,22 +840,22 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_gather'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
@@ -867,20 +867,20 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_gather'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
@@ -894,21 +894,21 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_gather'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
@@ -1090,20 +1090,20 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_scatter'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
@@ -1117,21 +1117,21 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_scatter'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
@@ -1804,7 +1804,7 @@ define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x doub
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
 ;
 ; SKL-LABEL: 'test_gather_2f64'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
 ;
 ; AVX512-LABEL: 'test_gather_2f64'
@@ -1833,7 +1833,7 @@ define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %s
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
 ; SKL-LABEL: 'test_gather_4i32'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
 ; KNL-LABEL: 'test_gather_4i32'
@@ -1841,7 +1841,7 @@ define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %s
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
 ; SKX-LABEL: 'test_gather_4i32'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
   %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
@@ -1866,7 +1866,7 @@ define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
 ; SKL-LABEL: 'test_gather_4i32_const_mask'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
 ; KNL-LABEL: 'test_gather_4i32_const_mask'
@@ -1874,7 +1874,7 @@ define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
 ; SKX-LABEL: 'test_gather_4i32_const_mask'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
   %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
@@ -1883,13 +1883,13 @@ define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0)
 
 define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
 ; SSE2-LABEL: 'test_gather_16f32_const_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_const_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -1909,13 +1909,13 @@ define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
 ; SKL-LABEL: 'test_gather_16f32_const_mask'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; AVX512-LABEL: 'test_gather_16f32_const_mask'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
@@ -1927,13 +1927,13 @@ define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
 
 define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1>%mask) {
 ; SSE2-LABEL: 'test_gather_16f32_var_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_var_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -1953,13 +1953,13 @@ define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16
 ; SKL-LABEL: 'test_gather_16f32_var_mask'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; AVX512-LABEL: 'test_gather_16f32_var_mask'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
@@ -1971,13 +1971,13 @@ define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16
 
 define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1>%mask) {
 ; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -1997,13 +1997,13 @@ define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32>
 ; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; AVX512-LABEL: 'test_gather_16f32_ra_var_mask'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
@@ -2017,7 +2017,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
 ; SSE2-LABEL: 'test_gather_16f32_const_mask2'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -2025,7 +2025,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
 ; SSE42-LABEL: 'test_gather_16f32_const_mask2'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -2051,7 +2051,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; AVX512-LABEL: 'test_gather_16f32_const_mask2'
@@ -2059,7 +2059,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
   %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
@@ -2118,7 +2118,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
@@ -2144,7 +2144,7 @@ define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_scatter_8i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
@@ -2169,7 +2169,7 @@ define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SKX-LABEL: 'test_scatter_4i32'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
@@ -2178,13 +2178,13 @@ define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
 
 define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
 ; SSE2-LABEL: 'test_gather_4f32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_4f32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
@@ -2204,7 +2204,7 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
 ; SKL-LABEL: 'test_gather_4f32'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
 ;
 ; KNL-LABEL: 'test_gather_4f32'
@@ -2216,7 +2216,7 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
 ; SKX-LABEL: 'test_gather_4f32'
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
 ;
   %sext_ind = sext <4 x i32> %ind to <4 x i64>
@@ -2228,13 +2228,13 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
 
 define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
 ; SSE2-LABEL: 'test_gather_4f32_const_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_4f32_const_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
@@ -2254,7 +2254,7 @@ define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
 ; SKL-LABEL: 'test_gather_4f32_const_mask'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
 ;
 ; KNL-LABEL: 'test_gather_4f32_const_mask'
@@ -2266,7 +2266,7 @@ define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
 ; SKX-LABEL: 'test_gather_4f32_const_mask'
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
 ;
   %sext_ind = sext <4 x i32> %ind to <4 x i64>
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
index 07583d268c8a..a030068dfaf5 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
@@ -840,22 +840,22 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_gather'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
@@ -867,20 +867,20 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_gather'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
@@ -894,21 +894,21 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_gather'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
@@ -1090,20 +1090,20 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_scatter'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
@@ -1117,21 +1117,21 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_scatter'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
@@ -1804,7 +1804,7 @@ define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x doub
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
 ;
 ; SKL-LABEL: 'test_gather_2f64'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
 ;
 ; AVX512-LABEL: 'test_gather_2f64'
@@ -1833,7 +1833,7 @@ define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %s
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
 ; SKL-LABEL: 'test_gather_4i32'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
 ; KNL-LABEL: 'test_gather_4i32'
@@ -1841,7 +1841,7 @@ define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %s
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
 ; SKX-LABEL: 'test_gather_4i32'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
   %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
@@ -1866,7 +1866,7 @@ define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
 ; SKL-LABEL: 'test_gather_4i32_const_mask'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
 ; KNL-LABEL: 'test_gather_4i32_const_mask'
@@ -1874,7 +1874,7 @@ define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
 ; SKX-LABEL: 'test_gather_4i32_const_mask'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
 ;
   %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
@@ -1883,13 +1883,13 @@ define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0)
 
 define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
 ; SSE2-LABEL: 'test_gather_16f32_const_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_const_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -1909,13 +1909,13 @@ define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
 ; SKL-LABEL: 'test_gather_16f32_const_mask'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; AVX512-LABEL: 'test_gather_16f32_const_mask'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
@@ -1927,13 +1927,13 @@ define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
 
 define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1>%mask) {
 ; SSE2-LABEL: 'test_gather_16f32_var_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_var_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -1953,13 +1953,13 @@ define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16
 ; SKL-LABEL: 'test_gather_16f32_var_mask'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; AVX512-LABEL: 'test_gather_16f32_var_mask'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
@@ -1971,13 +1971,13 @@ define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16
 
 define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1>%mask) {
 ; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -1997,13 +1997,13 @@ define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32>
 ; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; AVX512-LABEL: 'test_gather_16f32_ra_var_mask'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
@@ -2017,7 +2017,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
 ; SSE2-LABEL: 'test_gather_16f32_const_mask2'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -2025,7 +2025,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
 ; SSE42-LABEL: 'test_gather_16f32_const_mask2'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
@@ -2051,7 +2051,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
 ; AVX512-LABEL: 'test_gather_16f32_const_mask2'
@@ -2059,7 +2059,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
 ;
   %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
@@ -2118,7 +2118,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
@@ -2144,7 +2144,7 @@ define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_scatter_8i32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
@@ -2169,7 +2169,7 @@ define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SKX-LABEL: 'test_scatter_4i32'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
@@ -2178,13 +2178,13 @@ define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
 
 define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
 ; SSE2-LABEL: 'test_gather_4f32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_4f32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
@@ -2204,7 +2204,7 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
 ; SKL-LABEL: 'test_gather_4f32'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
 ;
 ; KNL-LABEL: 'test_gather_4f32'
@@ -2216,7 +2216,7 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
 ; SKX-LABEL: 'test_gather_4f32'
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
 ;
   %sext_ind = sext <4 x i32> %ind to <4 x i64>
@@ -2228,13 +2228,13 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
 
 define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
 ; SSE2-LABEL: 'test_gather_4f32_const_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_4f32_const_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
@@ -2254,7 +2254,7 @@ define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
 ; SKL-LABEL: 'test_gather_4f32_const_mask'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
 ;
 ; KNL-LABEL: 'test_gather_4f32_const_mask'
@@ -2266,7 +2266,7 @@ define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
 ; SKX-LABEL: 'test_gather_4f32_const_mask'
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
 ;
   %sext_ind = sext <4 x i32> %ind to <4 x i64>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1-codesize.ll
index 9df928cfa053..5d567ccfc0b5 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1-codesize.ll
@@ -98,21 +98,21 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride2'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <2 x i32> zeroinitializer
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride2'
@@ -120,21 +120,21 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride2'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <2 x i32> zeroinitializer
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride2'
@@ -143,9 +143,9 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride2'
@@ -153,10 +153,10 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride2'
@@ -166,8 +166,8 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride2'
@@ -176,9 +176,9 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <2 x i32> zeroinitializer
@@ -274,44 +274,44 @@ define void @replication_i1_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride3'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride3'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride3'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride3'
@@ -319,21 +319,21 @@ define void @replication_i1_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride3'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride3'
@@ -342,9 +342,9 @@ define void @replication_i1_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride3'
@@ -352,10 +352,10 @@ define void @replication_i1_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
@@ -451,44 +451,44 @@ define void @replication_i1_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride4'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride4'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride4'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride4'
@@ -496,21 +496,21 @@ define void @replication_i1_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride4'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride4'
@@ -519,9 +519,9 @@ define void @replication_i1_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride4'
@@ -529,10 +529,10 @@ define void @replication_i1_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
@@ -627,67 +627,67 @@ define void @replication_i1_stride5() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i1_stride5'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride5'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride5'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride5'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride5'
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride5'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride5'
@@ -695,21 +695,21 @@ define void @replication_i1_stride5() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride5'
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
@@ -804,67 +804,67 @@ define void @replication_i1_stride6() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i1_stride6'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride6'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride6'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride6'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride6'
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride6'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride6'
@@ -872,21 +872,21 @@ define void @replication_i1_stride6() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride6'
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
@@ -981,67 +981,67 @@ define void @replication_i1_stride7() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i1_stride7'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride7'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride7'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride7'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride7'
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride7'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride7'
@@ -1049,21 +1049,21 @@ define void @replication_i1_stride7() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride7'
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
@@ -1158,67 +1158,67 @@ define void @replication_i1_stride8() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i1_stride8'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride8'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 272 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride8'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride8'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 272 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride8'
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride8'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride8'
@@ -1226,21 +1226,21 @@ define void @replication_i1_stride8() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride8'
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1-latency.ll
index 3dadf9efb3c3..deea2c34ddc2 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1-latency.ll
@@ -98,21 +98,21 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride2'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <2 x i32> zeroinitializer
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride2'
@@ -120,21 +120,21 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride2'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <2 x i32> zeroinitializer
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride2'
@@ -143,9 +143,9 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride2'
@@ -153,10 +153,10 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride2'
@@ -166,8 +166,8 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride2'
@@ -176,9 +176,9 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <2 x i32> zeroinitializer
@@ -274,44 +274,44 @@ define void @replication_i1_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride3'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride3'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride3'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride3'
@@ -319,21 +319,21 @@ define void @replication_i1_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride3'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride3'
@@ -342,9 +342,9 @@ define void @replication_i1_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride3'
@@ -352,10 +352,10 @@ define void @replication_i1_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
@@ -451,44 +451,44 @@ define void @replication_i1_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride4'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 272 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride4'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride4'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 272 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride4'
@@ -496,21 +496,21 @@ define void @replication_i1_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride4'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride4'
@@ -519,9 +519,9 @@ define void @replication_i1_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride4'
@@ -529,10 +529,10 @@ define void @replication_i1_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
@@ -627,67 +627,67 @@ define void @replication_i1_stride5() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i1_stride5'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride5'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 242 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride5'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride5'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 242 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride5'
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride5'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride5'
@@ -695,21 +695,21 @@ define void @replication_i1_stride5() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride5'
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
@@ -804,67 +804,67 @@ define void @replication_i1_stride6() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i1_stride6'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride6'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 290 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride6'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride6'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 290 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 432 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride6'
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride6'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride6'
@@ -872,21 +872,21 @@ define void @replication_i1_stride6() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride6'
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
@@ -981,67 +981,67 @@ define void @replication_i1_stride7() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i1_stride7'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 170 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride7'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 170 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 338 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride7'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 170 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride7'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 170 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 338 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride7'
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride7'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride7'
@@ -1049,21 +1049,21 @@ define void @replication_i1_stride7() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride7'
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
@@ -1158,67 +1158,67 @@ define void @replication_i1_stride8() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i1_stride8'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride8'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 386 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 528 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride8'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride8'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 386 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 528 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride8'
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride8'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride8'
@@ -1226,21 +1226,21 @@ define void @replication_i1_stride8() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride8'
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1-sizelatency.ll
index 6b1d2bb21e94..0044202c5fb3 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i1-sizelatency.ll
@@ -98,21 +98,21 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride2'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <2 x i32> zeroinitializer
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride2'
@@ -120,21 +120,21 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride2'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <2 x i32> zeroinitializer
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride2'
@@ -143,9 +143,9 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride2'
@@ -153,10 +153,10 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride2'
@@ -166,8 +166,8 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride2'
@@ -176,9 +176,9 @@ define void @replication_i1_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <2 x i32> zeroinitializer
@@ -274,44 +274,44 @@ define void @replication_i1_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride3'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride3'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride3'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride3'
@@ -319,21 +319,21 @@ define void @replication_i1_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride3'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride3'
@@ -342,9 +342,9 @@ define void @replication_i1_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride3'
@@ -352,10 +352,10 @@ define void @replication_i1_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <3 x i32> zeroinitializer
@@ -451,44 +451,44 @@ define void @replication_i1_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride4'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride4'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride4'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride4'
@@ -496,21 +496,21 @@ define void @replication_i1_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride4'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride4'
@@ -519,9 +519,9 @@ define void @replication_i1_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride4'
@@ -529,10 +529,10 @@ define void @replication_i1_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <4 x i32> zeroinitializer
@@ -627,67 +627,67 @@ define void @replication_i1_stride5() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i1_stride5'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride5'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride5'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride5'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride5'
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride5'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride5'
@@ -695,21 +695,21 @@ define void @replication_i1_stride5() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride5'
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <5 x i32> zeroinitializer
@@ -804,67 +804,67 @@ define void @replication_i1_stride6() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i1_stride6'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride6'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride6'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride6'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride6'
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride6'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride6'
@@ -872,21 +872,21 @@ define void @replication_i1_stride6() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride6'
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <6 x i32> zeroinitializer
@@ -981,67 +981,67 @@ define void @replication_i1_stride7() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i1_stride7'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride7'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride7'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride7'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride7'
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride7'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride7'
@@ -1049,21 +1049,21 @@ define void @replication_i1_stride7() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride7'
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <7 x i32> zeroinitializer
@@ -1158,67 +1158,67 @@ define void @replication_i1_stride8() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i1_stride8'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i1_stride8'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 272 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC512-LABEL: 'replication_i1_stride8'
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQVEC256-LABEL: 'replication_i1_stride8'
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 272 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC512-LABEL: 'replication_i1_stride8'
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BWVEC256-LABEL: 'replication_i1_stride8'
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC512-LABEL: 'replication_i1_stride8'
@@ -1226,21 +1226,21 @@ define void @replication_i1_stride8() nounwind "min-legal-vector-width"="256" {
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMIVEC256-LABEL: 'replication_i1_stride8'
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i1> undef, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i1> undef, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i1> undef, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i1> undef, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf32 = shufflevector <32 x i1> undef, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i1> undef, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vf128 = shufflevector <128 x i1> undef, <128 x i1> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512VBMIVEC256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %vf1 = shufflevector <1 x i1> undef, <1 x i1> poison, <8 x i32> zeroinitializer
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16-codesize.ll
index 3db892e2e7b4..fd56574c08c4 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16-codesize.ll
@@ -77,9 +77,9 @@ define void @replication_i16_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride2'
@@ -187,10 +187,10 @@ define void @replication_i16_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <3 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride3'
@@ -298,10 +298,10 @@ define void @replication_i16_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <4 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride4'
@@ -408,11 +408,11 @@ define void @replication_i16_stride5() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i16_stride5'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <5 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride5'
@@ -519,11 +519,11 @@ define void @replication_i16_stride6() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i16_stride6'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <6 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride6'
@@ -630,11 +630,11 @@ define void @replication_i16_stride7() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i16_stride7'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <7 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride7'
@@ -741,11 +741,11 @@ define void @replication_i16_stride8() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i16_stride8'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <8 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride8'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16-latency.ll
index d1e7da364629..3b3de6eb9473 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16-latency.ll
@@ -77,9 +77,9 @@ define void @replication_i16_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride2'
@@ -187,10 +187,10 @@ define void @replication_i16_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <3 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride3'
@@ -298,10 +298,10 @@ define void @replication_i16_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <4 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride4'
@@ -408,11 +408,11 @@ define void @replication_i16_stride5() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i16_stride5'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <5 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride5'
@@ -519,11 +519,11 @@ define void @replication_i16_stride6() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i16_stride6'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <6 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride6'
@@ -630,11 +630,11 @@ define void @replication_i16_stride7() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i16_stride7'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <7 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride7'
@@ -741,11 +741,11 @@ define void @replication_i16_stride8() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i16_stride8'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <8 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride8'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16-sizelatency.ll
index fdc687b12d29..9fe412970b74 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16-sizelatency.ll
@@ -77,9 +77,9 @@ define void @replication_i16_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride2'
@@ -187,10 +187,10 @@ define void @replication_i16_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <3 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride3'
@@ -298,10 +298,10 @@ define void @replication_i16_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <4 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride4'
@@ -408,11 +408,11 @@ define void @replication_i16_stride5() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i16_stride5'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <5 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride5'
@@ -519,11 +519,11 @@ define void @replication_i16_stride6() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i16_stride6'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <6 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride6'
@@ -630,11 +630,11 @@ define void @replication_i16_stride7() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i16_stride7'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <7 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride7'
@@ -741,11 +741,11 @@ define void @replication_i16_stride8() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i16_stride8'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <8 x i32> zeroinitializer
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i16_stride8'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i8-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i8-codesize.ll
index 6c82ad5b39b5..164df18b9f03 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i8-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i8-codesize.ll
@@ -76,17 +76,17 @@ define void @replication_i8_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride2'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
@@ -186,18 +186,18 @@ define void @replication_i8_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i8_stride3'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride3'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
@@ -297,18 +297,18 @@ define void @replication_i8_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i8_stride4'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride4'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
@@ -407,19 +407,19 @@ define void @replication_i8_stride5() nounwind "min-legal-vector-width"="256" {
 ;
 ; AVX512FVEC512-LABEL: 'replication_i8_stride5'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride5'
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
@@ -518,19 +518,19 @@ define void @replication_i8_stride6() nounwind "min-legal-vector-width"="256" {
 ;
 ; AVX512FVEC512-LABEL: 'replication_i8_stride6'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 113 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride6'
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
@@ -629,19 +629,19 @@ define void @replication_i8_stride7() nounwind "min-legal-vector-width"="256" {
 ;
 ; AVX512FVEC512-LABEL: 'replication_i8_stride7'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 121 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride7'
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
@@ -740,19 +740,19 @@ define void @replication_i8_stride8() nounwind "min-legal-vector-width"="256" {
 ;
 ; AVX512FVEC512-LABEL: 'replication_i8_stride8'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride8'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i8-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i8-latency.ll
index 8d44d1e56c2b..aa4f8db955a8 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i8-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i8-latency.ll
@@ -76,17 +76,17 @@ define void @replication_i8_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride2'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
@@ -186,18 +186,18 @@ define void @replication_i8_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i8_stride3'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride3'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
@@ -297,18 +297,18 @@ define void @replication_i8_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i8_stride4'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride4'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
@@ -407,19 +407,19 @@ define void @replication_i8_stride5() nounwind "min-legal-vector-width"="256" {
 ;
 ; AVX512FVEC512-LABEL: 'replication_i8_stride5'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 185 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride5'
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 122 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 242 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
@@ -518,19 +518,19 @@ define void @replication_i8_stride6() nounwind "min-legal-vector-width"="256" {
 ;
 ; AVX512FVEC512-LABEL: 'replication_i8_stride6'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 209 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride6'
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 290 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
@@ -629,19 +629,19 @@ define void @replication_i8_stride7() nounwind "min-legal-vector-width"="256" {
 ;
 ; AVX512FVEC512-LABEL: 'replication_i8_stride7'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 170 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 233 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride7'
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 170 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 338 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
@@ -740,19 +740,19 @@ define void @replication_i8_stride8() nounwind "min-legal-vector-width"="256" {
 ;
 ; AVX512FVEC512-LABEL: 'replication_i8_stride8'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 257 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride8'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 386 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i8-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i8-sizelatency.ll
index 232a84ea678f..f50b64f4ffe2 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i8-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i8-sizelatency.ll
@@ -76,17 +76,17 @@ define void @replication_i8_stride2() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride2'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <256 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63, i32 64, i32 64, i32 65, i32 65, i32 66, i32 66, i32 67, i32 67, i32 68, i32 68, i32 69, i32 69, i32 70, i32 70, i32 71, i32 71, i32 72, i32 72, i32 73, i32 73, i32 74, i32 74, i32 75, i32 75, i32 76, i32 76, i32 77, i32 77, i32 78, i32 78, i32 79, i32 79, i32 80, i32 80, i32 81, i32 81, i32 82, i32 82, i32 83, i32 83, i32 84, i32 84, i32 85, i32 85, i32 86, i32 86, i32 87, i32 87, i32 88, i32 88, i32 89, i32 89, i32 90, i32 90, i32 91, i32 91, i32 92, i32 92, i32 93, i32 93, i32 94, i32 94, i32 95, i32 95, i32 96, i32 96, i32 97, i32 97, i32 98, i32 98, i32 99, i32 99, i32 100, i32 100, i32 101, i32 101, i32 102, i32 102, i32 103, i32 103, i32 104, i32 104, i32 105, i32 105, i32 106, i32 106, i32 107, i32 107, i32 108, i32 108, i32 109, i32 109, i32 110, i32 110, i32 111, i32 111, i32 112, i32 112, i32 113, i32 113, i32 114, i32 114, i32 115, i32 115, i32 116, i32 116, i32 117, i32 117, i32 118, i32 118, i32 119, i32 119, i32 120, i32 120, i32 121, i32 121, i32 122, i32 122, i32 123, i32 123, i32 124, i32 124, i32 125, i32 125, i32 126, i32 126, i32 127, i32 127>
@@ -186,18 +186,18 @@ define void @replication_i8_stride3() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i8_stride3'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride3'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127>
@@ -297,18 +297,18 @@ define void @replication_i8_stride4() nounwind "min-legal-vector-width"="256" {
 ; AVX512FVEC512-LABEL: 'replication_i8_stride4'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride4'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127>
@@ -407,19 +407,19 @@ define void @replication_i8_stride5() nounwind "min-legal-vector-width"="256" {
 ;
 ; AVX512FVEC512-LABEL: 'replication_i8_stride5'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride5'
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <640 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127>
@@ -518,19 +518,19 @@ define void @replication_i8_stride6() nounwind "min-legal-vector-width"="256" {
 ;
 ; AVX512FVEC512-LABEL: 'replication_i8_stride6'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 113 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride6'
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <768 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
@@ -629,19 +629,19 @@ define void @replication_i8_stride7() nounwind "min-legal-vector-width"="256" {
 ;
 ; AVX512FVEC512-LABEL: 'replication_i8_stride7'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 121 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride7'
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <896 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
@@ -740,19 +740,19 @@ define void @replication_i8_stride8() nounwind "min-legal-vector-width"="256" {
 ;
 ; AVX512FVEC512-LABEL: 'replication_i8_stride8'
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512FVEC256-LABEL: 'replication_i8_stride8'
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i8> undef, <2 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vf4 = shufflevector <4 x i8> undef, <4 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %vf8 = shufflevector <8 x i8> undef, <8 x i8> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %vf16 = shufflevector <16 x i8> undef, <16 x i8> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vf32 = shufflevector <32 x i8> undef, <32 x i8> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %vf64 = shufflevector <64 x i8> undef, <64 x i8> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %vf128 = shufflevector <128 x i8> undef, <128 x i8> poison, <1024 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 67, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 68, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 69, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 70, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 71, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 72, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 73, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 74, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 75, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 76, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 77, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 78, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 79, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 81, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 82, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 83, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 84, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 85, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 86, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 87, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 88, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 89, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 90, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 91, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 92, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 93, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 94, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 97, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 98, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 99, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 101, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 102, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 103, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 104, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 105, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 106, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 107, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 108, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 109, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 110, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 111, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 113, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 114, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 115, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 116, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 118, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 119, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 120, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 121, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 122, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 123, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 124, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 125, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 126, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
diff --git a/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll b/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll
index 9c4abfef52d9..a2f46bc18126 100644
--- a/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll
@@ -16,13 +16,37 @@
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX
 
 define i32 @trunc_vXi32() "min-legal-vector-width"="256" {
-; CHECK-LABEL: 'trunc_vXi32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE-LABEL: 'trunc_vXi32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'trunc_vXi32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'trunc_vXi32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX256-LABEL: 'trunc_vXi32'
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %i64 = trunc i64 undef to i32
   %V2i64 = trunc <2 x i64> undef to <2 x i32>
@@ -33,72 +57,273 @@ define i32 @trunc_vXi32() "min-legal-vector-width"="256" {
 }
 
 define i32 @trunc_vXi16() "min-legal-vector-width"="256" {
-; CHECK-LABEL: 'trunc_vXi16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE-LABEL: 'trunc_vXi16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'trunc_vXi16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'trunc_vXi16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX256-LABEL: 'trunc_vXi16'
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %i64 = trunc i64 undef to i16
   %V2i64 = trunc <2 x i64> undef to <2 x i16>
@@ -170,107 +395,413 @@ define i32 @trunc_vXi16() "min-legal-vector-width"="256" {
 }
 
 define i32 @trunc_vXi8() "min-legal-vector-width"="256" {
-; CHECK-LABEL: 'trunc_vXi8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE-LABEL: 'trunc_vXi8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'trunc_vXi8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'trunc_vXi8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX256-LABEL: 'trunc_vXi8'
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %i64 = trunc i64 undef to i8
   %V2i64 = trunc <2 x i64> undef to <2 x i8>
@@ -381,74 +912,74 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; SSE-LABEL: 'trunc_vXi1'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
@@ -457,33 +988,33 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
@@ -526,13 +1057,13 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
@@ -565,10 +1096,10 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
@@ -811,5 +1342,4 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
   ret i32 undef
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX256: {{.*}}
-; AVX512: {{.*}}
+; CHECK: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/X86/trunc-latency.ll b/llvm/test/Analysis/CostModel/X86/trunc-latency.ll
index f6092b9f5bab..a904c7946ba5 100644
--- a/llvm/test/Analysis/CostModel/X86/trunc-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/trunc-latency.ll
@@ -16,13 +16,37 @@
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX
 
 define i32 @trunc_vXi32() "min-legal-vector-width"="256" {
-; CHECK-LABEL: 'trunc_vXi32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE-LABEL: 'trunc_vXi32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'trunc_vXi32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'trunc_vXi32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX256-LABEL: 'trunc_vXi32'
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %i64 = trunc i64 undef to i32
   %V2i64 = trunc <2 x i64> undef to <2 x i32>
@@ -33,72 +57,273 @@ define i32 @trunc_vXi32() "min-legal-vector-width"="256" {
 }
 
 define i32 @trunc_vXi16() "min-legal-vector-width"="256" {
-; CHECK-LABEL: 'trunc_vXi16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE-LABEL: 'trunc_vXi16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'trunc_vXi16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'trunc_vXi16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX256-LABEL: 'trunc_vXi16'
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %i64 = trunc i64 undef to i16
   %V2i64 = trunc <2 x i64> undef to <2 x i16>
@@ -170,107 +395,413 @@ define i32 @trunc_vXi16() "min-legal-vector-width"="256" {
 }
 
 define i32 @trunc_vXi8() "min-legal-vector-width"="256" {
-; CHECK-LABEL: 'trunc_vXi8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE-LABEL: 'trunc_vXi8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'trunc_vXi8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'trunc_vXi8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX256-LABEL: 'trunc_vXi8'
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %i64 = trunc i64 undef to i8
   %V2i64 = trunc <2 x i64> undef to <2 x i8>
@@ -381,74 +912,74 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; SSE-LABEL: 'trunc_vXi1'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
@@ -457,33 +988,33 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
@@ -526,13 +1057,13 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
@@ -565,10 +1096,10 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
@@ -811,5 +1342,4 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
   ret i32 undef
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX256: {{.*}}
-; AVX512: {{.*}}
+; CHECK: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll
index 49ed2902da45..ed8706f8a6ca 100644
--- a/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll
@@ -16,13 +16,37 @@
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX
 
 define i32 @trunc_vXi32() "min-legal-vector-width"="256" {
-; CHECK-LABEL: 'trunc_vXi32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE-LABEL: 'trunc_vXi32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'trunc_vXi32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'trunc_vXi32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX256-LABEL: 'trunc_vXi32'
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i32
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %i64 = trunc i64 undef to i32
   %V2i64 = trunc <2 x i64> undef to <2 x i32>
@@ -33,72 +57,273 @@ define i32 @trunc_vXi32() "min-legal-vector-width"="256" {
 }
 
 define i32 @trunc_vXi16() "min-legal-vector-width"="256" {
-; CHECK-LABEL: 'trunc_vXi16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE-LABEL: 'trunc_vXi16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'trunc_vXi16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'trunc_vXi16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX256-LABEL: 'trunc_vXi16'
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i16
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i16
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i16>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %i64 = trunc i64 undef to i16
   %V2i64 = trunc <2 x i64> undef to <2 x i16>
@@ -170,107 +395,413 @@ define i32 @trunc_vXi16() "min-legal-vector-width"="256" {
 }
 
 define i32 @trunc_vXi8() "min-legal-vector-width"="256" {
-; CHECK-LABEL: 'trunc_vXi8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE-LABEL: 'trunc_vXi8'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'trunc_vXi8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'trunc_vXi8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; AVX256-LABEL: 'trunc_vXi8'
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i8
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i64 = trunc <96 x i64> undef to <96 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i8
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i32 = trunc <96 x i32> undef to <96 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V96i16 = trunc <96 x i16> undef to <96 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i8>
+; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %i64 = trunc i64 undef to i8
   %V2i64 = trunc <2 x i64> undef to <2 x i8>
@@ -381,74 +912,74 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; SSE-LABEL: 'trunc_vXi1'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
@@ -457,33 +988,33 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
@@ -526,13 +1057,13 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
@@ -565,10 +1096,10 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
@@ -811,5 +1342,4 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
   ret i32 undef
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX256: {{.*}}
-; AVX512: {{.*}}
+; CHECK: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-codesize.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-codesize.ll
index a3c24bdd1a88..9ff975665f13 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-codesize.ll
@@ -1676,7 +1676,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
   %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
@@ -1713,7 +1713,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
   %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
@@ -1750,7 +1750,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
   %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-latency.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-latency.ll
index cd4189d4a7f8..ab300779b434 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-latency.ll
@@ -1806,7 +1806,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
   %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
@@ -1847,7 +1847,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
   %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
@@ -1888,7 +1888,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
   %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-sizelatency.ll
index 84ccad029415..1b51a2e0a1e6 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-sizelatency.ll
@@ -1700,7 +1700,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
   %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
@@ -1741,7 +1741,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
   %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
@@ -1782,7 +1782,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
   %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-codesize.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-codesize.ll
index a0e15bb8ff73..644fcbbfefdf 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-codesize.ll
@@ -1619,9 +1619,17 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
-; AVX512-LABEL: 'splatconstant_shift_v16i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+; AVX512F-LABEL: 'splatconstant_shift_v16i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512BW-LABEL: 'splatconstant_shift_v16i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8'
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
   %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   ret <16 x i8> %shift
@@ -1652,9 +1660,17 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
-; AVX512-LABEL: 'splatconstant_shift_v32i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+; AVX512F-LABEL: 'splatconstant_shift_v32i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+;
+; AVX512BW-LABEL: 'splatconstant_shift_v32i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
   %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   ret <32 x i8> %shift
@@ -1694,7 +1710,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
   %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-latency.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-latency.ll
index 61620e2cc97e..f879a09d067e 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-latency.ll
@@ -1778,7 +1778,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
   %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
@@ -1806,9 +1806,17 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
-; AVX512-LABEL: 'splatconstant_shift_v32i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+; AVX512F-LABEL: 'splatconstant_shift_v32i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+;
+; AVX512BW-LABEL: 'splatconstant_shift_v32i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
   %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   ret <32 x i8> %shift
@@ -1844,7 +1852,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
   %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-sizelatency.ll
index e6b6ac75b65d..fe472342e214 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-sizelatency.ll
@@ -1635,9 +1635,17 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
-; AVX512-LABEL: 'splatconstant_shift_v16i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+; AVX512F-LABEL: 'splatconstant_shift_v16i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512BW-LABEL: 'splatconstant_shift_v16i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8'
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
   %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   ret <16 x i8> %shift
@@ -1677,7 +1685,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
   %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
@@ -1718,7 +1726,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
   %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll
index 265658b1e3a2..1045b827da7c 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll
@@ -1593,9 +1593,17 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
-; AVX512-LABEL: 'splatconstant_shift_v16i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+; AVX512F-LABEL: 'splatconstant_shift_v16i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512BW-LABEL: 'splatconstant_shift_v16i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8'
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
   %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   ret <16 x i8> %shift
@@ -1626,9 +1634,17 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
-; AVX512-LABEL: 'splatconstant_shift_v32i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+; AVX512F-LABEL: 'splatconstant_shift_v32i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+;
+; AVX512BW-LABEL: 'splatconstant_shift_v32i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
   %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   ret <32 x i8> %shift
@@ -1668,7 +1684,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
   %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-latency.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-latency.ll
index 42c91144ff6f..3ae71daf50a3 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-shl-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-latency.ll
@@ -1738,7 +1738,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
   %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
@@ -1766,9 +1766,17 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
-; AVX512-LABEL: 'splatconstant_shift_v32i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+; AVX512F-LABEL: 'splatconstant_shift_v32i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+;
+; AVX512BW-LABEL: 'splatconstant_shift_v32i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
   %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   ret <32 x i8> %shift
@@ -1804,7 +1812,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
   %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-sizelatency.ll
index 47b24df063ef..4256a73a7cf7 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-shl-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-sizelatency.ll
@@ -1691,9 +1691,17 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
-; AVX512-LABEL: 'splatconstant_shift_v16i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+; AVX512F-LABEL: 'splatconstant_shift_v16i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512BW-LABEL: 'splatconstant_shift_v16i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8'
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
   %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   ret <16 x i8> %shift
@@ -1729,7 +1737,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
   %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
@@ -1766,7 +1774,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 ; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
-; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
 ; AVX512GFNI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
   %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll
index 05882a7b0a7d..cb50b2c75ccb 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll
@@ -23,14 +23,13 @@
 
 ; CHECK: function 'Test':
 ; CHECK:   .inner:
-; CHECK-NEXT:     Memory dependences are safe
+; CHECK-NEXT:     Memory dependences are safe with run-time checks
 ; CHECK-NEXT:     Dependences:
 ; CHECK-NEXT:     Run-time memory checks:
 ; CHECK:          Check 0:
 ; CHECK:          Check 1:
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
 
 %struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] }
 
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-distance-backward.ll b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-distance-backward.ll
new file mode 100644
index 000000000000..0058135a30d6
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-distance-backward.ll
@@ -0,0 +1,284 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes='print<access-info>' -disable-output %s 2>&1 | FileCheck --check-prefixes=COMMON,MAXLEN %s
+; RUN: opt -passes='print<access-info>' -disable-output -mtriple=arm64-apple-macosx %s 2>&1 | FileCheck --check-prefixes=COMMON,VW128 %s
+; RUN: opt -passes='print<access-info>' -disable-output -mtriple=arm64-apple-macosx -mattr=+sve %s 2>&1 | FileCheck --check-prefixes=COMMON,MAXLEN %s
+
+; REQUIRES: aarch64-registered-target
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; If the dependence distance is not a constant, whether it gets identified as backwards or unknown depends on the minimum distance and the target's vector length.
+
+define void @backward_min_distance_8(ptr %A, i64 %N) {
+; COMMON-LABEL: 'backward_min_distance_8'
+; COMMON-NEXT:    loop:
+; COMMON-NEXT:      Memory dependences are safe with run-time checks
+; COMMON-NEXT:      Dependences:
+; COMMON-NEXT:      Run-time memory checks:
+; COMMON-NEXT:      Check 0:
+; COMMON-NEXT:        Comparing group ([[GRP1:0x[0-9a-f]+]]):
+; COMMON-NEXT:          %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv
+; COMMON-NEXT:        Against group ([[GRP2:0x[0-9a-f]+]]):
+; COMMON-NEXT:          %gep = getelementptr inbounds i8, ptr %A, i64 %iv
+; COMMON-NEXT:      Grouped accesses:
+; COMMON-NEXT:        Group [[GRP1]]:
+; COMMON-NEXT:          (Low: {(1 + %A)<nuw>,+,1}<nuw><%outer.header> High: {(257 + %A),+,1}<nw><%outer.header>)
+; COMMON-NEXT:            Member: {{\{\{}}(1 + %A)<nuw>,+,1}<nuw><%outer.header>,+,1}<nuw><%loop>
+; COMMON-NEXT:        Group [[GRP2]]:
+; COMMON-NEXT:          (Low: %A High: (256 + %A))
+; COMMON-NEXT:            Member: {%A,+,1}<nuw><%loop>
+; COMMON-EMPTY:
+; COMMON-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; COMMON-NEXT:      SCEV assumptions:
+; COMMON-EMPTY:
+; COMMON-NEXT:      Expressions re-written:
+; COMMON-NEXT:    outer.header:
+; COMMON-NEXT:      Report: loop is not the innermost loop
+; COMMON-NEXT:      Dependences:
+; COMMON-NEXT:      Run-time memory checks:
+; COMMON-NEXT:      Grouped accesses:
+; COMMON-EMPTY:
+; COMMON-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; COMMON-NEXT:      SCEV assumptions:
+; COMMON-EMPTY:
+; COMMON-NEXT:      Expressions re-written:
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi i64 [ 1, %entry ], [ %outer.iv.next, %outer.latch ]
+  %gep.off = getelementptr inbounds i8, ptr %A, i64 %outer.iv
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %outer.header ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds i8, ptr %A, i64 %iv
+  %l = load i8 , ptr %gep, align 4
+  %add = add nsw i8 %l, 5
+  %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv
+  store i8 %add, ptr %gep.off.iv, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 256
+  br i1 %exitcond.not, label %outer.latch, label %loop
+
+outer.latch:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %ec.2 = icmp eq i64 %outer.iv.next, %N
+  br i1 %ec.2, label %exit, label %outer.header
+
+exit:
+  ret void
+}
+
+define void @backward_min_distance_120(ptr %A, i64 %N) {
+; COMMON-LABEL: 'backward_min_distance_120'
+; COMMON-NEXT:    loop:
+; COMMON-NEXT:      Memory dependences are safe with run-time checks
+; COMMON-NEXT:      Dependences:
+; COMMON-NEXT:      Run-time memory checks:
+; COMMON-NEXT:      Check 0:
+; COMMON-NEXT:        Comparing group ([[GRP3:0x[0-9a-f]+]]):
+; COMMON-NEXT:          %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv
+; COMMON-NEXT:        Against group ([[GRP4:0x[0-9a-f]+]]):
+; COMMON-NEXT:          %gep = getelementptr inbounds i8, ptr %A, i64 %iv
+; COMMON-NEXT:      Grouped accesses:
+; COMMON-NEXT:        Group [[GRP3]]:
+; COMMON-NEXT:          (Low: {(15 + %A)<nuw>,+,1}<nuw><%outer.header> High: {(271 + %A),+,1}<nw><%outer.header>)
+; COMMON-NEXT:            Member: {{\{\{}}(15 + %A)<nuw>,+,1}<nuw><%outer.header>,+,1}<nuw><%loop>
+; COMMON-NEXT:        Group [[GRP4]]:
+; COMMON-NEXT:          (Low: %A High: (256 + %A))
+; COMMON-NEXT:            Member: {%A,+,1}<nuw><%loop>
+; COMMON-EMPTY:
+; COMMON-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; COMMON-NEXT:      SCEV assumptions:
+; COMMON-EMPTY:
+; COMMON-NEXT:      Expressions re-written:
+; COMMON-NEXT:    outer.header:
+; COMMON-NEXT:      Report: loop is not the innermost loop
+; COMMON-NEXT:      Dependences:
+; COMMON-NEXT:      Run-time memory checks:
+; COMMON-NEXT:      Grouped accesses:
+; COMMON-EMPTY:
+; COMMON-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; COMMON-NEXT:      SCEV assumptions:
+; COMMON-EMPTY:
+; COMMON-NEXT:      Expressions re-written:
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi i64 [ 15, %entry ], [ %outer.iv.next, %outer.latch ]
+  %gep.off = getelementptr inbounds i8, ptr %A, i64 %outer.iv
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %outer.header ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds i8, ptr %A, i64 %iv
+  %l = load i8 , ptr %gep, align 4
+  %add = add nsw i8 %l, 5
+  %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv
+  store i8 %add, ptr %gep.off.iv, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 256
+  br i1 %exitcond.not, label %outer.latch, label %loop
+
+outer.latch:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %ec.2 = icmp eq i64 %outer.iv.next, %N
+  br i1 %ec.2, label %exit, label %outer.header
+
+exit:
+  ret void
+}
+
+define void @backward_min_distance_128(ptr %A, i64 %N) {
+; COMMON-LABEL: 'backward_min_distance_128'
+; COMMON-NEXT:    loop:
+; COMMON-NEXT:      Memory dependences are safe with run-time checks
+; COMMON-NEXT:      Dependences:
+; COMMON-NEXT:      Run-time memory checks:
+; COMMON-NEXT:      Check 0:
+; COMMON-NEXT:        Comparing group ([[GRP13:0x[0-9a-f]+]]):
+; COMMON-NEXT:          %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv
+; COMMON-NEXT:        Against group ([[GRP14:0x[0-9a-f]+]]):
+; COMMON-NEXT:          %gep = getelementptr inbounds i8, ptr %A, i64 %iv
+; COMMON-NEXT:      Grouped accesses:
+; COMMON-NEXT:        Group [[GRP13]]:
+; COMMON-NEXT:          (Low: {(16 + %A)<nuw>,+,1}<nuw><%outer.header> High: {(272 + %A),+,1}<nw><%outer.header>)
+; COMMON-NEXT:            Member: {{\{\{}}(16 + %A)<nuw>,+,1}<nuw><%outer.header>,+,1}<nuw><%loop>
+; COMMON-NEXT:        Group [[GRP14]]:
+; COMMON-NEXT:          (Low: %A High: (256 + %A))
+; COMMON-NEXT:            Member: {%A,+,1}<nuw><%loop>
+; COMMON-EMPTY:
+; COMMON-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; COMMON-NEXT:      SCEV assumptions:
+; COMMON-EMPTY:
+; COMMON-NEXT:      Expressions re-written:
+; COMMON-NEXT:    outer.header:
+; COMMON-NEXT:      Report: loop is not the innermost loop
+; COMMON-NEXT:      Dependences:
+; COMMON-NEXT:      Run-time memory checks:
+; COMMON-NEXT:      Grouped accesses:
+; COMMON-EMPTY:
+; COMMON-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; COMMON-NEXT:      SCEV assumptions:
+; COMMON-EMPTY:
+; COMMON-NEXT:      Expressions re-written:
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi i64 [ 16, %entry ], [ %outer.iv.next, %outer.latch ]
+  %gep.off = getelementptr inbounds i8, ptr %A, i64 %outer.iv
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %outer.header ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds i8, ptr %A, i64 %iv
+  %l = load i8 , ptr %gep, align 4
+  %add = add nsw i8 %l, 5
+  %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv
+  store i8 %add, ptr %gep.off.iv, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 256
+  br i1 %exitcond.not, label %outer.latch, label %loop
+
+outer.latch:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %ec.2 = icmp eq i64 %outer.iv.next, %N
+  br i1 %ec.2, label %exit, label %outer.header
+
+exit:
+  ret void
+}
+
+define void @backward_min_distance_256(ptr %A, i64 %N) {
+; MAXLEN-LABEL: 'backward_min_distance_256'
+; MAXLEN-NEXT:    loop:
+; MAXLEN-NEXT:      Memory dependences are safe with run-time checks
+; MAXLEN-NEXT:      Dependences:
+; MAXLEN-NEXT:      Run-time memory checks:
+; MAXLEN-NEXT:      Check 0:
+; MAXLEN-NEXT:        Comparing group ([[GRP17:0x[0-9a-f]+]]):
+; MAXLEN-NEXT:          %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv
+; MAXLEN-NEXT:        Against group ([[GRP18:0x[0-9a-f]+]]):
+; MAXLEN-NEXT:          %gep = getelementptr inbounds i8, ptr %A, i64 %iv
+; MAXLEN-NEXT:      Grouped accesses:
+; MAXLEN-NEXT:        Group [[GRP17]]:
+; MAXLEN-NEXT:          (Low: {(32 + %A)<nuw>,+,1}<nuw><%outer.header> High: {(288 + %A),+,1}<nw><%outer.header>)
+; MAXLEN-NEXT:            Member: {{\{\{}}(32 + %A)<nuw>,+,1}<nuw><%outer.header>,+,1}<nuw><%loop>
+; MAXLEN-NEXT:        Group [[GRP18]]:
+; MAXLEN-NEXT:          (Low: %A High: (256 + %A))
+; MAXLEN-NEXT:            Member: {%A,+,1}<nuw><%loop>
+; MAXLEN-EMPTY:
+; MAXLEN-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; MAXLEN-NEXT:      SCEV assumptions:
+; MAXLEN-EMPTY:
+; MAXLEN-NEXT:      Expressions re-written:
+; MAXLEN-NEXT:    outer.header:
+; MAXLEN-NEXT:      Report: loop is not the innermost loop
+; MAXLEN-NEXT:      Dependences:
+; MAXLEN-NEXT:      Run-time memory checks:
+; MAXLEN-NEXT:      Grouped accesses:
+; MAXLEN-EMPTY:
+; MAXLEN-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; MAXLEN-NEXT:      SCEV assumptions:
+; MAXLEN-EMPTY:
+; MAXLEN-NEXT:      Expressions re-written:
+;
+; VW128-LABEL: 'backward_min_distance_256'
+; VW128-NEXT:    loop:
+; VW128-NEXT:      Memory dependences are safe with a maximum safe vector width of 256 bits
+; VW128-NEXT:      Dependences:
+; VW128-NEXT:        BackwardVectorizable:
+; VW128-NEXT:            %l = load i8, ptr %gep, align 4 ->
+; VW128-NEXT:            store i8 %add, ptr %gep.off.iv, align 4
+; VW128-EMPTY:
+; VW128-NEXT:      Run-time memory checks:
+; VW128-NEXT:      Grouped accesses:
+; VW128-EMPTY:
+; VW128-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; VW128-NEXT:      SCEV assumptions:
+; VW128-EMPTY:
+; VW128-NEXT:      Expressions re-written:
+; VW128-NEXT:    outer.header:
+; VW128-NEXT:      Report: loop is not the innermost loop
+; VW128-NEXT:      Dependences:
+; VW128-NEXT:      Run-time memory checks:
+; VW128-NEXT:      Grouped accesses:
+; VW128-EMPTY:
+; VW128-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; VW128-NEXT:      SCEV assumptions:
+; VW128-EMPTY:
+; VW128-NEXT:      Expressions re-written:
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi i64 [ 32, %entry ], [ %outer.iv.next, %outer.latch ]
+  %gep.off = getelementptr inbounds i8, ptr %A, i64 %outer.iv
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %outer.header ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds i8, ptr %A, i64 %iv
+  %l = load i8 , ptr %gep, align 4
+  %add = add nsw i8 %l, 5
+  %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv
+  store i8 %add, ptr %gep.off.iv, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 256
+  br i1 %exitcond.not, label %outer.latch, label %loop
+
+outer.latch:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %ec.2 = icmp eq i64 %outer.iv.next, %N
+  br i1 %ec.2, label %exit, label %outer.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/offset-range-known-via-assume.ll b/llvm/test/Analysis/LoopAccessAnalysis/offset-range-known-via-assume.ll
new file mode 100644
index 000000000000..c358b00dad22
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/offset-range-known-via-assume.ll
@@ -0,0 +1,230 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes='print<access-info>' -disable-output %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+declare void @llvm.assume(i1)
+
+declare void @use(ptr noundef)
+
+; %offset is known positive via assume, so we should be able to detect the
+; forward dependence.
+define void @offset_i8_known_positive_via_assume_forward_dep_1(ptr %A, i64 %offset, i64 %N) {
+; CHECK-LABEL: 'offset_i8_known_positive_via_assume_forward_dep_1'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        Forward:
+; CHECK-NEXT:            %l = load i8, ptr %gep.off, align 4 ->
+; CHECK-NEXT:            store i8 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %c = icmp sgt i64 %offset, 0
+  call void @llvm.assume(i1 %c)
+  %off = getelementptr inbounds i8, ptr %A, i64 %offset
+  call void @use(ptr noundef %off)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.off = getelementptr inbounds i8, ptr %off, i64 %iv
+  %l = load i8 , ptr %gep.off, align 4
+  %add = add nsw i8 %l, 5
+  %gep = getelementptr inbounds i8, ptr %A, i64 %iv
+  store i8 %add, ptr %gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @offset_i32_known_positive_via_assume_forward_dep_1(ptr %A, i64 %offset, i64 %N) {
+; CHECK-LABEL: 'offset_i32_known_positive_via_assume_forward_dep_1'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP1:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP2:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.off = getelementptr inbounds i32, ptr %off, i64 %iv
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP1]]:
+; CHECK-NEXT:          (Low: %A High: (-3 + (4 * %N) + %A))
+; CHECK-NEXT:            Member: {%A,+,4}<nuw><%loop>
+; CHECK-NEXT:        Group [[GRP2]]:
+; CHECK-NEXT:          (Low: ((4 * %offset)<nsw> + %A) High: (-3 + (4 * %offset)<nsw> + (4 * %N) + %A))
+; CHECK-NEXT:            Member: {((4 * %offset)<nsw> + %A),+,4}<nw><%loop>
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-NEXT:      {((4 * %offset)<nsw> + %A),+,4}<nw><%loop> Added Flags: <nusw>
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %c = icmp sgt i64 %offset, 0
+  call void @llvm.assume(i1 %c)
+  %off = getelementptr inbounds i32, ptr %A, i64 %offset
+  call void @use(ptr noundef %off)
+
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.off = getelementptr inbounds i32, ptr %off, i64 %iv
+  %l = load i8 , ptr %gep.off, align 4
+  %add = add nsw i8 %l, 5
+  %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+  store i8 %add, ptr %gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; %offset is known positive via assume, so we should be able to detect the
+; forward dependence.
+define void @offset_known_positive_via_assume_forward_dep_2(ptr %A, i64 %offset, i64 %N) {
+; CHECK-LABEL: 'offset_known_positive_via_assume_forward_dep_2'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        Forward:
+; CHECK-NEXT:            %l = load i32, ptr %gep.off, align 4 ->
+; CHECK-NEXT:            store i32 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %c = icmp sgt i64 %offset, 0
+  call void @llvm.assume(i1 %c)
+  %c.2 = icmp slt i64 %offset, 20
+  call void @llvm.assume(i1 %c.2)
+  %off = getelementptr inbounds i32, ptr %A, i64 %offset
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.off = getelementptr inbounds i32, ptr %off, i64 %iv
+  %l = load i32, ptr %gep.off, align 4
+  %add = add nsw i32 %l, 5
+  %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+  store i32 %add, ptr %gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; The range of %offset is known via assumes, but it may be positive or negative.
+define void @offset_may_be_negative_via_assume_unknown_dep(ptr %A, i64 %offset, i64 %N) {
+; CHECK-LABEL: 'offset_may_be_negative_via_assume_unknown_dep'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP3:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.mul.2 = getelementptr inbounds i32, ptr %A, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP4:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep = getelementptr inbounds i32, ptr %off, i64 %iv
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP3]]:
+; CHECK-NEXT:          (Low: %A High: ((4 * %N) + %A))
+; CHECK-NEXT:            Member: {%A,+,4}<nuw><%loop>
+; CHECK-NEXT:        Group [[GRP4]]:
+; CHECK-NEXT:          (Low: ((4 * %offset)<nsw> + %A) High: ((4 * %offset)<nsw> + (4 * %N) + %A))
+; CHECK-NEXT:            Member: {((4 * %offset)<nsw> + %A),+,4}<nw><%loop>
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %c = icmp sgt i64 %offset, -4
+  call void @llvm.assume(i1 %c)
+  %c.2 = icmp slt i64 %offset, 20
+  call void @llvm.assume(i1 %c.2)
+  %off = getelementptr inbounds i32, ptr %A, i64 %offset
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds i32, ptr %off, i64 %iv
+  %l = load i32, ptr %gep, align 4
+  %add = add nsw i32 %l, 5
+  %gep.mul.2 = getelementptr inbounds i32, ptr %A, i64 %iv
+  store i32 %add, ptr %gep.mul.2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @offset_no_assumes(ptr %A, i64 %offset, i64 %N) {
+; CHECK-LABEL: 'offset_no_assumes'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP5:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP6:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.off = getelementptr inbounds i32, ptr %off, i64 %iv
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP5]]:
+; CHECK-NEXT:          (Low: %A High: ((4 * %N) + %A))
+; CHECK-NEXT:            Member: {%A,+,4}<nuw><%loop>
+; CHECK-NEXT:        Group [[GRP6]]:
+; CHECK-NEXT:          (Low: ((4 * %offset)<nsw> + %A) High: ((4 * %offset)<nsw> + (4 * %N) + %A))
+; CHECK-NEXT:            Member: {((4 * %offset)<nsw> + %A),+,4}<nw><%loop>
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %off = getelementptr inbounds i32, ptr %A, i64 %offset
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.off = getelementptr inbounds i32, ptr %off, i64 %iv
+  %l = load i32, ptr %gep.off, align 4
+  %add = add nsw i32 %l, 5
+  %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+  store i32 %add, ptr %gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/unknown-dependence-retry-with-runtime-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/unknown-dependence-retry-with-runtime-checks.ll
new file mode 100644
index 000000000000..23ab92d75cbc
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/unknown-dependence-retry-with-runtime-checks.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes='print<access-info>' -disable-output %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+declare void @llvm.assume(i1)
+
+; FIXME: Currently we check all accesses to the same underlying objects, but
+; it is sufficient to check the accesses to %A.
+define void @test_dependence_with_non_constant_offset_and_other_accesses_to_noalias_pointers(ptr %A, ptr noalias %B, i64 %off) {
+; CHECK-LABEL: 'test_dependence_with_non_constant_offset_and_other_accesses_to_noalias_pointers'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP1:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.A.400 = getelementptr inbounds i32, ptr %A.off, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP2:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv
+; CHECK-NEXT:      Check 1:
+; CHECK-NEXT:        Comparing group ([[GRP3:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.B = getelementptr inbounds i8, ptr %B, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP4:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.B.1 = getelementptr inbounds i8, ptr %B, i64 %iv.next
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP1]]:
+; CHECK-NEXT:          (Low: (%off + %A) High: (404 + %off + %A))
+; CHECK-NEXT:            Member: {(%off + %A),+,4}<nw><%loop>
+; CHECK-NEXT:        Group [[GRP2]]:
+; CHECK-NEXT:          (Low: %A High: (101 + %A))
+; CHECK-NEXT:            Member: {%A,+,1}<nuw><%loop>
+; CHECK-NEXT:        Group [[GRP3]]:
+; CHECK-NEXT:          (Low: %B High: (101 + %B))
+; CHECK-NEXT:            Member: {%B,+,1}<nuw><%loop>
+; CHECK-NEXT:        Group [[GRP4]]:
+; CHECK-NEXT:          (Low: (1 + %B)<nuw> High: (102 + %B))
+; CHECK-NEXT:            Member: {(1 + %B)<nuw>,+,1}<nuw><%loop>
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %c = icmp sgt i64 %off, 0
+  call void @llvm.assume(i1 %c)
+  %A.off = getelementptr inbounds i8, ptr %A, i64 %off
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A.400 = getelementptr inbounds i32, ptr %A.off, i64 %iv
+  %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv
+  %l = load i8, ptr %gep.A, align 1
+  %ext = zext i8 %l to i32
+  store i32 %ext, ptr %gep.A.400, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %gep.B.1 = getelementptr inbounds i8, ptr %B, i64 %iv.next
+  %l.2 = load i8, ptr %gep.B.1
+  %gep.B = getelementptr inbounds i8, ptr %B, i64 %iv
+  store i8 %l.2, ptr %gep.B
+  %ec = icmp eq i64 %iv, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_dependence_with_non_constant_offset_and_other_accesses_to_mayalias_pointers(ptr %A, ptr %B, i64 %off) {
+; CHECK-LABEL: 'test_dependence_with_non_constant_offset_and_other_accesses_to_mayalias_pointers'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP5:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.A.400 = getelementptr inbounds i32, ptr %A.off, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP6:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.B = getelementptr inbounds i8, ptr %B, i64 %iv
+; CHECK-NEXT:      Check 1:
+; CHECK-NEXT:        Comparing group ([[GRP5]]):
+; CHECK-NEXT:          %gep.A.400 = getelementptr inbounds i32, ptr %A.off, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP7:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv
+; CHECK-NEXT:      Check 2:
+; CHECK-NEXT:        Comparing group ([[GRP5]]):
+; CHECK-NEXT:          %gep.A.400 = getelementptr inbounds i32, ptr %A.off, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP8:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.B.1 = getelementptr inbounds i8, ptr %B, i64 %iv.next
+; CHECK-NEXT:      Check 3:
+; CHECK-NEXT:        Comparing group ([[GRP6]]):
+; CHECK-NEXT:          %gep.B = getelementptr inbounds i8, ptr %B, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP7]]):
+; CHECK-NEXT:          %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv
+; CHECK-NEXT:      Check 4:
+; CHECK-NEXT:        Comparing group ([[GRP6]]):
+; CHECK-NEXT:          %gep.B = getelementptr inbounds i8, ptr %B, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP8]]):
+; CHECK-NEXT:          %gep.B.1 = getelementptr inbounds i8, ptr %B, i64 %iv.next
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP5]]:
+; CHECK-NEXT:          (Low: (%off + %A) High: (404 + %off + %A))
+; CHECK-NEXT:            Member: {(%off + %A),+,4}<nw><%loop>
+; CHECK-NEXT:        Group [[GRP6]]:
+; CHECK-NEXT:          (Low: %B High: (101 + %B))
+; CHECK-NEXT:            Member: {%B,+,1}<nuw><%loop>
+; CHECK-NEXT:        Group [[GRP7]]:
+; CHECK-NEXT:          (Low: %A High: (101 + %A))
+; CHECK-NEXT:            Member: {%A,+,1}<nuw><%loop>
+; CHECK-NEXT:        Group [[GRP8]]:
+; CHECK-NEXT:          (Low: (1 + %B)<nuw> High: (102 + %B))
+; CHECK-NEXT:            Member: {(1 + %B)<nuw>,+,1}<nuw><%loop>
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %c = icmp sgt i64 %off, 0
+  call void @llvm.assume(i1 %c)
+  %A.off = getelementptr inbounds i8, ptr %A, i64 %off
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A.400 = getelementptr inbounds i32, ptr %A.off, i64 %iv
+  %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv
+  %l = load i8, ptr %gep.A, align 1
+  %ext = zext i8 %l to i32
+  store i32 %ext, ptr %gep.A.400, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %gep.B.1 = getelementptr inbounds i8, ptr %B, i64 %iv.next
+  %l.2 = load i8, ptr %gep.B.1
+  %gep.B = getelementptr inbounds i8, ptr %B, i64 %iv
+  store i8 %l.2, ptr %gep.B
+  %ec = icmp eq i64 %iv, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/MustExecute/const-cond.ll b/llvm/test/Analysis/MustExecute/const-cond.ll
index d36598cc63ee..e829db349ca6 100644
--- a/llvm/test/Analysis/MustExecute/const-cond.ll
+++ b/llvm/test/Analysis/MustExecute/const-cond.ll
@@ -1,47 +1,47 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -disable-output -passes=print-mustexecute %s 2>&1 | FileCheck %s
-; RUN: opt -disable-output -passes=print-mustexecute %s 2>&1 | FileCheck %s
-
-; In general the CFG below is easily simplified but this is useful for
-; pass ordering issue elimination.
-define i1 @const_cond(i32 %high) {
-; CHECK-LABEL: @const_cond(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; (mustexec in: loop)
-; CHECK-NEXT:    br i1 true, label [[NEXT:%.*]], label [[NEVER1:%.*]] ; (mustexec in: loop)
-; CHECK:       next:
-; CHECK-NEXT:    br i1 false, label [[NEVER2:%.*]], label [[BACKEDGE]] ; (mustexec in: loop)
-; CHECK:       backedge:
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; (mustexec in: loop)
-; CHECK-NEXT:    [[EXIT_TEST:%.*]] = icmp slt i32 [[IV]], [[HIGH:%.*]] ; (mustexec in: loop)
-; CHECK-NEXT:    br i1 [[EXIT_TEST]], label [[LOOP]], label [[EXIT:%.*]] ; (mustexec in: loop)
-; CHECK:       exit:
-; CHECK-NEXT:    ret i1 false
-; CHECK:       never1:
-; CHECK-NEXT:    unreachable
-; CHECK:       never2:
-; CHECK-NEXT:    unreachable
-;
-entry:
-  br label %loop
-
-loop:
-  %iv = phi i32 [0, %entry], [%iv.next, %backedge]
-  br i1 true, label %next, label %never1
-next:
-  br i1 false, label %never2, label %backedge
-backedge:
-  %iv.next = add nsw nuw i32 %iv, 1
-  %exit.test = icmp slt i32 %iv, %high
-  br i1 %exit.test, label %loop, label %exit
-
-exit:
-  ret i1 false
-never1:
-  unreachable
-never2:
-  unreachable
-}
-
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -disable-output -passes=print-mustexecute %s 2>&1 | FileCheck %s
+; RUN: opt -disable-output -passes=print-mustexecute %s 2>&1 | FileCheck %s
+
+; In general the CFG below is easily simplified but this is useful for
+; pass ordering issue elimination.
+define i1 @const_cond(i32 %high) {
+; CHECK-LABEL: @const_cond(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; (mustexec in: loop)
+; CHECK-NEXT:    br i1 true, label [[NEXT:%.*]], label [[NEVER1:%.*]] ; (mustexec in: loop)
+; CHECK:       next:
+; CHECK-NEXT:    br i1 false, label [[NEVER2:%.*]], label [[BACKEDGE]] ; (mustexec in: loop)
+; CHECK:       backedge:
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; (mustexec in: loop)
+; CHECK-NEXT:    [[EXIT_TEST:%.*]] = icmp slt i32 [[IV]], [[HIGH:%.*]] ; (mustexec in: loop)
+; CHECK-NEXT:    br i1 [[EXIT_TEST]], label [[LOOP]], label [[EXIT:%.*]] ; (mustexec in: loop)
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       never1:
+; CHECK-NEXT:    unreachable
+; CHECK:       never2:
+; CHECK-NEXT:    unreachable
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %backedge]
+  br i1 true, label %next, label %never1
+next:
+  br i1 false, label %never2, label %backedge
+backedge:
+  %iv.next = add nsw nuw i32 %iv, 1
+  %exit.test = icmp slt i32 %iv, %high
+  br i1 %exit.test, label %loop, label %exit
+
+exit:
+  ret i1 false
+never1:
+  unreachable
+never2:
+  unreachable
+}
+
diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll
index 1f475e80e562..da4487ce9cd4 100644
--- a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll
+++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll
@@ -69,4 +69,38 @@ exit:
   ret void
 }
 
+declare void @use(i32)
+
+define void @rewrite_preserve_add_nsw(i32 %a) {
+; CHECK-LABEL: 'rewrite_preserve_add_nsw'
+; CHECK-NEXT:  Classifying expressions for: @rewrite_preserve_add_nsw
+; CHECK-NEXT:    %add = add nsw i32 %a, 4
+; CHECK-NEXT:    --> (4 + %a)<nsw> U: [-2147483644,-2147483648) S: [-2147483644,-2147483648)
+; CHECK-NEXT:    %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    --> {0,+,1}<nuw><%loop> U: [0,-2147483648) S: [0,-2147483648) Exits: (0 smax (4 + %a)<nsw>) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %iv.next = add i32 %iv, 1
+; CHECK-NEXT:    --> {1,+,1}<nuw><%loop> U: [1,-2147483647) S: [1,-2147483647) Exits: (1 + (0 smax (4 + %a)<nsw>))<nuw> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:  Determining loop execution counts for: @rewrite_preserve_add_nsw
+; CHECK-NEXT:  Loop %loop: backedge-taken count is (0 smax (4 + %a)<nsw>)
+; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i32 2147483647
+; CHECK-NEXT:  Loop %loop: symbolic max backedge-taken count is (0 smax (4 + %a)<nsw>)
+; CHECK-NEXT:  Loop %loop: Trip multiple is 1
+;
+entry:
+  %add = add nsw i32 %a, 4
+  call void @use(i32 noundef %add)
+  %pre = icmp sgt i32 %a, -4
+  br i1 %pre, label %loop, label %exit
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  call void @clobber()
+  %iv.next = add i32 %iv, 1
+  %ec = icmp slt i32 %iv, %add
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}
+
 declare void @clobber()
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir
index 17757b99ccab..ce00edf3363f 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir
@@ -4,16 +4,16 @@
 # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
 # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt)
 # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
 # CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1
 # CHECK: DIVERGENT: G_BR %bb.2
 # CHECK-LABEL: BLOCK bb.1
 # CHECK-LABEL: BLOCK bb.2
 # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.1, %{{[0-9]*}}:_(s32), %bb.0
 # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_PHI %{{[0-9]*}}:_(s1), %bb.1, %{{[0-9]*}}:_(s1), %bb.0
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
 # CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.3
 # CHECK: DIVERGENT: G_BR %bb.4
 # CHECK-LABEL: BLOCK bb.3
@@ -44,7 +44,7 @@ body:             |
     %14:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
     %16:_(s1) = G_ICMP intpred(slt), %14(s32), %15
     %18:_(s1) = G_XOR %16, %17
-    %19:_(s1), %20:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %16(s1)
+    %19:_(s1), %20:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %16(s1)
     G_BRCOND %19(s1), %bb.2
     G_BR %bb.3
 
@@ -60,8 +60,8 @@ body:             |
 
     %25:_(s32) = G_PHI %22(s32), %bb.2, %33(s32), %bb.1
     %26:_(s1) = G_PHI %24(s1), %bb.2, %18(s1), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %20(s64)
-    %27:_(s1), %28:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %26(s1)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %20(s64)
+    %27:_(s1), %28:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %26(s1)
     G_BRCOND %27(s1), %bb.4
     G_BR %bb.5
 
@@ -72,7 +72,7 @@ body:             |
 
   bb.5:
     %31:_(s32) = G_PHI %25(s32), %bb.3, %29(s32), %bb.4
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %28(s64)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %28(s64)
     G_STORE %31(s32), %32(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
     S_ENDPGM 0
 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir
index 1e83151f6924..7bff87c09b3c 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir
@@ -27,7 +27,7 @@ body:             |
 
     %11:_(s64) = G_PHI %12(s64), %bb.2, %15(s64), %bb.1
     %18:_(s1) = G_CONSTANT i1 false
-    %12:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %18(s1), %11(s64)
+    %12:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %18(s1), %11(s64)
   ; CHECK:   DIVERGENT: SI_LOOP
     SI_LOOP %12(s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.3
@@ -35,7 +35,7 @@ body:             |
   bb.3:
   ; CHECK:   DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
     %14:_(s64) = G_PHI %12(s64), %bb.2
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s64)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s64)
     S_ENDPGM 0
 
 ...
@@ -82,7 +82,7 @@ body:             |
     successors: %bb.5, %bb.4
 
     %15:_(s64) = G_PHI %24(s64), %bb.2, %16(s64), %bb.4
-    %16:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %14(s1), %15(s64)
+    %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %14(s1), %15(s64)
   ; CHECK:   DIVERGENT: SI_LOOP
     SI_LOOP %16(s64), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.5
@@ -90,7 +90,7 @@ body:             |
   bb.5:
   ; CHECK:   DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
     %18:_(s64) = G_PHI %16(s64), %bb.4
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
     G_BR %bb.3
 
   bb.6:
@@ -140,7 +140,7 @@ body:             |
     successors: %bb.5, %bb.4
 
     %15:_(s64) = G_PHI %24(s64), %bb.2, %16(s64), %bb.4
-    %16:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %14(s1), %15(s64)
+    %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %14(s1), %15(s64)
   ; CHECK:   DIVERGENT: SI_LOOP
     SI_LOOP %16(s64), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.5
@@ -148,7 +148,7 @@ body:             |
   bb.5:
   ; CHECK:   DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
     %18:_(s64) = G_PHI %16(s64), %bb.4
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
     G_BR %bb.3
 
   bb.6:
@@ -191,7 +191,7 @@ body:             |
 
     %15:_(s64) = G_PHI %25(s64), %bb.2, %16(s64), %bb.3
     %24:_(s1) = G_CONSTANT i1 false
-    %16:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %24(s1), %15(s64)
+    %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %24(s1), %15(s64)
   ; CHECK:   DIVERGENT: SI_LOOP
     SI_LOOP %16(s64), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.4
@@ -201,7 +201,7 @@ body:             |
     successors: %bb.5, %bb.2
 
     %18:_(s64) = G_PHI %16(s64), %bb.3
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
     G_BRCOND %13(s1), %bb.2
     G_BR %bb.5
 
@@ -241,7 +241,7 @@ body:             |
   bb.2:
     %15:_(s64) = G_PHI %16(s64), %bb.4, %19(s64), %bb.1
     %24:_(s1) = G_CONSTANT i1 true
-    %16:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %24(s1), %15(s64)
+    %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %24(s1), %15(s64)
 
   bb.3:
     successors: %bb.4, %bb.3
@@ -259,7 +259,7 @@ body:             |
   bb.5:
   ; CHECK:   DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
     %18:_(s64) = G_PHI %16(s64), %bb.4
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
     S_ENDPGM 0
 
 ...
@@ -291,7 +291,7 @@ body:             |
 
     %10:_(s64) = G_PHI %11(s64), %bb.2, %19(s64), %bb.1
     %24:_(s1) = G_CONSTANT i1 false
-    %11:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %24(s1), %10(s64)
+    %11:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %24(s1), %10(s64)
   ; CHECK:   DIVERGENT: SI_LOOP
     SI_LOOP %11(s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.3
@@ -300,7 +300,7 @@ body:             |
   ; CHECK:   DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
   ; CHECK-NOT:   DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
     %13:_(s64) = G_PHI %11(s64), %bb.2
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %13(s64)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %13(s64)
     %14:_(p4) = COPY %3(p4)
     %15:_(s64) = G_CONSTANT i64 40
     %16:_(p4) = G_PTR_ADD %14, %15(s64)
@@ -354,7 +354,7 @@ body:             |
 
     %15:_(s64) = G_PHI %23(s64), %bb.2, %16(s64), %bb.3
     %25:_(s1) = G_CONSTANT i1 false
-    %16:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %25(s1), %15(s64)
+    %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %25(s1), %15(s64)
   ; CHECK:   DIVERGENT: SI_LOOP
     SI_LOOP %16(s64), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.4
@@ -362,7 +362,7 @@ body:             |
   bb.4:
   ; CHECK:   DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
     %18:_(s64) = G_PHI %16(s64), %bb.3
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
 
   bb.5:
 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir
index d1f53aadc37d..b7e0d5449d2e 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir
@@ -43,18 +43,18 @@ body:             |
     ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_PHI
     ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_PHI
     ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_PHI
-    ; CHECK-NOT: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break)
+    ; CHECK-NOT: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break)
     %19:_(s32) = G_PHI %18(s32), %bb.7, %25(s32), %bb.4
     %20:_(s32) = G_PHI %6(s32), %bb.7, %25(s32), %bb.4
     %21:_(s1) = G_PHI %34(s1), %bb.7, %33(s1), %bb.4
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %16(s32)
-    %22:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %21(s1), %0(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %16(s32)
+    %22:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %21(s1), %0(s32)
     SI_LOOP %22(s32), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.6
   
   bb.6:
     %24:_(s32) = G_PHI %22(s32), %bb.5
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %24(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %24(s32)
     SI_RETURN
   
   bb.7:
diff --git a/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll b/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll
index 0fa8a74d250d..47c4587f6991 100644
--- a/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll
+++ b/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll
@@ -387,3 +387,119 @@ for.end:
   %r = phi i64 [ %sum, %for.body ]
   ret i64 %r
 }
+
+; https://alive2.llvm.org/ce/z/3QfEHm
+define i8 @known_power_of_two_rust_next_power_of_two(i8 %x, i8 %y) {
+; CHECK-LABEL: @known_power_of_two_rust_next_power_of_two(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X:%.*]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i8 -1, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt i8 [[X]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i8 [[TMP3]], i8 0
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[TMP5]], [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %2 = add i8 %x, -1
+  %3 = tail call i8 @llvm.ctlz.i8(i8 %2, i1 true)
+  %4 = lshr i8 -1, %3
+  %5 = add i8 %4, 1
+  %6 = icmp ugt i8 %x, 1
+  %p = select i1 %6, i8 %5, i8 1
+  ; Rust's implementation of `%p = next_power_of_two(%x)`
+
+  %r = urem i8 %y, %p
+  ret i8 %r
+}
+
+define i8 @known_power_of_two_lshr_add_one_allow_zero(i8 %x, i8 %y) {
+; CHECK-LABEL: @known_power_of_two_lshr_add_one_allow_zero(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %4 = lshr i8 -1, %x
+  %p = add i8 %4, 1
+
+  ; Note: y % p --> y & (p - 1) allows p == 0
+  %r = urem i8 %y, %p
+  ret i8 %r
+}
+
+define i1 @known_power_of_two_lshr_add_one_nuw_deny_zero(i8 %x, i8 %y) {
+; CHECK-LABEL: @known_power_of_two_lshr_add_one_nuw_deny_zero(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]]
+; CHECK-NEXT:    [[P:%.*]] = add nuw i8 [[TMP1]], 1
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[P]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %4 = lshr i8 -1, %x
+  %p = add nuw i8 %4, 1
+
+  ; Note: A & B_Pow2 != B_Pow2 --> A & B_Pow2 == 0 requires B_Pow2 != 0
+  %and = and i8 %p, %y
+  %r = icmp ne i8 %and, %p
+  ret i1 %r
+}
+
+define i1 @negative_known_power_of_two_lshr_add_one_deny_zero(i8 %x, i8 %y) {
+; CHECK-LABEL: @negative_known_power_of_two_lshr_add_one_deny_zero(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]]
+; CHECK-NEXT:    [[P:%.*]] = add i8 [[TMP1]], 1
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[P]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[AND]], [[P]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %4 = lshr i8 -1, %x
+  %p = add i8 %4, 1
+
+  ; Note: A & B_Pow2 != B_Pow2 --> A & B_Pow2 == 0 requires B_Pow2 != 0
+  %and = and i8 %p, %y
+  %r = icmp ne i8 %and, %p
+  ret i1 %r
+}
+
+define i1 @negative_known_power_of_two_lshr_add_one_nsw_deny_zero(i8 %x, i8 %y) {
+; CHECK-LABEL: @negative_known_power_of_two_lshr_add_one_nsw_deny_zero(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]]
+; CHECK-NEXT:    [[P:%.*]] = add nsw i8 [[TMP1]], 1
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[P]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[AND]], [[P]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %4 = lshr i8 -1, %x
+  %p = add nsw i8 %4, 1
+
+  ; Note: A & B_Pow2 != B_Pow2 --> A & B_Pow2 == 0 requires B_Pow2 != 0
+  %and = and i8 %p, %y
+  %r = icmp ne i8 %and, %p
+  ret i1 %r
+}
+
+define i8 @known_power_of_two_lshr_add_negative_1(i8 %x, i8 %y) {
+; CHECK-LABEL: @known_power_of_two_lshr_add_negative_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 -2, [[X:%.*]]
+; CHECK-NEXT:    [[P:%.*]] = add nuw i8 [[TMP1]], 1
+; CHECK-NEXT:    [[R:%.*]] = urem i8 [[Y:%.*]], [[P]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %4 = lshr i8 -2, %x
+  %p = add i8 %4, 1
+
+  %r = urem i8 %y, %p
+  ret i8 %r
+}
+
+define i8 @known_power_of_two_lshr_add_negative_2(i8 %x, i8 %y) {
+; CHECK-LABEL: @known_power_of_two_lshr_add_negative_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]]
+; CHECK-NEXT:    [[P:%.*]] = add nsw i8 [[TMP1]], -1
+; CHECK-NEXT:    [[R:%.*]] = urem i8 [[Y:%.*]], [[P]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %4 = lshr i8 -1, %x
+  %p = add i8 %4, -1
+
+  %r = urem i8 %y, %p
+  ret i8 %r
+}
diff --git a/llvm/test/Assembler/thinlto-summary.ll b/llvm/test/Assembler/thinlto-summary.ll
index e0d866da0d8a..05dad2c7acad 100644
--- a/llvm/test/Assembler/thinlto-summary.ll
+++ b/llvm/test/Assembler/thinlto-summary.ll
@@ -46,32 +46,28 @@
 ^18 = gv: (guid: 17, summaries: (alias: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1), aliasee: ^14)))
 
 ; Test all types of TypeIdInfo on function summaries.
-^19 = gv: (guid: 18, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 4, typeIdInfo: (typeTests: (^26, ^28)))))
-^20 = gv: (guid: 19, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 8, typeIdInfo: (typeTestAssumeVCalls: (vFuncId: (^29, offset: 16))))))
-^21 = gv: (guid: 20, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 5, typeIdInfo: (typeCheckedLoadVCalls: (vFuncId: (^27, offset: 16))))))
-^22 = gv: (guid: 21, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 15, typeIdInfo: (typeTestAssumeConstVCalls: ((vFuncId: (^29, offset: 16), args: (42)), (vFuncId: (^29, offset: 24)))))))
-^23 = gv: (guid: 22, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 5, typeIdInfo: (typeCheckedLoadConstVCalls: ((vFuncId: (^30, offset: 16), args: (42)))))))
+^19 = gv: (guid: 18, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 4, typeIdInfo: (typeTests: (^25, ^27)))))
+^20 = gv: (guid: 19, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 8, typeIdInfo: (typeTestAssumeVCalls: (vFuncId: (^28, offset: 16))))))
+^21 = gv: (guid: 20, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 5, typeIdInfo: (typeCheckedLoadVCalls: (vFuncId: (^26, offset: 16))))))
+^22 = gv: (guid: 21, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 15, typeIdInfo: (typeTestAssumeConstVCalls: ((vFuncId: (^28, offset: 16), args: (42)), (vFuncId: (^28, offset: 24)))))))
+^23 = gv: (guid: 22, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 5, typeIdInfo: (typeCheckedLoadConstVCalls: ((vFuncId: (^29, offset: 16), args: (42)))))))
 
 ; Function summary with an import type of declaration
 ^24 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, importType: declaration), insts: 5)))
 
-; GUID that are 64-bit
-
-^25 = gv: (guid: 9123456789101112131, summaries: (function: (module: ^0, flags: (linkage: internal, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, importType: definition), insts: 1)))
-
 ; Test TypeId summaries:
 
-^26 = typeid: (name: "_ZTS1C", summary: (typeTestRes: (kind: single, sizeM1BitWidth: 0)))
+^25 = typeid: (name: "_ZTS1C", summary: (typeTestRes: (kind: single, sizeM1BitWidth: 0)))
 ; Test TypeId with other optional fields (alignLog2/sizeM1/bitMask/inlineBits)
-^27 = typeid: (name: "_ZTS1B", summary: (typeTestRes: (kind: inline, sizeM1BitWidth: 0, alignLog2: 1, sizeM1: 2, bitMask: 3, inlineBits: 4)))
+^26 = typeid: (name: "_ZTS1B", summary: (typeTestRes: (kind: inline, sizeM1BitWidth: 0, alignLog2: 1, sizeM1: 2, bitMask: 3, inlineBits: 4)))
 ; Test the AllOnes resolution, and all kinds of WholeProgramDevirtResolution
 ; types, including all optional resolution by argument kinds.
-^28 = typeid: (name: "_ZTS1A", summary: (typeTestRes: (kind: allOnes, sizeM1BitWidth: 7), wpdResolutions: ((offset: 0, wpdRes: (kind: branchFunnel)), (offset: 8, wpdRes: (kind: singleImpl, singleImplName: "_ZN1A1nEi")), (offset: 16, wpdRes: (kind: indir, resByArg: (args: (1, 2), byArg: (kind: indir, byte: 2, bit: 3), args: (3), byArg: (kind: uniformRetVal, info: 1), args: (4), byArg: (kind: uniqueRetVal, info: 1), args: (5), byArg: (kind: virtualConstProp)))))))
+^27 = typeid: (name: "_ZTS1A", summary: (typeTestRes: (kind: allOnes, sizeM1BitWidth: 7), wpdResolutions: ((offset: 0, wpdRes: (kind: branchFunnel)), (offset: 8, wpdRes: (kind: singleImpl, singleImplName: "_ZN1A1nEi")), (offset: 16, wpdRes: (kind: indir, resByArg: (args: (1, 2), byArg: (kind: indir, byte: 2, bit: 3), args: (3), byArg: (kind: uniformRetVal, info: 1), args: (4), byArg: (kind: uniqueRetVal, info: 1), args: (5), byArg: (kind: virtualConstProp)))))))
 ; Test the other kinds of type test resoultions
-^29 = typeid: (name: "_ZTS1D", summary: (typeTestRes: (kind: byteArray, sizeM1BitWidth: 0)))
-^30 = typeid: (name: "_ZTS1E", summary: (typeTestRes: (kind: unsat, sizeM1BitWidth: 0)))
-^31 = flags: 8
-^32 = blockcount: 1888
+^28 = typeid: (name: "_ZTS1D", summary: (typeTestRes: (kind: byteArray, sizeM1BitWidth: 0)))
+^29 = typeid: (name: "_ZTS1E", summary: (typeTestRes: (kind: unsat, sizeM1BitWidth: 0)))
+^30 = flags: 8
+^31 = blockcount: 1888
 
 ; Make sure we get back from llvm-dis essentially what we put in via llvm-as.
 ; CHECK: ^0 = module: (path: "thinlto-summary1.o", hash: (1369602428, 2747878711, 259090915, 2507395659, 1141468049))
@@ -95,20 +91,19 @@
 ; CHECK: ^16 = gv: (guid: 15, summaries: (function: (module: ^1, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 1, funcFlags: (readNone: 1, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 0, alwaysInline: 1, noUnwind: 1, mayThrow: 1, hasUnknownCall: 1, mustBeUnreachable: 0))))
 ; CHECK: ^17 = gv: (guid: 16, summaries: (function: (module: ^1, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 1, funcFlags: (readNone: 0, readOnly: 1, noRecurse: 0, returnDoesNotAlias: 1, noInline: 0, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 1), calls: ((callee: ^15)))))
 ; CHECK: ^18 = gv: (guid: 17, summaries: (alias: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0, importType: definition), aliasee: ^14)))
-; CHECK: ^19 = gv: (guid: 18, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 4, typeIdInfo: (typeTests: (^26, ^28)))))
-; CHECK: ^20 = gv: (guid: 19, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 8, typeIdInfo: (typeTestAssumeVCalls: (vFuncId: (^29, offset: 16))))))
-; CHECK: ^21 = gv: (guid: 20, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 5, typeIdInfo: (typeCheckedLoadVCalls: (vFuncId: (^27, offset: 16))))))
-; CHECK: ^22 = gv: (guid: 21, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 15, typeIdInfo: (typeTestAssumeConstVCalls: ((vFuncId: (^29, offset: 16), args: (42)), (vFuncId: (^29, offset: 24)))))))
-; CHECK: ^23 = gv: (guid: 22, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 5, typeIdInfo: (typeCheckedLoadConstVCalls: ((vFuncId: (^30, offset: 16), args: (42)))))))
+; CHECK: ^19 = gv: (guid: 18, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 4, typeIdInfo: (typeTests: (^25, ^27)))))
+; CHECK: ^20 = gv: (guid: 19, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 8, typeIdInfo: (typeTestAssumeVCalls: (vFuncId: (^28, offset: 16))))))
+; CHECK: ^21 = gv: (guid: 20, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 5, typeIdInfo: (typeCheckedLoadVCalls: (vFuncId: (^26, offset: 16))))))
+; CHECK: ^22 = gv: (guid: 21, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 15, typeIdInfo: (typeTestAssumeConstVCalls: ((vFuncId: (^28, offset: 16), args: (42)), (vFuncId: (^28, offset: 24)))))))
+; CHECK: ^23 = gv: (guid: 22, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 5, typeIdInfo: (typeCheckedLoadConstVCalls: ((vFuncId: (^29, offset: 16), args: (42)))))))
 ; CHECK: ^24 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: declaration), insts: 5)))
-; CHECK: ^25 = gv: (guid: 9123456789101112131, summaries: (function: (module: ^0, flags: (linkage: internal, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0, importType: definition), insts: 1)))
-; CHECK: ^26 = typeid: (name: "_ZTS1C", summary: (typeTestRes: (kind: single, sizeM1BitWidth: 0))) ; guid = 1884921850105019584
-; CHECK: ^27 = typeid: (name: "_ZTS1B", summary: (typeTestRes: (kind: inline, sizeM1BitWidth: 0, alignLog2: 1, sizeM1: 2, bitMask: 3, inlineBits: 4))) ; guid = 6203814149063363976
-; CHECK: ^28 = typeid: (name: "_ZTS1A", summary: (typeTestRes: (kind: allOnes, sizeM1BitWidth: 7), wpdResolutions: ((offset: 0, wpdRes: (kind: branchFunnel)), (offset: 8, wpdRes: (kind: singleImpl, singleImplName: "_ZN1A1nEi")), (offset: 16, wpdRes: (kind: indir, resByArg: (args: (1, 2), byArg: (kind: indir, byte: 2, bit: 3), args: (3), byArg: (kind: uniformRetVal, info: 1), args: (4), byArg: (kind: uniqueRetVal, info: 1), args: (5), byArg: (kind: virtualConstProp))))))) ; guid = 7004155349499253778
-; CHECK: ^29 = typeid: (name: "_ZTS1D", summary: (typeTestRes: (kind: byteArray, sizeM1BitWidth: 0))) ; guid = 9614786172484273522
-; CHECK: ^30 = typeid: (name: "_ZTS1E", summary: (typeTestRes: (kind: unsat, sizeM1BitWidth: 0))) ; guid = 17437243864166745132
-; CHECK: ^31 = flags: 8
-; CHECK: ^32 = blockcount: 1888
+; CHECK: ^25 = typeid: (name: "_ZTS1C", summary: (typeTestRes: (kind: single, sizeM1BitWidth: 0))) ; guid = 1884921850105019584
+; CHECK: ^26 = typeid: (name: "_ZTS1B", summary: (typeTestRes: (kind: inline, sizeM1BitWidth: 0, alignLog2: 1, sizeM1: 2, bitMask: 3, inlineBits: 4))) ; guid = 6203814149063363976
+; CHECK: ^27 = typeid: (name: "_ZTS1A", summary: (typeTestRes: (kind: allOnes, sizeM1BitWidth: 7), wpdResolutions: ((offset: 0, wpdRes: (kind: branchFunnel)), (offset: 8, wpdRes: (kind: singleImpl, singleImplName: "_ZN1A1nEi")), (offset: 16, wpdRes: (kind: indir, resByArg: (args: (1, 2), byArg: (kind: indir, byte: 2, bit: 3), args: (3), byArg: (kind: uniformRetVal, info: 1), args: (4), byArg: (kind: uniqueRetVal, info: 1), args: (5), byArg: (kind: virtualConstProp))))))) ; guid = 7004155349499253778
+; CHECK: ^28 = typeid: (name: "_ZTS1D", summary: (typeTestRes: (kind: byteArray, sizeM1BitWidth: 0))) ; guid = 9614786172484273522
+; CHECK: ^29 = typeid: (name: "_ZTS1E", summary: (typeTestRes: (kind: unsat, sizeM1BitWidth: 0))) ; guid = 17437243864166745132
+; CHECK: ^30 = flags: 8
+; CHECK: ^31 = blockcount: 1888
 
 ; Make sure parsing of a non-summary entry containing a ":" does not fail
 ; after summary parsing, which handles colons differently.
diff --git a/llvm/test/Bindings/llvm-c/echo.ll b/llvm/test/Bindings/llvm-c/echo.ll
index 953a16b7e624..bb5fae0dcd12 100644
--- a/llvm/test/Bindings/llvm-c/echo.ll
+++ b/llvm/test/Bindings/llvm-c/echo.ll
@@ -348,6 +348,32 @@ define void @test_func_prologue_data_01() prologue %func_prolog_struct <{ i8 235
   ret void
 }
 
+
+define void @test_call_br_01(i32 %input) {
+entry:
+  callbr void asm "nop", "r,!i"(i32 %input) to label %bb_01 [label %bb_02]
+
+bb_01:
+  ret void
+bb_02:
+  ret void
+}
+
+define void @test_call_br_02(i32 %input0, i32 %input1) {
+entry:
+  ; Multiple indirect destinations, operand bundles, and arguments
+  callbr void asm "nop", "r,r,!i,!i"(i32 %input0, i32 %input1)
+    ["op0"(i32 %input1), "op1"(label %bb_02)]
+    to label %bb_01 [label %bb_03, label %bb_02]
+
+bb_01:
+  ret void
+bb_02:
+  ret void
+bb_03:
+  ret void
+}
+
 !llvm.dbg.cu = !{!0, !2}
 !llvm.module.flags = !{!3}
 
diff --git a/llvm/test/Bitcode/summary_version.ll b/llvm/test/Bitcode/summary_version.ll
index 26c64f81a773..98feab6fe2f9 100644
--- a/llvm/test/Bitcode/summary_version.ll
+++ b/llvm/test/Bitcode/summary_version.ll
@@ -2,7 +2,7 @@
 ; RUN: opt  -module-summary  %s -o - | llvm-bcanalyzer -dump | FileCheck %s
 
 ; CHECK: <GLOBALVAL_SUMMARY_BLOCK
-; CHECK: <VERSION op0=10/>
+; CHECK: <VERSION op0=9/>
 
 
 
diff --git a/llvm/test/Bitcode/thinlto-alias.ll b/llvm/test/Bitcode/thinlto-alias.ll
index 7deb2d8259e3..5dfff0f79619 100644
--- a/llvm/test/Bitcode/thinlto-alias.ll
+++ b/llvm/test/Bitcode/thinlto-alias.ll
@@ -31,9 +31,9 @@
 ; COMBINED-NEXT:    <VERSION
 ; COMBINED-NEXT:    <FLAGS
 ; See if the call to analias is registered, using the expected value id.
-; COMBINED-NEXT:    <VALUE_GUID {{.*}} op0=[[ALIASID:[0-9]+]] op1=2955807229 op2=886945438/>
+; COMBINED-NEXT:    <VALUE_GUID op0=[[ALIASID:[0-9]+]] op1=-5751648690987223394/>
 ; COMBINED-NEXT:    <VALUE_GUID
-; COMBINED-NEXT:    <VALUE_GUID {{.*}} op0=[[ALIASEEID:[0-9]+]] op1=4053019222 op2=46484856/>
+; COMBINED-NEXT:    <VALUE_GUID op0=[[ALIASEEID:[0-9]+]] op1=-1039159065113703048/>
 ; COMBINED-NEXT:    <COMBINED_PROFILE {{.*}} op9=[[ALIASID]]
 ; COMBINED-NEXT:    <COMBINED_PROFILE {{.*}}
 ; COMBINED-NEXT:    <COMBINED_ALIAS  {{.*}} op3=[[ALIASEEID]]
diff --git a/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll b/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll
index d864cadcff35..19e228fd5355 100644
--- a/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll
+++ b/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll
@@ -11,21 +11,21 @@
 ; RUN: llvm-dis -o - %t.o | llvm-as -o - | llvm-dis -o - | FileCheck %s --check-prefix=DIS
 
 ; CHECK: <GLOBALVAL_SUMMARY_BLOCK
-; CHECK-NEXT:   <VERSION op0=10/>
+; CHECK-NEXT:   <VERSION op0=9/>
 ; CHECK-NEXT:   <FLAGS op0=0/>
 ; The `VALUE_GUID` below represents the "_ZTV4Base" referenced by the instruction
 ; that loads vtable pointers.
-; CHECK-NEXT: <VALUE_GUID {{.*}} op0=21 op1=456547254 op2=3929380924/>
+; CHECK-NEXT: <VALUE_GUID op0=21 op1=1960855528937986108/>
 ; The `VALUE_GUID` below represents the "_ZN4Base4funcEv" referenced by the
 ; indirect call instruction.
-; CHECK-NEXT:      <VALUE_GUID {{.*}} op0=20 op1=1271117309 op2=2009351347/>
+; CHECK-NEXT:      <VALUE_GUID op0=20 op1=5459407273543877811/>
 ; NOTE vtables and functions from Derived class is dropped because
 ; `-icp-max-num-vtables` and `-icp-max-prom` are both set to one.
 ; <PERMODULE_PROFILE> has the format [valueid, flags, instcount, funcflags,
 ;                                     numrefs, rorefcnt, worefcnt,
 ;                                     m x valueid,
 ;                                     n x (valueid, hotness+tailcall)]
-; CHECK-NEXT:   <PERMODULE_PROFILE {{.*}} op0=0 op1=0 op2=4 op3=256 op4=1 op5=1 op6=0 op7=21 op8=20 op9=3/>
+; CHECK-NEXT:   <PERMODULE_PROFILE abbrevid=4 op0=0 op1=0 op2=4 op3=256 op4=1 op5=1 op6=0 op7=21 op8=20 op9=3/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph-partial-sample-profile-summary.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph-partial-sample-profile-summary.ll
index 0c3ab9b20893..d44ee24694be 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph-partial-sample-profile-summary.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph-partial-sample-profile-summary.ll
@@ -30,7 +30,7 @@
 ; CHECK-LABEL:       <GLOBALVAL_SUMMARY_BLOCK
 ; CHECK-NEXT:    <VERSION
 ; CHECK-NEXT:    <FLAGS
-; CHECK-NEXT:    <VALUE_GUID {{.*}} op0=27 op1=0 op2=123/>
+; CHECK-NEXT:    <VALUE_GUID op0=27 op1=123/>
 ; op4=none1 op6=hot1 op8=cold1 op10=none2 op12=hot2 op14=cold2 op16=none3 op18=hot3 op20=cold3 op22=123
 ; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op7=7 op8=0 op9=1 op10=3 op11=4 op12=1 op13=8 op14=0 op15=2 op16=3 op17=5 op18=1 op19=9 op20=0 op21=3 op22=3 op23=6 op24=1 op25=27 op26=4/>
 ; CHECK-NEXT:    <BLOCK_COUNT op0=4/>
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll
index ed3c716288d6..2bbab0c6bb0d 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll
@@ -26,7 +26,7 @@
 ; COMBINED:       <GLOBALVAL_SUMMARY_BLOCK
 ; COMBINED-NEXT:    <VERSION
 ; COMBINED-NEXT:    <FLAGS
-; COMBINED-NEXT:    <VALUE_GUID {{.*}} op0=[[FUNCID:[0-9]+]] op1=1697143370 op2=1603531901/>
+; COMBINED-NEXT:    <VALUE_GUID op0=[[FUNCID:[0-9]+]] op1=7289175272376759421/>
 ; COMBINED-NEXT:    <VALUE_GUID
 ; COMBINED-NEXT:    <COMBINED
 ; See if the call to func is registered, using the expected hotness type.
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
index 576261e5392b..563fb18107d3 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
@@ -47,7 +47,7 @@
 ; CHECK-LABEL:       <GLOBALVAL_SUMMARY_BLOCK
 ; CHECK-NEXT:    <VERSION
 ; CHECK-NEXT:    <FLAGS
-; CHECK-NEXT:    <VALUE_GUID {{.*}} op0=25 op1=0 op2=123/>
+; CHECK-NEXT:    <VALUE_GUID op0=25 op1=123/>
 ; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123
 ; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op7=1 op8=3 op9=5 op10=1 op11=2 op12=3 op13=4 op14=1 op15=6 op16=2 op17=3 op18=3 op19=7 op20=2 op21=8 op22=2 op23=25 op24=4/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
index 2e9b362d39bb..601bebd39267 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
@@ -30,7 +30,7 @@
 ; CHECK-LABEL:       <GLOBALVAL_SUMMARY_BLOCK
 ; CHECK-NEXT:    <VERSION
 ; CHECK-NEXT:    <FLAGS
-; CHECK-NEXT:    <VALUE_GUID {{.*}} op0=26 op1=0 op2=123/>
+; CHECK-NEXT:    <VALUE_GUID op0=26 op1=123/>
 ; op4=none1 op6=hot1 op8=cold1 op10=none2 op12=hot2 op14=cold2 op16=none3 op18=hot3 op20=cold3 op22=123
 ; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op7=7 op8=0 op9=1 op10=3 op11=4 op12=1 op13=8 op14=0 op15=2 op16=3 op17=5 op18=1 op19=9 op20=0 op21=3 op22=3 op23=6 op24=1 op25=26 op26=4/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph.ll
index becbc4a32dd9..542b400f8e33 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph.ll
@@ -30,7 +30,7 @@
 ; COMBINED-NEXT:    <FLAGS
 ; Only 2 VALUE_GUID since reference to undefinedglob should not be included in
 ; combined index.
-; COMBINED-NEXT:    <VALUE_GUID {{.*}} op0=[[FUNCID:[0-9]+]] op1=1697143370 op2=1603531901/>
+; COMBINED-NEXT:    <VALUE_GUID op0=[[FUNCID:[0-9]+]] op1=7289175272376759421/>
 ; COMBINED-NEXT:    <VALUE_GUID
 ; COMBINED-NEXT:    <COMBINED_PROFILE
 ; See if the call to func is registered.
diff --git a/llvm/test/Bitcode/thinlto-function-summary-originalnames.ll b/llvm/test/Bitcode/thinlto-function-summary-originalnames.ll
index 306eed8ec9ae..0139f00b4aa3 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-originalnames.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-originalnames.ll
@@ -6,9 +6,9 @@
 ; COMBINED:       <GLOBALVAL_SUMMARY_BLOCK
 ; COMBINED-NEXT:    <VERSION
 ; COMBINED-NEXT:    <FLAGS
-; COMBINED-NEXT:    <VALUE_GUID {{.*}} op1=159893130 op2=1103175344/>
-; COMBINED-NEXT:    <VALUE_GUID {{.*}} op1=1049484794 op2=2739878751/>
-; COMBINED-NEXT:    <VALUE_GUID {{.*}} op1=2404717469 op2=2695872723/>
+; COMBINED-NEXT:    <VALUE_GUID {{.*}} op1=686735765308251824/>
+; COMBINED-NEXT:    <VALUE_GUID {{.*}} op1=4507502870619175775/>
+; COMBINED-NEXT:    <VALUE_GUID {{.*}} op1=-8118561185538785069/>
 ; COMBINED-DAG:    <COMBINED_PROFILE{{ }}
 ; COMBINED-DAG:    <COMBINED_ORIGINAL_NAME op0=-2012135647395072713/>
 ; COMBINED-DAG:    <COMBINED_GLOBALVAR_INIT_REFS
diff --git a/llvm/test/Bitcode/thinlto-function-summary-paramaccess.ll b/llvm/test/Bitcode/thinlto-function-summary-paramaccess.ll
index 9efe794580e1..f8e6476ea3e8 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-paramaccess.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-paramaccess.ll
@@ -286,55 +286,55 @@ entry:
 
 
 ; COMBINED: <FLAGS op0=0/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=[[CALLEE1:1]] op1=16929164 op2=2901240562/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=[[CALLEE2:2]] op1=209731497 op2=362741704/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=3 op1=250424426 op2=1162038428/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=4 op1=330115482 op2=2111435476/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=5 op1=686623312 op2=3842916447/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=6 op1=973226983 op2=3211083905/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=7 op1=1290060147 op2=3622505949/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=8 op1=1440541236 op2=767101629/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=9 op1=1561683677 op2=160047780/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=10 op1=1958554144 op2=1633580483/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=11 op1=2395229756 op2=192361927/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=12 op1=2758440458 op2=1889048978/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=13 op1=3065148999 op2=3004927693/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=14 op1=3553425976 op2=2012721546/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=15 op1=3654668137 op2=2609615749/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=16 op1=3762448991 op2=3668708854/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=17 op1=3849363762 op2=364807594/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=18 op1=3877572794 op2=2713121666/>
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=19 op1=3993142988 op2=2149688628/>
+; COMBINED-NEXT: <VALUE_GUID op0=[[CALLEE1:1]] op1=72710208629861106/>
+; COMBINED-NEXT: <VALUE_GUID op0=[[CALLEE2:2]] op1=900789920918863816/>
+; COMBINED-NEXT: <VALUE_GUID op0=3 op1=1075564720951610524/>
+; COMBINED-NEXT: <VALUE_GUID op0=4 op1=1417835201204712148/>
+; COMBINED-NEXT: <VALUE_GUID op0=5 op1=2949024673554120799/>
+; COMBINED-NEXT: <VALUE_GUID op0=6 op1=4179978066780831873/>
+; COMBINED-NEXT: <VALUE_GUID op0=7 op1=5540766144860458461/>
+; COMBINED-NEXT: <VALUE_GUID op0=8 op1=6187077497926519485/>
+; COMBINED-NEXT: <VALUE_GUID op0=9 op1=6707380319572075172/>
+; COMBINED-NEXT: <VALUE_GUID op0=10 op1=8411925997558855107/>
+; COMBINED-NEXT: <VALUE_GUID op0=11 op1=-8159310605091129913/>
+; COMBINED-NEXT: <VALUE_GUID op0=12 op1=-6599332516747241070/>
+; COMBINED-NEXT: <VALUE_GUID op0=13 op1=-5282029362632487219/>
+; COMBINED-NEXT: <VALUE_GUID op0=14 op1=-3184895716019949174/>
+; COMBINED-NEXT: <VALUE_GUID op0=15 op1=-2750063944951688315/>
+; COMBINED-NEXT: <VALUE_GUID op0=16 op1=-2287148700827644426/>
+; COMBINED-NEXT: <VALUE_GUID op0=17 op1=-1913852605147216470/>
+; COMBINED-NEXT: <VALUE_GUID op0=18 op1=-1792695732907084926/>
+; COMBINED-NEXT: <VALUE_GUID op0=19 op1=-1296325529848142540/>
 ; COMBINED-NEXT: <PARAM_ACCESS op0=1 op1=0 op2=0 op3=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=1
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=1
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=0 op3=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=2
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=2
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=0 op3=1 op4=0 op5=[[CALLEE2]] op6=4 op7=6/>
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=3
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=3
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=24 op2=32 op3=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=4
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=4
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=2 op3=0 op4=2 op5=0 op6=8 op7=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=5
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=6
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=5
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=6
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=2 op3=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=7
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=7
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=2 op3=0 op4=1 op5=0 op6=8 op7=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=8
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=9
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=8
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=9
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=0 op3=1 op4=0 op5=[[CALLEE2]] op6=0 op7=2/>
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=10
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=11
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=10
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=11
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=113 op2=97 op3=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=12
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=13
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=12
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=13
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=0 op3=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=14
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=15
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=14
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=15
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=1 op2=-2 op3=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=16
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=16
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=0 op3=1 op4=0 op5=[[CALLEE2]] op6=1431 op7=1429/>
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=17
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=17
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=0 op3=2 op4=0 op5=[[CALLEE2]] op6=1431 op7=250 op8=1 op9=[[CALLEE1]] op10=67 op11=65/>
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=18
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=18
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=0 op3=1 op4=0 op5=[[CALLEE2]] op6=1431 op7=250/>
-; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=19
+; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=19
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-bitreverse-shift.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-bitreverse-shift.ll
new file mode 100644
index 000000000000..b9fbe2379a42
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-bitreverse-shift.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel | FileCheck %s
+
+; These tests can be optimised
+;       fold (bitreverse(srl (bitreverse c), x)) -> (shl c, x)
+;       fold (bitreverse(shl (bitreverse c), x)) -> (srl c, x)
+
+declare i8 @llvm.bitreverse.i8(i8)
+declare i16 @llvm.bitreverse.i16(i16)
+declare i32 @llvm.bitreverse.i32(i32)
+declare i64 @llvm.bitreverse.i64(i64)
+
+define i8 @test_bitreverse_srli_bitreverse_i8(i8 %a) nounwind {
+; CHECK-LABEL: test_bitreverse_srli_bitreverse_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl w0, w0, #3
+; CHECK-NEXT:    ret
+  %1 = call i8 @llvm.bitreverse.i8(i8 %a)
+  %2 = lshr i8 %1, 3
+  %3 = call i8 @llvm.bitreverse.i8(i8 %2)
+  ret i8 %3
+}
+
+define i16 @test_bitreverse_srli_bitreverse_i16(i16 %a) nounwind {
+; CHECK-LABEL: test_bitreverse_srli_bitreverse_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl w0, w0, #7
+; CHECK-NEXT:    ret
+  %1 = call i16 @llvm.bitreverse.i16(i16 %a)
+  %2 = lshr i16 %1, 7
+  %3 = call i16 @llvm.bitreverse.i16(i16 %2)
+  ret i16 %3
+}
+
+define i32 @test_bitreverse_srli_bitreverse_i32(i32 %a) nounwind {
+; CHECK-LABEL: test_bitreverse_srli_bitreverse_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl w0, w0, #15
+; CHECK-NEXT:    ret
+  %1 = call i32 @llvm.bitreverse.i32(i32 %a)
+  %2 = lshr i32 %1, 15
+  %3 = call i32 @llvm.bitreverse.i32(i32 %2)
+  ret i32 %3
+}
+
+define i64 @test_bitreverse_srli_bitreverse_i64(i64 %a) nounwind {
+; CHECK-LABEL: test_bitreverse_srli_bitreverse_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl x0, x0, #33
+; CHECK-NEXT:    ret
+  %1 = call i64 @llvm.bitreverse.i64(i64 %a)
+  %2 = lshr i64 %1, 33
+  %3 = call i64 @llvm.bitreverse.i64(i64 %2)
+  ret i64 %3
+}
+
+define i8 @test_bitreverse_shli_bitreverse_i8(i8 %a) nounwind {
+; CHECK-LABEL: test_bitreverse_shli_bitreverse_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx w0, w0, #3, #5
+; CHECK-NEXT:    ret
+  %1 = call i8 @llvm.bitreverse.i8(i8 %a)
+  %2 = shl i8 %1, 3
+  %3 = call i8 @llvm.bitreverse.i8(i8 %2)
+  ret i8 %3
+}
+
+define i16 @test_bitreverse_shli_bitreverse_i16(i16 %a) nounwind {
+; CHECK-LABEL: test_bitreverse_shli_bitreverse_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx w0, w0, #7, #9
+; CHECK-NEXT:    ret
+  %1 = call i16 @llvm.bitreverse.i16(i16 %a)
+  %2 = shl i16 %1, 7
+  %3 = call i16 @llvm.bitreverse.i16(i16 %2)
+  ret i16 %3
+}
+
+define i32 @test_bitreverse_shli_bitreverse_i32(i32 %a) nounwind {
+; CHECK-LABEL: test_bitreverse_shli_bitreverse_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsr w0, w0, #15
+; CHECK-NEXT:    ret
+  %1 = call i32 @llvm.bitreverse.i32(i32 %a)
+  %2 = shl i32 %1, 15
+  %3 = call i32 @llvm.bitreverse.i32(i32 %2)
+  ret i32 %3
+}
+
+define i64 @test_bitreverse_shli_bitreverse_i64(i64 %a) nounwind {
+; CHECK-LABEL: test_bitreverse_shli_bitreverse_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsr x0, x0, #33
+; CHECK-NEXT:    ret
+  %1 = call i64 @llvm.bitreverse.i64(i64 %a)
+  %2 = shl i64 %1, 33
+  %3 = call i64 @llvm.bitreverse.i64(i64 %2)
+  ret i64 %3
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
index 587d53c300f8..d5d33742148a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
@@ -571,3 +571,66 @@ body:             |
     RET_ReallyLR implicit $x0
 ...
 ---
+name:            extract_from_build_vector_shuffle_vector_undef
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: extract_from_build_vector_shuffle_vector_undef
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %extract:_(s32) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: $w0 = COPY %extract(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %arg1:_(<4 x s32>) = COPY $q0
+    %arg2:_(<4 x s32>) = COPY $q1
+    %idx:_(s64) = G_CONSTANT i64 0
+    %sv:_(<4 x s32>) = G_SHUFFLE_VECTOR %arg1(<4 x s32>), %arg2(<4 x s32>), shufflemask(-1, 0, 0, 0)
+    %extract:_(s32) = G_EXTRACT_VECTOR_ELT %sv(<4 x s32>), %idx(s64)
+    $w0 = COPY %extract(s32)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            extract_from_build_vector_shuffle_vector_opaque
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: extract_from_build_vector_shuffle_vector_opaque
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %arg1:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: %arg2:_(<4 x s32>) = COPY $q1
+    ; CHECK-NEXT: %idx:_(s64) = COPY $x1
+    ; CHECK-NEXT: %sv:_(<4 x s32>) = G_SHUFFLE_VECTOR %arg1(<4 x s32>), %arg2, shufflemask(undef, 0, 0, 0)
+    ; CHECK-NEXT: %extract:_(s32) = G_EXTRACT_VECTOR_ELT %sv(<4 x s32>), %idx(s64)
+    ; CHECK-NEXT: $w0 = COPY %extract(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %arg1:_(<4 x s32>) = COPY $q0
+    %arg2:_(<4 x s32>) = COPY $q1
+    %idx:_(s64) = COPY $x1
+    %sv:_(<4 x s32>) = G_SHUFFLE_VECTOR %arg1(<4 x s32>), %arg2(<4 x s32>), shufflemask(-1, 0, 0, 0)
+    %extract:_(s32) = G_EXTRACT_VECTOR_ELT %sv(<4 x s32>), %idx(s64)
+    $w0 = COPY %extract(s32)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            extract_from_build_vector_shuffle_vector_const
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: extract_from_build_vector_shuffle_vector_const
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %arg1:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: %extract:_(s32) = G_EXTRACT_VECTOR_ELT %arg1(<4 x s32>), [[C]](s64)
+    ; CHECK-NEXT: $w0 = COPY %extract(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %arg1:_(<4 x s32>) = COPY $q0
+    %arg2:_(<4 x s32>) = COPY $q1
+    %idx:_(s64) = G_CONSTANT i64 0
+    %sv:_(<4 x s32>) = G_SHUFFLE_VECTOR %arg1(<4 x s32>), %arg2(<4 x s32>), shufflemask(3, 0, 0, 0)
+    %extract:_(s32) = G_EXTRACT_VECTOR_ELT %sv(<4 x s32>), %idx(s64)
+    $w0 = COPY %extract(s32)
+    RET_ReallyLR implicit $x0
+...
+---
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir
new file mode 100644
index 000000000000..6eece5c56258
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir
@@ -0,0 +1,353 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s
+
+---
+name:            zext_trunc_nuw
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: zext_trunc_nuw
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: $x1 = COPY [[COPY]](s64)
+    %0:_(s64) = COPY $x0
+    %2:_(s32) = nuw G_TRUNC %0
+    %3:_(s64) = G_ZEXT  %2
+    $x1 = COPY %3
+...
+---
+name:            zext_trunc_nsw
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: zext_trunc_nsw
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = nsw G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
+    ; CHECK-NEXT: $x1 = COPY [[ZEXT]](s64)
+    %0:_(s64) = COPY $x0
+    %2:_(s32) = nsw G_TRUNC %0
+    %3:_(s64) = G_ZEXT  %2
+    $x1 = COPY %3
+...
+---
+name:            zext_trunc
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: zext_trunc
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
+    ; CHECK-NEXT: $x1 = COPY [[ZEXT]](s64)
+    %0:_(s64) = COPY $x0
+    %2:_(s32) = G_TRUNC %0
+    %3:_(s64) = G_ZEXT  %2
+    $x1 = COPY %3
+...
+---
+name:            zext_trunc_nuw_vector
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: zext_trunc_nuw_vector
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: $q0 = COPY %bv0(<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = COPY $w2
+    %3:_(s32) = COPY $w3
+    %bv0:_(<4 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32), %0:_(s32), %1:_(s32)
+    %trunc:_(<4 x s16>) = nuw G_TRUNC %bv0
+    %zext:_(<4 x s32>) = G_ZEXT  %trunc
+    $q0 = COPY %zext(<4 x s32>)
+    RET_ReallyLR implicit $w0
+...
+---
+name:            sext_trunc_nsw
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: sext_trunc_nsw
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: $x1 = COPY [[COPY]](s64)
+    %0:_(s64) = COPY $x0
+    %2:_(s32) = nsw G_TRUNC %0
+    %3:_(s64) = G_SEXT  %2
+    $x1 = COPY %3
+...
+---
+name:            sext_trunc_nuw
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: sext_trunc_nuw
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = nuw G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s32)
+    ; CHECK-NEXT: $x1 = COPY [[SEXT]](s64)
+    %0:_(s64) = COPY $x0
+    %2:_(s32) = nuw G_TRUNC %0
+    %3:_(s64) = G_SEXT  %2
+    $x1 = COPY %3
+...
+---
+name:            sext_trunc
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: sext_trunc
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s32)
+    ; CHECK-NEXT: $x1 = COPY [[SEXT]](s64)
+    %0:_(s64) = COPY $x0
+    %2:_(s32) = G_TRUNC %0
+    %3:_(s64) = G_SEXT  %2
+    $x1 = COPY %3
+...
+---
+name:            sext_trunc_nsw_types_wrong
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: sext_trunc_nsw_types_wrong
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = nsw G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: $w1 = COPY [[TRUNC]](s32)
+    %0:_(s64) = COPY $x0
+    %2:_(s16) = nsw G_TRUNC %0
+    %3:_(s32) = G_SEXT  %2
+    $w1 = COPY %3
+...
+---
+name:            sext_trunc_nsw_nuw
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: sext_trunc_nsw_nuw
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: $x1 = COPY [[COPY]](s64)
+    %0:_(s64) = COPY $x0
+    %2:_(s32) = nsw nuw G_TRUNC %0
+    %3:_(s64) = G_SEXT  %2
+    $x1 = COPY %3
+...
+---
+name:            sext_trunc_nsw_nuw_vector
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: sext_trunc_nsw_nuw_vector
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
+    ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+    ; CHECK-NEXT: $q0 = COPY %bv0(<4 x s32>)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = COPY $w2
+    %3:_(s32) = COPY $w3
+    %bv0:_(<4 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32), %2:_(s32), %3:_(s32)
+    %t:_(<4 x s16>) = nsw nuw G_TRUNC %bv0
+    %s:_(<4 x s32>) = G_SEXT  %t
+    $q0 = COPY %s
+...
+---
+name:            zext_trunc_vector
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: zext_trunc_vector
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
+    ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+    ; CHECK-NEXT: %t:_(<4 x s16>) = G_TRUNC %bv0(<4 x s32>)
+    ; CHECK-NEXT: %z:_(<4 x s32>) = G_ZEXT %t(<4 x s16>)
+    ; CHECK-NEXT: $q0 = COPY %z(<4 x s32>)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = COPY $w2
+    %3:_(s32) = COPY $w3
+    %bv0:_(<4 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32), %2:_(s32), %3:_(s32)
+    %t:_(<4 x s16>) = G_TRUNC %bv0
+    %z:_(<4 x s32>) = G_ZEXT  %t
+    $q0 = COPY %z
+...
+---
+name:            zext_trunc_nsw_vector
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: zext_trunc_nsw_vector
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
+    ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+    ; CHECK-NEXT: %t:_(<4 x s16>) = nsw G_TRUNC %bv0(<4 x s32>)
+    ; CHECK-NEXT: %z:_(<4 x s32>) = G_ZEXT %t(<4 x s16>)
+    ; CHECK-NEXT: $q0 = COPY %z(<4 x s32>)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = COPY $w2
+    %3:_(s32) = COPY $w3
+    %bv0:_(<4 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32), %2:_(s32), %3:_(s32)
+    %t:_(<4 x s16>) = nsw G_TRUNC %bv0
+    %z:_(<4 x s32>) = G_ZEXT  %t
+    $q0 = COPY %z
+...
+---
+name:            zext_trunc_nuw_vector2
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: zext_trunc_nuw_vector2
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
+    ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+    ; CHECK-NEXT: $q0 = COPY %bv0(<4 x s32>)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = COPY $w2
+    %3:_(s32) = COPY $w3
+    %bv0:_(<4 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32), %2:_(s32), %3:_(s32)
+    %t:_(<4 x s16>) = nuw G_TRUNC %bv0
+    %z:_(<4 x s32>) = G_ZEXT  %t
+    $q0 = COPY %z
+...
+---
+name:            zext_trunc_nuw_vector_wrong_type
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: zext_trunc_nuw_vector_wrong_type
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK-NEXT: %bv0:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY1]](s64)
+    ; CHECK-NEXT: %z:_(<2 x s32>) = nuw G_TRUNC %bv0(<2 x s64>)
+    ; CHECK-NEXT: $d0 = COPY %z(<2 x s32>)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %bv0:_(<2 x s64>) = G_BUILD_VECTOR %0:_(s64), %1:_(s64)
+    %t:_(<2 x s16>) = nuw G_TRUNC %bv0
+    %z:_(<2 x s32>) = G_ZEXT  %t
+    $d0 = COPY %z
+...
+---
+name:            zext_trunc_nuw_scalable_vector
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: zext_trunc_nuw_scalable_vector
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %sv0:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[COPY]](s64)
+    ; CHECK-NEXT: $z0 = COPY %sv0(<vscale x 2 x s64>)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %sv0:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR %0:_(s64)
+    %t:_(<vscale x 2 x s32>) = nuw G_TRUNC %sv0
+    %z:_(<vscale x 2 x s64>) = G_ZEXT  %t
+    $z0 = COPY %z
+...
+---
+name:            zext_trunc_nuw_to_zext
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: zext_trunc_nuw_to_zext
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %2:_(s64) = nneg G_ZEXT [[COPY]](s32)
+    ; CHECK-NEXT: $x1 = COPY %2(s64)
+    %0:_(s32) = COPY $w0
+    %2:_(s16) = nuw G_TRUNC %0
+    %3:_(s64) = G_ZEXT  %2
+    $x1 = COPY %3
+...
+---
+name:            zext_trunc_nuw_to_trunc
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: zext_trunc_nuw_to_trunc
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = nuw G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: $w1 = COPY [[TRUNC]](s32)
+    %0:_(s64) = COPY $x0
+    %2:_(s16) = nuw G_TRUNC %0
+    %3:_(s32) = G_ZEXT  %2
+    $w1 = COPY %3
+...
+---
+name:            sext_trunc_nsw_to_sext
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: sext_trunc_nsw_to_sext
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY]](s32)
+    ; CHECK-NEXT: $x1 = COPY [[SEXT]](s64)
+    %0:_(s32) = COPY $w0
+    %2:_(s16) = nsw G_TRUNC %0
+    %3:_(s64) = G_SEXT  %2
+    $x1 = COPY %3
+...
+---
+name:            sext_trunc_nsw_to_trunc
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: sext_trunc_nsw_to_trunc
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = nsw G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: $w1 = COPY [[TRUNC]](s32)
+    %0:_(s64) = COPY $x0
+    %2:_(s16) = nsw G_TRUNC %0
+    %3:_(s32) = G_SEXT  %2
+    $w1 = COPY %3
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ptr-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ptr-add.mir
index 1ecd36b55380..1d3f7eab79d6 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ptr-add.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ptr-add.mir
@@ -6,12 +6,65 @@ body:             |
   bb.0.entry:
     ; CHECK-LABEL: name: test_ptr_add_vec_p0
     ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p0>) = COPY $q0
-    ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
-    ; CHECK: [[PTR_ADD:%[0-9]+]]:_(<2 x p0>) = G_PTR_ADD [[COPY]], [[COPY1]](<2 x s64>)
-    ; CHECK: $q0 = COPY [[PTR_ADD]](<2 x p0>)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(<2 x p0>) = G_PTR_ADD [[COPY]], [[COPY1]](<2 x s64>)
+    ; CHECK-NEXT: $q0 = COPY [[PTR_ADD]](<2 x p0>)
     %0:_(<2 x p0>) = COPY $q0
     %1:_(<2 x s64>) = COPY $q1
     %3:_(<2 x p0>) = G_PTR_ADD %0, %1(<2 x s64>)
     $q0 = COPY %3(<2 x p0>)
 
 ...
+---
+name:            test_ptr_add_vec_4xp0
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: test_ptr_add_vec_4xp0
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p0) = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $x4
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY $x5
+    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY $x6
+    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s64) = COPY $x7
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[COPY]](p0), [[COPY1]](p0)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[COPY2]](p0), [[COPY3]](p0)
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY4]](s64), [[COPY5]](s64)
+    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY6]](s64), [[COPY7]](s64)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(<2 x p0>) = G_PTR_ADD [[BUILD_VECTOR]], [[BUILD_VECTOR2]](<2 x s64>)
+    ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(<2 x p0>) = G_PTR_ADD [[BUILD_VECTOR1]], [[BUILD_VECTOR3]](<2 x s64>)
+    ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: %one:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: %extract0:_(p0) = G_EXTRACT_VECTOR_ELT [[PTR_ADD]](<2 x p0>), %zero(s64)
+    ; CHECK-NEXT: %extract1:_(p0) = G_EXTRACT_VECTOR_ELT [[PTR_ADD]](<2 x p0>), %one(s64)
+    ; CHECK-NEXT: %extract2:_(p0) = G_EXTRACT_VECTOR_ELT [[PTR_ADD1]](<2 x p0>), %zero(s64)
+    ; CHECK-NEXT: %extract3:_(p0) = G_EXTRACT_VECTOR_ELT [[PTR_ADD1]](<2 x p0>), %one(s64)
+    ; CHECK-NEXT: $x0 = COPY %extract0(p0)
+    ; CHECK-NEXT: $x1 = COPY %extract1(p0)
+    ; CHECK-NEXT: $x2 = COPY %extract2(p0)
+    ; CHECK-NEXT: $x3 = COPY %extract3(p0)
+    %0:_(p0) = COPY $x0
+    %1:_(p0) = COPY $x1
+    %2:_(p0) = COPY $x2
+    %3:_(p0) = COPY $x3
+    %4:_(s64) = COPY $x4
+    %5:_(s64) = COPY $x5
+    %6:_(s64) = COPY $x6
+    %7:_(s64) = COPY $x7
+    %ptr:_(<4 x p0>) = G_BUILD_VECTOR %0(p0), %1(p0), %2(p0), %3(p0)
+    %add:_(<4 x s64>) = G_BUILD_VECTOR %4(s64), %5(s64), %6(s64), %7(s64)
+    %res:_(<4 x p0>) = G_PTR_ADD %ptr, %add(<4 x s64>)
+    %zero:_(s64) = G_CONSTANT i64 0
+    %one:_(s64) = G_CONSTANT i64 1
+    %two:_(s64) = G_CONSTANT i64 2
+    %three:_(s64) = G_CONSTANT i64 3
+    %extract0:_(p0) = G_EXTRACT_VECTOR_ELT %res(<4 x p0>), %zero(s64)
+    %extract1:_(p0) = G_EXTRACT_VECTOR_ELT %res(<4 x p0>), %one(s64)
+    %extract2:_(p0) = G_EXTRACT_VECTOR_ELT %res(<4 x p0>), %two(s64)
+    %extract3:_(p0) = G_EXTRACT_VECTOR_ELT %res(<4 x p0>), %three(s64)
+    $x0 = COPY %extract0(p0)
+    $x1 = COPY %extract1(p0)
+    $x2 = COPY %extract2(p0)
+    $x3 = COPY %extract3(p0)
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 20133158e4fa..d71111b57efe 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -674,6 +674,9 @@
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. the first uncovered type index: 1, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: G_FTAN (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: G_FSQRT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-ext.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-ext.mir
index a6c37f631ca2..3a8357db311c 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-ext.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-ext.mir
@@ -302,7 +302,7 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %v1:_(<2 x s64>) = COPY $q0
     ; CHECK-NEXT: %v2:_(<2 x s64>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: %shuf:_(<2 x s64>) = G_ZIP2 %v1, %v2
+    ; CHECK-NEXT: %shuf:_(<2 x s64>) = G_TRN2 %v1, %v2
     ; CHECK-NEXT: $q0 = COPY %shuf(<2 x s64>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %v1:_(<2 x s64>) = COPY $q0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-cmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-cmp.mir
index 4151f7ecb3ea..df4e7ddaac8b 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-cmp.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-cmp.mir
@@ -361,8 +361,8 @@ body:             |
     ; CHECK-NEXT: %cmp_lhs:fpr128 = COPY $q0
     ; CHECK-NEXT: %cmp_rhs:fpr128 = COPY $q1
     ; CHECK-NEXT: %add_lhs:fpr128 = COPY $q2
-    ; CHECK-NEXT: [[CMEQv4i32_:%[0-9]+]]:fpr128 = CMEQv4i32 %cmp_lhs, %cmp_rhs
-    ; CHECK-NEXT: %add:fpr128 = ADDv4i32 %add_lhs, [[CMEQv4i32_]]
+    ; CHECK-NEXT: %cmp:fpr128 = CMEQv4i32 %cmp_lhs, %cmp_rhs
+    ; CHECK-NEXT: %add:fpr128 = ADDv4i32 %add_lhs, %cmp
     ; CHECK-NEXT: $q0 = COPY %add
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %cmp_lhs:fpr(<4 x s32>) = COPY $q0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-vector-icmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-vector-icmp.mir
index 21e84ecaed32..7884d9e1b1d7 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-vector-icmp.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-vector-icmp.mir
@@ -46,46 +46,6 @@
     ret <8 x i1> %cmp
   }
 
-  define <2 x i1> @test_v2i64_ne(<2 x i64> %v1, <2 x i64> %v2) {
-    %cmp = icmp ne <2 x i64> %v1, %v2
-    ret <2 x i1> %cmp
-  }
-
-  define <4 x i1> @test_v4i32_ne(<4 x i32> %v1, <4 x i32> %v2) {
-    %cmp = icmp ne <4 x i32> %v1, %v2
-    ret <4 x i1> %cmp
-  }
-
-  define <2 x i1> @test_v2i32_ne(<2 x i32> %v1, <2 x i32> %v2) {
-    %cmp = icmp ne <2 x i32> %v1, %v2
-    ret <2 x i1> %cmp
-  }
-
-  define <2 x i1> @test_v2i16_ne(<2 x i16> %v1, <2 x i16> %v2) {
-    %cmp = icmp ne <2 x i16> %v1, %v2
-    ret <2 x i1> %cmp
-  }
-
-  define <8 x i1> @test_v8i16_ne(<8 x i16> %v1, <8 x i16> %v2) {
-    %cmp = icmp ne <8 x i16> %v1, %v2
-    ret <8 x i1> %cmp
-  }
-
-  define <4 x i1> @test_v4i16_ne(<4 x i16> %v1, <4 x i16> %v2) {
-    %cmp = icmp ne <4 x i16> %v1, %v2
-    ret <4 x i1> %cmp
-  }
-
-  define <16 x i1> @test_v16i8_ne(<16 x i8> %v1, <16 x i8> %v2) {
-    %cmp = icmp ne <16 x i8> %v1, %v2
-    ret <16 x i1> %cmp
-  }
-
-  define <8 x i1> @test_v8i8_ne(<8 x i8> %v1, <8 x i8> %v2) {
-    %cmp = icmp ne <8 x i8> %v1, %v2
-    ret <8 x i1> %cmp
-  }
-
   define <2 x i1> @test_v2i64_ugt(<2 x i64> %v1, <2 x i64> %v2) {
     %cmp = icmp ugt <2 x i64> %v1, %v2
     ret <2 x i1> %cmp
@@ -698,304 +658,6 @@ body:             |
 
 ...
 ---
-name:            test_v2i64_ne
-alignment:       4
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: _ }
-  - { id: 3, class: fpr }
-  - { id: 4, class: fpr }
-machineFunctionInfo: {}
-body:             |
-  bb.1 (%ir-block.0):
-    liveins: $q0, $q1
-
-    ; CHECK-LABEL: name: test_v2i64_ne
-    ; CHECK: liveins: $q0, $q1
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
-    ; CHECK-NEXT: [[CMEQv2i64_:%[0-9]+]]:fpr128 = CMEQv2i64 [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[NOTv16i8_:%[0-9]+]]:fpr128 = NOTv16i8 [[CMEQv2i64_]]
-    ; CHECK-NEXT: [[XTNv2i32_:%[0-9]+]]:fpr64 = XTNv2i32 [[NOTv16i8_]]
-    ; CHECK-NEXT: $d0 = COPY [[XTNv2i32_]]
-    ; CHECK-NEXT: RET_ReallyLR implicit $d0
-    %0:fpr(<2 x s64>) = COPY $q0
-    %1:fpr(<2 x s64>) = COPY $q1
-    %4:fpr(<2 x s64>) = G_ICMP intpred(ne), %0(<2 x s64>), %1
-    %3:fpr(<2 x s32>) = G_TRUNC %4(<2 x s64>)
-    $d0 = COPY %3(<2 x s32>)
-    RET_ReallyLR implicit $d0
-
-...
----
-name:            test_v4i32_ne
-alignment:       4
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: _ }
-  - { id: 3, class: fpr }
-  - { id: 4, class: fpr }
-machineFunctionInfo: {}
-body:             |
-  bb.1 (%ir-block.0):
-    liveins: $q0, $q1
-
-    ; CHECK-LABEL: name: test_v4i32_ne
-    ; CHECK: liveins: $q0, $q1
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
-    ; CHECK-NEXT: [[CMEQv4i32_:%[0-9]+]]:fpr128 = CMEQv4i32 [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[NOTv16i8_:%[0-9]+]]:fpr128 = NOTv16i8 [[CMEQv4i32_]]
-    ; CHECK-NEXT: [[XTNv4i16_:%[0-9]+]]:fpr64 = XTNv4i16 [[NOTv16i8_]]
-    ; CHECK-NEXT: $d0 = COPY [[XTNv4i16_]]
-    ; CHECK-NEXT: RET_ReallyLR implicit $d0
-    %0:fpr(<4 x s32>) = COPY $q0
-    %1:fpr(<4 x s32>) = COPY $q1
-    %4:fpr(<4 x s32>) = G_ICMP intpred(ne), %0(<4 x s32>), %1
-    %3:fpr(<4 x s16>) = G_TRUNC %4(<4 x s32>)
-    $d0 = COPY %3(<4 x s16>)
-    RET_ReallyLR implicit $d0
-
-...
----
-name:            test_v2i32_ne
-alignment:       4
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: _ }
-  - { id: 3, class: fpr }
-  - { id: 4, class: fpr }
-machineFunctionInfo: {}
-body:             |
-  bb.1 (%ir-block.0):
-    liveins: $d0, $d1
-
-    ; CHECK-LABEL: name: test_v2i32_ne
-    ; CHECK: liveins: $d0, $d1
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
-    ; CHECK-NEXT: [[CMEQv2i32_:%[0-9]+]]:fpr64 = CMEQv2i32 [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[NOTv8i8_:%[0-9]+]]:fpr64 = NOTv8i8 [[CMEQv2i32_]]
-    ; CHECK-NEXT: $d0 = COPY [[NOTv8i8_]]
-    ; CHECK-NEXT: RET_ReallyLR implicit $d0
-    %0:fpr(<2 x s32>) = COPY $d0
-    %1:fpr(<2 x s32>) = COPY $d1
-    %4:fpr(<2 x s32>) = G_ICMP intpred(ne), %0(<2 x s32>), %1
-    %3:fpr(<2 x s32>) = COPY %4(<2 x s32>)
-    $d0 = COPY %3(<2 x s32>)
-    RET_ReallyLR implicit $d0
-
-...
----
-name:            test_v2i16_ne
-alignment:       4
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: fpr }
-  - { id: 3, class: fpr }
-  - { id: 4, class: _ }
-  - { id: 5, class: fpr }
-  - { id: 6, class: _ }
-  - { id: 7, class: fpr }
-  - { id: 8, class: fpr }
-  - { id: 9, class: fpr }
-  - { id: 10, class: gpr }
-  - { id: 11, class: fpr }
-  - { id: 12, class: fpr }
-  - { id: 13, class: gpr }
-  - { id: 14, class: fpr }
-  - { id: 15, class: fpr }
-machineFunctionInfo: {}
-body:             |
-  bb.1 (%ir-block.0):
-    liveins: $d0, $d1
-
-    ; CHECK-LABEL: name: test_v2i16_ne
-    ; CHECK: liveins: $d0, $d1
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
-    ; CHECK-NEXT: [[MOVID:%[0-9]+]]:fpr64 = MOVID 51
-    ; CHECK-NEXT: [[ANDv8i8_:%[0-9]+]]:fpr64 = ANDv8i8 [[COPY]], [[MOVID]]
-    ; CHECK-NEXT: [[MOVID1:%[0-9]+]]:fpr64 = MOVID 51
-    ; CHECK-NEXT: [[ANDv8i8_1:%[0-9]+]]:fpr64 = ANDv8i8 [[COPY1]], [[MOVID1]]
-    ; CHECK-NEXT: [[CMEQv2i32_:%[0-9]+]]:fpr64 = CMEQv2i32 [[ANDv8i8_]], [[ANDv8i8_1]]
-    ; CHECK-NEXT: [[NOTv8i8_:%[0-9]+]]:fpr64 = NOTv8i8 [[CMEQv2i32_]]
-    ; CHECK-NEXT: $d0 = COPY [[NOTv8i8_]]
-    ; CHECK-NEXT: RET_ReallyLR implicit $d0
-    %2:fpr(<2 x s32>) = COPY $d0
-    %3:fpr(<2 x s32>) = COPY $d1
-    %13:gpr(s32) = G_CONSTANT i32 65535
-    %14:fpr(<2 x s32>) = G_BUILD_VECTOR %13(s32), %13(s32)
-    %15:fpr(<2 x s32>) = COPY %2(<2 x s32>)
-    %7:fpr(<2 x s32>) = G_AND %15, %14
-    %10:gpr(s32) = G_CONSTANT i32 65535
-    %11:fpr(<2 x s32>) = G_BUILD_VECTOR %10(s32), %10(s32)
-    %12:fpr(<2 x s32>) = COPY %3(<2 x s32>)
-    %8:fpr(<2 x s32>) = G_AND %12, %11
-    %9:fpr(<2 x s32>) = G_ICMP intpred(ne), %7(<2 x s32>), %8
-    %5:fpr(<2 x s32>) = COPY %9(<2 x s32>)
-    $d0 = COPY %5(<2 x s32>)
-    RET_ReallyLR implicit $d0
-
-...
----
-name:            test_v8i16_ne
-alignment:       4
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: _ }
-  - { id: 3, class: fpr }
-  - { id: 4, class: fpr }
-machineFunctionInfo: {}
-body:             |
-  bb.1 (%ir-block.0):
-    liveins: $q0, $q1
-
-    ; CHECK-LABEL: name: test_v8i16_ne
-    ; CHECK: liveins: $q0, $q1
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
-    ; CHECK-NEXT: [[CMEQv8i16_:%[0-9]+]]:fpr128 = CMEQv8i16 [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[NOTv16i8_:%[0-9]+]]:fpr128 = NOTv16i8 [[CMEQv8i16_]]
-    ; CHECK-NEXT: [[XTNv8i8_:%[0-9]+]]:fpr64 = XTNv8i8 [[NOTv16i8_]]
-    ; CHECK-NEXT: $d0 = COPY [[XTNv8i8_]]
-    ; CHECK-NEXT: RET_ReallyLR implicit $d0
-    %0:fpr(<8 x s16>) = COPY $q0
-    %1:fpr(<8 x s16>) = COPY $q1
-    %4:fpr(<8 x s16>) = G_ICMP intpred(ne), %0(<8 x s16>), %1
-    %3:fpr(<8 x s8>) = G_TRUNC %4(<8 x s16>)
-    $d0 = COPY %3(<8 x s8>)
-    RET_ReallyLR implicit $d0
-
-...
----
-name:            test_v4i16_ne
-alignment:       4
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: _ }
-  - { id: 3, class: fpr }
-  - { id: 4, class: fpr }
-machineFunctionInfo: {}
-body:             |
-  bb.1 (%ir-block.0):
-    liveins: $d0, $d1
-
-    ; CHECK-LABEL: name: test_v4i16_ne
-    ; CHECK: liveins: $d0, $d1
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
-    ; CHECK-NEXT: [[CMEQv4i16_:%[0-9]+]]:fpr64 = CMEQv4i16 [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[NOTv8i8_:%[0-9]+]]:fpr64 = NOTv8i8 [[CMEQv4i16_]]
-    ; CHECK-NEXT: $d0 = COPY [[NOTv8i8_]]
-    ; CHECK-NEXT: RET_ReallyLR implicit $d0
-    %0:fpr(<4 x s16>) = COPY $d0
-    %1:fpr(<4 x s16>) = COPY $d1
-    %4:fpr(<4 x s16>) = G_ICMP intpred(ne), %0(<4 x s16>), %1
-    %3:fpr(<4 x s16>) = COPY %4(<4 x s16>)
-    $d0 = COPY %3(<4 x s16>)
-    RET_ReallyLR implicit $d0
-
-...
----
-name:            test_v16i8_ne
-alignment:       4
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: _ }
-  - { id: 3, class: fpr }
-  - { id: 4, class: fpr }
-machineFunctionInfo: {}
-body:             |
-  bb.1 (%ir-block.0):
-    liveins: $q0, $q1
-
-    ; CHECK-LABEL: name: test_v16i8_ne
-    ; CHECK: liveins: $q0, $q1
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
-    ; CHECK-NEXT: [[CMEQv16i8_:%[0-9]+]]:fpr128 = CMEQv16i8 [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[NOTv16i8_:%[0-9]+]]:fpr128 = NOTv16i8 [[CMEQv16i8_]]
-    ; CHECK-NEXT: $q0 = COPY [[NOTv16i8_]]
-    ; CHECK-NEXT: RET_ReallyLR implicit $q0
-    %0:fpr(<16 x s8>) = COPY $q0
-    %1:fpr(<16 x s8>) = COPY $q1
-    %4:fpr(<16 x s8>) = G_ICMP intpred(ne), %0(<16 x s8>), %1
-    %3:fpr(<16 x s8>) = COPY %4(<16 x s8>)
-    $q0 = COPY %3(<16 x s8>)
-    RET_ReallyLR implicit $q0
-
-...
----
-name:            test_v8i8_ne
-alignment:       4
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: _ }
-  - { id: 3, class: fpr }
-  - { id: 4, class: fpr }
-machineFunctionInfo: {}
-body:             |
-  bb.1 (%ir-block.0):
-    liveins: $d0, $d1
-
-    ; CHECK-LABEL: name: test_v8i8_ne
-    ; CHECK: liveins: $d0, $d1
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
-    ; CHECK-NEXT: [[CMEQv8i8_:%[0-9]+]]:fpr64 = CMEQv8i8 [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[NOTv8i8_:%[0-9]+]]:fpr64 = NOTv8i8 [[CMEQv8i8_]]
-    ; CHECK-NEXT: $d0 = COPY [[NOTv8i8_]]
-    ; CHECK-NEXT: RET_ReallyLR implicit $d0
-    %0:fpr(<8 x s8>) = COPY $d0
-    %1:fpr(<8 x s8>) = COPY $d1
-    %4:fpr(<8 x s8>) = G_ICMP intpred(ne), %0(<8 x s8>), %1
-    %3:fpr(<8 x s8>) = COPY %4(<8 x s8>)
-    $d0 = COPY %3(<8 x s8>)
-    RET_ReallyLR implicit $d0
-
-...
----
 name:            test_v2i64_ugt
 alignment:       4
 legalized:       true
diff --git a/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll b/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
index ba1ad9ba989c..1debf0256467 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
@@ -3,18 +3,18 @@
 declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>)
 
 ; CHECK-LABEL: fun1:
-; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 define i32 @fun1() {
 entry:
   %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> <i8 0, i8 16, i8 19, i8 4, i8 -65, i8 -65, i8 -71, i8 -71, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> undef)
-  %vuzp.i212.1 = shufflevector <16 x i8> %vtbl1.i.1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-  %scevgep = getelementptr <8 x i8>, ptr undef, i64 1
-  store <8 x i8> %vuzp.i212.1, ptr %scevgep, align 1
+  %vuzp.i212.1 = shufflevector <16 x i8> %vtbl1.i.1, <16 x i8> %vtbl1.i.1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %scevgep = getelementptr <16 x i8>, ptr undef, i64 1
+  store <16 x i8> %vuzp.i212.1, ptr %scevgep, align 1
   ret i32 undef
 }
 
 ; CHECK-LABEL: fun2:
-; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 define i32 @fun2() {
 entry:
   %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> <i8 0, i8 16, i8 19, i8 4, i8 -65, i8 -65, i8 -71, i8 -71, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> undef)
diff --git a/llvm/test/CodeGen/AArch64/addimm-mulimm.ll b/llvm/test/CodeGen/AArch64/addimm-mulimm.ll
index cc6523d1bb1d..6636813eb250 100644
--- a/llvm/test/CodeGen/AArch64/addimm-mulimm.ll
+++ b/llvm/test/CodeGen/AArch64/addimm-mulimm.ll
@@ -4,8 +4,8 @@
 define i64 @addimm_mulimm_accept_00(i64 %a) {
 ; CHECK-LABEL: addimm_mulimm_accept_00:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #37
-; CHECK-NEXT:    mov x9, #1147
+; CHECK-NEXT:    mov w8, #37 // =0x25
+; CHECK-NEXT:    mov x9, #1147 // =0x47b
 ; CHECK-NEXT:    madd x0, x0, x8, x9
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %a, 31
@@ -16,8 +16,8 @@ define i64 @addimm_mulimm_accept_00(i64 %a) {
 define i64 @addimm_mulimm_accept_01(i64 %a) {
 ; CHECK-LABEL: addimm_mulimm_accept_01:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #37
-; CHECK-NEXT:    mov x9, #-1147
+; CHECK-NEXT:    mov w8, #37 // =0x25
+; CHECK-NEXT:    mov x9, #-1147 // =0xfffffffffffffb85
 ; CHECK-NEXT:    madd x0, x0, x8, x9
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %a, -31
@@ -28,8 +28,8 @@ define i64 @addimm_mulimm_accept_01(i64 %a) {
 define signext i32 @addimm_mulimm_accept_02(i32 signext %a) {
 ; CHECK-LABEL: addimm_mulimm_accept_02:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #37
-; CHECK-NEXT:    mov w9, #1147
+; CHECK-NEXT:    mov w8, #37 // =0x25
+; CHECK-NEXT:    mov w9, #1147 // =0x47b
 ; CHECK-NEXT:    madd w0, w0, w8, w9
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %a, 31
@@ -40,8 +40,8 @@ define signext i32 @addimm_mulimm_accept_02(i32 signext %a) {
 define signext i32 @addimm_mulimm_accept_03(i32 signext %a) {
 ; CHECK-LABEL: addimm_mulimm_accept_03:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #37
-; CHECK-NEXT:    mov w9, #-1147
+; CHECK-NEXT:    mov w8, #37 // =0x25
+; CHECK-NEXT:    mov w9, #-1147 // =0xfffffb85
 ; CHECK-NEXT:    madd w0, w0, w8, w9
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %a, -31
@@ -52,8 +52,8 @@ define signext i32 @addimm_mulimm_accept_03(i32 signext %a) {
 define i64 @addimm_mulimm_accept_10(i64 %a) {
 ; CHECK-LABEL: addimm_mulimm_accept_10:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #37
-; CHECK-NEXT:    mov w9, #32888
+; CHECK-NEXT:    mov w8, #37 // =0x25
+; CHECK-NEXT:    mov w9, #32888 // =0x8078
 ; CHECK-NEXT:    movk w9, #17, lsl #16
 ; CHECK-NEXT:    madd x0, x0, x8, x9
 ; CHECK-NEXT:    ret
@@ -65,8 +65,8 @@ define i64 @addimm_mulimm_accept_10(i64 %a) {
 define i64 @addimm_mulimm_accept_11(i64 %a) {
 ; CHECK-LABEL: addimm_mulimm_accept_11:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #37
-; CHECK-NEXT:    mov x9, #-32888
+; CHECK-NEXT:    mov w8, #37 // =0x25
+; CHECK-NEXT:    mov x9, #-32888 // =0xffffffffffff7f88
 ; CHECK-NEXT:    movk x9, #65518, lsl #16
 ; CHECK-NEXT:    madd x0, x0, x8, x9
 ; CHECK-NEXT:    ret
@@ -78,8 +78,8 @@ define i64 @addimm_mulimm_accept_11(i64 %a) {
 define signext i32 @addimm_mulimm_accept_12(i32 signext %a) {
 ; CHECK-LABEL: addimm_mulimm_accept_12:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #37
-; CHECK-NEXT:    mov w9, #32888
+; CHECK-NEXT:    mov w8, #37 // =0x25
+; CHECK-NEXT:    mov w9, #32888 // =0x8078
 ; CHECK-NEXT:    movk w9, #17, lsl #16
 ; CHECK-NEXT:    madd w0, w0, w8, w9
 ; CHECK-NEXT:    ret
@@ -91,8 +91,8 @@ define signext i32 @addimm_mulimm_accept_12(i32 signext %a) {
 define signext i32 @addimm_mulimm_accept_13(i32 signext %a) {
 ; CHECK-LABEL: addimm_mulimm_accept_13:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #37
-; CHECK-NEXT:    mov w9, #32648
+; CHECK-NEXT:    mov w8, #37 // =0x25
+; CHECK-NEXT:    mov w9, #32648 // =0x7f88
 ; CHECK-NEXT:    movk w9, #65518, lsl #16
 ; CHECK-NEXT:    madd w0, w0, w8, w9
 ; CHECK-NEXT:    ret
@@ -104,7 +104,7 @@ define signext i32 @addimm_mulimm_accept_13(i32 signext %a) {
 define i64 @addimm_mulimm_reject_00(i64 %a) {
 ; CHECK-LABEL: addimm_mulimm_reject_00:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #3700
+; CHECK-NEXT:    mov w8, #3700 // =0xe74
 ; CHECK-NEXT:    add x9, x0, #3100
 ; CHECK-NEXT:    mul x0, x9, x8
 ; CHECK-NEXT:    ret
@@ -116,7 +116,7 @@ define i64 @addimm_mulimm_reject_00(i64 %a) {
 define i64 @addimm_mulimm_reject_01(i64 %a) {
 ; CHECK-LABEL: addimm_mulimm_reject_01:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #3700
+; CHECK-NEXT:    mov w8, #3700 // =0xe74
 ; CHECK-NEXT:    sub x9, x0, #3100
 ; CHECK-NEXT:    mul x0, x9, x8
 ; CHECK-NEXT:    ret
@@ -128,7 +128,7 @@ define i64 @addimm_mulimm_reject_01(i64 %a) {
 define signext i32 @addimm_mulimm_reject_02(i32 signext %a) {
 ; CHECK-LABEL: addimm_mulimm_reject_02:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #3700
+; CHECK-NEXT:    mov w8, #3700 // =0xe74
 ; CHECK-NEXT:    add w9, w0, #3100
 ; CHECK-NEXT:    mul w0, w9, w8
 ; CHECK-NEXT:    ret
@@ -140,7 +140,7 @@ define signext i32 @addimm_mulimm_reject_02(i32 signext %a) {
 define signext i32 @addimm_mulimm_reject_03(i32 signext %a) {
 ; CHECK-LABEL: addimm_mulimm_reject_03:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #3700
+; CHECK-NEXT:    mov w8, #3700 // =0xe74
 ; CHECK-NEXT:    sub w9, w0, #3100
 ; CHECK-NEXT:    mul w0, w9, w8
 ; CHECK-NEXT:    ret
@@ -148,3 +148,257 @@ define signext i32 @addimm_mulimm_reject_03(i32 signext %a) {
   %tmp1 = mul i32 %tmp0, 3700
   ret i32 %tmp1
 }
+
+define signext i32 @addmuladd(i32 signext %a) {
+; CHECK-LABEL: addmuladd:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #324 // =0x144
+; CHECK-NEXT:    mov w9, #1300 // =0x514
+; CHECK-NEXT:    madd w0, w0, w8, w9
+; CHECK-NEXT:    ret
+  %tmp0 = add i32 %a, 4
+  %tmp1 = mul i32 %tmp0, 324
+  %tmp2 = add i32 %tmp1, 4
+  ret i32 %tmp2
+}
+
+define signext i32 @addmuladd_multiuse(i32 signext %a) {
+; CHECK-LABEL: addmuladd_multiuse:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #324 // =0x144
+; CHECK-NEXT:    mov w9, #1300 // =0x514
+; CHECK-NEXT:    madd w8, w0, w8, w9
+; CHECK-NEXT:    add w9, w0, #4
+; CHECK-NEXT:    eor w0, w9, w8
+; CHECK-NEXT:    ret
+  %tmp0 = add i32 %a, 4
+  %tmp1 = mul i32 %tmp0, 324
+  %tmp2 = add i32 %tmp1, 4
+  %tmp3 = xor i32 %tmp0, %tmp2
+  ret i32 %tmp3
+}
+
+define signext i32 @addmuladd_multiusemul(i32 signext %a) {
+; CHECK-LABEL: addmuladd_multiusemul:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #324 // =0x144
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    add w9, w8, #1296
+; CHECK-NEXT:    add w8, w8, #1300
+; CHECK-NEXT:    eor w0, w9, w8
+; CHECK-NEXT:    ret
+  %tmp0 = add i32 %a, 4
+  %tmp1 = mul i32 %tmp0, 324
+  %tmp2 = add i32 %tmp1, 4
+  %tmp3 = xor i32 %tmp1, %tmp2
+  ret i32 %tmp3
+}
+
+define signext i32 @addmuladd_multiuse2(i32 signext %a) {
+; CHECK-LABEL: addmuladd_multiuse2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #324 // =0x144
+; CHECK-NEXT:    lsl w9, w0, #2
+; CHECK-NEXT:    mov w10, #1300 // =0x514
+; CHECK-NEXT:    madd w8, w0, w8, w10
+; CHECK-NEXT:    add w9, w9, #20
+; CHECK-NEXT:    eor w0, w8, w9
+; CHECK-NEXT:    ret
+  %tmp0 = add i32 %a, 4
+  %tmp1 = mul i32 %tmp0, 4
+  %tmp2 = add i32 %tmp1, 4
+  %tmp3 = mul i32 %tmp0, 324
+  %tmp4 = add i32 %tmp3, 4
+  %tmp5 = xor i32 %tmp4, %tmp2
+  ret i32 %tmp5
+}
+
+define signext i32 @addaddmuladd(i32 signext %a, i32 %b) {
+; CHECK-LABEL: addaddmuladd:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #324 // =0x144
+; CHECK-NEXT:    madd w8, w0, w8, w1
+; CHECK-NEXT:    add w0, w8, #1300
+; CHECK-NEXT:    ret
+  %tmp0 = add i32 %a, 4
+  %tmp1 = mul i32 %tmp0, 324
+  %tmp2 = add i32 %tmp1, %b
+  %tmp3 = add i32 %tmp2, 4
+  ret i32 %tmp3
+}
+
+define signext i32 @addaddmuladd_multiuse(i32 signext %a, i32 %b) {
+; CHECK-LABEL: addaddmuladd_multiuse:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #324 // =0x144
+; CHECK-NEXT:    add w9, w0, #4
+; CHECK-NEXT:    madd w8, w0, w8, w1
+; CHECK-NEXT:    add w8, w8, #1300
+; CHECK-NEXT:    eor w0, w9, w8
+; CHECK-NEXT:    ret
+  %tmp0 = add i32 %a, 4
+  %tmp1 = mul i32 %tmp0, 324
+  %tmp2 = add i32 %tmp1, %b
+  %tmp3 = add i32 %tmp2, 4
+  %tmp4 = xor i32 %tmp0, %tmp3
+  ret i32 %tmp4
+}
+
+define signext i32 @addaddmuladd_multiuse2(i32 signext %a, i32 %b) {
+; CHECK-LABEL: addaddmuladd_multiuse2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #324 // =0x144
+; CHECK-NEXT:    mov w9, #162 // =0xa2
+; CHECK-NEXT:    madd w8, w0, w8, w1
+; CHECK-NEXT:    madd w9, w0, w9, w1
+; CHECK-NEXT:    add w8, w8, #1300
+; CHECK-NEXT:    add w9, w9, #652
+; CHECK-NEXT:    eor w0, w9, w8
+; CHECK-NEXT:    ret
+  %tmp0 = add i32 %a, 4
+  %tmp1 = mul i32 %tmp0, 324
+  %tmp2 = add i32 %tmp1, %b
+  %tmp3 = add i32 %tmp2, 4
+  %tmp1b = mul i32 %tmp0, 162
+  %tmp2b = add i32 %tmp1b, %b
+  %tmp3b = add i32 %tmp2b, 4
+  %tmp4 = xor i32 %tmp3b, %tmp3
+  ret i32 %tmp4
+}
+
+define <4 x i32> @addmuladd_vec(<4 x i32> %a) {
+; CHECK-LABEL: addmuladd_vec:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #324 // =0x144
+; CHECK-NEXT:    mov w9, #1300 // =0x514
+; CHECK-NEXT:    dup v2.4s, w8
+; CHECK-NEXT:    dup v1.4s, w9
+; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %tmp0 = add <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
+  %tmp1 = mul <4 x i32> %tmp0, <i32 324, i32 324, i32 324, i32 324>
+  %tmp2 = add <4 x i32> %tmp1, <i32 4, i32 4, i32 4, i32 4>
+  ret <4 x i32> %tmp2
+}
+
+define <4 x i32> @addmuladd_vec_multiuse(<4 x i32> %a) {
+; CHECK-LABEL: addmuladd_vec_multiuse:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.4s, #4
+; CHECK-NEXT:    mov w8, #324 // =0x144
+; CHECK-NEXT:    dup v2.4s, w8
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %tmp0 = add <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
+  %tmp1 = mul <4 x i32> %tmp0, <i32 324, i32 324, i32 324, i32 324>
+  %tmp2 = add <4 x i32> %tmp1, <i32 4, i32 4, i32 4, i32 4>
+  %tmp3 = xor <4 x i32> %tmp0, %tmp2
+  ret <4 x i32> %tmp3
+}
+
+define void @addmuladd_gep(ptr %p, i64 %a) {
+; CHECK-LABEL: addmuladd_gep:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #40 // =0x28
+; CHECK-NEXT:    str wzr, [x0, #10]!
+; CHECK-NEXT:    madd x8, x1, x8, x0
+; CHECK-NEXT:    str wzr, [x8, #20]
+; CHECK-NEXT:    ret
+  %q = getelementptr i8, ptr %p, i64 10
+  %r = getelementptr [10 x [10 x i32]], ptr %q, i64 0, i64 %a, i64 5
+  store i32 0, ptr %q
+  store i32 0, ptr %r
+  ret void
+}
+
+define i32 @addmuladd_gep2(ptr %p, i32 %a) {
+; CHECK-LABEL: addmuladd_gep2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #3240 // =0xca8
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    smaddl x8, w1, w8, x0
+; CHECK-NEXT:    ldr w8, [x8, #3260]
+; CHECK-NEXT:    tbnz w8, #31, .LBB22_2
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB22_2: // %then
+; CHECK-NEXT:    sxtw x8, w1
+; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    ret
+  %b = sext i32 %a to i64
+  %c = add nsw i64 %b, 1
+  %d = mul nsw i64 %c, 81
+  %g = getelementptr [10 x [10 x i32]], ptr %p, i64 0, i64 %d, i64 5
+  %l = load i32, ptr %g, align 4
+  %cc = icmp slt i32 %l, 0
+  br i1 %cc, label %then, label %else
+then:
+  store i64 %c, ptr %p
+  ret i32 1
+else:
+  ret i32 0
+}
+
+define signext i32 @addmuladd_multiuse2_nsw(i32 signext %a) {
+; CHECK-LABEL: addmuladd_multiuse2_nsw:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #324 // =0x144
+; CHECK-NEXT:    lsl w9, w0, #2
+; CHECK-NEXT:    mov w10, #1300 // =0x514
+; CHECK-NEXT:    madd w8, w0, w8, w10
+; CHECK-NEXT:    add w9, w9, #20
+; CHECK-NEXT:    eor w0, w8, w9
+; CHECK-NEXT:    ret
+  %tmp0 = add nsw i32 %a, 4
+  %tmp1 = mul nsw i32 %tmp0, 4
+  %tmp2 = add nsw i32 %tmp1, 4
+  %tmp3 = mul nsw i32 %tmp0, 324
+  %tmp4 = add nsw i32 %tmp3, 4
+  %tmp5 = xor i32 %tmp4, %tmp2
+  ret i32 %tmp5
+}
+
+define signext i32 @addmuladd_multiuse2_nuw(i32 signext %a) {
+; CHECK-LABEL: addmuladd_multiuse2_nuw:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #324 // =0x144
+; CHECK-NEXT:    lsl w9, w0, #2
+; CHECK-NEXT:    mov w10, #1300 // =0x514
+; CHECK-NEXT:    madd w8, w0, w8, w10
+; CHECK-NEXT:    add w9, w9, #20
+; CHECK-NEXT:    eor w0, w8, w9
+; CHECK-NEXT:    ret
+  %tmp0 = add nuw i32 %a, 4
+  %tmp1 = mul nuw i32 %tmp0, 4
+  %tmp2 = add nuw i32 %tmp1, 4
+  %tmp3 = mul nuw i32 %tmp0, 324
+  %tmp4 = add nuw i32 %tmp3, 4
+  %tmp5 = xor i32 %tmp4, %tmp2
+  ret i32 %tmp5
+}
+
+define signext i32 @addmuladd_multiuse2_nswnuw(i32 signext %a) {
+; CHECK-LABEL: addmuladd_multiuse2_nswnuw:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #324 // =0x144
+; CHECK-NEXT:    lsl w9, w0, #2
+; CHECK-NEXT:    mov w10, #1300 // =0x514
+; CHECK-NEXT:    madd w8, w0, w8, w10
+; CHECK-NEXT:    add w9, w9, #20
+; CHECK-NEXT:    eor w0, w8, w9
+; CHECK-NEXT:    ret
+  %tmp0 = add nsw nuw i32 %a, 4
+  %tmp1 = mul nsw nuw i32 %tmp0, 4
+  %tmp2 = add nsw nuw i32 %tmp1, 4
+  %tmp3 = mul nsw nuw i32 %tmp0, 324
+  %tmp4 = add nsw nuw i32 %tmp3, 4
+  %tmp5 = xor i32 %tmp4, %tmp2
+  ret i32 %tmp5
+}
+
diff --git a/llvm/test/CodeGen/AArch64/and-mask-removal.ll b/llvm/test/CodeGen/AArch64/and-mask-removal.ll
index a8a59f159126..493d503de2cc 100644
--- a/llvm/test/CodeGen/AArch64/and-mask-removal.ll
+++ b/llvm/test/CodeGen/AArch64/and-mask-removal.ll
@@ -526,4 +526,26 @@ define i64 @pr58109b(i8 signext %0, i64 %a, i64 %b) {
   ret i64 %4
 }
 
+define i64 @test_2_selects(i8 zeroext %a) {
+; CHECK-LABEL: test_2_selects:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    add w9, w0, #24
+; CHECK-NEXT:    mov w8, #131
+; CHECK-NEXT:    and w9, w9, #0xff
+; CHECK-NEXT:    cmp w9, #81
+; CHECK-NEXT:    mov w9, #57
+; CHECK-NEXT:    csel x8, x8, xzr, lo
+; CHECK-NEXT:    csel x9, xzr, x9, eq
+; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ret
+  %1 = add i8 %a, 24
+  %2 = zext i8 %1 to i64
+  %3 = icmp ult i8 %1, 81
+  %4 = select i1 %3, i64 131, i64 0
+  %5 = icmp eq i8 %1, 81
+  %6 = select i1 %5, i64 0, i64 57
+  %7 = add i64 %4, %6
+  ret i64 %7
+}
+
 declare i8 @llvm.usub.sat.i8(i8, i8) #0
diff --git a/llvm/test/CodeGen/AArch64/arm64-uzp.ll b/llvm/test/CodeGen/AArch64/arm64-uzp.ll
index 49a51d96fbc8..bd6bf1bf1578 100644
--- a/llvm/test/CodeGen/AArch64/arm64-uzp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-uzp.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s
 
 define <8 x i8> @vuzpi8(<8 x i8> %A, <8 x i8> %B) nounwind {
 ; CHECK-LABEL: vuzpi8:
diff --git a/llvm/test/CodeGen/AArch64/arm64-zip.ll b/llvm/test/CodeGen/AArch64/arm64-zip.ll
index 4c771cbd2966..9955b253f563 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zip.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zip.ll
@@ -1,5 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:       warning: Instruction selection used fallback path for shuffle_zip1
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for shuffle_zip2
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for shuffle_zip3
 
 define <8 x i8> @vzipi8(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: vzipi8:
@@ -223,12 +228,20 @@ define <16 x i8> @combine_v16i8(<8 x i8> %0, <8 x i8> %1) {
 }
 
 define <16 x i8> @combine2_v16i8(<8 x i8> %0, <8 x i8> %1) {
-; CHECK-LABEL: combine2_v16i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    zip1.16b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: combine2_v16i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    zip1.16b v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine2_v16i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    zip1.8b v2, v0, v1
+; CHECK-GI-NEXT:    zip2.8b v0, v0, v1
+; CHECK-GI-NEXT:    mov.d v2[1], v0[0]
+; CHECK-GI-NEXT:    mov.16b v0, v2
+; CHECK-GI-NEXT:    ret
   %3 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   %4 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   %5 = shufflevector <8 x i8> %3, <8 x i8> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -247,12 +260,20 @@ define <8 x i16> @combine_v8i16(<4 x i16> %0, <4 x i16> %1) {
 }
 
 define <8 x i16> @combine2_v8i16(<4 x i16> %0, <4 x i16> %1) {
-; CHECK-LABEL: combine2_v8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    zip1.8h v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: combine2_v8i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    zip1.8h v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine2_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    zip1.4h v2, v0, v1
+; CHECK-GI-NEXT:    zip2.4h v0, v0, v1
+; CHECK-GI-NEXT:    mov.d v2[1], v0[0]
+; CHECK-GI-NEXT:    mov.16b v0, v2
+; CHECK-GI-NEXT:    ret
   %3 = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %4 = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %5 = shufflevector <4 x i16> %3, <4 x i16> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -271,12 +292,20 @@ define <4 x i32> @combine_v4i32(<2 x i32> %0, <2 x i32> %1) {
 }
 
 define <4 x i32> @combine2_v4i32(<2 x i32> %0, <2 x i32> %1) {
-; CHECK-LABEL: combine2_v4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    zip1.4s v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: combine2_v4i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    zip1.4s v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine2_v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    zip1.2s v2, v0, v1
+; CHECK-GI-NEXT:    zip2.2s v0, v0, v1
+; CHECK-GI-NEXT:    mov.d v2[1], v0[0]
+; CHECK-GI-NEXT:    mov.16b v0, v2
+; CHECK-GI-NEXT:    ret
   %3 = shufflevector <2 x i32> %0, <2 x i32> %1, <2 x i32> <i32 0, i32 2>
   %4 = shufflevector <2 x i32> %0, <2 x i32> %1, <2 x i32> <i32 1, i32 3>
   %5 = shufflevector <2 x i32> %3, <2 x i32> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -295,12 +324,20 @@ define <16 x i8> @combine_v16i8_undef(<8 x i8> %0, <8 x i8> %1) {
 }
 
 define <16 x i8> @combine2_v16i8_undef(<8 x i8> %0, <8 x i8> %1) {
-; CHECK-LABEL: combine2_v16i8_undef:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    zip1.16b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: combine2_v16i8_undef:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    zip1.16b v0, v0, v1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine2_v16i8_undef:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    zip1.8b v2, v0, v1
+; CHECK-GI-NEXT:    zip2.8b v0, v0, v1
+; CHECK-GI-NEXT:    mov.d v2[1], v0[0]
+; CHECK-GI-NEXT:    mov.16b v0, v2
+; CHECK-GI-NEXT:    ret
   %3 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 undef, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   %4 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   %5 = shufflevector <8 x i8> %3, <8 x i8> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -320,14 +357,23 @@ define <8 x i16> @combine_v8i16_undef(<4 x i16> %0, <4 x i16> %1) {
 
 ; FIXME: This could be zip1 too, 8,0,9,1... pattern is handled
 define <16 x i8> @combine_v8i16_8first(<8 x i8> %0, <8 x i8> %1) {
-; CHECK-LABEL: combine_v8i16_8first:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1_q2
-; CHECK-NEXT:    adrp x8, .LCPI25_0
-; CHECK-NEXT:    fmov d2, d0
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI25_0]
-; CHECK-NEXT:    tbl.16b v0, { v1, v2 }, v3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: combine_v8i16_8first:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1_q2
+; CHECK-SD-NEXT:    adrp x8, .LCPI25_0
+; CHECK-SD-NEXT:    fmov d2, d0
+; CHECK-SD-NEXT:    ldr q3, [x8, :lo12:.LCPI25_0]
+; CHECK-SD-NEXT:    tbl.16b v0, { v1, v2 }, v3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine_v8i16_8first:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q31_q0
+; CHECK-GI-NEXT:    adrp x8, .LCPI25_0
+; CHECK-GI-NEXT:    fmov d31, d1
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI25_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v31, v0 }, v2
+; CHECK-GI-NEXT:    ret
   %3 = shufflevector <8 x i8> %1, <8 x i8> %0, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
   ret <16 x i8> %3
 }
@@ -335,14 +381,23 @@ define <16 x i8> @combine_v8i16_8first(<8 x i8> %0, <8 x i8> %1) {
 
 ; FIXME: This could be zip1 too, 8,0,9,1... pattern is handled
 define <16 x i8> @combine_v8i16_8firstundef(<8 x i8> %0, <8 x i8> %1) {
-; CHECK-LABEL: combine_v8i16_8firstundef:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1_q2
-; CHECK-NEXT:    adrp x8, .LCPI26_0
-; CHECK-NEXT:    fmov d2, d0
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI26_0]
-; CHECK-NEXT:    tbl.16b v0, { v1, v2 }, v3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: combine_v8i16_8firstundef:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1_q2
+; CHECK-SD-NEXT:    adrp x8, .LCPI26_0
+; CHECK-SD-NEXT:    fmov d2, d0
+; CHECK-SD-NEXT:    ldr q3, [x8, :lo12:.LCPI26_0]
+; CHECK-SD-NEXT:    tbl.16b v0, { v1, v2 }, v3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: combine_v8i16_8firstundef:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q31_q0
+; CHECK-GI-NEXT:    adrp x8, .LCPI26_0
+; CHECK-GI-NEXT:    fmov d31, d1
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI26_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v31, v0 }, v2
+; CHECK-GI-NEXT:    ret
   %3 = shufflevector <8 x i8> %1, <8 x i8> %0, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 undef>
   ret <16 x i8> %3
 }
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
index dae8d9f89e99..c2fc959d8e10 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
@@ -7,18 +7,18 @@ target triple = "aarch64"
 define <vscale x 4 x half> @complex_add_v4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
 ; CHECK-LABEL: complex_add_v4f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpkhi z2.d, z0.s
+; CHECK-NEXT:    uzp1 z2.s, z0.s, z0.s
+; CHECK-NEXT:    uzp2 z0.s, z0.s, z0.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z0.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT:    uunpklo z2.d, z2.s
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    uunpkhi z3.d, z1.s
+; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uzp1 z4.d, z0.d, z2.d
-; CHECK-NEXT:    uzp2 z0.d, z0.d, z2.d
-; CHECK-NEXT:    uzp2 z2.d, z1.d, z3.d
-; CHECK-NEXT:    uzp1 z1.d, z1.d, z3.d
 ; CHECK-NEXT:    fsubr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z4.h
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
 ; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
index c09ec616b015..b42d484ea74c 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
@@ -7,23 +7,22 @@ target triple = "aarch64"
 define <vscale x 4 x half> @complex_mul_v4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
 ; CHECK-LABEL: complex_mul_v4f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpkhi z2.d, z0.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z0.s
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    uunpkhi z3.d, z1.s
+; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z0.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uzp2 z4.d, z0.d, z2.d
-; CHECK-NEXT:    uzp1 z0.d, z0.d, z2.d
-; CHECK-NEXT:    uzp2 z2.d, z1.d, z3.d
-; CHECK-NEXT:    uzp1 z1.d, z1.d, z3.d
-; CHECK-NEXT:    movprfx z5, z2
-; CHECK-NEXT:    fmul z5.h, p0/m, z5.h, z0.h
-; CHECK-NEXT:    fmul z2.h, p0/m, z2.h, z4.h
-; CHECK-NEXT:    movprfx z3, z5
-; CHECK-NEXT:    fmla z3.h, p0/m, z1.h, z4.h
-; CHECK-NEXT:    fnmsb z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    zip2 z1.d, z0.d, z3.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT:    movprfx z4, z3
+; CHECK-NEXT:    fmul z4.h, p0/m, z4.h, z0.h
+; CHECK-NEXT:    fmul z3.h, p0/m, z3.h, z2.h
+; CHECK-NEXT:    fmad z2.h, p0/m, z1.h, z4.h
+; CHECK-NEXT:    fnmsb z0.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    zip2 z1.d, z0.d, z2.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z2.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
index 504222e0036e..0481d997d24f 100644
--- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
@@ -938,21 +938,10 @@ entry:
 }
 
 define i32 @extract_v4i32_shuffle_const(<4 x i32> %a, <4 x i32> %b, i32 %c) {
-; CHECK-SD-LABEL: extract_v4i32_shuffle_const:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmov w0, s1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: extract_v4i32_shuffle_const:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI36_0]
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    mov s0, v0.s[2]
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: extract_v4i32_shuffle_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    ret
 entry:
   %vector = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 3>
   %d = extractelement <4 x i32> %vector, i32 2
diff --git a/llvm/test/CodeGen/AArch64/fcvt_combine.ll b/llvm/test/CodeGen/AArch64/fcvt_combine.ll
index 29170aab9656..62669a6d99ea 100644
--- a/llvm/test/CodeGen/AArch64/fcvt_combine.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt_combine.ll
@@ -345,11 +345,8 @@ define <2 x i64> @test6_sat(<2 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov v1.2s, #16.00000000
 ; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    mov s1, v0.s[1]
-; CHECK-NEXT:    fcvtzs x8, s0
-; CHECK-NEXT:    fcvtzs x9, s1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x9
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    ret
   %mul.i = fmul <2 x float> %f, <float 16.000000e+00, float 16.000000e+00>
   %vcvt.i = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> %mul.i)
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index c58db8290c87..5bd680ed4893 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -30,21 +30,13 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
 }
 
 define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec) {
-; CHECK-SD-LABEL: vector_deinterleave_v4f16_v8f16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    uzp1 v2.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp2 v1.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    fmov d0, d2
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: vector_deinterleave_v4f16_v8f16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    uzp1 v2.8h, v0.8h, v0.8h
-; CHECK-GI-NEXT:    uzp2 v1.8h, v0.8h, v0.8h
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-GI-NEXT:    fmov d0, d2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: vector_deinterleave_v4f16_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v0.8h
+; CHECK-NEXT:    uzp2 v1.8h, v0.8h, v0.8h
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT:    fmov d0, d2
+; CHECK-NEXT:    ret
   %retval = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec)
   ret {<4 x half>, <4 x half>}   %retval
 }
diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
index 2ea581359af6..4e8bfcd9d751 100644
--- a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
@@ -436,12 +436,8 @@ entry:
 define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-LABEL: stest_f32i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov s1, v0.s[1]
-; CHECK-NEXT:    fcvtzs x8, s0
-; CHECK-NEXT:    fcvtzs x9, s1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x9
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    ret
 entry:
   %conv = fptosi <2 x float> %x to <2 x i128>
@@ -1056,12 +1052,8 @@ entry:
 define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
 ; CHECK-LABEL: stest_f32i64_mm:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov s1, v0.s[1]
-; CHECK-NEXT:    fcvtzs x8, s0
-; CHECK-NEXT:    fcvtzs x9, s1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x9
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    ret
 entry:
   %conv = fptosi <2 x float> %x to <2 x i128>
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index c45885a38f15..d620a8851ee4 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -793,12 +793,8 @@ define <2 x i50> @test_signed_v2f32_v2i50(<2 x float> %f) {
 define <2 x i64> @test_signed_v2f32_v2i64(<2 x float> %f) {
 ; CHECK-LABEL: test_signed_v2f32_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov s1, v0.s[1]
-; CHECK-NEXT:    fcvtzs x8, s0
-; CHECK-NEXT:    fcvtzs x9, s1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x9
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    ret
     %x = call <2 x i64> @llvm.fptosi.sat.v2f32.v2i64(<2 x float> %f)
     ret <2 x i64> %x
@@ -1060,17 +1056,10 @@ define <4 x i50> @test_signed_v4f32_v4i50(<4 x float> %f) {
 define <4 x i64> @test_signed_v4f32_v4i64(<4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    mov s3, v0.s[1]
-; CHECK-NEXT:    fcvtzs x9, s0
-; CHECK-NEXT:    mov s2, v1.s[1]
-; CHECK-NEXT:    fcvtzs x8, s1
-; CHECK-NEXT:    fcvtzs x11, s3
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    fcvtzs x10, s2
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    mov v0.d[1], x11
-; CHECK-NEXT:    mov v1.d[1], x10
+; CHECK-NEXT:    fcvtl2 v1.2d, v0.4s
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    ret
     %x = call <4 x i64> @llvm.fptosi.sat.v4f32.v4i64(<4 x float> %f)
     ret <4 x i64> %x
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index c94db3484994..16e04070b654 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -707,12 +707,8 @@ define <2 x i50> @test_unsigned_v2f32_v2i50(<2 x float> %f) {
 define <2 x i64> @test_unsigned_v2f32_v2i64(<2 x float> %f) {
 ; CHECK-LABEL: test_unsigned_v2f32_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov s1, v0.s[1]
-; CHECK-NEXT:    fcvtzu x8, s0
-; CHECK-NEXT:    fcvtzu x9, s1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x9
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
 ; CHECK-NEXT:    ret
     %x = call <2 x i64> @llvm.fptoui.sat.v2f32.v2i64(<2 x float> %f)
     ret <2 x i64> %x
@@ -927,17 +923,10 @@ define <4 x i50> @test_unsigned_v4f32_v4i50(<4 x float> %f) {
 define <4 x i64> @test_unsigned_v4f32_v4i64(<4 x float> %f) {
 ; CHECK-LABEL: test_unsigned_v4f32_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    mov s3, v0.s[1]
-; CHECK-NEXT:    fcvtzu x9, s0
-; CHECK-NEXT:    mov s2, v1.s[1]
-; CHECK-NEXT:    fcvtzu x8, s1
-; CHECK-NEXT:    fcvtzu x11, s3
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    fcvtzu x10, s2
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    mov v0.d[1], x11
-; CHECK-NEXT:    mov v1.d[1], x10
+; CHECK-NEXT:    fcvtl2 v1.2d, v0.4s
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzu v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
 ; CHECK-NEXT:    ret
     %x = call <4 x i64> @llvm.fptoui.sat.v4f32.v4i64(<4 x float> %f)
     ret <4 x i64> %x
diff --git a/llvm/test/CodeGen/AArch64/frem-power2.ll b/llvm/test/CodeGen/AArch64/frem-power2.ll
new file mode 100644
index 000000000000..402e03c5e265
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/frem-power2.ll
@@ -0,0 +1,536 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+define float @frem2(float %x) {
+; CHECK-LABEL: frem2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s1, #2.00000000
+; CHECK-NEXT:    b fmodf
+entry:
+  %fmod = frem float %x, 2.0
+  ret float %fmod
+}
+
+define float @frem2_nsz(float %x) {
+; CHECK-SD-LABEL: frem2_nsz:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s1, #2.00000000
+; CHECK-SD-NEXT:    fdiv s2, s0, s1
+; CHECK-SD-NEXT:    frintz s2, s2
+; CHECK-SD-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem2_nsz:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s1, #2.00000000
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %fmod = frem nsz float %x, 2.0
+  ret float %fmod
+}
+
+define float @frem2_fast(float %x) {
+; CHECK-SD-LABEL: frem2_fast:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s1, #0.50000000
+; CHECK-SD-NEXT:    fmov s2, #-2.00000000
+; CHECK-SD-NEXT:    fmul s1, s0, s1
+; CHECK-SD-NEXT:    frintz s1, s1
+; CHECK-SD-NEXT:    fmadd s0, s1, s2, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem2_fast:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s1, #2.00000000
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %fmod = frem fast float %x, 2.0
+  ret float %fmod
+}
+
+define float @frem2_abs(float %x) {
+; CHECK-SD-LABEL: frem2_abs:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fabs s0, s0
+; CHECK-SD-NEXT:    fmov s1, #2.00000000
+; CHECK-SD-NEXT:    fdiv s2, s0, s1
+; CHECK-SD-NEXT:    frintz s2, s2
+; CHECK-SD-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem2_abs:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fabs s0, s0
+; CHECK-GI-NEXT:    fmov s1, #2.00000000
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %a = tail call float @llvm.fabs.f32(float %x)
+  %fmod = frem float %a, 2.0
+  ret float %fmod
+}
+
+define half @hrem2_nsz(half %x) {
+; CHECK-SD-LABEL: hrem2_nsz:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov h1, #2.00000000
+; CHECK-SD-NEXT:    fmov h2, #-2.00000000
+; CHECK-SD-NEXT:    fdiv h1, h0, h1
+; CHECK-SD-NEXT:    frintz h1, h1
+; CHECK-SD-NEXT:    fmadd h0, h1, h2, h0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: hrem2_nsz:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    fmov h1, #2.00000000
+; CHECK-GI-NEXT:    fcvt s0, h0
+; CHECK-GI-NEXT:    fcvt s1, h1
+; CHECK-GI-NEXT:    bl fmodf
+; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %fmod = frem nsz half %x, 2.0
+  ret half %fmod
+}
+
+define double @drem2_nsz(double %x) {
+; CHECK-SD-LABEL: drem2_nsz:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov d1, #2.00000000
+; CHECK-SD-NEXT:    fdiv d2, d0, d1
+; CHECK-SD-NEXT:    frintz d2, d2
+; CHECK-SD-NEXT:    fmsub d0, d2, d1, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: drem2_nsz:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov d1, #2.00000000
+; CHECK-GI-NEXT:    b fmod
+entry:
+  %fmod = frem nsz double %x, 2.0
+  ret double %fmod
+}
+
+define float @frem3_nsz(float %x) {
+; CHECK-LABEL: frem3_nsz:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s1, #3.00000000
+; CHECK-NEXT:    b fmodf
+entry:
+  %fmod = frem nsz float %x, 3.0
+  ret float %fmod
+}
+
+define float @frem05_nsz(float %x) {
+; CHECK-LABEL: frem05_nsz:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s1, #0.50000000
+; CHECK-NEXT:    b fmodf
+entry:
+  %fmod = frem nsz float %x, 0.5
+  ret float %fmod
+}
+
+define float @frem1_nsz(float %x) {
+; CHECK-SD-LABEL: frem1_nsz:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    frintz s1, s0
+; CHECK-SD-NEXT:    fsub s0, s0, s1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem1_nsz:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s1, #1.00000000
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %fmod = frem nsz float %x, 1.0
+  ret float %fmod
+}
+
+define float @frem0_nsz(float %x) {
+; CHECK-LABEL: frem0_nsz:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d1, #0000000000000000
+; CHECK-NEXT:    b fmodf
+entry:
+  %fmod = frem nsz float %x, 0.0
+  ret float %fmod
+}
+
+define float @fremm2_nsz(float %x) {
+; CHECK-SD-LABEL: fremm2_nsz:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s1, #-2.00000000
+; CHECK-SD-NEXT:    fdiv s2, s0, s1
+; CHECK-SD-NEXT:    frintz s2, s2
+; CHECK-SD-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fremm2_nsz:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s1, #-2.00000000
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %fmod = frem nsz float %x, -2.0
+  ret float %fmod
+}
+
+define float @frem4_abs(float %x) {
+; CHECK-SD-LABEL: frem4_abs:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fabs s0, s0
+; CHECK-SD-NEXT:    fmov s1, #4.00000000
+; CHECK-SD-NEXT:    fdiv s2, s0, s1
+; CHECK-SD-NEXT:    frintz s2, s2
+; CHECK-SD-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem4_abs:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fabs s0, s0
+; CHECK-GI-NEXT:    fmov s1, #4.00000000
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %a = tail call float @llvm.fabs.f32(float %x)
+  %fmod = frem float %a, 4.0
+  ret float %fmod
+}
+
+define float @frem16_abs(float %x) {
+; CHECK-SD-LABEL: frem16_abs:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fabs s0, s0
+; CHECK-SD-NEXT:    fmov s1, #16.00000000
+; CHECK-SD-NEXT:    fdiv s2, s0, s1
+; CHECK-SD-NEXT:    frintz s2, s2
+; CHECK-SD-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem16_abs:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fabs s0, s0
+; CHECK-GI-NEXT:    fmov s1, #16.00000000
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %a = tail call float @llvm.fabs.f32(float %x)
+  %fmod = frem float %a, 16.0
+  ret float %fmod
+}
+
+define float @frem4294967296_abs(float %x) {
+; CHECK-SD-LABEL: frem4294967296_abs:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fabs s0, s0
+; CHECK-SD-NEXT:    mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT:    fmov s1, w8
+; CHECK-SD-NEXT:    fdiv s2, s0, s1
+; CHECK-SD-NEXT:    frintz s2, s2
+; CHECK-SD-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem4294967296_abs:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fabs s0, s0
+; CHECK-GI-NEXT:    mov w8, #1333788672 // =0x4f800000
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %a = tail call float @llvm.fabs.f32(float %x)
+  %fmod = frem float %a, 4294967296.0
+  ret float %fmod
+}
+
+define float @frem1152921504606846976_abs(float %x) {
+; CHECK-SD-LABEL: frem1152921504606846976_abs:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fabs s0, s0
+; CHECK-SD-NEXT:    mov w8, #1568669696 // =0x5d800000
+; CHECK-SD-NEXT:    fmov s1, w8
+; CHECK-SD-NEXT:    fdiv s2, s0, s1
+; CHECK-SD-NEXT:    frintz s2, s2
+; CHECK-SD-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem1152921504606846976_abs:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fabs s0, s0
+; CHECK-GI-NEXT:    mov w8, #1568669696 // =0x5d800000
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %a = tail call float @llvm.fabs.f32(float %x)
+  %fmod = frem float %a, 1152921504606846976.0
+  ret float %fmod
+}
+
+define float @frem4611686018427387904_abs(float %x) {
+; CHECK-SD-LABEL: frem4611686018427387904_abs:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fabs s0, s0
+; CHECK-SD-NEXT:    mov w8, #1585446912 // =0x5e800000
+; CHECK-SD-NEXT:    fmov s1, w8
+; CHECK-SD-NEXT:    fdiv s2, s0, s1
+; CHECK-SD-NEXT:    frintz s2, s2
+; CHECK-SD-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem4611686018427387904_abs:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fabs s0, s0
+; CHECK-GI-NEXT:    mov w8, #1585446912 // =0x5e800000
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %a = tail call float @llvm.fabs.f32(float %x)
+  %fmod = frem float %a, 4611686018427387904.0
+  ret float %fmod
+}
+
+define float @frem9223372036854775808_abs(float %x) {
+; CHECK-SD-LABEL: frem9223372036854775808_abs:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.2s, #95, lsl #24
+; CHECK-SD-NEXT:    fabs s0, s0
+; CHECK-SD-NEXT:    fdiv s2, s0, s1
+; CHECK-SD-NEXT:    frintz s2, s2
+; CHECK-SD-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem9223372036854775808_abs:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fabs s0, s0
+; CHECK-GI-NEXT:    movi v1.2s, #95, lsl #24
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %a = tail call float @llvm.fabs.f32(float %x)
+  %fmod = frem float %a, 9223372036854775808.0
+  ret float %fmod
+}
+
+define <4 x float> @frem2_nsz_vec(<4 x float> %x) {
+; CHECK-SD-LABEL: frem2_nsz_vec:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.4s, #64, lsl #24
+; CHECK-SD-NEXT:    fdiv v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    frintz v2.4s, v2.4s
+; CHECK-SD-NEXT:    fmls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem2_nsz_vec:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #80
+; CHECK-GI-NEXT:    str d10, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #56] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #72] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT:    .cfi_offset w30, -8
+; CHECK-GI-NEXT:    .cfi_offset b8, -16
+; CHECK-GI-NEXT:    .cfi_offset b9, -24
+; CHECK-GI-NEXT:    .cfi_offset b10, -32
+; CHECK-GI-NEXT:    fmov s1, #2.00000000
+; CHECK-GI-NEXT:    mov s8, v0.s[1]
+; CHECK-GI-NEXT:    mov s9, v0.s[2]
+; CHECK-GI-NEXT:    mov s10, v0.s[3]
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-GI-NEXT:    bl fmodf
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    fmov s1, #2.00000000
+; CHECK-GI-NEXT:    fmov s0, s8
+; CHECK-GI-NEXT:    bl fmodf
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    fmov s1, #2.00000000
+; CHECK-GI-NEXT:    fmov s0, s9
+; CHECK-GI-NEXT:    bl fmodf
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    fmov s1, #2.00000000
+; CHECK-GI-NEXT:    fmov s0, s10
+; CHECK-GI-NEXT:    bl fmodf
+; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    ldr x30, [sp, #72] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #56] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr d10, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.s[2], v2.s[0]
+; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    add sp, sp, #80
+; CHECK-GI-NEXT:    ret
+entry:
+  %fmod = frem nsz <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
+  ret <4 x float> %fmod
+}
+
+define <4 x float> @frem1152921504606846976_absv(<4 x float> %x) {
+; CHECK-SD-LABEL: frem1152921504606846976_absv:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #1568669696 // =0x5d800000
+; CHECK-SD-NEXT:    fabs v0.4s, v0.4s
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    fdiv v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    frintz v2.4s, v2.4s
+; CHECK-SD-NEXT:    fmls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem1152921504606846976_absv:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #96
+; CHECK-GI-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    .cfi_offset b10, -40
+; CHECK-GI-NEXT:    .cfi_offset b11, -48
+; CHECK-GI-NEXT:    mov w8, #1568669696 // =0x5d800000
+; CHECK-GI-NEXT:    fabs v0.4s, v0.4s
+; CHECK-GI-NEXT:    fmov s11, w8
+; CHECK-GI-NEXT:    fmov s1, s11
+; CHECK-GI-NEXT:    mov s8, v0.s[1]
+; CHECK-GI-NEXT:    mov s9, v0.s[2]
+; CHECK-GI-NEXT:    mov s10, v0.s[3]
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-GI-NEXT:    bl fmodf
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    fmov s1, s11
+; CHECK-GI-NEXT:    fmov s0, s8
+; CHECK-GI-NEXT:    bl fmodf
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    fmov s1, s11
+; CHECK-GI-NEXT:    fmov s0, s9
+; CHECK-GI-NEXT:    bl fmodf
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    fmov s1, s11
+; CHECK-GI-NEXT:    fmov s0, s10
+; CHECK-GI-NEXT:    bl fmodf
+; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.s[2], v2.s[0]
+; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    add sp, sp, #96
+; CHECK-GI-NEXT:    ret
+entry:
+  %a = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
+  %fmod = frem <4 x float> %a, <float 1152921504606846976.0, float 1152921504606846976.0, float 1152921504606846976.0, float 1152921504606846976.0>
+  ret <4 x float> %fmod
+}
+
+define float @frem2_nsz_sitofp(float %x, i32 %sa) {
+; CHECK-SD-LABEL: frem2_nsz_sitofp:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #1 // =0x1
+; CHECK-SD-NEXT:    lsl w8, w8, w0
+; CHECK-SD-NEXT:    scvtf s1, w8
+; CHECK-SD-NEXT:    fdiv s2, s0, s1
+; CHECK-SD-NEXT:    frintz s2, s2
+; CHECK-SD-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem2_nsz_sitofp:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    lsl w8, w8, w0
+; CHECK-GI-NEXT:    scvtf s1, w8
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %s = shl i32 1, %sa
+  %y = sitofp i32 %s to float
+  %fmod = frem nsz float %x, %y
+  ret float %fmod
+}
+
+define float @frem2_nsz_uitofp(float %x, i32 %sa) {
+; CHECK-SD-LABEL: frem2_nsz_uitofp:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #1 // =0x1
+; CHECK-SD-NEXT:    lsl w8, w8, w0
+; CHECK-SD-NEXT:    ucvtf s1, w8
+; CHECK-SD-NEXT:    fdiv s2, s0, s1
+; CHECK-SD-NEXT:    frintz s2, s2
+; CHECK-SD-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem2_nsz_uitofp:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    lsl w8, w8, w0
+; CHECK-GI-NEXT:    ucvtf s1, w8
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %s = shl i32 1, %sa
+  %y = uitofp i32 %s to float
+  %fmod = frem nsz float %x, %y
+  ret float %fmod
+}
+
+define float @frem2_const_sitofp(float %x, i32 %sa) {
+; CHECK-SD-LABEL: frem2_const_sitofp:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #1 // =0x1
+; CHECK-SD-NEXT:    fmov s1, #12.50000000
+; CHECK-SD-NEXT:    lsl w8, w8, w0
+; CHECK-SD-NEXT:    scvtf s0, w8
+; CHECK-SD-NEXT:    fdiv s2, s1, s0
+; CHECK-SD-NEXT:    frintz s2, s2
+; CHECK-SD-NEXT:    fmsub s0, s2, s0, s1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem2_const_sitofp:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    and w9, w0, #0x1f
+; CHECK-GI-NEXT:    fmov s0, #12.50000000
+; CHECK-GI-NEXT:    lsl w8, w8, w9
+; CHECK-GI-NEXT:    scvtf s1, w8
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %sa2 = and i32 %sa, 31
+  %s = shl i32 1, %sa2
+  %y = sitofp i32 %s to float
+  %fmod = frem float 12.50, %y
+  ret float %fmod
+}
+
+define float @frem2_constneg_sitofp(float %x, i32 %sa) {
+; CHECK-SD-LABEL: frem2_constneg_sitofp:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #1 // =0x1
+; CHECK-SD-NEXT:    fmov s0, #-12.50000000
+; CHECK-SD-NEXT:    lsl w8, w8, w0
+; CHECK-SD-NEXT:    scvtf s1, w8
+; CHECK-SD-NEXT:    b fmodf
+;
+; CHECK-GI-LABEL: frem2_constneg_sitofp:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    and w9, w0, #0x1f
+; CHECK-GI-NEXT:    fmov s0, #-12.50000000
+; CHECK-GI-NEXT:    lsl w8, w8, w9
+; CHECK-GI-NEXT:    scvtf s1, w8
+; CHECK-GI-NEXT:    b fmodf
+entry:
+  %sa2 = and i32 %sa, 31
+  %s = shl i32 1, %sa2
+  %y = sitofp i32 %s to float
+  %fmod = frem float -12.50, %y
+  ret float %fmod
+}
diff --git a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
new file mode 100644
index 000000000000..2d84a69f3144
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mattr=+sve    < %s | FileCheck %s -check-prefix CHECK-SVE
+; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1
+target triple = "aarch64-linux"
+
+; Test combining of getActiveLaneMask with a pair of extract_vector operations.
+
+define void @test_2x8bit_mask_with_32bit_index_and_trip_count(i32 %i, i32 %n) #0 {
+; CHECK-SVE-LABEL: test_2x8bit_mask_with_32bit_index_and_trip_count:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    whilelo p1.b, w0, w1
+; CHECK-SVE-NEXT:    punpklo p0.h, p1.b
+; CHECK-SVE-NEXT:    punpkhi p1.h, p1.b
+; CHECK-SVE-NEXT:    b use
+;
+; CHECK-SVE2p1-LABEL: test_2x8bit_mask_with_32bit_index_and_trip_count:
+; CHECK-SVE2p1:       // %bb.0:
+; CHECK-SVE2p1-NEXT:    mov w8, w1
+; CHECK-SVE2p1-NEXT:    mov w9, w0
+; CHECK-SVE2p1-NEXT:    whilelo { p0.h, p1.h }, x9, x8
+; CHECK-SVE2p1-NEXT:    b use
+    %r = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 %i, i32 %n)
+    %v0 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 0)
+    %v1 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 8)
+    tail call void @use(<vscale x 8 x i1> %v0, <vscale x 8 x i1> %v1)
+    ret void
+}
+
+define void @test_2x8bit_mask_with_64bit_index_and_trip_count(i64 %i, i64 %n) #0 {
+; CHECK-SVE-LABEL: test_2x8bit_mask_with_64bit_index_and_trip_count:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    whilelo p1.b, x0, x1
+; CHECK-SVE-NEXT:    punpklo p0.h, p1.b
+; CHECK-SVE-NEXT:    punpkhi p1.h, p1.b
+; CHECK-SVE-NEXT:    b use
+;
+; CHECK-SVE2p1-LABEL: test_2x8bit_mask_with_64bit_index_and_trip_count:
+; CHECK-SVE2p1:       // %bb.0:
+; CHECK-SVE2p1-NEXT:    whilelo { p0.h, p1.h }, x0, x1
+; CHECK-SVE2p1-NEXT:    b use
+    %r = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %i, i64 %n)
+    %v0 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 0)
+    %v1 = call <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1.i64(<vscale x 16 x i1> %r, i64 8)
+    tail call void @use(<vscale x 8 x i1> %v0, <vscale x 8 x i1> %v1)
+    ret void
+}
+
+define void @test_edge_case_2x1bit_mask(i64 %i, i64 %n) #0 {
+; CHECK-SVE-LABEL: test_edge_case_2x1bit_mask:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    whilelo p1.d, x0, x1
+; CHECK-SVE-NEXT:    punpklo p0.h, p1.b
+; CHECK-SVE-NEXT:    punpkhi p1.h, p1.b
+; CHECK-SVE-NEXT:    b use
+;
+; CHECK-SVE2p1-LABEL: test_edge_case_2x1bit_mask:
+; CHECK-SVE2p1:       // %bb.0:
+; CHECK-SVE2p1-NEXT:    whilelo p1.d, x0, x1
+; CHECK-SVE2p1-NEXT:    punpklo p0.h, p1.b
+; CHECK-SVE2p1-NEXT:    punpkhi p1.h, p1.b
+; CHECK-SVE2p1-NEXT:    b use
+    %r = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %i, i64 %n)
+    %v0 = call <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv2i1.i64(<vscale x 2 x i1> %r, i64 0)
+    %v1 = call <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv2i1.i64(<vscale x 2 x i1> %r, i64 1)
+    tail call void @use(<vscale x 1 x i1> %v0, <vscale x 1 x i1> %v1)
+    ret void
+}
+
+define void @test_boring_case_2x2bit_mask(i64 %i, i64 %n) #0 {
+; CHECK-SVE-LABEL: test_boring_case_2x2bit_mask:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    whilelo p1.s, x0, x1
+; CHECK-SVE-NEXT:    punpklo p0.h, p1.b
+; CHECK-SVE-NEXT:    punpkhi p1.h, p1.b
+; CHECK-SVE-NEXT:    b use
+;
+; CHECK-SVE2p1-LABEL: test_boring_case_2x2bit_mask:
+; CHECK-SVE2p1:       // %bb.0:
+; CHECK-SVE2p1-NEXT:    whilelo { p0.d, p1.d }, x0, x1
+; CHECK-SVE2p1-NEXT:    b use
+    %r = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %i, i64 %n)
+    %v0 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1.i64(<vscale x 4 x i1> %r, i64 0)
+    %v1 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1.i64(<vscale x 4 x i1> %r, i64 2)
+    tail call void @use(<vscale x 2 x i1> %v0, <vscale x 2 x i1> %v1)
+    ret void
+}
+
+; Negative test for when not extracting exactly two halves of the source vector
+define void @test_partial_extract(i64 %i, i64 %n) #0 {
+; CHECK-SVE-LABEL: test_partial_extract:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    whilelo p0.h, x0, x1
+; CHECK-SVE-NEXT:    punpklo p1.h, p0.b
+; CHECK-SVE-NEXT:    punpkhi p2.h, p0.b
+; CHECK-SVE-NEXT:    punpklo p0.h, p1.b
+; CHECK-SVE-NEXT:    punpklo p1.h, p2.b
+; CHECK-SVE-NEXT:    b use
+;
+; CHECK-SVE2p1-LABEL: test_partial_extract:
+; CHECK-SVE2p1:       // %bb.0:
+; CHECK-SVE2p1-NEXT:    whilelo p0.h, x0, x1
+; CHECK-SVE2p1-NEXT:    punpklo p1.h, p0.b
+; CHECK-SVE2p1-NEXT:    punpkhi p2.h, p0.b
+; CHECK-SVE2p1-NEXT:    punpklo p0.h, p1.b
+; CHECK-SVE2p1-NEXT:    punpklo p1.h, p2.b
+; CHECK-SVE2p1-NEXT:    b use
+    %r = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %i, i64 %n)
+    %v0 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1.i64(<vscale x 8 x i1> %r, i64 0)
+    %v1 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1.i64(<vscale x 8 x i1> %r, i64 4)
+    tail call void @use(<vscale x 2 x i1> %v0, <vscale x 2 x i1> %v1)
+    ret void
+}
+
+;; Negative test for when extracting a fixed-length vector.
+define void @test_fixed_extract(i64 %i, i64 %n) #0 {
+; CHECK-SVE-LABEL: test_fixed_extract:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    whilelo p0.h, x0, x1
+; CHECK-SVE-NEXT:    cset w8, mi
+; CHECK-SVE-NEXT:    mov z0.h, p0/z, #1 // =0x1
+; CHECK-SVE-NEXT:    umov w9, v0.h[4]
+; CHECK-SVE-NEXT:    umov w10, v0.h[1]
+; CHECK-SVE-NEXT:    umov w11, v0.h[5]
+; CHECK-SVE-NEXT:    fmov s0, w8
+; CHECK-SVE-NEXT:    fmov s1, w9
+; CHECK-SVE-NEXT:    mov v0.s[1], w10
+; CHECK-SVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SVE-NEXT:    mov v1.s[1], w11
+; CHECK-SVE-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SVE-NEXT:    b use
+;
+; CHECK-SVE2p1-LABEL: test_fixed_extract:
+; CHECK-SVE2p1:       // %bb.0:
+; CHECK-SVE2p1-NEXT:    whilelo p0.h, x0, x1
+; CHECK-SVE2p1-NEXT:    cset w8, mi
+; CHECK-SVE2p1-NEXT:    mov z0.h, p0/z, #1 // =0x1
+; CHECK-SVE2p1-NEXT:    umov w9, v0.h[4]
+; CHECK-SVE2p1-NEXT:    umov w10, v0.h[1]
+; CHECK-SVE2p1-NEXT:    umov w11, v0.h[5]
+; CHECK-SVE2p1-NEXT:    fmov s0, w8
+; CHECK-SVE2p1-NEXT:    fmov s1, w9
+; CHECK-SVE2p1-NEXT:    mov v0.s[1], w10
+; CHECK-SVE2p1-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SVE2p1-NEXT:    mov v1.s[1], w11
+; CHECK-SVE2p1-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SVE2p1-NEXT:    b use
+    %r = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %i, i64 %n)
+    %v0 = call <2 x i1> @llvm.vector.extract.v2i1.nxv8i1.i64(<vscale x 8 x i1> %r, i64 0)
+    %v1 = call <2 x i1> @llvm.vector.extract.v2i1.nxv8i1.i64(<vscale x 8 x i1> %r, i64 4)
+    tail call void @use(<2 x i1> %v0, <2 x i1> %v1)
+    ret void
+}
+
+declare void @use(...)
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index 8e10847e7aae..88b2f279ec3e 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -52,6 +52,1062 @@ entry:
   ret i8 %s
 }
 
+define <2 x i1> @test_v2i64_eq(<2 x i64> %v1, <2 x i64> %v2) {
+; CHECK-LABEL: test_v2i64_eq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %cmp = icmp eq <2 x i64> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i64_eq(<4 x i64> %v1, <4 x i64> %v2) {
+; CHECK-SD-LABEL: test_v4i64_eq:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    cmeq v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    cmeq v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v4i64_eq:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    cmeq v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    cmeq v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %cmp = icmp eq <4 x i64> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i32_eq(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: test_v4i32_eq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %cmp = icmp eq <4 x i32> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i32_eq(<2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: test_v2i32_eq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+  %cmp = icmp eq <2 x i32> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i16_eq(<2 x i16> %v1, <2 x i16> %v2) {
+; CHECK-SD-LABEL: test_v2i16_eq:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT:    cmeq v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v2i16_eq:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    cmeq v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ret
+  %cmp = icmp eq <2 x i16> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i16_eq(<8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: test_v8i16_eq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %cmp = icmp eq <8 x i16> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i16_eq(<4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: test_v4i16_eq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %cmp = icmp eq <4 x i16> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <16 x i1> @test_v16i8_eq(<16 x i8> %v1, <16 x i8> %v2) {
+; CHECK-LABEL: test_v16i8_eq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp eq <16 x i8> %v1, %v2
+  ret <16 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i8_eq(<8 x i8> %v1, <8 x i8> %v2) {
+; CHECK-LABEL: test_v8i8_eq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %cmp = icmp eq <8 x i8> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i64_ne(<2 x i64> %v1, <2 x i64> %v2) {
+; CHECK-LABEL: test_v2i64_ne:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %cmp = icmp ne <2 x i64> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i64_ne(<4 x i64> %v1, <4 x i64> %v2) {
+; CHECK-SD-LABEL: test_v4i64_ne:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    cmeq v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    cmeq v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    mvn v0.16b, v0.16b
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v4i64_ne:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    cmeq v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    cmeq v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    mvn v0.16b, v0.16b
+; CHECK-GI-NEXT:    mvn v1.16b, v1.16b
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %cmp = icmp ne <4 x i64> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i32_ne(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: test_v4i32_ne:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %cmp = icmp ne <4 x i32> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i32_ne(<2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: test_v2i32_ne:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    ret
+  %cmp = icmp ne <2 x i32> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i16_ne(<2 x i16> %v1, <2 x i16> %v2) {
+; CHECK-SD-LABEL: test_v2i16_ne:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT:    cmeq v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    mvn v0.8b, v0.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v2i16_ne:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    cmeq v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    mvn v0.8b, v0.8b
+; CHECK-GI-NEXT:    ret
+  %cmp = icmp ne <2 x i16> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i16_ne(<8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: test_v8i16_ne:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %cmp = icmp ne <8 x i16> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i16_ne(<4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: test_v4i16_ne:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    ret
+  %cmp = icmp ne <4 x i16> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <16 x i1> @test_v16i8_ne(<16 x i8> %v1, <16 x i8> %v2) {
+; CHECK-LABEL: test_v16i8_ne:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp ne <16 x i8> %v1, %v2
+  ret <16 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i8_ne(<8 x i8> %v1, <8 x i8> %v2) {
+; CHECK-LABEL: test_v8i8_ne:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmeq v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    ret
+  %cmp = icmp ne <8 x i8> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i64_ugt(<2 x i64> %v1, <2 x i64> %v2) {
+; CHECK-LABEL: test_v2i64_ugt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhi v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt <2 x i64> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i64_ugt(<4 x i64> %v1, <4 x i64> %v2) {
+; CHECK-SD-LABEL: test_v4i64_ugt:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    cmhi v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    cmhi v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v4i64_ugt:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    cmhi v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    cmhi v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %cmp = icmp ugt <4 x i64> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i32_ugt(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: test_v4i32_ugt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt <4 x i32> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i32_ugt(<2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: test_v2i32_ugt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhi v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt <2 x i32> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i16_ugt(<2 x i16> %v1, <2 x i16> %v2) {
+; CHECK-SD-LABEL: test_v2i16_ugt:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT:    cmhi v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v2i16_ugt:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    cmhi v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ret
+  %cmp = icmp ugt <2 x i16> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i16_ugt(<8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: test_v8i16_ugt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhi v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt <8 x i16> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i16_ugt(<4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: test_v4i16_ugt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhi v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt <4 x i16> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <16 x i1> @test_v16i8_ugt(<16 x i8> %v1, <16 x i8> %v2) {
+; CHECK-LABEL: test_v16i8_ugt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhi v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt <16 x i8> %v1, %v2
+  ret <16 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i8_ugt(<8 x i8> %v1, <8 x i8> %v2) {
+; CHECK-LABEL: test_v8i8_ugt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhi v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt <8 x i8> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i64_uge(<2 x i64> %v1, <2 x i64> %v2) {
+; CHECK-LABEL: test_v2i64_uge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhs v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %cmp = icmp uge <2 x i64> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i64_uge(<4 x i64> %v1, <4 x i64> %v2) {
+; CHECK-SD-LABEL: test_v4i64_uge:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    cmhs v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    cmhs v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v4i64_uge:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    cmhs v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    cmhs v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %cmp = icmp uge <4 x i64> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i32_uge(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: test_v4i32_uge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhs v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %cmp = icmp uge <4 x i32> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i32_uge(<2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: test_v2i32_uge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhs v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+  %cmp = icmp uge <2 x i32> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i16_uge(<2 x i16> %v1, <2 x i16> %v2) {
+; CHECK-SD-LABEL: test_v2i16_uge:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT:    cmhs v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v2i16_uge:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    cmhs v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ret
+  %cmp = icmp uge <2 x i16> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i16_uge(<8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: test_v8i16_uge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhs v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %cmp = icmp uge <8 x i16> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i16_uge(<4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: test_v4i16_uge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhs v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %cmp = icmp uge <4 x i16> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <16 x i1> @test_v16i8_uge(<16 x i8> %v1, <16 x i8> %v2) {
+; CHECK-LABEL: test_v16i8_uge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhs v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp uge <16 x i8> %v1, %v2
+  ret <16 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i8_uge(<8 x i8> %v1, <8 x i8> %v2) {
+; CHECK-LABEL: test_v8i8_uge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhs v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %cmp = icmp uge <8 x i8> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i64_ult(<2 x i64> %v1, <2 x i64> %v2) {
+; CHECK-LABEL: test_v2i64_ult:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhi v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %cmp = icmp ult <2 x i64> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i64_ult(<4 x i64> %v1, <4 x i64> %v2) {
+; CHECK-SD-LABEL: test_v4i64_ult:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    cmhi v1.2d, v3.2d, v1.2d
+; CHECK-SD-NEXT:    cmhi v0.2d, v2.2d, v0.2d
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v4i64_ult:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    cmhi v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    cmhi v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %cmp = icmp ult <4 x i64> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i32_ult(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: test_v4i32_ult:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %cmp = icmp ult <4 x i32> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i32_ult(<2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: test_v2i32_ult:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhi v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+  %cmp = icmp ult <2 x i32> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i16_ult(<2 x i16> %v1, <2 x i16> %v2) {
+; CHECK-LABEL: test_v2i16_ult:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT:    cmhi v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+  %cmp = icmp ult <2 x i16> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i16_ult(<8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: test_v8i16_ult:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhi v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %cmp = icmp ult <8 x i16> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i16_ult(<4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: test_v4i16_ult:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhi v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ret
+  %cmp = icmp ult <4 x i16> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <16 x i1> @test_v16i8_ult(<16 x i8> %v1, <16 x i8> %v2) {
+; CHECK-LABEL: test_v16i8_ult:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhi v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp ult <16 x i8> %v1, %v2
+  ret <16 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i8_ult(<8 x i8> %v1, <8 x i8> %v2) {
+; CHECK-LABEL: test_v8i8_ult:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhi v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+  %cmp = icmp ult <8 x i8> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i64_ule(<2 x i64> %v1, <2 x i64> %v2) {
+; CHECK-LABEL: test_v2i64_ule:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhs v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %cmp = icmp ule <2 x i64> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i64_ule(<4 x i64> %v1, <4 x i64> %v2) {
+; CHECK-SD-LABEL: test_v4i64_ule:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    cmhs v1.2d, v3.2d, v1.2d
+; CHECK-SD-NEXT:    cmhs v0.2d, v2.2d, v0.2d
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v4i64_ule:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    cmhs v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    cmhs v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %cmp = icmp ule <4 x i64> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i32_ule(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: test_v4i32_ule:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %cmp = icmp ule <4 x i32> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i32_ule(<2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: test_v2i32_ule:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhs v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+  %cmp = icmp ule <2 x i32> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i16_ule(<2 x i16> %v1, <2 x i16> %v2) {
+; CHECK-LABEL: test_v2i16_ule:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT:    cmhs v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+  %cmp = icmp ule <2 x i16> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i16_ule(<8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: test_v8i16_ule:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhs v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %cmp = icmp ule <8 x i16> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i16_ule(<4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: test_v4i16_ule:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhs v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ret
+  %cmp = icmp ule <4 x i16> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <16 x i1> @test_v16i8_ule(<16 x i8> %v1, <16 x i8> %v2) {
+; CHECK-LABEL: test_v16i8_ule:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhs v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp ule <16 x i8> %v1, %v2
+  ret <16 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i8_ule(<8 x i8> %v1, <8 x i8> %v2) {
+; CHECK-LABEL: test_v8i8_ule:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmhs v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+  %cmp = icmp ule <8 x i8> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i64_sgt(<2 x i64> %v1, <2 x i64> %v2) {
+; CHECK-LABEL: test_v2i64_sgt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <2 x i64> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i64_sgt(<4 x i64> %v1, <4 x i64> %v2) {
+; CHECK-SD-LABEL: test_v4i64_sgt:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    cmgt v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    cmgt v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v4i64_sgt:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    cmgt v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    cmgt v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %cmp = icmp sgt <4 x i64> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i32_sgt(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: test_v4i32_sgt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <4 x i32> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i32_sgt(<2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: test_v2i32_sgt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <2 x i32> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i16_sgt(<2 x i16> %v1, <2 x i16> %v2) {
+; CHECK-SD-LABEL: test_v2i16_sgt:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    sshr v1.2s, v1.2s, #16
+; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    cmgt v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v2i16_sgt:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    sshr v1.2s, v1.2s, #16
+; CHECK-GI-NEXT:    cmgt v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ret
+  %cmp = icmp sgt <2 x i16> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i16_sgt(<8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: test_v8i16_sgt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <8 x i16> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i16_sgt(<4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: test_v4i16_sgt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <4 x i16> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <16 x i1> @test_v16i8_sgt(<16 x i8> %v1, <16 x i8> %v2) {
+; CHECK-LABEL: test_v16i8_sgt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <16 x i8> %v1, %v2
+  ret <16 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i8_sgt(<8 x i8> %v1, <8 x i8> %v2) {
+; CHECK-LABEL: test_v8i8_sgt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <8 x i8> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i64_sge(<2 x i64> %v1, <2 x i64> %v2) {
+; CHECK-LABEL: test_v2i64_sge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %cmp = icmp sge <2 x i64> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i64_sge(<4 x i64> %v1, <4 x i64> %v2) {
+; CHECK-SD-LABEL: test_v4i64_sge:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    cmge v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    cmge v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v4i64_sge:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    cmge v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    cmge v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %cmp = icmp sge <4 x i64> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i32_sge(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: test_v4i32_sge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %cmp = icmp sge <4 x i32> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i32_sge(<2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: test_v2i32_sge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+  %cmp = icmp sge <2 x i32> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i16_sge(<2 x i16> %v1, <2 x i16> %v2) {
+; CHECK-SD-LABEL: test_v2i16_sge:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    sshr v1.2s, v1.2s, #16
+; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    cmge v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v2i16_sge:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    sshr v1.2s, v1.2s, #16
+; CHECK-GI-NEXT:    cmge v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ret
+  %cmp = icmp sge <2 x i16> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i16_sge(<8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: test_v8i16_sge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %cmp = icmp sge <8 x i16> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i16_sge(<4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: test_v4i16_sge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %cmp = icmp sge <4 x i16> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <16 x i1> @test_v16i8_sge(<16 x i8> %v1, <16 x i8> %v2) {
+; CHECK-LABEL: test_v16i8_sge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp sge <16 x i8> %v1, %v2
+  ret <16 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i8_sge(<8 x i8> %v1, <8 x i8> %v2) {
+; CHECK-LABEL: test_v8i8_sge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %cmp = icmp sge <8 x i8> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i64_slt(<2 x i64> %v1, <2 x i64> %v2) {
+; CHECK-LABEL: test_v2i64_slt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <2 x i64> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i64_slt(<4 x i64> %v1, <4 x i64> %v2) {
+; CHECK-SD-LABEL: test_v4i64_slt:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    cmgt v1.2d, v3.2d, v1.2d
+; CHECK-SD-NEXT:    cmgt v0.2d, v2.2d, v0.2d
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v4i64_slt:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    cmgt v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    cmgt v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %cmp = icmp slt <4 x i64> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i32_slt(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: test_v4i32_slt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <4 x i32> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i32_slt(<2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: test_v2i32_slt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <2 x i32> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i16_slt(<2 x i16> %v1, <2 x i16> %v2) {
+; CHECK-LABEL: test_v2i16_slt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-NEXT:    sshr v1.2s, v1.2s, #16
+; CHECK-NEXT:    cmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <2 x i16> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i16_slt(<8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: test_v8i16_slt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <8 x i16> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i16_slt(<4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: test_v4i16_slt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <4 x i16> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <16 x i1> @test_v16i8_slt(<16 x i8> %v1, <16 x i8> %v2) {
+; CHECK-LABEL: test_v16i8_slt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <16 x i8> %v1, %v2
+  ret <16 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i8_slt(<8 x i8> %v1, <8 x i8> %v2) {
+; CHECK-LABEL: test_v8i8_slt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <8 x i8> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i64_sle(<2 x i64> %v1, <2 x i64> %v2) {
+; CHECK-LABEL: test_v2i64_sle:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %cmp = icmp sle <2 x i64> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i64_sle(<4 x i64> %v1, <4 x i64> %v2) {
+; CHECK-SD-LABEL: test_v4i64_sle:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    cmge v1.2d, v3.2d, v1.2d
+; CHECK-SD-NEXT:    cmge v0.2d, v2.2d, v0.2d
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v4i64_sle:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    cmge v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    cmge v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %cmp = icmp sle <4 x i64> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i32_sle(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: test_v4i32_sle:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %cmp = icmp sle <4 x i32> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i32_sle(<2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: test_v2i32_sle:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+  %cmp = icmp sle <2 x i32> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <2 x i1> @test_v2i16_sle(<2 x i16> %v1, <2 x i16> %v2) {
+; CHECK-LABEL: test_v2i16_sle:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    shl v1.2s, v1.2s, #16
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-NEXT:    sshr v1.2s, v1.2s, #16
+; CHECK-NEXT:    cmge v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+  %cmp = icmp sle <2 x i16> %v1, %v2
+  ret <2 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i16_sle(<8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: test_v8i16_sle:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %cmp = icmp sle <8 x i16> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
+define <4 x i1> @test_v4i16_sle(<4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: test_v4i16_sle:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ret
+  %cmp = icmp sle <4 x i16> %v1, %v2
+  ret <4 x i1> %cmp
+}
+
+define <16 x i1> @test_v16i8_sle(<16 x i8> %v1, <16 x i8> %v2) {
+; CHECK-LABEL: test_v16i8_sle:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp sle <16 x i8> %v1, %v2
+  ret <16 x i1> %cmp
+}
+
+define <8 x i1> @test_v8i8_sle(<8 x i8> %v1, <8 x i8> %v2) {
+; CHECK-LABEL: test_v8i8_sle:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+  %cmp = icmp sle <8 x i8> %v1, %v2
+  ret <8 x i1> %cmp
+}
+
 define <2 x i64> @v2i64_i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %d, <2 x i64> %e) {
 ; CHECK-LABEL: v2i64_i64:
 ; CHECK:       // %bb.0: // %entry
@@ -319,3 +1375,1117 @@ entry:
   %s = select <32 x i1> %c, <32 x i8> %d, <32 x i8> %e
   ret <32 x i8> %s
 }
+
+; ===== ICMP Zero RHS =====
+
+define <8 x i1> @icmp_eq_v8i8_Zero_RHS(<8 x i8> %a) {
+; CHECK-SD-LABEL: icmp_eq_v8i8_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmeq v0.8b, v0.8b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_eq_v8i8_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmeq v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
+    %c = icmp eq <8 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+    ret <8 x i1> %c
+}
+
+define <16 x i1> @icmp_eq_v16i8_Zero_RHS(<16 x i8> %a) {
+; CHECK-SD-LABEL: icmp_eq_v16i8_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmeq v0.16b, v0.16b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_eq_v16i8_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmeq v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
+    %c = icmp eq <16 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+    ret <16 x i1> %c
+}
+
+define <4 x i1> @icmp_eq_v4i16_Zero_RHS(<4 x i16> %a) {
+; CHECK-SD-LABEL: icmp_eq_v4i16_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmeq v0.4h, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_eq_v4i16_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmeq v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    ret
+    %c = icmp eq <4 x i16> %a, <i16 0, i16 0, i16 0, i16 0>
+    ret <4 x i1> %c
+}
+
+define <8 x i1> @icmp_eq_v8i16_Zero_RHS(<8 x i16> %a) {
+; CHECK-SD-LABEL: icmp_eq_v8i16_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmeq v0.8h, v0.8h, #0
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_eq_v8i16_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmeq v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+    %c = icmp eq <8 x i16> %a, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+    ret <8 x i1> %c
+}
+
+define <2 x i1> @icmp_eq_v2i32_Zero_RHS(<2 x i32> %a) {
+; CHECK-SD-LABEL: icmp_eq_v2i32_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmeq v0.2s, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_eq_v2i32_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmeq v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ret
+    %c = icmp eq <2 x i32> %a, <i32 0, i32 0>
+    ret <2 x i1> %c
+}
+
+define <4 x i1> @icmp_eq_v4i32_Zero_RHS(<4 x i32> %a) {
+; CHECK-SD-LABEL: icmp_eq_v4i32_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_eq_v4i32_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmeq v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+    %c = icmp eq <4 x i32> %a, <i32 0, i32 0, i32 0, i32 0>
+    ret <4 x i1> %c
+}
+
+define <2 x i1> @icmp_eq_v2i64_Zero_RHS(<2 x i64> %a) {
+; CHECK-SD-LABEL: icmp_eq_v2i64_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmeq v0.2d, v0.2d, #0
+; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_eq_v2i64_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmeq v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+    %c = icmp eq <2 x i64> %a, <i64 0, i64 0>
+    ret <2 x i1> %c
+}
+
+define <8 x i1> @icmp_sge_v8i8_Zero_RHS(<8 x i8> %a) {
+; CHECK-SD-LABEL: icmp_sge_v8i8_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmge v0.8b, v0.8b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sge_v8i8_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
+    %c = icmp sge <8 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+    ret <8 x i1> %c
+}
+
+define <16 x i1> @icmp_sge_v16i8_Zero_RHS(<16 x i8> %a) {
+; CHECK-SD-LABEL: icmp_sge_v16i8_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmge v0.16b, v0.16b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sge_v16i8_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
+    %c = icmp sge <16 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+    ret <16 x i1> %c
+}
+
+define <4 x i1> @icmp_sge_v4i16_Zero_RHS(<4 x i16> %a) {
+; CHECK-SD-LABEL: icmp_sge_v4i16_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmge v0.4h, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sge_v4i16_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    ret
+    %c = icmp sge <4 x i16> %a, <i16 0, i16 0, i16 0, i16 0>
+    ret <4 x i1> %c
+}
+
+define <8 x i1> @icmp_sge_v8i16_Zero_RHS(<8 x i16> %a) {
+; CHECK-SD-LABEL: icmp_sge_v8i16_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmge v0.8h, v0.8h, #0
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sge_v8i16_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+    %c = icmp sge <8 x i16> %a, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+    ret <8 x i1> %c
+}
+
+define <2 x i1> @icmp_sge_v2i32_Zero_RHS(<2 x i32> %a) {
+; CHECK-SD-LABEL: icmp_sge_v2i32_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmge v0.2s, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sge_v2i32_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ret
+    %c = icmp sge <2 x i32> %a, <i32 0, i32 0>
+    ret <2 x i1> %c
+}
+
+define <4 x i1> @icmp_sge_v4i32_Zero_RHS(<4 x i32> %a) {
+; CHECK-SD-LABEL: icmp_sge_v4i32_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmge v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sge_v4i32_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+    %c = icmp sge <4 x i32> %a, <i32 0, i32 0, i32 0, i32 0>
+    ret <4 x i1> %c
+}
+
+define <2 x i1> @icmp_sge_v2i64_Zero_RHS(<2 x i64> %a) {
+; CHECK-SD-LABEL: icmp_sge_v2i64_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmge v0.2d, v0.2d, #0
+; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sge_v2i64_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+    %c = icmp sge <2 x i64> %a, <i64 0, i64 0>
+    ret <2 x i1> %c
+}
+
+define <8 x i1> @icmp_sgt_v8i8_Zero_RHS(<8 x i8> %a) {
+; CHECK-SD-LABEL: icmp_sgt_v8i8_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmgt v0.8b, v0.8b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sgt_v8i8_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
+    %c = icmp sgt <8 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+    ret <8 x i1> %c
+}
+
+define <16 x i1> @icmp_sgt_v16i8_Zero_RHS(<16 x i8> %a) {
+; CHECK-SD-LABEL: icmp_sgt_v16i8_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmgt v0.16b, v0.16b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sgt_v16i8_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
+    %c = icmp sgt <16 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+    ret <16 x i1> %c
+}
+
+define <4 x i1> @icmp_sgt_v4i16_Zero_RHS(<4 x i16> %a) {
+; CHECK-SD-LABEL: icmp_sgt_v4i16_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmgt v0.4h, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sgt_v4i16_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    ret
+    %c = icmp sgt <4 x i16> %a, <i16 0, i16 0, i16 0, i16 0>
+    ret <4 x i1> %c
+}
+
+define <8 x i1> @icmp_sgt_v8i16_Zero_RHS(<8 x i16> %a) {
+; CHECK-SD-LABEL: icmp_sgt_v8i16_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmgt v0.8h, v0.8h, #0
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sgt_v8i16_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+    %c = icmp sgt <8 x i16> %a, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+    ret <8 x i1> %c
+}
+
+define <2 x i1> @icmp_sgt_v2i32_Zero_RHS(<2 x i32> %a) {
+; CHECK-SD-LABEL: icmp_sgt_v2i32_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmgt v0.2s, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sgt_v2i32_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ret
+    %c = icmp sgt <2 x i32> %a, <i32 0, i32 0>
+    ret <2 x i1> %c
+}
+
+define <4 x i1> @icmp_sgt_v4i32_Zero_RHS(<4 x i32> %a) {
+; CHECK-SD-LABEL: icmp_sgt_v4i32_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmgt v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sgt_v4i32_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+    %c = icmp sgt <4 x i32> %a, <i32 0, i32 0, i32 0, i32 0>
+    ret <4 x i1> %c
+}
+
+define <2 x i1> @icmp_sgt_v2i64_Zero_RHS(<2 x i64> %a) {
+; CHECK-SD-LABEL: icmp_sgt_v2i64_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmgt v0.2d, v0.2d, #0
+; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sgt_v2i64_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+    %c = icmp sgt <2 x i64> %a, <i64 0, i64 0>
+    ret <2 x i1> %c
+}
+
+define <8 x i1> @icmp_sle_v8i8_Zero_RHS(<8 x i8> %a) {
+; CHECK-SD-LABEL: icmp_sle_v8i8_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmle v0.8b, v0.8b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sle_v8i8_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
+    %c = icmp sle <8 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+    ret <8 x i1> %c
+}
+
+define <16 x i1> @icmp_sle_v16i8_Zero_RHS(<16 x i8> %a) {
+; CHECK-SD-LABEL: icmp_sle_v16i8_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmle v0.16b, v0.16b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sle_v16i8_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
+    %c = icmp sle <16 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+    ret <16 x i1> %c
+}
+
+define <4 x i1> @icmp_sle_v4i16_Zero_RHS(<4 x i16> %a) {
+; CHECK-SD-LABEL: icmp_sle_v4i16_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmle v0.4h, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sle_v4i16_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    ret
+    %c = icmp sle <4 x i16> %a, <i16 0, i16 0, i16 0, i16 0>
+    ret <4 x i1> %c
+}
+
+define <8 x i1> @icmp_sle_v8i16_Zero_RHS(<8 x i16> %a) {
+; CHECK-SD-LABEL: icmp_sle_v8i16_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmle v0.8h, v0.8h, #0
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sle_v8i16_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+    %c = icmp sle <8 x i16> %a, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+    ret <8 x i1> %c
+}
+
+define <2 x i1> @icmp_sle_v2i32_Zero_RHS(<2 x i32> %a) {
+; CHECK-SD-LABEL: icmp_sle_v2i32_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmle v0.2s, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sle_v2i32_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    ret
+    %c = icmp sle <2 x i32> %a, <i32 0, i32 0>
+    ret <2 x i1> %c
+}
+
+define <4 x i1> @icmp_sle_v4i32_Zero_RHS(<4 x i32> %a) {
+; CHECK-SD-LABEL: icmp_sle_v4i32_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmle v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sle_v4i32_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+    %c = icmp sle <4 x i32> %a, <i32 0, i32 0, i32 0, i32 0>
+    ret <4 x i1> %c
+}
+
+define <2 x i1> @icmp_sle_v2i64_Zero_RHS(<2 x i64> %a) {
+; CHECK-SD-LABEL: icmp_sle_v2i64_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmle v0.2d, v0.2d, #0
+; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sle_v2i64_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+    %c = icmp sle <2 x i64> %a, <i64 0, i64 0>
+    ret <2 x i1> %c
+}
+
+define <8 x i1> @icmp_slt_v8i8_Zero_RHS(<8 x i8> %a) {
+; CHECK-SD-LABEL: icmp_slt_v8i8_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.8b, v0.8b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_slt_v8i8_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
+    %c = icmp slt <8 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+    ret <8 x i1> %c
+}
+
+define <16 x i1> @icmp_slt_v16i8_Zero_RHS(<16 x i8> %a) {
+; CHECK-SD-LABEL: icmp_slt_v16i8_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.16b, v0.16b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_slt_v16i8_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
+    %c = icmp slt <16 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+    ret <16 x i1> %c
+}
+
+define <4 x i1> @icmp_slt_v4i16_Zero_RHS(<4 x i16> %a) {
+; CHECK-SD-LABEL: icmp_slt_v4i16_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.4h, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_slt_v4i16_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    ret
+    %c = icmp slt <4 x i16> %a, <i16 0, i16 0, i16 0, i16 0>
+    ret <4 x i1> %c
+}
+
+define <8 x i1> @icmp_slt_v8i16_Zero_RHS(<8 x i16> %a) {
+; CHECK-SD-LABEL: icmp_slt_v8i16_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_slt_v8i16_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+    %c = icmp slt <8 x i16> %a, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+    ret <8 x i1> %c
+}
+
+define <2 x i1> @icmp_slt_v2i32_Zero_RHS(<2 x i32> %a) {
+; CHECK-SD-LABEL: icmp_slt_v2i32_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_slt_v2i32_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    ret
+    %c = icmp slt <2 x i32> %a, <i32 0, i32 0>
+    ret <2 x i1> %c
+}
+
+define <4 x i1> @icmp_slt_v4i32_Zero_RHS(<4 x i32> %a) {
+; CHECK-SD-LABEL: icmp_slt_v4i32_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_slt_v4i32_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+    %c = icmp slt <4 x i32> %a, <i32 0, i32 0, i32 0, i32 0>
+    ret <4 x i1> %c
+}
+
+define <2 x i1> @icmp_slt_v2i64_Zero_RHS(<2 x i64> %a) {
+; CHECK-SD-LABEL: icmp_slt_v2i64_Zero_RHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_slt_v2i64_Zero_RHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+    %c = icmp slt <2 x i64> %a, <i64 0, i64 0>
+    ret <2 x i1> %c
+}
+
+; ===== ICMP Zero LHS =====
+
+define <8 x i1> @icmp_eq_v8i8_Zero_LHS(<8 x i8> %a) {
+; CHECK-SD-LABEL: icmp_eq_v8i8_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmeq v0.8b, v0.8b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_eq_v8i8_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmeq v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
+    %c = icmp eq <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %a
+    ret <8 x i1> %c
+}
+
+define <16 x i1> @icmp_eq_v16i8_Zero_LHS(<16 x i8> %a) {
+; CHECK-SD-LABEL: icmp_eq_v16i8_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmeq v0.16b, v0.16b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_eq_v16i8_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmeq v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
+    %c = icmp eq <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %a
+    ret <16 x i1> %c
+}
+
+define <4 x i1> @icmp_eq_v4i16_Zero_LHS(<4 x i16> %a) {
+; CHECK-SD-LABEL: icmp_eq_v4i16_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmeq v0.4h, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_eq_v4i16_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmeq v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    ret
+    %c = icmp eq <4 x i16> <i16 0, i16 0, i16 0, i16 0>, %a
+    ret <4 x i1> %c
+}
+
+define <8 x i1> @icmp_eq_v8i16_Zero_LHS(<8 x i16> %a) {
+; CHECK-SD-LABEL: icmp_eq_v8i16_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmeq v0.8h, v0.8h, #0
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_eq_v8i16_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmeq v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+    %c = icmp eq <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, %a
+    ret <8 x i1> %c
+}
+
+define <2 x i1> @icmp_eq_v2i32_Zero_LHS(<2 x i32> %a) {
+; CHECK-SD-LABEL: icmp_eq_v2i32_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmeq v0.2s, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_eq_v2i32_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmeq v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    ret
+    %c = icmp eq <2 x i32> <i32 0, i32 0>, %a
+    ret <2 x i1> %c
+}
+
+define <4 x i1> @icmp_eq_v4i32_Zero_LHS(<4 x i32> %a) {
+; CHECK-SD-LABEL: icmp_eq_v4i32_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_eq_v4i32_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmeq v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+    %c = icmp eq <4 x i32> <i32 0, i32 0, i32 0, i32 0>, %a
+    ret <4 x i1> %c
+}
+
+define <2 x i1> @icmp_eq_v2i64_Zero_LHS(<2 x i64> %a) {
+; CHECK-SD-LABEL: icmp_eq_v2i64_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmeq v0.2d, v0.2d, #0
+; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_eq_v2i64_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmeq v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+    %c = icmp eq <2 x i64> <i64 0, i64 0>, %a
+    ret <2 x i1> %c
+}
+
+define <8 x i1> @icmp_sge_v8i8_Zero_LHS(<8 x i8> %a) {
+; CHECK-SD-LABEL: icmp_sge_v8i8_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmle v0.8b, v0.8b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sge_v8i8_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
+    %c = icmp sge <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %a
+    ret <8 x i1> %c
+}
+
+define <16 x i1> @icmp_sge_v16i8_Zero_LHS(<16 x i8> %a) {
+; CHECK-SD-LABEL: icmp_sge_v16i8_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmle v0.16b, v0.16b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sge_v16i8_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
+    %c = icmp sge <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %a
+    ret <16 x i1> %c
+}
+
+define <4 x i1> @icmp_sge_v4i16_Zero_LHS(<4 x i16> %a) {
+; CHECK-SD-LABEL: icmp_sge_v4i16_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmle v0.4h, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sge_v4i16_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    ret
+    %c = icmp sge <4 x i16> <i16 0, i16 0, i16 0, i16 0>, %a
+    ret <4 x i1> %c
+}
+
+define <8 x i1> @icmp_sge_v8i16_Zero_LHS(<8 x i16> %a) {
+; CHECK-SD-LABEL: icmp_sge_v8i16_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmle v0.8h, v0.8h, #0
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sge_v8i16_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+    %c = icmp sge <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, %a
+    ret <8 x i1> %c
+}
+
+define <2 x i1> @icmp_sge_v2i32_Zero_LHS(<2 x i32> %a) {
+; CHECK-SD-LABEL: icmp_sge_v2i32_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmle v0.2s, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sge_v2i32_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    ret
+    %c = icmp sge <2 x i32> <i32 0, i32 0>, %a
+    ret <2 x i1> %c
+}
+
+define <4 x i1> @icmp_sge_v4i32_Zero_LHS(<4 x i32> %a) {
+; CHECK-SD-LABEL: icmp_sge_v4i32_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmle v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sge_v4i32_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+    %c = icmp sge <4 x i32> <i32 0, i32 0, i32 0, i32 0>, %a
+    ret <4 x i1> %c
+}
+
+define <2 x i1> @icmp_sge_v2i64_Zero_LHS(<2 x i64> %a) {
+; CHECK-SD-LABEL: icmp_sge_v2i64_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmle v0.2d, v0.2d, #0
+; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sge_v2i64_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+    %c = icmp sge <2 x i64> <i64 0, i64 0>, %a
+    ret <2 x i1> %c
+}
+
+define <8 x i1> @icmp_sgt_v8i8_Zero_LHS(<8 x i8> %a) {
+; CHECK-SD-LABEL: icmp_sgt_v8i8_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.8b, v0.8b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sgt_v8i8_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
+    %c = icmp sgt <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %a
+    ret <8 x i1> %c
+}
+
+define <16 x i1> @icmp_sgt_v16i8_Zero_LHS(<16 x i8> %a) {
+; CHECK-SD-LABEL: icmp_sgt_v16i8_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.16b, v0.16b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sgt_v16i8_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
+    %c = icmp sgt <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %a
+    ret <16 x i1> %c
+}
+
+define <4 x i1> @icmp_sgt_v4i16_Zero_LHS(<4 x i16> %a) {
+; CHECK-SD-LABEL: icmp_sgt_v4i16_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.4h, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sgt_v4i16_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    ret
+    %c = icmp sgt <4 x i16> <i16 0, i16 0, i16 0, i16 0>, %a
+    ret <4 x i1> %c
+}
+
+define <8 x i1> @icmp_sgt_v8i16_Zero_LHS(<8 x i16> %a) {
+; CHECK-SD-LABEL: icmp_sgt_v8i16_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sgt_v8i16_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+    %c = icmp sgt <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, %a
+    ret <8 x i1> %c
+}
+
+define <2 x i1> @icmp_sgt_v2i32_Zero_LHS(<2 x i32> %a) {
+; CHECK-SD-LABEL: icmp_sgt_v2i32_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sgt_v2i32_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    ret
+    %c = icmp sgt <2 x i32> <i32 0, i32 0>, %a
+    ret <2 x i1> %c
+}
+
+define <4 x i1> @icmp_sgt_v4i32_Zero_LHS(<4 x i32> %a) {
+; CHECK-SD-LABEL: icmp_sgt_v4i32_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sgt_v4i32_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+    %c = icmp sgt <4 x i32> <i32 0, i32 0, i32 0, i32 0>, %a
+    ret <4 x i1> %c
+}
+
+define <2 x i1> @icmp_sgt_v2i64_Zero_LHS(<2 x i64> %a) {
+; CHECK-SD-LABEL: icmp_sgt_v2i64_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sgt_v2i64_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+    %c = icmp sgt <2 x i64> <i64 0, i64 0>, %a
+    ret <2 x i1> %c
+}
+
+define <8 x i1> @icmp_sle_v8i8_Zero_LHS(<8 x i8> %a) {
+; CHECK-SD-LABEL: icmp_sle_v8i8_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmge v0.8b, v0.8b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sle_v8i8_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
+    %c = icmp sle <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %a
+    ret <8 x i1> %c
+}
+
+define <16 x i1> @icmp_sle_v16i8_Zero_LHS(<16 x i8> %a) {
+; CHECK-SD-LABEL: icmp_sle_v16i8_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmge v0.16b, v0.16b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sle_v16i8_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
+    %c = icmp sle <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %a
+    ret <16 x i1> %c
+}
+
+define <4 x i1> @icmp_sle_v4i16_Zero_LHS(<4 x i16> %a) {
+; CHECK-SD-LABEL: icmp_sle_v4i16_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmge v0.4h, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sle_v4i16_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    ret
+    %c = icmp sle <4 x i16> <i16 0, i16 0, i16 0, i16 0>, %a
+    ret <4 x i1> %c
+}
+
+define <8 x i1> @icmp_sle_v8i16_Zero_LHS(<8 x i16> %a) {
+; CHECK-SD-LABEL: icmp_sle_v8i16_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmge v0.8h, v0.8h, #0
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sle_v8i16_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+    %c = icmp sle <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, %a
+    ret <8 x i1> %c
+}
+
+define <2 x i1> @icmp_sle_v2i32_Zero_LHS(<2 x i32> %a) {
+; CHECK-SD-LABEL: icmp_sle_v2i32_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmge v0.2s, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sle_v2i32_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ret
+    %c = icmp sle <2 x i32> <i32 0, i32 0>, %a
+    ret <2 x i1> %c
+}
+
+define <4 x i1> @icmp_sle_v4i32_Zero_LHS(<4 x i32> %a) {
+; CHECK-SD-LABEL: icmp_sle_v4i32_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmge v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sle_v4i32_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+    %c = icmp sle <4 x i32> <i32 0, i32 0, i32 0, i32 0>, %a
+    ret <4 x i1> %c
+}
+
+define <2 x i1> @icmp_sle_v2i64_Zero_LHS(<2 x i64> %a) {
+; CHECK-SD-LABEL: icmp_sle_v2i64_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmge v0.2d, v0.2d, #0
+; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_sle_v2i64_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmge v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+    %c = icmp sle <2 x i64> <i64 0, i64 0>, %a
+    ret <2 x i1> %c
+}
+
+define <8 x i1> @icmp_slt_v8i8_Zero_LHS(<8 x i8> %a) {
+; CHECK-SD-LABEL: icmp_slt_v8i8_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmgt v0.8b, v0.8b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_slt_v8i8_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
+    %c = icmp slt <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %a
+    ret <8 x i1> %c
+}
+
+define <16 x i1> @icmp_slt_v16i8_Zero_LHS(<16 x i8> %a) {
+; CHECK-SD-LABEL: icmp_slt_v16i8_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmgt v0.16b, v0.16b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_slt_v16i8_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
+    %c = icmp slt <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %a
+    ret <16 x i1> %c
+}
+
+define <4 x i1> @icmp_slt_v4i16_Zero_LHS(<4 x i16> %a) {
+; CHECK-SD-LABEL: icmp_slt_v4i16_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmgt v0.4h, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_slt_v4i16_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    ret
+    %c = icmp slt <4 x i16> <i16 0, i16 0, i16 0, i16 0>, %a
+    ret <4 x i1> %c
+}
+
+define <8 x i1> @icmp_slt_v8i16_Zero_LHS(<8 x i16> %a) {
+; CHECK-SD-LABEL: icmp_slt_v8i16_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmgt v0.8h, v0.8h, #0
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_slt_v8i16_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+    %c = icmp slt <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, %a
+    ret <8 x i1> %c
+}
+
+define <2 x i1> @icmp_slt_v2i32_Zero_LHS(<2 x i32> %a) {
+; CHECK-SD-LABEL: icmp_slt_v2i32_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmgt v0.2s, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_slt_v2i32_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ret
+    %c = icmp slt <2 x i32> <i32 0, i32 0>, %a
+    ret <2 x i1> %c
+}
+
+define <4 x i1> @icmp_slt_v4i32_Zero_LHS(<4 x i32> %a) {
+; CHECK-SD-LABEL: icmp_slt_v4i32_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmgt v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_slt_v4i32_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+    %c = icmp slt <4 x i32> <i32 0, i32 0, i32 0, i32 0>, %a
+    ret <4 x i1> %c
+}
+
+define <2 x i1> @icmp_slt_v2i64_Zero_LHS(<2 x i64> %a) {
+; CHECK-SD-LABEL: icmp_slt_v2i64_Zero_LHS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmgt v0.2d, v0.2d, #0
+; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: icmp_slt_v2i64_Zero_LHS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT:    cmgt v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+    %c = icmp slt <2 x i64> <i64 0, i64 0>, %a
+    ret <2 x i1> %c
+}
diff --git a/llvm/test/CodeGen/AArch64/interleaved-load-combine-pr90695.ll b/llvm/test/CodeGen/AArch64/interleaved-load-combine-pr90695.ll
new file mode 100644
index 000000000000..ee75b3a083f7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/interleaved-load-combine-pr90695.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=interleaved-load-combine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-windows-gnu"
+
+; Make sure we don't crash on loads of vectors of non-byte-sized types.
+define <4 x i1> @test(ptr %p) {
+; CHECK-LABEL: define <4 x i1> @test(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOAD:%.*]] = load <2 x i1>, ptr [[P]], align 1
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i1> [[LOAD]], <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x i1> [[SHUF]]
+;
+entry:
+  %load = load <2 x i1>, ptr %p, align 1
+  %shuf = shufflevector <2 x i1> %load, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+  ret <4 x i1> %shuf
+}
diff --git a/llvm/test/CodeGen/AArch64/mul_pow2.ll b/llvm/test/CodeGen/AArch64/mul_pow2.ll
index 0c9ea51ba367..c4839175ded5 100644
--- a/llvm/test/CodeGen/AArch64/mul_pow2.ll
+++ b/llvm/test/CodeGen/AArch64/mul_pow2.ll
@@ -527,6 +527,23 @@ define i32 @test25_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
   ret i32 %mul
 }
 
+define i32 @test29_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
+; CHECK-LABEL: test29_fast_shift:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, w0, lsl #3
+; CHECK-NEXT:    sub w0, w0, w8, lsl #2
+; CHECK-NEXT:    ret
+;
+; GISEL-LABEL: test29_fast_shift:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    mov w8, #29 // =0x1d
+; GISEL-NEXT:    mul w0, w0, w8
+; GISEL-NEXT:    ret
+
+  %mul = mul nsw i32 %x, 29 ; 29 = 1 - (1-8) * 4
+  ret i32 %mul
+}
+
 define i32 @test45_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
 ; CHECK-LABEL: test45_fast_shift:
 ; CHECK:       // %bb.0:
@@ -615,6 +632,42 @@ define i32 @test97_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
   ret i32 %mul
 }
 
+; Negative test: The shift number 5 is out of bound
+define i32 @test125_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
+; CHECK-LABEL: test125_fast_shift:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #125 // =0x7d
+; CHECK-NEXT:    mul w0, w0, w8
+; CHECK-NEXT:    ret
+;
+; GISEL-LABEL: test125_fast_shift:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    mov w8, #125 // =0x7d
+; GISEL-NEXT:    mul w0, w0, w8
+; GISEL-NEXT:    ret
+
+  %mul = mul nsw i32 %x, 125 ; 125 = 1 - ((1-32) << 2)
+  ret i32 %mul
+}
+
+; TODO: (1 - 2^M) * (1 - 2^N)
+define i32 @test225_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
+; CHECK-LABEL: test225_fast_shift:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #225 // =0xe1
+; CHECK-NEXT:    mul w0, w0, w8
+; CHECK-NEXT:    ret
+;
+; GISEL-LABEL: test225_fast_shift:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    mov w8, #225 // =0xe1
+; GISEL-NEXT:    mul w0, w0, w8
+; GISEL-NEXT:    ret
+
+  %mul = mul nsw i32 %x, 225 ; 225 = (1-16)*(1-16)
+  ret i32 %mul
+}
+
 ; Negative test: The shift amount 5 larger than 4
 define i32 @test297_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
 ; CHECK-LABEL: test297_fast_shift:
@@ -633,6 +686,24 @@ define i32 @test297_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
   ret i32 %mul
 }
 
+; Negative test: The shift number 5 is out of bound
+define i32 @test481_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
+; CHECK-LABEL: test481_fast_shift:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #481 // =0x1e1
+; CHECK-NEXT:    mul w0, w0, w8
+; CHECK-NEXT:    ret
+;
+; GISEL-LABEL: test481_fast_shift:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    mov w8, #481 // =0x1e1
+; GISEL-NEXT:    mul w0, w0, w8
+; GISEL-NEXT:    ret
+
+  %mul = mul nsw i32 %x, 481 ; 481 = 1 - ((1-16) << 5)
+  ret i32 %mul
+}
+
 ; Convert mul x, -pow2 to shift.
 ; Convert mul x, -(pow2 +/- 1) to shift + add/sub.
 ; Lowering other negative constants are not supported yet.
@@ -910,9 +981,9 @@ define <4 x i32> @muladd_demand_commute(<4 x i32> %x, <4 x i32> %y) {
 ;
 ; GISEL-LABEL: muladd_demand_commute:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    adrp x8, .LCPI52_0
+; GISEL-NEXT:    adrp x8, .LCPI56_0
 ; GISEL-NEXT:    movi v3.4s, #1, msl #16
-; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI52_0]
+; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI56_0]
 ; GISEL-NEXT:    mla v1.4s, v0.4s, v2.4s
 ; GISEL-NEXT:    and v0.16b, v1.16b, v3.16b
 ; GISEL-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
index f5763cd61033..d1171bc31247 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -692,6 +692,104 @@ define <vscale x 2 x double> @splice_nxv2f64_neg3(<vscale x 2 x double> %a, <vsc
   ret <vscale x 2 x double> %res
 }
 
+define <vscale x 2 x bfloat> @splice_nxv2bf16_neg_idx(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_neg_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    rev p0.d, p0.d
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 -1)
+  ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @splice_nxv2bf16_neg2_idx(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_neg2_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    rev p0.d, p0.d
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 -2)
+  ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @splice_nxv2bf16_first_idx(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_first_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 1)
+  ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 2 x bfloat> @splice_nxv2bf16_last_idx(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) vscale_range(16,16) #0 {
+; CHECK-LABEL: splice_nxv2bf16_last_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #248
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 31)
+  ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_neg_idx(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_neg_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    rev p0.s, p0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 -1)
+  ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_neg3_idx(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_neg3_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl3
+; CHECK-NEXT:    rev p0.s, p0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 -3)
+  ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_first_idx(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_first_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #4
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 1)
+  ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @splice_nxv4bf16_last_idx(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) vscale_range(16,16) #0 {
+; CHECK-LABEL: splice_nxv4bf16_last_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #252
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 63)
+  ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @splice_nxv8bf16_first_idx(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv8bf16_first_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #2
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 1)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @splice_nxv8bf16_last_idx(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) vscale_range(16,16) #0 {
+; CHECK-LABEL: splice_nxv8bf16_last_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #254
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 127)
+  ret <vscale x 8 x bfloat> %res
+}
+
 ; Ensure predicate based splice is promoted to use ZPRs.
 define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i1:
@@ -834,12 +932,14 @@ declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale
 declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
 declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
 declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+
 declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
 declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
 declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
 declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
 declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
 declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+
 declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
 declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
 declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
@@ -848,4 +948,8 @@ declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <
 declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
 declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
 
+declare <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i32)
+declare <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i32)
+declare <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+
 attributes #0 = { nounwind "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/neon-perm.ll b/llvm/test/CodeGen/AArch64/neon-perm.ll
index 26ffa2727a1c..15763543113e 100644
--- a/llvm/test/CodeGen/AArch64/neon-perm.ll
+++ b/llvm/test/CodeGen/AArch64/neon-perm.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 %struct.int8x8x2_t = type { [2 x <8 x i8>] }
 %struct.int16x4x2_t = type { [2 x <4 x i16>] }
@@ -1731,10 +1732,23 @@ entry:
 }
 
 define <4 x i8> @test_vzip1_v4i8(<8 x i8> %p) {
-; CHECK-LABEL: test_vzip1_v4i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_vzip1_v4i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_vzip1_v4i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v0.b[3], v3.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
  %lo = shufflevector <8 x i8> %p, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i8> %lo
 }
@@ -2201,10 +2215,15 @@ entry:
 }
 
 define <16 x i8> @test_undef_vuzp1q_s8(<16 x i8> %a) {
-; CHECK-LABEL: test_undef_vuzp1q_s8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    xtn v0.8b, v0.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vuzp1q_s8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vuzp1q_s8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uzp1 v0.16b, v0.16b, v0.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   ret <16 x i8> %shuffle.i
@@ -2221,20 +2240,30 @@ entry:
 }
 
 define <8 x i16> @test_undef_vuzp1q_s16(<8 x i16> %a) {
-; CHECK-LABEL: test_undef_vuzp1q_s16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vuzp1q_s16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vuzp1q_s16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   ret <8 x i16> %shuffle.i
 }
 
 define <4 x i32> @test_undef_vuzp1q_s32(<4 x i32> %a) {
-; CHECK-LABEL: test_undef_vuzp1q_s32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vuzp1q_s32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vuzp1q_s32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   ret <4 x i32> %shuffle.i
@@ -2251,10 +2280,15 @@ entry:
 }
 
 define <16 x i8> @test_undef_vuzp1q_u8(<16 x i8> %a) {
-; CHECK-LABEL: test_undef_vuzp1q_u8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    xtn v0.8b, v0.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vuzp1q_u8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vuzp1q_u8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uzp1 v0.16b, v0.16b, v0.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   ret <16 x i8> %shuffle.i
@@ -2271,20 +2305,30 @@ entry:
 }
 
 define <8 x i16> @test_undef_vuzp1q_u16(<8 x i16> %a) {
-; CHECK-LABEL: test_undef_vuzp1q_u16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vuzp1q_u16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vuzp1q_u16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   ret <8 x i16> %shuffle.i
 }
 
 define <4 x i32> @test_undef_vuzp1q_u32(<4 x i32> %a) {
-; CHECK-LABEL: test_undef_vuzp1q_u32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vuzp1q_u32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vuzp1q_u32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   ret <4 x i32> %shuffle.i
@@ -2311,10 +2355,15 @@ entry:
 }
 
 define <16 x i8> @test_undef_vuzp1q_p8(<16 x i8> %a) {
-; CHECK-LABEL: test_undef_vuzp1q_p8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    xtn v0.8b, v0.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vuzp1q_p8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vuzp1q_p8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uzp1 v0.16b, v0.16b, v0.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   ret <16 x i8> %shuffle.i
@@ -2331,10 +2380,15 @@ entry:
 }
 
 define <8 x i16> @test_undef_vuzp1q_p16(<8 x i16> %a) {
-; CHECK-LABEL: test_undef_vuzp1q_p16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vuzp1q_p16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vuzp1q_p16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   ret <8 x i16> %shuffle.i
@@ -2791,285 +2845,435 @@ entry:
 }
 
 define <8 x i8> @test_undef_vtrn1_s8(<8 x i8> %a) {
-; CHECK-LABEL: test_undef_vtrn1_s8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1_s8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1_s8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.8b, v0.8b, v0.8b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   ret <8 x i8> %shuffle.i
 }
 
 define <16 x i8> @test_undef_vtrn1q_s8(<16 x i8> %a) {
-; CHECK-LABEL: test_undef_vtrn1q_s8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1q_s8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1q_s8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.16b, v0.16b, v0.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
   ret <16 x i8> %shuffle.i
 }
 
 define <4 x i16> @test_undef_vtrn1_s16(<4 x i16> %a) {
-; CHECK-LABEL: test_undef_vtrn1_s16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1_s16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1_s16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   ret <4 x i16> %shuffle.i
 }
 
 define <8 x i16> @test_undef_vtrn1q_s16(<8 x i16> %a) {
-; CHECK-LABEL: test_undef_vtrn1q_s16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1q_s16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1q_s16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.8h, v0.8h, v0.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   ret <8 x i16> %shuffle.i
 }
 
 define <4 x i32> @test_undef_vtrn1q_s32(<4 x i32> %a) {
-; CHECK-LABEL: test_undef_vtrn1q_s32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1q_s32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1q_s32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   ret <4 x i32> %shuffle.i
 }
 
 define <8 x i8> @test_undef_vtrn1_u8(<8 x i8> %a) {
-; CHECK-LABEL: test_undef_vtrn1_u8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1_u8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1_u8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.8b, v0.8b, v0.8b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   ret <8 x i8> %shuffle.i
 }
 
 define <16 x i8> @test_undef_vtrn1q_u8(<16 x i8> %a) {
-; CHECK-LABEL: test_undef_vtrn1q_u8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1q_u8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1q_u8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.16b, v0.16b, v0.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
   ret <16 x i8> %shuffle.i
 }
 
 define <4 x i16> @test_undef_vtrn1_u16(<4 x i16> %a) {
-; CHECK-LABEL: test_undef_vtrn1_u16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1_u16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1_u16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   ret <4 x i16> %shuffle.i
 }
 
 define <8 x i16> @test_undef_vtrn1q_u16(<8 x i16> %a) {
-; CHECK-LABEL: test_undef_vtrn1q_u16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1q_u16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1q_u16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.8h, v0.8h, v0.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   ret <8 x i16> %shuffle.i
 }
 
 define <4 x i32> @test_undef_vtrn1q_u32(<4 x i32> %a) {
-; CHECK-LABEL: test_undef_vtrn1q_u32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1q_u32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1q_u32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   ret <4 x i32> %shuffle.i
 }
 
 define <4 x float> @test_undef_vtrn1q_f32(<4 x float> %a) {
-; CHECK-LABEL: test_undef_vtrn1q_f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1q_f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1q_f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   ret <4 x float> %shuffle.i
 }
 
 define <8 x i8> @test_undef_vtrn1_p8(<8 x i8> %a) {
-; CHECK-LABEL: test_undef_vtrn1_p8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1_p8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1_p8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.8b, v0.8b, v0.8b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   ret <8 x i8> %shuffle.i
 }
 
 define <16 x i8> @test_undef_vtrn1q_p8(<16 x i8> %a) {
-; CHECK-LABEL: test_undef_vtrn1q_p8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1q_p8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1q_p8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.16b, v0.16b, v0.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
   ret <16 x i8> %shuffle.i
 }
 
 define <4 x i16> @test_undef_vtrn1_p16(<4 x i16> %a) {
-; CHECK-LABEL: test_undef_vtrn1_p16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1_p16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1_p16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   ret <4 x i16> %shuffle.i
 }
 
 define <8 x i16> @test_undef_vtrn1q_p16(<8 x i16> %a) {
-; CHECK-LABEL: test_undef_vtrn1q_p16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn1q_p16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn1q_p16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn1 v0.8h, v0.8h, v0.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   ret <8 x i16> %shuffle.i
 }
 
 define <8 x i8> @test_undef_vtrn2_s8(<8 x i8> %a) {
-; CHECK-LABEL: test_undef_vtrn2_s8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev16 v0.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2_s8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2_s8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.8b, v0.8b, v0.8b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   ret <8 x i8> %shuffle.i
 }
 
 define <16 x i8> @test_undef_vtrn2q_s8(<16 x i8> %a) {
-; CHECK-LABEL: test_undef_vtrn2q_s8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev16 v0.16b, v0.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2q_s8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2q_s8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.16b, v0.16b, v0.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
   ret <16 x i8> %shuffle.i
 }
 
 define <4 x i16> @test_undef_vtrn2_s16(<4 x i16> %a) {
-; CHECK-LABEL: test_undef_vtrn2_s16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev32 v0.4h, v0.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2_s16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2_s16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   ret <4 x i16> %shuffle.i
 }
 
 define <8 x i16> @test_undef_vtrn2q_s16(<8 x i16> %a) {
-; CHECK-LABEL: test_undef_vtrn2q_s16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev32 v0.8h, v0.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2q_s16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev32 v0.8h, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2q_s16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.8h, v0.8h, v0.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   ret <8 x i16> %shuffle.i
 }
 
 define <4 x i32> @test_undef_vtrn2q_s32(<4 x i32> %a) {
-; CHECK-LABEL: test_undef_vtrn2q_s32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev64 v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2q_s32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2q_s32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   ret <4 x i32> %shuffle.i
 }
 
 define <8 x i8> @test_undef_vtrn2_u8(<8 x i8> %a) {
-; CHECK-LABEL: test_undef_vtrn2_u8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev16 v0.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2_u8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2_u8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.8b, v0.8b, v0.8b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   ret <8 x i8> %shuffle.i
 }
 
 define <16 x i8> @test_undef_vtrn2q_u8(<16 x i8> %a) {
-; CHECK-LABEL: test_undef_vtrn2q_u8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev16 v0.16b, v0.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2q_u8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2q_u8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.16b, v0.16b, v0.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
   ret <16 x i8> %shuffle.i
 }
 
 define <4 x i16> @test_undef_vtrn2_u16(<4 x i16> %a) {
-; CHECK-LABEL: test_undef_vtrn2_u16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev32 v0.4h, v0.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2_u16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2_u16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   ret <4 x i16> %shuffle.i
 }
 
 define <8 x i16> @test_undef_vtrn2q_u16(<8 x i16> %a) {
-; CHECK-LABEL: test_undef_vtrn2q_u16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev32 v0.8h, v0.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2q_u16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev32 v0.8h, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2q_u16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.8h, v0.8h, v0.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   ret <8 x i16> %shuffle.i
 }
 
 define <4 x i32> @test_undef_vtrn2q_u32(<4 x i32> %a) {
-; CHECK-LABEL: test_undef_vtrn2q_u32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev64 v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2q_u32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2q_u32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   ret <4 x i32> %shuffle.i
 }
 
 define <4 x float> @test_undef_vtrn2q_f32(<4 x float> %a) {
-; CHECK-LABEL: test_undef_vtrn2q_f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev64 v0.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2q_f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2q_f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   ret <4 x float> %shuffle.i
 }
 
 define <8 x i8> @test_undef_vtrn2_p8(<8 x i8> %a) {
-; CHECK-LABEL: test_undef_vtrn2_p8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev16 v0.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2_p8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2_p8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.8b, v0.8b, v0.8b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   ret <8 x i8> %shuffle.i
 }
 
 define <16 x i8> @test_undef_vtrn2q_p8(<16 x i8> %a) {
-; CHECK-LABEL: test_undef_vtrn2q_p8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev16 v0.16b, v0.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2q_p8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2q_p8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.16b, v0.16b, v0.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
   ret <16 x i8> %shuffle.i
 }
 
 define <4 x i16> @test_undef_vtrn2_p16(<4 x i16> %a) {
-; CHECK-LABEL: test_undef_vtrn2_p16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev32 v0.4h, v0.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2_p16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2_p16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   ret <4 x i16> %shuffle.i
 }
 
 define <8 x i16> @test_undef_vtrn2q_p16(<8 x i16> %a) {
-; CHECK-LABEL: test_undef_vtrn2q_p16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev32 v0.8h, v0.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_undef_vtrn2q_p16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev32 v0.8h, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_undef_vtrn2q_p16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    trn2 v0.8h, v0.8h, v0.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   ret <8 x i16> %shuffle.i
@@ -3886,13 +4090,22 @@ entry:
 }
 
 define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) {
-; CHECK-LABEL: test_uzp:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    uzp1 v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    uzp2 v1.8b, v0.8b, v1.8b
-; CHECK-NEXT:    fmov d0, d2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_uzp:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    xtn v2.8b, v0.8h
+; CHECK-SD-NEXT:    uzp2 v1.16b, v0.16b, v0.16b
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    fmov d0, d2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_uzp:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uzp1 v2.16b, v0.16b, v0.16b
+; CHECK-GI-NEXT:    uzp2 v1.16b, v0.16b, v0.16b
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-GI-NEXT:    fmov d0, d2
+; CHECK-GI-NEXT:    ret
+
 
   %vuzp.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   %vuzp1.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
diff --git a/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll b/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll
new file mode 100644
index 000000000000..45f1429a810a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s
+
+;; This test exercises the default lowering of the histogram to scalarized code.
+
+define void @histogram_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) {
+; CHECK-LABEL: histogram_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    tbnz w8, #0, .LBB0_3
+; CHECK-NEXT:  // %bb.1: // %else
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    tbnz w8, #0, .LBB0_4
+; CHECK-NEXT:  .LBB0_2: // %else2
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_3: // %cond.histogram.update
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr x9, [x8]
+; CHECK-NEXT:    add x9, x9, x0
+; CHECK-NEXT:    str x9, [x8]
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    tbz w8, #0, .LBB0_2
+; CHECK-NEXT:  .LBB0_4: // %cond.histogram.update1
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    ldr x9, [x8]
+; CHECK-NEXT:    add x9, x9, x0
+; CHECK-NEXT:    str x9, [x8]
+; CHECK-NEXT:    ret
+  call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask)
+  ret void
+}
+
+define void @histogram_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) {
+; CHECK-LABEL: histogram_i32_literal:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v2.2d, x0
+; CHECK-NEXT:    sshll v3.2d, v0.2s, #2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    umov w8, v1.h[0]
+; CHECK-NEXT:    add v3.2d, v2.2d, v3.2d
+; CHECK-NEXT:    tbz w8, #0, .LBB1_2
+; CHECK-NEXT:  // %bb.1: // %cond.histogram.update
+; CHECK-NEXT:    fmov x8, d3
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    add w9, w9, #1
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:  .LBB1_2: // %else
+; CHECK-NEXT:    umov w8, v1.h[1]
+; CHECK-NEXT:    sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT:    tbz w8, #0, .LBB1_4
+; CHECK-NEXT:  // %bb.3: // %cond.histogram.update1
+; CHECK-NEXT:    mov x8, v3.d[1]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    add w9, w9, #1
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:  .LBB1_4: // %else2
+; CHECK-NEXT:    umov w8, v1.h[2]
+; CHECK-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-NEXT:    tbnz w8, #0, .LBB1_7
+; CHECK-NEXT:  // %bb.5: // %else4
+; CHECK-NEXT:    umov w8, v1.h[3]
+; CHECK-NEXT:    tbnz w8, #0, .LBB1_8
+; CHECK-NEXT:  .LBB1_6: // %else6
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB1_7: // %cond.histogram.update3
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    add w9, w9, #1
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    umov w8, v1.h[3]
+; CHECK-NEXT:    tbz w8, #0, .LBB1_6
+; CHECK-NEXT:  .LBB1_8: // %cond.histogram.update5
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    add w9, w9, #1
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    ret
+
+  %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask)
+  ret void
+}
+
+define void @histogram_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) {
+; CHECK-LABEL: histogram_i32_literal_alltruemask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v1.2d, x0
+; CHECK-NEXT:    sshll v2.2d, v0.2s, #2
+; CHECK-NEXT:    sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT:    add v2.2d, v1.2d, v2.2d
+; CHECK-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    mov x9, v2.d[1]
+; CHECK-NEXT:    ldr w10, [x8]
+; CHECK-NEXT:    add w10, w10, #1
+; CHECK-NEXT:    str w10, [x8]
+; CHECK-NEXT:    ldr w8, [x9]
+; CHECK-NEXT:    add w8, w8, #1
+; CHECK-NEXT:    str w8, [x9]
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    ldr w10, [x8]
+; CHECK-NEXT:    add w10, w10, #1
+; CHECK-NEXT:    str w10, [x8]
+; CHECK-NEXT:    ldr w8, [x9]
+; CHECK-NEXT:    add w8, w8, #1
+; CHECK-NEXT:    str w8, [x9]
+; CHECK-NEXT:    ret
+
+  %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/pr90936.ll b/llvm/test/CodeGen/AArch64/pr90936.ll
new file mode 100644
index 000000000000..3ed8468b37f4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pr90936.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+define void @f(i16 %arg, ptr %arg1) {
+; CHECK-LABEL: f:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx w8, w0, #8, #6
+; CHECK-NEXT:    strb w0, [x1]
+; CHECK-NEXT:    strb w8, [x1, #1]
+; CHECK-NEXT:    ret
+bb:
+  %i = trunc i16 %arg to i8
+  %i2 = trunc i16 %arg to i14
+  %i3 = lshr i14 %i2, 8
+  store i8 %i, ptr %arg1, align 1
+  %i4 = getelementptr i8, ptr %arg1, i64 1
+  %i5 = trunc i14 %i3 to i8
+  store i8 %i5, ptr %i4, align 1
+  ret void
+}
+
+define void @g(i32 %arg, ptr %arg1) {
+; CHECK-LABEL: g:
+; CHECK:       // %bb.0: // %bb
+; CHECK-NEXT:    lsr w8, w0, #8
+; CHECK-NEXT:    lsr w9, w0, #16
+; CHECK-NEXT:    strb w0, [x1]
+; CHECK-NEXT:    strb wzr, [x1, #3]
+; CHECK-NEXT:    strb w8, [x1, #1]
+; CHECK-NEXT:    strb w9, [x1, #2]
+; CHECK-NEXT:    ret
+bb:
+  %i = trunc i32 %arg to i8
+  store i8 %i, ptr %arg1, align 1
+  %i2 = lshr i32 %arg, 8
+  %i3 = trunc i32 %i2 to i8
+  %i4 = getelementptr i8, ptr %arg1, i64 1
+  store i8 %i3, ptr %i4, align 1
+  %i5 = lshr i32 %arg, 16
+  %i6 = trunc i32 %i5 to i8
+  %i7 = getelementptr i8, ptr %arg1, i64 2
+  store i8 %i6, ptr %i7, align 1
+  %i8 = zext i8 %i to i32
+  %i9 = lshr i32 %i8, 24
+  %i10 = getelementptr i8, ptr %arg1, i64 3
+  %i11 = trunc i32 %i9 to i8
+  store i8 %i11, ptr %i10, align 1
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll
new file mode 100644
index 000000000000..3e807b7e6338
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll
@@ -0,0 +1,462 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "// kill:.*$" --version 4
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define void @test_fmla_f16_vg2_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: test_fmla_f16_vg2_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    fmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+  ret void
+}
+
+define void @test_fmla_f16_vg4_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+; CHECK-LABEL: test_fmla_f16_vg4_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    fmla za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    ret
+                                      <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b) #0 {
+  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                        <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                        <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+  ret void
+}
+
+define void @test_fmls_f16_vg2_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: test_fmls_f16_vg2_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    fmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+  ret void
+}
+
+define void @test_fmls_f16_vg4_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+; CHECK-LABEL: test_fmls_f16_vg4_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    fmls za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    ret
+                                      <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b) #0 {
+  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                        <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                        <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+  ret void
+}
+
+define void @test_fmla_f16_vg2_multi(i32 %slice,
+; CHECK-LABEL: test_fmla_f16_vg2_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    fmla za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %b0, <vscale x 8 x half> %b1) #0 {
+  call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 %slice,
+                                                <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 %slice.7,
+                                                 <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                 <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+  ret void
+}
+
+define void @test_fmla_f16_vg4_multi(i32 %slice,
+; CHECK-LABEL: test_fmla_f16_vg4_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    fmla za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                     <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+                                     <vscale x 8 x half> %b2, <vscale x 8 x half> %b3) #0 {
+  call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 %slice,
+                                                 <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                 <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                 <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+                                                 <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 %slice.7,
+                                                 <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                 <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                 <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+                                                 <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+  ret void
+}
+
+define void @test_fmls_f16_vg2_multi(i32 %slice,
+; CHECK-LABEL: test_fmls_f16_vg2_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    fmls za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %b0, <vscale x 8 x half> %b1) #0 {
+  call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 %slice,
+                                                <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 %slice.7,
+                                                 <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                 <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+  ret void
+}
+
+define void @test_fmls_f16_vg4_multi(i32 %slice,
+; CHECK-LABEL: test_fmls_f16_vg4_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    fmls za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                     <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+                                     <vscale x 8 x half> %b2, <vscale x 8 x half> %b3) #0 {
+  call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 %slice,
+                                                 <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                 <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                 <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+                                                 <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 %slice.7,
+                                                 <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                 <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                 <vscale x 8 x half> %b0, <vscale x 8 x half> %b1,
+                                                 <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+  ret void
+}
+
+define void @test_fmla_f16_vg2_index(i32 %slice,
+; CHECK-LABEL: test_fmla_f16_vg2_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    fmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %b) #0 {
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 %slice,
+                                                      <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                      <vscale x 8 x half> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 %slice.7,
+                                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                     <vscale x 8 x half> %b, i32 7);
+  ret void
+}
+
+define void @test_fmla_f16_vg4_index(i32 %slice,
+; CHECK-LABEL: test_fmla_f16_vg4_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                     <vscale x 8 x half> %b) #0 {
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 %slice,
+                                                      <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                      <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                      <vscale x 8 x half> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 %slice,
+                                                      <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                      <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                      <vscale x 8 x half> %b, i32 7);
+  ret void
+}
+
+define void @test_fmls_f16_vg2_index(i32 %slice,
+; CHECK-LABEL: test_fmls_f16_vg2_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    fmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %b) #0 {
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 %slice,
+                                                      <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                      <vscale x 8 x half> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 %slice.7,
+                                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                     <vscale x 8 x half> %b, i32 7);
+  ret void
+}
+
+define void @test_fmls_f16_vg4_index(i32 %slice,
+; CHECK-LABEL: test_fmls_f16_vg4_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    ret
+                                     <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                     <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                     <vscale x 8 x half> %b) #0 {
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 %slice,
+                                                      <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                      <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                      <vscale x 8 x half> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 %slice,
+                                                      <vscale x 8 x half> %a0, <vscale x 8 x half> %a1,
+                                                      <vscale x 8 x half> %a2, <vscale x 8 x half> %a3,
+                                                      <vscale x 8 x half> %b, i32 7);
+  ret void
+}
+
+define void @test_fmla_bf16_vg2_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fmla_bf16_vg2_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    bfmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+  ret void
+}
+
+define void @test_fmla_bf16_vg4_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+; CHECK-LABEL: test_fmla_bf16_vg4_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    bfmla za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    ret
+                                       <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b) #0 {
+  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                         <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                         <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+  ret void
+}
+
+define void @test_fmls_bf16_vg2_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fmls_bf16_vg2_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    bfmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+  ret void
+}
+
+define void @test_fmls_bf16_vg4_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+; CHECK-LABEL: test_fmls_bf16_vg4_single:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    bfmls za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    ret
+                                       <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b) #0 {
+  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                         <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                         <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+  ret void
+}
+
+define void @test_fmla_bf16_vg2_multi(i32 %slice,
+; CHECK-LABEL: test_fmla_bf16_vg2_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    bfmla za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    ret
+                                      <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                      <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1) #0 {
+  call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 %slice,
+                                                 <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                 <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 %slice.7,
+                                                  <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                  <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+  ret void
+}
+
+define void @test_fmla_bf16_vg4_multi(i32 %slice,
+; CHECK-LABEL: test_fmla_bf16_vg4_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    bfmla za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    ret
+                                      <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                      <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                      <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+                                      <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3) #0 {
+  call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 %slice,
+                                                  <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                  <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                  <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+                                                  <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 %slice.7,
+                                                  <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                  <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                  <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+                                                  <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+  ret void
+}
+
+define void @test_fmls_bf16_vg2_multi(i32 %slice,
+; CHECK-LABEL: test_fmls_bf16_vg2_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    bfmls za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    ret
+                                     <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                     <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1) #0 {
+  call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 %slice,
+                                                <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 %slice.7,
+                                                 <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                 <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+  ret void
+}
+
+define void @test_fmls_bf16_vg4_multi(i32 %slice,
+; CHECK-LABEL: test_fmls_bf16_vg4_multi:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    bfmls za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    ret
+                                     <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                     <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                     <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+                                     <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3) #0 {
+  call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 %slice,
+                                                 <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                 <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                 <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+                                                 <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 %slice.7,
+                                                 <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                 <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                 <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1,
+                                                 <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+  ret void
+}
+
+define void @test_fmla_bf16_vg2_index(i32 %slice,
+; CHECK-LABEL: test_fmla_bf16_vg2_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    bfmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    ret
+                                      <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                      <vscale x 8 x bfloat> %b) #0 {
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 %slice,
+                                                       <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                       <vscale x 8 x bfloat> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 %slice.7,
+                                                       <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                       <vscale x 8 x bfloat> %b, i32 7);
+  ret void
+}
+
+define void @test_fmla_bf16_vg4_index(i32 %slice,
+; CHECK-LABEL: test_fmla_bf16_vg4_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    ret
+                                      <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                      <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                      <vscale x 8 x bfloat> %b) #0 {
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 %slice,
+                                                       <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                       <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                       <vscale x 8 x bfloat> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 %slice,
+                                                       <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                       <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                       <vscale x 8 x bfloat> %b, i32 7);
+  ret void
+}
+
+define void @test_fmls_bf16_vg2_index(i32 %slice,
+; CHECK-LABEL: test_fmls_bf16_vg2_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    bfmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    ret
+                                      <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                      <vscale x 8 x bfloat> %b) #0 {
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 %slice,
+                                                       <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                       <vscale x 8 x bfloat> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 %slice.7,
+                                                      <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                      <vscale x 8 x bfloat> %b, i32 7);
+  ret void
+}
+
+define void @test_fmls_bf16_vg4_index(i32 %slice,
+; CHECK-LABEL: test_fmls_bf16_vg4_index:
+; CHECK:  // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    ret
+                                      <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                      <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                      <vscale x 8 x bfloat> %b) #0 {
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 %slice,
+                                                       <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                       <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                       <vscale x 8 x bfloat> %b, i32 7);
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 %slice,
+                                                       <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1,
+                                                       <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3,
+                                                       <vscale x 8 x bfloat> %b, i32 7);
+  ret void
+}
+
+attributes #0 = { nounwind "target-features"="+sme2p1,+sme-f16f16,+b16b16" }
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mopa.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mopa.ll
new file mode 100644
index 000000000000..fa0fd4360702
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mopa.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define void @mopa_bf16(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mopa_bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmopa za0.h, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.mopa.nxv8bf16(i32 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+  ret void
+}
+
+define void @mopa_f16(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mopa_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmopa za1.h, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.mopa.nxv8f16(i32 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+  ret void
+}
+
+define void @mops_bf16(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mops_bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmops za0.h, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.mops.nxv8bf16(i32 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+  ret void
+}
+
+define void @mops_f16(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mops_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmops za1.h, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.mops.nxv8f16(i32 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+  ret void
+}
+
+attributes #0 = {nounwind "target-features" = "+sme,+sme2p1,+bf16,+sme-f16f16,+b16b16" }
diff --git a/llvm/test/CodeGen/AArch64/stackmap-liveness.ll b/llvm/test/CodeGen/AArch64/stackmap-liveness.ll
index e1f9ffe42a77..c19c2623e322 100644
--- a/llvm/test/CodeGen/AArch64/stackmap-liveness.ll
+++ b/llvm/test/CodeGen/AArch64/stackmap-liveness.ll
@@ -27,16 +27,88 @@ define i64 @stackmap_liveness(i1 %c) {
 ; Padding
 ; CHECK-NEXT:   .p2align  3
 ; CHECK-NEXT:   .short  0
-; Num LiveOut Entries: 1
-; CHECK-NEXT:   .short  2
-; LiveOut Entry 0: X0
+; Num LiveOut Entries: 20
+; CHECK-NEXT:   .short  20
+; LiveOut Entry 1: X0
 ; CHECK-NEXT:   .short 0
 ; CHECK-NEXT:   .byte 0
 ; CHECK-NEXT:   .byte 8
-; LiveOut Entry 1: SP
+; LiveOut Entry 2:
+; CHECK-NEXT:   .short 19
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 3:
+; CHECK-NEXT:   .short 20
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 4:
+; CHECK-NEXT:   .short 21
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 5:
+; CHECK-NEXT:   .short 22
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 6:
+; CHECK-NEXT:   .short 23
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 7:
+; CHECK-NEXT:   .short 24
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 8:
+; CHECK-NEXT:   .short 25
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 9:
+; CHECK-NEXT:   .short 26
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 10:
+; CHECK-NEXT:   .short 27
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 11:
+; CHECK-NEXT:   .short 28
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 12: SP
 ; CHECK-NEXT:   .short 31
 ; CHECK-NEXT:   .byte 0
 ; CHECK-NEXT:   .byte 8
+; LiveOut Entry 13:
+; CHECK-NEXT:   .short 72
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 14:
+; CHECK-NEXT:   .short 73
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 15:
+; CHECK-NEXT:   .short 74
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 16:
+; CHECK-NEXT:   .short 75
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 17:
+; CHECK-NEXT:   .short 76
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 18:
+; CHECK-NEXT:   .short 77
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 19:
+; CHECK-NEXT:   .short 78
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
+; LiveOut Entry 20:
+; CHECK-NEXT:   .short 79
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .byte 8
 ; Align
 ; CHECK-NEXT:   .p2align  3
   %1 = select i1 %c, i64 1, i64 2
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
index b9c531fe3352..e91aac430110 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
@@ -307,11 +307,85 @@ define <4 x i64> @extract_v4i64_nxv8i64_0(<vscale x 8 x i64> %arg) {
   ret <4 x i64> %ext
 }
 
+define <4 x half> @extract_v4f16_nxv2f16_0(<vscale x 2 x half> %arg) {
+; CHECK-LABEL: extract_v4f16_nxv2f16_0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    addpl x9, sp, #6
+; CHECK-NEXT:    subs x8, x8, #4
+; CHECK-NEXT:    csel x8, xzr, x8, lo
+; CHECK-NEXT:    st1h { z0.d }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    csel x8, x8, xzr, lo
+; CHECK-NEXT:    lsl x8, x8, #1
+; CHECK-NEXT:    ldr d0, [x9, x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %ext = call <4 x half> @llvm.vector.extract.v4f16.nxv2f16(<vscale x 2 x half> %arg, i64 0)
+  ret <4 x half> %ext
+}
+
+define <4 x half> @extract_v4f16_nxv2f16_4(<vscale x 2 x half> %arg) {
+; CHECK-LABEL: extract_v4f16_nxv2f16_4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    mov w9, #4 // =0x4
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    subs x8, x8, #4
+; CHECK-NEXT:    csel x8, xzr, x8, lo
+; CHECK-NEXT:    st1h { z0.d }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    cmp x8, #4
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    addpl x9, sp, #6
+; CHECK-NEXT:    lsl x8, x8, #1
+; CHECK-NEXT:    ldr d0, [x9, x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %ext = call <4 x half> @llvm.vector.extract.v4f16.nxv2f16(<vscale x 2 x half> %arg, i64 4)
+  ret <4 x half> %ext
+}
+
+define <2 x half> @extract_v2f16_nxv4f16_2(<vscale x 4 x half> %arg) {
+; CHECK-LABEL: extract_v2f16_nxv4f16_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.s, z0.s[3]
+; CHECK-NEXT:    mov z0.s, z0.s[2]
+; CHECK-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %ext = call <2 x half> @llvm.vector.extract.v2f16.nxv4f16(<vscale x 4 x half> %arg, i64 2)
+  ret <2 x half> %ext
+}
+
+define <2 x half> @extract_v2f16_nxv4f16_6(<vscale x 4 x half> %arg) {
+; CHECK-LABEL: extract_v2f16_nxv4f16_6:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.s, z0.s[7]
+; CHECK-NEXT:    mov z0.s, z0.s[6]
+; CHECK-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %ext = call <2 x half> @llvm.vector.extract.v2f16.nxv4f16(<vscale x 4 x half> %arg, i64 6)
+  ret <2 x half> %ext
+}
 
-declare <2 x i64> @llvm.vector.extract.v2i64.nxv8i64(<vscale x 8 x i64>, i64)
-declare <4 x i64> @llvm.vector.extract.v4i64.nxv8i64(<vscale x 8 x i64>, i64)
 declare <4 x float> @llvm.vector.extract.v4f32.nxv16f32(<vscale x 16 x float>, i64)
 declare <2 x float> @llvm.vector.extract.v2f32.nxv16f32(<vscale x 16 x float>, i64)
+declare <4 x half> @llvm.vector.extract.v4f16.nxv2f16(<vscale x 2 x half>, i64);
+declare <2 x half> @llvm.vector.extract.v2f16.nxv4f16(<vscale x 4 x half>, i64);
+declare <2 x i64> @llvm.vector.extract.v2i64.nxv8i64(<vscale x 8 x i64>, i64)
+declare <4 x i64> @llvm.vector.extract.v4i64.nxv8i64(<vscale x 8 x i64>, i64)
 declare <4 x i32> @llvm.vector.extract.v4i32.nxv16i32(<vscale x 16 x i32>, i64)
 declare <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32>, i64)
 declare <8 x i16> @llvm.vector.extract.v8i16.nxv32i16(<vscale x 32 x i16>, i64)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll
new file mode 100644
index 000000000000..c77861509e4a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll
@@ -0,0 +1,863 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=256 | FileCheck %s
+
+define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) {
+; CHECK-LABEL: llrint_v1i64_v1f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx h0, h0
+; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+  %a = call <1 x i64> @llvm.llrint.v1i64.v1f16(<1 x half> %x)
+  ret <1 x i64> %a
+}
+declare <1 x i64> @llvm.llrint.v1i64.v1f16(<1 x half>)
+
+define <2 x i64> @llrint_v1i64_v2f16(<2 x half> %x) {
+; CHECK-LABEL: llrint_v1i64_v2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov h1, v0.h[1]
+; CHECK-NEXT:    frintx h0, h0
+; CHECK-NEXT:    frintx h1, h1
+; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    fcvtzs x9, h1
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    mov v0.d[1], x9
+; CHECK-NEXT:    ret
+  %a = call <2 x i64> @llvm.llrint.v2i64.v2f16(<2 x half> %x)
+  ret <2 x i64> %a
+}
+declare <2 x i64> @llvm.llrint.v2i64.v2f16(<2 x half>)
+
+define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) {
+; CHECK-LABEL: llrint_v4i64_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx v0.4h, v0.4h
+; CHECK-NEXT:    mov h1, v0.h[2]
+; CHECK-NEXT:    mov h2, v0.h[3]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fcvtzs x9, h0
+; CHECK-NEXT:    fcvtzs x8, h1
+; CHECK-NEXT:    fcvtzs x10, h2
+; CHECK-NEXT:    fcvtzs x11, h3
+; CHECK-NEXT:    fmov d0, x9
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    mov v0.d[1], x11
+; CHECK-NEXT:    mov v1.d[1], x10
+; CHECK-NEXT:    ret
+  %a = call <4 x i64> @llvm.llrint.v4i64.v4f16(<4 x half> %x)
+  ret <4 x i64> %a
+}
+declare <4 x i64> @llvm.llrint.v4i64.v4f16(<4 x half>)
+
+define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) {
+; CHECK-LABEL: llrint_v8i64_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    frintx v0.4h, v0.4h
+; CHECK-NEXT:    frintx v1.4h, v1.4h
+; CHECK-NEXT:    mov h4, v0.h[2]
+; CHECK-NEXT:    mov h2, v0.h[1]
+; CHECK-NEXT:    mov h7, v0.h[3]
+; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    mov h3, v1.h[2]
+; CHECK-NEXT:    mov h5, v1.h[3]
+; CHECK-NEXT:    mov h6, v1.h[1]
+; CHECK-NEXT:    fcvtzs x11, h1
+; CHECK-NEXT:    fcvtzs x12, h4
+; CHECK-NEXT:    fcvtzs x9, h2
+; CHECK-NEXT:    fcvtzs x15, h7
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fcvtzs x10, h3
+; CHECK-NEXT:    fcvtzs x13, h5
+; CHECK-NEXT:    fcvtzs x14, h6
+; CHECK-NEXT:    fmov d1, x12
+; CHECK-NEXT:    fmov d2, x11
+; CHECK-NEXT:    mov v0.d[1], x9
+; CHECK-NEXT:    fmov d3, x10
+; CHECK-NEXT:    mov v1.d[1], x15
+; CHECK-NEXT:    mov v2.d[1], x14
+; CHECK-NEXT:    mov v3.d[1], x13
+; CHECK-NEXT:    ret
+  %a = call <8 x i64> @llvm.llrint.v8i64.v8f16(<8 x half> %x)
+  ret <8 x i64> %a
+}
+declare <8 x i64> @llvm.llrint.v8i64.v8f16(<8 x half>)
+
+define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) {
+; CHECK-LABEL: llrint_v16i64_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    frintx v1.4h, v1.4h
+; CHECK-NEXT:    frintx v3.4h, v0.4h
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    frintx v2.4h, v2.4h
+; CHECK-NEXT:    mov h4, v1.h[2]
+; CHECK-NEXT:    mov h5, v3.h[2]
+; CHECK-NEXT:    frintx v0.4h, v0.4h
+; CHECK-NEXT:    mov h6, v3.h[1]
+; CHECK-NEXT:    fcvtzs x9, h3
+; CHECK-NEXT:    mov h16, v1.h[1]
+; CHECK-NEXT:    fcvtzs x12, h1
+; CHECK-NEXT:    mov h3, v3.h[3]
+; CHECK-NEXT:    mov h17, v1.h[3]
+; CHECK-NEXT:    mov h7, v2.h[3]
+; CHECK-NEXT:    fcvtzs x8, h4
+; CHECK-NEXT:    fcvtzs x10, h5
+; CHECK-NEXT:    mov h4, v2.h[2]
+; CHECK-NEXT:    mov h5, v0.h[2]
+; CHECK-NEXT:    fcvtzs x11, h6
+; CHECK-NEXT:    mov h6, v0.h[3]
+; CHECK-NEXT:    fcvtzs x15, h2
+; CHECK-NEXT:    mov h2, v2.h[1]
+; CHECK-NEXT:    fcvtzs x14, h0
+; CHECK-NEXT:    fcvtzs x17, h3
+; CHECK-NEXT:    fcvtzs x0, h17
+; CHECK-NEXT:    fcvtzs x13, h7
+; CHECK-NEXT:    mov h7, v0.h[1]
+; CHECK-NEXT:    fmov d0, x9
+; CHECK-NEXT:    fcvtzs x16, h4
+; CHECK-NEXT:    fcvtzs x9, h5
+; CHECK-NEXT:    fmov d4, x12
+; CHECK-NEXT:    fcvtzs x12, h16
+; CHECK-NEXT:    fmov d1, x10
+; CHECK-NEXT:    fcvtzs x10, h6
+; CHECK-NEXT:    fmov d5, x8
+; CHECK-NEXT:    fcvtzs x8, h2
+; CHECK-NEXT:    fmov d2, x14
+; CHECK-NEXT:    fcvtzs x18, h7
+; CHECK-NEXT:    fmov d6, x15
+; CHECK-NEXT:    mov v0.d[1], x11
+; CHECK-NEXT:    fmov d3, x9
+; CHECK-NEXT:    fmov d7, x16
+; CHECK-NEXT:    mov v1.d[1], x17
+; CHECK-NEXT:    mov v4.d[1], x12
+; CHECK-NEXT:    mov v5.d[1], x0
+; CHECK-NEXT:    mov v6.d[1], x8
+; CHECK-NEXT:    mov v2.d[1], x18
+; CHECK-NEXT:    mov v3.d[1], x10
+; CHECK-NEXT:    mov v7.d[1], x13
+; CHECK-NEXT:    ret
+  %a = call <16 x i64> @llvm.llrint.v16i64.v16f16(<16 x half> %x)
+  ret <16 x i64> %a
+}
+declare <16 x i64> @llvm.llrint.v16i64.v16f16(<16 x half>)
+
+define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) {
+; CHECK-LABEL: llrint_v32i64_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    sub x9, sp, #272
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    frintx v5.4h, v0.4h
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    ext v17.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    frintx v1.4h, v1.4h
+; CHECK-NEXT:    frintx v2.4h, v2.4h
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    mov h6, v5.h[3]
+; CHECK-NEXT:    frintx v0.4h, v0.4h
+; CHECK-NEXT:    mov h7, v5.h[2]
+; CHECK-NEXT:    mov h16, v5.h[1]
+; CHECK-NEXT:    frintx v4.4h, v4.4h
+; CHECK-NEXT:    fcvtzs x12, h5
+; CHECK-NEXT:    ext v5.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT:    frintx v17.4h, v17.4h
+; CHECK-NEXT:    frintx v3.4h, v3.4h
+; CHECK-NEXT:    fcvtzs x9, h6
+; CHECK-NEXT:    mov h6, v0.h[3]
+; CHECK-NEXT:    fcvtzs x10, h7
+; CHECK-NEXT:    mov h7, v0.h[2]
+; CHECK-NEXT:    fcvtzs x11, h16
+; CHECK-NEXT:    mov h16, v0.h[1]
+; CHECK-NEXT:    fcvtzs x13, h6
+; CHECK-NEXT:    mov h6, v4.h[3]
+; CHECK-NEXT:    stp x10, x9, [sp, #48]
+; CHECK-NEXT:    fcvtzs x9, h7
+; CHECK-NEXT:    mov h7, v4.h[2]
+; CHECK-NEXT:    fcvtzs x10, h16
+; CHECK-NEXT:    mov h16, v4.h[1]
+; CHECK-NEXT:    stp x12, x11, [sp, #32]
+; CHECK-NEXT:    fcvtzs x11, h0
+; CHECK-NEXT:    frintx v0.4h, v5.4h
+; CHECK-NEXT:    mov h5, v17.h[3]
+; CHECK-NEXT:    fcvtzs x12, h6
+; CHECK-NEXT:    mov h6, v17.h[2]
+; CHECK-NEXT:    stp x9, x13, [sp, #16]
+; CHECK-NEXT:    fcvtzs x13, h7
+; CHECK-NEXT:    mov h7, v17.h[1]
+; CHECK-NEXT:    fcvtzs x9, h16
+; CHECK-NEXT:    stp x11, x10, [sp]
+; CHECK-NEXT:    fcvtzs x10, h4
+; CHECK-NEXT:    fcvtzs x11, h5
+; CHECK-NEXT:    mov h4, v0.h[3]
+; CHECK-NEXT:    mov h5, v0.h[2]
+; CHECK-NEXT:    stp x13, x12, [sp, #80]
+; CHECK-NEXT:    fcvtzs x12, h6
+; CHECK-NEXT:    fcvtzs x13, h7
+; CHECK-NEXT:    mov h6, v0.h[1]
+; CHECK-NEXT:    stp x10, x9, [sp, #64]
+; CHECK-NEXT:    fcvtzs x9, h17
+; CHECK-NEXT:    mov h7, v1.h[3]
+; CHECK-NEXT:    fcvtzs x10, h4
+; CHECK-NEXT:    mov h4, v1.h[2]
+; CHECK-NEXT:    stp x12, x11, [sp, #144]
+; CHECK-NEXT:    fcvtzs x11, h5
+; CHECK-NEXT:    mov h5, v1.h[1]
+; CHECK-NEXT:    fcvtzs x12, h6
+; CHECK-NEXT:    stp x9, x13, [sp, #128]
+; CHECK-NEXT:    fcvtzs x9, h0
+; CHECK-NEXT:    fcvtzs x13, h7
+; CHECK-NEXT:    mov h0, v2.h[3]
+; CHECK-NEXT:    stp x11, x10, [sp, #208]
+; CHECK-NEXT:    fcvtzs x10, h4
+; CHECK-NEXT:    mov h4, v2.h[2]
+; CHECK-NEXT:    fcvtzs x11, h5
+; CHECK-NEXT:    mov h5, v2.h[1]
+; CHECK-NEXT:    stp x9, x12, [sp, #192]
+; CHECK-NEXT:    fcvtzs x9, h1
+; CHECK-NEXT:    fcvtzs x12, h0
+; CHECK-NEXT:    mov h0, v3.h[3]
+; CHECK-NEXT:    mov h1, v3.h[2]
+; CHECK-NEXT:    stp x10, x13, [sp, #112]
+; CHECK-NEXT:    fcvtzs x10, h4
+; CHECK-NEXT:    mov h4, v3.h[1]
+; CHECK-NEXT:    fcvtzs x13, h5
+; CHECK-NEXT:    stp x9, x11, [sp, #96]
+; CHECK-NEXT:    fcvtzs x9, h2
+; CHECK-NEXT:    fcvtzs x11, h0
+; CHECK-NEXT:    stp x10, x12, [sp, #176]
+; CHECK-NEXT:    fcvtzs x10, h1
+; CHECK-NEXT:    fcvtzs x12, h4
+; CHECK-NEXT:    stp x9, x13, [sp, #160]
+; CHECK-NEXT:    fcvtzs x9, h3
+; CHECK-NEXT:    stp x10, x11, [sp, #240]
+; CHECK-NEXT:    add x10, sp, #64
+; CHECK-NEXT:    stp x9, x12, [sp, #224]
+; CHECK-NEXT:    add x9, sp, #32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x9]
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x10]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x9]
+; CHECK-NEXT:    add x9, sp, #224
+; CHECK-NEXT:    add x10, sp, #128
+; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x9]
+; CHECK-NEXT:    add x9, sp, #160
+; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x10]
+; CHECK-NEXT:    add x10, sp, #96
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x9]
+; CHECK-NEXT:    add x9, sp, #192
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x10]
+; CHECK-NEXT:    mov x10, #24 // =0x18
+; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x9]
+; CHECK-NEXT:    mov x9, #16 // =0x10
+; CHECK-NEXT:    st1d { z3.d }, p0, [x8, x10, lsl #3]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #8 // =0x8
+; CHECK-NEXT:    st1d { z6.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #28 // =0x1c
+; CHECK-NEXT:    st1d { z7.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #20 // =0x14
+; CHECK-NEXT:    st1d { z4.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #12 // =0xc
+; CHECK-NEXT:    st1d { z2.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #4 // =0x4
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <32 x i64> @llvm.llrint.v32i64.v32f16(<32 x half> %x)
+  ret <32 x i64> %a
+}
+declare <32 x i64> @llvm.llrint.v32i64.v32f16(<32 x half>)
+
+define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
+; CHECK-LABEL: llrint_v1i64_v1f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    frintx s0, s0
+; CHECK-NEXT:    fcvtzs x8, s0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+  %a = call <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float> %x)
+  ret <1 x i64> %a
+}
+declare <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float>)
+
+define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) {
+; CHECK-LABEL: llrint_v2i64_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx v0.2s, v0.2s
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %a = call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> %x)
+  ret <2 x i64> %a
+}
+declare <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float>)
+
+define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) {
+; CHECK-LABEL: llrint_v4i64_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx v0.4s, v0.4s
+; CHECK-NEXT:    mov s1, v0.s[2]
+; CHECK-NEXT:    mov s2, v0.s[3]
+; CHECK-NEXT:    mov s3, v0.s[1]
+; CHECK-NEXT:    fcvtzs x9, s0
+; CHECK-NEXT:    fcvtzs x8, s1
+; CHECK-NEXT:    fcvtzs x10, s2
+; CHECK-NEXT:    fcvtzs x11, s3
+; CHECK-NEXT:    fmov d0, x9
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    mov v0.d[1], x11
+; CHECK-NEXT:    mov v1.d[1], x10
+; CHECK-NEXT:    ret
+  %a = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> %x)
+  ret <4 x i64> %a
+}
+declare <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float>)
+
+define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) {
+; CHECK-LABEL: llrint_v8i64_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx v0.4s, v0.4s
+; CHECK-NEXT:    frintx v1.4s, v1.4s
+; CHECK-NEXT:    mov s3, v1.s[2]
+; CHECK-NEXT:    mov s4, v0.s[2]
+; CHECK-NEXT:    mov s2, v0.s[1]
+; CHECK-NEXT:    mov s5, v1.s[3]
+; CHECK-NEXT:    mov s6, v1.s[1]
+; CHECK-NEXT:    mov s7, v0.s[3]
+; CHECK-NEXT:    fcvtzs x8, s0
+; CHECK-NEXT:    fcvtzs x10, s1
+; CHECK-NEXT:    fcvtzs x11, s3
+; CHECK-NEXT:    fcvtzs x12, s4
+; CHECK-NEXT:    fcvtzs x9, s2
+; CHECK-NEXT:    fcvtzs x13, s5
+; CHECK-NEXT:    fcvtzs x14, s6
+; CHECK-NEXT:    fcvtzs x15, s7
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    fmov d1, x12
+; CHECK-NEXT:    fmov d3, x11
+; CHECK-NEXT:    mov v0.d[1], x9
+; CHECK-NEXT:    mov v2.d[1], x14
+; CHECK-NEXT:    mov v1.d[1], x15
+; CHECK-NEXT:    mov v3.d[1], x13
+; CHECK-NEXT:    ret
+  %a = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> %x)
+  ret <8 x i64> %a
+}
+declare <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float>)
+
+define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
+; CHECK-LABEL: llrint_v16i64_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx v3.4s, v3.4s
+; CHECK-NEXT:    frintx v2.4s, v2.4s
+; CHECK-NEXT:    frintx v1.4s, v1.4s
+; CHECK-NEXT:    frintx v0.4s, v0.4s
+; CHECK-NEXT:    mov s4, v3.s[2]
+; CHECK-NEXT:    mov s5, v2.s[2]
+; CHECK-NEXT:    mov s6, v1.s[2]
+; CHECK-NEXT:    mov s7, v0.s[2]
+; CHECK-NEXT:    fcvtzs x10, s1
+; CHECK-NEXT:    fcvtzs x11, s0
+; CHECK-NEXT:    mov s16, v0.s[1]
+; CHECK-NEXT:    mov s17, v1.s[1]
+; CHECK-NEXT:    mov s18, v3.s[1]
+; CHECK-NEXT:    fcvtzs x14, s3
+; CHECK-NEXT:    fcvtzs x16, s2
+; CHECK-NEXT:    fcvtzs x8, s4
+; CHECK-NEXT:    mov s4, v2.s[1]
+; CHECK-NEXT:    fcvtzs x9, s5
+; CHECK-NEXT:    mov s5, v1.s[3]
+; CHECK-NEXT:    fcvtzs x12, s6
+; CHECK-NEXT:    mov s6, v0.s[3]
+; CHECK-NEXT:    fcvtzs x13, s7
+; CHECK-NEXT:    mov s7, v3.s[3]
+; CHECK-NEXT:    fmov d0, x11
+; CHECK-NEXT:    fcvtzs x17, s16
+; CHECK-NEXT:    fcvtzs x18, s18
+; CHECK-NEXT:    fcvtzs x15, s4
+; CHECK-NEXT:    mov s4, v2.s[3]
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    fcvtzs x11, s5
+; CHECK-NEXT:    fcvtzs x10, s6
+; CHECK-NEXT:    fmov d3, x12
+; CHECK-NEXT:    fmov d1, x13
+; CHECK-NEXT:    fcvtzs x12, s17
+; CHECK-NEXT:    fcvtzs x13, s7
+; CHECK-NEXT:    fmov d5, x9
+; CHECK-NEXT:    fmov d6, x14
+; CHECK-NEXT:    fmov d7, x8
+; CHECK-NEXT:    fcvtzs x0, s4
+; CHECK-NEXT:    fmov d4, x16
+; CHECK-NEXT:    mov v0.d[1], x17
+; CHECK-NEXT:    mov v1.d[1], x10
+; CHECK-NEXT:    mov v3.d[1], x11
+; CHECK-NEXT:    mov v2.d[1], x12
+; CHECK-NEXT:    mov v6.d[1], x18
+; CHECK-NEXT:    mov v7.d[1], x13
+; CHECK-NEXT:    mov v4.d[1], x15
+; CHECK-NEXT:    mov v5.d[1], x0
+; CHECK-NEXT:    ret
+  %a = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> %x)
+  ret <16 x i64> %a
+}
+declare <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float>)
+
+define <32 x i64> @llrint_v32i64_v32f32(<32 x float> %x) {
+; CHECK-LABEL: llrint_v32i64_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    sub x9, sp, #272
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    frintx v0.4s, v0.4s
+; CHECK-NEXT:    frintx v1.4s, v1.4s
+; CHECK-NEXT:    frintx v2.4s, v2.4s
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    mov s16, v0.s[3]
+; CHECK-NEXT:    mov s17, v0.s[2]
+; CHECK-NEXT:    mov s18, v0.s[1]
+; CHECK-NEXT:    fcvtzs x12, s0
+; CHECK-NEXT:    frintx v0.4s, v3.4s
+; CHECK-NEXT:    mov s3, v2.s[3]
+; CHECK-NEXT:    fcvtzs x9, s16
+; CHECK-NEXT:    mov s16, v1.s[3]
+; CHECK-NEXT:    fcvtzs x10, s17
+; CHECK-NEXT:    mov s17, v1.s[2]
+; CHECK-NEXT:    fcvtzs x11, s18
+; CHECK-NEXT:    mov s18, v1.s[1]
+; CHECK-NEXT:    fcvtzs x13, s16
+; CHECK-NEXT:    stp x10, x9, [sp, #16]
+; CHECK-NEXT:    mov s16, v2.s[2]
+; CHECK-NEXT:    fcvtzs x9, s17
+; CHECK-NEXT:    fcvtzs x10, s18
+; CHECK-NEXT:    mov s17, v2.s[1]
+; CHECK-NEXT:    stp x12, x11, [sp]
+; CHECK-NEXT:    fcvtzs x11, s1
+; CHECK-NEXT:    frintx v1.4s, v4.4s
+; CHECK-NEXT:    fcvtzs x12, s3
+; CHECK-NEXT:    mov s3, v0.s[3]
+; CHECK-NEXT:    mov s4, v0.s[2]
+; CHECK-NEXT:    stp x9, x13, [sp, #48]
+; CHECK-NEXT:    fcvtzs x13, s16
+; CHECK-NEXT:    fcvtzs x9, s17
+; CHECK-NEXT:    mov s16, v0.s[1]
+; CHECK-NEXT:    stp x11, x10, [sp, #32]
+; CHECK-NEXT:    fcvtzs x10, s2
+; CHECK-NEXT:    frintx v2.4s, v5.4s
+; CHECK-NEXT:    fcvtzs x11, s3
+; CHECK-NEXT:    mov s3, v1.s[3]
+; CHECK-NEXT:    mov s5, v1.s[1]
+; CHECK-NEXT:    stp x13, x12, [sp, #80]
+; CHECK-NEXT:    fcvtzs x12, s4
+; CHECK-NEXT:    mov s4, v1.s[2]
+; CHECK-NEXT:    fcvtzs x13, s16
+; CHECK-NEXT:    stp x10, x9, [sp, #64]
+; CHECK-NEXT:    fcvtzs x9, s0
+; CHECK-NEXT:    mov s0, v2.s[3]
+; CHECK-NEXT:    fcvtzs x10, s3
+; CHECK-NEXT:    frintx v3.4s, v6.4s
+; CHECK-NEXT:    stp x12, x11, [sp, #112]
+; CHECK-NEXT:    fcvtzs x11, s4
+; CHECK-NEXT:    mov s4, v2.s[2]
+; CHECK-NEXT:    fcvtzs x12, s5
+; CHECK-NEXT:    mov s5, v2.s[1]
+; CHECK-NEXT:    stp x9, x13, [sp, #96]
+; CHECK-NEXT:    fcvtzs x9, s1
+; CHECK-NEXT:    fcvtzs x13, s0
+; CHECK-NEXT:    mov s0, v3.s[3]
+; CHECK-NEXT:    frintx v1.4s, v7.4s
+; CHECK-NEXT:    stp x11, x10, [sp, #144]
+; CHECK-NEXT:    fcvtzs x10, s4
+; CHECK-NEXT:    mov s4, v3.s[2]
+; CHECK-NEXT:    fcvtzs x11, s5
+; CHECK-NEXT:    mov s5, v3.s[1]
+; CHECK-NEXT:    stp x9, x12, [sp, #128]
+; CHECK-NEXT:    fcvtzs x9, s2
+; CHECK-NEXT:    fcvtzs x12, s0
+; CHECK-NEXT:    mov s0, v1.s[3]
+; CHECK-NEXT:    mov s2, v1.s[2]
+; CHECK-NEXT:    stp x10, x13, [sp, #176]
+; CHECK-NEXT:    fcvtzs x10, s4
+; CHECK-NEXT:    mov s4, v1.s[1]
+; CHECK-NEXT:    fcvtzs x13, s5
+; CHECK-NEXT:    stp x9, x11, [sp, #160]
+; CHECK-NEXT:    fcvtzs x9, s3
+; CHECK-NEXT:    fcvtzs x11, s0
+; CHECK-NEXT:    stp x10, x12, [sp, #208]
+; CHECK-NEXT:    fcvtzs x10, s2
+; CHECK-NEXT:    fcvtzs x12, s4
+; CHECK-NEXT:    stp x9, x13, [sp, #192]
+; CHECK-NEXT:    fcvtzs x9, s1
+; CHECK-NEXT:    stp x10, x11, [sp, #240]
+; CHECK-NEXT:    add x10, sp, #64
+; CHECK-NEXT:    stp x9, x12, [sp, #224]
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x9]
+; CHECK-NEXT:    add x9, sp, #32
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x10]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x9]
+; CHECK-NEXT:    add x9, sp, #224
+; CHECK-NEXT:    add x10, sp, #96
+; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x9]
+; CHECK-NEXT:    add x9, sp, #192
+; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x10]
+; CHECK-NEXT:    add x10, sp, #160
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x9]
+; CHECK-NEXT:    add x9, sp, #128
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x10]
+; CHECK-NEXT:    mov x10, #28 // =0x1c
+; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x9]
+; CHECK-NEXT:    mov x9, #24 // =0x18
+; CHECK-NEXT:    st1d { z3.d }, p0, [x8, x10, lsl #3]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #20 // =0x14
+; CHECK-NEXT:    st1d { z6.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #16 // =0x10
+; CHECK-NEXT:    st1d { z7.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #12 // =0xc
+; CHECK-NEXT:    st1d { z4.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #8 // =0x8
+; CHECK-NEXT:    st1d { z2.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #4 // =0x4
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <32 x i64> @llvm.llrint.v32i64.v32f32(<32 x float> %x)
+  ret <32 x i64> %a
+}
+declare <32 x i64> @llvm.llrint.v32i64.v32f32(<32 x float>)
+
+define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) {
+; CHECK-LABEL: llrint_v1i64_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx d0, d0
+; CHECK-NEXT:    fcvtzs x8, d0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+  %a = call <1 x i64> @llvm.llrint.v1i64.v1f64(<1 x double> %x)
+  ret <1 x i64> %a
+}
+declare <1 x i64> @llvm.llrint.v1i64.v1f64(<1 x double>)
+
+define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) {
+; CHECK-LABEL: llrint_v2i64_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %a = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> %x)
+  ret <2 x i64> %a
+}
+declare <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double>)
+
+define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
+; CHECK-LABEL: llrint_v4i64_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    mov z1.d, z0.d[2]
+; CHECK-NEXT:    mov z2.d, z0.d[3]
+; CHECK-NEXT:    mov z3.d, z0.d[1]
+; CHECK-NEXT:    fcvtzs x9, d0
+; CHECK-NEXT:    fcvtzs x8, d1
+; CHECK-NEXT:    fcvtzs x10, d2
+; CHECK-NEXT:    fcvtzs x11, d3
+; CHECK-NEXT:    fmov d0, x9
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    mov v0.d[1], x11
+; CHECK-NEXT:    mov v1.d[1], x10
+; CHECK-NEXT:    ret
+  %a = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> %x)
+  ret <4 x i64> %a
+}
+declare <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double>)
+
+define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
+; CHECK-LABEL: llrint_v8i64_v8f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q3 killed $q3 def $z3
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    splice z2.d, p0, z2.d, z3.d
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    movprfx z1, z2
+; CHECK-NEXT:    frintx z1.d, p0/m, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d[2]
+; CHECK-NEXT:    mov z5.d, z0.d[2]
+; CHECK-NEXT:    mov z2.d, z0.d[1]
+; CHECK-NEXT:    mov z3.d, z1.d[3]
+; CHECK-NEXT:    mov z6.d, z0.d[3]
+; CHECK-NEXT:    fcvtzs x8, d0
+; CHECK-NEXT:    mov z0.d, z1.d[1]
+; CHECK-NEXT:    fcvtzs x10, d1
+; CHECK-NEXT:    fcvtzs x11, d4
+; CHECK-NEXT:    fcvtzs x12, d5
+; CHECK-NEXT:    fcvtzs x9, d2
+; CHECK-NEXT:    fcvtzs x13, d3
+; CHECK-NEXT:    fcvtzs x14, d6
+; CHECK-NEXT:    fcvtzs x15, d0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    fmov d2, x10
+; CHECK-NEXT:    fmov d1, x12
+; CHECK-NEXT:    fmov d3, x11
+; CHECK-NEXT:    mov v0.d[1], x9
+; CHECK-NEXT:    mov v2.d[1], x15
+; CHECK-NEXT:    mov v1.d[1], x14
+; CHECK-NEXT:    mov v3.d[1], x13
+; CHECK-NEXT:    ret
+  %a = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> %x)
+  ret <8 x i64> %a
+}
+declare <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double>)
+
+define <16 x i64> @llrint_v16f64(<16 x double> %x) {
+; CHECK-LABEL: llrint_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d, vl2
+; CHECK-NEXT:    // kill: def $q6 killed $q6 def $z6
+; CHECK-NEXT:    // kill: def $q4 killed $q4 def $z4
+; CHECK-NEXT:    // kill: def $q7 killed $q7 def $z7
+; CHECK-NEXT:    // kill: def $q5 killed $q5 def $z5
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q3 killed $q3 def $z3
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    splice z6.d, p1, z6.d, z7.d
+; CHECK-NEXT:    splice z4.d, p1, z4.d, z5.d
+; CHECK-NEXT:    splice z2.d, p1, z2.d, z3.d
+; CHECK-NEXT:    splice z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    movprfx z3, z6
+; CHECK-NEXT:    frintx z3.d, p0/m, z6.d
+; CHECK-NEXT:    movprfx z1, z4
+; CHECK-NEXT:    frintx z1.d, p0/m, z4.d
+; CHECK-NEXT:    frintx z2.d, p0/m, z2.d
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    mov z4.d, z3.d[2]
+; CHECK-NEXT:    mov z5.d, z1.d[2]
+; CHECK-NEXT:    mov z6.d, z2.d[3]
+; CHECK-NEXT:    fcvtzs x11, d0
+; CHECK-NEXT:    fcvtzs x12, d1
+; CHECK-NEXT:    fcvtzs x13, d2
+; CHECK-NEXT:    fcvtzs x14, d3
+; CHECK-NEXT:    mov z7.d, z3.d[3]
+; CHECK-NEXT:    mov z16.d, z1.d[3]
+; CHECK-NEXT:    fcvtzs x9, d4
+; CHECK-NEXT:    fcvtzs x10, d5
+; CHECK-NEXT:    mov z4.d, z2.d[2]
+; CHECK-NEXT:    mov z5.d, z0.d[2]
+; CHECK-NEXT:    fcvtzs x8, d6
+; CHECK-NEXT:    mov z2.d, z2.d[1]
+; CHECK-NEXT:    mov z6.d, z0.d[3]
+; CHECK-NEXT:    mov z1.d, z1.d[1]
+; CHECK-NEXT:    mov z3.d, z3.d[1]
+; CHECK-NEXT:    fcvtzs x15, d4
+; CHECK-NEXT:    mov z4.d, z0.d[1]
+; CHECK-NEXT:    fmov d0, x11
+; CHECK-NEXT:    fcvtzs x16, d5
+; CHECK-NEXT:    fcvtzs x11, d2
+; CHECK-NEXT:    fmov d2, x13
+; CHECK-NEXT:    fcvtzs x17, d7
+; CHECK-NEXT:    fcvtzs x18, d16
+; CHECK-NEXT:    fcvtzs x0, d3
+; CHECK-NEXT:    fcvtzs x13, d4
+; CHECK-NEXT:    fmov d4, x12
+; CHECK-NEXT:    fcvtzs x12, d6
+; CHECK-NEXT:    fmov d6, x14
+; CHECK-NEXT:    fcvtzs x14, d1
+; CHECK-NEXT:    fmov d3, x15
+; CHECK-NEXT:    fmov d1, x16
+; CHECK-NEXT:    fmov d5, x10
+; CHECK-NEXT:    fmov d7, x9
+; CHECK-NEXT:    mov v2.d[1], x11
+; CHECK-NEXT:    mov v0.d[1], x13
+; CHECK-NEXT:    mov v3.d[1], x8
+; CHECK-NEXT:    mov v6.d[1], x0
+; CHECK-NEXT:    mov v4.d[1], x14
+; CHECK-NEXT:    mov v1.d[1], x12
+; CHECK-NEXT:    mov v5.d[1], x18
+; CHECK-NEXT:    mov v7.d[1], x17
+; CHECK-NEXT:    ret
+  %a = call <16 x i64> @llvm.llrint.v16i64.v16f64(<16 x double> %x)
+  ret <16 x i64> %a
+}
+declare <16 x i64> @llvm.llrint.v16i64.v16f64(<16 x double>)
+
+define <32 x i64> @llrint_v32f64(<32 x double> %x) {
+; CHECK-LABEL: llrint_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    sub x9, sp, #272
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p1.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    // kill: def $q3 killed $q3 def $z3
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q7 killed $q7 def $z7
+; CHECK-NEXT:    // kill: def $q6 killed $q6 def $z6
+; CHECK-NEXT:    // kill: def $q4 killed $q4 def $z4
+; CHECK-NEXT:    // kill: def $q5 killed $q5 def $z5
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    splice z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    splice z2.d, p1, z2.d, z3.d
+; CHECK-NEXT:    splice z4.d, p1, z4.d, z5.d
+; CHECK-NEXT:    splice z6.d, p1, z6.d, z7.d
+; CHECK-NEXT:    ldp q5, q19, [x29, #16]
+; CHECK-NEXT:    movprfx z3, z0
+; CHECK-NEXT:    frintx z3.d, p0/m, z0.d
+; CHECK-NEXT:    movprfx z16, z2
+; CHECK-NEXT:    frintx z16.d, p0/m, z2.d
+; CHECK-NEXT:    frintx z4.d, p0/m, z4.d
+; CHECK-NEXT:    splice z5.d, p1, z5.d, z19.d
+; CHECK-NEXT:    frintx z6.d, p0/m, z6.d
+; CHECK-NEXT:    ldp q2, q17, [x29, #48]
+; CHECK-NEXT:    ldp q0, q1, [x29, #112]
+; CHECK-NEXT:    mov z18.d, z3.d[3]
+; CHECK-NEXT:    mov z7.d, z3.d[2]
+; CHECK-NEXT:    fcvtzs x9, d3
+; CHECK-NEXT:    mov z3.d, z3.d[1]
+; CHECK-NEXT:    mov z20.d, z16.d[3]
+; CHECK-NEXT:    fcvtzs x12, d16
+; CHECK-NEXT:    splice z2.d, p1, z2.d, z17.d
+; CHECK-NEXT:    frintx z5.d, p0/m, z5.d
+; CHECK-NEXT:    splice z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    fcvtzs x10, d18
+; CHECK-NEXT:    fcvtzs x11, d7
+; CHECK-NEXT:    mov z18.d, z16.d[2]
+; CHECK-NEXT:    mov z7.d, z16.d[1]
+; CHECK-NEXT:    fcvtzs x13, d3
+; CHECK-NEXT:    fcvtzs x14, d20
+; CHECK-NEXT:    str x9, [sp, #128]
+; CHECK-NEXT:    mov z16.d, z4.d[3]
+; CHECK-NEXT:    fcvtzs x9, d18
+; CHECK-NEXT:    mov z18.d, z4.d[2]
+; CHECK-NEXT:    frintx z2.d, p0/m, z2.d
+; CHECK-NEXT:    stp x11, x10, [sp, #144]
+; CHECK-NEXT:    fcvtzs x10, d7
+; CHECK-NEXT:    mov z7.d, z4.d[1]
+; CHECK-NEXT:    str x13, [sp, #136]
+; CHECK-NEXT:    fcvtzs x11, d16
+; CHECK-NEXT:    mov z16.d, z6.d[3]
+; CHECK-NEXT:    fcvtzs x13, d18
+; CHECK-NEXT:    ldp q3, q19, [x29, #80]
+; CHECK-NEXT:    stp x9, x14, [sp, #176]
+; CHECK-NEXT:    fcvtzs x9, d4
+; CHECK-NEXT:    mov z4.d, z6.d[2]
+; CHECK-NEXT:    stp x12, x10, [sp, #160]
+; CHECK-NEXT:    fcvtzs x10, d7
+; CHECK-NEXT:    mov z7.d, z6.d[1]
+; CHECK-NEXT:    fcvtzs x12, d6
+; CHECK-NEXT:    splice z3.d, p1, z3.d, z19.d
+; CHECK-NEXT:    mov z6.d, z5.d[2]
+; CHECK-NEXT:    stp x13, x11, [sp, #208]
+; CHECK-NEXT:    fcvtzs x11, d16
+; CHECK-NEXT:    fcvtzs x13, d4
+; CHECK-NEXT:    mov z4.d, z5.d[3]
+; CHECK-NEXT:    mov z1.d, z5.d[1]
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    stp x9, x10, [sp, #192]
+; CHECK-NEXT:    fcvtzs x9, d7
+; CHECK-NEXT:    frintx z3.d, p0/m, z3.d
+; CHECK-NEXT:    fcvtzs x10, d4
+; CHECK-NEXT:    stp x13, x11, [sp, #240]
+; CHECK-NEXT:    fcvtzs x11, d6
+; CHECK-NEXT:    mov z4.d, z2.d[3]
+; CHECK-NEXT:    fcvtzs x13, d2
+; CHECK-NEXT:    stp x12, x9, [sp, #224]
+; CHECK-NEXT:    fcvtzs x9, d5
+; CHECK-NEXT:    fcvtzs x12, d1
+; CHECK-NEXT:    mov z5.d, z2.d[2]
+; CHECK-NEXT:    mov z1.d, z2.d[1]
+; CHECK-NEXT:    mov z2.d, z3.d[2]
+; CHECK-NEXT:    stp x11, x10, [sp, #16]
+; CHECK-NEXT:    fcvtzs x10, d4
+; CHECK-NEXT:    mov z4.d, z3.d[3]
+; CHECK-NEXT:    fcvtzs x11, d5
+; CHECK-NEXT:    stp x9, x12, [sp]
+; CHECK-NEXT:    fcvtzs x9, d1
+; CHECK-NEXT:    mov z1.d, z3.d[1]
+; CHECK-NEXT:    fcvtzs x12, d4
+; CHECK-NEXT:    stp x11, x10, [sp, #48]
+; CHECK-NEXT:    fcvtzs x10, d2
+; CHECK-NEXT:    fcvtzs x11, d3
+; CHECK-NEXT:    stp x13, x9, [sp, #32]
+; CHECK-NEXT:    fcvtzs x9, d1
+; CHECK-NEXT:    mov z2.d, z0.d[3]
+; CHECK-NEXT:    mov z3.d, z0.d[2]
+; CHECK-NEXT:    mov z1.d, z0.d[1]
+; CHECK-NEXT:    fcvtzs x13, d2
+; CHECK-NEXT:    stp x10, x12, [sp, #80]
+; CHECK-NEXT:    fcvtzs x12, d0
+; CHECK-NEXT:    fcvtzs x10, d3
+; CHECK-NEXT:    stp x11, x9, [sp, #64]
+; CHECK-NEXT:    fcvtzs x9, d1
+; CHECK-NEXT:    stp x10, x13, [sp, #112]
+; CHECK-NEXT:    add x10, sp, #192
+; CHECK-NEXT:    stp x12, x9, [sp, #96]
+; CHECK-NEXT:    add x9, sp, #128
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x9]
+; CHECK-NEXT:    add x9, sp, #160
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x10]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x9]
+; CHECK-NEXT:    add x9, sp, #96
+; CHECK-NEXT:    add x10, sp, #224
+; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x9]
+; CHECK-NEXT:    add x9, sp, #64
+; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x10]
+; CHECK-NEXT:    add x10, sp, #32
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x9]
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x10]
+; CHECK-NEXT:    mov x10, #28 // =0x1c
+; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x9]
+; CHECK-NEXT:    mov x9, #24 // =0x18
+; CHECK-NEXT:    st1d { z3.d }, p0, [x8, x10, lsl #3]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #20 // =0x14
+; CHECK-NEXT:    st1d { z6.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #16 // =0x10
+; CHECK-NEXT:    st1d { z7.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #12 // =0xc
+; CHECK-NEXT:    st1d { z4.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #8 // =0x8
+; CHECK-NEXT:    st1d { z2.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    mov x9, #4 // =0x4
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <32 x i64> @llvm.llrint.v32i64.v16f64(<32 x double> %x)
+  ret <32 x i64> %a
+}
+declare <32 x i64> @llvm.llrint.v32i64.v32f64(<32 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll
new file mode 100644
index 000000000000..6a97e7ad64bf
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll
@@ -0,0 +1,1613 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=aarch64 -mattr=+sve \
+; RUN:   -aarch64-sve-vector-bits-min=256 | FileCheck --check-prefixes=CHECK-i32 %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=aarch64 -mattr=+sve \
+; RUN:   -aarch64-sve-vector-bits-min=256 | FileCheck --check-prefixes=CHECK-i64 %s
+
+define <1 x iXLen> @lrint_v1f16(<1 x half> %x) {
+; CHECK-i32-LABEL: lrint_v1f16:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx h0, h0
+; CHECK-i32-NEXT:    fcvtzs w8, h0
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v1f16:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    frintx h0, h0
+; CHECK-i64-NEXT:    fcvtzs x8, h0
+; CHECK-i64-NEXT:    fmov d0, x8
+; CHECK-i64-NEXT:    ret
+  %a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f16(<1 x half> %x)
+  ret <1 x iXLen> %a
+}
+declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f16(<1 x half>)
+
+define <2 x iXLen> @lrint_v2f16(<2 x half> %x) {
+; CHECK-i32-LABEL: lrint_v2f16:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-i32-NEXT:    mov h1, v0.h[1]
+; CHECK-i32-NEXT:    frintx h0, h0
+; CHECK-i32-NEXT:    frintx h1, h1
+; CHECK-i32-NEXT:    fcvtzs w8, h0
+; CHECK-i32-NEXT:    fcvtzs w9, h1
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    mov v0.s[1], w9
+; CHECK-i32-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v2f16:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-i64-NEXT:    mov h1, v0.h[1]
+; CHECK-i64-NEXT:    frintx h0, h0
+; CHECK-i64-NEXT:    frintx h1, h1
+; CHECK-i64-NEXT:    fcvtzs x8, h0
+; CHECK-i64-NEXT:    fcvtzs x9, h1
+; CHECK-i64-NEXT:    fmov d0, x8
+; CHECK-i64-NEXT:    mov v0.d[1], x9
+; CHECK-i64-NEXT:    ret
+  %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f16(<2 x half> %x)
+  ret <2 x iXLen> %a
+}
+declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f16(<2 x half>)
+
+define <4 x iXLen> @lrint_v4f16(<4 x half> %x) {
+; CHECK-i32-LABEL: lrint_v4f16:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v0.4h, v0.4h
+; CHECK-i32-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-i32-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v4f16:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    frintx v0.4h, v0.4h
+; CHECK-i64-NEXT:    mov h1, v0.h[2]
+; CHECK-i64-NEXT:    mov h2, v0.h[3]
+; CHECK-i64-NEXT:    mov h3, v0.h[1]
+; CHECK-i64-NEXT:    fcvtzs x9, h0
+; CHECK-i64-NEXT:    fcvtzs x8, h1
+; CHECK-i64-NEXT:    fcvtzs x10, h2
+; CHECK-i64-NEXT:    fcvtzs x11, h3
+; CHECK-i64-NEXT:    fmov d0, x9
+; CHECK-i64-NEXT:    fmov d1, x8
+; CHECK-i64-NEXT:    mov v0.d[1], x11
+; CHECK-i64-NEXT:    mov v1.d[1], x10
+; CHECK-i64-NEXT:    ret
+  %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f16(<4 x half> %x)
+  ret <4 x iXLen> %a
+}
+declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f16(<4 x half>)
+
+define <8 x iXLen> @lrint_v8f16(<8 x half> %x) {
+; CHECK-i32-LABEL: lrint_v8f16:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v2.8h, v0.8h
+; CHECK-i32-NEXT:    mov h0, v2.h[4]
+; CHECK-i32-NEXT:    mov h1, v2.h[5]
+; CHECK-i32-NEXT:    mov h3, v2.h[1]
+; CHECK-i32-NEXT:    fcvtzs w9, h2
+; CHECK-i32-NEXT:    mov h4, v2.h[6]
+; CHECK-i32-NEXT:    fcvtzs w8, h0
+; CHECK-i32-NEXT:    mov h0, v2.h[2]
+; CHECK-i32-NEXT:    fcvtzs w10, h1
+; CHECK-i32-NEXT:    fcvtzs w11, h3
+; CHECK-i32-NEXT:    mov h3, v2.h[7]
+; CHECK-i32-NEXT:    fcvtzs w12, h4
+; CHECK-i32-NEXT:    mov h2, v2.h[3]
+; CHECK-i32-NEXT:    fcvtzs w13, h0
+; CHECK-i32-NEXT:    fmov s0, w9
+; CHECK-i32-NEXT:    fmov s1, w8
+; CHECK-i32-NEXT:    fcvtzs w8, h3
+; CHECK-i32-NEXT:    fcvtzs w9, h2
+; CHECK-i32-NEXT:    mov v0.s[1], w11
+; CHECK-i32-NEXT:    mov v1.s[1], w10
+; CHECK-i32-NEXT:    mov v0.s[2], w13
+; CHECK-i32-NEXT:    mov v1.s[2], w12
+; CHECK-i32-NEXT:    mov v0.s[3], w9
+; CHECK-i32-NEXT:    mov v1.s[3], w8
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v8f16:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-i64-NEXT:    frintx v0.4h, v0.4h
+; CHECK-i64-NEXT:    frintx v1.4h, v1.4h
+; CHECK-i64-NEXT:    mov h4, v0.h[2]
+; CHECK-i64-NEXT:    mov h2, v0.h[1]
+; CHECK-i64-NEXT:    mov h7, v0.h[3]
+; CHECK-i64-NEXT:    fcvtzs x8, h0
+; CHECK-i64-NEXT:    mov h3, v1.h[2]
+; CHECK-i64-NEXT:    mov h5, v1.h[3]
+; CHECK-i64-NEXT:    mov h6, v1.h[1]
+; CHECK-i64-NEXT:    fcvtzs x11, h1
+; CHECK-i64-NEXT:    fcvtzs x12, h4
+; CHECK-i64-NEXT:    fcvtzs x9, h2
+; CHECK-i64-NEXT:    fcvtzs x15, h7
+; CHECK-i64-NEXT:    fmov d0, x8
+; CHECK-i64-NEXT:    fcvtzs x10, h3
+; CHECK-i64-NEXT:    fcvtzs x13, h5
+; CHECK-i64-NEXT:    fcvtzs x14, h6
+; CHECK-i64-NEXT:    fmov d1, x12
+; CHECK-i64-NEXT:    fmov d2, x11
+; CHECK-i64-NEXT:    mov v0.d[1], x9
+; CHECK-i64-NEXT:    fmov d3, x10
+; CHECK-i64-NEXT:    mov v1.d[1], x15
+; CHECK-i64-NEXT:    mov v2.d[1], x14
+; CHECK-i64-NEXT:    mov v3.d[1], x13
+; CHECK-i64-NEXT:    ret
+  %a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f16(<8 x half> %x)
+  ret <8 x iXLen> %a
+}
+declare <8 x iXLen> @llvm.lrint.v8iXLen.v8f16(<8 x half>)
+
+define <16 x iXLen> @lrint_v16f16(<16 x half> %x) {
+; CHECK-i32-LABEL: lrint_v16f16:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v1.8h, v1.8h
+; CHECK-i32-NEXT:    frintx v0.8h, v0.8h
+; CHECK-i32-NEXT:    mov h3, v1.h[4]
+; CHECK-i32-NEXT:    mov h2, v1.h[5]
+; CHECK-i32-NEXT:    mov h5, v0.h[4]
+; CHECK-i32-NEXT:    mov h4, v1.h[1]
+; CHECK-i32-NEXT:    mov h6, v0.h[1]
+; CHECK-i32-NEXT:    fcvtzs w11, h0
+; CHECK-i32-NEXT:    fcvtzs w14, h1
+; CHECK-i32-NEXT:    mov h7, v1.h[6]
+; CHECK-i32-NEXT:    mov h16, v1.h[3]
+; CHECK-i32-NEXT:    mov h17, v0.h[7]
+; CHECK-i32-NEXT:    mov h18, v0.h[3]
+; CHECK-i32-NEXT:    fcvtzs w9, h3
+; CHECK-i32-NEXT:    mov h3, v0.h[5]
+; CHECK-i32-NEXT:    fcvtzs w8, h2
+; CHECK-i32-NEXT:    mov h2, v1.h[2]
+; CHECK-i32-NEXT:    fcvtzs w12, h5
+; CHECK-i32-NEXT:    fcvtzs w10, h4
+; CHECK-i32-NEXT:    mov h4, v0.h[6]
+; CHECK-i32-NEXT:    mov h5, v0.h[2]
+; CHECK-i32-NEXT:    fcvtzs w13, h6
+; CHECK-i32-NEXT:    mov h6, v1.h[7]
+; CHECK-i32-NEXT:    fmov s0, w11
+; CHECK-i32-NEXT:    fcvtzs w16, h7
+; CHECK-i32-NEXT:    fcvtzs w15, h3
+; CHECK-i32-NEXT:    fmov s3, w9
+; CHECK-i32-NEXT:    fcvtzs w9, h16
+; CHECK-i32-NEXT:    fcvtzs w17, h2
+; CHECK-i32-NEXT:    fmov s1, w12
+; CHECK-i32-NEXT:    fmov s2, w14
+; CHECK-i32-NEXT:    fcvtzs w11, h4
+; CHECK-i32-NEXT:    fcvtzs w18, h5
+; CHECK-i32-NEXT:    mov v0.s[1], w13
+; CHECK-i32-NEXT:    mov v3.s[1], w8
+; CHECK-i32-NEXT:    fcvtzs w8, h6
+; CHECK-i32-NEXT:    fcvtzs w12, h18
+; CHECK-i32-NEXT:    mov v1.s[1], w15
+; CHECK-i32-NEXT:    mov v2.s[1], w10
+; CHECK-i32-NEXT:    fcvtzs w10, h17
+; CHECK-i32-NEXT:    mov v0.s[2], w18
+; CHECK-i32-NEXT:    mov v3.s[2], w16
+; CHECK-i32-NEXT:    mov v1.s[2], w11
+; CHECK-i32-NEXT:    mov v2.s[2], w17
+; CHECK-i32-NEXT:    mov v0.s[3], w12
+; CHECK-i32-NEXT:    mov v3.s[3], w8
+; CHECK-i32-NEXT:    mov v1.s[3], w10
+; CHECK-i32-NEXT:    mov v2.s[3], w9
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v16f16:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-i64-NEXT:    frintx v1.4h, v1.4h
+; CHECK-i64-NEXT:    frintx v3.4h, v0.4h
+; CHECK-i64-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-i64-NEXT:    frintx v2.4h, v2.4h
+; CHECK-i64-NEXT:    mov h4, v1.h[2]
+; CHECK-i64-NEXT:    mov h5, v3.h[2]
+; CHECK-i64-NEXT:    frintx v0.4h, v0.4h
+; CHECK-i64-NEXT:    mov h6, v3.h[1]
+; CHECK-i64-NEXT:    fcvtzs x9, h3
+; CHECK-i64-NEXT:    mov h16, v1.h[1]
+; CHECK-i64-NEXT:    fcvtzs x12, h1
+; CHECK-i64-NEXT:    mov h3, v3.h[3]
+; CHECK-i64-NEXT:    mov h17, v1.h[3]
+; CHECK-i64-NEXT:    mov h7, v2.h[3]
+; CHECK-i64-NEXT:    fcvtzs x8, h4
+; CHECK-i64-NEXT:    fcvtzs x10, h5
+; CHECK-i64-NEXT:    mov h4, v2.h[2]
+; CHECK-i64-NEXT:    mov h5, v0.h[2]
+; CHECK-i64-NEXT:    fcvtzs x11, h6
+; CHECK-i64-NEXT:    mov h6, v0.h[3]
+; CHECK-i64-NEXT:    fcvtzs x15, h2
+; CHECK-i64-NEXT:    mov h2, v2.h[1]
+; CHECK-i64-NEXT:    fcvtzs x14, h0
+; CHECK-i64-NEXT:    fcvtzs x17, h3
+; CHECK-i64-NEXT:    fcvtzs x0, h17
+; CHECK-i64-NEXT:    fcvtzs x13, h7
+; CHECK-i64-NEXT:    mov h7, v0.h[1]
+; CHECK-i64-NEXT:    fmov d0, x9
+; CHECK-i64-NEXT:    fcvtzs x16, h4
+; CHECK-i64-NEXT:    fcvtzs x9, h5
+; CHECK-i64-NEXT:    fmov d4, x12
+; CHECK-i64-NEXT:    fcvtzs x12, h16
+; CHECK-i64-NEXT:    fmov d1, x10
+; CHECK-i64-NEXT:    fcvtzs x10, h6
+; CHECK-i64-NEXT:    fmov d5, x8
+; CHECK-i64-NEXT:    fcvtzs x8, h2
+; CHECK-i64-NEXT:    fmov d2, x14
+; CHECK-i64-NEXT:    fcvtzs x18, h7
+; CHECK-i64-NEXT:    fmov d6, x15
+; CHECK-i64-NEXT:    mov v0.d[1], x11
+; CHECK-i64-NEXT:    fmov d3, x9
+; CHECK-i64-NEXT:    fmov d7, x16
+; CHECK-i64-NEXT:    mov v1.d[1], x17
+; CHECK-i64-NEXT:    mov v4.d[1], x12
+; CHECK-i64-NEXT:    mov v5.d[1], x0
+; CHECK-i64-NEXT:    mov v6.d[1], x8
+; CHECK-i64-NEXT:    mov v2.d[1], x18
+; CHECK-i64-NEXT:    mov v3.d[1], x10
+; CHECK-i64-NEXT:    mov v7.d[1], x13
+; CHECK-i64-NEXT:    ret
+  %a = call <16 x iXLen> @llvm.lrint.v16iXLen.v16f16(<16 x half> %x)
+  ret <16 x iXLen> %a
+}
+declare <16 x iXLen> @llvm.lrint.v16iXLen.v16f16(<16 x half>)
+
+define <32 x iXLen> @lrint_v32f16(<32 x half> %x) {
+; CHECK-i32-LABEL: lrint_v32f16:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    stp x26, x25, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-i32-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-i32-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-i32-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-i32-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-i32-NEXT:    .cfi_offset w19, -8
+; CHECK-i32-NEXT:    .cfi_offset w20, -16
+; CHECK-i32-NEXT:    .cfi_offset w21, -24
+; CHECK-i32-NEXT:    .cfi_offset w22, -32
+; CHECK-i32-NEXT:    .cfi_offset w23, -40
+; CHECK-i32-NEXT:    .cfi_offset w24, -48
+; CHECK-i32-NEXT:    .cfi_offset w25, -56
+; CHECK-i32-NEXT:    .cfi_offset w26, -64
+; CHECK-i32-NEXT:    frintx v3.8h, v3.8h
+; CHECK-i32-NEXT:    frintx v2.8h, v2.8h
+; CHECK-i32-NEXT:    frintx v1.8h, v1.8h
+; CHECK-i32-NEXT:    frintx v0.8h, v0.8h
+; CHECK-i32-NEXT:    mov h4, v3.h[7]
+; CHECK-i32-NEXT:    mov h5, v3.h[6]
+; CHECK-i32-NEXT:    mov h6, v3.h[5]
+; CHECK-i32-NEXT:    mov h7, v3.h[4]
+; CHECK-i32-NEXT:    mov h16, v3.h[3]
+; CHECK-i32-NEXT:    mov h17, v3.h[2]
+; CHECK-i32-NEXT:    mov h18, v3.h[1]
+; CHECK-i32-NEXT:    mov h19, v2.h[7]
+; CHECK-i32-NEXT:    fcvtzs w1, h3
+; CHECK-i32-NEXT:    mov h3, v1.h[6]
+; CHECK-i32-NEXT:    fcvtzs w7, h2
+; CHECK-i32-NEXT:    fcvtzs w22, h0
+; CHECK-i32-NEXT:    fcvtzs w8, h4
+; CHECK-i32-NEXT:    mov h4, v2.h[6]
+; CHECK-i32-NEXT:    fcvtzs w10, h5
+; CHECK-i32-NEXT:    mov h5, v2.h[5]
+; CHECK-i32-NEXT:    fcvtzs w12, h6
+; CHECK-i32-NEXT:    mov h6, v2.h[4]
+; CHECK-i32-NEXT:    fcvtzs w13, h7
+; CHECK-i32-NEXT:    mov h7, v2.h[3]
+; CHECK-i32-NEXT:    fcvtzs w9, h16
+; CHECK-i32-NEXT:    fcvtzs w11, h17
+; CHECK-i32-NEXT:    mov h16, v2.h[2]
+; CHECK-i32-NEXT:    mov h17, v2.h[1]
+; CHECK-i32-NEXT:    fcvtzs w17, h4
+; CHECK-i32-NEXT:    mov h4, v1.h[5]
+; CHECK-i32-NEXT:    mov h2, v0.h[5]
+; CHECK-i32-NEXT:    fcvtzs w0, h5
+; CHECK-i32-NEXT:    fcvtzs w3, h6
+; CHECK-i32-NEXT:    mov h5, v1.h[4]
+; CHECK-i32-NEXT:    mov h6, v0.h[4]
+; CHECK-i32-NEXT:    fcvtzs w16, h7
+; CHECK-i32-NEXT:    mov h7, v0.h[1]
+; CHECK-i32-NEXT:    fcvtzs w15, h18
+; CHECK-i32-NEXT:    fcvtzs w2, h3
+; CHECK-i32-NEXT:    mov h3, v1.h[2]
+; CHECK-i32-NEXT:    fcvtzs w19, h4
+; CHECK-i32-NEXT:    mov h4, v1.h[1]
+; CHECK-i32-NEXT:    mov h18, v0.h[6]
+; CHECK-i32-NEXT:    fcvtzs w20, h5
+; CHECK-i32-NEXT:    fcvtzs w23, h2
+; CHECK-i32-NEXT:    mov h2, v0.h[2]
+; CHECK-i32-NEXT:    fcvtzs w21, h6
+; CHECK-i32-NEXT:    fcvtzs w25, h1
+; CHECK-i32-NEXT:    fcvtzs w4, h17
+; CHECK-i32-NEXT:    fcvtzs w24, h7
+; CHECK-i32-NEXT:    fcvtzs w14, h19
+; CHECK-i32-NEXT:    fcvtzs w18, h16
+; CHECK-i32-NEXT:    fcvtzs w26, h4
+; CHECK-i32-NEXT:    mov h16, v1.h[7]
+; CHECK-i32-NEXT:    mov h17, v1.h[3]
+; CHECK-i32-NEXT:    fcvtzs w5, h3
+; CHECK-i32-NEXT:    mov h19, v0.h[7]
+; CHECK-i32-NEXT:    fcvtzs w6, h18
+; CHECK-i32-NEXT:    mov h18, v0.h[3]
+; CHECK-i32-NEXT:    fmov s0, w22
+; CHECK-i32-NEXT:    fmov s1, w21
+; CHECK-i32-NEXT:    fcvtzs w21, h2
+; CHECK-i32-NEXT:    fmov s2, w25
+; CHECK-i32-NEXT:    fmov s3, w20
+; CHECK-i32-NEXT:    fmov s4, w7
+; CHECK-i32-NEXT:    fmov s5, w3
+; CHECK-i32-NEXT:    fmov s6, w1
+; CHECK-i32-NEXT:    fmov s7, w13
+; CHECK-i32-NEXT:    mov v0.s[1], w24
+; CHECK-i32-NEXT:    mov v1.s[1], w23
+; CHECK-i32-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-i32-NEXT:    mov v2.s[1], w26
+; CHECK-i32-NEXT:    mov v3.s[1], w19
+; CHECK-i32-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-i32-NEXT:    mov v4.s[1], w4
+; CHECK-i32-NEXT:    mov v5.s[1], w0
+; CHECK-i32-NEXT:    mov v6.s[1], w15
+; CHECK-i32-NEXT:    mov v7.s[1], w12
+; CHECK-i32-NEXT:    fcvtzs w12, h16
+; CHECK-i32-NEXT:    fcvtzs w13, h17
+; CHECK-i32-NEXT:    fcvtzs w15, h19
+; CHECK-i32-NEXT:    fcvtzs w0, h18
+; CHECK-i32-NEXT:    mov v0.s[2], w21
+; CHECK-i32-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-i32-NEXT:    mov v1.s[2], w6
+; CHECK-i32-NEXT:    mov v2.s[2], w5
+; CHECK-i32-NEXT:    mov v3.s[2], w2
+; CHECK-i32-NEXT:    mov v4.s[2], w18
+; CHECK-i32-NEXT:    mov v5.s[2], w17
+; CHECK-i32-NEXT:    mov v6.s[2], w11
+; CHECK-i32-NEXT:    mov v7.s[2], w10
+; CHECK-i32-NEXT:    mov v0.s[3], w0
+; CHECK-i32-NEXT:    mov v1.s[3], w15
+; CHECK-i32-NEXT:    mov v2.s[3], w13
+; CHECK-i32-NEXT:    mov v3.s[3], w12
+; CHECK-i32-NEXT:    mov v4.s[3], w16
+; CHECK-i32-NEXT:    mov v5.s[3], w14
+; CHECK-i32-NEXT:    mov v6.s[3], w9
+; CHECK-i32-NEXT:    mov v7.s[3], w8
+; CHECK-i32-NEXT:    ldp x26, x25, [sp], #64 // 16-byte Folded Reload
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v32f16:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-i64-NEXT:    sub x9, sp, #272
+; CHECK-i64-NEXT:    mov x29, sp
+; CHECK-i64-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK-i64-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-i64-NEXT:    .cfi_offset w30, -8
+; CHECK-i64-NEXT:    .cfi_offset w29, -16
+; CHECK-i64-NEXT:    frintx v5.4h, v0.4h
+; CHECK-i64-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-i64-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-i64-NEXT:    ext v17.16b, v2.16b, v2.16b, #8
+; CHECK-i64-NEXT:    frintx v1.4h, v1.4h
+; CHECK-i64-NEXT:    frintx v2.4h, v2.4h
+; CHECK-i64-NEXT:    ptrue p0.d, vl4
+; CHECK-i64-NEXT:    mov h6, v5.h[3]
+; CHECK-i64-NEXT:    frintx v0.4h, v0.4h
+; CHECK-i64-NEXT:    mov h7, v5.h[2]
+; CHECK-i64-NEXT:    mov h16, v5.h[1]
+; CHECK-i64-NEXT:    frintx v4.4h, v4.4h
+; CHECK-i64-NEXT:    fcvtzs x12, h5
+; CHECK-i64-NEXT:    ext v5.16b, v3.16b, v3.16b, #8
+; CHECK-i64-NEXT:    frintx v17.4h, v17.4h
+; CHECK-i64-NEXT:    frintx v3.4h, v3.4h
+; CHECK-i64-NEXT:    fcvtzs x9, h6
+; CHECK-i64-NEXT:    mov h6, v0.h[3]
+; CHECK-i64-NEXT:    fcvtzs x10, h7
+; CHECK-i64-NEXT:    mov h7, v0.h[2]
+; CHECK-i64-NEXT:    fcvtzs x11, h16
+; CHECK-i64-NEXT:    mov h16, v0.h[1]
+; CHECK-i64-NEXT:    fcvtzs x13, h6
+; CHECK-i64-NEXT:    mov h6, v4.h[3]
+; CHECK-i64-NEXT:    stp x10, x9, [sp, #48]
+; CHECK-i64-NEXT:    fcvtzs x9, h7
+; CHECK-i64-NEXT:    mov h7, v4.h[2]
+; CHECK-i64-NEXT:    fcvtzs x10, h16
+; CHECK-i64-NEXT:    mov h16, v4.h[1]
+; CHECK-i64-NEXT:    stp x12, x11, [sp, #32]
+; CHECK-i64-NEXT:    fcvtzs x11, h0
+; CHECK-i64-NEXT:    frintx v0.4h, v5.4h
+; CHECK-i64-NEXT:    mov h5, v17.h[3]
+; CHECK-i64-NEXT:    fcvtzs x12, h6
+; CHECK-i64-NEXT:    mov h6, v17.h[2]
+; CHECK-i64-NEXT:    stp x9, x13, [sp, #16]
+; CHECK-i64-NEXT:    fcvtzs x13, h7
+; CHECK-i64-NEXT:    mov h7, v17.h[1]
+; CHECK-i64-NEXT:    fcvtzs x9, h16
+; CHECK-i64-NEXT:    stp x11, x10, [sp]
+; CHECK-i64-NEXT:    fcvtzs x10, h4
+; CHECK-i64-NEXT:    fcvtzs x11, h5
+; CHECK-i64-NEXT:    mov h4, v0.h[3]
+; CHECK-i64-NEXT:    mov h5, v0.h[2]
+; CHECK-i64-NEXT:    stp x13, x12, [sp, #80]
+; CHECK-i64-NEXT:    fcvtzs x12, h6
+; CHECK-i64-NEXT:    fcvtzs x13, h7
+; CHECK-i64-NEXT:    mov h6, v0.h[1]
+; CHECK-i64-NEXT:    stp x10, x9, [sp, #64]
+; CHECK-i64-NEXT:    fcvtzs x9, h17
+; CHECK-i64-NEXT:    mov h7, v1.h[3]
+; CHECK-i64-NEXT:    fcvtzs x10, h4
+; CHECK-i64-NEXT:    mov h4, v1.h[2]
+; CHECK-i64-NEXT:    stp x12, x11, [sp, #144]
+; CHECK-i64-NEXT:    fcvtzs x11, h5
+; CHECK-i64-NEXT:    mov h5, v1.h[1]
+; CHECK-i64-NEXT:    fcvtzs x12, h6
+; CHECK-i64-NEXT:    stp x9, x13, [sp, #128]
+; CHECK-i64-NEXT:    fcvtzs x9, h0
+; CHECK-i64-NEXT:    fcvtzs x13, h7
+; CHECK-i64-NEXT:    mov h0, v2.h[3]
+; CHECK-i64-NEXT:    stp x11, x10, [sp, #208]
+; CHECK-i64-NEXT:    fcvtzs x10, h4
+; CHECK-i64-NEXT:    mov h4, v2.h[2]
+; CHECK-i64-NEXT:    fcvtzs x11, h5
+; CHECK-i64-NEXT:    mov h5, v2.h[1]
+; CHECK-i64-NEXT:    stp x9, x12, [sp, #192]
+; CHECK-i64-NEXT:    fcvtzs x9, h1
+; CHECK-i64-NEXT:    fcvtzs x12, h0
+; CHECK-i64-NEXT:    mov h0, v3.h[3]
+; CHECK-i64-NEXT:    mov h1, v3.h[2]
+; CHECK-i64-NEXT:    stp x10, x13, [sp, #112]
+; CHECK-i64-NEXT:    fcvtzs x10, h4
+; CHECK-i64-NEXT:    mov h4, v3.h[1]
+; CHECK-i64-NEXT:    fcvtzs x13, h5
+; CHECK-i64-NEXT:    stp x9, x11, [sp, #96]
+; CHECK-i64-NEXT:    fcvtzs x9, h2
+; CHECK-i64-NEXT:    fcvtzs x11, h0
+; CHECK-i64-NEXT:    stp x10, x12, [sp, #176]
+; CHECK-i64-NEXT:    fcvtzs x10, h1
+; CHECK-i64-NEXT:    fcvtzs x12, h4
+; CHECK-i64-NEXT:    stp x9, x13, [sp, #160]
+; CHECK-i64-NEXT:    fcvtzs x9, h3
+; CHECK-i64-NEXT:    stp x10, x11, [sp, #240]
+; CHECK-i64-NEXT:    add x10, sp, #64
+; CHECK-i64-NEXT:    stp x9, x12, [sp, #224]
+; CHECK-i64-NEXT:    add x9, sp, #32
+; CHECK-i64-NEXT:    ld1d { z0.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    mov x9, sp
+; CHECK-i64-NEXT:    ld1d { z2.d }, p0/z, [x10]
+; CHECK-i64-NEXT:    ld1d { z1.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    add x9, sp, #224
+; CHECK-i64-NEXT:    add x10, sp, #128
+; CHECK-i64-NEXT:    ld1d { z3.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    add x9, sp, #160
+; CHECK-i64-NEXT:    ld1d { z4.d }, p0/z, [x10]
+; CHECK-i64-NEXT:    add x10, sp, #96
+; CHECK-i64-NEXT:    ld1d { z5.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    add x9, sp, #192
+; CHECK-i64-NEXT:    ld1d { z6.d }, p0/z, [x10]
+; CHECK-i64-NEXT:    mov x10, #24 // =0x18
+; CHECK-i64-NEXT:    ld1d { z7.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    mov x9, #16 // =0x10
+; CHECK-i64-NEXT:    st1d { z3.d }, p0, [x8, x10, lsl #3]
+; CHECK-i64-NEXT:    st1d { z5.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #8 // =0x8
+; CHECK-i64-NEXT:    st1d { z6.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #28 // =0x1c
+; CHECK-i64-NEXT:    st1d { z7.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #20 // =0x14
+; CHECK-i64-NEXT:    st1d { z4.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #12 // =0xc
+; CHECK-i64-NEXT:    st1d { z2.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #4 // =0x4
+; CHECK-i64-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-i64-NEXT:    mov sp, x29
+; CHECK-i64-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-i64-NEXT:    ret
+  %a = call <32 x iXLen> @llvm.lrint.v32iXLen.v32f16(<32 x half> %x)
+  ret <32 x iXLen> %a
+}
+declare <32 x iXLen> @llvm.lrint.v32iXLen.v32f16(<32 x half>)
+
+define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
+; CHECK-i32-LABEL: lrint_v1f32:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v0.2s, v0.2s
+; CHECK-i32-NEXT:    fcvtzs v0.2s, v0.2s
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v1f32:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-i64-NEXT:    frintx s0, s0
+; CHECK-i64-NEXT:    fcvtzs x8, s0
+; CHECK-i64-NEXT:    fmov d0, x8
+; CHECK-i64-NEXT:    ret
+  %a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float> %x)
+  ret <1 x iXLen> %a
+}
+declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float>)
+
+define <2 x iXLen> @lrint_v2f32(<2 x float> %x) {
+; CHECK-i32-LABEL: lrint_v2f32:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v0.2s, v0.2s
+; CHECK-i32-NEXT:    fcvtzs v0.2s, v0.2s
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v2f32:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    frintx v0.2s, v0.2s
+; CHECK-i64-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-i64-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-i64-NEXT:    ret
+  %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float> %x)
+  ret <2 x iXLen> %a
+}
+declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float>)
+
+define <4 x iXLen> @lrint_v4f32(<4 x float> %x) {
+; CHECK-i32-LABEL: lrint_v4f32:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v0.4s, v0.4s
+; CHECK-i32-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v4f32:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    frintx v0.4s, v0.4s
+; CHECK-i64-NEXT:    mov s1, v0.s[2]
+; CHECK-i64-NEXT:    mov s2, v0.s[3]
+; CHECK-i64-NEXT:    mov s3, v0.s[1]
+; CHECK-i64-NEXT:    fcvtzs x9, s0
+; CHECK-i64-NEXT:    fcvtzs x8, s1
+; CHECK-i64-NEXT:    fcvtzs x10, s2
+; CHECK-i64-NEXT:    fcvtzs x11, s3
+; CHECK-i64-NEXT:    fmov d0, x9
+; CHECK-i64-NEXT:    fmov d1, x8
+; CHECK-i64-NEXT:    mov v0.d[1], x11
+; CHECK-i64-NEXT:    mov v1.d[1], x10
+; CHECK-i64-NEXT:    ret
+  %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float> %x)
+  ret <4 x iXLen> %a
+}
+declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float>)
+
+define <8 x iXLen> @lrint_v8f32(<8 x float> %x) {
+; CHECK-i32-LABEL: lrint_v8f32:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    ptrue p0.d, vl2
+; CHECK-i32-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-i32-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-i32-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-i32-NEXT:    ptrue p0.s, vl8
+; CHECK-i32-NEXT:    movprfx z2, z0
+; CHECK-i32-NEXT:    frintx z2.s, p0/m, z0.s
+; CHECK-i32-NEXT:    mov z0.s, z2.s[4]
+; CHECK-i32-NEXT:    mov z1.s, z2.s[5]
+; CHECK-i32-NEXT:    mov z3.s, z2.s[1]
+; CHECK-i32-NEXT:    fcvtzs w9, s2
+; CHECK-i32-NEXT:    fcvtzs w8, s0
+; CHECK-i32-NEXT:    mov z0.s, z2.s[6]
+; CHECK-i32-NEXT:    fcvtzs w10, s1
+; CHECK-i32-NEXT:    mov z1.s, z2.s[2]
+; CHECK-i32-NEXT:    fcvtzs w11, s3
+; CHECK-i32-NEXT:    mov z3.s, z2.s[7]
+; CHECK-i32-NEXT:    mov z2.s, z2.s[3]
+; CHECK-i32-NEXT:    fcvtzs w12, s0
+; CHECK-i32-NEXT:    fmov s0, w9
+; CHECK-i32-NEXT:    fcvtzs w13, s1
+; CHECK-i32-NEXT:    fmov s1, w8
+; CHECK-i32-NEXT:    fcvtzs w8, s3
+; CHECK-i32-NEXT:    fcvtzs w9, s2
+; CHECK-i32-NEXT:    mov v0.s[1], w11
+; CHECK-i32-NEXT:    mov v1.s[1], w10
+; CHECK-i32-NEXT:    mov v0.s[2], w13
+; CHECK-i32-NEXT:    mov v1.s[2], w12
+; CHECK-i32-NEXT:    mov v0.s[3], w9
+; CHECK-i32-NEXT:    mov v1.s[3], w8
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v8f32:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    frintx v0.4s, v0.4s
+; CHECK-i64-NEXT:    frintx v1.4s, v1.4s
+; CHECK-i64-NEXT:    mov s3, v1.s[2]
+; CHECK-i64-NEXT:    mov s4, v0.s[2]
+; CHECK-i64-NEXT:    mov s2, v0.s[1]
+; CHECK-i64-NEXT:    mov s5, v1.s[3]
+; CHECK-i64-NEXT:    mov s6, v1.s[1]
+; CHECK-i64-NEXT:    mov s7, v0.s[3]
+; CHECK-i64-NEXT:    fcvtzs x8, s0
+; CHECK-i64-NEXT:    fcvtzs x10, s1
+; CHECK-i64-NEXT:    fcvtzs x11, s3
+; CHECK-i64-NEXT:    fcvtzs x12, s4
+; CHECK-i64-NEXT:    fcvtzs x9, s2
+; CHECK-i64-NEXT:    fcvtzs x13, s5
+; CHECK-i64-NEXT:    fcvtzs x14, s6
+; CHECK-i64-NEXT:    fcvtzs x15, s7
+; CHECK-i64-NEXT:    fmov d0, x8
+; CHECK-i64-NEXT:    fmov d2, x10
+; CHECK-i64-NEXT:    fmov d1, x12
+; CHECK-i64-NEXT:    fmov d3, x11
+; CHECK-i64-NEXT:    mov v0.d[1], x9
+; CHECK-i64-NEXT:    mov v2.d[1], x14
+; CHECK-i64-NEXT:    mov v1.d[1], x15
+; CHECK-i64-NEXT:    mov v3.d[1], x13
+; CHECK-i64-NEXT:    ret
+  %a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f32(<8 x float> %x)
+  ret <8 x iXLen> %a
+}
+declare <8 x iXLen> @llvm.lrint.v8iXLen.v8f32(<8 x float>)
+
+define <16 x iXLen> @lrint_v16f32(<16 x float> %x) {
+; CHECK-i32-LABEL: lrint_v16f32:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    ptrue p0.d, vl2
+; CHECK-i32-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-i32-NEXT:    // kill: def $q3 killed $q3 def $z3
+; CHECK-i32-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-i32-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-i32-NEXT:    splice z2.d, p0, z2.d, z3.d
+; CHECK-i32-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-i32-NEXT:    ptrue p0.s, vl8
+; CHECK-i32-NEXT:    movprfx z1, z2
+; CHECK-i32-NEXT:    frintx z1.s, p0/m, z2.s
+; CHECK-i32-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-i32-NEXT:    mov z2.s, z1.s[5]
+; CHECK-i32-NEXT:    mov z3.s, z1.s[4]
+; CHECK-i32-NEXT:    mov z5.s, z0.s[5]
+; CHECK-i32-NEXT:    mov z7.s, z0.s[1]
+; CHECK-i32-NEXT:    fcvtzs w11, s0
+; CHECK-i32-NEXT:    fcvtzs w13, s1
+; CHECK-i32-NEXT:    mov z4.s, z1.s[7]
+; CHECK-i32-NEXT:    mov z6.s, z1.s[6]
+; CHECK-i32-NEXT:    mov z16.s, z0.s[7]
+; CHECK-i32-NEXT:    fcvtzs w8, s2
+; CHECK-i32-NEXT:    mov z2.s, z0.s[4]
+; CHECK-i32-NEXT:    fcvtzs w9, s3
+; CHECK-i32-NEXT:    mov z3.s, z1.s[1]
+; CHECK-i32-NEXT:    fcvtzs w10, s5
+; CHECK-i32-NEXT:    fcvtzs w12, s7
+; CHECK-i32-NEXT:    mov z5.s, z0.s[6]
+; CHECK-i32-NEXT:    mov z7.s, z1.s[2]
+; CHECK-i32-NEXT:    mov z17.s, z1.s[3]
+; CHECK-i32-NEXT:    fcvtzs w14, s2
+; CHECK-i32-NEXT:    mov z2.s, z0.s[2]
+; CHECK-i32-NEXT:    mov z18.s, z0.s[3]
+; CHECK-i32-NEXT:    fcvtzs w15, s3
+; CHECK-i32-NEXT:    fmov s0, w11
+; CHECK-i32-NEXT:    fmov s3, w9
+; CHECK-i32-NEXT:    fcvtzs w16, s6
+; CHECK-i32-NEXT:    fcvtzs w17, s5
+; CHECK-i32-NEXT:    fcvtzs w11, s7
+; CHECK-i32-NEXT:    fcvtzs w18, s2
+; CHECK-i32-NEXT:    fmov s2, w13
+; CHECK-i32-NEXT:    fcvtzs w9, s16
+; CHECK-i32-NEXT:    fmov s1, w14
+; CHECK-i32-NEXT:    mov v0.s[1], w12
+; CHECK-i32-NEXT:    mov v3.s[1], w8
+; CHECK-i32-NEXT:    fcvtzs w8, s4
+; CHECK-i32-NEXT:    fcvtzs w12, s18
+; CHECK-i32-NEXT:    mov v2.s[1], w15
+; CHECK-i32-NEXT:    mov v1.s[1], w10
+; CHECK-i32-NEXT:    fcvtzs w10, s17
+; CHECK-i32-NEXT:    mov v0.s[2], w18
+; CHECK-i32-NEXT:    mov v3.s[2], w16
+; CHECK-i32-NEXT:    mov v2.s[2], w11
+; CHECK-i32-NEXT:    mov v1.s[2], w17
+; CHECK-i32-NEXT:    mov v0.s[3], w12
+; CHECK-i32-NEXT:    mov v3.s[3], w8
+; CHECK-i32-NEXT:    mov v2.s[3], w10
+; CHECK-i32-NEXT:    mov v1.s[3], w9
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v16f32:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    frintx v3.4s, v3.4s
+; CHECK-i64-NEXT:    frintx v2.4s, v2.4s
+; CHECK-i64-NEXT:    frintx v1.4s, v1.4s
+; CHECK-i64-NEXT:    frintx v0.4s, v0.4s
+; CHECK-i64-NEXT:    mov s4, v3.s[2]
+; CHECK-i64-NEXT:    mov s5, v2.s[2]
+; CHECK-i64-NEXT:    mov s6, v1.s[2]
+; CHECK-i64-NEXT:    mov s7, v0.s[2]
+; CHECK-i64-NEXT:    fcvtzs x10, s1
+; CHECK-i64-NEXT:    fcvtzs x11, s0
+; CHECK-i64-NEXT:    mov s16, v0.s[1]
+; CHECK-i64-NEXT:    mov s17, v1.s[1]
+; CHECK-i64-NEXT:    mov s18, v3.s[1]
+; CHECK-i64-NEXT:    fcvtzs x14, s3
+; CHECK-i64-NEXT:    fcvtzs x16, s2
+; CHECK-i64-NEXT:    fcvtzs x8, s4
+; CHECK-i64-NEXT:    mov s4, v2.s[1]
+; CHECK-i64-NEXT:    fcvtzs x9, s5
+; CHECK-i64-NEXT:    mov s5, v1.s[3]
+; CHECK-i64-NEXT:    fcvtzs x12, s6
+; CHECK-i64-NEXT:    mov s6, v0.s[3]
+; CHECK-i64-NEXT:    fcvtzs x13, s7
+; CHECK-i64-NEXT:    mov s7, v3.s[3]
+; CHECK-i64-NEXT:    fmov d0, x11
+; CHECK-i64-NEXT:    fcvtzs x17, s16
+; CHECK-i64-NEXT:    fcvtzs x18, s18
+; CHECK-i64-NEXT:    fcvtzs x15, s4
+; CHECK-i64-NEXT:    mov s4, v2.s[3]
+; CHECK-i64-NEXT:    fmov d2, x10
+; CHECK-i64-NEXT:    fcvtzs x11, s5
+; CHECK-i64-NEXT:    fcvtzs x10, s6
+; CHECK-i64-NEXT:    fmov d3, x12
+; CHECK-i64-NEXT:    fmov d1, x13
+; CHECK-i64-NEXT:    fcvtzs x12, s17
+; CHECK-i64-NEXT:    fcvtzs x13, s7
+; CHECK-i64-NEXT:    fmov d5, x9
+; CHECK-i64-NEXT:    fmov d6, x14
+; CHECK-i64-NEXT:    fmov d7, x8
+; CHECK-i64-NEXT:    fcvtzs x0, s4
+; CHECK-i64-NEXT:    fmov d4, x16
+; CHECK-i64-NEXT:    mov v0.d[1], x17
+; CHECK-i64-NEXT:    mov v1.d[1], x10
+; CHECK-i64-NEXT:    mov v3.d[1], x11
+; CHECK-i64-NEXT:    mov v2.d[1], x12
+; CHECK-i64-NEXT:    mov v6.d[1], x18
+; CHECK-i64-NEXT:    mov v7.d[1], x13
+; CHECK-i64-NEXT:    mov v4.d[1], x15
+; CHECK-i64-NEXT:    mov v5.d[1], x0
+; CHECK-i64-NEXT:    ret
+  %a = call <16 x iXLen> @llvm.lrint.v16iXLen.v16f32(<16 x float> %x)
+  ret <16 x iXLen> %a
+}
+declare <16 x iXLen> @llvm.lrint.v16iXLen.v16f32(<16 x float>)
+
+define <32 x iXLen> @lrint_v32f32(<32 x float> %x) {
+; CHECK-i32-LABEL: lrint_v32f32:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    str x27, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-i32-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; CHECK-i32-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; CHECK-i32-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-i32-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-i32-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-i32-NEXT:    .cfi_offset w19, -8
+; CHECK-i32-NEXT:    .cfi_offset w20, -16
+; CHECK-i32-NEXT:    .cfi_offset w21, -24
+; CHECK-i32-NEXT:    .cfi_offset w22, -32
+; CHECK-i32-NEXT:    .cfi_offset w23, -40
+; CHECK-i32-NEXT:    .cfi_offset w24, -48
+; CHECK-i32-NEXT:    .cfi_offset w25, -56
+; CHECK-i32-NEXT:    .cfi_offset w26, -64
+; CHECK-i32-NEXT:    .cfi_offset w27, -80
+; CHECK-i32-NEXT:    ptrue p1.d, vl2
+; CHECK-i32-NEXT:    // kill: def $q6 killed $q6 def $z6
+; CHECK-i32-NEXT:    // kill: def $q7 killed $q7 def $z7
+; CHECK-i32-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-i32-NEXT:    // kill: def $q4 killed $q4 def $z4
+; CHECK-i32-NEXT:    // kill: def $q3 killed $q3 def $z3
+; CHECK-i32-NEXT:    // kill: def $q5 killed $q5 def $z5
+; CHECK-i32-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-i32-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-i32-NEXT:    ptrue p0.s, vl8
+; CHECK-i32-NEXT:    splice z6.d, p1, z6.d, z7.d
+; CHECK-i32-NEXT:    splice z2.d, p1, z2.d, z3.d
+; CHECK-i32-NEXT:    splice z4.d, p1, z4.d, z5.d
+; CHECK-i32-NEXT:    splice z0.d, p1, z0.d, z1.d
+; CHECK-i32-NEXT:    movprfx z3, z6
+; CHECK-i32-NEXT:    frintx z3.s, p0/m, z6.s
+; CHECK-i32-NEXT:    frintx z2.s, p0/m, z2.s
+; CHECK-i32-NEXT:    movprfx z1, z4
+; CHECK-i32-NEXT:    frintx z1.s, p0/m, z4.s
+; CHECK-i32-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-i32-NEXT:    mov z4.s, z3.s[7]
+; CHECK-i32-NEXT:    mov z5.s, z3.s[6]
+; CHECK-i32-NEXT:    mov z6.s, z3.s[5]
+; CHECK-i32-NEXT:    mov z16.s, z1.s[7]
+; CHECK-i32-NEXT:    mov z7.s, z3.s[4]
+; CHECK-i32-NEXT:    mov z17.s, z1.s[6]
+; CHECK-i32-NEXT:    mov z18.s, z1.s[5]
+; CHECK-i32-NEXT:    mov z19.s, z1.s[4]
+; CHECK-i32-NEXT:    fcvtzs w7, s3
+; CHECK-i32-NEXT:    fcvtzs w8, s4
+; CHECK-i32-NEXT:    mov z4.s, z2.s[7]
+; CHECK-i32-NEXT:    fcvtzs w10, s5
+; CHECK-i32-NEXT:    mov z5.s, z2.s[6]
+; CHECK-i32-NEXT:    fcvtzs w13, s6
+; CHECK-i32-NEXT:    fcvtzs w9, s16
+; CHECK-i32-NEXT:    mov z6.s, z2.s[4]
+; CHECK-i32-NEXT:    mov z16.s, z0.s[6]
+; CHECK-i32-NEXT:    fcvtzs w14, s7
+; CHECK-i32-NEXT:    fcvtzs w11, s4
+; CHECK-i32-NEXT:    mov z4.s, z2.s[5]
+; CHECK-i32-NEXT:    mov z7.s, z0.s[7]
+; CHECK-i32-NEXT:    fcvtzs w16, s5
+; CHECK-i32-NEXT:    mov z5.s, z0.s[4]
+; CHECK-i32-NEXT:    fcvtzs w12, s17
+; CHECK-i32-NEXT:    fcvtzs w15, s18
+; CHECK-i32-NEXT:    fcvtzs w17, s19
+; CHECK-i32-NEXT:    mov z17.s, z0.s[5]
+; CHECK-i32-NEXT:    fcvtzs w3, s4
+; CHECK-i32-NEXT:    mov z4.s, z3.s[1]
+; CHECK-i32-NEXT:    mov z18.s, z3.s[2]
+; CHECK-i32-NEXT:    fcvtzs w4, s6
+; CHECK-i32-NEXT:    fcvtzs w0, s16
+; CHECK-i32-NEXT:    fcvtzs w6, s5
+; CHECK-i32-NEXT:    mov z16.s, z3.s[3]
+; CHECK-i32-NEXT:    mov z3.s, z0.s[1]
+; CHECK-i32-NEXT:    mov z5.s, z1.s[1]
+; CHECK-i32-NEXT:    mov z6.s, z2.s[1]
+; CHECK-i32-NEXT:    fcvtzs w21, s1
+; CHECK-i32-NEXT:    fcvtzs w22, s0
+; CHECK-i32-NEXT:    fcvtzs w23, s2
+; CHECK-i32-NEXT:    fcvtzs w18, s7
+; CHECK-i32-NEXT:    fcvtzs w2, s4
+; CHECK-i32-NEXT:    mov z4.s, z1.s[2]
+; CHECK-i32-NEXT:    mov z7.s, z2.s[2]
+; CHECK-i32-NEXT:    fcvtzs w5, s17
+; CHECK-i32-NEXT:    fcvtzs w24, s3
+; CHECK-i32-NEXT:    fcvtzs w25, s5
+; CHECK-i32-NEXT:    fcvtzs w26, s6
+; CHECK-i32-NEXT:    fcvtzs w1, s18
+; CHECK-i32-NEXT:    mov z18.s, z0.s[2]
+; CHECK-i32-NEXT:    mov z17.s, z1.s[3]
+; CHECK-i32-NEXT:    fcvtzs w19, s4
+; CHECK-i32-NEXT:    mov z19.s, z2.s[3]
+; CHECK-i32-NEXT:    fcvtzs w20, s7
+; CHECK-i32-NEXT:    mov z20.s, z0.s[3]
+; CHECK-i32-NEXT:    fmov s0, w22
+; CHECK-i32-NEXT:    fmov s2, w23
+; CHECK-i32-NEXT:    fmov s4, w21
+; CHECK-i32-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-i32-NEXT:    fmov s1, w6
+; CHECK-i32-NEXT:    fmov s6, w7
+; CHECK-i32-NEXT:    fmov s3, w4
+; CHECK-i32-NEXT:    fmov s5, w17
+; CHECK-i32-NEXT:    fmov s7, w14
+; CHECK-i32-NEXT:    fcvtzs w27, s18
+; CHECK-i32-NEXT:    mov v0.s[1], w24
+; CHECK-i32-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; CHECK-i32-NEXT:    mov v2.s[1], w26
+; CHECK-i32-NEXT:    mov v4.s[1], w25
+; CHECK-i32-NEXT:    mov v1.s[1], w5
+; CHECK-i32-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; CHECK-i32-NEXT:    mov v3.s[1], w3
+; CHECK-i32-NEXT:    mov v6.s[1], w2
+; CHECK-i32-NEXT:    mov v5.s[1], w15
+; CHECK-i32-NEXT:    mov v7.s[1], w13
+; CHECK-i32-NEXT:    fcvtzs w13, s16
+; CHECK-i32-NEXT:    fcvtzs w14, s17
+; CHECK-i32-NEXT:    fcvtzs w15, s19
+; CHECK-i32-NEXT:    fcvtzs w17, s20
+; CHECK-i32-NEXT:    mov v0.s[2], w27
+; CHECK-i32-NEXT:    mov v1.s[2], w0
+; CHECK-i32-NEXT:    mov v2.s[2], w20
+; CHECK-i32-NEXT:    mov v4.s[2], w19
+; CHECK-i32-NEXT:    mov v3.s[2], w16
+; CHECK-i32-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-i32-NEXT:    mov v6.s[2], w1
+; CHECK-i32-NEXT:    mov v5.s[2], w12
+; CHECK-i32-NEXT:    mov v7.s[2], w10
+; CHECK-i32-NEXT:    mov v0.s[3], w17
+; CHECK-i32-NEXT:    mov v1.s[3], w18
+; CHECK-i32-NEXT:    mov v2.s[3], w15
+; CHECK-i32-NEXT:    mov v4.s[3], w14
+; CHECK-i32-NEXT:    mov v3.s[3], w11
+; CHECK-i32-NEXT:    mov v6.s[3], w13
+; CHECK-i32-NEXT:    mov v5.s[3], w9
+; CHECK-i32-NEXT:    mov v7.s[3], w8
+; CHECK-i32-NEXT:    ldr x27, [sp], #80 // 8-byte Folded Reload
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v32f32:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-i64-NEXT:    sub x9, sp, #272
+; CHECK-i64-NEXT:    mov x29, sp
+; CHECK-i64-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK-i64-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-i64-NEXT:    .cfi_offset w30, -8
+; CHECK-i64-NEXT:    .cfi_offset w29, -16
+; CHECK-i64-NEXT:    frintx v0.4s, v0.4s
+; CHECK-i64-NEXT:    frintx v1.4s, v1.4s
+; CHECK-i64-NEXT:    frintx v2.4s, v2.4s
+; CHECK-i64-NEXT:    ptrue p0.d, vl4
+; CHECK-i64-NEXT:    mov s16, v0.s[3]
+; CHECK-i64-NEXT:    mov s17, v0.s[2]
+; CHECK-i64-NEXT:    mov s18, v0.s[1]
+; CHECK-i64-NEXT:    fcvtzs x12, s0
+; CHECK-i64-NEXT:    frintx v0.4s, v3.4s
+; CHECK-i64-NEXT:    mov s3, v2.s[3]
+; CHECK-i64-NEXT:    fcvtzs x9, s16
+; CHECK-i64-NEXT:    mov s16, v1.s[3]
+; CHECK-i64-NEXT:    fcvtzs x10, s17
+; CHECK-i64-NEXT:    mov s17, v1.s[2]
+; CHECK-i64-NEXT:    fcvtzs x11, s18
+; CHECK-i64-NEXT:    mov s18, v1.s[1]
+; CHECK-i64-NEXT:    fcvtzs x13, s16
+; CHECK-i64-NEXT:    stp x10, x9, [sp, #16]
+; CHECK-i64-NEXT:    mov s16, v2.s[2]
+; CHECK-i64-NEXT:    fcvtzs x9, s17
+; CHECK-i64-NEXT:    fcvtzs x10, s18
+; CHECK-i64-NEXT:    mov s17, v2.s[1]
+; CHECK-i64-NEXT:    stp x12, x11, [sp]
+; CHECK-i64-NEXT:    fcvtzs x11, s1
+; CHECK-i64-NEXT:    frintx v1.4s, v4.4s
+; CHECK-i64-NEXT:    fcvtzs x12, s3
+; CHECK-i64-NEXT:    mov s3, v0.s[3]
+; CHECK-i64-NEXT:    mov s4, v0.s[2]
+; CHECK-i64-NEXT:    stp x9, x13, [sp, #48]
+; CHECK-i64-NEXT:    fcvtzs x13, s16
+; CHECK-i64-NEXT:    fcvtzs x9, s17
+; CHECK-i64-NEXT:    mov s16, v0.s[1]
+; CHECK-i64-NEXT:    stp x11, x10, [sp, #32]
+; CHECK-i64-NEXT:    fcvtzs x10, s2
+; CHECK-i64-NEXT:    frintx v2.4s, v5.4s
+; CHECK-i64-NEXT:    fcvtzs x11, s3
+; CHECK-i64-NEXT:    mov s3, v1.s[3]
+; CHECK-i64-NEXT:    mov s5, v1.s[1]
+; CHECK-i64-NEXT:    stp x13, x12, [sp, #80]
+; CHECK-i64-NEXT:    fcvtzs x12, s4
+; CHECK-i64-NEXT:    mov s4, v1.s[2]
+; CHECK-i64-NEXT:    fcvtzs x13, s16
+; CHECK-i64-NEXT:    stp x10, x9, [sp, #64]
+; CHECK-i64-NEXT:    fcvtzs x9, s0
+; CHECK-i64-NEXT:    mov s0, v2.s[3]
+; CHECK-i64-NEXT:    fcvtzs x10, s3
+; CHECK-i64-NEXT:    frintx v3.4s, v6.4s
+; CHECK-i64-NEXT:    stp x12, x11, [sp, #112]
+; CHECK-i64-NEXT:    fcvtzs x11, s4
+; CHECK-i64-NEXT:    mov s4, v2.s[2]
+; CHECK-i64-NEXT:    fcvtzs x12, s5
+; CHECK-i64-NEXT:    mov s5, v2.s[1]
+; CHECK-i64-NEXT:    stp x9, x13, [sp, #96]
+; CHECK-i64-NEXT:    fcvtzs x9, s1
+; CHECK-i64-NEXT:    fcvtzs x13, s0
+; CHECK-i64-NEXT:    mov s0, v3.s[3]
+; CHECK-i64-NEXT:    frintx v1.4s, v7.4s
+; CHECK-i64-NEXT:    stp x11, x10, [sp, #144]
+; CHECK-i64-NEXT:    fcvtzs x10, s4
+; CHECK-i64-NEXT:    mov s4, v3.s[2]
+; CHECK-i64-NEXT:    fcvtzs x11, s5
+; CHECK-i64-NEXT:    mov s5, v3.s[1]
+; CHECK-i64-NEXT:    stp x9, x12, [sp, #128]
+; CHECK-i64-NEXT:    fcvtzs x9, s2
+; CHECK-i64-NEXT:    fcvtzs x12, s0
+; CHECK-i64-NEXT:    mov s0, v1.s[3]
+; CHECK-i64-NEXT:    mov s2, v1.s[2]
+; CHECK-i64-NEXT:    stp x10, x13, [sp, #176]
+; CHECK-i64-NEXT:    fcvtzs x10, s4
+; CHECK-i64-NEXT:    mov s4, v1.s[1]
+; CHECK-i64-NEXT:    fcvtzs x13, s5
+; CHECK-i64-NEXT:    stp x9, x11, [sp, #160]
+; CHECK-i64-NEXT:    fcvtzs x9, s3
+; CHECK-i64-NEXT:    fcvtzs x11, s0
+; CHECK-i64-NEXT:    stp x10, x12, [sp, #208]
+; CHECK-i64-NEXT:    fcvtzs x10, s2
+; CHECK-i64-NEXT:    fcvtzs x12, s4
+; CHECK-i64-NEXT:    stp x9, x13, [sp, #192]
+; CHECK-i64-NEXT:    fcvtzs x9, s1
+; CHECK-i64-NEXT:    stp x10, x11, [sp, #240]
+; CHECK-i64-NEXT:    add x10, sp, #64
+; CHECK-i64-NEXT:    stp x9, x12, [sp, #224]
+; CHECK-i64-NEXT:    mov x9, sp
+; CHECK-i64-NEXT:    ld1d { z0.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    add x9, sp, #32
+; CHECK-i64-NEXT:    ld1d { z2.d }, p0/z, [x10]
+; CHECK-i64-NEXT:    ld1d { z1.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    add x9, sp, #224
+; CHECK-i64-NEXT:    add x10, sp, #96
+; CHECK-i64-NEXT:    ld1d { z3.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    add x9, sp, #192
+; CHECK-i64-NEXT:    ld1d { z4.d }, p0/z, [x10]
+; CHECK-i64-NEXT:    add x10, sp, #160
+; CHECK-i64-NEXT:    ld1d { z5.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    add x9, sp, #128
+; CHECK-i64-NEXT:    ld1d { z6.d }, p0/z, [x10]
+; CHECK-i64-NEXT:    mov x10, #28 // =0x1c
+; CHECK-i64-NEXT:    ld1d { z7.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    mov x9, #24 // =0x18
+; CHECK-i64-NEXT:    st1d { z3.d }, p0, [x8, x10, lsl #3]
+; CHECK-i64-NEXT:    st1d { z5.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #20 // =0x14
+; CHECK-i64-NEXT:    st1d { z6.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #16 // =0x10
+; CHECK-i64-NEXT:    st1d { z7.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #12 // =0xc
+; CHECK-i64-NEXT:    st1d { z4.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #8 // =0x8
+; CHECK-i64-NEXT:    st1d { z2.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #4 // =0x4
+; CHECK-i64-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-i64-NEXT:    mov sp, x29
+; CHECK-i64-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-i64-NEXT:    ret
+  %a = call <32 x iXLen> @llvm.lrint.v32iXLen.v32f32(<32 x float> %x)
+  ret <32 x iXLen> %a
+}
+declare <32 x iXLen> @llvm.lrint.v32iXLen.v32f32(<32 x float>)
+
+define <1 x iXLen> @lrint_v1f64(<1 x double> %x) {
+; CHECK-i32-LABEL: lrint_v1f64:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx d0, d0
+; CHECK-i32-NEXT:    fcvtzs w8, d0
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v1f64:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    frintx d0, d0
+; CHECK-i64-NEXT:    fcvtzs x8, d0
+; CHECK-i64-NEXT:    fmov d0, x8
+; CHECK-i64-NEXT:    ret
+  %a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f64(<1 x double> %x)
+  ret <1 x iXLen> %a
+}
+declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f64(<1 x double>)
+
+define <2 x iXLen> @lrint_v2f64(<2 x double> %x) {
+; CHECK-i32-LABEL: lrint_v2f64:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v0.2d, v0.2d
+; CHECK-i32-NEXT:    mov d1, v0.d[1]
+; CHECK-i32-NEXT:    fcvtzs w8, d0
+; CHECK-i32-NEXT:    fcvtzs w9, d1
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    mov v0.s[1], w9
+; CHECK-i32-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v2f64:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    frintx v0.2d, v0.2d
+; CHECK-i64-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-i64-NEXT:    ret
+  %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f64(<2 x double> %x)
+  ret <2 x iXLen> %a
+}
+declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f64(<2 x double>)
+
+define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
+; CHECK-i32-LABEL: lrint_v4f64:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    ptrue p0.d, vl2
+; CHECK-i32-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-i32-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-i32-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-i32-NEXT:    ptrue p0.d, vl4
+; CHECK-i32-NEXT:    movprfx z1, z0
+; CHECK-i32-NEXT:    frintx z1.d, p0/m, z0.d
+; CHECK-i32-NEXT:    mov z0.d, z1.d[1]
+; CHECK-i32-NEXT:    fcvtzs w8, d1
+; CHECK-i32-NEXT:    mov z2.d, z1.d[2]
+; CHECK-i32-NEXT:    mov z1.d, z1.d[3]
+; CHECK-i32-NEXT:    fcvtzs w9, d0
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    fcvtzs w8, d2
+; CHECK-i32-NEXT:    mov v0.s[1], w9
+; CHECK-i32-NEXT:    mov v0.s[2], w8
+; CHECK-i32-NEXT:    fcvtzs w8, d1
+; CHECK-i32-NEXT:    mov v0.s[3], w8
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v4f64:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    ptrue p0.d, vl2
+; CHECK-i64-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-i64-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-i64-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-i64-NEXT:    ptrue p0.d, vl4
+; CHECK-i64-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-i64-NEXT:    mov z1.d, z0.d[2]
+; CHECK-i64-NEXT:    mov z2.d, z0.d[3]
+; CHECK-i64-NEXT:    mov z3.d, z0.d[1]
+; CHECK-i64-NEXT:    fcvtzs x9, d0
+; CHECK-i64-NEXT:    fcvtzs x8, d1
+; CHECK-i64-NEXT:    fcvtzs x10, d2
+; CHECK-i64-NEXT:    fcvtzs x11, d3
+; CHECK-i64-NEXT:    fmov d0, x9
+; CHECK-i64-NEXT:    fmov d1, x8
+; CHECK-i64-NEXT:    mov v0.d[1], x11
+; CHECK-i64-NEXT:    mov v1.d[1], x10
+; CHECK-i64-NEXT:    ret
+  %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f64(<4 x double> %x)
+  ret <4 x iXLen> %a
+}
+declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f64(<4 x double>)
+
+define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
+; CHECK-i32-LABEL: lrint_v8f64:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    ptrue p0.d, vl2
+; CHECK-i32-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-i32-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-i32-NEXT:    // kill: def $q3 killed $q3 def $z3
+; CHECK-i32-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-i32-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-i32-NEXT:    splice z2.d, p0, z2.d, z3.d
+; CHECK-i32-NEXT:    ptrue p0.d, vl4
+; CHECK-i32-NEXT:    movprfx z3, z0
+; CHECK-i32-NEXT:    frintx z3.d, p0/m, z0.d
+; CHECK-i32-NEXT:    frintx z2.d, p0/m, z2.d
+; CHECK-i32-NEXT:    mov z0.d, z3.d[1]
+; CHECK-i32-NEXT:    mov z1.d, z2.d[1]
+; CHECK-i32-NEXT:    fcvtzs w8, d3
+; CHECK-i32-NEXT:    fcvtzs w9, d2
+; CHECK-i32-NEXT:    mov z4.d, z3.d[2]
+; CHECK-i32-NEXT:    mov z5.d, z2.d[2]
+; CHECK-i32-NEXT:    mov z3.d, z3.d[3]
+; CHECK-i32-NEXT:    mov z2.d, z2.d[3]
+; CHECK-i32-NEXT:    fcvtzs w10, d0
+; CHECK-i32-NEXT:    fcvtzs w11, d1
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    fcvtzs w8, d4
+; CHECK-i32-NEXT:    fmov s1, w9
+; CHECK-i32-NEXT:    fcvtzs w9, d5
+; CHECK-i32-NEXT:    mov v0.s[1], w10
+; CHECK-i32-NEXT:    mov v1.s[1], w11
+; CHECK-i32-NEXT:    mov v0.s[2], w8
+; CHECK-i32-NEXT:    fcvtzs w8, d3
+; CHECK-i32-NEXT:    mov v1.s[2], w9
+; CHECK-i32-NEXT:    fcvtzs w9, d2
+; CHECK-i32-NEXT:    mov v0.s[3], w8
+; CHECK-i32-NEXT:    mov v1.s[3], w9
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v8f64:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    ptrue p0.d, vl2
+; CHECK-i64-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-i64-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-i64-NEXT:    // kill: def $q3 killed $q3 def $z3
+; CHECK-i64-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-i64-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-i64-NEXT:    splice z2.d, p0, z2.d, z3.d
+; CHECK-i64-NEXT:    ptrue p0.d, vl4
+; CHECK-i64-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-i64-NEXT:    movprfx z1, z2
+; CHECK-i64-NEXT:    frintx z1.d, p0/m, z2.d
+; CHECK-i64-NEXT:    mov z4.d, z1.d[2]
+; CHECK-i64-NEXT:    mov z5.d, z0.d[2]
+; CHECK-i64-NEXT:    mov z2.d, z0.d[1]
+; CHECK-i64-NEXT:    mov z3.d, z1.d[3]
+; CHECK-i64-NEXT:    mov z6.d, z0.d[3]
+; CHECK-i64-NEXT:    fcvtzs x8, d0
+; CHECK-i64-NEXT:    mov z0.d, z1.d[1]
+; CHECK-i64-NEXT:    fcvtzs x10, d1
+; CHECK-i64-NEXT:    fcvtzs x11, d4
+; CHECK-i64-NEXT:    fcvtzs x12, d5
+; CHECK-i64-NEXT:    fcvtzs x9, d2
+; CHECK-i64-NEXT:    fcvtzs x13, d3
+; CHECK-i64-NEXT:    fcvtzs x14, d6
+; CHECK-i64-NEXT:    fcvtzs x15, d0
+; CHECK-i64-NEXT:    fmov d0, x8
+; CHECK-i64-NEXT:    fmov d2, x10
+; CHECK-i64-NEXT:    fmov d1, x12
+; CHECK-i64-NEXT:    fmov d3, x11
+; CHECK-i64-NEXT:    mov v0.d[1], x9
+; CHECK-i64-NEXT:    mov v2.d[1], x15
+; CHECK-i64-NEXT:    mov v1.d[1], x14
+; CHECK-i64-NEXT:    mov v3.d[1], x13
+; CHECK-i64-NEXT:    ret
+  %a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f64(<8 x double> %x)
+  ret <8 x iXLen> %a
+}
+declare <8 x iXLen> @llvm.lrint.v8iXLen.v8f64(<8 x double>)
+
+define <16 x iXLen> @lrint_v16f64(<16 x double> %x) {
+; CHECK-i32-LABEL: lrint_v16f64:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    ptrue p1.d, vl2
+; CHECK-i32-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-i32-NEXT:    // kill: def $q6 killed $q6 def $z6
+; CHECK-i32-NEXT:    // kill: def $q4 killed $q4 def $z4
+; CHECK-i32-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-i32-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-i32-NEXT:    // kill: def $q7 killed $q7 def $z7
+; CHECK-i32-NEXT:    // kill: def $q5 killed $q5 def $z5
+; CHECK-i32-NEXT:    // kill: def $q3 killed $q3 def $z3
+; CHECK-i32-NEXT:    ptrue p0.d, vl4
+; CHECK-i32-NEXT:    splice z0.d, p1, z0.d, z1.d
+; CHECK-i32-NEXT:    splice z2.d, p1, z2.d, z3.d
+; CHECK-i32-NEXT:    splice z4.d, p1, z4.d, z5.d
+; CHECK-i32-NEXT:    splice z6.d, p1, z6.d, z7.d
+; CHECK-i32-NEXT:    movprfx z5, z0
+; CHECK-i32-NEXT:    frintx z5.d, p0/m, z0.d
+; CHECK-i32-NEXT:    movprfx z7, z2
+; CHECK-i32-NEXT:    frintx z7.d, p0/m, z2.d
+; CHECK-i32-NEXT:    frintx z4.d, p0/m, z4.d
+; CHECK-i32-NEXT:    frintx z6.d, p0/m, z6.d
+; CHECK-i32-NEXT:    fcvtzs w8, d5
+; CHECK-i32-NEXT:    mov z0.d, z5.d[1]
+; CHECK-i32-NEXT:    mov z1.d, z7.d[1]
+; CHECK-i32-NEXT:    fcvtzs w9, d7
+; CHECK-i32-NEXT:    mov z3.d, z4.d[1]
+; CHECK-i32-NEXT:    fcvtzs w10, d4
+; CHECK-i32-NEXT:    mov z16.d, z6.d[1]
+; CHECK-i32-NEXT:    fcvtzs w12, d6
+; CHECK-i32-NEXT:    mov z2.d, z5.d[2]
+; CHECK-i32-NEXT:    fcvtzs w11, d0
+; CHECK-i32-NEXT:    fcvtzs w13, d1
+; CHECK-i32-NEXT:    mov z17.d, z7.d[2]
+; CHECK-i32-NEXT:    fcvtzs w14, d3
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    mov z18.d, z4.d[2]
+; CHECK-i32-NEXT:    fcvtzs w8, d16
+; CHECK-i32-NEXT:    mov z19.d, z6.d[2]
+; CHECK-i32-NEXT:    fcvtzs w15, d2
+; CHECK-i32-NEXT:    fmov s1, w9
+; CHECK-i32-NEXT:    fmov s2, w10
+; CHECK-i32-NEXT:    fmov s3, w12
+; CHECK-i32-NEXT:    fcvtzs w9, d17
+; CHECK-i32-NEXT:    fcvtzs w10, d18
+; CHECK-i32-NEXT:    mov v0.s[1], w11
+; CHECK-i32-NEXT:    fcvtzs w11, d19
+; CHECK-i32-NEXT:    mov z5.d, z5.d[3]
+; CHECK-i32-NEXT:    mov z7.d, z7.d[3]
+; CHECK-i32-NEXT:    mov v1.s[1], w13
+; CHECK-i32-NEXT:    mov v2.s[1], w14
+; CHECK-i32-NEXT:    mov v3.s[1], w8
+; CHECK-i32-NEXT:    mov z4.d, z4.d[3]
+; CHECK-i32-NEXT:    mov z6.d, z6.d[3]
+; CHECK-i32-NEXT:    mov v0.s[2], w15
+; CHECK-i32-NEXT:    fcvtzs w8, d5
+; CHECK-i32-NEXT:    mov v1.s[2], w9
+; CHECK-i32-NEXT:    fcvtzs w9, d7
+; CHECK-i32-NEXT:    mov v2.s[2], w10
+; CHECK-i32-NEXT:    fcvtzs w10, d4
+; CHECK-i32-NEXT:    mov v3.s[2], w11
+; CHECK-i32-NEXT:    fcvtzs w11, d6
+; CHECK-i32-NEXT:    mov v0.s[3], w8
+; CHECK-i32-NEXT:    mov v1.s[3], w9
+; CHECK-i32-NEXT:    mov v2.s[3], w10
+; CHECK-i32-NEXT:    mov v3.s[3], w11
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v16f64:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    ptrue p1.d, vl2
+; CHECK-i64-NEXT:    // kill: def $q6 killed $q6 def $z6
+; CHECK-i64-NEXT:    // kill: def $q4 killed $q4 def $z4
+; CHECK-i64-NEXT:    // kill: def $q7 killed $q7 def $z7
+; CHECK-i64-NEXT:    // kill: def $q5 killed $q5 def $z5
+; CHECK-i64-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-i64-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-i64-NEXT:    // kill: def $q3 killed $q3 def $z3
+; CHECK-i64-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-i64-NEXT:    ptrue p0.d, vl4
+; CHECK-i64-NEXT:    splice z6.d, p1, z6.d, z7.d
+; CHECK-i64-NEXT:    splice z4.d, p1, z4.d, z5.d
+; CHECK-i64-NEXT:    splice z2.d, p1, z2.d, z3.d
+; CHECK-i64-NEXT:    splice z0.d, p1, z0.d, z1.d
+; CHECK-i64-NEXT:    movprfx z3, z6
+; CHECK-i64-NEXT:    frintx z3.d, p0/m, z6.d
+; CHECK-i64-NEXT:    movprfx z1, z4
+; CHECK-i64-NEXT:    frintx z1.d, p0/m, z4.d
+; CHECK-i64-NEXT:    frintx z2.d, p0/m, z2.d
+; CHECK-i64-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-i64-NEXT:    mov z4.d, z3.d[2]
+; CHECK-i64-NEXT:    mov z5.d, z1.d[2]
+; CHECK-i64-NEXT:    mov z6.d, z2.d[3]
+; CHECK-i64-NEXT:    fcvtzs x11, d0
+; CHECK-i64-NEXT:    fcvtzs x12, d1
+; CHECK-i64-NEXT:    fcvtzs x13, d2
+; CHECK-i64-NEXT:    fcvtzs x14, d3
+; CHECK-i64-NEXT:    mov z7.d, z3.d[3]
+; CHECK-i64-NEXT:    mov z16.d, z1.d[3]
+; CHECK-i64-NEXT:    fcvtzs x9, d4
+; CHECK-i64-NEXT:    fcvtzs x10, d5
+; CHECK-i64-NEXT:    mov z4.d, z2.d[2]
+; CHECK-i64-NEXT:    mov z5.d, z0.d[2]
+; CHECK-i64-NEXT:    fcvtzs x8, d6
+; CHECK-i64-NEXT:    mov z2.d, z2.d[1]
+; CHECK-i64-NEXT:    mov z6.d, z0.d[3]
+; CHECK-i64-NEXT:    mov z1.d, z1.d[1]
+; CHECK-i64-NEXT:    mov z3.d, z3.d[1]
+; CHECK-i64-NEXT:    fcvtzs x15, d4
+; CHECK-i64-NEXT:    mov z4.d, z0.d[1]
+; CHECK-i64-NEXT:    fmov d0, x11
+; CHECK-i64-NEXT:    fcvtzs x16, d5
+; CHECK-i64-NEXT:    fcvtzs x11, d2
+; CHECK-i64-NEXT:    fmov d2, x13
+; CHECK-i64-NEXT:    fcvtzs x17, d7
+; CHECK-i64-NEXT:    fcvtzs x18, d16
+; CHECK-i64-NEXT:    fcvtzs x0, d3
+; CHECK-i64-NEXT:    fcvtzs x13, d4
+; CHECK-i64-NEXT:    fmov d4, x12
+; CHECK-i64-NEXT:    fcvtzs x12, d6
+; CHECK-i64-NEXT:    fmov d6, x14
+; CHECK-i64-NEXT:    fcvtzs x14, d1
+; CHECK-i64-NEXT:    fmov d3, x15
+; CHECK-i64-NEXT:    fmov d1, x16
+; CHECK-i64-NEXT:    fmov d5, x10
+; CHECK-i64-NEXT:    fmov d7, x9
+; CHECK-i64-NEXT:    mov v2.d[1], x11
+; CHECK-i64-NEXT:    mov v0.d[1], x13
+; CHECK-i64-NEXT:    mov v3.d[1], x8
+; CHECK-i64-NEXT:    mov v6.d[1], x0
+; CHECK-i64-NEXT:    mov v4.d[1], x14
+; CHECK-i64-NEXT:    mov v1.d[1], x12
+; CHECK-i64-NEXT:    mov v5.d[1], x18
+; CHECK-i64-NEXT:    mov v7.d[1], x17
+; CHECK-i64-NEXT:    ret
+  %a = call <16 x iXLen> @llvm.lrint.v16iXLen.v16f64(<16 x double> %x)
+  ret <16 x iXLen> %a
+}
+declare <16 x iXLen> @llvm.lrint.v16iXLen.v16f64(<16 x double>)
+
+define <32 x iXLen> @lrint_v32f64(<32 x double> %x) {
+; CHECK-i32-LABEL: lrint_v32f64:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    ptrue p1.d, vl2
+; CHECK-i32-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-i32-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-i32-NEXT:    // kill: def $q3 killed $q3 def $z3
+; CHECK-i32-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-i32-NEXT:    // kill: def $q4 killed $q4 def $z4
+; CHECK-i32-NEXT:    // kill: def $q5 killed $q5 def $z5
+; CHECK-i32-NEXT:    // kill: def $q7 killed $q7 def $z7
+; CHECK-i32-NEXT:    // kill: def $q6 killed $q6 def $z6
+; CHECK-i32-NEXT:    ptrue p0.d, vl4
+; CHECK-i32-NEXT:    splice z0.d, p1, z0.d, z1.d
+; CHECK-i32-NEXT:    splice z2.d, p1, z2.d, z3.d
+; CHECK-i32-NEXT:    splice z4.d, p1, z4.d, z5.d
+; CHECK-i32-NEXT:    ldp q1, q3, [sp]
+; CHECK-i32-NEXT:    splice z6.d, p1, z6.d, z7.d
+; CHECK-i32-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-i32-NEXT:    splice z1.d, p1, z1.d, z3.d
+; CHECK-i32-NEXT:    movprfx z18, z2
+; CHECK-i32-NEXT:    frintx z18.d, p0/m, z2.d
+; CHECK-i32-NEXT:    ldp q5, q3, [sp, #96]
+; CHECK-i32-NEXT:    ldp q2, q7, [sp, #64]
+; CHECK-i32-NEXT:    splice z5.d, p1, z5.d, z3.d
+; CHECK-i32-NEXT:    movprfx z3, z4
+; CHECK-i32-NEXT:    frintx z3.d, p0/m, z4.d
+; CHECK-i32-NEXT:    mov z4.d, z0.d[1]
+; CHECK-i32-NEXT:    fcvtzs w8, d0
+; CHECK-i32-NEXT:    splice z2.d, p1, z2.d, z7.d
+; CHECK-i32-NEXT:    mov z19.d, z18.d[1]
+; CHECK-i32-NEXT:    ldp q7, q16, [sp, #32]
+; CHECK-i32-NEXT:    movprfx z17, z1
+; CHECK-i32-NEXT:    frintx z17.d, p0/m, z1.d
+; CHECK-i32-NEXT:    fcvtzs w10, d4
+; CHECK-i32-NEXT:    mov z1.d, z0.d[2]
+; CHECK-i32-NEXT:    fcvtzs w9, d18
+; CHECK-i32-NEXT:    mov z4.d, z0.d[3]
+; CHECK-i32-NEXT:    fcvtzs w11, d19
+; CHECK-i32-NEXT:    mov z20.d, z18.d[3]
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    splice z7.d, p1, z7.d, z16.d
+; CHECK-i32-NEXT:    movprfx z16, z6
+; CHECK-i32-NEXT:    frintx z16.d, p0/m, z6.d
+; CHECK-i32-NEXT:    mov z6.d, z18.d[2]
+; CHECK-i32-NEXT:    mov z18.d, z3.d[1]
+; CHECK-i32-NEXT:    fcvtzs w12, d3
+; CHECK-i32-NEXT:    fcvtzs w13, d1
+; CHECK-i32-NEXT:    fmov s1, w9
+; CHECK-i32-NEXT:    movprfx z19, z2
+; CHECK-i32-NEXT:    frintx z19.d, p0/m, z2.d
+; CHECK-i32-NEXT:    mov v0.s[1], w10
+; CHECK-i32-NEXT:    mov z21.d, z3.d[2]
+; CHECK-i32-NEXT:    fcvtzs w8, d4
+; CHECK-i32-NEXT:    fcvtzs w14, d6
+; CHECK-i32-NEXT:    mov z6.d, z16.d[1]
+; CHECK-i32-NEXT:    fcvtzs w15, d18
+; CHECK-i32-NEXT:    movprfx z18, z7
+; CHECK-i32-NEXT:    frintx z18.d, p0/m, z7.d
+; CHECK-i32-NEXT:    mov v1.s[1], w11
+; CHECK-i32-NEXT:    fmov s2, w12
+; CHECK-i32-NEXT:    mov z7.d, z17.d[1]
+; CHECK-i32-NEXT:    mov z4.d, z16.d[2]
+; CHECK-i32-NEXT:    fcvtzs w16, d16
+; CHECK-i32-NEXT:    mov v0.s[2], w13
+; CHECK-i32-NEXT:    fcvtzs w13, d17
+; CHECK-i32-NEXT:    fcvtzs w12, d6
+; CHECK-i32-NEXT:    mov z6.d, z19.d[1]
+; CHECK-i32-NEXT:    fcvtzs w11, d21
+; CHECK-i32-NEXT:    movprfx z21, z5
+; CHECK-i32-NEXT:    frintx z21.d, p0/m, z5.d
+; CHECK-i32-NEXT:    mov z3.d, z3.d[3]
+; CHECK-i32-NEXT:    mov v2.s[1], w15
+; CHECK-i32-NEXT:    mov z5.d, z18.d[1]
+; CHECK-i32-NEXT:    fcvtzs w15, d7
+; CHECK-i32-NEXT:    fcvtzs w0, d19
+; CHECK-i32-NEXT:    mov v1.s[2], w14
+; CHECK-i32-NEXT:    fcvtzs w14, d4
+; CHECK-i32-NEXT:    mov z7.d, z18.d[2]
+; CHECK-i32-NEXT:    fmov s4, w13
+; CHECK-i32-NEXT:    fcvtzs w13, d6
+; CHECK-i32-NEXT:    mov z6.d, z19.d[2]
+; CHECK-i32-NEXT:    fcvtzs w10, d3
+; CHECK-i32-NEXT:    fmov s3, w16
+; CHECK-i32-NEXT:    fcvtzs w17, d18
+; CHECK-i32-NEXT:    fcvtzs w18, d5
+; CHECK-i32-NEXT:    mov z5.d, z21.d[1]
+; CHECK-i32-NEXT:    fcvtzs w2, d21
+; CHECK-i32-NEXT:    fcvtzs w1, d7
+; CHECK-i32-NEXT:    mov z7.d, z21.d[2]
+; CHECK-i32-NEXT:    mov v4.s[1], w15
+; CHECK-i32-NEXT:    fcvtzs w15, d6
+; CHECK-i32-NEXT:    fmov s6, w0
+; CHECK-i32-NEXT:    mov v3.s[1], w12
+; CHECK-i32-NEXT:    fcvtzs w9, d20
+; CHECK-i32-NEXT:    fcvtzs w12, d5
+; CHECK-i32-NEXT:    mov z20.d, z17.d[2]
+; CHECK-i32-NEXT:    fmov s5, w17
+; CHECK-i32-NEXT:    mov z16.d, z16.d[3]
+; CHECK-i32-NEXT:    mov z17.d, z17.d[3]
+; CHECK-i32-NEXT:    mov z18.d, z18.d[3]
+; CHECK-i32-NEXT:    mov v6.s[1], w13
+; CHECK-i32-NEXT:    fcvtzs w13, d7
+; CHECK-i32-NEXT:    fmov s7, w2
+; CHECK-i32-NEXT:    fcvtzs w16, d20
+; CHECK-i32-NEXT:    mov v5.s[1], w18
+; CHECK-i32-NEXT:    mov z19.d, z19.d[3]
+; CHECK-i32-NEXT:    mov z20.d, z21.d[3]
+; CHECK-i32-NEXT:    mov v2.s[2], w11
+; CHECK-i32-NEXT:    mov v3.s[2], w14
+; CHECK-i32-NEXT:    mov v7.s[1], w12
+; CHECK-i32-NEXT:    fcvtzs w11, d16
+; CHECK-i32-NEXT:    fcvtzs w12, d17
+; CHECK-i32-NEXT:    fcvtzs w14, d18
+; CHECK-i32-NEXT:    mov v6.s[2], w15
+; CHECK-i32-NEXT:    fcvtzs w15, d19
+; CHECK-i32-NEXT:    mov v4.s[2], w16
+; CHECK-i32-NEXT:    mov v5.s[2], w1
+; CHECK-i32-NEXT:    mov v0.s[3], w8
+; CHECK-i32-NEXT:    mov v1.s[3], w9
+; CHECK-i32-NEXT:    mov v2.s[3], w10
+; CHECK-i32-NEXT:    mov v7.s[2], w13
+; CHECK-i32-NEXT:    fcvtzs w13, d20
+; CHECK-i32-NEXT:    mov v3.s[3], w11
+; CHECK-i32-NEXT:    mov v6.s[3], w15
+; CHECK-i32-NEXT:    mov v4.s[3], w12
+; CHECK-i32-NEXT:    mov v5.s[3], w14
+; CHECK-i32-NEXT:    mov v7.s[3], w13
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v32f64:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-i64-NEXT:    sub x9, sp, #272
+; CHECK-i64-NEXT:    mov x29, sp
+; CHECK-i64-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK-i64-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-i64-NEXT:    .cfi_offset w30, -8
+; CHECK-i64-NEXT:    .cfi_offset w29, -16
+; CHECK-i64-NEXT:    ptrue p1.d, vl2
+; CHECK-i64-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-i64-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-i64-NEXT:    // kill: def $q3 killed $q3 def $z3
+; CHECK-i64-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-i64-NEXT:    // kill: def $q7 killed $q7 def $z7
+; CHECK-i64-NEXT:    // kill: def $q6 killed $q6 def $z6
+; CHECK-i64-NEXT:    // kill: def $q4 killed $q4 def $z4
+; CHECK-i64-NEXT:    // kill: def $q5 killed $q5 def $z5
+; CHECK-i64-NEXT:    ptrue p0.d, vl4
+; CHECK-i64-NEXT:    splice z0.d, p1, z0.d, z1.d
+; CHECK-i64-NEXT:    splice z2.d, p1, z2.d, z3.d
+; CHECK-i64-NEXT:    splice z4.d, p1, z4.d, z5.d
+; CHECK-i64-NEXT:    splice z6.d, p1, z6.d, z7.d
+; CHECK-i64-NEXT:    ldp q5, q19, [x29, #16]
+; CHECK-i64-NEXT:    movprfx z3, z0
+; CHECK-i64-NEXT:    frintx z3.d, p0/m, z0.d
+; CHECK-i64-NEXT:    movprfx z16, z2
+; CHECK-i64-NEXT:    frintx z16.d, p0/m, z2.d
+; CHECK-i64-NEXT:    frintx z4.d, p0/m, z4.d
+; CHECK-i64-NEXT:    splice z5.d, p1, z5.d, z19.d
+; CHECK-i64-NEXT:    frintx z6.d, p0/m, z6.d
+; CHECK-i64-NEXT:    ldp q2, q17, [x29, #48]
+; CHECK-i64-NEXT:    ldp q0, q1, [x29, #112]
+; CHECK-i64-NEXT:    mov z18.d, z3.d[3]
+; CHECK-i64-NEXT:    mov z7.d, z3.d[2]
+; CHECK-i64-NEXT:    fcvtzs x9, d3
+; CHECK-i64-NEXT:    mov z3.d, z3.d[1]
+; CHECK-i64-NEXT:    mov z20.d, z16.d[3]
+; CHECK-i64-NEXT:    fcvtzs x12, d16
+; CHECK-i64-NEXT:    splice z2.d, p1, z2.d, z17.d
+; CHECK-i64-NEXT:    frintx z5.d, p0/m, z5.d
+; CHECK-i64-NEXT:    splice z0.d, p1, z0.d, z1.d
+; CHECK-i64-NEXT:    fcvtzs x10, d18
+; CHECK-i64-NEXT:    fcvtzs x11, d7
+; CHECK-i64-NEXT:    mov z18.d, z16.d[2]
+; CHECK-i64-NEXT:    mov z7.d, z16.d[1]
+; CHECK-i64-NEXT:    fcvtzs x13, d3
+; CHECK-i64-NEXT:    fcvtzs x14, d20
+; CHECK-i64-NEXT:    str x9, [sp, #128]
+; CHECK-i64-NEXT:    mov z16.d, z4.d[3]
+; CHECK-i64-NEXT:    fcvtzs x9, d18
+; CHECK-i64-NEXT:    mov z18.d, z4.d[2]
+; CHECK-i64-NEXT:    frintx z2.d, p0/m, z2.d
+; CHECK-i64-NEXT:    stp x11, x10, [sp, #144]
+; CHECK-i64-NEXT:    fcvtzs x10, d7
+; CHECK-i64-NEXT:    mov z7.d, z4.d[1]
+; CHECK-i64-NEXT:    str x13, [sp, #136]
+; CHECK-i64-NEXT:    fcvtzs x11, d16
+; CHECK-i64-NEXT:    mov z16.d, z6.d[3]
+; CHECK-i64-NEXT:    fcvtzs x13, d18
+; CHECK-i64-NEXT:    ldp q3, q19, [x29, #80]
+; CHECK-i64-NEXT:    stp x9, x14, [sp, #176]
+; CHECK-i64-NEXT:    fcvtzs x9, d4
+; CHECK-i64-NEXT:    mov z4.d, z6.d[2]
+; CHECK-i64-NEXT:    stp x12, x10, [sp, #160]
+; CHECK-i64-NEXT:    fcvtzs x10, d7
+; CHECK-i64-NEXT:    mov z7.d, z6.d[1]
+; CHECK-i64-NEXT:    fcvtzs x12, d6
+; CHECK-i64-NEXT:    splice z3.d, p1, z3.d, z19.d
+; CHECK-i64-NEXT:    mov z6.d, z5.d[2]
+; CHECK-i64-NEXT:    stp x13, x11, [sp, #208]
+; CHECK-i64-NEXT:    fcvtzs x11, d16
+; CHECK-i64-NEXT:    fcvtzs x13, d4
+; CHECK-i64-NEXT:    mov z4.d, z5.d[3]
+; CHECK-i64-NEXT:    mov z1.d, z5.d[1]
+; CHECK-i64-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-i64-NEXT:    stp x9, x10, [sp, #192]
+; CHECK-i64-NEXT:    fcvtzs x9, d7
+; CHECK-i64-NEXT:    frintx z3.d, p0/m, z3.d
+; CHECK-i64-NEXT:    fcvtzs x10, d4
+; CHECK-i64-NEXT:    stp x13, x11, [sp, #240]
+; CHECK-i64-NEXT:    fcvtzs x11, d6
+; CHECK-i64-NEXT:    mov z4.d, z2.d[3]
+; CHECK-i64-NEXT:    fcvtzs x13, d2
+; CHECK-i64-NEXT:    stp x12, x9, [sp, #224]
+; CHECK-i64-NEXT:    fcvtzs x9, d5
+; CHECK-i64-NEXT:    fcvtzs x12, d1
+; CHECK-i64-NEXT:    mov z5.d, z2.d[2]
+; CHECK-i64-NEXT:    mov z1.d, z2.d[1]
+; CHECK-i64-NEXT:    mov z2.d, z3.d[2]
+; CHECK-i64-NEXT:    stp x11, x10, [sp, #16]
+; CHECK-i64-NEXT:    fcvtzs x10, d4
+; CHECK-i64-NEXT:    mov z4.d, z3.d[3]
+; CHECK-i64-NEXT:    fcvtzs x11, d5
+; CHECK-i64-NEXT:    stp x9, x12, [sp]
+; CHECK-i64-NEXT:    fcvtzs x9, d1
+; CHECK-i64-NEXT:    mov z1.d, z3.d[1]
+; CHECK-i64-NEXT:    fcvtzs x12, d4
+; CHECK-i64-NEXT:    stp x11, x10, [sp, #48]
+; CHECK-i64-NEXT:    fcvtzs x10, d2
+; CHECK-i64-NEXT:    fcvtzs x11, d3
+; CHECK-i64-NEXT:    stp x13, x9, [sp, #32]
+; CHECK-i64-NEXT:    fcvtzs x9, d1
+; CHECK-i64-NEXT:    mov z2.d, z0.d[3]
+; CHECK-i64-NEXT:    mov z3.d, z0.d[2]
+; CHECK-i64-NEXT:    mov z1.d, z0.d[1]
+; CHECK-i64-NEXT:    fcvtzs x13, d2
+; CHECK-i64-NEXT:    stp x10, x12, [sp, #80]
+; CHECK-i64-NEXT:    fcvtzs x12, d0
+; CHECK-i64-NEXT:    fcvtzs x10, d3
+; CHECK-i64-NEXT:    stp x11, x9, [sp, #64]
+; CHECK-i64-NEXT:    fcvtzs x9, d1
+; CHECK-i64-NEXT:    stp x10, x13, [sp, #112]
+; CHECK-i64-NEXT:    add x10, sp, #192
+; CHECK-i64-NEXT:    stp x12, x9, [sp, #96]
+; CHECK-i64-NEXT:    add x9, sp, #128
+; CHECK-i64-NEXT:    ld1d { z0.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    add x9, sp, #160
+; CHECK-i64-NEXT:    ld1d { z2.d }, p0/z, [x10]
+; CHECK-i64-NEXT:    ld1d { z1.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    add x9, sp, #96
+; CHECK-i64-NEXT:    add x10, sp, #224
+; CHECK-i64-NEXT:    ld1d { z3.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    add x9, sp, #64
+; CHECK-i64-NEXT:    ld1d { z4.d }, p0/z, [x10]
+; CHECK-i64-NEXT:    add x10, sp, #32
+; CHECK-i64-NEXT:    ld1d { z5.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    mov x9, sp
+; CHECK-i64-NEXT:    ld1d { z6.d }, p0/z, [x10]
+; CHECK-i64-NEXT:    mov x10, #28 // =0x1c
+; CHECK-i64-NEXT:    ld1d { z7.d }, p0/z, [x9]
+; CHECK-i64-NEXT:    mov x9, #24 // =0x18
+; CHECK-i64-NEXT:    st1d { z3.d }, p0, [x8, x10, lsl #3]
+; CHECK-i64-NEXT:    st1d { z5.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #20 // =0x14
+; CHECK-i64-NEXT:    st1d { z6.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #16 // =0x10
+; CHECK-i64-NEXT:    st1d { z7.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #12 // =0xc
+; CHECK-i64-NEXT:    st1d { z4.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #8 // =0x8
+; CHECK-i64-NEXT:    st1d { z2.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    mov x9, #4 // =0x4
+; CHECK-i64-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
+; CHECK-i64-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-i64-NEXT:    mov sp, x29
+; CHECK-i64-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-i64-NEXT:    ret
+  %a = call <32 x iXLen> @llvm.lrint.v32iXLen.v16f64(<32 x double> %x)
+  ret <32 x iXLen> %a
+}
+declare <32 x iXLen> @llvm.lrint.v32iXLen.v32f64(<32 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-llrint.ll b/llvm/test/CodeGen/AArch64/sve-llrint.ll
new file mode 100644
index 000000000000..a881af161201
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-llrint.ll
@@ -0,0 +1,1763 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve | FileCheck %s
+
+define <vscale x 1 x i64> @llrint_v1i64_v1f16(<vscale x 1 x half> %x) {
+; CHECK-LABEL: llrint_v1i64_v1f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
+; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z3.h
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ret
+  %a = call <vscale x 1 x i64> @llvm.llrint.nxv1i64.nxv1f16(<vscale x 1 x half> %x)
+  ret <vscale x 1 x i64> %a
+}
+declare <vscale x 1 x i64> @llvm.llrint.nxv1i64.nxv1f16(<vscale x 1 x half>)
+
+define <vscale x 2 x i64> @llrint_v1i64_v2f16(<vscale x 2 x half> %x) {
+; CHECK-LABEL: llrint_v1i64_v2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
+; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z3.h
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ret
+  %a = call <vscale x 2 x i64> @llvm.llrint.nxv2i64.nxv2f16(<vscale x 2 x half> %x)
+  ret <vscale x 2 x i64> %a
+}
+declare <vscale x 2 x i64> @llvm.llrint.nxv2i64.nxv2f16(<vscale x 2 x half>)
+
+define <vscale x 4 x i64> @llrint_v4i64_v4f16(<vscale x 4 x half> %x) {
+; CHECK-LABEL: llrint_v4i64_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    frintx z1.h, p0/m, z1.h
+; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    movprfx z4, z1
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z1.h
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z1.h, z3.h
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z0.h, z3.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
+; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
+; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
+; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 4 x i64> @llvm.llrint.nxv4i64.nxv4f16(<vscale x 4 x half> %x)
+  ret <vscale x 4 x i64> %a
+}
+declare <vscale x 4 x i64> @llvm.llrint.nxv4i64.nxv4f16(<vscale x 4 x half>)
+
+define <vscale x 8 x i64> @llrint_v8i64_v8f16(<vscale x 8 x half> %x) {
+; CHECK-LABEL: llrint_v8i64_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    uunpklo z1.s, z0.h
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z4.h, w8
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z6.h, w8
+; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
+; CHECK-NEXT:    uunpklo z2.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z3.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    frintx z2.h, p0/m, z2.h
+; CHECK-NEXT:    frintx z1.h, p0/m, z1.h
+; CHECK-NEXT:    frintx z3.h, p0/m, z3.h
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    frintx z5.h, p0/m, z0.h
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p1.h, p0/z, z2.h, z4.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z1.h, z4.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z3.h, z4.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z5.h, z4.h
+; CHECK-NEXT:    movprfx z4, z2
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z2.h
+; CHECK-NEXT:    movprfx z7, z1
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z1.h
+; CHECK-NEXT:    movprfx z24, z3
+; CHECK-NEXT:    fcvtzs z24.d, p0/m, z3.h
+; CHECK-NEXT:    movprfx z25, z5
+; CHECK-NEXT:    fcvtzs z25.d, p0/m, z5.h
+; CHECK-NEXT:    fcmgt p7.h, p0/z, z3.h, z6.h
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z2.h, z6.h
+; CHECK-NEXT:    fcmgt p6.h, p0/z, z1.h, z6.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    mov z4.d, p1/m, z0.d
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z5.h, z6.h
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    sel z6.d, p2, z0.d, z7.d
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z2.h, z2.h
+; CHECK-NEXT:    sel z7.d, p3, z0.d, z24.d
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    sel z24.d, p4, z0.d, z25.d
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z3.h, z3.h
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z5.h, z5.h
+; CHECK-NEXT:    sel z0.d, p5, z26.d, z4.d
+; CHECK-NEXT:    sel z1.d, p6, z26.d, z6.d
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z2.d, p7, z26.d, z7.d
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z3.d, p1, z26.d, z24.d
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 8 x i64> @llvm.llrint.nxv8i64.nxv8f16(<vscale x 8 x half> %x)
+  ret <vscale x 8 x i64> %a
+}
+declare <vscale x 8 x i64> @llvm.llrint.nxv8i64.nxv8f16(<vscale x 8 x half>)
+
+define <vscale x 16 x i64> @llrint_v16i64_v16f16(<vscale x 16 x half> %x) {
+; CHECK-LABEL: llrint_v16i64_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    uunpklo z2.s, z0.h
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    uunpklo z4.s, z1.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-NEXT:    mov z5.h, w8
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z25.d, #0x8000000000000000
+; CHECK-NEXT:    mov z27.h, w8
+; CHECK-NEXT:    mov z7.d, #0x7fffffffffffffff
+; CHECK-NEXT:    uunpklo z3.d, z2.s
+; CHECK-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEXT:    uunpklo z6.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    uunpklo z24.d, z4.s
+; CHECK-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-NEXT:    uunpklo z26.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    frintx z2.h, p0/m, z2.h
+; CHECK-NEXT:    frintx z3.h, p0/m, z3.h
+; CHECK-NEXT:    frintx z6.h, p0/m, z6.h
+; CHECK-NEXT:    movprfx z28, z0
+; CHECK-NEXT:    frintx z28.h, p0/m, z0.h
+; CHECK-NEXT:    movprfx z29, z4
+; CHECK-NEXT:    frintx z29.h, p0/m, z4.h
+; CHECK-NEXT:    frintx z24.h, p0/m, z24.h
+; CHECK-NEXT:    movprfx z30, z1
+; CHECK-NEXT:    frintx z30.h, p0/m, z1.h
+; CHECK-NEXT:    frintx z26.h, p0/m, z26.h
+; CHECK-NEXT:    fcmge p5.h, p0/z, z2.h, z5.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, z5.h
+; CHECK-NEXT:    movprfx z1, z2
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z2.h
+; CHECK-NEXT:    movprfx z0, z3
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z3.h
+; CHECK-NEXT:    fcmge p6.h, p0/z, z6.h, z5.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z3.h, z27.h
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z3.h, z3.h
+; CHECK-NEXT:    fcmge p7.h, p0/z, z28.h, z5.h
+; CHECK-NEXT:    movprfx z3, z6
+; CHECK-NEXT:    fcvtzs z3.d, p0/m, z6.h
+; CHECK-NEXT:    fcmge p8.h, p0/z, z24.h, z5.h
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z2.h, z27.h
+; CHECK-NEXT:    fcmge p9.h, p0/z, z26.h, z5.h
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    movprfx z4, z24
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z24.h
+; CHECK-NEXT:    fcmge p10.h, p0/z, z30.h, z5.h
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    movprfx z31, z26
+; CHECK-NEXT:    fcvtzs z31.d, p0/m, z26.h
+; CHECK-NEXT:    movprfx z8, z30
+; CHECK-NEXT:    fcvtzs z8.d, p0/m, z30.h
+; CHECK-NEXT:    mov z1.d, p5/m, z25.d
+; CHECK-NEXT:    fcmge p5.h, p0/z, z29.h, z5.h
+; CHECK-NEXT:    not p6.b, p0/z, p6.b
+; CHECK-NEXT:    mov z0.d, p2/m, z25.d
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z2.h, z2.h
+; CHECK-NEXT:    movprfx z2, z28
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z28.h
+; CHECK-NEXT:    movprfx z5, z29
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z29.h
+; CHECK-NEXT:    not p7.b, p0/z, p7.b
+; CHECK-NEXT:    mov z3.d, p6/m, z25.d
+; CHECK-NEXT:    not p6.b, p0/z, p8.b
+; CHECK-NEXT:    fcmgt p8.h, p0/z, z6.h, z27.h
+; CHECK-NEXT:    mov z1.d, p4/m, z7.d
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    mov z0.d, p3/m, z7.d
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z29.h, z27.h
+; CHECK-NEXT:    sel z9.d, p7, z25.d, z2.d
+; CHECK-NEXT:    not p7.b, p0/z, p9.b
+; CHECK-NEXT:    mov z4.d, p6/m, z25.d
+; CHECK-NEXT:    not p6.b, p0/z, p10.b
+; CHECK-NEXT:    fcmgt p10.h, p0/z, z28.h, z27.h
+; CHECK-NEXT:    mov z5.d, p5/m, z25.d
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z24.h, z27.h
+; CHECK-NEXT:    fcmuo p9.h, p0/z, z6.h, z6.h
+; CHECK-NEXT:    sel z6.d, p7, z25.d, z31.d
+; CHECK-NEXT:    sel z25.d, p6, z25.d, z8.d
+; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    fcmgt p6.h, p0/z, z26.h, z27.h
+; CHECK-NEXT:    fcmgt p7.h, p0/z, z30.h, z27.h
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z28.h, z28.h
+; CHECK-NEXT:    sel z2.d, p8, z7.d, z3.d
+; CHECK-NEXT:    sel z3.d, p10, z7.d, z9.d
+; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    fcmuo p8.h, p0/z, z29.h, z29.h
+; CHECK-NEXT:    mov z4.d, p5/m, z7.d
+; CHECK-NEXT:    fcmuo p5.h, p0/z, z24.h, z24.h
+; CHECK-NEXT:    fcmuo p10.h, p0/z, z26.h, z26.h
+; CHECK-NEXT:    mov z5.d, p3/m, z7.d
+; CHECK-NEXT:    mov z6.d, p6/m, z7.d
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z30.h, z30.h
+; CHECK-NEXT:    sel z7.d, p7, z7.d, z25.d
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p9/m, #0 // =0x0
+; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z3.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z4.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    mov z5.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z6.d, p10/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z7.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 16 x i64> @llvm.llrint.nxv16i64.nxv16f16(<vscale x 16 x half> %x)
+  ret <vscale x 16 x i64> %a
+}
+declare <vscale x 16 x i64> @llvm.llrint.nxv16i64.nxv16f16(<vscale x 16 x half>)
+
+define <vscale x 32 x i64> @llrint_v32i64_v32f16(<vscale x 32 x half> %x) {
+; CHECK-LABEL: llrint_v32i64_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-17
+; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 136 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT:    uunpkhi z5.s, z0.h
+; CHECK-NEXT:    uunpklo z4.s, z0.h
+; CHECK-NEXT:    mov w9, #64511 // =0xfbff
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z6.s, z1.h
+; CHECK-NEXT:    mov z30.h, w9
+; CHECK-NEXT:    uunpkhi z10.s, z1.h
+; CHECK-NEXT:    mov w9, #31743 // =0x7bff
+; CHECK-NEXT:    mov z29.d, #0x8000000000000000
+; CHECK-NEXT:    uunpklo z8.s, z2.h
+; CHECK-NEXT:    uunpkhi z13.s, z3.h
+; CHECK-NEXT:    uunpklo z18.s, z3.h
+; CHECK-NEXT:    uunpklo z7.d, z5.s
+; CHECK-NEXT:    uunpklo z0.d, z4.s
+; CHECK-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-NEXT:    uunpkhi z24.d, z5.s
+; CHECK-NEXT:    uunpklo z25.d, z6.s
+; CHECK-NEXT:    uunpkhi z26.d, z6.s
+; CHECK-NEXT:    uunpklo z27.d, z10.s
+; CHECK-NEXT:    uunpkhi z10.d, z10.s
+; CHECK-NEXT:    uunpklo z12.d, z8.s
+; CHECK-NEXT:    uunpkhi z16.d, z8.s
+; CHECK-NEXT:    movprfx z5, z7
+; CHECK-NEXT:    frintx z5.h, p0/m, z7.h
+; CHECK-NEXT:    movprfx z1, z4
+; CHECK-NEXT:    frintx z1.h, p0/m, z4.h
+; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
+; CHECK-NEXT:    movprfx z6, z24
+; CHECK-NEXT:    frintx z6.h, p0/m, z24.h
+; CHECK-NEXT:    movprfx z24, z25
+; CHECK-NEXT:    frintx z24.h, p0/m, z25.h
+; CHECK-NEXT:    movprfx z25, z26
+; CHECK-NEXT:    frintx z25.h, p0/m, z26.h
+; CHECK-NEXT:    movprfx z28, z27
+; CHECK-NEXT:    frintx z28.h, p0/m, z27.h
+; CHECK-NEXT:    movprfx z8, z10
+; CHECK-NEXT:    frintx z8.h, p0/m, z10.h
+; CHECK-NEXT:    mov z7.h, w9
+; CHECK-NEXT:    mov z4.d, #0x7fffffffffffffff
+; CHECK-NEXT:    rdvl x9, #15
+; CHECK-NEXT:    fcmge p3.h, p0/z, z5.h, z30.h
+; CHECK-NEXT:    movprfx z11, z5
+; CHECK-NEXT:    fcvtzs z11.d, p0/m, z5.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z1.h, z30.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z30.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z6.h, z30.h
+; CHECK-NEXT:    movprfx z9, z6
+; CHECK-NEXT:    fcvtzs z9.d, p0/m, z6.h
+; CHECK-NEXT:    movprfx z15, z25
+; CHECK-NEXT:    fcvtzs z15.d, p0/m, z25.h
+; CHECK-NEXT:    movprfx z14, z24
+; CHECK-NEXT:    fcvtzs z14.d, p0/m, z24.h
+; CHECK-NEXT:    movprfx z26, z0
+; CHECK-NEXT:    fcvtzs z26.d, p0/m, z0.h
+; CHECK-NEXT:    movprfx z19, z28
+; CHECK-NEXT:    fcvtzs z19.d, p0/m, z28.h
+; CHECK-NEXT:    movprfx z31, z1
+; CHECK-NEXT:    fcvtzs z31.d, p0/m, z1.h
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    not p6.b, p0/z, p2.b
+; CHECK-NEXT:    fcmge p2.h, p0/z, z25.h, z30.h
+; CHECK-NEXT:    sel z27.d, p3, z29.d, z11.d
+; CHECK-NEXT:    uunpkhi z11.s, z2.h
+; CHECK-NEXT:    not p5.b, p0/z, p1.b
+; CHECK-NEXT:    fcmge p1.h, p0/z, z24.h, z30.h
+; CHECK-NEXT:    not p3.b, p0/z, p4.b
+; CHECK-NEXT:    fcmge p4.h, p0/z, z28.h, z30.h
+; CHECK-NEXT:    mov z26.d, p5/m, z29.d
+; CHECK-NEXT:    mov z31.d, p6/m, z29.d
+; CHECK-NEXT:    sel z2.d, p3, z29.d, z9.d
+; CHECK-NEXT:    movprfx z9, z12
+; CHECK-NEXT:    frintx z9.h, p0/m, z12.h
+; CHECK-NEXT:    uunpkhi z12.d, z13.s
+; CHECK-NEXT:    uunpklo z17.d, z11.s
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    sel z3.d, p2, z29.d, z15.d
+; CHECK-NEXT:    uunpklo z15.d, z13.s
+; CHECK-NEXT:    fcmge p2.h, p0/z, z8.h, z30.h
+; CHECK-NEXT:    sel z10.d, p1, z29.d, z14.d
+; CHECK-NEXT:    movprfx z14, z16
+; CHECK-NEXT:    frintx z14.h, p0/m, z16.h
+; CHECK-NEXT:    uunpkhi z16.d, z18.s
+; CHECK-NEXT:    movprfx z13, z17
+; CHECK-NEXT:    frintx z13.h, p0/m, z17.h
+; CHECK-NEXT:    movprfx z20, z12
+; CHECK-NEXT:    frintx z20.h, p0/m, z12.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z9.h, z30.h
+; CHECK-NEXT:    uunpkhi z17.d, z11.s
+; CHECK-NEXT:    uunpklo z18.d, z18.s
+; CHECK-NEXT:    movprfx z12, z8
+; CHECK-NEXT:    fcvtzs z12.d, p0/m, z8.h
+; CHECK-NEXT:    movprfx z21, z15
+; CHECK-NEXT:    frintx z21.h, p0/m, z15.h
+; CHECK-NEXT:    not p1.b, p0/z, p4.b
+; CHECK-NEXT:    movprfx z15, z9
+; CHECK-NEXT:    fcvtzs z15.d, p0/m, z9.h
+; CHECK-NEXT:    frintx z16.h, p0/m, z16.h
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    movprfx z22, z14
+; CHECK-NEXT:    fcvtzs z22.d, p0/m, z14.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z13.h, z30.h
+; CHECK-NEXT:    fcmge p5.h, p0/z, z20.h, z30.h
+; CHECK-NEXT:    sel z11.d, p1, z29.d, z19.d
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    frintx z17.h, p0/m, z17.h
+; CHECK-NEXT:    frintx z18.h, p0/m, z18.h
+; CHECK-NEXT:    movprfx z19, z20
+; CHECK-NEXT:    fcvtzs z19.d, p0/m, z20.h
+; CHECK-NEXT:    mov z12.d, p2/m, z29.d
+; CHECK-NEXT:    fcmge p2.h, p0/z, z21.h, z30.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z14.h, z30.h
+; CHECK-NEXT:    mov z15.d, p3/m, z29.d
+; CHECK-NEXT:    movprfx z23, z21
+; CHECK-NEXT:    fcvtzs z23.d, p0/m, z21.h
+; CHECK-NEXT:    not p3.b, p0/z, p4.b
+; CHECK-NEXT:    fcmge p4.h, p0/z, z16.h, z30.h
+; CHECK-NEXT:    fcmgt p8.h, p0/z, z21.h, z7.h
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    fcmge p6.h, p0/z, z17.h, z30.h
+; CHECK-NEXT:    fcmge p7.h, p0/z, z18.h, z30.h
+; CHECK-NEXT:    movprfx z30, z16
+; CHECK-NEXT:    fcvtzs z30.d, p0/m, z16.h
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    fcmuo p9.h, p0/z, z21.h, z21.h
+; CHECK-NEXT:    mov z19.d, p5/m, z29.d
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z20.h, z7.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z23.d, p2/m, z29.d
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z20.h, z20.h
+; CHECK-NEXT:    movprfx z20, z18
+; CHECK-NEXT:    fcvtzs z20.d, p0/m, z18.h
+; CHECK-NEXT:    movprfx z21, z13
+; CHECK-NEXT:    fcvtzs z21.d, p0/m, z13.h
+; CHECK-NEXT:    mov z22.d, p1/m, z29.d
+; CHECK-NEXT:    not p1.b, p0/z, p7.b
+; CHECK-NEXT:    mov z30.d, p4/m, z29.d
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z18.h, z7.h
+; CHECK-NEXT:    mov z19.d, p5/m, z4.d
+; CHECK-NEXT:    fcmuo p7.h, p0/z, z18.h, z18.h
+; CHECK-NEXT:    movprfx z18, z17
+; CHECK-NEXT:    fcvtzs z18.d, p0/m, z17.h
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z16.h, z7.h
+; CHECK-NEXT:    not p6.b, p0/z, p6.b
+; CHECK-NEXT:    mov z23.d, p8/m, z4.d
+; CHECK-NEXT:    mov z20.d, p1/m, z29.d
+; CHECK-NEXT:    mov z21.d, p3/m, z29.d
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z16.h, z16.h
+; CHECK-NEXT:    mov z19.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z17.h, z7.h
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    sel z29.d, p6, z29.d, z18.d
+; CHECK-NEXT:    mov z23.d, p9/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p6.h, p0/z, z14.h, z7.h
+; CHECK-NEXT:    mov z30.d, p5/m, z4.d
+; CHECK-NEXT:    sel z16.d, p4, z4.d, z20.d
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z17.h, z17.h
+; CHECK-NEXT:    st1b { z19.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #14
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z1.h, z7.h
+; CHECK-NEXT:    st1b { z23.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #13
+; CHECK-NEXT:    mov z29.d, p2/m, z4.d
+; CHECK-NEXT:    mov z30.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z13.h, z7.h
+; CHECK-NEXT:    mov z16.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z9.h, z7.h
+; CHECK-NEXT:    fcmuo p7.h, p0/z, z14.h, z14.h
+; CHECK-NEXT:    mov z29.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z13.h, z13.h
+; CHECK-NEXT:    st1b { z30.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #12
+; CHECK-NEXT:    sel z30.d, p5, z4.d, z31.d
+; CHECK-NEXT:    st1b { z16.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #11
+; CHECK-NEXT:    sel z31.d, p3, z4.d, z21.d
+; CHECK-NEXT:    st1b { z29.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #10
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z24.h, z7.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z28.h, z7.h
+; CHECK-NEXT:    sel z13.d, p2, z4.d, z15.d
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z9.h, z9.h
+; CHECK-NEXT:    sel z29.d, p6, z4.d, z22.d
+; CHECK-NEXT:    mov z31.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z8.h, z7.h
+; CHECK-NEXT:    fcmgt p6.h, p0/z, z5.h, z7.h
+; CHECK-NEXT:    sel z9.d, p5, z4.d, z10.d
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z6.h, z7.h
+; CHECK-NEXT:    st1b { z31.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #9
+; CHECK-NEXT:    mov z29.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    sel z10.d, p3, z4.d, z11.d
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z25.h, z7.h
+; CHECK-NEXT:    mov z13.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p7.h, p0/z, z8.h, z8.h
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z28.h, z28.h
+; CHECK-NEXT:    sel z28.d, p4, z4.d, z12.d
+; CHECK-NEXT:    st1b { z29.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #8
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z25.h, z25.h
+; CHECK-NEXT:    st1b { z13.b }, p1, [x8, x9]
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z24.h, z24.h
+; CHECK-NEXT:    mov z2.d, p5/m, z4.d
+; CHECK-NEXT:    mov z3.d, p3/m, z4.d
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z0.h, z7.h
+; CHECK-NEXT:    mov z28.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p7.h, p0/z, z6.h, z6.h
+; CHECK-NEXT:    mov z10.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z5.h, z5.h
+; CHECK-NEXT:    sel z5.d, p6, z4.d, z27.d
+; CHECK-NEXT:    mov z3.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    mov z9.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z28.d }, p0, [x8, #7, mul vl]
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z0.h, z0.h
+; CHECK-NEXT:    sel z0.d, p3, z4.d, z26.d
+; CHECK-NEXT:    st1d { z10.d }, p0, [x8, #6, mul vl]
+; CHECK-NEXT:    mov z2.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z3.d }, p0, [x8, #5, mul vl]
+; CHECK-NEXT:    mov z5.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z9.d }, p0, [x8, #4, mul vl]
+; CHECK-NEXT:    mov z30.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z2.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z30.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #17
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 32 x i64> @llvm.llrint.nxv32i64.nxv32f16(<vscale x 32 x half> %x)
+  ret <vscale x 32 x i64> %a
+}
+declare <vscale x 32 x i64> @llvm.llrint.nxv32i64.nxv32f16(<vscale x 32 x half>)
+
+define <vscale x 1 x i64> @llrint_v1i64_v1f32(<vscale x 1 x float> %x) {
+; CHECK-LABEL: llrint_v1i64_v1f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
+; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z3.s
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ret
+  %a = call <vscale x 1 x i64> @llvm.llrint.nxv1i64.nxv1f32(<vscale x 1 x float> %x)
+  ret <vscale x 1 x i64> %a
+}
+declare <vscale x 1 x i64> @llvm.llrint.nxv1i64.nxv1f32(<vscale x 1 x float>)
+
+define <vscale x 2 x i64> @llrint_v2i64_v2f32(<vscale x 2 x float> %x) {
+; CHECK-LABEL: llrint_v2i64_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
+; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z3.s
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ret
+  %a = call <vscale x 2 x i64> @llvm.llrint.nxv2i64.nxv2f32(<vscale x 2 x float> %x)
+  ret <vscale x 2 x i64> %a
+}
+declare <vscale x 2 x i64> @llvm.llrint.nxv2i64.nxv2f32(<vscale x 2 x float>)
+
+define <vscale x 4 x i64> @llrint_v4i64_v4f32(<vscale x 4 x float> %x) {
+; CHECK-LABEL: llrint_v4i64_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
+; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    frintx z1.s, p0/m, z1.s
+; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    movprfx z4, z1
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z1.s
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.s
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z1.s, z3.s
+; CHECK-NEXT:    fcmgt p4.s, p0/z, z0.s, z3.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
+; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
+; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 4 x i64> @llvm.llrint.nxv4i64.nxv4f32(<vscale x 4 x float> %x)
+  ret <vscale x 4 x i64> %a
+}
+declare <vscale x 4 x i64> @llvm.llrint.nxv4i64.nxv4f32(<vscale x 4 x float>)
+
+define <vscale x 8 x i64> @llrint_v8i64_v8f32(<vscale x 8 x float> %x) {
+; CHECK-LABEL: llrint_v8i64_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    uunpklo z2.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
+; CHECK-NEXT:    uunpklo z3.d, z1.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
+; CHECK-NEXT:    mov z5.d, #0x8000000000000000
+; CHECK-NEXT:    mov z6.s, w8
+; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
+; CHECK-NEXT:    frintx z2.s, p0/m, z2.s
+; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    frintx z3.s, p0/m, z3.s
+; CHECK-NEXT:    frintx z1.s, p0/m, z1.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z2.s, z4.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z4.s
+; CHECK-NEXT:    movprfx z7, z0
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z0.s
+; CHECK-NEXT:    fcmge p3.s, p0/z, z3.s, z4.s
+; CHECK-NEXT:    fcmge p4.s, p0/z, z1.s, z4.s
+; CHECK-NEXT:    movprfx z4, z2
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z2.s
+; CHECK-NEXT:    movprfx z24, z3
+; CHECK-NEXT:    fcvtzs z24.d, p0/m, z3.s
+; CHECK-NEXT:    movprfx z25, z1
+; CHECK-NEXT:    fcvtzs z25.d, p0/m, z1.s
+; CHECK-NEXT:    fcmgt p7.s, p0/z, z3.s, z6.s
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z2.s, z6.s
+; CHECK-NEXT:    fcmgt p6.s, p0/z, z0.s, z6.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    mov z4.d, p1/m, z5.d
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z1.s, z6.s
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    sel z6.d, p2, z5.d, z7.d
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z2.s, z2.s
+; CHECK-NEXT:    sel z7.d, p3, z5.d, z24.d
+; CHECK-NEXT:    fcmuo p3.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    sel z5.d, p4, z5.d, z25.d
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z3.s, z3.s
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    sel z0.d, p5, z26.d, z4.d
+; CHECK-NEXT:    sel z1.d, p6, z26.d, z6.d
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z2.d, p7, z26.d, z7.d
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z3.d, p1, z26.d, z5.d
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 8 x i64> @llvm.llrint.nxv8i64.nxv8f32(<vscale x 8 x float> %x)
+  ret <vscale x 8 x i64> %a
+}
+declare <vscale x 8 x i64> @llvm.llrint.nxv8i64.nxv8f32(<vscale x 8 x float>)
+
+define <vscale x 16 x i64> @llrint_v16i64_v16f32(<vscale x 16 x float> %x) {
+; CHECK-LABEL: llrint_v16i64_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    uunpklo z4.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z7.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z24.d, z2.s
+; CHECK-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEXT:    uunpklo z25.d, z3.s
+; CHECK-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
+; CHECK-NEXT:    movprfx z5, z4
+; CHECK-NEXT:    frintx z5.s, p0/m, z4.s
+; CHECK-NEXT:    movprfx z6, z0
+; CHECK-NEXT:    frintx z6.s, p0/m, z0.s
+; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    frintx z7.s, p0/m, z7.s
+; CHECK-NEXT:    movprfx z28, z1
+; CHECK-NEXT:    frintx z28.s, p0/m, z1.s
+; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z24.s, p0/m, z24.s
+; CHECK-NEXT:    movprfx z29, z2
+; CHECK-NEXT:    frintx z29.s, p0/m, z2.s
+; CHECK-NEXT:    frintx z25.s, p0/m, z25.s
+; CHECK-NEXT:    movprfx z30, z3
+; CHECK-NEXT:    frintx z30.s, p0/m, z3.s
+; CHECK-NEXT:    mov z27.s, w8
+; CHECK-NEXT:    fcmge p1.s, p0/z, z5.s, z4.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z6.s, z4.s
+; CHECK-NEXT:    movprfx z1, z5
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z5.s
+; CHECK-NEXT:    movprfx z2, z6
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z6.s
+; CHECK-NEXT:    fcmge p5.s, p0/z, z7.s, z4.s
+; CHECK-NEXT:    fcmge p6.s, p0/z, z28.s, z4.s
+; CHECK-NEXT:    movprfx z3, z7
+; CHECK-NEXT:    fcvtzs z3.d, p0/m, z7.s
+; CHECK-NEXT:    fcmge p8.s, p0/z, z29.s, z4.s
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z5.s, z27.s
+; CHECK-NEXT:    fcmgt p7.s, p0/z, z6.s, z27.s
+; CHECK-NEXT:    fcmge p9.s, p0/z, z25.s, z4.s
+; CHECK-NEXT:    movprfx z31, z25
+; CHECK-NEXT:    fcvtzs z31.d, p0/m, z25.s
+; CHECK-NEXT:    not p4.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z5.s, z5.s
+; CHECK-NEXT:    movprfx z5, z28
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z28.s
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    fcmge p10.s, p0/z, z30.s, z4.s
+; CHECK-NEXT:    movprfx z8, z30
+; CHECK-NEXT:    fcvtzs z8.d, p0/m, z30.s
+; CHECK-NEXT:    mov z1.d, p4/m, z0.d
+; CHECK-NEXT:    fcmge p4.s, p0/z, z24.s, z4.s
+; CHECK-NEXT:    movprfx z4, z29
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z29.s
+; CHECK-NEXT:    mov z2.d, p2/m, z0.d
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z6.s, z6.s
+; CHECK-NEXT:    movprfx z6, z24
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z24.s
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    not p6.b, p0/z, p6.b
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z3.d, p5/m, z0.d
+; CHECK-NEXT:    not p5.b, p0/z, p8.b
+; CHECK-NEXT:    mov z5.d, p6/m, z0.d
+; CHECK-NEXT:    fcmgt p8.s, p0/z, z7.s, z27.s
+; CHECK-NEXT:    not p6.b, p0/z, p9.b
+; CHECK-NEXT:    mov z6.d, p4/m, z0.d
+; CHECK-NEXT:    fcmuo p9.s, p0/z, z7.s, z7.s
+; CHECK-NEXT:    not p4.b, p0/z, p10.b
+; CHECK-NEXT:    fcmgt p10.s, p0/z, z28.s, z27.s
+; CHECK-NEXT:    sel z7.d, p5, z0.d, z4.d
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z24.s, z27.s
+; CHECK-NEXT:    mov z31.d, p6/m, z0.d
+; CHECK-NEXT:    fcmgt p6.s, p0/z, z30.s, z27.s
+; CHECK-NEXT:    mov z8.d, p4/m, z0.d
+; CHECK-NEXT:    sel z0.d, p3, z26.d, z1.d
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z29.s, z27.s
+; CHECK-NEXT:    fcmgt p4.s, p0/z, z25.s, z27.s
+; CHECK-NEXT:    sel z1.d, p7, z26.d, z2.d
+; CHECK-NEXT:    fcmuo p7.s, p0/z, z28.s, z28.s
+; CHECK-NEXT:    sel z2.d, p8, z26.d, z3.d
+; CHECK-NEXT:    sel z3.d, p10, z26.d, z5.d
+; CHECK-NEXT:    fcmuo p8.s, p0/z, z29.s, z29.s
+; CHECK-NEXT:    sel z4.d, p5, z26.d, z6.d
+; CHECK-NEXT:    fcmuo p5.s, p0/z, z24.s, z24.s
+; CHECK-NEXT:    fcmuo p10.s, p0/z, z25.s, z25.s
+; CHECK-NEXT:    sel z5.d, p3, z26.d, z7.d
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z30.s, z30.s
+; CHECK-NEXT:    sel z7.d, p6, z26.d, z8.d
+; CHECK-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z6.d, p4, z26.d, z31.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p9/m, #0 // =0x0
+; CHECK-NEXT:    mov z3.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z4.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    mov z5.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z6.d, p10/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z7.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 16 x i64> @llvm.llrint.nxv16i64.nxv16f32(<vscale x 16 x float> %x)
+  ret <vscale x 16 x i64> %a
+}
+declare <vscale x 16 x i64> @llvm.llrint.nxv16i64.nxv16f32(<vscale x 16 x float>)
+
+define <vscale x 32 x i64> @llrint_v32i64_v32f32(<vscale x 32 x float> %x) {
+; CHECK-LABEL: llrint_v32i64_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-17
+; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 136 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT:    uunpklo z24.d, z0.s
+; CHECK-NEXT:    uunpkhi z25.d, z0.s
+; CHECK-NEXT:    mov w9, #-553648128 // =0xdf000000
+; CHECK-NEXT:    uunpklo z26.d, z1.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpkhi z27.d, z1.s
+; CHECK-NEXT:    mov z31.s, w9
+; CHECK-NEXT:    mov w9, #1593835519 // =0x5effffff
+; CHECK-NEXT:    uunpklo z28.d, z2.s
+; CHECK-NEXT:    mov z8.d, #0x8000000000000000
+; CHECK-NEXT:    uunpklo z30.d, z3.s
+; CHECK-NEXT:    uunpklo z13.d, z4.s
+; CHECK-NEXT:    movprfx z0, z24
+; CHECK-NEXT:    frintx z0.s, p0/m, z24.s
+; CHECK-NEXT:    movprfx z1, z25
+; CHECK-NEXT:    frintx z1.s, p0/m, z25.s
+; CHECK-NEXT:    uunpkhi z15.d, z4.s
+; CHECK-NEXT:    movprfx z24, z26
+; CHECK-NEXT:    frintx z24.s, p0/m, z26.s
+; CHECK-NEXT:    uunpkhi z26.d, z2.s
+; CHECK-NEXT:    movprfx z25, z27
+; CHECK-NEXT:    frintx z25.s, p0/m, z27.s
+; CHECK-NEXT:    movprfx z27, z28
+; CHECK-NEXT:    frintx z27.s, p0/m, z28.s
+; CHECK-NEXT:    uunpklo z16.d, z5.s
+; CHECK-NEXT:    uunpkhi z17.d, z7.s
+; CHECK-NEXT:    frintx z30.s, p0/m, z30.s
+; CHECK-NEXT:    uunpklo z18.d, z7.s
+; CHECK-NEXT:    uunpklo z21.d, z6.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z31.s
+; CHECK-NEXT:    movprfx z9, z0
+; CHECK-NEXT:    fcvtzs z9.d, p0/m, z0.s
+; CHECK-NEXT:    movprfx z10, z1
+; CHECK-NEXT:    fcvtzs z10.d, p0/m, z1.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z1.s, z31.s
+; CHECK-NEXT:    fcmge p3.s, p0/z, z24.s, z31.s
+; CHECK-NEXT:    movprfx z11, z24
+; CHECK-NEXT:    fcvtzs z11.d, p0/m, z24.s
+; CHECK-NEXT:    movprfx z29, z26
+; CHECK-NEXT:    frintx z29.s, p0/m, z26.s
+; CHECK-NEXT:    fcmge p4.s, p0/z, z25.s, z31.s
+; CHECK-NEXT:    fcmge p5.s, p0/z, z27.s, z31.s
+; CHECK-NEXT:    movprfx z12, z27
+; CHECK-NEXT:    fcvtzs z12.d, p0/m, z27.s
+; CHECK-NEXT:    movprfx z19, z30
+; CHECK-NEXT:    fcvtzs z19.d, p0/m, z30.s
+; CHECK-NEXT:    movprfx z7, z16
+; CHECK-NEXT:    frintx z7.s, p0/m, z16.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    frintx z17.s, p0/m, z17.s
+; CHECK-NEXT:    uunpkhi z16.d, z5.s
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    frintx z18.s, p0/m, z18.s
+; CHECK-NEXT:    mov z28.s, w9
+; CHECK-NEXT:    not p6.b, p0/z, p3.b
+; CHECK-NEXT:    sel z26.d, p1, z8.d, z9.d
+; CHECK-NEXT:    movprfx z14, z29
+; CHECK-NEXT:    fcvtzs z14.d, p0/m, z29.s
+; CHECK-NEXT:    sel z9.d, p2, z8.d, z10.d
+; CHECK-NEXT:    uunpkhi z10.d, z3.s
+; CHECK-NEXT:    rdvl x9, #15
+; CHECK-NEXT:    sel z3.d, p6, z8.d, z11.d
+; CHECK-NEXT:    movprfx z11, z25
+; CHECK-NEXT:    fcvtzs z11.d, p0/m, z25.s
+; CHECK-NEXT:    fcmge p3.s, p0/z, z29.s, z31.s
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    fcmge p1.s, p0/z, z30.s, z31.s
+; CHECK-NEXT:    movprfx z23, z18
+; CHECK-NEXT:    fcvtzs z23.d, p0/m, z18.s
+; CHECK-NEXT:    not p2.b, p0/z, p5.b
+; CHECK-NEXT:    fcmge p5.s, p0/z, z17.s, z31.s
+; CHECK-NEXT:    frintx z16.s, p0/m, z16.s
+; CHECK-NEXT:    frintx z10.s, p0/m, z10.s
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p8.s, p0/z, z18.s, z28.s
+; CHECK-NEXT:    sel z4.d, p4, z8.d, z11.d
+; CHECK-NEXT:    movprfx z11, z13
+; CHECK-NEXT:    frintx z11.s, p0/m, z13.s
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    sel z13.d, p2, z8.d, z12.d
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmge p4.s, p0/z, z7.s, z31.s
+; CHECK-NEXT:    sel z12.d, p3, z8.d, z14.d
+; CHECK-NEXT:    movprfx z14, z15
+; CHECK-NEXT:    frintx z14.s, p0/m, z15.s
+; CHECK-NEXT:    uunpkhi z15.d, z6.s
+; CHECK-NEXT:    movprfx z20, z10
+; CHECK-NEXT:    fcvtzs z20.d, p0/m, z10.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z10.s, z31.s
+; CHECK-NEXT:    sel z5.d, p1, z8.d, z19.d
+; CHECK-NEXT:    movprfx z19, z11
+; CHECK-NEXT:    fcvtzs z19.d, p0/m, z11.s
+; CHECK-NEXT:    fcmge p3.s, p0/z, z11.s, z31.s
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    fcmge p6.s, p0/z, z16.s, z31.s
+; CHECK-NEXT:    fcmuo p9.s, p0/z, z18.s, z18.s
+; CHECK-NEXT:    movprfx z22, z15
+; CHECK-NEXT:    frintx z22.s, p0/m, z15.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z14.s, z31.s
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    sel z6.d, p2, z8.d, z20.d
+; CHECK-NEXT:    movprfx z20, z21
+; CHECK-NEXT:    frintx z20.s, p0/m, z21.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z18.s, z31.s
+; CHECK-NEXT:    sel z15.d, p3, z8.d, z19.d
+; CHECK-NEXT:    movprfx z19, z17
+; CHECK-NEXT:    fcvtzs z19.d, p0/m, z17.s
+; CHECK-NEXT:    not p3.b, p0/z, p4.b
+; CHECK-NEXT:    fcmge p4.s, p0/z, z22.s, z31.s
+; CHECK-NEXT:    movprfx z21, z14
+; CHECK-NEXT:    fcvtzs z21.d, p0/m, z14.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    movprfx z18, z7
+; CHECK-NEXT:    fcvtzs z18.d, p0/m, z7.s
+; CHECK-NEXT:    not p6.b, p0/z, p6.b
+; CHECK-NEXT:    fcmge p7.s, p0/z, z20.s, z31.s
+; CHECK-NEXT:    movprfx z31, z22
+; CHECK-NEXT:    fcvtzs z31.d, p0/m, z22.s
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    mov z19.d, p5/m, z8.d
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z17.s, z28.s
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z23.d, p2/m, z8.d
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z17.s, z17.s
+; CHECK-NEXT:    movprfx z17, z20
+; CHECK-NEXT:    fcvtzs z17.d, p0/m, z20.s
+; CHECK-NEXT:    mov z21.d, p1/m, z8.d
+; CHECK-NEXT:    mov z18.d, p3/m, z8.d
+; CHECK-NEXT:    not p1.b, p0/z, p7.b
+; CHECK-NEXT:    mov z31.d, p4/m, z8.d
+; CHECK-NEXT:    fcmgt p4.s, p0/z, z20.s, z28.s
+; CHECK-NEXT:    mov z19.d, p5/m, z2.d
+; CHECK-NEXT:    fcmuo p7.s, p0/z, z20.s, z20.s
+; CHECK-NEXT:    movprfx z20, z16
+; CHECK-NEXT:    fcvtzs z20.d, p0/m, z16.s
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z22.s, z28.s
+; CHECK-NEXT:    mov z23.d, p8/m, z2.d
+; CHECK-NEXT:    fcmuo p3.s, p0/z, z22.s, z22.s
+; CHECK-NEXT:    mov z17.d, p1/m, z8.d
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    mov z19.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z16.s, z28.s
+; CHECK-NEXT:    sel z8.d, p6, z8.d, z20.d
+; CHECK-NEXT:    mov z23.d, p9/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p6.s, p0/z, z14.s, z28.s
+; CHECK-NEXT:    mov z31.d, p5/m, z2.d
+; CHECK-NEXT:    mov z17.d, p4/m, z2.d
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z16.s, z16.s
+; CHECK-NEXT:    st1b { z19.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #14
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z1.s, z28.s
+; CHECK-NEXT:    st1b { z23.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #13
+; CHECK-NEXT:    mov z8.d, p2/m, z2.d
+; CHECK-NEXT:    mov z31.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z7.s, z28.s
+; CHECK-NEXT:    mov z17.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z11.s, z28.s
+; CHECK-NEXT:    fcmuo p7.s, p0/z, z14.s, z14.s
+; CHECK-NEXT:    mov z8.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z7.s, z7.s
+; CHECK-NEXT:    sel z7.d, p5, z2.d, z9.d
+; CHECK-NEXT:    st1b { z31.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #12
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z27.s, z28.s
+; CHECK-NEXT:    st1b { z17.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #11
+; CHECK-NEXT:    sel z31.d, p3, z2.d, z18.d
+; CHECK-NEXT:    st1b { z8.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #10
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z30.s, z28.s
+; CHECK-NEXT:    sel z9.d, p2, z2.d, z15.d
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z11.s, z11.s
+; CHECK-NEXT:    sel z8.d, p6, z2.d, z21.d
+; CHECK-NEXT:    mov z31.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p4.s, p0/z, z10.s, z28.s
+; CHECK-NEXT:    fcmgt p6.s, p0/z, z24.s, z28.s
+; CHECK-NEXT:    sel z11.d, p5, z2.d, z13.d
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z25.s, z28.s
+; CHECK-NEXT:    mov z8.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    mov z5.d, p3/m, z2.d
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z29.s, z28.s
+; CHECK-NEXT:    st1b { z31.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #9
+; CHECK-NEXT:    mov z9.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p7.s, p0/z, z10.s, z10.s
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z30.s, z30.s
+; CHECK-NEXT:    mov z6.d, p4/m, z2.d
+; CHECK-NEXT:    st1b { z8.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #8
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z29.s, z29.s
+; CHECK-NEXT:    st1b { z9.b }, p1, [x8, x9]
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z27.s, z27.s
+; CHECK-NEXT:    sel z27.d, p3, z2.d, z12.d
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z0.s, z28.s
+; CHECK-NEXT:    mov z4.d, p5/m, z2.d
+; CHECK-NEXT:    mov z3.d, p6/m, z2.d
+; CHECK-NEXT:    mov z6.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p7.s, p0/z, z25.s, z25.s
+; CHECK-NEXT:    mov z5.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z24.s, z24.s
+; CHECK-NEXT:    mov z27.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    mov z11.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    st1d { z6.d }, p0, [x8, #7, mul vl]
+; CHECK-NEXT:    sel z0.d, p3, z2.d, z26.d
+; CHECK-NEXT:    st1d { z5.d }, p0, [x8, #6, mul vl]
+; CHECK-NEXT:    mov z4.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z27.d }, p0, [x8, #5, mul vl]
+; CHECK-NEXT:    mov z3.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z7.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z11.d }, p0, [x8, #4, mul vl]
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z4.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1d { z3.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z7.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #17
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 32 x i64> @llvm.llrint.nxv32i64.nxv32f32(<vscale x 32 x float> %x)
+  ret <vscale x 32 x i64> %a
+}
+declare <vscale x 32 x i64> @llvm.llrint.nxv32i64.nxv32f32(<vscale x 32 x float>)
+
+define <vscale x 1 x i64> @llrint_v1i64_v1f64(<vscale x 1 x double> %x) {
+; CHECK-LABEL: llrint_v1i64_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z3.d
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ret
+  %a = call <vscale x 1 x i64> @llvm.llrint.nxv1i64.nxv1f64(<vscale x 1 x double> %x)
+  ret <vscale x 1 x i64> %a
+}
+declare <vscale x 1 x i64> @llvm.llrint.nxv1i64.nxv1f64(<vscale x 1 x double>)
+
+define <vscale x 2 x i64> @llrint_v2i64_v2f64(<vscale x 2 x double> %x) {
+; CHECK-LABEL: llrint_v2i64_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z3.d
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ret
+  %a = call <vscale x 2 x i64> @llvm.llrint.nxv2i64.nxv2f64(<vscale x 2 x double> %x)
+  ret <vscale x 2 x i64> %a
+}
+declare <vscale x 2 x i64> @llvm.llrint.nxv2i64.nxv2f64(<vscale x 2 x double>)
+
+define <vscale x 4 x i64> @llrint_v4i64_v4f64(<vscale x 4 x double> %x) {
+; CHECK-LABEL: llrint_v4i64_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
+; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z2.d
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
+; CHECK-NEXT:    movprfx z5, z1
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z1.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z0.d, z3.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z1.d, z3.d
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    fcmuo p0.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
+; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
+; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 4 x i64> @llvm.llrint.nxv4i64.nxv4f64(<vscale x 4 x double> %x)
+  ret <vscale x 4 x i64> %a
+}
+declare <vscale x 4 x i64> @llvm.llrint.nxv4i64.nxv4f64(<vscale x 4 x double>)
+
+define <vscale x 8 x i64> @llrint_v8i64_v8f64(<vscale x 8 x double> %x) {
+; CHECK-LABEL: llrint_v8i64_v8f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT:    mov z5.d, #0x8000000000000000
+; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
+; CHECK-NEXT:    frintx z2.d, p0/m, z2.d
+; CHECK-NEXT:    frintx z3.d, p0/m, z3.d
+; CHECK-NEXT:    mov z6.d, x8
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z4.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z4.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z2.d, z4.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z3.d, z4.d
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
+; CHECK-NEXT:    movprfx z7, z1
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z1.d
+; CHECK-NEXT:    movprfx z24, z2
+; CHECK-NEXT:    fcvtzs z24.d, p0/m, z2.d
+; CHECK-NEXT:    movprfx z25, z3
+; CHECK-NEXT:    fcvtzs z25.d, p0/m, z3.d
+; CHECK-NEXT:    fcmgt p7.d, p0/z, z2.d, z6.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z0.d, z6.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z1.d, z6.d
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    mov z4.d, p1/m, z5.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z6.d
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    sel z6.d, p2, z5.d, z7.d
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    sel z7.d, p3, z5.d, z24.d
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    sel z5.d, p4, z5.d, z25.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z2.d, z2.d
+; CHECK-NEXT:    fcmuo p0.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    sel z0.d, p5, z26.d, z4.d
+; CHECK-NEXT:    sel z1.d, p6, z26.d, z6.d
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z2.d, p7, z26.d, z7.d
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z3.d, p1, z26.d, z5.d
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 8 x i64> @llvm.llrint.nxv8i64.nxv8f64(<vscale x 8 x double> %x)
+  ret <vscale x 8 x i64> %a
+}
+declare <vscale x 8 x i64> @llvm.llrint.nxv8i64.nxv8f64(<vscale x 8 x double>)
+
+define <vscale x 16 x i64> @llrint_v16f64(<vscale x 16 x double> %x) {
+; CHECK-LABEL: llrint_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT:    mov z24.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z25.d, x8
+; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    movprfx z26, z0
+; CHECK-NEXT:    frintx z26.d, p0/m, z0.d
+; CHECK-NEXT:    movprfx z27, z1
+; CHECK-NEXT:    frintx z27.d, p0/m, z1.d
+; CHECK-NEXT:    frintx z2.d, p0/m, z2.d
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    frintx z3.d, p0/m, z3.d
+; CHECK-NEXT:    movprfx z28, z4
+; CHECK-NEXT:    frintx z28.d, p0/m, z4.d
+; CHECK-NEXT:    frintx z5.d, p0/m, z5.d
+; CHECK-NEXT:    frintx z6.d, p0/m, z6.d
+; CHECK-NEXT:    frintx z7.d, p0/m, z7.d
+; CHECK-NEXT:    fcmge p1.d, p0/z, z26.d, z25.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z27.d, z25.d
+; CHECK-NEXT:    movprfx z4, z26
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z26.d
+; CHECK-NEXT:    fcmge p5.d, p0/z, z2.d, z25.d
+; CHECK-NEXT:    movprfx z29, z27
+; CHECK-NEXT:    fcvtzs z29.d, p0/m, z27.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z26.d, z1.d
+; CHECK-NEXT:    fcmge p6.d, p0/z, z3.d, z25.d
+; CHECK-NEXT:    fcmge p8.d, p0/z, z5.d, z25.d
+; CHECK-NEXT:    fcmgt p7.d, p0/z, z27.d, z1.d
+; CHECK-NEXT:    fcmge p9.d, p0/z, z6.d, z25.d
+; CHECK-NEXT:    movprfx z30, z28
+; CHECK-NEXT:    fcvtzs z30.d, p0/m, z28.d
+; CHECK-NEXT:    fcmge p10.d, p0/z, z7.d, z25.d
+; CHECK-NEXT:    not p4.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z26.d, z26.d
+; CHECK-NEXT:    movprfx z26, z2
+; CHECK-NEXT:    fcvtzs z26.d, p0/m, z2.d
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    movprfx z31, z6
+; CHECK-NEXT:    fcvtzs z31.d, p0/m, z6.d
+; CHECK-NEXT:    movprfx z8, z7
+; CHECK-NEXT:    fcvtzs z8.d, p0/m, z7.d
+; CHECK-NEXT:    mov z4.d, p4/m, z0.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z28.d, z25.d
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    mov z29.d, p2/m, z0.d
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z27.d, z27.d
+; CHECK-NEXT:    movprfx z27, z3
+; CHECK-NEXT:    fcvtzs z27.d, p0/m, z3.d
+; CHECK-NEXT:    sel z25.d, p5, z0.d, z26.d
+; CHECK-NEXT:    movprfx z26, z5
+; CHECK-NEXT:    fcvtzs z26.d, p0/m, z5.d
+; CHECK-NEXT:    not p6.b, p0/z, p6.b
+; CHECK-NEXT:    not p5.b, p0/z, p8.b
+; CHECK-NEXT:    fcmgt p8.d, p0/z, z2.d, z1.d
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z27.d, p6/m, z0.d
+; CHECK-NEXT:    not p6.b, p0/z, p9.b
+; CHECK-NEXT:    fcmuo p9.d, p0/z, z2.d, z2.d
+; CHECK-NEXT:    mov z30.d, p4/m, z0.d
+; CHECK-NEXT:    not p4.b, p0/z, p10.b
+; CHECK-NEXT:    fcmgt p10.d, p0/z, z3.d, z1.d
+; CHECK-NEXT:    mov z26.d, p5/m, z0.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z28.d, z1.d
+; CHECK-NEXT:    mov z31.d, p6/m, z0.d
+; CHECK-NEXT:    mov z8.d, p4/m, z0.d
+; CHECK-NEXT:    sel z0.d, p3, z24.d, z4.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z5.d, z1.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z6.d, z1.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z7.d, z1.d
+; CHECK-NEXT:    sel z1.d, p7, z24.d, z29.d
+; CHECK-NEXT:    fcmuo p7.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    sel z2.d, p8, z24.d, z25.d
+; CHECK-NEXT:    sel z3.d, p10, z24.d, z27.d
+; CHECK-NEXT:    sel z4.d, p5, z24.d, z30.d
+; CHECK-NEXT:    fcmuo p5.d, p0/z, z28.d, z28.d
+; CHECK-NEXT:    fcmuo p8.d, p0/z, z5.d, z5.d
+; CHECK-NEXT:    fcmuo p10.d, p0/z, z6.d, z6.d
+; CHECK-NEXT:    sel z5.d, p3, z24.d, z26.d
+; CHECK-NEXT:    fcmuo p0.d, p0/z, z7.d, z7.d
+; CHECK-NEXT:    sel z6.d, p4, z24.d, z31.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z7.d, p6, z24.d, z8.d
+; CHECK-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p9/m, #0 // =0x0
+; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z3.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    mov z4.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z5.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    mov z6.d, p10/m, #0 // =0x0
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z7.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 16 x i64> @llvm.llrint.nxv16i64.nxv16f64(<vscale x 16 x double> %x)
+  ret <vscale x 16 x i64> %a
+}
+declare <vscale x 16 x i64> @llvm.llrint.nxv16i64.nxv16f64(<vscale x 16 x double>)
+
+define <vscale x 32 x i64> @llrint_v32f64(<vscale x 32 x double> %x) {
+; CHECK-LABEL: llrint_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-12
+; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z18, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z17, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z16, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z15, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z14, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z13, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 96 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    rdvl x9, #8
+; CHECK-NEXT:    rdvl x10, #9
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    rdvl x11, #10
+; CHECK-NEXT:    mov x12, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x0, x9]
+; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x0, x10]
+; CHECK-NEXT:    mov z2.d, x12
+; CHECK-NEXT:    rdvl x14, #13
+; CHECK-NEXT:    rdvl x13, #12
+; CHECK-NEXT:    rdvl x12, #11
+; CHECK-NEXT:    ld1b { z6.b }, p1/z, [x0, x14]
+; CHECK-NEXT:    ld1b { z7.b }, p1/z, [x0, x13]
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
+; CHECK-NEXT:    movprfx z24, z0
+; CHECK-NEXT:    frintx z24.d, p0/m, z0.d
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x0, x11]
+; CHECK-NEXT:    movprfx z5, z1
+; CHECK-NEXT:    frintx z5.d, p0/m, z1.d
+; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x0, x12]
+; CHECK-NEXT:    mov x15, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    rdvl x16, #15
+; CHECK-NEXT:    movprfx z30, z6
+; CHECK-NEXT:    frintx z30.d, p0/m, z6.d
+; CHECK-NEXT:    movprfx z28, z7
+; CHECK-NEXT:    frintx z28.d, p0/m, z7.d
+; CHECK-NEXT:    ld1b { z8.b }, p1/z, [x0, x16]
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    frintx z4.d, p0/m, z0.d
+; CHECK-NEXT:    mov z0.d, #0x7fffffffffffffff
+; CHECK-NEXT:    ld1d { z18.d }, p0/z, [x0]
+; CHECK-NEXT:    fcmge p3.d, p0/z, z5.d, z2.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z24.d, z2.d
+; CHECK-NEXT:    movprfx z6, z5
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z5.d
+; CHECK-NEXT:    movprfx z27, z1
+; CHECK-NEXT:    frintx z27.d, p0/m, z1.d
+; CHECK-NEXT:    movprfx z25, z24
+; CHECK-NEXT:    fcvtzs z25.d, p0/m, z24.d
+; CHECK-NEXT:    mov z1.d, x15
+; CHECK-NEXT:    rdvl x15, #14
+; CHECK-NEXT:    movprfx z9, z28
+; CHECK-NEXT:    fcvtzs z9.d, p0/m, z28.d
+; CHECK-NEXT:    movprfx z13, z8
+; CHECK-NEXT:    frintx z13.d, p0/m, z8.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z4.d, z2.d
+; CHECK-NEXT:    movprfx z7, z4
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z4.d
+; CHECK-NEXT:    ld1d { z15.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z24.d, z1.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z5.d, z1.d
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    fcmge p7.d, p0/z, z27.d, z2.d
+; CHECK-NEXT:    movprfx z26, z27
+; CHECK-NEXT:    fcvtzs z26.d, p0/m, z27.d
+; CHECK-NEXT:    sel z29.d, p3, z3.d, z6.d
+; CHECK-NEXT:    ld1b { z6.b }, p1/z, [x0, x15]
+; CHECK-NEXT:    fcmge p3.d, p0/z, z28.d, z2.d
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z25.d, p2/m, z3.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z4.d, z1.d
+; CHECK-NEXT:    movprfx z16, z13
+; CHECK-NEXT:    fcvtzs z16.d, p0/m, z13.d
+; CHECK-NEXT:    ld1d { z17.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1d { z14.d }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT:    sel z31.d, p4, z3.d, z7.d
+; CHECK-NEXT:    movprfx z11, z6
+; CHECK-NEXT:    frintx z11.d, p0/m, z6.d
+; CHECK-NEXT:    not p7.b, p0/z, p7.b
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    sel z6.d, p5, z0.d, z25.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z27.d, z1.d
+; CHECK-NEXT:    sel z7.d, p6, z0.d, z29.d
+; CHECK-NEXT:    mov z26.d, p7/m, z3.d
+; CHECK-NEXT:    fcmge p5.d, p0/z, z13.d, z2.d
+; CHECK-NEXT:    sel z25.d, p2, z0.d, z31.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z30.d, z2.d
+; CHECK-NEXT:    sel z29.d, p3, z3.d, z9.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z11.d, z2.d
+; CHECK-NEXT:    movprfx z31, z30
+; CHECK-NEXT:    fcvtzs z31.d, p0/m, z30.d
+; CHECK-NEXT:    movprfx z9, z11
+; CHECK-NEXT:    fcvtzs z9.d, p0/m, z11.d
+; CHECK-NEXT:    mov z26.d, p4/m, z0.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z28.d, z1.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z30.d, z1.d
+; CHECK-NEXT:    not p7.b, p0/z, p5.b
+; CHECK-NEXT:    fcmuo p5.d, p0/z, z27.d, z27.d
+; CHECK-NEXT:    fcmgt p8.d, p0/z, z13.d, z1.d
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    movprfx z27, z18
+; CHECK-NEXT:    frintx z27.d, p0/m, z18.d
+; CHECK-NEXT:    ld1d { z8.d }, p0/z, [x0, #7, mul vl]
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    mov z16.d, p7/m, z3.d
+; CHECK-NEXT:    fcmuo p7.d, p0/z, z13.d, z13.d
+; CHECK-NEXT:    mov z31.d, p2/m, z3.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z11.d, z1.d
+; CHECK-NEXT:    mov z29.d, p4/m, z0.d
+; CHECK-NEXT:    mov z9.d, p3/m, z3.d
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z28.d, z28.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z30.d, z30.d
+; CHECK-NEXT:    movprfx z28, z17
+; CHECK-NEXT:    frintx z28.d, p0/m, z17.d
+; CHECK-NEXT:    movprfx z30, z15
+; CHECK-NEXT:    frintx z30.d, p0/m, z15.d
+; CHECK-NEXT:    ld1d { z13.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT:    mov z31.d, p6/m, z0.d
+; CHECK-NEXT:    fcmuo p6.d, p0/z, z11.d, z11.d
+; CHECK-NEXT:    sel z11.d, p8, z0.d, z16.d
+; CHECK-NEXT:    mov z9.d, p2/m, z0.d
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z24.d, z24.d
+; CHECK-NEXT:    movprfx z24, z14
+; CHECK-NEXT:    frintx z24.d, p0/m, z14.d
+; CHECK-NEXT:    fcmge p8.d, p0/z, z27.d, z2.d
+; CHECK-NEXT:    ld1d { z10.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT:    ld1d { z12.d }, p0/z, [x0, #5, mul vl]
+; CHECK-NEXT:    mov z26.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    mov z29.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    fcmge p5.d, p0/z, z28.d, z2.d
+; CHECK-NEXT:    movprfx z14, z27
+; CHECK-NEXT:    fcvtzs z14.d, p0/m, z27.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z30.d, z2.d
+; CHECK-NEXT:    frintx z13.d, p0/m, z13.d
+; CHECK-NEXT:    mov z31.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmge p4.d, p0/z, z24.d, z2.d
+; CHECK-NEXT:    mov z9.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    movprfx z15, z28
+; CHECK-NEXT:    fcvtzs z15.d, p0/m, z28.d
+; CHECK-NEXT:    not p6.b, p0/z, p8.b
+; CHECK-NEXT:    movprfx z16, z30
+; CHECK-NEXT:    fcvtzs z16.d, p0/m, z30.d
+; CHECK-NEXT:    frintx z12.d, p0/m, z12.d
+; CHECK-NEXT:    frintx z10.d, p0/m, z10.d
+; CHECK-NEXT:    movprfx z17, z24
+; CHECK-NEXT:    fcvtzs z17.d, p0/m, z24.d
+; CHECK-NEXT:    movprfx z18, z8
+; CHECK-NEXT:    frintx z18.d, p0/m, z8.d
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    sel z8.d, p6, z3.d, z14.d
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    fcmge p6.d, p0/z, z13.d, z2.d
+; CHECK-NEXT:    mov z11.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    sel z14.d, p5, z3.d, z15.d
+; CHECK-NEXT:    fcmuo p7.d, p0/z, z5.d, z5.d
+; CHECK-NEXT:    sel z15.d, p3, z3.d, z16.d
+; CHECK-NEXT:    movprfx z16, z13
+; CHECK-NEXT:    fcvtzs z16.d, p0/m, z13.d
+; CHECK-NEXT:    fcmge p5.d, p0/z, z12.d, z2.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z10.d, z2.d
+; CHECK-NEXT:    sel z5.d, p4, z3.d, z17.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z18.d, z2.d
+; CHECK-NEXT:    not p6.b, p0/z, p6.b
+; CHECK-NEXT:    movprfx z2, z12
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z12.d
+; CHECK-NEXT:    movprfx z17, z10
+; CHECK-NEXT:    fcvtzs z17.d, p0/m, z10.d
+; CHECK-NEXT:    st1b { z11.b }, p1, [x8, x16]
+; CHECK-NEXT:    movprfx z11, z18
+; CHECK-NEXT:    fcvtzs z11.d, p0/m, z18.d
+; CHECK-NEXT:    mov z6.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    st1b { z9.b }, p1, [x8, x15]
+; CHECK-NEXT:    sel z9.d, p6, z3.d, z16.d
+; CHECK-NEXT:    fcmuo p6.d, p0/z, z4.d, z4.d
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z18.d, z1.d
+; CHECK-NEXT:    mov z7.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    st1b { z31.b }, p1, [x8, x14]
+; CHECK-NEXT:    fcmgt p7.d, p0/z, z24.d, z1.d
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z2.d, p5/m, z3.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z28.d, z1.d
+; CHECK-NEXT:    sel z4.d, p3, z3.d, z17.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z13.d, z1.d
+; CHECK-NEXT:    mov z25.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    sel z3.d, p4, z3.d, z11.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z10.d, z1.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z12.d, z1.d
+; CHECK-NEXT:    st1b { z29.b }, p1, [x8, x13]
+; CHECK-NEXT:    st1b { z26.b }, p1, [x8, x12]
+; CHECK-NEXT:    sel z26.d, p5, z0.d, z14.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z30.d, z1.d
+; CHECK-NEXT:    sel z29.d, p3, z0.d, z9.d
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z18.d, z18.d
+; CHECK-NEXT:    mov z3.d, p2/m, z0.d
+; CHECK-NEXT:    st1b { z25.b }, p1, [x8, x11]
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z10.d, z10.d
+; CHECK-NEXT:    mov z4.d, p4/m, z0.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z12.d, z12.d
+; CHECK-NEXT:    st1b { z7.b }, p1, [x8, x10]
+; CHECK-NEXT:    mov z2.d, p6/m, z0.d
+; CHECK-NEXT:    st1b { z6.b }, p1, [x8, x9]
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z13.d, z13.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z27.d, z1.d
+; CHECK-NEXT:    mov z3.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z24.d, z24.d
+; CHECK-NEXT:    sel z1.d, p7, z0.d, z5.d
+; CHECK-NEXT:    mov z4.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z30.d, z30.d
+; CHECK-NEXT:    sel z5.d, p5, z0.d, z15.d
+; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z28.d, z28.d
+; CHECK-NEXT:    mov z29.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z27.d, z27.d
+; CHECK-NEXT:    sel z0.d, p6, z0.d, z8.d
+; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z3.d }, p0, [x8, #7, mul vl]
+; CHECK-NEXT:    mov z5.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z4.d }, p0, [x8, #6, mul vl]
+; CHECK-NEXT:    mov z26.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z2.d }, p0, [x8, #5, mul vl]
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z29.d }, p0, [x8, #4, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z26.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ldr z18, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z17, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z16, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z15, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #12
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 32 x i64> @llvm.llrint.nxv32i64.nxv16f64(<vscale x 32 x double> %x)
+  ret <vscale x 32 x i64> %a
+}
+declare <vscale x 32 x i64> @llvm.llrint.nxv32i64.nxv32f64(<vscale x 32 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-lrint.ll b/llvm/test/CodeGen/AArch64/sve-lrint.ll
new file mode 100644
index 000000000000..2a1432d881e5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-lrint.ll
@@ -0,0 +1,1764 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=aarch64 -mattr=+sve |\
+; RUN:   FileCheck --check-prefixes=CHECK %s
+
+define <vscale x 1 x iXLen> @lrint_v1f16(<vscale x 1 x half> %x) {
+; CHECK-LABEL: lrint_v1f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
+; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z3.h
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ret
+  %a = call <vscale x 1 x iXLen> @llvm.lrint.nxv1iXLen.nxv1f16(<vscale x 1 x half> %x)
+  ret <vscale x 1 x iXLen> %a
+}
+declare <vscale x 1 x iXLen> @llvm.lrint.nxv1iXLen.nxv1f16(<vscale x 1 x half>)
+
+define <vscale x 2 x iXLen> @lrint_v2f16(<vscale x 2 x half> %x) {
+; CHECK-LABEL: lrint_v2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
+; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z3.h
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ret
+  %a = call <vscale x 2 x iXLen> @llvm.lrint.nxv2iXLen.nxv2f16(<vscale x 2 x half> %x)
+  ret <vscale x 2 x iXLen> %a
+}
+declare <vscale x 2 x iXLen> @llvm.lrint.nxv2iXLen.nxv2f16(<vscale x 2 x half>)
+
+define <vscale x 4 x iXLen> @lrint_v4f16(<vscale x 4 x half> %x) {
+; CHECK-LABEL: lrint_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    frintx z1.h, p0/m, z1.h
+; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    movprfx z4, z1
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z1.h
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z1.h, z3.h
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z0.h, z3.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
+; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
+; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
+; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 4 x iXLen> @llvm.lrint.nxv4iXLen.nxv4f16(<vscale x 4 x half> %x)
+  ret <vscale x 4 x iXLen> %a
+}
+declare <vscale x 4 x iXLen> @llvm.lrint.nxv4iXLen.nxv4f16(<vscale x 4 x half>)
+
+define <vscale x 8 x iXLen> @lrint_v8f16(<vscale x 8 x half> %x) {
+; CHECK-LABEL: lrint_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    uunpklo z1.s, z0.h
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z4.h, w8
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z6.h, w8
+; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
+; CHECK-NEXT:    uunpklo z2.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z3.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    frintx z2.h, p0/m, z2.h
+; CHECK-NEXT:    frintx z1.h, p0/m, z1.h
+; CHECK-NEXT:    frintx z3.h, p0/m, z3.h
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    frintx z5.h, p0/m, z0.h
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    fcmge p1.h, p0/z, z2.h, z4.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z1.h, z4.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z3.h, z4.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z5.h, z4.h
+; CHECK-NEXT:    movprfx z4, z2
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z2.h
+; CHECK-NEXT:    movprfx z7, z1
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z1.h
+; CHECK-NEXT:    movprfx z24, z3
+; CHECK-NEXT:    fcvtzs z24.d, p0/m, z3.h
+; CHECK-NEXT:    movprfx z25, z5
+; CHECK-NEXT:    fcvtzs z25.d, p0/m, z5.h
+; CHECK-NEXT:    fcmgt p7.h, p0/z, z3.h, z6.h
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z2.h, z6.h
+; CHECK-NEXT:    fcmgt p6.h, p0/z, z1.h, z6.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    mov z4.d, p1/m, z0.d
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z5.h, z6.h
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    sel z6.d, p2, z0.d, z7.d
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z2.h, z2.h
+; CHECK-NEXT:    sel z7.d, p3, z0.d, z24.d
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    sel z24.d, p4, z0.d, z25.d
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z3.h, z3.h
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z5.h, z5.h
+; CHECK-NEXT:    sel z0.d, p5, z26.d, z4.d
+; CHECK-NEXT:    sel z1.d, p6, z26.d, z6.d
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z2.d, p7, z26.d, z7.d
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z3.d, p1, z26.d, z24.d
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 8 x iXLen> @llvm.lrint.nxv8iXLen.nxv8f16(<vscale x 8 x half> %x)
+  ret <vscale x 8 x iXLen> %a
+}
+declare <vscale x 8 x iXLen> @llvm.lrint.nxv8iXLen.nxv8f16(<vscale x 8 x half>)
+
+define <vscale x 16 x iXLen> @lrint_v16f16(<vscale x 16 x half> %x) {
+; CHECK-LABEL: lrint_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    uunpklo z2.s, z0.h
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    uunpklo z4.s, z1.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-NEXT:    mov z5.h, w8
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z25.d, #0x8000000000000000
+; CHECK-NEXT:    mov z27.h, w8
+; CHECK-NEXT:    mov z7.d, #0x7fffffffffffffff
+; CHECK-NEXT:    uunpklo z3.d, z2.s
+; CHECK-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEXT:    uunpklo z6.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    uunpklo z24.d, z4.s
+; CHECK-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-NEXT:    uunpklo z26.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    frintx z2.h, p0/m, z2.h
+; CHECK-NEXT:    frintx z3.h, p0/m, z3.h
+; CHECK-NEXT:    frintx z6.h, p0/m, z6.h
+; CHECK-NEXT:    movprfx z28, z0
+; CHECK-NEXT:    frintx z28.h, p0/m, z0.h
+; CHECK-NEXT:    movprfx z29, z4
+; CHECK-NEXT:    frintx z29.h, p0/m, z4.h
+; CHECK-NEXT:    frintx z24.h, p0/m, z24.h
+; CHECK-NEXT:    movprfx z30, z1
+; CHECK-NEXT:    frintx z30.h, p0/m, z1.h
+; CHECK-NEXT:    frintx z26.h, p0/m, z26.h
+; CHECK-NEXT:    fcmge p5.h, p0/z, z2.h, z5.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, z5.h
+; CHECK-NEXT:    movprfx z1, z2
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z2.h
+; CHECK-NEXT:    movprfx z0, z3
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z3.h
+; CHECK-NEXT:    fcmge p6.h, p0/z, z6.h, z5.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z3.h, z27.h
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z3.h, z3.h
+; CHECK-NEXT:    fcmge p7.h, p0/z, z28.h, z5.h
+; CHECK-NEXT:    movprfx z3, z6
+; CHECK-NEXT:    fcvtzs z3.d, p0/m, z6.h
+; CHECK-NEXT:    fcmge p8.h, p0/z, z24.h, z5.h
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z2.h, z27.h
+; CHECK-NEXT:    fcmge p9.h, p0/z, z26.h, z5.h
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    movprfx z4, z24
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z24.h
+; CHECK-NEXT:    fcmge p10.h, p0/z, z30.h, z5.h
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    movprfx z31, z26
+; CHECK-NEXT:    fcvtzs z31.d, p0/m, z26.h
+; CHECK-NEXT:    movprfx z8, z30
+; CHECK-NEXT:    fcvtzs z8.d, p0/m, z30.h
+; CHECK-NEXT:    mov z1.d, p5/m, z25.d
+; CHECK-NEXT:    fcmge p5.h, p0/z, z29.h, z5.h
+; CHECK-NEXT:    not p6.b, p0/z, p6.b
+; CHECK-NEXT:    mov z0.d, p2/m, z25.d
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z2.h, z2.h
+; CHECK-NEXT:    movprfx z2, z28
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z28.h
+; CHECK-NEXT:    movprfx z5, z29
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z29.h
+; CHECK-NEXT:    not p7.b, p0/z, p7.b
+; CHECK-NEXT:    mov z3.d, p6/m, z25.d
+; CHECK-NEXT:    not p6.b, p0/z, p8.b
+; CHECK-NEXT:    fcmgt p8.h, p0/z, z6.h, z27.h
+; CHECK-NEXT:    mov z1.d, p4/m, z7.d
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    mov z0.d, p3/m, z7.d
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z29.h, z27.h
+; CHECK-NEXT:    sel z9.d, p7, z25.d, z2.d
+; CHECK-NEXT:    not p7.b, p0/z, p9.b
+; CHECK-NEXT:    mov z4.d, p6/m, z25.d
+; CHECK-NEXT:    not p6.b, p0/z, p10.b
+; CHECK-NEXT:    fcmgt p10.h, p0/z, z28.h, z27.h
+; CHECK-NEXT:    mov z5.d, p5/m, z25.d
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z24.h, z27.h
+; CHECK-NEXT:    fcmuo p9.h, p0/z, z6.h, z6.h
+; CHECK-NEXT:    sel z6.d, p7, z25.d, z31.d
+; CHECK-NEXT:    sel z25.d, p6, z25.d, z8.d
+; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    fcmgt p6.h, p0/z, z26.h, z27.h
+; CHECK-NEXT:    fcmgt p7.h, p0/z, z30.h, z27.h
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z28.h, z28.h
+; CHECK-NEXT:    sel z2.d, p8, z7.d, z3.d
+; CHECK-NEXT:    sel z3.d, p10, z7.d, z9.d
+; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    fcmuo p8.h, p0/z, z29.h, z29.h
+; CHECK-NEXT:    mov z4.d, p5/m, z7.d
+; CHECK-NEXT:    fcmuo p5.h, p0/z, z24.h, z24.h
+; CHECK-NEXT:    fcmuo p10.h, p0/z, z26.h, z26.h
+; CHECK-NEXT:    mov z5.d, p3/m, z7.d
+; CHECK-NEXT:    mov z6.d, p6/m, z7.d
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z30.h, z30.h
+; CHECK-NEXT:    sel z7.d, p7, z7.d, z25.d
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p9/m, #0 // =0x0
+; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z3.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z4.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    mov z5.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z6.d, p10/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z7.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 16 x iXLen> @llvm.lrint.nxv16iXLen.nxv16f16(<vscale x 16 x half> %x)
+  ret <vscale x 16 x iXLen> %a
+}
+declare <vscale x 16 x iXLen> @llvm.lrint.nxv16iXLen.nxv16f16(<vscale x 16 x half>)
+
+define <vscale x 32 x iXLen> @lrint_v32f16(<vscale x 32 x half> %x) {
+; CHECK-LABEL: lrint_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-17
+; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 136 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT:    uunpkhi z5.s, z0.h
+; CHECK-NEXT:    uunpklo z4.s, z0.h
+; CHECK-NEXT:    mov w9, #64511 // =0xfbff
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z6.s, z1.h
+; CHECK-NEXT:    mov z30.h, w9
+; CHECK-NEXT:    uunpkhi z10.s, z1.h
+; CHECK-NEXT:    mov w9, #31743 // =0x7bff
+; CHECK-NEXT:    mov z29.d, #0x8000000000000000
+; CHECK-NEXT:    uunpklo z8.s, z2.h
+; CHECK-NEXT:    uunpkhi z13.s, z3.h
+; CHECK-NEXT:    uunpklo z18.s, z3.h
+; CHECK-NEXT:    uunpklo z7.d, z5.s
+; CHECK-NEXT:    uunpklo z0.d, z4.s
+; CHECK-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-NEXT:    uunpkhi z24.d, z5.s
+; CHECK-NEXT:    uunpklo z25.d, z6.s
+; CHECK-NEXT:    uunpkhi z26.d, z6.s
+; CHECK-NEXT:    uunpklo z27.d, z10.s
+; CHECK-NEXT:    uunpkhi z10.d, z10.s
+; CHECK-NEXT:    uunpklo z12.d, z8.s
+; CHECK-NEXT:    uunpkhi z16.d, z8.s
+; CHECK-NEXT:    movprfx z5, z7
+; CHECK-NEXT:    frintx z5.h, p0/m, z7.h
+; CHECK-NEXT:    movprfx z1, z4
+; CHECK-NEXT:    frintx z1.h, p0/m, z4.h
+; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
+; CHECK-NEXT:    movprfx z6, z24
+; CHECK-NEXT:    frintx z6.h, p0/m, z24.h
+; CHECK-NEXT:    movprfx z24, z25
+; CHECK-NEXT:    frintx z24.h, p0/m, z25.h
+; CHECK-NEXT:    movprfx z25, z26
+; CHECK-NEXT:    frintx z25.h, p0/m, z26.h
+; CHECK-NEXT:    movprfx z28, z27
+; CHECK-NEXT:    frintx z28.h, p0/m, z27.h
+; CHECK-NEXT:    movprfx z8, z10
+; CHECK-NEXT:    frintx z8.h, p0/m, z10.h
+; CHECK-NEXT:    mov z7.h, w9
+; CHECK-NEXT:    mov z4.d, #0x7fffffffffffffff
+; CHECK-NEXT:    rdvl x9, #15
+; CHECK-NEXT:    fcmge p3.h, p0/z, z5.h, z30.h
+; CHECK-NEXT:    movprfx z11, z5
+; CHECK-NEXT:    fcvtzs z11.d, p0/m, z5.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z1.h, z30.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z30.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z6.h, z30.h
+; CHECK-NEXT:    movprfx z9, z6
+; CHECK-NEXT:    fcvtzs z9.d, p0/m, z6.h
+; CHECK-NEXT:    movprfx z15, z25
+; CHECK-NEXT:    fcvtzs z15.d, p0/m, z25.h
+; CHECK-NEXT:    movprfx z14, z24
+; CHECK-NEXT:    fcvtzs z14.d, p0/m, z24.h
+; CHECK-NEXT:    movprfx z26, z0
+; CHECK-NEXT:    fcvtzs z26.d, p0/m, z0.h
+; CHECK-NEXT:    movprfx z19, z28
+; CHECK-NEXT:    fcvtzs z19.d, p0/m, z28.h
+; CHECK-NEXT:    movprfx z31, z1
+; CHECK-NEXT:    fcvtzs z31.d, p0/m, z1.h
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    not p6.b, p0/z, p2.b
+; CHECK-NEXT:    fcmge p2.h, p0/z, z25.h, z30.h
+; CHECK-NEXT:    sel z27.d, p3, z29.d, z11.d
+; CHECK-NEXT:    uunpkhi z11.s, z2.h
+; CHECK-NEXT:    not p5.b, p0/z, p1.b
+; CHECK-NEXT:    fcmge p1.h, p0/z, z24.h, z30.h
+; CHECK-NEXT:    not p3.b, p0/z, p4.b
+; CHECK-NEXT:    fcmge p4.h, p0/z, z28.h, z30.h
+; CHECK-NEXT:    mov z26.d, p5/m, z29.d
+; CHECK-NEXT:    mov z31.d, p6/m, z29.d
+; CHECK-NEXT:    sel z2.d, p3, z29.d, z9.d
+; CHECK-NEXT:    movprfx z9, z12
+; CHECK-NEXT:    frintx z9.h, p0/m, z12.h
+; CHECK-NEXT:    uunpkhi z12.d, z13.s
+; CHECK-NEXT:    uunpklo z17.d, z11.s
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    sel z3.d, p2, z29.d, z15.d
+; CHECK-NEXT:    uunpklo z15.d, z13.s
+; CHECK-NEXT:    fcmge p2.h, p0/z, z8.h, z30.h
+; CHECK-NEXT:    sel z10.d, p1, z29.d, z14.d
+; CHECK-NEXT:    movprfx z14, z16
+; CHECK-NEXT:    frintx z14.h, p0/m, z16.h
+; CHECK-NEXT:    uunpkhi z16.d, z18.s
+; CHECK-NEXT:    movprfx z13, z17
+; CHECK-NEXT:    frintx z13.h, p0/m, z17.h
+; CHECK-NEXT:    movprfx z20, z12
+; CHECK-NEXT:    frintx z20.h, p0/m, z12.h
+; CHECK-NEXT:    fcmge p3.h, p0/z, z9.h, z30.h
+; CHECK-NEXT:    uunpkhi z17.d, z11.s
+; CHECK-NEXT:    uunpklo z18.d, z18.s
+; CHECK-NEXT:    movprfx z12, z8
+; CHECK-NEXT:    fcvtzs z12.d, p0/m, z8.h
+; CHECK-NEXT:    movprfx z21, z15
+; CHECK-NEXT:    frintx z21.h, p0/m, z15.h
+; CHECK-NEXT:    not p1.b, p0/z, p4.b
+; CHECK-NEXT:    movprfx z15, z9
+; CHECK-NEXT:    fcvtzs z15.d, p0/m, z9.h
+; CHECK-NEXT:    frintx z16.h, p0/m, z16.h
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    movprfx z22, z14
+; CHECK-NEXT:    fcvtzs z22.d, p0/m, z14.h
+; CHECK-NEXT:    fcmge p4.h, p0/z, z13.h, z30.h
+; CHECK-NEXT:    fcmge p5.h, p0/z, z20.h, z30.h
+; CHECK-NEXT:    sel z11.d, p1, z29.d, z19.d
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    frintx z17.h, p0/m, z17.h
+; CHECK-NEXT:    frintx z18.h, p0/m, z18.h
+; CHECK-NEXT:    movprfx z19, z20
+; CHECK-NEXT:    fcvtzs z19.d, p0/m, z20.h
+; CHECK-NEXT:    mov z12.d, p2/m, z29.d
+; CHECK-NEXT:    fcmge p2.h, p0/z, z21.h, z30.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z14.h, z30.h
+; CHECK-NEXT:    mov z15.d, p3/m, z29.d
+; CHECK-NEXT:    movprfx z23, z21
+; CHECK-NEXT:    fcvtzs z23.d, p0/m, z21.h
+; CHECK-NEXT:    not p3.b, p0/z, p4.b
+; CHECK-NEXT:    fcmge p4.h, p0/z, z16.h, z30.h
+; CHECK-NEXT:    fcmgt p8.h, p0/z, z21.h, z7.h
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    fcmge p6.h, p0/z, z17.h, z30.h
+; CHECK-NEXT:    fcmge p7.h, p0/z, z18.h, z30.h
+; CHECK-NEXT:    movprfx z30, z16
+; CHECK-NEXT:    fcvtzs z30.d, p0/m, z16.h
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    fcmuo p9.h, p0/z, z21.h, z21.h
+; CHECK-NEXT:    mov z19.d, p5/m, z29.d
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z20.h, z7.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z23.d, p2/m, z29.d
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z20.h, z20.h
+; CHECK-NEXT:    movprfx z20, z18
+; CHECK-NEXT:    fcvtzs z20.d, p0/m, z18.h
+; CHECK-NEXT:    movprfx z21, z13
+; CHECK-NEXT:    fcvtzs z21.d, p0/m, z13.h
+; CHECK-NEXT:    mov z22.d, p1/m, z29.d
+; CHECK-NEXT:    not p1.b, p0/z, p7.b
+; CHECK-NEXT:    mov z30.d, p4/m, z29.d
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z18.h, z7.h
+; CHECK-NEXT:    mov z19.d, p5/m, z4.d
+; CHECK-NEXT:    fcmuo p7.h, p0/z, z18.h, z18.h
+; CHECK-NEXT:    movprfx z18, z17
+; CHECK-NEXT:    fcvtzs z18.d, p0/m, z17.h
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z16.h, z7.h
+; CHECK-NEXT:    not p6.b, p0/z, p6.b
+; CHECK-NEXT:    mov z23.d, p8/m, z4.d
+; CHECK-NEXT:    mov z20.d, p1/m, z29.d
+; CHECK-NEXT:    mov z21.d, p3/m, z29.d
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z16.h, z16.h
+; CHECK-NEXT:    mov z19.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z17.h, z7.h
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    sel z29.d, p6, z29.d, z18.d
+; CHECK-NEXT:    mov z23.d, p9/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p6.h, p0/z, z14.h, z7.h
+; CHECK-NEXT:    mov z30.d, p5/m, z4.d
+; CHECK-NEXT:    sel z16.d, p4, z4.d, z20.d
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z17.h, z17.h
+; CHECK-NEXT:    st1b { z19.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #14
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z1.h, z7.h
+; CHECK-NEXT:    st1b { z23.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #13
+; CHECK-NEXT:    mov z29.d, p2/m, z4.d
+; CHECK-NEXT:    mov z30.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z13.h, z7.h
+; CHECK-NEXT:    mov z16.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z9.h, z7.h
+; CHECK-NEXT:    fcmuo p7.h, p0/z, z14.h, z14.h
+; CHECK-NEXT:    mov z29.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z13.h, z13.h
+; CHECK-NEXT:    st1b { z30.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #12
+; CHECK-NEXT:    sel z30.d, p5, z4.d, z31.d
+; CHECK-NEXT:    st1b { z16.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #11
+; CHECK-NEXT:    sel z31.d, p3, z4.d, z21.d
+; CHECK-NEXT:    st1b { z29.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #10
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z24.h, z7.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z28.h, z7.h
+; CHECK-NEXT:    sel z13.d, p2, z4.d, z15.d
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z9.h, z9.h
+; CHECK-NEXT:    sel z29.d, p6, z4.d, z22.d
+; CHECK-NEXT:    mov z31.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z8.h, z7.h
+; CHECK-NEXT:    fcmgt p6.h, p0/z, z5.h, z7.h
+; CHECK-NEXT:    sel z9.d, p5, z4.d, z10.d
+; CHECK-NEXT:    fcmgt p5.h, p0/z, z6.h, z7.h
+; CHECK-NEXT:    st1b { z31.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #9
+; CHECK-NEXT:    mov z29.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    sel z10.d, p3, z4.d, z11.d
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z25.h, z7.h
+; CHECK-NEXT:    mov z13.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p7.h, p0/z, z8.h, z8.h
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z28.h, z28.h
+; CHECK-NEXT:    sel z28.d, p4, z4.d, z12.d
+; CHECK-NEXT:    st1b { z29.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #8
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z25.h, z25.h
+; CHECK-NEXT:    st1b { z13.b }, p1, [x8, x9]
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z24.h, z24.h
+; CHECK-NEXT:    mov z2.d, p5/m, z4.d
+; CHECK-NEXT:    mov z3.d, p3/m, z4.d
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z0.h, z7.h
+; CHECK-NEXT:    mov z28.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p7.h, p0/z, z6.h, z6.h
+; CHECK-NEXT:    mov z10.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p2.h, p0/z, z5.h, z5.h
+; CHECK-NEXT:    sel z5.d, p6, z4.d, z27.d
+; CHECK-NEXT:    mov z3.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    mov z9.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z28.d }, p0, [x8, #7, mul vl]
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z0.h, z0.h
+; CHECK-NEXT:    sel z0.d, p3, z4.d, z26.d
+; CHECK-NEXT:    st1d { z10.d }, p0, [x8, #6, mul vl]
+; CHECK-NEXT:    mov z2.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z3.d }, p0, [x8, #5, mul vl]
+; CHECK-NEXT:    mov z5.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z9.d }, p0, [x8, #4, mul vl]
+; CHECK-NEXT:    mov z30.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z2.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z30.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #17
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 32 x iXLen> @llvm.lrint.nxv32iXLen.nxv32f16(<vscale x 32 x half> %x)
+  ret <vscale x 32 x iXLen> %a
+}
+declare <vscale x 32 x iXLen> @llvm.lrint.nxv32iXLen.nxv32f16(<vscale x 32 x half>)
+
+define <vscale x 1 x iXLen> @lrint_v1f32(<vscale x 1 x float> %x) {
+; CHECK-LABEL: lrint_v1f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
+; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z3.s
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ret
+  %a = call <vscale x 1 x iXLen> @llvm.lrint.nxv1iXLen.nxv1f32(<vscale x 1 x float> %x)
+  ret <vscale x 1 x iXLen> %a
+}
+declare <vscale x 1 x iXLen> @llvm.lrint.nxv1iXLen.nxv1f32(<vscale x 1 x float>)
+
+define <vscale x 2 x iXLen> @lrint_v2f32(<vscale x 2 x float> %x) {
+; CHECK-LABEL: lrint_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
+; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z3.s
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ret
+  %a = call <vscale x 2 x iXLen> @llvm.lrint.nxv2iXLen.nxv2f32(<vscale x 2 x float> %x)
+  ret <vscale x 2 x iXLen> %a
+}
+declare <vscale x 2 x iXLen> @llvm.lrint.nxv2iXLen.nxv2f32(<vscale x 2 x float>)
+
+define <vscale x 4 x iXLen> @lrint_v4f32(<vscale x 4 x float> %x) {
+; CHECK-LABEL: lrint_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
+; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    frintx z1.s, p0/m, z1.s
+; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    movprfx z4, z1
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z1.s
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.s
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z1.s, z3.s
+; CHECK-NEXT:    fcmgt p4.s, p0/z, z0.s, z3.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
+; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
+; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 4 x iXLen> @llvm.lrint.nxv4iXLen.nxv4f32(<vscale x 4 x float> %x)
+  ret <vscale x 4 x iXLen> %a
+}
+declare <vscale x 4 x iXLen> @llvm.lrint.nxv4iXLen.nxv4f32(<vscale x 4 x float>)
+
+define <vscale x 8 x iXLen> @lrint_v8f32(<vscale x 8 x float> %x) {
+; CHECK-LABEL: lrint_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    uunpklo z2.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
+; CHECK-NEXT:    uunpklo z3.d, z1.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
+; CHECK-NEXT:    mov z5.d, #0x8000000000000000
+; CHECK-NEXT:    mov z6.s, w8
+; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
+; CHECK-NEXT:    frintx z2.s, p0/m, z2.s
+; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
+; CHECK-NEXT:    frintx z3.s, p0/m, z3.s
+; CHECK-NEXT:    frintx z1.s, p0/m, z1.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z2.s, z4.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z4.s
+; CHECK-NEXT:    movprfx z7, z0
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z0.s
+; CHECK-NEXT:    fcmge p3.s, p0/z, z3.s, z4.s
+; CHECK-NEXT:    fcmge p4.s, p0/z, z1.s, z4.s
+; CHECK-NEXT:    movprfx z4, z2
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z2.s
+; CHECK-NEXT:    movprfx z24, z3
+; CHECK-NEXT:    fcvtzs z24.d, p0/m, z3.s
+; CHECK-NEXT:    movprfx z25, z1
+; CHECK-NEXT:    fcvtzs z25.d, p0/m, z1.s
+; CHECK-NEXT:    fcmgt p7.s, p0/z, z3.s, z6.s
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z2.s, z6.s
+; CHECK-NEXT:    fcmgt p6.s, p0/z, z0.s, z6.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    mov z4.d, p1/m, z5.d
+; CHECK-NEXT:    fcmgt p1.s, p0/z, z1.s, z6.s
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    sel z6.d, p2, z5.d, z7.d
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z2.s, z2.s
+; CHECK-NEXT:    sel z7.d, p3, z5.d, z24.d
+; CHECK-NEXT:    fcmuo p3.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    sel z5.d, p4, z5.d, z25.d
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z3.s, z3.s
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    sel z0.d, p5, z26.d, z4.d
+; CHECK-NEXT:    sel z1.d, p6, z26.d, z6.d
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z2.d, p7, z26.d, z7.d
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z3.d, p1, z26.d, z5.d
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 8 x iXLen> @llvm.lrint.nxv8iXLen.nxv8f32(<vscale x 8 x float> %x)
+  ret <vscale x 8 x iXLen> %a
+}
+declare <vscale x 8 x iXLen> @llvm.lrint.nxv8iXLen.nxv8f32(<vscale x 8 x float>)
+
+define <vscale x 16 x iXLen> @lrint_v16f32(<vscale x 16 x float> %x) {
+; CHECK-LABEL: lrint_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    uunpklo z4.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z7.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z24.d, z2.s
+; CHECK-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEXT:    uunpklo z25.d, z3.s
+; CHECK-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
+; CHECK-NEXT:    movprfx z5, z4
+; CHECK-NEXT:    frintx z5.s, p0/m, z4.s
+; CHECK-NEXT:    movprfx z6, z0
+; CHECK-NEXT:    frintx z6.s, p0/m, z0.s
+; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    frintx z7.s, p0/m, z7.s
+; CHECK-NEXT:    movprfx z28, z1
+; CHECK-NEXT:    frintx z28.s, p0/m, z1.s
+; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    frintx z24.s, p0/m, z24.s
+; CHECK-NEXT:    movprfx z29, z2
+; CHECK-NEXT:    frintx z29.s, p0/m, z2.s
+; CHECK-NEXT:    frintx z25.s, p0/m, z25.s
+; CHECK-NEXT:    movprfx z30, z3
+; CHECK-NEXT:    frintx z30.s, p0/m, z3.s
+; CHECK-NEXT:    mov z27.s, w8
+; CHECK-NEXT:    fcmge p1.s, p0/z, z5.s, z4.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z6.s, z4.s
+; CHECK-NEXT:    movprfx z1, z5
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z5.s
+; CHECK-NEXT:    movprfx z2, z6
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z6.s
+; CHECK-NEXT:    fcmge p5.s, p0/z, z7.s, z4.s
+; CHECK-NEXT:    fcmge p6.s, p0/z, z28.s, z4.s
+; CHECK-NEXT:    movprfx z3, z7
+; CHECK-NEXT:    fcvtzs z3.d, p0/m, z7.s
+; CHECK-NEXT:    fcmge p8.s, p0/z, z29.s, z4.s
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z5.s, z27.s
+; CHECK-NEXT:    fcmgt p7.s, p0/z, z6.s, z27.s
+; CHECK-NEXT:    fcmge p9.s, p0/z, z25.s, z4.s
+; CHECK-NEXT:    movprfx z31, z25
+; CHECK-NEXT:    fcvtzs z31.d, p0/m, z25.s
+; CHECK-NEXT:    not p4.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z5.s, z5.s
+; CHECK-NEXT:    movprfx z5, z28
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z28.s
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    fcmge p10.s, p0/z, z30.s, z4.s
+; CHECK-NEXT:    movprfx z8, z30
+; CHECK-NEXT:    fcvtzs z8.d, p0/m, z30.s
+; CHECK-NEXT:    mov z1.d, p4/m, z0.d
+; CHECK-NEXT:    fcmge p4.s, p0/z, z24.s, z4.s
+; CHECK-NEXT:    movprfx z4, z29
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z29.s
+; CHECK-NEXT:    mov z2.d, p2/m, z0.d
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z6.s, z6.s
+; CHECK-NEXT:    movprfx z6, z24
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z24.s
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    not p6.b, p0/z, p6.b
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z3.d, p5/m, z0.d
+; CHECK-NEXT:    not p5.b, p0/z, p8.b
+; CHECK-NEXT:    mov z5.d, p6/m, z0.d
+; CHECK-NEXT:    fcmgt p8.s, p0/z, z7.s, z27.s
+; CHECK-NEXT:    not p6.b, p0/z, p9.b
+; CHECK-NEXT:    mov z6.d, p4/m, z0.d
+; CHECK-NEXT:    fcmuo p9.s, p0/z, z7.s, z7.s
+; CHECK-NEXT:    not p4.b, p0/z, p10.b
+; CHECK-NEXT:    fcmgt p10.s, p0/z, z28.s, z27.s
+; CHECK-NEXT:    sel z7.d, p5, z0.d, z4.d
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z24.s, z27.s
+; CHECK-NEXT:    mov z31.d, p6/m, z0.d
+; CHECK-NEXT:    fcmgt p6.s, p0/z, z30.s, z27.s
+; CHECK-NEXT:    mov z8.d, p4/m, z0.d
+; CHECK-NEXT:    sel z0.d, p3, z26.d, z1.d
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z29.s, z27.s
+; CHECK-NEXT:    fcmgt p4.s, p0/z, z25.s, z27.s
+; CHECK-NEXT:    sel z1.d, p7, z26.d, z2.d
+; CHECK-NEXT:    fcmuo p7.s, p0/z, z28.s, z28.s
+; CHECK-NEXT:    sel z2.d, p8, z26.d, z3.d
+; CHECK-NEXT:    sel z3.d, p10, z26.d, z5.d
+; CHECK-NEXT:    fcmuo p8.s, p0/z, z29.s, z29.s
+; CHECK-NEXT:    sel z4.d, p5, z26.d, z6.d
+; CHECK-NEXT:    fcmuo p5.s, p0/z, z24.s, z24.s
+; CHECK-NEXT:    fcmuo p10.s, p0/z, z25.s, z25.s
+; CHECK-NEXT:    sel z5.d, p3, z26.d, z7.d
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z30.s, z30.s
+; CHECK-NEXT:    sel z7.d, p6, z26.d, z8.d
+; CHECK-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z6.d, p4, z26.d, z31.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p9/m, #0 // =0x0
+; CHECK-NEXT:    mov z3.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z4.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    mov z5.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z6.d, p10/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z7.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 16 x iXLen> @llvm.lrint.nxv16iXLen.nxv16f32(<vscale x 16 x float> %x)
+  ret <vscale x 16 x iXLen> %a
+}
+declare <vscale x 16 x iXLen> @llvm.lrint.nxv16iXLen.nxv16f32(<vscale x 16 x float>)
+
+define <vscale x 32 x iXLen> @lrint_v32f32(<vscale x 32 x float> %x) {
+; CHECK-LABEL: lrint_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-17
+; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 136 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT:    uunpklo z24.d, z0.s
+; CHECK-NEXT:    uunpkhi z25.d, z0.s
+; CHECK-NEXT:    mov w9, #-553648128 // =0xdf000000
+; CHECK-NEXT:    uunpklo z26.d, z1.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpkhi z27.d, z1.s
+; CHECK-NEXT:    mov z31.s, w9
+; CHECK-NEXT:    mov w9, #1593835519 // =0x5effffff
+; CHECK-NEXT:    uunpklo z28.d, z2.s
+; CHECK-NEXT:    mov z8.d, #0x8000000000000000
+; CHECK-NEXT:    uunpklo z30.d, z3.s
+; CHECK-NEXT:    uunpklo z13.d, z4.s
+; CHECK-NEXT:    movprfx z0, z24
+; CHECK-NEXT:    frintx z0.s, p0/m, z24.s
+; CHECK-NEXT:    movprfx z1, z25
+; CHECK-NEXT:    frintx z1.s, p0/m, z25.s
+; CHECK-NEXT:    uunpkhi z15.d, z4.s
+; CHECK-NEXT:    movprfx z24, z26
+; CHECK-NEXT:    frintx z24.s, p0/m, z26.s
+; CHECK-NEXT:    uunpkhi z26.d, z2.s
+; CHECK-NEXT:    movprfx z25, z27
+; CHECK-NEXT:    frintx z25.s, p0/m, z27.s
+; CHECK-NEXT:    movprfx z27, z28
+; CHECK-NEXT:    frintx z27.s, p0/m, z28.s
+; CHECK-NEXT:    uunpklo z16.d, z5.s
+; CHECK-NEXT:    uunpkhi z17.d, z7.s
+; CHECK-NEXT:    frintx z30.s, p0/m, z30.s
+; CHECK-NEXT:    uunpklo z18.d, z7.s
+; CHECK-NEXT:    uunpklo z21.d, z6.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z31.s
+; CHECK-NEXT:    movprfx z9, z0
+; CHECK-NEXT:    fcvtzs z9.d, p0/m, z0.s
+; CHECK-NEXT:    movprfx z10, z1
+; CHECK-NEXT:    fcvtzs z10.d, p0/m, z1.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z1.s, z31.s
+; CHECK-NEXT:    fcmge p3.s, p0/z, z24.s, z31.s
+; CHECK-NEXT:    movprfx z11, z24
+; CHECK-NEXT:    fcvtzs z11.d, p0/m, z24.s
+; CHECK-NEXT:    movprfx z29, z26
+; CHECK-NEXT:    frintx z29.s, p0/m, z26.s
+; CHECK-NEXT:    fcmge p4.s, p0/z, z25.s, z31.s
+; CHECK-NEXT:    fcmge p5.s, p0/z, z27.s, z31.s
+; CHECK-NEXT:    movprfx z12, z27
+; CHECK-NEXT:    fcvtzs z12.d, p0/m, z27.s
+; CHECK-NEXT:    movprfx z19, z30
+; CHECK-NEXT:    fcvtzs z19.d, p0/m, z30.s
+; CHECK-NEXT:    movprfx z7, z16
+; CHECK-NEXT:    frintx z7.s, p0/m, z16.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    frintx z17.s, p0/m, z17.s
+; CHECK-NEXT:    uunpkhi z16.d, z5.s
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    frintx z18.s, p0/m, z18.s
+; CHECK-NEXT:    mov z28.s, w9
+; CHECK-NEXT:    not p6.b, p0/z, p3.b
+; CHECK-NEXT:    sel z26.d, p1, z8.d, z9.d
+; CHECK-NEXT:    movprfx z14, z29
+; CHECK-NEXT:    fcvtzs z14.d, p0/m, z29.s
+; CHECK-NEXT:    sel z9.d, p2, z8.d, z10.d
+; CHECK-NEXT:    uunpkhi z10.d, z3.s
+; CHECK-NEXT:    rdvl x9, #15
+; CHECK-NEXT:    sel z3.d, p6, z8.d, z11.d
+; CHECK-NEXT:    movprfx z11, z25
+; CHECK-NEXT:    fcvtzs z11.d, p0/m, z25.s
+; CHECK-NEXT:    fcmge p3.s, p0/z, z29.s, z31.s
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    fcmge p1.s, p0/z, z30.s, z31.s
+; CHECK-NEXT:    movprfx z23, z18
+; CHECK-NEXT:    fcvtzs z23.d, p0/m, z18.s
+; CHECK-NEXT:    not p2.b, p0/z, p5.b
+; CHECK-NEXT:    fcmge p5.s, p0/z, z17.s, z31.s
+; CHECK-NEXT:    frintx z16.s, p0/m, z16.s
+; CHECK-NEXT:    frintx z10.s, p0/m, z10.s
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    fcmgt p8.s, p0/z, z18.s, z28.s
+; CHECK-NEXT:    sel z4.d, p4, z8.d, z11.d
+; CHECK-NEXT:    movprfx z11, z13
+; CHECK-NEXT:    frintx z11.s, p0/m, z13.s
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    sel z13.d, p2, z8.d, z12.d
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmge p4.s, p0/z, z7.s, z31.s
+; CHECK-NEXT:    sel z12.d, p3, z8.d, z14.d
+; CHECK-NEXT:    movprfx z14, z15
+; CHECK-NEXT:    frintx z14.s, p0/m, z15.s
+; CHECK-NEXT:    uunpkhi z15.d, z6.s
+; CHECK-NEXT:    movprfx z20, z10
+; CHECK-NEXT:    fcvtzs z20.d, p0/m, z10.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z10.s, z31.s
+; CHECK-NEXT:    sel z5.d, p1, z8.d, z19.d
+; CHECK-NEXT:    movprfx z19, z11
+; CHECK-NEXT:    fcvtzs z19.d, p0/m, z11.s
+; CHECK-NEXT:    fcmge p3.s, p0/z, z11.s, z31.s
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    fcmge p6.s, p0/z, z16.s, z31.s
+; CHECK-NEXT:    fcmuo p9.s, p0/z, z18.s, z18.s
+; CHECK-NEXT:    movprfx z22, z15
+; CHECK-NEXT:    frintx z22.s, p0/m, z15.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z14.s, z31.s
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    sel z6.d, p2, z8.d, z20.d
+; CHECK-NEXT:    movprfx z20, z21
+; CHECK-NEXT:    frintx z20.s, p0/m, z21.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z18.s, z31.s
+; CHECK-NEXT:    sel z15.d, p3, z8.d, z19.d
+; CHECK-NEXT:    movprfx z19, z17
+; CHECK-NEXT:    fcvtzs z19.d, p0/m, z17.s
+; CHECK-NEXT:    not p3.b, p0/z, p4.b
+; CHECK-NEXT:    fcmge p4.s, p0/z, z22.s, z31.s
+; CHECK-NEXT:    movprfx z21, z14
+; CHECK-NEXT:    fcvtzs z21.d, p0/m, z14.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    movprfx z18, z7
+; CHECK-NEXT:    fcvtzs z18.d, p0/m, z7.s
+; CHECK-NEXT:    not p6.b, p0/z, p6.b
+; CHECK-NEXT:    fcmge p7.s, p0/z, z20.s, z31.s
+; CHECK-NEXT:    movprfx z31, z22
+; CHECK-NEXT:    fcvtzs z31.d, p0/m, z22.s
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    mov z19.d, p5/m, z8.d
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z17.s, z28.s
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z23.d, p2/m, z8.d
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z17.s, z17.s
+; CHECK-NEXT:    movprfx z17, z20
+; CHECK-NEXT:    fcvtzs z17.d, p0/m, z20.s
+; CHECK-NEXT:    mov z21.d, p1/m, z8.d
+; CHECK-NEXT:    mov z18.d, p3/m, z8.d
+; CHECK-NEXT:    not p1.b, p0/z, p7.b
+; CHECK-NEXT:    mov z31.d, p4/m, z8.d
+; CHECK-NEXT:    fcmgt p4.s, p0/z, z20.s, z28.s
+; CHECK-NEXT:    mov z19.d, p5/m, z2.d
+; CHECK-NEXT:    fcmuo p7.s, p0/z, z20.s, z20.s
+; CHECK-NEXT:    movprfx z20, z16
+; CHECK-NEXT:    fcvtzs z20.d, p0/m, z16.s
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z22.s, z28.s
+; CHECK-NEXT:    mov z23.d, p8/m, z2.d
+; CHECK-NEXT:    fcmuo p3.s, p0/z, z22.s, z22.s
+; CHECK-NEXT:    mov z17.d, p1/m, z8.d
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    mov z19.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z16.s, z28.s
+; CHECK-NEXT:    sel z8.d, p6, z8.d, z20.d
+; CHECK-NEXT:    mov z23.d, p9/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p6.s, p0/z, z14.s, z28.s
+; CHECK-NEXT:    mov z31.d, p5/m, z2.d
+; CHECK-NEXT:    mov z17.d, p4/m, z2.d
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z16.s, z16.s
+; CHECK-NEXT:    st1b { z19.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #14
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z1.s, z28.s
+; CHECK-NEXT:    st1b { z23.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #13
+; CHECK-NEXT:    mov z8.d, p2/m, z2.d
+; CHECK-NEXT:    mov z31.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z7.s, z28.s
+; CHECK-NEXT:    mov z17.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z11.s, z28.s
+; CHECK-NEXT:    fcmuo p7.s, p0/z, z14.s, z14.s
+; CHECK-NEXT:    mov z8.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z7.s, z7.s
+; CHECK-NEXT:    sel z7.d, p5, z2.d, z9.d
+; CHECK-NEXT:    st1b { z31.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #12
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z27.s, z28.s
+; CHECK-NEXT:    st1b { z17.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #11
+; CHECK-NEXT:    sel z31.d, p3, z2.d, z18.d
+; CHECK-NEXT:    st1b { z8.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #10
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z30.s, z28.s
+; CHECK-NEXT:    sel z9.d, p2, z2.d, z15.d
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z11.s, z11.s
+; CHECK-NEXT:    sel z8.d, p6, z2.d, z21.d
+; CHECK-NEXT:    mov z31.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p4.s, p0/z, z10.s, z28.s
+; CHECK-NEXT:    fcmgt p6.s, p0/z, z24.s, z28.s
+; CHECK-NEXT:    sel z11.d, p5, z2.d, z13.d
+; CHECK-NEXT:    fcmgt p5.s, p0/z, z25.s, z28.s
+; CHECK-NEXT:    mov z8.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    mov z5.d, p3/m, z2.d
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z29.s, z28.s
+; CHECK-NEXT:    st1b { z31.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #9
+; CHECK-NEXT:    mov z9.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p7.s, p0/z, z10.s, z10.s
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z30.s, z30.s
+; CHECK-NEXT:    mov z6.d, p4/m, z2.d
+; CHECK-NEXT:    st1b { z8.b }, p1, [x8, x9]
+; CHECK-NEXT:    rdvl x9, #8
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z29.s, z29.s
+; CHECK-NEXT:    st1b { z9.b }, p1, [x8, x9]
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z27.s, z27.s
+; CHECK-NEXT:    sel z27.d, p3, z2.d, z12.d
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z0.s, z28.s
+; CHECK-NEXT:    mov z4.d, p5/m, z2.d
+; CHECK-NEXT:    mov z3.d, p6/m, z2.d
+; CHECK-NEXT:    mov z6.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p7.s, p0/z, z25.s, z25.s
+; CHECK-NEXT:    mov z5.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p2.s, p0/z, z24.s, z24.s
+; CHECK-NEXT:    mov z27.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    mov z11.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    st1d { z6.d }, p0, [x8, #7, mul vl]
+; CHECK-NEXT:    sel z0.d, p3, z2.d, z26.d
+; CHECK-NEXT:    st1d { z5.d }, p0, [x8, #6, mul vl]
+; CHECK-NEXT:    mov z4.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z27.d }, p0, [x8, #5, mul vl]
+; CHECK-NEXT:    mov z3.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z7.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z11.d }, p0, [x8, #4, mul vl]
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z4.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1d { z3.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z7.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #17
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 32 x iXLen> @llvm.lrint.nxv32iXLen.nxv32f32(<vscale x 32 x float> %x)
+  ret <vscale x 32 x iXLen> %a
+}
+declare <vscale x 32 x iXLen> @llvm.lrint.nxv32iXLen.nxv32f32(<vscale x 32 x float>)
+
+define <vscale x 1 x iXLen> @lrint_v1f64(<vscale x 1 x double> %x) {
+; CHECK-LABEL: lrint_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z3.d
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ret
+  %a = call <vscale x 1 x iXLen> @llvm.lrint.nxv1iXLen.nxv1f64(<vscale x 1 x double> %x)
+  ret <vscale x 1 x iXLen> %a
+}
+declare <vscale x 1 x iXLen> @llvm.lrint.nxv1iXLen.nxv1f64(<vscale x 1 x double>)
+
+define <vscale x 2 x iXLen> @lrint_v2f64(<vscale x 2 x double> %x) {
+; CHECK-LABEL: lrint_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z3.d
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ret
+  %a = call <vscale x 2 x iXLen> @llvm.lrint.nxv2iXLen.nxv2f64(<vscale x 2 x double> %x)
+  ret <vscale x 2 x iXLen> %a
+}
+declare <vscale x 2 x iXLen> @llvm.lrint.nxv2iXLen.nxv2f64(<vscale x 2 x double>)
+
+define <vscale x 4 x iXLen> @lrint_v4f64(<vscale x 4 x double> %x) {
+; CHECK-LABEL: lrint_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
+; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z2.d
+; CHECK-NEXT:    mov z2.d, #0x8000000000000000
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
+; CHECK-NEXT:    movprfx z5, z1
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z1.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z0.d, z3.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z1.d, z3.d
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    sel z3.d, p1, z2.d, z4.d
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    fcmuo p0.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    sel z2.d, p2, z2.d, z5.d
+; CHECK-NEXT:    sel z0.d, p3, z6.d, z3.d
+; CHECK-NEXT:    sel z1.d, p4, z6.d, z2.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 4 x iXLen> @llvm.lrint.nxv4iXLen.nxv4f64(<vscale x 4 x double> %x)
+  ret <vscale x 4 x iXLen> %a
+}
+declare <vscale x 4 x iXLen> @llvm.lrint.nxv4iXLen.nxv4f64(<vscale x 4 x double>)
+
+define <vscale x 8 x iXLen> @lrint_v8f64(<vscale x 8 x double> %x) {
+; CHECK-LABEL: lrint_v8f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT:    mov z5.d, #0x8000000000000000
+; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    mov z26.d, #0x7fffffffffffffff
+; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
+; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
+; CHECK-NEXT:    frintx z2.d, p0/m, z2.d
+; CHECK-NEXT:    frintx z3.d, p0/m, z3.d
+; CHECK-NEXT:    mov z6.d, x8
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z4.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z4.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z2.d, z4.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z3.d, z4.d
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
+; CHECK-NEXT:    movprfx z7, z1
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z1.d
+; CHECK-NEXT:    movprfx z24, z2
+; CHECK-NEXT:    fcvtzs z24.d, p0/m, z2.d
+; CHECK-NEXT:    movprfx z25, z3
+; CHECK-NEXT:    fcvtzs z25.d, p0/m, z3.d
+; CHECK-NEXT:    fcmgt p7.d, p0/z, z2.d, z6.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z0.d, z6.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z1.d, z6.d
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    mov z4.d, p1/m, z5.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z6.d
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    sel z6.d, p2, z5.d, z7.d
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    sel z7.d, p3, z5.d, z24.d
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    sel z5.d, p4, z5.d, z25.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z2.d, z2.d
+; CHECK-NEXT:    fcmuo p0.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    sel z0.d, p5, z26.d, z4.d
+; CHECK-NEXT:    sel z1.d, p6, z26.d, z6.d
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z2.d, p7, z26.d, z7.d
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z3.d, p1, z26.d, z5.d
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 8 x iXLen> @llvm.lrint.nxv8iXLen.nxv8f64(<vscale x 8 x double> %x)
+  ret <vscale x 8 x iXLen> %a
+}
+declare <vscale x 8 x iXLen> @llvm.lrint.nxv8iXLen.nxv8f64(<vscale x 8 x double>)
+
+define <vscale x 16 x iXLen> @lrint_v16f64(<vscale x 16 x double> %x) {
+; CHECK-LABEL: lrint_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT:    mov z24.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z25.d, x8
+; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    movprfx z26, z0
+; CHECK-NEXT:    frintx z26.d, p0/m, z0.d
+; CHECK-NEXT:    movprfx z27, z1
+; CHECK-NEXT:    frintx z27.d, p0/m, z1.d
+; CHECK-NEXT:    frintx z2.d, p0/m, z2.d
+; CHECK-NEXT:    mov z0.d, #0x8000000000000000
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    frintx z3.d, p0/m, z3.d
+; CHECK-NEXT:    movprfx z28, z4
+; CHECK-NEXT:    frintx z28.d, p0/m, z4.d
+; CHECK-NEXT:    frintx z5.d, p0/m, z5.d
+; CHECK-NEXT:    frintx z6.d, p0/m, z6.d
+; CHECK-NEXT:    frintx z7.d, p0/m, z7.d
+; CHECK-NEXT:    fcmge p1.d, p0/z, z26.d, z25.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z27.d, z25.d
+; CHECK-NEXT:    movprfx z4, z26
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z26.d
+; CHECK-NEXT:    fcmge p5.d, p0/z, z2.d, z25.d
+; CHECK-NEXT:    movprfx z29, z27
+; CHECK-NEXT:    fcvtzs z29.d, p0/m, z27.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z26.d, z1.d
+; CHECK-NEXT:    fcmge p6.d, p0/z, z3.d, z25.d
+; CHECK-NEXT:    fcmge p8.d, p0/z, z5.d, z25.d
+; CHECK-NEXT:    fcmgt p7.d, p0/z, z27.d, z1.d
+; CHECK-NEXT:    fcmge p9.d, p0/z, z6.d, z25.d
+; CHECK-NEXT:    movprfx z30, z28
+; CHECK-NEXT:    fcvtzs z30.d, p0/m, z28.d
+; CHECK-NEXT:    fcmge p10.d, p0/z, z7.d, z25.d
+; CHECK-NEXT:    not p4.b, p0/z, p1.b
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z26.d, z26.d
+; CHECK-NEXT:    movprfx z26, z2
+; CHECK-NEXT:    fcvtzs z26.d, p0/m, z2.d
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    movprfx z31, z6
+; CHECK-NEXT:    fcvtzs z31.d, p0/m, z6.d
+; CHECK-NEXT:    movprfx z8, z7
+; CHECK-NEXT:    fcvtzs z8.d, p0/m, z7.d
+; CHECK-NEXT:    mov z4.d, p4/m, z0.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z28.d, z25.d
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    mov z29.d, p2/m, z0.d
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z27.d, z27.d
+; CHECK-NEXT:    movprfx z27, z3
+; CHECK-NEXT:    fcvtzs z27.d, p0/m, z3.d
+; CHECK-NEXT:    sel z25.d, p5, z0.d, z26.d
+; CHECK-NEXT:    movprfx z26, z5
+; CHECK-NEXT:    fcvtzs z26.d, p0/m, z5.d
+; CHECK-NEXT:    not p6.b, p0/z, p6.b
+; CHECK-NEXT:    not p5.b, p0/z, p8.b
+; CHECK-NEXT:    fcmgt p8.d, p0/z, z2.d, z1.d
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z27.d, p6/m, z0.d
+; CHECK-NEXT:    not p6.b, p0/z, p9.b
+; CHECK-NEXT:    fcmuo p9.d, p0/z, z2.d, z2.d
+; CHECK-NEXT:    mov z30.d, p4/m, z0.d
+; CHECK-NEXT:    not p4.b, p0/z, p10.b
+; CHECK-NEXT:    fcmgt p10.d, p0/z, z3.d, z1.d
+; CHECK-NEXT:    mov z26.d, p5/m, z0.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z28.d, z1.d
+; CHECK-NEXT:    mov z31.d, p6/m, z0.d
+; CHECK-NEXT:    mov z8.d, p4/m, z0.d
+; CHECK-NEXT:    sel z0.d, p3, z24.d, z4.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z5.d, z1.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z6.d, z1.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z7.d, z1.d
+; CHECK-NEXT:    sel z1.d, p7, z24.d, z29.d
+; CHECK-NEXT:    fcmuo p7.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    sel z2.d, p8, z24.d, z25.d
+; CHECK-NEXT:    sel z3.d, p10, z24.d, z27.d
+; CHECK-NEXT:    sel z4.d, p5, z24.d, z30.d
+; CHECK-NEXT:    fcmuo p5.d, p0/z, z28.d, z28.d
+; CHECK-NEXT:    fcmuo p8.d, p0/z, z5.d, z5.d
+; CHECK-NEXT:    fcmuo p10.d, p0/z, z6.d, z6.d
+; CHECK-NEXT:    sel z5.d, p3, z24.d, z26.d
+; CHECK-NEXT:    fcmuo p0.d, p0/z, z7.d, z7.d
+; CHECK-NEXT:    sel z6.d, p4, z24.d, z31.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z7.d, p6, z24.d, z8.d
+; CHECK-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p9/m, #0 // =0x0
+; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z3.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    mov z4.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z5.d, p8/m, #0 // =0x0
+; CHECK-NEXT:    mov z6.d, p10/m, #0 // =0x0
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z7.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 16 x iXLen> @llvm.lrint.nxv16iXLen.nxv16f64(<vscale x 16 x double> %x)
+  ret <vscale x 16 x iXLen> %a
+}
+declare <vscale x 16 x iXLen> @llvm.lrint.nxv16iXLen.nxv16f64(<vscale x 16 x double>)
+
+define <vscale x 32 x iXLen> @lrint_v32f64(<vscale x 32 x double> %x) {
+; CHECK-LABEL: lrint_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-12
+; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z18, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z17, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z16, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z15, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z14, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z13, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 96 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    rdvl x9, #8
+; CHECK-NEXT:    rdvl x10, #9
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    rdvl x11, #10
+; CHECK-NEXT:    mov x12, #-4332462841530417152 // =0xc3e0000000000000
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x0, x9]
+; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x0, x10]
+; CHECK-NEXT:    mov z2.d, x12
+; CHECK-NEXT:    rdvl x14, #13
+; CHECK-NEXT:    rdvl x13, #12
+; CHECK-NEXT:    rdvl x12, #11
+; CHECK-NEXT:    ld1b { z6.b }, p1/z, [x0, x14]
+; CHECK-NEXT:    ld1b { z7.b }, p1/z, [x0, x13]
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
+; CHECK-NEXT:    movprfx z24, z0
+; CHECK-NEXT:    frintx z24.d, p0/m, z0.d
+; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x0, x11]
+; CHECK-NEXT:    movprfx z5, z1
+; CHECK-NEXT:    frintx z5.d, p0/m, z1.d
+; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x0, x12]
+; CHECK-NEXT:    mov x15, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    rdvl x16, #15
+; CHECK-NEXT:    movprfx z30, z6
+; CHECK-NEXT:    frintx z30.d, p0/m, z6.d
+; CHECK-NEXT:    movprfx z28, z7
+; CHECK-NEXT:    frintx z28.d, p0/m, z7.d
+; CHECK-NEXT:    ld1b { z8.b }, p1/z, [x0, x16]
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    frintx z4.d, p0/m, z0.d
+; CHECK-NEXT:    mov z0.d, #0x7fffffffffffffff
+; CHECK-NEXT:    ld1d { z18.d }, p0/z, [x0]
+; CHECK-NEXT:    fcmge p3.d, p0/z, z5.d, z2.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z24.d, z2.d
+; CHECK-NEXT:    movprfx z6, z5
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z5.d
+; CHECK-NEXT:    movprfx z27, z1
+; CHECK-NEXT:    frintx z27.d, p0/m, z1.d
+; CHECK-NEXT:    movprfx z25, z24
+; CHECK-NEXT:    fcvtzs z25.d, p0/m, z24.d
+; CHECK-NEXT:    mov z1.d, x15
+; CHECK-NEXT:    rdvl x15, #14
+; CHECK-NEXT:    movprfx z9, z28
+; CHECK-NEXT:    fcvtzs z9.d, p0/m, z28.d
+; CHECK-NEXT:    movprfx z13, z8
+; CHECK-NEXT:    frintx z13.d, p0/m, z8.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z4.d, z2.d
+; CHECK-NEXT:    movprfx z7, z4
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z4.d
+; CHECK-NEXT:    ld1d { z15.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z24.d, z1.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z5.d, z1.d
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    fcmge p7.d, p0/z, z27.d, z2.d
+; CHECK-NEXT:    movprfx z26, z27
+; CHECK-NEXT:    fcvtzs z26.d, p0/m, z27.d
+; CHECK-NEXT:    sel z29.d, p3, z3.d, z6.d
+; CHECK-NEXT:    ld1b { z6.b }, p1/z, [x0, x15]
+; CHECK-NEXT:    fcmge p3.d, p0/z, z28.d, z2.d
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z25.d, p2/m, z3.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z4.d, z1.d
+; CHECK-NEXT:    movprfx z16, z13
+; CHECK-NEXT:    fcvtzs z16.d, p0/m, z13.d
+; CHECK-NEXT:    ld1d { z17.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1d { z14.d }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT:    sel z31.d, p4, z3.d, z7.d
+; CHECK-NEXT:    movprfx z11, z6
+; CHECK-NEXT:    frintx z11.d, p0/m, z6.d
+; CHECK-NEXT:    not p7.b, p0/z, p7.b
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    sel z6.d, p5, z0.d, z25.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z27.d, z1.d
+; CHECK-NEXT:    sel z7.d, p6, z0.d, z29.d
+; CHECK-NEXT:    mov z26.d, p7/m, z3.d
+; CHECK-NEXT:    fcmge p5.d, p0/z, z13.d, z2.d
+; CHECK-NEXT:    sel z25.d, p2, z0.d, z31.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z30.d, z2.d
+; CHECK-NEXT:    sel z29.d, p3, z3.d, z9.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z11.d, z2.d
+; CHECK-NEXT:    movprfx z31, z30
+; CHECK-NEXT:    fcvtzs z31.d, p0/m, z30.d
+; CHECK-NEXT:    movprfx z9, z11
+; CHECK-NEXT:    fcvtzs z9.d, p0/m, z11.d
+; CHECK-NEXT:    mov z26.d, p4/m, z0.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z28.d, z1.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z30.d, z1.d
+; CHECK-NEXT:    not p7.b, p0/z, p5.b
+; CHECK-NEXT:    fcmuo p5.d, p0/z, z27.d, z27.d
+; CHECK-NEXT:    fcmgt p8.d, p0/z, z13.d, z1.d
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    movprfx z27, z18
+; CHECK-NEXT:    frintx z27.d, p0/m, z18.d
+; CHECK-NEXT:    ld1d { z8.d }, p0/z, [x0, #7, mul vl]
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    mov z16.d, p7/m, z3.d
+; CHECK-NEXT:    fcmuo p7.d, p0/z, z13.d, z13.d
+; CHECK-NEXT:    mov z31.d, p2/m, z3.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z11.d, z1.d
+; CHECK-NEXT:    mov z29.d, p4/m, z0.d
+; CHECK-NEXT:    mov z9.d, p3/m, z3.d
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z28.d, z28.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z30.d, z30.d
+; CHECK-NEXT:    movprfx z28, z17
+; CHECK-NEXT:    frintx z28.d, p0/m, z17.d
+; CHECK-NEXT:    movprfx z30, z15
+; CHECK-NEXT:    frintx z30.d, p0/m, z15.d
+; CHECK-NEXT:    ld1d { z13.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT:    mov z31.d, p6/m, z0.d
+; CHECK-NEXT:    fcmuo p6.d, p0/z, z11.d, z11.d
+; CHECK-NEXT:    sel z11.d, p8, z0.d, z16.d
+; CHECK-NEXT:    mov z9.d, p2/m, z0.d
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z24.d, z24.d
+; CHECK-NEXT:    movprfx z24, z14
+; CHECK-NEXT:    frintx z24.d, p0/m, z14.d
+; CHECK-NEXT:    fcmge p8.d, p0/z, z27.d, z2.d
+; CHECK-NEXT:    ld1d { z10.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT:    ld1d { z12.d }, p0/z, [x0, #5, mul vl]
+; CHECK-NEXT:    mov z26.d, p5/m, #0 // =0x0
+; CHECK-NEXT:    mov z29.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    fcmge p5.d, p0/z, z28.d, z2.d
+; CHECK-NEXT:    movprfx z14, z27
+; CHECK-NEXT:    fcvtzs z14.d, p0/m, z27.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z30.d, z2.d
+; CHECK-NEXT:    frintx z13.d, p0/m, z13.d
+; CHECK-NEXT:    mov z31.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmge p4.d, p0/z, z24.d, z2.d
+; CHECK-NEXT:    mov z9.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    movprfx z15, z28
+; CHECK-NEXT:    fcvtzs z15.d, p0/m, z28.d
+; CHECK-NEXT:    not p6.b, p0/z, p8.b
+; CHECK-NEXT:    movprfx z16, z30
+; CHECK-NEXT:    fcvtzs z16.d, p0/m, z30.d
+; CHECK-NEXT:    frintx z12.d, p0/m, z12.d
+; CHECK-NEXT:    frintx z10.d, p0/m, z10.d
+; CHECK-NEXT:    movprfx z17, z24
+; CHECK-NEXT:    fcvtzs z17.d, p0/m, z24.d
+; CHECK-NEXT:    movprfx z18, z8
+; CHECK-NEXT:    frintx z18.d, p0/m, z8.d
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    sel z8.d, p6, z3.d, z14.d
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    fcmge p6.d, p0/z, z13.d, z2.d
+; CHECK-NEXT:    mov z11.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    sel z14.d, p5, z3.d, z15.d
+; CHECK-NEXT:    fcmuo p7.d, p0/z, z5.d, z5.d
+; CHECK-NEXT:    sel z15.d, p3, z3.d, z16.d
+; CHECK-NEXT:    movprfx z16, z13
+; CHECK-NEXT:    fcvtzs z16.d, p0/m, z13.d
+; CHECK-NEXT:    fcmge p5.d, p0/z, z12.d, z2.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z10.d, z2.d
+; CHECK-NEXT:    sel z5.d, p4, z3.d, z17.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z18.d, z2.d
+; CHECK-NEXT:    not p6.b, p0/z, p6.b
+; CHECK-NEXT:    movprfx z2, z12
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z12.d
+; CHECK-NEXT:    movprfx z17, z10
+; CHECK-NEXT:    fcvtzs z17.d, p0/m, z10.d
+; CHECK-NEXT:    st1b { z11.b }, p1, [x8, x16]
+; CHECK-NEXT:    movprfx z11, z18
+; CHECK-NEXT:    fcvtzs z11.d, p0/m, z18.d
+; CHECK-NEXT:    mov z6.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    st1b { z9.b }, p1, [x8, x15]
+; CHECK-NEXT:    sel z9.d, p6, z3.d, z16.d
+; CHECK-NEXT:    fcmuo p6.d, p0/z, z4.d, z4.d
+; CHECK-NEXT:    not p5.b, p0/z, p5.b
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z18.d, z1.d
+; CHECK-NEXT:    mov z7.d, p7/m, #0 // =0x0
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    st1b { z31.b }, p1, [x8, x14]
+; CHECK-NEXT:    fcmgt p7.d, p0/z, z24.d, z1.d
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z2.d, p5/m, z3.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z28.d, z1.d
+; CHECK-NEXT:    sel z4.d, p3, z3.d, z17.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z13.d, z1.d
+; CHECK-NEXT:    mov z25.d, p6/m, #0 // =0x0
+; CHECK-NEXT:    sel z3.d, p4, z3.d, z11.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z10.d, z1.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z12.d, z1.d
+; CHECK-NEXT:    st1b { z29.b }, p1, [x8, x13]
+; CHECK-NEXT:    st1b { z26.b }, p1, [x8, x12]
+; CHECK-NEXT:    sel z26.d, p5, z0.d, z14.d
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z30.d, z1.d
+; CHECK-NEXT:    sel z29.d, p3, z0.d, z9.d
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z18.d, z18.d
+; CHECK-NEXT:    mov z3.d, p2/m, z0.d
+; CHECK-NEXT:    st1b { z25.b }, p1, [x8, x11]
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z10.d, z10.d
+; CHECK-NEXT:    mov z4.d, p4/m, z0.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z12.d, z12.d
+; CHECK-NEXT:    st1b { z7.b }, p1, [x8, x10]
+; CHECK-NEXT:    mov z2.d, p6/m, z0.d
+; CHECK-NEXT:    st1b { z6.b }, p1, [x8, x9]
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z13.d, z13.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z27.d, z1.d
+; CHECK-NEXT:    mov z3.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z24.d, z24.d
+; CHECK-NEXT:    sel z1.d, p7, z0.d, z5.d
+; CHECK-NEXT:    mov z4.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z30.d, z30.d
+; CHECK-NEXT:    sel z5.d, p5, z0.d, z15.d
+; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z28.d, z28.d
+; CHECK-NEXT:    mov z29.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmuo p1.d, p0/z, z27.d, z27.d
+; CHECK-NEXT:    sel z0.d, p6, z0.d, z8.d
+; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z3.d }, p0, [x8, #7, mul vl]
+; CHECK-NEXT:    mov z5.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z4.d }, p0, [x8, #6, mul vl]
+; CHECK-NEXT:    mov z26.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z2.d }, p0, [x8, #5, mul vl]
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    st1d { z29.d }, p0, [x8, #4, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z26.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ldr z18, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z17, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z16, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z15, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #12
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %a = call <vscale x 32 x iXLen> @llvm.lrint.nxv32iXLen.nxv16f64(<vscale x 32 x double> %x)
+  ret <vscale x 32 x iXLen> %a
+}
+declare <vscale x 32 x iXLen> @llvm.lrint.nxv32iXLen.nxv32f64(<vscale x 32 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-reassocadd.ll b/llvm/test/CodeGen/AArch64/sve-reassocadd.ll
index c7261200a567..f54098b29a27 100644
--- a/llvm/test/CodeGen/AArch64/sve-reassocadd.ll
+++ b/llvm/test/CodeGen/AArch64/sve-reassocadd.ll
@@ -22,11 +22,9 @@ entry:
 define <vscale x 16 x i8> @i8_4s_1v(ptr %b) {
 ; CHECK-LABEL: i8_4s_1v:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov w9, #4 // =0x4
-; CHECK-NEXT:    add x8, x0, x8
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8, x9]
+; CHECK-NEXT:    add x8, x0, #4
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8, #1, mul vl]
 ; CHECK-NEXT:    ret
 entry:
   %add.ptr = getelementptr inbounds i8, ptr %b, i64 4
@@ -58,11 +56,9 @@ entry:
 define <vscale x 8 x i16> @i16_8s_1v(ptr %b) {
 ; CHECK-LABEL: i16_8s_1v:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x9, #4 // =0x4
-; CHECK-NEXT:    add x8, x0, x8
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, #1, mul vl]
 ; CHECK-NEXT:    ret
 entry:
   %add.ptr = getelementptr inbounds i8, ptr %b, i64 8
@@ -94,11 +90,9 @@ entry:
 define <vscale x 8 x i16> @i16_8s_2v(ptr %b) {
 ; CHECK-LABEL: i16_8s_2v:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rdvl x8, #2
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x9, #4 // =0x4
-; CHECK-NEXT:    add x8, x0, x8
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, #2, mul vl]
 ; CHECK-NEXT:    ret
 entry:
   %add.ptr = getelementptr inbounds i8, ptr %b, i64 8
@@ -130,11 +124,9 @@ entry:
 define <vscale x 4 x i32> @i32_16s_2v(ptr %b) {
 ; CHECK-LABEL: i32_16s_2v:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x9, #4 // =0x4
-; CHECK-NEXT:    add x8, x0, x8
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
+; CHECK-NEXT:    add x8, x0, #16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, #1, mul vl]
 ; CHECK-NEXT:    ret
 entry:
   %add.ptr = getelementptr inbounds i8, ptr %b, i64 16
@@ -166,11 +158,9 @@ entry:
 define <vscale x 2 x i64> @i64_32s_2v(ptr %b) {
 ; CHECK-LABEL: i64_32s_2v:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x9, #4 // =0x4
-; CHECK-NEXT:    add x8, x0, x8
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
+; CHECK-NEXT:    add x8, x0, #32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, #1, mul vl]
 ; CHECK-NEXT:    ret
 entry:
   %add.ptr = getelementptr inbounds i8, ptr %b, i64 32
@@ -203,11 +193,9 @@ entry:
 define <vscale x 16 x i8> @i8_4s_m2v(ptr %b) {
 ; CHECK-LABEL: i8_4s_m2v:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cnth x8, all, mul #4
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov w9, #4 // =0x4
-; CHECK-NEXT:    sub x8, x0, x8
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8, x9]
+; CHECK-NEXT:    add x8, x0, #4
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8, #-2, mul vl]
 ; CHECK-NEXT:    ret
 entry:
   %add.ptr = getelementptr inbounds i8, ptr %b, i64 4
@@ -239,11 +227,9 @@ entry:
 define <vscale x 8 x i16> @i16_8s_m2v(ptr %b) {
 ; CHECK-LABEL: i16_8s_m2v:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cnth x8, all, mul #4
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x9, #4 // =0x4
-; CHECK-NEXT:    sub x8, x0, x8
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, #-2, mul vl]
 ; CHECK-NEXT:    ret
 entry:
   %add.ptr = getelementptr inbounds i8, ptr %b, i64 8
@@ -275,11 +261,9 @@ entry:
 define <vscale x 4 x i32> @i32_16s_m2v(ptr %b) {
 ; CHECK-LABEL: i32_16s_m2v:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cnth x8, all, mul #4
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x9, #4 // =0x4
-; CHECK-NEXT:    sub x8, x0, x8
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
+; CHECK-NEXT:    add x8, x0, #16
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, #-2, mul vl]
 ; CHECK-NEXT:    ret
 entry:
   %add.ptr = getelementptr inbounds i8, ptr %b, i64 16
@@ -311,11 +295,9 @@ entry:
 define <vscale x 2 x i64> @i64_32s_m2v(ptr %b) {
 ; CHECK-LABEL: i64_32s_m2v:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cnth x8, all, mul #4
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x9, #4 // =0x4
-; CHECK-NEXT:    sub x8, x0, x8
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
+; CHECK-NEXT:    add x8, x0, #32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, #-2, mul vl]
 ; CHECK-NEXT:    ret
 entry:
   %add.ptr = getelementptr inbounds i8, ptr %b, i64 32
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
index fd9259048df5..d81f725eaefc 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -15,12 +14,6 @@ define <4 x i8> @vls_sve_and_4xi8(<4 x i8> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: vls_sve_and_4xi8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff000000ff0000
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i8> %b, <i8 0, i8 255, i8 0, i8 255>
  ret <4 x i8> %c
 }
@@ -34,12 +27,6 @@ define <8 x i8> @vls_sve_and_8xi8(<8 x i8> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: vls_sve_and_8xi8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff00
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i8> %b, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
  ret <8 x i8> %c
 }
@@ -53,12 +40,6 @@ define <16 x i8> @vls_sve_and_16xi8(<16 x i8> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: vls_sve_and_16xi8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0xff00ff00ff00ff00
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
  %c = and <16 x i8> %b, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
  ret <16 x i8> %c
 }
@@ -75,13 +56,6 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: vls_sve_and_32xi8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v2.2d, #0xff00ff00ff00ff00
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
  %b = and <32 x i8> %ap, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255,
                          i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
  ret <32 x i8> %b
@@ -99,13 +73,6 @@ define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: vls_sve_and_2xi16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov v0.s[0], wzr
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i16> %b, <i16 0, i16 65535>
  ret <2 x i16> %c
 }
@@ -119,12 +86,6 @@ define <4 x i16> @vls_sve_and_4xi16(<4 x i16> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: vls_sve_and_4xi16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xffff0000ffff0000
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535>
  ret <4 x i16> %c
 }
@@ -138,12 +99,6 @@ define <8 x i16> @vls_sve_and_8xi16(<8 x i16> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: vls_sve_and_8xi16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0xffff0000ffff0000
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535>
  ret <8 x i16> %c
 }
@@ -160,13 +115,6 @@ define <16 x i16> @vls_sve_and_16xi16(<16 x i16> %b) nounwind {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: vls_sve_and_16xi16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v2.2d, #0xffff0000ffff0000
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
  %c = and <16 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535>
  ret <16 x i16> %c
 }
@@ -180,13 +128,6 @@ define <2 x i32> @vls_sve_and_2xi32(<2 x i32> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: vls_sve_and_2xi32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov v0.s[0], wzr
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i32> %b, <i32 0, i32 4294967295>
  ret <2 x i32> %c
 }
@@ -200,12 +141,6 @@ define <4 x i32> @vls_sve_and_4xi32(<4 x i32> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: vls_sve_and_4xi32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0xffffffff00000000
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i32> %b, <i32 0, i32 4294967295, i32 0, i32 4294967295>
  ret <4 x i32> %c
 }
@@ -222,13 +157,6 @@ define <8 x i32> @vls_sve_and_8xi32(<8 x i32> %b) nounwind {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: vls_sve_and_8xi32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v2.2d, #0xffffffff00000000
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i32> %b, <i32 0, i32 4294967295, i32 0, i32 4294967295, i32 0, i32 4294967295, i32 0, i32 4294967295>
  ret <8 x i32> %c
 }
@@ -242,11 +170,6 @@ define <2 x i64> @vls_sve_and_2xi64(<2 x i64> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: vls_sve_and_2xi64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov v0.d[0], xzr
-; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i64> %b, <i64 0, i64 18446744073709551615>
  ret <2 x i64> %c
 }
@@ -262,12 +185,6 @@ define <4 x i64> @vls_sve_and_4xi64(<4 x i64> %b) nounwind {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: vls_sve_and_4xi64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov v0.d[0], xzr
-; NONEON-NOSVE-NEXT:    mov v1.d[0], xzr
-; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i64> %b, <i64 0, i64 18446744073709551615, i64 0, i64 18446744073709551615>
  ret <4 x i64> %c
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
index 8f0378252a54..d547f99a0230 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -19,16 +18,6 @@ define <4 x i8> @ctlz_v4i8(<4 x i8> %op) {
 ; CHECK-NEXT:    sub z0.h, z0.h, #8 // =0x8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctlz_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    mov w8, #8 // =0x8
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    sub v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
 }
@@ -41,11 +30,6 @@ define <8 x i8> @ctlz_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctlz_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
@@ -58,11 +42,6 @@ define <16 x i8> @ctlz_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctlz_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
@@ -76,14 +55,6 @@ define void @ctlz_v32i8(ptr %a) {
 ; CHECK-NEXT:    clz z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctlz_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    clz v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    clz v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op)
   store <32 x i8> %res, ptr %a
@@ -100,16 +71,6 @@ define <2 x i16> @ctlz_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    sub z0.s, z0.s, #16 // =0x10
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctlz_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    sub v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -122,11 +83,6 @@ define <4 x i16> @ctlz_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctlz_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -139,11 +95,6 @@ define <8 x i16> @ctlz_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctlz_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -157,14 +108,6 @@ define void @ctlz_v16i16(ptr %a) {
 ; CHECK-NEXT:    clz z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctlz_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    clz v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -179,11 +122,6 @@ define <2 x i32> @ctlz_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctlz_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -196,11 +134,6 @@ define <4 x i32> @ctlz_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctlz_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -214,14 +147,6 @@ define void @ctlz_v8i32(ptr %a) {
 ; CHECK-NEXT:    clz z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctlz_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    clz v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -236,27 +161,6 @@ define <1 x i64> @ctlz_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctlz_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #1
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #2
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #4
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #8
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #16
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #32
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    mvn v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
-; NONEON-NOSVE-NEXT:    uaddlp v0.1d, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -269,27 +173,6 @@ define <2 x i64> @ctlz_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctlz_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #1
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #2
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #4
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #8
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #16
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #32
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -303,46 +186,6 @@ define void @ctlz_v4i64(ptr %a) {
 ; CHECK-NEXT:    clz z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctlz_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #1
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #1
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #2
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #2
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #4
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #4
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #8
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #8
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #16
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #16
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #32
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #32
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mvn v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
-; NONEON-NOSVE-NEXT:    uaddlp v1.2d, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
@@ -362,14 +205,6 @@ define <4 x i8> @ctpop_v4i8(<4 x i8> %op) {
 ; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctpop_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
 }
@@ -382,11 +217,6 @@ define <8 x i8> @ctpop_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctpop_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
@@ -399,11 +229,6 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctpop_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
@@ -417,14 +242,6 @@ define void @ctpop_v32i8(ptr %a) {
 ; CHECK-NEXT:    cnt z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctpop_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op)
   store <32 x i8> %res, ptr %a
@@ -440,15 +257,6 @@ define <2 x i16> @ctpop_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctpop_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -461,12 +269,6 @@ define <4 x i16> @ctpop_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctpop_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -479,12 +281,6 @@ define <8 x i16> @ctpop_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctpop_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -498,16 +294,6 @@ define void @ctpop_v16i16(ptr %a) {
 ; CHECK-NEXT:    cnt z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctpop_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -522,13 +308,6 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctpop_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -541,13 +320,6 @@ define <4 x i32> @ctpop_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctpop_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -561,18 +333,6 @@ define void @ctpop_v8i32(ptr %a) {
 ; CHECK-NEXT:    cnt z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctpop_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -587,14 +347,6 @@ define <1 x i64> @ctpop_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctpop_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
-; NONEON-NOSVE-NEXT:    uaddlp v0.1d, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -607,14 +359,6 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctpop_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -628,20 +372,6 @@ define void @ctpop_v4i64(ptr %a) {
 ; CHECK-NEXT:    cnt z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ctpop_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
-; NONEON-NOSVE-NEXT:    uaddlp v1.2d, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
@@ -662,21 +392,6 @@ define <4 x i8> @cttz_v4i8(<4 x i8> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: cttz_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #256 // =0x100
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    sub v1.4h, v0.4h, v2.4h
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    sub v0.4h, v1.4h, v0.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
 }
@@ -690,14 +405,6 @@ define <8 x i8> @cttz_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: cttz_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.8b, #1
-; NONEON-NOSVE-NEXT:    sub v1.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
@@ -711,14 +418,6 @@ define <16 x i8> @cttz_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: cttz_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.16b, #1
-; NONEON-NOSVE-NEXT:    sub v1.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
@@ -734,19 +433,6 @@ define void @cttz_v32i8(ptr %a) {
 ; CHECK-NEXT:    clz z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: cttz_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #1
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v3.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sub v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op)
   store <32 x i8> %res, ptr %a
@@ -763,21 +449,6 @@ define <2 x i16> @cttz_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: cttz_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #65536 // =0x10000
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    sub v1.2s, v0.2s, v2.2s
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    sub v0.2s, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -791,18 +462,6 @@ define <4 x i16> @cttz_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: cttz_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    sub v1.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    sub v0.4h, v1.4h, v0.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -816,18 +475,6 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: cttz_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.8h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    sub v1.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    dup v1.8h, w8
-; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -843,24 +490,6 @@ define void @cttz_v16i16(ptr %a) {
 ; CHECK-NEXT:    clz z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: cttz_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    sub v3.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    dup v2.8h, w8
-; NONEON-NOSVE-NEXT:    clz v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v1.8h, v2.8h, v1.8h
-; NONEON-NOSVE-NEXT:    sub v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -876,18 +505,6 @@ define <2 x i32> @cttz_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: cttz_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
-; NONEON-NOSVE-NEXT:    sub v1.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    sub v0.2s, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -901,18 +518,6 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: cttz_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.4s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
-; NONEON-NOSVE-NEXT:    sub v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    dup v1.4s, w8
-; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -928,24 +533,6 @@ define void @cttz_v8i32(ptr %a) {
 ; CHECK-NEXT:    clz z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: cttz_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
-; NONEON-NOSVE-NEXT:    sub v3.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    dup v2.4s, w8
-; NONEON-NOSVE-NEXT:    clz v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v1.4s, v2.4s, v1.4s
-; NONEON-NOSVE-NEXT:    sub v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -961,18 +548,6 @@ define <1 x i64> @cttz_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: cttz_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    sub d1, d0, d1
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
-; NONEON-NOSVE-NEXT:    uaddlp v0.1d, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -986,18 +561,6 @@ define <2 x i64> @cttz_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: cttz_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.2d, x8
-; NONEON-NOSVE-NEXT:    sub v1.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -1013,26 +576,6 @@ define void @cttz_v4i64(ptr %a) {
 ; CHECK-NEXT:    clz z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: cttz_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    sub v3.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sub v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v1.2d, v1.4s
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index 64dc7ae117d3..e3cc74f766ee 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -12,12 +11,6 @@ define void @bitcast_v4i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitcast_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr w8, [x0]
-; NONEON-NOSVE-NEXT:    str w8, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <4 x i8>, ptr %a
   %cast = bitcast <4 x i8> %load to <4 x i8>
   store volatile <4 x i8> %cast, ptr %b
@@ -30,12 +23,6 @@ define void @bitcast_v8i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitcast_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <8 x i8>, ptr %a
   %cast = bitcast <8 x i8> %load to <8 x i8>
   store volatile <8 x i8> %cast, ptr %b
@@ -48,12 +35,6 @@ define void @bitcast_v16i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitcast_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <16 x i8>, ptr %a
   %cast = bitcast <16 x i8> %load to <16 x i8>
   store volatile <16 x i8> %cast, ptr %b
@@ -68,14 +49,6 @@ define void @bitcast_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x1, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitcast_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <32 x i8>, ptr %a
   %cast = bitcast <32 x i8> %load to <32 x i8>
   store volatile <32 x i8> %cast, ptr %b
@@ -99,16 +72,6 @@ define void @bitcast_v2i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str w8, [x1]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitcast_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    add x8, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    str s0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <2 x i16>, ptr %a
   %cast = bitcast <2 x i16> %load to <2 x half>
   store volatile <2 x half> %cast, ptr %b
@@ -121,12 +84,6 @@ define void @bitcast_v4i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitcast_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <4 x i16>, ptr %a
   %cast = bitcast <4 x i16> %load to <4 x half>
   store volatile <4 x half> %cast, ptr %b
@@ -139,12 +96,6 @@ define void @bitcast_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitcast_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <8 x i16>, ptr %a
   %cast = bitcast <8 x i16> %load to <8 x half>
   store volatile <8 x half> %cast, ptr %b
@@ -159,14 +110,6 @@ define void @bitcast_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x1, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitcast_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <16 x i16>, ptr %a
   %cast = bitcast <16 x i16> %load to <16 x half>
   store volatile <16 x half> %cast, ptr %b
@@ -179,12 +122,6 @@ define void @bitcast_v2i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitcast_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <2 x i32>, ptr %a
   %cast = bitcast <2 x i32> %load to <2 x float>
   store volatile <2 x float> %cast, ptr %b
@@ -197,12 +134,6 @@ define void @bitcast_v4i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitcast_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <4 x i32>, ptr %a
   %cast = bitcast <4 x i32> %load to <4 x float>
   store volatile <4 x float> %cast, ptr %b
@@ -217,14 +148,6 @@ define void @bitcast_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x1, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitcast_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <8 x i32>, ptr %a
   %cast = bitcast <8 x i32> %load to <8 x float>
   store volatile <8 x float> %cast, ptr %b
@@ -237,12 +160,6 @@ define void @bitcast_v1i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitcast_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <1 x i64>, ptr %a
   %cast = bitcast <1 x i64> %load to <1 x double>
   store volatile <1 x double> %cast, ptr %b
@@ -255,12 +172,6 @@ define void @bitcast_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitcast_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <2 x i64>, ptr %a
   %cast = bitcast <2 x i64> %load to <2 x double>
   store volatile <2 x double> %cast, ptr %b
@@ -275,14 +186,6 @@ define void @bitcast_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x1, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitcast_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <4 x i64>, ptr %a
   %cast = bitcast <4 x i64> %load to <4 x double>
   store volatile <4 x double> %cast, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
index 5e06cd62118d..74a4aab15597 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64"
 
@@ -31,17 +30,6 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fixed_bitselect_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x2]
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    neg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    ret
   %pre_cond = load <8 x i32>, ptr %pre_cond_ptr
   %left = load <8 x i32>, ptr %left_ptr
   %right = load <8 x i32>, ptr %right_ptr
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
index 7a24430a3385..0c490a662a79 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -11,12 +10,6 @@ define void @build_vector_7_inc1_v4i1(ptr %a) {
 ; CHECK-NEXT:    mov w8, #5 // =0x5
 ; CHECK-NEXT:    strb w8, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: build_vector_7_inc1_v4i1:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    strb w8, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <4 x i1> <i1 true, i1 false, i1 true, i1 false>, ptr %a, align 1
   ret void
 }
@@ -30,15 +23,6 @@ define void @build_vector_7_inc1_v32i8(ptr %a) {
 ; CHECK-NEXT:    add z1.b, z1.b, #23 // =0x17
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: build_vector_7_inc1_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI1_0
-; NONEON-NOSVE-NEXT:    adrp x9, .LCPI1_1
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI1_0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI1_1]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <32 x i8> <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38>, ptr %a, align 1
   ret void
 }
@@ -51,15 +35,6 @@ define void @build_vector_0_inc2_v16i16(ptr %a) {
 ; CHECK-NEXT:    add z0.h, z0.h, #16 // =0x10
 ; CHECK-NEXT:    str q0, [x0, #16]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: build_vector_0_inc2_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
-; NONEON-NOSVE-NEXT:    adrp x9, .LCPI2_1
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI2_1]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <16 x i16> <i16 0, i16 2, i16 4, i16 6, i16 8, i16 10, i16 12, i16 14, i16 16, i16 18, i16 20, i16 22, i16 24, i16 26, i16 28, i16 30>, ptr %a, align 2
   ret void
 }
@@ -73,15 +48,6 @@ define void @build_vector_0_dec3_v8i32(ptr %a) {
 ; CHECK-NEXT:    add z1.s, z0.s, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: build_vector_0_dec3_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
-; NONEON-NOSVE-NEXT:    adrp x9, .LCPI3_1
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI3_0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI3_1]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <8 x i32> <i32 0, i32 -3, i32 -6, i32 -9, i32 -12, i32 -15, i32 -18, i32 -21>, ptr %a, align 4
   ret void
 }
@@ -98,15 +64,6 @@ define void @build_vector_minus2_dec32_v4i64(ptr %a) {
 ; CHECK-NEXT:    add z0.d, z0.d, z2.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: build_vector_minus2_dec32_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI4_0
-; NONEON-NOSVE-NEXT:    adrp x9, .LCPI4_1
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI4_0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI4_1]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <4 x i64> <i64 -2, i64 -34, i64 -66, i64 -98>, ptr %a, align 8
   ret void
 }
@@ -119,15 +76,6 @@ define void @build_vector_no_stride_v4i64(ptr %a) {
 ; CHECK-NEXT:    index z1.d, #0, #4
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: build_vector_no_stride_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI5_0
-; NONEON-NOSVE-NEXT:    adrp x9, .LCPI5_1
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI5_0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI5_1]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <4 x i64> <i64 0, i64 4, i64 1, i64 8>, ptr %a, align 8
   ret void
 }
@@ -141,15 +89,6 @@ define void @build_vector_0_inc2_v16f16(ptr %a) {
 ; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI6_1]
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: build_vector_0_inc2_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI6_0
-; NONEON-NOSVE-NEXT:    adrp x9, .LCPI6_1
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI6_0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI6_1]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <16 x half> <half 0.0, half 2.0, half 4.0, half 6.0, half 8.0, half 10.0, half 12.0, half 14.0, half 16.0, half 18.0, half 20.0, half 22.0, half 24.0, half 26.0, half 28.0, half 30.0>, ptr %a, align 2
   ret void
 }
@@ -164,15 +103,6 @@ define void @build_vector_0_dec3_v8f32(ptr %a) {
 ; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI7_1]
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: build_vector_0_dec3_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI7_0
-; NONEON-NOSVE-NEXT:    adrp x9, .LCPI7_1
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI7_0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI7_1]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <8 x float> <float 0.0, float -3.0, float -6.0, float -9.0, float -12.0, float -15.0, float -18.0, float -21.0>, ptr %a, align 4
   ret void
 }
@@ -187,15 +117,6 @@ define void @build_vector_minus2_dec32_v4f64(ptr %a) {
 ; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI8_1]
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: build_vector_minus2_dec32_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
-; NONEON-NOSVE-NEXT:    adrp x9, .LCPI8_1
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI8_1]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <4 x double> <double -2.0, double -34.0, double -66.0, double -98.0>, ptr %a, align 8
   ret void
 }
@@ -210,15 +131,6 @@ define void @build_vector_no_stride_v4f64(ptr %a) {
 ; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI9_1]
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: build_vector_no_stride_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI9_0
-; NONEON-NOSVE-NEXT:    adrp x9, .LCPI9_1
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI9_0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI9_1]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <4 x double> <double 0.0, double 4.0, double 1.0, double 8.0>, ptr %a, align 8
   ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index ee997228e453..86494c4be501 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -41,11 +40,6 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2)  {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i8> %op1, <4 x i8> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i8> %res
 }
@@ -59,13 +53,6 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2)  {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i8> %res
@@ -78,13 +65,6 @@ define void @concat_v32i8(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i8>, ptr %a
   %op2 = load <16 x i8>, ptr %b
   %res = shufflevector <16 x i8> %op1, <16 x i8> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -103,14 +83,6 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v64i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = shufflevector <32 x i8> %op1, <32 x i8> %op2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -149,11 +121,6 @@ define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2)  {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x i16> %op1, <2 x i16> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i16> %res
 }
@@ -168,13 +135,6 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2)  {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
 }
@@ -186,13 +146,6 @@ define void @concat_v16i16(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %op2 = load <8 x i16>, ptr %b
   %res = shufflevector <8 x i16> %op1, <8 x i16> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -209,14 +162,6 @@ define void @concat_v32i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v32i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = shufflevector <16 x i16> %op1, <16 x i16> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -240,11 +185,6 @@ define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2)  {
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x i32> %op1, <1 x i32> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x i32> %res
 }
@@ -259,13 +199,6 @@ define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2)  {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %res
 }
@@ -277,13 +210,6 @@ define void @concat_v8i32(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %op2 = load <4 x i32>, ptr %b
   %res = shufflevector <4 x i32> %op1, <4 x i32> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -299,14 +225,6 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v16i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = shufflevector <8 x i32> %op1, <8 x i32> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -329,13 +247,6 @@ define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2)  {
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x i64> %op1, <1 x i64> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x i64> %res
 }
@@ -347,13 +258,6 @@ define void @concat_v4i64(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %res = shufflevector <2 x i64> %op1, <2 x i64> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -369,14 +273,6 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v8i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = shufflevector <4 x i64> %op1, <4 x i64> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -404,11 +300,6 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2)  {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x half> %res
 }
@@ -422,13 +313,6 @@ define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2)  {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x half> %res
 }
@@ -440,13 +324,6 @@ define void @concat_v16f16(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %op2 = load <8 x half>, ptr %b
   %res = shufflevector <8 x half> %op1, <8 x half> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -463,14 +340,6 @@ define void @concat_v32f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v32f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = shufflevector <16 x half> %op1, <16 x half> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -494,11 +363,6 @@ define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2)  {
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x float> %op1, <1 x float> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %res
 }
@@ -513,13 +377,6 @@ define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2)  {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %res
 }
@@ -531,13 +388,6 @@ define void @concat_v8f32(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %op2 = load <4 x float>, ptr %b
   %res = shufflevector <4 x float> %op1, <4 x float> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -553,14 +403,6 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v16f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = shufflevector <8 x float> %op1, <8 x float> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -583,13 +425,6 @@ define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2)  {
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x double> %op1, <1 x double> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x double> %res
 }
@@ -601,13 +436,6 @@ define void @concat_v4f64(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x double>, ptr %a
   %op2 = load <2 x double>, ptr %b
   %res = shufflevector <2 x double> %op1, <2 x double> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -623,14 +451,6 @@ define void @concat_v8f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v8f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = shufflevector <4 x double> %op1, <4 x double> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -648,12 +468,6 @@ define void @concat_v32i8_undef(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v32i8_undef:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i8>, ptr %a
   %res = shufflevector <16 x i8> %op1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
@@ -669,12 +483,6 @@ define void @concat_v16i16_undef(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v16i16_undef:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = shufflevector <8 x i16> %op1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -688,12 +496,6 @@ define void @concat_v8i32_undef(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v8i32_undef:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = shufflevector <4 x i32> %op1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i32> %res, ptr %b
@@ -706,12 +508,6 @@ define void @concat_v4i64_undef(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v4i64_undef:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %res = shufflevector <2 x i64> %op1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i64> %res, ptr %b
@@ -728,12 +524,6 @@ define void @concat_v32i8_4op(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v32i8_4op:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %shuffle = shufflevector <8 x i8> %op1, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -751,12 +541,6 @@ define void @concat_v16i16_4op(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v16i16_4op:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %shuffle = shufflevector <4 x i16> %op1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %res = shufflevector <8 x i16> %shuffle, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -771,12 +555,6 @@ define void @concat_v8i32_4op(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v8i32_4op:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i32>, ptr %a
   %shuffle = shufflevector <2 x i32> %op1, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res = shufflevector <4 x i32> %shuffle, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -790,12 +568,6 @@ define void @concat_v4i64_4op(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: concat_v4i64_4op:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x i64>, ptr %a
   %shuffle = shufflevector <1 x i64> %op1, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
   %res = shufflevector <2 x i64> %shuffle, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 42aa67fb2ab8..0aefba2d4c6a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -12,12 +11,6 @@ define <8 x i16> @load_zext_v8i8i16(ptr %ap)  {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_zext_v8i8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i8>, ptr %ap
   %val = zext <8 x i8> %a to <8 x i16>
   ret <8 x i16> %val
@@ -30,12 +23,6 @@ define <4 x i32> @load_zext_v4i16i32(ptr %ap)  {
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_zext_v4i16i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i16>, ptr %ap
   %val = zext <4 x i16> %a to <4 x i32>
   ret <4 x i32> %val
@@ -48,12 +35,6 @@ define <2 x i64> @load_zext_v2i32i64(ptr %ap) {
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_zext_v2i32i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i32>, ptr %ap
   %val = zext <2 x i32> %a to <2 x i64>
   ret <2 x i64> %val
@@ -73,19 +54,6 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) {
 ; CHECK-NEXT:    mov x7, xzr
 ; CHECK-NEXT:    fmov x4, d1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_zext_v2i64i256:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    mov x1, xzr
-; NONEON-NOSVE-NEXT:    mov x2, xzr
-; NONEON-NOSVE-NEXT:    mov x3, xzr
-; NONEON-NOSVE-NEXT:    mov x5, xzr
-; NONEON-NOSVE-NEXT:    mov x6, xzr
-; NONEON-NOSVE-NEXT:    mov x4, v0.d[1]
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    mov x7, xzr
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = zext <2 x i64> %a to <2 x i256>
   ret <2 x i256> %val
@@ -107,24 +75,6 @@ define <16 x i32> @load_sext_v16i8i32(ptr %ap)  {
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $z2
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $z3
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_sext_v16i8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v2.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i8>, ptr %ap
   %val = sext <16 x i8> %a to <16 x i32>
   ret <16 x i32> %val
@@ -140,17 +90,6 @@ define <8 x i32> @load_sext_v8i16i32(ptr %ap)  {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_sext_v8i16i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i16>, ptr %ap
   %val = sext <8 x i16> %a to <8 x i32>
   ret <8 x i32> %val
@@ -182,39 +121,6 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
 ; CHECK-NEXT:    stp x12, x12, [x8, #112]
 ; CHECK-NEXT:    stp x11, x12, [x8, #96]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_sext_v4i32i256:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    add x10, x8, #32
-; NONEON-NOSVE-NEXT:    add x11, x8, #96
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    mov x9, v0.d[1]
-; NONEON-NOSVE-NEXT:    st1 { v0.d }[1], [x10]
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    st1 { v1.d }[1], [x11]
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    asr x10, x10, #63
-; NONEON-NOSVE-NEXT:    str d0, [x8]
-; NONEON-NOSVE-NEXT:    asr x9, x9, #63
-; NONEON-NOSVE-NEXT:    str d1, [x8, #64]
-; NONEON-NOSVE-NEXT:    stp x10, x10, [x8, #16]
-; NONEON-NOSVE-NEXT:    stp x9, x9, [x8, #48]
-; NONEON-NOSVE-NEXT:    str x9, [x8, #40]
-; NONEON-NOSVE-NEXT:    fmov x9, d1
-; NONEON-NOSVE-NEXT:    str x10, [x8, #8]
-; NONEON-NOSVE-NEXT:    asr x10, x11, #63
-; NONEON-NOSVE-NEXT:    asr x9, x9, #63
-; NONEON-NOSVE-NEXT:    stp x10, x10, [x8, #112]
-; NONEON-NOSVE-NEXT:    str x10, [x8, #104]
-; NONEON-NOSVE-NEXT:    stp x9, x9, [x8, #80]
-; NONEON-NOSVE-NEXT:    str x9, [x8, #72]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = sext <4 x i32> %a to <4 x i256>
   ret <4 x i256> %val
@@ -248,22 +154,6 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) {
 ; CHECK-NEXT:    fmov x1, d6
 ; CHECK-NEXT:    fmov x5, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_sext_v2i64i256:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    dup v1.2d, v0.d[1]
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    asr x1, x0, #63
-; NONEON-NOSVE-NEXT:    asr x5, x8, #63
-; NONEON-NOSVE-NEXT:    mov x2, x1
-; NONEON-NOSVE-NEXT:    mov x3, x1
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x5
-; NONEON-NOSVE-NEXT:    mov x6, x5
-; NONEON-NOSVE-NEXT:    mov x7, x5
-; NONEON-NOSVE-NEXT:    fmov x4, d1
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = sext <2 x i64> %a to <2 x i256>
   ret <2 x i256> %val
@@ -297,34 +187,6 @@ define <16 x i64> @load_zext_v16i16i64(ptr %ap)  {
 ; CHECK-NEXT:    // kill: def $q6 killed $q6 killed $z6
 ; CHECK-NEXT:    // kill: def $q7 killed $q7 killed $z7
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_zext_v16i16i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v5.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
-; NONEON-NOSVE-NEXT:    stp q5, q3, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d16, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d17, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v6.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v16.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v7.2d, v17.2s, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %ap
   %val = zext <16 x i16> %a to <16 x i64>
   ret <16 x i64> %val
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index d050ddc77640..25ecd7a8d7e3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -28,11 +27,6 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v8i1:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip2 v0.8b, v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> %op, i64 4)
   ret <4 x i1> %ret
 }
@@ -60,11 +54,6 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip2 v0.8b, v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %op, i64 4)
   ret <4 x i8> %ret
 }
@@ -76,14 +65,6 @@ define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %ret = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %op, i64 8)
   ret <8 x i8> %ret
 }
@@ -94,12 +75,6 @@ define void @extract_subvector_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %ret = call <16 x i8> @llvm.vector.extract.v16i8.v32i8(<32 x i8> %op, i64 16)
   store <16 x i8> %ret, ptr %b
@@ -116,15 +91,6 @@ define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2)
   ret <2 x i16> %ret
 }
@@ -136,14 +102,6 @@ define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> %op, i64 4)
   ret <4 x i16> %ret
 }
@@ -154,12 +112,6 @@ define void @extract_subvector_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %ret = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> %op, i64 8)
   store <8 x i16> %ret, ptr %b
@@ -175,12 +127,6 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[1]
-; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32> %op, i64 1)
   ret <1 x i32> %ret
 }
@@ -192,14 +138,6 @@ define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %ret = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %op, i64 2)
   ret <2 x i32> %ret
 }
@@ -210,12 +148,6 @@ define void @extract_subvector_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %ret = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %op, i64 4)
   store <4 x i32> %ret, ptr %b
@@ -231,14 +163,6 @@ define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x i64> @llvm.vector.extract.v1i64.v2i64(<2 x i64> %op, i64 1)
   ret <1 x i64> %ret
 }
@@ -249,12 +173,6 @@ define void @extract_subvector_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %ret = call <2 x i64> @llvm.vector.extract.v2i64.v4i64(<4 x i64> %op, i64 2)
   store <2 x i64> %ret, ptr %b
@@ -272,12 +190,6 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[1]
-; NONEON-NOSVE-NEXT:    ret
   %ret = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %op, i64 2)
   ret <2 x half> %ret
 }
@@ -289,14 +201,6 @@ define <4 x half> @extract_subvector_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x half> @llvm.vector.extract.v4f16.v8f16(<8 x half> %op, i64 4)
   ret <4 x half> %ret
 }
@@ -307,12 +211,6 @@ define void @extract_subvector_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %ret = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> %op, i64 8)
   store <8 x half> %ret, ptr %b
@@ -328,12 +226,6 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[1]
-; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float> %op, i64 1)
   ret <1 x float> %ret
 }
@@ -345,14 +237,6 @@ define <2 x float> @extract_subvector_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %ret = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> %op, i64 2)
   ret <2 x float> %ret
 }
@@ -363,12 +247,6 @@ define void @extract_subvector_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %ret = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> %op, i64 4)
   store <4 x float> %ret, ptr %b
@@ -384,14 +262,6 @@ define <1 x double> @extract_subvector_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x double> @llvm.vector.extract.v1f64.v2f64(<2 x double> %op, i64 1)
   ret <1 x double> %ret
 }
@@ -402,12 +272,6 @@ define void @extract_subvector_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extract_subvector_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %ret = call <2 x double> @llvm.vector.extract.v2f64.v4f64(<4 x double> %op, i64 2)
   store <2 x double> %ret, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
index b2cf818e6e3c..a752e119b2fb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -16,12 +15,6 @@ define half @extractelement_v2f16(<2 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extractelement_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[1]
-; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x half> %op1, i64 1
   ret half %r
 }
@@ -33,12 +26,6 @@ define half @extractelement_v4f16(<4 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, z0.h[3]
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extractelement_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <4 x half> %op1, i64 3
   ret half %r
 }
@@ -50,11 +37,6 @@ define half @extractelement_v8f16(<8 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, z0.h[7]
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extractelement_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <8 x half> %op1, i64 7
   ret half %r
 }
@@ -66,11 +48,6 @@ define half @extractelement_v16f16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, z0.h[7]
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extractelement_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr h0, [x0, #30]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %r = extractelement <16 x half> %op1, i64 15
   ret half %r
@@ -83,12 +60,6 @@ define float @extractelement_v2f32(<2 x float> %op1) {
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extractelement_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov s0, v0.s[1]
-; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x float> %op1, i64 1
   ret float %r
 }
@@ -100,11 +71,6 @@ define float @extractelement_v4f32(<4 x float> %op1) {
 ; CHECK-NEXT:    mov z0.s, z0.s[3]
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extractelement_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov s0, v0.s[3]
-; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <4 x float> %op1, i64 3
   ret float %r
 }
@@ -116,11 +82,6 @@ define float @extractelement_v8f32(ptr %a) {
 ; CHECK-NEXT:    mov z0.s, z0.s[3]
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extractelement_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0, #28]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %r = extractelement <8 x float> %op1, i64 7
   ret float %r
@@ -130,10 +91,6 @@ define double @extractelement_v1f64(<1 x double> %op1) {
 ; CHECK-LABEL: extractelement_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extractelement_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <1 x double> %op1, i64 0
   ret double %r
 }
@@ -144,11 +101,6 @@ define double @extractelement_v2f64(<2 x double> %op1) {
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extractelement_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov d0, v0.d[1]
-; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x double> %op1, i64 1
   ret double %r
 }
@@ -160,11 +112,6 @@ define double @extractelement_v4f64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extractelement_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0, #24]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %r = extractelement <4 x double> %op1, i64 3
   ret double %r
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index bed5dd53c519..0d6675def8b5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -2,7 +2,6 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
@@ -29,16 +28,6 @@ define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str d1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
-; NONEON-NOSVE-NEXT:    ldr d1, [x0]
-; NONEON-NOSVE-NEXT:    ldr d2, [x1]
-; NONEON-NOSVE-NEXT:    dup v0.4h, w8
-; NONEON-NOSVE-NEXT:    bsl v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x half>, ptr %bp
   %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b)
@@ -65,16 +54,6 @@ define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x half>, ptr %ap
   %b = load <8 x half>, ptr %bp
   %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b)
@@ -105,17 +84,6 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
 ; SVE2-NEXT:    stp q2, q3, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v16f16_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
-; NONEON-NOSVE-NEXT:    ldp q1, q4, [x1]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x half>, ptr %ap
   %b = load <16 x half>, ptr %bp
   %r = call <16 x half> @llvm.copysign.v16f16(<16 x half> %a, <16 x half> %b)
@@ -144,16 +112,6 @@ define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str d1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d0, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr d1, [x0]
-; NONEON-NOSVE-NEXT:    ldr d2, [x1]
-; NONEON-NOSVE-NEXT:    fneg v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    bsl v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x float>, ptr %ap
   %b = load <2 x float>, ptr %bp
   %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b)
@@ -180,16 +138,6 @@ define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1]
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x float>, ptr %ap
   %b = load <4 x float>, ptr %bp
   %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b)
@@ -220,17 +168,6 @@ define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
 ; SVE2-NEXT:    stp q2, q3, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v8f32_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldp q1, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x float>, ptr %ap
   %b = load <8 x float>, ptr %bp
   %r = call <8 x float> @llvm.copysign.v8f32(<8 x float> %a, <8 x float> %b)
@@ -259,16 +196,6 @@ define void @test_copysign_v2f64_v2f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1]
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x double>, ptr %ap
   %b = load <2 x double>, ptr %bp
   %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b)
@@ -299,17 +226,6 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
 ; SVE2-NEXT:    stp q2, q3, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldp q1, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x double>, ptr %ap
   %b = load <4 x double>, ptr %bp
   %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b)
@@ -344,17 +260,6 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str d2, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d0, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    ldr d2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v1.2s, v1.2d
-; NONEON-NOSVE-NEXT:    fneg v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    bsl v0.8b, v2.8b, v1.8b
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x float>, ptr %ap
   %b = load <2 x double>, ptr %bp
   %tmp0 = fptrunc <2 x double> %b to <2 x float>
@@ -399,18 +304,6 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str q2, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    fcvtn v1.2s, v1.2d
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.4s, v2.2d
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x float>, ptr %ap
   %b = load <4 x double>, ptr %bp
   %tmp0 = fptrunc <4 x double> %b to <4 x float>
@@ -444,17 +337,6 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str q2, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x double>, ptr %ap
   %b = load < 2 x float>, ptr %bp
   %tmp0 = fpext <2 x float> %b to <2 x double>
@@ -499,23 +381,6 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z4.d, z4.d, z1.d, z2.d
 ; SVE2-NEXT:    stp q3, q4, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtl v4.2d, v4.2s
-; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x double>, ptr %ap
   %b = load <4 x float>, ptr %bp
   %tmp0 = fpext <4 x float> %b to <4 x double>
@@ -551,17 +416,6 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str d2, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
-; NONEON-NOSVE-NEXT:    ldr d2, [x0]
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    bit v0.8b, v2.8b, v1.8b
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x float>, ptr %bp
   %tmp0 = fptrunc <4 x float> %b to <4 x half>
@@ -603,19 +457,6 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str d2, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
-; NONEON-NOSVE-NEXT:    ldr d2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtxn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtxn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    bit v0.8b, v2.8b, v1.8b
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x double>, ptr %bp
   %tmp0 = fptrunc <4 x double> %b to <4 x half>
@@ -659,18 +500,6 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str q2, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    dup v1.8h, w8
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x half>, ptr %ap
   %b = load <8 x float>, ptr %bp
   %tmp0 = fptrunc <8 x float> %b to <8 x half>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
index 662a8f2b55fd..c2d6ed4e9ccf 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,14 +17,6 @@ define <2 x half> @fadd_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x half> %op1, %op2
   ret <2 x half> %res
 }
@@ -39,14 +30,6 @@ define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fadd <4 x half> %op1, %op2
   ret <4 x half> %res
 }
@@ -60,18 +43,6 @@ define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fadd <8 x half> %op1, %op2
   ret <8 x half> %res
 }
@@ -87,29 +58,6 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fadd v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fadd v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fadd <16 x half> %op1, %op2
@@ -126,11 +74,6 @@ define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fadd v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x float> %op1, %op2
   ret <2 x float> %res
 }
@@ -144,11 +87,6 @@ define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fadd <4 x float> %op1, %op2
   ret <4 x float> %res
 }
@@ -164,15 +102,6 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fadd <8 x float> %op1, %op2
@@ -189,11 +118,6 @@ define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x double> %op1, %op2
   ret <2 x double> %res
 }
@@ -209,15 +133,6 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fadd v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fadd <4 x double> %op1, %op2
@@ -238,14 +153,6 @@ define <2 x half> @fdiv_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fdiv_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x half> %op1, %op2
   ret <2 x half> %res
 }
@@ -259,14 +166,6 @@ define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fdiv_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <4 x half> %op1, %op2
   ret <4 x half> %res
 }
@@ -280,18 +179,6 @@ define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fdiv_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fdiv v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fdiv v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <8 x half> %op1, %op2
   ret <8 x half> %res
 }
@@ -307,30 +194,6 @@ define void @fdiv_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fdiv z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fdiv_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q4, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v5.4s, v4.8h
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v4.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fdiv v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ldr q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl2 v6.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fdiv v3.4s, v3.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fdiv v5.4s, v6.4s, v5.4s
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fdiv <16 x half> %op1, %op2
@@ -347,11 +210,6 @@ define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fdiv_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fdiv v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x float> %op1, %op2
   ret <2 x float> %res
 }
@@ -365,11 +223,6 @@ define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fdiv_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <4 x float> %op1, %op2
   ret <4 x float> %res
 }
@@ -385,15 +238,6 @@ define void @fdiv_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fdiv z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fdiv_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fdiv v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fdiv <8 x float> %op1, %op2
@@ -410,11 +254,6 @@ define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fdiv_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fdiv v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x double> %op1, %op2
   ret <2 x double> %res
 }
@@ -430,15 +269,6 @@ define void @fdiv_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fdiv z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fdiv_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fdiv v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fdiv v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fdiv <4 x double> %op1, %op2
@@ -460,46 +290,6 @@ define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d2 killed $d2 def $q2
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    fcvt s16, h0
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fmadd s6, s16, s7, s6
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s7, h19
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmadd s3, s5, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt s4, h17
-; NONEON-NOSVE-NEXT:    fcvt s5, h18
-; NONEON-NOSVE-NEXT:    fcvt h0, s6
-; NONEON-NOSVE-NEXT:    fmadd s4, s7, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h16
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fmadd s1, s5, s1, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3)
   ret <2 x half> %res
 }
@@ -514,46 +304,6 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d2 killed $d2 def $q2
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    fcvt s16, h0
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fmadd s6, s16, s7, s6
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s7, h19
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmadd s3, s5, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt s4, h17
-; NONEON-NOSVE-NEXT:    fcvt s5, h18
-; NONEON-NOSVE-NEXT:    fcvt h0, s6
-; NONEON-NOSVE-NEXT:    fmadd s4, s7, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h16
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fmadd s1, s5, s1, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
   ret <4 x half> %res
 }
@@ -568,79 +318,6 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    fcvt s16, h0
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fmadd s6, s16, s7, s6
-; NONEON-NOSVE-NEXT:    fcvt s7, h17
-; NONEON-NOSVE-NEXT:    fcvt s16, h18
-; NONEON-NOSVE-NEXT:    fcvt s17, h19
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmadd s4, s5, s4, s3
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fmadd s6, s17, s16, s7
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s7, h18
-; NONEON-NOSVE-NEXT:    fcvt s16, h19
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov v3.h[1], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    fmadd s5, s16, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    mov v3.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fmadd s17, s19, s18, s17
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmadd s4, s16, s7, s4
-; NONEON-NOSVE-NEXT:    mov v3.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h18
-; NONEON-NOSVE-NEXT:    fcvt s7, h19
-; NONEON-NOSVE-NEXT:    fcvt h16, s17
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fmadd s5, s7, s6, s5
-; NONEON-NOSVE-NEXT:    mov v3.h[4], v16.h[0]
-; NONEON-NOSVE-NEXT:    fmadd s0, s0, s1, s2
-; NONEON-NOSVE-NEXT:    mov v3.h[5], v4.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h4, s5
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v3.h[6], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov v3.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
   ret <8 x half> %res
 }
@@ -657,150 +334,6 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q3, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q4, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q5, q2, [x2]
-; NONEON-NOSVE-NEXT:    mov h25, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s18, h1
-; NONEON-NOSVE-NEXT:    mov h22, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    mov h20, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h26, v5.h[1]
-; NONEON-NOSVE-NEXT:    mov h27, v4.h[1]
-; NONEON-NOSVE-NEXT:    mov h28, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s25, h25
-; NONEON-NOSVE-NEXT:    mov h7, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h29, v4.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s23, h17
-; NONEON-NOSVE-NEXT:    mov h17, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h30, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s21, h16
-; NONEON-NOSVE-NEXT:    fmadd s6, s19, s18, s6
-; NONEON-NOSVE-NEXT:    fcvt s18, h20
-; NONEON-NOSVE-NEXT:    fcvt s19, h22
-; NONEON-NOSVE-NEXT:    fcvt s20, h24
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s22, h5
-; NONEON-NOSVE-NEXT:    fcvt s24, h4
-; NONEON-NOSVE-NEXT:    fcvt s26, h26
-; NONEON-NOSVE-NEXT:    fcvt s27, h27
-; NONEON-NOSVE-NEXT:    fcvt s28, h28
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fmadd s21, s25, s23, s21
-; NONEON-NOSVE-NEXT:    fcvt s23, h3
-; NONEON-NOSVE-NEXT:    mov h25, v5.h[2]
-; NONEON-NOSVE-NEXT:    fmadd s18, s20, s19, s18
-; NONEON-NOSVE-NEXT:    mov h19, v3.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    mov h31, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmadd s26, s28, s27, s26
-; NONEON-NOSVE-NEXT:    mov h27, v4.h[3]
-; NONEON-NOSVE-NEXT:    mov h28, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmadd s22, s23, s24, s22
-; NONEON-NOSVE-NEXT:    fcvt h20, s21
-; NONEON-NOSVE-NEXT:    mov h21, v2.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt s24, h29
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fmadd s16, s17, s16, s7
-; NONEON-NOSVE-NEXT:    mov h25, v5.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt h26, s26
-; NONEON-NOSVE-NEXT:    mov h29, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov v6.h[1], v20.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s17, h21
-; NONEON-NOSVE-NEXT:    fcvt s20, h30
-; NONEON-NOSVE-NEXT:    fmadd s19, s19, s24, s23
-; NONEON-NOSVE-NEXT:    fcvt s21, h31
-; NONEON-NOSVE-NEXT:    fcvt h7, s22
-; NONEON-NOSVE-NEXT:    fcvt s22, h25
-; NONEON-NOSVE-NEXT:    fcvt s23, h27
-; NONEON-NOSVE-NEXT:    fcvt s24, h28
-; NONEON-NOSVE-NEXT:    mov h25, v5.h[4]
-; NONEON-NOSVE-NEXT:    mov h27, v4.h[4]
-; NONEON-NOSVE-NEXT:    mov h28, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov h30, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h31, v0.h[5]
-; NONEON-NOSVE-NEXT:    mov v6.h[2], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmadd s17, s21, s20, s17
-; NONEON-NOSVE-NEXT:    mov v7.h[1], v26.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h18, s19
-; NONEON-NOSVE-NEXT:    fmadd s19, s24, s23, s22
-; NONEON-NOSVE-NEXT:    mov h26, v5.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s20, h25
-; NONEON-NOSVE-NEXT:    fcvt s21, h27
-; NONEON-NOSVE-NEXT:    fcvt s22, h28
-; NONEON-NOSVE-NEXT:    mov h27, v4.h[5]
-; NONEON-NOSVE-NEXT:    mov h28, v3.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s23, h29
-; NONEON-NOSVE-NEXT:    fcvt s24, h30
-; NONEON-NOSVE-NEXT:    fcvt s25, h31
-; NONEON-NOSVE-NEXT:    mov h29, v2.h[6]
-; NONEON-NOSVE-NEXT:    mov h30, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h31, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov v7.h[2], v18.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h18, s19
-; NONEON-NOSVE-NEXT:    fmadd s19, s22, s21, s20
-; NONEON-NOSVE-NEXT:    mov h20, v5.h[6]
-; NONEON-NOSVE-NEXT:    mov h21, v4.h[6]
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s26, h26
-; NONEON-NOSVE-NEXT:    fmadd s23, s25, s24, s23
-; NONEON-NOSVE-NEXT:    fcvt s27, h27
-; NONEON-NOSVE-NEXT:    fcvt s28, h28
-; NONEON-NOSVE-NEXT:    mov v6.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s17
-; NONEON-NOSVE-NEXT:    fcvt s17, h29
-; NONEON-NOSVE-NEXT:    fcvt s24, h30
-; NONEON-NOSVE-NEXT:    fcvt s25, h31
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    fcvt s21, h21
-; NONEON-NOSVE-NEXT:    fcvt s22, h22
-; NONEON-NOSVE-NEXT:    mov v7.h[3], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmadd s26, s28, s27, s26
-; NONEON-NOSVE-NEXT:    fcvt h18, s19
-; NONEON-NOSVE-NEXT:    mov h5, v5.h[7]
-; NONEON-NOSVE-NEXT:    mov h4, v4.h[7]
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    fmadd s17, s25, s24, s17
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmadd s19, s22, s21, s20
-; NONEON-NOSVE-NEXT:    mov v6.h[4], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s23
-; NONEON-NOSVE-NEXT:    mov v7.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h18, s26
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v6.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    mov v7.h[5], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmadd s3, s3, s4, s5
-; NONEON-NOSVE-NEXT:    fcvt h4, s19
-; NONEON-NOSVE-NEXT:    fcvt h5, s17
-; NONEON-NOSVE-NEXT:    fmadd s0, s0, s1, s2
-; NONEON-NOSVE-NEXT:    mov v7.h[6], v4.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s3
-; NONEON-NOSVE-NEXT:    mov v6.h[6], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v6.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q7, q6, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %op3 = load <16 x half>, ptr %c
@@ -819,12 +352,6 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.2s, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov d0, d2
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3)
   ret <2 x float> %res
 }
@@ -839,12 +366,6 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3)
   ret <4 x float> %res
 }
@@ -861,16 +382,6 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fmla v1.4s, v0.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fmla v5.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %op3 = load <8 x float>, ptr %c
@@ -889,12 +400,6 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
 ; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3)
   ret <2 x double> %res
 }
@@ -911,16 +416,6 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fmla v1.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fmla v5.2d, v4.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %op3 = load <4 x double>, ptr %c
@@ -942,14 +437,6 @@ define <2 x half> @fmul_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmul_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x half> %op1, %op2
   ret <2 x half> %res
 }
@@ -963,14 +450,6 @@ define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmul_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fmul <4 x half> %op1, %op2
   ret <4 x half> %res
 }
@@ -984,18 +463,6 @@ define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmul_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fmul v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fmul v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fmul <8 x half> %op1, %op2
   ret <8 x half> %res
 }
@@ -1011,29 +478,6 @@ define void @fmul_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmul_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fmul v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fmul v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmul v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fmul <16 x half> %op1, %op2
@@ -1050,11 +494,6 @@ define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmul_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmul v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x float> %op1, %op2
   ret <2 x float> %res
 }
@@ -1068,11 +507,6 @@ define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmul_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fmul <4 x float> %op1, %op2
   ret <4 x float> %res
 }
@@ -1088,15 +522,6 @@ define void @fmul_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmul_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmul v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fmul <8 x float> %op1, %op2
@@ -1113,11 +538,6 @@ define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmul_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmul v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x double> %op1, %op2
   ret <2 x double> %res
 }
@@ -1133,15 +553,6 @@ define void @fmul_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmul z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmul_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmul v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmul v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fmul <4 x double> %op1, %op2
@@ -1161,12 +572,6 @@ define <2 x half> @fneg_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fneg_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.4h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x half> %op
   ret <2 x half> %res
 }
@@ -1179,12 +584,6 @@ define <4 x half> @fneg_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fneg_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.4h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = fneg <4 x half> %op
   ret <4 x half> %res
 }
@@ -1197,12 +596,6 @@ define <8 x half> @fneg_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fneg_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.8h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = fneg <8 x half> %op
   ret <8 x half> %res
 }
@@ -1216,15 +609,6 @@ define void @fneg_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fneg z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fneg_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.8h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = fneg <16 x half> %op
   store <16 x half> %res, ptr %a
@@ -1239,11 +623,6 @@ define <2 x float> @fneg_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fneg_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fneg v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x float> %op
   ret <2 x float> %res
 }
@@ -1256,11 +635,6 @@ define <4 x float> @fneg_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fneg_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fneg <4 x float> %op
   ret <4 x float> %res
 }
@@ -1274,14 +648,6 @@ define void @fneg_v8f32(ptr %a) {
 ; CHECK-NEXT:    fneg z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fneg_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fneg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = fneg <8 x float> %op
   store <8 x float> %res, ptr %a
@@ -1296,11 +662,6 @@ define <2 x double> @fneg_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    fneg z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fneg_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x double> %op
   ret <2 x double> %res
 }
@@ -1314,14 +675,6 @@ define void @fneg_v4f64(ptr %a) {
 ; CHECK-NEXT:    fneg z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fneg_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fneg v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = fneg <4 x double> %op
   store <4 x double> %res, ptr %a
@@ -1340,30 +693,6 @@ define <2 x half> @fsqrt_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsqrt_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fsqrt s2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fsqrt s1, s1
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fsqrt s3, s3
-; NONEON-NOSVE-NEXT:    fsqrt s4, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s3
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s4
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -1376,30 +705,6 @@ define <4 x half> @fsqrt_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsqrt_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fsqrt s2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fsqrt s1, s1
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fsqrt s3, s3
-; NONEON-NOSVE-NEXT:    fsqrt s4, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s3
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s4
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -1412,48 +717,6 @@ define <8 x half> @fsqrt_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsqrt_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fsqrt s2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h0
-; NONEON-NOSVE-NEXT:    fcvt h0, s2
-; NONEON-NOSVE-NEXT:    fsqrt s1, s1
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s3, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s3
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s4, s4
-; NONEON-NOSVE-NEXT:    fcvt h1, s4
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s5, s5
-; NONEON-NOSVE-NEXT:    fcvt h1, s5
-; NONEON-NOSVE-NEXT:    mov v0.h[4], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s6, s6
-; NONEON-NOSVE-NEXT:    fcvt h1, s6
-; NONEON-NOSVE-NEXT:    mov v0.h[5], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s7, s7
-; NONEON-NOSVE-NEXT:    fcvt h1, s7
-; NONEON-NOSVE-NEXT:    mov v0.h[6], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s2, s16
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -1467,89 +730,6 @@ define void @fsqrt_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fsqrt z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsqrt_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q16, [x0]
-; NONEON-NOSVE-NEXT:    mov h0, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h17, v16.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s18, h16
-; NONEON-NOSVE-NEXT:    mov h19, v16.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h20, v16.h[3]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h21, v16.h[4]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h22, v16.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fsqrt s2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s21, h21
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s22, h22
-; NONEON-NOSVE-NEXT:    mov h23, v16.h[6]
-; NONEON-NOSVE-NEXT:    mov h16, v16.h[7]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s23, h23
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fsqrt s0, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s17, s17
-; NONEON-NOSVE-NEXT:    fcvt h17, s17
-; NONEON-NOSVE-NEXT:    fsqrt s18, s18
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    mov v18.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s3, s3
-; NONEON-NOSVE-NEXT:    fcvt h0, s3
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s19, s19
-; NONEON-NOSVE-NEXT:    fcvt h17, s19
-; NONEON-NOSVE-NEXT:    mov v18.h[2], v17.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s4, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s20, s20
-; NONEON-NOSVE-NEXT:    fcvt h3, s20
-; NONEON-NOSVE-NEXT:    mov v18.h[3], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s5, s5
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s21, s21
-; NONEON-NOSVE-NEXT:    fcvt h3, s21
-; NONEON-NOSVE-NEXT:    mov v18.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s6, s6
-; NONEON-NOSVE-NEXT:    fcvt h0, s6
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s22, s22
-; NONEON-NOSVE-NEXT:    fcvt h3, s22
-; NONEON-NOSVE-NEXT:    mov v18.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s7, s7
-; NONEON-NOSVE-NEXT:    fcvt h0, s7
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s23, s23
-; NONEON-NOSVE-NEXT:    fcvt h3, s23
-; NONEON-NOSVE-NEXT:    mov v18.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s16, s16
-; NONEON-NOSVE-NEXT:    fcvt h3, s16
-; NONEON-NOSVE-NEXT:    mov v18.h[7], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s1, s1
-; NONEON-NOSVE-NEXT:    fcvt h0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q18, q2, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -1564,11 +744,6 @@ define <2 x float> @fsqrt_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsqrt_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsqrt v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -1581,11 +756,6 @@ define <4 x float> @fsqrt_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsqrt_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsqrt v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -1599,14 +769,6 @@ define void @fsqrt_v8f32(ptr %a) {
 ; CHECK-NEXT:    fsqrt z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsqrt_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fsqrt v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fsqrt v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -1621,11 +783,6 @@ define <2 x double> @fsqrt_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    fsqrt z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsqrt_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsqrt v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -1639,14 +796,6 @@ define void @fsqrt_v4f64(ptr %a) {
 ; CHECK-NEXT:    fsqrt z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsqrt_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fsqrt v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fsqrt v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -1666,14 +815,6 @@ define <2 x half> @fsub_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsub_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x half> %op1, %op2
   ret <2 x half> %res
 }
@@ -1687,14 +828,6 @@ define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsub_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fsub <4 x half> %op1, %op2
   ret <4 x half> %res
 }
@@ -1708,18 +841,6 @@ define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsub_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fsub v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fsub v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fsub <8 x half> %op1, %op2
   ret <8 x half> %res
 }
@@ -1735,29 +856,6 @@ define void @fsub_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fsub z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsub_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fsub v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fsub v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fsub v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fsub <16 x half> %op1, %op2
@@ -1774,11 +872,6 @@ define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsub_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsub v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x float> %op1, %op2
   ret <2 x float> %res
 }
@@ -1792,11 +885,6 @@ define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsub_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fsub <4 x float> %op1, %op2
   ret <4 x float> %res
 }
@@ -1812,15 +900,6 @@ define void @fsub_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fsub z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsub_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fsub v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fsub <8 x float> %op1, %op2
@@ -1837,11 +916,6 @@ define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsub_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsub v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x double> %op1, %op2
   ret <2 x double> %res
 }
@@ -1857,15 +931,6 @@ define void @fsub_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fsub z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fsub_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fsub v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fsub v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fsub <4 x double> %op1, %op2
@@ -1885,11 +950,6 @@ define <2 x half> @fabs_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fabs_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    bic v0.4h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.fabs.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -1902,11 +962,6 @@ define <4 x half> @fabs_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fabs_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    bic v0.4h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -1919,11 +974,6 @@ define <8 x half> @fabs_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fabs_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    bic v0.8h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -1937,14 +987,6 @@ define void @fabs_v16f16(ptr %a) {
 ; CHECK-NEXT:    fabs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fabs_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    bic v0.8h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    bic v1.8h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -1959,11 +1001,6 @@ define <2 x float> @fabs_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fabs_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fabs v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -1976,11 +1013,6 @@ define <4 x float> @fabs_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fabs_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fabs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -1994,14 +1026,6 @@ define void @fabs_v8f32(ptr %a) {
 ; CHECK-NEXT:    fabs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fabs_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fabs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fabs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -2016,11 +1040,6 @@ define <2 x double> @fabs_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    fabs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fabs_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fabs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -2034,14 +1053,6 @@ define void @fabs_v4f64(ptr %a) {
 ; CHECK-NEXT:    fabs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fabs_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fabs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fabs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
index d4810c78cb53..465cc179a3b9 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -20,14 +19,6 @@ define <2 x i16> @fcmp_oeq_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_oeq_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x half> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i16>
   ret <2 x i16> %sext
@@ -43,14 +34,6 @@ define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_oeq_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <4 x half> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i16>
   ret <4 x i16> %sext
@@ -66,65 +49,6 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_oeq_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcmp s3, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[4]
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    fcmp s2, s5
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <8 x half> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i16>
   ret <8 x i16> %sext
@@ -142,123 +66,6 @@ define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_oeq_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, eq
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, eq
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp oeq <16 x half> %op1, %op2
@@ -277,11 +84,6 @@ define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_oeq_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcmeq v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x float> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i32>
   ret <2 x i32> %sext
@@ -297,11 +99,6 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_oeq_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <4 x float> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i32>
   ret <4 x i32> %sext
@@ -319,15 +116,6 @@ define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_oeq_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %cmp = fcmp oeq <8 x float> %op1, %op2
@@ -344,11 +132,6 @@ define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK-NEXT:    mov z0.d, x8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_oeq_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcmeq d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <1 x double> %op1, %op2
   %sext = sext <1 x i1> %cmp to <1 x i64>
   ret <1 x i64> %sext
@@ -364,11 +147,6 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_oeq_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcmeq v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x double> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i64>
   ret <2 x i64> %sext
@@ -386,15 +164,6 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_oeq_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcmeq v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcmeq v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %cmp = fcmp oeq <4 x double> %op1, %op2
@@ -423,139 +192,6 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_ueq_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h2
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    csinv w12, w9, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s5
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    csinv w10, w9, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    csinv w11, w9, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s6, h16
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    csinv w9, w9, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s5
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w13, eq
-; NONEON-NOSVE-NEXT:    csinv w13, w13, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    csinv w14, w14, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s4, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w15, eq
-; NONEON-NOSVE-NEXT:    csinv w15, w15, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s3
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w16, eq
-; NONEON-NOSVE-NEXT:    csinv w16, w16, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s4, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h3
-; NONEON-NOSVE-NEXT:    fmov s2, w12
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w17, eq
-; NONEON-NOSVE-NEXT:    csinv w17, w17, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmov s3, w17
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v3.h[1], w16
-; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v3.h[2], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v3.h[3], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w9
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v3.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w13
-; NONEON-NOSVE-NEXT:    mov v3.h[5], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s1, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w14
-; NONEON-NOSVE-NEXT:    mov v3.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w15
-; NONEON-NOSVE-NEXT:    mov v3.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ueq <16 x half> %op1, %op2
@@ -584,139 +220,6 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_one_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h2
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    csinv w12, w9, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s5
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    csinv w10, w9, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    csinv w11, w9, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s6, h16
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    csinv w9, w9, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s5
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w13, mi
-; NONEON-NOSVE-NEXT:    csinv w13, w13, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
-; NONEON-NOSVE-NEXT:    csetm w14, mi
-; NONEON-NOSVE-NEXT:    csinv w14, w14, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s4, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w15, mi
-; NONEON-NOSVE-NEXT:    csinv w15, w15, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s3
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w16, mi
-; NONEON-NOSVE-NEXT:    csinv w16, w16, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s4, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h3
-; NONEON-NOSVE-NEXT:    fmov s2, w12
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w17, mi
-; NONEON-NOSVE-NEXT:    csinv w17, w17, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmov s3, w17
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    mov v3.h[1], w16
-; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v3.h[2], w8
-; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v3.h[3], w8
-; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w9
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v3.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w13
-; NONEON-NOSVE-NEXT:    mov v3.h[5], w8
-; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s1, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w14
-; NONEON-NOSVE-NEXT:    mov v3.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w15
-; NONEON-NOSVE-NEXT:    mov v3.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp one <16 x half> %op1, %op2
@@ -741,123 +244,6 @@ define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_une_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ne
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ne
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ne
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp une <16 x half> %op1, %op2
@@ -882,123 +268,6 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_ogt_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, gt
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, gt
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, gt
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ogt <16 x half> %op1, %op2
@@ -1026,123 +295,6 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_ugt_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, hi
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, hi
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, hi
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, hi
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, hi
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, hi
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, hi
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, hi
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, hi
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ugt <16 x half> %op1, %op2
@@ -1167,123 +319,6 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_olt_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, mi
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, mi
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, mi
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, mi
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, mi
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, mi
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, mi
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, mi
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp olt <16 x half> %op1, %op2
@@ -1311,123 +346,6 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_ult_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, lt
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, lt
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, lt
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ult <16 x half> %op1, %op2
@@ -1452,123 +370,6 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_oge_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ge
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ge
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ge
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp oge <16 x half> %op1, %op2
@@ -1596,123 +397,6 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_uge_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, pl
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, pl
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, pl
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, pl
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, pl
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, pl
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, pl
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, pl
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, pl
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp uge <16 x half> %op1, %op2
@@ -1737,123 +421,6 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_ole_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ls
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ls
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ls
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ls
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ls
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ls
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ls
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ls
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ls
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ole <16 x half> %op1, %op2
@@ -1881,123 +448,6 @@ define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_ule_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, le
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, le
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, le
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ule <16 x half> %op1, %op2
@@ -2022,123 +472,6 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_uno_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, vs
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, vs
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, vs
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, vs
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, vs
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, vs
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, vs
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, vs
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, vs
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp uno <16 x half> %op1, %op2
@@ -2166,123 +499,6 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_ord_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, vc
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, vc
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, vc
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ord <16 x half> %op1, %op2
@@ -2307,123 +523,6 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_eq_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, eq
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, eq
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast oeq <16 x half> %op1, %op2
@@ -2448,123 +547,6 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_ne_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ne
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ne
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ne
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast one <16 x half> %op1, %op2
@@ -2589,123 +571,6 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_gt_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, gt
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, gt
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, gt
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast ogt <16 x half> %op1, %op2
@@ -2730,123 +595,6 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_lt_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, lt
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, lt
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, lt
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast olt <16 x half> %op1, %op2
@@ -2871,123 +619,6 @@ define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_ge_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ge
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ge
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ge
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast oge <16 x half> %op1, %op2
@@ -3012,123 +643,6 @@ define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_le_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, le
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, le
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, le
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast ole <16 x half> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
index ac0b6c0e0440..9bdde14e8d83 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,17 +17,6 @@ define void @fp_convert_combine_crash(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fp_convert_combine_crash:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov v0.4s, #8.00000000
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmul v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %f = load <8 x float>, ptr %a
   %mul.i = fmul <8 x float> %f, <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00,
                                  float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
index 16f30adbd14e..244a40510173 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,12 +17,6 @@ define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v2f16_to_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %res = fpext <2 x half> %a to <2 x float>
   store <2 x float> %res, ptr %b
   ret void
@@ -38,12 +31,6 @@ define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v4f16_to_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %res = fpext <4 x half> %a to <4 x float>
   store <4 x float> %res, ptr %b
   ret void
@@ -61,17 +48,6 @@ define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v8f16_to_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = fpext <8 x half> %a to <8 x float>
   store <8 x float> %res, ptr %b
   ret void
@@ -96,21 +72,6 @@ define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) {
 ; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    stp q2, q1, [x0, #32]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v16f16_to_v16f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %res = fpext <16 x half> %a to <16 x float>
   store <16 x float> %res, ptr %b
   ret void
@@ -129,13 +90,6 @@ define void @fcvt_v2f16_v2f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %res = fpext <2 x half> %op1 to <2 x float>
   store <2 x float> %res, ptr %b
@@ -150,13 +104,6 @@ define void @fcvt_v4f16_v4f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fpext <4 x half> %op1 to <4 x float>
   store <4 x float> %res, ptr %b
@@ -174,18 +121,6 @@ define void @fcvt_v8f16_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z1.s, p0/m, z1.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fpext <8 x half> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -210,22 +145,6 @@ define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fpext <16 x half> %op1 to <16 x float>
   store <16 x float> %res, ptr %b
@@ -243,13 +162,6 @@ define void @fcvt_v1f16_v1f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt d0, h0
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v1f16_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt d0, h0
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x half>, ptr %a
   %res = fpext <1 x half> %op1 to <1 x double>
   store <1 x double> %res, ptr %b
@@ -264,14 +176,6 @@ define void @fcvt_v2f16_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %res = fpext <2 x half> %op1 to <2 x double>
   store <2 x double> %res, ptr %b
@@ -289,19 +193,6 @@ define void @fcvt_v4f16_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fpext <4 x half> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -326,26 +217,6 @@ define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fpext <8 x half> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -387,38 +258,6 @@ define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q4, q0, [x1, #32]
 ; CHECK-NEXT:    stp q1, q2, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #72]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
-; NONEON-NOSVE-NEXT:    fcvtl v5.2d, v5.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    fcvtl v4.2d, v4.2s
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v7.2s
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v6.2s
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fpext <16 x half> %op1 to <16 x double>
   store <16 x double> %res, ptr %b
@@ -436,13 +275,6 @@ define void @fcvt_v1f32_v1f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt d0, s0
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v1f32_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x float>, ptr %a
   %res = fpext <1 x float> %op1 to <1 x double>
   store <1 x double> %res, ptr %b
@@ -457,13 +289,6 @@ define void @fcvt_v2f32_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %res = fpext <2 x float> %op1 to <2 x double>
   store <2 x double> %res, ptr %b
@@ -481,18 +306,6 @@ define void @fcvt_v4f32_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z1.d, p0/m, z1.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fpext <4 x float> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -517,22 +330,6 @@ define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fpext <8 x float> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -551,13 +348,6 @@ define void @fcvt_v2f32_v2f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    str s0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %res = fptrunc <2 x float> %op1 to <2 x half>
   store <2 x half> %res, ptr %b
@@ -572,13 +362,6 @@ define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptrunc <4 x float> %op1 to <4 x half>
   store <4 x half> %res, ptr %b
@@ -596,14 +379,6 @@ define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1, x8, lsl #1]
 ; CHECK-NEXT:    st1h { z1.s }, p0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptrunc <8 x float> %op1 to <8 x half>
   store <8 x half> %res, ptr %b
@@ -622,13 +397,6 @@ define void @fcvt_v1f64_v1f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
 ; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v1f64_v1f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt h0, d0
-; NONEON-NOSVE-NEXT:    str h0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x double>, ptr %a
   %res = fptrunc <1 x double> %op1 to <1 x half>
   store <1 x half> %res, ptr %b
@@ -643,14 +411,6 @@ define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
 ; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtxn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    str s0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x double>, ptr %a
   %res = fptrunc <2 x double> %op1 to <2 x half>
   store <2 x half> %res, ptr %b
@@ -668,15 +428,6 @@ define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    st1h { z0.d }, p0, [x1, x8, lsl #1]
 ; CHECK-NEXT:    st1h { z1.d }, p0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtxn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtxn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptrunc <4 x double> %op1 to <4 x half>
   store <4 x half> %res, ptr %b
@@ -695,13 +446,6 @@ define void @fcvt_v1f64_v1f32(<1 x double> %op1, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v1f64_v1f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    str s0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %res = fptrunc <1 x double> %op1 to <1 x float>
   store <1 x float> %res, ptr %b
   ret void
@@ -715,12 +459,6 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %res = fptrunc <2 x double> %op1 to <2 x float>
   store <2 x float> %res, ptr %b
   ret void
@@ -737,14 +475,6 @@ define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x1, x8, lsl #2]
 ; CHECK-NEXT:    st1w { z1.d }, p0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptrunc <4 x double> %op1 to <4 x float>
   store <4 x float> %res, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
index 44d7116e5f87..cbe71d715a8f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,18 +17,6 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <4 x half> %op1, %op2
   %res = fadd contract <4 x half> %mul, %op3
   ret <4 x half> %res
@@ -45,26 +32,6 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fmul v3.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <8 x half> %op1, %op2
   %res = fadd contract <8 x half> %mul, %op3
   ret <8 x half> %res
@@ -82,46 +49,6 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fmul v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fmul v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fmul v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x2]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fadd v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fadd v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %op3 = load <16 x half>, ptr %c
@@ -141,12 +68,6 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.2s, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov d0, d2
-; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <2 x float> %op1, %op2
   %res = fadd contract <2 x float> %mul, %op3
   ret <2 x float> %res
@@ -162,12 +83,6 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <4 x float> %op1, %op2
   %res = fadd contract <4 x float> %mul, %op3
   ret <4 x float> %res
@@ -185,16 +100,6 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fmla v1.4s, v0.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fmla v5.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %op3 = load <8 x float>, ptr %c
@@ -209,11 +114,6 @@ define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmadd d0, d0, d1, d2
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmadd d0, d0, d1, d2
-; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <1 x double> %op1, %op2
   %res = fadd contract <1 x double> %mul, %op3
   ret <1 x double> %res
@@ -229,12 +129,6 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
 ; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <2 x double> %op1, %op2
   %res = fadd contract <2 x double> %mul, %op3
   ret <2 x double> %res
@@ -252,16 +146,6 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fma_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fmla v1.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fmla v5.2d, v4.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %op3 = load <4 x double>, ptr %c
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
index bc7659c06ad0..94a74763aa0e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,38 +17,6 @@ define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxnm_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fmaxnm s5, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h2, s3
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s4, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
@@ -63,64 +30,6 @@ define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxnm_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s4
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fmaxnm s5, s5, s16
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s3, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s6, s3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fmaxnm s6, s16, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
@@ -136,119 +45,6 @@ define void @fmaxnm_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmaxnm z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxnm_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    fcvt s20, h3
-; NONEON-NOSVE-NEXT:    fcvt s21, h2
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s19, s4
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmaxnm s20, s21, s20
-; NONEON-NOSVE-NEXT:    fcvt s21, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
-; NONEON-NOSVE-NEXT:    fmaxnm s5, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmaxnm s6, s16, s6
-; NONEON-NOSVE-NEXT:    fmaxnm s16, s18, s17
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s18, h19
-; NONEON-NOSVE-NEXT:    fcvt s19, h24
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h17, s5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt h5, s20
-; NONEON-NOSVE-NEXT:    fmaxnm s20, s22, s21
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s21, h23
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
-; NONEON-NOSVE-NEXT:    fmaxnm s7, s18, s7
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s20
-; NONEON-NOSVE-NEXT:    fmaxnm s19, s21, s19
-; NONEON-NOSVE-NEXT:    fcvt s20, h23
-; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s6, h17
-; NONEON-NOSVE-NEXT:    fcvt s17, h22
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s19
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s6, s17, s6
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmaxnm s18, s20, s18
-; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s7, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s16, h21
-; NONEON-NOSVE-NEXT:    fcvt s21, h24
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    fmaxnm s7, s22, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmaxnm s16, s21, s16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
-; NONEON-NOSVE-NEXT:    fmaxnm s6, s19, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmaxnm s17, s23, s20
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = call <16 x half> @llvm.maxnum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -265,11 +61,6 @@ define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxnm_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnm v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
@@ -283,11 +74,6 @@ define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxnm_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
@@ -303,15 +89,6 @@ define void @fmaxnm_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmaxnm z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxnm_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmaxnm v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmaxnm v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -324,11 +101,6 @@ define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmaxnm d0, d0, d1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxnm_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnm d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.maxnum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
@@ -342,11 +114,6 @@ define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxnm_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnm v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
@@ -362,15 +129,6 @@ define void @fmaxnm_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmaxnm z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxnm_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmaxnm v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmaxnm v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -391,38 +149,6 @@ define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminnm_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fminnm s5, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    fminnm s3, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h2, s3
-; NONEON-NOSVE-NEXT:    fminnm s1, s4, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.minnum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
@@ -436,64 +162,6 @@ define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminnm_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fminnm s3, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s4
-; NONEON-NOSVE-NEXT:    fminnm s4, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fminnm s5, s5, s16
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s3, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fminnm s3, s6, s3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fminnm s6, s16, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.minnum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
@@ -509,119 +177,6 @@ define void @fminnm_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminnm_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    fcvt s20, h3
-; NONEON-NOSVE-NEXT:    fcvt s21, h2
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fminnm s4, s19, s4
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
-; NONEON-NOSVE-NEXT:    fminnm s20, s21, s20
-; NONEON-NOSVE-NEXT:    fcvt s21, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
-; NONEON-NOSVE-NEXT:    fminnm s5, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
-; NONEON-NOSVE-NEXT:    fminnm s6, s16, s6
-; NONEON-NOSVE-NEXT:    fminnm s16, s18, s17
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s18, h19
-; NONEON-NOSVE-NEXT:    fcvt s19, h24
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h17, s5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt h5, s20
-; NONEON-NOSVE-NEXT:    fminnm s20, s22, s21
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s21, h23
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
-; NONEON-NOSVE-NEXT:    fminnm s7, s18, s7
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s20
-; NONEON-NOSVE-NEXT:    fminnm s19, s21, s19
-; NONEON-NOSVE-NEXT:    fcvt s20, h23
-; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s6, h17
-; NONEON-NOSVE-NEXT:    fcvt s17, h22
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s19
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s6, s17, s6
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fminnm s18, s20, s18
-; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s7, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s16, h21
-; NONEON-NOSVE-NEXT:    fcvt s21, h24
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    fminnm s7, s22, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fminnm s16, s21, s16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
-; NONEON-NOSVE-NEXT:    fminnm s6, s19, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fminnm s17, s23, s20
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = call <16 x half> @llvm.minnum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -638,11 +193,6 @@ define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminnm_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnm v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.minnum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
@@ -656,11 +206,6 @@ define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminnm_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnm v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.minnum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
@@ -676,15 +221,6 @@ define void @fminnm_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminnm_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fminnm v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fminnm v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = call <8 x float> @llvm.minnum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -697,11 +233,6 @@ define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fminnm d0, d0, d1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminnm_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnm d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.minnum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
@@ -715,11 +246,6 @@ define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminnm_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnm v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.minnum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
@@ -735,15 +261,6 @@ define void @fminnm_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminnm_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fminnm v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fminnm v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = call <4 x double> @llvm.minnum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -764,38 +281,6 @@ define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmax_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fmax s5, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmax s3, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h2, s3
-; NONEON-NOSVE-NEXT:    fmax s1, s4, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.maximum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
@@ -809,64 +294,6 @@ define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmax_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fmax s3, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s4
-; NONEON-NOSVE-NEXT:    fmax s4, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fmax s5, s5, s16
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s3, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fmax s3, s6, s3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fmax s6, s16, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.maximum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
@@ -882,119 +309,6 @@ define void @fmax_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmax z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmax_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    fcvt s20, h3
-; NONEON-NOSVE-NEXT:    fcvt s21, h2
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fmax s4, s19, s4
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmax s20, s21, s20
-; NONEON-NOSVE-NEXT:    fcvt s21, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
-; NONEON-NOSVE-NEXT:    fmax s5, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmax s6, s16, s6
-; NONEON-NOSVE-NEXT:    fmax s16, s18, s17
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s18, h19
-; NONEON-NOSVE-NEXT:    fcvt s19, h24
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h17, s5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt h5, s20
-; NONEON-NOSVE-NEXT:    fmax s20, s22, s21
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s21, h23
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
-; NONEON-NOSVE-NEXT:    fmax s7, s18, s7
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s20
-; NONEON-NOSVE-NEXT:    fmax s19, s21, s19
-; NONEON-NOSVE-NEXT:    fcvt s20, h23
-; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s6, h17
-; NONEON-NOSVE-NEXT:    fcvt s17, h22
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s19
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s6, s17, s6
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmax s18, s20, s18
-; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s7, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s16, h21
-; NONEON-NOSVE-NEXT:    fcvt s21, h24
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    fmax s7, s22, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmax s16, s21, s16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
-; NONEON-NOSVE-NEXT:    fmax s6, s19, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmax s17, s23, s20
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = call <16 x half> @llvm.maximum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -1011,11 +325,6 @@ define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmax_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmax v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.maximum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
@@ -1029,11 +338,6 @@ define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmax_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmax v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.maximum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
@@ -1049,15 +353,6 @@ define void @fmax_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmax z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmax_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmax v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = call <8 x float> @llvm.maximum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -1070,11 +365,6 @@ define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmax d0, d0, d1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmax_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmax d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.maximum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
@@ -1088,11 +378,6 @@ define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmax_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmax v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.maximum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
@@ -1108,15 +393,6 @@ define void @fmax_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmax z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmax_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmax v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmax v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = call <4 x double> @llvm.maximum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -1137,38 +413,6 @@ define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmin_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fmin s5, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmin s3, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h2, s3
-; NONEON-NOSVE-NEXT:    fmin s1, s4, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.minimum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
@@ -1182,64 +426,6 @@ define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmin_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fmin s3, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s4
-; NONEON-NOSVE-NEXT:    fmin s4, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fmin s5, s5, s16
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s3, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fmin s3, s6, s3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fmin s6, s16, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.minimum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
@@ -1255,119 +441,6 @@ define void @fmin_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmin z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmin_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    fcvt s20, h3
-; NONEON-NOSVE-NEXT:    fcvt s21, h2
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fmin s4, s19, s4
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmin s20, s21, s20
-; NONEON-NOSVE-NEXT:    fcvt s21, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
-; NONEON-NOSVE-NEXT:    fmin s5, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmin s6, s16, s6
-; NONEON-NOSVE-NEXT:    fmin s16, s18, s17
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s18, h19
-; NONEON-NOSVE-NEXT:    fcvt s19, h24
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h17, s5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt h5, s20
-; NONEON-NOSVE-NEXT:    fmin s20, s22, s21
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s21, h23
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
-; NONEON-NOSVE-NEXT:    fmin s7, s18, s7
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s20
-; NONEON-NOSVE-NEXT:    fmin s19, s21, s19
-; NONEON-NOSVE-NEXT:    fcvt s20, h23
-; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s6, h17
-; NONEON-NOSVE-NEXT:    fcvt s17, h22
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s19
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s6, s17, s6
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmin s18, s20, s18
-; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s7, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s16, h21
-; NONEON-NOSVE-NEXT:    fcvt s21, h24
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    fmin s7, s22, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmin s16, s21, s16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
-; NONEON-NOSVE-NEXT:    fmin s6, s19, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmin s17, s23, s20
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = call <16 x half> @llvm.minimum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -1384,11 +457,6 @@ define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmin_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmin v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.minimum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
@@ -1402,11 +470,6 @@ define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmin_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmin v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.minimum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
@@ -1422,15 +485,6 @@ define void @fmin_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmin z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmin_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmin v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = call <8 x float> @llvm.minimum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -1443,11 +497,6 @@ define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmin d0, d0, d1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmin_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmin d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.minimum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
@@ -1461,11 +510,6 @@ define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmin_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmin v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.minimum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
@@ -1481,15 +525,6 @@ define void @fmin_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmin z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmin_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmin v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmin v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = call <4 x double> @llvm.minimum.v4f64(<4 x double> %op1, <4 x double> %op2)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
index fdb81b8e5fe1..b56e67d95ba0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=FA64
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=NO-FA64
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -27,30 +26,6 @@ define half @fadda_v4f16(half %start, <4 x half> %a) {
 ; NO-FA64-NEXT:    fadd h0, h0, h2
 ; NO-FA64-NEXT:    fadd h0, h0, h1
 ; NO-FA64-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadda_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
index 74a5db4b38e0..df9613a30e40 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -20,30 +19,6 @@ define half @fadda_v4f16(half %start, <4 x half> %a) {
 ; CHECK-NEXT:    fadd h0, h0, h2
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadda_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
 }
@@ -68,49 +43,6 @@ define half @fadda_v8f16(half %start, <8 x half> %a) {
 ; CHECK-NEXT:    fadd h0, h0, h2
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadda_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
   ret half %res
 }
@@ -151,90 +83,6 @@ define half @fadda_v16f16(half %start, ptr %a) {
 ; CHECK-NEXT:    fadd h0, h0, h2
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadda_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
   ret half %res
@@ -248,14 +96,6 @@ define float @fadda_v2f32(float %start, <2 x float> %a) {
 ; CHECK-NEXT:    mov z1.s, z1.s[1]
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadda_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
   ret float %res
 }
@@ -272,17 +112,6 @@ define float @fadda_v4f32(float %start, <4 x float> %a) {
 ; CHECK-NEXT:    fadd s0, s0, s2
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadda_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov s3, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov s1, v1.s[3]
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s3
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
   ret float %res
 }
@@ -307,26 +136,6 @@ define float @fadda_v8f32(float %start, ptr %a) {
 ; CHECK-NEXT:    fadd s0, s0, s2
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadda_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov s3, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov s1, v1.s[3]
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s3
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov s3, v1.s[2]
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov s1, v1.s[3]
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s3
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
   ret float %res
@@ -337,11 +146,6 @@ define double @fadda_v1f64(double %start, <1 x double> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadda_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
   ret double %res
 }
@@ -354,13 +158,6 @@ define double @fadda_v2f64(double %start, <2 x double> %a) {
 ; CHECK-NEXT:    mov z1.d, z1.d[1]
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadda_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov d2, v1.d[1]
-; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
-; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
-; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
   ret double %res
 }
@@ -377,17 +174,6 @@ define double @fadda_v4f64(double %start, ptr %a) {
 ; CHECK-NEXT:    mov z1.d, z1.d[1]
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadda_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov d2, v3.d[1]
-; NONEON-NOSVE-NEXT:    fadd d0, d0, d3
-; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
-; NONEON-NOSVE-NEXT:    mov d2, v1.d[1]
-; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
-; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
   ret double %res
@@ -405,30 +191,6 @@ define half @faddv_v4f16(half %start, <4 x half> %a) {
 ; CHECK-NEXT:    faddv h1, p0, z1.h
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: faddv_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
 }
@@ -441,49 +203,6 @@ define half @faddv_v8f16(half %start, <8 x half> %a) {
 ; CHECK-NEXT:    faddv h1, p0, z1.h
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: faddv_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
   ret half %res
 }
@@ -497,58 +216,6 @@ define half @faddv_v16f16(half %start, ptr %a) {
 ; CHECK-NEXT:    faddv h1, p0, z1.h
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: faddv_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fadd v3.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v1.4s
-; NONEON-NOSVE-NEXT:    mov h1, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s3, s1
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
   ret half %res
@@ -562,12 +229,6 @@ define float @faddv_v2f32(float %start, <2 x float> %a) {
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: faddv_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    faddp s1, v1.2s
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    ret
   %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
   ret float %res
 }
@@ -580,13 +241,6 @@ define float @faddv_v4f32(float %start, <4 x float> %a) {
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: faddv_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    faddp v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    faddp s1, v1.2s
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    ret
   %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
   ret float %res
 }
@@ -600,15 +254,6 @@ define float @faddv_v8f32(float %start, ptr %a) {
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: faddv_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v2.4s, v1.4s
-; NONEON-NOSVE-NEXT:    faddp v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    faddp s1, v1.2s
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
   ret float %res
@@ -619,11 +264,6 @@ define double @faddv_v1f64(double %start, <1 x double> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: faddv_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %res = call fast double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
   ret double %res
 }
@@ -636,12 +276,6 @@ define double @faddv_v2f64(double %start, <2 x double> %a) {
 ; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: faddv_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    faddp d1, v1.2d
-; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
   ret double %res
 }
@@ -655,14 +289,6 @@ define double @faddv_v4f64(double %start, ptr %a) {
 ; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: faddv_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
-; NONEON-NOSVE-NEXT:    fadd v1.2d, v2.2d, v1.2d
-; NONEON-NOSVE-NEXT:    faddp d1, v1.2d
-; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call fast double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
   ret double %res
@@ -680,26 +306,6 @@ define half @fmaxv_v4f16(<4 x half> %a) {
 ; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxv_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
   ret half %res
 }
@@ -712,45 +318,6 @@ define half @fmaxv_v8f16(<8 x half> %a) {
 ; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxv_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
   ret half %res
 }
@@ -764,85 +331,6 @@ define half @fmaxv_v16f16(ptr %a) {
 ; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxv_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s4, s2
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
   ret half %res
@@ -856,11 +344,6 @@ define float @fmaxv_v2f32(<2 x float> %a) {
 ; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxv_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnmp s0, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
   ret float %res
 }
@@ -873,11 +356,6 @@ define float @fmaxv_v4f32(<4 x float> %a) {
 ; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxv_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnmv s0, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
   ret float %res
 }
@@ -891,13 +369,6 @@ define float @fmaxv_v8f32(ptr %a) {
 ; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxv_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmaxnm v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmaxnmv s0, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
   ret float %res
@@ -907,10 +378,6 @@ define double @fmaxv_v1f64(<1 x double> %a) {
 ; CHECK-LABEL: fmaxv_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxv_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
   ret double %res
 }
@@ -923,11 +390,6 @@ define double @fmaxv_v2f64(<2 x double> %a) {
 ; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxv_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnmp d0, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
   ret double %res
 }
@@ -941,13 +403,6 @@ define double @fmaxv_v4f64(ptr %a) {
 ; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaxv_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmaxnm v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmaxnmp d0, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
   ret double %res
@@ -965,26 +420,6 @@ define half @fminv_v4f16(<4 x half> %a) {
 ; CHECK-NEXT:    fminnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminv_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
   ret half %res
 }
@@ -997,45 +432,6 @@ define half @fminv_v8f16(<8 x half> %a) {
 ; CHECK-NEXT:    fminnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminv_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
   ret half %res
 }
@@ -1049,85 +445,6 @@ define half @fminv_v16f16(ptr %a) {
 ; CHECK-NEXT:    fminnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminv_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fminnm s2, s4, s2
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fminnm s3, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
   ret half %res
@@ -1141,11 +458,6 @@ define float @fminv_v2f32(<2 x float> %a) {
 ; CHECK-NEXT:    fminnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminv_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnmp s0, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
   ret float %res
 }
@@ -1158,11 +470,6 @@ define float @fminv_v4f32(<4 x float> %a) {
 ; CHECK-NEXT:    fminnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminv_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnmv s0, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
   ret float %res
 }
@@ -1176,13 +483,6 @@ define float @fminv_v8f32(ptr %a) {
 ; CHECK-NEXT:    fminnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminv_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fminnm v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fminnmv s0, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
   ret float %res
@@ -1192,10 +492,6 @@ define double @fminv_v1f64(<1 x double> %a) {
 ; CHECK-LABEL: fminv_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminv_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
   ret double %res
 }
@@ -1208,11 +504,6 @@ define double @fminv_v2f64(<2 x double> %a) {
 ; CHECK-NEXT:    fminnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminv_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnmp d0, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
   ret double %res
 }
@@ -1226,13 +517,6 @@ define double @fminv_v4f64(ptr %a) {
 ; CHECK-NEXT:    fminnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminv_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fminnm v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fminnmp d0, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
   ret double %res
@@ -1250,26 +534,6 @@ define half @fmaximumv_v4f16(<4 x half> %a) {
 ; CHECK-NEXT:    fmaxv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaximumv_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %a)
   ret half %res
 }
@@ -1282,45 +546,6 @@ define half @fmaximumv_v8f16(<8 x half> %a) {
 ; CHECK-NEXT:    fmaxv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaximumv_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> %a)
   ret half %res
 }
@@ -1334,85 +559,6 @@ define half @fmaximumv_v16f16(ptr %a) {
 ; CHECK-NEXT:    fmaxv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaximumv_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmax s2, s4, s2
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    fmax s3, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> %op)
   ret half %res
@@ -1426,11 +572,6 @@ define float @fmaximumv_v2f32(<2 x float> %a) {
 ; CHECK-NEXT:    fmaxv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaximumv_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxp s0, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %a)
   ret float %res
 }
@@ -1443,11 +584,6 @@ define float @fmaximumv_v4f32(<4 x float> %a) {
 ; CHECK-NEXT:    fmaxv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaximumv_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxv s0, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a)
   ret float %res
 }
@@ -1461,13 +597,6 @@ define float @fmaximumv_v8f32(ptr %a) {
 ; CHECK-NEXT:    fmaxv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaximumv_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmaxv s0, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %op)
   ret float %res
@@ -1477,10 +606,6 @@ define double @fmaximumv_v1f64(<1 x double> %a) {
 ; CHECK-LABEL: fmaximumv_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaximumv_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmaximum.v1f64(<1 x double> %a)
   ret double %res
 }
@@ -1493,11 +618,6 @@ define double @fmaximumv_v2f64(<2 x double> %a) {
 ; CHECK-NEXT:    fmaxv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaximumv_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxp d0, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a)
   ret double %res
 }
@@ -1511,13 +631,6 @@ define double @fmaximumv_v4f64(ptr %a) {
 ; CHECK-NEXT:    fmaxv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fmaximumv_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmax v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmaxp d0, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %op)
   ret double %res
@@ -1535,26 +648,6 @@ define half @fminimumv_v4f16(<4 x half> %a) {
 ; CHECK-NEXT:    fminv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminimumv_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %a)
   ret half %res
 }
@@ -1567,45 +660,6 @@ define half @fminimumv_v8f16(<8 x half> %a) {
 ; CHECK-NEXT:    fminv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminimumv_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> %a)
   ret half %res
 }
@@ -1619,85 +673,6 @@ define half @fminimumv_v16f16(ptr %a) {
 ; CHECK-NEXT:    fminv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminimumv_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmin s2, s4, s2
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    fmin s3, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> %op)
   ret half %res
@@ -1711,11 +686,6 @@ define float @fminimumv_v2f32(<2 x float> %a) {
 ; CHECK-NEXT:    fminv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminimumv_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminp s0, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fminimum.v2f32(<2 x float> %a)
   ret float %res
 }
@@ -1728,11 +698,6 @@ define float @fminimumv_v4f32(<4 x float> %a) {
 ; CHECK-NEXT:    fminv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminimumv_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminv s0, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
   ret float %res
 }
@@ -1746,13 +711,6 @@ define float @fminimumv_v8f32(ptr %a) {
 ; CHECK-NEXT:    fminv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminimumv_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fminv s0, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %op)
   ret float %res
@@ -1762,10 +720,6 @@ define double @fminimumv_v1f64(<1 x double> %a) {
 ; CHECK-LABEL: fminimumv_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminimumv_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fminimum.v1f64(<1 x double> %a)
   ret double %res
 }
@@ -1778,11 +732,6 @@ define double @fminimumv_v2f64(<2 x double> %a) {
 ; CHECK-NEXT:    fminv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminimumv_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminp d0, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %a)
   ret double %res
 }
@@ -1796,13 +745,6 @@ define double @fminimumv_v4f64(ptr %a) {
 ; CHECK-NEXT:    fminv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fminimumv_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmin v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fminp d0, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %op)
   ret double %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
index 454683865eb9..7ddc641f366c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,13 +16,6 @@ define <2 x half> @frintp_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintp_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.ceil.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -36,13 +28,6 @@ define <4 x half> @frintp_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintp_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.ceil.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -55,16 +40,6 @@ define <8 x half> @frintp_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintp_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintp v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintp v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.ceil.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -78,24 +53,6 @@ define void @frintp_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintp z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintp_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintp v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintp v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintp v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.ceil.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -110,11 +67,6 @@ define <2 x float> @frintp_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintp z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintp_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintp v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.ceil.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -127,11 +79,6 @@ define <4 x float> @frintp_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintp z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintp_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.ceil.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -145,14 +92,6 @@ define void @frintp_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintp z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintp_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintp v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -164,11 +103,6 @@ define <1 x double> @frintp_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintp d0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintp_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintp d0, d0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.ceil.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -181,11 +115,6 @@ define <2 x double> @frintp_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintp z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintp_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintp v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.ceil.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -199,14 +128,6 @@ define void @frintp_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintp z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintp_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintp v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintp v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.ceil.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -225,13 +146,6 @@ define <2 x half> @frintm_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintm_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.floor.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -244,13 +158,6 @@ define <4 x half> @frintm_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintm_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.floor.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -263,16 +170,6 @@ define <8 x half> @frintm_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintm_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintm v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintm v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.floor.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -286,24 +183,6 @@ define void @frintm_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintm z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintm_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintm v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintm v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintm v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.floor.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -318,11 +197,6 @@ define <2 x float> @frintm_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintm z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintm_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintm v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.floor.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -335,11 +209,6 @@ define <4 x float> @frintm_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintm z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintm_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.floor.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -353,14 +222,6 @@ define void @frintm_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintm z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintm_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintm v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.floor.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -372,11 +233,6 @@ define <1 x double> @frintm_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintm d0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintm_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintm d0, d0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.floor.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -389,11 +245,6 @@ define <2 x double> @frintm_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintm z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintm_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintm v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.floor.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -407,14 +258,6 @@ define void @frintm_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintm z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintm_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintm v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintm v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.floor.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -433,13 +276,6 @@ define <2 x half> @frinti_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinti_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.nearbyint.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -452,13 +288,6 @@ define <4 x half> @frinti_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinti_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -471,16 +300,6 @@ define <8 x half> @frinti_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinti_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frinti v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frinti v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -494,24 +313,6 @@ define void @frinti_v16f16(ptr %a) {
 ; CHECK-NEXT:    frinti z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinti_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frinti v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frinti v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frinti v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.nearbyint.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -526,11 +327,6 @@ define <2 x float> @frinti_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frinti z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinti_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinti v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -543,11 +339,6 @@ define <4 x float> @frinti_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frinti z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinti_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -561,14 +352,6 @@ define void @frinti_v8f32(ptr %a) {
 ; CHECK-NEXT:    frinti z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinti_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frinti v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -580,11 +363,6 @@ define <1 x double> @frinti_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frinti d0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinti_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinti d0, d0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -597,11 +375,6 @@ define <2 x double> @frinti_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frinti z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinti_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinti v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -615,14 +388,6 @@ define void @frinti_v4f64(ptr %a) {
 ; CHECK-NEXT:    frinti z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinti_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frinti v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frinti v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -641,13 +406,6 @@ define <2 x half> @frintx_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintx_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.rint.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -660,13 +418,6 @@ define <4 x half> @frintx_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintx_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.rint.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -679,16 +430,6 @@ define <8 x half> @frintx_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintx_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintx v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintx v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.rint.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -702,24 +443,6 @@ define void @frintx_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintx z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintx_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintx v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintx v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintx v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.rint.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -734,11 +457,6 @@ define <2 x float> @frintx_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintx_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintx v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.rint.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -751,11 +469,6 @@ define <4 x float> @frintx_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintx_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.rint.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -769,14 +482,6 @@ define void @frintx_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintx z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintx_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintx v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.rint.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -788,11 +493,6 @@ define <1 x double> @frintx_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintx d0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintx_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintx d0, d0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.rint.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -805,11 +505,6 @@ define <2 x double> @frintx_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintx_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintx v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.rint.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -823,14 +518,6 @@ define void @frintx_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintx_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintx v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintx v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.rint.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -849,13 +536,6 @@ define <2 x half> @frinta_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinta_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.round.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -868,13 +548,6 @@ define <4 x half> @frinta_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinta_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.round.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -887,16 +560,6 @@ define <8 x half> @frinta_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinta_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frinta v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frinta v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.round.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -910,24 +573,6 @@ define void @frinta_v16f16(ptr %a) {
 ; CHECK-NEXT:    frinta z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinta_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frinta v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frinta v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frinta v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.round.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -942,11 +587,6 @@ define <2 x float> @frinta_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frinta z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinta_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinta v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.round.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -959,11 +599,6 @@ define <4 x float> @frinta_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frinta z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinta_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.round.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -977,14 +612,6 @@ define void @frinta_v8f32(ptr %a) {
 ; CHECK-NEXT:    frinta z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinta_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frinta v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.round.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -996,11 +623,6 @@ define <1 x double> @frinta_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frinta d0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinta_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinta d0, d0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.round.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -1013,11 +635,6 @@ define <2 x double> @frinta_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frinta z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinta_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinta v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.round.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -1031,14 +648,6 @@ define void @frinta_v4f64(ptr %a) {
 ; CHECK-NEXT:    frinta z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frinta_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frinta v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frinta v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.round.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -1057,13 +666,6 @@ define <2 x half> @frintn_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintn_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -1076,13 +678,6 @@ define <4 x half> @frintn_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintn_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -1095,16 +690,6 @@ define <8 x half> @frintn_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintn_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintn v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintn v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.roundeven.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -1118,24 +703,6 @@ define void @frintn_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintn z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintn_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintn v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintn v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintn v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.roundeven.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -1150,11 +717,6 @@ define <2 x float> @frintn_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintn_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintn v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -1167,11 +729,6 @@ define <4 x float> @frintn_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintn_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -1185,14 +742,6 @@ define void @frintn_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintn z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintn_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintn v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -1204,11 +753,6 @@ define <1 x double> @frintn_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintn d0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintn_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintn d0, d0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -1221,11 +765,6 @@ define <2 x double> @frintn_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintn z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintn_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintn v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -1239,14 +778,6 @@ define void @frintn_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintn z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintn_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintn v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintn v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -1265,13 +796,6 @@ define <2 x half> @frintz_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintz_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.trunc.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -1284,13 +808,6 @@ define <4 x half> @frintz_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintz_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.trunc.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -1303,16 +820,6 @@ define <8 x half> @frintz_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintz_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintz v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintz v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.trunc.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -1326,24 +833,6 @@ define void @frintz_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintz z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintz_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintz v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintz v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintz v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.trunc.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -1358,11 +847,6 @@ define <2 x float> @frintz_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintz_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintz v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.trunc.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -1375,11 +859,6 @@ define <4 x float> @frintz_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintz_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.trunc.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -1393,14 +872,6 @@ define void @frintz_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintz z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintz_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintz v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.trunc.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -1412,11 +883,6 @@ define <1 x double> @frintz_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintz d0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintz_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintz d0, d0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.trunc.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -1429,11 +895,6 @@ define <2 x double> @frintz_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintz_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintz v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.trunc.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -1447,14 +908,6 @@ define void @frintz_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintz z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: frintz_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintz v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintz v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.trunc.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
index 0268dd1b5d31..7d36925fdc57 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,14 +16,6 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x half> %op1, <2 x half> %op2
   ret <2 x half> %sel
 }
@@ -41,14 +32,6 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x half> %op1, <4 x half> %op2
   ret <4 x half> %sel
 }
@@ -65,14 +48,6 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.8h, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x half> %op1, <8 x half> %op2
   ret <8 x half> %sel
 }
@@ -92,20 +67,6 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.h, p0, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <16 x half>, ptr %a
   %op2 = load volatile <16 x half>, ptr %b
   %sel = select i1 %mask, <16 x half> %op1, <16 x half> %op2
@@ -125,14 +86,6 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x float> %op1, <2 x float> %op2
   ret <2 x float> %sel
 }
@@ -149,14 +102,6 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4s, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x float> %op1, <4 x float> %op2
   ret <4 x float> %sel
 }
@@ -176,20 +121,6 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.s, p0, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <8 x float>, ptr %a
   %op2 = load volatile <8 x float>, ptr %b
   %sel = select i1 %mask, <8 x float> %op1, <8 x float> %op2
@@ -203,14 +134,6 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    fcsel d0, d0, d1, ne
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    fmov d2, x8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <1 x double> %op1, <1 x double> %op2
   ret <1 x double> %sel
 }
@@ -228,14 +151,6 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2d, x8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x double> %op1, <2 x double> %op2
   ret <2 x double> %sel
 }
@@ -256,20 +171,6 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.d, p0, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <4 x double>, ptr %a
   %op2 = load volatile <4 x double>, ptr %b
   %sel = select i1 %mask, <4 x double> %op1, <4 x double> %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index 1c63a3870d68..bf8a335a8503 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -16,13 +15,6 @@ define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x half> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
@@ -35,21 +27,6 @@ define void @fcvtzu_v8f16_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i16>
   store <8 x i16> %res, ptr %b
@@ -65,27 +42,6 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -105,13 +61,6 @@ define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x half> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -125,12 +74,6 @@ define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x half> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
@@ -147,20 +90,6 @@ define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -185,26 +114,6 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i32>
   store <16 x i32> %res, ptr %b
@@ -221,13 +130,6 @@ define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) {
 ; CHECK-NEXT:    fcvtzu x8, h0
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v1f16_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x half> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -243,18 +145,6 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [sp], #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
-; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x half> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -277,27 +167,6 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s1
-; NONEON-NOSVE-NEXT:    fcvtzu x10, s2
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s3
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fptoui <4 x half> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -335,47 +204,6 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1, #32]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h7, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvtzu x13, s2
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h7
-; NONEON-NOSVE-NEXT:    fcvtzu x10, s3
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s4
-; NONEON-NOSVE-NEXT:    fcvtzu x12, s5
-; NONEON-NOSVE-NEXT:    fcvtzu x14, s6
-; NONEON-NOSVE-NEXT:    fmov d3, x13
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s1
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d2, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v3.d[1], x8
-; NONEON-NOSVE-NEXT:    mov v2.d[1], x14
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i64>
   store <8 x i64> %res, ptr %b
@@ -436,80 +264,6 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q5, q2, [x1, #96]
 ; CHECK-NEXT:    add sp, sp, #128
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s3, h1
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h0
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s17, h4
-; NONEON-NOSVE-NEXT:    mov h18, v4.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    mov h16, v4.h[3]
-; NONEON-NOSVE-NEXT:    fcvtzu x9, s6
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    mov h4, v4.h[1]
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s2
-; NONEON-NOSVE-NEXT:    mov h2, v6.h[2]
-; NONEON-NOSVE-NEXT:    fcvtzu x10, s17
-; NONEON-NOSVE-NEXT:    fcvtzu x13, s5
-; NONEON-NOSVE-NEXT:    fcvtzu x12, s3
-; NONEON-NOSVE-NEXT:    mov h3, v6.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov h5, v6.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s17, h18
-; NONEON-NOSVE-NEXT:    fcvtzu x14, s7
-; NONEON-NOSVE-NEXT:    fmov d7, x8
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fmov d0, x11
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s1
-; NONEON-NOSVE-NEXT:    fmov d1, x13
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvtzu x13, s16
-; NONEON-NOSVE-NEXT:    fmov d16, x9
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvtzu x15, s17
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x12
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x14
-; NONEON-NOSVE-NEXT:    fcvtzu x9, s2
-; NONEON-NOSVE-NEXT:    mov v16.d[1], x8
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s6
-; NONEON-NOSVE-NEXT:    fcvtzu x14, s4
-; NONEON-NOSVE-NEXT:    fcvtzu x12, s3
-; NONEON-NOSVE-NEXT:    mov v7.d[1], x11
-; NONEON-NOSVE-NEXT:    fmov d3, x10
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s5
-; NONEON-NOSVE-NEXT:    fmov d2, x15
-; NONEON-NOSVE-NEXT:    stp q16, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d4, x8
-; NONEON-NOSVE-NEXT:    stp q7, q0, [x1]
-; NONEON-NOSVE-NEXT:    mov v2.d[1], x13
-; NONEON-NOSVE-NEXT:    mov v3.d[1], x14
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x12
-; NONEON-NOSVE-NEXT:    mov v4.d[1], x11
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #96]
-; NONEON-NOSVE-NEXT:    stp q4, q1, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i64>
   store <16 x i64> %res, ptr %b
@@ -528,11 +282,6 @@ define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
@@ -546,12 +295,6 @@ define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x float> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
@@ -569,14 +312,6 @@ define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i16>
   ret <8 x i16> %res
@@ -601,19 +336,6 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v16f32_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x float>, ptr %a
   %res = fptoui <16 x float> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -632,11 +354,6 @@ define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -649,11 +366,6 @@ define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x float> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
@@ -667,14 +379,6 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -694,13 +398,6 @@ define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v1f32_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x float> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -714,12 +411,6 @@ define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -736,20 +427,6 @@ define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptoui <4 x float> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -774,26 +451,6 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i64>
   store <8 x i64> %res, ptr %b
@@ -811,12 +468,6 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i16>
   ret <1 x i16> %res
 }
@@ -830,12 +481,6 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
@@ -864,15 +509,6 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i16>
   ret <4 x i16> %res
@@ -916,23 +552,6 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) {
 ; CHECK-NEXT:    strh w8, [sp, #2]
 ; CHECK-NEXT:    ldr q0, [sp], #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI26_0
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    xtn v7.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI26_0]
-; NONEON-NOSVE-NEXT:    xtn v6.2s, v1.2d
-; NONEON-NOSVE-NEXT:    xtn v5.2s, v2.2d
-; NONEON-NOSVE-NEXT:    xtn v4.2s, v3.2d
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptoui <8 x double> %op1 to <8 x i16>
   ret <8 x i16> %res
@@ -1009,35 +628,6 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v16f64_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI27_0
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v6.2d, v6.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v7.2d, v7.2d
-; NONEON-NOSVE-NEXT:    xtn v19.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI27_0]
-; NONEON-NOSVE-NEXT:    xtn v23.2s, v3.2d
-; NONEON-NOSVE-NEXT:    xtn v18.2s, v1.2d
-; NONEON-NOSVE-NEXT:    xtn v22.2s, v2.2d
-; NONEON-NOSVE-NEXT:    xtn v17.2s, v5.2d
-; NONEON-NOSVE-NEXT:    xtn v21.2s, v6.2d
-; NONEON-NOSVE-NEXT:    xtn v16.2s, v4.2d
-; NONEON-NOSVE-NEXT:    xtn v20.2s, v7.2d
-; NONEON-NOSVE-NEXT:    tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x double>, ptr %a
   %res = fptoui <16 x double> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -1057,13 +647,6 @@ define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i32>
   ret <1 x i32> %res
 }
@@ -1077,12 +660,6 @@ define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -1100,14 +677,6 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i32>
   ret <4 x i32> %res
@@ -1132,19 +701,6 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptoui <8 x double> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -1163,12 +719,6 @@ define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -1181,11 +731,6 @@ define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -1199,14 +744,6 @@ define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -1225,13 +762,6 @@ define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x half> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
@@ -1244,21 +774,6 @@ define void @fcvtzs_v8f16_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i16>
   store <8 x i16> %res, ptr %b
@@ -1274,27 +789,6 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -1314,13 +808,6 @@ define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x half> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -1334,12 +821,6 @@ define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x half> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
@@ -1356,20 +837,6 @@ define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -1394,26 +861,6 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i32>
   store <16 x i32> %res, ptr %b
@@ -1430,13 +877,6 @@ define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) {
 ; CHECK-NEXT:    fcvtzs x8, h0
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v1f16_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x half> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -1453,18 +893,6 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [sp], #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
-; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x half> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -1487,27 +915,6 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s1
-; NONEON-NOSVE-NEXT:    fcvtzs x10, s2
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s3
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fptosi <4 x half> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -1545,47 +952,6 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1, #32]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h7, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvtzs x13, s2
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h7
-; NONEON-NOSVE-NEXT:    fcvtzs x10, s3
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s4
-; NONEON-NOSVE-NEXT:    fcvtzs x12, s5
-; NONEON-NOSVE-NEXT:    fcvtzs x14, s6
-; NONEON-NOSVE-NEXT:    fmov d3, x13
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s1
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d2, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v3.d[1], x8
-; NONEON-NOSVE-NEXT:    mov v2.d[1], x14
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i64>
   store <8 x i64> %res, ptr %b
@@ -1646,80 +1012,6 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q5, q2, [x1, #96]
 ; CHECK-NEXT:    add sp, sp, #128
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s3, h1
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h0
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s17, h4
-; NONEON-NOSVE-NEXT:    mov h18, v4.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    mov h16, v4.h[3]
-; NONEON-NOSVE-NEXT:    fcvtzs x9, s6
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    mov h4, v4.h[1]
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s2
-; NONEON-NOSVE-NEXT:    mov h2, v6.h[2]
-; NONEON-NOSVE-NEXT:    fcvtzs x10, s17
-; NONEON-NOSVE-NEXT:    fcvtzs x13, s5
-; NONEON-NOSVE-NEXT:    fcvtzs x12, s3
-; NONEON-NOSVE-NEXT:    mov h3, v6.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov h5, v6.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s17, h18
-; NONEON-NOSVE-NEXT:    fcvtzs x14, s7
-; NONEON-NOSVE-NEXT:    fmov d7, x8
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fmov d0, x11
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s1
-; NONEON-NOSVE-NEXT:    fmov d1, x13
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvtzs x13, s16
-; NONEON-NOSVE-NEXT:    fmov d16, x9
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvtzs x15, s17
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x12
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x14
-; NONEON-NOSVE-NEXT:    fcvtzs x9, s2
-; NONEON-NOSVE-NEXT:    mov v16.d[1], x8
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s6
-; NONEON-NOSVE-NEXT:    fcvtzs x14, s4
-; NONEON-NOSVE-NEXT:    fcvtzs x12, s3
-; NONEON-NOSVE-NEXT:    mov v7.d[1], x11
-; NONEON-NOSVE-NEXT:    fmov d3, x10
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s5
-; NONEON-NOSVE-NEXT:    fmov d2, x15
-; NONEON-NOSVE-NEXT:    stp q16, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d4, x8
-; NONEON-NOSVE-NEXT:    stp q7, q0, [x1]
-; NONEON-NOSVE-NEXT:    mov v2.d[1], x13
-; NONEON-NOSVE-NEXT:    mov v3.d[1], x14
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x12
-; NONEON-NOSVE-NEXT:    mov v4.d[1], x11
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #96]
-; NONEON-NOSVE-NEXT:    stp q4, q1, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i64>
   store <16 x i64> %res, ptr %b
@@ -1738,11 +1030,6 @@ define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
@@ -1756,12 +1043,6 @@ define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x float> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
@@ -1779,14 +1060,6 @@ define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i16>
   ret <8 x i16> %res
@@ -1811,19 +1084,6 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v16f32_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x float>, ptr %a
   %res = fptosi <16 x float> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -1842,11 +1102,6 @@ define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -1859,11 +1114,6 @@ define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x float> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
@@ -1877,14 +1127,6 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -1904,13 +1146,6 @@ define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v1f32_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x float> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -1924,12 +1159,6 @@ define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -1946,20 +1175,6 @@ define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptosi <4 x float> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -1984,26 +1199,6 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i64>
   store <8 x i64> %res, ptr %b
@@ -2023,12 +1218,6 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i16>
   ret <1 x i16> %res
 }
@@ -2042,12 +1231,6 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
@@ -2076,15 +1259,6 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i16>
   ret <4 x i16> %res
@@ -2128,23 +1302,6 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) {
 ; CHECK-NEXT:    strh w8, [sp, #2]
 ; CHECK-NEXT:    ldr q0, [sp], #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI61_0
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    xtn v7.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI61_0]
-; NONEON-NOSVE-NEXT:    xtn v6.2s, v1.2d
-; NONEON-NOSVE-NEXT:    xtn v5.2s, v2.2d
-; NONEON-NOSVE-NEXT:    xtn v4.2s, v3.2d
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptosi <8 x double> %op1 to <8 x i16>
   ret <8 x i16> %res
@@ -2221,35 +1378,6 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v16f64_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI62_0
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v6.2d, v6.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v7.2d, v7.2d
-; NONEON-NOSVE-NEXT:    xtn v19.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI62_0]
-; NONEON-NOSVE-NEXT:    xtn v23.2s, v3.2d
-; NONEON-NOSVE-NEXT:    xtn v18.2s, v1.2d
-; NONEON-NOSVE-NEXT:    xtn v22.2s, v2.2d
-; NONEON-NOSVE-NEXT:    xtn v17.2s, v5.2d
-; NONEON-NOSVE-NEXT:    xtn v21.2s, v6.2d
-; NONEON-NOSVE-NEXT:    xtn v16.2s, v4.2d
-; NONEON-NOSVE-NEXT:    xtn v20.2s, v7.2d
-; NONEON-NOSVE-NEXT:    tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x double>, ptr %a
   %res = fptosi <16 x double> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -2269,13 +1397,6 @@ define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i32>
   ret <1 x i32> %res
 }
@@ -2289,12 +1410,6 @@ define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -2312,14 +1427,6 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i32>
   ret <4 x i32> %res
@@ -2344,19 +1451,6 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptosi <8 x double> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -2375,12 +1469,6 @@ define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -2393,11 +1481,6 @@ define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -2411,14 +1494,6 @@ define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index 32fe74bbb65f..30a4f04a3d2b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -28,14 +27,6 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uzp1 v2.4h, v2.4h, v0.4h
-; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x half> %op1, <2 x half> %op2
   ret <2 x half> %sel
 }
@@ -54,13 +45,6 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2
   ret <4 x half> %sel
 }
@@ -80,14 +64,6 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    shl v2.8h, v2.8h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.8h, v2.8h, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2
   ret <8 x half> %sel
 }
@@ -104,126 +80,6 @@ define void @select_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h17, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcmp s3, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w12, eq
-; NONEON-NOSVE-NEXT:    fcmp s3, s2
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w11, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[3]
-; NONEON-NOSVE-NEXT:    csetm w13, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v3.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    fcmp s17, s16
-; NONEON-NOSVE-NEXT:    mov h16, v3.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w10, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    fcvt s6, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h2
-; NONEON-NOSVE-NEXT:    csetm w15, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    fmov s4, w14
-; NONEON-NOSVE-NEXT:    csetm w16, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s16, h17
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w8
-; NONEON-NOSVE-NEXT:    fcvt s17, h18
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    fmov s5, w14
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s16, s7
-; NONEON-NOSVE-NEXT:    mov h7, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov h16, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w12
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w16
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s17
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    mov h16, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w11
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov h7, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w13
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcmp s17, s16
-; NONEON-NOSVE-NEXT:    mov h16, v3.h[7]
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w8
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w9
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    fcvt s6, h16
-; NONEON-NOSVE-NEXT:    fcvt s7, h17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w8
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w10
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w8
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %mask = fcmp oeq <16 x half> %op1, %op2
@@ -246,13 +102,6 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %m
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.2s, v2.2s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.2s, v2.2s, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2
   ret <2 x float> %sel
 }
@@ -272,14 +121,6 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    shl v2.4s, v2.4s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.4s, v2.4s, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2
   ret <4 x float> %sel
 }
@@ -296,18 +137,6 @@ define void @select_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    fcmeq v4.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcmeq v5.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %mask = fcmp oeq <8 x float> %op1, %op2
@@ -322,14 +151,6 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1>
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    fcsel d0, d0, d1, ne
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    fmov d2, x8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2
   ret <1 x double> %sel
 }
@@ -349,14 +170,6 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1>
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    shl v2.2d, v2.2d, #63
-; NONEON-NOSVE-NEXT:    cmlt v2.2d, v2.2d, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2
   ret <2 x double> %sel
 }
@@ -373,18 +186,6 @@ define void @select_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.d, p0, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    fcmeq v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcmeq v5.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %mask = fcmp oeq <4 x double> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index c85048ab72e0..4aa965777c74 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -22,14 +21,6 @@ define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i8> %op1, i8 5, i64 3
     ret <4 x i8> %r
 }
@@ -47,14 +38,6 @@ define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) {
 ; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.b[7], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x i8> %op1, i8 5, i64 7
     ret <8 x i8> %r
 }
@@ -72,12 +55,6 @@ define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) {
 ; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.b[15], w8
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <16 x i8> %op1, i8 5, i64 15
     ret <16 x i8> %r
 }
@@ -95,12 +72,6 @@ define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) {
 ; CHECK-NEXT:    mov z1.b, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v1.b[15], w8
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <32 x i8> %op1, i8 5, i64 31
     ret <32 x i8> %r
 }
@@ -119,14 +90,6 @@ define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i16> %op1, i16 5, i64 1
     ret <2 x i16> %r
 }
@@ -144,14 +107,6 @@ define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i16> %op1, i16 5, i64 3
     ret <4 x i16> %r
 }
@@ -169,12 +124,6 @@ define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.h[7], w8
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x i16> %op1, i16 5, i64 7
     ret <8 x i16> %r
 }
@@ -192,12 +141,6 @@ define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) {
 ; CHECK-NEXT:    mov z1.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v1.h[7], w8
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <16 x i16> %op1, i16 5, i64 15
     ret <16 x i16> %r
 }
@@ -216,14 +159,6 @@ define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i32> %op1, i32 5, i64 1
     ret <2 x i32> %r
 }
@@ -241,12 +176,6 @@ define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i32> %op1, i32 5, i64 3
     ret <4 x i32> %r
 }
@@ -264,13 +193,6 @@ define <8 x i32> @insertelement_v8i32(ptr %a) {
 ; CHECK-NEXT:    mov z1.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
-; NONEON-NOSVE-NEXT:    ret
     %op1 = load <8 x i32>, ptr %a
     %r = insertelement <8 x i32> %op1, i32 5, i64 7
     ret <8 x i32> %r
@@ -283,12 +205,6 @@ define <1 x i64> @insertelement_v1i64(<1 x i64> %op1) {
 ; CHECK-NEXT:    mov z0.d, #5 // =0x5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <1 x i64> %op1, i64 5, i64 0
     ret <1 x i64> %r
 }
@@ -306,12 +222,6 @@ define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) {
 ; CHECK-NEXT:    mov z0.d, p0/m, x8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i64> %op1, i64 5, i64 1
     ret <2 x i64> %r
 }
@@ -329,13 +239,6 @@ define <4 x i64> @insertelement_v4i64(ptr %a) {
 ; CHECK-NEXT:    mov z1.d, p0/m, x8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x8
-; NONEON-NOSVE-NEXT:    ret
     %op1 = load <4 x i64>, ptr %a
     %r = insertelement <4 x i64> %op1, i64 5, i64 3
     ret <4 x i64> %r
@@ -354,16 +257,6 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI14_0
-; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI14_0
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    ld1r { v1.4h }, [x8]
-; NONEON-NOSVE-NEXT:    mov v1.h[0], v0.h[0]
-; NONEON-NOSVE-NEXT:    fmov d0, d1
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x half> %op1, half 5.0, i64 1
     ret <2 x half> %r
 }
@@ -381,15 +274,6 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI15_0
-; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI15_0
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x half> %op1, half 5.0, i64 3
     ret <4 x half> %r
 }
@@ -407,13 +291,6 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI16_0
-; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI16_0
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[7], [x8]
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x half> %op1, half 5.0, i64 7
     ret <8 x half> %r
 }
@@ -431,14 +308,6 @@ define <16 x half> @insertelement_v16f16(ptr %a) {
 ; CHECK-NEXT:    mov z1.h, p0/m, h2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI17_0
-; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI17_0
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[7], [x8]
-; NONEON-NOSVE-NEXT:    ret
     %op1 = load <16 x half>, ptr %a
     %r = insertelement <16 x half> %op1, half 5.0, i64 15
     ret <16 x half> %r
@@ -458,14 +327,6 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, s1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov s1, #5.00000000
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov v0.s[1], v1.s[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x float> %op1, float 5.0, i64 1
     ret <2 x float> %r
 }
@@ -483,12 +344,6 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, s1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov s1, #5.00000000
-; NONEON-NOSVE-NEXT:    mov v0.s[3], v1.s[0]
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x float> %op1, float 5.0, i64 3
     ret <4 x float> %r
 }
@@ -506,13 +361,6 @@ define <8 x float> @insertelement_v8f32(ptr %a) {
 ; CHECK-NEXT:    mov z1.s, p0/m, s2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov s2, #5.00000000
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov v1.s[3], v2.s[0]
-; NONEON-NOSVE-NEXT:    ret
     %op1 = load <8 x float>, ptr %a
     %r = insertelement <8 x float> %op1, float 5.0, i64 7
     ret <8 x float> %r
@@ -524,12 +372,6 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d0, #5.00000000
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov x8, #4617315517961601024 // =0x4014000000000000
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <1 x double> %op1, double 5.0, i64 0
     ret <1 x double> %r
 }
@@ -547,12 +389,6 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) {
 ; CHECK-NEXT:    mov z0.d, p0/m, d1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov d1, #5.00000000
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x double> %op1, double 5.0, i64 1
     ret <2 x double> %r
 }
@@ -570,14 +406,6 @@ define <4 x double> @insertelement_v4f64(ptr %a) {
 ; CHECK-NEXT:    mov z1.d, p0/m, d2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: insertelement_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov d0, #5.00000000
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    mov v1.d[1], v0.d[0]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
     %op1 = load <4 x double>, ptr %a
     %r = insertelement <4 x double> %op1, double 5.0, i64 3
     ret <4 x double> %r
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
index da408a11e784..8baa87c6d686 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
@@ -2,7 +2,6 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,11 +16,6 @@ define <4 x i8> @add_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -34,11 +28,6 @@ define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    add z0.b, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = add <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -51,11 +40,6 @@ define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    add z0.b, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = add <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -69,15 +53,6 @@ define void @add_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.b, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = add <32 x i8> %op1, %op2
@@ -93,11 +68,6 @@ define <2 x i16> @add_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -110,11 +80,6 @@ define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -127,11 +92,6 @@ define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ret
   %res = add <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -145,15 +105,6 @@ define void @add_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.h, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = add <16 x i16> %op1, %op2
@@ -169,11 +120,6 @@ define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -186,11 +132,6 @@ define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -204,15 +145,6 @@ define void @add_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.s, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = add <8 x i32> %op1, %op2
@@ -228,11 +160,6 @@ define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %res = add <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -245,11 +172,6 @@ define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -263,15 +185,6 @@ define void @add_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    add v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = add <4 x i64> %op1, %op2
@@ -300,11 +213,6 @@ define <4 x i8> @mul_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; SVE2-NEXT:    mul z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -326,11 +234,6 @@ define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; SVE2-NEXT:    mul z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = mul <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -352,11 +255,6 @@ define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; SVE2-NEXT:    mul z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = mul <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -381,15 +279,6 @@ define void @mul_v32i8(ptr %a, ptr %b) {
 ; SVE2-NEXT:    mul z1.b, z2.b, z3.b
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    mul v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mul v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = mul <32 x i8> %op1, %op2
@@ -414,11 +303,6 @@ define <2 x i16> @mul_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; SVE2-NEXT:    mul z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -440,11 +324,6 @@ define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; SVE2-NEXT:    mul z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -466,11 +345,6 @@ define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; SVE2-NEXT:    mul z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ret
   %res = mul <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -495,15 +369,6 @@ define void @mul_v16i16(ptr %a, ptr %b) {
 ; SVE2-NEXT:    mul z1.h, z2.h, z3.h
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    mul v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    mul v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = mul <16 x i16> %op1, %op2
@@ -528,11 +393,6 @@ define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; SVE2-NEXT:    mul z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -554,11 +414,6 @@ define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; SVE2-NEXT:    mul z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -583,15 +438,6 @@ define void @mul_v8i32(ptr %a, ptr %b) {
 ; SVE2-NEXT:    mul z1.s, z2.s, z3.s
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    mul v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    mul v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = mul <8 x i32> %op1, %op2
@@ -616,16 +462,6 @@ define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; SVE2-NEXT:    mul z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mul x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ret
   %res = mul <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -647,18 +483,6 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; SVE2-NEXT:    mul z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x10, d1
-; NONEON-NOSVE-NEXT:    fmov x11, d0
-; NONEON-NOSVE-NEXT:    mov x8, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x9, v0.d[1]
-; NONEON-NOSVE-NEXT:    mul x10, x11, x10
-; NONEON-NOSVE-NEXT:    mul x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
-; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -683,29 +507,6 @@ define void @mul_v4i64(ptr %a, ptr %b) {
 ; SVE2-NEXT:    mul z1.d, z2.d, z3.d
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    fmov x12, d2
-; NONEON-NOSVE-NEXT:    mov x11, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x10, v3.d[1]
-; NONEON-NOSVE-NEXT:    mov x13, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x14, v0.d[1]
-; NONEON-NOSVE-NEXT:    mul x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov x9, d3
-; NONEON-NOSVE-NEXT:    mul x10, x11, x10
-; NONEON-NOSVE-NEXT:    mul x9, x12, x9
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    mul x11, x14, x13
-; NONEON-NOSVE-NEXT:    fmov d0, x9
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = mul <4 x i64> %op1, %op2
@@ -725,11 +526,6 @@ define <4 x i8> @sub_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    sub z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -742,11 +538,6 @@ define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    sub z0.b, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = sub <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -759,11 +550,6 @@ define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    sub z0.b, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = sub <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -777,15 +563,6 @@ define void @sub_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sub z1.b, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sub v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = sub <32 x i8> %op1, %op2
@@ -801,11 +578,6 @@ define <2 x i16> @sub_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    sub z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -818,11 +590,6 @@ define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    sub z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -835,11 +602,6 @@ define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    sub z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ret
   %res = sub <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -853,15 +615,6 @@ define void @sub_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sub z1.h, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = sub <16 x i16> %op1, %op2
@@ -877,11 +630,6 @@ define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    sub z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -894,11 +642,6 @@ define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    sub z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -912,15 +655,6 @@ define void @sub_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sub z1.s, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = sub <8 x i32> %op1, %op2
@@ -936,11 +670,6 @@ define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    sub z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %res = sub <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -953,11 +682,6 @@ define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    sub z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -971,15 +695,6 @@ define void @sub_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sub z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sub v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = sub <4 x i64> %op1, %op2
@@ -1000,13 +715,6 @@ define <4 x i8> @abs_v4i8(<4 x i8> %op1) {
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    abs v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %op1, i1 false)
   ret <4 x i8> %res
 }
@@ -1019,11 +727,6 @@ define <8 x i8> @abs_v8i8(<8 x i8> %op1) {
 ; CHECK-NEXT:    abs z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false)
   ret <8 x i8> %res
 }
@@ -1036,11 +739,6 @@ define <16 x i8> @abs_v16i8(<16 x i8> %op1) {
 ; CHECK-NEXT:    abs z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false)
   ret <16 x i8> %res
 }
@@ -1054,14 +752,6 @@ define void @abs_v32i8(ptr %a) {
 ; CHECK-NEXT:    abs z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    abs v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false)
   store <32 x i8> %res, ptr %a
@@ -1077,13 +767,6 @@ define <2 x i16> @abs_v2i16(<2 x i16> %op1) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    abs v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %op1, i1 false)
   ret <2 x i16> %res
 }
@@ -1096,11 +779,6 @@ define <4 x i16> @abs_v4i16(<4 x i16> %op1) {
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false)
   ret <4 x i16> %res
 }
@@ -1113,11 +791,6 @@ define <8 x i16> @abs_v8i16(<8 x i16> %op1) {
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false)
   ret <8 x i16> %res
 }
@@ -1131,14 +804,6 @@ define void @abs_v16i16(ptr %a) {
 ; CHECK-NEXT:    abs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    abs v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false)
   store <16 x i16> %res, ptr %a
@@ -1153,11 +818,6 @@ define <2 x i32> @abs_v2i32(<2 x i32> %op1) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
   ret <2 x i32> %res
 }
@@ -1170,11 +830,6 @@ define <4 x i32> @abs_v4i32(<4 x i32> %op1) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
   ret <4 x i32> %res
 }
@@ -1188,14 +843,6 @@ define void @abs_v8i32(ptr %a) {
 ; CHECK-NEXT:    abs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    abs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
   store <8 x i32> %res, ptr %a
@@ -1210,11 +857,6 @@ define <1 x i64> @abs_v1i64(<1 x i64> %op1) {
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs d0, d0
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false)
   ret <1 x i64> %res
 }
@@ -1227,11 +869,6 @@ define <2 x i64> @abs_v2i64(<2 x i64> %op1) {
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
   ret <2 x i64> %res
 }
@@ -1245,14 +882,6 @@ define void @abs_v4i64(ptr %a) {
 ; CHECK-NEXT:    abs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    abs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
   store <4 x i64> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
index 3148d4f1677c..73c1eac99dd3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -19,11 +18,6 @@ define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_eq_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <8 x i8> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i8>
   ret <8 x i8> %sext
@@ -39,11 +33,6 @@ define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_eq_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <16 x i8> %op1, %op2
   %sext = sext <16 x i1> %cmp to <16 x i8>
   ret <16 x i8> %sext
@@ -61,15 +50,6 @@ define void @icmp_eq_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_eq_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cmeq v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %cmp = icmp eq <32 x i8> %op1, %op2
@@ -88,11 +68,6 @@ define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_eq_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <4 x i16> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i16>
   ret <4 x i16> %sext
@@ -108,11 +83,6 @@ define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_eq_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <8 x i16> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i16>
   ret <8 x i16> %sext
@@ -130,15 +100,6 @@ define void @icmp_eq_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_eq_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    cmeq v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %cmp = icmp eq <16 x i16> %op1, %op2
@@ -157,11 +118,6 @@ define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_eq_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <2 x i32> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i32>
   ret <2 x i32> %sext
@@ -177,11 +133,6 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_eq_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <4 x i32> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i32>
   ret <4 x i32> %sext
@@ -199,15 +150,6 @@ define void @icmp_eq_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_eq_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    cmeq v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %cmp = icmp eq <8 x i32> %op1, %op2
@@ -226,11 +168,6 @@ define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_eq_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <1 x i64> %op1, %op2
   %sext = sext <1 x i1> %cmp to <1 x i64>
   ret <1 x i64> %sext
@@ -246,11 +183,6 @@ define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_eq_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <2 x i64> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i64>
   ret <2 x i64> %sext
@@ -268,15 +200,6 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_eq_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmeq v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %cmp = icmp eq <4 x i64> %op1, %op2
@@ -301,17 +224,6 @@ define void @icmp_ne_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_ne_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cmeq v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mvn v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %cmp = icmp ne <32 x i8> %op1, %op2
@@ -334,14 +246,6 @@ define void @icmp_sge_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_sge_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmge v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %op2 = load <8 x i16>, ptr %b
   %cmp = icmp sge <8 x i16> %op1, %op2
@@ -366,15 +270,6 @@ define void @icmp_sgt_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_sgt_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    cmgt v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %cmp = icmp sgt <16 x i16> %op1, %op2
@@ -397,14 +292,6 @@ define void @icmp_sle_v4i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_sle_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmge v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %op2 = load <4 x i32>, ptr %b
   %cmp = icmp sle <4 x i32> %op1, %op2
@@ -429,15 +316,6 @@ define void @icmp_slt_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_slt_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    cmgt v1.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %cmp = icmp slt <8 x i32> %op1, %op2
@@ -460,14 +338,6 @@ define void @icmp_uge_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_uge_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmhs v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %cmp = icmp uge <2 x i64> %op1, %op2
@@ -490,14 +360,6 @@ define void @icmp_ugt_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_ugt_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmhi v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %cmp = icmp ugt <2 x i64> %op1, %op2
@@ -520,14 +382,6 @@ define void @icmp_ule_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_ule_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmhs v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %cmp = icmp ule <2 x i64> %op1, %op2
@@ -550,14 +404,6 @@ define void @icmp_ult_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_ult_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmhi v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %cmp = icmp ult <2 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index 27a4924ea367..5158dda37a8b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -2,7 +2,6 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -25,31 +24,6 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    shl v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -77,45 +51,6 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[6]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[7]
-; NONEON-NOSVE-NEXT:    sdiv w8, w14, w13
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w9
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    fmov d0, d2
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -163,75 +98,6 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[6]
-; NONEON-NOSVE-NEXT:    smov w16, v0.b[7]
-; NONEON-NOSVE-NEXT:    smov w17, v0.b[8]
-; NONEON-NOSVE-NEXT:    smov w18, v0.b[9]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[10]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[11]
-; NONEON-NOSVE-NEXT:    sdiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[12]
-; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    smov w15, v1.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w13
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[13]
-; NONEON-NOSVE-NEXT:    sdiv w15, w16, w15
-; NONEON-NOSVE-NEXT:    smov w16, v1.b[8]
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w14
-; NONEON-NOSVE-NEXT:    sdiv w16, w17, w16
-; NONEON-NOSVE-NEXT:    smov w17, v1.b[9]
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w15
-; NONEON-NOSVE-NEXT:    sdiv w8, w18, w17
-; NONEON-NOSVE-NEXT:    mov v2.b[8], w16
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[11]
-; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v2.b[10], w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[14]
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[11], w10
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w8, w13, w12
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[14]
-; NONEON-NOSVE-NEXT:    mov v2.b[12], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w9, w12, w9
-; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.b[14], w9
-; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -312,163 +178,6 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z3.b, p0, z3.b, z1.b
 ; CHECK-NEXT:    stp q3, q2, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str x27, [sp, #-80]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[6]
-; NONEON-NOSVE-NEXT:    smov w17, v0.b[8]
-; NONEON-NOSVE-NEXT:    smov w2, v0.b[10]
-; NONEON-NOSVE-NEXT:    smov w3, v0.b[11]
-; NONEON-NOSVE-NEXT:    smov w4, v0.b[12]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    smov w5, v0.b[13]
-; NONEON-NOSVE-NEXT:    smov w6, v0.b[14]
-; NONEON-NOSVE-NEXT:    smov w1, v3.b[1]
-; NONEON-NOSVE-NEXT:    smov w7, v2.b[0]
-; NONEON-NOSVE-NEXT:    smov w19, v2.b[2]
-; NONEON-NOSVE-NEXT:    smov w20, v2.b[3]
-; NONEON-NOSVE-NEXT:    smov w21, v2.b[4]
-; NONEON-NOSVE-NEXT:    smov w22, v2.b[5]
-; NONEON-NOSVE-NEXT:    smov w23, v2.b[6]
-; NONEON-NOSVE-NEXT:    smov w24, v2.b[7]
-; NONEON-NOSVE-NEXT:    smov w25, v2.b[8]
-; NONEON-NOSVE-NEXT:    smov w26, v2.b[9]
-; NONEON-NOSVE-NEXT:    smov w27, v2.b[10]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w11, w11, w10
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s5, w9
-; NONEON-NOSVE-NEXT:    smov w9, v3.b[11]
-; NONEON-NOSVE-NEXT:    mov v5.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w10, w12, w10
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v5.b[2], w11
-; NONEON-NOSVE-NEXT:    smov w11, v2.b[11]
-; NONEON-NOSVE-NEXT:    sdiv w13, w13, w12
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v5.b[3], w10
-; NONEON-NOSVE-NEXT:    smov w10, v3.b[12]
-; NONEON-NOSVE-NEXT:    sdiv w12, w14, w12
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v5.b[4], w13
-; NONEON-NOSVE-NEXT:    smov w13, v2.b[14]
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[7]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[7]
-; NONEON-NOSVE-NEXT:    mov v5.b[5], w12
-; NONEON-NOSVE-NEXT:    smov w12, v2.b[13]
-; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    smov w15, v1.b[8]
-; NONEON-NOSVE-NEXT:    mov v5.b[6], w16
-; NONEON-NOSVE-NEXT:    sdiv w18, w17, w15
-; NONEON-NOSVE-NEXT:    smov w15, v1.b[9]
-; NONEON-NOSVE-NEXT:    smov w17, v0.b[9]
-; NONEON-NOSVE-NEXT:    mov v5.b[7], w14
-; NONEON-NOSVE-NEXT:    sdiv w17, w17, w15
-; NONEON-NOSVE-NEXT:    smov w15, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v5.b[8], w18
-; NONEON-NOSVE-NEXT:    sdiv w15, w2, w15
-; NONEON-NOSVE-NEXT:    smov w2, v1.b[11]
-; NONEON-NOSVE-NEXT:    mov v5.b[9], w17
-; NONEON-NOSVE-NEXT:    sdiv w2, w3, w2
-; NONEON-NOSVE-NEXT:    smov w3, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v5.b[10], w15
-; NONEON-NOSVE-NEXT:    sdiv w3, w4, w3
-; NONEON-NOSVE-NEXT:    smov w4, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v5.b[11], w2
-; NONEON-NOSVE-NEXT:    sdiv w4, w5, w4
-; NONEON-NOSVE-NEXT:    smov w5, v1.b[14]
-; NONEON-NOSVE-NEXT:    mov v5.b[12], w3
-; NONEON-NOSVE-NEXT:    sdiv w5, w6, w5
-; NONEON-NOSVE-NEXT:    smov w6, v2.b[1]
-; NONEON-NOSVE-NEXT:    mov v5.b[13], w4
-; NONEON-NOSVE-NEXT:    sdiv w1, w6, w1
-; NONEON-NOSVE-NEXT:    smov w6, v3.b[0]
-; NONEON-NOSVE-NEXT:    mov v5.b[14], w5
-; NONEON-NOSVE-NEXT:    sdiv w6, w7, w6
-; NONEON-NOSVE-NEXT:    smov w7, v3.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w7, w19, w7
-; NONEON-NOSVE-NEXT:    smov w19, v3.b[3]
-; NONEON-NOSVE-NEXT:    fmov s4, w6
-; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
-; NONEON-NOSVE-NEXT:    sdiv w19, w20, w19
-; NONEON-NOSVE-NEXT:    smov w20, v3.b[4]
-; NONEON-NOSVE-NEXT:    mov v4.b[2], w7
-; NONEON-NOSVE-NEXT:    sdiv w20, w21, w20
-; NONEON-NOSVE-NEXT:    smov w21, v3.b[5]
-; NONEON-NOSVE-NEXT:    mov v4.b[3], w19
-; NONEON-NOSVE-NEXT:    sdiv w21, w22, w21
-; NONEON-NOSVE-NEXT:    smov w22, v3.b[6]
-; NONEON-NOSVE-NEXT:    mov v4.b[4], w20
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w22, w23, w22
-; NONEON-NOSVE-NEXT:    smov w23, v3.b[7]
-; NONEON-NOSVE-NEXT:    mov v4.b[5], w21
-; NONEON-NOSVE-NEXT:    sdiv w23, w24, w23
-; NONEON-NOSVE-NEXT:    smov w24, v3.b[8]
-; NONEON-NOSVE-NEXT:    mov v4.b[6], w22
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w24, w25, w24
-; NONEON-NOSVE-NEXT:    smov w25, v3.b[9]
-; NONEON-NOSVE-NEXT:    mov v4.b[7], w23
-; NONEON-NOSVE-NEXT:    sdiv w25, w26, w25
-; NONEON-NOSVE-NEXT:    smov w26, v3.b[10]
-; NONEON-NOSVE-NEXT:    mov v4.b[8], w24
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w8, w27, w26
-; NONEON-NOSVE-NEXT:    mov v4.b[9], w25
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w9, w11, w9
-; NONEON-NOSVE-NEXT:    smov w11, v2.b[12]
-; NONEON-NOSVE-NEXT:    mov v4.b[10], w8
-; NONEON-NOSVE-NEXT:    smov w8, v3.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v3.b[13]
-; NONEON-NOSVE-NEXT:    mov v4.b[11], w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v3.b[14]
-; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v2.b[15]
-; NONEON-NOSVE-NEXT:    mov v4.b[13], w11
-; NONEON-NOSVE-NEXT:    sdiv w8, w13, w8
-; NONEON-NOSVE-NEXT:    mov v4.b[14], w12
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov v4.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v5.b[15], w9
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
-; NONEON-NOSVE-NEXT:    ldr x27, [sp], #80 // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = sdiv <32 x i8> %op1, %op2
@@ -487,23 +196,6 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    shl v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -520,29 +212,6 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -569,43 +238,6 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.h[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.h[5]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[6]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w8, w14, w13
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w12
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w9
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -646,79 +278,6 @@ define void @sdiv_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z3.h, p0, z3.h, z1.h
 ; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.h[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.h[5]
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[6]
-; NONEON-NOSVE-NEXT:    smov w16, v2.h[1]
-; NONEON-NOSVE-NEXT:    smov w17, v2.h[0]
-; NONEON-NOSVE-NEXT:    smov w18, v2.h[2]
-; NONEON-NOSVE-NEXT:    smov w1, v2.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w2, v2.h[4]
-; NONEON-NOSVE-NEXT:    smov w3, v2.h[5]
-; NONEON-NOSVE-NEXT:    smov w4, v2.h[6]
-; NONEON-NOSVE-NEXT:    sdiv w10, w10, w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[2]
-; NONEON-NOSVE-NEXT:    sdiv w9, w11, w9
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s5, w10
-; NONEON-NOSVE-NEXT:    smov w10, v3.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w9
-; NONEON-NOSVE-NEXT:    smov w9, v2.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    smov w14, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
-; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    smov w15, v3.h[1]
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w13
-; NONEON-NOSVE-NEXT:    sdiv w15, w16, w15
-; NONEON-NOSVE-NEXT:    smov w16, v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w14
-; NONEON-NOSVE-NEXT:    sdiv w16, w17, w16
-; NONEON-NOSVE-NEXT:    smov w17, v3.h[2]
-; NONEON-NOSVE-NEXT:    sdiv w17, w18, w17
-; NONEON-NOSVE-NEXT:    smov w18, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmov s4, w16
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w15
-; NONEON-NOSVE-NEXT:    sdiv w18, w1, w18
-; NONEON-NOSVE-NEXT:    smov w1, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w17
-; NONEON-NOSVE-NEXT:    sdiv w1, w2, w1
-; NONEON-NOSVE-NEXT:    smov w2, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w18
-; NONEON-NOSVE-NEXT:    sdiv w2, w3, w2
-; NONEON-NOSVE-NEXT:    smov w3, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w1
-; NONEON-NOSVE-NEXT:    sdiv w8, w4, w3
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w2
-; NONEON-NOSVE-NEXT:    sdiv w9, w9, w10
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w9
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w10
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = sdiv <16 x i16> %op1, %op2
@@ -735,21 +294,6 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -763,26 +307,6 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w10, s0
-; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w12, v0.s[3]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    fmov w9, s1
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov w11, v1.s[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -798,45 +322,6 @@ define void @sdiv_v8i32(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    sdiv z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w10, s0
-; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w12, v2.s[1]
-; NONEON-NOSVE-NEXT:    fmov w13, s2
-; NONEON-NOSVE-NEXT:    mov w14, v2.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v2.s[3]
-; NONEON-NOSVE-NEXT:    mov w16, v0.s[3]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    fmov w9, s1
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov w11, v3.s[1]
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    fmov w12, s3
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    mov w13, v3.s[2]
-; NONEON-NOSVE-NEXT:    sdiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    mov w14, v3.s[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w12
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w11
-; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    mov w15, v1.s[3]
-; NONEON-NOSVE-NEXT:    fmov s1, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w13
-; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
-; NONEON-NOSVE-NEXT:    mov v1.s[2], w10
-; NONEON-NOSVE-NEXT:    sdiv w8, w16, w15
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w14
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = sdiv <8 x i32> %op1, %op2
@@ -853,16 +338,6 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -876,18 +351,6 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x10, v0.d[1]
-; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    sdiv x9, x10, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -903,29 +366,6 @@ define void @sdiv_v4i64(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    sdiv z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x10, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x11, d2
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
-; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    mov x9, v3.d[1]
-; NONEON-NOSVE-NEXT:    sdiv x9, x10, x9
-; NONEON-NOSVE-NEXT:    fmov x10, d3
-; NONEON-NOSVE-NEXT:    sdiv x10, x11, x10
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    sdiv x11, x12, x11
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = sdiv <4 x i64> %op1, %op2
@@ -951,37 +391,6 @@ define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0xff
-; NONEON-NOSVE-NEXT:    and w9, w9, #0xff
-; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    and w11, w11, #0xff
-; NONEON-NOSVE-NEXT:    and w9, w9, #0xff
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    and w9, w11, #0xff
-; NONEON-NOSVE-NEXT:    and w11, w12, #0xff
-; NONEON-NOSVE-NEXT:    udiv w8, w11, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -1009,45 +418,6 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[6]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[7]
-; NONEON-NOSVE-NEXT:    udiv w8, w14, w13
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w9
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    fmov d0, d2
-; NONEON-NOSVE-NEXT:    ret
   %res = udiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -1095,75 +465,6 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[6]
-; NONEON-NOSVE-NEXT:    umov w16, v0.b[7]
-; NONEON-NOSVE-NEXT:    umov w17, v0.b[8]
-; NONEON-NOSVE-NEXT:    umov w18, v0.b[9]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[10]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[11]
-; NONEON-NOSVE-NEXT:    udiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[12]
-; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    umov w15, v1.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w13
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[13]
-; NONEON-NOSVE-NEXT:    udiv w15, w16, w15
-; NONEON-NOSVE-NEXT:    umov w16, v1.b[8]
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w14
-; NONEON-NOSVE-NEXT:    udiv w16, w17, w16
-; NONEON-NOSVE-NEXT:    umov w17, v1.b[9]
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w15
-; NONEON-NOSVE-NEXT:    udiv w8, w18, w17
-; NONEON-NOSVE-NEXT:    mov v2.b[8], w16
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[11]
-; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v2.b[10], w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[14]
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[11], w10
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[15]
-; NONEON-NOSVE-NEXT:    udiv w8, w13, w12
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[14]
-; NONEON-NOSVE-NEXT:    mov v2.b[12], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[15]
-; NONEON-NOSVE-NEXT:    udiv w9, w12, w9
-; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.b[14], w9
-; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = udiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -1244,163 +545,6 @@ define void @udiv_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z3.b, p0, z3.b, z1.b
 ; CHECK-NEXT:    stp q3, q2, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str x27, [sp, #-80]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[6]
-; NONEON-NOSVE-NEXT:    umov w17, v0.b[8]
-; NONEON-NOSVE-NEXT:    umov w2, v0.b[10]
-; NONEON-NOSVE-NEXT:    umov w3, v0.b[11]
-; NONEON-NOSVE-NEXT:    umov w4, v0.b[12]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    umov w5, v0.b[13]
-; NONEON-NOSVE-NEXT:    umov w6, v0.b[14]
-; NONEON-NOSVE-NEXT:    umov w1, v3.b[1]
-; NONEON-NOSVE-NEXT:    umov w7, v2.b[0]
-; NONEON-NOSVE-NEXT:    umov w19, v2.b[2]
-; NONEON-NOSVE-NEXT:    umov w20, v2.b[3]
-; NONEON-NOSVE-NEXT:    umov w21, v2.b[4]
-; NONEON-NOSVE-NEXT:    umov w22, v2.b[5]
-; NONEON-NOSVE-NEXT:    umov w23, v2.b[6]
-; NONEON-NOSVE-NEXT:    umov w24, v2.b[7]
-; NONEON-NOSVE-NEXT:    umov w25, v2.b[8]
-; NONEON-NOSVE-NEXT:    umov w26, v2.b[9]
-; NONEON-NOSVE-NEXT:    umov w27, v2.b[10]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    udiv w11, w11, w10
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s5, w9
-; NONEON-NOSVE-NEXT:    umov w9, v3.b[11]
-; NONEON-NOSVE-NEXT:    mov v5.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w10, w12, w10
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v5.b[2], w11
-; NONEON-NOSVE-NEXT:    umov w11, v2.b[11]
-; NONEON-NOSVE-NEXT:    udiv w13, w13, w12
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v5.b[3], w10
-; NONEON-NOSVE-NEXT:    umov w10, v3.b[12]
-; NONEON-NOSVE-NEXT:    udiv w12, w14, w12
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v5.b[4], w13
-; NONEON-NOSVE-NEXT:    umov w13, v2.b[14]
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[7]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[7]
-; NONEON-NOSVE-NEXT:    mov v5.b[5], w12
-; NONEON-NOSVE-NEXT:    umov w12, v2.b[13]
-; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    umov w15, v1.b[8]
-; NONEON-NOSVE-NEXT:    mov v5.b[6], w16
-; NONEON-NOSVE-NEXT:    udiv w18, w17, w15
-; NONEON-NOSVE-NEXT:    umov w15, v1.b[9]
-; NONEON-NOSVE-NEXT:    umov w17, v0.b[9]
-; NONEON-NOSVE-NEXT:    mov v5.b[7], w14
-; NONEON-NOSVE-NEXT:    udiv w17, w17, w15
-; NONEON-NOSVE-NEXT:    umov w15, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v5.b[8], w18
-; NONEON-NOSVE-NEXT:    udiv w15, w2, w15
-; NONEON-NOSVE-NEXT:    umov w2, v1.b[11]
-; NONEON-NOSVE-NEXT:    mov v5.b[9], w17
-; NONEON-NOSVE-NEXT:    udiv w2, w3, w2
-; NONEON-NOSVE-NEXT:    umov w3, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v5.b[10], w15
-; NONEON-NOSVE-NEXT:    udiv w3, w4, w3
-; NONEON-NOSVE-NEXT:    umov w4, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v5.b[11], w2
-; NONEON-NOSVE-NEXT:    udiv w4, w5, w4
-; NONEON-NOSVE-NEXT:    umov w5, v1.b[14]
-; NONEON-NOSVE-NEXT:    mov v5.b[12], w3
-; NONEON-NOSVE-NEXT:    udiv w5, w6, w5
-; NONEON-NOSVE-NEXT:    umov w6, v2.b[1]
-; NONEON-NOSVE-NEXT:    mov v5.b[13], w4
-; NONEON-NOSVE-NEXT:    udiv w1, w6, w1
-; NONEON-NOSVE-NEXT:    umov w6, v3.b[0]
-; NONEON-NOSVE-NEXT:    mov v5.b[14], w5
-; NONEON-NOSVE-NEXT:    udiv w6, w7, w6
-; NONEON-NOSVE-NEXT:    umov w7, v3.b[2]
-; NONEON-NOSVE-NEXT:    udiv w7, w19, w7
-; NONEON-NOSVE-NEXT:    umov w19, v3.b[3]
-; NONEON-NOSVE-NEXT:    fmov s4, w6
-; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
-; NONEON-NOSVE-NEXT:    udiv w19, w20, w19
-; NONEON-NOSVE-NEXT:    umov w20, v3.b[4]
-; NONEON-NOSVE-NEXT:    mov v4.b[2], w7
-; NONEON-NOSVE-NEXT:    udiv w20, w21, w20
-; NONEON-NOSVE-NEXT:    umov w21, v3.b[5]
-; NONEON-NOSVE-NEXT:    mov v4.b[3], w19
-; NONEON-NOSVE-NEXT:    udiv w21, w22, w21
-; NONEON-NOSVE-NEXT:    umov w22, v3.b[6]
-; NONEON-NOSVE-NEXT:    mov v4.b[4], w20
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w22, w23, w22
-; NONEON-NOSVE-NEXT:    umov w23, v3.b[7]
-; NONEON-NOSVE-NEXT:    mov v4.b[5], w21
-; NONEON-NOSVE-NEXT:    udiv w23, w24, w23
-; NONEON-NOSVE-NEXT:    umov w24, v3.b[8]
-; NONEON-NOSVE-NEXT:    mov v4.b[6], w22
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w24, w25, w24
-; NONEON-NOSVE-NEXT:    umov w25, v3.b[9]
-; NONEON-NOSVE-NEXT:    mov v4.b[7], w23
-; NONEON-NOSVE-NEXT:    udiv w25, w26, w25
-; NONEON-NOSVE-NEXT:    umov w26, v3.b[10]
-; NONEON-NOSVE-NEXT:    mov v4.b[8], w24
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w8, w27, w26
-; NONEON-NOSVE-NEXT:    mov v4.b[9], w25
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w9, w11, w9
-; NONEON-NOSVE-NEXT:    umov w11, v2.b[12]
-; NONEON-NOSVE-NEXT:    mov v4.b[10], w8
-; NONEON-NOSVE-NEXT:    umov w8, v3.b[15]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v3.b[13]
-; NONEON-NOSVE-NEXT:    mov v4.b[11], w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[15]
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v3.b[14]
-; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[15]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v2.b[15]
-; NONEON-NOSVE-NEXT:    mov v4.b[13], w11
-; NONEON-NOSVE-NEXT:    udiv w8, w13, w8
-; NONEON-NOSVE-NEXT:    mov v4.b[14], w12
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov v4.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v5.b[15], w9
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
-; NONEON-NOSVE-NEXT:    ldr x27, [sp], #80 // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = udiv <32 x i8> %op1, %op2
@@ -1419,22 +563,6 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -1451,29 +579,6 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -1500,43 +605,6 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.h[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.h[5]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[6]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[7]
-; NONEON-NOSVE-NEXT:    udiv w8, w14, w13
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w12
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w9
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = udiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -1577,79 +645,6 @@ define void @udiv_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z3.h, p0, z3.h, z1.h
 ; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.h[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.h[5]
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[6]
-; NONEON-NOSVE-NEXT:    umov w16, v2.h[1]
-; NONEON-NOSVE-NEXT:    umov w17, v2.h[0]
-; NONEON-NOSVE-NEXT:    umov w18, v2.h[2]
-; NONEON-NOSVE-NEXT:    umov w1, v2.h[3]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w2, v2.h[4]
-; NONEON-NOSVE-NEXT:    umov w3, v2.h[5]
-; NONEON-NOSVE-NEXT:    umov w4, v2.h[6]
-; NONEON-NOSVE-NEXT:    udiv w10, w10, w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[2]
-; NONEON-NOSVE-NEXT:    udiv w9, w11, w9
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s5, w10
-; NONEON-NOSVE-NEXT:    umov w10, v3.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w9
-; NONEON-NOSVE-NEXT:    umov w9, v2.h[7]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[7]
-; NONEON-NOSVE-NEXT:    udiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    umov w14, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
-; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    umov w15, v3.h[1]
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w13
-; NONEON-NOSVE-NEXT:    udiv w15, w16, w15
-; NONEON-NOSVE-NEXT:    umov w16, v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w14
-; NONEON-NOSVE-NEXT:    udiv w16, w17, w16
-; NONEON-NOSVE-NEXT:    umov w17, v3.h[2]
-; NONEON-NOSVE-NEXT:    udiv w17, w18, w17
-; NONEON-NOSVE-NEXT:    umov w18, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmov s4, w16
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w15
-; NONEON-NOSVE-NEXT:    udiv w18, w1, w18
-; NONEON-NOSVE-NEXT:    umov w1, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w17
-; NONEON-NOSVE-NEXT:    udiv w1, w2, w1
-; NONEON-NOSVE-NEXT:    umov w2, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w18
-; NONEON-NOSVE-NEXT:    udiv w2, w3, w2
-; NONEON-NOSVE-NEXT:    umov w3, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w1
-; NONEON-NOSVE-NEXT:    udiv w8, w4, w3
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w2
-; NONEON-NOSVE-NEXT:    udiv w9, w9, w10
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w9
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w10
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = udiv <16 x i16> %op1, %op2
@@ -1666,21 +661,6 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -1694,26 +674,6 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w10, s0
-; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w12, v0.s[3]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    fmov w9, s1
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov w11, v1.s[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
-; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -1729,45 +689,6 @@ define void @udiv_v8i32(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    udiv z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w10, s0
-; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w12, v2.s[1]
-; NONEON-NOSVE-NEXT:    fmov w13, s2
-; NONEON-NOSVE-NEXT:    mov w14, v2.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v2.s[3]
-; NONEON-NOSVE-NEXT:    mov w16, v0.s[3]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    fmov w9, s1
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov w11, v3.s[1]
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    fmov w12, s3
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    mov w13, v3.s[2]
-; NONEON-NOSVE-NEXT:    udiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    mov w14, v3.s[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w12
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w11
-; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    mov w15, v1.s[3]
-; NONEON-NOSVE-NEXT:    fmov s1, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w13
-; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
-; NONEON-NOSVE-NEXT:    mov v1.s[2], w10
-; NONEON-NOSVE-NEXT:    udiv w8, w16, w15
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w14
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = udiv <8 x i32> %op1, %op2
@@ -1784,16 +705,6 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ret
   %res = udiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -1807,18 +718,6 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x10, v0.d[1]
-; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    udiv x9, x10, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -1834,29 +733,6 @@ define void @udiv_v4i64(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    udiv z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x10, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x11, d2
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
-; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    mov x9, v3.d[1]
-; NONEON-NOSVE-NEXT:    udiv x9, x10, x9
-; NONEON-NOSVE-NEXT:    fmov x10, d3
-; NONEON-NOSVE-NEXT:    udiv x10, x11, x10
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    udiv x11, x12, x11
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = udiv <4 x i64> %op1, %op2
@@ -1902,27 +778,6 @@ define void @udiv_constantsplat_v8i32(ptr %a)  {
 ; SVE2-NEXT:    lsr z0.s, z0.s, #6
 ; SVE2-NEXT:    stp q1, q0, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #8969 // =0x2309
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    movk w8, #22765, lsl #16
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    umull2 v3.2d, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umull v4.2d, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    umull2 v5.2d, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umull v0.2d, v2.2s, v0.2s
-; NONEON-NOSVE-NEXT:    uzp2 v3.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v5.4s
-; NONEON-NOSVE-NEXT:    sub v1.4s, v1.4s, v3.4s
-; NONEON-NOSVE-NEXT:    sub v2.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    usra v3.4s, v1.4s, #1
-; NONEON-NOSVE-NEXT:    usra v0.4s, v2.4s, #1
-; NONEON-NOSVE-NEXT:    ushr v1.4s, v3.4s, #6
-; NONEON-NOSVE-NEXT:    ushr v0.4s, v0.4s, #6
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95>
   store <8 x i32> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
index e320fed2a498..c7a89612d278 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
@@ -2,7 +2,6 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -27,22 +26,6 @@ define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) {
 ; CHECK-NEXT:    asr z0.s, z0.s, #31
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v8i1_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    shl v0.4s, v0.4s, #31
-; NONEON-NOSVE-NEXT:    shl v1.4s, v1.4s, #31
-; NONEON-NOSVE-NEXT:    cmlt v0.4s, v0.4s, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.4s, v1.4s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i1> %a to <8 x i32>
   store <8 x i32> %b, ptr %out
   ret void
@@ -69,22 +52,6 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) {
 ; CHECK-NEXT:    asr z0.d, z0.d, #61
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v4i3_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    shl v0.2d, v0.2d, #61
-; NONEON-NOSVE-NEXT:    shl v1.2d, v1.2d, #61
-; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #61
-; NONEON-NOSVE-NEXT:    sshr v1.2d, v1.2d, #61
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i3> %a to <4 x i64>
   store <4 x i64> %b, ptr %out
   ret void
@@ -103,17 +70,6 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v16i8_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i16>
   store <16 x i16>%b, ptr %out
   ret void
@@ -135,24 +91,6 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v32i8_v32i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v3.8h, v3.8b, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = sext <32 x i8> %b to <32 x i16>
@@ -174,18 +112,6 @@ define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v8i8_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i8> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
   ret void
@@ -207,25 +133,6 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v16i8_v16i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
-; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i32>
   store <16 x i32> %b, ptr %out
   ret void
@@ -260,40 +167,6 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v32i8_v32i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v3.8h, v3.8b, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
-; NONEON-NOSVE-NEXT:    sshll v5.4s, v5.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v4.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v6.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v7.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
-; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = sext <32 x i8> %b to <32 x i32>
@@ -321,22 +194,6 @@ define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v4i8_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    shl v0.2d, v0.2d, #56
-; NONEON-NOSVE-NEXT:    shl v1.2d, v1.2d, #56
-; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #56
-; NONEON-NOSVE-NEXT:    sshr v1.2d, v1.2d, #56
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i8> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -359,26 +216,6 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v8i8_v8i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
-; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i8> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
   ret void
@@ -416,41 +253,6 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q1, q4, [x0, #32]
 ; CHECK-NEXT:    stp q0, q2, [x0, #96]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v16i8_v16i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-112]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 112
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #40]
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #48]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #80]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #72]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #104]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x0]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #96]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #112
-; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i64>
   store <16 x i64> %b, ptr %out
   ret void
@@ -519,73 +321,6 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q0, q2, [x1, #224]
 ; CHECK-NEXT:    stp q3, q1, [x1, #96]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v32i8_v32i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #224
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 224
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
-; NONEON-NOSVE-NEXT:    sshll v5.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v6.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v3.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v4.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    stp q3, q5, [sp, #32]
-; NONEON-NOSVE-NEXT:    sshll v5.4s, v5.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
-; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #64]
-; NONEON-NOSVE-NEXT:    sshll v6.4s, v6.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v4.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #72]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v7.4s, v7.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q5, [sp, #128]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d19, [sp, #152]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #96]
-; NONEON-NOSVE-NEXT:    ldr d20, [sp, #136]
-; NONEON-NOSVE-NEXT:    stp q1, q4, [sp, #160]
-; NONEON-NOSVE-NEXT:    ldr d17, [sp, #104]
-; NONEON-NOSVE-NEXT:    ldr d21, [sp, #120]
-; NONEON-NOSVE-NEXT:    stp q7, q6, [sp, #192]
-; NONEON-NOSVE-NEXT:    sshll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v19.2d, v19.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d16, [sp, #216]
-; NONEON-NOSVE-NEXT:    ldr d22, [sp, #200]
-; NONEON-NOSVE-NEXT:    ldr d23, [sp, #184]
-; NONEON-NOSVE-NEXT:    ldr d18, [sp, #168]
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v16.2d, v16.2s, #0
-; NONEON-NOSVE-NEXT:    stp q5, q19, [x1]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v7.2d, v22.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    stp q6, q16, [x1, #128]
-; NONEON-NOSVE-NEXT:    sshll v6.2d, v23.2s, #0
-; NONEON-NOSVE-NEXT:    stp q5, q7, [x1, #160]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v20.2s, #0
-; NONEON-NOSVE-NEXT:    stp q4, q6, [x1, #192]
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v21.2s, #0
-; NONEON-NOSVE-NEXT:    stp q2, q5, [x1, #32]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v17.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v18.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #96]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #224]
-; NONEON-NOSVE-NEXT:    add sp, sp, #224
-; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = sext <32 x i8> %b to <32 x i64>
@@ -606,17 +341,6 @@ define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v8i16_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i16> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
   ret void
@@ -637,24 +361,6 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v16i16_v16i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
   %c = sext <16 x i16> %b to <16 x i32>
@@ -676,18 +382,6 @@ define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v4i16_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i16> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -709,25 +403,6 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v8i16_v8i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
-; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i16> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
   ret void
@@ -762,40 +437,6 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v16i16_v16i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
   %c = sext <16 x i16> %b to <16 x i64>
@@ -816,17 +457,6 @@ define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v4i32_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i32> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -847,24 +477,6 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sext_v8i32_v8i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = add <8 x i32> %a, %a
   %c = sext <8 x i32> %b to <8 x i64>
@@ -885,17 +497,6 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v16i8_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i16>
   store <16 x i16>%b, ptr %out
   ret void
@@ -917,24 +518,6 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v32i8_v32i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v3.8h, v3.8b, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = zext <32 x i8> %b to <32 x i16>
@@ -956,18 +539,6 @@ define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v8i8_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i8> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
   ret void
@@ -989,25 +560,6 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v16i8_v16i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
-; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i32>
   store <16 x i32> %b, ptr %out
   ret void
@@ -1042,40 +594,6 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v32i8_v32i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v3.8h, v3.8b, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v5.4s, v5.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v4.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v6.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v7.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
-; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = zext <32 x i8> %b to <32 x i32>
@@ -1101,20 +619,6 @@ define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v4i8_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i8> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -1137,26 +641,6 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v8i8_v8i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
-; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i8> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
   ret void
@@ -1194,41 +678,6 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q1, q4, [x0, #32]
 ; CHECK-NEXT:    stp q0, q2, [x0, #96]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v16i8_v16i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-112]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 112
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #40]
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #48]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #80]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #104]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #96]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #112
-; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i64>
   store <16 x i64> %b, ptr %out
   ret void
@@ -1297,73 +746,6 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q0, q2, [x1, #224]
 ; CHECK-NEXT:    stp q3, q1, [x1, #96]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v32i8_v32i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #224
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 224
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
-; NONEON-NOSVE-NEXT:    ushll v5.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v6.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v3.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v4.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    stp q3, q5, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v5.4s, v5.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
-; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #64]
-; NONEON-NOSVE-NEXT:    ushll v6.4s, v6.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v4.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v7.4s, v7.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q5, [sp, #128]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d19, [sp, #152]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #96]
-; NONEON-NOSVE-NEXT:    ldr d20, [sp, #136]
-; NONEON-NOSVE-NEXT:    stp q1, q4, [sp, #160]
-; NONEON-NOSVE-NEXT:    ldr d17, [sp, #104]
-; NONEON-NOSVE-NEXT:    ldr d21, [sp, #120]
-; NONEON-NOSVE-NEXT:    stp q7, q6, [sp, #192]
-; NONEON-NOSVE-NEXT:    ushll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v19.2d, v19.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d16, [sp, #216]
-; NONEON-NOSVE-NEXT:    ldr d22, [sp, #200]
-; NONEON-NOSVE-NEXT:    ldr d23, [sp, #184]
-; NONEON-NOSVE-NEXT:    ldr d18, [sp, #168]
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v16.2d, v16.2s, #0
-; NONEON-NOSVE-NEXT:    stp q5, q19, [x1]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v7.2d, v22.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    stp q6, q16, [x1, #128]
-; NONEON-NOSVE-NEXT:    ushll v6.2d, v23.2s, #0
-; NONEON-NOSVE-NEXT:    stp q5, q7, [x1, #160]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v20.2s, #0
-; NONEON-NOSVE-NEXT:    stp q4, q6, [x1, #192]
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v21.2s, #0
-; NONEON-NOSVE-NEXT:    stp q2, q5, [x1, #32]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v17.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v18.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #96]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #224]
-; NONEON-NOSVE-NEXT:    add sp, sp, #224
-; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = zext <32 x i8> %b to <32 x i64>
@@ -1384,17 +766,6 @@ define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v8i16_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i16> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
   ret void
@@ -1415,24 +786,6 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v16i16_v16i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
   %c = zext <16 x i16> %b to <16 x i32>
@@ -1454,18 +807,6 @@ define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v4i16_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i16> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -1487,25 +828,6 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v8i16_v8i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
-; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i16> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
   ret void
@@ -1540,40 +862,6 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v16i16_v16i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
   %c = zext <16 x i16> %b to <16 x i64>
@@ -1594,17 +882,6 @@ define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v4i32_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i32> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -1625,24 +902,6 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zext_v8i32_v8i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = add <8 x i32> %a, %a
   %c = zext <8 x i32> %b to <8 x i64>
@@ -1669,21 +928,6 @@ define void @extend_and_mul(i32 %0, <2 x i64> %1, ptr %2) {
 ; SVE2-NEXT:    mul z0.d, z1.d, z0.d
 ; SVE2-NEXT:    str q0, [x1]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extend_and_mul:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v1.2s, w0
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    fmov x11, d1
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    mul x10, x11, x10
-; NONEON-NOSVE-NEXT:    mul x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0
   %broadcast.splat3 = shufflevector <2 x i32> %broadcast.splatinsert2, <2 x i32> poison, <2 x i32> zeroinitializer
   %4 = zext <2 x i32> %broadcast.splat3 to <2 x i64>
@@ -1699,13 +943,6 @@ define void @extend_no_mul(i32 %0, <2 x i64> %1, ptr %2) {
 ; CHECK-NEXT:    mov z0.d, x8
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: extend_no_mul:
-; NONEON-NOSVE:       // %bb.0: // %entry
-; NONEON-NOSVE-NEXT:    dup v0.2s, w0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
 entry:
   %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0
   %broadcast.splat3 = shufflevector <2 x i32> %broadcast.splatinsert2, <2 x i32> poison, <2 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
index d86cfcbfb4f6..f028b3eeca25 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -23,15 +22,6 @@ define void @add_v32i8(ptr %a) {
 ; CHECK-NEXT:    add z1.b, z1.b, #7 // =0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -48,16 +38,6 @@ define void @add_v16i16(ptr %a) {
 ; CHECK-NEXT:    add z1.h, z1.h, #15 // =0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -74,16 +54,6 @@ define void @add_v8i32(ptr %a) {
 ; CHECK-NEXT:    add z1.s, z1.s, #31 // =0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -100,16 +70,6 @@ define void @add_v4i64(ptr %a) {
 ; CHECK-NEXT:    add z1.d, z1.d, #63 // =0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    add v1.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    add v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -130,15 +90,6 @@ define void @and_v32i8(ptr %a) {
 ; CHECK-NEXT:    and z1.b, z1.b, #0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -155,16 +106,6 @@ define void @and_v16i16(ptr %a) {
 ; CHECK-NEXT:    and z1.h, z1.h, #0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -181,16 +122,6 @@ define void @and_v8i32(ptr %a) {
 ; CHECK-NEXT:    and z1.s, z1.s, #0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -207,16 +138,6 @@ define void @and_v4i64(ptr %a) {
 ; CHECK-NEXT:    and z1.d, z1.d, #0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -237,14 +158,6 @@ define void @ashr_v32i8(ptr %a) {
 ; CHECK-NEXT:    asr z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.16b, v1.16b, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -261,14 +174,6 @@ define void @ashr_v16i16(ptr %a) {
 ; CHECK-NEXT:    asr z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8h, v0.8h, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.8h, v1.8h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -285,14 +190,6 @@ define void @ashr_v8i32(ptr %a) {
 ; CHECK-NEXT:    asr z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4s, v0.4s, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.4s, v1.4s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -309,14 +206,6 @@ define void @ashr_v4i64(ptr %a) {
 ; CHECK-NEXT:    asr z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2d, v0.2d, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.2d, v1.2d, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -340,15 +229,6 @@ define void @icmp_eq_v32i8(ptr %a) {
 ; CHECK-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_eq_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cmeq v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -369,16 +249,6 @@ define void @icmp_sge_v16i16(ptr %a) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_sge_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    cmge v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    cmge v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -399,16 +269,6 @@ define void @icmp_sgt_v8i32(ptr %a) {
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_sgt_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #-8 // =0xfffffff8
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    cmgt v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    cmgt v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 -8, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -429,16 +289,6 @@ define void @icmp_ult_v4i64(ptr %a) {
 ; CHECK-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: icmp_ult_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmhi v1.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmhi v0.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -460,14 +310,6 @@ define void @lshr_v32i8(ptr %a) {
 ; CHECK-NEXT:    lsr z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    ushr v1.16b, v1.16b, #7
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -484,14 +326,6 @@ define void @lshr_v16i16(ptr %a) {
 ; CHECK-NEXT:    lsr z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v0.8h, v0.8h, #15
-; NONEON-NOSVE-NEXT:    ushr v1.8h, v1.8h, #15
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -508,14 +342,6 @@ define void @lshr_v8i32(ptr %a) {
 ; CHECK-NEXT:    lsr z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v0.4s, v0.4s, #31
-; NONEON-NOSVE-NEXT:    ushr v1.4s, v1.4s, #31
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -532,14 +358,6 @@ define void @lshr_v4i64(ptr %a) {
 ; CHECK-NEXT:    lsr z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v0.2d, v0.2d, #63
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v1.2d, #63
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -560,15 +378,6 @@ define void @mul_v32i8(ptr %a) {
 ; CHECK-NEXT:    mul z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    mul v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mul v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -585,16 +394,6 @@ define void @mul_v16i16(ptr %a) {
 ; CHECK-NEXT:    mul z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    mul v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    mul v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -611,16 +410,6 @@ define void @mul_v8i32(ptr %a) {
 ; CHECK-NEXT:    mul z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    mul v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    mul v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -637,28 +426,6 @@ define void @mul_v4i64(ptr %a) {
 ; CHECK-NEXT:    mul z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mul_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    fmov x11, d1
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    lsl x12, x10, #6
-; NONEON-NOSVE-NEXT:    lsl x13, x11, #6
-; NONEON-NOSVE-NEXT:    lsl x14, x8, #6
-; NONEON-NOSVE-NEXT:    sub x10, x12, x10
-; NONEON-NOSVE-NEXT:    sub x11, x13, x11
-; NONEON-NOSVE-NEXT:    lsl x12, x9, #6
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    fmov d1, x11
-; NONEON-NOSVE-NEXT:    sub x8, x14, x8
-; NONEON-NOSVE-NEXT:    sub x9, x12, x9
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x9
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -679,15 +446,6 @@ define void @or_v32i8(ptr %a) {
 ; CHECK-NEXT:    orr z1.b, z1.b, #0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -704,16 +462,6 @@ define void @or_v16i16(ptr %a) {
 ; CHECK-NEXT:    orr z1.h, z1.h, #0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -730,16 +478,6 @@ define void @or_v8i32(ptr %a) {
 ; CHECK-NEXT:    orr z1.s, z1.s, #0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -756,16 +494,6 @@ define void @or_v4i64(ptr %a) {
 ; CHECK-NEXT:    orr z1.d, z1.d, #0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -786,14 +514,6 @@ define void @shl_v32i8(ptr %a) {
 ; CHECK-NEXT:    lsl z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    shl v1.16b, v1.16b, #7
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -810,14 +530,6 @@ define void @shl_v16i16(ptr %a) {
 ; CHECK-NEXT:    lsl z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    shl v0.8h, v0.8h, #15
-; NONEON-NOSVE-NEXT:    shl v1.8h, v1.8h, #15
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -834,14 +546,6 @@ define void @shl_v8i32(ptr %a) {
 ; CHECK-NEXT:    lsl z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    shl v0.4s, v0.4s, #31
-; NONEON-NOSVE-NEXT:    shl v1.4s, v1.4s, #31
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -858,14 +562,6 @@ define void @shl_v4i64(ptr %a) {
 ; CHECK-NEXT:    lsl z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    shl v0.2d, v0.2d, #63
-; NONEON-NOSVE-NEXT:    shl v1.2d, v1.2d, #63
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -886,15 +582,6 @@ define void @smax_v32i8(ptr %a) {
 ; CHECK-NEXT:    smax z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smax v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smax v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -911,16 +598,6 @@ define void @smax_v16i16(ptr %a) {
 ; CHECK-NEXT:    smax z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    smax v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smax v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -937,16 +614,6 @@ define void @smax_v8i32(ptr %a) {
 ; CHECK-NEXT:    smax z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    smax v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smax v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -963,18 +630,6 @@ define void @smax_v4i64(ptr %a) {
 ; CHECK-NEXT:    smax z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmgt v3.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmgt v4.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -995,15 +650,6 @@ define void @smin_v32i8(ptr %a) {
 ; CHECK-NEXT:    smin z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smin v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smin v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -1020,16 +666,6 @@ define void @smin_v16i16(ptr %a) {
 ; CHECK-NEXT:    smin z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    smin v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smin v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -1046,16 +682,6 @@ define void @smin_v8i32(ptr %a) {
 ; CHECK-NEXT:    smin z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    smin v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smin v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1072,18 +698,6 @@ define void @smin_v4i64(ptr %a) {
 ; CHECK-NEXT:    smin z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmgt v3.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmgt v4.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -1104,15 +718,6 @@ define void @sub_v32i8(ptr %a) {
 ; CHECK-NEXT:    sub z1.b, z1.b, #7 // =0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sub v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -1129,16 +734,6 @@ define void @sub_v16i16(ptr %a) {
 ; CHECK-NEXT:    sub z1.h, z1.h, #15 // =0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    sub v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -1155,16 +750,6 @@ define void @sub_v8i32(ptr %a) {
 ; CHECK-NEXT:    sub z1.s, z1.s, #31 // =0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    sub v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1181,16 +766,6 @@ define void @sub_v4i64(ptr %a) {
 ; CHECK-NEXT:    sub z1.d, z1.d, #63 // =0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sub_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    sub v1.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sub v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -1211,15 +786,6 @@ define void @umax_v32i8(ptr %a) {
 ; CHECK-NEXT:    umax z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umax v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umax v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -1236,16 +802,6 @@ define void @umax_v16i16(ptr %a) {
 ; CHECK-NEXT:    umax z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    umax v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umax v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -1262,16 +818,6 @@ define void @umax_v8i32(ptr %a) {
 ; CHECK-NEXT:    umax z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    umax v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umax v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1288,18 +834,6 @@ define void @umax_v4i64(ptr %a) {
 ; CHECK-NEXT:    umax z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmhi v3.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmhi v4.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -1320,15 +854,6 @@ define void @umin_v32i8(ptr %a) {
 ; CHECK-NEXT:    umin z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umin v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umin v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -1345,16 +870,6 @@ define void @umin_v16i16(ptr %a) {
 ; CHECK-NEXT:    umin z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    umin v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umin v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -1371,16 +886,6 @@ define void @umin_v8i32(ptr %a) {
 ; CHECK-NEXT:    umin z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    umin v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umin v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1397,18 +902,6 @@ define void @umin_v4i64(ptr %a) {
 ; CHECK-NEXT:    umin z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmhi v3.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmhi v4.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -1429,15 +922,6 @@ define void @xor_v32i8(ptr %a) {
 ; CHECK-NEXT:    eor z1.b, z1.b, #0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -1454,16 +938,6 @@ define void @xor_v16i16(ptr %a) {
 ; CHECK-NEXT:    eor z1.h, z1.h, #0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -1480,16 +954,6 @@ define void @xor_v8i32(ptr %a) {
 ; CHECK-NEXT:    eor z1.s, z1.s, #0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1506,16 +970,6 @@ define void @xor_v4i64(ptr %a) {
 ; CHECK-NEXT:    eor z1.d, z1.d, #0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
index f0b39b275614..4d70c1dd1c91 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,11 +16,6 @@ define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = and <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -34,11 +28,6 @@ define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = and <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -52,15 +41,6 @@ define void @and_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = and <32 x i8> %op1, %op2
@@ -76,11 +56,6 @@ define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = and <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -93,11 +68,6 @@ define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = and <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -111,15 +81,6 @@ define void @and_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = and <16 x i16> %op1, %op2
@@ -135,11 +96,6 @@ define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = and <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -152,11 +108,6 @@ define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = and <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -170,15 +121,6 @@ define void @and_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = and <8 x i32> %op1, %op2
@@ -194,11 +136,6 @@ define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = and <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -211,11 +148,6 @@ define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = and <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -229,15 +161,6 @@ define void @and_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: and_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = and <4 x i64> %op1, %op2
@@ -257,11 +180,6 @@ define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = or <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -274,11 +192,6 @@ define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = or <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -292,15 +205,6 @@ define void @or_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = or <32 x i8> %op1, %op2
@@ -316,11 +220,6 @@ define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = or <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -333,11 +232,6 @@ define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = or <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -351,15 +245,6 @@ define void @or_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = or <16 x i16> %op1, %op2
@@ -375,11 +260,6 @@ define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = or <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -392,11 +272,6 @@ define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = or <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -410,15 +285,6 @@ define void @or_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = or <8 x i32> %op1, %op2
@@ -434,11 +300,6 @@ define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = or <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -451,11 +312,6 @@ define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = or <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -469,15 +325,6 @@ define void @or_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: or_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = or <4 x i64> %op1, %op2
@@ -497,11 +344,6 @@ define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = xor <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -514,11 +356,6 @@ define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = xor <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -532,15 +369,6 @@ define void @xor_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = xor <32 x i8> %op1, %op2
@@ -556,11 +384,6 @@ define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = xor <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -573,11 +396,6 @@ define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = xor <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -591,15 +409,6 @@ define void @xor_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = xor <16 x i16> %op1, %op2
@@ -615,11 +424,6 @@ define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = xor <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -632,11 +436,6 @@ define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = xor <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -650,15 +449,6 @@ define void @xor_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = xor <8 x i32> %op1, %op2
@@ -674,11 +464,6 @@ define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = xor <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -691,11 +476,6 @@ define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = xor <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -709,15 +489,6 @@ define void @xor_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: xor_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = xor <4 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
index 51c404ece6cd..50cf9b73d9a7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,11 +17,6 @@ define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
@@ -36,11 +30,6 @@ define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
@@ -56,15 +45,6 @@ define void @smax_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smax z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smax v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -81,11 +61,6 @@ define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
@@ -99,11 +74,6 @@ define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
@@ -119,15 +89,6 @@ define void @smax_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smax z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smax v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -144,11 +105,6 @@ define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
@@ -162,11 +118,6 @@ define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
@@ -182,15 +133,6 @@ define void @smax_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smax z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smax v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -208,12 +150,6 @@ define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmgt d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
@@ -228,12 +164,6 @@ define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmgt v2.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
@@ -249,18 +179,6 @@ define void @smax_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smax z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smax_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v4.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmgt v5.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -281,11 +199,6 @@ define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
@@ -299,11 +212,6 @@ define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
@@ -319,15 +227,6 @@ define void @smin_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smin z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smin v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -344,11 +243,6 @@ define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
@@ -362,11 +256,6 @@ define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
@@ -382,15 +271,6 @@ define void @smin_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smin z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smin v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -407,11 +287,6 @@ define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
@@ -425,11 +300,6 @@ define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
@@ -445,15 +315,6 @@ define void @smin_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smin z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smin v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -471,12 +332,6 @@ define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmgt d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
@@ -491,12 +346,6 @@ define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmgt v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
@@ -512,18 +361,6 @@ define void @smin_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smin z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smin_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmgt v5.2d, v3.2d, v2.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -544,11 +381,6 @@ define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
@@ -562,11 +394,6 @@ define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
@@ -582,15 +409,6 @@ define void @umax_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umax z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umax v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -607,11 +425,6 @@ define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
@@ -625,11 +438,6 @@ define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
@@ -645,15 +453,6 @@ define void @umax_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umax z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umax v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -670,11 +469,6 @@ define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
@@ -688,11 +482,6 @@ define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
@@ -708,15 +497,6 @@ define void @umax_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umax z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umax v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -734,12 +514,6 @@ define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmhi d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
@@ -754,12 +528,6 @@ define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmhi v2.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
@@ -775,18 +543,6 @@ define void @umax_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umax z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umax_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmhi v4.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmhi v5.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -807,11 +563,6 @@ define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
@@ -825,11 +576,6 @@ define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
@@ -845,15 +591,6 @@ define void @umin_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umin z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umin v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -870,11 +607,6 @@ define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
@@ -888,11 +620,6 @@ define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
@@ -908,15 +635,6 @@ define void @umin_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umin z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umin v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -933,11 +651,6 @@ define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
@@ -951,11 +664,6 @@ define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
@@ -971,15 +679,6 @@ define void @umin_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umin z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umin v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -997,12 +696,6 @@ define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmhi d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
@@ -1017,12 +710,6 @@ define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmhi v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
@@ -1038,18 +725,6 @@ define void @umin_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umin z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umin_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmhi v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmhi v5.2d, v3.2d, v2.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %op1, <4 x i64> %op2)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
index 83714152c173..149ad6d1e267 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=FA64
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=NO-FA64
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -21,12 +20,6 @@ define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
 ; NO-FA64-NEXT:    mad z0.b, p0/m, z1.b, z2.b
 ; NO-FA64-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; NO-FA64-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: mla8xi8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mla v2.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov d0, d2
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = mul <8 x i8> %A, %B;
   %tmp2 = add <8 x i8> %C, %tmp1;
   ret <8 x i8> %tmp2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
index 6e6d40e2ea04..cb7fa53eac51 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
@@ -2,7 +2,6 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 ; This test only tests the legal types for a given vector width, as mulh nodes
 ; do not get generated for non-legal types.
@@ -37,16 +36,6 @@ define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; SVE2-NEXT:    lsr z0.h, z0.h, #4
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smulh_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    shl v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ushr v0.4h, v0.4h, #4
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i16> undef, i16 4, i64 0
   %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer
   %1 = sext <4 x i8> %op1 to <4 x i16>
@@ -74,12 +63,6 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; SVE2-NEXT:    smulh z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smulh_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    shrn v0.8b, v0.8h, #8
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i16> undef, i16 8, i64 0
   %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
   %1 = sext <8 x i8> %op1 to <8 x i16>
@@ -107,13 +90,6 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; SVE2-NEXT:    smulh z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smulh_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull2 v2.8h, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    smull v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %1 = sext <16 x i8> %op1 to <16 x i16>
   %2 = sext <16 x i8> %op2 to <16 x i16>
   %mul = mul <16 x i16> %1, %2
@@ -142,19 +118,6 @@ define void @smulh_v32i8(ptr %a, ptr %b) {
 ; SVE2-NEXT:    smulh z1.b, z2.b, z3.b
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smulh_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smull2 v4.8h, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smull v0.8h, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    smull2 v1.8h, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    smull v2.8h, v2.8b, v3.8b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v4.16b
-; NONEON-NOSVE-NEXT:    uzp2 v1.16b, v2.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %1 = sext <32 x i8> %op1 to <32 x i16>
@@ -190,16 +153,6 @@ define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; SVE2-NEXT:    lsr z0.s, z0.s, #16
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smulh_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    shl v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i16> %op1 to <2 x i32>
   %2 = sext <2 x i16> %op2 to <2 x i32>
   %mul = mul <2 x i32> %1, %2
@@ -225,12 +178,6 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; SVE2-NEXT:    smulh z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smulh_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull v0.4s, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    shrn v0.4h, v0.4s, #16
-; NONEON-NOSVE-NEXT:    ret
   %1 = sext <4 x i16> %op1 to <4 x i32>
   %2 = sext <4 x i16> %op2 to <4 x i32>
   %mul = mul <4 x i32> %1, %2
@@ -256,13 +203,6 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; SVE2-NEXT:    smulh z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smulh_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull2 v2.4s, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    smull v0.4s, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    ret
   %1 = sext <8 x i16> %op1 to <8 x i32>
   %2 = sext <8 x i16> %op2 to <8 x i32>
   %mul = mul <8 x i32> %1, %2
@@ -291,19 +231,6 @@ define void @smulh_v16i16(ptr %a, ptr %b) {
 ; SVE2-NEXT:    smulh z1.h, z2.h, z3.h
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smulh_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smull2 v4.4s, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smull v0.4s, v1.4h, v0.4h
-; NONEON-NOSVE-NEXT:    smull2 v1.4s, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    smull v2.4s, v2.4h, v3.4h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp2 v1.8h, v2.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %1 = sext <16 x i16> %op1 to <16 x i32>
@@ -332,12 +259,6 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; SVE2-NEXT:    smulh z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smulh_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull v0.2d, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    shrn v0.2s, v0.2d, #32
-; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i32> %op1 to <2 x i64>
   %2 = sext <2 x i32> %op2 to <2 x i64>
   %mul = mul <2 x i64> %1, %2
@@ -363,13 +284,6 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; SVE2-NEXT:    smulh z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smulh_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull2 v2.2d, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    smull v0.2d, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ret
   %1 = sext <4 x i32> %op1 to <4 x i64>
   %2 = sext <4 x i32> %op2 to <4 x i64>
   %mul = mul <4 x i64> %1, %2
@@ -398,19 +312,6 @@ define void @smulh_v8i32(ptr %a, ptr %b) {
 ; SVE2-NEXT:    smulh z1.s, z2.s, z3.s
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smulh_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smull2 v4.2d, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smull v0.2d, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    smull2 v1.2d, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    smull v2.2d, v2.2s, v3.2s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp2 v1.4s, v2.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %1 = sext <8 x i32> %op1 to <8 x i64>
@@ -439,16 +340,6 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; SVE2-NEXT:    smulh z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smulh_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    fmov x9, d1
-; NONEON-NOSVE-NEXT:    smulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x i128> undef, i128 64, i128 0
   %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
   %1 = sext <1 x i64> %op1 to <1 x i128>
@@ -476,19 +367,6 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; SVE2-NEXT:    smulh z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smulh_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    fmov x11, d1
-; NONEON-NOSVE-NEXT:    smulh x10, x10, x11
-; NONEON-NOSVE-NEXT:    smulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i64> %op1 to <2 x i128>
   %2 = sext <2 x i64> %op2 to <2 x i128>
   %mul = mul <2 x i128> %1, %2
@@ -517,31 +395,6 @@ define void @smulh_v4i64(ptr %a, ptr %b) {
 ; SVE2-NEXT:    smulh z1.d, z2.d, z3.d
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smulh_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x11, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x14, v3.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x10, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x13, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x12, d3
-; NONEON-NOSVE-NEXT:    smulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov x9, d2
-; NONEON-NOSVE-NEXT:    smulh x10, x10, x11
-; NONEON-NOSVE-NEXT:    smulh x9, x9, x12
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    smulh x11, x13, x14
-; NONEON-NOSVE-NEXT:    fmov d1, x10
-; NONEON-NOSVE-NEXT:    fmov d2, x9
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    fmov d3, x11
-; NONEON-NOSVE-NEXT:    mov v2.d[1], v3.d[0]
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %1 = sext <4 x i64> %op1 to <4 x i128>
@@ -580,15 +433,6 @@ define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; SVE2-NEXT:    lsr z0.h, z0.h, #4
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umulh_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ushr v0.4h, v0.4h, #4
-; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i8> %op1 to <4 x i16>
   %2 = zext <4 x i8> %op2 to <4 x i16>
   %mul = mul <4 x i16> %1, %2
@@ -614,12 +458,6 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; SVE2-NEXT:    umulh z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umulh_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    shrn v0.8b, v0.8h, #8
-; NONEON-NOSVE-NEXT:    ret
   %1 = zext <8 x i8> %op1 to <8 x i16>
   %2 = zext <8 x i8> %op2 to <8 x i16>
   %mul = mul <8 x i16> %1, %2
@@ -645,13 +483,6 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; SVE2-NEXT:    umulh z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umulh_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull2 v2.8h, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    umull v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %1 = zext <16 x i8> %op1 to <16 x i16>
   %2 = zext <16 x i8> %op2 to <16 x i16>
   %mul = mul <16 x i16> %1, %2
@@ -680,19 +511,6 @@ define void @umulh_v32i8(ptr %a, ptr %b) {
 ; SVE2-NEXT:    umulh z1.b, z2.b, z3.b
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umulh_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umull2 v4.8h, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umull v0.8h, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    umull2 v1.8h, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    umull v2.8h, v2.8b, v3.8b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v4.16b
-; NONEON-NOSVE-NEXT:    uzp2 v1.16b, v2.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %1 = zext <32 x i8> %op1 to <32 x i16>
@@ -727,15 +545,6 @@ define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; SVE2-NEXT:    lsr z0.s, z0.s, #16
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umulh_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i16> %op1 to <2 x i32>
   %2 = zext <2 x i16> %op2 to <2 x i32>
   %mul = mul <2 x i32> %1, %2
@@ -761,12 +570,6 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; SVE2-NEXT:    umulh z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umulh_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull v0.4s, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    shrn v0.4h, v0.4s, #16
-; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i16> %op1 to <4 x i32>
   %2 = zext <4 x i16> %op2 to <4 x i32>
   %mul = mul <4 x i32> %1, %2
@@ -792,13 +595,6 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; SVE2-NEXT:    umulh z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umulh_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull2 v2.4s, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    umull v0.4s, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    ret
   %1 = zext <8 x i16> %op1 to <8 x i32>
   %2 = zext <8 x i16> %op2 to <8 x i32>
   %mul = mul <8 x i32> %1, %2
@@ -827,19 +623,6 @@ define void @umulh_v16i16(ptr %a, ptr %b) {
 ; SVE2-NEXT:    umulh z1.h, z2.h, z3.h
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umulh_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umull2 v4.4s, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umull v0.4s, v1.4h, v0.4h
-; NONEON-NOSVE-NEXT:    umull2 v1.4s, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    umull v2.4s, v2.4h, v3.4h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp2 v1.8h, v2.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %1 = zext <16 x i16> %op1 to <16 x i32>
@@ -868,12 +651,6 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; SVE2-NEXT:    umulh z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umulh_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull v0.2d, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    shrn v0.2s, v0.2d, #32
-; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i32> %op1 to <2 x i64>
   %2 = zext <2 x i32> %op2 to <2 x i64>
   %mul = mul <2 x i64> %1, %2
@@ -899,13 +676,6 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; SVE2-NEXT:    umulh z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umulh_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull2 v2.2d, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    umull v0.2d, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i32> %op1 to <4 x i64>
   %2 = zext <4 x i32> %op2 to <4 x i64>
   %mul = mul <4 x i64> %1, %2
@@ -934,19 +704,6 @@ define void @umulh_v8i32(ptr %a, ptr %b) {
 ; SVE2-NEXT:    umulh z1.s, z2.s, z3.s
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umulh_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umull2 v4.2d, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umull v0.2d, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    umull2 v1.2d, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    umull v2.2d, v2.2s, v3.2s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp2 v1.4s, v2.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %insert = insertelement <8 x i64> undef, i64 32, i64 0
@@ -977,16 +734,6 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; SVE2-NEXT:    umulh z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umulh_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    fmov x9, d1
-; NONEON-NOSVE-NEXT:    umulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ret
   %1 = zext <1 x i64> %op1 to <1 x i128>
   %2 = zext <1 x i64> %op2 to <1 x i128>
   %mul = mul <1 x i128> %1, %2
@@ -1012,19 +759,6 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; SVE2-NEXT:    umulh z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umulh_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    fmov x11, d1
-; NONEON-NOSVE-NEXT:    umulh x10, x10, x11
-; NONEON-NOSVE-NEXT:    umulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i64> %op1 to <2 x i128>
   %2 = zext <2 x i64> %op2 to <2 x i128>
   %mul = mul <2 x i128> %1, %2
@@ -1053,31 +787,6 @@ define void @umulh_v4i64(ptr %a, ptr %b) {
 ; SVE2-NEXT:    umulh z1.d, z2.d, z3.d
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umulh_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x11, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x14, v3.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x10, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x13, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x12, d3
-; NONEON-NOSVE-NEXT:    umulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov x9, d2
-; NONEON-NOSVE-NEXT:    umulh x10, x10, x11
-; NONEON-NOSVE-NEXT:    umulh x9, x9, x12
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    umulh x11, x13, x14
-; NONEON-NOSVE-NEXT:    fmov d1, x10
-; NONEON-NOSVE-NEXT:    fmov d2, x9
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    fmov d3, x11
-; NONEON-NOSVE-NEXT:    mov v2.d[1], v3.d[0]
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %1 = zext <4 x i64> %op1 to <4 x i128>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
index 50eaa6c12d71..751f43768a51 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,12 +17,6 @@ define i8 @uaddv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uaddv_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -37,12 +30,6 @@ define i8 @uaddv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uaddv_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -57,14 +44,6 @@ define i8 @uaddv_v32i8(ptr %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uaddv_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    addv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -79,12 +58,6 @@ define i16 @uaddv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uaddv_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -98,12 +71,6 @@ define i16 @uaddv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uaddv_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -118,14 +85,6 @@ define i16 @uaddv_v16i16(ptr %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uaddv_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -140,12 +99,6 @@ define i32 @uaddv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uaddv_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -159,12 +112,6 @@ define i32 @uaddv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uaddv_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -179,14 +126,6 @@ define i32 @uaddv_v8i32(ptr %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uaddv_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    addv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -200,12 +139,6 @@ define i64 @uaddv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    uaddv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uaddv_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addp d0, v0.2d
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -219,14 +152,6 @@ define i64 @uaddv_v4i64(ptr %a) {
 ; CHECK-NEXT:    uaddv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uaddv_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    addp d0, v0.2d
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -244,12 +169,6 @@ define i8 @smaxv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    smaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smaxv_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -262,12 +181,6 @@ define i8 @smaxv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    smaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smaxv_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -281,14 +194,6 @@ define i8 @smaxv_v32i8(ptr %a) {
 ; CHECK-NEXT:    smaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smaxv_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -302,12 +207,6 @@ define i16 @smaxv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    smaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smaxv_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -320,12 +219,6 @@ define i16 @smaxv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    smaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smaxv_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -339,14 +232,6 @@ define i16 @smaxv_v16i16(ptr %a) {
 ; CHECK-NEXT:    smaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smaxv_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smaxv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -360,12 +245,6 @@ define i32 @smaxv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    smaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smaxv_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -378,12 +257,6 @@ define i32 @smaxv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    smaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smaxv_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -397,14 +270,6 @@ define i32 @smaxv_v8i32(ptr %a) {
 ; CHECK-NEXT:    smaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smaxv_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smaxv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -419,17 +284,6 @@ define i64 @smaxv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    smaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smaxv_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmgt d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -443,20 +297,6 @@ define i64 @smaxv_v4i64(ptr %a) {
 ; CHECK-NEXT:    smaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: smaxv_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmgt d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -474,12 +314,6 @@ define i8 @sminv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    sminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sminv_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -492,12 +326,6 @@ define i8 @sminv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    sminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sminv_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -511,14 +339,6 @@ define i8 @sminv_v32i8(ptr %a) {
 ; CHECK-NEXT:    sminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sminv_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -532,12 +352,6 @@ define i16 @sminv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    sminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sminv_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -550,12 +364,6 @@ define i16 @sminv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    sminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sminv_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -569,14 +377,6 @@ define i16 @sminv_v16i16(ptr %a) {
 ; CHECK-NEXT:    sminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sminv_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sminv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -590,12 +390,6 @@ define i32 @sminv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    sminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sminv_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -608,12 +402,6 @@ define i32 @sminv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    sminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sminv_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -627,14 +415,6 @@ define i32 @sminv_v8i32(ptr %a) {
 ; CHECK-NEXT:    sminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sminv_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sminv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -649,17 +429,6 @@ define i64 @sminv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    sminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sminv_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmgt d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -673,20 +442,6 @@ define i64 @sminv_v4i64(ptr %a) {
 ; CHECK-NEXT:    sminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sminv_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmgt d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -704,12 +459,6 @@ define i8 @umaxv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    umaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umaxv_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -722,12 +471,6 @@ define i8 @umaxv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    umaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umaxv_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -741,14 +484,6 @@ define i8 @umaxv_v32i8(ptr %a) {
 ; CHECK-NEXT:    umaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umaxv_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -762,12 +497,6 @@ define i16 @umaxv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    umaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umaxv_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -780,12 +509,6 @@ define i16 @umaxv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    umaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umaxv_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -799,14 +522,6 @@ define i16 @umaxv_v16i16(ptr %a) {
 ; CHECK-NEXT:    umaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umaxv_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umaxv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -820,12 +535,6 @@ define i32 @umaxv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    umaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umaxv_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -838,12 +547,6 @@ define i32 @umaxv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    umaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umaxv_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -857,14 +560,6 @@ define i32 @umaxv_v8i32(ptr %a) {
 ; CHECK-NEXT:    umaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umaxv_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umaxv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -879,17 +574,6 @@ define i64 @umaxv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    umaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umaxv_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmhi d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -903,20 +587,6 @@ define i64 @umaxv_v4i64(ptr %a) {
 ; CHECK-NEXT:    umaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: umaxv_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    cmhi v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmhi d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -934,12 +604,6 @@ define i8 @uminv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    uminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uminv_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -952,12 +616,6 @@ define i8 @uminv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    uminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uminv_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -971,14 +629,6 @@ define i8 @uminv_v32i8(ptr %a) {
 ; CHECK-NEXT:    uminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uminv_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -992,12 +642,6 @@ define i16 @uminv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    uminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uminv_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -1010,12 +654,6 @@ define i16 @uminv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    uminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uminv_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -1029,14 +667,6 @@ define i16 @uminv_v16i16(ptr %a) {
 ; CHECK-NEXT:    uminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uminv_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uminv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -1050,12 +680,6 @@ define i32 @uminv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    uminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uminv_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -1068,12 +692,6 @@ define i32 @uminv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    uminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uminv_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -1087,14 +705,6 @@ define i32 @uminv_v8i32(ptr %a) {
 ; CHECK-NEXT:    uminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uminv_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uminv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -1109,17 +719,6 @@ define i64 @uminv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    uminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uminv_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmhi d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -1133,20 +732,6 @@ define i64 @uminv_v4i64(ptr %a) {
 ; CHECK-NEXT:    uminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uminv_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmhi v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmhi d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op)
   ret i64 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index 97bd76311b61..d373a9063f85 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -25,35 +24,6 @@ define <4 x i8> @srem_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: srem_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    shl v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -83,53 +53,6 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: srem_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.b[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w1, v1.b[4]
-; NONEON-NOSVE-NEXT:    smov w2, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w4, v1.b[5]
-; NONEON-NOSVE-NEXT:    smov w5, v0.b[5]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[7]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[6]
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
-; NONEON-NOSVE-NEXT:    sdiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    fmov d0, d2
-; NONEON-NOSVE-NEXT:    ret
   %res = srem <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -179,112 +102,6 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.b, p0/m, z3.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: srem_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #-80]! // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.b[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w1, v1.b[4]
-; NONEON-NOSVE-NEXT:    smov w2, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w4, v1.b[5]
-; NONEON-NOSVE-NEXT:    smov w5, v0.b[5]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    smov w7, v1.b[6]
-; NONEON-NOSVE-NEXT:    smov w19, v0.b[6]
-; NONEON-NOSVE-NEXT:    smov w21, v1.b[7]
-; NONEON-NOSVE-NEXT:    smov w22, v0.b[7]
-; NONEON-NOSVE-NEXT:    smov w24, v1.b[8]
-; NONEON-NOSVE-NEXT:    smov w25, v0.b[8]
-; NONEON-NOSVE-NEXT:    smov w27, v1.b[9]
-; NONEON-NOSVE-NEXT:    smov w28, v0.b[9]
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[11]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[10]
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[11]
-; NONEON-NOSVE-NEXT:    smov w16, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
-; NONEON-NOSVE-NEXT:    sdiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    smov w17, v0.b[12]
-; NONEON-NOSVE-NEXT:    smov w0, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
-; NONEON-NOSVE-NEXT:    sdiv w6, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    smov w1, v0.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
-; NONEON-NOSVE-NEXT:    sdiv w20, w19, w7
-; NONEON-NOSVE-NEXT:    msub w8, w6, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w23, w22, w21
-; NONEON-NOSVE-NEXT:    msub w8, w20, w7, w19
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
-; NONEON-NOSVE-NEXT:    sdiv w26, w25, w24
-; NONEON-NOSVE-NEXT:    msub w8, w23, w21, w22
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w28, w27
-; NONEON-NOSVE-NEXT:    msub w8, w26, w24, w25
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[8], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w27, w28
-; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
-; NONEON-NOSVE-NEXT:    sdiv w15, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[14]
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[14]
-; NONEON-NOSVE-NEXT:    mov v2.b[10], w8
-; NONEON-NOSVE-NEXT:    sdiv w18, w17, w16
-; NONEON-NOSVE-NEXT:    msub w8, w15, w13, w14
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[15]
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[15]
-; NONEON-NOSVE-NEXT:    mov v2.b[11], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w1, w0
-; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
-; NONEON-NOSVE-NEXT:    mov v2.b[12], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w0, w1
-; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.b[14], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp], #80 // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ret
   %res = srem <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -372,279 +189,6 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z2.b, p0/m, z7.b, z4.b
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: srem_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #320
-; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #224] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #240] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #256] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #272] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #288] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #304] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 320
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    str x0, [sp, #216] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w4, v3.b[1]
-; NONEON-NOSVE-NEXT:    smov w1, v2.b[1]
-; NONEON-NOSVE-NEXT:    smov w7, v3.b[7]
-; NONEON-NOSVE-NEXT:    smov w5, v2.b[7]
-; NONEON-NOSVE-NEXT:    smov w6, v3.b[8]
-; NONEON-NOSVE-NEXT:    smov w3, v2.b[8]
-; NONEON-NOSVE-NEXT:    smov w22, v3.b[9]
-; NONEON-NOSVE-NEXT:    smov w20, v2.b[9]
-; NONEON-NOSVE-NEXT:    smov w13, v3.b[0]
-; NONEON-NOSVE-NEXT:    smov w17, v3.b[3]
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    str w8, [sp, #100] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[0]
-; NONEON-NOSVE-NEXT:    str w9, [sp, #108] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w14, v2.b[3]
-; NONEON-NOSVE-NEXT:    smov w15, v3.b[4]
-; NONEON-NOSVE-NEXT:    smov w12, v2.b[4]
-; NONEON-NOSVE-NEXT:    smov w2, v3.b[5]
-; NONEON-NOSVE-NEXT:    smov w18, v2.b[5]
-; NONEON-NOSVE-NEXT:    smov w0, v3.b[6]
-; NONEON-NOSVE-NEXT:    smov w16, v2.b[6]
-; NONEON-NOSVE-NEXT:    smov w21, v3.b[10]
-; NONEON-NOSVE-NEXT:    smov w19, v2.b[10]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    ldr w30, [sp, #36] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    str w10, [sp, #116] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[2]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[2]
-; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #44] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[3]
-; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #52] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[3]
-; NONEON-NOSVE-NEXT:    sdiv w26, w14, w17
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w11, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[4]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[4]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #60] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[5]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[5]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #96] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #104] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #68] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[6]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[6]
-; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #80] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #112] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[7]
-; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #88] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[7]
-; NONEON-NOSVE-NEXT:    sdiv w25, w12, w15
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #132] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[8]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[8]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #140] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[9]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[9]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #148] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #156] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w11, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[10]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[10]
-; NONEON-NOSVE-NEXT:    str w10, [sp, #128] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #204] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[11]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[11]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #192] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #212] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[12]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[12]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #172] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #180] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #200] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[13]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[13]
-; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #164] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w11, v3.b[2]
-; NONEON-NOSVE-NEXT:    str w9, [sp, #176] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #188] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[14]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[14]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #144] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #152] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #184] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v2.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w8, w1, w4
-; NONEON-NOSVE-NEXT:    str w10, [sp, #160] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w10, v2.b[0]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #24] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w8, w5, w7
-; NONEON-NOSVE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w8, w3, w6
-; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w8, w20, w22
-; NONEON-NOSVE-NEXT:    sdiv w24, w10, w13
-; NONEON-NOSVE-NEXT:    str w8, [sp, #32] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    ldp w29, w8, [sp, #40] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w30, w29
-; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #224] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s4, w8
-; NONEON-NOSVE-NEXT:    sdiv w23, w9, w11
-; NONEON-NOSVE-NEXT:    msub w10, w24, w13, w10
-; NONEON-NOSVE-NEXT:    ldr w13, [sp, #24] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w24, [sp, #100] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w13, w13, w4, w1
-; NONEON-NOSVE-NEXT:    ldr w1, [sp, #116] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w4, [sp, #108] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s5, w10
-; NONEON-NOSVE-NEXT:    msub w1, w1, w24, w4
-; NONEON-NOSVE-NEXT:    mov v5.b[1], w13
-; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
-; NONEON-NOSVE-NEXT:    ldr w1, [sp, #120] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w23, w11, w9
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #48] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w28, w18, w2
-; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #52] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #272] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w26, w17, w14
-; NONEON-NOSVE-NEXT:    ldr w14, [sp, #72] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w11, w10
-; NONEON-NOSVE-NEXT:    ldr w17, [sp, #96] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    smov w10, v3.b[11]
-; NONEON-NOSVE-NEXT:    smov w11, v2.b[11]
-; NONEON-NOSVE-NEXT:    mov v4.b[2], w9
-; NONEON-NOSVE-NEXT:    mov v5.b[3], w8
-; NONEON-NOSVE-NEXT:    msub w8, w25, w15, w12
-; NONEON-NOSVE-NEXT:    ldp w13, w9, [sp, #76] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w27, w16, w0
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #104] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #256] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w13
-; NONEON-NOSVE-NEXT:    ldr w14, [sp, #60] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[4], w8
-; NONEON-NOSVE-NEXT:    msub w8, w28, w2, w18
-; NONEON-NOSVE-NEXT:    ldr w2, [sp, #156] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[3], w9
-; NONEON-NOSVE-NEXT:    ldp w12, w9, [sp, #64] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[5], w8
-; NONEON-NOSVE-NEXT:    msub w8, w27, w0, w16
-; NONEON-NOSVE-NEXT:    ldr w0, [sp, #132] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w4, w19, w21
-; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w12
-; NONEON-NOSVE-NEXT:    smov w12, v3.b[12]
-; NONEON-NOSVE-NEXT:    smov w14, v2.b[12]
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #240] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[6], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[4], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #112] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w7, w5
-; NONEON-NOSVE-NEXT:    ldr w5, [sp, #204] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w15
-; NONEON-NOSVE-NEXT:    ldr w17, [sp, #84] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[7], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w13, w11, w10
-; NONEON-NOSVE-NEXT:    mov v4.b[5], w9
-; NONEON-NOSVE-NEXT:    ldp w16, w9, [sp, #88] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w6, w3
-; NONEON-NOSVE-NEXT:    ldr w3, [sp, #148] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w16
-; NONEON-NOSVE-NEXT:    smov w16, v3.b[13]
-; NONEON-NOSVE-NEXT:    smov w17, v2.b[13]
-; NONEON-NOSVE-NEXT:    mov v5.b[8], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[6], w9
-; NONEON-NOSVE-NEXT:    msub w8, w8, w22, w20
-; NONEON-NOSVE-NEXT:    sdiv w15, w14, w12
-; NONEON-NOSVE-NEXT:    ldp w18, w9, [sp, #136] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[9], w8
-; NONEON-NOSVE-NEXT:    msub w8, w4, w21, w19
-; NONEON-NOSVE-NEXT:    msub w9, w9, w0, w18
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #304] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #288] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[7], w9
-; NONEON-NOSVE-NEXT:    mov v5.b[10], w8
-; NONEON-NOSVE-NEXT:    msub w8, w13, w10, w11
-; NONEON-NOSVE-NEXT:    ldp w0, w9, [sp, #124] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #196] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w13, [sp, #192] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w18, w17, w16
-; NONEON-NOSVE-NEXT:    msub w9, w9, w1, w0
-; NONEON-NOSVE-NEXT:    mov v5.b[11], w8
-; NONEON-NOSVE-NEXT:    smov w0, v3.b[14]
-; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w11
-; NONEON-NOSVE-NEXT:    smov w1, v2.b[14]
-; NONEON-NOSVE-NEXT:    msub w8, w15, w12, w14
-; NONEON-NOSVE-NEXT:    mov v4.b[8], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #164] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w15, w13, [sp, #168] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w3, w2
-; NONEON-NOSVE-NEXT:    mov v5.b[12], w8
-; NONEON-NOSVE-NEXT:    ldp w4, w3, [sp, #208] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w14, w12, [sp, #176] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[9], w9
-; NONEON-NOSVE-NEXT:    sdiv w2, w1, w0
-; NONEON-NOSVE-NEXT:    smov w9, v3.b[15]
-; NONEON-NOSVE-NEXT:    msub w3, w3, w5, w4
-; NONEON-NOSVE-NEXT:    smov w4, v2.b[15]
-; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
-; NONEON-NOSVE-NEXT:    ldr w16, [sp, #144] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[10], w3
-; NONEON-NOSVE-NEXT:    mov v5.b[13], w8
-; NONEON-NOSVE-NEXT:    mov v4.b[11], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #188] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w11, w4, w9
-; NONEON-NOSVE-NEXT:    msub w8, w2, w0, w1
-; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w12
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[15]
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[15]
-; NONEON-NOSVE-NEXT:    mov v5.b[14], w8
-; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #184] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w10, w10, w15, w14
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #152] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w14, w13, w12
-; NONEON-NOSVE-NEXT:    msub w8, w11, w9, w4
-; NONEON-NOSVE-NEXT:    mov v4.b[13], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #160] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[15], w8
-; NONEON-NOSVE-NEXT:    ldr x8, [sp, #216] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w10, w10, w16, w15
-; NONEON-NOSVE-NEXT:    mov v4.b[14], w10
-; NONEON-NOSVE-NEXT:    msub w9, w14, w12, w13
-; NONEON-NOSVE-NEXT:    mov v4.b[15], w9
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #320
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = srem <32 x i8> %op1, %op2
@@ -666,33 +210,6 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: srem_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -721,51 +238,6 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z3.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: srem_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    smov w1, v1.h[4]
-; NONEON-NOSVE-NEXT:    smov w2, v0.h[4]
-; NONEON-NOSVE-NEXT:    smov w4, v1.h[5]
-; NONEON-NOSVE-NEXT:    smov w5, v0.h[5]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[6]
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    smov w14, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    sdiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = srem <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -810,139 +282,6 @@ define void @srem_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z7.h, z1.h
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: srem_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #144
-; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #80] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #96] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #112] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #128] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w20, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w21, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w19, v0.h[3]
-; NONEON-NOSVE-NEXT:    smov w5, v1.h[4]
-; NONEON-NOSVE-NEXT:    smov w2, v0.h[4]
-; NONEON-NOSVE-NEXT:    smov w1, v3.h[1]
-; NONEON-NOSVE-NEXT:    smov w23, v2.h[1]
-; NONEON-NOSVE-NEXT:    smov w25, v3.h[0]
-; NONEON-NOSVE-NEXT:    smov w26, v2.h[0]
-; NONEON-NOSVE-NEXT:    smov w6, v1.h[5]
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[2]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w3, v0.h[5]
-; NONEON-NOSVE-NEXT:    smov w4, v1.h[6]
-; NONEON-NOSVE-NEXT:    smov w7, v0.h[6]
-; NONEON-NOSVE-NEXT:    smov w28, v3.h[2]
-; NONEON-NOSVE-NEXT:    smov w29, v2.h[2]
-; NONEON-NOSVE-NEXT:    smov w15, v3.h[3]
-; NONEON-NOSVE-NEXT:    smov w13, v2.h[3]
-; NONEON-NOSVE-NEXT:    smov w12, v3.h[4]
-; NONEON-NOSVE-NEXT:    smov w14, v3.h[5]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w11, w21, w20
-; NONEON-NOSVE-NEXT:    str w10, [sp, #44] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[3]
-; NONEON-NOSVE-NEXT:    stp w8, w11, [sp] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w11, v2.h[4]
-; NONEON-NOSVE-NEXT:    ldr w22, [sp, #4] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w20, w22, w20, w21
-; NONEON-NOSVE-NEXT:    sdiv w9, w19, w8
-; NONEON-NOSVE-NEXT:    str w10, [sp, #32] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w10, v3.h[6]
-; NONEON-NOSVE-NEXT:    fmov s5, w20
-; NONEON-NOSVE-NEXT:    smov w20, v3.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w8, w2, w5
-; NONEON-NOSVE-NEXT:    sdiv w24, w23, w1
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w27, w26, w25
-; NONEON-NOSVE-NEXT:    msub w1, w24, w1, w23
-; NONEON-NOSVE-NEXT:    ldp w24, w23, [sp, #40] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w9, w3, w6
-; NONEON-NOSVE-NEXT:    msub w21, w27, w25, w26
-; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w23, w23, w25, w24
-; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s4, w21
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w23
-; NONEON-NOSVE-NEXT:    ldp w23, w21, [sp, #28] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w1
-; NONEON-NOSVE-NEXT:    sdiv w8, w7, w4
-; NONEON-NOSVE-NEXT:    msub w21, w21, w25, w23
-; NONEON-NOSVE-NEXT:    smov w23, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #80] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w21
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #112] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w30, w29, w28
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w9, v2.h[5]
-; NONEON-NOSVE-NEXT:    smov w8, v2.h[6]
-; NONEON-NOSVE-NEXT:    sdiv w18, w13, w15
-; NONEON-NOSVE-NEXT:    msub w1, w30, w28, w29
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w1
-; NONEON-NOSVE-NEXT:    sdiv w16, w11, w12
-; NONEON-NOSVE-NEXT:    msub w13, w18, w15, w13
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #20] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w18, [sp] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w15, w15, w18, w19
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w13
-; NONEON-NOSVE-NEXT:    smov w13, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w15
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w17, w9, w14
-; NONEON-NOSVE-NEXT:    msub w11, w16, w12, w11
-; NONEON-NOSVE-NEXT:    ldr w12, [sp, #16] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w12, w12, w5, w2
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w11
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #12] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
-; NONEON-NOSVE-NEXT:    msub w11, w11, w6, w3
-; NONEON-NOSVE-NEXT:    sdiv w24, w8, w10
-; NONEON-NOSVE-NEXT:    msub w9, w17, w14, w9
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w11
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w4, w7
-; NONEON-NOSVE-NEXT:    sdiv w18, w23, w20
-; NONEON-NOSVE-NEXT:    msub w8, w24, w10, w8
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w9
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w15, w13
-; NONEON-NOSVE-NEXT:    msub w8, w18, w20, w23
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #128] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #96] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w8
-; NONEON-NOSVE-NEXT:    msub w9, w12, w13, w15
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w9
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #144
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = srem <16 x i16> %op1, %op2
@@ -961,23 +300,6 @@ define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: srem_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w11, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w12, v0.s[1]
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    msub w9, w13, w11, w12
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = srem <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -993,30 +315,6 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: srem_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov w11, s1
-; NONEON-NOSVE-NEXT:    fmov w12, s0
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w17, v1.s[3]
-; NONEON-NOSVE-NEXT:    mov w18, v0.s[3]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
-; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -1036,65 +334,6 @@ define void @srem_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z1.s, p0/m, z5.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: srem_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -48
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov w12, s0
-; NONEON-NOSVE-NEXT:    fmov w3, s2
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w11, s1
-; NONEON-NOSVE-NEXT:    fmov w2, s3
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w17, v3.s[1]
-; NONEON-NOSVE-NEXT:    mov w18, v2.s[1]
-; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w5, v3.s[2]
-; NONEON-NOSVE-NEXT:    mov w6, v2.s[2]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    mov w19, v3.s[3]
-; NONEON-NOSVE-NEXT:    mov w20, v2.s[3]
-; NONEON-NOSVE-NEXT:    mov w22, v1.s[3]
-; NONEON-NOSVE-NEXT:    mov w23, v0.s[3]
-; NONEON-NOSVE-NEXT:    sdiv w4, w3, w2
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s1, w11
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w12, w4, w2, w3
-; NONEON-NOSVE-NEXT:    fmov s0, w12
-; NONEON-NOSVE-NEXT:    sdiv w1, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w13, w1, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w13
-; NONEON-NOSVE-NEXT:    sdiv w7, w6, w5
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v1.s[2], w8
-; NONEON-NOSVE-NEXT:    sdiv w21, w20, w19
-; NONEON-NOSVE-NEXT:    msub w10, w7, w5, w6
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
-; NONEON-NOSVE-NEXT:    sdiv w9, w23, w22
-; NONEON-NOSVE-NEXT:    msub w10, w21, w19, w20
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w22, w23
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = srem <8 x i32> %op1, %op2
@@ -1113,17 +352,6 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: srem_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
-; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ret
   %res = srem <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -1139,20 +367,6 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: srem_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
-; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
-; NONEON-NOSVE-NEXT:    sdiv x13, x12, x11
-; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    ret
   %res = srem <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -1172,33 +386,6 @@ define void @srem_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z1.d, p0/m, z5.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: srem_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    fmov x15, d2
-; NONEON-NOSVE-NEXT:    mov x12, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x14, d3
-; NONEON-NOSVE-NEXT:    mov x11, v3.d[1]
-; NONEON-NOSVE-NEXT:    mov x17, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x18, v0.d[1]
-; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
-; NONEON-NOSVE-NEXT:    sdiv x16, x15, x14
-; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    sdiv x13, x12, x11
-; NONEON-NOSVE-NEXT:    msub x10, x16, x14, x15
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    sdiv x1, x18, x17
-; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    msub x11, x1, x17, x18
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = srem <4 x i64> %op1, %op2
@@ -1226,41 +413,6 @@ define <4 x i8> @urem_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: urem_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    and w11, w11, #0xff
-; NONEON-NOSVE-NEXT:    and w12, w12, #0xff
-; NONEON-NOSVE-NEXT:    and w8, w8, #0xff
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    and w9, w9, #0xff
-; NONEON-NOSVE-NEXT:    and w14, w14, #0xff
-; NONEON-NOSVE-NEXT:    and w15, w15, #0xff
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    and w12, w17, #0xff
-; NONEON-NOSVE-NEXT:    and w13, w18, #0xff
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w13, w12
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w12, w13
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -1290,53 +442,6 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: urem_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.b[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w1, v1.b[4]
-; NONEON-NOSVE-NEXT:    umov w2, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w4, v1.b[5]
-; NONEON-NOSVE-NEXT:    umov w5, v0.b[5]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[7]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[6]
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
-; NONEON-NOSVE-NEXT:    udiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    fmov d0, d2
-; NONEON-NOSVE-NEXT:    ret
   %res = urem <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -1386,112 +491,6 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.b, p0/m, z3.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: urem_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #-80]! // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.b[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w1, v1.b[4]
-; NONEON-NOSVE-NEXT:    umov w2, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w4, v1.b[5]
-; NONEON-NOSVE-NEXT:    umov w5, v0.b[5]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    umov w7, v1.b[6]
-; NONEON-NOSVE-NEXT:    umov w19, v0.b[6]
-; NONEON-NOSVE-NEXT:    umov w21, v1.b[7]
-; NONEON-NOSVE-NEXT:    umov w22, v0.b[7]
-; NONEON-NOSVE-NEXT:    umov w24, v1.b[8]
-; NONEON-NOSVE-NEXT:    umov w25, v0.b[8]
-; NONEON-NOSVE-NEXT:    umov w27, v1.b[9]
-; NONEON-NOSVE-NEXT:    umov w28, v0.b[9]
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[11]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[10]
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[11]
-; NONEON-NOSVE-NEXT:    umov w16, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
-; NONEON-NOSVE-NEXT:    udiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    umov w17, v0.b[12]
-; NONEON-NOSVE-NEXT:    umov w0, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
-; NONEON-NOSVE-NEXT:    udiv w6, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    umov w1, v0.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
-; NONEON-NOSVE-NEXT:    udiv w20, w19, w7
-; NONEON-NOSVE-NEXT:    msub w8, w6, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    udiv w23, w22, w21
-; NONEON-NOSVE-NEXT:    msub w8, w20, w7, w19
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
-; NONEON-NOSVE-NEXT:    udiv w26, w25, w24
-; NONEON-NOSVE-NEXT:    msub w8, w23, w21, w22
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w28, w27
-; NONEON-NOSVE-NEXT:    msub w8, w26, w24, w25
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[8], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w27, w28
-; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
-; NONEON-NOSVE-NEXT:    udiv w15, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[14]
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[14]
-; NONEON-NOSVE-NEXT:    mov v2.b[10], w8
-; NONEON-NOSVE-NEXT:    udiv w18, w17, w16
-; NONEON-NOSVE-NEXT:    msub w8, w15, w13, w14
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[15]
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[15]
-; NONEON-NOSVE-NEXT:    mov v2.b[11], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w1, w0
-; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
-; NONEON-NOSVE-NEXT:    mov v2.b[12], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w0, w1
-; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.b[14], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp], #80 // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ret
   %res = urem <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -1579,279 +578,6 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z2.b, p0/m, z7.b, z4.b
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: urem_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #320
-; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #224] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #240] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #256] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #272] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #288] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #304] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 320
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    str x0, [sp, #216] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w4, v3.b[1]
-; NONEON-NOSVE-NEXT:    umov w1, v2.b[1]
-; NONEON-NOSVE-NEXT:    umov w7, v3.b[7]
-; NONEON-NOSVE-NEXT:    umov w5, v2.b[7]
-; NONEON-NOSVE-NEXT:    umov w6, v3.b[8]
-; NONEON-NOSVE-NEXT:    umov w3, v2.b[8]
-; NONEON-NOSVE-NEXT:    umov w22, v3.b[9]
-; NONEON-NOSVE-NEXT:    umov w20, v2.b[9]
-; NONEON-NOSVE-NEXT:    umov w13, v3.b[0]
-; NONEON-NOSVE-NEXT:    umov w17, v3.b[3]
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    str w8, [sp, #100] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[0]
-; NONEON-NOSVE-NEXT:    str w9, [sp, #108] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w14, v2.b[3]
-; NONEON-NOSVE-NEXT:    umov w15, v3.b[4]
-; NONEON-NOSVE-NEXT:    umov w12, v2.b[4]
-; NONEON-NOSVE-NEXT:    umov w2, v3.b[5]
-; NONEON-NOSVE-NEXT:    umov w18, v2.b[5]
-; NONEON-NOSVE-NEXT:    umov w0, v3.b[6]
-; NONEON-NOSVE-NEXT:    umov w16, v2.b[6]
-; NONEON-NOSVE-NEXT:    umov w21, v3.b[10]
-; NONEON-NOSVE-NEXT:    umov w19, v2.b[10]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    ldr w30, [sp, #36] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    str w10, [sp, #116] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[2]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[2]
-; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #44] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[3]
-; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #52] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[3]
-; NONEON-NOSVE-NEXT:    udiv w26, w14, w17
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w11, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[4]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[4]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #60] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[5]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[5]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #96] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #104] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #68] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[6]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[6]
-; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #80] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #112] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[7]
-; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #88] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[7]
-; NONEON-NOSVE-NEXT:    udiv w25, w12, w15
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #132] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[8]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[8]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #140] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[9]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[9]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #148] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #156] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w11, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[10]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[10]
-; NONEON-NOSVE-NEXT:    str w10, [sp, #128] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #204] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[11]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[11]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #192] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #212] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[12]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[12]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #172] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #180] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #200] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[13]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[13]
-; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #164] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w11, v3.b[2]
-; NONEON-NOSVE-NEXT:    str w9, [sp, #176] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #188] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[14]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[14]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #144] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #152] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #184] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v2.b[2]
-; NONEON-NOSVE-NEXT:    udiv w8, w1, w4
-; NONEON-NOSVE-NEXT:    str w10, [sp, #160] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w10, v2.b[0]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #24] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w8, w5, w7
-; NONEON-NOSVE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w8, w3, w6
-; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w8, w20, w22
-; NONEON-NOSVE-NEXT:    udiv w24, w10, w13
-; NONEON-NOSVE-NEXT:    str w8, [sp, #32] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    ldp w29, w8, [sp, #40] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w30, w29
-; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #224] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s4, w8
-; NONEON-NOSVE-NEXT:    udiv w23, w9, w11
-; NONEON-NOSVE-NEXT:    msub w10, w24, w13, w10
-; NONEON-NOSVE-NEXT:    ldr w13, [sp, #24] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w24, [sp, #100] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w13, w13, w4, w1
-; NONEON-NOSVE-NEXT:    ldr w1, [sp, #116] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w4, [sp, #108] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s5, w10
-; NONEON-NOSVE-NEXT:    msub w1, w1, w24, w4
-; NONEON-NOSVE-NEXT:    mov v5.b[1], w13
-; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
-; NONEON-NOSVE-NEXT:    ldr w1, [sp, #120] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w23, w11, w9
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #48] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w28, w18, w2
-; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #52] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #272] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w26, w17, w14
-; NONEON-NOSVE-NEXT:    ldr w14, [sp, #72] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w11, w10
-; NONEON-NOSVE-NEXT:    ldr w17, [sp, #96] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    umov w10, v3.b[11]
-; NONEON-NOSVE-NEXT:    umov w11, v2.b[11]
-; NONEON-NOSVE-NEXT:    mov v4.b[2], w9
-; NONEON-NOSVE-NEXT:    mov v5.b[3], w8
-; NONEON-NOSVE-NEXT:    msub w8, w25, w15, w12
-; NONEON-NOSVE-NEXT:    ldp w13, w9, [sp, #76] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w27, w16, w0
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #104] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #256] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w13
-; NONEON-NOSVE-NEXT:    ldr w14, [sp, #60] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[4], w8
-; NONEON-NOSVE-NEXT:    msub w8, w28, w2, w18
-; NONEON-NOSVE-NEXT:    ldr w2, [sp, #156] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[3], w9
-; NONEON-NOSVE-NEXT:    ldp w12, w9, [sp, #64] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[5], w8
-; NONEON-NOSVE-NEXT:    msub w8, w27, w0, w16
-; NONEON-NOSVE-NEXT:    ldr w0, [sp, #132] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w4, w19, w21
-; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w12
-; NONEON-NOSVE-NEXT:    umov w12, v3.b[12]
-; NONEON-NOSVE-NEXT:    umov w14, v2.b[12]
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #240] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[6], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[4], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #112] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w7, w5
-; NONEON-NOSVE-NEXT:    ldr w5, [sp, #204] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w15
-; NONEON-NOSVE-NEXT:    ldr w17, [sp, #84] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[7], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w13, w11, w10
-; NONEON-NOSVE-NEXT:    mov v4.b[5], w9
-; NONEON-NOSVE-NEXT:    ldp w16, w9, [sp, #88] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w6, w3
-; NONEON-NOSVE-NEXT:    ldr w3, [sp, #148] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w16
-; NONEON-NOSVE-NEXT:    umov w16, v3.b[13]
-; NONEON-NOSVE-NEXT:    umov w17, v2.b[13]
-; NONEON-NOSVE-NEXT:    mov v5.b[8], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[6], w9
-; NONEON-NOSVE-NEXT:    msub w8, w8, w22, w20
-; NONEON-NOSVE-NEXT:    udiv w15, w14, w12
-; NONEON-NOSVE-NEXT:    ldp w18, w9, [sp, #136] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[9], w8
-; NONEON-NOSVE-NEXT:    msub w8, w4, w21, w19
-; NONEON-NOSVE-NEXT:    msub w9, w9, w0, w18
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #304] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #288] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[7], w9
-; NONEON-NOSVE-NEXT:    mov v5.b[10], w8
-; NONEON-NOSVE-NEXT:    msub w8, w13, w10, w11
-; NONEON-NOSVE-NEXT:    ldp w0, w9, [sp, #124] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #196] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w13, [sp, #192] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w18, w17, w16
-; NONEON-NOSVE-NEXT:    msub w9, w9, w1, w0
-; NONEON-NOSVE-NEXT:    mov v5.b[11], w8
-; NONEON-NOSVE-NEXT:    umov w0, v3.b[14]
-; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w11
-; NONEON-NOSVE-NEXT:    umov w1, v2.b[14]
-; NONEON-NOSVE-NEXT:    msub w8, w15, w12, w14
-; NONEON-NOSVE-NEXT:    mov v4.b[8], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #164] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w15, w13, [sp, #168] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w3, w2
-; NONEON-NOSVE-NEXT:    mov v5.b[12], w8
-; NONEON-NOSVE-NEXT:    ldp w4, w3, [sp, #208] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w14, w12, [sp, #176] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[9], w9
-; NONEON-NOSVE-NEXT:    udiv w2, w1, w0
-; NONEON-NOSVE-NEXT:    umov w9, v3.b[15]
-; NONEON-NOSVE-NEXT:    msub w3, w3, w5, w4
-; NONEON-NOSVE-NEXT:    umov w4, v2.b[15]
-; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
-; NONEON-NOSVE-NEXT:    ldr w16, [sp, #144] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[10], w3
-; NONEON-NOSVE-NEXT:    mov v5.b[13], w8
-; NONEON-NOSVE-NEXT:    mov v4.b[11], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #188] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w11, w4, w9
-; NONEON-NOSVE-NEXT:    msub w8, w2, w0, w1
-; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w12
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[15]
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[15]
-; NONEON-NOSVE-NEXT:    mov v5.b[14], w8
-; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #184] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w10, w10, w15, w14
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #152] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w14, w13, w12
-; NONEON-NOSVE-NEXT:    msub w8, w11, w9, w4
-; NONEON-NOSVE-NEXT:    mov v4.b[13], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #160] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[15], w8
-; NONEON-NOSVE-NEXT:    ldr x8, [sp, #216] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w10, w10, w16, w15
-; NONEON-NOSVE-NEXT:    mov v4.b[14], w10
-; NONEON-NOSVE-NEXT:    msub w9, w14, w12, w13
-; NONEON-NOSVE-NEXT:    mov v4.b[15], w9
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #320
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = urem <32 x i8> %op1, %op2
@@ -1873,33 +599,6 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: urem_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -1928,51 +627,6 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z3.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: urem_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    umov w1, v1.h[4]
-; NONEON-NOSVE-NEXT:    umov w2, v0.h[4]
-; NONEON-NOSVE-NEXT:    umov w4, v1.h[5]
-; NONEON-NOSVE-NEXT:    umov w5, v0.h[5]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[6]
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    umov w14, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    udiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = urem <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -2017,139 +671,6 @@ define void @urem_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z7.h, z1.h
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: urem_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #144
-; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #80] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #96] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #112] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #128] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w20, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w21, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w19, v0.h[3]
-; NONEON-NOSVE-NEXT:    umov w5, v1.h[4]
-; NONEON-NOSVE-NEXT:    umov w2, v0.h[4]
-; NONEON-NOSVE-NEXT:    umov w1, v3.h[1]
-; NONEON-NOSVE-NEXT:    umov w23, v2.h[1]
-; NONEON-NOSVE-NEXT:    umov w25, v3.h[0]
-; NONEON-NOSVE-NEXT:    umov w26, v2.h[0]
-; NONEON-NOSVE-NEXT:    umov w6, v1.h[5]
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[2]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w3, v0.h[5]
-; NONEON-NOSVE-NEXT:    umov w4, v1.h[6]
-; NONEON-NOSVE-NEXT:    umov w7, v0.h[6]
-; NONEON-NOSVE-NEXT:    umov w28, v3.h[2]
-; NONEON-NOSVE-NEXT:    umov w29, v2.h[2]
-; NONEON-NOSVE-NEXT:    umov w15, v3.h[3]
-; NONEON-NOSVE-NEXT:    umov w13, v2.h[3]
-; NONEON-NOSVE-NEXT:    umov w12, v3.h[4]
-; NONEON-NOSVE-NEXT:    umov w14, v3.h[5]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w11, w21, w20
-; NONEON-NOSVE-NEXT:    str w10, [sp, #44] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[3]
-; NONEON-NOSVE-NEXT:    stp w8, w11, [sp] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w11, v2.h[4]
-; NONEON-NOSVE-NEXT:    ldr w22, [sp, #4] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w20, w22, w20, w21
-; NONEON-NOSVE-NEXT:    udiv w9, w19, w8
-; NONEON-NOSVE-NEXT:    str w10, [sp, #32] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w10, v3.h[6]
-; NONEON-NOSVE-NEXT:    fmov s5, w20
-; NONEON-NOSVE-NEXT:    umov w20, v3.h[7]
-; NONEON-NOSVE-NEXT:    udiv w8, w2, w5
-; NONEON-NOSVE-NEXT:    udiv w24, w23, w1
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w27, w26, w25
-; NONEON-NOSVE-NEXT:    msub w1, w24, w1, w23
-; NONEON-NOSVE-NEXT:    ldp w24, w23, [sp, #40] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w9, w3, w6
-; NONEON-NOSVE-NEXT:    msub w21, w27, w25, w26
-; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w23, w23, w25, w24
-; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s4, w21
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w23
-; NONEON-NOSVE-NEXT:    ldp w23, w21, [sp, #28] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w1
-; NONEON-NOSVE-NEXT:    udiv w8, w7, w4
-; NONEON-NOSVE-NEXT:    msub w21, w21, w25, w23
-; NONEON-NOSVE-NEXT:    umov w23, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #80] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w21
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #112] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w30, w29, w28
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w9, v2.h[5]
-; NONEON-NOSVE-NEXT:    umov w8, v2.h[6]
-; NONEON-NOSVE-NEXT:    udiv w18, w13, w15
-; NONEON-NOSVE-NEXT:    msub w1, w30, w28, w29
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w1
-; NONEON-NOSVE-NEXT:    udiv w16, w11, w12
-; NONEON-NOSVE-NEXT:    msub w13, w18, w15, w13
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #20] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w18, [sp] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w15, w15, w18, w19
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w13
-; NONEON-NOSVE-NEXT:    umov w13, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w15
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[7]
-; NONEON-NOSVE-NEXT:    udiv w17, w9, w14
-; NONEON-NOSVE-NEXT:    msub w11, w16, w12, w11
-; NONEON-NOSVE-NEXT:    ldr w12, [sp, #16] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w12, w12, w5, w2
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w11
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #12] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
-; NONEON-NOSVE-NEXT:    msub w11, w11, w6, w3
-; NONEON-NOSVE-NEXT:    udiv w24, w8, w10
-; NONEON-NOSVE-NEXT:    msub w9, w17, w14, w9
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w11
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w4, w7
-; NONEON-NOSVE-NEXT:    udiv w18, w23, w20
-; NONEON-NOSVE-NEXT:    msub w8, w24, w10, w8
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w9
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w15, w13
-; NONEON-NOSVE-NEXT:    msub w8, w18, w20, w23
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #128] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #96] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w8
-; NONEON-NOSVE-NEXT:    msub w9, w12, w13, w15
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w9
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #144
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = urem <16 x i16> %op1, %op2
@@ -2168,23 +689,6 @@ define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: urem_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w11, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w12, v0.s[1]
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    msub w9, w13, w11, w12
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = urem <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -2200,30 +704,6 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: urem_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov w11, s1
-; NONEON-NOSVE-NEXT:    fmov w12, s0
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w17, v1.s[3]
-; NONEON-NOSVE-NEXT:    mov w18, v0.s[3]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
-; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -2243,65 +723,6 @@ define void @urem_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z1.s, p0/m, z5.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: urem_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -48
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov w12, s0
-; NONEON-NOSVE-NEXT:    fmov w3, s2
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w11, s1
-; NONEON-NOSVE-NEXT:    fmov w2, s3
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w17, v3.s[1]
-; NONEON-NOSVE-NEXT:    mov w18, v2.s[1]
-; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w5, v3.s[2]
-; NONEON-NOSVE-NEXT:    mov w6, v2.s[2]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    mov w19, v3.s[3]
-; NONEON-NOSVE-NEXT:    mov w20, v2.s[3]
-; NONEON-NOSVE-NEXT:    mov w22, v1.s[3]
-; NONEON-NOSVE-NEXT:    mov w23, v0.s[3]
-; NONEON-NOSVE-NEXT:    udiv w4, w3, w2
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s1, w11
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w12, w4, w2, w3
-; NONEON-NOSVE-NEXT:    fmov s0, w12
-; NONEON-NOSVE-NEXT:    udiv w1, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w13, w1, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w13
-; NONEON-NOSVE-NEXT:    udiv w7, w6, w5
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v1.s[2], w8
-; NONEON-NOSVE-NEXT:    udiv w21, w20, w19
-; NONEON-NOSVE-NEXT:    msub w10, w7, w5, w6
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
-; NONEON-NOSVE-NEXT:    udiv w9, w23, w22
-; NONEON-NOSVE-NEXT:    msub w10, w21, w19, w20
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w22, w23
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = urem <8 x i32> %op1, %op2
@@ -2320,17 +741,6 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: urem_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
-; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ret
   %res = urem <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -2346,20 +756,6 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: urem_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
-; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
-; NONEON-NOSVE-NEXT:    udiv x13, x12, x11
-; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    ret
   %res = urem <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -2379,33 +775,6 @@ define void @urem_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z1.d, p0/m, z5.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: urem_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    fmov x15, d2
-; NONEON-NOSVE-NEXT:    mov x12, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x14, d3
-; NONEON-NOSVE-NEXT:    mov x11, v3.d[1]
-; NONEON-NOSVE-NEXT:    mov x17, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x18, v0.d[1]
-; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
-; NONEON-NOSVE-NEXT:    udiv x16, x15, x14
-; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    udiv x13, x12, x11
-; NONEON-NOSVE-NEXT:    msub x10, x16, x14, x15
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    udiv x1, x18, x17
-; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    msub x11, x1, x17, x18
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = urem <4 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
index b3adf4720ece..906112f7ac39 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,14 +16,6 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i8> %op1, <4 x i8> %op2
   ret <4 x i8> %sel
 }
@@ -40,14 +31,6 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.8b, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2
   ret <8 x i8> %sel
 }
@@ -63,14 +46,6 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.16b, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2
   ret <16 x i8> %sel
 }
@@ -89,20 +64,6 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.b, p0, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.16b, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <32 x i8>, ptr %a
   %op2 = load volatile <32 x i8>, ptr %b
   %sel = select i1 %mask, <32 x i8> %op1, <32 x i8> %op2
@@ -122,14 +83,6 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i16> %op1, <2 x i16> %op2
   ret <2 x i16> %sel
 }
@@ -146,14 +99,6 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2
   ret <4 x i16> %sel
 }
@@ -170,14 +115,6 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.8h, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2
   ret <8 x i16> %sel
 }
@@ -197,20 +134,6 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.h, p0, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <16 x i16>, ptr %a
   %op2 = load volatile <16 x i16>, ptr %b
   %sel = select i1 %mask, <16 x i16> %op1, <16 x i16> %op2
@@ -230,14 +153,6 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2
   ret <2 x i32> %sel
 }
@@ -254,14 +169,6 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4s, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2
   ret <4 x i32> %sel
 }
@@ -281,20 +188,6 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.s, p0, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <8 x i32>, ptr %a
   %op2 = load volatile <8 x i32>, ptr %b
   %sel = select i1 %mask, <8 x i32> %op1, <8 x i32> %op2
@@ -315,14 +208,6 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    fmov d2, x8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2
   ret <1 x i64> %sel
 }
@@ -340,14 +225,6 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2d, x8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2
   ret <2 x i64> %sel
 }
@@ -368,20 +245,6 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.d, p0, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <4 x i64>, ptr %a
   %op2 = load volatile <4 x i64>, ptr %b
   %sel = select i1 %mask, <4 x i64> %op1, <4 x i64> %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
index a429cd82a449..9ed52e321d9a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -20,16 +19,6 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
-; NONEON-NOSVE-NEXT:    sshl v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -43,12 +32,6 @@ define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.8b, v1.8b
-; NONEON-NOSVE-NEXT:    sshl v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = ashr <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -62,12 +45,6 @@ define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    sshl v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = ashr <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -83,17 +60,6 @@ define void @ashr_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    asr z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    sshl v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sshl v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = ashr <32 x i8> %op1, %op2
@@ -112,16 +78,6 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
-; NONEON-NOSVE-NEXT:    sshl v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -135,12 +91,6 @@ define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
-; NONEON-NOSVE-NEXT:    sshl v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -154,12 +104,6 @@ define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    sshl v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ret
   %res = ashr <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -175,17 +119,6 @@ define void @ashr_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    asr z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    sshl v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sshl v1.8h, v3.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = ashr <16 x i16> %op1, %op2
@@ -202,12 +135,6 @@ define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
-; NONEON-NOSVE-NEXT:    sshl v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -221,12 +148,6 @@ define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    sshl v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -242,17 +163,6 @@ define void @ashr_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    asr z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    sshl v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sshl v1.4s, v3.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = ashr <8 x i32> %op1, %op2
@@ -269,12 +179,6 @@ define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg d1, d1
-; NONEON-NOSVE-NEXT:    sshl d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %res = ashr <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -288,12 +192,6 @@ define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    sshl v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -309,17 +207,6 @@ define void @ashr_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    asr z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ashr_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    sshl v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sshl v1.2d, v3.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = ashr <4 x i64> %op1, %op2
@@ -342,15 +229,6 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -364,12 +242,6 @@ define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushl v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = lshr <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -383,12 +255,6 @@ define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushl v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = lshr <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -404,17 +270,6 @@ define void @lshr_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsr z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushl v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ushl v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = lshr <32 x i8> %op1, %op2
@@ -433,15 +288,6 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -455,12 +301,6 @@ define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -474,12 +314,6 @@ define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ushl v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ret
   %res = lshr <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -495,17 +329,6 @@ define void @lshr_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsr z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ushl v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ushl v1.8h, v3.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = lshr <16 x i16> %op1, %op2
@@ -522,12 +345,6 @@ define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -541,12 +358,6 @@ define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ushl v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -562,17 +373,6 @@ define void @lshr_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsr z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ushl v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ushl v1.4s, v3.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = lshr <8 x i32> %op1, %op2
@@ -589,12 +389,6 @@ define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg d1, d1
-; NONEON-NOSVE-NEXT:    ushl d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %res = lshr <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -608,12 +402,6 @@ define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ushl v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -629,17 +417,6 @@ define void @lshr_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsr z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: lshr_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ushl v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ushl v1.2d, v3.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = lshr <4 x i64> %op1, %op2
@@ -661,13 +438,6 @@ define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) {
 ; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v2i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x0000ff000000ff
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i8> %op1, %op2
   ret <2 x i8> %res
 }
@@ -682,13 +452,6 @@ define <4 x i8> @shl_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -702,11 +465,6 @@ define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = shl <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -720,11 +478,6 @@ define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = shl <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -740,15 +493,6 @@ define void @shl_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsl z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    ushl v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ushl v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = shl <32 x i8> %op1, %op2
@@ -765,11 +509,6 @@ define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -783,11 +522,6 @@ define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ret
   %res = shl <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -803,15 +537,6 @@ define void @shl_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsl z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    ushl v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ushl v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = shl <16 x i16> %op1, %op2
@@ -828,11 +553,6 @@ define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -846,11 +566,6 @@ define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -866,15 +581,6 @@ define void @shl_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsl z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    ushl v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ushl v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = shl <8 x i32> %op1, %op2
@@ -891,11 +597,6 @@ define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl d0, d0, d1
-; NONEON-NOSVE-NEXT:    ret
   %res = shl <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -909,11 +610,6 @@ define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -929,15 +625,6 @@ define void @shl_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsl z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shl_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    ushl v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ushl v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = shl <4 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index d9ca19baea7d..b285659258f3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -16,13 +15,6 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) {
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i16> %op1 to <4 x half>
   ret <4 x half> %res
 }
@@ -35,22 +27,6 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v1.4s
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x half>
   store <8 x half> %res, ptr %b
@@ -66,29 +42,6 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ucvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v3.4s
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x half>
   store <16 x half> %res, ptr %b
@@ -108,13 +61,6 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ucvtf v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i16> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -128,12 +74,6 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i16> %op1 to <4 x float>
   ret <4 x float> %res
 }
@@ -150,20 +90,6 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -188,26 +114,6 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ucvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x float>
   store <16 x float> %res, ptr %b
@@ -226,13 +132,6 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) {
 ; CHECK-NEXT:    and w8, w8, #0xffff
 ; CHECK-NEXT:    ucvtf d0, w8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v1i16_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w8, v0.h[0]
-; NONEON-NOSVE-NEXT:    ucvtf d0, w8
-; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <1 x i16> %op1 to <1 x double>
   ret <1 x double> %res
 }
@@ -247,14 +146,6 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i16> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -272,21 +163,6 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %res = uitofp <4 x i16> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -314,30 +190,6 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q1, [x1]
 ; CHECK-NEXT:    stp q3, q0, [x1, #32]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -386,46 +238,6 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #72]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ushll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v7.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    ucvtf v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v7.2d
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v6.2d
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x double>
   store <16 x double> %res, ptr %b
@@ -445,13 +257,6 @@ define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x half>
   ret <2 x half> %res
 }
@@ -465,12 +270,6 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i32> %op1 to <4 x half>
   ret <4 x half> %res
 }
@@ -488,15 +287,6 @@ define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x half>
   ret <8 x half> %res
@@ -521,21 +311,6 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v16i32_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ucvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v3.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i32>, ptr %a
   %res = uitofp <16 x i32> %op1 to <16 x half>
   store <16 x half> %res, ptr %b
@@ -554,11 +329,6 @@ define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -571,11 +341,6 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i32> %op1 to <4 x float>
   ret <4 x float> %res
 }
@@ -589,14 +354,6 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -616,12 +373,6 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -638,20 +389,6 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = uitofp <4 x i32> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -676,26 +413,6 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -722,18 +439,6 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    ucvtf s1, x9
-; NONEON-NOSVE-NEXT:    ucvtf s0, x8
-; NONEON-NOSVE-NEXT:    fcvt h2, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x half>
   ret <2 x half> %res
 }
@@ -754,16 +459,6 @@ define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x half>
   ret <4 x half> %res
@@ -797,22 +492,6 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn v2.2s, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.4s, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v2.4s
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i64>, ptr %a
   %res = uitofp <8 x i64> %op1 to <8 x half>
   ret <8 x half> %res
@@ -831,12 +510,6 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -854,15 +527,6 @@ define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x float>
   ret <4 x float> %res
@@ -887,21 +551,6 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn v1.2s, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.4s, v3.2d
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i64>, ptr %a
   %res = uitofp <8 x i64> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -920,11 +569,6 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -938,14 +582,6 @@ define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -964,13 +600,6 @@ define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) {
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i16> %op1 to <4 x half>
   ret <4 x half> %res
 }
@@ -983,22 +612,6 @@ define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v1.4s
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x half>
   store <8 x half> %res, ptr %b
@@ -1014,29 +627,6 @@ define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    scvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v3.4s
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x half>
   store <16 x half> %res, ptr %b
@@ -1055,13 +645,6 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    scvtf v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i16> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -1075,12 +658,6 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i16> %op1 to <4 x float>
   ret <4 x float> %res
 }
@@ -1097,20 +674,6 @@ define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -1135,26 +698,6 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    scvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    scvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x float>
   store <16 x float> %res, ptr %b
@@ -1176,14 +719,6 @@ define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i16> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -1201,21 +736,6 @@ define void @scvtf_v4i16_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %res = sitofp <4 x i16> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -1243,30 +763,6 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q1, [x1]
 ; CHECK-NEXT:    stp q3, q0, [x1, #32]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -1315,46 +811,6 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #72]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    sshll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v7.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    scvtf v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    scvtf v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v7.2d
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v6.2d
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x double>
   store <16 x double> %res, ptr %b
@@ -1374,13 +830,6 @@ define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x half>
   ret <2 x half> %res
 }
@@ -1394,12 +843,6 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i32> %op1 to <4 x half>
   ret <4 x half> %res
 }
@@ -1417,15 +860,6 @@ define <8 x half> @scvtf_v8i32_v8f16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x half>
   ret <8 x half> %res
@@ -1443,11 +877,6 @@ define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -1460,11 +889,6 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i32> %op1 to <4 x float>
   ret <4 x float> %res
 }
@@ -1478,14 +902,6 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -1505,12 +921,6 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -1527,20 +937,6 @@ define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = sitofp <4 x i32> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -1565,26 +961,6 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -1629,40 +1005,6 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q1, [x1]
 ; CHECK-NEXT:    stp q4, q0, [x1, #32]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v16i32_v16f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #-64]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
-; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    sshll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v7.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    scvtf v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q2, q4, [x1, #96]
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v6.2d
-; NONEON-NOSVE-NEXT:    stp q3, q5, [x1, #64]
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v7.2d
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #64
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i32>, ptr %a
   %res = sitofp <16 x i32> %op1 to <16 x double>
   store <16 x double> %res, ptr %b
@@ -1689,18 +1031,6 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    scvtf s1, x9
-; NONEON-NOSVE-NEXT:    scvtf s0, x8
-; NONEON-NOSVE-NEXT:    fcvt h2, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x half>
   ret <2 x half> %res
 }
@@ -1721,16 +1051,6 @@ define <4 x half> @scvtf_v4i64_v4f16(ptr %a) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x half>
   ret <4 x half> %res
@@ -1749,12 +1069,6 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -1772,15 +1086,6 @@ define <4 x float> @scvtf_v4i64_v4f32(ptr %a) {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x float>
   ret <4 x float> %res
@@ -1798,11 +1103,6 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -1816,14 +1116,6 @@ define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -1836,13 +1128,6 @@ define half @scvtf_i16_f16(ptr %0) {
 ; CHECK-NEXT:    ldrsh w8, [x0]
 ; CHECK-NEXT:    scvtf h0, w8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_i16_f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldrsh w8, [x0]
-; NONEON-NOSVE-NEXT:    scvtf s0, w8
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = sitofp i16 %2 to half
   ret half %3
@@ -1854,12 +1139,6 @@ define float @scvtf_i16_f32(ptr %0) {
 ; CHECK-NEXT:    ldrsh w8, [x0]
 ; CHECK-NEXT:    scvtf s0, w8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_i16_f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldrsh w8, [x0]
-; NONEON-NOSVE-NEXT:    scvtf s0, w8
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = sitofp i16 %2 to float
   ret float %3
@@ -1871,12 +1150,6 @@ define double @scvtf_i16_f64(ptr %0) {
 ; CHECK-NEXT:    ldrsh w8, [x0]
 ; CHECK-NEXT:    scvtf d0, w8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_i16_f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldrsh w8, [x0]
-; NONEON-NOSVE-NEXT:    scvtf d0, w8
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = sitofp i16 %2 to double
   ret double %3
@@ -1888,13 +1161,6 @@ define half @scvtf_i32_f16(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    scvtf h0, w8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_i32_f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr w8, [x0]
-; NONEON-NOSVE-NEXT:    scvtf s0, w8
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = sitofp i32 %2 to half
   ret half %3
@@ -1906,12 +1172,6 @@ define float @scvtf_i32_f32(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    scvtf s0, w8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_i32_f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr w8, [x0]
-; NONEON-NOSVE-NEXT:    scvtf s0, w8
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = sitofp i32 %2 to float
   ret float %3
@@ -1923,12 +1183,6 @@ define double @scvtf_i32_f64(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    scvtf d0, w8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_i32_f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr w8, [x0]
-; NONEON-NOSVE-NEXT:    scvtf d0, w8
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = sitofp i32 %2 to double
   ret double %3
@@ -1940,13 +1194,6 @@ define half @scvtf_i64_f16(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    scvtf h0, x8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_i64_f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr x8, [x0]
-; NONEON-NOSVE-NEXT:    scvtf s0, x8
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = sitofp i64 %2 to half
   ret half %3
@@ -1958,12 +1205,6 @@ define float @scvtf_i64_f32(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    scvtf s0, x8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_i64_f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr x8, [x0]
-; NONEON-NOSVE-NEXT:    scvtf s0, x8
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = sitofp i64 %2 to float
   ret float %3
@@ -1975,12 +1216,6 @@ define double @scvtf_i64_f64(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    scvtf d0, x8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: scvtf_i64_f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr x8, [x0]
-; NONEON-NOSVE-NEXT:    scvtf d0, x8
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = sitofp i64 %2 to double
   ret double %3
@@ -1992,13 +1227,6 @@ define half @ucvtf_i16_f16(ptr %0) {
 ; CHECK-NEXT:    ldrh w8, [x0]
 ; CHECK-NEXT:    ucvtf h0, w8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_i16_f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = uitofp i16 %2 to half
   ret half %3
@@ -2010,12 +1238,6 @@ define float @ucvtf_i16_f32(ptr %0) {
 ; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    ucvtf s0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_i16_f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf s0, s0
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = uitofp i16 %2 to float
   ret float %3
@@ -2027,12 +1249,6 @@ define double @ucvtf_i16_f64(ptr %0) {
 ; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    ucvtf d0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_i16_f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = uitofp i16 %2 to double
   ret double %3
@@ -2044,13 +1260,6 @@ define half @ucvtf_i32_f16(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    ucvtf h0, w8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_i32_f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr w8, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf s0, w8
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = uitofp i32 %2 to half
   ret half %3
@@ -2062,12 +1271,6 @@ define float @ucvtf_i32_f32(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    ucvtf s0, w8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_i32_f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr w8, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf s0, w8
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = uitofp i32 %2 to float
   ret float %3
@@ -2079,12 +1282,6 @@ define double @ucvtf_i32_f64(ptr %0) {
 ; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ucvtf d0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_i32_f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf d0, d0
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = uitofp i32 %2 to double
   ret double %3
@@ -2096,13 +1293,6 @@ define half @ucvtf_i64_f16(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    ucvtf h0, x8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_i64_f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr x8, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf s0, x8
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = uitofp i64 %2 to half
   ret half %3
@@ -2114,12 +1304,6 @@ define float @ucvtf_i64_f32(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    ucvtf s0, x8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_i64_f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr x8, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf s0, x8
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = uitofp i64 %2 to float
   ret float %3
@@ -2131,12 +1315,6 @@ define double @ucvtf_i64_f64(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    ucvtf d0, x8
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ucvtf_i64_f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr x8, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf d0, x8
-; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = uitofp i64 %2 to double
   ret double %3
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
index 42daa4fedc94..81bbaa92d4b4 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -19,13 +18,6 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, <4 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i8> %op1, <4 x i8> %op2
   ret <4 x i8> %sel
 }
@@ -44,13 +36,6 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.8b, v2.8b, #7
-; NONEON-NOSVE-NEXT:    cmlt v2.8b, v2.8b, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x i8> %op1, <8 x i8> %op2
   ret <8 x i8> %sel
 }
@@ -69,13 +54,6 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask)
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.16b, v2.16b, #7
-; NONEON-NOSVE-NEXT:    cmlt v2.16b, v2.16b, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <16 x i1> %mask, <16 x i8> %op1, <16 x i8> %op2
   ret <16 x i8> %sel
 }
@@ -92,18 +70,6 @@ define void @select_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.b, p0, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    cmeq v4.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    cmeq v5.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %mask = icmp eq <32 x i8> %op1, %op2
@@ -126,13 +92,6 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, <2 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.2s, v2.2s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.2s, v2.2s, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i16> %op1, <2 x i16> %op2
   ret <2 x i16> %sel
 }
@@ -151,13 +110,6 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i16> %op1, <4 x i16> %op2
   ret <4 x i16> %sel
 }
@@ -177,14 +129,6 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    shl v2.8h, v2.8h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.8h, v2.8h, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x i16> %op1, <8 x i16> %op2
   ret <8 x i16> %sel
 }
@@ -201,18 +145,6 @@ define void @select_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    cmeq v4.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    cmeq v5.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %mask = icmp eq <16 x i16> %op1, %op2
@@ -235,13 +167,6 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.2s, v2.2s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.2s, v2.2s, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i32> %op1, <2 x i32> %op2
   ret <2 x i32> %sel
 }
@@ -261,14 +186,6 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    shl v2.4s, v2.4s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.4s, v2.4s, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i32> %op1, <4 x i32> %op2
   ret <4 x i32> %sel
 }
@@ -285,18 +202,6 @@ define void @select_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    cmeq v4.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    cmeq v5.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %mask = icmp eq <8 x i32> %op1, %op2
@@ -318,14 +223,6 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    fmov d2, x8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <1 x i1> %mask, <1 x i64> %op1, <1 x i64> %op2
   ret <1 x i64> %sel
 }
@@ -345,14 +242,6 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    shl v2.2d, v2.2d, #63
-; NONEON-NOSVE-NEXT:    cmlt v2.2d, v2.2d, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i64> %op1, <2 x i64> %op2
   ret <2 x i64> %sel
 }
@@ -369,18 +258,6 @@ define void @select_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.d, p0, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: select_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    cmeq v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmeq v5.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %mask = icmp eq <4 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index efe9066f2c83..318285ded5a8 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -28,6 +30,23 @@ define void @alloc_v4i8(ptr %st_ptr) nounwind {
 ; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: alloc_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    mov x19, x0
+; NONEON-NOSVE-NEXT:    add x0, sp, #12
+; NONEON-NOSVE-NEXT:    bl def
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    umov w8, v0.h[2]
+; NONEON-NOSVE-NEXT:    umov w9, v0.h[0]
+; NONEON-NOSVE-NEXT:    strb w8, [x19, #1]
+; NONEON-NOSVE-NEXT:    strb w9, [x19]
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [4 x i8]
   call void @def(ptr %alloc)
   %load = load <4 x i8>, ptr %alloc
@@ -40,38 +59,51 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
 ; CHECK-LABEL: alloc_v6i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    add x0, sp, #24
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    add x20, sp, #24
 ; CHECK-NEXT:    bl def
-; CHECK-NEXT:    ptrue p0.b, vl3
-; CHECK-NEXT:    ptrue p1.s, vl2
-; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x20]
+; CHECK-NEXT:    ldr d0, [sp, #24]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    mov z2.b, z1.b[3]
+; CHECK-NEXT:    ptrue p1.s, vl2
+; CHECK-NEXT:    mov z1.b, z0.b[3]
+; CHECK-NEXT:    mov z2.b, z0.b[5]
+; CHECK-NEXT:    mov z0.b, z0.b[1]
 ; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z3.b, z1.b[2]
-; CHECK-NEXT:    mov z4.b, z1.b[1]
-; CHECK-NEXT:    strh w8, [sp]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strh w9, [sp, #4]
-; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    add x8, sp, #12
-; CHECK-NEXT:    ldr d0, [sp]
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strh w9, [sp, #12]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    add x8, sp, #20
+; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x8]
 ; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x8]
 ; CHECK-NEXT:    strb w9, [x19, #2]
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    strh w8, [x19]
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: alloc_v6i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    mov x19, x0
+; NONEON-NOSVE-NEXT:    add x0, sp, #8
+; NONEON-NOSVE-NEXT:    bl def
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add x9, x19, #2
+; NONEON-NOSVE-NEXT:    rev16 v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    xtn v1.8b, v1.8h
+; NONEON-NOSVE-NEXT:    str s1, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    st1 { v0.b }[5], [x9]
+; NONEON-NOSVE-NEXT:    strh w8, [x19]
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [6 x i8]
   call void @def(ptr %alloc)
   %load = load <6 x i8>, ptr %alloc
@@ -100,6 +132,22 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
 ; CHECK-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: alloc_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    mov x19, x0
+; NONEON-NOSVE-NEXT:    mov x0, sp
+; NONEON-NOSVE-NEXT:    bl def
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    add x8, x19, #8
+; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
+; NONEON-NOSVE-NEXT:    st1 { v1.b }[0], [x8]
+; NONEON-NOSVE-NEXT:    str d0, [x19]
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [32 x i8]
   call void @def(ptr %alloc)
   %load = load <32 x i8>, ptr %alloc
@@ -128,6 +176,22 @@ define void @alloc_v8f64(ptr %st_ptr) nounwind {
 ; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: alloc_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    mov x19, x0
+; NONEON-NOSVE-NEXT:    mov x0, sp
+; NONEON-NOSVE-NEXT:    bl def
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp]
+; NONEON-NOSVE-NEXT:    zip1 v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    zip1 v1.2d, v3.2d, v2.2d
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x19]
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [8 x double]
   call void @def(ptr %alloc)
   %load = load <8 x double>, ptr %alloc
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
index 01a7a5cafd26..885030861469 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -19,19 +18,6 @@ define <4 x i32> @test(ptr %arg1, ptr %arg2) {
 ; CHECK-NEXT:    stp q2, q5, [x0, #32]
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test:
-; NONEON-NOSVE:       // %bb.0: // %entry
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q3, q4, [x0]
-; NONEON-NOSVE-NEXT:    add v2.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v5.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    dup v0.4s, v1.s[2]
-; NONEON-NOSVE-NEXT:    add v1.4s, v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    add v3.4s, v4.4s, v4.4s
-; NONEON-NOSVE-NEXT:    stp q2, q5, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
-; NONEON-NOSVE-NEXT:    ret
 entry:
   %0 = load <16 x i32>, ptr %arg1, align 256
   %1 = load <16 x i32>, ptr %arg2, align 256
@@ -56,19 +42,6 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) {
 ; CHECK-NEXT:    stp q3, q4, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test2:
-; NONEON-NOSVE:       // %bb.0: // %entry
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q3, q4, [x0]
-; NONEON-NOSVE-NEXT:    add v2.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    dup v0.2s, v1.s[2]
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    add v3.4s, v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    add v4.4s, v4.4s, v4.4s
-; NONEON-NOSVE-NEXT:    stp q2, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q4, [x0]
-; NONEON-NOSVE-NEXT:    ret
 entry:
   %0 = load <16 x i32>, ptr %arg1, align 256
   %1 = load <16 x i32>, ptr %arg2, align 256
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
index c57f3af0d4b6..8ca8e6980913 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -12,13 +11,6 @@ define <4 x i8> @load_v4i8(ptr %a) {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x i8>, ptr %a
   ret <4 x i8> %load
 }
@@ -28,11 +20,6 @@ define <8 x i8> @load_v8i8(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x i8>, ptr %a
   ret <8 x i8> %load
 }
@@ -42,11 +29,6 @@ define <16 x i8> @load_v16i8(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <16 x i8>, ptr %a
   ret <16 x i8> %load
 }
@@ -56,11 +38,6 @@ define <32 x i8> @load_v32i8(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <32 x i8>, ptr %a
   ret <32 x i8> %load
 }
@@ -72,15 +49,6 @@ define <2 x i16> @load_v2i16(ptr %a) {
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    add x8, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x i16>, ptr %a
   ret <2 x i16> %load
 }
@@ -90,11 +58,6 @@ define <2 x half> @load_v2f16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x half>, ptr %a
   ret <2 x half> %load
 }
@@ -104,11 +67,6 @@ define <4 x i16> @load_v4i16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x i16>, ptr %a
   ret <4 x i16> %load
 }
@@ -118,11 +76,6 @@ define <4 x half> @load_v4f16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x half>, ptr %a
   ret <4 x half> %load
 }
@@ -132,11 +85,6 @@ define <8 x i16> @load_v8i16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x i16>, ptr %a
   ret <8 x i16> %load
 }
@@ -146,11 +94,6 @@ define <8 x half> @load_v8f16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x half>, ptr %a
   ret <8 x half> %load
 }
@@ -160,11 +103,6 @@ define <16 x i16> @load_v16i16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <16 x i16>, ptr %a
   ret <16 x i16> %load
 }
@@ -174,11 +112,6 @@ define <16 x half> @load_v16f16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <16 x half>, ptr %a
   ret <16 x half> %load
 }
@@ -188,11 +121,6 @@ define <2 x i32> @load_v2i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x i32>, ptr %a
   ret <2 x i32> %load
 }
@@ -202,11 +130,6 @@ define <2 x float> @load_v2f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x float>, ptr %a
   ret <2 x float> %load
 }
@@ -216,11 +139,6 @@ define <4 x i32> @load_v4i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x i32>, ptr %a
   ret <4 x i32> %load
 }
@@ -230,11 +148,6 @@ define <4 x float> @load_v4f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x float>, ptr %a
   ret <4 x float> %load
 }
@@ -244,11 +157,6 @@ define <8 x i32> @load_v8i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x i32>, ptr %a
   ret <8 x i32> %load
 }
@@ -258,11 +166,6 @@ define <8 x float> @load_v8f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x float>, ptr %a
   ret <8 x float> %load
 }
@@ -272,11 +175,6 @@ define <1 x i64> @load_v1i64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <1 x i64>, ptr %a
   ret <1 x i64> %load
 }
@@ -286,11 +184,6 @@ define <1 x double> @load_v1f64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <1 x double>, ptr %a
   ret <1 x double> %load
 }
@@ -300,11 +193,6 @@ define <2 x i64> @load_v2i64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x i64>, ptr %a
   ret <2 x i64> %load
 }
@@ -314,11 +202,6 @@ define <2 x double> @load_v2f64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x double>, ptr %a
   ret <2 x double> %load
 }
@@ -328,11 +211,6 @@ define <4 x i64> @load_v4i64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x i64>, ptr %a
   ret <4 x i64> %load
 }
@@ -342,11 +220,6 @@ define <4 x double> @load_v4f64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: load_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x double>, ptr %a
   ret <4 x double> %load
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
index 65c45587e120..c4aeb4465c53 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -18,14 +17,6 @@ define i8 @andv_v4i8(<4 x i8> %a) {
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: andv_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a)
   ret i8 %res
 }
@@ -38,15 +29,6 @@ define i8 @andv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    andv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: andv_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -59,20 +41,6 @@ define i8 @andv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    andv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: andv_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -86,22 +54,6 @@ define i8 @andv_v32i8(ptr %a) {
 ; CHECK-NEXT:    andv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: andv_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -115,13 +67,6 @@ define i16 @andv_v2i16(<2 x i16> %a) {
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: andv_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a)
   ret i16 %res
 }
@@ -134,14 +79,6 @@ define i16 @andv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: andv_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -154,19 +91,6 @@ define i16 @andv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: andv_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -180,21 +104,6 @@ define i16 @andv_v16i16(ptr %a) {
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: andv_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -208,13 +117,6 @@ define i32 @andv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: andv_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -227,18 +129,6 @@ define i32 @andv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: andv_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -252,20 +142,6 @@ define i32 @andv_v8i32(ptr %a) {
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: andv_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -279,16 +155,6 @@ define i64 @andv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    andv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: andv_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -302,18 +168,6 @@ define i64 @andv_v4i64(ptr %a) {
 ; CHECK-NEXT:    andv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: andv_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -331,14 +185,6 @@ define i8 @eorv_v4i8(<4 x i8> %a) {
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: eorv_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %a)
   ret i8 %res
 }
@@ -351,15 +197,6 @@ define i8 @eorv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    eorv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: eorv_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -372,20 +209,6 @@ define i8 @eorv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    eorv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: eorv_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -399,22 +222,6 @@ define i8 @eorv_v32i8(ptr %a) {
 ; CHECK-NEXT:    eorv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: eorv_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -428,13 +235,6 @@ define i16 @eorv_v2i16(<2 x i16> %a) {
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: eorv_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %a)
   ret i16 %res
 }
@@ -447,14 +247,6 @@ define i16 @eorv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: eorv_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -467,19 +259,6 @@ define i16 @eorv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: eorv_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -493,21 +272,6 @@ define i16 @eorv_v16i16(ptr %a) {
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: eorv_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -521,13 +285,6 @@ define i32 @eorv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: eorv_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -540,18 +297,6 @@ define i32 @eorv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: eorv_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -565,20 +310,6 @@ define i32 @eorv_v8i32(ptr %a) {
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: eorv_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -592,16 +323,6 @@ define i64 @eorv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    eorv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: eorv_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -615,18 +336,6 @@ define i64 @eorv_v4i64(ptr %a) {
 ; CHECK-NEXT:    eorv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: eorv_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -644,14 +353,6 @@ define i8 @orv_v4i8(<4 x i8> %a) {
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: orv_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a)
   ret i8 %res
 }
@@ -664,15 +365,6 @@ define i8 @orv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    orv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: orv_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -685,20 +377,6 @@ define i8 @orv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    orv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: orv_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -712,22 +390,6 @@ define i8 @orv_v32i8(ptr %a) {
 ; CHECK-NEXT:    orv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: orv_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -741,13 +403,6 @@ define i16 @orv_v2i16(<2 x i16> %a) {
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: orv_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %a)
   ret i16 %res
 }
@@ -760,14 +415,6 @@ define i16 @orv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: orv_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -780,19 +427,6 @@ define i16 @orv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: orv_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -806,21 +440,6 @@ define i16 @orv_v16i16(ptr %a) {
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: orv_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -834,13 +453,6 @@ define i32 @orv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: orv_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -853,18 +465,6 @@ define i32 @orv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: orv_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -878,20 +478,6 @@ define i32 @orv_v8i32(ptr %a) {
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: orv_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -905,16 +491,6 @@ define i64 @orv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    orv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: orv_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -928,18 +504,6 @@ define i64 @orv_v4i64(ptr %a) {
 ; CHECK-NEXT:    orv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: orv_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op)
   ret i64 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 886f97ed988d..ca58099244cf 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -20,44 +19,6 @@ define <4 x i8> @masked_load_v4i8(ptr %src, <4 x i1> %mask) {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI0_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB0_2
-; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[0], [x0]
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB0_3
-; NONEON-NOSVE-NEXT:    b .LBB0_4
-; NONEON-NOSVE-NEXT:  .LBB0_2:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB0_4
-; NONEON-NOSVE-NEXT:  .LBB0_3: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #1
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
-; NONEON-NOSVE-NEXT:  .LBB0_4: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB0_7
-; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB0_8
-; NONEON-NOSVE-NEXT:  .LBB0_6: // %else8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB0_7: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB0_6
-; NONEON-NOSVE-NEXT:  .LBB0_8: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x8, x0, #3
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x i8> @llvm.masked.load.v4i8(ptr %src, i32 8, <4 x i1> %mask, <4 x i8> zeroinitializer)
   ret <4 x i8> %load
 }
@@ -73,67 +34,6 @@ define <8 x i8> @masked_load_v8i8(ptr %src, <8 x i1> %mask) {
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI1_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI1_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB1_2
-; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr b0, [x0]
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB1_3
-; NONEON-NOSVE-NEXT:    b .LBB1_4
-; NONEON-NOSVE-NEXT:  .LBB1_2:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB1_4
-; NONEON-NOSVE-NEXT:  .LBB1_3: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #1
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[1], [x9]
-; NONEON-NOSVE-NEXT:  .LBB1_4: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB1_11
-; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB1_12
-; NONEON-NOSVE-NEXT:  .LBB1_6: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB1_13
-; NONEON-NOSVE-NEXT:  .LBB1_7: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB1_14
-; NONEON-NOSVE-NEXT:  .LBB1_8: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB1_15
-; NONEON-NOSVE-NEXT:  .LBB1_9: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB1_16
-; NONEON-NOSVE-NEXT:  .LBB1_10: // %else20
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB1_11: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB1_6
-; NONEON-NOSVE-NEXT:  .LBB1_12: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #3
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB1_7
-; NONEON-NOSVE-NEXT:  .LBB1_13: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB1_8
-; NONEON-NOSVE-NEXT:  .LBB1_14: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #5
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB1_9
-; NONEON-NOSVE-NEXT:  .LBB1_15: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB1_10
-; NONEON-NOSVE-NEXT:  .LBB1_16: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x8, x0, #7
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[7], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %src, i32 8, <8 x i1> %mask, <8 x i8> zeroinitializer)
   ret <8 x i8> %load
 }
@@ -149,115 +49,6 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
-; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    addv h1, v0.8h
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB2_17
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB2_18
-; NONEON-NOSVE-NEXT:  .LBB2_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB2_19
-; NONEON-NOSVE-NEXT:  .LBB2_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB2_20
-; NONEON-NOSVE-NEXT:  .LBB2_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB2_21
-; NONEON-NOSVE-NEXT:  .LBB2_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB2_22
-; NONEON-NOSVE-NEXT:  .LBB2_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB2_23
-; NONEON-NOSVE-NEXT:  .LBB2_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB2_24
-; NONEON-NOSVE-NEXT:  .LBB2_8: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB2_25
-; NONEON-NOSVE-NEXT:  .LBB2_9: // %else23
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB2_26
-; NONEON-NOSVE-NEXT:  .LBB2_10: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB2_27
-; NONEON-NOSVE-NEXT:  .LBB2_11: // %else29
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB2_28
-; NONEON-NOSVE-NEXT:  .LBB2_12: // %else32
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB2_29
-; NONEON-NOSVE-NEXT:  .LBB2_13: // %else35
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB2_30
-; NONEON-NOSVE-NEXT:  .LBB2_14: // %else38
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB2_31
-; NONEON-NOSVE-NEXT:  .LBB2_15: // %else41
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB2_32
-; NONEON-NOSVE-NEXT:  .LBB2_16: // %else44
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB2_17: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr b0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB2_2
-; NONEON-NOSVE-NEXT:  .LBB2_18: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #1
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB2_3
-; NONEON-NOSVE-NEXT:  .LBB2_19: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB2_4
-; NONEON-NOSVE-NEXT:  .LBB2_20: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #3
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB2_5
-; NONEON-NOSVE-NEXT:  .LBB2_21: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB2_6
-; NONEON-NOSVE-NEXT:  .LBB2_22: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #5
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB2_7
-; NONEON-NOSVE-NEXT:  .LBB2_23: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB2_8
-; NONEON-NOSVE-NEXT:  .LBB2_24: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x9, x0, #7
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[7], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB2_9
-; NONEON-NOSVE-NEXT:  .LBB2_25: // %cond.load22
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[8], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB2_10
-; NONEON-NOSVE-NEXT:  .LBB2_26: // %cond.load25
-; NONEON-NOSVE-NEXT:    add x9, x0, #9
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[9], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB2_11
-; NONEON-NOSVE-NEXT:  .LBB2_27: // %cond.load28
-; NONEON-NOSVE-NEXT:    add x9, x0, #10
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[10], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB2_12
-; NONEON-NOSVE-NEXT:  .LBB2_28: // %cond.load31
-; NONEON-NOSVE-NEXT:    add x9, x0, #11
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[11], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB2_13
-; NONEON-NOSVE-NEXT:  .LBB2_29: // %cond.load34
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[12], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB2_14
-; NONEON-NOSVE-NEXT:  .LBB2_30: // %cond.load37
-; NONEON-NOSVE-NEXT:    add x9, x0, #13
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[13], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB2_15
-; NONEON-NOSVE-NEXT:  .LBB2_31: // %cond.load40
-; NONEON-NOSVE-NEXT:    add x9, x0, #14
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[14], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB2_16
-; NONEON-NOSVE-NEXT:  .LBB2_32: // %cond.load43
-; NONEON-NOSVE-NEXT:    add x8, x0, #15
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[15], [x8]
-; NONEON-NOSVE-NEXT:    ret
   %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %src, i32 8, <16 x i1> %mask, <16 x i8> zeroinitializer)
   ret <16 x i8> %load
 }
@@ -339,277 +130,6 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
-; NONEON-NOSVE-NEXT:    fmov s1, w1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #80]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88]
-; NONEON-NOSVE-NEXT:    mov v1.b[1], w2
-; NONEON-NOSVE-NEXT:    mov v0.b[1], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp]
-; NONEON-NOSVE-NEXT:    mov v1.b[2], w3
-; NONEON-NOSVE-NEXT:    mov v0.b[2], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
-; NONEON-NOSVE-NEXT:    mov v1.b[3], w4
-; NONEON-NOSVE-NEXT:    mov v0.b[3], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104]
-; NONEON-NOSVE-NEXT:    mov v1.b[4], w5
-; NONEON-NOSVE-NEXT:    mov v0.b[4], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
-; NONEON-NOSVE-NEXT:    mov v1.b[5], w6
-; NONEON-NOSVE-NEXT:    mov v0.b[5], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120]
-; NONEON-NOSVE-NEXT:    mov v1.b[6], w7
-; NONEON-NOSVE-NEXT:    mov v0.b[6], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
-; NONEON-NOSVE-NEXT:    mov v1.b[7], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
-; NONEON-NOSVE-NEXT:    mov v0.b[7], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136]
-; NONEON-NOSVE-NEXT:    mov v1.b[8], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
-; NONEON-NOSVE-NEXT:    mov v0.b[8], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
-; NONEON-NOSVE-NEXT:    mov v1.b[9], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
-; NONEON-NOSVE-NEXT:    mov v0.b[9], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #152]
-; NONEON-NOSVE-NEXT:    mov v1.b[10], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
-; NONEON-NOSVE-NEXT:    mov v0.b[10], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
-; NONEON-NOSVE-NEXT:    mov v1.b[11], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
-; NONEON-NOSVE-NEXT:    mov v0.b[11], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #168]
-; NONEON-NOSVE-NEXT:    mov v1.b[12], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
-; NONEON-NOSVE-NEXT:    mov v0.b[12], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
-; NONEON-NOSVE-NEXT:    mov v1.b[13], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #56]
-; NONEON-NOSVE-NEXT:    mov v0.b[13], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184]
-; NONEON-NOSVE-NEXT:    mov v1.b[14], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #64]
-; NONEON-NOSVE-NEXT:    mov v0.b[14], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
-; NONEON-NOSVE-NEXT:    mov v1.b[15], w9
-; NONEON-NOSVE-NEXT:    mov v0.b[15], w8
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
-; NONEON-NOSVE-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
-; NONEON-NOSVE-NEXT:    shl v1.16b, v1.16b, #7
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    cmlt v1.16b, v1.16b, #0
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    addv h1, v1.8h
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    bfi w8, w9, #16, #16
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_33
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_34
-; NONEON-NOSVE-NEXT:  .LBB3_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_35
-; NONEON-NOSVE-NEXT:  .LBB3_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_36
-; NONEON-NOSVE-NEXT:  .LBB3_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_37
-; NONEON-NOSVE-NEXT:  .LBB3_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_38
-; NONEON-NOSVE-NEXT:  .LBB3_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_39
-; NONEON-NOSVE-NEXT:  .LBB3_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_40
-; NONEON-NOSVE-NEXT:  .LBB3_8: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_41
-; NONEON-NOSVE-NEXT:  .LBB3_9: // %else23
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_42
-; NONEON-NOSVE-NEXT:  .LBB3_10: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_43
-; NONEON-NOSVE-NEXT:  .LBB3_11: // %else29
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_44
-; NONEON-NOSVE-NEXT:  .LBB3_12: // %else32
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_45
-; NONEON-NOSVE-NEXT:  .LBB3_13: // %else35
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_46
-; NONEON-NOSVE-NEXT:  .LBB3_14: // %else38
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_47
-; NONEON-NOSVE-NEXT:  .LBB3_15: // %else41
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_48
-; NONEON-NOSVE-NEXT:  .LBB3_16: // %else44
-; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_49
-; NONEON-NOSVE-NEXT:  .LBB3_17: // %else47
-; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_50
-; NONEON-NOSVE-NEXT:  .LBB3_18: // %else50
-; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_51
-; NONEON-NOSVE-NEXT:  .LBB3_19: // %else53
-; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_52
-; NONEON-NOSVE-NEXT:  .LBB3_20: // %else56
-; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_53
-; NONEON-NOSVE-NEXT:  .LBB3_21: // %else59
-; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_54
-; NONEON-NOSVE-NEXT:  .LBB3_22: // %else62
-; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_55
-; NONEON-NOSVE-NEXT:  .LBB3_23: // %else65
-; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_56
-; NONEON-NOSVE-NEXT:  .LBB3_24: // %else68
-; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_57
-; NONEON-NOSVE-NEXT:  .LBB3_25: // %else71
-; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_58
-; NONEON-NOSVE-NEXT:  .LBB3_26: // %else74
-; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_59
-; NONEON-NOSVE-NEXT:  .LBB3_27: // %else77
-; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_60
-; NONEON-NOSVE-NEXT:  .LBB3_28: // %else80
-; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_61
-; NONEON-NOSVE-NEXT:  .LBB3_29: // %else83
-; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_62
-; NONEON-NOSVE-NEXT:  .LBB3_30: // %else86
-; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_63
-; NONEON-NOSVE-NEXT:  .LBB3_31: // %else89
-; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_64
-; NONEON-NOSVE-NEXT:  .LBB3_32: // %else92
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB3_33: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr b0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB3_2
-; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #1
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB3_3
-; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB3_4
-; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #3
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB3_5
-; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB3_6
-; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #5
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB3_7
-; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB3_8
-; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x9, x0, #7
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[7], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB3_9
-; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.load22
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[8], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB3_10
-; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.load25
-; NONEON-NOSVE-NEXT:    add x9, x0, #9
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[9], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB3_11
-; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.load28
-; NONEON-NOSVE-NEXT:    add x9, x0, #10
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[10], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB3_12
-; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.load31
-; NONEON-NOSVE-NEXT:    add x9, x0, #11
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[11], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB3_13
-; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.load34
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[12], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB3_14
-; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.load37
-; NONEON-NOSVE-NEXT:    add x9, x0, #13
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[13], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB3_15
-; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.load40
-; NONEON-NOSVE-NEXT:    add x9, x0, #14
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[14], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB3_16
-; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.load43
-; NONEON-NOSVE-NEXT:    add x9, x0, #15
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[15], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #16, .LBB3_17
-; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.load46
-; NONEON-NOSVE-NEXT:    add x9, x0, #16
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[0], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #17, .LBB3_18
-; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.load49
-; NONEON-NOSVE-NEXT:    add x9, x0, #17
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #18, .LBB3_19
-; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.load52
-; NONEON-NOSVE-NEXT:    add x9, x0, #18
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #19, .LBB3_20
-; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.load55
-; NONEON-NOSVE-NEXT:    add x9, x0, #19
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #20, .LBB3_21
-; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.load58
-; NONEON-NOSVE-NEXT:    add x9, x0, #20
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #21, .LBB3_22
-; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.load61
-; NONEON-NOSVE-NEXT:    add x9, x0, #21
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #22, .LBB3_23
-; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.load64
-; NONEON-NOSVE-NEXT:    add x9, x0, #22
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #23, .LBB3_24
-; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.load67
-; NONEON-NOSVE-NEXT:    add x9, x0, #23
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[7], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #24, .LBB3_25
-; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.load70
-; NONEON-NOSVE-NEXT:    add x9, x0, #24
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[8], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #25, .LBB3_26
-; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.load73
-; NONEON-NOSVE-NEXT:    add x9, x0, #25
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[9], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #26, .LBB3_27
-; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.load76
-; NONEON-NOSVE-NEXT:    add x9, x0, #26
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[10], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #27, .LBB3_28
-; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.load79
-; NONEON-NOSVE-NEXT:    add x9, x0, #27
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[11], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #28, .LBB3_29
-; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.load82
-; NONEON-NOSVE-NEXT:    add x9, x0, #28
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[12], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #29, .LBB3_30
-; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.load85
-; NONEON-NOSVE-NEXT:    add x9, x0, #29
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[13], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #30, .LBB3_31
-; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.load88
-; NONEON-NOSVE-NEXT:    add x9, x0, #30
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[14], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_32
-; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.load91
-; NONEON-NOSVE-NEXT:    add x8, x0, #31
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[15], [x8]
-; NONEON-NOSVE-NEXT:    ret
   %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %src, i32 8, <32 x i1> %mask, <32 x i8> zeroinitializer)
   ret <32 x i8> %load
 }
@@ -635,31 +155,6 @@ define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI4_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI4_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v1.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB4_3
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB4_4
-; NONEON-NOSVE-NEXT:  .LBB4_2: // %else2
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB4_3: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB4_2
-; NONEON-NOSVE-NEXT:  .LBB4_4: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x8, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x half> @llvm.masked.load.v2f16(ptr %src, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer)
   ret <2 x half> %load
 }
@@ -675,43 +170,6 @@ define <4 x half> @masked_load_v4f16(ptr %src, <4 x i1> %mask) {
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI5_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI5_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h1, v0.4h
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB5_5
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB5_6
-; NONEON-NOSVE-NEXT:  .LBB5_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB5_7
-; NONEON-NOSVE-NEXT:  .LBB5_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB5_8
-; NONEON-NOSVE-NEXT:  .LBB5_4: // %else8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB5_5: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB5_2
-; NONEON-NOSVE-NEXT:  .LBB5_6: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB5_3
-; NONEON-NOSVE-NEXT:  .LBB5_7: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB5_4
-; NONEON-NOSVE-NEXT:  .LBB5_8: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x8, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x half> @llvm.masked.load.v4f16(ptr %src, i32 8, <4 x i1> %mask, <4 x half> zeroinitializer)
   ret <4 x half> %load
 }
@@ -728,65 +186,6 @@ define <8 x half> @masked_load_v8f16(ptr %src, <8 x i1> %mask) {
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI6_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI6_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b1, v0.8b
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB6_9
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB6_10
-; NONEON-NOSVE-NEXT:  .LBB6_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB6_11
-; NONEON-NOSVE-NEXT:  .LBB6_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB6_12
-; NONEON-NOSVE-NEXT:  .LBB6_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB6_13
-; NONEON-NOSVE-NEXT:  .LBB6_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB6_14
-; NONEON-NOSVE-NEXT:  .LBB6_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB6_15
-; NONEON-NOSVE-NEXT:  .LBB6_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB6_16
-; NONEON-NOSVE-NEXT:  .LBB6_8: // %else20
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB6_9: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB6_2
-; NONEON-NOSVE-NEXT:  .LBB6_10: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB6_3
-; NONEON-NOSVE-NEXT:  .LBB6_11: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB6_4
-; NONEON-NOSVE-NEXT:  .LBB6_12: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB6_5
-; NONEON-NOSVE-NEXT:  .LBB6_13: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB6_6
-; NONEON-NOSVE-NEXT:  .LBB6_14: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #10
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB6_7
-; NONEON-NOSVE-NEXT:  .LBB6_15: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB6_8
-; NONEON-NOSVE-NEXT:  .LBB6_16: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x8, x0, #14
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[7], [x8]
-; NONEON-NOSVE-NEXT:    ret
   %load = call <8 x half> @llvm.masked.load.v8f16(ptr %src, i32 8, <8 x i1> %mask, <8 x half> zeroinitializer)
   ret <8 x half> %load
 }
@@ -811,116 +210,6 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI7_0
-; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    addv h2, v0.8h
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s2
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB7_17
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB7_18
-; NONEON-NOSVE-NEXT:  .LBB7_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB7_19
-; NONEON-NOSVE-NEXT:  .LBB7_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB7_20
-; NONEON-NOSVE-NEXT:  .LBB7_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB7_21
-; NONEON-NOSVE-NEXT:  .LBB7_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB7_22
-; NONEON-NOSVE-NEXT:  .LBB7_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB7_23
-; NONEON-NOSVE-NEXT:  .LBB7_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB7_24
-; NONEON-NOSVE-NEXT:  .LBB7_8: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB7_25
-; NONEON-NOSVE-NEXT:  .LBB7_9: // %else23
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB7_26
-; NONEON-NOSVE-NEXT:  .LBB7_10: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB7_27
-; NONEON-NOSVE-NEXT:  .LBB7_11: // %else29
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB7_28
-; NONEON-NOSVE-NEXT:  .LBB7_12: // %else32
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB7_29
-; NONEON-NOSVE-NEXT:  .LBB7_13: // %else35
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB7_30
-; NONEON-NOSVE-NEXT:  .LBB7_14: // %else38
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB7_31
-; NONEON-NOSVE-NEXT:  .LBB7_15: // %else41
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB7_32
-; NONEON-NOSVE-NEXT:  .LBB7_16: // %else44
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB7_17: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB7_2
-; NONEON-NOSVE-NEXT:  .LBB7_18: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB7_3
-; NONEON-NOSVE-NEXT:  .LBB7_19: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB7_4
-; NONEON-NOSVE-NEXT:  .LBB7_20: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB7_5
-; NONEON-NOSVE-NEXT:  .LBB7_21: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB7_6
-; NONEON-NOSVE-NEXT:  .LBB7_22: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #10
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB7_7
-; NONEON-NOSVE-NEXT:  .LBB7_23: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB7_8
-; NONEON-NOSVE-NEXT:  .LBB7_24: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x9, x0, #14
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[7], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB7_9
-; NONEON-NOSVE-NEXT:  .LBB7_25: // %cond.load22
-; NONEON-NOSVE-NEXT:    add x9, x0, #16
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[0], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB7_10
-; NONEON-NOSVE-NEXT:  .LBB7_26: // %cond.load25
-; NONEON-NOSVE-NEXT:    add x9, x0, #18
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB7_11
-; NONEON-NOSVE-NEXT:  .LBB7_27: // %cond.load28
-; NONEON-NOSVE-NEXT:    add x9, x0, #20
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB7_12
-; NONEON-NOSVE-NEXT:  .LBB7_28: // %cond.load31
-; NONEON-NOSVE-NEXT:    add x9, x0, #22
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB7_13
-; NONEON-NOSVE-NEXT:  .LBB7_29: // %cond.load34
-; NONEON-NOSVE-NEXT:    add x9, x0, #24
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB7_14
-; NONEON-NOSVE-NEXT:  .LBB7_30: // %cond.load37
-; NONEON-NOSVE-NEXT:    add x9, x0, #26
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB7_15
-; NONEON-NOSVE-NEXT:  .LBB7_31: // %cond.load40
-; NONEON-NOSVE-NEXT:    add x9, x0, #28
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB7_16
-; NONEON-NOSVE-NEXT:  .LBB7_32: // %cond.load43
-; NONEON-NOSVE-NEXT:    add x8, x0, #30
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[7], [x8]
-; NONEON-NOSVE-NEXT:    ret
   %load = call <16 x half> @llvm.masked.load.v16f16(ptr %src, i32 8, <16 x i1> %mask, <16 x half> zeroinitializer)
   ret <16 x half> %load
 }
@@ -936,31 +225,6 @@ define <2 x float> @masked_load_v2f32(ptr %src, <2 x i1> %mask) {
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI8_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v1.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB8_3
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB8_4
-; NONEON-NOSVE-NEXT:  .LBB8_2: // %else2
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB8_3: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB8_2
-; NONEON-NOSVE-NEXT:  .LBB8_4: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x8, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[1], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x float> @llvm.masked.load.v2f32(ptr %src, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer)
   ret <2 x float> %load
 }
@@ -977,41 +241,6 @@ define <4 x float> @masked_load_v4f32(ptr %src, <4 x i1> %mask) {
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI9_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI9_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h1, v0.4h
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB9_5
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB9_6
-; NONEON-NOSVE-NEXT:  .LBB9_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB9_7
-; NONEON-NOSVE-NEXT:  .LBB9_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB9_8
-; NONEON-NOSVE-NEXT:  .LBB9_4: // %else8
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB9_5: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB9_2
-; NONEON-NOSVE-NEXT:  .LBB9_6: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB9_3
-; NONEON-NOSVE-NEXT:  .LBB9_7: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB9_4
-; NONEON-NOSVE-NEXT:  .LBB9_8: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x8, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[3], [x8]
-; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x float> @llvm.masked.load.v4f32(ptr %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
   ret <4 x float> %load
 }
@@ -1061,66 +290,6 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) {
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI10_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI10_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    addv b2, v0.8b
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s2
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB10_9
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB10_10
-; NONEON-NOSVE-NEXT:  .LBB10_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB10_11
-; NONEON-NOSVE-NEXT:  .LBB10_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB10_12
-; NONEON-NOSVE-NEXT:  .LBB10_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB10_13
-; NONEON-NOSVE-NEXT:  .LBB10_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB10_14
-; NONEON-NOSVE-NEXT:  .LBB10_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB10_15
-; NONEON-NOSVE-NEXT:  .LBB10_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB10_16
-; NONEON-NOSVE-NEXT:  .LBB10_8: // %else20
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB10_9: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB10_2
-; NONEON-NOSVE-NEXT:  .LBB10_10: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB10_3
-; NONEON-NOSVE-NEXT:  .LBB10_11: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB10_4
-; NONEON-NOSVE-NEXT:  .LBB10_12: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB10_5
-; NONEON-NOSVE-NEXT:  .LBB10_13: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #16
-; NONEON-NOSVE-NEXT:    ld1 { v1.s }[0], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB10_6
-; NONEON-NOSVE-NEXT:  .LBB10_14: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #20
-; NONEON-NOSVE-NEXT:    ld1 { v1.s }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB10_7
-; NONEON-NOSVE-NEXT:  .LBB10_15: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #24
-; NONEON-NOSVE-NEXT:    ld1 { v1.s }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB10_8
-; NONEON-NOSVE-NEXT:  .LBB10_16: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x8, x0, #28
-; NONEON-NOSVE-NEXT:    ld1 { v1.s }[3], [x8]
-; NONEON-NOSVE-NEXT:    ret
   %load = call <8 x float> @llvm.masked.load.v8f32(ptr %src, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer)
   ret <8 x float> %load
 }
@@ -1137,29 +306,6 @@ define <2 x double> @masked_load_v2f64(ptr %src, <2 x i1> %mask) {
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI11_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI11_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v1.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB11_3
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB11_4
-; NONEON-NOSVE-NEXT:  .LBB11_2: // %else2
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB11_3: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB11_2
-; NONEON-NOSVE-NEXT:  .LBB11_4: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x8, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.d }[1], [x8]
-; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x double> @llvm.masked.load.v2f64(ptr %src, i32 8, <2 x i1> %mask, <2 x double> zeroinitializer)
   ret <2 x double> %load
 }
@@ -1185,42 +331,6 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) {
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI12_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI12_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    addv h2, v0.4h
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s2
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB12_5
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB12_6
-; NONEON-NOSVE-NEXT:  .LBB12_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB12_7
-; NONEON-NOSVE-NEXT:  .LBB12_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB12_8
-; NONEON-NOSVE-NEXT:  .LBB12_4: // %else8
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB12_5: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB12_2
-; NONEON-NOSVE-NEXT:  .LBB12_6: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.d }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB12_3
-; NONEON-NOSVE-NEXT:  .LBB12_7: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #16
-; NONEON-NOSVE-NEXT:    ld1 { v1.d }[0], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB12_4
-; NONEON-NOSVE-NEXT:  .LBB12_8: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x8, x0, #24
-; NONEON-NOSVE-NEXT:    ld1 { v1.d }[1], [x8]
-; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x double> @llvm.masked.load.v4f64(ptr %src, i32 8, <4 x i1> %mask, <4 x double> zeroinitializer)
   ret <4 x double> %load
 }
@@ -1246,38 +356,6 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_zext_v3i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #16
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
-; NONEON-NOSVE-NEXT:    bfi w8, w2, #1, #1
-; NONEON-NOSVE-NEXT:    bfi w8, w3, #2, #1
-; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB13_2
-; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB13_3
-; NONEON-NOSVE-NEXT:    b .LBB13_4
-; NONEON-NOSVE-NEXT:  .LBB13_2:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB13_4
-; NONEON-NOSVE-NEXT:  .LBB13_3: // %cond.load1
-; NONEON-NOSVE-NEXT:    mov v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    mov v1.h[2], v0.h[2]
-; NONEON-NOSVE-NEXT:    fmov d0, d1
-; NONEON-NOSVE-NEXT:  .LBB13_4: // %else2
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB13_6
-; NONEON-NOSVE-NEXT:  // %bb.5: // %cond.load4
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v0.h[1]
-; NONEON-NOSVE-NEXT:    add x8, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
-; NONEON-NOSVE-NEXT:  .LBB13_6: // %else5
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer)
   %extend = zext <3 x i16> %load_value to <3 x i32>
   ret <3 x i32> %extend;
@@ -1304,38 +382,6 @@ define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_load_sext_v3i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #16
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
-; NONEON-NOSVE-NEXT:    bfi w8, w2, #1, #1
-; NONEON-NOSVE-NEXT:    bfi w8, w3, #2, #1
-; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB14_2
-; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB14_3
-; NONEON-NOSVE-NEXT:    b .LBB14_4
-; NONEON-NOSVE-NEXT:  .LBB14_2:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB14_4
-; NONEON-NOSVE-NEXT:  .LBB14_3: // %cond.load1
-; NONEON-NOSVE-NEXT:    mov v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    mov v1.h[2], v0.h[2]
-; NONEON-NOSVE-NEXT:    fmov d0, d1
-; NONEON-NOSVE-NEXT:  .LBB14_4: // %else2
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB14_6
-; NONEON-NOSVE-NEXT:  // %bb.5: // %cond.load4
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v0.h[1]
-; NONEON-NOSVE-NEXT:    add x8, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
-; NONEON-NOSVE-NEXT:  .LBB14_6: // %else5
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
-; NONEON-NOSVE-NEXT:    ret
   %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer)
   %extend = sext <3 x i16> %load_value to <3 x i32>
   ret <3 x i32> %extend;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index b175dcf3e9a0..f2b3f9b12ea7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -20,37 +19,6 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_store_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI0_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB0_5
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB0_6
-; NONEON-NOSVE-NEXT:  .LBB0_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB0_7
-; NONEON-NOSVE-NEXT:  .LBB0_3: // %else4
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB0_8
-; NONEON-NOSVE-NEXT:  .LBB0_4: // %else6
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB0_5: // %cond.store
-; NONEON-NOSVE-NEXT:    strb wzr, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB0_2
-; NONEON-NOSVE-NEXT:  .LBB0_6: // %cond.store1
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB0_3
-; NONEON-NOSVE-NEXT:  .LBB0_7: // %cond.store3
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB0_4
-; NONEON-NOSVE-NEXT:  .LBB0_8: // %cond.store5
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
-; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4i8(<4 x i8> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
 }
@@ -66,57 +34,6 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_store_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI1_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI1_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB1_9
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB1_10
-; NONEON-NOSVE-NEXT:  .LBB1_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB1_11
-; NONEON-NOSVE-NEXT:  .LBB1_3: // %else4
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB1_12
-; NONEON-NOSVE-NEXT:  .LBB1_4: // %else6
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB1_13
-; NONEON-NOSVE-NEXT:  .LBB1_5: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB1_14
-; NONEON-NOSVE-NEXT:  .LBB1_6: // %else10
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB1_15
-; NONEON-NOSVE-NEXT:  .LBB1_7: // %else12
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB1_16
-; NONEON-NOSVE-NEXT:  .LBB1_8: // %else14
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB1_9: // %cond.store
-; NONEON-NOSVE-NEXT:    strb wzr, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB1_2
-; NONEON-NOSVE-NEXT:  .LBB1_10: // %cond.store1
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB1_3
-; NONEON-NOSVE-NEXT:  .LBB1_11: // %cond.store3
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB1_4
-; NONEON-NOSVE-NEXT:  .LBB1_12: // %cond.store5
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB1_5
-; NONEON-NOSVE-NEXT:  .LBB1_13: // %cond.store7
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #4]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB1_6
-; NONEON-NOSVE-NEXT:  .LBB1_14: // %cond.store9
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #5]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB1_7
-; NONEON-NOSVE-NEXT:  .LBB1_15: // %cond.store11
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #6]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB1_8
-; NONEON-NOSVE-NEXT:  .LBB1_16: // %cond.store13
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
-; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8i8(<8 x i8> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
 }
@@ -132,99 +49,6 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_store_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
-; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB2_17
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB2_18
-; NONEON-NOSVE-NEXT:  .LBB2_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB2_19
-; NONEON-NOSVE-NEXT:  .LBB2_3: // %else4
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB2_20
-; NONEON-NOSVE-NEXT:  .LBB2_4: // %else6
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB2_21
-; NONEON-NOSVE-NEXT:  .LBB2_5: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB2_22
-; NONEON-NOSVE-NEXT:  .LBB2_6: // %else10
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB2_23
-; NONEON-NOSVE-NEXT:  .LBB2_7: // %else12
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB2_24
-; NONEON-NOSVE-NEXT:  .LBB2_8: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB2_25
-; NONEON-NOSVE-NEXT:  .LBB2_9: // %else16
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB2_26
-; NONEON-NOSVE-NEXT:  .LBB2_10: // %else18
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB2_27
-; NONEON-NOSVE-NEXT:  .LBB2_11: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB2_28
-; NONEON-NOSVE-NEXT:  .LBB2_12: // %else22
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB2_29
-; NONEON-NOSVE-NEXT:  .LBB2_13: // %else24
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB2_30
-; NONEON-NOSVE-NEXT:  .LBB2_14: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB2_31
-; NONEON-NOSVE-NEXT:  .LBB2_15: // %else28
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB2_32
-; NONEON-NOSVE-NEXT:  .LBB2_16: // %else30
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB2_17: // %cond.store
-; NONEON-NOSVE-NEXT:    strb wzr, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB2_2
-; NONEON-NOSVE-NEXT:  .LBB2_18: // %cond.store1
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB2_3
-; NONEON-NOSVE-NEXT:  .LBB2_19: // %cond.store3
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB2_4
-; NONEON-NOSVE-NEXT:  .LBB2_20: // %cond.store5
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB2_5
-; NONEON-NOSVE-NEXT:  .LBB2_21: // %cond.store7
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #4]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB2_6
-; NONEON-NOSVE-NEXT:  .LBB2_22: // %cond.store9
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #5]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB2_7
-; NONEON-NOSVE-NEXT:  .LBB2_23: // %cond.store11
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #6]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB2_8
-; NONEON-NOSVE-NEXT:  .LBB2_24: // %cond.store13
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
-; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB2_9
-; NONEON-NOSVE-NEXT:  .LBB2_25: // %cond.store15
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #8]
-; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB2_10
-; NONEON-NOSVE-NEXT:  .LBB2_26: // %cond.store17
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #9]
-; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB2_11
-; NONEON-NOSVE-NEXT:  .LBB2_27: // %cond.store19
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #10]
-; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB2_12
-; NONEON-NOSVE-NEXT:  .LBB2_28: // %cond.store21
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #11]
-; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB2_13
-; NONEON-NOSVE-NEXT:  .LBB2_29: // %cond.store23
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #12]
-; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB2_14
-; NONEON-NOSVE-NEXT:  .LBB2_30: // %cond.store25
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #13]
-; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB2_15
-; NONEON-NOSVE-NEXT:  .LBB2_31: // %cond.store27
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #14]
-; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB2_16
-; NONEON-NOSVE-NEXT:  .LBB2_32: // %cond.store29
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #15]
-; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
   ret void
 }
@@ -305,244 +129,6 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_store_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
-; NONEON-NOSVE-NEXT:    fmov s1, w1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #80]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88]
-; NONEON-NOSVE-NEXT:    mov v1.b[1], w2
-; NONEON-NOSVE-NEXT:    mov v0.b[1], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp]
-; NONEON-NOSVE-NEXT:    mov v1.b[2], w3
-; NONEON-NOSVE-NEXT:    mov v0.b[2], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
-; NONEON-NOSVE-NEXT:    mov v1.b[3], w4
-; NONEON-NOSVE-NEXT:    mov v0.b[3], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104]
-; NONEON-NOSVE-NEXT:    mov v1.b[4], w5
-; NONEON-NOSVE-NEXT:    mov v0.b[4], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
-; NONEON-NOSVE-NEXT:    mov v1.b[5], w6
-; NONEON-NOSVE-NEXT:    mov v0.b[5], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120]
-; NONEON-NOSVE-NEXT:    mov v1.b[6], w7
-; NONEON-NOSVE-NEXT:    mov v0.b[6], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
-; NONEON-NOSVE-NEXT:    mov v1.b[7], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
-; NONEON-NOSVE-NEXT:    mov v0.b[7], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136]
-; NONEON-NOSVE-NEXT:    mov v1.b[8], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
-; NONEON-NOSVE-NEXT:    mov v0.b[8], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
-; NONEON-NOSVE-NEXT:    mov v1.b[9], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
-; NONEON-NOSVE-NEXT:    mov v0.b[9], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #152]
-; NONEON-NOSVE-NEXT:    mov v1.b[10], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
-; NONEON-NOSVE-NEXT:    mov v0.b[10], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
-; NONEON-NOSVE-NEXT:    mov v1.b[11], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
-; NONEON-NOSVE-NEXT:    mov v0.b[11], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #168]
-; NONEON-NOSVE-NEXT:    mov v1.b[12], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
-; NONEON-NOSVE-NEXT:    mov v0.b[12], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
-; NONEON-NOSVE-NEXT:    mov v1.b[13], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #56]
-; NONEON-NOSVE-NEXT:    mov v0.b[13], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184]
-; NONEON-NOSVE-NEXT:    mov v1.b[14], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #64]
-; NONEON-NOSVE-NEXT:    mov v0.b[14], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
-; NONEON-NOSVE-NEXT:    mov v1.b[15], w9
-; NONEON-NOSVE-NEXT:    mov v0.b[15], w8
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
-; NONEON-NOSVE-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
-; NONEON-NOSVE-NEXT:    shl v1.16b, v1.16b, #7
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    cmlt v1.16b, v1.16b, #0
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    addv h1, v1.8h
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    bfi w8, w9, #16, #16
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_33
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_34
-; NONEON-NOSVE-NEXT:  .LBB3_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_35
-; NONEON-NOSVE-NEXT:  .LBB3_3: // %else4
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_36
-; NONEON-NOSVE-NEXT:  .LBB3_4: // %else6
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_37
-; NONEON-NOSVE-NEXT:  .LBB3_5: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_38
-; NONEON-NOSVE-NEXT:  .LBB3_6: // %else10
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_39
-; NONEON-NOSVE-NEXT:  .LBB3_7: // %else12
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_40
-; NONEON-NOSVE-NEXT:  .LBB3_8: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_41
-; NONEON-NOSVE-NEXT:  .LBB3_9: // %else16
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_42
-; NONEON-NOSVE-NEXT:  .LBB3_10: // %else18
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_43
-; NONEON-NOSVE-NEXT:  .LBB3_11: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_44
-; NONEON-NOSVE-NEXT:  .LBB3_12: // %else22
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_45
-; NONEON-NOSVE-NEXT:  .LBB3_13: // %else24
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_46
-; NONEON-NOSVE-NEXT:  .LBB3_14: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_47
-; NONEON-NOSVE-NEXT:  .LBB3_15: // %else28
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_48
-; NONEON-NOSVE-NEXT:  .LBB3_16: // %else30
-; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_49
-; NONEON-NOSVE-NEXT:  .LBB3_17: // %else32
-; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_50
-; NONEON-NOSVE-NEXT:  .LBB3_18: // %else34
-; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_51
-; NONEON-NOSVE-NEXT:  .LBB3_19: // %else36
-; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_52
-; NONEON-NOSVE-NEXT:  .LBB3_20: // %else38
-; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_53
-; NONEON-NOSVE-NEXT:  .LBB3_21: // %else40
-; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_54
-; NONEON-NOSVE-NEXT:  .LBB3_22: // %else42
-; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_55
-; NONEON-NOSVE-NEXT:  .LBB3_23: // %else44
-; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_56
-; NONEON-NOSVE-NEXT:  .LBB3_24: // %else46
-; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_57
-; NONEON-NOSVE-NEXT:  .LBB3_25: // %else48
-; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_58
-; NONEON-NOSVE-NEXT:  .LBB3_26: // %else50
-; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_59
-; NONEON-NOSVE-NEXT:  .LBB3_27: // %else52
-; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_60
-; NONEON-NOSVE-NEXT:  .LBB3_28: // %else54
-; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_61
-; NONEON-NOSVE-NEXT:  .LBB3_29: // %else56
-; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_62
-; NONEON-NOSVE-NEXT:  .LBB3_30: // %else58
-; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_63
-; NONEON-NOSVE-NEXT:  .LBB3_31: // %else60
-; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_64
-; NONEON-NOSVE-NEXT:  .LBB3_32: // %else62
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB3_33: // %cond.store
-; NONEON-NOSVE-NEXT:    strb wzr, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB3_2
-; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.store1
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB3_3
-; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.store3
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB3_4
-; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.store5
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB3_5
-; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.store7
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #4]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB3_6
-; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.store9
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #5]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB3_7
-; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.store11
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #6]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB3_8
-; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.store13
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
-; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB3_9
-; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.store15
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #8]
-; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB3_10
-; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.store17
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #9]
-; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB3_11
-; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.store19
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #10]
-; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB3_12
-; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.store21
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #11]
-; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB3_13
-; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.store23
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #12]
-; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB3_14
-; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.store25
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #13]
-; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB3_15
-; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.store27
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #14]
-; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB3_16
-; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.store29
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #15]
-; NONEON-NOSVE-NEXT:    tbz w8, #16, .LBB3_17
-; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.store31
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #16]
-; NONEON-NOSVE-NEXT:    tbz w8, #17, .LBB3_18
-; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.store33
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #17]
-; NONEON-NOSVE-NEXT:    tbz w8, #18, .LBB3_19
-; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.store35
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #18]
-; NONEON-NOSVE-NEXT:    tbz w8, #19, .LBB3_20
-; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.store37
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #19]
-; NONEON-NOSVE-NEXT:    tbz w8, #20, .LBB3_21
-; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.store39
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #20]
-; NONEON-NOSVE-NEXT:    tbz w8, #21, .LBB3_22
-; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.store41
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #21]
-; NONEON-NOSVE-NEXT:    tbz w8, #22, .LBB3_23
-; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.store43
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #22]
-; NONEON-NOSVE-NEXT:    tbz w8, #23, .LBB3_24
-; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.store45
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #23]
-; NONEON-NOSVE-NEXT:    tbz w8, #24, .LBB3_25
-; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.store47
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #24]
-; NONEON-NOSVE-NEXT:    tbz w8, #25, .LBB3_26
-; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.store49
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #25]
-; NONEON-NOSVE-NEXT:    tbz w8, #26, .LBB3_27
-; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.store51
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #26]
-; NONEON-NOSVE-NEXT:    tbz w8, #27, .LBB3_28
-; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.store53
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #27]
-; NONEON-NOSVE-NEXT:    tbz w8, #28, .LBB3_29
-; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.store55
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #28]
-; NONEON-NOSVE-NEXT:    tbz w8, #29, .LBB3_30
-; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.store57
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #29]
-; NONEON-NOSVE-NEXT:    tbz w8, #30, .LBB3_31
-; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.store59
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #30]
-; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_32
-; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.store61
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #31]
-; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, ptr %dst, i32 8, <32 x i1> %mask)
   ret void
 }
@@ -568,29 +154,6 @@ define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) {
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_store_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI4_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI4_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB4_3
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB4_4
-; NONEON-NOSVE-NEXT:  .LBB4_2: // %else2
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB4_3: // %cond.store
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB4_2
-; NONEON-NOSVE-NEXT:  .LBB4_4: // %cond.store1
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #2]
-; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v2f16(<2 x half> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask)
   ret void
 }
@@ -606,41 +169,6 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_store_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI5_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI5_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB5_5
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB5_6
-; NONEON-NOSVE-NEXT:  .LBB5_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB5_7
-; NONEON-NOSVE-NEXT:  .LBB5_3: // %else4
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB5_8
-; NONEON-NOSVE-NEXT:  .LBB5_4: // %else6
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB5_5: // %cond.store
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB5_2
-; NONEON-NOSVE-NEXT:  .LBB5_6: // %cond.store1
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #2]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB5_3
-; NONEON-NOSVE-NEXT:  .LBB5_7: // %cond.store3
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #4]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB5_4
-; NONEON-NOSVE-NEXT:  .LBB5_8: // %cond.store5
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #6]
-; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f16(<4 x half> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
 }
@@ -657,65 +185,6 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_store_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI6_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI6_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB6_9
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB6_10
-; NONEON-NOSVE-NEXT:  .LBB6_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB6_11
-; NONEON-NOSVE-NEXT:  .LBB6_3: // %else4
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB6_12
-; NONEON-NOSVE-NEXT:  .LBB6_4: // %else6
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB6_13
-; NONEON-NOSVE-NEXT:  .LBB6_5: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB6_14
-; NONEON-NOSVE-NEXT:  .LBB6_6: // %else10
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB6_15
-; NONEON-NOSVE-NEXT:  .LBB6_7: // %else12
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB6_16
-; NONEON-NOSVE-NEXT:  .LBB6_8: // %else14
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB6_9: // %cond.store
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB6_2
-; NONEON-NOSVE-NEXT:  .LBB6_10: // %cond.store1
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #2]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB6_3
-; NONEON-NOSVE-NEXT:  .LBB6_11: // %cond.store3
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #4]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB6_4
-; NONEON-NOSVE-NEXT:  .LBB6_12: // %cond.store5
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #6]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB6_5
-; NONEON-NOSVE-NEXT:  .LBB6_13: // %cond.store7
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #8]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB6_6
-; NONEON-NOSVE-NEXT:  .LBB6_14: // %cond.store9
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #10]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB6_7
-; NONEON-NOSVE-NEXT:  .LBB6_15: // %cond.store11
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #12]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB6_8
-; NONEON-NOSVE-NEXT:  .LBB6_16: // %cond.store13
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #14]
-; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
 }
@@ -740,115 +209,6 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; CHECK-NEXT:    st1h { z1.h }, p1, [x0, x8, lsl #1]
 ; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_store_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI7_0
-; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB7_17
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB7_18
-; NONEON-NOSVE-NEXT:  .LBB7_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB7_19
-; NONEON-NOSVE-NEXT:  .LBB7_3: // %else4
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB7_20
-; NONEON-NOSVE-NEXT:  .LBB7_4: // %else6
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB7_21
-; NONEON-NOSVE-NEXT:  .LBB7_5: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB7_22
-; NONEON-NOSVE-NEXT:  .LBB7_6: // %else10
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB7_23
-; NONEON-NOSVE-NEXT:  .LBB7_7: // %else12
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB7_24
-; NONEON-NOSVE-NEXT:  .LBB7_8: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB7_25
-; NONEON-NOSVE-NEXT:  .LBB7_9: // %else16
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB7_26
-; NONEON-NOSVE-NEXT:  .LBB7_10: // %else18
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB7_27
-; NONEON-NOSVE-NEXT:  .LBB7_11: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB7_28
-; NONEON-NOSVE-NEXT:  .LBB7_12: // %else22
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB7_29
-; NONEON-NOSVE-NEXT:  .LBB7_13: // %else24
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB7_30
-; NONEON-NOSVE-NEXT:  .LBB7_14: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB7_31
-; NONEON-NOSVE-NEXT:  .LBB7_15: // %else28
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB7_32
-; NONEON-NOSVE-NEXT:  .LBB7_16: // %else30
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB7_17: // %cond.store
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB7_2
-; NONEON-NOSVE-NEXT:  .LBB7_18: // %cond.store1
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #2]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB7_3
-; NONEON-NOSVE-NEXT:  .LBB7_19: // %cond.store3
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #4]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB7_4
-; NONEON-NOSVE-NEXT:  .LBB7_20: // %cond.store5
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #6]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB7_5
-; NONEON-NOSVE-NEXT:  .LBB7_21: // %cond.store7
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #8]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB7_6
-; NONEON-NOSVE-NEXT:  .LBB7_22: // %cond.store9
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #10]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB7_7
-; NONEON-NOSVE-NEXT:  .LBB7_23: // %cond.store11
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #12]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB7_8
-; NONEON-NOSVE-NEXT:  .LBB7_24: // %cond.store13
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #14]
-; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB7_9
-; NONEON-NOSVE-NEXT:  .LBB7_25: // %cond.store15
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #16]
-; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB7_10
-; NONEON-NOSVE-NEXT:  .LBB7_26: // %cond.store17
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #18]
-; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB7_11
-; NONEON-NOSVE-NEXT:  .LBB7_27: // %cond.store19
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #20]
-; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB7_12
-; NONEON-NOSVE-NEXT:  .LBB7_28: // %cond.store21
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #22]
-; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB7_13
-; NONEON-NOSVE-NEXT:  .LBB7_29: // %cond.store23
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #24]
-; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB7_14
-; NONEON-NOSVE-NEXT:  .LBB7_30: // %cond.store25
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #26]
-; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB7_15
-; NONEON-NOSVE-NEXT:  .LBB7_31: // %cond.store27
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #28]
-; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB7_16
-; NONEON-NOSVE-NEXT:  .LBB7_32: // %cond.store29
-; NONEON-NOSVE-NEXT:    fmov s0, wzr
-; NONEON-NOSVE-NEXT:    str h0, [x0, #30]
-; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v16f16(<16 x half> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
   ret void
 }
@@ -865,37 +225,6 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_store_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI8_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB8_5
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB8_6
-; NONEON-NOSVE-NEXT:  .LBB8_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB8_7
-; NONEON-NOSVE-NEXT:  .LBB8_3: // %else4
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB8_8
-; NONEON-NOSVE-NEXT:  .LBB8_4: // %else6
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB8_5: // %cond.store
-; NONEON-NOSVE-NEXT:    str wzr, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB8_2
-; NONEON-NOSVE-NEXT:  .LBB8_6: // %cond.store1
-; NONEON-NOSVE-NEXT:    str wzr, [x0, #4]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB8_3
-; NONEON-NOSVE-NEXT:  .LBB8_7: // %cond.store3
-; NONEON-NOSVE-NEXT:    str wzr, [x0, #8]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB8_4
-; NONEON-NOSVE-NEXT:  .LBB8_8: // %cond.store5
-; NONEON-NOSVE-NEXT:    str wzr, [x0, #12]
-; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
 }
@@ -946,57 +275,6 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_store_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI9_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI9_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB9_9
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB9_10
-; NONEON-NOSVE-NEXT:  .LBB9_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB9_11
-; NONEON-NOSVE-NEXT:  .LBB9_3: // %else4
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB9_12
-; NONEON-NOSVE-NEXT:  .LBB9_4: // %else6
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB9_13
-; NONEON-NOSVE-NEXT:  .LBB9_5: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB9_14
-; NONEON-NOSVE-NEXT:  .LBB9_6: // %else10
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB9_15
-; NONEON-NOSVE-NEXT:  .LBB9_7: // %else12
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB9_16
-; NONEON-NOSVE-NEXT:  .LBB9_8: // %else14
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB9_9: // %cond.store
-; NONEON-NOSVE-NEXT:    str wzr, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB9_2
-; NONEON-NOSVE-NEXT:  .LBB9_10: // %cond.store1
-; NONEON-NOSVE-NEXT:    str wzr, [x0, #4]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB9_3
-; NONEON-NOSVE-NEXT:  .LBB9_11: // %cond.store3
-; NONEON-NOSVE-NEXT:    str wzr, [x0, #8]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB9_4
-; NONEON-NOSVE-NEXT:  .LBB9_12: // %cond.store5
-; NONEON-NOSVE-NEXT:    str wzr, [x0, #12]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB9_5
-; NONEON-NOSVE-NEXT:  .LBB9_13: // %cond.store7
-; NONEON-NOSVE-NEXT:    str wzr, [x0, #16]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB9_6
-; NONEON-NOSVE-NEXT:  .LBB9_14: // %cond.store9
-; NONEON-NOSVE-NEXT:    str wzr, [x0, #20]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB9_7
-; NONEON-NOSVE-NEXT:  .LBB9_15: // %cond.store11
-; NONEON-NOSVE-NEXT:    str wzr, [x0, #24]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB9_8
-; NONEON-NOSVE-NEXT:  .LBB9_16: // %cond.store13
-; NONEON-NOSVE-NEXT:    str wzr, [x0, #28]
-; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8f32(<8 x float> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
 }
@@ -1013,27 +291,6 @@ define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_store_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI10_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI10_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB10_3
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB10_4
-; NONEON-NOSVE-NEXT:  .LBB10_2: // %else2
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB10_3: // %cond.store
-; NONEON-NOSVE-NEXT:    str xzr, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB10_2
-; NONEON-NOSVE-NEXT:  .LBB10_4: // %cond.store1
-; NONEON-NOSVE-NEXT:    str xzr, [x0, #8]
-; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask)
   ret void
 }
@@ -1058,37 +315,6 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) {
 ; CHECK-NEXT:    st1d { z0.d }, p1, [x0, x8, lsl #3]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: masked_store_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI11_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI11_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB11_5
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB11_6
-; NONEON-NOSVE-NEXT:  .LBB11_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB11_7
-; NONEON-NOSVE-NEXT:  .LBB11_3: // %else4
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB11_8
-; NONEON-NOSVE-NEXT:  .LBB11_4: // %else6
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB11_5: // %cond.store
-; NONEON-NOSVE-NEXT:    str xzr, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB11_2
-; NONEON-NOSVE-NEXT:  .LBB11_6: // %cond.store1
-; NONEON-NOSVE-NEXT:    str xzr, [x0, #8]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB11_3
-; NONEON-NOSVE-NEXT:  .LBB11_7: // %cond.store3
-; NONEON-NOSVE-NEXT:    str xzr, [x0, #16]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB11_4
-; NONEON-NOSVE-NEXT:  .LBB11_8: // %cond.store5
-; NONEON-NOSVE-NEXT:    str xzr, [x0, #24]
-; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f64(<4 x double> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
index d7eaf766e7df..b5adea594242 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -15,15 +14,6 @@ define void @add_v4i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    ldr s1, [x1]
-; NONEON-NOSVE-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    str s0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i8>, ptr %a
   %op2 = load <4 x i8>, ptr %b
   %res = add <4 x i8> %op1, %op2
@@ -39,14 +29,6 @@ define void @add_v8i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z0.b, z0.b, z1.b
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    add v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %op2 = load <8 x i8>, ptr %b
   %res = add <8 x i8> %op1, %op2
@@ -62,14 +44,6 @@ define void @add_v16i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z0.b, z0.b, z1.b
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i8>, ptr %a
   %op2 = load <16 x i8>, ptr %b
   %res = add <16 x i8> %op1, %op2
@@ -86,15 +60,6 @@ define void @add_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.b, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = add <32 x i8> %op1, %op2
@@ -111,23 +76,6 @@ define void @add_v2i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
-; NONEON-NOSVE-NEXT:    ldrh w9, [x1]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    fmov s1, w9
-; NONEON-NOSVE-NEXT:    add x8, x0, #2
-; NONEON-NOSVE-NEXT:    add x9, x1, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    mov w8, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    strh w9, [x0]
-; NONEON-NOSVE-NEXT:    strh w8, [x0, #2]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i16>, ptr %a
   %op2 = load <2 x i16>, ptr %b
   %res = add <2 x i16> %op1, %op2
@@ -143,14 +91,6 @@ define void @add_v4i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    add v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %op2 = load <4 x i16>, ptr %b
   %res = add <4 x i16> %op1, %op2
@@ -166,14 +106,6 @@ define void @add_v8i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %op2 = load <8 x i16>, ptr %b
   %res = add <8 x i16> %op1, %op2
@@ -190,15 +122,6 @@ define void @add_v16i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    add z1.h, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: add_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = add <16 x i16> %op1, %op2
@@ -214,13 +137,6 @@ define void @abs_v2i32(ptr %a) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i32>, ptr %a
   %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
   store <2 x i32> %res, ptr %a
@@ -235,13 +151,6 @@ define void @abs_v4i32(ptr %a) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
   store <4 x i32> %res, ptr %a
@@ -257,14 +166,6 @@ define void @abs_v8i32(ptr %a) {
 ; CHECK-NEXT:    abs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    abs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
   store <8 x i32> %res, ptr %a
@@ -279,13 +180,6 @@ define void @abs_v2i64(ptr %a) {
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
   store <2 x i64> %res, ptr %a
@@ -301,14 +195,6 @@ define void @abs_v4i64(ptr %a) {
 ; CHECK-NEXT:    abs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: abs_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    abs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
   store <4 x i64> %res, ptr %a
@@ -325,17 +211,6 @@ define void @fadd_v2f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    ldr s1, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    str s0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %op2 = load <2 x half>, ptr %b
   %res = fadd <2 x half> %op1, %op2
@@ -352,17 +227,6 @@ define void @fadd_v4f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %op2 = load <4 x half>, ptr %b
   %res = fadd <4 x half> %op1, %op2
@@ -379,21 +243,6 @@ define void @fadd_v8f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %op2 = load <8 x half>, ptr %b
   %res = fadd <8 x half> %op1, %op2
@@ -412,29 +261,6 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fadd v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fadd v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fadd <16 x half> %op1, %op2
@@ -451,14 +277,6 @@ define void @fadd_v2f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    fadd v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %op2 = load <2 x float>, ptr %b
   %res = fadd <2 x float> %op1, %op2
@@ -475,14 +293,6 @@ define void @fadd_v4f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %op2 = load <4 x float>, ptr %b
   %res = fadd <4 x float> %op1, %op2
@@ -501,15 +311,6 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fadd <8 x float> %op1, %op2
@@ -526,14 +327,6 @@ define void @fadd_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x double>, ptr %a
   %op2 = load <2 x double>, ptr %b
   %res = fadd <2 x double> %op1, %op2
@@ -552,15 +345,6 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fadd_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fadd v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fadd <4 x double> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
index f595a4219cac..00413302798c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -16,14 +15,6 @@ define void @test_revbv16i16(ptr %a) {
 ; CHECK-NEXT:    revb z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_revbv16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev16 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14, i32 17, i32 16, i32 19, i32 18, i32 21, i32 20, i32 23, i32 22, i32 undef, i32 24, i32 27, i32 undef, i32 29, i32 28, i32 undef, i32 undef>
   store <32 x i8> %tmp2, ptr %a
@@ -40,14 +31,6 @@ define void @test_revbv8i32(ptr %a) {
 ; CHECK-NEXT:    revb z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_revbv8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev32 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
   store <32 x i8> %tmp2, ptr %a
@@ -64,14 +47,6 @@ define void @test_revbv4i64(ptr %a) {
 ; CHECK-NEXT:    revb z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_revbv4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev64 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 31, i32 30, i32 29, i32 undef, i32 27, i32 undef, i32 undef, i32 undef>
   store <32 x i8> %tmp2, ptr %a
@@ -88,14 +63,6 @@ define void @test_revhv8i32(ptr %a) {
 ; CHECK-NEXT:    revh z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_revhv8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    rev32 v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
   store <16 x i16> %tmp2, ptr %a
@@ -112,14 +79,6 @@ define void @test_revhv8f32(ptr %a) {
 ; CHECK-NEXT:    revh z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_revhv8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    rev32 v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x half>, ptr %a
   %tmp2 = shufflevector <16 x half> %tmp1, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
   store <16 x half> %tmp2, ptr %a
@@ -136,14 +95,6 @@ define void @test_revhv4i64(ptr %a) {
 ; CHECK-NEXT:    revh z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_revhv4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    rev64 v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
   store <16 x i16> %tmp2, ptr %a
@@ -160,14 +111,6 @@ define void @test_revwv4i64(ptr %a) {
 ; CHECK-NEXT:    revw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_revwv4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   store <8 x i32> %tmp2, ptr %a
@@ -184,14 +127,6 @@ define void @test_revwv4f64(ptr %a) {
 ; CHECK-NEXT:    revw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_revwv4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x float>, ptr %a
   %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   store <8 x float> %tmp2, ptr %a
@@ -206,12 +141,6 @@ define <16 x i8> @test_revv16i8(ptr %a) {
 ; CHECK-NEXT:    revb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_revv16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %a
   %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
   ret <16 x i8> %tmp2
@@ -227,14 +156,6 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    revw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_revwv8i32v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = load <8 x i32>, ptr %b
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
@@ -255,18 +176,6 @@ define void @test_revhv32i16(ptr %a) {
 ; CHECK-NEXT:    stp q0, q1, [x0, #32]
 ; CHECK-NEXT:    stp q2, q3, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_revhv32i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    rev64 v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    rev64 v2.8h, v2.8h
-; NONEON-NOSVE-NEXT:    rev64 v3.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i16>, ptr %a
   %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
   store <32 x i16> %tmp2, ptr %a
@@ -282,14 +191,6 @@ define void @test_rev_elts_fail(ptr %a) {
 ; CHECK-NEXT:    tbl z0.d, { z2.d }, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_rev_elts_fail:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   store <4 x i64> %tmp2, ptr %a
@@ -307,15 +208,6 @@ define void @test_revdv4i64_sve2p1(ptr %a) #1 {
 ; CHECK-NEXT:    revd z1.q, p0/m, z1.q
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_revdv4i64_sve2p1:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ptrue p0.d, vl2
-; NONEON-NOSVE-NEXT:    revd z0.q, p0/m, z0.q
-; NONEON-NOSVE-NEXT:    revd z1.q, p0/m, z1.q
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   store <4 x i64> %tmp2, ptr %a
@@ -331,15 +223,6 @@ define void @test_revdv4f64_sve2p1(ptr %a) #1 {
 ; CHECK-NEXT:    revd z1.q, p0/m, z1.q
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_revdv4f64_sve2p1:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ptrue p0.d
-; NONEON-NOSVE-NEXT:    revd z0.q, p0/m, z0.q
-; NONEON-NOSVE-NEXT:    revd z1.q, p0/m, z1.q
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   store <4 x double> %tmp2, ptr %a
@@ -355,16 +238,6 @@ define void @test_revv8i32(ptr %a) {
 ; CHECK-NEXT:    tbl z0.s, { z2.s }, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_revv8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   store <8 x i32> %tmp2, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index df786933da88..cb73030306b0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -69,18 +68,6 @@ define void @zip1_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zip1_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    zip2 v2.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <32 x i8>, ptr %a
   %tmp2 = load volatile <32 x i8>, ptr %b
   %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47>
@@ -209,28 +196,6 @@ define void @zip_v32i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zip_v32i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q4, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q5, q1, [x0]
-; NONEON-NOSVE-NEXT:    ldp q6, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    ldp q7, q3, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v17.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    zip2 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    zip1 v16.8h, v1.8h, v3.8h
-; NONEON-NOSVE-NEXT:    zip2 v1.8h, v1.8h, v3.8h
-; NONEON-NOSVE-NEXT:    zip1 v2.8h, v5.8h, v7.8h
-; NONEON-NOSVE-NEXT:    zip1 v3.8h, v4.8h, v6.8h
-; NONEON-NOSVE-NEXT:    zip2 v5.8h, v5.8h, v7.8h
-; NONEON-NOSVE-NEXT:    zip2 v4.8h, v4.8h, v6.8h
-; NONEON-NOSVE-NEXT:    add v6.8h, v16.8h, v17.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    add v2.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    stp q6, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i16>, ptr %a
   %tmp2 = load <32 x i16>, ptr %b
   %tmp3 = shufflevector <32 x i16> %tmp1, <32 x i16> %tmp2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47>
@@ -279,18 +244,6 @@ define void @zip1_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zip1_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    zip2 v2.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    zip1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <16 x i16>, ptr %a
   %tmp2 = load volatile <16 x i16>, ptr %b
   %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -323,18 +276,6 @@ define void @zip1_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zip1_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = load volatile <8 x i32>, ptr %b
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -357,19 +298,6 @@ define void @zip_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zip_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.2d, v1.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip1 v5.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    zip2 v1.2d, v1.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip2 v0.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fadd v2.2d, v4.2d, v5.2d
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
   %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -402,16 +330,6 @@ define void @zip_v4i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zip_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    zip2 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %a
   %tmp2 = load <4 x i32>, ptr %b
   %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -433,16 +351,6 @@ define void @zip1_v8i32_undef(ptr %a) {
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zip1_v8i32_undef:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    zip2 v1.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load  volatile <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
   store volatile <8 x i32> %tmp2, ptr %a
@@ -462,19 +370,6 @@ define void @trn_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.b, z1.b, z2.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trn_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    trn1 v4.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    trn2 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    trn1 v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    trn2 v2.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v4.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = load <32 x i8>, ptr %b
   %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
@@ -497,19 +392,6 @@ define void @trn_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z0.h, z1.h, z0.h
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trn_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
-; NONEON-NOSVE-NEXT:    adrp x9, .LCPI8_1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x9, :lo12:.LCPI8_1]
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v1.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    tbl v1.16b, { v1.16b }, v2.16b
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %a
   %tmp2 = load <8 x i16>, ptr %b
   %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 7, i32 2, i32 6, i32 4, i32 5, i32 1, i32 3>
@@ -532,19 +414,6 @@ define void @trn_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.h, z1.h, z2.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trn_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    trn1 v4.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    trn2 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    trn1 v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    trn2 v2.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v4.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v2.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = load <16 x i16>, ptr %b
   %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -567,19 +436,6 @@ define void @trn_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.s, z1.s, z2.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trn_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    trn1 v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    trn2 v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v4.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = load <8 x i32>, ptr %b
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14>
@@ -603,19 +459,6 @@ define void @trn_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trn_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    zip2 v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    zip1 v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip2 v2.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v4.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fadd v1.2d, v1.2d, v2.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
   %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -636,16 +479,6 @@ define void @trn_v4f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z2.s
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trn_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    trn1 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x float>, ptr %a
   %tmp2 = load <4 x float>, ptr %b
   %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -667,18 +500,6 @@ define void @trn_v8i32_undef(ptr %a) {
 ; CHECK-NEXT:    add z1.s, z3.s, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trn_v8i32_undef:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    trn1 v2.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    trn2 v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    trn1 v3.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    trn2 v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v3.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -750,18 +571,6 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zip2_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    zip2 v2.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <32 x i8>, ptr %a
   %tmp2 = load volatile <32 x i8>, ptr %b
   %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> <i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
@@ -808,18 +617,6 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zip2_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    zip2 v2.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    zip1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <16 x i16>, ptr %a
   %tmp2 = load volatile <16 x i16>, ptr %b
   %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -852,18 +649,6 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zip2_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = load volatile <8 x i32>, ptr %b
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -883,16 +668,6 @@ define void @zip2_v8i32_undef(ptr %a) #0{
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zip2_v8i32_undef:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    zip2 v1.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
   store volatile <8 x i32> %tmp2, ptr %a
@@ -1094,19 +869,6 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uzp_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    uzp1 v4.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    uzp2 v2.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v4.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = load <32 x i8>, ptr %b
   %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
@@ -1129,17 +891,6 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    add z0.h, z1.h, z0.h
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uzp_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ext v1.8b, v0.8b, v0.8b, #6
-; NONEON-NOSVE-NEXT:    ext v2.8b, v0.8b, v0.8b, #2
-; NONEON-NOSVE-NEXT:    trn1 v1.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    zip1 v0.4h, v2.4h, v0.4h
-; NONEON-NOSVE-NEXT:    add v0.4h, v1.4h, v0.4h
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %a
   %tmp2 = load <4 x i16>, ptr %b
   %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
@@ -1257,19 +1008,6 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uzp_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    uzp1 v4.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp2 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v4.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v2.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = load <16 x i16>, ptr %b
   %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1309,19 +1047,6 @@ define void @uzp_v8f32(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uzp_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp2 v2.4s, v3.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v4.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x float>, ptr %a
   %tmp2 = load <8 x float>, ptr %b
   %tmp3 = shufflevector <8 x float> %tmp1, <8 x float> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 6, i32 undef, i32 10, i32 12, i32 14>
@@ -1344,19 +1069,6 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    add z1.d, z1.d, z2.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uzp_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    zip2 v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    zip1 v1.2d, v3.2d, v2.2d
-; NONEON-NOSVE-NEXT:    zip2 v2.2d, v3.2d, v2.2d
-; NONEON-NOSVE-NEXT:    add v0.2d, v4.2d, v0.2d
-; NONEON-NOSVE-NEXT:    add v1.2d, v1.2d, v2.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = load <4 x i64>, ptr %b
   %tmp3 = shufflevector <4 x i64> %tmp1, <4 x i64> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1424,16 +1136,6 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uzp_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %a
   %tmp2 = load <8 x i16>, ptr %b
   %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1472,15 +1174,6 @@ define void @uzp_v8i32_undef(ptr %a) #0{
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: uzp_v8i32_undef:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 0, i32 2, i32 4, i32 6>
   %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 1, i32 3, i32 5, i32 7>
@@ -1504,19 +1197,6 @@ define void @zip_vscale2_4(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: zip_vscale2_4:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.2d, v1.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip1 v5.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    zip2 v1.2d, v1.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip2 v0.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fadd v2.2d, v4.2d, v5.2d
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
   %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
index 6b3c85f59357..ab7c42b3e9e3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -36,23 +35,6 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ptest_v16i1:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v1.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v3.4s, v3.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v2.4s, v2.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
-; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
   %v2 = fcmp une <16 x float> %v1, zeroinitializer
@@ -110,33 +92,6 @@ define i1 @ptest_or_v16i1(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ptest_or_v16i1:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v1.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v3.4s, v3.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v2.4s, v2.4s, #0.0
-; NONEON-NOSVE-NEXT:    ldp q6, q7, [x1]
-; NONEON-NOSVE-NEXT:    fcmeq v4.4s, v4.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v5.4s, v5.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    fcmeq v7.4s, v7.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v6.4s, v6.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v6.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orn v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
-; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
   %v2 = fcmp une <16 x float> %v1, zeroinitializer
@@ -204,33 +159,6 @@ define i1 @ptest_and_v16i1(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: ptest_and_v16i1:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v1.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v3.4s, v3.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v2.4s, v2.4s, #0.0
-; NONEON-NOSVE-NEXT:    ldp q6, q7, [x1]
-; NONEON-NOSVE-NEXT:    fcmeq v4.4s, v4.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v5.4s, v5.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    fcmeq v7.4s, v7.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v6.4s, v6.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v6.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
-; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
   %v2 = fcmp une <16 x float> %v1, zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
index 0a7352bf4944..bfa931044bc5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -19,13 +18,6 @@ define <4 x i8> @bitreverse_v4i8(<4 x i8> %op) {
 ; CHECK-NEXT:    lsr z0.h, z0.h, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitreverse_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ushr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
 }
@@ -38,11 +30,6 @@ define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitreverse_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
@@ -55,11 +42,6 @@ define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitreverse_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
@@ -73,14 +55,6 @@ define void @bitreverse_v32i8(ptr %a) {
 ; CHECK-NEXT:    rbit z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitreverse_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op)
   store <32 x i8> %res, ptr %a
@@ -96,13 +70,6 @@ define <2 x i16> @bitreverse_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    lsr z0.s, z0.s, #16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitreverse_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -115,12 +82,6 @@ define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitreverse_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -133,12 +94,6 @@ define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitreverse_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -152,16 +107,6 @@ define void @bitreverse_v16i16(ptr %a) {
 ; CHECK-NEXT:    rbit z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitreverse_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev16 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -176,12 +121,6 @@ define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitreverse_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -194,12 +133,6 @@ define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitreverse_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -213,16 +146,6 @@ define void @bitreverse_v8i32(ptr %a) {
 ; CHECK-NEXT:    rbit z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitreverse_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev32 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -237,12 +160,6 @@ define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitreverse_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -255,12 +172,6 @@ define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitreverse_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -274,16 +185,6 @@ define void @bitreverse_v4i64(ptr %a) {
 ; CHECK-NEXT:    rbit z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bitreverse_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev64 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
@@ -303,12 +204,6 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    lsr z0.s, z0.s, #16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bswap_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -321,11 +216,6 @@ define <4 x i16> @bswap_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    revb z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bswap_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -338,11 +228,6 @@ define <8 x i16> @bswap_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    revb z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bswap_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -356,14 +241,6 @@ define void @bswap_v16i16(ptr %a) {
 ; CHECK-NEXT:    revb z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bswap_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev16 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -378,11 +255,6 @@ define <2 x i32> @bswap_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    revb z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bswap_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -395,11 +267,6 @@ define <4 x i32> @bswap_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    revb z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bswap_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -413,14 +280,6 @@ define void @bswap_v8i32(ptr %a) {
 ; CHECK-NEXT:    revb z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bswap_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev32 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -435,11 +294,6 @@ define <1 x i64> @bswap_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    revb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bswap_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -452,11 +306,6 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    revb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bswap_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -470,14 +319,6 @@ define void @bswap_v4i64(ptr %a) {
 ; CHECK-NEXT:    revb z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: bswap_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev64 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
index d86c7d36a104..9dd42e7831e0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -15,19 +14,6 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1) {
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v1.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #7
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    usra v0.4h, v1.4h, #3
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #5
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i8> %op1, shufflevector (<4 x i8> insertelement (<4 x i8> poison, i8 32, i32 0), <4 x i8> poison, <4 x i32> zeroinitializer)
   ret <4 x i8> %res
 }
@@ -40,13 +26,6 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) {
 ; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    usra v0.8b, v1.8b, #3
-; NONEON-NOSVE-NEXT:    sshr v0.8b, v0.8b, #5
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i8> %op1, shufflevector (<8 x i8> insertelement (<8 x i8> poison, i8 32, i32 0), <8 x i8> poison, <8 x i32> zeroinitializer)
   ret <8 x i8> %res
 }
@@ -59,13 +38,6 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) {
 ; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    usra v0.16b, v1.16b, #3
-; NONEON-NOSVE-NEXT:    sshr v0.16b, v0.16b, #5
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <16 x i8> %op1, shufflevector (<16 x i8> insertelement (<16 x i8> poison, i8 32, i32 0), <16 x i8> poison, <16 x i32> zeroinitializer)
   ret <16 x i8> %res
 }
@@ -79,18 +51,6 @@ define void @sdiv_v32i8(ptr %a) {
 ; CHECK-NEXT:    asrd z1.b, p0/m, z1.b, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v2.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    cmlt v3.16b, v1.16b, #0
-; NONEON-NOSVE-NEXT:    usra v0.16b, v2.16b, #3
-; NONEON-NOSVE-NEXT:    usra v1.16b, v3.16b, #3
-; NONEON-NOSVE-NEXT:    sshr v0.16b, v0.16b, #5
-; NONEON-NOSVE-NEXT:    sshr v1.16b, v1.16b, #5
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %res = sdiv <32 x i8> %op1, shufflevector (<32 x i8> insertelement (<32 x i8> poison, i8 32, i32 0), <32 x i8> poison, <32 x i32> zeroinitializer)
   store <32 x i8> %res, ptr %a
@@ -106,20 +66,6 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1) {
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v1.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    sshr v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    ushr v1.2s, v1.2s, #26
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #5
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i16> %op1, shufflevector (<2 x i16> insertelement (<2 x i16> poison, i16 32, i32 0), <2 x i16> poison, <2 x i32> zeroinitializer)
   ret <2 x i16> %res
 }
@@ -132,13 +78,6 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) {
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    usra v0.4h, v1.4h, #11
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #5
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i16> %op1, shufflevector (<4 x i16> insertelement (<4 x i16> poison, i16 32, i32 0), <4 x i16> poison, <4 x i32> zeroinitializer)
   ret <4 x i16> %res
 }
@@ -151,13 +90,6 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) {
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.8h, v0.8h, #0
-; NONEON-NOSVE-NEXT:    usra v0.8h, v1.8h, #11
-; NONEON-NOSVE-NEXT:    sshr v0.8h, v0.8h, #5
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i16> %op1, shufflevector (<8 x i16> insertelement (<8 x i16> poison, i16 32, i32 0), <8 x i16> poison, <8 x i32> zeroinitializer)
   ret <8 x i16> %res
 }
@@ -171,18 +103,6 @@ define void @sdiv_v16i16(ptr %a) {
 ; CHECK-NEXT:    asrd z1.h, p0/m, z1.h, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v2.8h, v0.8h, #0
-; NONEON-NOSVE-NEXT:    cmlt v3.8h, v1.8h, #0
-; NONEON-NOSVE-NEXT:    usra v0.8h, v2.8h, #11
-; NONEON-NOSVE-NEXT:    usra v1.8h, v3.8h, #11
-; NONEON-NOSVE-NEXT:    sshr v0.8h, v0.8h, #5
-; NONEON-NOSVE-NEXT:    sshr v1.8h, v1.8h, #5
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sdiv <16 x i16> %op1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 32, i32 0), <16 x i16> poison, <16 x i32> zeroinitializer)
   store <16 x i16> %res, ptr %a
@@ -197,13 +117,6 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) {
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    usra v0.2s, v1.2s, #27
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #5
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i32> %op1, shufflevector (<2 x i32> insertelement (<2 x i32> poison, i32 32, i32 0), <2 x i32> poison, <2 x i32> zeroinitializer)
   ret <2 x i32> %res
 }
@@ -216,13 +129,6 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) {
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.4s, v0.4s, #0
-; NONEON-NOSVE-NEXT:    usra v0.4s, v1.4s, #27
-; NONEON-NOSVE-NEXT:    sshr v0.4s, v0.4s, #5
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i32> %op1, shufflevector (<4 x i32> insertelement (<4 x i32> poison, i32 32, i32 0), <4 x i32> poison, <4 x i32> zeroinitializer)
   ret <4 x i32> %res
 }
@@ -236,18 +142,6 @@ define void @sdiv_v8i32(ptr %a) {
 ; CHECK-NEXT:    asrd z1.s, p0/m, z1.s, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v2.4s, v0.4s, #0
-; NONEON-NOSVE-NEXT:    cmlt v3.4s, v1.4s, #0
-; NONEON-NOSVE-NEXT:    usra v0.4s, v2.4s, #27
-; NONEON-NOSVE-NEXT:    usra v1.4s, v3.4s, #27
-; NONEON-NOSVE-NEXT:    sshr v0.4s, v0.4s, #5
-; NONEON-NOSVE-NEXT:    sshr v1.4s, v1.4s, #5
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sdiv <8 x i32> %op1, shufflevector (<8 x i32> insertelement (<8 x i32> poison, i32 32, i32 0), <8 x i32> poison, <8 x i32> zeroinitializer)
   store <8 x i32> %res, ptr %a
@@ -262,13 +156,6 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) {
 ; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt d1, d0, #0
-; NONEON-NOSVE-NEXT:    usra d0, d1, #59
-; NONEON-NOSVE-NEXT:    sshr d0, d0, #5
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <1 x i64> %op1, shufflevector (<1 x i64> insertelement (<1 x i64> poison, i64 32, i32 0), <1 x i64> poison, <1 x i32> zeroinitializer)
   ret <1 x i64> %res
 }
@@ -282,13 +169,6 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) {
 ; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.2d, v0.2d, #0
-; NONEON-NOSVE-NEXT:    usra v0.2d, v1.2d, #59
-; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #5
-; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i64> %op1, shufflevector (<2 x i64> insertelement (<2 x i64> poison, i64 32, i32 0), <2 x i64> poison, <2 x i32> zeroinitializer)
   ret <2 x i64> %res
 }
@@ -302,18 +182,6 @@ define void @sdiv_v4i64(ptr %a) {
 ; CHECK-NEXT:    asrd z1.d, p0/m, z1.d, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: sdiv_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v2.2d, v0.2d, #0
-; NONEON-NOSVE-NEXT:    cmlt v3.2d, v1.2d, #0
-; NONEON-NOSVE-NEXT:    usra v0.2d, v2.2d, #59
-; NONEON-NOSVE-NEXT:    usra v1.2d, v3.2d, #59
-; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #5
-; NONEON-NOSVE-NEXT:    sshr v1.2d, v1.2d, #5
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sdiv <4 x i64> %op1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 32, i32 0), <4 x i64> poison, <4 x i32> zeroinitializer)
   store <4 x i64> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
index ad0d4ef0afef..6f82c97f3b87 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -14,6 +15,13 @@ define void @hang_when_merging_stores_after_legalisation(ptr %a, <2 x i32> %b) {
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    st2w { z0.s, z1.s }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: hang_when_merging_stores_after_legalisation:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    dup v0.4s, v0.s[0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <8 x i32> zeroinitializer
   %interleaved.vec = shufflevector <8 x i32> %splat, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   store <8 x i32> %interleaved.vec, ptr %a, align 4
@@ -28,6 +36,13 @@ define void @interleave_store_without_splat(ptr %a, <4 x i32> %v1, <4 x i32> %v2
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    st2w { z0.s, z1.s }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: interleave_store_without_splat:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %interleaved = shufflevector <8 x i32> %shuffle, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   store <8 x i32> %interleaved, ptr %a, align 1
@@ -46,6 +61,16 @@ define void @interleave_store_legalization(ptr %a, <8 x i32> %v1, <8 x i32> %v2)
 ; CHECK-NEXT:    st2w { z4.s, z5.s }, p0, [x0]
 ; CHECK-NEXT:    st2w { z2.s, z3.s }, p0, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: interleave_store_legalization:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    zip2 v4.4s, v1.4s, v3.4s
+; NONEON-NOSVE-NEXT:    zip1 v1.4s, v1.4s, v3.4s
+; NONEON-NOSVE-NEXT:    zip2 v3.4s, v0.4s, v2.4s
+; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v2.4s
+; NONEON-NOSVE-NEXT:    stp q1, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %interleaved.vec = shufflevector <8 x i32> %v1, <8 x i32> %v2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11,
                                                                              i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   store <16 x i32> %interleaved.vec, ptr %a, align 4
@@ -57,6 +82,10 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) {
 ; CHECK-LABEL: crash_when_lowering_extract_shuffle:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: crash_when_lowering_extract_shuffle:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ret
   %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer
   br i1 %cond, label %exit, label %vector.body
 
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
index 6489e8d94d31..323d5278592f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 
@@ -16,11 +15,6 @@ define <4 x i8> @splat_v4i8(i8 %a) {
 ; CHECK-NEXT:    mov z0.h, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.4h, w0
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <4 x i8> %insert, <4 x i8> undef, <4 x i32> zeroinitializer
   ret <4 x i8> %splat
@@ -32,11 +26,6 @@ define <8 x i8> @splat_v8i8(i8 %a) {
 ; CHECK-NEXT:    mov z0.b, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.8b, w0
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <8 x i8> %insert, <8 x i8> undef, <8 x i32> zeroinitializer
   ret <8 x i8> %splat
@@ -48,11 +37,6 @@ define <16 x i8> @splat_v16i8(i8 %a) {
 ; CHECK-NEXT:    mov z0.b, w0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.16b, w0
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
   ret <16 x i8> %splat
@@ -64,12 +48,6 @@ define void @splat_v32i8(i8 %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.b, w0
 ; CHECK-NEXT:    stp q0, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.16b, w0
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <32 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
   store <32 x i8> %splat, ptr %b
@@ -82,11 +60,6 @@ define <2 x i16> @splat_v2i16(i16 %a) {
 ; CHECK-NEXT:    mov z0.s, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.2s, w0
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <2 x i16> %insert, <2 x i16> undef, <2 x i32> zeroinitializer
   ret <2 x i16> %splat
@@ -98,11 +71,6 @@ define <4 x i16> @splat_v4i16(i16 %a) {
 ; CHECK-NEXT:    mov z0.h, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.4h, w0
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer
   ret <4 x i16> %splat
@@ -114,11 +82,6 @@ define <8 x i16> @splat_v8i16(i16 %a) {
 ; CHECK-NEXT:    mov z0.h, w0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.8h, w0
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
   ret <8 x i16> %splat
@@ -130,12 +93,6 @@ define void @splat_v16i16(i16 %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.h, w0
 ; CHECK-NEXT:    stp q0, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.8h, w0
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
   store <16 x i16> %splat, ptr %b
@@ -148,11 +105,6 @@ define <2 x i32> @splat_v2i32(i32 %a) {
 ; CHECK-NEXT:    mov z0.s, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.2s, w0
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <2 x i32> %insert, <2 x i32> undef, <2 x i32> zeroinitializer
   ret <2 x i32> %splat
@@ -164,11 +116,6 @@ define <4 x i32> @splat_v4i32(i32 %a) {
 ; CHECK-NEXT:    mov z0.s, w0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.4s, w0
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
   ret <4 x i32> %splat
@@ -180,12 +127,6 @@ define void @splat_v8i32(i32 %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.s, w0
 ; CHECK-NEXT:    stp q0, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.4s, w0
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
   store <8 x i32> %splat, ptr %b
@@ -198,11 +139,6 @@ define <1 x i64> @splat_v1i64(i64 %a) {
 ; CHECK-NEXT:    mov z0.d, x0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov d0, x0
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <1 x i64> %insert, <1 x i64> undef, <1 x i32> zeroinitializer
   ret <1 x i64> %splat
@@ -214,11 +150,6 @@ define <2 x i64> @splat_v2i64(i64 %a) {
 ; CHECK-NEXT:    mov z0.d, x0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.2d, x0
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
   ret <2 x i64> %splat
@@ -230,12 +161,6 @@ define void @splat_v4i64(i64 %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, x0
 ; CHECK-NEXT:    stp q0, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.2d, x0
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
   store <4 x i64> %splat, ptr %b
@@ -253,12 +178,6 @@ define <2 x half> @splat_v2f16(half %a) {
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4h, v0.h[0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x half> undef, half %a, i64 0
   %splat = shufflevector <2 x half> %insert, <2 x half> undef, <2 x i32> zeroinitializer
   ret <2 x half> %splat
@@ -271,12 +190,6 @@ define <4 x half> @splat_v4f16(half %a) {
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4h, v0.h[0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x half> undef, half %a, i64 0
   %splat = shufflevector <4 x half> %insert, <4 x half> undef, <4 x i32> zeroinitializer
   ret <4 x half> %splat
@@ -289,12 +202,6 @@ define <8 x half> @splat_v8f16(half %a) {
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.8h, v0.h[0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x half> undef, half %a, i64 0
   %splat = shufflevector <8 x half> %insert, <8 x half> undef, <8 x i32> zeroinitializer
   ret <8 x half> %splat
@@ -307,13 +214,6 @@ define void @splat_v16f16(half %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.8h, v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x half> undef, half %a, i64 0
   %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer
   store <16 x half> %splat, ptr %b
@@ -327,12 +227,6 @@ define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) {
 ; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $s0 killed $s0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x float> undef, float %a, i64 0
   %splat = shufflevector <2 x float> %insert, <2 x float> undef, <2 x i32> zeroinitializer
   ret <2 x float> %splat
@@ -345,12 +239,6 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) {
 ; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $s0 killed $s0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4s, v0.s[0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x float> undef, float %a, i64 0
   %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer
   ret <4 x float> %splat
@@ -363,13 +251,6 @@ define void @splat_v8f32(float %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $s0 killed $s0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4s, v0.s[0]
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x float> undef, float %a, i64 0
   %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer
   store <8 x float> %splat, ptr %b
@@ -380,10 +261,6 @@ define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) {
 ; CHECK-LABEL: splat_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x double> undef, double %a, i64 0
   %splat = shufflevector <1 x double> %insert, <1 x double> undef, <1 x i32> zeroinitializer
   ret <1 x double> %splat
@@ -396,12 +273,6 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) {
 ; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2d, v0.d[0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x double> undef, double %a, i64 0
   %splat = shufflevector <2 x double> %insert, <2 x double> undef, <2 x i32> zeroinitializer
   ret <2 x double> %splat
@@ -414,13 +285,6 @@ define void @splat_v4f64(double %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2d, v0.d[0]
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x double> undef, double %a, i64 0
   %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer
   store <4 x double> %splat, ptr %b
@@ -437,12 +301,6 @@ define void @splat_imm_v32i8(ptr %a) {
 ; CHECK-NEXT:    mov z0.b, #1 // =0x1
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_imm_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #1
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <32 x i8> undef, i8 1, i64 0
   %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
   store <32 x i8> %splat, ptr %a
@@ -455,13 +313,6 @@ define void @splat_imm_v16i16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #2 // =0x2
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_imm_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #2 // =0x2
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i16> undef, i16 2, i64 0
   %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
   store <16 x i16> %splat, ptr %a
@@ -474,13 +325,6 @@ define void @splat_imm_v8i32(ptr %a) {
 ; CHECK-NEXT:    mov z0.s, #3 // =0x3
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_imm_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #3 // =0x3
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i32> undef, i32 3, i64 0
   %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
   store <8 x i32> %splat, ptr %a
@@ -493,13 +337,6 @@ define void @splat_imm_v4i64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, #4 // =0x4
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_imm_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #4 // =0x4
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i64> undef, i64 4, i64 0
   %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
   store <4 x i64> %splat, ptr %a
@@ -516,13 +353,6 @@ define void @splat_imm_v16f16(ptr %a) {
 ; CHECK-NEXT:    fmov z0.h, #5.00000000
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_imm_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #17664 // =0x4500
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x half> undef, half 5.0, i64 0
   %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer
   store <16 x half> %splat, ptr %a
@@ -535,12 +365,6 @@ define void @splat_imm_v8f32(ptr %a) {
 ; CHECK-NEXT:    fmov z0.s, #6.00000000
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_imm_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov v0.4s, #6.00000000
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x float> undef, float 6.0, i64 0
   %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer
   store <8 x float> %splat, ptr %a
@@ -553,12 +377,6 @@ define void @splat_imm_v4f64(ptr %a) {
 ; CHECK-NEXT:    fmov z0.d, #7.00000000
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: splat_imm_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov v0.2d, #7.00000000
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x double> undef, double 7.0, i64 0
   %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer
   store <4 x double> %splat, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
index 41449aa90ba0..06709ca3685c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -13,11 +12,6 @@ define void @store_v4i8(ptr %a) {
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str wzr, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <4 x i8> zeroinitializer, ptr %a
   ret void
 }
@@ -28,12 +22,6 @@ define void @store_v8i8(ptr %a) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <8 x i8> zeroinitializer, ptr %a
   ret void
 }
@@ -44,12 +32,6 @@ define void @store_v16i8(ptr %a) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <16 x i8> zeroinitializer, ptr %a
   ret void
 }
@@ -60,12 +42,6 @@ define void @store_v32i8(ptr %a) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <32 x i8> zeroinitializer, ptr %a
   ret void
 }
@@ -77,11 +53,6 @@ define void @store_v2i16(ptr %a) {
 ; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str wzr, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <2 x i16> zeroinitializer, ptr %a
   ret void
 }
@@ -93,11 +64,6 @@ define void @store_v2f16(ptr %a) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v2f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str wzr, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <2 x half> zeroinitializer, ptr %a
   ret void
 }
@@ -108,12 +74,6 @@ define void @store_v4i16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <4 x i16> zeroinitializer, ptr %a
   ret void
 }
@@ -124,12 +84,6 @@ define void @store_v4f16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <4 x half> zeroinitializer, ptr %a
   ret void
 }
@@ -140,12 +94,6 @@ define void @store_v8i16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <8 x i16> zeroinitializer, ptr %a
   ret void
 }
@@ -156,12 +104,6 @@ define void @store_v8f16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    str q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <8 x half> zeroinitializer, ptr %a
   ret void
 }
@@ -172,12 +114,6 @@ define void @store_v16i16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <16 x i16> zeroinitializer, ptr %a
   ret void
 }
@@ -188,12 +124,6 @@ define void @store_v16f16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <16 x half> zeroinitializer, ptr %a
   ret void
 }
@@ -203,11 +133,6 @@ define void @store_v2i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str xzr, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str xzr, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <2 x i32> zeroinitializer, ptr %a
   ret void
 }
@@ -217,11 +142,6 @@ define void @store_v2f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str xzr, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str xzr, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <2 x float> zeroinitializer, ptr %a
   ret void
 }
@@ -231,11 +151,6 @@ define void @store_v4i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp xzr, xzr, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp xzr, xzr, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <4 x i32> zeroinitializer, ptr %a
   ret void
 }
@@ -245,11 +160,6 @@ define void @store_v4f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp xzr, xzr, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp xzr, xzr, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <4 x float> zeroinitializer, ptr %a
   ret void
 }
@@ -260,12 +170,6 @@ define void @store_v8i32(ptr %a) {
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <8 x i32> zeroinitializer, ptr %a
   ret void
 }
@@ -276,12 +180,6 @@ define void @store_v8f32(ptr %a) {
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <8 x float> zeroinitializer, ptr %a
   ret void
 }
@@ -292,12 +190,6 @@ define void @store_v1i64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v1i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <1 x i64> zeroinitializer, ptr %a
   ret void
 }
@@ -308,12 +200,6 @@ define void @store_v1f64(ptr %a) {
 ; CHECK-NEXT:    fmov d0, xzr
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v1f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
-; NONEON-NOSVE-NEXT:    str d0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <1 x double> zeroinitializer, ptr %a
   ret void
 }
@@ -323,11 +209,6 @@ define void @store_v2i64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp xzr, xzr, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp xzr, xzr, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <2 x i64> zeroinitializer, ptr %a
   ret void
 }
@@ -337,11 +218,6 @@ define void @store_v2f64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp xzr, xzr, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp xzr, xzr, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <2 x double> zeroinitializer, ptr %a
   ret void
 }
@@ -352,12 +228,6 @@ define void @store_v4i64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <4 x i64> zeroinitializer, ptr %a
   ret void
 }
@@ -368,12 +238,6 @@ define void @store_v4f64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   store <4 x double> zeroinitializer, ptr %a
   ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
index d1873f436815..838db0ce8185 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 ; Test we can code generater patterns of the form:
@@ -24,12 +23,6 @@ define void @subvector_v4i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v4i8:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr w8, [x0]
-; NONEON-NOSVE-NEXT:    str w8, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i8>, ptr %in
   br label %bb1
 
@@ -44,12 +37,6 @@ define void @subvector_v8i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v8i8:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i8>, ptr %in
   br label %bb1
 
@@ -64,12 +51,6 @@ define void @subvector_v16i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v16i8:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i8>, ptr %in
   br label %bb1
 
@@ -84,12 +65,6 @@ define void @subvector_v32i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v32i8:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   br label %bb1
 
@@ -106,12 +81,6 @@ define void @subvector_v2i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v2i16:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr w8, [x0]
-; NONEON-NOSVE-NEXT:    str w8, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i16>, ptr %in
   br label %bb1
 
@@ -126,12 +95,6 @@ define void @subvector_v4i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v4i16:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i16>, ptr %in
   br label %bb1
 
@@ -146,12 +109,6 @@ define void @subvector_v8i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v8i16:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i16>, ptr %in
   br label %bb1
 
@@ -166,12 +123,6 @@ define void @subvector_v16i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v16i16:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   br label %bb1
 
@@ -187,12 +138,6 @@ define void @subvector_v2i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v2i32:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i32>, ptr %in
   br label %bb1
 
@@ -207,12 +152,6 @@ define void @subvector_v4i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v4i32:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %in
   br label %bb1
 
@@ -227,12 +166,6 @@ define void @subvector_v8i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v8i32:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   br label %bb1
 
@@ -248,12 +181,6 @@ define void @subvector_v2i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v2i64:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %in
   br label %bb1
 
@@ -268,12 +195,6 @@ define void @subvector_v4i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v4i64:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   br label %bb1
 
@@ -289,12 +210,6 @@ define void @subvector_v2f16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    str w8, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v2f16:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr w8, [x0]
-; NONEON-NOSVE-NEXT:    str w8, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x half>, ptr %in
   br label %bb1
 
@@ -309,12 +224,6 @@ define void @subvector_v4f16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v4f16:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %in
   br label %bb1
 
@@ -329,12 +238,6 @@ define void @subvector_v8f16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v8f16:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x half>, ptr %in
   br label %bb1
 
@@ -349,12 +252,6 @@ define void @subvector_v16f16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v16f16:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x half>, ptr %in
   br label %bb1
 
@@ -370,12 +267,6 @@ define void @subvector_v2f32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v2f32:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x float>, ptr %in
   br label %bb1
 
@@ -390,12 +281,6 @@ define void @subvector_v4f32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v4f32:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x float>, ptr %in
   br label %bb1
 
@@ -410,12 +295,6 @@ define void @subvector_v8f32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v8f32:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x float>,ptr %in
   br label %bb1
 
@@ -431,12 +310,6 @@ define void @subvector_v2f64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v2f64:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x double>, ptr %in
   br label %bb1
 
@@ -451,12 +324,6 @@ define void @subvector_v4f64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: subvector_v4f64:
-; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x double>, ptr %in
   br label %bb1
 
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
index f0a4368da3ee..7e3a175c40d2 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -13,13 +12,6 @@ define void @store_trunc_v8i16i8(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_trunc_v8i16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i16>, ptr %ap
   %val = trunc <8 x i16> %a to <8 x i8>
   store <8 x i8> %val, ptr %dest
@@ -33,14 +25,6 @@ define void @store_trunc_v4i32i8(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1b { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_trunc_v4i32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    str s0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = trunc <4 x i32> %a to <4 x i8>
   store <4 x i8> %val, ptr %dest
@@ -54,13 +38,6 @@ define void @store_trunc_v4i32i16(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_trunc_v4i32i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = trunc <4 x i32> %a to <4 x i16>
   store <4 x i16> %val, ptr %dest
@@ -74,13 +51,6 @@ define void @store_trunc_v2i64i8(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_trunc_v2i64i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    str d0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = trunc <2 x i64> %a to <2 x i32>
   store <2 x i32> %val, ptr %dest
@@ -96,14 +66,6 @@ define void @store_trunc_v2i256i64(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
 ; CHECK-NEXT:    str q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: store_trunc_v2i256i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldr d1, [x0]
-; NONEON-NOSVE-NEXT:    mov v1.d[1], v0.d[0]
-; NONEON-NOSVE-NEXT:    str q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i256>, ptr %ap
   %val = trunc <2 x i256> %a to <2 x i64>
   store <2 x i64> %val, ptr %dest
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
index 4895ffb6858e..70219dd30f76 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -20,12 +19,6 @@ define <16 x i8> @trunc_v16i16_v16i8(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v16i16_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = trunc <16 x i16> %a to <16 x i8>
   ret <16 x i8> %b
@@ -48,17 +41,6 @@ define void @trunc_v32i16_v32i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.b, z2.b, z2.b
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v32i16_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i16>, ptr %in
   %b = trunc <32 x i16> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -94,24 +76,6 @@ define void @trunc_v64i16_v64i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v64i16_v64i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ldp q6, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    uzp1 v3.16b, v5.16b, v4.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v6.16b, v1.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v2.16b, v2.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v3.16b, v3.16b, v3.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i16>, ptr %in
   %b = trunc <64 x i16> %a to <64 x i8>
   %c = add <64 x i8> %b, %b
@@ -169,38 +133,6 @@ define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q2, q3, [x1, #32]
 ; CHECK-NEXT:    stp q4, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v128i16_v128i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #224]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #128]
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ldp q16, q1, [x0, #160]
-; NONEON-NOSVE-NEXT:    uzp1 v4.16b, v5.16b, v4.16b
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.16b, v7.16b, v6.16b
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q18, q7, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v16.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uzp1 v5.16b, v17.16b, v5.16b
-; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v4.16b, v4.16b, v4.16b
-; NONEON-NOSVE-NEXT:    uzp1 v7.16b, v18.16b, v7.16b
-; NONEON-NOSVE-NEXT:    add v3.16b, v6.16b, v6.16b
-; NONEON-NOSVE-NEXT:    uzp1 v6.16b, v17.16b, v16.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #96]
-; NONEON-NOSVE-NEXT:    add v0.16b, v5.16b, v5.16b
-; NONEON-NOSVE-NEXT:    add v2.16b, v2.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v4.16b, v7.16b, v7.16b
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    add v1.16b, v6.16b, v6.16b
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q2, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <128 x i16>, ptr %in
   %b = trunc <128 x i16> %a to <128 x i8>
   %c = add <128 x i8> %b, %b
@@ -223,13 +155,6 @@ define <8 x i8> @trunc_v8i32_v8i8(ptr %in) nounwind {
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v8i32_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = trunc <8 x i32> %a to <8 x i8>
   ret <8 x i8> %b
@@ -253,15 +178,6 @@ define <16 x i8> @trunc_v16i32_v16i8(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v16i32_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i32>, ptr %in
   %b = trunc <16 x i32> %a to <16 x i8>
   ret <16 x i8> %b
@@ -299,23 +215,6 @@ define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.b, z3.b, z3.b
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v32i32_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v7.8h, v6.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i32>, ptr %in
   %b = trunc <32 x i32> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -380,36 +279,6 @@ define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v64i32_v64i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #128]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #160]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #224]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v4.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.8h, v7.8h, v6.8h
-; NONEON-NOSVE-NEXT:    ldp q16, q7, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp1 v5.8h, v17.8h, v5.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    uzp1 v7.8h, v16.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v19.8h, v18.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.16b, v4.16b, v6.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v1.16b, v7.16b
-; NONEON-NOSVE-NEXT:    uzp1 v3.16b, v5.16b, v3.16b
-; NONEON-NOSVE-NEXT:    add v2.16b, v2.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add v3.16b, v3.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i32>, ptr %in
   %b = trunc <64 x i32> %a to <64 x i8>
   %c = add <64 x i8> %b, %b
@@ -431,12 +300,6 @@ define <8 x i16> @trunc_v8i32_v8i16(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v8i32_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = trunc <8 x i32> %a to <8 x i16>
   ret <8 x i16> %b
@@ -459,17 +322,6 @@ define void @trunc_v16i32_v16i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.h, z2.h, z2.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v16i32_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i32>, ptr %in
   %b = trunc <16 x i32> %a to <16 x i16>
   %c = add <16 x i16> %b, %b
@@ -505,24 +357,6 @@ define void @trunc_v32i32_v32i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v32i32_v32i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ldp q6, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v6.8h, v1.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v2.8h, v2.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v3.8h, v3.8h, v3.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i32>, ptr %in
   %b = trunc <32 x i32> %a to <32 x i16>
   %c = add <32 x i16> %b, %b
@@ -580,38 +414,6 @@ define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q2, q3, [x1, #32]
 ; CHECK-NEXT:    stp q4, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v64i32_v64i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #224]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #128]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ldp q16, q1, [x0, #160]
-; NONEON-NOSVE-NEXT:    uzp1 v4.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.8h, v7.8h, v6.8h
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q18, q7, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v16.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp1 v5.8h, v17.8h, v5.8h
-; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v4.8h, v4.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v7.8h, v18.8h, v7.8h
-; NONEON-NOSVE-NEXT:    add v3.8h, v6.8h, v6.8h
-; NONEON-NOSVE-NEXT:    uzp1 v6.8h, v17.8h, v16.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #96]
-; NONEON-NOSVE-NEXT:    add v0.8h, v5.8h, v5.8h
-; NONEON-NOSVE-NEXT:    add v2.8h, v2.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v4.8h, v7.8h, v7.8h
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    add v1.8h, v6.8h, v6.8h
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q2, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i32>, ptr %in
   %b = trunc <64 x i32> %a to <64 x i16>
   %c = add <64 x i16> %b, %b
@@ -635,13 +437,6 @@ define <4 x i8> @trunc_v4i64_v4i8(ptr %in) nounwind {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v4i64_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i8>
   ret <4 x i8> %b
@@ -666,16 +461,6 @@ define <8 x i8> @trunc_v8i64_v8i8(ptr %in) nounwind {
 ; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v8i64_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i8>
   ret <8 x i8> %b
@@ -714,21 +499,6 @@ define <16 x i8> @trunc_v16i64_v16i8(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v16i64_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i8>
   ret <16 x i8> %b
@@ -795,35 +565,6 @@ define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v32i64_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #224]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #128]
-; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #160]
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v16.4s, v17.4s, v16.4s
-; NONEON-NOSVE-NEXT:    uzp1 v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v7.4s, v19.4s, v18.4s
-; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v21.4s, v20.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v4.8h, v16.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v2.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v6.8h, v5.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -846,13 +587,6 @@ define <4 x i16> @trunc_v4i64_v4i16(ptr %in) nounwind {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v4i64_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i16>
   ret <4 x i16> %b
@@ -876,15 +610,6 @@ define <8 x i16> @trunc_v8i64_v8i16(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v8i64_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i16>
   ret <8 x i16> %b
@@ -922,23 +647,6 @@ define void @trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.h, z3.h, z3.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v16i64_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i16>
   %c = add <16 x i16> %b, %b
@@ -1003,36 +711,6 @@ define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v32i64_v32i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #128]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #160]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #224]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    ldp q16, q7, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v3.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v5.4s, v17.4s, v5.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v7.4s, v16.4s, v7.4s
-; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v19.4s, v18.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v4.8h, v6.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v1.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v5.8h, v3.8h
-; NONEON-NOSVE-NEXT:    add v2.8h, v2.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add v3.8h, v3.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i16>
   %c = add <32 x i16> %b, %b
@@ -1054,12 +732,6 @@ define <4 x i32> @trunc_v4i64_v4i32(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v4i64_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i32>
   ret <4 x i32> %b
@@ -1082,17 +754,6 @@ define void @trunc_v8i64_v8i32(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.s, z2.s, z2.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v8i64_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i32>
   %c = add <8 x i32> %b, %b
@@ -1128,24 +789,6 @@ define void @trunc_v16i64_v16i32(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v16i64_v16i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ldp q6, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v6.4s, v1.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v2.4s, v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    add v3.4s, v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i32>
   %c = add <16 x i32> %b, %b
@@ -1203,38 +846,6 @@ define void @trunc_v32i64_v32i32(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q2, q3, [x1, #32]
 ; CHECK-NEXT:    stp q4, q0, [x1]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: trunc_v32i64_v32i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #224]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #128]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ldp q16, q1, [x0, #160]
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q18, q7, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v16.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v5.4s, v17.4s, v5.4s
-; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v4.4s, v4.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v7.4s, v18.4s, v7.4s
-; NONEON-NOSVE-NEXT:    add v3.4s, v6.4s, v6.4s
-; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v17.4s, v16.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #96]
-; NONEON-NOSVE-NEXT:    add v0.4s, v5.4s, v5.4s
-; NONEON-NOSVE-NEXT:    add v2.4s, v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    add v4.4s, v7.4s, v7.4s
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    add v1.4s, v6.4s, v6.4s
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q2, q1, [x1]
-; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i32>
   %c = add <32 x i32> %b, %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
index dd308dfadd80..175731480407 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -15,12 +14,6 @@ define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v1.8b, v0.8b, v0.8b, #6
-; NONEON-NOSVE-NEXT:    trn1 v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i8> %op1, <4 x i8> %op2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
   ret <4 x i8> %ret
 }
@@ -35,11 +28,6 @@ define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    insr z1.b, w8
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #7
-; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x i8> %ret
 }
@@ -54,11 +42,6 @@ define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    insr z1.b, w8
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #15
-; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <16 x i8> %op1, <16 x i8> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
                                                                    i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
   ret <16 x i8> %ret
@@ -77,15 +60,6 @@ define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.b, w8
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v32i8:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #15
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #15
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %ret = shufflevector <32 x i8> %op1, <32 x i8> %op2, <32 x i32> <i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,
@@ -104,11 +78,6 @@ define <2 x i16> @shuffle_ext_byone_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    revw z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i16> %op1, <2 x i16> %op2, <2 x i32> <i32 1, i32 0>
   ret <2 x i16> %ret
 }
@@ -123,11 +92,6 @@ define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    insr z1.h, w8
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #6
-; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i16> %op1, <4 x i16> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x i16> %ret
 }
@@ -142,11 +106,6 @@ define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    insr z1.h, w8
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
-; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x i16> %ret
 }
@@ -164,15 +123,6 @@ define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.h, w8
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #14
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %ret = shufflevector <16 x i16> %op1, <16 x i16> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
@@ -191,11 +141,6 @@ define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    insr z1.s, w8
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
-; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i32> %op1, <2 x i32> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x i32> %ret
 }
@@ -210,11 +155,6 @@ define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    insr z1.s, w8
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
-; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i32> %op1, <4 x i32> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x i32> %ret
 }
@@ -232,15 +172,6 @@ define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.s, w8
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #12
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %ret = shufflevector <8 x i32> %op1, <8 x i32> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
@@ -258,11 +189,6 @@ define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    insr z1.d, x8
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i64> %op1, <2 x i64> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x i64> %ret
 }
@@ -280,15 +206,6 @@ define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.d, x8
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #8
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %ret = shufflevector <4 x i64> %op1, <4 x i64> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -306,11 +223,6 @@ define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    insr z0.h, h2
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #6
-; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x half> %op1, <4 x half> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x half> %ret
 }
@@ -324,11 +236,6 @@ define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    insr z0.h, h2
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
-; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x half> %op1, <8 x half> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x half> %ret
 }
@@ -344,15 +251,6 @@ define void @shuffle_ext_byone_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.h, h2
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #14
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %ret = shufflevector <16 x half> %op1, <16 x half> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
@@ -370,11 +268,6 @@ define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2)
 ; CHECK-NEXT:    insr z0.s, s2
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
-; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x float> %op1, <2 x float> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %ret
 }
@@ -388,11 +281,6 @@ define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2)
 ; CHECK-NEXT:    insr z0.s, s2
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
-; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x float> %op1, <4 x float> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x float> %ret
 }
@@ -408,15 +296,6 @@ define void @shuffle_ext_byone_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.s, s2
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f32:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #12
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %ret = shufflevector <8 x float> %op1, <8 x float> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
@@ -433,11 +312,6 @@ define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op
 ; CHECK-NEXT:    insr z0.d, d2
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x double> %op1, <2 x double> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x double> %ret
 }
@@ -453,15 +327,6 @@ define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.d, d2
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f64:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #8
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -480,15 +345,6 @@ define void @shuffle_ext_byone_reverse(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.d, d2
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_byone_reverse:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v2.16b, #8
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
@@ -503,13 +359,6 @@ define void @shuffle_ext_invalid(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: shuffle_ext_invalid:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
index 42f3f03a5ea0..337a2134de5b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -12,11 +11,6 @@ define fp128 @test_streaming_compatible_register_mov(fp128 %q0, fp128 %q1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: test_streaming_compatible_register_mov:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ret
   ret fp128 %q1
 }
 
@@ -26,11 +20,6 @@ define double @fp_zero_constant() {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d0, xzr
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fp_zero_constant:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov d0, xzr
-; NONEON-NOSVE-NEXT:    ret
   ret double 0.0
 }
 
@@ -40,11 +29,6 @@ define <2 x i64> @fixed_vec_zero_constant() {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fixed_vec_zero_constant:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    ret
   ret <2 x i64> zeroinitializer
 }
 
@@ -54,10 +38,5 @@ define <2 x double> @fixed_vec_fp_zero_constant() {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fixed_vec_fp_zero_constant:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    ret
   ret <2 x double> <double 0.0, double 0.0>
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index 478f4a689d3c..fd1365d56fee 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -4,10 +4,10 @@
 define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv4f16(<vscale x 4 x half> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpkhi z1.d, z0.s
-; CHECK-NEXT:    uunpklo z2.d, z0.s
-; CHECK-NEXT:    uzp1 z0.d, z2.d, z1.d
-; CHECK-NEXT:    uzp2 z1.d, z2.d, z1.d
+; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z0.s
+; CHECK-NEXT:    uunpklo z0.d, z1.s
+; CHECK-NEXT:    uunpklo z1.d, z2.s
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
   ret {<vscale x 2 x half>, <vscale x 2 x half>}   %retval
@@ -16,10 +16,10 @@ define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_n
 define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv8f16(<vscale x 8 x half> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpkhi z1.s, z0.h
-; CHECK-NEXT:    uunpklo z2.s, z0.h
-; CHECK-NEXT:    uzp1 z0.s, z2.s, z1.s
-; CHECK-NEXT:    uzp2 z1.s, z2.s, z1.s
+; CHECK-NEXT:    uzp1 z1.h, z0.h, z0.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z1.h
+; CHECK-NEXT:    uunpklo z1.s, z2.h
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
   ret {<vscale x 4 x half>, <vscale x 4 x half>}   %retval
@@ -39,10 +39,10 @@ define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_n
 define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv4f32(<vscale x 4 x float> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpkhi z1.d, z0.s
-; CHECK-NEXT:    uunpklo z2.d, z0.s
-; CHECK-NEXT:    uzp1 z0.d, z2.d, z1.d
-; CHECK-NEXT:    uzp2 z1.d, z2.d, z1.d
+; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z0.s
+; CHECK-NEXT:    uunpklo z0.d, z1.s
+; CHECK-NEXT:    uunpklo z1.d, z2.s
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
   ret {<vscale x 2 x float>, <vscale x 2 x float>}   %retval
@@ -131,10 +131,10 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
 define {<vscale x 8 x i1>, <vscale x 8 x i1>} @vector_deinterleave_nxv8i1_nxv16i1(<vscale x 16 x i1> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv8i1_nxv16i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    punpkhi p1.h, p0.b
-; CHECK-NEXT:    punpklo p2.h, p0.b
-; CHECK-NEXT:    uzp1 p0.h, p2.h, p1.h
-; CHECK-NEXT:    uzp2 p1.h, p2.h, p1.h
+; CHECK-NEXT:    uzp1 p1.b, p0.b, p0.b
+; CHECK-NEXT:    uzp2 p2.b, p0.b, p0.b
+; CHECK-NEXT:    punpklo p0.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p2.b
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1> %vec)
   ret {<vscale x 8 x i1>, <vscale x 8 x i1>}   %retval
@@ -143,10 +143,10 @@ define {<vscale x 8 x i1>, <vscale x 8 x i1>} @vector_deinterleave_nxv8i1_nxv16i
 define {<vscale x 4 x i1>, <vscale x 4 x i1>} @vector_deinterleave_nxv4i1_nxv8i1(<vscale x 8 x i1> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv4i1_nxv8i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    punpkhi p1.h, p0.b
-; CHECK-NEXT:    punpklo p2.h, p0.b
-; CHECK-NEXT:    uzp1 p0.s, p2.s, p1.s
-; CHECK-NEXT:    uzp2 p1.s, p2.s, p1.s
+; CHECK-NEXT:    uzp1 p1.h, p0.h, p0.h
+; CHECK-NEXT:    uzp2 p2.h, p0.h, p0.h
+; CHECK-NEXT:    punpklo p0.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p2.b
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1> %vec)
   ret {<vscale x 4 x i1>, <vscale x 4 x i1>}   %retval
@@ -155,10 +155,10 @@ define {<vscale x 4 x i1>, <vscale x 4 x i1>} @vector_deinterleave_nxv4i1_nxv8i1
 define {<vscale x 2 x i1>, <vscale x 2 x i1>} @vector_deinterleave_nxv2i1_nxv4i1(<vscale x 4 x i1> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv2i1_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    punpkhi p1.h, p0.b
-; CHECK-NEXT:    punpklo p2.h, p0.b
-; CHECK-NEXT:    uzp1 p0.d, p2.d, p1.d
-; CHECK-NEXT:    uzp2 p1.d, p2.d, p1.d
+; CHECK-NEXT:    uzp1 p1.s, p0.s, p0.s
+; CHECK-NEXT:    uzp2 p2.s, p0.s, p0.s
+; CHECK-NEXT:    punpklo p0.h, p1.b
+; CHECK-NEXT:    punpklo p1.h, p2.b
 ; CHECK-NEXT:    ret
   %retval = call {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1> %vec)
   ret {<vscale x 2 x i1>, <vscale x 2 x i1>}   %retval
diff --git a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
new file mode 100644
index 000000000000..557a42116cdb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s
+
+define void @histogram_i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    histcnt z1.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    mov z3.d, x0
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mad z1.d, p1/m, z3.d, z2.d
+; CHECK-NEXT:    st1d { z1.d }, p0, [z0.d]
+; CHECK-NEXT:    ret
+  call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+;; FIXME: We maybe need some dagcombines here? We're multiplying the output of the histcnt
+;;        by 1, so we should be able to remove that and directly add the histcnt to the
+;;        current bucket data.
+define void @histogram_i32_literal(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i32_literal:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    mov z3.s, #1 // =0x1
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
+; CHECK-NEXT:    ret
+
+  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %indices
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @histogram_i32_literal_noscale(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i32_literal_noscale:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    mov z3.s, #1 // =0x1
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw]
+; CHECK-NEXT:    ret
+
+  %buckets = getelementptr i8, ptr %base, <vscale x 4 x i32> %indices
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve2" vscale_range(1, 16) }
diff --git a/llvm/test/CodeGen/AArch64/vector-llrint.ll b/llvm/test/CodeGen/AArch64/vector-llrint.ll
index beb2b6a13460..5503de2b4c5d 100644
--- a/llvm/test/CodeGen/AArch64/vector-llrint.ll
+++ b/llvm/test/CodeGen/AArch64/vector-llrint.ll
@@ -386,14 +386,9 @@ declare <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float>)
 define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) {
 ; CHECK-LABEL: llrint_v2i64_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov s1, v0.s[1]
-; CHECK-NEXT:    frintx s0, s0
-; CHECK-NEXT:    frintx s1, s1
-; CHECK-NEXT:    fcvtzs x8, s0
-; CHECK-NEXT:    fcvtzs x9, s1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x9
+; CHECK-NEXT:    frintx v0.2s, v0.2s
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    ret
   %a = call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> %x)
   ret <2 x i64> %a
@@ -404,20 +399,12 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) {
 ; CHECK-LABEL: llrint_v4i64_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    mov s3, v0.s[1]
-; CHECK-NEXT:    frintx s0, s0
-; CHECK-NEXT:    mov s2, v1.s[1]
-; CHECK-NEXT:    frintx s1, s1
-; CHECK-NEXT:    frintx s3, s3
-; CHECK-NEXT:    fcvtzs x9, s0
-; CHECK-NEXT:    frintx s2, s2
-; CHECK-NEXT:    fcvtzs x8, s1
-; CHECK-NEXT:    fcvtzs x11, s3
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    fcvtzs x10, s2
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    mov v0.d[1], x11
-; CHECK-NEXT:    mov v1.d[1], x10
+; CHECK-NEXT:    frintx v0.2s, v0.2s
+; CHECK-NEXT:    frintx v1.2s, v1.2s
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-NEXT:    ret
   %a = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> %x)
   ret <4 x i64> %a
@@ -429,34 +416,18 @@ define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    mov s4, v0.s[1]
-; CHECK-NEXT:    mov s7, v1.s[1]
-; CHECK-NEXT:    frintx s0, s0
-; CHECK-NEXT:    frintx s1, s1
-; CHECK-NEXT:    mov s5, v2.s[1]
-; CHECK-NEXT:    mov s6, v3.s[1]
-; CHECK-NEXT:    frintx s2, s2
-; CHECK-NEXT:    frintx s3, s3
-; CHECK-NEXT:    frintx s4, s4
-; CHECK-NEXT:    frintx s7, s7
-; CHECK-NEXT:    fcvtzs x9, s0
-; CHECK-NEXT:    fcvtzs x12, s1
-; CHECK-NEXT:    frintx s5, s5
-; CHECK-NEXT:    frintx s6, s6
-; CHECK-NEXT:    fcvtzs x8, s2
-; CHECK-NEXT:    fcvtzs x10, s3
-; CHECK-NEXT:    fcvtzs x11, s4
-; CHECK-NEXT:    fcvtzs x15, s7
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    fmov d2, x12
-; CHECK-NEXT:    fcvtzs x13, s5
-; CHECK-NEXT:    fcvtzs x14, s6
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    fmov d3, x10
-; CHECK-NEXT:    mov v0.d[1], x11
-; CHECK-NEXT:    mov v2.d[1], x15
-; CHECK-NEXT:    mov v1.d[1], x13
-; CHECK-NEXT:    mov v3.d[1], x14
+; CHECK-NEXT:    frintx v0.2s, v0.2s
+; CHECK-NEXT:    frintx v1.2s, v1.2s
+; CHECK-NEXT:    frintx v2.2s, v2.2s
+; CHECK-NEXT:    frintx v3.2s, v3.2s
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-NEXT:    fcvtl v4.2d, v2.2s
+; CHECK-NEXT:    fcvtl v3.2d, v3.2s
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzs v2.2d, v1.2d
+; CHECK-NEXT:    fcvtzs v1.2d, v4.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
 ; CHECK-NEXT:    ret
   %a = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> %x)
   ret <8 x i64> %a
@@ -466,72 +437,113 @@ declare <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float>)
 define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; CHECK-LABEL: llrint_v16i64_v16f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    ext v5.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
-; CHECK-NEXT:    frintx s7, s0
-; CHECK-NEXT:    ext v16.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT:    mov s0, v0.s[1]
-; CHECK-NEXT:    frintx s17, s4
-; CHECK-NEXT:    mov s4, v4.s[1]
-; CHECK-NEXT:    mov s18, v5.s[1]
-; CHECK-NEXT:    frintx s5, s5
-; CHECK-NEXT:    frintx s19, s6
-; CHECK-NEXT:    fcvtzs x8, s7
-; CHECK-NEXT:    frintx s7, s16
-; CHECK-NEXT:    mov s6, v6.s[1]
-; CHECK-NEXT:    mov s16, v16.s[1]
-; CHECK-NEXT:    frintx s0, s0
-; CHECK-NEXT:    frintx s4, s4
-; CHECK-NEXT:    fcvtzs x9, s17
-; CHECK-NEXT:    frintx s17, s1
-; CHECK-NEXT:    mov s1, v1.s[1]
-; CHECK-NEXT:    frintx s18, s18
-; CHECK-NEXT:    fcvtzs x10, s5
-; CHECK-NEXT:    mov s5, v2.s[1]
-; CHECK-NEXT:    fcvtzs x11, s19
-; CHECK-NEXT:    mov s19, v3.s[1]
-; CHECK-NEXT:    frintx s2, s2
-; CHECK-NEXT:    fcvtzs x12, s7
-; CHECK-NEXT:    frintx s6, s6
-; CHECK-NEXT:    fcvtzs x13, s4
-; CHECK-NEXT:    frintx s4, s3
-; CHECK-NEXT:    frintx s16, s16
-; CHECK-NEXT:    fcvtzs x14, s18
-; CHECK-NEXT:    frintx s18, s1
-; CHECK-NEXT:    fcvtzs x15, s17
-; CHECK-NEXT:    frintx s20, s5
-; CHECK-NEXT:    frintx s17, s19
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    fcvtzs x9, s2
-; CHECK-NEXT:    fmov d5, x11
-; CHECK-NEXT:    fmov d3, x10
-; CHECK-NEXT:    fcvtzs x11, s4
-; CHECK-NEXT:    fcvtzs x10, s0
-; CHECK-NEXT:    fmov d7, x12
-; CHECK-NEXT:    fcvtzs x12, s18
-; CHECK-NEXT:    fcvtzs x17, s6
-; CHECK-NEXT:    fcvtzs x18, s16
-; CHECK-NEXT:    fcvtzs x16, s20
-; CHECK-NEXT:    fcvtzs x0, s17
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fmov d2, x15
-; CHECK-NEXT:    fmov d4, x9
-; CHECK-NEXT:    mov v1.d[1], x13
-; CHECK-NEXT:    fmov d6, x11
-; CHECK-NEXT:    mov v3.d[1], x14
-; CHECK-NEXT:    mov v0.d[1], x10
-; CHECK-NEXT:    mov v5.d[1], x17
-; CHECK-NEXT:    mov v7.d[1], x18
-; CHECK-NEXT:    mov v2.d[1], x12
-; CHECK-NEXT:    mov v4.d[1], x16
-; CHECK-NEXT:    mov v6.d[1], x0
+; CHECK-NEXT:    ext v7.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT:    frintx v0.2s, v0.2s
+; CHECK-NEXT:    frintx v1.2s, v1.2s
+; CHECK-NEXT:    frintx v2.2s, v2.2s
+; CHECK-NEXT:    frintx v3.2s, v3.2s
+; CHECK-NEXT:    frintx v5.2s, v5.2s
+; CHECK-NEXT:    frintx v4.2s, v4.2s
+; CHECK-NEXT:    frintx v6.2s, v6.2s
+; CHECK-NEXT:    frintx v7.2s, v7.2s
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-NEXT:    fcvtl v16.2d, v2.2s
+; CHECK-NEXT:    fcvtl v18.2d, v3.2s
+; CHECK-NEXT:    fcvtl v5.2d, v5.2s
+; CHECK-NEXT:    fcvtl v17.2d, v4.2s
+; CHECK-NEXT:    fcvtl v19.2d, v6.2s
+; CHECK-NEXT:    fcvtl v7.2d, v7.2s
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzs v2.2d, v1.2d
+; CHECK-NEXT:    fcvtzs v4.2d, v16.2d
+; CHECK-NEXT:    fcvtzs v6.2d, v18.2d
+; CHECK-NEXT:    fcvtzs v1.2d, v5.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v17.2d
+; CHECK-NEXT:    fcvtzs v5.2d, v19.2d
+; CHECK-NEXT:    fcvtzs v7.2d, v7.2d
 ; CHECK-NEXT:    ret
   %a = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> %x)
   ret <16 x i64> %a
 }
 declare <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float>)
 
+define <32 x i64> @llrint_v32i64_v32f32(<32 x float> %x) {
+; CHECK-LABEL: llrint_v32i64_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v16.16b, v7.16b, v7.16b, #8
+; CHECK-NEXT:    ext v17.16b, v6.16b, v6.16b, #8
+; CHECK-NEXT:    frintx v7.2s, v7.2s
+; CHECK-NEXT:    frintx v6.2s, v6.2s
+; CHECK-NEXT:    ext v18.16b, v5.16b, v5.16b, #8
+; CHECK-NEXT:    ext v21.16b, v4.16b, v4.16b, #8
+; CHECK-NEXT:    ext v22.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    frintx v5.2s, v5.2s
+; CHECK-NEXT:    ext v23.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT:    frintx v4.2s, v4.2s
+; CHECK-NEXT:    ext v19.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v20.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    frintx v16.2s, v16.2s
+; CHECK-NEXT:    frintx v17.2s, v17.2s
+; CHECK-NEXT:    fcvtl v7.2d, v7.2s
+; CHECK-NEXT:    fcvtl v6.2d, v6.2s
+; CHECK-NEXT:    frintx v18.2s, v18.2s
+; CHECK-NEXT:    frintx v21.2s, v21.2s
+; CHECK-NEXT:    frintx v2.2s, v2.2s
+; CHECK-NEXT:    frintx v3.2s, v3.2s
+; CHECK-NEXT:    fcvtl v5.2d, v5.2s
+; CHECK-NEXT:    frintx v23.2s, v23.2s
+; CHECK-NEXT:    fcvtl v4.2d, v4.2s
+; CHECK-NEXT:    frintx v1.2s, v1.2s
+; CHECK-NEXT:    fcvtl v16.2d, v16.2s
+; CHECK-NEXT:    fcvtl v17.2d, v17.2s
+; CHECK-NEXT:    fcvtzs v7.2d, v7.2d
+; CHECK-NEXT:    fcvtzs v6.2d, v6.2d
+; CHECK-NEXT:    fcvtl v18.2d, v18.2s
+; CHECK-NEXT:    fcvtl v21.2d, v21.2s
+; CHECK-NEXT:    frintx v20.2s, v20.2s
+; CHECK-NEXT:    fcvtl v3.2d, v3.2s
+; CHECK-NEXT:    fcvtzs v5.2d, v5.2d
+; CHECK-NEXT:    frintx v0.2s, v0.2s
+; CHECK-NEXT:    fcvtl v2.2d, v2.2s
+; CHECK-NEXT:    fcvtzs v4.2d, v4.2d
+; CHECK-NEXT:    fcvtzs v16.2d, v16.2d
+; CHECK-NEXT:    fcvtzs v17.2d, v17.2d
+; CHECK-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-NEXT:    stp q6, q17, [x8, #192]
+; CHECK-NEXT:    fcvtl v6.2d, v23.2s
+; CHECK-NEXT:    frintx v17.2s, v19.2s
+; CHECK-NEXT:    stp q7, q16, [x8, #224]
+; CHECK-NEXT:    frintx v7.2s, v22.2s
+; CHECK-NEXT:    fcvtzs v16.2d, v18.2d
+; CHECK-NEXT:    fcvtzs v18.2d, v21.2d
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzs v6.2d, v6.2d
+; CHECK-NEXT:    stp q5, q16, [x8, #160]
+; CHECK-NEXT:    fcvtl v7.2d, v7.2s
+; CHECK-NEXT:    fcvtl v5.2d, v20.2s
+; CHECK-NEXT:    stp q4, q18, [x8, #128]
+; CHECK-NEXT:    fcvtl v4.2d, v17.2s
+; CHECK-NEXT:    stp q3, q6, [x8, #96]
+; CHECK-NEXT:    fcvtzs v7.2d, v7.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v5.2d
+; CHECK-NEXT:    stp q1, q3, [x8, #32]
+; CHECK-NEXT:    stp q2, q7, [x8, #64]
+; CHECK-NEXT:    fcvtzs v2.2d, v4.2d
+; CHECK-NEXT:    stp q0, q2, [x8]
+; CHECK-NEXT:    ret
+  %a = call <32 x i64> @llvm.llrint.v32i64.v32f32(<32 x float> %x)
+  ret <32 x i64> %a
+}
+declare <32 x i64> @llvm.llrint.v32i64.v32f32(<32 x float>)
+
 define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) {
 ; CHECK-LABEL: llrint_v1i64_v1f64:
 ; CHECK:       // %bb.0:
@@ -547,13 +559,8 @@ declare <1 x i64> @llvm.llrint.v1i64.v1f64(<1 x double>)
 define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) {
 ; CHECK-LABEL: llrint_v2i64_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d1, v0.d[1]
-; CHECK-NEXT:    frintx d0, d0
-; CHECK-NEXT:    frintx d1, d1
-; CHECK-NEXT:    fcvtzs x8, d0
-; CHECK-NEXT:    fcvtzs x9, d1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x9
+; CHECK-NEXT:    frintx v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    ret
   %a = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> %x)
   ret <2 x i64> %a
@@ -563,20 +570,10 @@ declare <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double>)
 define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
 ; CHECK-LABEL: llrint_v4i64_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d2, v0.d[1]
-; CHECK-NEXT:    mov d3, v1.d[1]
-; CHECK-NEXT:    frintx d0, d0
-; CHECK-NEXT:    frintx d1, d1
-; CHECK-NEXT:    frintx d2, d2
-; CHECK-NEXT:    frintx d3, d3
-; CHECK-NEXT:    fcvtzs x8, d0
-; CHECK-NEXT:    fcvtzs x9, d1
-; CHECK-NEXT:    fcvtzs x10, d2
-; CHECK-NEXT:    fcvtzs x11, d3
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    mov v0.d[1], x10
-; CHECK-NEXT:    mov v1.d[1], x11
+; CHECK-NEXT:    frintx v0.2d, v0.2d
+; CHECK-NEXT:    frintx v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-NEXT:    ret
   %a = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> %x)
   ret <4 x i64> %a
@@ -586,36 +583,94 @@ declare <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double>)
 define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; CHECK-LABEL: llrint_v8i64_v8f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d4, v0.d[1]
-; CHECK-NEXT:    mov d5, v1.d[1]
-; CHECK-NEXT:    mov d6, v2.d[1]
-; CHECK-NEXT:    mov d7, v3.d[1]
-; CHECK-NEXT:    frintx d0, d0
-; CHECK-NEXT:    frintx d1, d1
-; CHECK-NEXT:    frintx d2, d2
-; CHECK-NEXT:    frintx d3, d3
-; CHECK-NEXT:    frintx d4, d4
-; CHECK-NEXT:    frintx d5, d5
-; CHECK-NEXT:    frintx d6, d6
-; CHECK-NEXT:    frintx d7, d7
-; CHECK-NEXT:    fcvtzs x8, d0
-; CHECK-NEXT:    fcvtzs x9, d1
-; CHECK-NEXT:    fcvtzs x10, d2
-; CHECK-NEXT:    fcvtzs x11, d3
-; CHECK-NEXT:    fcvtzs x12, d4
-; CHECK-NEXT:    fcvtzs x13, d5
-; CHECK-NEXT:    fcvtzs x14, d6
-; CHECK-NEXT:    fcvtzs x15, d7
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    fmov d2, x10
-; CHECK-NEXT:    fmov d3, x11
-; CHECK-NEXT:    mov v0.d[1], x12
-; CHECK-NEXT:    mov v1.d[1], x13
-; CHECK-NEXT:    mov v2.d[1], x14
-; CHECK-NEXT:    mov v3.d[1], x15
+; CHECK-NEXT:    frintx v0.2d, v0.2d
+; CHECK-NEXT:    frintx v1.2d, v1.2d
+; CHECK-NEXT:    frintx v2.2d, v2.2d
+; CHECK-NEXT:    frintx v3.2d, v3.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
 ; CHECK-NEXT:    ret
   %a = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> %x)
   ret <8 x i64> %a
 }
 declare <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double>)
+
+define <16 x i64> @llrint_v16f64(<16 x double> %x) {
+; CHECK-LABEL: llrint_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintx v0.2d, v0.2d
+; CHECK-NEXT:    frintx v1.2d, v1.2d
+; CHECK-NEXT:    frintx v2.2d, v2.2d
+; CHECK-NEXT:    frintx v3.2d, v3.2d
+; CHECK-NEXT:    frintx v4.2d, v4.2d
+; CHECK-NEXT:    frintx v5.2d, v5.2d
+; CHECK-NEXT:    frintx v6.2d, v6.2d
+; CHECK-NEXT:    frintx v7.2d, v7.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-NEXT:    fcvtzs v4.2d, v4.2d
+; CHECK-NEXT:    fcvtzs v5.2d, v5.2d
+; CHECK-NEXT:    fcvtzs v6.2d, v6.2d
+; CHECK-NEXT:    fcvtzs v7.2d, v7.2d
+; CHECK-NEXT:    ret
+  %a = call <16 x i64> @llvm.llrint.v16i64.v16f64(<16 x double> %x)
+  ret <16 x i64> %a
+}
+declare <16 x i64> @llvm.llrint.v16i64.v16f64(<16 x double>)
+
+define <32 x i64> @llrint_v32f64(<32 x double> %x) {
+; CHECK-LABEL: llrint_v32f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q17, q16, [sp, #96]
+; CHECK-NEXT:    frintx v7.2d, v7.2d
+; CHECK-NEXT:    ldp q19, q18, [sp, #64]
+; CHECK-NEXT:    frintx v6.2d, v6.2d
+; CHECK-NEXT:    ldp q21, q20, [sp, #32]
+; CHECK-NEXT:    frintx v5.2d, v5.2d
+; CHECK-NEXT:    frintx v16.2d, v16.2d
+; CHECK-NEXT:    frintx v17.2d, v17.2d
+; CHECK-NEXT:    frintx v4.2d, v4.2d
+; CHECK-NEXT:    frintx v18.2d, v18.2d
+; CHECK-NEXT:    frintx v19.2d, v19.2d
+; CHECK-NEXT:    frintx v3.2d, v3.2d
+; CHECK-NEXT:    ldp q23, q22, [sp]
+; CHECK-NEXT:    frintx v20.2d, v20.2d
+; CHECK-NEXT:    frintx v21.2d, v21.2d
+; CHECK-NEXT:    frintx v2.2d, v2.2d
+; CHECK-NEXT:    frintx v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzs v16.2d, v16.2d
+; CHECK-NEXT:    fcvtzs v17.2d, v17.2d
+; CHECK-NEXT:    frintx v0.2d, v0.2d
+; CHECK-NEXT:    frintx v22.2d, v22.2d
+; CHECK-NEXT:    fcvtzs v18.2d, v18.2d
+; CHECK-NEXT:    frintx v23.2d, v23.2d
+; CHECK-NEXT:    fcvtzs v19.2d, v19.2d
+; CHECK-NEXT:    fcvtzs v20.2d, v20.2d
+; CHECK-NEXT:    fcvtzs v7.2d, v7.2d
+; CHECK-NEXT:    fcvtzs v6.2d, v6.2d
+; CHECK-NEXT:    fcvtzs v5.2d, v5.2d
+; CHECK-NEXT:    fcvtzs v4.2d, v4.2d
+; CHECK-NEXT:    stp q17, q16, [x8, #224]
+; CHECK-NEXT:    fcvtzs v16.2d, v21.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-NEXT:    fcvtzs v17.2d, v22.2d
+; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    stp q19, q18, [x8, #192]
+; CHECK-NEXT:    fcvtzs v18.2d, v23.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    stp q4, q5, [x8, #64]
+; CHECK-NEXT:    stp q6, q7, [x8, #96]
+; CHECK-NEXT:    stp q2, q3, [x8, #32]
+; CHECK-NEXT:    stp q0, q1, [x8]
+; CHECK-NEXT:    stp q18, q17, [x8, #128]
+; CHECK-NEXT:    stp q16, q20, [x8, #160]
+; CHECK-NEXT:    ret
+  %a = call <32 x i64> @llvm.llrint.v32i64.v16f64(<32 x double> %x)
+  ret <32 x i64> %a
+}
+declare <32 x i64> @llvm.llrint.v32i64.v32f64(<32 x double>)
diff --git a/llvm/test/CodeGen/AArch64/vector-lrint.ll b/llvm/test/CodeGen/AArch64/vector-lrint.ll
index db85b2342821..602643264e7b 100644
--- a/llvm/test/CodeGen/AArch64/vector-lrint.ll
+++ b/llvm/test/CodeGen/AArch64/vector-lrint.ll
@@ -1,642 +1,1337 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=aarch64 -mattr=+neon |\
+; RUN:   FileCheck %s --check-prefixes=CHECK-i32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=aarch64 -mattr=+neon |\
+; RUN:   FileCheck %s --check-prefixes=CHECK-i64,CHECK-i64-SD
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=aarch64 -mattr=+neon \
+; RUN:   -global-isel -global-isel-abort=2 2>&1 |\
+; RUN:   FileCheck %s --check-prefixes=CHECK-i32,CHECK-i32-GI
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=aarch64 -mattr=+neon \
+; RUN:   -global-isel -global-isel-abort=2 2>&1 |\
+; RUN:   FileCheck %s --check-prefixes=CHECK-i64,CHECK-i64-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for lrint_v2f16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v4f16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v8f16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v16i64_v16f16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v32i64_v32f16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v2f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v4f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v8f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v16i64_v16f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v2f64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v4f64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v8f64
+; CHECK-i32-GI:       warning: Instruction selection used fallback path for lrint_v1f16
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v2f16
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v4f16
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v8f16
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v16f16
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v32f16
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v1f32
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v2f32
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v4f32
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v8f32
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v16f32
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v32f32
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v1f64
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v2f64
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v4f64
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v8f64
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v16f64
+; CHECK-i32-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v32f64
 
-define <1 x i64> @lrint_v1f16(<1 x half> %x) {
-; CHECK-LABEL: lrint_v1f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    frintx s0, s0
-; CHECK-NEXT:    fcvtzs x8, s0
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    ret
-  %a = call <1 x i64> @llvm.lrint.v1i64.v1f16(<1 x half> %x)
-  ret <1 x i64> %a
+; CHECK-i64-GI:       warning: Instruction selection used fallback path for lrint_v2f16
+; CHECK-i64-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v4f16
+; CHECK-i64-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v8f16
+; CHECK-i64-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v16f16
+; CHECK-i64-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v32f16
+; CHECK-i64-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v2f32
+; CHECK-i64-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v4f32
+; CHECK-i64-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v8f32
+; CHECK-i64-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v16f32
+; CHECK-i64-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v32f32
+; CHECK-i64-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v2f64
+; CHECK-i64-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v4f64
+; CHECK-i64-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v8f64
+; CHECK-i64-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v16f64
+; CHECK-i64-GI-NEXT:  warning: Instruction selection used fallback path for lrint_v32f64
+
+define <1 x iXLen> @lrint_v1f16(<1 x half> %x) {
+; CHECK-i32-LABEL: lrint_v1f16:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    fcvt s0, h0
+; CHECK-i32-NEXT:    frintx s0, s0
+; CHECK-i32-NEXT:    fcvtzs w8, s0
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v1f16:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    fcvt s0, h0
+; CHECK-i64-NEXT:    frintx s0, s0
+; CHECK-i64-NEXT:    fcvtzs x8, s0
+; CHECK-i64-NEXT:    fmov d0, x8
+; CHECK-i64-NEXT:    ret
+  %a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f16(<1 x half> %x)
+  ret <1 x iXLen> %a
 }
-declare <1 x i64> @llvm.lrint.v1i64.v1f16(<1 x half>)
+declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f16(<1 x half>)
 
-define <2 x i64> @lrint_v2f16(<2 x half> %x) {
-; CHECK-LABEL: lrint_v2f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h1, v0.h[1]
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    fcvt s1, h1
-; CHECK-NEXT:    frintx s0, s0
-; CHECK-NEXT:    frintx s1, s1
-; CHECK-NEXT:    fcvtzs x8, s0
-; CHECK-NEXT:    fcvtzs x9, s1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x9
-; CHECK-NEXT:    ret
-  %a = call <2 x i64> @llvm.lrint.v2i64.v2f16(<2 x half> %x)
-  ret <2 x i64> %a
+define <2 x iXLen> @lrint_v2f16(<2 x half> %x) {
+; CHECK-i32-LABEL: lrint_v2f16:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-i32-NEXT:    mov h1, v0.h[1]
+; CHECK-i32-NEXT:    fcvt s0, h0
+; CHECK-i32-NEXT:    fcvt s1, h1
+; CHECK-i32-NEXT:    frintx s0, s0
+; CHECK-i32-NEXT:    frintx s1, s1
+; CHECK-i32-NEXT:    fcvtzs w8, s0
+; CHECK-i32-NEXT:    fcvtzs w9, s1
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    mov v0.s[1], w9
+; CHECK-i32-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v2f16:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-i64-NEXT:    mov h1, v0.h[1]
+; CHECK-i64-NEXT:    fcvt s0, h0
+; CHECK-i64-NEXT:    fcvt s1, h1
+; CHECK-i64-NEXT:    frintx s0, s0
+; CHECK-i64-NEXT:    frintx s1, s1
+; CHECK-i64-NEXT:    fcvtzs x8, s0
+; CHECK-i64-NEXT:    fcvtzs x9, s1
+; CHECK-i64-NEXT:    fmov d0, x8
+; CHECK-i64-NEXT:    mov v0.d[1], x9
+; CHECK-i64-NEXT:    ret
+  %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f16(<2 x half> %x)
+  ret <2 x iXLen> %a
 }
-declare <2 x i64> @llvm.lrint.v2i64.v2f16(<2 x half>)
+declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f16(<2 x half>)
 
-define <4 x i64> @lrint_v4f16(<4 x half> %x) {
-; CHECK-LABEL: lrint_v4f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h1, v0.h[2]
-; CHECK-NEXT:    mov h2, v0.h[1]
-; CHECK-NEXT:    mov h3, v0.h[3]
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    fcvt s1, h1
-; CHECK-NEXT:    fcvt s2, h2
-; CHECK-NEXT:    fcvt s3, h3
-; CHECK-NEXT:    frintx s0, s0
-; CHECK-NEXT:    frintx s1, s1
-; CHECK-NEXT:    frintx s2, s2
-; CHECK-NEXT:    frintx s3, s3
-; CHECK-NEXT:    fcvtzs x8, s0
-; CHECK-NEXT:    fcvtzs x9, s1
-; CHECK-NEXT:    fcvtzs x10, s2
-; CHECK-NEXT:    fcvtzs x11, s3
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    mov v0.d[1], x10
-; CHECK-NEXT:    mov v1.d[1], x11
-; CHECK-NEXT:    ret
-  %a = call <4 x i64> @llvm.lrint.v4i64.v4f16(<4 x half> %x)
-  ret <4 x i64> %a
+define <4 x iXLen> @lrint_v4f16(<4 x half> %x) {
+; CHECK-i32-LABEL: lrint_v4f16:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-i32-NEXT:    mov h1, v0.h[1]
+; CHECK-i32-NEXT:    fcvt s2, h0
+; CHECK-i32-NEXT:    mov h3, v0.h[2]
+; CHECK-i32-NEXT:    mov h0, v0.h[3]
+; CHECK-i32-NEXT:    fcvt s1, h1
+; CHECK-i32-NEXT:    frintx s2, s2
+; CHECK-i32-NEXT:    fcvt s3, h3
+; CHECK-i32-NEXT:    frintx s1, s1
+; CHECK-i32-NEXT:    fcvtzs w8, s2
+; CHECK-i32-NEXT:    fcvt s2, h0
+; CHECK-i32-NEXT:    fcvtzs w9, s1
+; CHECK-i32-NEXT:    frintx s1, s3
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    mov v0.s[1], w9
+; CHECK-i32-NEXT:    fcvtzs w8, s1
+; CHECK-i32-NEXT:    frintx s1, s2
+; CHECK-i32-NEXT:    mov v0.s[2], w8
+; CHECK-i32-NEXT:    fcvtzs w8, s1
+; CHECK-i32-NEXT:    mov v0.s[3], w8
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v4f16:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-i64-NEXT:    mov h1, v0.h[2]
+; CHECK-i64-NEXT:    mov h2, v0.h[1]
+; CHECK-i64-NEXT:    mov h3, v0.h[3]
+; CHECK-i64-NEXT:    fcvt s0, h0
+; CHECK-i64-NEXT:    fcvt s1, h1
+; CHECK-i64-NEXT:    fcvt s2, h2
+; CHECK-i64-NEXT:    fcvt s3, h3
+; CHECK-i64-NEXT:    frintx s0, s0
+; CHECK-i64-NEXT:    frintx s1, s1
+; CHECK-i64-NEXT:    frintx s2, s2
+; CHECK-i64-NEXT:    frintx s3, s3
+; CHECK-i64-NEXT:    fcvtzs x8, s0
+; CHECK-i64-NEXT:    fcvtzs x9, s1
+; CHECK-i64-NEXT:    fcvtzs x10, s2
+; CHECK-i64-NEXT:    fcvtzs x11, s3
+; CHECK-i64-NEXT:    fmov d0, x8
+; CHECK-i64-NEXT:    fmov d1, x9
+; CHECK-i64-NEXT:    mov v0.d[1], x10
+; CHECK-i64-NEXT:    mov v1.d[1], x11
+; CHECK-i64-NEXT:    ret
+  %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f16(<4 x half> %x)
+  ret <4 x iXLen> %a
 }
-declare <4 x i64> @llvm.lrint.v4i64.v4f16(<4 x half>)
+declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f16(<4 x half>)
 
-define <8 x i64> @lrint_v8f16(<8 x half> %x) {
-; CHECK-LABEL: lrint_v8f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    mov h4, v0.h[2]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    mov h7, v0.h[3]
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h5, v1.h[1]
-; CHECK-NEXT:    mov h6, v1.h[3]
-; CHECK-NEXT:    fcvt s1, h1
-; CHECK-NEXT:    fcvt s4, h4
-; CHECK-NEXT:    fcvt s3, h3
-; CHECK-NEXT:    fcvt s7, h7
-; CHECK-NEXT:    frintx s0, s0
-; CHECK-NEXT:    fcvt s2, h2
-; CHECK-NEXT:    fcvt s5, h5
-; CHECK-NEXT:    fcvt s6, h6
-; CHECK-NEXT:    frintx s1, s1
-; CHECK-NEXT:    frintx s4, s4
-; CHECK-NEXT:    frintx s3, s3
-; CHECK-NEXT:    frintx s7, s7
-; CHECK-NEXT:    fcvtzs x9, s0
-; CHECK-NEXT:    frintx s2, s2
-; CHECK-NEXT:    frintx s5, s5
-; CHECK-NEXT:    frintx s6, s6
-; CHECK-NEXT:    fcvtzs x8, s1
-; CHECK-NEXT:    fcvtzs x12, s4
-; CHECK-NEXT:    fcvtzs x11, s3
-; CHECK-NEXT:    fcvtzs x15, s7
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    fcvtzs x10, s2
-; CHECK-NEXT:    fcvtzs x13, s5
-; CHECK-NEXT:    fcvtzs x14, s6
-; CHECK-NEXT:    fmov d2, x8
-; CHECK-NEXT:    fmov d1, x12
-; CHECK-NEXT:    mov v0.d[1], x11
-; CHECK-NEXT:    fmov d3, x10
-; CHECK-NEXT:    mov v2.d[1], x13
-; CHECK-NEXT:    mov v1.d[1], x15
-; CHECK-NEXT:    mov v3.d[1], x14
-; CHECK-NEXT:    ret
-  %a = call <8 x i64> @llvm.lrint.v8i64.v8f16(<8 x half> %x)
-  ret <8 x i64> %a
+define <8 x iXLen> @lrint_v8f16(<8 x half> %x) {
+; CHECK-i32-LABEL: lrint_v8f16:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-i32-NEXT:    mov h3, v0.h[1]
+; CHECK-i32-NEXT:    fcvt s6, h0
+; CHECK-i32-NEXT:    mov h4, v0.h[2]
+; CHECK-i32-NEXT:    mov h0, v0.h[3]
+; CHECK-i32-NEXT:    mov h2, v1.h[1]
+; CHECK-i32-NEXT:    fcvt s5, h1
+; CHECK-i32-NEXT:    mov h7, v1.h[2]
+; CHECK-i32-NEXT:    fcvt s3, h3
+; CHECK-i32-NEXT:    frintx s6, s6
+; CHECK-i32-NEXT:    fcvt s4, h4
+; CHECK-i32-NEXT:    mov h1, v1.h[3]
+; CHECK-i32-NEXT:    fcvt s2, h2
+; CHECK-i32-NEXT:    frintx s5, s5
+; CHECK-i32-NEXT:    fcvt s7, h7
+; CHECK-i32-NEXT:    frintx s3, s3
+; CHECK-i32-NEXT:    fcvtzs w9, s6
+; CHECK-i32-NEXT:    frintx s4, s4
+; CHECK-i32-NEXT:    frintx s2, s2
+; CHECK-i32-NEXT:    fcvtzs w8, s5
+; CHECK-i32-NEXT:    fcvt s5, h1
+; CHECK-i32-NEXT:    fcvtzs w11, s3
+; CHECK-i32-NEXT:    fcvt s3, h0
+; CHECK-i32-NEXT:    fmov s0, w9
+; CHECK-i32-NEXT:    fcvtzs w12, s4
+; CHECK-i32-NEXT:    fcvtzs w10, s2
+; CHECK-i32-NEXT:    frintx s2, s7
+; CHECK-i32-NEXT:    fmov s1, w8
+; CHECK-i32-NEXT:    mov v0.s[1], w11
+; CHECK-i32-NEXT:    fcvtzs w8, s2
+; CHECK-i32-NEXT:    mov v1.s[1], w10
+; CHECK-i32-NEXT:    frintx s2, s3
+; CHECK-i32-NEXT:    frintx s3, s5
+; CHECK-i32-NEXT:    mov v0.s[2], w12
+; CHECK-i32-NEXT:    mov v1.s[2], w8
+; CHECK-i32-NEXT:    fcvtzs w9, s2
+; CHECK-i32-NEXT:    fcvtzs w8, s3
+; CHECK-i32-NEXT:    mov v0.s[3], w9
+; CHECK-i32-NEXT:    mov v1.s[3], w8
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v8f16:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-i64-NEXT:    mov h4, v0.h[2]
+; CHECK-i64-NEXT:    mov h3, v0.h[1]
+; CHECK-i64-NEXT:    mov h7, v0.h[3]
+; CHECK-i64-NEXT:    fcvt s0, h0
+; CHECK-i64-NEXT:    mov h2, v1.h[2]
+; CHECK-i64-NEXT:    mov h5, v1.h[1]
+; CHECK-i64-NEXT:    mov h6, v1.h[3]
+; CHECK-i64-NEXT:    fcvt s1, h1
+; CHECK-i64-NEXT:    fcvt s4, h4
+; CHECK-i64-NEXT:    fcvt s3, h3
+; CHECK-i64-NEXT:    fcvt s7, h7
+; CHECK-i64-NEXT:    frintx s0, s0
+; CHECK-i64-NEXT:    fcvt s2, h2
+; CHECK-i64-NEXT:    fcvt s5, h5
+; CHECK-i64-NEXT:    fcvt s6, h6
+; CHECK-i64-NEXT:    frintx s1, s1
+; CHECK-i64-NEXT:    frintx s4, s4
+; CHECK-i64-NEXT:    frintx s3, s3
+; CHECK-i64-NEXT:    frintx s7, s7
+; CHECK-i64-NEXT:    fcvtzs x9, s0
+; CHECK-i64-NEXT:    frintx s2, s2
+; CHECK-i64-NEXT:    frintx s5, s5
+; CHECK-i64-NEXT:    frintx s6, s6
+; CHECK-i64-NEXT:    fcvtzs x8, s1
+; CHECK-i64-NEXT:    fcvtzs x12, s4
+; CHECK-i64-NEXT:    fcvtzs x11, s3
+; CHECK-i64-NEXT:    fcvtzs x15, s7
+; CHECK-i64-NEXT:    fmov d0, x9
+; CHECK-i64-NEXT:    fcvtzs x10, s2
+; CHECK-i64-NEXT:    fcvtzs x13, s5
+; CHECK-i64-NEXT:    fcvtzs x14, s6
+; CHECK-i64-NEXT:    fmov d2, x8
+; CHECK-i64-NEXT:    fmov d1, x12
+; CHECK-i64-NEXT:    mov v0.d[1], x11
+; CHECK-i64-NEXT:    fmov d3, x10
+; CHECK-i64-NEXT:    mov v2.d[1], x13
+; CHECK-i64-NEXT:    mov v1.d[1], x15
+; CHECK-i64-NEXT:    mov v3.d[1], x14
+; CHECK-i64-NEXT:    ret
+  %a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f16(<8 x half> %x)
+  ret <8 x iXLen> %a
 }
-declare <8 x i64> @llvm.lrint.v8i64.v8f16(<8 x half>)
+declare <8 x iXLen> @llvm.lrint.v8iXLen.v8f16(<8 x half>)
 
-define <16 x i64> @lrint_v16i64_v16f16(<16 x half> %x) {
-; CHECK-LABEL: lrint_v16i64_v16f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    mov h17, v0.h[1]
-; CHECK-NEXT:    mov h19, v0.h[2]
-; CHECK-NEXT:    fcvt s18, h0
-; CHECK-NEXT:    mov h0, v0.h[3]
-; CHECK-NEXT:    mov h4, v2.h[1]
-; CHECK-NEXT:    mov h5, v2.h[2]
-; CHECK-NEXT:    fcvt s7, h3
-; CHECK-NEXT:    fcvt s6, h2
-; CHECK-NEXT:    mov h16, v3.h[2]
-; CHECK-NEXT:    mov h2, v2.h[3]
-; CHECK-NEXT:    fcvt s17, h17
-; CHECK-NEXT:    fcvt s19, h19
-; CHECK-NEXT:    frintx s18, s18
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    fcvt s4, h4
-; CHECK-NEXT:    fcvt s5, h5
-; CHECK-NEXT:    frintx s7, s7
-; CHECK-NEXT:    frintx s6, s6
-; CHECK-NEXT:    fcvt s16, h16
-; CHECK-NEXT:    fcvt s2, h2
-; CHECK-NEXT:    frintx s17, s17
-; CHECK-NEXT:    frintx s19, s19
-; CHECK-NEXT:    fcvtzs x13, s18
-; CHECK-NEXT:    frintx s0, s0
-; CHECK-NEXT:    frintx s4, s4
-; CHECK-NEXT:    frintx s5, s5
-; CHECK-NEXT:    fcvtzs x9, s7
-; CHECK-NEXT:    mov h7, v1.h[2]
-; CHECK-NEXT:    fcvtzs x8, s6
-; CHECK-NEXT:    mov h6, v1.h[1]
-; CHECK-NEXT:    frintx s16, s16
-; CHECK-NEXT:    fcvtzs x14, s17
-; CHECK-NEXT:    fcvtzs x15, s19
-; CHECK-NEXT:    fcvtzs x10, s4
-; CHECK-NEXT:    mov h4, v3.h[1]
-; CHECK-NEXT:    fcvtzs x11, s5
-; CHECK-NEXT:    mov h5, v1.h[3]
-; CHECK-NEXT:    mov h3, v3.h[3]
-; CHECK-NEXT:    fcvt s1, h1
-; CHECK-NEXT:    fcvt s7, h7
-; CHECK-NEXT:    fcvt s6, h6
-; CHECK-NEXT:    fcvtzs x12, s16
-; CHECK-NEXT:    frintx s16, s2
-; CHECK-NEXT:    fmov d2, x8
-; CHECK-NEXT:    fcvt s4, h4
-; CHECK-NEXT:    fcvt s3, h3
-; CHECK-NEXT:    fcvt s5, h5
-; CHECK-NEXT:    frintx s1, s1
-; CHECK-NEXT:    frintx s7, s7
-; CHECK-NEXT:    frintx s17, s6
-; CHECK-NEXT:    fmov d6, x9
-; CHECK-NEXT:    mov v2.d[1], x10
-; CHECK-NEXT:    frintx s4, s4
-; CHECK-NEXT:    frintx s18, s3
-; CHECK-NEXT:    frintx s5, s5
-; CHECK-NEXT:    fcvtzs x8, s1
-; CHECK-NEXT:    fcvtzs x9, s7
-; CHECK-NEXT:    fmov d3, x11
-; CHECK-NEXT:    fcvtzs x11, s0
-; CHECK-NEXT:    fmov d7, x12
-; CHECK-NEXT:    fcvtzs x12, s16
-; CHECK-NEXT:    fcvtzs x16, s17
-; CHECK-NEXT:    fcvtzs x17, s4
-; CHECK-NEXT:    fmov d0, x13
-; CHECK-NEXT:    fmov d1, x15
-; CHECK-NEXT:    fcvtzs x18, s18
-; CHECK-NEXT:    fcvtzs x0, s5
-; CHECK-NEXT:    fmov d4, x8
-; CHECK-NEXT:    fmov d5, x9
-; CHECK-NEXT:    mov v0.d[1], x14
-; CHECK-NEXT:    mov v1.d[1], x11
-; CHECK-NEXT:    mov v3.d[1], x12
-; CHECK-NEXT:    mov v4.d[1], x16
-; CHECK-NEXT:    mov v6.d[1], x17
-; CHECK-NEXT:    mov v7.d[1], x18
-; CHECK-NEXT:    mov v5.d[1], x0
-; CHECK-NEXT:    ret
-  %a = call <16 x i64> @llvm.lrint.v16i64.v16f16(<16 x half> %x)
-  ret <16 x i64> %a
+define <16 x iXLen> @lrint_v16f16(<16 x half> %x) {
+; CHECK-i32-LABEL: lrint_v16f16:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-i32-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-i32-NEXT:    mov h18, v0.h[1]
+; CHECK-i32-NEXT:    mov h19, v1.h[1]
+; CHECK-i32-NEXT:    fcvt s20, h0
+; CHECK-i32-NEXT:    mov h21, v0.h[2]
+; CHECK-i32-NEXT:    mov h0, v0.h[3]
+; CHECK-i32-NEXT:    mov h4, v2.h[1]
+; CHECK-i32-NEXT:    mov h5, v2.h[2]
+; CHECK-i32-NEXT:    fcvt s6, h2
+; CHECK-i32-NEXT:    fcvt s7, h3
+; CHECK-i32-NEXT:    mov h16, v3.h[1]
+; CHECK-i32-NEXT:    mov h17, v3.h[2]
+; CHECK-i32-NEXT:    fcvt s18, h18
+; CHECK-i32-NEXT:    fcvt s19, h19
+; CHECK-i32-NEXT:    mov h2, v2.h[3]
+; CHECK-i32-NEXT:    fcvt s4, h4
+; CHECK-i32-NEXT:    fcvt s5, h5
+; CHECK-i32-NEXT:    frintx s6, s6
+; CHECK-i32-NEXT:    frintx s7, s7
+; CHECK-i32-NEXT:    fcvt s16, h16
+; CHECK-i32-NEXT:    fcvt s17, h17
+; CHECK-i32-NEXT:    frintx s18, s18
+; CHECK-i32-NEXT:    fcvt s2, h2
+; CHECK-i32-NEXT:    frintx s4, s4
+; CHECK-i32-NEXT:    frintx s5, s5
+; CHECK-i32-NEXT:    fcvtzs w8, s6
+; CHECK-i32-NEXT:    fcvt s6, h1
+; CHECK-i32-NEXT:    fcvtzs w9, s7
+; CHECK-i32-NEXT:    mov h7, v1.h[2]
+; CHECK-i32-NEXT:    frintx s16, s16
+; CHECK-i32-NEXT:    fcvtzs w15, s18
+; CHECK-i32-NEXT:    fcvtzs w10, s4
+; CHECK-i32-NEXT:    frintx s4, s17
+; CHECK-i32-NEXT:    fcvtzs w11, s5
+; CHECK-i32-NEXT:    frintx s5, s20
+; CHECK-i32-NEXT:    fcvt s17, h21
+; CHECK-i32-NEXT:    frintx s6, s6
+; CHECK-i32-NEXT:    fcvtzs w12, s16
+; CHECK-i32-NEXT:    frintx s16, s19
+; CHECK-i32-NEXT:    fcvt s7, h7
+; CHECK-i32-NEXT:    mov h19, v1.h[3]
+; CHECK-i32-NEXT:    fmov s1, w8
+; CHECK-i32-NEXT:    fcvtzs w13, s4
+; CHECK-i32-NEXT:    mov h4, v3.h[3]
+; CHECK-i32-NEXT:    fmov s3, w9
+; CHECK-i32-NEXT:    fcvtzs w14, s5
+; CHECK-i32-NEXT:    frintx s5, s17
+; CHECK-i32-NEXT:    fcvtzs w16, s6
+; CHECK-i32-NEXT:    fcvt s17, h0
+; CHECK-i32-NEXT:    fcvtzs w8, s16
+; CHECK-i32-NEXT:    frintx s6, s7
+; CHECK-i32-NEXT:    fcvt s7, h19
+; CHECK-i32-NEXT:    mov v1.s[1], w10
+; CHECK-i32-NEXT:    mov v3.s[1], w12
+; CHECK-i32-NEXT:    fcvt s4, h4
+; CHECK-i32-NEXT:    fcvtzs w9, s5
+; CHECK-i32-NEXT:    fmov s0, w14
+; CHECK-i32-NEXT:    frintx s5, s2
+; CHECK-i32-NEXT:    fmov s2, w16
+; CHECK-i32-NEXT:    frintx s16, s17
+; CHECK-i32-NEXT:    fcvtzs w10, s6
+; CHECK-i32-NEXT:    frintx s6, s7
+; CHECK-i32-NEXT:    mov v1.s[2], w11
+; CHECK-i32-NEXT:    mov v3.s[2], w13
+; CHECK-i32-NEXT:    mov v0.s[1], w15
+; CHECK-i32-NEXT:    frintx s4, s4
+; CHECK-i32-NEXT:    mov v2.s[1], w8
+; CHECK-i32-NEXT:    fcvtzs w8, s5
+; CHECK-i32-NEXT:    fcvtzs w12, s16
+; CHECK-i32-NEXT:    mov v0.s[2], w9
+; CHECK-i32-NEXT:    fcvtzs w9, s4
+; CHECK-i32-NEXT:    mov v2.s[2], w10
+; CHECK-i32-NEXT:    fcvtzs w10, s6
+; CHECK-i32-NEXT:    mov v1.s[3], w8
+; CHECK-i32-NEXT:    mov v0.s[3], w12
+; CHECK-i32-NEXT:    mov v3.s[3], w9
+; CHECK-i32-NEXT:    mov v2.s[3], w10
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v16f16:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-i64-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-i64-NEXT:    mov h17, v0.h[1]
+; CHECK-i64-NEXT:    mov h19, v0.h[2]
+; CHECK-i64-NEXT:    fcvt s18, h0
+; CHECK-i64-NEXT:    mov h0, v0.h[3]
+; CHECK-i64-NEXT:    mov h4, v2.h[1]
+; CHECK-i64-NEXT:    mov h5, v2.h[2]
+; CHECK-i64-NEXT:    fcvt s7, h3
+; CHECK-i64-NEXT:    fcvt s6, h2
+; CHECK-i64-NEXT:    mov h16, v3.h[2]
+; CHECK-i64-NEXT:    mov h2, v2.h[3]
+; CHECK-i64-NEXT:    fcvt s17, h17
+; CHECK-i64-NEXT:    fcvt s19, h19
+; CHECK-i64-NEXT:    frintx s18, s18
+; CHECK-i64-NEXT:    fcvt s0, h0
+; CHECK-i64-NEXT:    fcvt s4, h4
+; CHECK-i64-NEXT:    fcvt s5, h5
+; CHECK-i64-NEXT:    frintx s7, s7
+; CHECK-i64-NEXT:    frintx s6, s6
+; CHECK-i64-NEXT:    fcvt s16, h16
+; CHECK-i64-NEXT:    fcvt s2, h2
+; CHECK-i64-NEXT:    frintx s17, s17
+; CHECK-i64-NEXT:    frintx s19, s19
+; CHECK-i64-NEXT:    fcvtzs x13, s18
+; CHECK-i64-NEXT:    frintx s0, s0
+; CHECK-i64-NEXT:    frintx s4, s4
+; CHECK-i64-NEXT:    frintx s5, s5
+; CHECK-i64-NEXT:    fcvtzs x9, s7
+; CHECK-i64-NEXT:    mov h7, v1.h[2]
+; CHECK-i64-NEXT:    fcvtzs x8, s6
+; CHECK-i64-NEXT:    mov h6, v1.h[1]
+; CHECK-i64-NEXT:    frintx s16, s16
+; CHECK-i64-NEXT:    fcvtzs x14, s17
+; CHECK-i64-NEXT:    fcvtzs x15, s19
+; CHECK-i64-NEXT:    fcvtzs x10, s4
+; CHECK-i64-NEXT:    mov h4, v3.h[1]
+; CHECK-i64-NEXT:    fcvtzs x11, s5
+; CHECK-i64-NEXT:    mov h5, v1.h[3]
+; CHECK-i64-NEXT:    mov h3, v3.h[3]
+; CHECK-i64-NEXT:    fcvt s1, h1
+; CHECK-i64-NEXT:    fcvt s7, h7
+; CHECK-i64-NEXT:    fcvt s6, h6
+; CHECK-i64-NEXT:    fcvtzs x12, s16
+; CHECK-i64-NEXT:    frintx s16, s2
+; CHECK-i64-NEXT:    fmov d2, x8
+; CHECK-i64-NEXT:    fcvt s4, h4
+; CHECK-i64-NEXT:    fcvt s3, h3
+; CHECK-i64-NEXT:    fcvt s5, h5
+; CHECK-i64-NEXT:    frintx s1, s1
+; CHECK-i64-NEXT:    frintx s7, s7
+; CHECK-i64-NEXT:    frintx s17, s6
+; CHECK-i64-NEXT:    fmov d6, x9
+; CHECK-i64-NEXT:    mov v2.d[1], x10
+; CHECK-i64-NEXT:    frintx s4, s4
+; CHECK-i64-NEXT:    frintx s18, s3
+; CHECK-i64-NEXT:    frintx s5, s5
+; CHECK-i64-NEXT:    fcvtzs x8, s1
+; CHECK-i64-NEXT:    fcvtzs x9, s7
+; CHECK-i64-NEXT:    fmov d3, x11
+; CHECK-i64-NEXT:    fcvtzs x11, s0
+; CHECK-i64-NEXT:    fmov d7, x12
+; CHECK-i64-NEXT:    fcvtzs x12, s16
+; CHECK-i64-NEXT:    fcvtzs x16, s17
+; CHECK-i64-NEXT:    fcvtzs x17, s4
+; CHECK-i64-NEXT:    fmov d0, x13
+; CHECK-i64-NEXT:    fmov d1, x15
+; CHECK-i64-NEXT:    fcvtzs x18, s18
+; CHECK-i64-NEXT:    fcvtzs x0, s5
+; CHECK-i64-NEXT:    fmov d4, x8
+; CHECK-i64-NEXT:    fmov d5, x9
+; CHECK-i64-NEXT:    mov v0.d[1], x14
+; CHECK-i64-NEXT:    mov v1.d[1], x11
+; CHECK-i64-NEXT:    mov v3.d[1], x12
+; CHECK-i64-NEXT:    mov v4.d[1], x16
+; CHECK-i64-NEXT:    mov v6.d[1], x17
+; CHECK-i64-NEXT:    mov v7.d[1], x18
+; CHECK-i64-NEXT:    mov v5.d[1], x0
+; CHECK-i64-NEXT:    ret
+  %a = call <16 x iXLen> @llvm.lrint.v16iXLen.v16f16(<16 x half> %x)
+  ret <16 x iXLen> %a
 }
-declare <16 x i64> @llvm.lrint.v16i64.v16f16(<16 x half>)
+declare <16 x iXLen> @llvm.lrint.v16iXLen.v16f16(<16 x half>)
 
-define <32 x i64> @lrint_v32i64_v32f16(<32 x half> %x) {
-; CHECK-LABEL: lrint_v32i64_v32f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
-; CHECK-NEXT:    ext v6.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT:    ext v7.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    mov h19, v0.h[1]
-; CHECK-NEXT:    fcvt s21, h0
-; CHECK-NEXT:    mov h23, v1.h[2]
-; CHECK-NEXT:    fcvt s22, h1
-; CHECK-NEXT:    fcvt s26, h2
-; CHECK-NEXT:    mov h27, v2.h[1]
-; CHECK-NEXT:    mov h28, v2.h[2]
-; CHECK-NEXT:    mov h16, v4.h[2]
-; CHECK-NEXT:    fcvt s17, h5
-; CHECK-NEXT:    mov h18, v5.h[2]
-; CHECK-NEXT:    mov h20, v6.h[2]
-; CHECK-NEXT:    fcvt s24, h7
-; CHECK-NEXT:    fcvt s25, h6
-; CHECK-NEXT:    fcvt s19, h19
-; CHECK-NEXT:    frintx s22, s22
-; CHECK-NEXT:    fcvt s16, h16
-; CHECK-NEXT:    frintx s17, s17
-; CHECK-NEXT:    fcvt s18, h18
-; CHECK-NEXT:    fcvt s20, h20
-; CHECK-NEXT:    frintx s16, s16
-; CHECK-NEXT:    fcvtzs x12, s17
-; CHECK-NEXT:    frintx s17, s18
-; CHECK-NEXT:    frintx s18, s21
-; CHECK-NEXT:    fcvt s21, h23
-; CHECK-NEXT:    frintx s23, s24
-; CHECK-NEXT:    frintx s24, s25
-; CHECK-NEXT:    frintx s25, s19
-; CHECK-NEXT:    mov h19, v7.h[1]
-; CHECK-NEXT:    fcvtzs x13, s16
-; CHECK-NEXT:    frintx s16, s20
-; CHECK-NEXT:    frintx s20, s26
-; CHECK-NEXT:    fcvtzs x9, s23
-; CHECK-NEXT:    mov h23, v3.h[2]
-; CHECK-NEXT:    fcvt s26, h27
-; CHECK-NEXT:    fcvtzs x15, s24
-; CHECK-NEXT:    fcvtzs x10, s25
-; CHECK-NEXT:    fcvt s24, h28
-; CHECK-NEXT:    mov h25, v3.h[3]
-; CHECK-NEXT:    fcvtzs x14, s17
-; CHECK-NEXT:    frintx s21, s21
-; CHECK-NEXT:    fmov d17, x12
-; CHECK-NEXT:    fcvtzs x12, s16
-; CHECK-NEXT:    fmov d16, x13
-; CHECK-NEXT:    fcvtzs x13, s22
-; CHECK-NEXT:    fcvt s22, h3
-; CHECK-NEXT:    mov h3, v3.h[1]
-; CHECK-NEXT:    mov h27, v0.h[2]
-; CHECK-NEXT:    mov h28, v2.h[3]
-; CHECK-NEXT:    fcvt s23, h23
-; CHECK-NEXT:    frintx s26, s26
-; CHECK-NEXT:    fcvtzs x16, s20
-; CHECK-NEXT:    frintx s20, s24
-; CHECK-NEXT:    fcvt s24, h25
-; CHECK-NEXT:    fcvtzs x11, s18
-; CHECK-NEXT:    fmov d18, x14
-; CHECK-NEXT:    fcvtzs x14, s21
-; CHECK-NEXT:    frintx s22, s22
-; CHECK-NEXT:    fcvt s3, h3
-; CHECK-NEXT:    fcvt s25, h27
-; CHECK-NEXT:    fcvt s27, h28
-; CHECK-NEXT:    frintx s23, s23
-; CHECK-NEXT:    mov h21, v1.h[3]
-; CHECK-NEXT:    fmov d2, x15
-; CHECK-NEXT:    fcvtzs x15, s26
-; CHECK-NEXT:    fmov d26, x13
-; CHECK-NEXT:    mov h1, v1.h[1]
-; CHECK-NEXT:    fcvtzs x13, s20
-; CHECK-NEXT:    frintx s20, s24
-; CHECK-NEXT:    fmov d24, x14
-; CHECK-NEXT:    fcvtzs x14, s22
-; CHECK-NEXT:    frintx s3, s3
-; CHECK-NEXT:    fmov d22, x16
-; CHECK-NEXT:    frintx s27, s27
-; CHECK-NEXT:    fcvtzs x16, s23
-; CHECK-NEXT:    fcvt s21, h21
-; CHECK-NEXT:    frintx s25, s25
-; CHECK-NEXT:    fcvt s1, h1
-; CHECK-NEXT:    mov h0, v0.h[3]
-; CHECK-NEXT:    mov h23, v7.h[2]
-; CHECK-NEXT:    mov v22.d[1], x15
-; CHECK-NEXT:    fcvtzs x15, s20
-; CHECK-NEXT:    fmov d20, x13
-; CHECK-NEXT:    fcvtzs x13, s3
-; CHECK-NEXT:    fmov d3, x14
-; CHECK-NEXT:    fcvtzs x14, s27
-; CHECK-NEXT:    fmov d27, x16
-; CHECK-NEXT:    frintx s21, s21
-; CHECK-NEXT:    mov h7, v7.h[3]
-; CHECK-NEXT:    frintx s1, s1
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    fcvt s23, h23
-; CHECK-NEXT:    fcvt s19, h19
-; CHECK-NEXT:    mov v27.d[1], x15
-; CHECK-NEXT:    fcvtzs x15, s25
-; CHECK-NEXT:    mov h25, v6.h[3]
-; CHECK-NEXT:    mov h6, v6.h[1]
-; CHECK-NEXT:    mov v3.d[1], x13
-; CHECK-NEXT:    fcvtzs x13, s21
-; CHECK-NEXT:    mov h21, v5.h[1]
-; CHECK-NEXT:    mov h5, v5.h[3]
-; CHECK-NEXT:    mov v20.d[1], x14
-; CHECK-NEXT:    fcvtzs x14, s1
-; CHECK-NEXT:    mov h1, v4.h[1]
-; CHECK-NEXT:    frintx s0, s0
-; CHECK-NEXT:    fcvt s25, h25
-; CHECK-NEXT:    fcvt s7, h7
-; CHECK-NEXT:    stp q3, q27, [x8, #192]
-; CHECK-NEXT:    fcvt s6, h6
-; CHECK-NEXT:    mov h3, v4.h[3]
-; CHECK-NEXT:    stp q22, q20, [x8, #128]
-; CHECK-NEXT:    fcvt s21, h21
-; CHECK-NEXT:    fcvt s5, h5
-; CHECK-NEXT:    mov v24.d[1], x13
-; CHECK-NEXT:    mov v26.d[1], x14
-; CHECK-NEXT:    fcvt s4, h4
-; CHECK-NEXT:    frintx s22, s25
-; CHECK-NEXT:    fmov d20, x12
-; CHECK-NEXT:    fcvt s1, h1
-; CHECK-NEXT:    frintx s6, s6
-; CHECK-NEXT:    fcvt s3, h3
-; CHECK-NEXT:    fcvtzs x12, s0
-; CHECK-NEXT:    frintx s5, s5
-; CHECK-NEXT:    frintx s21, s21
-; CHECK-NEXT:    fmov d0, x11
-; CHECK-NEXT:    stp q26, q24, [x8, #64]
-; CHECK-NEXT:    fmov d24, x15
-; CHECK-NEXT:    frintx s4, s4
-; CHECK-NEXT:    fcvtzs x11, s22
-; CHECK-NEXT:    frintx s22, s23
-; CHECK-NEXT:    frintx s1, s1
-; CHECK-NEXT:    fcvtzs x13, s6
-; CHECK-NEXT:    frintx s3, s3
-; CHECK-NEXT:    frintx s6, s7
-; CHECK-NEXT:    fcvtzs x14, s5
-; CHECK-NEXT:    mov v24.d[1], x12
-; CHECK-NEXT:    frintx s5, s19
-; CHECK-NEXT:    fcvtzs x12, s21
-; CHECK-NEXT:    mov v0.d[1], x10
-; CHECK-NEXT:    fcvtzs x10, s4
-; CHECK-NEXT:    mov v20.d[1], x11
-; CHECK-NEXT:    fcvtzs x11, s22
-; CHECK-NEXT:    mov v2.d[1], x13
-; CHECK-NEXT:    fcvtzs x15, s3
-; CHECK-NEXT:    fcvtzs x13, s1
-; CHECK-NEXT:    mov v18.d[1], x14
-; CHECK-NEXT:    fcvtzs x14, s6
-; CHECK-NEXT:    stp q0, q24, [x8]
-; CHECK-NEXT:    mov v17.d[1], x12
-; CHECK-NEXT:    fcvtzs x12, s5
-; CHECK-NEXT:    fmov d0, x10
-; CHECK-NEXT:    fmov d1, x11
-; CHECK-NEXT:    stp q2, q20, [x8, #224]
-; CHECK-NEXT:    fmov d2, x9
-; CHECK-NEXT:    mov v16.d[1], x15
-; CHECK-NEXT:    stp q17, q18, [x8, #160]
-; CHECK-NEXT:    mov v0.d[1], x13
-; CHECK-NEXT:    mov v1.d[1], x14
-; CHECK-NEXT:    mov v2.d[1], x12
-; CHECK-NEXT:    stp q0, q16, [x8, #96]
-; CHECK-NEXT:    stp q2, q1, [x8, #32]
-; CHECK-NEXT:    ret
-  %a = call <32 x i64> @llvm.lrint.v32i64.v32f16(<32 x half> %x)
-  ret <32 x i64> %a
+define <32 x iXLen> @lrint_v32f16(<32 x half> %x) {
+; CHECK-i32-LABEL: lrint_v32f16:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    ext v5.16b, v0.16b, v0.16b, #8
+; CHECK-i32-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-i32-NEXT:    ext v17.16b, v2.16b, v2.16b, #8
+; CHECK-i32-NEXT:    mov h6, v5.h[1]
+; CHECK-i32-NEXT:    fcvt s7, h5
+; CHECK-i32-NEXT:    mov h16, v5.h[2]
+; CHECK-i32-NEXT:    mov h5, v5.h[3]
+; CHECK-i32-NEXT:    mov h18, v4.h[1]
+; CHECK-i32-NEXT:    mov h20, v4.h[3]
+; CHECK-i32-NEXT:    mov h19, v4.h[2]
+; CHECK-i32-NEXT:    fcvt s21, h4
+; CHECK-i32-NEXT:    mov h23, v17.h[1]
+; CHECK-i32-NEXT:    ext v4.16b, v3.16b, v3.16b, #8
+; CHECK-i32-NEXT:    fcvt s22, h17
+; CHECK-i32-NEXT:    fcvt s6, h6
+; CHECK-i32-NEXT:    frintx s7, s7
+; CHECK-i32-NEXT:    fcvt s16, h16
+; CHECK-i32-NEXT:    fcvt s5, h5
+; CHECK-i32-NEXT:    fcvt s18, h18
+; CHECK-i32-NEXT:    fcvt s20, h20
+; CHECK-i32-NEXT:    fcvt s19, h19
+; CHECK-i32-NEXT:    frintx s22, s22
+; CHECK-i32-NEXT:    frintx s6, s6
+; CHECK-i32-NEXT:    fcvtzs w12, s7
+; CHECK-i32-NEXT:    frintx s7, s16
+; CHECK-i32-NEXT:    frintx s5, s5
+; CHECK-i32-NEXT:    frintx s16, s21
+; CHECK-i32-NEXT:    fcvt s21, h23
+; CHECK-i32-NEXT:    frintx s18, s18
+; CHECK-i32-NEXT:    frintx s20, s20
+; CHECK-i32-NEXT:    frintx s19, s19
+; CHECK-i32-NEXT:    fcvtzs w15, s22
+; CHECK-i32-NEXT:    mov h22, v1.h[2]
+; CHECK-i32-NEXT:    fcvtzs w17, s6
+; CHECK-i32-NEXT:    mov h6, v17.h[2]
+; CHECK-i32-NEXT:    mov h17, v17.h[3]
+; CHECK-i32-NEXT:    fcvtzs w9, s7
+; CHECK-i32-NEXT:    mov h7, v4.h[2]
+; CHECK-i32-NEXT:    fcvtzs w8, s5
+; CHECK-i32-NEXT:    mov h5, v4.h[1]
+; CHECK-i32-NEXT:    fcvtzs w13, s16
+; CHECK-i32-NEXT:    frintx s16, s21
+; CHECK-i32-NEXT:    fcvtzs w14, s18
+; CHECK-i32-NEXT:    fcvtzs w10, s20
+; CHECK-i32-NEXT:    fcvt s18, h4
+; CHECK-i32-NEXT:    fcvt s6, h6
+; CHECK-i32-NEXT:    fcvt s17, h17
+; CHECK-i32-NEXT:    mov h20, v0.h[2]
+; CHECK-i32-NEXT:    fcvt s7, h7
+; CHECK-i32-NEXT:    fcvtzs w11, s19
+; CHECK-i32-NEXT:    mov h19, v0.h[1]
+; CHECK-i32-NEXT:    fcvt s5, h5
+; CHECK-i32-NEXT:    fcvtzs w0, s16
+; CHECK-i32-NEXT:    mov h21, v1.h[1]
+; CHECK-i32-NEXT:    frintx s18, s18
+; CHECK-i32-NEXT:    mov h4, v4.h[3]
+; CHECK-i32-NEXT:    frintx s6, s6
+; CHECK-i32-NEXT:    frintx s16, s17
+; CHECK-i32-NEXT:    mov h17, v0.h[3]
+; CHECK-i32-NEXT:    fcvt s0, h0
+; CHECK-i32-NEXT:    fcvt s19, h19
+; CHECK-i32-NEXT:    frintx s5, s5
+; CHECK-i32-NEXT:    fcvtzs w2, s18
+; CHECK-i32-NEXT:    fcvt s18, h21
+; CHECK-i32-NEXT:    fcvt s21, h2
+; CHECK-i32-NEXT:    fcvtzs w18, s6
+; CHECK-i32-NEXT:    frintx s6, s7
+; CHECK-i32-NEXT:    fcvt s7, h20
+; CHECK-i32-NEXT:    fcvtzs w16, s16
+; CHECK-i32-NEXT:    fcvt s16, h17
+; CHECK-i32-NEXT:    fcvt s17, h1
+; CHECK-i32-NEXT:    frintx s0, s0
+; CHECK-i32-NEXT:    fcvtzs w3, s5
+; CHECK-i32-NEXT:    frintx s5, s19
+; CHECK-i32-NEXT:    fcvt s19, h22
+; CHECK-i32-NEXT:    mov h1, v1.h[3]
+; CHECK-i32-NEXT:    fcvtzs w1, s6
+; CHECK-i32-NEXT:    frintx s6, s7
+; CHECK-i32-NEXT:    mov h7, v2.h[1]
+; CHECK-i32-NEXT:    frintx s17, s17
+; CHECK-i32-NEXT:    frintx s20, s16
+; CHECK-i32-NEXT:    fmov s16, w12
+; CHECK-i32-NEXT:    fcvtzs w4, s0
+; CHECK-i32-NEXT:    frintx s0, s18
+; CHECK-i32-NEXT:    fcvtzs w5, s5
+; CHECK-i32-NEXT:    frintx s5, s19
+; CHECK-i32-NEXT:    frintx s18, s21
+; CHECK-i32-NEXT:    fcvt s19, h3
+; CHECK-i32-NEXT:    fcvtzs w12, s6
+; CHECK-i32-NEXT:    fcvt s6, h7
+; CHECK-i32-NEXT:    mov h7, v3.h[1]
+; CHECK-i32-NEXT:    fcvtzs w6, s17
+; CHECK-i32-NEXT:    fmov s17, w13
+; CHECK-i32-NEXT:    mov v16.s[1], w17
+; CHECK-i32-NEXT:    fcvtzs w17, s20
+; CHECK-i32-NEXT:    fcvtzs w7, s0
+; CHECK-i32-NEXT:    mov h0, v2.h[2]
+; CHECK-i32-NEXT:    mov h20, v3.h[2]
+; CHECK-i32-NEXT:    fcvtzs w13, s5
+; CHECK-i32-NEXT:    fmov s5, w15
+; CHECK-i32-NEXT:    frintx s6, s6
+; CHECK-i32-NEXT:    fcvt s7, h7
+; CHECK-i32-NEXT:    mov v17.s[1], w14
+; CHECK-i32-NEXT:    fcvtzs w14, s18
+; CHECK-i32-NEXT:    frintx s18, s19
+; CHECK-i32-NEXT:    mov h2, v2.h[3]
+; CHECK-i32-NEXT:    fcvt s0, h0
+; CHECK-i32-NEXT:    mov h3, v3.h[3]
+; CHECK-i32-NEXT:    mov v5.s[1], w0
+; CHECK-i32-NEXT:    fcvt s19, h20
+; CHECK-i32-NEXT:    fcvt s1, h1
+; CHECK-i32-NEXT:    mov v16.s[2], w9
+; CHECK-i32-NEXT:    fcvtzs w15, s6
+; CHECK-i32-NEXT:    frintx s6, s7
+; CHECK-i32-NEXT:    fmov s7, w2
+; CHECK-i32-NEXT:    fcvtzs w0, s18
+; CHECK-i32-NEXT:    fcvt s20, h2
+; CHECK-i32-NEXT:    fcvt s18, h4
+; CHECK-i32-NEXT:    frintx s21, s0
+; CHECK-i32-NEXT:    fcvt s3, h3
+; CHECK-i32-NEXT:    fmov s0, w4
+; CHECK-i32-NEXT:    frintx s19, s19
+; CHECK-i32-NEXT:    fmov s2, w6
+; CHECK-i32-NEXT:    fmov s4, w14
+; CHECK-i32-NEXT:    fcvtzs w2, s6
+; CHECK-i32-NEXT:    mov v7.s[1], w3
+; CHECK-i32-NEXT:    frintx s1, s1
+; CHECK-i32-NEXT:    fmov s6, w0
+; CHECK-i32-NEXT:    mov v0.s[1], w5
+; CHECK-i32-NEXT:    frintx s20, s20
+; CHECK-i32-NEXT:    mov v2.s[1], w7
+; CHECK-i32-NEXT:    fcvtzs w3, s21
+; CHECK-i32-NEXT:    mov v4.s[1], w15
+; CHECK-i32-NEXT:    fcvtzs w14, s19
+; CHECK-i32-NEXT:    frintx s18, s18
+; CHECK-i32-NEXT:    frintx s3, s3
+; CHECK-i32-NEXT:    mov v6.s[1], w2
+; CHECK-i32-NEXT:    mov v17.s[2], w11
+; CHECK-i32-NEXT:    fcvtzs w15, s1
+; CHECK-i32-NEXT:    fcvtzs w0, s20
+; CHECK-i32-NEXT:    mov v5.s[2], w18
+; CHECK-i32-NEXT:    mov v0.s[2], w12
+; CHECK-i32-NEXT:    mov v7.s[2], w1
+; CHECK-i32-NEXT:    mov v2.s[2], w13
+; CHECK-i32-NEXT:    mov v4.s[2], w3
+; CHECK-i32-NEXT:    fcvtzs w9, s18
+; CHECK-i32-NEXT:    fcvtzs w11, s3
+; CHECK-i32-NEXT:    mov v16.s[3], w8
+; CHECK-i32-NEXT:    mov v6.s[2], w14
+; CHECK-i32-NEXT:    mov v17.s[3], w10
+; CHECK-i32-NEXT:    mov v0.s[3], w17
+; CHECK-i32-NEXT:    mov v5.s[3], w16
+; CHECK-i32-NEXT:    mov v2.s[3], w15
+; CHECK-i32-NEXT:    mov v4.s[3], w0
+; CHECK-i32-NEXT:    mov v7.s[3], w9
+; CHECK-i32-NEXT:    mov v1.16b, v16.16b
+; CHECK-i32-NEXT:    mov v6.s[3], w11
+; CHECK-i32-NEXT:    mov v3.16b, v17.16b
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v32f16:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-i64-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-i64-NEXT:    ext v6.16b, v3.16b, v3.16b, #8
+; CHECK-i64-NEXT:    ext v7.16b, v0.16b, v0.16b, #8
+; CHECK-i64-NEXT:    mov h19, v0.h[1]
+; CHECK-i64-NEXT:    fcvt s21, h0
+; CHECK-i64-NEXT:    mov h23, v1.h[2]
+; CHECK-i64-NEXT:    fcvt s22, h1
+; CHECK-i64-NEXT:    fcvt s26, h2
+; CHECK-i64-NEXT:    mov h27, v2.h[1]
+; CHECK-i64-NEXT:    mov h28, v2.h[2]
+; CHECK-i64-NEXT:    mov h16, v4.h[2]
+; CHECK-i64-NEXT:    fcvt s17, h5
+; CHECK-i64-NEXT:    mov h18, v5.h[2]
+; CHECK-i64-NEXT:    mov h20, v6.h[2]
+; CHECK-i64-NEXT:    fcvt s24, h7
+; CHECK-i64-NEXT:    fcvt s25, h6
+; CHECK-i64-NEXT:    fcvt s19, h19
+; CHECK-i64-NEXT:    frintx s22, s22
+; CHECK-i64-NEXT:    fcvt s16, h16
+; CHECK-i64-NEXT:    frintx s17, s17
+; CHECK-i64-NEXT:    fcvt s18, h18
+; CHECK-i64-NEXT:    fcvt s20, h20
+; CHECK-i64-NEXT:    frintx s16, s16
+; CHECK-i64-NEXT:    fcvtzs x12, s17
+; CHECK-i64-NEXT:    frintx s17, s18
+; CHECK-i64-NEXT:    frintx s18, s21
+; CHECK-i64-NEXT:    fcvt s21, h23
+; CHECK-i64-NEXT:    frintx s23, s24
+; CHECK-i64-NEXT:    frintx s24, s25
+; CHECK-i64-NEXT:    frintx s25, s19
+; CHECK-i64-NEXT:    mov h19, v7.h[1]
+; CHECK-i64-NEXT:    fcvtzs x13, s16
+; CHECK-i64-NEXT:    frintx s16, s20
+; CHECK-i64-NEXT:    frintx s20, s26
+; CHECK-i64-NEXT:    fcvtzs x9, s23
+; CHECK-i64-NEXT:    mov h23, v3.h[2]
+; CHECK-i64-NEXT:    fcvt s26, h27
+; CHECK-i64-NEXT:    fcvtzs x15, s24
+; CHECK-i64-NEXT:    fcvtzs x10, s25
+; CHECK-i64-NEXT:    fcvt s24, h28
+; CHECK-i64-NEXT:    mov h25, v3.h[3]
+; CHECK-i64-NEXT:    fcvtzs x14, s17
+; CHECK-i64-NEXT:    frintx s21, s21
+; CHECK-i64-NEXT:    fmov d17, x12
+; CHECK-i64-NEXT:    fcvtzs x12, s16
+; CHECK-i64-NEXT:    fmov d16, x13
+; CHECK-i64-NEXT:    fcvtzs x13, s22
+; CHECK-i64-NEXT:    fcvt s22, h3
+; CHECK-i64-NEXT:    mov h3, v3.h[1]
+; CHECK-i64-NEXT:    mov h27, v0.h[2]
+; CHECK-i64-NEXT:    mov h28, v2.h[3]
+; CHECK-i64-NEXT:    fcvt s23, h23
+; CHECK-i64-NEXT:    frintx s26, s26
+; CHECK-i64-NEXT:    fcvtzs x16, s20
+; CHECK-i64-NEXT:    frintx s20, s24
+; CHECK-i64-NEXT:    fcvt s24, h25
+; CHECK-i64-NEXT:    fcvtzs x11, s18
+; CHECK-i64-NEXT:    fmov d18, x14
+; CHECK-i64-NEXT:    fcvtzs x14, s21
+; CHECK-i64-NEXT:    frintx s22, s22
+; CHECK-i64-NEXT:    fcvt s3, h3
+; CHECK-i64-NEXT:    fcvt s25, h27
+; CHECK-i64-NEXT:    fcvt s27, h28
+; CHECK-i64-NEXT:    frintx s23, s23
+; CHECK-i64-NEXT:    mov h21, v1.h[3]
+; CHECK-i64-NEXT:    fmov d2, x15
+; CHECK-i64-NEXT:    fcvtzs x15, s26
+; CHECK-i64-NEXT:    fmov d26, x13
+; CHECK-i64-NEXT:    mov h1, v1.h[1]
+; CHECK-i64-NEXT:    fcvtzs x13, s20
+; CHECK-i64-NEXT:    frintx s20, s24
+; CHECK-i64-NEXT:    fmov d24, x14
+; CHECK-i64-NEXT:    fcvtzs x14, s22
+; CHECK-i64-NEXT:    frintx s3, s3
+; CHECK-i64-NEXT:    fmov d22, x16
+; CHECK-i64-NEXT:    frintx s27, s27
+; CHECK-i64-NEXT:    fcvtzs x16, s23
+; CHECK-i64-NEXT:    fcvt s21, h21
+; CHECK-i64-NEXT:    frintx s25, s25
+; CHECK-i64-NEXT:    fcvt s1, h1
+; CHECK-i64-NEXT:    mov h0, v0.h[3]
+; CHECK-i64-NEXT:    mov h23, v7.h[2]
+; CHECK-i64-NEXT:    mov v22.d[1], x15
+; CHECK-i64-NEXT:    fcvtzs x15, s20
+; CHECK-i64-NEXT:    fmov d20, x13
+; CHECK-i64-NEXT:    fcvtzs x13, s3
+; CHECK-i64-NEXT:    fmov d3, x14
+; CHECK-i64-NEXT:    fcvtzs x14, s27
+; CHECK-i64-NEXT:    fmov d27, x16
+; CHECK-i64-NEXT:    frintx s21, s21
+; CHECK-i64-NEXT:    mov h7, v7.h[3]
+; CHECK-i64-NEXT:    frintx s1, s1
+; CHECK-i64-NEXT:    fcvt s0, h0
+; CHECK-i64-NEXT:    fcvt s23, h23
+; CHECK-i64-NEXT:    fcvt s19, h19
+; CHECK-i64-NEXT:    mov v27.d[1], x15
+; CHECK-i64-NEXT:    fcvtzs x15, s25
+; CHECK-i64-NEXT:    mov h25, v6.h[3]
+; CHECK-i64-NEXT:    mov h6, v6.h[1]
+; CHECK-i64-NEXT:    mov v3.d[1], x13
+; CHECK-i64-NEXT:    fcvtzs x13, s21
+; CHECK-i64-NEXT:    mov h21, v5.h[1]
+; CHECK-i64-NEXT:    mov h5, v5.h[3]
+; CHECK-i64-NEXT:    mov v20.d[1], x14
+; CHECK-i64-NEXT:    fcvtzs x14, s1
+; CHECK-i64-NEXT:    mov h1, v4.h[1]
+; CHECK-i64-NEXT:    frintx s0, s0
+; CHECK-i64-NEXT:    fcvt s25, h25
+; CHECK-i64-NEXT:    fcvt s7, h7
+; CHECK-i64-NEXT:    stp q3, q27, [x8, #192]
+; CHECK-i64-NEXT:    fcvt s6, h6
+; CHECK-i64-NEXT:    mov h3, v4.h[3]
+; CHECK-i64-NEXT:    stp q22, q20, [x8, #128]
+; CHECK-i64-NEXT:    fcvt s21, h21
+; CHECK-i64-NEXT:    fcvt s5, h5
+; CHECK-i64-NEXT:    mov v24.d[1], x13
+; CHECK-i64-NEXT:    mov v26.d[1], x14
+; CHECK-i64-NEXT:    fcvt s4, h4
+; CHECK-i64-NEXT:    frintx s22, s25
+; CHECK-i64-NEXT:    fmov d20, x12
+; CHECK-i64-NEXT:    fcvt s1, h1
+; CHECK-i64-NEXT:    frintx s6, s6
+; CHECK-i64-NEXT:    fcvt s3, h3
+; CHECK-i64-NEXT:    fcvtzs x12, s0
+; CHECK-i64-NEXT:    frintx s5, s5
+; CHECK-i64-NEXT:    frintx s21, s21
+; CHECK-i64-NEXT:    fmov d0, x11
+; CHECK-i64-NEXT:    stp q26, q24, [x8, #64]
+; CHECK-i64-NEXT:    fmov d24, x15
+; CHECK-i64-NEXT:    frintx s4, s4
+; CHECK-i64-NEXT:    fcvtzs x11, s22
+; CHECK-i64-NEXT:    frintx s22, s23
+; CHECK-i64-NEXT:    frintx s1, s1
+; CHECK-i64-NEXT:    fcvtzs x13, s6
+; CHECK-i64-NEXT:    frintx s3, s3
+; CHECK-i64-NEXT:    frintx s6, s7
+; CHECK-i64-NEXT:    fcvtzs x14, s5
+; CHECK-i64-NEXT:    mov v24.d[1], x12
+; CHECK-i64-NEXT:    frintx s5, s19
+; CHECK-i64-NEXT:    fcvtzs x12, s21
+; CHECK-i64-NEXT:    mov v0.d[1], x10
+; CHECK-i64-NEXT:    fcvtzs x10, s4
+; CHECK-i64-NEXT:    mov v20.d[1], x11
+; CHECK-i64-NEXT:    fcvtzs x11, s22
+; CHECK-i64-NEXT:    mov v2.d[1], x13
+; CHECK-i64-NEXT:    fcvtzs x15, s3
+; CHECK-i64-NEXT:    fcvtzs x13, s1
+; CHECK-i64-NEXT:    mov v18.d[1], x14
+; CHECK-i64-NEXT:    fcvtzs x14, s6
+; CHECK-i64-NEXT:    stp q0, q24, [x8]
+; CHECK-i64-NEXT:    mov v17.d[1], x12
+; CHECK-i64-NEXT:    fcvtzs x12, s5
+; CHECK-i64-NEXT:    fmov d0, x10
+; CHECK-i64-NEXT:    fmov d1, x11
+; CHECK-i64-NEXT:    stp q2, q20, [x8, #224]
+; CHECK-i64-NEXT:    fmov d2, x9
+; CHECK-i64-NEXT:    mov v16.d[1], x15
+; CHECK-i64-NEXT:    stp q17, q18, [x8, #160]
+; CHECK-i64-NEXT:    mov v0.d[1], x13
+; CHECK-i64-NEXT:    mov v1.d[1], x14
+; CHECK-i64-NEXT:    mov v2.d[1], x12
+; CHECK-i64-NEXT:    stp q0, q16, [x8, #96]
+; CHECK-i64-NEXT:    stp q2, q1, [x8, #32]
+; CHECK-i64-NEXT:    ret
+  %a = call <32 x iXLen> @llvm.lrint.v32iXLen.v32f16(<32 x half> %x)
+  ret <32 x iXLen> %a
 }
-declare <32 x i64> @llvm.lrint.v32i64.v32f16(<32 x half>)
+declare <32 x iXLen> @llvm.lrint.v32iXLen.v32f16(<32 x half>)
 
-define <1 x i64> @lrint_v1f32(<1 x float> %x) {
-; CHECK-SD-LABEL: lrint_v1f32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    frintx s0, s0
-; CHECK-SD-NEXT:    fcvtzs x8, s0
-; CHECK-SD-NEXT:    fmov d0, x8
-; CHECK-SD-NEXT:    ret
+define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
+; CHECK-i32-LABEL: lrint_v1f32:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v0.2s, v0.2s
+; CHECK-i32-NEXT:    fcvtzs v0.2s, v0.2s
+; CHECK-i32-NEXT:    ret
 ;
-; CHECK-GI-LABEL: lrint_v1f32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    frintx s0, s0
-; CHECK-GI-NEXT:    fcvtzs x8, s0
-; CHECK-GI-NEXT:    fmov d0, x8
-; CHECK-GI-NEXT:    ret
-  %a = call <1 x i64> @llvm.lrint.v1i64.v1f32(<1 x float> %x)
-  ret <1 x i64> %a
+; CHECK-i64-SD-LABEL: lrint_v1f32:
+; CHECK-i64-SD:       // %bb.0:
+; CHECK-i64-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-i64-SD-NEXT:    frintx s0, s0
+; CHECK-i64-SD-NEXT:    fcvtzs x8, s0
+; CHECK-i64-SD-NEXT:    fmov d0, x8
+; CHECK-i64-SD-NEXT:    ret
+;
+; CHECK-i64-GI-LABEL: lrint_v1f32:
+; CHECK-i64-GI:       // %bb.0:
+; CHECK-i64-GI-NEXT:    frintx s0, s0
+; CHECK-i64-GI-NEXT:    fcvtzs x8, s0
+; CHECK-i64-GI-NEXT:    fmov d0, x8
+; CHECK-i64-GI-NEXT:    ret
+  %a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float> %x)
+  ret <1 x iXLen> %a
 }
-declare <1 x i64> @llvm.lrint.v1i64.v1f32(<1 x float>)
+declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float>)
 
-define <2 x i64> @lrint_v2f32(<2 x float> %x) {
-; CHECK-LABEL: lrint_v2f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov s1, v0.s[1]
-; CHECK-NEXT:    frintx s0, s0
-; CHECK-NEXT:    frintx s1, s1
-; CHECK-NEXT:    fcvtzs x8, s0
-; CHECK-NEXT:    fcvtzs x9, s1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x9
-; CHECK-NEXT:    ret
-  %a = call <2 x i64> @llvm.lrint.v2i64.v2f32(<2 x float> %x)
-  ret <2 x i64> %a
+define <2 x iXLen> @lrint_v2f32(<2 x float> %x) {
+; CHECK-i32-LABEL: lrint_v2f32:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v0.2s, v0.2s
+; CHECK-i32-NEXT:    fcvtzs v0.2s, v0.2s
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v2f32:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    frintx v0.2s, v0.2s
+; CHECK-i64-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-i64-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-i64-NEXT:    ret
+  %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float> %x)
+  ret <2 x iXLen> %a
 }
-declare <2 x i64> @llvm.lrint.v2i64.v2f32(<2 x float>)
+declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float>)
 
-define <4 x i64> @lrint_v4f32(<4 x float> %x) {
-; CHECK-LABEL: lrint_v4f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    mov s3, v0.s[1]
-; CHECK-NEXT:    frintx s0, s0
-; CHECK-NEXT:    mov s2, v1.s[1]
-; CHECK-NEXT:    frintx s1, s1
-; CHECK-NEXT:    frintx s3, s3
-; CHECK-NEXT:    fcvtzs x9, s0
-; CHECK-NEXT:    frintx s2, s2
-; CHECK-NEXT:    fcvtzs x8, s1
-; CHECK-NEXT:    fcvtzs x11, s3
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    fcvtzs x10, s2
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    mov v0.d[1], x11
-; CHECK-NEXT:    mov v1.d[1], x10
-; CHECK-NEXT:    ret
-  %a = call <4 x i64> @llvm.lrint.v4i64.v4f32(<4 x float> %x)
-  ret <4 x i64> %a
+define <4 x iXLen> @lrint_v4f32(<4 x float> %x) {
+; CHECK-i32-LABEL: lrint_v4f32:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v0.4s, v0.4s
+; CHECK-i32-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v4f32:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-i64-NEXT:    frintx v0.2s, v0.2s
+; CHECK-i64-NEXT:    frintx v1.2s, v1.2s
+; CHECK-i64-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-i64-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-i64-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-i64-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-i64-NEXT:    ret
+  %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float> %x)
+  ret <4 x iXLen> %a
 }
-declare <4 x i64> @llvm.lrint.v4i64.v4f32(<4 x float>)
+declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float>)
 
-define <8 x i64> @lrint_v8f32(<8 x float> %x) {
-; CHECK-LABEL: lrint_v8f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    mov s4, v0.s[1]
-; CHECK-NEXT:    mov s7, v1.s[1]
-; CHECK-NEXT:    frintx s0, s0
-; CHECK-NEXT:    frintx s1, s1
-; CHECK-NEXT:    mov s5, v2.s[1]
-; CHECK-NEXT:    mov s6, v3.s[1]
-; CHECK-NEXT:    frintx s2, s2
-; CHECK-NEXT:    frintx s3, s3
-; CHECK-NEXT:    frintx s4, s4
-; CHECK-NEXT:    frintx s7, s7
-; CHECK-NEXT:    fcvtzs x9, s0
-; CHECK-NEXT:    fcvtzs x12, s1
-; CHECK-NEXT:    frintx s5, s5
-; CHECK-NEXT:    frintx s6, s6
-; CHECK-NEXT:    fcvtzs x8, s2
-; CHECK-NEXT:    fcvtzs x10, s3
-; CHECK-NEXT:    fcvtzs x11, s4
-; CHECK-NEXT:    fcvtzs x15, s7
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    fmov d2, x12
-; CHECK-NEXT:    fcvtzs x13, s5
-; CHECK-NEXT:    fcvtzs x14, s6
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    fmov d3, x10
-; CHECK-NEXT:    mov v0.d[1], x11
-; CHECK-NEXT:    mov v2.d[1], x15
-; CHECK-NEXT:    mov v1.d[1], x13
-; CHECK-NEXT:    mov v3.d[1], x14
-; CHECK-NEXT:    ret
-  %a = call <8 x i64> @llvm.lrint.v8i64.v8f32(<8 x float> %x)
-  ret <8 x i64> %a
+define <8 x iXLen> @lrint_v8f32(<8 x float> %x) {
+; CHECK-i32-LABEL: lrint_v8f32:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v0.4s, v0.4s
+; CHECK-i32-NEXT:    frintx v1.4s, v1.4s
+; CHECK-i32-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-i32-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v8f32:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-i64-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-i64-NEXT:    frintx v0.2s, v0.2s
+; CHECK-i64-NEXT:    frintx v1.2s, v1.2s
+; CHECK-i64-NEXT:    frintx v2.2s, v2.2s
+; CHECK-i64-NEXT:    frintx v3.2s, v3.2s
+; CHECK-i64-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-i64-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-i64-NEXT:    fcvtl v4.2d, v2.2s
+; CHECK-i64-NEXT:    fcvtl v3.2d, v3.2s
+; CHECK-i64-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-i64-NEXT:    fcvtzs v2.2d, v1.2d
+; CHECK-i64-NEXT:    fcvtzs v1.2d, v4.2d
+; CHECK-i64-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-i64-NEXT:    ret
+  %a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f32(<8 x float> %x)
+  ret <8 x iXLen> %a
 }
-declare <8 x i64> @llvm.lrint.v8i64.v8f32(<8 x float>)
+declare <8 x iXLen> @llvm.lrint.v8iXLen.v8f32(<8 x float>)
 
-define <16 x i64> @lrint_v16i64_v16f32(<16 x float> %x) {
-; CHECK-LABEL: lrint_v16i64_v16f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
-; CHECK-NEXT:    frintx s7, s0
-; CHECK-NEXT:    ext v16.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT:    mov s0, v0.s[1]
-; CHECK-NEXT:    frintx s17, s4
-; CHECK-NEXT:    mov s4, v4.s[1]
-; CHECK-NEXT:    mov s18, v5.s[1]
-; CHECK-NEXT:    frintx s5, s5
-; CHECK-NEXT:    frintx s19, s6
-; CHECK-NEXT:    fcvtzs x8, s7
-; CHECK-NEXT:    frintx s7, s16
-; CHECK-NEXT:    mov s6, v6.s[1]
-; CHECK-NEXT:    mov s16, v16.s[1]
-; CHECK-NEXT:    frintx s0, s0
-; CHECK-NEXT:    frintx s4, s4
-; CHECK-NEXT:    fcvtzs x9, s17
-; CHECK-NEXT:    frintx s17, s1
-; CHECK-NEXT:    mov s1, v1.s[1]
-; CHECK-NEXT:    frintx s18, s18
-; CHECK-NEXT:    fcvtzs x10, s5
-; CHECK-NEXT:    mov s5, v2.s[1]
-; CHECK-NEXT:    fcvtzs x11, s19
-; CHECK-NEXT:    mov s19, v3.s[1]
-; CHECK-NEXT:    frintx s2, s2
-; CHECK-NEXT:    fcvtzs x12, s7
-; CHECK-NEXT:    frintx s6, s6
-; CHECK-NEXT:    fcvtzs x13, s4
-; CHECK-NEXT:    frintx s4, s3
-; CHECK-NEXT:    frintx s16, s16
-; CHECK-NEXT:    fcvtzs x14, s18
-; CHECK-NEXT:    frintx s18, s1
-; CHECK-NEXT:    fcvtzs x15, s17
-; CHECK-NEXT:    frintx s20, s5
-; CHECK-NEXT:    frintx s17, s19
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    fcvtzs x9, s2
-; CHECK-NEXT:    fmov d5, x11
-; CHECK-NEXT:    fmov d3, x10
-; CHECK-NEXT:    fcvtzs x11, s4
-; CHECK-NEXT:    fcvtzs x10, s0
-; CHECK-NEXT:    fmov d7, x12
-; CHECK-NEXT:    fcvtzs x12, s18
-; CHECK-NEXT:    fcvtzs x17, s6
-; CHECK-NEXT:    fcvtzs x18, s16
-; CHECK-NEXT:    fcvtzs x16, s20
-; CHECK-NEXT:    fcvtzs x0, s17
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fmov d2, x15
-; CHECK-NEXT:    fmov d4, x9
-; CHECK-NEXT:    mov v1.d[1], x13
-; CHECK-NEXT:    fmov d6, x11
-; CHECK-NEXT:    mov v3.d[1], x14
-; CHECK-NEXT:    mov v0.d[1], x10
-; CHECK-NEXT:    mov v5.d[1], x17
-; CHECK-NEXT:    mov v7.d[1], x18
-; CHECK-NEXT:    mov v2.d[1], x12
-; CHECK-NEXT:    mov v4.d[1], x16
-; CHECK-NEXT:    mov v6.d[1], x0
-; CHECK-NEXT:    ret
-  %a = call <16 x i64> @llvm.lrint.v16i64.v16f32(<16 x float> %x)
-  ret <16 x i64> %a
+define <16 x iXLen> @lrint_v16f32(<16 x float> %x) {
+; CHECK-i32-LABEL: lrint_v16f32:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v0.4s, v0.4s
+; CHECK-i32-NEXT:    frintx v1.4s, v1.4s
+; CHECK-i32-NEXT:    frintx v2.4s, v2.4s
+; CHECK-i32-NEXT:    frintx v3.4s, v3.4s
+; CHECK-i32-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-i32-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-i32-NEXT:    fcvtzs v2.4s, v2.4s
+; CHECK-i32-NEXT:    fcvtzs v3.4s, v3.4s
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v16f32:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-i64-NEXT:    ext v5.16b, v0.16b, v0.16b, #8
+; CHECK-i64-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
+; CHECK-i64-NEXT:    ext v7.16b, v3.16b, v3.16b, #8
+; CHECK-i64-NEXT:    frintx v0.2s, v0.2s
+; CHECK-i64-NEXT:    frintx v1.2s, v1.2s
+; CHECK-i64-NEXT:    frintx v2.2s, v2.2s
+; CHECK-i64-NEXT:    frintx v3.2s, v3.2s
+; CHECK-i64-NEXT:    frintx v5.2s, v5.2s
+; CHECK-i64-NEXT:    frintx v4.2s, v4.2s
+; CHECK-i64-NEXT:    frintx v6.2s, v6.2s
+; CHECK-i64-NEXT:    frintx v7.2s, v7.2s
+; CHECK-i64-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-i64-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-i64-NEXT:    fcvtl v16.2d, v2.2s
+; CHECK-i64-NEXT:    fcvtl v18.2d, v3.2s
+; CHECK-i64-NEXT:    fcvtl v5.2d, v5.2s
+; CHECK-i64-NEXT:    fcvtl v17.2d, v4.2s
+; CHECK-i64-NEXT:    fcvtl v19.2d, v6.2s
+; CHECK-i64-NEXT:    fcvtl v7.2d, v7.2s
+; CHECK-i64-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-i64-NEXT:    fcvtzs v2.2d, v1.2d
+; CHECK-i64-NEXT:    fcvtzs v4.2d, v16.2d
+; CHECK-i64-NEXT:    fcvtzs v6.2d, v18.2d
+; CHECK-i64-NEXT:    fcvtzs v1.2d, v5.2d
+; CHECK-i64-NEXT:    fcvtzs v3.2d, v17.2d
+; CHECK-i64-NEXT:    fcvtzs v5.2d, v19.2d
+; CHECK-i64-NEXT:    fcvtzs v7.2d, v7.2d
+; CHECK-i64-NEXT:    ret
+  %a = call <16 x iXLen> @llvm.lrint.v16iXLen.v16f32(<16 x float> %x)
+  ret <16 x iXLen> %a
 }
-declare <16 x i64> @llvm.lrint.v16i64.v16f32(<16 x float>)
+declare <16 x iXLen> @llvm.lrint.v16iXLen.v16f32(<16 x float>)
 
-define <1 x i64> @lrint_v1f64(<1 x double> %x) {
-; CHECK-LABEL: lrint_v1f64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    frintx d0, d0
-; CHECK-NEXT:    fcvtzs x8, d0
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    ret
-  %a = call <1 x i64> @llvm.lrint.v1i64.v1f64(<1 x double> %x)
-  ret <1 x i64> %a
+define <32 x iXLen> @lrint_v32f32(<32 x float> %x) {
+; CHECK-i32-LABEL: lrint_v32f32:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v0.4s, v0.4s
+; CHECK-i32-NEXT:    frintx v1.4s, v1.4s
+; CHECK-i32-NEXT:    frintx v2.4s, v2.4s
+; CHECK-i32-NEXT:    frintx v3.4s, v3.4s
+; CHECK-i32-NEXT:    frintx v4.4s, v4.4s
+; CHECK-i32-NEXT:    frintx v5.4s, v5.4s
+; CHECK-i32-NEXT:    frintx v6.4s, v6.4s
+; CHECK-i32-NEXT:    frintx v7.4s, v7.4s
+; CHECK-i32-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-i32-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-i32-NEXT:    fcvtzs v2.4s, v2.4s
+; CHECK-i32-NEXT:    fcvtzs v3.4s, v3.4s
+; CHECK-i32-NEXT:    fcvtzs v4.4s, v4.4s
+; CHECK-i32-NEXT:    fcvtzs v5.4s, v5.4s
+; CHECK-i32-NEXT:    fcvtzs v6.4s, v6.4s
+; CHECK-i32-NEXT:    fcvtzs v7.4s, v7.4s
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v32f32:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    ext v16.16b, v7.16b, v7.16b, #8
+; CHECK-i64-NEXT:    ext v17.16b, v6.16b, v6.16b, #8
+; CHECK-i64-NEXT:    frintx v7.2s, v7.2s
+; CHECK-i64-NEXT:    frintx v6.2s, v6.2s
+; CHECK-i64-NEXT:    ext v18.16b, v5.16b, v5.16b, #8
+; CHECK-i64-NEXT:    ext v21.16b, v4.16b, v4.16b, #8
+; CHECK-i64-NEXT:    ext v22.16b, v2.16b, v2.16b, #8
+; CHECK-i64-NEXT:    frintx v5.2s, v5.2s
+; CHECK-i64-NEXT:    ext v23.16b, v3.16b, v3.16b, #8
+; CHECK-i64-NEXT:    frintx v4.2s, v4.2s
+; CHECK-i64-NEXT:    ext v19.16b, v0.16b, v0.16b, #8
+; CHECK-i64-NEXT:    ext v20.16b, v1.16b, v1.16b, #8
+; CHECK-i64-NEXT:    frintx v16.2s, v16.2s
+; CHECK-i64-NEXT:    frintx v17.2s, v17.2s
+; CHECK-i64-NEXT:    fcvtl v7.2d, v7.2s
+; CHECK-i64-NEXT:    fcvtl v6.2d, v6.2s
+; CHECK-i64-NEXT:    frintx v18.2s, v18.2s
+; CHECK-i64-NEXT:    frintx v21.2s, v21.2s
+; CHECK-i64-NEXT:    frintx v2.2s, v2.2s
+; CHECK-i64-NEXT:    frintx v3.2s, v3.2s
+; CHECK-i64-NEXT:    fcvtl v5.2d, v5.2s
+; CHECK-i64-NEXT:    frintx v23.2s, v23.2s
+; CHECK-i64-NEXT:    fcvtl v4.2d, v4.2s
+; CHECK-i64-NEXT:    frintx v1.2s, v1.2s
+; CHECK-i64-NEXT:    fcvtl v16.2d, v16.2s
+; CHECK-i64-NEXT:    fcvtl v17.2d, v17.2s
+; CHECK-i64-NEXT:    fcvtzs v7.2d, v7.2d
+; CHECK-i64-NEXT:    fcvtzs v6.2d, v6.2d
+; CHECK-i64-NEXT:    fcvtl v18.2d, v18.2s
+; CHECK-i64-NEXT:    fcvtl v21.2d, v21.2s
+; CHECK-i64-NEXT:    frintx v20.2s, v20.2s
+; CHECK-i64-NEXT:    fcvtl v3.2d, v3.2s
+; CHECK-i64-NEXT:    fcvtzs v5.2d, v5.2d
+; CHECK-i64-NEXT:    frintx v0.2s, v0.2s
+; CHECK-i64-NEXT:    fcvtl v2.2d, v2.2s
+; CHECK-i64-NEXT:    fcvtzs v4.2d, v4.2d
+; CHECK-i64-NEXT:    fcvtzs v16.2d, v16.2d
+; CHECK-i64-NEXT:    fcvtzs v17.2d, v17.2d
+; CHECK-i64-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-i64-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-i64-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-i64-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-i64-NEXT:    stp q6, q17, [x8, #192]
+; CHECK-i64-NEXT:    fcvtl v6.2d, v23.2s
+; CHECK-i64-NEXT:    frintx v17.2s, v19.2s
+; CHECK-i64-NEXT:    stp q7, q16, [x8, #224]
+; CHECK-i64-NEXT:    frintx v7.2s, v22.2s
+; CHECK-i64-NEXT:    fcvtzs v16.2d, v18.2d
+; CHECK-i64-NEXT:    fcvtzs v18.2d, v21.2d
+; CHECK-i64-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-i64-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-i64-NEXT:    fcvtzs v6.2d, v6.2d
+; CHECK-i64-NEXT:    stp q5, q16, [x8, #160]
+; CHECK-i64-NEXT:    fcvtl v7.2d, v7.2s
+; CHECK-i64-NEXT:    fcvtl v5.2d, v20.2s
+; CHECK-i64-NEXT:    stp q4, q18, [x8, #128]
+; CHECK-i64-NEXT:    fcvtl v4.2d, v17.2s
+; CHECK-i64-NEXT:    stp q3, q6, [x8, #96]
+; CHECK-i64-NEXT:    fcvtzs v7.2d, v7.2d
+; CHECK-i64-NEXT:    fcvtzs v3.2d, v5.2d
+; CHECK-i64-NEXT:    stp q1, q3, [x8, #32]
+; CHECK-i64-NEXT:    stp q2, q7, [x8, #64]
+; CHECK-i64-NEXT:    fcvtzs v2.2d, v4.2d
+; CHECK-i64-NEXT:    stp q0, q2, [x8]
+; CHECK-i64-NEXT:    ret
+  %a = call <32 x iXLen> @llvm.lrint.v32iXLen.v32f32(<32 x float> %x)
+  ret <32 x iXLen> %a
 }
-declare <1 x i64> @llvm.lrint.v1i64.v1f64(<1 x double>)
+declare <32 x iXLen> @llvm.lrint.v32iXLen.v32f32(<32 x float>)
 
-define <2 x i64> @lrint_v2f64(<2 x double> %x) {
-; CHECK-LABEL: lrint_v2f64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d1, v0.d[1]
-; CHECK-NEXT:    frintx d0, d0
-; CHECK-NEXT:    frintx d1, d1
-; CHECK-NEXT:    fcvtzs x8, d0
-; CHECK-NEXT:    fcvtzs x9, d1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], x9
-; CHECK-NEXT:    ret
-  %a = call <2 x i64> @llvm.lrint.v2i64.v2f64(<2 x double> %x)
-  ret <2 x i64> %a
+define <1 x iXLen> @lrint_v1f64(<1 x double> %x) {
+; CHECK-i32-LABEL: lrint_v1f64:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx d0, d0
+; CHECK-i32-NEXT:    fcvtzs w8, d0
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v1f64:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    frintx d0, d0
+; CHECK-i64-NEXT:    fcvtzs x8, d0
+; CHECK-i64-NEXT:    fmov d0, x8
+; CHECK-i64-NEXT:    ret
+  %a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f64(<1 x double> %x)
+  ret <1 x iXLen> %a
+}
+declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f64(<1 x double>)
+
+define <2 x iXLen> @lrint_v2f64(<2 x double> %x) {
+; CHECK-i32-LABEL: lrint_v2f64:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v0.2d, v0.2d
+; CHECK-i32-NEXT:    mov d1, v0.d[1]
+; CHECK-i32-NEXT:    fcvtzs w8, d0
+; CHECK-i32-NEXT:    fcvtzs w9, d1
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    mov v0.s[1], w9
+; CHECK-i32-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v2f64:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    frintx v0.2d, v0.2d
+; CHECK-i64-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-i64-NEXT:    ret
+  %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f64(<2 x double> %x)
+  ret <2 x iXLen> %a
+}
+declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f64(<2 x double>)
+
+define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
+; CHECK-i32-LABEL: lrint_v4f64:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v0.2d, v0.2d
+; CHECK-i32-NEXT:    frintx v1.2d, v1.2d
+; CHECK-i32-NEXT:    mov d2, v0.d[1]
+; CHECK-i32-NEXT:    fcvtzs w8, d0
+; CHECK-i32-NEXT:    fcvtzs w9, d2
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    fcvtzs w8, d1
+; CHECK-i32-NEXT:    mov d1, v1.d[1]
+; CHECK-i32-NEXT:    mov v0.s[1], w9
+; CHECK-i32-NEXT:    mov v0.s[2], w8
+; CHECK-i32-NEXT:    fcvtzs w8, d1
+; CHECK-i32-NEXT:    mov v0.s[3], w8
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v4f64:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    frintx v0.2d, v0.2d
+; CHECK-i64-NEXT:    frintx v1.2d, v1.2d
+; CHECK-i64-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-i64-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-i64-NEXT:    ret
+  %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f64(<4 x double> %x)
+  ret <4 x iXLen> %a
+}
+declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f64(<4 x double>)
+
+define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
+; CHECK-i32-LABEL: lrint_v8f64:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v2.2d, v2.2d
+; CHECK-i32-NEXT:    frintx v0.2d, v0.2d
+; CHECK-i32-NEXT:    frintx v3.2d, v3.2d
+; CHECK-i32-NEXT:    mov d4, v0.d[1]
+; CHECK-i32-NEXT:    mov d5, v2.d[1]
+; CHECK-i32-NEXT:    fcvtzs w8, d0
+; CHECK-i32-NEXT:    fcvtzs w9, d2
+; CHECK-i32-NEXT:    frintx v2.2d, v1.2d
+; CHECK-i32-NEXT:    fcvtzs w10, d4
+; CHECK-i32-NEXT:    fcvtzs w11, d5
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    fmov s1, w9
+; CHECK-i32-NEXT:    fcvtzs w8, d2
+; CHECK-i32-NEXT:    mov d2, v2.d[1]
+; CHECK-i32-NEXT:    fcvtzs w9, d3
+; CHECK-i32-NEXT:    mov d3, v3.d[1]
+; CHECK-i32-NEXT:    mov v0.s[1], w10
+; CHECK-i32-NEXT:    mov v1.s[1], w11
+; CHECK-i32-NEXT:    mov v0.s[2], w8
+; CHECK-i32-NEXT:    fcvtzs w8, d2
+; CHECK-i32-NEXT:    mov v1.s[2], w9
+; CHECK-i32-NEXT:    fcvtzs w9, d3
+; CHECK-i32-NEXT:    mov v0.s[3], w8
+; CHECK-i32-NEXT:    mov v1.s[3], w9
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v8f64:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    frintx v0.2d, v0.2d
+; CHECK-i64-NEXT:    frintx v1.2d, v1.2d
+; CHECK-i64-NEXT:    frintx v2.2d, v2.2d
+; CHECK-i64-NEXT:    frintx v3.2d, v3.2d
+; CHECK-i64-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-i64-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-i64-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-i64-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-i64-NEXT:    ret
+  %a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f64(<8 x double> %x)
+  ret <8 x iXLen> %a
 }
-declare <2 x i64> @llvm.lrint.v2i64.v2f64(<2 x double>)
+declare <8 x iXLen> @llvm.lrint.v8iXLen.v8f64(<8 x double>)
 
-define <4 x i64> @lrint_v4f64(<4 x double> %x) {
-; CHECK-LABEL: lrint_v4f64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d2, v0.d[1]
-; CHECK-NEXT:    mov d3, v1.d[1]
-; CHECK-NEXT:    frintx d0, d0
-; CHECK-NEXT:    frintx d1, d1
-; CHECK-NEXT:    frintx d2, d2
-; CHECK-NEXT:    frintx d3, d3
-; CHECK-NEXT:    fcvtzs x8, d0
-; CHECK-NEXT:    fcvtzs x9, d1
-; CHECK-NEXT:    fcvtzs x10, d2
-; CHECK-NEXT:    fcvtzs x11, d3
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    mov v0.d[1], x10
-; CHECK-NEXT:    mov v1.d[1], x11
-; CHECK-NEXT:    ret
-  %a = call <4 x i64> @llvm.lrint.v4i64.v4f64(<4 x double> %x)
-  ret <4 x i64> %a
+define <16 x iXLen> @lrint_v16f64(<16 x double> %x) {
+; CHECK-i32-LABEL: lrint_v16f64:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v0.2d, v0.2d
+; CHECK-i32-NEXT:    frintx v2.2d, v2.2d
+; CHECK-i32-NEXT:    frintx v4.2d, v4.2d
+; CHECK-i32-NEXT:    frintx v6.2d, v6.2d
+; CHECK-i32-NEXT:    frintx v17.2d, v1.2d
+; CHECK-i32-NEXT:    frintx v5.2d, v5.2d
+; CHECK-i32-NEXT:    fcvtzs w8, d0
+; CHECK-i32-NEXT:    mov d16, v0.d[1]
+; CHECK-i32-NEXT:    fcvtzs w9, d2
+; CHECK-i32-NEXT:    mov d2, v2.d[1]
+; CHECK-i32-NEXT:    fcvtzs w10, d4
+; CHECK-i32-NEXT:    mov d4, v4.d[1]
+; CHECK-i32-NEXT:    fcvtzs w11, d6
+; CHECK-i32-NEXT:    mov d6, v6.d[1]
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    fcvtzs w8, d16
+; CHECK-i32-NEXT:    frintx v16.2d, v3.2d
+; CHECK-i32-NEXT:    fmov s1, w9
+; CHECK-i32-NEXT:    fcvtzs w9, d2
+; CHECK-i32-NEXT:    fmov s2, w10
+; CHECK-i32-NEXT:    fcvtzs w10, d4
+; CHECK-i32-NEXT:    frintx v4.2d, v7.2d
+; CHECK-i32-NEXT:    fmov s3, w11
+; CHECK-i32-NEXT:    fcvtzs w11, d6
+; CHECK-i32-NEXT:    mov d6, v17.d[1]
+; CHECK-i32-NEXT:    mov v0.s[1], w8
+; CHECK-i32-NEXT:    fcvtzs w8, d17
+; CHECK-i32-NEXT:    mov d7, v16.d[1]
+; CHECK-i32-NEXT:    mov v1.s[1], w9
+; CHECK-i32-NEXT:    fcvtzs w9, d16
+; CHECK-i32-NEXT:    mov v2.s[1], w10
+; CHECK-i32-NEXT:    fcvtzs w10, d5
+; CHECK-i32-NEXT:    mov d5, v5.d[1]
+; CHECK-i32-NEXT:    mov v3.s[1], w11
+; CHECK-i32-NEXT:    fcvtzs w11, d4
+; CHECK-i32-NEXT:    mov d4, v4.d[1]
+; CHECK-i32-NEXT:    mov v0.s[2], w8
+; CHECK-i32-NEXT:    fcvtzs w8, d6
+; CHECK-i32-NEXT:    mov v1.s[2], w9
+; CHECK-i32-NEXT:    fcvtzs w9, d7
+; CHECK-i32-NEXT:    mov v2.s[2], w10
+; CHECK-i32-NEXT:    fcvtzs w10, d5
+; CHECK-i32-NEXT:    mov v3.s[2], w11
+; CHECK-i32-NEXT:    fcvtzs w11, d4
+; CHECK-i32-NEXT:    mov v0.s[3], w8
+; CHECK-i32-NEXT:    mov v1.s[3], w9
+; CHECK-i32-NEXT:    mov v2.s[3], w10
+; CHECK-i32-NEXT:    mov v3.s[3], w11
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v16f64:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    frintx v0.2d, v0.2d
+; CHECK-i64-NEXT:    frintx v1.2d, v1.2d
+; CHECK-i64-NEXT:    frintx v2.2d, v2.2d
+; CHECK-i64-NEXT:    frintx v3.2d, v3.2d
+; CHECK-i64-NEXT:    frintx v4.2d, v4.2d
+; CHECK-i64-NEXT:    frintx v5.2d, v5.2d
+; CHECK-i64-NEXT:    frintx v6.2d, v6.2d
+; CHECK-i64-NEXT:    frintx v7.2d, v7.2d
+; CHECK-i64-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-i64-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-i64-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-i64-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-i64-NEXT:    fcvtzs v4.2d, v4.2d
+; CHECK-i64-NEXT:    fcvtzs v5.2d, v5.2d
+; CHECK-i64-NEXT:    fcvtzs v6.2d, v6.2d
+; CHECK-i64-NEXT:    fcvtzs v7.2d, v7.2d
+; CHECK-i64-NEXT:    ret
+  %a = call <16 x iXLen> @llvm.lrint.v16iXLen.v16f64(<16 x double> %x)
+  ret <16 x iXLen> %a
 }
-declare <4 x i64> @llvm.lrint.v4i64.v4f64(<4 x double>)
+declare <16 x iXLen> @llvm.lrint.v16iXLen.v16f64(<16 x double>)
 
-define <8 x i64> @lrint_v8f64(<8 x double> %x) {
-; CHECK-LABEL: lrint_v8f64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d4, v0.d[1]
-; CHECK-NEXT:    mov d5, v1.d[1]
-; CHECK-NEXT:    mov d6, v2.d[1]
-; CHECK-NEXT:    mov d7, v3.d[1]
-; CHECK-NEXT:    frintx d0, d0
-; CHECK-NEXT:    frintx d1, d1
-; CHECK-NEXT:    frintx d2, d2
-; CHECK-NEXT:    frintx d3, d3
-; CHECK-NEXT:    frintx d4, d4
-; CHECK-NEXT:    frintx d5, d5
-; CHECK-NEXT:    frintx d6, d6
-; CHECK-NEXT:    frintx d7, d7
-; CHECK-NEXT:    fcvtzs x8, d0
-; CHECK-NEXT:    fcvtzs x9, d1
-; CHECK-NEXT:    fcvtzs x10, d2
-; CHECK-NEXT:    fcvtzs x11, d3
-; CHECK-NEXT:    fcvtzs x12, d4
-; CHECK-NEXT:    fcvtzs x13, d5
-; CHECK-NEXT:    fcvtzs x14, d6
-; CHECK-NEXT:    fcvtzs x15, d7
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    fmov d2, x10
-; CHECK-NEXT:    fmov d3, x11
-; CHECK-NEXT:    mov v0.d[1], x12
-; CHECK-NEXT:    mov v1.d[1], x13
-; CHECK-NEXT:    mov v2.d[1], x14
-; CHECK-NEXT:    mov v3.d[1], x15
-; CHECK-NEXT:    ret
-  %a = call <8 x i64> @llvm.lrint.v8i64.v8f64(<8 x double> %x)
-  ret <8 x i64> %a
+define <32 x iXLen> @lrint_v32f64(<32 x double> %x) {
+; CHECK-i32-LABEL: lrint_v32f64:
+; CHECK-i32:       // %bb.0:
+; CHECK-i32-NEXT:    frintx v17.2d, v0.2d
+; CHECK-i32-NEXT:    frintx v19.2d, v2.2d
+; CHECK-i32-NEXT:    frintx v0.2d, v1.2d
+; CHECK-i32-NEXT:    frintx v1.2d, v4.2d
+; CHECK-i32-NEXT:    frintx v2.2d, v3.2d
+; CHECK-i32-NEXT:    frintx v3.2d, v5.2d
+; CHECK-i32-NEXT:    ldp q16, q5, [sp]
+; CHECK-i32-NEXT:    frintx v18.2d, v6.2d
+; CHECK-i32-NEXT:    frintx v4.2d, v7.2d
+; CHECK-i32-NEXT:    ldp q22, q6, [sp, #64]
+; CHECK-i32-NEXT:    mov d20, v17.d[1]
+; CHECK-i32-NEXT:    mov d21, v19.d[1]
+; CHECK-i32-NEXT:    fcvtzs w8, d17
+; CHECK-i32-NEXT:    fcvtzs w9, d19
+; CHECK-i32-NEXT:    ldp q17, q7, [sp, #32]
+; CHECK-i32-NEXT:    fcvtzs w12, d0
+; CHECK-i32-NEXT:    mov d19, v1.d[1]
+; CHECK-i32-NEXT:    fcvtzs w13, d1
+; CHECK-i32-NEXT:    frintx v16.2d, v16.2d
+; CHECK-i32-NEXT:    mov d23, v18.d[1]
+; CHECK-i32-NEXT:    fcvtzs w15, d18
+; CHECK-i32-NEXT:    fcvtzs w10, d20
+; CHECK-i32-NEXT:    fcvtzs w11, d21
+; CHECK-i32-NEXT:    mov d21, v0.d[1]
+; CHECK-i32-NEXT:    fmov s0, w8
+; CHECK-i32-NEXT:    fmov s1, w9
+; CHECK-i32-NEXT:    frintx v17.2d, v17.2d
+; CHECK-i32-NEXT:    frintx v20.2d, v22.2d
+; CHECK-i32-NEXT:    mov d22, v2.d[1]
+; CHECK-i32-NEXT:    fcvtzs w14, d19
+; CHECK-i32-NEXT:    mov d18, v16.d[1]
+; CHECK-i32-NEXT:    frintx v7.2d, v7.2d
+; CHECK-i32-NEXT:    mov v0.s[1], w10
+; CHECK-i32-NEXT:    fcvtzs w10, d2
+; CHECK-i32-NEXT:    mov v1.s[1], w11
+; CHECK-i32-NEXT:    fcvtzs w8, d21
+; CHECK-i32-NEXT:    ldp q21, q19, [sp, #96]
+; CHECK-i32-NEXT:    fmov s2, w13
+; CHECK-i32-NEXT:    fcvtzs w11, d23
+; CHECK-i32-NEXT:    mov d23, v3.d[1]
+; CHECK-i32-NEXT:    fcvtzs w9, d22
+; CHECK-i32-NEXT:    mov d22, v17.d[1]
+; CHECK-i32-NEXT:    fcvtzs w13, d18
+; CHECK-i32-NEXT:    mov v0.s[2], w12
+; CHECK-i32-NEXT:    fcvtzs w12, d16
+; CHECK-i32-NEXT:    mov v1.s[2], w10
+; CHECK-i32-NEXT:    fcvtzs w10, d3
+; CHECK-i32-NEXT:    fmov s3, w15
+; CHECK-i32-NEXT:    frintx v21.2d, v21.2d
+; CHECK-i32-NEXT:    mov v2.s[1], w14
+; CHECK-i32-NEXT:    mov d16, v20.d[1]
+; CHECK-i32-NEXT:    fcvtzs w14, d17
+; CHECK-i32-NEXT:    mov d17, v4.d[1]
+; CHECK-i32-NEXT:    fcvtzs w15, d22
+; CHECK-i32-NEXT:    frintx v22.2d, v5.2d
+; CHECK-i32-NEXT:    mov v3.s[1], w11
+; CHECK-i32-NEXT:    fcvtzs w11, d4
+; CHECK-i32-NEXT:    fmov s4, w12
+; CHECK-i32-NEXT:    fcvtzs w12, d20
+; CHECK-i32-NEXT:    mov d18, v21.d[1]
+; CHECK-i32-NEXT:    mov d20, v7.d[1]
+; CHECK-i32-NEXT:    fmov s5, w14
+; CHECK-i32-NEXT:    fcvtzs w14, d21
+; CHECK-i32-NEXT:    mov v2.s[2], w10
+; CHECK-i32-NEXT:    mov v4.s[1], w13
+; CHECK-i32-NEXT:    fcvtzs w13, d16
+; CHECK-i32-NEXT:    frintx v16.2d, v6.2d
+; CHECK-i32-NEXT:    fcvtzs w10, d23
+; CHECK-i32-NEXT:    mov v3.s[2], w11
+; CHECK-i32-NEXT:    fcvtzs w11, d17
+; CHECK-i32-NEXT:    fmov s6, w12
+; CHECK-i32-NEXT:    mov v5.s[1], w15
+; CHECK-i32-NEXT:    fcvtzs w15, d18
+; CHECK-i32-NEXT:    frintx v18.2d, v19.2d
+; CHECK-i32-NEXT:    fcvtzs w12, d22
+; CHECK-i32-NEXT:    mov d19, v22.d[1]
+; CHECK-i32-NEXT:    mov v0.s[3], w8
+; CHECK-i32-NEXT:    mov v1.s[3], w9
+; CHECK-i32-NEXT:    mov v6.s[1], w13
+; CHECK-i32-NEXT:    fcvtzs w13, d7
+; CHECK-i32-NEXT:    fmov s7, w14
+; CHECK-i32-NEXT:    fcvtzs w14, d16
+; CHECK-i32-NEXT:    mov d16, v16.d[1]
+; CHECK-i32-NEXT:    mov v2.s[3], w10
+; CHECK-i32-NEXT:    mov v4.s[2], w12
+; CHECK-i32-NEXT:    fcvtzs w12, d19
+; CHECK-i32-NEXT:    mov v3.s[3], w11
+; CHECK-i32-NEXT:    mov v7.s[1], w15
+; CHECK-i32-NEXT:    fcvtzs w15, d18
+; CHECK-i32-NEXT:    mov d18, v18.d[1]
+; CHECK-i32-NEXT:    mov v5.s[2], w13
+; CHECK-i32-NEXT:    fcvtzs w13, d20
+; CHECK-i32-NEXT:    mov v6.s[2], w14
+; CHECK-i32-NEXT:    fcvtzs w14, d16
+; CHECK-i32-NEXT:    mov v4.s[3], w12
+; CHECK-i32-NEXT:    mov v7.s[2], w15
+; CHECK-i32-NEXT:    fcvtzs w15, d18
+; CHECK-i32-NEXT:    mov v5.s[3], w13
+; CHECK-i32-NEXT:    mov v6.s[3], w14
+; CHECK-i32-NEXT:    mov v7.s[3], w15
+; CHECK-i32-NEXT:    ret
+;
+; CHECK-i64-LABEL: lrint_v32f64:
+; CHECK-i64:       // %bb.0:
+; CHECK-i64-NEXT:    ldp q17, q16, [sp, #96]
+; CHECK-i64-NEXT:    frintx v7.2d, v7.2d
+; CHECK-i64-NEXT:    ldp q19, q18, [sp, #64]
+; CHECK-i64-NEXT:    frintx v6.2d, v6.2d
+; CHECK-i64-NEXT:    ldp q21, q20, [sp, #32]
+; CHECK-i64-NEXT:    frintx v5.2d, v5.2d
+; CHECK-i64-NEXT:    frintx v16.2d, v16.2d
+; CHECK-i64-NEXT:    frintx v17.2d, v17.2d
+; CHECK-i64-NEXT:    frintx v4.2d, v4.2d
+; CHECK-i64-NEXT:    frintx v18.2d, v18.2d
+; CHECK-i64-NEXT:    frintx v19.2d, v19.2d
+; CHECK-i64-NEXT:    frintx v3.2d, v3.2d
+; CHECK-i64-NEXT:    ldp q23, q22, [sp]
+; CHECK-i64-NEXT:    frintx v20.2d, v20.2d
+; CHECK-i64-NEXT:    frintx v21.2d, v21.2d
+; CHECK-i64-NEXT:    frintx v2.2d, v2.2d
+; CHECK-i64-NEXT:    frintx v1.2d, v1.2d
+; CHECK-i64-NEXT:    fcvtzs v16.2d, v16.2d
+; CHECK-i64-NEXT:    fcvtzs v17.2d, v17.2d
+; CHECK-i64-NEXT:    frintx v0.2d, v0.2d
+; CHECK-i64-NEXT:    frintx v22.2d, v22.2d
+; CHECK-i64-NEXT:    fcvtzs v18.2d, v18.2d
+; CHECK-i64-NEXT:    frintx v23.2d, v23.2d
+; CHECK-i64-NEXT:    fcvtzs v19.2d, v19.2d
+; CHECK-i64-NEXT:    fcvtzs v20.2d, v20.2d
+; CHECK-i64-NEXT:    fcvtzs v7.2d, v7.2d
+; CHECK-i64-NEXT:    fcvtzs v6.2d, v6.2d
+; CHECK-i64-NEXT:    fcvtzs v5.2d, v5.2d
+; CHECK-i64-NEXT:    fcvtzs v4.2d, v4.2d
+; CHECK-i64-NEXT:    stp q17, q16, [x8, #224]
+; CHECK-i64-NEXT:    fcvtzs v16.2d, v21.2d
+; CHECK-i64-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-i64-NEXT:    fcvtzs v17.2d, v22.2d
+; CHECK-i64-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-i64-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-i64-NEXT:    stp q19, q18, [x8, #192]
+; CHECK-i64-NEXT:    fcvtzs v18.2d, v23.2d
+; CHECK-i64-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-i64-NEXT:    stp q4, q5, [x8, #64]
+; CHECK-i64-NEXT:    stp q6, q7, [x8, #96]
+; CHECK-i64-NEXT:    stp q2, q3, [x8, #32]
+; CHECK-i64-NEXT:    stp q0, q1, [x8]
+; CHECK-i64-NEXT:    stp q18, q17, [x8, #128]
+; CHECK-i64-NEXT:    stp q16, q20, [x8, #160]
+; CHECK-i64-NEXT:    ret
+  %a = call <32 x iXLen> @llvm.lrint.v32iXLen.v16f64(<32 x double> %x)
+  ret <32 x iXLen> %a
 }
-declare <8 x i64> @llvm.lrint.v8i64.v8f64(<8 x double>)
+declare <32 x iXLen> @llvm.lrint.v32iXLen.v32f64(<32 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
index 9b0bd2752b82..6594d7f50421 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
@@ -218,17 +218,17 @@ body: |
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY3]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1
   ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]]
@@ -257,14 +257,14 @@ body: |
     %14:_(s1) = G_FCMP floatpred(ogt), %13(s32), %0
     %15:_(s32) = G_CONSTANT i32 1
     %9:_(s32) = G_ADD %8, %15
-    %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %14(s1), %6(s32)
+    %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %14(s1), %6(s32)
     SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
     %16:_(s1) = G_PHI %11(s1), %bb.1
     %17:_(s32) = G_PHI %7(s32), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32)
     %18:_(s32) = G_FCONSTANT float 0.000000e+00
     %19:_(s32) = G_FCONSTANT float 1.000000e+00
     %20:_(s32) = G_SELECT %16(s1), %19, %18
@@ -348,18 +348,18 @@ body: |
   ; GFX10-NEXT:   [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C9]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
   ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
   ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5
   ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
   ; GFX10-NEXT:   [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY13]](s1), [[C11]], [[C10]]
@@ -429,14 +429,14 @@ body: |
     %31:_(s1) = G_FCMP floatpred(ogt), %30(s32), %0
     %32:_(s32) = G_CONSTANT i32 1
     %17:_(s32) = G_ADD %16, %32
-    %15:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %31(s1), %14(s32)
+    %15:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %31(s1), %14(s32)
     SI_LOOP %15(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.6
 
   bb.6:
     %33:_(s1) = G_PHI %19(s1), %bb.5
     %34:_(s32) = G_PHI %15(s32), %bb.5
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
     %35:_(s32) = G_FCONSTANT float 0.000000e+00
     %36:_(s32) = G_FCONSTANT float 1.000000e+00
     %37:_(s32) = G_SELECT %33(s1), %36, %35
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
index 206c0adb6c0c..5bbe3e488689 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
@@ -45,20 +45,20 @@ body: |
   ; GFX10-NEXT:   [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C3]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1
   ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_2]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY11]](s1), [[C5]], [[C4]]
@@ -89,14 +89,14 @@ body: |
     %16:_(s1) = G_FCMP floatpred(ogt), %15(s32), %0
     %17:_(s32) = G_CONSTANT i32 1
     %11:_(s32) = G_ADD %10, %17
-    %9:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %16(s1), %8(s32)
+    %9:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %16(s1), %8(s32)
     SI_LOOP %9(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
     %18:_(s1) = G_PHI %12(s1), %bb.1
     %19:_(s32) = G_PHI %9(s32), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %19(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %19(s32)
     %20:_(s32) = G_FCONSTANT float 0.000000e+00
     %21:_(s32) = G_FCONSTANT float 1.000000e+00
     %22:_(s32) = G_SELECT %18(s1), %21, %20
@@ -165,7 +165,7 @@ body: |
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.2
   ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1)
   ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY12]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
   ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[PHI3]], [[C3]](s64)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
@@ -219,7 +219,7 @@ body: |
     successors: %bb.4(0x04000000), %bb.1(0x7c000000)
 
     %13:_(s1) = G_PHI %17(s1), %bb.2, %12(s1), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32)
     %18:_(s64) = G_CONSTANT i64 4
     %11:_(p1) = G_PTR_ADD %10, %18(s64)
     %19:_(s32) = G_CONSTANT i32 1
@@ -275,18 +275,18 @@ body: |
   ; GFX10-NEXT:   [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C3]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
   ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
   ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1
   ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY9]](s1), [[C5]], [[C4]]
@@ -317,14 +317,14 @@ body: |
     %16:_(s1) = G_FCMP floatpred(ogt), %15(s32), %0
     %17:_(s32) = G_CONSTANT i32 1
     %11:_(s32) = G_ADD %10, %17
-    %9:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %16(s1), %8(s32)
+    %9:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %16(s1), %8(s32)
     SI_LOOP %9(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
     %18:_(s1) = G_PHI %13(s1), %bb.1
     %19:_(s32) = G_PHI %9(s32), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %19(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %19(s32)
     %20:_(s32) = G_FCONSTANT float 0.000000e+00
     %21:_(s32) = G_FCONSTANT float 1.000000e+00
     %22:_(s32) = G_SELECT %18(s1), %21, %20
@@ -372,7 +372,7 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %40(s1), %bb.8
   ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
@@ -432,7 +432,7 @@ body: |
   ; GFX10-NEXT:   G_STORE [[C8]](s32), [[MV1]](p0) :: (store (s32))
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
   ; GFX10-NEXT:   SI_RETURN
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.7:
@@ -443,24 +443,24 @@ body: |
   ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3
   ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1)
   ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
   ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY18]], [[C9]]
   ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY17]](s1), [[PHI4]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY17]](s1), [[PHI4]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY19]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.8
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.8:
   ; GFX10-NEXT:   successors: %bb.2(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.7
+  ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.7
   ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_4]](s1)
   ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY20]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY21]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_5]](s1), [[S_AND_B32_5]](s1), implicit-def $scc
@@ -493,7 +493,7 @@ body: |
     successors: %bb.5(0x40000000), %bb.6(0x40000000)
 
     %13:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.8, %10(s1), %bb.0
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %11(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %11(s32)
     %15:sreg_32_xm0_xexec(s32) = SI_IF %13(s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.5
 
@@ -529,7 +529,7 @@ body: |
     G_STORE %33(s32), %6(p0) :: (store (s32))
 
   bb.6:
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
     SI_RETURN
 
   bb.7:
@@ -538,10 +538,10 @@ body: |
     %19:_(s32) = G_PHI %31(s32), %bb.4, %7(s32), %bb.3
     %34:_(s1) = G_PHI %29(s1), %bb.4, %20(s1), %bb.3
     %35:_(s1) = G_PHI %32(s1), %bb.4, %20(s1), %bb.3
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %28(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %28(s32)
     %36:_(s1) = G_CONSTANT i1 true
     %37:_(s1) = G_XOR %34, %36
-    %17:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %35(s1), %16(s32)
+    %17:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %35(s1), %16(s32)
     SI_LOOP %17(s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.8
 
@@ -550,7 +550,7 @@ body: |
 
     %14:_(s1) = G_PHI %37(s1), %bb.7
     %38:_(s32) = G_PHI %17(s32), %bb.7
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %38(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %38(s32)
     G_BR %bb.2
 ...
 
@@ -605,7 +605,7 @@ body: |
   ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI2]]
   ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
   ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[COPY8]](s1)
@@ -629,21 +629,21 @@ body: |
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
   ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.7
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.7:
   ; GFX10-NEXT:   successors: %bb.8(0x40000000), %bb.9(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.6
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.6
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI2]](s32), %bb.6
   ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_1]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
   ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY12]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.8
   ; GFX10-NEXT: {{  $}}
@@ -653,7 +653,7 @@ body: |
   ; GFX10-NEXT:   G_STORE [[PHI6]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.9:
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
     successors: %bb.1(0x80000000)
@@ -696,7 +696,7 @@ body: |
     successors: %bb.5(0x40000000), %bb.6(0x40000000)
 
     %20:_(s1) = G_CONSTANT i1 true
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
     %21:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %1(s32), %12
     %22:sreg_32_xm0_xexec(s32) = SI_IF %21(s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.5
@@ -713,8 +713,8 @@ body: |
 
     %13:_(s32) = G_PHI %25(s32), %bb.5, %9(s32), %bb.4
     %26:_(s1) = G_PHI %23(s1), %bb.5, %20(s1), %bb.4
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %22(s32)
-    %11:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %26(s1), %10(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %22(s32)
+    %11:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %26(s1), %10(s32)
     SI_LOOP %11(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.7
 
@@ -724,7 +724,7 @@ body: |
     %27:_(s32) = G_PHI %11(s32), %bb.6
     %28:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.6
     %29:_(s32) = G_PHI %12(s32), %bb.6
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32)
     %30:sreg_32_xm0_xexec(s32) = SI_IF %28(s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.8
 
@@ -734,7 +734,7 @@ body: |
     G_STORE %29(s32), %7(p1) :: (store (s32), addrspace 1)
 
   bb.9:
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %30(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %30(s32)
     SI_RETURN
 ...
 
@@ -803,27 +803,27 @@ body: |
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[PHI2]](s1), %bb.1, [[DEF2]](s1), %bb.2
   ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
   ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY12]]
   ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[FREEZE]](s1)
   ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[FREEZE]](s1)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C4]]
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI4]](s32), [[COPY]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI3]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI3]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
-  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3
   ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_3]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY16]](s1), [[C6]], [[C5]]
@@ -867,19 +867,19 @@ body: |
     successors: %bb.4(0x04000000), %bb.1(0x7c000000)
 
     %23:_(s1) = G_PHI %22(s1), %bb.2, %13(s1), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
     %14:_(s1) = G_FREEZE %23
     %24:_(s32) = G_CONSTANT i32 1
     %12:_(s32) = G_ADD %11, %24
     %25:_(s1) = G_ICMP intpred(slt), %11(s32), %0
-    %10:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %25(s1), %9(s32)
+    %10:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %25(s1), %9(s32)
     SI_LOOP %10(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.4
 
   bb.4:
     %26:_(s1) = G_PHI %14(s1), %bb.3
     %27:_(s32) = G_PHI %10(s32), %bb.3
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32)
     %28:_(s32) = G_FCONSTANT float 0.000000e+00
     %29:_(s32) = G_FCONSTANT float 1.000000e+00
     %30:_(s32) = G_SELECT %26(s1), %29, %28
@@ -976,7 +976,7 @@ body: |
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
   ; GFX10-NEXT:   S_ENDPGM 0
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
@@ -988,20 +988,20 @@ body: |
   ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
   ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1)
   ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5
   ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
   ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   bb.0:
@@ -1060,7 +1060,7 @@ body: |
     G_BR %bb.5
 
   bb.4:
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
     S_ENDPGM 0
 
   bb.5:
@@ -1069,8 +1069,8 @@ body: |
     %14:_(s32) = G_PHI %32(s32), %bb.3, %10(s32), %bb.1
     %36:_(s1) = G_PHI %25(s1), %bb.3, %15(s1), %bb.1
     %37:_(s1) = G_PHI %34(s1), %bb.3, %15(s1), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32)
-    %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %37(s1), %11(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32)
+    %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %37(s1), %11(s32)
     SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.6
 
@@ -1079,7 +1079,7 @@ body: |
 
     %38:sreg_32_xm0_xexec(s1) = G_PHI %36(s1), %bb.5
     %39:_(s32) = G_PHI %12(s32), %bb.5
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %39(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %39(s32)
     %35:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
index 8197b072c740..1d291eeab8e9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
@@ -38,7 +38,7 @@ body: |
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
   ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY7]](s1), [[C4]], [[C3]]
@@ -68,7 +68,7 @@ body: |
 
   bb.2:
     %12:_(s1) = G_PHI %6(s1), %bb.0, %11(s1), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %9(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %9(s32)
     %13:_(s32) = G_CONSTANT i32 2
     %14:_(s32) = G_CONSTANT i32 1
     %15:_(s32) = G_SELECT %12(s1), %14, %13
@@ -134,7 +134,7 @@ body: |
   ; GFX10-NEXT: bb.4:
   ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY7]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2
   ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_ELSE]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_ELSE]](s32)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY11]](s1), [[C3]], [[C4]]
@@ -178,7 +178,7 @@ body: |
 
   bb.4:
     %15:_(s1) = G_PHI %9(s1), %bb.1, %13(s1), %bb.2
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %11(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %11(s32)
     %16:_(s32) = G_CONSTANT i32 1
     %17:_(s32) = G_CONSTANT i32 2
     %18:_(s32) = G_SELECT %15(s1), %16, %17
@@ -253,14 +253,14 @@ body: |
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.2, [[DEF]](s32), %bb.1
   ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), [[PHI1]](s32)
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.1(0x80000000)
@@ -310,14 +310,14 @@ body: |
 
     %11:_(s32) = G_PHI %27(s32), %bb.2, %7(s32), %bb.1
     %30:_(s1) = G_PHI %29(s1), %bb.2, %12(s1), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %20(s32)
-    %9:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %30(s1), %8(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %20(s32)
+    %9:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %30(s1), %8(s32)
     SI_LOOP %9(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.4
 
   bb.4:
     %31:_(s32) = G_PHI %9(s32), %bb.3
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %31(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %31(s32)
     S_ENDPGM 0
 ...
 
@@ -388,9 +388,9 @@ body: |
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %47(s1), %bb.5
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1
   ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32)
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
@@ -418,15 +418,15 @@ body: |
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
   ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
   ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY13]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
-  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.1(0x80000000)
@@ -478,8 +478,8 @@ body: |
 
     %14:_(s32) = G_PHI %32(s32), %bb.5, %10(s32), %bb.1
     %33:_(s1) = G_PHI %34(s1), %bb.5, %15(s1), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32)
-    %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %33(s1), %11(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32)
+    %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %33(s1), %11(s32)
     SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.6
 
@@ -502,12 +502,12 @@ body: |
 
     %32:_(s32) = G_PHI %41(s32), %bb.4, %10(s32), %bb.2
     %34:_(s1) = G_PHI %43(s1), %bb.4, %24(s1), %bb.2
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %31(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %31(s32)
     G_BR %bb.3
 
   bb.6:
     %44:_(s32) = G_PHI %12(s32), %bb.3
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %44(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %44(s32)
     S_ENDPGM 0
 ...
 
@@ -581,9 +581,9 @@ body: |
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %60(s1), %bb.5
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1
   ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), [[PHI1]](s32)
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.8
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
@@ -608,7 +608,7 @@ body: |
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2
   ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
   ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[COPY16]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
@@ -639,15 +639,15 @@ body: |
   ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4
   ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
   ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[COPY19]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY20]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.8:
-  ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32)
+  ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.1(0x80000000)
@@ -702,8 +702,8 @@ body: |
 
     %17:_(s32) = G_PHI %35(s32), %bb.5, %13(s32), %bb.1
     %36:_(s1) = G_PHI %37(s1), %bb.5, %18(s1), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %26(s32)
-    %15:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %36(s1), %14(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %26(s32)
+    %15:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %36(s1), %14(s32)
     SI_LOOP %15(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.8
 
@@ -725,7 +725,7 @@ body: |
 
     %35:_(s32) = G_PHI %46(s32), %bb.7, %13(s32), %bb.2
     %37:_(s1) = G_PHI %47(s1), %bb.7, %27(s1), %bb.2
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
     G_BR %bb.3
 
   bb.6:
@@ -747,12 +747,12 @@ body: |
 
     %46:_(s32) = G_PHI %54(s32), %bb.6, %13(s32), %bb.4
     %47:_(s1) = G_PHI %56(s1), %bb.6, %38(s1), %bb.4
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %45(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %45(s32)
     G_BR %bb.5
 
   bb.8:
     %57:_(s32) = G_PHI %15(s32), %bb.3
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %57(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %57(s32)
     S_ENDPGM 0
 ...
 
@@ -845,7 +845,7 @@ body: |
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
   ; GFX10-NEXT:   S_ENDPGM 0
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
@@ -857,20 +857,20 @@ body: |
   ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
   ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1)
   ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5
   ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
   ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   bb.0:
@@ -929,7 +929,7 @@ body: |
     G_BR %bb.5
 
   bb.4:
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
     S_ENDPGM 0
 
   bb.5:
@@ -938,8 +938,8 @@ body: |
     %14:_(s32) = G_PHI %32(s32), %bb.3, %10(s32), %bb.1
     %36:_(s1) = G_PHI %25(s1), %bb.3, %15(s1), %bb.1
     %37:_(s1) = G_PHI %34(s1), %bb.3, %15(s1), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32)
-    %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %37(s1), %11(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32)
+    %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %37(s1), %11(s32)
     SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.6
 
@@ -948,7 +948,7 @@ body: |
 
     %38:sreg_32_xm0_xexec(s1) = G_PHI %36(s1), %bb.5
     %39:_(s32) = G_PHI %12(s32), %bb.5
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %39(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %39(s32)
     %35:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 ...
@@ -996,34 +996,34 @@ body: |
   ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
   ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY10]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY9]](s1), %17(s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY9]](s1), %17(s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_]](s1)
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.3(0x7c000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %19(s32), %bb.3
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI3]](s32)
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT1]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[INT1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI3]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INT1]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
   ; GFX10-NEXT:   successors: %bb.5(0x04000000), %bb.7(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INTRINSIC_CONVERGENT]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32)
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY5]](s32), [[COPY]]
   ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C2]](s1)
   ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C2]]
   ; GFX10-NEXT:   [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP2]], [[XOR]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[OR]](s1), %25(s32)
+  ; GFX10-NEXT:   [[INT2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[OR]](s1), %25(s32)
   ; GFX10-NEXT:   [[DEF4:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   [[DEF5:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %63(s1), $exec_lo, implicit-def $scc
@@ -1032,26 +1032,26 @@ body: |
   ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT2]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   SI_LOOP [[INT2]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT2]](s32), %bb.4
   ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
   ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY15]](s1), [[COPY3]], [[COPY2]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SELECT]](s32)
-  ; GFX10-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT3]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SELECT]](s32)
+  ; GFX10-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
   ; GFX10-NEXT:   successors: %bb.2(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT1]](s32), %bb.3
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INT1]](s32), %bb.3
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
   ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %42(s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
@@ -1068,8 +1068,8 @@ body: |
   ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.0, [[PHI7]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, [[PHI1]](s1), %bb.2, [[DEF5]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, [[PHI2]](s1), %bb.2, [[DEF4]](s1), %bb.4
-  ; GFX10-NEXT:   [[PHI10:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4, [[PHI10]](s32), %bb.2, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI11:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INTRINSIC_CONVERGENT]](s32), %bb.2, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI10:%[0-9]+]]:_(s32) = G_PHI [[INT2]](s32), %bb.4, [[PHI10]](s32), %bb.2, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI11:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INT]](s32), %bb.2, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
   ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
   ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]](s1)
@@ -1113,8 +1113,8 @@ body: |
 
     %11:_(s1) = G_PHI %12(s1), %bb.6, %7(s1), %bb.7
     %13:_(s1) = G_PHI %12(s1), %bb.6, %14(s1), %bb.7
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
-    %16:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %13(s1), %17(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+    %16:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %13(s1), %17(s32)
     SI_LOOP %16(s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.4
 
@@ -1122,26 +1122,26 @@ body: |
     successors: %bb.6(0x04000000), %bb.3(0x7c000000)
 
     %18:_(s32) = G_PHI %9(s32), %bb.1, %19(s32), %bb.3
-    %19:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %10(s1), %18(s32)
+    %19:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %10(s1), %18(s32)
     SI_LOOP %19(s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.6
 
   bb.4:
     successors: %bb.5(0x04000000), %bb.7(0x7c000000)
 
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %16(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %16(s32)
     %20:_(s1) = G_ICMP intpred(sgt), %5(s32), %0
     %21:_(s1) = G_CONSTANT i1 true
     %22:_(s1) = G_XOR %8, %21
     %23:_(s1) = G_OR %20, %22
-    %24:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %23(s1), %25(s32)
+    %24:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %23(s1), %25(s32)
     SI_LOOP %24(s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.5
 
   bb.5:
     %26:_(s1) = G_PHI %20(s1), %bb.4
     %27:_(s32) = G_PHI %24(s32), %bb.4
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32)
     %28:_(s32) = G_SELECT %26(s1), %3, %2
     %29:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %28(s32)
     $sgpr0 = COPY %29(s32)
@@ -1152,7 +1152,7 @@ body: |
 
     %30:_(s32) = G_PHI %19(s32), %bb.3
     %12:_(s1) = G_CONSTANT i1 false
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %30(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %30(s32)
     G_BR %bb.2
 
   bb.7:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
index abb491f938e5..fb436623bed2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
@@ -34,17 +34,17 @@ body: |
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY3]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1
   ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]]
@@ -73,14 +73,14 @@ body: |
     %14:_(s1) = G_FCMP floatpred(ogt), %13(s32), %0
     %15:_(s32) = G_CONSTANT i32 1
     %9:_(s32) = G_ADD %8, %15
-    %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %14(s1), %6(s32)
+    %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %14(s1), %6(s32)
     SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
     %16:_(s1) = G_PHI %10(s1), %bb.1
     %17:_(s32) = G_PHI %7(s32), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32)
     %18:_(s32) = G_FCONSTANT float 0.000000e+00
     %19:_(s32) = G_FCONSTANT float 1.000000e+00
     %20:_(s32) = G_SELECT %16(s1), %19, %18
@@ -121,17 +121,17 @@ body: |
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY3]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1
   ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]]
@@ -160,14 +160,14 @@ body: |
     %14:_(s1) = G_FCMP floatpred(ogt), %13(s32), %0
     %15:_(s32) = G_CONSTANT i32 1
     %9:_(s32) = G_ADD %8, %15
-    %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %14(s1), %6(s32)
+    %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %14(s1), %6(s32)
     SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
     %16:_(s1) = G_PHI %11(s1), %bb.1
     %17:_(s32) = G_PHI %7(s32), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32)
     %18:_(s32) = G_FCONSTANT float 0.000000e+00
     %19:_(s32) = G_FCONSTANT float 1.000000e+00
     %20:_(s32) = G_SELECT %16(s1), %19, %18
@@ -252,7 +252,7 @@ body: |
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
   ; GFX10-NEXT:   S_ENDPGM 0
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
@@ -263,19 +263,19 @@ body: |
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
   ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
   ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), [[PHI2]](s32)
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), [[PHI2]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5
   ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_2]](s1)
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY14]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   bb.0:
@@ -334,7 +334,7 @@ body: |
     G_BR %bb.5
 
   bb.4:
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
     S_ENDPGM 0
 
   bb.5:
@@ -343,7 +343,7 @@ body: |
     %15:_(s32) = G_PHI %32(s32), %bb.3, %11(s32), %bb.1
     %35:_(s1) = G_PHI %25(s1), %bb.3, %16(s1), %bb.1
     %36:_(s1) = G_PHI %33(s1), %bb.3, %16(s1), %bb.1
-    %13:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %36(s1), %12(s32)
+    %13:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %36(s1), %12(s32)
     SI_LOOP %13(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.6
 
@@ -352,7 +352,7 @@ body: |
 
     %37:sreg_32_xm0_xexec(s1) = G_PHI %35(s1), %bb.5
     %38:_(s32) = G_PHI %13(s32), %bb.5
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %38(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %38(s32)
     %34:sreg_32_xm0_xexec(s32) = SI_IF %37(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
index cd6a0f8297a5..d1b473f2f41d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
@@ -27,14 +27,14 @@ body: |
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C2]]
   ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[ADD]](s32)
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
-  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.1
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s32)
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s32)
   ; GFX10-NEXT:   G_STORE [[PHI2]](s32), [[MV]](p0) :: (store (s32))
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
@@ -57,14 +57,14 @@ body: |
     %9:_(s32) = G_ADD %8, %10
     %11:_(s32) = G_UITOFP %9(s32)
     %12:_(s1) = G_FCMP floatpred(ogt), %11(s32), %0
-    %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %12(s1), %6(s32)
+    %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %12(s1), %6(s32)
     SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
     %13:_(s32) = G_PHI %9(s32), %bb.1
     %14:_(s32) = G_PHI %7(s32), %bb.1
-    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32)
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32)
     G_STORE %13(s32), %3(p0) :: (store (s32))
     SI_RETURN
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll
index fa49b26847e5..8262cfd34823 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll
@@ -34,15 +34,15 @@ define float @test_atomicrmw_fsub(ptr addrspace(3) %addr) {
   ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %13(s32), %bb.2
   ; CHECK-NEXT:   [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[PHI1]], [[C]]
   ; CHECK-NEXT:   [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[PHI1]], [[FSUB]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3)
-  ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64)
-  ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:%[0-9]+]]:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INTRINSIC_CONVERGENT]](s64)
-  ; CHECK-NEXT:   G_BRCOND [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS]](s1), %bb.3
+  ; CHECK-NEXT:   [[INTRINSIC:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64)
+  ; CHECK-NEXT:   [[INTRINSIC_W_SIDE_EFFECTS:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INTRINSIC]](s64)
+  ; CHECK-NEXT:   G_BRCOND [[INTRINSIC_W_SIDE_EFFECTS]](s1), %bb.3
   ; CHECK-NEXT:   G_BR %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3.atomicrmw.end:
   ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32), %bb.2
-  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INTRINSIC_CONVERGENT]](s64), %bb.2
-  ; CHECK-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64)
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INTRINSIC]](s64), %bb.2
+  ; CHECK-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64)
   ; CHECK-NEXT:   $vgpr0 = COPY [[PHI2]](s32)
   ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
   %oldval = atomicrmw fsub ptr addrspace(3) %addr, float 1.0 seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index 0b21c2112f05..6d32d4c720c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -97,8 +97,8 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[TRUNC]], [[C]]
-  ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:%[0-9]+]]:_(s1), [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS1:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1)
-  ; CHECK-NEXT:   G_BRCOND [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS]](s1), %bb.2
+  ; CHECK-NEXT:   [[INTRINSIC_W_SIDE_EFFECTS:%[0-9]+]]:_(s1), [[INTRINSIC_W_SIDE_EFFECTS1:%[0-9]+]]:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1)
+  ; CHECK-NEXT:   G_BRCOND [[INTRINSIC_W_SIDE_EFFECTS]](s1), %bb.2
   ; CHECK-NEXT:   G_BR %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.bb1:
@@ -108,7 +108,7 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
   ; CHECK-NEXT:   G_BR %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3.bb2:
-  ; CHECK-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INTRINSIC_CONVERGENT_W_SIDE_EFFECTS1]](s64)
+  ; CHECK-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INTRINSIC_W_SIDE_EFFECTS1]](s64)
   ; CHECK-NEXT:   SI_RETURN
 bb:
   br i1 %arg, label %bb2, label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll
new file mode 100644
index 000000000000..d3bc661f5940
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel -verify-machineinstrs < %s | FileCheck %s
+
+; early-tailduplication deletes cycle exit block created by structurize-cfg
+; that had exactly one predecessor. Now, new cycle exit block has two
+; predecessors, we need to find predecessor that belongs to the cycle.
+
+define amdgpu_ps void @_amdgpu_ps_main(i1 %arg) {
+; CHECK-LABEL: _amdgpu_ps_main:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT:    s_mov_b32 s5, s4
+; CHECK-NEXT:    s_mov_b32 s6, s4
+; CHECK-NEXT:    s_mov_b32 s7, s4
+; CHECK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; CHECK-NEXT:    s_buffer_load_dword s1, s[4:7], 0x0
+; CHECK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; CHECK-NEXT:    s_mov_b32 s10, -1
+; CHECK-NEXT:    s_mov_b32 s11, 0x31c16000
+; CHECK-NEXT:    s_add_u32 s8, s8, s0
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_cmp_ge_i32 s1, 0
+; CHECK-NEXT:    s_cbranch_scc0 .LBB0_2
+; CHECK-NEXT:  .LBB0_1: ; %bb12
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; CHECK-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; CHECK-NEXT:    s_swappc_b64 s[30:31], 0
+; CHECK-NEXT:  .LBB0_2: ; %bb2.preheader
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s1
+; CHECK-NEXT:    s_branch .LBB0_4
+; CHECK-NEXT:    .p2align 6
+; CHECK-NEXT:  .LBB0_3: ; %bb6
+; CHECK-NEXT:    ; in Loop: Header=BB0_4 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; CHECK-NEXT:    s_and_b32 s2, 1, s2
+; CHECK-NEXT:    v_or_b32_e32 v1, 1, v0
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s2, 0, s2
+; CHECK-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, v1
+; CHECK-NEXT:    s_and_b32 s4, s2, s1
+; CHECK-NEXT:    s_andn2_b32 s1, s1, exec_lo
+; CHECK-NEXT:    s_and_b32 s2, exec_lo, s4
+; CHECK-NEXT:    s_or_b32 s1, s1, s2
+; CHECK-NEXT:    s_cbranch_vccz .LBB0_1
+; CHECK-NEXT:  .LBB0_4: ; %bb2
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_mov_b32 s2, 0
+; CHECK-NEXT:    s_and_saveexec_b32 s3, s0
+; CHECK-NEXT:    s_cbranch_execz .LBB0_3
+; CHECK-NEXT:  ; %bb.5: ; %bb5
+; CHECK-NEXT:    ; in Loop: Header=BB0_4 Depth=1
+; CHECK-NEXT:    s_mov_b32 s2, 1
+; CHECK-NEXT:    s_branch .LBB0_3
+bb:
+  %i = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0)
+  %i1 = icmp slt i32 %i, 0
+  br i1 %i1, label %bb2, label %bb12
+
+bb2:
+  %i3 = phi i1 [ %i9, %bb6 ], [ false, %bb ]
+  %i4 = phi i32 [ %i10, %bb6 ], [ 0, %bb ]
+  br i1 %arg, label %bb5, label %bb6
+
+bb5:
+  br label %bb6
+
+bb6:
+  %i7 = phi i32 [ 0, %bb2 ], [ 1, %bb5 ]
+  %i8 = icmp ne i32 %i7, 0
+  %i9 = select i1 %i8, i1 %i3, i1 false
+  %i10 = or i32 %i4, 1
+  %i11 = icmp slt i32 %i4, 0
+  br i1 %i11, label %bb2, label %bb12
+
+bb12:
+  %i13 = phi i1 [ false, %bb ], [ %i9, %bb6 ]
+  %i14 = select i1 %i13, float 0.000000e+00, float 1.000000e+00
+  %i15 = insertelement <4 x float> zeroinitializer, float %i14, i64 0
+  call amdgpu_gfx addrspace(4) void null(<4 x float> %i15, i32 0)
+  unreachable
+}
+
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir
index 1e87f8b62ca5..b7e52cadd8cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir
@@ -2,12 +2,12 @@
 
 # Make sure incorrect usage of control flow intrinsics fails to select in case some transform separated the intrinsic from its branch.
 
-# ERR: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_different_block)
-# ERR-NEXT: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: si_if_not_brcond_user)
-# ERR-NEXT: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: si_if_multi_user)
-# ERR-NEXT: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_xor_0)
-# ERR-NEXT: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_or_neg1)
-# ERR-NEXT: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_negated_multi_use)
+# ERR: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_different_block)
+# ERR-NEXT: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: si_if_not_brcond_user)
+# ERR-NEXT: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: si_if_multi_user)
+# ERR-NEXT: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_xor_0)
+# ERR-NEXT: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_or_neg1)
+# ERR-NEXT: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_negated_multi_use)
 
 
 ---
@@ -19,7 +19,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
 
   bb.1:
       G_BRCOND %3, %bb.1
@@ -34,7 +34,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
     %5:_(s32) = G_SELECT %3, %0, %1
     S_ENDPGM 0, implicit %5
 
@@ -48,7 +48,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
     %5:_(s32) = G_SELECT %3, %0, %1
     G_BRCOND %3, %bb.1
 
@@ -67,7 +67,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
     %5:_(s1) = G_CONSTANT i1 false
     %6:_(s1) = G_XOR %3, %5
     G_BRCOND %6, %bb.2
@@ -93,7 +93,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
     %5:_(s1) = G_CONSTANT i1 true
     %6:_(s1) = G_OR %3, %5
     G_BRCOND %6, %bb.2
@@ -118,7 +118,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
     %5:_(s1) = G_CONSTANT i1 true
     %6:_(s1) = G_XOR %3, %5
     S_NOP 0, implicit %6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir
index bfc398dba3a9..9716bb31db3f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir
@@ -2,7 +2,7 @@
 
 # Make sure there's no crash if there is somehow no successor block.
 
-# ERR: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_no_succ_block)
+# ERR: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1) (in function: brcond_si_if_no_succ_block)
 
 ---
 name: brcond_si_if_no_succ_block
@@ -16,6 +16,6 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
     G_BRCOND %3, %bb.1
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
index 137631a6a8f8..9be5e14cdc71 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
@@ -150,7 +150,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
     G_BRCOND %3, %bb.1
 
   bb.1:
@@ -189,7 +189,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %2
     G_BRCOND %3, %bb.1
 
   bb.1:
@@ -244,7 +244,7 @@ body:             |
   bb.1:
     successors: %bb.1, %bb.2
     S_NOP 0
-    %3:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
+    %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
     G_BRCOND %3, %bb.2
     G_BR %bb.1
 
@@ -303,7 +303,7 @@ body:             |
   bb.1:
     successors: %bb.1, %bb.2
     S_NOP 0
-    %3:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
+    %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
     G_BRCOND %3, %bb.1
     G_BR %bb.2
 
@@ -360,7 +360,7 @@ body:             |
   bb.1:
     successors: %bb.1, %bb.2
     S_NOP 0
-    %3:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
+    %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
     G_BRCOND %3, %bb.1
 
   bb.2:
@@ -405,7 +405,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
     %5:_(s32) = COPY $vgpr2
     G_BRCOND %3, %bb.1
 
@@ -466,7 +466,7 @@ body:             |
   bb.1:
     successors: %bb.1, %bb.2
     S_NOP 0
-    %3:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
+    %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
     S_NOP 0
     S_NOP 0
     G_BRCOND %3, %bb.2
@@ -521,7 +521,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
     %5:_(s1) = G_CONSTANT i1 true
     %6:_(s1) = G_XOR %3, %5
     G_BRCOND %6, %bb.2
@@ -588,7 +588,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
     %5:_(s1) = G_CONSTANT i1 true
     %6:_(s1) = G_XOR %3, %5
     G_BRCOND %6, %bb.2
@@ -653,7 +653,7 @@ body:             |
   bb.1:
     successors: %bb.1, %bb.2
     S_NOP 0
-    %3:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
+    %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
     %4:_(s1) = G_CONSTANT i1 true
     %5:_(s1) = G_XOR %3, %4
     G_BRCOND %5, %bb.1
@@ -711,7 +711,7 @@ body:             |
   bb.1:
     successors: %bb.1, %bb.2
     S_NOP 0
-    %3:_(s1) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
+    %3:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), %2
     %4:_(s1) = G_CONSTANT i1 true
     %5:_(s1) = G_XOR %3, %4
     G_BRCOND %5, %bb.2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index f7adfe47b64f..727184a36c00 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -43,8 +43,8 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
   store i32 %tmp0, ptr addrspace(1) %out
   ret void
 }
-define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
-; GFX8-LABEL: update_dpp64_test:
+define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
+; GFX8-LABEL: update_dppi64_test:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
@@ -62,7 +62,7 @@ define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i6
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: update_dpp64_test:
+; GFX10-LABEL: update_dppi64_test:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
@@ -76,7 +76,7 @@ define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i6
 ; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: update_dpp64_test:
+; GFX11-LABEL: update_dppi64_test:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
@@ -98,6 +98,335 @@ define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i6
   ret void
 }
 
+define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) {
+; GFX8-LABEL: update_dppf64_test:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX10-LABEL: update_dppf64_test:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: update_dppf64_test:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
+  %load = load double, ptr addrspace(1) %gep
+  %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double %load, i32 1, i32 1, i32 1, i1 false) #1
+  store double %tmp0, ptr addrspace(1) %gep
+  ret void
+}
+
+define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) {
+; GFX8-LABEL: update_dppv2i32_test:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX10-LABEL: update_dppv2i32_test:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: update_dppv2i32_test:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id
+  %load = load <2 x i32>, ptr addrspace(1) %gep
+  %tmp0 = call <2 x i32> @llvm.amdgcn.update.dpp.v2i32(<2 x i32> %in1, <2 x i32> %load, i32 1, i32 1, i32 1, i1 false) #1
+  store <2 x i32> %tmp0, ptr addrspace(1) %gep
+  ret void
+}
+
+define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) {
+; GFX8-LABEL: update_dppv2f32_test:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX10-LABEL: update_dppv2f32_test:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: update_dppv2f32_test:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep
+  %tmp0 = call <2 x float> @llvm.amdgcn.update.dpp.v2f32(<2 x float> %in1, <2 x float> %load, i32 1, i32 1, i32 1, i1 false) #1
+  store <2 x float> %tmp0, ptr addrspace(1) %gep
+  ret void
+}
+
+define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) {
+; GFX8-LABEL: update_dpp_p0_test:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX10-LABEL: update_dpp_p0_test:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: update_dpp_p0_test:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id
+  %load = load ptr, ptr addrspace(1) %gep
+  %tmp0 = call ptr @llvm.amdgcn.update.dpp.v2f32(ptr %in1, ptr %load, i32 1, i32 1, i32 1, i1 false) #1
+  store ptr %tmp0, ptr addrspace(1) %gep
+  ret void
+}
+
+define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) {
+; GFX8-LABEL: update_dpp_p3_test:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT:    ds_read_b32 v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    ds_write_b32 v0, v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX10-LABEL: update_dpp_p3_test:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-NEXT:    ds_read_b32 v1, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    ds_write_b32 v0, v2
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: update_dpp_p3_test:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-NEXT:    ds_load_b32 v1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT:    ds_store_b32 v0, v2
+; GFX11-NEXT:    s_endpgm
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %arg, i32 %id
+  %load = load ptr addrspace(3), ptr addrspace(3) %gep
+  %tmp0 = call ptr addrspace(3) @llvm.amdgcn.update.dpp.p3(ptr addrspace(3) %in1, ptr addrspace(3) %load, i32 1, i32 1, i32 1, i1 false) #1
+  store ptr addrspace(3) %tmp0, ptr addrspace(3) %gep
+  ret void
+}
+
+define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspace(5) %in1, ptr %in2) {
+; GFX8-LABEL: update_dpp_p5_test:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT:    s_mov_b32 s90, -1
+; GFX8-NEXT:    s_mov_b32 s91, 0xe80000
+; GFX8-NEXT:    s_add_u32 s88, s88, s3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_addc_u32 s89, s89, 0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT:    buffer_load_dword v1, v0, s[88:91], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
+; GFX8-NEXT:    s_endpgm
+;
+; GFX10-LABEL: update_dpp_p5_test:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GFX10-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31c16000
+; GFX10-NEXT:    s_add_u32 s4, s4, s3
+; GFX10-NEXT:    s_addc_u32 s5, s5, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-NEXT:    buffer_load_dword v1, v0, s[4:7], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    buffer_store_dword v2, v0, s[4:7], 0 offen
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: update_dpp_p5_test:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-NEXT:    scratch_load_b32 v1, v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX11-NEXT:    scratch_store_b32 v0, v2, off
+; GFX11-NEXT:    s_endpgm
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds ptr addrspace(5), ptr addrspace(5) %arg, i32 %id
+  %load = load ptr addrspace(5), ptr addrspace(5) %gep
+  %tmp0 = call ptr addrspace(5) @llvm.amdgcn.update.dpp.p5(ptr addrspace(5) %in1, ptr addrspace(5) %load, i32 1, i32 1, i32 1, i1 false) #1
+  store ptr addrspace(5) %tmp0, ptr addrspace(5) %gep
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #1
 declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
index 2e62d13f1e69..09882c446fc0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
@@ -4,9 +4,9 @@
 ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefixes=ALL,PACKED-TID %s
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=ALL,PACKED-TID %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel -mtriple=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -verify-machineinstrs -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=ALL,PACKED-TID %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefixes=ALL,PACKED-TID %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=ALL,PACKED-TID %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -verify-machineinstrs -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=ALL,PACKED-TID %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.workitem.id.y() #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.32.mir
index 30a589fe5719..a0711e6c779c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.32.mir
@@ -15,8 +15,8 @@ body: |
     ; CHECK: liveins: $sgpr0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:vcc(s1), [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), [[COPY]](s32)
+    ; CHECK-NEXT: [[INT:%[0-9]+]]:vcc(s1), [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), [[COPY]](s32)
     %0:_(s32) = COPY $sgpr0
-    %1:_(s1), %2:_(s32) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %0
+    %1:_(s1), %2:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %0
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.64.mir
index 0b62155da496..8a2cbd0eafd6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.else.64.mir
@@ -12,8 +12,8 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:vcc(s1), [[INT1:%[0-9]+]]:sgpr(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), [[COPY]](s64)
+    ; CHECK-NEXT: [[INT:%[0-9]+]]:vcc(s1), [[INT1:%[0-9]+]]:sgpr(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), [[COPY]](s64)
     %0:_(s64) = COPY $sgpr0_sgpr1
-    %1:_(s1), %2:_(s64) = G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %0
+    %1:_(s1), %2:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %0
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 4c9c34de7194..a86a3f6f279d 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -17008,33 +17008,27 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
 ;
 ; GFX8-LABEL: s_fabs_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0x7fff
-; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_fabs_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_fabs_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_fabs_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX11-NEXT:    ; return to shader part epilog
   %op = call bfloat @llvm.fabs.bf16(bfloat %a)
   %cast = bitcast bfloat %op to i16
@@ -17059,25 +17053,25 @@ define bfloat @v_fneg_bf16(bfloat %a) {
 ; GFX8-LABEL: v_fneg_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fneg_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fneg_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fneg bfloat %a
   ret bfloat %op
@@ -17089,49 +17083,41 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
 define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
 ; GCN-LABEL: s_fneg_bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    v_mul_f32_e64 v0, 1.0, s0
+; GCN-NEXT:    v_mul_f32_e64 v0, -1.0, s0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: s_fneg_bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    v_mul_f32_e64 v0, 1.0, s0
+; GFX7-NEXT:    v_mul_f32_e64 v0, -1.0, s0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_fneg_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
-; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_fneg_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
-; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_fneg_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_xor_b32_e64 v0, 0xffff8000, s0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_fneg_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_xor_b32_e64 v0, 0xffff8000, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX11-NEXT:    ; return to shader part epilog
   %op = fneg bfloat %a
   %cast = bitcast bfloat %op to i16
@@ -17166,25 +17152,25 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
 ; GFX8-LABEL: v_fneg_fabs_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, 0x8000, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fneg_fabs_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, 0x8000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fneg_fabs_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, 0x8000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_fabs_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, 0x8000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
   %op = fneg bfloat %fabs
@@ -17196,48 +17182,48 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
 ; GCN-LABEL: s_fneg_fabs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mul_f32_e64 v0, 1.0, s0
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    v_or_b32_e32 v0, 0x8000, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; GCN-NEXT:    s_bitset0_b32 s0, 31
+; GCN-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; GCN-NEXT:    s_xor_b32 s0, s0, 0x80000000
+; GCN-NEXT:    s_lshr_b32 s0, s0, 16
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: s_fneg_fabs_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_mul_f32_e64 v0, 1.0, s0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, 0x8000, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; GFX7-NEXT:    s_bitset0_b32 s0, 31
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; GFX7-NEXT:    s_xor_b32 s0, s0, 0x80000000
+; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_fneg_fabs_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
-; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    s_bitset1_b32 s0, 15
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_fneg_fabs_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
-; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_bitset1_b32 s0, 15
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_fneg_fabs_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_or_b32_e64 v0, 0xffff8000, s0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    s_bitset1_b32 s0, 15
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_fneg_fabs_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_or_b32_e64 v0, 0xffff8000, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_bitset1_b32 s0, 15
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX11-NEXT:    ; return to shader part epilog
   %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
   %op = fneg bfloat %fabs
@@ -27280,34 +27266,27 @@ define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
 ; GFX8-LABEL: v_copysign_bf16_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_bf16_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_bf16_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_bf16_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
   ret bfloat %op
@@ -27339,36 +27318,29 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
 ; GFX8-LABEL: v_copysign_bf16_s_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffff8000
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_movk_i32 s5, 0x7fff
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_bfi_b32 v0, s5, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_bf16_s_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff8000
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_movk_i32 s5, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_bfi_b32 v0, s5, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_bf16_s_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e64 v1, 0xffff8000, s4
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_bf16_s_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e64 v1, 0xffff8000, s0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
   ret bfloat %op
@@ -27400,36 +27372,29 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) {
 ; GFX8-LABEL: v_copysign_s_bf16_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_movk_i32 s5, 0x7fff
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_bfi_b32 v0, s5, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_s_bf16_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_movk_i32 s5, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_bfi_b32 v0, s5, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_s_bf16_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX10-NEXT:    v_and_b32_e64 v1, 0x7fff, s4
-; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, s4, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_s_bf16_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX11-NEXT:    v_and_b32_e64 v1, 0x7fff, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
   ret bfloat %op
@@ -27461,35 +27426,32 @@ define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
 ; GFX8-LABEL: v_copysign_bf16_f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_bf16_f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_bf16_f32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_bf16_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sign = fptrunc float %sign.f32 to bfloat
   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
@@ -27522,35 +27484,32 @@ define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
 ; GFX8-LABEL: v_copysign_bf16_f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_bf16_f64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_bf16_f64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_bf16_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sign = fptrunc double %sign.f64 to bfloat
   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
@@ -27583,34 +27542,27 @@ define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) {
 ; GFX8-LABEL: v_copysign_bf16_f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_bf16_f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_bf16_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_bf16_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sign = bitcast half %sign.f16 to bfloat
   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
@@ -27640,41 +27592,37 @@ define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign
 ;
 ; GFX8-LABEL: s_copysign_bf16_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8-NEXT:    v_and_b32_e32 v0, s1, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_bfi_b32 v0, s2, v0, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_copysign_bf16_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v0, s1, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_bfi_b32 v0, s2, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_copysign_bf16_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_and_b32_e64 v0, 0xffff8000, s1
-; GFX10-NEXT:    v_and_b32_e64 v1, 0x7fff, s0
-; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s1
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_bf16_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e64 v0, 0xffff8000, s1
-; GFX11-NEXT:    v_and_b32_e64 v1, 0x7fff, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
@@ -27709,46 +27657,39 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f
 ;
 ; GFX8-LABEL: s_copysign_bf16_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0x7fff
-; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX8-NEXT:    s_and_b32 s0, s1, 0x80000000
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 16, s1
+; GFX8-NEXT:    s_movk_i32 s1, 0x7fff
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    v_bfi_b32 v0, s1, v1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_copysign_bf16_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX9-NEXT:    s_and_b32 s0, s1, 0x80000000
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 16, s1
+; GFX9-NEXT:    s_movk_i32 s1, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_bfi_b32 v0, s1, v1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_copysign_bf16_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
-; GFX10-NEXT:    s_and_b32 s0, s1, 0x80000000
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 16, s1
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_bf16_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
-; GFX11-NEXT:    s_and_b32 s0, s1, 0x80000000
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX11-NEXT:    v_lshrrev_b32_e64 v0, 16, s1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %sign = fptrunc float %sign.f32 to bfloat
@@ -27782,46 +27723,39 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.
 ;
 ; GFX8-LABEL: s_copysign_bf16_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0x7fff
-; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX8-NEXT:    s_and_b32 s0, s2, 0x80000000
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, 16, s2
+; GFX8-NEXT:    s_movk_i32 s1, 0x7fff
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    v_bfi_b32 v0, s1, v1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_copysign_bf16_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX9-NEXT:    s_and_b32 s0, s2, 0x80000000
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 16, s2
+; GFX9-NEXT:    s_movk_i32 s1, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_bfi_b32 v0, s1, v1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_copysign_bf16_f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
-; GFX10-NEXT:    s_and_b32 s0, s2, 0x80000000
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 16, s2
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_bf16_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
-; GFX11-NEXT:    s_and_b32 s0, s2, 0x80000000
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX11-NEXT:    v_lshrrev_b32_e64 v0, 16, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %sign = fptrunc double %sign.f64 to bfloat
@@ -27855,41 +27789,37 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1
 ;
 ; GFX8-LABEL: s_copysign_bf16_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8-NEXT:    v_and_b32_e32 v0, s1, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_bfi_b32 v0, s2, v0, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_copysign_bf16_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v0, s1, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_bfi_b32 v0, s2, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_copysign_bf16_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_and_b32_e64 v0, 0xffff8000, s1
-; GFX10-NEXT:    v_and_b32_e64 v1, 0x7fff, s0
-; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s1
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_bf16_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e64 v0, 0xffff8000, s1
-; GFX11-NEXT:    v_and_b32_e64 v1, 0x7fff, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
@@ -33706,7 +33636,7 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -33715,7 +33645,7 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -33724,7 +33654,7 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX10-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -33733,7 +33663,7 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
@@ -33770,7 +33700,7 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -33779,7 +33709,7 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -33788,7 +33718,7 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
+; GFX10-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -33797,7 +33727,7 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
+; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/check-subtarget-features.ll b/llvm/test/CodeGen/AMDGPU/check-subtarget-features.ll
new file mode 100644
index 000000000000..c24693981104
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/check-subtarget-features.ll
@@ -0,0 +1,10 @@
+; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,-wavefrontsize64 < %s 2>&1 | FileCheck %s -check-prefix=ERR -implicit-check-not=error:
+; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,-wavefrontsize64 < %s 2>&1 | FileCheck %s -check-prefix=ERR -implicit-check-not=error:
+; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+wavefrontsize64 < %s 2>&1 | FileCheck %s -check-prefix=ERR -implicit-check-not=error:
+; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+wavefrontsize64 < %s 2>&1 | FileCheck %s -check-prefix=ERR -implicit-check-not=error:
+
+; ERR: error: {{.*}} in function f void (): must specify exactly one of wavefrontsize32 and wavefrontsize64
+
+define void @f() {
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/combine_vloads.ll b/llvm/test/CodeGen/AMDGPU/combine_vloads.ll
index 10b7d62e275d..42a9b80b134c 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_vloads.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_vloads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=r600 -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
 
 ;
 ; kernel void combine_vloads(global char8 addrspace(5)* src, global char8 addrspace(5)* result) {
diff --git a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
index 1c8725f52f7e..238f6ab39e83 100644
--- a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
+++ b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
@@ -1,6 +1,6 @@
-; RUN: llc --amdgpu-disable-structurizer -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s
-; RUN: llc --amdgpu-disable-structurizer -stop-after=dead-mi-elimination -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,DEADMI %s
-; RUN: llc --amdgpu-disable-structurizer -global-isel -stop-after=irtranslator -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s
+; RUN: llc -stop-after=dead-mi-elimination -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,DEADMI %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL
 
 ; CHECK-LABEL: name:            basic_call
 ;       CHECK:    [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 21aff62b9226..54adde38d6d2 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -4,9 +4,17 @@
 ; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
 
+declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
+declare <2 x i7> @llvm.ctlz.v2i7(<2 x i7>, i1) nounwind readnone
 declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
+declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) nounwind readnone
 
 declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
+declare i18 @llvm.ctlz.i18(i18, i1) nounwind readnone
+
+declare <2 x i16> @llvm.ctlz.v2i16(<2 x i16>, i1) nounwind readnone
+declare <3 x i16> @llvm.ctlz.v3i16(<3 x i16>, i1) nounwind readnone
+declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
 
 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
@@ -2158,3 +2166,468 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
   store i32 %sel, ptr addrspace(1) %out
   ret void
 }
+
+define i7 @v_ctlz_zero_undef_i7(i7 %val) {
+; SI-LABEL: v_ctlz_zero_undef_i7:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 25, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_i7:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; VI-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    v_add_u16_e32 v0, -9, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_i7:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i7:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 25, v0
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 true)
+  ret i7 %ctlz
+}
+
+define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind {
+; SI-LABEL: s_ctlz_zero_undef_i18:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_and_b32 s2, s2, 0x3ffff
+; SI-NEXT:    s_flbit_i32_b32 s2, s2
+; SI-NEXT:    s_add_i32 s4, s2, -14
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    s_bfe_u32 s4, s4, 0x20010
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: s_ctlz_zero_undef_i18:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_and_b32 s2, s2, 0x3ffff
+; VI-NEXT:    s_flbit_i32_b32 s2, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_add_i32 s2, s2, -14
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_add_u32 s0, s0, 2
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    s_bfe_u32 s2, s2, 0x20010
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    flat_store_byte v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; EG-LABEL: s_ctlz_zero_undef_i18:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 30, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT MSKOR T1.XW, T3.X
+; EG-NEXT:    MEM_RAT MSKOR T0.XW, T2.X
+; EG-NEXT:    CF_END
+; EG-NEXT:    ALU clause starting at 4:
+; EG-NEXT:     AND_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    262143(3.673406e-40), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT T0.W, PV.W,
+; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
+; EG-NEXT:    -14(nan), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T2.W, PV.W, literal.x,
+; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT:     LSHL T1.X, PV.W, PS,
+; EG-NEXT:     LSHL * T1.W, literal.x, PS,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     MOV T1.Y, 0.0,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T3.W, PV.W, literal.x,
+; EG-NEXT:     MOV * T4.W, literal.y,
+; EG-NEXT:    3(4.203895e-45), 2(2.802597e-45)
+; EG-NEXT:     BFE_UINT T0.W, T0.W, literal.x, PS,
+; EG-NEXT:     LSHL * T3.W, PV.W, literal.y,
+; EG-NEXT:    16(2.242078e-44), 3(4.203895e-45)
+; EG-NEXT:     LSHL T0.X, PV.W, PS,
+; EG-NEXT:     LSHL * T0.W, literal.x, PS,
+; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT:     MOV T0.Y, 0.0,
+; EG-NEXT:     MOV T1.Z, 0.0,
+; EG-NEXT:     MOV * T0.Z, 0.0,
+; EG-NEXT:     LSHR T2.X, T2.W, literal.x,
+; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i18:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_and_b32 s0, s4, 0x3ffff
+; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s0, s0
+; GFX9-GISEL-NEXT:    s_sub_i32 s0, s0, 14
+; GFX9-GISEL-NEXT:    s_and_b32 s0, s0, 0x3ffff
+; GFX9-GISEL-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    s_endpgm
+  %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) nounwind readnone
+  store i18 %ctlz, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define i18 @v_ctlz_zero_undef_i18(i18 %val) {
+; SI-LABEL: v_ctlz_zero_undef_i18:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, -14, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_i18:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -14, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_i18:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i18:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 14, v0
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true)
+  ret i18 %ctlz
+}
+
+define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v2i18:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0x3ffff, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, -14, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, -14, v1
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v2i18:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v1, 0x3ffff, v1
+; VI-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_ffbh_u32_e32 v1, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -14, v0
+; VI-NEXT:    v_add_u32_e32 v1, vcc, -14, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v2i18:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i18:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ffff, v1
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 14, v0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 14, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call <2 x i18> @llvm.ctlz.v2i18(<2 x i18> %val, i1 true)
+  ret <2 x i18> %ctlz
+}
+
+define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v2i16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, -16, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v2i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_ffbh_u32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0xfff00000, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v2i16:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 16, v1
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %val, i1 true)
+  ret <2 x i16> %ctlz
+}
+
+define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v3i16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_ffbh_u32_e32 v2, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT:    v_add_i32_e32 v3, vcc, -16, v2
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0xfff00000, v0
+; SI-NEXT:    v_or_b32_e32 v2, 0x100000, v2
+; SI-NEXT:    v_alignbit_b32 v1, v3, v0, 16
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v3i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_ffbh_u32_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v1, vcc, -16, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0xfff00000, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v3i16:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v3i16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v2, 16, v2
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 16, v1
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %val, i1 true)
+  ret <3 x i16> %ctlz
+}
+
+define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v4i16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_ffbh_u32_e32 v3, v3
+; SI-NEXT:    v_ffbh_u32_e32 v2, v2
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_add_i32_e32 v2, vcc, -16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0xfff00000, v0
+; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v4i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_ffbh_u32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    v_ffbh_u32_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v1, vcc, -16, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0xfff00000, v0
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 0xfff00000, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v4i16:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v2, 16, v2
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v3, 16, v3
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 16, v1
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %val, i1 true)
+  ret <4 x i16> %ctlz
+}
+
+define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v2i8:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0xffffe800, v0
+; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v2i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_add_u16_e32 v1, 0xe800, v1
+; VI-NEXT:    v_subrev_u16_e32 v0, 24, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v2i8:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i8:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 24, v0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 24, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %val, i1 true)
+  ret <2 x i8> %ctlz
+}
+
+define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v2i7:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0x7f, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 25, v0
+; SI-NEXT:    v_subrev_i32_e32 v1, vcc, 25, v1
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v2i7:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_and_b32_e32 v2, 0x7f007f, v0
+; VI-NEXT:    v_bfe_u32 v0, v0, 16, 7
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    v_add_u16_e32 v1, -9, v0
+; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v2
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    v_add_u16_e32 v0, -9, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v2i7:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i7:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0x7f, v1
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 25, v0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 25, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call <2 x i7> @llvm.ctlz.v2i7(<2 x i7> %val, i1 true)
+  ret <2 x i7> %ctlz
+}
diff --git a/llvm/test/CodeGen/AMDGPU/dead_bundle.mir b/llvm/test/CodeGen/AMDGPU/dead_bundle.mir
index dd9d6a1c788e..af656ea1c719 100644
--- a/llvm/test/CodeGen/AMDGPU/dead_bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/dead_bundle.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn--amdpal -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs=1 -start-before=greedy,0 -stop-after=virtregrewriter,0 -stress-regalloc=5 %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs=1 -start-before=greedy,0 -stop-after=virtregrewriter,0 -stress-regalloc=5 %s -o - | FileCheck %s
 
 # This test checks that dead bundles are handled correctly.
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 7c5bdd691fc4..1c093bf31ea7 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -1,6 +1,6 @@
 ; RUN: not llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=+promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s
 ; RUN: not llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=r600 -mtriple=r600-- -mcpu=cypress < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=r600-- -mcpu=cypress < %s 2>&1 | FileCheck %s
 target datalayout = "A5"
 
 ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index 1e5f4c08c7a0..0468175c5df5 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -2417,12 +2417,12 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 {
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -2455,12 +2455,12 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 {
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -2727,12 +2727,12 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -2765,12 +2765,12 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 {
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -3294,12 +3294,12 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, v6, v4, v4
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v6, v5, v4
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -3334,12 +3334,12 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z)
 ; GFX7-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX7-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
 ; GFX7-NEXT:    v_fma_f32 v4, v6, v4, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v6, v5, v4
 ; GFX7-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX7-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -3868,12 +3868,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 {
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -3906,12 +3906,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 {
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -4434,12 +4434,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, v6, v4, v4
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v6, v5, v4
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -4474,12 +4474,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y
 ; GFX7-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX7-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
 ; GFX7-NEXT:    v_fma_f32 v4, v6, v4, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v6, v5, v4
 ; GFX7-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX7-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -5010,12 +5010,12 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, -v0, v1, -v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -5048,12 +5048,12 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, -v0, v1, -v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -5569,12 +5569,12 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, -v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -5607,12 +5607,12 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, -v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -6113,12 +6113,12 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 {
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, s6, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, v4, v2, v2
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v4, v3, v2
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, v5, v2, v4
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -6153,12 +6153,12 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 {
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX7-NEXT:    v_div_scale_f32 v3, vcc, v0, s6, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
 ; GFX7-NEXT:    v_fma_f32 v2, v4, v2, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v4, v3, v2
 ; GFX7-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v4, v5, v2, v4
 ; GFX7-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -6619,12 +6619,12 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 {
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, vcc, s6, v0, s6
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, v4, v2, v2
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v4, v3, v2
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, v5, v2, v4
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -6659,12 +6659,12 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 {
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX7-NEXT:    v_div_scale_f32 v3, vcc, s6, v0, s6
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
 ; GFX7-NEXT:    v_fma_f32 v2, v4, v2, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v4, v3, v2
 ; GFX7-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v4, v5, v2, v4
 ; GFX7-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -7168,12 +7168,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -7206,12 +7206,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -7721,12 +7721,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX6-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -7759,12 +7759,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
 ; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index 1633d21c41d5..e4ffedd686ac 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -7,17 +7,17 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
 
-; RUN: llc < %s -mtriple=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,HSA-VI-NOXNACK,GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-XNACK,HSA-VI-XNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,HSA-VI-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-XNACK,HSA-VI-XNACK,GCN %s
 
-; RUN: llc < %s -mtriple=amdgcn -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX9-ARCH-FLAT,GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX9-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX9-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX9-ARCH-FLAT,GCN %s
 
-; RUN: llc < %s -mtriple=amdgcn -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX10-ARCH-FLAT,GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX10-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX10-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX10-ARCH-FLAT,GCN %s
 
 ; GCN-LABEL: {{^}}no_vcc_no_flat:
 
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 961273468e75..5bd527149572 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -287,8 +287,8 @@ define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_offset_scalar(ptr inreg %out, i3
   ret i32 %result
 }
 
-define void @flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -298,7 +298,7 @@ define void @flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -308,7 +308,7 @@ define void @flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
@@ -316,12 +316,12 @@ define void @flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -331,7 +331,7 @@ define i32 @flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -341,7 +341,7 @@ define i32 @flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 offset:16 glc
@@ -349,7 +349,7 @@ define i32 @flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %result = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -637,8 +637,8 @@ define amdgpu_gfx float @flat_atomic_xchg_f32_ret_offset_scalar(ptr inreg %out,
   ret float %result
 }
 
-define void @flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, float %in) {
-; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory(ptr %out, float %in) {
+; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -648,7 +648,7 @@ define void @flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -658,7 +658,7 @@ define void @flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
@@ -666,12 +666,12 @@ define void @flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %out, i64 4
-  %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define float @flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, float %in) {
-; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access:
+define float @flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory(ptr %out, float %in) {
+; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -681,7 +681,7 @@ define float @flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -691,7 +691,7 @@ define float @flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 offset:16 glc
@@ -699,7 +699,7 @@ define float @flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %out, i64 4
-  %result = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
   ret float %result
 }
 
@@ -987,8 +987,8 @@ define amdgpu_gfx i32 @flat_atomic_add_i32_ret_offset_scalar(ptr inreg %out, i32
   ret i32 %result
 }
 
-define void @flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -998,7 +998,7 @@ define void @flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -1008,7 +1008,7 @@ define void @flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_add v[0:1], v2 offset:16
@@ -1016,12 +1016,12 @@ define void @flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -1031,7 +1031,7 @@ define i32 @flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -1041,7 +1041,7 @@ define i32 @flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_add v0, v[0:1], v2 offset:16 glc
@@ -1049,7 +1049,7 @@ define i32 @flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %result = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -1337,8 +1337,8 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32
   ret i32 %result
 }
 
-define void @flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -1348,7 +1348,7 @@ define void @flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -1358,7 +1358,7 @@ define void @flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_sub v[0:1], v2 offset:16
@@ -1366,12 +1366,12 @@ define void @flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -1381,7 +1381,7 @@ define i32 @flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -1391,7 +1391,7 @@ define i32 @flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_sub v0, v[0:1], v2 offset:16 glc
@@ -1399,7 +1399,7 @@ define i32 @flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %result = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -1687,8 +1687,8 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32
   ret i32 %result
 }
 
-define void @flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -1698,7 +1698,7 @@ define void @flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -1708,7 +1708,7 @@ define void @flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_and v[0:1], v2 offset:16
@@ -1716,12 +1716,12 @@ define void @flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -1731,7 +1731,7 @@ define i32 @flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -1741,7 +1741,7 @@ define i32 @flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_and v0, v[0:1], v2 offset:16 glc
@@ -1749,7 +1749,7 @@ define i32 @flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %result = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -2373,8 +2373,8 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3
   ret i32 %result
 }
 
-define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -2398,7 +2398,7 @@ define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -2422,7 +2422,7 @@ define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
@@ -2444,12 +2444,12 @@ define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
@@ -2473,7 +2473,7 @@ define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
@@ -2497,7 +2497,7 @@ define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -2520,7 +2520,7 @@ define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN3-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %result = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -2808,8 +2808,8 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32
   ret i32 %result
 }
 
-define void @flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -2819,7 +2819,7 @@ define void @flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -2829,7 +2829,7 @@ define void @flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_or v[0:1], v2 offset:16
@@ -2837,12 +2837,12 @@ define void @flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -2852,7 +2852,7 @@ define i32 @flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %o
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -2862,7 +2862,7 @@ define i32 @flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %o
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_or v0, v[0:1], v2 offset:16 glc
@@ -2870,7 +2870,7 @@ define i32 @flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %o
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %result = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -3158,8 +3158,8 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32
   ret i32 %result
 }
 
-define void @flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_xor_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -3169,7 +3169,7 @@ define void @flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out,
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -3179,7 +3179,7 @@ define void @flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out,
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_xor v[0:1], v2 offset:16
@@ -3187,12 +3187,12 @@ define void @flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out,
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -3202,7 +3202,7 @@ define i32 @flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -3212,7 +3212,7 @@ define i32 @flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_xor v0, v[0:1], v2 offset:16 glc
@@ -3220,7 +3220,7 @@ define i32 @flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %result = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -4222,8 +4222,8 @@ entry:
   ret void
 }
 
-define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -4246,7 +4246,7 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out,
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -4269,7 +4269,7 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out,
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
@@ -4290,12 +4290,12 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out,
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
@@ -4318,7 +4318,7 @@ define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
@@ -4341,7 +4341,7 @@ define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -4363,7 +4363,7 @@ define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN3-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %result = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -5275,8 +5275,8 @@ entry:
   ret void
 }
 
-define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -5299,7 +5299,7 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -5322,7 +5322,7 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
@@ -5343,12 +5343,12 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
@@ -5371,7 +5371,7 @@ define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
@@ -5394,7 +5394,7 @@ define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -5416,7 +5416,7 @@ define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN3-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %result = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -6016,8 +6016,8 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3
   ret i32 %result
 }
 
-define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -6040,7 +6040,7 @@ define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -6063,7 +6063,7 @@ define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
@@ -6084,12 +6084,12 @@ define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
@@ -6112,7 +6112,7 @@ define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
@@ -6135,7 +6135,7 @@ define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -6157,7 +6157,7 @@ define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN3-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %result = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -7146,8 +7146,8 @@ entry:
   ret void
 }
 
-define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -7170,7 +7170,7 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out,
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -7193,7 +7193,7 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out,
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
@@ -7214,12 +7214,12 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out,
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
@@ -7242,7 +7242,7 @@ define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
@@ -7265,7 +7265,7 @@ define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -7287,7 +7287,7 @@ define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN3-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %result = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -7575,8 +7575,8 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou
   ret i32 %result
 }
 
-define void @flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -7586,7 +7586,7 @@ define void @flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -7596,7 +7596,7 @@ define void @flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_inc v[0:1], v2 offset:16
@@ -7604,12 +7604,12 @@ define void @flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -7619,7 +7619,7 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -7629,7 +7629,7 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_inc v0, v[0:1], v2 offset:16 glc
@@ -7637,7 +7637,7 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -7925,8 +7925,8 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
   ret i32 %result
 }
 
-define void @flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -7936,7 +7936,7 @@ define void @flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -7946,7 +7946,7 @@ define void @flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_dec v[0:1], v2 offset:16
@@ -7954,12 +7954,12 @@ define void @flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
-; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
@@ -7969,7 +7969,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -7979,7 +7979,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_dec v0, v[0:1], v2 offset:16 glc
@@ -7987,7 +7987,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i64 4
-  %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index 4bf3e2fdd051..d812b4b7d86e 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -299,8 +299,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
   ret i64 %result
 }
 
-define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -310,7 +310,7 @@ define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -320,7 +320,7 @@ define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
@@ -328,12 +328,12 @@ define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -343,7 +343,7 @@ define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -353,7 +353,7 @@ define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
@@ -361,7 +361,7 @@ define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -661,8 +661,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
   ret double %result
 }
 
-define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, double %in) {
-; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out, double %in) {
+; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -672,7 +672,7 @@ define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -682,7 +682,7 @@ define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
@@ -690,12 +690,12 @@ define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr %out, i64 4
-  %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, double %in) {
-; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access:
+define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out, double %in) {
+; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -705,7 +705,7 @@ define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access(p
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -715,7 +715,7 @@ define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access(p
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
@@ -723,7 +723,7 @@ define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access(p
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr %out, i64 4
-  %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0
   ret double %result
 }
 
@@ -1023,8 +1023,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
   ret i64 %result
 }
 
-define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -1034,7 +1034,7 @@ define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -1044,7 +1044,7 @@ define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3] offset:32
@@ -1052,12 +1052,12 @@ define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -1067,7 +1067,7 @@ define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -1077,7 +1077,7 @@ define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
@@ -1085,7 +1085,7 @@ define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -1385,8 +1385,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
   ret i64 %result
 }
 
-define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -1396,7 +1396,7 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -1406,7 +1406,7 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3] offset:32
@@ -1414,12 +1414,12 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -1429,7 +1429,7 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -1439,7 +1439,7 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
@@ -1447,7 +1447,7 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -1747,8 +1747,8 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
   ret i64 %result
 }
 
-define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -1758,7 +1758,7 @@ define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -1768,7 +1768,7 @@ define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3] offset:32
@@ -1776,12 +1776,12 @@ define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -1791,7 +1791,7 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -1801,7 +1801,7 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
@@ -1809,7 +1809,7 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -2581,8 +2581,8 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
   ret i64 %result
 }
 
-define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
@@ -2612,7 +2612,7 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
@@ -2642,7 +2642,7 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1] offset:32
@@ -2667,12 +2667,12 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
@@ -2702,7 +2702,7 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
@@ -2732,7 +2732,7 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1] offset:32
@@ -2759,7 +2759,7 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN3-NEXT:    v_mov_b32_e32 v1, v5
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -3059,8 +3059,8 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
   ret i64 %result
 }
 
-define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -3070,7 +3070,7 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -3080,7 +3080,7 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3] offset:32
@@ -3088,12 +3088,12 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -3103,7 +3103,7 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %o
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -3113,7 +3113,7 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %o
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
@@ -3121,7 +3121,7 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %o
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -3421,8 +3421,8 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
   ret i64 %result
 }
 
-define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -3432,7 +3432,7 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -3442,7 +3442,7 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3] offset:32
@@ -3450,12 +3450,12 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -3465,7 +3465,7 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -3475,7 +3475,7 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
@@ -3483,7 +3483,7 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -4693,8 +4693,8 @@ entry:
   ret void
 }
 
-define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
@@ -4723,7 +4723,7 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
@@ -4752,7 +4752,7 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1] offset:32
@@ -4776,12 +4776,12 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
@@ -4810,7 +4810,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
@@ -4839,7 +4839,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1] offset:32
@@ -4865,7 +4865,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN3-NEXT:    v_mov_b32_e32 v1, v5
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -5973,8 +5973,8 @@ entry:
   ret void
 }
 
-define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
@@ -6003,7 +6003,7 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
@@ -6032,7 +6032,7 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1] offset:32
@@ -6056,12 +6056,12 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
@@ -6090,7 +6090,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
@@ -6119,7 +6119,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1] offset:32
@@ -6145,7 +6145,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN3-NEXT:    v_mov_b32_e32 v1, v5
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -6917,8 +6917,8 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
   ret i64 %result
 }
 
-define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
@@ -6947,7 +6947,7 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
@@ -6976,7 +6976,7 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1] offset:32
@@ -7000,12 +7000,12 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
@@ -7034,7 +7034,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
@@ -7063,7 +7063,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1] offset:32
@@ -7089,7 +7089,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GCN3-NEXT:    v_mov_b32_e32 v1, v5
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -8292,8 +8292,8 @@ entry:
   ret void
 }
 
-define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
@@ -8322,7 +8322,7 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
@@ -8351,7 +8351,7 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1] offset:32
@@ -8375,12 +8375,12 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access(pt
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
@@ -8409,7 +8409,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
@@ -8438,7 +8438,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1] offset:32
@@ -8464,7 +8464,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %
 ; GCN3-NEXT:    v_mov_b32_e32 v1, v5
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -8764,8 +8764,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
   ret i64 %result
 }
 
-define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -8775,7 +8775,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_acc
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -8785,7 +8785,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_acc
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3] offset:32
@@ -8793,12 +8793,12 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_acc
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -8808,7 +8808,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -8818,7 +8818,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
@@ -8826,7 +8826,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -9126,8 +9126,8 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
   ret i64 %result
 }
 
-define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -9137,7 +9137,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_acc
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -9147,7 +9147,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_acc
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3] offset:32
@@ -9155,12 +9155,12 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_acc
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
-; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1:       ; %bb.0:
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
@@ -9170,7 +9170,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2:       ; %bb.0:
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -9180,7 +9180,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3:       ; %bb.0:
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
@@ -9188,7 +9188,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index dd685a6169d8..87ac95a1cd73 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -148,23 +148,35 @@ define amdgpu_ps <2 x half> @test_fmaximum_v2f16_ss(<2 x half> inreg %a, <2 x ha
 }
 
 define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
-; GCN-LABEL: test_fmaximum_v3f16_vv:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_pk_maximum_f16 v0, v0, v2
-; GCN-NEXT:    v_maximum_f16 v1, v1, v3
-; GCN-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: test_fmaximum_v3f16_vv:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-SDAG-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: test_fmaximum_v3f16_vv:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_maximum_f16 v1, v1, v3
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
   ret <3 x half> %val
 }
 
 define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
-; GCN-LABEL: test_fmaximum_v3f16_ss:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_pk_maximum_f16 v0, s0, s2
-; GCN-NEXT:    s_maximum_f16 s0, s1, s3
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT:    v_mov_b32_e32 v1, s0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: test_fmaximum_v3f16_ss:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_pk_maximum_f16 v0, s0, s2
+; GFX12-SDAG-NEXT:    v_pk_maximum_f16 v1, s1, s3
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: test_fmaximum_v3f16_ss:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_pk_maximum_f16 v0, s0, s2
+; GFX12-GISEL-NEXT:    s_maximum_f16 s0, s1, s3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
   ret <3 x half> %val
 }
@@ -306,6 +318,3 @@ declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
 declare double @llvm.maximum.f64(double, double)
 declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
 declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX12-GISEL: {{.*}}
-; GFX12-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index 2b3cc4fd7385..45f6bff10f45 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -148,23 +148,35 @@ define amdgpu_ps <2 x half> @test_fminimum_v2f16_ss(<2 x half> inreg %a, <2 x ha
 }
 
 define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
-; GCN-LABEL: test_fminimum_v3f16_vv:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_pk_minimum_f16 v0, v0, v2
-; GCN-NEXT:    v_minimum_f16 v1, v1, v3
-; GCN-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: test_fminimum_v3f16_vv:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: test_fminimum_v3f16_vv:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_minimum_f16 v1, v1, v3
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %val = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
   ret <3 x half> %val
 }
 
 define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
-; GCN-LABEL: test_fminimum_v3f16_ss:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_pk_minimum_f16 v0, s0, s2
-; GCN-NEXT:    s_minimum_f16 s0, s1, s3
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT:    v_mov_b32_e32 v1, s0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: test_fminimum_v3f16_ss:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v0, s0, s2
+; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v1, s1, s3
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: test_fminimum_v3f16_ss:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_pk_minimum_f16 v0, s0, s2
+; GFX12-GISEL-NEXT:    s_minimum_f16 s0, s1, s3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %val = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
   ret <3 x half> %val
 }
@@ -306,6 +318,3 @@ declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
 declare double @llvm.minimum.f64(double, double)
 declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
 declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX12-GISEL: {{.*}}
-; GFX12-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir b/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir
index a32b3d0f1e6b..e94546fd5e8a 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir
@@ -465,7 +465,6 @@ body: |
   ; GFX90A-NEXT: bb.2:
   ; GFX90A-NEXT:   S_ENDPGM 0
   bb.0:
-    ; Tests that tryOptimizeAGPRPhis kicks in for GFX908.
     liveins: $sgpr0, $scc
     successors: %bb.1
 
@@ -715,3 +714,85 @@ body: |
   bb.3:
     S_ENDPGM 0
 ...
+
+---
+name:            skip_optimize_agpr_phi_without_subreg_use
+tracksRegLiveness: true
+body:             |
+  ; GFX908-LABEL: name: skip_optimize_agpr_phi_without_subreg_use
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT:   liveins: $scc
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX908-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; GFX908-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX908-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX908-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX908-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX908-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub3
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX908-NEXT:   liveins: $scc
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[PHI:%[0-9]+]]:areg_128_align2 = PHI [[REG_SEQUENCE]], %bb.0, %7, %bb.1
+  ; GFX908-NEXT:   [[V_MFMA_F32_16X16X4F32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X4F32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], [[PHI]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_MFMA_F32_16X16X4F32_e64_]], implicit $exec
+  ; GFX908-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.2:
+  ; GFX908-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX90A-LABEL: name: skip_optimize_agpr_phi_without_subreg_use
+  ; GFX90A: bb.0:
+  ; GFX90A-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX90A-NEXT:   liveins: $scc
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX90A-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; GFX90A-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX90A-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX90A-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX90A-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.1:
+  ; GFX90A-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX90A-NEXT:   liveins: $scc
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[PHI:%[0-9]+]]:areg_128_align2 = PHI [[REG_SEQUENCE]], %bb.0, %7, %bb.1
+  ; GFX90A-NEXT:   [[V_MFMA_F32_16X16X4F32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X4F32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], [[PHI]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_MFMA_F32_16X16X4F32_e64_]], implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.2:
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $scc
+    successors: %bb.1
+
+    %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1:sgpr_32 = S_MOV_B32 0
+    %2:sgpr_128 = REG_SEQUENCE %1, %subreg.sub0, %1, %subreg.sub1, %1, %subreg.sub2, %1, %subreg.sub3
+    %3:vreg_128 = COPY %2
+    %4:sreg_64 = S_MOV_B64 0
+    %5:areg_128_align2 = COPY %3, implicit $exec
+
+  bb.1:
+    liveins: $scc
+    successors: %bb.1, %bb.2
+
+    %9:areg_128_align2 = PHI %5, %bb.0, %10, %bb.1
+    %11:areg_128_align2 = V_MFMA_F32_16X16X4F32_e64 %0:vgpr_32, %0:vgpr_32, %9:areg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %12:vgpr_32 = COPY %11.sub3
+    %13:vgpr_32 = COPY %11.sub2
+    %14:vgpr_32 = COPY %11.sub1
+    %15:vgpr_32 = COPY %11.sub0
+    %10:areg_128_align2 = COPY %11, implicit $exec
+    S_CBRANCH_SCC1 %bb.1, implicit $scc
+
+  bb.2:
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index 4ec3ac25b2f1..516c92f1640e 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -353,8 +353,8 @@ define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1)
   ret i32 %result
 }
 
-define void @global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -367,7 +367,7 @@ define void @global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -377,7 +377,7 @@ define void @global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off offset:16
@@ -385,12 +385,12 @@ define void @global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -404,7 +404,7 @@ define i32 @global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -414,7 +414,7 @@ define i32 @global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off offset:16 glc
@@ -422,7 +422,7 @@ define i32 @global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -776,8 +776,8 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace(
   ret float %result
 }
 
-define void @global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, float %in) {
-; SI-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, float %in) {
+; SI-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -790,7 +790,7 @@ define void @global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -800,7 +800,7 @@ define void @global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off offset:16
@@ -808,12 +808,12 @@ define void @global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define float @global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, float %in) {
-; SI-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access:
+define float @global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, float %in) {
+; SI-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -827,7 +827,7 @@ define float @global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access(
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -837,7 +837,7 @@ define float @global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access(
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off offset:16 glc
@@ -845,7 +845,7 @@ define float @global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access(
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
   ret float %result
 }
 
@@ -1199,8 +1199,8 @@ define amdgpu_gfx i32 @global_atomic_add_i32_ret_offset_scalar(ptr addrspace(1)
   ret i32 %result
 }
 
-define void @global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -1213,7 +1213,7 @@ define void @global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -1223,7 +1223,7 @@ define void @global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add v[0:1], v2, off offset:16
@@ -1231,12 +1231,12 @@ define void @global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -1250,7 +1250,7 @@ define i32 @global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -1260,7 +1260,7 @@ define i32 @global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add v0, v[0:1], v2, off offset:16 glc
@@ -1268,7 +1268,7 @@ define i32 @global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -1659,8 +1659,8 @@ define i32 @global_atomic_sub_0_i32_ret(ptr addrspace(1) %ptr) {
   ret i32 %result
 }
 
-define void @global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -1673,7 +1673,7 @@ define void @global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -1683,7 +1683,7 @@ define void @global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub v[0:1], v2, off offset:16
@@ -1691,12 +1691,12 @@ define void @global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -1710,7 +1710,7 @@ define i32 @global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -1720,7 +1720,7 @@ define i32 @global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub v0, v[0:1], v2, off offset:16 glc
@@ -1728,7 +1728,7 @@ define i32 @global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -2082,8 +2082,8 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1)
   ret i32 %result
 }
 
-define void @global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -2096,7 +2096,7 @@ define void @global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -2106,7 +2106,7 @@ define void @global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and v[0:1], v2, off offset:16
@@ -2114,12 +2114,12 @@ define void @global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -2133,7 +2133,7 @@ define i32 @global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -2143,7 +2143,7 @@ define i32 @global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and v0, v[0:1], v2, off offset:16 glc
@@ -2151,7 +2151,7 @@ define i32 @global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -2854,8 +2854,8 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1)
   ret i32 %result
 }
 
-define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -2885,7 +2885,7 @@ define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -2909,7 +2909,7 @@ define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
@@ -2931,12 +2931,12 @@ define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -2967,7 +2967,7 @@ define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
@@ -2991,7 +2991,7 @@ define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
@@ -3014,7 +3014,7 @@ define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -3405,8 +3405,8 @@ define i32 @global_atomic_or_0_i32_ret(ptr addrspace(1) %ptr) {
   ret i32 %result
 }
 
-define void @global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -3419,7 +3419,7 @@ define void @global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access(p
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -3429,7 +3429,7 @@ define void @global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access(p
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or v[0:1], v2, off offset:16
@@ -3437,12 +3437,12 @@ define void @global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access(p
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -3456,7 +3456,7 @@ define i32 @global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -3466,7 +3466,7 @@ define i32 @global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or v0, v[0:1], v2, off offset:16 glc
@@ -3474,7 +3474,7 @@ define i32 @global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -3865,8 +3865,8 @@ define i32 @global_atomic_xor_0_i32_ret(ptr addrspace(1) %ptr) {
   ret i32 %result
 }
 
-define void @global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -3879,7 +3879,7 @@ define void @global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -3889,7 +3889,7 @@ define void @global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor v[0:1], v2, off offset:16
@@ -3897,12 +3897,12 @@ define void @global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -3916,7 +3916,7 @@ define i32 @global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -3926,7 +3926,7 @@ define i32 @global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor v0, v[0:1], v2, off offset:16 glc
@@ -3934,7 +3934,7 @@ define i32 @global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -5033,8 +5033,8 @@ entry:
   ret void
 }
 
-define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -5063,7 +5063,7 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -5086,7 +5086,7 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
@@ -5107,12 +5107,12 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -5142,7 +5142,7 @@ define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
@@ -5165,7 +5165,7 @@ define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
@@ -5187,7 +5187,7 @@ define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -6191,8 +6191,8 @@ entry:
   ret void
 }
 
-define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -6221,7 +6221,7 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -6244,7 +6244,7 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
@@ -6265,12 +6265,12 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -6300,7 +6300,7 @@ define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
@@ -6323,7 +6323,7 @@ define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
@@ -6345,7 +6345,7 @@ define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -7024,8 +7024,8 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1)
   ret i32 %result
 }
 
-define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -7054,7 +7054,7 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -7077,7 +7077,7 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
@@ -7098,12 +7098,12 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -7133,7 +7133,7 @@ define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
@@ -7156,7 +7156,7 @@ define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
@@ -7178,7 +7178,7 @@ define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access(pt
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -8264,8 +8264,8 @@ entry:
   ret void
 }
 
-define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -8294,7 +8294,7 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -8317,7 +8317,7 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
@@ -8338,12 +8338,12 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access(
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -8373,7 +8373,7 @@ define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
@@ -8396,7 +8396,7 @@ define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
@@ -8418,7 +8418,7 @@ define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -8772,8 +8772,8 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa
   ret i32 %result
 }
 
-define void @global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -8786,7 +8786,7 @@ define void @global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_a
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -8796,7 +8796,7 @@ define void @global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_a
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc v[0:1], v2, off offset:16
@@ -8804,12 +8804,12 @@ define void @global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_a
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -8823,7 +8823,7 @@ define i32 @global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_acce
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -8833,7 +8833,7 @@ define i32 @global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_acce
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc v0, v[0:1], v2, off offset:16 glc
@@ -8841,7 +8841,7 @@ define i32 @global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_acce
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
@@ -9195,8 +9195,8 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa
   ret i32 %result
 }
 
-define void @global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -9209,7 +9209,7 @@ define void @global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_a
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -9219,7 +9219,7 @@ define void @global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_a
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec v[0:1], v2, off offset:16
@@ -9227,12 +9227,12 @@ define void @global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_a
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i32 @global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+define i32 @global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -9246,7 +9246,7 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_acce
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -9256,7 +9256,7 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_acce
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec v0, v[0:1], v2, off offset:16 glc
@@ -9264,7 +9264,7 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_acce
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %result
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index 41a883302e8f..cafd35afea6e 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -367,8 +367,8 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1)
   ret i64 %result
 }
 
-define void @global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -381,7 +381,7 @@ define void @global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -391,7 +391,7 @@ define void @global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[2:3], off offset:32
@@ -399,12 +399,12 @@ define void @global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -419,7 +419,7 @@ define i64 @global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access(pt
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -429,7 +429,7 @@ define i64 @global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access(pt
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
@@ -437,7 +437,7 @@ define i64 @global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access(pt
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -805,8 +805,8 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace
   ret double %result
 }
 
-define void @global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, double %in) {
-; SI-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, double %in) {
+; SI-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -819,7 +819,7 @@ define void @global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -829,7 +829,7 @@ define void @global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[2:3], off offset:16
@@ -837,12 +837,12 @@ define void @global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define double @global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, double %in) {
-; SI-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access:
+define double @global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, double %in) {
+; SI-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -857,7 +857,7 @@ define double @global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -867,7 +867,7 @@ define double @global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:16 glc
@@ -875,7 +875,7 @@ define double @global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst, !amdgpu.no.remote.memory !0
   ret double %result
 }
 
@@ -1243,8 +1243,8 @@ define amdgpu_gfx i64 @global_atomic_add_i64_ret_offset_scalar(ptr addrspace(1)
   ret i64 %result
 }
 
-define void @global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -1257,7 +1257,7 @@ define void @global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -1267,7 +1267,7 @@ define void @global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v[2:3], off offset:32
@@ -1275,12 +1275,12 @@ define void @global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -1295,7 +1295,7 @@ define i64 @global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -1305,7 +1305,7 @@ define i64 @global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
@@ -1313,7 +1313,7 @@ define i64 @global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -1681,8 +1681,8 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1)
   ret i64 %result
 }
 
-define void @global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -1695,7 +1695,7 @@ define void @global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -1705,7 +1705,7 @@ define void @global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v[2:3], off offset:32
@@ -1713,12 +1713,12 @@ define void @global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -1733,7 +1733,7 @@ define i64 @global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -1743,7 +1743,7 @@ define i64 @global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
@@ -1751,7 +1751,7 @@ define i64 @global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -2119,8 +2119,8 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1)
   ret i64 %result
 }
 
-define void @global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -2133,7 +2133,7 @@ define void @global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -2143,7 +2143,7 @@ define void @global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v[2:3], off offset:32
@@ -2151,12 +2151,12 @@ define void @global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -2171,7 +2171,7 @@ define i64 @global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -2181,7 +2181,7 @@ define i64 @global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
@@ -2189,7 +2189,7 @@ define i64 @global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -2995,8 +2995,8 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1)
   ret i64 %result
 }
 
-define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -3031,7 +3031,7 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -3058,7 +3058,7 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
@@ -3083,12 +3083,12 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v6, v3
@@ -3127,7 +3127,7 @@ define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access(pt
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
@@ -3154,7 +3154,7 @@ define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access(pt
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
@@ -3181,7 +3181,7 @@ define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access(pt
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -3549,8 +3549,8 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i
   ret i64 %result
 }
 
-define void @global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -3563,7 +3563,7 @@ define void @global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -3573,7 +3573,7 @@ define void @global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v[2:3], off offset:32
@@ -3581,12 +3581,12 @@ define void @global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access(p
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -3601,7 +3601,7 @@ define i64 @global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -3611,7 +3611,7 @@ define i64 @global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
@@ -3619,7 +3619,7 @@ define i64 @global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -3987,8 +3987,8 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1)
   ret i64 %result
 }
 
-define void @global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -4001,7 +4001,7 @@ define void @global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -4011,7 +4011,7 @@ define void @global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v[2:3], off offset:32
@@ -4019,12 +4019,12 @@ define void @global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -4039,7 +4039,7 @@ define i64 @global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -4049,7 +4049,7 @@ define i64 @global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
@@ -4057,7 +4057,7 @@ define i64 @global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -5339,8 +5339,8 @@ entry:
   ret void
 }
 
-define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -5374,7 +5374,7 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -5400,7 +5400,7 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
@@ -5424,12 +5424,12 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
@@ -5467,7 +5467,7 @@ define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
@@ -5493,7 +5493,7 @@ define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
@@ -5519,7 +5519,7 @@ define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -6689,8 +6689,8 @@ entry:
   ret void
 }
 
-define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -6724,7 +6724,7 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -6750,7 +6750,7 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
@@ -6774,12 +6774,12 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
@@ -6817,7 +6817,7 @@ define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access(pt
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
@@ -6843,7 +6843,7 @@ define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access(pt
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
@@ -6869,7 +6869,7 @@ define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access(pt
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -7675,8 +7675,8 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1)
   ret i64 %result
 }
 
-define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -7710,7 +7710,7 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -7736,7 +7736,7 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
@@ -7760,12 +7760,12 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
@@ -7803,7 +7803,7 @@ define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access(pt
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
@@ -7829,7 +7829,7 @@ define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access(pt
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
@@ -7855,7 +7855,7 @@ define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access(pt
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -9126,8 +9126,8 @@ entry:
   ret void
 }
 
-define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -9161,7 +9161,7 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -9187,7 +9187,7 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
@@ -9211,12 +9211,12 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access(
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
@@ -9254,7 +9254,7 @@ define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
@@ -9280,7 +9280,7 @@ define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
@@ -9306,7 +9306,7 @@ define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access(ptr
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -9674,8 +9674,8 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa
   ret i64 %result
 }
 
-define void @global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -9688,7 +9688,7 @@ define void @global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_a
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -9698,7 +9698,7 @@ define void @global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_a
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[2:3], off offset:32
@@ -9706,12 +9706,12 @@ define void @global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_a
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -9726,7 +9726,7 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_acce
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -9736,7 +9736,7 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_acce
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
@@ -9744,7 +9744,7 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_acce
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
@@ -10112,8 +10112,8 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa
   ret i64 %result
 }
 
-define void @global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access:
+define void @global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -10126,7 +10126,7 @@ define void @global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_a
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -10136,7 +10136,7 @@ define void @global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_a
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v[2:3], off offset:32
@@ -10144,12 +10144,12 @@ define void @global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_a
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define i64 @global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) {
-; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access:
+define i64 @global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -10164,7 +10164,7 @@ define i64 @global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_acce
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access:
+; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
@@ -10174,7 +10174,7 @@ define i64 @global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_acce
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
@@ -10182,7 +10182,7 @@ define i64 @global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_acce
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %result
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 794c87b88831..d7773f746c6a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -10986,8 +10986,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) {
-; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
@@ -11023,7 +11023,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX7LESS-NEXT:  .LBB18_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
@@ -11055,7 +11055,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX9-NEXT:  .LBB18_3:
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
@@ -11087,7 +11087,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1064-NEXT:  .LBB18_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
-; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s2, 0
@@ -11118,7 +11118,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1032-NEXT:  .LBB18_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
-; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
@@ -11154,7 +11154,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:  .LBB18_3:
 ; GFX1164-NEXT:    s_endpgm
 ;
-; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s2, 0
@@ -11188,7 +11188,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1132-NEXT:  .LBB18_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
@@ -11220,7 +11220,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX9-DPP-NEXT:  .LBB18_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
-; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
@@ -11252,7 +11252,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1064-DPP-NEXT:  .LBB18_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
-; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
@@ -11283,7 +11283,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1032-DPP-NEXT:  .LBB18_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
-; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
@@ -11319,7 +11319,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:  .LBB18_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
-; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
@@ -11352,12 +11352,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB18_2
 ; GFX1132-DPP-NEXT:  .LBB18_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
-  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1, !amdgpu.ignore.denormal.mode !1
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) {
-; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
@@ -11393,7 +11393,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX7LESS-NEXT:  .LBB19_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
@@ -11425,7 +11425,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX9-NEXT:  .LBB19_3:
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
@@ -11457,7 +11457,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1064-NEXT:  .LBB19_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
-; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s2, 0
@@ -11488,7 +11488,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1032-NEXT:  .LBB19_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
-; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
@@ -11524,7 +11524,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:  .LBB19_3:
 ; GFX1164-NEXT:    s_endpgm
 ;
-; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s2, 0
@@ -11558,7 +11558,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1132-NEXT:  .LBB19_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
@@ -11590,7 +11590,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX9-DPP-NEXT:  .LBB19_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
-; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
@@ -11622,7 +11622,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1064-DPP-NEXT:  .LBB19_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
-; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
@@ -11653,7 +11653,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1032-DPP-NEXT:  .LBB19_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
-; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
@@ -11689,7 +11689,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:  .LBB19_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
-; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
@@ -11722,7 +11722,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB19_2
 ; GFX1132-DPP-NEXT:  .LBB19_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
-  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 4c829f302e05..98c09dfaa2d5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -7510,8 +7510,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) {
-; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
 ; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
@@ -7544,7 +7544,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX7LESS-NEXT:  .LBB12_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7573,7 +7573,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX9-NEXT:  .LBB12_3:
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7602,7 +7602,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1064-NEXT:  .LBB12_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
-; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s2, 0
@@ -7630,7 +7630,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1032-NEXT:  .LBB12_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
-; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
@@ -7662,7 +7662,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:  .LBB12_3:
 ; GFX1164-NEXT:    s_endpgm
 ;
-; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s2, 0
@@ -7693,7 +7693,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1132-NEXT:  .LBB12_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7722,7 +7722,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX9-DPP-NEXT:  .LBB12_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
-; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7751,7 +7751,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1064-DPP-NEXT:  .LBB12_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
-; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
@@ -7779,7 +7779,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1032-DPP-NEXT:  .LBB12_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
-; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], exec
@@ -7811,7 +7811,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:  .LBB12_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
-; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
@@ -7841,12 +7841,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB12_2
 ; GFX1132-DPP-NEXT:  .LBB12_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1, !amdgpu.ignore.denormal.mode !1
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) {
-; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
 ; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
@@ -7879,7 +7879,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX7LESS-NEXT:  .LBB13_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7908,7 +7908,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX9-NEXT:  .LBB13_3:
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7937,7 +7937,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1064-NEXT:  .LBB13_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
-; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s2, 0
@@ -7965,7 +7965,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1032-NEXT:  .LBB13_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
-; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
@@ -7997,7 +7997,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:  .LBB13_3:
 ; GFX1164-NEXT:    s_endpgm
 ;
-; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s2, 0
@@ -8028,7 +8028,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1132-NEXT:  .LBB13_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -8057,7 +8057,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX9-DPP-NEXT:  .LBB13_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
-; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -8086,7 +8086,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1064-DPP-NEXT:  .LBB13_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
-; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
@@ -8114,7 +8114,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1032-DPP-NEXT:  .LBB13_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
-; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], exec
@@ -8146,7 +8146,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:  .LBB13_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
-; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
@@ -8176,7 +8176,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX1132-DPP-NEXT:  .LBB13_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 0b889d9eb0a5..1fb0db0e1f0d 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -7510,8 +7510,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) {
-; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
 ; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
@@ -7544,7 +7544,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX7LESS-NEXT:  .LBB12_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7573,7 +7573,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX9-NEXT:  .LBB12_3:
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7602,7 +7602,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1064-NEXT:  .LBB12_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
-; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s2, 0
@@ -7630,7 +7630,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1032-NEXT:  .LBB12_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
-; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
@@ -7662,7 +7662,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:  .LBB12_3:
 ; GFX1164-NEXT:    s_endpgm
 ;
-; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s2, 0
@@ -7693,7 +7693,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1132-NEXT:  .LBB12_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7722,7 +7722,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX9-DPP-NEXT:  .LBB12_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
-; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7751,7 +7751,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1064-DPP-NEXT:  .LBB12_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
-; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
@@ -7779,7 +7779,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1032-DPP-NEXT:  .LBB12_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
-; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], exec
@@ -7811,7 +7811,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:  .LBB12_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
-; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
@@ -7841,12 +7841,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB12_2
 ; GFX1132-DPP-NEXT:  .LBB12_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1, !amdgpu.ignore.denormal.mode !1
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) {
-; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7LESS:       ; %bb.0:
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
 ; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
@@ -7879,7 +7879,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX7LESS-NEXT:  .LBB13_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7908,7 +7908,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX9-NEXT:  .LBB13_3:
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7937,7 +7937,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1064-NEXT:  .LBB13_3:
 ; GFX1064-NEXT:    s_endpgm
 ;
-; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s2, 0
@@ -7965,7 +7965,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1032-NEXT:  .LBB13_3:
 ; GFX1032-NEXT:    s_endpgm
 ;
-; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
@@ -7997,7 +7997,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:  .LBB13_3:
 ; GFX1164-NEXT:    s_endpgm
 ;
-; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s2, 0
@@ -8028,7 +8028,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1132-NEXT:  .LBB13_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -8057,7 +8057,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX9-DPP-NEXT:  .LBB13_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
-; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -8086,7 +8086,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1064-DPP-NEXT:  .LBB13_3:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
-; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
@@ -8114,7 +8114,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1032-DPP-NEXT:  .LBB13_3:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
-; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], exec
@@ -8146,7 +8146,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:  .LBB13_3:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
-; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access:
+; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
@@ -8176,7 +8176,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX1132-DPP-NEXT:  .LBB13_3:
 ; GFX1132-DPP-NEXT:    s_endpgm
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll b/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll
index 6385466e1341..5416053078ec 100644
--- a/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll
@@ -76,6 +76,26 @@ entry:
   ret i32 %val
 }
 
+define i32 @global_agent_acquire_release_idempotent_or__no_fine_grained(ptr addrspace(1) %in) {
+; GFX940-LABEL: global_agent_acquire_release_idempotent_or__no_fine_grained:
+; GFX940:       ; %bb.0: ; %entry
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_or v0, v[0:1], v2, off sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_agent_acquire_release_idempotent_or__no_fine_grained(
+; OPT-NEXT:  entry:
+; OPT-NEXT:    [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") acq_rel, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; OPT-NEXT:    ret i32 [[VAL]]
+;
+entry:
+  %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") acq_rel, align 4, !amdgpu.no.fine.grained.memory !0
+  ret i32 %val
+}
+
 define i32 @global_agent_seq_cst_idempotent_or(ptr addrspace(1) %in) {
 ; GFX940-LABEL: global_agent_seq_cst_idempotent_or:
 ; GFX940:       ; %bb.0: ; %entry
@@ -113,6 +133,23 @@ entry:
   ret i32 %val
 }
 
+define i32 @global_agent_monotonic_idempotent_add__no_fine_grained(ptr addrspace(1) %in) {
+; GFX940-LABEL: global_agent_monotonic_idempotent_add__no_fine_grained:
+; GFX940:       ; %bb.0: ; %entry
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v0, v[0:1], off sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_agent_monotonic_idempotent_add__no_fine_grained(
+; OPT-NEXT:  entry:
+; OPT-NEXT:    [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; OPT-NEXT:    ret i32 [[VAL]]
+;
+entry:
+  %val = atomicrmw add ptr addrspace(1) %in, i32 0 syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory !0
+  ret i32 %val
+}
+
 define i32 @global_agent_monotonic_idempotent_sub(ptr addrspace(1) %in) {
 ; GFX940-LABEL: global_agent_monotonic_idempotent_sub:
 ; GFX940:       ; %bb.0: ; %entry
@@ -130,6 +167,23 @@ entry:
   ret i32 %val
 }
 
+define i32 @global_agent_monotonic_idempotent_sub__no_fine_grained(ptr addrspace(1) %in) {
+; GFX940-LABEL: global_agent_monotonic_idempotent_sub__no_fine_grained:
+; GFX940:       ; %bb.0: ; %entry
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v0, v[0:1], off
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_agent_monotonic_idempotent_sub__no_fine_grained(
+; OPT-NEXT:  entry:
+; OPT-NEXT:    [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; OPT-NEXT:    ret i32 [[VAL]]
+;
+entry:
+  %val = atomicrmw sub ptr addrspace(1) %in, i32 0 syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory !0
+  ret i32 %val
+}
+
 define i32 @global_system_monotonic_idempotent_xor(ptr addrspace(1) %in) {
 ; GFX940-LABEL: global_system_monotonic_idempotent_xor:
 ; GFX940:       ; %bb.0: ; %entry
@@ -147,6 +201,23 @@ entry:
   ret i32 %val
 }
 
+define i32 @global_system_monotonic_idempotent_xor__no_fine_grained(ptr addrspace(1) %in) {
+; GFX940-LABEL: global_system_monotonic_idempotent_xor__no_fine_grained:
+; GFX940:       ; %bb.0: ; %entry
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v0, v[0:1], off sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_system_monotonic_idempotent_xor__no_fine_grained(
+; OPT-NEXT:  entry:
+; OPT-NEXT:    [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; OPT-NEXT:    ret i32 [[VAL]]
+;
+entry:
+  %val = atomicrmw xor ptr addrspace(1) %in, i32 0 monotonic, align 4, !amdgpu.no.fine.grained.memory !0
+  ret i32 %val
+}
+
 define i32 @global_agent_monotonic_idempotent_and(ptr addrspace(1) %in) {
 ; GFX940-LABEL: global_agent_monotonic_idempotent_and:
 ; GFX940:       ; %bb.0: ; %entry
@@ -163,3 +234,22 @@ entry:
   %val = atomicrmw and ptr addrspace(1) %in, i32 -1 syncscope("singlethread") monotonic, align 4
   ret i32 %val
 }
+
+define i32 @global_agent_monotonic_idempotent_and_no_fined_grain(ptr addrspace(1) %in) {
+; GFX940-LABEL: global_agent_monotonic_idempotent_and_no_fined_grain:
+; GFX940:       ; %bb.0: ; %entry
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v0, v[0:1], off
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_agent_monotonic_idempotent_and_no_fined_grain(
+; OPT-NEXT:  entry:
+; OPT-NEXT:    [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[IN:%.*]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; OPT-NEXT:    ret i32 [[VAL]]
+;
+entry:
+  %val = atomicrmw and ptr addrspace(1) %in, i32 -1 syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory !0
+  ret i32 %val
+}
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
index 513734388eb6..f2a5139b73b1 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
@@ -60,9 +60,8 @@ body: |
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $vgpr0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 2
   ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
   ; CHECK-NEXT:   $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
   ; CHECK-NEXT:   $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
   ; CHECK-NEXT: {{  $}}
@@ -87,13 +86,10 @@ body: |
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $vgpr0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 4
   ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
   ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
   ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
   ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
@@ -160,16 +156,13 @@ body: |
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $vgpr3, $vgpr5, $sgpr0, $sgpr2, $sgpr4, $sgpr5, $sgpr16, $sgpr17, $sgpr18, $sgpr19
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 274
   ; CHECK-NEXT:   $vgpr14 = V_MUL_F32_e32 $sgpr4, $vgpr3, implicit $exec, implicit $mode
   ; CHECK-NEXT:   $sgpr3 = S_MUL_F16 $sgpr0, $sgpr2, implicit $mode
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
   ; CHECK-NEXT:   $vgpr15 = V_MUL_F32_e32 $sgpr5, $vgpr3, implicit $exec, implicit $mode
   ; CHECK-NEXT:   $vgpr17 = V_FMA_F32_e64 0, $sgpr16, 0, $vgpr5, 0, $vgpr14, 0, 0, implicit $exec, implicit $mode
   ; CHECK-NEXT:   $sgpr1 = S_ADD_F16 $sgpr0, 15360, implicit $mode
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
   ; CHECK-NEXT:   $vgpr15 = V_FMA_F32_e64 0, $sgpr17, 0, $vgpr5, 0, $vgpr15, 0, 0, implicit $exec, implicit $mode
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
   ; CHECK-NEXT:   $vgpr14 = V_FMA_F32_e64 0, $sgpr18, 0, $vgpr15, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode
   ; CHECK-NEXT:   $vgpr15 = V_FMA_F32_e64 0, $sgpr19, 0, $vgpr14, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode
   ; CHECK-NEXT:   $vgpr16 = V_LOG_F32_e32 $vgpr15, implicit $exec, implicit $mode
@@ -229,9 +222,8 @@ body: |
   ; CHECK-NEXT:   liveins: $vgpr0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 2
   ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
   ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
   ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
   ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
@@ -686,9 +678,8 @@ body: |
   ; CHECK: bb.0:
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 2
   ; CHECK-NEXT:   $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
   ; CHECK-NEXT:   $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
   ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
   ; CHECK-NEXT: {{  $}}
@@ -726,3 +717,524 @@ body: |
   bb.1:
     liveins: $vgpr1, $vgpr2
 ...
+
+# Three single use producer instructions with non single use producer
+# instructions in between.
+---
+name: three_producers_with_two_skips
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: three_producers_with_two_skips
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 9361
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr2, $vgpr4
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr2, $vgpr4
+...
+
+# Six single use producer instructions with non single use producer
+# instructions in between.
+---
+name: six_producers_with_four_skips
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: six_producers_with_four_skips
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 145
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 9362
+  ; CHECK-NEXT:   $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr2, $vgpr4, $vgpr7, $vgpr9
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr2, $vgpr4, $vgpr7, $vgpr9
+...
+
+# Five single use producer instructions, followed by
+# four non single use producers, followed by
+# three single use producer instructions, followed by
+# two non single use producers, followed by
+# one single use producer instructions.
+---
+name: immediate_order
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: immediate_order
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 10693
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr13, $vgpr14
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr13, $vgpr14
+...
+
+# Maximum number of single use producers that can be encoded in a single
+# instruction.
+---
+name: maximum_producers_single_instruction
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: maximum_producers_single_instruction
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 58255
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+...
+
+# One more than the maximum number of single use producers that can be encoded
+# in a single instruction.
+---
+name: too_many_producers_single_instruction
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: too_many_producers_single_instruction
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 58255
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+
+
+
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+...
+
+# Maximum distance between single use producers that can be encoded in a single
+# instruction.
+---
+name: maximum_skips_single_instruction
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: maximum_skips_single_instruction
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 15473
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+...
+
+# One more than the maximum distance between single use producers that can be
+# encoded in a single instruction.
+---
+name: too_many_skips_single_instruction
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: too_many_skips_single_instruction
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16
+...
+
+
+# Maximum possible encoding value with all bits of the immediate set
+---
+name: all_immediate_bits_set
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: all_immediate_bits_set
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 65535
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr31 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr33 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr34 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr35 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr36 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr37 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr38 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr39 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr40 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr41 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr42 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr43 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr30, $vgpr31, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr31 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr33 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr34 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr35 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr36 = V_MOV_B32_e32 $vgpr0, implicit $exec
+
+    $vgpr37 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr38 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr39 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr40 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr41 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr42 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr43 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr30, $vgpr31, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
index ffedde9416bb..113927100311 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
@@ -14,9 +14,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) {
 define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) {
 ; GFX12-LABEL: test_cvt_f32_bf8_byte1:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
+; GFX12-NEXT:    v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX12-NEXT:    ; return to shader part epilog
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
   %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 1)
@@ -26,9 +24,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) {
 define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) {
 ; GFX12-LABEL: test_cvt_f32_bf8_byte2:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0]
+; GFX12-NEXT:    v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX12-NEXT:    ; return to shader part epilog
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
   %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 2)
@@ -38,9 +34,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) {
 define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) {
 ; GFX12-LABEL: test_cvt_f32_fp8_byte3:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1]
+; GFX12-NEXT:    v_cvt_f32_fp8_e64_dpp v0, v0 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX12-NEXT:    ; return to shader part epilog
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
   %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %tmp0, i32 3)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir
index d11fb27640ee..7e286a4dd678 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir
@@ -13,12 +13,12 @@ body:             |
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[V_CVT_F32_BF8_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_dpp [[DEF]], [[COPY]], 228, 15, 15, 1, implicit $mode, implicit $exec
-    ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_dpp]]
+    ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_dpp [[DEF]], [[COPY]], 228, 15, 15, 1, implicit $mode, implicit $exec
+    ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_dpp]]
     ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec
-    %2:vgpr_32 = V_CVT_F32_BF8_e32 killed %1, implicit $mode, implicit $exec
+    %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e32 killed %1, implicit $mode, implicit $exec
     $vgpr0 = COPY %2
     SI_RETURN_TO_EPILOG $vgpr0
 
@@ -34,13 +34,13 @@ body:             |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec
-    ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec
-    ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_]]
+    ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64_dpp [[DEF]], [[COPY]], 2, 228, 15, 15, 1, implicit $mode, implicit $exec
+    ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_dpp]]
     ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec
-    %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 0, implicit $mode, implicit $exec
+    %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 killed %1, 2, implicit $mode, implicit $exec
     $vgpr0 = COPY %2
     SI_RETURN_TO_EPILOG $vgpr0
 
@@ -56,13 +56,13 @@ body:             |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec
-    ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec
-    ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_]]
+    ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64_dpp [[DEF]], [[COPY]], 3, 228, 15, 15, 1, implicit $mode, implicit $exec
+    ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_dpp]]
     ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec
-    %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed %1, 0, implicit $mode, implicit $exec
+    %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 killed %1, 3, implicit $mode, implicit $exec
     $vgpr0 = COPY %2
     SI_RETURN_TO_EPILOG $vgpr0
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index 7662a3b78dea..d3fc96d7ff80 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -47,7 +47,7 @@ define float @test_cvt_f32_bf8_byte1(i32 %a) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
+; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1)
   ret float %ret
@@ -67,7 +67,7 @@ define float @test_cvt_f32_bf8_byte2(i32 %a) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0]
+; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 byte_sel:2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2)
   ret float %ret
@@ -87,7 +87,7 @@ define float @test_cvt_f32_bf8_byte3(i32 %a) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,1]
+; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 byte_sel:3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3)
   ret float %ret
@@ -127,7 +127,7 @@ define float @test_cvt_f32_fp8_byte1(i32 %a) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1]
+; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
   ret float %ret
@@ -147,7 +147,7 @@ define float @test_cvt_f32_fp8_byte2(i32 %a) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,0]
+; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 byte_sel:2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2)
   ret float %ret
@@ -167,7 +167,7 @@ define float @test_cvt_f32_fp8_byte3(i32 %a) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1]
+; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 byte_sel:3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3)
   ret float %ret
@@ -552,7 +552,7 @@ define float @test_sext_cvt_f32_fp8(i16 %a) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1]
+; GFX12-NEXT:    v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %a.sext = sext i16 %a to i32
   %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a.sext, i32 1)
@@ -576,7 +576,7 @@ define float @test_sext_cvt_f32_bf8(i16 %a) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
+; GFX12-NEXT:    v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %a.sext = sext i16 %a to i32
   %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a.sext, i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index e43daf46e1e0..b678378e5554 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -55,11 +55,11 @@ bb:
   ret void
 }
 
-; GCN-LABEL: {{^}}update_dpp64_test:
+; GCN-LABEL: {{^}}update_dppi64_test:
 ; GCN:     load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
 ; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
 ; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
-define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
+define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
   %load = load i64, ptr addrspace(1) %gep
@@ -68,7 +68,83 @@ define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i6
   ret void
 }
 
-; GCN-LABEL: {{^}}update_dpp64_imm_old_test:
+; GCN-LABEL: {{^}}update_dppf64_test:
+; GCN:     load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
+  %load = load double, ptr addrspace(1) %gep
+  %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double %load, i32 1, i32 1, i32 1, i1 false) #0
+  store double %tmp0, ptr addrspace(1) %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}update_dppv2i32_test:
+; GCN:     load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id
+  %load = load <2 x i32>, ptr addrspace(1) %gep
+  %tmp0 = call <2 x i32> @llvm.amdgcn.update.dpp.v2i32(<2 x i32> %in1, <2 x i32> %load, i32 1, i32 1, i32 1, i1 false) #0
+  store <2 x i32> %tmp0, ptr addrspace(1) %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}update_dppv2f32_test:
+; GCN:     load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep
+  %tmp0 = call <2 x float> @llvm.amdgcn.update.dpp.v2f32(<2 x float> %in1, <2 x float> %load, i32 1, i32 1, i32 1, i1 false) #0
+  store <2 x float> %tmp0, ptr addrspace(1) %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}update_dpp_p0_test:
+; GCN:     load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id
+  %load = load ptr, ptr addrspace(1) %gep
+  %tmp0 = call ptr @llvm.amdgcn.update.dpp.p0(ptr %in1, ptr %load, i32 1, i32 1, i32 1, i1 false) #0
+  store ptr %tmp0, ptr addrspace(1) %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}update_dpp_p3_test:
+; GCN: {{load|read}}_{{dword|b32}} v[[SRC:[0-9]+]]
+; GCN: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %arg, i32 %id
+  %load = load ptr addrspace(3), ptr addrspace(3) %gep
+  %tmp0 = call ptr addrspace(3) @llvm.amdgcn.update.dpp.p3(ptr addrspace(3) %in1, ptr addrspace(3) %load, i32 1, i32 1, i32 1, i1 false) #0
+  store ptr addrspace(3) %tmp0, ptr addrspace(3) %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}update_dpp_p5_test:
+; GCN: {{load|read}}_{{dword|b32}} v[[SRC:[0-9]+]]
+; GCN: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspace(5) %in1, ptr %in2) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds ptr addrspace(5), ptr addrspace(5) %arg, i32 %id
+  %load = load ptr addrspace(5), ptr addrspace(5) %gep
+  %tmp0 = call ptr addrspace(5) @llvm.amdgcn.update.dpp.p5(ptr addrspace(5) %in1, ptr addrspace(5) %load, i32 1, i32 1, i32 1, i1 false) #0
+  store ptr addrspace(5) %tmp0, ptr addrspace(5) %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}update_dppi64_imm_old_test:
 ; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
 ; GFX8-OPT-DAG,GFX10-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
 ; GFX11-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
@@ -79,7 +155,7 @@ define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i6
 ; GFX8-OPT-DAG,GFX10-DAG,GFX11-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
 ; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
 ; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
-define amdgpu_kernel void @update_dpp64_imm_old_test(ptr addrspace(1) %arg, i64 %in2) {
+define amdgpu_kernel void @update_dppi64_imm_old_test(ptr addrspace(1) %arg, i64 %in2) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
   %load = load i64, ptr addrspace(1) %gep
@@ -88,7 +164,27 @@ define amdgpu_kernel void @update_dpp64_imm_old_test(ptr addrspace(1) %arg, i64
   ret void
 }
 
-; GCN-LABEL: {{^}}update_dpp64_imm_src_test:
+; GCN-LABEL: {{^}}update_dppf64_imm_old_test:
+; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x6b8564a
+; GFX8-OPT-DAG,GFX10-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x405edce1
+; GFX11-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x405edce1
+; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x6b8564a
+; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x405edce1
+; GCN-DAG: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
+; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GFX8-OPT-DAG,GFX10-DAG,GFX11-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @update_dppf64_imm_old_test(ptr addrspace(1) %arg, double %in2) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
+  %load = load double, ptr addrspace(1) %gep
+  %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double 123.4512345123450, double %load, i32 1, i32 1, i32 1, i1 false) #0
+  store double %tmp0, ptr addrspace(1) %gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}update_dppi64_imm_src_test:
 ; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
 ; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
 ; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
@@ -97,12 +193,27 @@ define amdgpu_kernel void @update_dpp64_imm_old_test(ptr addrspace(1) %arg, i64
 ; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
 ; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
 ; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
-define amdgpu_kernel void @update_dpp64_imm_src_test(ptr addrspace(1) %out, i64 %in1) {
+define amdgpu_kernel void @update_dppi64_imm_src_test(ptr addrspace(1) %out, i64 %in1) {
   %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 false) #0
   store i64 %tmp0, ptr addrspace(1) %out
   ret void
 }
 
+; GCN-LABEL: {{^}}update_dppf64_imm_src_test:
+; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x6b8564a
+; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x405edce1
+; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x6b8564a
+; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x405edce1
+; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @update_dppf64_imm_src_test(ptr addrspace(1) %out, double %in1) {
+  %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double 123.451234512345, i32 1, i32 1, i32 1, i1 false) #0
+  store double %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}dpp_test_f32:
 ; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
 ; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index a1835ea176d5..47f988fc17d2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
 ; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
 ; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
-; RUN: llc -mtriple=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s
-; RUN: llc -mtriple=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s
+; RUN: llc -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s
+; RUN: llc -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.workitem.id.y() #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
new file mode 100644
index 000000000000..7d7a46259710
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -0,0 +1,2805 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+define half @v_maximum_f16(half %src0, half %src1) {
+; GFX7-LABEL: v_maximum_f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call half @llvm.maximum.f16(half %src0, half %src1)
+  ret half %op
+}
+
+define half @v_maximum_f16__nnan(half %src0, half %src1) {
+; GFX7-LABEL: v_maximum_f16__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f16__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f16__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f16__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f16__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f16__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan half @llvm.maximum.f16(half %src0, half %src1)
+  ret half %op
+}
+
+define half @v_maximum_f16__nsz(half %src0, half %src1) {
+; GFX7-LABEL: v_maximum_f16__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f16__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f16__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f16__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f16__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f16__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f16__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz half @llvm.maximum.f16(half %src0, half %src1)
+  ret half %op
+}
+
+define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) {
+; GFX7-LABEL: v_maximum_f16__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f16__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f16__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f16__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f16__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f16__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz half @llvm.maximum.f16(half %src0, half %src1)
+  ret half %op
+}
+
+define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) {
+; GFX7-LABEL: v_maximum_f16__nnan_src0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f16__nnan_src0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f16__nnan_src0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f16__nnan_src0:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX940-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f16__nnan_src0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX10-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f16__nnan_src0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f16__nnan_src0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %src0 = fadd nnan half %arg0, 1.0
+  %op = call half @llvm.maximum.f16(half %src0, half %src1)
+  ret half %op
+}
+
+define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) {
+; GFX7-LABEL: v_maximum_f16__nnan_src1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f16__nnan_src1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f16__nnan_src1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f16__nnan_src1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX940-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f16__nnan_src1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX10-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f16__nnan_src1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f16__nnan_src1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %src1 = fadd nnan half %arg1, 1.0
+  %op = call half @llvm.maximum.f16(half %src0, half %src1)
+  ret half %op
+}
+
+define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
+; GFX7-LABEL: s_maximum_f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, s5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, s4
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_max_f32_e32 v3, v1, v0
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use v0
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_maximum_f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NEXT:    v_max_f16_e32 v1, s4, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use v0
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_maximum_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_max_f16_e32 v1, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_maximum_f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_max_f16_e32 v1, s0, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_maximum_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f16_e64 v0, s4, s5
+; GFX10-NEXT:    v_cmp_o_f16_e64 vcc_lo, s4, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_maximum_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f16_e64 v0, s0, s1
+; GFX11-NEXT:    v_cmp_o_f16_e64 vcc_lo, s0, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_maximum_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_maximum_f16 s0, s0, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s0
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call half @llvm.maximum.f16(half %src0, half %src1)
+  %cast = bitcast half %op to i16
+  %zext = zext i16 %cast to i32
+  call void asm sideeffect "; use $0", "s"(i32 %zext)
+  ret void
+}
+
+define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) {
+; GFX7-LABEL: v_maximum_v2f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX7-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX7-NEXT:    v_max_f32_e32 v2, v1, v3
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_cmp_gt_f16_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v3, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v2, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f16_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cmp_gt_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v0, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v1, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f16_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v2f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_max_f16 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v2, v0, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
+  ret <2 x half> %op
+}
+
+define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
+; GFX7-LABEL: v_maximum_v2f16__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX7-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX7-NEXT:    v_max_f32_e32 v2, v1, v3
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v2f16__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_cmp_gt_f16_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v3, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v2, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f16_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cmp_gt_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v0, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v1, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f16_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v2f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v2f16__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v2f16__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v2f16__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v2f16__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
+  ret <2 x half> %op
+}
+
+define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
+; GFX7-LABEL: v_maximum_v2f16__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX7-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX7-NEXT:    v_max_f32_e32 v2, v1, v3
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v2f16__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_cmp_gt_f16_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX8-NEXT:    v_cmp_gt_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v2f16__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v2f16__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_max_f16 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v2f16__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v2f16__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v2, v0, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v2f16__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
+  ret <2 x half> %op
+}
+
+define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) {
+; GFX7-LABEL: v_maximum_v2f16__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX7-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX7-NEXT:    v_max_f32_e32 v2, v1, v3
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v2f16__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_cmp_gt_f16_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_cmp_gt_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v2f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v2f16__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v2f16__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v2f16__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v2f16__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
+  ret <2 x half> %op
+}
+
+define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
+; GFX7-LABEL: s_maximum_v2f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, s7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, s5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, s6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, s4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX7-NEXT:    v_max_f32_e32 v4, v1, v0
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX7-NEXT:    v_max_f32_e32 v1, v3, v2
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v3, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use v0
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_maximum_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s6, s5, 16
+; GFX8-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_cmp_gt_f16_e32 vcc, s7, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, s7, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, s7, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, s6, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f16_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_cmp_gt_f16_e32 vcc, s4, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, s4, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, s4, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, s5, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f16_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use v0
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_maximum_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshr_b32 s5, s5, 16
+; GFX9-NEXT:    v_pk_max_f16 v1, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s4, v0
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s4, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_maximum_v2f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX940-NEXT:    v_pk_max_f16 v1, s0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX940-NEXT:    v_mov_b32_e32 v3, s1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, s0, v3
+; GFX940-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX940-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_maximum_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v0, s4, s5
+; GFX10-NEXT:    v_cmp_o_f16_e64 vcc_lo, s4, s5
+; GFX10-NEXT:    s_lshr_b32 s6, s5, 16
+; GFX10-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e64 vcc_lo, s4, s6
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
+; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_maximum_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v0, s0, s1
+; GFX11-NEXT:    v_cmp_o_f16_e64 vcc_lo, s0, s1
+; GFX11-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e64 vcc_lo, s0, s2
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
+; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_maximum_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, s0, s1
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use v0
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
+  %cast = bitcast <2 x half> %op to i32
+  call void asm sideeffect "; use $0", "s"(i32 %cast)
+  ret void
+}
+
+define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) {
+; GFX7-LABEL: v_maximum_v3f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_max_f32_e32 v6, v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX7-NEXT:    v_max_f32_e32 v3, v1, v4
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX7-NEXT:    v_max_f32_e32 v3, v2, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v3f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_max_f16_e32 v6, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
+; GFX8-NEXT:    v_max_f16_e32 v5, v1, v3
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX8-NEXT:    v_max_f16_e32 v3, v0, v2
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v3f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v3f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX940-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v3f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v4, v0, v2
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_max_f16 v2, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v3f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v4, v0, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
+; GFX11-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v3f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
+  ret <3 x half> %op
+}
+
+define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
+; GFX7-LABEL: v_maximum_v3f16__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_max_f32_e32 v6, v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX7-NEXT:    v_max_f32_e32 v3, v1, v4
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX7-NEXT:    v_max_f32_e32 v3, v2, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v3f16__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_max_f16_e32 v1, v1, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v3f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v3f16__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX940-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v3f16__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX10-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v3f16__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX11-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v3f16__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
+  ret <3 x half> %op
+}
+
+define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
+; GFX7-LABEL: v_maximum_v3f16__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_max_f32_e32 v6, v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX7-NEXT:    v_max_f32_e32 v3, v1, v4
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX7-NEXT:    v_max_f32_e32 v3, v2, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v3f16__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_max_f16_e32 v6, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
+; GFX8-NEXT:    v_max_f16_e32 v5, v1, v3
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX8-NEXT:    v_max_f16_e32 v3, v0, v2
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v3f16__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v3f16__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX940-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v3f16__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v4, v0, v2
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_max_f16 v2, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v3f16__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v4, v0, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
+; GFX11-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v3f16__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
+  ret <3 x half> %op
+}
+
+define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) {
+; GFX7-LABEL: v_maximum_v3f16__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_max_f32_e32 v6, v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX7-NEXT:    v_max_f32_e32 v3, v1, v4
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX7-NEXT:    v_max_f32_e32 v3, v2, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v3f16__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_max_f16_e32 v1, v1, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v3f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v3f16__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX940-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v3f16__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX10-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v3f16__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX11-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v3f16__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
+  ret <3 x half> %op
+}
+
+define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) {
+; GFX7-LABEL: v_maximum_v4f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_max_f32_e32 v8, v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX7-NEXT:    v_max_f32_e32 v4, v1, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX7-NEXT:    v_max_f32_e32 v4, v2, v6
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX7-NEXT:    v_max_f32_e32 v4, v3, v7
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_max_f16_e32 v6, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT:    v_max_f16_e32 v8, v6, v5
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
+; GFX8-NEXT:    v_max_f16_e32 v6, v1, v3
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX8-NEXT:    v_max_f16_e32 v3, v0, v2
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v4f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX940-NEXT:    v_perm_b32 v1, v1, v6, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_pk_max_f16 v5, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v6, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-NEXT:    v_pk_max_f16 v7, v0, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v4f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <4 x half> @llvm.maximum.v4f16(<4 x half> %src0, <4 x half> %src1)
+  ret <4 x half> %op
+}
+
+define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
+; GFX7-LABEL: v_maximum_v4f16__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_max_f32_e32 v8, v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX7-NEXT:    v_max_f32_e32 v4, v1, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX7-NEXT:    v_max_f32_e32 v4, v2, v6
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX7-NEXT:    v_max_f32_e32 v4, v3, v7
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v4f16__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v1, v1, v3
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v4f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v4f16__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX940-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v4f16__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX10-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v4f16__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX11-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v4f16__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <4 x half> @llvm.maximum.v4f16(<4 x half> %src0, <4 x half> %src1)
+  ret <4 x half> %op
+}
+
+define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
+; GFX7-LABEL: v_maximum_v4f16__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_max_f32_e32 v8, v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX7-NEXT:    v_max_f32_e32 v4, v1, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX7-NEXT:    v_max_f32_e32 v4, v2, v6
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX7-NEXT:    v_max_f32_e32 v4, v3, v7
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v4f16__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_max_f16_e32 v6, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT:    v_max_f16_e32 v8, v6, v5
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
+; GFX8-NEXT:    v_max_f16_e32 v6, v1, v3
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX8-NEXT:    v_max_f16_e32 v3, v0, v2
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v4f16__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v4f16__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX940-NEXT:    v_perm_b32 v1, v1, v6, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v4f16__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_pk_max_f16 v5, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v6, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v4f16__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v4, v1, v3
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-NEXT:    v_pk_max_f16 v7, v0, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v4f16__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <4 x half> @llvm.maximum.v4f16(<4 x half> %src0, <4 x half> %src1)
+  ret <4 x half> %op
+}
+
+define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) {
+; GFX7-LABEL: v_maximum_v4f16__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_max_f32_e32 v8, v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX7-NEXT:    v_max_f32_e32 v4, v1, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX7-NEXT:    v_max_f32_e32 v4, v2, v6
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX7-NEXT:    v_max_f32_e32 v4, v3, v7
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v4f16__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v1, v1, v3
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v4f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v4f16__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX940-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v4f16__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX10-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v4f16__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX11-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v4f16__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <4 x half> @llvm.maximum.v4f16(<4 x half> %src0, <4 x half> %src1)
+  ret <4 x half> %op
+}
+
+define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) {
+; GFX7-LABEL: v_maximum_v8f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_max_f32_e32 v16, v0, v8
+; GFX7-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v17, v16, vcc
+; GFX7-NEXT:    v_max_f32_e32 v8, v1, v9
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v17, v8, vcc
+; GFX7-NEXT:    v_max_f32_e32 v8, v2, v10
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v17, v8, vcc
+; GFX7-NEXT:    v_max_f32_e32 v8, v3, v11
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v17, v8, vcc
+; GFX7-NEXT:    v_max_f32_e32 v8, v4, v12
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v17, v8, vcc
+; GFX7-NEXT:    v_max_f32_e32 v8, v5, v13
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v17, v8, vcc
+; GFX7-NEXT:    v_max_f32_e32 v8, v6, v14
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v17, v8, vcc
+; GFX7-NEXT:    v_max_f32_e32 v8, v7, v15
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v17, v8, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v8f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_max_f16_e32 v10, v9, v8
+; GFX8-NEXT:    v_mov_b32_e32 v11, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v9, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v11, v10, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX8-NEXT:    v_max_f16_e32 v12, v10, v9
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v10, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v12, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
+; GFX8-NEXT:    v_max_f16_e32 v13, v12, v10
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v12, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v11, v13, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; GFX8-NEXT:    v_max_f16_e32 v14, v13, v12
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v13, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v11, v14, vcc
+; GFX8-NEXT:    v_max_f16_e32 v13, v3, v7
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v13, vcc
+; GFX8-NEXT:    v_max_f16_e32 v7, v2, v6
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v7, vcc
+; GFX8-NEXT:    v_max_f16_e32 v6, v1, v5
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v11, v6, vcc
+; GFX8-NEXT:    v_max_f16_e32 v5, v0, v4
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v11, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v12
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v8f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v8, v3, v7
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
+; GFX9-NEXT:    v_pk_max_f16 v7, v2, v6
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v7, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v7, vcc
+; GFX9-NEXT:    v_pk_max_f16 v6, v1, v5
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v6, vcc
+; GFX9-NEXT:    v_pk_max_f16 v5, v0, v4
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v5, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v6, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v7, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v8, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v10, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v8f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_max_f16 v8, v3, v7
+; GFX940-NEXT:    v_mov_b32_e32 v9, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v3, v7
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_max_f16 v7, v2, v6
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v2, v6
+; GFX940-NEXT:    v_perm_b32 v3, v3, v10, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v9, v7, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_max_f16 v6, v1, v5
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v9, v7, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v5
+; GFX940-NEXT:    v_perm_b32 v2, v2, v8, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_max_f16 v5, v0, v4
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v4
+; GFX940-NEXT:    v_perm_b32 v1, v1, v7, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v5, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v9, v5, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v6, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v8f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v8, v3, v7
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_pk_max_f16 v9, v2, v6
+; GFX10-NEXT:    v_pk_max_f16 v12, v1, v5
+; GFX10-NEXT:    v_pk_max_f16 v13, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0x7e00, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0x7e00, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v13
+; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0x7e00, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v13, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v1, v1, v6, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v3, v3, v10, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v8f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v8, v3, v7
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_pk_max_f16 v10, v2, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX11-NEXT:    v_pk_max_f16 v14, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v12, v11
+; GFX11-NEXT:    v_pk_max_f16 v11, v0, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, 0x7e00, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v13, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v3, v3, v9, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v8f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v4
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT:    v_pk_maximum_f16 v2, v2, v6
+; GFX12-NEXT:    v_pk_maximum_f16 v3, v3, v7
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <8 x half> @llvm.maximum.v8f16(<8 x half> %src0, <8 x half> %src1)
+  ret <8 x half> %op
+}
+
+define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) {
+; GFX7-LABEL: v_maximum_v16f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT:    v_cmp_o_f32_e64 s[12:13], v0, v16
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v22
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT:    v_cmp_o_f32_e64 s[14:15], v6, v16
+; GFX7-NEXT:    v_max_f32_e32 v6, v6, v16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v23
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cmp_o_f32_e64 s[16:17], v7, v16
+; GFX7-NEXT:    v_max_f32_e32 v7, v7, v16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v24
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
+; GFX7-NEXT:    v_max_f32_e32 v1, v1, v17
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v17, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT:    v_cmp_o_f32_e64 s[18:19], v8, v16
+; GFX7-NEXT:    v_max_f32_e32 v8, v8, v16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v25
+; GFX7-NEXT:    v_cmp_o_f32_e64 s[4:5], v2, v17
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v17
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v17, v19
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT:    v_cmp_o_f32_e64 s[20:21], v9, v16
+; GFX7-NEXT:    v_max_f32_e32 v9, v9, v16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v26
+; GFX7-NEXT:    v_cmp_o_f32_e64 s[6:7], v3, v17
+; GFX7-NEXT:    v_max_f32_e32 v3, v3, v17
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v17, v20
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT:    v_cmp_o_f32_e64 s[22:23], v10, v16
+; GFX7-NEXT:    v_max_f32_e32 v10, v10, v16
+; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX7-NEXT:    v_cmp_o_f32_e64 s[8:9], v4, v17
+; GFX7-NEXT:    v_max_f32_e32 v4, v4, v17
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v17, v21
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v20, v28
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v19, v29
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v18, v30
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT:    v_cmp_o_f32_e64 s[10:11], v5, v17
+; GFX7-NEXT:    v_max_f32_e32 v5, v5, v17
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v17, v27
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT:    v_cmp_o_f32_e64 s[24:25], v11, v17
+; GFX7-NEXT:    v_max_f32_e32 v11, v11, v17
+; GFX7-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v12, v20
+; GFX7-NEXT:    v_max_f32_e32 v12, v12, v20
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v17, v12, vcc
+; GFX7-NEXT:    v_max_f32_e32 v20, v13, v19
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v13, v19
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v17, v20, vcc
+; GFX7-NEXT:    v_max_f32_e32 v19, v14, v18
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v14, v18
+; GFX7-NEXT:    v_cndmask_b32_e32 v14, v17, v19, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v17, v0, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v17, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v17, v3, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v17, v4, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v17, v6, s[14:15]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v17, v7, s[16:17]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v17, v8, s[18:19]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[20:21]
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, v17, v10, s[22:23]
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v17, v11, s[24:25]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_max_f32_e32 v18, v15, v16
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v15, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v16f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX8-NEXT:    v_max_f16_e32 v18, v17, v16
+; GFX8-NEXT:    v_mov_b32_e32 v19, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v17, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v19, v18, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
+; GFX8-NEXT:    v_max_f16_e32 v20, v18, v17
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v18, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
+; GFX8-NEXT:    v_max_f16_e32 v21, v20, v18
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v20, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v19, v21, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
+; GFX8-NEXT:    v_max_f16_e32 v22, v21, v20
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v21, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v19, v22, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v3
+; GFX8-NEXT:    v_max_f16_e32 v23, v22, v21
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v22, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v19, v23, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v2
+; GFX8-NEXT:    v_max_f16_e32 v24, v23, v22
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v23, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v19, v24, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v1
+; GFX8-NEXT:    v_max_f16_e32 v25, v24, v23
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v24, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v19, v25, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
+; GFX8-NEXT:    v_max_f16_e32 v26, v25, v24
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v25, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v24, v19, v26, vcc
+; GFX8-NEXT:    v_max_f16_e32 v25, v7, v15
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v7, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v19, v25, vcc
+; GFX8-NEXT:    v_max_f16_e32 v15, v6, v14
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v6, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v19, v15, vcc
+; GFX8-NEXT:    v_max_f16_e32 v14, v5, v13
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v19, v14, vcc
+; GFX8-NEXT:    v_max_f16_e32 v13, v4, v12
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v4, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v19, v13, vcc
+; GFX8-NEXT:    v_max_f16_e32 v12, v3, v11
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v3, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v12, vcc
+; GFX8-NEXT:    v_max_f16_e32 v11, v2, v10
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v2, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v11, vcc
+; GFX8-NEXT:    v_max_f16_e32 v10, v1, v9
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v19, v10, vcc
+; GFX8-NEXT:    v_max_f16_e32 v9, v0, v8
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v19, v9, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v24
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v23
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v22
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v21
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
+; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v18
+; GFX8-NEXT:    v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v17
+; GFX8-NEXT:    v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v16
+; GFX8-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v16f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v16, v7, v15
+; GFX9-NEXT:    v_mov_b32_e32 v17, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v7, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v17, v16, vcc
+; GFX9-NEXT:    v_pk_max_f16 v15, v6, v14
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v6, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v17, v15, vcc
+; GFX9-NEXT:    v_pk_max_f16 v14, v5, v13
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v14, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v17, v14, vcc
+; GFX9-NEXT:    v_pk_max_f16 v13, v4, v12
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v13, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v17, v13, vcc
+; GFX9-NEXT:    v_pk_max_f16 v12, v3, v11
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v3, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v12, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v12, vcc
+; GFX9-NEXT:    v_pk_max_f16 v11, v2, v10
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v11, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v17, v11, vcc
+; GFX9-NEXT:    v_pk_max_f16 v10, v1, v9
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v10, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v10, vcc
+; GFX9-NEXT:    v_pk_max_f16 v9, v0, v8
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v17, v9, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v10, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v11, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v12, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v13, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v14, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v15, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v16, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v18, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v16f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_max_f16 v16, v7, v15
+; GFX940-NEXT:    v_mov_b32_e32 v17, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v7, v15
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_max_f16 v15, v6, v14
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v17, v16, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v6, v14
+; GFX940-NEXT:    v_perm_b32 v7, v7, v18, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_max_f16 v14, v5, v13
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v17, v15, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v5, v13
+; GFX940-NEXT:    v_perm_b32 v6, v6, v16, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v15, v17, v14, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_max_f16 v13, v4, v12
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v17, v14, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v4, v12
+; GFX940-NEXT:    v_perm_b32 v5, v5, v15, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v14, v17, v13, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_max_f16 v12, v3, v11
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v17, v13, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v3, v11
+; GFX940-NEXT:    v_perm_b32 v4, v4, v14, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v13, v17, v12, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_max_f16 v11, v2, v10
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v17, v12, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v2, v10
+; GFX940-NEXT:    v_perm_b32 v3, v3, v13, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v17, v11, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_max_f16 v10, v1, v9
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v17, v11, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v9
+; GFX940-NEXT:    v_perm_b32 v2, v2, v12, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v11, v17, v10, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_max_f16 v9, v0, v8
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v17, v10, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v8
+; GFX940-NEXT:    v_perm_b32 v1, v1, v11, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v10, v17, v9, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v17, v9, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v10, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v16f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v16, v7, v15
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v7, v15
+; GFX10-NEXT:    v_pk_max_f16 v18, v6, v14
+; GFX10-NEXT:    v_pk_max_f16 v19, v3, v11
+; GFX10-NEXT:    v_pk_max_f16 v20, v2, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, 0x7e00, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
+; GFX10-NEXT:    v_pk_max_f16 v21, v0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0x7e00, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v14
+; GFX10-NEXT:    v_pk_max_f16 v17, v5, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
+; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v18, 0x7e00, v18, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v5, v13
+; GFX10-NEXT:    v_perm_b32 v6, v6, v18, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0x7e00, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_max_f16 v17, v4, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX10-NEXT:    v_perm_b32 v5, v5, v15, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0x7e00, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, 0x7e00, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_max_f16 v11, v1, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v11
+; GFX10-NEXT:    v_perm_b32 v3, v3, v19, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, 0x7e00, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0x7e00, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v8
+; GFX10-NEXT:    v_perm_b32 v1, v1, v11, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0x7e00, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v23, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v9, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v2, v2, v17, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v14, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v4, v4, v13, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v16f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v16, v7, v15
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v7, v15
+; GFX11-NEXT:    v_pk_max_f16 v15, v6, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-NEXT:    v_pk_max_f16 v20, v4, v12
+; GFX11-NEXT:    v_pk_max_f16 v22, v2, v10
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7e00, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v18, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 16, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7e00, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v15
+; GFX11-NEXT:    v_pk_max_f16 v14, v5, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v18, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, 0x7e00, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v5, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v6, v15, v6, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v18, v17
+; GFX11-NEXT:    v_pk_max_f16 v17, v3, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v20
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0x7e00, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v12
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 16, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0x7e00, v20, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v11
+; GFX11-NEXT:    v_perm_b32 v5, v13, v5, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v20, v19
+; GFX11-NEXT:    v_pk_max_f16 v19, v1, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v22
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, 0x7e00, v21, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v10
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v3, v11, v3, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, 0x7e00, v22, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_pk_max_f16 v22, v0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v21, 0x7e00, v19, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v8
+; GFX11-NEXT:    v_perm_b32 v1, v1, v21, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v22, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v24, v23
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7e00, v25, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v12
+; GFX11-NEXT:    v_perm_b32 v2, v2, v17, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v18, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v4, v4, v14, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v16f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v8
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v9
+; GFX12-NEXT:    v_pk_maximum_f16 v2, v2, v10
+; GFX12-NEXT:    v_pk_maximum_f16 v3, v3, v11
+; GFX12-NEXT:    v_pk_maximum_f16 v4, v4, v12
+; GFX12-NEXT:    v_pk_maximum_f16 v5, v5, v13
+; GFX12-NEXT:    v_pk_maximum_f16 v6, v6, v14
+; GFX12-NEXT:    v_pk_maximum_f16 v7, v7, v15
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <16 x half> @llvm.maximum.v16f16(<16 x half> %src0, <16 x half> %src1)
+  ret <16 x half> %op
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
new file mode 100644
index 000000000000..7c5bc7da4df2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
@@ -0,0 +1,4344 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+define float @v_maximum_f32(float %src0, float %src1) {
+; GFX7-LABEL: v_maximum_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call float @llvm.maximum.f32(float %src0, float %src1)
+  ret float %op
+}
+
+define float @v_maximum_f32__nnan(float %src0, float %src1) {
+; GFX7-LABEL: v_maximum_f32__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f32__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f32__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f32__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f32__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f32__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan float @llvm.maximum.f32(float %src0, float %src1)
+  ret float %op
+}
+
+define float @v_maximum_f32__nsz(float %src0, float %src1) {
+; GFX7-LABEL: v_maximum_f32__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f32__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f32__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f32__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f32__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f32__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f32__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz float @llvm.maximum.f32(float %src0, float %src1)
+  ret float %op
+}
+
+define float @v_maximum_f32__nnan_nsz(float %src0, float %src1) {
+; GFX7-LABEL: v_maximum_f32__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f32__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f32__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f32__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f32__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f32__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz float @llvm.maximum.f32(float %src0, float %src1)
+  ret float %op
+}
+
+define float @v_maximum_f32__nnan_src0(float %arg0, float %src1) {
+; GFX7-LABEL: v_maximum_f32__nnan_src0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f32__nnan_src0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f32__nnan_src0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f32__nnan_src0:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX940-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f32__nnan_src0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX10-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f32__nnan_src0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f32__nnan_src0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %src0 = fadd nnan float %arg0, 1.0
+  %op = call float @llvm.maximum.f32(float %src0, float %src1)
+  ret float %op
+}
+
+define float @v_maximum_f32__nnan_src1(float %src0, float %arg1) {
+; GFX7-LABEL: v_maximum_f32__nnan_src1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f32__nnan_src1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f32__nnan_src1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f32__nnan_src1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX940-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f32__nnan_src1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX10-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f32__nnan_src1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f32__nnan_src1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %src1 = fadd nnan float %arg1, 1.0
+  %op = call float @llvm.maximum.f32(float %src0, float %src1)
+  ret float %op
+}
+
+define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
+; GFX7-LABEL: s_maximum_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_max_f32_e32 v1, s4, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, s4, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use v0
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_maximum_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NEXT:    v_max_f32_e32 v1, s4, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use v0
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_maximum_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_max_f32_e32 v1, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_maximum_f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_max_f32_e32 v1, s0, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_maximum_f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f32_e64 v0, s4, s5
+; GFX10-NEXT:    v_cmp_o_f32_e64 vcc_lo, s4, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_maximum_f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f32_e64 v0, s0, s1
+; GFX11-NEXT:    v_cmp_o_f32_e64 vcc_lo, s0, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_maximum_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_maximum_f32 s0, s0, s1
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s0
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call float @llvm.maximum.f32(float %src0, float %src1)
+  call void asm sideeffect "; use $0", "s"(float %op)
+  ret void
+}
+
+define <2 x float> @v_maximum_v2f32(<2 x float> %src0, <2 x float> %src1) {
+; GFX7-LABEL: v_maximum_v2f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v4, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v2, v1, v3
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v2f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v2f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v2f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v2f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7fc00000, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v2f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x7fc00000, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v2f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v2
+; GFX12-NEXT:    v_maximum_f32 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
+  ret <2 x float> %op
+}
+
+define <2 x float> @v_maximum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1) {
+; GFX7-LABEL: v_maximum_v2f32__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v4, v0, v2
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v2, v1, v3
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v2f32__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v2f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v2f32__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v2f32__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v2f32__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v2f32__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v2
+; GFX12-NEXT:    v_maximum_f32 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
+  ret <2 x float> %op
+}
+
+define <2 x float> @v_maximum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) {
+; GFX7-LABEL: v_maximum_v2f32__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v4, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v2, v1, v3
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v2f32__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v2f32__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v2f32__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v2f32__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v5, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v2f32__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v5, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v2f32__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v2
+; GFX12-NEXT:    v_maximum_f32 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
+  ret <2 x float> %op
+}
+
+define <2 x float> @v_maximum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %src1) {
+; GFX7-LABEL: v_maximum_v2f32__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v2f32__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v2f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v2f32__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v2f32__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v2f32__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v2f32__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v2
+; GFX12-NEXT:    v_maximum_f32 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
+  ret <2 x float> %op
+}
+
+define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
+; GFX7-LABEL: s_maximum_v2f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s7
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, s5, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, s5, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v3, s5
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, s5, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, s7, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_max_legacy_f32_e32 v3, s4, v0
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, s4, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v3, s4
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, s4, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, s6, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use v[0:1]
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_maximum_v2f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s5, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, s5, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, s5, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, s7, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, s4, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, s6, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use v[0:1]
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_maximum_v2f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s5, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s5, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, s5, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, s7, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, s4, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, s6, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v[0:1]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_maximum_v2f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, s3
+; GFX940-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, s1, v0
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s1, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, s1, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, s3, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
+; GFX940-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, s0, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, s2, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v[0:1]
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_maximum_v2f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-NEXT:    v_cmp_gt_f32_e64 vcc_lo, s5, s7
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-NEXT:    v_cmp_class_f32_e64 s8, s5, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s7, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e64 vcc_lo, s4, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, s6, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e64 vcc_lo, s5, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e64 vcc_lo, s4, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s8
+; GFX10-NEXT:    v_cmp_class_f32_e64 s5, s4, 64
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v2, s4, s5
+; GFX10-NEXT:    v_cmp_class_f32_e64 s4, s7, 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s7, s4
+; GFX10-NEXT:    v_cmp_class_f32_e64 s4, s6, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s6, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v[0:1]
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_maximum_v2f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    v_cmp_gt_f32_e64 vcc_lo, s1, s3
+; GFX11-NEXT:    v_cmp_class_f32_e64 s4, s1, 64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, s3, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e64 vcc_lo, s0, s2
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, s2, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e64 vcc_lo, s1, s3
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e64 vcc_lo, s0, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s4
+; GFX11-NEXT:    v_cmp_class_f32_e64 s1, s0, 64
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v2, s0, s1
+; GFX11-NEXT:    v_cmp_class_f32_e64 s0, s3, 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
+; GFX11-NEXT:    v_cmp_class_f32_e64 s0, s2, 64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s2, s0
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v[0:1]
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_maximum_v2f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_maximum_f32 s1, s1, s3
+; GFX12-NEXT:    s_maximum_f32 s0, s0, s2
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s[0:1]
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
+  call void asm sideeffect "; use $0", "s"(<2 x float> %op)
+  ret void
+}
+
+define <3 x float> @v_maximum_v3f32(<3 x float> %src0, <3 x float> %src1) {
+; GFX7-LABEL: v_maximum_v3f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v6, v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v3, v1, v4
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v3, v2, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v3f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v3f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v3f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v3f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7fc00000, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0x7fc00000, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v3f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v4, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v5, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7fc00000, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7fc00000, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v3f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v3
+; GFX12-NEXT:    v_maximum_f32 v1, v1, v4
+; GFX12-NEXT:    v_maximum_f32 v2, v2, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <3 x float> @llvm.maximum.v3f32(<3 x float> %src0, <3 x float> %src1)
+  ret <3 x float> %op
+}
+
+define <3 x float> @v_maximum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1) {
+; GFX7-LABEL: v_maximum_v3f32__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v6, v0, v3
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v3, v1, v4
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v3, v2, v5
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v3f32__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v3f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v3f32__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v3f32__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v3f32__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v4, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v5, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v3f32__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v3
+; GFX12-NEXT:    v_maximum_f32 v1, v1, v4
+; GFX12-NEXT:    v_maximum_f32 v2, v2, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <3 x float> @llvm.maximum.v3f32(<3 x float> %src0, <3 x float> %src1)
+  ret <3 x float> %op
+}
+
+define <3 x float> @v_maximum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) {
+; GFX7-LABEL: v_maximum_v3f32__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v6, v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v3, v1, v4
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v3, v2, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v3f32__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v3f32__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v3f32__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v3f32__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v3f32__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v4, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v5, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v3f32__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v3
+; GFX12-NEXT:    v_maximum_f32 v1, v1, v4
+; GFX12-NEXT:    v_maximum_f32 v2, v2, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <3 x float> @llvm.maximum.v3f32(<3 x float> %src0, <3 x float> %src1)
+  ret <3 x float> %op
+}
+
+define <3 x float> @v_maximum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %src1) {
+; GFX7-LABEL: v_maximum_v3f32__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v1, v4
+; GFX7-NEXT:    v_max_legacy_f32_e32 v2, v2, v5
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v3f32__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v3f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v3f32__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v3f32__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v3f32__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v3f32__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v3
+; GFX12-NEXT:    v_maximum_f32 v1, v1, v4
+; GFX12-NEXT:    v_maximum_f32 v2, v2, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <3 x float> @llvm.maximum.v3f32(<3 x float> %src0, <3 x float> %src1)
+  ret <3 x float> %op
+}
+
+define <4 x float> @v_maximum_v4f32(<4 x float> %src0, <4 x float> %src1) {
+; GFX7-LABEL: v_maximum_v4f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v8, v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v4, v1, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v4, v2, v6
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v4, v3, v7
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v4f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v4f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v4f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX940-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v4f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v4f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v6, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v4f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v4
+; GFX12-NEXT:    v_maximum_f32 v1, v1, v5
+; GFX12-NEXT:    v_maximum_f32 v2, v2, v6
+; GFX12-NEXT:    v_maximum_f32 v3, v3, v7
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <4 x float> @llvm.maximum.v4f32(<4 x float> %src0, <4 x float> %src1)
+  ret <4 x float> %op
+}
+
+define <4 x float> @v_maximum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1) {
+; GFX7-LABEL: v_maximum_v4f32__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v8, v0, v4
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v4, v1, v5
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v4, v2, v6
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v4, v3, v7
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v4f32__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v4f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v4f32__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v4f32__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v7, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v4f32__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v6, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v7, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v4f32__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v4
+; GFX12-NEXT:    v_maximum_f32 v1, v1, v5
+; GFX12-NEXT:    v_maximum_f32 v2, v2, v6
+; GFX12-NEXT:    v_maximum_f32 v3, v3, v7
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <4 x float> @llvm.maximum.v4f32(<4 x float> %src0, <4 x float> %src1)
+  ret <4 x float> %op
+}
+
+define <4 x float> @v_maximum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) {
+; GFX7-LABEL: v_maximum_v4f32__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v8, v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v4, v1, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v4, v2, v6
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v4, v3, v7
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v4f32__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v4f32__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v4f32__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX940-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v4f32__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v7, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc00000, v9, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v4f32__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v6, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v7, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc00000, v9, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v4f32__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v4
+; GFX12-NEXT:    v_maximum_f32 v1, v1, v5
+; GFX12-NEXT:    v_maximum_f32 v2, v2, v6
+; GFX12-NEXT:    v_maximum_f32 v3, v3, v7
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <4 x float> @llvm.maximum.v4f32(<4 x float> %src0, <4 x float> %src1)
+  ret <4 x float> %op
+}
+
+define <4 x float> @v_maximum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %src1) {
+; GFX7-LABEL: v_maximum_v4f32__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_max_legacy_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_max_legacy_f32_e32 v3, v3, v7
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v4f32__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v4f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v4f32__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v4f32__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v4f32__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v4f32__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v4
+; GFX12-NEXT:    v_maximum_f32 v1, v1, v5
+; GFX12-NEXT:    v_maximum_f32 v2, v2, v6
+; GFX12-NEXT:    v_maximum_f32 v3, v3, v7
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <4 x float> @llvm.maximum.v4f32(<4 x float> %src0, <4 x float> %src1)
+  ret <4 x float> %op
+}
+
+define <8 x float> @v_maximum_v8f32(<8 x float> %src0, <8 x float> %src1) {
+; GFX7-LABEL: v_maximum_v8f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v0, v8
+; GFX7-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v8, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v8, v1, v9
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v9, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v8, v2, v10
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v10, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v8, v3, v11
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v11, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v8, v4, v12
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v12, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v8, v5, v13
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v13, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v8, v6, v14
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v14, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v8, v7, v15
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v15, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v8f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v8, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v9, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v10, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v11, v3, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v11, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v12, v4, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v12, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v13, v5, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v13, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v14, v6, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v14, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v15, v7, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v15, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v8f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v8, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v9, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v10, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v11, v3, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v11, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v12, v4, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v12, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v13, v5, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v13, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v14, v6, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v14, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v15, v7, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v15, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v8f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v8
+; GFX940-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v8, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v9
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v9, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v9, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v10
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v10, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v11
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v11, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v11, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v12
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v12, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v12, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v13
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v13, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v13, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v14
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v14, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v14, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v15
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v15, v7, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v15, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v8f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, 0x7fc00000, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v8, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v9, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v11, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v12, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v10, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v11, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v12, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v13, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v14, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v15, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0x7fc00000, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v13, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v14, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v15, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v8f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, v9, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, 0x7fc00000, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v8, 64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v9, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v10
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v11
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v11, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, v12, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v10
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v11
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v10, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v11, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v12, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v13
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v13, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v14
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v14, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v15
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v15, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v13
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v14
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v15
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, 0x7fc00000, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v13, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v14, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v15, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v8f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v8
+; GFX12-NEXT:    v_maximum_f32 v1, v1, v9
+; GFX12-NEXT:    v_maximum_f32 v2, v2, v10
+; GFX12-NEXT:    v_maximum_f32 v3, v3, v11
+; GFX12-NEXT:    v_maximum_f32 v4, v4, v12
+; GFX12-NEXT:    v_maximum_f32 v5, v5, v13
+; GFX12-NEXT:    v_maximum_f32 v6, v6, v14
+; GFX12-NEXT:    v_maximum_f32 v7, v7, v15
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <8 x float> @llvm.maximum.v8f32(<8 x float> %src0, <8 x float> %src1)
+  ret <8 x float> %op
+}
+
+define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
+; GFX7-LABEL: v_maximum_v16f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v32, v0, v16
+; GFX7-NEXT:    v_mov_b32_e32 v31, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v32, v31, v32, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v16, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v1, v17
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v17, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v2, v18
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v18
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v18, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v3, v19
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v3, v19
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v19, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v4, v20
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v4, v20
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v20, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v5, v21
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v5, v21
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v21, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v6, v22
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v6, v22
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v22, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v7, v23
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v7, v23
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v23, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v8, v24
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v8, v24
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v8, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v24, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v9, v25
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v9, v25
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v9, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v25, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v10, v26
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v10, v26
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v10, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v26, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v11, v27
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v11, v27
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v11, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v27, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v12, v28
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v12, v28
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v12, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v28, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v13, v29
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v13, v29
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v13, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v29, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v14, v30
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v14, v30
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v14, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v30, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v16, v15, v17
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v15, v17
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v15, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v17, 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v15, v15, v17, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v16f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v16, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v31, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v31, v32, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v16, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v17, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v18, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v19, v3, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v19, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v20, v4, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v20, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v21, v5, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v5, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v21, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v22, v6, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v22, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v23, v7, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v7, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v23, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v24, v8, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v8, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v8, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v24, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v25
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v25, v9, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v9, v25
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v9, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v25, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v26
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v26, v10, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v10, v26
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v10, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v26, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v27
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v27, v11, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v11, v27
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v11, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v27, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v28
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v28, v12, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v12, v28
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v12, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v28, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v29
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v29, v13, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v13, v29
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v13, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v29, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v14, v30
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v30, v14, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v14, v30
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v14, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v30, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v15, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v15, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v15, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v17, 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v17, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v16f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v16, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v31, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v31, v32, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v16, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v17, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v18, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v19
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v19, v3, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v19
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v19, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v20
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v20, v4, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v20
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v20, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v21
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v21, v5, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v21
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v21, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v22
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v22, v6, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v22
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v22, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v23
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v23, v7, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v23
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v23, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v24
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v24, v8, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v24
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v8, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v24, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v25
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v25, v9, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v9, v25
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v9, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v25, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v26
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v26, v10, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v10, v26
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v10, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v26, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v27
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v27, v11, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v11, v27
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v11, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v27, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v28
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v28, v12, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v12, v28
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v12, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v28, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v29
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v29, v13, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v13, v29
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v13, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v29, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v14, v30
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v30, v14, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v14, v30
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v14, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v30, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v15, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v15, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v15, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v17, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v17, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v16f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    scratch_load_dword v31, off, s32
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v16
+; GFX940-NEXT:    v_mov_b32_e32 v32, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v33, v16, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v33, v32, v33, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v16, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v33
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v17
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v17, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v17, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v18
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v18
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v18, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v19
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v19, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v19
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v19, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v20
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v20, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v4, v20
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v20, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v21
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v21, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v5, v21
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v21, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v22
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v22, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v6, v22
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v22, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v23
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v23, v7, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v7, v23
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v23, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v24
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v24, v8, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v8, v24
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v8, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v24, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v9, v25
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v25, v9, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v9, v25
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v9, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v25, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v10, v26
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v26, v10, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v10, v26
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v10, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v26, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v11, v27
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v27, v11, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v11, v27
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v11, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v27, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v12, v28
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v28, v12, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v12, v28
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v12, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v28, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v13, v29
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v29, v13, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v13, v29
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v13, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v29, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v14, v30
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v30, v14, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v14, v30
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v14, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v30, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f32_e32 vcc, v15, v31
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v31, v15, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v15, v31
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v15, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v31, 64
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v15, v15, v31, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v16f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v16
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    v_cndmask_b32_e32 v32, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v33, v17, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v34, v18, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v35, v19, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v36, v20, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v21
+; GFX10-NEXT:    v_cndmask_b32_e32 v37, v21, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v22
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, v22, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v23
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, v23, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v48, v24, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v25
+; GFX10-NEXT:    v_cndmask_b32_e32 v49, v25, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v26
+; GFX10-NEXT:    v_cndmask_b32_e32 v50, v26, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v27
+; GFX10-NEXT:    v_cndmask_b32_e32 v51, v27, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v28
+; GFX10-NEXT:    v_cndmask_b32_e32 v52, v28, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v29
+; GFX10-NEXT:    v_cndmask_b32_e32 v53, v29, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v30
+; GFX10-NEXT:    v_cndmask_b32_e32 v54, v30, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v32, 0x7fc00000, v32, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v33, 0x7fc00000, v33, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v34, 0x7fc00000, v34, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v35, 0x7fc00000, v35, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v36, 0x7fc00000, v36, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v21
+; GFX10-NEXT:    v_cndmask_b32_e32 v37, 0x7fc00000, v37, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v22
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, 0x7fc00000, v38, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v23
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, 0x7fc00000, v39, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v48, 0x7fc00000, v48, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v25
+; GFX10-NEXT:    v_cndmask_b32_e32 v49, 0x7fc00000, v49, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v26
+; GFX10-NEXT:    v_cndmask_b32_e32 v50, 0x7fc00000, v50, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v27
+; GFX10-NEXT:    v_cndmask_b32_e32 v51, 0x7fc00000, v51, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v28
+; GFX10-NEXT:    v_cndmask_b32_e32 v52, 0x7fc00000, v52, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v13, v29
+; GFX10-NEXT:    v_cndmask_b32_e32 v53, 0x7fc00000, v53, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v30
+; GFX10-NEXT:    v_cndmask_b32_e32 v54, 0x7fc00000, v54, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v33, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v34, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v35, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v36, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v37, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v38, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v39, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v8, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v48, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v9, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v49, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v10, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v11, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v12, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v52, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v13, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v53, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v14, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v54, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v16, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v17, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v18, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v19, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v20, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v21, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v22, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v23, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v24, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v25, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v26, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v27, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v28, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v29, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v30, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v33
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v33, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v34
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v34, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v35
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v35, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v36, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v37
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v37, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v38
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v38, v6, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v31
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v31, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v39, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v48, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v31
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v49
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v49, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v15, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v52, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v31, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v31, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v53, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v54
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v54, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v16f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v16
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    v_cndmask_b32_e32 v32, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v1, v17
+; GFX11-NEXT:    v_cndmask_b32_e32 v33, v17, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v34, v18, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v19
+; GFX11-NEXT:    v_cndmask_b32_e32 v35, v19, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v20
+; GFX11-NEXT:    v_cndmask_b32_e32 v36, v20, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v21
+; GFX11-NEXT:    v_cndmask_b32_e32 v37, v21, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v22
+; GFX11-NEXT:    v_cndmask_b32_e32 v38, v22, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v7, v23
+; GFX11-NEXT:    v_cndmask_b32_e32 v39, v23, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v24
+; GFX11-NEXT:    v_cndmask_b32_e32 v48, v24, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v25
+; GFX11-NEXT:    v_cndmask_b32_e32 v49, v25, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v26
+; GFX11-NEXT:    v_cndmask_b32_e32 v50, v26, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v11, v27
+; GFX11-NEXT:    v_cndmask_b32_e32 v51, v27, v11, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v12, v28
+; GFX11-NEXT:    v_cndmask_b32_e32 v52, v28, v12, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v13, v29
+; GFX11-NEXT:    v_cndmask_b32_e32 v53, v29, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v14, v30
+; GFX11-NEXT:    v_cndmask_b32_e32 v54, v30, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v16
+; GFX11-NEXT:    v_cndmask_b32_e32 v32, 0x7fc00000, v32, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v17
+; GFX11-NEXT:    v_cndmask_b32_e32 v33, 0x7fc00000, v33, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v34, 0x7fc00000, v34, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v19
+; GFX11-NEXT:    v_cndmask_b32_e32 v35, 0x7fc00000, v35, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v20
+; GFX11-NEXT:    v_cndmask_b32_e32 v36, 0x7fc00000, v36, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v21
+; GFX11-NEXT:    v_cndmask_b32_e32 v37, 0x7fc00000, v37, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v22
+; GFX11-NEXT:    v_cndmask_b32_e32 v38, 0x7fc00000, v38, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v23
+; GFX11-NEXT:    v_cndmask_b32_e32 v39, 0x7fc00000, v39, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v24
+; GFX11-NEXT:    v_cndmask_b32_e32 v48, 0x7fc00000, v48, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v25
+; GFX11-NEXT:    v_cndmask_b32_e32 v49, 0x7fc00000, v49, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v26
+; GFX11-NEXT:    v_cndmask_b32_e32 v50, 0x7fc00000, v50, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v27
+; GFX11-NEXT:    v_cndmask_b32_e32 v51, 0x7fc00000, v51, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v28
+; GFX11-NEXT:    v_cndmask_b32_e32 v52, 0x7fc00000, v52, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v13, v29
+; GFX11-NEXT:    v_cndmask_b32_e32 v53, 0x7fc00000, v53, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v30
+; GFX11-NEXT:    v_cndmask_b32_e32 v54, 0x7fc00000, v54, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v33, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v34, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v35, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v36, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v37, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v38, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v39, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v8, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v48, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v9, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v49, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v10, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v11, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v12, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v52, v12, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v13, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, v53, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v14, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, v54, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v16, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v17, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v18, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v19, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v20, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v21, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v22, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v23, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v24, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v25, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v26, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v27, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v28, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v29, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v30, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v32
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v33
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v33, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v34
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v34, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v35
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v35, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v36, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v37
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v37, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v38
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v38, v6, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v15, v31
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, v31, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v39, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v48, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v31
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v49
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v49, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v15, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v51
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v52, v12, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v31, 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, v15, v31, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, v53, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v54
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, v54, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v16f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v31, off, s32
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v16
+; GFX12-NEXT:    v_maximum_f32 v1, v1, v17
+; GFX12-NEXT:    v_maximum_f32 v2, v2, v18
+; GFX12-NEXT:    v_maximum_f32 v3, v3, v19
+; GFX12-NEXT:    v_maximum_f32 v4, v4, v20
+; GFX12-NEXT:    v_maximum_f32 v5, v5, v21
+; GFX12-NEXT:    v_maximum_f32 v6, v6, v22
+; GFX12-NEXT:    v_maximum_f32 v7, v7, v23
+; GFX12-NEXT:    v_maximum_f32 v8, v8, v24
+; GFX12-NEXT:    v_maximum_f32 v9, v9, v25
+; GFX12-NEXT:    v_maximum_f32 v10, v10, v26
+; GFX12-NEXT:    v_maximum_f32 v11, v11, v27
+; GFX12-NEXT:    v_maximum_f32 v12, v12, v28
+; GFX12-NEXT:    v_maximum_f32 v13, v13, v29
+; GFX12-NEXT:    v_maximum_f32 v14, v14, v30
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v15, v15, v31
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <16 x float> @llvm.maximum.v16f32(<16 x float> %src0, <16 x float> %src1)
+  ret <16 x float> %op
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
new file mode 100644
index 000000000000..d60a28e74043
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -0,0 +1,6157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+define double @v_maximum_f64(double %src0, double %src1) {
+; GFX7-LABEL: v_maximum_f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX7-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX10-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX11-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call double @llvm.maximum.f64(double %src0, double %src1)
+  ret double %op
+}
+
+define double @v_maximum_f64__nnan(double %src0, double %src1) {
+; GFX7-LABEL: v_maximum_f64__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f64__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f64__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f64__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f64__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f64__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f64__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan double @llvm.maximum.f64(double %src0, double %src1)
+  ret double %op
+}
+
+define double @v_maximum_f64__nsz(double %src0, double %src1) {
+; GFX7-LABEL: v_maximum_f64__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX7-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f64__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f64__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f64__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f64__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX10-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f64__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX11-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f64__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz double @llvm.maximum.f64(double %src0, double %src1)
+  ret double %op
+}
+
+define double @v_maximum_f64__nnan_nsz(double %src0, double %src1) {
+; GFX7-LABEL: v_maximum_f64__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f64__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f64__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f64__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f64__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f64__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f64__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz double @llvm.maximum.f64(double %src0, double %src1)
+  ret double %op
+}
+
+define double @v_maximum_f64__nnan_src0(double %arg0, double %src1) {
+; GFX7-LABEL: v_maximum_f64__nnan_src0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX7-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f64__nnan_src0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f64__nnan_src0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f64__nnan_src0:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f64__nnan_src0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX10-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX10-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f64__nnan_src0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX11-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f64__nnan_src0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_f64_e32 v[0:1], 1.0, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %src0 = fadd nnan double %arg0, 1.0
+  %op = call double @llvm.maximum.f64(double %src0, double %src1)
+  ret double %op
+}
+
+define double @v_maximum_f64__nnan_src1(double %src0, double %arg1) {
+; GFX7-LABEL: v_maximum_f64__nnan_src1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX7-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_f64__nnan_src1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_f64__nnan_src1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_f64__nnan_src1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_f64__nnan_src1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX10-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX10-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_f64__nnan_src1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX11-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_f64__nnan_src1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_f64_e32 v[2:3], 1.0, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %src1 = fadd nnan double %arg1, 1.0
+  %op = call double @llvm.maximum.f64(double %src0, double %src1)
+  ret double %op
+}
+
+define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
+; GFX7-LABEL: s_maximum_f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_max_f64 v[2:3], s[4:5], v[0:1]
+; GFX7-NEXT:    v_cmp_u_f64_e32 vcc, s[4:5], v[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use v[0:1]
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_maximum_f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_max_f64 v[2:3], s[4:5], v[0:1]
+; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use v[0:1]
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_maximum_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_max_f64 v[2:3], s[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v[0:1]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_maximum_f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NEXT:    v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v[0:1]
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_maximum_f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f64 v[0:1], s[4:5], s[6:7]
+; GFX10-NEXT:    v_cmp_u_f64_e64 s4, s[4:5], s[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v[0:1]
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_maximum_f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], s[0:1], s[2:3]
+; GFX11-NEXT:    v_cmp_u_f64_e64 s0, s[0:1], s[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v[0:1]
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_maximum_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use v[0:1]
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call double @llvm.maximum.f64(double %src0, double %src1)
+  call void asm sideeffect "; use $0", "s"(double %op)
+  ret void
+}
+
+define <2 x double> @v_maximum_v2f64(<2 x double> %src0, <2 x double> %src1) {
+; GFX7-LABEL: v_maximum_v2f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[0:1], 64
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[8:9], v[4:5], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v5, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v10, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[8:9]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v7, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v10, v11, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 64
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v2f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[0:1], 64
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[8:9], v[4:5], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v5, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v10, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[8:9]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v7, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v10, v11, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 64
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v2f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[0:1], 64
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[8:9], v[4:5], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v5, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v10, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[8:9]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v7, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v10, v11, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 64
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v2f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX940-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[4:5]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v5, v1, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v10, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[4:5], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v10, v4, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[6:7], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[4:5]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v2f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s4, v[2:3], v[6:7]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s5, v[0:1], v[4:5]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[2:3], v[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v7, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v6, v2, s4
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[2:3], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0x7ff80000, v8, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v10, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, v12, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v13, s6
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[4:5], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s6, v[6:7], 64
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s7, 0, v[8:9]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s8, 0, v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v2f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s1, v[0:1], v[4:5]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[6:7]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s2, v[2:3], v[6:7]
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v5, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v7, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v6, v2, s0
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[2:3], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0x7ff80000, v8, s1
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v10, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, 0, v13, s2
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[6:7], 64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s4, 0, v[10:11]
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, v12, s1
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[4:5], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s3, 0, v[8:9]
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v2f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
+  ret <2 x double> %op
+}
+
+define <2 x double> @v_maximum_v2f64__nnan(<2 x double> %src0, <2 x double> %src1) {
+; GFX7-LABEL: v_maximum_v2f64__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], 64
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[10:11], v[6:7], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v9, v5, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v7, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, v6, v2, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], 64
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[8:9]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v2f64__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], 64
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[10:11], v[6:7], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v5, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v7, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v6, v2, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], 64
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[8:9]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v2f64__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], 64
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[10:11], v[6:7], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v5, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v7, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v6, v2, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], 64
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[8:9]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v2f64__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[4:5], 64
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v5, v1, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v7, v3, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[6:7], 64
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[4:5]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[0:1]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v2f64__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s4, v[2:3], v[6:7]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[4:5], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s6, v[6:7], 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v7, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v6, v2, s4
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[2:3], 64
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s7, 0, v[8:9]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s8, 0, v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v2f64__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[6:7]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[4:5], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[6:7], 64
+; GFX11-NEXT:    v_dual_cndmask_b32 v9, v5, v1 :: v_dual_cndmask_b32 v8, v4, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v7, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v6, v2, s0
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[2:3], 64
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s3, 0, v[8:9]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s4, 0, v[10:11]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v2f64__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
+  ret <2 x double> %op
+}
+
+define <2 x double> @v_maximum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1) {
+; GFX7-LABEL: v_maximum_v2f64__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[2:3], v[6:7]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[2:3], v[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v6, v2, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[8:9]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v2f64__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[2:3], v[6:7]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[2:3], v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v6, v2, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[8:9]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v2f64__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[2:3], v[6:7]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[2:3], v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v2, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[8:9]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v2f64__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[4:5]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, 0, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, 0, v5, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v2f64__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s4, v[2:3], v[6:7]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s5, v[0:1], v[4:5]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[2:3], v[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v6, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, v8, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v9, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x7ff80000, v1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0x7ff80000, v3, s6
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v2f64__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[6:7]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s1, v[0:1], v[4:5]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s2, v[2:3], v[6:7]
+; GFX11-NEXT:    v_dual_cndmask_b32 v8, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v6, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, v8, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0x7ff80000, v1, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, v9, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0x7ff80000, v3, s2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v2f64__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
+  ret <2 x double> %op
+}
+
+define <2 x double> @v_maximum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double> %src1) {
+; GFX7-LABEL: v_maximum_v2f64__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v2f64__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v2f64__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v2f64__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[6:7]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v2f64__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s4, v[2:3], v[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v2f64__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[6:7]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v2f64__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
+  ret <2 x double> %op
+}
+
+define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) {
+; GFX7-LABEL: s_maximum_v2f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s10
+; GFX7-NEXT:    v_mov_b32_e32 v1, s11
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[12:13], s[6:7], v[0:1]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[18:19], s[10:11], 64
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7-NEXT:    s_and_b64 s[14:15], vcc, exec
+; GFX7-NEXT:    s_cselect_b32 s16, s7, s11
+; GFX7-NEXT:    s_and_b64 s[14:15], s[12:13], exec
+; GFX7-NEXT:    s_cselect_b32 s15, s16, 0x7ff80000
+; GFX7-NEXT:    s_and_b64 s[16:17], vcc, exec
+; GFX7-NEXT:    s_cselect_b32 s14, s6, s10
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[16:17], s[6:7], 64
+; GFX7-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX7-NEXT:    s_cselect_b32 s14, s14, 0
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[20:21], s[14:15], 0
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX7-NEXT:    s_and_b64 s[12:13], s[16:17], exec
+; GFX7-NEXT:    s_cselect_b32 s7, s7, s15
+; GFX7-NEXT:    s_and_b64 s[12:13], s[18:19], exec
+; GFX7-NEXT:    s_cselect_b32 s7, s11, s7
+; GFX7-NEXT:    s_and_b64 s[12:13], s[20:21], exec
+; GFX7-NEXT:    s_cselect_b32 s7, s7, s15
+; GFX7-NEXT:    s_and_b64 s[12:13], s[16:17], exec
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[12:13], s[4:5], v[0:1]
+; GFX7-NEXT:    s_cselect_b32 s6, s6, s14
+; GFX7-NEXT:    s_and_b64 s[16:17], s[18:19], exec
+; GFX7-NEXT:    s_cselect_b32 s6, s10, s6
+; GFX7-NEXT:    s_and_b64 s[10:11], s[20:21], exec
+; GFX7-NEXT:    s_cselect_b32 s6, s6, s14
+; GFX7-NEXT:    s_and_b64 s[10:11], vcc, exec
+; GFX7-NEXT:    s_cselect_b32 s14, s5, s9
+; GFX7-NEXT:    s_and_b64 s[10:11], s[12:13], exec
+; GFX7-NEXT:    s_cselect_b32 s11, s14, 0x7ff80000
+; GFX7-NEXT:    s_and_b64 s[14:15], vcc, exec
+; GFX7-NEXT:    s_cselect_b32 s10, s4, s8
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[14:15], s[4:5], 64
+; GFX7-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[12:13], s[8:9], 64
+; GFX7-NEXT:    s_cselect_b32 s10, s10, 0
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[16:17], s[10:11], 0
+; GFX7-NEXT:    s_and_b64 s[18:19], s[14:15], exec
+; GFX7-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX7-NEXT:    s_and_b64 s[18:19], s[12:13], exec
+; GFX7-NEXT:    s_cselect_b32 s5, s9, s5
+; GFX7-NEXT:    s_and_b64 s[18:19], s[16:17], exec
+; GFX7-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX7-NEXT:    s_and_b64 s[14:15], s[14:15], exec
+; GFX7-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX7-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX7-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX7-NEXT:    s_and_b64 s[8:9], s[16:17], exec
+; GFX7-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use s[4:7]
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_maximum_v2f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[12:13], s[6:7], v[0:1]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[18:19], s[10:11], 64
+; GFX8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    s_and_b64 s[14:15], vcc, exec
+; GFX8-NEXT:    s_cselect_b32 s16, s7, s11
+; GFX8-NEXT:    s_and_b64 s[14:15], s[12:13], exec
+; GFX8-NEXT:    s_cselect_b32 s15, s16, 0x7ff80000
+; GFX8-NEXT:    s_and_b64 s[16:17], vcc, exec
+; GFX8-NEXT:    s_cselect_b32 s14, s6, s10
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[16:17], s[6:7], 64
+; GFX8-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX8-NEXT:    s_cselect_b32 s14, s14, 0
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[20:21], s[14:15], 0
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT:    s_and_b64 s[12:13], s[16:17], exec
+; GFX8-NEXT:    s_cselect_b32 s7, s7, s15
+; GFX8-NEXT:    s_and_b64 s[12:13], s[18:19], exec
+; GFX8-NEXT:    s_cselect_b32 s7, s11, s7
+; GFX8-NEXT:    s_and_b64 s[12:13], s[20:21], exec
+; GFX8-NEXT:    s_cselect_b32 s7, s7, s15
+; GFX8-NEXT:    s_and_b64 s[12:13], s[16:17], exec
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[12:13], s[4:5], v[0:1]
+; GFX8-NEXT:    s_cselect_b32 s6, s6, s14
+; GFX8-NEXT:    s_and_b64 s[16:17], s[18:19], exec
+; GFX8-NEXT:    s_cselect_b32 s6, s10, s6
+; GFX8-NEXT:    s_and_b64 s[10:11], s[20:21], exec
+; GFX8-NEXT:    s_cselect_b32 s6, s6, s14
+; GFX8-NEXT:    s_and_b64 s[10:11], vcc, exec
+; GFX8-NEXT:    s_cselect_b32 s14, s5, s9
+; GFX8-NEXT:    s_and_b64 s[10:11], s[12:13], exec
+; GFX8-NEXT:    s_cselect_b32 s11, s14, 0x7ff80000
+; GFX8-NEXT:    s_and_b64 s[14:15], vcc, exec
+; GFX8-NEXT:    s_cselect_b32 s10, s4, s8
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[14:15], s[4:5], 64
+; GFX8-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[12:13], s[8:9], 64
+; GFX8-NEXT:    s_cselect_b32 s10, s10, 0
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[16:17], s[10:11], 0
+; GFX8-NEXT:    s_and_b64 s[18:19], s[14:15], exec
+; GFX8-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX8-NEXT:    s_and_b64 s[18:19], s[12:13], exec
+; GFX8-NEXT:    s_cselect_b32 s5, s9, s5
+; GFX8-NEXT:    s_and_b64 s[18:19], s[16:17], exec
+; GFX8-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX8-NEXT:    s_and_b64 s[14:15], s[14:15], exec
+; GFX8-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX8-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX8-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX8-NEXT:    s_and_b64 s[8:9], s[16:17], exec
+; GFX8-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use s[4:7]
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_maximum_v2f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s10
+; GFX9-NEXT:    v_mov_b32_e32 v1, s11
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[12:13], s[6:7], v[0:1]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[18:19], s[10:11], 64
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_and_b64 s[14:15], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s16, s7, s11
+; GFX9-NEXT:    s_and_b64 s[14:15], s[12:13], exec
+; GFX9-NEXT:    s_cselect_b32 s15, s16, 0x7ff80000
+; GFX9-NEXT:    s_and_b64 s[16:17], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s14, s6, s10
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[16:17], s[6:7], 64
+; GFX9-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX9-NEXT:    s_cselect_b32 s14, s14, 0
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[20:21], s[14:15], 0
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT:    s_and_b64 s[12:13], s[16:17], exec
+; GFX9-NEXT:    s_cselect_b32 s7, s7, s15
+; GFX9-NEXT:    s_and_b64 s[12:13], s[18:19], exec
+; GFX9-NEXT:    s_cselect_b32 s7, s11, s7
+; GFX9-NEXT:    s_and_b64 s[12:13], s[20:21], exec
+; GFX9-NEXT:    s_cselect_b32 s7, s7, s15
+; GFX9-NEXT:    s_and_b64 s[12:13], s[16:17], exec
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[12:13], s[4:5], v[0:1]
+; GFX9-NEXT:    s_cselect_b32 s6, s6, s14
+; GFX9-NEXT:    s_and_b64 s[16:17], s[18:19], exec
+; GFX9-NEXT:    s_cselect_b32 s6, s10, s6
+; GFX9-NEXT:    s_and_b64 s[10:11], s[20:21], exec
+; GFX9-NEXT:    s_cselect_b32 s6, s6, s14
+; GFX9-NEXT:    s_and_b64 s[10:11], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s14, s5, s9
+; GFX9-NEXT:    s_and_b64 s[10:11], s[12:13], exec
+; GFX9-NEXT:    s_cselect_b32 s11, s14, 0x7ff80000
+; GFX9-NEXT:    s_and_b64 s[14:15], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s10, s4, s8
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[14:15], s[4:5], 64
+; GFX9-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[12:13], s[8:9], 64
+; GFX9-NEXT:    s_cselect_b32 s10, s10, 0
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[16:17], s[10:11], 0
+; GFX9-NEXT:    s_and_b64 s[18:19], s[14:15], exec
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX9-NEXT:    s_and_b64 s[18:19], s[12:13], exec
+; GFX9-NEXT:    s_cselect_b32 s5, s9, s5
+; GFX9-NEXT:    s_and_b64 s[18:19], s[16:17], exec
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX9-NEXT:    s_and_b64 s[14:15], s[14:15], exec
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX9-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX9-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX9-NEXT:    s_and_b64 s[8:9], s[16:17], exec
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[4:7]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_maximum_v2f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, s[2:3], v[0:1]
+; GFX940-NEXT:    s_and_b64 s[8:9], vcc, exec
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[8:9], s[2:3], v[0:1]
+; GFX940-NEXT:    s_cselect_b32 s12, s3, s7
+; GFX940-NEXT:    s_and_b64 s[10:11], s[8:9], exec
+; GFX940-NEXT:    s_cselect_b32 s11, s12, 0x7ff80000
+; GFX940-NEXT:    s_and_b64 s[12:13], vcc, exec
+; GFX940-NEXT:    s_cselect_b32 s10, s2, s6
+; GFX940-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[12:13], s[2:3], 64
+; GFX940-NEXT:    s_cselect_b32 s10, s10, 0
+; GFX940-NEXT:    s_and_b64 s[14:15], s[12:13], exec
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[14:15], s[6:7], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[8:9], s[10:11], 0
+; GFX940-NEXT:    s_cselect_b32 s3, s3, s11
+; GFX940-NEXT:    s_and_b64 s[16:17], s[14:15], exec
+; GFX940-NEXT:    s_cselect_b32 s3, s7, s3
+; GFX940-NEXT:    s_and_b64 s[16:17], s[8:9], exec
+; GFX940-NEXT:    s_cselect_b32 s7, s3, s11
+; GFX940-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX940-NEXT:    s_cselect_b32 s11, s2, s10
+; GFX940-NEXT:    s_and_b64 s[2:3], s[14:15], exec
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT:    s_cselect_b32 s6, s6, s11
+; GFX940-NEXT:    s_and_b64 s[2:3], s[8:9], exec
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX940-NEXT:    s_cselect_b32 s6, s6, s10
+; GFX940-NEXT:    s_and_b64 s[2:3], vcc, exec
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[2:3], s[0:1], v[0:1]
+; GFX940-NEXT:    s_cselect_b32 s10, s1, s5
+; GFX940-NEXT:    s_and_b64 s[8:9], s[2:3], exec
+; GFX940-NEXT:    s_cselect_b32 s9, s10, 0x7ff80000
+; GFX940-NEXT:    s_and_b64 s[10:11], vcc, exec
+; GFX940-NEXT:    s_cselect_b32 s8, s0, s4
+; GFX940-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[10:11], s[0:1], 64
+; GFX940-NEXT:    s_cselect_b32 s8, s8, 0
+; GFX940-NEXT:    s_and_b64 s[12:13], s[10:11], exec
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[12:13], s[4:5], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], s[8:9], 0
+; GFX940-NEXT:    s_cselect_b32 s1, s1, s9
+; GFX940-NEXT:    s_and_b64 s[14:15], s[12:13], exec
+; GFX940-NEXT:    s_cselect_b32 s1, s5, s1
+; GFX940-NEXT:    s_and_b64 s[14:15], s[2:3], exec
+; GFX940-NEXT:    s_cselect_b32 s5, s1, s9
+; GFX940-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GFX940-NEXT:    s_cselect_b32 s9, s0, s8
+; GFX940-NEXT:    s_and_b64 s[0:1], s[12:13], exec
+; GFX940-NEXT:    s_cselect_b32 s4, s4, s9
+; GFX940-NEXT:    s_and_b64 s[0:1], s[2:3], exec
+; GFX940-NEXT:    s_cselect_b32 s4, s4, s8
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use s[4:7]
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_maximum_v2f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s12, s[6:7], s[10:11]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s14, s[6:7], s[10:11]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s15, s[6:7], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s16, s[10:11], 64
+; GFX10-NEXT:    v_cmp_o_f64_e64 s18, s[4:5], s[8:9]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s19, s[4:5], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s20, s[8:9], 64
+; GFX10-NEXT:    s_and_b32 s13, s12, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s13, s7, s11
+; GFX10-NEXT:    s_and_b32 s17, s14, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s13, s13, 0x7ff80000
+; GFX10-NEXT:    s_and_b32 s12, s12, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s12, s6, s10
+; GFX10-NEXT:    s_and_b32 s14, s14, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s12, s12, 0
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s17, s[4:5], s[8:9]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s14, s[12:13], 0
+; GFX10-NEXT:    s_and_b32 s21, s15, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s7, s7, s13
+; GFX10-NEXT:    s_and_b32 s21, s16, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s7, s11, s7
+; GFX10-NEXT:    s_and_b32 s11, s14, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s7, s7, s13
+; GFX10-NEXT:    s_and_b32 s11, s15, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s6, s6, s12
+; GFX10-NEXT:    s_and_b32 s11, s16, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s6, s10, s6
+; GFX10-NEXT:    s_and_b32 s10, s14, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s6, s6, s12
+; GFX10-NEXT:    s_and_b32 s10, s17, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s10, s5, s9
+; GFX10-NEXT:    s_and_b32 s11, s18, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s11, s10, 0x7ff80000
+; GFX10-NEXT:    s_and_b32 s10, s17, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s10, s4, s8
+; GFX10-NEXT:    s_and_b32 s12, s18, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s10, s10, 0
+; GFX10-NEXT:    s_and_b32 s13, s19, exec_lo
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s12, s[10:11], 0
+; GFX10-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX10-NEXT:    s_and_b32 s13, s20, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s5, s9, s5
+; GFX10-NEXT:    s_and_b32 s9, s12, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX10-NEXT:    s_and_b32 s9, s19, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX10-NEXT:    s_and_b32 s9, s20, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX10-NEXT:    s_and_b32 s8, s12, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use s[4:7]
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_maximum_v2f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s8, s[2:3], s[6:7]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s10, s[2:3], s[6:7]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s11, s[2:3], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s12, s[6:7], 64
+; GFX11-NEXT:    v_cmp_o_f64_e64 s14, s[0:1], s[4:5]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s15, s[0:1], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s16, s[4:5], 64
+; GFX11-NEXT:    s_and_b32 s9, s8, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s9, s3, s7
+; GFX11-NEXT:    s_and_b32 s13, s10, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s9, s9, 0x7ff80000
+; GFX11-NEXT:    s_and_b32 s8, s8, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s8, s2, s6
+; GFX11-NEXT:    s_and_b32 s10, s10, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s8, s8, 0
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s13, s[0:1], s[4:5]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s10, s[8:9], 0
+; GFX11-NEXT:    s_and_b32 s17, s11, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s3, s3, s9
+; GFX11-NEXT:    s_and_b32 s17, s12, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s3, s7, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 s7, s10, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s3, s3, s9
+; GFX11-NEXT:    s_and_b32 s7, s11, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, s2, s8
+; GFX11-NEXT:    s_and_b32 s7, s12, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX11-NEXT:    s_and_b32 s6, s10, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, s2, s8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_and_b32 s6, s13, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s6, s1, s5
+; GFX11-NEXT:    s_and_b32 s7, s14, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s7, s6, 0x7ff80000
+; GFX11-NEXT:    s_and_b32 s6, s13, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s6, s0, s4
+; GFX11-NEXT:    s_and_b32 s8, s14, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s6, s6, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_and_b32 s9, s15, exec_lo
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s8, s[6:7], 0
+; GFX11-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX11-NEXT:    s_and_b32 s9, s16, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s1, s5, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 s5, s8, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX11-NEXT:    s_and_b32 s5, s15, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX11-NEXT:    s_and_b32 s5, s16, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, s4, s0
+; GFX11-NEXT:    s_and_b32 s4, s8, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s[0:3]
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_maximum_v2f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[2:3], s[2:3], s[6:7]
+; GFX12-NEXT:    v_maximum_f64 v[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use v[0:3]
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
+  call void asm sideeffect "; use $0", "s"(<2 x double> %op)
+  ret void
+}
+
+define <3 x double> @v_maximum_v3f64(<3 x double> %src0, <3 x double> %src1) {
+; GFX7-LABEL: v_maximum_v3f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[2:3], v[8:9]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[10:11], v[2:3], v[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v7, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v14, v12, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 64
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v9, v3, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[2:3], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v14, v6, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s[8:9]
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[8:9], v[8:9], 64
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[4:5], v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[10:11]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v11, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v10, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v14, v12, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, 0, v8, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[10:11], 64
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v3f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[2:3], v[8:9]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[10:11], v[2:3], v[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v7, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v14, v12, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 64
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v9, v3, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[2:3], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v14, v6, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s[8:9]
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[8:9], v[8:9], 64
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[4:5], v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[10:11]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v11, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v14, v12, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, v8, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[10:11], 64
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v3f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[2:3], v[8:9]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[10:11], v[2:3], v[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v7, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v14, v12, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 64
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v9, v3, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[2:3], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v14, v6, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s[8:9]
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[8:9], v[8:9], 64
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[4:5], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[10:11]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v11, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v10, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v14, v12, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, v8, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[10:11], 64
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v3f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX940-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v7, v1, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v13, v14, v12, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[6:7], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[12:13]
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v14, v6, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v8, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[8:9], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v11, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[4:5], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v14, v6, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v10, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[10:11], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v3f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s4, v[2:3], v[8:9]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s5, v[4:5], v[10:11]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[0:1], v[6:7]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s7, v[2:3], v[8:9]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s8, v[4:5], v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v9, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v11, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v8, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v10, v4, s5
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[2:3], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[4:5], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0x7ff80000, v12, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0x7ff80000, v14, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, v17, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0x7ff80000, v16, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, v18, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, v19, s8
+; GFX10-NEXT:    v_cmp_class_f64_e64 s6, v[8:9], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s7, v[6:7], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s8, v[10:11], 64
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s9, 0, v[12:13]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s10, 0, v[14:15]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s11, 0, v[16:17]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s11
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v3f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[8:9]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s1, v[4:5], v[10:11]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s2, v[0:1], v[6:7]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s3, v[2:3], v[8:9]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s4, v[4:5], v[10:11]
+; GFX11-NEXT:    v_dual_cndmask_b32 v12, v7, v1 :: v_dual_cndmask_b32 v17, v6, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v9, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v16, v11, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, v8, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v19, v10, v4, s1
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[2:3], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[4:5], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, 0x7ff80000, v12, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, 0x7ff80000, v14, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, 0, v17, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v17, 0x7ff80000, v16, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, 0, v18, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v16, 0, v19, s4
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[8:9], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[6:7], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s4, v[10:11], 64
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s5, 0, v[12:13]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s6, 0, v[14:15]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s7, 0, v[16:17]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v12, v0 :: v_dual_cndmask_b32 v1, v13, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s7
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v3f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[6:7]
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[8:9]
+; GFX12-NEXT:    v_maximum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <3 x double> @llvm.maximum.v3f64(<3 x double> %src0, <3 x double> %src1)
+  ret <3 x double> %op
+}
+
+define <3 x double> @v_maximum_v3f64__nnan(<3 x double> %src0, <3 x double> %src1) {
+; GFX7-LABEL: v_maximum_v3f64__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 64
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[2:3], v[8:9]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[10:11], v[10:11], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v7, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v9, v3, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v11, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v10, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[8:9], 64
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[6:7]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v3f64__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 64
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[2:3], v[8:9]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[10:11], v[10:11], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v7, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v9, v3, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v11, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v10, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[8:9], 64
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[6:7]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v3f64__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 64
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[2:3], v[8:9]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[10:11], v[10:11], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v7, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v3, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v11, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v10, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[8:9], 64
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[6:7]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v3f64__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[6:7], 64
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v13, v7, v1, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[12:13]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v9, v3, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v8, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[8:9], 64
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[0:1]
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v11, v5, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v10, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[10:11], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v3f64__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s4, v[2:3], v[8:9]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s5, v[4:5], v[10:11]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s6, v[8:9], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s7, v[6:7], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s8, v[10:11], 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v9, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v11, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v8, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v10, v4, s5
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[2:3], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[4:5], 64
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s9, 0, v[12:13]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s10, 0, v[14:15]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s11, 0, v[16:17]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s11
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v3f64__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[8:9]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s1, v[4:5], v[10:11]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[8:9], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[6:7], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s4, v[10:11], 64
+; GFX11-NEXT:    v_dual_cndmask_b32 v13, v7, v1 :: v_dual_cndmask_b32 v12, v6, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, v9, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v17, v11, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v8, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v16, v10, v4, s1
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[2:3], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[4:5], 64
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s5, 0, v[12:13]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s6, 0, v[14:15]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s7, 0, v[16:17]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v12, v0 :: v_dual_cndmask_b32 v1, v13, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s7
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v3f64__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[6:7]
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[8:9]
+; GFX12-NEXT:    v_maximum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <3 x double> @llvm.maximum.v3f64(<3 x double> %src0, <3 x double> %src1)
+  ret <3 x double> %op
+}
+
+define <3 x double> @v_maximum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1) {
+; GFX7-LABEL: v_maximum_v3f64__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[4:5], v[10:11]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[2:3], v[8:9]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[10:11], v[4:5], v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v10, v4, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, v12, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v8, v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[10:11]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v3f64__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[4:5], v[10:11]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[2:3], v[8:9]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[10:11], v[4:5], v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v10, v4, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, v12, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[10:11]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v3f64__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[4:5], v[10:11]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[2:3], v[8:9]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[10:11], v[4:5], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v10, v4, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, v12, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[10:11]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v3f64__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[6:7]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX940-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, 0, v12, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v6, v1, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v8, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, 0, v7, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v10, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[4:5], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v11, v5, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v3f64__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s4, v[2:3], v[8:9]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s5, v[4:5], v[10:11]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[0:1], v[6:7]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s7, v[2:3], v[8:9]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s8, v[4:5], v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v10, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, v12, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x7ff80000, v1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0x7ff80000, v3, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v5, s8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v3f64__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[8:9]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s1, v[4:5], v[10:11]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s2, v[0:1], v[6:7]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s3, v[2:3], v[8:9]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s4, v[4:5], v[10:11]
+; GFX11-NEXT:    v_dual_cndmask_b32 v12, v6, v0 :: v_dual_cndmask_b32 v1, v7, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v10, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, v12, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0x7ff80000, v1, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0x7ff80000, v3, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v5, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v3f64__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[6:7]
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[8:9]
+; GFX12-NEXT:    v_maximum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <3 x double> @llvm.maximum.v3f64(<3 x double> %src0, <3 x double> %src1)
+  ret <3 x double> %op
+}
+
+define <3 x double> @v_maximum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double> %src1) {
+; GFX7-LABEL: v_maximum_v3f64__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[2:3], v[8:9]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[4:5], v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v3f64__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[2:3], v[8:9]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[4:5], v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v3f64__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[2:3], v[8:9]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[4:5], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v3f64__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[8:9]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v11, v5, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v3f64__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s4, v[2:3], v[8:9]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s5, v[4:5], v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v3f64__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[8:9]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s1, v[4:5], v[10:11]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v6, v0 :: v_dual_cndmask_b32 v1, v7, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v3f64__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[6:7]
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[8:9]
+; GFX12-NEXT:    v_maximum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <3 x double> @llvm.maximum.v3f64(<3 x double> %src0, <3 x double> %src1)
+  ret <3 x double> %op
+}
+
+define <4 x double> @v_maximum_v4f64(<4 x double> %src0, <4 x double> %src1) {
+; GFX7-LABEL: v_maximum_v4f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[2:3], v[10:11]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[2:3], v[10:11]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[12:13], v[4:5], v[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v9, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v18, v16, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[8:9], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, v11, v3, s[6:7]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[16:17]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[10:11], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v10, v2, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v18, v19, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[8:9]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[4:5], v[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[4:5]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, v13, v5, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v18, v10, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v15, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v12, v4, s[8:9]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v18, v10, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v14, v6, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[12:13]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[8:9], v[12:13], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, v10, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[14:15], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[10:11]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[8:9]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v4f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9]
+; GFX8-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[2:3], v[10:11]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[2:3], v[10:11]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[12:13], v[4:5], v[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v9, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v18, v16, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[8:9], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v11, v3, s[6:7]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[10:11], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v10, v2, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v18, v19, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[8:9]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[4:5], v[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[4:5]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v13, v5, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v18, v10, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v15, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v12, v4, s[8:9]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v18, v10, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v14, v6, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[12:13]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[8:9], v[12:13], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, v10, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[14:15], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[10:11]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[8:9]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v4f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9]
+; GFX9-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[2:3], v[10:11]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[2:3], v[10:11]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[12:13], v[4:5], v[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v9, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v18, v16, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[8:9], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v11, v3, s[6:7]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[16:17]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[10:11], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v10, v2, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v18, v19, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[8:9]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[4:5], v[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v13, v5, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v18, v10, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v15, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v12, v4, s[8:9]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v18, v10, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v14, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[12:13]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[8:9], v[12:13], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, v10, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[14:15], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[10:11]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[8:9]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v4f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX940-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v9, v1, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v18, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[8:9], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v11, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v18, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[10:11], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[12:13]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v13, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[4:5], v[12:13]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v18, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v12, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[12:13], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v15, v7, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[6:7], v[14:15]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v18, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v14, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[14:15], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v4f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s4, v[2:3], v[10:11]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s5, v[0:1], v[8:9]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s6, v[4:5], v[12:13]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s7, v[2:3], v[10:11]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s8, v[6:7], v[14:15]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s9, v[4:5], v[12:13]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s10, v[6:7], v[14:15]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s11, v[14:15], 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v11, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v10, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v13, v5, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v15, v7, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0x7ff80000, v16, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, v19, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0x7ff80000, v18, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v12, v4, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, v21, s7
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, v14, v6, s8
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[2:3], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s7, v[4:5], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s8, v[6:7], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[8:9], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0x7ff80000, v20, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, v23, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, 0x7ff80000, v22, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0, v24, s10
+; GFX10-NEXT:    v_cmp_class_f64_e64 s9, v[10:11], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s10, v[12:13], 64
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s6, 0, v[16:17]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s12, 0, v[18:19]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s13, 0, v[20:21]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s14, 0, v[22:23]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s14
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v4f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s0, v[0:1], v[8:9]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s1, v[2:3], v[10:11]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s2, v[4:5], v[12:13]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s3, v[6:7], v[14:15]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s4, v[2:3], v[10:11]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s5, v[4:5], v[12:13]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s6, v[6:7], v[14:15]
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, v9, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, v11, v3, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v20, v13, v5, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v22, v15, v7, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v17, 0x7ff80000, v16, s0
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v19, 0x7ff80000, v18, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v21, 0x7ff80000, v20, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, v10, v2, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v20, v12, v4, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v24, v14, v6, s3
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[4:5], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[6:7], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v23, 0x7ff80000, v22, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, 0, v18, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v20, 0, v20, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v22, 0, v24, s6
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[12:13], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s4, v[14:15], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s5, v[8:9], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s6, v[10:11], 64
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s8, 0, v[18:19]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s9, 0, v[20:21]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s10, 0, v[22:23]
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s0
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[2:3], 64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s7, 0, v[16:17]
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s8
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v4f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT:    v_maximum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT:    v_maximum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <4 x double> @llvm.maximum.v4f64(<4 x double> %src0, <4 x double> %src1)
+  ret <4 x double> %op
+}
+
+define <4 x double> @v_maximum_v4f64__nnan(<4 x double> %src0, <4 x double> %src1) {
+; GFX7-LABEL: v_maximum_v4f64__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 64
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[8:9], 64
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[10:11], v[10:11], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v17, v9, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[4:5]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[4:5], v[12:13]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[16:17]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[12:13], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v19, v11, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v18, v10, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[18:19]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v13, v5, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v12, v4, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[4:5], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[8:9]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[10:11], v[14:15], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v15, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v14, v6, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v4f64__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 64
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[8:9], 64
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[10:11], v[10:11], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v9, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[4:5]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[4:5], v[12:13]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[12:13], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v11, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v10, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[18:19]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v13, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v12, v4, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[4:5], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[8:9]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[10:11], v[14:15], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v15, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v14, v6, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v4f64__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 64
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[8:9], 64
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[10:11], v[10:11], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, v9, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[4:5]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[4:5], v[12:13]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[16:17]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[12:13], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, v11, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v10, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[18:19]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v13, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v12, v4, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[4:5], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[8:9]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[10:11], v[14:15], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v15, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v14, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v4f64__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[8:9], 64
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v17, v9, v1, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v11, v3, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[10:11], 64
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[0:1]
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[12:13]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v13, v5, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v12, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[12:13], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[0:1]
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v15, v7, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v14, v6, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[14:15], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v4f64__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s4, v[2:3], v[10:11]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s5, v[4:5], v[12:13]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s6, v[6:7], v[14:15]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s7, v[10:11], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s8, v[8:9], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s10, v[12:13], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s11, v[14:15], 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v11, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v13, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v15, v7, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v10, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v12, v4, s5
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[4:5], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v14, v6, s6
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[6:7], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[0:1], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s6, v[2:3], 64
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s9, 0, v[16:17]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s12, 0, v[18:19]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s13, 0, v[20:21]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s14, 0, v[22:23]
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s14
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v4f64__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[10:11]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s1, v[4:5], v[12:13]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s2, v[6:7], v[14:15]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[6:7], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s4, v[14:15], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s5, v[8:9], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s6, v[10:11], 64
+; GFX11-NEXT:    v_dual_cndmask_b32 v17, v9, v1 :: v_dual_cndmask_b32 v16, v8, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v19, v11, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v21, v13, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, v10, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v20, v12, v4, s1
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[2:3], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[4:5], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v23, v15, v7, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v22, v14, v6, s2
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[12:13], 64
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s7, 0, v[16:17]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s8, 0, v[18:19]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s9, 0, v[20:21]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s10, 0, v[22:23]
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s4
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v16, v0 :: v_dual_cndmask_b32 v1, v17, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s10
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v4f64__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT:    v_maximum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT:    v_maximum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <4 x double> @llvm.maximum.v4f64(<4 x double> %src0, <4 x double> %src1)
+  ret <4 x double> %op
+}
+
+define <4 x double> @v_maximum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1) {
+; GFX7-LABEL: v_maximum_v4f64__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[10:11], v[6:7], v[14:15]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[4:5], v[12:13]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[12:13], v[6:7], v[14:15]
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, v16, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[4:5]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v9, v10, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, v14, v6, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, v9, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v12, v4, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, v9, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v4f64__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[10:11], v[6:7], v[14:15]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[4:5], v[12:13]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[12:13], v[6:7], v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, v16, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[4:5]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v10, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v14, v6, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v9, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v12, v4, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, v9, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v4f64__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[10:11], v[6:7], v[14:15]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[4:5], v[12:13]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[12:13], v[6:7], v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, v16, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v10, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v14, v6, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v12, v4, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, v9, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v4f64__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[8:9]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX940-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v10, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[12:13]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, 0, v9, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v12, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[4:5], v[12:13]
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, 0, v9, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v14, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[6:7], v[14:15]
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, 0, v9, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v4f64__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s4, v[2:3], v[10:11]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s5, v[4:5], v[12:13]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s7, v[6:7], v[14:15]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[0:1], v[8:9]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s8, v[2:3], v[10:11]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s9, v[4:5], v[12:13]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s10, v[6:7], v[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v10, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v12, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v14, v6, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, v16, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x7ff80000, v1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v8, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0x7ff80000, v3, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v5, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0x7ff80000, v7, s10
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v4f64__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s1, v[2:3], v[10:11]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s2, v[4:5], v[12:13]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s3, v[6:7], v[14:15]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s0, v[0:1], v[8:9]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s4, v[2:3], v[10:11]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s5, v[4:5], v[12:13]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s6, v[6:7], v[14:15]
+; GFX11-NEXT:    v_dual_cndmask_b32 v16, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v10, v2, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v12, v4, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v14, v6, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, v16, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, v8, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0x7ff80000, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0x7ff80000, v3, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v5, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0x7ff80000, v7, s6
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v4f64__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT:    v_maximum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT:    v_maximum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <4 x double> @llvm.maximum.v4f64(<4 x double> %src0, <4 x double> %src1)
+  ret <4 x double> %op
+}
+
+define <4 x double> @v_maximum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double> %src1) {
+; GFX7-LABEL: v_maximum_v4f64__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[6:7], v[14:15]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[8:9]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v4f64__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[6:7], v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[8:9]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v4f64__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[6:7], v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[8:9]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v4f64__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[12:13]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v4f64__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s4, v[2:3], v[10:11]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s5, v[4:5], v[12:13]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s6, v[6:7], v[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s6
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v4f64__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[10:11]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s1, v[4:5], v[12:13]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s2, v[6:7], v[14:15]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v4f64__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT:    v_maximum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT:    v_maximum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <4 x double> @llvm.maximum.v4f64(<4 x double> %src0, <4 x double> %src1)
+  ret <4 x double> %op
+}
+
+define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) {
+; GFX7-LABEL: v_maximum_v8f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[16:17]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[16:17]
+; GFX7-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[4:5], v[20:21]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[8:9], v[24:25]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[10:11], v[12:13], v[28:29]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[12:13], v[12:13], v[28:29]
+; GFX7-NEXT:    v_cndmask_b32_e32 v31, v17, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, v32, v31, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v31, v16, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v33, 0, v31, s[4:5]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[18:19]
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v34, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[16:17], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[33:34]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v34, v1, vcc
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[18:19]
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v19, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[4:5]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[4:5], v[20:21]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[18:19], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[16:17]
+; GFX7-NEXT:    v_cndmask_b32_e64 v18, v21, v5, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, v20, v4, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[20:21], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v32, v18, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[6:7]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[6:7], v[22:23]
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s[4:5]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[16:17]
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[22:23]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s[4:5]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[8:9], v[24:25]
+; GFX7-NEXT:    v_cndmask_b32_e32 v18, v23, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v22, v6, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v18, 0, v16, s[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[22:23], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, v25, v9, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, v24, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v18, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[18:19]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[8:9], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v23, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[8:9]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[24:25], 64
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[10:11], v[26:27]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v18, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[10:11], v[26:27]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[4:5]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[16:17]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[26:27], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v18, v27, v11, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v26, v10, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v18, 0, v16, s[8:9]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[18:19]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[14:15], v[30:31]
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, v29, v13, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, v28, v12, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v18, v10, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v19, v11, vcc
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[14:15], v[30:31]
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, v10, v26, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v27, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, v18, v10, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v19, v11, s[8:9]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[12:13], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[12:13]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[8:9], v[28:29], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v18, v31, v15, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v18, v30, v14, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v18, 0, v18, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[30:31], 64
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[16:17]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[18:19]
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, v16, v12, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v17, v13, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, v12, v28, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v14, v18, v14, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v15, v19, v15, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v14, v14, v30, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v13, v29, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v15, v15, v31, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, v16, v12, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v17, v13, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v14, v18, v14, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v15, v19, v15, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v8f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[16:17]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[16:17]
+; GFX8-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[4:5], v[20:21]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[8:9], v[24:25]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[10:11], v[12:13], v[28:29]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[12:13], v[12:13], v[28:29]
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v17, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, v32, v31, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v16, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v33, 0, v31, s[4:5]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[18:19]
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v34, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[16:17], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[33:34]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v34, v1, vcc
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[18:19]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v19, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[4:5]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[4:5], v[20:21]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[18:19], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, v21, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v20, v4, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[20:21], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v32, v18, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[6:7]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[6:7], v[22:23]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[16:17]
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[22:23]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s[4:5]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[8:9], v[24:25]
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v23, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v22, v6, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, 0, v16, s[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[22:23], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v25, v9, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v24, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v18, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[18:19]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[8:9], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v23, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[8:9]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[24:25], 64
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[10:11], v[26:27]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v18, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[10:11], v[26:27]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[26:27], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v27, v11, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v26, v10, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, 0, v16, s[8:9]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[18:19]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[14:15], v[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v29, v13, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v28, v12, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v18, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v19, v11, vcc
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[14:15], v[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v10, v26, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v27, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v18, v10, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v19, v11, s[8:9]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[12:13], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[12:13]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[8:9], v[28:29], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v31, v15, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v30, v14, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, 0, v18, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[30:31], 64
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[16:17]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[18:19]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v16, v12, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v17, v13, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v12, v28, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v18, v14, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v19, v15, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v14, v30, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v13, v29, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v15, v31, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v16, v12, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v17, v13, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v18, v14, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v19, v15, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v8f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[16:17]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[16:17]
+; GFX9-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[4:5], v[20:21]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[8:9], v[24:25]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[10:11], v[12:13], v[28:29]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[12:13], v[12:13], v[28:29]
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v17, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, v32, v31, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v16, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v33, 0, v31, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[18:19]
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v34, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[16:17], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[33:34]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v34, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[18:19]
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v19, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[4:5]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[4:5], v[20:21]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[18:19], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[16:17]
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, v21, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v20, v4, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[20:21], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v32, v18, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[6:7]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[6:7], v[22:23]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[16:17]
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[22:23]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s[4:5]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[8:9], v[24:25]
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v23, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v22, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, 0, v16, s[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[22:23], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v25, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v24, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v18, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[18:19]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[8:9], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v23, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[8:9]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[24:25], 64
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[10:11], v[26:27]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v18, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[10:11], v[26:27]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[16:17]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[26:27], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v27, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v26, v10, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, 0, v16, s[8:9]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[18:19]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[14:15], v[30:31]
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v29, v13, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v28, v12, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v18, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v19, v11, vcc
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[14:15], v[30:31]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v26, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v27, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v18, v10, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v19, v11, s[8:9]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[12:13], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[12:13]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[8:9], v[28:29], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v31, v15, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v30, v14, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, 0, v18, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[30:31], 64
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[16:17]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[18:19]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v16, v12, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v17, v13, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v28, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v18, v14, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v19, v15, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, v30, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v29, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v15, v31, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v16, v12, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v17, v13, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v18, v14, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v19, v15, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v8f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    scratch_load_dword v31, off, s32
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[16:17]
+; GFX940-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v33, v17, v1, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v35, v32, v33, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v33, v16, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v34, 0, v33, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[16:17], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[34:35]
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v34, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v35, v1, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[18:19]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v19, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[18:19]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v34, v0, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v35, v1, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[18:19], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[20:21]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v16, v2, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v21, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[4:5], v[20:21]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v17, v3, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v20, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[20:21], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[22:23]
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v23, v7, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[6:7], v[22:23]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v22, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[22:23], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v17, v7, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[8:9], v[24:25]
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, v16, v6, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v7, v23, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v25, v9, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[8:9], v[24:25]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v17, v7, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v24, v8, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[8:9], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[24:25], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[10:11], v[26:27]
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v27, v11, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[10:11], v[26:27]
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v26, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[26:27], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v10, v10, v26, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v11, v17, v11, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[12:13], v[28:29]
+; GFX940-NEXT:    v_cndmask_b32_e64 v10, v16, v10, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v11, v11, v27, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v29, v13, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[12:13], v[28:29]
+; GFX940-NEXT:    v_cndmask_b32_e64 v11, v17, v11, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v28, v12, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[12:13], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[28:29], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v12, v12, v28, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v13, v17, v13, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[14:15], v[30:31]
+; GFX940-NEXT:    v_cndmask_b32_e64 v12, v16, v12, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v13, v13, v29, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v31, v15, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[14:15], v[30:31]
+; GFX940-NEXT:    v_cndmask_b32_e64 v13, v17, v13, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v30, v14, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[30:31], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v15, v17, v15, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v14, v14, v30, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v15, v15, v31, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v14, v16, v14, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v15, v17, v15, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v8f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[16:17]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s4, v[0:1], v[16:17]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s9, v[6:7], v[22:23]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s10, v[8:9], v[24:25]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s11, v[10:11], v[26:27]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s12, v[12:13], v[28:29]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s13, v[6:7], v[22:23]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s14, v[8:9], v[24:25]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s15, v[10:11], v[26:27]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s16, v[12:13], v[28:29]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s5, v[2:3], v[18:19]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[2:3], v[18:19]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s7, v[4:5], v[20:21]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s8, v[4:5], v[20:21]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s17, v[26:27], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s18, v[28:29], 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v32, v17, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v38, v23, v7, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, v25, v9, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, v27, v11, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v33, 0x7ff80000, v32, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v32, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v52, v29, v13, s12
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v39, 0x7ff80000, v38, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, 0x7ff80000, v48, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v32, 0, v32, s4
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[2:3], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, 0x7ff80000, v50, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v53, 0x7ff80000, v52, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v38, v22, v6, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, v24, v8, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, v26, v10, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v52, v28, v12, s12
+; GFX10-NEXT:    v_cmp_class_f64_e64 s11, v[16:17], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s12, v[18:19], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, v19, v3, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v38, 0, v38, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v21, v5, s7
+; GFX10-NEXT:    v_cmp_class_f64_e64 s9, v[12:13], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v35, 0x7ff80000, v34, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, v18, v2, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v37, 0x7ff80000, v36, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v20, v4, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[4:5], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s6
+; GFX10-NEXT:    v_cmp_class_f64_e64 s6, v[6:7], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, 0, v36, s8
+; GFX10-NEXT:    v_cmp_class_f64_e64 s7, v[8:9], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s8, v[10:11], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v34, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, 0, v50, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v52, 0, v52, s16
+; GFX10-NEXT:    v_cmp_class_f64_e64 s14, v[20:21], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s12
+; GFX10-NEXT:    v_cmp_class_f64_e64 s15, v[22:23], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s16, v[24:25], 64
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s19, 0, v[32:33]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s20, 0, v[34:35]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s21, 0, v[36:37]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s22, 0, v[48:49]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s23, 0, v[50:51]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s24, 0, v[52:53]
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v33, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v52, v12, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v36, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v35, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v38, v6, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v37, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v48, v8, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v50, v10, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v39, v7, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v49, v9, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v51, v11, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v53, v13, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v26, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v28, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v23, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v27, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v13, v29, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v32, v0, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v34, v2, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v36, v4, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v48, v8, s22
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v50, v10, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v52, v12, s24
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v33, v1, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v35, v3, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v37, v5, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v49, v9, s22
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v51, v11, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v53, v13, s24
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s10, v[14:15], v[30:31]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s13, v[14:15], v[30:31]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s25, v[30:31], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v31, v15, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v30, v14, s10
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s10, 0, v[38:39]
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, 0x7ff80000, v16, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v54, 0, v18, s13
+; GFX10-NEXT:    v_cmp_class_f64_e64 s13, v[14:15], 64
+; GFX10-NEXT:    v_cmp_eq_f64_e32 vcc_lo, 0, v[54:55]
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v38, v6, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v39, v7, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v54, v14, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v55, v15, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v14, v30, s25
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v15, v31, s25
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v54, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v55, v15, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v8f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s4, v[6:7], v[22:23]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s9, v[6:7], v[22:23]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s1, v[2:3], v[18:19]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s6, v[10:11], v[26:27]
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[16:17]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s2, v[2:3], v[18:19]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s11, v[10:11], v[26:27]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s0, v[0:1], v[16:17]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s3, v[4:5], v[20:21]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s5, v[8:9], v[24:25]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s7, v[12:13], v[28:29]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s8, v[4:5], v[20:21]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s10, v[8:9], v[24:25]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s12, v[12:13], v[28:29]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s13, v[18:19], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s15, v[20:21], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v38, v23, v7, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v34, v19, v3, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v50, v27, v11, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v39, 0x7ff80000, v38, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v38, v22, v6, s4
+; GFX11-NEXT:    v_cmp_class_f64_e64 s4, v[6:7], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v35, 0x7ff80000, v34, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v51, 0x7ff80000, v50, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v34, v18, v2, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v50, v26, v10, s6
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[0:1], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v36, v21, v5, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v48, v25, v9, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v52, v29, v13, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v50, 0, v50, s11
+; GFX11-NEXT:    v_cmp_class_f64_e64 s11, v[16:17], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v37, 0x7ff80000, v36, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v49, 0x7ff80000, v48, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v53, 0x7ff80000, v52, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v36, v20, v4, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v48, v24, v8, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v52, v28, v12, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v38, 0, v38, s9
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[2:3], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[4:5], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s5, v[8:9], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s7, v[10:11], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s9, v[12:13], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v36, 0, v36, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v52, 0, v52, s12
+; GFX11-NEXT:    v_cmp_class_f64_e64 s6, v[24:25], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s8, v[26:27], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s10, v[28:29], 64
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s14, 0, v[34:35]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s16, 0, v[36:37]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s17, 0, v[38:39]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s18, 0, v[48:49]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s20, 0, v[50:51]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s21, 0, v[52:53]
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v39, v7, s4
+; GFX11-NEXT:    v_cndmask_b32_e32 v32, v17, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v38, v6, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v33, 0x7ff80000, v32, s0
+; GFX11-NEXT:    v_cndmask_b32_e32 v32, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v33, v1, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v32, 0, v32, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v32, v0, s1
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s12, 0, v[32:33]
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v34, v2, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v36, v4, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v48, v8, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v50, v10, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v52, v12, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v35, v3, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v37, v5, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v49, v9, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v51, v11, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v53, v13, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v10, v26, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v12, v28, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v11, v27, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v13, v29, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v34, v2, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v36, v4, s16
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v48, v8, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v50, v10, s20
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v52, v12, s21
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v35, v3, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v37, v5, s16
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v49, v9, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v51, v11, s20
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v53, v13, s21
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v32, v0, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v33, v1, s12
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[14:15], v[30:31]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s0, v[14:15], v[30:31]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s19, v[30:31], 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v54, v31, v15, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, v30, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[22:23], 64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v55, 0x7ff80000, v54, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v54, 0, v16, s0
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[14:15], 64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s22, 0, v[54:55]
+; GFX11-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_cndmask_b32 v6, v6, v22
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v54, v14, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, v55, v15, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v38, v6, s17
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v39, v7, s17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v14, v30, s19
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, v15, v31, s19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v54, v14, s22
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, v55, v15, s22
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v8f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v31, off, s32
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[16:17]
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[18:19]
+; GFX12-NEXT:    v_maximum_f64 v[4:5], v[4:5], v[20:21]
+; GFX12-NEXT:    v_maximum_f64 v[6:7], v[6:7], v[22:23]
+; GFX12-NEXT:    v_maximum_f64 v[8:9], v[8:9], v[24:25]
+; GFX12-NEXT:    v_maximum_f64 v[10:11], v[10:11], v[26:27]
+; GFX12-NEXT:    v_maximum_f64 v[12:13], v[12:13], v[28:29]
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[14:15], v[14:15], v[30:31]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <8 x double> @llvm.maximum.v8f64(<8 x double> %src0, <8 x double> %src1)
+  ret <8 x double> %op
+}
+
+define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) {
+; GFX7-LABEL: v_maximum_v16f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
+; GFX7-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
+; GFX7-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    v_mov_b32_e32 v39, 0x7ff80000
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[31:32]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[31:32]
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[2:3], v[33:34]
+; GFX7-NEXT:    v_cndmask_b32_e32 v48, v32, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v49, v39, v48, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v48, v31, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[31:32], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v48, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v49, v1, vcc
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[33:34]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v31, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v32, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v50, v34, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v31, v33, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v32, v39, v50, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v31, 0, v31, s[6:7]
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[4:5], v[35:36]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v31, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[35:36]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v33, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v34, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[35:36], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v50, v36, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v33, v35, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, v39, v50, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v33, 0, v33, s[6:7]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[6:7], v[37:38]
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v33, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v34, v5, vcc
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[37:38]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v35, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v36, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v50, v38, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v35, v37, v6, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v36, v39, v50, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v35, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v36, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v37, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v38, s[4:5]
+; GFX7-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v48, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v49, v1, vcc
+; GFX7-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[31:32]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v31, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[33:34]
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v33, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v34, v5, vcc
+; GFX7-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[35:36]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v35, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v36, v7, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[8:9], 64
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[8:9], v[37:38]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[8:9], v[37:38]
+; GFX7-NEXT:    v_cndmask_b32_e64 v50, v38, v9, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, v37, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v36, v39, v50, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v35, v8, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[35:36]
+; GFX7-NEXT:    v_cndmask_b32_e32 v9, v36, v9, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[10:11], v[48:49]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[10:11], v[48:49]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v8, v37, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v9, v38, s[4:5]
+; GFX7-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v35, v8, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v50, v49, v11, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v35, v48, v10, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 64
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[48:49], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v51, v39, v50, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v50, 0, v35, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v36, v9, s[6:7]
+; GFX7-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80
+; GFX7-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:76
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[12:13], v[31:32]
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, v10, v48, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v49, s[4:5]
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[50:51]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[12:13], v[31:32]
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, v32, v13, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[12:13], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v49, v39, v48, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, v31, v12, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[31:32], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[14:15], v[33:34]
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v48, v12, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v49, v13, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, v12, v31, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v13, v32, s[4:5]
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[14:15], v[33:34]
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v48, v12, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v49, v13, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v50, v34, v15, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, v33, v14, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v49, v39, v50, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v14, v48, v14, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v15, v49, v15, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX7-NEXT:    v_cndmask_b32_e64 v14, v14, v33, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v15, v15, v34, s[4:5]
+; GFX7-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GFX7-NEXT:    v_cndmask_b32_e32 v14, v48, v14, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v15, v49, v15, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[16:17], 64
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[16:17], v[37:38]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[16:17], v[37:38]
+; GFX7-NEXT:    v_cndmask_b32_e64 v50, v38, v17, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, v37, v16, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v49, v39, v50, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v48, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v17, v49, v17, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[18:19], v[35:36]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[18:19], v[35:36]
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, v16, v37, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v17, v38, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v50, v36, v19, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v37, v35, v18, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[18:19], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v51, v39, v50, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v50, 0, v37, s[6:7]
+; GFX7-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:100
+; GFX7-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:104
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v48, v16, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v17, v49, v17, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[35:36], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v18, v50, v18, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, v51, v19, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[20:21], v[31:32]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[20:21], v[31:32]
+; GFX7-NEXT:    v_cndmask_b32_e32 v18, v18, v35, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v19, v19, v36, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[50:51]
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, v32, v21, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, v31, v20, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[20:21], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v36, v39, v48, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v18, v50, v18, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v19, v51, v19, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[31:32], 64
+; GFX7-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:108
+; GFX7-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:120
+; GFX7-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:116
+; GFX7-NEXT:    v_cndmask_b32_e64 v20, v35, v20, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v21, v36, v21, s[4:5]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[35:36]
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[22:23], v[33:34]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[22:23], v[33:34]
+; GFX7-NEXT:    v_cndmask_b32_e32 v20, v20, v31, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v21, v21, v32, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[22:23], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v20, v35, v20, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v21, v36, v21, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v31, v34, v23, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v36, v39, v31, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v31, v33, v22, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, 0, v31, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v22, v35, v22, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v23, v36, v23, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[35:36]
+; GFX7-NEXT:    v_cndmask_b32_e64 v22, v22, v33, s[4:5]
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, v23, v34, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v22, v35, v22, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v23, v36, v23, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[24:25], 64
+; GFX7-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[24:25], v[37:38]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[24:25], v[37:38]
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, v38, v25, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, v39, v34, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, v37, v24, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v24, v34, v24, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v25, v35, v25, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[37:38], 64
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[34:35]
+; GFX7-NEXT:    v_cndmask_b32_e32 v24, v24, v37, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v25, v25, v38, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[26:27], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v24, v34, v24, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v25, v35, v25, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[26:27], v[48:49]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[26:27], v[48:49]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[48:49], 64
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[10:11], v[28:29], v[50:51]
+; GFX7-NEXT:    v_cndmask_b32_e64 v36, v49, v27, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, v48, v26, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, v39, v36, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[8:9]
+; GFX7-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[28:29], v[50:51]
+; GFX7-NEXT:    v_cndmask_b32_e32 v26, v34, v26, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[34:35]
+; GFX7-NEXT:    v_cndmask_b32_e32 v27, v35, v27, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v26, v26, v48, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v27, v27, v49, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v36, v51, v29, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v27, v35, v27, s[6:7]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_gt_f64_e32 vcc, v[30:31], v[32:33]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[30:31], v[32:33]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, v39, v36, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v26, v34, v26, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, v50, v28, s[8:9]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[28:29], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[10:11]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[8:9], v[50:51], 64
+; GFX7-NEXT:    v_cndmask_b32_e32 v36, v33, v31, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v37, v39, v36, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v36, v32, v30, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[30:31], 64
+; GFX7-NEXT:    v_cndmask_b32_e64 v36, 0, v36, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[32:33], 64
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[34:35]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[36:37]
+; GFX7-NEXT:    v_cndmask_b32_e64 v28, v34, v28, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v29, v35, v29, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v28, v28, v50, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v30, v36, v30, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v30, v30, v32, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v29, v29, v51, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v31, v31, v33, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v28, v34, v28, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v29, v35, v29, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v30, v36, v30, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v31, v37, v31, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximum_v16f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX8-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
+; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GFX8-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
+; GFX8-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; GFX8-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
+; GFX8-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:28
+; GFX8-NEXT:    v_mov_b32_e32 v39, 0x7ff80000
+; GFX8-NEXT:    s_waitcnt vmcnt(6)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[31:32]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[31:32]
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[2:3], v[33:34]
+; GFX8-NEXT:    v_cndmask_b32_e32 v48, v32, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v49, v39, v48, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v48, v31, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[31:32], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v48, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v49, v1, vcc
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[33:34]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v31, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v32, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v34, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v33, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v32, v39, v50, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, 0, v31, s[6:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[4:5], v[35:36]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v31, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[35:36]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v33, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v34, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[35:36], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v36, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v33, v35, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, v39, v50, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v33, 0, v33, s[6:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[6:7], v[37:38]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v33, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v34, v5, vcc
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[37:38]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v35, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v36, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v38, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v37, v6, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, v39, v50, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v35, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v36, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v37, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v38, s[4:5]
+; GFX8-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36
+; GFX8-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:40
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v48, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v49, v1, vcc
+; GFX8-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:48
+; GFX8-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:44
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[31:32]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v31, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[33:34]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v33, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v34, v5, vcc
+; GFX8-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
+; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[35:36]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v35, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v36, v7, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[8:9], 64
+; GFX8-NEXT:    s_waitcnt vmcnt(6)
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[8:9], v[37:38]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[8:9], v[37:38]
+; GFX8-NEXT:    v_cndmask_b32_e64 v50, v38, v9, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, v37, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, v39, v50, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v35, v8, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[35:36]
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v36, v9, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[10:11], v[48:49]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[10:11], v[48:49]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v37, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v38, s[4:5]
+; GFX8-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:68
+; GFX8-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:72
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v35, v8, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v49, v11, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v48, v10, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 64
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[48:49], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v51, v39, v50, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v50, 0, v35, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v36, v9, s[6:7]
+; GFX8-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80
+; GFX8-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:76
+; GFX8-NEXT:    s_waitcnt vmcnt(6)
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[12:13], v[31:32]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v10, v48, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v49, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[50:51]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[12:13], v[31:32]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, v32, v13, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[12:13], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v49, v39, v48, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, v31, v12, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[31:32], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[14:15], v[33:34]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v48, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v49, v13, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v12, v31, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v13, v32, s[4:5]
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[14:15], v[33:34]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v48, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v49, v13, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v50, v34, v15, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, v33, v14, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v49, v39, v50, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v48, v14, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v49, v15, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v14, v33, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v15, v34, s[4:5]
+; GFX8-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:96
+; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v48, v14, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v49, v15, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[16:17], 64
+; GFX8-NEXT:    s_waitcnt vmcnt(6)
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[16:17], v[37:38]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[16:17], v[37:38]
+; GFX8-NEXT:    v_cndmask_b32_e64 v50, v38, v17, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, v37, v16, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v49, v39, v50, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v48, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v49, v17, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[18:19], v[35:36]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[18:19], v[35:36]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v16, v37, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v17, v38, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v36, v19, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v37, v35, v18, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[18:19], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v51, v39, v50, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v50, 0, v37, s[6:7]
+; GFX8-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:100
+; GFX8-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:104
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v48, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v49, v17, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[35:36], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, v50, v18, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v51, v19, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[20:21], v[31:32]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[20:21], v[31:32]
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v35, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v36, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[50:51]
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, v32, v21, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, v31, v20, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[20:21], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, v39, v48, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v50, v18, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v51, v19, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[31:32], 64
+; GFX8-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:112
+; GFX8-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:108
+; GFX8-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:120
+; GFX8-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:116
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, v35, v20, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, v36, v21, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[35:36]
+; GFX8-NEXT:    s_waitcnt vmcnt(6)
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[22:23], v[33:34]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[22:23], v[33:34]
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v31, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v32, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[22:23], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, v35, v20, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, v36, v21, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v34, v23, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, v39, v31, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v33, v22, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, 0, v31, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v35, v22, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v36, v23, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[35:36]
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, v22, v33, s[4:5]
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, v23, v34, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v35, v22, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v36, v23, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[24:25], 64
+; GFX8-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[24:25], v[37:38]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[24:25], v[37:38]
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, v38, v25, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, v39, v34, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, v37, v24, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v24, v34, v24, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v25, v35, v25, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[37:38], 64
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[34:35]
+; GFX8-NEXT:    v_cndmask_b32_e32 v24, v24, v37, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v25, v25, v38, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[26:27], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, v34, v24, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v25, v35, v25, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(5)
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[26:27], v[48:49]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[26:27], v[48:49]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[48:49], 64
+; GFX8-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[10:11], v[28:29], v[50:51]
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, v49, v27, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, v48, v26, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, v39, v36, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[8:9]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[28:29], v[50:51]
+; GFX8-NEXT:    v_cndmask_b32_e32 v26, v34, v26, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[34:35]
+; GFX8-NEXT:    v_cndmask_b32_e32 v27, v35, v27, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v26, v26, v48, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v27, v27, v49, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, v51, v29, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v27, v35, v27, s[6:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f64_e32 vcc, v[30:31], v[32:33]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[30:31], v[32:33]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, v39, v36, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v26, v34, v26, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, v50, v28, s[8:9]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[28:29], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[10:11]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[8:9], v[50:51], 64
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v33, v31, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v37, v39, v36, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v32, v30, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[30:31], 64
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, 0, v36, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[32:33], 64
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[34:35]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[36:37]
+; GFX8-NEXT:    v_cndmask_b32_e64 v28, v34, v28, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v35, v29, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v28, v28, v50, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v30, v36, v30, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v30, v30, v32, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v29, v51, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v31, v33, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v28, v34, v28, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v35, v29, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v30, v36, v30, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v37, v31, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximum_v16f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    v_mov_b32_e32 v39, 0x7ff80000
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[31:32]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[31:32]
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[2:3], v[33:34]
+; GFX9-NEXT:    v_cndmask_b32_e32 v48, v32, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v49, v39, v48, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v48, v31, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[31:32], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v48, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v49, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[33:34]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v31, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v32, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v50, v34, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v33, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v32, v39, v50, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v31, 0, v31, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[4:5], v[35:36]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v31, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[35:36]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v33, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v34, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[35:36], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v50, v36, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v33, v35, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, v39, v50, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v33, 0, v33, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[6:7], v[37:38]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v33, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v34, v5, vcc
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[37:38]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v35, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v36, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v50, v38, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v35, v37, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v36, v39, v50, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v35, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v36, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v37, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v38, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v48, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v49, v1, vcc
+; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[31:32]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v31, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[33:34]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v33, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v34, v5, vcc
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[35:36]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v35, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v36, v7, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[8:9], 64
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[8:9], v[37:38]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[8:9], v[37:38]
+; GFX9-NEXT:    v_cndmask_b32_e64 v50, v38, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, v37, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v36, v39, v50, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v35, v8, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[35:36]
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v36, v9, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[10:11], v[48:49]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[10:11], v[48:49]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v37, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v38, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v35, v8, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v50, v49, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v35, v48, v10, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 64
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[48:49], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v51, v39, v50, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v50, 0, v35, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v36, v9, s[6:7]
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[12:13], v[31:32]
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v48, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v49, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[50:51]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[12:13], v[31:32]
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, v32, v13, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[12:13], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v49, v39, v48, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, v31, v12, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[31:32], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[14:15], v[33:34]
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v48, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v49, v13, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v31, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v32, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[14:15], v[33:34]
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v48, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v49, v13, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v50, v34, v15, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, v33, v14, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v49, v39, v50, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v48, v14, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v49, v15, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, v33, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v15, v34, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:96
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v48, v14, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v49, v15, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[16:17], 64
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[16:17], v[37:38]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[16:17], v[37:38]
+; GFX9-NEXT:    v_cndmask_b32_e64 v50, v38, v17, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, v37, v16, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v49, v39, v50, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v48, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, v49, v17, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[18:19], v[35:36]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[18:19], v[35:36]
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v37, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v17, v38, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v50, v36, v19, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v37, v35, v18, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[18:19], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v51, v39, v50, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v50, 0, v37, s[6:7]
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:104
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v48, v16, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, v49, v17, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[35:36], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, v50, v18, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v51, v19, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[20:21], v[31:32]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[20:21], v[31:32]
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v18, v35, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, v19, v36, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[50:51]
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, v32, v21, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, v31, v20, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[20:21], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v36, v39, v48, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v50, v18, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, v51, v19, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[31:32], 64
+; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:112
+; GFX9-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:120
+; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, v35, v20, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, v36, v21, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[35:36]
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[22:23], v[33:34]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[22:23], v[33:34]
+; GFX9-NEXT:    v_cndmask_b32_e32 v20, v20, v31, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v21, v21, v32, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[22:23], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, v35, v20, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, v36, v21, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v31, v34, v23, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v36, v39, v31, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v31, v33, v22, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, 0, v31, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, v35, v22, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v23, v36, v23, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[35:36]
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, v22, v33, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, v23, v34, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, v35, v22, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v23, v36, v23, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[24:25], 64
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[4:5], v[24:25], v[37:38]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[24:25], v[37:38]
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, v38, v25, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, v39, v34, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, v37, v24, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v24, v34, v24, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v25, v35, v25, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[37:38], 64
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[34:35]
+; GFX9-NEXT:    v_cndmask_b32_e32 v24, v24, v37, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v25, v25, v38, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[26:27], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, v34, v24, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v25, v35, v25, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[6:7], v[26:27], v[48:49]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[26:27], v[48:49]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[48:49], 64
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[10:11], v[28:29], v[50:51]
+; GFX9-NEXT:    v_cndmask_b32_e64 v36, v49, v27, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, v48, v26, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, v39, v36, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[8:9]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[8:9], v[28:29], v[50:51]
+; GFX9-NEXT:    v_cndmask_b32_e32 v26, v34, v26, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[34:35]
+; GFX9-NEXT:    v_cndmask_b32_e32 v27, v35, v27, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v26, v26, v48, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v27, v27, v49, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v36, v51, v29, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v27, v35, v27, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, v[30:31], v[32:33]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[30:31], v[32:33]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, v39, v36, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v26, v34, v26, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, v50, v28, s[8:9]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[28:29], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[10:11]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[8:9], v[50:51], 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v36, v33, v31, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v37, v39, v36, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v36, v32, v30, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[30:31], 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v36, 0, v36, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[32:33], 64
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[34:35]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[36:37]
+; GFX9-NEXT:    v_cndmask_b32_e64 v28, v34, v28, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v29, v35, v29, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v28, v28, v50, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v30, v36, v30, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v30, v30, v32, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v29, v29, v51, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v31, v31, v33, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v28, v34, v28, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v29, v35, v29, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v30, v36, v30, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v31, v37, v31, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_maximum_v16f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a2, v42 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a4, v44 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a7, v47 ; Reload Reuse
+; GFX940-NEXT:    scratch_load_dword v41, off, s32 offset:8
+; GFX940-NEXT:    scratch_load_dword v40, off, s32 offset:4
+; GFX940-NEXT:    scratch_load_dword v51, off, s32 offset:16
+; GFX940-NEXT:    scratch_load_dword v50, off, s32 offset:12
+; GFX940-NEXT:    scratch_load_dword v45, off, s32 offset:24
+; GFX940-NEXT:    scratch_load_dword v44, off, s32 offset:20
+; GFX940-NEXT:    scratch_load_dword v47, off, s32 offset:32
+; GFX940-NEXT:    scratch_load_dword v46, off, s32 offset:28
+; GFX940-NEXT:    scratch_load_dword v31, off, s32
+; GFX940-NEXT:    scratch_load_dword v33, off, s32 offset:128
+; GFX940-NEXT:    scratch_load_dword v32, off, s32 offset:124
+; GFX940-NEXT:    scratch_load_dword v35, off, s32 offset:120
+; GFX940-NEXT:    scratch_load_dword v34, off, s32 offset:116
+; GFX940-NEXT:    scratch_load_dword v43, off, s32 offset:40
+; GFX940-NEXT:    scratch_load_dword v42, off, s32 offset:36
+; GFX940-NEXT:    scratch_load_dword v37, off, s32 offset:112
+; GFX940-NEXT:    scratch_load_dword v36, off, s32 offset:108
+; GFX940-NEXT:    scratch_load_dword v39, off, s32 offset:104
+; GFX940-NEXT:    scratch_load_dword v38, off, s32 offset:100
+; GFX940-NEXT:    scratch_load_dword v49, off, s32 offset:96
+; GFX940-NEXT:    scratch_load_dword v48, off, s32 offset:92
+; GFX940-NEXT:    scratch_load_dword v53, off, s32 offset:56
+; GFX940-NEXT:    scratch_load_dword v52, off, s32 offset:52
+; GFX940-NEXT:    scratch_load_dword v55, off, s32 offset:48
+; GFX940-NEXT:    scratch_load_dword v54, off, s32 offset:44
+; GFX940-NEXT:    v_accvgpr_write_b32 a8, v56 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a9, v57 ; Reload Reuse
+; GFX940-NEXT:    v_mov_b32_e32 v56, 0x7ff80000
+; GFX940-NEXT:    v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a12, v60 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a13, v61 ; Reload Reuse
+; GFX940-NEXT:    s_waitcnt vmcnt(23)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[40:41]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v57, v41, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[40:41]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e64 v59, v56, v57, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v57, v40, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v58, 0, v57, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[40:41], 64
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v58, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v59, v1, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(21)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[2:3], v[50:51]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v40, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v41, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v57, v51, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[50:51]
+; GFX940-NEXT:    v_cndmask_b32_e32 v40, v50, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 64
+; GFX940-NEXT:    v_cndmask_b32_e64 v61, v56, v57, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v60, 0, v40, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v60, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v61, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[50:51], 64
+; GFX940-NEXT:    scratch_load_dword v41, off, s32 offset:64
+; GFX940-NEXT:    scratch_load_dword v40, off, s32 offset:60
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v50, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v51, vcc
+; GFX940-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[58:59]
+; GFX940-NEXT:    s_waitcnt vmcnt(21)
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[4:5], v[44:45]
+; GFX940-NEXT:    scratch_load_dword v51, off, s32 offset:88
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v58, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v59, v1, vcc
+; GFX940-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[60:61]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v60, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v61, v3, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[4:5], v[44:45]
+; GFX940-NEXT:    v_accvgpr_read_b32 v61, a13 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v60, a12 ; Reload Reuse
+; GFX940-NEXT:    v_cndmask_b32_e32 v50, v45, v5, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v59, v56, v50, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v50, v44, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v58, 0, v50, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[44:45], 64
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v58, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v59, v5, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v4, v44, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v5, v45, s[0:1]
+; GFX940-NEXT:    scratch_load_dword v45, off, s32 offset:72
+; GFX940-NEXT:    scratch_load_dword v44, off, s32 offset:68
+; GFX940-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[58:59]
+; GFX940-NEXT:    s_waitcnt vmcnt(22)
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[6:7], v[46:47]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v58, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v59, v5, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[6:7], v[46:47]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v50, v47, v7, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v59, v56, v50, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v50, v46, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v58, 0, v50, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[46:47], 64
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v58, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v59, v7, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, v6, v46, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v7, v47, s[0:1]
+; GFX940-NEXT:    scratch_load_dword v47, off, s32 offset:80
+; GFX940-NEXT:    scratch_load_dword v46, off, s32 offset:76
+; GFX940-NEXT:    scratch_load_dword v50, off, s32 offset:84
+; GFX940-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[58:59]
+; GFX940-NEXT:    s_waitcnt vmcnt(18)
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[8:9], v[42:43]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v58, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v59, v7, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[8:9], v[42:43]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v57, v43, v9, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v59, v56, v57, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v57, v42, v8, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v58, 0, v57, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[8:9], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[42:43], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[58:59]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v58, v8, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v59, v9, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(8)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[10:11], v[54:55]
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, v8, v42, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v9, v43, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v42, v55, v11, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[10:11], v[54:55]
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, v58, v8, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v59, v9, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v43, v56, v42, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v42, v54, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v42, 0, v42, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[54:55], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[42:43]
+; GFX940-NEXT:    v_cndmask_b32_e32 v10, v42, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v11, v43, v11, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[12:13], v[52:53]
+; GFX940-NEXT:    v_cndmask_b32_e64 v10, v10, v54, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v11, v11, v55, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v54, v53, v13, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[12:13], v[52:53]
+; GFX940-NEXT:    v_cndmask_b32_e64 v10, v42, v10, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v11, v43, v11, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v55, v56, v54, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v54, v52, v12, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v54, 0, v54, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[12:13], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[52:53], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[54:55]
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(6)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[14:15], v[40:41]
+; GFX940-NEXT:    v_cndmask_b32_e64 v12, v12, v52, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v13, v13, v53, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v41, v15, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[14:15], v[40:41]
+; GFX940-NEXT:    v_cndmask_b32_e64 v12, v54, v12, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v13, v55, v13, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v53, v56, v52, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v40, v14, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v52, 0, v52, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[40:41], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[52:53]
+; GFX940-NEXT:    v_cndmask_b32_e32 v14, v52, v14, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v14, v14, v40, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v15, v53, v15, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(3)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[16:17], v[44:45]
+; GFX940-NEXT:    v_cndmask_b32_e64 v14, v52, v14, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v15, v15, v41, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v45, v17, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[16:17], v[44:45]
+; GFX940-NEXT:    v_cndmask_b32_e64 v15, v53, v15, s[2:3]
+; GFX940-NEXT:    v_accvgpr_read_b32 v59, a11 ; Reload Reuse
+; GFX940-NEXT:    v_cndmask_b32_e64 v53, v56, v52, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v44, v16, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v52, 0, v52, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[16:17], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[44:45], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[52:53]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v52, v16, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, v16, v44, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v17, v53, v17, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[18:19], v[46:47]
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, v52, v16, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v17, v45, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v47, v19, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[18:19], v[46:47]
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v53, v17, s[2:3]
+; GFX940-NEXT:    v_accvgpr_read_b32 v58, a10 ; Reload Reuse
+; GFX940-NEXT:    v_cndmask_b32_e64 v53, v56, v52, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v46, v18, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v52, 0, v52, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[18:19], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[46:47], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[52:53]
+; GFX940-NEXT:    v_cndmask_b32_e32 v18, v52, v18, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v18, v18, v46, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v19, v53, v19, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[20:21], v[50:51]
+; GFX940-NEXT:    v_cndmask_b32_e64 v18, v52, v18, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v19, v19, v47, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v51, v21, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[20:21], v[50:51]
+; GFX940-NEXT:    v_cndmask_b32_e64 v19, v53, v19, s[2:3]
+; GFX940-NEXT:    v_accvgpr_read_b32 v57, a9 ; Reload Reuse
+; GFX940-NEXT:    v_cndmask_b32_e64 v53, v56, v52, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v50, v20, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v52, 0, v52, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[20:21], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[50:51], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[52:53]
+; GFX940-NEXT:    v_cndmask_b32_e32 v20, v52, v20, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v21, v53, v21, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[22:23], v[48:49]
+; GFX940-NEXT:    v_cndmask_b32_e64 v20, v20, v50, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v21, v21, v51, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v50, v49, v23, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[22:23], v[48:49]
+; GFX940-NEXT:    v_cndmask_b32_e64 v20, v52, v20, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v21, v53, v21, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v51, v56, v50, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v50, v48, v22, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v50, 0, v50, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[22:23], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[48:49], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[50:51]
+; GFX940-NEXT:    v_cndmask_b32_e32 v22, v50, v22, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v23, v51, v23, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[24:25], v[38:39]
+; GFX940-NEXT:    v_cndmask_b32_e64 v22, v22, v48, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v23, v23, v49, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v48, v39, v25, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[24:25], v[38:39]
+; GFX940-NEXT:    v_cndmask_b32_e64 v22, v50, v22, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v23, v51, v23, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v49, v56, v48, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v48, v38, v24, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[24:25], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[38:39], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[48:49]
+; GFX940-NEXT:    v_cndmask_b32_e32 v24, v48, v24, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v25, v49, v25, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[26:27], v[36:37]
+; GFX940-NEXT:    v_cndmask_b32_e64 v24, v24, v38, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v25, v25, v39, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v38, v37, v27, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[26:27], v[36:37]
+; GFX940-NEXT:    v_cndmask_b32_e64 v24, v48, v24, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v25, v49, v25, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v39, v56, v38, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v38, v36, v26, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v38, 0, v38, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[26:27], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[36:37], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[38:39]
+; GFX940-NEXT:    v_cndmask_b32_e32 v26, v38, v26, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v27, v39, v27, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[28:29], v[34:35]
+; GFX940-NEXT:    v_cndmask_b32_e64 v26, v26, v36, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v27, v27, v37, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v36, v35, v29, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[28:29], v[34:35]
+; GFX940-NEXT:    v_cndmask_b32_e64 v26, v38, v26, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v27, v39, v27, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v37, v56, v36, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v36, v34, v28, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v36, 0, v36, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[28:29], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[34:35], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[36:37]
+; GFX940-NEXT:    v_cndmask_b32_e32 v28, v36, v28, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v29, v37, v29, vcc
+; GFX940-NEXT:    v_cmp_gt_f64_e32 vcc, v[30:31], v[32:33]
+; GFX940-NEXT:    v_cndmask_b32_e64 v28, v28, v34, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v29, v29, v35, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v34, v33, v31, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[30:31], v[32:33]
+; GFX940-NEXT:    v_cndmask_b32_e64 v28, v36, v28, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v29, v37, v29, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v35, v56, v34, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v34, v32, v30, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[30:31], 64
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[32:33], 64
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[34:35]
+; GFX940-NEXT:    v_cndmask_b32_e32 v30, v34, v30, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v31, v35, v31, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v30, v30, v32, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v31, v31, v33, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v30, v34, v30, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v31, v35, v31, s[2:3]
+; GFX940-NEXT:    v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v47, a7 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v46, a6 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v45, a5 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v41, a1 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximum_v16f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x20
+; GFX10-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:16
+; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:32
+; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:28
+; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:36
+; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:40
+; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:56
+; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:48
+; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:44
+; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:64
+; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:60
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GFX10-NEXT:    buffer_load_dword v83, off, s[0:3], s32 offset:80
+; GFX10-NEXT:    buffer_load_dword v82, off, s[0:3], s32 offset:76
+; GFX10-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:88
+; GFX10-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:96
+; GFX10-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:92
+; GFX10-NEXT:    buffer_load_dword v70, off, s[0:3], s32 offset:100
+; GFX10-NEXT:    buffer_load_dword v71, off, s[0:3], s32 offset:104
+; GFX10-NEXT:    buffer_load_dword v81, off, s[0:3], s32 offset:112
+; GFX10-NEXT:    buffer_load_dword v80, off, s[0:3], s32 offset:108
+; GFX10-NEXT:    buffer_load_dword v85, off, s[0:3], s32 offset:120
+; GFX10-NEXT:    buffer_load_dword v84, off, s[0:3], s32 offset:116
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    buffer_load_dword v87, off, s[0:3], s32 offset:128
+; GFX10-NEXT:    buffer_load_dword v86, off, s[0:3], s32 offset:124
+; GFX10-NEXT:    v_cmp_class_f64_e64 s10, v[0:1], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s12, v[2:3], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s17, v[10:11], 64
+; GFX10-NEXT:    s_waitcnt vmcnt(31)
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[64:65]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s4, v[0:1], v[64:65]
+; GFX10-NEXT:    s_waitcnt vmcnt(29)
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s5, v[2:3], v[54:55]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[2:3], v[54:55]
+; GFX10-NEXT:    s_waitcnt vmcnt(27)
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s7, v[4:5], v[52:53]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s8, v[4:5], v[52:53]
+; GFX10-NEXT:    s_waitcnt vmcnt(25)
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s9, v[6:7], v[50:51]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s11, v[6:7], v[50:51]
+; GFX10-NEXT:    s_waitcnt vmcnt(23)
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s13, v[8:9], v[48:49]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s14, v[64:65], 64
+; GFX10-NEXT:    s_waitcnt vmcnt(21)
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s15, v[12:13], v[36:37]
+; GFX10-NEXT:    s_waitcnt vmcnt(17)
+; GFX10-NEXT:    v_cmp_o_f64_e64 s16, v[14:15], v[34:35]
+; GFX10-NEXT:    v_cndmask_b32_e32 v96, v64, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v97, v54, v2, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v99, v55, v3, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v100, v52, v4, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v96, 0, v96, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v101, v50, v6, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v98, 0, v97, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v97, v65, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[54:55], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v96, v0, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v99, 0x7ff80000, v99, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v98, v2, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v97, 0x7ff80000, v97, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v100, 0, v100, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v102, 0, v101, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v99, v3, s12
+; GFX10-NEXT:    v_cmp_class_f64_e64 s12, v[6:7], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v97, v1, s10
+; GFX10-NEXT:    v_cmp_class_f64_e64 s10, v[4:5], 64
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s4, v[10:11], v[38:39]
+; GFX10-NEXT:    v_cndmask_b32_e64 v112, v48, v8, s13
+; GFX10-NEXT:    v_cmp_o_f64_e64 s5, v[12:13], v[36:37]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s6, v[14:15], v[34:35]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v64, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v65, s14
+; GFX10-NEXT:    v_cmp_class_f64_e64 s14, v[52:53], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v113, v36, v12, s15
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_o_f64_e64 s18, v[30:31], v[86:87]
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v54, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v54, v53, v5, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v55, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[50:51], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, v51, v7, s9
+; GFX10-NEXT:    v_cmp_o_f64_e64 s9, v[8:9], v[48:49]
+; GFX10-NEXT:    v_cndmask_b32_e64 v101, 0x7ff80000, v54, s8
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s7, v[16:17], v[32:33]
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v102, v6, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v103, 0x7ff80000, v55, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v100, v4, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v101, v5, s10
+; GFX10-NEXT:    v_cmp_class_f64_e64 s10, v[8:9], 64
+; GFX10-NEXT:    v_cmp_o_f64_e64 s11, v[10:11], v[38:39]
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v103, v7, s12
+; GFX10-NEXT:    v_cmp_class_f64_e64 s12, v[48:49], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v114, v38, v10, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v115, v34, v14, s6
+; GFX10-NEXT:    v_cmp_o_f64_e64 s8, v[16:17], v[32:33]
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v52, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v53, s14
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s14, v[18:19], v[82:83]
+; GFX10-NEXT:    v_cndmask_b32_e64 v52, 0, v115, s16
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v50, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, v49, v9, s13
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v51, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[38:39], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v54, 0, v112, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, v39, v11, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, 0x7ff80000, v50, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, 0, v113, s5
+; GFX10-NEXT:    v_cmp_o_f64_e64 s4, v[18:19], v[82:83]
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v54, v8, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v64, 0, v114, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v55, v9, s10
+; GFX10-NEXT:    v_cmp_class_f64_e64 s10, v[12:13], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v65, 0x7ff80000, v51, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v48, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v64, v10, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v49, s12
+; GFX10-NEXT:    v_cmp_class_f64_e64 s12, v[14:15], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v65, v11, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, v37, v13, s15
+; GFX10-NEXT:    v_cmp_class_f64_e64 s17, v[34:35], 64
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s9, v[20:21], v[66:67]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s11, v[20:21], v[66:67]
+; GFX10-NEXT:    v_cndmask_b32_e64 v116, v32, v16, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, 0x7ff80000, v48, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v38, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v39, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[22:23], v[68:69]
+; GFX10-NEXT:    v_cndmask_b32_e64 v38, v35, v15, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, v82, v18, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, 0, v116, s8
+; GFX10-NEXT:    v_cmp_class_f64_e64 s13, v[36:37], 64
+; GFX10-NEXT:    v_cmp_o_f64_e64 s5, v[22:23], v[68:69]
+; GFX10-NEXT:    v_cndmask_b32_e64 v53, 0x7ff80000, v38, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v50, v12, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v51, v13, s10
+; GFX10-NEXT:    v_cmp_class_f64_e64 s10, v[16:17], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v38, 0, v49, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v112, v83, v19, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v52, v14, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v53, v15, s12
+; GFX10-NEXT:    v_cmp_class_f64_e64 s12, v[32:33], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s14, v[18:19], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v114, v67, v21, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v14, v34, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v15, v35, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, v33, v17, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v39, 0x7ff80000, v112, s4
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s4, v[24:25], v[70:71]
+; GFX10-NEXT:    v_cndmask_b32_e32 v113, v69, v23, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v35, v68, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[20:21], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, 0x7ff80000, v34, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, 0x7ff80000, v114, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v36, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v13, v37, s13
+; GFX10-NEXT:    v_cmp_class_f64_e64 s13, v[82:83], 64
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[24:25], v[70:71]
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v48, v16, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v49, v17, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, 0x7ff80000, v113, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s5
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s7, v[26:27], v[80:81]
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v32, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v32, v66, v20, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v17, v33, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v38, v18, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v39, v19, s14
+; GFX10-NEXT:    v_cmp_o_f64_e64 s15, v[26:27], v[80:81]
+; GFX10-NEXT:    v_cndmask_b32_e64 v33, 0, v32, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v112, v71, v25, s4
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s16, v[28:29], v[84:85]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s8, v[28:29], v[84:85]
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v34, v21, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v33, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[22:23], 64
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s17, v[30:31], v[86:87]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[70:71], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v18, v82, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v82, v70, v24, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v19, v83, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v83, 0x7ff80000, v112, s6
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[68:69], 64
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s9, 0, v[96:97]
+; GFX10-NEXT:    v_cndmask_b32_e64 v82, 0, v82, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v37, v81, v27, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v32, v80, v26, s7
+; GFX10-NEXT:    v_cmp_class_f64_e64 s6, v[80:81], 64
+; GFX10-NEXT:    v_cmp_class_f64_e64 s7, v[84:85], 64
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s10, 0, v[98:99]
+; GFX10-NEXT:    v_cndmask_b32_e64 v113, 0x7ff80000, v37, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v112, 0, v32, s15
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s11, 0, v[100:101]
+; GFX10-NEXT:    v_cndmask_b32_e64 v115, v85, v29, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v114, v84, v28, s16
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s12, 0, v[102:103]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s13, 0, v[54:55]
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v35, v22, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v36, v23, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[24:25], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v115, 0x7ff80000, v115, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v114, 0, v114, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v116, v87, v31, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v32, v86, v30, s17
+; GFX10-NEXT:    v_cmp_class_f64_e64 s8, v[86:87], 64
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s14, 0, v[64:65]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s15, 0, v[50:51]
+; GFX10-NEXT:    v_cndmask_b32_e64 v117, 0x7ff80000, v116, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v116, 0, v32, s18
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s16, 0, v[52:53]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s17, 0, v[48:49]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s18, 0, v[38:39]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s19, 0, v[33:34]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s20, 0, v[35:36]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s21, 0, v[82:83]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s22, 0, v[112:113]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s23, 0, v[114:115]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s24, 0, v[116:117]
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v22, v68, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v23, v69, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v96, v0, s9
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v82, v24, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v25, v83, v25, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[26:27], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v98, v2, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v100, v4, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, v24, v70, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, v25, v71, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v102, v6, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v54, v8, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v64, v10, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v50, v12, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v52, v14, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v48, v16, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v38, v18, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v35, v22, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, v82, v24, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v97, v1, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v99, v3, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v101, v5, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v103, v7, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v55, v9, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v65, v11, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v51, v13, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v53, v15, s16
+; GFX10-NEXT:    v_cndmask_b32_e32 v26, v112, v26, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v27, v113, v27, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[28:29], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v49, v17, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v39, v19, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, v26, v80, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, v27, v81, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v36, v23, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, v83, v25, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, v112, v26, s22
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, v113, v27, s22
+; GFX10-NEXT:    v_cndmask_b32_e32 v28, v114, v28, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v29, v115, v29, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[30:31], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v28, v28, v84, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, v29, v85, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v28, v114, v28, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, v115, v29, s23
+; GFX10-NEXT:    v_cndmask_b32_e32 v30, v116, v30, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v117, v31, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[66:67], 64
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, v30, v86, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v31, v31, v87, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, v116, v30, s24
+; GFX10-NEXT:    v_cndmask_b32_e64 v31, v117, v31, s24
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v66, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v67, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v33, v20, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v34, v21, s19
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximum_v16f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_b32 v87, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v86, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v85, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v84, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_b32 v65, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_b32 v64, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_b32 v67, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_b32 v66, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_b32 v69, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_b32 v68, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_b32 v71, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_b32 v70, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_b32 v81, off, s32 offset:120
+; GFX11-NEXT:    scratch_load_b32 v80, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v83, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_b32 v82, off, s32 offset:124
+; GFX11-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s9, v[0:1], v[86:87]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s11, v[0:1], v[86:87]
+; GFX11-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s10, v[2:3], v[84:85]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s14, v[86:87], 64
+; GFX11-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[4:5], v[32:33]
+; GFX11-NEXT:    v_cmp_o_f64_e32 vcc_lo, v[4:5], v[32:33]
+; GFX11-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s2, v[6:7], v[34:35]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s12, v[2:3], v[84:85]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s1, v[6:7], v[34:35]
+; GFX11-NEXT:    s_waitcnt vmcnt(23)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s4, v[8:9], v[36:37]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s3, v[8:9], v[36:37]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s16, v[84:85], 64
+; GFX11-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s6, v[10:11], v[38:39]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s5, v[10:11], v[38:39]
+; GFX11-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s8, v[12:13], v[48:49]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s7, v[12:13], v[48:49]
+; GFX11-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s13, v[14:15], v[50:51]
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    v_cmp_o_f64_e64 s15, v[16:17], v[52:53]
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s17, v[18:19], v[54:55]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s18, v[18:19], v[54:55]
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s19, v[20:21], v[64:65]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s20, v[20:21], v[64:65]
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s21, v[22:23], v[66:67]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s22, v[22:23], v[66:67]
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s23, v[24:25], v[68:69]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s24, v[24:25], v[68:69]
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s25, v[26:27], v[70:71]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s26, v[26:27], v[70:71]
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s27, v[28:29], v[80:81]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s28, v[28:29], v[80:81]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s29, v[30:31], v[82:83]
+; GFX11-NEXT:    v_cmp_o_f64_e64 vcc_hi, v[30:31], v[82:83]
+; GFX11-NEXT:    v_cndmask_b32_e64 v96, v87, v1, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v101, v86, v0, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v98, v85, v3, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v103, v84, v2, s10
+; GFX11-NEXT:    v_cmp_class_f64_e64 s10, v[0:1], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v97, 0x7ff80000, v96, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v96, 0, v101, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v100, v33, v5, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v102, v35, v7, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v99, 0x7ff80000, v98, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v98, 0, v103, s12
+; GFX11-NEXT:    v_cmp_class_f64_e64 s11, v[2:3], 64
+; GFX11-NEXT:    v_cndmask_b32_e32 v101, 0x7ff80000, v100, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v103, 0x7ff80000, v102, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v112, v37, v9, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v114, v39, v11, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v116, v49, v13, s8
+; GFX11-NEXT:    v_cmp_o_f64_e64 s9, v[14:15], v[50:51]
+; GFX11-NEXT:    v_cndmask_b32_e64 v118, v51, v15, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v113, 0x7ff80000, v112, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v115, 0x7ff80000, v114, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v117, 0x7ff80000, v116, s7
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s12, v[16:17], v[52:53]
+; GFX11-NEXT:    v_cndmask_b32_e64 v130, v55, v19, s17
+; GFX11-NEXT:    v_cndmask_b32_e64 v132, v65, v21, s19
+; GFX11-NEXT:    v_cndmask_b32_e64 v134, v67, v23, s21
+; GFX11-NEXT:    v_cndmask_b32_e64 v144, v69, v25, s23
+; GFX11-NEXT:    v_cndmask_b32_e64 v145, v71, v27, s25
+; GFX11-NEXT:    v_cndmask_b32_e64 v131, 0x7ff80000, v130, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v133, 0x7ff80000, v132, s20
+; GFX11-NEXT:    v_cndmask_b32_e64 v135, 0x7ff80000, v134, s22
+; GFX11-NEXT:    v_cndmask_b32_e64 v146, v81, v29, s27
+; GFX11-NEXT:    v_cndmask_b32_e64 v148, v80, v28, s27
+; GFX11-NEXT:    v_cndmask_b32_e64 v147, v83, v31, s29
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v147, 0x7ff80000, v147, vcc_hi
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v96, v0, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v97, v1, s10
+; GFX11-NEXT:    v_cmp_class_f64_e64 s10, v[36:37], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v86, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v86, v32, v4, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v87, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v87, v34, v6, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v98, v2, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v99, v3, s11
+; GFX11-NEXT:    v_cndmask_b32_e32 v100, 0, v86, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[4:5], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v102, 0, v87, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v84, s16
+; GFX11-NEXT:    v_cndmask_b32_e64 v84, v36, v8, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v86, v38, v10, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v87, v48, v12, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v119, 0x7ff80000, v118, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v128, v53, v17, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v112, 0, v84, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v114, 0, v86, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v116, 0, v87, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v84, v50, v14, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v129, 0x7ff80000, v128, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v86, v52, v16, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v87, v54, v18, s17
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v85, s16
+; GFX11-NEXT:    v_cndmask_b32_e64 v118, 0, v84, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v84, v64, v20, s19
+; GFX11-NEXT:    v_cndmask_b32_e64 v128, 0, v86, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v130, 0, v87, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v86, v66, v22, s21
+; GFX11-NEXT:    v_cndmask_b32_e64 v85, 0x7ff80000, v144, s24
+; GFX11-NEXT:    v_cndmask_b32_e64 v132, 0, v84, s20
+; GFX11-NEXT:    v_cndmask_b32_e64 v87, v68, v24, s23
+; GFX11-NEXT:    v_cndmask_b32_e64 v144, v70, v26, s25
+; GFX11-NEXT:    v_cndmask_b32_e64 v134, 0, v86, s22
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[68:69], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[70:71], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v84, 0, v87, s24
+; GFX11-NEXT:    v_cndmask_b32_e64 v87, 0x7ff80000, v145, s26
+; GFX11-NEXT:    v_cndmask_b32_e64 v86, 0, v144, s26
+; GFX11-NEXT:    v_cndmask_b32_e64 v145, 0x7ff80000, v146, s28
+; GFX11-NEXT:    v_cndmask_b32_e64 v144, 0, v148, s28
+; GFX11-NEXT:    v_cndmask_b32_e64 v146, v82, v30, s29
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[80:81], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[82:83], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s6, v[32:33], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s8, v[34:35], 64
+; GFX11-NEXT:    v_dual_cndmask_b32 v5, v101, v5 :: v_dual_cndmask_b32 v4, v100, v4
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[6:7], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v146, 0, v146, vcc_hi
+; GFX11-NEXT:    v_cmp_class_f64_e64 s12, v[38:39], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s14, v[48:49], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s16, v[50:51], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s18, v[52:53], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s20, v[54:55], 64
+; GFX11-NEXT:    v_cmp_class_f64_e64 s21, v[64:65], 64
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s4, 0, v[96:97]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s5, 0, v[98:99]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s7, 0, v[100:101]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s9, 0, v[102:103]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s11, 0, v[112:113]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s13, 0, v[114:115]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s15, 0, v[116:117]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s17, 0, v[118:119]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s19, 0, v[128:129]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s22, 0, v[130:131]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s23, 0, v[132:133]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s24, 0, v[134:135]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s25, 0, v[84:85]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s26, 0, v[86:87]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s27, 0, v[144:145]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s28, 0, v[146:147]
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v33, s6
+; GFX11-NEXT:    v_dual_cndmask_b32 v7, v103, v7 :: v_dual_cndmask_b32 v6, v102, v6
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[8:9], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v32, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, v35, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v96, v0, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v98, v2, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v100, v4, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v97, v1, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v99, v3, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v101, v5, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v103, v7, s9
+; GFX11-NEXT:    v_dual_cndmask_b32 v9, v113, v9 :: v_dual_cndmask_b32 v8, v112, v8
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[10:11], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, v34, s8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v9, v37, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v102, v6, s9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v113, v9, s11
+; GFX11-NEXT:    v_dual_cndmask_b32 v11, v115, v11 :: v_dual_cndmask_b32 v10, v114, v10
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[12:13], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v36, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v11, v39, s12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v112, v8, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v115, v11, s13
+; GFX11-NEXT:    v_dual_cndmask_b32 v13, v117, v13 :: v_dual_cndmask_b32 v12, v116, v12
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[14:15], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v10, v38, s12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v13, v49, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v114, v10, s13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v117, v13, s15
+; GFX11-NEXT:    v_dual_cndmask_b32 v15, v119, v15 :: v_dual_cndmask_b32 v14, v118, v14
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[16:17], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v12, v48, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, v15, v51, s16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v116, v12, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, v119, v15, s17
+; GFX11-NEXT:    v_dual_cndmask_b32 v17, v129, v17 :: v_dual_cndmask_b32 v16, v128, v16
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[18:19], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v14, v50, s16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v17, v17, v53, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v118, v14, s17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v17, v129, v17, s19
+; GFX11-NEXT:    v_dual_cndmask_b32 v19, v131, v19 :: v_dual_cndmask_b32 v18, v130, v18
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[20:21], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v16, v16, v52, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v19, v19, v55, s20
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v16, v128, v16, s19
+; GFX11-NEXT:    v_cndmask_b32_e64 v19, v131, v19, s22
+; GFX11-NEXT:    v_dual_cndmask_b32 v21, v133, v21 :: v_dual_cndmask_b32 v20, v132, v20
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[22:23], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, v18, v54, s20
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v21, v21, v65, s21
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, v130, v18, s22
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v21, v133, v21, s23
+; GFX11-NEXT:    v_dual_cndmask_b32 v23, v135, v23 :: v_dual_cndmask_b32 v22, v134, v22
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[24:25], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v20, v20, v64, s21
+; GFX11-NEXT:    v_cndmask_b32_e64 v20, v132, v20, s23
+; GFX11-NEXT:    v_dual_cndmask_b32 v25, v85, v25 :: v_dual_cndmask_b32 v24, v84, v24
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[26:27], 64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v25, v25, v69, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v25, v85, v25, s25
+; GFX11-NEXT:    v_dual_cndmask_b32 v27, v87, v27 :: v_dual_cndmask_b32 v26, v86, v26
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[28:29], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v24, v24, v68, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v27, v27, v71, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v24, v84, v24, s25
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v27, v87, v27, s26
+; GFX11-NEXT:    v_dual_cndmask_b32 v29, v145, v29 :: v_dual_cndmask_b32 v28, v144, v28
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[30:31], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v26, v26, v70, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v29, v29, v81, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v26, v86, v26, s26
+; GFX11-NEXT:    v_cndmask_b32_e64 v29, v145, v29, s27
+; GFX11-NEXT:    v_dual_cndmask_b32 v31, v147, v31 :: v_dual_cndmask_b32 v30, v146, v30
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[66:67], 64
+; GFX11-NEXT:    v_cndmask_b32_e64 v28, v28, v80, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v31, v31, v83, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v28, v144, v28, s27
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v31, v147, v31, s28
+; GFX11-NEXT:    v_dual_cndmask_b32 v23, v23, v67 :: v_dual_cndmask_b32 v22, v22, v66
+; GFX11-NEXT:    v_cndmask_b32_e64 v30, v30, v82, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v23, v135, v23, s24
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v22, v134, v22, s24
+; GFX11-NEXT:    v_cndmask_b32_e64 v30, v146, v30, s28
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximum_v16f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_clause 0x1b
+; GFX12-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX12-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX12-NEXT:    scratch_load_b32 v35, off, s32 offset:16
+; GFX12-NEXT:    scratch_load_b32 v34, off, s32 offset:12
+; GFX12-NEXT:    scratch_load_b32 v31, off, s32
+; GFX12-NEXT:    scratch_load_b32 v37, off, s32 offset:120
+; GFX12-NEXT:    scratch_load_b32 v39, off, s32 offset:104
+; GFX12-NEXT:    scratch_load_b32 v49, off, s32 offset:24
+; GFX12-NEXT:    scratch_load_b32 v48, off, s32 offset:20
+; GFX12-NEXT:    scratch_load_b32 v51, off, s32 offset:32
+; GFX12-NEXT:    scratch_load_b32 v50, off, s32 offset:28
+; GFX12-NEXT:    scratch_load_b32 v53, off, s32 offset:40
+; GFX12-NEXT:    scratch_load_b32 v52, off, s32 offset:36
+; GFX12-NEXT:    scratch_load_b32 v55, off, s32 offset:48
+; GFX12-NEXT:    scratch_load_b32 v54, off, s32 offset:44
+; GFX12-NEXT:    scratch_load_b32 v65, off, s32 offset:56
+; GFX12-NEXT:    scratch_load_b32 v64, off, s32 offset:52
+; GFX12-NEXT:    scratch_load_b32 v67, off, s32 offset:64
+; GFX12-NEXT:    scratch_load_b32 v66, off, s32 offset:60
+; GFX12-NEXT:    scratch_load_b32 v69, off, s32 offset:72
+; GFX12-NEXT:    scratch_load_b32 v68, off, s32 offset:68
+; GFX12-NEXT:    scratch_load_b32 v71, off, s32 offset:80
+; GFX12-NEXT:    scratch_load_b32 v70, off, s32 offset:76
+; GFX12-NEXT:    scratch_load_b32 v81, off, s32 offset:88
+; GFX12-NEXT:    scratch_load_b32 v80, off, s32 offset:84
+; GFX12-NEXT:    scratch_load_b32 v83, off, s32 offset:96
+; GFX12-NEXT:    scratch_load_b32 v82, off, s32 offset:92
+; GFX12-NEXT:    scratch_load_b32 v38, off, s32 offset:100
+; GFX12-NEXT:    s_wait_loadcnt 0x1a
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[32:33]
+; GFX12-NEXT:    s_clause 0x2
+; GFX12-NEXT:    scratch_load_b32 v33, off, s32 offset:112
+; GFX12-NEXT:    scratch_load_b32 v32, off, s32 offset:108
+; GFX12-NEXT:    scratch_load_b32 v36, off, s32 offset:116
+; GFX12-NEXT:    s_wait_loadcnt 0x1b
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[2:3], v[34:35]
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    scratch_load_b32 v35, off, s32 offset:128
+; GFX12-NEXT:    scratch_load_b32 v34, off, s32 offset:124
+; GFX12-NEXT:    s_wait_loadcnt 0x18
+; GFX12-NEXT:    v_maximum_f64 v[4:5], v[4:5], v[48:49]
+; GFX12-NEXT:    s_wait_loadcnt 0x16
+; GFX12-NEXT:    v_maximum_f64 v[6:7], v[6:7], v[50:51]
+; GFX12-NEXT:    s_wait_loadcnt 0x14
+; GFX12-NEXT:    v_maximum_f64 v[8:9], v[8:9], v[52:53]
+; GFX12-NEXT:    s_wait_loadcnt 0x12
+; GFX12-NEXT:    v_maximum_f64 v[10:11], v[10:11], v[54:55]
+; GFX12-NEXT:    s_wait_loadcnt 0x10
+; GFX12-NEXT:    v_maximum_f64 v[12:13], v[12:13], v[64:65]
+; GFX12-NEXT:    s_wait_loadcnt 0xe
+; GFX12-NEXT:    v_maximum_f64 v[14:15], v[14:15], v[66:67]
+; GFX12-NEXT:    s_wait_loadcnt 0xc
+; GFX12-NEXT:    v_maximum_f64 v[16:17], v[16:17], v[68:69]
+; GFX12-NEXT:    s_wait_loadcnt 0xa
+; GFX12-NEXT:    v_maximum_f64 v[18:19], v[18:19], v[70:71]
+; GFX12-NEXT:    s_wait_loadcnt 0x8
+; GFX12-NEXT:    v_maximum_f64 v[20:21], v[20:21], v[80:81]
+; GFX12-NEXT:    s_wait_loadcnt 0x6
+; GFX12-NEXT:    v_maximum_f64 v[22:23], v[22:23], v[82:83]
+; GFX12-NEXT:    s_wait_loadcnt 0x5
+; GFX12-NEXT:    v_maximum_f64 v[24:25], v[24:25], v[38:39]
+; GFX12-NEXT:    s_wait_loadcnt 0x3
+; GFX12-NEXT:    v_maximum_f64 v[26:27], v[26:27], v[32:33]
+; GFX12-NEXT:    s_wait_loadcnt 0x2
+; GFX12-NEXT:    v_maximum_f64 v[28:29], v[28:29], v[36:37]
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[30:31], v[30:31], v[34:35]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <16 x double> @llvm.maximum.v16f64(<16 x double> %src0, <16 x double> %src1)
+  ret <16 x double> %op
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
new file mode 100644
index 000000000000..95d351e8f1fa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -0,0 +1,2181 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; xUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+define half @v_minimum_f16(half %src0, half %src1) {
+; GFX8-LABEL: v_minimum_f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call half @llvm.minimum.f16(half %src0, half %src1)
+  ret half %op
+}
+
+define half @v_minimum_f16__nnan(half %src0, half %src1) {
+; GFX8-LABEL: v_minimum_f16__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f16__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f16__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f16__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f16__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan half @llvm.minimum.f16(half %src0, half %src1)
+  ret half %op
+}
+
+define half @v_minimum_f16__nsz(half %src0, half %src1) {
+; GFX8-LABEL: v_minimum_f16__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f16__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f16__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f16__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f16__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f16__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz half @llvm.minimum.f16(half %src0, half %src1)
+  ret half %op
+}
+
+define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) {
+; GFX8-LABEL: v_minimum_f16__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f16__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f16__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f16__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f16__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz half @llvm.minimum.f16(half %src0, half %src1)
+  ret half %op
+}
+
+define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) {
+; GFX8-LABEL: v_minimum_f16__nnan_src0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f16__nnan_src0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f16__nnan_src0:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX940-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f16__nnan_src0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX10-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f16__nnan_src0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f16__nnan_src0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %src0 = fadd nnan half %arg0, 1.0
+  %op = call half @llvm.minimum.f16(half %src0, half %src1)
+  ret half %op
+}
+
+define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) {
+; GFX8-LABEL: v_minimum_f16__nnan_src1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f16__nnan_src1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f16__nnan_src1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX940-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f16__nnan_src1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX10-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f16__nnan_src1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f16__nnan_src1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %src1 = fadd nnan half %arg1, 1.0
+  %op = call half @llvm.minimum.f16(half %src0, half %src1)
+  ret half %op
+}
+
+define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
+; GFX8-LABEL: s_minimum_f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NEXT:    v_min_f16_e32 v1, s4, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use v0
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_minimum_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_min_f16_e32 v1, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_minimum_f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_min_f16_e32 v1, s0, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_minimum_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f16_e64 v0, s4, s5
+; GFX10-NEXT:    v_cmp_o_f16_e64 vcc_lo, s4, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_minimum_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f16_e64 v0, s0, s1
+; GFX11-NEXT:    v_cmp_o_f16_e64 vcc_lo, s0, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_minimum_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_minimum_f16 s0, s0, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s0
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call half @llvm.minimum.f16(half %src0, half %src1)
+  %cast = bitcast half %op to i16
+  %zext = zext i16 %cast to i32
+  call void asm sideeffect "; use $0", "s"(i32 %zext)
+  ret void
+}
+
+define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) {
+; GFX8-LABEL: v_minimum_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_cmp_lt_f16_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v3, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v2, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f16_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v0, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v1, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f16_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v2f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_min_f16 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v2, v0, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
+  ret <2 x half> %op
+}
+
+define <2 x half> @v_minimum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
+; GFX8-LABEL: v_minimum_v2f16__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_cmp_lt_f16_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v3, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v2, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f16_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v0, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, v1, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f16_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v2f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v2f16__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v2f16__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v2f16__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v2f16__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
+  ret <2 x half> %op
+}
+
+define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
+; GFX8-LABEL: v_minimum_v2f16__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_cmp_lt_f16_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX8-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v2f16__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v2f16__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_min_f16 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v2f16__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v2f16__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v2, v0, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v2f16__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
+  ret <2 x half> %op
+}
+
+define <2 x half> @v_minimum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) {
+; GFX8-LABEL: v_minimum_v2f16__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_cmp_lt_f16_e32 vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v2f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v2f16__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v2f16__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v2f16__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v2f16__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
+  ret <2 x half> %op
+}
+
+define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
+; GFX8-LABEL: s_minimum_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s6, s5, 16
+; GFX8-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_cmp_lt_f16_e32 vcc, s7, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, s7, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, s7, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, s6, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f16_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_cmp_lt_f16_e32 vcc, s4, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, s4, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, s4, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e64 vcc, s5, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_f16_e32 vcc, 0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use v0
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_minimum_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshr_b32 s5, s5, 16
+; GFX9-NEXT:    v_pk_min_f16 v1, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s4, v0
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s4, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_minimum_v2f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX940-NEXT:    v_pk_min_f16 v1, s0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX940-NEXT:    v_mov_b32_e32 v3, s1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, s0, v3
+; GFX940-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX940-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_minimum_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v0, s4, s5
+; GFX10-NEXT:    v_cmp_o_f16_e64 vcc_lo, s4, s5
+; GFX10-NEXT:    s_lshr_b32 s6, s5, 16
+; GFX10-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e64 vcc_lo, s4, s6
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
+; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_minimum_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v0, s0, s1
+; GFX11-NEXT:    v_cmp_o_f16_e64 vcc_lo, s0, s1
+; GFX11-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e64 vcc_lo, s0, s2
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
+; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_minimum_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, s0, s1
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use v0
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
+  %cast = bitcast <2 x half> %op to i32
+  call void asm sideeffect "; use $0", "s"(i32 %cast)
+  ret void
+}
+
+define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) {
+; GFX8-LABEL: v_minimum_v3f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_min_f16_e32 v6, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
+; GFX8-NEXT:    v_min_f16_e32 v5, v1, v3
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX8-NEXT:    v_min_f16_e32 v3, v0, v2
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v3f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v3f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX940-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v3f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v4, v0, v2
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_min_f16 v2, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v3f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v4, v0, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
+; GFX11-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v3f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
+  ret <3 x half> %op
+}
+
+define <3 x half> @v_minimum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
+; GFX8-LABEL: v_minimum_v3f16__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_min_f16_e32 v1, v1, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v3f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v3f16__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX940-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v3f16__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX10-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v3f16__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX11-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v3f16__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
+  ret <3 x half> %op
+}
+
+define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
+; GFX8-LABEL: v_minimum_v3f16__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_min_f16_e32 v6, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
+; GFX8-NEXT:    v_min_f16_e32 v5, v1, v3
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX8-NEXT:    v_min_f16_e32 v3, v0, v2
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v3f16__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v3f16__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX940-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v3f16__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v4, v0, v2
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_min_f16 v2, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v3f16__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v4, v0, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
+; GFX11-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v3f16__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
+  ret <3 x half> %op
+}
+
+define <3 x half> @v_minimum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) {
+; GFX8-LABEL: v_minimum_v3f16__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_min_f16_e32 v1, v1, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v3f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v3f16__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX940-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v3f16__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX10-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v3f16__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX11-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v3f16__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
+  ret <3 x half> %op
+}
+
+define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) {
+; GFX8-LABEL: v_minimum_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_min_f16_e32 v6, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT:    v_min_f16_e32 v8, v6, v5
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
+; GFX8-NEXT:    v_min_f16_e32 v6, v1, v3
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX8-NEXT:    v_min_f16_e32 v3, v0, v2
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v4f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX940-NEXT:    v_perm_b32 v1, v1, v6, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_pk_min_f16 v5, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v6, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-NEXT:    v_pk_min_f16 v7, v0, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v4f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <4 x half> @llvm.minimum.v4f16(<4 x half> %src0, <4 x half> %src1)
+  ret <4 x half> %op
+}
+
+define <4 x half> @v_minimum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
+; GFX8-LABEL: v_minimum_v4f16__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v1, v1, v3
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v4f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v4f16__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX940-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v4f16__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX10-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v4f16__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX11-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v4f16__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <4 x half> @llvm.minimum.v4f16(<4 x half> %src0, <4 x half> %src1)
+  ret <4 x half> %op
+}
+
+define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
+; GFX8-LABEL: v_minimum_v4f16__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_min_f16_e32 v6, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT:    v_min_f16_e32 v8, v6, v5
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
+; GFX8-NEXT:    v_min_f16_e32 v6, v1, v3
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX8-NEXT:    v_min_f16_e32 v3, v0, v2
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v4f16__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v4f16__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX940-NEXT:    v_perm_b32 v1, v1, v6, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v4f16__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_pk_min_f16 v5, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v6, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v4f16__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v4, v1, v3
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-NEXT:    v_pk_min_f16 v7, v0, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v4f16__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <4 x half> @llvm.minimum.v4f16(<4 x half> %src0, <4 x half> %src1)
+  ret <4 x half> %op
+}
+
+define <4 x half> @v_minimum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) {
+; GFX8-LABEL: v_minimum_v4f16__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v1, v1, v3
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v4f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v4f16__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX940-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v4f16__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX10-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v4f16__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX11-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v4f16__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <4 x half> @llvm.minimum.v4f16(<4 x half> %src0, <4 x half> %src1)
+  ret <4 x half> %op
+}
+
+define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) {
+; GFX8-LABEL: v_minimum_v8f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_min_f16_e32 v10, v9, v8
+; GFX8-NEXT:    v_mov_b32_e32 v11, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v9, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v11, v10, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX8-NEXT:    v_min_f16_e32 v12, v10, v9
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v10, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v12, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
+; GFX8-NEXT:    v_min_f16_e32 v13, v12, v10
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v12, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v11, v13, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; GFX8-NEXT:    v_min_f16_e32 v14, v13, v12
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v13, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v11, v14, vcc
+; GFX8-NEXT:    v_min_f16_e32 v13, v3, v7
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v13, vcc
+; GFX8-NEXT:    v_min_f16_e32 v7, v2, v6
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v7, vcc
+; GFX8-NEXT:    v_min_f16_e32 v6, v1, v5
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v11, v6, vcc
+; GFX8-NEXT:    v_min_f16_e32 v5, v0, v4
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v11, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v12
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v8f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v8, v3, v7
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
+; GFX9-NEXT:    v_pk_min_f16 v7, v2, v6
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v7, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v7, vcc
+; GFX9-NEXT:    v_pk_min_f16 v6, v1, v5
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v6, vcc
+; GFX9-NEXT:    v_pk_min_f16 v5, v0, v4
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v5, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v6, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v7, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v8, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v10, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v8f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_min_f16 v8, v3, v7
+; GFX940-NEXT:    v_mov_b32_e32 v9, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v3, v7
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_min_f16 v7, v2, v6
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v2, v6
+; GFX940-NEXT:    v_perm_b32 v3, v3, v10, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v9, v7, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_min_f16 v6, v1, v5
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v9, v7, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v5
+; GFX940-NEXT:    v_perm_b32 v2, v2, v8, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_min_f16 v5, v0, v4
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v4
+; GFX940-NEXT:    v_perm_b32 v1, v1, v7, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v5, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v9, v5, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v6, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v8f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v8, v3, v7
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_pk_min_f16 v9, v2, v6
+; GFX10-NEXT:    v_pk_min_f16 v12, v1, v5
+; GFX10-NEXT:    v_pk_min_f16 v13, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0x7e00, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0x7e00, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v13
+; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0x7e00, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v13, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v1, v1, v6, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v3, v3, v10, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v8f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v8, v3, v7
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_pk_min_f16 v10, v2, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX11-NEXT:    v_pk_min_f16 v14, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v12, v11
+; GFX11-NEXT:    v_pk_min_f16 v11, v0, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, 0x7e00, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v13, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v3, v3, v9, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v8f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v4
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT:    v_pk_minimum_f16 v2, v2, v6
+; GFX12-NEXT:    v_pk_minimum_f16 v3, v3, v7
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <8 x half> @llvm.minimum.v8f16(<8 x half> %src0, <8 x half> %src1)
+  ret <8 x half> %op
+}
+
+define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) {
+; GFX8-LABEL: v_minimum_v16f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX8-NEXT:    v_min_f16_e32 v18, v17, v16
+; GFX8-NEXT:    v_mov_b32_e32 v19, 0x7e00
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v17, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v19, v18, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
+; GFX8-NEXT:    v_min_f16_e32 v20, v18, v17
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v18, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
+; GFX8-NEXT:    v_min_f16_e32 v21, v20, v18
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v20, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v19, v21, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
+; GFX8-NEXT:    v_min_f16_e32 v22, v21, v20
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v21, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v19, v22, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v3
+; GFX8-NEXT:    v_min_f16_e32 v23, v22, v21
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v22, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v19, v23, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v2
+; GFX8-NEXT:    v_min_f16_e32 v24, v23, v22
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v23, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v19, v24, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v1
+; GFX8-NEXT:    v_min_f16_e32 v25, v24, v23
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v24, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v19, v25, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
+; GFX8-NEXT:    v_min_f16_e32 v26, v25, v24
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v25, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v24, v19, v26, vcc
+; GFX8-NEXT:    v_min_f16_e32 v25, v7, v15
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v7, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v19, v25, vcc
+; GFX8-NEXT:    v_min_f16_e32 v15, v6, v14
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v6, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v19, v15, vcc
+; GFX8-NEXT:    v_min_f16_e32 v14, v5, v13
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v19, v14, vcc
+; GFX8-NEXT:    v_min_f16_e32 v13, v4, v12
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v4, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v19, v13, vcc
+; GFX8-NEXT:    v_min_f16_e32 v12, v3, v11
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v3, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v12, vcc
+; GFX8-NEXT:    v_min_f16_e32 v11, v2, v10
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v2, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v11, vcc
+; GFX8-NEXT:    v_min_f16_e32 v10, v1, v9
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v19, v10, vcc
+; GFX8-NEXT:    v_min_f16_e32 v9, v0, v8
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v19, v9, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v24
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v23
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v22
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v21
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
+; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v18
+; GFX8-NEXT:    v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v17
+; GFX8-NEXT:    v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v16
+; GFX8-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v16f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v16, v7, v15
+; GFX9-NEXT:    v_mov_b32_e32 v17, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v7, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v17, v16, vcc
+; GFX9-NEXT:    v_pk_min_f16 v15, v6, v14
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v6, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v17, v15, vcc
+; GFX9-NEXT:    v_pk_min_f16 v14, v5, v13
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v14, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v17, v14, vcc
+; GFX9-NEXT:    v_pk_min_f16 v13, v4, v12
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v13, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v17, v13, vcc
+; GFX9-NEXT:    v_pk_min_f16 v12, v3, v11
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v3, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v17, v12, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v12, vcc
+; GFX9-NEXT:    v_pk_min_f16 v11, v2, v10
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v17, v11, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v17, v11, vcc
+; GFX9-NEXT:    v_pk_min_f16 v10, v1, v9
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v17, v10, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v10, vcc
+; GFX9-NEXT:    v_pk_min_f16 v9, v0, v8
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v17, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v17, v9, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v10, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v11, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v12, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v13, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v14, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v15, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v16, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v18, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v16f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_min_f16 v16, v7, v15
+; GFX940-NEXT:    v_mov_b32_e32 v17, 0x7e00
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v7, v15
+; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v18, v17, v16, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_min_f16 v15, v6, v14
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v17, v16, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v6, v14
+; GFX940-NEXT:    v_perm_b32 v7, v7, v18, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_min_f16 v14, v5, v13
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v17, v15, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v5, v13
+; GFX940-NEXT:    v_perm_b32 v6, v6, v16, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v15, v17, v14, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_min_f16 v13, v4, v12
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v17, v14, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v4, v12
+; GFX940-NEXT:    v_perm_b32 v5, v5, v15, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v14, v17, v13, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_min_f16 v12, v3, v11
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v17, v13, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v3, v11
+; GFX940-NEXT:    v_perm_b32 v4, v4, v14, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v13, v17, v12, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_min_f16 v11, v2, v10
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v17, v12, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v2, v10
+; GFX940-NEXT:    v_perm_b32 v3, v3, v13, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v17, v11, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_min_f16 v10, v1, v9
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v17, v11, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v1, v9
+; GFX940-NEXT:    v_perm_b32 v2, v2, v12, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v11, v17, v10, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    v_pk_min_f16 v9, v0, v8
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v17, v10, vcc
+; GFX940-NEXT:    v_cmp_o_f16_e32 vcc, v0, v8
+; GFX940-NEXT:    v_perm_b32 v1, v1, v11, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v10, v17, v9, vcc
+; GFX940-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX940-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v17, v9, vcc
+; GFX940-NEXT:    v_perm_b32 v0, v0, v10, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v16f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v16, v7, v15
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v7, v15
+; GFX10-NEXT:    v_pk_min_f16 v18, v6, v14
+; GFX10-NEXT:    v_pk_min_f16 v19, v3, v11
+; GFX10-NEXT:    v_pk_min_f16 v20, v2, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, 0x7e00, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
+; GFX10-NEXT:    v_pk_min_f16 v21, v0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0x7e00, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v14
+; GFX10-NEXT:    v_pk_min_f16 v17, v5, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
+; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v18, 0x7e00, v18, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v5, v13
+; GFX10-NEXT:    v_perm_b32 v6, v6, v18, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, 0x7e00, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_min_f16 v17, v4, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX10-NEXT:    v_perm_b32 v5, v5, v15, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0x7e00, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, 0x7e00, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_min_f16 v11, v1, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v11
+; GFX10-NEXT:    v_perm_b32 v3, v3, v19, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, 0x7e00, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0x7e00, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v8
+; GFX10-NEXT:    v_perm_b32 v1, v1, v11, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0x7e00, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v23, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v9, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v2, v2, v17, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v14, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v4, v4, v13, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v16f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v16, v7, v15
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v7, v15
+; GFX11-NEXT:    v_pk_min_f16 v15, v6, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-NEXT:    v_pk_min_f16 v20, v4, v12
+; GFX11-NEXT:    v_pk_min_f16 v22, v2, v10
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7e00, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v18, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 16, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7e00, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v15
+; GFX11-NEXT:    v_pk_min_f16 v14, v5, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v18, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, 0x7e00, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v5, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v6, v15, v6, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v18, v17
+; GFX11-NEXT:    v_pk_min_f16 v17, v3, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v20
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0x7e00, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v12
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 16, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0x7e00, v20, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v11
+; GFX11-NEXT:    v_perm_b32 v5, v13, v5, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v20, v19
+; GFX11-NEXT:    v_pk_min_f16 v19, v1, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v22
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, 0x7e00, v21, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v10
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v3, v11, v3, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, 0x7e00, v22, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_pk_min_f16 v22, v0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v21, 0x7e00, v19, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v8
+; GFX11-NEXT:    v_perm_b32 v1, v1, v21, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v22, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v24, v23
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7e00, v25, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v12
+; GFX11-NEXT:    v_perm_b32 v2, v2, v17, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v18, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v4, v4, v14, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v16f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v8
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v9
+; GFX12-NEXT:    v_pk_minimum_f16 v2, v2, v10
+; GFX12-NEXT:    v_pk_minimum_f16 v3, v3, v11
+; GFX12-NEXT:    v_pk_minimum_f16 v4, v4, v12
+; GFX12-NEXT:    v_pk_minimum_f16 v5, v5, v13
+; GFX12-NEXT:    v_pk_minimum_f16 v6, v6, v14
+; GFX12-NEXT:    v_pk_minimum_f16 v7, v7, v15
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <16 x half> @llvm.minimum.v16f16(<16 x half> %src0, <16 x half> %src1)
+  ret <16 x half> %op
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
new file mode 100644
index 000000000000..1da2647fbd60
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
@@ -0,0 +1,4344 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+define float @v_minimum_f32(float %src0, float %src1) {
+; GFX7-LABEL: v_minimum_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call float @llvm.minimum.f32(float %src0, float %src1)
+  ret float %op
+}
+
+define float @v_minimum_f32__nnan(float %src0, float %src1) {
+; GFX7-LABEL: v_minimum_f32__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_f32__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f32__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f32__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f32__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f32__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan float @llvm.minimum.f32(float %src0, float %src1)
+  ret float %op
+}
+
+define float @v_minimum_f32__nsz(float %src0, float %src1) {
+; GFX7-LABEL: v_minimum_f32__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_f32__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f32__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f32__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f32__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f32__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f32__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz float @llvm.minimum.f32(float %src0, float %src1)
+  ret float %op
+}
+
+define float @v_minimum_f32__nnan_nsz(float %src0, float %src1) {
+; GFX7-LABEL: v_minimum_f32__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_f32__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f32__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f32__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f32__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f32__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz float @llvm.minimum.f32(float %src0, float %src1)
+  ret float %op
+}
+
+define float @v_minimum_f32__nnan_src0(float %arg0, float %src1) {
+; GFX7-LABEL: v_minimum_f32__nnan_src0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_f32__nnan_src0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f32__nnan_src0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f32__nnan_src0:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX940-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f32__nnan_src0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX10-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f32__nnan_src0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f32__nnan_src0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %src0 = fadd nnan float %arg0, 1.0
+  %op = call float @llvm.minimum.f32(float %src0, float %src1)
+  ret float %op
+}
+
+define float @v_minimum_f32__nnan_src1(float %src0, float %arg1) {
+; GFX7-LABEL: v_minimum_f32__nnan_src1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_f32__nnan_src1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f32__nnan_src1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f32__nnan_src1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX940-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f32__nnan_src1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX10-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f32__nnan_src1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f32__nnan_src1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %src1 = fadd nnan float %arg1, 1.0
+  %op = call float @llvm.minimum.f32(float %src0, float %src1)
+  ret float %op
+}
+
+define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
+; GFX7-LABEL: s_minimum_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_min_f32_e32 v1, s4, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, s4, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use v0
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_minimum_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NEXT:    v_min_f32_e32 v1, s4, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use v0
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_minimum_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_min_f32_e32 v1, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_minimum_f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_min_f32_e32 v1, s0, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_minimum_f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f32_e64 v0, s4, s5
+; GFX10-NEXT:    v_cmp_o_f32_e64 vcc_lo, s4, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_minimum_f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f32_e64 v0, s0, s1
+; GFX11-NEXT:    v_cmp_o_f32_e64 vcc_lo, s0, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_minimum_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_minimum_f32 s0, s0, s1
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s0
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call float @llvm.minimum.f32(float %src0, float %src1)
+  call void asm sideeffect "; use $0", "s"(float %op)
+  ret void
+}
+
+define <2 x float> @v_minimum_v2f32(<2 x float> %src0, <2 x float> %src1) {
+; GFX7-LABEL: v_minimum_v2f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v4, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v2, v1, v3
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v2f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v2f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v2f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v2f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7fc00000, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v2f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x7fc00000, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v2f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v2
+; GFX12-NEXT:    v_minimum_f32 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
+  ret <2 x float> %op
+}
+
+define <2 x float> @v_minimum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1) {
+; GFX7-LABEL: v_minimum_v2f32__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v4, v0, v2
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v2, v1, v3
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v2f32__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v2f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v2f32__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v2f32__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v2f32__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v2f32__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v2
+; GFX12-NEXT:    v_minimum_f32 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
+  ret <2 x float> %op
+}
+
+define <2 x float> @v_minimum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) {
+; GFX7-LABEL: v_minimum_v2f32__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v4, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v2, v1, v3
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v2f32__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v2f32__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v2f32__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v2f32__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v5, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v2f32__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v5, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v2f32__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v2
+; GFX12-NEXT:    v_minimum_f32 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
+  ret <2 x float> %op
+}
+
+define <2 x float> @v_minimum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %src1) {
+; GFX7-LABEL: v_minimum_v2f32__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v2f32__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v2f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v2f32__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v2f32__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v2f32__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v2f32__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v2
+; GFX12-NEXT:    v_minimum_f32 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
+  ret <2 x float> %op
+}
+
+define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
+; GFX7-LABEL: s_minimum_v2f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s7
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, s5, v0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, s5, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v3, s5
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, s5, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, s7, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_min_legacy_f32_e32 v3, s4, v0
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, s4, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v3, s4
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, s4, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, s6, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use v[0:1]
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_minimum_v2f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, s5, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, s5, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, s5, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, s7, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, s4, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, s6, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use v[0:1]
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_minimum_v2f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, s5, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s5, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, s5, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, s7, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, s4, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, s6, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v[0:1]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_minimum_v2f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, s3
+; GFX940-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, s1, v0
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s1, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, s1, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, s3, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
+; GFX940-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, s0, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, s2, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v[0:1]
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_minimum_v2f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-NEXT:    v_cmp_lt_f32_e64 vcc_lo, s5, s7
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-NEXT:    v_cmp_class_f32_e64 s8, s5, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s7, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e64 vcc_lo, s4, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, s6, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e64 vcc_lo, s5, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e64 vcc_lo, s4, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s8
+; GFX10-NEXT:    v_cmp_class_f32_e64 s5, s4, 32
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v2, s4, s5
+; GFX10-NEXT:    v_cmp_class_f32_e64 s4, s7, 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s7, s4
+; GFX10-NEXT:    v_cmp_class_f32_e64 s4, s6, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s6, s4
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v[0:1]
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_minimum_v2f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    v_cmp_lt_f32_e64 vcc_lo, s1, s3
+; GFX11-NEXT:    v_cmp_class_f32_e64 s4, s1, 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, s3, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e64 vcc_lo, s0, s2
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, s2, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e64 vcc_lo, s1, s3
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e64 vcc_lo, s0, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s4
+; GFX11-NEXT:    v_cmp_class_f32_e64 s1, s0, 32
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v2, s0, s1
+; GFX11-NEXT:    v_cmp_class_f32_e64 s0, s3, 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
+; GFX11-NEXT:    v_cmp_class_f32_e64 s0, s2, 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s2, s0
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v[0:1]
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_minimum_v2f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_minimum_f32 s1, s1, s3
+; GFX12-NEXT:    s_minimum_f32 s0, s0, s2
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use s[0:1]
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
+  call void asm sideeffect "; use $0", "s"(<2 x float> %op)
+  ret void
+}
+
+define <3 x float> @v_minimum_v3f32(<3 x float> %src0, <3 x float> %src1) {
+; GFX7-LABEL: v_minimum_v3f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v6, v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v3, v1, v4
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v3, v2, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v3f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v3f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v3f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v3f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7fc00000, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0x7fc00000, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v3f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v4, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v5, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7fc00000, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7fc00000, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v3f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v3
+; GFX12-NEXT:    v_minimum_f32 v1, v1, v4
+; GFX12-NEXT:    v_minimum_f32 v2, v2, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <3 x float> @llvm.minimum.v3f32(<3 x float> %src0, <3 x float> %src1)
+  ret <3 x float> %op
+}
+
+define <3 x float> @v_minimum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1) {
+; GFX7-LABEL: v_minimum_v3f32__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v6, v0, v3
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v3, v1, v4
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v3, v2, v5
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v3f32__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v3f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v3f32__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v3f32__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v3f32__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v4, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v5, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v3f32__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v3
+; GFX12-NEXT:    v_minimum_f32 v1, v1, v4
+; GFX12-NEXT:    v_minimum_f32 v2, v2, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <3 x float> @llvm.minimum.v3f32(<3 x float> %src0, <3 x float> %src1)
+  ret <3 x float> %op
+}
+
+define <3 x float> @v_minimum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) {
+; GFX7-LABEL: v_minimum_v3f32__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v6, v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v3, v1, v4
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v3, v2, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v3f32__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v3f32__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v3f32__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v3f32__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v3f32__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v4, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v5, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v3f32__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v3
+; GFX12-NEXT:    v_minimum_f32 v1, v1, v4
+; GFX12-NEXT:    v_minimum_f32 v2, v2, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <3 x float> @llvm.minimum.v3f32(<3 x float> %src0, <3 x float> %src1)
+  ret <3 x float> %op
+}
+
+define <3 x float> @v_minimum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %src1) {
+; GFX7-LABEL: v_minimum_v3f32__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v1, v4
+; GFX7-NEXT:    v_min_legacy_f32_e32 v2, v2, v5
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v3f32__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v3f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v3f32__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v3f32__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v3f32__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v3f32__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v3
+; GFX12-NEXT:    v_minimum_f32 v1, v1, v4
+; GFX12-NEXT:    v_minimum_f32 v2, v2, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <3 x float> @llvm.minimum.v3f32(<3 x float> %src0, <3 x float> %src1)
+  ret <3 x float> %op
+}
+
+define <4 x float> @v_minimum_v4f32(<4 x float> %src0, <4 x float> %src1) {
+; GFX7-LABEL: v_minimum_v4f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v8, v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v4, v1, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v4, v2, v6
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v4, v3, v7
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v4f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v4f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v4f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX940-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v4f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v4f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v6, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v4f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v4
+; GFX12-NEXT:    v_minimum_f32 v1, v1, v5
+; GFX12-NEXT:    v_minimum_f32 v2, v2, v6
+; GFX12-NEXT:    v_minimum_f32 v3, v3, v7
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <4 x float> @llvm.minimum.v4f32(<4 x float> %src0, <4 x float> %src1)
+  ret <4 x float> %op
+}
+
+define <4 x float> @v_minimum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1) {
+; GFX7-LABEL: v_minimum_v4f32__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v8, v0, v4
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v4, v1, v5
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v4, v2, v6
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v4, v3, v7
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v4f32__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v4f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v4f32__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v4f32__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v7, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v4f32__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v6, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v7, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v4f32__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v4
+; GFX12-NEXT:    v_minimum_f32 v1, v1, v5
+; GFX12-NEXT:    v_minimum_f32 v2, v2, v6
+; GFX12-NEXT:    v_minimum_f32 v3, v3, v7
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <4 x float> @llvm.minimum.v4f32(<4 x float> %src0, <4 x float> %src1)
+  ret <4 x float> %op
+}
+
+define <4 x float> @v_minimum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) {
+; GFX7-LABEL: v_minimum_v4f32__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v8, v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v4, v1, v5
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v4, v2, v6
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v4, v3, v7
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v4f32__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v4f32__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v4f32__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX940-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v4f32__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v7, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc00000, v9, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v4f32__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v6, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v7, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc00000, v9, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v4f32__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v4
+; GFX12-NEXT:    v_minimum_f32 v1, v1, v5
+; GFX12-NEXT:    v_minimum_f32 v2, v2, v6
+; GFX12-NEXT:    v_minimum_f32 v3, v3, v7
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <4 x float> @llvm.minimum.v4f32(<4 x float> %src0, <4 x float> %src1)
+  ret <4 x float> %op
+}
+
+define <4 x float> @v_minimum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %src1) {
+; GFX7-LABEL: v_minimum_v4f32__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_min_legacy_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_min_legacy_f32_e32 v3, v3, v7
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v4f32__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v4f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v4f32__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v4f32__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v4f32__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v4f32__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v4
+; GFX12-NEXT:    v_minimum_f32 v1, v1, v5
+; GFX12-NEXT:    v_minimum_f32 v2, v2, v6
+; GFX12-NEXT:    v_minimum_f32 v3, v3, v7
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <4 x float> @llvm.minimum.v4f32(<4 x float> %src0, <4 x float> %src1)
+  ret <4 x float> %op
+}
+
+define <8 x float> @v_minimum_v8f32(<8 x float> %src0, <8 x float> %src1) {
+; GFX7-LABEL: v_minimum_v8f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v0, v8
+; GFX7-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v8, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v8, v1, v9
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v9, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v8, v2, v10
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v10, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v8, v3, v11
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v11, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v8, v4, v12
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v12, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v8, v5, v13
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v13, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v8, v6, v14
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v14, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v8, v7, v15
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v15, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v8f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v8, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v9, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v10, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v11, v3, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v11, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v12, v4, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v12, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v13, v5, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v13, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v14, v6, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v14, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v15, v7, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v15, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v8f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v8, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v9, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v10, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v11, v3, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v11, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v12, v4, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v12, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v13, v5, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v13, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v14, v6, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v14, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v15, v7, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v15, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v8f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v8
+; GFX940-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v8, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v9
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v9, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v9, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v10
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v10, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v11
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v11, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v11, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v12
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v12, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v12, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v13
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v13, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v13, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v14
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v14, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v14, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v15
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v15, v7, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v15, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v8f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, 0x7fc00000, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v8, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v9, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v11, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v12, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v10, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v11, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v12, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v13, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v14, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v15, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0x7fc00000, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v13, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v14, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v15, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v8f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, v9, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, 0x7fc00000, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v8, 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v9, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v10
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v11
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v11, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, v12, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v10
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v11
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v10, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v11, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v12, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v13
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v13, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v14
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v14, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v15
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v15, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v13
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v14
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v15
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, 0x7fc00000, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v13, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v14, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v15, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v8f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v8
+; GFX12-NEXT:    v_minimum_f32 v1, v1, v9
+; GFX12-NEXT:    v_minimum_f32 v2, v2, v10
+; GFX12-NEXT:    v_minimum_f32 v3, v3, v11
+; GFX12-NEXT:    v_minimum_f32 v4, v4, v12
+; GFX12-NEXT:    v_minimum_f32 v5, v5, v13
+; GFX12-NEXT:    v_minimum_f32 v6, v6, v14
+; GFX12-NEXT:    v_minimum_f32 v7, v7, v15
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <8 x float> @llvm.minimum.v8f32(<8 x float> %src0, <8 x float> %src1)
+  ret <8 x float> %op
+}
+
+define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
+; GFX7-LABEL: v_minimum_v16f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v32, v0, v16
+; GFX7-NEXT:    v_mov_b32_e32 v31, 0x7fc00000
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v32, v31, v32, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v16, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v1, v17
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v17, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v2, v18
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v2, v18
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v18, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v3, v19
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v3, v19
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v19, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v4, v20
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v4, v20
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v20, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v5, v21
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v5, v21
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v21, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v6, v22
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v6, v22
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v22, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v7, v23
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v7, v23
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v23, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v8, v24
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v8, v24
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v8, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v24, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v9, v25
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v9, v25
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v9, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v25, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v10, v26
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v10, v26
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v10, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v26, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v11, v27
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v11, v27
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v11, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v27, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v12, v28
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v12, v28
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v12, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v28, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v13, v29
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v13, v29
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v13, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v29, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v14, v30
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v14, v30
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v14, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v30, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v16, v15, v17
+; GFX7-NEXT:    v_cmp_o_f32_e32 vcc, v15, v17
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v15, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e64 vcc, v17, 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v15, v15, v17, vcc
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX7-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v16f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v16, v0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v31, 0x7fc00000
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v31, v32, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v16, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v1, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v17, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v2, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v18, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v19, v3, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v3, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v19, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v20, v4, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v4, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v20, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v21, v5, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v5, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v21, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v22, v6, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v6, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v22, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v23, v7, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v7, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v23, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v24, v8, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v8, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v8, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v24, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v25
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v25, v9, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v9, v25
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v9, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v25, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v26
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v26, v10, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v10, v26
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v10, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v26, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v27
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v27, v11, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v11, v27
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v11, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v27, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v28
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v28, v12, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v12, v28
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v12, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v28, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v29
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v29, v13, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v13, v29
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v13, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v29, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v14, v30
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v30, v14, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v14, v30
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v14, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v30, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v15, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
+; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v15, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v15, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e64 vcc, v17, 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v17, vcc
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v16f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v16, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v31, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v31, v32, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v16, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v17, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v18, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v19
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v19, v3, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v3, v19
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v19, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v20
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v20, v4, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v20
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v20, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v21
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v21, v5, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v21
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v21, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v22
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v22, v6, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v22
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v22, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v23
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v23, v7, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v23
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v23, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v24
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v24, v8, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v24
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v8, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v24, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v25
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v25, v9, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v9, v25
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v9, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v25, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v26
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v26, v10, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v10, v26
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v10, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v26, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v27
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v27, v11, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v11, v27
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v11, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v27, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v28
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v28, v12, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v12, v28
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v12, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v28, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v29
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v29, v13, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v13, v29
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v13, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v29, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v14, v30
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v30, v14, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v14, v30
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v14, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v30, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v15, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v15, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v15, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v31, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v15, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v17, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v17, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v16f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    scratch_load_dword v31, off, s32
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v16
+; GFX940-NEXT:    v_mov_b32_e32 v32, 0x7fc00000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v33, v16, v0, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v33, v32, v33, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v16, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v33
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v17
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v17, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v17, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v18
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v18
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v18, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v3, v19
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v19, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v3, v19
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v19, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v20
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v20, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v4, v20
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v20, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v21
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v21, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v5, v21
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v21, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v22
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v22, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v6, v22
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v22, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v23
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v23, v7, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v7, v23
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v23, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v24
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v24, v8, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v8, v24
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v8, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v24, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v9, v25
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v25, v9, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v9, v25
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v9, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v25, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v10, v26
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v26, v10, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v10, v26
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v10, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v26, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v11, v27
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v27, v11, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v11, v27
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v11, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v27, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v12, v28
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v28, v12, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v12, v28
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v12, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v28, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v13, v29
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v29, v13, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v13, v29
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v13, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v29, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v14, v30
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v30, v14, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v14, v30
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v14, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v30, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f32_e32 vcc, v15, v31
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v31, v15, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v15, v31
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v15, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX940-NEXT:    v_cmp_class_f32_e64 vcc, v31, 32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v15, v15, v31, vcc
+; GFX940-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v16f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v16
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    v_cndmask_b32_e32 v32, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v33, v17, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v34, v18, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v35, v19, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v36, v20, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v21
+; GFX10-NEXT:    v_cndmask_b32_e32 v37, v21, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v22
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, v22, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v23
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, v23, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v48, v24, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v25
+; GFX10-NEXT:    v_cndmask_b32_e32 v49, v25, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v26
+; GFX10-NEXT:    v_cndmask_b32_e32 v50, v26, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v27
+; GFX10-NEXT:    v_cndmask_b32_e32 v51, v27, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v28
+; GFX10-NEXT:    v_cndmask_b32_e32 v52, v28, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v29
+; GFX10-NEXT:    v_cndmask_b32_e32 v53, v29, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v30
+; GFX10-NEXT:    v_cndmask_b32_e32 v54, v30, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v32, 0x7fc00000, v32, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v33, 0x7fc00000, v33, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v34, 0x7fc00000, v34, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v35, 0x7fc00000, v35, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v36, 0x7fc00000, v36, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v21
+; GFX10-NEXT:    v_cndmask_b32_e32 v37, 0x7fc00000, v37, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v22
+; GFX10-NEXT:    v_cndmask_b32_e32 v38, 0x7fc00000, v38, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v23
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, 0x7fc00000, v39, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v48, 0x7fc00000, v48, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v25
+; GFX10-NEXT:    v_cndmask_b32_e32 v49, 0x7fc00000, v49, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v26
+; GFX10-NEXT:    v_cndmask_b32_e32 v50, 0x7fc00000, v50, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v27
+; GFX10-NEXT:    v_cndmask_b32_e32 v51, 0x7fc00000, v51, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v28
+; GFX10-NEXT:    v_cndmask_b32_e32 v52, 0x7fc00000, v52, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v13, v29
+; GFX10-NEXT:    v_cndmask_b32_e32 v53, 0x7fc00000, v53, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v30
+; GFX10-NEXT:    v_cndmask_b32_e32 v54, 0x7fc00000, v54, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v33, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v34, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v35, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v36, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v37, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v38, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v39, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v8, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v48, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v9, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v49, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v10, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v11, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v12, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v52, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v13, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v53, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v14, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v54, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v16, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v17, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v18, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v19, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v20, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v21, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v22, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v23, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v24, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v25, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v26, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v27, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v28, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v29, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v30, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v32
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v33
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v33, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v34
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v34, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v35
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v35, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v36, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v37
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v37, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v38
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v38, v6, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v31
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v31, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v39, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v48, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v31
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v49
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v49, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v15, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v51
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v52, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v31, 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v15, v31, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v53, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v54
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v54, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v16f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v16
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    v_cndmask_b32_e32 v32, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v1, v17
+; GFX11-NEXT:    v_cndmask_b32_e32 v33, v17, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v34, v18, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v19
+; GFX11-NEXT:    v_cndmask_b32_e32 v35, v19, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v4, v20
+; GFX11-NEXT:    v_cndmask_b32_e32 v36, v20, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v5, v21
+; GFX11-NEXT:    v_cndmask_b32_e32 v37, v21, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v6, v22
+; GFX11-NEXT:    v_cndmask_b32_e32 v38, v22, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v7, v23
+; GFX11-NEXT:    v_cndmask_b32_e32 v39, v23, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v8, v24
+; GFX11-NEXT:    v_cndmask_b32_e32 v48, v24, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v9, v25
+; GFX11-NEXT:    v_cndmask_b32_e32 v49, v25, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v10, v26
+; GFX11-NEXT:    v_cndmask_b32_e32 v50, v26, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v11, v27
+; GFX11-NEXT:    v_cndmask_b32_e32 v51, v27, v11, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v12, v28
+; GFX11-NEXT:    v_cndmask_b32_e32 v52, v28, v12, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v13, v29
+; GFX11-NEXT:    v_cndmask_b32_e32 v53, v29, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v14, v30
+; GFX11-NEXT:    v_cndmask_b32_e32 v54, v30, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v16
+; GFX11-NEXT:    v_cndmask_b32_e32 v32, 0x7fc00000, v32, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v1, v17
+; GFX11-NEXT:    v_cndmask_b32_e32 v33, 0x7fc00000, v33, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v2, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v34, 0x7fc00000, v34, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v3, v19
+; GFX11-NEXT:    v_cndmask_b32_e32 v35, 0x7fc00000, v35, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v4, v20
+; GFX11-NEXT:    v_cndmask_b32_e32 v36, 0x7fc00000, v36, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v5, v21
+; GFX11-NEXT:    v_cndmask_b32_e32 v37, 0x7fc00000, v37, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v6, v22
+; GFX11-NEXT:    v_cndmask_b32_e32 v38, 0x7fc00000, v38, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v7, v23
+; GFX11-NEXT:    v_cndmask_b32_e32 v39, 0x7fc00000, v39, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v8, v24
+; GFX11-NEXT:    v_cndmask_b32_e32 v48, 0x7fc00000, v48, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v9, v25
+; GFX11-NEXT:    v_cndmask_b32_e32 v49, 0x7fc00000, v49, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v10, v26
+; GFX11-NEXT:    v_cndmask_b32_e32 v50, 0x7fc00000, v50, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v11, v27
+; GFX11-NEXT:    v_cndmask_b32_e32 v51, 0x7fc00000, v51, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v12, v28
+; GFX11-NEXT:    v_cndmask_b32_e32 v52, 0x7fc00000, v52, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v13, v29
+; GFX11-NEXT:    v_cndmask_b32_e32 v53, 0x7fc00000, v53, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v14, v30
+; GFX11-NEXT:    v_cndmask_b32_e32 v54, 0x7fc00000, v54, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v1, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v33, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v2, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v34, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v3, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v35, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v4, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v36, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v5, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v37, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v6, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v38, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v7, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v39, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v8, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v48, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v9, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v49, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v10, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v11, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v12, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v52, v12, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v13, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, v53, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v14, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, v54, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v16, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v17, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v18, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v19, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v20, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v21, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v22, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v23, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v7, v23, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v24, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v8, v24, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v25, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v9, v25, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v26, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v27, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v28, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v29, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, v13, v29, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v30, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v32
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v33
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v33, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v34
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v34, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v35
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v35, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v36, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v37
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v37, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v38
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v38, v6, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v15, v31
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, v31, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v39
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v39, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v48
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v48, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f32_e32 vcc_lo, v15, v31
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v49
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v49, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v15, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v51
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v52, v12, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v31, 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, v15, v31, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, v53, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v54
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, v54, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v16f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v31, off, s32
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v16
+; GFX12-NEXT:    v_minimum_f32 v1, v1, v17
+; GFX12-NEXT:    v_minimum_f32 v2, v2, v18
+; GFX12-NEXT:    v_minimum_f32 v3, v3, v19
+; GFX12-NEXT:    v_minimum_f32 v4, v4, v20
+; GFX12-NEXT:    v_minimum_f32 v5, v5, v21
+; GFX12-NEXT:    v_minimum_f32 v6, v6, v22
+; GFX12-NEXT:    v_minimum_f32 v7, v7, v23
+; GFX12-NEXT:    v_minimum_f32 v8, v8, v24
+; GFX12-NEXT:    v_minimum_f32 v9, v9, v25
+; GFX12-NEXT:    v_minimum_f32 v10, v10, v26
+; GFX12-NEXT:    v_minimum_f32 v11, v11, v27
+; GFX12-NEXT:    v_minimum_f32 v12, v12, v28
+; GFX12-NEXT:    v_minimum_f32 v13, v13, v29
+; GFX12-NEXT:    v_minimum_f32 v14, v14, v30
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v15, v15, v31
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <16 x float> @llvm.minimum.v16f32(<16 x float> %src0, <16 x float> %src1)
+  ret <16 x float> %op
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
new file mode 100644
index 000000000000..7013c60bada5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -0,0 +1,6157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+define double @v_minimum_f64(double %src0, double %src1) {
+; GFX7-LABEL: v_minimum_f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX7-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX10-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX11-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call double @llvm.minimum.f64(double %src0, double %src1)
+  ret double %op
+}
+
+define double @v_minimum_f64__nnan(double %src0, double %src1) {
+; GFX7-LABEL: v_minimum_f64__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_f64__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f64__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f64__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f64__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f64__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f64__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan double @llvm.minimum.f64(double %src0, double %src1)
+  ret double %op
+}
+
+define double @v_minimum_f64__nsz(double %src0, double %src1) {
+; GFX7-LABEL: v_minimum_f64__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX7-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_f64__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f64__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f64__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f64__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX10-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f64__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX11-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f64__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz double @llvm.minimum.f64(double %src0, double %src1)
+  ret double %op
+}
+
+define double @v_minimum_f64__nnan_nsz(double %src0, double %src1) {
+; GFX7-LABEL: v_minimum_f64__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_f64__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f64__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f64__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f64__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f64__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f64__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz double @llvm.minimum.f64(double %src0, double %src1)
+  ret double %op
+}
+
+define double @v_minimum_f64__nnan_src0(double %arg0, double %src1) {
+; GFX7-LABEL: v_minimum_f64__nnan_src0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX7-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX7-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_f64__nnan_src0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX8-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f64__nnan_src0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f64__nnan_src0:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX940-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f64__nnan_src0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX10-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX10-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f64__nnan_src0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX11-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f64__nnan_src0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_f64_e32 v[0:1], 1.0, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %src0 = fadd nnan double %arg0, 1.0
+  %op = call double @llvm.minimum.f64(double %src0, double %src1)
+  ret double %op
+}
+
+define double @v_minimum_f64__nnan_src1(double %src0, double %arg1) {
+; GFX7-LABEL: v_minimum_f64__nnan_src1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX7-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX7-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_f64__nnan_src1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX8-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_f64__nnan_src1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_f64__nnan_src1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX940-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_f64__nnan_src1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX10-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX10-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_f64__nnan_src1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX11-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, 0x7ff80000, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_f64__nnan_src1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_add_f64_e32 v[2:3], 1.0, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %src1 = fadd nnan double %arg1, 1.0
+  %op = call double @llvm.minimum.f64(double %src0, double %src1)
+  ret double %op
+}
+
+define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
+; GFX7-LABEL: s_minimum_f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_min_f64 v[2:3], s[4:5], v[0:1]
+; GFX7-NEXT:    v_cmp_u_f64_e32 vcc, s[4:5], v[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use v[0:1]
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_minimum_f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_min_f64 v[2:3], s[4:5], v[0:1]
+; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use v[0:1]
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_minimum_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_min_f64 v[2:3], s[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v[0:1]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_minimum_f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NEXT:    v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX940-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v[0:1]
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_minimum_f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f64 v[0:1], s[4:5], s[6:7]
+; GFX10-NEXT:    v_cmp_u_f64_e64 s4, s[4:5], s[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v[0:1]
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_minimum_f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f64 v[0:1], s[0:1], s[2:3]
+; GFX11-NEXT:    v_cmp_u_f64_e64 s0, s[0:1], s[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v[0:1]
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_minimum_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use v[0:1]
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call double @llvm.minimum.f64(double %src0, double %src1)
+  call void asm sideeffect "; use $0", "s"(double %op)
+  ret void
+}
+
+define <2 x double> @v_minimum_v2f64(<2 x double> %src0, <2 x double> %src1) {
+; GFX7-LABEL: v_minimum_v2f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[0:1], 32
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[8:9], v[4:5], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v5, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v10, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[8:9]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v7, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v10, v11, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 32
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v2f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[0:1], 32
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[8:9], v[4:5], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v5, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v10, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[8:9]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v7, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v10, v11, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 32
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v2f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[0:1], 32
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[8:9], v[4:5], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v5, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v10, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[8:9]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v7, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v10, v11, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 32
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v2f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX940-NEXT:    v_mov_b32_e32 v10, 0x7ff80000
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[4:5]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v5, v1, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v10, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[4:5], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v10, v4, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[6:7], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[4:5]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v2f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s4, v[2:3], v[6:7]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s5, v[0:1], v[4:5]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[2:3], v[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v7, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v6, v2, s4
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[2:3], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0x7ff80000, v8, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v10, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, v12, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v13, s6
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[4:5], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s6, v[6:7], 32
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s7, 0, v[8:9]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s8, 0, v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v2f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s1, v[0:1], v[4:5]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s0, v[2:3], v[6:7]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s2, v[2:3], v[6:7]
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v5, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v7, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v6, v2, s0
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[2:3], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0x7ff80000, v8, s1
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, 0x7ff80000, v10, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, 0, v13, s2
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[6:7], 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s4, 0, v[10:11]
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, v12, s1
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[4:5], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s3, 0, v[8:9]
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v2f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
+  ret <2 x double> %op
+}
+
+define <2 x double> @v_minimum_v2f64__nnan(<2 x double> %src0, <2 x double> %src1) {
+; GFX7-LABEL: v_minimum_v2f64__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], 32
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[10:11], v[6:7], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v9, v5, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v7, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, v6, v2, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], 32
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[8:9]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v2f64__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], 32
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[10:11], v[6:7], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v5, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v7, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v6, v2, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], 32
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[8:9]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v2f64__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], 32
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[10:11], v[6:7], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v5, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v7, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v6, v2, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], 32
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[8:9]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v2f64__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[4:5], 32
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v5, v1, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v7, v3, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[6:7], 32
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[4:5]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[0:1]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v2f64__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s4, v[2:3], v[6:7]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[4:5], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s6, v[6:7], 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v7, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v6, v2, s4
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[2:3], 32
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s7, 0, v[8:9]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s8, 0, v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v2f64__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s0, v[2:3], v[6:7]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[4:5], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[6:7], 32
+; GFX11-NEXT:    v_dual_cndmask_b32 v9, v5, v1 :: v_dual_cndmask_b32 v8, v4, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v7, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v6, v2, s0
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[2:3], 32
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s3, 0, v[8:9]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s4, 0, v[10:11]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v2f64__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
+  ret <2 x double> %op
+}
+
+define <2 x double> @v_minimum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1) {
+; GFX7-LABEL: v_minimum_v2f64__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[2:3], v[6:7]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[2:3], v[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v6, v2, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[8:9]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v2f64__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[2:3], v[6:7]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[2:3], v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v6, v2, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[8:9]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v2f64__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[2:3], v[6:7]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[2:3], v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v2, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[8:9]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v2f64__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[4:5]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, 0, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, 0, v5, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v2f64__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s4, v[2:3], v[6:7]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s5, v[0:1], v[4:5]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[2:3], v[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v6, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, v8, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v9, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x7ff80000, v1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0x7ff80000, v3, s6
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v2f64__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s0, v[2:3], v[6:7]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s1, v[0:1], v[4:5]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s2, v[2:3], v[6:7]
+; GFX11-NEXT:    v_dual_cndmask_b32 v8, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v6, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, v8, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0x7ff80000, v1, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, v9, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0x7ff80000, v3, s2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v2f64__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
+  ret <2 x double> %op
+}
+
+define <2 x double> @v_minimum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double> %src1) {
+; GFX7-LABEL: v_minimum_v2f64__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v2f64__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v2f64__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v2f64__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v2f64__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s4, v[2:3], v[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v2f64__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s0, v[2:3], v[6:7]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v2f64__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
+  ret <2 x double> %op
+}
+
+define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) {
+; GFX7-LABEL: s_minimum_v2f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s10
+; GFX7-NEXT:    v_mov_b32_e32 v1, s11
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, s[6:7], v[0:1]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[12:13], s[6:7], v[0:1]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[18:19], s[10:11], 32
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7-NEXT:    s_and_b64 s[14:15], vcc, exec
+; GFX7-NEXT:    s_cselect_b32 s16, s7, s11
+; GFX7-NEXT:    s_and_b64 s[14:15], s[12:13], exec
+; GFX7-NEXT:    s_cselect_b32 s15, s16, 0x7ff80000
+; GFX7-NEXT:    s_and_b64 s[16:17], vcc, exec
+; GFX7-NEXT:    s_cselect_b32 s14, s6, s10
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[16:17], s[6:7], 32
+; GFX7-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX7-NEXT:    s_cselect_b32 s14, s14, 0
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[20:21], s[14:15], 0
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX7-NEXT:    s_and_b64 s[12:13], s[16:17], exec
+; GFX7-NEXT:    s_cselect_b32 s7, s7, s15
+; GFX7-NEXT:    s_and_b64 s[12:13], s[18:19], exec
+; GFX7-NEXT:    s_cselect_b32 s7, s11, s7
+; GFX7-NEXT:    s_and_b64 s[12:13], s[20:21], exec
+; GFX7-NEXT:    s_cselect_b32 s7, s7, s15
+; GFX7-NEXT:    s_and_b64 s[12:13], s[16:17], exec
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[12:13], s[4:5], v[0:1]
+; GFX7-NEXT:    s_cselect_b32 s6, s6, s14
+; GFX7-NEXT:    s_and_b64 s[16:17], s[18:19], exec
+; GFX7-NEXT:    s_cselect_b32 s6, s10, s6
+; GFX7-NEXT:    s_and_b64 s[10:11], s[20:21], exec
+; GFX7-NEXT:    s_cselect_b32 s6, s6, s14
+; GFX7-NEXT:    s_and_b64 s[10:11], vcc, exec
+; GFX7-NEXT:    s_cselect_b32 s14, s5, s9
+; GFX7-NEXT:    s_and_b64 s[10:11], s[12:13], exec
+; GFX7-NEXT:    s_cselect_b32 s11, s14, 0x7ff80000
+; GFX7-NEXT:    s_and_b64 s[14:15], vcc, exec
+; GFX7-NEXT:    s_cselect_b32 s10, s4, s8
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[14:15], s[4:5], 32
+; GFX7-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[12:13], s[8:9], 32
+; GFX7-NEXT:    s_cselect_b32 s10, s10, 0
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[16:17], s[10:11], 0
+; GFX7-NEXT:    s_and_b64 s[18:19], s[14:15], exec
+; GFX7-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX7-NEXT:    s_and_b64 s[18:19], s[12:13], exec
+; GFX7-NEXT:    s_cselect_b32 s5, s9, s5
+; GFX7-NEXT:    s_and_b64 s[18:19], s[16:17], exec
+; GFX7-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX7-NEXT:    s_and_b64 s[14:15], s[14:15], exec
+; GFX7-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX7-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX7-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX7-NEXT:    s_and_b64 s[8:9], s[16:17], exec
+; GFX7-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use s[4:7]
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_minimum_v2f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, s[6:7], v[0:1]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[12:13], s[6:7], v[0:1]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[18:19], s[10:11], 32
+; GFX8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    s_and_b64 s[14:15], vcc, exec
+; GFX8-NEXT:    s_cselect_b32 s16, s7, s11
+; GFX8-NEXT:    s_and_b64 s[14:15], s[12:13], exec
+; GFX8-NEXT:    s_cselect_b32 s15, s16, 0x7ff80000
+; GFX8-NEXT:    s_and_b64 s[16:17], vcc, exec
+; GFX8-NEXT:    s_cselect_b32 s14, s6, s10
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[16:17], s[6:7], 32
+; GFX8-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX8-NEXT:    s_cselect_b32 s14, s14, 0
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[20:21], s[14:15], 0
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT:    s_and_b64 s[12:13], s[16:17], exec
+; GFX8-NEXT:    s_cselect_b32 s7, s7, s15
+; GFX8-NEXT:    s_and_b64 s[12:13], s[18:19], exec
+; GFX8-NEXT:    s_cselect_b32 s7, s11, s7
+; GFX8-NEXT:    s_and_b64 s[12:13], s[20:21], exec
+; GFX8-NEXT:    s_cselect_b32 s7, s7, s15
+; GFX8-NEXT:    s_and_b64 s[12:13], s[16:17], exec
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[12:13], s[4:5], v[0:1]
+; GFX8-NEXT:    s_cselect_b32 s6, s6, s14
+; GFX8-NEXT:    s_and_b64 s[16:17], s[18:19], exec
+; GFX8-NEXT:    s_cselect_b32 s6, s10, s6
+; GFX8-NEXT:    s_and_b64 s[10:11], s[20:21], exec
+; GFX8-NEXT:    s_cselect_b32 s6, s6, s14
+; GFX8-NEXT:    s_and_b64 s[10:11], vcc, exec
+; GFX8-NEXT:    s_cselect_b32 s14, s5, s9
+; GFX8-NEXT:    s_and_b64 s[10:11], s[12:13], exec
+; GFX8-NEXT:    s_cselect_b32 s11, s14, 0x7ff80000
+; GFX8-NEXT:    s_and_b64 s[14:15], vcc, exec
+; GFX8-NEXT:    s_cselect_b32 s10, s4, s8
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[14:15], s[4:5], 32
+; GFX8-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[12:13], s[8:9], 32
+; GFX8-NEXT:    s_cselect_b32 s10, s10, 0
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[16:17], s[10:11], 0
+; GFX8-NEXT:    s_and_b64 s[18:19], s[14:15], exec
+; GFX8-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX8-NEXT:    s_and_b64 s[18:19], s[12:13], exec
+; GFX8-NEXT:    s_cselect_b32 s5, s9, s5
+; GFX8-NEXT:    s_and_b64 s[18:19], s[16:17], exec
+; GFX8-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX8-NEXT:    s_and_b64 s[14:15], s[14:15], exec
+; GFX8-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX8-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX8-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX8-NEXT:    s_and_b64 s[8:9], s[16:17], exec
+; GFX8-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; use s[4:7]
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_minimum_v2f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s10
+; GFX9-NEXT:    v_mov_b32_e32 v1, s11
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, s[6:7], v[0:1]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[12:13], s[6:7], v[0:1]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[18:19], s[10:11], 32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    s_and_b64 s[14:15], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s16, s7, s11
+; GFX9-NEXT:    s_and_b64 s[14:15], s[12:13], exec
+; GFX9-NEXT:    s_cselect_b32 s15, s16, 0x7ff80000
+; GFX9-NEXT:    s_and_b64 s[16:17], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s14, s6, s10
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[16:17], s[6:7], 32
+; GFX9-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX9-NEXT:    s_cselect_b32 s14, s14, 0
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[20:21], s[14:15], 0
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT:    s_and_b64 s[12:13], s[16:17], exec
+; GFX9-NEXT:    s_cselect_b32 s7, s7, s15
+; GFX9-NEXT:    s_and_b64 s[12:13], s[18:19], exec
+; GFX9-NEXT:    s_cselect_b32 s7, s11, s7
+; GFX9-NEXT:    s_and_b64 s[12:13], s[20:21], exec
+; GFX9-NEXT:    s_cselect_b32 s7, s7, s15
+; GFX9-NEXT:    s_and_b64 s[12:13], s[16:17], exec
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[12:13], s[4:5], v[0:1]
+; GFX9-NEXT:    s_cselect_b32 s6, s6, s14
+; GFX9-NEXT:    s_and_b64 s[16:17], s[18:19], exec
+; GFX9-NEXT:    s_cselect_b32 s6, s10, s6
+; GFX9-NEXT:    s_and_b64 s[10:11], s[20:21], exec
+; GFX9-NEXT:    s_cselect_b32 s6, s6, s14
+; GFX9-NEXT:    s_and_b64 s[10:11], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s14, s5, s9
+; GFX9-NEXT:    s_and_b64 s[10:11], s[12:13], exec
+; GFX9-NEXT:    s_cselect_b32 s11, s14, 0x7ff80000
+; GFX9-NEXT:    s_and_b64 s[14:15], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s10, s4, s8
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[14:15], s[4:5], 32
+; GFX9-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[12:13], s[8:9], 32
+; GFX9-NEXT:    s_cselect_b32 s10, s10, 0
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[16:17], s[10:11], 0
+; GFX9-NEXT:    s_and_b64 s[18:19], s[14:15], exec
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX9-NEXT:    s_and_b64 s[18:19], s[12:13], exec
+; GFX9-NEXT:    s_cselect_b32 s5, s9, s5
+; GFX9-NEXT:    s_and_b64 s[18:19], s[16:17], exec
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX9-NEXT:    s_and_b64 s[14:15], s[14:15], exec
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX9-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX9-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX9-NEXT:    s_and_b64 s[8:9], s[16:17], exec
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s[4:7]
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_minimum_v2f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, s[2:3], v[0:1]
+; GFX940-NEXT:    s_and_b64 s[8:9], vcc, exec
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[8:9], s[2:3], v[0:1]
+; GFX940-NEXT:    s_cselect_b32 s12, s3, s7
+; GFX940-NEXT:    s_and_b64 s[10:11], s[8:9], exec
+; GFX940-NEXT:    s_cselect_b32 s11, s12, 0x7ff80000
+; GFX940-NEXT:    s_and_b64 s[12:13], vcc, exec
+; GFX940-NEXT:    s_cselect_b32 s10, s2, s6
+; GFX940-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[12:13], s[2:3], 32
+; GFX940-NEXT:    s_cselect_b32 s10, s10, 0
+; GFX940-NEXT:    s_and_b64 s[14:15], s[12:13], exec
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[14:15], s[6:7], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[8:9], s[10:11], 0
+; GFX940-NEXT:    s_cselect_b32 s3, s3, s11
+; GFX940-NEXT:    s_and_b64 s[16:17], s[14:15], exec
+; GFX940-NEXT:    s_cselect_b32 s3, s7, s3
+; GFX940-NEXT:    s_and_b64 s[16:17], s[8:9], exec
+; GFX940-NEXT:    s_cselect_b32 s7, s3, s11
+; GFX940-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX940-NEXT:    s_cselect_b32 s11, s2, s10
+; GFX940-NEXT:    s_and_b64 s[2:3], s[14:15], exec
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT:    s_cselect_b32 s6, s6, s11
+; GFX940-NEXT:    s_and_b64 s[2:3], s[8:9], exec
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX940-NEXT:    s_cselect_b32 s6, s6, s10
+; GFX940-NEXT:    s_and_b64 s[2:3], vcc, exec
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[2:3], s[0:1], v[0:1]
+; GFX940-NEXT:    s_cselect_b32 s10, s1, s5
+; GFX940-NEXT:    s_and_b64 s[8:9], s[2:3], exec
+; GFX940-NEXT:    s_cselect_b32 s9, s10, 0x7ff80000
+; GFX940-NEXT:    s_and_b64 s[10:11], vcc, exec
+; GFX940-NEXT:    s_cselect_b32 s8, s0, s4
+; GFX940-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[10:11], s[0:1], 32
+; GFX940-NEXT:    s_cselect_b32 s8, s8, 0
+; GFX940-NEXT:    s_and_b64 s[12:13], s[10:11], exec
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[12:13], s[4:5], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], s[8:9], 0
+; GFX940-NEXT:    s_cselect_b32 s1, s1, s9
+; GFX940-NEXT:    s_and_b64 s[14:15], s[12:13], exec
+; GFX940-NEXT:    s_cselect_b32 s1, s5, s1
+; GFX940-NEXT:    s_and_b64 s[14:15], s[2:3], exec
+; GFX940-NEXT:    s_cselect_b32 s5, s1, s9
+; GFX940-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GFX940-NEXT:    s_cselect_b32 s9, s0, s8
+; GFX940-NEXT:    s_and_b64 s[0:1], s[12:13], exec
+; GFX940-NEXT:    s_cselect_b32 s4, s4, s9
+; GFX940-NEXT:    s_and_b64 s[0:1], s[2:3], exec
+; GFX940-NEXT:    s_cselect_b32 s4, s4, s8
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use s[4:7]
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_minimum_v2f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s12, s[6:7], s[10:11]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s14, s[6:7], s[10:11]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s15, s[6:7], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s16, s[10:11], 32
+; GFX10-NEXT:    v_cmp_o_f64_e64 s18, s[4:5], s[8:9]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s19, s[4:5], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s20, s[8:9], 32
+; GFX10-NEXT:    s_and_b32 s13, s12, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s13, s7, s11
+; GFX10-NEXT:    s_and_b32 s17, s14, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s13, s13, 0x7ff80000
+; GFX10-NEXT:    s_and_b32 s12, s12, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s12, s6, s10
+; GFX10-NEXT:    s_and_b32 s14, s14, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s12, s12, 0
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s17, s[4:5], s[8:9]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s14, s[12:13], 0
+; GFX10-NEXT:    s_and_b32 s21, s15, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s7, s7, s13
+; GFX10-NEXT:    s_and_b32 s21, s16, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s7, s11, s7
+; GFX10-NEXT:    s_and_b32 s11, s14, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s7, s7, s13
+; GFX10-NEXT:    s_and_b32 s11, s15, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s6, s6, s12
+; GFX10-NEXT:    s_and_b32 s11, s16, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s6, s10, s6
+; GFX10-NEXT:    s_and_b32 s10, s14, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s6, s6, s12
+; GFX10-NEXT:    s_and_b32 s10, s17, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s10, s5, s9
+; GFX10-NEXT:    s_and_b32 s11, s18, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s11, s10, 0x7ff80000
+; GFX10-NEXT:    s_and_b32 s10, s17, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s10, s4, s8
+; GFX10-NEXT:    s_and_b32 s12, s18, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s10, s10, 0
+; GFX10-NEXT:    s_and_b32 s13, s19, exec_lo
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s12, s[10:11], 0
+; GFX10-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX10-NEXT:    s_and_b32 s13, s20, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s5, s9, s5
+; GFX10-NEXT:    s_and_b32 s9, s12, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s5, s5, s11
+; GFX10-NEXT:    s_and_b32 s9, s19, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX10-NEXT:    s_and_b32 s9, s20, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX10-NEXT:    s_and_b32 s8, s12, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s10
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use s[4:7]
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_minimum_v2f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s8, s[2:3], s[6:7]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s10, s[2:3], s[6:7]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s11, s[2:3], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s12, s[6:7], 32
+; GFX11-NEXT:    v_cmp_o_f64_e64 s14, s[0:1], s[4:5]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s15, s[0:1], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s16, s[4:5], 32
+; GFX11-NEXT:    s_and_b32 s9, s8, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s9, s3, s7
+; GFX11-NEXT:    s_and_b32 s13, s10, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s9, s9, 0x7ff80000
+; GFX11-NEXT:    s_and_b32 s8, s8, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s8, s2, s6
+; GFX11-NEXT:    s_and_b32 s10, s10, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s8, s8, 0
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s13, s[0:1], s[4:5]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s10, s[8:9], 0
+; GFX11-NEXT:    s_and_b32 s17, s11, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s3, s3, s9
+; GFX11-NEXT:    s_and_b32 s17, s12, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s3, s7, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 s7, s10, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s3, s3, s9
+; GFX11-NEXT:    s_and_b32 s7, s11, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, s2, s8
+; GFX11-NEXT:    s_and_b32 s7, s12, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX11-NEXT:    s_and_b32 s6, s10, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, s2, s8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_and_b32 s6, s13, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s6, s1, s5
+; GFX11-NEXT:    s_and_b32 s7, s14, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s7, s6, 0x7ff80000
+; GFX11-NEXT:    s_and_b32 s6, s13, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s6, s0, s4
+; GFX11-NEXT:    s_and_b32 s8, s14, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s6, s6, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_and_b32 s9, s15, exec_lo
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s8, s[6:7], 0
+; GFX11-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX11-NEXT:    s_and_b32 s9, s16, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s1, s5, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 s5, s8, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX11-NEXT:    s_and_b32 s5, s15, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX11-NEXT:    s_and_b32 s5, s16, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, s4, s0
+; GFX11-NEXT:    s_and_b32 s4, s8, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s[0:3]
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: s_minimum_v2f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[2:3], s[2:3], s[6:7]
+; GFX12-NEXT:    v_minimum_f64 v[0:1], s[0:1], s[4:5]
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ; use v[0:3]
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
+  call void asm sideeffect "; use $0", "s"(<2 x double> %op)
+  ret void
+}
+
+define <3 x double> @v_minimum_v3f64(<3 x double> %src0, <3 x double> %src1) {
+; GFX7-LABEL: v_minimum_v3f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[2:3], v[8:9]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[10:11], v[2:3], v[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v7, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v14, v12, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 32
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v9, v3, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[2:3], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v14, v6, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s[8:9]
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[8:9], v[8:9], 32
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[4:5], v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[10:11]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v11, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v10, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v14, v12, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, 0, v8, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[10:11], 32
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v3f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[2:3], v[8:9]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[10:11], v[2:3], v[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v7, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v14, v12, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 32
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v9, v3, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[2:3], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v14, v6, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s[8:9]
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[8:9], v[8:9], 32
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[4:5], v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[10:11]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v11, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v14, v12, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, v8, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[10:11], 32
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v3f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[2:3], v[8:9]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[10:11], v[2:3], v[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v7, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v14, v12, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 32
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v9, v3, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[2:3], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v14, v6, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s[8:9]
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[8:9], v[8:9], 32
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[4:5], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[10:11]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v11, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v10, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v14, v12, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, v8, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[10:11], 32
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v3f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX940-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v7, v1, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v13, v14, v12, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[6:7], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[12:13]
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v14, v6, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v8, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[8:9], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v11, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[4:5], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v14, v6, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v10, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[10:11], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v3f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s4, v[2:3], v[8:9]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s5, v[4:5], v[10:11]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[0:1], v[6:7]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s7, v[2:3], v[8:9]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s8, v[4:5], v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v9, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v11, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v8, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v10, v4, s5
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[2:3], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[4:5], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0x7ff80000, v12, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0x7ff80000, v14, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, v17, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0x7ff80000, v16, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, v18, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, v19, s8
+; GFX10-NEXT:    v_cmp_class_f64_e64 s6, v[8:9], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s7, v[6:7], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s8, v[10:11], 32
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s9, 0, v[12:13]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s10, 0, v[14:15]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s11, 0, v[16:17]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s11
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v3f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s0, v[2:3], v[8:9]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s1, v[4:5], v[10:11]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s2, v[0:1], v[6:7]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s3, v[2:3], v[8:9]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s4, v[4:5], v[10:11]
+; GFX11-NEXT:    v_dual_cndmask_b32 v12, v7, v1 :: v_dual_cndmask_b32 v17, v6, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v9, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v16, v11, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, v8, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v19, v10, v4, s1
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[2:3], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[4:5], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, 0x7ff80000, v12, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, 0x7ff80000, v14, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, 0, v17, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v17, 0x7ff80000, v16, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, 0, v18, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v16, 0, v19, s4
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[8:9], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[6:7], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s4, v[10:11], 32
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s5, 0, v[12:13]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s6, 0, v[14:15]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s7, 0, v[16:17]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v12, v0 :: v_dual_cndmask_b32 v1, v13, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s7
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v3f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[6:7]
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[8:9]
+; GFX12-NEXT:    v_minimum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <3 x double> @llvm.minimum.v3f64(<3 x double> %src0, <3 x double> %src1)
+  ret <3 x double> %op
+}
+
+define <3 x double> @v_minimum_v3f64__nnan(<3 x double> %src0, <3 x double> %src1) {
+; GFX7-LABEL: v_minimum_v3f64__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 32
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[2:3], v[8:9]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[10:11], v[10:11], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v7, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v9, v3, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v11, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v10, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[8:9], 32
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[6:7]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v3f64__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 32
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[2:3], v[8:9]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[10:11], v[10:11], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v7, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v9, v3, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v11, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v10, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[8:9], 32
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[6:7]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v3f64__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[6:7], 32
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[2:3], v[8:9]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[10:11], v[10:11], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v7, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v3, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v11, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v10, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[8:9], 32
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[6:7]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v3f64__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[6:7], 32
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v13, v7, v1, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[12:13]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v9, v3, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v8, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[8:9], 32
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[0:1]
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v11, v5, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v10, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[10:11], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[6:7]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v3f64__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s4, v[2:3], v[8:9]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s5, v[4:5], v[10:11]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s6, v[8:9], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s7, v[6:7], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s8, v[10:11], 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v9, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v11, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v8, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v10, v4, s5
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[2:3], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[4:5], 32
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s9, 0, v[12:13]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s10, 0, v[14:15]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s11, 0, v[16:17]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s11
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v3f64__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s0, v[2:3], v[8:9]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s1, v[4:5], v[10:11]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[8:9], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[6:7], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s4, v[10:11], 32
+; GFX11-NEXT:    v_dual_cndmask_b32 v13, v7, v1 :: v_dual_cndmask_b32 v12, v6, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, v9, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v17, v11, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v8, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v16, v10, v4, s1
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[2:3], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[4:5], 32
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s5, 0, v[12:13]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s6, 0, v[14:15]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s7, 0, v[16:17]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v12, v0 :: v_dual_cndmask_b32 v1, v13, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v12, v0, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v15, v3, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s7
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v3f64__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[6:7]
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[8:9]
+; GFX12-NEXT:    v_minimum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <3 x double> @llvm.minimum.v3f64(<3 x double> %src0, <3 x double> %src1)
+  ret <3 x double> %op
+}
+
+define <3 x double> @v_minimum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1) {
+; GFX7-LABEL: v_minimum_v3f64__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[4:5], v[10:11]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[2:3], v[8:9]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[10:11], v[4:5], v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v10, v4, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, v12, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v8, v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[10:11]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v3f64__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[4:5], v[10:11]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[2:3], v[8:9]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[10:11], v[4:5], v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v10, v4, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, v12, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[10:11]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v3f64__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[4:5], v[10:11]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[2:3], v[8:9]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[10:11], v[4:5], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v10, v4, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, v12, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[10:11]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v3f64__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[6:7]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX940-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, 0, v12, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v6, v1, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v8, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, 0, v7, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v10, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[4:5], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v11, v5, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v3f64__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s4, v[2:3], v[8:9]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s5, v[4:5], v[10:11]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[0:1], v[6:7]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s7, v[2:3], v[8:9]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s8, v[4:5], v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v10, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, v12, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x7ff80000, v1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0x7ff80000, v3, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v5, s8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v3f64__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s0, v[2:3], v[8:9]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s1, v[4:5], v[10:11]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s2, v[0:1], v[6:7]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s3, v[2:3], v[8:9]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s4, v[4:5], v[10:11]
+; GFX11-NEXT:    v_dual_cndmask_b32 v12, v6, v0 :: v_dual_cndmask_b32 v1, v7, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v10, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, v12, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0x7ff80000, v1, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0x7ff80000, v3, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v5, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v3f64__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[6:7]
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[8:9]
+; GFX12-NEXT:    v_minimum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <3 x double> @llvm.minimum.v3f64(<3 x double> %src0, <3 x double> %src1)
+  ret <3 x double> %op
+}
+
+define <3 x double> @v_minimum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double> %src1) {
+; GFX7-LABEL: v_minimum_v3f64__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[8:9]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[4:5], v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v3f64__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[8:9]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[4:5], v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v3f64__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[8:9]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[4:5], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v3f64__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v11, v5, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v3f64__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s4, v[2:3], v[8:9]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s5, v[4:5], v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v3f64__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s0, v[2:3], v[8:9]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s1, v[4:5], v[10:11]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v6, v0 :: v_dual_cndmask_b32 v1, v7, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v3f64__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[6:7]
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[8:9]
+; GFX12-NEXT:    v_minimum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <3 x double> @llvm.minimum.v3f64(<3 x double> %src0, <3 x double> %src1)
+  ret <3 x double> %op
+}
+
+define <4 x double> @v_minimum_v4f64(<4 x double> %src0, <4 x double> %src1) {
+; GFX7-LABEL: v_minimum_v4f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[2:3], v[10:11]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[2:3], v[10:11]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[12:13], v[4:5], v[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v9, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v18, v16, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[8:9], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, v11, v3, s[6:7]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[16:17]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[10:11], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v10, v2, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v18, v19, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[8:9]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[4:5], v[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[4:5]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, v13, v5, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v18, v10, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v15, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v12, v4, s[8:9]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v18, v10, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v14, v6, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[12:13]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[8:9], v[12:13], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, v10, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[14:15], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[10:11]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[8:9]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v4f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9]
+; GFX8-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[2:3], v[10:11]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[2:3], v[10:11]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[12:13], v[4:5], v[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v9, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v18, v16, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[8:9], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v11, v3, s[6:7]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[10:11], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v10, v2, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v18, v19, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[8:9]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[4:5], v[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[4:5]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v13, v5, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v18, v10, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v15, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v12, v4, s[8:9]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v18, v10, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v14, v6, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[12:13]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[8:9], v[12:13], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, v10, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[14:15], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[10:11]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[8:9]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v4f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9]
+; GFX9-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[2:3], v[10:11]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[2:3], v[10:11]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[12:13], v[4:5], v[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v9, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v18, v16, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[8:9], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v11, v3, s[6:7]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[16:17]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[10:11], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v10, v2, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v18, v19, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[8:9]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[4:5], v[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v13, v5, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v18, v10, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v15, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v12, v4, s[8:9]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v18, v10, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v14, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[12:13]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[8:9], v[12:13], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, v10, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[14:15], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[10:11]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[8:9]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v4f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX940-NEXT:    v_mov_b32_e32 v18, 0x7ff80000
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v9, v1, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v18, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[8:9], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v11, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v18, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[10:11], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[12:13]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v13, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[4:5], v[12:13]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v18, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v12, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[12:13], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v15, v7, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[6:7], v[14:15]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v18, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v14, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[14:15], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v4f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s4, v[2:3], v[10:11]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s5, v[0:1], v[8:9]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s6, v[4:5], v[12:13]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s7, v[2:3], v[10:11]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s8, v[6:7], v[14:15]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s9, v[4:5], v[12:13]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s10, v[6:7], v[14:15]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s11, v[14:15], 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v11, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v10, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v13, v5, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v15, v7, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0x7ff80000, v16, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, v19, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0x7ff80000, v18, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v12, v4, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, v21, s7
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, v14, v6, s8
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[2:3], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s7, v[4:5], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s8, v[6:7], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[8:9], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0x7ff80000, v20, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, v23, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, 0x7ff80000, v22, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0, v24, s10
+; GFX10-NEXT:    v_cmp_class_f64_e64 s9, v[10:11], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s10, v[12:13], 32
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s6, 0, v[16:17]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s12, 0, v[18:19]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s13, 0, v[20:21]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s14, 0, v[22:23]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s14
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v4f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s0, v[0:1], v[8:9]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s1, v[2:3], v[10:11]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s2, v[4:5], v[12:13]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s3, v[6:7], v[14:15]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s4, v[2:3], v[10:11]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s5, v[4:5], v[12:13]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s6, v[6:7], v[14:15]
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, v9, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, v11, v3, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v20, v13, v5, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v22, v15, v7, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v17, 0x7ff80000, v16, s0
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v19, 0x7ff80000, v18, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v21, 0x7ff80000, v20, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, v10, v2, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v20, v12, v4, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v24, v14, v6, s3
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[4:5], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[6:7], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v23, 0x7ff80000, v22, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, 0, v18, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v20, 0, v20, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v22, 0, v24, s6
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[12:13], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s4, v[14:15], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s5, v[8:9], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s6, v[10:11], 32
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s8, 0, v[18:19]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s9, 0, v[20:21]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s10, 0, v[22:23]
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s0
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[2:3], 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s7, 0, v[16:17]
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s8
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v4f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT:    v_minimum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT:    v_minimum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <4 x double> @llvm.minimum.v4f64(<4 x double> %src0, <4 x double> %src1)
+  ret <4 x double> %op
+}
+
+define <4 x double> @v_minimum_v4f64__nnan(<4 x double> %src0, <4 x double> %src1) {
+; GFX7-LABEL: v_minimum_v4f64__nnan:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 32
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[8:9], 32
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[10:11], v[10:11], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v17, v9, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[4:5]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[4:5], v[12:13]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[16:17]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[12:13], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v19, v11, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v18, v10, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[18:19]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v13, v5, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v12, v4, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[4:5], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[8:9]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[10:11], v[14:15], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v15, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v14, v6, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v4f64__nnan:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 32
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[8:9], 32
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[10:11], v[10:11], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v9, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[4:5]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[4:5], v[12:13]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[12:13], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v11, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v10, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[18:19]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v13, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v12, v4, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[4:5], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[8:9]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[10:11], v[14:15], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v15, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v14, v6, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v4f64__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 32
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[8:9], 32
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[10:11], v[10:11], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, v9, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[4:5]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[4:5], v[12:13]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[16:17]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[12:13], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, v11, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v10, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[18:19]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v13, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v12, v4, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[4:5], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[8:9]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[10:11], v[14:15], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v15, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v14, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v4f64__nnan:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[8:9], 32
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v17, v9, v1, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v11, v3, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[10:11], 32
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[0:1]
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[12:13]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v13, v5, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v12, v4, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[12:13], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[0:1]
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v15, v7, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v14, v6, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[14:15], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[8:9]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v4f64__nnan:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s4, v[2:3], v[10:11]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s5, v[4:5], v[12:13]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s6, v[6:7], v[14:15]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s7, v[10:11], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s8, v[8:9], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s10, v[12:13], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s11, v[14:15], 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v11, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v13, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v15, v7, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v10, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v12, v4, s5
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[4:5], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v14, v6, s6
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[6:7], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[0:1], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s6, v[2:3], 32
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s9, 0, v[16:17]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s12, 0, v[18:19]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s13, 0, v[20:21]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s14, 0, v[22:23]
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s14
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v4f64__nnan:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s0, v[2:3], v[10:11]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s1, v[4:5], v[12:13]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s2, v[6:7], v[14:15]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[6:7], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s4, v[14:15], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s5, v[8:9], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s6, v[10:11], 32
+; GFX11-NEXT:    v_dual_cndmask_b32 v17, v9, v1 :: v_dual_cndmask_b32 v16, v8, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v19, v11, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v21, v13, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, v10, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v20, v12, v4, s1
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[2:3], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[4:5], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v23, v15, v7, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v22, v14, v6, s2
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[12:13], 32
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s7, 0, v[16:17]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s8, 0, v[18:19]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s9, 0, v[20:21]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s10, 0, v[22:23]
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s4
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v16, v0 :: v_dual_cndmask_b32 v1, v17, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v20, v4, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v22, v6, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v21, v5, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v23, v7, s10
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v4f64__nnan:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT:    v_minimum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT:    v_minimum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan <4 x double> @llvm.minimum.v4f64(<4 x double> %src0, <4 x double> %src1)
+  ret <4 x double> %op
+}
+
+define <4 x double> @v_minimum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1) {
+; GFX7-LABEL: v_minimum_v4f64__nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[10:11], v[6:7], v[14:15]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[4:5], v[12:13]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[12:13], v[6:7], v[14:15]
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, v16, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[4:5]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e32 v9, v10, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, v14, v6, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, v9, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v12, v4, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, v9, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v4f64__nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[10:11], v[6:7], v[14:15]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[4:5], v[12:13]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[12:13], v[6:7], v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, v16, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[4:5]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v10, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v14, v6, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v9, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v12, v4, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, v9, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v4f64__nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[10:11], v[6:7], v[14:15]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[4:5], v[12:13]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[12:13], v[6:7], v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, v16, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v10, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v14, v6, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v12, v4, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, v9, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v4f64__nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[8:9]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX940-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v10, v2, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11]
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[12:13]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, 0, v9, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v12, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[4:5], v[12:13]
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, 0, v9, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v14, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[6:7], v[14:15]
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, 0, v9, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v4f64__nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s4, v[2:3], v[10:11]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s5, v[4:5], v[12:13]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s7, v[6:7], v[14:15]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[0:1], v[8:9]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s8, v[2:3], v[10:11]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s9, v[4:5], v[12:13]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s10, v[6:7], v[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v10, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v12, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v14, v6, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, v16, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x7ff80000, v1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v8, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0x7ff80000, v3, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v5, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0x7ff80000, v7, s10
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v4f64__nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s1, v[2:3], v[10:11]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s2, v[4:5], v[12:13]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s3, v[6:7], v[14:15]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s0, v[0:1], v[8:9]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s4, v[2:3], v[10:11]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s5, v[4:5], v[12:13]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s6, v[6:7], v[14:15]
+; GFX11-NEXT:    v_dual_cndmask_b32 v16, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v10, v2, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v12, v4, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v14, v6, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, v16, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, v8, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, v10, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0x7ff80000, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0x7ff80000, v3, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0x7ff80000, v5, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0x7ff80000, v7, s6
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v4f64__nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT:    v_minimum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT:    v_minimum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nsz <4 x double> @llvm.minimum.v4f64(<4 x double> %src0, <4 x double> %src1)
+  ret <4 x double> %op
+}
+
+define <4 x double> @v_minimum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double> %src1) {
+; GFX7-LABEL: v_minimum_v4f64__nnan_nsz:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[6:7], v[14:15]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[8:9]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v4f64__nnan_nsz:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[6:7], v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[8:9]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v4f64__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[4:5], v[12:13]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[6:7], v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s[8:9]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v4f64__nnan_nsz:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[12:13]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v4f64__nnan_nsz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s4, v[2:3], v[10:11]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s5, v[4:5], v[12:13]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s6, v[6:7], v[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s6
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v4f64__nnan_nsz:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s0, v[2:3], v[10:11]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s1, v[4:5], v[12:13]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s2, v[6:7], v[14:15]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v14, v6, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v4f64__nnan_nsz:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[8:9]
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[10:11]
+; GFX12-NEXT:    v_minimum_f64 v[4:5], v[4:5], v[12:13]
+; GFX12-NEXT:    v_minimum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call nnan nsz <4 x double> @llvm.minimum.v4f64(<4 x double> %src0, <4 x double> %src1)
+  ret <4 x double> %op
+}
+
+define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) {
+; GFX7-LABEL: v_minimum_v8f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[16:17]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[16:17]
+; GFX7-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[4:5], v[20:21]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[8:9], v[24:25]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[10:11], v[12:13], v[28:29]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[12:13], v[12:13], v[28:29]
+; GFX7-NEXT:    v_cndmask_b32_e32 v31, v17, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, v32, v31, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v31, v16, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v33, 0, v31, s[4:5]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[18:19]
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v34, v1, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[16:17], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[33:34]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v34, v1, vcc
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[18:19]
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v19, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[4:5]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[4:5], v[20:21]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[18:19], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[16:17]
+; GFX7-NEXT:    v_cndmask_b32_e64 v18, v21, v5, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, v20, v4, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[20:21], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v32, v18, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[6:7]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[6:7], v[22:23]
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s[4:5]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[16:17]
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[22:23]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s[4:5]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[8:9], v[24:25]
+; GFX7-NEXT:    v_cndmask_b32_e32 v18, v23, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v22, v6, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v18, 0, v16, s[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[22:23], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, v25, v9, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, v24, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v18, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[18:19]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[8:9], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v23, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[8:9]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[24:25], 32
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[10:11], v[26:27]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v18, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[10:11], v[26:27]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[4:5]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[16:17]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[26:27], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v18, v27, v11, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v26, v10, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v18, 0, v16, s[8:9]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[18:19]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[14:15], v[30:31]
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, v29, v13, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, v28, v12, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v18, v10, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v19, v11, vcc
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[14:15], v[30:31]
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, v10, v26, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v27, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, v18, v10, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v19, v11, s[8:9]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[12:13], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[12:13]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[8:9], v[28:29], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v18, v31, v15, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v18, v30, v14, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v18, 0, v18, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[30:31], 32
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[16:17]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[18:19]
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, v16, v12, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v17, v13, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, v12, v28, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v14, v18, v14, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v15, v19, v15, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v14, v14, v30, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v13, v29, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v15, v15, v31, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, v16, v12, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v17, v13, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v14, v18, v14, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v15, v19, v15, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v8f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[16:17]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[16:17]
+; GFX8-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[4:5], v[20:21]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[8:9], v[24:25]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[10:11], v[12:13], v[28:29]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[12:13], v[12:13], v[28:29]
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v17, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, v32, v31, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v16, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v33, 0, v31, s[4:5]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[18:19]
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v34, v1, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[16:17], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[33:34]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v34, v1, vcc
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[18:19]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v19, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[4:5]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[4:5], v[20:21]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[18:19], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, v21, v5, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v20, v4, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[20:21], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v32, v18, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[6:7]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[6:7], v[22:23]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[16:17]
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[22:23]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s[4:5]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[8:9], v[24:25]
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v23, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v22, v6, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, 0, v16, s[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[22:23], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v25, v9, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v24, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v18, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[18:19]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[8:9], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v23, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[8:9]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[24:25], 32
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[10:11], v[26:27]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v18, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[10:11], v[26:27]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[26:27], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v27, v11, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v26, v10, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, 0, v16, s[8:9]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[18:19]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[14:15], v[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v29, v13, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v28, v12, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v18, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v19, v11, vcc
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[14:15], v[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v10, v26, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v27, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v18, v10, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v19, v11, s[8:9]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[12:13], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[12:13]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[8:9], v[28:29], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v31, v15, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v30, v14, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, 0, v18, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[30:31], 32
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[16:17]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[18:19]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v16, v12, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v17, v13, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v12, v28, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v18, v14, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v19, v15, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v14, v30, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v13, v29, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v15, v31, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v16, v12, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v17, v13, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v18, v14, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v19, v15, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v8f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[16:17]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[16:17]
+; GFX9-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[4:5], v[20:21]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[8:9], v[24:25]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[10:11], v[12:13], v[28:29]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[12:13], v[12:13], v[28:29]
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v17, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, v32, v31, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v16, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v33, 0, v31, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[2:3], v[18:19]
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v34, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[16:17], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[33:34]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v33, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v34, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[18:19]
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v19, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[4:5]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[4:5], v[20:21]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[18:19], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[16:17]
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, v21, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v20, v4, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[20:21], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v32, v18, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[6:7]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[6:7], v[22:23]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[16:17]
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[22:23]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s[4:5]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[8:9], v[24:25]
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v23, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v22, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, 0, v16, s[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[22:23], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v25, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v24, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v18, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[18:19]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[8:9], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v23, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[8:9]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[24:25], 32
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[10:11], v[26:27]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v18, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[10:11], v[26:27]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[16:17]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[26:27], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v27, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v26, v10, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, 0, v16, s[8:9]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[8:9], 0, v[18:19]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[14:15], v[30:31]
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v29, v13, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v28, v12, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v18, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v19, v11, vcc
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[14:15], v[30:31]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v26, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v27, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v18, v10, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v19, v11, s[8:9]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[12:13], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[12:13]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[8:9], v[28:29], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v31, v15, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v32, v18, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v30, v14, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, 0, v18, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[30:31], 32
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[16:17]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[18:19]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v16, v12, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v17, v13, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v28, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v18, v14, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v19, v15, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, v30, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v29, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v15, v31, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v16, v12, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v17, v13, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v18, v14, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v19, v15, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v8f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    scratch_load_dword v31, off, s32
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[16:17]
+; GFX940-NEXT:    v_mov_b32_e32 v32, 0x7ff80000
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v33, v17, v1, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v35, v32, v33, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v33, v16, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v34, 0, v33, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[16:17], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[34:35]
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v34, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v35, v1, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[18:19]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v19, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[18:19]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v34, v0, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v35, v1, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[18:19], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[20:21]
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v16, v2, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v21, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[4:5], v[20:21]
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v17, v3, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v20, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[20:21], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[22:23]
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v16, v4, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v23, v7, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[6:7], v[22:23]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v17, v5, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v22, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[22:23], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v17, v7, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[8:9], v[24:25]
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, v16, v6, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v7, v23, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v25, v9, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[8:9], v[24:25]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v17, v7, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v24, v8, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[8:9], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[24:25], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[10:11], v[26:27]
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v27, v11, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[10:11], v[26:27]
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v26, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[26:27], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v10, v16, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v10, v10, v26, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v11, v17, v11, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[12:13], v[28:29]
+; GFX940-NEXT:    v_cndmask_b32_e64 v10, v16, v10, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v11, v11, v27, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v29, v13, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[12:13], v[28:29]
+; GFX940-NEXT:    v_cndmask_b32_e64 v11, v17, v11, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v28, v12, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[12:13], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[28:29], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v12, v12, v28, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v13, v17, v13, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[14:15], v[30:31]
+; GFX940-NEXT:    v_cndmask_b32_e64 v12, v16, v12, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v13, v13, v29, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v31, v15, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[14:15], v[30:31]
+; GFX940-NEXT:    v_cndmask_b32_e64 v13, v17, v13, s[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v32, v16, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v30, v14, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[30:31], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[16:17]
+; GFX940-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v15, v17, v15, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v14, v14, v30, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v15, v15, v31, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v14, v16, v14, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v15, v17, v15, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v8f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[16:17]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s4, v[0:1], v[16:17]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s9, v[6:7], v[22:23]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s10, v[8:9], v[24:25]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s11, v[10:11], v[26:27]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s12, v[12:13], v[28:29]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s13, v[6:7], v[22:23]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s14, v[8:9], v[24:25]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s15, v[10:11], v[26:27]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s16, v[12:13], v[28:29]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s5, v[2:3], v[18:19]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[2:3], v[18:19]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s7, v[4:5], v[20:21]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s8, v[4:5], v[20:21]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s17, v[26:27], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s18, v[28:29], 32
+; GFX10-NEXT:    v_cndmask_b32_e32 v32, v17, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v38, v23, v7, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, v25, v9, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, v27, v11, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v33, 0x7ff80000, v32, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v32, v16, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v52, v29, v13, s12
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[0:1], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v39, 0x7ff80000, v38, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, 0x7ff80000, v48, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v32, 0, v32, s4
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[2:3], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, 0x7ff80000, v50, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v53, 0x7ff80000, v52, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v38, v22, v6, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, v24, v8, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, v26, v10, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v52, v28, v12, s12
+; GFX10-NEXT:    v_cmp_class_f64_e64 s11, v[16:17], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s12, v[18:19], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, v19, v3, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v38, 0, v38, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v21, v5, s7
+; GFX10-NEXT:    v_cmp_class_f64_e64 s9, v[12:13], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v35, 0x7ff80000, v34, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, v18, v2, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v37, 0x7ff80000, v36, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, v20, v4, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v32, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[4:5], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s6
+; GFX10-NEXT:    v_cmp_class_f64_e64 s6, v[6:7], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, 0, v36, s8
+; GFX10-NEXT:    v_cmp_class_f64_e64 s7, v[8:9], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s8, v[10:11], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v34, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, 0, v50, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v52, 0, v52, s16
+; GFX10-NEXT:    v_cmp_class_f64_e64 s14, v[20:21], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s12
+; GFX10-NEXT:    v_cmp_class_f64_e64 s15, v[22:23], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s16, v[24:25], 32
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s19, 0, v[32:33]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s20, 0, v[34:35]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s21, 0, v[36:37]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s22, 0, v[48:49]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s23, 0, v[50:51]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s24, 0, v[52:53]
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v33, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v52, v12, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v36, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v35, v3, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v38, v6, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v37, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v48, v8, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v50, v10, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v39, v7, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v49, v9, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v51, v11, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v53, v13, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v26, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v22, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v28, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v23, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v27, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v13, v29, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v32, v0, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v34, v2, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v36, v4, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v48, v8, s22
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v50, v10, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v52, v12, s24
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v33, v1, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v35, v3, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v37, v5, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v49, v9, s22
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v51, v11, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v53, v13, s24
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s10, v[14:15], v[30:31]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s13, v[14:15], v[30:31]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s25, v[30:31], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v31, v15, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v30, v14, s10
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s10, 0, v[38:39]
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, 0x7ff80000, v16, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v54, 0, v18, s13
+; GFX10-NEXT:    v_cmp_class_f64_e64 s13, v[14:15], 32
+; GFX10-NEXT:    v_cmp_eq_f64_e32 vcc_lo, 0, v[54:55]
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v38, v6, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v39, v7, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v54, v14, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v55, v15, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v14, v30, s25
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v15, v31, s25
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v54, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v55, v15, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v8f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s4, v[6:7], v[22:23]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s9, v[6:7], v[22:23]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s1, v[2:3], v[18:19]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s6, v[10:11], v[26:27]
+; GFX11-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[16:17]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s2, v[2:3], v[18:19]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s11, v[10:11], v[26:27]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s0, v[0:1], v[16:17]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s3, v[4:5], v[20:21]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s5, v[8:9], v[24:25]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s7, v[12:13], v[28:29]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s8, v[4:5], v[20:21]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s10, v[8:9], v[24:25]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s12, v[12:13], v[28:29]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s13, v[18:19], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s15, v[20:21], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v38, v23, v7, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v34, v19, v3, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v50, v27, v11, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v39, 0x7ff80000, v38, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v38, v22, v6, s4
+; GFX11-NEXT:    v_cmp_class_f64_e64 s4, v[6:7], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v35, 0x7ff80000, v34, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v51, 0x7ff80000, v50, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v34, v18, v2, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v50, v26, v10, s6
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[0:1], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v36, v21, v5, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v48, v25, v9, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v52, v29, v13, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v50, 0, v50, s11
+; GFX11-NEXT:    v_cmp_class_f64_e64 s11, v[16:17], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v37, 0x7ff80000, v36, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v49, 0x7ff80000, v48, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v53, 0x7ff80000, v52, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v36, v20, v4, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v48, v24, v8, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v52, v28, v12, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v38, 0, v38, s9
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[2:3], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[4:5], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s5, v[8:9], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s7, v[10:11], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s9, v[12:13], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v36, 0, v36, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v52, 0, v52, s12
+; GFX11-NEXT:    v_cmp_class_f64_e64 s6, v[24:25], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s8, v[26:27], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s10, v[28:29], 32
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s14, 0, v[34:35]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s16, 0, v[36:37]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s17, 0, v[38:39]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s18, 0, v[48:49]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s20, 0, v[50:51]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s21, 0, v[52:53]
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v39, v7, s4
+; GFX11-NEXT:    v_cndmask_b32_e32 v32, v17, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v38, v6, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v33, 0x7ff80000, v32, s0
+; GFX11-NEXT:    v_cndmask_b32_e32 v32, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v33, v1, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v32, 0, v32, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v32, v0, s1
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s12, 0, v[32:33]
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v34, v2, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v36, v4, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v48, v8, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v50, v10, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v52, v12, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v35, v3, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v37, v5, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v49, v9, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v51, v11, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v53, v13, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v24, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v10, v26, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v12, v28, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v21, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v9, v25, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v11, v27, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v13, v29, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v34, v2, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v36, v4, s16
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v48, v8, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v50, v10, s20
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v52, v12, s21
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v35, v3, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v37, v5, s16
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v49, v9, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v51, v11, s20
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v53, v13, s21
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v32, v0, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v33, v1, s12
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[14:15], v[30:31]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s0, v[14:15], v[30:31]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s19, v[30:31], 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v54, v31, v15, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, v30, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[22:23], 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v55, 0x7ff80000, v54, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v54, 0, v16, s0
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[14:15], 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s22, 0, v[54:55]
+; GFX11-NEXT:    v_dual_cndmask_b32 v7, v7, v23 :: v_dual_cndmask_b32 v6, v6, v22
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v54, v14, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, v55, v15, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v38, v6, s17
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v39, v7, s17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v14, v30, s19
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, v15, v31, s19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v54, v14, s22
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, v55, v15, s22
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v8f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v31, off, s32
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[16:17]
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[18:19]
+; GFX12-NEXT:    v_minimum_f64 v[4:5], v[4:5], v[20:21]
+; GFX12-NEXT:    v_minimum_f64 v[6:7], v[6:7], v[22:23]
+; GFX12-NEXT:    v_minimum_f64 v[8:9], v[8:9], v[24:25]
+; GFX12-NEXT:    v_minimum_f64 v[10:11], v[10:11], v[26:27]
+; GFX12-NEXT:    v_minimum_f64 v[12:13], v[12:13], v[28:29]
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[14:15], v[14:15], v[30:31]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <8 x double> @llvm.minimum.v8f64(<8 x double> %src0, <8 x double> %src1)
+  ret <8 x double> %op
+}
+
+define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) {
+; GFX7-LABEL: v_minimum_v16f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
+; GFX7-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
+; GFX7-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    v_mov_b32_e32 v39, 0x7ff80000
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[31:32]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[31:32]
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[2:3], v[33:34]
+; GFX7-NEXT:    v_cndmask_b32_e32 v48, v32, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v49, v39, v48, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v48, v31, v0, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[31:32], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v48, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v49, v1, vcc
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[33:34]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v31, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v32, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v50, v34, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v31, v33, v2, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v32, v39, v50, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v31, 0, v31, s[6:7]
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[4:5], v[35:36]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v31, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[35:36]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v33, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v34, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[35:36], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v50, v36, v5, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v33, v35, v4, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, v39, v50, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v33, 0, v33, s[6:7]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[6:7], v[37:38]
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v33, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v34, v5, vcc
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[37:38]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v35, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v36, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v50, v38, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v35, v37, v6, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v36, v39, v50, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v35, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v36, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v37, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v38, s[4:5]
+; GFX7-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v48, v0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v49, v1, vcc
+; GFX7-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[31:32]
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v31, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[33:34]
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v33, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v34, v5, vcc
+; GFX7-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[35:36]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, v35, v6, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v36, v7, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[8:9], 32
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[8:9], v[37:38]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[8:9], v[37:38]
+; GFX7-NEXT:    v_cndmask_b32_e64 v50, v38, v9, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, v37, v8, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v36, v39, v50, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v8, v35, v8, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[35:36]
+; GFX7-NEXT:    v_cndmask_b32_e32 v9, v36, v9, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[10:11], v[48:49]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[10:11], v[48:49]
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v8, v37, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v9, v38, s[4:5]
+; GFX7-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    v_cndmask_b32_e64 v8, v35, v8, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v50, v49, v11, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v35, v48, v10, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 32
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[48:49], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v51, v39, v50, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v50, 0, v35, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v9, v36, v9, s[6:7]
+; GFX7-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80
+; GFX7-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:76
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[12:13], v[31:32]
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v10, v10, v48, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v49, s[4:5]
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[50:51]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[12:13], v[31:32]
+; GFX7-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, v32, v13, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[12:13], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v49, v39, v48, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, v31, v12, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[31:32], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[14:15], v[33:34]
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v48, v12, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v49, v13, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX7-NEXT:    v_cndmask_b32_e64 v12, v12, v31, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v13, v13, v32, s[4:5]
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[14:15], v[33:34]
+; GFX7-NEXT:    v_cndmask_b32_e32 v12, v48, v12, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v13, v49, v13, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v50, v34, v15, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, v33, v14, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v49, v39, v50, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v14, v48, v14, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v15, v49, v15, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX7-NEXT:    v_cndmask_b32_e64 v14, v14, v33, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v15, v15, v34, s[4:5]
+; GFX7-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GFX7-NEXT:    v_cndmask_b32_e32 v14, v48, v14, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v15, v49, v15, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[16:17], 32
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[16:17], v[37:38]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[16:17], v[37:38]
+; GFX7-NEXT:    v_cndmask_b32_e64 v50, v38, v17, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, v37, v16, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v49, v39, v50, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v48, v16, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v17, v49, v17, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[18:19], v[35:36]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[18:19], v[35:36]
+; GFX7-NEXT:    v_cndmask_b32_e64 v16, v16, v37, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v17, v17, v38, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v50, v36, v19, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v37, v35, v18, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[18:19], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v51, v39, v50, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v50, 0, v37, s[6:7]
+; GFX7-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:100
+; GFX7-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:104
+; GFX7-NEXT:    v_cndmask_b32_e32 v16, v48, v16, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v17, v49, v17, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[35:36], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v18, v50, v18, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, v51, v19, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[20:21], v[31:32]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[20:21], v[31:32]
+; GFX7-NEXT:    v_cndmask_b32_e32 v18, v18, v35, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v19, v19, v36, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[50:51]
+; GFX7-NEXT:    v_cndmask_b32_e64 v48, v32, v21, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, v31, v20, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[20:21], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v36, v39, v48, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v18, v50, v18, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v19, v51, v19, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[31:32], 32
+; GFX7-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:108
+; GFX7-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:120
+; GFX7-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:116
+; GFX7-NEXT:    v_cndmask_b32_e64 v20, v35, v20, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v21, v36, v21, s[4:5]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[35:36]
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[22:23], v[33:34]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[22:23], v[33:34]
+; GFX7-NEXT:    v_cndmask_b32_e32 v20, v20, v31, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v21, v21, v32, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[22:23], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v20, v35, v20, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v21, v36, v21, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v31, v34, v23, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v36, v39, v31, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v31, v33, v22, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, 0, v31, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v22, v35, v22, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v23, v36, v23, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[35:36]
+; GFX7-NEXT:    v_cndmask_b32_e64 v22, v22, v33, s[4:5]
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, v23, v34, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v22, v35, v22, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v23, v36, v23, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[24:25], 32
+; GFX7-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[24:25], v[37:38]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[6:7], v[24:25], v[37:38]
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, v38, v25, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, v39, v34, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, v37, v24, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e32 v24, v34, v24, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v25, v35, v25, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[37:38], 32
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[34:35]
+; GFX7-NEXT:    v_cndmask_b32_e32 v24, v24, v37, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v25, v25, v38, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[26:27], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v24, v34, v24, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v25, v35, v25, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[26:27], v[48:49]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[8:9], v[26:27], v[48:49]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[48:49], 32
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[10:11], v[28:29], v[50:51]
+; GFX7-NEXT:    v_cndmask_b32_e64 v36, v49, v27, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, v48, v26, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, v39, v36, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[8:9]
+; GFX7-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[28:29], v[50:51]
+; GFX7-NEXT:    v_cndmask_b32_e32 v26, v34, v26, vcc
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[34:35]
+; GFX7-NEXT:    v_cndmask_b32_e32 v27, v35, v27, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v26, v26, v48, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v27, v27, v49, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v36, v51, v29, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v27, v35, v27, s[6:7]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_lt_f64_e32 vcc, v[30:31], v[32:33]
+; GFX7-NEXT:    v_cmp_o_f64_e64 s[4:5], v[30:31], v[32:33]
+; GFX7-NEXT:    v_cndmask_b32_e64 v35, v39, v36, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v26, v34, v26, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, v50, v28, s[8:9]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[6:7], v[28:29], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[10:11]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[8:9], v[50:51], 32
+; GFX7-NEXT:    v_cndmask_b32_e32 v36, v33, v31, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v37, v39, v36, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e32 v36, v32, v30, vcc
+; GFX7-NEXT:    v_cmp_class_f64_e64 vcc, v[30:31], 32
+; GFX7-NEXT:    v_cndmask_b32_e64 v36, 0, v36, s[4:5]
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[32:33], 32
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[34:35]
+; GFX7-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[36:37]
+; GFX7-NEXT:    v_cndmask_b32_e64 v28, v34, v28, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v29, v35, v29, s[6:7]
+; GFX7-NEXT:    v_cndmask_b32_e64 v28, v28, v50, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e32 v30, v36, v30, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v30, v30, v32, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v29, v29, v51, s[8:9]
+; GFX7-NEXT:    v_cndmask_b32_e64 v31, v31, v33, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v28, v34, v28, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v29, v35, v29, s[10:11]
+; GFX7-NEXT:    v_cndmask_b32_e64 v30, v36, v30, s[12:13]
+; GFX7-NEXT:    v_cndmask_b32_e64 v31, v37, v31, s[12:13]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimum_v16f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX8-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
+; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GFX8-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
+; GFX8-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; GFX8-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
+; GFX8-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:28
+; GFX8-NEXT:    v_mov_b32_e32 v39, 0x7ff80000
+; GFX8-NEXT:    s_waitcnt vmcnt(6)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[31:32]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[31:32]
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[2:3], v[33:34]
+; GFX8-NEXT:    v_cndmask_b32_e32 v48, v32, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v49, v39, v48, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v48, v31, v0, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[31:32], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v48, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v49, v1, vcc
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[33:34]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v31, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v32, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v34, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v33, v2, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v32, v39, v50, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, 0, v31, s[6:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[4:5], v[35:36]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v31, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[35:36]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v33, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v34, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[35:36], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v36, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v33, v35, v4, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, v39, v50, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v33, 0, v33, s[6:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[6:7], v[37:38]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v33, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v34, v5, vcc
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[37:38]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v35, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v36, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v38, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v37, v6, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, v39, v50, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v35, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v36, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v37, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v38, s[4:5]
+; GFX8-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36
+; GFX8-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:40
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v48, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v49, v1, vcc
+; GFX8-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:48
+; GFX8-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:44
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[31:32]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v31, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[33:34]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v33, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v34, v5, vcc
+; GFX8-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
+; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[35:36]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v35, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v36, v7, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[8:9], 32
+; GFX8-NEXT:    s_waitcnt vmcnt(6)
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[8:9], v[37:38]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[8:9], v[37:38]
+; GFX8-NEXT:    v_cndmask_b32_e64 v50, v38, v9, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, v37, v8, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, v39, v50, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v35, v8, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[35:36]
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v36, v9, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[10:11], v[48:49]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[10:11], v[48:49]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v37, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v38, s[4:5]
+; GFX8-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:68
+; GFX8-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:72
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v35, v8, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v49, v11, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v35, v48, v10, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 32
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[48:49], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v51, v39, v50, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v50, 0, v35, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v36, v9, s[6:7]
+; GFX8-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80
+; GFX8-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:76
+; GFX8-NEXT:    s_waitcnt vmcnt(6)
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[12:13], v[31:32]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v10, v48, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v49, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[50:51]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[12:13], v[31:32]
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, v32, v13, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[12:13], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v49, v39, v48, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, v31, v12, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[31:32], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[14:15], v[33:34]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v48, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v49, v13, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v12, v31, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v13, v32, s[4:5]
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[14:15], v[33:34]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v48, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v49, v13, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v50, v34, v15, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, v33, v14, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v49, v39, v50, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v48, v14, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v49, v15, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v14, v33, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v15, v34, s[4:5]
+; GFX8-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:96
+; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v48, v14, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v49, v15, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[16:17], 32
+; GFX8-NEXT:    s_waitcnt vmcnt(6)
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[16:17], v[37:38]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[16:17], v[37:38]
+; GFX8-NEXT:    v_cndmask_b32_e64 v50, v38, v17, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, v37, v16, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v49, v39, v50, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v48, v16, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v49, v17, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[18:19], v[35:36]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[18:19], v[35:36]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v16, v37, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v17, v38, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v50, v36, v19, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v37, v35, v18, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[18:19], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v51, v39, v50, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v50, 0, v37, s[6:7]
+; GFX8-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:100
+; GFX8-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:104
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v48, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v49, v17, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[35:36], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, v50, v18, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v51, v19, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[20:21], v[31:32]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[20:21], v[31:32]
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v18, v35, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v36, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[50:51]
+; GFX8-NEXT:    v_cndmask_b32_e64 v48, v32, v21, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, v31, v20, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[20:21], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, v39, v48, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v18, v50, v18, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v51, v19, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[31:32], 32
+; GFX8-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:112
+; GFX8-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:108
+; GFX8-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:120
+; GFX8-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:116
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, v35, v20, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, v36, v21, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[35:36]
+; GFX8-NEXT:    s_waitcnt vmcnt(6)
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[22:23], v[33:34]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[22:23], v[33:34]
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v31, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v32, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[22:23], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, v35, v20, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, v36, v21, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v34, v23, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, v39, v31, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v33, v22, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, 0, v31, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v35, v22, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v36, v23, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[35:36]
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, v22, v33, s[4:5]
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, v23, v34, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v35, v22, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v36, v23, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[24:25], 32
+; GFX8-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[24:25], v[37:38]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[6:7], v[24:25], v[37:38]
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, v38, v25, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, v39, v34, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, v37, v24, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v24, v34, v24, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v25, v35, v25, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[37:38], 32
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[34:35]
+; GFX8-NEXT:    v_cndmask_b32_e32 v24, v24, v37, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v25, v25, v38, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[26:27], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, v34, v24, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v25, v35, v25, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(5)
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[26:27], v[48:49]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[8:9], v[26:27], v[48:49]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[48:49], 32
+; GFX8-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[10:11], v[28:29], v[50:51]
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, v49, v27, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, v48, v26, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, v39, v36, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[8:9]
+; GFX8-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[28:29], v[50:51]
+; GFX8-NEXT:    v_cndmask_b32_e32 v26, v34, v26, vcc
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[34:35]
+; GFX8-NEXT:    v_cndmask_b32_e32 v27, v35, v27, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v26, v26, v48, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v27, v27, v49, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, v51, v29, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v27, v35, v27, s[6:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f64_e32 vcc, v[30:31], v[32:33]
+; GFX8-NEXT:    v_cmp_o_f64_e64 s[4:5], v[30:31], v[32:33]
+; GFX8-NEXT:    v_cndmask_b32_e64 v35, v39, v36, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v26, v34, v26, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, v50, v28, s[8:9]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[28:29], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[10:11]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[8:9], v[50:51], 32
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v33, v31, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v37, v39, v36, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v36, v32, v30, vcc
+; GFX8-NEXT:    v_cmp_class_f64_e64 vcc, v[30:31], 32
+; GFX8-NEXT:    v_cndmask_b32_e64 v36, 0, v36, s[4:5]
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[32:33], 32
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[34:35]
+; GFX8-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[36:37]
+; GFX8-NEXT:    v_cndmask_b32_e64 v28, v34, v28, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v35, v29, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v28, v28, v50, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e32 v30, v36, v30, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v30, v30, v32, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v29, v51, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v31, v33, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v28, v34, v28, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v35, v29, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v30, v36, v30, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v31, v37, v31, s[12:13]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimum_v16f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    v_mov_b32_e32 v39, 0x7ff80000
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[31:32]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[0:1], v[31:32]
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[2:3], v[33:34]
+; GFX9-NEXT:    v_cndmask_b32_e32 v48, v32, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v49, v39, v48, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v48, v31, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[31:32], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v48, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v49, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[33:34]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v31, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v32, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v50, v34, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v33, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v32, v39, v50, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v31, 0, v31, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[4:5], v[35:36]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v31, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[35:36]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v33, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v34, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[35:36], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v50, v36, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v33, v35, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, v39, v50, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v33, 0, v33, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[6:7], v[37:38]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v33, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v34, v5, vcc
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[37:38]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v35, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v36, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v50, v38, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v35, v37, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v36, v39, v50, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v35, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v36, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v37, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v38, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v48, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v49, v1, vcc
+; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[31:32]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v31, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[33:34]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v33, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v34, v5, vcc
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[35:36]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v35, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v36, v7, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[8:9], 32
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[8:9], v[37:38]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[8:9], v[37:38]
+; GFX9-NEXT:    v_cndmask_b32_e64 v50, v38, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, v37, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v36, v39, v50, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v35, v8, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[35:36]
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v36, v9, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[10:11], v[48:49]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[10:11], v[48:49]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v37, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v38, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v35, v8, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v50, v49, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v35, v48, v10, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 32
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[48:49], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v51, v39, v50, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v50, 0, v35, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v36, v9, s[6:7]
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[12:13], v[31:32]
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v48, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v49, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[50:51]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[12:13], v[31:32]
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v50, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, v32, v13, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v51, v11, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[12:13], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v49, v39, v48, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, v31, v12, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[31:32], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[14:15], v[33:34]
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v48, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v49, v13, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v31, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v32, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[14:15], v[33:34]
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v48, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v49, v13, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v50, v34, v15, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, v33, v14, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v49, v39, v50, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v48, v14, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v49, v15, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, v33, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v15, v34, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:96
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v48, v14, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v49, v15, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[16:17], 32
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[16:17], v[37:38]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[16:17], v[37:38]
+; GFX9-NEXT:    v_cndmask_b32_e64 v50, v38, v17, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, v37, v16, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v49, v39, v50, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v48, v16, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[37:38], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, v49, v17, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[18:19], v[35:36]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[18:19], v[35:36]
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v37, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v17, v38, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v50, v36, v19, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v37, v35, v18, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[48:49]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[18:19], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v51, v39, v50, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v50, 0, v37, s[6:7]
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:104
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v48, v16, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, v49, v17, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[35:36], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, v50, v18, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v51, v19, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[20:21], v[31:32]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[20:21], v[31:32]
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v18, v35, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, v19, v36, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[50:51]
+; GFX9-NEXT:    v_cndmask_b32_e64 v48, v32, v21, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, v31, v20, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[20:21], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v36, v39, v48, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v50, v18, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, v51, v19, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[31:32], 32
+; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:112
+; GFX9-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:120
+; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, v35, v20, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, v36, v21, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[35:36]
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[22:23], v[33:34]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[22:23], v[33:34]
+; GFX9-NEXT:    v_cndmask_b32_e32 v20, v20, v31, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v21, v21, v32, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[22:23], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, v35, v20, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, v36, v21, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[33:34], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v31, v34, v23, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v36, v39, v31, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v31, v33, v22, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, 0, v31, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, v35, v22, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v23, v36, v23, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[35:36]
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, v22, v33, s[4:5]
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, v23, v34, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, v35, v22, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v23, v36, v23, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[24:25], 32
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[24:25], v[37:38]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[6:7], v[24:25], v[37:38]
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, v38, v25, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, v39, v34, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, v37, v24, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v24, v34, v24, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v25, v35, v25, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[37:38], 32
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[4:5], 0, v[34:35]
+; GFX9-NEXT:    v_cndmask_b32_e32 v24, v24, v37, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v25, v25, v38, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[26:27], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, v34, v24, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v25, v35, v25, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[26:27], v[48:49]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[8:9], v[26:27], v[48:49]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[48:49], 32
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[10:11], v[28:29], v[50:51]
+; GFX9-NEXT:    v_cndmask_b32_e64 v36, v49, v27, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, v48, v26, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, v39, v36, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[8:9]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[8:9], v[28:29], v[50:51]
+; GFX9-NEXT:    v_cndmask_b32_e32 v26, v34, v26, vcc
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[6:7], 0, v[34:35]
+; GFX9-NEXT:    v_cndmask_b32_e32 v27, v35, v27, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v26, v26, v48, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v27, v27, v49, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v36, v51, v29, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v27, v35, v27, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e32 vcc, v[30:31], v[32:33]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[4:5], v[30:31], v[32:33]
+; GFX9-NEXT:    v_cndmask_b32_e64 v35, v39, v36, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v26, v34, v26, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, v50, v28, s[8:9]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[6:7], v[28:29], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[10:11]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[8:9], v[50:51], 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v36, v33, v31, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v37, v39, v36, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v36, v32, v30, vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 vcc, v[30:31], 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v36, 0, v36, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[4:5], v[32:33], 32
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[10:11], 0, v[34:35]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[12:13], 0, v[36:37]
+; GFX9-NEXT:    v_cndmask_b32_e64 v28, v34, v28, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v29, v35, v29, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v28, v28, v50, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v30, v36, v30, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v37, v31, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v30, v30, v32, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v29, v29, v51, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v31, v31, v33, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v28, v34, v28, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v29, v35, v29, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v30, v36, v30, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v31, v37, v31, s[12:13]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_minimum_v16f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a2, v42 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a4, v44 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a7, v47 ; Reload Reuse
+; GFX940-NEXT:    scratch_load_dword v41, off, s32 offset:8
+; GFX940-NEXT:    scratch_load_dword v40, off, s32 offset:4
+; GFX940-NEXT:    scratch_load_dword v51, off, s32 offset:16
+; GFX940-NEXT:    scratch_load_dword v50, off, s32 offset:12
+; GFX940-NEXT:    scratch_load_dword v45, off, s32 offset:24
+; GFX940-NEXT:    scratch_load_dword v44, off, s32 offset:20
+; GFX940-NEXT:    scratch_load_dword v47, off, s32 offset:32
+; GFX940-NEXT:    scratch_load_dword v46, off, s32 offset:28
+; GFX940-NEXT:    scratch_load_dword v31, off, s32
+; GFX940-NEXT:    scratch_load_dword v33, off, s32 offset:128
+; GFX940-NEXT:    scratch_load_dword v32, off, s32 offset:124
+; GFX940-NEXT:    scratch_load_dword v35, off, s32 offset:120
+; GFX940-NEXT:    scratch_load_dword v34, off, s32 offset:116
+; GFX940-NEXT:    scratch_load_dword v43, off, s32 offset:40
+; GFX940-NEXT:    scratch_load_dword v42, off, s32 offset:36
+; GFX940-NEXT:    scratch_load_dword v37, off, s32 offset:112
+; GFX940-NEXT:    scratch_load_dword v36, off, s32 offset:108
+; GFX940-NEXT:    scratch_load_dword v39, off, s32 offset:104
+; GFX940-NEXT:    scratch_load_dword v38, off, s32 offset:100
+; GFX940-NEXT:    scratch_load_dword v49, off, s32 offset:96
+; GFX940-NEXT:    scratch_load_dword v48, off, s32 offset:92
+; GFX940-NEXT:    scratch_load_dword v53, off, s32 offset:56
+; GFX940-NEXT:    scratch_load_dword v52, off, s32 offset:52
+; GFX940-NEXT:    scratch_load_dword v55, off, s32 offset:48
+; GFX940-NEXT:    scratch_load_dword v54, off, s32 offset:44
+; GFX940-NEXT:    v_accvgpr_write_b32 a8, v56 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a9, v57 ; Reload Reuse
+; GFX940-NEXT:    v_mov_b32_e32 v56, 0x7ff80000
+; GFX940-NEXT:    v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a12, v60 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_write_b32 a13, v61 ; Reload Reuse
+; GFX940-NEXT:    s_waitcnt vmcnt(23)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[40:41]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v57, v41, v1, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[0:1], v[40:41]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e64 v59, v56, v57, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v57, v40, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v58, 0, v57, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[0:1], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[40:41], 32
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v58, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v59, v1, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(21)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[50:51]
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v0, v40, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v1, v1, v41, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v57, v51, v3, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[2:3], v[50:51]
+; GFX940-NEXT:    v_cndmask_b32_e32 v40, v50, v2, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[2:3], 32
+; GFX940-NEXT:    v_cndmask_b32_e64 v61, v56, v57, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v60, 0, v40, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v60, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v61, v3, vcc
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[50:51], 32
+; GFX940-NEXT:    scratch_load_dword v41, off, s32 offset:64
+; GFX940-NEXT:    scratch_load_dword v40, off, s32 offset:60
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v2, v50, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v3, v51, vcc
+; GFX940-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[58:59]
+; GFX940-NEXT:    s_waitcnt vmcnt(21)
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[4:5], v[44:45]
+; GFX940-NEXT:    scratch_load_dword v51, off, s32 offset:88
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v58, v0, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v59, v1, vcc
+; GFX940-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[60:61]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v60, v2, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v61, v3, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[4:5], v[44:45]
+; GFX940-NEXT:    v_accvgpr_read_b32 v61, a13 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v60, a12 ; Reload Reuse
+; GFX940-NEXT:    v_cndmask_b32_e32 v50, v45, v5, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v59, v56, v50, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v50, v44, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v58, 0, v50, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[4:5], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[44:45], 32
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v58, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v59, v5, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v4, v44, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v5, v5, v45, s[0:1]
+; GFX940-NEXT:    scratch_load_dword v45, off, s32 offset:72
+; GFX940-NEXT:    scratch_load_dword v44, off, s32 offset:68
+; GFX940-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[58:59]
+; GFX940-NEXT:    s_waitcnt vmcnt(22)
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[6:7], v[46:47]
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v58, v4, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v59, v5, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[6:7], v[46:47]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v50, v47, v7, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v59, v56, v50, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v50, v46, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v58, 0, v50, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[6:7], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[46:47], 32
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v58, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v59, v7, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v6, v6, v46, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v7, v7, v47, s[0:1]
+; GFX940-NEXT:    scratch_load_dword v47, off, s32 offset:80
+; GFX940-NEXT:    scratch_load_dword v46, off, s32 offset:76
+; GFX940-NEXT:    scratch_load_dword v50, off, s32 offset:84
+; GFX940-NEXT:    v_cmp_eq_f64_e32 vcc, 0, v[58:59]
+; GFX940-NEXT:    s_waitcnt vmcnt(18)
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[8:9], v[42:43]
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v58, v6, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v7, v59, v7, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[8:9], v[42:43]
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v57, v43, v9, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v59, v56, v57, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v57, v42, v8, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v58, 0, v57, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[8:9], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[42:43], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[58:59]
+; GFX940-NEXT:    v_cndmask_b32_e32 v8, v58, v8, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v9, v59, v9, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(8)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[10:11], v[54:55]
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, v8, v42, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v9, v43, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v42, v55, v11, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[10:11], v[54:55]
+; GFX940-NEXT:    v_cndmask_b32_e64 v8, v58, v8, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v9, v59, v9, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v43, v56, v42, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v42, v54, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v42, 0, v42, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[10:11], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[54:55], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[42:43]
+; GFX940-NEXT:    v_cndmask_b32_e32 v10, v42, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v11, v43, v11, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[12:13], v[52:53]
+; GFX940-NEXT:    v_cndmask_b32_e64 v10, v10, v54, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v11, v11, v55, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v54, v53, v13, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[12:13], v[52:53]
+; GFX940-NEXT:    v_cndmask_b32_e64 v10, v42, v10, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v11, v43, v11, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v55, v56, v54, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v54, v52, v12, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v54, 0, v54, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[12:13], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[52:53], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[54:55]
+; GFX940-NEXT:    v_cndmask_b32_e32 v12, v54, v12, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v13, v55, v13, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(6)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[14:15], v[40:41]
+; GFX940-NEXT:    v_cndmask_b32_e64 v12, v12, v52, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v13, v13, v53, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v41, v15, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[14:15], v[40:41]
+; GFX940-NEXT:    v_cndmask_b32_e64 v12, v54, v12, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v13, v55, v13, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v53, v56, v52, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v40, v14, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v52, 0, v52, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[14:15], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[40:41], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[52:53]
+; GFX940-NEXT:    v_cndmask_b32_e32 v14, v52, v14, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v14, v14, v40, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v15, v53, v15, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(3)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[16:17], v[44:45]
+; GFX940-NEXT:    v_cndmask_b32_e64 v14, v52, v14, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v15, v15, v41, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v45, v17, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[16:17], v[44:45]
+; GFX940-NEXT:    v_cndmask_b32_e64 v15, v53, v15, s[2:3]
+; GFX940-NEXT:    v_accvgpr_read_b32 v59, a11 ; Reload Reuse
+; GFX940-NEXT:    v_cndmask_b32_e64 v53, v56, v52, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v44, v16, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v52, 0, v52, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[16:17], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[44:45], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[52:53]
+; GFX940-NEXT:    v_cndmask_b32_e32 v16, v52, v16, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, v16, v44, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v17, v53, v17, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[18:19], v[46:47]
+; GFX940-NEXT:    v_cndmask_b32_e64 v16, v52, v16, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v17, v45, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v47, v19, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[18:19], v[46:47]
+; GFX940-NEXT:    v_cndmask_b32_e64 v17, v53, v17, s[2:3]
+; GFX940-NEXT:    v_accvgpr_read_b32 v58, a10 ; Reload Reuse
+; GFX940-NEXT:    v_cndmask_b32_e64 v53, v56, v52, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v46, v18, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v52, 0, v52, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[18:19], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[46:47], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[52:53]
+; GFX940-NEXT:    v_cndmask_b32_e32 v18, v52, v18, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v18, v18, v46, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v19, v53, v19, vcc
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[20:21], v[50:51]
+; GFX940-NEXT:    v_cndmask_b32_e64 v18, v52, v18, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v19, v19, v47, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v51, v21, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[20:21], v[50:51]
+; GFX940-NEXT:    v_cndmask_b32_e64 v19, v53, v19, s[2:3]
+; GFX940-NEXT:    v_accvgpr_read_b32 v57, a9 ; Reload Reuse
+; GFX940-NEXT:    v_cndmask_b32_e64 v53, v56, v52, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v52, v50, v20, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v52, 0, v52, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[20:21], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[50:51], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[52:53]
+; GFX940-NEXT:    v_cndmask_b32_e32 v20, v52, v20, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v21, v53, v21, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[22:23], v[48:49]
+; GFX940-NEXT:    v_cndmask_b32_e64 v20, v20, v50, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v21, v21, v51, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v50, v49, v23, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[22:23], v[48:49]
+; GFX940-NEXT:    v_cndmask_b32_e64 v20, v52, v20, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v21, v53, v21, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v51, v56, v50, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v50, v48, v22, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v50, 0, v50, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[22:23], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[48:49], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[50:51]
+; GFX940-NEXT:    v_cndmask_b32_e32 v22, v50, v22, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v23, v51, v23, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[24:25], v[38:39]
+; GFX940-NEXT:    v_cndmask_b32_e64 v22, v22, v48, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v23, v23, v49, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v48, v39, v25, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[24:25], v[38:39]
+; GFX940-NEXT:    v_cndmask_b32_e64 v22, v50, v22, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v23, v51, v23, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v49, v56, v48, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v48, v38, v24, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v48, 0, v48, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[24:25], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[38:39], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[48:49]
+; GFX940-NEXT:    v_cndmask_b32_e32 v24, v48, v24, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v25, v49, v25, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[26:27], v[36:37]
+; GFX940-NEXT:    v_cndmask_b32_e64 v24, v24, v38, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v25, v25, v39, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v38, v37, v27, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[26:27], v[36:37]
+; GFX940-NEXT:    v_cndmask_b32_e64 v24, v48, v24, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v25, v49, v25, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v39, v56, v38, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v38, v36, v26, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v38, 0, v38, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[26:27], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[36:37], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[38:39]
+; GFX940-NEXT:    v_cndmask_b32_e32 v26, v38, v26, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v27, v39, v27, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[28:29], v[34:35]
+; GFX940-NEXT:    v_cndmask_b32_e64 v26, v26, v36, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v27, v27, v37, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v36, v35, v29, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[28:29], v[34:35]
+; GFX940-NEXT:    v_cndmask_b32_e64 v26, v38, v26, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v27, v39, v27, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v37, v56, v36, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v36, v34, v28, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v36, 0, v36, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[28:29], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[34:35], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[36:37]
+; GFX940-NEXT:    v_cndmask_b32_e32 v28, v36, v28, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v29, v37, v29, vcc
+; GFX940-NEXT:    v_cmp_lt_f64_e32 vcc, v[30:31], v[32:33]
+; GFX940-NEXT:    v_cndmask_b32_e64 v28, v28, v34, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v29, v29, v35, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v34, v33, v31, vcc
+; GFX940-NEXT:    v_cmp_o_f64_e64 s[0:1], v[30:31], v[32:33]
+; GFX940-NEXT:    v_cndmask_b32_e64 v28, v36, v28, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v29, v37, v29, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v35, v56, v34, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e32 v34, v32, v30, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v34, 0, v34, s[0:1]
+; GFX940-NEXT:    v_cmp_class_f64_e64 vcc, v[30:31], 32
+; GFX940-NEXT:    v_cmp_class_f64_e64 s[0:1], v[32:33], 32
+; GFX940-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0, v[34:35]
+; GFX940-NEXT:    v_cndmask_b32_e32 v30, v34, v30, vcc
+; GFX940-NEXT:    v_cndmask_b32_e32 v31, v35, v31, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v30, v30, v32, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v31, v31, v33, s[0:1]
+; GFX940-NEXT:    v_cndmask_b32_e64 v30, v34, v30, s[2:3]
+; GFX940-NEXT:    v_cndmask_b32_e64 v31, v35, v31, s[2:3]
+; GFX940-NEXT:    v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v47, a7 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v46, a6 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v45, a5 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v41, a1 ; Reload Reuse
+; GFX940-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimum_v16f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x20
+; GFX10-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:16
+; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:32
+; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:28
+; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:36
+; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:40
+; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:56
+; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:48
+; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:44
+; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:64
+; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:60
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GFX10-NEXT:    buffer_load_dword v83, off, s[0:3], s32 offset:80
+; GFX10-NEXT:    buffer_load_dword v82, off, s[0:3], s32 offset:76
+; GFX10-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:88
+; GFX10-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:96
+; GFX10-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:92
+; GFX10-NEXT:    buffer_load_dword v70, off, s[0:3], s32 offset:100
+; GFX10-NEXT:    buffer_load_dword v71, off, s[0:3], s32 offset:104
+; GFX10-NEXT:    buffer_load_dword v81, off, s[0:3], s32 offset:112
+; GFX10-NEXT:    buffer_load_dword v80, off, s[0:3], s32 offset:108
+; GFX10-NEXT:    buffer_load_dword v85, off, s[0:3], s32 offset:120
+; GFX10-NEXT:    buffer_load_dword v84, off, s[0:3], s32 offset:116
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    buffer_load_dword v87, off, s[0:3], s32 offset:128
+; GFX10-NEXT:    buffer_load_dword v86, off, s[0:3], s32 offset:124
+; GFX10-NEXT:    v_cmp_class_f64_e64 s10, v[0:1], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s12, v[2:3], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s17, v[10:11], 32
+; GFX10-NEXT:    s_waitcnt vmcnt(31)
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[64:65]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s4, v[0:1], v[64:65]
+; GFX10-NEXT:    s_waitcnt vmcnt(29)
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s5, v[2:3], v[54:55]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[2:3], v[54:55]
+; GFX10-NEXT:    s_waitcnt vmcnt(27)
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s7, v[4:5], v[52:53]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s8, v[4:5], v[52:53]
+; GFX10-NEXT:    s_waitcnt vmcnt(25)
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s9, v[6:7], v[50:51]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s11, v[6:7], v[50:51]
+; GFX10-NEXT:    s_waitcnt vmcnt(23)
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s13, v[8:9], v[48:49]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s14, v[64:65], 32
+; GFX10-NEXT:    s_waitcnt vmcnt(21)
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s15, v[12:13], v[36:37]
+; GFX10-NEXT:    s_waitcnt vmcnt(17)
+; GFX10-NEXT:    v_cmp_o_f64_e64 s16, v[14:15], v[34:35]
+; GFX10-NEXT:    v_cndmask_b32_e32 v96, v64, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v97, v54, v2, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v99, v55, v3, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v100, v52, v4, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v96, 0, v96, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v101, v50, v6, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v98, 0, v97, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v97, v65, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[54:55], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v96, v0, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v99, 0x7ff80000, v99, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v98, v2, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v97, 0x7ff80000, v97, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v100, 0, v100, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v102, 0, v101, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v99, v3, s12
+; GFX10-NEXT:    v_cmp_class_f64_e64 s12, v[6:7], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v97, v1, s10
+; GFX10-NEXT:    v_cmp_class_f64_e64 s10, v[4:5], 32
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s4, v[10:11], v[38:39]
+; GFX10-NEXT:    v_cndmask_b32_e64 v112, v48, v8, s13
+; GFX10-NEXT:    v_cmp_o_f64_e64 s5, v[12:13], v[36:37]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s6, v[14:15], v[34:35]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v64, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v65, s14
+; GFX10-NEXT:    v_cmp_class_f64_e64 s14, v[52:53], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v113, v36, v12, s15
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_o_f64_e64 s18, v[30:31], v[86:87]
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v54, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v54, v53, v5, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v55, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[50:51], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, v51, v7, s9
+; GFX10-NEXT:    v_cmp_o_f64_e64 s9, v[8:9], v[48:49]
+; GFX10-NEXT:    v_cndmask_b32_e64 v101, 0x7ff80000, v54, s8
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s7, v[16:17], v[32:33]
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v102, v6, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v103, 0x7ff80000, v55, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v100, v4, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v101, v5, s10
+; GFX10-NEXT:    v_cmp_class_f64_e64 s10, v[8:9], 32
+; GFX10-NEXT:    v_cmp_o_f64_e64 s11, v[10:11], v[38:39]
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v103, v7, s12
+; GFX10-NEXT:    v_cmp_class_f64_e64 s12, v[48:49], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v114, v38, v10, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v115, v34, v14, s6
+; GFX10-NEXT:    v_cmp_o_f64_e64 s8, v[16:17], v[32:33]
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v52, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v53, s14
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s14, v[18:19], v[82:83]
+; GFX10-NEXT:    v_cndmask_b32_e64 v52, 0, v115, s16
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v50, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, v49, v9, s13
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v51, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[38:39], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v54, 0, v112, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, v39, v11, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, 0x7ff80000, v50, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, 0, v113, s5
+; GFX10-NEXT:    v_cmp_o_f64_e64 s4, v[18:19], v[82:83]
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v54, v8, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v64, 0, v114, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v55, v9, s10
+; GFX10-NEXT:    v_cmp_class_f64_e64 s10, v[12:13], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v65, 0x7ff80000, v51, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v48, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v64, v10, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v49, s12
+; GFX10-NEXT:    v_cmp_class_f64_e64 s12, v[14:15], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v65, v11, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, v37, v13, s15
+; GFX10-NEXT:    v_cmp_class_f64_e64 s17, v[34:35], 32
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s9, v[20:21], v[66:67]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s11, v[20:21], v[66:67]
+; GFX10-NEXT:    v_cndmask_b32_e64 v116, v32, v16, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, 0x7ff80000, v48, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v38, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v39, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[22:23], v[68:69]
+; GFX10-NEXT:    v_cndmask_b32_e64 v38, v35, v15, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, v82, v18, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, 0, v116, s8
+; GFX10-NEXT:    v_cmp_class_f64_e64 s13, v[36:37], 32
+; GFX10-NEXT:    v_cmp_o_f64_e64 s5, v[22:23], v[68:69]
+; GFX10-NEXT:    v_cndmask_b32_e64 v53, 0x7ff80000, v38, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v50, v12, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v51, v13, s10
+; GFX10-NEXT:    v_cmp_class_f64_e64 s10, v[16:17], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v38, 0, v49, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v112, v83, v19, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v52, v14, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v53, v15, s12
+; GFX10-NEXT:    v_cmp_class_f64_e64 s12, v[32:33], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s14, v[18:19], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v114, v67, v21, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v14, v34, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v15, v35, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, v33, v17, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v39, 0x7ff80000, v112, s4
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s4, v[24:25], v[70:71]
+; GFX10-NEXT:    v_cndmask_b32_e32 v113, v69, v23, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v35, v68, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[20:21], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, 0x7ff80000, v34, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, 0x7ff80000, v114, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v36, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v13, v37, s13
+; GFX10-NEXT:    v_cmp_class_f64_e64 s13, v[82:83], 32
+; GFX10-NEXT:    v_cmp_o_f64_e64 s6, v[24:25], v[70:71]
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v48, v16, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v49, v17, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v36, 0x7ff80000, v113, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v35, 0, v35, s5
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s7, v[26:27], v[80:81]
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v32, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v32, v66, v20, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v17, v33, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v38, v18, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v39, v19, s14
+; GFX10-NEXT:    v_cmp_o_f64_e64 s15, v[26:27], v[80:81]
+; GFX10-NEXT:    v_cndmask_b32_e64 v33, 0, v32, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v112, v71, v25, s4
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s16, v[28:29], v[84:85]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s8, v[28:29], v[84:85]
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v34, v21, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v33, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[22:23], 32
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s17, v[30:31], v[86:87]
+; GFX10-NEXT:    v_cmp_class_f64_e64 s5, v[70:71], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v18, v82, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v82, v70, v24, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v19, v83, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v83, 0x7ff80000, v112, s6
+; GFX10-NEXT:    v_cmp_class_f64_e64 s4, v[68:69], 32
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s9, 0, v[96:97]
+; GFX10-NEXT:    v_cndmask_b32_e64 v82, 0, v82, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v37, v81, v27, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v32, v80, v26, s7
+; GFX10-NEXT:    v_cmp_class_f64_e64 s6, v[80:81], 32
+; GFX10-NEXT:    v_cmp_class_f64_e64 s7, v[84:85], 32
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s10, 0, v[98:99]
+; GFX10-NEXT:    v_cndmask_b32_e64 v113, 0x7ff80000, v37, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v112, 0, v32, s15
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s11, 0, v[100:101]
+; GFX10-NEXT:    v_cndmask_b32_e64 v115, v85, v29, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v114, v84, v28, s16
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s12, 0, v[102:103]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s13, 0, v[54:55]
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v35, v22, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v36, v23, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[24:25], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v115, 0x7ff80000, v115, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v114, 0, v114, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v116, v87, v31, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v32, v86, v30, s17
+; GFX10-NEXT:    v_cmp_class_f64_e64 s8, v[86:87], 32
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s14, 0, v[64:65]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s15, 0, v[50:51]
+; GFX10-NEXT:    v_cndmask_b32_e64 v117, 0x7ff80000, v116, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v116, 0, v32, s18
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s16, 0, v[52:53]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s17, 0, v[48:49]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s18, 0, v[38:39]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s19, 0, v[33:34]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s20, 0, v[35:36]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s21, 0, v[82:83]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s22, 0, v[112:113]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s23, 0, v[114:115]
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s24, 0, v[116:117]
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v22, v68, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v23, v69, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v96, v0, s9
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v82, v24, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v25, v83, v25, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[26:27], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v98, v2, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v100, v4, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, v24, v70, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, v25, v71, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v102, v6, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v54, v8, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v64, v10, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v50, v12, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v52, v14, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v48, v16, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v38, v18, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v35, v22, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, v82, v24, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v97, v1, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v99, v3, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v101, v5, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v103, v7, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v55, v9, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v65, v11, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v51, v13, s15
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v53, v15, s16
+; GFX10-NEXT:    v_cndmask_b32_e32 v26, v112, v26, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v27, v113, v27, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[28:29], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v49, v17, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v39, v19, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, v26, v80, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, v27, v81, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v36, v23, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, v83, v25, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, v112, v26, s22
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, v113, v27, s22
+; GFX10-NEXT:    v_cndmask_b32_e32 v28, v114, v28, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v29, v115, v29, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[30:31], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v28, v28, v84, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, v29, v85, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v28, v114, v28, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, v115, v29, s23
+; GFX10-NEXT:    v_cndmask_b32_e32 v30, v116, v30, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v31, v117, v31, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[66:67], 32
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, v30, v86, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v31, v31, v87, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, v116, v30, s24
+; GFX10-NEXT:    v_cndmask_b32_e64 v31, v117, v31, s24
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v66, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v67, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v33, v20, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v34, v21, s19
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimum_v16f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_b32 v87, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v86, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v85, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v84, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_b32 v65, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_b32 v64, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_b32 v67, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_b32 v66, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_b32 v69, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_b32 v68, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_b32 v71, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_b32 v70, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_b32 v81, off, s32 offset:120
+; GFX11-NEXT:    scratch_load_b32 v80, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v83, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_b32 v82, off, s32 offset:124
+; GFX11-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s9, v[0:1], v[86:87]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s11, v[0:1], v[86:87]
+; GFX11-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s10, v[2:3], v[84:85]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s14, v[86:87], 32
+; GFX11-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s0, v[4:5], v[32:33]
+; GFX11-NEXT:    v_cmp_o_f64_e32 vcc_lo, v[4:5], v[32:33]
+; GFX11-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s2, v[6:7], v[34:35]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s12, v[2:3], v[84:85]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s1, v[6:7], v[34:35]
+; GFX11-NEXT:    s_waitcnt vmcnt(23)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s4, v[8:9], v[36:37]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s3, v[8:9], v[36:37]
+; GFX11-NEXT:    v_cmp_class_f64_e64 s16, v[84:85], 32
+; GFX11-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s6, v[10:11], v[38:39]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s5, v[10:11], v[38:39]
+; GFX11-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s8, v[12:13], v[48:49]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s7, v[12:13], v[48:49]
+; GFX11-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s13, v[14:15], v[50:51]
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    v_cmp_o_f64_e64 s15, v[16:17], v[52:53]
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s17, v[18:19], v[54:55]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s18, v[18:19], v[54:55]
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s19, v[20:21], v[64:65]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s20, v[20:21], v[64:65]
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s21, v[22:23], v[66:67]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s22, v[22:23], v[66:67]
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s23, v[24:25], v[68:69]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s24, v[24:25], v[68:69]
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s25, v[26:27], v[70:71]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s26, v[26:27], v[70:71]
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s27, v[28:29], v[80:81]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s28, v[28:29], v[80:81]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s29, v[30:31], v[82:83]
+; GFX11-NEXT:    v_cmp_o_f64_e64 vcc_hi, v[30:31], v[82:83]
+; GFX11-NEXT:    v_cndmask_b32_e64 v96, v87, v1, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v101, v86, v0, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v98, v85, v3, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v103, v84, v2, s10
+; GFX11-NEXT:    v_cmp_class_f64_e64 s10, v[0:1], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v97, 0x7ff80000, v96, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v96, 0, v101, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v100, v33, v5, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v102, v35, v7, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v99, 0x7ff80000, v98, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v98, 0, v103, s12
+; GFX11-NEXT:    v_cmp_class_f64_e64 s11, v[2:3], 32
+; GFX11-NEXT:    v_cndmask_b32_e32 v101, 0x7ff80000, v100, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v103, 0x7ff80000, v102, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v112, v37, v9, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v114, v39, v11, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v116, v49, v13, s8
+; GFX11-NEXT:    v_cmp_o_f64_e64 s9, v[14:15], v[50:51]
+; GFX11-NEXT:    v_cndmask_b32_e64 v118, v51, v15, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v113, 0x7ff80000, v112, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v115, 0x7ff80000, v114, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v117, 0x7ff80000, v116, s7
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s12, v[16:17], v[52:53]
+; GFX11-NEXT:    v_cndmask_b32_e64 v130, v55, v19, s17
+; GFX11-NEXT:    v_cndmask_b32_e64 v132, v65, v21, s19
+; GFX11-NEXT:    v_cndmask_b32_e64 v134, v67, v23, s21
+; GFX11-NEXT:    v_cndmask_b32_e64 v144, v69, v25, s23
+; GFX11-NEXT:    v_cndmask_b32_e64 v145, v71, v27, s25
+; GFX11-NEXT:    v_cndmask_b32_e64 v131, 0x7ff80000, v130, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v133, 0x7ff80000, v132, s20
+; GFX11-NEXT:    v_cndmask_b32_e64 v135, 0x7ff80000, v134, s22
+; GFX11-NEXT:    v_cndmask_b32_e64 v146, v81, v29, s27
+; GFX11-NEXT:    v_cndmask_b32_e64 v148, v80, v28, s27
+; GFX11-NEXT:    v_cndmask_b32_e64 v147, v83, v31, s29
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v147, 0x7ff80000, v147, vcc_hi
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v96, v0, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v97, v1, s10
+; GFX11-NEXT:    v_cmp_class_f64_e64 s10, v[36:37], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v86, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v86, v32, v4, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v87, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v87, v34, v6, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v98, v2, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v99, v3, s11
+; GFX11-NEXT:    v_cndmask_b32_e32 v100, 0, v86, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[4:5], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v102, 0, v87, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v84, s16
+; GFX11-NEXT:    v_cndmask_b32_e64 v84, v36, v8, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v86, v38, v10, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v87, v48, v12, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v119, 0x7ff80000, v118, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v128, v53, v17, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v112, 0, v84, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v114, 0, v86, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v116, 0, v87, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v84, v50, v14, s13
+; GFX11-NEXT:    v_cndmask_b32_e64 v129, 0x7ff80000, v128, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v86, v52, v16, s12
+; GFX11-NEXT:    v_cndmask_b32_e64 v87, v54, v18, s17
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v85, s16
+; GFX11-NEXT:    v_cndmask_b32_e64 v118, 0, v84, s9
+; GFX11-NEXT:    v_cndmask_b32_e64 v84, v64, v20, s19
+; GFX11-NEXT:    v_cndmask_b32_e64 v128, 0, v86, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v130, 0, v87, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v86, v66, v22, s21
+; GFX11-NEXT:    v_cndmask_b32_e64 v85, 0x7ff80000, v144, s24
+; GFX11-NEXT:    v_cndmask_b32_e64 v132, 0, v84, s20
+; GFX11-NEXT:    v_cndmask_b32_e64 v87, v68, v24, s23
+; GFX11-NEXT:    v_cndmask_b32_e64 v144, v70, v26, s25
+; GFX11-NEXT:    v_cndmask_b32_e64 v134, 0, v86, s22
+; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[68:69], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s1, v[70:71], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v84, 0, v87, s24
+; GFX11-NEXT:    v_cndmask_b32_e64 v87, 0x7ff80000, v145, s26
+; GFX11-NEXT:    v_cndmask_b32_e64 v86, 0, v144, s26
+; GFX11-NEXT:    v_cndmask_b32_e64 v145, 0x7ff80000, v146, s28
+; GFX11-NEXT:    v_cndmask_b32_e64 v144, 0, v148, s28
+; GFX11-NEXT:    v_cndmask_b32_e64 v146, v82, v30, s29
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[80:81], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[82:83], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s6, v[32:33], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s8, v[34:35], 32
+; GFX11-NEXT:    v_dual_cndmask_b32 v5, v101, v5 :: v_dual_cndmask_b32 v4, v100, v4
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[6:7], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v146, 0, v146, vcc_hi
+; GFX11-NEXT:    v_cmp_class_f64_e64 s12, v[38:39], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s14, v[48:49], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s16, v[50:51], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s18, v[52:53], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s20, v[54:55], 32
+; GFX11-NEXT:    v_cmp_class_f64_e64 s21, v[64:65], 32
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s4, 0, v[96:97]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s5, 0, v[98:99]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s7, 0, v[100:101]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s9, 0, v[102:103]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s11, 0, v[112:113]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s13, 0, v[114:115]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s15, 0, v[116:117]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s17, 0, v[118:119]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s19, 0, v[128:129]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s22, 0, v[130:131]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s23, 0, v[132:133]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s24, 0, v[134:135]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s25, 0, v[84:85]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s26, 0, v[86:87]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s27, 0, v[144:145]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s28, 0, v[146:147]
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v33, s6
+; GFX11-NEXT:    v_dual_cndmask_b32 v7, v103, v7 :: v_dual_cndmask_b32 v6, v102, v6
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[8:9], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v32, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, v35, s8
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v96, v0, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v98, v2, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v100, v4, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v97, v1, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v99, v3, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v101, v5, s7
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v103, v7, s9
+; GFX11-NEXT:    v_dual_cndmask_b32 v9, v113, v9 :: v_dual_cndmask_b32 v8, v112, v8
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[10:11], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, v34, s8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v9, v37, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v102, v6, s9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v113, v9, s11
+; GFX11-NEXT:    v_dual_cndmask_b32 v11, v115, v11 :: v_dual_cndmask_b32 v10, v114, v10
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[12:13], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v36, s10
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v11, v39, s12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v112, v8, s11
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v115, v11, s13
+; GFX11-NEXT:    v_dual_cndmask_b32 v13, v117, v13 :: v_dual_cndmask_b32 v12, v116, v12
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[14:15], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v10, v38, s12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v13, v49, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v114, v10, s13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v117, v13, s15
+; GFX11-NEXT:    v_dual_cndmask_b32 v15, v119, v15 :: v_dual_cndmask_b32 v14, v118, v14
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[16:17], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v12, v48, s14
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, v15, v51, s16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v116, v12, s15
+; GFX11-NEXT:    v_cndmask_b32_e64 v15, v119, v15, s17
+; GFX11-NEXT:    v_dual_cndmask_b32 v17, v129, v17 :: v_dual_cndmask_b32 v16, v128, v16
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[18:19], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v14, v50, s16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v17, v17, v53, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v14, v118, v14, s17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v17, v129, v17, s19
+; GFX11-NEXT:    v_dual_cndmask_b32 v19, v131, v19 :: v_dual_cndmask_b32 v18, v130, v18
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[20:21], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v16, v16, v52, s18
+; GFX11-NEXT:    v_cndmask_b32_e64 v19, v19, v55, s20
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v16, v128, v16, s19
+; GFX11-NEXT:    v_cndmask_b32_e64 v19, v131, v19, s22
+; GFX11-NEXT:    v_dual_cndmask_b32 v21, v133, v21 :: v_dual_cndmask_b32 v20, v132, v20
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[22:23], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, v18, v54, s20
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v21, v21, v65, s21
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, v130, v18, s22
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v21, v133, v21, s23
+; GFX11-NEXT:    v_dual_cndmask_b32 v23, v135, v23 :: v_dual_cndmask_b32 v22, v134, v22
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[24:25], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v20, v20, v64, s21
+; GFX11-NEXT:    v_cndmask_b32_e64 v20, v132, v20, s23
+; GFX11-NEXT:    v_dual_cndmask_b32 v25, v85, v25 :: v_dual_cndmask_b32 v24, v84, v24
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[26:27], 32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v25, v25, v69, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v25, v85, v25, s25
+; GFX11-NEXT:    v_dual_cndmask_b32 v27, v87, v27 :: v_dual_cndmask_b32 v26, v86, v26
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[28:29], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v24, v24, v68, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v27, v27, v71, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v24, v84, v24, s25
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v27, v87, v27, s26
+; GFX11-NEXT:    v_dual_cndmask_b32 v29, v145, v29 :: v_dual_cndmask_b32 v28, v144, v28
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[30:31], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v26, v26, v70, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v29, v29, v81, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v26, v86, v26, s26
+; GFX11-NEXT:    v_cndmask_b32_e64 v29, v145, v29, s27
+; GFX11-NEXT:    v_dual_cndmask_b32 v31, v147, v31 :: v_dual_cndmask_b32 v30, v146, v30
+; GFX11-NEXT:    v_cmp_class_f64_e64 vcc_lo, v[66:67], 32
+; GFX11-NEXT:    v_cndmask_b32_e64 v28, v28, v80, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v31, v31, v83, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v28, v144, v28, s27
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v31, v147, v31, s28
+; GFX11-NEXT:    v_dual_cndmask_b32 v23, v23, v67 :: v_dual_cndmask_b32 v22, v22, v66
+; GFX11-NEXT:    v_cndmask_b32_e64 v30, v30, v82, s3
+; GFX11-NEXT:    v_cndmask_b32_e64 v23, v135, v23, s24
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v22, v134, v22, s24
+; GFX11-NEXT:    v_cndmask_b32_e64 v30, v146, v30, s28
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimum_v16f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_clause 0x1b
+; GFX12-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX12-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX12-NEXT:    scratch_load_b32 v35, off, s32 offset:16
+; GFX12-NEXT:    scratch_load_b32 v34, off, s32 offset:12
+; GFX12-NEXT:    scratch_load_b32 v31, off, s32
+; GFX12-NEXT:    scratch_load_b32 v37, off, s32 offset:120
+; GFX12-NEXT:    scratch_load_b32 v39, off, s32 offset:104
+; GFX12-NEXT:    scratch_load_b32 v49, off, s32 offset:24
+; GFX12-NEXT:    scratch_load_b32 v48, off, s32 offset:20
+; GFX12-NEXT:    scratch_load_b32 v51, off, s32 offset:32
+; GFX12-NEXT:    scratch_load_b32 v50, off, s32 offset:28
+; GFX12-NEXT:    scratch_load_b32 v53, off, s32 offset:40
+; GFX12-NEXT:    scratch_load_b32 v52, off, s32 offset:36
+; GFX12-NEXT:    scratch_load_b32 v55, off, s32 offset:48
+; GFX12-NEXT:    scratch_load_b32 v54, off, s32 offset:44
+; GFX12-NEXT:    scratch_load_b32 v65, off, s32 offset:56
+; GFX12-NEXT:    scratch_load_b32 v64, off, s32 offset:52
+; GFX12-NEXT:    scratch_load_b32 v67, off, s32 offset:64
+; GFX12-NEXT:    scratch_load_b32 v66, off, s32 offset:60
+; GFX12-NEXT:    scratch_load_b32 v69, off, s32 offset:72
+; GFX12-NEXT:    scratch_load_b32 v68, off, s32 offset:68
+; GFX12-NEXT:    scratch_load_b32 v71, off, s32 offset:80
+; GFX12-NEXT:    scratch_load_b32 v70, off, s32 offset:76
+; GFX12-NEXT:    scratch_load_b32 v81, off, s32 offset:88
+; GFX12-NEXT:    scratch_load_b32 v80, off, s32 offset:84
+; GFX12-NEXT:    scratch_load_b32 v83, off, s32 offset:96
+; GFX12-NEXT:    scratch_load_b32 v82, off, s32 offset:92
+; GFX12-NEXT:    scratch_load_b32 v38, off, s32 offset:100
+; GFX12-NEXT:    s_wait_loadcnt 0x1a
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[32:33]
+; GFX12-NEXT:    s_clause 0x2
+; GFX12-NEXT:    scratch_load_b32 v33, off, s32 offset:112
+; GFX12-NEXT:    scratch_load_b32 v32, off, s32 offset:108
+; GFX12-NEXT:    scratch_load_b32 v36, off, s32 offset:116
+; GFX12-NEXT:    s_wait_loadcnt 0x1b
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[2:3], v[34:35]
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    scratch_load_b32 v35, off, s32 offset:128
+; GFX12-NEXT:    scratch_load_b32 v34, off, s32 offset:124
+; GFX12-NEXT:    s_wait_loadcnt 0x18
+; GFX12-NEXT:    v_minimum_f64 v[4:5], v[4:5], v[48:49]
+; GFX12-NEXT:    s_wait_loadcnt 0x16
+; GFX12-NEXT:    v_minimum_f64 v[6:7], v[6:7], v[50:51]
+; GFX12-NEXT:    s_wait_loadcnt 0x14
+; GFX12-NEXT:    v_minimum_f64 v[8:9], v[8:9], v[52:53]
+; GFX12-NEXT:    s_wait_loadcnt 0x12
+; GFX12-NEXT:    v_minimum_f64 v[10:11], v[10:11], v[54:55]
+; GFX12-NEXT:    s_wait_loadcnt 0x10
+; GFX12-NEXT:    v_minimum_f64 v[12:13], v[12:13], v[64:65]
+; GFX12-NEXT:    s_wait_loadcnt 0xe
+; GFX12-NEXT:    v_minimum_f64 v[14:15], v[14:15], v[66:67]
+; GFX12-NEXT:    s_wait_loadcnt 0xc
+; GFX12-NEXT:    v_minimum_f64 v[16:17], v[16:17], v[68:69]
+; GFX12-NEXT:    s_wait_loadcnt 0xa
+; GFX12-NEXT:    v_minimum_f64 v[18:19], v[18:19], v[70:71]
+; GFX12-NEXT:    s_wait_loadcnt 0x8
+; GFX12-NEXT:    v_minimum_f64 v[20:21], v[20:21], v[80:81]
+; GFX12-NEXT:    s_wait_loadcnt 0x6
+; GFX12-NEXT:    v_minimum_f64 v[22:23], v[22:23], v[82:83]
+; GFX12-NEXT:    s_wait_loadcnt 0x5
+; GFX12-NEXT:    v_minimum_f64 v[24:25], v[24:25], v[38:39]
+; GFX12-NEXT:    s_wait_loadcnt 0x3
+; GFX12-NEXT:    v_minimum_f64 v[26:27], v[26:27], v[32:33]
+; GFX12-NEXT:    s_wait_loadcnt 0x2
+; GFX12-NEXT:    v_minimum_f64 v[28:29], v[28:29], v[36:37]
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[30:31], v[30:31], v[34:35]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <16 x double> @llvm.minimum.v16f64(<16 x double> %src0, <16 x double> %src1)
+  ret <16 x double> %op
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
new file mode 100644
index 000000000000..6a9c4c8d41c2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
@@ -0,0 +1,1792 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX678,GFX6 %s
+; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX678,GFX7 %s
+; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX678,GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s
+
+declare void @llvm.set.rounding(i32)
+declare i32 @llvm.get.rounding()
+
+define amdgpu_gfx void @s_set_rounding(i32 inreg %rounding) {
+; GFX678-LABEL: s_set_rounding:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_add_i32 s34, s4, -4
+; GFX678-NEXT:    s_min_u32 s34, s4, s34
+; GFX678-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX678-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX678-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX678-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_add_i32 s34, s4, -4
+; GFX9-NEXT:    s_min_u32 s34, s4, s34
+; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_add_i32 s34, s4, -4
+; GFX10-NEXT:    s_min_u32 s34, s4, s34
+; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s0, s4, -4
+; GFX11-NEXT:    s_min_u32 s0, s4, s0
+; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) {
+; GFX6-LABEL: s_set_rounding_kernel:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX6-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX6-NEXT:    ;;#ASMSTART
+; GFX6-NEXT:    ;;#ASMEND
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s3, s2, -4
+; GFX6-NEXT:    s_min_u32 s2, s2, s3
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: s_set_rounding_kernel:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX7-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s3, s2, -4
+; GFX7-NEXT:    s_min_u32 s2, s2, s3
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX7-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: s_set_rounding_kernel:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX8-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_add_i32 s3, s2, -4
+; GFX8-NEXT:    s_min_u32 s2, s2, s3
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: s_set_rounding_kernel:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX9-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_add_i32 s3, s2, -4
+; GFX9-NEXT:    s_min_u32 s2, s2, s3
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: s_set_rounding_kernel:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_add_i32 s1, s0, -4
+; GFX10-NEXT:    s_min_u32 s2, s0, s1
+; GFX10-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX10-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: s_set_rounding_kernel:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s1, s0, -4
+; GFX11-NEXT:    s_min_u32 s2, s0, s1
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_endpgm
+  call void @llvm.set.rounding(i32 %rounding)
+  call void asm sideeffect "",""()
+  ret void
+}
+
+define void @v_set_rounding(i32 %rounding) {
+; GFX6-LABEL: v_set_rounding:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, -4, v0
+; GFX6-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX6-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[4:5], v0
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_set_rounding:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, -4, v0
+; GFX7-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX7-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX7-NEXT:    v_lshr_b64 v[0:1], s[4:5], v0
+; GFX7-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_set_rounding:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -4, v0
+; GFX8-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX8-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, s[4:5]
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_set_rounding:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -4, v0
+; GFX9-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_set_rounding:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, -4, v0
+; GFX10-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX10-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v0, s[4:5]
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_set_rounding:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, -4, v0
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+define void @set_rounding_get_rounding() {
+; GFX678-LABEL: set_rounding_get_rounding:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
+; GFX678-NEXT:    s_lshl_b32 s6, s4, 2
+; GFX678-NEXT:    s_mov_b32 s4, 0xeb24da71
+; GFX678-NEXT:    s_mov_b32 s5, 0xc96f385
+; GFX678-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX678-NEXT:    s_and_b32 s4, s4, 15
+; GFX678-NEXT:    s_add_i32 s5, s4, 4
+; GFX678-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX678-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX678-NEXT:    s_add_i32 s5, s4, -4
+; GFX678-NEXT:    s_min_u32 s4, s4, s5
+; GFX678-NEXT:    s_lshl_b32 s6, s4, 2
+; GFX678-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX678-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX678-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: set_rounding_get_rounding:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
+; GFX9-NEXT:    s_lshl_b32 s6, s4, 2
+; GFX9-NEXT:    s_mov_b32 s4, 0xeb24da71
+; GFX9-NEXT:    s_mov_b32 s5, 0xc96f385
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX9-NEXT:    s_and_b32 s4, s4, 15
+; GFX9-NEXT:    s_add_i32 s5, s4, 4
+; GFX9-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX9-NEXT:    s_add_i32 s5, s4, -4
+; GFX9-NEXT:    s_min_u32 s4, s4, s5
+; GFX9-NEXT:    s_lshl_b32 s6, s4, 2
+; GFX9-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX9-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: set_rounding_get_rounding:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_getreg_b32 s6, hwreg(HW_REG_MODE, 0, 4)
+; GFX10-NEXT:    s_mov_b32 s4, 0xeb24da71
+; GFX10-NEXT:    s_mov_b32 s5, 0xc96f385
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 2
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX10-NEXT:    s_and_b32 s4, s4, 15
+; GFX10-NEXT:    s_add_i32 s5, s4, 4
+; GFX10-NEXT:    s_cmp_lt_u32 s4, 4
+; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
+; GFX10-NEXT:    s_add_i32 s5, s4, -4
+; GFX10-NEXT:    s_min_u32 s4, s4, s5
+; GFX10-NEXT:    s_lshl_b32 s6, s4, 2
+; GFX10-NEXT:    s_mov_b32 s4, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s5, 0xb73e62d9
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: set_rounding_get_rounding:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 4)
+; GFX11-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX11-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_and_b32 s0, s0, 15
+; GFX11-NEXT:    s_add_i32 s1, s0, 4
+; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX11-NEXT:    s_add_i32 s1, s0, -4
+; GFX11-NEXT:    s_min_u32 s0, s0, s1
+; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %rounding = call i32 @llvm.get.rounding()
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+define void @s_set_rounding_0() {
+; GFX678-LABEL: s_set_rounding_0:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_0:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xf
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 0)
+  ret void
+}
+
+define void @s_set_rounding_1() {
+; GFX678-LABEL: s_set_rounding_1:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_1:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 1)
+  ret void
+}
+
+define void @s_set_rounding_2() {
+; GFX678-LABEL: s_set_rounding_2:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_2:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x5
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 2)
+  ret void
+}
+
+define void @s_set_rounding_3() {
+; GFX678-LABEL: s_set_rounding_3:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_3:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xa
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 3)
+  ret void
+}
+
+; Unsupported mode.
+define void @s_set_rounding_4() {
+; GFX678-LABEL: s_set_rounding_4:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_4:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xf
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 4)
+  ret void
+}
+
+; undefined
+define void @s_set_rounding_5() {
+; GFX678-LABEL: s_set_rounding_5:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_5:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 5)
+  ret void
+}
+
+; undefined
+define void @s_set_rounding_6() {
+; GFX678-LABEL: s_set_rounding_6:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_6:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_6:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x5
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 6)
+  ret void
+}
+
+; "Dynamic"
+define void @s_set_rounding_7() {
+; GFX678-LABEL: s_set_rounding_7:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_7:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xa
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 7)
+  ret void
+}
+
+; Invalid
+define void @s_set_rounding_neg1() {
+; GFX678-LABEL: s_set_rounding_neg1:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_neg1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_neg1:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xb
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 -1)
+  ret void
+}
+
+; --------------------------------------------------------------------
+; Test extended values
+; --------------------------------------------------------------------
+
+; NearestTiesToEvenF32_TowardPositiveF64 = 8
+define void @s_set_rounding_8() {
+; GFX678-LABEL: s_set_rounding_8:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_8:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x4
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 8)
+  ret void
+}
+
+;  NearestTiesToEvenF32_TowardNegativeF64 = 9
+define void @s_set_rounding_9() {
+; GFX678-LABEL: s_set_rounding_9:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_9:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_9:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x8
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 9)
+  ret void
+}
+
+; NearestTiesToEvenF32_TowardZeroF64 = 10
+define void @s_set_rounding_10() {
+; GFX678-LABEL: s_set_rounding_10:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 12
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_10:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 12
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_10:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xc
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 10)
+  ret void
+}
+
+; TowardPositiveF32_NearestTiesToEvenF64 = 11
+define void @s_set_rounding_11() {
+; GFX678-LABEL: s_set_rounding_11:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_11:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_11:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x1
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 11)
+  ret void
+}
+
+; TowardPositiveF32_TowardNegativeF64 = 12
+define void @s_set_rounding_12() {
+; GFX678-LABEL: s_set_rounding_12:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 9
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_12:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 9
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_12:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x9
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 12)
+  ret void
+}
+
+; TowardPositiveF32_TowardZeroF64 = 13
+define void @s_set_rounding_13() {
+; GFX678-LABEL: s_set_rounding_13:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 13
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_13:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 13
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_13:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xd
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 13)
+  ret void
+}
+
+;   TowardNegativeF32_NearestTiesToEvenF64 = 14
+define void @s_set_rounding_14() {
+; GFX678-LABEL: s_set_rounding_14:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_14:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_14:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x2
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 14)
+  ret void
+}
+
+; TowardNegativeF32_TowardPositiveF64 = 15
+define void @s_set_rounding_15() {
+; GFX678-LABEL: s_set_rounding_15:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 6
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_15:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 6
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_15:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x6
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 15)
+  ret void
+}
+
+
+; TowardNegativeF32_TowardZeroF64 = 16
+define void @s_set_rounding_16() {
+; GFX678-LABEL: s_set_rounding_16:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 14
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 14
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_16:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xe
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 16)
+  ret void
+}
+
+;  TowardZeroF32_NearestTiesToEvenF64 = 17
+define void @s_set_rounding_17() {
+; GFX678-LABEL: s_set_rounding_17:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 3
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_17:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_17:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x3
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 17)
+  ret void
+}
+
+; TowardZeroF32_TowardPositiveF64 = 18
+define void @s_set_rounding_18() {
+; GFX678-LABEL: s_set_rounding_18:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 7
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_18:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_18:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0x7
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 18)
+  ret void
+}
+
+; TowardZeroF32_TowardNegativeF64 = 19,
+define void @s_set_rounding_19() {
+; GFX678-LABEL: s_set_rounding_19:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_19:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_19:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xb
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 19)
+  ret void
+}
+
+; Invalid, out of bounds
+define void @s_set_rounding_20() {
+; GFX678-LABEL: s_set_rounding_20:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_20:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_20:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xb
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 20)
+  ret void
+}
+
+define void @s_set_rounding_0xffff() {
+; GFX678-LABEL: s_set_rounding_0xffff:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_0xffff:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: s_set_rounding_0xffff:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    s_round_mode 0xb
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.set.rounding(i32 65535)
+  ret void
+}
+
+; --------------------------------------------------------------------
+; Test optimization knowing the value can only be in the standard
+; range
+; --------------------------------------------------------------------
+
+define amdgpu_gfx void @s_set_rounding_i2_zeroext(i2 zeroext inreg %rounding) {
+; GFX6-LABEL: s_set_rounding_i2_zeroext:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_lshl_b32 s34, s4, 2
+; GFX6-NEXT:    s_lshr_b32 s34, 0xa50f, s34
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: s_set_rounding_i2_zeroext:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_lshl_b32 s34, s4, 2
+; GFX7-NEXT:    s_lshr_b32 s34, 0xa50f, s34
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_set_rounding_i2_zeroext:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_and_b32 s34, 0xffff, s4
+; GFX8-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX8-NEXT:    s_lshr_b32 s34, 0xa50f, s34
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_i2_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s34, 0xffff, s4
+; GFX9-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX9-NEXT:    s_lshr_b32 s34, 0xa50f, s34
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_i2_zeroext:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_and_b32 s34, 0xffff, s4
+; GFX10-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX10-NEXT:    s_lshr_b32 s34, 0xa50f, s34
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_i2_zeroext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s0, 0xffff, s4
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-NEXT:    s_lshr_b32 s0, 0xa50f, s0
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %zext.rounding = zext i2 %rounding to i32
+  call void @llvm.set.rounding(i32 %zext.rounding)
+  ret void
+}
+
+define amdgpu_gfx void @s_set_rounding_i2_signext(i2 signext inreg %rounding) {
+; GFX6-LABEL: s_set_rounding_i2_signext:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s34, s4, -4
+; GFX6-NEXT:    s_min_u32 s34, s4, s34
+; GFX6-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX6-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX6-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX6-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: s_set_rounding_i2_signext:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s34, s4, -4
+; GFX7-NEXT:    s_min_u32 s34, s4, s34
+; GFX7-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX7-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX7-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX7-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_set_rounding_i2_signext:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_sext_i32_i16 s34, s4
+; GFX8-NEXT:    s_add_i32 s35, s34, -4
+; GFX8-NEXT:    s_min_u32 s34, s34, s35
+; GFX8-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX8-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX8-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX8-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_i2_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_sext_i32_i16 s34, s4
+; GFX9-NEXT:    s_add_i32 s35, s34, -4
+; GFX9-NEXT:    s_min_u32 s34, s34, s35
+; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_i2_signext:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_sext_i32_i16 s34, s4
+; GFX10-NEXT:    s_add_i32 s35, s34, -4
+; GFX10-NEXT:    s_min_u32 s34, s34, s35
+; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_i2_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_sext_i32_i16 s0, s4
+; GFX11-NEXT:    s_add_i32 s1, s0, -4
+; GFX11-NEXT:    s_min_u32 s0, s0, s1
+; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sext.rounding = sext i2 %rounding to i32
+  call void @llvm.set.rounding(i32 %sext.rounding)
+  ret void
+}
+
+define amdgpu_gfx void @s_set_rounding_i3_signext(i3 signext inreg %rounding) {
+; GFX6-LABEL: s_set_rounding_i3_signext:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s34, s4, -4
+; GFX6-NEXT:    s_min_u32 s34, s4, s34
+; GFX6-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX6-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX6-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX6-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: s_set_rounding_i3_signext:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s34, s4, -4
+; GFX7-NEXT:    s_min_u32 s34, s4, s34
+; GFX7-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX7-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX7-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX7-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_set_rounding_i3_signext:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_sext_i32_i16 s34, s4
+; GFX8-NEXT:    s_add_i32 s35, s34, -4
+; GFX8-NEXT:    s_min_u32 s34, s34, s35
+; GFX8-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX8-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX8-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX8-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_i3_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_sext_i32_i16 s34, s4
+; GFX9-NEXT:    s_add_i32 s35, s34, -4
+; GFX9-NEXT:    s_min_u32 s34, s34, s35
+; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_i3_signext:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_sext_i32_i16 s34, s4
+; GFX10-NEXT:    s_add_i32 s35, s34, -4
+; GFX10-NEXT:    s_min_u32 s34, s34, s35
+; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_i3_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_sext_i32_i16 s0, s4
+; GFX11-NEXT:    s_add_i32 s1, s0, -4
+; GFX11-NEXT:    s_min_u32 s0, s0, s1
+; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sext.rounding = sext i3 %rounding to i32
+  call void @llvm.set.rounding(i32 %sext.rounding)
+  ret void
+}
+
+define amdgpu_gfx void @s_set_rounding_i3_zeroext(i3 zeroext inreg %rounding) {
+; GFX6-LABEL: s_set_rounding_i3_zeroext:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s34, s4, -4
+; GFX6-NEXT:    s_min_u32 s34, s4, s34
+; GFX6-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX6-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX6-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX6-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: s_set_rounding_i3_zeroext:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s34, s4, -4
+; GFX7-NEXT:    s_min_u32 s34, s4, s34
+; GFX7-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX7-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX7-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX7-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_set_rounding_i3_zeroext:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_and_b32 s34, 0xffff, s4
+; GFX8-NEXT:    s_add_i32 s35, s34, -4
+; GFX8-NEXT:    s_min_u32 s34, s34, s35
+; GFX8-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX8-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX8-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX8-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_i3_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s34, 0xffff, s4
+; GFX9-NEXT:    s_add_i32 s35, s34, -4
+; GFX9-NEXT:    s_min_u32 s34, s34, s35
+; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_i3_zeroext:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_and_b32 s34, 0xffff, s4
+; GFX10-NEXT:    s_add_i32 s35, s34, -4
+; GFX10-NEXT:    s_min_u32 s34, s34, s35
+; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_i3_zeroext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s0, 0xffff, s4
+; GFX11-NEXT:    s_add_i32 s1, s0, -4
+; GFX11-NEXT:    s_min_u32 s0, s0, s1
+; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sext.rounding = zext i3 %rounding to i32
+  call void @llvm.set.rounding(i32 %sext.rounding)
+  ret void
+}
+
+define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) {
+; GFX6-LABEL: s_set_rounding_select_0_1:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX6-NEXT:    s_cselect_b64 s[34:35], -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    v_lshr_b32_e32 v0, 0xa50f, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: s_set_rounding_select_0_1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX7-NEXT:    s_cselect_b64 s[34:35], -1, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_lshr_b32_e32 v0, 0xa50f, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_set_rounding_select_0_1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX8-NEXT:    s_cselect_b64 s[34:35], -1, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_mov_b32 s34, 0xa50f
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s34
+; GFX8-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_select_0_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-NEXT:    s_cselect_b64 s[34:35], -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_mov_b32 s34, 0xa50f
+; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s34
+; GFX9-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_select_0_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s34, -1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, 0xa50f
+; GFX10-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_select_0_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    v_lshrrev_b32_e64 v0, v0, 0xa50f
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %cond, 0
+  %rounding = select i1 %cmp, i32 0, i32 1
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+define amdgpu_gfx void @s_set_rounding_select_1_3(i32 inreg %cond) {
+; GFX678-LABEL: s_set_rounding_select_1_3:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX678-NEXT:    s_cselect_b32 s34, 0xa50, 10
+; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_select_1_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cselect_b32 s34, 0xa50, 10
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_select_1_3:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s34, 0xa50, 10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_select_1_3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX11-NEXT:    s_cselect_b32 s0, 0xa50, 10
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %cond, 0
+  %rounding = select i1 %cmp, i32 1, i32 3
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+define void @v_set_rounding_select_1_3(i32 %cond) {
+; GFX678-LABEL: v_set_rounding_select_1_3:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0xa50
+; GFX678-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX678-NEXT:    v_cndmask_b32_e32 v0, 10, v1, vcc
+; GFX678-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_set_rounding_select_1_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xa50
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 10, v1, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_set_rounding_select_1_3:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 10, 0xa50, vcc_lo
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_set_rounding_select_1_3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 10, 0xa50, vcc_lo
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %cond, 0
+  %rounding = select i1 %cmp, i32 1, i32 3
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
+; GFX6-LABEL: s_set_rounding_select_2_0:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX6-NEXT:    s_cselect_b64 s[34:35], -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX6-NEXT:    v_lshr_b32_e32 v0, 0xa50f, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: s_set_rounding_select_2_0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX7-NEXT:    s_cselect_b64 s[34:35], -1, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_lshr_b32_e32 v0, 0xa50f, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_set_rounding_select_2_0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX8-NEXT:    s_cselect_b64 s[34:35], -1, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-NEXT:    s_mov_b32 s34, 0xa50f
+; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s34
+; GFX8-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_select_2_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cselect_b64 s[34:35], -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_mov_b32 s34, 0xa50f
+; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s34
+; GFX9-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_select_2_0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s34, -1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, 0xa50f
+; GFX10-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_select_2_0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-NEXT:    v_lshrrev_b32_e64 v0, v0, 0xa50f
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %cond, 0
+  %rounding = select i1 %cmp, i32 2, i32 0
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
+; GFX678-LABEL: s_set_rounding_select_2_1:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX678-NEXT:    s_movk_i32 s34, 0xa5
+; GFX678-NEXT:    s_cselect_b32 s34, s34, 0xa50
+; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_select_2_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_movk_i32 s34, 0xa5
+; GFX9-NEXT:    s_cselect_b32 s34, s34, 0xa50
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_select_2_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX10-NEXT:    s_movk_i32 s34, 0xa5
+; GFX10-NEXT:    s_cselect_b32 s34, s34, 0xa50
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_select_2_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX11-NEXT:    s_movk_i32 s0, 0xa5
+; GFX11-NEXT:    s_cselect_b32 s0, s0, 0xa50
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %cond, 0
+  %rounding = select i1 %cmp, i32 2, i32 1
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
+; GFX678-LABEL: s_set_rounding_select_1_2:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX678-NEXT:    s_movk_i32 s34, 0xa50
+; GFX678-NEXT:    s_cselect_b32 s34, s34, 0xa5
+; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_select_1_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_movk_i32 s34, 0xa50
+; GFX9-NEXT:    s_cselect_b32 s34, s34, 0xa5
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_select_1_2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX10-NEXT:    s_movk_i32 s34, 0xa50
+; GFX10-NEXT:    s_cselect_b32 s34, s34, 0xa5
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_select_1_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX11-NEXT:    s_movk_i32 s0, 0xa50
+; GFX11-NEXT:    s_cselect_b32 s0, s0, 0xa5
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %cond, 0
+  %rounding = select i1 %cmp, i32 1, i32 2
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+define amdgpu_gfx void @s_set_rounding_select_3_0(i32 inreg %cond) {
+; GFX678-LABEL: s_set_rounding_select_3_0:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX678-NEXT:    s_cselect_b32 s34, 10, 0xa50f
+; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_select_3_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cselect_b32 s34, 10, 0xa50f
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_select_3_0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s34, 10, 0xa50f
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_select_3_0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX11-NEXT:    s_cselect_b32 s0, 10, 0xa50f
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %cond, 0
+  %rounding = select i1 %cmp, i32 3, i32 0
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) {
+; GFX678-LABEL: s_set_rounding_select_4_0:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX678-NEXT:    s_cselect_b64 s[34:35], -1, 0
+; GFX678-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX678-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX678-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX678-NEXT:    s_add_i32 s35, s34, -4
+; GFX678-NEXT:    s_min_u32 s34, s34, s35
+; GFX678-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX678-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX678-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX678-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_select_4_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cselect_b64 s[34:35], -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
+; GFX9-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX9-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX9-NEXT:    s_add_i32 s35, s34, -4
+; GFX9-NEXT:    s_min_u32 s34, s34, s35
+; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_select_4_0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s34, -1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s34
+; GFX10-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX10-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX10-NEXT:    s_add_i32 s35, s34, -4
+; GFX10-NEXT:    s_min_u32 s34, s34, s35
+; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_select_4_0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-NEXT:    s_add_i32 s1, s0, -4
+; GFX11-NEXT:    s_min_u32 s0, s0, s1
+; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %cond, 0
+  %rounding = select i1 %cmp, i32 4, i32 0
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+define amdgpu_gfx void @s_set_rounding_select_3_5(i32 inreg %cond) {
+; GFX678-LABEL: s_set_rounding_select_3_5:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX678-NEXT:    s_cselect_b32 s34, 3, 5
+; GFX678-NEXT:    s_add_i32 s35, s34, -4
+; GFX678-NEXT:    s_min_u32 s34, s34, s35
+; GFX678-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX678-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX678-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX678-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_set_rounding_select_3_5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cselect_b32 s34, 3, 5
+; GFX9-NEXT:    s_add_i32 s35, s34, -4
+; GFX9-NEXT:    s_min_u32 s34, s34, s35
+; GFX9-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX9-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX9-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: s_set_rounding_select_3_5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s34, 3, 5
+; GFX10-NEXT:    s_add_i32 s35, s34, -4
+; GFX10-NEXT:    s_min_u32 s34, s34, s35
+; GFX10-NEXT:    s_lshl_b32 s36, s34, 2
+; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
+; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
+; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_set_rounding_select_3_5:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX11-NEXT:    s_cselect_b32 s0, 3, 5
+; GFX11-NEXT:    s_add_i32 s1, s0, -4
+; GFX11-NEXT:    s_min_u32 s0, s0, s1
+; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %cond, 0
+  %rounding = select i1 %cmp, i32 3, i32 5
+  call void @llvm.set.rounding(i32 %rounding)
+  ret void
+}
+
+define amdgpu_kernel void @get_rounding_after_set_rounding_1() {
+; GFX6-LABEL: get_rounding_after_set_rounding_1:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_nop 0
+; GFX6-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
+; GFX6-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX6-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX6-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX6-NEXT:    s_and_b32 s0, s0, 15
+; GFX6-NEXT:    s_add_i32 s1, s0, 4
+; GFX6-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX6-NEXT:    s_cselect_b32 s4, s0, s1
+; GFX6-NEXT:    s_mov_b32 s0, 0
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_mov_b32 s1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: get_rounding_after_set_rounding_1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_nop 0
+; GFX7-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
+; GFX7-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX7-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX7-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX7-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX7-NEXT:    s_and_b32 s0, s0, 15
+; GFX7-NEXT:    s_add_i32 s1, s0, 4
+; GFX7-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX7-NEXT:    s_cselect_b32 s4, s0, s1
+; GFX7-NEXT:    s_mov_b32 s0, 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: get_rounding_after_set_rounding_1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
+; GFX8-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX8-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX8-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 15
+; GFX8-NEXT:    s_add_i32 s1, s0, 4
+; GFX8-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX8-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: get_rounding_after_set_rounding_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
+; GFX9-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX9-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX9-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX9-NEXT:    s_and_b32 s0, s0, 15
+; GFX9-NEXT:    s_add_i32 s1, s0, 4
+; GFX9-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX9-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: get_rounding_after_set_rounding_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX10-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX10-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX10-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX10-NEXT:    s_and_b32 s0, s0, 15
+; GFX10-NEXT:    s_add_i32 s1, s0, 4
+; GFX10-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX10-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: get_rounding_after_set_rounding_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_round_mode 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
+; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
+; GFX11-NEXT:    s_mov_b32 s0, 0xeb24da71
+; GFX11-NEXT:    s_mov_b32 s1, 0xc96f385
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT:    s_and_b32 s0, s0, 15
+; GFX11-NEXT:    s_add_i32 s1, s0, 4
+; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
+; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  tail call void @llvm.set.rounding(i32 1)
+  %set.mode = tail call i32 @llvm.get.rounding()
+  store volatile i32 %set.mode, ptr addrspace(1) null
+  ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 88b18232ef9c..502cd14284e1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -mtriple=r600 -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
 
 define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i1.ll b/llvm/test/CodeGen/AMDGPU/load-global-i1.ll
index 5ab1f3d972b0..dac928d70c65 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i1.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -mtriple=r600 -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}global_load_i1:
 ; GCN: buffer_load_ubyte
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i1.ll b/llvm/test/CodeGen/AMDGPU/load-local-i1.ll
index ea858fb67443..578170941efa 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i1.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -mtriple=r600 -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefixes=EG,FUNC %s
+; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefixes=EG,FUNC %s
 
 ; FUNC-LABEL: {{^}}local_load_i1:
 ; SICIVI: s_mov_b32 m0
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
index 9b1b32a65f23..a2e55ce06b52 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,SICIVI,FUNC %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -mtriple=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
 
 ; Testing for ds_read/write_b128
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
index e9d42dc70cbb..cf3443ff33b7 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
@@ -3,6 +3,10 @@
 ; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
 ; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
 ; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s
 
 ; Test the -mem-intrinsic-expand-size flag works.
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 0f4e790a6976..1c4e4b8602ff 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -1,10 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Check the default works
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering %s | FileCheck -check-prefixes=OPT,MAX1024 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=pre-isel-intrinsic-lowering %s | FileCheck -check-prefixes=OPT,MAX1024 %s
 
 ; Check the default explicitly set works
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefixes=OPT,ALL %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefixes=OPT,ALL %s
 
 declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
 declare void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/mode-register-fpconstrain.ll b/llvm/test/CodeGen/AMDGPU/mode-register-fpconstrain.ll
index 2403aeaa4428..8a29229c152f 100644
--- a/llvm/test/CodeGen/AMDGPU/mode-register-fpconstrain.ll
+++ b/llvm/test/CodeGen/AMDGPU/mode-register-fpconstrain.ll
@@ -9,8 +9,6 @@ define double @ignoreStrictfp(double noundef %a, double noundef %b) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 1
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 0
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   tail call void @llvm.amdgcn.s.setreg(i32 2177, i32 1)
@@ -24,8 +22,6 @@ define double @set_fpenv(double noundef %a, double noundef %b) #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 23), 4
 ; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 0, 5), 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 0
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/nullptr-long-address-spaces.ll b/llvm/test/CodeGen/AMDGPU/nullptr-long-address-spaces.ll
index 98c869f23d47..6556f07c7350 100644
--- a/llvm/test/CodeGen/AMDGPU/nullptr-long-address-spaces.ll
+++ b/llvm/test/CodeGen/AMDGPU/nullptr-long-address-spaces.ll
@@ -1,6 +1,6 @@
 ; XFAIL: *
 ; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck -check-prefixes=CHECK,GCN %s
-; RUN: llc < %s -mtriple=r600 -mtriple=r600-- -verify-machineinstrs | FileCheck -check-prefixes=CHECK,R600 %s
+; RUN: llc < %s -mtriple=r600-- -verify-machineinstrs | FileCheck -check-prefixes=CHECK,R600 %s
 
 ; This is a temporary xfail, as the assembly printer is broken when dealing with
 ; lowerConstant() trying to return a value of size greater than 8 bytes.
diff --git a/llvm/test/CodeGen/AMDGPU/nullptr.ll b/llvm/test/CodeGen/AMDGPU/nullptr.ll
index b7a15f97e103..5a736aabd4ee 100644
--- a/llvm/test/CodeGen/AMDGPU/nullptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/nullptr.ll
@@ -1,5 +1,5 @@
 ;RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck -check-prefixes=CHECK,GCN %s
-;RUN: llc < %s -mtriple=r600 -mtriple=r600-- -verify-machineinstrs | FileCheck -check-prefixes=CHECK,R600 %s
+;RUN: llc < %s -mtriple=r600-- -verify-machineinstrs | FileCheck -check-prefixes=CHECK,R600 %s
 
 %struct.S = type { ptr addrspace(5), ptr addrspace(1), ptr addrspace(4), ptr addrspace(3), ptr, ptr addrspace(2)}
 
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
index cabc11037017..6d18f354e654 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
@@ -107,10 +107,33 @@
 ; CHECK-NEXT:        .vgpr_limit:     0x100
 ; CHECK-NEXT:        .wavefront_size: 0x40
 ; CHECK-NEXT:        .wgp_mode:       false
+; CHECK-NEXT:      .gs:
+; CHECK-NEXT:        .debug_mode:     false
+; CHECK-NEXT:        .entry_point:    gs_shader
+; CHECK-NEXT:        .ieee_mode:      false
+; CHECK-NEXT:        .lds_size:       0x200
+; CHECK-NEXT:        .mem_ordered:    true
+; CHECK-NEXT:        .scratch_en:     false
+; CHECK-NEXT:        .scratch_memory_size: 0
+; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .vgpr_count:     0x1
+; CHECK-NEXT:        .wgp_mode:       true
+; CHECK-NEXT:      .hs:
+; CHECK-NEXT:        .debug_mode:     false
+; CHECK-NEXT:        .entry_point:    hs_shader
+; CHECK-NEXT:        .ieee_mode:      false
+; CHECK-NEXT:        .lds_size:       0x1000
+; CHECK-NEXT:        .mem_ordered:    true
+; CHECK-NEXT:        .scratch_en:     false
+; CHECK-NEXT:        .scratch_memory_size: 0
+; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .vgpr_count:     0x1
+; CHECK-NEXT:        .wgp_mode:       true
 ; CHECK-NEXT:      .ps:
 ; CHECK-NEXT:        .debug_mode:     false
 ; CHECK-NEXT:        .entry_point:    ps_shader
 ; CHECK-NEXT:        .ieee_mode:      false
+; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .mem_ordered:    true
 ; CHECK-NEXT:        .scratch_en:     false
 ; CHECK-NEXT:        .scratch_memory_size: 0
@@ -145,6 +168,22 @@ define dllexport amdgpu_ps void @ps_shader() #1 {
   ret void
 }
 
+@LDS.GS = external addrspace(3) global [1 x i32], align 4
+
+define dllexport amdgpu_gs void @gs_shader() #2 {
+  %ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0
+  store i32 0, ptr addrspace(3) %ptr, align 4
+  ret void
+}
+
+@LDS.HS = external addrspace(3) global [1024 x i32], align 4
+
+define dllexport amdgpu_hs void @hs_shader() #2 {
+  %ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0
+  store i32 0, ptr addrspace(3) %ptr, align 4
+  ret void
+}
+
 !amdgpu.pal.metadata.msgpack = !{!0}
 
 ; Function Attrs: nounwind willreturn memory(none)
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index 1be041c8dc9b..fbe34a3a3970 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -32,11 +32,11 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX906-NEXT:    v_writelane_b32 v2, s24, 5
 ; GFX906-NEXT:    s_mov_b64 s[26:27], s[10:11]
 ; GFX906-NEXT:    v_writelane_b32 v2, s26, 6
-; GFX906-NEXT:    v_writelane_b32 v41, s34, 2
+; GFX906-NEXT:    v_writelane_b32 v41, s16, 4
 ; GFX906-NEXT:    v_writelane_b32 v2, s27, 7
-; GFX906-NEXT:    v_writelane_b32 v41, s35, 3
+; GFX906-NEXT:    v_writelane_b32 v41, s34, 2
 ; GFX906-NEXT:    v_writelane_b32 v2, s8, 8
-; GFX906-NEXT:    v_writelane_b32 v41, s16, 4
+; GFX906-NEXT:    v_writelane_b32 v41, s35, 3
 ; GFX906-NEXT:    v_writelane_b32 v2, s9, 9
 ; GFX906-NEXT:    v_writelane_b32 v41, s30, 0
 ; GFX906-NEXT:    v_writelane_b32 v2, s4, 10
@@ -340,9 +340,9 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX906-NEXT:    v_readlane_b32 s31, v41, 1
 ; GFX906-NEXT:    v_readlane_b32 s30, v41, 0
 ; GFX906-NEXT:    ; kill: killed $vgpr40
+; GFX906-NEXT:    v_readlane_b32 s4, v41, 4
 ; GFX906-NEXT:    v_readlane_b32 s34, v41, 2
 ; GFX906-NEXT:    v_readlane_b32 s35, v41, 3
-; GFX906-NEXT:    v_readlane_b32 s4, v41, 4
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    flat_store_dwordx4 v[0:1], v[30:33] offset:112
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -383,12 +383,12 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX908-NEXT:    s_mov_b64 exec, -1
 ; GFX908-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
 ; GFX908-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX908-NEXT:    v_mov_b32_e32 v3, s16
+; GFX908-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s34
 ; GFX908-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s35
 ; GFX908-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
-; GFX908-NEXT:    v_mov_b32_e32 v3, s16
-; GFX908-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
 ; GFX908-NEXT:    s_addk_i32 s32, 0x2c00
 ; GFX908-NEXT:    s_mov_b64 s[16:17], exec
 ; GFX908-NEXT:    s_mov_b64 exec, 1
@@ -753,16 +753,16 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:172
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload
+; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload
 ; GFX908-NEXT:    ; kill: killed $vgpr40
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_readfirstlane_b32 s34, v0
 ; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_readfirstlane_b32 s35, v0
-; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX908-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; GFX908-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload
 ; GFX908-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll
index 8ef1d3ff27e5..406c953a06d9 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll
@@ -8,13 +8,13 @@
 ; RUN: FileCheck --check-prefix=WARN-GFX90A %s < %t
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+wavefrontsize64 -verify-machineinstrs < %s
 
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -mattr=+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
 ; RUN:   -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -mattr=+wavefrontsize64 -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s
 
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
 ; RUN:   -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s
 
 ; WARN-GFX906: removing function 'needs_wavefrontsize32': +wavefrontsize32 is not supported on the current target
 ; WARN-GFX906-NOT: not supported
diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll
index 6ab49382b904..c00cd763992d 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -mtriple=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=R600 -check-prefix=FUNC %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
index 38672da3c647..4e3dccb975fe 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,7 +1,7 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SI,FUNC %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX89,FUNC %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX9,GFX89,FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mtriple=r600-- -mcpu=cypress < %s | FileCheck -enable-var-scope --check-prefixes=EG,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600-- -mcpu=cypress < %s | FileCheck -enable-var-scope --check-prefixes=EG,FUNC %s
 
 ; FIXME: i16 promotion pass ruins the scalar cases when legal.
 ; FIXME: r600 fails verifier
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index c440392153ad..b1a82daa8e7d 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=SI
 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=VI
-; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefixes=EG
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -mtriple=r600-- -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefixes=EG
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll
index a7af02017001..a7b4eee84cb9 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll
@@ -23,9 +23,9 @@ define amdgpu_ps i32 @if_else(i32 %0) !dbg !5 {
 ; OPT-NEXT:    br label [[FLOW]], !dbg [[DBG16:![0-9]+]]
 ; OPT:       exit:
 ; OPT-NEXT:    [[RET:%.*]] = phi i32 [ [[TMP5]], [[FLOW]] ], [ 42, [[TRUE]] ], !dbg [[DBG17:![0-9]+]]
-; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]), !dbg [[DBG18:![0-9]+]]
+; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]])
 ; OPT-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[RET]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG17]]
-; OPT-NEXT:    ret i32 [[RET]], !dbg [[DBG18]]
+; OPT-NEXT:    ret i32 [[RET]], !dbg [[DBG18:![0-9]+]]
 ;
   %c = icmp eq i32 %0, 0, !dbg !13
   tail call void @llvm.dbg.value(metadata i1 %c, metadata !9, metadata !DIExpression()), !dbg !13
@@ -65,13 +65,13 @@ define amdgpu_ps void @loop_if_break(i32 %n) !dbg !19 {
 ; OPT:       Flow:
 ; OPT-NEXT:    [[TMP3]] = phi i32 [ [[I_NEXT]], [[LOOP_BODY]] ], [ undef, [[LOOP]] ]
 ; OPT-NEXT:    [[TMP4:%.*]] = phi i1 [ false, [[LOOP_BODY]] ], [ true, [[LOOP]] ]
-; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]), !dbg [[DBG27]]
+; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
 ; OPT-NEXT:    [[TMP5]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN]]), !dbg [[DBG27]]
 ; OPT-NEXT:    [[TMP6:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP5]]), !dbg [[DBG27]]
 ; OPT-NEXT:    br i1 [[TMP6]], label [[EXIT:%.*]], label [[LOOP]], !dbg [[DBG27]]
 ; OPT:       exit:
-; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP5]]), !dbg [[DBG30:![0-9]+]]
-; OPT-NEXT:    ret void, !dbg [[DBG30]]
+; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP5]])
+; OPT-NEXT:    ret void, !dbg [[DBG30:![0-9]+]]
 ;
 entry:
   br label %loop, !dbg !24
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index ae0221b8b32b..b8cf69237206 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s -check-prefixes=EG
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s -check-prefixes=EG
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 4aeff3f23993..be3c0d741ac5 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -162,6 +162,7 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, s[18:19]
+; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2
 ; GCN-NEXT: v_mov_b32_e32 v32, 0
 ; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3
 ; GCN: s_mov_b32 s34, s32
@@ -169,14 +170,13 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
 ; GCN-DAG: s_add_i32 s32, s32, 0x30000
-; GCN: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
 ; GCN: s_swappc_b64 s[30:31],
 
 ; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1
 ; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0
-; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2
+; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
@@ -265,9 +265,9 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 {
 ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
 ; GCN: s_xor_saveexec_b64 s[6:7], -1
 ; GCN: buffer_store_dword v39, off, s[0:3], s33
-; GCN: v_mov_b32_e32 v0, s34
-; GCN: buffer_store_dword v0, off, s[0:3], s33
 ; GCN: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]]
+; GCN: buffer_store_dword v0, off, s[0:3], s33
+; GCN: v_mov_b32_e32 v0, s34
 ; GCN-DAG: buffer_store_dword v0, off, s[0:3], s33
   %local_val = alloca i32, align 128, addrspace(5)
   store volatile i32 %b, ptr addrspace(5) %local_val, align 128
@@ -304,13 +304,11 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i
 ; GCN-NEXT: s_add_i32 s5, s33, 0x42100
 ; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: v_mov_b32_e32 v0, s34
-; GCN-NOT: v_mov_b32_e32 v0, 0x108c
-; GCN-NEXT: s_add_i32 s5, s33, 0x42300
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GCN-NEXT: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]]
-; GCN-NOT: v_mov_b32_e32 v0, 0x1088
 ; GCN-NEXT: s_add_i32 s5, s33, 0x42200
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v0, s34
+; GCN-NEXT: s_add_i32 s5, s33, 0x42300
 ; GCN-NEXT: s_mov_b32 s34, s32
 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
   %local_val = alloca i32, align 128, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/store-global.ll b/llvm/test/CodeGen/AMDGPU/store-global.ll
index f068b1481aa9..1ff9b117237f 100644
--- a/llvm/test/CodeGen/AMDGPU/store-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-global.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=FUNC %s
-; RUN: llc -mtriple=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -mtriple=r600 -mtriple=r600-- -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -mtriple=r600-- -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}store_i1:
 ; EG: MEM_RAT MSKOR
diff --git a/llvm/test/CodeGen/AMDGPU/store-local.ll b/llvm/test/CodeGen/AMDGPU/store-local.ll
index 479f881cd40c..76e2d4366e3e 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,VI,FUNC %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -mtriple=r600 -mtriple=r600-- -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
-; RUN: llc -mtriple=r600 -mtriple=r600-- -mcpu=cayman < %s | FileCheck -check-prefixes=CM,FUNC %s
+; RUN: llc -mtriple=r600-- -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
+; RUN: llc -mtriple=r600-- -mcpu=cayman < %s | FileCheck -check-prefixes=CM,FUNC %s
 
 ; FUNC-LABEL: {{^}}store_local_i1:
 ; SICIVI: s_mov_b32 m0
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll b/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
index 8ccd3b9dd124..8de059d1d7b8 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=r600 -mtriple=r600-- -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -mtriple=r600-- -mcpu=redwood | FileCheck %s
 
 ; This tests for a bug in the SelectionDAG where custom lowered truncated
 ; vector stores at the end of a basic block were not being added to the
diff --git a/llvm/test/CodeGen/AMDGPU/unknown-processor.ll b/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
index f1f1c92bcbed..683ba98e52cf 100644
--- a/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
+++ b/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-- -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s
-; RUN: llc -mtriple=r600 -mtriple=r600-- -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s
+; RUN: not llc -mtriple=amdgcn-- -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s
+; RUN: llc -mtriple=r600-- -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s
 target datalayout = "A5"
 
 ; Should not crash when the processor is not recognized and the
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
index 694f444b7747..fc00937e6c8a 100644
--- a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
@@ -1,6 +1,6 @@
 ; RUN: not llc -mtriple=amdgcn-mesa-mesa3d -tailcallopt < %s 2>&1 | FileCheck --check-prefix=GCN %s
 ; RUN: not llc -mtriple=amdgcn--amdpal -tailcallopt < %s 2>&1 | FileCheck --check-prefix=GCN %s
-; RUN: not llc -mtriple=r600 -mtriple=r600-- -mcpu=cypress -tailcallopt < %s 2>&1 | FileCheck -check-prefix=R600 %s
+; RUN: not llc -mtriple=r600-- -mcpu=cypress -tailcallopt < %s 2>&1 | FileCheck -check-prefix=R600 %s
 
 declare i32 @external_function(i32) nounwind
 
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
index 5ef794b64c0b..2c87680284e2 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-- -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
-; RUN: llc -mtriple=r600 -mtriple=r600-- -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
+; RUN: llc -mtriple=r600-- -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
 target datalayout = "A5"
 
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
index f680bbdd05cd..8c285f37b487 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
@@ -18,9 +18,9 @@ define void @vector_reg_liverange_split() #0 {
 ; GFX90A-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX90A-NEXT:    buffer_store_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX90A-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX90A-NEXT:    v_writelane_b32 v40, s16, 4
 ; GFX90A-NEXT:    v_writelane_b32 v40, s28, 2
 ; GFX90A-NEXT:    v_writelane_b32 v40, s29, 3
-; GFX90A-NEXT:    v_writelane_b32 v40, s16, 4
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
 ; GFX90A-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX90A-NEXT:    s_addk_i32 s32, 0x400
@@ -48,9 +48,9 @@ define void @vector_reg_liverange_split() #0 {
 ; GFX90A-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX90A-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX90A-NEXT:    ; kill: killed $vgpr0
+; GFX90A-NEXT:    v_readlane_b32 s4, v40, 4
 ; GFX90A-NEXT:    v_readlane_b32 s28, v40, 2
 ; GFX90A-NEXT:    v_readlane_b32 s29, v40, 3
-; GFX90A-NEXT:    v_readlane_b32 s4, v40, 4
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; GFX90A-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    s_mov_b64 exec, -1
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
index 7eabe982ff2b..5608ea856354 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
@@ -24,9 +24,9 @@ define void @test() #0 {
 ; GCN-NEXT:    s_mov_b64 exec, -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
+; GCN-NEXT:    v_writelane_b32 v40, s16, 4
 ; GCN-NEXT:    v_writelane_b32 v40, s28, 2
 ; GCN-NEXT:    v_writelane_b32 v40, s29, 3
-; GCN-NEXT:    v_writelane_b32 v40, s16, 4
 ; GCN-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    s_addk_i32 s32, 0x800
@@ -55,9 +55,9 @@ define void @test() #0 {
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
 ; GCN-NEXT:    ; kill: killed $vgpr1
+; GCN-NEXT:    v_readlane_b32 s4, v40, 4
 ; GCN-NEXT:    v_readlane_b32 s28, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s29, v40, 3
-; GCN-NEXT:    v_readlane_b32 s4, v40, 4
 ; GCN-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
@@ -79,9 +79,9 @@ define void @test() #0 {
 ; GCN-O0-NEXT:    s_mov_b64 exec, -1
 ; GCN-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GCN-O0-NEXT:    v_writelane_b32 v40, s16, 4
 ; GCN-O0-NEXT:    v_writelane_b32 v40, s28, 2
 ; GCN-O0-NEXT:    v_writelane_b32 v40, s29, 3
-; GCN-O0-NEXT:    v_writelane_b32 v40, s16, 4
 ; GCN-O0-NEXT:    s_add_i32 s32, s32, 0x400
 ; GCN-O0-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
 ; GCN-O0-NEXT:    v_writelane_b32 v40, s30, 0
@@ -117,9 +117,9 @@ define void @test() #0 {
 ; GCN-O0-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-O0-NEXT:    v_readlane_b32 s30, v40, 0
 ; GCN-O0-NEXT:    ; kill: killed $vgpr0
+; GCN-O0-NEXT:    v_readlane_b32 s4, v40, 4
 ; GCN-O0-NEXT:    v_readlane_b32 s28, v40, 2
 ; GCN-O0-NEXT:    v_readlane_b32 s29, v40, 3
-; GCN-O0-NEXT:    v_readlane_b32 s4, v40, 4
 ; GCN-O0-NEXT:    s_xor_saveexec_b64 s[6:7], -1
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, -1
diff --git a/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll b/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
index bdfa89d9f304..6db7fe80c3cc 100644
--- a/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
+++ b/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=r600 -mcpu=redwood -mtriple=r600-- < %s | FileCheck %s
+; RUN: llc -mtriple=r600-- -mcpu=redwood < %s | FileCheck %s
 
 ; We want all MULLO_INT inst to be last in their instruction group
 ;CHECK: {{^}}fill3d:
diff --git a/llvm/test/CodeGen/ARM/cortex-a57-misched-basic.ll b/llvm/test/CodeGen/ARM/cortex-a57-misched-basic.ll
index 2e8a05417d43..ec4e37f0ba80 100644
--- a/llvm/test/CodeGen/ARM/cortex-a57-misched-basic.ll
+++ b/llvm/test/CodeGen/ARM/cortex-a57-misched-basic.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=A57_SCHED
-; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic    -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=armv8r-eabi -mattr=+neon,+fp-armv8 -mcpu=generic    -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
 
 ; Check the latency for instructions for both generic and cortex-a57.
 ; SDIV should be scheduled at the block's begin (20 cyc of independent M unit).
diff --git a/llvm/test/CodeGen/ARM/fpconv.ll b/llvm/test/CodeGen/ARM/fpconv.ll
index 929da5f18c81..7e6109f75201 100644
--- a/llvm/test/CodeGen/ARM/fpconv.ll
+++ b/llvm/test/CodeGen/ARM/fpconv.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - | FileCheck %s --check-prefix=CHECK-VFP
 ; RUN: llc -mtriple=arm-apple-darwin %s -o - | FileCheck %s
-; RUN: llc -mtriple=armv8r-none-none-eabi %s -o - | FileCheck %s --check-prefix=CHECK-VFP
-; RUN: llc -mtriple=armv8r-none-none-eabi -mattr=-fp64 %s -o - | FileCheck %s --check-prefix=CHECK-VFP-SP
+; RUN: llc -mtriple=armv8r-none-none-eabi -mattr=+neon,+fp-armv8 %s -o - | FileCheck %s --check-prefix=CHECK-VFP
+; RUN: llc -mtriple=armv8r-none-none-eabi %s -o - | FileCheck %s --check-prefix=CHECK-VFP-SP
 
 define float @f1(double %x) {
 ;CHECK-VFP-LABEL: f1:
diff --git a/llvm/test/CodeGen/ARM/frem-power2.ll b/llvm/test/CodeGen/ARM/frem-power2.ll
new file mode 100644
index 000000000000..7f52943175ac
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/frem-power2.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=thumbv8m.main-none-eabi %s -o - | FileCheck %s --check-prefix=CHECK-SOFT
+; RUN: llc -mtriple=thumbv8m.main-none-eabi -mattr=+fp-armv8 %s -o - | FileCheck %s --check-prefix=CHECK-FP
+; RUN: llc -mtriple=thumbv8m.main-none-eabi -mattr=+fp-armv8,+slowfpvfmx %s -o - | FileCheck %s --check-prefix=CHECK-M33
+
+define float @frem4(float %x) {
+; CHECK-SOFT-LABEL: frem4:
+; CHECK-SOFT:       @ %bb.0: @ %entry
+; CHECK-SOFT-NEXT:    .save {r7, lr}
+; CHECK-SOFT-NEXT:    push {r7, lr}
+; CHECK-SOFT-NEXT:    mov.w r1, #1082130432
+; CHECK-SOFT-NEXT:    bl fmodf
+; CHECK-SOFT-NEXT:    pop {r7, pc}
+;
+; CHECK-FP-LABEL: frem4:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    mov.w r1, #1082130432
+; CHECK-FP-NEXT:    b fmodf
+;
+; CHECK-M33-LABEL: frem4:
+; CHECK-M33:       @ %bb.0: @ %entry
+; CHECK-M33-NEXT:    mov.w r1, #1082130432
+; CHECK-M33-NEXT:    b fmodf
+entry:
+  %fmod = frem float %x, 4.0
+  ret float %fmod
+}
+
+define float @frem4_nsz(float %x) {
+; CHECK-SOFT-LABEL: frem4_nsz:
+; CHECK-SOFT:       @ %bb.0: @ %entry
+; CHECK-SOFT-NEXT:    .save {r7, lr}
+; CHECK-SOFT-NEXT:    push {r7, lr}
+; CHECK-SOFT-NEXT:    mov.w r1, #1082130432
+; CHECK-SOFT-NEXT:    bl fmodf
+; CHECK-SOFT-NEXT:    pop {r7, pc}
+;
+; CHECK-FP-LABEL: frem4_nsz:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vmov.f32 s0, #4.000000e+00
+; CHECK-FP-NEXT:    vmov s2, r0
+; CHECK-FP-NEXT:    vdiv.f32 s4, s2, s0
+; CHECK-FP-NEXT:    vrintz.f32 s4, s4
+; CHECK-FP-NEXT:    vfms.f32 s2, s4, s0
+; CHECK-FP-NEXT:    vmov r0, s2
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-M33-LABEL: frem4_nsz:
+; CHECK-M33:       @ %bb.0: @ %entry
+; CHECK-M33-NEXT:    vmov.f32 s0, #4.000000e+00
+; CHECK-M33-NEXT:    vmov s2, r0
+; CHECK-M33-NEXT:    vdiv.f32 s4, s2, s0
+; CHECK-M33-NEXT:    vrintz.f32 s4, s4
+; CHECK-M33-NEXT:    vmls.f32 s2, s4, s0
+; CHECK-M33-NEXT:    vmov r0, s2
+; CHECK-M33-NEXT:    bx lr
+entry:
+  %fmod = frem nsz float %x, 4.0
+  ret float %fmod
+}
diff --git a/llvm/test/CodeGen/ARM/half.ll b/llvm/test/CodeGen/ARM/half.ll
index 9b53dc77f227..9f8c552cf839 100644
--- a/llvm/test/CodeGen/ARM/half.ll
+++ b/llvm/test/CodeGen/ARM/half.ll
@@ -1,8 +1,8 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-ios7.0 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OLD
 ; RUN: llc < %s -mtriple=thumbv7s-apple-ios7.0 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-F16
 ; RUN: llc < %s -mtriple=thumbv8-apple-ios7.0 | FileCheck %s --check-prefix=CHECK  --check-prefix=CHECK-V8
-; RUN: llc < %s -mtriple=armv8r-none-none-eabi | FileCheck %s --check-prefix=CHECK  --check-prefix=CHECK-V8
-; RUN: llc < %s -mtriple=armv8r-none-none-eabi -mattr=-fp64 | FileCheck %s --check-prefix=CHECK  --check-prefix=CHECK-V8-SP
+; RUN: llc < %s -mtriple=armv8r-none-none-eabi -mattr=+neon,+fp-armv8 | FileCheck %s --check-prefix=CHECK  --check-prefix=CHECK-V8
+; RUN: llc < %s -mtriple=armv8r-none-none-eabi | FileCheck %s --check-prefix=CHECK  --check-prefix=CHECK-V8-SP
 ; RUN: llc < %s -mtriple=armv8.1m-none-none-eabi -mattr=+fp-armv8 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V8
 ; RUN: llc < %s -mtriple=armv8.1m-none-none-eabi -mattr=+fp-armv8,-fp64 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V8-SP
 ; RUN: llc < %s -mtriple=armv8.1m-none-none-eabi -mattr=+mve.fp,+fp64 | FileCheck %s --check-prefix=CHECK-V8
diff --git a/llvm/test/CodeGen/ARM/useaa.ll b/llvm/test/CodeGen/ARM/useaa.ll
index f8207a1056e3..d70d24b3fd34 100644
--- a/llvm/test/CodeGen/ARM/useaa.ll
+++ b/llvm/test/CodeGen/ARM/useaa.ll
@@ -1,15 +1,11 @@
 ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-r52 | FileCheck %s --check-prefix=CHECK --check-prefix=USEAA
 ; RUN: llc < %s -mtriple=armv7m-eabi -mcpu=cortex-m4 | FileCheck %s --check-prefix=CHECK --check-prefix=USEAA
 ; RUN: llc < %s -mtriple=armv8m-eabi -mcpu=cortex-m33 | FileCheck %s --check-prefix=CHECK --check-prefix=USEAA
-; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=USEAA
 
 ; Check we use AA during codegen, so can interleave these loads/stores.
 
 ; CHECK-LABEL: test
-; GENERIC: ldr
-; GENERIC: ldr
-; GENERIC: str
-; GENERIC: str
 ; USEAA: ldr
 ; USEAA: ldr
 ; USEAA: str
diff --git a/llvm/test/CodeGen/DirectX/Metadata/dxilVer-1.0.ll b/llvm/test/CodeGen/DirectX/Metadata/dxilVer-1.0.ll
new file mode 100644
index 000000000000..254479e5f94c
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/Metadata/dxilVer-1.0.ll
@@ -0,0 +1,12 @@
+; RUN: opt -S -dxil-metadata-emit %s | FileCheck %s
+target triple = "dxil-pc-shadermodel6.0-vertex"
+
+; CHECK: !dx.version = !{![[DXVER:[0-9]+]]}
+; CHECK: ![[DXVER]] = !{i32 1, i32 0}
+
+define void @entry() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { noinline nounwind "hlsl.shader"="vertex" }
diff --git a/llvm/test/CodeGen/DirectX/Metadata/dxilVer-1.8.ll b/llvm/test/CodeGen/DirectX/Metadata/dxilVer-1.8.ll
new file mode 100644
index 000000000000..efeb5a1b2486
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/Metadata/dxilVer-1.8.ll
@@ -0,0 +1,12 @@
+; RUN: opt -S -dxil-metadata-emit %s | FileCheck %s
+target triple = "dxil-pc-shadermodel6.8-compute"
+
+; CHECK: !dx.version = !{![[DXVER:[0-9]+]]}
+; CHECK: ![[DXVER]] = !{i32 1, i32 8}
+
+define void @entry() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { noinline nounwind "hlsl.numthreads"="1,2,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-cs-val-ver-0.0.ll b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-cs-val-ver-0.0.ll
new file mode 100644
index 000000000000..a85dc43ac2f6
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-cs-val-ver-0.0.ll
@@ -0,0 +1,16 @@
+; RUN: opt -S -dxil-prepare  %s | FileCheck %s 
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+define void @entry() #0 {
+entry:
+  ret void
+}
+
+; Make sure experimental attribute is left when validation version is 0.0.
+; CHECK:attributes #0 = { noinline nounwind "exp-shader"="cs" } 
+attributes #0 = { noinline nounwind "exp-shader"="cs" "hlsl.numthreads"="1,2,1" "hlsl.shader"="compute" }
+
+!dx.valver = !{!0}
+
+!0 = !{i32 0, i32 0}
diff --git a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-cs.ll b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-cs.ll
index be4b46f22ef2..343f190d994f 100644
--- a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-cs.ll
+++ b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-cs.ll
@@ -1,4 +1,6 @@
 ; RUN: opt -S -dxil-metadata-emit %s | FileCheck %s
+; RUN: opt -S -dxil-prepare  %s | FileCheck %s  --check-prefix=REMOVE_EXTRA_ATTRIBUTE
+
 target triple = "dxil-pc-shadermodel6.6-compute"
 
 ; CHECK: !dx.shaderModel = !{![[SM:[0-9]+]]}
@@ -9,4 +11,7 @@ entry:
   ret void
 }
 
-attributes #0 = { noinline nounwind "hlsl.numthreads"="1,2,1" "hlsl.shader"="compute" }
+; Make sure extra attribute like hlsl.numthreads are removed.
+; And experimental attribute is removed when validator version is not 0.0.
+; REMOVE_EXTRA_ATTRIBUTE:attributes #0 = { noinline nounwind } 
+attributes #0 = { noinline nounwind "exp-shader"="cs" "hlsl.numthreads"="1,2,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/DirectX/strip-fn-attrs.ll b/llvm/test/CodeGen/DirectX/strip-fn-attrs.ll
index 4f2000640185..b0dd89cf90f2 100644
--- a/llvm/test/CodeGen/DirectX/strip-fn-attrs.ll
+++ b/llvm/test/CodeGen/DirectX/strip-fn-attrs.ll
@@ -9,12 +9,7 @@ define dso_local float @fma(float %0, float %1, float %2) local_unnamed_addr #0
   ret float %5
 }
 
-; CHECK: Function Attrs: nounwind memory(none)
-; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
 ; CHECK: attributes #0 = { nounwind memory(none) }
 ; CHECK-NOT: attributes #
 
 attributes #0 = { norecurse nounwind readnone willreturn }
-attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
diff --git a/llvm/test/CodeGen/DirectX/tan.ll b/llvm/test/CodeGen/DirectX/tan.ll
new file mode 100644
index 000000000000..567ab02d40f9
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/tan.ll
@@ -0,0 +1,20 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for tan are generated for float and half.
+
+define noundef float @tan_float(float noundef %a) #0 {
+entry:
+; CHECK:call float @dx.op.unary.f32(i32 14, float %{{.*}})
+  %elt.tan = call float @llvm.tan.f32(float %a)
+  ret float %elt.tan
+}
+
+define noundef half @tan_half(half noundef %a) #0 {
+entry:
+; CHECK:call half @dx.op.unary.f16(i32 14, half %{{.*}})
+  %elt.tan = call half @llvm.tan.f16(half %a)
+  ret half %elt.tan
+}
+
+declare half @llvm.tan.f16(half)
+declare float @llvm.tan.f32(float)
diff --git a/llvm/test/CodeGen/DirectX/tan_error.ll b/llvm/test/CodeGen/DirectX/tan_error.ll
new file mode 100644
index 000000000000..c870c36f5492
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/tan_error.ll
@@ -0,0 +1,10 @@
+; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+
+; DXIL operation tan does not support double overload type
+; CHECK: LLVM ERROR: Invalid Overload
+
+define noundef double @tan_double(double noundef %a) #0 {
+entry:
+  %1 = call double @llvm.tan.f64(double %a)
+  ret double %1
+}
diff --git a/llvm/test/CodeGen/Generic/expand-vp.ll b/llvm/test/CodeGen/Generic/expand-vp.ll
index 40d183273b86..4fee9a533b94 100644
--- a/llvm/test/CodeGen/Generic/expand-vp.ll
+++ b/llvm/test/CodeGen/Generic/expand-vp.ll
@@ -41,6 +41,8 @@ declare i32 @llvm.vp.reduce.umin.v4i32(i32, <4 x i32>, <4 x i1>, i32)
 declare i32 @llvm.vp.reduce.umax.v4i32(i32, <4 x i32>, <4 x i1>, i32)
 declare float @llvm.vp.reduce.fmin.v4f32(float, <4 x float>, <4 x i1>, i32)
 declare float @llvm.vp.reduce.fmax.v4f32(float, <4 x float>, <4 x i1>, i32)
+declare float @llvm.vp.reduce.fminimum.v4f32(float, <4 x float>, <4 x i1>, i32)
+declare float @llvm.vp.reduce.fmaximum.v4f32(float, <4 x float>, <4 x i1>, i32)
 declare float @llvm.vp.reduce.fadd.v4f32(float, <4 x float>, <4 x i1>, i32)
 declare float @llvm.vp.reduce.fmul.v4f32(float, <4 x float>, <4 x i1>, i32)
 ; Comparisons
@@ -133,10 +135,16 @@ define void @test_vp_reduce_fp_v4(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n
   %r3 = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
   %r4 = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
   %r5 = call nnan ninf float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
-  %r6 = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
-  %r7 = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
-  %r8 = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
-  %r9 = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+  %r6 = call float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+  %r7 = call nnan float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+  %r8 = call nnan ninf float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+  %r9 = call float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+  %r10 = call nnan float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+  %r11 = call nnan ninf float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+  %r12 = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+  %r13 = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+  %r14 = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+  %r15 = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
   ret void
 }
 
@@ -254,6 +262,27 @@ define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x float> %f0, <8 x
 ; ALL-CONVERT:  [[FMAX_NNAN_NINF:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000>
 ; ALL-CONVERT-NEXT:  [[RED:%.+]] = call nnan ninf float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[FMAX_NNAN_NINF]])
 ; ALL-CONVERT-NEXT:  %{{.+}} = call nnan ninf float @llvm.maxnum.f32(float [[RED]], float %f)
+
+; ALL-CONVERT:       [[FMINIMUM:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>
+; ALL-CONVERT-NEXT:  [[RED:%.+]] = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> [[FMINIMUM]])
+; ALL-CONVERT-NEXT:  %{{.+}} = call float @llvm.minimum.f32(float [[RED]], float %f)
+; ALL-CONVERT:       [[FMINIMUM_NNAN:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>
+; ALL-CONVERT-NEXT:  [[RED:%.+]] = call nnan float @llvm.vector.reduce.fminimum.v4f32(<4 x float> [[FMINIMUM_NNAN]])
+; ALL-CONVERT-NEXT:  %{{.+}} = call nnan float @llvm.minimum.f32(float [[RED]], float %f)
+; ALL-CONVERT:       [[FMINIMUM_NNAN_NINF:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0x47EFFFFFE0000000, float 0x47EFFFFFE0000000, float 0x47EFFFFFE0000000, float 0x47EFFFFFE0000000>
+; ALL-CONVERT-NEXT:  [[RED:%.+]] = call nnan ninf float @llvm.vector.reduce.fminimum.v4f32(<4 x float> [[FMINIMUM_NNAN_NINF]])
+; ALL-CONVERT-NEXT:  %{{.+}} = call nnan ninf float @llvm.minimum.f32(float [[RED]], float %f)
+
+; ALL-CONVERT:  [[FMAXIMUM:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>
+; ALL-CONVERT-NEXT:  [[RED:%.+]] = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> [[FMAXIMUM]])
+; ALL-CONVERT-NEXT:  %{{.+}} = call float @llvm.maximum.f32(float [[RED]], float %f)
+; ALL-CONVERT:  [[FMAXIMUM_NNAN:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>
+; ALL-CONVERT-NEXT:  [[RED:%.+]] = call nnan float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> [[FMAXIMUM_NNAN]])
+; ALL-CONVERT-NEXT:  %{{.+}} = call nnan float @llvm.maximum.f32(float [[RED]], float %f)
+; ALL-CONVERT:  [[FMAXIMUM_NNAN_NINF:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000>
+; ALL-CONVERT-NEXT:  [[RED:%.+]] = call nnan ninf float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> [[FMAXIMUM_NNAN_NINF]])
+; ALL-CONVERT-NEXT:  %{{.+}} = call nnan ninf float @llvm.maximum.f32(float [[RED]], float %f)
+
 ; ALL-CONVERT:  [[FADD:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>
 ; ALL-CONVERT-NEXT:  %{{.+}} = call float @llvm.vector.reduce.fadd.v4f32(float %f, <4 x float> [[FADD]])
 ; ALL-CONVERT:  [[FADD:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>
@@ -328,16 +357,22 @@ define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x float> %f0, <8 x
 ; LEGAL_LEGAL-NEXT:  ret void
 
 ; LEGAL_LEGAL: define void @test_vp_reduce_fp_v4(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) {
-; LEGAL_LEGAL-NEXT:  %r0 = call float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
-; LEGAL_LEGAL-NEXT:  %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
-; LEGAL_LEGAL-NEXT:  %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
-; LEGAL_LEGAL-NEXT:  %r3 = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
-; LEGAL_LEGAL-NEXT:  %r4 = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
-; LEGAL_LEGAL-NEXT:  %r5 = call nnan ninf float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
-; LEGAL_LEGAL-NEXT:  %r6 = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
-; LEGAL_LEGAL-NEXT:  %r7 = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
-; LEGAL_LEGAL-NEXT:  %r8 = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
-; LEGAL_LEGAL-NEXT:  %r9 = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r0 = call float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r3 = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r4 = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r5 = call nnan ninf float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r6 = call float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r7 = call nnan float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r8 = call nnan ninf float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r9 = call float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r10 = call nnan float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r11 = call nnan ninf float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r12 = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r13 = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r14 = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; LEGAL_LEGAL-NEXT: %r15 = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
 ; LEGAL_LEGAL-NEXT:  ret void
 
 ; LEGAL_LEGAL: define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x float> %f0, <8 x float> %f1, <8 x i1> %m, i32 %n) {
@@ -425,10 +460,16 @@ define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x float> %f0, <8 x
 ; DISCARD_LEGAL-NOT:  %r3 = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
 ; DISCARD_LEGAL-NOT:  %r4 = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
 ; DISCARD_LEGAL-NOT:  %r5 = call nnan ninf float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
-; DISCARD_LEGAL-NOT:  %r6 = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
-; DISCARD_LEGAL-NOT:  %r7 = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
-; DISCARD_LEGAL-NOT:  %r8 = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
-; DISCARD_LEGAL-NOT:  %r9 = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
+; DISCARD_LEGAL-NOT:  %r6 = call float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; DISCARD_LEGAL-NOT:  %r7 = call nnan float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; DISCARD_LEGAL-NOT:  %r8 = call nnan ninf float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; DISCARD_LEGAL-NOT:  %r9 = call float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; DISCARD_LEGAL-NOT:  %r10 = call nnan float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; DISCARD_LEGAL-NOT:  %r11 = call nnan ninf float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; DISCARD_LEGAL-NOT:  %r12 = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; DISCARD_LEGAL-NOT:  %r13 = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; DISCARD_LEGAL-NOT:  %r14 = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
+; DISCARD_LEGAL-NOT:  %r15 = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n)
 ; DISCARD_LEGAL:      ret void
 
 ; DISCARD_LEGAL: define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x float> %f0, <8 x float> %f1, <8 x i1> %m, i32 %n) {
@@ -501,6 +542,12 @@ define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x float> %f0, <8 x
 ; CONVERT_LEGAL-NOT:   %{{.+}} = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
 ; CONVERT_LEGAL-NOT:   %{{.+}} = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
 ; CONVERT_LEGAL-NOT:   %{{.+}} = call nnan ninf float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
+; CONVERT_LEGAL-NOT:   %{{.+}} = call float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> [[NEWM]], i32 4)
+; CONVERT_LEGAL-NOT:   %{{.+}} = call nnan float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
+; CONVERT_LEGAL-NOT:   %{{.+}} = call nnan ninf float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
+; CONVERT_LEGAL-NOT:   %{{.+}} = call float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
+; CONVERT_LEGAL-NOT:   %{{.+}} = call nnan float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
+; CONVERT_LEGAL-NOT:   %{{.+}} = call nnan ninf float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
 ; CONVERT_LEGAL-NOT:   %{{.+}} = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
 ; CONVERT_LEGAL-NOT:   %{{.+}} = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
 ; CONVERT_LEGAL-NOT:   %{{.+}} = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4)
diff --git a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
index 7cde034726e0..b95c2e24737a 100644
--- a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
@@ -4,37 +4,37 @@
 define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; LA64-LABEL: atomicrmw_uinc_wrap_i8:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    slli.d $a4, $a0, 3
+; LA64-NEXT:    slli.d $a3, $a0, 3
 ; LA64-NEXT:    bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT:    andi $a2, $a4, 24
+; LA64-NEXT:    andi $a2, $a3, 24
 ; LA64-NEXT:    ori $a5, $zero, 255
-; LA64-NEXT:    ld.w $a3, $a0, 0
-; LA64-NEXT:    sll.w $a4, $a5, $a4
-; LA64-NEXT:    nor $a4, $a4, $zero
+; LA64-NEXT:    ld.w $a4, $a0, 0
+; LA64-NEXT:    sll.w $a3, $a5, $a3
+; LA64-NEXT:    nor $a3, $a3, $zero
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB0_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB0_3 Depth 2
-; LA64-NEXT:    srl.w $a5, $a3, $a2
-; LA64-NEXT:    addi.w $a6, $a3, 0
-; LA64-NEXT:    andi $a7, $a5, 255
-; LA64-NEXT:    addi.d $a5, $a5, 1
-; LA64-NEXT:    sltu $a7, $a7, $a1
-; LA64-NEXT:    xori $a7, $a7, 1
-; LA64-NEXT:    masknez $a5, $a5, $a7
-; LA64-NEXT:    andi $a5, $a5, 255
-; LA64-NEXT:    sll.w $a5, $a5, $a2
-; LA64-NEXT:    and $a3, $a3, $a4
-; LA64-NEXT:    or $a5, $a3, $a5
+; LA64-NEXT:    move $a5, $a4
+; LA64-NEXT:    srl.w $a4, $a4, $a2
+; LA64-NEXT:    andi $a6, $a4, 255
+; LA64-NEXT:    addi.d $a4, $a4, 1
+; LA64-NEXT:    sltu $a6, $a6, $a1
+; LA64-NEXT:    xori $a6, $a6, 1
+; LA64-NEXT:    masknez $a4, $a4, $a6
+; LA64-NEXT:    andi $a4, $a4, 255
+; LA64-NEXT:    sll.w $a4, $a4, $a2
+; LA64-NEXT:    and $a6, $a5, $a3
+; LA64-NEXT:    or $a6, $a6, $a4
 ; LA64-NEXT:  .LBB0_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB0_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
-; LA64-NEXT:    ll.w $a3, $a0, 0
-; LA64-NEXT:    bne $a3, $a6, .LBB0_5
+; LA64-NEXT:    ll.w $a4, $a0, 0
+; LA64-NEXT:    bne $a4, $a5, .LBB0_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB0_3 Depth=2
-; LA64-NEXT:    move $a7, $a5
+; LA64-NEXT:    move $a7, $a6
 ; LA64-NEXT:    sc.w $a7, $a0, 0
 ; LA64-NEXT:    beqz $a7, .LBB0_3
 ; LA64-NEXT:    b .LBB0_6
@@ -43,9 +43,9 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB0_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; LA64-NEXT:    bne $a3, $a6, .LBB0_1
+; LA64-NEXT:    bne $a4, $a5, .LBB0_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64-NEXT:    srl.w $a0, $a3, $a2
+; LA64-NEXT:    srl.w $a0, $a4, $a2
 ; LA64-NEXT:    ret
   %result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst
   ret i8 %result
@@ -54,38 +54,38 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; LA64-LABEL: atomicrmw_uinc_wrap_i16:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    slli.d $a4, $a0, 3
+; LA64-NEXT:    slli.d $a3, $a0, 3
 ; LA64-NEXT:    bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT:    andi $a2, $a4, 24
-; LA64-NEXT:    lu12i.w $a3, 15
-; LA64-NEXT:    ori $a5, $a3, 4095
-; LA64-NEXT:    ld.w $a3, $a0, 0
-; LA64-NEXT:    sll.w $a4, $a5, $a4
-; LA64-NEXT:    nor $a4, $a4, $zero
+; LA64-NEXT:    andi $a2, $a3, 24
+; LA64-NEXT:    lu12i.w $a4, 15
+; LA64-NEXT:    ori $a5, $a4, 4095
+; LA64-NEXT:    ld.w $a4, $a0, 0
+; LA64-NEXT:    sll.w $a3, $a5, $a3
+; LA64-NEXT:    nor $a3, $a3, $zero
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB1_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB1_3 Depth 2
-; LA64-NEXT:    srl.w $a5, $a3, $a2
-; LA64-NEXT:    addi.w $a6, $a3, 0
-; LA64-NEXT:    bstrpick.d $a7, $a5, 15, 0
-; LA64-NEXT:    addi.d $a5, $a5, 1
-; LA64-NEXT:    sltu $a7, $a7, $a1
-; LA64-NEXT:    xori $a7, $a7, 1
-; LA64-NEXT:    masknez $a5, $a5, $a7
-; LA64-NEXT:    bstrpick.d $a5, $a5, 15, 0
-; LA64-NEXT:    sll.w $a5, $a5, $a2
-; LA64-NEXT:    and $a3, $a3, $a4
-; LA64-NEXT:    or $a5, $a3, $a5
+; LA64-NEXT:    move $a5, $a4
+; LA64-NEXT:    srl.w $a4, $a4, $a2
+; LA64-NEXT:    bstrpick.d $a6, $a4, 15, 0
+; LA64-NEXT:    addi.d $a4, $a4, 1
+; LA64-NEXT:    sltu $a6, $a6, $a1
+; LA64-NEXT:    xori $a6, $a6, 1
+; LA64-NEXT:    masknez $a4, $a4, $a6
+; LA64-NEXT:    bstrpick.d $a4, $a4, 15, 0
+; LA64-NEXT:    sll.w $a4, $a4, $a2
+; LA64-NEXT:    and $a6, $a5, $a3
+; LA64-NEXT:    or $a6, $a6, $a4
 ; LA64-NEXT:  .LBB1_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB1_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
-; LA64-NEXT:    ll.w $a3, $a0, 0
-; LA64-NEXT:    bne $a3, $a6, .LBB1_5
+; LA64-NEXT:    ll.w $a4, $a0, 0
+; LA64-NEXT:    bne $a4, $a5, .LBB1_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB1_3 Depth=2
-; LA64-NEXT:    move $a7, $a5
+; LA64-NEXT:    move $a7, $a6
 ; LA64-NEXT:    sc.w $a7, $a0, 0
 ; LA64-NEXT:    beqz $a7, .LBB1_3
 ; LA64-NEXT:    b .LBB1_6
@@ -94,9 +94,9 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB1_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB1_1 Depth=1
-; LA64-NEXT:    bne $a3, $a6, .LBB1_1
+; LA64-NEXT:    bne $a4, $a5, .LBB1_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64-NEXT:    srl.w $a0, $a3, $a2
+; LA64-NEXT:    srl.w $a0, $a4, $a2
 ; LA64-NEXT:    ret
   %result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst
   ret i16 %result
@@ -111,19 +111,19 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 ; LA64-NEXT:  .LBB2_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB2_3 Depth 2
-; LA64-NEXT:    addi.d $a3, $a2, 1
-; LA64-NEXT:    addi.w $a4, $a2, 0
-; LA64-NEXT:    sltu $a2, $a4, $a1
-; LA64-NEXT:    xori $a2, $a2, 1
-; LA64-NEXT:    masknez $a3, $a3, $a2
+; LA64-NEXT:    move $a3, $a2
+; LA64-NEXT:    addi.d $a2, $a2, 1
+; LA64-NEXT:    sltu $a4, $a3, $a1
+; LA64-NEXT:    xori $a4, $a4, 1
+; LA64-NEXT:    masknez $a4, $a2, $a4
 ; LA64-NEXT:  .LBB2_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB2_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
 ; LA64-NEXT:    ll.w $a2, $a0, 0
-; LA64-NEXT:    bne $a2, $a4, .LBB2_5
+; LA64-NEXT:    bne $a2, $a3, .LBB2_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB2_3 Depth=2
-; LA64-NEXT:    move $a5, $a3
+; LA64-NEXT:    move $a5, $a4
 ; LA64-NEXT:    sc.w $a5, $a0, 0
 ; LA64-NEXT:    beqz $a5, .LBB2_3
 ; LA64-NEXT:    b .LBB2_6
@@ -132,7 +132,7 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB2_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB2_1 Depth=1
-; LA64-NEXT:    bne $a2, $a4, .LBB2_1
+; LA64-NEXT:    bne $a2, $a3, .LBB2_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64-NEXT:    move $a0, $a2
 ; LA64-NEXT:    ret
@@ -180,42 +180,42 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; LA64-LABEL: atomicrmw_udec_wrap_i8:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    slli.d $a4, $a0, 3
+; LA64-NEXT:    slli.d $a3, $a0, 3
 ; LA64-NEXT:    bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT:    andi $a2, $a4, 24
-; LA64-NEXT:    ori $a5, $zero, 255
-; LA64-NEXT:    ld.w $a3, $a0, 0
-; LA64-NEXT:    sll.w $a4, $a5, $a4
-; LA64-NEXT:    nor $a4, $a4, $zero
-; LA64-NEXT:    andi $a5, $a1, 255
+; LA64-NEXT:    andi $a2, $a3, 24
+; LA64-NEXT:    ori $a4, $zero, 255
+; LA64-NEXT:    ld.w $a5, $a0, 0
+; LA64-NEXT:    sll.w $a3, $a4, $a3
+; LA64-NEXT:    nor $a3, $a3, $zero
+; LA64-NEXT:    andi $a4, $a1, 255
 ; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB4_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB4_3 Depth 2
-; LA64-NEXT:    srl.w $a6, $a3, $a2
-; LA64-NEXT:    addi.w $a7, $a3, 0
-; LA64-NEXT:    andi $t0, $a6, 255
-; LA64-NEXT:    addi.d $a6, $a6, -1
-; LA64-NEXT:    sltui $t1, $t0, 1
-; LA64-NEXT:    sltu $t0, $a5, $t0
-; LA64-NEXT:    masknez $a6, $a6, $t0
-; LA64-NEXT:    maskeqz $t0, $a1, $t0
-; LA64-NEXT:    or $a6, $t0, $a6
-; LA64-NEXT:    masknez $a6, $a6, $t1
-; LA64-NEXT:    maskeqz $t0, $a1, $t1
-; LA64-NEXT:    or $a6, $t0, $a6
-; LA64-NEXT:    andi $a6, $a6, 255
-; LA64-NEXT:    sll.w $a6, $a6, $a2
-; LA64-NEXT:    and $a3, $a3, $a4
-; LA64-NEXT:    or $a6, $a3, $a6
+; LA64-NEXT:    move $a6, $a5
+; LA64-NEXT:    srl.w $a5, $a5, $a2
+; LA64-NEXT:    andi $a7, $a5, 255
+; LA64-NEXT:    addi.d $a5, $a5, -1
+; LA64-NEXT:    sltui $t0, $a7, 1
+; LA64-NEXT:    sltu $a7, $a4, $a7
+; LA64-NEXT:    masknez $a5, $a5, $a7
+; LA64-NEXT:    maskeqz $a7, $a1, $a7
+; LA64-NEXT:    or $a5, $a7, $a5
+; LA64-NEXT:    masknez $a5, $a5, $t0
+; LA64-NEXT:    maskeqz $a7, $a1, $t0
+; LA64-NEXT:    or $a5, $a7, $a5
+; LA64-NEXT:    andi $a5, $a5, 255
+; LA64-NEXT:    sll.w $a5, $a5, $a2
+; LA64-NEXT:    and $a7, $a6, $a3
+; LA64-NEXT:    or $a7, $a7, $a5
 ; LA64-NEXT:  .LBB4_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB4_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
-; LA64-NEXT:    ll.w $a3, $a0, 0
-; LA64-NEXT:    bne $a3, $a7, .LBB4_5
+; LA64-NEXT:    ll.w $a5, $a0, 0
+; LA64-NEXT:    bne $a5, $a6, .LBB4_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB4_3 Depth=2
-; LA64-NEXT:    move $t0, $a6
+; LA64-NEXT:    move $t0, $a7
 ; LA64-NEXT:    sc.w $t0, $a0, 0
 ; LA64-NEXT:    beqz $t0, .LBB4_3
 ; LA64-NEXT:    b .LBB4_6
@@ -224,9 +224,9 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB4_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB4_1 Depth=1
-; LA64-NEXT:    bne $a3, $a7, .LBB4_1
+; LA64-NEXT:    bne $a5, $a6, .LBB4_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64-NEXT:    srl.w $a0, $a3, $a2
+; LA64-NEXT:    srl.w $a0, $a5, $a2
 ; LA64-NEXT:    ret
   %result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst
   ret i8 %result
@@ -235,43 +235,43 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; LA64-LABEL: atomicrmw_udec_wrap_i16:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    slli.d $a4, $a0, 3
+; LA64-NEXT:    slli.d $a3, $a0, 3
 ; LA64-NEXT:    bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT:    andi $a2, $a4, 24
-; LA64-NEXT:    lu12i.w $a3, 15
-; LA64-NEXT:    ori $a5, $a3, 4095
-; LA64-NEXT:    ld.w $a3, $a0, 0
-; LA64-NEXT:    sll.w $a4, $a5, $a4
-; LA64-NEXT:    nor $a4, $a4, $zero
-; LA64-NEXT:    bstrpick.d $a5, $a1, 15, 0
+; LA64-NEXT:    andi $a2, $a3, 24
+; LA64-NEXT:    lu12i.w $a4, 15
+; LA64-NEXT:    ori $a4, $a4, 4095
+; LA64-NEXT:    ld.w $a5, $a0, 0
+; LA64-NEXT:    sll.w $a3, $a4, $a3
+; LA64-NEXT:    nor $a3, $a3, $zero
+; LA64-NEXT:    bstrpick.d $a4, $a1, 15, 0
 ; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB5_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB5_3 Depth 2
-; LA64-NEXT:    srl.w $a6, $a3, $a2
-; LA64-NEXT:    addi.w $a7, $a3, 0
-; LA64-NEXT:    bstrpick.d $t0, $a6, 15, 0
-; LA64-NEXT:    addi.d $a6, $a6, -1
-; LA64-NEXT:    sltui $t1, $t0, 1
-; LA64-NEXT:    sltu $t0, $a5, $t0
-; LA64-NEXT:    masknez $a6, $a6, $t0
-; LA64-NEXT:    maskeqz $t0, $a1, $t0
-; LA64-NEXT:    or $a6, $t0, $a6
-; LA64-NEXT:    masknez $a6, $a6, $t1
-; LA64-NEXT:    maskeqz $t0, $a1, $t1
-; LA64-NEXT:    or $a6, $t0, $a6
-; LA64-NEXT:    bstrpick.d $a6, $a6, 15, 0
-; LA64-NEXT:    sll.w $a6, $a6, $a2
-; LA64-NEXT:    and $a3, $a3, $a4
-; LA64-NEXT:    or $a6, $a3, $a6
+; LA64-NEXT:    move $a6, $a5
+; LA64-NEXT:    srl.w $a5, $a5, $a2
+; LA64-NEXT:    bstrpick.d $a7, $a5, 15, 0
+; LA64-NEXT:    addi.d $a5, $a5, -1
+; LA64-NEXT:    sltui $t0, $a7, 1
+; LA64-NEXT:    sltu $a7, $a4, $a7
+; LA64-NEXT:    masknez $a5, $a5, $a7
+; LA64-NEXT:    maskeqz $a7, $a1, $a7
+; LA64-NEXT:    or $a5, $a7, $a5
+; LA64-NEXT:    masknez $a5, $a5, $t0
+; LA64-NEXT:    maskeqz $a7, $a1, $t0
+; LA64-NEXT:    or $a5, $a7, $a5
+; LA64-NEXT:    bstrpick.d $a5, $a5, 15, 0
+; LA64-NEXT:    sll.w $a5, $a5, $a2
+; LA64-NEXT:    and $a7, $a6, $a3
+; LA64-NEXT:    or $a7, $a7, $a5
 ; LA64-NEXT:  .LBB5_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB5_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
-; LA64-NEXT:    ll.w $a3, $a0, 0
-; LA64-NEXT:    bne $a3, $a7, .LBB5_5
+; LA64-NEXT:    ll.w $a5, $a0, 0
+; LA64-NEXT:    bne $a5, $a6, .LBB5_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB5_3 Depth=2
-; LA64-NEXT:    move $t0, $a6
+; LA64-NEXT:    move $t0, $a7
 ; LA64-NEXT:    sc.w $t0, $a0, 0
 ; LA64-NEXT:    beqz $t0, .LBB5_3
 ; LA64-NEXT:    b .LBB5_6
@@ -280,9 +280,9 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB5_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB5_1 Depth=1
-; LA64-NEXT:    bne $a3, $a7, .LBB5_1
+; LA64-NEXT:    bne $a5, $a6, .LBB5_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64-NEXT:    srl.w $a0, $a3, $a2
+; LA64-NEXT:    srl.w $a0, $a5, $a2
 ; LA64-NEXT:    ret
   %result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst
   ret i16 %result
@@ -297,24 +297,24 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
 ; LA64-NEXT:  .LBB6_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB6_3 Depth 2
-; LA64-NEXT:    addi.d $a4, $a2, -1
-; LA64-NEXT:    addi.w $a5, $a2, 0
-; LA64-NEXT:    sltui $a2, $a5, 1
-; LA64-NEXT:    sltu $a6, $a3, $a5
-; LA64-NEXT:    masknez $a4, $a4, $a6
+; LA64-NEXT:    move $a4, $a2
+; LA64-NEXT:    addi.d $a2, $a2, -1
+; LA64-NEXT:    sltui $a5, $a4, 1
+; LA64-NEXT:    sltu $a6, $a3, $a4
+; LA64-NEXT:    masknez $a2, $a2, $a6
 ; LA64-NEXT:    maskeqz $a6, $a1, $a6
-; LA64-NEXT:    or $a4, $a6, $a4
-; LA64-NEXT:    masknez $a4, $a4, $a2
-; LA64-NEXT:    maskeqz $a2, $a1, $a2
-; LA64-NEXT:    or $a4, $a2, $a4
+; LA64-NEXT:    or $a2, $a6, $a2
+; LA64-NEXT:    masknez $a2, $a2, $a5
+; LA64-NEXT:    maskeqz $a5, $a1, $a5
+; LA64-NEXT:    or $a5, $a5, $a2
 ; LA64-NEXT:  .LBB6_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB6_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
 ; LA64-NEXT:    ll.w $a2, $a0, 0
-; LA64-NEXT:    bne $a2, $a5, .LBB6_5
+; LA64-NEXT:    bne $a2, $a4, .LBB6_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB6_3 Depth=2
-; LA64-NEXT:    move $a6, $a4
+; LA64-NEXT:    move $a6, $a5
 ; LA64-NEXT:    sc.w $a6, $a0, 0
 ; LA64-NEXT:    beqz $a6, .LBB6_3
 ; LA64-NEXT:    b .LBB6_6
@@ -323,7 +323,7 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB6_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB6_1 Depth=1
-; LA64-NEXT:    bne $a2, $a5, .LBB6_1
+; LA64-NEXT:    bne $a2, $a4, .LBB6_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64-NEXT:    move $a0, $a2
 ; LA64-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/gep-imm.ll b/llvm/test/CodeGen/LoongArch/gep-imm.ll
index 0eef7e4517f3..c88d0b5a4543 100644
--- a/llvm/test/CodeGen/LoongArch/gep-imm.ll
+++ b/llvm/test/CodeGen/LoongArch/gep-imm.ll
@@ -7,19 +7,17 @@ define void @test(ptr %sp, ptr %t, i32 %n) {
 ; CHECK-NEXT:    ld.d $a0, $a0, 0
 ; CHECK-NEXT:    move $a3, $zero
 ; CHECK-NEXT:    addi.w $a2, $a2, 0
-; CHECK-NEXT:    addi.w $a4, $a3, 0
-; CHECK-NEXT:    bge $a4, $a2, .LBB0_2
+; CHECK-NEXT:    bge $a3, $a2, .LBB0_2
 ; CHECK-NEXT:    .p2align 4, , 16
 ; CHECK-NEXT:  .LBB0_1: # %while_body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    addi.d $a4, $a3, 1
+; CHECK-NEXT:    addi.w $a4, $a3, 1
 ; CHECK-NEXT:    stptr.w $a4, $a0, 8000
 ; CHECK-NEXT:    stptr.w $a3, $a0, 8004
 ; CHECK-NEXT:    stptr.w $a4, $a1, 8000
 ; CHECK-NEXT:    stptr.w $a3, $a1, 8004
 ; CHECK-NEXT:    move $a3, $a4
-; CHECK-NEXT:    addi.w $a4, $a3, 0
-; CHECK-NEXT:    blt $a4, $a2, .LBB0_1
+; CHECK-NEXT:    blt $a3, $a2, .LBB0_1
 ; CHECK-NEXT:  .LBB0_2: # %while_end
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/add.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/add.ll
index 2c504efca26d..709e0faeff90 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/add.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/add.ll
@@ -713,7 +713,7 @@ define signext i32 @add_i32_4080(i32 %x) {
 ;
 ; LA64-LABEL: add_i32_4080:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.w $a0, $a0, 2047
+; LA64-NEXT:    addi.d $a0, $a0, 2047
 ; LA64-NEXT:    addi.w $a0, $a0, 2033
 ; LA64-NEXT:    ret
   %add = add i32 %x, 4080
@@ -729,7 +729,7 @@ define signext i32 @add_i32_minus_4080(i32 %x) {
 ;
 ; LA64-LABEL: add_i32_minus_4080:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.w $a0, $a0, -2048
+; LA64-NEXT:    addi.d $a0, $a0, -2048
 ; LA64-NEXT:    addi.w $a0, $a0, -2032
 ; LA64-NEXT:    ret
   %add = add i32 %x, -4080
@@ -745,7 +745,7 @@ define signext i32 @add_i32_2048(i32 %x) {
 ;
 ; LA64-LABEL: add_i32_2048:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.w $a0, $a0, 2047
+; LA64-NEXT:    addi.d $a0, $a0, 2047
 ; LA64-NEXT:    addi.w $a0, $a0, 1
 ; LA64-NEXT:    ret
   %add = add i32 %x, 2048
@@ -761,7 +761,7 @@ define signext i32 @add_i32_4094(i32 %x) {
 ;
 ; LA64-LABEL: add_i32_4094:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.w $a0, $a0, 2047
+; LA64-NEXT:    addi.d $a0, $a0, 2047
 ; LA64-NEXT:    addi.w $a0, $a0, 2047
 ; LA64-NEXT:    ret
   %add = add i32 %x, 4094
@@ -777,7 +777,7 @@ define signext i32 @add_i32_minus_2049(i32 %x) {
 ;
 ; LA64-LABEL: add_i32_minus_2049:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.w $a0, $a0, -2048
+; LA64-NEXT:    addi.d $a0, $a0, -2048
 ; LA64-NEXT:    addi.w $a0, $a0, -1
 ; LA64-NEXT:    ret
   %add = add i32 %x, -2049
@@ -793,7 +793,7 @@ define signext i32 @add_i32_minus_4096(i32 %x) {
 ;
 ; LA64-LABEL: add_i32_minus_4096:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.w $a0, $a0, -2048
+; LA64-NEXT:    addi.d $a0, $a0, -2048
 ; LA64-NEXT:    addi.w $a0, $a0, -2048
 ; LA64-NEXT:    ret
   %add = add i32 %x, -4096
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll
index 06ad89972b84..495974a59ba6 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll
@@ -12,15 +12,12 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; LA64-NEXT:    sll.w $a1, $a1, $a3
 ; LA64-NEXT:    andi $a2, $a2, 255
 ; LA64-NEXT:    sll.w $a2, $a2, $a3
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a2, $a2, 0
-; LA64-NEXT:    addi.w $a3, $a4, 0
 ; LA64-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ll.w $a4, $a0, 0
-; LA64-NEXT:    and $a5, $a4, $a3
+; LA64-NEXT:    ll.w $a3, $a0, 0
+; LA64-NEXT:    and $a5, $a3, $a4
 ; LA64-NEXT:    bne $a5, $a1, .LBB0_3
 ; LA64-NEXT:  # %bb.2: # in Loop: Header=BB0_1 Depth=1
-; LA64-NEXT:    andn $a5, $a4, $a3
+; LA64-NEXT:    andn $a5, $a3, $a4
 ; LA64-NEXT:    or $a5, $a5, $a2
 ; LA64-NEXT:    sc.w $a5, $a0, 0
 ; LA64-NEXT:    beqz $a5, .LBB0_1
@@ -45,15 +42,12 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; LA64-NEXT:    sll.w $a1, $a1, $a3
 ; LA64-NEXT:    bstrpick.d $a2, $a2, 15, 0
 ; LA64-NEXT:    sll.w $a2, $a2, $a3
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a2, $a2, 0
-; LA64-NEXT:    addi.w $a3, $a4, 0
 ; LA64-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ll.w $a4, $a0, 0
-; LA64-NEXT:    and $a5, $a4, $a3
+; LA64-NEXT:    ll.w $a3, $a0, 0
+; LA64-NEXT:    and $a5, $a3, $a4
 ; LA64-NEXT:    bne $a5, $a1, .LBB1_3
 ; LA64-NEXT:  # %bb.2: # in Loop: Header=BB1_1 Depth=1
-; LA64-NEXT:    andn $a5, $a4, $a3
+; LA64-NEXT:    andn $a5, $a3, $a4
 ; LA64-NEXT:    or $a5, $a5, $a2
 ; LA64-NEXT:    sc.w $a5, $a0, 0
 ; LA64-NEXT:    beqz $a5, .LBB1_1
@@ -116,15 +110,12 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; LA64-NEXT:    sll.w $a1, $a1, $a3
 ; LA64-NEXT:    andi $a2, $a2, 255
 ; LA64-NEXT:    sll.w $a2, $a2, $a3
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a2, $a2, 0
-; LA64-NEXT:    addi.w $a3, $a4, 0
 ; LA64-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ll.w $a4, $a0, 0
-; LA64-NEXT:    and $a5, $a4, $a3
+; LA64-NEXT:    ll.w $a3, $a0, 0
+; LA64-NEXT:    and $a5, $a3, $a4
 ; LA64-NEXT:    bne $a5, $a1, .LBB4_3
 ; LA64-NEXT:  # %bb.2: # in Loop: Header=BB4_1 Depth=1
-; LA64-NEXT:    andn $a5, $a4, $a3
+; LA64-NEXT:    andn $a5, $a3, $a4
 ; LA64-NEXT:    or $a5, $a5, $a2
 ; LA64-NEXT:    sc.w $a5, $a0, 0
 ; LA64-NEXT:    beqz $a5, .LBB4_1
@@ -149,15 +140,12 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; LA64-NEXT:    sll.w $a1, $a1, $a3
 ; LA64-NEXT:    bstrpick.d $a2, $a2, 15, 0
 ; LA64-NEXT:    sll.w $a2, $a2, $a3
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a2, $a2, 0
-; LA64-NEXT:    addi.w $a3, $a4, 0
 ; LA64-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ll.w $a4, $a0, 0
-; LA64-NEXT:    and $a5, $a4, $a3
+; LA64-NEXT:    ll.w $a3, $a0, 0
+; LA64-NEXT:    and $a5, $a3, $a4
 ; LA64-NEXT:    bne $a5, $a1, .LBB5_3
 ; LA64-NEXT:  # %bb.2: # in Loop: Header=BB5_1 Depth=1
-; LA64-NEXT:    andn $a5, $a4, $a3
+; LA64-NEXT:    andn $a5, $a3, $a4
 ; LA64-NEXT:    or $a5, $a5, $a2
 ; LA64-NEXT:    sc.w $a5, $a0, 0
 ; LA64-NEXT:    beqz $a5, .LBB5_1
@@ -220,9 +208,6 @@ define i8 @cmpxchg_i8_acquire_acquire_reti8(ptr %ptr, i8 %cmp, i8 %val) nounwind
 ; LA64-NEXT:    sll.w $a1, $a1, $a3
 ; LA64-NEXT:    andi $a2, $a2, 255
 ; LA64-NEXT:    sll.w $a2, $a2, $a3
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a2, $a2, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:  .LBB8_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a5, $a0, 0
 ; LA64-NEXT:    and $a6, $a5, $a4
@@ -255,9 +240,6 @@ define i16 @cmpxchg_i16_acquire_acquire_reti16(ptr %ptr, i16 %cmp, i16 %val) nou
 ; LA64-NEXT:    sll.w $a1, $a1, $a3
 ; LA64-NEXT:    bstrpick.d $a2, $a2, 15, 0
 ; LA64-NEXT:    sll.w $a2, $a2, $a3
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a2, $a2, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a5, $a0, 0
 ; LA64-NEXT:    and $a6, $a5, $a4
@@ -332,24 +314,20 @@ define i1 @cmpxchg_i8_acquire_acquire_reti1(ptr %ptr, i8 %cmp, i8 %val) nounwind
 ; LA64-NEXT:    sll.w $a1, $a1, $a3
 ; LA64-NEXT:    andi $a2, $a2, 255
 ; LA64-NEXT:    sll.w $a2, $a2, $a3
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a2, $a2, 0
-; LA64-NEXT:    addi.w $a3, $a4, 0
 ; LA64-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ll.w $a5, $a0, 0
-; LA64-NEXT:    and $a6, $a5, $a3
-; LA64-NEXT:    bne $a6, $a1, .LBB12_3
+; LA64-NEXT:    ll.w $a3, $a0, 0
+; LA64-NEXT:    and $a5, $a3, $a4
+; LA64-NEXT:    bne $a5, $a1, .LBB12_3
 ; LA64-NEXT:  # %bb.2: # in Loop: Header=BB12_1 Depth=1
-; LA64-NEXT:    andn $a6, $a5, $a3
-; LA64-NEXT:    or $a6, $a6, $a2
-; LA64-NEXT:    sc.w $a6, $a0, 0
-; LA64-NEXT:    beqz $a6, .LBB12_1
+; LA64-NEXT:    andn $a5, $a3, $a4
+; LA64-NEXT:    or $a5, $a5, $a2
+; LA64-NEXT:    sc.w $a5, $a0, 0
+; LA64-NEXT:    beqz $a5, .LBB12_1
 ; LA64-NEXT:    b .LBB12_4
 ; LA64-NEXT:  .LBB12_3:
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB12_4:
-; LA64-NEXT:    and $a0, $a5, $a4
-; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    and $a0, $a3, $a4
 ; LA64-NEXT:    xor $a0, $a1, $a0
 ; LA64-NEXT:    sltui $a0, $a0, 1
 ; LA64-NEXT:    ret
@@ -370,24 +348,20 @@ define i1 @cmpxchg_i16_acquire_acquire_reti1(ptr %ptr, i16 %cmp, i16 %val) nounw
 ; LA64-NEXT:    sll.w $a1, $a1, $a3
 ; LA64-NEXT:    bstrpick.d $a2, $a2, 15, 0
 ; LA64-NEXT:    sll.w $a2, $a2, $a3
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a2, $a2, 0
-; LA64-NEXT:    addi.w $a3, $a4, 0
 ; LA64-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ll.w $a5, $a0, 0
-; LA64-NEXT:    and $a6, $a5, $a3
-; LA64-NEXT:    bne $a6, $a1, .LBB13_3
+; LA64-NEXT:    ll.w $a3, $a0, 0
+; LA64-NEXT:    and $a5, $a3, $a4
+; LA64-NEXT:    bne $a5, $a1, .LBB13_3
 ; LA64-NEXT:  # %bb.2: # in Loop: Header=BB13_1 Depth=1
-; LA64-NEXT:    andn $a6, $a5, $a3
-; LA64-NEXT:    or $a6, $a6, $a2
-; LA64-NEXT:    sc.w $a6, $a0, 0
-; LA64-NEXT:    beqz $a6, .LBB13_1
+; LA64-NEXT:    andn $a5, $a3, $a4
+; LA64-NEXT:    or $a5, $a5, $a2
+; LA64-NEXT:    sc.w $a5, $a0, 0
+; LA64-NEXT:    beqz $a5, .LBB13_1
 ; LA64-NEXT:    b .LBB13_4
 ; LA64-NEXT:  .LBB13_3:
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB13_4:
-; LA64-NEXT:    and $a0, $a5, $a4
-; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    and $a0, $a3, $a4
 ; LA64-NEXT:    xor $a0, $a1, $a0
 ; LA64-NEXT:    sltui $a0, $a0, 1
 ; LA64-NEXT:    ret
@@ -452,15 +426,12 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind
 ; LA64-NEXT:    sll.w $a1, $a1, $a3
 ; LA64-NEXT:    andi $a2, $a2, 255
 ; LA64-NEXT:    sll.w $a2, $a2, $a3
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a2, $a2, 0
-; LA64-NEXT:    addi.w $a3, $a4, 0
 ; LA64-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ll.w $a4, $a0, 0
-; LA64-NEXT:    and $a5, $a4, $a3
+; LA64-NEXT:    ll.w $a3, $a0, 0
+; LA64-NEXT:    and $a5, $a3, $a4
 ; LA64-NEXT:    bne $a5, $a1, .LBB16_3
 ; LA64-NEXT:  # %bb.2: # in Loop: Header=BB16_1 Depth=1
-; LA64-NEXT:    andn $a5, $a4, $a3
+; LA64-NEXT:    andn $a5, $a3, $a4
 ; LA64-NEXT:    or $a5, $a5, $a2
 ; LA64-NEXT:    sc.w $a5, $a0, 0
 ; LA64-NEXT:    beqz $a5, .LBB16_1
@@ -485,15 +456,12 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw
 ; LA64-NEXT:    sll.w $a1, $a1, $a3
 ; LA64-NEXT:    bstrpick.d $a2, $a2, 15, 0
 ; LA64-NEXT:    sll.w $a2, $a2, $a3
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a2, $a2, 0
-; LA64-NEXT:    addi.w $a3, $a4, 0
 ; LA64-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ll.w $a4, $a0, 0
-; LA64-NEXT:    and $a5, $a4, $a3
+; LA64-NEXT:    ll.w $a3, $a0, 0
+; LA64-NEXT:    and $a5, $a3, $a4
 ; LA64-NEXT:    bne $a5, $a1, .LBB17_3
 ; LA64-NEXT:  # %bb.2: # in Loop: Header=BB17_1 Depth=1
-; LA64-NEXT:    andn $a5, $a4, $a3
+; LA64-NEXT:    andn $a5, $a3, $a4
 ; LA64-NEXT:    or $a5, $a5, $a2
 ; LA64-NEXT:    sc.w $a5, $a0, 0
 ; LA64-NEXT:    beqz $a5, .LBB17_1
@@ -556,9 +524,6 @@ define i8 @cmpxchg_i8_monotonic_monotonic_reti8(ptr %ptr, i8 %cmp, i8 %val) noun
 ; LA64-NEXT:    sll.w $a1, $a1, $a3
 ; LA64-NEXT:    andi $a2, $a2, 255
 ; LA64-NEXT:    sll.w $a2, $a2, $a3
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a2, $a2, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:  .LBB20_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a5, $a0, 0
 ; LA64-NEXT:    and $a6, $a5, $a4
@@ -591,9 +556,6 @@ define i16 @cmpxchg_i16_monotonic_monotonic_reti16(ptr %ptr, i16 %cmp, i16 %val)
 ; LA64-NEXT:    sll.w $a1, $a1, $a3
 ; LA64-NEXT:    bstrpick.d $a2, $a2, 15, 0
 ; LA64-NEXT:    sll.w $a2, $a2, $a3
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a2, $a2, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a5, $a0, 0
 ; LA64-NEXT:    and $a6, $a5, $a4
@@ -668,24 +630,20 @@ define i1 @cmpxchg_i8_monotonic_monotonic_reti1(ptr %ptr, i8 %cmp, i8 %val) noun
 ; LA64-NEXT:    sll.w $a1, $a1, $a3
 ; LA64-NEXT:    andi $a2, $a2, 255
 ; LA64-NEXT:    sll.w $a2, $a2, $a3
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a2, $a2, 0
-; LA64-NEXT:    addi.w $a3, $a4, 0
 ; LA64-NEXT:  .LBB24_1: # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ll.w $a5, $a0, 0
-; LA64-NEXT:    and $a6, $a5, $a3
-; LA64-NEXT:    bne $a6, $a1, .LBB24_3
+; LA64-NEXT:    ll.w $a3, $a0, 0
+; LA64-NEXT:    and $a5, $a3, $a4
+; LA64-NEXT:    bne $a5, $a1, .LBB24_3
 ; LA64-NEXT:  # %bb.2: # in Loop: Header=BB24_1 Depth=1
-; LA64-NEXT:    andn $a6, $a5, $a3
-; LA64-NEXT:    or $a6, $a6, $a2
-; LA64-NEXT:    sc.w $a6, $a0, 0
-; LA64-NEXT:    beqz $a6, .LBB24_1
+; LA64-NEXT:    andn $a5, $a3, $a4
+; LA64-NEXT:    or $a5, $a5, $a2
+; LA64-NEXT:    sc.w $a5, $a0, 0
+; LA64-NEXT:    beqz $a5, .LBB24_1
 ; LA64-NEXT:    b .LBB24_4
 ; LA64-NEXT:  .LBB24_3:
 ; LA64-NEXT:    dbar 1792
 ; LA64-NEXT:  .LBB24_4:
-; LA64-NEXT:    and $a0, $a5, $a4
-; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    and $a0, $a3, $a4
 ; LA64-NEXT:    xor $a0, $a1, $a0
 ; LA64-NEXT:    sltui $a0, $a0, 1
 ; LA64-NEXT:    ret
@@ -706,24 +664,20 @@ define i1 @cmpxchg_i16_monotonic_monotonic_reti1(ptr %ptr, i16 %cmp, i16 %val) n
 ; LA64-NEXT:    sll.w $a1, $a1, $a3
 ; LA64-NEXT:    bstrpick.d $a2, $a2, 15, 0
 ; LA64-NEXT:    sll.w $a2, $a2, $a3
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a2, $a2, 0
-; LA64-NEXT:    addi.w $a3, $a4, 0
 ; LA64-NEXT:  .LBB25_1: # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ll.w $a5, $a0, 0
-; LA64-NEXT:    and $a6, $a5, $a3
-; LA64-NEXT:    bne $a6, $a1, .LBB25_3
+; LA64-NEXT:    ll.w $a3, $a0, 0
+; LA64-NEXT:    and $a5, $a3, $a4
+; LA64-NEXT:    bne $a5, $a1, .LBB25_3
 ; LA64-NEXT:  # %bb.2: # in Loop: Header=BB25_1 Depth=1
-; LA64-NEXT:    andn $a6, $a5, $a3
-; LA64-NEXT:    or $a6, $a6, $a2
-; LA64-NEXT:    sc.w $a6, $a0, 0
-; LA64-NEXT:    beqz $a6, .LBB25_1
+; LA64-NEXT:    andn $a5, $a3, $a4
+; LA64-NEXT:    or $a5, $a5, $a2
+; LA64-NEXT:    sc.w $a5, $a0, 0
+; LA64-NEXT:    beqz $a5, .LBB25_1
 ; LA64-NEXT:    b .LBB25_4
 ; LA64-NEXT:  .LBB25_3:
 ; LA64-NEXT:    dbar 1792
 ; LA64-NEXT:  .LBB25_4:
-; LA64-NEXT:    and $a0, $a5, $a4
-; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    and $a0, $a3, $a4
 ; LA64-NEXT:    xor $a0, $a1, $a0
 ; LA64-NEXT:    sltui $a0, $a0, 1
 ; LA64-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
index ba08790fb7cb..81cc29419a0e 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
@@ -16,7 +16,6 @@ define float @float_fadd_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB0_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB0_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -51,7 +50,6 @@ define float @float_fadd_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB0_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB0_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -90,7 +88,6 @@ define float @float_fsub_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB1_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB1_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -125,7 +122,6 @@ define float @float_fsub_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB1_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB1_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -165,7 +161,6 @@ define float @float_fmin_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB2_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB2_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -201,7 +196,6 @@ define float @float_fmin_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB2_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB2_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -241,7 +235,6 @@ define float @float_fmax_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB3_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB3_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -277,7 +270,6 @@ define float @float_fmax_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB3_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB3_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -706,7 +698,6 @@ define float @float_fadd_release(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB8_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB8_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -741,7 +732,6 @@ define float @float_fadd_release(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB8_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB8_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -780,7 +770,6 @@ define float @float_fsub_release(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB9_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB9_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -815,7 +804,6 @@ define float @float_fsub_release(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB9_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB9_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -855,7 +843,6 @@ define float @float_fmin_release(ptr %p) nounwind {
 ; LA64F-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB10_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB10_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -891,7 +878,6 @@ define float @float_fmin_release(ptr %p) nounwind {
 ; LA64D-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB10_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB10_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -931,7 +917,6 @@ define float @float_fmax_release(ptr %p) nounwind {
 ; LA64F-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB11_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB11_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -967,7 +952,6 @@ define float @float_fmax_release(ptr %p) nounwind {
 ; LA64D-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB11_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB11_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1396,7 +1380,6 @@ define float @float_fadd_acq_rel(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB16_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB16_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1431,7 +1414,6 @@ define float @float_fadd_acq_rel(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB16_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB16_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1470,7 +1452,6 @@ define float @float_fsub_acq_rel(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB17_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB17_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1505,7 +1486,6 @@ define float @float_fsub_acq_rel(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB17_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB17_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1545,7 +1525,6 @@ define float @float_fmin_acq_rel(ptr %p) nounwind {
 ; LA64F-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB18_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB18_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1581,7 +1560,6 @@ define float @float_fmin_acq_rel(ptr %p) nounwind {
 ; LA64D-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB18_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB18_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1621,7 +1599,6 @@ define float @float_fmax_acq_rel(ptr %p) nounwind {
 ; LA64F-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB19_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB19_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1657,7 +1634,6 @@ define float @float_fmax_acq_rel(ptr %p) nounwind {
 ; LA64D-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB19_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB19_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2110,7 +2086,6 @@ define float @float_fadd_seq_cst(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB24_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB24_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2145,7 +2120,6 @@ define float @float_fadd_seq_cst(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB24_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB24_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2184,7 +2158,6 @@ define float @float_fsub_seq_cst(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB25_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB25_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2219,7 +2192,6 @@ define float @float_fsub_seq_cst(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB25_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB25_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2259,7 +2231,6 @@ define float @float_fmin_seq_cst(ptr %p) nounwind {
 ; LA64F-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB26_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB26_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2295,7 +2266,6 @@ define float @float_fmin_seq_cst(ptr %p) nounwind {
 ; LA64D-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB26_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB26_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2335,7 +2305,6 @@ define float @float_fmax_seq_cst(ptr %p) nounwind {
 ; LA64F-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB27_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB27_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2371,7 +2340,6 @@ define float @float_fmax_seq_cst(ptr %p) nounwind {
 ; LA64D-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB27_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB27_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2800,7 +2768,6 @@ define float @float_fadd_monotonic(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB32_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB32_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2835,7 +2802,6 @@ define float @float_fadd_monotonic(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB32_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB32_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2874,7 +2840,6 @@ define float @float_fsub_monotonic(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB33_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB33_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2909,7 +2874,6 @@ define float @float_fsub_monotonic(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB33_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB33_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2949,7 +2913,6 @@ define float @float_fmin_monotonic(ptr %p) nounwind {
 ; LA64F-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB34_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB34_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2985,7 +2948,6 @@ define float @float_fmin_monotonic(ptr %p) nounwind {
 ; LA64D-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB34_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB34_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -3025,7 +2987,6 @@ define float @float_fmax_monotonic(ptr %p) nounwind {
 ; LA64F-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
-; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB35_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB35_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -3061,7 +3022,6 @@ define float @float_fmax_monotonic(ptr %p) nounwind {
 ; LA64D-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
-; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB35_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB35_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll
index c36734e11f01..794242f45fdb 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll
@@ -13,8 +13,6 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -44,8 +42,6 @@ define i16 @atomicrmw_umax_i16_acquire(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -94,8 +90,6 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -125,8 +119,6 @@ define i16 @atomicrmw_umin_i16_acquire(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -176,8 +168,6 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.b $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    xori $a3, $a3, 56
 ; LA64-NEXT:  .LBB8_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a5, $a0, 0
@@ -211,8 +201,6 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.h $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    ori $a5, $zero, 48
 ; LA64-NEXT:    sub.d $a3, $a5, $a3
 ; LA64-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
@@ -266,8 +254,6 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.b $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    xori $a3, $a3, 56
 ; LA64-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a5, $a0, 0
@@ -301,8 +287,6 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.h $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    ori $a5, $zero, 48
 ; LA64-NEXT:    sub.d $a3, $a5, $a3
 ; LA64-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
@@ -355,8 +339,6 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -386,8 +368,6 @@ define i16 @atomicrmw_umax_i16_release(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -436,8 +416,6 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB20_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -467,8 +445,6 @@ define i16 @atomicrmw_umin_i16_release(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -518,8 +494,6 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.b $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    xori $a3, $a3, 56
 ; LA64-NEXT:  .LBB24_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a5, $a0, 0
@@ -553,8 +527,6 @@ define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.h $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    ori $a5, $zero, 48
 ; LA64-NEXT:    sub.d $a3, $a5, $a3
 ; LA64-NEXT:  .LBB25_1: # =>This Inner Loop Header: Depth=1
@@ -608,8 +580,6 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.b $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    xori $a3, $a3, 56
 ; LA64-NEXT:  .LBB28_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a5, $a0, 0
@@ -643,8 +613,6 @@ define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.h $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    ori $a5, $zero, 48
 ; LA64-NEXT:    sub.d $a3, $a5, $a3
 ; LA64-NEXT:  .LBB29_1: # =>This Inner Loop Header: Depth=1
@@ -697,8 +665,6 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -728,8 +694,6 @@ define i16 @atomicrmw_umax_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -778,8 +742,6 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB36_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -809,8 +771,6 @@ define i16 @atomicrmw_umin_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB37_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -860,8 +820,6 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.b $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    xori $a3, $a3, 56
 ; LA64-NEXT:  .LBB40_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a5, $a0, 0
@@ -895,8 +853,6 @@ define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.h $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    ori $a5, $zero, 48
 ; LA64-NEXT:    sub.d $a3, $a5, $a3
 ; LA64-NEXT:  .LBB41_1: # =>This Inner Loop Header: Depth=1
@@ -950,8 +906,6 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.b $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    xori $a3, $a3, 56
 ; LA64-NEXT:  .LBB44_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a5, $a0, 0
@@ -985,8 +939,6 @@ define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.h $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    ori $a5, $zero, 48
 ; LA64-NEXT:    sub.d $a3, $a5, $a3
 ; LA64-NEXT:  .LBB45_1: # =>This Inner Loop Header: Depth=1
@@ -1039,8 +991,6 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -1070,8 +1020,6 @@ define i16 @atomicrmw_umax_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB49_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -1120,8 +1068,6 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB52_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -1151,8 +1097,6 @@ define i16 @atomicrmw_umin_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB53_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -1202,8 +1146,6 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.b $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    xori $a3, $a3, 56
 ; LA64-NEXT:  .LBB56_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a5, $a0, 0
@@ -1237,8 +1179,6 @@ define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.h $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    ori $a5, $zero, 48
 ; LA64-NEXT:    sub.d $a3, $a5, $a3
 ; LA64-NEXT:  .LBB57_1: # =>This Inner Loop Header: Depth=1
@@ -1292,8 +1232,6 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.b $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    xori $a3, $a3, 56
 ; LA64-NEXT:  .LBB60_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a5, $a0, 0
@@ -1327,8 +1265,6 @@ define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.h $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    ori $a5, $zero, 48
 ; LA64-NEXT:    sub.d $a3, $a5, $a3
 ; LA64-NEXT:  .LBB61_1: # =>This Inner Loop Header: Depth=1
@@ -1381,8 +1317,6 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB64_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -1412,8 +1346,6 @@ define i16 @atomicrmw_umax_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB65_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -1462,8 +1394,6 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB68_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -1493,8 +1423,6 @@ define i16 @atomicrmw_umin_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB69_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a6, $a4, $a3
@@ -1544,8 +1472,6 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.b $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    xori $a3, $a3, 56
 ; LA64-NEXT:  .LBB72_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a5, $a0, 0
@@ -1579,8 +1505,6 @@ define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.h $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    ori $a5, $zero, 48
 ; LA64-NEXT:    sub.d $a3, $a5, $a3
 ; LA64-NEXT:  .LBB73_1: # =>This Inner Loop Header: Depth=1
@@ -1634,8 +1558,6 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.b $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    xori $a3, $a3, 56
 ; LA64-NEXT:  .LBB76_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a5, $a0, 0
@@ -1669,8 +1591,6 @@ define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    ext.w.h $a1, $a1
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a4, $a4, 0
 ; LA64-NEXT:    ori $a5, $zero, 48
 ; LA64-NEXT:    sub.d $a3, $a5, $a3
 ; LA64-NEXT:  .LBB77_1: # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll
index 4669065114f0..9b83b4c9535e 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll
@@ -31,8 +31,6 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    addi.w $a5, $a1, 0
@@ -139,8 +137,6 @@ define i16 @atomicrmw_xchg_i16_acquire(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    addi.w $a5, $a1, 0
@@ -290,8 +286,6 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB8_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    add.w $a5, $a4, $a1
@@ -338,8 +332,6 @@ define i16 @atomicrmw_add_i16_acquire(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    add.w $a5, $a4, $a1
@@ -425,8 +417,6 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    sub.w $a5, $a4, $a1
@@ -473,8 +463,6 @@ define i16 @atomicrmw_sub_i16_acquire(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    sub.w $a5, $a4, $a1
@@ -563,8 +551,6 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a5, $a4, $a1
@@ -613,8 +599,6 @@ define i16 @atomicrmw_nand_i16_acquire(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a5, $a4, $a1
@@ -1025,8 +1009,6 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    addi.w $a5, $a1, 0
@@ -1133,8 +1115,6 @@ define i16 @atomicrmw_xchg_i16_release(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB35_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    addi.w $a5, $a1, 0
@@ -1284,8 +1264,6 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB40_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    add.w $a5, $a4, $a1
@@ -1332,8 +1310,6 @@ define i16 @atomicrmw_add_i16_release(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB41_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    add.w $a5, $a4, $a1
@@ -1419,8 +1395,6 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB44_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    sub.w $a5, $a4, $a1
@@ -1467,8 +1441,6 @@ define i16 @atomicrmw_sub_i16_release(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB45_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    sub.w $a5, $a4, $a1
@@ -1557,8 +1529,6 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a5, $a4, $a1
@@ -1607,8 +1577,6 @@ define i16 @atomicrmw_nand_i16_release(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB49_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a5, $a4, $a1
@@ -2019,8 +1987,6 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB64_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    addi.w $a5, $a1, 0
@@ -2127,8 +2093,6 @@ define i16 @atomicrmw_xchg_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB67_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    addi.w $a5, $a1, 0
@@ -2278,8 +2242,6 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB72_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    add.w $a5, $a4, $a1
@@ -2326,8 +2288,6 @@ define i16 @atomicrmw_add_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB73_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    add.w $a5, $a4, $a1
@@ -2413,8 +2373,6 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB76_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    sub.w $a5, $a4, $a1
@@ -2461,8 +2419,6 @@ define i16 @atomicrmw_sub_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB77_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    sub.w $a5, $a4, $a1
@@ -2551,8 +2507,6 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB80_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a5, $a4, $a1
@@ -2601,8 +2555,6 @@ define i16 @atomicrmw_nand_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB81_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a5, $a4, $a1
@@ -3013,8 +2965,6 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB96_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    addi.w $a5, $a1, 0
@@ -3121,8 +3071,6 @@ define i16 @atomicrmw_xchg_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB99_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    addi.w $a5, $a1, 0
@@ -3272,8 +3220,6 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB104_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    add.w $a5, $a4, $a1
@@ -3320,8 +3266,6 @@ define i16 @atomicrmw_add_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB105_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    add.w $a5, $a4, $a1
@@ -3407,8 +3351,6 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB108_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    sub.w $a5, $a4, $a1
@@ -3455,8 +3397,6 @@ define i16 @atomicrmw_sub_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB109_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    sub.w $a5, $a4, $a1
@@ -3545,8 +3485,6 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB112_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a5, $a4, $a1
@@ -3595,8 +3533,6 @@ define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB113_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a5, $a4, $a1
@@ -4007,8 +3943,6 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB128_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    addi.w $a5, $a1, 0
@@ -4115,8 +4049,6 @@ define i16 @atomicrmw_xchg_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB131_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    addi.w $a5, $a1, 0
@@ -4266,8 +4198,6 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB136_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    add.w $a5, $a4, $a1
@@ -4314,8 +4244,6 @@ define i16 @atomicrmw_add_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB137_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    add.w $a5, $a4, $a1
@@ -4401,8 +4329,6 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB140_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    sub.w $a5, $a4, $a1
@@ -4449,8 +4375,6 @@ define i16 @atomicrmw_sub_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB141_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    sub.w $a5, $a4, $a1
@@ -4539,8 +4463,6 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB144_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a5, $a4, $a1
@@ -4589,8 +4511,6 @@ define i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; LA64-NEXT:    sll.w $a3, $a3, $a2
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    sll.w $a1, $a1, $a2
-; LA64-NEXT:    addi.w $a1, $a1, 0
-; LA64-NEXT:    addi.w $a3, $a3, 0
 ; LA64-NEXT:  .LBB145_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a4, $a0, 0
 ; LA64-NEXT:    and $a5, $a4, $a1
diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
index a31eb8d11a35..803985fde215 100644
--- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
@@ -1,174 +1,175 @@
 ;; When EXPENSIVE_CHECKS are enabled, the machine verifier appears between each
 ;; pass. Ignore it with 'grep -v'.
 ; RUN: llc --mtriple=loongarch32 -O1 --debug-pass=Structure %s -o /dev/null 2>&1 | \
-; RUN:   grep -v "Verify generated machine code" | FileCheck %s
+; RUN:   grep -v "Verify generated machine code" | FileCheck %s --check-prefix=LAXX
 ; RUN: llc --mtriple=loongarch32 -O2 --debug-pass=Structure %s -o /dev/null 2>&1 | \
-; RUN:   grep -v "Verify generated machine code" | FileCheck %s
+; RUN:   grep -v "Verify generated machine code" | FileCheck %s --check-prefix=LAXX
 ; RUN: llc --mtriple=loongarch32 -O3 --debug-pass=Structure %s -o /dev/null 2>&1 | \
-; RUN:   grep -v "Verify generated machine code" | FileCheck %s
+; RUN:   grep -v "Verify generated machine code" | FileCheck %s --check-prefix=LAXX
 ; RUN: llc --mtriple=loongarch64 -O1 --debug-pass=Structure %s -o /dev/null 2>&1 | \
-; RUN:   grep -v "Verify generated machine code" | FileCheck %s
+; RUN:   grep -v "Verify generated machine code" | FileCheck %s --check-prefixes=LAXX,LA64
 ; RUN: llc --mtriple=loongarch64 -O2 --debug-pass=Structure %s -o /dev/null 2>&1 | \
-; RUN:   grep -v "Verify generated machine code" | FileCheck %s
+; RUN:   grep -v "Verify generated machine code" | FileCheck %s --check-prefixes=LAXX,LA64
 ; RUN: llc --mtriple=loongarch64 -O3 --debug-pass=Structure %s -o /dev/null 2>&1 | \
-; RUN:   grep -v "Verify generated machine code" | FileCheck %s
+; RUN:   grep -v "Verify generated machine code" | FileCheck %s --check-prefixes=LAXX,LA64
 
 ; REQUIRES: asserts
 
-; CHECK-LABEL: Pass Arguments:
-; CHECK-NEXT: Target Library Information
-; CHECK-NEXT: Target Pass Configuration
-; CHECK-NEXT: Machine Module Information
-; CHECK-NEXT: Target Transform Information
-; CHECK-NEXT: Type-Based Alias Analysis
-; CHECK-NEXT: Scoped NoAlias Alias Analysis
-; CHECK-NEXT: Assumption Cache Tracker
-; CHECK-NEXT: Profile summary info
-; CHECK-NEXT: Create Garbage Collector Module Metadata
-; CHECK-NEXT: Machine Branch Probability Analysis
-; CHECK-NEXT: Default Regalloc Eviction Advisor
-; CHECK-NEXT: Default Regalloc Priority Advisor
-; CHECK-NEXT:   ModulePass Manager
-; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
-; CHECK-NEXT:     FunctionPass Manager
-; CHECK-NEXT:       Expand large div/rem
-; CHECK-NEXT:       Expand large fp convert
-; CHECK-NEXT:       Expand Atomic instructions
-; CHECK-NEXT:       Module Verifier
-; CHECK-NEXT:       Dominator Tree Construction
-; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Canonicalize natural loops
-; CHECK-NEXT:       Scalar Evolution Analysis
-; CHECK-NEXT:       Loop Pass Manager
-; CHECK-NEXT:         Canonicalize Freeze Instructions in Loops
-; CHECK-NEXT:         Induction Variable Users
-; CHECK-NEXT:         Loop Strength Reduction
-; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:       Function Alias Analysis Results
-; CHECK-NEXT:       Merge contiguous icmps into a memcmp
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Lazy Branch Probability Analysis
-; CHECK-NEXT:       Lazy Block Frequency Analysis
-; CHECK-NEXT:       Expand memcmp() to load/stores
-; CHECK-NEXT:       Lower Garbage Collection Instructions
-; CHECK-NEXT:       Shadow Stack GC Lowering
-; CHECK-NEXT:       Lower constant intrinsics
-; CHECK-NEXT:       Remove unreachable blocks from the CFG
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Post-Dominator Tree Construction
-; CHECK-NEXT:       Branch Probability Analysis
-; CHECK-NEXT:       Block Frequency Analysis
-; CHECK-NEXT:       Constant Hoisting
-; CHECK-NEXT:       Replace intrinsics with calls to vector library
-; CHECK-NEXT:       Partially inline calls to library functions
-; CHECK-NEXT:       Expand vector predication intrinsics
-; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
-; CHECK-NEXT:       Expand reduction intrinsics
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       TLS Variable Hoist
-; CHECK-NEXT:       CodeGen Prepare
-; CHECK-NEXT:       Dominator Tree Construction
-; CHECK-NEXT:       Exception handling preparation
-; CHECK-NEXT:       Prepare callbr
-; CHECK-NEXT:       Safe Stack instrumentation pass
-; CHECK-NEXT:       Insert stack protectors
-; CHECK-NEXT:       Module Verifier
-; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:       Function Alias Analysis Results
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Post-Dominator Tree Construction
-; CHECK-NEXT:       Branch Probability Analysis
-; CHECK-NEXT:       Assignment Tracking Analysis
-; CHECK-NEXT:       Lazy Branch Probability Analysis
-; CHECK-NEXT:       Lazy Block Frequency Analysis
-; CHECK-NEXT:       LoongArch DAG->DAG Pattern Instruction Selection
-; CHECK-NEXT:       Finalize ISel and expand pseudo-instructions
-; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
-; CHECK-NEXT:       Early Tail Duplication
-; CHECK-NEXT:       Optimize machine instruction PHIs
-; CHECK-NEXT:       Slot index numbering
-; CHECK-NEXT:       Merge disjoint stack slots
-; CHECK-NEXT:       Local Stack Slot Allocation
-; CHECK-NEXT:       Remove dead machine instructions
-; CHECK-NEXT:       MachineDominator Tree Construction
-; CHECK-NEXT:       Machine Natural Loop Construction
-; CHECK-NEXT:       Machine Block Frequency Analysis
-; CHECK-NEXT:       Early Machine Loop Invariant Code Motion
-; CHECK-NEXT:       MachineDominator Tree Construction
-; CHECK-NEXT:       Machine Block Frequency Analysis
-; CHECK-NEXT:       Machine Common Subexpression Elimination
-; CHECK-NEXT:       MachinePostDominator Tree Construction
-; CHECK-NEXT:       Machine Cycle Info Analysis
-; CHECK-NEXT:       Machine code sinking
-; CHECK-NEXT:       Peephole Optimizations
-; CHECK-NEXT:       Remove dead machine instructions
-; CHECK-NEXT:       LoongArch Pre-RA pseudo instruction expansion pass
-; CHECK-NEXT:       Detect Dead Lanes
-; CHECK-NEXT:       Init Undef Pass
-; CHECK-NEXT:       Process Implicit Definitions
-; CHECK-NEXT:       Remove unreachable machine basic blocks
-; CHECK-NEXT:       Live Variable Analysis
-; CHECK-NEXT:       Eliminate PHI nodes for register allocation
-; CHECK-NEXT:       Two-Address instruction pass
-; CHECK-NEXT:       MachineDominator Tree Construction
-; CHECK-NEXT:       Slot index numbering
-; CHECK-NEXT:       Live Interval Analysis
-; CHECK-NEXT:       Register Coalescer
-; CHECK-NEXT:       Rename Disconnected Subregister Components
-; CHECK-NEXT:       Machine Instruction Scheduler
-; CHECK-NEXT:       Machine Block Frequency Analysis
-; CHECK-NEXT:       Debug Variable Analysis
-; CHECK-NEXT:       Live Stack Slot Analysis
-; CHECK-NEXT:       Virtual Register Map
-; CHECK-NEXT:       Live Register Matrix
-; CHECK-NEXT:       Bundle Machine CFG Edges
-; CHECK-NEXT:       Spill Code Placement Analysis
-; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
-; CHECK-NEXT:       Machine Optimization Remark Emitter
-; CHECK-NEXT:       Greedy Register Allocator
-; CHECK-NEXT:       Virtual Register Rewriter
-; CHECK-NEXT:       Register Allocation Pass Scoring
-; CHECK-NEXT:       Stack Slot Coloring
-; CHECK-NEXT:       Machine Copy Propagation Pass
-; CHECK-NEXT:       Machine Loop Invariant Code Motion
-; CHECK-NEXT:       Remove Redundant DEBUG_VALUE analysis
-; CHECK-NEXT:       Fixup Statepoint Caller Saved
-; CHECK-NEXT:       PostRA Machine Sink
-; CHECK-NEXT:       Machine Block Frequency Analysis
-; CHECK-NEXT:       MachineDominator Tree Construction
-; CHECK-NEXT:       MachinePostDominator Tree Construction
-; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
-; CHECK-NEXT:       Machine Optimization Remark Emitter
-; CHECK-NEXT:       Shrink Wrapping analysis
-; CHECK-NEXT:       Prologue/Epilogue Insertion & Frame Finalization
-; CHECK-NEXT:       Machine Late Instructions Cleanup Pass
-; CHECK-NEXT:       Control Flow Optimizer
-; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
-; CHECK-NEXT:       Tail Duplication
-; CHECK-NEXT:       Machine Copy Propagation Pass
-; CHECK-NEXT:       Post-RA pseudo instruction expansion pass
-; CHECK-NEXT:       MachineDominator Tree Construction
-; CHECK-NEXT:       Machine Natural Loop Construction
-; CHECK-NEXT:       Post RA top-down list latency scheduler
-; CHECK-NEXT:       Analyze Machine Code For Garbage Collection
-; CHECK-NEXT:       Machine Block Frequency Analysis
-; CHECK-NEXT:       MachinePostDominator Tree Construction
-; CHECK-NEXT:       Branch Probability Basic Block Placement
-; CHECK-NEXT:       Insert fentry calls
-; CHECK-NEXT:       Insert XRay ops
-; CHECK-NEXT:       Implement the 'patchable-function' attribute
-; CHECK-NEXT:       Branch relaxation pass
-; CHECK-NEXT:       Contiguously Lay Out Funclets
-; CHECK-NEXT:       StackMap Liveness Analysis
-; CHECK-NEXT:       Live DEBUG_VALUE analysis
-; CHECK-NEXT:       Machine Sanitizer Binary Metadata
-; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
-; CHECK-NEXT:       Machine Optimization Remark Emitter
-; CHECK-NEXT:       Stack Frame Layout Analysis
-; CHECK-NEXT:       LoongArch pseudo instruction expansion pass
-; CHECK-NEXT:       LoongArch atomic pseudo instruction expansion pass
-; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
-; CHECK-NEXT:       Machine Optimization Remark Emitter
-; CHECK-NEXT:       LoongArch Assembly Printer
-; CHECK-NEXT:       Free MachineFunction
+; LAXX-LABEL: Pass Arguments:
+; LAXX-NEXT: Target Library Information
+; LAXX-NEXT: Target Pass Configuration
+; LAXX-NEXT: Machine Module Information
+; LAXX-NEXT: Target Transform Information
+; LAXX-NEXT: Type-Based Alias Analysis
+; LAXX-NEXT: Scoped NoAlias Alias Analysis
+; LAXX-NEXT: Assumption Cache Tracker
+; LAXX-NEXT: Profile summary info
+; LAXX-NEXT: Create Garbage Collector Module Metadata
+; LAXX-NEXT: Machine Branch Probability Analysis
+; LAXX-NEXT: Default Regalloc Eviction Advisor
+; LAXX-NEXT: Default Regalloc Priority Advisor
+; LAXX-NEXT:   ModulePass Manager
+; LAXX-NEXT:     Pre-ISel Intrinsic Lowering
+; LAXX-NEXT:     FunctionPass Manager
+; LAXX-NEXT:       Expand large div/rem
+; LAXX-NEXT:       Expand large fp convert
+; LAXX-NEXT:       Expand Atomic instructions
+; LAXX-NEXT:       Module Verifier
+; LAXX-NEXT:       Dominator Tree Construction
+; LAXX-NEXT:       Basic Alias Analysis (stateless AA impl)
+; LAXX-NEXT:       Natural Loop Information
+; LAXX-NEXT:       Canonicalize natural loops
+; LAXX-NEXT:       Scalar Evolution Analysis
+; LAXX-NEXT:       Loop Pass Manager
+; LAXX-NEXT:         Canonicalize Freeze Instructions in Loops
+; LAXX-NEXT:         Induction Variable Users
+; LAXX-NEXT:         Loop Strength Reduction
+; LAXX-NEXT:       Basic Alias Analysis (stateless AA impl)
+; LAXX-NEXT:       Function Alias Analysis Results
+; LAXX-NEXT:       Merge contiguous icmps into a memcmp
+; LAXX-NEXT:       Natural Loop Information
+; LAXX-NEXT:       Lazy Branch Probability Analysis
+; LAXX-NEXT:       Lazy Block Frequency Analysis
+; LAXX-NEXT:       Expand memcmp() to load/stores
+; LAXX-NEXT:       Lower Garbage Collection Instructions
+; LAXX-NEXT:       Shadow Stack GC Lowering
+; LAXX-NEXT:       Lower constant intrinsics
+; LAXX-NEXT:       Remove unreachable blocks from the CFG
+; LAXX-NEXT:       Natural Loop Information
+; LAXX-NEXT:       Post-Dominator Tree Construction
+; LAXX-NEXT:       Branch Probability Analysis
+; LAXX-NEXT:       Block Frequency Analysis
+; LAXX-NEXT:       Constant Hoisting
+; LAXX-NEXT:       Replace intrinsics with calls to vector library
+; LAXX-NEXT:       Partially inline calls to library functions
+; LAXX-NEXT:       Expand vector predication intrinsics
+; LAXX-NEXT:       Scalarize Masked Memory Intrinsics
+; LAXX-NEXT:       Expand reduction intrinsics
+; LAXX-NEXT:       Natural Loop Information
+; LAXX-NEXT:       TLS Variable Hoist
+; LAXX-NEXT:       CodeGen Prepare
+; LAXX-NEXT:       Dominator Tree Construction
+; LAXX-NEXT:       Exception handling preparation
+; LAXX-NEXT:       Prepare callbr
+; LAXX-NEXT:       Safe Stack instrumentation pass
+; LAXX-NEXT:       Insert stack protectors
+; LAXX-NEXT:       Module Verifier
+; LAXX-NEXT:       Basic Alias Analysis (stateless AA impl)
+; LAXX-NEXT:       Function Alias Analysis Results
+; LAXX-NEXT:       Natural Loop Information
+; LAXX-NEXT:       Post-Dominator Tree Construction
+; LAXX-NEXT:       Branch Probability Analysis
+; LAXX-NEXT:       Assignment Tracking Analysis
+; LAXX-NEXT:       Lazy Branch Probability Analysis
+; LAXX-NEXT:       Lazy Block Frequency Analysis
+; LAXX-NEXT:       LoongArch DAG->DAG Pattern Instruction Selection
+; LAXX-NEXT:       Finalize ISel and expand pseudo-instructions
+; LAXX-NEXT:       Lazy Machine Block Frequency Analysis
+; LAXX-NEXT:       Early Tail Duplication
+; LAXX-NEXT:       Optimize machine instruction PHIs
+; LAXX-NEXT:       Slot index numbering
+; LAXX-NEXT:       Merge disjoint stack slots
+; LAXX-NEXT:       Local Stack Slot Allocation
+; LAXX-NEXT:       Remove dead machine instructions
+; LAXX-NEXT:       MachineDominator Tree Construction
+; LAXX-NEXT:       Machine Natural Loop Construction
+; LAXX-NEXT:       Machine Block Frequency Analysis
+; LAXX-NEXT:       Early Machine Loop Invariant Code Motion
+; LAXX-NEXT:       MachineDominator Tree Construction
+; LAXX-NEXT:       Machine Block Frequency Analysis
+; LAXX-NEXT:       Machine Common Subexpression Elimination
+; LAXX-NEXT:       MachinePostDominator Tree Construction
+; LAXX-NEXT:       Machine Cycle Info Analysis
+; LAXX-NEXT:       Machine code sinking
+; LAXX-NEXT:       Peephole Optimizations
+; LAXX-NEXT:       Remove dead machine instructions
+; LA64-NEXT:       LoongArch Optimize W Instructions
+; LAXX-NEXT:       LoongArch Pre-RA pseudo instruction expansion pass
+; LAXX-NEXT:       Detect Dead Lanes
+; LAXX-NEXT:       Init Undef Pass
+; LAXX-NEXT:       Process Implicit Definitions
+; LAXX-NEXT:       Remove unreachable machine basic blocks
+; LAXX-NEXT:       Live Variable Analysis
+; LAXX-NEXT:       Eliminate PHI nodes for register allocation
+; LAXX-NEXT:       Two-Address instruction pass
+; LAXX-NEXT:       MachineDominator Tree Construction
+; LAXX-NEXT:       Slot index numbering
+; LAXX-NEXT:       Live Interval Analysis
+; LAXX-NEXT:       Register Coalescer
+; LAXX-NEXT:       Rename Disconnected Subregister Components
+; LAXX-NEXT:       Machine Instruction Scheduler
+; LAXX-NEXT:       Machine Block Frequency Analysis
+; LAXX-NEXT:       Debug Variable Analysis
+; LAXX-NEXT:       Live Stack Slot Analysis
+; LAXX-NEXT:       Virtual Register Map
+; LAXX-NEXT:       Live Register Matrix
+; LAXX-NEXT:       Bundle Machine CFG Edges
+; LAXX-NEXT:       Spill Code Placement Analysis
+; LAXX-NEXT:       Lazy Machine Block Frequency Analysis
+; LAXX-NEXT:       Machine Optimization Remark Emitter
+; LAXX-NEXT:       Greedy Register Allocator
+; LAXX-NEXT:       Virtual Register Rewriter
+; LAXX-NEXT:       Register Allocation Pass Scoring
+; LAXX-NEXT:       Stack Slot Coloring
+; LAXX-NEXT:       Machine Copy Propagation Pass
+; LAXX-NEXT:       Machine Loop Invariant Code Motion
+; LAXX-NEXT:       Remove Redundant DEBUG_VALUE analysis
+; LAXX-NEXT:       Fixup Statepoint Caller Saved
+; LAXX-NEXT:       PostRA Machine Sink
+; LAXX-NEXT:       Machine Block Frequency Analysis
+; LAXX-NEXT:       MachineDominator Tree Construction
+; LAXX-NEXT:       MachinePostDominator Tree Construction
+; LAXX-NEXT:       Lazy Machine Block Frequency Analysis
+; LAXX-NEXT:       Machine Optimization Remark Emitter
+; LAXX-NEXT:       Shrink Wrapping analysis
+; LAXX-NEXT:       Prologue/Epilogue Insertion & Frame Finalization
+; LAXX-NEXT:       Machine Late Instructions Cleanup Pass
+; LAXX-NEXT:       Control Flow Optimizer
+; LAXX-NEXT:       Lazy Machine Block Frequency Analysis
+; LAXX-NEXT:       Tail Duplication
+; LAXX-NEXT:       Machine Copy Propagation Pass
+; LAXX-NEXT:       Post-RA pseudo instruction expansion pass
+; LAXX-NEXT:       MachineDominator Tree Construction
+; LAXX-NEXT:       Machine Natural Loop Construction
+; LAXX-NEXT:       Post RA top-down list latency scheduler
+; LAXX-NEXT:       Analyze Machine Code For Garbage Collection
+; LAXX-NEXT:       Machine Block Frequency Analysis
+; LAXX-NEXT:       MachinePostDominator Tree Construction
+; LAXX-NEXT:       Branch Probability Basic Block Placement
+; LAXX-NEXT:       Insert fentry calls
+; LAXX-NEXT:       Insert XRay ops
+; LAXX-NEXT:       Implement the 'patchable-function' attribute
+; LAXX-NEXT:       Branch relaxation pass
+; LAXX-NEXT:       Contiguously Lay Out Funclets
+; LAXX-NEXT:       StackMap Liveness Analysis
+; LAXX-NEXT:       Live DEBUG_VALUE analysis
+; LAXX-NEXT:       Machine Sanitizer Binary Metadata
+; LAXX-NEXT:       Lazy Machine Block Frequency Analysis
+; LAXX-NEXT:       Machine Optimization Remark Emitter
+; LAXX-NEXT:       Stack Frame Layout Analysis
+; LAXX-NEXT:       LoongArch pseudo instruction expansion pass
+; LAXX-NEXT:       LoongArch atomic pseudo instruction expansion pass
+; LAXX-NEXT:       Lazy Machine Block Frequency Analysis
+; LAXX-NEXT:       Machine Optimization Remark Emitter
+; LAXX-NEXT:       LoongArch Assembly Printer
+; LAXX-NEXT:       Free MachineFunction
diff --git a/llvm/test/CodeGen/LoongArch/prefer-w-inst.ll b/llvm/test/CodeGen/LoongArch/prefer-w-inst.ll
new file mode 100644
index 000000000000..385f27f04d5f
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/prefer-w-inst.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --verify-machineinstrs < %s \
+; RUN:   | FileCheck --check-prefixes=NO-PREFER-W-INST %s
+; RUN: llc --mtriple=loongarch64 --loongarch-disable-cvt-to-d-suffix --verify-machineinstrs < %s \
+; RUN:   | FileCheck --check-prefixes=NO-CVT-D-INST %s
+; RUN: llc --mtriple=loongarch64 --mattr=+prefer-w-inst --verify-machineinstrs < %s \
+; RUN:   | FileCheck --check-prefixes=PREFER-W-INST %s
+
+define signext i32 @addw(i32 %x) {
+; NO-PREFER-W-INST-LABEL: addw:
+; NO-PREFER-W-INST:       # %bb.0:
+; NO-PREFER-W-INST-NEXT:    addi.d $a0, $a0, 2047
+; NO-PREFER-W-INST-NEXT:    addi.w $a0, $a0, 2033
+; NO-PREFER-W-INST-NEXT:    ret
+;
+; NO-CVT-D-INST-LABEL: addw:
+; NO-CVT-D-INST:       # %bb.0:
+; NO-CVT-D-INST-NEXT:    addi.w $a0, $a0, 2047
+; NO-CVT-D-INST-NEXT:    addi.w $a0, $a0, 2033
+; NO-CVT-D-INST-NEXT:    ret
+;
+; PREFER-W-INST-LABEL: addw:
+; PREFER-W-INST:       # %bb.0:
+; PREFER-W-INST-NEXT:    addi.w $a0, $a0, 2047
+; PREFER-W-INST-NEXT:    addi.w $a0, $a0, 2033
+; PREFER-W-INST-NEXT:    ret
+  %add = add i32 %x, 4080
+  ret i32 %add
+}
diff --git a/llvm/test/CodeGen/LoongArch/preferred-alignments.ll b/llvm/test/CodeGen/LoongArch/preferred-alignments.ll
index 2b6a109228a6..30305127b94f 100644
--- a/llvm/test/CodeGen/LoongArch/preferred-alignments.ll
+++ b/llvm/test/CodeGen/LoongArch/preferred-alignments.ll
@@ -13,16 +13,16 @@ define signext i32 @sum(ptr noalias nocapture noundef readonly %0, i32 noundef s
 ; LA464-NEXT:    .p2align 4, , 16
 ; LA464-NEXT:  .LBB0_2: # =>This Inner Loop Header: Depth=1
 ; LA464-NEXT:    ld.w $a3, $a0, 0
-; LA464-NEXT:    add.d $a2, $a3, $a2
+; LA464-NEXT:    add.w $a2, $a3, $a2
 ; LA464-NEXT:    addi.d $a1, $a1, -1
 ; LA464-NEXT:    addi.d $a0, $a0, 4
 ; LA464-NEXT:    bnez $a1, .LBB0_2
 ; LA464-NEXT:  # %bb.3:
-; LA464-NEXT:    addi.w $a0, $a2, 0
+; LA464-NEXT:    move $a0, $a2
 ; LA464-NEXT:    ret
 ; LA464-NEXT:  .LBB0_4:
 ; LA464-NEXT:    move $a2, $zero
-; LA464-NEXT:    addi.w $a0, $a2, 0
+; LA464-NEXT:    move $a0, $a2
 ; LA464-NEXT:    ret
   %3 = icmp sgt i32 %1, 0
   br i1 %3, label %4, label %6
diff --git a/llvm/test/CodeGen/LoongArch/rotl-rotr.ll b/llvm/test/CodeGen/LoongArch/rotl-rotr.ll
index b067eb9cfa92..b9fbd962e6bb 100644
--- a/llvm/test/CodeGen/LoongArch/rotl-rotr.ll
+++ b/llvm/test/CodeGen/LoongArch/rotl-rotr.ll
@@ -2,9 +2,7 @@
 ; RUN: llc --mtriple=loongarch32 < %s | FileCheck %s --check-prefix=LA32
 ; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s --check-prefix=LA64
 
-;; TODO: Add optimization to ISD::ROTL
-
-define i32 @rotl_32(i32 %x, i32 %y) nounwind {
+define signext i32 @rotl_32(i32 signext %x, i32 signext %y) nounwind {
 ; LA32-LABEL: rotl_32:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    ori $a2, $zero, 32
@@ -14,10 +12,9 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind {
 ;
 ; LA64-LABEL: rotl_32:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    sll.w $a2, $a0, $a1
-; LA64-NEXT:    sub.d $a1, $zero, $a1
-; LA64-NEXT:    srl.w $a0, $a0, $a1
-; LA64-NEXT:    or $a0, $a2, $a0
+; LA64-NEXT:    ori $a2, $zero, 32
+; LA64-NEXT:    sub.d $a1, $a2, $a1
+; LA64-NEXT:    rotr.w $a0, $a0, $a1
 ; LA64-NEXT:    ret
   %z = sub i32 32, %y
   %b = shl i32 %x, %y
@@ -26,7 +23,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind {
   ret i32 %d
 }
 
-define i32 @rotr_32(i32 %x, i32 %y) nounwind {
+define signext i32 @rotr_32(i32 signext %x, i32 signext %y) nounwind {
 ; LA32-LABEL: rotr_32:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    rotr.w $a0, $a0, $a1
@@ -143,7 +140,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
   ret i64 %d
 }
 
-define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
+define signext i32 @rotl_32_mask(i32 signext %x, i32 signext %y) nounwind {
 ; LA32-LABEL: rotl_32_mask:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    sub.w $a1, $zero, $a1
@@ -152,10 +149,9 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
 ;
 ; LA64-LABEL: rotl_32_mask:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    sll.w $a2, $a0, $a1
-; LA64-NEXT:    sub.d $a1, $zero, $a1
-; LA64-NEXT:    srl.w $a0, $a0, $a1
-; LA64-NEXT:    or $a0, $a2, $a0
+; LA64-NEXT:    ori $a2, $zero, 32
+; LA64-NEXT:    sub.d $a1, $a2, $a1
+; LA64-NEXT:    rotr.w $a0, $a0, $a1
 ; LA64-NEXT:    ret
   %z = sub i32 0, %y
   %and = and i32 %z, 31
@@ -165,7 +161,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind {
   ret i32 %d
 }
 
-define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
+define signext i32 @rotl_32_mask_and_63_and_31(i32 signext %x, i32 signext %y) nounwind {
 ; LA32-LABEL: rotl_32_mask_and_63_and_31:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    sub.w $a1, $zero, $a1
@@ -174,10 +170,9 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
 ;
 ; LA64-LABEL: rotl_32_mask_and_63_and_31:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    sll.w $a2, $a0, $a1
-; LA64-NEXT:    sub.d $a1, $zero, $a1
-; LA64-NEXT:    srl.w $a0, $a0, $a1
-; LA64-NEXT:    or $a0, $a2, $a0
+; LA64-NEXT:    ori $a2, $zero, 32
+; LA64-NEXT:    sub.d $a1, $a2, $a1
+; LA64-NEXT:    rotr.w $a0, $a0, $a1
 ; LA64-NEXT:    ret
   %a = and i32 %y, 63
   %b = shl i32 %x, %a
@@ -188,7 +183,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
   ret i32 %f
 }
 
-define i32 @rotl_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind {
+define signext i32 @rotl_32_mask_or_64_or_32(i32 signext %x, i32 signext %y) nounwind {
 ; LA32-LABEL: rotl_32_mask_or_64_or_32:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    sub.w $a1, $zero, $a1
@@ -197,10 +192,9 @@ define i32 @rotl_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind {
 ;
 ; LA64-LABEL: rotl_32_mask_or_64_or_32:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    sll.w $a2, $a0, $a1
-; LA64-NEXT:    sub.d $a1, $zero, $a1
-; LA64-NEXT:    srl.w $a0, $a0, $a1
-; LA64-NEXT:    or $a0, $a2, $a0
+; LA64-NEXT:    ori $a2, $zero, 32
+; LA64-NEXT:    sub.d $a1, $a2, $a1
+; LA64-NEXT:    rotr.w $a0, $a0, $a1
 ; LA64-NEXT:    ret
   %a = or i32 %y, 64
   %b = shl i32 %x, %a
@@ -211,7 +205,7 @@ define i32 @rotl_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind {
   ret i32 %f
 }
 
-define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
+define signext i32 @rotr_32_mask(i32 signext %x, i32 signext %y) nounwind {
 ; LA32-LABEL: rotr_32_mask:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    rotr.w $a0, $a0, $a1
@@ -229,7 +223,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind {
   ret i32 %d
 }
 
-define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
+define signext i32 @rotr_32_mask_and_63_and_31(i32 signext %x, i32 signext %y) nounwind {
 ; LA32-LABEL: rotr_32_mask_and_63_and_31:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    rotr.w $a0, $a0, $a1
@@ -248,7 +242,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind {
   ret i32 %f
 }
 
-define i32 @rotr_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind {
+define signext i32 @rotr_32_mask_or_64_or_32(i32 signext %x, i32 signext %y) nounwind {
 ; LA32-LABEL: rotr_32_mask_or_64_or_32:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    rotr.w $a0, $a0, $a1
@@ -510,7 +504,7 @@ define i64 @rotr_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind {
   ret i64 %f
 }
 
-define i32 @rotri_i32(i32 %a) nounwind {
+define signext i32 @rotri_i32(i32 signext %a) nounwind {
 ; LA32-LABEL: rotri_i32:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    rotri.w $a0, $a0, 16
@@ -591,10 +585,7 @@ define signext i32 @rotr_i32_fshr(i32 signext %a) nounwind {
 ;
 ; LA64-LABEL: rotr_i32_fshr:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    slli.d $a1, $a0, 20
-; LA64-NEXT:    bstrpick.d $a0, $a0, 31, 12
-; LA64-NEXT:    or $a0, $a0, $a1
-; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    rotri.w $a0, $a0, 12
 ; LA64-NEXT:    ret
   %or = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 12)
   ret i32 %or
diff --git a/llvm/test/CodeGen/LoongArch/sextw-removal.ll b/llvm/test/CodeGen/LoongArch/sextw-removal.ll
index 6db9c1608b3c..7708873e264d 100644
--- a/llvm/test/CodeGen/LoongArch/sextw-removal.ll
+++ b/llvm/test/CodeGen/LoongArch/sextw-removal.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s --mtriple=loongarch64 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s --mtriple=loongarch64 --loongarch-disable-sextw-removal | \
+; RUN:   FileCheck %s --check-prefix=NORMV
 
 define void @test1(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-LABEL: test1:
@@ -13,7 +15,7 @@ define void @test1(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:    .p2align 4, , 16
 ; CHECK-NEXT:  .LBB0_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    addi.w $a0, $s0, 0
+; CHECK-NEXT:    move $a0, $s0
 ; CHECK-NEXT:    bl %plt(bar)
 ; CHECK-NEXT:    sll.w $s0, $s0, $fp
 ; CHECK-NEXT:    bnez $a0, .LBB0_1
@@ -23,6 +25,28 @@ define void @test1(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 32
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test1:
+; NORMV:       # %bb.0: # %bb
+; NORMV-NEXT:    addi.d $sp, $sp, -32
+; NORMV-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
+; NORMV-NEXT:    move $fp, $a1
+; NORMV-NEXT:    sra.w $s0, $a0, $a1
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB0_1: # %bb2
+; NORMV-NEXT:    # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    addi.w $a0, $s0, 0
+; NORMV-NEXT:    bl %plt(bar)
+; NORMV-NEXT:    sll.w $s0, $s0, $fp
+; NORMV-NEXT:    bnez $a0, .LBB0_1
+; NORMV-NEXT:  # %bb.2: # %bb7
+; NORMV-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; NORMV-NEXT:    addi.d $sp, $sp, 32
+; NORMV-NEXT:    ret
 bb:
   %i = ashr i32 %arg, %arg1
   br label %bb2
@@ -47,8 +71,16 @@ define signext i32 @test2(ptr %p, i32 signext %b) nounwind {
 ; CHECK-NEXT:    ori $a2, $zero, 1
 ; CHECK-NEXT:    sll.w $a1, $a2, $a1
 ; CHECK-NEXT:    andn $a0, $a0, $a1
-; CHECK-NEXT:    addi.w $a0, $a0, 0
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test2:
+; NORMV:       # %bb.0:
+; NORMV-NEXT:    ld.w $a0, $a0, 0
+; NORMV-NEXT:    ori $a2, $zero, 1
+; NORMV-NEXT:    sll.w $a1, $a2, $a1
+; NORMV-NEXT:    andn $a0, $a0, $a1
+; NORMV-NEXT:    addi.w $a0, $a0, 0
+; NORMV-NEXT:    ret
   %a = load i32, ptr %p
   %shl = shl i32 1, %b
   %neg = xor i32 %shl, -1
@@ -63,8 +95,16 @@ define signext i32 @test3(ptr %p, i32 signext %b) nounwind {
 ; CHECK-NEXT:    ori $a2, $zero, 1
 ; CHECK-NEXT:    sll.w $a1, $a2, $a1
 ; CHECK-NEXT:    orn $a0, $a0, $a1
-; CHECK-NEXT:    addi.w $a0, $a0, 0
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test3:
+; NORMV:       # %bb.0:
+; NORMV-NEXT:    ld.w $a0, $a0, 0
+; NORMV-NEXT:    ori $a2, $zero, 1
+; NORMV-NEXT:    sll.w $a1, $a2, $a1
+; NORMV-NEXT:    orn $a0, $a0, $a1
+; NORMV-NEXT:    addi.w $a0, $a0, 0
+; NORMV-NEXT:    ret
   %a = load i32, ptr %p
   %shl = shl i32 1, %b
   %neg = xor i32 %shl, -1
@@ -82,6 +122,16 @@ define signext i32 @test4(ptr %p, i32 signext %b) nounwind {
 ; CHECK-NEXT:    nor $a0, $a0, $zero
 ; CHECK-NEXT:    addi.w $a0, $a0, 0
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test4:
+; NORMV:       # %bb.0:
+; NORMV-NEXT:    ld.w $a0, $a0, 0
+; NORMV-NEXT:    ori $a2, $zero, 1
+; NORMV-NEXT:    sll.w $a1, $a2, $a1
+; NORMV-NEXT:    xor $a0, $a1, $a0
+; NORMV-NEXT:    nor $a0, $a0, $zero
+; NORMV-NEXT:    addi.w $a0, $a0, 0
+; NORMV-NEXT:    ret
   %a = load i32, ptr %p
   %shl = shl i32 1, %b
   %neg = xor i32 %shl, -1
@@ -133,6 +183,50 @@ define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 48
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test5:
+; NORMV:       # %bb.0: # %bb
+; NORMV-NEXT:    addi.d $sp, $sp, -48
+; NORMV-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $s1, $sp, 16 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $s2, $sp, 8 # 8-byte Folded Spill
+; NORMV-NEXT:    sra.w $a1, $a0, $a1
+; NORMV-NEXT:    lu12i.w $a0, 349525
+; NORMV-NEXT:    ori $fp, $a0, 1365
+; NORMV-NEXT:    lu12i.w $a0, 209715
+; NORMV-NEXT:    ori $s0, $a0, 819
+; NORMV-NEXT:    lu12i.w $a0, 61680
+; NORMV-NEXT:    ori $s1, $a0, 3855
+; NORMV-NEXT:    lu12i.w $a0, 4112
+; NORMV-NEXT:    ori $s2, $a0, 257
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB4_1: # %bb2
+; NORMV-NEXT:    # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    addi.w $a0, $a1, 0
+; NORMV-NEXT:    bl %plt(bar)
+; NORMV-NEXT:    srli.d $a1, $a0, 1
+; NORMV-NEXT:    and $a1, $a1, $fp
+; NORMV-NEXT:    sub.d $a1, $a0, $a1
+; NORMV-NEXT:    and $a2, $a1, $s0
+; NORMV-NEXT:    srli.d $a1, $a1, 2
+; NORMV-NEXT:    and $a1, $a1, $s0
+; NORMV-NEXT:    add.d $a1, $a2, $a1
+; NORMV-NEXT:    srli.d $a2, $a1, 4
+; NORMV-NEXT:    add.d $a1, $a1, $a2
+; NORMV-NEXT:    and $a1, $a1, $s1
+; NORMV-NEXT:    mul.d $a1, $a1, $s2
+; NORMV-NEXT:    bstrpick.d $a1, $a1, 31, 24
+; NORMV-NEXT:    bnez $a0, .LBB4_1
+; NORMV-NEXT:  # %bb.2: # %bb7
+; NORMV-NEXT:    ld.d $s2, $sp, 8 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; NORMV-NEXT:    addi.d $sp, $sp, 48
+; NORMV-NEXT:    ret
 bb:
   %i = ashr i32 %arg, %arg1
   br label %bb2
@@ -177,6 +271,33 @@ define void @test6(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 32
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test6:
+; NORMV:       # %bb.0: # %bb
+; NORMV-NEXT:    addi.d $sp, $sp, -32
+; NORMV-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
+; NORMV-NEXT:    sra.w $fp, $a0, $a1
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB5_1: # %bb2
+; NORMV-NEXT:    # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    addi.w $a0, $fp, 0
+; NORMV-NEXT:    bl %plt(baz)
+; NORMV-NEXT:    bstrpick.d $s0, $a0, 31, 0
+; NORMV-NEXT:    move $a0, $s0
+; NORMV-NEXT:    bl %plt(__fixsfsi)
+; NORMV-NEXT:    move $fp, $a0
+; NORMV-NEXT:    move $a0, $s0
+; NORMV-NEXT:    move $a1, $zero
+; NORMV-NEXT:    bl %plt(__nesf2)
+; NORMV-NEXT:    bnez $a0, .LBB5_1
+; NORMV-NEXT:  # %bb.2: # %bb7
+; NORMV-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; NORMV-NEXT:    addi.d $sp, $sp, 32
+; NORMV-NEXT:    ret
 bb:
   %i = ashr i32 %arg, %arg1
   br label %bb2
@@ -222,7 +343,6 @@ define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:    .p2align 4, , 16
 ; CHECK-NEXT:  .LBB6_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    addi.w $a0, $a0, 0
 ; CHECK-NEXT:    bl %plt(foo)
 ; CHECK-NEXT:    srli.d $a1, $a0, 1
 ; CHECK-NEXT:    and $a1, $a1, $fp
@@ -245,6 +365,58 @@ define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 48
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test7:
+; NORMV:       # %bb.0: # %bb
+; NORMV-NEXT:    addi.d $sp, $sp, -48
+; NORMV-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $s1, $sp, 16 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $s2, $sp, 8 # 8-byte Folded Spill
+; NORMV-NEXT:    sra.w $a0, $a0, $a1
+; NORMV-NEXT:    lu12i.w $a1, 349525
+; NORMV-NEXT:    ori $a1, $a1, 1365
+; NORMV-NEXT:    lu32i.d $a1, 349525
+; NORMV-NEXT:    lu52i.d $fp, $a1, 1365
+; NORMV-NEXT:    lu12i.w $a1, 209715
+; NORMV-NEXT:    ori $a1, $a1, 819
+; NORMV-NEXT:    lu32i.d $a1, 209715
+; NORMV-NEXT:    lu52i.d $s0, $a1, 819
+; NORMV-NEXT:    lu12i.w $a1, 61680
+; NORMV-NEXT:    ori $a1, $a1, 3855
+; NORMV-NEXT:    lu32i.d $a1, -61681
+; NORMV-NEXT:    lu52i.d $s1, $a1, 240
+; NORMV-NEXT:    lu12i.w $a1, 4112
+; NORMV-NEXT:    ori $a1, $a1, 257
+; NORMV-NEXT:    lu32i.d $a1, 65793
+; NORMV-NEXT:    lu52i.d $s2, $a1, 16
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB6_1: # %bb2
+; NORMV-NEXT:    # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    addi.w $a0, $a0, 0
+; NORMV-NEXT:    bl %plt(foo)
+; NORMV-NEXT:    srli.d $a1, $a0, 1
+; NORMV-NEXT:    and $a1, $a1, $fp
+; NORMV-NEXT:    sub.d $a0, $a0, $a1
+; NORMV-NEXT:    and $a1, $a0, $s0
+; NORMV-NEXT:    srli.d $a0, $a0, 2
+; NORMV-NEXT:    and $a0, $a0, $s0
+; NORMV-NEXT:    add.d $a0, $a1, $a0
+; NORMV-NEXT:    srli.d $a1, $a0, 4
+; NORMV-NEXT:    add.d $a0, $a0, $a1
+; NORMV-NEXT:    and $a0, $a0, $s1
+; NORMV-NEXT:    mul.d $a0, $a0, $s2
+; NORMV-NEXT:    srli.d $a0, $a0, 56
+; NORMV-NEXT:    bnez $a0, .LBB6_1
+; NORMV-NEXT:  # %bb.2: # %bb7
+; NORMV-NEXT:    ld.d $s2, $sp, 8 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; NORMV-NEXT:    addi.d $sp, $sp, 48
+; NORMV-NEXT:    ret
 bb:
   %i = ashr i32 %arg, %arg1
   br label %bb2
@@ -283,6 +455,26 @@ define void @test8(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test8:
+; NORMV:       # %bb.0: # %bb
+; NORMV-NEXT:    addi.d $sp, $sp, -16
+; NORMV-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $fp, $sp, 0 # 8-byte Folded Spill
+; NORMV-NEXT:    sra.w $a0, $a0, $a1
+; NORMV-NEXT:    addi.w $fp, $zero, -256
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB7_1: # %bb2
+; NORMV-NEXT:    # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    addi.w $a0, $a0, 0
+; NORMV-NEXT:    bl %plt(foo)
+; NORMV-NEXT:    or $a0, $a0, $fp
+; NORMV-NEXT:    bnez $a0, .LBB7_1
+; NORMV-NEXT:  # %bb.2: # %bb7
+; NORMV-NEXT:    ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; NORMV-NEXT:    addi.d $sp, $sp, 16
+; NORMV-NEXT:    ret
 bb:
   %i = ashr i32 %arg, %arg1
   br label %bb2
@@ -307,20 +499,40 @@ define void @test9(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; CHECK-NEXT:    st.d $fp, $sp, 0 # 8-byte Folded Spill
-; CHECK-NEXT:    sra.w $a1, $a0, $a1
+; CHECK-NEXT:    sra.w $a0, $a0, $a1
 ; CHECK-NEXT:    ori $fp, $zero, 254
 ; CHECK-NEXT:    .p2align 4, , 16
 ; CHECK-NEXT:  .LBB8_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    addi.w $a0, $a1, 0
 ; CHECK-NEXT:    bl %plt(bar)
-; CHECK-NEXT:    slti $a1, $a0, 255
-; CHECK-NEXT:    blt $fp, $a0, .LBB8_1
+; CHECK-NEXT:    move $a1, $a0
+; CHECK-NEXT:    slti $a0, $a0, 255
+; CHECK-NEXT:    blt $fp, $a1, .LBB8_1
 ; CHECK-NEXT:  # %bb.2: # %bb7
 ; CHECK-NEXT:    ld.d $fp, $sp, 0 # 8-byte Folded Reload
 ; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test9:
+; NORMV:       # %bb.0: # %bb
+; NORMV-NEXT:    addi.d $sp, $sp, -16
+; NORMV-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $fp, $sp, 0 # 8-byte Folded Spill
+; NORMV-NEXT:    sra.w $a1, $a0, $a1
+; NORMV-NEXT:    ori $fp, $zero, 254
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB8_1: # %bb2
+; NORMV-NEXT:    # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    addi.w $a0, $a1, 0
+; NORMV-NEXT:    bl %plt(bar)
+; NORMV-NEXT:    slti $a1, $a0, 255
+; NORMV-NEXT:    blt $fp, $a0, .LBB8_1
+; NORMV-NEXT:  # %bb.2: # %bb7
+; NORMV-NEXT:    ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; NORMV-NEXT:    addi.d $sp, $sp, 16
+; NORMV-NEXT:    ret
 bb:
   %i = ashr i32 %arg, %arg1
   br label %bb2
@@ -359,6 +571,28 @@ define void @test10(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test10:
+; NORMV:       # %bb.0: # %bb
+; NORMV-NEXT:    addi.d $sp, $sp, -16
+; NORMV-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $fp, $sp, 0 # 8-byte Folded Spill
+; NORMV-NEXT:    sra.w $fp, $a0, $a1
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB9_1: # %bb2
+; NORMV-NEXT:    # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    addi.w $a0, $fp, 0
+; NORMV-NEXT:    bl %plt(baz)
+; NORMV-NEXT:    move $fp, $a0
+; NORMV-NEXT:    bstrpick.d $a0, $a0, 31, 0
+; NORMV-NEXT:    move $a1, $zero
+; NORMV-NEXT:    bl %plt(__nesf2)
+; NORMV-NEXT:    bnez $a0, .LBB9_1
+; NORMV-NEXT:  # %bb.2: # %bb7
+; NORMV-NEXT:    ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; NORMV-NEXT:    addi.d $sp, $sp, 16
+; NORMV-NEXT:    ret
 bb:
   %i = ashr i32 %arg, %arg1
   br label %bb2
@@ -384,11 +618,25 @@ define signext i32 @test11(i64 %arg1, i64 %arg2, i64 %arg3)  {
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    andi $a0, $a0, 1234
 ; CHECK-NEXT:    addi.d $a2, $a2, 1
-; CHECK-NEXT:    add.d $a0, $a0, $a1
+; CHECK-NEXT:    add.w $a0, $a0, $a1
 ; CHECK-NEXT:    bltu $a2, $a3, .LBB10_1
 ; CHECK-NEXT:  # %bb.2: # %bb7
-; CHECK-NEXT:    addi.w $a0, $a0, 0
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test11:
+; NORMV:       # %bb.0: # %entry
+; NORMV-NEXT:    addi.d $a2, $a2, -1
+; NORMV-NEXT:    ori $a3, $zero, 256
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB10_1: # %bb2
+; NORMV-NEXT:    # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    andi $a0, $a0, 1234
+; NORMV-NEXT:    addi.d $a2, $a2, 1
+; NORMV-NEXT:    add.d $a0, $a0, $a1
+; NORMV-NEXT:    bltu $a2, $a3, .LBB10_1
+; NORMV-NEXT:  # %bb.2: # %bb7
+; NORMV-NEXT:    addi.w $a0, $a0, 0
+; NORMV-NEXT:    ret
 entry:
   br label %bb2
 
@@ -409,21 +657,39 @@ bb7:                                              ; preds = %bb2
 define signext i32 @test12(i64 %arg1, i64 %arg2, i64 %arg3)  {
 ; CHECK-LABEL: test12:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $a2, $a2, -1
-; CHECK-NEXT:    ori $a3, $zero, 256
+; CHECK-NEXT:    addi.d $a3, $a2, -1
+; CHECK-NEXT:    ori $a4, $zero, 256
 ; CHECK-NEXT:    .p2align 4, , 16
 ; CHECK-NEXT:  .LBB11_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    xor $a0, $a0, $a1
-; CHECK-NEXT:    mul.d $a4, $a0, $a1
-; CHECK-NEXT:    add.d $a0, $a0, $a4
-; CHECK-NEXT:    and $a4, $a4, $a0
-; CHECK-NEXT:    addi.d $a2, $a2, 1
-; CHECK-NEXT:    add.d $a0, $a4, $a1
-; CHECK-NEXT:    bltu $a2, $a3, .LBB11_1
+; CHECK-NEXT:    mul.w $a2, $a0, $a1
+; CHECK-NEXT:    add.w $a0, $a0, $a2
+; CHECK-NEXT:    and $a2, $a2, $a0
+; CHECK-NEXT:    addi.d $a3, $a3, 1
+; CHECK-NEXT:    add.d $a0, $a2, $a1
+; CHECK-NEXT:    bltu $a3, $a4, .LBB11_1
 ; CHECK-NEXT:  # %bb.2: # %bb7
-; CHECK-NEXT:    addi.w $a0, $a4, 0
+; CHECK-NEXT:    move $a0, $a2
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test12:
+; NORMV:       # %bb.0: # %entry
+; NORMV-NEXT:    addi.d $a2, $a2, -1
+; NORMV-NEXT:    ori $a3, $zero, 256
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB11_1: # %bb2
+; NORMV-NEXT:    # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    xor $a0, $a0, $a1
+; NORMV-NEXT:    mul.d $a4, $a0, $a1
+; NORMV-NEXT:    add.d $a0, $a0, $a4
+; NORMV-NEXT:    and $a4, $a4, $a0
+; NORMV-NEXT:    addi.d $a2, $a2, 1
+; NORMV-NEXT:    add.d $a0, $a4, $a1
+; NORMV-NEXT:    bltu $a2, $a3, .LBB11_1
+; NORMV-NEXT:  # %bb.2: # %bb7
+; NORMV-NEXT:    addi.w $a0, $a4, 0
+; NORMV-NEXT:    ret
 entry:
   br label %bb2
 
@@ -459,6 +725,21 @@ define signext i32 @test13(i64 %arg1, i64 %arg2, i64 %arg3)  {
 ; CHECK-NEXT:  # %bb.2: # %bb7
 ; CHECK-NEXT:    addi.w $a0, $a0, 0
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test13:
+; NORMV:       # %bb.0: # %entry
+; NORMV-NEXT:    addi.d $a2, $a2, -1
+; NORMV-NEXT:    ori $a3, $zero, 256
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB12_1: # %bb2
+; NORMV-NEXT:    # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    div.d $a0, $a0, $a1
+; NORMV-NEXT:    addi.d $a2, $a2, 1
+; NORMV-NEXT:    add.d $a0, $a0, $a1
+; NORMV-NEXT:    bltu $a2, $a3, .LBB12_1
+; NORMV-NEXT:  # %bb.2: # %bb7
+; NORMV-NEXT:    addi.w $a0, $a0, 0
+; NORMV-NEXT:    ret
 entry:
   br label %bb2
 
@@ -489,10 +770,9 @@ define signext i32 @test14(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    ori $a4, $zero, 1000
 ; CHECK-NEXT:    .p2align 4, , 16
 ; CHECK-NEXT:  .LBB13_2: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    addi.w $a5, $a0, 0
-; CHECK-NEXT:    blt $a4, $a5, .LBB13_5
+; CHECK-NEXT:    blt $a4, $a0, .LBB13_5
 ; CHECK-NEXT:  # %bb.3: # in Loop: Header=BB13_2 Depth=1
-; CHECK-NEXT:    add.d $a0, $a3, $a0
+; CHECK-NEXT:    add.w $a0, $a3, $a0
 ; CHECK-NEXT:    addi.w $a3, $a3, 1
 ; CHECK-NEXT:    blt $a3, $a1, .LBB13_2
 ; CHECK-NEXT:  .LBB13_4:
@@ -501,6 +781,30 @@ define signext i32 @test14(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:  .LBB13_5:
 ; CHECK-NEXT:    addi.w $a0, $a2, 0
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test14:
+; NORMV:       # %bb.0:
+; NORMV-NEXT:    ori $a2, $zero, 2
+; NORMV-NEXT:    blt $a1, $a2, .LBB13_4
+; NORMV-NEXT:  # %bb.1: # %.preheader
+; NORMV-NEXT:    ori $a3, $zero, 1
+; NORMV-NEXT:    addi.w $a2, $zero, -1
+; NORMV-NEXT:    lu32i.d $a2, 0
+; NORMV-NEXT:    ori $a4, $zero, 1000
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB13_2: # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    addi.w $a5, $a0, 0
+; NORMV-NEXT:    blt $a4, $a5, .LBB13_5
+; NORMV-NEXT:  # %bb.3: # in Loop: Header=BB13_2 Depth=1
+; NORMV-NEXT:    add.d $a0, $a3, $a0
+; NORMV-NEXT:    addi.w $a3, $a3, 1
+; NORMV-NEXT:    blt $a3, $a1, .LBB13_2
+; NORMV-NEXT:  .LBB13_4:
+; NORMV-NEXT:    addi.w $a0, $a0, 0
+; NORMV-NEXT:    ret
+; NORMV-NEXT:  .LBB13_5:
+; NORMV-NEXT:    addi.w $a0, $a2, 0
+; NORMV-NEXT:    ret
   %3 = icmp sgt i32 %1, 1
   br i1 %3, label %4, label %12
 
@@ -545,6 +849,30 @@ define signext i32 @test14b(i32 %0, i32 signext %1) {
 ; CHECK-NEXT:  .LBB14_5:
 ; CHECK-NEXT:    addi.w $a0, $a2, 0
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test14b:
+; NORMV:       # %bb.0:
+; NORMV-NEXT:    ori $a2, $zero, 2
+; NORMV-NEXT:    blt $a1, $a2, .LBB14_4
+; NORMV-NEXT:  # %bb.1: # %.preheader
+; NORMV-NEXT:    ori $a3, $zero, 1
+; NORMV-NEXT:    addi.w $a2, $zero, -1
+; NORMV-NEXT:    lu32i.d $a2, 0
+; NORMV-NEXT:    ori $a4, $zero, 1000
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB14_2: # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    addi.w $a5, $a0, 0
+; NORMV-NEXT:    blt $a4, $a5, .LBB14_5
+; NORMV-NEXT:  # %bb.3: # in Loop: Header=BB14_2 Depth=1
+; NORMV-NEXT:    add.d $a0, $a3, $a0
+; NORMV-NEXT:    addi.w $a3, $a3, 1
+; NORMV-NEXT:    blt $a3, $a1, .LBB14_2
+; NORMV-NEXT:  .LBB14_4:
+; NORMV-NEXT:    addi.w $a0, $a0, 0
+; NORMV-NEXT:    ret
+; NORMV-NEXT:  .LBB14_5:
+; NORMV-NEXT:    addi.w $a0, $a2, 0
+; NORMV-NEXT:    ret
   %3 = icmp sgt i32 %1, 1
   br i1 %3, label %4, label %12
 
@@ -589,6 +917,30 @@ define signext i32 @test14c(i32 zeroext %0, i32 signext %1) {
 ; CHECK-NEXT:  .LBB15_5:
 ; CHECK-NEXT:    addi.w $a0, $a2, 0
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test14c:
+; NORMV:       # %bb.0:
+; NORMV-NEXT:    ori $a2, $zero, 2
+; NORMV-NEXT:    blt $a1, $a2, .LBB15_4
+; NORMV-NEXT:  # %bb.1: # %.preheader
+; NORMV-NEXT:    ori $a3, $zero, 1
+; NORMV-NEXT:    addi.w $a2, $zero, -1
+; NORMV-NEXT:    lu32i.d $a2, 0
+; NORMV-NEXT:    ori $a4, $zero, 1000
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB15_2: # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    addi.w $a5, $a0, 0
+; NORMV-NEXT:    blt $a4, $a5, .LBB15_5
+; NORMV-NEXT:  # %bb.3: # in Loop: Header=BB15_2 Depth=1
+; NORMV-NEXT:    add.d $a0, $a3, $a0
+; NORMV-NEXT:    addi.w $a3, $a3, 1
+; NORMV-NEXT:    blt $a3, $a1, .LBB15_2
+; NORMV-NEXT:  .LBB15_4:
+; NORMV-NEXT:    addi.w $a0, $a0, 0
+; NORMV-NEXT:    ret
+; NORMV-NEXT:  .LBB15_5:
+; NORMV-NEXT:    addi.w $a0, $a2, 0
+; NORMV-NEXT:    ret
   %3 = icmp sgt i32 %1, 1
   br i1 %3, label %4, label %12
 
@@ -621,10 +973,9 @@ define signext i32 @test14d(i31 zeroext %0, i32 signext %1) {
 ; CHECK-NEXT:    ori $a4, $zero, 1000
 ; CHECK-NEXT:    .p2align 4, , 16
 ; CHECK-NEXT:  .LBB16_2: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    addi.w $a5, $a0, 0
-; CHECK-NEXT:    blt $a4, $a5, .LBB16_5
+; CHECK-NEXT:    blt $a4, $a0, .LBB16_5
 ; CHECK-NEXT:  # %bb.3: # in Loop: Header=BB16_2 Depth=1
-; CHECK-NEXT:    add.d $a0, $a3, $a0
+; CHECK-NEXT:    add.w $a0, $a3, $a0
 ; CHECK-NEXT:    addi.w $a3, $a3, 1
 ; CHECK-NEXT:    blt $a3, $a1, .LBB16_2
 ; CHECK-NEXT:  .LBB16_4:
@@ -633,6 +984,30 @@ define signext i32 @test14d(i31 zeroext %0, i32 signext %1) {
 ; CHECK-NEXT:  .LBB16_5:
 ; CHECK-NEXT:    addi.w $a0, $a2, 0
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test14d:
+; NORMV:       # %bb.0:
+; NORMV-NEXT:    ori $a2, $zero, 2
+; NORMV-NEXT:    blt $a1, $a2, .LBB16_4
+; NORMV-NEXT:  # %bb.1: # %.preheader
+; NORMV-NEXT:    ori $a3, $zero, 1
+; NORMV-NEXT:    addi.w $a2, $zero, -1
+; NORMV-NEXT:    lu32i.d $a2, 0
+; NORMV-NEXT:    ori $a4, $zero, 1000
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB16_2: # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    addi.w $a5, $a0, 0
+; NORMV-NEXT:    blt $a4, $a5, .LBB16_5
+; NORMV-NEXT:  # %bb.3: # in Loop: Header=BB16_2 Depth=1
+; NORMV-NEXT:    add.d $a0, $a3, $a0
+; NORMV-NEXT:    addi.w $a3, $a3, 1
+; NORMV-NEXT:    blt $a3, $a1, .LBB16_2
+; NORMV-NEXT:  .LBB16_4:
+; NORMV-NEXT:    addi.w $a0, $a0, 0
+; NORMV-NEXT:    ret
+; NORMV-NEXT:  .LBB16_5:
+; NORMV-NEXT:    addi.w $a0, $a2, 0
+; NORMV-NEXT:    ret
   %zext = zext i31 %0 to i32
   %3 = icmp sgt i32 %1, 1
   br i1 %3, label %4, label %12
@@ -663,13 +1038,28 @@ define signext i32 @test15(i64 %arg1, i64 %arg2, i64 %arg3, ptr %arg4)  {
 ; CHECK-NEXT:  .LBB17_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    andi $a0, $a0, 1234
-; CHECK-NEXT:    add.d $a0, $a0, $a1
+; CHECK-NEXT:    add.w $a0, $a0, $a1
 ; CHECK-NEXT:    addi.d $a2, $a2, 1
 ; CHECK-NEXT:    st.w $a0, $a3, 0
 ; CHECK-NEXT:    bltu $a2, $a4, .LBB17_1
 ; CHECK-NEXT:  # %bb.2: # %bb7
-; CHECK-NEXT:    addi.w $a0, $a0, 0
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test15:
+; NORMV:       # %bb.0: # %entry
+; NORMV-NEXT:    addi.d $a2, $a2, -1
+; NORMV-NEXT:    ori $a4, $zero, 256
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB17_1: # %bb2
+; NORMV-NEXT:    # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    andi $a0, $a0, 1234
+; NORMV-NEXT:    add.d $a0, $a0, $a1
+; NORMV-NEXT:    addi.d $a2, $a2, 1
+; NORMV-NEXT:    st.w $a0, $a3, 0
+; NORMV-NEXT:    bltu $a2, $a4, .LBB17_1
+; NORMV-NEXT:  # %bb.2: # %bb7
+; NORMV-NEXT:    addi.w $a0, $a0, 0
+; NORMV-NEXT:    ret
 entry:
   br label %bb2
 
@@ -738,12 +1128,66 @@ define signext i32 @bug(i32 signext %x) {
 ; CHECK-NEXT:    or $a1, $a2, $a1
 ; CHECK-NEXT:    srai.d $a0, $a0, 31
 ; CHECK-NEXT:    nor $a0, $a0, $zero
-; CHECK-NEXT:    add.d $a0, $a1, $a0
-; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    add.w $a0, $a1, $a0
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB18_2:
-; CHECK-NEXT:    addi.w $a0, $zero, 0
+; CHECK-NEXT:    move $a0, $zero
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: bug:
+; NORMV:       # %bb.0: # %entry
+; NORMV-NEXT:    beqz $a0, .LBB18_2
+; NORMV-NEXT:  # %bb.1: # %if.end
+; NORMV-NEXT:    bstrpick.d $a1, $a0, 31, 16
+; NORMV-NEXT:    sltui $a1, $a1, 1
+; NORMV-NEXT:    slli.d $a2, $a0, 16
+; NORMV-NEXT:    masknez $a0, $a0, $a1
+; NORMV-NEXT:    maskeqz $a2, $a2, $a1
+; NORMV-NEXT:    or $a0, $a2, $a0
+; NORMV-NEXT:    ori $a2, $zero, 32
+; NORMV-NEXT:    masknez $a2, $a2, $a1
+; NORMV-NEXT:    ori $a3, $zero, 16
+; NORMV-NEXT:    maskeqz $a1, $a3, $a1
+; NORMV-NEXT:    or $a1, $a1, $a2
+; NORMV-NEXT:    bstrpick.d $a2, $a0, 31, 24
+; NORMV-NEXT:    sltui $a2, $a2, 1
+; NORMV-NEXT:    slli.d $a3, $a0, 8
+; NORMV-NEXT:    addi.d $a4, $a1, -8
+; NORMV-NEXT:    masknez $a0, $a0, $a2
+; NORMV-NEXT:    maskeqz $a3, $a3, $a2
+; NORMV-NEXT:    or $a0, $a3, $a0
+; NORMV-NEXT:    masknez $a1, $a1, $a2
+; NORMV-NEXT:    maskeqz $a2, $a4, $a2
+; NORMV-NEXT:    or $a1, $a2, $a1
+; NORMV-NEXT:    bstrpick.d $a2, $a0, 31, 28
+; NORMV-NEXT:    sltui $a2, $a2, 1
+; NORMV-NEXT:    slli.d $a3, $a0, 4
+; NORMV-NEXT:    addi.d $a4, $a1, -4
+; NORMV-NEXT:    masknez $a0, $a0, $a2
+; NORMV-NEXT:    maskeqz $a3, $a3, $a2
+; NORMV-NEXT:    or $a0, $a3, $a0
+; NORMV-NEXT:    masknez $a1, $a1, $a2
+; NORMV-NEXT:    maskeqz $a2, $a4, $a2
+; NORMV-NEXT:    or $a1, $a2, $a1
+; NORMV-NEXT:    bstrpick.d $a2, $a0, 31, 30
+; NORMV-NEXT:    sltui $a2, $a2, 1
+; NORMV-NEXT:    slli.d $a3, $a0, 2
+; NORMV-NEXT:    addi.d $a4, $a1, -2
+; NORMV-NEXT:    masknez $a0, $a0, $a2
+; NORMV-NEXT:    maskeqz $a3, $a3, $a2
+; NORMV-NEXT:    or $a0, $a3, $a0
+; NORMV-NEXT:    addi.w $a0, $a0, 0
+; NORMV-NEXT:    masknez $a1, $a1, $a2
+; NORMV-NEXT:    maskeqz $a2, $a4, $a2
+; NORMV-NEXT:    or $a1, $a2, $a1
+; NORMV-NEXT:    srai.d $a0, $a0, 31
+; NORMV-NEXT:    nor $a0, $a0, $zero
+; NORMV-NEXT:    add.d $a0, $a1, $a0
+; NORMV-NEXT:    addi.w $a0, $a0, 0
+; NORMV-NEXT:    ret
+; NORMV-NEXT:  .LBB18_2:
+; NORMV-NEXT:    addi.w $a0, $zero, 0
+; NORMV-NEXT:    ret
 entry:
   %tobool.not = icmp eq i32 %x, 0
   br i1 %tobool.not, label %cleanup, label %if.end
@@ -791,7 +1235,7 @@ define void @test16(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:    .p2align 4, , 16
 ; CHECK-NEXT:  .LBB19_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    addi.w $a0, $s0, 0
+; CHECK-NEXT:    move $a0, $s0
 ; CHECK-NEXT:    bl %plt(bar)
 ; CHECK-NEXT:    sll.w $s0, $s0, $fp
 ; CHECK-NEXT:    bnez $a0, .LBB19_1
@@ -801,6 +1245,29 @@ define void @test16(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 32
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test16:
+; NORMV:       # %bb.0: # %bb
+; NORMV-NEXT:    addi.d $sp, $sp, -32
+; NORMV-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
+; NORMV-NEXT:    move $fp, $a1
+; NORMV-NEXT:    bl %plt(bar)
+; NORMV-NEXT:    move $s0, $a0
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB19_1: # %bb2
+; NORMV-NEXT:    # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    addi.w $a0, $s0, 0
+; NORMV-NEXT:    bl %plt(bar)
+; NORMV-NEXT:    sll.w $s0, $s0, $fp
+; NORMV-NEXT:    bnez $a0, .LBB19_1
+; NORMV-NEXT:  # %bb.2: # %bb7
+; NORMV-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; NORMV-NEXT:    addi.d $sp, $sp, 32
+; NORMV-NEXT:    ret
 bb:
   %i = call signext i32 @bar(i32 signext %arg)
   br label %bb2
@@ -829,7 +1296,7 @@ define void @test17(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:    .p2align 4, , 16
 ; CHECK-NEXT:  .LBB20_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    addi.w $a0, $s0, 0
+; CHECK-NEXT:    move $a0, $s0
 ; CHECK-NEXT:    bl %plt(bar)
 ; CHECK-NEXT:    sll.w $s0, $s0, $fp
 ; CHECK-NEXT:    bnez $a0, .LBB20_1
@@ -839,6 +1306,29 @@ define void @test17(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 32
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test17:
+; NORMV:       # %bb.0: # %bb
+; NORMV-NEXT:    addi.d $sp, $sp, -32
+; NORMV-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
+; NORMV-NEXT:    move $fp, $a1
+; NORMV-NEXT:    bl %plt(bat)
+; NORMV-NEXT:    move $s0, $a0
+; NORMV-NEXT:    .p2align 4, , 16
+; NORMV-NEXT:  .LBB20_1: # %bb2
+; NORMV-NEXT:    # =>This Inner Loop Header: Depth=1
+; NORMV-NEXT:    addi.w $a0, $s0, 0
+; NORMV-NEXT:    bl %plt(bar)
+; NORMV-NEXT:    sll.w $s0, $s0, $fp
+; NORMV-NEXT:    bnez $a0, .LBB20_1
+; NORMV-NEXT:  # %bb.2: # %bb7
+; NORMV-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; NORMV-NEXT:    addi.d $sp, $sp, 32
+; NORMV-NEXT:    ret
 bb:
   %i = call zeroext i16 @bat(i32 signext %arg)
   %zext = zext i16 %i to i32
@@ -866,6 +1356,16 @@ define signext i32 @sextw_sh2add(i1 zeroext %0, ptr %1, i32 signext %2, i32 sign
 ; CHECK-NEXT:  .LBB21_2:
 ; CHECK-NEXT:    add.w $a0, $a2, $a4
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: sextw_sh2add:
+; NORMV:       # %bb.0:
+; NORMV-NEXT:    alsl.d $a2, $a2, $a3, 2
+; NORMV-NEXT:    beqz $a0, .LBB21_2
+; NORMV-NEXT:  # %bb.1:
+; NORMV-NEXT:    st.w $a2, $a1, 0
+; NORMV-NEXT:  .LBB21_2:
+; NORMV-NEXT:    add.w $a0, $a2, $a4
+; NORMV-NEXT:    ret
   %6 = shl i32 %2, 2
   %7 = add i32 %6, %3
   br i1 %0, label %8, label %9
@@ -901,6 +1401,28 @@ define signext i32 @test19(i64 %arg, i1 zeroext %c1, i1 zeroext %c2, ptr %p) nou
 ; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
 ; CHECK-NEXT:    ret
+;
+; NORMV-LABEL: test19:
+; NORMV:       # %bb.0: # %bb
+; NORMV-NEXT:    addi.d $sp, $sp, -16
+; NORMV-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; NORMV-NEXT:    st.d $fp, $sp, 0 # 8-byte Folded Spill
+; NORMV-NEXT:    ori $a0, $zero, 35
+; NORMV-NEXT:    lu32i.d $a0, 1
+; NORMV-NEXT:    maskeqz $fp, $a0, $a1
+; NORMV-NEXT:    st.d $fp, $a3, 0
+; NORMV-NEXT:    beqz $a2, .LBB22_2
+; NORMV-NEXT:  # %bb.1: # %bb2
+; NORMV-NEXT:    move $a0, $zero
+; NORMV-NEXT:    bl %plt(bar)
+; NORMV-NEXT:    move $fp, $a0
+; NORMV-NEXT:  .LBB22_2: # %bb7
+; NORMV-NEXT:    bl %plt(side_effect)
+; NORMV-NEXT:    addi.w $a0, $fp, 0
+; NORMV-NEXT:    ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; NORMV-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; NORMV-NEXT:    addi.d $sp, $sp, 16
+; NORMV-NEXT:    ret
 bb:
   %sel = select i1 %c1, i64 4294967331, i64 0
   store i64 %sel, ptr %p, align 8
diff --git a/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll b/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll
index fcc8dd3e7662..4b60256751fc 100644
--- a/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll
+++ b/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll
@@ -77,7 +77,7 @@ define i32 @test5(i32 %A) nounwind {
 ; CHECK-NEXT:    suba.l #12, %sp
 ; CHECK-NEXT:    move.l #1577682821, (4,%sp)
 ; CHECK-NEXT:    move.l (16,%sp), (%sp)
-; CHECK-NEXT:    jsr __udivsi3@PLT
+; CHECK-NEXT:    jsr __udivsi3
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
   %tmp1 = udiv i32 %A, 1577682821         ; <i32> [#uses=1]
@@ -114,7 +114,7 @@ define i32 @test7(i32 %x) nounwind {
 ; CHECK-NEXT:    suba.l #12, %sp
 ; CHECK-NEXT:    move.l #28, (4,%sp)
 ; CHECK-NEXT:    move.l (16,%sp), (%sp)
-; CHECK-NEXT:    jsr __udivsi3@PLT
+; CHECK-NEXT:    jsr __udivsi3
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
   %div = udiv i32 %x, 28
@@ -178,7 +178,7 @@ define i32 @testsize2(i32 %x) minsize nounwind {
 ; CHECK-NEXT:    suba.l #12, %sp
 ; CHECK-NEXT:    move.l #33, (4,%sp)
 ; CHECK-NEXT:    move.l (16,%sp), (%sp)
-; CHECK-NEXT:    jsr __divsi3@PLT
+; CHECK-NEXT:    jsr __divsi3
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
 entry:
@@ -203,7 +203,7 @@ define i32 @testsize4(i32 %x) minsize nounwind {
 ; CHECK-NEXT:    suba.l #12, %sp
 ; CHECK-NEXT:    move.l #33, (4,%sp)
 ; CHECK-NEXT:    move.l (16,%sp), (%sp)
-; CHECK-NEXT:    jsr __udivsi3@PLT
+; CHECK-NEXT:    jsr __udivsi3
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
 entry:
diff --git a/llvm/test/CodeGen/M68k/Arith/imul.ll b/llvm/test/CodeGen/M68k/Arith/imul.ll
index a1846e4d51bd..f68188537339 100644
--- a/llvm/test/CodeGen/M68k/Arith/imul.ll
+++ b/llvm/test/CodeGen/M68k/Arith/imul.ll
@@ -116,7 +116,7 @@ define i32 @mul_32(i32 %a, i32 %b) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset -16
 ; CHECK-NEXT:    move.l (20,%sp), (4,%sp)
 ; CHECK-NEXT:    move.l (16,%sp), (%sp)
-; CHECK-NEXT:    jsr __mulsi3@PLT
+; CHECK-NEXT:    jsr __mulsi3
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
     %mul = mul i32 %a, %b
@@ -162,7 +162,7 @@ define i64 @mul_64(i64 %a, i64 %b) {
 ; CHECK-NEXT:    move.l (32,%sp), (8,%sp)
 ; CHECK-NEXT:    move.l (28,%sp), (4,%sp)
 ; CHECK-NEXT:    move.l (24,%sp), (%sp)
-; CHECK-NEXT:    jsr __muldi3@PLT
+; CHECK-NEXT:    jsr __muldi3
 ; CHECK-NEXT:    adda.l #20, %sp
 ; CHECK-NEXT:    rts
     %mul = mul i64 %a, %b
@@ -179,7 +179,7 @@ define i64 @mul3_64(i64 %A) {
 ; CHECK-NEXT:    move.l #0, (8,%sp)
 ; CHECK-NEXT:    move.l (28,%sp), (4,%sp)
 ; CHECK-NEXT:    move.l (24,%sp), (%sp)
-; CHECK-NEXT:    jsr __muldi3@PLT
+; CHECK-NEXT:    jsr __muldi3
 ; CHECK-NEXT:    adda.l #20, %sp
 ; CHECK-NEXT:    rts
     %mul = mul i64 %A, 3
@@ -196,7 +196,7 @@ define i64 @mul40_64(i64 %A) {
 ; CHECK-NEXT:    move.l #0, (8,%sp)
 ; CHECK-NEXT:    move.l (28,%sp), (4,%sp)
 ; CHECK-NEXT:    move.l (24,%sp), (%sp)
-; CHECK-NEXT:    jsr __muldi3@PLT
+; CHECK-NEXT:    jsr __muldi3
 ; CHECK-NEXT:    adda.l #20, %sp
 ; CHECK-NEXT:    rts
     %mul = mul i64 %A, 40
diff --git a/llvm/test/CodeGen/M68k/Arith/mul64.ll b/llvm/test/CodeGen/M68k/Arith/mul64.ll
index f6228d4c6347..12967025ab46 100644
--- a/llvm/test/CodeGen/M68k/Arith/mul64.ll
+++ b/llvm/test/CodeGen/M68k/Arith/mul64.ll
@@ -11,7 +11,7 @@ define i64 @foo(i64 %t, i64 %u) nounwind {
 ; CHECK-NEXT:    move.l (32,%sp), (8,%sp)
 ; CHECK-NEXT:    move.l (28,%sp), (4,%sp)
 ; CHECK-NEXT:    move.l (24,%sp), (%sp)
-; CHECK-NEXT:    jsr __muldi3@PLT
+; CHECK-NEXT:    jsr __muldi3
 ; CHECK-NEXT:    adda.l #20, %sp
 ; CHECK-NEXT:    rts
   %k = mul i64 %t, %u
diff --git a/llvm/test/CodeGen/M68k/Arith/sdiv-exact.ll b/llvm/test/CodeGen/M68k/Arith/sdiv-exact.ll
index bb6b4acc034d..96cc8b237202 100644
--- a/llvm/test/CodeGen/M68k/Arith/sdiv-exact.ll
+++ b/llvm/test/CodeGen/M68k/Arith/sdiv-exact.ll
@@ -9,7 +9,7 @@ define i32 @test1(i32 %x) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset -16
 ; CHECK-NEXT:    move.l #-1030792151, (4,%sp)
 ; CHECK-NEXT:    move.l (16,%sp), (%sp)
-; CHECK-NEXT:    jsr __mulsi3@PLT
+; CHECK-NEXT:    jsr __mulsi3
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
   %div = sdiv exact i32 %x, 25
@@ -26,7 +26,7 @@ define i32 @test2(i32 %x) {
 ; CHECK-NEXT:    asr.l #3, %d0
 ; CHECK-NEXT:    move.l %d0, (%sp)
 ; CHECK-NEXT:    move.l #-1431655765, (4,%sp)
-; CHECK-NEXT:    jsr __mulsi3@PLT
+; CHECK-NEXT:    jsr __mulsi3
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
   %div = sdiv exact i32 %x, 24
diff --git a/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll
index 10a797f13441..423431750f20 100644
--- a/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll
@@ -69,7 +69,7 @@ define fastcc i1 @test1(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:  ; %bb.2: ; %overflow
 ; CHECK-NEXT:    lea (no,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
-; CHECK-NEXT:    jsr printf@PLT
+; CHECK-NEXT:    jsr printf
 ; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
@@ -77,7 +77,7 @@ define fastcc i1 @test1(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:    move.l %d0, (4,%sp)
 ; CHECK-NEXT:    lea (ok,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
-; CHECK-NEXT:    jsr printf@PLT
+; CHECK-NEXT:    jsr printf
 ; CHECK-NEXT:    moveq #1, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
@@ -107,7 +107,7 @@ define fastcc i1 @test2(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:  ; %bb.1: ; %overflow
 ; CHECK-NEXT:    lea (no,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
-; CHECK-NEXT:    jsr printf@PLT
+; CHECK-NEXT:    jsr printf
 ; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
@@ -115,7 +115,7 @@ define fastcc i1 @test2(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:    move.l %d0, (4,%sp)
 ; CHECK-NEXT:    lea (ok,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
-; CHECK-NEXT:    jsr printf@PLT
+; CHECK-NEXT:    jsr printf
 ; CHECK-NEXT:    moveq #1, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
diff --git a/llvm/test/CodeGen/M68k/Arith/sub-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/sub-with-overflow.ll
index be3223156986..85a1e35f1a69 100644
--- a/llvm/test/CodeGen/M68k/Arith/sub-with-overflow.ll
+++ b/llvm/test/CodeGen/M68k/Arith/sub-with-overflow.ll
@@ -18,7 +18,7 @@ define i1 @func1(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:  ; %bb.2: ; %overflow
 ; CHECK-NEXT:    lea (no,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
-; CHECK-NEXT:    jsr printf@PLT
+; CHECK-NEXT:    jsr printf
 ; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
@@ -26,7 +26,7 @@ define i1 @func1(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:    move.l %d0, (4,%sp)
 ; CHECK-NEXT:    lea (ok,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
-; CHECK-NEXT:    jsr printf@PLT
+; CHECK-NEXT:    jsr printf
 ; CHECK-NEXT:    moveq #1, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
@@ -55,7 +55,7 @@ define i1 @func2(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:  ; %bb.2: ; %carry
 ; CHECK-NEXT:    lea (no,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
-; CHECK-NEXT:    jsr printf@PLT
+; CHECK-NEXT:    jsr printf
 ; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
@@ -63,7 +63,7 @@ define i1 @func2(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:    move.l %d0, (4,%sp)
 ; CHECK-NEXT:    lea (ok,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
-; CHECK-NEXT:    jsr printf@PLT
+; CHECK-NEXT:    jsr printf
 ; CHECK-NEXT:    moveq #1, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
diff --git a/llvm/test/CodeGen/M68k/Atomics/cmpxchg.ll b/llvm/test/CodeGen/M68k/Atomics/cmpxchg.ll
index b018ea4af4aa..42c0a333fa1b 100644
--- a/llvm/test/CodeGen/M68k/Atomics/cmpxchg.ll
+++ b/llvm/test/CodeGen/M68k/Atomics/cmpxchg.ll
@@ -18,7 +18,7 @@ define i1 @cmpxchg_i8_monotonic_monotonic(i8 %cmp, i8 %new, ptr %mem) nounwind {
 ; NO-ATOMIC-NEXT:    and.l #255, %d0
 ; NO-ATOMIC-NEXT:    move.l %d0, (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (32,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __sync_val_compare_and_swap_1@PLT
+; NO-ATOMIC-NEXT:    jsr __sync_val_compare_and_swap_1
 ; NO-ATOMIC-NEXT:    sub.b %d2, %d0
 ; NO-ATOMIC-NEXT:    seq %d0
 ; NO-ATOMIC-NEXT:    movem.l (16,%sp), %d2 ; 8-byte Folded Reload
@@ -55,7 +55,7 @@ define i16 @cmpxchg_i16_release_monotonic(i16 %cmp, i16 %new, ptr %mem) nounwind
 ; NO-ATOMIC-NEXT:    and.l #65535, %d0
 ; NO-ATOMIC-NEXT:    move.l %d0, (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (24,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __sync_val_compare_and_swap_2@PLT
+; NO-ATOMIC-NEXT:    jsr __sync_val_compare_and_swap_2
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -78,7 +78,7 @@ define i32 @cmpxchg_i32_release_acquire(i32 %cmp, i32 %new, ptr %mem) nounwind {
 ; NO-ATOMIC-NEXT:    move.l (20,%sp), (8,%sp)
 ; NO-ATOMIC-NEXT:    move.l (16,%sp), (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (24,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __sync_val_compare_and_swap_4@PLT
+; NO-ATOMIC-NEXT:    jsr __sync_val_compare_and_swap_4
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -107,7 +107,7 @@ define i64 @cmpxchg_i64_seqcst_seqcst(i64 %cmp, i64 %new, ptr %mem) nounwind {
 ; NO-ATOMIC-NEXT:    move.l (52,%sp), (12,%sp)
 ; NO-ATOMIC-NEXT:    move.l (48,%sp), (8,%sp)
 ; NO-ATOMIC-NEXT:    move.l (56,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __atomic_compare_exchange_8@PLT
+; NO-ATOMIC-NEXT:    jsr __atomic_compare_exchange_8
 ; NO-ATOMIC-NEXT:    move.l (28,%sp), %d1
 ; NO-ATOMIC-NEXT:    move.l (24,%sp), %d0
 ; NO-ATOMIC-NEXT:    adda.l #36, %sp
@@ -125,7 +125,7 @@ define i64 @cmpxchg_i64_seqcst_seqcst(i64 %cmp, i64 %new, ptr %mem) nounwind {
 ; ATOMIC-NEXT:    move.l (52,%sp), (12,%sp)
 ; ATOMIC-NEXT:    move.l (48,%sp), (8,%sp)
 ; ATOMIC-NEXT:    move.l (56,%sp), (%sp)
-; ATOMIC-NEXT:    jsr __atomic_compare_exchange_8@PLT
+; ATOMIC-NEXT:    jsr __atomic_compare_exchange_8
 ; ATOMIC-NEXT:    move.l (28,%sp), %d1
 ; ATOMIC-NEXT:    move.l (24,%sp), %d0
 ; ATOMIC-NEXT:    adda.l #36, %sp
diff --git a/llvm/test/CodeGen/M68k/Atomics/load-store.ll b/llvm/test/CodeGen/M68k/Atomics/load-store.ll
index b238172c2f12..23fdfad05cab 100644
--- a/llvm/test/CodeGen/M68k/Atomics/load-store.ll
+++ b/llvm/test/CodeGen/M68k/Atomics/load-store.ll
@@ -203,7 +203,7 @@ define i64 @atomic_load_i64_unordered(ptr %a) nounwind {
 ; NO-ATOMIC-NEXT:    suba.l #12, %sp
 ; NO-ATOMIC-NEXT:    move.l #0, (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (16,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __atomic_load_8@PLT
+; NO-ATOMIC-NEXT:    jsr __atomic_load_8
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -212,7 +212,7 @@ define i64 @atomic_load_i64_unordered(ptr %a) nounwind {
 ; ATOMIC-NEXT:    suba.l #12, %sp
 ; ATOMIC-NEXT:    move.l #0, (4,%sp)
 ; ATOMIC-NEXT:    move.l (16,%sp), (%sp)
-; ATOMIC-NEXT:    jsr __atomic_load_8@PLT
+; ATOMIC-NEXT:    jsr __atomic_load_8
 ; ATOMIC-NEXT:    adda.l #12, %sp
 ; ATOMIC-NEXT:    rts
   %1 = load atomic i64, ptr %a unordered, align 8
@@ -225,7 +225,7 @@ define i64 @atomic_load_i64_monotonic(ptr %a) nounwind {
 ; NO-ATOMIC-NEXT:    suba.l #12, %sp
 ; NO-ATOMIC-NEXT:    move.l #0, (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (16,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __atomic_load_8@PLT
+; NO-ATOMIC-NEXT:    jsr __atomic_load_8
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -234,7 +234,7 @@ define i64 @atomic_load_i64_monotonic(ptr %a) nounwind {
 ; ATOMIC-NEXT:    suba.l #12, %sp
 ; ATOMIC-NEXT:    move.l #0, (4,%sp)
 ; ATOMIC-NEXT:    move.l (16,%sp), (%sp)
-; ATOMIC-NEXT:    jsr __atomic_load_8@PLT
+; ATOMIC-NEXT:    jsr __atomic_load_8
 ; ATOMIC-NEXT:    adda.l #12, %sp
 ; ATOMIC-NEXT:    rts
   %1 = load atomic i64, ptr %a monotonic, align 8
@@ -247,7 +247,7 @@ define i64 @atomic_load_i64_acquire(ptr %a) nounwind {
 ; NO-ATOMIC-NEXT:    suba.l #12, %sp
 ; NO-ATOMIC-NEXT:    move.l #2, (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (16,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __atomic_load_8@PLT
+; NO-ATOMIC-NEXT:    jsr __atomic_load_8
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -256,7 +256,7 @@ define i64 @atomic_load_i64_acquire(ptr %a) nounwind {
 ; ATOMIC-NEXT:    suba.l #12, %sp
 ; ATOMIC-NEXT:    move.l #2, (4,%sp)
 ; ATOMIC-NEXT:    move.l (16,%sp), (%sp)
-; ATOMIC-NEXT:    jsr __atomic_load_8@PLT
+; ATOMIC-NEXT:    jsr __atomic_load_8
 ; ATOMIC-NEXT:    adda.l #12, %sp
 ; ATOMIC-NEXT:    rts
   %1 = load atomic i64, ptr %a acquire, align 8
@@ -269,7 +269,7 @@ define i64 @atomic_load_i64_seq_cst(ptr %a) nounwind {
 ; NO-ATOMIC-NEXT:    suba.l #12, %sp
 ; NO-ATOMIC-NEXT:    move.l #5, (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (16,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __atomic_load_8@PLT
+; NO-ATOMIC-NEXT:    jsr __atomic_load_8
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -278,7 +278,7 @@ define i64 @atomic_load_i64_seq_cst(ptr %a) nounwind {
 ; ATOMIC-NEXT:    suba.l #12, %sp
 ; ATOMIC-NEXT:    move.l #5, (4,%sp)
 ; ATOMIC-NEXT:    move.l (16,%sp), (%sp)
-; ATOMIC-NEXT:    jsr __atomic_load_8@PLT
+; ATOMIC-NEXT:    jsr __atomic_load_8
 ; ATOMIC-NEXT:    adda.l #12, %sp
 ; ATOMIC-NEXT:    rts
   %1 = load atomic i64, ptr %a seq_cst, align 8
@@ -509,7 +509,7 @@ define void @atomic_store_i64_unordered(ptr %a, i64 %val) nounwind {
 ; NO-ATOMIC-NEXT:    move.l (32,%sp), (8,%sp)
 ; NO-ATOMIC-NEXT:    move.l (28,%sp), (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (24,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __atomic_store_8@PLT
+; NO-ATOMIC-NEXT:    jsr __atomic_store_8
 ; NO-ATOMIC-NEXT:    adda.l #20, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -520,7 +520,7 @@ define void @atomic_store_i64_unordered(ptr %a, i64 %val) nounwind {
 ; ATOMIC-NEXT:    move.l (32,%sp), (8,%sp)
 ; ATOMIC-NEXT:    move.l (28,%sp), (4,%sp)
 ; ATOMIC-NEXT:    move.l (24,%sp), (%sp)
-; ATOMIC-NEXT:    jsr __atomic_store_8@PLT
+; ATOMIC-NEXT:    jsr __atomic_store_8
 ; ATOMIC-NEXT:    adda.l #20, %sp
 ; ATOMIC-NEXT:    rts
   store atomic i64 %val, ptr %a unordered, align 8
@@ -535,7 +535,7 @@ define void @atomic_store_i64_monotonic(ptr %a, i64 %val) nounwind {
 ; NO-ATOMIC-NEXT:    move.l (32,%sp), (8,%sp)
 ; NO-ATOMIC-NEXT:    move.l (28,%sp), (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (24,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __atomic_store_8@PLT
+; NO-ATOMIC-NEXT:    jsr __atomic_store_8
 ; NO-ATOMIC-NEXT:    adda.l #20, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -546,7 +546,7 @@ define void @atomic_store_i64_monotonic(ptr %a, i64 %val) nounwind {
 ; ATOMIC-NEXT:    move.l (32,%sp), (8,%sp)
 ; ATOMIC-NEXT:    move.l (28,%sp), (4,%sp)
 ; ATOMIC-NEXT:    move.l (24,%sp), (%sp)
-; ATOMIC-NEXT:    jsr __atomic_store_8@PLT
+; ATOMIC-NEXT:    jsr __atomic_store_8
 ; ATOMIC-NEXT:    adda.l #20, %sp
 ; ATOMIC-NEXT:    rts
   store atomic i64 %val, ptr %a monotonic, align 8
@@ -561,7 +561,7 @@ define void @atomic_store_i64_release(ptr %a, i64 %val) nounwind {
 ; NO-ATOMIC-NEXT:    move.l (32,%sp), (8,%sp)
 ; NO-ATOMIC-NEXT:    move.l (28,%sp), (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (24,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __atomic_store_8@PLT
+; NO-ATOMIC-NEXT:    jsr __atomic_store_8
 ; NO-ATOMIC-NEXT:    adda.l #20, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -572,7 +572,7 @@ define void @atomic_store_i64_release(ptr %a, i64 %val) nounwind {
 ; ATOMIC-NEXT:    move.l (32,%sp), (8,%sp)
 ; ATOMIC-NEXT:    move.l (28,%sp), (4,%sp)
 ; ATOMIC-NEXT:    move.l (24,%sp), (%sp)
-; ATOMIC-NEXT:    jsr __atomic_store_8@PLT
+; ATOMIC-NEXT:    jsr __atomic_store_8
 ; ATOMIC-NEXT:    adda.l #20, %sp
 ; ATOMIC-NEXT:    rts
   store atomic i64 %val, ptr %a release, align 8
@@ -587,7 +587,7 @@ define void @atomic_store_i64_seq_cst(ptr %a, i64 %val) nounwind {
 ; NO-ATOMIC-NEXT:    move.l (32,%sp), (8,%sp)
 ; NO-ATOMIC-NEXT:    move.l (28,%sp), (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (24,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __atomic_store_8@PLT
+; NO-ATOMIC-NEXT:    jsr __atomic_store_8
 ; NO-ATOMIC-NEXT:    adda.l #20, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -598,7 +598,7 @@ define void @atomic_store_i64_seq_cst(ptr %a, i64 %val) nounwind {
 ; ATOMIC-NEXT:    move.l (32,%sp), (8,%sp)
 ; ATOMIC-NEXT:    move.l (28,%sp), (4,%sp)
 ; ATOMIC-NEXT:    move.l (24,%sp), (%sp)
-; ATOMIC-NEXT:    jsr __atomic_store_8@PLT
+; ATOMIC-NEXT:    jsr __atomic_store_8
 ; ATOMIC-NEXT:    adda.l #20, %sp
 ; ATOMIC-NEXT:    rts
   store atomic i64 %val, ptr %a seq_cst, align 8
diff --git a/llvm/test/CodeGen/M68k/Atomics/rmw.ll b/llvm/test/CodeGen/M68k/Atomics/rmw.ll
index 1036a0a8ba3d..ce456f0960ee 100644
--- a/llvm/test/CodeGen/M68k/Atomics/rmw.ll
+++ b/llvm/test/CodeGen/M68k/Atomics/rmw.ll
@@ -15,7 +15,7 @@ define i8 @atomicrmw_add_i8(i8 %val, ptr %ptr) {
 ; NO-ATOMIC-NEXT:    and.l #255, %d0
 ; NO-ATOMIC-NEXT:    move.l %d0, (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (20,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_add_1@PLT
+; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_add_1
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -58,7 +58,7 @@ define i16 @atomicrmw_sub_i16(i16 %val, ptr %ptr) {
 ; NO-ATOMIC-NEXT:    and.l #65535, %d0
 ; NO-ATOMIC-NEXT:    move.l %d0, (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (20,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_sub_2@PLT
+; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_sub_2
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -99,7 +99,7 @@ define i32 @atomicrmw_and_i32(i32 %val, ptr %ptr) {
 ; NO-ATOMIC-NEXT:    .cfi_def_cfa_offset -16
 ; NO-ATOMIC-NEXT:    move.l (16,%sp), (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (20,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_and_4@PLT
+; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_and_4
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -142,7 +142,7 @@ define i64 @atomicrmw_xor_i64(i64 %val, ptr %ptr) {
 ; NO-ATOMIC-NEXT:    move.l (28,%sp), (8,%sp)
 ; NO-ATOMIC-NEXT:    move.l (24,%sp), (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (32,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __atomic_fetch_xor_8@PLT
+; NO-ATOMIC-NEXT:    jsr __atomic_fetch_xor_8
 ; NO-ATOMIC-NEXT:    adda.l #20, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -155,7 +155,7 @@ define i64 @atomicrmw_xor_i64(i64 %val, ptr %ptr) {
 ; ATOMIC-NEXT:    move.l (28,%sp), (8,%sp)
 ; ATOMIC-NEXT:    move.l (24,%sp), (4,%sp)
 ; ATOMIC-NEXT:    move.l (32,%sp), (%sp)
-; ATOMIC-NEXT:    jsr __atomic_fetch_xor_8@PLT
+; ATOMIC-NEXT:    jsr __atomic_fetch_xor_8
 ; ATOMIC-NEXT:    adda.l #20, %sp
 ; ATOMIC-NEXT:    rts
   %old = atomicrmw xor ptr %ptr, i64 %val release
@@ -172,7 +172,7 @@ define i8 @atomicrmw_or_i8(i8 %val, ptr %ptr) {
 ; NO-ATOMIC-NEXT:    and.l #255, %d0
 ; NO-ATOMIC-NEXT:    move.l %d0, (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (20,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_or_1@PLT
+; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_or_1
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -217,7 +217,7 @@ define i16 @atmoicrmw_nand_i16(i16 %val, ptr %ptr) {
 ; NO-ATOMIC-NEXT:    and.l #65535, %d0
 ; NO-ATOMIC-NEXT:    move.l %d0, (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (20,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_nand_2@PLT
+; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_nand_2
 ; NO-ATOMIC-NEXT:    move.w %d2, %d0
 ; NO-ATOMIC-NEXT:    movem.l (8,%sp), %d2 ; 8-byte Folded Reload
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
@@ -261,7 +261,7 @@ define i32 @atomicrmw_min_i32(i32 %val, ptr %ptr) {
 ; NO-ATOMIC-NEXT:    .cfi_def_cfa_offset -16
 ; NO-ATOMIC-NEXT:    move.l (16,%sp), (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (20,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_min_4@PLT
+; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_min_4
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -323,7 +323,7 @@ define i64 @atomicrmw_max_i64(i64 %val, ptr %ptr) {
 ; NO-ATOMIC-NEXT:    move.l %d0, (8,%sp)
 ; NO-ATOMIC-NEXT:    move.l #5, (20,%sp)
 ; NO-ATOMIC-NEXT:    move.l #5, (16,%sp)
-; NO-ATOMIC-NEXT:    jsr __atomic_compare_exchange_8@PLT
+; NO-ATOMIC-NEXT:    jsr __atomic_compare_exchange_8
 ; NO-ATOMIC-NEXT:    move.b %d0, %d2
 ; NO-ATOMIC-NEXT:    move.l (28,%sp), %d1
 ; NO-ATOMIC-NEXT:    move.l (24,%sp), %d0
@@ -371,7 +371,7 @@ define i64 @atomicrmw_max_i64(i64 %val, ptr %ptr) {
 ; ATOMIC-NEXT:    move.l %d0, (8,%sp)
 ; ATOMIC-NEXT:    move.l #5, (20,%sp)
 ; ATOMIC-NEXT:    move.l #5, (16,%sp)
-; ATOMIC-NEXT:    jsr __atomic_compare_exchange_8@PLT
+; ATOMIC-NEXT:    jsr __atomic_compare_exchange_8
 ; ATOMIC-NEXT:    move.b %d0, %d2
 ; ATOMIC-NEXT:    move.l (28,%sp), %d1
 ; ATOMIC-NEXT:    move.l (24,%sp), %d0
@@ -413,7 +413,7 @@ define i8 @atomicrmw_i8_umin(i8 %val, ptr %ptr) {
 ; NO-ATOMIC-NEXT:    and.l #255, %d0
 ; NO-ATOMIC-NEXT:    move.l %d0, (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (20,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_umin_1@PLT
+; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_umin_1
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -465,7 +465,7 @@ define i16 @atomicrmw_umax_i16(i16 %val, ptr %ptr) {
 ; NO-ATOMIC-NEXT:    and.l #65535, %d0
 ; NO-ATOMIC-NEXT:    move.l %d0, (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (20,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_umax_2@PLT
+; NO-ATOMIC-NEXT:    jsr __sync_fetch_and_umax_2
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -517,7 +517,7 @@ define i16 @atomicrmw_xchg_i16(i16 %val, ptr %ptr) {
 ; NO-ATOMIC-NEXT:    and.l #65535, %d0
 ; NO-ATOMIC-NEXT:    move.l %d0, (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (20,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __sync_lock_test_and_set_2@PLT
+; NO-ATOMIC-NEXT:    jsr __sync_lock_test_and_set_2
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
@@ -557,7 +557,7 @@ define i32 @atomicrmw_xchg_i32(i32 %val, ptr %ptr) {
 ; NO-ATOMIC-NEXT:    .cfi_def_cfa_offset -16
 ; NO-ATOMIC-NEXT:    move.l (16,%sp), (4,%sp)
 ; NO-ATOMIC-NEXT:    move.l (20,%sp), (%sp)
-; NO-ATOMIC-NEXT:    jsr __sync_lock_test_and_set_4@PLT
+; NO-ATOMIC-NEXT:    jsr __sync_lock_test_and_set_4
 ; NO-ATOMIC-NEXT:    adda.l #12, %sp
 ; NO-ATOMIC-NEXT:    rts
 ;
diff --git a/llvm/test/CodeGen/M68k/CodeModel/medium-static.ll b/llvm/test/CodeGen/M68k/CodeModel/medium-static.ll
index 87d8380d6cc9..79512dcc8639 100644
--- a/llvm/test/CodeGen/M68k/CodeModel/medium-static.ll
+++ b/llvm/test/CodeGen/M68k/CodeModel/medium-static.ll
@@ -44,7 +44,7 @@ define void @test2() nounwind {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    suba.l #4, %sp
 ; CHECK-NEXT:    move.l #40, (%sp)
-; CHECK-NEXT:    jsr malloc@PLT
+; CHECK-NEXT:    jsr malloc
 ; CHECK-NEXT:    adda.l #4, %sp
 ; CHECK-NEXT:    rts
 entry:
@@ -60,7 +60,7 @@ define void @test3() nounwind {
 ; CHECK-LABEL: test3:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    suba.l #4, %sp
-; CHECK-NEXT:    jsr afoo@PLT
+; CHECK-NEXT:    jsr afoo
 ; CHECK-NEXT:    move.l %a0, pfoo
 ; CHECK-NEXT:    jsr (%a0)
 ; CHECK-NEXT:    adda.l #4, %sp
@@ -79,7 +79,7 @@ define void @test4() nounwind {
 ; CHECK-LABEL: test4:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    suba.l #4, %sp
-; CHECK-NEXT:    jsr foo@PLT
+; CHECK-NEXT:    jsr foo
 ; CHECK-NEXT:    adda.l #4, %sp
 ; CHECK-NEXT:    rts
 entry:
@@ -118,17 +118,17 @@ define void @test7(i32 %n.u) nounwind {
 ; CHECK-NEXT:    move.l (0,%a0,%d0), %a0
 ; CHECK-NEXT:    jmp (%a0)
 ; CHECK-NEXT:  .LBB6_12: ; %bb2
-; CHECK-NEXT:    bra foo6@PLT ; TAILCALL
+; CHECK-NEXT:    bra foo6 ; TAILCALL
 ; CHECK-NEXT:  .LBB6_3: ; %bb5
-; CHECK-NEXT:    bra foo5@PLT ; TAILCALL
+; CHECK-NEXT:    bra foo5 ; TAILCALL
 ; CHECK-NEXT:  .LBB6_5: ; %bb1
-; CHECK-NEXT:    bra foo2@PLT ; TAILCALL
+; CHECK-NEXT:    bra foo2 ; TAILCALL
 ; CHECK-NEXT:  .LBB6_2: ; %bb
-; CHECK-NEXT:    bra foo1@PLT ; TAILCALL
+; CHECK-NEXT:    bra foo1 ; TAILCALL
 ; CHECK-NEXT:  .LBB6_9: ; %bb4
-; CHECK-NEXT:    bra foo4@PLT ; TAILCALL
+; CHECK-NEXT:    bra foo4 ; TAILCALL
 ; CHECK-NEXT:  .LBB6_8: ; %bb3
-; CHECK-NEXT:    bra foo3@PLT ; TAILCALL
+; CHECK-NEXT:    bra foo3 ; TAILCALL
 entry:
     switch i32 %n.u, label %bb12 [i32 1, label %bb i32 2, label %bb6 i32 4, label %bb7 i32 5, label %bb8 i32 6, label %bb10 i32 7, label %bb1 i32 8, label %bb3 i32 9, label %bb4 i32 10, label %bb9 i32 11, label %bb2 i32 12, label %bb5 i32 13, label %bb11 ]
 bb:
diff --git a/llvm/test/CodeGen/M68k/CodeModel/small-static.ll b/llvm/test/CodeGen/M68k/CodeModel/small-static.ll
index d7fa5b0ea4e5..1b946c19b250 100644
--- a/llvm/test/CodeGen/M68k/CodeModel/small-static.ll
+++ b/llvm/test/CodeGen/M68k/CodeModel/small-static.ll
@@ -46,7 +46,7 @@ define void @test2() nounwind {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    suba.l #4, %sp
 ; CHECK-NEXT:    move.l #40, (%sp)
-; CHECK-NEXT:    jsr malloc@PLT
+; CHECK-NEXT:    jsr malloc
 ; CHECK-NEXT:    adda.l #4, %sp
 ; CHECK-NEXT:    rts
 entry:
@@ -61,7 +61,7 @@ define void @test3() nounwind {
 ; CHECK-LABEL: test3:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    suba.l #4, %sp
-; CHECK-NEXT:    jsr afoo@PLT
+; CHECK-NEXT:    jsr afoo
 ; CHECK-NEXT:    move.l %a0, (pfoo,%pc)
 ; CHECK-NEXT:    jsr (%a0)
 ; CHECK-NEXT:    adda.l #4, %sp
@@ -80,7 +80,7 @@ define void @test4() nounwind {
 ; CHECK-LABEL: test4:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    suba.l #4, %sp
-; CHECK-NEXT:    jsr foo@PLT
+; CHECK-NEXT:    jsr foo
 ; CHECK-NEXT:    adda.l #4, %sp
 ; CHECK-NEXT:    rts
 entry:
@@ -120,17 +120,17 @@ define void @test7(i32 %n.u) nounwind {
 ; CHECK-NEXT:    move.l (0,%a0,%d0), %a0
 ; CHECK-NEXT:    jmp (%a0)
 ; CHECK-NEXT:  .LBB6_12: ; %bb2
-; CHECK-NEXT:    bra foo6@PLT ; TAILCALL
+; CHECK-NEXT:    bra foo6 ; TAILCALL
 ; CHECK-NEXT:  .LBB6_3: ; %bb5
-; CHECK-NEXT:    bra foo5@PLT ; TAILCALL
+; CHECK-NEXT:    bra foo5 ; TAILCALL
 ; CHECK-NEXT:  .LBB6_5: ; %bb1
-; CHECK-NEXT:    bra foo2@PLT ; TAILCALL
+; CHECK-NEXT:    bra foo2 ; TAILCALL
 ; CHECK-NEXT:  .LBB6_2: ; %bb
-; CHECK-NEXT:    bra foo1@PLT ; TAILCALL
+; CHECK-NEXT:    bra foo1 ; TAILCALL
 ; CHECK-NEXT:  .LBB6_9: ; %bb4
-; CHECK-NEXT:    bra foo4@PLT ; TAILCALL
+; CHECK-NEXT:    bra foo4 ; TAILCALL
 ; CHECK-NEXT:  .LBB6_8: ; %bb3
-; CHECK-NEXT:    bra foo3@PLT ; TAILCALL
+; CHECK-NEXT:    bra foo3 ; TAILCALL
 entry:
     switch i32 %n.u, label %bb12 [i32 1, label %bb i32 2, label %bb6 i32 4, label %bb7 i32 5, label %bb8 i32 6, label %bb10 i32 7, label %bb1 i32 8, label %bb3 i32 9, label %bb4 i32 10, label %bb9 i32 11, label %bb2 i32 12, label %bb5 i32 13, label %bb11 ]
 bb:
diff --git a/llvm/test/CodeGen/M68k/TLS/tlsie.ll b/llvm/test/CodeGen/M68k/TLS/tlsie.ll
index 8e402ce67962..2060574249c4 100644
--- a/llvm/test/CodeGen/M68k/TLS/tlsie.ll
+++ b/llvm/test/CodeGen/M68k/TLS/tlsie.ll
@@ -7,7 +7,7 @@ define dso_local ptr @get_addr() nounwind {
 ; CHECK-LABEL: get_addr:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    suba.l #4, %sp
-; CHECK-NEXT:    jsr __m68k_read_tp@PLT
+; CHECK-NEXT:    jsr __m68k_read_tp
 ; CHECK-NEXT:    move.l %a0, %d0
 ; CHECK-NEXT:    lea (_GLOBAL_OFFSET_TABLE_@GOTPCREL,%pc), %a0
 ; CHECK-NEXT:    add.l (myvar@GOTTPOFF,%a0), %d0
diff --git a/llvm/test/CodeGen/M68k/TLS/tlsle.ll b/llvm/test/CodeGen/M68k/TLS/tlsle.ll
index a08898fb33eb..7a5f0ab57b37 100644
--- a/llvm/test/CodeGen/M68k/TLS/tlsle.ll
+++ b/llvm/test/CodeGen/M68k/TLS/tlsle.ll
@@ -7,7 +7,7 @@ define dso_local ptr @get_addr() nounwind {
 ; CHECK-LABEL: get_addr:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    suba.l #4, %sp
-; CHECK-NEXT:    jsr __m68k_read_tp@PLT
+; CHECK-NEXT:    jsr __m68k_read_tp
 ; CHECK-NEXT:    lea (myvar@TPOFF,%a0), %a0
 ; CHECK-NEXT:    adda.l #4, %sp
 ; CHECK-NEXT:    rts
diff --git a/llvm/test/CodeGen/M68k/gcc_except_table.ll b/llvm/test/CodeGen/M68k/gcc_except_table.ll
index fe0ed7861dfe..f1c2e126d98f 100644
--- a/llvm/test/CodeGen/M68k/gcc_except_table.ll
+++ b/llvm/test/CodeGen/M68k/gcc_except_table.ll
@@ -16,7 +16,7 @@ define i32 @foo() uwtable ssp personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:    suba.l #4, %sp
 ; CHECK-NEXT:    .cfi_def_cfa_offset -8
 ; CHECK-NEXT:  .Ltmp0:
-; CHECK-NEXT:    jsr _Z1fv@PLT
+; CHECK-NEXT:    jsr _Z1fv
 ; CHECK-NEXT:  .Ltmp1:
 ; CHECK-NEXT:  ; %bb.1: ; %try.cont
 ; CHECK-NEXT:    moveq #0, %d0
@@ -25,7 +25,7 @@ define i32 @foo() uwtable ssp personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:  .LBB0_2: ; %lpad
 ; CHECK-NEXT:  .Ltmp2:
 ; CHECK-NEXT:    move.l %d0, (%sp)
-; CHECK-NEXT:    jsr _Unwind_Resume@PLT
+; CHECK-NEXT:    jsr _Unwind_Resume
 entry:
   invoke void @_Z1fv() optsize
           to label %try.cont unwind label %lpad
diff --git a/llvm/test/CodeGen/MIR/X86/dbg-value-list.mir b/llvm/test/CodeGen/MIR/X86/dbg-value-list.mir
index c419638be669..d2886d0fff31 100644
--- a/llvm/test/CodeGen/MIR/X86/dbg-value-list.mir
+++ b/llvm/test/CodeGen/MIR/X86/dbg-value-list.mir
@@ -1,64 +1,64 @@
-# RUN: llc -march=x86-64 -run-pass machineverifier -o - %s | FileCheck %s
-# Simple round-trip test for DBG_VALUE_LIST.
-# CHECK: [[VAR_C:![0-9]+]] = !DILocalVariable(name: "c"
-# CHECK: DBG_VALUE_LIST [[VAR_C]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value), $edi, $esi, debug-location
---- |
-  ; ModuleID = 'test.cpp'
-  source_filename = "test.cpp"
-  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-  target triple = "x86_64-unknown-linux-gnu"
-  
-  ; Function Attrs: norecurse nounwind readnone uwtable
-  define dso_local i32 @_Z3fooii(i32 %a, i32 %b) local_unnamed_addr !dbg !7 {
-  entry:
-    call void @llvm.dbg.value(metadata i32 %a, metadata !12, metadata !DIExpression()), !dbg !15
-    call void @llvm.dbg.value(metadata i32 %b, metadata !13, metadata !DIExpression()), !dbg !15
-    call void @llvm.dbg.value(metadata i32 undef, metadata !14, metadata !DIExpression()), !dbg !15
-    %mul = mul nsw i32 %b, %a, !dbg !16
-    ret i32 %mul, !dbg !17
-  }
-  
-  ; Function Attrs: nounwind readnone speculatable willreturn
-  declare void @llvm.dbg.value(metadata, metadata, metadata)
-  
-  !llvm.dbg.cu = !{!0}
-  !llvm.module.flags = !{!3, !4, !5}
-  !llvm.ident = !{!6}
-  
-  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
-  !1 = !DIFile(filename: "test.cpp", directory: "/")
-  !2 = !{}
-  !3 = !{i32 7, !"Dwarf Version", i32 4}
-  !4 = !{i32 2, !"Debug Info Version", i32 3}
-  !5 = !{i32 1, !"wchar_size", i32 4}
-  !6 = !{!"clang version 11.0.0"}
-  !7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooii", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
-  !8 = !DISubroutineType(types: !9)
-  !9 = !{!10, !10, !10}
-  !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-  !11 = !{!12, !13, !14}
-  !12 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 2, type: !10)
-  !13 = !DILocalVariable(name: "b", arg: 2, scope: !7, file: !1, line: 2, type: !10)
-  !14 = !DILocalVariable(name: "c", scope: !7, file: !1, line: 3, type: !10)
-  !15 = !DILocation(line: 0, scope: !7)
-  !16 = !DILocation(line: 4, column: 12, scope: !7)
-  !17 = !DILocation(line: 4, column: 3, scope: !7)
-
-...
----
-name:            _Z3fooii
-body:             |
-  bb.0.entry:
-    liveins: $edi, $esi
-  
-    DBG_VALUE $edi, $noreg, !12, !DIExpression(), debug-location !15
-    DBG_VALUE $esi, $noreg, !13, !DIExpression(), debug-location !15
-    $eax = MOV32rr $edi
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value), $edi, $esi, debug-location !15
-    DBG_VALUE $esi, $noreg, !13, !DIExpression(), debug-location !15
-    DBG_VALUE $eax, $noreg, !12, !DIExpression(), debug-location !15
-    renamable $eax = nsw IMUL32rr killed renamable $eax, killed renamable $esi, implicit-def dead $eflags, debug-location !16
-    RET64 $eax, debug-location !17
-
-...
-
+# RUN: llc -march=x86-64 -run-pass machineverifier -o - %s | FileCheck %s
+# Simple round-trip test for DBG_VALUE_LIST.
+# CHECK: [[VAR_C:![0-9]+]] = !DILocalVariable(name: "c"
+# CHECK: DBG_VALUE_LIST [[VAR_C]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value), $edi, $esi, debug-location
+--- |
+  ; ModuleID = 'test.cpp'
+  source_filename = "test.cpp"
+  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-unknown-linux-gnu"
+
+  ; Function Attrs: norecurse nounwind readnone uwtable
+  define dso_local i32 @_Z3fooii(i32 %a, i32 %b) local_unnamed_addr !dbg !7 {
+  entry:
+    call void @llvm.dbg.value(metadata i32 %a, metadata !12, metadata !DIExpression()), !dbg !15
+    call void @llvm.dbg.value(metadata i32 %b, metadata !13, metadata !DIExpression()), !dbg !15
+    call void @llvm.dbg.value(metadata i32 undef, metadata !14, metadata !DIExpression()), !dbg !15
+    %mul = mul nsw i32 %b, %a, !dbg !16
+    ret i32 %mul, !dbg !17
+  }
+
+  ; Function Attrs: nounwind readnone speculatable willreturn
+  declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4, !5}
+  !llvm.ident = !{!6}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+  !1 = !DIFile(filename: "test.cpp", directory: "/")
+  !2 = !{}
+  !3 = !{i32 7, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = !{i32 1, !"wchar_size", i32 4}
+  !6 = !{!"clang version 11.0.0"}
+  !7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooii", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
+  !8 = !DISubroutineType(types: !9)
+  !9 = !{!10, !10, !10}
+  !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !11 = !{!12, !13, !14}
+  !12 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 2, type: !10)
+  !13 = !DILocalVariable(name: "b", arg: 2, scope: !7, file: !1, line: 2, type: !10)
+  !14 = !DILocalVariable(name: "c", scope: !7, file: !1, line: 3, type: !10)
+  !15 = !DILocation(line: 0, scope: !7)
+  !16 = !DILocation(line: 4, column: 12, scope: !7)
+  !17 = !DILocation(line: 4, column: 3, scope: !7)
+
+...
+---
+name:            _Z3fooii
+body:             |
+  bb.0.entry:
+    liveins: $edi, $esi
+
+    DBG_VALUE $edi, $noreg, !12, !DIExpression(), debug-location !15
+    DBG_VALUE $esi, $noreg, !13, !DIExpression(), debug-location !15
+    $eax = MOV32rr $edi
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value), $edi, $esi, debug-location !15
+    DBG_VALUE $esi, $noreg, !13, !DIExpression(), debug-location !15
+    DBG_VALUE $eax, $noreg, !12, !DIExpression(), debug-location !15
+    renamable $eax = nsw IMUL32rr killed renamable $eax, killed renamable $esi, implicit-def dead $eflags, debug-location !16
+    RET64 $eax, debug-location !17
+
+...
+
diff --git a/llvm/test/CodeGen/MIR/X86/machine-verifier.mir b/llvm/test/CodeGen/MIR/X86/machine-verifier.mir
index 5cf5e8f0adc9..6966b3e6778e 100644
--- a/llvm/test/CodeGen/MIR/X86/machine-verifier.mir
+++ b/llvm/test/CodeGen/MIR/X86/machine-verifier.mir
@@ -1,5 +1,5 @@
 # RUN: not --crash llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
-# This test ensures that the MIR parser runs the machine verifier after parsing.
+# This test ensures that the VerifyInstrumentation works for machine function.
 
 --- |
 
diff --git a/llvm/test/CodeGen/Mips/ehframe-indirect.ll b/llvm/test/CodeGen/Mips/ehframe-indirect.ll
index e36fa2f9ce42..1cd2b86a8e15 100644
--- a/llvm/test/CodeGen/Mips/ehframe-indirect.ll
+++ b/llvm/test/CodeGen/Mips/ehframe-indirect.ll
@@ -17,9 +17,9 @@ define i32 @main() personality ptr @__gxx_personality_v0 {
 ; ALL: .cfi_startproc
 
 ; Linux must rely on the assembler/linker converting the encodings.
-; LINUX: .cfi_personality 128, DW.ref.__gxx_personality_v0
-; LINUX-O32: .cfi_lsda 0, $exception0
-; LINUX-NEW: .cfi_lsda 0, .Lexception0
+; LINUX: .cfi_personality 155, DW.ref.__gxx_personality_v0
+; LINUX-O32: .cfi_lsda 27, $exception0
+; LINUX-NEW: .cfi_lsda 27, .Lexception0
 
 ; FreeBSD can (and must) be more direct about the encodings it wants.
 ; FREEBSD: .cfi_personality 155, DW.ref.__gxx_personality_v0
diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast.ll b/llvm/test/CodeGen/NVPTX/addrspacecast.ll
index b680490ac5b1..85752bb95eb3 100644
--- a/llvm/test/CodeGen/NVPTX/addrspacecast.ll
+++ b/llvm/test/CodeGen/NVPTX/addrspacecast.ll
@@ -98,3 +98,95 @@ define i32 @conv8(ptr %ptr) {
   %val = load i32, ptr addrspace(5) %specptr
   ret i32 %val
 }
+
+; Check that we support addrspacecast when splitting the vector
+; result (<2 x ptr> => 2 x <1 x ptr>).
+; This also checks that scalarization works for addrspacecast
+; (when going from <1 x ptr> to ptr.)
+; ALL-LABEL: split1To0
+define void @split1To0(ptr nocapture noundef readonly %xs) {
+; CLS32: cvta.global.u32
+; CLS32: cvta.global.u32
+; CLS64: cvta.global.u64
+; CLS64: cvta.global.u64
+; ALL: st.u32
+; ALL: st.u32
+  %vec_addr = load <2 x ptr addrspace(1)>, ptr %xs, align 16
+  %addrspacecast = addrspacecast <2 x ptr addrspace(1)> %vec_addr to <2 x ptr>
+  %extractelement0 = extractelement <2 x ptr> %addrspacecast, i64 0
+  store float 0.5, ptr %extractelement0, align 4
+  %extractelement1 = extractelement <2 x ptr> %addrspacecast, i64 1
+  store float 1.0, ptr %extractelement1, align 4
+  ret void
+}
+
+; Same as split1To0 but from 0 to 1, to make sure the addrspacecast preserve
+; the source and destination addrspaces properly.
+; ALL-LABEL: split0To1
+define void @split0To1(ptr nocapture noundef readonly %xs) {
+; CLS32: cvta.to.global.u32
+; CLS32: cvta.to.global.u32
+; CLS64: cvta.to.global.u64
+; CLS64: cvta.to.global.u64
+; ALL: st.global.u32
+; ALL: st.global.u32
+  %vec_addr = load <2 x ptr>, ptr %xs, align 16
+  %addrspacecast = addrspacecast <2 x ptr> %vec_addr to <2 x ptr addrspace(1)>
+  %extractelement0 = extractelement <2 x ptr addrspace(1)> %addrspacecast, i64 0
+  store float 0.5, ptr addrspace(1) %extractelement0, align 4
+  %extractelement1 = extractelement <2 x ptr addrspace(1)> %addrspacecast, i64 1
+  store float 1.0, ptr addrspace(1) %extractelement1, align 4
+  ret void
+}
+
+; Check that we support addrspacecast when a widening is required
+; (3 x ptr => 4 x ptr).
+; ALL-LABEL: widen1To0
+define void @widen1To0(ptr nocapture noundef readonly %xs) {
+; CLS32: cvta.global.u32
+; CLS32: cvta.global.u32
+; CLS32: cvta.global.u32
+
+; CLS64: cvta.global.u64
+; CLS64: cvta.global.u64
+; CLS64: cvta.global.u64
+
+; ALL: st.u32
+; ALL: st.u32
+; ALL: st.u32
+  %vec_addr = load <3 x ptr addrspace(1)>, ptr %xs, align 16
+  %addrspacecast = addrspacecast <3 x ptr addrspace(1)> %vec_addr to <3 x ptr>
+  %extractelement0 = extractelement <3 x ptr> %addrspacecast, i64 0
+  store float 0.5, ptr %extractelement0, align 4
+  %extractelement1 = extractelement <3 x ptr> %addrspacecast, i64 1
+  store float 1.0, ptr %extractelement1, align 4
+  %extractelement2 = extractelement <3 x ptr> %addrspacecast, i64 2
+  store float 1.5, ptr %extractelement2, align 4
+  ret void
+}
+
+; Same as widen1To0 but from 0 to 1, to make sure the addrspacecast preserve
+; the source and destination addrspaces properly.
+; ALL-LABEL: widen0To1
+define void @widen0To1(ptr nocapture noundef readonly %xs) {
+; CLS32: cvta.to.global.u32
+; CLS32: cvta.to.global.u32
+; CLS32: cvta.to.global.u32
+
+; CLS64: cvta.to.global.u64
+; CLS64: cvta.to.global.u64
+; CLS64: cvta.to.global.u64
+
+; ALL: st.global.u32
+; ALL: st.global.u32
+; ALL: st.global.u32
+  %vec_addr = load <3 x ptr>, ptr %xs, align 16
+  %addrspacecast = addrspacecast <3 x ptr> %vec_addr to <3 x ptr addrspace(1)>
+  %extractelement0 = extractelement <3 x ptr addrspace(1)> %addrspacecast, i64 0
+  store float 0.5, ptr addrspace(1) %extractelement0, align 4
+  %extractelement1 = extractelement <3 x ptr addrspace(1)> %addrspacecast, i64 1
+  store float 1.0, ptr addrspace(1) %extractelement1, align 4
+  %extractelement2 = extractelement <3 x ptr addrspace(1)> %addrspacecast, i64 2
+  store float 1.5, ptr addrspace(1) %extractelement2, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
new file mode 100644
index 000000000000..9301ea44c693
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -march=nvptx -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK64
+; RUN: llc < %s -march=nvptx -mcpu=sm_86 -mattr=+ptx71 | FileCheck %s --check-prefixes=CHECKPTX71
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_86 -mattr=+ptx71 | %ptxas-verify -arch=sm_86 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat %val) {
+; CHECK-LABEL: test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECK-NEXT:    atom.add.noftz.bf16 %rs2, [%r1], %rs1;
+; CHECK-NEXT:    ld.param.u32 %r2, [test_param_1];
+; CHECK-NEXT:    mov.b16 %rs3, 0x3F80;
+; CHECK-NEXT:    atom.add.noftz.bf16 %rs4, [%r1], %rs3;
+; CHECK-NEXT:    ld.param.u32 %r3, [test_param_2];
+; CHECK-NEXT:    atom.global.add.noftz.bf16 %rs5, [%r2], %rs1;
+; CHECK-NEXT:    atom.shared.add.noftz.bf16 %rs6, [%r3], %rs1;
+; CHECK-NEXT:    ret;
+;
+; CHECK64-LABEL: test(
+; CHECK64:       {
+; CHECK64-NEXT:    .reg .b16 %rs<7>;
+; CHECK64-NEXT:    .reg .b64 %rd<4>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT:  // %bb.0:
+; CHECK64-NEXT:    ld.param.u64 %rd1, [test_param_0];
+; CHECK64-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECK64-NEXT:    atom.add.noftz.bf16 %rs2, [%rd1], %rs1;
+; CHECK64-NEXT:    ld.param.u64 %rd2, [test_param_1];
+; CHECK64-NEXT:    mov.b16 %rs3, 0x3F80;
+; CHECK64-NEXT:    atom.add.noftz.bf16 %rs4, [%rd1], %rs3;
+; CHECK64-NEXT:    ld.param.u64 %rd3, [test_param_2];
+; CHECK64-NEXT:    atom.global.add.noftz.bf16 %rs5, [%rd2], %rs1;
+; CHECK64-NEXT:    atom.shared.add.noftz.bf16 %rs6, [%rd3], %rs1;
+; CHECK64-NEXT:    ret;
+;
+; CHECKPTX71-LABEL: test(
+; CHECKPTX71:       {
+; CHECKPTX71-NEXT:    .reg .pred %p<5>;
+; CHECKPTX71-NEXT:    .reg .b16 %rs<18>;
+; CHECKPTX71-NEXT:    .reg .b32 %r<58>;
+; CHECKPTX71-NEXT:    .reg .f32 %f<12>;
+; CHECKPTX71-EMPTY:
+; CHECKPTX71-NEXT:  // %bb.0:
+; CHECKPTX71-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECKPTX71-NEXT:    ld.param.u32 %r23, [test_param_2];
+; CHECKPTX71-NEXT:    ld.param.u32 %r22, [test_param_1];
+; CHECKPTX71-NEXT:    ld.param.u32 %r24, [test_param_0];
+; CHECKPTX71-NEXT:    and.b32 %r1, %r24, -4;
+; CHECKPTX71-NEXT:    and.b32 %r25, %r24, 3;
+; CHECKPTX71-NEXT:    shl.b32 %r2, %r25, 3;
+; CHECKPTX71-NEXT:    mov.b32 %r26, 65535;
+; CHECKPTX71-NEXT:    shl.b32 %r27, %r26, %r2;
+; CHECKPTX71-NEXT:    not.b32 %r3, %r27;
+; CHECKPTX71-NEXT:    ld.u32 %r54, [%r1];
+; CHECKPTX71-NEXT:    cvt.f32.bf16 %f2, %rs1;
+; CHECKPTX71-NEXT:  $L__BB0_1: // %atomicrmw.start
+; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX71-NEXT:    shr.u32 %r28, %r54, %r2;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs2, %r28;
+; CHECKPTX71-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; CHECKPTX71-NEXT:    add.rn.f32 %f3, %f1, %f2;
+; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs4, %f3;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r29, %rs4;
+; CHECKPTX71-NEXT:    shl.b32 %r30, %r29, %r2;
+; CHECKPTX71-NEXT:    and.b32 %r31, %r54, %r3;
+; CHECKPTX71-NEXT:    or.b32 %r32, %r31, %r30;
+; CHECKPTX71-NEXT:    atom.cas.b32 %r6, [%r1], %r54, %r32;
+; CHECKPTX71-NEXT:    setp.ne.s32 %p1, %r6, %r54;
+; CHECKPTX71-NEXT:    mov.u32 %r54, %r6;
+; CHECKPTX71-NEXT:    @%p1 bra $L__BB0_1;
+; CHECKPTX71-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECKPTX71-NEXT:    ld.u32 %r55, [%r1];
+; CHECKPTX71-NEXT:  $L__BB0_3: // %atomicrmw.start9
+; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX71-NEXT:    shr.u32 %r33, %r55, %r2;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs6, %r33;
+; CHECKPTX71-NEXT:    cvt.f32.bf16 %f4, %rs6;
+; CHECKPTX71-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs8, %f5;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r34, %rs8;
+; CHECKPTX71-NEXT:    shl.b32 %r35, %r34, %r2;
+; CHECKPTX71-NEXT:    and.b32 %r36, %r55, %r3;
+; CHECKPTX71-NEXT:    or.b32 %r37, %r36, %r35;
+; CHECKPTX71-NEXT:    atom.cas.b32 %r9, [%r1], %r55, %r37;
+; CHECKPTX71-NEXT:    setp.ne.s32 %p2, %r9, %r55;
+; CHECKPTX71-NEXT:    mov.u32 %r55, %r9;
+; CHECKPTX71-NEXT:    @%p2 bra $L__BB0_3;
+; CHECKPTX71-NEXT:  // %bb.4: // %atomicrmw.end8
+; CHECKPTX71-NEXT:    and.b32 %r10, %r22, -4;
+; CHECKPTX71-NEXT:    shl.b32 %r38, %r22, 3;
+; CHECKPTX71-NEXT:    and.b32 %r11, %r38, 24;
+; CHECKPTX71-NEXT:    shl.b32 %r40, %r26, %r11;
+; CHECKPTX71-NEXT:    not.b32 %r12, %r40;
+; CHECKPTX71-NEXT:    ld.global.u32 %r56, [%r10];
+; CHECKPTX71-NEXT:  $L__BB0_5: // %atomicrmw.start27
+; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX71-NEXT:    shr.u32 %r41, %r56, %r11;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs10, %r41;
+; CHECKPTX71-NEXT:    cvt.f32.bf16 %f6, %rs10;
+; CHECKPTX71-NEXT:    add.rn.f32 %f8, %f6, %f2;
+; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs12, %f8;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r42, %rs12;
+; CHECKPTX71-NEXT:    shl.b32 %r43, %r42, %r11;
+; CHECKPTX71-NEXT:    and.b32 %r44, %r56, %r12;
+; CHECKPTX71-NEXT:    or.b32 %r45, %r44, %r43;
+; CHECKPTX71-NEXT:    atom.global.cas.b32 %r15, [%r10], %r56, %r45;
+; CHECKPTX71-NEXT:    setp.ne.s32 %p3, %r15, %r56;
+; CHECKPTX71-NEXT:    mov.u32 %r56, %r15;
+; CHECKPTX71-NEXT:    @%p3 bra $L__BB0_5;
+; CHECKPTX71-NEXT:  // %bb.6: // %atomicrmw.end26
+; CHECKPTX71-NEXT:    and.b32 %r16, %r23, -4;
+; CHECKPTX71-NEXT:    shl.b32 %r46, %r23, 3;
+; CHECKPTX71-NEXT:    and.b32 %r17, %r46, 24;
+; CHECKPTX71-NEXT:    shl.b32 %r48, %r26, %r17;
+; CHECKPTX71-NEXT:    not.b32 %r18, %r48;
+; CHECKPTX71-NEXT:    ld.shared.u32 %r57, [%r16];
+; CHECKPTX71-NEXT:  $L__BB0_7: // %atomicrmw.start45
+; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX71-NEXT:    shr.u32 %r49, %r57, %r17;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs14, %r49;
+; CHECKPTX71-NEXT:    cvt.f32.bf16 %f9, %rs14;
+; CHECKPTX71-NEXT:    add.rn.f32 %f11, %f9, %f2;
+; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs16, %f11;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r50, %rs16;
+; CHECKPTX71-NEXT:    shl.b32 %r51, %r50, %r17;
+; CHECKPTX71-NEXT:    and.b32 %r52, %r57, %r18;
+; CHECKPTX71-NEXT:    or.b32 %r53, %r52, %r51;
+; CHECKPTX71-NEXT:    atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
+; CHECKPTX71-NEXT:    setp.ne.s32 %p4, %r21, %r57;
+; CHECKPTX71-NEXT:    mov.u32 %r57, %r21;
+; CHECKPTX71-NEXT:    @%p4 bra $L__BB0_7;
+; CHECKPTX71-NEXT:  // %bb.8: // %atomicrmw.end44
+; CHECKPTX71-NEXT:    ret;
+  %r1 = atomicrmw fadd ptr %dp0, bfloat %val seq_cst
+  %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 seq_cst
+  %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val seq_cst
+  %r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val seq_cst
+  ret void
+}
+
+attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll
index dde8555c35af..f61205eb88fc 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll
@@ -1,42 +1,42 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 --enable-unsafe-fp-math | FileCheck --check-prefixes=CHECK %s
-; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 --enable-unsafe-fp-math | %ptxas-verify -arch=sm_80 %}
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-
-declare <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) #0
-
-; CHECK-LABEL: test_sin(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_sin_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f32.bf16     [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.bf16     [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  sin.approx.f32  [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG:  sin.approx.f32  [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
-define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 {
-  %r = call <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a)
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_cos(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_cos_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f32.bf16     [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.bf16     [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  cos.approx.f32  [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG:  cos.approx.f32  [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
-define <2 x bfloat> @test_cos(<2 x bfloat> %a) #0 #1 {
-  %r = call <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a)
-  ret <2 x bfloat> %r
-}
-
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 --enable-unsafe-fp-math | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 --enable-unsafe-fp-math | %ptxas-verify -arch=sm_80 %}
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+declare <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) #0
+
+; CHECK-LABEL: test_sin(
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_sin_param_0];
+; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  cvt.f32.bf16     [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16     [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  sin.approx.f32  [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG:  sin.approx.f32  [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 {
+  %r = call <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a)
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_cos(
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_cos_param_0];
+; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  cvt.f32.bf16     [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16     [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cos.approx.f32  [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG:  cos.approx.f32  [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x bfloat> @test_cos(<2 x bfloat> %a) #0 #1 {
+  %r = call <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a)
+  ret <2 x bfloat> %r
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index 9bde89cdf044..7030e5435f72 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -1,532 +1,532 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s
-; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %}
-; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-
-; CHECK-LABEL: test_ret_const(
-; CHECK:     mov.b32         [[T:%r[0-9+]]], 1073758080;
-; CHECK:     st.param.b32    [func_retval0+0], [[T]];
-; CHECK-NEXT: ret;
-
-define <2 x bfloat> @test_ret_const() #0 {
-  ret <2 x bfloat> <bfloat 1.0, bfloat 2.0>
-}
-
-; Check that we can lower fadd with immediate arguments.
-; CHECK-LABEL: test_fadd_imm_0(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fadd_imm_0_param_0];
-;
-; SM90-DAG:        mov.b32        [[I:%r[0-9+]]], 1073758080;
-; SM90-DAG:        add.rn.bf16x2   [[R:%r[0-9]+]], [[A]], [[I]];
-;
-; SM80-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; SM80-DAG:  cvt.f32.bf16    [[FA0:%f[0-9]+]], [[A0]]
-; SM80-DAG:  cvt.f32.bf16    [[FA1:%f[0-9]+]], [[A1]]
-; SM80-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
-; SM80-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
-; SM80-DAG:  cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]]
-; SM80-DAG:  cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]]
-; SM80-DAG:  mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-
-define <2 x bfloat> @test_fadd_imm_0(<2 x bfloat> %a) #0 {
-  %r = fadd <2 x bfloat> <bfloat 1.0, bfloat 2.0>, %a
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_fadd_imm_1(
-; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fadd_imm_1_param_0];
-; SM90:       mov.b16         [[B:%rs[0-9]+]], 0x3F80;
-; SM90:       add.rn.bf16     [[R:%rs[0-9]+]], [[A]], [[B]];
-
-; SM80-DAG:   cvt.f32.bf16    [[FA:%f[0-9]+]], [[A]];
-; SM80:       add.rn.f32      [[FR:%f[0-9]+]], [[FA]], 0f3F800000;
-; SM80:       cvt.rn.bf16.f32 [[R:%rs[0-9]+]], [[FR]];
-
-; CHECK:      st.param.b16    [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-
-define bfloat @test_fadd_imm_1(bfloat %a) #0 {
-  %r = fadd bfloat %a, 1.0
-  ret bfloat %r
-}
-
-; CHECK-LABEL: test_fsubx2(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fsubx2_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fsubx2_param_1];
-; SM90:       sub.rn.bf16x2   [[R:%r[0-9]+]], [[A]], [[B]];
-
-; SM80-DAG:   mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; SM80-DAG:   mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]];
-; SM80-DAG:   cvt.f32.bf16    [[FA1:%f[0-9]+]], [[A1]];
-; SM80-DAG:   cvt.f32.bf16    [[FA0:%f[0-9]+]], [[A0]];
-; SM80-DAG:   cvt.f32.bf16    [[FB0:%f[0-9]+]], [[B0]];
-; SM80-DAG:   cvt.f32.bf16    [[FB1:%f[0-9]+]], [[B1]];
-; SM80-DAG:   sub.rn.f32      [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; SM80-DAG:   sub.rn.f32      [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; SM80-DAG:   cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]];
-; SM80-DAG:   cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]];
-; SM80:       mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]};
-
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
-
-define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
-  %r = fsub <2 x bfloat> %a, %b
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_fmulx2(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fmulx2_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fmulx2_param_1];
-; SM90:       mul.rn.bf16x2   [[R:%r[0-9]+]], [[A]], [[B]];
-
-; SM80-DAG:   mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; SM80-DAG:   mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]];
-; SM80-DAG:   cvt.f32.bf16    [[FA1:%f[0-9]+]], [[A1]];
-; SM80-DAG:   cvt.f32.bf16    [[FA0:%f[0-9]+]], [[A0]];
-; SM80-DAG:   cvt.f32.bf16    [[FB0:%f[0-9]+]], [[B0]];
-; SM80-DAG:   cvt.f32.bf16    [[FB1:%f[0-9]+]], [[B1]];
-; SM80-DAG:   mul.rn.f32      [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; SM80-DAG:   mul.rn.f32      [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; SM80-DAG:   cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]];
-; SM80-DAG:   cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]];
-; SM80:       mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]};
-
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
-
-define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
-  %r = fmul <2 x bfloat> %a, %b
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_fdiv(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fdiv_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fdiv_param_1];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-DAG:  cvt.f32.bf16     [[FA0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.bf16     [[FA1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  cvt.f32.bf16     [[FB0:%f[0-9]+]], [[B0]];
-; CHECK-DAG:  cvt.f32.bf16     [[FB1:%f[0-9]+]], [[B1]];
-; CHECK-DAG:  div.rn.f32      [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-DAG:  div.rn.f32      [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[FR0]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[FR1]];
-; CHECK-NEXT: mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-
-define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
-  %r = fdiv <2 x bfloat> %a, %b
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_fneg(
-; CHECK-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_fneg_param_0];
-
-; CHECK-DAG:        xor.b32        [[IHH0:%r[0-9]+]], [[A]], -2147450880;
-; CHECK-NEXT: st.param.b32    [func_retval0+0], [[IHH0]];
-; CHECK-NEXT: ret;
-define <2 x bfloat> @test_fneg(<2 x bfloat> %a) #0 {
-  %r = fneg <2 x bfloat> %a
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: .func test_ldst_v2bf16(
-; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v2bf16_param_0];
-; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v2bf16_param_1];
-; CHECK-DAG:    ld.b32          [[E:%r[0-9]+]], [%[[A]]]
-; CHECK-DAG:    st.b32          [%[[B]]], [[E]];
-; CHECK:        ret;
-define void @test_ldst_v2bf16(ptr %a, ptr %b) {
-  %t1 = load <2 x bfloat>, ptr %a
-  store <2 x bfloat> %t1, ptr %b, align 16
-  ret void
-}
-
-; CHECK-LABEL: .func test_ldst_v3bf16(
-; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v3bf16_param_0];
-; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v3bf16_param_1];
-; -- v3 is inconvenient to capture as it's lowered as ld.b64 + fair
-;    number of bitshifting instructions that may change at llvm's whim.
-;    So we only verify that we only issue correct number of writes using
-;    correct offset, but not the values we write.
-; CHECK-DAG:    ld.u64
-; CHECK-DAG:    st.u32          [%[[B]]],
-; CHECK-DAG:    st.b16          [%[[B]]+4],
-; CHECK:        ret;
-define void @test_ldst_v3bf16(ptr %a, ptr %b) {
-  %t1 = load <3 x bfloat>, ptr %a
-  store <3 x bfloat> %t1, ptr %b, align 16
-  ret void
-}
-
-declare <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b) #0
-
-; CHECK-LABEL: test_call(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_call_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_call_param_1];
-; CHECK:      {
-; CHECK-DAG:  .param .align 4 .b8 param0[4];
-; CHECK-DAG:  .param .align 4 .b8 param1[4];
-; CHECK-DAG:  st.param.b32    [param0+0], [[A]];
-; CHECK-DAG:  st.param.b32    [param1+0], [[B]];
-; CHECK-DAG:  .param .align 4 .b8 retval0[4];
-; CHECK:      call.uni (retval0),
-; CHECK-NEXT:        test_callee,
-; CHECK:      );
-; CHECK-NEXT: ld.param.b32    [[R:%r[0-9]+]], [retval0+0];
-; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-
-define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
-  %r = call <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b)
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_select(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_param_1];
-; CHECK-DAG:  ld.param.u8     [[C:%rs[0-9]+]], [test_select_param_2]
-; CHECK-DAG:  setp.eq.b16     [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
-; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]];
-; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-
-define <2 x bfloat> @test_select(<2 x bfloat> %a, <2 x bfloat> %b, i1 zeroext %c) #0 {
-  %r = select i1 %c, <2 x bfloat> %a, <2 x bfloat> %b
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_select_cc(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_cc_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_cc_param_1];
-; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_select_cc_param_2];
-; CHECK-DAG:  ld.param.b32    [[D:%r[0-9]+]], [test_select_cc_param_3];
-;
-; SM90:  setp.neu.bf16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
-;
-; SM80-DAG: mov.b32        {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
-; SM80-DAG: mov.b32        {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]]
-; SM80-DAG: cvt.f32.bf16 [[DF0:%f[0-9]+]], [[D0]];
-; SM80-DAG: cvt.f32.bf16 [[CF0:%f[0-9]+]], [[C0]];
-; SM80-DAG: cvt.f32.bf16 [[DF1:%f[0-9]+]], [[D1]];
-; SM80-DAG: cvt.f32.bf16 [[CF1:%f[0-9]+]], [[C1]];
-; SM80-DAG: setp.neu.f32    [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
-; SM80-DAG: setp.neu.f32    [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
-;
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-DAG:  selp.b16        [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]];
-; CHECK-DAG:  selp.b16        [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-
-define <2 x bfloat> @test_select_cc(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c, <2 x bfloat> %d) #0 {
-  %cc = fcmp une <2 x bfloat> %c, %d
-  %r = select <2 x i1> %cc, <2 x bfloat> %a, <2 x bfloat> %b
-  ret <2 x bfloat> %r
-}
-
-
-; CHECK-LABEL: test_select_cc_f32_bf16(
-; CHECK-DAG:  ld.param.v2.f32    {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_bf16_param_0];
-; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_select_cc_f32_bf16_param_2];
-; CHECK-DAG:  ld.param.b32    [[D:%r[0-9]+]], [test_select_cc_f32_bf16_param_3];
-; SM90:  setp.neu.bf16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
-; CHECK-DAG:  ld.param.v2.f32    {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_bf16_param_1];
-
-; SM80-DAG: mov.b32         {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
-; SM80-DAG: mov.b32         {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]]
-; SM80-DAG: cvt.f32.bf16 [[DF0:%f[0-9]+]], [[D0]];
-; SM80-DAG: cvt.f32.bf16 [[CF0:%f[0-9]+]], [[C0]];
-; SM80-DAG: cvt.f32.bf16 [[DF1:%f[0-9]+]], [[D1]];
-; SM80-DAG: cvt.f32.bf16 [[CF1:%f[0-9]+]], [[C1]];
-; SM80-DAG: setp.neu.f32    [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
-; SM80-DAG: setp.neu.f32    [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
-;
-; CHECK-DAG: selp.f32        [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]];
-; CHECK-DAG: selp.f32        [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK-NEXT: st.param.v2.f32    [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b,
-                                           <2 x bfloat> %c, <2 x bfloat> %d) #0 {
-  %cc = fcmp une <2 x bfloat> %c, %d
-  %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
-  ret <2 x float> %r
-}
-
-; CHECK-LABEL: test_select_cc_bf16_f32(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_cc_bf16_f32_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_cc_bf16_f32_param_1];
-; CHECK-DAG:  ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_bf16_f32_param_2];
-; CHECK-DAG:  ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_bf16_f32_param_3];
-; CHECK-DAG:  setp.neu.f32    [[P0:%p[0-9]+]], [[C0]], [[D0]]
-; CHECK-DAG:  setp.neu.f32    [[P1:%p[0-9]+]], [[C1]], [[D1]]
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-DAG:  selp.b16        [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]];
-; CHECK-DAG:  selp.b16        [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b,
-                                          <2 x float> %c, <2 x float> %d) #0 {
-  %cc = fcmp une <2 x float> %c, %d
-  %r = select <2 x i1> %cc, <2 x bfloat> %a, <2 x bfloat> %b
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_fptrunc_2xfloat(
-; CHECK:      ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[A1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
-define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
-  %r = fptrunc <2 x float> %a to <2 x bfloat>
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_fpext_2xfloat(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fpext_2xfloat_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f32.bf16     [[R0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.bf16     [[R1:%f[0-9]+]], [[A1]];
-; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK:      ret;
-define <2 x float> @test_fpext_2xfloat(<2 x bfloat> %a) #0 {
-  %r = fpext <2 x bfloat> %a to <2 x float>
-  ret <2 x float> %r
-}
-
-; CHECK-LABEL: test_bitcast_2xbf16_to_2xi16(
-; CHECK:      ld.param.u32    [[A:%r[0-9]+]], [test_bitcast_2xbf16_to_2xi16_param_0];
-; CHECK:      st.param.b32 [func_retval0+0], [[A]]
-; CHECK:      ret;
-define <2 x i16> @test_bitcast_2xbf16_to_2xi16(<2 x bfloat> %a) #0 {
-  %r = bitcast <2 x bfloat> %a to <2 x i16>
-  ret <2 x i16> %r
-}
-
-
-; CHECK-LABEL: test_bitcast_2xi16_to_2xbf16(
-; CHECK:      ld.param.b32     [[R]], [test_bitcast_2xi16_to_2xbf16_param_0];
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
-define <2 x bfloat> @test_bitcast_2xi16_to_2xbf16(<2 x i16> %a) #0 {
-  %r = bitcast <2 x i16> %a to <2 x bfloat>
-  ret <2 x bfloat> %r
-}
-
-declare <2 x bfloat> @llvm.sqrt.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.powi.f16(<2 x bfloat> %a, <2 x i32> %b) #0
-declare <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.pow.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
-declare <2 x bfloat> @llvm.exp.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.exp2.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.log.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.log10.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.log2.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.fma.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0
-declare <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.minnum.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
-declare <2 x bfloat> @llvm.maxnum.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
-declare <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
-declare <2 x bfloat> @llvm.floor.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.ceil.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.trunc.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.rint.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.nearbyint.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a) #0
-declare <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0
-
-
-; CHECK-LABEL: test_sqrt(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_sqrt_param_0];
-; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  cvt.f32.bf16     [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG:  cvt.f32.bf16     [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG:  sqrt.rn.f32     [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG:  sqrt.rn.f32     [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
-define <2 x bfloat> @test_sqrt(<2 x bfloat> %a) #0 {
-  %r = call <2 x bfloat> @llvm.sqrt.f16(<2 x bfloat> %a)
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_fmuladd(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fmuladd_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fmuladd_param_1];
-; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_fmuladd_param_2];
-;
-; CHECK:       fma.rn.bf16x2   [[RA:%r[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-NEXT: st.param.b32    [func_retval0+0], [[RA]];
-; CHECK:      ret;
-define <2 x bfloat> @test_fmuladd(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
-  %r = call <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_fabs(
-; CHECK:      ld.param.u32    [[A:%r[0-9]+]], [test_fabs_param_0];
-; CHECK:      and.b32         [[R:%r[0-9]+]], [[A]], 2147450879;
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
-define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 {
-  %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %a)
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_fabs_add(
-; CHECK:      abs.bf16x2
-; CHECK:      ret;
-define <2 x bfloat> @test_fabs_add(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
-  %s = fadd <2 x bfloat> %a, %a
-  %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %s)
-  %d = fadd <2 x bfloat> %r, %b
-  ret <2 x bfloat> %d
-}
-
-
-; CHECK-LABEL: test_minnum(
-; CHECK-DAG:  ld.param.b32    [[AF0:%r[0-9]+]], [test_minnum_param_0];
-; CHECK-DAG:  ld.param.b32    [[BF0:%r[0-9]+]], [test_minnum_param_1];
-; CHECK-DAG:  min.bf16x2         [[RF0:%r[0-9]+]], [[AF0]], [[BF0]];
-; CHECK:      st.param.b32    [func_retval0+0], [[RF0]];
-; CHECK:      ret;
-define <2 x bfloat> @test_minnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
-  %r = call <2 x bfloat> @llvm.minnum.f16(<2 x bfloat> %a, <2 x bfloat> %b)
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_maxnum(
-; CHECK-DAG:  ld.param.b32    [[AF0:%r[0-9]+]], [test_maxnum_param_0];
-; CHECK-DAG:  ld.param.b32    [[BF0:%r[0-9]+]], [test_maxnum_param_1];
-; CHECK-DAG:  max.bf16x2         [[RF0:%r[0-9]+]], [[AF0]], [[BF0]];
-; CHECK:      st.param.b32    [func_retval0+0], [[RF0]];
-; CHECK:      ret;
-define <2 x bfloat> @test_maxnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
-  %r = call <2 x bfloat> @llvm.maxnum.f16(<2 x bfloat> %a, <2 x bfloat> %b)
-  ret <2 x bfloat> %r
-}
-
-
-
-; CHECK-LABEL: test_floor(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_floor_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; SM90:  cvt.rmi.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]];
-; SM90:  cvt.rmi.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]];
-; SM80-DAG:   cvt.f32.bf16     [[FA0:%f[0-9]+]], [[A0]];
-; SM80-DAG:   cvt.f32.bf16     [[FA1:%f[0-9]+]], [[A1]];
-; SM80-DAG:  cvt.rmi.f32.f32 [[RF0:%f[0-9]+]], [[FA0]];
-; SM80-DAG:  cvt.rmi.f32.f32 [[RF1:%f[0-9]+]], [[FA1]];
-; SM80-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; SM80-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
-define <2 x bfloat> @test_floor(<2 x bfloat> %a) #0 {
-  %r = call <2 x bfloat> @llvm.floor.f16(<2 x bfloat> %a)
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_ceil(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_ceil_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; SM90:  cvt.rpi.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]];
-; SM90:  cvt.rpi.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]];
-; SM80-DAG:   cvt.f32.bf16     [[FA0:%f[0-9]+]], [[A0]];
-; SM80-DAG:   cvt.f32.bf16     [[FA1:%f[0-9]+]], [[A1]];
-; SM80-DAG:   cvt.rpi.f32.f32 [[RF0:%f[0-9]+]], [[FA0]];
-; SM80-DAG:   cvt.rpi.f32.f32 [[RF1:%f[0-9]+]], [[FA1]];
-; SM80-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
-; SM80-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
-define <2 x bfloat> @test_ceil(<2 x bfloat> %a) #0 {
-  %r = call <2 x bfloat> @llvm.ceil.f16(<2 x bfloat> %a)
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_trunc(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_trunc_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; SM90:  cvt.rzi.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]];
-; SM90:  cvt.rzi.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
-define <2 x bfloat> @test_trunc(<2 x bfloat> %a) #0 {
-  %r = call <2 x bfloat> @llvm.trunc.f16(<2 x bfloat> %a)
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_rint(
-; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_rint_param_0];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
-; SM90:  cvt.rni.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]];
-; SM90:  cvt.rni.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]];
-; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
-define <2 x bfloat> @test_rint(<2 x bfloat> %a) #0 {
-  %r = call <2 x bfloat> @llvm.rint.f16(<2 x bfloat> %a)
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_round(
-; CHECK:      ld.param.b32    {{.*}}, [test_round_param_0];
-; check the use of sign mask and 0.5 to implement round
-; CHECK:      and.b32 [[R1:%r[0-9]+]], {{.*}}, -2147483648;
-; CHECK:      or.b32 {{.*}}, [[R1]], 1056964608;
-; CHECK:      and.b32 [[R2:%r[0-9]+]], {{.*}}, -2147483648;
-; CHECK:      or.b32 {{.*}}, [[R2]], 1056964608;
-; CHECK:      st.param.b32    [func_retval0+0], {{.*}};
-; CHECK:      ret;
-define <2 x bfloat> @test_round(<2 x bfloat> %a) #0 {
-  %r = call <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a)
-  ret <2 x bfloat> %r
-}
-
-; CHECK-LABEL: test_copysign(
-; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_copysign_param_0];
-; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_copysign_param_1];
-; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
-; CHECK-DAG:  abs.bf16        [[AW1:%rs[0-9]+]], [[A1]];
-; CHECK-DAG:  neg.bf16        [[AY1:%rs[0-9]+]], [[AW1]];
-; CHECK-DAG:  shr.u16         [[BS1:%rs[0-9]+]], [[B1]], 15;
-; CHECK-DAG:  and.b16         [[BR1:%rs[0-9]+]], [[BS1]], 1;
-; CHECK-DAG:  setp.eq.b16     [[P1:%p[0-9]+]], [[BR1]], 1;
-; CHECK-DAG:  selp.b16        [[RS1:%rs[0-9]+]], [[AY1]], [[AW1]], [[P1]]
-; CHECK-DAG:  abs.bf16        [[AW0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG:  neg.bf16        [[AY0:%rs[0-9]+]], [[AW0]];
-; CHECK-DAG:  shr.u16         [[BS0:%rs[0-9]+]], [[B0]], 15;
-; CHECK-DAG:  and.b16         [[BR0:%rs[0-9]+]], [[BS0]], 1;
-; CHECK-DAG:  setp.eq.b16     [[P0:%p[0-9]+]], [[BR0]], 1;
-; CHECK-DAG:  selp.b16        [[RS0:%rs[0-9]+]], [[AY0]], [[AW0]], [[P0]]
-; CHECK-DAG:  mov.b32         [[R:%r[0-9]+]], {[[RS0]], [[RS1]]}
-; CHECK:      st.param.b32    [func_retval0+0], [[R]];
-; CHECK:      ret;
-define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
-  %r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b)
-  ret <2 x bfloat> %r
-}
-
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s
+; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %}
+; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: test_ret_const(
+; CHECK:     mov.b32         [[T:%r[0-9+]]], 1073758080;
+; CHECK:     st.param.b32    [func_retval0+0], [[T]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_ret_const() #0 {
+  ret <2 x bfloat> <bfloat 1.0, bfloat 2.0>
+}
+
+; Check that we can lower fadd with immediate arguments.
+; CHECK-LABEL: test_fadd_imm_0(
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fadd_imm_0_param_0];
+;
+; SM90-DAG:        mov.b32        [[I:%r[0-9+]]], 1073758080;
+; SM90-DAG:        add.rn.bf16x2   [[R:%r[0-9]+]], [[A]], [[I]];
+;
+; SM80-DAG:  mov.b32        {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; SM80-DAG:  cvt.f32.bf16    [[FA0:%f[0-9]+]], [[A0]]
+; SM80-DAG:  cvt.f32.bf16    [[FA1:%f[0-9]+]], [[A1]]
+; SM80-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
+; SM80-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
+; SM80-DAG:  cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]]
+; SM80-DAG:  cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]]
+; SM80-DAG:  mov.b32        [[R:%r[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fadd_imm_0(<2 x bfloat> %a) #0 {
+  %r = fadd <2 x bfloat> <bfloat 1.0, bfloat 2.0>, %a
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fadd_imm_1(
+; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fadd_imm_1_param_0];
+; SM90:       mov.b16         [[B:%rs[0-9]+]], 0x3F80;
+; SM90:       add.rn.bf16     [[R:%rs[0-9]+]], [[A]], [[B]];
+
+; SM80-DAG:   cvt.f32.bf16    [[FA:%f[0-9]+]], [[A]];
+; SM80:       add.rn.f32      [[FR:%f[0-9]+]], [[FA]], 0f3F800000;
+; SM80:       cvt.rn.bf16.f32 [[R:%rs[0-9]+]], [[FR]];
+
+; CHECK:      st.param.b16    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd_imm_1(bfloat %a) #0 {
+  %r = fadd bfloat %a, 1.0
+  ret bfloat %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fsubx2_param_1];
+; SM90:       sub.rn.bf16x2   [[R:%r[0-9]+]], [[A]], [[B]];
+
+; SM80-DAG:   mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
+; SM80-DAG:   mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]];
+; SM80-DAG:   cvt.f32.bf16    [[FA1:%f[0-9]+]], [[A1]];
+; SM80-DAG:   cvt.f32.bf16    [[FA0:%f[0-9]+]], [[A0]];
+; SM80-DAG:   cvt.f32.bf16    [[FB0:%f[0-9]+]], [[B0]];
+; SM80-DAG:   cvt.f32.bf16    [[FB1:%f[0-9]+]], [[B1]];
+; SM80-DAG:   sub.rn.f32      [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; SM80-DAG:   sub.rn.f32      [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; SM80-DAG:   cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]];
+; SM80-DAG:   cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]];
+; SM80:       mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]};
+
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fmulx2_param_1];
+; SM90:       mul.rn.bf16x2   [[R:%r[0-9]+]], [[A]], [[B]];
+
+; SM80-DAG:   mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
+; SM80-DAG:   mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]];
+; SM80-DAG:   cvt.f32.bf16    [[FA1:%f[0-9]+]], [[A1]];
+; SM80-DAG:   cvt.f32.bf16    [[FA0:%f[0-9]+]], [[A0]];
+; SM80-DAG:   cvt.f32.bf16    [[FB0:%f[0-9]+]], [[B0]];
+; SM80-DAG:   cvt.f32.bf16    [[FB1:%f[0-9]+]], [[B1]];
+; SM80-DAG:   mul.rn.f32      [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; SM80-DAG:   mul.rn.f32      [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; SM80-DAG:   cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[FR0]];
+; SM80-DAG:   cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[FR1]];
+; SM80:       mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]};
+
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+
+define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16     [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16     [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16     [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16     [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32      [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32      [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fneg(
+; CHECK-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_fneg_param_0];
+
+; CHECK-DAG:        xor.b32        [[IHH0:%r[0-9]+]], [[A]], -2147450880;
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[IHH0]];
+; CHECK-NEXT: ret;
+define <2 x bfloat> @test_fneg(<2 x bfloat> %a) #0 {
+  %r = fneg <2 x bfloat> %a
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: .func test_ldst_v2bf16(
+; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v2bf16_param_0];
+; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v2bf16_param_1];
+; CHECK-DAG:    ld.b32          [[E:%r[0-9]+]], [%[[A]]]
+; CHECK-DAG:    st.b32          [%[[B]]], [[E]];
+; CHECK:        ret;
+define void @test_ldst_v2bf16(ptr %a, ptr %b) {
+  %t1 = load <2 x bfloat>, ptr %a
+  store <2 x bfloat> %t1, ptr %b, align 16
+  ret void
+}
+
+; CHECK-LABEL: .func test_ldst_v3bf16(
+; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v3bf16_param_0];
+; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v3bf16_param_1];
+; -- v3 is inconvenient to capture as it's lowered as ld.b64 + fair
+;    number of bitshifting instructions that may change at llvm's whim.
+;    So we only verify that we only issue correct number of writes using
+;    correct offset, but not the values we write.
+; CHECK-DAG:    ld.u64
+; CHECK-DAG:    st.u32          [%[[B]]],
+; CHECK-DAG:    st.b16          [%[[B]]+4],
+; CHECK:        ret;
+define void @test_ldst_v3bf16(ptr %a, ptr %b) {
+  %t1 = load <3 x bfloat>, ptr %a
+  store <3 x bfloat> %t1, ptr %b, align 16
+  ret void
+}
+
+declare <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b) #0
+
+; CHECK-LABEL: test_call(
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_call_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_call_param_1];
+; CHECK:      {
+; CHECK-DAG:  .param .align 4 .b8 param0[4];
+; CHECK-DAG:  .param .align 4 .b8 param1[4];
+; CHECK-DAG:  st.param.b32    [param0+0], [[A]];
+; CHECK-DAG:  st.param.b32    [param1+0], [[B]];
+; CHECK-DAG:  .param .align 4 .b8 retval0[4];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT:        test_callee,
+; CHECK:      );
+; CHECK-NEXT: ld.param.b32    [[R:%r[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = call <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b)
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_select(
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_param_1];
+; CHECK-DAG:  ld.param.u8     [[C:%rs[0-9]+]], [test_select_param_2]
+; CHECK-DAG:  setp.eq.b16     [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
+; CHECK-NEXT: selp.b32        [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_select(<2 x bfloat> %a, <2 x bfloat> %b, i1 zeroext %c) #0 {
+  %r = select i1 %c, <2 x bfloat> %a, <2 x bfloat> %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_select_cc(
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_cc_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_cc_param_1];
+; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_select_cc_param_2];
+; CHECK-DAG:  ld.param.b32    [[D:%r[0-9]+]], [test_select_cc_param_3];
+;
+; SM90:  setp.neu.bf16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
+;
+; SM80-DAG: mov.b32        {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
+; SM80-DAG: mov.b32        {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]]
+; SM80-DAG: cvt.f32.bf16 [[DF0:%f[0-9]+]], [[D0]];
+; SM80-DAG: cvt.f32.bf16 [[CF0:%f[0-9]+]], [[C0]];
+; SM80-DAG: cvt.f32.bf16 [[DF1:%f[0-9]+]], [[D1]];
+; SM80-DAG: cvt.f32.bf16 [[CF1:%f[0-9]+]], [[C1]];
+; SM80-DAG: setp.neu.f32    [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
+; SM80-DAG: setp.neu.f32    [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
+;
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
+; CHECK-DAG:  selp.b16        [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG:  selp.b16        [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_select_cc(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c, <2 x bfloat> %d) #0 {
+  %cc = fcmp une <2 x bfloat> %c, %d
+  %r = select <2 x i1> %cc, <2 x bfloat> %a, <2 x bfloat> %b
+  ret <2 x bfloat> %r
+}
+
+
+; CHECK-LABEL: test_select_cc_f32_bf16(
+; CHECK-DAG:  ld.param.v2.f32    {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_bf16_param_0];
+; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_select_cc_f32_bf16_param_2];
+; CHECK-DAG:  ld.param.b32    [[D:%r[0-9]+]], [test_select_cc_f32_bf16_param_3];
+; SM90:  setp.neu.bf16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
+; CHECK-DAG:  ld.param.v2.f32    {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_bf16_param_1];
+
+; SM80-DAG: mov.b32         {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]]
+; SM80-DAG: mov.b32         {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]]
+; SM80-DAG: cvt.f32.bf16 [[DF0:%f[0-9]+]], [[D0]];
+; SM80-DAG: cvt.f32.bf16 [[CF0:%f[0-9]+]], [[C0]];
+; SM80-DAG: cvt.f32.bf16 [[DF1:%f[0-9]+]], [[D1]];
+; SM80-DAG: cvt.f32.bf16 [[CF1:%f[0-9]+]], [[C1]];
+; SM80-DAG: setp.neu.f32    [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
+; SM80-DAG: setp.neu.f32    [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
+;
+; CHECK-DAG: selp.f32        [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG: selp.f32        [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK-NEXT: st.param.v2.f32    [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b,
+                                           <2 x bfloat> %c, <2 x bfloat> %d) #0 {
+  %cc = fcmp une <2 x bfloat> %c, %d
+  %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
+  ret <2 x float> %r
+}
+
+; CHECK-LABEL: test_select_cc_bf16_f32(
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_select_cc_bf16_f32_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_select_cc_bf16_f32_param_1];
+; CHECK-DAG:  ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_bf16_f32_param_2];
+; CHECK-DAG:  ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_bf16_f32_param_3];
+; CHECK-DAG:  setp.neu.f32    [[P0:%p[0-9]+]], [[C0]], [[D0]]
+; CHECK-DAG:  setp.neu.f32    [[P1:%p[0-9]+]], [[C1]], [[D1]]
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
+; CHECK-DAG:  selp.b16        [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG:  selp.b16        [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b,
+                                          <2 x float> %c, <2 x float> %d) #0 {
+  %cc = fcmp une <2 x float> %c, %d
+  %r = select <2 x i1> %cc, <2 x bfloat> %a, <2 x bfloat> %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fptrunc_2xfloat(
+; CHECK:      ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[A1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
+  %r = fptrunc <2 x float> %a to <2 x bfloat>
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fpext_2xfloat(
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_fpext_2xfloat_param_0];
+; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  cvt.f32.bf16     [[R0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16     [[R1:%f[0-9]+]], [[A1]];
+; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK:      ret;
+define <2 x float> @test_fpext_2xfloat(<2 x bfloat> %a) #0 {
+  %r = fpext <2 x bfloat> %a to <2 x float>
+  ret <2 x float> %r
+}
+
+; CHECK-LABEL: test_bitcast_2xbf16_to_2xi16(
+; CHECK:      ld.param.u32    [[A:%r[0-9]+]], [test_bitcast_2xbf16_to_2xi16_param_0];
+; CHECK:      st.param.b32 [func_retval0+0], [[A]]
+; CHECK:      ret;
+define <2 x i16> @test_bitcast_2xbf16_to_2xi16(<2 x bfloat> %a) #0 {
+  %r = bitcast <2 x bfloat> %a to <2 x i16>
+  ret <2 x i16> %r
+}
+
+
+; CHECK-LABEL: test_bitcast_2xi16_to_2xbf16(
+; CHECK:      ld.param.b32     [[R]], [test_bitcast_2xi16_to_2xbf16_param_0];
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x bfloat> @test_bitcast_2xi16_to_2xbf16(<2 x i16> %a) #0 {
+  %r = bitcast <2 x i16> %a to <2 x bfloat>
+  ret <2 x bfloat> %r
+}
+
+declare <2 x bfloat> @llvm.sqrt.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.powi.f16(<2 x bfloat> %a, <2 x i32> %b) #0
+declare <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.pow.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
+declare <2 x bfloat> @llvm.exp.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.exp2.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.log.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.log10.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.log2.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.fma.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0
+declare <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.minnum.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
+declare <2 x bfloat> @llvm.maxnum.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
+declare <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b) #0
+declare <2 x bfloat> @llvm.floor.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.ceil.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.trunc.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.rint.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.nearbyint.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a) #0
+declare <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0
+
+
+; CHECK-LABEL: test_sqrt(
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_sqrt_param_0];
+; CHECK:      mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  cvt.f32.bf16     [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16     [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  sqrt.rn.f32     [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG:  sqrt.rn.f32     [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x bfloat> @test_sqrt(<2 x bfloat> %a) #0 {
+  %r = call <2 x bfloat> @llvm.sqrt.f16(<2 x bfloat> %a)
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmuladd(
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_fmuladd_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_fmuladd_param_1];
+; CHECK-DAG:  ld.param.b32    [[C:%r[0-9]+]], [test_fmuladd_param_2];
+;
+; CHECK:       fma.rn.bf16x2   [[RA:%r[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[RA]];
+; CHECK:      ret;
+define <2 x bfloat> @test_fmuladd(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
+  %r = call <2 x bfloat> @llvm.fmuladd.f16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fabs(
+; CHECK:      ld.param.u32    [[A:%r[0-9]+]], [test_fabs_param_0];
+; CHECK:      and.b32         [[R:%r[0-9]+]], [[A]], 2147450879;
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 {
+  %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %a)
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fabs_add(
+; CHECK:      abs.bf16x2
+; CHECK:      ret;
+define <2 x bfloat> @test_fabs_add(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %s = fadd <2 x bfloat> %a, %a
+  %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %s)
+  %d = fadd <2 x bfloat> %r, %b
+  ret <2 x bfloat> %d
+}
+
+
+; CHECK-LABEL: test_minnum(
+; CHECK-DAG:  ld.param.b32    [[AF0:%r[0-9]+]], [test_minnum_param_0];
+; CHECK-DAG:  ld.param.b32    [[BF0:%r[0-9]+]], [test_minnum_param_1];
+; CHECK-DAG:  min.bf16x2         [[RF0:%r[0-9]+]], [[AF0]], [[BF0]];
+; CHECK:      st.param.b32    [func_retval0+0], [[RF0]];
+; CHECK:      ret;
+define <2 x bfloat> @test_minnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = call <2 x bfloat> @llvm.minnum.f16(<2 x bfloat> %a, <2 x bfloat> %b)
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_maxnum(
+; CHECK-DAG:  ld.param.b32    [[AF0:%r[0-9]+]], [test_maxnum_param_0];
+; CHECK-DAG:  ld.param.b32    [[BF0:%r[0-9]+]], [test_maxnum_param_1];
+; CHECK-DAG:  max.bf16x2         [[RF0:%r[0-9]+]], [[AF0]], [[BF0]];
+; CHECK:      st.param.b32    [func_retval0+0], [[RF0]];
+; CHECK:      ret;
+define <2 x bfloat> @test_maxnum(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = call <2 x bfloat> @llvm.maxnum.f16(<2 x bfloat> %a, <2 x bfloat> %b)
+  ret <2 x bfloat> %r
+}
+
+
+
+; CHECK-LABEL: test_floor(
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_floor_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
+; SM90:  cvt.rmi.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]];
+; SM90:  cvt.rmi.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]];
+; SM80-DAG:   cvt.f32.bf16     [[FA0:%f[0-9]+]], [[A0]];
+; SM80-DAG:   cvt.f32.bf16     [[FA1:%f[0-9]+]], [[A1]];
+; SM80-DAG:  cvt.rmi.f32.f32 [[RF0:%f[0-9]+]], [[FA0]];
+; SM80-DAG:  cvt.rmi.f32.f32 [[RF1:%f[0-9]+]], [[FA1]];
+; SM80-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
+; SM80-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x bfloat> @test_floor(<2 x bfloat> %a) #0 {
+  %r = call <2 x bfloat> @llvm.floor.f16(<2 x bfloat> %a)
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_ceil(
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_ceil_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
+; SM90:  cvt.rpi.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]];
+; SM90:  cvt.rpi.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]];
+; SM80-DAG:   cvt.f32.bf16     [[FA0:%f[0-9]+]], [[A0]];
+; SM80-DAG:   cvt.f32.bf16     [[FA1:%f[0-9]+]], [[A1]];
+; SM80-DAG:   cvt.rpi.f32.f32 [[RF0:%f[0-9]+]], [[FA0]];
+; SM80-DAG:   cvt.rpi.f32.f32 [[RF1:%f[0-9]+]], [[FA1]];
+; SM80-DAG:  cvt.rn.bf16.f32  [[R0:%rs[0-9]+]], [[RF0]];
+; SM80-DAG:  cvt.rn.bf16.f32  [[R1:%rs[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x bfloat> @test_ceil(<2 x bfloat> %a) #0 {
+  %r = call <2 x bfloat> @llvm.ceil.f16(<2 x bfloat> %a)
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_trunc(
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_trunc_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
+; SM90:  cvt.rzi.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]];
+; SM90:  cvt.rzi.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x bfloat> @test_trunc(<2 x bfloat> %a) #0 {
+  %r = call <2 x bfloat> @llvm.trunc.f16(<2 x bfloat> %a)
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_rint(
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_rint_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]];
+; SM90:  cvt.rni.bf16.bf16 [[R1:%rs[0-9]+]], [[A1]];
+; SM90:  cvt.rni.bf16.bf16 [[R0:%rs[0-9]+]], [[A0]];
+; CHECK:      mov.b32         [[R:%r[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x bfloat> @test_rint(<2 x bfloat> %a) #0 {
+  %r = call <2 x bfloat> @llvm.rint.f16(<2 x bfloat> %a)
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_round(
+; CHECK:      ld.param.b32    {{.*}}, [test_round_param_0];
+; check the use of sign mask and 0.5 to implement round
+; CHECK:      and.b32 [[R1:%r[0-9]+]], {{.*}}, -2147483648;
+; CHECK:      or.b32 {{.*}}, [[R1]], 1056964608;
+; CHECK:      and.b32 [[R2:%r[0-9]+]], {{.*}}, -2147483648;
+; CHECK:      or.b32 {{.*}}, [[R2]], 1056964608;
+; CHECK:      st.param.b32    [func_retval0+0], {{.*}};
+; CHECK:      ret;
+define <2 x bfloat> @test_round(<2 x bfloat> %a) #0 {
+  %r = call <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a)
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_copysign(
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_copysign_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%r[0-9]+]], [test_copysign_param_1];
+; CHECK-DAG:  mov.b32         {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]]
+; CHECK-DAG:  abs.bf16        [[AW1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG:  neg.bf16        [[AY1:%rs[0-9]+]], [[AW1]];
+; CHECK-DAG:  shr.u16         [[BS1:%rs[0-9]+]], [[B1]], 15;
+; CHECK-DAG:  and.b16         [[BR1:%rs[0-9]+]], [[BS1]], 1;
+; CHECK-DAG:  setp.eq.b16     [[P1:%p[0-9]+]], [[BR1]], 1;
+; CHECK-DAG:  selp.b16        [[RS1:%rs[0-9]+]], [[AY1]], [[AW1]], [[P1]]
+; CHECK-DAG:  abs.bf16        [[AW0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG:  neg.bf16        [[AY0:%rs[0-9]+]], [[AW0]];
+; CHECK-DAG:  shr.u16         [[BS0:%rs[0-9]+]], [[B0]], 15;
+; CHECK-DAG:  and.b16         [[BR0:%rs[0-9]+]], [[BS0]], 1;
+; CHECK-DAG:  setp.eq.b16     [[P0:%p[0-9]+]], [[BR0]], 1;
+; CHECK-DAG:  selp.b16        [[RS0:%rs[0-9]+]], [[AY0]], [[AW0]], [[P0]]
+; CHECK-DAG:  mov.b32         [[R:%r[0-9]+]], {[[RS0]], [[RS1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b)
+  ret <2 x bfloat> %r
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/zeroext-32bit.ll b/llvm/test/CodeGen/NVPTX/zeroext-32bit.ll
index 371543e30591..03a6626b9af2 100644
--- a/llvm/test/CodeGen/NVPTX/zeroext-32bit.ll
+++ b/llvm/test/CodeGen/NVPTX/zeroext-32bit.ll
@@ -1,27 +1,27 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %}
-
-; The zeroext attribute below should be silently ignored because
-; we can pass a 32-bit integer across a function call without
-; needing to extend it.
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-target triple = "nvptx64-unknown-cuda"
-
-; CHECK-LABEL: .visible .func zeroext_test
-; CHECK-NOT: cvt.u32.u16
-define void @zeroext_test()  {
-  tail call void @call1(i32 zeroext 0)
-  ret void
-}
-
-declare void @call1(i32 zeroext)
-
-; CHECK-LABEL: .visible .func signext_test
-; CHECK-NOT: cvt.s32.s16
-define void @signext_test()  {
-  tail call void @call2(i32 zeroext 0)
-  ret void
-}
-
-declare void @call2(i32 zeroext)
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | %ptxas-verify %}
+
+; The zeroext attribute below should be silently ignored because
+; we can pass a 32-bit integer across a function call without
+; needing to extend it.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-cuda"
+
+; CHECK-LABEL: .visible .func zeroext_test
+; CHECK-NOT: cvt.u32.u16
+define void @zeroext_test()  {
+  tail call void @call1(i32 zeroext 0)
+  ret void
+}
+
+declare void @call1(i32 zeroext)
+
+; CHECK-LABEL: .visible .func signext_test
+; CHECK-NOT: cvt.s32.s16
+define void @signext_test()  {
+  tail call void @call2(i32 zeroext 0)
+  ret void
+}
+
+declare void @call2(i32 zeroext)
diff --git a/llvm/test/CodeGen/PowerPC/aix-csr-vector-extabi.ll b/llvm/test/CodeGen/PowerPC/aix-csr-vector-extabi.ll
index 67397e4adf4e..b99ef4904d54 100644
--- a/llvm/test/CodeGen/PowerPC/aix-csr-vector-extabi.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-csr-vector-extabi.ll
@@ -23,92 +23,259 @@ entry:
 
 ; MIR32:         name:            vec_regs
 
-; MIR32-LABEL:   fixedStack:
-; MIR32-NEXT:    - { id: 0, type: spill-slot, offset: -16, size: 16, alignment: 16, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$v31', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 1, type: spill-slot, offset: -96, size: 16, alignment: 16, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$v26', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 2, type: spill-slot, offset: -192, size: 16, alignment: 16, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$v20', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR32-LABEL:  fixedStack:
+; MIR32-NEXT:     - { id: 0, type: spill-slot, offset: -16, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:         callee-saved-register: '$v31', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:         debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:     - { id: 1, type: spill-slot, offset: -32, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:         callee-saved-register: '$v30', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:         debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:     - { id: 2, type: spill-slot, offset: -48, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:         callee-saved-register: '$v29', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:         debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:     - { id: 3, type: spill-slot, offset: -64, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:         callee-saved-register: '$v28', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:         debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:     - { id: 4, type: spill-slot, offset: -80, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:         callee-saved-register: '$v27', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:         debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:     - { id: 5, type: spill-slot, offset: -96, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:         callee-saved-register: '$v26', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:         debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:     - { id: 6, type: spill-slot, offset: -112, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:         callee-saved-register: '$v25', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:         debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:     - { id: 7, type: spill-slot, offset: -128, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:         callee-saved-register: '$v24', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:         debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:     - { id: 8, type: spill-slot, offset: -144, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:         callee-saved-register: '$v23', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:         debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:     - { id: 9, type: spill-slot, offset: -160, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:         callee-saved-register: '$v22', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:         debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:     - { id: 10, type: spill-slot, offset: -176, size: 16, alignment: 16,
+; MIR32-NEXT:         stack-id: default, callee-saved-register: '$v21', callee-saved-restored: true,
+; MIR32-NEXT:         debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:     - { id: 11, type: spill-slot, offset: -192, size: 16, alignment: 16,
+; MIR32-NEXT:         stack-id: default, callee-saved-register: '$v20', callee-saved-restored: true,
+; MIR32-NEXT:         debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
 ; MIR32-NEXT:    stack:
 
-; MIR32:         liveins: $v20, $v26, $v31
+; MIR32: liveins: $v20, $v21, $v22, $v23, $v24, $v25, $v26, $v27, $v28, $v29, $v30, $v31
 
-; MIR32-DAG:     STXVD2X killed $v20, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.2)
-; MIR32-DAG:     STXVD2X killed $v26, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.1)
+; MIR32-DAG:     STXVD2X killed $v20, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.11)
+; MIR32-DAG:     STXVD2X killed $v21, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.10)
+; MIR32-DAG:     STXVD2X killed $v22, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.9)
+; MIR32-DAG:     STXVD2X killed $v23, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.8)
+; MIR32-DAG:     STXVD2X killed $v24, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.7)
+; MIR32-DAG:     STXVD2X killed $v25, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.6)
+; MIR32-DAG:     STXVD2X killed $v26, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.5)
+; MIR32-DAG:     STXVD2X killed $v27, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.4)
+; MIR32-DAG:     STXVD2X killed $v28, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.3)
+; MIR32-DAG:     STXVD2X killed $v29, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.2)
+; MIR32-DAG:     STXVD2X killed $v30, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.1)
 ; MIR32-DAG:     STXVD2X killed $v31, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.0)
 
 ; MIR32:         INLINEASM
 
-; MIR32-DAG:     $v20 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.2)
-; MIR32-DAG:     $v26 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.1)
 ; MIR32-DAG:     $v31 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.0)
+; MIR32-DAG:     $v30 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.1)
+; MIR32-DAG:     $v29 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.2)
+; MIR32-DAG:     $v28 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.3)
+; MIR32-DAG:     $v27 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.4)
+; MIR32-DAG:     $v26 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.5)
+; MIR32-DAG:     $v25 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.6)
+; MIR32-DAG:     $v24 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.7)
+; MIR32-DAG:     $v23 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.8)
+; MIR32-DAG:     $v22 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.9)
+; MIR32-DAG:     $v21 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.10)
+; MIR32-DAG:     $v20 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.11)
 ; MIR32:         BLR implicit $lr, implicit $rm
 
 ; MIR64:         name:            vec_regs
 
 ; MIR64-LABEL:   fixedStack:
-; MIR64-NEXT:    - { id: 0, type: spill-slot, offset: -16, size: 16, alignment: 16, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$v31', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 1, type: spill-slot, offset: -96, size: 16, alignment: 16, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$v26', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 2, type: spill-slot, offset: -192, size: 16, alignment: 16, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$v20', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 0, type: spill-slot, offset: -16, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v31', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 1, type: spill-slot, offset: -32, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v30', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 2, type: spill-slot, offset: -48, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v29', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 3, type: spill-slot, offset: -64, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v28', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 4, type: spill-slot, offset: -80, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v27', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 5, type: spill-slot, offset: -96, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v26', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 6, type: spill-slot, offset: -112, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v25', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 7, type: spill-slot, offset: -128, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v24', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 8, type: spill-slot, offset: -144, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v23', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 9, type: spill-slot, offset: -160, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v22', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 10, type: spill-slot, offset: -176, size: 16, alignment: 16,
+; MIR64-DAG:           stack-id: default, callee-saved-register: '$v21', callee-saved-restored: true,
+; MIR64-DAG:           debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 11, type: spill-slot, offset: -192, size: 16, alignment: 16,
+; MIR64-DAG:           stack-id: default, callee-saved-register: '$v20', callee-saved-restored: true,
+; MIR64-DAG:           debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
 ; MIR64-NEXT:    stack:
 
-; MIR64:         liveins: $v20, $v26, $v31
+; MIR64: liveins: $v20, $v21, $v22, $v23, $v24, $v25, $v26, $v27, $v28, $v29, $v30, $v31
 
-; MIR64-DAG:     STXVD2X killed $v20, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.2)
-; MIR64-DAG:     STXVD2X killed $v26, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.1)
-; MIR64-DAG:     STXVD2X killed $v31, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.0)
+; MIR64-DAG:   STXVD2X killed $v20, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.11)
+; MIR64-DAG:   STXVD2X killed $v21, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.10)
+; MIR64-DAG:   STXVD2X killed $v22, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.9)
+; MIR64-DAG:   STXVD2X killed $v23, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.8)
+; MIR64-DAG:   STXVD2X killed $v24, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.7)
+; MIR64-DAG:   STXVD2X killed $v25, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.6)
+; MIR64-DAG:   STXVD2X killed $v26, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.5)
+; MIR64-DAG:   STXVD2X killed $v27, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.4)
+; MIR64-DAG:   STXVD2X killed $v28, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.3)
+; MIR64-DAG:   STXVD2X killed $v29, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.2)
+; MIR64-DAG:   STXVD2X killed $v30, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.1)
+; MIR64-DAG:   STXVD2X killed $v31, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.0)
 
-; MIR64:         INLINEASM
+; MIR64:       INLINEASM
 
-; MIR64-DAG:     $v20 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.2)
-; MIR64-DAG:     $v26 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.1)
-; MIR64-DAG:     $v31 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.0)
-; MIR64:         BLR8 implicit $lr8, implicit $rm
+; MIR64-DAG:   $v31 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.0)
+; MIR64-DAG:   $v30 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.1)
+; MIR64-DAG:   $v29 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.2)
+; MIR64-DAG:   $v28 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.3)
+; MIR64-DAG:   $v27 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.4)
+; MIR64-DAG:   $v26 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.5)
+; MIR64-DAG:   $v25 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.6)
+; MIR64-DAG:   $v24 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.7)
+; MIR64-DAG:   $v23 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.8)
+; MIR64-DAG:   $v22 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.9)
+; MIR64-DAG:   $v21 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.10)
+; MIR64-DAG:   $v20 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.11)
+; MIR64:       BLR8 implicit $lr8, implicit $rm
 
 
 ; ASM32-LABEL:   .vec_regs:
 
-; ASM32:         li {{[0-9]+}}, -192
-; ASM32-DAG:     stxvd2x 52, 1, {{[0-9]+}}               # 16-byte Folded Spill
-; ASM32-DAG:     li {{[0-9]+}}, -96
-; ASM32-DAG:     stxvd2x 58, 1, {{[0-9]+}}               # 16-byte Folded Spill
-; ASM32-DAG:     li {{[0-9]+}}, -16
-; ASM32-DAG:     stxvd2x 63, 1, {{[0-9]+}}               # 16-byte Folded Spill
-; ASM32:         #APP
-; ASM32-DAG:     #NO_APP
-; ASM32-DAG:     lxvd2x 63, 1, {{[0-9]+}}       # 16-byte Folded Reload
-; ASM32-DAG:     li {{[0-9]+}}, -96
-; ASM32-DAG:     lxvd2x 58, 1, {{[0-9]+}}       # 16-byte Folded Reload
-; ASM32-DAG:     li {{[0-9]+}}, -192
-; ASM32-DAG:     lxvd2x 52, 1, {{[0-9]+}}       # 16-byte Folded Reload
-; ASM32:         blr
+; ASM32-DAG:       li [[FIXEDSTACK11:[0-9]+]], -192
+; ASM32-DAG:       stxvd2x 52, 1, [[FIXEDSTACK11]]                       # 16-byte Folded Spill
+; ASM32-DAG:       li [[FIXEDSTACK10:[0-9]+]], -176
+; ASM32-DAG:       stxvd2x 53, 1, [[FIXEDSTACK10]]                       # 16-byte Folded Spill
+; ASM32-DAG:       li [[FIXEDSTACK9:[0-9]+]], -160
+; ASM32-DAG:       stxvd2x 54, 1, [[FIXEDSTACK9]]                       # 16-byte Folded Spill
+; ASM32-DAG:       li [[FIXEDSTACK8:[0-9]+]], -144
+; ASM32-DAG:       stxvd2x 55, 1, [[FIXEDSTACK8]]                       # 16-byte Folded Spill
+; ASM32-DAG:       li [[FIXEDSTACK7:[0-9]+]], -128
+; ASM32-DAG:       stxvd2x 56, 1, [[FIXEDSTACK7]]                       # 16-byte Folded Spill
+; ASM32-DAG:       li [[FIXEDSTACK6:[0-9]+]], -112
+; ASM32-DAG:       stxvd2x 57, 1, [[FIXEDSTACK6]]                       # 16-byte Folded Spill
+; ASM32-DAG:       li [[FIXEDSTACK5:[0-9]+]], -96
+; ASM32-DAG:       stxvd2x 58, 1, [[FIXEDSTACK5]]                       # 16-byte Folded Spill
+; ASM32-DAG:       li [[FIXEDSTACK4:[0-9]+]], -80
+; ASM32-DAG:       stxvd2x 59, 1, [[FIXEDSTACK4]]                       # 16-byte Folded Spill
+; ASM32-DAG:       li [[FIXEDSTACK3:[0-9]+]], -64
+; ASM32-DAG:       stxvd2x 60, 1, [[FIXEDSTACK3]]                       # 16-byte Folded Spill
+; ASM32-DAG:       li [[FIXEDSTACK2:[0-9]+]], -48
+; ASM32-DAG:       stxvd2x 61, 1, [[FIXEDSTACK2]]                       # 16-byte Folded Spill
+; ASM32-DAG:       li [[FIXEDSTACK1:[0-9]+]], -32
+; ASM32-DAG:       stxvd2x 62, 1, [[FIXEDSTACK1]]                       # 16-byte Folded Spill
+; ASM32-DAG:       li [[FIXEDSTACK0:[0-9]+]], -16
+; ASM32-DAG:       stxvd2x 63, 1, [[FIXEDSTACK0]]                       # 16-byte Folded Spill
+
+; ASM32:           #APP
+; ASM32-NEXT:      #NO_APP
+
+; ASM32-DAG:       lxvd2x 63, 1, [[FIXEDSTACK0]]                        # 16-byte Folded Reload
+; ASM32-DAG:       li [[FIXEDSTACK1:[0-9]+]], -32
+; ASM32-DAG:       lxvd2x 62, 1, [[FIXEDSTACK1]]                        # 16-byte Folded Reload
+; ASM32-DAG:       li [[FIXEDSTACK2:[0-9]+]], -48
+; ASM32-DAG:       lxvd2x 61, 1, [[FIXEDSTACK2]]                        # 16-byte Folded Reload
+; ASM32-DAG:       li [[FIXEDSTACK3:[0-9]+]], -64
+; ASM32-DAG:       lxvd2x 60, 1, [[FIXEDSTACK3]]                        # 16-byte Folded Reload
+; ASM32-DAG:       li [[FIXEDSTACK4:[0-9]+]], -80
+; ASM32-DAG:       lxvd2x 59, 1, [[FIXEDSTACK4]]                        # 16-byte Folded Reload
+; ASM32-DAG:       li [[FIXEDSTACK5:[0-9]+]], -96
+; ASM32-DAG:       lxvd2x 58, 1, [[FIXEDSTACK5]]                        # 16-byte Folded Reload
+; ASM32-DAG:       li [[FIXEDSTACK6:[0-9]+]], -112
+; ASM32-DAG:       lxvd2x 57, 1, [[FIXEDSTACK6]]                        # 16-byte Folded Reload
+; ASM32-DAG:       li [[FIXEDSTACK7:[0-9]+]], -128
+; ASM32-DAG:       lxvd2x 56, 1, [[FIXEDSTACK7]]                        # 16-byte Folded Reload
+; ASM32-DAG:       li [[FIXEDSTACK8:[0-9]+]], -144
+; ASM32-DAG:       lxvd2x 55, 1, [[FIXEDSTACK8]]                        # 16-byte Folded Reload
+; ASM32-DAG:       li [[FIXEDSTACK9:[0-9]+]], -160
+; ASM32-DAG:       lxvd2x 54, 1, [[FIXEDSTACK9]]                        # 16-byte Folded Reload
+; ASM32-DAG:       li [[FIXEDSTACK10:[0-9]+]], -176
+; ASM32-DAG:       lxvd2x 53, 1, [[FIXEDSTACK10]]                       # 16-byte Folded Reload
+; ASM32-DAG:       li [[FIXEDSTACK11:[0-9]+]], -192
+; ASM32-DAG:       lxvd2x 52, 1, [[FIXEDSTACK11]]                       # 16-byte Folded Reload
+; ASM32:           blr
 
 ; ASM64-LABEL:   .vec_regs:
 
-; ASM64-DAG:     li {{[0-9]+}}, -192
-; ASM64-DAG:     stxvd2x 52, 1, {{[0-9]+}}               # 16-byte Folded Spill
-; ASM64-DAG:     li {{[0-9]+}}, -96
-; ASM64-DAG:     stxvd2x 58, 1, {{[0-9]+}}               # 16-byte Folded Spill
-; ASM64-DAG:     li {{[0-9]+}}, -16
-; ASM64-DAG:     stxvd2x {{[0-9]+}}, 1, {{[0-9]+}}      # 16-byte Folded Spill
+; ASM64-DAG:       li [[FIXEDSTACK11:[0-9]+]], -192
+; ASM64-DAG:       stxvd2x 52, 1, [[FIXEDSTACK11]]                   # 16-byte Folded Spill
+; ASM64-DAG:       li [[FIXEDSTACK10:[0-9]+]], -176
+; ASM64-DAG:       stxvd2x 53, 1, [[FIXEDSTACK10]]                   # 16-byte Folded Spill
+; ASM64-DAG:       li [[FIXEDSTACK9:[0-9]+]], -160
+; ASM64-DAG:       stxvd2x 54, 1, [[FIXEDSTACK9]]                    # 16-byte Folded Spill
+; ASM64-DAG:       li [[FIXEDSTACK8:[0-9]+]], -144
+; ASM64-DAG:       stxvd2x 55, 1, [[FIXEDSTACK8]]                    # 16-byte Folded Spill
+; ASM64-DAG:       li [[FIXEDSTACK7:[0-9]+]], -128
+; ASM64-DAG:       stxvd2x 56, 1, [[FIXEDSTACK7]]                    # 16-byte Folded Spill
+; ASM64-DAG:       li [[FIXEDSTACK6:[0-9]+]], -112
+; ASM64-DAG:       stxvd2x 57, 1, [[FIXEDSTACK6]]                    # 16-byte Folded Spill
+; ASM64-DAG:       li [[FIXEDSTACK5:[0-9]+]], -96
+; ASM64-DAG:       stxvd2x 58, 1, [[FIXEDSTACK5]]                    # 16-byte Folded Spill
+; ASM64-DAG:       li [[FIXEDSTACK4:[0-9]+]], -80
+; ASM64-DAG:       stxvd2x 59, 1, [[FIXEDSTACK4]]                    # 16-byte Folded Spill
+; ASM64-DAG:       li [[FIXEDSTACK3:[0-9]+]], -64
+; ASM64-DAG:       stxvd2x 60, 1, [[FIXEDSTACK3]]                    # 16-byte Folded Spill
+; ASM64-DAG:       li [[FIXEDSTACK2:[0-9]+]], -48
+; ASM64-DAG:       stxvd2x 61, 1, [[FIXEDSTACK2]]                    # 16-byte Folded Spill
+; ASM64-DAG:       li [[FIXEDSTACK1:[0-9]+]], -32
+; ASM64-DAG:       stxvd2x 62, 1, [[FIXEDSTACK1]]                    # 16-byte Folded Spill
+; ASM64-DAG:       li [[FIXEDSTACK0:[0-9]+]], -16
+; ASM64-DAG:       stxvd2x 63, 1, [[FIXEDSTACK0]]                    # 16-byte Folded Spill
+
 ; ASM64-DAG:     #APP
 ; ASM64-DAG:     #NO_APP
-; ASM64-DAG:     lxvd2x {{[0-9]+}}, 1, {{[0-9]+}}       # 16-byte Folded Reload
-; ASM64-DAG:     li {{[0-9]+}}, -96
-; ASM64-DAG:     lxvd2x 58, 1, {{[0-9]+}}                # 16-byte Folded Reload
-; ASM64-DAG:     li {{[0-9]+}}, -192
-; ASM64-DAG:     lxvd2x 52, 1, {{[0-9]+}}                # 16-byte Folded Reload
-; ASM64-DAG:     blr
+
+; ASM64-DAG:     lxvd2x 63, 1, [[FIXEDSTACK0]]                         # 16-byte Folded Reload
+; ASM64-DAG:     li [[FIXEDSTACK1:[0-9]+]], -32
+; ASM64-DAG:     lxvd2x 62, 1, [[FIXEDSTACK1]]                         # 16-byte Folded Reload
+; ASM64-DAG:     li [[FIXEDSTACK2:[0-9]+]], -48
+; ASM64-DAG:     lxvd2x 61, 1, [[FIXEDSTACK2]]                         # 16-byte Folded Reload
+; ASM64-DAG:     li [[FIXEDSTACK3:[0-9]+]], -64
+; ASM64-DAG:     lxvd2x 60, 1, [[FIXEDSTACK3]]                         # 16-byte Folded Reload
+; ASM64-DAG:     li [[FIXEDSTACK4:[0-9]+]], -80
+; ASM64-DAG:     lxvd2x 59, 1, [[FIXEDSTACK4]]                         # 16-byte Folded Reload
+; ASM64-DAG:     li [[FIXEDSTACK5:[0-9]+]], -96
+; ASM64-DAG:     lxvd2x 58, 1, [[FIXEDSTACK5]]                         # 16-byte Folded Reload
+; ASM64-DAG:     li [[FIXEDSTACK6:[0-9]+]], -112
+; ASM64-DAG:     lxvd2x 57, 1, [[FIXEDSTACK6]]                         # 16-byte Folded Reload
+; ASM64-DAG:     li [[FIXEDSTACK7:[0-9]+]], -128
+; ASM64-DAG:     lxvd2x 56, 1, [[FIXEDSTACK7]]                         # 16-byte Folded Reload
+; ASM64-DAG:     li [[FIXEDSTACK8:[0-9]+]], -144
+; ASM64-DAG:     lxvd2x 55, 1, [[FIXEDSTACK8]]                         # 16-byte Folded Reload
+; ASM64-DAG:     li [[FIXEDSTACK9:[0-9]+]], -160
+; ASM64-DAG:     lxvd2x 54, 1, [[FIXEDSTACK9]]                         # 16-byte Folded Reload
+; ASM64-DAG:     li [[FIXEDSTACK10:[0-9]+]], -176
+; ASM64-DAG:     lxvd2x 53, 1, [[FIXEDSTACK10]]                        # 16-byte Folded Reload
+; ASM64-DAG:     li [[FIXEDSTACK11:[0-9]+]], -192
+; ASM64-DAG:     lxvd2x 52, 1, [[FIXEDSTACK11]]                        # 16-byte Folded Reload
+
+; ASM64:         blr
 
 define dso_local void @fprs_gprs_vecregs() {
   call void asm sideeffect "", "~{r14},~{r25},~{r31},~{f14},~{f21},~{f31},~{v20},~{v26},~{v31}"()
@@ -118,191 +285,767 @@ define dso_local void @fprs_gprs_vecregs() {
 ; MIR32:         name:            fprs_gprs_vecregs
 
 ; MIR32-LABEL:   fixedStack:
-; MIR32-NEXT:    - { id: 0, type: spill-slot, offset: -240, size: 16, alignment: 16, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$v31', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 1, type: spill-slot, offset: -320, size: 16, alignment: 16, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$v26', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 2, type: spill-slot, offset: -416, size: 16, alignment: 16, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$v20', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 3, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$f31', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 4, type: spill-slot, offset: -88, size: 8, alignment: 8, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$f21', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 5, type: spill-slot, offset: -144, size: 8, alignment: 16, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$f14', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 6, type: spill-slot, offset: -148, size: 4, alignment: 4, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$r31', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 7, type: spill-slot, offset: -172, size: 4, alignment: 4, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$r25', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 8, type: spill-slot, offset: -216, size: 4, alignment: 8, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$r14', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 0, type: spill-slot, offset: -240, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$v31', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 1, type: spill-slot, offset: -256, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$v30', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 2, type: spill-slot, offset: -272, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$v29', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 3, type: spill-slot, offset: -288, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$v28', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 4, type: spill-slot, offset: -304, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$v27', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 5, type: spill-slot, offset: -320, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$v26', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 6, type: spill-slot, offset: -336, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$v25', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 7, type: spill-slot, offset: -352, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$v24', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 8, type: spill-slot, offset: -368, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$v23', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 9, type: spill-slot, offset: -384, size: 16, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$v22', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 10, type: spill-slot, offset: -400, size: 16, alignment: 16,
+; MIR32-NEXT:          stack-id: default, callee-saved-register: '$v21', callee-saved-restored: true,
+; MIR32-NEXT:          debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 11, type: spill-slot, offset: -416, size: 16, alignment: 16,
+; MIR32-NEXT:          stack-id: default, callee-saved-register: '$v20', callee-saved-restored: true,
+; MIR32-NEXT:          debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 12, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f31', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 13, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f30', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 14, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f29', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 15, type: spill-slot, offset: -32, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f28', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 16, type: spill-slot, offset: -40, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f27', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 17, type: spill-slot, offset: -48, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f26', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 18, type: spill-slot, offset: -56, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f25', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 19, type: spill-slot, offset: -64, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f24', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 20, type: spill-slot, offset: -72, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f23', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 21, type: spill-slot, offset: -80, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f22', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 22, type: spill-slot, offset: -88, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f21', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 23, type: spill-slot, offset: -96, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f20', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 24, type: spill-slot, offset: -104, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f19', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 25, type: spill-slot, offset: -112, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f18', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 26, type: spill-slot, offset: -120, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f17', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 27, type: spill-slot, offset: -128, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f16', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 28, type: spill-slot, offset: -136, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f15', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 29, type: spill-slot, offset: -144, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$f14', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 30, type: spill-slot, offset: -148, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r31', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 31, type: spill-slot, offset: -152, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r30', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 32, type: spill-slot, offset: -156, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r29', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 33, type: spill-slot, offset: -160, size: 4, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r28', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 34, type: spill-slot, offset: -164, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r27', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 35, type: spill-slot, offset: -168, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r26', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 36, type: spill-slot, offset: -172, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r25', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 37, type: spill-slot, offset: -176, size: 4, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r24', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 38, type: spill-slot, offset: -180, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r23', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 39, type: spill-slot, offset: -184, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r22', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 40, type: spill-slot, offset: -188, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r21', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 41, type: spill-slot, offset: -192, size: 4, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r20', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 42, type: spill-slot, offset: -196, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r19', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 43, type: spill-slot, offset: -200, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r18', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 44, type: spill-slot, offset: -204, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r17', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 45, type: spill-slot, offset: -208, size: 4, alignment: 16, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r16', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 46, type: spill-slot, offset: -212, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r15', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:      - { id: 47, type: spill-slot, offset: -216, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:          callee-saved-register: '$r14', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:          debug-info-expression: '', debug-info-location: '' }
 ; MIR32-NEXT:    stack:
 
-; MIR32:         liveins: $r14, $r25, $r31, $f14, $f21, $f31, $v20, $v26, $v31
-
-; MIR32-DAG:     STW killed $r14, 232, $r1 :: (store (s32) into %fixed-stack.8, align 8)
-; MIR32-DAG:     STW killed $r25, 276, $r1 :: (store (s32) into %fixed-stack.7)
-; MIR32-DAG:     STW killed $r31, 300, $r1 :: (store (s32) into %fixed-stack.6)
-; MIR32-DAG:     STFD killed $f14, 304, $r1 :: (store (s64) into %fixed-stack.5, align 16)
-; MIR32-DAG:     STFD killed $f21, 360, $r1 :: (store (s64) into %fixed-stack.4)
-; MIR32-DAG:     STFD killed $f31, 440, $r1 :: (store (s64) into %fixed-stack.3)
-; MIR32-DAG:     $r{{[0-9]+}} = LI 32
-; MIR32-DAG:     STXVD2X killed $v20, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.2)
-; MIR32-DAG:     $r{{[0-9]+}} = LI 128
-; MIR32-DAG:     STXVD2X killed $v26, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.1)
-; MIR32-DAG:     $r{{[0-9]+}} = LI 208
+; MIR32: liveins: $r14, $r15, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31, $f14, $f15, $f16, $f17, $f18, $f19, $f20, $f21, $f22, $f23, $f24, $f25, $f26, $f27, $f28, $f29, $f30, $f31, $v20, $v21, $v22, $v23, $v24, $v25, $v26, $v27, $v28, $v29, $v30, $v31
+
+; MIR32-DAG:     STW killed $r14, 232, $r1 :: (store (s32) into %fixed-stack.47, align 8)
+; MIR32-DAG:     STW killed $r15, 236, $r1 :: (store (s32) into %fixed-stack.46)
+; MIR32-DAG:     STW killed $r16, 240, $r1 :: (store (s32) into %fixed-stack.45, align 16)
+; MIR32-DAG:     STW killed $r17, 244, $r1 :: (store (s32) into %fixed-stack.44)
+; MIR32-DAG:     STW killed $r18, 248, $r1 :: (store (s32) into %fixed-stack.43, align 8)
+; MIR32-DAG:     STW killed $r19, 252, $r1 :: (store (s32) into %fixed-stack.42)
+; MIR32-DAG:     STW killed $r20, 256, $r1 :: (store (s32) into %fixed-stack.41, align 16)
+; MIR32-DAG:     STW killed $r21, 260, $r1 :: (store (s32) into %fixed-stack.40)
+; MIR32-DAG:     STW killed $r22, 264, $r1 :: (store (s32) into %fixed-stack.39, align 8)
+; MIR32-DAG:     STW killed $r23, 268, $r1 :: (store (s32) into %fixed-stack.38)
+; MIR32-DAG:     STW killed $r24, 272, $r1 :: (store (s32) into %fixed-stack.37, align 16)
+; MIR32-DAG:     STW killed $r25, 276, $r1 :: (store (s32) into %fixed-stack.36)
+; MIR32-DAG:     STW killed $r26, 280, $r1 :: (store (s32) into %fixed-stack.35, align 8)
+; MIR32-DAG:     STW killed $r27, 284, $r1 :: (store (s32) into %fixed-stack.34)
+; MIR32-DAG:     STW killed $r28, 288, $r1 :: (store (s32) into %fixed-stack.33, align 16)
+; MIR32-DAG:     STW killed $r29, 292, $r1 :: (store (s32) into %fixed-stack.32)
+; MIR32-DAG:     STW killed $r30, 296, $r1 :: (store (s32) into %fixed-stack.31, align 8)
+; MIR32-DAG:     STW killed $r31, 300, $r1 :: (store (s32) into %fixed-stack.30)
+; MIR32-DAG:     STFD killed $f14, 304, $r1 :: (store (s64) into %fixed-stack.29, align 16)
+; MIR32-DAG:     STFD killed $f15, 312, $r1 :: (store (s64) into %fixed-stack.28)
+; MIR32-DAG:     STFD killed $f16, 320, $r1 :: (store (s64) into %fixed-stack.27, align 16)
+; MIR32-DAG:     STFD killed $f17, 328, $r1 :: (store (s64) into %fixed-stack.26)
+; MIR32-DAG:     STFD killed $f18, 336, $r1 :: (store (s64) into %fixed-stack.25, align 16)
+; MIR32-DAG:     STFD killed $f19, 344, $r1 :: (store (s64) into %fixed-stack.24)
+; MIR32-DAG:     STFD killed $f20, 352, $r1 :: (store (s64) into %fixed-stack.23, align 16)
+; MIR32-DAG:     STFD killed $f21, 360, $r1 :: (store (s64) into %fixed-stack.22)
+; MIR32-DAG:     STFD killed $f22, 368, $r1 :: (store (s64) into %fixed-stack.21, align 16)
+; MIR32-DAG:     STFD killed $f23, 376, $r1 :: (store (s64) into %fixed-stack.20)
+; MIR32-DAG:     STFD killed $f24, 384, $r1 :: (store (s64) into %fixed-stack.19, align 16)
+; MIR32-DAG:     STFD killed $f25, 392, $r1 :: (store (s64) into %fixed-stack.18)
+; MIR32-DAG:     STFD killed $f26, 400, $r1 :: (store (s64) into %fixed-stack.17, align 16)
+; MIR32-DAG:     STFD killed $f27, 408, $r1 :: (store (s64) into %fixed-stack.16)
+; MIR32-DAG:     STFD killed $f28, 416, $r1 :: (store (s64) into %fixed-stack.15, align 16)
+; MIR32-DAG:     STFD killed $f29, 424, $r1 :: (store (s64) into %fixed-stack.14)
+; MIR32-DAG:     STFD killed $f30, 432, $r1 :: (store (s64) into %fixed-stack.13, align 16)
+; MIR32-DAG:     STFD killed $f31, 440, $r1 :: (store (s64) into %fixed-stack.12)
+; MIR32-DAG:     STXVD2X killed $v20, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.11)
+; MIR32-DAG:     STXVD2X killed $v21, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.10)
+; MIR32-DAG:     STXVD2X killed $v22, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.9)
+; MIR32-DAG:     STXVD2X killed $v23, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.8)
+; MIR32-DAG:     STXVD2X killed $v24, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.7)
+; MIR32-DAG:     STXVD2X killed $v25, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.6)
+; MIR32-DAG:     STXVD2X killed $v26, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.5)
+; MIR32-DAG:     STXVD2X killed $v27, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.4)
+; MIR32-DAG:     STXVD2X killed $v28, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.3)
+; MIR32-DAG:     STXVD2X killed $v29, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.2)
+; MIR32-DAG:     STXVD2X killed $v30, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.1)
 ; MIR32-DAG:     STXVD2X killed $v31, $r1, killed $r{{[0-9]+}} :: (store (s128) into %fixed-stack.0)
-; MIR32-DAG:     $r1 = STWU $r1, -448, $r1
 
 ; MIR32:         INLINEASM
 
-; MIR32-DAG:     $r14 = LWZ 232, $r1 :: (load (s32) from %fixed-stack.8, align 8)
-; MIR32-DAG:     $r25 = LWZ 276, $r1 :: (load (s32) from %fixed-stack.7)
-; MIR32-DAG:     $r31 = LWZ 300, $r1 :: (load (s32) from %fixed-stack.6)
-; MIR32-DAG:     $f14 = LFD 304, $r1 :: (load (s64) from %fixed-stack.5, align 16)
-; MIR32-DAG:     $f21 = LFD 360, $r1 :: (load (s64) from %fixed-stack.4)
-; MIR32-DAG:     $f31 = LFD 440, $r1 :: (load (s64) from %fixed-stack.3)
-; MIR32-DAG:     $v20 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.2)
-; MIR32-DAG:     $r{{[0-9]+}} = LI 32
-; MIR32-DAG:     $v26 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.1)
-; MIR32-DAG:     $r{{[0-9]+}} = LI 128
-; MIR32-DAG:     $v31 = LXVD2X $r1, killed $r{{[0-9]+}} :: (load (s128) from %fixed-stack.0)
-; MIR32-DAG:     $r{{[0-9]+}} = LI 208
-; MIR32-DAG:     $r1 = ADDI $r1, 448
-; MIR32-DAG:     BLR implicit $lr, implicit $rm
+; MIR32-DAG:     $v31 = LXVD2X $r1, killed $r3 :: (load (s128) from %fixed-stack.0)
+; MIR32-DAG:     $v30 = LXVD2X $r1, killed $r3 :: (load (s128) from %fixed-stack.1)
+; MIR32-DAG:     $v29 = LXVD2X $r1, killed $r3 :: (load (s128) from %fixed-stack.2)
+; MIR32-DAG:     $v28 = LXVD2X $r1, killed $r3 :: (load (s128) from %fixed-stack.3)
+; MIR32-DAG:     $v27 = LXVD2X $r1, killed $r3 :: (load (s128) from %fixed-stack.4)
+; MIR32-DAG:     $v26 = LXVD2X $r1, killed $r3 :: (load (s128) from %fixed-stack.5)
+; MIR32-DAG:     $v25 = LXVD2X $r1, killed $r3 :: (load (s128) from %fixed-stack.6)
+; MIR32-DAG:     $v24 = LXVD2X $r1, killed $r3 :: (load (s128) from %fixed-stack.7)
+; MIR32-DAG:     $v23 = LXVD2X $r1, killed $r3 :: (load (s128) from %fixed-stack.8)
+; MIR32-DAG:     $v22 = LXVD2X $r1, killed $r3 :: (load (s128) from %fixed-stack.9)
+; MIR32-DAG:     $v21 = LXVD2X $r1, killed $r3 :: (load (s128) from %fixed-stack.10)
+; MIR32-DAG:     $v20 = LXVD2X $r1, killed $r3 :: (load (s128) from %fixed-stack.11)
+; MIR32-DAG:     $f31 = LFD 440, $r1 :: (load (s64) from %fixed-stack.12)
+; MIR32-DAG:     $f30 = LFD 432, $r1 :: (load (s64) from %fixed-stack.13, align 16)
+; MIR32-DAG:     $f29 = LFD 424, $r1 :: (load (s64) from %fixed-stack.14)
+; MIR32-DAG:     $f28 = LFD 416, $r1 :: (load (s64) from %fixed-stack.15, align 16)
+; MIR32-DAG:     $f27 = LFD 408, $r1 :: (load (s64) from %fixed-stack.16)
+; MIR32-DAG:     $f26 = LFD 400, $r1 :: (load (s64) from %fixed-stack.17, align 16)
+; MIR32-DAG:     $f25 = LFD 392, $r1 :: (load (s64) from %fixed-stack.18)
+; MIR32-DAG:     $f24 = LFD 384, $r1 :: (load (s64) from %fixed-stack.19, align 16)
+; MIR32-DAG:     $f23 = LFD 376, $r1 :: (load (s64) from %fixed-stack.20)
+; MIR32-DAG:     $f22 = LFD 368, $r1 :: (load (s64) from %fixed-stack.21, align 16)
+; MIR32-DAG:     $f21 = LFD 360, $r1 :: (load (s64) from %fixed-stack.22)
+; MIR32-DAG:     $f20 = LFD 352, $r1 :: (load (s64) from %fixed-stack.23, align 16)
+; MIR32-DAG:     $f19 = LFD 344, $r1 :: (load (s64) from %fixed-stack.24)
+; MIR32-DAG:     $f18 = LFD 336, $r1 :: (load (s64) from %fixed-stack.25, align 16)
+; MIR32-DAG:     $f17 = LFD 328, $r1 :: (load (s64) from %fixed-stack.26)
+; MIR32-DAG:     $f16 = LFD 320, $r1 :: (load (s64) from %fixed-stack.27, align 16)
+; MIR32-DAG:     $f15 = LFD 312, $r1 :: (load (s64) from %fixed-stack.28)
+; MIR32-DAG:     $f14 = LFD 304, $r1 :: (load (s64) from %fixed-stack.29, align 16)
+; MIR32-DAG:     $r31 = LWZ 300, $r1 :: (load (s32) from %fixed-stack.30)
+; MIR32-DAG:     $r30 = LWZ 296, $r1 :: (load (s32) from %fixed-stack.31, align 8)
+; MIR32-DAG:     $r29 = LWZ 292, $r1 :: (load (s32) from %fixed-stack.32)
+; MIR32-DAG:     $r28 = LWZ 288, $r1 :: (load (s32) from %fixed-stack.33, align 16)
+; MIR32-DAG:     $r27 = LWZ 284, $r1 :: (load (s32) from %fixed-stack.34)
+; MIR32-DAG:     $r26 = LWZ 280, $r1 :: (load (s32) from %fixed-stack.35, align 8)
+; MIR32-DAG:     $r25 = LWZ 276, $r1 :: (load (s32) from %fixed-stack.36)
+; MIR32-DAG:     $r24 = LWZ 272, $r1 :: (load (s32) from %fixed-stack.37, align 16)
+; MIR32-DAG:     $r23 = LWZ 268, $r1 :: (load (s32) from %fixed-stack.38)
+; MIR32-DAG:     $r22 = LWZ 264, $r1 :: (load (s32) from %fixed-stack.39, align 8)
+; MIR32-DAG:     $r21 = LWZ 260, $r1 :: (load (s32) from %fixed-stack.40)
+; MIR32-DAG:     $r20 = LWZ 256, $r1 :: (load (s32) from %fixed-stack.41, align 16)
+; MIR32-DAG:     $r19 = LWZ 252, $r1 :: (load (s32) from %fixed-stack.42)
+; MIR32-DAG:     $r18 = LWZ 248, $r1 :: (load (s32) from %fixed-stack.43, align 8)
+; MIR32-DAG:     $r17 = LWZ 244, $r1 :: (load (s32) from %fixed-stack.44)
+; MIR32-DAG:     $r16 = LWZ 240, $r1 :: (load (s32) from %fixed-stack.45, align 16)
+; MIR32-DAG:     $r15 = LWZ 236, $r1 :: (load (s32) from %fixed-stack.46)
+; MIR32-DAG:     $r14 = LWZ 232, $r1 :: (load (s32) from %fixed-stack.47, align 8)
+; MIR32:         $r1 = ADDI $r1, 448
+; MIR32-NEXT:    BLR implicit $lr, implicit $rm
+
 
 ; MIR64:         name:            fprs_gprs_vecregs
 
 ; MIR64-LABEL:   fixedStack:
-; MIR64-NEXT:    - { id: 0, type: spill-slot, offset: -304, size: 16, alignment: 16, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$v31', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 1, type: spill-slot, offset: -384, size: 16, alignment: 16, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$v26', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 2, type: spill-slot, offset: -480, size: 16, alignment: 16, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$v20', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 3, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$f31', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 4, type: spill-slot, offset: -88, size: 8, alignment: 8, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$f21', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 5, type: spill-slot, offset: -144, size: 8, alignment: 16, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$f14', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 6, type: spill-slot, offset: -152, size: 8, alignment: 8, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$x31', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 7, type: spill-slot, offset: -200, size: 8, alignment: 8, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$x25', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 8, type: spill-slot, offset: -288, size: 8, alignment: 16, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$x14', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 0, type: spill-slot, offset: -304, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v31', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 1, type: spill-slot, offset: -320, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v30', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 2, type: spill-slot, offset: -336, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v29', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 3, type: spill-slot, offset: -352, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v28', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 4, type: spill-slot, offset: -368, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v27', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 5, type: spill-slot, offset: -384, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v26', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 6, type: spill-slot, offset: -400, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v25', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 7, type: spill-slot, offset: -416, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v24', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 8, type: spill-slot, offset: -432, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v23', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 9, type: spill-slot, offset: -448, size: 16, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$v22', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 10, type: spill-slot, offset: -464, size: 16, alignment: 16,
+; MIR64-DAG:           stack-id: default, callee-saved-register: '$v21', callee-saved-restored: true,
+; MIR64-DAG:           debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 11, type: spill-slot, offset: -480, size: 16, alignment: 16,
+; MIR64-DAG:           stack-id: default, callee-saved-register: '$v20', callee-saved-restored: true,
+; MIR64-DAG:           debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 12, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f31', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 13, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f30', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 14, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f29', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 15, type: spill-slot, offset: -32, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f28', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 16, type: spill-slot, offset: -40, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f27', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 17, type: spill-slot, offset: -48, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f26', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 18, type: spill-slot, offset: -56, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f25', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 19, type: spill-slot, offset: -64, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f24', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 20, type: spill-slot, offset: -72, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f23', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 21, type: spill-slot, offset: -80, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f22', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 22, type: spill-slot, offset: -88, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f21', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 23, type: spill-slot, offset: -96, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f20', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 24, type: spill-slot, offset: -104, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f19', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 25, type: spill-slot, offset: -112, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f18', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 26, type: spill-slot, offset: -120, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f17', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 27, type: spill-slot, offset: -128, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f16', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 28, type: spill-slot, offset: -136, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f15', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 29, type: spill-slot, offset: -144, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$f14', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 30, type: spill-slot, offset: -152, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x31', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 31, type: spill-slot, offset: -160, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x30', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 32, type: spill-slot, offset: -168, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x29', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 33, type: spill-slot, offset: -176, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x28', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 34, type: spill-slot, offset: -184, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x27', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 35, type: spill-slot, offset: -192, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x26', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 36, type: spill-slot, offset: -200, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x25', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 37, type: spill-slot, offset: -208, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x24', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 38, type: spill-slot, offset: -216, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x23', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 39, type: spill-slot, offset: -224, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x22', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 40, type: spill-slot, offset: -232, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x21', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 41, type: spill-slot, offset: -240, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x20', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 42, type: spill-slot, offset: -248, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x19', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 43, type: spill-slot, offset: -256, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x18', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 44, type: spill-slot, offset: -264, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x17', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 45, type: spill-slot, offset: -272, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x16', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 46, type: spill-slot, offset: -280, size: 8, alignment: 8, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x15', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
+; MIR64-DAG:       - { id: 47, type: spill-slot, offset: -288, size: 8, alignment: 16, stack-id: default,
+; MIR64-DAG:           callee-saved-register: '$x14', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-DAG:           debug-info-expression: '', debug-info-location: '' }
 ; MIR64-NEXT:    stack:
 
-; MIR64:         liveins: $x14, $x25, $x31, $f14, $f21, $f31, $v20, $v26, $v31
-
-; MIR64-DAG:     $x1 = STDU $x1, -544, $x1
-; MIR64-DAG:     STD killed $x14, 256, $x1 :: (store (s64) into %fixed-stack.8, align 16)
-; MIR64-DAG:     STD killed $x25, 344, $x1 :: (store (s64) into %fixed-stack.7)
-; MIR64-DAG:     STD killed $x31, 392, $x1 :: (store (s64) into %fixed-stack.6)
-; MIR64-DAG:     STFD killed $f14, 400, $x1 :: (store (s64) into %fixed-stack.5, align 16)
-; MIR64-DAG:     STFD killed $f21, 456, $x1 :: (store (s64) into %fixed-stack.4)
-; MIR64-DAG:     STFD killed $f31, 536, $x1 :: (store (s64) into %fixed-stack.3)
-; MIR64-DAG:     $x{{[0-9]+}} = LI8 64
-; MIR64-DAG:     STXVD2X killed $v20, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.2)
-; MIR64-DAG:     $x{{[0-9]+}} = LI8 160
-; MIR64-DAG:     STXVD2X killed $v26, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.1)
-; MIR64-DAG:     $x{{[0-9]+}} = LI8 240
-; MIR64-DAG:     STXVD2X killed $v31, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.0)
+; MIR64: liveins: $x14, $x15, $x16, $x17, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $x29, $x30, $x31, $f14, $f15, $f16, $f17, $f18, $f19, $f20, $f21, $f22, $f23, $f24, $f25, $f26, $f27, $f28, $f29, $f30, $f31, $v20, $v21, $v22, $v23, $v24, $v25, $v26, $v27, $v28, $v29, $v30, $v31
+
+; MIR64:         $x1 = STDU $x1, -544, $x1
+;MIR64-DAG:      STD killed $x14, 256, $x1 :: (store (s64) into %fixed-stack.47, align 16)
+;MIR64-DAG:      STD killed $x15, 264, $x1 :: (store (s64) into %fixed-stack.46)
+;MIR64-DAG:      STD killed $x16, 272, $x1 :: (store (s64) into %fixed-stack.45, align 16)
+;MIR64-DAG:      STD killed $x17, 280, $x1 :: (store (s64) into %fixed-stack.44)
+;MIR64-DAG:      STD killed $x18, 288, $x1 :: (store (s64) into %fixed-stack.43, align 16)
+;MIR64-DAG:      STD killed $x19, 296, $x1 :: (store (s64) into %fixed-stack.42)
+;MIR64-DAG:      STD killed $x20, 304, $x1 :: (store (s64) into %fixed-stack.41, align 16)
+;MIR64-DAG:      STD killed $x21, 312, $x1 :: (store (s64) into %fixed-stack.40)
+;MIR64-DAG:      STD killed $x22, 320, $x1 :: (store (s64) into %fixed-stack.39, align 16)
+;MIR64-DAG:      STD killed $x23, 328, $x1 :: (store (s64) into %fixed-stack.38)
+;MIR64-DAG:      STD killed $x24, 336, $x1 :: (store (s64) into %fixed-stack.37, align 16)
+;MIR64-DAG:      STD killed $x25, 344, $x1 :: (store (s64) into %fixed-stack.36)
+;MIR64-DAG:      STD killed $x26, 352, $x1 :: (store (s64) into %fixed-stack.35, align 16)
+;MIR64-DAG:      STD killed $x27, 360, $x1 :: (store (s64) into %fixed-stack.34)
+;MIR64-DAG:      STD killed $x28, 368, $x1 :: (store (s64) into %fixed-stack.33, align 16)
+;MIR64-DAG:      STD killed $x29, 376, $x1 :: (store (s64) into %fixed-stack.32)
+;MIR64-DAG:      STD killed $x30, 384, $x1 :: (store (s64) into %fixed-stack.31, align 16)
+;MIR64-DAG:      STD killed $x31, 392, $x1 :: (store (s64) into %fixed-stack.30)
+;MIR64-DAG:      STFD killed $f14, 400, $x1 :: (store (s64) into %fixed-stack.29, align 16)
+;MIR64-DAG:      STFD killed $f15, 408, $x1 :: (store (s64) into %fixed-stack.28)
+;MIR64-DAG:      STFD killed $f16, 416, $x1 :: (store (s64) into %fixed-stack.27, align 16)
+;MIR64-DAG:      STFD killed $f17, 424, $x1 :: (store (s64) into %fixed-stack.26)
+;MIR64-DAG:      STFD killed $f18, 432, $x1 :: (store (s64) into %fixed-stack.25, align 16)
+;MIR64-DAG:      STFD killed $f19, 440, $x1 :: (store (s64) into %fixed-stack.24)
+;MIR64-DAG:      STFD killed $f20, 448, $x1 :: (store (s64) into %fixed-stack.23, align 16)
+;MIR64-DAG:      STFD killed $f21, 456, $x1 :: (store (s64) into %fixed-stack.22)
+;MIR64-DAG:      STFD killed $f22, 464, $x1 :: (store (s64) into %fixed-stack.21, align 16)
+;MIR64-DAG:      STFD killed $f23, 472, $x1 :: (store (s64) into %fixed-stack.20)
+;MIR64-DAG:      STFD killed $f24, 480, $x1 :: (store (s64) into %fixed-stack.19, align 16)
+;MIR64-DAG:      STFD killed $f25, 488, $x1 :: (store (s64) into %fixed-stack.18)
+;MIR64-DAG:      STFD killed $f26, 496, $x1 :: (store (s64) into %fixed-stack.17, align 16)
+;MIR64-DAG:      STFD killed $f27, 504, $x1 :: (store (s64) into %fixed-stack.16)
+;MIR64-DAG:      STFD killed $f28, 512, $x1 :: (store (s64) into %fixed-stack.15, align 16)
+;MIR64-DAG:      STFD killed $f29, 520, $x1 :: (store (s64) into %fixed-stack.14)
+;MIR64-DAG:      STFD killed $f30, 528, $x1 :: (store (s64) into %fixed-stack.13, align 16)
+;MIR64-DAG:      STFD killed $f31, 536, $x1 :: (store (s64) into %fixed-stack.12)
+;MIR64-DAG:      STXVD2X killed $v20, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.11)
+;MIR64-DAG:      STXVD2X killed $v21, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.10)
+;MIR64-DAG:      STXVD2X killed $v22, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.9)
+;MIR64-DAG:      STXVD2X killed $v23, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.8)
+;MIR64-DAG:      STXVD2X killed $v24, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.7)
+;MIR64-DAG:      STXVD2X killed $v25, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.6)
+;MIR64-DAG:      STXVD2X killed $v26, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.5)
+;MIR64-DAG:      STXVD2X killed $v27, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.4)
+;MIR64-DAG:      STXVD2X killed $v28, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.3)
+;MIR64-DAG:      STXVD2X killed $v29, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.2)
+;MIR64-DAG:      STXVD2X killed $v30, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.1)
+;MIR64-DAG:      STXVD2X killed $v31, $x1, killed $x{{[0-9]+}} :: (store (s128) into %fixed-stack.0)
 
 ; MIR64:         INLINEASM
 
-; MIR64-DAG:     $x14 = LD 256, $x1 :: (load (s64) from %fixed-stack.8, align 16)
-; MIR64-DAG:     $x25 = LD 344, $x1 :: (load (s64) from %fixed-stack.7)
-; MIR64-DAG:     $x31 = LD 392, $x1 :: (load (s64) from %fixed-stack.6)
-; MIR64-DAG:     $f14 = LFD 400, $x1 :: (load (s64) from %fixed-stack.5, align 16)
-; MIR64-DAG:     $f21 = LFD 456, $x1 :: (load (s64) from %fixed-stack.4)
-; MIR64-DAG:     $f31 = LFD 536, $x1 :: (load (s64) from %fixed-stack.3)
-; MIR64-DAG:     $v20 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.2)
-; MIR64-DAG:     $x{{[0-9]+}} = LI8 64
-; MIR64-DAG:     $v26 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.1)
-; MIR64-DAG:     $x{{[0-9]+}} = LI8 160
 ; MIR64-DAG:     $v31 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.0)
-; MIR64-DAG:     $x{{[0-9]+}} = LI8 240
-; MIR64-DAG:     $x1 = ADDI8 $x1, 544
-; MIR64-DAG:     BLR8 implicit $lr8, implicit $rm
-
-; ASM32-LABEL:   .fprs_gprs_vecregs:
-
-; ASM32:         stwu 1, -448(1)
-; ASM32-DAG:     li {{[0-9]+}}, 32
-; ASM32-DAG:     stw 14, 232(1)                          # 4-byte Folded Spill
-; ASM32-DAG:     stfd 14, 304(1)                         # 8-byte Folded Spill
-; ASM32-DAG:     stxvd2x 52, 1, {{[0-9]+}}               # 16-byte Folded Spill
-; ASM32-DAG:     li {{[0-9]+}}, 128
-; ASM32-DAG:     stw 25, 276(1)                          # 4-byte Folded Spill
-; ASM32-DAG:     stxvd2x 58, 1, {{[0-9]+}}               # 16-byte Folded Spill
-; ASM32-DAG:     li {{[0-9]+}}, 208
-; ASM32-DAG:     stw 31, 300(1)                          # 4-byte Folded Spill
-; ASM32-DAG:     stfd 21, 360(1)                         # 8-byte Folded Spill
-; ASM32-DAG:     stfd 31, 440(1)                         # 8-byte Folded Spill
-; ASM32-DAG:     stxvd2x 63, 1, {{[0-9]+}}               # 16-byte Folded Spill
-; ASM32-DAG:     #APP
-; ASM32-DAG:     #NO_APP
-; ASM32-DAG:     lxvd2x 63, 1, {{[0-9]+}}                # 16-byte Folded Reload
-; ASM32-DAG:     li {{[0-9]+}}, 128
-; ASM32-DAG:     lfd 31, 440(1)                          # 8-byte Folded Reload
-; ASM32-DAG:     lxvd2x 58, 1, {{[0-9]+}}                # 16-byte Folded Reload
-; ASM32-DAG:     li {{[0-9]+}}, 32
-; ASM32-DAG:     lfd 21, 360(1)                          # 8-byte Folded Reload
-; ASM32-DAG:     lxvd2x 52, 1, {{[0-9]+}}                # 16-byte Folded Reload
-; ASM32-DAG:     lfd 14, 304(1)                          # 8-byte Folded Reload
-; ASM32-DAG:     lwz 31, 300(1)                          # 4-byte Folded Reload
-; ASM32-DAG:     lwz 25, 276(1)                          # 4-byte Folded Reload
-; ASM32-DAG:     lwz 14, 232(1)                          # 4-byte Folded Reload
-; ASM32-DAG:     addi 1, 1, 448
-; ASM32:         blr
+; MIR64-DAG:     $v30 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.1)
+; MIR64-DAG:     $v29 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.2)
+; MIR64-DAG:     $v28 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.3)
+; MIR64-DAG:     $v27 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.4)
+; MIR64-DAG:     $v26 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.5)
+; MIR64-DAG:     $v25 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.6)
+; MIR64-DAG:     $v24 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.7)
+; MIR64-DAG:     $v23 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.8)
+; MIR64-DAG:     $v22 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.9)
+; MIR64-DAG:     $v21 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.10)
+; MIR64-DAG:     $v20 = LXVD2X $x1, killed $x{{[0-9]+}} :: (load (s128) from %fixed-stack.11)
+; MIR64-DAG:     $f31 = LFD 536, $x1 :: (load (s64) from %fixed-stack.12)
+; MIR64-DAG:     $f30 = LFD 528, $x1 :: (load (s64) from %fixed-stack.13, align 16)
+; MIR64-DAG:     $f29 = LFD 520, $x1 :: (load (s64) from %fixed-stack.14)
+; MIR64-DAG:     $f28 = LFD 512, $x1 :: (load (s64) from %fixed-stack.15, align 16)
+; MIR64-DAG:     $f27 = LFD 504, $x1 :: (load (s64) from %fixed-stack.16)
+; MIR64-DAG:     $f26 = LFD 496, $x1 :: (load (s64) from %fixed-stack.17, align 16)
+; MIR64-DAG:     $f25 = LFD 488, $x1 :: (load (s64) from %fixed-stack.18)
+; MIR64-DAG:     $f24 = LFD 480, $x1 :: (load (s64) from %fixed-stack.19, align 16)
+; MIR64-DAG:     $f23 = LFD 472, $x1 :: (load (s64) from %fixed-stack.20)
+; MIR64-DAG:     $f22 = LFD 464, $x1 :: (load (s64) from %fixed-stack.21, align 16)
+; MIR64-DAG:     $f21 = LFD 456, $x1 :: (load (s64) from %fixed-stack.22)
+; MIR64-DAG:     $f20 = LFD 448, $x1 :: (load (s64) from %fixed-stack.23, align 16)
+; MIR64-DAG:     $f19 = LFD 440, $x1 :: (load (s64) from %fixed-stack.24)
+; MIR64-DAG:     $f18 = LFD 432, $x1 :: (load (s64) from %fixed-stack.25, align 16)
+; MIR64-DAG:     $f17 = LFD 424, $x1 :: (load (s64) from %fixed-stack.26)
+; MIR64-DAG:     $f16 = LFD 416, $x1 :: (load (s64) from %fixed-stack.27, align 16)
+; MIR64-DAG:     $f15 = LFD 408, $x1 :: (load (s64) from %fixed-stack.28)
+; MIR64-DAG:     $f14 = LFD 400, $x1 :: (load (s64) from %fixed-stack.29, align 16)
+; MIR64-DAG:     $x31 = LD 392, $x1 :: (load (s64) from %fixed-stack.30)
+; MIR64-DAG:     $x30 = LD 384, $x1 :: (load (s64) from %fixed-stack.31, align 16)
+; MIR64-DAG:     $x29 = LD 376, $x1 :: (load (s64) from %fixed-stack.32)
+; MIR64-DAG:     $x28 = LD 368, $x1 :: (load (s64) from %fixed-stack.33, align 16)
+; MIR64-DAG:     $x27 = LD 360, $x1 :: (load (s64) from %fixed-stack.34)
+; MIR64-DAG:     $x26 = LD 352, $x1 :: (load (s64) from %fixed-stack.35, align 16)
+; MIR64-DAG:     $x25 = LD 344, $x1 :: (load (s64) from %fixed-stack.36)
+; MIR64-DAG:     $x24 = LD 336, $x1 :: (load (s64) from %fixed-stack.37, align 16)
+; MIR64-DAG:     $x23 = LD 328, $x1 :: (load (s64) from %fixed-stack.38)
+; MIR64-DAG:     $x22 = LD 320, $x1 :: (load (s64) from %fixed-stack.39, align 16)
+; MIR64-DAG:     $x21 = LD 312, $x1 :: (load (s64) from %fixed-stack.40)
+; MIR64-DAG:     $x20 = LD 304, $x1 :: (load (s64) from %fixed-stack.41, align 16)
+; MIR64-DAG:     $x19 = LD 296, $x1 :: (load (s64) from %fixed-stack.42)
+; MIR64-DAG:     $x18 = LD 288, $x1 :: (load (s64) from %fixed-stack.43, align 16)
+; MIR64-DAG:     $x17 = LD 280, $x1 :: (load (s64) from %fixed-stack.44)
+; MIR64-DAG:     $x16 = LD 272, $x1 :: (load (s64) from %fixed-stack.45, align 16)
+; MIR64-DAG:     $x15 = LD 264, $x1 :: (load (s64) from %fixed-stack.46)
+; MIR64-DAG:     $x14 = LD 256, $x1 :: (load (s64) from %fixed-stack.47, align 16)
+; MIR64:         $x1 = ADDI8 $x1, 544
+; MIR64-NEXT:    BLR8 implicit $lr8, implicit $rm
+
+; ASM32-LABEL:  .fprs_gprs_vecregs:
+
+; ASM32:          stwu 1, -448(1)
+; ASM32-DAG:      li [[FIXEDSTACK11:[0-9]+]], 32
+; ASM32-DAG:      stxvd2x 52, 1, [[FIXEDSTACK11]]                      # 16-byte Folded Spill
+; ASM32-DAG:      li [[FIXEDSTACK10:[0-9]+]], 48
+; ASM32-DAG:      stxvd2x 53, 1, [[FIXEDSTACK10]]                      # 16-byte Folded Spill
+; ASM32-DAG:      li [[FIXEDSTACK9:[0-9]+]], 64
+; ASM32-DAG:      stxvd2x 54, 1, [[FIXEDSTACK9]]                       # 16-byte Folded Spill
+; ASM32-DAG:      li [[FIXEDSTACK8:[0-9]+]], 80
+; ASM32-DAG:      stxvd2x 55, 1, [[FIXEDSTACK8]]                       # 16-byte Folded Spill
+; ASM32-DAG:      li [[FIXEDSTACK7:[0-9]+]], 96
+; ASM32-DAG:      stxvd2x 56, 1, [[FIXEDSTACK7]]                       # 16-byte Folded Spill
+; ASM32-DAG:      li [[FIXEDSTACK6:[0-9]+]], 112
+; ASM32-DAG:      stxvd2x 57, 1, [[FIXEDSTACK6]]                       # 16-byte Folded Spill
+; ASM32-DAG:      li [[FIXEDSTACK5:[0-9]+]], 128
+; ASM32-DAG:      stxvd2x 58, 1, [[FIXEDSTACK5]]                       # 16-byte Folded Spill
+; ASM32-DAG:      li [[FIXEDSTACK4:[0-9]+]], 144
+; ASM32-DAG:      stxvd2x 59, 1, [[FIXEDSTACK4]]                       # 16-byte Folded Spill
+; ASM32-DAG:      li [[FIXEDSTACK3:[0-9]+]], 160
+; ASM32-DAG:      stxvd2x 60, 1, [[FIXEDSTACK3]]                       # 16-byte Folded Spill
+; ASM32-DAG:      li [[FIXEDSTACK2:[0-9]+]], 176
+; ASM32-DAG:      stxvd2x 61, 1, [[FIXEDSTACK2]]                       # 16-byte Folded Spill
+; ASM32-DAG:      li [[FIXEDSTACK1:[0-9]+]], 192
+; ASM32-DAG:      stxvd2x 62, 1, [[FIXEDSTACK1]]                       # 16-byte Folded Spill
+; ASM32-DAG:      li [[FIXEDSTACK0:[0-9]+]], 208
+; ASM32-DAG:      stxvd2x 63, 1, [[FIXEDSTACK0]]                       # 16-byte Folded Spill
+; ASM32-DAG:      stw 14, 232(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 15, 236(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 16, 240(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 17, 244(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 18, 248(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 19, 252(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 20, 256(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 21, 260(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 22, 264(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 23, 268(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 24, 272(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 25, 276(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 26, 280(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 27, 284(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 28, 288(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 29, 292(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 30, 296(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stw 31, 300(1)                          # 4-byte Folded Spill
+; ASM32-DAG:      stfd 14, 304(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 15, 312(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 16, 320(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 17, 328(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 18, 336(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 19, 344(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 20, 352(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 21, 360(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 22, 368(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 23, 376(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 24, 384(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 25, 392(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 26, 400(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 27, 408(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 28, 416(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 29, 424(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 30, 432(1)                         # 8-byte Folded Spill
+; ASM32-DAG:      stfd 31, 440(1)                         # 8-byte Folded Spill
+
+; ASM32:          #APP
+; ASM32-NEXT:     #NO_APP
+
+; ASM32-DAG:      lxvd2x 63, 1, [[FIXEDSTACK0]]                        # 16-byte Folded Reload
+; ASM32-DAG:      li [[FIXEDSTACK1:[0-9]+]], 192
+; ASM32-DAG:      lxvd2x 62, 1, [[FIXEDSTACK1]]                        # 16-byte Folded Reload
+; ASM32-DAG:      li [[FIXEDSTACK2:[0-9]+]], 176
+; ASM32-DAG:      lxvd2x 61, 1, [[FIXEDSTACK2]]                        # 16-byte Folded Reload
+; ASM32-DAG:      li [[FIXEDSTACK3:[0-9]+]], 160
+; ASM32-DAG:      lxvd2x 60, 1, [[FIXEDSTACK3]]                        # 16-byte Folded Reload
+; ASM32-DAG:      li [[FIXEDSTACK4:[0-9]+]], 144
+; ASM32-DAG:      lxvd2x 59, 1, [[FIXEDSTACK4]]                        # 16-byte Folded Reload
+; ASM32-DAG:      li [[FIXEDSTACK5:[0-9]+]], 128
+; ASM32-DAG:      lxvd2x 58, 1, [[FIXEDSTACK5]]                        # 16-byte Folded Reload
+; ASM32-DAG:      li [[FIXEDSTACK6:[0-9]+]], 112
+; ASM32-DAG:      lxvd2x 57, 1, [[FIXEDSTACK6]]                        # 16-byte Folded Reload
+; ASM32-DAG:      li [[FIXEDSTACK7:[0-9]+]], 96
+; ASM32-DAG:      lxvd2x 56, 1, [[FIXEDSTACK7]]                        # 16-byte Folded Reload
+; ASM32-DAG:      li [[FIXEDSTACK8:[0-9]+]], 80
+; ASM32-DAG:      lxvd2x 55, 1, [[FIXEDSTACK8]]                        # 16-byte Folded Reload
+; ASM32-DAG:      li [[FIXEDSTACK9:[0-9]+]], 64
+; ASM32-DAG:      lxvd2x 54, 1, [[FIXEDSTACK9]]                        # 16-byte Folded Reload
+; ASM32-DAG:      li [[FIXEDSTACK10:[0-9]+]], 48
+; ASM32-DAG:      lxvd2x 53, 1, [[FIXEDSTACK10]]                        # 16-byte Folded Reload
+; ASM32-DAG:      li [[FIXEDSTACK11:[0-9]+]], 32
+; ASM32-DAG:      lxvd2x 52, 1, [[FIXEDSTACK11]]                        # 16-byte Folded Reload
+; ASM32-DAG:      lfd 31, 440(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 30, 432(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 29, 424(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 28, 416(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 27, 408(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 26, 400(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 25, 392(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 24, 384(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 23, 376(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 22, 368(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 21, 360(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 20, 352(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 19, 344(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 18, 336(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 17, 328(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 16, 320(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 15, 312(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lfd 14, 304(1)                          # 8-byte Folded Reload
+; ASM32-DAG:      lwz 31, 300(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 30, 296(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 29, 292(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 28, 288(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 27, 284(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 26, 280(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 25, 276(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 24, 272(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 23, 268(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 22, 264(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 21, 260(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 20, 256(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 19, 252(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 18, 248(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 17, 244(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 16, 240(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 15, 236(1)                          # 4-byte Folded Reload
+; ASM32-DAG:      lwz 14, 232(1)                          # 4-byte Folded Reload
+
+; ASM32:          addi 1, 1, 448
+; ASM32-NEXT:     blr
 
 ; ASM64-LABEL:    .fprs_gprs_vecregs:
 
-; ASM64:         stdu 1, -544(1)
-; ASM64-DAG:     li {{[0-9]+}}, 64
-; ASM64-DAG:     std 14, 256(1)                          # 8-byte Folded Spill
-; ASM64-DAG:     stfd 14, 400(1)                         # 8-byte Folded Spill
-; ASM64-DAG:     stxvd2x 52, 1, {{[0-9]+}}               # 16-byte Folded Spill
-; ASM64-DAG:     li {{[0-9]+}}, 160
-; ASM64-DAG:     std 25, 344(1)                          # 8-byte Folded Spill
-; ASM64-DAG:     stxvd2x 58, 1, {{[0-9]+}}               # 16-byte Folded Spill
-; ASM64-DAG:     li {{[0-9]+}}, 240
-; ASM64-DAG:     std 31, 392(1)                          # 8-byte Folded Spill
-; ASM64-DAG:     stfd 21, 456(1)                         # 8-byte Folded Spill
-; ASM64-DAG:     stfd 31, 536(1)                         # 8-byte Folded Spill
-; ASM64-DAG:     stxvd2x 63, 1, {{[0-9]+}}               # 16-byte Folded Spill
-; ASM64-DAG:     #APP
-; ASM64-DAG:     #NO_APP
-; ASM64-DAG:     lxvd2x 63, 1, {{[0-9]+}}                # 16-byte Folded Reload
-; ASM64-DAG:     li {{[0-9]+}}, 160
-; ASM64-DAG:     lfd 31, 536(1)                          # 8-byte Folded Reload
-; ASM64-DAG:     lxvd2x 58, 1, {{[0-9]+}}                # 16-byte Folded Reload
-; ASM64-DAG:     li {{[0-9]+}}, 64
-; ASM64-DAG:     lfd 21, 456(1)                          # 8-byte Folded Reload
-; ASM64-DAG:     lxvd2x 52, 1, {{[0-9]+}}                # 16-byte Folded Reload
-; ASM64-DAG:     lfd 14, 400(1)                          # 8-byte Folded Reload
-; ASM64-DAG:     ld 31, 392(1)                           # 8-byte Folded Reload
-; ASM64-DAG:     ld 25, 344(1)                           # 8-byte Folded Reload
-; ASM64-DAG:     ld 14, 256(1)                           # 8-byte Folded Reload
-; ASM64-DAG:     addi 1, 1, 544
-; ASM64:         blr
+; ASM64:            stdu 1, -544(1)
+; ASM64-DAG:        li [[FIXEDSTACK11:[0-9]+]], 64
+; ASM64-DAG:        stxvd2x 52, 1, [[FIXEDSTACK11]]                       # 16-byte Folded Spill
+; ASM64-DAG:        li [[FIXEDSTACK10:[0-9]+]], 80
+; ASM64-DAG:        stxvd2x 53, 1, [[FIXEDSTACK10]]                       # 16-byte Folded Spill
+; ASM64-DAG:        li [[FIXEDSTACK9:[0-9]+]], 96
+; ASM64-DAG:        stxvd2x 54, 1, [[FIXEDSTACK9]]                        # 16-byte Folded Spill
+; ASM64-DAG:        li [[FIXEDSTACK8:[0-9]+]], 112
+; ASM64-DAG:        stxvd2x 55, 1, [[FIXEDSTACK8]]                        # 16-byte Folded Spill
+; ASM64-DAG:        li [[FIXEDSTACK7:[0-9]+]], 128
+; ASM64-DAG:        stxvd2x 56, 1, [[FIXEDSTACK7]]                        # 16-byte Folded Spill
+; ASM64-DAG:        li [[FIXEDSTACK6:[0-9]+]], 144
+; ASM64-DAG:        stxvd2x 57, 1, [[FIXEDSTACK6]]                        # 16-byte Folded Spill
+; ASM64-DAG:        li [[FIXEDSTACK5:[0-9]+]], 160
+; ASM64-DAG:        stxvd2x 58, 1, [[FIXEDSTACK5]]                        # 16-byte Folded Spill
+; ASM64-DAG:        li [[FIXEDSTACK4:[0-9]+]], 176
+; ASM64-DAG:        stxvd2x 59, 1, [[FIXEDSTACK4]]                        # 16-byte Folded Spill
+; ASM64-DAG:        li [[FIXEDSTACK3:[0-9]+]], 192
+; ASM64-DAG:        stxvd2x 60, 1, [[FIXEDSTACK3]]                        # 16-byte Folded Spill
+; ASM64-DAG:        li [[FIXEDSTACK2:[0-9]+]], 208
+; ASM64-DAG:        stxvd2x 61, 1, [[FIXEDSTACK2]]                        # 16-byte Folded Spill
+; ASM64-DAG:        li [[FIXEDSTACK1:[0-9]+]], 224
+; ASM64-DAG:        stxvd2x 62, 1, [[FIXEDSTACK1]]                        # 16-byte Folded Spill
+; ASM64-DAG:        li [[FIXEDSTACK0:[0-9]+]], 240
+; ASM64-DAG:        stxvd2x 63, 1, [[FIXEDSTACK0]]                        # 16-byte Folded Spill
+; ASM64-DAG:        std 14, 256(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 15, 264(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 16, 272(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 17, 280(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 18, 288(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 19, 296(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 20, 304(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 21, 312(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 22, 320(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 23, 328(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 24, 336(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 25, 344(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 26, 352(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 27, 360(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 28, 368(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 29, 376(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 30, 384(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        std 31, 392(1)                          # 8-byte Folded Spill
+; ASM64-DAG:        stfd 14, 400(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 15, 408(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 16, 416(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 17, 424(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 18, 432(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 19, 440(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 20, 448(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 21, 456(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 22, 464(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 23, 472(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 24, 480(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 25, 488(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 26, 496(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 27, 504(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 28, 512(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 29, 520(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 30, 528(1)                         # 8-byte Folded Spill
+; ASM64-DAG:        stfd 31, 536(1)                         # 8-byte Folded Spill
+
+; ASM64:            #APP
+; ASM64-NEXT:       #NO_APP
+
+; ASM64-DAG:        lxvd2x 63, 1, [[FIXEDSTACK0]]                         # 16-byte Folded Reload
+; ASM64-DAG:        li [[FIXEDSTACK1:[0-9]+]], 224
+; ASM64-DAG:        lxvd2x 62, 1, [[FIXEDSTACK1]]                         # 16-byte Folded Reload
+; ASM64-DAG:        li [[FIXEDSTACK2:[0-9]+]], 208
+; ASM64-DAG:        lxvd2x 61, 1, [[FIXEDSTACK2]]                         # 16-byte Folded Reload
+; ASM64-DAG:        li [[FIXEDSTACK3:[0-9]+]], 192
+; ASM64-DAG:        lxvd2x 60, 1, [[FIXEDSTACK3]]                         # 16-byte Folded Reload
+; ASM64-DAG:        li [[FIXEDSTACK4:[0-9]+]], 176
+; ASM64-DAG:        lxvd2x 59, 1, [[FIXEDSTACK4]]                         # 16-byte Folded Reload
+; ASM64-DAG:        li [[FIXEDSTACK5:[0-9]+]], 160
+; ASM64-DAG:        lxvd2x 58, 1, [[FIXEDSTACK5]]                         # 16-byte Folded Reload
+; ASM64-DAG:        li [[FIXEDSTACK6:[0-9]+]], 144
+; ASM64-DAG:        lxvd2x 57, 1, [[FIXEDSTACK6]]                         # 16-byte Folded Reload
+; ASM64-DAG:        li [[FIXEDSTACK7:[0-9]+]], 128
+; ASM64-DAG:        lxvd2x 56, 1, [[FIXEDSTACK7]]                         # 16-byte Folded Reload
+; ASM64-DAG:        li [[FIXEDSTACK8:[0-9]+]], 112
+; ASM64-DAG:        lxvd2x 55, 1, [[FIXEDSTACK8]]                         # 16-byte Folded Reload
+; ASM64-DAG:        li [[FIXEDSTACK9:[0-9]+]], 96
+; ASM64-DAG:        lxvd2x 54, 1, [[FIXEDSTACK9]]                         # 16-byte Folded Reload
+; ASM64-DAG:        li [[FIXEDSTACK10:[0-9]+]], 80
+; ASM64-DAG:        lxvd2x 53, 1, [[FIXEDSTACK10]]                        # 16-byte Folded Reload
+; ASM64-DAG:        li [[FIXEDSTACK11:[0-9]+]], 64
+; ASM64-DAG:        lxvd2x 52, 1, [[FIXEDSTACK11]]                        # 16-byte Folded Reload
+; ASM64-DAG:        lfd 31, 536(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 30, 528(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 29, 520(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 28, 512(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 27, 504(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 26, 496(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 25, 488(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 24, 480(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 23, 472(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 22, 464(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 21, 456(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 20, 448(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 19, 440(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 18, 432(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 17, 424(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 16, 416(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 15, 408(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        lfd 14, 400(1)                          # 8-byte Folded Reload
+; ASM64-DAG:        ld 31, 392(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 30, 384(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 29, 376(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 28, 368(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 27, 360(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 26, 352(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 25, 344(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 24, 336(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 23, 328(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 22, 320(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 21, 312(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 20, 304(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 19, 296(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 18, 288(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 17, 280(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 16, 272(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 15, 264(1)                           # 8-byte Folded Reload
+; ASM64-DAG:        ld 14, 256(1)                           # 8-byte Folded Reload
+
+; ASM64:            addi 1, 1, 544
+; ASM64-NEXT:       blr
diff --git a/llvm/test/CodeGen/PowerPC/aix-csr-vector.ll b/llvm/test/CodeGen/PowerPC/aix-csr-vector.ll
index 45ec7357656b..9dc06dca3d3b 100644
--- a/llvm/test/CodeGen/PowerPC/aix-csr-vector.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-csr-vector.ll
@@ -63,24 +63,34 @@ define dso_local void @vec_regs() {
 ; ASM64:         blr
 
 define dso_local void @fprs_gprs_vecregs() {
-    call void asm sideeffect "", "~{r14},~{r25},~{r31},~{f14},~{f21},~{f31},~{v20},~{v26},~{v31}"()
+    call void asm sideeffect "", "~{r25},~{r28},~{r31},~{f21},~{f25},~{f31},~{v20},~{v26},~{v31}"()
       ret void
 }
 
 ; MIR32-LABEL:   name:            fprs_gprs_vecregs
 
-; MIR32:         fixedStack:
-
-; MIR32:         liveins: $r14, $r25, $r31, $f14, $f21, $f31
+; MIR32: liveins: $r25, $r26, $r27, $r28, $r29, $r30, $r31, $f21, $f22, $f23, $f24, $f25, $f26, $f27, $f28, $f29, $f30, $f31
 
 ; MIR32-NOT:     STXVD2X killed $v20
 ; MIR32-NOT:     STXVD2X killed $v26
 ; MIR32-NOT:     STXVD2X killed $v31
-; MIR32-DAG:     STW killed $r14, -216, $r1 :: (store (s32) into %fixed-stack.5, align 8)
-; MIR32-DAG:     STW killed $r25, -172, $r1 :: (store (s32) into %fixed-stack.4)
-; MIR32-DAG:     STW killed $r31, -148, $r1 :: (store (s32) into %fixed-stack.3)
-; MIR32-DAG:     STFD killed $f14, -144, $r1 :: (store (s64) into %fixed-stack.2, align 16)
-; MIR32-DAG:     STFD killed $f21, -88, $r1 :: (store (s64) into %fixed-stack.1)
+; MIR32-DAG:     STW killed $r25, -116, $r1 :: (store (s32) into %fixed-stack.17)
+; MIR32-DAG:     STW killed $r26, -112, $r1 :: (store (s32) into %fixed-stack.16, align 8)
+; MIR32-DAG:     STW killed $r27, -108, $r1 :: (store (s32) into %fixed-stack.15)
+; MIR32-DAG:     STW killed $r28, -104, $r1 :: (store (s32) into %fixed-stack.14, align 16)
+; MIR32-DAG:     STW killed $r29, -100, $r1 :: (store (s32) into %fixed-stack.13)
+; MIR32-DAG:     STW killed $r30, -96, $r1 :: (store (s32) into %fixed-stack.12, align 8)
+; MIR32-DAG:     STW killed $r31, -92, $r1 :: (store (s32) into %fixed-stack.11)
+; MIR32-DAG:     STFD killed $f21, -88, $r1 :: (store (s64) into %fixed-stack.10)
+; MIR32-DAG:     STFD killed $f22, -80, $r1 :: (store (s64) into %fixed-stack.9, align 16)
+; MIR32-DAG:     STFD killed $f23, -72, $r1 :: (store (s64) into %fixed-stack.8)
+; MIR32-DAG:     STFD killed $f24, -64, $r1 :: (store (s64) into %fixed-stack.7, align 16)
+; MIR32-DAG:     STFD killed $f25, -56, $r1 :: (store (s64) into %fixed-stack.6)
+; MIR32-DAG:     STFD killed $f26, -48, $r1 :: (store (s64) into %fixed-stack.5, align 16)
+; MIR32-DAG:     STFD killed $f27, -40, $r1 :: (store (s64) into %fixed-stack.4)
+; MIR32-DAG:     STFD killed $f28, -32, $r1 :: (store (s64) into %fixed-stack.3, align 16)
+; MIR32-DAG:     STFD killed $f29, -24, $r1 :: (store (s64) into %fixed-stack.2)
+; MIR32-DAG:     STFD killed $f30, -16, $r1 :: (store (s64) into %fixed-stack.1, align 16)
 ; MIR32-DAG:     STFD killed $f31, -8, $r1 :: (store (s64) into %fixed-stack.0)
 
 ; MIR32-LABEL:   INLINEASM
@@ -88,28 +98,50 @@ define dso_local void @fprs_gprs_vecregs() {
 ; MIR32-NOT:     $v20 = LXVD2X
 ; MIR32-NOT:     $v26 = LXVD2X
 ; MIR32-NOT:     $v31 = LXVD2X
-; MIR32-DAG:     $r14 = LWZ -216, $r1 :: (load (s32) from %fixed-stack.5, align 8)
-; MIR32-DAG:     $r25 = LWZ -172, $r1 :: (load (s32) from %fixed-stack.4)
-; MIR32-DAG:     $r31 = LWZ -148, $r1 :: (load (s32) from %fixed-stack.3)
-; MIR32-DAG:     $f14 = LFD -144, $r1 :: (load (s64) from %fixed-stack.2, align 16)
-; MIR32-DAG:     $f21 = LFD -88, $r1 :: (load (s64) from %fixed-stack.1)
 ; MIR32-DAG:     $f31 = LFD -8, $r1 :: (load (s64) from %fixed-stack.0)
-; MIR32-DAG:     BLR implicit $lr, implicit $rm
+; MIR32-DAG:     $f30 = LFD -16, $r1 :: (load (s64) from %fixed-stack.1, align 16)
+; MIR32-DAG:     $f29 = LFD -24, $r1 :: (load (s64) from %fixed-stack.2)
+; MIR32-DAG:     $f28 = LFD -32, $r1 :: (load (s64) from %fixed-stack.3, align 16)
+; MIR32-DAG:     $f27 = LFD -40, $r1 :: (load (s64) from %fixed-stack.4)
+; MIR32-DAG:     $f26 = LFD -48, $r1 :: (load (s64) from %fixed-stack.5, align 16)
+; MIR32-DAG:     $f25 = LFD -56, $r1 :: (load (s64) from %fixed-stack.6)
+; MIR32-DAG:     $f24 = LFD -64, $r1 :: (load (s64) from %fixed-stack.7, align 16)
+; MIR32-DAG:     $f23 = LFD -72, $r1 :: (load (s64) from %fixed-stack.8)
+; MIR32-DAG:     $f22 = LFD -80, $r1 :: (load (s64) from %fixed-stack.9, align 16)
+; MIR32-DAG:     $f21 = LFD -88, $r1 :: (load (s64) from %fixed-stack.10)
+; MIR32-DAG:     $r31 = LWZ -92, $r1 :: (load (s32) from %fixed-stack.11)
+; MIR32-DAG:     $r30 = LWZ -96, $r1 :: (load (s32) from %fixed-stack.12, align 8)
+; MIR32-DAG:     $r29 = LWZ -100, $r1 :: (load (s32) from %fixed-stack.13)
+; MIR32-DAG:     $r28 = LWZ -104, $r1 :: (load (s32) from %fixed-stack.14, align 16)
+; MIR32-DAG:     $r27 = LWZ -108, $r1 :: (load (s32) from %fixed-stack.15)
+; MIR32-DAG:     $r26 = LWZ -112, $r1 :: (load (s32) from %fixed-stack.16, align 8)
+; MIR32-DAG:     $r25 = LWZ -116, $r1 :: (load (s32) from %fixed-stack.17)
+; MIR32:         BLR implicit $lr, implicit $rm
 
 ; MIR64-LABEL:   name:            fprs_gprs_vecregs
 
-; MIR64:         fixedStack:
-
-; MIR64:         liveins: $x14, $x25, $x31, $f14, $f21, $f31
+; MIR64: liveins: $x25, $x26, $x27, $x28, $x29, $x30, $x31, $f21, $f22, $f23, $f24, $f25, $f26, $f27, $f28, $f29, $f30, $f31
 
 ; MIR64-NOT:     STXVD2X killed $v20
 ; MIR64-NOT:     STXVD2X killed $v26
 ; MIR64-NOT:     STXVD2X killed $v31
-; MIR64-DAG:     STD killed $x14, -288, $x1 :: (store (s64) into %fixed-stack.5, align 16)
-; MIR64-DAG:     STD killed $x25, -200, $x1 :: (store (s64) into %fixed-stack.4)
-; MIR64-DAG:     STD killed $x31, -152, $x1 :: (store (s64) into %fixed-stack.3)
-; MIR64-DAG:     STFD killed $f14, -144, $x1 :: (store (s64) into %fixed-stack.2, align 16)
-; MIR64-DAG:     STFD killed $f21, -88, $x1 :: (store (s64) into %fixed-stack.1)
+; MIR64-DAG:     STD killed $x25, -144, $x1 :: (store (s64) into %fixed-stack.17)
+; MIR64-DAG:     STD killed $x26, -136, $x1 :: (store (s64) into %fixed-stack.16, align 16)
+; MIR64-DAG:     STD killed $x27, -128, $x1 :: (store (s64) into %fixed-stack.15)
+; MIR64-DAG:     STD killed $x28, -120, $x1 :: (store (s64) into %fixed-stack.14, align 16)
+; MIR64-DAG:     STD killed $x29, -112, $x1 :: (store (s64) into %fixed-stack.13)
+; MIR64-DAG:     STD killed $x30, -104, $x1 :: (store (s64) into %fixed-stack.12, align 16)
+; MIR64-DAG:     STD killed $x31, -96, $x1 :: (store (s64) into %fixed-stack.11)
+; MIR64-DAG:     STFD killed $f21, -88, $x1 :: (store (s64) into %fixed-stack.10)
+; MIR64-DAG:     STFD killed $f22, -80, $x1 :: (store (s64) into %fixed-stack.9, align 16)
+; MIR64-DAG:     STFD killed $f23, -72, $x1 :: (store (s64) into %fixed-stack.8)
+; MIR64-DAG:     STFD killed $f24, -64, $x1 :: (store (s64) into %fixed-stack.7, align 16)
+; MIR64-DAG:     STFD killed $f25, -56, $x1 :: (store (s64) into %fixed-stack.6)
+; MIR64-DAG:     STFD killed $f26, -48, $x1 :: (store (s64) into %fixed-stack.5, align 16)
+; MIR64-DAG:     STFD killed $f27, -40, $x1 :: (store (s64) into %fixed-stack.4)
+; MIR64-DAG:     STFD killed $f28, -32, $x1 :: (store (s64) into %fixed-stack.3, align 16)
+; MIR64-DAG:     STFD killed $f29, -24, $x1 :: (store (s64) into %fixed-stack.2)
+; MIR64-DAG:     STFD killed $f30, -16, $x1 :: (store (s64) into %fixed-stack.1, align 16)
 ; MIR64-DAG:     STFD killed $f31, -8, $x1 :: (store (s64) into %fixed-stack.0)
 
 ; MIR64-LABEL:   INLINEASM
@@ -117,12 +149,25 @@ define dso_local void @fprs_gprs_vecregs() {
 ; MIR64-NOT:     $v20 = LXVD2X
 ; MIR64-NOT:     $v26 = LXVD2X
 ; MIR64-NOT:     $v31 = LXVD2X
-; MIR64-DAG:     $x14 = LD -288, $x1 :: (load (s64) from %fixed-stack.5, align 16)
-; MIR64-DAG:     $x25 = LD -200, $x1 :: (load (s64) from %fixed-stack.4)
-; MIR64-DAG:     $x31 = LD -152, $x1 :: (load (s64) from %fixed-stack.3)
-; MIR64-DAG:     $f14 = LFD -144, $x1 :: (load (s64) from %fixed-stack.2, align 16)
-; MIR64-DAG:     $f21 = LFD -88, $x1 :: (load (s64) from %fixed-stack.1)
 ; MIR64-DAG:     $f31 = LFD -8, $x1 :: (load (s64) from %fixed-stack.0)
+; MIR64-DAG:     $f30 = LFD -16, $x1 :: (load (s64) from %fixed-stack.1, align 16)
+; MIR64-DAG:     $f29 = LFD -24, $x1 :: (load (s64) from %fixed-stack.2)
+; MIR64-DAG:     $f28 = LFD -32, $x1 :: (load (s64) from %fixed-stack.3, align 16)
+; MIR64-DAG:     $f27 = LFD -40, $x1 :: (load (s64) from %fixed-stack.4)
+; MIR64-DAG:     $f26 = LFD -48, $x1 :: (load (s64) from %fixed-stack.5, align 16)
+; MIR64-DAG:     $f25 = LFD -56, $x1 :: (load (s64) from %fixed-stack.6)
+; MIR64-DAG:     $f24 = LFD -64, $x1 :: (load (s64) from %fixed-stack.7, align 16)
+; MIR64-DAG:     $f23 = LFD -72, $x1 :: (load (s64) from %fixed-stack.8)
+; MIR64-DAG:     $f22 = LFD -80, $x1 :: (load (s64) from %fixed-stack.9, align 16)
+; MIR64-DAG:     $f21 = LFD -88, $x1 :: (load (s64) from %fixed-stack.10)
+; MIR64-DAG:     $x31 = LD -96, $x1 :: (load (s64) from %fixed-stack.11)
+; MIR64-DAG:     $x30 = LD -104, $x1 :: (load (s64) from %fixed-stack.12, align 16)
+; MIR64-DAG:     $x29 = LD -112, $x1 :: (load (s64) from %fixed-stack.13)
+; MIR64-DAG:     $x28 = LD -120, $x1 :: (load (s64) from %fixed-stack.14, align 16)
+; MIR64-DAG:     $x27 = LD -128, $x1 :: (load (s64) from %fixed-stack.15)
+; MIR64-DAG:     $x26 = LD -136, $x1 :: (load (s64) from %fixed-stack.16, align 16)
+; MIR64-DAG:     $x25 = LD -144, $x1 :: (load (s64) from %fixed-stack.17)
+
 ; MIR64:         BLR8 implicit $lr8, implicit $rm
 
 ;; We don't have -ppc-full-reg-names on AIX so can't reliably check-not for
@@ -130,38 +175,87 @@ define dso_local void @fprs_gprs_vecregs() {
 
 ; ASM32-LABEL:   .fprs_gprs_vecregs:
 
-; ASM32-DAG:     stw 14, -216(1)                         # 4-byte Folded Spill
-; ASM32-DAG:     stw 25, -172(1)                         # 4-byte Folded Spill
-; ASM32-DAG:     stw 31, -148(1)                         # 4-byte Folded Spill
-; ASM32-DAG:     stfd 14, -144(1)                        # 8-byte Folded Spill
-; ASM32-DAG:     stfd 21, -88(1)                         # 8-byte Folded Spill
-; ASM32-DAG:     stfd 31, -8(1)                          # 8-byte Folded Spill
-; ASM32-DAG:     #APP
-; ASM32-DAG:     #NO_APP
-; ASM32-DAG:     lfd 31, -8(1)                           # 8-byte Folded Reload
-; ASM32-DAG:     lfd 21, -88(1)                          # 8-byte Folded Reload
-; ASM32-DAG:     lfd 14, -144(1)                         # 8-byte Folded Reload
-; ASM32-DAG:     lwz 31, -148(1)                         # 4-byte Folded Reload
-; ASM32-DAG:     lwz 25, -172(1)                         # 4-byte Folded Reload
-; ASM32-DAG:     lwz 14, -216(1)                         # 4-byte Folded Reload
+; ASM32-DAG:   stw 25, -116(1)                         # 4-byte Folded Spill
+; ASM32-DAG:   stw 26, -112(1)                         # 4-byte Folded Spill
+; ASM32-DAG:   stw 27, -108(1)                         # 4-byte Folded Spill
+; ASM32-DAG:   stw 28, -104(1)                         # 4-byte Folded Spill
+; ASM32-DAG:   stw 29, -100(1)                         # 4-byte Folded Spill
+; ASM32-DAG:   stw 30, -96(1)                          # 4-byte Folded Spill
+; ASM32-DAG:   stw 31, -92(1)                          # 4-byte Folded Spill
+; ASM32-DAG:   stfd 21, -88(1)                         # 8-byte Folded Spill
+; ASM32-DAG:   stfd 22, -80(1)                         # 8-byte Folded Spill
+; ASM32-DAG:   stfd 23, -72(1)                         # 8-byte Folded Spill
+; ASM32-DAG:   stfd 24, -64(1)                         # 8-byte Folded Spill
+; ASM32-DAG:   stfd 25, -56(1)                         # 8-byte Folded Spill
+; ASM32-DAG:   stfd 26, -48(1)                         # 8-byte Folded Spill
+; ASM32-DAG:   stfd 27, -40(1)                         # 8-byte Folded Spill
+; ASM32-DAG:   stfd 28, -32(1)                         # 8-byte Folded Spill
+; ASM32-DAG:   stfd 29, -24(1)                         # 8-byte Folded Spill
+; ASM32-DAG:   stfd 30, -16(1)                         # 8-byte Folded Spill
+; ASM32-DAG:   stfd 31, -8(1)                          # 8-byte Folded Spill
+; ASM32:       #APP
+; ASM32-NEXT:  #NO_APP
+; ASM32-DAG:   lfd 31, -8(1)                           # 8-byte Folded Reload
+; ASM32-DAG:   lfd 30, -16(1)                          # 8-byte Folded Reload
+; ASM32-DAG:   lfd 29, -24(1)                          # 8-byte Folded Reload
+; ASM32-DAG:   lfd 28, -32(1)                          # 8-byte Folded Reload
+; ASM32-DAG:   lfd 27, -40(1)                          # 8-byte Folded Reload
+; ASM32-DAG:   lfd 26, -48(1)                          # 8-byte Folded Reload
+; ASM32-DAG:   lfd 25, -56(1)                          # 8-byte Folded Reload
+; ASM32-DAG:   lfd 24, -64(1)                          # 8-byte Folded Reload
+; ASM32-DAG:   lfd 23, -72(1)                          # 8-byte Folded Reload
+; ASM32-DAG:   lfd 22, -80(1)                          # 8-byte Folded Reload
+; ASM32-DAG:   lfd 21, -88(1)                          # 8-byte Folded Reload
+; ASM32-DAG:   lwz 31, -92(1)                          # 4-byte Folded Reload
+; ASM32-DAG:   lwz 30, -96(1)                          # 4-byte Folded Reload
+; ASM32-DAG:   lwz 29, -100(1)                         # 4-byte Folded Reload
+; ASM32-DAG:   lwz 28, -104(1)                         # 4-byte Folded Reload
+; ASM32-DAG:   lwz 27, -108(1)                         # 4-byte Folded Reload
+; ASM32-DAG:   lwz 26, -112(1)                         # 4-byte Folded Reload
+; ASM32-DAG:   lwz 25, -116(1)                         # 4-byte Folded Reload
 ; ASM32:         blr
 
 ; ASM64-LABEL:    .fprs_gprs_vecregs:
 
-; ASM64-DAG:     std 14, -288(1)                         # 8-byte Folded Spill
-; ASM64-DAG:     std 25, -200(1)                         # 8-byte Folded Spill
-; ASM64-DAG:     std 31, -152(1)                         # 8-byte Folded Spill
-; ASM64-DAG:     stfd 14, -144(1)                        # 8-byte Folded Spill
+; ASM64-DAG:     std 25, -144(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     std 26, -136(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     std 27, -128(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     std 28, -120(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     std 29, -112(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     std 30, -104(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     std 31, -96(1)                          # 8-byte Folded Spill
 ; ASM64-DAG:     stfd 21, -88(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 22, -80(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 23, -72(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 24, -64(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 25, -56(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 26, -48(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 27, -40(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 28, -32(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 29, -24(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 30, -16(1)                         # 8-byte Folded Spill
 ; ASM64-DAG:     stfd 31, -8(1)                          # 8-byte Folded Spill
-; ASM64-DAG:     #APP
-; ASM64-DAG:     #NO_APP
+; ASM64:         #APP
+; ASM64-NEXT:    #NO_APP
 ; ASM64-DAG:     lfd 31, -8(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     lfd 30, -16(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 29, -24(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 28, -32(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 27, -40(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 26, -48(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 25, -56(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 24, -64(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 23, -72(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 22, -80(1)                          # 8-byte Folded Reload
 ; ASM64-DAG:     lfd 21, -88(1)                          # 8-byte Folded Reload
-; ASM64-DAG:     lfd 14, -144(1)                         # 8-byte Folded Reload
-; ASM64-DAG:     ld 31, -152(1)                          # 8-byte Folded Reload
-; ASM64-DAG:     ld 25, -200(1)                          # 8-byte Folded Reload
-; ASM64-DAG:     ld 14, -288(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     ld 31, -96(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 30, -104(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     ld 29, -112(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     ld 28, -120(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     ld 27, -128(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     ld 26, -136(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     ld 25, -144(1)                          # 8-byte Folded Reload
+
 ; ASM64:         blr
 
 define dso_local void @all_fprs_and_vecregs() {
diff --git a/llvm/test/CodeGen/PowerPC/aix-csr.ll b/llvm/test/CodeGen/PowerPC/aix-csr.ll
index a9a85c8be5a1..1dadacf1faab 100644
--- a/llvm/test/CodeGen/PowerPC/aix-csr.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-csr.ll
@@ -20,77 +20,260 @@ entry:
 
 ; MIR64:       name:            gprs_only
 ; MIR64-LABEL: fixedStack:
-; MIR64-NEXT:   - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default,
-; MIR64-NEXT:       callee-saved-register: '$x30', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:       debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:   - { id: 1, type: spill-slot, offset: -80, size: 8, alignment: 16, stack-id: default,
-; MIR64-NEXT:       callee-saved-register: '$x22', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:       debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:   - { id: 2, type: spill-slot, offset: -128, size: 8, alignment: 16, stack-id: default,
-; MIR64-NEXT:       callee-saved-register: '$x16', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:       debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 0, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x31', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x30', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 2, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x29', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 3, type: spill-slot, offset: -32, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x28', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 4, type: spill-slot, offset: -40, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x27', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 5, type: spill-slot, offset: -48, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x26', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 6, type: spill-slot, offset: -56, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x25', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 7, type: spill-slot, offset: -64, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x24', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 8, type: spill-slot, offset: -72, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x23', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 9, type: spill-slot, offset: -80, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x22', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 10, type: spill-slot, offset: -88, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x21', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 11, type: spill-slot, offset: -96, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x20', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 12, type: spill-slot, offset: -104, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x19', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 13, type: spill-slot, offset: -112, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x18', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 14, type: spill-slot, offset: -120, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x17', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  - { id: 15, type: spill-slot, offset: -128, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:      callee-saved-register: '$x16', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:      debug-info-expression: '', debug-info-location: '' }
 ; MIR64-NEXT:  stack:           []
 
 ; MIR32:       name:            gprs_only
 ; MIR32-LABEL: fixedStack:
-; MIR32:        - { id: 0, type: spill-slot, offset: -8, size: 4, alignment: 8, stack-id: default,
-; MIR32-NEXT:       callee-saved-register: '$r30', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:       debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:   - { id: 1, type: spill-slot, offset: -40, size: 4, alignment: 8, stack-id: default,
-; MIR32-NEXT:       callee-saved-register: '$r22', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:       debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:   - { id: 2, type: spill-slot, offset: -64, size: 4, alignment: 16, stack-id: default,
-; MIR32-NEXT:       callee-saved-register: '$r16', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:       debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r31', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 1, type: spill-slot, offset: -8, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r30', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 2, type: spill-slot, offset: -12, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r29', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 3, type: spill-slot, offset: -16, size: 4, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r28', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 4, type: spill-slot, offset: -20, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r27', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 5, type: spill-slot, offset: -24, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r26', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 6, type: spill-slot, offset: -28, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r25', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 7, type: spill-slot, offset: -32, size: 4, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r24', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 8, type: spill-slot, offset: -36, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r23', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 9, type: spill-slot, offset: -40, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r22', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 10, type: spill-slot, offset: -44, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r21', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 11, type: spill-slot, offset: -48, size: 4, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r20', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 12, type: spill-slot, offset: -52, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r19', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 13, type: spill-slot, offset: -56, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r18', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 14, type: spill-slot, offset: -60, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r17', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 15, type: spill-slot, offset: -64, size: 4, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r16', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
 ; MIR32-NEXT:  stack:           []
 
 
-; MIR64: liveins: $x3, $x16, $x22, $x30
-
-; MIR64-DAG: STD killed $x16, -128, $x1 :: (store (s64) into %fixed-stack.2, align 16)
-; MIR64-DAG: STD killed $x22, -80, $x1 :: (store (s64) into %fixed-stack.1, align 16)
-; MIR64-DAG: STD killed $x30, -16, $x1 :: (store (s64) into %fixed-stack.0, align 16)
+; MIR64: liveins: $x3, $x16, $x17, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $x29, $x30, $x31
+
+; MIR64-DAG:       STD killed $x16, -128, $x1 :: (store (s64) into %fixed-stack.15, align 16)
+; MIR64-DAG:  STD killed $x17, -120, $x1 :: (store (s64) into %fixed-stack.14)
+; MIR64-DAG:  STD killed $x18, -112, $x1 :: (store (s64) into %fixed-stack.13, align 16)
+; MIR64-DAG:  STD killed $x19, -104, $x1 :: (store (s64) into %fixed-stack.12)
+; MIR64-DAG:  STD killed $x20, -96, $x1 :: (store (s64) into %fixed-stack.11, align 16)
+; MIR64-DAG:  STD killed $x21, -88, $x1 :: (store (s64) into %fixed-stack.10)
+; MIR64-DAG:  STD killed $x22, -80, $x1 :: (store (s64) into %fixed-stack.9, align 16)
+; MIR64-DAG:  STD killed $x23, -72, $x1 :: (store (s64) into %fixed-stack.8)
+; MIR64-DAG:  STD killed $x24, -64, $x1 :: (store (s64) into %fixed-stack.7, align 16)
+; MIR64-DAG:  STD killed $x25, -56, $x1 :: (store (s64) into %fixed-stack.6)
+; MIR64-DAG:  STD killed $x26, -48, $x1 :: (store (s64) into %fixed-stack.5, align 16)
+; MIR64-DAG:  STD killed $x27, -40, $x1 :: (store (s64) into %fixed-stack.4)
+; MIR64-DAG:  STD killed $x28, -32, $x1 :: (store (s64) into %fixed-stack.3, align 16)
+; MIR64-DAG:  STD killed $x29, -24, $x1 :: (store (s64) into %fixed-stack.2)
+; MIR64-DAG:  STD killed $x30, -16, $x1 :: (store (s64) into %fixed-stack.1, align 16)
+; MIR64-DAG:  STD killed $x31, -8, $x1 :: (store (s64) into %fixed-stack.0)
 
 ; MIR64:     INLINEASM
 
-; MIR64-DAG: $x30 = LD -16, $x1 :: (load (s64) from %fixed-stack.0, align 16)
-; MIR64-DAG: $x22 = LD -80, $x1 :: (load (s64) from %fixed-stack.1, align 16)
-; MIR64-DAG: $x16 = LD -128, $x1 :: (load (s64) from %fixed-stack.2, align 16)
-; MIR64:     BLR8 implicit $lr8, implicit $rm, implicit $x3
-
-
-; MIR32: liveins: $r3, $r16, $r22, $r30
 
-; MIR32-DAG: STW killed $r16, -64, $r1 :: (store (s32) into %fixed-stack.2, align 16)
-; MIR32-DAG: STW killed $r22, -40, $r1 :: (store (s32) into %fixed-stack.1, align 8)
-; MIR32-DAG: STW killed $r30, -8, $r1 :: (store (s32) into %fixed-stack.0, align 8)
+; MIR64-DAG:    $x31 = LD -8, $x1 :: (load (s64) from %fixed-stack.0)
+; MIR64-DAG:    $x30 = LD -16, $x1 :: (load (s64) from %fixed-stack.1, align 16)
+; MIR64-DAG:    $x29 = LD -24, $x1 :: (load (s64) from %fixed-stack.2)
+; MIR64-DAG:    $x28 = LD -32, $x1 :: (load (s64) from %fixed-stack.3, align 16)
+; MIR64-DAG:    $x27 = LD -40, $x1 :: (load (s64) from %fixed-stack.4)
+; MIR64-DAG:    $x26 = LD -48, $x1 :: (load (s64) from %fixed-stack.5, align 16)
+; MIR64-DAG:    $x25 = LD -56, $x1 :: (load (s64) from %fixed-stack.6)
+; MIR64-DAG:    $x24 = LD -64, $x1 :: (load (s64) from %fixed-stack.7, align 16)
+; MIR64-DAG:    $x23 = LD -72, $x1 :: (load (s64) from %fixed-stack.8)
+; MIR64-DAG:    $x22 = LD -80, $x1 :: (load (s64) from %fixed-stack.9, align 16)
+; MIR64-DAG:    $x21 = LD -88, $x1 :: (load (s64) from %fixed-stack.10)
+; MIR64-DAG:    $x20 = LD -96, $x1 :: (load (s64) from %fixed-stack.11, align 16)
+; MIR64-DAG:    $x19 = LD -104, $x1 :: (load (s64) from %fixed-stack.12)
+; MIR64-DAG:    $x18 = LD -112, $x1 :: (load (s64) from %fixed-stack.13, align 16)
+; MIR64-DAG:    $x17 = LD -120, $x1 :: (load (s64) from %fixed-stack.14)
+; MIR64-DAG:    $x16 = LD -128, $x1 :: (load (s64) from %fixed-stack.15, align 16)
+; MIR64:        BLR8 implicit $lr8, implicit $rm, implicit $x3
+
+
+; MIR32:  liveins: $r3, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31
+
+; MIR32-DAG:  STW killed $r16, -64, $r1 :: (store (s32) into %fixed-stack.15, align 16)
+; MIR32-DAG:  STW killed $r17, -60, $r1 :: (store (s32) into %fixed-stack.14)
+; MIR32-DAG:  STW killed $r18, -56, $r1 :: (store (s32) into %fixed-stack.13, align 8)
+; MIR32-DAG:  STW killed $r19, -52, $r1 :: (store (s32) into %fixed-stack.12)
+; MIR32-DAG:  STW killed $r20, -48, $r1 :: (store (s32) into %fixed-stack.11, align 16)
+; MIR32-DAG:  STW killed $r21, -44, $r1 :: (store (s32) into %fixed-stack.10)
+; MIR32-DAG:  STW killed $r22, -40, $r1 :: (store (s32) into %fixed-stack.9, align 8)
+; MIR32-DAG:  STW killed $r23, -36, $r1 :: (store (s32) into %fixed-stack.8)
+; MIR32-DAG:  STW killed $r24, -32, $r1 :: (store (s32) into %fixed-stack.7, align 16)
+; MIR32-DAG:  STW killed $r25, -28, $r1 :: (store (s32) into %fixed-stack.6)
+; MIR32-DAG:  STW killed $r26, -24, $r1 :: (store (s32) into %fixed-stack.5, align 8)
+; MIR32-DAG:  STW killed $r27, -20, $r1 :: (store (s32) into %fixed-stack.4)
+; MIR32-DAG:  STW killed $r28, -16, $r1 :: (store (s32) into %fixed-stack.3, align 16)
+; MIR32-DAG:  STW killed $r29, -12, $r1 :: (store (s32) into %fixed-stack.2)
+; MIR32-DAG:  STW killed $r30, -8, $r1 :: (store (s32) into %fixed-stack.1, align 8)
+; MIR32-DAG:  STW killed $r31, -4, $r1 :: (store (s32) into %fixed-stack.0)
 
-; MIR32:     INLINEASM
+; MIR32:      INLINEASM
 
-; MIR32-DAG: $r30 = LWZ -8, $r1 :: (load (s32) from %fixed-stack.0, align 8)
-; MIR32-DAG: $r22 = LWZ -40, $r1 :: (load (s32) from %fixed-stack.1, align 8)
-; MIR32-DAG: $r16 = LWZ -64, $r1 :: (load (s32) from %fixed-stack.2, align 16)
-; MIR32:     BLR implicit $lr, implicit $rm, implicit $r3
+; MIR32-DAG:  $r31 = LWZ -4, $r1 :: (load (s32) from %fixed-stack.0)
+; MIR32-DAG:  $r30 = LWZ -8, $r1 :: (load (s32) from %fixed-stack.1, align 8)
+; MIR32-DAG:  $r29 = LWZ -12, $r1 :: (load (s32) from %fixed-stack.2)
+; MIR32-DAG:  $r28 = LWZ -16, $r1 :: (load (s32) from %fixed-stack.3, align 16)
+; MIR32-DAG:  $r27 = LWZ -20, $r1 :: (load (s32) from %fixed-stack.4)
+; MIR32-DAG:  $r26 = LWZ -24, $r1 :: (load (s32) from %fixed-stack.5, align 8)
+; MIR32-DAG:  $r25 = LWZ -28, $r1 :: (load (s32) from %fixed-stack.6)
+; MIR32-DAG:  $r24 = LWZ -32, $r1 :: (load (s32) from %fixed-stack.7, align 16)
+; MIR32-DAG:  $r23 = LWZ -36, $r1 :: (load (s32) from %fixed-stack.8)
+; MIR32-DAG:  $r22 = LWZ -40, $r1 :: (load (s32) from %fixed-stack.9, align 8)
+; MIR32-DAG:  $r21 = LWZ -44, $r1 :: (load (s32) from %fixed-stack.10)
+; MIR32-DAG:  $r20 = LWZ -48, $r1 :: (load (s32) from %fixed-stack.11, align 16)
+; MIR32-DAG:  $r19 = LWZ -52, $r1 :: (load (s32) from %fixed-stack.12)
+; MIR32-DAG:  $r18 = LWZ -56, $r1 :: (load (s32) from %fixed-stack.13, align 8)
+; MIR32-DAG:  $r17 = LWZ -60, $r1 :: (load (s32) from %fixed-stack.14)
+; MIR32-DAG:  $r16 = LWZ -64, $r1 :: (load (s32) from %fixed-stack.15, align 16)
+; MIR32:      BLR implicit $lr, implicit $rm, implicit $r3
 
 
 ; ASM64-LABEL: .gprs_only:
-; ASM64-DAG:      std 16, -128(1)                 # 8-byte Folded Spill
-; ASM64-DAG:      std 22, -80(1)                  # 8-byte Folded Spill
-; ASM64-DAG:      std 30, -16(1)                  # 8-byte Folded Spill
-; ASM64:          #APP
-; ASM64-DAG:      ld 30, -16(1)                   # 8-byte Folded Reload
-; ASM64-DAG:      ld 22, -80(1)                   # 8-byte Folded Reload
-; ASM64-DAG:      ld 16, -128(1)                  # 8-byte Folded Reload
+; ASM64-DAG:     std 16, -128(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     std 17, -120(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     std 18, -112(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     std 19, -104(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     std 20, -96(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 21, -88(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 22, -80(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 23, -72(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 24, -64(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 25, -56(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 26, -48(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 27, -40(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 28, -32(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 29, -24(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 30, -16(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 31, -8(1)                           # 8-byte Folded Spill
+; ASM64:         #APP
+; AMS64-DAG:     ld 31, -8(1)                            # 8-byte Folded Reload
+; ASM64-DAG:     ld 30, -16(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 29, -24(1)                           # 8-byte Folded Reload
+; ASM64-DAG:      ld 28, -32(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 27, -40(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 26, -48(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 25, -56(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 24, -64(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 23, -72(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 22, -80(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 21, -88(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 20, -96(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 19, -104(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     ld 18, -112(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     ld 17, -120(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     ld 16, -128(1)                          # 8-byte Folded Reload
 ; ASM64:          blr
 
 ; ASM32-LABEL: .gprs_only:
-; ASM32-DAG:     stw 16, -64(1)                  # 4-byte Folded Spill
-; ASM32-DAG:     stw 22, -40(1)                  # 4-byte Folded Spill
-; ASM32-DAG:     stw 30, -8(1)                   # 4-byte Folded Spill
+; ASM32-DAG:     stw 16, -64(1)                          # 4-byte Folded Spill
+; ASM32-DAG:     stw 17, -60(1)                          # 4-byte Folded Spill
+; ASM32-DAG:     stw 18, -56(1)                          # 4-byte Folded Spill
+; ASM32-DAG:     stw 19, -52(1)                          # 4-byte Folded Spill
+; ASM32-DAG:     stw 20, -48(1)                          # 4-byte Folded Spill
+; ASM32-DAG:     stw 21, -44(1)                          # 4-byte Folded Spill
+; ASM32-DAG:     stw 22, -40(1)                          # 4-byte Folded Spill
+; ASM32-DAG:     stw 23, -36(1)                          # 4-byte Folded Spill
+; ASM32-DAG:     stw 24, -32(1)                          # 4-byte Folded Spill
+; ASM32-DAG:     stw 25, -28(1)                          # 4-byte Folded Spill
+; ASM32-DAG:     stw 26, -24(1)                          # 4-byte Folded Spill
+; ASM32-DAG:     stw 27, -20(1)                          # 4-byte Folded Spill
+; ASM32-DAG:     stw 28, -16(1)                          # 4-byte Folded Spill
+; ASM32-DAG:     stw 29, -12(1)                          # 4-byte Folded Spill
+; ASM32-DAG:     stw 30, -8(1)                           # 4-byte Folded Spill
+; ASM32-DAG:     stw 31, -4(1)                           # 4-byte Folded Spill
 ; ASM32:         #APP
-; ASM32-DAG:     lwz 30, -8(1)                   # 4-byte Folded Reload
-; ASM32-DAG:     lwz 22, -40(1)                  # 4-byte Folded Reload
-; ASM32-DAG:     lwz 16, -64(1)                  # 4-byte Folded Reload
+; ASM32-DAG:     lwz 31, -4(1)                           # 4-byte Folded Reload
+; ASM32-DAG:     lwz 30, -8(1)                           # 4-byte Folded Reload
+; ASM32-DAG:     lwz 29, -12(1)                          # 4-byte Folded Reload
+; ASM32-DAG:     lwz 28, -16(1)                          # 4-byte Folded Reload
+; ASM32-DAG:     lwz 27, -20(1)                          # 4-byte Folded Reload
+; ASM32-DAG:     lwz 26, -24(1)                          # 4-byte Folded Reload
+; ASM32-DAG:     lwz 25, -28(1)                          # 4-byte Folded Reload
+; ASM32-DAG:     lwz 24, -32(1)                          # 4-byte Folded Reload
+; ASM32-DAG:     lwz 23, -36(1)                          # 4-byte Folded Reload
+; ASM32-DAG:     lwz 22, -40(1)                          # 4-byte Folded Reload
+; ASM32-DAG:     lwz 21, -44(1)                          # 4-byte Folded Reload
+; ASM32-DAG:     lwz 20, -48(1)                          # 4-byte Folded Reload
+; ASM32-DAG:     lwz 19, -52(1)                          # 4-byte Folded Reload
+; ASM32-DAG:     lwz 18, -56(1)                          # 4-byte Folded Reload
+; ASM32-DAG:     lwz 17, -60(1)                          # 4-byte Folded Reload
+; ASM32-DAG:     lwz 16, -64(1)                          # 4-byte Folded Reload
 ; ASM32-DAG:     blr
 
 
@@ -104,112 +287,402 @@ define dso_local double @fprs_and_gprs(i32 signext %i) {
 
 ; MIR64:       name:            fprs_and_gprs
 ; MIR64-LABEL: fixedStack:
-; MIR64-NEXT:    - { id: 0, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$f31', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 1, type: spill-slot, offset: -88, size: 8, alignment: 8, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$f21', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 2, type: spill-slot, offset: -104, size: 8, alignment: 8, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$f19', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 3, type: spill-slot, offset: -144, size: 8, alignment: 16, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$f14', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 4, type: spill-slot, offset: -152, size: 8, alignment: 8, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$x31', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 5, type: spill-slot, offset: -200, size: 8, alignment: 8, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$x25', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR64-NEXT:    - { id: 6, type: spill-slot, offset: -288, size: 8, alignment: 16, stack-id: default,
-; MIR64-NEXT:        callee-saved-register: '$x14', callee-saved-restored: true, debug-info-variable: '',
-; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 0, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f31', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f30', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 2, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f29', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 3, type: spill-slot, offset: -32, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f28', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 4, type: spill-slot, offset: -40, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f27', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 5, type: spill-slot, offset: -48, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f26', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 6, type: spill-slot, offset: -56, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f25', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 7, type: spill-slot, offset: -64, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f24', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 8, type: spill-slot, offset: -72, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f23', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 9, type: spill-slot, offset: -80, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f22', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 10, type: spill-slot, offset: -88, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f21', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 11, type: spill-slot, offset: -96, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f20', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 12, type: spill-slot, offset: -104, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f19', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 13, type: spill-slot, offset: -112, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f18', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 14, type: spill-slot, offset: -120, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f17', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 15, type: spill-slot, offset: -128, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f16', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 16, type: spill-slot, offset: -136, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f15', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 17, type: spill-slot, offset: -144, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$f14', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 18, type: spill-slot, offset: -152, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x31', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 19, type: spill-slot, offset: -160, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x30', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 20, type: spill-slot, offset: -168, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x29', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 21, type: spill-slot, offset: -176, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x28', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 22, type: spill-slot, offset: -184, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x27', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 23, type: spill-slot, offset: -192, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x26', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 24, type: spill-slot, offset: -200, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x25', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 25, type: spill-slot, offset: -208, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x24', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 26, type: spill-slot, offset: -216, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x23', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 27, type: spill-slot, offset: -224, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x22', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 28, type: spill-slot, offset: -232, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x21', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 29, type: spill-slot, offset: -240, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x20', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 30, type: spill-slot, offset: -248, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x19', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 31, type: spill-slot, offset: -256, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x18', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 32, type: spill-slot, offset: -264, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x17', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 33, type: spill-slot, offset: -272, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x16', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 34, type: spill-slot, offset: -280, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x15', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT: - { id: 35, type: spill-slot, offset: -288, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:     callee-saved-register: '$x14', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:     debug-info-expression: '', debug-info-location: '' }
 ; MIR64-NEXT:  stack:           []
 
 ; MIR32:       name:            fprs_and_gprs
 ; MIR32-LABEL: fixedStack:
-; MIR32-NEXT:    - { id: 0, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$f31', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 1, type: spill-slot, offset: -88, size: 8, alignment: 8, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$f21', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 2, type: spill-slot, offset: -104, size: 8, alignment: 8, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$f19', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 3, type: spill-slot, offset: -144, size: 8, alignment: 16, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$f14', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 4, type: spill-slot, offset: -148, size: 4, alignment: 4, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$r31', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 5, type: spill-slot, offset: -172, size: 4, alignment: 4, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$r25', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 6, type: spill-slot, offset: -216, size: 4, alignment: 8, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$r14', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
-; MIR32-NEXT:    - { id: 7, type: spill-slot, offset: -220, size: 4, alignment: 4, stack-id: default,
-; MIR32-NEXT:        callee-saved-register: '$r13', callee-saved-restored: true, debug-info-variable: '',
-; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 0, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f31', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f30', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 2, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f29', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 3, type: spill-slot, offset: -32, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f28', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 4, type: spill-slot, offset: -40, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f27', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 5, type: spill-slot, offset: -48, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f26', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 6, type: spill-slot, offset: -56, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f25', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 7, type: spill-slot, offset: -64, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f24', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 8, type: spill-slot, offset: -72, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f23', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 9, type: spill-slot, offset: -80, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f22', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 10, type: spill-slot, offset: -88, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f21', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 11, type: spill-slot, offset: -96, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f20', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 12, type: spill-slot, offset: -104, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f19', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 13, type: spill-slot, offset: -112, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f18', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 14, type: spill-slot, offset: -120, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f17', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 15, type: spill-slot, offset: -128, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f16', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 16, type: spill-slot, offset: -136, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f15', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 17, type: spill-slot, offset: -144, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$f14', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 18, type: spill-slot, offset: -148, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r31', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 19, type: spill-slot, offset: -152, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r30', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 20, type: spill-slot, offset: -156, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r29', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 21, type: spill-slot, offset: -160, size: 4, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r28', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 22, type: spill-slot, offset: -164, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r27', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 23, type: spill-slot, offset: -168, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r26', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 24, type: spill-slot, offset: -172, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r25', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 25, type: spill-slot, offset: -176, size: 4, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r24', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 26, type: spill-slot, offset: -180, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r23', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 27, type: spill-slot, offset: -184, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r22', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 28, type: spill-slot, offset: -188, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r21', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 29, type: spill-slot, offset: -192, size: 4, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r20', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 30, type: spill-slot, offset: -196, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r19', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 31, type: spill-slot, offset: -200, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r18', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 32, type: spill-slot, offset: -204, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r17', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 33, type: spill-slot, offset: -208, size: 4, alignment: 16, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r16', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 34, type: spill-slot, offset: -212, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r15', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 35, type: spill-slot, offset: -216, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r14', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  - { id: 36, type: spill-slot, offset: -220, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:      callee-saved-register: '$r13', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:      debug-info-expression: '', debug-info-location: '' }
 ; MIR32-NEXT:  stack:           []
 
 
-; MIR64: liveins: $x3, $x14, $x25, $x31, $f14, $f19, $f21, $f31
+; MIR64: liveins: $x3, $x14, $x15, $x16, $x17, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $x29, $x30, $x31, $f14, $f15, $f16, $f17, $f18, $f19, $f20, $f21, $f22, $f23, $f24, $f25, $f26, $f27, $f28, $f29, $f30, $f31
 
 ; MIR64:       $x0 = MFLR8 implicit $lr8
 ; MIR64-NEXT:  $x1 = STDU $x1, -400, $x1
 ; MIR64-NEXT:  STD killed $x0, 416, $x1
-; MIR64-DAG:   STD killed $x14, 112, $x1 :: (store (s64) into %fixed-stack.6, align 16)
-; MIR64-DAG:   STD killed $x25, 200, $x1 :: (store (s64) into %fixed-stack.5)
-; MIR64-DAG:   STD killed $x31, 248, $x1 :: (store (s64) into %fixed-stack.4)
-; MIR64-DAG:   STFD killed $f14, 256, $x1 :: (store (s64) into %fixed-stack.3, align 16)
-; MIR64-DAG:   STFD killed $f19, 296, $x1 :: (store (s64) into %fixed-stack.2)
-; MIR64-DAG:   STFD killed $f21, 312, $x1 :: (store (s64) into %fixed-stack.1)
+; MIR64-DAG:   STD killed $x14, 112, $x1 :: (store (s64) into %fixed-stack.35, align 16)
+; MIR64-DAG:   STD killed $x15, 120, $x1 :: (store (s64) into %fixed-stack.34)
+; MIR64-DAG:   STD killed $x16, 128, $x1 :: (store (s64) into %fixed-stack.33, align 16)
+; MIR64-DAG:   STD killed $x17, 136, $x1 :: (store (s64) into %fixed-stack.32)
+; MIR64-DAG:   STD killed $x18, 144, $x1 :: (store (s64) into %fixed-stack.31, align 16)
+; MIR64-DAG:   STD killed $x19, 152, $x1 :: (store (s64) into %fixed-stack.30)
+; MIR64-DAG:   STD killed $x20, 160, $x1 :: (store (s64) into %fixed-stack.29, align 16)
+; MIR64-DAG:   STD killed $x21, 168, $x1 :: (store (s64) into %fixed-stack.28)
+; MIR64-DAG:   STD killed $x22, 176, $x1 :: (store (s64) into %fixed-stack.27, align 16)
+; MIR64-DAG:   STD killed $x23, 184, $x1 :: (store (s64) into %fixed-stack.26)
+; MIR64-DAG:   STD killed $x24, 192, $x1 :: (store (s64) into %fixed-stack.25, align 16)
+; MIR64-DAG:   STD killed $x25, 200, $x1 :: (store (s64) into %fixed-stack.24)
+; MIR64-DAG:   STD killed $x26, 208, $x1 :: (store (s64) into %fixed-stack.23, align 16)
+; MIR64-DAG:   STD killed $x27, 216, $x1 :: (store (s64) into %fixed-stack.22)
+; MIR64-DAG:   STD killed $x28, 224, $x1 :: (store (s64) into %fixed-stack.21, align 16)
+; MIR64-DAG:   STD killed $x29, 232, $x1 :: (store (s64) into %fixed-stack.20)
+; MIR64-DAG:   STD killed $x30, 240, $x1 :: (store (s64) into %fixed-stack.19, align 16)
+; MIR64-DAG:   STD killed $x31, 248, $x1 :: (store (s64) into %fixed-stack.18)
+; MIR64-DAG:   STFD killed $f14, 256, $x1 :: (store (s64) into %fixed-stack.17, align 16)
+; MIR64-DAG:   STFD killed $f15, 264, $x1 :: (store (s64) into %fixed-stack.16)
+; MIR64-DAG:   STFD killed $f16, 272, $x1 :: (store (s64) into %fixed-stack.15, align 16)
+; MIR64-DAG:   STFD killed $f17, 280, $x1 :: (store (s64) into %fixed-stack.14)
+; MIR64-DAG:   STFD killed $f18, 288, $x1 :: (store (s64) into %fixed-stack.13, align 16)
+; MIR64-DAG:   STFD killed $f19, 296, $x1 :: (store (s64) into %fixed-stack.12)
+; MIR64-DAG:   STFD killed $f20, 304, $x1 :: (store (s64) into %fixed-stack.11, align 16)
+; MIR64-DAG:   STFD killed $f21, 312, $x1 :: (store (s64) into %fixed-stack.10)
+; MIR64-DAG:   STFD killed $f22, 320, $x1 :: (store (s64) into %fixed-stack.9, align 16)
+; MIR64-DAG:   STFD killed $f23, 328, $x1 :: (store (s64) into %fixed-stack.8)
+; MIR64-DAG:   STFD killed $f24, 336, $x1 :: (store (s64) into %fixed-stack.7, align 16)
+; MIR64-DAG:   STFD killed $f25, 344, $x1 :: (store (s64) into %fixed-stack.6)
+; MIR64-DAG:   STFD killed $f26, 352, $x1 :: (store (s64) into %fixed-stack.5, align 16)
+; MIR64-DAG:   STFD killed $f27, 360, $x1 :: (store (s64) into %fixed-stack.4)
+; MIR64-DAG:   STFD killed $f28, 368, $x1 :: (store (s64) into %fixed-stack.3, align 16)
+; MIR64-DAG:   STFD killed $f29, 376, $x1 :: (store (s64) into %fixed-stack.2)
+; MIR64-DAG:   STFD killed $f30, 384, $x1 :: (store (s64) into %fixed-stack.1, align 16)
 ; MIR64-DAG:   STFD killed $f31, 392, $x1 :: (store (s64) into %fixed-stack.0)
 
 ; MIR64:       INLINEASM
 ; MIR64-NEXT:  BL8_NOP
 
 ; MIR64-DAG:   $f31 = LFD 392, $x1 :: (load (s64) from %fixed-stack.0)
-; MIR64-DAG:   $f21 = LFD 312, $x1 :: (load (s64) from %fixed-stack.1)
-; MIR64-DAG:   $f19 = LFD 296, $x1 :: (load (s64) from %fixed-stack.2)
-; MIR64-DAG:   $f14 = LFD 256, $x1 :: (load (s64) from %fixed-stack.3, align 16)
-; MIR64-DAG:   $x31 = LD 248, $x1 :: (load (s64) from %fixed-stack.4)
-; MIR64-DAG:   $x25 = LD 200, $x1 :: (load (s64) from %fixed-stack.5)
-; MIR64-DAG:   $x14 = LD 112, $x1 :: (load (s64) from %fixed-stack.6, align 16)
+; MIR64-DAG:   $f30 = LFD 384, $x1 :: (load (s64) from %fixed-stack.1, align 16)
+; MIR64-DAG:   $f29 = LFD 376, $x1 :: (load (s64) from %fixed-stack.2)
+; MIR64-DAG:   $f28 = LFD 368, $x1 :: (load (s64) from %fixed-stack.3, align 16)
+; MIR64-DAG:   $f27 = LFD 360, $x1 :: (load (s64) from %fixed-stack.4)
+; MIR64-DAG:   $f26 = LFD 352, $x1 :: (load (s64) from %fixed-stack.5, align 16)
+; MIR64-DAG:   $f25 = LFD 344, $x1 :: (load (s64) from %fixed-stack.6)
+; MIR64-DAG:   $f24 = LFD 336, $x1 :: (load (s64) from %fixed-stack.7, align 16)
+; MIR64-DAG:   $f23 = LFD 328, $x1 :: (load (s64) from %fixed-stack.8)
+; MIR64-DAG:   $f22 = LFD 320, $x1 :: (load (s64) from %fixed-stack.9, align 16)
+; MIR64-DAG:   $f21 = LFD 312, $x1 :: (load (s64) from %fixed-stack.10)
+; MIR64-DAG:   $f20 = LFD 304, $x1 :: (load (s64) from %fixed-stack.11, align 16)
+; MIR64-DAG:   $f19 = LFD 296, $x1 :: (load (s64) from %fixed-stack.12)
+; MIR64-DAG:   $f18 = LFD 288, $x1 :: (load (s64) from %fixed-stack.13, align 16)
+; MIR64-DAG:   $f17 = LFD 280, $x1 :: (load (s64) from %fixed-stack.14)
+; MIR64-DAG:   $f16 = LFD 272, $x1 :: (load (s64) from %fixed-stack.15, align 16)
+; MIR64-DAG:   $f15 = LFD 264, $x1 :: (load (s64) from %fixed-stack.16)
+; MIR64-DAG:   $f14 = LFD 256, $x1 :: (load (s64) from %fixed-stack.17, align 16)
+; MIR64-DAG:   $x31 = LD 248, $x1 :: (load (s64) from %fixed-stack.18)
+; MIR64-DAG:   $x30 = LD 240, $x1 :: (load (s64) from %fixed-stack.19, align 16)
+; MIR64-DAG:   $x29 = LD 232, $x1 :: (load (s64) from %fixed-stack.20)
+; MIR64-DAG:   $x28 = LD 224, $x1 :: (load (s64) from %fixed-stack.21, align 16)
+; MIR64-DAG:   $x27 = LD 216, $x1 :: (load (s64) from %fixed-stack.22)
+; MIR64-DAG:   $x26 = LD 208, $x1 :: (load (s64) from %fixed-stack.23, align 16)
+; MIR64-DAG:   $x25 = LD 200, $x1 :: (load (s64) from %fixed-stack.24)
+; MIR64-DAG:   $x24 = LD 192, $x1 :: (load (s64) from %fixed-stack.25, align 16)
+; MIR64-DAG:   $x23 = LD 184, $x1 :: (load (s64) from %fixed-stack.26)
+; MIR64-DAG:   $x22 = LD 176, $x1 :: (load (s64) from %fixed-stack.27, align 16)
+; MIR64-DAG:   $x21 = LD 168, $x1 :: (load (s64) from %fixed-stack.28)
+; MIR64-DAG:   $x20 = LD 160, $x1 :: (load (s64) from %fixed-stack.29, align 16)
+; MIR64-DAG:   $x19 = LD 152, $x1 :: (load (s64) from %fixed-stack.30)
+; MIR64-DAG:   $x18 = LD 144, $x1 :: (load (s64) from %fixed-stack.31, align 16)
+; MIR64-DAG:   $x17 = LD 136, $x1 :: (load (s64) from %fixed-stack.32)
+; MIR64-DAG:   $x16 = LD 128, $x1 :: (load (s64) from %fixed-stack.33, align 16)
+; MIR64-DAG:   $x15 = LD 120, $x1 :: (load (s64) from %fixed-stack.34)
+; MIR64-DAG:   $x14 = LD 112, $x1 :: (load (s64) from %fixed-stack.35, align 16)
+
 ; MIR64:       $x1 = ADDI8 $x1, 400
 ; MIR64-NEXT:  $x0 = LD 16, $x1
 ; MIR64-NEXT:  MTLR8 $x0, implicit-def $lr8
 ; MIR64-NEXT:  BLR8 implicit $lr8, implicit $rm, implicit $f1
 
-
-; MIR32: liveins: $r3, $r13, $r14, $r25, $r31, $f14, $f19, $f21, $f31
+; MIR32: liveins: $r3, $r13, $r14, $r15, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31, $f14, $f15, $f16, $f17, $f18, $f19, $f20, $f21, $f22, $f23, $f24, $f25, $f26, $f27, $f28, $f29, $f30, $f31
 
 ; MIR32:      $r0 = MFLR implicit $lr
 ; MIR32-NEXT: $r1 = STWU $r1, -288, $r1
 ; MIR32-NEXT: STW killed $r0, 296, $r1
-; MIR32-DAG:  STW killed $r13, 68, $r1 :: (store (s32) into %fixed-stack.7)
-; MIR32-DAG:  STW killed $r14, 72, $r1 :: (store (s32) into %fixed-stack.6, align 8)
-; MIR32-DAG:  STW killed $r25, 116, $r1 :: (store (s32) into %fixed-stack.5)
-; MIR32-DAG:  STW killed $r31, 140, $r1 :: (store (s32) into %fixed-stack.4)
-; MIR32-DAG:  STFD killed $f14, 144, $r1 :: (store (s64) into %fixed-stack.3, align 16)
-; MIR32-DAG:  STFD killed $f19, 184, $r1 :: (store (s64) into %fixed-stack.2)
-; MIR32-DAG:  STFD killed $f21, 200, $r1 :: (store (s64) into %fixed-stack.1)
+; MIR32-DAG:  STW killed $r13, 68, $r1 :: (store (s32) into %fixed-stack.36)
+; MIR32-DAG:  STW killed $r14, 72, $r1 :: (store (s32) into %fixed-stack.35, align 8)
+; MIR32-DAG:  STW killed $r15, 76, $r1 :: (store (s32) into %fixed-stack.34)
+; MIR32-DAG:  STW killed $r16, 80, $r1 :: (store (s32) into %fixed-stack.33, align 16)
+; MIR32-DAG:  STW killed $r17, 84, $r1 :: (store (s32) into %fixed-stack.32)
+; MIR32-DAG:  STW killed $r18, 88, $r1 :: (store (s32) into %fixed-stack.31, align 8)
+; MIR32-DAG:  STW killed $r19, 92, $r1 :: (store (s32) into %fixed-stack.30)
+; MIR32-DAG:  STW killed $r20, 96, $r1 :: (store (s32) into %fixed-stack.29, align 16)
+; MIR32-DAG:  STW killed $r21, 100, $r1 :: (store (s32) into %fixed-stack.28)
+; MIR32-DAG:  STW killed $r22, 104, $r1 :: (store (s32) into %fixed-stack.27, align 8)
+; MIR32-DAG:  STW killed $r23, 108, $r1 :: (store (s32) into %fixed-stack.26)
+; MIR32-DAG:  STW killed $r24, 112, $r1 :: (store (s32) into %fixed-stack.25, align 16)
+; MIR32-DAG:  STW killed $r25, 116, $r1 :: (store (s32) into %fixed-stack.24)
+; MIR32-DAG:  STW killed $r26, 120, $r1 :: (store (s32) into %fixed-stack.23, align 8)
+; MIR32-DAG:  STW killed $r27, 124, $r1 :: (store (s32) into %fixed-stack.22)
+; MIR32-DAG:  STW killed $r28, 128, $r1 :: (store (s32) into %fixed-stack.21, align 16)
+; MIR32-DAG:  STW killed $r29, 132, $r1 :: (store (s32) into %fixed-stack.20)
+; MIR32-DAG:  STW killed $r30, 136, $r1 :: (store (s32) into %fixed-stack.19, align 8)
+; MIR32-DAG:  STW killed $r31, 140, $r1 :: (store (s32) into %fixed-stack.18)
+; MIR32-DAG:  STFD killed $f14, 144, $r1 :: (store (s64) into %fixed-stack.17, align 16)
+; MIR32-DAG:  STFD killed $f15, 152, $r1 :: (store (s64) into %fixed-stack.16)
+; MIR32-DAG:  STFD killed $f16, 160, $r1 :: (store (s64) into %fixed-stack.15, align 16)
+; MIR32-DAG:  STFD killed $f17, 168, $r1 :: (store (s64) into %fixed-stack.14)
+; MIR32-DAG:  STFD killed $f18, 176, $r1 :: (store (s64) into %fixed-stack.13, align 16)
+; MIR32-DAG:  STFD killed $f19, 184, $r1 :: (store (s64) into %fixed-stack.12)
+; MIR32-DAG:  STFD killed $f20, 192, $r1 :: (store (s64) into %fixed-stack.11, align 16)
+; MIR32-DAG:  STFD killed $f21, 200, $r1 :: (store (s64) into %fixed-stack.10)
+; MIR32-DAG:  STFD killed $f22, 208, $r1 :: (store (s64) into %fixed-stack.9, align 16)
+; MIR32-DAG:  STFD killed $f23, 216, $r1 :: (store (s64) into %fixed-stack.8)
+; MIR32-DAG:  STFD killed $f24, 224, $r1 :: (store (s64) into %fixed-stack.7, align 16)
+; MIR32-DAG:  STFD killed $f25, 232, $r1 :: (store (s64) into %fixed-stack.6)
+; MIR32-DAG:  STFD killed $f26, 240, $r1 :: (store (s64) into %fixed-stack.5, align 16)
+; MIR32-DAG:  STFD killed $f27, 248, $r1 :: (store (s64) into %fixed-stack.4)
+; MIR32-DAG:  STFD killed $f28, 256, $r1 :: (store (s64) into %fixed-stack.3, align 16)
+; MIR32-DAG:  STFD killed $f29, 264, $r1 :: (store (s64) into %fixed-stack.2)
+; MIR32-DAG:  STFD killed $f30, 272, $r1 :: (store (s64) into %fixed-stack.1, align 16)
 ; MIR32-DAG:  STFD killed $f31, 280, $r1 :: (store (s64) into %fixed-stack.0)
 
 ; MIR32:      INLINEASM
 ; MIR32:      BL_NOP
 
 ; MIR32-DAG:  $f31 = LFD 280, $r1 :: (load (s64) from %fixed-stack.0)
-; MIR32-DAG:  $f21 = LFD 200, $r1 :: (load (s64) from %fixed-stack.1)
-; MIR32-DAG:  $f19 = LFD 184, $r1 :: (load (s64) from %fixed-stack.2)
-; MIR32-DAG:  $f14 = LFD 144, $r1 :: (load (s64) from %fixed-stack.3, align 16)
-; MIR32-DAG:  $r31 = LWZ 140, $r1 :: (load (s32) from %fixed-stack.4)
-; MIR32-DAG:  $r25 = LWZ 116, $r1 :: (load (s32) from %fixed-stack.5)
-; MIR32-DAG:  $r14 = LWZ 72, $r1 :: (load (s32) from %fixed-stack.6, align 8)
-; MIR32-DAG:  $r13 = LWZ 68, $r1 :: (load (s32) from %fixed-stack.7)
+; MIR32-DAG:  $f30 = LFD 272, $r1 :: (load (s64) from %fixed-stack.1, align 16)
+; MIR32-DAG:  $f29 = LFD 264, $r1 :: (load (s64) from %fixed-stack.2)
+; MIR32-DAG:  $f28 = LFD 256, $r1 :: (load (s64) from %fixed-stack.3, align 16)
+; MIR32-DAG:  $f27 = LFD 248, $r1 :: (load (s64) from %fixed-stack.4)
+; MIR32-DAG:  $f26 = LFD 240, $r1 :: (load (s64) from %fixed-stack.5, align 16)
+; MIR32-DAG:  $f25 = LFD 232, $r1 :: (load (s64) from %fixed-stack.6)
+; MIR32-DAG:  $f24 = LFD 224, $r1 :: (load (s64) from %fixed-stack.7, align 16)
+; MIR32-DAG:  $f23 = LFD 216, $r1 :: (load (s64) from %fixed-stack.8)
+; MIR32-DAG:  $f22 = LFD 208, $r1 :: (load (s64) from %fixed-stack.9, align 16)
+; MIR32-DAG:  $f21 = LFD 200, $r1 :: (load (s64) from %fixed-stack.10)
+; MIR32-DAG:  $f20 = LFD 192, $r1 :: (load (s64) from %fixed-stack.11, align 16)
+; MIR32-DAG:  $f19 = LFD 184, $r1 :: (load (s64) from %fixed-stack.12)
+; MIR32-DAG:  $f18 = LFD 176, $r1 :: (load (s64) from %fixed-stack.13, align 16)
+; MIR32-DAG:  $f17 = LFD 168, $r1 :: (load (s64) from %fixed-stack.14)
+; MIR32-DAG:  $f16 = LFD 160, $r1 :: (load (s64) from %fixed-stack.15, align 16)
+; MIR32-DAG:  $f15 = LFD 152, $r1 :: (load (s64) from %fixed-stack.16)
+; MIR32-DAG:  $f14 = LFD 144, $r1 :: (load (s64) from %fixed-stack.17, align 16)
+; MIR32-DAG:  $r31 = LWZ 140, $r1 :: (load (s32) from %fixed-stack.18)
+; MIR32-DAG:  $r30 = LWZ 136, $r1 :: (load (s32) from %fixed-stack.19, align 8)
+; MIR32-DAG:  $r29 = LWZ 132, $r1 :: (load (s32) from %fixed-stack.20)
+; MIR32-DAG:  $r28 = LWZ 128, $r1 :: (load (s32) from %fixed-stack.21, align 16)
+; MIR32-DAG:  $r27 = LWZ 124, $r1 :: (load (s32) from %fixed-stack.22)
+; MIR32-DAG:  $r26 = LWZ 120, $r1 :: (load (s32) from %fixed-stack.23, align 8)
+; MIR32-DAG:  $r25 = LWZ 116, $r1 :: (load (s32) from %fixed-stack.24)
+; MIR32-DAG:  $r24 = LWZ 112, $r1 :: (load (s32) from %fixed-stack.25, align 16)
+; MIR32-DAG:  $r23 = LWZ 108, $r1 :: (load (s32) from %fixed-stack.26)
+; MIR32-DAG:  $r22 = LWZ 104, $r1 :: (load (s32) from %fixed-stack.27, align 8)
+; MIR32-DAG:  $r21 = LWZ 100, $r1 :: (load (s32) from %fixed-stack.28)
+; MIR32-DAG:  $r20 = LWZ 96, $r1 :: (load (s32) from %fixed-stack.29, align 16)
+; MIR32-DAG:  $r19 = LWZ 92, $r1 :: (load (s32) from %fixed-stack.30)
+; MIR32-DAG:  $r18 = LWZ 88, $r1 :: (load (s32) from %fixed-stack.31, align 8)
+; MIR32-DAG:  $r17 = LWZ 84, $r1 :: (load (s32) from %fixed-stack.32)
+; MIR32-DAG:  $r16 = LWZ 80, $r1 :: (load (s32) from %fixed-stack.33, align 16)
+; MIR32-DAG:  $r15 = LWZ 76, $r1 :: (load (s32) from %fixed-stack.34)
+; MIR32-DAG:  $r14 = LWZ 72, $r1 :: (load (s32) from %fixed-stack.35, align 8)
+; MIR32-DAG:  $r13 = LWZ 68, $r1 :: (load (s32) from %fixed-stack.36)
 ; MIR32:      $r1 = ADDI $r1, 288
 ; MIR32-NEXT: $r0 = LWZ 8, $r1
 ; MIR32-NEXT: MTLR $r0, implicit-def $lr
@@ -219,23 +692,81 @@ define dso_local double @fprs_and_gprs(i32 signext %i) {
 ; ASM64:         mflr 0
 ; ASM64-NEXT:    stdu 1, -400(1)
 ; ASM64-NEXT:    std 0, 416(1)
-; ASM64-DAG:     std 14, 112(1)                  # 8-byte Folded Spill
-; ASM64-DAG:     std 25, 200(1)                  # 8-byte Folded Spill
-; ASM64-DAG:     std 31, 248(1)                  # 8-byte Folded Spill
-; ASM64-DAG:     stfd 14, 256(1)                 # 8-byte Folded Spill
-; ASM64-DAG:     stfd 19, 296(1)                 # 8-byte Folded Spill
-; ASM64-DAG:     stfd 21, 312(1)                 # 8-byte Folded Spill
-; ASM64-DAG:     stfd 31, 392(1)                 # 8-byte Folded Spill
+; ASM64-DAG:     std 14, 112(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 15, 120(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 16, 128(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 17, 136(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 18, 144(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 19, 152(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 20, 160(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 21, 168(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 22, 176(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 23, 184(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 24, 192(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 25, 200(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 26, 208(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 27, 216(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 28, 224(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 29, 232(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 30, 240(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     std 31, 248(1)                          # 8-byte Folded Spill
+; ASM64-DAG:     stfd 14, 256(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 15, 264(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 16, 272(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 17, 280(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 18, 288(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 19, 296(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 20, 304(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 21, 312(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 22, 320(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 23, 328(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 24, 336(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 25, 344(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 26, 352(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 27, 360(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 28, 368(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 29, 376(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 30, 384(1)                         # 8-byte Folded Spill
+; ASM64-DAG:     stfd 31, 392(1)                         # 8-byte Folded Spill
 
 ; ASM64:         bl .dummy
+; ASM64-DAG:     lfd 31, 392(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 30, 384(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 29, 376(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 28, 368(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 27, 360(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 26, 352(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 25, 344(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 24, 336(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 23, 328(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 22, 320(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 21, 312(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 20, 304(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 19, 296(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 18, 288(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 17, 280(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 16, 272(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 15, 264(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     lfd 14, 256(1)                          # 8-byte Folded Reload
+; ASM64-DAG:     ld 31, 248(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 30, 240(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 29, 232(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 28, 224(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 27, 216(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 26, 208(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 25, 200(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 24, 192(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 23, 184(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 22, 176(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 21, 168(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 20, 160(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 19, 152(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 18, 144(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 17, 136(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 16, 128(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 15, 120(1)                           # 8-byte Folded Reload
+; ASM64-DAG:     ld 14, 112(1)                           # 8-byte Folded Reload
 
-; ASM64-DAG:     lfd 31, 392(1)                  # 8-byte Folded Reload
-; ASM64-DAG:     lfd 21, 312(1)                  # 8-byte Folded Reload
-; ASM64-DAG:     lfd 19, 296(1)                  # 8-byte Folded Reload
-; ASM64-DAG:     lfd 14, 256(1)                  # 8-byte Folded Reload
-; ASM64-DAG:     ld 31, 248(1)                   # 8-byte Folded Reload
-; ASM64-DAG:     ld 25, 200(1)                   # 8-byte Folded Reload
-; ASM64-DAG:     ld 14, 112(1)                   # 8-byte Folded Reload
 ; ASM64:         addi 1, 1, 400
 ; ASM64-NEXT:    ld 0, 16(1)
 ; ASM64-NEXT:    mtlr 0
diff --git a/llvm/test/CodeGen/PowerPC/aix-shared-lib-tls-model-opt-small-local-dynamic-tls.ll b/llvm/test/CodeGen/PowerPC/aix-shared-lib-tls-model-opt-small-local-dynamic-tls.ll
new file mode 100644
index 000000000000..cfb652ceeb8a
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-shared-lib-tls-model-opt-small-local-dynamic-tls.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec -mtriple powerpc64-ibm-aix-xcoff \
+; RUN:      -mattr=+aix-shared-lib-tls-model-opt --code-model=large < %s | FileCheck %s --check-prefixes=OPT
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec -mtriple powerpc64-ibm-aix-xcoff \
+; RUN:      -mattr=+aix-small-local-dynamic-tls --code-model=large < %s | FileCheck %s --check-prefixes=SMALL
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec -mtriple powerpc64-ibm-aix-xcoff \
+; RUN:      -mattr=+aix-shared-lib-tls-model-opt -mattr=+aix-small-local-dynamic-tls \
+; RUN:      --code-model=large < %s | FileCheck %s --check-prefixes=BOTH
+
+@VarTLSLD1 = internal thread_local(localdynamic) global i32 42, align 4
+
+define i32 @Single_LD(i32 %P, i32 %Q) {
+; OPT-LABEL: Single_LD:
+; OPT:       # %bb.0: # %entry
+; OPT-NEXT:    and 4, 3, 4
+; OPT-NEXT:    addis 3, L..C0@u(2)
+; OPT-NEXT:    ld 3, L..C0@l(3)
+; OPT-NEXT:    cmpwi 4, -1
+; OPT-NEXT:    lwzx 3, 13, 3
+; OPT-NEXT:    blr
+;
+; SMALL-LABEL: Single_LD:
+; SMALL:       # %bb.0: # %entry
+; SMALL-NEXT:    mflr 0
+; SMALL-NEXT:    stdu 1, -48(1)
+; SMALL-NEXT:    and 6, 3, 4
+; SMALL-NEXT:    addis 3, L..C0@u(2)
+; SMALL-NEXT:    std 0, 64(1)
+; SMALL-NEXT:    ld 3, L..C0@l(3)
+; SMALL-NEXT:    bla .__tls_get_mod[PR]
+; SMALL-NEXT:    cmpwi 6, -1
+; SMALL-NEXT:    lwz 3, VarTLSLD1[TL]@ld(3)
+; SMALL-NEXT:    addi 1, 1, 48
+; SMALL-NEXT:    ld 0, 16(1)
+; SMALL-NEXT:    mtlr 0
+; SMALL-NEXT:    blr
+;
+; BOTH-LABEL: Single_LD:
+; BOTH:       # %bb.0: # %entry
+; BOTH-NEXT:    and 4, 3, 4
+; BOTH-NEXT:    addis 3, L..C0@u(2)
+; BOTH-NEXT:    ld 3, L..C0@l(3)
+; BOTH-NEXT:    cmpwi 4, -1
+; BOTH-NEXT:    lwzx 3, 13, 3
+; BOTH-NEXT:    blr
+entry:
+  %a = icmp slt i32 %P, 0
+  %b = icmp slt i32 %Q, 0
+  %c = and i1 %a, %b
+  %tls1 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @VarTLSLD1)
+  %load1 = load i32, ptr %tls1, align 4
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  %tls2 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @VarTLSLD1)
+  %load2 = load i32, ptr %tls2, align 4
+  ret i32 %load2
+
+return:
+  ret i32 %load1
+}
+
+; OPT-LABEL: .toc
+; OPT-LABEL: L..C0:
+; OPT-NEXT: .tc VarTLSLD1[TE],VarTLSLD1[TL]@ie
+
+; SMALL-LABEL: .toc
+; SMALL-LABEL: L..C0:
+; SMALL-NEXT: .tc _Renamed..5f24__TLSML[TC],_Renamed..5f24__TLSML[TC]@ml
+; SMALL-NEXT: .rename _Renamed..5f24__TLSML[TC],"_$TLSML"
+
+; BOTH-LABEL: .toc
+; BOTH-LABEL: L..C0:
+; BOTH-NEXT: .tc VarTLSLD1[TE],VarTLSLD1[TL]@ie
diff --git a/llvm/test/CodeGen/PowerPC/aix-shared-lib-tls-model-opt.ll b/llvm/test/CodeGen/PowerPC/aix-shared-lib-tls-model-opt.ll
new file mode 100644
index 000000000000..140377270d6d
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-shared-lib-tls-model-opt.ll
@@ -0,0 +1,627 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec -mtriple powerpc64-ibm-aix-xcoff \
+; RUN:      --code-model=small < %s | FileCheck %s --check-prefixes=DEFAULT_SMALL64
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec -mtriple powerpc64-ibm-aix-xcoff \
+; RUN:      --code-model=large < %s | FileCheck %s --check-prefixes=DEFAULT_LARGE64
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec -mtriple powerpc64-ibm-aix-xcoff \
+; RUN:      -mattr=+aix-shared-lib-tls-model-opt --code-model=small < %s | FileCheck %s --check-prefixes=TLS_MODEL_OPT_SMALL64
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec -mtriple powerpc64-ibm-aix-xcoff \
+; RUN:      -mattr=+aix-shared-lib-tls-model-opt --code-model=large < %s | FileCheck %s --check-prefixes=TLS_MODEL_OPT_LARGE64
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec -mtriple powerpc64-ibm-aix-xcoff \
+; RUN:      -mattr=+aix-shared-lib-tls-model-opt -ppc-aix-shared-lib-tls-model-opt-limit=2 \
+; RUN:      --code-model=small < %s | FileCheck %s --check-prefixes=TLS_MODEL_OPT_LIMIT2_SMALL64
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec -mtriple powerpc64-ibm-aix-xcoff \
+; RUN:      -mattr=+aix-shared-lib-tls-model-opt -ppc-aix-shared-lib-tls-model-opt-limit=2 \
+; RUN:      --code-model=large < %s | FileCheck %s --check-prefixes=TLS_MODEL_OPT_LIMIT2_LARGE64
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec -mtriple powerpc64-ibm-aix-xcoff \
+; RUN:      -mattr=+aix-shared-lib-tls-model-opt -ppc-aix-shared-lib-tls-model-opt-limit=3 \
+; RUN:      --code-model=small < %s | FileCheck %s --check-prefixes=TLS_MODEL_OPT_LIMIT3_SMALL64
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-altivec -mtriple powerpc64-ibm-aix-xcoff \
+; RUN:      -mattr=+aix-shared-lib-tls-model-opt -ppc-aix-shared-lib-tls-model-opt-limit=3 \
+; RUN:      --code-model=large < %s | FileCheck %s --check-prefixes=TLS_MODEL_OPT_LIMIT3_LARGE64
+
+@VarTLSLD1 = internal thread_local(localdynamic) global i32 42, align 4
+@VarTLSLD2 = internal thread_local(localdynamic) global i32 0, align 4
+@VarTLSLD3 = internal thread_local(localdynamic) global i32 0, align 4
+
+; Tune function level TLS model settings:
+; Use initial-exec when we have a function accessing only one TLS variable.
+; Use local-dynamic when we have a function accessing a handful or more different TLS variables.
+
+define i32 @Single_LD(i32 %P, i32 %Q) {
+; DEFAULT_SMALL64-LABEL: Single_LD:
+; DEFAULT_SMALL64:       # %bb.0: # %entry
+; DEFAULT_SMALL64-NEXT:    mflr 0
+; DEFAULT_SMALL64-NEXT:    stdu 1, -48(1)
+; DEFAULT_SMALL64-NEXT:    and 6, 3, 4
+; DEFAULT_SMALL64-NEXT:    ld 3, L..C0(2) # target-flags(ppc-tlsldm) @"_$TLSML"
+; DEFAULT_SMALL64-NEXT:    std 0, 64(1)
+; DEFAULT_SMALL64-NEXT:    bla .__tls_get_mod[PR]
+; DEFAULT_SMALL64-NEXT:    ld 4, L..C1(2) # target-flags(ppc-tlsld) @VarTLSLD1
+; DEFAULT_SMALL64-NEXT:    cmpwi 6, -1
+; DEFAULT_SMALL64-NEXT:    lwzx 3, 3, 4
+; DEFAULT_SMALL64-NEXT:    addi 1, 1, 48
+; DEFAULT_SMALL64-NEXT:    ld 0, 16(1)
+; DEFAULT_SMALL64-NEXT:    mtlr 0
+; DEFAULT_SMALL64-NEXT:    blr
+;
+; DEFAULT_LARGE64-LABEL: Single_LD:
+; DEFAULT_LARGE64:       # %bb.0: # %entry
+; DEFAULT_LARGE64-NEXT:    mflr 0
+; DEFAULT_LARGE64-NEXT:    stdu 1, -48(1)
+; DEFAULT_LARGE64-NEXT:    and 6, 3, 4
+; DEFAULT_LARGE64-NEXT:    addis 3, L..C0@u(2)
+; DEFAULT_LARGE64-NEXT:    addis 7, L..C1@u(2)
+; DEFAULT_LARGE64-NEXT:    ld 3, L..C0@l(3)
+; DEFAULT_LARGE64-NEXT:    std 0, 64(1)
+; DEFAULT_LARGE64-NEXT:    bla .__tls_get_mod[PR]
+; DEFAULT_LARGE64-NEXT:    ld 4, L..C1@l(7)
+; DEFAULT_LARGE64-NEXT:    lwzx 3, 3, 4
+; DEFAULT_LARGE64-NEXT:    cmpwi 6, -1
+; DEFAULT_LARGE64-NEXT:    addi 1, 1, 48
+; DEFAULT_LARGE64-NEXT:    ld 0, 16(1)
+; DEFAULT_LARGE64-NEXT:    mtlr 0
+; DEFAULT_LARGE64-NEXT:    blr
+;
+; TLS_MODEL_OPT_SMALL64-LABEL: Single_LD:
+; TLS_MODEL_OPT_SMALL64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_SMALL64-NEXT:    and 4, 3, 4
+; TLS_MODEL_OPT_SMALL64-NEXT:    ld 3, L..C0(2) # target-flags(ppc-tprel) @VarTLSLD1
+; TLS_MODEL_OPT_SMALL64-NEXT:    cmpwi 4, -1
+; TLS_MODEL_OPT_SMALL64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_SMALL64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LARGE64-LABEL: Single_LD:
+; TLS_MODEL_OPT_LARGE64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LARGE64-NEXT:    and 4, 3, 4
+; TLS_MODEL_OPT_LARGE64-NEXT:    addis 3, L..C0@u(2)
+; TLS_MODEL_OPT_LARGE64-NEXT:    ld 3, L..C0@l(3)
+; TLS_MODEL_OPT_LARGE64-NEXT:    cmpwi 4, -1
+; TLS_MODEL_OPT_LARGE64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LARGE64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LIMIT2_SMALL64-LABEL: Single_LD:
+; TLS_MODEL_OPT_LIMIT2_SMALL64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    and 4, 3, 4
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    ld 3, L..C0(2) # target-flags(ppc-tprel) @VarTLSLD1
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    cmpwi 4, -1
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LIMIT2_LARGE64-LABEL: Single_LD:
+; TLS_MODEL_OPT_LIMIT2_LARGE64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    and 4, 3, 4
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    addis 3, L..C0@u(2)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    ld 3, L..C0@l(3)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    cmpwi 4, -1
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LIMIT3_SMALL64-LABEL: Single_LD:
+; TLS_MODEL_OPT_LIMIT3_SMALL64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    and 4, 3, 4
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    ld 3, L..C0(2) # target-flags(ppc-tprel) @VarTLSLD1
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    cmpwi 4, -1
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LIMIT3_LARGE64-LABEL: Single_LD:
+; TLS_MODEL_OPT_LIMIT3_LARGE64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    and 4, 3, 4
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    addis 3, L..C0@u(2)
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    ld 3, L..C0@l(3)
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    cmpwi 4, -1
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    blr
+entry:
+  %a = icmp slt i32 %P, 0
+  %b = icmp slt i32 %Q, 0
+  %c = and i1 %a, %b
+  %tls1 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @VarTLSLD1)
+  %load1 = load i32, ptr %tls1, align 4
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  %tls2 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @VarTLSLD1)
+  %load2 = load i32, ptr %tls2, align 4
+  ret i32 %load2
+
+return:
+  ret i32 %load1
+}
+
+define i32 @Two_LDs(i32 %P, i32 %Q) {
+; DEFAULT_SMALL64-LABEL: Two_LDs:
+; DEFAULT_SMALL64:       # %bb.0: # %entry
+; DEFAULT_SMALL64-NEXT:    mflr 0
+; DEFAULT_SMALL64-NEXT:    stdu 1, -48(1)
+; DEFAULT_SMALL64-NEXT:    and 6, 3, 4
+; DEFAULT_SMALL64-NEXT:    ld 3, L..C0(2) # target-flags(ppc-tlsldm) @"_$TLSML"
+; DEFAULT_SMALL64-NEXT:    std 0, 64(1)
+; DEFAULT_SMALL64-NEXT:    bla .__tls_get_mod[PR]
+; DEFAULT_SMALL64-NEXT:    cmpwi 6, -1
+; DEFAULT_SMALL64-NEXT:    bgt 0, L..BB1_2
+; DEFAULT_SMALL64-NEXT:  # %bb.1: # %bb1
+; DEFAULT_SMALL64-NEXT:    ld 4, L..C2(2) # target-flags(ppc-tlsld) @VarTLSLD2
+; DEFAULT_SMALL64-NEXT:    lwzx 3, 3, 4
+; DEFAULT_SMALL64-NEXT:    b L..BB1_3
+; DEFAULT_SMALL64-NEXT:  L..BB1_2: # %return
+; DEFAULT_SMALL64-NEXT:    ld 4, L..C1(2) # target-flags(ppc-tlsld) @VarTLSLD1
+; DEFAULT_SMALL64-NEXT:    lwzx 3, 3, 4
+; DEFAULT_SMALL64-NEXT:  L..BB1_3: # %bb1
+; DEFAULT_SMALL64-NEXT:    addi 1, 1, 48
+; DEFAULT_SMALL64-NEXT:    ld 0, 16(1)
+; DEFAULT_SMALL64-NEXT:    mtlr 0
+; DEFAULT_SMALL64-NEXT:    blr
+;
+; DEFAULT_LARGE64-LABEL: Two_LDs:
+; DEFAULT_LARGE64:       # %bb.0: # %entry
+; DEFAULT_LARGE64-NEXT:    mflr 0
+; DEFAULT_LARGE64-NEXT:    stdu 1, -48(1)
+; DEFAULT_LARGE64-NEXT:    and 6, 3, 4
+; DEFAULT_LARGE64-NEXT:    addis 3, L..C0@u(2)
+; DEFAULT_LARGE64-NEXT:    std 0, 64(1)
+; DEFAULT_LARGE64-NEXT:    ld 3, L..C0@l(3)
+; DEFAULT_LARGE64-NEXT:    bla .__tls_get_mod[PR]
+; DEFAULT_LARGE64-NEXT:    cmpwi 6, -1
+; DEFAULT_LARGE64-NEXT:    bgt 0, L..BB1_2
+; DEFAULT_LARGE64-NEXT:  # %bb.1: # %bb1
+; DEFAULT_LARGE64-NEXT:    addis 4, L..C2@u(2)
+; DEFAULT_LARGE64-NEXT:    ld 4, L..C2@l(4)
+; DEFAULT_LARGE64-NEXT:    lwzx 3, 3, 4
+; DEFAULT_LARGE64-NEXT:    b L..BB1_3
+; DEFAULT_LARGE64-NEXT:  L..BB1_2: # %return
+; DEFAULT_LARGE64-NEXT:    addis 4, L..C1@u(2)
+; DEFAULT_LARGE64-NEXT:    ld 4, L..C1@l(4)
+; DEFAULT_LARGE64-NEXT:    lwzx 3, 3, 4
+; DEFAULT_LARGE64-NEXT:  L..BB1_3: # %bb1
+; DEFAULT_LARGE64-NEXT:    addi 1, 1, 48
+; DEFAULT_LARGE64-NEXT:    ld 0, 16(1)
+; DEFAULT_LARGE64-NEXT:    mtlr 0
+; DEFAULT_LARGE64-NEXT:    blr
+;
+; TLS_MODEL_OPT_SMALL64-LABEL: Two_LDs:
+; TLS_MODEL_OPT_SMALL64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_SMALL64-NEXT:    mflr 0
+; TLS_MODEL_OPT_SMALL64-NEXT:    stdu 1, -48(1)
+; TLS_MODEL_OPT_SMALL64-NEXT:    and 6, 3, 4
+; TLS_MODEL_OPT_SMALL64-NEXT:    ld 3, L..C1(2) # target-flags(ppc-tlsldm) @"_$TLSML"
+; TLS_MODEL_OPT_SMALL64-NEXT:    std 0, 64(1)
+; TLS_MODEL_OPT_SMALL64-NEXT:    bla .__tls_get_mod[PR]
+; TLS_MODEL_OPT_SMALL64-NEXT:    cmpwi 6, -1
+; TLS_MODEL_OPT_SMALL64-NEXT:    bgt 0, L..BB1_2
+; TLS_MODEL_OPT_SMALL64-NEXT:  # %bb.1: # %bb1
+; TLS_MODEL_OPT_SMALL64-NEXT:    ld 4, L..C2(2) # target-flags(ppc-tlsld) @VarTLSLD2
+; TLS_MODEL_OPT_SMALL64-NEXT:    lwzx 3, 3, 4
+; TLS_MODEL_OPT_SMALL64-NEXT:    b L..BB1_3
+; TLS_MODEL_OPT_SMALL64-NEXT:  L..BB1_2: # %return
+; TLS_MODEL_OPT_SMALL64-NEXT:    ld 4, L..C3(2) # target-flags(ppc-tlsld) @VarTLSLD1
+; TLS_MODEL_OPT_SMALL64-NEXT:    lwzx 3, 3, 4
+; TLS_MODEL_OPT_SMALL64-NEXT:  L..BB1_3: # %bb1
+; TLS_MODEL_OPT_SMALL64-NEXT:    addi 1, 1, 48
+; TLS_MODEL_OPT_SMALL64-NEXT:    ld 0, 16(1)
+; TLS_MODEL_OPT_SMALL64-NEXT:    mtlr 0
+; TLS_MODEL_OPT_SMALL64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LARGE64-LABEL: Two_LDs:
+; TLS_MODEL_OPT_LARGE64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LARGE64-NEXT:    mflr 0
+; TLS_MODEL_OPT_LARGE64-NEXT:    stdu 1, -48(1)
+; TLS_MODEL_OPT_LARGE64-NEXT:    and 6, 3, 4
+; TLS_MODEL_OPT_LARGE64-NEXT:    addis 3, L..C1@u(2)
+; TLS_MODEL_OPT_LARGE64-NEXT:    std 0, 64(1)
+; TLS_MODEL_OPT_LARGE64-NEXT:    ld 3, L..C1@l(3)
+; TLS_MODEL_OPT_LARGE64-NEXT:    bla .__tls_get_mod[PR]
+; TLS_MODEL_OPT_LARGE64-NEXT:    cmpwi 6, -1
+; TLS_MODEL_OPT_LARGE64-NEXT:    bgt 0, L..BB1_2
+; TLS_MODEL_OPT_LARGE64-NEXT:  # %bb.1: # %bb1
+; TLS_MODEL_OPT_LARGE64-NEXT:    addis 4, L..C2@u(2)
+; TLS_MODEL_OPT_LARGE64-NEXT:    ld 4, L..C2@l(4)
+; TLS_MODEL_OPT_LARGE64-NEXT:    lwzx 3, 3, 4
+; TLS_MODEL_OPT_LARGE64-NEXT:    b L..BB1_3
+; TLS_MODEL_OPT_LARGE64-NEXT:  L..BB1_2: # %return
+; TLS_MODEL_OPT_LARGE64-NEXT:    addis 4, L..C3@u(2)
+; TLS_MODEL_OPT_LARGE64-NEXT:    ld 4, L..C3@l(4)
+; TLS_MODEL_OPT_LARGE64-NEXT:    lwzx 3, 3, 4
+; TLS_MODEL_OPT_LARGE64-NEXT:  L..BB1_3: # %bb1
+; TLS_MODEL_OPT_LARGE64-NEXT:    addi 1, 1, 48
+; TLS_MODEL_OPT_LARGE64-NEXT:    ld 0, 16(1)
+; TLS_MODEL_OPT_LARGE64-NEXT:    mtlr 0
+; TLS_MODEL_OPT_LARGE64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LIMIT2_SMALL64-LABEL: Two_LDs:
+; TLS_MODEL_OPT_LIMIT2_SMALL64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    and 3, 3, 4
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    cmpwi 3, -1
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    bgt 0, L..BB1_2
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:  # %bb.1: # %bb1
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    ld 3, L..C1(2) # target-flags(ppc-tprel) @VarTLSLD2
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    blr
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:  L..BB1_2: # %return
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    ld 3, L..C0(2) # target-flags(ppc-tprel) @VarTLSLD1
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LIMIT2_LARGE64-LABEL: Two_LDs:
+; TLS_MODEL_OPT_LIMIT2_LARGE64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    and 3, 3, 4
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    cmpwi 3, -1
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    bgt 0, L..BB1_2
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:  # %bb.1: # %bb1
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    addis 3, L..C1@u(2)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    ld 3, L..C1@l(3)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    blr
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:  L..BB1_2: # %return
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    addis 3, L..C0@u(2)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    ld 3, L..C0@l(3)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LIMIT3_SMALL64-LABEL: Two_LDs:
+; TLS_MODEL_OPT_LIMIT3_SMALL64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    and 3, 3, 4
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    cmpwi 3, -1
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    bgt 0, L..BB1_2
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:  # %bb.1: # %bb1
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    ld 3, L..C1(2) # target-flags(ppc-tprel) @VarTLSLD2
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    blr
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:  L..BB1_2: # %return
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    ld 3, L..C0(2) # target-flags(ppc-tprel) @VarTLSLD1
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LIMIT3_LARGE64-LABEL: Two_LDs:
+; TLS_MODEL_OPT_LIMIT3_LARGE64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    and 3, 3, 4
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    cmpwi 3, -1
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    bgt 0, L..BB1_2
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:  # %bb.1: # %bb1
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    addis 3, L..C1@u(2)
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    ld 3, L..C1@l(3)
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    blr
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:  L..BB1_2: # %return
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    addis 3, L..C0@u(2)
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    ld 3, L..C0@l(3)
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    blr
+entry:
+  %a = icmp slt i32 %P, 0
+  %b = icmp slt i32 %Q, 0
+  %c = and i1 %a, %b
+  %tls1 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @VarTLSLD1)
+  %load1 = load i32, ptr %tls1, align 4
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  %tls2 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @VarTLSLD2)
+  %load2 = load i32, ptr %tls2, align 4
+  ret i32 %load2
+
+return:
+  ret i32 %load1
+}
+
+define i32 @Three_LDs(i32 %P, i32 %Q) {
+; DEFAULT_SMALL64-LABEL: Three_LDs:
+; DEFAULT_SMALL64:       # %bb.0: # %entry
+; DEFAULT_SMALL64-NEXT:    mflr 0
+; DEFAULT_SMALL64-NEXT:    stdu 1, -48(1)
+; DEFAULT_SMALL64-NEXT:    and 6, 3, 4
+; DEFAULT_SMALL64-NEXT:    ld 3, L..C0(2) # target-flags(ppc-tlsldm) @"_$TLSML"
+; DEFAULT_SMALL64-NEXT:    std 0, 64(1)
+; DEFAULT_SMALL64-NEXT:    bla .__tls_get_mod[PR]
+; DEFAULT_SMALL64-NEXT:    cmpwi 6, -1
+; DEFAULT_SMALL64-NEXT:    bgt 0, L..BB2_2
+; DEFAULT_SMALL64-NEXT:  # %bb.1: # %bb1
+; DEFAULT_SMALL64-NEXT:    ld 4, L..C2(2) # target-flags(ppc-tlsld) @VarTLSLD2
+; DEFAULT_SMALL64-NEXT:    ld 5, L..C3(2) # target-flags(ppc-tlsld) @VarTLSLD3
+; DEFAULT_SMALL64-NEXT:    lwzx 4, 3, 4
+; DEFAULT_SMALL64-NEXT:    lwzx 3, 3, 5
+; DEFAULT_SMALL64-NEXT:    add 3, 4, 3
+; DEFAULT_SMALL64-NEXT:    b L..BB2_3
+; DEFAULT_SMALL64-NEXT:  L..BB2_2: # %return
+; DEFAULT_SMALL64-NEXT:    ld 4, L..C1(2) # target-flags(ppc-tlsld) @VarTLSLD1
+; DEFAULT_SMALL64-NEXT:    lwzx 3, 3, 4
+; DEFAULT_SMALL64-NEXT:  L..BB2_3: # %return
+; DEFAULT_SMALL64-NEXT:    addi 1, 1, 48
+; DEFAULT_SMALL64-NEXT:    ld 0, 16(1)
+; DEFAULT_SMALL64-NEXT:    mtlr 0
+; DEFAULT_SMALL64-NEXT:    blr
+;
+; DEFAULT_LARGE64-LABEL: Three_LDs:
+; DEFAULT_LARGE64:       # %bb.0: # %entry
+; DEFAULT_LARGE64-NEXT:    mflr 0
+; DEFAULT_LARGE64-NEXT:    stdu 1, -48(1)
+; DEFAULT_LARGE64-NEXT:    and 6, 3, 4
+; DEFAULT_LARGE64-NEXT:    addis 3, L..C0@u(2)
+; DEFAULT_LARGE64-NEXT:    std 0, 64(1)
+; DEFAULT_LARGE64-NEXT:    ld 3, L..C0@l(3)
+; DEFAULT_LARGE64-NEXT:    bla .__tls_get_mod[PR]
+; DEFAULT_LARGE64-NEXT:    cmpwi 6, -1
+; DEFAULT_LARGE64-NEXT:    bgt 0, L..BB2_2
+; DEFAULT_LARGE64-NEXT:  # %bb.1: # %bb1
+; DEFAULT_LARGE64-NEXT:    addis 4, L..C2@u(2)
+; DEFAULT_LARGE64-NEXT:    addis 5, L..C3@u(2)
+; DEFAULT_LARGE64-NEXT:    ld 4, L..C2@l(4)
+; DEFAULT_LARGE64-NEXT:    ld 5, L..C3@l(5)
+; DEFAULT_LARGE64-NEXT:    lwzx 4, 3, 4
+; DEFAULT_LARGE64-NEXT:    lwzx 3, 3, 5
+; DEFAULT_LARGE64-NEXT:    add 3, 4, 3
+; DEFAULT_LARGE64-NEXT:    b L..BB2_3
+; DEFAULT_LARGE64-NEXT:  L..BB2_2: # %return
+; DEFAULT_LARGE64-NEXT:    addis 4, L..C1@u(2)
+; DEFAULT_LARGE64-NEXT:    ld 4, L..C1@l(4)
+; DEFAULT_LARGE64-NEXT:    lwzx 3, 3, 4
+; DEFAULT_LARGE64-NEXT:  L..BB2_3: # %return
+; DEFAULT_LARGE64-NEXT:    addi 1, 1, 48
+; DEFAULT_LARGE64-NEXT:    ld 0, 16(1)
+; DEFAULT_LARGE64-NEXT:    mtlr 0
+; DEFAULT_LARGE64-NEXT:    blr
+;
+; TLS_MODEL_OPT_SMALL64-LABEL: Three_LDs:
+; TLS_MODEL_OPT_SMALL64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_SMALL64-NEXT:    mflr 0
+; TLS_MODEL_OPT_SMALL64-NEXT:    stdu 1, -48(1)
+; TLS_MODEL_OPT_SMALL64-NEXT:    and 6, 3, 4
+; TLS_MODEL_OPT_SMALL64-NEXT:    ld 3, L..C1(2) # target-flags(ppc-tlsldm) @"_$TLSML"
+; TLS_MODEL_OPT_SMALL64-NEXT:    std 0, 64(1)
+; TLS_MODEL_OPT_SMALL64-NEXT:    bla .__tls_get_mod[PR]
+; TLS_MODEL_OPT_SMALL64-NEXT:    cmpwi 6, -1
+; TLS_MODEL_OPT_SMALL64-NEXT:    bgt 0, L..BB2_2
+; TLS_MODEL_OPT_SMALL64-NEXT:  # %bb.1: # %bb1
+; TLS_MODEL_OPT_SMALL64-NEXT:    ld 4, L..C2(2) # target-flags(ppc-tlsld) @VarTLSLD2
+; TLS_MODEL_OPT_SMALL64-NEXT:    ld 5, L..C4(2) # target-flags(ppc-tlsld) @VarTLSLD3
+; TLS_MODEL_OPT_SMALL64-NEXT:    lwzx 4, 3, 4
+; TLS_MODEL_OPT_SMALL64-NEXT:    lwzx 3, 3, 5
+; TLS_MODEL_OPT_SMALL64-NEXT:    add 3, 4, 3
+; TLS_MODEL_OPT_SMALL64-NEXT:    b L..BB2_3
+; TLS_MODEL_OPT_SMALL64-NEXT:  L..BB2_2: # %return
+; TLS_MODEL_OPT_SMALL64-NEXT:    ld 4, L..C3(2) # target-flags(ppc-tlsld) @VarTLSLD1
+; TLS_MODEL_OPT_SMALL64-NEXT:    lwzx 3, 3, 4
+; TLS_MODEL_OPT_SMALL64-NEXT:  L..BB2_3: # %return
+; TLS_MODEL_OPT_SMALL64-NEXT:    addi 1, 1, 48
+; TLS_MODEL_OPT_SMALL64-NEXT:    ld 0, 16(1)
+; TLS_MODEL_OPT_SMALL64-NEXT:    mtlr 0
+; TLS_MODEL_OPT_SMALL64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LARGE64-LABEL: Three_LDs:
+; TLS_MODEL_OPT_LARGE64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LARGE64-NEXT:    mflr 0
+; TLS_MODEL_OPT_LARGE64-NEXT:    stdu 1, -48(1)
+; TLS_MODEL_OPT_LARGE64-NEXT:    and 6, 3, 4
+; TLS_MODEL_OPT_LARGE64-NEXT:    addis 3, L..C1@u(2)
+; TLS_MODEL_OPT_LARGE64-NEXT:    std 0, 64(1)
+; TLS_MODEL_OPT_LARGE64-NEXT:    ld 3, L..C1@l(3)
+; TLS_MODEL_OPT_LARGE64-NEXT:    bla .__tls_get_mod[PR]
+; TLS_MODEL_OPT_LARGE64-NEXT:    cmpwi 6, -1
+; TLS_MODEL_OPT_LARGE64-NEXT:    bgt 0, L..BB2_2
+; TLS_MODEL_OPT_LARGE64-NEXT:  # %bb.1: # %bb1
+; TLS_MODEL_OPT_LARGE64-NEXT:    addis 4, L..C2@u(2)
+; TLS_MODEL_OPT_LARGE64-NEXT:    addis 5, L..C4@u(2)
+; TLS_MODEL_OPT_LARGE64-NEXT:    ld 4, L..C2@l(4)
+; TLS_MODEL_OPT_LARGE64-NEXT:    ld 5, L..C4@l(5)
+; TLS_MODEL_OPT_LARGE64-NEXT:    lwzx 4, 3, 4
+; TLS_MODEL_OPT_LARGE64-NEXT:    lwzx 3, 3, 5
+; TLS_MODEL_OPT_LARGE64-NEXT:    add 3, 4, 3
+; TLS_MODEL_OPT_LARGE64-NEXT:    b L..BB2_3
+; TLS_MODEL_OPT_LARGE64-NEXT:  L..BB2_2: # %return
+; TLS_MODEL_OPT_LARGE64-NEXT:    addis 4, L..C3@u(2)
+; TLS_MODEL_OPT_LARGE64-NEXT:    ld 4, L..C3@l(4)
+; TLS_MODEL_OPT_LARGE64-NEXT:    lwzx 3, 3, 4
+; TLS_MODEL_OPT_LARGE64-NEXT:  L..BB2_3: # %return
+; TLS_MODEL_OPT_LARGE64-NEXT:    addi 1, 1, 48
+; TLS_MODEL_OPT_LARGE64-NEXT:    ld 0, 16(1)
+; TLS_MODEL_OPT_LARGE64-NEXT:    mtlr 0
+; TLS_MODEL_OPT_LARGE64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LIMIT2_SMALL64-LABEL: Three_LDs:
+; TLS_MODEL_OPT_LIMIT2_SMALL64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    mflr 0
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    stdu 1, -48(1)
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    and 6, 3, 4
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    ld 3, L..C2(2) # target-flags(ppc-tlsldm) @"_$TLSML"
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    std 0, 64(1)
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    bla .__tls_get_mod[PR]
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    cmpwi 6, -1
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    bgt 0, L..BB2_2
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:  # %bb.1: # %bb1
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    ld 4, L..C3(2) # target-flags(ppc-tlsld) @VarTLSLD2
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    ld 5, L..C4(2) # target-flags(ppc-tlsld) @VarTLSLD3
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    lwzx 4, 3, 4
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    lwzx 3, 3, 5
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    add 3, 4, 3
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    b L..BB2_3
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:  L..BB2_2: # %return
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    ld 4, L..C5(2) # target-flags(ppc-tlsld) @VarTLSLD1
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    lwzx 3, 3, 4
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:  L..BB2_3: # %return
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    addi 1, 1, 48
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    ld 0, 16(1)
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    mtlr 0
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LIMIT2_LARGE64-LABEL: Three_LDs:
+; TLS_MODEL_OPT_LIMIT2_LARGE64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    mflr 0
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    stdu 1, -48(1)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    and 6, 3, 4
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    addis 3, L..C2@u(2)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    std 0, 64(1)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    ld 3, L..C2@l(3)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    bla .__tls_get_mod[PR]
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    cmpwi 6, -1
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    bgt 0, L..BB2_2
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:  # %bb.1: # %bb1
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    addis 4, L..C3@u(2)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    addis 5, L..C4@u(2)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    ld 4, L..C3@l(4)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    ld 5, L..C4@l(5)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    lwzx 4, 3, 4
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    lwzx 3, 3, 5
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    add 3, 4, 3
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    b L..BB2_3
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:  L..BB2_2: # %return
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    addis 4, L..C5@u(2)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    ld 4, L..C5@l(4)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    lwzx 3, 3, 4
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:  L..BB2_3: # %return
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    addi 1, 1, 48
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    ld 0, 16(1)
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    mtlr 0
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LIMIT3_SMALL64-LABEL: Three_LDs:
+; TLS_MODEL_OPT_LIMIT3_SMALL64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    and 3, 3, 4
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    cmpwi 3, -1
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    bgt 0, L..BB2_2
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:  # %bb.1: # %bb1
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    ld 3, L..C1(2) # target-flags(ppc-tprel) @VarTLSLD2
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    ld 4, L..C2(2) # target-flags(ppc-tprel) @VarTLSLD3
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    lwzx 4, 13, 4
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    add 3, 3, 4
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    blr
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:  L..BB2_2: # %return
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    ld 3, L..C0(2) # target-flags(ppc-tprel) @VarTLSLD1
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT:    blr
+;
+; TLS_MODEL_OPT_LIMIT3_LARGE64-LABEL: Three_LDs:
+; TLS_MODEL_OPT_LIMIT3_LARGE64:       # %bb.0: # %entry
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    and 3, 3, 4
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    cmpwi 3, -1
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    bgt 0, L..BB2_2
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:  # %bb.1: # %bb1
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    addis 3, L..C1@u(2)
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    addis 4, L..C2@u(2)
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    ld 3, L..C1@l(3)
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    ld 4, L..C2@l(4)
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    lwzx 4, 13, 4
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    add 3, 3, 4
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    blr
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:  L..BB2_2: # %return
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    addis 3, L..C0@u(2)
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    ld 3, L..C0@l(3)
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    lwzx 3, 13, 3
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT:    blr
+entry:
+  %a = icmp slt i32 %P, 0
+  %b = icmp slt i32 %Q, 0
+  %c = and i1 %a, %b
+  %tls1 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @VarTLSLD1)
+  %load1 = load i32, ptr %tls1, align 4
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  %tls2 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @VarTLSLD2)
+  %load2 = load i32, ptr %tls2, align 4
+  %tls3 = tail call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @VarTLSLD3)
+  %load3 = load i32, ptr %tls3, align 4
+  %sum = add i32 %load2, %load3
+  ret i32 %sum
+
+return:
+  ret i32 %load1
+}
+
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+; DEFAULT_SMALL64-LABEL: .toc
+; DEFAULT_SMALL64-LABEL: L..C0:
+; DEFAULT_SMALL64-NEXT: .tc _Renamed..5f24__TLSML[TC],_Renamed..5f24__TLSML[TC]@ml
+; DEFAULT_SMALL64-NEXT: .rename _Renamed..5f24__TLSML[TC],"_$TLSML"
+; DEFAULT_SMALL64-LABEL: L..C1:
+; DEFAULT_SMALL64-NEXT: .tc VarTLSLD1[TC],VarTLSLD1[TL]@ld
+; DEFAULT_SMALL64-LABEL: L..C2:
+; DEFAULT_SMALL64-NEXT: .tc VarTLSLD2[TC],VarTLSLD2[UL]@ld
+; DEFAULT_SMALL64-LABEL: L..C3:
+; DEFAULT_SMALL64-NEXT: .tc VarTLSLD3[TC],VarTLSLD3[UL]@ld
+
+; DEFAULT_LARGE64-LABEL: .toc
+; DEFAULT_LARGE64-LABEL: L..C0:
+; DEFAULT_LARGE64-NEXT: .tc _Renamed..5f24__TLSML[TC],_Renamed..5f24__TLSML[TC]@ml
+; DEFAULT_LARGE64-NEXT: .rename _Renamed..5f24__TLSML[TC],"_$TLSML"
+; DEFAULT_LARGE64-LABEL: L..C1:
+; DEFAULT_LARGE64-NEXT: .tc VarTLSLD1[TE],VarTLSLD1[TL]@ld
+; DEFAULT_LARGE64-LABEL: L..C2:
+; DEFAULT_LARGE64-NEXT: .tc VarTLSLD2[TE],VarTLSLD2[UL]@ld
+; DEFAULT_LARGE64-LABEL: L..C3:
+; DEFAULT_LARGE64-NEXT: .tc VarTLSLD3[TE],VarTLSLD3[UL]@ld
+
+; TLS_MODEL_OPT_SMALL64-LABEL: .toc
+; TLS_MODEL_OPT_SMALL64-LABEL: L..C0:
+; TLS_MODEL_OPT_SMALL64-NEXT: .tc VarTLSLD1[TC],VarTLSLD1[TL]@ie
+; TLS_MODEL_OPT_SMALL64-LABEL: L..C1:
+; TLS_MODEL_OPT_SMALL64-NEXT: .tc _Renamed..5f24__TLSML[TC],_Renamed..5f24__TLSML[TC]@ml
+; TLS_MODEL_OPT_SMALL64-NEXT: .rename _Renamed..5f24__TLSML[TC],"_$TLSML"
+; TLS_MODEL_OPT_SMALL64-LABEL: L..C2:
+; TLS_MODEL_OPT_SMALL64-NEXT: .tc .VarTLSLD2[TC],VarTLSLD2[UL]@ld
+; TLS_MODEL_OPT_SMALL64-LABEL: L..C3:
+; TLS_MODEL_OPT_SMALL64-NEXT: .tc .VarTLSLD1[TC],VarTLSLD1[TL]@ld
+; TLS_MODEL_OPT_SMALL64-LABEL: L..C4:
+; TLS_MODEL_OPT_SMALL64-NEXT: .tc .VarTLSLD3[TC],VarTLSLD3[UL]@ld
+
+; TLS_MODEL_OPT_LARGE64-LABEL: .toc
+; TLS_MODEL_OPT_LARGE64-LABEL: L..C0:
+; TLS_MODEL_OPT_LARGE64-NEXT: .tc VarTLSLD1[TE],VarTLSLD1[TL]@ie
+; TLS_MODEL_OPT_LARGE64-LABEL: L..C1:
+; TLS_MODEL_OPT_LARGE64-NEXT: .tc _Renamed..5f24__TLSML[TC],_Renamed..5f24__TLSML[TC]@ml
+; TLS_MODEL_OPT_LARGE64-NEXT: .rename _Renamed..5f24__TLSML[TC],"_$TLSML"
+; TLS_MODEL_OPT_LARGE64-LABEL: L..C2:
+; TLS_MODEL_OPT_LARGE64-NEXT: .tc .VarTLSLD2[TE],VarTLSLD2[UL]@ld
+; TLS_MODEL_OPT_LARGE64-LABEL: L..C3:
+; TLS_MODEL_OPT_LARGE64-NEXT: .tc .VarTLSLD1[TE],VarTLSLD1[TL]@ld
+; TLS_MODEL_OPT_LARGE64-LABEL: L..C4:
+; TLS_MODEL_OPT_LARGE64-NEXT: .tc .VarTLSLD3[TE],VarTLSLD3[UL]@ld
+
+; TLS_MODEL_OPT_LIMIT2_SMALL64-LABEL: .toc
+; TLS_MODEL_OPT_LIMIT2_SMALL64-LABEL: L..C0:
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT: .tc VarTLSLD1[TC],VarTLSLD1[TL]@ie
+; TLS_MODEL_OPT_LIMIT2_SMALL64-LABEL: L..C1:
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT: .tc VarTLSLD2[TC],VarTLSLD2[UL]@ie
+; TLS_MODEL_OPT_LIMIT2_SMALL64-LABEL: L..C2:
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT: .tc _Renamed..5f24__TLSML[TC],_Renamed..5f24__TLSML[TC]@ml
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT: .rename _Renamed..5f24__TLSML[TC],"_$TLSML"
+; TLS_MODEL_OPT_LIMIT2_SMALL64-LABEL: L..C3:
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT: .tc .VarTLSLD2[TC],VarTLSLD2[UL]@ld
+; TLS_MODEL_OPT_LIMIT2_SMALL64-LABEL: L..C4:
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT: .tc .VarTLSLD3[TC],VarTLSLD3[UL]@ld
+; TLS_MODEL_OPT_LIMIT2_SMALL64-LABEL: L..C5:
+; TLS_MODEL_OPT_LIMIT2_SMALL64-NEXT: .tc .VarTLSLD1[TC],VarTLSLD1[TL]@ld
+
+; TLS_MODEL_OPT_LIMIT2_LARGE64-LABEL: .toc
+; TLS_MODEL_OPT_LIMIT2_LARGE64-LABEL: L..C0:
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT: .tc VarTLSLD1[TE],VarTLSLD1[TL]@ie
+; TLS_MODEL_OPT_LIMIT2_LARGE64-LABEL: L..C1:
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT: .tc VarTLSLD2[TE],VarTLSLD2[UL]@ie
+; TLS_MODEL_OPT_LIMIT2_LARGE64-LABEL: L..C2:
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT: .tc _Renamed..5f24__TLSML[TC],_Renamed..5f24__TLSML[TC]@ml
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT: .rename _Renamed..5f24__TLSML[TC],"_$TLSML"
+; TLS_MODEL_OPT_LIMIT2_LARGE64-LABEL: L..C3:
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT: .tc .VarTLSLD2[TE],VarTLSLD2[UL]@ld
+; TLS_MODEL_OPT_LIMIT2_LARGE64-LABEL: L..C4:
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT: .tc .VarTLSLD3[TE],VarTLSLD3[UL]@ld
+; TLS_MODEL_OPT_LIMIT2_LARGE64-LABEL: L..C5:
+; TLS_MODEL_OPT_LIMIT2_LARGE64-NEXT: .tc .VarTLSLD1[TE],VarTLSLD1[TL]@ld
+
+; TLS_MODEL_OPT_LIMIT3_SMALL64-LABEL: .toc
+; TLS_MODEL_OPT_LIMIT3_SMALL64-LABEL: L..C0:
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT: .tc VarTLSLD1[TC],VarTLSLD1[TL]@ie
+; TLS_MODEL_OPT_LIMIT3_SMALL64-LABEL: L..C1:
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT: .tc VarTLSLD2[TC],VarTLSLD2[UL]@ie
+; TLS_MODEL_OPT_LIMIT3_SMALL64-LABEL: L..C2:
+; TLS_MODEL_OPT_LIMIT3_SMALL64-NEXT: .tc VarTLSLD3[TC],VarTLSLD3[UL]@ie
+
+; TLS_MODEL_OPT_LIMIT3_LARGE64-LABEL: .toc
+; TLS_MODEL_OPT_LIMIT3_LARGE64-LABEL: L..C0:
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT: .tc VarTLSLD1[TE],VarTLSLD1[TL]@ie
+; TLS_MODEL_OPT_LIMIT3_LARGE64-LABEL: L..C1:
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT: .tc VarTLSLD2[TE],VarTLSLD2[UL]@ie
+; TLS_MODEL_OPT_LIMIT3_LARGE64-LABEL: L..C2:
+; TLS_MODEL_OPT_LIMIT3_LARGE64-NEXT: .tc VarTLSLD3[TE],VarTLSLD3[UL]@ie
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-largeaccess.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-largeaccess.ll
index d3fa94779dd7..44d62124ac58 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-largeaccess.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-largeaccess.ll
@@ -39,23 +39,18 @@ define signext i32 @test1() {
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stdu r1, -48(r1)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # target-flags(ppc-tlsldm) @"_$TLSML"
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r0, 64(r1)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r6, 4
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    bla .__tls_get_mod[PR]
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r5, 1
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r4, ElementIntTLSv1[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r5, ElementIntTLSv1[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r5, ElementIntTLS2[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r6, 24(r4)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r4, 1
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r5, 4
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r4, ElementIntTLSv1[TL]@ld(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r4, 2
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r4, 320(r5)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r4, ElementIntTLS3[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r5, 3
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r5, 324(r4)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r4, ElementIntTLS4[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r3, ElementIntTLS5[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r6, 328(r4)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r5, ElementIntTLSv1[TL]@ld+24(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r5, (ElementIntTLS4[TL]@ld+328)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r4, (ElementIntTLS2[TL]@ld+320)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r4, 3
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r4, (ElementIntTLS3[TL]@ld+324)-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r4, 88
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r4, 332(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r4, (ElementIntTLS5[TL]@ld+332)-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r3, 102
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    addi r1, r1, 48
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    ld r0, 16(r1)
@@ -68,24 +63,19 @@ define signext i32 @test1() {
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stdu r1, -48(r1)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    addis r3, L..C0@u(r2)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    std r0, 64(r1)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r6, 4
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    bla .__tls_get_mod[PR]
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r5, 1
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r4, ElementIntTLSv1[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r5, ElementIntTLSv1[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r5, ElementIntTLS2[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r6, 24(r4)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r4, 1
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r5, 4
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r4, ElementIntTLSv1[TL]@ld(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r4, 2
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r4, 320(r5)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r4, ElementIntTLS3[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r5, 3
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r5, 324(r4)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r4, ElementIntTLS4[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r3, ElementIntTLS5[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r6, 328(r4)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r5, ElementIntTLSv1[TL]@ld+24(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r5, (ElementIntTLS4[TL]@ld+328)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r4, (ElementIntTLS2[TL]@ld+320)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r4, 3
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r4, (ElementIntTLS3[TL]@ld+324)-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r4, 88
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r4, 332(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r4, (ElementIntTLS5[TL]@ld+332)-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r3, 102
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    addi r1, r1, 48
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    ld r0, 16(r1)
@@ -132,26 +122,21 @@ define i64 @test2() {
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r0, 64(r1)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    bla .__tls_get_mod[PR]
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    mr r6, r3
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r3, ElementLongTLS6[UL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r4, 212
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r4, 424(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r3, ElementLongTLS2[TL]@ld(r6)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r3, 212
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r4, 203
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r4, 1200(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    ld r3, L..C1(r2) # target-flags(ppc-tlsgdm) @MyTLSGDVar
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    ld r4, L..C2(r2) # target-flags(ppc-tlsgd) @MyTLSGDVar
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r4, (ElementLongTLS2[TL]@ld+1200)-131072(r6)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    ld r4, L..C1(r2) # target-flags(ppc-tlsgd) @MyTLSGDVar
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r3, ElementLongTLS6[UL]@ld+424(r6)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    ld r3, L..C2(r2) # target-flags(ppc-tlsgdm) @MyTLSGDVar
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    bla .__tls_get_addr[PR]
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r4, 44
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r4, 440(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r3, ElementLongTLS3[TL]@ld(r6)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r4, 6
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r4, 2000(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r3, ElementLongTLS4[TL]@ld(r6)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r3, 6
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r4, 100
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r4, 6800(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r3, ElementLongTLS5[TL]@ld(r6)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r4, 882
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r4, 8400(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r3, (ElementLongTLS3[TL]@ld+2000)-196608(r6)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r3, 882
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r4, (ElementLongTLS4[TL]@ld+6800)-196608(r6)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r3, (ElementLongTLS5[TL]@ld+8400)-196608(r6)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r3, 1191
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    addi r1, r1, 48
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    ld r0, 16(r1)
@@ -166,29 +151,24 @@ define i64 @test2() {
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    std r0, 64(r1)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    bla .__tls_get_mod[PR]
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r4, 212
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    addis r4, L..C1@u(r2)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    mr r6, r3
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r3, ElementLongTLS6[UL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    std r4, 424(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r3, ElementLongTLS2[TL]@ld(r6)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r4, 203
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    std r4, 1200(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    addis r3, L..C1@u(r2)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    addis r4, L..C2@u(r2)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    ld r3, L..C1@l(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    ld r4, L..C2@l(r4)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r3, 212
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    ld r4, L..C1@l(r4)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    std r3, ElementLongTLS6[UL]@ld+424(r6)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r3, 203
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    std r3, (ElementLongTLS2[TL]@ld+1200)-131072(r6)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    addis r3, L..C2@u(r2)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    ld r3, L..C2@l(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    bla .__tls_get_addr[PR]
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r4, 44
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    std r4, 440(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r3, ElementLongTLS3[TL]@ld(r6)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r4, 6
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    std r4, 2000(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r3, ElementLongTLS4[TL]@ld(r6)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r3, 6
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r4, 100
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    std r4, 6800(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r3, ElementLongTLS5[TL]@ld(r6)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r4, 882
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    std r4, 8400(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    std r3, (ElementLongTLS3[TL]@ld+2000)-196608(r6)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r3, 882
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    std r4, (ElementLongTLS4[TL]@ld+6800)-196608(r6)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    std r3, (ElementLongTLS5[TL]@ld+8400)-196608(r6)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r3, 1191
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    addi r1, r1, 48
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    ld r0, 16(r1)
@@ -230,23 +210,19 @@ define signext i32 @test3() {
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stdu r1, -48(r1)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # target-flags(ppc-tlsldm) @"_$TLSML"
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r0, 64(r1)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r6, 2
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r6, 3
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    bla .__tls_get_mod[PR]
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r5, 2
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r4, 1
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r5, ElementIntTLS2[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r7, ElementIntTLS3[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r6, 320(r5)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r5, 3
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r6, ElementIntTLS4[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r5, 324(r7)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    ld r5, L..C3(r2) # target-flags(ppc-tlsld) @ElementIntTLSv2
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r7, ElementIntTLS5[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stwux r4, r3, r5
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r4, 4
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r4, 24(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r3, 88
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r4, 328(r6)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r3, 332(r7)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r6, (ElementIntTLS3[TL]@ld+324)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    ld r6, L..C3(r2) # target-flags(ppc-tlsld) @ElementIntTLSv2
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r5, (ElementIntTLS2[TL]@ld+320)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r5, 88
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r5, (ElementIntTLS5[TL]@ld+332)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r5, 4
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r5, (ElementIntTLS4[TL]@ld+328)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stwux r4, r3, r6
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    stw r5, 24(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    li r3, 102
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    addi r1, r1, 48
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    ld r0, 16(r1)
@@ -262,22 +238,18 @@ define signext i32 @test3() {
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    addis r6, L..C3@u(r2)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    ld r6, L..C3@l(r6)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r7, 3
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    bla .__tls_get_mod[PR]
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r5, 2
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r4, ElementIntTLS2[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r5, 320(r4)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r4, 1
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r5, ElementIntTLS3[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r7, 324(r5)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r5, ElementIntTLS4[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r7, ElementIntTLS5[TL]@ld(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r5, (ElementIntTLS2[TL]@ld+320)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r5, 3
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r5, (ElementIntTLS3[TL]@ld+324)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r5, 88
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r5, (ElementIntTLS5[TL]@ld+332)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r5, 4
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r5, (ElementIntTLS4[TL]@ld+328)-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stwux r4, r3, r6
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r4, 4
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r4, 24(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r3, 88
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r4, 328(r5)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r3, 332(r7)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    stw r5, 24(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    li r3, 102
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    addi r1, r1, 48
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    ld r0, 16(r1)
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-types.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-types.ll
index 161a58a90296..489260b4e0ae 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-types.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-types.ll
@@ -51,8 +51,7 @@ define nonnull ptr @AddrTest1() local_unnamed_addr {
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    ld r3, L..C0(r2) # target-flags(ppc-tlsldm) @"_$TLSML"
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    std r0, 64(r1)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    bla .__tls_get_mod[PR]
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r3, a[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    addi r3, r3, 12
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    la r3, a[TL]@ld+12(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    addi r1, r1, 48
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    ld r0, 16(r1)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT:    mtlr r0
@@ -66,8 +65,7 @@ define nonnull ptr @AddrTest1() local_unnamed_addr {
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    std r0, 64(r1)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    ld r3, L..C0@l(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    bla .__tls_get_mod[PR]
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r3, a[TL]@ld(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    addi r3, r3, 12
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    la r3, a[TL]@ld+12(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    addi r1, r1, 48
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    ld r0, 16(r1)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT:    mtlr r0
diff --git a/llvm/test/CodeGen/PowerPC/aix-spills-for-eh.ll b/llvm/test/CodeGen/PowerPC/aix-spills-for-eh.ll
new file mode 100644
index 000000000000..73004e875873
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-spills-for-eh.ll
@@ -0,0 +1,301 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mcpu=pwr9 -mattr=+altivec -verify-machineinstrs --vec-extabi \
+; RUN:   -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mtriple=powerpc-unknown-aix < %s  | FileCheck %s --check-prefix 32BIT
+
+; RUN: llc -mcpu=pwr9 -mattr=+altivec -verify-machineinstrs --vec-extabi \
+; RUN:   -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mtriple=powerpc64-unknown-aix < %s | FileCheck %s --check-prefix 64BIT
+
+@_ZTIi = external constant ptr
+
+; Function Attrs: uwtable mustprogress
+define dso_local signext i32 @_Z5test2iPPKc(i32 signext %argc, ptr nocapture readnone %argv) local_unnamed_addr #0 personality ptr @__gxx_personality_v0{
+; 32BIT-LABEL: _Z5test2iPPKc:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr r0
+; 32BIT-NEXT:    stwu r1, -464(r1)
+; 32BIT-NEXT:    stw r0, 472(r1)
+; 32BIT-NEXT:    stw r30, 320(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    li r30, 0
+; 32BIT-NEXT:    stxv v20, 64(r1) # 16-byte Folded Spill
+; 32BIT-NEXT:    stxv v21, 80(r1) # 16-byte Folded Spill
+; 32BIT-NEXT:    stw r31, 324(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    mr r31, r3
+; 32BIT-NEXT:    stw r14, 256(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stxv v22, 96(r1) # 16-byte Folded Spill
+; 32BIT-NEXT:    stw r15, 260(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stxv v23, 112(r1) # 16-byte Folded Spill
+; 32BIT-NEXT:    stw r16, 264(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stxv v24, 128(r1) # 16-byte Folded Spill
+; 32BIT-NEXT:    stw r17, 268(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw r18, 272(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stxv v25, 144(r1) # 16-byte Folded Spill
+; 32BIT-NEXT:    stw r19, 276(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stxv v26, 160(r1) # 16-byte Folded Spill
+; 32BIT-NEXT:    stw r20, 280(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stxv v27, 176(r1) # 16-byte Folded Spill
+; 32BIT-NEXT:    stw r21, 284(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw r22, 288(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stxv v28, 192(r1) # 16-byte Folded Spill
+; 32BIT-NEXT:    stw r23, 292(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stxv v29, 208(r1) # 16-byte Folded Spill
+; 32BIT-NEXT:    stw r24, 296(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stxv v30, 224(r1) # 16-byte Folded Spill
+; 32BIT-NEXT:    stw r25, 300(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw r26, 304(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stxv v31, 240(r1) # 16-byte Folded Spill
+; 32BIT-NEXT:    stw r27, 308(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw r28, 312(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stw r29, 316(r1) # 4-byte Folded Spill
+; 32BIT-NEXT:    stfd f15, 328(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f16, 336(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f17, 344(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f18, 352(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f19, 360(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f20, 368(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f21, 376(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f22, 384(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f23, 392(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f24, 400(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f25, 408(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f26, 416(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f27, 424(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f28, 432(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f29, 440(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f30, 448(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    stfd f31, 456(r1) # 8-byte Folded Spill
+; 32BIT-NEXT:    #APP
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    #NO_APP
+; 32BIT-NEXT:  L..tmp0:
+; 32BIT-NEXT:    bl ._Z4testi[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:  L..tmp1:
+; 32BIT-NEXT:  L..BB0_1: # %return
+; 32BIT-NEXT:    lxv v31, 240(r1) # 16-byte Folded Reload
+; 32BIT-NEXT:    lxv v30, 224(r1) # 16-byte Folded Reload
+; 32BIT-NEXT:    lxv v29, 208(r1) # 16-byte Folded Reload
+; 32BIT-NEXT:    lxv v28, 192(r1) # 16-byte Folded Reload
+; 32BIT-NEXT:    mr r3, r30
+; 32BIT-NEXT:    lxv v27, 176(r1) # 16-byte Folded Reload
+; 32BIT-NEXT:    lxv v26, 160(r1) # 16-byte Folded Reload
+; 32BIT-NEXT:    lxv v25, 144(r1) # 16-byte Folded Reload
+; 32BIT-NEXT:    lxv v24, 128(r1) # 16-byte Folded Reload
+; 32BIT-NEXT:    lxv v23, 112(r1) # 16-byte Folded Reload
+; 32BIT-NEXT:    lxv v22, 96(r1) # 16-byte Folded Reload
+; 32BIT-NEXT:    lxv v21, 80(r1) # 16-byte Folded Reload
+; 32BIT-NEXT:    lxv v20, 64(r1) # 16-byte Folded Reload
+; 32BIT-NEXT:    lfd f31, 456(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lfd f30, 448(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lfd f29, 440(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lfd f28, 432(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lwz r31, 324(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz r30, 320(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz r29, 316(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lfd f27, 424(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lwz r28, 312(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz r27, 308(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz r26, 304(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lfd f26, 416(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lwz r25, 300(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz r24, 296(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz r23, 292(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lfd f25, 408(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lwz r22, 288(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz r21, 284(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz r20, 280(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lfd f24, 400(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lwz r19, 276(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz r18, 272(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz r17, 268(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lfd f23, 392(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lwz r16, 264(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz r15, 260(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lwz r14, 256(r1) # 4-byte Folded Reload
+; 32BIT-NEXT:    lfd f22, 384(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lfd f21, 376(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lfd f20, 368(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lfd f19, 360(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lfd f18, 352(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lfd f17, 344(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lfd f16, 336(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    lfd f15, 328(r1) # 8-byte Folded Reload
+; 32BIT-NEXT:    addi r1, r1, 464
+; 32BIT-NEXT:    lwz r0, 8(r1)
+; 32BIT-NEXT:    mtlr r0
+; 32BIT-NEXT:    blr
+; 32BIT-NEXT:  L..BB0_2: # %lpad
+; 32BIT-NEXT:  L..tmp2:
+; 32BIT-NEXT:    bl .__cxa_begin_catch[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    lwz r3, 0(r3)
+; 32BIT-NEXT:    add r30, r3, r31
+; 32BIT-NEXT:    bl .__cxa_end_catch[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    b L..BB0_1
+;
+; 64BIT-LABEL: _Z5test2iPPKc:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr r0
+; 64BIT-NEXT:    stdu r1, -592(r1)
+; 64BIT-NEXT:    std r0, 608(r1)
+; 64BIT-NEXT:    std r30, 440(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    li r30, 0
+; 64BIT-NEXT:    stxv v20, 112(r1) # 16-byte Folded Spill
+; 64BIT-NEXT:    stxv v21, 128(r1) # 16-byte Folded Spill
+; 64BIT-NEXT:    std r31, 448(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    mr r31, r3
+; 64BIT-NEXT:    std r14, 312(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stxv v22, 144(r1) # 16-byte Folded Spill
+; 64BIT-NEXT:    std r15, 320(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stxv v23, 160(r1) # 16-byte Folded Spill
+; 64BIT-NEXT:    std r16, 328(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stxv v24, 176(r1) # 16-byte Folded Spill
+; 64BIT-NEXT:    std r17, 336(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    std r18, 344(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stxv v25, 192(r1) # 16-byte Folded Spill
+; 64BIT-NEXT:    std r19, 352(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stxv v26, 208(r1) # 16-byte Folded Spill
+; 64BIT-NEXT:    std r20, 360(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stxv v27, 224(r1) # 16-byte Folded Spill
+; 64BIT-NEXT:    std r21, 368(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    std r22, 376(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stxv v28, 240(r1) # 16-byte Folded Spill
+; 64BIT-NEXT:    std r23, 384(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stxv v29, 256(r1) # 16-byte Folded Spill
+; 64BIT-NEXT:    std r24, 392(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stxv v30, 272(r1) # 16-byte Folded Spill
+; 64BIT-NEXT:    std r25, 400(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    std r26, 408(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stxv v31, 288(r1) # 16-byte Folded Spill
+; 64BIT-NEXT:    std r27, 416(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    std r28, 424(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    std r29, 432(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f15, 456(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f16, 464(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f17, 472(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f18, 480(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f19, 488(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f20, 496(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f21, 504(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f22, 512(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f23, 520(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f24, 528(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f25, 536(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f26, 544(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f27, 552(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f28, 560(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f29, 568(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f30, 576(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    stfd f31, 584(r1) # 8-byte Folded Spill
+; 64BIT-NEXT:    #APP
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    #NO_APP
+; 64BIT-NEXT:  L..tmp0:
+; 64BIT-NEXT:    bl ._Z4testi[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:  L..tmp1:
+; 64BIT-NEXT:  L..BB0_1: # %return
+; 64BIT-NEXT:    lxv v31, 288(r1) # 16-byte Folded Reload
+; 64BIT-NEXT:    lxv v30, 272(r1) # 16-byte Folded Reload
+; 64BIT-NEXT:    lxv v29, 256(r1) # 16-byte Folded Reload
+; 64BIT-NEXT:    lxv v28, 240(r1) # 16-byte Folded Reload
+; 64BIT-NEXT:    extsw r3, r30
+; 64BIT-NEXT:    lxv v27, 224(r1) # 16-byte Folded Reload
+; 64BIT-NEXT:    lxv v26, 208(r1) # 16-byte Folded Reload
+; 64BIT-NEXT:    lxv v25, 192(r1) # 16-byte Folded Reload
+; 64BIT-NEXT:    lxv v24, 176(r1) # 16-byte Folded Reload
+; 64BIT-NEXT:    lxv v23, 160(r1) # 16-byte Folded Reload
+; 64BIT-NEXT:    lxv v22, 144(r1) # 16-byte Folded Reload
+; 64BIT-NEXT:    lxv v21, 128(r1) # 16-byte Folded Reload
+; 64BIT-NEXT:    lxv v20, 112(r1) # 16-byte Folded Reload
+; 64BIT-NEXT:    lfd f31, 584(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f30, 576(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f29, 568(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f28, 560(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r31, 448(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r30, 440(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r29, 432(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f27, 552(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r28, 424(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r27, 416(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r26, 408(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f26, 544(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r25, 400(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r24, 392(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r23, 384(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f25, 536(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r22, 376(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r21, 368(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r20, 360(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f24, 528(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r19, 352(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r18, 344(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r17, 336(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f23, 520(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r16, 328(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r15, 320(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    ld r14, 312(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f22, 512(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f21, 504(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f20, 496(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f19, 488(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f18, 480(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f17, 472(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f16, 464(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    lfd f15, 456(r1) # 8-byte Folded Reload
+; 64BIT-NEXT:    addi r1, r1, 592
+; 64BIT-NEXT:    ld r0, 16(r1)
+; 64BIT-NEXT:    mtlr r0
+; 64BIT-NEXT:    blr
+; 64BIT-NEXT:  L..BB0_2: # %lpad
+; 64BIT-NEXT:  L..tmp2:
+; 64BIT-NEXT:    bl .__cxa_begin_catch[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    lwz r3, 0(r3)
+; 64BIT-NEXT:    add r30, r3, r31
+; 64BIT-NEXT:    bl .__cxa_end_catch[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    b L..BB0_1
+entry:
+  tail call void asm sideeffect "nop", "~{r14},~{f15},~{v20}"()
+  %call = invoke signext i32 @_Z4testi(i32 signext %argc)
+          to label %return unwind label %lpad
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { ptr, i32 }
+          catch ptr @_ZTIi
+  %1 = extractvalue { ptr, i32 } %0, 1
+  %2 = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi) #3
+  %matches = icmp eq i32 %1, %2
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:                                            ; preds = %lpad
+  %3 = extractvalue { ptr, i32 } %0, 0
+  %4 = tail call ptr @__cxa_begin_catch(ptr %3) #3
+  %5 = load i32, ptr %4, align 4
+  %add = add nsw i32 %5, %argc
+  tail call void @__cxa_end_catch()
+  br label %return
+
+return:                                           ; preds = %entry, %catch
+  %retval.0 = phi i32 [ %add, %catch ], [ 0, %entry ]
+  ret i32 %retval.0
+
+eh.resume:                                        ; preds = %lpad
+  resume { ptr, i32 } %0
+}
+
+declare signext i32 @_Z4testi(i32 signext) local_unnamed_addr
+
+declare i32 @__gxx_personality_v0(...)
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.typeid.for(ptr)
+
+declare ptr @__cxa_begin_catch(ptr) local_unnamed_addr
+
+declare void @__cxa_end_catch() local_unnamed_addr
+
+attributes #0 = { uwtable }
diff --git a/llvm/test/CodeGen/PowerPC/aix-tocdata-fastisel.ll b/llvm/test/CodeGen/PowerPC/aix-tocdata-fastisel.ll
new file mode 100644
index 000000000000..5a7fcd1d0ddd
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-tocdata-fastisel.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff -fast-isel -verify-machineinstrs \
+; RUN:   -code-model=small | FileCheck %s --check-prefix=SMALL
+
+;; FIXME: when toc data for 64 big large code model is supported,
+;; add a run line for large code model too.
+
+@a = global i32 0, align 4 #0
+
+define signext i32 @foo() #1 {
+; SMALL-LABEL: foo:
+; SMALL:       # %bb.0: # %entry
+; SMALL-NEXT:    la 3, a[TD](2)
+; SMALL-NEXT:    lwz 3, 0(3)
+; SMALL-NEXT:    extsw 3, 3
+; SMALL-NEXT:    blr
+entry:
+  %0 = load i32, ptr @a, align 4
+  ret i32 %0
+}
+
+attributes #0 = { "toc-data" }
+attributes #1 = { noinline optnone }
diff --git a/llvm/test/CodeGen/PowerPC/aix32-crsave.mir b/llvm/test/CodeGen/PowerPC/aix32-crsave.mir
index cf51f79c7e98..73736d6d5353 100644
--- a/llvm/test/CodeGen/PowerPC/aix32-crsave.mir
+++ b/llvm/test/CodeGen/PowerPC/aix32-crsave.mir
@@ -18,23 +18,33 @@ body:             |
     BLR implicit $lr, implicit $rm, implicit $r3
 
     ; CHECK-LABEL:  fixedStack:
-    ; CHECK-NEXT:   - { id: 0, type: spill-slot, offset: -12, size: 4, alignment: 4, stack-id: default,
+    ; CHECK-NEXT:   - { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, stack-id: default, 
+    ; CHECK-NEXT:       callee-saved-register: '$r31', callee-saved-restored: true, debug-info-variable: '', 
+    ; CHECK-NEXT:       debug-info-expression: '', debug-info-location: '' }
+    ; CHECK-NEXT:   - { id: 1, type: spill-slot, offset: -8, size: 4, alignment: 8, stack-id: default, 
+    ; CHECK-NEXT:       callee-saved-register: '$r30', callee-saved-restored: true, debug-info-variable: '', 
+    ; CHECK-NEXT:       debug-info-expression: '', debug-info-location: '' }
+    ; CHECK-NEXT:   - { id: 2, type: spill-slot, offset: -12, size: 4, alignment: 4, stack-id: default,
     ; CHECK-NEXT:       callee-saved-register: '$r29', callee-saved-restored: true, debug-info-variable: '',
     ; CHECK-NEXT:       debug-info-expression: '', debug-info-location: '' }
-    ; CHECK-NEXT:   - { id: 1, type: default, offset: 4, size: 4, alignment: 4, stack-id: default,
+    ; CHECK-NEXT:   - { id: 3, type: default, offset: 4, size: 4, alignment: 4, stack-id: default,
     ; CHECK-NEXT:       isImmutable: true, isAliased: false, callee-saved-register: '$cr4',
     ; CHECK-NEXT:       callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
     ; CHECK-NEXT:       debug-info-location: '' }
     ; CHECK-LABEL:  stack:
 
     ; CHECK:      bb.0.entry:
-    ; CHECK-NEXT:  liveins: $r3, $r29, $cr2, $cr4
+    ; CHECK-NEXT:  liveins: $r3, $r29, $r30, $r31, $cr2, $cr4
 
     ; CHECK:      $r12 = MFCR implicit killed $cr2, implicit killed $cr4
     ; CHECK-NEXT: STW killed $r12, 4, $r1
-    ; CHECK-NEXT: STW killed $r29, -12, $r1 :: (store (s32) into %fixed-stack.0)
+    ; CHECK-NEXT: STW killed $r29, -12, $r1 :: (store (s32) into %fixed-stack.2)
+    ; CHECK-NEXT: STW killed $r30, -8, $r1 :: (store (s32) into %fixed-stack.1, align 8)
+    ; CHECK-NEXT: STW killed $r31, -4, $r1 :: (store (s32) into %fixed-stack.0)
 
-    ; CHECK:      $r29 = LWZ -12, $r1 :: (load (s32) from %fixed-stack.0)
+    ; CHECK:      $r31 = LWZ -4, $r1 :: (load (s32) from %fixed-stack.0)
+    ; CHECK-NEXT: $r30 = LWZ -8, $r1 :: (load (s32) from %fixed-stack.1, align 8)
+    ; CHECK-NEXT: $r29 = LWZ -12, $r1 :: (load (s32) from %fixed-stack.2)
     ; CHECK-NEXT: $r12 = LWZ 4, $r1
     ; CHECK-NEXT: $cr2 = MTOCRF $r12
     ; CHECK-NEXT: $cr4 = MTOCRF killed $r12
@@ -49,14 +59,14 @@ liveins:
 body:             |
   bb.0.entry:
     liveins: $r3
-    renamable $r14 = ANDI_rec killed renamable $r3, 1, implicit-def dead $cr0, implicit-def $cr0gt
+    renamable $r31 = ANDI_rec killed renamable $r3, 1, implicit-def dead $cr0, implicit-def $cr0gt
     renamable $cr3lt = COPY $cr0gt
-    renamable $r3 = COPY $r14
+    renamable $r3 = COPY $r31
     BLR implicit $lr, implicit $rm, implicit $r3
 
     ; CHECK-LABEL: fixedStack:
-    ; CHECK-NEXT:  - { id: 0, type: spill-slot, offset: -72, size: 4, alignment: 8, stack-id: default,
-    ; CHECK-NEXT:      callee-saved-register: '$r14', callee-saved-restored: true, debug-info-variable: '',
+    ; CHECK-NEXT:  - { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, stack-id: default,
+    ; CHECK-NEXT:      callee-saved-register: '$r31', callee-saved-restored: true, debug-info-variable: '',
     ; CHECK-NEXT:      debug-info-expression: '', debug-info-location: '' }
     ; CHECK-NEXT:  - { id: 1, type: default, offset: 4, size: 4, alignment: 4, stack-id: default,
     ; CHECK-NEXT:      isImmutable: true, isAliased: false, callee-saved-register: '$cr3',
@@ -65,12 +75,12 @@ body:             |
     ; CHECK-LABEL: stack:
 
     ; CHECK:      bb.0.entry:
-    ; CHECK-NEXT:   liveins: $r3, $r14, $cr3
+    ; CHECK-NEXT:   liveins: $r3, $r31, $cr3
 
     ; CHECK:      $r12 = MFCR implicit killed $cr3
     ; CHECK-NEXT: STW killed $r12, 4, $r1
-    ; CHECK-NEXT: STW killed $r14, -72, $r1 :: (store (s32) into %fixed-stack.0, align 8)
+    ; CHECK-NEXT: STW killed $r31, -4, $r1 :: (store (s32) into %fixed-stack.0)
 
-    ; CHECK:      $r14 = LWZ -72, $r1 :: (load (s32) from %fixed-stack.0, align 8)
+    ; CHECK:      $r31 = LWZ -4, $r1 :: (load (s32) from %fixed-stack.0)
     ; CHECK-NEXT: $r12 = LWZ 4, $r1
     ; CHECK-NEXT: $cr3 = MTOCRF killed $r12
diff --git a/llvm/test/CodeGen/PowerPC/check-aix-shared-lib-tls-model-opt-IRattribute.ll b/llvm/test/CodeGen/PowerPC/check-aix-shared-lib-tls-model-opt-IRattribute.ll
new file mode 100644
index 000000000000..15fac2d0c0ad
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/check-aix-shared-lib-tls-model-opt-IRattribute.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -ppc-asm-full-reg-names \
+; RUN:   < %s | FileCheck %s
+; RUN: not llc -mtriple powerpc-ibm-aix-xcoff -ppc-asm-full-reg-names \
+; RUN:   < %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOT-SUPPORTED
+; RUN: not llc -mtriple powerpc64le-unknown-linux-gnu -ppc-asm-full-reg-names \
+; RUN:   < %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOT-SUPPORTED
+
+define dso_local signext i32 @testWithIRAttr() #0 {
+entry:
+  ret i32 0
+}
+; Check that the aix-shared-lib-tls-model-opt attribute is not supported on Linux and AIX (32-bit).
+; CHECK-NOT-SUPPORTED: The aix-shared-lib-tls-model-opt attribute is only supported on AIX in 64-bit mode.
+
+; Make sure that the test was actually compiled successfully after using the
+; aix-shared-lib-tls-model-opt attribute.
+; CHECK-LABEL: testWithIRAttr:
+; CHECK:        li r3, 0
+; CHECK-NEXT:   blr
+
+attributes #0 = { "target-features"="+aix-shared-lib-tls-model-opt" }
diff --git a/llvm/test/CodeGen/PowerPC/check-aix-shared-lib-tls-model-opt-Option.ll b/llvm/test/CodeGen/PowerPC/check-aix-shared-lib-tls-model-opt-Option.ll
new file mode 100644
index 000000000000..36f8bc78c77a
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/check-aix-shared-lib-tls-model-opt-Option.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -mattr=+aix-shared-lib-tls-model-opt \
+; RUN:   -ppc-asm-full-reg-names < %s | FileCheck %s
+; RUN: not llc -mtriple powerpc-ibm-aix-xcoff -mattr=+aix-shared-lib-tls-model-opt \
+; RUN:   -ppc-asm-full-reg-names < %s 2>&1 | \
+; RUN:   FileCheck %s --check-prefix=CHECK-NOT-SUPPORTED
+; RUN: not llc -mtriple powerpc64le-unknown-linux-gnu -mattr=+aix-shared-lib-tls-model-opt \
+; RUN:   -ppc-asm-full-reg-names < %s 2>&1 | \
+; RUN:   FileCheck %s --check-prefix=CHECK-NOT-SUPPORTED
+
+define dso_local signext i32 @testNoIRAttr() {
+entry:
+  ret i32 0
+}
+
+; Check that the aix-shared-lib-tls-model-opt attribute is not supported on Linux and AIX (32-bit).
+; CHECK-NOT-SUPPORTED: The aix-shared-lib-tls-model-opt attribute is only supported on AIX in 64-bit mode.
+
+; Make sure that the test was actually compiled successfully after using the
+; aix-shared-lib-tls-model-opt attribute.
+; CHECK-LABEL: testNoIRAttr:
+; CHECK:        li r3, 0
+; CHECK-NEXT:   blr
diff --git a/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll
index c33875dbfee4..a99c25a4e447 100644
--- a/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll
@@ -45,74 +45,26 @@ define float @f32_minimum(float %a, float %b) {
 ;
 ; VSX-LABEL: f32_minimum:
 ; VSX:       # %bb.0: # %entry
-; VSX-NEXT:    xscvdpspn 0, 1
 ; VSX-NEXT:    fcmpu 0, 1, 2
-; VSX-NEXT:    xscvdpspn 3, 2
-; VSX-NEXT:    mffprwz 3, 0
 ; VSX-NEXT:    bc 12, 3, .LBB0_2
 ; VSX-NEXT:  # %bb.1: # %entry
-; VSX-NEXT:    xsmindp 0, 1, 2
-; VSX-NEXT:    b .LBB0_3
+; VSX-NEXT:    xsmindp 1, 1, 2
+; VSX-NEXT:    blr
 ; VSX-NEXT:  .LBB0_2:
-; VSX-NEXT:    addis 4, 2, .LCPI0_0@toc@ha
-; VSX-NEXT:    lfs 0, .LCPI0_0@toc@l(4)
-; VSX-NEXT:  .LBB0_3: # %entry
-; VSX-NEXT:    xoris 3, 3, 32768
-; VSX-NEXT:    mffprwz 4, 3
-; VSX-NEXT:    cmplwi 3, 0
-; VSX-NEXT:    bc 12, 2, .LBB0_5
-; VSX-NEXT:  # %bb.4: # %entry
-; VSX-NEXT:    fmr 1, 0
-; VSX-NEXT:  .LBB0_5: # %entry
-; VSX-NEXT:    xoris 3, 4, 32768
-; VSX-NEXT:    cmplwi 3, 0
-; VSX-NEXT:    bc 12, 2, .LBB0_7
-; VSX-NEXT:  # %bb.6: # %entry
-; VSX-NEXT:    fmr 2, 1
-; VSX-NEXT:  .LBB0_7: # %entry
-; VSX-NEXT:    xxlxor 1, 1, 1
-; VSX-NEXT:    fcmpu 0, 0, 1
-; VSX-NEXT:    bc 12, 2, .LBB0_9
-; VSX-NEXT:  # %bb.8: # %entry
-; VSX-NEXT:    fmr 2, 0
-; VSX-NEXT:  .LBB0_9: # %entry
-; VSX-NEXT:    fmr 1, 2
+; VSX-NEXT:    addis 3, 2, .LCPI0_0@toc@ha
+; VSX-NEXT:    lfs 1, .LCPI0_0@toc@l(3)
 ; VSX-NEXT:    blr
 ;
 ; AIX-LABEL: f32_minimum:
 ; AIX:       # %bb.0: # %entry
-; AIX-NEXT:    xscvdpspn 0, 1
 ; AIX-NEXT:    fcmpu 0, 1, 2
-; AIX-NEXT:    xscvdpspn 3, 2
-; AIX-NEXT:    mffprwz 3, 0
 ; AIX-NEXT:    bc 12, 3, L..BB0_2
 ; AIX-NEXT:  # %bb.1: # %entry
-; AIX-NEXT:    xsmindp 0, 1, 2
-; AIX-NEXT:    b L..BB0_3
+; AIX-NEXT:    xsmindp 1, 1, 2
+; AIX-NEXT:    blr
 ; AIX-NEXT:  L..BB0_2:
-; AIX-NEXT:    ld 4, L..C0(2) # %const.0
-; AIX-NEXT:    lfs 0, 0(4)
-; AIX-NEXT:  L..BB0_3: # %entry
-; AIX-NEXT:    xoris 3, 3, 32768
-; AIX-NEXT:    mffprwz 4, 3
-; AIX-NEXT:    cmplwi 3, 0
-; AIX-NEXT:    bc 12, 2, L..BB0_5
-; AIX-NEXT:  # %bb.4: # %entry
-; AIX-NEXT:    fmr 1, 0
-; AIX-NEXT:  L..BB0_5: # %entry
-; AIX-NEXT:    xoris 3, 4, 32768
-; AIX-NEXT:    cmplwi 3, 0
-; AIX-NEXT:    bc 12, 2, L..BB0_7
-; AIX-NEXT:  # %bb.6: # %entry
-; AIX-NEXT:    fmr 2, 1
-; AIX-NEXT:  L..BB0_7: # %entry
-; AIX-NEXT:    xxlxor 1, 1, 1
-; AIX-NEXT:    fcmpu 0, 0, 1
-; AIX-NEXT:    bc 12, 2, L..BB0_9
-; AIX-NEXT:  # %bb.8: # %entry
-; AIX-NEXT:    fmr 2, 0
-; AIX-NEXT:  L..BB0_9: # %entry
-; AIX-NEXT:    fmr 1, 2
+; AIX-NEXT:    ld 3, L..C0(2) # %const.0
+; AIX-NEXT:    lfs 1, 0(3)
 ; AIX-NEXT:    blr
 entry:
   %m = call float @llvm.minimum.f32(float %a, float %b)
@@ -159,70 +111,26 @@ define float @f32_maximum(float %a, float %b) {
 ;
 ; VSX-LABEL: f32_maximum:
 ; VSX:       # %bb.0: # %entry
-; VSX-NEXT:    xscvdpspn 0, 1
 ; VSX-NEXT:    fcmpu 0, 1, 2
-; VSX-NEXT:    xscvdpspn 3, 2
-; VSX-NEXT:    mffprwz 3, 0
 ; VSX-NEXT:    bc 12, 3, .LBB1_2
 ; VSX-NEXT:  # %bb.1: # %entry
-; VSX-NEXT:    xsmaxdp 0, 1, 2
-; VSX-NEXT:    b .LBB1_3
+; VSX-NEXT:    xsmaxdp 1, 1, 2
+; VSX-NEXT:    blr
 ; VSX-NEXT:  .LBB1_2:
-; VSX-NEXT:    addis 4, 2, .LCPI1_0@toc@ha
-; VSX-NEXT:    lfs 0, .LCPI1_0@toc@l(4)
-; VSX-NEXT:  .LBB1_3: # %entry
-; VSX-NEXT:    mffprwz 4, 3
-; VSX-NEXT:    cmpwi 3, 0
-; VSX-NEXT:    bc 12, 2, .LBB1_5
-; VSX-NEXT:  # %bb.4: # %entry
-; VSX-NEXT:    fmr 1, 0
-; VSX-NEXT:  .LBB1_5: # %entry
-; VSX-NEXT:    cmpwi 4, 0
-; VSX-NEXT:    bc 12, 2, .LBB1_7
-; VSX-NEXT:  # %bb.6: # %entry
-; VSX-NEXT:    fmr 2, 1
-; VSX-NEXT:  .LBB1_7: # %entry
-; VSX-NEXT:    xxlxor 1, 1, 1
-; VSX-NEXT:    fcmpu 0, 0, 1
-; VSX-NEXT:    bc 12, 2, .LBB1_9
-; VSX-NEXT:  # %bb.8: # %entry
-; VSX-NEXT:    fmr 2, 0
-; VSX-NEXT:  .LBB1_9: # %entry
-; VSX-NEXT:    fmr 1, 2
+; VSX-NEXT:    addis 3, 2, .LCPI1_0@toc@ha
+; VSX-NEXT:    lfs 1, .LCPI1_0@toc@l(3)
 ; VSX-NEXT:    blr
 ;
 ; AIX-LABEL: f32_maximum:
 ; AIX:       # %bb.0: # %entry
-; AIX-NEXT:    xscvdpspn 0, 1
 ; AIX-NEXT:    fcmpu 0, 1, 2
-; AIX-NEXT:    xscvdpspn 3, 2
-; AIX-NEXT:    mffprwz 3, 0
 ; AIX-NEXT:    bc 12, 3, L..BB1_2
 ; AIX-NEXT:  # %bb.1: # %entry
-; AIX-NEXT:    xsmaxdp 0, 1, 2
-; AIX-NEXT:    b L..BB1_3
+; AIX-NEXT:    xsmaxdp 1, 1, 2
+; AIX-NEXT:    blr
 ; AIX-NEXT:  L..BB1_2:
-; AIX-NEXT:    ld 4, L..C1(2) # %const.0
-; AIX-NEXT:    lfs 0, 0(4)
-; AIX-NEXT:  L..BB1_3: # %entry
-; AIX-NEXT:    mffprwz 4, 3
-; AIX-NEXT:    cmpwi 3, 0
-; AIX-NEXT:    bc 12, 2, L..BB1_5
-; AIX-NEXT:  # %bb.4: # %entry
-; AIX-NEXT:    fmr 1, 0
-; AIX-NEXT:  L..BB1_5: # %entry
-; AIX-NEXT:    cmpwi 4, 0
-; AIX-NEXT:    bc 12, 2, L..BB1_7
-; AIX-NEXT:  # %bb.6: # %entry
-; AIX-NEXT:    fmr 2, 1
-; AIX-NEXT:  L..BB1_7: # %entry
-; AIX-NEXT:    xxlxor 1, 1, 1
-; AIX-NEXT:    fcmpu 0, 0, 1
-; AIX-NEXT:    bc 12, 2, L..BB1_9
-; AIX-NEXT:  # %bb.8: # %entry
-; AIX-NEXT:    fmr 2, 0
-; AIX-NEXT:  L..BB1_9: # %entry
-; AIX-NEXT:    fmr 1, 2
+; AIX-NEXT:    ld 3, L..C1(2) # %const.0
+; AIX-NEXT:    lfs 1, 0(3)
 ; AIX-NEXT:    blr
 entry:
   %m = call float @llvm.maximum.f32(float %a, float %b)
@@ -272,69 +180,25 @@ define double @f64_minimum(double %a, double %b) {
 ; VSX-LABEL: f64_minimum:
 ; VSX:       # %bb.0: # %entry
 ; VSX-NEXT:    fcmpu 0, 1, 2
-; VSX-NEXT:    mffprd 3, 1
 ; VSX-NEXT:    bc 12, 3, .LBB2_2
 ; VSX-NEXT:  # %bb.1: # %entry
-; VSX-NEXT:    xsmindp 0, 1, 2
-; VSX-NEXT:    b .LBB2_3
+; VSX-NEXT:    xsmindp 1, 1, 2
+; VSX-NEXT:    blr
 ; VSX-NEXT:  .LBB2_2:
-; VSX-NEXT:    addis 4, 2, .LCPI2_0@toc@ha
-; VSX-NEXT:    lfs 0, .LCPI2_0@toc@l(4)
-; VSX-NEXT:  .LBB2_3: # %entry
-; VSX-NEXT:    li 5, 1
-; VSX-NEXT:    mffprd 4, 2
-; VSX-NEXT:    rldic 5, 5, 63, 0
-; VSX-NEXT:    cmpd 3, 5
-; VSX-NEXT:    bc 12, 2, .LBB2_5
-; VSX-NEXT:  # %bb.4: # %entry
-; VSX-NEXT:    fmr 1, 0
-; VSX-NEXT:  .LBB2_5: # %entry
-; VSX-NEXT:    cmpd 4, 5
-; VSX-NEXT:    bc 12, 2, .LBB2_7
-; VSX-NEXT:  # %bb.6: # %entry
-; VSX-NEXT:    fmr 2, 1
-; VSX-NEXT:  .LBB2_7: # %entry
-; VSX-NEXT:    xxlxor 1, 1, 1
-; VSX-NEXT:    fcmpu 0, 0, 1
-; VSX-NEXT:    bc 12, 2, .LBB2_9
-; VSX-NEXT:  # %bb.8: # %entry
-; VSX-NEXT:    fmr 2, 0
-; VSX-NEXT:  .LBB2_9: # %entry
-; VSX-NEXT:    fmr 1, 2
+; VSX-NEXT:    addis 3, 2, .LCPI2_0@toc@ha
+; VSX-NEXT:    lfs 1, .LCPI2_0@toc@l(3)
 ; VSX-NEXT:    blr
 ;
 ; AIX-LABEL: f64_minimum:
 ; AIX:       # %bb.0: # %entry
 ; AIX-NEXT:    fcmpu 0, 1, 2
-; AIX-NEXT:    mffprd 3, 1
 ; AIX-NEXT:    bc 12, 3, L..BB2_2
 ; AIX-NEXT:  # %bb.1: # %entry
-; AIX-NEXT:    xsmindp 0, 1, 2
-; AIX-NEXT:    b L..BB2_3
+; AIX-NEXT:    xsmindp 1, 1, 2
+; AIX-NEXT:    blr
 ; AIX-NEXT:  L..BB2_2:
-; AIX-NEXT:    ld 4, L..C2(2) # %const.0
-; AIX-NEXT:    lfs 0, 0(4)
-; AIX-NEXT:  L..BB2_3: # %entry
-; AIX-NEXT:    li 5, 1
-; AIX-NEXT:    mffprd 4, 2
-; AIX-NEXT:    rldic 5, 5, 63, 0
-; AIX-NEXT:    cmpd 3, 5
-; AIX-NEXT:    bc 12, 2, L..BB2_5
-; AIX-NEXT:  # %bb.4: # %entry
-; AIX-NEXT:    fmr 1, 0
-; AIX-NEXT:  L..BB2_5: # %entry
-; AIX-NEXT:    cmpd 4, 5
-; AIX-NEXT:    bc 12, 2, L..BB2_7
-; AIX-NEXT:  # %bb.6: # %entry
-; AIX-NEXT:    fmr 2, 1
-; AIX-NEXT:  L..BB2_7: # %entry
-; AIX-NEXT:    xxlxor 1, 1, 1
-; AIX-NEXT:    fcmpu 0, 0, 1
-; AIX-NEXT:    bc 12, 2, L..BB2_9
-; AIX-NEXT:  # %bb.8: # %entry
-; AIX-NEXT:    fmr 2, 0
-; AIX-NEXT:  L..BB2_9: # %entry
-; AIX-NEXT:    fmr 1, 2
+; AIX-NEXT:    ld 3, L..C2(2) # %const.0
+; AIX-NEXT:    lfs 1, 0(3)
 ; AIX-NEXT:    blr
 entry:
   %m = call double @llvm.minimum.f64(double %a, double %b)
@@ -382,65 +246,25 @@ define double @f64_maximum(double %a, double %b) {
 ; VSX-LABEL: f64_maximum:
 ; VSX:       # %bb.0: # %entry
 ; VSX-NEXT:    fcmpu 0, 1, 2
-; VSX-NEXT:    mffprd 3, 1
 ; VSX-NEXT:    bc 12, 3, .LBB3_2
 ; VSX-NEXT:  # %bb.1: # %entry
-; VSX-NEXT:    xsmaxdp 0, 1, 2
-; VSX-NEXT:    b .LBB3_3
+; VSX-NEXT:    xsmaxdp 1, 1, 2
+; VSX-NEXT:    blr
 ; VSX-NEXT:  .LBB3_2:
-; VSX-NEXT:    addis 4, 2, .LCPI3_0@toc@ha
-; VSX-NEXT:    lfs 0, .LCPI3_0@toc@l(4)
-; VSX-NEXT:  .LBB3_3: # %entry
-; VSX-NEXT:    mffprd 4, 2
-; VSX-NEXT:    cmpdi 3, 0
-; VSX-NEXT:    bc 12, 2, .LBB3_5
-; VSX-NEXT:  # %bb.4: # %entry
-; VSX-NEXT:    fmr 1, 0
-; VSX-NEXT:  .LBB3_5: # %entry
-; VSX-NEXT:    cmpdi 4, 0
-; VSX-NEXT:    bc 12, 2, .LBB3_7
-; VSX-NEXT:  # %bb.6: # %entry
-; VSX-NEXT:    fmr 2, 1
-; VSX-NEXT:  .LBB3_7: # %entry
-; VSX-NEXT:    xxlxor 1, 1, 1
-; VSX-NEXT:    fcmpu 0, 0, 1
-; VSX-NEXT:    bc 12, 2, .LBB3_9
-; VSX-NEXT:  # %bb.8: # %entry
-; VSX-NEXT:    fmr 2, 0
-; VSX-NEXT:  .LBB3_9: # %entry
-; VSX-NEXT:    fmr 1, 2
+; VSX-NEXT:    addis 3, 2, .LCPI3_0@toc@ha
+; VSX-NEXT:    lfs 1, .LCPI3_0@toc@l(3)
 ; VSX-NEXT:    blr
 ;
 ; AIX-LABEL: f64_maximum:
 ; AIX:       # %bb.0: # %entry
 ; AIX-NEXT:    fcmpu 0, 1, 2
-; AIX-NEXT:    mffprd 3, 1
 ; AIX-NEXT:    bc 12, 3, L..BB3_2
 ; AIX-NEXT:  # %bb.1: # %entry
-; AIX-NEXT:    xsmaxdp 0, 1, 2
-; AIX-NEXT:    b L..BB3_3
+; AIX-NEXT:    xsmaxdp 1, 1, 2
+; AIX-NEXT:    blr
 ; AIX-NEXT:  L..BB3_2:
-; AIX-NEXT:    ld 4, L..C3(2) # %const.0
-; AIX-NEXT:    lfs 0, 0(4)
-; AIX-NEXT:  L..BB3_3: # %entry
-; AIX-NEXT:    mffprd 4, 2
-; AIX-NEXT:    cmpdi 3, 0
-; AIX-NEXT:    bc 12, 2, L..BB3_5
-; AIX-NEXT:  # %bb.4: # %entry
-; AIX-NEXT:    fmr 1, 0
-; AIX-NEXT:  L..BB3_5: # %entry
-; AIX-NEXT:    cmpdi 4, 0
-; AIX-NEXT:    bc 12, 2, L..BB3_7
-; AIX-NEXT:  # %bb.6: # %entry
-; AIX-NEXT:    fmr 2, 1
-; AIX-NEXT:  L..BB3_7: # %entry
-; AIX-NEXT:    xxlxor 1, 1, 1
-; AIX-NEXT:    fcmpu 0, 0, 1
-; AIX-NEXT:    bc 12, 2, L..BB3_9
-; AIX-NEXT:  # %bb.8: # %entry
-; AIX-NEXT:    fmr 2, 0
-; AIX-NEXT:  L..BB3_9: # %entry
-; AIX-NEXT:    fmr 1, 2
+; AIX-NEXT:    ld 3, L..C3(2) # %const.0
+; AIX-NEXT:    lfs 1, 0(3)
 ; AIX-NEXT:    blr
 entry:
   %m = call double @llvm.maximum.f64(double %a, double %b)
diff --git a/llvm/test/CodeGen/PowerPC/merge-string-used-by-metadata.mir b/llvm/test/CodeGen/PowerPC/merge-string-used-by-metadata.mir
index 2a791966be4e..4a40974a2a22 100644
--- a/llvm/test/CodeGen/PowerPC/merge-string-used-by-metadata.mir
+++ b/llvm/test/CodeGen/PowerPC/merge-string-used-by-metadata.mir
@@ -14,16 +14,14 @@
 
   define noundef ptr @func1(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 !dbg !6 {
   ; CHECK-LABEL: func1
-  ; CHECK:       %0 = getelementptr { [7 x i8], [7 x i8] }, ptr @__ModuleStringPool, i32 0, i32 1
-  ; CHECK-NEXT:  ret ptr %0, !dbg !14
+  ; CHECK:       ret ptr getelementptr inbounds ({ [7 x i8], [7 x i8] }, ptr @__ModuleStringPool, i32 0, i32 1), !dbg !14
   entry:
     ret ptr @const.2, !dbg !14
   }
 
   define noundef ptr @func2(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 {
   ; CHECK-LABEL: func2
-  ; CHECK:       %0 = getelementptr { [7 x i8], [7 x i8] }, ptr @__ModuleStringPool, i32 0, i32 0
-  ; CHECK-NEXT:  ret ptr %0
+  ; CHECK:       ret ptr @__ModuleStringPool
   entry:
     ret ptr @const.1
   }
diff --git a/llvm/test/CodeGen/PowerPC/mergeable-string-pool-exceptions.ll b/llvm/test/CodeGen/PowerPC/mergeable-string-pool-exceptions.ll
new file mode 100644
index 000000000000..03a830e087d2
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/mergeable-string-pool-exceptions.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=ppc64le-unknown-linux-gnu < %s | FileCheck %s
+
+@id = private unnamed_addr constant [4 x i8] c"@id\00", align 1
+@id2 = private unnamed_addr constant [5 x i8] c"@id2\00", align 1
+
+; Higher-aligned dummy to make sure it is first in the string pool.
+@dummy = private unnamed_addr constant [1 x i32] [i32 42], align 4
+
+define ptr @test1() personality ptr @__gnu_objc_personality_v0 {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr 0
+; CHECK-NEXT:    stdu 1, -32(1)
+; CHECK-NEXT:    std 0, 48(1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    addis 3, 2, .Ldummy@toc@ha
+; CHECK-NEXT:    addi 3, 3, .Ldummy@toc@l
+; CHECK-NEXT:    bl foo
+; CHECK-NEXT:    nop
+  invoke void @foo(ptr @dummy)
+          to label %cont unwind label %unwind
+
+cont:
+  unreachable
+
+unwind:
+  %lp = landingpad { ptr, i32 }
+          catch ptr @id
+  resume { ptr, i32 } %lp
+}
+
+define i32 @test2() personality ptr @__gnu_objc_personality_v0 {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li 3, 1
+; CHECK-NEXT:    blr
+  %id = tail call i32 @llvm.eh.typeid.for(ptr @id2)
+  ret i32 %id
+}
+
+declare i32 @__gnu_objc_personality_v0(...)
+
+declare i32 @llvm.eh.typeid.for(ptr)
+
+declare void @foo()
diff --git a/llvm/test/CodeGen/PowerPC/mergeable-string-pool-pass-only.mir b/llvm/test/CodeGen/PowerPC/mergeable-string-pool-pass-only.mir
index e2fb0ced8f34..3d8afb604fd3 100644
--- a/llvm/test/CodeGen/PowerPC/mergeable-string-pool-pass-only.mir
+++ b/llvm/test/CodeGen/PowerPC/mergeable-string-pool-pass-only.mir
@@ -35,8 +35,7 @@
     ret i32 %call
 
   ; CHECK-LABEL: test1
-  ; CHECK:         %0 = getelementptr { [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 6
-  ; CHECK:         tail call signext i32 @calleeStr
+  ; CHECK:         %call = tail call signext i32 @calleeStr(ptr noundef nonnull getelementptr inbounds ({ [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 6))
   }
 
   define dso_local signext i32 @test2() local_unnamed_addr #0 {
@@ -49,7 +48,7 @@
     ret i32 %call
 
   ; CHECK-LABEL: test2
-  ; CHECK:         %0 = getelementptr { [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 2
+  ; CHECK:         call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(24) %A, ptr noundef nonnull align 4 dereferenceable(24) getelementptr inbounds ({ [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 2), i64 24, i1 false)
   ; CHECK:         call signext i32 @calleeInt
   }
 
@@ -62,7 +61,7 @@
     call void @llvm.lifetime.end.p0(i64 28, ptr nonnull %A) #0
     ret i32 %call
   ; CHECK-LABEL: test3
-  ; CHECK:         %0 = getelementptr { [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 4
+  ; CHECK:         call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(28) %A, ptr noundef nonnull align 4 dereferenceable(28) getelementptr inbounds ({ [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 4), i64 28, i1 false)
   ; CHECK:         call signext i32 @calleeFloat
   }
 
@@ -75,7 +74,7 @@
     call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %A) #0
     ret i32 %call
   ; CHECK-LABEL: test4
-  ; CHECK:         %0 = getelementptr { [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 0
+  ; CHECK:         call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(56) %A, ptr noundef nonnull align 8 dereferenceable(56) @__ModuleStringPool, i64 56, i1 false)
   ; CHECK:         call signext i32 @calleeDouble
   }
 
@@ -102,11 +101,10 @@
     call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %B) #0
     ret i32 %add7
   ; CHECK-LABEL: test5
-  ; CHECK:         %0 = getelementptr { [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 3
-  ; CHECK:         %1 = getelementptr { [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 5
-  ; CHECK:         %2 = getelementptr { [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 1
-  ; CHECK:         %3 = getelementptr { [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 7
-  ; CHECK:         call signext i32 @calleeStr
+  ; CHECK:         call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(24) %B, ptr noundef nonnull align 4 dereferenceable(24) getelementptr inbounds ({ [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 3), i64 24, i1 false)
+  ; CHECK:         call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(28) %C, ptr noundef nonnull align 4 dereferenceable(28) getelementptr inbounds ({ [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 5), i64 28, i1 false)
+  ; CHECK:         call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(56) %D, ptr noundef nonnull align 8 dereferenceable(56) getelementptr inbounds ({ [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 1), i64 56, i1 false)
+  ; CHECK:         call signext i32 @calleeStr(ptr noundef nonnull getelementptr inbounds ({ [7 x double], [7 x double], [6 x i32], [6 x i32], [7 x float], [7 x float], [8 x i8], [16 x i8] }, ptr @__ModuleStringPool, i32 0, i32 7))
   ; CHECK:         call signext i32 @calleeInt
   ; CHECK:         call signext i32 @calleeFloat
   ; CHECK:         call signext i32 @calleeDouble
diff --git a/llvm/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll b/llvm/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
index f22aeffdbb46..412cb758ad60 100644
--- a/llvm/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
@@ -31,7 +31,7 @@
 ; After the prologue is set.
 ; DISABLE: cmpw 3, 4
 ; DISABLE-32: stw 0,
-; DISABLE-64-AIX: std 0, 
+; DISABLE-64-AIX: std 0,
 ; DISABLE-NEXT: bge 0, {{.*}}[[EXIT_LABEL:BB[0-9_]+]]
 ;
 ; Store %a on the stack
@@ -421,14 +421,14 @@ entry:
 ; ENABLE-NEXT: beq 0, {{.*}}[[ELSE_LABEL:BB[0-9_]+]]
 ;
 ; Prologue code.
-; Make sure we save the CSR used in the inline asm: r14
+; Make sure we save the CSR used in the inline asm: r31
 ; ENABLE-DAG: li [[IV:[0-9]+]], 10
-; ENABLE-64-DAG: std 14, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
-; ENABLE-32-DAG: stw 14, -[[STACK_OFFSET:[0-9]+]](1) # 4-byte Folded Spill
+; ENABLE-64-DAG: std 31, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
+; ENABLE-32-DAG: stw 31, -[[STACK_OFFSET:[0-9]+]](1) # 4-byte Folded Spill
 ;
 ; DISABLE: cmplwi 3, 0
-; DISABLE-64-NEXT: std 14, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
-; DISABLE-32-NEXT: stw 14, -[[STACK_OFFSET:[0-9]+]](1) # 4-byte Folded Spill
+; DISABLE-64-NEXT: std 31, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
+; DISABLE-32-NEXT: stw 31, -[[STACK_OFFSET:[0-9]+]](1) # 4-byte Folded Spill
 ; DISABLE-NEXT: beq 0, {{.*}}[[ELSE_LABEL:BB[0-9_]+]]
 ; DISABLE: li [[IV:[0-9]+]], 10
 ;
@@ -437,20 +437,20 @@ entry:
 ;
 ; CHECK: {{.*}}[[LOOP_LABEL:BB[0-9_]+]]: # %for.body
 ; Inline asm statement.
-; CHECK: addi 14, 14, 1
+; CHECK: addi 31, 14, 1
 ; CHECK: bdnz {{.*}}[[LOOP_LABEL]]
 ;
 ; Epilogue code.
 ; CHECK: li 3, 0
-; CHECK-64-DAG: ld 14, -[[STACK_OFFSET]](1) # 8-byte Folded Reload
-; CHECK-32-DAG: lwz 14, -[[STACK_OFFSET]](1) # 4-byte Folded Reload
+; CHECK-64-DAG: ld 31, -[[STACK_OFFSET]](1) # 8-byte Folded Reload
+; CHECK-32-DAG: lwz 31, -[[STACK_OFFSET]](1) # 4-byte Folded Reload
 ; CHECK-DAG: nop
 ; CHECK: blr
 ;
 ; CHECK: [[ELSE_LABEL]]
 ; CHECK-NEXT: slwi 3, 4, 1
-; DISABLE-64-NEXT: ld 14, -[[STACK_OFFSET]](1) # 8-byte Folded Reload
-; DISABLE-32-NEXT: lwz 14, -[[STACK_OFFSET]](1) # 4-byte Folded Reload
+; DISABLE-64-NEXT: ld 31, -[[STACK_OFFSET]](1) # 8-byte Folded Reload
+; DISABLE-32-NEXT: lwz 31, -[[STACK_OFFSET]](1) # 4-byte Folded Reload
 ; CHECK-NEXT: blr
 define i32 @inlineAsm(i32 %cond, i32 %N) {
 entry:
@@ -463,7 +463,7 @@ for.preheader:
 
 for.body:                                         ; preds = %entry, %for.body
   %i.03 = phi i32 [ %inc, %for.body ], [ 0, %for.preheader ]
-  tail call void asm "addi 14, 14, 1", "~{r14}"()
+  tail call void asm "addi 31, 14, 1", "~{r31}"()
   %inc = add nuw nsw i32 %i.03, 1
   %exitcond = icmp eq i32 %inc, 10
   br i1 %exitcond, label %for.exit, label %for.body
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-crsave.mir b/llvm/test/CodeGen/PowerPC/ppc64-crsave.mir
index f4af2ad21a56..196ad134bfa5 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-crsave.mir
+++ b/llvm/test/CodeGen/PowerPC/ppc64-crsave.mir
@@ -1,15 +1,15 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple powerpc64le-unknown-linux-gnu -x mir -mcpu=pwr8 -mattr=-altivec \
-# RUN: -run-pass=prologepilog --verify-machineinstrs < %s | \
-# RUN: FileCheck %s --check-prefixes=CHECK,SAVEONE
+# RUN: -run-pass=prologepilog --verify-machineinstrs %s -o - | \
+# RUN: FileCheck %s --check-prefix=SAVEONE
 
 # RUN: llc -mtriple powerpc64-unknown-linux-gnu -x mir -mcpu=pwr7 -mattr=-altivec \
-# RUN: -run-pass=prologepilog --verify-machineinstrs < %s | \
-# RUN: FileCheck %s --check-prefixes=CHECK,SAVEALL
-
+# RUN: -run-pass=prologepilog --verify-machineinstrs %s -o - | \
+# RUN: FileCheck %s --check-prefix=SAVEALL
 
 # RUN: llc -mtriple powerpc64-unknown-aix-xcoff -x mir -mcpu=pwr4 -mattr=-altivec \
-# RUN: -run-pass=prologepilog --verify-machineinstrs < %s | \
-# RUN: FileCheck %s --check-prefixes=CHECK,SAVEALL
+# RUN: -run-pass=prologepilog --verify-machineinstrs %s -o - | \
+# RUN: FileCheck %s --check-prefix=SAVEALL
 
 ---
 name:            CRAllSave
@@ -20,33 +20,39 @@ liveins:
 body:             |
   bb.0.entry:
     liveins: $x3
-    renamable $x29 = ANDI8_rec killed renamable $x3, 1, implicit-def dead $cr0, implicit-def $cr0gt
+    ; SAVEONE-LABEL: name: CRAllSave
+    ; SAVEONE: liveins: $x3, $cr2, $cr4
+    ; SAVEONE-NEXT: {{  $}}
+    ; SAVEONE-NEXT: $x12 = MFCR8 implicit killed $cr2, implicit killed $cr4
+    ; SAVEONE-NEXT: STW8 killed $x12, 8, $x1
+    ; SAVEONE-NEXT: renamable $x3 = ANDI8_rec killed renamable $x3, 1, implicit-def dead $cr0, implicit-def $cr0gt
+    ; SAVEONE-NEXT: renamable $cr2lt = COPY $cr0gt
+    ; SAVEONE-NEXT: renamable $cr4lt = COPY $cr0gt
+    ; SAVEONE-NEXT: $x12 = LWZ8 8, $x1
+    ; SAVEONE-NEXT: $cr2 = MTOCRF8 $x12
+    ; SAVEONE-NEXT: $cr4 = MTOCRF8 killed $x12
+    ; SAVEONE-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3
+    ;
+    ; SAVEALL-LABEL: name: CRAllSave
+    ; SAVEALL: liveins: $x3, $cr2, $cr4
+    ; SAVEALL-NEXT: {{  $}}
+    ; SAVEALL-NEXT: $x12 = MFCR8 implicit killed $cr2, implicit killed $cr4
+    ; SAVEALL-NEXT: STW8 killed $x12, 8, $x1
+    ; SAVEALL-NEXT: renamable $x3 = ANDI8_rec killed renamable $x3, 1, implicit-def dead $cr0, implicit-def $cr0gt
+    ; SAVEALL-NEXT: renamable $cr2lt = COPY $cr0gt
+    ; SAVEALL-NEXT: renamable $cr4lt = COPY $cr0gt
+    ; SAVEALL-NEXT: $x12 = LWZ8 8, $x1
+    ; SAVEALL-NEXT: $cr2 = MTOCRF8 $x12
+    ; SAVEALL-NEXT: $cr4 = MTOCRF8 killed $x12
+    ; SAVEALL-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3
+    renamable $x3 = ANDI8_rec killed renamable $x3, 1, implicit-def dead $cr0, implicit-def $cr0gt
     renamable $cr2lt = COPY $cr0gt
     renamable $cr4lt = COPY $cr0gt
-    renamable $x3 = COPY $x29
     BLR8 implicit $lr8, implicit $rm, implicit $x3
 
-    ; CHECK-LABEL: fixedStack:
-    ; CHECK-NEXT:     - { id: 0, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: default,
-    ; CHECK-NEXT:         callee-saved-register: '$x29', callee-saved-restored: true, debug-info-variable: '',
-    ; CHECK-NEXT:         debug-info-expression: '', debug-info-location: '' }
-    ; CHECK-NEXT:     - { id: 1, type: default, offset: 8, size: 4, alignment: 8, stack-id: default,
-    ; CHECK-NEXT:         isImmutable: true, isAliased: false, callee-saved-register: '$cr4',
-    ; CHECK-NEXT:         callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
-    ; CHECK-NEXT:         debug-info-location: '' }
-    ; CHECK-LABEL:  stack:
 
-    ; Verify the proper live-ins have been added in the prologue.
-    ; CHECK:    liveins: $x3, $x29, $cr2, $cr4
 
-    ; CHECK:     $x12 = MFCR8 implicit killed $cr2, implicit killed $cr4
-    ; CHECK-DAG: STD killed $x29, -24, $x1 :: (store (s64) into %fixed-stack.0)
-    ; CHECK-DAG: STW8 killed $x12, 8, $x1
 
-    ; CHECK:     $x29 = LD -24, $x1 :: (load (s64) from %fixed-stack.0)
-    ; CHECK:     $x12 = LWZ8 8, $x1
-    ; CHECK:     $cr2 = MTOCRF8 $x12
-    ; CHECK:     $cr4 = MTOCRF8 killed $x12
 
 ...
 ---
@@ -58,37 +64,36 @@ liveins:
 body:             |
   bb.0.entry:
     liveins: $x3
-    renamable $x14 = ANDI8_rec killed renamable $x3, 1, implicit-def dead $cr0, implicit-def $cr0gt
+    ; SAVEONE-LABEL: name: CR2Save
+    ; SAVEONE: liveins: $x3, $cr2
+    ; SAVEONE-NEXT: {{  $}}
+    ; SAVEONE-NEXT: $x12 = MFOCRF8 killed $cr2
+    ; SAVEONE-NEXT: STW8 killed $x12, 8, $x1
+    ; SAVEONE-NEXT: renamable $x3 = ANDI8_rec killed renamable $x3, 1, implicit-def dead $cr0, implicit-def $cr0gt
+    ; SAVEONE-NEXT: renamable $cr2lt = COPY $cr0gt
+    ; SAVEONE-NEXT: $x12 = LWZ8 8, $x1
+    ; SAVEONE-NEXT: $cr2 = MTOCRF8 killed $x12
+    ; SAVEONE-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3
+    ;
+    ; SAVEALL-LABEL: name: CR2Save
+    ; SAVEALL: liveins: $x3, $cr2
+    ; SAVEALL-NEXT: {{  $}}
+    ; SAVEALL-NEXT: $x12 = MFCR8 implicit killed $cr2
+    ; SAVEALL-NEXT: STW8 killed $x12, 8, $x1
+    ; SAVEALL-NEXT: renamable $x3 = ANDI8_rec killed renamable $x3, 1, implicit-def dead $cr0, implicit-def $cr0gt
+    ; SAVEALL-NEXT: renamable $cr2lt = COPY $cr0gt
+    ; SAVEALL-NEXT: $x12 = LWZ8 8, $x1
+    ; SAVEALL-NEXT: $cr2 = MTOCRF8 killed $x12
+    ; SAVEALL-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3
+    renamable $x3 = ANDI8_rec killed renamable $x3, 1, implicit-def dead $cr0, implicit-def $cr0gt
     renamable $cr2lt = COPY $cr0gt
-    renamable $x3 = COPY $x14
     BLR8 implicit $lr8, implicit $rm, implicit $x3
 
-    ; CHECK-LABEL: CR2Save
 
-    ; CHECK-LABEL: fixedStack:
-    ; CHECK-NEXT:   - { id: 0, type: spill-slot, offset: -144, size: 8, alignment: 16, stack-id: default,
-    ; CHECK-NEXT:       callee-saved-register: '$x14', callee-saved-restored: true, debug-info-variable: '',
-    ; CHECK-NEXT:       debug-info-expression: '', debug-info-location: '' }
-    ; CHECK-NEXT:   - { id: 1, type: default, offset: 8, size: 4, alignment: 8, stack-id: default,
-    ; CHECK-NEXT:       isImmutable: true, isAliased: false, callee-saved-register: '$cr2',
-    ; CHECK-NEXT:       callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
-    ; CHECK-NEXT:       debug-info-location: '' }
-    ; CHECK-LABEL:  stack:
 
-    ; Verify the proper live-ins have been added in the prologue.
-    ; CHECK:    liveins: $x3, $x14, $cr2
 
-    ; ELF V2 ABI allows saving only the clobbered cr fields,
-    ; whereas the other ABIs do not.
-    ; SAVEONE:     $x12 = MFOCRF8 killed $cr2
-    ; SAVEALL:     $x12 = MFCR8 implicit killed $cr2
 
-    ; CHECK-DAG: STD killed $x14, -144, $x1 :: (store (s64) into %fixed-stack.0, align 16)
-    ; CHECK-DAG: STW8 killed $x12, 8, $x1
 
-    ; CHECK:     $x14 = LD -144, $x1 :: (load (s64) from %fixed-stack.0, align 16)
-    ; CHECK:     $x12 = LWZ8 8, $x1
-    ; CHECK:     $cr2 = MTOCRF8 killed $x12
 
 
 ...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/shift-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/shift-rv32.mir
index 7d6c228c8086..4d0b5c2a2c86 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/shift-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/shift-rv32.mir
@@ -188,3 +188,59 @@ body:             |
     $x10 = COPY %4(s32)
     PseudoRET implicit $x10
 ...
+
+---
+name:            srl_and_needed
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: srl_and_needed
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+    ; CHECK-NEXT: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY]], 15
+    ; CHECK-NEXT: [[SRL:%[0-9]+]]:gpr = SRL [[COPY1]], [[ANDI]]
+    ; CHECK-NEXT: $x10 = COPY [[SRL]]
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:gprb(s32) = COPY $x10
+    %1:gprb(s32) = COPY $x11
+    %2:gprb(s32) = G_CONSTANT i32 15
+    %3:gprb(s32) = G_AND %0, %2
+    %4:gprb(s32) = G_LSHR %1, %3(s32)
+    $x10 = COPY %4(s32)
+    PseudoRET implicit $x10
+...
+
+---
+name:            srl_and_eliminated
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: srl_and_eliminated
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+    ; CHECK-NEXT: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY]], 47
+    ; CHECK-NEXT: [[SRL:%[0-9]+]]:gpr = SRL [[COPY1]], [[ANDI]]
+    ; CHECK-NEXT: $x10 = COPY [[SRL]]
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:gprb(s32) = COPY $x10
+    %1:gprb(s32) = COPY $x11
+    %2:gprb(s32) = G_CONSTANT i32 15
+    %3:gprb(s32) = G_CONSTANT i32 47
+    %4:gprb(s32) = G_AND %0, %3
+    %5:gprb(s32) = G_AND %4, %2
+    %6:gprb(s32) = G_LSHR %1, %5(s32)
+    $x10 = COPY %6(s32)
+    PseudoRET implicit $x10
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/shift-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/shift-rv64.mir
index 1e6890098498..5e2c60323fcb 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/shift-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/shift-rv64.mir
@@ -241,3 +241,63 @@ body:             |
     $x10 = COPY %6(s64)
     PseudoRET implicit $x10
 ...
+
+---
+name:            srl_and_needed
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: srl_and_needed
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+    ; CHECK-NEXT: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY]], 15
+    ; CHECK-NEXT: [[SRL:%[0-9]+]]:gpr = SRL [[COPY1]], [[ANDI]]
+    ; CHECK-NEXT: $x10 = COPY [[SRL]]
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:gprb(s64) = COPY $x10
+    %1:gprb(s64) = COPY $x11
+    %2:gprb(s32) = G_CONSTANT i32 15
+    %3:gprb(s32) = G_TRUNC %0(s64)
+    %4:gprb(s32) = G_AND %3, %2
+    %5:gprb(s64) = nneg G_ZEXT %4(s32)
+    %6:gprb(s64) = G_LSHR %1, %5(s64)
+    $x10 = COPY %6(s64)
+    PseudoRET implicit $x10
+...
+
+---
+name:            srl_and_eliminated
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    liveins: $x10, $x11
+
+    ; CHECK-LABEL: name: srl_and_eliminated
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+    ; CHECK-NEXT: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY]], 79
+    ; CHECK-NEXT: [[SRL:%[0-9]+]]:gpr = SRL [[COPY1]], [[ANDI]]
+    ; CHECK-NEXT: $x10 = COPY [[SRL]]
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:gprb(s64) = COPY $x10
+    %1:gprb(s64) = COPY $x11
+    %2:gprb(s32) = G_CONSTANT i32 15
+    %3:gprb(s32) = G_TRUNC %0(s64)
+    %7:gprb(s32) = G_CONSTANT i32 79
+    %8:gprb(s32) = G_AND %3, %7
+    %4:gprb(s32) = G_AND %8, %2
+    %5:gprb(s64) = nneg G_ZEXT %4(s32)
+    %6:gprb(s64) = G_LSHR %1, %5(s64)
+    $x10 = COPY %6(s64)
+    PseudoRET implicit $x10
+...
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
index 56bd4bd0c08f..c4a7f9562534 100644
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -40,9 +40,9 @@
 ; CHECK-NEXT:       Finalize ISel and expand pseudo-instructions
 ; CHECK-NEXT:       Local Stack Slot Allocation
 ; CHECK-NEXT:       RISC-V Pre-RA pseudo instruction expansion pass
-; CHECK-NEXT:       RISC-V Insert VSETVLI pass
 ; CHECK-NEXT:       RISC-V Insert Read/Write CSR Pass
 ; CHECK-NEXT:       RISC-V Insert Write VXRM Pass
+; CHECK-NEXT:       RISC-V Insert VSETVLI pass
 ; CHECK-NEXT:       Init Undef Pass
 ; CHECK-NEXT:       Eliminate PHI nodes for register allocation
 ; CHECK-NEXT:       Two-Address instruction pass
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 4121d1110911..4a71d3276d26 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -115,10 +115,9 @@
 ; RV64-NEXT:        RISC-V Optimize W Instructions
 ; CHECK-NEXT:       RISC-V Pre-RA pseudo instruction expansion pass
 ; CHECK-NEXT:       RISC-V Merge Base Offset
-; CHECK-NEXT:       RISC-V Insert VSETVLI pass
-; CHECK-NEXT:       RISC-V Dead register definitions
 ; CHECK-NEXT:       RISC-V Insert Read/Write CSR Pass
 ; CHECK-NEXT:       RISC-V Insert Write VXRM Pass
+; CHECK-NEXT:       RISC-V Insert VSETVLI pass
 ; CHECK-NEXT:       Detect Dead Lanes
 ; CHECK-NEXT:       Init Undef Pass
 ; CHECK-NEXT:       Process Implicit Definitions
@@ -144,6 +143,7 @@
 ; CHECK-NEXT:       Greedy Register Allocator
 ; CHECK-NEXT:       Virtual Register Rewriter
 ; CHECK-NEXT:       RISC-V Coalesce VSETVLI pass
+; CHECK-NEXT:       RISC-V Dead register definitions
 ; CHECK-NEXT:       Virtual Register Map
 ; CHECK-NEXT:       Live Register Matrix
 ; CHECK-NEXT:       Greedy Register Allocator
diff --git a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
index 8fb251a75bd1..e2f7be2e6d7f 100644
--- a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
+++ b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
@@ -600,8 +600,9 @@ define i64 @add_mul_combine_infinite_loop(i64 %x) {
 ; RV32IMB-NEXT:    sh3add a1, a1, a2
 ; RV32IMB-NEXT:    sh1add a0, a0, a0
 ; RV32IMB-NEXT:    slli a2, a0, 3
-; RV32IMB-NEXT:    addi a0, a2, 2047
-; RV32IMB-NEXT:    addi a0, a0, 1
+; RV32IMB-NEXT:    li a3, 1
+; RV32IMB-NEXT:    slli a3, a3, 11
+; RV32IMB-NEXT:    sh3add a0, a0, a3
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
@@ -610,8 +611,8 @@ define i64 @add_mul_combine_infinite_loop(i64 %x) {
 ; RV64IMB:       # %bb.0:
 ; RV64IMB-NEXT:    addi a0, a0, 86
 ; RV64IMB-NEXT:    sh1add a0, a0, a0
-; RV64IMB-NEXT:    li a1, -16
-; RV64IMB-NEXT:    sh3add a0, a0, a1
+; RV64IMB-NEXT:    slli a0, a0, 3
+; RV64IMB-NEXT:    addi a0, a0, -16
 ; RV64IMB-NEXT:    ret
   %tmp0 = mul i64 %x, 24
   %tmp1 = add i64 %tmp0, 2048
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index 7bd3440c9dc0..8f49f6648ad2 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -44,6 +44,7 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+ssccptr %s -o - | FileCheck --check-prefixes=CHECK,RV32SSCCPTR %s
 ; RUN: llc -mtriple=riscv32 -mattr=+sscofpmf %s -o - | FileCheck --check-prefixes=CHECK,RV32SSCOFPMF %s
 ; RUN: llc -mtriple=riscv32 -mattr=+sscounterenw %s -o - | FileCheck --check-prefixes=CHECK,RV32SSCOUNTERENW %s
+; RUN: llc -mtriple=riscv32 -mattr=+smstateen %s -o - | FileCheck --check-prefixes=CHECK,RV32SMSTATEEN %s
 ; RUN: llc -mtriple=riscv32 -mattr=+ssstateen %s -o - | FileCheck --check-prefixes=CHECK,RV32SSSTATEEN %s
 ; RUN: llc -mtriple=riscv32 -mattr=+ssstrict %s -o - | FileCheck --check-prefixes=CHECK,RV32SSSTRICT %s
 ; RUN: llc -mtriple=riscv32 -mattr=+sstc %s -o - | FileCheck --check-prefixes=CHECK,RV32SSTC %s
@@ -170,6 +171,7 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+ssccptr %s -o - | FileCheck --check-prefixes=CHECK,RV64SSCCPTR %s
 ; RUN: llc -mtriple=riscv64 -mattr=+sscofpmf %s -o - | FileCheck --check-prefixes=CHECK,RV64SSCOFPMF %s
 ; RUN: llc -mtriple=riscv64 -mattr=+sscounterenw %s -o - | FileCheck --check-prefixes=CHECK,RV64SSCOUNTERENW %s
+; RUN: llc -mtriple=riscv64 -mattr=+smstateen %s -o - | FileCheck --check-prefixes=CHECK,RV64SMSTATEEN %s
 ; RUN: llc -mtriple=riscv64 -mattr=+ssstateen %s -o - | FileCheck --check-prefixes=CHECK,RV64SSSTATEEN %s
 ; RUN: llc -mtriple=riscv64 -mattr=+ssstrict %s -o - | FileCheck --check-prefixes=CHECK,RV64SSSTRICT %s
 ; RUN: llc -mtriple=riscv64 -mattr=+sstc %s -o - | FileCheck --check-prefixes=CHECK,RV64SSTC %s
@@ -314,6 +316,7 @@
 ; RV32SSCCPTR: .attribute 5, "rv32i2p1_ssccptr1p0"
 ; RV32SSCOFPMF: .attribute 5, "rv32i2p1_sscofpmf1p0"
 ; RV32SSCOUNTERENW: .attribute 5, "rv32i2p1_sscounterenw1p0"
+; RV32SMSTATEEN: .attribute 5, "rv32i2p1_smstateen1p0"
 ; RV32SSSTATEEN: .attribute 5, "rv32i2p1_ssstateen1p0"
 ; RV32SSSTRICT: .attribute 5, "rv32i2p1_ssstrict1p0"
 ; RV32SSTC: .attribute 5, "rv32i2p1_sstc1p0"
@@ -443,6 +446,7 @@
 ; RV64SSCCPTR: .attribute 5, "rv64i2p1_ssccptr1p0"
 ; RV64SSCOFPMF: .attribute 5, "rv64i2p1_sscofpmf1p0"
 ; RV64SSCOUNTERENW: .attribute 5, "rv64i2p1_sscounterenw1p0"
+; RV64SMSTATEEN: .attribute 5, "rv64i2p1_smstateen1p0"
 ; RV64SSSTATEEN: .attribute 5, "rv64i2p1_ssstateen1p0"
 ; RV64SSSTRICT: .attribute 5, "rv64i2p1_ssstrict1p0"
 ; RV64SSTC: .attribute 5, "rv64i2p1_sstc1p0"
diff --git a/llvm/test/CodeGen/RISCV/bitreverse-shift.ll b/llvm/test/CodeGen/RISCV/bitreverse-shift.ll
index b0281ba7d238..92610f22c4b7 100644
--- a/llvm/test/CodeGen/RISCV/bitreverse-shift.ll
+++ b/llvm/test/CodeGen/RISCV/bitreverse-shift.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+zbkb -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefixes=RV32ZBKB
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV32ZBKB
 ; RUN: llc -mtriple=riscv64 -mattr=+zbkb -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefixes=RV64ZBKB
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV64ZBKB
 
-; TODO: These tests can be optimised
+; These tests can be optimised
 ;       fold (bitreverse(srl (bitreverse c), x)) -> (shl c, x)
 ;       fold (bitreverse(shl (bitreverse c), x)) -> (srl c, x)
 
@@ -14,25 +14,10 @@ declare i32 @llvm.bitreverse.i32(i32)
 declare i64 @llvm.bitreverse.i64(i64)
 
 define i8 @test_bitreverse_srli_bitreverse_i8(i8 %a) nounwind {
-; RV32ZBKB-LABEL: test_bitreverse_srli_bitreverse_i8:
-; RV32ZBKB:       # %bb.0:
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a0, a0
-; RV32ZBKB-NEXT:    srli a0, a0, 27
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a0, a0
-; RV32ZBKB-NEXT:    srli a0, a0, 24
-; RV32ZBKB-NEXT:    ret
-;
-; RV64ZBKB-LABEL: test_bitreverse_srli_bitreverse_i8:
-; RV64ZBKB:       # %bb.0:
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
-; RV64ZBKB-NEXT:    srli a0, a0, 59
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
-; RV64ZBKB-NEXT:    srli a0, a0, 56
-; RV64ZBKB-NEXT:    ret
+; CHECK-LABEL: test_bitreverse_srli_bitreverse_i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    ret
     %1 = call i8 @llvm.bitreverse.i8(i8 %a)
     %2 = lshr i8 %1, 3
     %3 = call i8 @llvm.bitreverse.i8(i8 %2)
@@ -40,25 +25,10 @@ define i8 @test_bitreverse_srli_bitreverse_i8(i8 %a) nounwind {
 }
 
 define i16 @test_bitreverse_srli_bitreverse_i16(i16 %a) nounwind {
-; RV32ZBKB-LABEL: test_bitreverse_srli_bitreverse_i16:
-; RV32ZBKB:       # %bb.0:
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a0, a0
-; RV32ZBKB-NEXT:    srli a0, a0, 23
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a0, a0
-; RV32ZBKB-NEXT:    srli a0, a0, 16
-; RV32ZBKB-NEXT:    ret
-;
-; RV64ZBKB-LABEL: test_bitreverse_srli_bitreverse_i16:
-; RV64ZBKB:       # %bb.0:
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
-; RV64ZBKB-NEXT:    srli a0, a0, 55
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
-; RV64ZBKB-NEXT:    srli a0, a0, 48
-; RV64ZBKB-NEXT:    ret
+; CHECK-LABEL: test_bitreverse_srli_bitreverse_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 7
+; CHECK-NEXT:    ret
     %1 = call i16 @llvm.bitreverse.i16(i16 %a)
     %2 = lshr i16 %1, 7
     %3 = call i16 @llvm.bitreverse.i16(i16 %2)
@@ -68,21 +38,12 @@ define i16 @test_bitreverse_srli_bitreverse_i16(i16 %a) nounwind {
 define i32 @test_bitreverse_srli_bitreverse_i32(i32 %a) nounwind {
 ; RV32ZBKB-LABEL: test_bitreverse_srli_bitreverse_i32:
 ; RV32ZBKB:       # %bb.0:
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a0, a0
-; RV32ZBKB-NEXT:    srli a0, a0, 15
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a0, a0
+; RV32ZBKB-NEXT:    slli a0, a0, 15
 ; RV32ZBKB-NEXT:    ret
 ;
 ; RV64ZBKB-LABEL: test_bitreverse_srli_bitreverse_i32:
 ; RV64ZBKB:       # %bb.0:
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
-; RV64ZBKB-NEXT:    srli a0, a0, 47
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
-; RV64ZBKB-NEXT:    srli a0, a0, 32
+; RV64ZBKB-NEXT:    slliw a0, a0, 15
 ; RV64ZBKB-NEXT:    ret
     %1 = call i32 @llvm.bitreverse.i32(i32 %a)
     %2 = lshr i32 %1, 15
@@ -93,21 +54,13 @@ define i32 @test_bitreverse_srli_bitreverse_i32(i32 %a) nounwind {
 define i64 @test_bitreverse_srli_bitreverse_i64(i64 %a) nounwind {
 ; RV32ZBKB-LABEL: test_bitreverse_srli_bitreverse_i64:
 ; RV32ZBKB:       # %bb.0:
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a0, a0
-; RV32ZBKB-NEXT:    srli a0, a0, 1
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a1, a0
+; RV32ZBKB-NEXT:    slli a1, a0, 1
 ; RV32ZBKB-NEXT:    li a0, 0
 ; RV32ZBKB-NEXT:    ret
 ;
 ; RV64ZBKB-LABEL: test_bitreverse_srli_bitreverse_i64:
 ; RV64ZBKB:       # %bb.0:
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
-; RV64ZBKB-NEXT:    srli a0, a0, 33
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
+; RV64ZBKB-NEXT:    slli a0, a0, 33
 ; RV64ZBKB-NEXT:    ret
     %1 = call i64 @llvm.bitreverse.i64(i64 %a)
     %2 = lshr i64 %1, 33
@@ -118,24 +71,14 @@ define i64 @test_bitreverse_srli_bitreverse_i64(i64 %a) nounwind {
 define i8 @test_bitreverse_shli_bitreverse_i8(i8 %a) nounwind {
 ; RV32ZBKB-LABEL: test_bitreverse_shli_bitreverse_i8:
 ; RV32ZBKB:       # %bb.0:
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a0, a0
-; RV32ZBKB-NEXT:    srli a0, a0, 24
-; RV32ZBKB-NEXT:    slli a0, a0, 3
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a0, a0
-; RV32ZBKB-NEXT:    srli a0, a0, 24
+; RV32ZBKB-NEXT:    slli a0, a0, 24
+; RV32ZBKB-NEXT:    srli a0, a0, 27
 ; RV32ZBKB-NEXT:    ret
 ;
 ; RV64ZBKB-LABEL: test_bitreverse_shli_bitreverse_i8:
 ; RV64ZBKB:       # %bb.0:
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
-; RV64ZBKB-NEXT:    srli a0, a0, 56
-; RV64ZBKB-NEXT:    slli a0, a0, 3
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
-; RV64ZBKB-NEXT:    srli a0, a0, 56
+; RV64ZBKB-NEXT:    slli a0, a0, 56
+; RV64ZBKB-NEXT:    srli a0, a0, 59
 ; RV64ZBKB-NEXT:    ret
     %1 = call i8 @llvm.bitreverse.i8(i8 %a)
     %2 = shl i8 %1, 3
@@ -146,24 +89,14 @@ define i8 @test_bitreverse_shli_bitreverse_i8(i8 %a) nounwind {
 define i16 @test_bitreverse_shli_bitreverse_i16(i16 %a) nounwind {
 ; RV32ZBKB-LABEL: test_bitreverse_shli_bitreverse_i16:
 ; RV32ZBKB:       # %bb.0:
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a0, a0
-; RV32ZBKB-NEXT:    srli a0, a0, 16
-; RV32ZBKB-NEXT:    slli a0, a0, 7
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a0, a0
-; RV32ZBKB-NEXT:    srli a0, a0, 16
+; RV32ZBKB-NEXT:    slli a0, a0, 16
+; RV32ZBKB-NEXT:    srli a0, a0, 23
 ; RV32ZBKB-NEXT:    ret
 ;
 ; RV64ZBKB-LABEL: test_bitreverse_shli_bitreverse_i16:
 ; RV64ZBKB:       # %bb.0:
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
-; RV64ZBKB-NEXT:    srli a0, a0, 48
-; RV64ZBKB-NEXT:    slli a0, a0, 7
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
-; RV64ZBKB-NEXT:    srli a0, a0, 48
+; RV64ZBKB-NEXT:    slli a0, a0, 48
+; RV64ZBKB-NEXT:    srli a0, a0, 55
 ; RV64ZBKB-NEXT:    ret
     %1 = call i16 @llvm.bitreverse.i16(i16 %a)
     %2 = shl i16 %1, 7
@@ -174,22 +107,12 @@ define i16 @test_bitreverse_shli_bitreverse_i16(i16 %a) nounwind {
 define i32 @test_bitreverse_shli_bitreverse_i32(i32 %a) nounwind {
 ; RV32ZBKB-LABEL: test_bitreverse_shli_bitreverse_i32:
 ; RV32ZBKB:       # %bb.0:
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a0, a0
-; RV32ZBKB-NEXT:    slli a0, a0, 15
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a0, a0
+; RV32ZBKB-NEXT:    srli a0, a0, 15
 ; RV32ZBKB-NEXT:    ret
 ;
 ; RV64ZBKB-LABEL: test_bitreverse_shli_bitreverse_i32:
 ; RV64ZBKB:       # %bb.0:
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
-; RV64ZBKB-NEXT:    srli a0, a0, 32
-; RV64ZBKB-NEXT:    slli a0, a0, 15
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
-; RV64ZBKB-NEXT:    srli a0, a0, 32
+; RV64ZBKB-NEXT:    srliw a0, a0, 15
 ; RV64ZBKB-NEXT:    ret
     %1 = call i32 @llvm.bitreverse.i32(i32 %a)
     %2 = shl i32 %1, 15
@@ -200,21 +123,13 @@ define i32 @test_bitreverse_shli_bitreverse_i32(i32 %a) nounwind {
 define i64 @test_bitreverse_shli_bitreverse_i64(i64 %a) nounwind {
 ; RV32ZBKB-LABEL: test_bitreverse_shli_bitreverse_i64:
 ; RV32ZBKB:       # %bb.0:
-; RV32ZBKB-NEXT:    rev8 a0, a1
-; RV32ZBKB-NEXT:    brev8 a0, a0
-; RV32ZBKB-NEXT:    slli a0, a0, 1
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a0, a0
+; RV32ZBKB-NEXT:    srli a0, a1, 1
 ; RV32ZBKB-NEXT:    li a1, 0
 ; RV32ZBKB-NEXT:    ret
 ;
 ; RV64ZBKB-LABEL: test_bitreverse_shli_bitreverse_i64:
 ; RV64ZBKB:       # %bb.0:
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
-; RV64ZBKB-NEXT:    slli a0, a0, 33
-; RV64ZBKB-NEXT:    rev8 a0, a0
-; RV64ZBKB-NEXT:    brev8 a0, a0
+; RV64ZBKB-NEXT:    srli a0, a0, 33
 ; RV64ZBKB-NEXT:    ret
     %1 = call i64 @llvm.bitreverse.i64(i64 %a)
     %2 = shl i64 %1, 33
diff --git a/llvm/test/CodeGen/RISCV/imm.ll b/llvm/test/CodeGen/RISCV/imm.ll
index 9e356a93526c..c5c1657b526a 100644
--- a/llvm/test/CodeGen/RISCV/imm.ll
+++ b/llvm/test/CodeGen/RISCV/imm.ll
@@ -3994,3 +3994,183 @@ define i64 @imm64_same_lo_hi_negative() nounwind {
 ; RV64-REMAT-NEXT:    ret
   ret i64 9259542123273814144 ; 0x8080808080808080
 }
+
+define i64 @imm64_0x8000080000000() {
+; RV32I-LABEL: imm64_0x8000080000000:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a0, 524288
+; RV32I-NEXT:    lui a1, 128
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: imm64_0x8000080000000:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a0, 256
+; RV64I-NEXT:    addiw a0, a0, 1
+; RV64I-NEXT:    slli a0, a0, 31
+; RV64I-NEXT:    ret
+;
+; RV64IZBA-LABEL: imm64_0x8000080000000:
+; RV64IZBA:       # %bb.0:
+; RV64IZBA-NEXT:    lui a0, 256
+; RV64IZBA-NEXT:    addiw a0, a0, 1
+; RV64IZBA-NEXT:    slli a0, a0, 31
+; RV64IZBA-NEXT:    ret
+;
+; RV64IZBB-LABEL: imm64_0x8000080000000:
+; RV64IZBB:       # %bb.0:
+; RV64IZBB-NEXT:    lui a0, 256
+; RV64IZBB-NEXT:    addiw a0, a0, 1
+; RV64IZBB-NEXT:    slli a0, a0, 31
+; RV64IZBB-NEXT:    ret
+;
+; RV64IZBS-LABEL: imm64_0x8000080000000:
+; RV64IZBS:       # %bb.0:
+; RV64IZBS-NEXT:    bseti a0, zero, 31
+; RV64IZBS-NEXT:    bseti a0, a0, 51
+; RV64IZBS-NEXT:    ret
+;
+; RV64IXTHEADBB-LABEL: imm64_0x8000080000000:
+; RV64IXTHEADBB:       # %bb.0:
+; RV64IXTHEADBB-NEXT:    lui a0, 256
+; RV64IXTHEADBB-NEXT:    addiw a0, a0, 1
+; RV64IXTHEADBB-NEXT:    slli a0, a0, 31
+; RV64IXTHEADBB-NEXT:    ret
+;
+; RV32-REMAT-LABEL: imm64_0x8000080000000:
+; RV32-REMAT:       # %bb.0:
+; RV32-REMAT-NEXT:    lui a0, 524288
+; RV32-REMAT-NEXT:    lui a1, 128
+; RV32-REMAT-NEXT:    ret
+;
+; RV64-REMAT-LABEL: imm64_0x8000080000000:
+; RV64-REMAT:       # %bb.0:
+; RV64-REMAT-NEXT:    lui a0, 256
+; RV64-REMAT-NEXT:    addiw a0, a0, 1
+; RV64-REMAT-NEXT:    slli a0, a0, 31
+; RV64-REMAT-NEXT:    ret
+  ret i64 2251801961168896 ; 0x8000080000000
+}
+
+define i64 @imm64_0x10000100000000() {
+; RV32I-LABEL: imm64_0x10000100000000:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a1, 256
+; RV32I-NEXT:    addi a1, a1, 1
+; RV32I-NEXT:    li a0, 0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: imm64_0x10000100000000:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a0, 256
+; RV64I-NEXT:    addi a0, a0, 1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    ret
+;
+; RV64IZBA-LABEL: imm64_0x10000100000000:
+; RV64IZBA:       # %bb.0:
+; RV64IZBA-NEXT:    lui a0, 256
+; RV64IZBA-NEXT:    addi a0, a0, 1
+; RV64IZBA-NEXT:    slli a0, a0, 32
+; RV64IZBA-NEXT:    ret
+;
+; RV64IZBB-LABEL: imm64_0x10000100000000:
+; RV64IZBB:       # %bb.0:
+; RV64IZBB-NEXT:    lui a0, 256
+; RV64IZBB-NEXT:    addi a0, a0, 1
+; RV64IZBB-NEXT:    slli a0, a0, 32
+; RV64IZBB-NEXT:    ret
+;
+; RV64IZBS-LABEL: imm64_0x10000100000000:
+; RV64IZBS:       # %bb.0:
+; RV64IZBS-NEXT:    bseti a0, zero, 32
+; RV64IZBS-NEXT:    bseti a0, a0, 52
+; RV64IZBS-NEXT:    ret
+;
+; RV64IXTHEADBB-LABEL: imm64_0x10000100000000:
+; RV64IXTHEADBB:       # %bb.0:
+; RV64IXTHEADBB-NEXT:    lui a0, 256
+; RV64IXTHEADBB-NEXT:    addi a0, a0, 1
+; RV64IXTHEADBB-NEXT:    slli a0, a0, 32
+; RV64IXTHEADBB-NEXT:    ret
+;
+; RV32-REMAT-LABEL: imm64_0x10000100000000:
+; RV32-REMAT:       # %bb.0:
+; RV32-REMAT-NEXT:    lui a1, 256
+; RV32-REMAT-NEXT:    addi a1, a1, 1
+; RV32-REMAT-NEXT:    li a0, 0
+; RV32-REMAT-NEXT:    ret
+;
+; RV64-REMAT-LABEL: imm64_0x10000100000000:
+; RV64-REMAT:       # %bb.0:
+; RV64-REMAT-NEXT:    lui a0, 256
+; RV64-REMAT-NEXT:    addi a0, a0, 1
+; RV64-REMAT-NEXT:    slli a0, a0, 32
+; RV64-REMAT-NEXT:    ret
+  ret i64 4503603922337792 ; 0x10000100000000
+}
+
+define i64 @imm64_0xFF7FFFFF7FFFFFFE() {
+; RV32I-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a0, 524288
+; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    lui a1, 1046528
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a0, 1044480
+; RV64I-NEXT:    addiw a0, a0, -1
+; RV64I-NEXT:    slli a0, a0, 31
+; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    ret
+;
+; RV64IZBA-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
+; RV64IZBA:       # %bb.0:
+; RV64IZBA-NEXT:    lui a0, 1044480
+; RV64IZBA-NEXT:    addiw a0, a0, -1
+; RV64IZBA-NEXT:    slli a0, a0, 31
+; RV64IZBA-NEXT:    addi a0, a0, -1
+; RV64IZBA-NEXT:    ret
+;
+; RV64IZBB-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
+; RV64IZBB:       # %bb.0:
+; RV64IZBB-NEXT:    lui a0, 1044480
+; RV64IZBB-NEXT:    addiw a0, a0, -1
+; RV64IZBB-NEXT:    slli a0, a0, 31
+; RV64IZBB-NEXT:    addi a0, a0, -1
+; RV64IZBB-NEXT:    ret
+;
+; RV64IZBS-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
+; RV64IZBS:       # %bb.0:
+; RV64IZBS-NEXT:    li a0, -1
+; RV64IZBS-NEXT:    bclri a0, a0, 31
+; RV64IZBS-NEXT:    bclri a0, a0, 55
+; RV64IZBS-NEXT:    ret
+;
+; RV64IXTHEADBB-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
+; RV64IXTHEADBB:       # %bb.0:
+; RV64IXTHEADBB-NEXT:    lui a0, 1044480
+; RV64IXTHEADBB-NEXT:    addiw a0, a0, -1
+; RV64IXTHEADBB-NEXT:    slli a0, a0, 31
+; RV64IXTHEADBB-NEXT:    addi a0, a0, -1
+; RV64IXTHEADBB-NEXT:    ret
+;
+; RV32-REMAT-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
+; RV32-REMAT:       # %bb.0:
+; RV32-REMAT-NEXT:    lui a0, 524288
+; RV32-REMAT-NEXT:    addi a0, a0, -1
+; RV32-REMAT-NEXT:    lui a1, 1046528
+; RV32-REMAT-NEXT:    addi a1, a1, -1
+; RV32-REMAT-NEXT:    ret
+;
+; RV64-REMAT-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
+; RV64-REMAT:       # %bb.0:
+; RV64-REMAT-NEXT:    lui a0, 1044480
+; RV64-REMAT-NEXT:    addiw a0, a0, -1
+; RV64-REMAT-NEXT:    slli a0, a0, 31
+; RV64-REMAT-NEXT:    addi a0, a0, -1
+; RV64-REMAT-NEXT:    ret
+  ret i64 -36028799166447617 ; 0xFF7FFFFF7FFFFFFE
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/imm.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/imm.ll
index 0ef17ca964db..561686374a9b 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/imm.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/imm.ll
@@ -2562,3 +2562,180 @@ define i64 @imm64_same_lo_hi_optsize() nounwind optsize {
 ; RV64IXTHEADBB-NEXT:    ret
   ret i64 1157442765409226768 ; 0x0101010101010101
 }
+; Hi and lo are the same and also negative.
+define i64 @imm64_same_lo_hi_negative() nounwind {
+; RV64-NOPOOL-LABEL: imm64_same_lo_hi_negative:
+; RV64-NOPOOL:       # %bb.0:
+; RV64-NOPOOL-NEXT:    lui a0, 983297
+; RV64-NOPOOL-NEXT:    slli a0, a0, 4
+; RV64-NOPOOL-NEXT:    addi a0, a0, 257
+; RV64-NOPOOL-NEXT:    slli a0, a0, 16
+; RV64-NOPOOL-NEXT:    addi a0, a0, 257
+; RV64-NOPOOL-NEXT:    slli a0, a0, 15
+; RV64-NOPOOL-NEXT:    addi a0, a0, 128
+; RV64-NOPOOL-NEXT:    ret
+;
+; RV64I-POOL-LABEL: imm64_same_lo_hi_negative:
+; RV64I-POOL:       # %bb.0:
+; RV64I-POOL-NEXT:    lui a0, %hi(.LCPI65_0)
+; RV64I-POOL-NEXT:    ld a0, %lo(.LCPI65_0)(a0)
+; RV64I-POOL-NEXT:    ret
+;
+; RV64IZBA-LABEL: imm64_same_lo_hi_negative:
+; RV64IZBA:       # %bb.0:
+; RV64IZBA-NEXT:    lui a0, 526344
+; RV64IZBA-NEXT:    addi a0, a0, 128
+; RV64IZBA-NEXT:    slli a1, a0, 32
+; RV64IZBA-NEXT:    add.uw a0, a0, a1
+; RV64IZBA-NEXT:    ret
+;
+; RV64IZBB-LABEL: imm64_same_lo_hi_negative:
+; RV64IZBB:       # %bb.0:
+; RV64IZBB-NEXT:    lui a0, 983297
+; RV64IZBB-NEXT:    slli a0, a0, 4
+; RV64IZBB-NEXT:    addi a0, a0, 257
+; RV64IZBB-NEXT:    slli a0, a0, 16
+; RV64IZBB-NEXT:    addi a0, a0, 257
+; RV64IZBB-NEXT:    slli a0, a0, 15
+; RV64IZBB-NEXT:    addi a0, a0, 128
+; RV64IZBB-NEXT:    ret
+;
+; RV64IZBS-LABEL: imm64_same_lo_hi_negative:
+; RV64IZBS:       # %bb.0:
+; RV64IZBS-NEXT:    lui a0, 983297
+; RV64IZBS-NEXT:    slli a0, a0, 4
+; RV64IZBS-NEXT:    addi a0, a0, 257
+; RV64IZBS-NEXT:    slli a0, a0, 16
+; RV64IZBS-NEXT:    addi a0, a0, 257
+; RV64IZBS-NEXT:    slli a0, a0, 15
+; RV64IZBS-NEXT:    addi a0, a0, 128
+; RV64IZBS-NEXT:    ret
+;
+; RV64IXTHEADBB-LABEL: imm64_same_lo_hi_negative:
+; RV64IXTHEADBB:       # %bb.0:
+; RV64IXTHEADBB-NEXT:    lui a0, 983297
+; RV64IXTHEADBB-NEXT:    slli a0, a0, 4
+; RV64IXTHEADBB-NEXT:    addi a0, a0, 257
+; RV64IXTHEADBB-NEXT:    slli a0, a0, 16
+; RV64IXTHEADBB-NEXT:    addi a0, a0, 257
+; RV64IXTHEADBB-NEXT:    slli a0, a0, 15
+; RV64IXTHEADBB-NEXT:    addi a0, a0, 128
+; RV64IXTHEADBB-NEXT:    ret
+  ret i64 9259542123273814144 ; 0x8080808080808080
+}
+
+define i64 @imm64_0x8000080000000() {
+; RV64I-LABEL: imm64_0x8000080000000:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a0, 256
+; RV64I-NEXT:    addiw a0, a0, 1
+; RV64I-NEXT:    slli a0, a0, 31
+; RV64I-NEXT:    ret
+;
+; RV64IZBA-LABEL: imm64_0x8000080000000:
+; RV64IZBA:       # %bb.0:
+; RV64IZBA-NEXT:    lui a0, 256
+; RV64IZBA-NEXT:    addiw a0, a0, 1
+; RV64IZBA-NEXT:    slli a0, a0, 31
+; RV64IZBA-NEXT:    ret
+;
+; RV64IZBB-LABEL: imm64_0x8000080000000:
+; RV64IZBB:       # %bb.0:
+; RV64IZBB-NEXT:    lui a0, 256
+; RV64IZBB-NEXT:    addiw a0, a0, 1
+; RV64IZBB-NEXT:    slli a0, a0, 31
+; RV64IZBB-NEXT:    ret
+;
+; RV64IZBS-LABEL: imm64_0x8000080000000:
+; RV64IZBS:       # %bb.0:
+; RV64IZBS-NEXT:    bseti a0, zero, 31
+; RV64IZBS-NEXT:    bseti a0, a0, 51
+; RV64IZBS-NEXT:    ret
+;
+; RV64IXTHEADBB-LABEL: imm64_0x8000080000000:
+; RV64IXTHEADBB:       # %bb.0:
+; RV64IXTHEADBB-NEXT:    lui a0, 256
+; RV64IXTHEADBB-NEXT:    addiw a0, a0, 1
+; RV64IXTHEADBB-NEXT:    slli a0, a0, 31
+; RV64IXTHEADBB-NEXT:    ret
+  ret i64 2251801961168896 ; 0x8000080000000
+}
+
+define i64 @imm64_0x10000100000000() {
+; RV64I-LABEL: imm64_0x10000100000000:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a0, 256
+; RV64I-NEXT:    addi a0, a0, 1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    ret
+;
+; RV64IZBA-LABEL: imm64_0x10000100000000:
+; RV64IZBA:       # %bb.0:
+; RV64IZBA-NEXT:    lui a0, 256
+; RV64IZBA-NEXT:    addi a0, a0, 1
+; RV64IZBA-NEXT:    slli a0, a0, 32
+; RV64IZBA-NEXT:    ret
+;
+; RV64IZBB-LABEL: imm64_0x10000100000000:
+; RV64IZBB:       # %bb.0:
+; RV64IZBB-NEXT:    lui a0, 256
+; RV64IZBB-NEXT:    addi a0, a0, 1
+; RV64IZBB-NEXT:    slli a0, a0, 32
+; RV64IZBB-NEXT:    ret
+;
+; RV64IZBS-LABEL: imm64_0x10000100000000:
+; RV64IZBS:       # %bb.0:
+; RV64IZBS-NEXT:    bseti a0, zero, 32
+; RV64IZBS-NEXT:    bseti a0, a0, 52
+; RV64IZBS-NEXT:    ret
+;
+; RV64IXTHEADBB-LABEL: imm64_0x10000100000000:
+; RV64IXTHEADBB:       # %bb.0:
+; RV64IXTHEADBB-NEXT:    lui a0, 256
+; RV64IXTHEADBB-NEXT:    addi a0, a0, 1
+; RV64IXTHEADBB-NEXT:    slli a0, a0, 32
+; RV64IXTHEADBB-NEXT:    ret
+  ret i64 4503603922337792 ; 0x10000100000000
+}
+
+define i64 @imm64_0xFF7FFFFF7FFFFFFE() {
+; RV64I-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a0, 1044480
+; RV64I-NEXT:    addiw a0, a0, -1
+; RV64I-NEXT:    slli a0, a0, 31
+; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    ret
+;
+; RV64IZBA-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
+; RV64IZBA:       # %bb.0:
+; RV64IZBA-NEXT:    lui a0, 1044480
+; RV64IZBA-NEXT:    addiw a0, a0, -1
+; RV64IZBA-NEXT:    slli a0, a0, 31
+; RV64IZBA-NEXT:    addi a0, a0, -1
+; RV64IZBA-NEXT:    ret
+;
+; RV64IZBB-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
+; RV64IZBB:       # %bb.0:
+; RV64IZBB-NEXT:    lui a0, 1044480
+; RV64IZBB-NEXT:    addiw a0, a0, -1
+; RV64IZBB-NEXT:    slli a0, a0, 31
+; RV64IZBB-NEXT:    addi a0, a0, -1
+; RV64IZBB-NEXT:    ret
+;
+; RV64IZBS-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
+; RV64IZBS:       # %bb.0:
+; RV64IZBS-NEXT:    li a0, -1
+; RV64IZBS-NEXT:    bclri a0, a0, 31
+; RV64IZBS-NEXT:    bclri a0, a0, 55
+; RV64IZBS-NEXT:    ret
+;
+; RV64IXTHEADBB-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
+; RV64IXTHEADBB:       # %bb.0:
+; RV64IXTHEADBB-NEXT:    lui a0, 1044480
+; RV64IXTHEADBB-NEXT:    addiw a0, a0, -1
+; RV64IXTHEADBB-NEXT:    slli a0, a0, 31
+; RV64IXTHEADBB-NEXT:    addi a0, a0, -1
+; RV64IXTHEADBB-NEXT:    ret
+  ret i64 -36028799166447617 ; 0xFF7FFFFF7FFFFFFE
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
index c3ae40124ba0..2db0d40b0ce5 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
@@ -646,8 +646,8 @@ define i64 @zext_mul12884901888(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul12884901888:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    sh1add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
+; RV64ZBA-NEXT:    sh1add a0, a0, a0
 ; RV64ZBA-NEXT:    ret
   %b = zext i32 %a to i64
   %c = mul i64 %b, 12884901888
@@ -667,8 +667,8 @@ define i64 @zext_mul21474836480(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul21474836480:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    sh2add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
+; RV64ZBA-NEXT:    sh2add a0, a0, a0
 ; RV64ZBA-NEXT:    ret
   %b = zext i32 %a to i64
   %c = mul i64 %b, 21474836480
@@ -688,8 +688,8 @@ define i64 @zext_mul38654705664(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul38654705664:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    sh3add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
+; RV64ZBA-NEXT:    sh3add a0, a0, a0
 ; RV64ZBA-NEXT:    ret
   %b = zext i32 %a to i64
   %c = mul i64 %b, 38654705664
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index 817e2b7d0bd9..dc93c0215a25 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -865,8 +865,8 @@ define i64 @zext_mul12884901888(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul12884901888:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    sh1add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
+; RV64ZBA-NEXT:    sh1add a0, a0, a0
 ; RV64ZBA-NEXT:    ret
   %b = zext i32 %a to i64
   %c = mul i64 %b, 12884901888
@@ -886,8 +886,8 @@ define i64 @zext_mul21474836480(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul21474836480:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    sh2add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
+; RV64ZBA-NEXT:    sh2add a0, a0, a0
 ; RV64ZBA-NEXT:    ret
   %b = zext i32 %a to i64
   %c = mul i64 %b, 21474836480
@@ -907,8 +907,8 @@ define i64 @zext_mul38654705664(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul38654705664:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    sh3add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
+; RV64ZBA-NEXT:    sh3add a0, a0, a0
 ; RV64ZBA-NEXT:    ret
   %b = zext i32 %a to i64
   %c = mul i64 %b, 38654705664
@@ -2853,3 +2853,98 @@ entry:
   ret i64 %6
 }
 
+define ptr @gep_lshr_i32(ptr %0, i64 %1) {
+; RV64I-LABEL: gep_lshr_i32:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    srli a1, a1, 2
+; RV64I-NEXT:    li a2, 5
+; RV64I-NEXT:    slli a2, a2, 36
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    mulhu a1, a1, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: gep_lshr_i32:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    srli a1, a1, 2
+; RV64ZBA-NEXT:    slli.uw a1, a1, 4
+; RV64ZBA-NEXT:    sh2add a1, a1, a1
+; RV64ZBA-NEXT:    add a0, a0, a1
+; RV64ZBA-NEXT:    ret
+entry:
+  %2 = lshr exact i64 %1, 2
+  %3 = and i64 %2, 4294967295
+  %5 = getelementptr [80 x i8], ptr %0, i64 %3
+  ret ptr %5
+}
+
+define i64 @srli_slliw(i64 %1) {
+; RV64I-LABEL: srli_slliw:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    slli a0, a0, 2
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    slli a1, a1, 36
+; RV64I-NEXT:    addi a1, a1, -16
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: srli_slliw:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    srli a0, a0, 2
+; RV64ZBA-NEXT:    slli.uw a0, a0, 4
+; RV64ZBA-NEXT:    ret
+entry:
+  %2 = lshr exact i64 %1, 2
+  %3 = and i64 %2, 4294967295
+  %4 = shl i64 %3, 4
+  ret i64 %4
+}
+
+define i64 @srli_slliw_canonical(i64 %0) {
+; RV64I-LABEL: srli_slliw_canonical:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    slli a0, a0, 2
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    slli a1, a1, 36
+; RV64I-NEXT:    addi a1, a1, -16
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: srli_slliw_canonical:
+; RV64ZBA:       # %bb.0: # %entry
+; RV64ZBA-NEXT:    srli a0, a0, 2
+; RV64ZBA-NEXT:    slli.uw a0, a0, 4
+; RV64ZBA-NEXT:    ret
+entry:
+  %1 = shl i64 %0, 2
+  %2 = and i64 %1, 68719476720
+  ret i64 %2
+}
+
+; Make sure we don't accidentally use slli.uw with a shift of 32.
+define i64 @srli_slliuw_negative_test(i64 %0) {
+; CHECK-LABEL: srli_slliuw_negative_test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    srli a0, a0, 6
+; CHECK-NEXT:    slli a0, a0, 32
+; CHECK-NEXT:    ret
+entry:
+  %1 = lshr i64 %0, 6
+  %2 = shl i64 %1, 32
+  ret i64 %2
+}
+
+define i64 @srli_slli_i16(i64 %1) {
+; CHECK-LABEL: srli_slli_i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    lui a1, 256
+; CHECK-NEXT:    addiw a1, a1, -16
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    ret
+entry:
+  %2 = lshr exact i64 %1, 2
+  %3 = and i64 %2, 65535
+  %4 = shl i64 %3, 4
+  ret i64 %4
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/addi-scalable-offset.mir b/llvm/test/CodeGen/RISCV/rvv/addi-scalable-offset.mir
index a54da97d2548..f976adcfe931 100644
--- a/llvm/test/CodeGen/RISCV/rvv/addi-scalable-offset.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/addi-scalable-offset.mir
@@ -57,8 +57,7 @@ body: |
     ; CHECK-NEXT: PseudoRET
     %1:gprnox0 = COPY $x11
     %0:gpr = COPY $x10
-    %pt:vr = IMPLICIT_DEF
-    %2:vr = PseudoVLE64_V_M1 %pt, %0, %1, 6, 0 :: (load unknown-size from %ir.pa, align 8)
+    %2:vr = PseudoVLE64_V_M1 undef $noreg, %0, %1, 6, 0 :: (load unknown-size from %ir.pa, align 8)
     %3:gpr = ADDI %stack.2, 0
     VS1R_V killed %2:vr, %3:gpr
     PseudoRET
diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
index 5b271606f08a..aa11e012af20 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
@@ -15,8 +15,8 @@ define <vscale x 1 x half> @vp_ceil_vv_nxv1f16(<vscale x 1 x half> %va, <vscale
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -57,8 +57,8 @@ define <vscale x 2 x half> @vp_ceil_vv_nxv2f16(<vscale x 2 x half> %va, <vscale
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -99,8 +99,8 @@ define <vscale x 4 x half> @vp_ceil_vv_nxv4f16(<vscale x 4 x half> %va, <vscale
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -142,8 +142,8 @@ define <vscale x 8 x half> @vp_ceil_vv_nxv8f16(<vscale x 8 x half> %va, <vscale
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -186,8 +186,8 @@ define <vscale x 16 x half> @vp_ceil_vv_nxv16f16(<vscale x 16 x half> %va, <vsca
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -230,8 +230,8 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -273,8 +273,8 @@ define <vscale x 1 x float> @vp_ceil_vv_nxv1f32(<vscale x 1 x float> %va, <vscal
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -315,8 +315,8 @@ define <vscale x 2 x float> @vp_ceil_vv_nxv2f32(<vscale x 2 x float> %va, <vscal
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -358,8 +358,8 @@ define <vscale x 4 x float> @vp_ceil_vv_nxv4f32(<vscale x 4 x float> %va, <vscal
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -402,8 +402,8 @@ define <vscale x 8 x float> @vp_ceil_vv_nxv8f32(<vscale x 8 x float> %va, <vscal
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -446,8 +446,8 @@ define <vscale x 16 x float> @vp_ceil_vv_nxv16f32(<vscale x 16 x float> %va, <vs
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -489,8 +489,8 @@ define <vscale x 1 x double> @vp_ceil_vv_nxv1f64(<vscale x 1 x double> %va, <vsc
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -532,8 +532,8 @@ define <vscale x 2 x double> @vp_ceil_vv_nxv2f64(<vscale x 2 x double> %va, <vsc
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -576,8 +576,8 @@ define <vscale x 4 x double> @vp_ceil_vv_nxv4f64(<vscale x 4 x double> %va, <vsc
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -620,8 +620,8 @@ define <vscale x 7 x double> @vp_ceil_vv_nxv7f64(<vscale x 7 x double> %va, <vsc
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -664,8 +664,8 @@ define <vscale x 8 x double> @vp_ceil_vv_nxv8f64(<vscale x 8 x double> %va, <vsc
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -726,8 +726,8 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a2, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
@@ -750,8 +750,8 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/coalesce-vsetvli.mir b/llvm/test/CodeGen/RISCV/rvv/coalesce-vsetvli.mir
new file mode 100644
index 000000000000..f888534ebc03
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/coalesce-vsetvli.mir
@@ -0,0 +1,66 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc %s -o - -mtriple=riscv64 -mattr=v -run-pass=riscv-coalesce-vsetvli -verify-machineinstrs | FileCheck %s
+
+---
+name: dead_avl_addi
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: dead_avl_addi
+    ; CHECK: $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead %x:gpr = PseudoVMV_X_S $noreg, 6 /* e64 */
+    ; CHECK-NEXT: $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6 /* e64 */, 0 /* tu, mu */
+    ; CHECK-NEXT: PseudoRET
+    %avl:gprnox0 = ADDI $x0, 42
+    dead $x0 = PseudoVSETVLI killed %avl, 216, implicit-def $vl, implicit-def $vtype
+    %x:gpr = PseudoVMV_X_S $noreg, 6
+    dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
+    $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6, 0
+    PseudoRET
+...
+---
+name: dead_avl_nonvolatile_load
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x1
+    ; CHECK-LABEL: name: dead_avl_nonvolatile_load
+    ; CHECK: liveins: $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %ptr:gpr = COPY $x1
+    ; CHECK-NEXT: dead %avl:gprnox0 = LW %ptr, 0 :: (dereferenceable load (s32))
+    ; CHECK-NEXT: $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead %x:gpr = PseudoVMV_X_S $noreg, 6 /* e64 */
+    ; CHECK-NEXT: $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6 /* e64 */, 0 /* tu, mu */
+    ; CHECK-NEXT: PseudoRET
+    %ptr:gpr = COPY $x1
+    %avl:gprnox0 = LW killed %ptr, 0 :: (dereferenceable load (s32))
+    dead $x0 = PseudoVSETVLI killed %avl, 216, implicit-def $vl, implicit-def $vtype
+    %x:gpr = PseudoVMV_X_S $noreg, 6
+    dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
+    $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6, 0
+    PseudoRET
+...
+---
+name: dead_avl_volatile_load
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x1
+    ; CHECK-LABEL: name: dead_avl_volatile_load
+    ; CHECK: liveins: $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %ptr:gpr = COPY $x1
+    ; CHECK-NEXT: dead %avl:gprnox0 = LW %ptr, 0 :: (volatile dereferenceable load (s32))
+    ; CHECK-NEXT: $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead %x:gpr = PseudoVMV_X_S $noreg, 6 /* e64 */
+    ; CHECK-NEXT: $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6 /* e64 */, 0 /* tu, mu */
+    ; CHECK-NEXT: PseudoRET
+    %ptr:gpr = COPY $x1
+    %avl:gprnox0 = LW killed %ptr, 0 :: (volatile dereferenceable load (s32))
+    dead $x0 = PseudoVSETVLI killed %avl, 216, implicit-def $vl, implicit-def $vtype
+    %x:gpr = PseudoVMV_X_S $noreg, 6
+    dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
+    $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6, 0
+    PseudoRET
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/commutable.ll b/llvm/test/CodeGen/RISCV/rvv/commutable.ll
index d94b529bac01..5bca2eeb3fdd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/commutable.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/commutable.ll
@@ -720,8 +720,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vaadd.nxv1i64.nxv1i64(<vscale x 1 x i64>,
 define <vscale x 1 x i64> @commutable_vaadd_vv(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: commutable_vaadd_vv:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
@@ -737,8 +737,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vaadd.mask.nxv1i64.nxv1i64(<vscale x 1 x
 define <vscale x 1 x i64> @commutable_vaadd_vv_masked(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2) {
 ; CHECK-LABEL: commutable_vaadd_vv_masked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vaadd.vv v10, v8, v9, v0.t
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
@@ -755,8 +755,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vaaddu.nxv1i64.nxv1i64(<vscale x 1 x i64>
 define <vscale x 1 x i64> @commutable_vaaddu_vv(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: commutable_vaaddu_vv:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
@@ -772,8 +772,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vaaddu.mask.nxv1i64.nxv1i64(<vscale x 1 x
 define <vscale x 1 x i64> @commutable_vaaddu_vv_masked(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2) {
 ; CHECK-LABEL: commutable_vaaddu_vv_masked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v10, v8, v9, v0.t
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
@@ -790,8 +790,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(<vscale x 1 x i64>,
 define <vscale x 1 x i64> @commutable_vsmul_vv(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: commutable_vsmul_vv:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
@@ -807,8 +807,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(<vscale x 1 x
 define <vscale x 1 x i64> @commutable_vsmul_vv_masked(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2) {
 ; CHECK-LABEL: commutable_vsmul_vv_masked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vv v10, v8, v9, v0.t
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/copyprop.mir b/llvm/test/CodeGen/RISCV/rvv/copyprop.mir
index 95c227518f5c..1718dc90eed4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/copyprop.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/copyprop.mir
@@ -43,16 +43,12 @@ body:             |
     %2:gpr = COPY $x11
     %1:gpr = COPY $x10
     %3:vr = COPY $v8
-    %pt5:vr = IMPLICIT_DEF
-    %17:vr = PseudoVSLL_VI_M1 %pt5, %3, 5, 1, 6 /* e64 */, 0
+    %17:vr = PseudoVSLL_VI_M1 undef $noreg, %3, 5, 1, 6 /* e64 */, 0
     %22:vr = PseudoVMSNE_VI_M1 %3, 0, 1, 6 /* e64 */
     $v0 = COPY %22
-    %26:vrnov0 = IMPLICIT_DEF
-    %25:vrnov0 = PseudoVMERGE_VIM_M1 %26, %17, -1, $v0, 1, 6 /* e64 */
-    %pt8:vr = IMPLICIT_DEF
+    %25:vrnov0 = PseudoVMERGE_VIM_M1 undef $noreg, %17, -1, $v0, 1, 6 /* e64 */
     %29:vr = PseudoVC_V_X_SE_M1 3, 31, %2, 1, 6 /* e64 */, implicit-def dead $vcix_state, implicit $vcix_state
-    %pt9:vr = IMPLICIT_DEF
-    %30:vr = PseudoVMV_V_I_M1 %pt9, 0, 1, 6 /* e64 */, 0
+    %30:vr = PseudoVMV_V_I_M1 undef $noreg, 0, 1, 6 /* e64 */, 0
     BGEU %1, $x0, %bb.2
 
   bb.1.entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
index d756cfcf7077..41ec102c34ef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
@@ -806,8 +806,8 @@ define <vscale x 1 x i32> @ctlz_nxv1i32(<vscale x 1 x i32> %va) {
 ;
 ; CHECK-F-LABEL: ctlz_nxv1i32:
 ; CHECK-F:       # %bb.0:
-; CHECK-F-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
@@ -878,8 +878,8 @@ define <vscale x 2 x i32> @ctlz_nxv2i32(<vscale x 2 x i32> %va) {
 ;
 ; CHECK-F-LABEL: ctlz_nxv2i32:
 ; CHECK-F:       # %bb.0:
-; CHECK-F-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
@@ -950,8 +950,8 @@ define <vscale x 4 x i32> @ctlz_nxv4i32(<vscale x 4 x i32> %va) {
 ;
 ; CHECK-F-LABEL: ctlz_nxv4i32:
 ; CHECK-F:       # %bb.0:
-; CHECK-F-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
@@ -1022,8 +1022,8 @@ define <vscale x 8 x i32> @ctlz_nxv8i32(<vscale x 8 x i32> %va) {
 ;
 ; CHECK-F-LABEL: ctlz_nxv8i32:
 ; CHECK-F:       # %bb.0:
-; CHECK-F-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
@@ -1094,8 +1094,8 @@ define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
 ;
 ; CHECK-F-LABEL: ctlz_nxv16i32:
 ; CHECK-F:       # %bb.0:
-; CHECK-F-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
@@ -1107,8 +1107,8 @@ define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
 ;
 ; CHECK-D-LABEL: ctlz_nxv16i32:
 ; CHECK-D:       # %bb.0:
-; CHECK-D-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-D-NEXT:    li a1, 158
@@ -1234,8 +1234,8 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32F-NEXT:    li a0, 190
 ; RV32F-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32F-NEXT:    vmv.v.x v9, a0
-; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v10, v8
 ; RV32F-NEXT:    vsrl.vi v8, v10, 23
 ; RV32F-NEXT:    vwsubu.wv v9, v9, v8
@@ -1262,8 +1262,8 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ;
 ; CHECK-D-LABEL: ctlz_nxv1i64:
 ; CHECK-D:       # %bb.0:
-; CHECK-D-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
@@ -1390,8 +1390,8 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32F-NEXT:    li a0, 190
 ; RV32F-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32F-NEXT:    vmv.v.x v10, a0
-; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v12, v8
 ; RV32F-NEXT:    vsrl.vi v8, v12, 23
 ; RV32F-NEXT:    vwsubu.wv v10, v10, v8
@@ -1418,8 +1418,8 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ;
 ; CHECK-D-LABEL: ctlz_nxv2i64:
 ; CHECK-D:       # %bb.0:
-; CHECK-D-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
@@ -1546,8 +1546,8 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32F-NEXT:    li a0, 190
 ; RV32F-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32F-NEXT:    vmv.v.x v12, a0
-; RV32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v16, v8
 ; RV32F-NEXT:    vsrl.vi v8, v16, 23
 ; RV32F-NEXT:    vwsubu.wv v12, v12, v8
@@ -1574,8 +1574,8 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ;
 ; CHECK-D-LABEL: ctlz_nxv4i64:
 ; CHECK-D:       # %bb.0:
-; CHECK-D-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
@@ -1702,8 +1702,8 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32F-NEXT:    li a0, 190
 ; RV32F-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32F-NEXT:    vmv.v.x v16, a0
-; RV32F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v24, v8
 ; RV32F-NEXT:    vsrl.vi v8, v24, 23
 ; RV32F-NEXT:    vwsubu.wv v16, v16, v8
@@ -1730,8 +1730,8 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ;
 ; CHECK-D-LABEL: ctlz_nxv8i64:
 ; CHECK-D:       # %bb.0:
-; CHECK-D-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
@@ -2497,8 +2497,8 @@ define <vscale x 1 x i32> @ctlz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ;
 ; CHECK-F-LABEL: ctlz_zero_undef_nxv1i32:
 ; CHECK-F:       # %bb.0:
-; CHECK-F-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
@@ -2564,8 +2564,8 @@ define <vscale x 2 x i32> @ctlz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ;
 ; CHECK-F-LABEL: ctlz_zero_undef_nxv2i32:
 ; CHECK-F:       # %bb.0:
-; CHECK-F-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
@@ -2631,8 +2631,8 @@ define <vscale x 4 x i32> @ctlz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ;
 ; CHECK-F-LABEL: ctlz_zero_undef_nxv4i32:
 ; CHECK-F:       # %bb.0:
-; CHECK-F-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
@@ -2698,8 +2698,8 @@ define <vscale x 8 x i32> @ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ;
 ; CHECK-F-LABEL: ctlz_zero_undef_nxv8i32:
 ; CHECK-F:       # %bb.0:
-; CHECK-F-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
@@ -2765,8 +2765,8 @@ define <vscale x 16 x i32> @ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ;
 ; CHECK-F-LABEL: ctlz_zero_undef_nxv16i32:
 ; CHECK-F:       # %bb.0:
-; CHECK-F-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
@@ -2776,8 +2776,8 @@ define <vscale x 16 x i32> @ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv16i32:
 ; CHECK-D:       # %bb.0:
-; CHECK-D-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-D-NEXT:    li a1, 158
@@ -2900,8 +2900,8 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32F-NEXT:    li a0, 190
 ; RV32F-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32F-NEXT:    vmv.v.x v9, a0
-; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v10, v8
 ; RV32F-NEXT:    vsrl.vi v8, v10, 23
 ; RV32F-NEXT:    vwsubu.wv v9, v9, v8
@@ -2923,8 +2923,8 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv1i64:
 ; CHECK-D:       # %bb.0:
-; CHECK-D-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
@@ -3048,8 +3048,8 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32F-NEXT:    li a0, 190
 ; RV32F-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32F-NEXT:    vmv.v.x v10, a0
-; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v12, v8
 ; RV32F-NEXT:    vsrl.vi v8, v12, 23
 ; RV32F-NEXT:    vwsubu.wv v10, v10, v8
@@ -3071,8 +3071,8 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv2i64:
 ; CHECK-D:       # %bb.0:
-; CHECK-D-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
@@ -3196,8 +3196,8 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32F-NEXT:    li a0, 190
 ; RV32F-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32F-NEXT:    vmv.v.x v12, a0
-; RV32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v16, v8
 ; RV32F-NEXT:    vsrl.vi v8, v16, 23
 ; RV32F-NEXT:    vwsubu.wv v12, v12, v8
@@ -3219,8 +3219,8 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv4i64:
 ; CHECK-D:       # %bb.0:
-; CHECK-D-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
@@ -3345,8 +3345,8 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32F-NEXT:    li a0, 190
 ; RV32F-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32F-NEXT:    vmv.v.x v8, a0
-; RV32F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v24, v16
 ; RV32F-NEXT:    vsrl.vi v16, v24, 23
 ; RV32F-NEXT:    vwsubu.wv v8, v8, v16
@@ -3367,8 +3367,8 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv8i64:
 ; CHECK-D:       # %bb.0:
-; CHECK-D-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 2a75e5ce7175..86086f5dc88f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -937,15 +937,15 @@ declare <vscale x 16 x i32> @llvm.vp.ctlz.nxv16i32(<vscale x 16 x i32>, i1 immar
 define <vscale x 16 x i32> @vp_ctlz_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv16i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    li a1, 158
-; CHECK-NEXT:    vrsub.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vminu.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 158
+; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i32:
@@ -960,15 +960,15 @@ define <vscale x 16 x i32> @vp_ctlz_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 define <vscale x 16 x i32> @vp_ctlz_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv16i32_unmasked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-NEXT:    li a1, 158
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vminu.vx v8, v8, a1
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 158
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i32_unmasked:
@@ -985,16 +985,16 @@ declare <vscale x 1 x i64> @llvm.vp.ctlz.nxv1i64(<vscale x 1 x i64>, i1 immarg,
 define <vscale x 1 x i64> @vp_ctlz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv1i64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i64:
@@ -1009,16 +1009,16 @@ define <vscale x 1 x i64> @vp_ctlz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x
 define <vscale x 1 x i64> @vp_ctlz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv1i64_unmasked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a1
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i64_unmasked:
@@ -1035,16 +1035,16 @@ declare <vscale x 2 x i64> @llvm.vp.ctlz.nxv2i64(<vscale x 2 x i64>, i1 immarg,
 define <vscale x 2 x i64> @vp_ctlz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv2i64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv2i64:
@@ -1059,16 +1059,16 @@ define <vscale x 2 x i64> @vp_ctlz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x
 define <vscale x 2 x i64> @vp_ctlz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv2i64_unmasked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a1
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv2i64_unmasked:
@@ -1085,16 +1085,16 @@ declare <vscale x 4 x i64> @llvm.vp.ctlz.nxv4i64(<vscale x 4 x i64>, i1 immarg,
 define <vscale x 4 x i64> @vp_ctlz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv4i64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv4i64:
@@ -1109,16 +1109,16 @@ define <vscale x 4 x i64> @vp_ctlz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x
 define <vscale x 4 x i64> @vp_ctlz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv4i64_unmasked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a1
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv4i64_unmasked:
@@ -1135,16 +1135,16 @@ declare <vscale x 7 x i64> @llvm.vp.ctlz.nxv7i64(<vscale x 7 x i64>, i1 immarg,
 define <vscale x 7 x i64> @vp_ctlz_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv7i64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv7i64:
@@ -1159,16 +1159,16 @@ define <vscale x 7 x i64> @vp_ctlz_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x
 define <vscale x 7 x i64> @vp_ctlz_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv7i64_unmasked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a1
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv7i64_unmasked:
@@ -1185,16 +1185,16 @@ declare <vscale x 8 x i64> @llvm.vp.ctlz.nxv8i64(<vscale x 8 x i64>, i1 immarg,
 define <vscale x 8 x i64> @vp_ctlz_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv8i64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv8i64:
@@ -1209,16 +1209,16 @@ define <vscale x 8 x i64> @vp_ctlz_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x
 define <vscale x 8 x i64> @vp_ctlz_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_nxv8i64_unmasked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vminu.vx v8, v8, a1
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv8i64_unmasked:
@@ -1244,10 +1244,10 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a2, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a3
 ; CHECK-NEXT:    li a2, 52
 ; CHECK-NEXT:    vsrl.vx v16, v16, a2, v0.t
 ; CHECK-NEXT:    li a3, 1086
@@ -1258,14 +1258,14 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB46_2:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vrsub.vx v8, v8, a3, v0.t
 ; CHECK-NEXT:    vminu.vx v8, v8, a4, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i64:
@@ -1301,10 +1301,10 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a2, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a3
 ; CHECK-NEXT:    li a2, 52
 ; CHECK-NEXT:    vsrl.vx v16, v16, a2
 ; CHECK-NEXT:    li a3, 1086
@@ -1315,13 +1315,13 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB47_2:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2
 ; CHECK-NEXT:    vrsub.vx v8, v8, a3
 ; CHECK-NEXT:    vminu.vx v8, v8, a4
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i64_unmasked:
@@ -2198,13 +2198,13 @@ define <vscale x 8 x i32> @vp_ctlz_zero_undef_nxv8i32_unmasked(<vscale x 8 x i32
 define <vscale x 16 x i32> @vp_ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    li a1, 158
-; CHECK-NEXT:    vrsub.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 158
+; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i32:
@@ -2219,13 +2219,13 @@ define <vscale x 16 x i32> @vp_ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va,
 define <vscale x 16 x i32> @vp_ctlz_zero_undef_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i32_unmasked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-NEXT:    li a1, 158
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 158
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i32_unmasked:
@@ -2241,14 +2241,14 @@ define <vscale x 16 x i32> @vp_ctlz_zero_undef_nxv16i32_unmasked(<vscale x 16 x
 define <vscale x 1 x i64> @vp_ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i64:
@@ -2263,14 +2263,14 @@ define <vscale x 1 x i64> @vp_ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va, <v
 define <vscale x 1 x i64> @vp_ctlz_zero_undef_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i64_unmasked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i64_unmasked:
@@ -2286,14 +2286,14 @@ define <vscale x 1 x i64> @vp_ctlz_zero_undef_nxv1i64_unmasked(<vscale x 1 x i64
 define <vscale x 2 x i64> @vp_ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv2i64:
@@ -2308,14 +2308,14 @@ define <vscale x 2 x i64> @vp_ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va, <v
 define <vscale x 2 x i64> @vp_ctlz_zero_undef_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv2i64_unmasked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv2i64_unmasked:
@@ -2331,14 +2331,14 @@ define <vscale x 2 x i64> @vp_ctlz_zero_undef_nxv2i64_unmasked(<vscale x 2 x i64
 define <vscale x 4 x i64> @vp_ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv4i64:
@@ -2353,14 +2353,14 @@ define <vscale x 4 x i64> @vp_ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va, <v
 define <vscale x 4 x i64> @vp_ctlz_zero_undef_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv4i64_unmasked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv4i64_unmasked:
@@ -2376,14 +2376,14 @@ define <vscale x 4 x i64> @vp_ctlz_zero_undef_nxv4i64_unmasked(<vscale x 4 x i64
 define <vscale x 7 x i64> @vp_ctlz_zero_undef_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv7i64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv7i64:
@@ -2398,14 +2398,14 @@ define <vscale x 7 x i64> @vp_ctlz_zero_undef_nxv7i64(<vscale x 7 x i64> %va, <v
 define <vscale x 7 x i64> @vp_ctlz_zero_undef_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv7i64_unmasked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv7i64_unmasked:
@@ -2421,14 +2421,14 @@ define <vscale x 7 x i64> @vp_ctlz_zero_undef_nxv7i64_unmasked(<vscale x 7 x i64
 define <vscale x 8 x i64> @vp_ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv8i64:
@@ -2443,14 +2443,14 @@ define <vscale x 8 x i64> @vp_ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va, <v
 define <vscale x 8 x i64> @vp_ctlz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv8i64_unmasked:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    li a1, 52
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1086
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1086
+; CHECK-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv8i64_unmasked:
@@ -2474,10 +2474,10 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a2, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a3
 ; CHECK-NEXT:    li a2, 52
 ; CHECK-NEXT:    vsrl.vx v16, v16, a2, v0.t
 ; CHECK-NEXT:    li a3, 1086
@@ -2486,13 +2486,13 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB94_2:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vrsub.vx v8, v8, a3, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i64:
@@ -2528,10 +2528,10 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a2, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a3
 ; CHECK-NEXT:    li a2, 52
 ; CHECK-NEXT:    vsrl.vx v16, v16, a2
 ; CHECK-NEXT:    li a3, 1086
@@ -2540,12 +2540,12 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB95_2:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2
 ; CHECK-NEXT:    vrsub.vx v8, v8, a3
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i64_unmasked:
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
index d13f4d2dca1f..479664c6f5f6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
@@ -1219,8 +1219,8 @@ define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32F-NEXT:    vmseq.vx v0, v8, zero
 ; RV32F-NEXT:    vrsub.vi v9, v8, 0
 ; RV32F-NEXT:    vand.vv v8, v8, v9
-; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v9, v8
 ; RV32F-NEXT:    vsrl.vi v8, v9, 23
 ; RV32F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
@@ -1237,8 +1237,8 @@ define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64F-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64F-NEXT:    vrsub.vi v9, v8, 0
 ; RV64F-NEXT:    vand.vv v9, v8, v9
-; RV64F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64F-NEXT:    fsrmi a0, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64F-NEXT:    vfncvt.f.xu.w v10, v9
 ; RV64F-NEXT:    vsrl.vi v9, v10, 23
 ; RV64F-NEXT:    li a1, 127
@@ -1381,8 +1381,8 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32F-NEXT:    vmseq.vx v0, v8, zero
 ; RV32F-NEXT:    vrsub.vi v10, v8, 0
 ; RV32F-NEXT:    vand.vv v8, v8, v10
-; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v10, v8
 ; RV32F-NEXT:    vsrl.vi v8, v10, 23
 ; RV32F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
@@ -1399,8 +1399,8 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64F-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV64F-NEXT:    vrsub.vi v10, v8, 0
 ; RV64F-NEXT:    vand.vv v10, v8, v10
-; RV64F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV64F-NEXT:    fsrmi a0, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV64F-NEXT:    vfncvt.f.xu.w v12, v10
 ; RV64F-NEXT:    vsrl.vi v10, v12, 23
 ; RV64F-NEXT:    li a1, 127
@@ -1543,8 +1543,8 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32F-NEXT:    vmseq.vx v0, v8, zero
 ; RV32F-NEXT:    vrsub.vi v12, v8, 0
 ; RV32F-NEXT:    vand.vv v8, v8, v12
-; RV32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v12, v8
 ; RV32F-NEXT:    vsrl.vi v8, v12, 23
 ; RV32F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
@@ -1561,8 +1561,8 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64F-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV64F-NEXT:    vrsub.vi v12, v8, 0
 ; RV64F-NEXT:    vand.vv v12, v8, v12
-; RV64F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV64F-NEXT:    fsrmi a0, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV64F-NEXT:    vfncvt.f.xu.w v16, v12
 ; RV64F-NEXT:    vsrl.vi v12, v16, 23
 ; RV64F-NEXT:    li a1, 127
@@ -1705,8 +1705,8 @@ define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32F-NEXT:    vmseq.vx v0, v8, zero
 ; RV32F-NEXT:    vrsub.vi v16, v8, 0
 ; RV32F-NEXT:    vand.vv v8, v8, v16
-; RV32F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v16, v8
 ; RV32F-NEXT:    vsrl.vi v8, v16, 23
 ; RV32F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1723,8 +1723,8 @@ define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64F-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV64F-NEXT:    vrsub.vi v16, v8, 0
 ; RV64F-NEXT:    vand.vv v16, v8, v16
-; RV64F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV64F-NEXT:    fsrmi a0, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV64F-NEXT:    vfncvt.f.xu.w v24, v16
 ; RV64F-NEXT:    vsrl.vi v16, v24, 23
 ; RV64F-NEXT:    li a1, 127
@@ -2892,8 +2892,8 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32F-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32F-NEXT:    vrsub.vi v9, v8, 0
 ; RV32F-NEXT:    vand.vv v8, v8, v9
-; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v9, v8
 ; RV32F-NEXT:    vsrl.vi v8, v9, 23
 ; RV32F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
@@ -2908,8 +2908,8 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64F-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64F-NEXT:    vrsub.vi v9, v8, 0
 ; RV64F-NEXT:    vand.vv v8, v8, v9
-; RV64F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64F-NEXT:    fsrmi a0, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64F-NEXT:    vfncvt.f.xu.w v9, v8
 ; RV64F-NEXT:    vsrl.vi v9, v9, 23
 ; RV64F-NEXT:    li a1, 127
@@ -3026,8 +3026,8 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32F-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32F-NEXT:    vrsub.vi v10, v8, 0
 ; RV32F-NEXT:    vand.vv v8, v8, v10
-; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v10, v8
 ; RV32F-NEXT:    vsrl.vi v8, v10, 23
 ; RV32F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
@@ -3042,8 +3042,8 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64F-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV64F-NEXT:    vrsub.vi v10, v8, 0
 ; RV64F-NEXT:    vand.vv v8, v8, v10
-; RV64F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV64F-NEXT:    fsrmi a0, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV64F-NEXT:    vfncvt.f.xu.w v10, v8
 ; RV64F-NEXT:    vsrl.vi v10, v10, 23
 ; RV64F-NEXT:    li a1, 127
@@ -3160,8 +3160,8 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32F-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32F-NEXT:    vrsub.vi v12, v8, 0
 ; RV32F-NEXT:    vand.vv v8, v8, v12
-; RV32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v12, v8
 ; RV32F-NEXT:    vsrl.vi v8, v12, 23
 ; RV32F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
@@ -3176,8 +3176,8 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64F-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV64F-NEXT:    vrsub.vi v12, v8, 0
 ; RV64F-NEXT:    vand.vv v8, v8, v12
-; RV64F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV64F-NEXT:    fsrmi a0, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV64F-NEXT:    vfncvt.f.xu.w v12, v8
 ; RV64F-NEXT:    vsrl.vi v12, v12, 23
 ; RV64F-NEXT:    li a1, 127
@@ -3294,8 +3294,8 @@ define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32F-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32F-NEXT:    vrsub.vi v16, v8, 0
 ; RV32F-NEXT:    vand.vv v8, v8, v16
-; RV32F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v16, v8
 ; RV32F-NEXT:    vsrl.vi v8, v16, 23
 ; RV32F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -3310,8 +3310,8 @@ define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64F-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV64F-NEXT:    vrsub.vi v16, v8, 0
 ; RV64F-NEXT:    vand.vv v8, v8, v16
-; RV64F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV64F-NEXT:    fsrmi a0, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV64F-NEXT:    vfncvt.f.xu.w v16, v8
 ; RV64F-NEXT:    vsrl.vi v16, v16, 23
 ; RV64F-NEXT:    li a1, 127
diff --git a/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll
index ee9ad097b442..8c63c2d4be8c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll
@@ -677,8 +677,8 @@ define <vscale x 1 x i16> @ceil_nxv1f64_to_ui16(<vscale x 1 x double> %x) {
 define <vscale x 1 x i32> @ceil_nxv1f64_to_si32(<vscale x 1 x double> %x) {
 ; RV32-LABEL: ceil_nxv1f64_to_si32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    vfncvt.x.f.w v9, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vmv1r.v v8, v9
@@ -686,8 +686,8 @@ define <vscale x 1 x i32> @ceil_nxv1f64_to_si32(<vscale x 1 x double> %x) {
 ;
 ; RV64-LABEL: ceil_nxv1f64_to_si32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    vfncvt.x.f.w v9, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vmv1r.v v8, v9
@@ -700,8 +700,8 @@ define <vscale x 1 x i32> @ceil_nxv1f64_to_si32(<vscale x 1 x double> %x) {
 define <vscale x 1 x i32> @ceil_nxv1f64_to_ui32(<vscale x 1 x double> %x) {
 ; RV32-LABEL: ceil_nxv1f64_to_ui32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    vfncvt.xu.f.w v9, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vmv1r.v v8, v9
@@ -709,8 +709,8 @@ define <vscale x 1 x i32> @ceil_nxv1f64_to_ui32(<vscale x 1 x double> %x) {
 ;
 ; RV64-LABEL: ceil_nxv1f64_to_ui32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    vfncvt.xu.f.w v9, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vmv1r.v v8, v9
@@ -723,16 +723,16 @@ define <vscale x 1 x i32> @ceil_nxv1f64_to_ui32(<vscale x 1 x double> %x) {
 define <vscale x 1 x i64> @ceil_nxv1f64_to_si64(<vscale x 1 x double> %x) {
 ; RV32-LABEL: ceil_nxv1f64_to_si64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vfcvt.x.f.v v8, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ceil_nxv1f64_to_si64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV64-NEXT:    vfcvt.x.f.v v8, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    ret
@@ -744,16 +744,16 @@ define <vscale x 1 x i64> @ceil_nxv1f64_to_si64(<vscale x 1 x double> %x) {
 define <vscale x 1 x i64> @ceil_nxv1f64_to_ui64(<vscale x 1 x double> %x) {
 ; RV32-LABEL: ceil_nxv1f64_to_ui64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vfcvt.xu.f.v v8, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ceil_nxv1f64_to_ui64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV64-NEXT:    vfcvt.xu.f.v v8, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    ret
@@ -951,8 +951,8 @@ define <vscale x 4 x i16> @ceil_nxv4f64_to_ui16(<vscale x 4 x double> %x) {
 define <vscale x 4 x i32> @ceil_nxv4f64_to_si32(<vscale x 4 x double> %x) {
 ; RV32-LABEL: ceil_nxv4f64_to_si32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vfncvt.x.f.w v12, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vmv.v.v v8, v12
@@ -960,8 +960,8 @@ define <vscale x 4 x i32> @ceil_nxv4f64_to_si32(<vscale x 4 x double> %x) {
 ;
 ; RV64-LABEL: ceil_nxv4f64_to_si32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV64-NEXT:    vfncvt.x.f.w v12, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vmv.v.v v8, v12
@@ -974,8 +974,8 @@ define <vscale x 4 x i32> @ceil_nxv4f64_to_si32(<vscale x 4 x double> %x) {
 define <vscale x 4 x i32> @ceil_nxv4f64_to_ui32(<vscale x 4 x double> %x) {
 ; RV32-LABEL: ceil_nxv4f64_to_ui32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vfncvt.xu.f.w v12, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vmv.v.v v8, v12
@@ -983,8 +983,8 @@ define <vscale x 4 x i32> @ceil_nxv4f64_to_ui32(<vscale x 4 x double> %x) {
 ;
 ; RV64-LABEL: ceil_nxv4f64_to_ui32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV64-NEXT:    vfncvt.xu.f.w v12, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vmv.v.v v8, v12
@@ -997,16 +997,16 @@ define <vscale x 4 x i32> @ceil_nxv4f64_to_ui32(<vscale x 4 x double> %x) {
 define <vscale x 4 x i64> @ceil_nxv4f64_to_si64(<vscale x 4 x double> %x) {
 ; RV32-LABEL: ceil_nxv4f64_to_si64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vfcvt.x.f.v v8, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ceil_nxv4f64_to_si64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV64-NEXT:    vfcvt.x.f.v v8, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    ret
@@ -1018,16 +1018,16 @@ define <vscale x 4 x i64> @ceil_nxv4f64_to_si64(<vscale x 4 x double> %x) {
 define <vscale x 4 x i64> @ceil_nxv4f64_to_ui64(<vscale x 4 x double> %x) {
 ; RV32-LABEL: ceil_nxv4f64_to_ui64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vfcvt.xu.f.v v8, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ceil_nxv4f64_to_ui64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV64-NEXT:    vfcvt.xu.f.v v8, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index e15e6452163b..4f1fcfbe8cc5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple riscv32 -mattr=+m,+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple riscv64 -mattr=+m,+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 4 x i32> @extract_nxv8i32_nxv4i32_0(<vscale x 8 x i32> %vec) {
 ; CHECK-LABEL: extract_nxv8i32_nxv4i32_0:
@@ -481,6 +481,60 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
   ret <vscale x 6 x half> %res
 }
 
+define <vscale x 2 x bfloat> @extract_nxv2bf16_nxv16bf16_0(<vscale x 16 x bfloat> %vec) {
+; CHECK-LABEL: extract_nxv2bf16_nxv16bf16_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret
+  %c = call <vscale x 2 x bfloat> @llvm.vector.extract.nxv2bf16.nxv16bf16(<vscale x 16 x bfloat> %vec, i64 0)
+  ret <vscale x 2 x bfloat> %c
+}
+
+define <vscale x 2 x bfloat> @extract_nxv2bf16_nxv16bf16_2(<vscale x 16 x bfloat> %vec) {
+; CHECK-LABEL: extract_nxv2bf16_nxv16bf16_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %c = call <vscale x 2 x bfloat> @llvm.vector.extract.nxv2bf16.nxv16bf16(<vscale x 16 x bfloat> %vec, i64 2)
+  ret <vscale x 2 x bfloat> %c
+}
+
+define <vscale x 2 x bfloat> @extract_nxv2bf16_nxv16bf16_4(<vscale x 16 x bfloat> %vec) {
+; CHECK-LABEL: extract_nxv2bf16_nxv16bf16_4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %c = call <vscale x 2 x bfloat> @llvm.vector.extract.nxv2bf16.nxv16bf16(<vscale x 16 x bfloat> %vec, i64 4)
+  ret <vscale x 2 x bfloat> %c
+}
+
+define <vscale x 6 x bfloat> @extract_nxv6bf16_nxv12bf16_0(<vscale x 12 x bfloat> %in) {
+; CHECK-LABEL: extract_nxv6bf16_nxv12bf16_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret
+  %res = call <vscale x 6 x bfloat> @llvm.vector.extract.nxv6bf16.nxv12bf16(<vscale x 12 x bfloat> %in, i64 0)
+  ret <vscale x 6 x bfloat> %res
+}
+
+define <vscale x 6 x bfloat> @extract_nxv6bf16_nxv12bf16_6(<vscale x 12 x bfloat> %in) {
+; CHECK-LABEL: extract_nxv6bf16_nxv12bf16_6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v13, v10, a0
+; CHECK-NEXT:    vslidedown.vx v12, v9, a0
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v12, v10, a0
+; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    ret
+  %res = call <vscale x 6 x bfloat> @llvm.vector.extract.nxv6bf16.nxv12bf16(<vscale x 12 x bfloat> %in, i64 6)
+  ret <vscale x 6 x bfloat> %res
+}
+
 declare <vscale x 6 x half> @llvm.vector.extract.nxv6f16.nxv12f16(<vscale x 12 x half>, i64)
 
 declare <vscale x 1 x i8> @llvm.vector.extract.nxv1i8.nxv4i8(<vscale x 4 x i8> %vec, i64 %idx)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
index 5d024f140fd5..3e2af7e8267b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
@@ -19,8 +19,8 @@ define <2 x half> @vp_ceil_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -38,8 +38,8 @@ define <2 x half> @vp_ceil_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -101,8 +101,8 @@ define <4 x half> @vp_ceil_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -120,8 +120,8 @@ define <4 x half> @vp_ceil_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -183,8 +183,8 @@ define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -203,8 +203,8 @@ define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -268,8 +268,8 @@ define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -289,8 +289,8 @@ define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -353,8 +353,8 @@ define <2 x float> @vp_ceil_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -395,8 +395,8 @@ define <4 x float> @vp_ceil_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -438,8 +438,8 @@ define <8 x float> @vp_ceil_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -482,8 +482,8 @@ define <16 x float> @vp_ceil_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -525,8 +525,8 @@ define <2 x double> @vp_ceil_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -568,8 +568,8 @@ define <4 x double> @vp_ceil_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -612,8 +612,8 @@ define <8 x double> @vp_ceil_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -656,8 +656,8 @@ define <15 x double> @vp_ceil_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -700,8 +700,8 @@ define <16 x double> @vp_ceil_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -764,8 +764,8 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a1, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
@@ -788,8 +788,8 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
index 277146cc1403..49e5a1c79c43 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
@@ -353,8 +353,8 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32F-NEXT:    vle64.v v8, (a0)
 ; RV32F-NEXT:    li a1, 190
 ; RV32F-NEXT:    vmv.v.x v9, a1
-; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    fsrmi a1, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v10, v8
 ; RV32F-NEXT:    fsrm a1
 ; RV32F-NEXT:    vsrl.vi v8, v10, 23
@@ -762,8 +762,8 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32F-NEXT:    vle64.v v8, (a0)
 ; RV32F-NEXT:    li a1, 190
 ; RV32F-NEXT:    vmv.v.x v10, a1
-; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    fsrmi a1, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v12, v8
 ; RV32F-NEXT:    fsrm a1
 ; RV32F-NEXT:    vsrl.vi v8, v12, 23
@@ -1152,8 +1152,8 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32F-NEXT:    vle64.v v8, (a0)
 ; RV32F-NEXT:    li a1, 190
 ; RV32F-NEXT:    vmv.v.x v9, a1
-; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    fsrmi a1, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v10, v8
 ; RV32F-NEXT:    fsrm a1
 ; RV32F-NEXT:    vsrl.vi v8, v10, 23
@@ -1537,8 +1537,8 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32F-NEXT:    vle64.v v8, (a0)
 ; RV32F-NEXT:    li a1, 190
 ; RV32F-NEXT:    vmv.v.x v10, a1
-; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    fsrmi a1, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v12, v8
 ; RV32F-NEXT:    fsrm a1
 ; RV32F-NEXT:    vsrl.vi v8, v12, 23
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
index 8c8da6d1e003..ea3a78ae0bec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
@@ -336,8 +336,8 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32F-NEXT:    vle64.v v8, (a0)
 ; RV32F-NEXT:    vrsub.vi v9, v8, 0
 ; RV32F-NEXT:    vand.vv v9, v8, v9
-; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    fsrmi a1, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v10, v9
 ; RV32F-NEXT:    fsrm a1
 ; RV32F-NEXT:    vsrl.vi v9, v10, 23
@@ -357,8 +357,8 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64F-NEXT:    vle64.v v8, (a0)
 ; RV64F-NEXT:    vrsub.vi v9, v8, 0
 ; RV64F-NEXT:    vand.vv v9, v8, v9
-; RV64F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64F-NEXT:    fsrmi a1, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64F-NEXT:    vfncvt.f.xu.w v10, v9
 ; RV64F-NEXT:    fsrm a1
 ; RV64F-NEXT:    vsrl.vi v9, v10, 23
@@ -737,8 +737,8 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32F-NEXT:    vle64.v v8, (a0)
 ; RV32F-NEXT:    vrsub.vi v10, v8, 0
 ; RV32F-NEXT:    vand.vv v10, v8, v10
-; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    fsrmi a1, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v12, v10
 ; RV32F-NEXT:    fsrm a1
 ; RV32F-NEXT:    vsrl.vi v10, v12, 23
@@ -758,8 +758,8 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64F-NEXT:    vle64.v v8, (a0)
 ; RV64F-NEXT:    vrsub.vi v10, v8, 0
 ; RV64F-NEXT:    vand.vv v10, v8, v10
-; RV64F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV64F-NEXT:    fsrmi a1, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV64F-NEXT:    vfncvt.f.xu.w v12, v10
 ; RV64F-NEXT:    fsrm a1
 ; RV64F-NEXT:    vsrl.vi v10, v12, 23
@@ -1115,8 +1115,8 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32F-NEXT:    vle64.v v8, (a0)
 ; RV32F-NEXT:    vrsub.vi v9, v8, 0
 ; RV32F-NEXT:    vand.vv v8, v8, v9
-; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    fsrmi a1, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v9, v8
 ; RV32F-NEXT:    fsrm a1
 ; RV32F-NEXT:    vsrl.vi v8, v9, 23
@@ -1133,8 +1133,8 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64F-NEXT:    vle64.v v8, (a0)
 ; RV64F-NEXT:    vrsub.vi v9, v8, 0
 ; RV64F-NEXT:    vand.vv v8, v8, v9
-; RV64F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64F-NEXT:    fsrmi a1, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64F-NEXT:    vfncvt.f.xu.w v9, v8
 ; RV64F-NEXT:    fsrm a1
 ; RV64F-NEXT:    vsrl.vi v8, v9, 23
@@ -1486,8 +1486,8 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32F-NEXT:    vle64.v v8, (a0)
 ; RV32F-NEXT:    vrsub.vi v10, v8, 0
 ; RV32F-NEXT:    vand.vv v8, v8, v10
-; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    fsrmi a1, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32F-NEXT:    vfncvt.f.xu.w v10, v8
 ; RV32F-NEXT:    fsrm a1
 ; RV32F-NEXT:    vsrl.vi v8, v10, 23
@@ -1504,8 +1504,8 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64F-NEXT:    vle64.v v8, (a0)
 ; RV64F-NEXT:    vrsub.vi v10, v8, 0
 ; RV64F-NEXT:    vand.vv v8, v8, v10
-; RV64F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV64F-NEXT:    fsrmi a1, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV64F-NEXT:    vfncvt.f.xu.w v10, v8
 ; RV64F-NEXT:    fsrm a1
 ; RV64F-NEXT:    vsrl.vi v8, v10, 23
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
index 6c2be509f7c2..287dd510674d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
@@ -19,8 +19,8 @@ define <2 x half> @vp_floor_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -38,8 +38,8 @@ define <2 x half> @vp_floor_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -101,8 +101,8 @@ define <4 x half> @vp_floor_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -120,8 +120,8 @@ define <4 x half> @vp_floor_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -183,8 +183,8 @@ define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -203,8 +203,8 @@ define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -268,8 +268,8 @@ define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -289,8 +289,8 @@ define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -353,8 +353,8 @@ define <2 x float> @vp_floor_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -395,8 +395,8 @@ define <4 x float> @vp_floor_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -438,8 +438,8 @@ define <8 x float> @vp_floor_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -482,8 +482,8 @@ define <16 x float> @vp_floor_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -525,8 +525,8 @@ define <2 x double> @vp_floor_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -568,8 +568,8 @@ define <4 x double> @vp_floor_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -612,8 +612,8 @@ define <8 x double> @vp_floor_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -656,8 +656,8 @@ define <15 x double> @vp_floor_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -700,8 +700,8 @@ define <16 x double> @vp_floor_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -764,8 +764,8 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a1, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
@@ -788,8 +788,8 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
index 51ac27acaf47..48cc3f17a626 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
 
 declare <2 x float> @llvm.vp.fpext.v2f32.v2f16(<2 x half>, <2 x i1>, i32)
 
@@ -120,3 +120,53 @@ define <32 x double> @vfpext_v32f32_v32f64(<32 x float> %a, <32 x i1> %m, i32 ze
   %v = call <32 x double> @llvm.vp.fpext.v32f64.v32f32(<32 x float> %a, <32 x i1> %m, i32 %vl)
   ret <32 x double> %v
 }
+
+declare <2 x float> @llvm.vp.fpext.v2f32.v2bf16(<2 x bfloat>, <2 x i1>, i32)
+
+define <2 x float> @vfpext_v2bf16_v2f32(<2 x bfloat> %a, <2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vfpext_v2bf16_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %v = call <2 x float> @llvm.vp.fpext.v2f32.v2bf16(<2 x bfloat> %a, <2 x i1> %m, i32 %vl)
+  ret <2 x float> %v
+}
+
+define <2 x float> @vfpext_v2bf16_v2f32_unmasked(<2 x bfloat> %a, i32 zeroext %vl) {
+; CHECK-LABEL: vfpext_v2bf16_v2f32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %v = call <2 x float> @llvm.vp.fpext.v2f32.v2bf16(<2 x bfloat> %a, <2 x i1> shufflevector (<2 x i1> insertelement (<2 x i1> undef, i1 true, i32 0), <2 x i1> undef, <2 x i32> zeroinitializer), i32 %vl)
+  ret <2 x float> %v
+}
+
+declare <2 x double> @llvm.vp.fpext.v2f64.v2bf16(<2 x bfloat>, <2 x i1>, i32)
+
+define <2 x double> @vfpext_v2bf16_v2f64(<2 x bfloat> %a, <2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vfpext_v2bf16_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x double> @llvm.vp.fpext.v2f64.v2bf16(<2 x bfloat> %a, <2 x i1> %m, i32 %vl)
+  ret <2 x double> %v
+}
+
+define <2 x double> @vfpext_v2bf16_v2f64_unmasked(<2 x bfloat> %a, i32 zeroext %vl) {
+; CHECK-LABEL: vfpext_v2bf16_v2f64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v9
+; CHECK-NEXT:    ret
+  %v = call <2 x double> @llvm.vp.fpext.v2f64.v2bf16(<2 x bfloat> %a, <2 x i1> shufflevector (<2 x i1> insertelement (<2 x i1> undef, i1 true, i32 0), <2 x i1> undef, <2 x i32> zeroinitializer), i32 %vl)
+  ret <2 x double> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
index de11f9e8a9fa..d890bf5412f9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
 
 
 declare <2 x half> @llvm.vp.fptrunc.v2f16.v2f32(<2 x float>, <2 x i1>, i32)
@@ -122,3 +122,53 @@ define <32 x float> @vfptrunc_v32f32_v32f64(<32 x double> %a, <32 x i1> %m, i32
   %v = call <32 x float> @llvm.vp.fptrunc.v32f64.v32f32(<32 x double> %a, <32 x i1> %m, i32 %vl)
   ret <32 x float> %v
 }
+
+declare <2 x bfloat> @llvm.vp.fptrunc.v2bf16.v2f32(<2 x float>, <2 x i1>, i32)
+
+define <2 x bfloat> @vfptrunc_v2bf16_v2f32(<2 x float> %a, <2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vfptrunc_v2bf16_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v9, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %v = call <2 x bfloat> @llvm.vp.fptrunc.v2bf16.v2f32(<2 x float> %a, <2 x i1> %m, i32 %vl)
+  ret <2 x bfloat> %v
+}
+
+define <2 x bfloat> @vfptrunc_v2bf16_v2f32_unmasked(<2 x float> %a, i32 zeroext %vl) {
+; CHECK-LABEL: vfptrunc_v2bf16_v2f32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %v = call <2 x bfloat> @llvm.vp.fptrunc.v2bf16.v2f32(<2 x float> %a, <2 x i1> shufflevector (<2 x i1> insertelement (<2 x i1> undef, i1 true, i32 0), <2 x i1> undef, <2 x i32> zeroinitializer), i32 %vl)
+  ret <2 x bfloat> %v
+}
+
+declare <2 x bfloat> @llvm.vp.fptrunc.v2bf16.v2f64(<2 x double>, <2 x i1>, i32)
+
+define <2 x bfloat> @vfptrunc_v2bf16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vfptrunc_v2bf16_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vfncvt.rod.f.f.w v9, v8, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %v = call <2 x bfloat> @llvm.vp.fptrunc.v2bf16.v2f64(<2 x double> %a, <2 x i1> %m, i32 %vl)
+  ret <2 x bfloat> %v
+}
+
+define <2 x bfloat> @vfptrunc_v2bf16_v2f64_unmasked(<2 x double> %a, i32 zeroext %vl) {
+; CHECK-LABEL: vfptrunc_v2bf16_v2f64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vfncvt.rod.f.f.w v9, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT:    ret
+  %v = call <2 x bfloat> @llvm.vp.fptrunc.v2bf16.v2f64(<2 x double> %a, <2 x i1> shufflevector (<2 x i1> insertelement (<2 x i1> undef, i1 true, i32 0), <2 x i1> undef, <2 x i32> zeroinitializer), i32 %vl)
+  ret <2 x bfloat> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll
index 38aee567e2b5..fbe8bcbc0d3c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s
+; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s
 
 define void @v2i8(ptr %p, ptr %q) {
 ; CHECK-LABEL: v2i8:
@@ -301,3 +301,15 @@ define void @v2i8_volatile_store(ptr %p, ptr %q) {
   store volatile <2 x i8> %v, ptr %q
   ret void
 }
+
+define void @v8bf16(ptr %p, ptr %q) {
+; CHECK-LABEL: v8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %v = load <8 x bfloat>, ptr %p
+  store <8 x bfloat> %v, ptr %q
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
index 791e6eb5ff30..d80d75d3d5d0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s
+; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s
 
 define <5 x i8> @load_v5i8(ptr %p) {
 ; CHECK-LABEL: load_v5i8:
@@ -181,3 +181,13 @@ define <16 x i64> @exact_vlen_i64_m8(ptr %p) vscale_range(2,2) {
   %v = load <16 x i64>, ptr %p
   ret <16 x i64> %v
 }
+
+define <8 x bfloat> @load_v8bf16(ptr %p) {
+; CHECK-LABEL: load_v8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %x = load <8 x bfloat>, ptr %p
+  ret <8 x bfloat> %x
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
index 6f045349423c..716cf7b0f46f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
@@ -19,8 +19,8 @@ define <2 x half> @vp_round_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -38,8 +38,8 @@ define <2 x half> @vp_round_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -101,8 +101,8 @@ define <4 x half> @vp_round_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -120,8 +120,8 @@ define <4 x half> @vp_round_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -183,8 +183,8 @@ define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -203,8 +203,8 @@ define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -268,8 +268,8 @@ define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -289,8 +289,8 @@ define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -353,8 +353,8 @@ define <2 x float> @vp_round_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -395,8 +395,8 @@ define <4 x float> @vp_round_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -438,8 +438,8 @@ define <8 x float> @vp_round_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -482,8 +482,8 @@ define <16 x float> @vp_round_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -525,8 +525,8 @@ define <2 x double> @vp_round_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -568,8 +568,8 @@ define <4 x double> @vp_round_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -612,8 +612,8 @@ define <8 x double> @vp_round_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -656,8 +656,8 @@ define <15 x double> @vp_round_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -700,8 +700,8 @@ define <16 x double> @vp_round_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -764,8 +764,8 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a1, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
@@ -788,8 +788,8 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
index 738d7e37c50b..603f9397dc90 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
@@ -19,8 +19,8 @@ define <2 x half> @vp_roundeven_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -38,8 +38,8 @@ define <2 x half> @vp_roundeven_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -101,8 +101,8 @@ define <4 x half> @vp_roundeven_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -120,8 +120,8 @@ define <4 x half> @vp_roundeven_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -183,8 +183,8 @@ define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -203,8 +203,8 @@ define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -268,8 +268,8 @@ define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -289,8 +289,8 @@ define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -353,8 +353,8 @@ define <2 x float> @vp_roundeven_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -395,8 +395,8 @@ define <4 x float> @vp_roundeven_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -438,8 +438,8 @@ define <8 x float> @vp_roundeven_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -482,8 +482,8 @@ define <16 x float> @vp_roundeven_v16f32(<16 x float> %va, <16 x i1> %m, i32 zer
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -525,8 +525,8 @@ define <2 x double> @vp_roundeven_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -568,8 +568,8 @@ define <4 x double> @vp_roundeven_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -612,8 +612,8 @@ define <8 x double> @vp_roundeven_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -656,8 +656,8 @@ define <15 x double> @vp_roundeven_v15f64(<15 x double> %va, <15 x i1> %m, i32 z
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -700,8 +700,8 @@ define <16 x double> @vp_roundeven_v16f64(<16 x double> %va, <16 x i1> %m, i32 z
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -764,8 +764,8 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a1, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
@@ -788,8 +788,8 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
index 6f5b7875266b..a5adfc36887a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
@@ -19,8 +19,8 @@ define <2 x half> @vp_roundtozero_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -38,8 +38,8 @@ define <2 x half> @vp_roundtozero_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -101,8 +101,8 @@ define <4 x half> @vp_roundtozero_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -120,8 +120,8 @@ define <4 x half> @vp_roundtozero_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -183,8 +183,8 @@ define <8 x half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -203,8 +203,8 @@ define <8 x half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -268,8 +268,8 @@ define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zer
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -289,8 +289,8 @@ define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zer
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -353,8 +353,8 @@ define <2 x float> @vp_roundtozero_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -395,8 +395,8 @@ define <4 x float> @vp_roundtozero_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -438,8 +438,8 @@ define <8 x float> @vp_roundtozero_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -482,8 +482,8 @@ define <16 x float> @vp_roundtozero_v16f32(<16 x float> %va, <16 x i1> %m, i32 z
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -525,8 +525,8 @@ define <2 x double> @vp_roundtozero_v2f64(<2 x double> %va, <2 x i1> %m, i32 zer
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -568,8 +568,8 @@ define <4 x double> @vp_roundtozero_v4f64(<4 x double> %va, <4 x i1> %m, i32 zer
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -612,8 +612,8 @@ define <8 x double> @vp_roundtozero_v8f64(<8 x double> %va, <8 x i1> %m, i32 zer
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -656,8 +656,8 @@ define <15 x double> @vp_roundtozero_v15f64(<15 x double> %va, <15 x i1> %m, i32
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -700,8 +700,8 @@ define <16 x double> @vp_roundtozero_v16f64(<16 x double> %va, <16 x i1> %m, i32
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -764,8 +764,8 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a1, 1
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
@@ -788,8 +788,8 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
index b747d73ce353..6317a4977562 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s
+; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s
 
 define void @store_v5i8(ptr %p, <5 x i8> %v) {
 ; CHECK-LABEL: store_v5i8:
@@ -294,6 +294,16 @@ define void @exact_vlen_i64_m8(ptr %p) vscale_range(2,2) {
   ret void
 }
 
+define void @store_v8bf16(ptr %p, <8 x bfloat> %v) {
+; CHECK-LABEL: store_v8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    ret
+  store <8 x bfloat> %v, ptr %p
+  ret void
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; RV32: {{.*}}
 ; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vaaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vaaddu.ll
index 70b547759938..600290a62515 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vaaddu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vaaddu.ll
@@ -5,8 +5,8 @@
 define <8 x i8> @vaaddu_vv_v8i8_floor(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: vaaddu_vv_v8i8_floor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i8> %x to <8 x i16>
@@ -20,8 +20,8 @@ define <8 x i8> @vaaddu_vv_v8i8_floor(<8 x i8> %x, <8 x i8> %y) {
 define <8 x i8> @vaaddu_vx_v8i8_floor(<8 x i8> %x, i8 %y) {
 ; CHECK-LABEL: vaaddu_vx_v8i8_floor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i8> %x to <8 x i16>
@@ -53,8 +53,8 @@ define <8 x i8> @vaaddu_vv_v8i8_floor_sexti16(<8 x i8> %x, <8 x i8> %y) {
 define <8 x i8> @vaaddu_vv_v8i8_floor_zexti32(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: vaaddu_vv_v8i8_floor_zexti32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i8> %x to <8 x i32>
@@ -83,8 +83,8 @@ define <8 x i8> @vaaddu_vv_v8i8_floor_lshr2(<8 x i8> %x, <8 x i8> %y) {
 define <8 x i16> @vaaddu_vv_v8i16_floor(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: vaaddu_vv_v8i16_floor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i16> %x to <8 x i32>
@@ -98,8 +98,8 @@ define <8 x i16> @vaaddu_vv_v8i16_floor(<8 x i16> %x, <8 x i16> %y) {
 define <8 x i16> @vaaddu_vx_v8i16_floor(<8 x i16> %x, i16 %y) {
 ; CHECK-LABEL: vaaddu_vx_v8i16_floor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i16> %x to <8 x i32>
@@ -115,8 +115,8 @@ define <8 x i16> @vaaddu_vx_v8i16_floor(<8 x i16> %x, i16 %y) {
 define <8 x i32> @vaaddu_vv_v8i32_floor(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-LABEL: vaaddu_vv_v8i32_floor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i32> %x to <8 x i64>
@@ -130,8 +130,8 @@ define <8 x i32> @vaaddu_vv_v8i32_floor(<8 x i32> %x, <8 x i32> %y) {
 define <8 x i32> @vaaddu_vx_v8i32_floor(<8 x i32> %x, i32 %y) {
 ; CHECK-LABEL: vaaddu_vx_v8i32_floor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i32> %x to <8 x i64>
@@ -147,8 +147,8 @@ define <8 x i32> @vaaddu_vx_v8i32_floor(<8 x i32> %x, i32 %y) {
 define <8 x i64> @vaaddu_vv_v8i64_floor(<8 x i64> %x, <8 x i64> %y) {
 ; CHECK-LABEL: vaaddu_vv_v8i64_floor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i64> %x to <8 x i128>
@@ -197,8 +197,8 @@ define <8 x i64> @vaaddu_vx_v8i64_floor(<8 x i64> %x, i64 %y) {
 ;
 ; RV64-LABEL: vaaddu_vx_v8i64_floor:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 2
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vaaddu.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %xzv = zext <8 x i64> %x to <8 x i128>
@@ -214,8 +214,8 @@ define <8 x i64> @vaaddu_vx_v8i64_floor(<8 x i64> %x, i64 %y) {
 define <8 x i8> @vaaddu_vv_v8i8_ceil(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: vaaddu_vv_v8i8_ceil:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i8> %x to <8 x i16>
@@ -230,8 +230,8 @@ define <8 x i8> @vaaddu_vv_v8i8_ceil(<8 x i8> %x, <8 x i8> %y) {
 define <8 x i8> @vaaddu_vx_v8i8_ceil(<8 x i8> %x, i8 %y) {
 ; CHECK-LABEL: vaaddu_vx_v8i8_ceil:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i8> %x to <8 x i16>
@@ -267,8 +267,8 @@ define <8 x i8> @vaaddu_vv_v8i8_ceil_sexti16(<8 x i8> %x, <8 x i8> %y) {
 define <8 x i8> @vaaddu_vv_v8i8_ceil_zexti32(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: vaaddu_vv_v8i8_ceil_zexti32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i8> %x to <8 x i32>
@@ -305,8 +305,8 @@ define <8 x i8> @vaaddu_vv_v8i8_ceil_add2(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vwaddu.vv v10, v8, v9
 ; CHECK-NEXT:    li a0, 2
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v10, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0
@@ -323,8 +323,8 @@ define <8 x i8> @vaaddu_vv_v8i8_ceil_add2(<8 x i8> %x, <8 x i8> %y) {
 define <8 x i16> @vaaddu_vv_v8i16_ceil(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: vaaddu_vv_v8i16_ceil:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i16> %x to <8 x i32>
@@ -339,8 +339,8 @@ define <8 x i16> @vaaddu_vv_v8i16_ceil(<8 x i16> %x, <8 x i16> %y) {
 define <8 x i16> @vaaddu_vx_v8i16_ceil(<8 x i16> %x, i16 %y) {
 ; CHECK-LABEL: vaaddu_vx_v8i16_ceil:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i16> %x to <8 x i32>
@@ -357,8 +357,8 @@ define <8 x i16> @vaaddu_vx_v8i16_ceil(<8 x i16> %x, i16 %y) {
 define <8 x i32> @vaaddu_vv_v8i32_ceil(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-LABEL: vaaddu_vv_v8i32_ceil:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i32> %x to <8 x i64>
@@ -373,8 +373,8 @@ define <8 x i32> @vaaddu_vv_v8i32_ceil(<8 x i32> %x, <8 x i32> %y) {
 define <8 x i32> @vaaddu_vx_v8i32_ceil(<8 x i32> %x, i32 %y) {
 ; CHECK-LABEL: vaaddu_vx_v8i32_ceil:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i32> %x to <8 x i64>
@@ -391,8 +391,8 @@ define <8 x i32> @vaaddu_vx_v8i32_ceil(<8 x i32> %x, i32 %y) {
 define <8 x i64> @vaaddu_vv_v8i64_ceil(<8 x i64> %x, <8 x i64> %y) {
 ; CHECK-LABEL: vaaddu_vv_v8i64_ceil:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %xzv = zext <8 x i64> %x to <8 x i128>
@@ -443,8 +443,8 @@ define <8 x i64> @vaaddu_vx_v8i64_ceil(<8 x i64> %x, i64 %y) {
 ;
 ; RV64-LABEL: vaaddu_vx_v8i64_ceil:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vaaddu.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %xzv = zext <8 x i64> %x to <8 x i128>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfpext-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfpext-constrained-sdnode.ll
index b0e6a6a56051..5d9076208988 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfpext-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfpext-constrained-sdnode.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
 
 declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata)
@@ -114,3 +114,78 @@ define <8 x double> @vfpext_v8f32_v8f64(<8 x float> %va) strictfp {
   %evec = call <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f32(<8 x float> %va, metadata !"fpexcept.strict")
   ret <8 x double> %evec
 }
+
+declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2bf16(<2 x bfloat>, metadata)
+define <2 x float> @vfpext_v2bf16_v2f32(<2 x bfloat> %va) strictfp {
+; CHECK-LABEL: vfpext_v2bf16_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2bf16(<2 x bfloat> %va, metadata !"fpexcept.strict")
+  ret <2 x float> %evec
+}
+
+declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2bf16(<2 x bfloat>, metadata)
+define <2 x double> @vfpext_v2bf16_v2f64(<2 x bfloat> %va) strictfp {
+; CHECK-LABEL: vfpext_v2bf16_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2bf16(<2 x bfloat> %va, metadata !"fpexcept.strict")
+  ret <2 x double> %evec
+}
+
+declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4bf16(<4 x bfloat>, metadata)
+define <4 x float> @vfpext_v4bf16_v4f32(<4 x bfloat> %va) strictfp {
+; CHECK-LABEL: vfpext_v4bf16_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4bf16(<4 x bfloat> %va, metadata !"fpexcept.strict")
+  ret <4 x float> %evec
+}
+
+declare <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4bf16(<4 x bfloat>, metadata)
+define <4 x double> @vfpext_v4bf16_v4f64(<4 x bfloat> %va) strictfp {
+; CHECK-LABEL: vfpext_v4bf16_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10
+; CHECK-NEXT:    ret
+  %evec = call <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4bf16(<4 x bfloat> %va, metadata !"fpexcept.strict")
+  ret <4 x double> %evec
+}
+
+declare <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8bf16(<8 x bfloat>, metadata)
+define <8 x float> @vfpext_v8bf16_v8f32(<8 x bfloat> %va) strictfp {
+; CHECK-LABEL: vfpext_v8bf16_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    ret
+  %evec = call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8bf16(<8 x bfloat> %va, metadata !"fpexcept.strict")
+  ret <8 x float> %evec
+}
+
+declare <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8bf16(<8 x bfloat>, metadata)
+define <8 x double> @vfpext_v8bf16_v8f64(<8 x bfloat> %va) strictfp {
+; CHECK-LABEL: vfpext_v8bf16_v8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v12
+; CHECK-NEXT:    ret
+  %evec = call <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8bf16(<8 x bfloat> %va, metadata !"fpexcept.strict")
+  ret <8 x double> %evec
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptrunc-constrained-sdnode.ll
index fd53113741de..5781223a5326 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptrunc-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptrunc-constrained-sdnode.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+experimental-zvfbfmin -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+experimental-zvfbfmin -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
 
 declare <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(<2 x double>, metadata, metadata)
@@ -118,3 +118,78 @@ define <8 x half> @vfptrunc_v8f32_v8f16(<8 x float> %va) strictfp {
   %evec = call <8 x half> @llvm.experimental.constrained.fptrunc.v8f16.v8f32(<8 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <8 x half> %evec
 }
+
+declare <2 x bfloat> @llvm.experimental.constrained.fptrunc.v2bf16.v2f64(<2 x double>, metadata, metadata)
+define <2 x bfloat> @vfptrunc_v2f64_v2bf16(<2 x double> %va) strictfp {
+; CHECK-LABEL: vfptrunc_v2f64_v2bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vfncvt.rod.f.f.w v9, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT:    ret
+  %evec = call <2 x bfloat> @llvm.experimental.constrained.fptrunc.v2bf16.v2f64(<2 x double> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x bfloat> %evec
+}
+
+declare <2 x bfloat> @llvm.experimental.constrained.fptrunc.v2bf16.v2f32(<2 x float>, metadata, metadata)
+define <2 x bfloat> @vfptrunc_v2f32_v2bf16(<2 x float> %va) strictfp {
+; CHECK-LABEL: vfptrunc_v2f32_v2bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = call <2 x bfloat> @llvm.experimental.constrained.fptrunc.v2bf16.v2f32(<2 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x bfloat> %evec
+}
+
+declare <4 x bfloat> @llvm.experimental.constrained.fptrunc.v4bf16.v4f64(<4 x double>, metadata, metadata)
+define <4 x bfloat> @vfptrunc_v4f64_v4bf16(<4 x double> %va) strictfp {
+; CHECK-LABEL: vfptrunc_v4f64_v4bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vfncvt.rod.f.f.w v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT:    ret
+  %evec = call <4 x bfloat> @llvm.experimental.constrained.fptrunc.v4bf16.v4f64(<4 x double> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <4 x bfloat> %evec
+}
+
+declare <4 x bfloat> @llvm.experimental.constrained.fptrunc.v4bf16.v4f32(<4 x float>, metadata, metadata)
+define <4 x bfloat> @vfptrunc_v4f32_v4bf16(<4 x float> %va) strictfp {
+; CHECK-LABEL: vfptrunc_v4f32_v4bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = call <4 x bfloat> @llvm.experimental.constrained.fptrunc.v4bf16.v4f32(<4 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <4 x bfloat> %evec
+}
+
+declare <8 x bfloat> @llvm.experimental.constrained.fptrunc.v8bf16.v8f64(<8 x double>, metadata, metadata)
+define <8 x bfloat> @vfptrunc_v8f64_v8bf16(<8 x double> %va) strictfp {
+; CHECK-LABEL: vfptrunc_v8f64_v8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfncvt.rod.f.f.w v12, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT:    ret
+  %evec = call <8 x bfloat> @llvm.experimental.constrained.fptrunc.v8bf16.v8f64(<8 x double> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <8 x bfloat> %evec
+}
+
+declare <8 x bfloat> @llvm.experimental.constrained.fptrunc.v8bf16.v8f32(<8 x float>, metadata, metadata)
+define <8 x bfloat> @vfptrunc_v8f32_v8bf16(<8 x float> %va) strictfp {
+; CHECK-LABEL: vfptrunc_v8f32_v8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+  %evec = call <8 x bfloat> @llvm.experimental.constrained.fptrunc.v8bf16.v8f32(<8 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <8 x bfloat> %evec
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll
index 9dcb6d211cb9..b7661bd826fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll
@@ -487,8 +487,8 @@ define <vscale x 1 x i8> @ceil_nxv1f32_to_ui8(<vscale x 1 x float> %x) {
 define <vscale x 1 x i16> @ceil_nxv1f32_to_si16(<vscale x 1 x float> %x) {
 ; RV32-LABEL: ceil_nxv1f32_to_si16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; RV32-NEXT:    vfncvt.x.f.w v9, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vmv1r.v v8, v9
@@ -496,8 +496,8 @@ define <vscale x 1 x i16> @ceil_nxv1f32_to_si16(<vscale x 1 x float> %x) {
 ;
 ; RV64-LABEL: ceil_nxv1f32_to_si16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; RV64-NEXT:    vfncvt.x.f.w v9, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vmv1r.v v8, v9
@@ -510,8 +510,8 @@ define <vscale x 1 x i16> @ceil_nxv1f32_to_si16(<vscale x 1 x float> %x) {
 define <vscale x 1 x i16> @ceil_nxv1f32_to_ui16(<vscale x 1 x float> %x) {
 ; RV32-LABEL: ceil_nxv1f32_to_ui16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; RV32-NEXT:    vfncvt.xu.f.w v9, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vmv1r.v v8, v9
@@ -519,8 +519,8 @@ define <vscale x 1 x i16> @ceil_nxv1f32_to_ui16(<vscale x 1 x float> %x) {
 ;
 ; RV64-LABEL: ceil_nxv1f32_to_ui16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; RV64-NEXT:    vfncvt.xu.f.w v9, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vmv1r.v v8, v9
@@ -533,16 +533,16 @@ define <vscale x 1 x i16> @ceil_nxv1f32_to_ui16(<vscale x 1 x float> %x) {
 define <vscale x 1 x i32> @ceil_nxv1f32_to_si32(<vscale x 1 x float> %x) {
 ; RV32-LABEL: ceil_nxv1f32_to_si32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    vfcvt.x.f.v v8, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ceil_nxv1f32_to_si32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    vfcvt.x.f.v v8, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    ret
@@ -554,16 +554,16 @@ define <vscale x 1 x i32> @ceil_nxv1f32_to_si32(<vscale x 1 x float> %x) {
 define <vscale x 1 x i32> @ceil_nxv1f32_to_ui32(<vscale x 1 x float> %x) {
 ; RV32-LABEL: ceil_nxv1f32_to_ui32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    vfcvt.xu.f.v v8, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ceil_nxv1f32_to_ui32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    vfcvt.xu.f.v v8, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    ret
@@ -575,8 +575,8 @@ define <vscale x 1 x i32> @ceil_nxv1f32_to_ui32(<vscale x 1 x float> %x) {
 define <vscale x 1 x i64> @ceil_nxv1f32_to_si64(<vscale x 1 x float> %x) {
 ; RV32-LABEL: ceil_nxv1f32_to_si64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    vfwcvt.x.f.v v9, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vmv1r.v v8, v9
@@ -584,8 +584,8 @@ define <vscale x 1 x i64> @ceil_nxv1f32_to_si64(<vscale x 1 x float> %x) {
 ;
 ; RV64-LABEL: ceil_nxv1f32_to_si64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    vfwcvt.x.f.v v9, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vmv1r.v v8, v9
@@ -598,8 +598,8 @@ define <vscale x 1 x i64> @ceil_nxv1f32_to_si64(<vscale x 1 x float> %x) {
 define <vscale x 1 x i64> @ceil_nxv1f32_to_ui64(<vscale x 1 x float> %x) {
 ; RV32-LABEL: ceil_nxv1f32_to_ui64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    vfwcvt.xu.f.v v9, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vmv1r.v v8, v9
@@ -607,8 +607,8 @@ define <vscale x 1 x i64> @ceil_nxv1f32_to_ui64(<vscale x 1 x float> %x) {
 ;
 ; RV64-LABEL: ceil_nxv1f32_to_ui64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    vfwcvt.xu.f.v v9, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vmv1r.v v8, v9
@@ -713,8 +713,8 @@ define <vscale x 4 x i8> @ceil_nxv4f32_to_ui8(<vscale x 4 x float> %x) {
 define <vscale x 4 x i16> @ceil_nxv4f32_to_si16(<vscale x 4 x float> %x) {
 ; RV32-LABEL: ceil_nxv4f32_to_si16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; RV32-NEXT:    vfncvt.x.f.w v10, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vmv.v.v v8, v10
@@ -722,8 +722,8 @@ define <vscale x 4 x i16> @ceil_nxv4f32_to_si16(<vscale x 4 x float> %x) {
 ;
 ; RV64-LABEL: ceil_nxv4f32_to_si16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; RV64-NEXT:    vfncvt.x.f.w v10, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vmv.v.v v8, v10
@@ -736,8 +736,8 @@ define <vscale x 4 x i16> @ceil_nxv4f32_to_si16(<vscale x 4 x float> %x) {
 define <vscale x 4 x i16> @ceil_nxv4f32_to_ui16(<vscale x 4 x float> %x) {
 ; RV32-LABEL: ceil_nxv4f32_to_ui16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; RV32-NEXT:    vfncvt.xu.f.w v10, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vmv.v.v v8, v10
@@ -745,8 +745,8 @@ define <vscale x 4 x i16> @ceil_nxv4f32_to_ui16(<vscale x 4 x float> %x) {
 ;
 ; RV64-LABEL: ceil_nxv4f32_to_ui16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; RV64-NEXT:    vfncvt.xu.f.w v10, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vmv.v.v v8, v10
@@ -759,16 +759,16 @@ define <vscale x 4 x i16> @ceil_nxv4f32_to_ui16(<vscale x 4 x float> %x) {
 define <vscale x 4 x i32> @ceil_nxv4f32_to_si32(<vscale x 4 x float> %x) {
 ; RV32-LABEL: ceil_nxv4f32_to_si32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vfcvt.x.f.v v8, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ceil_nxv4f32_to_si32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV64-NEXT:    vfcvt.x.f.v v8, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    ret
@@ -780,16 +780,16 @@ define <vscale x 4 x i32> @ceil_nxv4f32_to_si32(<vscale x 4 x float> %x) {
 define <vscale x 4 x i32> @ceil_nxv4f32_to_ui32(<vscale x 4 x float> %x) {
 ; RV32-LABEL: ceil_nxv4f32_to_ui32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vfcvt.xu.f.v v8, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ceil_nxv4f32_to_ui32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV64-NEXT:    vfcvt.xu.f.v v8, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    ret
@@ -801,8 +801,8 @@ define <vscale x 4 x i32> @ceil_nxv4f32_to_ui32(<vscale x 4 x float> %x) {
 define <vscale x 4 x i64> @ceil_nxv4f32_to_si64(<vscale x 4 x float> %x) {
 ; RV32-LABEL: ceil_nxv4f32_to_si64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vfwcvt.x.f.v v12, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vmv4r.v v8, v12
@@ -810,8 +810,8 @@ define <vscale x 4 x i64> @ceil_nxv4f32_to_si64(<vscale x 4 x float> %x) {
 ;
 ; RV64-LABEL: ceil_nxv4f32_to_si64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV64-NEXT:    vfwcvt.x.f.v v12, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vmv4r.v v8, v12
@@ -824,8 +824,8 @@ define <vscale x 4 x i64> @ceil_nxv4f32_to_si64(<vscale x 4 x float> %x) {
 define <vscale x 4 x i64> @ceil_nxv4f32_to_ui64(<vscale x 4 x float> %x) {
 ; RV32-LABEL: ceil_nxv4f32_to_ui64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vfwcvt.xu.f.v v12, v8
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vmv4r.v v8, v12
@@ -833,8 +833,8 @@ define <vscale x 4 x i64> @ceil_nxv4f32_to_ui64(<vscale x 4 x float> %x) {
 ;
 ; RV64-LABEL: ceil_nxv4f32_to_ui64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV64-NEXT:    vfwcvt.xu.f.v v12, v8
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vmv4r.v v8, v12
diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
index 9c4706b2bda7..d464b491bbbe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
@@ -15,8 +15,8 @@ define <vscale x 1 x half> @vp_floor_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -57,8 +57,8 @@ define <vscale x 2 x half> @vp_floor_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -99,8 +99,8 @@ define <vscale x 4 x half> @vp_floor_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -142,8 +142,8 @@ define <vscale x 8 x half> @vp_floor_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -186,8 +186,8 @@ define <vscale x 16 x half> @vp_floor_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -230,8 +230,8 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -273,8 +273,8 @@ define <vscale x 1 x float> @vp_floor_nxv1f32(<vscale x 1 x float> %va, <vscale
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -315,8 +315,8 @@ define <vscale x 2 x float> @vp_floor_nxv2f32(<vscale x 2 x float> %va, <vscale
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -358,8 +358,8 @@ define <vscale x 4 x float> @vp_floor_nxv4f32(<vscale x 4 x float> %va, <vscale
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -402,8 +402,8 @@ define <vscale x 8 x float> @vp_floor_nxv8f32(<vscale x 8 x float> %va, <vscale
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -446,8 +446,8 @@ define <vscale x 16 x float> @vp_floor_nxv16f32(<vscale x 16 x float> %va, <vsca
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -489,8 +489,8 @@ define <vscale x 1 x double> @vp_floor_nxv1f64(<vscale x 1 x double> %va, <vscal
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -532,8 +532,8 @@ define <vscale x 2 x double> @vp_floor_nxv2f64(<vscale x 2 x double> %va, <vscal
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -576,8 +576,8 @@ define <vscale x 4 x double> @vp_floor_nxv4f64(<vscale x 4 x double> %va, <vscal
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -620,8 +620,8 @@ define <vscale x 7 x double> @vp_floor_nxv7f64(<vscale x 7 x double> %va, <vscal
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -664,8 +664,8 @@ define <vscale x 8 x double> @vp_floor_nxv8f64(<vscale x 8 x double> %va, <vscal
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -726,8 +726,8 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a2, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
@@ -750,8 +750,8 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll b/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll
index cdbc6e8d8d55..0e102d98c79c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll
@@ -13,19 +13,19 @@ declare <vscale x 1 x float> @llvm.riscv.vfadd.nxv1f32.nxv1f32(
 define <vscale x 1 x float> @test(<vscale x 1 x float> %0, <vscale x 1 x float> %1, i64 %2) nounwind {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; UNOPT-LABEL: test:
 ; UNOPT:       # %bb.0: # %entry
+; UNOPT-NEXT:    fsrmi a1, 0
 ; UNOPT-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; UNOPT-NEXT:    fsrmi a0, 0
 ; UNOPT-NEXT:    vfadd.vv v8, v8, v9
-; UNOPT-NEXT:    fsrm a0
+; UNOPT-NEXT:    fsrm a1
 ; UNOPT-NEXT:    fsrmi a0, 0
 ; UNOPT-NEXT:    vfadd.vv v8, v8, v8
 ; UNOPT-NEXT:    fsrm a0
@@ -48,20 +48,20 @@ entry:
 define <vscale x 1 x float> @test2(<vscale x 1 x float> %0, <vscale x 1 x float> %1, i64 %2) nounwind {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
 ; CHECK-NEXT:    fsrmi 1
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; UNOPT-LABEL: test2:
 ; UNOPT:       # %bb.0: # %entry
+; UNOPT-NEXT:    fsrmi a1, 0
 ; UNOPT-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; UNOPT-NEXT:    fsrmi a0, 0
 ; UNOPT-NEXT:    vfadd.vv v8, v8, v9
-; UNOPT-NEXT:    fsrm a0
+; UNOPT-NEXT:    fsrm a1
 ; UNOPT-NEXT:    fsrmi a0, 1
 ; UNOPT-NEXT:    vfadd.vv v8, v8, v8
 ; UNOPT-NEXT:    fsrm a0
@@ -132,12 +132,12 @@ define <vscale x 1 x float> @before_call1(<vscale x 1 x float> %0, <vscale x 1 x
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
-; CHECK-NEXT:    addi a1, sp, 32
-; CHECK-NEXT:    vs1r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    addi a0, sp, 32
+; CHECK-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    call foo
 ; CHECK-NEXT:    addi a0, sp, 32
 ; CHECK-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
@@ -155,12 +155,12 @@ define <vscale x 1 x float> @before_call1(<vscale x 1 x float> %0, <vscale x 1 x
 ; UNOPT-NEXT:    csrr a1, vlenb
 ; UNOPT-NEXT:    slli a1, a1, 1
 ; UNOPT-NEXT:    sub sp, sp, a1
+; UNOPT-NEXT:    fsrmi a1, 0
 ; UNOPT-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; UNOPT-NEXT:    fsrmi a0, 0
 ; UNOPT-NEXT:    vfadd.vv v8, v8, v9
-; UNOPT-NEXT:    addi a1, sp, 32
-; UNOPT-NEXT:    vs1r.v v8, (a1) # Unknown-size Folded Spill
-; UNOPT-NEXT:    fsrm a0
+; UNOPT-NEXT:    addi a0, sp, 32
+; UNOPT-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; UNOPT-NEXT:    fsrm a1
 ; UNOPT-NEXT:    call foo
 ; UNOPT-NEXT:    addi a0, sp, 32
 ; UNOPT-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
@@ -240,12 +240,12 @@ define <vscale x 1 x float> @after_call1(<vscale x 1 x float> %0, <vscale x 1 x
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
-; CHECK-NEXT:    addi a1, sp, 32
-; CHECK-NEXT:    vs1r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    addi a0, sp, 32
+; CHECK-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    call foo
 ; CHECK-NEXT:    addi a0, sp, 32
 ; CHECK-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
@@ -263,12 +263,12 @@ define <vscale x 1 x float> @after_call1(<vscale x 1 x float> %0, <vscale x 1 x
 ; UNOPT-NEXT:    csrr a1, vlenb
 ; UNOPT-NEXT:    slli a1, a1, 1
 ; UNOPT-NEXT:    sub sp, sp, a1
+; UNOPT-NEXT:    fsrmi a1, 0
 ; UNOPT-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; UNOPT-NEXT:    fsrmi a0, 0
 ; UNOPT-NEXT:    vfadd.vv v8, v8, v9
-; UNOPT-NEXT:    addi a1, sp, 32
-; UNOPT-NEXT:    vs1r.v v8, (a1) # Unknown-size Folded Spill
-; UNOPT-NEXT:    fsrm a0
+; UNOPT-NEXT:    addi a0, sp, 32
+; UNOPT-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; UNOPT-NEXT:    fsrm a1
 ; UNOPT-NEXT:    call foo
 ; UNOPT-NEXT:    addi a0, sp, 32
 ; UNOPT-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
@@ -360,20 +360,20 @@ entry:
 define <vscale x 1 x float> @before_asm1(<vscale x 1 x float> %0, <vscale x 1 x float> %1, i64 %2) nounwind {
 ; CHECK-LABEL: before_asm1:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    ret
 ;
 ; UNOPT-LABEL: before_asm1:
 ; UNOPT:       # %bb.0: # %entry
+; UNOPT-NEXT:    fsrmi a1, 0
 ; UNOPT-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; UNOPT-NEXT:    fsrmi a0, 0
 ; UNOPT-NEXT:    vfadd.vv v8, v8, v9
-; UNOPT-NEXT:    fsrm a0
+; UNOPT-NEXT:    fsrm a1
 ; UNOPT-NEXT:    #APP
 ; UNOPT-NEXT:    #NO_APP
 ; UNOPT-NEXT:    ret
@@ -416,20 +416,20 @@ entry:
 define <vscale x 1 x float> @after_asm1(<vscale x 1 x float> %0, <vscale x 1 x float> %1, i64 %2) nounwind {
 ; CHECK-LABEL: after_asm1:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    ret
 ;
 ; UNOPT-LABEL: after_asm1:
 ; UNOPT:       # %bb.0: # %entry
+; UNOPT-NEXT:    fsrmi a1, 0
 ; UNOPT-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; UNOPT-NEXT:    fsrmi a0, 0
 ; UNOPT-NEXT:    vfadd.vv v8, v8, v9
-; UNOPT-NEXT:    fsrm a0
+; UNOPT-NEXT:    fsrm a1
 ; UNOPT-NEXT:    #APP
 ; UNOPT-NEXT:    #NO_APP
 ; UNOPT-NEXT:    ret
@@ -476,10 +476,10 @@ declare i32 @llvm.get.rounding()
 define <vscale x 1 x float> @test5(<vscale x 1 x float> %0, <vscale x 1 x float> %1, i64 %2, ptr %p) nounwind {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a2, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a2
 ; CHECK-NEXT:    frrm a0
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    lui a2, 66
@@ -492,10 +492,10 @@ define <vscale x 1 x float> @test5(<vscale x 1 x float> %0, <vscale x 1 x float>
 ;
 ; UNOPT-LABEL: test5:
 ; UNOPT:       # %bb.0: # %entry
+; UNOPT-NEXT:    fsrmi a2, 0
 ; UNOPT-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; UNOPT-NEXT:    fsrmi a0, 0
 ; UNOPT-NEXT:    vfadd.vv v8, v8, v9
-; UNOPT-NEXT:    fsrm a0
+; UNOPT-NEXT:    fsrm a2
 ; UNOPT-NEXT:    frrm a0
 ; UNOPT-NEXT:    slli a0, a0, 2
 ; UNOPT-NEXT:    lui a2, 66
@@ -559,10 +559,10 @@ define <vscale x 1 x float> @after_fsrm2(<vscale x 1 x float> %0, <vscale x 1 x
 ; UNOPT-LABEL: after_fsrm2:
 ; UNOPT:       # %bb.0: # %entry
 ; UNOPT-NEXT:    fsrmi 4
+; UNOPT-NEXT:    fsrmi a1, 4
 ; UNOPT-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; UNOPT-NEXT:    fsrmi a0, 4
 ; UNOPT-NEXT:    vfadd.vv v8, v8, v9
-; UNOPT-NEXT:    fsrm a0
+; UNOPT-NEXT:    fsrm a1
 ; UNOPT-NEXT:    ret
 entry:
   call void @llvm.set.rounding(i32 4)
@@ -579,19 +579,19 @@ define <vscale x 1 x float> @after_fsrm3(<vscale x 1 x float> %0, <vscale x 1 x
 ; CHECK-LABEL: after_fsrm3:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    fsrmi 4
+; CHECK-NEXT:    fsrmi a1, 5
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 5
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; UNOPT-LABEL: after_fsrm3:
 ; UNOPT:       # %bb.0: # %entry
 ; UNOPT-NEXT:    fsrmi 4
+; UNOPT-NEXT:    fsrmi a1, 5
 ; UNOPT-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; UNOPT-NEXT:    fsrmi a0, 5
 ; UNOPT-NEXT:    vfadd.vv v8, v8, v9
-; UNOPT-NEXT:    fsrm a0
+; UNOPT-NEXT:    fsrm a1
 ; UNOPT-NEXT:    ret
 entry:
   call void @llvm.set.rounding(i32 4)
diff --git a/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll
index 6de62214ccc4..5cd9996c7ba3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll
@@ -249,8 +249,8 @@ declare <vscale x 1 x half> @llvm.ceil.nxv1f16(<vscale x 1 x half>)
 define <vscale x 1 x i8> @ceil_nxv1f16_to_si8(<vscale x 1 x half> %x) {
 ; CHECK-LABEL: ceil_nxv1f16_to_si8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vfncvt.x.f.w v9, v8
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
@@ -263,8 +263,8 @@ define <vscale x 1 x i8> @ceil_nxv1f16_to_si8(<vscale x 1 x half> %x) {
 define <vscale x 1 x i8> @ceil_nxv1f16_to_ui8(<vscale x 1 x half> %x) {
 ; CHECK-LABEL: ceil_nxv1f16_to_ui8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vfncvt.xu.f.w v9, v8
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
@@ -277,8 +277,8 @@ define <vscale x 1 x i8> @ceil_nxv1f16_to_ui8(<vscale x 1 x half> %x) {
 define <vscale x 1 x i16> @ceil_nxv1f16_to_si16(<vscale x 1 x half> %x) {
 ; CHECK-LABEL: ceil_nxv1f16_to_si16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -290,8 +290,8 @@ define <vscale x 1 x i16> @ceil_nxv1f16_to_si16(<vscale x 1 x half> %x) {
 define <vscale x 1 x i16> @ceil_nxv1f16_to_ui16(<vscale x 1 x half> %x) {
 ; CHECK-LABEL: ceil_nxv1f16_to_ui16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -303,8 +303,8 @@ define <vscale x 1 x i16> @ceil_nxv1f16_to_ui16(<vscale x 1 x half> %x) {
 define <vscale x 1 x i32> @ceil_nxv1f16_to_si32(<vscale x 1 x half> %x) {
 ; CHECK-LABEL: ceil_nxv1f16_to_si32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvt.x.f.v v9, v8
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
@@ -317,8 +317,8 @@ define <vscale x 1 x i32> @ceil_nxv1f16_to_si32(<vscale x 1 x half> %x) {
 define <vscale x 1 x i32> @ceil_nxv1f16_to_ui32(<vscale x 1 x half> %x) {
 ; CHECK-LABEL: ceil_nxv1f16_to_ui32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvt.xu.f.v v9, v8
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
@@ -451,8 +451,8 @@ declare <vscale x 4 x half> @llvm.ceil.nxv4f16(<vscale x 4 x half>)
 define <vscale x 4 x i8> @ceil_nxv4f16_to_si8(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: ceil_nxv4f16_to_si8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vfncvt.x.f.w v9, v8
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
@@ -465,8 +465,8 @@ define <vscale x 4 x i8> @ceil_nxv4f16_to_si8(<vscale x 4 x half> %x) {
 define <vscale x 4 x i8> @ceil_nxv4f16_to_ui8(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: ceil_nxv4f16_to_ui8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vfncvt.xu.f.w v9, v8
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
@@ -479,8 +479,8 @@ define <vscale x 4 x i8> @ceil_nxv4f16_to_ui8(<vscale x 4 x half> %x) {
 define <vscale x 4 x i16> @ceil_nxv4f16_to_si16(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: ceil_nxv4f16_to_si16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -492,8 +492,8 @@ define <vscale x 4 x i16> @ceil_nxv4f16_to_si16(<vscale x 4 x half> %x) {
 define <vscale x 4 x i16> @ceil_nxv4f16_to_ui16(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: ceil_nxv4f16_to_ui16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -505,8 +505,8 @@ define <vscale x 4 x i16> @ceil_nxv4f16_to_ui16(<vscale x 4 x half> %x) {
 define <vscale x 4 x i32> @ceil_nxv4f16_to_si32(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: ceil_nxv4f16_to_si32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvt.x.f.v v10, v8
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vmv2r.v v8, v10
@@ -519,8 +519,8 @@ define <vscale x 4 x i32> @ceil_nxv4f16_to_si32(<vscale x 4 x half> %x) {
 define <vscale x 4 x i32> @ceil_nxv4f16_to_ui32(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: ceil_nxv4f16_to_ui32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvt.xu.f.v v10, v8
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vmv2r.v v8, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
index b15896580d42..0cd4f423a9df 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple riscv32 -mattr=+m,+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple riscv64 -mattr=+m,+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 8 x i32> @insert_nxv8i32_nxv4i32_0(<vscale x 8 x i32> %vec, <vscale x 4 x i32> %subvec) {
 ; CHECK-LABEL: insert_nxv8i32_nxv4i32_0:
@@ -531,6 +531,65 @@ define <vscale x 8 x i32> @insert_insert_combine2(<vscale x 2 x i32> %subvec) {
   ret <vscale x 8 x i32> %outer
 }
 
+define <vscale x 32 x bfloat> @insert_nxv32bf16_nxv2bf16_0(<vscale x 32 x bfloat> %vec, <vscale x 2 x bfloat> %subvec) {
+; CHECK-LABEL: insert_nxv32bf16_nxv2bf16_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x bfloat> @llvm.vector.insert.nxv2bf16.nxv32bf16(<vscale x 32 x bfloat> %vec, <vscale x 2 x bfloat> %subvec, i64 0)
+  ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @insert_nxv32bf16_nxv2bf16_2(<vscale x 32 x bfloat> %vec, <vscale x 2 x bfloat> %subvec) {
+; CHECK-LABEL: insert_nxv32bf16_nxv2bf16_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v16, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x bfloat> @llvm.vector.insert.nxv2bf16.nxv32bf16(<vscale x 32 x bfloat> %vec, <vscale x 2 x bfloat> %subvec, i64 2)
+  ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @insert_nxv32bf16_nxv2bf16_26(<vscale x 32 x bfloat> %vec, <vscale x 2 x bfloat> %subvec) {
+; CHECK-LABEL: insert_nxv32bf16_nxv2bf16_26:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v14, v16, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x bfloat> @llvm.vector.insert.nxv2bf16.nxv32bf16(<vscale x 32 x bfloat> %vec, <vscale x 2 x bfloat> %subvec, i64 26)
+  ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @insert_nxv32bf16_undef_nxv1bf16_0(<vscale x 1 x bfloat> %subvec) {
+; CHECK-LABEL: insert_nxv32bf16_undef_nxv1bf16_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x bfloat> @llvm.vector.insert.nxv1bf16.nxv32bf16(<vscale x 32 x bfloat> undef, <vscale x 1 x bfloat> %subvec, i64 0)
+  ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @insert_nxv32bf16_undef_nxv1bf16_26(<vscale x 1 x bfloat> %subvec) {
+; CHECK-LABEL: insert_nxv32bf16_undef_nxv1bf16_26:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v14, v8, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x bfloat> @llvm.vector.insert.nxv1bf16.nxv32bf16(<vscale x 32 x bfloat> undef, <vscale x 1 x bfloat> %subvec, i64 26)
+  ret <vscale x 32 x bfloat> %v
+}
 
 attributes #0 = { vscale_range(2,1024) }
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir b/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir
index 0e207731e020..b891207341b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir
@@ -27,13 +27,10 @@ body:             |
     %2:vr = COPY $v2
     %3:vr = COPY $v3
     %4:vmv0 = COPY %0
-    %pt1:vrnov0 = IMPLICIT_DEF
-    %5:vrnov0 = PseudoVMERGE_VIM_M1 %pt1, killed %2, 1, %4, 1, 3
+    %5:vrnov0 = PseudoVMERGE_VIM_M1 undef $noreg, killed %2, 1, %4, 1, 3
     %6:vmv0 = COPY %1
-    %pt2:vrnov0 = IMPLICIT_DEF
-    %7:vrnov0 = PseudoVMERGE_VIM_M1 %pt2, killed %3, 1, %6, 1, 3
-    %pt:vr = IMPLICIT_DEF
-    %8:vr = PseudoVADD_VV_M1 %pt, killed %5, killed %7, 1, 3, 0
+    %7:vrnov0 = PseudoVMERGE_VIM_M1 undef $noreg, killed %3, 1, %6, 1, 3
+    %8:vr = PseudoVADD_VV_M1 undef $noreg, killed %5, killed %7, 1, 3, 0
     $v0 = COPY %8
     PseudoRET implicit $v0
 ...
diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-tama.ll b/llvm/test/CodeGen/RISCV/rvv/masked-tama.ll
index d81079da64bd..f87fa3ec6f16 100644
--- a/llvm/test/CodeGen/RISCV/rvv/masked-tama.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/masked-tama.ll
@@ -516,8 +516,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vaadd.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vaadd_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -543,8 +543,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -570,8 +570,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssrl.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vssrl_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vssrl_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -597,8 +597,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vnclip.mask.nxv1i8.nxv1i16.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vnclip_mask_wv_nxv1i8_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv1i8_nxv1i16_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnclip.wv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-tamu.ll b/llvm/test/CodeGen/RISCV/rvv/masked-tamu.ll
index c8bff58b00e4..4098270d365a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/masked-tamu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/masked-tamu.ll
@@ -489,8 +489,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vaadd.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vaadd_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -515,8 +515,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -541,8 +541,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssrl.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vssrl_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssrl_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vssrl.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -567,8 +567,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vnclip.mask.nxv1i8.nxv1i16.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vnclip_mask_wv_nxv1i8_nxv1i16_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv1i8_nxv1i16_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-tuma.ll b/llvm/test/CodeGen/RISCV/rvv/masked-tuma.ll
index 409a008ec7cf..4cd7e143be66 100644
--- a/llvm/test/CodeGen/RISCV/rvv/masked-tuma.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/masked-tuma.ll
@@ -489,8 +489,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vaadd.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vaadd_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    vaadd.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -515,8 +515,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -541,8 +541,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssrl.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vssrl_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssrl_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    vssrl.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -567,8 +567,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vnclip.mask.nxv1i8.nxv1i16.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vnclip_mask_wv_nxv1i8_nxv1i16_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv1i8_nxv1i16_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    vnclip.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-tumu.ll b/llvm/test/CodeGen/RISCV/rvv/masked-tumu.ll
index 90054bcc5f36..c8719e6a2e7c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/masked-tumu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/masked-tumu.ll
@@ -489,8 +489,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vaadd.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vaadd_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, mu
 ; CHECK-NEXT:    vaadd.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -515,8 +515,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -541,8 +541,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssrl.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vssrl_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vssrl_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, mu
 ; CHECK-NEXT:    vssrl.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -567,8 +567,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vnclip.mask.nxv1i8.nxv1i16.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vnclip_mask_wv_nxv1i8_nxv1i16_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv1i8_nxv1i16_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, mu
 ; CHECK-NEXT:    vnclip.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
index eb4994914fad..edeac1acf3b0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
@@ -19,8 +19,8 @@ define <vscale x 1 x half> @vp_round_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -38,8 +38,8 @@ define <vscale x 1 x half> @vp_round_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -101,8 +101,8 @@ define <vscale x 2 x half> @vp_round_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -120,8 +120,8 @@ define <vscale x 2 x half> @vp_round_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -183,8 +183,8 @@ define <vscale x 4 x half> @vp_round_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -203,8 +203,8 @@ define <vscale x 4 x half> @vp_round_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -268,8 +268,8 @@ define <vscale x 8 x half> @vp_round_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -289,8 +289,8 @@ define <vscale x 8 x half> @vp_round_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -354,8 +354,8 @@ define <vscale x 16 x half> @vp_round_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -375,8 +375,8 @@ define <vscale x 16 x half> @vp_round_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -440,8 +440,8 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v0, v16
 ; ZVFH-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -479,8 +479,8 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a2, 4
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v17
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
@@ -501,8 +501,8 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v16
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -567,8 +567,8 @@ define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a2, 4
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v16
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
@@ -615,8 +615,8 @@ define <vscale x 1 x float> @vp_round_nxv1f32(<vscale x 1 x float> %va, <vscale
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -657,8 +657,8 @@ define <vscale x 2 x float> @vp_round_nxv2f32(<vscale x 2 x float> %va, <vscale
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -700,8 +700,8 @@ define <vscale x 4 x float> @vp_round_nxv4f32(<vscale x 4 x float> %va, <vscale
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -744,8 +744,8 @@ define <vscale x 8 x float> @vp_round_nxv8f32(<vscale x 8 x float> %va, <vscale
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -788,8 +788,8 @@ define <vscale x 16 x float> @vp_round_nxv16f32(<vscale x 16 x float> %va, <vsca
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -831,8 +831,8 @@ define <vscale x 1 x double> @vp_round_nxv1f64(<vscale x 1 x double> %va, <vscal
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -874,8 +874,8 @@ define <vscale x 2 x double> @vp_round_nxv2f64(<vscale x 2 x double> %va, <vscal
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -918,8 +918,8 @@ define <vscale x 4 x double> @vp_round_nxv4f64(<vscale x 4 x double> %va, <vscal
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -962,8 +962,8 @@ define <vscale x 7 x double> @vp_round_nxv7f64(<vscale x 7 x double> %va, <vscal
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1006,8 +1006,8 @@ define <vscale x 8 x double> @vp_round_nxv8f64(<vscale x 8 x double> %va, <vscal
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1068,8 +1068,8 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a2, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
@@ -1092,8 +1092,8 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
index f366a2922d07..a77c58ba9ec5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
@@ -19,8 +19,8 @@ define <vscale x 1 x half> @vp_roundeven_nxv1f16(<vscale x 1 x half> %va, <vscal
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -38,8 +38,8 @@ define <vscale x 1 x half> @vp_roundeven_nxv1f16(<vscale x 1 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -101,8 +101,8 @@ define <vscale x 2 x half> @vp_roundeven_nxv2f16(<vscale x 2 x half> %va, <vscal
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -120,8 +120,8 @@ define <vscale x 2 x half> @vp_roundeven_nxv2f16(<vscale x 2 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -183,8 +183,8 @@ define <vscale x 4 x half> @vp_roundeven_nxv4f16(<vscale x 4 x half> %va, <vscal
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -203,8 +203,8 @@ define <vscale x 4 x half> @vp_roundeven_nxv4f16(<vscale x 4 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -268,8 +268,8 @@ define <vscale x 8 x half> @vp_roundeven_nxv8f16(<vscale x 8 x half> %va, <vscal
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -289,8 +289,8 @@ define <vscale x 8 x half> @vp_roundeven_nxv8f16(<vscale x 8 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -354,8 +354,8 @@ define <vscale x 16 x half> @vp_roundeven_nxv16f16(<vscale x 16 x half> %va, <vs
 ; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -375,8 +375,8 @@ define <vscale x 16 x half> @vp_roundeven_nxv16f16(<vscale x 16 x half> %va, <vs
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -440,8 +440,8 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v0, v16
 ; ZVFH-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -479,8 +479,8 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a2, 0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v17
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
@@ -501,8 +501,8 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v16
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -567,8 +567,8 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a2, 0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v16
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
@@ -615,8 +615,8 @@ define <vscale x 1 x float> @vp_roundeven_nxv1f32(<vscale x 1 x float> %va, <vsc
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -657,8 +657,8 @@ define <vscale x 2 x float> @vp_roundeven_nxv2f32(<vscale x 2 x float> %va, <vsc
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -700,8 +700,8 @@ define <vscale x 4 x float> @vp_roundeven_nxv4f32(<vscale x 4 x float> %va, <vsc
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -744,8 +744,8 @@ define <vscale x 8 x float> @vp_roundeven_nxv8f32(<vscale x 8 x float> %va, <vsc
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -788,8 +788,8 @@ define <vscale x 16 x float> @vp_roundeven_nxv16f32(<vscale x 16 x float> %va, <
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -831,8 +831,8 @@ define <vscale x 1 x double> @vp_roundeven_nxv1f64(<vscale x 1 x double> %va, <v
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -874,8 +874,8 @@ define <vscale x 2 x double> @vp_roundeven_nxv2f64(<vscale x 2 x double> %va, <v
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -918,8 +918,8 @@ define <vscale x 4 x double> @vp_roundeven_nxv4f64(<vscale x 4 x double> %va, <v
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -962,8 +962,8 @@ define <vscale x 7 x double> @vp_roundeven_nxv7f64(<vscale x 7 x double> %va, <v
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1006,8 +1006,8 @@ define <vscale x 8 x double> @vp_roundeven_nxv8f64(<vscale x 8 x double> %va, <v
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1068,8 +1068,8 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a2, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
@@ -1092,8 +1092,8 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
index 79c940bdf089..71a53c525551 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
@@ -19,8 +19,8 @@ define <vscale x 1 x half> @vp_roundtozero_nxv1f16(<vscale x 1 x half> %va, <vsc
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -38,8 +38,8 @@ define <vscale x 1 x half> @vp_roundtozero_nxv1f16(<vscale x 1 x half> %va, <vsc
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -101,8 +101,8 @@ define <vscale x 2 x half> @vp_roundtozero_nxv2f16(<vscale x 2 x half> %va, <vsc
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -120,8 +120,8 @@ define <vscale x 2 x half> @vp_roundtozero_nxv2f16(<vscale x 2 x half> %va, <vsc
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -183,8 +183,8 @@ define <vscale x 4 x half> @vp_roundtozero_nxv4f16(<vscale x 4 x half> %va, <vsc
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -203,8 +203,8 @@ define <vscale x 4 x half> @vp_roundtozero_nxv4f16(<vscale x 4 x half> %va, <vsc
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -268,8 +268,8 @@ define <vscale x 8 x half> @vp_roundtozero_nxv8f16(<vscale x 8 x half> %va, <vsc
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -289,8 +289,8 @@ define <vscale x 8 x half> @vp_roundtozero_nxv8f16(<vscale x 8 x half> %va, <vsc
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -354,8 +354,8 @@ define <vscale x 16 x half> @vp_roundtozero_nxv16f16(<vscale x 16 x half> %va, <
 ; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -375,8 +375,8 @@ define <vscale x 16 x half> @vp_roundtozero_nxv16f16(<vscale x 16 x half> %va, <
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -440,8 +440,8 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 ; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v0, v16
 ; ZVFH-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -479,8 +479,8 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a2, 1
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v17
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
@@ -501,8 +501,8 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v16
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
@@ -567,8 +567,8 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x hal
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    fsrmi a2, 1
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v16
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
@@ -615,8 +615,8 @@ define <vscale x 1 x float> @vp_roundtozero_nxv1f32(<vscale x 1 x float> %va, <v
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -657,8 +657,8 @@ define <vscale x 2 x float> @vp_roundtozero_nxv2f32(<vscale x 2 x float> %va, <v
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -700,8 +700,8 @@ define <vscale x 4 x float> @vp_roundtozero_nxv4f32(<vscale x 4 x float> %va, <v
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -744,8 +744,8 @@ define <vscale x 8 x float> @vp_roundtozero_nxv8f32(<vscale x 8 x float> %va, <v
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -788,8 +788,8 @@ define <vscale x 16 x float> @vp_roundtozero_nxv16f32(<vscale x 16 x float> %va,
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -831,8 +831,8 @@ define <vscale x 1 x double> @vp_roundtozero_nxv1f64(<vscale x 1 x double> %va,
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -874,8 +874,8 @@ define <vscale x 2 x double> @vp_roundtozero_nxv2f64(<vscale x 2 x double> %va,
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -918,8 +918,8 @@ define <vscale x 4 x double> @vp_roundtozero_nxv4f64(<vscale x 4 x double> %va,
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -962,8 +962,8 @@ define <vscale x 7 x double> @vp_roundtozero_nxv7f64(<vscale x 7 x double> %va,
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1006,8 +1006,8 @@ define <vscale x 8 x double> @vp_roundtozero_nxv8f64(<vscale x 8 x double> %va,
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1068,8 +1068,8 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a2, 1
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
@@ -1092,8 +1092,8 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll b/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll
index 407c782d3377..e7913fc53df0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll
@@ -13,13 +13,8 @@ define <vscale x 1 x i32> @spill_zvlsseg_nxv1i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # implicit-def: $v10
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # kill: def $v8 killed $v8 def $v8_v9
-; SPILL-O0-NEXT:    vmv1r.v v9, v10
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8_v9
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv1r.v v8, v9
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -95,13 +90,8 @@ define <vscale x 2 x i32> @spill_zvlsseg_nxv2i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # implicit-def: $v10
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # kill: def $v8 killed $v8 def $v8_v9
-; SPILL-O0-NEXT:    vmv1r.v v9, v10
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8_v9
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv1r.v v8, v9
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -177,13 +167,8 @@ define <vscale x 4 x i32> @spill_zvlsseg_nxv4i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # implicit-def: $v12m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # kill: def $v8m2 killed $v8m2 def $v8m2_v10m2
-; SPILL-O0-NEXT:    vmv2r.v v10, v12
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8m2_v10m2
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv2r.v v8, v10
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -262,13 +247,8 @@ define <vscale x 8 x i32> @spill_zvlsseg_nxv8i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 2
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8m4
-; SPILL-O0-NEXT:    # implicit-def: $v12m4
-; SPILL-O0-NEXT:    # implicit-def: $v16m4
-; SPILL-O0-NEXT:    # implicit-def: $v12m4
-; SPILL-O0-NEXT:    # kill: def $v8m4 killed $v8m4 def $v8m4_v12m4
-; SPILL-O0-NEXT:    vmv4r.v v12, v16
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8m4_v12m4
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv4r.v v8, v12
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -347,16 +327,8 @@ define <vscale x 4 x i32> @spill_zvlsseg3_nxv4i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # implicit-def: $v16m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # implicit-def: $v14m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # kill: def $v8m2 killed $v8m2 def $v8m2_v10m2_v12m2
-; SPILL-O0-NEXT:    vmv2r.v v10, v16
-; SPILL-O0-NEXT:    vmv2r.v v12, v14
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8m2_v10m2_v12m2
 ; SPILL-O0-NEXT:    vlseg3e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv2r.v v8, v10
 ; SPILL-O0-NEXT:    addi a0, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll b/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll
index 1c1544b4efa0..dd575b3fceb5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll
@@ -13,13 +13,8 @@ define <vscale x 1 x i32> @spill_zvlsseg_nxv1i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # implicit-def: $v10
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # kill: def $v8 killed $v8 def $v8_v9
-; SPILL-O0-NEXT:    vmv1r.v v9, v10
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8_v9
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv1r.v v8, v9
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -95,13 +90,8 @@ define <vscale x 2 x i32> @spill_zvlsseg_nxv2i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # implicit-def: $v10
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # kill: def $v8 killed $v8 def $v8_v9
-; SPILL-O0-NEXT:    vmv1r.v v9, v10
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8_v9
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv1r.v v8, v9
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -177,13 +167,8 @@ define <vscale x 4 x i32> @spill_zvlsseg_nxv4i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # implicit-def: $v12m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # kill: def $v8m2 killed $v8m2 def $v8m2_v10m2
-; SPILL-O0-NEXT:    vmv2r.v v10, v12
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8m2_v10m2
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv2r.v v8, v10
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -262,13 +247,8 @@ define <vscale x 8 x i32> @spill_zvlsseg_nxv8i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 2
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8m4
-; SPILL-O0-NEXT:    # implicit-def: $v12m4
-; SPILL-O0-NEXT:    # implicit-def: $v16m4
-; SPILL-O0-NEXT:    # implicit-def: $v12m4
-; SPILL-O0-NEXT:    # kill: def $v8m4 killed $v8m4 def $v8m4_v12m4
-; SPILL-O0-NEXT:    vmv4r.v v12, v16
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8m4_v12m4
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv4r.v v8, v12
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -347,16 +327,8 @@ define <vscale x 4 x i32> @spill_zvlsseg3_nxv4i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # implicit-def: $v16m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # implicit-def: $v14m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # kill: def $v8m2 killed $v8m2 def $v8m2_v10m2_v12m2
-; SPILL-O0-NEXT:    vmv2r.v v10, v16
-; SPILL-O0-NEXT:    vmv2r.v v12, v14
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8m2_v10m2_v12m2
 ; SPILL-O0-NEXT:    vlseg3e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv2r.v v8, v10
 ; SPILL-O0-NEXT:    addi a0, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-masked-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-masked-vops.ll
index 8cefbac59ce6..033a1d7e297f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-masked-vops.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-masked-vops.ll
@@ -215,10 +215,10 @@ declare <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1>, <vscale x 2
 define <vscale x 2 x i32> @vmerge_vfcvt_rm(<vscale x 2 x i32> %passthru, <vscale x 2 x float> %a, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmerge_vfcvt_rm:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %floor = call <vscale x 2 x float> @llvm.floor.nxv2f32(<vscale x 2 x float> %a)
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
index 64b3a6f2b4b3..1a3a1a6c1ee6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
@@ -919,8 +919,8 @@ entry:
 define <vscale x 1 x i16> @test_vaaddu(<vscale x 1 x i16> %var_11, i16 zeroext %var_9, <vscale x 1 x i1> %var_5, <vscale x 1 x i16> %var_0) {
 ; CHECK-LABEL: test_vaaddu:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 3, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetivli zero, 3, e16, mf4, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
@@ -966,13 +966,13 @@ declare <vscale x 2 x float> @llvm.riscv.vfredusum.nxv2f32.nxv2f32(
 define <vscale x 2 x float> @vfredusum(<vscale x 2 x float> %passthru, <vscale x 2 x float> %x, <vscale x 2 x float> %y, <vscale x 2 x i1> %m, i64 %vl) {
 ; CHECK-LABEL: vfredusum:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v11, v8
 ; CHECK-NEXT:    vfredusum.vs v11, v9, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v11, v0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x float> @llvm.riscv.vfredusum.nxv2f32.nxv2f32(
     <vscale x 2 x float> %passthru,
@@ -1002,10 +1002,10 @@ define <vscale x 2 x i32> @vredsum_allones_mask(<vscale x 2 x i32> %passthru, <v
 define <vscale x 2 x float> @vfredusum_allones_mask(<vscale x 2 x float> %passthru, <vscale x 2 x float> %x, <vscale x 2 x float> %y, i64 %vl) {
 ; CHECK-LABEL: vfredusum_allones_mask:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x float> @llvm.riscv.vfredusum.nxv2f32.nxv2f32(
     <vscale x 2 x float> %passthru,
@@ -1136,10 +1136,10 @@ define <vscale x 2 x i64> @vpmerge_vwsub.w_tied(<vscale x 2 x i64> %passthru, <v
 define <vscale x 2 x double> @vpmerge_vfwsub.w_tied(<vscale x 2 x double> %passthru, <vscale x 2 x double> %x, <vscale x 2 x float> %y, <vscale x 2 x i1> %mask, i32 zeroext %vl) {
 ; CHECK-LABEL: vpmerge_vfwsub.w_tied:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
   %vl.zext = zext i32 %vl to i64
   %a = call <vscale x 2 x double> @llvm.riscv.vfwsub.w.nxv2f64.nxv2f32(<vscale x 2 x double> %passthru, <vscale x 2 x double> %passthru, <vscale x 2 x float> %y, i64 1, i64 %vl.zext)
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll
index b44b57394321..3c19616576f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv1i8.nxv1f32.iXLen(
 define <vscale x 1 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv1i8_nxv1f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv1i8_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.x.f.qf v9, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -39,10 +39,10 @@ declare <vscale x 1 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv1i8.nxv1f32.iXL
 define <vscale x 1 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv1i8_nxv1f32(<vscale x 1 x i8> %0, <vscale x 1 x float> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv1i8_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.x.f.qf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv1i8.nxv1f32.iXLen(
@@ -64,10 +64,10 @@ declare <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv2i8.nxv2f32.iXLen(
 define <vscale x 2 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv2i8_nxv2f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv2i8_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.x.f.qf v9, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -90,10 +90,10 @@ declare <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv2i8.nxv2f32.iXL
 define <vscale x 2 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv2i8_nxv2f32(<vscale x 2 x i8> %0, <vscale x 2 x float> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv2i8_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.x.f.qf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv2i8.nxv2f32.iXLen(
@@ -115,10 +115,10 @@ declare <vscale x 4 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv4i8.nxv4f32.iXLen(
 define <vscale x 4 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv4i8_nxv4f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv4i8_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.x.f.qf v10, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -141,10 +141,10 @@ declare <vscale x 4 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv4i8.nxv4f32.iXL
 define <vscale x 4 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv4i8_nxv4f32(<vscale x 4 x i8> %0, <vscale x 4 x float> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv4i8_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.x.f.qf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv4i8.nxv4f32.iXLen(
@@ -166,10 +166,10 @@ declare <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv8i8.nxv8f32.iXLen(
 define <vscale x 8 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv8i8_nxv8f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv8i8_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.x.f.qf v12, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -192,10 +192,10 @@ declare <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv8i8.nxv8f32.iXL
 define <vscale x 8 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv8i8_nxv8f32(<vscale x 8 x i8> %0, <vscale x 8 x float> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv8i8_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.x.f.qf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv8i8.nxv8f32.iXLen(
@@ -217,10 +217,10 @@ declare <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv16i8.nxv16f32.iXLen
 define <vscale x 16 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv16i8_nxv16f32(<vscale x 16 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv16i8_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.x.f.qf v16, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -243,10 +243,10 @@ declare <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv16i8.nxv16f32.
 define <vscale x 16 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv16i8_nxv16f32(<vscale x 16 x i8> %0, <vscale x 16 x float> %1, float %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv16i8_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.x.f.qf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv16i8.nxv16f32.iXLen(
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll
index bc2f7ca7dc86..dbcee311c6e3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv1i8.nxv1f32.iXLen(
 define <vscale x 1 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv1i8_nxv1f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv1i8_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.xu.f.qf v9, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -39,10 +39,10 @@ declare <vscale x 1 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv1i8.nxv1f32.iX
 define <vscale x 1 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv1i8_nxv1f32(<vscale x 1 x i8> %0, <vscale x 1 x float> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv1i8_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.xu.f.qf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv1i8.nxv1f32.iXLen(
@@ -64,10 +64,10 @@ declare <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv2i8.nxv2f32.iXLen(
 define <vscale x 2 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv2i8_nxv2f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv2i8_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.xu.f.qf v9, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -90,10 +90,10 @@ declare <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv2i8.nxv2f32.iX
 define <vscale x 2 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv2i8_nxv2f32(<vscale x 2 x i8> %0, <vscale x 2 x float> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv2i8_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.xu.f.qf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv2i8.nxv2f32.iXLen(
@@ -115,10 +115,10 @@ declare <vscale x 4 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv4i8.nxv4f32.iXLen(
 define <vscale x 4 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv4i8_nxv4f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv4i8_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.xu.f.qf v10, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -141,10 +141,10 @@ declare <vscale x 4 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv4i8.nxv4f32.iX
 define <vscale x 4 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv4i8_nxv4f32(<vscale x 4 x i8> %0, <vscale x 4 x float> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv4i8_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.xu.f.qf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv4i8.nxv4f32.iXLen(
@@ -166,10 +166,10 @@ declare <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv8i8.nxv8f32.iXLen(
 define <vscale x 8 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv8i8_nxv8f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv8i8_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.xu.f.qf v12, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -192,10 +192,10 @@ declare <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv8i8.nxv8f32.iX
 define <vscale x 8 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv8i8_nxv8f32(<vscale x 8 x i8> %0, <vscale x 8 x float> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv8i8_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.xu.f.qf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv8i8.nxv8f32.iXLen(
@@ -217,10 +217,10 @@ declare <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv16i8.nxv16f32.iXLe
 define <vscale x 16 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv16i8_nxv16f32(<vscale x 16 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv16i8_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.xu.f.qf v16, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -243,10 +243,10 @@ declare <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv16i8.nxv16f32
 define <vscale x 16 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv16i8_nxv16f32(<vscale x 16 x i8> %0, <vscale x 16 x float> %1, float %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv16i8_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    sf.vfnrclip.xu.f.qf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv16i8.nxv16f32.iXLen(
diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
index 9046c861c336..618672344fe7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
@@ -5423,3 +5423,40 @@ vector.body:                                      ; preds = %vector.body, %entry
 for.cond.cleanup:                                 ; preds = %vector.body
   ret void
 }
+
+define void @sink_splat_select(ptr nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_select:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    li a3, 42
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:  .LBB117_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmseq.vx v0, v8, a3
+; CHECK-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    bne a0, a2, .LBB117_1
+; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, ptr %a, i64 %index
+  %load = load <4 x i32>, ptr %0, align 4
+  %cond = icmp eq <4 x i32> %load, splat (i32 42)
+  %1 = select <4 x i1> %cond, <4 x i32> %broadcast.splat, <4 x i32> %load
+  store <4 x i32> %1, ptr %0, align 4
+  %index.next = add nuw i64 %index, 4
+  %2 = icmp eq i64 %index.next, 1024
+  br i1 %2, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll b/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll
index e9259b3a1d92..1bfc0f432eb5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll
@@ -631,3 +631,35 @@ define float @vreduce_fmax_v4f32_false_mask(float %start, <4 x float> %val, i32
   %s = call float @llvm.vp.reduce.fmax.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl)
   ret float %s
 }
+
+define float @vreduce_fminimum_v4f32_zero_evl(float %start, <4 x float> %val, <4 x i1> %m) {
+; CHECK-LABEL: vreduce_fminimum_v4f32_zero_evl:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret
+  %s = call float @llvm.vp.reduce.fminimum.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 0)
+  ret float %s
+}
+
+define float @vreduce_fminimum_v4f32_false_mask(float %start, <4 x float> %val, i32 %evl) {
+; CHECK-LABEL: vreduce_fminimum_v4f32_false_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret
+  %s = call float @llvm.vp.reduce.fminimum.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl)
+  ret float %s
+}
+
+define float @vreduce_fmaximum_v4f32_zero_evl(float %start, <4 x float> %val, <4 x i1> %m) {
+; CHECK-LABEL: vreduce_fmaximum_v4f32_zero_evl:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret
+  %s = call float @llvm.vp.reduce.fmaximum.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 0)
+  ret float %s
+}
+
+define float @vreduce_fmaximum_v4f32_false_mask(float %start, <4 x float> %val, i32 %evl) {
+; CHECK-LABEL: vreduce_fmaximum_v4f32_false_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret
+  %s = call float @llvm.vp.reduce.fmaximum.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl)
+  ret float %s
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll b/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll
index 2926a23c8b27..25e3468dcb62 100644
--- a/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll
@@ -110,8 +110,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vaadd.rm.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vaadd_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    vaadd.vv v8, v9, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -133,8 +133,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vaaddu.rm.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vaaddu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v9, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -199,8 +199,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vasub.rm.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vasub_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    vasub.vv v8, v9, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -222,8 +222,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vasubu.rm.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vasubu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    vasubu.vv v8, v9, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -950,8 +950,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vnclip.nxv1i8.nxv1i16.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vnclip_wv_nxv1i8_nxv1i16_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv1i8_nxv1i16_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    vnclip.wv v8, v9, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -974,8 +974,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vnclipu.nxv1i8.nxv1i16.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vnclipu_wv_nxv1i8_nxv1i16_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv1i8_nxv1i16_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    vnclipu.wv v8, v9, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -1271,8 +1271,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -1301,16 +1301,16 @@ define <vscale x 1 x i64> @intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
 ; RV32-NEXT:    csrwi vxrm, 0
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
 ; RV32-NEXT:    vsmul.vv v8, v9, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m1, tu, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, tu, ma
 ; RV64-NEXT:    vsmul.vx v8, v9, a0
 ; RV64-NEXT:    ret
 entry:
@@ -1376,8 +1376,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssra.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vssra_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vssra_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    vssra.vv v8, v9, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -1400,8 +1400,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssrl.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vssrl_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vssrl_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; CHECK-NEXT:    vssrl.vv v8, v9, v10
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vaadd.ll b/llvm/test/CodeGen/RISCV/rvv/vaadd.ll
index 82cd4bf162b9..096e60b6285f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vaadd.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vaadd.ll
@@ -13,8 +13,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vaadd_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -37,8 +37,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vaadd.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vaadd_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -61,8 +61,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vaadd.nxv2i8.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vaadd_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -85,8 +85,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vaadd.mask.nxv2i8.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vaadd_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -109,8 +109,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vaadd.nxv4i8.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vaadd_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -133,8 +133,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vaadd.mask.nxv4i8.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vaadd_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -157,8 +157,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vaadd.nxv8i8.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vaadd_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -181,8 +181,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vaadd.mask.nxv8i8.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vaadd_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -205,8 +205,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vaadd.nxv16i8.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vaadd_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -229,8 +229,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vaadd.mask.nxv16i8.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vaadd_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -253,8 +253,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vaadd.nxv32i8.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vaadd_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -277,8 +277,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vaadd.mask.nxv32i8.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vaadd_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -301,8 +301,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vaadd.nxv64i8.nxv64i8(
 define <vscale x 64 x i8> @intrinsic_vaadd_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -326,8 +326,8 @@ define <vscale x 64 x i8> @intrinsic_vaadd_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vsca
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -350,8 +350,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vaadd.nxv1i16.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vaadd_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -374,8 +374,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vaadd.mask.nxv1i16.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vaadd_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -398,8 +398,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vaadd.nxv2i16.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vaadd_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -422,8 +422,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vaadd.mask.nxv2i16.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vaadd_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -446,8 +446,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vaadd.nxv4i16.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vaadd_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -470,8 +470,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vaadd.mask.nxv4i16.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vaadd_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -494,8 +494,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vaadd.nxv8i16.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vaadd_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -518,8 +518,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vaadd.mask.nxv8i16.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vaadd_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -542,8 +542,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vaadd.nxv16i16.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vaadd_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -566,8 +566,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vaadd.mask.nxv16i16.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vaadd_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -590,8 +590,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vaadd.nxv32i16.nxv32i16(
 define <vscale x 32 x i16> @intrinsic_vaadd_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -615,8 +615,8 @@ define <vscale x 32 x i16> @intrinsic_vaadd_mask_vv_nxv32i16_nxv32i16_nxv32i16(<
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -639,8 +639,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vaadd.nxv1i32.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vaadd_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -663,8 +663,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vaadd.mask.nxv1i32.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vaadd_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -687,8 +687,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vaadd.nxv2i32.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vaadd_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -711,8 +711,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vaadd.mask.nxv2i32.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vaadd_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -735,8 +735,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vaadd.nxv4i32.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vaadd_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -759,8 +759,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vaadd.mask.nxv4i32.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vaadd_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -783,8 +783,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vaadd.nxv8i32.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vaadd_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -807,8 +807,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vaadd.mask.nxv8i32.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vaadd_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -831,8 +831,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vaadd.nxv16i32.nxv16i32(
 define <vscale x 16 x i32> @intrinsic_vaadd_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -856,8 +856,8 @@ define <vscale x 16 x i32> @intrinsic_vaadd_mask_vv_nxv16i32_nxv16i32_nxv16i32(<
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -880,8 +880,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vaadd.nxv1i64.nxv1i64(
 define <vscale x 1 x i64> @intrinsic_vaadd_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -904,8 +904,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vaadd.mask.nxv1i64.nxv1i64(
 define <vscale x 1 x i64> @intrinsic_vaadd_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -928,8 +928,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vaadd.nxv2i64.nxv2i64(
 define <vscale x 2 x i64> @intrinsic_vaadd_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -952,8 +952,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vaadd.mask.nxv2i64.nxv2i64(
 define <vscale x 2 x i64> @intrinsic_vaadd_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -976,8 +976,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vaadd.nxv4i64.nxv4i64(
 define <vscale x 4 x i64> @intrinsic_vaadd_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -1000,8 +1000,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vaadd.mask.nxv4i64.nxv4i64(
 define <vscale x 4 x i64> @intrinsic_vaadd_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1024,8 +1024,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vaadd.nxv8i64.nxv8i64(
 define <vscale x 8 x i64> @intrinsic_vaadd_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -1049,8 +1049,8 @@ define <vscale x 8 x i64> @intrinsic_vaadd_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vsca
 ; CHECK-LABEL: intrinsic_vaadd_mask_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    vaadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1073,8 +1073,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.i8(
 define <vscale x 1 x i8> @intrinsic_vaadd_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1097,8 +1097,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vaadd.mask.nxv1i8.i8(
 define <vscale x 1 x i8> @intrinsic_vaadd_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1121,8 +1121,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vaadd.nxv2i8.i8(
 define <vscale x 2 x i8> @intrinsic_vaadd_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1145,8 +1145,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vaadd.mask.nxv2i8.i8(
 define <vscale x 2 x i8> @intrinsic_vaadd_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1169,8 +1169,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vaadd.nxv4i8.i8(
 define <vscale x 4 x i8> @intrinsic_vaadd_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1193,8 +1193,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vaadd.mask.nxv4i8.i8(
 define <vscale x 4 x i8> @intrinsic_vaadd_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1217,8 +1217,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vaadd.nxv8i8.i8(
 define <vscale x 8 x i8> @intrinsic_vaadd_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1241,8 +1241,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vaadd.mask.nxv8i8.i8(
 define <vscale x 8 x i8> @intrinsic_vaadd_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1265,8 +1265,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vaadd.nxv16i8.i8(
 define <vscale x 16 x i8> @intrinsic_vaadd_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1289,8 +1289,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vaadd.mask.nxv16i8.i8(
 define <vscale x 16 x i8> @intrinsic_vaadd_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1313,8 +1313,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vaadd.nxv32i8.i8(
 define <vscale x 32 x i8> @intrinsic_vaadd_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1337,8 +1337,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vaadd.mask.nxv32i8.i8(
 define <vscale x 32 x i8> @intrinsic_vaadd_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1361,8 +1361,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vaadd.nxv64i8.i8(
 define <vscale x 64 x i8> @intrinsic_vaadd_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1385,8 +1385,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vaadd.mask.nxv64i8.i8(
 define <vscale x 64 x i8> @intrinsic_vaadd_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1409,8 +1409,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vaadd.nxv1i16.i16(
 define <vscale x 1 x i16> @intrinsic_vaadd_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1433,8 +1433,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vaadd.mask.nxv1i16.i16(
 define <vscale x 1 x i16> @intrinsic_vaadd_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1457,8 +1457,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vaadd.nxv2i16.i16(
 define <vscale x 2 x i16> @intrinsic_vaadd_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1481,8 +1481,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vaadd.mask.nxv2i16.i16(
 define <vscale x 2 x i16> @intrinsic_vaadd_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1505,8 +1505,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vaadd.nxv4i16.i16(
 define <vscale x 4 x i16> @intrinsic_vaadd_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1529,8 +1529,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vaadd.mask.nxv4i16.i16(
 define <vscale x 4 x i16> @intrinsic_vaadd_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1553,8 +1553,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vaadd.nxv8i16.i16(
 define <vscale x 8 x i16> @intrinsic_vaadd_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1577,8 +1577,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vaadd.mask.nxv8i16.i16(
 define <vscale x 8 x i16> @intrinsic_vaadd_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1601,8 +1601,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vaadd.nxv16i16.i16(
 define <vscale x 16 x i16> @intrinsic_vaadd_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1625,8 +1625,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vaadd.mask.nxv16i16.i16(
 define <vscale x 16 x i16> @intrinsic_vaadd_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1649,8 +1649,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vaadd.nxv32i16.i16(
 define <vscale x 32 x i16> @intrinsic_vaadd_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1673,8 +1673,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vaadd.mask.nxv32i16.i16(
 define <vscale x 32 x i16> @intrinsic_vaadd_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1697,8 +1697,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vaadd.nxv1i32.i32(
 define <vscale x 1 x i32> @intrinsic_vaadd_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1721,8 +1721,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vaadd.mask.nxv1i32.i32(
 define <vscale x 1 x i32> @intrinsic_vaadd_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1745,8 +1745,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vaadd.nxv2i32.i32(
 define <vscale x 2 x i32> @intrinsic_vaadd_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1769,8 +1769,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vaadd.mask.nxv2i32.i32(
 define <vscale x 2 x i32> @intrinsic_vaadd_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1793,8 +1793,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vaadd.nxv4i32.i32(
 define <vscale x 4 x i32> @intrinsic_vaadd_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1817,8 +1817,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vaadd.mask.nxv4i32.i32(
 define <vscale x 4 x i32> @intrinsic_vaadd_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1841,8 +1841,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vaadd.nxv8i32.i32(
 define <vscale x 8 x i32> @intrinsic_vaadd_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1865,8 +1865,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vaadd.mask.nxv8i32.i32(
 define <vscale x 8 x i32> @intrinsic_vaadd_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1889,8 +1889,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vaadd.nxv16i32.i32(
 define <vscale x 16 x i32> @intrinsic_vaadd_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vaadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1913,8 +1913,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vaadd.mask.nxv16i32.i32(
 define <vscale x 16 x i32> @intrinsic_vaadd_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaadd_mask_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vaadd.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1950,8 +1950,8 @@ define <vscale x 1 x i64> @intrinsic_vaadd_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x
 ;
 ; RV64-LABEL: intrinsic_vaadd_vx_nxv1i64_nxv1i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; RV64-NEXT:    vaadd.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -1987,8 +1987,8 @@ define <vscale x 1 x i64> @intrinsic_vaadd_mask_vx_nxv1i64_nxv1i64_i64(<vscale x
 ;
 ; RV64-LABEL: intrinsic_vaadd_mask_vx_nxv1i64_nxv1i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
 ; RV64-NEXT:    vaadd.vx v8, v9, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
@@ -2024,8 +2024,8 @@ define <vscale x 2 x i64> @intrinsic_vaadd_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x
 ;
 ; RV64-LABEL: intrinsic_vaadd_vx_nxv2i64_nxv2i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; RV64-NEXT:    vaadd.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -2061,8 +2061,8 @@ define <vscale x 2 x i64> @intrinsic_vaadd_mask_vx_nxv2i64_nxv2i64_i64(<vscale x
 ;
 ; RV64-LABEL: intrinsic_vaadd_mask_vx_nxv2i64_nxv2i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
 ; RV64-NEXT:    vaadd.vx v8, v10, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
@@ -2098,8 +2098,8 @@ define <vscale x 4 x i64> @intrinsic_vaadd_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x
 ;
 ; RV64-LABEL: intrinsic_vaadd_vx_nxv4i64_nxv4i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; RV64-NEXT:    vaadd.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -2135,8 +2135,8 @@ define <vscale x 4 x i64> @intrinsic_vaadd_mask_vx_nxv4i64_nxv4i64_i64(<vscale x
 ;
 ; RV64-LABEL: intrinsic_vaadd_mask_vx_nxv4i64_nxv4i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
 ; RV64-NEXT:    vaadd.vx v8, v12, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
@@ -2172,8 +2172,8 @@ define <vscale x 8 x i64> @intrinsic_vaadd_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x
 ;
 ; RV64-LABEL: intrinsic_vaadd_vx_nxv8i64_nxv8i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vaadd.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -2209,8 +2209,8 @@ define <vscale x 8 x i64> @intrinsic_vaadd_mask_vx_nxv8i64_nxv8i64_i64(<vscale x
 ;
 ; RV64-LABEL: intrinsic_vaadd_mask_vx_nxv8i64_nxv8i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; RV64-NEXT:    vaadd.vx v8, v16, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vaaddu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vaaddu-sdnode.ll
index 5b14014a252f..dd2c14b037ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vaaddu-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vaaddu-sdnode.ll
@@ -5,8 +5,8 @@
 define <vscale x 8 x i8> @vaaddu_vv_nxv8i8_floor(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y) {
 ; CHECK-LABEL: vaaddu_vv_nxv8i8_floor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i8> %x to <vscale x 8 x i16>
@@ -20,8 +20,8 @@ define <vscale x 8 x i8> @vaaddu_vv_nxv8i8_floor(<vscale x 8 x i8> %x, <vscale x
 define <vscale x 8 x i8> @vaaddu_vx_nxv8i8_floor(<vscale x 8 x i8> %x, i8 %y) {
 ; CHECK-LABEL: vaaddu_vx_nxv8i8_floor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i8> %x to <vscale x 8 x i16>
@@ -52,8 +52,8 @@ define <vscale x 8 x i8> @vaaddu_vv_nxv8i8_floor_sexti16(<vscale x 8 x i8> %x, <
 define <vscale x 8 x i8> @vaaddu_vv_nxv8i8_floor_zexti32(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y) {
 ; CHECK-LABEL: vaaddu_vv_nxv8i8_floor_zexti32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i8> %x to <vscale x 8 x i32>
@@ -82,8 +82,8 @@ define <vscale x 8 x i8> @vaaddu_vv_nxv8i8_floor_lshr2(<vscale x 8 x i8> %x, <vs
 define <vscale x 8 x i16> @vaaddu_vv_nxv8i16_floor(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
 ; CHECK-LABEL: vaaddu_vv_nxv8i16_floor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i16> %x to <vscale x 8 x i32>
@@ -97,8 +97,8 @@ define <vscale x 8 x i16> @vaaddu_vv_nxv8i16_floor(<vscale x 8 x i16> %x, <vscal
 define <vscale x 8 x i16> @vaaddu_vx_nxv8i16_floor(<vscale x 8 x i16> %x, i16 %y) {
 ; CHECK-LABEL: vaaddu_vx_nxv8i16_floor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i16> %x to <vscale x 8 x i32>
@@ -114,8 +114,8 @@ define <vscale x 8 x i16> @vaaddu_vx_nxv8i16_floor(<vscale x 8 x i16> %x, i16 %y
 define <vscale x 8 x i32> @vaaddu_vv_nxv8i32_floor(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y) {
 ; CHECK-LABEL: vaaddu_vv_nxv8i32_floor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i32> %x to <vscale x 8 x i64>
@@ -129,8 +129,8 @@ define <vscale x 8 x i32> @vaaddu_vv_nxv8i32_floor(<vscale x 8 x i32> %x, <vscal
 define <vscale x 8 x i32> @vaaddu_vx_nxv8i32_floor(<vscale x 8 x i32> %x, i32 %y) {
 ; CHECK-LABEL: vaaddu_vx_nxv8i32_floor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i32> %x to <vscale x 8 x i64>
@@ -146,8 +146,8 @@ define <vscale x 8 x i32> @vaaddu_vx_nxv8i32_floor(<vscale x 8 x i32> %x, i32 %y
 define <vscale x 8 x i64> @vaaddu_vv_nxv8i64_floor(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y) {
 ; CHECK-LABEL: vaaddu_vv_nxv8i64_floor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i64> %x to <vscale x 8 x i128>
@@ -175,8 +175,8 @@ define <vscale x 8 x i64> @vaaddu_vx_nxv8i64_floor(<vscale x 8 x i64> %x, i64 %y
 ;
 ; RV64-LABEL: vaaddu_vx_nxv8i64_floor:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 2
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vaaddu.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %xzv = zext <vscale x 8 x i64> %x to <vscale x 8 x i128>
@@ -192,8 +192,8 @@ define <vscale x 8 x i64> @vaaddu_vx_nxv8i64_floor(<vscale x 8 x i64> %x, i64 %y
 define <vscale x 8 x i8> @vaaddu_vv_nxv8i8_ceil(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y) {
 ; CHECK-LABEL: vaaddu_vv_nxv8i8_ceil:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i8> %x to <vscale x 8 x i16>
@@ -208,8 +208,8 @@ define <vscale x 8 x i8> @vaaddu_vv_nxv8i8_ceil(<vscale x 8 x i8> %x, <vscale x
 define <vscale x 8 x i8> @vaaddu_vx_nxv8i8_ceil(<vscale x 8 x i8> %x, i8 %y) {
 ; CHECK-LABEL: vaaddu_vx_nxv8i8_ceil:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i8> %x to <vscale x 8 x i16>
@@ -245,8 +245,8 @@ define <vscale x 8 x i8> @vaaddu_vv_nxv8i8_ceil_sexti16(<vscale x 8 x i8> %x, <v
 define <vscale x 8 x i8> @vaaddu_vv_nxv8i8_ceil_zexti32(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y) {
 ; CHECK-LABEL: vaaddu_vv_nxv8i8_ceil_zexti32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i8> %x to <vscale x 8 x i32>
@@ -299,8 +299,8 @@ define <vscale x 8 x i8> @vaaddu_vv_nxv8i8_ceil_add2(<vscale x 8 x i8> %x, <vsca
 define <vscale x 8 x i16> @vaaddu_vv_nxv8i16_ceil(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
 ; CHECK-LABEL: vaaddu_vv_nxv8i16_ceil:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i16> %x to <vscale x 8 x i32>
@@ -315,8 +315,8 @@ define <vscale x 8 x i16> @vaaddu_vv_nxv8i16_ceil(<vscale x 8 x i16> %x, <vscale
 define <vscale x 8 x i16> @vaaddu_vx_nxv8i16_ceil(<vscale x 8 x i16> %x, i16 %y) {
 ; CHECK-LABEL: vaaddu_vx_nxv8i16_ceil:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i16> %x to <vscale x 8 x i32>
@@ -333,8 +333,8 @@ define <vscale x 8 x i16> @vaaddu_vx_nxv8i16_ceil(<vscale x 8 x i16> %x, i16 %y)
 define <vscale x 8 x i32> @vaaddu_vv_nxv8i32_ceil(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y) {
 ; CHECK-LABEL: vaaddu_vv_nxv8i32_ceil:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i32> %x to <vscale x 8 x i64>
@@ -349,8 +349,8 @@ define <vscale x 8 x i32> @vaaddu_vv_nxv8i32_ceil(<vscale x 8 x i32> %x, <vscale
 define <vscale x 8 x i32> @vaaddu_vx_nxv8i32_ceil(<vscale x 8 x i32> %x, i32 %y) {
 ; CHECK-LABEL: vaaddu_vx_nxv8i32_ceil:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i32> %x to <vscale x 8 x i64>
@@ -367,8 +367,8 @@ define <vscale x 8 x i32> @vaaddu_vx_nxv8i32_ceil(<vscale x 8 x i32> %x, i32 %y)
 define <vscale x 8 x i64> @vaaddu_vv_nxv8i64_ceil(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y) {
 ; CHECK-LABEL: vaaddu_vv_nxv8i64_ceil:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %xzv = zext <vscale x 8 x i64> %x to <vscale x 8 x i128>
@@ -397,8 +397,8 @@ define <vscale x 8 x i64> @vaaddu_vx_nxv8i64_ceil(<vscale x 8 x i64> %x, i64 %y)
 ;
 ; RV64-LABEL: vaaddu_vx_nxv8i64_ceil:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vaaddu.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %xzv = zext <vscale x 8 x i64> %x to <vscale x 8 x i128>
diff --git a/llvm/test/CodeGen/RISCV/rvv/vaaddu.ll b/llvm/test/CodeGen/RISCV/rvv/vaaddu.ll
index eba87d7061d3..a15a1932360a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vaaddu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vaaddu.ll
@@ -13,8 +13,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vaaddu.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vaaddu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -37,8 +37,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vaaddu.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vaaddu_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -61,8 +61,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vaaddu.nxv2i8.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vaaddu_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -85,8 +85,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vaaddu.mask.nxv2i8.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vaaddu_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -109,8 +109,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vaaddu.nxv4i8.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vaaddu_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -133,8 +133,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vaaddu.mask.nxv4i8.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vaaddu_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -157,8 +157,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vaaddu.nxv8i8.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vaaddu_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -181,8 +181,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vaaddu.mask.nxv8i8.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vaaddu_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -205,8 +205,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vaaddu.nxv16i8.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vaaddu_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -229,8 +229,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vaaddu.mask.nxv16i8.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vaaddu_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -253,8 +253,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vaaddu.nxv32i8.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vaaddu_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -277,8 +277,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vaaddu.mask.nxv32i8.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vaaddu_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -301,8 +301,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vaaddu.nxv64i8.nxv64i8(
 define <vscale x 64 x i8> @intrinsic_vaaddu_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -326,8 +326,8 @@ define <vscale x 64 x i8> @intrinsic_vaaddu_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vsc
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -350,8 +350,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vaaddu.nxv1i16.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vaaddu_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -374,8 +374,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vaaddu.mask.nxv1i16.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vaaddu_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -398,8 +398,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vaaddu.nxv2i16.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vaaddu_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -422,8 +422,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vaaddu.mask.nxv2i16.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vaaddu_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -446,8 +446,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vaaddu.nxv4i16.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vaaddu_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -470,8 +470,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vaaddu.mask.nxv4i16.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vaaddu_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -494,8 +494,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vaaddu.nxv8i16.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vaaddu_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -518,8 +518,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vaaddu.mask.nxv8i16.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vaaddu_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -542,8 +542,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vaaddu.nxv16i16.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vaaddu_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -566,8 +566,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vaaddu.mask.nxv16i16.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vaaddu_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -590,8 +590,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vaaddu.nxv32i16.nxv32i16(
 define <vscale x 32 x i16> @intrinsic_vaaddu_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -615,8 +615,8 @@ define <vscale x 32 x i16> @intrinsic_vaaddu_mask_vv_nxv32i16_nxv32i16_nxv32i16(
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -639,8 +639,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vaaddu.nxv1i32.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vaaddu_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -663,8 +663,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vaaddu.mask.nxv1i32.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vaaddu_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -687,8 +687,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vaaddu.nxv2i32.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vaaddu_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -711,8 +711,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vaaddu.mask.nxv2i32.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vaaddu_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -735,8 +735,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vaaddu.nxv4i32.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vaaddu_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -759,8 +759,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vaaddu.mask.nxv4i32.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vaaddu_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -783,8 +783,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vaaddu.nxv8i32.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vaaddu_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -807,8 +807,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vaaddu.mask.nxv8i32.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vaaddu_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -831,8 +831,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vaaddu.nxv16i32.nxv16i32(
 define <vscale x 16 x i32> @intrinsic_vaaddu_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -856,8 +856,8 @@ define <vscale x 16 x i32> @intrinsic_vaaddu_mask_vv_nxv16i32_nxv16i32_nxv16i32(
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -880,8 +880,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vaaddu.nxv1i64.nxv1i64(
 define <vscale x 1 x i64> @intrinsic_vaaddu_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -904,8 +904,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vaaddu.mask.nxv1i64.nxv1i64(
 define <vscale x 1 x i64> @intrinsic_vaaddu_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -928,8 +928,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vaaddu.nxv2i64.nxv2i64(
 define <vscale x 2 x i64> @intrinsic_vaaddu_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -952,8 +952,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vaaddu.mask.nxv2i64.nxv2i64(
 define <vscale x 2 x i64> @intrinsic_vaaddu_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -976,8 +976,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vaaddu.nxv4i64.nxv4i64(
 define <vscale x 4 x i64> @intrinsic_vaaddu_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -1000,8 +1000,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vaaddu.mask.nxv4i64.nxv4i64(
 define <vscale x 4 x i64> @intrinsic_vaaddu_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1024,8 +1024,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vaaddu.nxv8i64.nxv8i64(
 define <vscale x 8 x i64> @intrinsic_vaaddu_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vaaddu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -1049,8 +1049,8 @@ define <vscale x 8 x i64> @intrinsic_vaaddu_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vsc
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    vaaddu.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1073,8 +1073,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vaaddu.nxv1i8.i8(
 define <vscale x 1 x i8> @intrinsic_vaaddu_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1097,8 +1097,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vaaddu.mask.nxv1i8.i8(
 define <vscale x 1 x i8> @intrinsic_vaaddu_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1121,8 +1121,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vaaddu.nxv2i8.i8(
 define <vscale x 2 x i8> @intrinsic_vaaddu_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1145,8 +1145,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vaaddu.mask.nxv2i8.i8(
 define <vscale x 2 x i8> @intrinsic_vaaddu_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1169,8 +1169,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vaaddu.nxv4i8.i8(
 define <vscale x 4 x i8> @intrinsic_vaaddu_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1193,8 +1193,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vaaddu.mask.nxv4i8.i8(
 define <vscale x 4 x i8> @intrinsic_vaaddu_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1217,8 +1217,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vaaddu.nxv8i8.i8(
 define <vscale x 8 x i8> @intrinsic_vaaddu_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1241,8 +1241,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vaaddu.mask.nxv8i8.i8(
 define <vscale x 8 x i8> @intrinsic_vaaddu_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1265,8 +1265,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vaaddu.nxv16i8.i8(
 define <vscale x 16 x i8> @intrinsic_vaaddu_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1289,8 +1289,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vaaddu.mask.nxv16i8.i8(
 define <vscale x 16 x i8> @intrinsic_vaaddu_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1313,8 +1313,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vaaddu.nxv32i8.i8(
 define <vscale x 32 x i8> @intrinsic_vaaddu_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1337,8 +1337,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vaaddu.mask.nxv32i8.i8(
 define <vscale x 32 x i8> @intrinsic_vaaddu_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1361,8 +1361,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vaaddu.nxv64i8.i8(
 define <vscale x 64 x i8> @intrinsic_vaaddu_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1385,8 +1385,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vaaddu.mask.nxv64i8.i8(
 define <vscale x 64 x i8> @intrinsic_vaaddu_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1409,8 +1409,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vaaddu.nxv1i16.i16(
 define <vscale x 1 x i16> @intrinsic_vaaddu_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1433,8 +1433,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vaaddu.mask.nxv1i16.i16(
 define <vscale x 1 x i16> @intrinsic_vaaddu_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1457,8 +1457,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vaaddu.nxv2i16.i16(
 define <vscale x 2 x i16> @intrinsic_vaaddu_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1481,8 +1481,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vaaddu.mask.nxv2i16.i16(
 define <vscale x 2 x i16> @intrinsic_vaaddu_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1505,8 +1505,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vaaddu.nxv4i16.i16(
 define <vscale x 4 x i16> @intrinsic_vaaddu_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1529,8 +1529,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vaaddu.mask.nxv4i16.i16(
 define <vscale x 4 x i16> @intrinsic_vaaddu_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1553,8 +1553,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vaaddu.nxv8i16.i16(
 define <vscale x 8 x i16> @intrinsic_vaaddu_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1577,8 +1577,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vaaddu.mask.nxv8i16.i16(
 define <vscale x 8 x i16> @intrinsic_vaaddu_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1601,8 +1601,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vaaddu.nxv16i16.i16(
 define <vscale x 16 x i16> @intrinsic_vaaddu_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1625,8 +1625,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vaaddu.mask.nxv16i16.i16(
 define <vscale x 16 x i16> @intrinsic_vaaddu_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1649,8 +1649,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vaaddu.nxv32i16.i16(
 define <vscale x 32 x i16> @intrinsic_vaaddu_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1673,8 +1673,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vaaddu.mask.nxv32i16.i16(
 define <vscale x 32 x i16> @intrinsic_vaaddu_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1697,8 +1697,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vaaddu.nxv1i32.i32(
 define <vscale x 1 x i32> @intrinsic_vaaddu_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1721,8 +1721,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vaaddu.mask.nxv1i32.i32(
 define <vscale x 1 x i32> @intrinsic_vaaddu_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1745,8 +1745,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vaaddu.nxv2i32.i32(
 define <vscale x 2 x i32> @intrinsic_vaaddu_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1769,8 +1769,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vaaddu.mask.nxv2i32.i32(
 define <vscale x 2 x i32> @intrinsic_vaaddu_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1793,8 +1793,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vaaddu.nxv4i32.i32(
 define <vscale x 4 x i32> @intrinsic_vaaddu_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1817,8 +1817,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vaaddu.mask.nxv4i32.i32(
 define <vscale x 4 x i32> @intrinsic_vaaddu_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1841,8 +1841,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vaaddu.nxv8i32.i32(
 define <vscale x 8 x i32> @intrinsic_vaaddu_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1865,8 +1865,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vaaddu.mask.nxv8i32.i32(
 define <vscale x 8 x i32> @intrinsic_vaaddu_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1889,8 +1889,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vaaddu.nxv16i32.i32(
 define <vscale x 16 x i32> @intrinsic_vaaddu_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vaaddu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1913,8 +1913,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vaaddu.mask.nxv16i32.i32(
 define <vscale x 16 x i32> @intrinsic_vaaddu_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vaaddu_mask_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vaaddu.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1950,8 +1950,8 @@ define <vscale x 1 x i64> @intrinsic_vaaddu_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x
 ;
 ; RV64-LABEL: intrinsic_vaaddu_vx_nxv1i64_nxv1i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; RV64-NEXT:    vaaddu.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -1987,8 +1987,8 @@ define <vscale x 1 x i64> @intrinsic_vaaddu_mask_vx_nxv1i64_nxv1i64_i64(<vscale
 ;
 ; RV64-LABEL: intrinsic_vaaddu_mask_vx_nxv1i64_nxv1i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 1
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
 ; RV64-NEXT:    vaaddu.vx v8, v9, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
@@ -2024,8 +2024,8 @@ define <vscale x 2 x i64> @intrinsic_vaaddu_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x
 ;
 ; RV64-LABEL: intrinsic_vaaddu_vx_nxv2i64_nxv2i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; RV64-NEXT:    vaaddu.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -2061,8 +2061,8 @@ define <vscale x 2 x i64> @intrinsic_vaaddu_mask_vx_nxv2i64_nxv2i64_i64(<vscale
 ;
 ; RV64-LABEL: intrinsic_vaaddu_mask_vx_nxv2i64_nxv2i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 1
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
 ; RV64-NEXT:    vaaddu.vx v8, v10, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
@@ -2098,8 +2098,8 @@ define <vscale x 4 x i64> @intrinsic_vaaddu_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x
 ;
 ; RV64-LABEL: intrinsic_vaaddu_vx_nxv4i64_nxv4i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; RV64-NEXT:    vaaddu.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -2135,8 +2135,8 @@ define <vscale x 4 x i64> @intrinsic_vaaddu_mask_vx_nxv4i64_nxv4i64_i64(<vscale
 ;
 ; RV64-LABEL: intrinsic_vaaddu_mask_vx_nxv4i64_nxv4i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 1
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
 ; RV64-NEXT:    vaaddu.vx v8, v12, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
@@ -2172,8 +2172,8 @@ define <vscale x 8 x i64> @intrinsic_vaaddu_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x
 ;
 ; RV64-LABEL: intrinsic_vaaddu_vx_nxv8i64_nxv8i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vaaddu.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -2209,8 +2209,8 @@ define <vscale x 8 x i64> @intrinsic_vaaddu_mask_vx_nxv8i64_nxv8i64_i64(<vscale
 ;
 ; RV64-LABEL: intrinsic_vaaddu_mask_vx_nxv8i64_nxv8i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 1
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; RV64-NEXT:    vaaddu.vx v8, v16, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vasub.ll b/llvm/test/CodeGen/RISCV/rvv/vasub.ll
index d69910efb0ee..c3cb66f7f230 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vasub.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vasub.ll
@@ -13,8 +13,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vasub.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vasub_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -37,8 +37,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vasub.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vasub_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -61,8 +61,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vasub.nxv2i8.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vasub_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -85,8 +85,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vasub.mask.nxv2i8.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vasub_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -109,8 +109,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vasub.nxv4i8.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vasub_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -133,8 +133,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vasub.mask.nxv4i8.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vasub_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -157,8 +157,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vasub.nxv8i8.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vasub_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -181,8 +181,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vasub.mask.nxv8i8.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vasub_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -205,8 +205,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vasub.nxv16i8.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vasub_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -229,8 +229,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vasub.mask.nxv16i8.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vasub_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -253,8 +253,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vasub.nxv32i8.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vasub_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -277,8 +277,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vasub.mask.nxv32i8.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vasub_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -301,8 +301,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vasub.nxv64i8.nxv64i8(
 define <vscale x 64 x i8> @intrinsic_vasub_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -326,8 +326,8 @@ define <vscale x 64 x i8> @intrinsic_vasub_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vsca
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -350,8 +350,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vasub.nxv1i16.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vasub_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -374,8 +374,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vasub.mask.nxv1i16.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vasub_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -398,8 +398,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vasub.nxv2i16.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vasub_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -422,8 +422,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vasub.mask.nxv2i16.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vasub_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -446,8 +446,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vasub.nxv4i16.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vasub_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -470,8 +470,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vasub.mask.nxv4i16.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vasub_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -494,8 +494,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vasub.nxv8i16.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vasub_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -518,8 +518,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vasub.mask.nxv8i16.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vasub_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -542,8 +542,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vasub.nxv16i16.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vasub_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -566,8 +566,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vasub.mask.nxv16i16.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vasub_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -590,8 +590,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vasub.nxv32i16.nxv32i16(
 define <vscale x 32 x i16> @intrinsic_vasub_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -615,8 +615,8 @@ define <vscale x 32 x i16> @intrinsic_vasub_mask_vv_nxv32i16_nxv32i16_nxv32i16(<
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -639,8 +639,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vasub.nxv1i32.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vasub_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -663,8 +663,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vasub.mask.nxv1i32.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vasub_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -687,8 +687,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vasub.nxv2i32.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vasub_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -711,8 +711,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vasub.mask.nxv2i32.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vasub_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -735,8 +735,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vasub.nxv4i32.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vasub_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -759,8 +759,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vasub.mask.nxv4i32.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vasub_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -783,8 +783,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vasub.nxv8i32.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vasub_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -807,8 +807,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vasub.mask.nxv8i32.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vasub_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -831,8 +831,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vasub.nxv16i32.nxv16i32(
 define <vscale x 16 x i32> @intrinsic_vasub_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -856,8 +856,8 @@ define <vscale x 16 x i32> @intrinsic_vasub_mask_vv_nxv16i32_nxv16i32_nxv16i32(<
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -880,8 +880,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vasub.nxv1i64.nxv1i64(
 define <vscale x 1 x i64> @intrinsic_vasub_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -904,8 +904,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vasub.mask.nxv1i64.nxv1i64(
 define <vscale x 1 x i64> @intrinsic_vasub_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -928,8 +928,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vasub.nxv2i64.nxv2i64(
 define <vscale x 2 x i64> @intrinsic_vasub_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -952,8 +952,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vasub.mask.nxv2i64.nxv2i64(
 define <vscale x 2 x i64> @intrinsic_vasub_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -976,8 +976,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vasub.nxv4i64.nxv4i64(
 define <vscale x 4 x i64> @intrinsic_vasub_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -1000,8 +1000,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vasub.mask.nxv4i64.nxv4i64(
 define <vscale x 4 x i64> @intrinsic_vasub_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1024,8 +1024,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vasub.nxv8i64.nxv8i64(
 define <vscale x 8 x i64> @intrinsic_vasub_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vasub.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -1049,8 +1049,8 @@ define <vscale x 8 x i64> @intrinsic_vasub_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vsca
 ; CHECK-LABEL: intrinsic_vasub_mask_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    vasub.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1073,8 +1073,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vasub.nxv1i8.i8(
 define <vscale x 1 x i8> @intrinsic_vasub_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1097,8 +1097,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vasub.mask.nxv1i8.i8(
 define <vscale x 1 x i8> @intrinsic_vasub_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1121,8 +1121,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vasub.nxv2i8.i8(
 define <vscale x 2 x i8> @intrinsic_vasub_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1145,8 +1145,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vasub.mask.nxv2i8.i8(
 define <vscale x 2 x i8> @intrinsic_vasub_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1169,8 +1169,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vasub.nxv4i8.i8(
 define <vscale x 4 x i8> @intrinsic_vasub_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1193,8 +1193,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vasub.mask.nxv4i8.i8(
 define <vscale x 4 x i8> @intrinsic_vasub_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1217,8 +1217,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vasub.nxv8i8.i8(
 define <vscale x 8 x i8> @intrinsic_vasub_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1241,8 +1241,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vasub.mask.nxv8i8.i8(
 define <vscale x 8 x i8> @intrinsic_vasub_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1265,8 +1265,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vasub.nxv16i8.i8(
 define <vscale x 16 x i8> @intrinsic_vasub_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1289,8 +1289,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vasub.mask.nxv16i8.i8(
 define <vscale x 16 x i8> @intrinsic_vasub_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1313,8 +1313,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vasub.nxv32i8.i8(
 define <vscale x 32 x i8> @intrinsic_vasub_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1337,8 +1337,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vasub.mask.nxv32i8.i8(
 define <vscale x 32 x i8> @intrinsic_vasub_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1361,8 +1361,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vasub.nxv64i8.i8(
 define <vscale x 64 x i8> @intrinsic_vasub_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1385,8 +1385,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vasub.mask.nxv64i8.i8(
 define <vscale x 64 x i8> @intrinsic_vasub_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1409,8 +1409,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vasub.nxv1i16.i16(
 define <vscale x 1 x i16> @intrinsic_vasub_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1433,8 +1433,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vasub.mask.nxv1i16.i16(
 define <vscale x 1 x i16> @intrinsic_vasub_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1457,8 +1457,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vasub.nxv2i16.i16(
 define <vscale x 2 x i16> @intrinsic_vasub_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1481,8 +1481,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vasub.mask.nxv2i16.i16(
 define <vscale x 2 x i16> @intrinsic_vasub_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1505,8 +1505,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vasub.nxv4i16.i16(
 define <vscale x 4 x i16> @intrinsic_vasub_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1529,8 +1529,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vasub.mask.nxv4i16.i16(
 define <vscale x 4 x i16> @intrinsic_vasub_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1553,8 +1553,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vasub.nxv8i16.i16(
 define <vscale x 8 x i16> @intrinsic_vasub_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1577,8 +1577,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vasub.mask.nxv8i16.i16(
 define <vscale x 8 x i16> @intrinsic_vasub_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1601,8 +1601,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vasub.nxv16i16.i16(
 define <vscale x 16 x i16> @intrinsic_vasub_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1625,8 +1625,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vasub.mask.nxv16i16.i16(
 define <vscale x 16 x i16> @intrinsic_vasub_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1649,8 +1649,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vasub.nxv32i16.i16(
 define <vscale x 32 x i16> @intrinsic_vasub_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1673,8 +1673,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vasub.mask.nxv32i16.i16(
 define <vscale x 32 x i16> @intrinsic_vasub_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1697,8 +1697,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vasub.nxv1i32.i32(
 define <vscale x 1 x i32> @intrinsic_vasub_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1721,8 +1721,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vasub.mask.nxv1i32.i32(
 define <vscale x 1 x i32> @intrinsic_vasub_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1745,8 +1745,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vasub.nxv2i32.i32(
 define <vscale x 2 x i32> @intrinsic_vasub_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1769,8 +1769,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vasub.mask.nxv2i32.i32(
 define <vscale x 2 x i32> @intrinsic_vasub_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1793,8 +1793,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vasub.nxv4i32.i32(
 define <vscale x 4 x i32> @intrinsic_vasub_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1817,8 +1817,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vasub.mask.nxv4i32.i32(
 define <vscale x 4 x i32> @intrinsic_vasub_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1841,8 +1841,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vasub.nxv8i32.i32(
 define <vscale x 8 x i32> @intrinsic_vasub_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1865,8 +1865,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vasub.mask.nxv8i32.i32(
 define <vscale x 8 x i32> @intrinsic_vasub_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1889,8 +1889,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vasub.nxv16i32.i32(
 define <vscale x 16 x i32> @intrinsic_vasub_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vasub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1913,8 +1913,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vasub.mask.nxv16i32.i32(
 define <vscale x 16 x i32> @intrinsic_vasub_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasub_mask_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vasub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1950,8 +1950,8 @@ define <vscale x 1 x i64> @intrinsic_vasub_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x
 ;
 ; RV64-LABEL: intrinsic_vasub_vx_nxv1i64_nxv1i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; RV64-NEXT:    vasub.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -1987,8 +1987,8 @@ define <vscale x 1 x i64> @intrinsic_vasub_mask_vx_nxv1i64_nxv1i64_i64(<vscale x
 ;
 ; RV64-LABEL: intrinsic_vasub_mask_vx_nxv1i64_nxv1i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 1
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
 ; RV64-NEXT:    vasub.vx v8, v9, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
@@ -2024,8 +2024,8 @@ define <vscale x 2 x i64> @intrinsic_vasub_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x
 ;
 ; RV64-LABEL: intrinsic_vasub_vx_nxv2i64_nxv2i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; RV64-NEXT:    vasub.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -2061,8 +2061,8 @@ define <vscale x 2 x i64> @intrinsic_vasub_mask_vx_nxv2i64_nxv2i64_i64(<vscale x
 ;
 ; RV64-LABEL: intrinsic_vasub_mask_vx_nxv2i64_nxv2i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 1
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
 ; RV64-NEXT:    vasub.vx v8, v10, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
@@ -2098,8 +2098,8 @@ define <vscale x 4 x i64> @intrinsic_vasub_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x
 ;
 ; RV64-LABEL: intrinsic_vasub_vx_nxv4i64_nxv4i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; RV64-NEXT:    vasub.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -2135,8 +2135,8 @@ define <vscale x 4 x i64> @intrinsic_vasub_mask_vx_nxv4i64_nxv4i64_i64(<vscale x
 ;
 ; RV64-LABEL: intrinsic_vasub_mask_vx_nxv4i64_nxv4i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 1
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
 ; RV64-NEXT:    vasub.vx v8, v12, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
@@ -2172,8 +2172,8 @@ define <vscale x 8 x i64> @intrinsic_vasub_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x
 ;
 ; RV64-LABEL: intrinsic_vasub_vx_nxv8i64_nxv8i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vasub.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -2209,8 +2209,8 @@ define <vscale x 8 x i64> @intrinsic_vasub_mask_vx_nxv8i64_nxv8i64_i64(<vscale x
 ;
 ; RV64-LABEL: intrinsic_vasub_mask_vx_nxv8i64_nxv8i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 1
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; RV64-NEXT:    vasub.vx v8, v16, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vasubu.ll b/llvm/test/CodeGen/RISCV/rvv/vasubu.ll
index 4228e067199f..6d790a9ce027 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vasubu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vasubu.ll
@@ -13,8 +13,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vasubu.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vasubu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -37,8 +37,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vasubu.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vasubu_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -61,8 +61,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vasubu.nxv2i8.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vasubu_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -85,8 +85,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vasubu.mask.nxv2i8.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vasubu_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -109,8 +109,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vasubu.nxv4i8.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vasubu_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -133,8 +133,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vasubu.mask.nxv4i8.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vasubu_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -157,8 +157,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vasubu.nxv8i8.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vasubu_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -181,8 +181,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vasubu.mask.nxv8i8.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vasubu_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -205,8 +205,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vasubu.nxv16i8.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vasubu_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -229,8 +229,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vasubu.mask.nxv16i8.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vasubu_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -253,8 +253,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vasubu.nxv32i8.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vasubu_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -277,8 +277,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vasubu.mask.nxv32i8.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vasubu_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -301,8 +301,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vasubu.nxv64i8.nxv64i8(
 define <vscale x 64 x i8> @intrinsic_vasubu_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -326,8 +326,8 @@ define <vscale x 64 x i8> @intrinsic_vasubu_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vsc
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -350,8 +350,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vasubu.nxv1i16.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vasubu_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -374,8 +374,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vasubu.mask.nxv1i16.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vasubu_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -398,8 +398,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vasubu.nxv2i16.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vasubu_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -422,8 +422,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vasubu.mask.nxv2i16.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vasubu_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -446,8 +446,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vasubu.nxv4i16.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vasubu_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -470,8 +470,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vasubu.mask.nxv4i16.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vasubu_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -494,8 +494,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vasubu.nxv8i16.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vasubu_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -518,8 +518,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vasubu.mask.nxv8i16.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vasubu_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -542,8 +542,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vasubu.nxv16i16.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vasubu_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -566,8 +566,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vasubu.mask.nxv16i16.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vasubu_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -590,8 +590,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vasubu.nxv32i16.nxv32i16(
 define <vscale x 32 x i16> @intrinsic_vasubu_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -615,8 +615,8 @@ define <vscale x 32 x i16> @intrinsic_vasubu_mask_vv_nxv32i16_nxv32i16_nxv32i16(
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -639,8 +639,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vasubu.nxv1i32.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vasubu_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -663,8 +663,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vasubu.mask.nxv1i32.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vasubu_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -687,8 +687,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vasubu.nxv2i32.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vasubu_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -711,8 +711,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vasubu.mask.nxv2i32.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vasubu_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -735,8 +735,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vasubu.nxv4i32.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vasubu_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -759,8 +759,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vasubu.mask.nxv4i32.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vasubu_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -783,8 +783,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vasubu.nxv8i32.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vasubu_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -807,8 +807,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vasubu.mask.nxv8i32.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vasubu_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -831,8 +831,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vasubu.nxv16i32.nxv16i32(
 define <vscale x 16 x i32> @intrinsic_vasubu_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -856,8 +856,8 @@ define <vscale x 16 x i32> @intrinsic_vasubu_mask_vv_nxv16i32_nxv16i32_nxv16i32(
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -880,8 +880,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vasubu.nxv1i64.nxv1i64(
 define <vscale x 1 x i64> @intrinsic_vasubu_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -904,8 +904,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vasubu.mask.nxv1i64.nxv1i64(
 define <vscale x 1 x i64> @intrinsic_vasubu_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -928,8 +928,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vasubu.nxv2i64.nxv2i64(
 define <vscale x 2 x i64> @intrinsic_vasubu_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -952,8 +952,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vasubu.mask.nxv2i64.nxv2i64(
 define <vscale x 2 x i64> @intrinsic_vasubu_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -976,8 +976,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vasubu.nxv4i64.nxv4i64(
 define <vscale x 4 x i64> @intrinsic_vasubu_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -1000,8 +1000,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vasubu.mask.nxv4i64.nxv4i64(
 define <vscale x 4 x i64> @intrinsic_vasubu_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1024,8 +1024,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vasubu.nxv8i64.nxv8i64(
 define <vscale x 8 x i64> @intrinsic_vasubu_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vasubu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -1049,8 +1049,8 @@ define <vscale x 8 x i64> @intrinsic_vasubu_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vsc
 ; CHECK-LABEL: intrinsic_vasubu_mask_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    vasubu.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1073,8 +1073,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vasubu.nxv1i8.i8(
 define <vscale x 1 x i8> @intrinsic_vasubu_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1097,8 +1097,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vasubu.mask.nxv1i8.i8(
 define <vscale x 1 x i8> @intrinsic_vasubu_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1121,8 +1121,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vasubu.nxv2i8.i8(
 define <vscale x 2 x i8> @intrinsic_vasubu_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1145,8 +1145,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vasubu.mask.nxv2i8.i8(
 define <vscale x 2 x i8> @intrinsic_vasubu_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1169,8 +1169,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vasubu.nxv4i8.i8(
 define <vscale x 4 x i8> @intrinsic_vasubu_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1193,8 +1193,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vasubu.mask.nxv4i8.i8(
 define <vscale x 4 x i8> @intrinsic_vasubu_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1217,8 +1217,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vasubu.nxv8i8.i8(
 define <vscale x 8 x i8> @intrinsic_vasubu_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1241,8 +1241,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vasubu.mask.nxv8i8.i8(
 define <vscale x 8 x i8> @intrinsic_vasubu_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1265,8 +1265,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vasubu.nxv16i8.i8(
 define <vscale x 16 x i8> @intrinsic_vasubu_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1289,8 +1289,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vasubu.mask.nxv16i8.i8(
 define <vscale x 16 x i8> @intrinsic_vasubu_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1313,8 +1313,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vasubu.nxv32i8.i8(
 define <vscale x 32 x i8> @intrinsic_vasubu_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1337,8 +1337,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vasubu.mask.nxv32i8.i8(
 define <vscale x 32 x i8> @intrinsic_vasubu_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1361,8 +1361,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vasubu.nxv64i8.i8(
 define <vscale x 64 x i8> @intrinsic_vasubu_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1385,8 +1385,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vasubu.mask.nxv64i8.i8(
 define <vscale x 64 x i8> @intrinsic_vasubu_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1409,8 +1409,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vasubu.nxv1i16.i16(
 define <vscale x 1 x i16> @intrinsic_vasubu_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1433,8 +1433,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vasubu.mask.nxv1i16.i16(
 define <vscale x 1 x i16> @intrinsic_vasubu_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1457,8 +1457,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vasubu.nxv2i16.i16(
 define <vscale x 2 x i16> @intrinsic_vasubu_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1481,8 +1481,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vasubu.mask.nxv2i16.i16(
 define <vscale x 2 x i16> @intrinsic_vasubu_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1505,8 +1505,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vasubu.nxv4i16.i16(
 define <vscale x 4 x i16> @intrinsic_vasubu_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1529,8 +1529,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vasubu.mask.nxv4i16.i16(
 define <vscale x 4 x i16> @intrinsic_vasubu_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1553,8 +1553,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vasubu.nxv8i16.i16(
 define <vscale x 8 x i16> @intrinsic_vasubu_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1577,8 +1577,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vasubu.mask.nxv8i16.i16(
 define <vscale x 8 x i16> @intrinsic_vasubu_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1601,8 +1601,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vasubu.nxv16i16.i16(
 define <vscale x 16 x i16> @intrinsic_vasubu_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1625,8 +1625,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vasubu.mask.nxv16i16.i16(
 define <vscale x 16 x i16> @intrinsic_vasubu_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1649,8 +1649,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vasubu.nxv32i16.i16(
 define <vscale x 32 x i16> @intrinsic_vasubu_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1673,8 +1673,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vasubu.mask.nxv32i16.i16(
 define <vscale x 32 x i16> @intrinsic_vasubu_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1697,8 +1697,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vasubu.nxv1i32.i32(
 define <vscale x 1 x i32> @intrinsic_vasubu_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1721,8 +1721,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vasubu.mask.nxv1i32.i32(
 define <vscale x 1 x i32> @intrinsic_vasubu_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1745,8 +1745,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vasubu.nxv2i32.i32(
 define <vscale x 2 x i32> @intrinsic_vasubu_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1769,8 +1769,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vasubu.mask.nxv2i32.i32(
 define <vscale x 2 x i32> @intrinsic_vasubu_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1793,8 +1793,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vasubu.nxv4i32.i32(
 define <vscale x 4 x i32> @intrinsic_vasubu_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1817,8 +1817,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vasubu.mask.nxv4i32.i32(
 define <vscale x 4 x i32> @intrinsic_vasubu_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1841,8 +1841,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vasubu.nxv8i32.i32(
 define <vscale x 8 x i32> @intrinsic_vasubu_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1865,8 +1865,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vasubu.mask.nxv8i32.i32(
 define <vscale x 8 x i32> @intrinsic_vasubu_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1889,8 +1889,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vasubu.nxv16i32.i32(
 define <vscale x 16 x i32> @intrinsic_vasubu_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vasubu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1913,8 +1913,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vasubu.mask.nxv16i32.i32(
 define <vscale x 16 x i32> @intrinsic_vasubu_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vasubu_mask_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 1
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vasubu.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1950,8 +1950,8 @@ define <vscale x 1 x i64> @intrinsic_vasubu_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x
 ;
 ; RV64-LABEL: intrinsic_vasubu_vx_nxv1i64_nxv1i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; RV64-NEXT:    vasubu.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -1987,8 +1987,8 @@ define <vscale x 1 x i64> @intrinsic_vasubu_mask_vx_nxv1i64_nxv1i64_i64(<vscale
 ;
 ; RV64-LABEL: intrinsic_vasubu_mask_vx_nxv1i64_nxv1i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 1
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
 ; RV64-NEXT:    vasubu.vx v8, v9, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
@@ -2024,8 +2024,8 @@ define <vscale x 2 x i64> @intrinsic_vasubu_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x
 ;
 ; RV64-LABEL: intrinsic_vasubu_vx_nxv2i64_nxv2i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; RV64-NEXT:    vasubu.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -2061,8 +2061,8 @@ define <vscale x 2 x i64> @intrinsic_vasubu_mask_vx_nxv2i64_nxv2i64_i64(<vscale
 ;
 ; RV64-LABEL: intrinsic_vasubu_mask_vx_nxv2i64_nxv2i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 1
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
 ; RV64-NEXT:    vasubu.vx v8, v10, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
@@ -2098,8 +2098,8 @@ define <vscale x 4 x i64> @intrinsic_vasubu_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x
 ;
 ; RV64-LABEL: intrinsic_vasubu_vx_nxv4i64_nxv4i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; RV64-NEXT:    vasubu.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -2135,8 +2135,8 @@ define <vscale x 4 x i64> @intrinsic_vasubu_mask_vx_nxv4i64_nxv4i64_i64(<vscale
 ;
 ; RV64-LABEL: intrinsic_vasubu_mask_vx_nxv4i64_nxv4i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 1
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
 ; RV64-NEXT:    vasubu.vx v8, v12, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
@@ -2172,8 +2172,8 @@ define <vscale x 8 x i64> @intrinsic_vasubu_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x
 ;
 ; RV64-LABEL: intrinsic_vasubu_vx_nxv8i64_nxv8i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vasubu.vx v8, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -2209,8 +2209,8 @@ define <vscale x 8 x i64> @intrinsic_vasubu_mask_vx_nxv8i64_nxv8i64_i64(<vscale
 ;
 ; RV64-LABEL: intrinsic_vasubu_mask_vx_nxv8i64_nxv8i64_i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; RV64-NEXT:    csrwi vxrm, 1
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; RV64-NEXT:    vasubu.vx v8, v16, a0, v0.t
 ; RV64-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd.ll
index 6816307d1096..ae7d7d5d1962 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd.ll
@@ -23,10 +23,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfadd.nxv1f16.nxv1f16(
 define <vscale x 1 x half> @intrinsic_vfadd_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfadd.nxv1f16.nxv1f16(
@@ -48,10 +48,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfadd.mask.nxv1f16.nxv1f16(
 define <vscale x 1 x half> @intrinsic_vfadd_mask_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfadd.mask.nxv1f16.nxv1f16(
@@ -73,10 +73,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfadd.nxv2f16.nxv2f16(
 define <vscale x 2 x half> @intrinsic_vfadd_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfadd.nxv2f16.nxv2f16(
@@ -98,10 +98,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfadd.mask.nxv2f16.nxv2f16(
 define <vscale x 2 x half> @intrinsic_vfadd_mask_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfadd.mask.nxv2f16.nxv2f16(
@@ -123,10 +123,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfadd.nxv4f16.nxv4f16(
 define <vscale x 4 x half> @intrinsic_vfadd_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfadd.nxv4f16.nxv4f16(
@@ -148,10 +148,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfadd.mask.nxv4f16.nxv4f16(
 define <vscale x 4 x half> @intrinsic_vfadd_mask_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfadd.mask.nxv4f16.nxv4f16(
@@ -173,10 +173,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfadd.nxv8f16.nxv8f16(
 define <vscale x 8 x half> @intrinsic_vfadd_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfadd.nxv8f16.nxv8f16(
@@ -198,10 +198,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfadd.mask.nxv8f16.nxv8f16(
 define <vscale x 8 x half> @intrinsic_vfadd_mask_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfadd.mask.nxv8f16.nxv8f16(
@@ -223,10 +223,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfadd.nxv16f16.nxv16f16(
 define <vscale x 16 x half> @intrinsic_vfadd_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfadd.nxv16f16.nxv16f16(
@@ -248,10 +248,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfadd.mask.nxv16f16.nxv16f16(
 define <vscale x 16 x half> @intrinsic_vfadd_mask_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfadd.mask.nxv16f16.nxv16f16(
@@ -273,10 +273,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfadd.nxv32f16.nxv32f16(
 define <vscale x 32 x half> @intrinsic_vfadd_vv_nxv32f16_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv32f16_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfadd.nxv32f16.nxv32f16(
@@ -299,8 +299,8 @@ define <vscale x 32 x half> @intrinsic_vfadd_mask_vv_nxv32f16_nxv32f16_nxv32f16(
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv32f16_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vfadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -324,10 +324,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfadd.nxv1f32.nxv1f32(
 define <vscale x 1 x float> @intrinsic_vfadd_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfadd.nxv1f32.nxv1f32(
@@ -349,10 +349,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfadd.mask.nxv1f32.nxv1f32(
 define <vscale x 1 x float> @intrinsic_vfadd_mask_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfadd.mask.nxv1f32.nxv1f32(
@@ -374,10 +374,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfadd.nxv2f32.nxv2f32(
 define <vscale x 2 x float> @intrinsic_vfadd_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfadd.nxv2f32.nxv2f32(
@@ -399,10 +399,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfadd.mask.nxv2f32.nxv2f32(
 define <vscale x 2 x float> @intrinsic_vfadd_mask_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfadd.mask.nxv2f32.nxv2f32(
@@ -424,10 +424,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(
 define <vscale x 4 x float> @intrinsic_vfadd_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(
@@ -449,10 +449,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfadd.mask.nxv4f32.nxv4f32(
 define <vscale x 4 x float> @intrinsic_vfadd_mask_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfadd.mask.nxv4f32.nxv4f32(
@@ -474,10 +474,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfadd.nxv8f32.nxv8f32(
 define <vscale x 8 x float> @intrinsic_vfadd_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfadd.nxv8f32.nxv8f32(
@@ -499,10 +499,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfadd.mask.nxv8f32.nxv8f32(
 define <vscale x 8 x float> @intrinsic_vfadd_mask_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfadd.mask.nxv8f32.nxv8f32(
@@ -524,10 +524,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfadd.nxv16f32.nxv16f32(
 define <vscale x 16 x float> @intrinsic_vfadd_vv_nxv16f32_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv16f32_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfadd.nxv16f32.nxv16f32(
@@ -550,8 +550,8 @@ define <vscale x 16 x float> @intrinsic_vfadd_mask_vv_nxv16f32_nxv16f32_nxv16f32
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv16f32_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vfadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -575,10 +575,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfadd.nxv1f64.nxv1f64(
 define <vscale x 1 x double> @intrinsic_vfadd_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfadd.nxv1f64.nxv1f64(
@@ -600,10 +600,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfadd.mask.nxv1f64.nxv1f64(
 define <vscale x 1 x double> @intrinsic_vfadd_mask_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfadd.mask.nxv1f64.nxv1f64(
@@ -625,10 +625,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfadd.nxv2f64.nxv2f64(
 define <vscale x 2 x double> @intrinsic_vfadd_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfadd.nxv2f64.nxv2f64(
@@ -650,10 +650,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfadd.mask.nxv2f64.nxv2f64(
 define <vscale x 2 x double> @intrinsic_vfadd_mask_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfadd.mask.nxv2f64.nxv2f64(
@@ -675,10 +675,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfadd.nxv4f64.nxv4f64(
 define <vscale x 4 x double> @intrinsic_vfadd_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfadd.nxv4f64.nxv4f64(
@@ -700,10 +700,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfadd.mask.nxv4f64.nxv4f64(
 define <vscale x 4 x double> @intrinsic_vfadd_mask_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfadd.mask.nxv4f64.nxv4f64(
@@ -725,10 +725,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfadd.nxv8f64.nxv8f64(
 define <vscale x 8 x double> @intrinsic_vfadd_vv_nxv8f64_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vv_nxv8f64_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vv v8, v8, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfadd.nxv8f64.nxv8f64(
@@ -751,8 +751,8 @@ define <vscale x 8 x double> @intrinsic_vfadd_mask_vv_nxv8f64_nxv8f64_nxv8f64(<v
 ; CHECK-LABEL: intrinsic_vfadd_mask_vv_nxv8f64_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    vfadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -776,10 +776,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfadd.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfadd_vf_nxv1f16_nxv1f16_f16(<vscale x 1 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv1f16_nxv1f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfadd.nxv1f16.f16(
@@ -801,10 +801,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfadd.mask.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfadd_mask_vf_nxv1f16_nxv1f16_f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, half %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv1f16_nxv1f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfadd.mask.nxv1f16.f16(
@@ -826,10 +826,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfadd.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfadd_vf_nxv2f16_nxv2f16_f16(<vscale x 2 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv2f16_nxv2f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfadd.nxv2f16.f16(
@@ -851,10 +851,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfadd.mask.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfadd_mask_vf_nxv2f16_nxv2f16_f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, half %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv2f16_nxv2f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfadd.mask.nxv2f16.f16(
@@ -876,10 +876,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfadd.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfadd_vf_nxv4f16_nxv4f16_f16(<vscale x 4 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv4f16_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfadd.nxv4f16.f16(
@@ -901,10 +901,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfadd.mask.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfadd_mask_vf_nxv4f16_nxv4f16_f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, half %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv4f16_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfadd.mask.nxv4f16.f16(
@@ -926,10 +926,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfadd.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfadd_vf_nxv8f16_nxv8f16_f16(<vscale x 8 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv8f16_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfadd.nxv8f16.f16(
@@ -951,10 +951,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfadd.mask.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfadd_mask_vf_nxv8f16_nxv8f16_f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, half %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv8f16_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfadd.mask.nxv8f16.f16(
@@ -976,10 +976,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfadd.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfadd_vf_nxv16f16_nxv16f16_f16(<vscale x 16 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv16f16_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfadd.nxv16f16.f16(
@@ -1001,10 +1001,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfadd.mask.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfadd_mask_vf_nxv16f16_nxv16f16_f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, half %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv16f16_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfadd.mask.nxv16f16.f16(
@@ -1026,10 +1026,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfadd.nxv32f16.f16(
 define <vscale x 32 x half> @intrinsic_vfadd_vf_nxv32f16_nxv32f16_f16(<vscale x 32 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv32f16_nxv32f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfadd.nxv32f16.f16(
@@ -1051,10 +1051,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfadd.mask.nxv32f16.f16(
 define <vscale x 32 x half> @intrinsic_vfadd_mask_vf_nxv32f16_nxv32f16_f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, half %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv32f16_nxv32f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfadd.mask.nxv32f16.f16(
@@ -1076,10 +1076,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfadd.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfadd_vf_nxv1f32_nxv1f32_f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv1f32_nxv1f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfadd.nxv1f32.f32(
@@ -1101,10 +1101,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfadd.mask.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfadd_mask_vf_nxv1f32_nxv1f32_f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv1f32_nxv1f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfadd.mask.nxv1f32.f32(
@@ -1126,10 +1126,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfadd.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfadd_vf_nxv2f32_nxv2f32_f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv2f32_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfadd.nxv2f32.f32(
@@ -1151,10 +1151,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfadd.mask.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfadd_mask_vf_nxv2f32_nxv2f32_f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv2f32_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfadd.mask.nxv2f32.f32(
@@ -1176,10 +1176,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfadd_vf_nxv4f32_nxv4f32_f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv4f32_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.f32(
@@ -1201,10 +1201,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfadd.mask.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfadd_mask_vf_nxv4f32_nxv4f32_f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv4f32_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfadd.mask.nxv4f32.f32(
@@ -1226,10 +1226,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfadd.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfadd_vf_nxv8f32_nxv8f32_f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv8f32_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfadd.nxv8f32.f32(
@@ -1251,10 +1251,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfadd.mask.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfadd_mask_vf_nxv8f32_nxv8f32_f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv8f32_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfadd.mask.nxv8f32.f32(
@@ -1276,10 +1276,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfadd.nxv16f32.f32(
 define <vscale x 16 x float> @intrinsic_vfadd_vf_nxv16f32_nxv16f32_f32(<vscale x 16 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv16f32_nxv16f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfadd.nxv16f32.f32(
@@ -1301,10 +1301,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfadd.mask.nxv16f32.f32(
 define <vscale x 16 x float> @intrinsic_vfadd_mask_vf_nxv16f32_nxv16f32_f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, float %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv16f32_nxv16f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfadd.mask.nxv16f32.f32(
@@ -1326,10 +1326,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfadd.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfadd_vf_nxv1f64_nxv1f64_f64(<vscale x 1 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv1f64_nxv1f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfadd.nxv1f64.f64(
@@ -1351,10 +1351,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfadd.mask.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfadd_mask_vf_nxv1f64_nxv1f64_f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, double %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv1f64_nxv1f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfadd.mask.nxv1f64.f64(
@@ -1376,10 +1376,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfadd.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfadd_vf_nxv2f64_nxv2f64_f64(<vscale x 2 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv2f64_nxv2f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfadd.nxv2f64.f64(
@@ -1401,10 +1401,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfadd.mask.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfadd_mask_vf_nxv2f64_nxv2f64_f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, double %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv2f64_nxv2f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfadd.mask.nxv2f64.f64(
@@ -1426,10 +1426,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfadd.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfadd_vf_nxv4f64_nxv4f64_f64(<vscale x 4 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv4f64_nxv4f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfadd.nxv4f64.f64(
@@ -1451,10 +1451,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfadd.mask.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfadd_mask_vf_nxv4f64_nxv4f64_f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, double %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv4f64_nxv4f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfadd.mask.nxv4f64.f64(
@@ -1476,10 +1476,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfadd.nxv8f64.f64(
 define <vscale x 8 x double> @intrinsic_vfadd_vf_nxv8f64_nxv8f64_f64(<vscale x 8 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_vf_nxv8f64_nxv8f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfadd.nxv8f64.f64(
@@ -1501,10 +1501,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfadd.mask.nxv8f64.f64(
 define <vscale x 8 x double> @intrinsic_vfadd_mask_vf_nxv8f64_nxv8f64_f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, double %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfadd_mask_vf_nxv8f64_nxv8f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfadd.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfadd.mask.nxv8f64.f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-x.ll b/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-x.ll
index 626848839b07..bc8440920cd8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-x.ll
@@ -12,10 +12,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfcvt.f.x.v.nxv1f16.nxv1i16(
 define <vscale x 1 x half> @intrinsic_vfcvt_f.x.v_nxv1f16_nxv1i16(<vscale x 1 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv1f16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfcvt.f.x.v.nxv1f16.nxv1i16(
@@ -35,10 +35,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfcvt.f.x.v.mask.nxv1f16.nxv1i16(
 define <vscale x 1 x half> @intrinsic_vfcvt_mask_f.x.v_nxv1f16_nxv1i16(<vscale x 1 x half> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv1f16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfcvt.f.x.v.mask.nxv1f16.nxv1i16(
@@ -58,10 +58,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfcvt.f.x.v.nxv2f16.nxv2i16(
 define <vscale x 2 x half> @intrinsic_vfcvt_f.x.v_nxv2f16_nxv2i16(<vscale x 2 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv2f16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfcvt.f.x.v.nxv2f16.nxv2i16(
@@ -81,10 +81,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfcvt.f.x.v.mask.nxv2f16.nxv2i16(
 define <vscale x 2 x half> @intrinsic_vfcvt_mask_f.x.v_nxv2f16_nxv2i16(<vscale x 2 x half> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv2f16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfcvt.f.x.v.mask.nxv2f16.nxv2i16(
@@ -104,10 +104,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfcvt.f.x.v.nxv4f16.nxv4i16(
 define <vscale x 4 x half> @intrinsic_vfcvt_f.x.v_nxv4f16_nxv4i16(<vscale x 4 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv4f16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfcvt.f.x.v.nxv4f16.nxv4i16(
@@ -127,10 +127,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfcvt.f.x.v.mask.nxv4f16.nxv4i16(
 define <vscale x 4 x half> @intrinsic_vfcvt_mask_f.x.v_nxv4f16_nxv4i16(<vscale x 4 x half> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv4f16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfcvt.f.x.v.mask.nxv4f16.nxv4i16(
@@ -150,10 +150,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfcvt.f.x.v.nxv8f16.nxv8i16(
 define <vscale x 8 x half> @intrinsic_vfcvt_f.x.v_nxv8f16_nxv8i16(<vscale x 8 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv8f16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfcvt.f.x.v.nxv8f16.nxv8i16(
@@ -173,10 +173,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfcvt.f.x.v.mask.nxv8f16.nxv8i16(
 define <vscale x 8 x half> @intrinsic_vfcvt_mask_f.x.v_nxv8f16_nxv8i16(<vscale x 8 x half> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv8f16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfcvt.f.x.v.mask.nxv8f16.nxv8i16(
@@ -196,10 +196,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfcvt.f.x.v.nxv16f16.nxv16i16(
 define <vscale x 16 x half> @intrinsic_vfcvt_f.x.v_nxv16f16_nxv16i16(<vscale x 16 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv16f16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfcvt.f.x.v.nxv16f16.nxv16i16(
@@ -219,10 +219,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfcvt.f.x.v.mask.nxv16f16.nxv16i16(
 define <vscale x 16 x half> @intrinsic_vfcvt_mask_f.x.v_nxv16f16_nxv16i16(<vscale x 16 x half> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv16f16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfcvt.f.x.v.mask.nxv16f16.nxv16i16(
@@ -242,10 +242,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfcvt.f.x.v.nxv32f16.nxv32i16(
 define <vscale x 32 x half> @intrinsic_vfcvt_f.x.v_nxv32f16_nxv32i16(<vscale x 32 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv32f16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfcvt.f.x.v.nxv32f16.nxv32i16(
@@ -265,10 +265,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfcvt.f.x.v.mask.nxv32f16.nxv32i16(
 define <vscale x 32 x half> @intrinsic_vfcvt_mask_f.x.v_nxv32f16_nxv32i16(<vscale x 32 x half> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv32f16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfcvt.f.x.v.mask.nxv32f16.nxv32i16(
@@ -288,10 +288,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfcvt.f.x.v.nxv1f32.nxv1i32(
 define <vscale x 1 x float> @intrinsic_vfcvt_f.x.v_nxv1f32_nxv1i32(<vscale x 1 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv1f32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfcvt.f.x.v.nxv1f32.nxv1i32(
@@ -311,10 +311,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfcvt.f.x.v.mask.nxv1f32.nxv1i32(
 define <vscale x 1 x float> @intrinsic_vfcvt_mask_f.x.v_nxv1f32_nxv1i32(<vscale x 1 x float> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv1f32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfcvt.f.x.v.mask.nxv1f32.nxv1i32(
@@ -334,10 +334,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfcvt.f.x.v.nxv2f32.nxv2i32(
 define <vscale x 2 x float> @intrinsic_vfcvt_f.x.v_nxv2f32_nxv2i32(<vscale x 2 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv2f32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfcvt.f.x.v.nxv2f32.nxv2i32(
@@ -357,10 +357,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfcvt.f.x.v.mask.nxv2f32.nxv2i32(
 define <vscale x 2 x float> @intrinsic_vfcvt_mask_f.x.v_nxv2f32_nxv2i32(<vscale x 2 x float> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv2f32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfcvt.f.x.v.mask.nxv2f32.nxv2i32(
@@ -380,10 +380,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfcvt.f.x.v.nxv4f32.nxv4i32(
 define <vscale x 4 x float> @intrinsic_vfcvt_f.x.v_nxv4f32_nxv4i32(<vscale x 4 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv4f32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfcvt.f.x.v.nxv4f32.nxv4i32(
@@ -403,10 +403,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfcvt.f.x.v.mask.nxv4f32.nxv4i32(
 define <vscale x 4 x float> @intrinsic_vfcvt_mask_f.x.v_nxv4f32_nxv4i32(<vscale x 4 x float> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv4f32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfcvt.f.x.v.mask.nxv4f32.nxv4i32(
@@ -426,10 +426,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfcvt.f.x.v.nxv8f32.nxv8i32(
 define <vscale x 8 x float> @intrinsic_vfcvt_f.x.v_nxv8f32_nxv8i32(<vscale x 8 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv8f32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfcvt.f.x.v.nxv8f32.nxv8i32(
@@ -449,10 +449,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfcvt.f.x.v.mask.nxv8f32.nxv8i32(
 define <vscale x 8 x float> @intrinsic_vfcvt_mask_f.x.v_nxv8f32_nxv8i32(<vscale x 8 x float> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv8f32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfcvt.f.x.v.mask.nxv8f32.nxv8i32(
@@ -472,10 +472,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfcvt.f.x.v.nxv16f32.nxv16i32(
 define <vscale x 16 x float> @intrinsic_vfcvt_f.x.v_nxv16f32_nxv16i32(<vscale x 16 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv16f32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfcvt.f.x.v.nxv16f32.nxv16i32(
@@ -495,10 +495,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfcvt.f.x.v.mask.nxv16f32.nxv16i32(
 define <vscale x 16 x float> @intrinsic_vfcvt_mask_f.x.v_nxv16f32_nxv16i32(<vscale x 16 x float> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv16f32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfcvt.f.x.v.mask.nxv16f32.nxv16i32(
@@ -518,10 +518,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfcvt.f.x.v.nxv1f64.nxv1i64(
 define <vscale x 1 x double> @intrinsic_vfcvt_f.x.v_nxv1f64_nxv1i64(<vscale x 1 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv1f64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfcvt.f.x.v.nxv1f64.nxv1i64(
@@ -541,10 +541,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfcvt.f.x.v.mask.nxv1f64.nxv1i64(
 define <vscale x 1 x double> @intrinsic_vfcvt_mask_f.x.v_nxv1f64_nxv1i64(<vscale x 1 x double> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv1f64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfcvt.f.x.v.mask.nxv1f64.nxv1i64(
@@ -564,10 +564,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfcvt.f.x.v.nxv2f64.nxv2i64(
 define <vscale x 2 x double> @intrinsic_vfcvt_f.x.v_nxv2f64_nxv2i64(<vscale x 2 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv2f64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfcvt.f.x.v.nxv2f64.nxv2i64(
@@ -587,10 +587,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfcvt.f.x.v.mask.nxv2f64.nxv2i64(
 define <vscale x 2 x double> @intrinsic_vfcvt_mask_f.x.v_nxv2f64_nxv2i64(<vscale x 2 x double> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv2f64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfcvt.f.x.v.mask.nxv2f64.nxv2i64(
@@ -610,10 +610,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfcvt.f.x.v.nxv4f64.nxv4i64(
 define <vscale x 4 x double> @intrinsic_vfcvt_f.x.v_nxv4f64_nxv4i64(<vscale x 4 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv4f64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfcvt.f.x.v.nxv4f64.nxv4i64(
@@ -633,10 +633,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfcvt.f.x.v.mask.nxv4f64.nxv4i64(
 define <vscale x 4 x double> @intrinsic_vfcvt_mask_f.x.v_nxv4f64_nxv4i64(<vscale x 4 x double> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv4f64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfcvt.f.x.v.mask.nxv4f64.nxv4i64(
@@ -656,10 +656,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfcvt.f.x.v.nxv8f64.nxv8i64(
 define <vscale x 8 x double> @intrinsic_vfcvt_f.x.v_nxv8f64_nxv8i64(<vscale x 8 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.x.v_nxv8f64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfcvt.f.x.v.nxv8f64.nxv8i64(
@@ -679,10 +679,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfcvt.f.x.v.mask.nxv8f64.nxv8i64(
 define <vscale x 8 x double> @intrinsic_vfcvt_mask_f.x.v_nxv8f64_nxv8i64(<vscale x 8 x double> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.x.v_nxv8f64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfcvt.f.x.v.mask.nxv8f64.nxv8i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-xu.ll b/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-xu.ll
index 9109df44ec7f..9cf47f993ee4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-xu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-xu.ll
@@ -12,10 +12,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfcvt.f.xu.v.nxv1f16.nxv1i16(
 define <vscale x 1 x half> @intrinsic_vfcvt_f.xu.v_nxv1f16_nxv1i16(<vscale x 1 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv1f16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfcvt.f.xu.v.nxv1f16.nxv1i16(
@@ -35,10 +35,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfcvt.f.xu.v.mask.nxv1f16.nxv1i16(
 define <vscale x 1 x half> @intrinsic_vfcvt_mask_f.xu.v_nxv1f16_nxv1i16(<vscale x 1 x half> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv1f16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfcvt.f.xu.v.mask.nxv1f16.nxv1i16(
@@ -58,10 +58,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfcvt.f.xu.v.nxv2f16.nxv2i16(
 define <vscale x 2 x half> @intrinsic_vfcvt_f.xu.v_nxv2f16_nxv2i16(<vscale x 2 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv2f16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfcvt.f.xu.v.nxv2f16.nxv2i16(
@@ -81,10 +81,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfcvt.f.xu.v.mask.nxv2f16.nxv2i16(
 define <vscale x 2 x half> @intrinsic_vfcvt_mask_f.xu.v_nxv2f16_nxv2i16(<vscale x 2 x half> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv2f16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfcvt.f.xu.v.mask.nxv2f16.nxv2i16(
@@ -104,10 +104,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfcvt.f.xu.v.nxv4f16.nxv4i16(
 define <vscale x 4 x half> @intrinsic_vfcvt_f.xu.v_nxv4f16_nxv4i16(<vscale x 4 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv4f16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfcvt.f.xu.v.nxv4f16.nxv4i16(
@@ -127,10 +127,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfcvt.f.xu.v.mask.nxv4f16.nxv4i16(
 define <vscale x 4 x half> @intrinsic_vfcvt_mask_f.xu.v_nxv4f16_nxv4i16(<vscale x 4 x half> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv4f16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfcvt.f.xu.v.mask.nxv4f16.nxv4i16(
@@ -150,10 +150,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfcvt.f.xu.v.nxv8f16.nxv8i16(
 define <vscale x 8 x half> @intrinsic_vfcvt_f.xu.v_nxv8f16_nxv8i16(<vscale x 8 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv8f16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfcvt.f.xu.v.nxv8f16.nxv8i16(
@@ -173,10 +173,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfcvt.f.xu.v.mask.nxv8f16.nxv8i16(
 define <vscale x 8 x half> @intrinsic_vfcvt_mask_f.xu.v_nxv8f16_nxv8i16(<vscale x 8 x half> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv8f16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfcvt.f.xu.v.mask.nxv8f16.nxv8i16(
@@ -196,10 +196,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfcvt.f.xu.v.nxv16f16.nxv16i16(
 define <vscale x 16 x half> @intrinsic_vfcvt_f.xu.v_nxv16f16_nxv16i16(<vscale x 16 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv16f16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfcvt.f.xu.v.nxv16f16.nxv16i16(
@@ -219,10 +219,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfcvt.f.xu.v.mask.nxv16f16.nxv16i16(
 define <vscale x 16 x half> @intrinsic_vfcvt_mask_f.xu.v_nxv16f16_nxv16i16(<vscale x 16 x half> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv16f16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfcvt.f.xu.v.mask.nxv16f16.nxv16i16(
@@ -242,10 +242,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfcvt.f.xu.v.nxv32f16.nxv32i16(
 define <vscale x 32 x half> @intrinsic_vfcvt_f.xu.v_nxv32f16_nxv32i16(<vscale x 32 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv32f16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfcvt.f.xu.v.nxv32f16.nxv32i16(
@@ -265,10 +265,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfcvt.f.xu.v.mask.nxv32f16.nxv32i16(
 define <vscale x 32 x half> @intrinsic_vfcvt_mask_f.xu.v_nxv32f16_nxv32i16(<vscale x 32 x half> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv32f16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfcvt.f.xu.v.mask.nxv32f16.nxv32i16(
@@ -288,10 +288,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfcvt.f.xu.v.nxv1f32.nxv1i32(
 define <vscale x 1 x float> @intrinsic_vfcvt_f.xu.v_nxv1f32_nxv1i32(<vscale x 1 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv1f32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfcvt.f.xu.v.nxv1f32.nxv1i32(
@@ -311,10 +311,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfcvt.f.xu.v.mask.nxv1f32.nxv1i32(
 define <vscale x 1 x float> @intrinsic_vfcvt_mask_f.xu.v_nxv1f32_nxv1i32(<vscale x 1 x float> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv1f32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfcvt.f.xu.v.mask.nxv1f32.nxv1i32(
@@ -334,10 +334,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfcvt.f.xu.v.nxv2f32.nxv2i32(
 define <vscale x 2 x float> @intrinsic_vfcvt_f.xu.v_nxv2f32_nxv2i32(<vscale x 2 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv2f32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfcvt.f.xu.v.nxv2f32.nxv2i32(
@@ -357,10 +357,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfcvt.f.xu.v.mask.nxv2f32.nxv2i32(
 define <vscale x 2 x float> @intrinsic_vfcvt_mask_f.xu.v_nxv2f32_nxv2i32(<vscale x 2 x float> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv2f32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfcvt.f.xu.v.mask.nxv2f32.nxv2i32(
@@ -380,10 +380,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfcvt.f.xu.v.nxv4f32.nxv4i32(
 define <vscale x 4 x float> @intrinsic_vfcvt_f.xu.v_nxv4f32_nxv4i32(<vscale x 4 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv4f32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfcvt.f.xu.v.nxv4f32.nxv4i32(
@@ -403,10 +403,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfcvt.f.xu.v.mask.nxv4f32.nxv4i32(
 define <vscale x 4 x float> @intrinsic_vfcvt_mask_f.xu.v_nxv4f32_nxv4i32(<vscale x 4 x float> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv4f32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfcvt.f.xu.v.mask.nxv4f32.nxv4i32(
@@ -426,10 +426,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfcvt.f.xu.v.nxv8f32.nxv8i32(
 define <vscale x 8 x float> @intrinsic_vfcvt_f.xu.v_nxv8f32_nxv8i32(<vscale x 8 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv8f32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfcvt.f.xu.v.nxv8f32.nxv8i32(
@@ -449,10 +449,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfcvt.f.xu.v.mask.nxv8f32.nxv8i32(
 define <vscale x 8 x float> @intrinsic_vfcvt_mask_f.xu.v_nxv8f32_nxv8i32(<vscale x 8 x float> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv8f32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfcvt.f.xu.v.mask.nxv8f32.nxv8i32(
@@ -472,10 +472,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfcvt.f.xu.v.nxv16f32.nxv16i32(
 define <vscale x 16 x float> @intrinsic_vfcvt_f.xu.v_nxv16f32_nxv16i32(<vscale x 16 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv16f32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfcvt.f.xu.v.nxv16f32.nxv16i32(
@@ -495,10 +495,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfcvt.f.xu.v.mask.nxv16f32.nxv16i32(
 define <vscale x 16 x float> @intrinsic_vfcvt_mask_f.xu.v_nxv16f32_nxv16i32(<vscale x 16 x float> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv16f32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfcvt.f.xu.v.mask.nxv16f32.nxv16i32(
@@ -518,10 +518,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfcvt.f.xu.v.nxv1f64.nxv1i64(
 define <vscale x 1 x double> @intrinsic_vfcvt_f.xu.v_nxv1f64_nxv1i64(<vscale x 1 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv1f64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfcvt.f.xu.v.nxv1f64.nxv1i64(
@@ -541,10 +541,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfcvt.f.xu.v.mask.nxv1f64.nxv1i64(
 define <vscale x 1 x double> @intrinsic_vfcvt_mask_f.xu.v_nxv1f64_nxv1i64(<vscale x 1 x double> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv1f64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfcvt.f.xu.v.mask.nxv1f64.nxv1i64(
@@ -564,10 +564,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfcvt.f.xu.v.nxv2f64.nxv2i64(
 define <vscale x 2 x double> @intrinsic_vfcvt_f.xu.v_nxv2f64_nxv2i64(<vscale x 2 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv2f64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfcvt.f.xu.v.nxv2f64.nxv2i64(
@@ -587,10 +587,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfcvt.f.xu.v.mask.nxv2f64.nxv2i64(
 define <vscale x 2 x double> @intrinsic_vfcvt_mask_f.xu.v_nxv2f64_nxv2i64(<vscale x 2 x double> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv2f64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfcvt.f.xu.v.mask.nxv2f64.nxv2i64(
@@ -610,10 +610,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfcvt.f.xu.v.nxv4f64.nxv4i64(
 define <vscale x 4 x double> @intrinsic_vfcvt_f.xu.v_nxv4f64_nxv4i64(<vscale x 4 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv4f64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfcvt.f.xu.v.nxv4f64.nxv4i64(
@@ -633,10 +633,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfcvt.f.xu.v.mask.nxv4f64.nxv4i64(
 define <vscale x 4 x double> @intrinsic_vfcvt_mask_f.xu.v_nxv4f64_nxv4i64(<vscale x 4 x double> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv4f64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfcvt.f.xu.v.mask.nxv4f64.nxv4i64(
@@ -656,10 +656,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfcvt.f.xu.v.nxv8f64.nxv8i64(
 define <vscale x 8 x double> @intrinsic_vfcvt_f.xu.v_nxv8f64_nxv8i64(<vscale x 8 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_f.xu.v_nxv8f64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfcvt.f.xu.v.nxv8f64.nxv8i64(
@@ -679,10 +679,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfcvt.f.xu.v.mask.nxv8f64.nxv8i64(
 define <vscale x 8 x double> @intrinsic_vfcvt_mask_f.xu.v_nxv8f64_nxv8i64(<vscale x 8 x double> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_f.xu.v_nxv8f64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfcvt.f.xu.v.mask.nxv8f64.nxv8i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcvt-x-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfcvt-x-f.ll
index 1147ec331b78..68a85530ea24 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfcvt-x-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfcvt-x-f.ll
@@ -12,10 +12,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vfcvt.x.f.v.nxv1i16.nxv1f16(
 define <vscale x 1 x i16> @intrinsic_vfcvt_x.f.v_nxv1i16_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv1i16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i16> @llvm.riscv.vfcvt.x.f.v.nxv1i16.nxv1f16(
@@ -35,10 +35,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vfcvt.x.f.v.mask.nxv1i16.nxv1f16(
 define <vscale x 1 x i16> @intrinsic_vfcvt_mask_x.f.v_nxv1i16_nxv1f16(<vscale x 1 x i16> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv1i16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i16> @llvm.riscv.vfcvt.x.f.v.mask.nxv1i16.nxv1f16(
@@ -58,10 +58,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vfcvt.x.f.v.nxv2i16.nxv2f16(
 define <vscale x 2 x i16> @intrinsic_vfcvt_x.f.v_nxv2i16_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv2i16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i16> @llvm.riscv.vfcvt.x.f.v.nxv2i16.nxv2f16(
@@ -81,10 +81,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vfcvt.x.f.v.mask.nxv2i16.nxv2f16(
 define <vscale x 2 x i16> @intrinsic_vfcvt_mask_x.f.v_nxv2i16_nxv2f16(<vscale x 2 x i16> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv2i16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i16> @llvm.riscv.vfcvt.x.f.v.mask.nxv2i16.nxv2f16(
@@ -104,10 +104,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vfcvt.x.f.v.nxv4i16.nxv4f16(
 define <vscale x 4 x i16> @intrinsic_vfcvt_x.f.v_nxv4i16_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv4i16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i16> @llvm.riscv.vfcvt.x.f.v.nxv4i16.nxv4f16(
@@ -127,10 +127,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vfcvt.x.f.v.mask.nxv4i16.nxv4f16(
 define <vscale x 4 x i16> @intrinsic_vfcvt_mask_x.f.v_nxv4i16_nxv4f16(<vscale x 4 x i16> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv4i16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i16> @llvm.riscv.vfcvt.x.f.v.mask.nxv4i16.nxv4f16(
@@ -150,10 +150,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vfcvt.x.f.v.nxv8i16.nxv8f16(
 define <vscale x 8 x i16> @intrinsic_vfcvt_x.f.v_nxv8i16_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv8i16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vfcvt.x.f.v.nxv8i16.nxv8f16(
@@ -173,10 +173,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vfcvt.x.f.v.mask.nxv8i16.nxv8f16(
 define <vscale x 8 x i16> @intrinsic_vfcvt_mask_x.f.v_nxv8i16_nxv8f16(<vscale x 8 x i16> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv8i16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vfcvt.x.f.v.mask.nxv8i16.nxv8f16(
@@ -196,10 +196,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vfcvt.x.f.v.nxv16i16.nxv16f16(
 define <vscale x 16 x i16> @intrinsic_vfcvt_x.f.v_nxv16i16_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv16i16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vfcvt.x.f.v.nxv16i16.nxv16f16(
@@ -219,10 +219,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vfcvt.x.f.v.mask.nxv16i16.nxv16f16(
 define <vscale x 16 x i16> @intrinsic_vfcvt_mask_x.f.v_nxv16i16_nxv16f16(<vscale x 16 x i16> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv16i16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vfcvt.x.f.v.mask.nxv16i16.nxv16f16(
@@ -242,10 +242,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vfcvt.x.f.v.nxv32i16.nxv32f16(
 define <vscale x 32 x i16> @intrinsic_vfcvt_x.f.v_nxv32i16_nxv32f16(<vscale x 32 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv32i16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vfcvt.x.f.v.nxv32i16.nxv32f16(
@@ -265,10 +265,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vfcvt.x.f.v.mask.nxv32i16.nxv32f16(
 define <vscale x 32 x i16> @intrinsic_vfcvt_mask_x.f.v_nxv32i16_nxv32f16(<vscale x 32 x i16> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv32i16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vfcvt.x.f.v.mask.nxv32i16.nxv32f16(
@@ -288,10 +288,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vfcvt.x.f.v.nxv1i32.nxv1f32(
 define <vscale x 1 x i32> @intrinsic_vfcvt_x.f.v_nxv1i32_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv1i32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i32> @llvm.riscv.vfcvt.x.f.v.nxv1i32.nxv1f32(
@@ -311,10 +311,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vfcvt.x.f.v.mask.nxv1i32.nxv1f32(
 define <vscale x 1 x i32> @intrinsic_vfcvt_mask_x.f.v_nxv1i32_nxv1f32(<vscale x 1 x i32> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv1i32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i32> @llvm.riscv.vfcvt.x.f.v.mask.nxv1i32.nxv1f32(
@@ -334,10 +334,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vfcvt.x.f.v.nxv2i32.nxv2f32(
 define <vscale x 2 x i32> @intrinsic_vfcvt_x.f.v_nxv2i32_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv2i32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i32> @llvm.riscv.vfcvt.x.f.v.nxv2i32.nxv2f32(
@@ -357,10 +357,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vfcvt.x.f.v.mask.nxv2i32.nxv2f32(
 define <vscale x 2 x i32> @intrinsic_vfcvt_mask_x.f.v_nxv2i32_nxv2f32(<vscale x 2 x i32> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv2i32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i32> @llvm.riscv.vfcvt.x.f.v.mask.nxv2i32.nxv2f32(
@@ -380,10 +380,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vfcvt.x.f.v.nxv4i32.nxv4f32(
 define <vscale x 4 x i32> @intrinsic_vfcvt_x.f.v_nxv4i32_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv4i32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vfcvt.x.f.v.nxv4i32.nxv4f32(
@@ -403,10 +403,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vfcvt.x.f.v.mask.nxv4i32.nxv4f32(
 define <vscale x 4 x i32> @intrinsic_vfcvt_mask_x.f.v_nxv4i32_nxv4f32(<vscale x 4 x i32> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv4i32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vfcvt.x.f.v.mask.nxv4i32.nxv4f32(
@@ -426,10 +426,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vfcvt.x.f.v.nxv8i32.nxv8f32(
 define <vscale x 8 x i32> @intrinsic_vfcvt_x.f.v_nxv8i32_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv8i32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vfcvt.x.f.v.nxv8i32.nxv8f32(
@@ -449,10 +449,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vfcvt.x.f.v.mask.nxv8i32.nxv8f32(
 define <vscale x 8 x i32> @intrinsic_vfcvt_mask_x.f.v_nxv8i32_nxv8f32(<vscale x 8 x i32> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv8i32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vfcvt.x.f.v.mask.nxv8i32.nxv8f32(
@@ -472,10 +472,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vfcvt.x.f.v.nxv16i32.nxv16f32(
 define <vscale x 16 x i32> @intrinsic_vfcvt_x.f.v_nxv16i32_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv16i32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vfcvt.x.f.v.nxv16i32.nxv16f32(
@@ -495,10 +495,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vfcvt.x.f.v.mask.nxv16i32.nxv16f32(
 define <vscale x 16 x i32> @intrinsic_vfcvt_mask_x.f.v_nxv16i32_nxv16f32(<vscale x 16 x i32> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv16i32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vfcvt.x.f.v.mask.nxv16i32.nxv16f32(
@@ -518,10 +518,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vfcvt.x.f.v.nxv1i64.nxv1f64(
 define <vscale x 1 x i64> @intrinsic_vfcvt_x.f.v_nxv1i64_nxv1f64(<vscale x 1 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv1i64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vfcvt.x.f.v.nxv1i64.nxv1f64(
@@ -541,10 +541,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vfcvt.x.f.v.mask.nxv1i64.nxv1f64(
 define <vscale x 1 x i64> @intrinsic_vfcvt_mask_x.f.v_nxv1i64_nxv1f64(<vscale x 1 x i64> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv1i64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vfcvt.x.f.v.mask.nxv1i64.nxv1f64(
@@ -564,10 +564,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vfcvt.x.f.v.nxv2i64.nxv2f64(
 define <vscale x 2 x i64> @intrinsic_vfcvt_x.f.v_nxv2i64_nxv2f64(<vscale x 2 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv2i64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vfcvt.x.f.v.nxv2i64.nxv2f64(
@@ -587,10 +587,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vfcvt.x.f.v.mask.nxv2i64.nxv2f64(
 define <vscale x 2 x i64> @intrinsic_vfcvt_mask_x.f.v_nxv2i64_nxv2f64(<vscale x 2 x i64> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv2i64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vfcvt.x.f.v.mask.nxv2i64.nxv2f64(
@@ -610,10 +610,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vfcvt.x.f.v.nxv4i64.nxv4f64(
 define <vscale x 4 x i64> @intrinsic_vfcvt_x.f.v_nxv4i64_nxv4f64(<vscale x 4 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv4i64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vfcvt.x.f.v.nxv4i64.nxv4f64(
@@ -633,10 +633,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vfcvt.x.f.v.mask.nxv4i64.nxv4f64(
 define <vscale x 4 x i64> @intrinsic_vfcvt_mask_x.f.v_nxv4i64_nxv4f64(<vscale x 4 x i64> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv4i64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vfcvt.x.f.v.mask.nxv4i64.nxv4f64(
@@ -656,10 +656,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vfcvt.x.f.v.nxv8i64.nxv8f64(
 define <vscale x 8 x i64> @intrinsic_vfcvt_x.f.v_nxv8i64_nxv8f64(<vscale x 8 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_x.f.v_nxv8i64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vfcvt.x.f.v.nxv8i64.nxv8f64(
@@ -679,10 +679,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vfcvt.x.f.v.mask.nxv8i64.nxv8f64(
 define <vscale x 8 x i64> @intrinsic_vfcvt_mask_x.f.v_nxv8i64_nxv8f64(<vscale x 8 x i64> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_x.f.v_nxv8i64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vfcvt.x.f.v.mask.nxv8i64.nxv8f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcvt-xu-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfcvt-xu-f.ll
index cd227196b4f4..93716ba7f451 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfcvt-xu-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfcvt-xu-f.ll
@@ -12,10 +12,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vfcvt.xu.f.v.nxv1i16.nxv1f16(
 define <vscale x 1 x i16> @intrinsic_vfcvt_xu.f.v_nxv1i16_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv1i16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i16> @llvm.riscv.vfcvt.xu.f.v.nxv1i16.nxv1f16(
@@ -35,10 +35,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vfcvt.xu.f.v.mask.nxv1i16.nxv1f16(
 define <vscale x 1 x i16> @intrinsic_vfcvt_mask_xu.f.v_nxv1i16_nxv1f16(<vscale x 1 x i16> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv1i16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i16> @llvm.riscv.vfcvt.xu.f.v.mask.nxv1i16.nxv1f16(
@@ -58,10 +58,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vfcvt.xu.f.v.nxv2i16.nxv2f16(
 define <vscale x 2 x i16> @intrinsic_vfcvt_xu.f.v_nxv2i16_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv2i16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i16> @llvm.riscv.vfcvt.xu.f.v.nxv2i16.nxv2f16(
@@ -81,10 +81,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vfcvt.xu.f.v.mask.nxv2i16.nxv2f16(
 define <vscale x 2 x i16> @intrinsic_vfcvt_mask_xu.f.v_nxv2i16_nxv2f16(<vscale x 2 x i16> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv2i16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i16> @llvm.riscv.vfcvt.xu.f.v.mask.nxv2i16.nxv2f16(
@@ -104,10 +104,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vfcvt.xu.f.v.nxv4i16.nxv4f16(
 define <vscale x 4 x i16> @intrinsic_vfcvt_xu.f.v_nxv4i16_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv4i16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i16> @llvm.riscv.vfcvt.xu.f.v.nxv4i16.nxv4f16(
@@ -127,10 +127,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vfcvt.xu.f.v.mask.nxv4i16.nxv4f16(
 define <vscale x 4 x i16> @intrinsic_vfcvt_mask_xu.f.v_nxv4i16_nxv4f16(<vscale x 4 x i16> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv4i16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i16> @llvm.riscv.vfcvt.xu.f.v.mask.nxv4i16.nxv4f16(
@@ -150,10 +150,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vfcvt.xu.f.v.nxv8i16.nxv8f16(
 define <vscale x 8 x i16> @intrinsic_vfcvt_xu.f.v_nxv8i16_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv8i16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vfcvt.xu.f.v.nxv8i16.nxv8f16(
@@ -173,10 +173,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vfcvt.xu.f.v.mask.nxv8i16.nxv8f16(
 define <vscale x 8 x i16> @intrinsic_vfcvt_mask_xu.f.v_nxv8i16_nxv8f16(<vscale x 8 x i16> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv8i16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vfcvt.xu.f.v.mask.nxv8i16.nxv8f16(
@@ -196,10 +196,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vfcvt.xu.f.v.nxv16i16.nxv16f16(
 define <vscale x 16 x i16> @intrinsic_vfcvt_xu.f.v_nxv16i16_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv16i16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vfcvt.xu.f.v.nxv16i16.nxv16f16(
@@ -219,10 +219,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vfcvt.xu.f.v.mask.nxv16i16.nxv16f16(
 define <vscale x 16 x i16> @intrinsic_vfcvt_mask_xu.f.v_nxv16i16_nxv16f16(<vscale x 16 x i16> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv16i16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vfcvt.xu.f.v.mask.nxv16i16.nxv16f16(
@@ -242,10 +242,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vfcvt.xu.f.v.nxv32i16.nxv32f16(
 define <vscale x 32 x i16> @intrinsic_vfcvt_xu.f.v_nxv32i16_nxv32f16(<vscale x 32 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv32i16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vfcvt.xu.f.v.nxv32i16.nxv32f16(
@@ -265,10 +265,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vfcvt.xu.f.v.mask.nxv32i16.nxv32f16(
 define <vscale x 32 x i16> @intrinsic_vfcvt_mask_xu.f.v_nxv32i16_nxv32f16(<vscale x 32 x i16> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv32i16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i16> @llvm.riscv.vfcvt.xu.f.v.mask.nxv32i16.nxv32f16(
@@ -288,10 +288,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vfcvt.xu.f.v.nxv1i32.nxv1f32(
 define <vscale x 1 x i32> @intrinsic_vfcvt_xu.f.v_nxv1i32_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv1i32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i32> @llvm.riscv.vfcvt.xu.f.v.nxv1i32.nxv1f32(
@@ -311,10 +311,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vfcvt.xu.f.v.mask.nxv1i32.nxv1f32(
 define <vscale x 1 x i32> @intrinsic_vfcvt_mask_xu.f.v_nxv1i32_nxv1f32(<vscale x 1 x i32> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv1i32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i32> @llvm.riscv.vfcvt.xu.f.v.mask.nxv1i32.nxv1f32(
@@ -334,10 +334,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vfcvt.xu.f.v.nxv2i32.nxv2f32(
 define <vscale x 2 x i32> @intrinsic_vfcvt_xu.f.v_nxv2i32_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv2i32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i32> @llvm.riscv.vfcvt.xu.f.v.nxv2i32.nxv2f32(
@@ -357,10 +357,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vfcvt.xu.f.v.mask.nxv2i32.nxv2f32(
 define <vscale x 2 x i32> @intrinsic_vfcvt_mask_xu.f.v_nxv2i32_nxv2f32(<vscale x 2 x i32> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv2i32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i32> @llvm.riscv.vfcvt.xu.f.v.mask.nxv2i32.nxv2f32(
@@ -380,10 +380,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vfcvt.xu.f.v.nxv4i32.nxv4f32(
 define <vscale x 4 x i32> @intrinsic_vfcvt_xu.f.v_nxv4i32_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv4i32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vfcvt.xu.f.v.nxv4i32.nxv4f32(
@@ -403,10 +403,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vfcvt.xu.f.v.mask.nxv4i32.nxv4f32(
 define <vscale x 4 x i32> @intrinsic_vfcvt_mask_xu.f.v_nxv4i32_nxv4f32(<vscale x 4 x i32> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv4i32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vfcvt.xu.f.v.mask.nxv4i32.nxv4f32(
@@ -426,10 +426,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vfcvt.xu.f.v.nxv8i32.nxv8f32(
 define <vscale x 8 x i32> @intrinsic_vfcvt_xu.f.v_nxv8i32_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv8i32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vfcvt.xu.f.v.nxv8i32.nxv8f32(
@@ -449,10 +449,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vfcvt.xu.f.v.mask.nxv8i32.nxv8f32(
 define <vscale x 8 x i32> @intrinsic_vfcvt_mask_xu.f.v_nxv8i32_nxv8f32(<vscale x 8 x i32> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv8i32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vfcvt.xu.f.v.mask.nxv8i32.nxv8f32(
@@ -472,10 +472,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vfcvt.xu.f.v.nxv16i32.nxv16f32(
 define <vscale x 16 x i32> @intrinsic_vfcvt_xu.f.v_nxv16i32_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv16i32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vfcvt.xu.f.v.nxv16i32.nxv16f32(
@@ -495,10 +495,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vfcvt.xu.f.v.mask.nxv16i32.nxv16f32(
 define <vscale x 16 x i32> @intrinsic_vfcvt_mask_xu.f.v_nxv16i32_nxv16f32(<vscale x 16 x i32> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv16i32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vfcvt.xu.f.v.mask.nxv16i32.nxv16f32(
@@ -518,10 +518,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vfcvt.xu.f.v.nxv1i64.nxv1f64(
 define <vscale x 1 x i64> @intrinsic_vfcvt_xu.f.v_nxv1i64_nxv1f64(<vscale x 1 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv1i64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vfcvt.xu.f.v.nxv1i64.nxv1f64(
@@ -541,10 +541,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vfcvt.xu.f.v.mask.nxv1i64.nxv1f64(
 define <vscale x 1 x i64> @intrinsic_vfcvt_mask_xu.f.v_nxv1i64_nxv1f64(<vscale x 1 x i64> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv1i64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vfcvt.xu.f.v.mask.nxv1i64.nxv1f64(
@@ -564,10 +564,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vfcvt.xu.f.v.nxv2i64.nxv2f64(
 define <vscale x 2 x i64> @intrinsic_vfcvt_xu.f.v_nxv2i64_nxv2f64(<vscale x 2 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv2i64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vfcvt.xu.f.v.nxv2i64.nxv2f64(
@@ -587,10 +587,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vfcvt.xu.f.v.mask.nxv2i64.nxv2f64(
 define <vscale x 2 x i64> @intrinsic_vfcvt_mask_xu.f.v_nxv2i64_nxv2f64(<vscale x 2 x i64> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv2i64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vfcvt.xu.f.v.mask.nxv2i64.nxv2f64(
@@ -610,10 +610,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vfcvt.xu.f.v.nxv4i64.nxv4f64(
 define <vscale x 4 x i64> @intrinsic_vfcvt_xu.f.v_nxv4i64_nxv4f64(<vscale x 4 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv4i64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vfcvt.xu.f.v.nxv4i64.nxv4f64(
@@ -633,10 +633,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vfcvt.xu.f.v.mask.nxv4i64.nxv4f64(
 define <vscale x 4 x i64> @intrinsic_vfcvt_mask_xu.f.v_nxv4i64_nxv4f64(<vscale x 4 x i64> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv4i64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vfcvt.xu.f.v.mask.nxv4i64.nxv4f64(
@@ -656,10 +656,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vfcvt.xu.f.v.nxv8i64.nxv8f64(
 define <vscale x 8 x i64> @intrinsic_vfcvt_xu.f.v_nxv8i64_nxv8f64(<vscale x 8 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_xu.f.v_nxv8i64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vfcvt.xu.f.v.nxv8i64.nxv8f64(
@@ -679,10 +679,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vfcvt.xu.f.v.mask.nxv8i64.nxv8f64(
 define <vscale x 8 x i64> @intrinsic_vfcvt_mask_xu.f.v_nxv8i64_nxv8f64(<vscale x 8 x i64> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfcvt_mask_xu.f.v_nxv8i64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.xu.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vfcvt.xu.f.v.mask.nxv8i64.nxv8f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv.ll
index 7e77fb7dc2ed..3f67c433bcbf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfdiv.nxv1f16.nxv1f16(
 define <vscale x 1 x half> @intrinsic_vfdiv_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfdiv.nxv1f16.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfdiv.mask.nxv1f16.nxv1f16(
 define <vscale x 1 x half> @intrinsic_vfdiv_mask_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfdiv.mask.nxv1f16.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfdiv.nxv2f16.nxv2f16(
 define <vscale x 2 x half> @intrinsic_vfdiv_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfdiv.nxv2f16.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfdiv.mask.nxv2f16.nxv2f16(
 define <vscale x 2 x half> @intrinsic_vfdiv_mask_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfdiv.mask.nxv2f16.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfdiv.nxv4f16.nxv4f16(
 define <vscale x 4 x half> @intrinsic_vfdiv_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfdiv.nxv4f16.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfdiv.mask.nxv4f16.nxv4f16(
 define <vscale x 4 x half> @intrinsic_vfdiv_mask_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfdiv.mask.nxv4f16.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfdiv.nxv8f16.nxv8f16(
 define <vscale x 8 x half> @intrinsic_vfdiv_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfdiv.nxv8f16.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfdiv.mask.nxv8f16.nxv8f16(
 define <vscale x 8 x half> @intrinsic_vfdiv_mask_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfdiv.mask.nxv8f16.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfdiv.nxv16f16.nxv16f16(
 define <vscale x 16 x half> @intrinsic_vfdiv_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfdiv.nxv16f16.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfdiv.mask.nxv16f16.nxv16f16(
 define <vscale x 16 x half> @intrinsic_vfdiv_mask_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfdiv.mask.nxv16f16.nxv16f16(
@@ -263,10 +263,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfdiv.nxv32f16.nxv32f16(
 define <vscale x 32 x half> @intrinsic_vfdiv_vv_nxv32f16_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv32f16_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfdiv.nxv32f16.nxv32f16(
@@ -289,8 +289,8 @@ define <vscale x 32 x half> @intrinsic_vfdiv_mask_vv_nxv32f16_nxv32f16_nxv32f16(
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv32f16_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vfdiv.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -314,10 +314,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfdiv.nxv1f32.nxv1f32(
 define <vscale x 1 x float> @intrinsic_vfdiv_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfdiv.nxv1f32.nxv1f32(
@@ -339,10 +339,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfdiv.mask.nxv1f32.nxv1f32(
 define <vscale x 1 x float> @intrinsic_vfdiv_mask_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfdiv.mask.nxv1f32.nxv1f32(
@@ -364,10 +364,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfdiv.nxv2f32.nxv2f32(
 define <vscale x 2 x float> @intrinsic_vfdiv_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfdiv.nxv2f32.nxv2f32(
@@ -389,10 +389,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfdiv.mask.nxv2f32.nxv2f32(
 define <vscale x 2 x float> @intrinsic_vfdiv_mask_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfdiv.mask.nxv2f32.nxv2f32(
@@ -414,10 +414,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfdiv.nxv4f32.nxv4f32(
 define <vscale x 4 x float> @intrinsic_vfdiv_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfdiv.nxv4f32.nxv4f32(
@@ -439,10 +439,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfdiv.mask.nxv4f32.nxv4f32(
 define <vscale x 4 x float> @intrinsic_vfdiv_mask_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfdiv.mask.nxv4f32.nxv4f32(
@@ -464,10 +464,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfdiv.nxv8f32.nxv8f32(
 define <vscale x 8 x float> @intrinsic_vfdiv_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfdiv.nxv8f32.nxv8f32(
@@ -489,10 +489,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfdiv.mask.nxv8f32.nxv8f32(
 define <vscale x 8 x float> @intrinsic_vfdiv_mask_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfdiv.mask.nxv8f32.nxv8f32(
@@ -514,10 +514,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfdiv.nxv16f32.nxv16f32(
 define <vscale x 16 x float> @intrinsic_vfdiv_vv_nxv16f32_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv16f32_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfdiv.nxv16f32.nxv16f32(
@@ -540,8 +540,8 @@ define <vscale x 16 x float> @intrinsic_vfdiv_mask_vv_nxv16f32_nxv16f32_nxv16f32
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv16f32_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vfdiv.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -565,10 +565,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfdiv.nxv1f64.nxv1f64(
 define <vscale x 1 x double> @intrinsic_vfdiv_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfdiv.nxv1f64.nxv1f64(
@@ -590,10 +590,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfdiv.mask.nxv1f64.nxv1f64(
 define <vscale x 1 x double> @intrinsic_vfdiv_mask_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfdiv.mask.nxv1f64.nxv1f64(
@@ -615,10 +615,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfdiv.nxv2f64.nxv2f64(
 define <vscale x 2 x double> @intrinsic_vfdiv_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfdiv.nxv2f64.nxv2f64(
@@ -640,10 +640,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfdiv.mask.nxv2f64.nxv2f64(
 define <vscale x 2 x double> @intrinsic_vfdiv_mask_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfdiv.mask.nxv2f64.nxv2f64(
@@ -665,10 +665,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfdiv.nxv4f64.nxv4f64(
 define <vscale x 4 x double> @intrinsic_vfdiv_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfdiv.nxv4f64.nxv4f64(
@@ -690,10 +690,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfdiv.mask.nxv4f64.nxv4f64(
 define <vscale x 4 x double> @intrinsic_vfdiv_mask_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfdiv.mask.nxv4f64.nxv4f64(
@@ -715,10 +715,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfdiv.nxv8f64.nxv8f64(
 define <vscale x 8 x double> @intrinsic_vfdiv_vv_nxv8f64_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vv_nxv8f64_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfdiv.nxv8f64.nxv8f64(
@@ -741,8 +741,8 @@ define <vscale x 8 x double> @intrinsic_vfdiv_mask_vv_nxv8f64_nxv8f64_nxv8f64(<v
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vv_nxv8f64_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    vfdiv.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -766,10 +766,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfdiv.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfdiv_vf_nxv1f16_nxv1f16_f16(<vscale x 1 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv1f16_nxv1f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfdiv.nxv1f16.f16(
@@ -791,10 +791,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfdiv.mask.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfdiv_mask_vf_nxv1f16_nxv1f16_f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, half %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv1f16_nxv1f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfdiv.mask.nxv1f16.f16(
@@ -816,10 +816,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfdiv.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfdiv_vf_nxv2f16_nxv2f16_f16(<vscale x 2 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv2f16_nxv2f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfdiv.nxv2f16.f16(
@@ -841,10 +841,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfdiv.mask.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfdiv_mask_vf_nxv2f16_nxv2f16_f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, half %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv2f16_nxv2f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfdiv.mask.nxv2f16.f16(
@@ -866,10 +866,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfdiv.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfdiv_vf_nxv4f16_nxv4f16_f16(<vscale x 4 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv4f16_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfdiv.nxv4f16.f16(
@@ -891,10 +891,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfdiv.mask.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfdiv_mask_vf_nxv4f16_nxv4f16_f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, half %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv4f16_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfdiv.mask.nxv4f16.f16(
@@ -916,10 +916,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfdiv.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfdiv_vf_nxv8f16_nxv8f16_f16(<vscale x 8 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv8f16_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfdiv.nxv8f16.f16(
@@ -941,10 +941,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfdiv.mask.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfdiv_mask_vf_nxv8f16_nxv8f16_f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, half %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv8f16_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfdiv.mask.nxv8f16.f16(
@@ -966,10 +966,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfdiv.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfdiv_vf_nxv16f16_nxv16f16_f16(<vscale x 16 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv16f16_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfdiv.nxv16f16.f16(
@@ -991,10 +991,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfdiv.mask.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfdiv_mask_vf_nxv16f16_nxv16f16_f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, half %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv16f16_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfdiv.mask.nxv16f16.f16(
@@ -1016,10 +1016,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfdiv.nxv32f16.f16(
 define <vscale x 32 x half> @intrinsic_vfdiv_vf_nxv32f16_nxv32f16_f16(<vscale x 32 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv32f16_nxv32f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfdiv.nxv32f16.f16(
@@ -1041,10 +1041,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfdiv.mask.nxv32f16.f16(
 define <vscale x 32 x half> @intrinsic_vfdiv_mask_vf_nxv32f16_nxv32f16_f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, half %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv32f16_nxv32f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfdiv.mask.nxv32f16.f16(
@@ -1066,10 +1066,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfdiv.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfdiv_vf_nxv1f32_nxv1f32_f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv1f32_nxv1f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfdiv.nxv1f32.f32(
@@ -1091,10 +1091,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfdiv.mask.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfdiv_mask_vf_nxv1f32_nxv1f32_f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv1f32_nxv1f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfdiv.mask.nxv1f32.f32(
@@ -1116,10 +1116,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfdiv.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfdiv_vf_nxv2f32_nxv2f32_f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv2f32_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfdiv.nxv2f32.f32(
@@ -1141,10 +1141,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfdiv.mask.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfdiv_mask_vf_nxv2f32_nxv2f32_f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv2f32_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfdiv.mask.nxv2f32.f32(
@@ -1166,10 +1166,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfdiv.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfdiv_vf_nxv4f32_nxv4f32_f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv4f32_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfdiv.nxv4f32.f32(
@@ -1191,10 +1191,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfdiv.mask.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfdiv_mask_vf_nxv4f32_nxv4f32_f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv4f32_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfdiv.mask.nxv4f32.f32(
@@ -1216,10 +1216,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfdiv.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfdiv_vf_nxv8f32_nxv8f32_f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv8f32_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfdiv.nxv8f32.f32(
@@ -1241,10 +1241,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfdiv.mask.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfdiv_mask_vf_nxv8f32_nxv8f32_f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv8f32_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfdiv.mask.nxv8f32.f32(
@@ -1266,10 +1266,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfdiv.nxv16f32.f32(
 define <vscale x 16 x float> @intrinsic_vfdiv_vf_nxv16f32_nxv16f32_f32(<vscale x 16 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv16f32_nxv16f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfdiv.nxv16f32.f32(
@@ -1291,10 +1291,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfdiv.mask.nxv16f32.f32(
 define <vscale x 16 x float> @intrinsic_vfdiv_mask_vf_nxv16f32_nxv16f32_f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, float %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv16f32_nxv16f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfdiv.mask.nxv16f32.f32(
@@ -1316,10 +1316,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfdiv.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfdiv_vf_nxv1f64_nxv1f64_f64(<vscale x 1 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv1f64_nxv1f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfdiv.nxv1f64.f64(
@@ -1341,10 +1341,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfdiv.mask.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfdiv_mask_vf_nxv1f64_nxv1f64_f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, double %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv1f64_nxv1f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfdiv.mask.nxv1f64.f64(
@@ -1366,10 +1366,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfdiv.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfdiv_vf_nxv2f64_nxv2f64_f64(<vscale x 2 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv2f64_nxv2f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfdiv.nxv2f64.f64(
@@ -1391,10 +1391,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfdiv.mask.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfdiv_mask_vf_nxv2f64_nxv2f64_f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, double %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv2f64_nxv2f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfdiv.mask.nxv2f64.f64(
@@ -1416,10 +1416,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfdiv.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfdiv_vf_nxv4f64_nxv4f64_f64(<vscale x 4 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv4f64_nxv4f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfdiv.nxv4f64.f64(
@@ -1441,10 +1441,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfdiv.mask.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfdiv_mask_vf_nxv4f64_nxv4f64_f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, double %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv4f64_nxv4f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfdiv.mask.nxv4f64.f64(
@@ -1466,10 +1466,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfdiv.nxv8f64.f64(
 define <vscale x 8 x double> @intrinsic_vfdiv_vf_nxv8f64_nxv8f64_f64(<vscale x 8 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_vf_nxv8f64_nxv8f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfdiv.nxv8f64.f64(
@@ -1491,10 +1491,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfdiv.mask.nxv8f64.f64(
 define <vscale x 8 x double> @intrinsic_vfdiv_mask_vf_nxv8f64_nxv8f64_f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, double %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfdiv_mask_vf_nxv8f64_nxv8f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfdiv.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfdiv.mask.nxv8f64.f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmacc.ll b/llvm/test/CodeGen/RISCV/rvv/vfmacc.ll
index 73d0178a939c..5586b52b64ec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmacc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmacc.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmacc.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfmacc_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmacc.nxv1f16.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmacc.mask.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfmacc_mask_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmacc.mask.nxv1f16.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmacc.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfmacc_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmacc.nxv2f16.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmacc.mask.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfmacc_mask_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmacc.mask.nxv2f16.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmacc.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfmacc_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmacc.nxv4f16.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmacc.mask.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfmacc_mask_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmacc.mask.nxv4f16.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmacc.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfmacc_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmacc.nxv8f16.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmacc.mask.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfmacc_mask_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmacc.mask.nxv8f16.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmacc.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfmacc_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmacc.nxv16f16.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmacc.mask.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfmacc_mask_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmacc.mask.nxv16f16.nxv16f16(
@@ -263,10 +263,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmacc.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfmacc_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmacc.nxv1f32.nxv1f32(
@@ -288,10 +288,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmacc.mask.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfmacc_mask_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmacc.mask.nxv1f32.nxv1f32(
@@ -313,10 +313,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmacc.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfmacc_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmacc.nxv2f32.nxv2f32(
@@ -338,10 +338,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmacc.mask.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfmacc_mask_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmacc.mask.nxv2f32.nxv2f32(
@@ -363,10 +363,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmacc.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfmacc_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmacc.nxv4f32.nxv4f32(
@@ -388,10 +388,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmacc.mask.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfmacc_mask_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmacc.mask.nxv4f32.nxv4f32(
@@ -413,10 +413,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmacc.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfmacc_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmacc.nxv8f32.nxv8f32(
@@ -438,10 +438,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmacc.mask.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfmacc_mask_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmacc.mask.nxv8f32.nxv8f32(
@@ -463,10 +463,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmacc.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfmacc_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmacc.nxv1f64.nxv1f64(
@@ -488,10 +488,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmacc.mask.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfmacc_mask_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmacc.mask.nxv1f64.nxv1f64(
@@ -513,10 +513,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmacc.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfmacc_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmacc.nxv2f64.nxv2f64(
@@ -538,10 +538,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmacc.mask.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfmacc_mask_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmacc.mask.nxv2f64.nxv2f64(
@@ -563,10 +563,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmacc.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfmacc_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmacc.nxv4f64.nxv4f64(
@@ -588,10 +588,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmacc.mask.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfmacc_mask_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmacc.mask.nxv4f64.nxv4f64(
@@ -613,10 +613,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmacc.nxv1f16.f16(
 define <vscale x 1 x half>  @intrinsic_vfmacc_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmacc.nxv1f16.f16(
@@ -638,10 +638,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmacc.mask.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfmacc_mask_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmacc.mask.nxv1f16.f16(
@@ -663,10 +663,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmacc.nxv2f16.f16(
 define <vscale x 2 x half>  @intrinsic_vfmacc_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmacc.nxv2f16.f16(
@@ -688,10 +688,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmacc.mask.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfmacc_mask_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmacc.mask.nxv2f16.f16(
@@ -713,10 +713,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmacc.nxv4f16.f16(
 define <vscale x 4 x half>  @intrinsic_vfmacc_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmacc.nxv4f16.f16(
@@ -738,10 +738,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmacc.mask.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfmacc_mask_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmacc.mask.nxv4f16.f16(
@@ -763,10 +763,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmacc.nxv8f16.f16(
 define <vscale x 8 x half>  @intrinsic_vfmacc_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmacc.nxv8f16.f16(
@@ -788,10 +788,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmacc.mask.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfmacc_mask_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmacc.mask.nxv8f16.f16(
@@ -813,10 +813,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmacc.nxv16f16.f16(
 define <vscale x 16 x half>  @intrinsic_vfmacc_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmacc.nxv16f16.f16(
@@ -838,10 +838,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmacc.mask.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfmacc_mask_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmacc.mask.nxv16f16.f16(
@@ -863,10 +863,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmacc.nxv1f32.f32(
 define <vscale x 1 x float>  @intrinsic_vfmacc_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmacc.nxv1f32.f32(
@@ -888,10 +888,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmacc.mask.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfmacc_mask_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmacc.mask.nxv1f32.f32(
@@ -913,10 +913,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmacc.nxv2f32.f32(
 define <vscale x 2 x float>  @intrinsic_vfmacc_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmacc.nxv2f32.f32(
@@ -938,10 +938,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmacc.mask.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfmacc_mask_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmacc.mask.nxv2f32.f32(
@@ -963,10 +963,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmacc.nxv4f32.f32(
 define <vscale x 4 x float>  @intrinsic_vfmacc_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmacc.nxv4f32.f32(
@@ -988,10 +988,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmacc.mask.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfmacc_mask_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmacc.mask.nxv4f32.f32(
@@ -1013,10 +1013,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmacc.nxv8f32.f32(
 define <vscale x 8 x float>  @intrinsic_vfmacc_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmacc.nxv8f32.f32(
@@ -1038,10 +1038,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmacc.mask.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfmacc_mask_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmacc.mask.nxv8f32.f32(
@@ -1063,10 +1063,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmacc.nxv1f64.f64(
 define <vscale x 1 x double>  @intrinsic_vfmacc_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmacc.nxv1f64.f64(
@@ -1088,10 +1088,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmacc.mask.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfmacc_mask_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmacc.mask.nxv1f64.f64(
@@ -1113,10 +1113,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmacc.nxv2f64.f64(
 define <vscale x 2 x double>  @intrinsic_vfmacc_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmacc.nxv2f64.f64(
@@ -1138,10 +1138,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmacc.mask.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfmacc_mask_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmacc.mask.nxv2f64.f64(
@@ -1163,10 +1163,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmacc.nxv4f64.f64(
 define <vscale x 4 x double>  @intrinsic_vfmacc_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmacc.nxv4f64.f64(
@@ -1188,10 +1188,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmacc.mask.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfmacc_mask_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmacc_mask_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmacc.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmacc.mask.nxv4f64.f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmadd.ll b/llvm/test/CodeGen/RISCV/rvv/vfmadd.ll
index caad65c78e66..c44690d23f08 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmadd.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmadd.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmadd.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfmadd_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmadd.nxv1f16.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmadd.mask.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfmadd_mask_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmadd.mask.nxv1f16.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmadd.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfmadd_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmadd.nxv2f16.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmadd.mask.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfmadd_mask_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmadd.mask.nxv2f16.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmadd.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfmadd_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmadd.nxv4f16.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmadd.mask.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfmadd_mask_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmadd.mask.nxv4f16.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmadd.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfmadd_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmadd.nxv8f16.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmadd.mask.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfmadd_mask_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmadd.mask.nxv8f16.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmadd.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfmadd_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmadd.nxv16f16.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmadd.mask.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfmadd_mask_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmadd.mask.nxv16f16.nxv16f16(
@@ -263,10 +263,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmadd.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfmadd_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmadd.nxv1f32.nxv1f32(
@@ -288,10 +288,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmadd.mask.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfmadd_mask_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmadd.mask.nxv1f32.nxv1f32(
@@ -313,10 +313,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmadd.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfmadd_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmadd.nxv2f32.nxv2f32(
@@ -338,10 +338,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmadd.mask.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfmadd_mask_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmadd.mask.nxv2f32.nxv2f32(
@@ -363,10 +363,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmadd.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfmadd_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmadd.nxv4f32.nxv4f32(
@@ -388,10 +388,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmadd.mask.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfmadd_mask_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmadd.mask.nxv4f32.nxv4f32(
@@ -413,10 +413,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmadd.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfmadd_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmadd.nxv8f32.nxv8f32(
@@ -438,10 +438,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmadd.mask.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfmadd_mask_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmadd.mask.nxv8f32.nxv8f32(
@@ -463,10 +463,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmadd.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfmadd_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmadd.nxv1f64.nxv1f64(
@@ -488,10 +488,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmadd.mask.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfmadd_mask_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmadd.mask.nxv1f64.nxv1f64(
@@ -513,10 +513,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmadd.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfmadd_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmadd.nxv2f64.nxv2f64(
@@ -538,10 +538,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmadd.mask.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfmadd_mask_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmadd.mask.nxv2f64.nxv2f64(
@@ -563,10 +563,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmadd.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfmadd_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmadd.nxv4f64.nxv4f64(
@@ -588,10 +588,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmadd.mask.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfmadd_mask_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmadd.mask.nxv4f64.nxv4f64(
@@ -613,10 +613,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmadd.nxv1f16.f16(
 define <vscale x 1 x half>  @intrinsic_vfmadd_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmadd.nxv1f16.f16(
@@ -638,10 +638,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmadd.mask.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfmadd_mask_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmadd.mask.nxv1f16.f16(
@@ -663,10 +663,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmadd.nxv2f16.f16(
 define <vscale x 2 x half>  @intrinsic_vfmadd_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmadd.nxv2f16.f16(
@@ -688,10 +688,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmadd.mask.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfmadd_mask_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmadd.mask.nxv2f16.f16(
@@ -713,10 +713,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmadd.nxv4f16.f16(
 define <vscale x 4 x half>  @intrinsic_vfmadd_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmadd.nxv4f16.f16(
@@ -738,10 +738,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmadd.mask.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfmadd_mask_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmadd.mask.nxv4f16.f16(
@@ -763,10 +763,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmadd.nxv8f16.f16(
 define <vscale x 8 x half>  @intrinsic_vfmadd_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmadd.nxv8f16.f16(
@@ -788,10 +788,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmadd.mask.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfmadd_mask_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmadd.mask.nxv8f16.f16(
@@ -813,10 +813,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmadd.nxv16f16.f16(
 define <vscale x 16 x half>  @intrinsic_vfmadd_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmadd.nxv16f16.f16(
@@ -838,10 +838,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmadd.mask.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfmadd_mask_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmadd.mask.nxv16f16.f16(
@@ -863,10 +863,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmadd.nxv1f32.f32(
 define <vscale x 1 x float>  @intrinsic_vfmadd_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmadd.nxv1f32.f32(
@@ -888,10 +888,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmadd.mask.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfmadd_mask_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmadd.mask.nxv1f32.f32(
@@ -913,10 +913,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmadd.nxv2f32.f32(
 define <vscale x 2 x float>  @intrinsic_vfmadd_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmadd.nxv2f32.f32(
@@ -938,10 +938,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmadd.mask.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfmadd_mask_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmadd.mask.nxv2f32.f32(
@@ -963,10 +963,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmadd.nxv4f32.f32(
 define <vscale x 4 x float>  @intrinsic_vfmadd_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmadd.nxv4f32.f32(
@@ -988,10 +988,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmadd.mask.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfmadd_mask_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmadd.mask.nxv4f32.f32(
@@ -1013,10 +1013,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmadd.nxv8f32.f32(
 define <vscale x 8 x float>  @intrinsic_vfmadd_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmadd.nxv8f32.f32(
@@ -1038,10 +1038,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmadd.mask.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfmadd_mask_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmadd.mask.nxv8f32.f32(
@@ -1063,10 +1063,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmadd.nxv1f64.f64(
 define <vscale x 1 x double>  @intrinsic_vfmadd_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmadd.nxv1f64.f64(
@@ -1088,10 +1088,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmadd.mask.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfmadd_mask_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmadd.mask.nxv1f64.f64(
@@ -1113,10 +1113,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmadd.nxv2f64.f64(
 define <vscale x 2 x double>  @intrinsic_vfmadd_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmadd.nxv2f64.f64(
@@ -1138,10 +1138,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmadd.mask.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfmadd_mask_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmadd.mask.nxv2f64.f64(
@@ -1163,10 +1163,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmadd.nxv4f64.f64(
 define <vscale x 4 x double>  @intrinsic_vfmadd_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmadd.nxv4f64.f64(
@@ -1188,10 +1188,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmadd.mask.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfmadd_mask_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmadd_mask_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmadd.mask.nxv4f64.f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsac.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsac.ll
index e668a70050e4..4eac7b63fd88 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmsac.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmsac.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmsac.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfmsac_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmsac.nxv1f16.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmsac.mask.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfmsac_mask_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmsac.mask.nxv1f16.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmsac.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfmsac_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmsac.nxv2f16.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmsac.mask.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfmsac_mask_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmsac.mask.nxv2f16.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmsac.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfmsac_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmsac.nxv4f16.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmsac.mask.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfmsac_mask_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmsac.mask.nxv4f16.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmsac.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfmsac_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmsac.nxv8f16.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmsac.mask.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfmsac_mask_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmsac.mask.nxv8f16.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmsac.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfmsac_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmsac.nxv16f16.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmsac.mask.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfmsac_mask_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmsac.mask.nxv16f16.nxv16f16(
@@ -263,10 +263,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmsac.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfmsac_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmsac.nxv1f32.nxv1f32(
@@ -288,10 +288,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmsac.mask.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfmsac_mask_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmsac.mask.nxv1f32.nxv1f32(
@@ -313,10 +313,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmsac.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfmsac_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmsac.nxv2f32.nxv2f32(
@@ -338,10 +338,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmsac.mask.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfmsac_mask_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmsac.mask.nxv2f32.nxv2f32(
@@ -363,10 +363,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmsac.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfmsac_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmsac.nxv4f32.nxv4f32(
@@ -388,10 +388,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmsac.mask.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfmsac_mask_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmsac.mask.nxv4f32.nxv4f32(
@@ -413,10 +413,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmsac.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfmsac_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmsac.nxv8f32.nxv8f32(
@@ -438,10 +438,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmsac.mask.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfmsac_mask_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmsac.mask.nxv8f32.nxv8f32(
@@ -463,10 +463,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmsac.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfmsac_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmsac.nxv1f64.nxv1f64(
@@ -488,10 +488,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmsac.mask.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfmsac_mask_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmsac.mask.nxv1f64.nxv1f64(
@@ -513,10 +513,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmsac.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfmsac_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmsac.nxv2f64.nxv2f64(
@@ -538,10 +538,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmsac.mask.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfmsac_mask_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmsac.mask.nxv2f64.nxv2f64(
@@ -563,10 +563,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmsac.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfmsac_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmsac.nxv4f64.nxv4f64(
@@ -588,10 +588,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmsac.mask.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfmsac_mask_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmsac.mask.nxv4f64.nxv4f64(
@@ -613,10 +613,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmsac.nxv1f16.f16(
 define <vscale x 1 x half>  @intrinsic_vfmsac_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmsac.nxv1f16.f16(
@@ -638,10 +638,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmsac.mask.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfmsac_mask_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmsac.mask.nxv1f16.f16(
@@ -663,10 +663,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmsac.nxv2f16.f16(
 define <vscale x 2 x half>  @intrinsic_vfmsac_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmsac.nxv2f16.f16(
@@ -688,10 +688,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmsac.mask.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfmsac_mask_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmsac.mask.nxv2f16.f16(
@@ -713,10 +713,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmsac.nxv4f16.f16(
 define <vscale x 4 x half>  @intrinsic_vfmsac_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmsac.nxv4f16.f16(
@@ -738,10 +738,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmsac.mask.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfmsac_mask_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmsac.mask.nxv4f16.f16(
@@ -763,10 +763,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmsac.nxv8f16.f16(
 define <vscale x 8 x half>  @intrinsic_vfmsac_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmsac.nxv8f16.f16(
@@ -788,10 +788,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmsac.mask.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfmsac_mask_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmsac.mask.nxv8f16.f16(
@@ -813,10 +813,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmsac.nxv16f16.f16(
 define <vscale x 16 x half>  @intrinsic_vfmsac_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmsac.nxv16f16.f16(
@@ -838,10 +838,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmsac.mask.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfmsac_mask_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmsac.mask.nxv16f16.f16(
@@ -863,10 +863,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmsac.nxv1f32.f32(
 define <vscale x 1 x float>  @intrinsic_vfmsac_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmsac.nxv1f32.f32(
@@ -888,10 +888,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmsac.mask.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfmsac_mask_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmsac.mask.nxv1f32.f32(
@@ -913,10 +913,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmsac.nxv2f32.f32(
 define <vscale x 2 x float>  @intrinsic_vfmsac_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmsac.nxv2f32.f32(
@@ -938,10 +938,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmsac.mask.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfmsac_mask_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmsac.mask.nxv2f32.f32(
@@ -963,10 +963,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmsac.nxv4f32.f32(
 define <vscale x 4 x float>  @intrinsic_vfmsac_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmsac.nxv4f32.f32(
@@ -988,10 +988,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmsac.mask.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfmsac_mask_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmsac.mask.nxv4f32.f32(
@@ -1013,10 +1013,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmsac.nxv8f32.f32(
 define <vscale x 8 x float>  @intrinsic_vfmsac_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmsac.nxv8f32.f32(
@@ -1038,10 +1038,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmsac.mask.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfmsac_mask_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmsac.mask.nxv8f32.f32(
@@ -1063,10 +1063,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmsac.nxv1f64.f64(
 define <vscale x 1 x double>  @intrinsic_vfmsac_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmsac.nxv1f64.f64(
@@ -1088,10 +1088,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmsac.mask.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfmsac_mask_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmsac.mask.nxv1f64.f64(
@@ -1113,10 +1113,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmsac.nxv2f64.f64(
 define <vscale x 2 x double>  @intrinsic_vfmsac_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmsac.nxv2f64.f64(
@@ -1138,10 +1138,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmsac.mask.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfmsac_mask_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmsac.mask.nxv2f64.f64(
@@ -1163,10 +1163,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmsac.nxv4f64.f64(
 define <vscale x 4 x double>  @intrinsic_vfmsac_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmsac.nxv4f64.f64(
@@ -1188,10 +1188,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmsac.mask.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfmsac_mask_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsac_mask_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsac.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmsac.mask.nxv4f64.f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsub.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsub.ll
index 4cda25e18911..626b40e132c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmsub.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmsub.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmsub.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfmsub_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmsub.nxv1f16.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmsub.mask.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfmsub_mask_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmsub.mask.nxv1f16.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmsub.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfmsub_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmsub.nxv2f16.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmsub.mask.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfmsub_mask_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmsub.mask.nxv2f16.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmsub.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfmsub_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmsub.nxv4f16.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmsub.mask.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfmsub_mask_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmsub.mask.nxv4f16.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmsub.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfmsub_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmsub.nxv8f16.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmsub.mask.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfmsub_mask_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmsub.mask.nxv8f16.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmsub.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfmsub_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmsub.nxv16f16.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmsub.mask.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfmsub_mask_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmsub.mask.nxv16f16.nxv16f16(
@@ -263,10 +263,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmsub.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfmsub_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmsub.nxv1f32.nxv1f32(
@@ -288,10 +288,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmsub.mask.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfmsub_mask_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmsub.mask.nxv1f32.nxv1f32(
@@ -313,10 +313,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmsub.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfmsub_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmsub.nxv2f32.nxv2f32(
@@ -338,10 +338,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmsub.mask.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfmsub_mask_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmsub.mask.nxv2f32.nxv2f32(
@@ -363,10 +363,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmsub.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfmsub_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmsub.nxv4f32.nxv4f32(
@@ -388,10 +388,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmsub.mask.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfmsub_mask_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmsub.mask.nxv4f32.nxv4f32(
@@ -413,10 +413,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmsub.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfmsub_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmsub.nxv8f32.nxv8f32(
@@ -438,10 +438,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmsub.mask.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfmsub_mask_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmsub.mask.nxv8f32.nxv8f32(
@@ -463,10 +463,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmsub.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfmsub_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmsub.nxv1f64.nxv1f64(
@@ -488,10 +488,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmsub.mask.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfmsub_mask_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmsub.mask.nxv1f64.nxv1f64(
@@ -513,10 +513,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmsub.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfmsub_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmsub.nxv2f64.nxv2f64(
@@ -538,10 +538,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmsub.mask.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfmsub_mask_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmsub.mask.nxv2f64.nxv2f64(
@@ -563,10 +563,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmsub.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfmsub_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmsub.nxv4f64.nxv4f64(
@@ -588,10 +588,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmsub.mask.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfmsub_mask_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmsub.mask.nxv4f64.nxv4f64(
@@ -613,10 +613,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmsub.nxv1f16.f16(
 define <vscale x 1 x half>  @intrinsic_vfmsub_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmsub.nxv1f16.f16(
@@ -638,10 +638,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmsub.mask.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfmsub_mask_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmsub.mask.nxv1f16.f16(
@@ -663,10 +663,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmsub.nxv2f16.f16(
 define <vscale x 2 x half>  @intrinsic_vfmsub_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmsub.nxv2f16.f16(
@@ -688,10 +688,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmsub.mask.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfmsub_mask_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmsub.mask.nxv2f16.f16(
@@ -713,10 +713,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmsub.nxv4f16.f16(
 define <vscale x 4 x half>  @intrinsic_vfmsub_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmsub.nxv4f16.f16(
@@ -738,10 +738,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmsub.mask.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfmsub_mask_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmsub.mask.nxv4f16.f16(
@@ -763,10 +763,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmsub.nxv8f16.f16(
 define <vscale x 8 x half>  @intrinsic_vfmsub_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmsub.nxv8f16.f16(
@@ -788,10 +788,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmsub.mask.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfmsub_mask_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmsub.mask.nxv8f16.f16(
@@ -813,10 +813,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmsub.nxv16f16.f16(
 define <vscale x 16 x half>  @intrinsic_vfmsub_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmsub.nxv16f16.f16(
@@ -838,10 +838,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmsub.mask.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfmsub_mask_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmsub.mask.nxv16f16.f16(
@@ -863,10 +863,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmsub.nxv1f32.f32(
 define <vscale x 1 x float>  @intrinsic_vfmsub_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmsub.nxv1f32.f32(
@@ -888,10 +888,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmsub.mask.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfmsub_mask_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmsub.mask.nxv1f32.f32(
@@ -913,10 +913,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmsub.nxv2f32.f32(
 define <vscale x 2 x float>  @intrinsic_vfmsub_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmsub.nxv2f32.f32(
@@ -938,10 +938,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmsub.mask.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfmsub_mask_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmsub.mask.nxv2f32.f32(
@@ -963,10 +963,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmsub.nxv4f32.f32(
 define <vscale x 4 x float>  @intrinsic_vfmsub_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmsub.nxv4f32.f32(
@@ -988,10 +988,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmsub.mask.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfmsub_mask_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmsub.mask.nxv4f32.f32(
@@ -1013,10 +1013,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmsub.nxv8f32.f32(
 define <vscale x 8 x float>  @intrinsic_vfmsub_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmsub.nxv8f32.f32(
@@ -1038,10 +1038,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmsub.mask.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfmsub_mask_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmsub.mask.nxv8f32.f32(
@@ -1063,10 +1063,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmsub.nxv1f64.f64(
 define <vscale x 1 x double>  @intrinsic_vfmsub_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmsub.nxv1f64.f64(
@@ -1088,10 +1088,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmsub.mask.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfmsub_mask_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmsub.mask.nxv1f64.f64(
@@ -1113,10 +1113,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmsub.nxv2f64.f64(
 define <vscale x 2 x double>  @intrinsic_vfmsub_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmsub.nxv2f64.f64(
@@ -1138,10 +1138,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmsub.mask.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfmsub_mask_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmsub.mask.nxv2f64.f64(
@@ -1163,10 +1163,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmsub.nxv4f64.f64(
 define <vscale x 4 x double>  @intrinsic_vfmsub_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmsub.nxv4f64.f64(
@@ -1188,10 +1188,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmsub.mask.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfmsub_mask_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmsub_mask_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmsub.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmsub.mask.nxv4f64.f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul.ll
index ee1d197e091f..b73d03fe36c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmul.nxv1f16.nxv1f16(
 define <vscale x 1 x half> @intrinsic_vfmul_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmul.nxv1f16.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmul.mask.nxv1f16.nxv1f16(
 define <vscale x 1 x half> @intrinsic_vfmul_mask_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmul.mask.nxv1f16.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmul.nxv2f16.nxv2f16(
 define <vscale x 2 x half> @intrinsic_vfmul_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmul.nxv2f16.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmul.mask.nxv2f16.nxv2f16(
 define <vscale x 2 x half> @intrinsic_vfmul_mask_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmul.mask.nxv2f16.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmul.nxv4f16.nxv4f16(
 define <vscale x 4 x half> @intrinsic_vfmul_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmul.nxv4f16.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmul.mask.nxv4f16.nxv4f16(
 define <vscale x 4 x half> @intrinsic_vfmul_mask_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmul.mask.nxv4f16.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmul.nxv8f16.nxv8f16(
 define <vscale x 8 x half> @intrinsic_vfmul_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmul.nxv8f16.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmul.mask.nxv8f16.nxv8f16(
 define <vscale x 8 x half> @intrinsic_vfmul_mask_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmul.mask.nxv8f16.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmul.nxv16f16.nxv16f16(
 define <vscale x 16 x half> @intrinsic_vfmul_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmul.nxv16f16.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmul.mask.nxv16f16.nxv16f16(
 define <vscale x 16 x half> @intrinsic_vfmul_mask_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmul.mask.nxv16f16.nxv16f16(
@@ -263,10 +263,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfmul.nxv32f16.nxv32f16(
 define <vscale x 32 x half> @intrinsic_vfmul_vv_nxv32f16_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv32f16_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfmul.nxv32f16.nxv32f16(
@@ -289,8 +289,8 @@ define <vscale x 32 x half> @intrinsic_vfmul_mask_vv_nxv32f16_nxv32f16_nxv32f16(
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv32f16_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vfmul.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -314,10 +314,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmul.nxv1f32.nxv1f32(
 define <vscale x 1 x float> @intrinsic_vfmul_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmul.nxv1f32.nxv1f32(
@@ -339,10 +339,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmul.mask.nxv1f32.nxv1f32(
 define <vscale x 1 x float> @intrinsic_vfmul_mask_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmul.mask.nxv1f32.nxv1f32(
@@ -364,10 +364,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmul.nxv2f32.nxv2f32(
 define <vscale x 2 x float> @intrinsic_vfmul_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmul.nxv2f32.nxv2f32(
@@ -389,10 +389,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmul.mask.nxv2f32.nxv2f32(
 define <vscale x 2 x float> @intrinsic_vfmul_mask_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmul.mask.nxv2f32.nxv2f32(
@@ -414,10 +414,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmul.nxv4f32.nxv4f32(
 define <vscale x 4 x float> @intrinsic_vfmul_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmul.nxv4f32.nxv4f32(
@@ -439,10 +439,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmul.mask.nxv4f32.nxv4f32(
 define <vscale x 4 x float> @intrinsic_vfmul_mask_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmul.mask.nxv4f32.nxv4f32(
@@ -464,10 +464,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmul.nxv8f32.nxv8f32(
 define <vscale x 8 x float> @intrinsic_vfmul_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmul.nxv8f32.nxv8f32(
@@ -489,10 +489,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmul.mask.nxv8f32.nxv8f32(
 define <vscale x 8 x float> @intrinsic_vfmul_mask_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmul.mask.nxv8f32.nxv8f32(
@@ -514,10 +514,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfmul.nxv16f32.nxv16f32(
 define <vscale x 16 x float> @intrinsic_vfmul_vv_nxv16f32_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv16f32_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfmul.nxv16f32.nxv16f32(
@@ -540,8 +540,8 @@ define <vscale x 16 x float> @intrinsic_vfmul_mask_vv_nxv16f32_nxv16f32_nxv16f32
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv16f32_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vfmul.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -565,10 +565,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmul.nxv1f64.nxv1f64(
 define <vscale x 1 x double> @intrinsic_vfmul_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmul.nxv1f64.nxv1f64(
@@ -590,10 +590,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmul.mask.nxv1f64.nxv1f64(
 define <vscale x 1 x double> @intrinsic_vfmul_mask_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmul.mask.nxv1f64.nxv1f64(
@@ -615,10 +615,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmul.nxv2f64.nxv2f64(
 define <vscale x 2 x double> @intrinsic_vfmul_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmul.nxv2f64.nxv2f64(
@@ -640,10 +640,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmul.mask.nxv2f64.nxv2f64(
 define <vscale x 2 x double> @intrinsic_vfmul_mask_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmul.mask.nxv2f64.nxv2f64(
@@ -665,10 +665,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmul.nxv4f64.nxv4f64(
 define <vscale x 4 x double> @intrinsic_vfmul_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmul.nxv4f64.nxv4f64(
@@ -690,10 +690,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmul.mask.nxv4f64.nxv4f64(
 define <vscale x 4 x double> @intrinsic_vfmul_mask_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmul.mask.nxv4f64.nxv4f64(
@@ -715,10 +715,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfmul.nxv8f64.nxv8f64(
 define <vscale x 8 x double> @intrinsic_vfmul_vv_nxv8f64_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vv_nxv8f64_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vv v8, v8, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfmul.nxv8f64.nxv8f64(
@@ -741,8 +741,8 @@ define <vscale x 8 x double> @intrinsic_vfmul_mask_vv_nxv8f64_nxv8f64_nxv8f64(<v
 ; CHECK-LABEL: intrinsic_vfmul_mask_vv_nxv8f64_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    vfmul.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -766,10 +766,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmul.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfmul_vf_nxv1f16_nxv1f16_f16(<vscale x 1 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv1f16_nxv1f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmul.nxv1f16.f16(
@@ -791,10 +791,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfmul.mask.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfmul_mask_vf_nxv1f16_nxv1f16_f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, half %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv1f16_nxv1f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfmul.mask.nxv1f16.f16(
@@ -816,10 +816,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmul.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfmul_vf_nxv2f16_nxv2f16_f16(<vscale x 2 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv2f16_nxv2f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmul.nxv2f16.f16(
@@ -841,10 +841,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfmul.mask.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfmul_mask_vf_nxv2f16_nxv2f16_f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, half %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv2f16_nxv2f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfmul.mask.nxv2f16.f16(
@@ -866,10 +866,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmul.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfmul_vf_nxv4f16_nxv4f16_f16(<vscale x 4 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv4f16_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmul.nxv4f16.f16(
@@ -891,10 +891,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfmul.mask.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfmul_mask_vf_nxv4f16_nxv4f16_f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, half %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv4f16_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfmul.mask.nxv4f16.f16(
@@ -916,10 +916,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmul.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfmul_vf_nxv8f16_nxv8f16_f16(<vscale x 8 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv8f16_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmul.nxv8f16.f16(
@@ -941,10 +941,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfmul.mask.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfmul_mask_vf_nxv8f16_nxv8f16_f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, half %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv8f16_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfmul.mask.nxv8f16.f16(
@@ -966,10 +966,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmul.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfmul_vf_nxv16f16_nxv16f16_f16(<vscale x 16 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv16f16_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmul.nxv16f16.f16(
@@ -991,10 +991,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfmul.mask.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfmul_mask_vf_nxv16f16_nxv16f16_f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, half %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv16f16_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfmul.mask.nxv16f16.f16(
@@ -1016,10 +1016,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfmul.nxv32f16.f16(
 define <vscale x 32 x half> @intrinsic_vfmul_vf_nxv32f16_nxv32f16_f16(<vscale x 32 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv32f16_nxv32f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfmul.nxv32f16.f16(
@@ -1041,10 +1041,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfmul.mask.nxv32f16.f16(
 define <vscale x 32 x half> @intrinsic_vfmul_mask_vf_nxv32f16_nxv32f16_f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, half %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv32f16_nxv32f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfmul.mask.nxv32f16.f16(
@@ -1066,10 +1066,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmul.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfmul_vf_nxv1f32_nxv1f32_f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv1f32_nxv1f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmul.nxv1f32.f32(
@@ -1091,10 +1091,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfmul.mask.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfmul_mask_vf_nxv1f32_nxv1f32_f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv1f32_nxv1f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfmul.mask.nxv1f32.f32(
@@ -1116,10 +1116,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmul.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfmul_vf_nxv2f32_nxv2f32_f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv2f32_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmul.nxv2f32.f32(
@@ -1141,10 +1141,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfmul.mask.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfmul_mask_vf_nxv2f32_nxv2f32_f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv2f32_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfmul.mask.nxv2f32.f32(
@@ -1166,10 +1166,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmul.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfmul_vf_nxv4f32_nxv4f32_f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv4f32_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmul.nxv4f32.f32(
@@ -1191,10 +1191,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfmul.mask.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfmul_mask_vf_nxv4f32_nxv4f32_f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv4f32_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfmul.mask.nxv4f32.f32(
@@ -1216,10 +1216,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmul.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfmul_vf_nxv8f32_nxv8f32_f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv8f32_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmul.nxv8f32.f32(
@@ -1241,10 +1241,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfmul.mask.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfmul_mask_vf_nxv8f32_nxv8f32_f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv8f32_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfmul.mask.nxv8f32.f32(
@@ -1266,10 +1266,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfmul.nxv16f32.f32(
 define <vscale x 16 x float> @intrinsic_vfmul_vf_nxv16f32_nxv16f32_f32(<vscale x 16 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv16f32_nxv16f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfmul.nxv16f32.f32(
@@ -1291,10 +1291,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfmul.mask.nxv16f32.f32(
 define <vscale x 16 x float> @intrinsic_vfmul_mask_vf_nxv16f32_nxv16f32_f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, float %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv16f32_nxv16f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfmul.mask.nxv16f32.f32(
@@ -1316,10 +1316,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmul.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfmul_vf_nxv1f64_nxv1f64_f64(<vscale x 1 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv1f64_nxv1f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmul.nxv1f64.f64(
@@ -1341,10 +1341,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfmul.mask.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfmul_mask_vf_nxv1f64_nxv1f64_f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, double %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv1f64_nxv1f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfmul.mask.nxv1f64.f64(
@@ -1366,10 +1366,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmul.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfmul_vf_nxv2f64_nxv2f64_f64(<vscale x 2 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv2f64_nxv2f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmul.nxv2f64.f64(
@@ -1391,10 +1391,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfmul.mask.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfmul_mask_vf_nxv2f64_nxv2f64_f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, double %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv2f64_nxv2f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfmul.mask.nxv2f64.f64(
@@ -1416,10 +1416,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmul.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfmul_vf_nxv4f64_nxv4f64_f64(<vscale x 4 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv4f64_nxv4f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmul.nxv4f64.f64(
@@ -1441,10 +1441,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfmul.mask.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfmul_mask_vf_nxv4f64_nxv4f64_f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, double %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv4f64_nxv4f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfmul.mask.nxv4f64.f64(
@@ -1466,10 +1466,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfmul.nxv8f64.f64(
 define <vscale x 8 x double> @intrinsic_vfmul_vf_nxv8f64_nxv8f64_f64(<vscale x 8 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_vf_nxv8f64_nxv8f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfmul.nxv8f64.f64(
@@ -1491,10 +1491,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfmul.mask.nxv8f64.f64(
 define <vscale x 8 x double> @intrinsic_vfmul_mask_vf_nxv8f64_nxv8f64_f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, double %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfmul_mask_vf_nxv8f64_nxv8f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfmul.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfmul.mask.nxv8f64.f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-f.ll
index 2de7d78df881..183ffa8a668a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-f.ll
@@ -15,10 +15,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32(
 define <vscale x 1 x half> @intrinsic_vfncvt_f.f.w_nxv1f16_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.f.w_nxv1f16_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -39,10 +39,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32(
 define <vscale x 1 x half> @intrinsic_vfncvt_mask_f.f.w_nxv1f16_nxv1f32(<vscale x 1 x half> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.f.w_nxv1f16_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f16.nxv1f32(
@@ -62,10 +62,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.nxv2f16.nxv2f32(
 define <vscale x 2 x half> @intrinsic_vfncvt_f.f.w_nxv2f16_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.f.w_nxv2f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -86,10 +86,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32(
 define <vscale x 2 x half> @intrinsic_vfncvt_mask_f.f.w_nxv2f16_nxv2f32(<vscale x 2 x half> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.f.w_nxv2f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f16.nxv2f32(
@@ -109,10 +109,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32(
 define <vscale x 4 x half> @intrinsic_vfncvt_f.f.w_nxv4f16_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.f.w_nxv4f16_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -133,10 +133,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32(
 define <vscale x 4 x half> @intrinsic_vfncvt_mask_f.f.w_nxv4f16_nxv4f32(<vscale x 4 x half> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.f.w_nxv4f16_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f16.nxv4f32(
@@ -156,10 +156,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.nxv8f16.nxv8f32(
 define <vscale x 8 x half> @intrinsic_vfncvt_f.f.w_nxv8f16_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.f.w_nxv8f16_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -180,10 +180,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32(
 define <vscale x 8 x half> @intrinsic_vfncvt_mask_f.f.w_nxv8f16_nxv8f32(<vscale x 8 x half> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.f.w_nxv8f16_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f16.nxv8f32(
@@ -203,10 +203,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.nxv16f16.nxv16f32(
 define <vscale x 16 x half> @intrinsic_vfncvt_f.f.w_nxv16f16_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.f.w_nxv16f16_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -227,10 +227,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32(
 define <vscale x 16 x half> @intrinsic_vfncvt_mask_f.f.w_nxv16f16_nxv16f32(<vscale x 16 x half> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.f.w_nxv16f16_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.f.w.mask.nxv16f16.nxv16f32(
@@ -250,10 +250,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.nxv1f32.nxv1f64(
 define <vscale x 1 x float> @intrinsic_vfncvt_f.f.w_nxv1f32_nxv1f64(<vscale x 1 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.f.w_nxv1f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -274,10 +274,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64(
 define <vscale x 1 x float> @intrinsic_vfncvt_mask_f.f.w_nxv1f32_nxv1f64(<vscale x 1 x float> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.f.w_nxv1f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv1f32.nxv1f64(
@@ -297,10 +297,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.nxv2f32.nxv2f64(
 define <vscale x 2 x float> @intrinsic_vfncvt_f.f.w_nxv2f32_nxv2f64(<vscale x 2 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.f.w_nxv2f32_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -321,10 +321,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64(
 define <vscale x 2 x float> @intrinsic_vfncvt_mask_f.f.w_nxv2f32_nxv2f64(<vscale x 2 x float> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.f.w_nxv2f32_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv2f32.nxv2f64(
@@ -344,10 +344,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.nxv4f32.nxv4f64(
 define <vscale x 4 x float> @intrinsic_vfncvt_f.f.w_nxv4f32_nxv4f64(<vscale x 4 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.f.w_nxv4f32_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -368,10 +368,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64(
 define <vscale x 4 x float> @intrinsic_vfncvt_mask_f.f.w_nxv4f32_nxv4f64(<vscale x 4 x float> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.f.w_nxv4f32_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv4f32.nxv4f64(
@@ -391,10 +391,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.nxv8f32.nxv8f64(
 define <vscale x 8 x float> @intrinsic_vfncvt_f.f.w_nxv8f32_nxv8f64(<vscale x 8 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.f.w_nxv8f32_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -415,10 +415,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64(
 define <vscale x 8 x float> @intrinsic_vfncvt_mask_f.f.w_nxv8f32_nxv8f64(<vscale x 8 x float> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.f.w_nxv8f32_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.f.w.mask.nxv8f32.nxv8f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-x.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-x.ll
index 7f2714b2fbfc..aef119faf5f7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-x.ll
@@ -12,10 +12,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfncvt.f.x.w.nxv1f16.nxv1i32(
 define <vscale x 1 x half> @intrinsic_vfncvt_f.x.w_nxv1f16_nxv1i32(<vscale x 1 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.x.w_nxv1f16_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -36,10 +36,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfncvt.f.x.w.mask.nxv1f16.nxv1i32(
 define <vscale x 1 x half> @intrinsic_vfncvt_mask_f.x.w_nxv1f16_nxv1i32(<vscale x 1 x half> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.x.w_nxv1f16_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.x.w.mask.nxv1f16.nxv1i32(
@@ -59,10 +59,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfncvt.f.x.w.nxv2f16.nxv2i32(
 define <vscale x 2 x half> @intrinsic_vfncvt_f.x.w_nxv2f16_nxv2i32(<vscale x 2 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.x.w_nxv2f16_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -83,10 +83,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfncvt.f.x.w.mask.nxv2f16.nxv2i32(
 define <vscale x 2 x half> @intrinsic_vfncvt_mask_f.x.w_nxv2f16_nxv2i32(<vscale x 2 x half> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.x.w_nxv2f16_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.x.w.mask.nxv2f16.nxv2i32(
@@ -106,10 +106,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfncvt.f.x.w.nxv4f16.nxv4i32(
 define <vscale x 4 x half> @intrinsic_vfncvt_f.x.w_nxv4f16_nxv4i32(<vscale x 4 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.x.w_nxv4f16_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -130,10 +130,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfncvt.f.x.w.mask.nxv4f16.nxv4i32(
 define <vscale x 4 x half> @intrinsic_vfncvt_mask_f.x.w_nxv4f16_nxv4i32(<vscale x 4 x half> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.x.w_nxv4f16_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.x.w.mask.nxv4f16.nxv4i32(
@@ -153,10 +153,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfncvt.f.x.w.nxv8f16.nxv8i32(
 define <vscale x 8 x half> @intrinsic_vfncvt_f.x.w_nxv8f16_nxv8i32(<vscale x 8 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.x.w_nxv8f16_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -177,10 +177,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfncvt.f.x.w.mask.nxv8f16.nxv8i32(
 define <vscale x 8 x half> @intrinsic_vfncvt_mask_f.x.w_nxv8f16_nxv8i32(<vscale x 8 x half> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.x.w_nxv8f16_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.x.w.mask.nxv8f16.nxv8i32(
@@ -200,10 +200,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfncvt.f.x.w.nxv16f16.nxv16i32(
 define <vscale x 16 x half> @intrinsic_vfncvt_f.x.w_nxv16f16_nxv16i32(<vscale x 16 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.x.w_nxv16f16_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -224,10 +224,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfncvt.f.x.w.mask.nxv16f16.nxv16i32(
 define <vscale x 16 x half> @intrinsic_vfncvt_mask_f.x.w_nxv16f16_nxv16i32(<vscale x 16 x half> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.x.w_nxv16f16_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.x.w.mask.nxv16f16.nxv16i32(
@@ -247,10 +247,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfncvt.f.x.w.nxv1f32.nxv1i64(
 define <vscale x 1 x float> @intrinsic_vfncvt_f.x.w_nxv1f32_nxv1i64(<vscale x 1 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.x.w_nxv1f32_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -271,10 +271,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfncvt.f.x.w.mask.nxv1f32.nxv1i64(
 define <vscale x 1 x float> @intrinsic_vfncvt_mask_f.x.w_nxv1f32_nxv1i64(<vscale x 1 x float> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.x.w_nxv1f32_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.x.w.mask.nxv1f32.nxv1i64(
@@ -294,10 +294,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfncvt.f.x.w.nxv2f32.nxv2i64(
 define <vscale x 2 x float> @intrinsic_vfncvt_f.x.w_nxv2f32_nxv2i64(<vscale x 2 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.x.w_nxv2f32_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -318,10 +318,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfncvt.f.x.w.mask.nxv2f32.nxv2i64(
 define <vscale x 2 x float> @intrinsic_vfncvt_mask_f.x.w_nxv2f32_nxv2i64(<vscale x 2 x float> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.x.w_nxv2f32_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.x.w.mask.nxv2f32.nxv2i64(
@@ -341,10 +341,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfncvt.f.x.w.nxv4f32.nxv4i64(
 define <vscale x 4 x float> @intrinsic_vfncvt_f.x.w_nxv4f32_nxv4i64(<vscale x 4 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.x.w_nxv4f32_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -365,10 +365,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfncvt.f.x.w.mask.nxv4f32.nxv4i64(
 define <vscale x 4 x float> @intrinsic_vfncvt_mask_f.x.w_nxv4f32_nxv4i64(<vscale x 4 x float> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.x.w_nxv4f32_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.x.w.mask.nxv4f32.nxv4i64(
@@ -388,10 +388,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfncvt.f.x.w.nxv8f32.nxv8i64(
 define <vscale x 8 x float> @intrinsic_vfncvt_f.x.w_nxv8f32_nxv8i64(<vscale x 8 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.x.w_nxv8f32_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -412,10 +412,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfncvt.f.x.w.mask.nxv8f32.nxv8i64(
 define <vscale x 8 x float> @intrinsic_vfncvt_mask_f.x.w_nxv8f32_nxv8i64(<vscale x 8 x float> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.x.w_nxv8f32_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.x.w v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.x.w.mask.nxv8f32.nxv8i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-xu.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-xu.ll
index 1aeee4317cb3..bc287e4bdef1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-xu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-xu.ll
@@ -12,10 +12,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfncvt.f.xu.w.nxv1f16.nxv1i32(
 define <vscale x 1 x half> @intrinsic_vfncvt_f.xu.w_nxv1f16_nxv1i32(<vscale x 1 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.xu.w_nxv1f16_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -36,10 +36,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfncvt.f.xu.w.mask.nxv1f16.nxv1i32(
 define <vscale x 1 x half> @intrinsic_vfncvt_mask_f.xu.w_nxv1f16_nxv1i32(<vscale x 1 x half> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.xu.w_nxv1f16_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfncvt.f.xu.w.mask.nxv1f16.nxv1i32(
@@ -59,10 +59,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfncvt.f.xu.w.nxv2f16.nxv2i32(
 define <vscale x 2 x half> @intrinsic_vfncvt_f.xu.w_nxv2f16_nxv2i32(<vscale x 2 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.xu.w_nxv2f16_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -83,10 +83,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfncvt.f.xu.w.mask.nxv2f16.nxv2i32(
 define <vscale x 2 x half> @intrinsic_vfncvt_mask_f.xu.w_nxv2f16_nxv2i32(<vscale x 2 x half> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.xu.w_nxv2f16_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfncvt.f.xu.w.mask.nxv2f16.nxv2i32(
@@ -106,10 +106,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfncvt.f.xu.w.nxv4f16.nxv4i32(
 define <vscale x 4 x half> @intrinsic_vfncvt_f.xu.w_nxv4f16_nxv4i32(<vscale x 4 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.xu.w_nxv4f16_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -130,10 +130,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfncvt.f.xu.w.mask.nxv4f16.nxv4i32(
 define <vscale x 4 x half> @intrinsic_vfncvt_mask_f.xu.w_nxv4f16_nxv4i32(<vscale x 4 x half> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.xu.w_nxv4f16_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfncvt.f.xu.w.mask.nxv4f16.nxv4i32(
@@ -153,10 +153,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfncvt.f.xu.w.nxv8f16.nxv8i32(
 define <vscale x 8 x half> @intrinsic_vfncvt_f.xu.w_nxv8f16_nxv8i32(<vscale x 8 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.xu.w_nxv8f16_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -177,10 +177,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfncvt.f.xu.w.mask.nxv8f16.nxv8i32(
 define <vscale x 8 x half> @intrinsic_vfncvt_mask_f.xu.w_nxv8f16_nxv8i32(<vscale x 8 x half> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.xu.w_nxv8f16_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfncvt.f.xu.w.mask.nxv8f16.nxv8i32(
@@ -200,10 +200,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfncvt.f.xu.w.nxv16f16.nxv16i32(
 define <vscale x 16 x half> @intrinsic_vfncvt_f.xu.w_nxv16f16_nxv16i32(<vscale x 16 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.xu.w_nxv16f16_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -224,10 +224,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfncvt.f.xu.w.mask.nxv16f16.nxv16i32(
 define <vscale x 16 x half> @intrinsic_vfncvt_mask_f.xu.w_nxv16f16_nxv16i32(<vscale x 16 x half> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.xu.w_nxv16f16_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfncvt.f.xu.w.mask.nxv16f16.nxv16i32(
@@ -247,10 +247,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfncvt.f.xu.w.nxv1f32.nxv1i64(
 define <vscale x 1 x float> @intrinsic_vfncvt_f.xu.w_nxv1f32_nxv1i64(<vscale x 1 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.xu.w_nxv1f32_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -271,10 +271,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfncvt.f.xu.w.mask.nxv1f32.nxv1i64(
 define <vscale x 1 x float> @intrinsic_vfncvt_mask_f.xu.w_nxv1f32_nxv1i64(<vscale x 1 x float> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.xu.w_nxv1f32_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfncvt.f.xu.w.mask.nxv1f32.nxv1i64(
@@ -294,10 +294,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfncvt.f.xu.w.nxv2f32.nxv2i64(
 define <vscale x 2 x float> @intrinsic_vfncvt_f.xu.w_nxv2f32_nxv2i64(<vscale x 2 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.xu.w_nxv2f32_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -318,10 +318,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfncvt.f.xu.w.mask.nxv2f32.nxv2i64(
 define <vscale x 2 x float> @intrinsic_vfncvt_mask_f.xu.w_nxv2f32_nxv2i64(<vscale x 2 x float> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.xu.w_nxv2f32_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfncvt.f.xu.w.mask.nxv2f32.nxv2i64(
@@ -341,10 +341,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfncvt.f.xu.w.nxv4f32.nxv4i64(
 define <vscale x 4 x float> @intrinsic_vfncvt_f.xu.w_nxv4f32_nxv4i64(<vscale x 4 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.xu.w_nxv4f32_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -365,10 +365,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfncvt.f.xu.w.mask.nxv4f32.nxv4i64(
 define <vscale x 4 x float> @intrinsic_vfncvt_mask_f.xu.w_nxv4f32_nxv4i64(<vscale x 4 x float> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.xu.w_nxv4f32_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfncvt.f.xu.w.mask.nxv4f32.nxv4i64(
@@ -388,10 +388,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfncvt.f.xu.w.nxv8f32.nxv8i64(
 define <vscale x 8 x float> @intrinsic_vfncvt_f.xu.w_nxv8f32_nxv8i64(<vscale x 8 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_f.xu.w_nxv8f32_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -412,10 +412,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfncvt.f.xu.w.mask.nxv8f32.nxv8i64(
 define <vscale x 8 x float> @intrinsic_vfncvt_mask_f.xu.w_nxv8f32_nxv8i64(<vscale x 8 x float> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_f.xu.w_nxv8f32_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.f.xu.w v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfncvt.f.xu.w.mask.nxv8f32.nxv8i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-x-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-x-f.ll
index 8309e3fb857f..e4b39c655a10 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfncvt-x-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-x-f.ll
@@ -12,10 +12,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vfncvt.x.f.w.nxv1i8.nxv1f16(
 define <vscale x 1 x i8> @intrinsic_vfncvt_x.f.w_nxv1i8_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv1i8_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -36,10 +36,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1f16(
 define <vscale x 1 x i8> @intrinsic_vfncvt_mask_x.f.w_nxv1i8_nxv1f16(<vscale x 1 x i8> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv1i8_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1f16(
@@ -59,10 +59,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vfncvt.x.f.w.nxv2i8.nxv2f16(
 define <vscale x 2 x i8> @intrinsic_vfncvt_x.f.w_nxv2i8_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv2i8_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -83,10 +83,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2f16(
 define <vscale x 2 x i8> @intrinsic_vfncvt_mask_x.f.w_nxv2i8_nxv2f16(<vscale x 2 x i8> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv2i8_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2f16(
@@ -106,10 +106,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vfncvt.x.f.w.nxv4i8.nxv4f16(
 define <vscale x 4 x i8> @intrinsic_vfncvt_x.f.w_nxv4i8_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv4i8_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -130,10 +130,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4f16(
 define <vscale x 4 x i8> @intrinsic_vfncvt_mask_x.f.w_nxv4i8_nxv4f16(<vscale x 4 x i8> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv4i8_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4f16(
@@ -153,10 +153,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vfncvt.x.f.w.nxv8i8.nxv8f16(
 define <vscale x 8 x i8> @intrinsic_vfncvt_x.f.w_nxv8i8_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv8i8_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -177,10 +177,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8f16(
 define <vscale x 8 x i8> @intrinsic_vfncvt_mask_x.f.w_nxv8i8_nxv8f16(<vscale x 8 x i8> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv8i8_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8f16(
@@ -200,10 +200,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vfncvt.x.f.w.nxv16i8.nxv16f16(
 define <vscale x 16 x i8> @intrinsic_vfncvt_x.f.w_nxv16i8_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv16i8_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -224,10 +224,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16f16(
 define <vscale x 16 x i8> @intrinsic_vfncvt_mask_x.f.w_nxv16i8_nxv16f16(<vscale x 16 x i8> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv16i8_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16f16(
@@ -247,10 +247,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vfncvt.x.f.w.nxv32i8.nxv32f16(
 define <vscale x 32 x i8> @intrinsic_vfncvt_x.f.w_nxv32i8_nxv32f16(<vscale x 32 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv32i8_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -271,10 +271,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32f16(
 define <vscale x 32 x i8> @intrinsic_vfncvt_mask_x.f.w_nxv32i8_nxv32f16(<vscale x 32 x i8> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv32i8_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i8> @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32f16(
@@ -294,10 +294,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vfncvt.x.f.w.nxv1i16.nxv1f32(
 define <vscale x 1 x i16> @intrinsic_vfncvt_x.f.w_nxv1i16_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv1i16_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -318,10 +318,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vfncvt.x.f.w.mask.nxv1i16.nxv1f32(
 define <vscale x 1 x i16> @intrinsic_vfncvt_mask_x.f.w_nxv1i16_nxv1f32(<vscale x 1 x i16> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv1i16_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i16> @llvm.riscv.vfncvt.x.f.w.mask.nxv1i16.nxv1f32(
@@ -341,10 +341,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vfncvt.x.f.w.nxv2i16.nxv2f32(
 define <vscale x 2 x i16> @intrinsic_vfncvt_x.f.w_nxv2i16_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv2i16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -365,10 +365,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vfncvt.x.f.w.mask.nxv2i16.nxv2f32(
 define <vscale x 2 x i16> @intrinsic_vfncvt_mask_x.f.w_nxv2i16_nxv2f32(<vscale x 2 x i16> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv2i16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i16> @llvm.riscv.vfncvt.x.f.w.mask.nxv2i16.nxv2f32(
@@ -388,10 +388,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vfncvt.x.f.w.nxv4i16.nxv4f32(
 define <vscale x 4 x i16> @intrinsic_vfncvt_x.f.w_nxv4i16_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv4i16_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -412,10 +412,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vfncvt.x.f.w.mask.nxv4i16.nxv4f32(
 define <vscale x 4 x i16> @intrinsic_vfncvt_mask_x.f.w_nxv4i16_nxv4f32(<vscale x 4 x i16> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv4i16_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i16> @llvm.riscv.vfncvt.x.f.w.mask.nxv4i16.nxv4f32(
@@ -435,10 +435,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vfncvt.x.f.w.nxv8i16.nxv8f32(
 define <vscale x 8 x i16> @intrinsic_vfncvt_x.f.w_nxv8i16_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv8i16_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -459,10 +459,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vfncvt.x.f.w.mask.nxv8i16.nxv8f32(
 define <vscale x 8 x i16> @intrinsic_vfncvt_mask_x.f.w_nxv8i16_nxv8f32(<vscale x 8 x i16> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv8i16_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vfncvt.x.f.w.mask.nxv8i16.nxv8f32(
@@ -482,10 +482,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vfncvt.x.f.w.nxv16i16.nxv16f32(
 define <vscale x 16 x i16> @intrinsic_vfncvt_x.f.w_nxv16i16_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv16i16_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -506,10 +506,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vfncvt.x.f.w.mask.nxv16i16.nxv16f32(
 define <vscale x 16 x i16> @intrinsic_vfncvt_mask_x.f.w_nxv16i16_nxv16f32(<vscale x 16 x i16> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv16i16_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vfncvt.x.f.w.mask.nxv16i16.nxv16f32(
@@ -529,10 +529,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vfncvt.x.f.w.nxv1i32.nxv1f64(
 define <vscale x 1 x i32> @intrinsic_vfncvt_x.f.w_nxv1i32_nxv1f64(<vscale x 1 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv1i32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -553,10 +553,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vfncvt.x.f.w.mask.nxv1i32.nxv1f64(
 define <vscale x 1 x i32> @intrinsic_vfncvt_mask_x.f.w_nxv1i32_nxv1f64(<vscale x 1 x i32> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv1i32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i32> @llvm.riscv.vfncvt.x.f.w.mask.nxv1i32.nxv1f64(
@@ -576,10 +576,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vfncvt.x.f.w.nxv2i32.nxv2f64(
 define <vscale x 2 x i32> @intrinsic_vfncvt_x.f.w_nxv2i32_nxv2f64(<vscale x 2 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv2i32_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -600,10 +600,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vfncvt.x.f.w.mask.nxv2i32.nxv2f64(
 define <vscale x 2 x i32> @intrinsic_vfncvt_mask_x.f.w_nxv2i32_nxv2f64(<vscale x 2 x i32> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv2i32_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i32> @llvm.riscv.vfncvt.x.f.w.mask.nxv2i32.nxv2f64(
@@ -623,10 +623,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vfncvt.x.f.w.nxv4i32.nxv4f64(
 define <vscale x 4 x i32> @intrinsic_vfncvt_x.f.w_nxv4i32_nxv4f64(<vscale x 4 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv4i32_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -647,10 +647,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vfncvt.x.f.w.mask.nxv4i32.nxv4f64(
 define <vscale x 4 x i32> @intrinsic_vfncvt_mask_x.f.w_nxv4i32_nxv4f64(<vscale x 4 x i32> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv4i32_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vfncvt.x.f.w.mask.nxv4i32.nxv4f64(
@@ -670,10 +670,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vfncvt.x.f.w.nxv8i32.nxv8f64(
 define <vscale x 8 x i32> @intrinsic_vfncvt_x.f.w_nxv8i32_nxv8f64(<vscale x 8 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_x.f.w_nxv8i32_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -694,10 +694,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vfncvt.x.f.w.mask.nxv8i32.nxv8f64(
 define <vscale x 8 x i32> @intrinsic_vfncvt_mask_x.f.w_nxv8i32_nxv8f64(<vscale x 8 x i32> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_x.f.w_nxv8i32_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.x.f.w v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vfncvt.x.f.w.mask.nxv8i32.nxv8f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-xu-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-xu-f.ll
index 3a3abacc8fc3..fd922438d05b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfncvt-xu-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-xu-f.ll
@@ -12,10 +12,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv1i8.nxv1f16(
 define <vscale x 1 x i8> @intrinsic_vfncvt_xu.f.w_nxv1i8_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv1i8_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -36,10 +36,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1f16(
 define <vscale x 1 x i8> @intrinsic_vfncvt_mask_xu.f.w_nxv1i8_nxv1f16(<vscale x 1 x i8> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv1i8_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1f16(
@@ -59,10 +59,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv2i8.nxv2f16(
 define <vscale x 2 x i8> @intrinsic_vfncvt_xu.f.w_nxv2i8_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv2i8_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -83,10 +83,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2f16(
 define <vscale x 2 x i8> @intrinsic_vfncvt_mask_xu.f.w_nxv2i8_nxv2f16(<vscale x 2 x i8> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv2i8_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2f16(
@@ -106,10 +106,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv4i8.nxv4f16(
 define <vscale x 4 x i8> @intrinsic_vfncvt_xu.f.w_nxv4i8_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv4i8_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -130,10 +130,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4f16(
 define <vscale x 4 x i8> @intrinsic_vfncvt_mask_xu.f.w_nxv4i8_nxv4f16(<vscale x 4 x i8> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv4i8_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4f16(
@@ -153,10 +153,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv8i8.nxv8f16(
 define <vscale x 8 x i8> @intrinsic_vfncvt_xu.f.w_nxv8i8_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv8i8_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -177,10 +177,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8f16(
 define <vscale x 8 x i8> @intrinsic_vfncvt_mask_xu.f.w_nxv8i8_nxv8f16(<vscale x 8 x i8> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv8i8_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8f16(
@@ -200,10 +200,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv16i8.nxv16f16(
 define <vscale x 16 x i8> @intrinsic_vfncvt_xu.f.w_nxv16i8_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv16i8_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -224,10 +224,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16f16(
 define <vscale x 16 x i8> @intrinsic_vfncvt_mask_xu.f.w_nxv16i8_nxv16f16(<vscale x 16 x i8> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv16i8_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16f16(
@@ -247,10 +247,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vfncvt.xu.f.w.nxv32i8.nxv32f16(
 define <vscale x 32 x i8> @intrinsic_vfncvt_xu.f.w_nxv32i8_nxv32f16(<vscale x 32 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv32i8_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -271,10 +271,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32f16(
 define <vscale x 32 x i8> @intrinsic_vfncvt_mask_xu.f.w_nxv32i8_nxv32f16(<vscale x 32 x i8> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv32i8_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x i8> @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32f16(
@@ -294,10 +294,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vfncvt.xu.f.w.nxv1i16.nxv1f32(
 define <vscale x 1 x i16> @intrinsic_vfncvt_xu.f.w_nxv1i16_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv1i16_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -318,10 +318,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i16.nxv1f32(
 define <vscale x 1 x i16> @intrinsic_vfncvt_mask_xu.f.w_nxv1i16_nxv1f32(<vscale x 1 x i16> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv1i16_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i16> @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i16.nxv1f32(
@@ -341,10 +341,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vfncvt.xu.f.w.nxv2i16.nxv2f32(
 define <vscale x 2 x i16> @intrinsic_vfncvt_xu.f.w_nxv2i16_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv2i16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -365,10 +365,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i16.nxv2f32(
 define <vscale x 2 x i16> @intrinsic_vfncvt_mask_xu.f.w_nxv2i16_nxv2f32(<vscale x 2 x i16> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv2i16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i16> @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i16.nxv2f32(
@@ -388,10 +388,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vfncvt.xu.f.w.nxv4i16.nxv4f32(
 define <vscale x 4 x i16> @intrinsic_vfncvt_xu.f.w_nxv4i16_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv4i16_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -412,10 +412,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i16.nxv4f32(
 define <vscale x 4 x i16> @intrinsic_vfncvt_mask_xu.f.w_nxv4i16_nxv4f32(<vscale x 4 x i16> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv4i16_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i16> @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i16.nxv4f32(
@@ -435,10 +435,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vfncvt.xu.f.w.nxv8i16.nxv8f32(
 define <vscale x 8 x i16> @intrinsic_vfncvt_xu.f.w_nxv8i16_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv8i16_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -459,10 +459,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i16.nxv8f32(
 define <vscale x 8 x i16> @intrinsic_vfncvt_mask_xu.f.w_nxv8i16_nxv8f32(<vscale x 8 x i16> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv8i16_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i16> @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i16.nxv8f32(
@@ -482,10 +482,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vfncvt.xu.f.w.nxv16i16.nxv16f32(
 define <vscale x 16 x i16> @intrinsic_vfncvt_xu.f.w_nxv16i16_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv16i16_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -506,10 +506,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i16.nxv16f32(
 define <vscale x 16 x i16> @intrinsic_vfncvt_mask_xu.f.w_nxv16i16_nxv16f32(<vscale x 16 x i16> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv16i16_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i16> @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i16.nxv16f32(
@@ -529,10 +529,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vfncvt.xu.f.w.nxv1i32.nxv1f64(
 define <vscale x 1 x i32> @intrinsic_vfncvt_xu.f.w_nxv1i32_nxv1f64(<vscale x 1 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv1i32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -553,10 +553,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i32.nxv1f64(
 define <vscale x 1 x i32> @intrinsic_vfncvt_mask_xu.f.w_nxv1i32_nxv1f64(<vscale x 1 x i32> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv1i32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i32> @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i32.nxv1f64(
@@ -576,10 +576,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vfncvt.xu.f.w.nxv2i32.nxv2f64(
 define <vscale x 2 x i32> @intrinsic_vfncvt_xu.f.w_nxv2i32_nxv2f64(<vscale x 2 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv2i32_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -600,10 +600,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i32.nxv2f64(
 define <vscale x 2 x i32> @intrinsic_vfncvt_mask_xu.f.w_nxv2i32_nxv2f64(<vscale x 2 x i32> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv2i32_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i32> @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i32.nxv2f64(
@@ -623,10 +623,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vfncvt.xu.f.w.nxv4i32.nxv4f64(
 define <vscale x 4 x i32> @intrinsic_vfncvt_xu.f.w_nxv4i32_nxv4f64(<vscale x 4 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv4i32_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -647,10 +647,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i32.nxv4f64(
 define <vscale x 4 x i32> @intrinsic_vfncvt_mask_xu.f.w_nxv4i32_nxv4f64(<vscale x 4 x i32> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv4i32_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i32.nxv4f64(
@@ -670,10 +670,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vfncvt.xu.f.w.nxv8i32.nxv8f64(
 define <vscale x 8 x i32> @intrinsic_vfncvt_xu.f.w_nxv8i32_nxv8f64(<vscale x 8 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_xu.f.w_nxv8i32_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -694,10 +694,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i32.nxv8f64(
 define <vscale x 8 x i32> @intrinsic_vfncvt_mask_xu.f.w_nxv8i32_nxv8f64(<vscale x 8 x i32> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfncvt_mask_xu.f.w_nxv8i32_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfncvt.xu.f.w v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i32.nxv8f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmacc.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmacc.ll
index bdfa211dfdcb..01f4715274b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfnmacc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfnmacc.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmacc.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfnmacc_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmacc.nxv1f16.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmacc.mask.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfnmacc_mask_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmacc.mask.nxv1f16.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmacc.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfnmacc_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmacc.nxv2f16.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmacc.mask.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfnmacc_mask_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmacc.mask.nxv2f16.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmacc.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfnmacc_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmacc.nxv4f16.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmacc.mask.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfnmacc_mask_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmacc.mask.nxv4f16.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmacc.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfnmacc_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmacc.nxv8f16.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmacc.mask.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfnmacc_mask_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmacc.mask.nxv8f16.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmacc.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfnmacc_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmacc.nxv16f16.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmacc.mask.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfnmacc_mask_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmacc.mask.nxv16f16.nxv16f16(
@@ -263,10 +263,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmacc.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfnmacc_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmacc.nxv1f32.nxv1f32(
@@ -288,10 +288,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmacc.mask.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfnmacc_mask_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmacc.mask.nxv1f32.nxv1f32(
@@ -313,10 +313,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmacc.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfnmacc_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmacc.nxv2f32.nxv2f32(
@@ -338,10 +338,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmacc.mask.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfnmacc_mask_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmacc.mask.nxv2f32.nxv2f32(
@@ -363,10 +363,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmacc.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfnmacc_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmacc.nxv4f32.nxv4f32(
@@ -388,10 +388,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmacc.mask.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfnmacc_mask_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmacc.mask.nxv4f32.nxv4f32(
@@ -413,10 +413,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmacc.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfnmacc_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmacc.nxv8f32.nxv8f32(
@@ -438,10 +438,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmacc.mask.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfnmacc_mask_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmacc.mask.nxv8f32.nxv8f32(
@@ -463,10 +463,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmacc.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfnmacc_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmacc.nxv1f64.nxv1f64(
@@ -488,10 +488,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmacc.mask.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfnmacc_mask_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmacc.mask.nxv1f64.nxv1f64(
@@ -513,10 +513,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmacc.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfnmacc_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmacc.nxv2f64.nxv2f64(
@@ -538,10 +538,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmacc.mask.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfnmacc_mask_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmacc.mask.nxv2f64.nxv2f64(
@@ -563,10 +563,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmacc.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfnmacc_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmacc.nxv4f64.nxv4f64(
@@ -588,10 +588,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmacc.mask.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfnmacc_mask_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmacc.mask.nxv4f64.nxv4f64(
@@ -613,10 +613,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmacc.nxv1f16.f16(
 define <vscale x 1 x half>  @intrinsic_vfnmacc_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmacc.nxv1f16.f16(
@@ -638,10 +638,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmacc.mask.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfnmacc_mask_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmacc.mask.nxv1f16.f16(
@@ -663,10 +663,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmacc.nxv2f16.f16(
 define <vscale x 2 x half>  @intrinsic_vfnmacc_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmacc.nxv2f16.f16(
@@ -688,10 +688,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmacc.mask.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfnmacc_mask_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmacc.mask.nxv2f16.f16(
@@ -713,10 +713,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmacc.nxv4f16.f16(
 define <vscale x 4 x half>  @intrinsic_vfnmacc_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmacc.nxv4f16.f16(
@@ -738,10 +738,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmacc.mask.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfnmacc_mask_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmacc.mask.nxv4f16.f16(
@@ -763,10 +763,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmacc.nxv8f16.f16(
 define <vscale x 8 x half>  @intrinsic_vfnmacc_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmacc.nxv8f16.f16(
@@ -788,10 +788,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmacc.mask.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfnmacc_mask_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmacc.mask.nxv8f16.f16(
@@ -813,10 +813,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmacc.nxv16f16.f16(
 define <vscale x 16 x half>  @intrinsic_vfnmacc_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmacc.nxv16f16.f16(
@@ -838,10 +838,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmacc.mask.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfnmacc_mask_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmacc.mask.nxv16f16.f16(
@@ -863,10 +863,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmacc.nxv1f32.f32(
 define <vscale x 1 x float>  @intrinsic_vfnmacc_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmacc.nxv1f32.f32(
@@ -888,10 +888,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmacc.mask.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfnmacc_mask_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmacc.mask.nxv1f32.f32(
@@ -913,10 +913,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmacc.nxv2f32.f32(
 define <vscale x 2 x float>  @intrinsic_vfnmacc_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmacc.nxv2f32.f32(
@@ -938,10 +938,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmacc.mask.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfnmacc_mask_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmacc.mask.nxv2f32.f32(
@@ -963,10 +963,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmacc.nxv4f32.f32(
 define <vscale x 4 x float>  @intrinsic_vfnmacc_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmacc.nxv4f32.f32(
@@ -988,10 +988,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmacc.mask.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfnmacc_mask_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmacc.mask.nxv4f32.f32(
@@ -1013,10 +1013,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmacc.nxv8f32.f32(
 define <vscale x 8 x float>  @intrinsic_vfnmacc_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmacc.nxv8f32.f32(
@@ -1038,10 +1038,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmacc.mask.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfnmacc_mask_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmacc.mask.nxv8f32.f32(
@@ -1063,10 +1063,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmacc.nxv1f64.f64(
 define <vscale x 1 x double>  @intrinsic_vfnmacc_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmacc.nxv1f64.f64(
@@ -1088,10 +1088,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmacc.mask.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfnmacc_mask_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmacc.mask.nxv1f64.f64(
@@ -1113,10 +1113,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmacc.nxv2f64.f64(
 define <vscale x 2 x double>  @intrinsic_vfnmacc_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmacc.nxv2f64.f64(
@@ -1138,10 +1138,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmacc.mask.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfnmacc_mask_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmacc.mask.nxv2f64.f64(
@@ -1163,10 +1163,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmacc.nxv4f64.f64(
 define <vscale x 4 x double>  @intrinsic_vfnmacc_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmacc.nxv4f64.f64(
@@ -1188,10 +1188,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmacc.mask.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfnmacc_mask_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmacc_mask_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmacc.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmacc.mask.nxv4f64.f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmadd.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmadd.ll
index 4eb2e7caba24..ae4cfef35e61 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfnmadd.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfnmadd.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmadd.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfnmadd_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmadd.nxv1f16.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmadd.mask.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfnmadd_mask_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmadd.mask.nxv1f16.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmadd.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfnmadd_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmadd.nxv2f16.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmadd.mask.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfnmadd_mask_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmadd.mask.nxv2f16.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmadd.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfnmadd_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmadd.nxv4f16.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmadd.mask.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfnmadd_mask_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmadd.mask.nxv4f16.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmadd.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfnmadd_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmadd.nxv8f16.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmadd.mask.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfnmadd_mask_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmadd.mask.nxv8f16.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmadd.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfnmadd_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmadd.nxv16f16.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmadd.mask.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfnmadd_mask_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmadd.mask.nxv16f16.nxv16f16(
@@ -263,10 +263,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmadd.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfnmadd_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmadd.nxv1f32.nxv1f32(
@@ -288,10 +288,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmadd.mask.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfnmadd_mask_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmadd.mask.nxv1f32.nxv1f32(
@@ -313,10 +313,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmadd.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfnmadd_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmadd.nxv2f32.nxv2f32(
@@ -338,10 +338,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmadd.mask.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfnmadd_mask_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmadd.mask.nxv2f32.nxv2f32(
@@ -363,10 +363,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmadd.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfnmadd_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmadd.nxv4f32.nxv4f32(
@@ -388,10 +388,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmadd.mask.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfnmadd_mask_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmadd.mask.nxv4f32.nxv4f32(
@@ -413,10 +413,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmadd.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfnmadd_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmadd.nxv8f32.nxv8f32(
@@ -438,10 +438,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmadd.mask.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfnmadd_mask_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmadd.mask.nxv8f32.nxv8f32(
@@ -463,10 +463,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmadd.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfnmadd_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmadd.nxv1f64.nxv1f64(
@@ -488,10 +488,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmadd.mask.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfnmadd_mask_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmadd.mask.nxv1f64.nxv1f64(
@@ -513,10 +513,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmadd.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfnmadd_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmadd.nxv2f64.nxv2f64(
@@ -538,10 +538,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmadd.mask.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfnmadd_mask_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmadd.mask.nxv2f64.nxv2f64(
@@ -563,10 +563,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmadd.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfnmadd_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmadd.nxv4f64.nxv4f64(
@@ -588,10 +588,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmadd.mask.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfnmadd_mask_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmadd.mask.nxv4f64.nxv4f64(
@@ -613,10 +613,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmadd.nxv1f16.f16(
 define <vscale x 1 x half>  @intrinsic_vfnmadd_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmadd.nxv1f16.f16(
@@ -638,10 +638,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmadd.mask.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfnmadd_mask_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmadd.mask.nxv1f16.f16(
@@ -663,10 +663,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmadd.nxv2f16.f16(
 define <vscale x 2 x half>  @intrinsic_vfnmadd_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmadd.nxv2f16.f16(
@@ -688,10 +688,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmadd.mask.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfnmadd_mask_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmadd.mask.nxv2f16.f16(
@@ -713,10 +713,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmadd.nxv4f16.f16(
 define <vscale x 4 x half>  @intrinsic_vfnmadd_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmadd.nxv4f16.f16(
@@ -738,10 +738,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmadd.mask.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfnmadd_mask_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmadd.mask.nxv4f16.f16(
@@ -763,10 +763,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmadd.nxv8f16.f16(
 define <vscale x 8 x half>  @intrinsic_vfnmadd_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmadd.nxv8f16.f16(
@@ -788,10 +788,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmadd.mask.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfnmadd_mask_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmadd.mask.nxv8f16.f16(
@@ -813,10 +813,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmadd.nxv16f16.f16(
 define <vscale x 16 x half>  @intrinsic_vfnmadd_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmadd.nxv16f16.f16(
@@ -838,10 +838,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmadd.mask.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfnmadd_mask_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmadd.mask.nxv16f16.f16(
@@ -863,10 +863,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmadd.nxv1f32.f32(
 define <vscale x 1 x float>  @intrinsic_vfnmadd_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmadd.nxv1f32.f32(
@@ -888,10 +888,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmadd.mask.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfnmadd_mask_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmadd.mask.nxv1f32.f32(
@@ -913,10 +913,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmadd.nxv2f32.f32(
 define <vscale x 2 x float>  @intrinsic_vfnmadd_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmadd.nxv2f32.f32(
@@ -938,10 +938,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmadd.mask.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfnmadd_mask_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmadd.mask.nxv2f32.f32(
@@ -963,10 +963,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmadd.nxv4f32.f32(
 define <vscale x 4 x float>  @intrinsic_vfnmadd_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmadd.nxv4f32.f32(
@@ -988,10 +988,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmadd.mask.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfnmadd_mask_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmadd.mask.nxv4f32.f32(
@@ -1013,10 +1013,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmadd.nxv8f32.f32(
 define <vscale x 8 x float>  @intrinsic_vfnmadd_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmadd.nxv8f32.f32(
@@ -1038,10 +1038,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmadd.mask.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfnmadd_mask_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmadd.mask.nxv8f32.f32(
@@ -1063,10 +1063,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmadd.nxv1f64.f64(
 define <vscale x 1 x double>  @intrinsic_vfnmadd_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmadd.nxv1f64.f64(
@@ -1088,10 +1088,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmadd.mask.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfnmadd_mask_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmadd.mask.nxv1f64.f64(
@@ -1113,10 +1113,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmadd.nxv2f64.f64(
 define <vscale x 2 x double>  @intrinsic_vfnmadd_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmadd.nxv2f64.f64(
@@ -1138,10 +1138,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmadd.mask.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfnmadd_mask_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmadd.mask.nxv2f64.f64(
@@ -1163,10 +1163,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmadd.nxv4f64.f64(
 define <vscale x 4 x double>  @intrinsic_vfnmadd_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmadd.nxv4f64.f64(
@@ -1188,10 +1188,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmadd.mask.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfnmadd_mask_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmadd_mask_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmadd.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmadd.mask.nxv4f64.f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmsac.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmsac.ll
index dc30540bc0af..071f546b4f60 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfnmsac.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfnmsac.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmsac.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfnmsac_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmsac.nxv1f16.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmsac.mask.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfnmsac_mask_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmsac.mask.nxv1f16.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmsac.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfnmsac_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmsac.nxv2f16.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmsac.mask.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfnmsac_mask_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmsac.mask.nxv2f16.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmsac.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfnmsac_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmsac.nxv4f16.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmsac.mask.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfnmsac_mask_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmsac.mask.nxv4f16.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmsac.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfnmsac_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmsac.nxv8f16.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmsac.mask.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfnmsac_mask_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmsac.mask.nxv8f16.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmsac.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfnmsac_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmsac.nxv16f16.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmsac.mask.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfnmsac_mask_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmsac.mask.nxv16f16.nxv16f16(
@@ -263,10 +263,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmsac.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfnmsac_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmsac.nxv1f32.nxv1f32(
@@ -288,10 +288,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmsac.mask.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfnmsac_mask_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmsac.mask.nxv1f32.nxv1f32(
@@ -313,10 +313,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmsac.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfnmsac_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmsac.nxv2f32.nxv2f32(
@@ -338,10 +338,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmsac.mask.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfnmsac_mask_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmsac.mask.nxv2f32.nxv2f32(
@@ -363,10 +363,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmsac.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfnmsac_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmsac.nxv4f32.nxv4f32(
@@ -388,10 +388,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmsac.mask.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfnmsac_mask_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmsac.mask.nxv4f32.nxv4f32(
@@ -413,10 +413,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmsac.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfnmsac_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmsac.nxv8f32.nxv8f32(
@@ -438,10 +438,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmsac.mask.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfnmsac_mask_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmsac.mask.nxv8f32.nxv8f32(
@@ -463,10 +463,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmsac.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfnmsac_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmsac.nxv1f64.nxv1f64(
@@ -488,10 +488,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmsac.mask.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfnmsac_mask_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmsac.mask.nxv1f64.nxv1f64(
@@ -513,10 +513,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmsac.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfnmsac_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmsac.nxv2f64.nxv2f64(
@@ -538,10 +538,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmsac.mask.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfnmsac_mask_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmsac.mask.nxv2f64.nxv2f64(
@@ -563,10 +563,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmsac.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfnmsac_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmsac.nxv4f64.nxv4f64(
@@ -588,10 +588,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmsac.mask.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfnmsac_mask_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmsac.mask.nxv4f64.nxv4f64(
@@ -613,10 +613,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmsac.nxv1f16.f16(
 define <vscale x 1 x half>  @intrinsic_vfnmsac_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmsac.nxv1f16.f16(
@@ -638,10 +638,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmsac.mask.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfnmsac_mask_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmsac.mask.nxv1f16.f16(
@@ -663,10 +663,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmsac.nxv2f16.f16(
 define <vscale x 2 x half>  @intrinsic_vfnmsac_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmsac.nxv2f16.f16(
@@ -688,10 +688,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmsac.mask.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfnmsac_mask_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmsac.mask.nxv2f16.f16(
@@ -713,10 +713,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmsac.nxv4f16.f16(
 define <vscale x 4 x half>  @intrinsic_vfnmsac_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmsac.nxv4f16.f16(
@@ -738,10 +738,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmsac.mask.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfnmsac_mask_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmsac.mask.nxv4f16.f16(
@@ -763,10 +763,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmsac.nxv8f16.f16(
 define <vscale x 8 x half>  @intrinsic_vfnmsac_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmsac.nxv8f16.f16(
@@ -788,10 +788,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmsac.mask.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfnmsac_mask_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmsac.mask.nxv8f16.f16(
@@ -813,10 +813,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmsac.nxv16f16.f16(
 define <vscale x 16 x half>  @intrinsic_vfnmsac_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmsac.nxv16f16.f16(
@@ -838,10 +838,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmsac.mask.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfnmsac_mask_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmsac.mask.nxv16f16.f16(
@@ -863,10 +863,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmsac.nxv1f32.f32(
 define <vscale x 1 x float>  @intrinsic_vfnmsac_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmsac.nxv1f32.f32(
@@ -888,10 +888,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmsac.mask.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfnmsac_mask_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmsac.mask.nxv1f32.f32(
@@ -913,10 +913,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmsac.nxv2f32.f32(
 define <vscale x 2 x float>  @intrinsic_vfnmsac_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmsac.nxv2f32.f32(
@@ -938,10 +938,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmsac.mask.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfnmsac_mask_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmsac.mask.nxv2f32.f32(
@@ -963,10 +963,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmsac.nxv4f32.f32(
 define <vscale x 4 x float>  @intrinsic_vfnmsac_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmsac.nxv4f32.f32(
@@ -988,10 +988,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmsac.mask.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfnmsac_mask_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmsac.mask.nxv4f32.f32(
@@ -1013,10 +1013,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmsac.nxv8f32.f32(
 define <vscale x 8 x float>  @intrinsic_vfnmsac_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmsac.nxv8f32.f32(
@@ -1038,10 +1038,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmsac.mask.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfnmsac_mask_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmsac.mask.nxv8f32.f32(
@@ -1063,10 +1063,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmsac.nxv1f64.f64(
 define <vscale x 1 x double>  @intrinsic_vfnmsac_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmsac.nxv1f64.f64(
@@ -1088,10 +1088,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmsac.mask.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfnmsac_mask_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmsac.mask.nxv1f64.f64(
@@ -1113,10 +1113,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmsac.nxv2f64.f64(
 define <vscale x 2 x double>  @intrinsic_vfnmsac_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmsac.nxv2f64.f64(
@@ -1138,10 +1138,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmsac.mask.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfnmsac_mask_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmsac.mask.nxv2f64.f64(
@@ -1163,10 +1163,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmsac.nxv4f64.f64(
 define <vscale x 4 x double>  @intrinsic_vfnmsac_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmsac.nxv4f64.f64(
@@ -1188,10 +1188,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmsac.mask.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfnmsac_mask_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsac_mask_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsac.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmsac.mask.nxv4f64.f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmsub.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmsub.ll
index cadddb016c4f..4922cf40e503 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfnmsub.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfnmsub.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmsub.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfnmsub_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmsub.nxv1f16.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmsub.mask.nxv1f16.nxv1f16(
 define <vscale x 1 x half>  @intrinsic_vfnmsub_mask_vv_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv1f16_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmsub.mask.nxv1f16.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmsub.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfnmsub_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmsub.nxv2f16.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmsub.mask.nxv2f16.nxv2f16(
 define <vscale x 2 x half>  @intrinsic_vfnmsub_mask_vv_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv2f16_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmsub.mask.nxv2f16.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmsub.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfnmsub_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmsub.nxv4f16.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmsub.mask.nxv4f16.nxv4f16(
 define <vscale x 4 x half>  @intrinsic_vfnmsub_mask_vv_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmsub.mask.nxv4f16.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmsub.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfnmsub_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmsub.nxv8f16.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmsub.mask.nxv8f16.nxv8f16(
 define <vscale x 8 x half>  @intrinsic_vfnmsub_mask_vv_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv8f16_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmsub.mask.nxv8f16.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmsub.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfnmsub_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmsub.nxv16f16.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmsub.mask.nxv16f16.nxv16f16(
 define <vscale x 16 x half>  @intrinsic_vfnmsub_mask_vv_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv16f16_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmsub.mask.nxv16f16.nxv16f16(
@@ -263,10 +263,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmsub.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfnmsub_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmsub.nxv1f32.nxv1f32(
@@ -288,10 +288,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmsub.mask.nxv1f32.nxv1f32(
 define <vscale x 1 x float>  @intrinsic_vfnmsub_mask_vv_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv1f32_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmsub.mask.nxv1f32.nxv1f32(
@@ -313,10 +313,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmsub.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfnmsub_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmsub.nxv2f32.nxv2f32(
@@ -338,10 +338,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmsub.mask.nxv2f32.nxv2f32(
 define <vscale x 2 x float>  @intrinsic_vfnmsub_mask_vv_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmsub.mask.nxv2f32.nxv2f32(
@@ -363,10 +363,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmsub.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfnmsub_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmsub.nxv4f32.nxv4f32(
@@ -388,10 +388,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmsub.mask.nxv4f32.nxv4f32(
 define <vscale x 4 x float>  @intrinsic_vfnmsub_mask_vv_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv4f32_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmsub.mask.nxv4f32.nxv4f32(
@@ -413,10 +413,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmsub.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfnmsub_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmsub.nxv8f32.nxv8f32(
@@ -438,10 +438,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmsub.mask.nxv8f32.nxv8f32(
 define <vscale x 8 x float>  @intrinsic_vfnmsub_mask_vv_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv8f32_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmsub.mask.nxv8f32.nxv8f32(
@@ -463,10 +463,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmsub.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfnmsub_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmsub.nxv1f64.nxv1f64(
@@ -488,10 +488,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmsub.mask.nxv1f64.nxv1f64(
 define <vscale x 1 x double>  @intrinsic_vfnmsub_mask_vv_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmsub.mask.nxv1f64.nxv1f64(
@@ -513,10 +513,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmsub.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfnmsub_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v10, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmsub.nxv2f64.nxv2f64(
@@ -538,10 +538,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmsub.mask.nxv2f64.nxv2f64(
 define <vscale x 2 x double>  @intrinsic_vfnmsub_mask_vv_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv2f64_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmsub.mask.nxv2f64.nxv2f64(
@@ -563,10 +563,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmsub.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfnmsub_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v12, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmsub.nxv4f64.nxv4f64(
@@ -588,10 +588,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmsub.mask.nxv4f64.nxv4f64(
 define <vscale x 4 x double>  @intrinsic_vfnmsub_mask_vv_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vv_nxv4f64_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmsub.mask.nxv4f64.nxv4f64(
@@ -613,10 +613,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmsub.nxv1f16.f16(
 define <vscale x 1 x half>  @intrinsic_vfnmsub_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmsub.nxv1f16.f16(
@@ -638,10 +638,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfnmsub.mask.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfnmsub_mask_vf_nxv1f16_f16_nxv1f16(<vscale x 1 x half> %0, half %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv1f16_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfnmsub.mask.nxv1f16.f16(
@@ -663,10 +663,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmsub.nxv2f16.f16(
 define <vscale x 2 x half>  @intrinsic_vfnmsub_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmsub.nxv2f16.f16(
@@ -688,10 +688,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfnmsub.mask.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfnmsub_mask_vf_nxv2f16_f16_nxv2f16(<vscale x 2 x half> %0, half %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv2f16_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfnmsub.mask.nxv2f16.f16(
@@ -713,10 +713,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmsub.nxv4f16.f16(
 define <vscale x 4 x half>  @intrinsic_vfnmsub_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmsub.nxv4f16.f16(
@@ -738,10 +738,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfnmsub.mask.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfnmsub_mask_vf_nxv4f16_f16_nxv4f16(<vscale x 4 x half> %0, half %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv4f16_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfnmsub.mask.nxv4f16.f16(
@@ -763,10 +763,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmsub.nxv8f16.f16(
 define <vscale x 8 x half>  @intrinsic_vfnmsub_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmsub.nxv8f16.f16(
@@ -788,10 +788,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfnmsub.mask.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfnmsub_mask_vf_nxv8f16_f16_nxv8f16(<vscale x 8 x half> %0, half %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv8f16_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfnmsub.mask.nxv8f16.f16(
@@ -813,10 +813,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmsub.nxv16f16.f16(
 define <vscale x 16 x half>  @intrinsic_vfnmsub_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmsub.nxv16f16.f16(
@@ -838,10 +838,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfnmsub.mask.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfnmsub_mask_vf_nxv16f16_f16_nxv16f16(<vscale x 16 x half> %0, half %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv16f16_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfnmsub.mask.nxv16f16.f16(
@@ -863,10 +863,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmsub.nxv1f32.f32(
 define <vscale x 1 x float>  @intrinsic_vfnmsub_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmsub.nxv1f32.f32(
@@ -888,10 +888,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfnmsub.mask.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfnmsub_mask_vf_nxv1f32_f32_nxv1f32(<vscale x 1 x float> %0, float %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv1f32_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfnmsub.mask.nxv1f32.f32(
@@ -913,10 +913,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmsub.nxv2f32.f32(
 define <vscale x 2 x float>  @intrinsic_vfnmsub_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmsub.nxv2f32.f32(
@@ -938,10 +938,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfnmsub.mask.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfnmsub_mask_vf_nxv2f32_f32_nxv2f32(<vscale x 2 x float> %0, float %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv2f32_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfnmsub.mask.nxv2f32.f32(
@@ -963,10 +963,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmsub.nxv4f32.f32(
 define <vscale x 4 x float>  @intrinsic_vfnmsub_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmsub.nxv4f32.f32(
@@ -988,10 +988,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfnmsub.mask.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfnmsub_mask_vf_nxv4f32_f32_nxv4f32(<vscale x 4 x float> %0, float %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv4f32_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfnmsub.mask.nxv4f32.f32(
@@ -1013,10 +1013,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmsub.nxv8f32.f32(
 define <vscale x 8 x float>  @intrinsic_vfnmsub_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmsub.nxv8f32.f32(
@@ -1038,10 +1038,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfnmsub.mask.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfnmsub_mask_vf_nxv8f32_f32_nxv8f32(<vscale x 8 x float> %0, float %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv8f32_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfnmsub.mask.nxv8f32.f32(
@@ -1063,10 +1063,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmsub.nxv1f64.f64(
 define <vscale x 1 x double>  @intrinsic_vfnmsub_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmsub.nxv1f64.f64(
@@ -1088,10 +1088,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfnmsub.mask.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfnmsub_mask_vf_nxv1f64_f64_nxv1f64(<vscale x 1 x double> %0, double %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv1f64_f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfnmsub.mask.nxv1f64.f64(
@@ -1113,10 +1113,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmsub.nxv2f64.f64(
 define <vscale x 2 x double>  @intrinsic_vfnmsub_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmsub.nxv2f64.f64(
@@ -1138,10 +1138,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfnmsub.mask.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfnmsub_mask_vf_nxv2f64_f64_nxv2f64(<vscale x 2 x double> %0, double %1, <vscale x 2 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv2f64_f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfnmsub.mask.nxv2f64.f64(
@@ -1163,10 +1163,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmsub.nxv4f64.f64(
 define <vscale x 4 x double>  @intrinsic_vfnmsub_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmsub.nxv4f64.f64(
@@ -1188,10 +1188,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfnmsub.mask.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfnmsub_mask_vf_nxv4f64_f64_nxv4f64(<vscale x 4 x double> %0, double %1, <vscale x 4 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfnmsub_mask_vf_nxv4f64_f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfnmsub.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfnmsub.mask.nxv4f64.f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-constrained-sdnode.ll
index 5de309757c6d..8b49b720e851 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfpext-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-constrained-sdnode.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
 
 declare <vscale x 1 x float> @llvm.experimental.constrained.fpext.nxv1f32.nxv1f16(<vscale x 1 x half>, metadata)
@@ -151,3 +151,103 @@ define <vscale x 8 x double> @vfpext_nxv8f32_nxv8f64(<vscale x 8 x float> %va) s
   %evec = call <vscale x 8 x double> @llvm.experimental.constrained.fpext.nxv8f64.nxv8f32(<vscale x 8 x float> %va, metadata !"fpexcept.strict")
   ret <vscale x 8 x double> %evec
 }
+
+declare <vscale x 1 x float> @llvm.experimental.constrained.fpext.nxv1f32.nxv1bf16(<vscale x 1 x bfloat>, metadata)
+define <vscale x 1 x float> @vfpext_nxv1bf16_nxv1f32(<vscale x 1 x bfloat> %va) strictfp {
+; CHECK-LABEL: vfpext_nxv1bf16_nxv1f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 1 x float> @llvm.experimental.constrained.fpext.nxv1f32.nxv1bf16(<vscale x 1 x bfloat> %va, metadata !"fpexcept.strict")
+  ret <vscale x 1 x float> %evec
+}
+
+declare <vscale x 1 x double> @llvm.experimental.constrained.fpext.nxv1f64.nxv1bf16(<vscale x 1 x bfloat>, metadata)
+define <vscale x 1 x double> @vfpext_nxv1bf16_nxv1f64(<vscale x 1 x bfloat> %va) strictfp {
+; CHECK-LABEL: vfpext_nxv1bf16_nxv1f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 1 x double> @llvm.experimental.constrained.fpext.nxv1f64.nxv1bf16(<vscale x 1 x bfloat> %va, metadata !"fpexcept.strict")
+  ret <vscale x 1 x double> %evec
+}
+
+declare <vscale x 2 x float> @llvm.experimental.constrained.fpext.nxv2f32.nxv2bf16(<vscale x 2 x bfloat>, metadata)
+define <vscale x 2 x float> @vfpext_nxv2bf16_nxv2f32(<vscale x 2 x bfloat> %va) strictfp {
+; CHECK-LABEL: vfpext_nxv2bf16_nxv2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 2 x float> @llvm.experimental.constrained.fpext.nxv2f32.nxv2bf16(<vscale x 2 x bfloat> %va, metadata !"fpexcept.strict")
+  ret <vscale x 2 x float> %evec
+}
+
+declare <vscale x 2 x double> @llvm.experimental.constrained.fpext.nxv2f64.nxv2bf16(<vscale x 2 x bfloat>, metadata)
+define <vscale x 2 x double> @vfpext_nxv2bf16_nxv2f64(<vscale x 2 x bfloat> %va) strictfp {
+; CHECK-LABEL: vfpext_nxv2bf16_nxv2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 2 x double> @llvm.experimental.constrained.fpext.nxv2f64.nxv2bf16(<vscale x 2 x bfloat> %va, metadata !"fpexcept.strict")
+  ret <vscale x 2 x double> %evec
+}
+
+declare <vscale x 4 x float> @llvm.experimental.constrained.fpext.nxv4f32.nxv4bf16(<vscale x 4 x bfloat>, metadata)
+define <vscale x 4 x float> @vfpext_nxv4bf16_nxv4f32(<vscale x 4 x bfloat> %va) strictfp {
+; CHECK-LABEL: vfpext_nxv4bf16_nxv4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 4 x float> @llvm.experimental.constrained.fpext.nxv4f32.nxv4bf16(<vscale x 4 x bfloat> %va, metadata !"fpexcept.strict")
+  ret <vscale x 4 x float> %evec
+}
+
+declare <vscale x 4 x double> @llvm.experimental.constrained.fpext.nxv4f64.nxv4bf16(<vscale x 4 x bfloat>, metadata)
+define <vscale x 4 x double> @vfpext_nxv4bf16_nxv4f64(<vscale x 4 x bfloat> %va) strictfp {
+; CHECK-LABEL: vfpext_nxv4bf16_nxv4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v12
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 4 x double> @llvm.experimental.constrained.fpext.nxv4f64.nxv4bf16(<vscale x 4 x bfloat> %va, metadata !"fpexcept.strict")
+  ret <vscale x 4 x double> %evec
+}
+
+declare <vscale x 8 x float> @llvm.experimental.constrained.fpext.nxv8f32.nxv8bf16(<vscale x 8 x bfloat>, metadata)
+define <vscale x 8 x float> @vfpext_nxv8bf16_nxv8f32(<vscale x 8 x bfloat> %va) strictfp {
+; CHECK-LABEL: vfpext_nxv8bf16_nxv8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 8 x float> @llvm.experimental.constrained.fpext.nxv8f32.nxv8bf16(<vscale x 8 x bfloat> %va, metadata !"fpexcept.strict")
+  ret <vscale x 8 x float> %evec
+}
+
+declare <vscale x 8 x double> @llvm.experimental.constrained.fpext.nxv8f64.nxv8bf16(<vscale x 8 x bfloat>, metadata)
+define <vscale x 8 x double> @vfpext_nxv8bf16_nxv8f64(<vscale x 8 x bfloat> %va) strictfp {
+; CHECK-LABEL: vfpext_nxv8bf16_nxv8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v16
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 8 x double> @llvm.experimental.constrained.fpext.nxv8f64.nxv8bf16(<vscale x 8 x bfloat> %va, metadata !"fpexcept.strict")
+  ret <vscale x 8 x double> %evec
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-sdnode.ll
index d805a103aafd..b002b8e76566 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfpext-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-sdnode.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+experimental-zvfbfmin -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+experimental-zvfbfmin -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 1 x float> @vfpext_nxv1f16_nxv1f32(<vscale x 1 x half> %va) {
@@ -167,3 +167,115 @@ define <vscale x 8 x double> @vfpext_nxv8f32_nxv8f64(<vscale x 8 x float> %va) {
   %evec = fpext <vscale x 8 x float> %va to <vscale x 8 x double>
   ret <vscale x 8 x double> %evec
 }
+
+define <vscale x 1 x float> @vfpext_nxv1bf16_nxv1f32(<vscale x 1 x bfloat> %va) {
+;
+; CHECK-LABEL: vfpext_nxv1bf16_nxv1f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = fpext <vscale x 1 x bfloat> %va to <vscale x 1 x float>
+  ret <vscale x 1 x float> %evec
+}
+
+define <vscale x 1 x double> @vfpext_nxv1bf16_nxv1f64(<vscale x 1 x bfloat> %va) {
+;
+; CHECK-LABEL: vfpext_nxv1bf16_nxv1f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = fpext <vscale x 1 x bfloat> %va to <vscale x 1 x double>
+  ret <vscale x 1 x double> %evec
+}
+
+define <vscale x 2 x float> @vfpext_nxv2bf16_nxv2f32(<vscale x 2 x bfloat> %va) {
+;
+; CHECK-LABEL: vfpext_nxv2bf16_nxv2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = fpext <vscale x 2 x bfloat> %va to <vscale x 2 x float>
+  ret <vscale x 2 x float> %evec
+}
+
+define <vscale x 2 x double> @vfpext_nxv2bf16_nxv2f64(<vscale x 2 x bfloat> %va) {
+;
+; CHECK-LABEL: vfpext_nxv2bf16_nxv2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10
+; CHECK-NEXT:    ret
+  %evec = fpext <vscale x 2 x bfloat> %va to <vscale x 2 x double>
+  ret <vscale x 2 x double> %evec
+}
+
+define <vscale x 4 x float> @vfpext_nxv4bf16_nxv4f32(<vscale x 4 x bfloat> %va) {
+;
+; CHECK-LABEL: vfpext_nxv4bf16_nxv4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    ret
+  %evec = fpext <vscale x 4 x bfloat> %va to <vscale x 4 x float>
+  ret <vscale x 4 x float> %evec
+}
+
+define <vscale x 4 x double> @vfpext_nxv4bf16_nxv4f64(<vscale x 4 x bfloat> %va) {
+;
+; CHECK-LABEL: vfpext_nxv4bf16_nxv4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v12
+; CHECK-NEXT:    ret
+  %evec = fpext <vscale x 4 x bfloat> %va to <vscale x 4 x double>
+  ret <vscale x 4 x double> %evec
+}
+
+define <vscale x 8 x float> @vfpext_nxv8bf16_nxv8f32(<vscale x 8 x bfloat> %va) {
+;
+; CHECK-LABEL: vfpext_nxv8bf16_nxv8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    ret
+  %evec = fpext <vscale x 8 x bfloat> %va to <vscale x 8 x float>
+  ret <vscale x 8 x float> %evec
+}
+
+define <vscale x 8 x double> @vfpext_nxv8bf16_nxv8f64(<vscale x 8 x bfloat> %va) {
+;
+; CHECK-LABEL: vfpext_nxv8bf16_nxv8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v16
+; CHECK-NEXT:    ret
+  %evec = fpext <vscale x 8 x bfloat> %va to <vscale x 8 x double>
+  ret <vscale x 8 x double> %evec
+}
+
+define <vscale x 16 x float> @vfpext_nxv16bf16_nxv16f32(<vscale x 16 x bfloat> %va) {
+;
+; CHECK-LABEL: vfpext_nxv16bf16_nxv16f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    ret
+  %evec = fpext <vscale x 16 x bfloat> %va to <vscale x 16 x float>
+  ret <vscale x 16 x float> %evec
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
index 5cfa98916a2d..aaaf4ad46071 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
 
 declare <vscale x 2 x float> @llvm.vp.fpext.nxv2f32.nxv2f16(<vscale x 2 x half>, <vscale x 2 x i1>, i32)
 
@@ -120,3 +120,54 @@ define <vscale x 32 x float> @vfpext_nxv32f16_nxv32f32(<vscale x 32 x half> %a,
   %v = call <vscale x 32 x float> @llvm.vp.fpext.nxv32f32.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x i1> %m, i32 %vl)
   ret <vscale x 32 x float> %v
 }
+
+declare <vscale x 2 x float> @llvm.vp.fpext.nxv2f32.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x float> @vfpext_nxv2bf16_nxv2f32(<vscale x 2 x bfloat> %a, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vfpext_nxv2bf16_nxv2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x float> @llvm.vp.fpext.nxv2f32.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x i1> %m, i32 %vl)
+  ret <vscale x 2 x float> %v
+}
+
+define <vscale x 2 x float> @vfpext_nxv2bf16_nxv2f32_unmasked(<vscale x 2 x bfloat> %a, i32 zeroext %vl) {
+; CHECK-LABEL: vfpext_nxv2bf16_nxv2f32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x float> @llvm.vp.fpext.nxv2f32.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
+  ret <vscale x 2 x float> %v
+}
+
+declare <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x double> @vfpext_nxv2bf16_nxv2f64(<vscale x 2 x bfloat> %a, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vfpext_nxv2bf16_nxv2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x i1> %m, i32 %vl)
+  ret <vscale x 2 x double> %v
+}
+
+define <vscale x 2 x double> @vfpext_nxv2bf16_nxv2f64_unmasked(<vscale x 2 x bfloat> %a, i32 zeroext %vl) {
+; CHECK-LABEL: vfpext_nxv2bf16_nxv2f64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v8, v10
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
+  ret <vscale x 2 x double> %v
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-constrained-sdnode.ll
index 4404a275858f..4341f45dd6c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-constrained-sdnode.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+experimental-zvfbfmin -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+experimental-zvfbfmin -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
 
 declare <vscale x 1 x float> @llvm.experimental.constrained.fptrunc.nxv1f32.nxv1f64(<vscale x 1 x double>, metadata, metadata)
@@ -155,3 +155,103 @@ define <vscale x 8 x half> @vfptrunc_nxv8f32_nxv8f16(<vscale x 8 x float> %va) s
   %evec = call <vscale x 8 x half> @llvm.experimental.constrained.fptrunc.nxv8f16.nxv8f32(<vscale x 8 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 8 x half> %evec
 }
+
+declare <vscale x 1 x bfloat> @llvm.experimental.constrained.fptrunc.nxv1bf16.nxv1f64(<vscale x 1 x double>, metadata, metadata)
+define <vscale x 1 x bfloat> @vfptrunc_nxv1f64_nxv1bf16(<vscale x 1 x double> %va) strictfp {
+; CHECK-LABEL: vfptrunc_nxv1f64_nxv1bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfncvt.rod.f.f.w v9, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 1 x bfloat> @llvm.experimental.constrained.fptrunc.nxv1bf16.nxv1f64(<vscale x 1 x double> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <vscale x 1 x bfloat> %evec
+}
+
+declare <vscale x 1 x bfloat> @llvm.experimental.constrained.fptrunc.nxv1bf16.nxv1f32(<vscale x 1 x float>, metadata, metadata)
+define <vscale x 1 x bfloat> @vfptrunc_nxv1f32_nxv1bf16(<vscale x 1 x float> %va) strictfp {
+; CHECK-LABEL: vfptrunc_nxv1f32_nxv1bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 1 x bfloat> @llvm.experimental.constrained.fptrunc.nxv1bf16.nxv1f32(<vscale x 1 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <vscale x 1 x bfloat> %evec
+}
+
+declare <vscale x 2 x bfloat> @llvm.experimental.constrained.fptrunc.nxv2bf16.nxv2f64(<vscale x 2 x double>, metadata, metadata)
+define <vscale x 2 x bfloat> @vfptrunc_nxv2f64_nxv2bf16(<vscale x 2 x double> %va) strictfp {
+; CHECK-LABEL: vfptrunc_nxv2f64_nxv2bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfncvt.rod.f.f.w v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 2 x bfloat> @llvm.experimental.constrained.fptrunc.nxv2bf16.nxv2f64(<vscale x 2 x double> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <vscale x 2 x bfloat> %evec
+}
+
+declare <vscale x 2 x bfloat> @llvm.experimental.constrained.fptrunc.nxv2bf16.nxv2f32(<vscale x 2 x float>, metadata, metadata)
+define <vscale x 2 x bfloat> @vfptrunc_nxv2f32_nxv2bf16(<vscale x 2 x float> %va) strictfp {
+; CHECK-LABEL: vfptrunc_nxv2f32_nxv2bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 2 x bfloat> @llvm.experimental.constrained.fptrunc.nxv2bf16.nxv2f32(<vscale x 2 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <vscale x 2 x bfloat> %evec
+}
+
+declare <vscale x 4 x bfloat> @llvm.experimental.constrained.fptrunc.nxv4bf16.nxv4f64(<vscale x 4 x double>, metadata, metadata)
+define <vscale x 4 x bfloat> @vfptrunc_nxv4f64_nxv4bf16(<vscale x 4 x double> %va) strictfp {
+; CHECK-LABEL: vfptrunc_nxv4f64_nxv4bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vfncvt.rod.f.f.w v12, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 4 x bfloat> @llvm.experimental.constrained.fptrunc.nxv4bf16.nxv4f64(<vscale x 4 x double> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <vscale x 4 x bfloat> %evec
+}
+
+declare <vscale x 4 x bfloat> @llvm.experimental.constrained.fptrunc.nxv4bf16.nxv4f32(<vscale x 4 x float>, metadata, metadata)
+define <vscale x 4 x bfloat> @vfptrunc_nxv4f32_nxv4bf16(<vscale x 4 x float> %va) strictfp {
+; CHECK-LABEL: vfptrunc_nxv4f32_nxv4bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 4 x bfloat> @llvm.experimental.constrained.fptrunc.nxv4bf16.nxv4f32(<vscale x 4 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <vscale x 4 x bfloat> %evec
+}
+
+declare <vscale x 8 x bfloat> @llvm.experimental.constrained.fptrunc.nxv8bf16.nxv8f64(<vscale x 8 x double>, metadata, metadata)
+define <vscale x 8 x bfloat> @vfptrunc_nxv8f64_nxv8bf16(<vscale x 8 x double> %va) strictfp {
+; CHECK-LABEL: vfptrunc_nxv8f64_nxv8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vfncvt.rod.f.f.w v16, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 8 x bfloat> @llvm.experimental.constrained.fptrunc.nxv8bf16.nxv8f64(<vscale x 8 x double> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <vscale x 8 x bfloat> %evec
+}
+
+declare <vscale x 8 x bfloat> @llvm.experimental.constrained.fptrunc.nxv8bf16.nxv8f32(<vscale x 8 x float>, metadata, metadata)
+define <vscale x 8 x bfloat> @vfptrunc_nxv8f32_nxv8bf16(<vscale x 8 x float> %va) strictfp {
+; CHECK-LABEL: vfptrunc_nxv8f32_nxv8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+  %evec = call <vscale x 8 x bfloat> @llvm.experimental.constrained.fptrunc.nxv8bf16.nxv8f32(<vscale x 8 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <vscale x 8 x bfloat> %evec
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-sdnode.ll
index d715b46e95fe..9148a79cb740 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-sdnode.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+experimental-zvfbfmin -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+experimental-zvfbfmin -target-abi=ilp32d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+experimental-zvfbfmin -target-abi=lp64d \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 1 x half> @vfptrunc_nxv1f32_nxv1f16(<vscale x 1 x float> %va) {
@@ -167,3 +167,76 @@ define <vscale x 8 x float> @vfptrunc_nxv8f64_nxv8f32(<vscale x 8 x double> %va)
   %evec = fptrunc <vscale x 8 x double> %va to <vscale x 8 x float>
   ret <vscale x 8 x float> %evec
 }
+
+define <vscale x 1 x bfloat> @vfptrunc_nxv1f32_nxv1bf16(<vscale x 1 x float> %va) {
+;
+; CHECK-LABEL: vfptrunc_nxv1f32_nxv1bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = fptrunc <vscale x 1 x float> %va to <vscale x 1 x bfloat>
+  ret <vscale x 1 x bfloat> %evec
+}
+
+define <vscale x 2 x bfloat> @vfptrunc_nxv2f32_nxv2bf16(<vscale x 2 x float> %va) {
+;
+; CHECK-LABEL: vfptrunc_nxv2f32_nxv2bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %evec = fptrunc <vscale x 2 x float> %va to <vscale x 2 x bfloat>
+  ret <vscale x 2 x bfloat> %evec
+}
+
+define <vscale x 4 x bfloat> @vfptrunc_nxv4f32_nxv4bf16(<vscale x 4 x float> %va) {
+;
+; CHECK-LABEL: vfptrunc_nxv4f32_nxv4bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+  %evec = fptrunc <vscale x 4 x float> %va to <vscale x 4 x bfloat>
+  ret <vscale x 4 x bfloat> %evec
+}
+
+define <vscale x 8 x bfloat> @vfptrunc_nxv8f32_nxv8bf16(<vscale x 8 x float> %va) {
+;
+; CHECK-LABEL: vfptrunc_nxv8f32_nxv8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+  %evec = fptrunc <vscale x 8 x float> %va to <vscale x 8 x bfloat>
+  ret <vscale x 8 x bfloat> %evec
+}
+
+define <vscale x 16 x bfloat> @vfptrunc_nxv16f32_nxv16bf16(<vscale x 16 x float> %va) {
+;
+; CHECK-LABEL: vfptrunc_nxv16f32_nxv16bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v16, v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+  %evec = fptrunc <vscale x 16 x float> %va to <vscale x 16 x bfloat>
+  ret <vscale x 16 x bfloat> %evec
+}
+
+define <vscale x 1 x bfloat> @vfptrunc_nxv1f64_nxv1bf16(<vscale x 1 x double> %va) {
+;
+; CHECK-LABEL: vfptrunc_nxv1f64_nxv1bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfncvt.rod.f.f.w v9, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT:    ret
+  %evec = fptrunc <vscale x 1 x double> %va to <vscale x 1 x bfloat>
+  ret <vscale x 1 x bfloat> %evec
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
index dd122f1f2511..0c3abe37af27 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m,+experimental-zvfbfmin -verify-machineinstrs < %s | FileCheck %s
 
 declare <vscale x 2 x half> @llvm.vp.fptrunc.nxv2f16.nxv2f32(<vscale x 2 x float>, <vscale x 2 x i1>, i32)
 
@@ -218,3 +218,53 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
   %v = call <vscale x 32 x float> @llvm.vp.fptrunc.nxv32f64.nxv32f32(<vscale x 32 x double> %a, <vscale x 32 x i1> %m, i32 %vl)
   ret <vscale x 32 x float> %v
 }
+
+declare <vscale x 2 x bfloat> @llvm.vp.fptrunc.nxv2bf16.nxv2f32(<vscale x 2 x float>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vfptrunc_nxv2bf16_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vfptrunc_nxv2bf16_nxv2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v9, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x bfloat> @llvm.vp.fptrunc.nxv2bf16.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x i1> %m, i32 %vl)
+  ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfptrunc_nxv2bf16_nxv2f32_unmasked(<vscale x 2 x float> %a, i32 zeroext %vl) {
+; CHECK-LABEL: vfptrunc_nxv2bf16_nxv2f32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v9, v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x bfloat> @llvm.vp.fptrunc.nxv2bf16.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
+  ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.fptrunc.nxv2bf16.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vfptrunc_nxv2bf16_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vfptrunc_nxv2bf16_nxv2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vfncvt.rod.f.f.w v10, v8, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10, v0.t
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x bfloat> @llvm.vp.fptrunc.nxv2bf16.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x i1> %m, i32 %vl)
+  ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfptrunc_nxv2bf16_nxv2f64_unmasked(<vscale x 2 x double> %a, i32 zeroext %vl) {
+; CHECK-LABEL: vfptrunc_nxv2bf16_nxv2f64_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vfncvt.rod.f.f.w v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x bfloat> @llvm.vp.fptrunc.nxv2bf16.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
+  ret <vscale x 2 x bfloat> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfrdiv.ll b/llvm/test/CodeGen/RISCV/rvv/vfrdiv.ll
index f17c226ada0d..f73e7dce9212 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfrdiv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfrdiv.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfrdiv.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfrdiv_vf_nxv1f16_nxv1f16_f16(<vscale x 1 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv1f16_nxv1f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfrdiv.nxv1f16.f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfrdiv.mask.nxv1f16.f16(
 define <vscale x 1 x half> @intrinsic_vfrdiv_mask_vf_nxv1f16_nxv1f16_f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, half %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv1f16_nxv1f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfrdiv.mask.nxv1f16.f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfrdiv.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfrdiv_vf_nxv2f16_nxv2f16_f16(<vscale x 2 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv2f16_nxv2f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfrdiv.nxv2f16.f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfrdiv.mask.nxv2f16.f16(
 define <vscale x 2 x half> @intrinsic_vfrdiv_mask_vf_nxv2f16_nxv2f16_f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, half %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv2f16_nxv2f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfrdiv.mask.nxv2f16.f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfrdiv.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfrdiv_vf_nxv4f16_nxv4f16_f16(<vscale x 4 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv4f16_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfrdiv.nxv4f16.f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfrdiv.mask.nxv4f16.f16(
 define <vscale x 4 x half> @intrinsic_vfrdiv_mask_vf_nxv4f16_nxv4f16_f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, half %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv4f16_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfrdiv.mask.nxv4f16.f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfrdiv.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfrdiv_vf_nxv8f16_nxv8f16_f16(<vscale x 8 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv8f16_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfrdiv.nxv8f16.f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfrdiv.mask.nxv8f16.f16(
 define <vscale x 8 x half> @intrinsic_vfrdiv_mask_vf_nxv8f16_nxv8f16_f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, half %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv8f16_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfrdiv.mask.nxv8f16.f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfrdiv.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfrdiv_vf_nxv16f16_nxv16f16_f16(<vscale x 16 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv16f16_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfrdiv.nxv16f16.f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfrdiv.mask.nxv16f16.f16(
 define <vscale x 16 x half> @intrinsic_vfrdiv_mask_vf_nxv16f16_nxv16f16_f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, half %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv16f16_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfrdiv.mask.nxv16f16.f16(
@@ -263,10 +263,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfrdiv.nxv32f16.f16(
 define <vscale x 32 x half> @intrinsic_vfrdiv_vf_nxv32f16_nxv32f16_f16(<vscale x 32 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv32f16_nxv32f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfrdiv.nxv32f16.f16(
@@ -288,10 +288,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfrdiv.mask.nxv32f16.f16(
 define <vscale x 32 x half> @intrinsic_vfrdiv_mask_vf_nxv32f16_nxv32f16_f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, half %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv32f16_nxv32f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfrdiv.mask.nxv32f16.f16(
@@ -313,10 +313,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfrdiv.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfrdiv_vf_nxv1f32_nxv1f32_f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv1f32_nxv1f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfrdiv.nxv1f32.f32(
@@ -338,10 +338,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfrdiv.mask.nxv1f32.f32(
 define <vscale x 1 x float> @intrinsic_vfrdiv_mask_vf_nxv1f32_nxv1f32_f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv1f32_nxv1f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfrdiv.mask.nxv1f32.f32(
@@ -363,10 +363,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfrdiv.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfrdiv_vf_nxv2f32_nxv2f32_f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv2f32_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfrdiv.nxv2f32.f32(
@@ -388,10 +388,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfrdiv.mask.nxv2f32.f32(
 define <vscale x 2 x float> @intrinsic_vfrdiv_mask_vf_nxv2f32_nxv2f32_f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv2f32_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfrdiv.mask.nxv2f32.f32(
@@ -413,10 +413,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfrdiv.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfrdiv_vf_nxv4f32_nxv4f32_f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv4f32_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfrdiv.nxv4f32.f32(
@@ -438,10 +438,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfrdiv.mask.nxv4f32.f32(
 define <vscale x 4 x float> @intrinsic_vfrdiv_mask_vf_nxv4f32_nxv4f32_f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv4f32_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfrdiv.mask.nxv4f32.f32(
@@ -463,10 +463,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfrdiv.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfrdiv_vf_nxv8f32_nxv8f32_f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv8f32_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfrdiv.nxv8f32.f32(
@@ -488,10 +488,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfrdiv.mask.nxv8f32.f32(
 define <vscale x 8 x float> @intrinsic_vfrdiv_mask_vf_nxv8f32_nxv8f32_f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv8f32_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfrdiv.mask.nxv8f32.f32(
@@ -513,10 +513,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfrdiv.nxv16f32.f32(
 define <vscale x 16 x float> @intrinsic_vfrdiv_vf_nxv16f32_nxv16f32_f32(<vscale x 16 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv16f32_nxv16f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfrdiv.nxv16f32.f32(
@@ -538,10 +538,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfrdiv.mask.nxv16f32.f32(
 define <vscale x 16 x float> @intrinsic_vfrdiv_mask_vf_nxv16f32_nxv16f32_f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, float %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv16f32_nxv16f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfrdiv.mask.nxv16f32.f32(
@@ -563,10 +563,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfrdiv.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfrdiv_vf_nxv1f64_nxv1f64_f64(<vscale x 1 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv1f64_nxv1f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfrdiv.nxv1f64.f64(
@@ -588,10 +588,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfrdiv.mask.nxv1f64.f64(
 define <vscale x 1 x double> @intrinsic_vfrdiv_mask_vf_nxv1f64_nxv1f64_f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, double %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv1f64_nxv1f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfrdiv.mask.nxv1f64.f64(
@@ -613,10 +613,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfrdiv.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfrdiv_vf_nxv2f64_nxv2f64_f64(<vscale x 2 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv2f64_nxv2f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfrdiv.nxv2f64.f64(
@@ -638,10 +638,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfrdiv.mask.nxv2f64.f64(
 define <vscale x 2 x double> @intrinsic_vfrdiv_mask_vf_nxv2f64_nxv2f64_f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, double %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv2f64_nxv2f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfrdiv.mask.nxv2f64.f64(
@@ -663,10 +663,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfrdiv.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfrdiv_vf_nxv4f64_nxv4f64_f64(<vscale x 4 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv4f64_nxv4f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfrdiv.nxv4f64.f64(
@@ -688,10 +688,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfrdiv.mask.nxv4f64.f64(
 define <vscale x 4 x double> @intrinsic_vfrdiv_mask_vf_nxv4f64_nxv4f64_f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, double %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv4f64_nxv4f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfrdiv.mask.nxv4f64.f64(
@@ -713,10 +713,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfrdiv.nxv8f64.f64(
 define <vscale x 8 x double> @intrinsic_vfrdiv_vf_nxv8f64_nxv8f64_f64(<vscale x 8 x double> %0, double %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_vf_nxv8f64_nxv8f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfrdiv.nxv8f64.f64(
@@ -738,10 +738,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfrdiv.mask.nxv8f64.f64(
 define <vscale x 8 x double> @intrinsic_vfrdiv_mask_vf_nxv8f64_nxv8f64_f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, double %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfrdiv_mask_vf_nxv8f64_nxv8f64_f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrdiv.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfrdiv.mask.nxv8f64.f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfrec7.ll b/llvm/test/CodeGen/RISCV/rvv/vfrec7.ll
index 0204f0373d93..914b3b33fbe5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfrec7.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfrec7.ll
@@ -12,10 +12,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfrec7.nxv1f16(
 define <vscale x 1 x half> @intrinsic_vfrec7_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfrec7.nxv1f16(
@@ -35,10 +35,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfrec7.mask.nxv1f16(
 define <vscale x 1 x half> @intrinsic_vfrec7_mask_v_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfrec7.mask.nxv1f16(
@@ -58,10 +58,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfrec7.nxv2f16(
 define <vscale x 2 x half> @intrinsic_vfrec7_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfrec7.nxv2f16(
@@ -81,10 +81,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfrec7.mask.nxv2f16(
 define <vscale x 2 x half> @intrinsic_vfrec7_mask_v_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfrec7.mask.nxv2f16(
@@ -104,10 +104,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfrec7.nxv4f16(
 define <vscale x 4 x half> @intrinsic_vfrec7_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfrec7.nxv4f16(
@@ -127,10 +127,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfrec7.mask.nxv4f16(
 define <vscale x 4 x half> @intrinsic_vfrec7_mask_v_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfrec7.mask.nxv4f16(
@@ -150,10 +150,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfrec7.nxv8f16(
 define <vscale x 8 x half> @intrinsic_vfrec7_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfrec7.nxv8f16(
@@ -173,10 +173,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfrec7.mask.nxv8f16(
 define <vscale x 8 x half> @intrinsic_vfrec7_mask_v_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfrec7.mask.nxv8f16(
@@ -196,10 +196,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfrec7.nxv16f16(
 define <vscale x 16 x half> @intrinsic_vfrec7_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfrec7.nxv16f16(
@@ -219,10 +219,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfrec7.mask.nxv16f16(
 define <vscale x 16 x half> @intrinsic_vfrec7_mask_v_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfrec7.mask.nxv16f16(
@@ -242,10 +242,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfrec7.nxv32f16(
 define <vscale x 32 x half> @intrinsic_vfrec7_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfrec7.nxv32f16(
@@ -265,10 +265,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfrec7.mask.nxv32f16(
 define <vscale x 32 x half> @intrinsic_vfrec7_mask_v_nxv32f16_nxv32f16(<vscale x 32 x i1> %0, <vscale x 32 x half> %1, <vscale x 32 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfrec7.mask.nxv32f16(
@@ -288,10 +288,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfrec7.nxv1f32(
 define <vscale x 1 x float> @intrinsic_vfrec7_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfrec7.nxv1f32(
@@ -311,10 +311,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfrec7.mask.nxv1f32(
 define <vscale x 1 x float> @intrinsic_vfrec7_mask_v_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfrec7.mask.nxv1f32(
@@ -334,10 +334,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfrec7.nxv2f32(
 define <vscale x 2 x float> @intrinsic_vfrec7_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfrec7.nxv2f32(
@@ -357,10 +357,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfrec7.mask.nxv2f32(
 define <vscale x 2 x float> @intrinsic_vfrec7_mask_v_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfrec7.mask.nxv2f32(
@@ -380,10 +380,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfrec7.nxv4f32(
 define <vscale x 4 x float> @intrinsic_vfrec7_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfrec7.nxv4f32(
@@ -403,10 +403,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfrec7.mask.nxv4f32(
 define <vscale x 4 x float> @intrinsic_vfrec7_mask_v_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfrec7.mask.nxv4f32(
@@ -426,10 +426,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfrec7.nxv8f32(
 define <vscale x 8 x float> @intrinsic_vfrec7_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfrec7.nxv8f32(
@@ -449,10 +449,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfrec7.mask.nxv8f32(
 define <vscale x 8 x float> @intrinsic_vfrec7_mask_v_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfrec7.mask.nxv8f32(
@@ -472,10 +472,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfrec7.nxv16f32(
 define <vscale x 16 x float> @intrinsic_vfrec7_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfrec7.nxv16f32(
@@ -495,10 +495,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfrec7.mask.nxv16f32(
 define <vscale x 16 x float> @intrinsic_vfrec7_mask_v_nxv16f32_nxv16f32(<vscale x 16 x i1> %0, <vscale x 16 x float> %1, <vscale x 16 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfrec7.mask.nxv16f32(
@@ -518,10 +518,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfrec7.nxv1f64(
 define <vscale x 1 x double> @intrinsic_vfrec7_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfrec7.nxv1f64(
@@ -541,10 +541,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfrec7.mask.nxv1f64(
 define <vscale x 1 x double> @intrinsic_vfrec7_mask_v_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfrec7.mask.nxv1f64(
@@ -564,10 +564,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfrec7.nxv2f64(
 define <vscale x 2 x double> @intrinsic_vfrec7_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfrec7.nxv2f64(
@@ -587,10 +587,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfrec7.mask.nxv2f64(
 define <vscale x 2 x double> @intrinsic_vfrec7_mask_v_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfrec7.mask.nxv2f64(
@@ -610,10 +610,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfrec7.nxv4f64(
 define <vscale x 4 x double> @intrinsic_vfrec7_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfrec7.nxv4f64(
@@ -633,10 +633,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfrec7.mask.nxv4f64(
 define <vscale x 4 x double> @intrinsic_vfrec7_mask_v_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfrec7.mask.nxv4f64(
@@ -656,10 +656,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfrec7.nxv8f64(
 define <vscale x 8 x double> @intrinsic_vfrec7_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_v_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfrec7.nxv8f64(
@@ -679,10 +679,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfrec7.mask.nxv8f64(
 define <vscale x 8 x double> @intrinsic_vfrec7_mask_v_nxv8f64_nxv8f64(<vscale x 8 x i1> %0, <vscale x 8 x double> %1, <vscale x 8 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfrec7.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfrec7.mask.nxv8f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfredosum.ll b/llvm/test/CodeGen/RISCV/rvv/vfredosum.ll
index 19dde75969e3..6de9c82002f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfredosum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfredosum.ll
@@ -13,10 +13,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredosum.nxv4f16.nxv1f16(
 define <vscale x 4 x half> @intrinsic_vfredosum_vs_nxv4f16_nxv1f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 1 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv4f16_nxv1f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredosum.nxv4f16.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredosum.mask.nxv4f16.nxv1f16.nxv1i1(
 define <vscale x 4 x half> @intrinsic_vfredosum_mask_vs_nxv4f16_nxv1f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 1 x half> %1, <vscale x 4 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv4f16_nxv1f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredosum.mask.nxv4f16.nxv1f16.nxv1i1(
@@ -63,10 +63,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredosum.nxv4f16.nxv2f16(
 define <vscale x 4 x half> @intrinsic_vfredosum_vs_nxv4f16_nxv2f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 2 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv4f16_nxv2f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredosum.nxv4f16.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredosum.mask.nxv4f16.nxv2f16.nxv2i1(
 define <vscale x 4 x half> @intrinsic_vfredosum_mask_vs_nxv4f16_nxv2f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 2 x half> %1, <vscale x 4 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv4f16_nxv2f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredosum.mask.nxv4f16.nxv2f16.nxv2i1(
@@ -113,10 +113,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredosum.nxv4f16.nxv4f16(
 define <vscale x 4 x half> @intrinsic_vfredosum_vs_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredosum.nxv4f16.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredosum.mask.nxv4f16.nxv4f16.nxv4i1(
 define <vscale x 4 x half> @intrinsic_vfredosum_mask_vs_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredosum.mask.nxv4f16.nxv4f16.nxv4i1(
@@ -163,10 +163,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredosum.nxv4f16.nxv8f16(
 define <vscale x 4 x half> @intrinsic_vfredosum_vs_nxv4f16_nxv8f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 8 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv4f16_nxv8f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v10, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredosum.nxv4f16.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredosum.mask.nxv4f16.nxv8f16.nxv8i1(
 define <vscale x 4 x half> @intrinsic_vfredosum_mask_vs_nxv4f16_nxv8f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 8 x half> %1, <vscale x 4 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv4f16_nxv8f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v10, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredosum.mask.nxv4f16.nxv8f16.nxv8i1(
@@ -213,10 +213,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredosum.nxv4f16.nxv16f16(
 define <vscale x 4 x half> @intrinsic_vfredosum_vs_nxv4f16_nxv16f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 16 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv4f16_nxv16f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v12, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredosum.nxv4f16.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredosum.mask.nxv4f16.nxv16f16.nxv16i1(
 define <vscale x 4 x half> @intrinsic_vfredosum_mask_vs_nxv4f16_nxv16f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 16 x half> %1, <vscale x 4 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv4f16_nxv16f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v12, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredosum.mask.nxv4f16.nxv16f16.nxv16i1(
@@ -263,10 +263,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredosum.nxv4f16.nxv32f16(
 define <vscale x 4 x half> @intrinsic_vfredosum_vs_nxv4f16_nxv32f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 32 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv4f16_nxv32f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v16, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredosum.nxv4f16.nxv32f16(
@@ -288,10 +288,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredosum.mask.nxv4f16.nxv32f16.nxv32i1(
 define <vscale x 4 x half> @intrinsic_vfredosum_mask_vs_nxv4f16_nxv32f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 32 x half> %1, <vscale x 4 x half> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv4f16_nxv32f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v16, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredosum.mask.nxv4f16.nxv32f16.nxv32i1(
@@ -313,10 +313,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredosum.nxv2f32.nxv1f32(
 define <vscale x 2 x float> @intrinsic_vfredosum_vs_nxv2f32_nxv1f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 1 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv2f32_nxv1f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredosum.nxv2f32.nxv1f32(
@@ -338,10 +338,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredosum.mask.nxv2f32.nxv1f32.nxv1i1(
 define <vscale x 2 x float> @intrinsic_vfredosum_mask_vs_nxv2f32_nxv1f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 1 x float> %1, <vscale x 2 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv2f32_nxv1f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredosum.mask.nxv2f32.nxv1f32.nxv1i1(
@@ -363,10 +363,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredosum.nxv2f32.nxv2f32(
 define <vscale x 2 x float> @intrinsic_vfredosum_vs_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredosum.nxv2f32.nxv2f32(
@@ -388,10 +388,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredosum.mask.nxv2f32.nxv2f32.nxv2i1(
 define <vscale x 2 x float> @intrinsic_vfredosum_mask_vs_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredosum.mask.nxv2f32.nxv2f32.nxv2i1(
@@ -413,10 +413,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredosum.nxv2f32.nxv4f32(
 define <vscale x 2 x float> @intrinsic_vfredosum_vs_nxv2f32_nxv4f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 4 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv2f32_nxv4f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v10, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredosum.nxv2f32.nxv4f32(
@@ -438,10 +438,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredosum.mask.nxv2f32.nxv4f32.nxv4i1(
 define <vscale x 2 x float> @intrinsic_vfredosum_mask_vs_nxv2f32_nxv4f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 4 x float> %1, <vscale x 2 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv2f32_nxv4f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v10, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredosum.mask.nxv2f32.nxv4f32.nxv4i1(
@@ -463,10 +463,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredosum.nxv2f32.nxv8f32(
 define <vscale x 2 x float> @intrinsic_vfredosum_vs_nxv2f32_nxv8f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 8 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv2f32_nxv8f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v12, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredosum.nxv2f32.nxv8f32(
@@ -488,10 +488,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredosum.mask.nxv2f32.nxv8f32.nxv8i1(
 define <vscale x 2 x float> @intrinsic_vfredosum_mask_vs_nxv2f32_nxv8f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 8 x float> %1, <vscale x 2 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv2f32_nxv8f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v12, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredosum.mask.nxv2f32.nxv8f32.nxv8i1(
@@ -513,10 +513,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredosum.nxv2f32.nxv16f32(
 define <vscale x 2 x float> @intrinsic_vfredosum_vs_nxv2f32_nxv16f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 16 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv2f32_nxv16f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v16, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredosum.nxv2f32.nxv16f32(
@@ -538,10 +538,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredosum.mask.nxv2f32.nxv16f32.nxv16i1
 define <vscale x 2 x float> @intrinsic_vfredosum_mask_vs_nxv2f32_nxv16f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 16 x float> %1, <vscale x 2 x float> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv2f32_nxv16f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v16, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredosum.mask.nxv2f32.nxv16f32.nxv16i1(
@@ -563,10 +563,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredosum.nxv1f64.nxv1f64(
 define <vscale x 1 x double> @intrinsic_vfredosum_vs_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredosum.nxv1f64.nxv1f64(
@@ -588,10 +588,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredosum.mask.nxv1f64.nxv1f64.nxv1i1(
 define <vscale x 1 x double> @intrinsic_vfredosum_mask_vs_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredosum.mask.nxv1f64.nxv1f64.nxv1i1(
@@ -613,10 +613,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredosum.nxv1f64.nxv2f64(
 define <vscale x 1 x double> @intrinsic_vfredosum_vs_nxv1f64_nxv2f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 2 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv1f64_nxv2f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v10, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredosum.nxv1f64.nxv2f64(
@@ -638,10 +638,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredosum.mask.nxv1f64.nxv2f64.nxv2i1(
 define <vscale x 1 x double> @intrinsic_vfredosum_mask_vs_nxv1f64_nxv2f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 2 x double> %1, <vscale x 1 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv1f64_nxv2f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v10, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredosum.mask.nxv1f64.nxv2f64.nxv2i1(
@@ -663,10 +663,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredosum.nxv1f64.nxv4f64(
 define <vscale x 1 x double> @intrinsic_vfredosum_vs_nxv1f64_nxv4f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 4 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv1f64_nxv4f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v12, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredosum.nxv1f64.nxv4f64(
@@ -688,10 +688,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredosum.mask.nxv1f64.nxv4f64.nxv4i1(
 define <vscale x 1 x double> @intrinsic_vfredosum_mask_vs_nxv1f64_nxv4f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 4 x double> %1, <vscale x 1 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv1f64_nxv4f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v12, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredosum.mask.nxv1f64.nxv4f64.nxv4i1(
@@ -713,10 +713,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredosum.nxv1f64.nxv8f64(
 define <vscale x 1 x double> @intrinsic_vfredosum_vs_nxv1f64_nxv8f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 8 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_vs_nxv1f64_nxv8f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v16, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredosum.nxv1f64.nxv8f64(
@@ -738,10 +738,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredosum.mask.nxv1f64.nxv8f64.nxv8i1(
 define <vscale x 1 x double> @intrinsic_vfredosum_mask_vs_nxv1f64_nxv8f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 8 x double> %1, <vscale x 1 x double> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredosum_mask_vs_nxv1f64_nxv8f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredosum.vs v8, v16, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredosum.mask.nxv1f64.nxv8f64.nxv8i1(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfredusum.ll b/llvm/test/CodeGen/RISCV/rvv/vfredusum.ll
index bd2a5a901fb8..ffef9ef728a1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfredusum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfredusum.ll
@@ -13,10 +13,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredusum.nxv4f16.nxv1f16(
 define <vscale x 4 x half> @intrinsic_vfredusum_vs_nxv4f16_nxv1f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 1 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv4f16_nxv1f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredusum.nxv4f16.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredusum.mask.nxv4f16.nxv1f16.nxv1i1(
 define <vscale x 4 x half> @intrinsic_vfredusum_mask_vs_nxv4f16_nxv1f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 1 x half> %1, <vscale x 4 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv4f16_nxv1f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredusum.mask.nxv4f16.nxv1f16.nxv1i1(
@@ -63,10 +63,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredusum.nxv4f16.nxv2f16(
 define <vscale x 4 x half> @intrinsic_vfredusum_vs_nxv4f16_nxv2f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 2 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv4f16_nxv2f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredusum.nxv4f16.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredusum.mask.nxv4f16.nxv2f16.nxv2i1(
 define <vscale x 4 x half> @intrinsic_vfredusum_mask_vs_nxv4f16_nxv2f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 2 x half> %1, <vscale x 4 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv4f16_nxv2f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredusum.mask.nxv4f16.nxv2f16.nxv2i1(
@@ -113,10 +113,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredusum.nxv4f16.nxv4f16(
 define <vscale x 4 x half> @intrinsic_vfredusum_vs_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredusum.nxv4f16.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredusum.mask.nxv4f16.nxv4f16.nxv4i1(
 define <vscale x 4 x half> @intrinsic_vfredusum_mask_vs_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv4f16_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredusum.mask.nxv4f16.nxv4f16.nxv4i1(
@@ -163,10 +163,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredusum.nxv4f16.nxv8f16(
 define <vscale x 4 x half> @intrinsic_vfredusum_vs_nxv4f16_nxv8f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 8 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv4f16_nxv8f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v10, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredusum.nxv4f16.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredusum.mask.nxv4f16.nxv8f16.nxv8i1(
 define <vscale x 4 x half> @intrinsic_vfredusum_mask_vs_nxv4f16_nxv8f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 8 x half> %1, <vscale x 4 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv4f16_nxv8f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v10, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredusum.mask.nxv4f16.nxv8f16.nxv8i1(
@@ -213,10 +213,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredusum.nxv4f16.nxv16f16(
 define <vscale x 4 x half> @intrinsic_vfredusum_vs_nxv4f16_nxv16f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 16 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv4f16_nxv16f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v12, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredusum.nxv4f16.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredusum.mask.nxv4f16.nxv16f16.nxv16i1(
 define <vscale x 4 x half> @intrinsic_vfredusum_mask_vs_nxv4f16_nxv16f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 16 x half> %1, <vscale x 4 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv4f16_nxv16f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v12, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredusum.mask.nxv4f16.nxv16f16.nxv16i1(
@@ -263,10 +263,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredusum.nxv4f16.nxv32f16(
 define <vscale x 4 x half> @intrinsic_vfredusum_vs_nxv4f16_nxv32f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 32 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv4f16_nxv32f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v16, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredusum.nxv4f16.nxv32f16(
@@ -288,10 +288,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfredusum.mask.nxv4f16.nxv32f16.nxv32i1(
 define <vscale x 4 x half> @intrinsic_vfredusum_mask_vs_nxv4f16_nxv32f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 32 x half> %1, <vscale x 4 x half> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv4f16_nxv32f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v16, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfredusum.mask.nxv4f16.nxv32f16.nxv32i1(
@@ -313,10 +313,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredusum.nxv2f32.nxv1f32(
 define <vscale x 2 x float> @intrinsic_vfredusum_vs_nxv2f32_nxv1f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 1 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv2f32_nxv1f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredusum.nxv2f32.nxv1f32(
@@ -338,10 +338,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredusum.mask.nxv2f32.nxv1f32.nxv1i1(
 define <vscale x 2 x float> @intrinsic_vfredusum_mask_vs_nxv2f32_nxv1f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 1 x float> %1, <vscale x 2 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv2f32_nxv1f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredusum.mask.nxv2f32.nxv1f32.nxv1i1(
@@ -363,10 +363,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredusum.nxv2f32.nxv2f32(
 define <vscale x 2 x float> @intrinsic_vfredusum_vs_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredusum.nxv2f32.nxv2f32(
@@ -388,10 +388,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredusum.mask.nxv2f32.nxv2f32.nxv2i1(
 define <vscale x 2 x float> @intrinsic_vfredusum_mask_vs_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv2f32_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredusum.mask.nxv2f32.nxv2f32.nxv2i1(
@@ -413,10 +413,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredusum.nxv2f32.nxv4f32(
 define <vscale x 2 x float> @intrinsic_vfredusum_vs_nxv2f32_nxv4f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 4 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv2f32_nxv4f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v10, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredusum.nxv2f32.nxv4f32(
@@ -438,10 +438,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredusum.mask.nxv2f32.nxv4f32.nxv4i1(
 define <vscale x 2 x float> @intrinsic_vfredusum_mask_vs_nxv2f32_nxv4f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 4 x float> %1, <vscale x 2 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv2f32_nxv4f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v10, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredusum.mask.nxv2f32.nxv4f32.nxv4i1(
@@ -463,10 +463,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredusum.nxv2f32.nxv8f32(
 define <vscale x 2 x float> @intrinsic_vfredusum_vs_nxv2f32_nxv8f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 8 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv2f32_nxv8f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v12, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredusum.nxv2f32.nxv8f32(
@@ -488,10 +488,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredusum.mask.nxv2f32.nxv8f32.nxv8i1(
 define <vscale x 2 x float> @intrinsic_vfredusum_mask_vs_nxv2f32_nxv8f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 8 x float> %1, <vscale x 2 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv2f32_nxv8f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v12, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredusum.mask.nxv2f32.nxv8f32.nxv8i1(
@@ -513,10 +513,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredusum.nxv2f32.nxv16f32(
 define <vscale x 2 x float> @intrinsic_vfredusum_vs_nxv2f32_nxv16f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 16 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv2f32_nxv16f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v16, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredusum.nxv2f32.nxv16f32(
@@ -538,10 +538,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfredusum.mask.nxv2f32.nxv16f32.nxv16i1
 define <vscale x 2 x float> @intrinsic_vfredusum_mask_vs_nxv2f32_nxv16f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 16 x float> %1, <vscale x 2 x float> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv2f32_nxv16f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v16, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfredusum.mask.nxv2f32.nxv16f32.nxv16i1(
@@ -563,10 +563,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredusum.nxv1f64.nxv1f64(
 define <vscale x 1 x double> @intrinsic_vfredusum_vs_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredusum.nxv1f64.nxv1f64(
@@ -588,10 +588,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredusum.mask.nxv1f64.nxv1f64.nxv1i1(
 define <vscale x 1 x double> @intrinsic_vfredusum_mask_vs_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv1f64_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredusum.mask.nxv1f64.nxv1f64.nxv1i1(
@@ -613,10 +613,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredusum.nxv1f64.nxv2f64(
 define <vscale x 1 x double> @intrinsic_vfredusum_vs_nxv1f64_nxv2f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 2 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv1f64_nxv2f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v10, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredusum.nxv1f64.nxv2f64(
@@ -638,10 +638,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredusum.mask.nxv1f64.nxv2f64.nxv2i1(
 define <vscale x 1 x double> @intrinsic_vfredusum_mask_vs_nxv1f64_nxv2f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 2 x double> %1, <vscale x 1 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv1f64_nxv2f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v10, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredusum.mask.nxv1f64.nxv2f64.nxv2i1(
@@ -663,10 +663,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredusum.nxv1f64.nxv4f64(
 define <vscale x 1 x double> @intrinsic_vfredusum_vs_nxv1f64_nxv4f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 4 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv1f64_nxv4f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v12, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredusum.nxv1f64.nxv4f64(
@@ -688,10 +688,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredusum.mask.nxv1f64.nxv4f64.nxv4i1(
 define <vscale x 1 x double> @intrinsic_vfredusum_mask_vs_nxv1f64_nxv4f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 4 x double> %1, <vscale x 1 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv1f64_nxv4f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v12, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredusum.mask.nxv1f64.nxv4f64.nxv4i1(
@@ -713,10 +713,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredusum.nxv1f64.nxv8f64(
 define <vscale x 1 x double> @intrinsic_vfredusum_vs_nxv1f64_nxv8f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 8 x double> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_vs_nxv1f64_nxv8f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v16, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredusum.nxv1f64.nxv8f64(
@@ -738,10 +738,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfredusum.mask.nxv1f64.nxv8f64.nxv8i1(
 define <vscale x 1 x double> @intrinsic_vfredusum_mask_vs_nxv1f64_nxv8f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 8 x double> %1, <vscale x 1 x double> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfredusum_mask_vs_nxv1f64_nxv8f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfredusum.vs v8, v16, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfredusum.mask.nxv1f64.nxv8f64.nxv8i1(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt.ll
index 0f61e6a7d406..3e3eea9f353c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt.ll
@@ -12,10 +12,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfsqrt.nxv1f16(
 define <vscale x 1 x half> @intrinsic_vfsqrt_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfsqrt.nxv1f16(
@@ -35,10 +35,10 @@ declare <vscale x 1 x half> @llvm.riscv.vfsqrt.mask.nxv1f16(
 define <vscale x 1 x half> @intrinsic_vfsqrt_mask_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x half> @llvm.riscv.vfsqrt.mask.nxv1f16(
@@ -58,10 +58,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfsqrt.nxv2f16(
 define <vscale x 2 x half> @intrinsic_vfsqrt_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfsqrt.nxv2f16(
@@ -81,10 +81,10 @@ declare <vscale x 2 x half> @llvm.riscv.vfsqrt.mask.nxv2f16(
 define <vscale x 2 x half> @intrinsic_vfsqrt_mask_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x half> @llvm.riscv.vfsqrt.mask.nxv2f16(
@@ -104,10 +104,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfsqrt.nxv4f16(
 define <vscale x 4 x half> @intrinsic_vfsqrt_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfsqrt.nxv4f16(
@@ -127,10 +127,10 @@ declare <vscale x 4 x half> @llvm.riscv.vfsqrt.mask.nxv4f16(
 define <vscale x 4 x half> @intrinsic_vfsqrt_mask_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x half> @llvm.riscv.vfsqrt.mask.nxv4f16(
@@ -150,10 +150,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfsqrt.nxv8f16(
 define <vscale x 8 x half> @intrinsic_vfsqrt_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfsqrt.nxv8f16(
@@ -173,10 +173,10 @@ declare <vscale x 8 x half> @llvm.riscv.vfsqrt.mask.nxv8f16(
 define <vscale x 8 x half> @intrinsic_vfsqrt_mask_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x half> @llvm.riscv.vfsqrt.mask.nxv8f16(
@@ -196,10 +196,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfsqrt.nxv16f16(
 define <vscale x 16 x half> @intrinsic_vfsqrt_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfsqrt.nxv16f16(
@@ -219,10 +219,10 @@ declare <vscale x 16 x half> @llvm.riscv.vfsqrt.mask.nxv16f16(
 define <vscale x 16 x half> @intrinsic_vfsqrt_mask_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x half> @llvm.riscv.vfsqrt.mask.nxv16f16(
@@ -242,10 +242,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfsqrt.nxv32f16(
 define <vscale x 32 x half> @intrinsic_vfsqrt_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfsqrt.nxv32f16(
@@ -265,10 +265,10 @@ declare <vscale x 32 x half> @llvm.riscv.vfsqrt.mask.nxv32f16(
 define <vscale x 32 x half> @intrinsic_vfsqrt_mask_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv32f16_nxv32f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 32 x half> @llvm.riscv.vfsqrt.mask.nxv32f16(
@@ -288,10 +288,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfsqrt.nxv1f32(
 define <vscale x 1 x float> @intrinsic_vfsqrt_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfsqrt.nxv1f32(
@@ -311,10 +311,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfsqrt.mask.nxv1f32(
 define <vscale x 1 x float> @intrinsic_vfsqrt_mask_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfsqrt.mask.nxv1f32(
@@ -334,10 +334,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfsqrt.nxv2f32(
 define <vscale x 2 x float> @intrinsic_vfsqrt_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfsqrt.nxv2f32(
@@ -357,10 +357,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfsqrt.mask.nxv2f32(
 define <vscale x 2 x float> @intrinsic_vfsqrt_mask_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfsqrt.mask.nxv2f32(
@@ -380,10 +380,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfsqrt.nxv4f32(
 define <vscale x 4 x float> @intrinsic_vfsqrt_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfsqrt.nxv4f32(
@@ -403,10 +403,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfsqrt.mask.nxv4f32(
 define <vscale x 4 x float> @intrinsic_vfsqrt_mask_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfsqrt.mask.nxv4f32(
@@ -426,10 +426,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfsqrt.nxv8f32(
 define <vscale x 8 x float> @intrinsic_vfsqrt_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfsqrt.nxv8f32(
@@ -449,10 +449,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfsqrt.mask.nxv8f32(
 define <vscale x 8 x float> @intrinsic_vfsqrt_mask_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfsqrt.mask.nxv8f32(
@@ -472,10 +472,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfsqrt.nxv16f32(
 define <vscale x 16 x float> @intrinsic_vfsqrt_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfsqrt.nxv16f32(
@@ -495,10 +495,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfsqrt.mask.nxv16f32(
 define <vscale x 16 x float> @intrinsic_vfsqrt_mask_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv16f32_nxv16f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfsqrt.mask.nxv16f32(
@@ -518,10 +518,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfsqrt.nxv1f64(
 define <vscale x 1 x double> @intrinsic_vfsqrt_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfsqrt.nxv1f64(
@@ -541,10 +541,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfsqrt.mask.nxv1f64(
 define <vscale x 1 x double> @intrinsic_vfsqrt_mask_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv1f64_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfsqrt.mask.nxv1f64(
@@ -564,10 +564,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfsqrt.nxv2f64(
 define <vscale x 2 x double> @intrinsic_vfsqrt_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfsqrt.nxv2f64(
@@ -587,10 +587,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfsqrt.mask.nxv2f64(
 define <vscale x 2 x double> @intrinsic_vfsqrt_mask_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv2f64_nxv2f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfsqrt.mask.nxv2f64(
@@ -610,10 +610,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfsqrt.nxv4f64(
 define <vscale x 4 x double> @intrinsic_vfsqrt_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfsqrt.nxv4f64(
@@ -633,10 +633,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfsqrt.mask.nxv4f64(
 define <vscale x 4 x double> @intrinsic_vfsqrt_mask_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv4f64_nxv4f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfsqrt.mask.nxv4f64(
@@ -656,10 +656,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfsqrt.nxv8f64(
 define <vscale x 8 x double> @intrinsic_vfsqrt_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_v_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfsqrt.nxv8f64(
@@ -679,10 +679,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfsqrt.mask.nxv8f64(
 define <vscale x 8 x double> @intrinsic_vfsqrt_mask_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfsqrt_mask_v_nxv8f64_nxv8f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfsqrt.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfsqrt.mask.nxv8f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwadd.ll b/llvm/test/CodeGen/RISCV/rvv/vfwadd.ll
index cb7047be9753..b42a1fe46e67 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwadd.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwadd.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwadd.nxv1f32.nxv1f16.nxv1f16(
 define <vscale x 1 x float> @intrinsic_vfwadd_vv_nxv1f32_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv1f32_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -39,10 +39,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwadd.mask.nxv1f32.nxv1f16.nxv1f16(
 define <vscale x 1 x float> @intrinsic_vfwadd_mask_vv_nxv1f32_nxv1f16_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vv_nxv1f32_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.mask.nxv1f32.nxv1f16.nxv1f16(
@@ -64,10 +64,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwadd.nxv2f32.nxv2f16.nxv2f16(
 define <vscale x 2 x float> @intrinsic_vfwadd_vv_nxv2f32_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv2f32_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -90,10 +90,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwadd.mask.nxv2f32.nxv2f16.nxv2f16(
 define <vscale x 2 x float> @intrinsic_vfwadd_mask_vv_nxv2f32_nxv2f16_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vv_nxv2f32_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.mask.nxv2f32.nxv2f16.nxv2f16(
@@ -115,10 +115,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwadd.nxv4f32.nxv4f16.nxv4f16(
 define <vscale x 4 x float> @intrinsic_vfwadd_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -141,10 +141,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwadd.mask.nxv4f32.nxv4f16.nxv4f16(
 define <vscale x 4 x float> @intrinsic_vfwadd_mask_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.mask.nxv4f32.nxv4f16.nxv4f16(
@@ -166,10 +166,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwadd.nxv8f32.nxv8f16.nxv8f16(
 define <vscale x 8 x float> @intrinsic_vfwadd_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v12, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -192,10 +192,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwadd.mask.nxv8f32.nxv8f16.nxv8f16(
 define <vscale x 8 x float> @intrinsic_vfwadd_mask_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v8, v12, v14, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.mask.nxv8f32.nxv8f16.nxv8f16(
@@ -217,10 +217,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwadd.nxv16f32.nxv16f16.nxv16f16(
 define <vscale x 16 x float> @intrinsic_vfwadd_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v16, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -243,10 +243,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwadd.mask.nxv16f32.nxv16f16.nxv16f16
 define <vscale x 16 x float> @intrinsic_vfwadd_mask_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v8, v16, v20, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.mask.nxv16f32.nxv16f16.nxv16f16(
@@ -268,10 +268,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwadd.nxv1f64.nxv1f32.nxv1f32(
 define <vscale x 1 x double> @intrinsic_vfwadd_vv_nxv1f64_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv1f64_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -294,10 +294,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwadd.mask.nxv1f64.nxv1f32.nxv1f32(
 define <vscale x 1 x double> @intrinsic_vfwadd_mask_vv_nxv1f64_nxv1f32_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vv_nxv1f64_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwadd.mask.nxv1f64.nxv1f32.nxv1f32(
@@ -319,10 +319,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwadd.nxv2f64.nxv2f32.nxv2f32(
 define <vscale x 2 x double> @intrinsic_vfwadd_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -345,10 +345,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwadd.mask.nxv2f64.nxv2f32.nxv2f32(
 define <vscale x 2 x double> @intrinsic_vfwadd_mask_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwadd.mask.nxv2f64.nxv2f32.nxv2f32(
@@ -370,10 +370,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwadd.nxv4f64.nxv4f32.nxv4f32(
 define <vscale x 4 x double> @intrinsic_vfwadd_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v12, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -396,10 +396,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwadd.mask.nxv4f64.nxv4f32.nxv4f32(
 define <vscale x 4 x double> @intrinsic_vfwadd_mask_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v8, v12, v14, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwadd.mask.nxv4f64.nxv4f32.nxv4f32(
@@ -421,10 +421,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwadd.nxv8f64.nxv8f32.nxv8f32(
 define <vscale x 8 x double> @intrinsic_vfwadd_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v16, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -447,10 +447,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwadd.mask.nxv8f64.nxv8f32.nxv8f32(
 define <vscale x 8 x double> @intrinsic_vfwadd_mask_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vv v8, v16, v20, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwadd.mask.nxv8f64.nxv8f32.nxv8f32(
@@ -472,10 +472,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwadd.nxv1f32.nxv1f16.f16(
 define <vscale x 1 x float> @intrinsic_vfwadd_vf_nxv1f32_nxv1f16_f16(<vscale x 1 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv1f32_nxv1f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v9, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -498,10 +498,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwadd.mask.nxv1f32.nxv1f16.f16(
 define <vscale x 1 x float> @intrinsic_vfwadd_mask_vf_nxv1f32_nxv1f16_f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, half %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vf_nxv1f32_nxv1f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.mask.nxv1f32.nxv1f16.f16(
@@ -523,10 +523,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwadd.nxv2f32.nxv2f16.f16(
 define <vscale x 2 x float> @intrinsic_vfwadd_vf_nxv2f32_nxv2f16_f16(<vscale x 2 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv2f32_nxv2f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v9, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -549,10 +549,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwadd.mask.nxv2f32.nxv2f16.f16(
 define <vscale x 2 x float> @intrinsic_vfwadd_mask_vf_nxv2f32_nxv2f16_f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, half %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vf_nxv2f32_nxv2f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.mask.nxv2f32.nxv2f16.f16(
@@ -574,10 +574,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwadd.nxv4f32.nxv4f16.f16(
 define <vscale x 4 x float> @intrinsic_vfwadd_vf_nxv4f32_nxv4f16_f16(<vscale x 4 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv4f32_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v10, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -600,10 +600,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwadd.mask.nxv4f32.nxv4f16.f16(
 define <vscale x 4 x float> @intrinsic_vfwadd_mask_vf_nxv4f32_nxv4f16_f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, half %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vf_nxv4f32_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.mask.nxv4f32.nxv4f16.f16(
@@ -625,10 +625,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwadd.nxv8f32.nxv8f16.f16(
 define <vscale x 8 x float> @intrinsic_vfwadd_vf_nxv8f32_nxv8f16_f16(<vscale x 8 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv8f32_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v12, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -651,10 +651,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwadd.mask.nxv8f32.nxv8f16.f16(
 define <vscale x 8 x float> @intrinsic_vfwadd_mask_vf_nxv8f32_nxv8f16_f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, half %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vf_nxv8f32_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.mask.nxv8f32.nxv8f16.f16(
@@ -676,10 +676,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwadd.nxv16f32.nxv16f16.f16(
 define <vscale x 16 x float> @intrinsic_vfwadd_vf_nxv16f32_nxv16f16_f16(<vscale x 16 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv16f32_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v16, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -702,10 +702,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwadd.mask.nxv16f32.nxv16f16.f16(
 define <vscale x 16 x float> @intrinsic_vfwadd_mask_vf_nxv16f32_nxv16f16_f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, half %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vf_nxv16f32_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.mask.nxv16f32.nxv16f16.f16(
@@ -727,10 +727,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwadd.nxv1f64.nxv1f32.f32(
 define <vscale x 1 x double> @intrinsic_vfwadd_vf_nxv1f64_nxv1f32_f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv1f64_nxv1f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v9, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -753,10 +753,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwadd.mask.nxv1f64.nxv1f32.f32(
 define <vscale x 1 x double> @intrinsic_vfwadd_mask_vf_nxv1f64_nxv1f32_f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vf_nxv1f64_nxv1f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwadd.mask.nxv1f64.nxv1f32.f32(
@@ -778,10 +778,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwadd.nxv2f64.nxv2f32.f32(
 define <vscale x 2 x double> @intrinsic_vfwadd_vf_nxv2f64_nxv2f32_f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv2f64_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v10, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -804,10 +804,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwadd.mask.nxv2f64.nxv2f32.f32(
 define <vscale x 2 x double> @intrinsic_vfwadd_mask_vf_nxv2f64_nxv2f32_f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vf_nxv2f64_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwadd.mask.nxv2f64.nxv2f32.f32(
@@ -829,10 +829,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwadd.nxv4f64.nxv4f32.f32(
 define <vscale x 4 x double> @intrinsic_vfwadd_vf_nxv4f64_nxv4f32_f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv4f64_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v12, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -855,10 +855,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwadd.mask.nxv4f64.nxv4f32.f32(
 define <vscale x 4 x double> @intrinsic_vfwadd_mask_vf_nxv4f64_nxv4f32_f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vf_nxv4f64_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwadd.mask.nxv4f64.nxv4f32.f32(
@@ -880,10 +880,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwadd.nxv8f64.nxv8f32.f32(
 define <vscale x 8 x double> @intrinsic_vfwadd_vf_nxv8f64_nxv8f32_f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_vf_nxv8f64_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v16, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -906,10 +906,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwadd.mask.nxv8f64.nxv8f32.f32(
 define <vscale x 8 x double> @intrinsic_vfwadd_mask_vf_nxv8f64_nxv8f32_f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd_mask_vf_nxv8f64_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwadd.mask.nxv8f64.nxv8f32.f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwadd.w.ll b/llvm/test/CodeGen/RISCV/rvv/vfwadd.w.ll
index 2a318c53a5fb..76246eba9480 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwadd.w.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwadd.w.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwadd.w.nxv1f32.nxv1f16(
 define <vscale x 1 x float> @intrinsic_vfwadd.w_wv_nxv1f32_nxv1f32_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_nxv1f32_nxv1f32_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.w.nxv1f32.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1f16(
 define <vscale x 1 x float> @intrinsic_vfwadd.w_mask_wv_nxv1f32_nxv1f32_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_nxv1f32_nxv1f32_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwadd.w.nxv2f32.nxv2f16(
 define <vscale x 2 x float> @intrinsic_vfwadd.w_wv_nxv2f32_nxv2f32_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_nxv2f32_nxv2f32_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.w.nxv2f32.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2f16(
 define <vscale x 2 x float> @intrinsic_vfwadd.w_mask_wv_nxv2f32_nxv2f32_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_nxv2f32_nxv2f32_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwadd.w.nxv4f32.nxv4f16(
 define <vscale x 4 x float> @intrinsic_vfwadd.w_wv_nxv4f32_nxv4f32_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_nxv4f32_nxv4f32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.w.nxv4f32.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4f16(
 define <vscale x 4 x float> @intrinsic_vfwadd.w_mask_wv_nxv4f32_nxv4f32_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_nxv4f32_nxv4f32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwadd.w.nxv8f32.nxv8f16(
 define <vscale x 8 x float> @intrinsic_vfwadd.w_wv_nxv8f32_nxv8f32_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_nxv8f32_nxv8f32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.w.nxv8f32.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8f16(
 define <vscale x 8 x float> @intrinsic_vfwadd.w_mask_wv_nxv8f32_nxv8f32_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_nxv8f32_nxv8f32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwadd.w.nxv16f32.nxv16f16(
 define <vscale x 16 x float> @intrinsic_vfwadd.w_wv_nxv16f32_nxv16f32_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_nxv16f32_nxv16f32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.w.nxv16f32.nxv16f16(
@@ -239,8 +239,8 @@ define <vscale x 16 x float> @intrinsic_vfwadd.w_mask_wv_nxv16f32_nxv16f32_nxv16
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_nxv16f32_nxv16f32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl4re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    vfwadd.wv v8, v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -264,10 +264,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwadd.w.nxv1f64.nxv1f32(
 define <vscale x 1 x double> @intrinsic_vfwadd.w_wv_nxv1f64_nxv1f64_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_nxv1f64_nxv1f64_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwadd.w.nxv1f64.nxv1f32(
@@ -289,10 +289,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwadd.w.mask.nxv1f64.nxv1f32(
 define <vscale x 1 x double> @intrinsic_vfwadd.w_mask_wv_nxv1f64_nxv1f64_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_nxv1f64_nxv1f64_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwadd.w.mask.nxv1f64.nxv1f32(
@@ -314,10 +314,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwadd.w.nxv2f64.nxv2f32(
 define <vscale x 2 x double> @intrinsic_vfwadd.w_wv_nxv2f64_nxv2f64_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_nxv2f64_nxv2f64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwadd.w.nxv2f64.nxv2f32(
@@ -339,10 +339,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwadd.w.mask.nxv2f64.nxv2f32(
 define <vscale x 2 x double> @intrinsic_vfwadd.w_mask_wv_nxv2f64_nxv2f64_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_nxv2f64_nxv2f64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwadd.w.mask.nxv2f64.nxv2f32(
@@ -364,10 +364,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwadd.w.nxv4f64.nxv4f32(
 define <vscale x 4 x double> @intrinsic_vfwadd.w_wv_nxv4f64_nxv4f64_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_nxv4f64_nxv4f64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwadd.w.nxv4f64.nxv4f32(
@@ -389,10 +389,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwadd.w.mask.nxv4f64.nxv4f32(
 define <vscale x 4 x double> @intrinsic_vfwadd.w_mask_wv_nxv4f64_nxv4f64_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_nxv4f64_nxv4f64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwadd.w.mask.nxv4f64.nxv4f32(
@@ -414,10 +414,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwadd.w.nxv8f64.nxv8f32(
 define <vscale x 8 x double> @intrinsic_vfwadd.w_wv_nxv8f64_nxv8f64_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_nxv8f64_nxv8f64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwadd.w.nxv8f64.nxv8f32(
@@ -440,8 +440,8 @@ define <vscale x 8 x double> @intrinsic_vfwadd.w_mask_wv_nxv8f64_nxv8f64_nxv8f32
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_nxv8f64_nxv8f64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl4re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    vfwadd.wv v8, v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -465,10 +465,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwadd.w.nxv1f32.f16(
 define <vscale x 1 x float> @intrinsic_vfwadd.w_wf_nxv1f32_nxv1f32_f16(<vscale x 1 x float> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wf_nxv1f32_nxv1f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.w.nxv1f32.f16(
@@ -490,10 +490,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwadd.w.mask.nxv1f32.f16(
 define <vscale x 1 x float> @intrinsic_vfwadd.w_mask_wf_nxv1f32_nxv1f32_f16(<vscale x 1 x float> %0, <vscale x 1 x float> %1, half %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_nxv1f32_nxv1f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.w.mask.nxv1f32.f16(
@@ -515,10 +515,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwadd.w.nxv2f32.f16(
 define <vscale x 2 x float> @intrinsic_vfwadd.w_wf_nxv2f32_nxv2f32_f16(<vscale x 2 x float> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wf_nxv2f32_nxv2f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.w.nxv2f32.f16(
@@ -540,10 +540,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwadd.w.mask.nxv2f32.f16(
 define <vscale x 2 x float> @intrinsic_vfwadd.w_mask_wf_nxv2f32_nxv2f32_f16(<vscale x 2 x float> %0, <vscale x 2 x float> %1, half %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_nxv2f32_nxv2f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.w.mask.nxv2f32.f16(
@@ -565,10 +565,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwadd.w.nxv4f32.f16(
 define <vscale x 4 x float> @intrinsic_vfwadd.w_wf_nxv4f32_nxv4f32_f16(<vscale x 4 x float> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wf_nxv4f32_nxv4f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.w.nxv4f32.f16(
@@ -590,10 +590,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwadd.w.mask.nxv4f32.f16(
 define <vscale x 4 x float> @intrinsic_vfwadd.w_mask_wf_nxv4f32_nxv4f32_f16(<vscale x 4 x float> %0, <vscale x 4 x float> %1, half %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_nxv4f32_nxv4f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.w.mask.nxv4f32.f16(
@@ -615,10 +615,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwadd.w.nxv8f32.f16(
 define <vscale x 8 x float> @intrinsic_vfwadd.w_wf_nxv8f32_nxv8f32_f16(<vscale x 8 x float> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wf_nxv8f32_nxv8f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.w.nxv8f32.f16(
@@ -640,10 +640,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwadd.w.mask.nxv8f32.f16(
 define <vscale x 8 x float> @intrinsic_vfwadd.w_mask_wf_nxv8f32_nxv8f32_f16(<vscale x 8 x float> %0, <vscale x 8 x float> %1, half %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_nxv8f32_nxv8f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.w.mask.nxv8f32.f16(
@@ -665,10 +665,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwadd.w.nxv16f32.f16(
 define <vscale x 16 x float> @intrinsic_vfwadd.w_wf_nxv16f32_nxv16f32_f16(<vscale x 16 x float> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wf_nxv16f32_nxv16f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.w.nxv16f32.f16(
@@ -690,10 +690,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwadd.w.mask.nxv16f32.f16(
 define <vscale x 16 x float> @intrinsic_vfwadd.w_mask_wf_nxv16f32_nxv16f32_f16(<vscale x 16 x float> %0, <vscale x 16 x float> %1, half %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_nxv16f32_nxv16f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.w.mask.nxv16f32.f16(
@@ -715,10 +715,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwadd.w.nxv1f64.f32(
 define <vscale x 1 x double> @intrinsic_vfwadd.w_wf_nxv1f64_nxv1f64_f32(<vscale x 1 x double> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wf_nxv1f64_nxv1f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwadd.w.nxv1f64.f32(
@@ -740,10 +740,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwadd.w.mask.nxv1f64.f32(
 define <vscale x 1 x double> @intrinsic_vfwadd.w_mask_wf_nxv1f64_nxv1f64_f32(<vscale x 1 x double> %0, <vscale x 1 x double> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_nxv1f64_nxv1f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwadd.w.mask.nxv1f64.f32(
@@ -765,10 +765,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwadd.w.nxv2f64.f32(
 define <vscale x 2 x double> @intrinsic_vfwadd.w_wf_nxv2f64_nxv2f64_f32(<vscale x 2 x double> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wf_nxv2f64_nxv2f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwadd.w.nxv2f64.f32(
@@ -790,10 +790,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwadd.w.mask.nxv2f64.f32(
 define <vscale x 2 x double> @intrinsic_vfwadd.w_mask_wf_nxv2f64_nxv2f64_f32(<vscale x 2 x double> %0, <vscale x 2 x double> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_nxv2f64_nxv2f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwadd.w.mask.nxv2f64.f32(
@@ -815,10 +815,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwadd.w.nxv4f64.f32(
 define <vscale x 4 x double> @intrinsic_vfwadd.w_wf_nxv4f64_nxv4f64_f32(<vscale x 4 x double> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wf_nxv4f64_nxv4f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwadd.w.nxv4f64.f32(
@@ -840,10 +840,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwadd.w.mask.nxv4f64.f32(
 define <vscale x 4 x double> @intrinsic_vfwadd.w_mask_wf_nxv4f64_nxv4f64_f32(<vscale x 4 x double> %0, <vscale x 4 x double> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_nxv4f64_nxv4f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwadd.w.mask.nxv4f64.f32(
@@ -865,10 +865,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwadd.w.nxv8f64.f32(
 define <vscale x 8 x double> @intrinsic_vfwadd.w_wf_nxv8f64_nxv8f64_f32(<vscale x 8 x double> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wf_nxv8f64_nxv8f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwadd.w.nxv8f64.f32(
@@ -890,10 +890,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwadd.w.mask.nxv8f64.f32(
 define <vscale x 8 x double> @intrinsic_vfwadd.w_mask_wf_nxv8f64_nxv8f64_f32(<vscale x 8 x double> %0, <vscale x 8 x double> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_nxv8f64_nxv8f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwadd.w.mask.nxv8f64.f32(
@@ -909,10 +909,10 @@ entry:
 define <vscale x 1 x float> @intrinsic_vfwadd.w_mask_wv_tie_nxv1f32_nxv1f32_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_tie_nxv1f32_nxv1f32_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1f16(
@@ -928,10 +928,10 @@ entry:
 define <vscale x 2 x float> @intrinsic_vfwadd.w_mask_wv_tie_nxv2f32_nxv2f32_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_tie_nxv2f32_nxv2f32_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2f16(
@@ -947,10 +947,10 @@ entry:
 define <vscale x 4 x float> @intrinsic_vfwadd.w_mask_wv_tie_nxv4f32_nxv4f32_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_tie_nxv4f32_nxv4f32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4f16(
@@ -966,10 +966,10 @@ entry:
 define <vscale x 8 x float> @intrinsic_vfwadd.w_mask_wv_tie_nxv8f32_nxv8f32_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_tie_nxv8f32_nxv8f32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8f16(
@@ -985,10 +985,10 @@ entry:
 define <vscale x 16 x float> @intrinsic_vfwadd.w_mask_wv_tie_nxv16f32_nxv16f32_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_tie_nxv16f32_nxv16f32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16f16(
@@ -1004,10 +1004,10 @@ entry:
 define <vscale x 1 x double> @intrinsic_vfwadd.w_mask_wv_tie_nxv1f64_nxv1f64_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_tie_nxv1f64_nxv1f64_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwadd.w.mask.nxv1f64.nxv1f32(
@@ -1023,10 +1023,10 @@ entry:
 define <vscale x 2 x double> @intrinsic_vfwadd.w_mask_wv_tie_nxv2f64_nxv2f64_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_tie_nxv2f64_nxv2f64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwadd.w.mask.nxv2f64.nxv2f32(
@@ -1042,10 +1042,10 @@ entry:
 define <vscale x 4 x double> @intrinsic_vfwadd.w_mask_wv_tie_nxv4f64_nxv4f64_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_tie_nxv4f64_nxv4f64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwadd.w.mask.nxv4f64.nxv4f32(
@@ -1061,10 +1061,10 @@ entry:
 define <vscale x 8 x double> @intrinsic_vfwadd.w_mask_wv_tie_nxv8f64_nxv8f64_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wv_tie_nxv8f64_nxv8f64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v8, v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwadd.w.mask.nxv8f64.nxv8f32(
@@ -1080,10 +1080,10 @@ entry:
 define <vscale x 1 x float> @intrinsic_vfwadd.w_mask_wf_tie_nxv1f32_nxv1f32_f16(<vscale x 1 x float> %0, half %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_tie_nxv1f32_nxv1f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwadd.w.mask.nxv1f32.f16(
@@ -1099,10 +1099,10 @@ entry:
 define <vscale x 2 x float> @intrinsic_vfwadd.w_mask_wf_tie_nxv2f32_nxv2f32_f16(<vscale x 2 x float> %0, half %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_tie_nxv2f32_nxv2f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwadd.w.mask.nxv2f32.f16(
@@ -1118,10 +1118,10 @@ entry:
 define <vscale x 4 x float> @intrinsic_vfwadd.w_mask_wf_tie_nxv4f32_nxv4f32_f16(<vscale x 4 x float> %0, half %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_tie_nxv4f32_nxv4f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwadd.w.mask.nxv4f32.f16(
@@ -1137,10 +1137,10 @@ entry:
 define <vscale x 8 x float> @intrinsic_vfwadd.w_mask_wf_tie_nxv8f32_nxv8f32_f16(<vscale x 8 x float> %0, half %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_tie_nxv8f32_nxv8f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwadd.w.mask.nxv8f32.f16(
@@ -1156,10 +1156,10 @@ entry:
 define <vscale x 16 x float> @intrinsic_vfwadd.w_mask_wf_tie_nxv16f32_nxv16f32_f16(<vscale x 16 x float> %0, half %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_tie_nxv16f32_nxv16f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwadd.w.mask.nxv16f32.f16(
@@ -1175,10 +1175,10 @@ entry:
 define <vscale x 1 x double> @intrinsic_vfwadd.w_mask_wf_tie_nxv1f64_nxv1f64_f32(<vscale x 1 x double> %0, float %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_tie_nxv1f64_nxv1f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwadd.w.mask.nxv1f64.f32(
@@ -1194,10 +1194,10 @@ entry:
 define <vscale x 2 x double> @intrinsic_vfwadd.w_mask_wf_tie_nxv2f64_nxv2f64_f32(<vscale x 2 x double> %0, float %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_tie_nxv2f64_nxv2f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwadd.w.mask.nxv2f64.f32(
@@ -1213,10 +1213,10 @@ entry:
 define <vscale x 4 x double> @intrinsic_vfwadd.w_mask_wf_tie_nxv4f64_nxv4f64_f32(<vscale x 4 x double> %0, float %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_tie_nxv4f64_nxv4f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwadd.w.mask.nxv4f64.f32(
@@ -1232,10 +1232,10 @@ entry:
 define <vscale x 8 x double> @intrinsic_vfwadd.w_mask_wf_tie_nxv8f64_nxv8f64_f32(<vscale x 8 x double> %0, float %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_mask_wf_tie_nxv8f64_nxv8f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwadd.w.mask.nxv8f64.f32(
@@ -1251,10 +1251,10 @@ entry:
 define <vscale x 1 x float> @intrinsic_vfwadd.w_wv_untie_nxv1f32_nxv1f32_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv1f32_nxv1f32_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v10, v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -1270,10 +1270,10 @@ entry:
 define <vscale x 2 x float> @intrinsic_vfwadd.w_wv_untie_nxv2f32_nxv2f32_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv2f32_nxv2f32_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v10, v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -1289,10 +1289,10 @@ entry:
 define <vscale x 4 x float> @intrinsic_vfwadd.w_wv_untie_nxv4f32_nxv4f32_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv4f32_nxv4f32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v12, v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -1308,10 +1308,10 @@ entry:
 define <vscale x 8 x float> @intrinsic_vfwadd.w_wv_untie_nxv8f32_nxv8f32_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv8f32_nxv8f32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v16, v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -1327,10 +1327,10 @@ entry:
 define <vscale x 1 x double> @intrinsic_vfwadd.w_wv_untie_nxv1f64_nxv1f64_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv1f64_nxv1f64_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v10, v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -1346,10 +1346,10 @@ entry:
 define <vscale x 2 x double> @intrinsic_vfwadd.w_wv_untie_nxv2f64_nxv2f64_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv2f64_nxv2f64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v12, v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -1365,10 +1365,10 @@ entry:
 define <vscale x 4 x double> @intrinsic_vfwadd.w_wv_untie_nxv4f64_nxv4f64_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv4f64_nxv4f64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v16, v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -1384,10 +1384,10 @@ entry:
 define <vscale x 8 x double> @intrinsic_vfwadd.w_wv_untie_nxv8f64_nxv8f64_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwadd.w_wv_untie_nxv8f64_nxv8f64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwadd.wv v24, v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v24
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-x-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-x-f.ll
index ba7ba4e4c2bb..23b10250dfa4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-x-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-x-f.ll
@@ -12,10 +12,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vfwcvt.x.f.v.nxv1i32.nxv1f16(
 define <vscale x 1 x i32> @intrinsic_vfwcvt_x.f.v_nxv1i32_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv1i32_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -36,10 +36,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vfwcvt.x.f.v.mask.nxv1i32.nxv1f16(
 define <vscale x 1 x i32> @intrinsic_vfwcvt_mask_x.f.v_nxv1i32_nxv1f16(<vscale x 1 x i32> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_x.f.v_nxv1i32_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i32> @llvm.riscv.vfwcvt.x.f.v.mask.nxv1i32.nxv1f16(
@@ -59,10 +59,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vfwcvt.x.f.v.nxv2i32.nxv2f16(
 define <vscale x 2 x i32> @intrinsic_vfwcvt_x.f.v_nxv2i32_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv2i32_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -83,10 +83,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vfwcvt.x.f.v.mask.nxv2i32.nxv2f16(
 define <vscale x 2 x i32> @intrinsic_vfwcvt_mask_x.f.v_nxv2i32_nxv2f16(<vscale x 2 x i32> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_x.f.v_nxv2i32_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i32> @llvm.riscv.vfwcvt.x.f.v.mask.nxv2i32.nxv2f16(
@@ -106,10 +106,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vfwcvt.x.f.v.nxv4i32.nxv4f16(
 define <vscale x 4 x i32> @intrinsic_vfwcvt_x.f.v_nxv4i32_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv4i32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -130,10 +130,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vfwcvt.x.f.v.mask.nxv4i32.nxv4f16(
 define <vscale x 4 x i32> @intrinsic_vfwcvt_mask_x.f.v_nxv4i32_nxv4f16(<vscale x 4 x i32> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_x.f.v_nxv4i32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vfwcvt.x.f.v.mask.nxv4i32.nxv4f16(
@@ -153,10 +153,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vfwcvt.x.f.v.nxv8i32.nxv8f16(
 define <vscale x 8 x i32> @intrinsic_vfwcvt_x.f.v_nxv8i32_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv8i32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -177,10 +177,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vfwcvt.x.f.v.mask.nxv8i32.nxv8f16(
 define <vscale x 8 x i32> @intrinsic_vfwcvt_mask_x.f.v_nxv8i32_nxv8f16(<vscale x 8 x i32> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_x.f.v_nxv8i32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vfwcvt.x.f.v.mask.nxv8i32.nxv8f16(
@@ -200,10 +200,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vfwcvt.x.f.v.nxv16i32.nxv16f16(
 define <vscale x 16 x i32> @intrinsic_vfwcvt_x.f.v_nxv16i32_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv16i32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -224,10 +224,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vfwcvt.x.f.v.mask.nxv16i32.nxv16f16(
 define <vscale x 16 x i32> @intrinsic_vfwcvt_mask_x.f.v_nxv16i32_nxv16f16(<vscale x 16 x i32> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_x.f.v_nxv16i32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vfwcvt.x.f.v.mask.nxv16i32.nxv16f16(
@@ -247,10 +247,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vfwcvt.x.f.v.nxv1i64.nxv1f32(
 define <vscale x 1 x i64> @intrinsic_vfwcvt_x.f.v_nxv1i64_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv1i64_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -271,10 +271,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vfwcvt.x.f.v.mask.nxv1i64.nxv1f32(
 define <vscale x 1 x i64> @intrinsic_vfwcvt_mask_x.f.v_nxv1i64_nxv1f32(<vscale x 1 x i64> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_x.f.v_nxv1i64_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vfwcvt.x.f.v.mask.nxv1i64.nxv1f32(
@@ -294,10 +294,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vfwcvt.x.f.v.nxv2i64.nxv2f32(
 define <vscale x 2 x i64> @intrinsic_vfwcvt_x.f.v_nxv2i64_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv2i64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -318,10 +318,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vfwcvt.x.f.v.mask.nxv2i64.nxv2f32(
 define <vscale x 2 x i64> @intrinsic_vfwcvt_mask_x.f.v_nxv2i64_nxv2f32(<vscale x 2 x i64> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_x.f.v_nxv2i64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vfwcvt.x.f.v.mask.nxv2i64.nxv2f32(
@@ -341,10 +341,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vfwcvt.x.f.v.nxv4i64.nxv4f32(
 define <vscale x 4 x i64> @intrinsic_vfwcvt_x.f.v_nxv4i64_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv4i64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -365,10 +365,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vfwcvt.x.f.v.mask.nxv4i64.nxv4f32(
 define <vscale x 4 x i64> @intrinsic_vfwcvt_mask_x.f.v_nxv4i64_nxv4f32(<vscale x 4 x i64> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_x.f.v_nxv4i64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vfwcvt.x.f.v.mask.nxv4i64.nxv4f32(
@@ -388,10 +388,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vfwcvt.x.f.v.nxv8i64.nxv8f32(
 define <vscale x 8 x i64> @intrinsic_vfwcvt_x.f.v_nxv8i64_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_x.f.v_nxv8i64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -412,10 +412,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vfwcvt.x.f.v.mask.nxv8i64.nxv8f32(
 define <vscale x 8 x i64> @intrinsic_vfwcvt_mask_x.f.v_nxv8i64_nxv8f32(<vscale x 8 x i64> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_x.f.v_nxv8i64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.x.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vfwcvt.x.f.v.mask.nxv8i64.nxv8f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-xu-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-xu-f.ll
index 82cea184920b..f6779ec9ba5a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-xu-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-xu-f.ll
@@ -12,10 +12,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vfwcvt.xu.f.v.nxv1i32.nxv1f16(
 define <vscale x 1 x i32> @intrinsic_vfwcvt_xu.f.v_nxv1i32_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv1i32_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -36,10 +36,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv1i32.nxv1f16(
 define <vscale x 1 x i32> @intrinsic_vfwcvt_mask_xu.f.v_nxv1i32_nxv1f16(<vscale x 1 x i32> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_xu.f.v_nxv1i32_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i32> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv1i32.nxv1f16(
@@ -59,10 +59,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vfwcvt.xu.f.v.nxv2i32.nxv2f16(
 define <vscale x 2 x i32> @intrinsic_vfwcvt_xu.f.v_nxv2i32_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv2i32_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -83,10 +83,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv2i32.nxv2f16(
 define <vscale x 2 x i32> @intrinsic_vfwcvt_mask_xu.f.v_nxv2i32_nxv2f16(<vscale x 2 x i32> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_xu.f.v_nxv2i32_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i32> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv2i32.nxv2f16(
@@ -106,10 +106,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vfwcvt.xu.f.v.nxv4i32.nxv4f16(
 define <vscale x 4 x i32> @intrinsic_vfwcvt_xu.f.v_nxv4i32_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv4i32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -130,10 +130,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv4i32.nxv4f16(
 define <vscale x 4 x i32> @intrinsic_vfwcvt_mask_xu.f.v_nxv4i32_nxv4f16(<vscale x 4 x i32> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_xu.f.v_nxv4i32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv4i32.nxv4f16(
@@ -153,10 +153,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vfwcvt.xu.f.v.nxv8i32.nxv8f16(
 define <vscale x 8 x i32> @intrinsic_vfwcvt_xu.f.v_nxv8i32_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv8i32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -177,10 +177,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv8i32.nxv8f16(
 define <vscale x 8 x i32> @intrinsic_vfwcvt_mask_xu.f.v_nxv8i32_nxv8f16(<vscale x 8 x i32> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_xu.f.v_nxv8i32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv8i32.nxv8f16(
@@ -200,10 +200,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vfwcvt.xu.f.v.nxv16i32.nxv16f16(
 define <vscale x 16 x i32> @intrinsic_vfwcvt_xu.f.v_nxv16i32_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv16i32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -224,10 +224,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv16i32.nxv16f16(
 define <vscale x 16 x i32> @intrinsic_vfwcvt_mask_xu.f.v_nxv16i32_nxv16f16(<vscale x 16 x i32> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_xu.f.v_nxv16i32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv16i32.nxv16f16(
@@ -247,10 +247,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vfwcvt.xu.f.v.nxv1i64.nxv1f32(
 define <vscale x 1 x i64> @intrinsic_vfwcvt_xu.f.v_nxv1i64_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv1i64_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -271,10 +271,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv1i64.nxv1f32(
 define <vscale x 1 x i64> @intrinsic_vfwcvt_mask_xu.f.v_nxv1i64_nxv1f32(<vscale x 1 x i64> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_xu.f.v_nxv1i64_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x i64> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv1i64.nxv1f32(
@@ -294,10 +294,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vfwcvt.xu.f.v.nxv2i64.nxv2f32(
 define <vscale x 2 x i64> @intrinsic_vfwcvt_xu.f.v_nxv2i64_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv2i64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -318,10 +318,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv2i64.nxv2f32(
 define <vscale x 2 x i64> @intrinsic_vfwcvt_mask_xu.f.v_nxv2i64_nxv2f32(<vscale x 2 x i64> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_xu.f.v_nxv2i64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x i64> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv2i64.nxv2f32(
@@ -341,10 +341,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vfwcvt.xu.f.v.nxv4i64.nxv4f32(
 define <vscale x 4 x i64> @intrinsic_vfwcvt_xu.f.v_nxv4i64_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv4i64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -365,10 +365,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv4i64.nxv4f32(
 define <vscale x 4 x i64> @intrinsic_vfwcvt_mask_xu.f.v_nxv4i64_nxv4f32(<vscale x 4 x i64> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_xu.f.v_nxv4i64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv4i64.nxv4f32(
@@ -388,10 +388,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vfwcvt.xu.f.v.nxv8i64.nxv8f32(
 define <vscale x 8 x i64> @intrinsic_vfwcvt_xu.f.v_nxv8i64_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_xu.f.v_nxv8i64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -412,10 +412,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv8i64.nxv8f32(
 define <vscale x 8 x i64> @intrinsic_vfwcvt_mask_xu.f.v_nxv8i64_nxv8f32(<vscale x 8 x i64> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwcvt_mask_xu.f.v_nxv8i64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwcvt.xu.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vfwcvt.xu.f.v.mask.nxv8i64.nxv8f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmacc.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmacc.ll
index b3ff91d92ce9..225ba1c14031 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwmacc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwmacc.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwmacc.nxv1f32.nxv1f16(
 define <vscale x 1 x float>  @intrinsic_vfwmacc_vv_nxv1f32_nxv1f16_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vv_nxv1f32_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwmacc.nxv1f32.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1f16(
 define <vscale x 1 x float>  @intrinsic_vfwmacc_mask_vv_nxv1f32_nxv1f16_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vv_nxv1f32_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwmacc.nxv2f32.nxv2f16(
 define <vscale x 2 x float>  @intrinsic_vfwmacc_vv_nxv2f32_nxv2f16_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vv_nxv2f32_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwmacc.nxv2f32.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2f16(
 define <vscale x 2 x float>  @intrinsic_vfwmacc_mask_vv_nxv2f32_nxv2f16_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vv_nxv2f32_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwmacc.nxv4f32.nxv4f16(
 define <vscale x 4 x float>  @intrinsic_vfwmacc_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v10, v11
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwmacc.nxv4f32.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4f16(
 define <vscale x 4 x float>  @intrinsic_vfwmacc_mask_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwmacc.nxv8f32.nxv8f16(
 define <vscale x 8 x float>  @intrinsic_vfwmacc_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v12, v14
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwmacc.nxv8f32.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8f16(
 define <vscale x 8 x float>  @intrinsic_vfwmacc_mask_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v12, v14, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwmacc.nxv16f32.nxv16f16(
 define <vscale x 16 x float>  @intrinsic_vfwmacc_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v16, v20
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwmacc.nxv16f32.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16f16(
 define <vscale x 16 x float>  @intrinsic_vfwmacc_mask_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v16, v20, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16f16(
@@ -263,10 +263,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwmacc.nxv1f64.nxv1f32(
 define <vscale x 1 x double>  @intrinsic_vfwmacc_vv_nxv1f64_nxv1f32_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vv_nxv1f64_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwmacc.nxv1f64.nxv1f32(
@@ -288,10 +288,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwmacc.mask.nxv1f64.nxv1f32(
 define <vscale x 1 x double>  @intrinsic_vfwmacc_mask_vv_nxv1f64_nxv1f32_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vv_nxv1f64_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwmacc.mask.nxv1f64.nxv1f32(
@@ -313,10 +313,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwmacc.nxv2f64.nxv2f32(
 define <vscale x 2 x double>  @intrinsic_vfwmacc_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v10, v11
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwmacc.nxv2f64.nxv2f32(
@@ -338,10 +338,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwmacc.mask.nxv2f64.nxv2f32(
 define <vscale x 2 x double>  @intrinsic_vfwmacc_mask_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwmacc.mask.nxv2f64.nxv2f32(
@@ -363,10 +363,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwmacc.nxv4f64.nxv4f32(
 define <vscale x 4 x double>  @intrinsic_vfwmacc_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v12, v14
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwmacc.nxv4f64.nxv4f32(
@@ -388,10 +388,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwmacc.mask.nxv4f64.nxv4f32(
 define <vscale x 4 x double>  @intrinsic_vfwmacc_mask_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v12, v14, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwmacc.mask.nxv4f64.nxv4f32(
@@ -413,10 +413,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwmacc.nxv8f64.nxv8f32(
 define <vscale x 8 x double>  @intrinsic_vfwmacc_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v16, v20
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwmacc.nxv8f64.nxv8f32(
@@ -438,10 +438,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwmacc.mask.nxv8f64.nxv8f32(
 define <vscale x 8 x double>  @intrinsic_vfwmacc_mask_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vv v8, v16, v20, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwmacc.mask.nxv8f64.nxv8f32(
@@ -463,10 +463,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwmacc.nxv1f32.f16(
 define <vscale x 1 x float>  @intrinsic_vfwmacc_vf_nxv1f32_f16_nxv1f16(<vscale x 1 x float> %0, half %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vf_nxv1f32_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwmacc.nxv1f32.f16(
@@ -488,10 +488,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwmacc.mask.nxv1f32.f16(
 define <vscale x 1 x float> @intrinsic_vfwmacc_mask_vf_nxv1f32_f16_nxv1f16(<vscale x 1 x float> %0, half %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vf_nxv1f32_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwmacc.mask.nxv1f32.f16(
@@ -513,10 +513,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwmacc.nxv2f32.f16(
 define <vscale x 2 x float>  @intrinsic_vfwmacc_vf_nxv2f32_f16_nxv2f16(<vscale x 2 x float> %0, half %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vf_nxv2f32_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwmacc.nxv2f32.f16(
@@ -538,10 +538,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwmacc.mask.nxv2f32.f16(
 define <vscale x 2 x float> @intrinsic_vfwmacc_mask_vf_nxv2f32_f16_nxv2f16(<vscale x 2 x float> %0, half %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vf_nxv2f32_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwmacc.mask.nxv2f32.f16(
@@ -563,10 +563,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwmacc.nxv4f32.f16(
 define <vscale x 4 x float>  @intrinsic_vfwmacc_vf_nxv4f32_f16_nxv4f16(<vscale x 4 x float> %0, half %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vf_nxv4f32_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwmacc.nxv4f32.f16(
@@ -588,10 +588,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwmacc.mask.nxv4f32.f16(
 define <vscale x 4 x float> @intrinsic_vfwmacc_mask_vf_nxv4f32_f16_nxv4f16(<vscale x 4 x float> %0, half %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vf_nxv4f32_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwmacc.mask.nxv4f32.f16(
@@ -613,10 +613,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwmacc.nxv8f32.f16(
 define <vscale x 8 x float>  @intrinsic_vfwmacc_vf_nxv8f32_f16_nxv8f16(<vscale x 8 x float> %0, half %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vf_nxv8f32_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwmacc.nxv8f32.f16(
@@ -638,10 +638,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwmacc.mask.nxv8f32.f16(
 define <vscale x 8 x float> @intrinsic_vfwmacc_mask_vf_nxv8f32_f16_nxv8f16(<vscale x 8 x float> %0, half %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vf_nxv8f32_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwmacc.mask.nxv8f32.f16(
@@ -663,10 +663,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwmacc.nxv16f32.f16(
 define <vscale x 16 x float>  @intrinsic_vfwmacc_vf_nxv16f32_f16_nxv16f16(<vscale x 16 x float> %0, half %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vf_nxv16f32_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwmacc.nxv16f32.f16(
@@ -688,10 +688,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwmacc.mask.nxv16f32.f16(
 define <vscale x 16 x float> @intrinsic_vfwmacc_mask_vf_nxv16f32_f16_nxv16f16(<vscale x 16 x float> %0, half %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vf_nxv16f32_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwmacc.mask.nxv16f32.f16(
@@ -713,10 +713,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwmacc.nxv1f64.f32(
 define <vscale x 1 x double>  @intrinsic_vfwmacc_vf_nxv1f64_f32_nxv1f32(<vscale x 1 x double> %0, float %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vf_nxv1f64_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwmacc.nxv1f64.f32(
@@ -738,10 +738,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwmacc.mask.nxv1f64.f32(
 define <vscale x 1 x double> @intrinsic_vfwmacc_mask_vf_nxv1f64_f32_nxv1f32(<vscale x 1 x double> %0, float %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vf_nxv1f64_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwmacc.mask.nxv1f64.f32(
@@ -763,10 +763,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwmacc.nxv2f64.f32(
 define <vscale x 2 x double>  @intrinsic_vfwmacc_vf_nxv2f64_f32_nxv2f32(<vscale x 2 x double> %0, float %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vf_nxv2f64_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwmacc.nxv2f64.f32(
@@ -788,10 +788,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwmacc.mask.nxv2f64.f32(
 define <vscale x 2 x double> @intrinsic_vfwmacc_mask_vf_nxv2f64_f32_nxv2f32(<vscale x 2 x double> %0, float %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vf_nxv2f64_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwmacc.mask.nxv2f64.f32(
@@ -813,10 +813,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwmacc.nxv4f64.f32(
 define <vscale x 4 x double>  @intrinsic_vfwmacc_vf_nxv4f64_f32_nxv4f32(<vscale x 4 x double> %0, float %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vf_nxv4f64_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwmacc.nxv4f64.f32(
@@ -838,10 +838,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwmacc.mask.nxv4f64.f32(
 define <vscale x 4 x double> @intrinsic_vfwmacc_mask_vf_nxv4f64_f32_nxv4f32(<vscale x 4 x double> %0, float %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vf_nxv4f64_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwmacc.mask.nxv4f64.f32(
@@ -863,10 +863,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwmacc.nxv8f64.f32(
 define <vscale x 8 x double>  @intrinsic_vfwmacc_vf_nxv8f64_f32_nxv8f32(<vscale x 8 x double> %0, float %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_vf_nxv8f64_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwmacc.nxv8f64.f32(
@@ -888,10 +888,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwmacc.mask.nxv8f64.f32(
 define <vscale x 8 x double> @intrinsic_vfwmacc_mask_vf_nxv8f64_f32_nxv8f32(<vscale x 8 x double> %0, float %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmacc_mask_vf_nxv8f64_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmacc.vf v8, fa0, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwmacc.mask.nxv8f64.f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmsac.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmsac.ll
index 103eeb08f8c8..5e3f63b95b2f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwmsac.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwmsac.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwmsac.nxv1f32.nxv1f16(
 define <vscale x 1 x float>  @intrinsic_vfwmsac_vv_nxv1f32_nxv1f16_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vv_nxv1f32_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwmsac.nxv1f32.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1f16(
 define <vscale x 1 x float>  @intrinsic_vfwmsac_mask_vv_nxv1f32_nxv1f16_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vv_nxv1f32_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwmsac.nxv2f32.nxv2f16(
 define <vscale x 2 x float>  @intrinsic_vfwmsac_vv_nxv2f32_nxv2f16_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vv_nxv2f32_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwmsac.nxv2f32.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2f16(
 define <vscale x 2 x float>  @intrinsic_vfwmsac_mask_vv_nxv2f32_nxv2f16_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vv_nxv2f32_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwmsac.nxv4f32.nxv4f16(
 define <vscale x 4 x float>  @intrinsic_vfwmsac_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v10, v11
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwmsac.nxv4f32.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4f16(
 define <vscale x 4 x float>  @intrinsic_vfwmsac_mask_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwmsac.nxv8f32.nxv8f16(
 define <vscale x 8 x float>  @intrinsic_vfwmsac_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v12, v14
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwmsac.nxv8f32.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8f16(
 define <vscale x 8 x float>  @intrinsic_vfwmsac_mask_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v12, v14, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwmsac.nxv16f32.nxv16f16(
 define <vscale x 16 x float>  @intrinsic_vfwmsac_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v16, v20
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwmsac.nxv16f32.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16f16(
 define <vscale x 16 x float>  @intrinsic_vfwmsac_mask_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v16, v20, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16f16(
@@ -263,10 +263,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwmsac.nxv1f64.nxv1f32(
 define <vscale x 1 x double>  @intrinsic_vfwmsac_vv_nxv1f64_nxv1f32_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vv_nxv1f64_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwmsac.nxv1f64.nxv1f32(
@@ -288,10 +288,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwmsac.mask.nxv1f64.nxv1f32(
 define <vscale x 1 x double>  @intrinsic_vfwmsac_mask_vv_nxv1f64_nxv1f32_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vv_nxv1f64_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwmsac.mask.nxv1f64.nxv1f32(
@@ -313,10 +313,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwmsac.nxv2f64.nxv2f32(
 define <vscale x 2 x double>  @intrinsic_vfwmsac_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v10, v11
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwmsac.nxv2f64.nxv2f32(
@@ -338,10 +338,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwmsac.mask.nxv2f64.nxv2f32(
 define <vscale x 2 x double>  @intrinsic_vfwmsac_mask_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwmsac.mask.nxv2f64.nxv2f32(
@@ -363,10 +363,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwmsac.nxv4f64.nxv4f32(
 define <vscale x 4 x double>  @intrinsic_vfwmsac_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v12, v14
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwmsac.nxv4f64.nxv4f32(
@@ -388,10 +388,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwmsac.mask.nxv4f64.nxv4f32(
 define <vscale x 4 x double>  @intrinsic_vfwmsac_mask_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v12, v14, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwmsac.mask.nxv4f64.nxv4f32(
@@ -413,10 +413,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwmsac.nxv8f64.nxv8f32(
 define <vscale x 8 x double>  @intrinsic_vfwmsac_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v16, v20
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwmsac.nxv8f64.nxv8f32(
@@ -438,10 +438,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwmsac.mask.nxv8f64.nxv8f32(
 define <vscale x 8 x double>  @intrinsic_vfwmsac_mask_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vv v8, v16, v20, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwmsac.mask.nxv8f64.nxv8f32(
@@ -463,10 +463,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwmsac.nxv1f32.f16(
 define <vscale x 1 x float>  @intrinsic_vfwmsac_vf_nxv1f32_f16_nxv1f16(<vscale x 1 x float> %0, half %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vf_nxv1f32_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwmsac.nxv1f32.f16(
@@ -488,10 +488,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwmsac.mask.nxv1f32.f16(
 define <vscale x 1 x float> @intrinsic_vfwmsac_mask_vf_nxv1f32_f16_nxv1f16(<vscale x 1 x float> %0, half %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vf_nxv1f32_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwmsac.mask.nxv1f32.f16(
@@ -513,10 +513,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwmsac.nxv2f32.f16(
 define <vscale x 2 x float>  @intrinsic_vfwmsac_vf_nxv2f32_f16_nxv2f16(<vscale x 2 x float> %0, half %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vf_nxv2f32_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwmsac.nxv2f32.f16(
@@ -538,10 +538,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwmsac.mask.nxv2f32.f16(
 define <vscale x 2 x float> @intrinsic_vfwmsac_mask_vf_nxv2f32_f16_nxv2f16(<vscale x 2 x float> %0, half %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vf_nxv2f32_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwmsac.mask.nxv2f32.f16(
@@ -563,10 +563,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwmsac.nxv4f32.f16(
 define <vscale x 4 x float>  @intrinsic_vfwmsac_vf_nxv4f32_f16_nxv4f16(<vscale x 4 x float> %0, half %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vf_nxv4f32_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwmsac.nxv4f32.f16(
@@ -588,10 +588,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwmsac.mask.nxv4f32.f16(
 define <vscale x 4 x float> @intrinsic_vfwmsac_mask_vf_nxv4f32_f16_nxv4f16(<vscale x 4 x float> %0, half %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vf_nxv4f32_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwmsac.mask.nxv4f32.f16(
@@ -613,10 +613,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwmsac.nxv8f32.f16(
 define <vscale x 8 x float>  @intrinsic_vfwmsac_vf_nxv8f32_f16_nxv8f16(<vscale x 8 x float> %0, half %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vf_nxv8f32_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwmsac.nxv8f32.f16(
@@ -638,10 +638,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwmsac.mask.nxv8f32.f16(
 define <vscale x 8 x float> @intrinsic_vfwmsac_mask_vf_nxv8f32_f16_nxv8f16(<vscale x 8 x float> %0, half %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vf_nxv8f32_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwmsac.mask.nxv8f32.f16(
@@ -663,10 +663,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwmsac.nxv16f32.f16(
 define <vscale x 16 x float>  @intrinsic_vfwmsac_vf_nxv16f32_f16_nxv16f16(<vscale x 16 x float> %0, half %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vf_nxv16f32_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwmsac.nxv16f32.f16(
@@ -688,10 +688,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwmsac.mask.nxv16f32.f16(
 define <vscale x 16 x float> @intrinsic_vfwmsac_mask_vf_nxv16f32_f16_nxv16f16(<vscale x 16 x float> %0, half %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vf_nxv16f32_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwmsac.mask.nxv16f32.f16(
@@ -713,10 +713,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwmsac.nxv1f64.f32(
 define <vscale x 1 x double>  @intrinsic_vfwmsac_vf_nxv1f64_f32_nxv1f32(<vscale x 1 x double> %0, float %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vf_nxv1f64_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwmsac.nxv1f64.f32(
@@ -738,10 +738,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwmsac.mask.nxv1f64.f32(
 define <vscale x 1 x double> @intrinsic_vfwmsac_mask_vf_nxv1f64_f32_nxv1f32(<vscale x 1 x double> %0, float %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vf_nxv1f64_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwmsac.mask.nxv1f64.f32(
@@ -763,10 +763,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwmsac.nxv2f64.f32(
 define <vscale x 2 x double>  @intrinsic_vfwmsac_vf_nxv2f64_f32_nxv2f32(<vscale x 2 x double> %0, float %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vf_nxv2f64_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwmsac.nxv2f64.f32(
@@ -788,10 +788,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwmsac.mask.nxv2f64.f32(
 define <vscale x 2 x double> @intrinsic_vfwmsac_mask_vf_nxv2f64_f32_nxv2f32(<vscale x 2 x double> %0, float %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vf_nxv2f64_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwmsac.mask.nxv2f64.f32(
@@ -813,10 +813,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwmsac.nxv4f64.f32(
 define <vscale x 4 x double>  @intrinsic_vfwmsac_vf_nxv4f64_f32_nxv4f32(<vscale x 4 x double> %0, float %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vf_nxv4f64_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwmsac.nxv4f64.f32(
@@ -838,10 +838,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwmsac.mask.nxv4f64.f32(
 define <vscale x 4 x double> @intrinsic_vfwmsac_mask_vf_nxv4f64_f32_nxv4f32(<vscale x 4 x double> %0, float %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vf_nxv4f64_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwmsac.mask.nxv4f64.f32(
@@ -863,10 +863,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwmsac.nxv8f64.f32(
 define <vscale x 8 x double>  @intrinsic_vfwmsac_vf_nxv8f64_f32_nxv8f32(<vscale x 8 x double> %0, float %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_vf_nxv8f64_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwmsac.nxv8f64.f32(
@@ -888,10 +888,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwmsac.mask.nxv8f64.f32(
 define <vscale x 8 x double> @intrinsic_vfwmsac_mask_vf_nxv8f64_f32_nxv8f32(<vscale x 8 x double> %0, float %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmsac_mask_vf_nxv8f64_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmsac.vf v8, fa0, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwmsac.mask.nxv8f64.f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmul.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmul.ll
index 2f9fc24de3aa..bc5759f469ad 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwmul.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwmul.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwmul.nxv1f32.nxv1f16.nxv1f16(
 define <vscale x 1 x float> @intrinsic_vfwmul_vv_nxv1f32_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv1f32_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -39,10 +39,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwmul.mask.nxv1f32.nxv1f16.nxv1f16(
 define <vscale x 1 x float> @intrinsic_vfwmul_mask_vv_nxv1f32_nxv1f16_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vv_nxv1f32_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwmul.mask.nxv1f32.nxv1f16.nxv1f16(
@@ -64,10 +64,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwmul.nxv2f32.nxv2f16.nxv2f16(
 define <vscale x 2 x float> @intrinsic_vfwmul_vv_nxv2f32_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv2f32_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -90,10 +90,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwmul.mask.nxv2f32.nxv2f16.nxv2f16(
 define <vscale x 2 x float> @intrinsic_vfwmul_mask_vv_nxv2f32_nxv2f16_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vv_nxv2f32_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwmul.mask.nxv2f32.nxv2f16.nxv2f16(
@@ -115,10 +115,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwmul.nxv4f32.nxv4f16.nxv4f16(
 define <vscale x 4 x float> @intrinsic_vfwmul_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -141,10 +141,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwmul.mask.nxv4f32.nxv4f16.nxv4f16(
 define <vscale x 4 x float> @intrinsic_vfwmul_mask_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwmul.mask.nxv4f32.nxv4f16.nxv4f16(
@@ -166,10 +166,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwmul.nxv8f32.nxv8f16.nxv8f16(
 define <vscale x 8 x float> @intrinsic_vfwmul_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v12, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -192,10 +192,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwmul.mask.nxv8f32.nxv8f16.nxv8f16(
 define <vscale x 8 x float> @intrinsic_vfwmul_mask_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v8, v12, v14, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwmul.mask.nxv8f32.nxv8f16.nxv8f16(
@@ -217,10 +217,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwmul.nxv16f32.nxv16f16.nxv16f16(
 define <vscale x 16 x float> @intrinsic_vfwmul_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v16, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -243,10 +243,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwmul.mask.nxv16f32.nxv16f16.nxv16f16
 define <vscale x 16 x float> @intrinsic_vfwmul_mask_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v8, v16, v20, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwmul.mask.nxv16f32.nxv16f16.nxv16f16(
@@ -268,10 +268,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwmul.nxv1f64.nxv1f32.nxv1f32(
 define <vscale x 1 x double> @intrinsic_vfwmul_vv_nxv1f64_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv1f64_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -294,10 +294,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwmul.mask.nxv1f64.nxv1f32.nxv1f32(
 define <vscale x 1 x double> @intrinsic_vfwmul_mask_vv_nxv1f64_nxv1f32_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vv_nxv1f64_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwmul.mask.nxv1f64.nxv1f32.nxv1f32(
@@ -319,10 +319,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwmul.nxv2f64.nxv2f32.nxv2f32(
 define <vscale x 2 x double> @intrinsic_vfwmul_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -345,10 +345,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwmul.mask.nxv2f64.nxv2f32.nxv2f32(
 define <vscale x 2 x double> @intrinsic_vfwmul_mask_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwmul.mask.nxv2f64.nxv2f32.nxv2f32(
@@ -370,10 +370,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwmul.nxv4f64.nxv4f32.nxv4f32(
 define <vscale x 4 x double> @intrinsic_vfwmul_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v12, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -396,10 +396,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwmul.mask.nxv4f64.nxv4f32.nxv4f32(
 define <vscale x 4 x double> @intrinsic_vfwmul_mask_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v8, v12, v14, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwmul.mask.nxv4f64.nxv4f32.nxv4f32(
@@ -421,10 +421,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwmul.nxv8f64.nxv8f32.nxv8f32(
 define <vscale x 8 x double> @intrinsic_vfwmul_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v16, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -447,10 +447,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwmul.mask.nxv8f64.nxv8f32.nxv8f32(
 define <vscale x 8 x double> @intrinsic_vfwmul_mask_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vv v8, v16, v20, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwmul.mask.nxv8f64.nxv8f32.nxv8f32(
@@ -472,10 +472,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwmul.nxv1f32.nxv1f16.f16(
 define <vscale x 1 x float> @intrinsic_vfwmul_vf_nxv1f32_nxv1f16_f16(<vscale x 1 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv1f32_nxv1f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v9, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -498,10 +498,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwmul.mask.nxv1f32.nxv1f16.f16(
 define <vscale x 1 x float> @intrinsic_vfwmul_mask_vf_nxv1f32_nxv1f16_f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, half %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vf_nxv1f32_nxv1f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwmul.mask.nxv1f32.nxv1f16.f16(
@@ -523,10 +523,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwmul.nxv2f32.nxv2f16.f16(
 define <vscale x 2 x float> @intrinsic_vfwmul_vf_nxv2f32_nxv2f16_f16(<vscale x 2 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv2f32_nxv2f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v9, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -549,10 +549,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwmul.mask.nxv2f32.nxv2f16.f16(
 define <vscale x 2 x float> @intrinsic_vfwmul_mask_vf_nxv2f32_nxv2f16_f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, half %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vf_nxv2f32_nxv2f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwmul.mask.nxv2f32.nxv2f16.f16(
@@ -574,10 +574,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwmul.nxv4f32.nxv4f16.f16(
 define <vscale x 4 x float> @intrinsic_vfwmul_vf_nxv4f32_nxv4f16_f16(<vscale x 4 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv4f32_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v10, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -600,10 +600,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwmul.mask.nxv4f32.nxv4f16.f16(
 define <vscale x 4 x float> @intrinsic_vfwmul_mask_vf_nxv4f32_nxv4f16_f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, half %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vf_nxv4f32_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwmul.mask.nxv4f32.nxv4f16.f16(
@@ -625,10 +625,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwmul.nxv8f32.nxv8f16.f16(
 define <vscale x 8 x float> @intrinsic_vfwmul_vf_nxv8f32_nxv8f16_f16(<vscale x 8 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv8f32_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v12, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -651,10 +651,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwmul.mask.nxv8f32.nxv8f16.f16(
 define <vscale x 8 x float> @intrinsic_vfwmul_mask_vf_nxv8f32_nxv8f16_f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, half %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vf_nxv8f32_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwmul.mask.nxv8f32.nxv8f16.f16(
@@ -676,10 +676,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwmul.nxv16f32.nxv16f16.f16(
 define <vscale x 16 x float> @intrinsic_vfwmul_vf_nxv16f32_nxv16f16_f16(<vscale x 16 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv16f32_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v16, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -702,10 +702,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwmul.mask.nxv16f32.nxv16f16.f16(
 define <vscale x 16 x float> @intrinsic_vfwmul_mask_vf_nxv16f32_nxv16f16_f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, half %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vf_nxv16f32_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwmul.mask.nxv16f32.nxv16f16.f16(
@@ -727,10 +727,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwmul.nxv1f64.nxv1f32.f32(
 define <vscale x 1 x double> @intrinsic_vfwmul_vf_nxv1f64_nxv1f32_f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv1f64_nxv1f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v9, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -753,10 +753,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwmul.mask.nxv1f64.nxv1f32.f32(
 define <vscale x 1 x double> @intrinsic_vfwmul_mask_vf_nxv1f64_nxv1f32_f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vf_nxv1f64_nxv1f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwmul.mask.nxv1f64.nxv1f32.f32(
@@ -778,10 +778,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwmul.nxv2f64.nxv2f32.f32(
 define <vscale x 2 x double> @intrinsic_vfwmul_vf_nxv2f64_nxv2f32_f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv2f64_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v10, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -804,10 +804,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwmul.mask.nxv2f64.nxv2f32.f32(
 define <vscale x 2 x double> @intrinsic_vfwmul_mask_vf_nxv2f64_nxv2f32_f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vf_nxv2f64_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwmul.mask.nxv2f64.nxv2f32.f32(
@@ -829,10 +829,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwmul.nxv4f64.nxv4f32.f32(
 define <vscale x 4 x double> @intrinsic_vfwmul_vf_nxv4f64_nxv4f32_f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv4f64_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v12, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -855,10 +855,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwmul.mask.nxv4f64.nxv4f32.f32(
 define <vscale x 4 x double> @intrinsic_vfwmul_mask_vf_nxv4f64_nxv4f32_f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vf_nxv4f64_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwmul.mask.nxv4f64.nxv4f32.f32(
@@ -880,10 +880,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwmul.nxv8f64.nxv8f32.f32(
 define <vscale x 8 x double> @intrinsic_vfwmul_vf_nxv8f64_nxv8f32_f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_vf_nxv8f64_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v16, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -906,10 +906,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwmul.mask.nxv8f64.nxv8f32.f32(
 define <vscale x 8 x double> @intrinsic_vfwmul_mask_vf_nxv8f64_nxv8f32_f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwmul_mask_vf_nxv8f64_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwmul.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwmul.mask.nxv8f64.nxv8f32.f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc.ll
index ca2d2a33159b..fc8e15273f08 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwnmacc.nxv1f32.nxv1f16(
 define <vscale x 1 x float>  @intrinsic_vfwnmacc_vv_nxv1f32_nxv1f16_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vv_nxv1f32_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwnmacc.nxv1f32.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1f16(
 define <vscale x 1 x float>  @intrinsic_vfwnmacc_mask_vv_nxv1f32_nxv1f16_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vv_nxv1f32_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwnmacc.nxv2f32.nxv2f16(
 define <vscale x 2 x float>  @intrinsic_vfwnmacc_vv_nxv2f32_nxv2f16_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vv_nxv2f32_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwnmacc.nxv2f32.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2f16(
 define <vscale x 2 x float>  @intrinsic_vfwnmacc_mask_vv_nxv2f32_nxv2f16_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vv_nxv2f32_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwnmacc.nxv4f32.nxv4f16(
 define <vscale x 4 x float>  @intrinsic_vfwnmacc_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v10, v11
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwnmacc.nxv4f32.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4f16(
 define <vscale x 4 x float>  @intrinsic_vfwnmacc_mask_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwnmacc.nxv8f32.nxv8f16(
 define <vscale x 8 x float>  @intrinsic_vfwnmacc_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v12, v14
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwnmacc.nxv8f32.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8f16(
 define <vscale x 8 x float>  @intrinsic_vfwnmacc_mask_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v12, v14, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwnmacc.nxv16f32.nxv16f16(
 define <vscale x 16 x float>  @intrinsic_vfwnmacc_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v16, v20
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwnmacc.nxv16f32.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16f16(
 define <vscale x 16 x float>  @intrinsic_vfwnmacc_mask_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v16, v20, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16f16(
@@ -263,10 +263,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwnmacc.nxv1f64.nxv1f32(
 define <vscale x 1 x double>  @intrinsic_vfwnmacc_vv_nxv1f64_nxv1f32_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vv_nxv1f64_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwnmacc.nxv1f64.nxv1f32(
@@ -288,10 +288,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwnmacc.mask.nxv1f64.nxv1f32(
 define <vscale x 1 x double>  @intrinsic_vfwnmacc_mask_vv_nxv1f64_nxv1f32_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vv_nxv1f64_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwnmacc.mask.nxv1f64.nxv1f32(
@@ -313,10 +313,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwnmacc.nxv2f64.nxv2f32(
 define <vscale x 2 x double>  @intrinsic_vfwnmacc_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v10, v11
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwnmacc.nxv2f64.nxv2f32(
@@ -338,10 +338,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwnmacc.mask.nxv2f64.nxv2f32(
 define <vscale x 2 x double>  @intrinsic_vfwnmacc_mask_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwnmacc.mask.nxv2f64.nxv2f32(
@@ -363,10 +363,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwnmacc.nxv4f64.nxv4f32(
 define <vscale x 4 x double>  @intrinsic_vfwnmacc_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v12, v14
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwnmacc.nxv4f64.nxv4f32(
@@ -388,10 +388,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwnmacc.mask.nxv4f64.nxv4f32(
 define <vscale x 4 x double>  @intrinsic_vfwnmacc_mask_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v12, v14, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwnmacc.mask.nxv4f64.nxv4f32(
@@ -413,10 +413,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwnmacc.nxv8f64.nxv8f32(
 define <vscale x 8 x double>  @intrinsic_vfwnmacc_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v16, v20
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwnmacc.nxv8f64.nxv8f32(
@@ -438,10 +438,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwnmacc.mask.nxv8f64.nxv8f32(
 define <vscale x 8 x double>  @intrinsic_vfwnmacc_mask_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vv v8, v16, v20, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwnmacc.mask.nxv8f64.nxv8f32(
@@ -463,10 +463,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwnmacc.nxv1f32.f16(
 define <vscale x 1 x float>  @intrinsic_vfwnmacc_vf_nxv1f32_f16_nxv1f16(<vscale x 1 x float> %0, half %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vf_nxv1f32_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwnmacc.nxv1f32.f16(
@@ -488,10 +488,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwnmacc.mask.nxv1f32.f16(
 define <vscale x 1 x float> @intrinsic_vfwnmacc_mask_vf_nxv1f32_f16_nxv1f16(<vscale x 1 x float> %0, half %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vf_nxv1f32_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwnmacc.mask.nxv1f32.f16(
@@ -513,10 +513,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwnmacc.nxv2f32.f16(
 define <vscale x 2 x float>  @intrinsic_vfwnmacc_vf_nxv2f32_f16_nxv2f16(<vscale x 2 x float> %0, half %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vf_nxv2f32_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwnmacc.nxv2f32.f16(
@@ -538,10 +538,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwnmacc.mask.nxv2f32.f16(
 define <vscale x 2 x float> @intrinsic_vfwnmacc_mask_vf_nxv2f32_f16_nxv2f16(<vscale x 2 x float> %0, half %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vf_nxv2f32_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwnmacc.mask.nxv2f32.f16(
@@ -563,10 +563,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwnmacc.nxv4f32.f16(
 define <vscale x 4 x float>  @intrinsic_vfwnmacc_vf_nxv4f32_f16_nxv4f16(<vscale x 4 x float> %0, half %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vf_nxv4f32_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwnmacc.nxv4f32.f16(
@@ -588,10 +588,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwnmacc.mask.nxv4f32.f16(
 define <vscale x 4 x float> @intrinsic_vfwnmacc_mask_vf_nxv4f32_f16_nxv4f16(<vscale x 4 x float> %0, half %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vf_nxv4f32_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwnmacc.mask.nxv4f32.f16(
@@ -613,10 +613,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwnmacc.nxv8f32.f16(
 define <vscale x 8 x float>  @intrinsic_vfwnmacc_vf_nxv8f32_f16_nxv8f16(<vscale x 8 x float> %0, half %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vf_nxv8f32_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwnmacc.nxv8f32.f16(
@@ -638,10 +638,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwnmacc.mask.nxv8f32.f16(
 define <vscale x 8 x float> @intrinsic_vfwnmacc_mask_vf_nxv8f32_f16_nxv8f16(<vscale x 8 x float> %0, half %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vf_nxv8f32_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwnmacc.mask.nxv8f32.f16(
@@ -663,10 +663,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwnmacc.nxv16f32.f16(
 define <vscale x 16 x float>  @intrinsic_vfwnmacc_vf_nxv16f32_f16_nxv16f16(<vscale x 16 x float> %0, half %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vf_nxv16f32_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwnmacc.nxv16f32.f16(
@@ -688,10 +688,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwnmacc.mask.nxv16f32.f16(
 define <vscale x 16 x float> @intrinsic_vfwnmacc_mask_vf_nxv16f32_f16_nxv16f16(<vscale x 16 x float> %0, half %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vf_nxv16f32_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwnmacc.mask.nxv16f32.f16(
@@ -713,10 +713,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwnmacc.nxv1f64.f32(
 define <vscale x 1 x double>  @intrinsic_vfwnmacc_vf_nxv1f64_f32_nxv1f32(<vscale x 1 x double> %0, float %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vf_nxv1f64_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwnmacc.nxv1f64.f32(
@@ -738,10 +738,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwnmacc.mask.nxv1f64.f32(
 define <vscale x 1 x double> @intrinsic_vfwnmacc_mask_vf_nxv1f64_f32_nxv1f32(<vscale x 1 x double> %0, float %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vf_nxv1f64_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwnmacc.mask.nxv1f64.f32(
@@ -763,10 +763,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwnmacc.nxv2f64.f32(
 define <vscale x 2 x double>  @intrinsic_vfwnmacc_vf_nxv2f64_f32_nxv2f32(<vscale x 2 x double> %0, float %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vf_nxv2f64_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwnmacc.nxv2f64.f32(
@@ -788,10 +788,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwnmacc.mask.nxv2f64.f32(
 define <vscale x 2 x double> @intrinsic_vfwnmacc_mask_vf_nxv2f64_f32_nxv2f32(<vscale x 2 x double> %0, float %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vf_nxv2f64_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwnmacc.mask.nxv2f64.f32(
@@ -813,10 +813,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwnmacc.nxv4f64.f32(
 define <vscale x 4 x double>  @intrinsic_vfwnmacc_vf_nxv4f64_f32_nxv4f32(<vscale x 4 x double> %0, float %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vf_nxv4f64_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwnmacc.nxv4f64.f32(
@@ -838,10 +838,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwnmacc.mask.nxv4f64.f32(
 define <vscale x 4 x double> @intrinsic_vfwnmacc_mask_vf_nxv4f64_f32_nxv4f32(<vscale x 4 x double> %0, float %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vf_nxv4f64_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwnmacc.mask.nxv4f64.f32(
@@ -863,10 +863,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwnmacc.nxv8f64.f32(
 define <vscale x 8 x double>  @intrinsic_vfwnmacc_vf_nxv8f64_f32_nxv8f32(<vscale x 8 x double> %0, float %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_vf_nxv8f64_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwnmacc.nxv8f64.f32(
@@ -888,10 +888,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwnmacc.mask.nxv8f64.f32(
 define <vscale x 8 x double> @intrinsic_vfwnmacc_mask_vf_nxv8f64_f32_nxv8f32(<vscale x 8 x double> %0, float %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmacc_mask_vf_nxv8f64_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmacc.vf v8, fa0, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwnmacc.mask.nxv8f64.f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac.ll
index 648727dce246..b51faf9082c8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwnmsac.nxv1f32.nxv1f16(
 define <vscale x 1 x float>  @intrinsic_vfwnmsac_vv_nxv1f32_nxv1f16_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vv_nxv1f32_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwnmsac.nxv1f32.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1f16(
 define <vscale x 1 x float>  @intrinsic_vfwnmsac_mask_vv_nxv1f32_nxv1f16_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vv_nxv1f32_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwnmsac.nxv2f32.nxv2f16(
 define <vscale x 2 x float>  @intrinsic_vfwnmsac_vv_nxv2f32_nxv2f16_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vv_nxv2f32_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwnmsac.nxv2f32.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2f16(
 define <vscale x 2 x float>  @intrinsic_vfwnmsac_mask_vv_nxv2f32_nxv2f16_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vv_nxv2f32_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwnmsac.nxv4f32.nxv4f16(
 define <vscale x 4 x float>  @intrinsic_vfwnmsac_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v10, v11
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwnmsac.nxv4f32.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4f16(
 define <vscale x 4 x float>  @intrinsic_vfwnmsac_mask_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwnmsac.nxv8f32.nxv8f16(
 define <vscale x 8 x float>  @intrinsic_vfwnmsac_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v12, v14
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwnmsac.nxv8f32.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8f16(
 define <vscale x 8 x float>  @intrinsic_vfwnmsac_mask_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v12, v14, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwnmsac.nxv16f32.nxv16f16(
 define <vscale x 16 x float>  @intrinsic_vfwnmsac_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v16, v20
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwnmsac.nxv16f32.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16f16(
 define <vscale x 16 x float>  @intrinsic_vfwnmsac_mask_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v16, v20, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16f16(
@@ -263,10 +263,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwnmsac.nxv1f64.nxv1f32(
 define <vscale x 1 x double>  @intrinsic_vfwnmsac_vv_nxv1f64_nxv1f32_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vv_nxv1f64_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwnmsac.nxv1f64.nxv1f32(
@@ -288,10 +288,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwnmsac.mask.nxv1f64.nxv1f32(
 define <vscale x 1 x double>  @intrinsic_vfwnmsac_mask_vv_nxv1f64_nxv1f32_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vv_nxv1f64_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwnmsac.mask.nxv1f64.nxv1f32(
@@ -313,10 +313,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwnmsac.nxv2f64.nxv2f32(
 define <vscale x 2 x double>  @intrinsic_vfwnmsac_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v10, v11
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwnmsac.nxv2f64.nxv2f32(
@@ -338,10 +338,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwnmsac.mask.nxv2f64.nxv2f32(
 define <vscale x 2 x double>  @intrinsic_vfwnmsac_mask_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwnmsac.mask.nxv2f64.nxv2f32(
@@ -363,10 +363,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwnmsac.nxv4f64.nxv4f32(
 define <vscale x 4 x double>  @intrinsic_vfwnmsac_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v12, v14
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwnmsac.nxv4f64.nxv4f32(
@@ -388,10 +388,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwnmsac.mask.nxv4f64.nxv4f32(
 define <vscale x 4 x double>  @intrinsic_vfwnmsac_mask_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v12, v14, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwnmsac.mask.nxv4f64.nxv4f32(
@@ -413,10 +413,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwnmsac.nxv8f64.nxv8f32(
 define <vscale x 8 x double>  @intrinsic_vfwnmsac_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v16, v20
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwnmsac.nxv8f64.nxv8f32(
@@ -438,10 +438,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwnmsac.mask.nxv8f64.nxv8f32(
 define <vscale x 8 x double>  @intrinsic_vfwnmsac_mask_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vv v8, v16, v20, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwnmsac.mask.nxv8f64.nxv8f32(
@@ -463,10 +463,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwnmsac.nxv1f32.f16(
 define <vscale x 1 x float>  @intrinsic_vfwnmsac_vf_nxv1f32_f16_nxv1f16(<vscale x 1 x float> %0, half %1, <vscale x 1 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vf_nxv1f32_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwnmsac.nxv1f32.f16(
@@ -488,10 +488,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwnmsac.mask.nxv1f32.f16(
 define <vscale x 1 x float> @intrinsic_vfwnmsac_mask_vf_nxv1f32_f16_nxv1f16(<vscale x 1 x float> %0, half %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vf_nxv1f32_f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwnmsac.mask.nxv1f32.f16(
@@ -513,10 +513,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwnmsac.nxv2f32.f16(
 define <vscale x 2 x float>  @intrinsic_vfwnmsac_vf_nxv2f32_f16_nxv2f16(<vscale x 2 x float> %0, half %1, <vscale x 2 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vf_nxv2f32_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwnmsac.nxv2f32.f16(
@@ -538,10 +538,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwnmsac.mask.nxv2f32.f16(
 define <vscale x 2 x float> @intrinsic_vfwnmsac_mask_vf_nxv2f32_f16_nxv2f16(<vscale x 2 x float> %0, half %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vf_nxv2f32_f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwnmsac.mask.nxv2f32.f16(
@@ -563,10 +563,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwnmsac.nxv4f32.f16(
 define <vscale x 4 x float>  @intrinsic_vfwnmsac_vf_nxv4f32_f16_nxv4f16(<vscale x 4 x float> %0, half %1, <vscale x 4 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vf_nxv4f32_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwnmsac.nxv4f32.f16(
@@ -588,10 +588,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwnmsac.mask.nxv4f32.f16(
 define <vscale x 4 x float> @intrinsic_vfwnmsac_mask_vf_nxv4f32_f16_nxv4f16(<vscale x 4 x float> %0, half %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vf_nxv4f32_f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwnmsac.mask.nxv4f32.f16(
@@ -613,10 +613,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwnmsac.nxv8f32.f16(
 define <vscale x 8 x float>  @intrinsic_vfwnmsac_vf_nxv8f32_f16_nxv8f16(<vscale x 8 x float> %0, half %1, <vscale x 8 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vf_nxv8f32_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwnmsac.nxv8f32.f16(
@@ -638,10 +638,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwnmsac.mask.nxv8f32.f16(
 define <vscale x 8 x float> @intrinsic_vfwnmsac_mask_vf_nxv8f32_f16_nxv8f16(<vscale x 8 x float> %0, half %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vf_nxv8f32_f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwnmsac.mask.nxv8f32.f16(
@@ -663,10 +663,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwnmsac.nxv16f32.f16(
 define <vscale x 16 x float>  @intrinsic_vfwnmsac_vf_nxv16f32_f16_nxv16f16(<vscale x 16 x float> %0, half %1, <vscale x 16 x half> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vf_nxv16f32_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwnmsac.nxv16f32.f16(
@@ -688,10 +688,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwnmsac.mask.nxv16f32.f16(
 define <vscale x 16 x float> @intrinsic_vfwnmsac_mask_vf_nxv16f32_f16_nxv16f16(<vscale x 16 x float> %0, half %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vf_nxv16f32_f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwnmsac.mask.nxv16f32.f16(
@@ -713,10 +713,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwnmsac.nxv1f64.f32(
 define <vscale x 1 x double>  @intrinsic_vfwnmsac_vf_nxv1f64_f32_nxv1f32(<vscale x 1 x double> %0, float %1, <vscale x 1 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vf_nxv1f64_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwnmsac.nxv1f64.f32(
@@ -738,10 +738,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwnmsac.mask.nxv1f64.f32(
 define <vscale x 1 x double> @intrinsic_vfwnmsac_mask_vf_nxv1f64_f32_nxv1f32(<vscale x 1 x double> %0, float %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vf_nxv1f64_f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwnmsac.mask.nxv1f64.f32(
@@ -763,10 +763,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwnmsac.nxv2f64.f32(
 define <vscale x 2 x double>  @intrinsic_vfwnmsac_vf_nxv2f64_f32_nxv2f32(<vscale x 2 x double> %0, float %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vf_nxv2f64_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwnmsac.nxv2f64.f32(
@@ -788,10 +788,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwnmsac.mask.nxv2f64.f32(
 define <vscale x 2 x double> @intrinsic_vfwnmsac_mask_vf_nxv2f64_f32_nxv2f32(<vscale x 2 x double> %0, float %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vf_nxv2f64_f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwnmsac.mask.nxv2f64.f32(
@@ -813,10 +813,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwnmsac.nxv4f64.f32(
 define <vscale x 4 x double>  @intrinsic_vfwnmsac_vf_nxv4f64_f32_nxv4f32(<vscale x 4 x double> %0, float %1, <vscale x 4 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vf_nxv4f64_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwnmsac.nxv4f64.f32(
@@ -838,10 +838,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwnmsac.mask.nxv4f64.f32(
 define <vscale x 4 x double> @intrinsic_vfwnmsac_mask_vf_nxv4f64_f32_nxv4f32(<vscale x 4 x double> %0, float %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vf_nxv4f64_f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwnmsac.mask.nxv4f64.f32(
@@ -863,10 +863,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwnmsac.nxv8f64.f32(
 define <vscale x 8 x double>  @intrinsic_vfwnmsac_vf_nxv8f64_f32_nxv8f32(<vscale x 8 x double> %0, float %1, <vscale x 8 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_vf_nxv8f64_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwnmsac.nxv8f64.f32(
@@ -888,10 +888,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwnmsac.mask.nxv8f64.f32(
 define <vscale x 8 x double> @intrinsic_vfwnmsac_mask_vf_nxv8f64_f32_nxv8f32(<vscale x 8 x double> %0, float %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwnmsac_mask_vf_nxv8f64_f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwnmsac.vf v8, fa0, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwnmsac.mask.nxv8f64.f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwredosum.ll b/llvm/test/CodeGen/RISCV/rvv/vfwredosum.ll
index 2184ab413c55..cb2bea0b50e1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwredosum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwredosum.ll
@@ -13,10 +13,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredosum.nxv2f32.nxv1f16(
 define <vscale x 2 x float> @intrinsic_vfwredosum_vs_nxv2f32_nxv1f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 1 x half> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_vs_nxv2f32_nxv1f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredosum.nxv2f32.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredosum.mask.nxv2f32.nxv1f16.nxv2f32
 define <vscale x 2 x float> @intrinsic_vfwredosum_mask_vs_nxv2f32_nxv1f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 1 x half> %1, <vscale x 2 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_mask_vs_nxv2f32_nxv1f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredosum.mask.nxv2f32.nxv1f16.nxv2f32(
@@ -63,10 +63,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredosum.nxv2f32.nxv2f16(
 define <vscale x 2 x float> @intrinsic_vfwredosum_vs_nxv2f32_nxv2f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_vs_nxv2f32_nxv2f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredosum.nxv2f32.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredosum.mask.nxv2f32.nxv2f16.nxv2f32
 define <vscale x 2 x float> @intrinsic_vfwredosum_mask_vs_nxv2f32_nxv2f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_mask_vs_nxv2f32_nxv2f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredosum.mask.nxv2f32.nxv2f16.nxv2f32(
@@ -113,10 +113,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredosum.nxv2f32.nxv4f16(
 define <vscale x 2 x float> @intrinsic_vfwredosum_vs_nxv2f32_nxv4f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 4 x half> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_vs_nxv2f32_nxv4f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredosum.nxv2f32.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredosum.mask.nxv2f32.nxv4f16.nxv2f32
 define <vscale x 2 x float> @intrinsic_vfwredosum_mask_vs_nxv2f32_nxv4f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 4 x half> %1, <vscale x 2 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_mask_vs_nxv2f32_nxv4f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredosum.mask.nxv2f32.nxv4f16.nxv2f32(
@@ -163,10 +163,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredosum.nxv2f32.nxv8f16(
 define <vscale x 2 x float> @intrinsic_vfwredosum_vs_nxv2f32_nxv8f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 8 x half> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_vs_nxv2f32_nxv8f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v10, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredosum.nxv2f32.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredosum.mask.nxv2f32.nxv8f16.nxv2f32
 define <vscale x 2 x float> @intrinsic_vfwredosum_mask_vs_nxv2f32_nxv8f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 8 x half> %1, <vscale x 2 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_mask_vs_nxv2f32_nxv8f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v10, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredosum.mask.nxv2f32.nxv8f16.nxv2f32(
@@ -213,10 +213,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredosum.nxv2f32.nxv16f16(
 define <vscale x 2 x float> @intrinsic_vfwredosum_vs_nxv2f32_nxv16f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 16 x half> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_vs_nxv2f32_nxv16f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v12, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredosum.nxv2f32.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredosum.mask.nxv2f32.nxv16f16.nxv2f3
 define <vscale x 2 x float> @intrinsic_vfwredosum_mask_vs_nxv2f32_nxv16f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 16 x half> %1, <vscale x 2 x float> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_mask_vs_nxv2f32_nxv16f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v12, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredosum.mask.nxv2f32.nxv16f16.nxv2f32(
@@ -263,10 +263,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredosum.nxv2f32.nxv32f16(
 define <vscale x 2 x float> @intrinsic_vfwredosum_vs_nxv2f32_nxv32f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 32 x half> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_vs_nxv2f32_nxv32f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v16, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredosum.nxv2f32.nxv32f16(
@@ -288,10 +288,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredosum.mask.nxv2f32.nxv32f16(
 define <vscale x 2 x float> @intrinsic_vfwredosum_mask_vs_nxv2f32_nxv32f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 32 x half> %1, <vscale x 2 x float> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_mask_vs_nxv2f32_nxv32f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v16, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredosum.mask.nxv2f32.nxv32f16(
@@ -313,10 +313,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredosum.nxv1f64.nxv1f32(
 define <vscale x 1 x double> @intrinsic_vfwredosum_vs_nxv1f64_nxv1f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_vs_nxv1f64_nxv1f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredosum.nxv1f64.nxv1f32(
@@ -338,10 +338,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredosum.mask.nxv1f64.nxv1f32.nxv1f6
 define <vscale x 1 x double> @intrinsic_vfwredosum_mask_vs_nxv1f64_nxv1f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_mask_vs_nxv1f64_nxv1f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredosum.mask.nxv1f64.nxv1f32.nxv1f64(
@@ -363,10 +363,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredosum.nxv1f64.nxv2f32(
 define <vscale x 1 x double> @intrinsic_vfwredosum_vs_nxv1f64_nxv2f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 2 x float> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_vs_nxv1f64_nxv2f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredosum.nxv1f64.nxv2f32(
@@ -388,10 +388,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredosum.mask.nxv1f64.nxv2f32.nxv1f6
 define <vscale x 1 x double> @intrinsic_vfwredosum_mask_vs_nxv1f64_nxv2f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 2 x float> %1, <vscale x 1 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_mask_vs_nxv1f64_nxv2f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredosum.mask.nxv1f64.nxv2f32.nxv1f64(
@@ -413,10 +413,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredosum.nxv1f64.nxv4f32(
 define <vscale x 1 x double> @intrinsic_vfwredosum_vs_nxv1f64_nxv4f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 4 x float> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_vs_nxv1f64_nxv4f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v10, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredosum.nxv1f64.nxv4f32(
@@ -438,10 +438,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredosum.mask.nxv1f64.nxv4f32.nxv1f6
 define <vscale x 1 x double> @intrinsic_vfwredosum_mask_vs_nxv1f64_nxv4f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 4 x float> %1, <vscale x 1 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_mask_vs_nxv1f64_nxv4f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v10, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredosum.mask.nxv1f64.nxv4f32.nxv1f64(
@@ -463,10 +463,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredosum.nxv1f64.nxv8f32(
 define <vscale x 1 x double> @intrinsic_vfwredosum_vs_nxv1f64_nxv8f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 8 x float> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_vs_nxv1f64_nxv8f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v12, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredosum.nxv1f64.nxv8f32(
@@ -488,10 +488,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredosum.mask.nxv1f64.nxv8f32.nxv1f6
 define <vscale x 1 x double> @intrinsic_vfwredosum_mask_vs_nxv1f64_nxv8f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 8 x float> %1, <vscale x 1 x double> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_mask_vs_nxv1f64_nxv8f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v12, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredosum.mask.nxv1f64.nxv8f32.nxv1f64(
@@ -513,10 +513,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredosum.nxv1f64.nxv16f32(
 define <vscale x 1 x double> @intrinsic_vfwredosum_vs_nxv1f64_nxv16f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 16 x float> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_vs_nxv1f64_nxv16f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v16, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredosum.nxv1f64.nxv16f32(
@@ -538,10 +538,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredosum.mask.nxv1f64.nxv16f32.nxv1f
 define <vscale x 1 x double> @intrinsic_vfwredosum_mask_vs_nxv1f64_nxv16f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 16 x float> %1, <vscale x 1 x double> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredosum_mask_vs_nxv1f64_nxv16f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredosum.vs v8, v16, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredosum.mask.nxv1f64.nxv16f32.nxv1f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwredusum.ll b/llvm/test/CodeGen/RISCV/rvv/vfwredusum.ll
index d3d76e575978..66c2da047cfa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwredusum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwredusum.ll
@@ -13,10 +13,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredusum.nxv2f32.nxv1f16(
 define <vscale x 2 x float> @intrinsic_vfwredusum_vs_nxv2f32_nxv1f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 1 x half> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_vs_nxv2f32_nxv1f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredusum.nxv2f32.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredusum.mask.nxv2f32.nxv1f16.nxv2f32
 define <vscale x 2 x float> @intrinsic_vfwredusum_mask_vs_nxv2f32_nxv1f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 1 x half> %1, <vscale x 2 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_mask_vs_nxv2f32_nxv1f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredusum.mask.nxv2f32.nxv1f16.nxv2f32(
@@ -63,10 +63,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredusum.nxv2f32.nxv2f16(
 define <vscale x 2 x float> @intrinsic_vfwredusum_vs_nxv2f32_nxv2f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_vs_nxv2f32_nxv2f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredusum.nxv2f32.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredusum.mask.nxv2f32.nxv2f16.nxv2f32
 define <vscale x 2 x float> @intrinsic_vfwredusum_mask_vs_nxv2f32_nxv2f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_mask_vs_nxv2f32_nxv2f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredusum.mask.nxv2f32.nxv2f16.nxv2f32(
@@ -113,10 +113,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredusum.nxv2f32.nxv4f16(
 define <vscale x 2 x float> @intrinsic_vfwredusum_vs_nxv2f32_nxv4f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 4 x half> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_vs_nxv2f32_nxv4f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredusum.nxv2f32.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredusum.mask.nxv2f32.nxv4f16.nxv2f32
 define <vscale x 2 x float> @intrinsic_vfwredusum_mask_vs_nxv2f32_nxv4f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 4 x half> %1, <vscale x 2 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_mask_vs_nxv2f32_nxv4f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredusum.mask.nxv2f32.nxv4f16.nxv2f32(
@@ -163,10 +163,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredusum.nxv2f32.nxv8f16(
 define <vscale x 2 x float> @intrinsic_vfwredusum_vs_nxv2f32_nxv8f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 8 x half> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_vs_nxv2f32_nxv8f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v10, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredusum.nxv2f32.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredusum.mask.nxv2f32.nxv8f16.nxv2f32
 define <vscale x 2 x float> @intrinsic_vfwredusum_mask_vs_nxv2f32_nxv8f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 8 x half> %1, <vscale x 2 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_mask_vs_nxv2f32_nxv8f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v10, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredusum.mask.nxv2f32.nxv8f16.nxv2f32(
@@ -213,10 +213,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredusum.nxv2f32.nxv16f16(
 define <vscale x 2 x float> @intrinsic_vfwredusum_vs_nxv2f32_nxv16f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 16 x half> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_vs_nxv2f32_nxv16f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v12, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredusum.nxv2f32.nxv16f16(
@@ -238,10 +238,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredusum.mask.nxv2f32.nxv16f16.nxv2f3
 define <vscale x 2 x float> @intrinsic_vfwredusum_mask_vs_nxv2f32_nxv16f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 16 x half> %1, <vscale x 2 x float> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_mask_vs_nxv2f32_nxv16f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v12, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredusum.mask.nxv2f32.nxv16f16.nxv2f32(
@@ -263,10 +263,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredusum.nxv2f32.nxv32f16(
 define <vscale x 2 x float> @intrinsic_vfwredusum_vs_nxv2f32_nxv32f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 32 x half> %1, <vscale x 2 x float> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_vs_nxv2f32_nxv32f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v16, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredusum.nxv2f32.nxv32f16(
@@ -288,10 +288,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwredusum.mask.nxv2f32.nxv32f16(
 define <vscale x 2 x float> @intrinsic_vfwredusum_mask_vs_nxv2f32_nxv32f16_nxv2f32(<vscale x 2 x float> %0, <vscale x 32 x half> %1, <vscale x 2 x float> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_mask_vs_nxv2f32_nxv32f16_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v16, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwredusum.mask.nxv2f32.nxv32f16(
@@ -313,10 +313,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredusum.nxv1f64.nxv1f32(
 define <vscale x 1 x double> @intrinsic_vfwredusum_vs_nxv1f64_nxv1f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_vs_nxv1f64_nxv1f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredusum.nxv1f64.nxv1f32(
@@ -338,10 +338,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredusum.mask.nxv1f64.nxv1f32.nxv1f6
 define <vscale x 1 x double> @intrinsic_vfwredusum_mask_vs_nxv1f64_nxv1f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x double> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_mask_vs_nxv1f64_nxv1f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredusum.mask.nxv1f64.nxv1f32.nxv1f64(
@@ -363,10 +363,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredusum.nxv1f64.nxv2f32(
 define <vscale x 1 x double> @intrinsic_vfwredusum_vs_nxv1f64_nxv2f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 2 x float> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_vs_nxv1f64_nxv2f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v9, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredusum.nxv1f64.nxv2f32(
@@ -388,10 +388,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredusum.mask.nxv1f64.nxv2f32.nxv1f6
 define <vscale x 1 x double> @intrinsic_vfwredusum_mask_vs_nxv1f64_nxv2f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 2 x float> %1, <vscale x 1 x double> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_mask_vs_nxv1f64_nxv2f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredusum.mask.nxv1f64.nxv2f32.nxv1f64(
@@ -413,10 +413,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredusum.nxv1f64.nxv4f32(
 define <vscale x 1 x double> @intrinsic_vfwredusum_vs_nxv1f64_nxv4f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 4 x float> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_vs_nxv1f64_nxv4f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v10, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredusum.nxv1f64.nxv4f32(
@@ -438,10 +438,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredusum.mask.nxv1f64.nxv4f32.nxv1f6
 define <vscale x 1 x double> @intrinsic_vfwredusum_mask_vs_nxv1f64_nxv4f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 4 x float> %1, <vscale x 1 x double> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_mask_vs_nxv1f64_nxv4f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v10, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredusum.mask.nxv1f64.nxv4f32.nxv1f64(
@@ -463,10 +463,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredusum.nxv1f64.nxv8f32(
 define <vscale x 1 x double> @intrinsic_vfwredusum_vs_nxv1f64_nxv8f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 8 x float> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_vs_nxv1f64_nxv8f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v12, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredusum.nxv1f64.nxv8f32(
@@ -488,10 +488,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredusum.mask.nxv1f64.nxv8f32.nxv1f6
 define <vscale x 1 x double> @intrinsic_vfwredusum_mask_vs_nxv1f64_nxv8f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 8 x float> %1, <vscale x 1 x double> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_mask_vs_nxv1f64_nxv8f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v12, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredusum.mask.nxv1f64.nxv8f32.nxv1f64(
@@ -513,10 +513,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredusum.nxv1f64.nxv16f32(
 define <vscale x 1 x double> @intrinsic_vfwredusum_vs_nxv1f64_nxv16f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 16 x float> %1, <vscale x 1 x double> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_vs_nxv1f64_nxv16f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v16, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredusum.nxv1f64.nxv16f32(
@@ -538,10 +538,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwredusum.mask.nxv1f64.nxv16f32.nxv1f
 define <vscale x 1 x double> @intrinsic_vfwredusum_mask_vs_nxv1f64_nxv16f32_nxv1f64(<vscale x 1 x double> %0, <vscale x 16 x float> %1, <vscale x 1 x double> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwredusum_mask_vs_nxv1f64_nxv16f32_nxv1f64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwredusum.vs v8, v16, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwredusum.mask.nxv1f64.nxv16f32.nxv1f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwsub.ll b/llvm/test/CodeGen/RISCV/rvv/vfwsub.ll
index bb72f70f111b..0e3e5f8aabfd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwsub.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwsub.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwsub.nxv1f32.nxv1f16.nxv1f16(
 define <vscale x 1 x float> @intrinsic_vfwsub_vv_nxv1f32_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv1f32_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -39,10 +39,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwsub.mask.nxv1f32.nxv1f16.nxv1f16(
 define <vscale x 1 x float> @intrinsic_vfwsub_mask_vv_nxv1f32_nxv1f16_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vv_nxv1f32_nxv1f16_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.mask.nxv1f32.nxv1f16.nxv1f16(
@@ -64,10 +64,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwsub.nxv2f32.nxv2f16.nxv2f16(
 define <vscale x 2 x float> @intrinsic_vfwsub_vv_nxv2f32_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv2f32_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -90,10 +90,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwsub.mask.nxv2f32.nxv2f16.nxv2f16(
 define <vscale x 2 x float> @intrinsic_vfwsub_mask_vv_nxv2f32_nxv2f16_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vv_nxv2f32_nxv2f16_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.mask.nxv2f32.nxv2f16.nxv2f16(
@@ -115,10 +115,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwsub.nxv4f32.nxv4f16.nxv4f16(
 define <vscale x 4 x float> @intrinsic_vfwsub_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -141,10 +141,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwsub.mask.nxv4f32.nxv4f16.nxv4f16(
 define <vscale x 4 x float> @intrinsic_vfwsub_mask_vv_nxv4f32_nxv4f16_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vv_nxv4f32_nxv4f16_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.mask.nxv4f32.nxv4f16.nxv4f16(
@@ -166,10 +166,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwsub.nxv8f32.nxv8f16.nxv8f16(
 define <vscale x 8 x float> @intrinsic_vfwsub_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v12, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -192,10 +192,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwsub.mask.nxv8f32.nxv8f16.nxv8f16(
 define <vscale x 8 x float> @intrinsic_vfwsub_mask_vv_nxv8f32_nxv8f16_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vv_nxv8f32_nxv8f16_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v8, v12, v14, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.mask.nxv8f32.nxv8f16.nxv8f16(
@@ -217,10 +217,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwsub.nxv16f32.nxv16f16.nxv16f16(
 define <vscale x 16 x float> @intrinsic_vfwsub_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v16, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -243,10 +243,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwsub.mask.nxv16f32.nxv16f16.nxv16f16
 define <vscale x 16 x float> @intrinsic_vfwsub_mask_vv_nxv16f32_nxv16f16_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vv_nxv16f32_nxv16f16_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v8, v16, v20, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.mask.nxv16f32.nxv16f16.nxv16f16(
@@ -268,10 +268,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwsub.nxv1f64.nxv1f32.nxv1f32(
 define <vscale x 1 x double> @intrinsic_vfwsub_vv_nxv1f64_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv1f64_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -294,10 +294,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwsub.mask.nxv1f64.nxv1f32.nxv1f32(
 define <vscale x 1 x double> @intrinsic_vfwsub_mask_vv_nxv1f64_nxv1f32_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vv_nxv1f64_nxv1f32_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwsub.mask.nxv1f64.nxv1f32.nxv1f32(
@@ -319,10 +319,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwsub.nxv2f64.nxv2f32.nxv2f32(
 define <vscale x 2 x double> @intrinsic_vfwsub_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v10, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -345,10 +345,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwsub.mask.nxv2f64.nxv2f32.nxv2f32(
 define <vscale x 2 x double> @intrinsic_vfwsub_mask_vv_nxv2f64_nxv2f32_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vv_nxv2f64_nxv2f32_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwsub.mask.nxv2f64.nxv2f32.nxv2f32(
@@ -370,10 +370,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwsub.nxv4f64.nxv4f32.nxv4f32(
 define <vscale x 4 x double> @intrinsic_vfwsub_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v12, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -396,10 +396,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwsub.mask.nxv4f64.nxv4f32.nxv4f32(
 define <vscale x 4 x double> @intrinsic_vfwsub_mask_vv_nxv4f64_nxv4f32_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vv_nxv4f64_nxv4f32_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v8, v12, v14, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwsub.mask.nxv4f64.nxv4f32.nxv4f32(
@@ -421,10 +421,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwsub.nxv8f64.nxv8f32.nxv8f32(
 define <vscale x 8 x double> @intrinsic_vfwsub_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v16, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -447,10 +447,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwsub.mask.nxv8f64.nxv8f32.nxv8f32(
 define <vscale x 8 x double> @intrinsic_vfwsub_mask_vv_nxv8f64_nxv8f32_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vv_nxv8f64_nxv8f32_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vv v8, v16, v20, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwsub.mask.nxv8f64.nxv8f32.nxv8f32(
@@ -472,10 +472,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwsub.nxv1f32.nxv1f16.f16(
 define <vscale x 1 x float> @intrinsic_vfwsub_vf_nxv1f32_nxv1f16_f16(<vscale x 1 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv1f32_nxv1f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v9, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -498,10 +498,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwsub.mask.nxv1f32.nxv1f16.f16(
 define <vscale x 1 x float> @intrinsic_vfwsub_mask_vf_nxv1f32_nxv1f16_f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, half %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vf_nxv1f32_nxv1f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.mask.nxv1f32.nxv1f16.f16(
@@ -523,10 +523,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwsub.nxv2f32.nxv2f16.f16(
 define <vscale x 2 x float> @intrinsic_vfwsub_vf_nxv2f32_nxv2f16_f16(<vscale x 2 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv2f32_nxv2f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v9, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -549,10 +549,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwsub.mask.nxv2f32.nxv2f16.f16(
 define <vscale x 2 x float> @intrinsic_vfwsub_mask_vf_nxv2f32_nxv2f16_f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, half %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vf_nxv2f32_nxv2f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.mask.nxv2f32.nxv2f16.f16(
@@ -574,10 +574,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwsub.nxv4f32.nxv4f16.f16(
 define <vscale x 4 x float> @intrinsic_vfwsub_vf_nxv4f32_nxv4f16_f16(<vscale x 4 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv4f32_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v10, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -600,10 +600,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwsub.mask.nxv4f32.nxv4f16.f16(
 define <vscale x 4 x float> @intrinsic_vfwsub_mask_vf_nxv4f32_nxv4f16_f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, half %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vf_nxv4f32_nxv4f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.mask.nxv4f32.nxv4f16.f16(
@@ -625,10 +625,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwsub.nxv8f32.nxv8f16.f16(
 define <vscale x 8 x float> @intrinsic_vfwsub_vf_nxv8f32_nxv8f16_f16(<vscale x 8 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv8f32_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v12, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -651,10 +651,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwsub.mask.nxv8f32.nxv8f16.f16(
 define <vscale x 8 x float> @intrinsic_vfwsub_mask_vf_nxv8f32_nxv8f16_f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, half %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vf_nxv8f32_nxv8f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.mask.nxv8f32.nxv8f16.f16(
@@ -676,10 +676,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwsub.nxv16f32.nxv16f16.f16(
 define <vscale x 16 x float> @intrinsic_vfwsub_vf_nxv16f32_nxv16f16_f16(<vscale x 16 x half> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv16f32_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v16, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -702,10 +702,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwsub.mask.nxv16f32.nxv16f16.f16(
 define <vscale x 16 x float> @intrinsic_vfwsub_mask_vf_nxv16f32_nxv16f16_f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, half %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vf_nxv16f32_nxv16f16_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.mask.nxv16f32.nxv16f16.f16(
@@ -727,10 +727,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwsub.nxv1f64.nxv1f32.f32(
 define <vscale x 1 x double> @intrinsic_vfwsub_vf_nxv1f64_nxv1f32_f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv1f64_nxv1f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v9, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -753,10 +753,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwsub.mask.nxv1f64.nxv1f32.f32(
 define <vscale x 1 x double> @intrinsic_vfwsub_mask_vf_nxv1f64_nxv1f32_f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vf_nxv1f64_nxv1f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwsub.mask.nxv1f64.nxv1f32.f32(
@@ -778,10 +778,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwsub.nxv2f64.nxv2f32.f32(
 define <vscale x 2 x double> @intrinsic_vfwsub_vf_nxv2f64_nxv2f32_f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv2f64_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v10, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -804,10 +804,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwsub.mask.nxv2f64.nxv2f32.f32(
 define <vscale x 2 x double> @intrinsic_vfwsub_mask_vf_nxv2f64_nxv2f32_f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vf_nxv2f64_nxv2f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwsub.mask.nxv2f64.nxv2f32.f32(
@@ -829,10 +829,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwsub.nxv4f64.nxv4f32.f32(
 define <vscale x 4 x double> @intrinsic_vfwsub_vf_nxv4f64_nxv4f32_f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv4f64_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v12, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -855,10 +855,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwsub.mask.nxv4f64.nxv4f32.f32(
 define <vscale x 4 x double> @intrinsic_vfwsub_mask_vf_nxv4f64_nxv4f32_f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vf_nxv4f64_nxv4f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwsub.mask.nxv4f64.nxv4f32.f32(
@@ -880,10 +880,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwsub.nxv8f64.nxv8f32.f32(
 define <vscale x 8 x double> @intrinsic_vfwsub_vf_nxv8f64_nxv8f32_f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_vf_nxv8f64_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v16, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -906,10 +906,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwsub.mask.nxv8f64.nxv8f32.f32(
 define <vscale x 8 x double> @intrinsic_vfwsub_mask_vf_nxv8f64_nxv8f32_f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub_mask_vf_nxv8f64_nxv8f32_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.vf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwsub.mask.nxv8f64.nxv8f32.f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwsub.w.ll b/llvm/test/CodeGen/RISCV/rvv/vfwsub.w.ll
index 722fed5138f7..90f92226dcdd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwsub.w.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwsub.w.ll
@@ -13,10 +13,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwsub.w.nxv1f32.nxv1f16(
 define <vscale x 1 x float> @intrinsic_vfwsub.w_wv_nxv1f32_nxv1f32_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_nxv1f32_nxv1f32_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.w.nxv1f32.nxv1f16(
@@ -38,10 +38,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1f16(
 define <vscale x 1 x float> @intrinsic_vfwsub.w_mask_wv_nxv1f32_nxv1f32_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_nxv1f32_nxv1f32_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1f16(
@@ -63,10 +63,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwsub.w.nxv2f32.nxv2f16(
 define <vscale x 2 x float> @intrinsic_vfwsub.w_wv_nxv2f32_nxv2f32_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_nxv2f32_nxv2f32_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.w.nxv2f32.nxv2f16(
@@ -88,10 +88,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2f16(
 define <vscale x 2 x float> @intrinsic_vfwsub.w_mask_wv_nxv2f32_nxv2f32_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x half> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_nxv2f32_nxv2f32_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2f16(
@@ -113,10 +113,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwsub.w.nxv4f32.nxv4f16(
 define <vscale x 4 x float> @intrinsic_vfwsub.w_wv_nxv4f32_nxv4f32_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_nxv4f32_nxv4f32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.w.nxv4f32.nxv4f16(
@@ -138,10 +138,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4f16(
 define <vscale x 4 x float> @intrinsic_vfwsub.w_mask_wv_nxv4f32_nxv4f32_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x half> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_nxv4f32_nxv4f32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4f16(
@@ -163,10 +163,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwsub.w.nxv8f32.nxv8f16(
 define <vscale x 8 x float> @intrinsic_vfwsub.w_wv_nxv8f32_nxv8f32_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_nxv8f32_nxv8f32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.w.nxv8f32.nxv8f16(
@@ -188,10 +188,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8f16(
 define <vscale x 8 x float> @intrinsic_vfwsub.w_mask_wv_nxv8f32_nxv8f32_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x half> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_nxv8f32_nxv8f32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8f16(
@@ -213,10 +213,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwsub.w.nxv16f32.nxv16f16(
 define <vscale x 16 x float> @intrinsic_vfwsub.w_wv_nxv16f32_nxv16f32_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_nxv16f32_nxv16f32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.w.nxv16f32.nxv16f16(
@@ -239,8 +239,8 @@ define <vscale x 16 x float> @intrinsic_vfwsub.w_mask_wv_nxv16f32_nxv16f32_nxv16
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_nxv16f32_nxv16f32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl4re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    vfwsub.wv v8, v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -264,10 +264,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwsub.w.nxv1f64.nxv1f32(
 define <vscale x 1 x double> @intrinsic_vfwsub.w_wv_nxv1f64_nxv1f64_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_nxv1f64_nxv1f64_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v9
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwsub.w.nxv1f64.nxv1f32(
@@ -289,10 +289,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwsub.w.mask.nxv1f64.nxv1f32(
 define <vscale x 1 x double> @intrinsic_vfwsub.w_mask_wv_nxv1f64_nxv1f64_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x float> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_nxv1f64_nxv1f64_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v9, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwsub.w.mask.nxv1f64.nxv1f32(
@@ -314,10 +314,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwsub.w.nxv2f64.nxv2f32(
 define <vscale x 2 x double> @intrinsic_vfwsub.w_wv_nxv2f64_nxv2f64_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_nxv2f64_nxv2f64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v10
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwsub.w.nxv2f64.nxv2f32(
@@ -339,10 +339,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwsub.w.mask.nxv2f64.nxv2f32(
 define <vscale x 2 x double> @intrinsic_vfwsub.w_mask_wv_nxv2f64_nxv2f64_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x float> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_nxv2f64_nxv2f64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v10, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwsub.w.mask.nxv2f64.nxv2f32(
@@ -364,10 +364,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwsub.w.nxv4f64.nxv4f32(
 define <vscale x 4 x double> @intrinsic_vfwsub.w_wv_nxv4f64_nxv4f64_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_nxv4f64_nxv4f64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v12
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwsub.w.nxv4f64.nxv4f32(
@@ -389,10 +389,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwsub.w.mask.nxv4f64.nxv4f32(
 define <vscale x 4 x double> @intrinsic_vfwsub.w_mask_wv_nxv4f64_nxv4f64_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x float> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_nxv4f64_nxv4f64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v12, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwsub.w.mask.nxv4f64.nxv4f32(
@@ -414,10 +414,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwsub.w.nxv8f64.nxv8f32(
 define <vscale x 8 x double> @intrinsic_vfwsub.w_wv_nxv8f64_nxv8f64_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_nxv8f64_nxv8f64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v16
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwsub.w.nxv8f64.nxv8f32(
@@ -440,8 +440,8 @@ define <vscale x 8 x double> @intrinsic_vfwsub.w_mask_wv_nxv8f64_nxv8f64_nxv8f32
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_nxv8f64_nxv8f64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl4re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    vfwsub.wv v8, v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -465,10 +465,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwsub.w.nxv1f32.f16(
 define <vscale x 1 x float> @intrinsic_vfwsub.w_wf_nxv1f32_nxv1f32_f16(<vscale x 1 x float> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wf_nxv1f32_nxv1f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.w.nxv1f32.f16(
@@ -490,10 +490,10 @@ declare <vscale x 1 x float> @llvm.riscv.vfwsub.w.mask.nxv1f32.f16(
 define <vscale x 1 x float> @intrinsic_vfwsub.w_mask_wf_nxv1f32_nxv1f32_f16(<vscale x 1 x float> %0, <vscale x 1 x float> %1, half %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_nxv1f32_nxv1f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.w.mask.nxv1f32.f16(
@@ -515,10 +515,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwsub.w.nxv2f32.f16(
 define <vscale x 2 x float> @intrinsic_vfwsub.w_wf_nxv2f32_nxv2f32_f16(<vscale x 2 x float> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wf_nxv2f32_nxv2f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.w.nxv2f32.f16(
@@ -540,10 +540,10 @@ declare <vscale x 2 x float> @llvm.riscv.vfwsub.w.mask.nxv2f32.f16(
 define <vscale x 2 x float> @intrinsic_vfwsub.w_mask_wf_nxv2f32_nxv2f32_f16(<vscale x 2 x float> %0, <vscale x 2 x float> %1, half %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_nxv2f32_nxv2f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.w.mask.nxv2f32.f16(
@@ -565,10 +565,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwsub.w.nxv4f32.f16(
 define <vscale x 4 x float> @intrinsic_vfwsub.w_wf_nxv4f32_nxv4f32_f16(<vscale x 4 x float> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wf_nxv4f32_nxv4f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.w.nxv4f32.f16(
@@ -590,10 +590,10 @@ declare <vscale x 4 x float> @llvm.riscv.vfwsub.w.mask.nxv4f32.f16(
 define <vscale x 4 x float> @intrinsic_vfwsub.w_mask_wf_nxv4f32_nxv4f32_f16(<vscale x 4 x float> %0, <vscale x 4 x float> %1, half %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_nxv4f32_nxv4f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.w.mask.nxv4f32.f16(
@@ -615,10 +615,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwsub.w.nxv8f32.f16(
 define <vscale x 8 x float> @intrinsic_vfwsub.w_wf_nxv8f32_nxv8f32_f16(<vscale x 8 x float> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wf_nxv8f32_nxv8f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.w.nxv8f32.f16(
@@ -640,10 +640,10 @@ declare <vscale x 8 x float> @llvm.riscv.vfwsub.w.mask.nxv8f32.f16(
 define <vscale x 8 x float> @intrinsic_vfwsub.w_mask_wf_nxv8f32_nxv8f32_f16(<vscale x 8 x float> %0, <vscale x 8 x float> %1, half %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_nxv8f32_nxv8f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.w.mask.nxv8f32.f16(
@@ -665,10 +665,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwsub.w.nxv16f32.f16(
 define <vscale x 16 x float> @intrinsic_vfwsub.w_wf_nxv16f32_nxv16f32_f16(<vscale x 16 x float> %0, half %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wf_nxv16f32_nxv16f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.w.nxv16f32.f16(
@@ -690,10 +690,10 @@ declare <vscale x 16 x float> @llvm.riscv.vfwsub.w.mask.nxv16f32.f16(
 define <vscale x 16 x float> @intrinsic_vfwsub.w_mask_wf_nxv16f32_nxv16f32_f16(<vscale x 16 x float> %0, <vscale x 16 x float> %1, half %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_nxv16f32_nxv16f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.w.mask.nxv16f32.f16(
@@ -715,10 +715,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwsub.w.nxv1f64.f32(
 define <vscale x 1 x double> @intrinsic_vfwsub.w_wf_nxv1f64_nxv1f64_f32(<vscale x 1 x double> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wf_nxv1f64_nxv1f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwsub.w.nxv1f64.f32(
@@ -740,10 +740,10 @@ declare <vscale x 1 x double> @llvm.riscv.vfwsub.w.mask.nxv1f64.f32(
 define <vscale x 1 x double> @intrinsic_vfwsub.w_mask_wf_nxv1f64_nxv1f64_f32(<vscale x 1 x double> %0, <vscale x 1 x double> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_nxv1f64_nxv1f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v9, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwsub.w.mask.nxv1f64.f32(
@@ -765,10 +765,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwsub.w.nxv2f64.f32(
 define <vscale x 2 x double> @intrinsic_vfwsub.w_wf_nxv2f64_nxv2f64_f32(<vscale x 2 x double> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wf_nxv2f64_nxv2f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwsub.w.nxv2f64.f32(
@@ -790,10 +790,10 @@ declare <vscale x 2 x double> @llvm.riscv.vfwsub.w.mask.nxv2f64.f32(
 define <vscale x 2 x double> @intrinsic_vfwsub.w_mask_wf_nxv2f64_nxv2f64_f32(<vscale x 2 x double> %0, <vscale x 2 x double> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_nxv2f64_nxv2f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v10, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwsub.w.mask.nxv2f64.f32(
@@ -815,10 +815,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwsub.w.nxv4f64.f32(
 define <vscale x 4 x double> @intrinsic_vfwsub.w_wf_nxv4f64_nxv4f64_f32(<vscale x 4 x double> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wf_nxv4f64_nxv4f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwsub.w.nxv4f64.f32(
@@ -840,10 +840,10 @@ declare <vscale x 4 x double> @llvm.riscv.vfwsub.w.mask.nxv4f64.f32(
 define <vscale x 4 x double> @intrinsic_vfwsub.w_mask_wf_nxv4f64_nxv4f64_f32(<vscale x 4 x double> %0, <vscale x 4 x double> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_nxv4f64_nxv4f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v12, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwsub.w.mask.nxv4f64.f32(
@@ -865,10 +865,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwsub.w.nxv8f64.f32(
 define <vscale x 8 x double> @intrinsic_vfwsub.w_wf_nxv8f64_nxv8f64_f32(<vscale x 8 x double> %0, float %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wf_nxv8f64_nxv8f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwsub.w.nxv8f64.f32(
@@ -890,10 +890,10 @@ declare <vscale x 8 x double> @llvm.riscv.vfwsub.w.mask.nxv8f64.f32(
 define <vscale x 8 x double> @intrinsic_vfwsub.w_mask_wf_nxv8f64_nxv8f64_f32(<vscale x 8 x double> %0, <vscale x 8 x double> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_nxv8f64_nxv8f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v16, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwsub.w.mask.nxv8f64.f32(
@@ -909,10 +909,10 @@ entry:
 define <vscale x 1 x float> @intrinsic_vfwsub.w_mask_wv_tie_nxv1f32_nxv1f32_nxv1f16(<vscale x 1 x float> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_tie_nxv1f32_nxv1f32_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1f16(
@@ -928,10 +928,10 @@ entry:
 define <vscale x 2 x float> @intrinsic_vfwsub.w_mask_wv_tie_nxv2f32_nxv2f32_nxv2f16(<vscale x 2 x float> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_tie_nxv2f32_nxv2f32_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2f16(
@@ -947,10 +947,10 @@ entry:
 define <vscale x 4 x float> @intrinsic_vfwsub.w_mask_wv_tie_nxv4f32_nxv4f32_nxv4f16(<vscale x 4 x float> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_tie_nxv4f32_nxv4f32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4f16(
@@ -966,10 +966,10 @@ entry:
 define <vscale x 8 x float> @intrinsic_vfwsub.w_mask_wv_tie_nxv8f32_nxv8f32_nxv8f16(<vscale x 8 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_tie_nxv8f32_nxv8f32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8f16(
@@ -985,10 +985,10 @@ entry:
 define <vscale x 16 x float> @intrinsic_vfwsub.w_mask_wv_tie_nxv16f32_nxv16f32_nxv16f16(<vscale x 16 x float> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_tie_nxv16f32_nxv16f32_nxv16f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16f16(
@@ -1004,10 +1004,10 @@ entry:
 define <vscale x 1 x double> @intrinsic_vfwsub.w_mask_wv_tie_nxv1f64_nxv1f64_nxv1f32(<vscale x 1 x double> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_tie_nxv1f64_nxv1f64_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v9, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwsub.w.mask.nxv1f64.nxv1f32(
@@ -1023,10 +1023,10 @@ entry:
 define <vscale x 2 x double> @intrinsic_vfwsub.w_mask_wv_tie_nxv2f64_nxv2f64_nxv2f32(<vscale x 2 x double> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_tie_nxv2f64_nxv2f64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v10, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwsub.w.mask.nxv2f64.nxv2f32(
@@ -1042,10 +1042,10 @@ entry:
 define <vscale x 4 x double> @intrinsic_vfwsub.w_mask_wv_tie_nxv4f64_nxv4f64_nxv4f32(<vscale x 4 x double> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_tie_nxv4f64_nxv4f64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v12, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwsub.w.mask.nxv4f64.nxv4f32(
@@ -1061,10 +1061,10 @@ entry:
 define <vscale x 8 x double> @intrinsic_vfwsub.w_mask_wv_tie_nxv8f64_nxv8f64_nxv8f32(<vscale x 8 x double> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wv_tie_nxv8f64_nxv8f64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v8, v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwsub.w.mask.nxv8f64.nxv8f32(
@@ -1080,10 +1080,10 @@ entry:
 define <vscale x 1 x float> @intrinsic_vfwsub.w_mask_wf_tie_nxv1f32_nxv1f32_f16(<vscale x 1 x float> %0, half %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_tie_nxv1f32_nxv1f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfwsub.w.mask.nxv1f32.f16(
@@ -1099,10 +1099,10 @@ entry:
 define <vscale x 2 x float> @intrinsic_vfwsub.w_mask_wf_tie_nxv2f32_nxv2f32_f16(<vscale x 2 x float> %0, half %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_tie_nxv2f32_nxv2f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x float> @llvm.riscv.vfwsub.w.mask.nxv2f32.f16(
@@ -1118,10 +1118,10 @@ entry:
 define <vscale x 4 x float> @intrinsic_vfwsub.w_mask_wf_tie_nxv4f32_nxv4f32_f16(<vscale x 4 x float> %0, half %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_tie_nxv4f32_nxv4f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x float> @llvm.riscv.vfwsub.w.mask.nxv4f32.f16(
@@ -1137,10 +1137,10 @@ entry:
 define <vscale x 8 x float> @intrinsic_vfwsub.w_mask_wf_tie_nxv8f32_nxv8f32_f16(<vscale x 8 x float> %0, half %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_tie_nxv8f32_nxv8f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x float> @llvm.riscv.vfwsub.w.mask.nxv8f32.f16(
@@ -1156,10 +1156,10 @@ entry:
 define <vscale x 16 x float> @intrinsic_vfwsub.w_mask_wf_tie_nxv16f32_nxv16f32_f16(<vscale x 16 x float> %0, half %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_tie_nxv16f32_nxv16f32_f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x float> @llvm.riscv.vfwsub.w.mask.nxv16f32.f16(
@@ -1175,10 +1175,10 @@ entry:
 define <vscale x 1 x double> @intrinsic_vfwsub.w_mask_wf_tie_nxv1f64_nxv1f64_f32(<vscale x 1 x double> %0, float %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_tie_nxv1f64_nxv1f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 1 x double> @llvm.riscv.vfwsub.w.mask.nxv1f64.f32(
@@ -1194,10 +1194,10 @@ entry:
 define <vscale x 2 x double> @intrinsic_vfwsub.w_mask_wf_tie_nxv2f64_nxv2f64_f32(<vscale x 2 x double> %0, float %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_tie_nxv2f64_nxv2f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 2 x double> @llvm.riscv.vfwsub.w.mask.nxv2f64.f32(
@@ -1213,10 +1213,10 @@ entry:
 define <vscale x 4 x double> @intrinsic_vfwsub.w_mask_wf_tie_nxv4f64_nxv4f64_f32(<vscale x 4 x double> %0, float %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_tie_nxv4f64_nxv4f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x double> @llvm.riscv.vfwsub.w.mask.nxv4f64.f32(
@@ -1232,10 +1232,10 @@ entry:
 define <vscale x 8 x double> @intrinsic_vfwsub.w_mask_wf_tie_nxv8f64_nxv8f64_f32(<vscale x 8 x double> %0, float %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_mask_wf_tie_nxv8f64_nxv8f64_f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wf v8, v8, fa0, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x double> @llvm.riscv.vfwsub.w.mask.nxv8f64.f32(
@@ -1251,10 +1251,10 @@ entry:
 define <vscale x 1 x float> @intrinsic_vfwsub.w_wv_untie_nxv1f32_nxv1f32_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv1f32_nxv1f32_nxv1f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v10, v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -1270,10 +1270,10 @@ entry:
 define <vscale x 2 x float> @intrinsic_vfwsub.w_wv_untie_nxv2f32_nxv2f32_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv2f32_nxv2f32_nxv2f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v10, v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -1289,10 +1289,10 @@ entry:
 define <vscale x 4 x float> @intrinsic_vfwsub.w_wv_untie_nxv4f32_nxv4f32_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv4f32_nxv4f32_nxv4f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v12, v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -1308,10 +1308,10 @@ entry:
 define <vscale x 8 x float> @intrinsic_vfwsub.w_wv_untie_nxv8f32_nxv8f32_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x float> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv8f32_nxv8f32_nxv8f16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v16, v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -1327,10 +1327,10 @@ entry:
 define <vscale x 1 x double> @intrinsic_vfwsub.w_wv_untie_nxv1f64_nxv1f64_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv1f64_nxv1f64_nxv1f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v10, v9, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -1346,10 +1346,10 @@ entry:
 define <vscale x 2 x double> @intrinsic_vfwsub.w_wv_untie_nxv2f64_nxv2f64_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv2f64_nxv2f64_nxv2f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v12, v10, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv2r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -1365,10 +1365,10 @@ entry:
 define <vscale x 4 x double> @intrinsic_vfwsub.w_wv_untie_nxv4f64_nxv4f64_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv4f64_nxv4f64_nxv4f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v16, v12, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv4r.v v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -1384,10 +1384,10 @@ entry:
 define <vscale x 8 x double> @intrinsic_vfwsub.w_wv_untie_nxv8f64_nxv8f64_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x double> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vfwsub.w_wv_untie_nxv8f64_nxv8f64_nxv8f32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfwsub.wv v24, v16, v8
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vmv8r.v v8, v24
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
index 15cb42bacf17..390647fd9e6c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
@@ -66,12 +66,7 @@ define i64 @test_vlseg2ff_nxv8i8(ptr %base, i64 %vl, ptr %outvl) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vrn2m1 = REG_SEQUENCE [[DEF]], %subreg.sub_vrm1_0, [[DEF2]], %subreg.sub_vrm1_1
-  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_:%[0-9]+]]:vrn2m1, [[PseudoVLSEG2E8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1 [[REG_SEQUENCE]], [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */, implicit-def dead $vl :: (load unknown-size from %ir.base, align 1)
+  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_:%[0-9]+]]:vrn2m1, [[PseudoVLSEG2E8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1 $noreg, [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */, implicit-def dead $vl :: (load unknown-size from %ir.base, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLSEG2E8FF_V_M1_1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vnclip.ll b/llvm/test/CodeGen/RISCV/rvv/vnclip.ll
index 54f4c17dd7ed..8902b1a28f8c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vnclip.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vnclip.ll
@@ -13,8 +13,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vnclip.nxv1i8.nxv1i16.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vnclip_wv_nxv1i8_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv1i8_nxv1i16_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnclip.wv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -37,8 +37,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vnclip.mask.nxv1i8.nxv1i16.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vnclip_mask_wv_nxv1i8_nxv1i16_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv1i8_nxv1i16_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -61,8 +61,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vnclip.nxv2i8.nxv2i16.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vnclip_wv_nxv2i8_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv2i8_nxv2i16_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnclip.wv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -85,8 +85,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vnclip.mask.nxv2i8.nxv2i16.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vnclip_mask_wv_nxv2i8_nxv2i16_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i16> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv2i8_nxv2i16_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -109,8 +109,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vnclip.nxv4i8.nxv4i16.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vnclip_wv_nxv4i8_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv4i8_nxv4i16_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnclip.wv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -133,8 +133,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vnclip.mask.nxv4i8.nxv4i16.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vnclip_mask_wv_nxv4i8_nxv4i16_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i16> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv4i8_nxv4i16_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -157,8 +157,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vnclip.nxv8i8.nxv8i16.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vnclip_wv_nxv8i8_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv8i8_nxv8i16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vnclip.wv v11, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v11
 ; CHECK-NEXT:    ret
@@ -182,8 +182,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vnclip.mask.nxv8i8.nxv8i16.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vnclip_mask_wv_nxv8i8_nxv8i16_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i16> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv8i8_nxv8i16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v10, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -206,8 +206,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vnclip.nxv16i8.nxv16i16.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vnclip_wv_nxv16i8_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv16i8_nxv16i16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vnclip.wv v14, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v14
 ; CHECK-NEXT:    ret
@@ -231,8 +231,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vnclip.mask.nxv16i8.nxv16i16.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vnclip_mask_wv_nxv16i8_nxv16i16_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i16> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv16i8_nxv16i16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v12, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -255,8 +255,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vnclip.nxv32i8.nxv32i16.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vnclip_wv_nxv32i8_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv32i8_nxv32i16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vnclip.wv v20, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v20
 ; CHECK-NEXT:    ret
@@ -280,8 +280,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vnclip.mask.nxv32i8.nxv32i16.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vnclip_mask_wv_nxv32i8_nxv32i16_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i16> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv32i8_nxv32i16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v16, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -304,8 +304,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vnclip.nxv1i16.nxv1i32.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vnclip_wv_nxv1i16_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv1i16_nxv1i32_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnclip.wv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -328,8 +328,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vnclip.mask.nxv1i16.nxv1i32.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vnclip_mask_wv_nxv1i16_nxv1i32_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i32> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv1i16_nxv1i32_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -352,8 +352,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vnclip.nxv2i16.nxv2i32.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vnclip_wv_nxv2i16_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv2i16_nxv2i32_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vnclip.wv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -376,8 +376,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vnclip.mask.nxv2i16.nxv2i32.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vnclip_mask_wv_nxv2i16_nxv2i32_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i32> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv2i16_nxv2i32_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -400,8 +400,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vnclip.nxv4i16.nxv4i32.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vnclip_wv_nxv4i16_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv4i16_nxv4i32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vnclip.wv v11, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v11
 ; CHECK-NEXT:    ret
@@ -425,8 +425,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vnclip.mask.nxv4i16.nxv4i32.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vnclip_mask_wv_nxv4i16_nxv4i32_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i32> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv4i16_nxv4i32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v10, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -449,8 +449,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vnclip.nxv8i16.nxv8i32.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vnclip_wv_nxv8i16_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv8i16_nxv8i32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vnclip.wv v14, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v14
 ; CHECK-NEXT:    ret
@@ -474,8 +474,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vnclip.mask.nxv8i16.nxv8i32.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vnclip_mask_wv_nxv8i16_nxv8i32_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i32> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv8i16_nxv8i32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v12, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -498,8 +498,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vnclip.nxv16i16.nxv16i32.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vnclip_wv_nxv16i16_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv16i16_nxv16i32_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vnclip.wv v20, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v20
 ; CHECK-NEXT:    ret
@@ -523,8 +523,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vnclip.mask.nxv16i16.nxv16i32.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vnclip_mask_wv_nxv16i16_nxv16i32_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i32> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv16i16_nxv16i32_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v16, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -547,8 +547,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vnclip.nxv1i32.nxv1i64.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vnclip_wv_nxv1i32_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv1i32_nxv1i64_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vnclip.wv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -571,8 +571,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vnclip.mask.nxv1i32.nxv1i64.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vnclip_mask_wv_nxv1i32_nxv1i64_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i64> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv1i32_nxv1i64_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -595,8 +595,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vnclip.nxv2i32.nxv2i64.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vnclip_wv_nxv2i32_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv2i32_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vnclip.wv v11, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v11
 ; CHECK-NEXT:    ret
@@ -620,8 +620,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vnclip.mask.nxv2i32.nxv2i64.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vnclip_mask_wv_nxv2i32_nxv2i64_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i64> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv2i32_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v10, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -644,8 +644,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vnclip.nxv4i32.nxv4i64.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vnclip_wv_nxv4i32_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv4i32_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vnclip.wv v14, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v14
 ; CHECK-NEXT:    ret
@@ -669,8 +669,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vnclip.mask.nxv4i32.nxv4i64.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vnclip_mask_wv_nxv4i32_nxv4i64_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i64> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv4i32_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v12, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -693,8 +693,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vnclip.nxv8i32.nxv8i64.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vnclip_wv_nxv8i32_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_wv_nxv8i32_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vnclip.wv v20, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v20
 ; CHECK-NEXT:    ret
@@ -718,8 +718,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vnclip.mask.nxv8i32.nxv8i64.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vnclip_mask_wv_nxv8i32_nxv8i64_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i64> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_wv_nxv8i32_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    vnclip.wv v8, v16, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -741,8 +741,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vnclip.nxv1i8.nxv1i16(
 define <vscale x 1 x i8> @intrinsic_vnclip_vx_nxv1i8_nxv1i16(<vscale x 1 x i16> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv1i8_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnclip.wx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -765,8 +765,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vnclip.mask.nxv1i8.nxv1i16(
 define <vscale x 1 x i8> @intrinsic_vnclip_mask_vx_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv1i8_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -788,8 +788,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vnclip.nxv2i8.nxv2i16(
 define <vscale x 2 x i8> @intrinsic_vnclip_vx_nxv2i8_nxv2i16(<vscale x 2 x i16> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv2i8_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnclip.wx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -812,8 +812,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vnclip.mask.nxv2i8.nxv2i16(
 define <vscale x 2 x i8> @intrinsic_vnclip_mask_vx_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, <vscale x 2 x i16> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv2i8_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -835,8 +835,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vnclip.nxv4i8.nxv4i16(
 define <vscale x 4 x i8> @intrinsic_vnclip_vx_nxv4i8_nxv4i16(<vscale x 4 x i16> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv4i8_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnclip.wx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -859,8 +859,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vnclip.mask.nxv4i8.nxv4i16(
 define <vscale x 4 x i8> @intrinsic_vnclip_mask_vx_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, <vscale x 4 x i16> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv4i8_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -882,8 +882,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vnclip.nxv8i8.nxv8i16(
 define <vscale x 8 x i8> @intrinsic_vnclip_vx_nxv8i8_nxv8i16(<vscale x 8 x i16> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv8i8_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vnclip.wx v10, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -907,8 +907,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vnclip.mask.nxv8i8.nxv8i16(
 define <vscale x 8 x i8> @intrinsic_vnclip_mask_vx_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, <vscale x 8 x i16> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv8i8_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -930,8 +930,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vnclip.nxv16i8.nxv16i16(
 define <vscale x 16 x i8> @intrinsic_vnclip_vx_nxv16i8_nxv16i16(<vscale x 16 x i16> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv16i8_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vnclip.wx v12, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -955,8 +955,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vnclip.mask.nxv16i8.nxv16i16(
 define <vscale x 16 x i8> @intrinsic_vnclip_mask_vx_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, <vscale x 16 x i16> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv16i8_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -978,8 +978,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vnclip.nxv32i8.nxv32i16(
 define <vscale x 32 x i8> @intrinsic_vnclip_vx_nxv32i8_nxv32i16(<vscale x 32 x i16> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv32i8_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vnclip.wx v16, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
@@ -1003,8 +1003,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vnclip.mask.nxv32i8.nxv32i16(
 define <vscale x 32 x i8> @intrinsic_vnclip_mask_vx_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, <vscale x 32 x i16> %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv32i8_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1026,8 +1026,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vnclip.nxv1i16.nxv1i32(
 define <vscale x 1 x i16> @intrinsic_vnclip_vx_nxv1i16_nxv1i32(<vscale x 1 x i32> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv1i16_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnclip.wx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1050,8 +1050,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vnclip.mask.nxv1i16.nxv1i32(
 define <vscale x 1 x i16> @intrinsic_vnclip_mask_vx_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, <vscale x 1 x i32> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv1i16_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1073,8 +1073,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vnclip.nxv2i16.nxv2i32(
 define <vscale x 2 x i16> @intrinsic_vnclip_vx_nxv2i16_nxv2i32(<vscale x 2 x i32> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv2i16_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vnclip.wx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1097,8 +1097,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vnclip.mask.nxv2i16.nxv2i32(
 define <vscale x 2 x i16> @intrinsic_vnclip_mask_vx_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, <vscale x 2 x i32> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv2i16_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1120,8 +1120,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vnclip.nxv4i16.nxv4i32(
 define <vscale x 4 x i16> @intrinsic_vnclip_vx_nxv4i16_nxv4i32(<vscale x 4 x i32> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv4i16_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vnclip.wx v10, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -1145,8 +1145,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vnclip.mask.nxv4i16.nxv4i32(
 define <vscale x 4 x i16> @intrinsic_vnclip_mask_vx_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, <vscale x 4 x i32> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv4i16_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1168,8 +1168,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vnclip.nxv8i16.nxv8i32(
 define <vscale x 8 x i16> @intrinsic_vnclip_vx_nxv8i16_nxv8i32(<vscale x 8 x i32> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv8i16_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vnclip.wx v12, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -1193,8 +1193,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vnclip.mask.nxv8i16.nxv8i32(
 define <vscale x 8 x i16> @intrinsic_vnclip_mask_vx_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, <vscale x 8 x i32> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv8i16_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1216,8 +1216,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vnclip.nxv16i16.nxv16i32(
 define <vscale x 16 x i16> @intrinsic_vnclip_vx_nxv16i16_nxv16i32(<vscale x 16 x i32> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv16i16_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vnclip.wx v16, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
@@ -1241,8 +1241,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vnclip.mask.nxv16i16.nxv16i32(
 define <vscale x 16 x i16> @intrinsic_vnclip_mask_vx_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, <vscale x 16 x i32> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv16i16_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1264,8 +1264,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vnclip.nxv1i32.nxv1i64(
 define <vscale x 1 x i32> @intrinsic_vnclip_vx_nxv1i32_nxv1i64(<vscale x 1 x i64> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv1i32_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vnclip.wx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1288,8 +1288,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vnclip.mask.nxv1i32.nxv1i64(
 define <vscale x 1 x i32> @intrinsic_vnclip_mask_vx_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, <vscale x 1 x i64> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv1i32_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1311,8 +1311,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vnclip.nxv2i32.nxv2i64(
 define <vscale x 2 x i32> @intrinsic_vnclip_vx_nxv2i32_nxv2i64(<vscale x 2 x i64> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv2i32_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vnclip.wx v10, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -1336,8 +1336,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vnclip.mask.nxv2i32.nxv2i64(
 define <vscale x 2 x i32> @intrinsic_vnclip_mask_vx_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, <vscale x 2 x i64> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv2i32_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1359,8 +1359,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vnclip.nxv4i32.nxv4i64(
 define <vscale x 4 x i32> @intrinsic_vnclip_vx_nxv4i32_nxv4i64(<vscale x 4 x i64> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv4i32_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vnclip.wx v12, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -1384,8 +1384,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vnclip.mask.nxv4i32.nxv4i64(
 define <vscale x 4 x i32> @intrinsic_vnclip_mask_vx_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, <vscale x 4 x i64> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv4i32_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1407,8 +1407,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vnclip.nxv8i32.nxv8i64(
 define <vscale x 8 x i32> @intrinsic_vnclip_vx_nxv8i32_nxv8i64(<vscale x 8 x i64> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vx_nxv8i32_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vnclip.wx v16, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
@@ -1432,8 +1432,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vnclip.mask.nxv8i32.nxv8i64(
 define <vscale x 8 x i32> @intrinsic_vnclip_mask_vx_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, <vscale x 8 x i64> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vx_nxv8i32_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    vnclip.wx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1450,8 +1450,8 @@ entry:
 define <vscale x 1 x i8> @intrinsic_vnclip_vi_nxv1i8_nxv1i16_i8(<vscale x 1 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv1i8_nxv1i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnclip.wi v8, v8, 9
 ; CHECK-NEXT:    ret
 entry:
@@ -1467,8 +1467,8 @@ entry:
 define <vscale x 1 x i8> @intrinsic_vnclip_mask_vi_nxv1i8_nxv1i16_i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv1i8_nxv1i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1485,8 +1485,8 @@ entry:
 define <vscale x 2 x i8> @intrinsic_vnclip_vi_nxv2i8_nxv2i16_i8(<vscale x 2 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv2i8_nxv2i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnclip.wi v8, v8, 9
 ; CHECK-NEXT:    ret
 entry:
@@ -1502,8 +1502,8 @@ entry:
 define <vscale x 2 x i8> @intrinsic_vnclip_mask_vi_nxv2i8_nxv2i16_i8(<vscale x 2 x i8> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv2i8_nxv2i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1520,8 +1520,8 @@ entry:
 define <vscale x 4 x i8> @intrinsic_vnclip_vi_nxv4i8_nxv4i16_i8(<vscale x 4 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv4i8_nxv4i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnclip.wi v8, v8, 9
 ; CHECK-NEXT:    ret
 entry:
@@ -1537,8 +1537,8 @@ entry:
 define <vscale x 4 x i8> @intrinsic_vnclip_mask_vi_nxv4i8_nxv4i16_i8(<vscale x 4 x i8> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv4i8_nxv4i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1555,8 +1555,8 @@ entry:
 define <vscale x 8 x i8> @intrinsic_vnclip_vi_nxv8i8_nxv8i16_i8(<vscale x 8 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv8i8_nxv8i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vnclip.wi v10, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -1573,8 +1573,8 @@ entry:
 define <vscale x 8 x i8> @intrinsic_vnclip_mask_vi_nxv8i8_nxv8i16_i8(<vscale x 8 x i8> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv8i8_nxv8i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v10, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1591,8 +1591,8 @@ entry:
 define <vscale x 16 x i8> @intrinsic_vnclip_vi_nxv16i8_nxv16i16_i8(<vscale x 16 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv16i8_nxv16i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vnclip.wi v12, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -1609,8 +1609,8 @@ entry:
 define <vscale x 16 x i8> @intrinsic_vnclip_mask_vi_nxv16i8_nxv16i16_i8(<vscale x 16 x i8> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv16i8_nxv16i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v12, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1627,8 +1627,8 @@ entry:
 define <vscale x 32 x i8> @intrinsic_vnclip_vi_nxv32i8_nxv32i16_i8(<vscale x 32 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv32i8_nxv32i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vnclip.wi v16, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
@@ -1645,8 +1645,8 @@ entry:
 define <vscale x 32 x i8> @intrinsic_vnclip_mask_vi_nxv32i8_nxv32i16_i8(<vscale x 32 x i8> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv32i8_nxv32i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v16, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1663,8 +1663,8 @@ entry:
 define <vscale x 1 x i16> @intrinsic_vnclip_vi_nxv1i16_nxv1i32_i16(<vscale x 1 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv1i16_nxv1i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnclip.wi v8, v8, 9
 ; CHECK-NEXT:    ret
 entry:
@@ -1680,8 +1680,8 @@ entry:
 define <vscale x 1 x i16> @intrinsic_vnclip_mask_vi_nxv1i16_nxv1i32_i16(<vscale x 1 x i16> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv1i16_nxv1i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1698,8 +1698,8 @@ entry:
 define <vscale x 2 x i16> @intrinsic_vnclip_vi_nxv2i16_nxv2i32_i16(<vscale x 2 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv2i16_nxv2i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vnclip.wi v8, v8, 9
 ; CHECK-NEXT:    ret
 entry:
@@ -1715,8 +1715,8 @@ entry:
 define <vscale x 2 x i16> @intrinsic_vnclip_mask_vi_nxv2i16_nxv2i32_i16(<vscale x 2 x i16> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv2i16_nxv2i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1733,8 +1733,8 @@ entry:
 define <vscale x 4 x i16> @intrinsic_vnclip_vi_nxv4i16_nxv4i32_i16(<vscale x 4 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv4i16_nxv4i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vnclip.wi v10, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -1751,8 +1751,8 @@ entry:
 define <vscale x 4 x i16> @intrinsic_vnclip_mask_vi_nxv4i16_nxv4i32_i16(<vscale x 4 x i16> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv4i16_nxv4i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v10, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1769,8 +1769,8 @@ entry:
 define <vscale x 8 x i16> @intrinsic_vnclip_vi_nxv8i16_nxv8i32_i16(<vscale x 8 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv8i16_nxv8i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vnclip.wi v12, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -1787,8 +1787,8 @@ entry:
 define <vscale x 8 x i16> @intrinsic_vnclip_mask_vi_nxv8i16_nxv8i32_i16(<vscale x 8 x i16> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv8i16_nxv8i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v12, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1805,8 +1805,8 @@ entry:
 define <vscale x 16 x i16> @intrinsic_vnclip_vi_nxv16i16_nxv16i32_i16(<vscale x 16 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv16i16_nxv16i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vnclip.wi v16, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
@@ -1823,8 +1823,8 @@ entry:
 define <vscale x 16 x i16> @intrinsic_vnclip_mask_vi_nxv16i16_nxv16i32_i16(<vscale x 16 x i16> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv16i16_nxv16i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v16, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1841,8 +1841,8 @@ entry:
 define <vscale x 1 x i32> @intrinsic_vnclip_vi_nxv1i32_nxv1i64_i32(<vscale x 1 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv1i32_nxv1i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vnclip.wi v8, v8, 9
 ; CHECK-NEXT:    ret
 entry:
@@ -1858,8 +1858,8 @@ entry:
 define <vscale x 1 x i32> @intrinsic_vnclip_mask_vi_nxv1i32_nxv1i64_i32(<vscale x 1 x i32> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv1i32_nxv1i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1876,8 +1876,8 @@ entry:
 define <vscale x 2 x i32> @intrinsic_vnclip_vi_nxv2i32_nxv2i64_i32(<vscale x 2 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv2i32_nxv2i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vnclip.wi v10, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -1894,8 +1894,8 @@ entry:
 define <vscale x 2 x i32> @intrinsic_vnclip_mask_vi_nxv2i32_nxv2i64_i32(<vscale x 2 x i32> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv2i32_nxv2i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v10, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1912,8 +1912,8 @@ entry:
 define <vscale x 4 x i32> @intrinsic_vnclip_vi_nxv4i32_nxv4i64_i32(<vscale x 4 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv4i32_nxv4i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vnclip.wi v12, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -1930,8 +1930,8 @@ entry:
 define <vscale x 4 x i32> @intrinsic_vnclip_mask_vi_nxv4i32_nxv4i64_i32(<vscale x 4 x i32> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv4i32_nxv4i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v12, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1948,8 +1948,8 @@ entry:
 define <vscale x 8 x i32> @intrinsic_vnclip_vi_nxv8i32_nxv8i64_i32(<vscale x 8 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_vi_nxv8i32_nxv8i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vnclip.wi v16, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
@@ -1966,8 +1966,8 @@ entry:
 define <vscale x 8 x i32> @intrinsic_vnclip_mask_vi_nxv8i32_nxv8i64_i32(<vscale x 8 x i32> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclip_mask_vi_nxv8i32_nxv8i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    vnclip.wi v8, v16, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vnclipu.ll b/llvm/test/CodeGen/RISCV/rvv/vnclipu.ll
index 39980504f887..a1804e7d98a4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vnclipu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vnclipu.ll
@@ -13,8 +13,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vnclipu.nxv1i8.nxv1i16.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vnclipu_wv_nxv1i8_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv1i8_nxv1i16_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -37,8 +37,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vnclipu.mask.nxv1i8.nxv1i16.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vnclipu_mask_wv_nxv1i8_nxv1i16_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv1i8_nxv1i16_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -61,8 +61,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vnclipu.nxv2i8.nxv2i16.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vnclipu_wv_nxv2i8_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv2i8_nxv2i16_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -85,8 +85,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vnclipu.mask.nxv2i8.nxv2i16.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vnclipu_mask_wv_nxv2i8_nxv2i16_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i16> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv2i8_nxv2i16_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -109,8 +109,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vnclipu.nxv4i8.nxv4i16.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vnclipu_wv_nxv4i8_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv4i8_nxv4i16_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -133,8 +133,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vnclipu.mask.nxv4i8.nxv4i16.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vnclipu_mask_wv_nxv4i8_nxv4i16_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i16> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv4i8_nxv4i16_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -157,8 +157,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vnclipu.nxv8i8.nxv8i16.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vnclipu_wv_nxv8i8_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv8i8_nxv8i16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v11, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v11
 ; CHECK-NEXT:    ret
@@ -182,8 +182,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vnclipu.mask.nxv8i8.nxv8i16.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vnclipu_mask_wv_nxv8i8_nxv8i16_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i16> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv8i8_nxv8i16_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v10, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -206,8 +206,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vnclipu.nxv16i8.nxv16i16.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vnclipu_wv_nxv16i8_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv16i8_nxv16i16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v14, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v14
 ; CHECK-NEXT:    ret
@@ -231,8 +231,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vnclipu.mask.nxv16i8.nxv16i16.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vnclipu_mask_wv_nxv16i8_nxv16i16_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i16> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv16i8_nxv16i16_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v12, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -255,8 +255,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vnclipu.nxv32i8.nxv32i16.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vnclipu_wv_nxv32i8_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv32i8_nxv32i16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v20, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v20
 ; CHECK-NEXT:    ret
@@ -280,8 +280,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vnclipu.mask.nxv32i8.nxv32i16.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vnclipu_mask_wv_nxv32i8_nxv32i16_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i16> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv32i8_nxv32i16_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v16, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -304,8 +304,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vnclipu.nxv1i16.nxv1i32.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vnclipu_wv_nxv1i16_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv1i16_nxv1i32_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -328,8 +328,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vnclipu.mask.nxv1i16.nxv1i32.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vnclipu_mask_wv_nxv1i16_nxv1i32_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i32> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv1i16_nxv1i32_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -352,8 +352,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vnclipu.nxv2i16.nxv2i32.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vnclipu_wv_nxv2i16_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv2i16_nxv2i32_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -376,8 +376,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vnclipu.mask.nxv2i16.nxv2i32.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vnclipu_mask_wv_nxv2i16_nxv2i32_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i32> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv2i16_nxv2i32_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -400,8 +400,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vnclipu.nxv4i16.nxv4i32.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vnclipu_wv_nxv4i16_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv4i16_nxv4i32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v11, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v11
 ; CHECK-NEXT:    ret
@@ -425,8 +425,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vnclipu.mask.nxv4i16.nxv4i32.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vnclipu_mask_wv_nxv4i16_nxv4i32_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i32> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv4i16_nxv4i32_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v10, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -449,8 +449,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vnclipu.nxv8i16.nxv8i32.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vnclipu_wv_nxv8i16_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv8i16_nxv8i32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v14, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v14
 ; CHECK-NEXT:    ret
@@ -474,8 +474,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vnclipu.mask.nxv8i16.nxv8i32.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vnclipu_mask_wv_nxv8i16_nxv8i32_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i32> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv8i16_nxv8i32_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v12, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -498,8 +498,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vnclipu.nxv16i16.nxv16i32.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vnclipu_wv_nxv16i16_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv16i16_nxv16i32_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v20, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v20
 ; CHECK-NEXT:    ret
@@ -523,8 +523,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vnclipu.mask.nxv16i16.nxv16i32.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vnclipu_mask_wv_nxv16i16_nxv16i32_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i32> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv16i16_nxv16i32_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v16, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -547,8 +547,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vnclipu.nxv1i32.nxv1i64.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vnclipu_wv_nxv1i32_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv1i32_nxv1i64_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -571,8 +571,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vnclipu.mask.nxv1i32.nxv1i64.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vnclipu_mask_wv_nxv1i32_nxv1i64_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i64> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv1i32_nxv1i64_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -595,8 +595,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vnclipu.nxv2i32.nxv2i64.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vnclipu_wv_nxv2i32_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv2i32_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v11, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v11
 ; CHECK-NEXT:    ret
@@ -620,8 +620,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vnclipu.mask.nxv2i32.nxv2i64.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vnclipu_mask_wv_nxv2i32_nxv2i64_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i64> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv2i32_nxv2i64_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v10, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -644,8 +644,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vnclipu.nxv4i32.nxv4i64.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vnclipu_wv_nxv4i32_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv4i32_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v14, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v14
 ; CHECK-NEXT:    ret
@@ -669,8 +669,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vnclipu.mask.nxv4i32.nxv4i64.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vnclipu_mask_wv_nxv4i32_nxv4i64_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i64> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv4i32_nxv4i64_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v12, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -693,8 +693,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vnclipu.nxv8i32.nxv8i64.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vnclipu_wv_nxv8i32_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_wv_nxv8i32_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vnclipu.wv v20, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v20
 ; CHECK-NEXT:    ret
@@ -718,8 +718,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vnclipu.mask.nxv8i32.nxv8i64.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vnclipu_mask_wv_nxv8i32_nxv8i64_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i64> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_wv_nxv8i32_nxv8i64_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    vnclipu.wv v8, v16, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -741,8 +741,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vnclipu.nxv1i8.nxv1i16(
 define <vscale x 1 x i8> @intrinsic_vnclipu_vx_nxv1i8_nxv1i16(<vscale x 1 x i16> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv1i8_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -765,8 +765,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vnclipu.mask.nxv1i8.nxv1i16(
 define <vscale x 1 x i8> @intrinsic_vnclipu_mask_vx_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv1i8_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -788,8 +788,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vnclipu.nxv2i8.nxv2i16(
 define <vscale x 2 x i8> @intrinsic_vnclipu_vx_nxv2i8_nxv2i16(<vscale x 2 x i16> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv2i8_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -812,8 +812,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vnclipu.mask.nxv2i8.nxv2i16(
 define <vscale x 2 x i8> @intrinsic_vnclipu_mask_vx_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, <vscale x 2 x i16> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv2i8_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -835,8 +835,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vnclipu.nxv4i8.nxv4i16(
 define <vscale x 4 x i8> @intrinsic_vnclipu_vx_nxv4i8_nxv4i16(<vscale x 4 x i16> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv4i8_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -859,8 +859,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vnclipu.mask.nxv4i8.nxv4i16(
 define <vscale x 4 x i8> @intrinsic_vnclipu_mask_vx_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, <vscale x 4 x i16> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv4i8_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -882,8 +882,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vnclipu.nxv8i8.nxv8i16(
 define <vscale x 8 x i8> @intrinsic_vnclipu_vx_nxv8i8_nxv8i16(<vscale x 8 x i16> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv8i8_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v10, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -907,8 +907,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vnclipu.mask.nxv8i8.nxv8i16(
 define <vscale x 8 x i8> @intrinsic_vnclipu_mask_vx_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, <vscale x 8 x i16> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv8i8_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -930,8 +930,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vnclipu.nxv16i8.nxv16i16(
 define <vscale x 16 x i8> @intrinsic_vnclipu_vx_nxv16i8_nxv16i16(<vscale x 16 x i16> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv16i8_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v12, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -955,8 +955,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vnclipu.mask.nxv16i8.nxv16i16(
 define <vscale x 16 x i8> @intrinsic_vnclipu_mask_vx_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, <vscale x 16 x i16> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv16i8_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -978,8 +978,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vnclipu.nxv32i8.nxv32i16(
 define <vscale x 32 x i8> @intrinsic_vnclipu_vx_nxv32i8_nxv32i16(<vscale x 32 x i16> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv32i8_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v16, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
@@ -1003,8 +1003,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vnclipu.mask.nxv32i8.nxv32i16(
 define <vscale x 32 x i8> @intrinsic_vnclipu_mask_vx_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, <vscale x 32 x i16> %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv32i8_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1026,8 +1026,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vnclipu.nxv1i16.nxv1i32(
 define <vscale x 1 x i16> @intrinsic_vnclipu_vx_nxv1i16_nxv1i32(<vscale x 1 x i32> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv1i16_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1050,8 +1050,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vnclipu.mask.nxv1i16.nxv1i32(
 define <vscale x 1 x i16> @intrinsic_vnclipu_mask_vx_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, <vscale x 1 x i32> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv1i16_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1073,8 +1073,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vnclipu.nxv2i16.nxv2i32(
 define <vscale x 2 x i16> @intrinsic_vnclipu_vx_nxv2i16_nxv2i32(<vscale x 2 x i32> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv2i16_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1097,8 +1097,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vnclipu.mask.nxv2i16.nxv2i32(
 define <vscale x 2 x i16> @intrinsic_vnclipu_mask_vx_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, <vscale x 2 x i32> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv2i16_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1120,8 +1120,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vnclipu.nxv4i16.nxv4i32(
 define <vscale x 4 x i16> @intrinsic_vnclipu_vx_nxv4i16_nxv4i32(<vscale x 4 x i32> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv4i16_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v10, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -1145,8 +1145,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vnclipu.mask.nxv4i16.nxv4i32(
 define <vscale x 4 x i16> @intrinsic_vnclipu_mask_vx_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, <vscale x 4 x i32> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv4i16_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1168,8 +1168,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vnclipu.nxv8i16.nxv8i32(
 define <vscale x 8 x i16> @intrinsic_vnclipu_vx_nxv8i16_nxv8i32(<vscale x 8 x i32> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv8i16_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v12, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -1193,8 +1193,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vnclipu.mask.nxv8i16.nxv8i32(
 define <vscale x 8 x i16> @intrinsic_vnclipu_mask_vx_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, <vscale x 8 x i32> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv8i16_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1216,8 +1216,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vnclipu.nxv16i16.nxv16i32(
 define <vscale x 16 x i16> @intrinsic_vnclipu_vx_nxv16i16_nxv16i32(<vscale x 16 x i32> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv16i16_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v16, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
@@ -1241,8 +1241,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vnclipu.mask.nxv16i16.nxv16i32(
 define <vscale x 16 x i16> @intrinsic_vnclipu_mask_vx_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, <vscale x 16 x i32> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv16i16_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1264,8 +1264,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vnclipu.nxv1i32.nxv1i64(
 define <vscale x 1 x i32> @intrinsic_vnclipu_vx_nxv1i32_nxv1i64(<vscale x 1 x i64> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv1i32_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1288,8 +1288,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vnclipu.mask.nxv1i32.nxv1i64(
 define <vscale x 1 x i32> @intrinsic_vnclipu_mask_vx_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, <vscale x 1 x i64> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv1i32_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1311,8 +1311,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vnclipu.nxv2i32.nxv2i64(
 define <vscale x 2 x i32> @intrinsic_vnclipu_vx_nxv2i32_nxv2i64(<vscale x 2 x i64> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv2i32_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v10, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -1336,8 +1336,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vnclipu.mask.nxv2i32.nxv2i64(
 define <vscale x 2 x i32> @intrinsic_vnclipu_mask_vx_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, <vscale x 2 x i64> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv2i32_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1359,8 +1359,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vnclipu.nxv4i32.nxv4i64(
 define <vscale x 4 x i32> @intrinsic_vnclipu_vx_nxv4i32_nxv4i64(<vscale x 4 x i64> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv4i32_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v12, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -1384,8 +1384,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vnclipu.mask.nxv4i32.nxv4i64(
 define <vscale x 4 x i32> @intrinsic_vnclipu_mask_vx_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, <vscale x 4 x i64> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv4i32_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1407,8 +1407,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vnclipu.nxv8i32.nxv8i64(
 define <vscale x 8 x i32> @intrinsic_vnclipu_vx_nxv8i32_nxv8i64(<vscale x 8 x i64> %0, iXLen %1, iXLen %2) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vx_nxv8i32_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vnclipu.wx v16, v8, a0
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
@@ -1432,8 +1432,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vnclipu.mask.nxv8i32.nxv8i64(
 define <vscale x 8 x i32> @intrinsic_vnclipu_mask_vx_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, <vscale x 8 x i64> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vx_nxv8i32_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    vnclipu.wx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1450,8 +1450,8 @@ entry:
 define <vscale x 1 x i8> @intrinsic_vnclipu_vi_nxv1i8_nxv1i16_i8(<vscale x 1 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv1i8_nxv1i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v8, v8, 9
 ; CHECK-NEXT:    ret
 entry:
@@ -1467,8 +1467,8 @@ entry:
 define <vscale x 1 x i8> @intrinsic_vnclipu_mask_vi_nxv1i8_nxv1i16_i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv1i8_nxv1i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1485,8 +1485,8 @@ entry:
 define <vscale x 2 x i8> @intrinsic_vnclipu_vi_nxv2i8_nxv2i16_i8(<vscale x 2 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv2i8_nxv2i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v8, v8, 9
 ; CHECK-NEXT:    ret
 entry:
@@ -1502,8 +1502,8 @@ entry:
 define <vscale x 2 x i8> @intrinsic_vnclipu_mask_vi_nxv2i8_nxv2i16_i8(<vscale x 2 x i8> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv2i8_nxv2i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1520,8 +1520,8 @@ entry:
 define <vscale x 4 x i8> @intrinsic_vnclipu_vi_nxv4i8_nxv4i16_i8(<vscale x 4 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv4i8_nxv4i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v8, v8, 9
 ; CHECK-NEXT:    ret
 entry:
@@ -1537,8 +1537,8 @@ entry:
 define <vscale x 4 x i8> @intrinsic_vnclipu_mask_vi_nxv4i8_nxv4i16_i8(<vscale x 4 x i8> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv4i8_nxv4i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1555,8 +1555,8 @@ entry:
 define <vscale x 8 x i8> @intrinsic_vnclipu_vi_nxv8i8_nxv8i16_i8(<vscale x 8 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv8i8_nxv8i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v10, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -1573,8 +1573,8 @@ entry:
 define <vscale x 8 x i8> @intrinsic_vnclipu_mask_vi_nxv8i8_nxv8i16_i8(<vscale x 8 x i8> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv8i8_nxv8i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v10, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1591,8 +1591,8 @@ entry:
 define <vscale x 16 x i8> @intrinsic_vnclipu_vi_nxv16i8_nxv16i16_i8(<vscale x 16 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv16i8_nxv16i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v12, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -1609,8 +1609,8 @@ entry:
 define <vscale x 16 x i8> @intrinsic_vnclipu_mask_vi_nxv16i8_nxv16i16_i8(<vscale x 16 x i8> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv16i8_nxv16i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v12, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1627,8 +1627,8 @@ entry:
 define <vscale x 32 x i8> @intrinsic_vnclipu_vi_nxv32i8_nxv32i16_i8(<vscale x 32 x i16> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv32i8_nxv32i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v16, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
@@ -1645,8 +1645,8 @@ entry:
 define <vscale x 32 x i8> @intrinsic_vnclipu_mask_vi_nxv32i8_nxv32i16_i8(<vscale x 32 x i8> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv32i8_nxv32i16_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v16, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1663,8 +1663,8 @@ entry:
 define <vscale x 1 x i16> @intrinsic_vnclipu_vi_nxv1i16_nxv1i32_i16(<vscale x 1 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv1i16_nxv1i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v8, v8, 9
 ; CHECK-NEXT:    ret
 entry:
@@ -1680,8 +1680,8 @@ entry:
 define <vscale x 1 x i16> @intrinsic_vnclipu_mask_vi_nxv1i16_nxv1i32_i16(<vscale x 1 x i16> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv1i16_nxv1i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1698,8 +1698,8 @@ entry:
 define <vscale x 2 x i16> @intrinsic_vnclipu_vi_nxv2i16_nxv2i32_i16(<vscale x 2 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv2i16_nxv2i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v8, v8, 9
 ; CHECK-NEXT:    ret
 entry:
@@ -1715,8 +1715,8 @@ entry:
 define <vscale x 2 x i16> @intrinsic_vnclipu_mask_vi_nxv2i16_nxv2i32_i16(<vscale x 2 x i16> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv2i16_nxv2i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1733,8 +1733,8 @@ entry:
 define <vscale x 4 x i16> @intrinsic_vnclipu_vi_nxv4i16_nxv4i32_i16(<vscale x 4 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv4i16_nxv4i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v10, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -1751,8 +1751,8 @@ entry:
 define <vscale x 4 x i16> @intrinsic_vnclipu_mask_vi_nxv4i16_nxv4i32_i16(<vscale x 4 x i16> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv4i16_nxv4i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v10, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1769,8 +1769,8 @@ entry:
 define <vscale x 8 x i16> @intrinsic_vnclipu_vi_nxv8i16_nxv8i32_i16(<vscale x 8 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv8i16_nxv8i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v12, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -1787,8 +1787,8 @@ entry:
 define <vscale x 8 x i16> @intrinsic_vnclipu_mask_vi_nxv8i16_nxv8i32_i16(<vscale x 8 x i16> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv8i16_nxv8i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v12, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1805,8 +1805,8 @@ entry:
 define <vscale x 16 x i16> @intrinsic_vnclipu_vi_nxv16i16_nxv16i32_i16(<vscale x 16 x i32> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv16i16_nxv16i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v16, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
@@ -1823,8 +1823,8 @@ entry:
 define <vscale x 16 x i16> @intrinsic_vnclipu_mask_vi_nxv16i16_nxv16i32_i16(<vscale x 16 x i16> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv16i16_nxv16i32_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v16, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1841,8 +1841,8 @@ entry:
 define <vscale x 1 x i32> @intrinsic_vnclipu_vi_nxv1i32_nxv1i64_i32(<vscale x 1 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv1i32_nxv1i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v8, v8, 9
 ; CHECK-NEXT:    ret
 entry:
@@ -1858,8 +1858,8 @@ entry:
 define <vscale x 1 x i32> @intrinsic_vnclipu_mask_vi_nxv1i32_nxv1i64_i32(<vscale x 1 x i32> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv1i32_nxv1i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v9, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1876,8 +1876,8 @@ entry:
 define <vscale x 2 x i32> @intrinsic_vnclipu_vi_nxv2i32_nxv2i64_i32(<vscale x 2 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv2i32_nxv2i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v10, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -1894,8 +1894,8 @@ entry:
 define <vscale x 2 x i32> @intrinsic_vnclipu_mask_vi_nxv2i32_nxv2i64_i32(<vscale x 2 x i32> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv2i32_nxv2i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v10, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1912,8 +1912,8 @@ entry:
 define <vscale x 4 x i32> @intrinsic_vnclipu_vi_nxv4i32_nxv4i64_i32(<vscale x 4 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv4i32_nxv4i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v12, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -1930,8 +1930,8 @@ entry:
 define <vscale x 4 x i32> @intrinsic_vnclipu_mask_vi_nxv4i32_nxv4i64_i32(<vscale x 4 x i32> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv4i32_nxv4i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v12, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1948,8 +1948,8 @@ entry:
 define <vscale x 8 x i32> @intrinsic_vnclipu_vi_nxv8i32_nxv8i64_i32(<vscale x 8 x i64> %0, iXLen %1) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_vi_nxv8i32_nxv8i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vnclipu.wi v16, v8, 9
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
@@ -1966,8 +1966,8 @@ entry:
 define <vscale x 8 x i32> @intrinsic_vnclipu_mask_vi_nxv8i32_nxv8i64_i32(<vscale x 8 x i32> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vnclipu_mask_vi_nxv8i32_nxv8i64_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    vnclipu.wi v8, v16, 9, v0.t
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
index 4f7cb84c0864..46560fc501c6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
@@ -579,3 +579,125 @@ define double @vpreduce_ord_fadd_nxv4f64(double %s, <vscale x 4 x double> %v, <v
   %r = call double @llvm.vp.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %v, <vscale x 4 x i1> %m, i32 %evl)
   ret double %r
 }
+
+define float @vreduce_fminimum_nxv4f32(float %start, <vscale x 4 x float> %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vreduce_fminimum_nxv4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vfmv.s.f v10, fa0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vfredmin.vs v10, v8, v10, v0.t
+; CHECK-NEXT:    vmfne.vv v11, v8, v8, v0.t
+; CHECK-NEXT:    vcpop.m a0, v11, v0.t
+; CHECK-NEXT:    feq.s a1, fa0, fa0
+; CHECK-NEXT:    xori a1, a1, 1
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    beqz a0, .LBB22_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB22_2:
+; CHECK-NEXT:    vfmv.f.s fa0, v10
+; CHECK-NEXT:    ret
+  %s = call float @llvm.vp.reduce.fminimum.nxv4f32(float %start, <vscale x 4 x float> %val, <vscale x 4 x i1> %m, i32 %evl)
+  ret float %s
+}
+
+define float @vreduce_fmaximum_nxv4f32(float %start, <vscale x 4 x float> %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vreduce_fmaximum_nxv4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vfmv.s.f v10, fa0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vfredmax.vs v10, v8, v10, v0.t
+; CHECK-NEXT:    vmfne.vv v11, v8, v8, v0.t
+; CHECK-NEXT:    vcpop.m a0, v11, v0.t
+; CHECK-NEXT:    feq.s a1, fa0, fa0
+; CHECK-NEXT:    xori a1, a1, 1
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    beqz a0, .LBB23_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB23_2:
+; CHECK-NEXT:    vfmv.f.s fa0, v10
+; CHECK-NEXT:    ret
+  %s = call float @llvm.vp.reduce.fmaximum.nxv4f32(float %start, <vscale x 4 x float> %val, <vscale x 4 x i1> %m, i32 %evl)
+  ret float %s
+}
+
+define float @vreduce_fminimum_nnan_nxv4f32(float %start, <vscale x 4 x float> %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vreduce_fminimum_nnan_nxv4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vfmv.s.f v10, fa0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vfredmin.vs v10, v8, v10, v0.t
+; CHECK-NEXT:    vfmv.f.s fa0, v10
+; CHECK-NEXT:    ret
+  %s = call nnan float @llvm.vp.reduce.fminimum.nxv4f32(float %start, <vscale x 4 x float> %val, <vscale x 4 x i1> %m, i32 %evl)
+  ret float %s
+}
+
+define float @vreduce_fmaximum_nnan_nxv4f32(float %start, <vscale x 4 x float> %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vreduce_fmaximum_nnan_nxv4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vfmv.s.f v10, fa0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vfredmax.vs v10, v8, v10, v0.t
+; CHECK-NEXT:    vfmv.f.s fa0, v10
+; CHECK-NEXT:    ret
+  %s = call nnan float @llvm.vp.reduce.fmaximum.nxv4f32(float %start, <vscale x 4 x float> %val, <vscale x 4 x i1> %m, i32 %evl)
+  ret float %s
+}
+
+define float @vreduce_fminimum_v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vreduce_fminimum_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vfmv.s.f v9, fa0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vfredmin.vs v9, v8, v9, v0.t
+; CHECK-NEXT:    vmfne.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    vcpop.m a0, v8, v0.t
+; CHECK-NEXT:    feq.s a1, fa0, fa0
+; CHECK-NEXT:    xori a1, a1, 1
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    beqz a0, .LBB26_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB26_2:
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    ret
+  %s = call float @llvm.vp.reduce.fminimum.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 %evl)
+  ret float %s
+}
+
+define float @vreduce_fmaximum_v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vreduce_fmaximum_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vfmv.s.f v9, fa0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vfredmax.vs v9, v8, v9, v0.t
+; CHECK-NEXT:    vmfne.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    vcpop.m a0, v8, v0.t
+; CHECK-NEXT:    feq.s a1, fa0, fa0
+; CHECK-NEXT:    xori a1, a1, 1
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    beqz a0, .LBB27_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB27_2:
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    ret
+  %s = call float @llvm.vp.reduce.fmaximum.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 %evl)
+  ret float %s
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
index 4ff2fc7a5fff..088d121564bc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
@@ -88,13 +88,13 @@ define <vscale x 1 x double> @test3(i64 %avl, i8 zeroext %cond, <vscale x 1 x do
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    beqz a1, .LBB2_2
 ; CHECK-NEXT:  # %bb.1: # %if.then
-; CHECK-NEXT:    vsetvli a0, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfadd.vv v9, v8, v9
 ; CHECK-NEXT:    vfmul.vv v8, v9, v8
 ; CHECK-NEXT:    # implicit-def: $x10
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB2_2: # %if.else
-; CHECK-NEXT:    vsetvli a0, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfsub.vv v9, v8, v9
 ; CHECK-NEXT:    vfmul.vv v8, v9, v8
 ; CHECK-NEXT:    # implicit-def: $x10
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
index c66eb5717048..ef834403fb4f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
@@ -499,7 +499,7 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr = COPY $x11
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY undef $noreg
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:gpr = IMPLICIT_DEF
   ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 killed $x0, 223 /* e64, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype
   ; CHECK-NEXT:   [[PseudoVID_V_MF2_:%[0-9]+]]:vr = PseudoVID_V_MF2 undef $noreg, -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_1:%[0-9]+]]:gpr = PseudoVSETVLIX0 killed $x0, 215 /* e32, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype
@@ -514,8 +514,8 @@ body:             |
   ; CHECK-NEXT:   [[PseudoVLE32_V_MF2_MASK:%[0-9]+]]:vrnov0 = PseudoVLE32_V_MF2_MASK [[PseudoVMV_V_I_MF2_]], killed [[COPY]], $v0, -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 197 /* e8, mf8, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   [[PseudoVCPOP_M_B1_:%[0-9]+]]:gpr = PseudoVCPOP_M_B1 [[PseudoVMSEQ_VI_MF2_]], -1, 0 /* e8 */, implicit $vl, implicit $vtype
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gpr = COPY $x0
-  ; CHECK-NEXT:   BEQ killed [[PseudoVCPOP_M_B1_]], [[COPY3]], %bb.3
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x0
+  ; CHECK-NEXT:   BEQ killed [[PseudoVCPOP_M_B1_]], [[COPY2]], %bb.3
   ; CHECK-NEXT:   PseudoBR %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
@@ -524,7 +524,7 @@ body:             |
   ; CHECK-NEXT:   [[LWU:%[0-9]+]]:gpr = LWU [[COPY1]], 0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr = PHI [[COPY2]], %bb.1, [[LWU]], %bb.2
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr = PHI [[DEF]], %bb.1, [[LWU]], %bb.2
   ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 215 /* e32, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   [[PseudoVADD_VX_MF2_:%[0-9]+]]:vr = nsw PseudoVADD_VX_MF2 undef $noreg, [[PseudoVLE32_V_MF2_MASK]], [[PHI]], -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   $v0 = COPY [[PseudoVADD_VX_MF2_]]
@@ -535,7 +535,7 @@ body:             |
 
     %0:gpr = COPY $x11
     %1:gpr = COPY $x10
-    %2:gpr = COPY undef $noreg
+    %2:gpr = IMPLICIT_DEF
     %3:vr = PseudoVID_V_MF2 undef $noreg, -1, 6, 0
     %4:vrnov0 = PseudoVMV_V_I_MF2 undef $noreg, 0, -1, 5, 0
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsmul-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vsmul-rv32.ll
index d1fcb0f47cb5..e7d8ae635f75 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsmul-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsmul-rv32.ll
@@ -15,8 +15,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -39,8 +39,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -63,8 +63,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vsmul_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -87,8 +87,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vsmul_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -111,8 +111,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vsmul_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -135,8 +135,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vsmul_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -159,8 +159,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vsmul_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -183,8 +183,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vsmul_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -207,8 +207,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vsmul_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -231,8 +231,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vsmul_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -255,8 +255,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vsmul_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -279,8 +279,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vsmul_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -303,8 +303,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.nxv64i8(
 define <vscale x 64 x i8> @intrinsic_vsmul_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -328,8 +328,8 @@ define <vscale x 64 x i8> @intrinsic_vsmul_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vsca
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -352,8 +352,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vsmul_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -376,8 +376,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vsmul_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -400,8 +400,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vsmul_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -424,8 +424,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vsmul_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -448,8 +448,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vsmul_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -472,8 +472,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vsmul_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -496,8 +496,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vsmul_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -520,8 +520,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vsmul_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -544,8 +544,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vsmul_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -568,8 +568,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vsmul_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -592,8 +592,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.nxv32i16(
 define <vscale x 32 x i16> @intrinsic_vsmul_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -617,8 +617,8 @@ define <vscale x 32 x i16> @intrinsic_vsmul_mask_vv_nxv32i16_nxv32i16_nxv32i16(<
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -641,8 +641,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vsmul_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -665,8 +665,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vsmul_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -689,8 +689,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vsmul_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -713,8 +713,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vsmul_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -737,8 +737,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vsmul_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -761,8 +761,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vsmul_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -785,8 +785,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vsmul_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -809,8 +809,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vsmul_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -833,8 +833,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.nxv16i32(
 define <vscale x 16 x i32> @intrinsic_vsmul_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -858,8 +858,8 @@ define <vscale x 16 x i32> @intrinsic_vsmul_mask_vv_nxv16i32_nxv16i32_nxv16i32(<
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -882,8 +882,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(
 define <vscale x 1 x i64> @intrinsic_vsmul_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -906,8 +906,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(
 define <vscale x 1 x i64> @intrinsic_vsmul_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -930,8 +930,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.nxv2i64(
 define <vscale x 2 x i64> @intrinsic_vsmul_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -954,8 +954,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.nxv2i64(
 define <vscale x 2 x i64> @intrinsic_vsmul_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -978,8 +978,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.nxv4i64(
 define <vscale x 4 x i64> @intrinsic_vsmul_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -1002,8 +1002,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.nxv4i64(
 define <vscale x 4 x i64> @intrinsic_vsmul_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1026,8 +1026,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.nxv8i64(
 define <vscale x 8 x i64> @intrinsic_vsmul_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -1051,8 +1051,8 @@ define <vscale x 8 x i64> @intrinsic_vsmul_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vsca
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1075,8 +1075,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.i8(
 define <vscale x 1 x i8> @intrinsic_vsmul_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1099,8 +1099,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.i8(
 define <vscale x 1 x i8> @intrinsic_vsmul_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1123,8 +1123,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.i8(
 define <vscale x 2 x i8> @intrinsic_vsmul_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1147,8 +1147,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.i8(
 define <vscale x 2 x i8> @intrinsic_vsmul_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1171,8 +1171,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.i8(
 define <vscale x 4 x i8> @intrinsic_vsmul_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1195,8 +1195,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.i8(
 define <vscale x 4 x i8> @intrinsic_vsmul_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1219,8 +1219,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.i8(
 define <vscale x 8 x i8> @intrinsic_vsmul_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1243,8 +1243,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.i8(
 define <vscale x 8 x i8> @intrinsic_vsmul_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1267,8 +1267,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.i8(
 define <vscale x 16 x i8> @intrinsic_vsmul_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1291,8 +1291,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.i8(
 define <vscale x 16 x i8> @intrinsic_vsmul_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1315,8 +1315,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.i8(
 define <vscale x 32 x i8> @intrinsic_vsmul_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1339,8 +1339,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.i8(
 define <vscale x 32 x i8> @intrinsic_vsmul_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1363,8 +1363,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.i8(
 define <vscale x 64 x i8> @intrinsic_vsmul_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1387,8 +1387,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vsmul.mask.nxv64i8.i8(
 define <vscale x 64 x i8> @intrinsic_vsmul_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1411,8 +1411,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.i16(
 define <vscale x 1 x i16> @intrinsic_vsmul_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1435,8 +1435,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.i16(
 define <vscale x 1 x i16> @intrinsic_vsmul_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1459,8 +1459,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.i16(
 define <vscale x 2 x i16> @intrinsic_vsmul_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1483,8 +1483,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.i16(
 define <vscale x 2 x i16> @intrinsic_vsmul_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1507,8 +1507,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.i16(
 define <vscale x 4 x i16> @intrinsic_vsmul_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1531,8 +1531,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.i16(
 define <vscale x 4 x i16> @intrinsic_vsmul_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1555,8 +1555,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.i16(
 define <vscale x 8 x i16> @intrinsic_vsmul_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1579,8 +1579,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.i16(
 define <vscale x 8 x i16> @intrinsic_vsmul_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1603,8 +1603,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.i16(
 define <vscale x 16 x i16> @intrinsic_vsmul_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1627,8 +1627,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.i16(
 define <vscale x 16 x i16> @intrinsic_vsmul_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1651,8 +1651,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.i16(
 define <vscale x 32 x i16> @intrinsic_vsmul_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1675,8 +1675,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vsmul.mask.nxv32i16.i16(
 define <vscale x 32 x i16> @intrinsic_vsmul_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1698,8 +1698,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.i32(
 define <vscale x 1 x i32> @intrinsic_vsmul_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1722,8 +1722,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.i32(
 define <vscale x 1 x i32> @intrinsic_vsmul_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1745,8 +1745,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.i32(
 define <vscale x 2 x i32> @intrinsic_vsmul_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1769,8 +1769,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.i32(
 define <vscale x 2 x i32> @intrinsic_vsmul_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1792,8 +1792,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.i32(
 define <vscale x 4 x i32> @intrinsic_vsmul_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1816,8 +1816,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.i32(
 define <vscale x 4 x i32> @intrinsic_vsmul_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1839,8 +1839,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.i32(
 define <vscale x 8 x i32> @intrinsic_vsmul_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1863,8 +1863,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.i32(
 define <vscale x 8 x i32> @intrinsic_vsmul_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1886,8 +1886,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.i32(
 define <vscale x 16 x i32> @intrinsic_vsmul_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1910,8 +1910,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vsmul.mask.nxv16i32.i32(
 define <vscale x 16 x i32> @intrinsic_vsmul_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsmul-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vsmul-rv64.ll
index 1fe1baf1cef2..66bc5c9103a4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsmul-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsmul-rv64.ll
@@ -15,8 +15,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -39,8 +39,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -63,8 +63,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vsmul_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -87,8 +87,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.nxv2i8(
 define <vscale x 2 x i8> @intrinsic_vsmul_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i8_nxv2i8_nxv2i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -111,8 +111,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vsmul_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -135,8 +135,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.nxv4i8(
 define <vscale x 4 x i8> @intrinsic_vsmul_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i8_nxv4i8_nxv4i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -159,8 +159,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vsmul_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -183,8 +183,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.nxv8i8(
 define <vscale x 8 x i8> @intrinsic_vsmul_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i8_nxv8i8_nxv8i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -207,8 +207,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vsmul_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -231,8 +231,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.nxv16i8(
 define <vscale x 16 x i8> @intrinsic_vsmul_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i8_nxv16i8_nxv16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -255,8 +255,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vsmul_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -279,8 +279,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.nxv32i8(
 define <vscale x 32 x i8> @intrinsic_vsmul_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv32i8_nxv32i8_nxv32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -303,8 +303,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.nxv64i8(
 define <vscale x 64 x i8> @intrinsic_vsmul_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -328,8 +328,8 @@ define <vscale x 64 x i8> @intrinsic_vsmul_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vsca
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv64i8_nxv64i8_nxv64i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -352,8 +352,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vsmul_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -376,8 +376,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.nxv1i16(
 define <vscale x 1 x i16> @intrinsic_vsmul_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i16_nxv1i16_nxv1i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -400,8 +400,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vsmul_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -424,8 +424,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.nxv2i16(
 define <vscale x 2 x i16> @intrinsic_vsmul_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i16_nxv2i16_nxv2i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -448,8 +448,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vsmul_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -472,8 +472,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.nxv4i16(
 define <vscale x 4 x i16> @intrinsic_vsmul_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i16_nxv4i16_nxv4i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -496,8 +496,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vsmul_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -520,8 +520,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.nxv8i16(
 define <vscale x 8 x i16> @intrinsic_vsmul_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i16_nxv8i16_nxv8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -544,8 +544,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vsmul_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -568,8 +568,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.nxv16i16(
 define <vscale x 16 x i16> @intrinsic_vsmul_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i16_nxv16i16_nxv16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -592,8 +592,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.nxv32i16(
 define <vscale x 32 x i16> @intrinsic_vsmul_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -617,8 +617,8 @@ define <vscale x 32 x i16> @intrinsic_vsmul_mask_vv_nxv32i16_nxv32i16_nxv32i16(<
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv32i16_nxv32i16_nxv32i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -641,8 +641,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vsmul_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -665,8 +665,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.nxv1i32(
 define <vscale x 1 x i32> @intrinsic_vsmul_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i32_nxv1i32_nxv1i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -689,8 +689,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vsmul_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -713,8 +713,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.nxv2i32(
 define <vscale x 2 x i32> @intrinsic_vsmul_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i32_nxv2i32_nxv2i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -737,8 +737,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vsmul_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -761,8 +761,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.nxv4i32(
 define <vscale x 4 x i32> @intrinsic_vsmul_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i32_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -785,8 +785,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vsmul_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -809,8 +809,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.nxv8i32(
 define <vscale x 8 x i32> @intrinsic_vsmul_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i32_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -833,8 +833,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.nxv16i32(
 define <vscale x 16 x i32> @intrinsic_vsmul_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -858,8 +858,8 @@ define <vscale x 16 x i32> @intrinsic_vsmul_mask_vv_nxv16i32_nxv16i32_nxv16i32(<
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i32_nxv16i32_nxv16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -882,8 +882,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(
 define <vscale x 1 x i64> @intrinsic_vsmul_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -906,8 +906,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(
 define <vscale x 1 x i64> @intrinsic_vsmul_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i64_nxv1i64_nxv1i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v9, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -930,8 +930,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.nxv2i64(
 define <vscale x 2 x i64> @intrinsic_vsmul_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -954,8 +954,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.nxv2i64(
 define <vscale x 2 x i64> @intrinsic_vsmul_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i64_nxv2i64_nxv2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -978,8 +978,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.nxv4i64(
 define <vscale x 4 x i64> @intrinsic_vsmul_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -1002,8 +1002,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.nxv4i64(
 define <vscale x 4 x i64> @intrinsic_vsmul_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i64_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1026,8 +1026,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.nxv8i64(
 define <vscale x 8 x i64> @intrinsic_vsmul_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -1051,8 +1051,8 @@ define <vscale x 8 x i64> @intrinsic_vsmul_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vsca
 ; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i64_nxv8i64_nxv8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1075,8 +1075,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.i8(
 define <vscale x 1 x i8> @intrinsic_vsmul_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1099,8 +1099,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.i8(
 define <vscale x 1 x i8> @intrinsic_vsmul_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i8_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1123,8 +1123,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.i8(
 define <vscale x 2 x i8> @intrinsic_vsmul_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1147,8 +1147,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.i8(
 define <vscale x 2 x i8> @intrinsic_vsmul_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i8_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1171,8 +1171,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.i8(
 define <vscale x 4 x i8> @intrinsic_vsmul_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1195,8 +1195,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.i8(
 define <vscale x 4 x i8> @intrinsic_vsmul_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i8_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1219,8 +1219,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.i8(
 define <vscale x 8 x i8> @intrinsic_vsmul_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1243,8 +1243,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.i8(
 define <vscale x 8 x i8> @intrinsic_vsmul_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i8_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1267,8 +1267,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.i8(
 define <vscale x 16 x i8> @intrinsic_vsmul_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1291,8 +1291,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.i8(
 define <vscale x 16 x i8> @intrinsic_vsmul_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i8_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1315,8 +1315,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.i8(
 define <vscale x 32 x i8> @intrinsic_vsmul_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1339,8 +1339,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.i8(
 define <vscale x 32 x i8> @intrinsic_vsmul_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv32i8_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1363,8 +1363,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.i8(
 define <vscale x 64 x i8> @intrinsic_vsmul_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1387,8 +1387,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vsmul.mask.nxv64i8.i8(
 define <vscale x 64 x i8> @intrinsic_vsmul_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv64i8_nxv64i8_i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1411,8 +1411,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.i16(
 define <vscale x 1 x i16> @intrinsic_vsmul_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1435,8 +1435,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.i16(
 define <vscale x 1 x i16> @intrinsic_vsmul_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i16_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1459,8 +1459,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.i16(
 define <vscale x 2 x i16> @intrinsic_vsmul_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1483,8 +1483,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.i16(
 define <vscale x 2 x i16> @intrinsic_vsmul_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i16_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1507,8 +1507,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.i16(
 define <vscale x 4 x i16> @intrinsic_vsmul_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1531,8 +1531,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.i16(
 define <vscale x 4 x i16> @intrinsic_vsmul_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i16_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1555,8 +1555,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.i16(
 define <vscale x 8 x i16> @intrinsic_vsmul_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1579,8 +1579,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.i16(
 define <vscale x 8 x i16> @intrinsic_vsmul_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i16_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1603,8 +1603,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.i16(
 define <vscale x 16 x i16> @intrinsic_vsmul_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1627,8 +1627,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.i16(
 define <vscale x 16 x i16> @intrinsic_vsmul_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i16_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1651,8 +1651,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.i16(
 define <vscale x 32 x i16> @intrinsic_vsmul_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1675,8 +1675,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vsmul.mask.nxv32i16.i16(
 define <vscale x 32 x i16> @intrinsic_vsmul_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv32i16_nxv32i16_i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1699,8 +1699,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.i32(
 define <vscale x 1 x i32> @intrinsic_vsmul_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1723,8 +1723,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.i32(
 define <vscale x 1 x i32> @intrinsic_vsmul_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i32_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1747,8 +1747,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.i32(
 define <vscale x 2 x i32> @intrinsic_vsmul_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1771,8 +1771,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.i32(
 define <vscale x 2 x i32> @intrinsic_vsmul_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i32_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1795,8 +1795,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.i32(
 define <vscale x 4 x i32> @intrinsic_vsmul_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1819,8 +1819,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.i32(
 define <vscale x 4 x i32> @intrinsic_vsmul_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i32_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1843,8 +1843,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.i32(
 define <vscale x 8 x i32> @intrinsic_vsmul_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1867,8 +1867,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.i32(
 define <vscale x 8 x i32> @intrinsic_vsmul_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i32_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1891,8 +1891,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.i32(
 define <vscale x 16 x i32> @intrinsic_vsmul_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1915,8 +1915,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vsmul.mask.nxv16i32.i32(
 define <vscale x 16 x i32> @intrinsic_vsmul_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i32_nxv16i32_i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1938,8 +1938,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.i64(
 define <vscale x 1 x i64> @intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -1962,8 +1962,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.i64(
 define <vscale x 1 x i64> @intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v9, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1985,8 +1985,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.i64(
 define <vscale x 2 x i64> @intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -2009,8 +2009,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.i64(
 define <vscale x 2 x i64> @intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -2032,8 +2032,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.i64(
 define <vscale x 4 x i64> @intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -2056,8 +2056,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.i64(
 define <vscale x 4 x i64> @intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -2079,8 +2079,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.i64(
 define <vscale x 8 x i64> @intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vsmul.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -2103,8 +2103,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.i64(
 define <vscale x 8 x i64> @intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
 ; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
 ; CHECK-NEXT:    vsmul.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssra-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vssra-rv32.ll
index 8e28dd490a87..7fd1b05bb444 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssra-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssra-rv32.ll
@@ -5,8 +5,8 @@
 define <vscale x 1 x i8> @test_vssra_vv_i8mf8(<vscale x 1 x i8> %op1, <vscale x 1 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8mf8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -19,8 +19,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssra.nxv1i8.nxv1i8.i32(<vscale x 1 x i8>,
 define <vscale x 1 x i8> @test_vssra_vx_i8mf8(<vscale x 1 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8mf8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -33,8 +33,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssra.nxv1i8.i32.i32(<vscale x 1 x i8>, <v
 define <vscale x 2 x i8> @test_vssra_vv_i8mf4(<vscale x 2 x i8> %op1, <vscale x 2 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -47,8 +47,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssra.nxv2i8.nxv2i8.i32(<vscale x 2 x i8>,
 define <vscale x 2 x i8> @test_vssra_vx_i8mf4(<vscale x 2 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -61,8 +61,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssra.nxv2i8.i32.i32(<vscale x 2 x i8>, <v
 define <vscale x 4 x i8> @test_vssra_vv_i8mf2(<vscale x 4 x i8> %op1, <vscale x 4 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -75,8 +75,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssra.nxv4i8.nxv4i8.i32(<vscale x 4 x i8>,
 define <vscale x 4 x i8> @test_vssra_vx_i8mf2(<vscale x 4 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -89,8 +89,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssra.nxv4i8.i32.i32(<vscale x 4 x i8>, <v
 define <vscale x 8 x i8> @test_vssra_vv_i8m1(<vscale x 8 x i8> %op1, <vscale x 8 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -103,8 +103,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssra.nxv8i8.nxv8i8.i32(<vscale x 8 x i8>,
 define <vscale x 8 x i8> @test_vssra_vx_i8m1(<vscale x 8 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -117,8 +117,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssra.nxv8i8.i32.i32(<vscale x 8 x i8>, <v
 define <vscale x 16 x i8> @test_vssra_vv_i8m2(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -131,8 +131,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssra.nxv16i8.nxv16i8.i32(<vscale x 16 x
 define <vscale x 16 x i8> @test_vssra_vx_i8m2(<vscale x 16 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -145,8 +145,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssra.nxv16i8.i32.i32(<vscale x 16 x i8>,
 define <vscale x 32 x i8> @test_vssra_vv_i8m4(<vscale x 32 x i8> %op1, <vscale x 32 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -159,8 +159,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssra.nxv32i8.nxv32i8.i32(<vscale x 32 x
 define <vscale x 32 x i8> @test_vssra_vx_i8m4(<vscale x 32 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -173,8 +173,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssra.nxv32i8.i32.i32(<vscale x 32 x i8>,
 define <vscale x 64 x i8> @test_vssra_vv_i8m8(<vscale x 64 x i8> %op1, <vscale x 64 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -187,8 +187,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssra.nxv64i8.nxv64i8.i32(<vscale x 64 x
 define <vscale x 64 x i8> @test_vssra_vx_i8m8(<vscale x 64 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -201,8 +201,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssra.nxv64i8.i32.i32(<vscale x 64 x i8>,
 define <vscale x 1 x i16> @test_vssra_vv_i16mf4(<vscale x 1 x i16> %op1, <vscale x 1 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -215,8 +215,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssra.nxv1i16.nxv1i16.i32(<vscale x 1 x i
 define <vscale x 1 x i16> @test_vssra_vx_i16mf4(<vscale x 1 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -229,8 +229,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssra.nxv1i16.i32.i32(<vscale x 1 x i16>,
 define <vscale x 2 x i16> @test_vssra_vv_i16mf2(<vscale x 2 x i16> %op1, <vscale x 2 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -243,8 +243,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssra.nxv2i16.nxv2i16.i32(<vscale x 2 x i
 define <vscale x 2 x i16> @test_vssra_vx_i16mf2(<vscale x 2 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -257,8 +257,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssra.nxv2i16.i32.i32(<vscale x 2 x i16>,
 define <vscale x 4 x i16> @test_vssra_vv_i16m1(<vscale x 4 x i16> %op1, <vscale x 4 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -271,8 +271,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssra.nxv4i16.nxv4i16.i32(<vscale x 4 x i
 define <vscale x 4 x i16> @test_vssra_vx_i16m1(<vscale x 4 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -285,8 +285,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssra.nxv4i16.i32.i32(<vscale x 4 x i16>,
 define <vscale x 8 x i16> @test_vssra_vv_i16m2(<vscale x 8 x i16> %op1, <vscale x 8 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -299,8 +299,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssra.nxv8i16.nxv8i16.i32(<vscale x 8 x i
 define <vscale x 8 x i16> @test_vssra_vx_i16m2(<vscale x 8 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -313,8 +313,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssra.nxv8i16.i32.i32(<vscale x 8 x i16>,
 define <vscale x 16 x i16> @test_vssra_vv_i16m4(<vscale x 16 x i16> %op1, <vscale x 16 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -327,8 +327,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssra.nxv16i16.nxv16i16.i32(<vscale x 16
 define <vscale x 16 x i16> @test_vssra_vx_i16m4(<vscale x 16 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -341,8 +341,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssra.nxv16i16.i32.i32(<vscale x 16 x i1
 define <vscale x 32 x i16> @test_vssra_vv_i16m8(<vscale x 32 x i16> %op1, <vscale x 32 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -355,8 +355,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssra.nxv32i16.nxv32i16.i32(<vscale x 32
 define <vscale x 32 x i16> @test_vssra_vx_i16m8(<vscale x 32 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -369,8 +369,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssra.nxv32i16.i32.i32(<vscale x 32 x i1
 define <vscale x 1 x i32> @test_vssra_vv_i32mf2(<vscale x 1 x i32> %op1, <vscale x 1 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -383,8 +383,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssra.nxv1i32.nxv1i32.i32(<vscale x 1 x i
 define <vscale x 1 x i32> @test_vssra_vx_i32mf2(<vscale x 1 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -397,8 +397,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssra.nxv1i32.i32.i32(<vscale x 1 x i32>,
 define <vscale x 2 x i32> @test_vssra_vv_i32m1(<vscale x 2 x i32> %op1, <vscale x 2 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -411,8 +411,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssra.nxv2i32.nxv2i32.i32(<vscale x 2 x i
 define <vscale x 2 x i32> @test_vssra_vx_i32m1(<vscale x 2 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -425,8 +425,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssra.nxv2i32.i32.i32(<vscale x 2 x i32>,
 define <vscale x 4 x i32> @test_vssra_vv_i32m2(<vscale x 4 x i32> %op1, <vscale x 4 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -439,8 +439,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssra.nxv4i32.nxv4i32.i32(<vscale x 4 x i
 define <vscale x 4 x i32> @test_vssra_vx_i32m2(<vscale x 4 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -453,8 +453,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssra.nxv4i32.i32.i32(<vscale x 4 x i32>,
 define <vscale x 8 x i32> @test_vssra_vv_i32m4(<vscale x 8 x i32> %op1, <vscale x 8 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -467,8 +467,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssra.nxv8i32.nxv8i32.i32(<vscale x 8 x i
 define <vscale x 8 x i32> @test_vssra_vx_i32m4(<vscale x 8 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -481,8 +481,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssra.nxv8i32.i32.i32(<vscale x 8 x i32>,
 define <vscale x 16 x i32> @test_vssra_vv_i32m8(<vscale x 16 x i32> %op1, <vscale x 16 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -495,8 +495,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssra.nxv16i32.nxv16i32.i32(<vscale x 16
 define <vscale x 16 x i32> @test_vssra_vx_i32m8(<vscale x 16 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -509,8 +509,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssra.nxv16i32.i32.i32(<vscale x 16 x i3
 define <vscale x 1 x i64> @test_vssra_vv_i64m1(<vscale x 1 x i64> %op1, <vscale x 1 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -523,8 +523,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssra.nxv1i64.nxv1i64.i32(<vscale x 1 x i
 define <vscale x 1 x i64> @test_vssra_vx_i64m1(<vscale x 1 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -537,8 +537,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssra.nxv1i64.i32.i32(<vscale x 1 x i64>,
 define <vscale x 2 x i64> @test_vssra_vv_i64m2(<vscale x 2 x i64> %op1, <vscale x 2 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -551,8 +551,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssra.nxv2i64.nxv2i64.i32(<vscale x 2 x i
 define <vscale x 2 x i64> @test_vssra_vx_i64m2(<vscale x 2 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -565,8 +565,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssra.nxv2i64.i32.i32(<vscale x 2 x i64>,
 define <vscale x 4 x i64> @test_vssra_vv_i64m4(<vscale x 4 x i64> %op1, <vscale x 4 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -579,8 +579,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssra.nxv4i64.nxv4i64.i32(<vscale x 4 x i
 define <vscale x 4 x i64> @test_vssra_vx_i64m4(<vscale x 4 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -593,8 +593,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssra.nxv4i64.i32.i32(<vscale x 4 x i64>,
 define <vscale x 8 x i64> @test_vssra_vv_i64m8(<vscale x 8 x i64> %op1, <vscale x 8 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -607,8 +607,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vssra.nxv8i64.nxv8i64.i32(<vscale x 8 x i
 define <vscale x 8 x i64> @test_vssra_vx_i64m8(<vscale x 8 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -621,8 +621,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vssra.nxv8i64.i32.i32(<vscale x 8 x i64>,
 define <vscale x 1 x i8> @test_vssra_vv_i8mf8_m(<vscale x 1 x i1> %mask, <vscale x 1 x i8> %op1, <vscale x 1 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8mf8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -635,8 +635,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssra.mask.nxv1i8.nxv1i8.i32(<vscale x 1 x
 define <vscale x 1 x i8> @test_vssra_vx_i8mf8_m(<vscale x 1 x i1> %mask, <vscale x 1 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8mf8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -649,8 +649,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssra.mask.nxv1i8.i32.i32(<vscale x 1 x i8
 define <vscale x 2 x i8> @test_vssra_vv_i8mf4_m(<vscale x 2 x i1> %mask, <vscale x 2 x i8> %op1, <vscale x 2 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -663,8 +663,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssra.mask.nxv2i8.nxv2i8.i32(<vscale x 2 x
 define <vscale x 2 x i8> @test_vssra_vx_i8mf4_m(<vscale x 2 x i1> %mask, <vscale x 2 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -677,8 +677,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssra.mask.nxv2i8.i32.i32(<vscale x 2 x i8
 define <vscale x 4 x i8> @test_vssra_vv_i8mf2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i8> %op1, <vscale x 4 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -691,8 +691,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssra.mask.nxv4i8.nxv4i8.i32(<vscale x 4 x
 define <vscale x 4 x i8> @test_vssra_vx_i8mf2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -705,8 +705,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssra.mask.nxv4i8.i32.i32(<vscale x 4 x i8
 define <vscale x 8 x i8> @test_vssra_vv_i8m1_m(<vscale x 8 x i1> %mask, <vscale x 8 x i8> %op1, <vscale x 8 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -719,8 +719,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssra.mask.nxv8i8.nxv8i8.i32(<vscale x 8 x
 define <vscale x 8 x i8> @test_vssra_vx_i8m1_m(<vscale x 8 x i1> %mask, <vscale x 8 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -733,8 +733,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssra.mask.nxv8i8.i32.i32(<vscale x 8 x i8
 define <vscale x 16 x i8> @test_vssra_vv_i8m2_m(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %op1, <vscale x 16 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -747,8 +747,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssra.mask.nxv16i8.nxv16i8.i32(<vscale x
 define <vscale x 16 x i8> @test_vssra_vx_i8m2_m(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -761,8 +761,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssra.mask.nxv16i8.i32.i32(<vscale x 16 x
 define <vscale x 32 x i8> @test_vssra_vv_i8m4_m(<vscale x 32 x i1> %mask, <vscale x 32 x i8> %op1, <vscale x 32 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -775,8 +775,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssra.mask.nxv32i8.nxv32i8.i32(<vscale x
 define <vscale x 32 x i8> @test_vssra_vx_i8m4_m(<vscale x 32 x i1> %mask, <vscale x 32 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -789,8 +789,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssra.mask.nxv32i8.i32.i32(<vscale x 32 x
 define <vscale x 64 x i8> @test_vssra_vv_i8m8_m(<vscale x 64 x i1> %mask, <vscale x 64 x i8> %op1, <vscale x 64 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -803,8 +803,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssra.mask.nxv64i8.nxv64i8.i32(<vscale x
 define <vscale x 64 x i8> @test_vssra_vx_i8m8_m(<vscale x 64 x i1> %mask, <vscale x 64 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -817,8 +817,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssra.mask.nxv64i8.i32.i32(<vscale x 64 x
 define <vscale x 1 x i16> @test_vssra_vv_i16mf4_m(<vscale x 1 x i1> %mask, <vscale x 1 x i16> %op1, <vscale x 1 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -831,8 +831,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssra.mask.nxv1i16.nxv1i16.i32(<vscale x
 define <vscale x 1 x i16> @test_vssra_vx_i16mf4_m(<vscale x 1 x i1> %mask, <vscale x 1 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -845,8 +845,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssra.mask.nxv1i16.i32.i32(<vscale x 1 x
 define <vscale x 2 x i16> @test_vssra_vv_i16mf2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i16> %op1, <vscale x 2 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -859,8 +859,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssra.mask.nxv2i16.nxv2i16.i32(<vscale x
 define <vscale x 2 x i16> @test_vssra_vx_i16mf2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -873,8 +873,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssra.mask.nxv2i16.i32.i32(<vscale x 2 x
 define <vscale x 4 x i16> @test_vssra_vv_i16m1_m(<vscale x 4 x i1> %mask, <vscale x 4 x i16> %op1, <vscale x 4 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -887,8 +887,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssra.mask.nxv4i16.nxv4i16.i32(<vscale x
 define <vscale x 4 x i16> @test_vssra_vx_i16m1_m(<vscale x 4 x i1> %mask, <vscale x 4 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -901,8 +901,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssra.mask.nxv4i16.i32.i32(<vscale x 4 x
 define <vscale x 8 x i16> @test_vssra_vv_i16m2_m(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %op1, <vscale x 8 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -915,8 +915,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssra.mask.nxv8i16.nxv8i16.i32(<vscale x
 define <vscale x 8 x i16> @test_vssra_vx_i16m2_m(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -929,8 +929,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssra.mask.nxv8i16.i32.i32(<vscale x 8 x
 define <vscale x 16 x i16> @test_vssra_vv_i16m4_m(<vscale x 16 x i1> %mask, <vscale x 16 x i16> %op1, <vscale x 16 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -943,8 +943,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssra.mask.nxv16i16.nxv16i16.i32(<vscale
 define <vscale x 16 x i16> @test_vssra_vx_i16m4_m(<vscale x 16 x i1> %mask, <vscale x 16 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -957,8 +957,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssra.mask.nxv16i16.i32.i32(<vscale x 16
 define <vscale x 32 x i16> @test_vssra_vv_i16m8_m(<vscale x 32 x i1> %mask, <vscale x 32 x i16> %op1, <vscale x 32 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -971,8 +971,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssra.mask.nxv32i16.nxv32i16.i32(<vscale
 define <vscale x 32 x i16> @test_vssra_vx_i16m8_m(<vscale x 32 x i1> %mask, <vscale x 32 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -985,8 +985,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssra.mask.nxv32i16.i32.i32(<vscale x 32
 define <vscale x 1 x i32> @test_vssra_vv_i32mf2_m(<vscale x 1 x i1> %mask, <vscale x 1 x i32> %op1, <vscale x 1 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -999,8 +999,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssra.mask.nxv1i32.nxv1i32.i32(<vscale x
 define <vscale x 1 x i32> @test_vssra_vx_i32mf2_m(<vscale x 1 x i1> %mask, <vscale x 1 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1013,8 +1013,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssra.mask.nxv1i32.i32.i32(<vscale x 1 x
 define <vscale x 2 x i32> @test_vssra_vv_i32m1_m(<vscale x 2 x i1> %mask, <vscale x 2 x i32> %op1, <vscale x 2 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1027,8 +1027,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssra.mask.nxv2i32.nxv2i32.i32(<vscale x
 define <vscale x 2 x i32> @test_vssra_vx_i32m1_m(<vscale x 2 x i1> %mask, <vscale x 2 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1041,8 +1041,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssra.mask.nxv2i32.i32.i32(<vscale x 2 x
 define <vscale x 4 x i32> @test_vssra_vv_i32m2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %op1, <vscale x 4 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1055,8 +1055,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssra.mask.nxv4i32.nxv4i32.i32(<vscale x
 define <vscale x 4 x i32> @test_vssra_vx_i32m2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1069,8 +1069,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssra.mask.nxv4i32.i32.i32(<vscale x 4 x
 define <vscale x 8 x i32> @test_vssra_vv_i32m4_m(<vscale x 8 x i1> %mask, <vscale x 8 x i32> %op1, <vscale x 8 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1083,8 +1083,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssra.mask.nxv8i32.nxv8i32.i32(<vscale x
 define <vscale x 8 x i32> @test_vssra_vx_i32m4_m(<vscale x 8 x i1> %mask, <vscale x 8 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1097,8 +1097,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssra.mask.nxv8i32.i32.i32(<vscale x 8 x
 define <vscale x 16 x i32> @test_vssra_vv_i32m8_m(<vscale x 16 x i1> %mask, <vscale x 16 x i32> %op1, <vscale x 16 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1111,8 +1111,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssra.mask.nxv16i32.nxv16i32.i32(<vscale
 define <vscale x 16 x i32> @test_vssra_vx_i32m8_m(<vscale x 16 x i1> %mask, <vscale x 16 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1125,8 +1125,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssra.mask.nxv16i32.i32.i32(<vscale x 16
 define <vscale x 1 x i64> @test_vssra_vv_i64m1_m(<vscale x 1 x i1> %mask, <vscale x 1 x i64> %op1, <vscale x 1 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1139,8 +1139,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssra.mask.nxv1i64.nxv1i64.i32(<vscale x
 define <vscale x 1 x i64> @test_vssra_vx_i64m1_m(<vscale x 1 x i1> %mask, <vscale x 1 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1153,8 +1153,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssra.mask.nxv1i64.i32.i32(<vscale x 1 x
 define <vscale x 2 x i64> @test_vssra_vv_i64m2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %op1, <vscale x 2 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1167,8 +1167,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssra.mask.nxv2i64.nxv2i64.i32(<vscale x
 define <vscale x 2 x i64> @test_vssra_vx_i64m2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1181,8 +1181,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssra.mask.nxv2i64.i32.i32(<vscale x 2 x
 define <vscale x 4 x i64> @test_vssra_vv_i64m4_m(<vscale x 4 x i1> %mask, <vscale x 4 x i64> %op1, <vscale x 4 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1195,8 +1195,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssra.mask.nxv4i64.nxv4i64.i32(<vscale x
 define <vscale x 4 x i64> @test_vssra_vx_i64m4_m(<vscale x 4 x i1> %mask, <vscale x 4 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1209,8 +1209,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssra.mask.nxv4i64.i32.i32(<vscale x 4 x
 define <vscale x 8 x i64> @test_vssra_vv_i64m8_m(<vscale x 8 x i1> %mask, <vscale x 8 x i64> %op1, <vscale x 8 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1223,8 +1223,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vssra.mask.nxv8i64.nxv8i64.i32(<vscale x
 define <vscale x 8 x i64> @test_vssra_vx_i64m8_m(<vscale x 8 x i1> %mask, <vscale x 8 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssra-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vssra-rv64.ll
index 96ca5e32cf36..b7a84e58e6e6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssra-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssra-rv64.ll
@@ -5,8 +5,8 @@
 define <vscale x 1 x i8> @test_vssra_vv_i8mf8(<vscale x 1 x i8> %op1, <vscale x 1 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8mf8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -19,8 +19,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssra.nxv1i8.nxv1i8.i64(<vscale x 1 x i8>,
 define <vscale x 1 x i8> @test_vssra_vx_i8mf8(<vscale x 1 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8mf8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -33,8 +33,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssra.nxv1i8.i64.i64(<vscale x 1 x i8>, <v
 define <vscale x 2 x i8> @test_vssra_vv_i8mf4(<vscale x 2 x i8> %op1, <vscale x 2 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -47,8 +47,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssra.nxv2i8.nxv2i8.i64(<vscale x 2 x i8>,
 define <vscale x 2 x i8> @test_vssra_vx_i8mf4(<vscale x 2 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -61,8 +61,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssra.nxv2i8.i64.i64(<vscale x 2 x i8>, <v
 define <vscale x 4 x i8> @test_vssra_vv_i8mf2(<vscale x 4 x i8> %op1, <vscale x 4 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -75,8 +75,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssra.nxv4i8.nxv4i8.i64(<vscale x 4 x i8>,
 define <vscale x 4 x i8> @test_vssra_vx_i8mf2(<vscale x 4 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -89,8 +89,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssra.nxv4i8.i64.i64(<vscale x 4 x i8>, <v
 define <vscale x 8 x i8> @test_vssra_vv_i8m1(<vscale x 8 x i8> %op1, <vscale x 8 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -103,8 +103,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssra.nxv8i8.nxv8i8.i64(<vscale x 8 x i8>,
 define <vscale x 8 x i8> @test_vssra_vx_i8m1(<vscale x 8 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -117,8 +117,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssra.nxv8i8.i64.i64(<vscale x 8 x i8>, <v
 define <vscale x 16 x i8> @test_vssra_vv_i8m2(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -131,8 +131,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssra.nxv16i8.nxv16i8.i64(<vscale x 16 x
 define <vscale x 16 x i8> @test_vssra_vx_i8m2(<vscale x 16 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -145,8 +145,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssra.nxv16i8.i64.i64(<vscale x 16 x i8>,
 define <vscale x 32 x i8> @test_vssra_vv_i8m4(<vscale x 32 x i8> %op1, <vscale x 32 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -159,8 +159,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssra.nxv32i8.nxv32i8.i64(<vscale x 32 x
 define <vscale x 32 x i8> @test_vssra_vx_i8m4(<vscale x 32 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -173,8 +173,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssra.nxv32i8.i64.i64(<vscale x 32 x i8>,
 define <vscale x 64 x i8> @test_vssra_vv_i8m8(<vscale x 64 x i8> %op1, <vscale x 64 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -187,8 +187,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssra.nxv64i8.nxv64i8.i64(<vscale x 64 x
 define <vscale x 64 x i8> @test_vssra_vx_i8m8(<vscale x 64 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -201,8 +201,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssra.nxv64i8.i64.i64(<vscale x 64 x i8>,
 define <vscale x 1 x i16> @test_vssra_vv_i16mf4(<vscale x 1 x i16> %op1, <vscale x 1 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -215,8 +215,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssra.nxv1i16.nxv1i16.i64(<vscale x 1 x i
 define <vscale x 1 x i16> @test_vssra_vx_i16mf4(<vscale x 1 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -229,8 +229,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssra.nxv1i16.i64.i64(<vscale x 1 x i16>,
 define <vscale x 2 x i16> @test_vssra_vv_i16mf2(<vscale x 2 x i16> %op1, <vscale x 2 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -243,8 +243,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssra.nxv2i16.nxv2i16.i64(<vscale x 2 x i
 define <vscale x 2 x i16> @test_vssra_vx_i16mf2(<vscale x 2 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -257,8 +257,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssra.nxv2i16.i64.i64(<vscale x 2 x i16>,
 define <vscale x 4 x i16> @test_vssra_vv_i16m1(<vscale x 4 x i16> %op1, <vscale x 4 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -271,8 +271,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssra.nxv4i16.nxv4i16.i64(<vscale x 4 x i
 define <vscale x 4 x i16> @test_vssra_vx_i16m1(<vscale x 4 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -285,8 +285,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssra.nxv4i16.i64.i64(<vscale x 4 x i16>,
 define <vscale x 8 x i16> @test_vssra_vv_i16m2(<vscale x 8 x i16> %op1, <vscale x 8 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -299,8 +299,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssra.nxv8i16.nxv8i16.i64(<vscale x 8 x i
 define <vscale x 8 x i16> @test_vssra_vx_i16m2(<vscale x 8 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -313,8 +313,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssra.nxv8i16.i64.i64(<vscale x 8 x i16>,
 define <vscale x 16 x i16> @test_vssra_vv_i16m4(<vscale x 16 x i16> %op1, <vscale x 16 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -327,8 +327,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssra.nxv16i16.nxv16i16.i64(<vscale x 16
 define <vscale x 16 x i16> @test_vssra_vx_i16m4(<vscale x 16 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -341,8 +341,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssra.nxv16i16.i64.i64(<vscale x 16 x i1
 define <vscale x 32 x i16> @test_vssra_vv_i16m8(<vscale x 32 x i16> %op1, <vscale x 32 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -355,8 +355,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssra.nxv32i16.nxv32i16.i64(<vscale x 32
 define <vscale x 32 x i16> @test_vssra_vx_i16m8(<vscale x 32 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -369,8 +369,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssra.nxv32i16.i64.i64(<vscale x 32 x i1
 define <vscale x 1 x i32> @test_vssra_vv_i32mf2(<vscale x 1 x i32> %op1, <vscale x 1 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -383,8 +383,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssra.nxv1i32.nxv1i32.i64(<vscale x 1 x i
 define <vscale x 1 x i32> @test_vssra_vx_i32mf2(<vscale x 1 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -397,8 +397,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssra.nxv1i32.i64.i64(<vscale x 1 x i32>,
 define <vscale x 2 x i32> @test_vssra_vv_i32m1(<vscale x 2 x i32> %op1, <vscale x 2 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -411,8 +411,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssra.nxv2i32.nxv2i32.i64(<vscale x 2 x i
 define <vscale x 2 x i32> @test_vssra_vx_i32m1(<vscale x 2 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -425,8 +425,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssra.nxv2i32.i64.i64(<vscale x 2 x i32>,
 define <vscale x 4 x i32> @test_vssra_vv_i32m2(<vscale x 4 x i32> %op1, <vscale x 4 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -439,8 +439,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssra.nxv4i32.nxv4i32.i64(<vscale x 4 x i
 define <vscale x 4 x i32> @test_vssra_vx_i32m2(<vscale x 4 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -453,8 +453,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssra.nxv4i32.i64.i64(<vscale x 4 x i32>,
 define <vscale x 8 x i32> @test_vssra_vv_i32m4(<vscale x 8 x i32> %op1, <vscale x 8 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -467,8 +467,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssra.nxv8i32.nxv8i32.i64(<vscale x 8 x i
 define <vscale x 8 x i32> @test_vssra_vx_i32m4(<vscale x 8 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -481,8 +481,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssra.nxv8i32.i64.i64(<vscale x 8 x i32>,
 define <vscale x 16 x i32> @test_vssra_vv_i32m8(<vscale x 16 x i32> %op1, <vscale x 16 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -495,8 +495,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssra.nxv16i32.nxv16i32.i64(<vscale x 16
 define <vscale x 16 x i32> @test_vssra_vx_i32m8(<vscale x 16 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -509,8 +509,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssra.nxv16i32.i64.i64(<vscale x 16 x i3
 define <vscale x 1 x i64> @test_vssra_vv_i64m1(<vscale x 1 x i64> %op1, <vscale x 1 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -523,8 +523,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssra.nxv1i64.nxv1i64.i64(<vscale x 1 x i
 define <vscale x 1 x i64> @test_vssra_vx_i64m1(<vscale x 1 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -537,8 +537,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssra.nxv1i64.i64.i64(<vscale x 1 x i64>,
 define <vscale x 2 x i64> @test_vssra_vv_i64m2(<vscale x 2 x i64> %op1, <vscale x 2 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -551,8 +551,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssra.nxv2i64.nxv2i64.i64(<vscale x 2 x i
 define <vscale x 2 x i64> @test_vssra_vx_i64m2(<vscale x 2 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -565,8 +565,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssra.nxv2i64.i64.i64(<vscale x 2 x i64>,
 define <vscale x 4 x i64> @test_vssra_vv_i64m4(<vscale x 4 x i64> %op1, <vscale x 4 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -579,8 +579,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssra.nxv4i64.nxv4i64.i64(<vscale x 4 x i
 define <vscale x 4 x i64> @test_vssra_vx_i64m4(<vscale x 4 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -593,8 +593,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssra.nxv4i64.i64.i64(<vscale x 4 x i64>,
 define <vscale x 8 x i64> @test_vssra_vv_i64m8(<vscale x 8 x i64> %op1, <vscale x 8 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -607,8 +607,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vssra.nxv8i64.nxv8i64.i64(<vscale x 8 x i
 define <vscale x 8 x i64> @test_vssra_vx_i64m8(<vscale x 8 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -621,8 +621,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vssra.nxv8i64.i64.i64(<vscale x 8 x i64>,
 define <vscale x 1 x i8> @test_vssra_vv_i8mf8_m(<vscale x 1 x i1> %mask, <vscale x 1 x i8> %op1, <vscale x 1 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8mf8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -635,8 +635,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssra.mask.nxv1i8.nxv1i8.i64(<vscale x 1 x
 define <vscale x 1 x i8> @test_vssra_vx_i8mf8_m(<vscale x 1 x i1> %mask, <vscale x 1 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8mf8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -649,8 +649,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssra.mask.nxv1i8.i64.i64(<vscale x 1 x i8
 define <vscale x 2 x i8> @test_vssra_vv_i8mf4_m(<vscale x 2 x i1> %mask, <vscale x 2 x i8> %op1, <vscale x 2 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -663,8 +663,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssra.mask.nxv2i8.nxv2i8.i64(<vscale x 2 x
 define <vscale x 2 x i8> @test_vssra_vx_i8mf4_m(<vscale x 2 x i1> %mask, <vscale x 2 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -677,8 +677,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssra.mask.nxv2i8.i64.i64(<vscale x 2 x i8
 define <vscale x 4 x i8> @test_vssra_vv_i8mf2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i8> %op1, <vscale x 4 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -691,8 +691,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssra.mask.nxv4i8.nxv4i8.i64(<vscale x 4 x
 define <vscale x 4 x i8> @test_vssra_vx_i8mf2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -705,8 +705,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssra.mask.nxv4i8.i64.i64(<vscale x 4 x i8
 define <vscale x 8 x i8> @test_vssra_vv_i8m1_m(<vscale x 8 x i1> %mask, <vscale x 8 x i8> %op1, <vscale x 8 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -719,8 +719,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssra.mask.nxv8i8.nxv8i8.i64(<vscale x 8 x
 define <vscale x 8 x i8> @test_vssra_vx_i8m1_m(<vscale x 8 x i1> %mask, <vscale x 8 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -733,8 +733,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssra.mask.nxv8i8.i64.i64(<vscale x 8 x i8
 define <vscale x 16 x i8> @test_vssra_vv_i8m2_m(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %op1, <vscale x 16 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -747,8 +747,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssra.mask.nxv16i8.nxv16i8.i64(<vscale x
 define <vscale x 16 x i8> @test_vssra_vx_i8m2_m(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -761,8 +761,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssra.mask.nxv16i8.i64.i64(<vscale x 16 x
 define <vscale x 32 x i8> @test_vssra_vv_i8m4_m(<vscale x 32 x i1> %mask, <vscale x 32 x i8> %op1, <vscale x 32 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -775,8 +775,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssra.mask.nxv32i8.nxv32i8.i64(<vscale x
 define <vscale x 32 x i8> @test_vssra_vx_i8m4_m(<vscale x 32 x i1> %mask, <vscale x 32 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -789,8 +789,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssra.mask.nxv32i8.i64.i64(<vscale x 32 x
 define <vscale x 64 x i8> @test_vssra_vv_i8m8_m(<vscale x 64 x i1> %mask, <vscale x 64 x i8> %op1, <vscale x 64 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i8m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -803,8 +803,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssra.mask.nxv64i8.nxv64i8.i64(<vscale x
 define <vscale x 64 x i8> @test_vssra_vx_i8m8_m(<vscale x 64 x i1> %mask, <vscale x 64 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i8m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -817,8 +817,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssra.mask.nxv64i8.i64.i64(<vscale x 64 x
 define <vscale x 1 x i16> @test_vssra_vv_i16mf4_m(<vscale x 1 x i1> %mask, <vscale x 1 x i16> %op1, <vscale x 1 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -831,8 +831,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssra.mask.nxv1i16.nxv1i16.i64(<vscale x
 define <vscale x 1 x i16> @test_vssra_vx_i16mf4_m(<vscale x 1 x i1> %mask, <vscale x 1 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -845,8 +845,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssra.mask.nxv1i16.i64.i64(<vscale x 1 x
 define <vscale x 2 x i16> @test_vssra_vv_i16mf2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i16> %op1, <vscale x 2 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -859,8 +859,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssra.mask.nxv2i16.nxv2i16.i64(<vscale x
 define <vscale x 2 x i16> @test_vssra_vx_i16mf2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -873,8 +873,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssra.mask.nxv2i16.i64.i64(<vscale x 2 x
 define <vscale x 4 x i16> @test_vssra_vv_i16m1_m(<vscale x 4 x i1> %mask, <vscale x 4 x i16> %op1, <vscale x 4 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -887,8 +887,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssra.mask.nxv4i16.nxv4i16.i64(<vscale x
 define <vscale x 4 x i16> @test_vssra_vx_i16m1_m(<vscale x 4 x i1> %mask, <vscale x 4 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -901,8 +901,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssra.mask.nxv4i16.i64.i64(<vscale x 4 x
 define <vscale x 8 x i16> @test_vssra_vv_i16m2_m(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %op1, <vscale x 8 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -915,8 +915,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssra.mask.nxv8i16.nxv8i16.i64(<vscale x
 define <vscale x 8 x i16> @test_vssra_vx_i16m2_m(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -929,8 +929,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssra.mask.nxv8i16.i64.i64(<vscale x 8 x
 define <vscale x 16 x i16> @test_vssra_vv_i16m4_m(<vscale x 16 x i1> %mask, <vscale x 16 x i16> %op1, <vscale x 16 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -943,8 +943,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssra.mask.nxv16i16.nxv16i16.i64(<vscale
 define <vscale x 16 x i16> @test_vssra_vx_i16m4_m(<vscale x 16 x i1> %mask, <vscale x 16 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -957,8 +957,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssra.mask.nxv16i16.i64.i64(<vscale x 16
 define <vscale x 32 x i16> @test_vssra_vv_i16m8_m(<vscale x 32 x i1> %mask, <vscale x 32 x i16> %op1, <vscale x 32 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i16m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -971,8 +971,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssra.mask.nxv32i16.nxv32i16.i64(<vscale
 define <vscale x 32 x i16> @test_vssra_vx_i16m8_m(<vscale x 32 x i1> %mask, <vscale x 32 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i16m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -985,8 +985,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssra.mask.nxv32i16.i64.i64(<vscale x 32
 define <vscale x 1 x i32> @test_vssra_vv_i32mf2_m(<vscale x 1 x i1> %mask, <vscale x 1 x i32> %op1, <vscale x 1 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -999,8 +999,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssra.mask.nxv1i32.nxv1i32.i64(<vscale x
 define <vscale x 1 x i32> @test_vssra_vx_i32mf2_m(<vscale x 1 x i1> %mask, <vscale x 1 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1013,8 +1013,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssra.mask.nxv1i32.i64.i64(<vscale x 1 x
 define <vscale x 2 x i32> @test_vssra_vv_i32m1_m(<vscale x 2 x i1> %mask, <vscale x 2 x i32> %op1, <vscale x 2 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1027,8 +1027,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssra.mask.nxv2i32.nxv2i32.i64(<vscale x
 define <vscale x 2 x i32> @test_vssra_vx_i32m1_m(<vscale x 2 x i1> %mask, <vscale x 2 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1041,8 +1041,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssra.mask.nxv2i32.i64.i64(<vscale x 2 x
 define <vscale x 4 x i32> @test_vssra_vv_i32m2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %op1, <vscale x 4 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1055,8 +1055,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssra.mask.nxv4i32.nxv4i32.i64(<vscale x
 define <vscale x 4 x i32> @test_vssra_vx_i32m2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1069,8 +1069,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssra.mask.nxv4i32.i64.i64(<vscale x 4 x
 define <vscale x 8 x i32> @test_vssra_vv_i32m4_m(<vscale x 8 x i1> %mask, <vscale x 8 x i32> %op1, <vscale x 8 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1083,8 +1083,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssra.mask.nxv8i32.nxv8i32.i64(<vscale x
 define <vscale x 8 x i32> @test_vssra_vx_i32m4_m(<vscale x 8 x i1> %mask, <vscale x 8 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1097,8 +1097,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssra.mask.nxv8i32.i64.i64(<vscale x 8 x
 define <vscale x 16 x i32> @test_vssra_vv_i32m8_m(<vscale x 16 x i1> %mask, <vscale x 16 x i32> %op1, <vscale x 16 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i32m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1111,8 +1111,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssra.mask.nxv16i32.nxv16i32.i64(<vscale
 define <vscale x 16 x i32> @test_vssra_vx_i32m8_m(<vscale x 16 x i1> %mask, <vscale x 16 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i32m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1125,8 +1125,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssra.mask.nxv16i32.i64.i64(<vscale x 16
 define <vscale x 1 x i64> @test_vssra_vv_i64m1_m(<vscale x 1 x i1> %mask, <vscale x 1 x i64> %op1, <vscale x 1 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1139,8 +1139,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssra.mask.nxv1i64.nxv1i64.i64(<vscale x
 define <vscale x 1 x i64> @test_vssra_vx_i64m1_m(<vscale x 1 x i1> %mask, <vscale x 1 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1153,8 +1153,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssra.mask.nxv1i64.i64.i64(<vscale x 1 x
 define <vscale x 2 x i64> @test_vssra_vv_i64m2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %op1, <vscale x 2 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1167,8 +1167,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssra.mask.nxv2i64.nxv2i64.i64(<vscale x
 define <vscale x 2 x i64> @test_vssra_vx_i64m2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1181,8 +1181,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssra.mask.nxv2i64.i64.i64(<vscale x 2 x
 define <vscale x 4 x i64> @test_vssra_vv_i64m4_m(<vscale x 4 x i1> %mask, <vscale x 4 x i64> %op1, <vscale x 4 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1195,8 +1195,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssra.mask.nxv4i64.nxv4i64.i64(<vscale x
 define <vscale x 4 x i64> @test_vssra_vx_i64m4_m(<vscale x 4 x i1> %mask, <vscale x 4 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1209,8 +1209,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssra.mask.nxv4i64.i64.i64(<vscale x 4 x
 define <vscale x 8 x i64> @test_vssra_vv_i64m8_m(<vscale x 8 x i1> %mask, <vscale x 8 x i64> %op1, <vscale x 8 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vv_i64m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssra.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1223,8 +1223,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vssra.mask.nxv8i64.nxv8i64.i64(<vscale x
 define <vscale x 8 x i64> @test_vssra_vx_i64m8_m(<vscale x 8 x i1> %mask, <vscale x 8 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssra_vx_i64m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssra.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssrl-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vssrl-rv32.ll
index c1a064984dcc..0c2cdff65776 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssrl-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssrl-rv32.ll
@@ -5,8 +5,8 @@
 define <vscale x 1 x i8> @test_vssrl_vv_u8mf8(<vscale x 1 x i8> %op1, <vscale x 1 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8mf8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -19,8 +19,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssrl.nxv1i8.nxv1i8.i32(<vscale x 1 x i8>,
 define <vscale x 1 x i8> @test_vssrl_vx_u8mf8(<vscale x 1 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8mf8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -33,8 +33,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssrl.nxv1i8.i32.i32(<vscale x 1 x i8>, <v
 define <vscale x 2 x i8> @test_vssrl_vv_u8mf4(<vscale x 2 x i8> %op1, <vscale x 2 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -47,8 +47,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssrl.nxv2i8.nxv2i8.i32(<vscale x 2 x i8>,
 define <vscale x 2 x i8> @test_vssrl_vx_u8mf4(<vscale x 2 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -61,8 +61,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssrl.nxv2i8.i32.i32(<vscale x 2 x i8>, <v
 define <vscale x 4 x i8> @test_vssrl_vv_u8mf2(<vscale x 4 x i8> %op1, <vscale x 4 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -75,8 +75,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssrl.nxv4i8.nxv4i8.i32(<vscale x 4 x i8>,
 define <vscale x 4 x i8> @test_vssrl_vx_u8mf2(<vscale x 4 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -89,8 +89,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssrl.nxv4i8.i32.i32(<vscale x 4 x i8>, <v
 define <vscale x 8 x i8> @test_vssrl_vv_u8m1(<vscale x 8 x i8> %op1, <vscale x 8 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -103,8 +103,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssrl.nxv8i8.nxv8i8.i32(<vscale x 8 x i8>,
 define <vscale x 8 x i8> @test_vssrl_vx_u8m1(<vscale x 8 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -117,8 +117,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssrl.nxv8i8.i32.i32(<vscale x 8 x i8>, <v
 define <vscale x 16 x i8> @test_vssrl_vv_u8m2(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -131,8 +131,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssrl.nxv16i8.nxv16i8.i32(<vscale x 16 x
 define <vscale x 16 x i8> @test_vssrl_vx_u8m2(<vscale x 16 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -145,8 +145,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssrl.nxv16i8.i32.i32(<vscale x 16 x i8>,
 define <vscale x 32 x i8> @test_vssrl_vv_u8m4(<vscale x 32 x i8> %op1, <vscale x 32 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -159,8 +159,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssrl.nxv32i8.nxv32i8.i32(<vscale x 32 x
 define <vscale x 32 x i8> @test_vssrl_vx_u8m4(<vscale x 32 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -173,8 +173,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssrl.nxv32i8.i32.i32(<vscale x 32 x i8>,
 define <vscale x 64 x i8> @test_vssrl_vv_u8m8(<vscale x 64 x i8> %op1, <vscale x 64 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -187,8 +187,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssrl.nxv64i8.nxv64i8.i32(<vscale x 64 x
 define <vscale x 64 x i8> @test_vssrl_vx_u8m8(<vscale x 64 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -201,8 +201,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssrl.nxv64i8.i32.i32(<vscale x 64 x i8>,
 define <vscale x 1 x i16> @test_vssrl_vv_u16mf4(<vscale x 1 x i16> %op1, <vscale x 1 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -215,8 +215,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssrl.nxv1i16.nxv1i16.i32(<vscale x 1 x i
 define <vscale x 1 x i16> @test_vssrl_vx_u16mf4(<vscale x 1 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -229,8 +229,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssrl.nxv1i16.i32.i32(<vscale x 1 x i16>,
 define <vscale x 2 x i16> @test_vssrl_vv_u16mf2(<vscale x 2 x i16> %op1, <vscale x 2 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -243,8 +243,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssrl.nxv2i16.nxv2i16.i32(<vscale x 2 x i
 define <vscale x 2 x i16> @test_vssrl_vx_u16mf2(<vscale x 2 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -257,8 +257,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssrl.nxv2i16.i32.i32(<vscale x 2 x i16>,
 define <vscale x 4 x i16> @test_vssrl_vv_u16m1(<vscale x 4 x i16> %op1, <vscale x 4 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -271,8 +271,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssrl.nxv4i16.nxv4i16.i32(<vscale x 4 x i
 define <vscale x 4 x i16> @test_vssrl_vx_u16m1(<vscale x 4 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -285,8 +285,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssrl.nxv4i16.i32.i32(<vscale x 4 x i16>,
 define <vscale x 8 x i16> @test_vssrl_vv_u16m2(<vscale x 8 x i16> %op1, <vscale x 8 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -299,8 +299,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssrl.nxv8i16.nxv8i16.i32(<vscale x 8 x i
 define <vscale x 8 x i16> @test_vssrl_vx_u16m2(<vscale x 8 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -313,8 +313,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssrl.nxv8i16.i32.i32(<vscale x 8 x i16>,
 define <vscale x 16 x i16> @test_vssrl_vv_u16m4(<vscale x 16 x i16> %op1, <vscale x 16 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -327,8 +327,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssrl.nxv16i16.nxv16i16.i32(<vscale x 16
 define <vscale x 16 x i16> @test_vssrl_vx_u16m4(<vscale x 16 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -341,8 +341,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssrl.nxv16i16.i32.i32(<vscale x 16 x i1
 define <vscale x 32 x i16> @test_vssrl_vv_u16m8(<vscale x 32 x i16> %op1, <vscale x 32 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -355,8 +355,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssrl.nxv32i16.nxv32i16.i32(<vscale x 32
 define <vscale x 32 x i16> @test_vssrl_vx_u16m8(<vscale x 32 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -369,8 +369,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssrl.nxv32i16.i32.i32(<vscale x 32 x i1
 define <vscale x 1 x i32> @test_vssrl_vv_u32mf2(<vscale x 1 x i32> %op1, <vscale x 1 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -383,8 +383,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssrl.nxv1i32.nxv1i32.i32(<vscale x 1 x i
 define <vscale x 1 x i32> @test_vssrl_vx_u32mf2(<vscale x 1 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -397,8 +397,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssrl.nxv1i32.i32.i32(<vscale x 1 x i32>,
 define <vscale x 2 x i32> @test_vssrl_vv_u32m1(<vscale x 2 x i32> %op1, <vscale x 2 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -411,8 +411,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssrl.nxv2i32.nxv2i32.i32(<vscale x 2 x i
 define <vscale x 2 x i32> @test_vssrl_vx_u32m1(<vscale x 2 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -425,8 +425,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssrl.nxv2i32.i32.i32(<vscale x 2 x i32>,
 define <vscale x 4 x i32> @test_vssrl_vv_u32m2(<vscale x 4 x i32> %op1, <vscale x 4 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -439,8 +439,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssrl.nxv4i32.nxv4i32.i32(<vscale x 4 x i
 define <vscale x 4 x i32> @test_vssrl_vx_u32m2(<vscale x 4 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -453,8 +453,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssrl.nxv4i32.i32.i32(<vscale x 4 x i32>,
 define <vscale x 8 x i32> @test_vssrl_vv_u32m4(<vscale x 8 x i32> %op1, <vscale x 8 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -467,8 +467,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssrl.nxv8i32.nxv8i32.i32(<vscale x 8 x i
 define <vscale x 8 x i32> @test_vssrl_vx_u32m4(<vscale x 8 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -481,8 +481,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssrl.nxv8i32.i32.i32(<vscale x 8 x i32>,
 define <vscale x 16 x i32> @test_vssrl_vv_u32m8(<vscale x 16 x i32> %op1, <vscale x 16 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -495,8 +495,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssrl.nxv16i32.nxv16i32.i32(<vscale x 16
 define <vscale x 16 x i32> @test_vssrl_vx_u32m8(<vscale x 16 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -509,8 +509,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssrl.nxv16i32.i32.i32(<vscale x 16 x i3
 define <vscale x 1 x i64> @test_vssrl_vv_u64m1(<vscale x 1 x i64> %op1, <vscale x 1 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -523,8 +523,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssrl.nxv1i64.nxv1i64.i32(<vscale x 1 x i
 define <vscale x 1 x i64> @test_vssrl_vx_u64m1(<vscale x 1 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -537,8 +537,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssrl.nxv1i64.i32.i32(<vscale x 1 x i64>,
 define <vscale x 2 x i64> @test_vssrl_vv_u64m2(<vscale x 2 x i64> %op1, <vscale x 2 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -551,8 +551,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssrl.nxv2i64.nxv2i64.i32(<vscale x 2 x i
 define <vscale x 2 x i64> @test_vssrl_vx_u64m2(<vscale x 2 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -565,8 +565,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssrl.nxv2i64.i32.i32(<vscale x 2 x i64>,
 define <vscale x 4 x i64> @test_vssrl_vv_u64m4(<vscale x 4 x i64> %op1, <vscale x 4 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -579,8 +579,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssrl.nxv4i64.nxv4i64.i32(<vscale x 4 x i
 define <vscale x 4 x i64> @test_vssrl_vx_u64m4(<vscale x 4 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -593,8 +593,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssrl.nxv4i64.i32.i32(<vscale x 4 x i64>,
 define <vscale x 8 x i64> @test_vssrl_vv_u64m8(<vscale x 8 x i64> %op1, <vscale x 8 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -607,8 +607,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vssrl.nxv8i64.nxv8i64.i32(<vscale x 8 x i
 define <vscale x 8 x i64> @test_vssrl_vx_u64m8(<vscale x 8 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -621,8 +621,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vssrl.nxv8i64.i32.i32(<vscale x 8 x i64>,
 define <vscale x 1 x i8> @test_vssrl_vv_u8mf8_m(<vscale x 1 x i1> %mask, <vscale x 1 x i8> %op1, <vscale x 1 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8mf8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -635,8 +635,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssrl.mask.nxv1i8.nxv1i8.i32(<vscale x 1 x
 define <vscale x 1 x i8> @test_vssrl_vx_u8mf8_m(<vscale x 1 x i1> %mask, <vscale x 1 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8mf8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -649,8 +649,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssrl.mask.nxv1i8.i32.i32(<vscale x 1 x i8
 define <vscale x 2 x i8> @test_vssrl_vv_u8mf4_m(<vscale x 2 x i1> %mask, <vscale x 2 x i8> %op1, <vscale x 2 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -663,8 +663,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssrl.mask.nxv2i8.nxv2i8.i32(<vscale x 2 x
 define <vscale x 2 x i8> @test_vssrl_vx_u8mf4_m(<vscale x 2 x i1> %mask, <vscale x 2 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -677,8 +677,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssrl.mask.nxv2i8.i32.i32(<vscale x 2 x i8
 define <vscale x 4 x i8> @test_vssrl_vv_u8mf2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i8> %op1, <vscale x 4 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -691,8 +691,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssrl.mask.nxv4i8.nxv4i8.i32(<vscale x 4 x
 define <vscale x 4 x i8> @test_vssrl_vx_u8mf2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -705,8 +705,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssrl.mask.nxv4i8.i32.i32(<vscale x 4 x i8
 define <vscale x 8 x i8> @test_vssrl_vv_u8m1_m(<vscale x 8 x i1> %mask, <vscale x 8 x i8> %op1, <vscale x 8 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -719,8 +719,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssrl.mask.nxv8i8.nxv8i8.i32(<vscale x 8 x
 define <vscale x 8 x i8> @test_vssrl_vx_u8m1_m(<vscale x 8 x i1> %mask, <vscale x 8 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -733,8 +733,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssrl.mask.nxv8i8.i32.i32(<vscale x 8 x i8
 define <vscale x 16 x i8> @test_vssrl_vv_u8m2_m(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %op1, <vscale x 16 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -747,8 +747,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssrl.mask.nxv16i8.nxv16i8.i32(<vscale x
 define <vscale x 16 x i8> @test_vssrl_vx_u8m2_m(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -761,8 +761,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssrl.mask.nxv16i8.i32.i32(<vscale x 16 x
 define <vscale x 32 x i8> @test_vssrl_vv_u8m4_m(<vscale x 32 x i1> %mask, <vscale x 32 x i8> %op1, <vscale x 32 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -775,8 +775,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssrl.mask.nxv32i8.nxv32i8.i32(<vscale x
 define <vscale x 32 x i8> @test_vssrl_vx_u8m4_m(<vscale x 32 x i1> %mask, <vscale x 32 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -789,8 +789,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssrl.mask.nxv32i8.i32.i32(<vscale x 32 x
 define <vscale x 64 x i8> @test_vssrl_vv_u8m8_m(<vscale x 64 x i1> %mask, <vscale x 64 x i8> %op1, <vscale x 64 x i8> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -803,8 +803,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssrl.mask.nxv64i8.nxv64i8.i32(<vscale x
 define <vscale x 64 x i8> @test_vssrl_vx_u8m8_m(<vscale x 64 x i1> %mask, <vscale x 64 x i8> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -817,8 +817,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssrl.mask.nxv64i8.i32.i32(<vscale x 64 x
 define <vscale x 1 x i16> @test_vssrl_vv_u16mf4_m(<vscale x 1 x i1> %mask, <vscale x 1 x i16> %op1, <vscale x 1 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -831,8 +831,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssrl.mask.nxv1i16.nxv1i16.i32(<vscale x
 define <vscale x 1 x i16> @test_vssrl_vx_u16mf4_m(<vscale x 1 x i1> %mask, <vscale x 1 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -845,8 +845,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssrl.mask.nxv1i16.i32.i32(<vscale x 1 x
 define <vscale x 2 x i16> @test_vssrl_vv_u16mf2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i16> %op1, <vscale x 2 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -859,8 +859,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssrl.mask.nxv2i16.nxv2i16.i32(<vscale x
 define <vscale x 2 x i16> @test_vssrl_vx_u16mf2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -873,8 +873,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssrl.mask.nxv2i16.i32.i32(<vscale x 2 x
 define <vscale x 4 x i16> @test_vssrl_vv_u16m1_m(<vscale x 4 x i1> %mask, <vscale x 4 x i16> %op1, <vscale x 4 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -887,8 +887,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssrl.mask.nxv4i16.nxv4i16.i32(<vscale x
 define <vscale x 4 x i16> @test_vssrl_vx_u16m1_m(<vscale x 4 x i1> %mask, <vscale x 4 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -901,8 +901,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssrl.mask.nxv4i16.i32.i32(<vscale x 4 x
 define <vscale x 8 x i16> @test_vssrl_vv_u16m2_m(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %op1, <vscale x 8 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -915,8 +915,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssrl.mask.nxv8i16.nxv8i16.i32(<vscale x
 define <vscale x 8 x i16> @test_vssrl_vx_u16m2_m(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -929,8 +929,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssrl.mask.nxv8i16.i32.i32(<vscale x 8 x
 define <vscale x 16 x i16> @test_vssrl_vv_u16m4_m(<vscale x 16 x i1> %mask, <vscale x 16 x i16> %op1, <vscale x 16 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -943,8 +943,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssrl.mask.nxv16i16.nxv16i16.i32(<vscale
 define <vscale x 16 x i16> @test_vssrl_vx_u16m4_m(<vscale x 16 x i1> %mask, <vscale x 16 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -957,8 +957,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssrl.mask.nxv16i16.i32.i32(<vscale x 16
 define <vscale x 32 x i16> @test_vssrl_vv_u16m8_m(<vscale x 32 x i1> %mask, <vscale x 32 x i16> %op1, <vscale x 32 x i16> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -971,8 +971,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssrl.mask.nxv32i16.nxv32i16.i32(<vscale
 define <vscale x 32 x i16> @test_vssrl_vx_u16m8_m(<vscale x 32 x i1> %mask, <vscale x 32 x i16> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -985,8 +985,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssrl.mask.nxv32i16.i32.i32(<vscale x 32
 define <vscale x 1 x i32> @test_vssrl_vv_u32mf2_m(<vscale x 1 x i1> %mask, <vscale x 1 x i32> %op1, <vscale x 1 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -999,8 +999,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssrl.mask.nxv1i32.nxv1i32.i32(<vscale x
 define <vscale x 1 x i32> @test_vssrl_vx_u32mf2_m(<vscale x 1 x i1> %mask, <vscale x 1 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1013,8 +1013,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssrl.mask.nxv1i32.i32.i32(<vscale x 1 x
 define <vscale x 2 x i32> @test_vssrl_vv_u32m1_m(<vscale x 2 x i1> %mask, <vscale x 2 x i32> %op1, <vscale x 2 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1027,8 +1027,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssrl.mask.nxv2i32.nxv2i32.i32(<vscale x
 define <vscale x 2 x i32> @test_vssrl_vx_u32m1_m(<vscale x 2 x i1> %mask, <vscale x 2 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1041,8 +1041,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssrl.mask.nxv2i32.i32.i32(<vscale x 2 x
 define <vscale x 4 x i32> @test_vssrl_vv_u32m2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %op1, <vscale x 4 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1055,8 +1055,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssrl.mask.nxv4i32.nxv4i32.i32(<vscale x
 define <vscale x 4 x i32> @test_vssrl_vx_u32m2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1069,8 +1069,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssrl.mask.nxv4i32.i32.i32(<vscale x 4 x
 define <vscale x 8 x i32> @test_vssrl_vv_u32m4_m(<vscale x 8 x i1> %mask, <vscale x 8 x i32> %op1, <vscale x 8 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1083,8 +1083,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssrl.mask.nxv8i32.nxv8i32.i32(<vscale x
 define <vscale x 8 x i32> @test_vssrl_vx_u32m4_m(<vscale x 8 x i1> %mask, <vscale x 8 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1097,8 +1097,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssrl.mask.nxv8i32.i32.i32(<vscale x 8 x
 define <vscale x 16 x i32> @test_vssrl_vv_u32m8_m(<vscale x 16 x i1> %mask, <vscale x 16 x i32> %op1, <vscale x 16 x i32> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1111,8 +1111,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssrl.mask.nxv16i32.nxv16i32.i32(<vscale
 define <vscale x 16 x i32> @test_vssrl_vx_u32m8_m(<vscale x 16 x i1> %mask, <vscale x 16 x i32> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1125,8 +1125,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssrl.mask.nxv16i32.i32.i32(<vscale x 16
 define <vscale x 1 x i64> @test_vssrl_vv_u64m1_m(<vscale x 1 x i1> %mask, <vscale x 1 x i64> %op1, <vscale x 1 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1139,8 +1139,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssrl.mask.nxv1i64.nxv1i64.i32(<vscale x
 define <vscale x 1 x i64> @test_vssrl_vx_u64m1_m(<vscale x 1 x i1> %mask, <vscale x 1 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1153,8 +1153,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssrl.mask.nxv1i64.i32.i32(<vscale x 1 x
 define <vscale x 2 x i64> @test_vssrl_vv_u64m2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %op1, <vscale x 2 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1167,8 +1167,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssrl.mask.nxv2i64.nxv2i64.i32(<vscale x
 define <vscale x 2 x i64> @test_vssrl_vx_u64m2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1181,8 +1181,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssrl.mask.nxv2i64.i32.i32(<vscale x 2 x
 define <vscale x 4 x i64> @test_vssrl_vv_u64m4_m(<vscale x 4 x i1> %mask, <vscale x 4 x i64> %op1, <vscale x 4 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1195,8 +1195,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssrl.mask.nxv4i64.nxv4i64.i32(<vscale x
 define <vscale x 4 x i64> @test_vssrl_vx_u64m4_m(<vscale x 4 x i1> %mask, <vscale x 4 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1209,8 +1209,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssrl.mask.nxv4i64.i32.i32(<vscale x 4 x
 define <vscale x 8 x i64> @test_vssrl_vv_u64m8_m(<vscale x 8 x i1> %mask, <vscale x 8 x i64> %op1, <vscale x 8 x i64> %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1223,8 +1223,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vssrl.mask.nxv8i64.nxv8i64.i32(<vscale x
 define <vscale x 8 x i64> @test_vssrl_vx_u64m8_m(<vscale x 8 x i1> %mask, <vscale x 8 x i64> %op1, i32 %shift, i32 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssrl-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vssrl-rv64.ll
index 0a465db64b7a..fe80854bb264 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssrl-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssrl-rv64.ll
@@ -5,8 +5,8 @@
 define <vscale x 1 x i8> @test_vssrl_vv_u8mf8(<vscale x 1 x i8> %op1, <vscale x 1 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8mf8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -19,8 +19,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssrl.nxv1i8.nxv1i8.i64(<vscale x 1 x i8>,
 define <vscale x 1 x i8> @test_vssrl_vx_u8mf8(<vscale x 1 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8mf8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -33,8 +33,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssrl.nxv1i8.i64.i64(<vscale x 1 x i8>, <v
 define <vscale x 2 x i8> @test_vssrl_vv_u8mf4(<vscale x 2 x i8> %op1, <vscale x 2 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -47,8 +47,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssrl.nxv2i8.nxv2i8.i64(<vscale x 2 x i8>,
 define <vscale x 2 x i8> @test_vssrl_vx_u8mf4(<vscale x 2 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -61,8 +61,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssrl.nxv2i8.i64.i64(<vscale x 2 x i8>, <v
 define <vscale x 4 x i8> @test_vssrl_vv_u8mf2(<vscale x 4 x i8> %op1, <vscale x 4 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -75,8 +75,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssrl.nxv4i8.nxv4i8.i64(<vscale x 4 x i8>,
 define <vscale x 4 x i8> @test_vssrl_vx_u8mf2(<vscale x 4 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -89,8 +89,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssrl.nxv4i8.i64.i64(<vscale x 4 x i8>, <v
 define <vscale x 8 x i8> @test_vssrl_vv_u8m1(<vscale x 8 x i8> %op1, <vscale x 8 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -103,8 +103,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssrl.nxv8i8.nxv8i8.i64(<vscale x 8 x i8>,
 define <vscale x 8 x i8> @test_vssrl_vx_u8m1(<vscale x 8 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -117,8 +117,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssrl.nxv8i8.i64.i64(<vscale x 8 x i8>, <v
 define <vscale x 16 x i8> @test_vssrl_vv_u8m2(<vscale x 16 x i8> %op1, <vscale x 16 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -131,8 +131,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssrl.nxv16i8.nxv16i8.i64(<vscale x 16 x
 define <vscale x 16 x i8> @test_vssrl_vx_u8m2(<vscale x 16 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -145,8 +145,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssrl.nxv16i8.i64.i64(<vscale x 16 x i8>,
 define <vscale x 32 x i8> @test_vssrl_vv_u8m4(<vscale x 32 x i8> %op1, <vscale x 32 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -159,8 +159,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssrl.nxv32i8.nxv32i8.i64(<vscale x 32 x
 define <vscale x 32 x i8> @test_vssrl_vx_u8m4(<vscale x 32 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -173,8 +173,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssrl.nxv32i8.i64.i64(<vscale x 32 x i8>,
 define <vscale x 64 x i8> @test_vssrl_vv_u8m8(<vscale x 64 x i8> %op1, <vscale x 64 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -187,8 +187,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssrl.nxv64i8.nxv64i8.i64(<vscale x 64 x
 define <vscale x 64 x i8> @test_vssrl_vx_u8m8(<vscale x 64 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -201,8 +201,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssrl.nxv64i8.i64.i64(<vscale x 64 x i8>,
 define <vscale x 1 x i16> @test_vssrl_vv_u16mf4(<vscale x 1 x i16> %op1, <vscale x 1 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -215,8 +215,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssrl.nxv1i16.nxv1i16.i64(<vscale x 1 x i
 define <vscale x 1 x i16> @test_vssrl_vx_u16mf4(<vscale x 1 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16mf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -229,8 +229,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssrl.nxv1i16.i64.i64(<vscale x 1 x i16>,
 define <vscale x 2 x i16> @test_vssrl_vv_u16mf2(<vscale x 2 x i16> %op1, <vscale x 2 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -243,8 +243,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssrl.nxv2i16.nxv2i16.i64(<vscale x 2 x i
 define <vscale x 2 x i16> @test_vssrl_vx_u16mf2(<vscale x 2 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -257,8 +257,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssrl.nxv2i16.i64.i64(<vscale x 2 x i16>,
 define <vscale x 4 x i16> @test_vssrl_vv_u16m1(<vscale x 4 x i16> %op1, <vscale x 4 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -271,8 +271,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssrl.nxv4i16.nxv4i16.i64(<vscale x 4 x i
 define <vscale x 4 x i16> @test_vssrl_vx_u16m1(<vscale x 4 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -285,8 +285,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssrl.nxv4i16.i64.i64(<vscale x 4 x i16>,
 define <vscale x 8 x i16> @test_vssrl_vv_u16m2(<vscale x 8 x i16> %op1, <vscale x 8 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -299,8 +299,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssrl.nxv8i16.nxv8i16.i64(<vscale x 8 x i
 define <vscale x 8 x i16> @test_vssrl_vx_u16m2(<vscale x 8 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -313,8 +313,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssrl.nxv8i16.i64.i64(<vscale x 8 x i16>,
 define <vscale x 16 x i16> @test_vssrl_vv_u16m4(<vscale x 16 x i16> %op1, <vscale x 16 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -327,8 +327,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssrl.nxv16i16.nxv16i16.i64(<vscale x 16
 define <vscale x 16 x i16> @test_vssrl_vx_u16m4(<vscale x 16 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -341,8 +341,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssrl.nxv16i16.i64.i64(<vscale x 16 x i1
 define <vscale x 32 x i16> @test_vssrl_vv_u16m8(<vscale x 32 x i16> %op1, <vscale x 32 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -355,8 +355,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssrl.nxv32i16.nxv32i16.i64(<vscale x 32
 define <vscale x 32 x i16> @test_vssrl_vx_u16m8(<vscale x 32 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -369,8 +369,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssrl.nxv32i16.i64.i64(<vscale x 32 x i1
 define <vscale x 1 x i32> @test_vssrl_vv_u32mf2(<vscale x 1 x i32> %op1, <vscale x 1 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -383,8 +383,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssrl.nxv1i32.nxv1i32.i64(<vscale x 1 x i
 define <vscale x 1 x i32> @test_vssrl_vx_u32mf2(<vscale x 1 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32mf2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -397,8 +397,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssrl.nxv1i32.i64.i64(<vscale x 1 x i32>,
 define <vscale x 2 x i32> @test_vssrl_vv_u32m1(<vscale x 2 x i32> %op1, <vscale x 2 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -411,8 +411,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssrl.nxv2i32.nxv2i32.i64(<vscale x 2 x i
 define <vscale x 2 x i32> @test_vssrl_vx_u32m1(<vscale x 2 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -425,8 +425,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssrl.nxv2i32.i64.i64(<vscale x 2 x i32>,
 define <vscale x 4 x i32> @test_vssrl_vv_u32m2(<vscale x 4 x i32> %op1, <vscale x 4 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -439,8 +439,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssrl.nxv4i32.nxv4i32.i64(<vscale x 4 x i
 define <vscale x 4 x i32> @test_vssrl_vx_u32m2(<vscale x 4 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -453,8 +453,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssrl.nxv4i32.i64.i64(<vscale x 4 x i32>,
 define <vscale x 8 x i32> @test_vssrl_vv_u32m4(<vscale x 8 x i32> %op1, <vscale x 8 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -467,8 +467,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssrl.nxv8i32.nxv8i32.i64(<vscale x 8 x i
 define <vscale x 8 x i32> @test_vssrl_vx_u32m4(<vscale x 8 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -481,8 +481,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssrl.nxv8i32.i64.i64(<vscale x 8 x i32>,
 define <vscale x 16 x i32> @test_vssrl_vv_u32m8(<vscale x 16 x i32> %op1, <vscale x 16 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -495,8 +495,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssrl.nxv16i32.nxv16i32.i64(<vscale x 16
 define <vscale x 16 x i32> @test_vssrl_vx_u32m8(<vscale x 16 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -509,8 +509,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssrl.nxv16i32.i64.i64(<vscale x 16 x i3
 define <vscale x 1 x i64> @test_vssrl_vv_u64m1(<vscale x 1 x i64> %op1, <vscale x 1 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9
 ; CHECK-NEXT:    ret
 entry:
@@ -523,8 +523,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssrl.nxv1i64.nxv1i64.i64(<vscale x 1 x i
 define <vscale x 1 x i64> @test_vssrl_vx_u64m1(<vscale x 1 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -537,8 +537,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssrl.nxv1i64.i64.i64(<vscale x 1 x i64>,
 define <vscale x 2 x i64> @test_vssrl_vv_u64m2(<vscale x 2 x i64> %op1, <vscale x 2 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -551,8 +551,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssrl.nxv2i64.nxv2i64.i64(<vscale x 2 x i
 define <vscale x 2 x i64> @test_vssrl_vx_u64m2(<vscale x 2 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -565,8 +565,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssrl.nxv2i64.i64.i64(<vscale x 2 x i64>,
 define <vscale x 4 x i64> @test_vssrl_vv_u64m4(<vscale x 4 x i64> %op1, <vscale x 4 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12
 ; CHECK-NEXT:    ret
 entry:
@@ -579,8 +579,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssrl.nxv4i64.nxv4i64.i64(<vscale x 4 x i
 define <vscale x 4 x i64> @test_vssrl_vx_u64m4(<vscale x 4 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -593,8 +593,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssrl.nxv4i64.i64.i64(<vscale x 4 x i64>,
 define <vscale x 8 x i64> @test_vssrl_vv_u64m8(<vscale x 8 x i64> %op1, <vscale x 8 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16
 ; CHECK-NEXT:    ret
 entry:
@@ -607,8 +607,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vssrl.nxv8i64.nxv8i64.i64(<vscale x 8 x i
 define <vscale x 8 x i64> @test_vssrl_vx_u64m8(<vscale x 8 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 entry:
@@ -621,8 +621,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vssrl.nxv8i64.i64.i64(<vscale x 8 x i64>,
 define <vscale x 1 x i8> @test_vssrl_vv_u8mf8_m(<vscale x 1 x i1> %mask, <vscale x 1 x i8> %op1, <vscale x 1 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8mf8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -635,8 +635,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssrl.mask.nxv1i8.nxv1i8.i64(<vscale x 1 x
 define <vscale x 1 x i8> @test_vssrl_vx_u8mf8_m(<vscale x 1 x i1> %mask, <vscale x 1 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8mf8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -649,8 +649,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vssrl.mask.nxv1i8.i64.i64(<vscale x 1 x i8
 define <vscale x 2 x i8> @test_vssrl_vv_u8mf4_m(<vscale x 2 x i1> %mask, <vscale x 2 x i8> %op1, <vscale x 2 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -663,8 +663,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssrl.mask.nxv2i8.nxv2i8.i64(<vscale x 2 x
 define <vscale x 2 x i8> @test_vssrl_vx_u8mf4_m(<vscale x 2 x i1> %mask, <vscale x 2 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -677,8 +677,8 @@ declare <vscale x 2 x i8> @llvm.riscv.vssrl.mask.nxv2i8.i64.i64(<vscale x 2 x i8
 define <vscale x 4 x i8> @test_vssrl_vv_u8mf2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i8> %op1, <vscale x 4 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -691,8 +691,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssrl.mask.nxv4i8.nxv4i8.i64(<vscale x 4 x
 define <vscale x 4 x i8> @test_vssrl_vx_u8mf2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -705,8 +705,8 @@ declare <vscale x 4 x i8> @llvm.riscv.vssrl.mask.nxv4i8.i64.i64(<vscale x 4 x i8
 define <vscale x 8 x i8> @test_vssrl_vv_u8m1_m(<vscale x 8 x i1> %mask, <vscale x 8 x i8> %op1, <vscale x 8 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -719,8 +719,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssrl.mask.nxv8i8.nxv8i8.i64(<vscale x 8 x
 define <vscale x 8 x i8> @test_vssrl_vx_u8m1_m(<vscale x 8 x i1> %mask, <vscale x 8 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -733,8 +733,8 @@ declare <vscale x 8 x i8> @llvm.riscv.vssrl.mask.nxv8i8.i64.i64(<vscale x 8 x i8
 define <vscale x 16 x i8> @test_vssrl_vv_u8m2_m(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %op1, <vscale x 16 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -747,8 +747,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssrl.mask.nxv16i8.nxv16i8.i64(<vscale x
 define <vscale x 16 x i8> @test_vssrl_vx_u8m2_m(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -761,8 +761,8 @@ declare <vscale x 16 x i8> @llvm.riscv.vssrl.mask.nxv16i8.i64.i64(<vscale x 16 x
 define <vscale x 32 x i8> @test_vssrl_vv_u8m4_m(<vscale x 32 x i1> %mask, <vscale x 32 x i8> %op1, <vscale x 32 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -775,8 +775,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssrl.mask.nxv32i8.nxv32i8.i64(<vscale x
 define <vscale x 32 x i8> @test_vssrl_vx_u8m4_m(<vscale x 32 x i1> %mask, <vscale x 32 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -789,8 +789,8 @@ declare <vscale x 32 x i8> @llvm.riscv.vssrl.mask.nxv32i8.i64.i64(<vscale x 32 x
 define <vscale x 64 x i8> @test_vssrl_vv_u8m8_m(<vscale x 64 x i1> %mask, <vscale x 64 x i8> %op1, <vscale x 64 x i8> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u8m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -803,8 +803,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssrl.mask.nxv64i8.nxv64i8.i64(<vscale x
 define <vscale x 64 x i8> @test_vssrl_vx_u8m8_m(<vscale x 64 x i1> %mask, <vscale x 64 x i8> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u8m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -817,8 +817,8 @@ declare <vscale x 64 x i8> @llvm.riscv.vssrl.mask.nxv64i8.i64.i64(<vscale x 64 x
 define <vscale x 1 x i16> @test_vssrl_vv_u16mf4_m(<vscale x 1 x i1> %mask, <vscale x 1 x i16> %op1, <vscale x 1 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -831,8 +831,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssrl.mask.nxv1i16.nxv1i16.i64(<vscale x
 define <vscale x 1 x i16> @test_vssrl_vx_u16mf4_m(<vscale x 1 x i1> %mask, <vscale x 1 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16mf4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -845,8 +845,8 @@ declare <vscale x 1 x i16> @llvm.riscv.vssrl.mask.nxv1i16.i64.i64(<vscale x 1 x
 define <vscale x 2 x i16> @test_vssrl_vv_u16mf2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i16> %op1, <vscale x 2 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -859,8 +859,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssrl.mask.nxv2i16.nxv2i16.i64(<vscale x
 define <vscale x 2 x i16> @test_vssrl_vx_u16mf2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -873,8 +873,8 @@ declare <vscale x 2 x i16> @llvm.riscv.vssrl.mask.nxv2i16.i64.i64(<vscale x 2 x
 define <vscale x 4 x i16> @test_vssrl_vv_u16m1_m(<vscale x 4 x i1> %mask, <vscale x 4 x i16> %op1, <vscale x 4 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -887,8 +887,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssrl.mask.nxv4i16.nxv4i16.i64(<vscale x
 define <vscale x 4 x i16> @test_vssrl_vx_u16m1_m(<vscale x 4 x i1> %mask, <vscale x 4 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -901,8 +901,8 @@ declare <vscale x 4 x i16> @llvm.riscv.vssrl.mask.nxv4i16.i64.i64(<vscale x 4 x
 define <vscale x 8 x i16> @test_vssrl_vv_u16m2_m(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %op1, <vscale x 8 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -915,8 +915,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssrl.mask.nxv8i16.nxv8i16.i64(<vscale x
 define <vscale x 8 x i16> @test_vssrl_vx_u16m2_m(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -929,8 +929,8 @@ declare <vscale x 8 x i16> @llvm.riscv.vssrl.mask.nxv8i16.i64.i64(<vscale x 8 x
 define <vscale x 16 x i16> @test_vssrl_vv_u16m4_m(<vscale x 16 x i1> %mask, <vscale x 16 x i16> %op1, <vscale x 16 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -943,8 +943,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssrl.mask.nxv16i16.nxv16i16.i64(<vscale
 define <vscale x 16 x i16> @test_vssrl_vx_u16m4_m(<vscale x 16 x i1> %mask, <vscale x 16 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -957,8 +957,8 @@ declare <vscale x 16 x i16> @llvm.riscv.vssrl.mask.nxv16i16.i64.i64(<vscale x 16
 define <vscale x 32 x i16> @test_vssrl_vv_u16m8_m(<vscale x 32 x i1> %mask, <vscale x 32 x i16> %op1, <vscale x 32 x i16> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u16m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -971,8 +971,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssrl.mask.nxv32i16.nxv32i16.i64(<vscale
 define <vscale x 32 x i16> @test_vssrl_vx_u16m8_m(<vscale x 32 x i1> %mask, <vscale x 32 x i16> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u16m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -985,8 +985,8 @@ declare <vscale x 32 x i16> @llvm.riscv.vssrl.mask.nxv32i16.i64.i64(<vscale x 32
 define <vscale x 1 x i32> @test_vssrl_vv_u32mf2_m(<vscale x 1 x i1> %mask, <vscale x 1 x i32> %op1, <vscale x 1 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -999,8 +999,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssrl.mask.nxv1i32.nxv1i32.i64(<vscale x
 define <vscale x 1 x i32> @test_vssrl_vx_u32mf2_m(<vscale x 1 x i1> %mask, <vscale x 1 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32mf2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1013,8 +1013,8 @@ declare <vscale x 1 x i32> @llvm.riscv.vssrl.mask.nxv1i32.i64.i64(<vscale x 1 x
 define <vscale x 2 x i32> @test_vssrl_vv_u32m1_m(<vscale x 2 x i1> %mask, <vscale x 2 x i32> %op1, <vscale x 2 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1027,8 +1027,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssrl.mask.nxv2i32.nxv2i32.i64(<vscale x
 define <vscale x 2 x i32> @test_vssrl_vx_u32m1_m(<vscale x 2 x i1> %mask, <vscale x 2 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1041,8 +1041,8 @@ declare <vscale x 2 x i32> @llvm.riscv.vssrl.mask.nxv2i32.i64.i64(<vscale x 2 x
 define <vscale x 4 x i32> @test_vssrl_vv_u32m2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %op1, <vscale x 4 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1055,8 +1055,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssrl.mask.nxv4i32.nxv4i32.i64(<vscale x
 define <vscale x 4 x i32> @test_vssrl_vx_u32m2_m(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1069,8 +1069,8 @@ declare <vscale x 4 x i32> @llvm.riscv.vssrl.mask.nxv4i32.i64.i64(<vscale x 4 x
 define <vscale x 8 x i32> @test_vssrl_vv_u32m4_m(<vscale x 8 x i1> %mask, <vscale x 8 x i32> %op1, <vscale x 8 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1083,8 +1083,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssrl.mask.nxv8i32.nxv8i32.i64(<vscale x
 define <vscale x 8 x i32> @test_vssrl_vx_u32m4_m(<vscale x 8 x i1> %mask, <vscale x 8 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1097,8 +1097,8 @@ declare <vscale x 8 x i32> @llvm.riscv.vssrl.mask.nxv8i32.i64.i64(<vscale x 8 x
 define <vscale x 16 x i32> @test_vssrl_vv_u32m8_m(<vscale x 16 x i1> %mask, <vscale x 16 x i32> %op1, <vscale x 16 x i32> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u32m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1111,8 +1111,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssrl.mask.nxv16i32.nxv16i32.i64(<vscale
 define <vscale x 16 x i32> @test_vssrl_vx_u32m8_m(<vscale x 16 x i1> %mask, <vscale x 16 x i32> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u32m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1125,8 +1125,8 @@ declare <vscale x 16 x i32> @llvm.riscv.vssrl.mask.nxv16i32.i64.i64(<vscale x 16
 define <vscale x 1 x i64> @test_vssrl_vv_u64m1_m(<vscale x 1 x i1> %mask, <vscale x 1 x i64> %op1, <vscale x 1 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1139,8 +1139,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssrl.mask.nxv1i64.nxv1i64.i64(<vscale x
 define <vscale x 1 x i64> @test_vssrl_vx_u64m1_m(<vscale x 1 x i1> %mask, <vscale x 1 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m1_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1153,8 +1153,8 @@ declare <vscale x 1 x i64> @llvm.riscv.vssrl.mask.nxv1i64.i64.i64(<vscale x 1 x
 define <vscale x 2 x i64> @test_vssrl_vv_u64m2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %op1, <vscale x 2 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1167,8 +1167,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssrl.mask.nxv2i64.nxv2i64.i64(<vscale x
 define <vscale x 2 x i64> @test_vssrl_vx_u64m2_m(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m2_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1181,8 +1181,8 @@ declare <vscale x 2 x i64> @llvm.riscv.vssrl.mask.nxv2i64.i64.i64(<vscale x 2 x
 define <vscale x 4 x i64> @test_vssrl_vv_u64m4_m(<vscale x 4 x i1> %mask, <vscale x 4 x i64> %op1, <vscale x 4 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1195,8 +1195,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssrl.mask.nxv4i64.nxv4i64.i64(<vscale x
 define <vscale x 4 x i64> @test_vssrl_vx_u64m4_m(<vscale x 4 x i1> %mask, <vscale x 4 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m4_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1209,8 +1209,8 @@ declare <vscale x 4 x i64> @llvm.riscv.vssrl.mask.nxv4i64.i64.i64(<vscale x 4 x
 define <vscale x 8 x i64> @test_vssrl_vv_u64m8_m(<vscale x 8 x i1> %mask, <vscale x 8 x i64> %op1, <vscale x 8 x i64> %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vv_u64m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    ret
 entry:
@@ -1223,8 +1223,8 @@ declare <vscale x 8 x i64> @llvm.riscv.vssrl.mask.nxv8i64.nxv8i64.i64(<vscale x
 define <vscale x 8 x i64> @test_vssrl_vx_u64m8_m(<vscale x 8 x i1> %mask, <vscale x 8 x i64> %op1, i64 %shift, i64 %vl) {
 ; CHECK-LABEL: test_vssrl_vx_u64m8_m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll
index 10175218a440..c5f34eee3118 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll
@@ -20,8 +20,8 @@ declare <vscale x 1 x i8> @llvm.riscv.vasub.nxv1i8.nxv1i8(
 define <vscale x 1 x i8> @test1(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    vaadd.vv v8, v8, v10
 ; CHECK-NEXT:    ret
@@ -44,8 +44,8 @@ entry:
 define <vscale x 1 x i8> @test2(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 2
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    csrwi vxrm, 0
 ; CHECK-NEXT:    vaadd.vv v8, v8, v10
@@ -80,12 +80,12 @@ define <vscale x 1 x i8> @test3(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vsc
 ; RV32-NEXT:    mv s0, a0
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs1r.v v10, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; RV32-NEXT:    csrwi vxrm, 0
+; RV32-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; RV32-NEXT:    vaadd.vv v8, v8, v9
 ; RV32-NEXT:    call foo
-; RV32-NEXT:    vsetvli zero, s0, e8, mf8, ta, ma
 ; RV32-NEXT:    csrwi vxrm, 0
+; RV32-NEXT:    vsetvli zero, s0, e8, mf8, ta, ma
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vaadd.vv v8, v8, v9
@@ -108,12 +108,12 @@ define <vscale x 1 x i8> @test3(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vsc
 ; RV64-NEXT:    mv s0, a0
 ; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vs1r.v v10, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; RV64-NEXT:    vaadd.vv v8, v8, v9
 ; RV64-NEXT:    call foo
-; RV64-NEXT:    vsetvli zero, s0, e8, mf8, ta, ma
 ; RV64-NEXT:    csrwi vxrm, 0
+; RV64-NEXT:    vsetvli zero, s0, e8, mf8, ta, ma
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vaadd.vv v8, v8, v9
@@ -144,13 +144,13 @@ entry:
 define <vscale x 1 x i8> @test4(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: test4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v10
 ; CHECK-NEXT:    ret
 entry:
@@ -174,8 +174,8 @@ define <vscale x 1 x i8> @test5(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vsc
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andi a1, a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    beqz a1, .LBB4_2
 ; CHECK-NEXT:  # %bb.1: # %condblock
@@ -249,8 +249,8 @@ define <vscale x 1 x i8> @test7(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vsc
 ; CHECK-LABEL: test7:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andi a1, a1, 1
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
 ; CHECK-NEXT:    beqz a1, .LBB6_2
 ; CHECK-NEXT:  # %bb.1: # %trueblock
@@ -480,8 +480,8 @@ define <vscale x 1 x i8> @test12(i1 %c1, <vscale x 1 x i8> %0, <vscale x 1 x i8>
 ; CHECK-LABEL: test12:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andi a0, a0, 1
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vaadd.vv v9, v8, v9
 ; CHECK-NEXT:    beqz a0, .LBB11_2
 ; CHECK-NEXT:  # %bb.1: # %block1
@@ -513,8 +513,8 @@ define <vscale x 1 x i8> @test13(i1 %c1, i1 %c2, i1 %c3, <vscale x 1 x i8> %0, <
 ; CHECK-LABEL: test13:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andi a0, a0, 1
-; CHECK-NEXT:    vsetvli zero, a3, e8, mf8, ta, ma
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a3, e8, mf8, ta, ma
 ; CHECK-NEXT:    vaadd.vv v10, v8, v9
 ; CHECK-NEXT:    beqz a0, .LBB12_2
 ; CHECK-NEXT:  # %bb.1: # %block1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm.mir b/llvm/test/CodeGen/RISCV/rvv/vxrm.mir
index a588677bec8e..2bac1eeb9060 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm.mir
@@ -11,20 +11,19 @@ body:     |
     ; MIR-LABEL: name: verify_vxrm
     ; MIR: liveins: $v8, $v9, $x10
     ; MIR-NEXT: {{  $}}
-    ; MIR-NEXT: dead $x0 = PseudoVSETVLI killed renamable $x10, 197 /* e8, mf8, ta, ma */, implicit-def $vl, implicit-def $vtype
     ; MIR-NEXT: WriteVXRMImm 0, implicit-def $vxrm
-    ; MIR-NEXT: renamable $v8 = PseudoVAADD_VV_MF8 undef $v8, killed renamable $v8, killed renamable $v9, 0, $noreg, 3 /* e8 */, 0  /* tu, mu */, implicit $vl, implicit $vtype, implicit $vxrm
+    ; MIR-NEXT: dead $x0 = PseudoVSETVLI killed renamable $x10, 197 /* e8, mf8, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; MIR-NEXT: renamable $v8 = PseudoVAADD_VV_MF8 undef $v8, killed renamable $v8, killed renamable $v9, 0, $noreg, 3 /* e8 */, 0  /* tu, mu */, implicit $vxrm, implicit $vl, implicit $vtype
     ; MIR-NEXT: PseudoRET implicit $v8
     ; ASM-LABEL: verify_vxrm:
     ; ASM:        # %bb.0:
-    ; ASM-NEXT:    vsetvli	zero, a0, e8, mf8, ta, ma
     ; ASM-NEXT:    csrwi	vxrm, 0
+    ; ASM-NEXT:    vsetvli	zero, a0, e8, mf8, ta, ma
     ; ASM-NEXT:    vaadd.vv	v8, v8, v9
     ; ASM-NEXT:    ret
     %0:vr = COPY $v8
     %1:vr = COPY $v9
     %2:gprnox0 = COPY $x10
-    %pt:vr = IMPLICIT_DEF
-    renamable $v8 = PseudoVAADD_VV_MF8 %pt, %0, %1, 0, %2, 3 /* e8 */, 0
+    renamable $v8 = PseudoVAADD_VV_MF8 undef $noreg, %0, %1, 0, %2, 3 /* e8 */, 0
     PseudoRET implicit $v8
 ...
diff --git a/llvm/test/CodeGen/RISCV/spill-fpr-scalar.ll b/llvm/test/CodeGen/RISCV/spill-fpr-scalar.ll
index 48fb21dc5a8a..6b9b88d90de6 100644
--- a/llvm/test/CodeGen/RISCV/spill-fpr-scalar.ll
+++ b/llvm/test/CodeGen/RISCV/spill-fpr-scalar.ll
@@ -1,75 +1,58 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh -target-abi=lp64 \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh -target-abi=lp64 \
 ; RUN:   -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s
 
-declare half @llvm.riscv.vfmv.f.s.nxv1f16(<vscale x 1 x half>)
-declare float @llvm.riscv.vfmv.f.s.nxv1f32(<vscale x 1 x float>)
-declare double @llvm.riscv.vfmv.f.s.nxv1f64(<vscale x 1 x double>)
-
-declare <vscale x 1 x half> @llvm.riscv.vfmv.v.f.nxv1f16(<vscale x 1 x half>, half, i64);
-declare <vscale x 1 x float> @llvm.riscv.vfmv.v.f.nxv1f32(<vscale x 1 x float>, float, i64);
-declare <vscale x 1 x double> @llvm.riscv.vfmv.v.f.nxv1f64(<vscale x 1 x double>, double, i64);
-
-define <vscale x 1 x half> @intrinsic_vfmv.f.s_s_nxv1f16(<vscale x 1 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv1f16:
-; CHECK:       # %bb.0: # %entry
+define void @spill_half(ptr) nounwind {
+; CHECK-LABEL: spill_half:
+; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vfmv.f.s fa5, v8
+; CHECK-NEXT:    flh fa5, 0(a0)
 ; CHECK-NEXT:    fsh fa5, 14(sp) # 2-byte Folded Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    flh fa5, 14(sp) # 2-byte Folded Reload
-; CHECK-NEXT:    vfmv.v.f v8, fa5
+; CHECK-NEXT:    fsh fa5, 0(a0)
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-entry:
-  %a = call half @llvm.riscv.vfmv.f.s.nxv1f16(<vscale x 1 x half> %0)
-  tail call void asm sideeffect "", "~{f0_d},~{f1_d},~{f2_d},~{f3_d},~{f4_d},~{f5_d},~{f6_d},~{f7_d},~{f8_d},~{f9_d},~{f10_d},~{f11_d},~{f12_d},~{f13_d},~{f14_d},~{f15_d},~{f16_d},~{f17_d},~{f18_d},~{f19_d},~{f20_d},~{f21_d},~{f22_d},~{f23_d},~{f24_d},~{f25_d},~{f26_d},~{f27_d},~{f28_d},~{f29_d},~{f30_d},~{f31_d}"()
-  %b = call <vscale x 1 x half> @llvm.riscv.vfmv.v.f.nxv1f16(<vscale x 1 x half> undef, half %a, i64 %1)
-  ret <vscale x 1 x half> %b
+  %2 = load volatile half, ptr %0
+  call void asm sideeffect "", "~{f0_d},~{f1_d},~{f2_d},~{f3_d},~{f4_d},~{f5_d},~{f6_d},~{f7_d},~{f8_d},~{f9_d},~{f10_d},~{f11_d},~{f12_d},~{f13_d},~{f14_d},~{f15_d},~{f16_d},~{f17_d},~{f18_d},~{f19_d},~{f20_d},~{f21_d},~{f22_d},~{f23_d},~{f24_d},~{f25_d},~{f26_d},~{f27_d},~{f28_d},~{f29_d},~{f30_d},~{f31_d}"()
+  store volatile half %2, ptr %0
+  ret void
 }
 
-define <vscale x 1 x float> @intrinsic_vfmv.f.s_s_nxv1f32(<vscale x 1 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv1f32:
-; CHECK:       # %bb.0: # %entry
+define void @spill_float(ptr) nounwind {
+; CHECK-LABEL: spill_float:
+; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vfmv.f.s fa5, v8
+; CHECK-NEXT:    flw fa5, 0(a0)
 ; CHECK-NEXT:    fsw fa5, 12(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    flw fa5, 12(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    vfmv.v.f v8, fa5
+; CHECK-NEXT:    fsw fa5, 0(a0)
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-entry:
-  %a = call float @llvm.riscv.vfmv.f.s.nxv1f32(<vscale x 1 x float> %0)
-  tail call void asm sideeffect "", "~{f0_d},~{f1_d},~{f2_d},~{f3_d},~{f4_d},~{f5_d},~{f6_d},~{f7_d},~{f8_d},~{f9_d},~{f10_d},~{f11_d},~{f12_d},~{f13_d},~{f14_d},~{f15_d},~{f16_d},~{f17_d},~{f18_d},~{f19_d},~{f20_d},~{f21_d},~{f22_d},~{f23_d},~{f24_d},~{f25_d},~{f26_d},~{f27_d},~{f28_d},~{f29_d},~{f30_d},~{f31_d}"()
-  %b = call <vscale x 1 x float> @llvm.riscv.vfmv.v.f.nxv1f32(<vscale x 1 x float> undef, float %a, i64 %1)
-  ret <vscale x 1 x float> %b
+  %2 = load volatile float, ptr %0
+  call void asm sideeffect "", "~{f0_d},~{f1_d},~{f2_d},~{f3_d},~{f4_d},~{f5_d},~{f6_d},~{f7_d},~{f8_d},~{f9_d},~{f10_d},~{f11_d},~{f12_d},~{f13_d},~{f14_d},~{f15_d},~{f16_d},~{f17_d},~{f18_d},~{f19_d},~{f20_d},~{f21_d},~{f22_d},~{f23_d},~{f24_d},~{f25_d},~{f26_d},~{f27_d},~{f28_d},~{f29_d},~{f30_d},~{f31_d}"()
+  store volatile float %2, ptr %0
+  ret void
 }
 
-define <vscale x 1 x double> @intrinsic_vfmv.f.s_s_nxv1f64(<vscale x 1 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv1f64:
-; CHECK:       # %bb.0: # %entry
+define void @spill_double(ptr) nounwind {
+; CHECK-LABEL: spill_double:
+; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vfmv.f.s fa5, v8
+; CHECK-NEXT:    fld fa5, 0(a0)
 ; CHECK-NEXT:    fsd fa5, 8(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    fld fa5, 8(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    vfmv.v.f v8, fa5
+; CHECK-NEXT:    fsd fa5, 0(a0)
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-entry:
-  %a = call double @llvm.riscv.vfmv.f.s.nxv1f64(<vscale x 1 x double> %0)
-  tail call void asm sideeffect "", "~{f0_d},~{f1_d},~{f2_d},~{f3_d},~{f4_d},~{f5_d},~{f6_d},~{f7_d},~{f8_d},~{f9_d},~{f10_d},~{f11_d},~{f12_d},~{f13_d},~{f14_d},~{f15_d},~{f16_d},~{f17_d},~{f18_d},~{f19_d},~{f20_d},~{f21_d},~{f22_d},~{f23_d},~{f24_d},~{f25_d},~{f26_d},~{f27_d},~{f28_d},~{f29_d},~{f30_d},~{f31_d}"()
-  %b = call <vscale x 1 x double> @llvm.riscv.vfmv.v.f.nxv1f64(<vscale x 1 x double> undef, double %a, i64 %1)
-  ret <vscale x 1 x double> %b
+  %2 = load volatile double, ptr %0
+  call void asm sideeffect "", "~{f0_d},~{f1_d},~{f2_d},~{f3_d},~{f4_d},~{f5_d},~{f6_d},~{f7_d},~{f8_d},~{f9_d},~{f10_d},~{f11_d},~{f12_d},~{f13_d},~{f14_d},~{f15_d},~{f16_d},~{f17_d},~{f18_d},~{f19_d},~{f20_d},~{f21_d},~{f22_d},~{f23_d},~{f24_d},~{f25_d},~{f26_d},~{f27_d},~{f28_d},~{f29_d},~{f30_d},~{f31_d}"()
+  store volatile double %2, ptr %0
+  ret void
 }
diff --git a/llvm/test/CodeGen/RISCV/stack-protector-target.ll b/llvm/test/CodeGen/RISCV/stack-protector-target.ll
index 50531d384982..a4bd0e9ceac9 100644
--- a/llvm/test/CodeGen/RISCV/stack-protector-target.ll
+++ b/llvm/test/CodeGen/RISCV/stack-protector-target.ll
@@ -50,21 +50,18 @@ define void @func() sspreq nounwind {
 ; ANDROID-RISCV64:       # %bb.0:
 ; ANDROID-RISCV64-NEXT:    addi sp, sp, -32
 ; ANDROID-RISCV64-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; ANDROID-RISCV64-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; ANDROID-RISCV64-NEXT:    lui s0, %hi(__stack_chk_guard)
-; ANDROID-RISCV64-NEXT:    ld a0, %lo(__stack_chk_guard)(s0)
-; ANDROID-RISCV64-NEXT:    sd a0, 8(sp)
-; ANDROID-RISCV64-NEXT:    addi a0, sp, 4
+; ANDROID-RISCV64-NEXT:    ld a0, -24(tp)
+; ANDROID-RISCV64-NEXT:    sd a0, 16(sp)
+; ANDROID-RISCV64-NEXT:    addi a0, sp, 12
 ; ANDROID-RISCV64-NEXT:    call capture
-; ANDROID-RISCV64-NEXT:    ld a0, %lo(__stack_chk_guard)(s0)
-; ANDROID-RISCV64-NEXT:    ld a1, 8(sp)
+; ANDROID-RISCV64-NEXT:    ld a0, -24(tp)
+; ANDROID-RISCV64-NEXT:    ld a1, 16(sp)
 ; ANDROID-RISCV64-NEXT:    bne a0, a1, .LBB0_2
-; ANDROID-RISCV64-NEXT:  # %bb.1:
+; ANDROID-RISCV64-NEXT:  # %bb.1: # %SP_return
 ; ANDROID-RISCV64-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; ANDROID-RISCV64-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; ANDROID-RISCV64-NEXT:    addi sp, sp, 32
 ; ANDROID-RISCV64-NEXT:    ret
-; ANDROID-RISCV64-NEXT:  .LBB0_2:
+; ANDROID-RISCV64-NEXT:  .LBB0_2: # %CallStackCheckFailBlk
 ; ANDROID-RISCV64-NEXT:    call __stack_chk_fail
   %1 = alloca i32, align 4
   call void @capture(ptr %1)
diff --git a/llvm/test/CodeGen/RISCV/xcvbi.ll b/llvm/test/CodeGen/RISCV/xcvbi.ll
new file mode 100644
index 000000000000..afd30faa56f9
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/xcvbi.ll
@@ -0,0 +1,248 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -mtriple=riscv32 -mattr=+xcvbi -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK_NOPT
+; RUN: llc -O3 -mtriple=riscv32 -mattr=+xcvbi -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK_OPT
+
+define i32 @beqimm(i32 %a) {
+; CHECK_NOPT-LABEL: beqimm:
+; CHECK_NOPT:       # %bb.0:
+; CHECK_NOPT-NEXT:    cv.beqimm a0, 5, .LBB0_2
+; CHECK_NOPT-NEXT:    j .LBB0_1
+; CHECK_NOPT-NEXT:  .LBB0_1: # %f
+; CHECK_NOPT-NEXT:    li a0, 0
+; CHECK_NOPT-NEXT:    ret
+; CHECK_NOPT-NEXT:  .LBB0_2: # %t
+; CHECK_NOPT-NEXT:    li a0, 1
+; CHECK_NOPT-NEXT:    ret
+;
+; CHECK_OPT-LABEL: beqimm:
+; CHECK_OPT:       # %bb.0:
+; CHECK_OPT-NEXT:    cv.bneimm a0, 5, .LBB0_2
+; CHECK_OPT-NEXT:  # %bb.1: # %t
+; CHECK_OPT-NEXT:    li a0, 1
+; CHECK_OPT-NEXT:    ret
+; CHECK_OPT-NEXT:  .LBB0_2: # %f
+; CHECK_OPT-NEXT:    li a0, 0
+; CHECK_OPT-NEXT:    ret
+  %1 = icmp eq i32 %a, 5
+  br i1 %1, label %t, label %f
+f:
+  ret i32 0
+t:
+  ret i32 1
+}
+
+define i32 @bneimm(i32 %a) {
+; CHECK_NOPT-LABEL: bneimm:
+; CHECK_NOPT:       # %bb.0:
+; CHECK_NOPT-NEXT:    cv.bneimm a0, 5, .LBB1_2
+; CHECK_NOPT-NEXT:    j .LBB1_1
+; CHECK_NOPT-NEXT:  .LBB1_1: # %f
+; CHECK_NOPT-NEXT:    li a0, 0
+; CHECK_NOPT-NEXT:    ret
+; CHECK_NOPT-NEXT:  .LBB1_2: # %t
+; CHECK_NOPT-NEXT:    li a0, 1
+; CHECK_NOPT-NEXT:    ret
+;
+; CHECK_OPT-LABEL: bneimm:
+; CHECK_OPT:       # %bb.0:
+; CHECK_OPT-NEXT:    cv.beqimm a0, 5, .LBB1_2
+; CHECK_OPT-NEXT:  # %bb.1: # %t
+; CHECK_OPT-NEXT:    li a0, 1
+; CHECK_OPT-NEXT:    ret
+; CHECK_OPT-NEXT:  .LBB1_2: # %f
+; CHECK_OPT-NEXT:    li a0, 0
+; CHECK_OPT-NEXT:    ret
+  %1 = icmp ne i32 %a, 5
+  br i1 %1, label %t, label %f
+f:
+  ret i32 0
+t:
+  ret i32 1
+}
+
+define i32 @select_beqimm_1(i32 %a, i32 %x, i32 %y) {
+; CHECK_NOPT-LABEL: select_beqimm_1:
+; CHECK_NOPT:       # %bb.0: # %entry
+; CHECK_NOPT-NEXT:    addi sp, sp, -16
+; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 16
+; CHECK_NOPT-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    cv.beqimm a0, -16, .LBB2_2
+; CHECK_NOPT-NEXT:  # %bb.1: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:  .LBB2_2: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    addi sp, sp, 16
+; CHECK_NOPT-NEXT:    ret
+;
+; CHECK_OPT-LABEL: select_beqimm_1:
+; CHECK_OPT:       # %bb.0: # %entry
+; CHECK_OPT-NEXT:    cv.beqimm a0, -16, .LBB2_2
+; CHECK_OPT-NEXT:  # %bb.1: # %entry
+; CHECK_OPT-NEXT:    mv a2, a1
+; CHECK_OPT-NEXT:  .LBB2_2: # %entry
+; CHECK_OPT-NEXT:    mv a0, a2
+; CHECK_OPT-NEXT:    ret
+entry:
+  %cmp.not = icmp eq i32 %a, -16
+  %cond = select i1 %cmp.not, i32 %y, i32 %x
+  ret i32 %cond
+}
+
+define i32 @select_beqimm_2(i32 %a, i32 %x, i32 %y) {
+; CHECK_NOPT-LABEL: select_beqimm_2:
+; CHECK_NOPT:       # %bb.0: # %entry
+; CHECK_NOPT-NEXT:    addi sp, sp, -16
+; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 16
+; CHECK_NOPT-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    cv.beqimm a0, 0, .LBB3_2
+; CHECK_NOPT-NEXT:  # %bb.1: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:  .LBB3_2: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    addi sp, sp, 16
+; CHECK_NOPT-NEXT:    ret
+;
+; CHECK_OPT-LABEL: select_beqimm_2:
+; CHECK_OPT:       # %bb.0: # %entry
+; CHECK_OPT-NEXT:    cv.beqimm a0, 0, .LBB3_2
+; CHECK_OPT-NEXT:  # %bb.1: # %entry
+; CHECK_OPT-NEXT:    mv a2, a1
+; CHECK_OPT-NEXT:  .LBB3_2: # %entry
+; CHECK_OPT-NEXT:    mv a0, a2
+; CHECK_OPT-NEXT:    ret
+entry:
+  %cmp.not = icmp eq i32 %a, 0
+  %cond = select i1 %cmp.not, i32 %y, i32 %x
+  ret i32 %cond
+}
+
+define i32 @select_beqimm_3(i32 %a, i32 %x, i32 %y) {
+; CHECK_NOPT-LABEL: select_beqimm_3:
+; CHECK_NOPT:       # %bb.0: # %entry
+; CHECK_NOPT-NEXT:    addi sp, sp, -16
+; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 16
+; CHECK_NOPT-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    cv.beqimm a0, 15, .LBB4_2
+; CHECK_NOPT-NEXT:  # %bb.1: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:  .LBB4_2: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    addi sp, sp, 16
+; CHECK_NOPT-NEXT:    ret
+;
+; CHECK_OPT-LABEL: select_beqimm_3:
+; CHECK_OPT:       # %bb.0: # %entry
+; CHECK_OPT-NEXT:    cv.beqimm a0, 15, .LBB4_2
+; CHECK_OPT-NEXT:  # %bb.1: # %entry
+; CHECK_OPT-NEXT:    mv a2, a1
+; CHECK_OPT-NEXT:  .LBB4_2: # %entry
+; CHECK_OPT-NEXT:    mv a0, a2
+; CHECK_OPT-NEXT:    ret
+entry:
+  %cmp.not = icmp eq i32 %a, 15
+  %cond = select i1 %cmp.not, i32 %y, i32 %x
+  ret i32 %cond
+}
+
+define i32 @select_no_beqimm_1(i32 %a, i32 %x, i32 %y) {
+; CHECK_NOPT-LABEL: select_no_beqimm_1:
+; CHECK_NOPT:       # %bb.0: # %entry
+; CHECK_NOPT-NEXT:    addi sp, sp, -16
+; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 16
+; CHECK_NOPT-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    li a1, -17
+; CHECK_NOPT-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    beq a0, a1, .LBB5_2
+; CHECK_NOPT-NEXT:  # %bb.1: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:  .LBB5_2: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    addi sp, sp, 16
+; CHECK_NOPT-NEXT:    ret
+;
+; CHECK_OPT-LABEL: select_no_beqimm_1:
+; CHECK_OPT:       # %bb.0: # %entry
+; CHECK_OPT-NEXT:    li a3, -17
+; CHECK_OPT-NEXT:    beq a0, a3, .LBB5_2
+; CHECK_OPT-NEXT:  # %bb.1: # %entry
+; CHECK_OPT-NEXT:    mv a2, a1
+; CHECK_OPT-NEXT:  .LBB5_2: # %entry
+; CHECK_OPT-NEXT:    mv a0, a2
+; CHECK_OPT-NEXT:    ret
+entry:
+  %cmp.not = icmp eq i32 %a, -17
+  %cond = select i1 %cmp.not, i32 %y, i32 %x
+  ret i32 %cond
+}
+
+define i32 @select_no_beqimm_2(i32 %a, i32 %x, i32 %y) {
+; CHECK_NOPT-LABEL: select_no_beqimm_2:
+; CHECK_NOPT:       # %bb.0: # %entry
+; CHECK_NOPT-NEXT:    addi sp, sp, -16
+; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 16
+; CHECK_NOPT-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    li a1, 16
+; CHECK_NOPT-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    beq a0, a1, .LBB6_2
+; CHECK_NOPT-NEXT:  # %bb.1: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:  .LBB6_2: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    addi sp, sp, 16
+; CHECK_NOPT-NEXT:    ret
+;
+; CHECK_OPT-LABEL: select_no_beqimm_2:
+; CHECK_OPT:       # %bb.0: # %entry
+; CHECK_OPT-NEXT:    li a3, 16
+; CHECK_OPT-NEXT:    beq a0, a3, .LBB6_2
+; CHECK_OPT-NEXT:  # %bb.1: # %entry
+; CHECK_OPT-NEXT:    mv a2, a1
+; CHECK_OPT-NEXT:  .LBB6_2: # %entry
+; CHECK_OPT-NEXT:    mv a0, a2
+; CHECK_OPT-NEXT:    ret
+entry:
+  %cmp.not = icmp eq i32 %a, 16
+  %cond = select i1 %cmp.not, i32 %y, i32 %x
+  ret i32 %cond
+}
+
+define i32 @select_bneimm_1(i32 %a, i32 %x, i32 %y) {
+; CHECK_NOPT-LABEL: select_bneimm_1:
+; CHECK_NOPT:       # %bb.0: # %entry
+; CHECK_NOPT-NEXT:    addi sp, sp, -16
+; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 16
+; CHECK_NOPT-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    cv.bneimm a0, 0, .LBB7_2
+; CHECK_NOPT-NEXT:  # %bb.1: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:  .LBB7_2: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    addi sp, sp, 16
+; CHECK_NOPT-NEXT:    ret
+;
+; CHECK_OPT-LABEL: select_bneimm_1:
+; CHECK_OPT:       # %bb.0: # %entry
+; CHECK_OPT-NEXT:    cv.bneimm a0, 0, .LBB7_2
+; CHECK_OPT-NEXT:  # %bb.1: # %entry
+; CHECK_OPT-NEXT:    mv a2, a1
+; CHECK_OPT-NEXT:  .LBB7_2: # %entry
+; CHECK_OPT-NEXT:    mv a0, a2
+; CHECK_OPT-NEXT:    ret
+entry:
+  %cmp.not = icmp ne i32 %a, 0
+  %cond = select i1 %cmp.not, i32 %y, i32 %x
+  ret i32 %cond
+}
+
diff --git a/llvm/test/CodeGen/RISCV/zicfilp-indirect-branch.ll b/llvm/test/CodeGen/RISCV/zicfilp-indirect-branch.ll
new file mode 100644
index 000000000000..bccd28ee7e2b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/zicfilp-indirect-branch.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=riscv64 -stop-after=finalize-isel < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfilp -stop-after=finalize-isel < %s | FileCheck -check-prefixes=ZICFILP %s
+
+@brind.arr = internal unnamed_addr constant [2 x ptr] [ptr blockaddress(@brind, %5), ptr blockaddress(@brind, %8)], align 8
+@x = dso_local global i32 0, align 4
+
+define void @brind(i32 noundef signext %0) {
+  ; CHECK-LABEL: name: brind
+  ; CHECK:   PseudoBRIND killed [[VAR:%.*]], 0
+  ; ZICFILP-LABEL: name: brind
+  ; ZICFILP:   PseudoBRINDNonX7 killed [[VAR:%.*]], 0
+  %2 = sext i32 %0 to i64
+  %3 = getelementptr inbounds [2 x ptr], ptr @brind.arr, i64 0, i64 %2
+  %4 = load ptr, ptr %3, align 8
+  indirectbr ptr %4, [label %5, label %8]
+
+5:                                                ; preds = %1
+  %6 = load i32, ptr @x, align 4
+  %7 = add nsw i32 %6, 2
+  store i32 %7, ptr @x, align 4
+  br label %8
+
+8:                                                ; preds = %5, %1
+  %9 = load i32, ptr @x, align 4
+  %10 = add nsw i32 %9, 1
+  store i32 %10, ptr @x, align 4
+  ret void
+}
+
+define i32 @indirect_call(ptr %0) {
+  ; CHECK-LABEL: name: indirect_call
+  ; CHECK: PseudoCALLIndirect
+  ; ZICFILP-LABEL: name: indirect_call
+  ; ZICFILP: PseudoCALLIndirectNonX7
+  call void %0()
+  ret i32 0
+}
+
+
+define void @indirect_tail(ptr %0) {
+  ; CHECK-LABEL: name: indirect_tail
+  ; CHECK: PseudoTAILIndirect
+  ; ZICFILP-LABEL: name: indirect_tail
+  ; ZICFILP: PseudoTAILIndirectNonX7
+  tail call void %0()
+  ret void
+}
diff --git a/llvm/test/CodeGen/SPARC/LeonSMACUMACInstructionUT.ll b/llvm/test/CodeGen/SPARC/LeonSMACUMACInstructionUT.ll
index 109fa8bd2bd9..1d960ee53a86 100644
--- a/llvm/test/CodeGen/SPARC/LeonSMACUMACInstructionUT.ll
+++ b/llvm/test/CodeGen/SPARC/LeonSMACUMACInstructionUT.ll
@@ -1,20 +1,20 @@
-; RUN: llc %s -O0 -march=sparc -mcpu=leon2 -o - | FileCheck %s
-; RUN: llc %s -O0 -march=sparc -mcpu=leon3 -o - | FileCheck %s
-; RUN: llc %s -O0 -march=sparc -mcpu=leon4 -o - | FileCheck %s
-
-; CHECK-LABEL: smac_test:
-; CHECK:       smac %i1, %i0, %i0
-define i32 @smac_test(ptr %a, ptr %b) {
-entry:
-;  %0 = tail call i32 asm sideeffect "smac $2, $1, $0", "={r2},{r3},{r4}"(i16* %a, i16* %b)
-  %0 = tail call i32 asm sideeffect "smac $2, $1, $0", "=r,rI,r"(ptr %a, ptr %b)
-  ret i32 %0
-}
-
-; CHECK-LABEL: umac_test:
-; CHECK:       umac %i1, %i0, %i0
-define i32 @umac_test(ptr %a, ptr %b) {
-entry:
-  %0 = tail call i32 asm sideeffect "umac $2, $1, $0", "=r,rI,r"(ptr %a, ptr %b)
-  ret i32 %0
-}
+; RUN: llc %s -O0 -march=sparc -mcpu=leon2 -o - | FileCheck %s
+; RUN: llc %s -O0 -march=sparc -mcpu=leon3 -o - | FileCheck %s
+; RUN: llc %s -O0 -march=sparc -mcpu=leon4 -o - | FileCheck %s
+
+; CHECK-LABEL: smac_test:
+; CHECK:       smac %i1, %i0, %i0
+define i32 @smac_test(ptr %a, ptr %b) {
+entry:
+;  %0 = tail call i32 asm sideeffect "smac $2, $1, $0", "={r2},{r3},{r4}"(i16* %a, i16* %b)
+  %0 = tail call i32 asm sideeffect "smac $2, $1, $0", "=r,rI,r"(ptr %a, ptr %b)
+  ret i32 %0
+}
+
+; CHECK-LABEL: umac_test:
+; CHECK:       umac %i1, %i0, %i0
+define i32 @umac_test(ptr %a, ptr %b) {
+entry:
+  %0 = tail call i32 asm sideeffect "umac $2, $1, $0", "=r,rI,r"(ptr %a, ptr %b)
+  ret i32 %0
+}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tan.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tan.ll
new file mode 100644
index 000000000000..7bdce99dbfaa
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tan.ll
@@ -0,0 +1,45 @@
+; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450"
+; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4
+; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4
+
+define noundef float @tan_float(float noundef %a) {
+entry:
+; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Tan %[[#arg0]]
+  %elt.tan = call float @llvm.tan.f32(float %a)
+  ret float %elt.tan
+}
+
+define noundef half @tan_half(half noundef %a) {
+entry:
+; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Tan %[[#arg0]]
+  %elt.tan = call half @llvm.tan.f16(half %a)
+  ret half %elt.tan
+}
+
+define noundef <4 x float> @tan_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Tan %[[#arg0]]
+  %elt.tan = call <4 x float> @llvm.tan.v4f32(<4 x float> %a)
+  ret <4 x float> %elt.tan
+}
+
+define noundef <4 x half> @tan_half4(<4 x half> noundef %a) {
+entry:
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Tan %[[#arg0]]
+  %elt.tan = call <4 x half> @llvm.tan.v4f16(<4 x half> %a)
+  ret <4 x half> %elt.tan
+}
+
+declare half @llvm.tan.f16(half)
+declare float @llvm.tan.f32(float)
+declare <4 x half> @llvm.tan.v4f16(<4 x half>)
+declare <4 x float> @llvm.tan.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
index 83050ef87591..90d4214037d2 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
@@ -1,9 +1,8 @@
-; Test long double atomic loads. These are emitted by the Clang FE as i128
-; loads with a bitcast, and this test case gets converted into that form as
-; well by the AtomicExpand pass.
+; Test long double atomic loads - via i128.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,BASE %s
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+soft-float | FileCheck -check-prefixes=SOFTFP %s
 
 define void @f1(ptr %ret, ptr %src) {
 ; CHECK-LABEL: f1:
@@ -12,6 +11,13 @@ define void @f1(ptr %ret, ptr %src) {
 ; CHECK-NEXT:    stg %r1, 8(%r2)
 ; CHECK-NEXT:    stg %r0, 0(%r2)
 ; CHECK-NEXT:    br %r14
+
+; SOFTFP-LABEL: f1:
+; SOFTFP:       # %bb.0:
+; SOFTFP-NEXT:    lpq %r0, 0(%r3)
+; SOFTFP-NEXT:    stg %r1, 8(%r2)
+; SOFTFP-NEXT:    stg %r0, 0(%r2)
+; SOFTFP-NEXT:    br %r14
   %val = load atomic fp128, ptr %src seq_cst, align 16
   store fp128 %val, ptr %ret, align 8
   ret void
@@ -20,15 +26,10 @@ define void @f1(ptr %ret, ptr %src) {
 define void @f1_fpuse(ptr %ret, ptr %src) {
 ; CHECK-LABEL: f1_fpuse:
 ; CHECK:       # %bb.0:
-; BASE-NEXT: aghi	%r15, -176
-; BASE-NEXT: .cfi_def_cfa_offset 336
-
 ; CHECK-NEXT:	lpq	%r0, 0(%r3)
 
-; BASE-NEXT: stg %r1, 168(%r15)
-; BASE-NEXT: stg %r0, 160(%r15)
-; BASE-NEXT: ld	%f0, 160(%r15)
-; BASE-NEXT: ld	%f2, 168(%r15)
+; BASE-NEXT: ldgr	%f0, %r0
+; BASE-NEXT: ldgr	%f2, %r1
 
 ; Z13-NEXT: vlvgp %v0, %r0, %r1
 ; Z13-NEXT: vrepg %v2, %v0, 1
@@ -36,9 +37,28 @@ define void @f1_fpuse(ptr %ret, ptr %src) {
 ; CHECK-NEXT:	axbr	%f0, %f0
 ; CHECK-NEXT:	std	%f0, 0(%r2)
 ; CHECK-NEXT:	std	%f2, 8(%r2)
-; BASE-NEXT:	aghi	%r15, 176
 ; CHECK-NEXT:	br	%r14
 
+
+; SOFTFP-LABEL: f1_fpuse:
+; SOFTFP: stmg	%r13, %r15, 104(%r15)
+; SOFTFP: aghi	%r15, -208
+; SOFTFP:	lpq	%r0, 0(%r3)
+; SOFTFP-NEXT: lgr	%r13, %r2
+; SOFTFP-NEXT: stg	%r1, 168(%r15)
+; SOFTFP-NEXT: stg	%r0, 160(%r15)
+; SOFTFP-NEXT: stg	%r1, 184(%r15)
+; SOFTFP-NEXT: la	%r2, 192(%r15)
+; SOFTFP-NEXT: la	%r3, 176(%r15)
+; SOFTFP-NEXT: la	%r4, 160(%r15)
+; SOFTFP-NEXT: stg	%r0, 176(%r15)
+; SOFTFP-NEXT: brasl	%r14, __addtf3@PLT
+; SOFTFP-NEXT: lg	%r0, 200(%r15)
+; SOFTFP-NEXT: lg	%r1, 192(%r15)
+; SOFTFP-NEXT: stg	%r0, 8(%r13)
+; SOFTFP-NEXT: stg	%r1, 0(%r13)
+; SOFTFP-NEXT: lmg	%r13, %r15, 312(%r15)
+; SOFTFP-NEXT: br	%r14
   %val = load atomic fp128, ptr %src seq_cst, align 16
   %use = fadd fp128 %val, %val
   store fp128 %use, ptr %ret, align 8
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-09.ll b/llvm/test/CodeGen/SystemZ/atomic-load-09.ll
new file mode 100644
index 000000000000..61b8e2f0efa8
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-09.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; Test long double atomic loads on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+define void @f1(ptr %ret, ptr %src) {
+; CHECK-LABEL: f1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lpq %r0, 0(%r3)
+; CHECK-NEXT:    stg %r1, 8(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %val = load atomic fp128, ptr %src seq_cst, align 16
+  store fp128 %val, ptr %ret, align 8
+  ret void
+}
+
+define void @f1_fpuse(ptr %ret, ptr %src) {
+; CHECK-LABEL: f1_fpuse:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lpq %r0, 0(%r3)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r1
+; CHECK-NEXT:    wfaxb %v0, %v0, %v0
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+  %val = load atomic fp128, ptr %src seq_cst, align 16
+  %use = fadd fp128 %val, %val
+  store fp128 %use, ptr %ret, align 8
+  ret void
+}
+
+define void @f2(ptr %ret, ptr %src) {
+; CHECK-LABEL: f2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r13, %r15, 104(%r15)
+; CHECK-NEXT:    .cfi_offset %r13, -56
+; CHECK-NEXT:    .cfi_offset %r14, -48
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -176
+; CHECK-NEXT:    .cfi_def_cfa_offset 336
+; CHECK-NEXT:    lgr %r13, %r2
+; CHECK-NEXT:    la %r4, 160(%r15)
+; CHECK-NEXT:    lghi %r2, 16
+; CHECK-NEXT:    lhi %r5, 5
+; CHECK-NEXT:    brasl %r14, __atomic_load@PLT
+; CHECK-NEXT:    vl %v0, 160(%r15), 3
+; CHECK-NEXT:    vst %v0, 0(%r13), 3
+; CHECK-NEXT:    lmg %r13, %r15, 280(%r15)
+; CHECK-NEXT:    br %r14
+  %val = load atomic fp128, ptr %src seq_cst, align 8
+  store fp128 %val, ptr %ret, align 8
+  ret void
+}
+
+define void @f2_fpuse(ptr %ret, ptr %src) {
+; CHECK-LABEL: f2_fpuse:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r13, %r15, 104(%r15)
+; CHECK-NEXT:    .cfi_offset %r13, -56
+; CHECK-NEXT:    .cfi_offset %r14, -48
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -176
+; CHECK-NEXT:    .cfi_def_cfa_offset 336
+; CHECK-NEXT:    lgr %r13, %r2
+; CHECK-NEXT:    la %r4, 160(%r15)
+; CHECK-NEXT:    lghi %r2, 16
+; CHECK-NEXT:    lhi %r5, 5
+; CHECK-NEXT:    brasl %r14, __atomic_load@PLT
+; CHECK-NEXT:    vl %v0, 160(%r15), 3
+; CHECK-NEXT:    wfaxb %v0, %v0, %v0
+; CHECK-NEXT:    vst %v0, 0(%r13), 3
+; CHECK-NEXT:    lmg %r13, %r15, 280(%r15)
+; CHECK-NEXT:    br %r14
+  %val = load atomic fp128, ptr %src seq_cst, align 8
+  %use = fadd fp128 %val, %val
+  store fp128 %use, ptr %ret, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
index 4d1693477f01..57f1319365c4 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
@@ -1,8 +1,8 @@
-; Test long double atomic stores. The atomic store is converted to i128 by
-; the AtomicExpand pass.
+; Test long double atomic stores - via i128.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,BASE %s
-; xUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+soft-float | FileCheck -check-prefixes=SOFTFP %s
 
 define void @f1(ptr %dst, ptr %src) {
 ; CHECK-LABEL: f1:
@@ -12,6 +12,14 @@ define void @f1(ptr %dst, ptr %src) {
 ; CHECK-NEXT:    stpq %r0, 0(%r2)
 ; CHECK-NEXT:    bcr 1{{[45]}}, %r0
 ; CHECK-NEXT:    br %r14
+
+; SOFTFP-LABEL: f1:
+; SOFTFP:       # %bb.0:
+; SOFTFP-NEXT:    lg %r1, 8(%r3)
+; SOFTFP-NEXT:    lg %r0, 0(%r3)
+; SOFTFP-NEXT:    stpq %r0, 0(%r2)
+; SOFTFP-NEXT:    bcr 1{{[45]}}, %r0
+; SOFTFP-NEXT:    br %r14
   %val = load fp128, ptr %src, align 8
   store atomic fp128 %val, ptr %dst seq_cst, align 16
   ret void
@@ -32,8 +40,28 @@ define void @f1_fpsrc(ptr %dst, ptr %src) {
 ; Z13-NEXT: vlgvg	%r0, %v0, 0
 
 ; CHECK-NEXT: stpq	%r0, 0(%r2)
-; CHECK-NEXT: bcr	15, %r0
+; CHECK-NEXT: bcr	1{{[45]}}, %r0
 ; CHECK-NEXT: br	%r14
+
+; SOFTFP-LABEL: f1_fpsrc:
+; SOFTFP: lg	%r0, 8(%r3)
+; SOFTFP-NEXT: lg	%r1, 0(%r3)
+; SOFTFP-NEXT:	lgr	%r13, %r2
+; SOFTFP-NEXT:	stg	%r0, 168(%r15)
+; SOFTFP-NEXT:	stg	%r1, 160(%r15)
+; SOFTFP-NEXT:	stg	%r0, 184(%r15)
+; SOFTFP-NEXT:	la	%r2, 192(%r15)
+; SOFTFP-NEXT:	la	%r3, 176(%r15)
+; SOFTFP-NEXT:	la	%r4, 160(%r15)
+; SOFTFP-NEXT:	stg	%r1, 176(%r15)
+; SOFTFP-NEXT:	brasl	%r14, __addtf3@PLT
+; SOFTFP-NEXT:	lg	%r1, 200(%r15)
+; SOFTFP-NEXT:	lg	%r0, 192(%r15)
+; SOFTFP-NEXT:	stpq	%r0, 0(%r13)
+; SOFTFP-NEXT:	bcr	1{{[45]}}, %r0
+; SOFTFP-NEXT:	lmg	%r13, %r15, 312(%r15)
+; SOFTFP-NEXT:	br	%r14
+
   %val = load fp128, ptr %src, align 8
   %add = fadd fp128 %val, %val
   store atomic fp128 %add, ptr %dst seq_cst, align 16
@@ -58,8 +86,8 @@ define void @f2_fpuse(ptr %dst, ptr %src) {
 ; CHECK-NEXT:	.cfi_def_cfa_offset 336
 ; CHECK-NEXT:	ld	%f0, 0(%r3)
 ; CHECK-NEXT:	ld	%f2, 8(%r3)
-; CHECK-NEXT:	lgr	%r3, %r2
-; CHECK-NEXT:	axbr	%f0, %f0
+; CHECK-DAG:	lgr	%r3, %r2
+; CHECK-DAG:	axbr	%f0, %f0
 ; CHECK-NEXT:	la	%r4, 160(%r15)
 ; CHECK-NEXT:	lghi	%r2, 16
 ; CHECK-NEXT:	lhi	%r5, 5
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-09.ll b/llvm/test/CodeGen/SystemZ/atomic-store-09.ll
new file mode 100644
index 000000000000..3af16490b34b
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-09.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; Test long double atomic stores on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+define void @f1(ptr %dst, ptr %src) {
+; CHECK-LABEL: f1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lg %r1, 8(%r3)
+; CHECK-NEXT:    lg %r0, 0(%r3)
+; CHECK-NEXT:    stpq %r0, 0(%r2)
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %val = load fp128, ptr %src, align 8
+  store atomic fp128 %val, ptr %dst seq_cst, align 16
+  ret void
+}
+
+define void @f1_fpsrc(ptr %dst, ptr %src) {
+; CHECK-LABEL: f1_fpsrc:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    wfaxb %v0, %v0, %v0
+; CHECK-NEXT:    vlgvg %r1, %v0, 1
+; CHECK-NEXT:    vlgvg %r0, %v0, 0
+; CHECK-NEXT:    stpq %r0, 0(%r2)
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %val = load fp128, ptr %src, align 8
+  %add = fadd fp128 %val, %val
+  store atomic fp128 %add, ptr %dst seq_cst, align 16
+  ret void
+}
+
+define void @f2(ptr %dst, ptr %src) {
+; CHECK-LABEL: f2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT:    .cfi_offset %r14, -48
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -176
+; CHECK-NEXT:    .cfi_def_cfa_offset 336
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    lgr %r0, %r2
+; CHECK-NEXT:    la %r4, 160(%r15)
+; CHECK-NEXT:    lghi %r2, 16
+; CHECK-NEXT:    lgr %r3, %r0
+; CHECK-NEXT:    lhi %r5, 5
+; CHECK-NEXT:    vst %v0, 160(%r15), 3
+; CHECK-NEXT:    brasl %r14, __atomic_store@PLT
+; CHECK-NEXT:    lmg %r14, %r15, 288(%r15)
+; CHECK-NEXT:    br %r14
+  %val = load fp128, ptr %src, align 8
+  store atomic fp128 %val, ptr %dst seq_cst, align 8
+  ret void
+}
+
+define void @f2_fpuse(ptr %dst, ptr %src) {
+; CHECK-LABEL: f2_fpuse:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT:    .cfi_offset %r14, -48
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -176
+; CHECK-NEXT:    .cfi_def_cfa_offset 336
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    wfaxb %v0, %v0, %v0
+; CHECK-NEXT:    lgr %r0, %r2
+; CHECK-NEXT:    la %r4, 160(%r15)
+; CHECK-NEXT:    lghi %r2, 16
+; CHECK-NEXT:    lgr %r3, %r0
+; CHECK-NEXT:    lhi %r5, 5
+; CHECK-NEXT:    vst %v0, 160(%r15), 3
+; CHECK-NEXT:    brasl %r14, __atomic_store@PLT
+; CHECK-NEXT:    lmg %r14, %r15, 288(%r15)
+; CHECK-NEXT:    br %r14
+  %val = load fp128, ptr %src, align 8
+  %add = fadd fp128 %val, %val
+  store atomic fp128 %add, ptr %dst seq_cst, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-03.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-03.ll
index 3c8ea19f86f8..21e7c6e586df 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-03.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-03.ll
@@ -20,10 +20,8 @@ define void @f1(ptr %ret, ptr %src, ptr %b) {
 ; CHECK: std [[FSL]], 176(%r15)
 ; CHECK: std [[FSH]], 184(%r15)
 ; CHECK: brasl %r14, fmaxl@PLT
-; CHECK: ld [[FL:%f[0-9]+]], 192(%r15)
-; CHECK: ld [[FH:%f[0-9]+]], 200(%r15)
-; CHECK: lgdr [[RH:%r[0-9]+]], [[FH]]
-; CHECK: lgdr [[RL:%r[0-9]+]], [[FL]]
+; CHECK: lg [[RH:%r[0-9]+]], 200(%r15)
+; CHECK: lg [[RL:%r[0-9]+]], 192(%r15)
 ; CHECK: lgdr [[RSH:%r[0-9]+]], [[FSH]]
 ; CHECK: lgdr [[RSL:%r[0-9]+]], [[FSL]]
 ; CHECK: cdsg [[RSL]], [[RL]], 0([[SRC]])
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-03.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-03.ll
index dfa2cc021d16..1c6f8e20aa4f 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-03.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-03.ll
@@ -20,10 +20,8 @@ define void @f1(ptr %ret, ptr %src, ptr %b) {
 ; CHECK: std [[FSL]], 176(%r15)
 ; CHECK: std [[FSH]], 184(%r15)
 ; CHECK: brasl %r14, fminl@PLT
-; CHECK: ld [[FL:%f[0-9]+]], 192(%r15)
-; CHECK: ld [[FH:%f[0-9]+]], 200(%r15)
-; CHECK: lgdr [[RH:%r[0-9]+]], [[FH]]
-; CHECK: lgdr [[RL:%r[0-9]+]], [[FL]]
+; CHECK: lg [[RH:%r[0-9]+]], 200(%r15)
+; CHECK: lg [[RL:%r[0-9]+]], 192(%r15)
 ; CHECK: lgdr [[RSH:%r[0-9]+]], [[FSH]]
 ; CHECK: lgdr [[RSL:%r[0-9]+]], [[FSL]]
 ; CHECK: cdsg [[RSL]], [[RL]], 0([[SRC]])
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-07.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-07.ll
index f5d8dc092a7e..acc9ce9b4e4e 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-07.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-07.ll
@@ -1,67 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; Test long double atomic exchange.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,HARDFP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+soft-float | FileCheck -check-prefixes=CHECK,SOFTFP %s
 
 define void @f1(ptr align 16 %ret, ptr align 16 %src, ptr align 16 %b) {
 ; CHECK-LABEL: f1:
-; CHECK:       lg      %r1, 8(%r4)
-; CHECK-NEXT:  lg      %r0, 0(%r4)
-; CHECK-NEXT:  lg      %r4, 8(%r3)
-; CHECK-NEXT:  lg      %r5, 0(%r3)
-; CHECK-NEXT:.LBB0_1:                          # %atomicrmw.start
-; CHECK-NEXT:                                  # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:  lgr     %r12, %r5
-; CHECK-NEXT:  lgr     %r13, %r4
-; CHECK-NEXT:  cdsg    %r12, %r0, 0(%r3)
-; CHECK-NEXT:  lgr     %r4, %r13
-; CHECK-NEXT:  lgr     %r5, %r12
-; CHECK-NEXT:  jl      .LBB0_1
-; CHECK-NEXT:# %bb.2:                          # %atomicrmw.end
-; CHECK-NEXT:  stg     %r5, 0(%r2)
-; CHECK-NEXT:  stg     %r4, 8(%r2)
-; CHECK-NEXT:  lmg     %r12, %r15, 96(%r15)
-; CHECK-NEXT:  br      %r14
-  %val = load fp128, ptr %b, align 16
-  %res = atomicrmw xchg ptr %src, fp128 %val seq_cst
-  store fp128 %res, ptr %ret, align 16
-  ret void
-}
-
-define void @f1_fpuse(ptr align 16 %ret, ptr align 16 %src, ptr align 16 %b) {
-; CHECK-LABEL: f1_fpuse:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    stmg %r12, %r15, 96(%r15)
 ; CHECK-NEXT:    .cfi_offset %r12, -64
 ; CHECK-NEXT:    .cfi_offset %r13, -56
 ; CHECK-NEXT:    .cfi_offset %r15, -40
-; CHECK-NEXT:    aghi %r15, -176
-; CHECK-NEXT:    .cfi_def_cfa_offset 336
-; CHECK-NEXT:    ld %f0, 0(%r4)
-; CHECK-NEXT:    ld %f2, 8(%r4)
-; CHECK-NEXT:    lg %r0, 8(%r3)
-; CHECK-NEXT:    lg %r1, 0(%r3)
-; CHECK-NEXT:    axbr %f0, %f0
-; CHECK-NEXT:    lgdr %r5, %f2
-; CHECK-NEXT:    lgdr %r4, %f0
-; CHECK-NEXT:  .LBB1_1: # %atomicrmw.start
+; CHECK-NEXT:    lg %r1, 8(%r4)
+; CHECK-NEXT:    lg %r0, 0(%r4)
+; CHECK-NEXT:    lg %r4, 8(%r3)
+; CHECK-NEXT:    lg %r5, 0(%r3)
+; CHECK-NEXT:  .LBB0_1: # %atomicrmw.start
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    lgr %r12, %r1
-; CHECK-NEXT:    lgr %r13, %r0
-; CHECK-NEXT:    cdsg %r12, %r4, 0(%r3)
-; CHECK-NEXT:    lgr %r0, %r13
-; CHECK-NEXT:    lgr %r1, %r12
-; CHECK-NEXT:    jl .LBB1_1
+; CHECK-NEXT:    lgr %r12, %r5
+; CHECK-NEXT:    lgr %r13, %r4
+; CHECK-NEXT:    cdsg %r12, %r0, 0(%r3)
+; CHECK-NEXT:    lgr %r4, %r13
+; CHECK-NEXT:    lgr %r5, %r12
+; CHECK-NEXT:    jl .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
-; CHECK-NEXT:    stg %r1, 160(%r15)
-; CHECK-NEXT:    stg %r0, 168(%r15)
-; CHECK-NEXT:    ld %f0, 160(%r15)
-; CHECK-NEXT:    ld %f2, 168(%r15)
-; CHECK-NEXT:    axbr %f0, %f0
-; CHECK-NEXT:    std %f0, 0(%r2)
-; CHECK-NEXT:    std %f2, 8(%r2)
-; CHECK-NEXT:    lmg %r12, %r15, 272(%r15)
+; CHECK-NEXT:    stg %r5, 0(%r2)
+; CHECK-NEXT:    stg %r4, 8(%r2)
+; CHECK-NEXT:    lmg %r12, %r15, 96(%r15)
 ; CHECK-NEXT:    br %r14
   %val = load fp128, ptr %b, align 16
+  %res = atomicrmw xchg ptr %src, fp128 %val seq_cst
+  store fp128 %res, ptr %ret, align 16
+  ret void
+}
+
+define void @f1_fpuse(ptr align 16 %ret, ptr align 16 %src, ptr align 16 %b) {
+; HARDFP-LABEL: f1_fpuse:
+; HARDFP:       # %bb.0:
+; HARDFP-NEXT:    stmg %r12, %r15, 96(%r15)
+; HARDFP-NEXT:    .cfi_offset %r12, -64
+; HARDFP-NEXT:    .cfi_offset %r13, -56
+; HARDFP-NEXT:    .cfi_offset %r15, -40
+; HARDFP-NEXT:    aghi %r15, -176
+; HARDFP-NEXT:    .cfi_def_cfa_offset 336
+; HARDFP-NEXT:    ld %f0, 0(%r4)
+; HARDFP-NEXT:    ld %f2, 8(%r4)
+; HARDFP-NEXT:    lg %r0, 8(%r3)
+; HARDFP-NEXT:    lg %r1, 0(%r3)
+; HARDFP-NEXT:    axbr %f0, %f0
+; HARDFP-NEXT:    lgdr %r5, %f2
+; HARDFP-NEXT:    lgdr %r4, %f0
+; HARDFP-NEXT:  .LBB1_1: # %atomicrmw.start
+; HARDFP-NEXT:    # =>This Inner Loop Header: Depth=1
+; HARDFP-NEXT:    lgr %r12, %r1
+; HARDFP-NEXT:    lgr %r13, %r0
+; HARDFP-NEXT:    cdsg %r12, %r4, 0(%r3)
+; HARDFP-NEXT:    lgr %r0, %r13
+; HARDFP-NEXT:    lgr %r1, %r12
+; HARDFP-NEXT:    jl .LBB1_1
+; HARDFP-NEXT:  # %bb.2: # %atomicrmw.end
+; HARDFP-NEXT:    stg %r1, 160(%r15)
+; HARDFP-NEXT:    stg %r0, 168(%r15)
+; HARDFP-NEXT:    ld %f0, 160(%r15)
+; HARDFP-NEXT:    ld %f2, 168(%r15)
+; HARDFP-NEXT:    axbr %f0, %f0
+; HARDFP-NEXT:    std %f0, 0(%r2)
+; HARDFP-NEXT:    std %f2, 8(%r2)
+; HARDFP-NEXT:    lmg %r12, %r15, 272(%r15)
+; HARDFP-NEXT:    br %r14
+;
+; SOFTFP-LABEL: f1_fpuse:
+; SOFTFP:       # %bb.0:
+; SOFTFP-NEXT:    stmg %r12, %r15, 96(%r15)
+; SOFTFP-NEXT:    .cfi_offset %r12, -64
+; SOFTFP-NEXT:    .cfi_offset %r13, -56
+; SOFTFP-NEXT:    .cfi_offset %r14, -48
+; SOFTFP-NEXT:    .cfi_offset %r15, -40
+; SOFTFP-NEXT:    aghi %r15, -256
+; SOFTFP-NEXT:    .cfi_def_cfa_offset 416
+; SOFTFP-NEXT:    lg %r0, 8(%r4)
+; SOFTFP-NEXT:    lg %r1, 0(%r4)
+; SOFTFP-NEXT:    lgr %r12, %r3
+; SOFTFP-NEXT:    lgr %r13, %r2
+; SOFTFP-NEXT:    stg %r0, 216(%r15)
+; SOFTFP-NEXT:    stg %r1, 208(%r15)
+; SOFTFP-NEXT:    stg %r0, 232(%r15)
+; SOFTFP-NEXT:    la %r2, 240(%r15)
+; SOFTFP-NEXT:    la %r3, 224(%r15)
+; SOFTFP-NEXT:    la %r4, 208(%r15)
+; SOFTFP-NEXT:    stg %r1, 224(%r15)
+; SOFTFP-NEXT:    brasl %r14, __addtf3@PLT
+; SOFTFP-NEXT:    lg %r3, 248(%r15)
+; SOFTFP-NEXT:    lg %r2, 240(%r15)
+; SOFTFP-NEXT:    lg %r0, 8(%r12)
+; SOFTFP-NEXT:    lg %r1, 0(%r12)
+; SOFTFP-NEXT:  .LBB1_1: # %atomicrmw.start
+; SOFTFP-NEXT:    # =>This Inner Loop Header: Depth=1
+; SOFTFP-NEXT:    lgr %r4, %r1
+; SOFTFP-NEXT:    lgr %r5, %r0
+; SOFTFP-NEXT:    cdsg %r4, %r2, 0(%r12)
+; SOFTFP-NEXT:    lgr %r0, %r5
+; SOFTFP-NEXT:    lgr %r1, %r4
+; SOFTFP-NEXT:    jl .LBB1_1
+; SOFTFP-NEXT:  # %bb.2: # %atomicrmw.end
+; SOFTFP-NEXT:    stg %r1, 160(%r15)
+; SOFTFP-NEXT:    stg %r1, 176(%r15)
+; SOFTFP-NEXT:    stg %r0, 168(%r15)
+; SOFTFP-NEXT:    la %r2, 192(%r15)
+; SOFTFP-NEXT:    la %r3, 176(%r15)
+; SOFTFP-NEXT:    la %r4, 160(%r15)
+; SOFTFP-NEXT:    stg %r0, 184(%r15)
+; SOFTFP-NEXT:    brasl %r14, __addtf3@PLT
+; SOFTFP-NEXT:    lg %r0, 200(%r15)
+; SOFTFP-NEXT:    lg %r1, 192(%r15)
+; SOFTFP-NEXT:    stg %r0, 8(%r13)
+; SOFTFP-NEXT:    stg %r1, 0(%r13)
+; SOFTFP-NEXT:    lmg %r12, %r15, 352(%r15)
+; SOFTFP-NEXT:    br %r14
+  %val = load fp128, ptr %b, align 16
   %add.src = fadd fp128 %val, %val
   %res = atomicrmw xchg ptr %src, fp128 %add.src seq_cst
   %res.x2 = fadd fp128 %res, %res
diff --git a/llvm/test/CodeGen/SystemZ/copy-phys-reg-gr128-to-fp128.mir b/llvm/test/CodeGen/SystemZ/copy-phys-reg-gr128-to-fp128.mir
new file mode 100644
index 000000000000..2fa0e585f742
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/copy-phys-reg-gr128-to-fp128.mir
@@ -0,0 +1,49 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=s390x-ibm-linux -mcpu=z13 -run-pass=postrapseudos -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name:            copy_gr128_to_fp128__r0q_to_f0q
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $r0q
+    ; CHECK-LABEL: name: copy_gr128_to_fp128__r0q_to_f0q
+    ; CHECK: liveins: $r0q
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $f0d = LDGR $r0d, implicit-def $f0q
+    ; CHECK-NEXT: $f2d = LDGR $r1d
+    ; CHECK-NEXT: Return implicit $f0q
+    $f0q = COPY $r0q
+    Return implicit $f0q
+...
+
+---
+name:            copy_gr128_to_fp128__r0q_to_f0q_killed
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $r0q
+    ; CHECK-LABEL: name: copy_gr128_to_fp128__r0q_to_f0q_killed
+    ; CHECK: liveins: $r0q
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $f0d = LDGR $r0d, implicit-def $f0q
+    ; CHECK-NEXT: $f2d = LDGR killed $r1d
+    ; CHECK-NEXT: Return implicit $f0q
+    $f0q = COPY killed $r0q
+    Return implicit $f0q
+...
+
+---
+name:            copy_gr128_to_fp128__r0q_to_f0q_undef
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $r0q
+    ; CHECK-LABEL: name: copy_gr128_to_fp128__r0q_to_f0q_undef
+    ; CHECK: liveins: $r0q
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $f0q = KILL undef $r0q
+    ; CHECK-NEXT: Return implicit $f0q
+    $f0q = COPY undef $r0q
+    Return implicit $f0q
+...
diff --git a/llvm/test/CodeGen/SystemZ/copy-phys-reg-gr128-to-vr128.mir b/llvm/test/CodeGen/SystemZ/copy-phys-reg-gr128-to-vr128.mir
index a2a07ac5c7f5..36580fed3baa 100644
--- a/llvm/test/CodeGen/SystemZ/copy-phys-reg-gr128-to-vr128.mir
+++ b/llvm/test/CodeGen/SystemZ/copy-phys-reg-gr128-to-vr128.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -mtriple=s390x-ibm-linux -mcpu=z13 -run-pass=postrapseudos -o - %s | FileCheck %s
+# RUN: llc -mtriple=s390x-ibm-linux -mcpu=z13 -run-pass=postrapseudos -verify-machineinstrs -o - %s | FileCheck %s
 
 ---
 name:            copy_gr128_to_vr128__r0q_to_v0
@@ -45,34 +45,3 @@ body:             |
     $v0 = COPY undef $r0q
     Return implicit $v0
 ...
-
----
-name:            copy_gr128_to_vr128__r0q_to_v0_subreg0
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    liveins: $r0d
-    ; CHECK-LABEL: name: copy_gr128_to_vr128__r0q_to_v0_subreg0
-    ; CHECK: liveins: $r0d
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: $v0 = VLVGP $r0d, $r1d
-    ; CHECK-NEXT: Return implicit $v0
-    $v0 = COPY $r0q
-    Return implicit $v0
-...
-
----
-name:            copy_gr128_to_vr128__r0q_to_v0_subreg1
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    liveins: $r1d
-    ; CHECK-LABEL: name: copy_gr128_to_vr128__r0q_to_v0_subreg1
-    ; CHECK: liveins: $r1d
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: $v0 = VLVGP $r0d, $r1d
-    ; CHECK-NEXT: Return implicit $v0
-    $v0 = COPY $r0q
-    Return implicit $v0
-...
-
diff --git a/llvm/test/CodeGen/SystemZ/fold-copy-vector-immediate.mir b/llvm/test/CodeGen/SystemZ/fold-copy-vector-immediate.mir
new file mode 100644
index 000000000000..8b86631c7797
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fold-copy-vector-immediate.mir
@@ -0,0 +1,206 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 -run-pass=peephole-opt -o - %s | FileCheck %s
+
+---
+name:            fold_vgbm_0_copyvr128_to_gr128_virtreg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+      liveins: $r2d
+    ; CHECK-LABEL: name: fold_vgbm_0_copyvr128_to_gr128_virtreg
+    ; CHECK: liveins: $r2d
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64bit = COPY $r2d
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:addr64bit = COPY [[COPY]]
+    ; CHECK-NEXT: [[LGHI:%[0-9]+]]:gr64bit = LGHI 0
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:gr128bit = REG_SEQUENCE [[LGHI]], %subreg.subreg_h64, [[LGHI]], %subreg.subreg_l64
+    ; CHECK-NEXT: $r0q = COPY [[REG_SEQUENCE]]
+    ; CHECK-NEXT: Return implicit $r0q
+    %0:gr64bit = COPY $r2d
+    %1:addr64bit = COPY %0
+    %2:vr128bit = VGBM 0
+    %3:gr128bit = COPY %2
+    $r0q = COPY %3
+    Return implicit $r0q
+...
+
+---
+name:            fold_vgbm_0_copyvr128_to_gr128_virtreg_dbg_use
+tracksRegLiveness: true
+body:             |
+  bb.0:
+      liveins: $r2d
+    ; CHECK-LABEL: name: fold_vgbm_0_copyvr128_to_gr128_virtreg_dbg_use
+    ; CHECK: liveins: $r2d
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64bit = COPY $r2d
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:addr64bit = COPY [[COPY]]
+    ; CHECK-NEXT: [[LGHI:%[0-9]+]]:gr64bit = LGHI 0
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:gr128bit = REG_SEQUENCE [[LGHI]], %subreg.subreg_h64, [[LGHI]], %subreg.subreg_l64
+    ; CHECK-NEXT: DBG_VALUE %2:vr128bit
+    ; CHECK-NEXT: $r0q = COPY [[REG_SEQUENCE]]
+    ; CHECK-NEXT: Return implicit $r0q
+    %0:gr64bit = COPY $r2d
+    %1:addr64bit = COPY %0
+    %2:vr128bit = VGBM 0
+    %3:gr128bit = COPY %2
+    DBG_VALUE %2
+    $r0q = COPY %3
+    Return implicit $r0q
+...
+
+---
+name:            fold_vgbm_0_copyvr128_to_gr128_virtreg_multi_use
+tracksRegLiveness: true
+body:             |
+  bb.0:
+      liveins: $r2d
+    ; CHECK-LABEL: name: fold_vgbm_0_copyvr128_to_gr128_virtreg_multi_use
+    ; CHECK: liveins: $r2d
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64bit = COPY $r2d
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:addr64bit = COPY [[COPY]]
+    ; CHECK-NEXT: [[VGBM:%[0-9]+]]:vr128bit = VGBM 0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr128bit = COPY [[VGBM]]
+    ; CHECK-NEXT: $r0q = COPY [[COPY2]]
+    ; CHECK-NEXT: $r2q = COPY [[COPY2]]
+    ; CHECK-NEXT: Return implicit $r0q, implicit $r2q
+    %0:gr64bit = COPY $r2d
+    %1:addr64bit = COPY %0
+    %2:vr128bit = VGBM 0
+    %3:gr128bit = COPY %2
+    %4:gr128bit = COPY %2
+    $r0q = COPY %3
+    $r2q = COPY %4
+    Return implicit $r0q, implicit $r2q
+...
+
+---
+name:            fold_vgbm_0_copyvr128_to_gr128_physreg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+      liveins: $r2d
+    ; CHECK-LABEL: name: fold_vgbm_0_copyvr128_to_gr128_physreg
+    ; CHECK: liveins: $r2d
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64bit = COPY $r2d
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:addr64bit = COPY [[COPY]]
+    ; CHECK-NEXT: [[VGBM:%[0-9]+]]:vr128bit = VGBM 0
+    ; CHECK-NEXT: $r0q = COPY [[VGBM]]
+    ; CHECK-NEXT: Return implicit $r0q
+    %0:gr64bit = COPY $r2d
+    %1:addr64bit = COPY %0
+    %2:vr128bit = VGBM 0
+    $r0q = COPY %2
+    Return implicit $r0q
+...
+
+---
+name:            no_fold_vgbm_0_copyvr128_to_vr128_virtreg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+      liveins: $r2d
+    ; CHECK-LABEL: name: no_fold_vgbm_0_copyvr128_to_vr128_virtreg
+    ; CHECK: liveins: $r2d
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64bit = COPY $r2d
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:addr64bit = COPY [[COPY]]
+    ; CHECK-NEXT: [[VGBM:%[0-9]+]]:vr128bit = VGBM 0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr128bit = COPY [[VGBM]]
+    ; CHECK-NEXT: $v0 = COPY [[COPY2]]
+    ; CHECK-NEXT: Return implicit $v0
+    %0:gr64bit = COPY $r2d
+    %1:addr64bit = COPY %0
+    %2:vr128bit = VGBM 0
+    %3:vr128bit = COPY %2
+    $v0 = COPY %3
+    Return implicit $v0
+...
+
+---
+name:            no_fold_vgbm_0_copyvr128_to_vr128_physreg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+      liveins: $r2d
+    ; CHECK-LABEL: name: no_fold_vgbm_0_copyvr128_to_vr128_physreg
+    ; CHECK: liveins: $r2d
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64bit = COPY $r2d
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:addr64bit = COPY [[COPY]]
+    ; CHECK-NEXT: [[VGBM:%[0-9]+]]:vr128bit = VGBM 0
+    ; CHECK-NEXT: $v0 = COPY [[VGBM]]
+    ; CHECK-NEXT: Return implicit $v0
+    %0:gr64bit = COPY $r2d
+    %1:addr64bit = COPY %0
+    %2:vr128bit = VGBM 0
+    $v0 = COPY %2
+    Return implicit $v0
+...
+
+---
+name:            fold_vgbm_1_copyvr128_to_gr128_virtreg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+      liveins: $r2d
+    ; CHECK-LABEL: name: fold_vgbm_1_copyvr128_to_gr128_virtreg
+    ; CHECK: liveins: $r2d
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64bit = COPY $r2d
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:addr64bit = COPY [[COPY]]
+    ; CHECK-NEXT: [[VGBM:%[0-9]+]]:vr128bit = VGBM 1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr128bit = COPY [[VGBM]]
+    ; CHECK-NEXT: $r0q = COPY [[COPY2]]
+    ; CHECK-NEXT: Return implicit $r0q
+    %0:gr64bit = COPY $r2d
+    %1:addr64bit = COPY %0
+    %2:vr128bit = VGBM 1
+    %3:gr128bit = COPY %2
+    $r0q = COPY %3
+    Return implicit $r0q
+...
+
+---
+name:            no_fold_vgbm_0_noncopy_use
+tracksRegLiveness: true
+body:             |
+  bb.0:
+      liveins: $r2d
+    ; CHECK-LABEL: name: no_fold_vgbm_0_noncopy_use
+    ; CHECK: liveins: $r2d
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64bit = COPY $r2d
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:addr64bit = COPY [[COPY]]
+    ; CHECK-NEXT: [[VGBM:%[0-9]+]]:vr128bit = VGBM 0
+    ; CHECK-NEXT: Return implicit [[VGBM]]
+    %0:gr64bit = COPY $r2d
+    %1:addr64bit = COPY %0
+    %2:vr128bit = VGBM 0
+    Return implicit %2
+...
+
+---
+name:            fold_vgbm_0_copyvr128_to_gr64_subreg_h64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+      liveins: $r2d
+    ; CHECK-LABEL: name: fold_vgbm_0_copyvr128_to_gr64_subreg_h64
+    ; CHECK: liveins: $r2d
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64bit = COPY $r2d
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:addr64bit = COPY [[COPY]]
+    ; CHECK-NEXT: [[VGBM:%[0-9]+]]:vr128bit = VGBM 0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64bit = COPY [[VGBM]].subreg_h64
+    ; CHECK-NEXT: $r0d = COPY [[COPY2]]
+    ; CHECK-NEXT: Return implicit $r0d
+    %0:gr64bit = COPY $r2d
+    %1:addr64bit = COPY %0
+    %2:vr128bit = VGBM 0
+    %3:gr64bit = COPY %2.subreg_h64
+    $r0d = COPY %3
+    Return implicit $r0d
+...
diff --git a/llvm/test/CodeGen/SystemZ/frame-26.mir b/llvm/test/CodeGen/SystemZ/frame-26.mir
index d37f565bdea7..becd176cfb1b 100644
--- a/llvm/test/CodeGen/SystemZ/frame-26.mir
+++ b/llvm/test/CodeGen/SystemZ/frame-26.mir
@@ -1,6 +1,5 @@
 # RUN: llc -mtriple=s390x-linux-gnu -start-before=prologepilog %s -o - -print-after=prologepilog \
 # RUN:  -verify-machineinstrs 2>&1 | FileCheck %s
-# REQUIRES: asserts
 #
 # Test that R6 when used for an argument is modelled as being live throughout
 # the function when not saved in the prologue..
@@ -15,21 +14,21 @@
 
 
 --- |
-  
+
   @g_181 = external dso_local global i32, align 4
   @g_1390 = external dso_local constant ptr, align 8
-  
+
   define internal i8 @fun0(i8 %arg, i8 %arg1, i32 %arg2, i8 %arg3, ptr %arg4, float %F0, float %F1) #0 {
     ret i8 0
   }
-  
+
   ; Same function but in a single block which will make the verifier complain
   ; if R6 is killed by the original store before the point where the
   ; RegScavenger inserts its (killing) store of R6.
   define internal i8 @fun1(i8 %arg, i8 %arg1, i32 %arg2, i8 %arg3, ptr %arg4) #0 {
     ret i8 0
   }
-  
+
   attributes #0 = { "frame-pointer"="all" }
 
 ...
@@ -74,21 +73,21 @@ machineFunctionInfo: {}
 body:             |
   bb.0:
     liveins: $f0s, $f2s, $r6d
-  
+
     STG killed renamable $r6d, undef renamable $r1d, 0, $noreg :: (store (s64) into `ptr undef`)
     renamable $r0d = LARL @g_181
     nofpexcept CEBR renamable $f0s, renamable $f2s, implicit-def $cc, implicit $fpc
     STG renamable $r0d, undef renamable $r1d, 0, $noreg :: (store (s64) into `ptr undef`)
     BRC 15, 4, %bb.2, implicit killed $cc
-  
+
   bb.1:
     liveins: $f2s, $r0d
-  
+
     renamable $f0s = COPY killed renamable $f2s
-  
+
   bb.2:
     liveins: $f0s, $r0d
-  
+
     STE killed renamable $f0s, undef renamable $r1d, 0, $noreg :: (volatile store (s32) into `ptr undef`)
     renamable $r1d = nuw LA %stack.0, 16, $noreg
     renamable $r2d = nuw LA %stack.0, 24, $noreg
@@ -161,7 +160,7 @@ machineFunctionInfo: {}
 body:             |
   bb.0:
     liveins: $r6d
-  
+
     STG killed renamable $r6d, undef renamable $r1d, 0, $noreg :: (store (s64) into `ptr undef`)
     renamable $r0d = LARL @g_181
     STG renamable $r0d, undef renamable $r1d, 0, $noreg :: (store (s64) into `ptr undef`)
diff --git a/llvm/test/CodeGen/SystemZ/frame-28.mir b/llvm/test/CodeGen/SystemZ/frame-28.mir
index 254b8a2cf246..353fe3dec9a3 100644
--- a/llvm/test/CodeGen/SystemZ/frame-28.mir
+++ b/llvm/test/CodeGen/SystemZ/frame-28.mir
@@ -1,6 +1,5 @@
 # RUN: llc -mtriple=s390x-linux-gnu -start-before=prologepilog %s -o - -mcpu=z14 \
 # RUN:   -verify-machineinstrs 2>&1 | FileCheck %s
-# REQUIRES: asserts
 #
 # Test that redundant frame addressing anchor points are removed by
 # MachineLateInstrsCleanup.
@@ -262,16 +261,16 @@ machineFunctionInfo: {}
 body:             |
   bb.0:
     successors: %bb.2(0x30000000), %bb.1(0x50000000)
-  
+
     renamable $r1d = LGRL @ptr :: (load (s64) from got)
     CGHSI killed renamable $r1d, 0, 0, implicit-def $cc :: (volatile dereferenceable load (s64) from @ptr)
     BRC 14, 8, %bb.2, implicit killed $cc
     J %bb.1
-  
+
   bb.1:
     renamable $r1d = LGRL @ptr :: (load (s64) from got)
     MVGHI killed renamable $r1d, 0, 0
-  
+
   bb.2:
     Return
 
diff --git a/llvm/test/CodeGen/SystemZ/memcmp-03.ll b/llvm/test/CodeGen/SystemZ/memcmp-03.ll
index f11e9f08f57d..950e10a99993 100644
--- a/llvm/test/CodeGen/SystemZ/memcmp-03.ll
+++ b/llvm/test/CodeGen/SystemZ/memcmp-03.ll
@@ -1,7 +1,6 @@
 ; Test memcmp with 0 size.
 
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; REQUIRES: asserts
 
 declare i32 @memcmp(ptr nocapture, ptr nocapture, i64)
 
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index 5a629567d070..8a6a30318ae5 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -13,114 +13,110 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
 define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-LABEL: m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    stmg %r12, %r15, 96(%r15)
+; CHECK-NEXT:    stmg %r13, %r15, 104(%r15)
 ; CHECK-NEXT:    aghi %r15, -168
-; CHECK-NEXT:    llhrl %r2, f+4
-; CHECK-NEXT:    sll %r2, 8
-; CHECK-NEXT:    larl %r1, f
-; CHECK-NEXT:    ic %r2, 6(%r1)
-; CHECK-NEXT:    larl %r1, e
-; CHECK-NEXT:    lb %r0, 3(%r1)
-; CHECK-NEXT:    clfi %r2, 128
+; CHECK-NEXT:    lhrl %r1, f+4
+; CHECK-NEXT:    sll %r1, 8
+; CHECK-NEXT:    larl %r2, f
+; CHECK-NEXT:    ic %r1, 6(%r2)
+; CHECK-NEXT:    larl %r2, e
+; CHECK-NEXT:    lb %r0, 3(%r2)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r1
+; CHECK-NEXT:    vlvgp %v1, %r1, %r0
+; CHECK-NEXT:    vlvgf %v1, %r1, 0
+; CHECK-NEXT:    vlvgf %v1, %r1, 2
+; CHECK-NEXT:    vlvgp %v2, %r1, %r1
+; CHECK-NEXT:    # kill: def $r1l killed $r1l killed $r1d
+; CHECK-NEXT:    nilh %r1, 255
+; CHECK-NEXT:    chi %r1, 128
 ; CHECK-NEXT:    ipm %r1
 ; CHECK-NEXT:    risbg %r1, %r1, 63, 191, 36
-; CHECK-NEXT:    vlvgp %v1, %r2, %r0
-; CHECK-NEXT:    vlvgf %v1, %r2, 0
-; CHECK-NEXT:    vlvgf %v1, %r2, 2
-; CHECK-NEXT:    vlvgp %v0, %r0, %r2
-; CHECK-NEXT:    vlvgp %v2, %r2, %r2
-; CHECK-NEXT:    # kill: def $r2l killed $r2l killed $r2d
-; CHECK-NEXT:    nilh %r2, 255
-; CHECK-NEXT:    chi %r2, 128
-; CHECK-NEXT:    ipm %r2
-; CHECK-NEXT:    risbg %r2, %r2, 63, 191, 36
 ; CHECK-NEXT:    vlvgf %v0, %r0, 0
 ; CHECK-NEXT:    vlvgf %v0, %r0, 2
-; CHECK-NEXT:    vrepf %v2, %v2, 1
 ; CHECK-NEXT:    vgbm %v3, 30583
 ; CHECK-NEXT:    vn %v0, %v0, %v3
 ; CHECK-NEXT:    vn %v1, %v1, %v3
+; CHECK-NEXT:    vrepf %v2, %v2, 1
 ; CHECK-NEXT:    vn %v2, %v2, %v3
 ; CHECK-NEXT:    vrepif %v3, 127
 ; CHECK-NEXT:    vchlf %v1, %v1, %v3
-; CHECK-NEXT:    vlgvf %r12, %v1, 0
+; CHECK-NEXT:    vlgvf %r13, %v1, 0
 ; CHECK-NEXT:    vchlf %v2, %v2, %v3
-; CHECK-NEXT:    vlgvf %r4, %v2, 1
-; CHECK-NEXT:    nilf %r4, 1
-; CHECK-NEXT:    vlgvf %r5, %v2, 0
-; CHECK-NEXT:    risbg %r3, %r5, 48, 176, 15
-; CHECK-NEXT:    rosbg %r3, %r4, 32, 49, 14
-; CHECK-NEXT:    vlgvf %r14, %v2, 2
+; CHECK-NEXT:    vlgvf %r3, %v2, 1
+; CHECK-NEXT:    nilf %r3, 1
+; CHECK-NEXT:    vlgvf %r4, %v2, 0
+; CHECK-NEXT:    risbg %r2, %r4, 48, 176, 15
+; CHECK-NEXT:    rosbg %r2, %r3, 32, 49, 14
+; CHECK-NEXT:    vlgvf %r5, %v2, 2
+; CHECK-NEXT:    nilf %r5, 1
+; CHECK-NEXT:    rosbg %r2, %r5, 32, 50, 13
+; CHECK-NEXT:    vlgvf %r14, %v2, 3
 ; CHECK-NEXT:    nilf %r14, 1
-; CHECK-NEXT:    rosbg %r3, %r14, 32, 50, 13
-; CHECK-NEXT:    vlgvf %r13, %v2, 3
-; CHECK-NEXT:    nilf %r13, 1
-; CHECK-NEXT:    rosbg %r3, %r13, 32, 51, 12
-; CHECK-NEXT:    rosbg %r3, %r12, 52, 52, 11
-; CHECK-NEXT:    vlgvf %r12, %v1, 1
-; CHECK-NEXT:    rosbg %r3, %r12, 53, 53, 10
-; CHECK-NEXT:    vlgvf %r12, %v1, 2
-; CHECK-NEXT:    rosbg %r3, %r12, 54, 54, 9
-; CHECK-NEXT:    vlgvf %r12, %v1, 3
-; CHECK-NEXT:    rosbg %r3, %r12, 55, 55, 8
+; CHECK-NEXT:    rosbg %r2, %r14, 32, 51, 12
+; CHECK-NEXT:    rosbg %r2, %r13, 52, 52, 11
+; CHECK-NEXT:    vlgvf %r13, %v1, 1
+; CHECK-NEXT:    rosbg %r2, %r13, 53, 53, 10
+; CHECK-NEXT:    vlgvf %r13, %v1, 2
+; CHECK-NEXT:    rosbg %r2, %r13, 54, 54, 9
+; CHECK-NEXT:    vlgvf %r13, %v1, 3
+; CHECK-NEXT:    rosbg %r2, %r13, 55, 55, 8
 ; CHECK-NEXT:    vchlf %v0, %v0, %v3
-; CHECK-NEXT:    vlgvf %r12, %v0, 0
-; CHECK-NEXT:    rosbg %r3, %r12, 56, 56, 7
-; CHECK-NEXT:    vlgvf %r12, %v0, 1
-; CHECK-NEXT:    rosbg %r3, %r12, 57, 57, 6
-; CHECK-NEXT:    vlgvf %r12, %v0, 2
-; CHECK-NEXT:    rosbg %r3, %r12, 58, 58, 5
-; CHECK-NEXT:    vlgvf %r12, %v0, 3
-; CHECK-NEXT:    rosbg %r3, %r12, 59, 59, 4
-; CHECK-NEXT:    nilf %r5, 1
-; CHECK-NEXT:    rosbg %r3, %r5, 32, 60, 3
-; CHECK-NEXT:    rosbg %r3, %r4, 32, 61, 2
-; CHECK-NEXT:    rosbg %r3, %r14, 32, 62, 1
-; CHECK-NEXT:    or %r3, %r13
-; CHECK-NEXT:    vlgvb %r5, %v0, 1
-; CHECK-NEXT:    vlgvb %r4, %v0, 0
-; CHECK-NEXT:    risbg %r4, %r4, 48, 176, 15
-; CHECK-NEXT:    rosbg %r4, %r5, 49, 49, 14
-; CHECK-NEXT:    vlgvb %r5, %v0, 2
-; CHECK-NEXT:    rosbg %r4, %r5, 50, 50, 13
-; CHECK-NEXT:    vlgvb %r5, %v0, 3
-; CHECK-NEXT:    rosbg %r4, %r5, 51, 51, 12
-; CHECK-NEXT:    vlgvb %r5, %v0, 4
-; CHECK-NEXT:    rosbg %r4, %r5, 52, 52, 11
-; CHECK-NEXT:    vlgvb %r5, %v0, 5
-; CHECK-NEXT:    rosbg %r4, %r5, 53, 53, 10
-; CHECK-NEXT:    vlgvb %r5, %v0, 6
-; CHECK-NEXT:    rosbg %r4, %r5, 54, 54, 9
-; CHECK-NEXT:    vlgvb %r5, %v0, 7
-; CHECK-NEXT:    rosbg %r4, %r5, 55, 55, 8
-; CHECK-NEXT:    vlgvb %r5, %v0, 8
-; CHECK-NEXT:    rosbg %r4, %r5, 56, 56, 7
-; CHECK-NEXT:    vlgvb %r5, %v0, 9
-; CHECK-NEXT:    rosbg %r4, %r5, 57, 57, 6
-; CHECK-NEXT:    vlgvb %r5, %v0, 10
-; CHECK-NEXT:    rosbg %r4, %r5, 58, 58, 5
-; CHECK-NEXT:    vlgvb %r5, %v0, 11
-; CHECK-NEXT:    rosbg %r4, %r5, 59, 59, 4
-; CHECK-NEXT:    vlgvb %r5, %v0, 12
-; CHECK-NEXT:    rosbg %r4, %r5, 60, 60, 3
-; CHECK-NEXT:    vlgvb %r5, %v0, 13
-; CHECK-NEXT:    rosbg %r4, %r5, 61, 61, 2
-; CHECK-NEXT:    vlgvb %r5, %v0, 14
-; CHECK-NEXT:    rosbg %r4, %r5, 62, 62, 1
-; CHECK-NEXT:    vlgvb %r5, %v0, 15
-; CHECK-NEXT:    rosbg %r4, %r5, 63, 63, 0
-; CHECK-NEXT:    xilf %r4, 4294967295
-; CHECK-NEXT:    or %r4, %r3
-; CHECK-NEXT:    tmll %r4, 65535
-; CHECK-NEXT:    ipm %r3
-; CHECK-NEXT:    afi %r3, -268435456
-; CHECK-NEXT:    srl %r3, 31
+; CHECK-NEXT:    vlgvf %r13, %v0, 0
+; CHECK-NEXT:    rosbg %r2, %r13, 56, 56, 7
+; CHECK-NEXT:    vlgvf %r13, %v0, 1
+; CHECK-NEXT:    rosbg %r2, %r13, 57, 57, 6
+; CHECK-NEXT:    vlgvf %r13, %v0, 2
+; CHECK-NEXT:    rosbg %r2, %r13, 58, 58, 5
+; CHECK-NEXT:    vlgvf %r13, %v0, 3
+; CHECK-NEXT:    rosbg %r2, %r13, 59, 59, 4
+; CHECK-NEXT:    nilf %r4, 1
+; CHECK-NEXT:    rosbg %r2, %r4, 32, 60, 3
+; CHECK-NEXT:    rosbg %r2, %r3, 32, 61, 2
+; CHECK-NEXT:    rosbg %r2, %r5, 32, 62, 1
+; CHECK-NEXT:    or %r2, %r14
+; CHECK-NEXT:    vlgvb %r4, %v0, 1
+; CHECK-NEXT:    vlgvb %r3, %v0, 0
+; CHECK-NEXT:    risbg %r3, %r3, 48, 176, 15
+; CHECK-NEXT:    rosbg %r3, %r4, 49, 49, 14
+; CHECK-NEXT:    vlgvb %r4, %v0, 2
+; CHECK-NEXT:    rosbg %r3, %r4, 50, 50, 13
+; CHECK-NEXT:    vlgvb %r4, %v0, 3
+; CHECK-NEXT:    rosbg %r3, %r4, 51, 51, 12
+; CHECK-NEXT:    vlgvb %r4, %v0, 4
+; CHECK-NEXT:    rosbg %r3, %r4, 52, 52, 11
+; CHECK-NEXT:    vlgvb %r4, %v0, 5
+; CHECK-NEXT:    rosbg %r3, %r4, 53, 53, 10
+; CHECK-NEXT:    vlgvb %r4, %v0, 6
+; CHECK-NEXT:    rosbg %r3, %r4, 54, 54, 9
+; CHECK-NEXT:    vlgvb %r4, %v0, 7
+; CHECK-NEXT:    rosbg %r3, %r4, 55, 55, 8
+; CHECK-NEXT:    vlgvb %r4, %v0, 8
+; CHECK-NEXT:    rosbg %r3, %r4, 56, 56, 7
+; CHECK-NEXT:    vlgvb %r4, %v0, 9
+; CHECK-NEXT:    rosbg %r3, %r4, 57, 57, 6
+; CHECK-NEXT:    vlgvb %r4, %v0, 10
+; CHECK-NEXT:    rosbg %r3, %r4, 58, 58, 5
+; CHECK-NEXT:    vlgvb %r4, %v0, 11
+; CHECK-NEXT:    rosbg %r3, %r4, 59, 59, 4
+; CHECK-NEXT:    vlgvb %r4, %v0, 12
+; CHECK-NEXT:    rosbg %r3, %r4, 60, 60, 3
+; CHECK-NEXT:    vlgvb %r4, %v0, 13
+; CHECK-NEXT:    rosbg %r3, %r4, 61, 61, 2
+; CHECK-NEXT:    vlgvb %r4, %v0, 14
+; CHECK-NEXT:    rosbg %r3, %r4, 62, 62, 1
+; CHECK-NEXT:    vlgvb %r4, %v0, 15
+; CHECK-NEXT:    rosbg %r3, %r4, 63, 63, 0
+; CHECK-NEXT:    xilf %r3, 4294967295
+; CHECK-NEXT:    or %r3, %r2
+; CHECK-NEXT:    tmll %r3, 65535
+; CHECK-NEXT:    ipm %r2
+; CHECK-NEXT:    afi %r2, -268435456
+; CHECK-NEXT:    srl %r2, 31
 ; CHECK-NEXT:    nr %r2, %r1
-; CHECK-NEXT:    nr %r2, %r3
 ; CHECK-NEXT:    nr %r2, %r0
 ; CHECK-NEXT:    larl %r1, g
 ; CHECK-NEXT:    stc %r2, 0(%r1)
-; CHECK-NEXT:    lmg %r12, %r15, 264(%r15)
+; CHECK-NEXT:    lmg %r13, %r15, 272(%r15)
 ; CHECK-NEXT:    br %r14
 entry:
   %n = alloca i32, align 4
diff --git a/llvm/test/CodeGen/SystemZ/zos-ppa2.ll b/llvm/test/CodeGen/SystemZ/zos-ppa2.ll
index 189b5a3757ee..07025091fb24 100644
--- a/llvm/test/CodeGen/SystemZ/zos-ppa2.ll
+++ b/llvm/test/CodeGen/SystemZ/zos-ppa2.ll
@@ -1,5 +1,4 @@
 ; RUN: llc -mtriple s390x-ibm-zos -mcpu=z15 -asm-verbose=true < %s | FileCheck %s
-; REQUIRES: systemz-registered-target
 
 ; CHECK:    .section    ".ppa2"
 ; CHECK: L#PPA2:
diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll
new file mode 100644
index 000000000000..89e9c42637c1
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s --mtriple=wasm32-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision | FileCheck %s
+; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision | FileCheck %s
+
+declare float @llvm.wasm.loadf32.f16(ptr)
+declare void @llvm.wasm.storef16.f32(float, ptr)
+
+; CHECK-LABEL: ldf16_32:
+; CHECK:      f32.load_f16 $push[[NUM0:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM0]]{{$}}
+define float @ldf16_32(ptr %p) {
+  %v = call float @llvm.wasm.loadf16.f32(ptr %p)
+  ret float %v
+}
+
+; CHECK-LABEL: stf16_32:
+; CHECK:       f32.store_f16 0($1), $0
+; CHECK-NEXT:  return
+define void @stf16_32(float %v, ptr %p) {
+  tail call void @llvm.wasm.storef16.f32(float %v, ptr %p)
+  ret void
+}
diff --git a/llvm/test/CodeGen/WebAssembly/offset.ll b/llvm/test/CodeGen/WebAssembly/offset.ll
index 0d9fcf05ab1b..65de341780e3 100644
--- a/llvm/test/CodeGen/WebAssembly/offset.ll
+++ b/llvm/test/CodeGen/WebAssembly/offset.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers -disable-wasm-fallthrough-return-opt -mattr=+half-precision | FileCheck %s
 
 ; Test constant load and store address offsets.
 
@@ -666,3 +666,56 @@ define {i32,i32,i32,i32} @aggregate_return() {
 define {i64,i32,i16,i8} @aggregate_return_without_merge() {
   ret {i64,i32,i16,i8} zeroinitializer
 }
+
+;===----------------------------------------------------------------------------
+; Loads: Half Precision
+;===----------------------------------------------------------------------------
+
+; Fold an offset into a zero-extending load.
+
+; CHECK-LABEL: load_f16_f32_with_folded_offset:
+; CHECK: f32.load_f16 $push0=, 24($0){{$}}
+define float @load_f16_f32_with_folded_offset(ptr %p) {
+  %q = ptrtoint ptr %p to i32
+  %r = add nuw i32 %q, 24
+  %s = inttoptr i32 %r to ptr
+  %t = call float @llvm.wasm.loadf16.f32(ptr %s)
+  ret float %t
+}
+
+; Fold a gep offset into a zero-extending load.
+
+; CHECK-LABEL: load_f16_f32_with_folded_gep_offset:
+; CHECK: f32.load_f16 $push0=, 24($0){{$}}
+define float @load_f16_f32_with_folded_gep_offset(ptr %p) {
+  %s = getelementptr inbounds i8, ptr %p, i32 24
+  %t = call float @llvm.wasm.loadf16.f32(ptr %s)
+  ret float %t
+}
+
+;===----------------------------------------------------------------------------
+; Stores: Half Precision
+;===----------------------------------------------------------------------------
+
+; Basic store.
+
+; CHECK-LABEL: store_f16_f32_no_offset:
+; CHECK-NEXT: .functype store_f16_f32_no_offset (i32, f32) -> (){{$}}
+; CHECK-NEXT: f32.store_f16 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_f16_f32_no_offset(ptr %p, float %v) {
+  call void @llvm.wasm.storef16.f32(float %v, ptr %p)
+  ret void
+}
+
+; Storing to a fixed address.
+
+; CHECK-LABEL: store_f16_f32_to_numeric_address:
+; CHECK:      i32.const $push1=, 0{{$}}
+; CHECK-NEXT: f32.const $push0=, 0x0p0{{$}}
+; CHECK-NEXT: f32.store_f16 42($pop1), $pop0{{$}}
+define void @store_f16_f32_to_numeric_address() {
+  %s = inttoptr i32 42 to ptr
+  call void @llvm.wasm.storef16.f32(float 0.0, ptr %s)
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll b/llvm/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
index 01e169a09a26..6be9281dc923 100644
--- a/llvm/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
+++ b/llvm/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
@@ -1,36 +1,58 @@
-; Check that eh_return & unwind_init were properly lowered
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mcpu=corei7 < %s -verify-machineinstrs | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
 target triple = "i386-pc-linux"
 
-; CHECK: test1
-; CHECK: pushl %ebp
-define ptr @test1(i32 %a, ptr %b)  {
+; Check that eh_return & unwind_init were properly lowered
+
+define ptr @test1(i32 %a, ptr %b) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movl 12(%ebp), %ecx
+; CHECK-NEXT:    movl 8(%ebp), %eax
+; CHECK-NEXT:    movl %ecx, 4(%ebp,%eax)
+; CHECK-NEXT:    leal 4(%ebp,%eax), %ecx
+; CHECK-NEXT:    addl $4, %esp
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    popl %edx
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    movl %ecx, %esp
+; CHECK-NEXT:    retl # eh_return, addr: %ecx
 entry:
   call void @llvm.eh.unwind.init()
   %foo   = alloca i32
   call void @llvm.eh.return.i32(i32 %a, ptr %b)
-; CHECK: movl 12(%ebp), %[[ECX:e..]]
-; CHECK: movl 8(%ebp), %[[EAX:e..]]
-; CHECK: movl %[[ECX]], 4(%ebp,%[[EAX]])
-; CHECK: leal 4(%ebp,%[[EAX]]), %[[ECX2:e..]]
-; CHECK: movl %[[ECX2]], %esp
-; CHECK: ret
   unreachable
 }
 
-; CHECK: test2
-; CHECK: pushl %ebp
-define ptr @test2(i32 %a, ptr %b)  {
+define ptr @test2(i32 %a, ptr %b) nounwind {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movl 12(%ebp), %ecx
+; CHECK-NEXT:    movl 8(%ebp), %eax
+; CHECK-NEXT:    movl %ecx, 4(%ebp,%eax)
+; CHECK-NEXT:    leal 4(%ebp,%eax), %ecx
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    movl %ecx, %esp
+; CHECK-NEXT:    retl # eh_return, addr: %ecx
 entry:
   call void @llvm.eh.return.i32(i32 %a, ptr %b)
-; CHECK: movl 12(%ebp), %[[ECX:e..]]
-; CHECK: movl 8(%ebp), %[[EAX:e..]]
-; CHECK: movl %[[ECX]], 4(%ebp,%[[EAX]])
-; CHECK: leal 4(%ebp,%[[EAX]]), %[[ECX2:e..]]
-; CHECK: movl %[[ECX2]], %esp
-; CHECK: ret
   unreachable
 }
 
diff --git a/llvm/test/CodeGen/X86/arithmetic_fence2.ll b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
index 6a854b58fc02..3c2ef21527f5 100644
--- a/llvm/test/CodeGen/X86/arithmetic_fence2.ll
+++ b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
@@ -157,6 +157,160 @@ define <8 x float> @f6(<8 x float> %a) {
   ret <8 x float> %3
 }
 
+define half @f7(half %a) nounwind {
+; X86-LABEL: f7:
+; X86:       # %bb.0:
+; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    retl
+;
+; X64-LABEL: f7:
+; X64:       # %bb.0:
+; X64-NEXT:    #ARITH_FENCE
+; X64-NEXT:    retq
+  %b = call half @llvm.arithmetic.fence.f16(half %a)
+  ret half %b
+}
+
+define bfloat @f8(bfloat %a) nounwind {
+; X86-LABEL: f8:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    pinsrw $0, %eax, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: f8:
+; X64:       # %bb.0:
+; X64-NEXT:    pextrw $0, %xmm0, %eax
+; X64-NEXT:    #ARITH_FENCE
+; X64-NEXT:    pinsrw $0, %eax, %xmm0
+; X64-NEXT:    retq
+  %b = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
+  ret bfloat %b
+}
+
+define <2 x half> @f9(<2 x half> %a) nounwind {
+; X86-LABEL: f9:
+; X86:       # %bb.0:
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrld $16, %xmm1
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT:    retl
+;
+; X64-LABEL: f9:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psrld $16, %xmm1
+; X64-NEXT:    #ARITH_FENCE
+; X64-NEXT:    #ARITH_FENCE
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    retq
+  %b = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> %a)
+  ret <2 x half> %b
+}
+
+define <3 x bfloat> @f10(<3 x bfloat> %a) nounwind {
+; X86-LABEL: f10:
+; X86:       # %bb.0:
+; X86-NEXT:    pextrw $0, %xmm0, %eax
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrld $16, %xmm1
+; X86-NEXT:    pextrw $0, %xmm1, %ecx
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    pextrw $0, %xmm0, %edx
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    pinsrw $0, %eax, %xmm0
+; X86-NEXT:    pinsrw $0, %ecx, %xmm1
+; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT:    pinsrw $0, %edx, %xmm1
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: f10:
+; X64:       # %bb.0:
+; X64-NEXT:    pextrw $0, %xmm0, %eax
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psrld $16, %xmm1
+; X64-NEXT:    pextrw $0, %xmm1, %ecx
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X64-NEXT:    pextrw $0, %xmm0, %edx
+; X64-NEXT:    #ARITH_FENCE
+; X64-NEXT:    #ARITH_FENCE
+; X64-NEXT:    #ARITH_FENCE
+; X64-NEXT:    pinsrw $0, %eax, %xmm0
+; X64-NEXT:    pinsrw $0, %ecx, %xmm1
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    pinsrw $0, %edx, %xmm1
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    retq
+  %b = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat> %a)
+  ret <3 x bfloat> %b
+}
+
+define <4 x bfloat> @f11(<4 x bfloat> %a) nounwind {
+; X86-LABEL: f11:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrlq $48, %xmm1
+; X86-NEXT:    pextrw $0, %xmm1, %eax
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; X86-NEXT:    pextrw $0, %xmm1, %edx
+; X86-NEXT:    pextrw $0, %xmm0, %ecx
+; X86-NEXT:    psrld $16, %xmm0
+; X86-NEXT:    pextrw $0, %xmm0, %esi
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    pinsrw $0, %eax, %xmm0
+; X86-NEXT:    pinsrw $0, %edx, %xmm1
+; X86-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    pinsrw $0, %ecx, %xmm0
+; X86-NEXT:    pinsrw $0, %esi, %xmm2
+; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: f11:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psrlq $48, %xmm1
+; X64-NEXT:    pextrw $0, %xmm1, %eax
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; X64-NEXT:    pextrw $0, %xmm1, %ecx
+; X64-NEXT:    pextrw $0, %xmm0, %edx
+; X64-NEXT:    psrld $16, %xmm0
+; X64-NEXT:    pextrw $0, %xmm0, %esi
+; X64-NEXT:    #ARITH_FENCE
+; X64-NEXT:    #ARITH_FENCE
+; X64-NEXT:    #ARITH_FENCE
+; X64-NEXT:    #ARITH_FENCE
+; X64-NEXT:    pinsrw $0, %eax, %xmm0
+; X64-NEXT:    pinsrw $0, %ecx, %xmm1
+; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT:    pinsrw $0, %edx, %xmm0
+; X64-NEXT:    pinsrw $0, %esi, %xmm2
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    retq
+  %b = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> %a)
+  ret <4 x bfloat> %b
+}
+
+declare half @llvm.arithmetic.fence.f16(half)
+declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
+declare <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half>)
+declare <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat>)
+declare <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat>)
 declare float @llvm.arithmetic.fence.f32(float)
 declare double @llvm.arithmetic.fence.f64(double)
 declare <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float>)
diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
index 14a71050e94a..6e0cfdd26a78 100644
--- a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
+++ b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
@@ -1,73 +1,196 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; Check that 64-bit division is bypassed correctly.
-; RUN: llc < %s -mattr=+idivq-to-divl -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-idivq-to-divl | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+idivq-to-divl | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64          | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2       | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3       | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4       | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; Intel
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=nehalem         | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge     | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=haswell         | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake         | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=alderlake       | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; AMD
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=barcelona       | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1          | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2          | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1          | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2          | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3          | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4          | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1          | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2          | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3          | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4          | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
 
 ; Additional tests for 64-bit divide bypass
 
-define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
-; CHECK-LABEL: Test_get_quotient:
+;
+; SDIV
+;
+
+define i64 @sdiv_quotient(i64 %a, i64 %b) nounwind {
+; FAST-DIVQ-LABEL: sdiv_quotient:
+; FAST-DIVQ:       # %bb.0:
+; FAST-DIVQ-NEXT:    movq %rdi, %rax
+; FAST-DIVQ-NEXT:    cqto
+; FAST-DIVQ-NEXT:    idivq %rsi
+; FAST-DIVQ-NEXT:    retq
+;
+; SLOW-DIVQ-LABEL: sdiv_quotient:
+; SLOW-DIVQ:       # %bb.0:
+; SLOW-DIVQ-DAG:     movq %rdi, %rax
+; SLOW-DIVQ-DAG:     movq %rdi, %rcx
+; SLOW-DIVQ-DAG:     orq %rsi, %rcx
+; SLOW-DIVQ-DAG:     shrq $32, %rcx
+; SLOW-DIVQ-NEXT:    je .LBB0_1
+; SLOW-DIVQ-NEXT:  # %bb.2:
+; SLOW-DIVQ-NEXT:    cqto
+; SLOW-DIVQ-NEXT:    idivq %rsi
+; SLOW-DIVQ-NEXT:    retq
+; SLOW-DIVQ-NEXT:  .LBB0_1:
+; SLOW-DIVQ-DAG:     # kill: def $eax killed $eax killed $rax
+; SLOW-DIVQ-DAG:     xorl %edx, %edx
+; SLOW-DIVQ-NEXT:    divl %esi
+; SLOW-DIVQ-NEXT:    # kill: def $eax killed $eax def $rax
+; SLOW-DIVQ-NEXT:    retq
+  %result = sdiv i64 %a, %b
+  ret i64 %result
+}
+
+define i64 @sdiv_quotient_optsize(i64 %a, i64 %b) nounwind optsize {
+; CHECK-LABEL: sdiv_quotient_optsize:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    movq %rdi, %rcx
-; CHECK-NEXT:    orq %rsi, %rcx
-; CHECK-NEXT:    shrq $32, %rcx
-; CHECK-NEXT:    je .LBB0_1
-; CHECK-NEXT:  # %bb.2:
 ; CHECK-NEXT:    cqto
 ; CHECK-NEXT:    idivq %rsi
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB0_1:
-; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divl %esi
-; CHECK-NEXT:    # kill: def $eax killed $eax def $rax
+  %result = sdiv i64 %a, %b
+  ret i64 %result
+}
+
+define i64 @sdiv_quotient_minsize(i64 %a, i64 %b) nounwind minsize {
+; CHECK-LABEL: sdiv_quotient_minsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    cqto
+; CHECK-NEXT:    idivq %rsi
 ; CHECK-NEXT:    retq
   %result = sdiv i64 %a, %b
   ret i64 %result
 }
 
-define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
-; CHECK-LABEL: Test_get_remainder:
+define i64 @sdiv_remainder(i64 %a, i64 %b) nounwind {
+; FAST-DIVQ-LABEL: sdiv_remainder:
+; FAST-DIVQ:       # %bb.0:
+; FAST-DIVQ-NEXT:    movq %rdi, %rax
+; FAST-DIVQ-NEXT:    cqto
+; FAST-DIVQ-NEXT:    idivq %rsi
+; FAST-DIVQ-NEXT:    movq %rdx, %rax
+; FAST-DIVQ-NEXT:    retq
+;
+; SLOW-DIVQ-LABEL: sdiv_remainder:
+; SLOW-DIVQ:       # %bb.0:
+; SLOW-DIVQ-DAG:     movq %rdi, %rax
+; SLOW-DIVQ-DAG:     movq %rdi, %rcx
+; SLOW-DIVQ-DAG:     orq %rsi, %rcx
+; SLOW-DIVQ-DAG:     shrq $32, %rcx
+; SLOW-DIVQ-NEXT:    je .LBB3_1
+; SLOW-DIVQ-NEXT:  # %bb.2:
+; SLOW-DIVQ-NEXT:    cqto
+; SLOW-DIVQ-NEXT:    idivq %rsi
+; SLOW-DIVQ-NEXT:    movq %rdx, %rax
+; SLOW-DIVQ-NEXT:    retq
+; SLOW-DIVQ-NEXT:  .LBB3_1:
+; SLOW-DIVQ-DAG:     # kill: def $eax killed $eax killed $rax
+; SLOW-DIVQ-DAG:     xorl %edx, %edx
+; SLOW-DIVQ-NEXT:    divl %esi
+; SLOW-DIVQ-NEXT:    movl %edx, %eax
+; SLOW-DIVQ-NEXT:    retq
+  %result = srem i64 %a, %b
+  ret i64 %result
+}
+
+define i64 @sdiv_remainder_optsize(i64 %a, i64 %b) nounwind optsize {
+; CHECK-LABEL: sdiv_remainder_optsize:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    movq %rdi, %rcx
-; CHECK-NEXT:    orq %rsi, %rcx
-; CHECK-NEXT:    shrq $32, %rcx
-; CHECK-NEXT:    je .LBB1_1
-; CHECK-NEXT:  # %bb.2:
 ; CHECK-NEXT:    cqto
 ; CHECK-NEXT:    idivq %rsi
 ; CHECK-NEXT:    movq %rdx, %rax
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB1_1:
-; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divl %esi
-; CHECK-NEXT:    movl %edx, %eax
+  %result = srem i64 %a, %b
+  ret i64 %result
+}
+
+define i64 @sdiv_remainder_minsize(i64 %a, i64 %b) nounwind minsize {
+; CHECK-LABEL: sdiv_remainder_minsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    cqto
+; CHECK-NEXT:    idivq %rsi
+; CHECK-NEXT:    movq %rdx, %rax
 ; CHECK-NEXT:    retq
   %result = srem i64 %a, %b
   ret i64 %result
 }
 
-define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
-; CHECK-LABEL: Test_get_quotient_and_remainder:
+define i64 @sdiv_quotient_and_remainder(i64 %a, i64 %b) nounwind {
+; FAST-DIVQ-LABEL: sdiv_quotient_and_remainder:
+; FAST-DIVQ:       # %bb.0:
+; FAST-DIVQ-NEXT:    movq %rdi, %rax
+; FAST-DIVQ-NEXT:    cqto
+; FAST-DIVQ-NEXT:    idivq %rsi
+; FAST-DIVQ-NEXT:    addq %rdx, %rax
+; FAST-DIVQ-NEXT:    retq
+;
+; SLOW-DIVQ-LABEL: sdiv_quotient_and_remainder:
+; SLOW-DIVQ:       # %bb.0:
+; SLOW-DIVQ-DAG:     movq %rdi, %rax
+; SLOW-DIVQ-DAG:     movq %rdi, %rcx
+; SLOW-DIVQ-DAG:     orq %rsi, %rcx
+; SLOW-DIVQ-DAG:     shrq $32, %rcx
+; SLOW-DIVQ-NEXT:    je .LBB6_1
+; SLOW-DIVQ-NEXT:  # %bb.2:
+; SLOW-DIVQ-NEXT:    cqto
+; SLOW-DIVQ-NEXT:    idivq %rsi
+; SLOW-DIVQ-NEXT:    addq %rdx, %rax
+; SLOW-DIVQ-NEXT:    retq
+; SLOW-DIVQ-NEXT:  .LBB6_1:
+; SLOW-DIVQ-DAG:     # kill: def $eax killed $eax killed $rax
+; SLOW-DIVQ-DAG:     xorl %edx, %edx
+; SLOW-DIVQ-NEXT:    divl %esi
+; SLOW-DIVQ-NEXT:    # kill: def $edx killed $edx def $rdx
+; SLOW-DIVQ-NEXT:    # kill: def $eax killed $eax def $rax
+; SLOW-DIVQ-NEXT:    addq %rdx, %rax
+; SLOW-DIVQ-NEXT:    retq
+  %resultdiv = sdiv i64 %a, %b
+  %resultrem = srem i64 %a, %b
+  %result = add i64 %resultdiv, %resultrem
+  ret i64 %result
+}
+
+define i64 @sdiv_quotient_and_remainder_optsize(i64 %a, i64 %b) nounwind optsize {
+; CHECK-LABEL: sdiv_quotient_and_remainder_optsize:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    movq %rdi, %rcx
-; CHECK-NEXT:    orq %rsi, %rcx
-; CHECK-NEXT:    shrq $32, %rcx
-; CHECK-NEXT:    je .LBB2_1
-; CHECK-NEXT:  # %bb.2:
 ; CHECK-NEXT:    cqto
 ; CHECK-NEXT:    idivq %rsi
 ; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB2_1:
-; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divl %esi
-; CHECK-NEXT:    # kill: def $edx killed $edx def $rdx
-; CHECK-NEXT:    # kill: def $eax killed $eax def $rax
+  %resultdiv = sdiv i64 %a, %b
+  %resultrem = srem i64 %a, %b
+  %result = add i64 %resultdiv, %resultrem
+  ret i64 %result
+}
+
+define i64 @sdiv_quotient_and_remainder_minsize(i64 %a, i64 %b) nounwind minsize {
+; CHECK-LABEL: sdiv_quotient_and_remainder_minsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    cqto
+; CHECK-NEXT:    idivq %rsi
 ; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    retq
   %resultdiv = sdiv i64 %a, %b
@@ -76,6 +199,179 @@ define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
   ret i64 %result
 }
 
+;
+; UDIV
+;
+
+define i64 @udiv_quotient(i64 %a, i64 %b) nounwind {
+; FAST-DIVQ-LABEL: udiv_quotient:
+; FAST-DIVQ:       # %bb.0:
+; FAST-DIVQ-NEXT:    movq %rdi, %rax
+; FAST-DIVQ-NEXT:    xorl %edx, %edx
+; FAST-DIVQ-NEXT:    divq %rsi
+; FAST-DIVQ-NEXT:    retq
+;
+; SLOW-DIVQ-LABEL: udiv_quotient:
+; SLOW-DIVQ:       # %bb.0:
+; SLOW-DIVQ-DAG:     movq %rdi, %rax
+; SLOW-DIVQ-DAG:     movq %rdi, %rcx
+; SLOW-DIVQ-DAG:     orq %rsi, %rcx
+; SLOW-DIVQ-DAG:     shrq $32, %rcx
+; SLOW-DIVQ-NEXT:    je .LBB9_1
+; SLOW-DIVQ-NEXT:  # %bb.2:
+; SLOW-DIVQ-NEXT:    xorl %edx, %edx
+; SLOW-DIVQ-NEXT:    divq %rsi
+; SLOW-DIVQ-NEXT:    retq
+; SLOW-DIVQ-NEXT:  .LBB9_1:
+; SLOW-DIVQ-DAG:     # kill: def $eax killed $eax killed $rax
+; SLOW-DIVQ-DAG:     xorl %edx, %edx
+; SLOW-DIVQ-NEXT:    divl %esi
+; SLOW-DIVQ-NEXT:    # kill: def $eax killed $eax def $rax
+; SLOW-DIVQ-NEXT:    retq
+  %result = udiv i64 %a, %b
+  ret i64 %result
+}
+
+define i64 @udiv_quotient_optsize(i64 %a, i64 %b) nounwind optsize {
+; CHECK-LABEL: udiv_quotient_optsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divq %rsi
+; CHECK-NEXT:    retq
+  %result = udiv i64 %a, %b
+  ret i64 %result
+}
+
+define i64 @udiv_quotient_minsize(i64 %a, i64 %b) nounwind minsize {
+; CHECK-LABEL: udiv_quotient_minsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divq %rsi
+; CHECK-NEXT:    retq
+  %result = udiv i64 %a, %b
+  ret i64 %result
+}
+
+define i64 @udiv_remainder(i64 %a, i64 %b) nounwind {
+; FAST-DIVQ-LABEL: udiv_remainder:
+; FAST-DIVQ:       # %bb.0:
+; FAST-DIVQ-NEXT:    movq %rdi, %rax
+; FAST-DIVQ-NEXT:    xorl %edx, %edx
+; FAST-DIVQ-NEXT:    divq %rsi
+; FAST-DIVQ-NEXT:    movq %rdx, %rax
+; FAST-DIVQ-NEXT:    retq
+;
+; SLOW-DIVQ-LABEL: udiv_remainder:
+; SLOW-DIVQ:       # %bb.0:
+; SLOW-DIVQ-DAG:     movq %rdi, %rax
+; SLOW-DIVQ-DAG:     movq %rdi, %rcx
+; SLOW-DIVQ-DAG:     orq %rsi, %rcx
+; SLOW-DIVQ-DAG:     shrq $32, %rcx
+; SLOW-DIVQ-NEXT:    je .LBB12_1
+; SLOW-DIVQ-NEXT:  # %bb.2:
+; SLOW-DIVQ-NEXT:    xorl %edx, %edx
+; SLOW-DIVQ-NEXT:    divq %rsi
+; SLOW-DIVQ-NEXT:    movq %rdx, %rax
+; SLOW-DIVQ-NEXT:    retq
+; SLOW-DIVQ-NEXT:  .LBB12_1:
+; SLOW-DIVQ-DAG:     # kill: def $eax killed $eax killed $rax
+; SLOW-DIVQ-DAG:     xorl %edx, %edx
+; SLOW-DIVQ-NEXT:    divl %esi
+; SLOW-DIVQ-NEXT:    movl %edx, %eax
+; SLOW-DIVQ-NEXT:    retq
+  %result = urem i64 %a, %b
+  ret i64 %result
+}
+
+define i64 @udiv_remainder_optsize(i64 %a, i64 %b) nounwind optsize {
+; CHECK-LABEL: udiv_remainder_optsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divq %rsi
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    retq
+  %result = urem i64 %a, %b
+  ret i64 %result
+}
+
+define i64 @udiv_remainder_minsize(i64 %a, i64 %b) nounwind minsize {
+; CHECK-LABEL: udiv_remainder_minsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divq %rsi
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    retq
+  %result = urem i64 %a, %b
+  ret i64 %result
+}
+
+define i64 @udiv_quotient_and_remainder(i64 %a, i64 %b) nounwind {
+; FAST-DIVQ-LABEL: udiv_quotient_and_remainder:
+; FAST-DIVQ:       # %bb.0:
+; FAST-DIVQ-NEXT:    movq %rdi, %rax
+; FAST-DIVQ-NEXT:    xorl %edx, %edx
+; FAST-DIVQ-NEXT:    divq %rsi
+; FAST-DIVQ-NEXT:    addq %rdx, %rax
+; FAST-DIVQ-NEXT:    retq
+;
+; SLOW-DIVQ-LABEL: udiv_quotient_and_remainder:
+; SLOW-DIVQ:       # %bb.0:
+; SLOW-DIVQ-DAG:     movq %rdi, %rax
+; SLOW-DIVQ-DAG:     movq %rdi, %rcx
+; SLOW-DIVQ-DAG:     orq %rsi, %rcx
+; SLOW-DIVQ-DAG:     shrq $32, %rcx
+; SLOW-DIVQ-NEXT:    je .LBB15_1
+; SLOW-DIVQ-NEXT:  # %bb.2:
+; SLOW-DIVQ-NEXT:    xorl %edx, %edx
+; SLOW-DIVQ-NEXT:    divq %rsi
+; SLOW-DIVQ-NEXT:    addq %rdx, %rax
+; SLOW-DIVQ-NEXT:    retq
+; SLOW-DIVQ-NEXT:  .LBB15_1:
+; SLOW-DIVQ-DAG:     # kill: def $eax killed $eax killed $rax
+; SLOW-DIVQ-DAG:     xorl %edx, %edx
+; SLOW-DIVQ-NEXT:    divl %esi
+; SLOW-DIVQ-NEXT:    # kill: def $edx killed $edx def $rdx
+; SLOW-DIVQ-NEXT:    # kill: def $eax killed $eax def $rax
+; SLOW-DIVQ-NEXT:    addq %rdx, %rax
+; SLOW-DIVQ-NEXT:    retq
+  %resultdiv = udiv i64 %a, %b
+  %resultrem = urem i64 %a, %b
+  %result = add i64 %resultdiv, %resultrem
+  ret i64 %result
+}
+
+define i64 @udiv_quotient_and_remainder_optsize(i64 %a, i64 %b) nounwind optsize {
+; CHECK-LABEL: udiv_quotient_and_remainder_optsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divq %rsi
+; CHECK-NEXT:    addq %rdx, %rax
+; CHECK-NEXT:    retq
+  %resultdiv = udiv i64 %a, %b
+  %resultrem = urem i64 %a, %b
+  %result = add i64 %resultdiv, %resultrem
+  ret i64 %result
+}
+
+define i64 @udiv_quotient_and_remainder_minsize(i64 %a, i64 %b) nounwind minsize {
+; CHECK-LABEL: udiv_quotient_and_remainder_minsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divq %rsi
+; CHECK-NEXT:    addq %rdx, %rax
+; CHECK-NEXT:    retq
+  %resultdiv = udiv i64 %a, %b
+  %resultrem = urem i64 %a, %b
+  %result = add i64 %resultdiv, %resultrem
+  ret i64 %result
+}
+
 define void @PR43514(i32 %x, i32 %y) {
 ; CHECK-LABEL: PR43514:
 ; CHECK:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll
index bc546fe857a3..67070b989786 100644
--- a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll
+++ b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll
@@ -174,7 +174,7 @@ define i1 @shl_to_shr_eq_i64_s44(i64 %x) {
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    shrq $44, %rax
 ; CHECK-NEXT:    andl $1048575, %edi # imm = 0xFFFFF
-; CHECK-NEXT:    cmpq %rax, %rdi
+; CHECK-NEXT:    cmpl %eax, %edi
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
   %shl = shl i64 %x, 44
@@ -186,9 +186,9 @@ define i1 @shl_to_shr_eq_i64_s44(i64 %x) {
 define i1 @shr_to_shl_ne_i64_s32(i64 %x) {
 ; CHECK-NOBMI-LABEL: shr_to_shl_ne_i64_s32:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edi, %eax
-; CHECK-NOBMI-NEXT:    shrq $32, %rdi
-; CHECK-NOBMI-NEXT:    cmpq %rdi, %rax
+; CHECK-NOBMI-NEXT:    movq %rdi, %rax
+; CHECK-NOBMI-NEXT:    shrq $32, %rax
+; CHECK-NOBMI-NEXT:    cmpl %eax, %edi
 ; CHECK-NOBMI-NEXT:    setne %al
 ; CHECK-NOBMI-NEXT:    retq
 ;
@@ -244,7 +244,7 @@ define i1 @shl_to_shr_eq_i64_s63(i64 %x) {
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    shrq $63, %rax
 ; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    cmpq %rax, %rdi
+; CHECK-NEXT:    cmpl %eax, %edi
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
   %shl = shl i64 %x, 63
diff --git a/llvm/test/CodeGen/X86/cmp.ll b/llvm/test/CodeGen/X86/cmp.ll
index 402da547613c..5a63d36a6be4 100644
--- a/llvm/test/CodeGen/X86/cmp.ll
+++ b/llvm/test/CodeGen/X86/cmp.ll
@@ -787,25 +787,15 @@ define i1 @shifted_mask64_testl(i64 %a) {
 }
 
 define i1 @shifted_mask64_extra_use_const(i64 %a) {
-; NO-NDD-LABEL: shifted_mask64_extra_use_const:
-; NO-NDD:       # %bb.0:
-; NO-NDD-NEXT:    movabsq $287104476244869120, %rcx # encoding: [0x48,0xb9,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03]
-; NO-NDD-NEXT:    # imm = 0x3FC000000000000
-; NO-NDD-NEXT:    testq %rcx, %rdi # encoding: [0x48,0x85,0xcf]
-; NO-NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; NO-NDD-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
-; NO-NDD-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
-; NO-NDD-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: shifted_mask64_extra_use_const:
-; NDD:       # %bb.0:
-; NDD-NEXT:    movabsq $287104476244869120, %rcx # encoding: [0x48,0xb9,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03]
-; NDD-NEXT:    # imm = 0x3FC000000000000
-; NDD-NEXT:    andq %rcx, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x21,0xcf]
-; NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; NDD-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
-; NDD-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
-; NDD-NEXT:    retq # encoding: [0xc3]
+; CHECK-LABEL: shifted_mask64_extra_use_const:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $287104476244869120, %rcx # encoding: [0x48,0xb9,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03]
+; CHECK-NEXT:    # imm = 0x3FC000000000000
+; CHECK-NEXT:    testq %rcx, %rdi # encoding: [0x48,0x85,0xcf]
+; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
+; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; CHECK-NEXT:    retq # encoding: [0xc3]
   %v0 = and i64 %a, 287104476244869120  ; 0xff << 50
   %v1 = icmp ne i64 %v0, 0
   store i64 287104476244869120, ptr @d64
@@ -954,19 +944,12 @@ declare i32 @f()
 ; The store makes sure the chain result of the load is used which used to
 ; prevent the post isel peephole from catching this.
 define i1 @fold_test_and_with_chain(ptr %x, ptr %y, i32 %z) {
-; NO-NDD-LABEL: fold_test_and_with_chain:
-; NO-NDD:       # %bb.0:
-; NO-NDD-NEXT:    testl %edx, (%rdi) # encoding: [0x85,0x17]
-; NO-NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; NO-NDD-NEXT:    movl %edx, (%rsi) # encoding: [0x89,0x16]
-; NO-NDD-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: fold_test_and_with_chain:
-; NDD:       # %bb.0:
-; NDD-NEXT:    andl (%rdi), %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x23,0x17]
-; NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; NDD-NEXT:    movl %edx, (%rsi) # encoding: [0x89,0x16]
-; NDD-NEXT:    retq # encoding: [0xc3]
+; CHECK-LABEL: fold_test_and_with_chain:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testl %edx, (%rdi) # encoding: [0x85,0x17]
+; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT:    movl %edx, (%rsi) # encoding: [0x89,0x16]
+; CHECK-NEXT:    retq # encoding: [0xc3]
   %a = load i32, ptr %x
   %b = and i32 %z, %a
   %c = icmp eq i32 %b, 0
diff --git a/llvm/test/CodeGen/X86/cmp16.ll b/llvm/test/CodeGen/X86/cmp16.ll
index 760c8e404499..699ea3e4dd47 100644
--- a/llvm/test/CodeGen/X86/cmp16.ll
+++ b/llvm/test/CodeGen/X86/cmp16.ll
@@ -1,8 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86,X86-GENERIC
 ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=X64,X64-GENERIC
+; RUN: llc < %s -mtriple=i686-- -mattr=+fast-imm16 | FileCheck %s --check-prefixes=X86,X86-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+fast-imm16 | FileCheck %s --check-prefixes=X64,X64-FAST
 ; RUN: llc < %s -mtriple=i686-- -mcpu=atom | FileCheck %s --check-prefixes=X86,X86-ATOM
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=atom | FileCheck %s --check-prefixes=X64,X64-ATOM
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=slm | FileCheck %s --check-prefixes=X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=knl | FileCheck %s --check-prefixes=X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1 | FileCheck %s --check-prefixes=X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s --check-prefixes=X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=X64,X64-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=X64,X64-FAST
 
 define i1 @cmp16_reg_eq_reg(i16 %a0, i16 %a1) {
 ; X86-GENERIC-LABEL: cmp16_reg_eq_reg:
@@ -18,6 +28,19 @@ define i1 @cmp16_reg_eq_reg(i16 %a0, i16 %a1) {
 ; X64-GENERIC-NEXT:    sete %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_eq_reg:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw {{[0-9]+}}(%esp), %ax
+; X86-FAST-NEXT:    sete %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_eq_reg:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw %si, %di
+; X64-FAST-NEXT:    sete %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_eq_reg:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -52,6 +75,18 @@ define i1 @cmp16_reg_eq_imm8(i16 %a0) {
 ; X64-GENERIC-NEXT:    sete %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_eq_imm8:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $15, {{[0-9]+}}(%esp)
+; X86-FAST-NEXT:    sete %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_eq_imm8:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $15, %di
+; X64-FAST-NEXT:    sete %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_eq_imm8:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $15, {{[0-9]+}}(%esp)
@@ -90,6 +125,18 @@ define i1 @cmp16_reg_eq_imm16(i16 %a0) {
 ; X64-GENERIC-NEXT:    sete %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_eq_imm16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400
+; X86-FAST-NEXT:    sete %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_eq_imm16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $1024, %di # imm = 0x400
+; X64-FAST-NEXT:    sete %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_eq_imm16:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400
@@ -144,6 +191,18 @@ define i1 @cmp16_reg_eq_imm16_optsize(i16 %a0) optsize {
 ; X64-GENERIC-NEXT:    sete %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_eq_imm16_optsize:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400
+; X86-FAST-NEXT:    sete %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_eq_imm16_optsize:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $1024, %di # imm = 0x400
+; X64-FAST-NEXT:    sete %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_eq_imm16_optsize:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400
@@ -172,6 +231,18 @@ define i1 @cmp16_reg_sgt_imm8(i16 %a0) {
 ; X64-GENERIC-NEXT:    setge %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_sgt_imm8:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $16, {{[0-9]+}}(%esp)
+; X86-FAST-NEXT:    setge %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_sgt_imm8:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $16, %di
+; X64-FAST-NEXT:    setge %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_sgt_imm8:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $16, {{[0-9]+}}(%esp)
@@ -210,6 +281,18 @@ define i1 @cmp16_reg_sgt_imm16(i16 %a0) {
 ; X64-GENERIC-NEXT:    setge %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_sgt_imm16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01
+; X86-FAST-NEXT:    setge %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_sgt_imm16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $-1023, %di # imm = 0xFC01
+; X64-FAST-NEXT:    setge %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_sgt_imm16:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01
@@ -264,6 +347,18 @@ define i1 @cmp16_reg_sgt_imm16_optsize(i16 %a0) optsize {
 ; X64-GENERIC-NEXT:    setge %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_sgt_imm16_optsize:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01
+; X86-FAST-NEXT:    setge %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_sgt_imm16_optsize:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $-1023, %di # imm = 0xFC01
+; X64-FAST-NEXT:    setge %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_sgt_imm16_optsize:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01
@@ -294,6 +389,18 @@ define i1 @cmp16_reg_uge_imm16(i16 %a0) {
 ; X64-GENERIC-NEXT:    setae %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_uge_imm16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00
+; X86-FAST-NEXT:    setae %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_uge_imm16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $-1024, %di # imm = 0xFC00
+; X64-FAST-NEXT:    setae %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_uge_imm16:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00
@@ -348,6 +455,18 @@ define i1 @cmp16_reg_uge_imm16_optsize(i16 %a0) optsize {
 ; X64-GENERIC-NEXT:    setae %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_reg_uge_imm16_optsize:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00
+; X86-FAST-NEXT:    setae %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_reg_uge_imm16_optsize:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $-1024, %di # imm = 0xFC00
+; X64-FAST-NEXT:    setae %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_reg_uge_imm16_optsize:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00
@@ -380,6 +499,22 @@ define i1 @cmp16_load_ne_load(ptr %p0, ptr %p1) {
 ; X64-GENERIC-NEXT:    setne %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_ne_load:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-NEXT:    movzwl (%ecx), %ecx
+; X86-FAST-NEXT:    cmpw (%eax), %cx
+; X86-FAST-NEXT:    setne %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_ne_load:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movzwl (%rdi), %eax
+; X64-FAST-NEXT:    cmpw (%rsi), %ax
+; X64-FAST-NEXT:    setne %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_ne_load:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -417,6 +552,19 @@ define i1 @cmp16_load_ne_imm8(ptr %p0) {
 ; X64-GENERIC-NEXT:    setne %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_ne_imm8:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $15, (%eax)
+; X86-FAST-NEXT:    setne %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_ne_imm8:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $15, (%rdi)
+; X64-FAST-NEXT:    setne %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_ne_imm8:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -456,6 +604,19 @@ define i1 @cmp16_load_ne_imm16(ptr %p0) {
 ; X64-GENERIC-NEXT:    setne %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_ne_imm16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $512, (%eax) # imm = 0x200
+; X86-FAST-NEXT:    setne %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_ne_imm16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $512, (%rdi) # imm = 0x200
+; X64-FAST-NEXT:    setne %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_ne_imm16:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -493,6 +654,19 @@ define i1 @cmp16_load_slt_imm8(ptr %p0) {
 ; X64-GENERIC-NEXT:    setl %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_slt_imm8:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $15, (%eax)
+; X86-FAST-NEXT:    setl %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_slt_imm8:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $15, (%rdi)
+; X64-FAST-NEXT:    setl %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_slt_imm8:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -532,6 +706,19 @@ define i1 @cmp16_load_slt_imm16(ptr %p0) {
 ; X64-GENERIC-NEXT:    setl %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_slt_imm16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $512, (%eax) # imm = 0x200
+; X86-FAST-NEXT:    setl %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_slt_imm16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $512, (%rdi) # imm = 0x200
+; X64-FAST-NEXT:    setl %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_slt_imm16:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -589,6 +776,19 @@ define i1 @cmp16_load_slt_imm16_optsize(ptr %p0) optsize {
 ; X64-GENERIC-NEXT:    setl %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_slt_imm16_optsize:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $512, (%eax) # imm = 0x200
+; X86-FAST-NEXT:    setl %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_slt_imm16_optsize:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $512, (%rdi) # imm = 0x200
+; X64-FAST-NEXT:    setl %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_slt_imm16_optsize:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -620,6 +820,19 @@ define i1 @cmp16_load_ule_imm8(ptr %p0) {
 ; X64-GENERIC-NEXT:    setb %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_ule_imm8:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $16, (%eax)
+; X86-FAST-NEXT:    setb %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_ule_imm8:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $16, (%rdi)
+; X64-FAST-NEXT:    setb %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_ule_imm8:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -659,6 +872,19 @@ define i1 @cmp16_load_ule_imm16(ptr %p0) {
 ; X64-GENERIC-NEXT:    setb %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_ule_imm16:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $513, (%eax) # imm = 0x201
+; X86-FAST-NEXT:    setb %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_ule_imm16:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $513, (%rdi) # imm = 0x201
+; X64-FAST-NEXT:    setb %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_ule_imm16:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -716,6 +942,19 @@ define i1 @cmp16_load_ule_imm16_optsize(ptr %p0) optsize {
 ; X64-GENERIC-NEXT:    setb %al
 ; X64-GENERIC-NEXT:    retq
 ;
+; X86-FAST-LABEL: cmp16_load_ule_imm16_optsize:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    cmpw $513, (%eax) # imm = 0x201
+; X86-FAST-NEXT:    setb %al
+; X86-FAST-NEXT:    retl
+;
+; X64-FAST-LABEL: cmp16_load_ule_imm16_optsize:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    cmpw $513, (%rdi) # imm = 0x201
+; X64-FAST-NEXT:    setb %al
+; X64-FAST-NEXT:    retq
+;
 ; X86-ATOM-LABEL: cmp16_load_ule_imm16_optsize:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll
index 9f81fab54a49..f3d4d691b453 100644
--- a/llvm/test/CodeGen/X86/combine-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll
@@ -39,86 +39,18 @@ define i32 @test_bitreverse_bitreverse(i32 %a0) nounwind {
   ret i32 %c
 }
 
-; TODO: fold (bitreverse(srl (bitreverse c), x)) -> (shl c, x)
+; fold (bitreverse(srl (bitreverse c), x)) -> (shl c, x)
 define i32 @test_bitreverse_srli_bitreverse(i32 %a0) nounwind {
 ; X86-LABEL: test_bitreverse_srli_bitreverse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655744, %ecx # imm = 0x55555540
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655680, %eax # imm = 0x55555500
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    shrl $7, %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645121, %ecx # imm = 0xF0F0F01
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645120, %eax # imm = 0xF0F0F00
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993424, %ecx # imm = 0x33333310
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993408, %eax # imm = 0x33333300
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    shll $7, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_bitreverse_srli_bitreverse:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    bswapl %edi
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X64-NEXT:    shll $4, %eax
-; X64-NEXT:    shrl $4, %edi
-; X64-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X64-NEXT:    orl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X64-NEXT:    shrl $2, %edi
-; X64-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X64-NEXT:    leal (%rdi,%rax,4), %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $1431655744, %ecx # imm = 0x55555540
-; X64-NEXT:    shrl %eax
-; X64-NEXT:    andl $1431655680, %eax # imm = 0x55555500
-; X64-NEXT:    leal (%rax,%rcx,2), %eax
-; X64-NEXT:    shrl $7, %eax
-; X64-NEXT:    bswapl %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $252645121, %ecx # imm = 0xF0F0F01
-; X64-NEXT:    shll $4, %ecx
-; X64-NEXT:    shrl $4, %eax
-; X64-NEXT:    andl $252645120, %eax # imm = 0xF0F0F00
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $858993424, %ecx # imm = 0x33333310
-; X64-NEXT:    shrl $2, %eax
-; X64-NEXT:    andl $858993408, %eax # imm = 0x33333300
-; X64-NEXT:    leal (%rax,%rcx,4), %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X64-NEXT:    shrl %eax
-; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X64-NEXT:    leal (%rax,%rcx,2), %eax
+; X64-NEXT:    shll $7, %eax
 ; X64-NEXT:    retq
   %b = call i32 @llvm.bitreverse.i32(i32 %a0)
   %c = lshr i32 %b, 7
@@ -129,88 +61,15 @@ define i32 @test_bitreverse_srli_bitreverse(i32 %a0) nounwind {
 define i64 @test_bitreverse_srli_bitreverse_i64(i64 %a) nounwind {
 ; X86-LABEL: test_bitreverse_srli_bitreverse_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655764, %eax # imm = 0x55555554
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645127, %eax # imm = 0xF0F0F07
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993457, %eax # imm = 0x33333331
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl %edx, %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_bitreverse_srli_bitreverse_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    bswapq %rdi
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shrq $4, %rax
-; X64-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT:    andq %rcx, %rax
-; X64-NEXT:    andq %rcx, %rdi
-; X64-NEXT:    shlq $4, %rdi
-; X64-NEXT:    orq %rax, %rdi
-; X64-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; X64-NEXT:    movq %rdi, %rcx
-; X64-NEXT:    andq %rax, %rcx
-; X64-NEXT:    shrq $2, %rdi
-; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    leaq (%rdi,%rcx,4), %rax
-; X64-NEXT:    movabsq $6148914689804861440, %rcx # imm = 0x5555555500000000
-; X64-NEXT:    andq %rax, %rcx
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    movabsq $6148914685509894144, %rdx # imm = 0x5555555400000000
-; X64-NEXT:    andq %rax, %rdx
-; X64-NEXT:    leaq (%rdx,%rcx,2), %rax
-; X64-NEXT:    shrq $33, %rax
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    movabsq $1085102592318504960, %rcx # imm = 0xF0F0F0F00000000
-; X64-NEXT:    andq %rax, %rcx
-; X64-NEXT:    shrq $4, %rax
-; X64-NEXT:    movabsq $1085102557958766592, %rdx # imm = 0xF0F0F0700000000
-; X64-NEXT:    andq %rax, %rdx
-; X64-NEXT:    shlq $4, %rcx
-; X64-NEXT:    orq %rdx, %rcx
-; X64-NEXT:    movabsq $3689348813882916864, %rax # imm = 0x3333333300000000
-; X64-NEXT:    andq %rcx, %rax
-; X64-NEXT:    shrq $2, %rcx
-; X64-NEXT:    movabsq $3689348805292982272, %rdx # imm = 0x3333333100000000
-; X64-NEXT:    andq %rcx, %rdx
-; X64-NEXT:    leaq (%rdx,%rax,4), %rax
-; X64-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; X64-NEXT:    movq %rax, %rdx
-; X64-NEXT:    andq %rcx, %rdx
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    andq %rcx, %rax
-; X64-NEXT:    leaq (%rax,%rdx,2), %rax
+; X64-NEXT:    shlq $33, %rax
 ; X64-NEXT:    retq
     %1 = call i64 @llvm.bitreverse.i64(i64 %a)
     %2 = lshr i64 %1, 33
@@ -218,86 +77,18 @@ define i64 @test_bitreverse_srli_bitreverse_i64(i64 %a) nounwind {
     ret i64 %3
 }
 
-; TODO: fold (bitreverse(shl (bitreverse c), x)) -> (srl c, x)
+; fold (bitreverse(shl (bitreverse c), x)) -> (srl c, x)
 define i32 @test_bitreverse_shli_bitreverse(i32 %a0) nounwind {
 ; X86-LABEL: test_bitreverse_shli_bitreverse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $5592405, %eax # imm = 0x555555
-; X86-NEXT:    shll $6, %ecx
-; X86-NEXT:    andl $-1431655808, %ecx # imm = 0xAAAAAA80
-; X86-NEXT:    shll $8, %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $986895, %ecx # imm = 0xF0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $135204623, %eax # imm = 0x80F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $3355443, %ecx # imm = 0x333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $36909875, %eax # imm = 0x2333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    shrl $7, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_bitreverse_shli_bitreverse:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    bswapl %edi
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X64-NEXT:    shll $4, %eax
-; X64-NEXT:    shrl $4, %edi
-; X64-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X64-NEXT:    orl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X64-NEXT:    shrl $2, %edi
-; X64-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X64-NEXT:    leal (%rdi,%rax,4), %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $5592405, %ecx # imm = 0x555555
-; X64-NEXT:    shll $6, %eax
-; X64-NEXT:    andl $-1431655808, %eax # imm = 0xAAAAAA80
-; X64-NEXT:    shll $8, %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    andl $986895, %eax # imm = 0xF0F0F
-; X64-NEXT:    shll $4, %eax
-; X64-NEXT:    shrl $4, %ecx
-; X64-NEXT:    andl $135204623, %ecx # imm = 0x80F0F0F
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    andl $3355443, %eax # imm = 0x333333
-; X64-NEXT:    shrl $2, %ecx
-; X64-NEXT:    andl $36909875, %ecx # imm = 0x2333333
-; X64-NEXT:    leal (%rcx,%rax,4), %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X64-NEXT:    shrl %eax
-; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X64-NEXT:    leal (%rax,%rcx,2), %eax
+; X64-NEXT:    shrl $7, %eax
 ; X64-NEXT:    retq
   %b = call i32 @llvm.bitreverse.i32(i32 %a0)
   %c = shl i32 %b, 7
@@ -309,79 +100,14 @@ define i64 @test_bitreverse_shli_bitreverse_i64(i64 %a) nounwind {
 ; X86-LABEL: test_bitreverse_shli_bitreverse_i64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $357913941, %ecx # imm = 0x15555555
-; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $235867919, %ecx # imm = 0xE0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $590558003, %ecx # imm = 0x23333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
 ; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_bitreverse_shli_bitreverse_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    bswapq %rdi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X64-NEXT:    shll $4, %eax
-; X64-NEXT:    shrl $4, %edi
-; X64-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X64-NEXT:    orl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X64-NEXT:    shrl $2, %edi
-; X64-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X64-NEXT:    leal (%rdi,%rax,4), %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $357913941, %ecx # imm = 0x15555555
-; X64-NEXT:    shrl %eax
-; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X64-NEXT:    leal (%rax,%rcx,2), %eax
-; X64-NEXT:    shlq $33, %rax
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $235867919, %ecx # imm = 0xE0F0F0F
-; X64-NEXT:    shll $4, %ecx
-; X64-NEXT:    shrl $4, %eax
-; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $590558003, %ecx # imm = 0x23333333
-; X64-NEXT:    shrl $2, %eax
-; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X64-NEXT:    leal (%rax,%rcx,4), %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $357913941, %ecx # imm = 0x15555555
-; X64-NEXT:    shrl %eax
-; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X64-NEXT:    leal (%rax,%rcx,2), %eax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shrq $33, %rax
 ; X64-NEXT:    retq
     %1 = call i64 @llvm.bitreverse.i64(i64 %a)
     %2 = shl i64 %1, 33
diff --git a/llvm/test/CodeGen/X86/dbg-value-superreg-copy2.mir b/llvm/test/CodeGen/X86/dbg-value-superreg-copy2.mir
index 86319da8db01..e29d00afcd41 100644
--- a/llvm/test/CodeGen/X86/dbg-value-superreg-copy2.mir
+++ b/llvm/test/CodeGen/X86/dbg-value-superreg-copy2.mir
@@ -40,7 +40,7 @@ body:             |
     %0:gr16_abcd = MOV16ri 1, debug-instr-number 1, debug-location !9
 
   bb.1:
-    DBG_INSTR_REF !7, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !9
+    DBG_INSTR_REF !7, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !9
     %1:gr16 = COPY %0, debug-location !9
     %2:gr16 = COPY %0
 
diff --git a/llvm/test/CodeGen/X86/dynamic-regmask-preserve-none.ll b/llvm/test/CodeGen/X86/dynamic-regmask-preserve-none.ll
index 7dad3e1081c6..281bd98f615f 100644
--- a/llvm/test/CodeGen/X86/dynamic-regmask-preserve-none.ll
+++ b/llvm/test/CodeGen/X86/dynamic-regmask-preserve-none.ll
@@ -12,7 +12,7 @@ define preserve_nonecc i64 @callee1(i64 %a0, i64 %b0, i64 %c0, i64 %d0, i64 %e0)
 ; CHECK-NOT: calleeSavedRegisters:
 ; CHECK:     RET 0, $rax
 
-; Check that RegMask is csr_noregs.
+; Check that RegMask is csr_64_noneregs.
 define i64 @caller1(i64 %a0) nounwind {
   %b1 = call preserve_nonecc i64 @callee1(i64 %a0, i64 %a0, i64 %a0, i64 %a0, i64 %a0)
   %b2 = add i64 %b1, %a0
@@ -38,13 +38,13 @@ define preserve_nonecc {i64, i64} @callee2(i64 %a0, i64 %b0, i64 %c0, i64 %d0, i
 ; CHECK:     RET 0, $rax, $rdx
 
 
-; Check that RegMask is csr_noregs.
+; Check that RegMask is csr_64_noneregs.
 define {i64, i64} @caller2(i64 %a0) nounwind {
   %b1 = call preserve_nonecc {i64, i64} @callee2(i64 %a0, i64 %a0, i64 %a0, i64 %a0, i64 %a0)
   ret {i64, i64} %b1
 }
 ; CHECK:    name: caller2
-; CHECL:    CALL64pcrel32 @callee2, csr_noregs
+; CHECK:    CALL64pcrel32 @callee2, csr_64_noneregs
 ; CHECK:    RET 0, $rax, $rdx
 
 
@@ -53,7 +53,7 @@ define {i64, i64} @caller2(i64 %a0) nounwind {
 ; Declare the callee with a sret parameter.
 declare preserve_nonecc void @callee3(ptr noalias nocapture writeonly sret(%struct.Large) align 4 %a0, i64 %b0) nounwind;
 
-; Check that RegMask is csr_noregs.
+; Check that RegMask is csr_64_noneregs.
 define void @caller3(i64 %a0) nounwind {
   %a1 = alloca %struct.Large, align 8
   call preserve_nonecc void @callee3(ptr nonnull sret(%struct.Large) align 8 %a1, i64 %a0)
@@ -78,7 +78,7 @@ define preserve_nonecc {i64, double} @callee4(i64 %a0, i64 %b0, i64 %c0, i64 %d0
 ; CHECK-NOT: calleeSavedRegisters:
 ; CHECK:     RET 0, $rax, $xmm0
 
-; Check that RegMask is csr_noregs.
+; Check that RegMask is csr_64_noneregs.
 define {i64, double} @caller4(i64 %a0) nounwind {
   %b1 = call preserve_nonecc {i64, double} @callee4(i64 %a0, i64 %a0, i64 %a0, i64 %a0, i64 %a0)
   ret {i64, double} %b1
diff --git a/llvm/test/CodeGen/X86/elf-separate-named-sections.ll b/llvm/test/CodeGen/X86/elf-separate-named-sections.ll
new file mode 100644
index 000000000000..18efc20aa945
--- /dev/null
+++ b/llvm/test/CodeGen/X86/elf-separate-named-sections.ll
@@ -0,0 +1,36 @@
+; Test that global values with explicit sections are placed into unique sections.
+
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -separate-named-sections < %s | FileCheck %s --check-prefix=SEPARATE
+target triple="x86_64-unknown-unknown-elf"
+
+define i32 @f() section "custom_text" {
+    entry:
+    ret i32 0
+}
+
+define i32 @g() section "custom_text" {
+    entry:
+    ret i32 0
+}
+
+; CHECK: .section custom_text,"ax",@progbits{{$}}
+; CHECK: f:
+; CHECK: g:
+
+; SEPARATE: .section custom_text,"ax",@progbits,unique,1{{$}}
+; SEPARATE: f:
+; SEPARATE: .section custom_text,"ax",@progbits,unique,2{{$}}
+; SEPARATE: g:
+
+@i = global i32 0, section "custom_data", align 8
+@j = global i32 0, section "custom_data", align 8
+
+; CHECK: .section custom_data,"aw",@progbits{{$}}
+; CHECK: i:
+; CHECK: j:
+
+; SEPARATE: .section custom_data,"aw",@progbits,unique,3{{$}}
+; SEPARATE: i:
+; SEPARATE: .section custom_data,"aw",@progbits,unique,4{{$}}
+; SEPARATE: j:
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 8f875c70a25f..96b2e1ef9827 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -840,44 +840,18 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
 define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nounwind {
 ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
 ; CHECK-SSE:       # %bb.0:
-; CHECK-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; CHECK-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [2,2]
-; CHECK-SSE-NEXT:    movdqa %xmm3, %xmm1
-; CHECK-SSE-NEXT:    psllq %xmm2, %xmm1
-; CHECK-SSE-NEXT:    psllq %xmm0, %xmm3
-; CHECK-SSE-NEXT:    movq %xmm3, %rax
-; CHECK-SSE-NEXT:    testq %rax, %rax
-; CHECK-SSE-NEXT:    js .LBB12_1
-; CHECK-SSE-NEXT:  # %bb.2:
-; CHECK-SSE-NEXT:    xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT:    cvtsi2ss %rax, %xmm0
-; CHECK-SSE-NEXT:    jmp .LBB12_3
-; CHECK-SSE-NEXT:  .LBB12_1:
-; CHECK-SSE-NEXT:    movq %rax, %rcx
-; CHECK-SSE-NEXT:    shrq %rcx
-; CHECK-SSE-NEXT:    andl $1, %eax
-; CHECK-SSE-NEXT:    orq %rcx, %rax
+; CHECK-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2]
+; CHECK-SSE-NEXT:    movdqa %xmm2, %xmm3
+; CHECK-SSE-NEXT:    psllq %xmm1, %xmm3
+; CHECK-SSE-NEXT:    psllq %xmm0, %xmm2
+; CHECK-SSE-NEXT:    movq %xmm2, %rax
 ; CHECK-SSE-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-SSE-NEXT:    cvtsi2ss %rax, %xmm0
-; CHECK-SSE-NEXT:    addss %xmm0, %xmm0
-; CHECK-SSE-NEXT:  .LBB12_3:
-; CHECK-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; CHECK-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; CHECK-SSE-NEXT:    movq %xmm1, %rax
-; CHECK-SSE-NEXT:    testq %rax, %rax
-; CHECK-SSE-NEXT:    js .LBB12_4
-; CHECK-SSE-NEXT:  # %bb.5:
-; CHECK-SSE-NEXT:    xorps %xmm1, %xmm1
-; CHECK-SSE-NEXT:    cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT:    jmp .LBB12_6
-; CHECK-SSE-NEXT:  .LBB12_4:
-; CHECK-SSE-NEXT:    movq %rax, %rcx
-; CHECK-SSE-NEXT:    shrq %rcx
-; CHECK-SSE-NEXT:    andl $1, %eax
-; CHECK-SSE-NEXT:    orq %rcx, %rax
 ; CHECK-SSE-NEXT:    xorps %xmm1, %xmm1
 ; CHECK-SSE-NEXT:    cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT:    addss %xmm1, %xmm1
-; CHECK-SSE-NEXT:  .LBB12_6:
 ; CHECK-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE-NEXT:    retq
@@ -886,18 +860,11 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou
 ; CHECK-AVX2:       # %bb.0:
 ; CHECK-AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [2,2]
 ; CHECK-AVX2-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT:    vpsrlq $1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vmovq %xmm1, %rax
-; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
-; CHECK-AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
-; CHECK-AVX2-NEXT:    vaddps %xmm1, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; CHECK-AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; CHECK-AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm1
+; CHECK-AVX2-NEXT:    vmovq %xmm0, %rax
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; CHECK-AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; CHECK-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1]
 ; CHECK-AVX2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -907,9 +874,9 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou
 ; CHECK-NO-FASTFMA-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [2,2]
 ; CHECK-NO-FASTFMA-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vpextrq $1, %xmm0, %rax
-; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm1
+; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm1
 ; CHECK-NO-FASTFMA-NEXT:    vmovq %xmm0, %rax
-; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; CHECK-NO-FASTFMA-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1]
 ; CHECK-NO-FASTFMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
@@ -919,7 +886,7 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou
 ; CHECK-FMA:       # %bb.0:
 ; CHECK-FMA-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
 ; CHECK-FMA-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT:    vcvtuqq2ps %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vcvtqq2ps %xmm0, %xmm0
 ; CHECK-FMA-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
 ; CHECK-FMA-NEXT:    retq
   %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
@@ -986,7 +953,7 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float
 ; CHECK-FMA:       # %bb.0:
 ; CHECK-FMA-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
 ; CHECK-FMA-NEXT:    vpsllvd %xmm0, %xmm2, %xmm0
-; CHECK-FMA-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; CHECK-FMA-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
 ; CHECK-FMA-NEXT:    retq
   %shl = shl nsw nuw <4 x i32> <i32 2, i32 2, i32 2, i32 2>, %cnt
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index c79da37988e4..dbc027495297 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -546,8 +546,8 @@ define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
 define <4 x i32> @freeze_ashr_vec_outofrange(<4 x i32> %a0) nounwind {
 ; X86-LABEL: freeze_ashr_vec_outofrange:
 ; X86:       # %bb.0:
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; X86-NEXT:    psrad $1, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; X86-NEXT:    psrad $2, %xmm0
 ; X86-NEXT:    retl
 ;
@@ -660,8 +660,8 @@ define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
 define <4 x i32> @freeze_lshr_vec_outofrange(<4 x i32> %a0) nounwind {
 ; X86-LABEL: freeze_lshr_vec_outofrange:
 ; X86:       # %bb.0:
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; X86-NEXT:    psrld $1, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; X86-NEXT:    psrld $2, %xmm0
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index afe0ebb9dcb4..b3ca9fb04aeb 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -107,16 +107,15 @@ define <16 x i8> @var_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou
 ; GFNIAVX512VL-LABEL: var_fshl_v16i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNIAVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; GFNIAVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; GFNIAVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
-; GFNIAVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; GFNIAVX512VL-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
-; GFNIAVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm2
-; GFNIAVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
 ; GFNIAVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; GFNIAVX512VL-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm1
+; GFNIAVX512VL-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
+; GFNIAVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; GFNIAVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; GFNIAVX512VL-NEXT:    vpsllvd %zmm2, %zmm0, %zmm0
 ; GFNIAVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; GFNIAVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
 ; GFNIAVX512VL-NEXT:    vzeroupper
@@ -151,17 +150,14 @@ define <16 x i8> @var_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm4
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm6
-; GFNISSE-NEXT:    psrlw $4, %xmm6
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm6
-; GFNISSE-NEXT:    psrlw $2, %xmm6
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm6
-; GFNISSE-NEXT:    psrlw $1, %xmm6
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
@@ -171,13 +167,11 @@ define <16 x i8> @var_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou
 ; GFNISSE-NEXT:    paddb %xmm3, %xmm4
 ; GFNISSE-NEXT:    paddb %xmm2, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    psllw $4, %xmm5
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
-; GFNISSE-NEXT:    psllw $2, %xmm3
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
@@ -195,25 +189,20 @@ define <16 x i8> @var_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou
 ; GFNIAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm4, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm1, %xmm6
-; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm6
 ; GFNIAVX1-NEXT:    vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
 ; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
-; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
 ; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm0, %xmm2
-; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
 ; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
@@ -227,25 +216,20 @@ define <16 x i8> @var_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou
 ; GFNIAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; GFNIAVX2-NEXT:    vpsllw $5, %xmm4, %xmm4
 ; GFNIAVX2-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
-; GFNIAVX2-NEXT:    vpsrlw $4, %xmm1, %xmm6
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm6
 ; GFNIAVX2-NEXT:    vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
-; GFNIAVX2-NEXT:    vpsrlw $2, %xmm1, %xmm4
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
 ; GFNIAVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; GFNIAVX2-NEXT:    vpsrlw $1, %xmm1, %xmm4
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
 ; GFNIAVX2-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
 ; GFNIAVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
 ; GFNIAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %xmm2, %xmm2
 ; GFNIAVX2-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
 ; GFNIAVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX2-NEXT:    vpsllw $4, %xmm0, %xmm4
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
 ; GFNIAVX2-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
-; GFNIAVX2-NEXT:    vpsllw $2, %xmm0, %xmm2
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
 ; GFNIAVX2-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
 ; GFNIAVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
 ; GFNIAVX2-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
@@ -492,19 +476,15 @@ define <16 x i8> @constant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 define <16 x i8> @splatconstant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshl_v16i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    psrlw $5, %xmm1
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; GFNISSE-NEXT:    psllw $3, %xmm0
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; GFNISSE-NEXT:    por %xmm1, %xmm0
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1OR2-LABEL: splatconstant_fshl_v16i8:
 ; GFNIAVX1OR2:       # %bb.0:
-; GFNIAVX1OR2-NEXT:    vpsrlw $5, %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT:    vpsllw $3, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    retq
 ;
@@ -522,25 +502,23 @@ declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
 define <16 x i8> @splatconstant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshr_v16i8:
 ; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    psrlw $7, %xmm1
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; GFNISSE-NEXT:    por %xmm1, %xmm0
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1OR2-LABEL: splatconstant_fshr_v16i8:
 ; GFNIAVX1OR2:       # %bb.0:
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpsrlw $7, %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_fshr_v16i8:
 ; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vpsrlw $7, %xmm1, %xmm1
-; GFNIAVX512-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX512-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; GFNIAVX512-NEXT:    vpaddw %xmm0, %xmm0, %xmm2
+; GFNIAVX512-NEXT:    vpsrlw $7, %xmm1, %xmm0
+; GFNIAVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
 ; GFNIAVX512-NEXT:    retq
   %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>)
   ret <16 x i8> %res
@@ -721,28 +699,22 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
 ; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; GFNIAVX512VL-NEXT:    vpandn %ymm3, %ymm2, %ymm4
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX512VL-NEXT:    vpand %ymm5, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm6
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm6
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT:    vpand %ymm5, %ymm6, %ymm5
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm1, %ymm5
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm4
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
@@ -771,40 +743,35 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm4
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [1161999622361579520,1161999622361579520]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm9
-; GFNISSE-NEXT:    psrlw $4, %xmm9
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNISSE-NEXT:    pand %xmm8, %xmm9
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm9
 ; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pand %xmm7, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm9, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [290499906672525312,290499906672525312]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm10
-; GFNISSE-NEXT:    psrlw $2, %xmm10
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNISSE-NEXT:    pand %xmm9, %xmm10
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm10
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [145249953336295424,145249953336295424]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm11
-; GFNISSE-NEXT:    psrlw $1, %xmm11
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNISSE-NEXT:    pand %xmm10, %xmm11
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm11
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm2
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
+; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm11 = [16909320,16909320]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm12
-; GFNISSE-NEXT:    psllw $4, %xmm12
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNISSE-NEXT:    pand %xmm11, %xmm12
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm12
 ; GFNISSE-NEXT:    pandn %xmm7, %xmm6
 ; GFNISSE-NEXT:    psllw $5, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm4
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm12 = [1108169199648,1108169199648]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm13
-; GFNISSE-NEXT:    psllw $2, %xmm13
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm12 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNISSE-NEXT:    pand %xmm12, %xmm13
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm13
 ; GFNISSE-NEXT:    paddb %xmm6, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm13, %xmm4
@@ -815,33 +782,28 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm13, %xmm4
 ; GFNISSE-NEXT:    por %xmm2, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm2
-; GFNISSE-NEXT:    psrlw $4, %xmm2
-; GFNISSE-NEXT:    pand %xmm8, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
 ; GFNISSE-NEXT:    pand %xmm7, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm2
-; GFNISSE-NEXT:    psrlw $2, %xmm2
-; GFNISSE-NEXT:    pand %xmm9, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm2
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm2
-; GFNISSE-NEXT:    psrlw $1, %xmm2
-; GFNISSE-NEXT:    pand %xmm10, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm2
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
 ; GFNISSE-NEXT:    paddb %xmm1, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
-; GFNISSE-NEXT:    psllw $4, %xmm2
-; GFNISSE-NEXT:    pand %xmm11, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm2
 ; GFNISSE-NEXT:    pandn %xmm7, %xmm5
 ; GFNISSE-NEXT:    psllw $5, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
-; GFNISSE-NEXT:    psllw $2, %xmm2
-; GFNISSE-NEXT:    pand %xmm12, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm2
 ; GFNISSE-NEXT:    paddb %xmm5, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
@@ -856,100 +818,95 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
 ;
 ; GFNIAVX1-LABEL: var_fshr_v32i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm5, %xmm3
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm3, %xmm6
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm4, %xmm6
 ; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; GFNIAVX1-NEXT:    vandps %ymm3, %ymm2, %ymm2
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm7, %xmm8
-; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm6, %xmm5, %xmm6
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm6, %xmm9
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [290499906672525312,290499906672525312]
+; GFNIAVX1-NEXT:    # xmm6 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm4, %xmm9
 ; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm9, %xmm6, %xmm9
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm9, %xmm10
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm9, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm9 = [145249953336295424,145249953336295424]
+; GFNIAVX1-NEXT:    # xmm9 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm4, %xmm10
 ; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm10, %xmm9, %xmm8
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm9
-; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm9, %xmm10
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm11 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX1-NEXT:    vpand %xmm11, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm10, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm8
+; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm10 = [16909320,16909320]
+; GFNIAVX1-NEXT:    # xmm10 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm10, %xmm8, %xmm11
 ; GFNIAVX1-NEXT:    vpxor %xmm3, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm7, %xmm7
-; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm10, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm9, %xmm10
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm12 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX1-NEXT:    vpand %xmm12, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm11, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm11 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    # xmm11 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm11, %xmm8, %xmm12
 ; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
-; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm10, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm10
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm12, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm12
 ; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
-; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm10, %xmm9, %xmm7
-; GFNIAVX1-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm1, %xmm8
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm8, %xmm4
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm8
-; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm5
-; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm12, %xmm8, %xmm7
+; GFNIAVX1-NEXT:    vpor %xmm4, %xmm7, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm1, %xmm5
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm7
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm5, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm1, %xmm5
+; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm6
+; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm1, %xmm5
+; GFNIAVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm11, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm10, %xmm0, %xmm5
 ; GFNIAVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm12, %xmm3
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm11, %xmm0, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: var_fshr_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNIAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; GFNIAVX2-NEXT:    vpsllw $5, %ymm4, %ymm4
-; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
-; GFNIAVX2-NEXT:    vpsrlw $4, %ymm1, %ymm6
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
-; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsrlw $2, %ymm1, %ymm4
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; GFNIAVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsrlw $1, %ymm1, %ymm4
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm3
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX2-NEXT:    vpand %ymm4, %ymm2, %ymm5
+; GFNIAVX2-NEXT:    vpsllw $5, %ymm5, %ymm5
+; GFNIAVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm3
 ; GFNIAVX2-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
-; GFNIAVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpandn %ymm3, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
+; GFNIAVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm3
+; GFNIAVX2-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; GFNIAVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsllw $4, %ymm0, %ymm4
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsllw $2, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
-; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
+; GFNIAVX2-NEXT:    vpandn %ymm4, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
+; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
+; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
@@ -959,25 +916,20 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
 ; GFNIAVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm4, %ymm4
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm6
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm1, %ymm4
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm4
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm4
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm4
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpandn %ymm3, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm4
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
@@ -1336,45 +1288,29 @@ define <32 x i8> @constant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshl_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    psrlw $4, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm5
-; GFNISSE-NEXT:    pandn %xmm2, %xmm5
-; GFNISSE-NEXT:    psllw $4, %xmm0
-; GFNISSE-NEXT:    pand %xmm4, %xmm0
-; GFNISSE-NEXT:    por %xmm5, %xmm0
-; GFNISSE-NEXT:    psrlw $4, %xmm3
-; GFNISSE-NEXT:    psllw $4, %xmm1
-; GFNISSE-NEXT:    pand %xmm4, %xmm1
-; GFNISSE-NEXT:    pandn %xmm3, %xmm4
-; GFNISSE-NEXT:    por %xmm4, %xmm1
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
+; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm5 = [16909320,16909320]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSE-NEXT:    por %xmm2, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSE-NEXT:    por %xmm3, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: splatconstant_fshl_v32i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_fshl_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [16909320,16909320,16909320,16909320]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
@@ -1392,45 +1328,29 @@ declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
 define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshr_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    psrlw $6, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm5
-; GFNISSE-NEXT:    pandn %xmm2, %xmm5
-; GFNISSE-NEXT:    psllw $2, %xmm0
-; GFNISSE-NEXT:    pand %xmm4, %xmm0
-; GFNISSE-NEXT:    por %xmm5, %xmm0
-; GFNISSE-NEXT:    psrlw $6, %xmm3
-; GFNISSE-NEXT:    psllw $2, %xmm1
-; GFNISSE-NEXT:    pand %xmm4, %xmm1
-; GFNISSE-NEXT:    pandn %xmm3, %xmm4
-; GFNISSE-NEXT:    por %xmm4, %xmm1
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [4647714815446351872,4647714815446351872]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSE-NEXT:    por %xmm2, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSE-NEXT:    por %xmm3, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: splatconstant_fshr_v32i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_fshr_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $6, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsllw $2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
@@ -1766,63 +1686,51 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNIAVX512VL-LABEL: var_fshl_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm5
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm5, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm7
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNIAVX512VL-NEXT:    vpandq %zmm8, %zmm2, %zmm2
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; GFNIAVX512VL-NEXT:    vpxor %ymm3, %ymm8, %ymm9
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm3, %ymm6
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX512VL-NEXT:    vpandq %zmm7, %zmm2, %zmm2
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm8
+; GFNIAVX512VL-NEXT:    vpxor %ymm7, %ymm8, %ymm9
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm9, %ymm9
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm5, %ymm7
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNIAVX512VL-NEXT:    vpand %ymm7, %ymm10, %ymm7
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm9, %ymm6, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm3, %ymm10
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm5, %ymm7
-; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm7, %ymm7
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm9, %ymm10, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm3, %ymm10
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm7
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm7, %ymm6
-; GFNIAVX512VL-NEXT:    vpxor %ymm2, %ymm8, %ymm7
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm9, %ymm10, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm1, %ymm5
+; GFNIAVX512VL-NEXT:    vpxor %ymm7, %ymm2, %ymm7
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm7, %ymm7
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm10, %ymm6
-; GFNIAVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm6, %ymm4
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm1, %ymm5
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm6
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm4, %ymm5
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm4, %ymm5
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX512VL-NEXT:    vpand %ymm7, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
-; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm1, %ymm4
+; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm5
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [16909320,16909320,16909320,16909320]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm8, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm3, %ymm7
+; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm7, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm7
+; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm7, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm4
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpand %ymm7, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm4
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
@@ -1863,35 +1771,30 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm1
 ; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm12
-; GFNISSE-NEXT:    psrlw $4, %xmm12
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm12
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm12
 ; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
 ; GFNISSE-NEXT:    pand %xmm11, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm13
-; GFNISSE-NEXT:    psrlw $2, %xmm13
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm13
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm13
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm13, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm14
-; GFNISSE-NEXT:    psrlw $1, %xmm14
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm14
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm14
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm14, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm1, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm15
-; GFNISSE-NEXT:    psllw $4, %xmm15
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15
 ; GFNISSE-NEXT:    movdqa %xmm11, %xmm12
 ; GFNISSE-NEXT:    pandn %xmm11, %xmm9
 ; GFNISSE-NEXT:    psllw $5, %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm15, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
-; GFNISSE-NEXT:    psllw $2, %xmm8
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
 ; GFNISSE-NEXT:    paddb %xmm9, %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
@@ -1902,38 +1805,33 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
 ; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm8
-; GFNISSE-NEXT:    psrlw $4, %xmm8
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNISSE-NEXT:    pand %xmm11, %xmm8
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
 ; GFNISSE-NEXT:    pand %xmm12, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm8
-; GFNISSE-NEXT:    psrlw $2, %xmm8
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm13 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNISSE-NEXT:    pand %xmm13, %xmm8
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm13 = [290499906672525312,290499906672525312]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm8
-; GFNISSE-NEXT:    psrlw $1, %xmm8
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm14 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNISSE-NEXT:    pand %xmm14, %xmm8
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm14 = [145249953336295424,145249953336295424]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm14, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm6
 ; GFNISSE-NEXT:    paddb %xmm2, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
-; GFNISSE-NEXT:    psllw $4, %xmm8
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm15 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNISSE-NEXT:    pand %xmm15, %xmm8
+; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm15 = [16909320,16909320]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm15, %xmm8
 ; GFNISSE-NEXT:    pandn %xmm12, %xmm9
 ; GFNISSE-NEXT:    psllw $5, %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
-; GFNISSE-NEXT:    psllw $2, %xmm8
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm0 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNISSE-NEXT:    pand %xmm0, %xmm8
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm0 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm0, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm9, %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm2
@@ -1944,33 +1842,28 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm2
 ; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm7, %xmm8
-; GFNISSE-NEXT:    psrlw $4, %xmm8
-; GFNISSE-NEXT:    pand %xmm11, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
 ; GFNISSE-NEXT:    pand %xmm12, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm7
 ; GFNISSE-NEXT:    movdqa %xmm7, %xmm8
-; GFNISSE-NEXT:    psrlw $2, %xmm8
-; GFNISSE-NEXT:    pand %xmm13, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm7
 ; GFNISSE-NEXT:    movdqa %xmm7, %xmm8
-; GFNISSE-NEXT:    psrlw $1, %xmm8
-; GFNISSE-NEXT:    pand %xmm14, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm14, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm7
 ; GFNISSE-NEXT:    paddb %xmm3, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm8
-; GFNISSE-NEXT:    psllw $4, %xmm8
-; GFNISSE-NEXT:    pand %xmm15, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm15, %xmm8
 ; GFNISSE-NEXT:    pandn %xmm12, %xmm9
 ; GFNISSE-NEXT:    psllw $5, %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm8
-; GFNISSE-NEXT:    psllw $2, %xmm8
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
 ; GFNISSE-NEXT:    paddb %xmm9, %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm3
@@ -1981,33 +1874,28 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm3
 ; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm10, %xmm8
-; GFNISSE-NEXT:    psrlw $4, %xmm8
-; GFNISSE-NEXT:    pand %xmm11, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
 ; GFNISSE-NEXT:    pand %xmm12, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm10
 ; GFNISSE-NEXT:    movdqa %xmm10, %xmm8
-; GFNISSE-NEXT:    psrlw $2, %xmm8
-; GFNISSE-NEXT:    pand %xmm13, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm10
 ; GFNISSE-NEXT:    movdqa %xmm10, %xmm8
-; GFNISSE-NEXT:    psrlw $1, %xmm8
-; GFNISSE-NEXT:    pand %xmm14, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm14, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm10
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm8
-; GFNISSE-NEXT:    psllw $4, %xmm8
-; GFNISSE-NEXT:    pand %xmm15, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm15, %xmm8
 ; GFNISSE-NEXT:    pandn %xmm12, %xmm9
 ; GFNISSE-NEXT:    psllw $5, %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm8
-; GFNISSE-NEXT:    psllw $2, %xmm8
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
 ; GFNISSE-NEXT:    paddb %xmm9, %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm4
@@ -2029,61 +1917,56 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNIAVX1-LABEL: var_fshr_v64i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm8
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm8, %xmm6
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm6, %xmm9
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    # xmm7 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm8, %xmm9
 ; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; GFNIAVX1-NEXT:    vandps %ymm6, %ymm4, %ymm11
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm11, %xmm10
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm10, %xmm12
 ; GFNIAVX1-NEXT:    vpblendvb %xmm12, %xmm9, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm8, %xmm9
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [290499906672525312,290499906672525312]
+; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm8, %xmm9
 ; GFNIAVX1-NEXT:    vpaddb %xmm12, %xmm12, %xmm12
 ; GFNIAVX1-NEXT:    vpblendvb %xmm12, %xmm9, %xmm8, %xmm9
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm9, %xmm13
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX1-NEXT:    vpand %xmm8, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [145249953336295424,145249953336295424]
+; GFNIAVX1-NEXT:    # xmm8 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm9, %xmm13
 ; GFNIAVX1-NEXT:    vpaddb %xmm12, %xmm12, %xmm12
 ; GFNIAVX1-NEXT:    vpblendvb %xmm12, %xmm13, %xmm9, %xmm12
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm9
 ; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm13
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm13, %xmm14
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX1-NEXT:    vpand %xmm9, %xmm14, %xmm14
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm9 = [16909320,16909320]
+; GFNIAVX1-NEXT:    # xmm9 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm13, %xmm14
 ; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm10, %xmm10
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm10, %xmm15
 ; GFNIAVX1-NEXT:    vpblendvb %xmm15, %xmm14, %xmm13, %xmm13
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm13, %xmm14
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm10 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX1-NEXT:    vpand %xmm10, %xmm14, %xmm14
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm10 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    # xmm10 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm10, %xmm13, %xmm14
 ; GFNIAVX1-NEXT:    vpaddb %xmm15, %xmm15, %xmm15
 ; GFNIAVX1-NEXT:    vpblendvb %xmm15, %xmm14, %xmm13, %xmm13
 ; GFNIAVX1-NEXT:    vpaddb %xmm13, %xmm13, %xmm14
 ; GFNIAVX1-NEXT:    vpaddb %xmm15, %xmm15, %xmm15
 ; GFNIAVX1-NEXT:    vpblendvb %xmm15, %xmm14, %xmm13, %xmm13
 ; GFNIAVX1-NEXT:    vpor %xmm12, %xmm13, %xmm12
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm2, %xmm13
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm2, %xmm13
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm11, %xmm14
 ; GFNIAVX1-NEXT:    vpblendvb %xmm14, %xmm13, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm2, %xmm13
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm13
 ; GFNIAVX1-NEXT:    vpaddb %xmm14, %xmm14, %xmm14
 ; GFNIAVX1-NEXT:    vpblendvb %xmm14, %xmm13, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm2, %xmm13
-; GFNIAVX1-NEXT:    vpand %xmm8, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm2, %xmm13
 ; GFNIAVX1-NEXT:    vpaddb %xmm14, %xmm14, %xmm14
 ; GFNIAVX1-NEXT:    vpblendvb %xmm14, %xmm13, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm0, %xmm13
-; GFNIAVX1-NEXT:    vpand %xmm9, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm0, %xmm13
 ; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm11, %xmm11
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm11, %xmm11
 ; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm13, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm0, %xmm13
-; GFNIAVX1-NEXT:    vpand %xmm10, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm10, %xmm0, %xmm13
 ; GFNIAVX1-NEXT:    vpaddb %xmm11, %xmm11, %xmm11
 ; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm13, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm13
@@ -2092,55 +1975,45 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ; GFNIAVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm11
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm11, %xmm2
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm2, %xmm12
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm11, %xmm12
 ; GFNIAVX1-NEXT:    vandps %ymm6, %ymm5, %ymm2
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm5, %xmm13
 ; GFNIAVX1-NEXT:    vpblendvb %xmm13, %xmm12, %xmm11, %xmm11
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm11, %xmm12
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm12, %xmm12
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm11, %xmm12
 ; GFNIAVX1-NEXT:    vpaddb %xmm13, %xmm13, %xmm13
 ; GFNIAVX1-NEXT:    vpblendvb %xmm13, %xmm12, %xmm11, %xmm11
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm11, %xmm12
-; GFNIAVX1-NEXT:    vpand %xmm8, %xmm12, %xmm12
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm11, %xmm12
 ; GFNIAVX1-NEXT:    vpaddb %xmm13, %xmm13, %xmm13
 ; GFNIAVX1-NEXT:    vpblendvb %xmm13, %xmm12, %xmm11, %xmm11
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm12
 ; GFNIAVX1-NEXT:    vpaddb %xmm12, %xmm12, %xmm12
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm12, %xmm13
-; GFNIAVX1-NEXT:    vpand %xmm9, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm12, %xmm13
 ; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
 ; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm13, %xmm12, %xmm12
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm12, %xmm13
-; GFNIAVX1-NEXT:    vpand %xmm10, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm10, %xmm12, %xmm13
 ; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
 ; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm13, %xmm12, %xmm12
 ; GFNIAVX1-NEXT:    vpaddb %xmm12, %xmm12, %xmm13
 ; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
 ; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm13, %xmm12, %xmm5
 ; GFNIAVX1-NEXT:    vpor %xmm5, %xmm11, %xmm5
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm3, %xmm11
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm11, %xmm7
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm3, %xmm7
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm11
 ; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm7, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm3, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm7, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm3, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm11, %xmm11, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm4, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm3, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm8, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm3, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm4, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm9, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm10, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm10, %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm4
@@ -2152,60 +2025,50 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ;
 ; GFNIAVX2-LABEL: var_fshr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $4, %ymm2, %ymm6
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNIAVX2-NEXT:    vpand %ymm7, %ymm6, %ymm8
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm2, %ymm8
 ; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; GFNIAVX2-NEXT:    vpand %ymm6, %ymm4, %ymm9
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm9, %ymm9
 ; GFNIAVX2-NEXT:    vpblendvb %ymm9, %ymm8, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpsrlw $2, %ymm2, %ymm8
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNIAVX2-NEXT:    vpand %ymm10, %ymm8, %ymm8
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm2, %ymm10
 ; GFNIAVX2-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
-; GFNIAVX2-NEXT:    vpblendvb %ymm9, %ymm8, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpsrlw $1, %ymm2, %ymm8
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX2-NEXT:    vpand %ymm11, %ymm8, %ymm8
+; GFNIAVX2-NEXT:    vpblendvb %ymm9, %ymm10, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm10, %ymm2, %ymm11
 ; GFNIAVX2-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
-; GFNIAVX2-NEXT:    vpblendvb %ymm9, %ymm8, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpblendvb %ymm9, %ymm11, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsllw $4, %ymm0, %ymm8
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX2-NEXT:    vpand %ymm9, %ymm8, %ymm8
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [16909320,16909320,16909320,16909320]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm9, %ymm0, %ymm11
 ; GFNIAVX2-NEXT:    vpandn %ymm6, %ymm4, %ymm4
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm4, %ymm4
-; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm8, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsllw $2, %ymm0, %ymm8
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm12 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX2-NEXT:    vpand %ymm12, %ymm8, %ymm8
+; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm11, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm11, %ymm0, %ymm12
 ; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
-; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm8, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm8
+; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm12, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm12
 ; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
-; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm8, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm12, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $4, %ymm3, %ymm2
-; GFNIAVX2-NEXT:    vpand %ymm7, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpand %ymm6, %ymm5, %ymm4
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm4, %ymm4
 ; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; GFNIAVX2-NEXT:    vpsrlw $2, %ymm2, %ymm3
-; GFNIAVX2-NEXT:    vpand %ymm3, %ymm10, %ymm3
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm2, %ymm3
 ; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
 ; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpsrlw $1, %ymm2, %ymm3
-; GFNIAVX2-NEXT:    vpand %ymm3, %ymm11, %ymm3
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm10, %ymm2, %ymm3
 ; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
 ; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsllw $4, %ymm1, %ymm3
-; GFNIAVX2-NEXT:    vpand %ymm3, %ymm9, %ymm3
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm9, %ymm1, %ymm3
 ; GFNIAVX2-NEXT:    vpandn %ymm6, %ymm5, %ymm4
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm4, %ymm4
 ; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsllw $2, %ymm1, %ymm3
-; GFNIAVX2-NEXT:    vpand %ymm3, %ymm12, %ymm3
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm11, %ymm1, %ymm3
 ; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
 ; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
@@ -2216,62 +2079,52 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 ;
 ; GFNIAVX512VL-LABEL: var_fshr_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm4, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNIAVX512VL-NEXT:    vpand %ymm5, %ymm3, %ymm6
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNIAVX512VL-NEXT:    vpandq %zmm7, %zmm2, %zmm2
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm8
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm4, %ymm6
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm9, %ymm6
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX512VL-NEXT:    vpandq %zmm6, %zmm2, %zmm2
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm7
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm7, %ymm8
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm8, %ymm5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm3, %ymm9
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm4, %ymm6
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm10, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm8, %ymm9, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm9, %ymm3, %ymm10
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT:    vpand %ymm5, %ymm6, %ymm5
-; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm6
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm1, %ymm5
-; GFNIAVX512VL-NEXT:    vpand %ymm5, %ymm9, %ymm5
-; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm5
-; GFNIAVX512VL-NEXT:    vpand %ymm5, %ymm10, %ymm5
-; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm4, %ymm5
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpxor %ymm7, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm4, %ymm5
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX512VL-NEXT:    vpand %ymm5, %ymm8, %ymm5
-; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm8, %ymm10, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm1, %ymm4
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm8
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm8, %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm1, %ymm4
+; GFNIAVX512VL-NEXT:    vpaddb %ymm8, %ymm8, %ymm5
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm9, %ymm1, %ymm4
+; GFNIAVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [16909320,16909320,16909320,16909320]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5
+; GFNIAVX512VL-NEXT:    vpxor %ymm6, %ymm7, %ymm7
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm7, %ymm7
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm3, %ymm8
+; GFNIAVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm7, %ymm8, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm8
+; GFNIAVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm7, %ymm8, %ymm3, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT:    vpxor %ymm7, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT:    vpxor %ymm6, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm8, %ymm4
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm4
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
@@ -2874,45 +2727,31 @@ define <64 x i8> @constant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshl_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    psrlw $7, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNISSE-NEXT:    pand %xmm8, %xmm4
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm4
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    por %xmm4, %xmm0
-; GFNISSE-NEXT:    psrlw $7, %xmm5
-; GFNISSE-NEXT:    pand %xmm8, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm1, %xmm1
 ; GFNISSE-NEXT:    por %xmm5, %xmm1
-; GFNISSE-NEXT:    psrlw $7, %xmm6
-; GFNISSE-NEXT:    pand %xmm8, %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm6
 ; GFNISSE-NEXT:    paddb %xmm2, %xmm2
 ; GFNISSE-NEXT:    por %xmm6, %xmm2
-; GFNISSE-NEXT:    psrlw $7, %xmm7
-; GFNISSE-NEXT:    pand %xmm7, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm7
 ; GFNISSE-NEXT:    paddb %xmm3, %xmm3
-; GFNISSE-NEXT:    por %xmm8, %xmm3
+; GFNISSE-NEXT:    por %xmm7, %xmm3
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: splatconstant_fshl_v64i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm4
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2
+; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm5
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm5, %ymm0
 ; GFNIAVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm3
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
@@ -2922,35 +2761,30 @@ define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
 ;
 ; GFNIAVX2-LABEL: splatconstant_fshl_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $7, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNIAVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $7, %ymm3, %ymm2
-; GFNIAVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512VL-LABEL: splatconstant_fshl_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vpsrlw $7, %ymm1, %ymm2
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $7, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; GFNIAVX512VL-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
+; GFNIAVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; GFNIAVX512VL-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: splatconstant_fshl_v64i8:
 ; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vpsrlw $7, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; GFNIAVX512BW-NEXT:    vpaddw %zmm0, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT:    vpsrlw $7, %zmm1, %zmm0
+; GFNIAVX512BW-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
 ; GFNIAVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
   ret <64 x i8> %res
@@ -2960,90 +2794,51 @@ declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
 define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshr_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    psrlw $2, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
-; GFNISSE-NEXT:    movdqa %xmm8, %xmm9
-; GFNISSE-NEXT:    pandn %xmm4, %xmm9
-; GFNISSE-NEXT:    psllw $6, %xmm0
-; GFNISSE-NEXT:    pand %xmm8, %xmm0
-; GFNISSE-NEXT:    por %xmm9, %xmm0
-; GFNISSE-NEXT:    psrlw $2, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm8, %xmm4
-; GFNISSE-NEXT:    pandn %xmm5, %xmm4
-; GFNISSE-NEXT:    psllw $6, %xmm1
-; GFNISSE-NEXT:    pand %xmm8, %xmm1
-; GFNISSE-NEXT:    por %xmm4, %xmm1
-; GFNISSE-NEXT:    psrlw $2, %xmm6
-; GFNISSE-NEXT:    movdqa %xmm8, %xmm4
-; GFNISSE-NEXT:    pandn %xmm6, %xmm4
-; GFNISSE-NEXT:    psllw $6, %xmm2
-; GFNISSE-NEXT:    pand %xmm8, %xmm2
-; GFNISSE-NEXT:    por %xmm4, %xmm2
-; GFNISSE-NEXT:    psrlw $2, %xmm7
-; GFNISSE-NEXT:    psllw $6, %xmm3
-; GFNISSE-NEXT:    pand %xmm8, %xmm3
-; GFNISSE-NEXT:    pandn %xmm7, %xmm8
-; GFNISSE-NEXT:    por %xmm8, %xmm3
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [290499906672525312,290499906672525312]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm4
+; GFNISSE-NEXT:    pmovsxwq {{.*#+}} xmm9 = [258,258]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
+; GFNISSE-NEXT:    por %xmm4, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm1
+; GFNISSE-NEXT:    por %xmm5, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm2
+; GFNISSE-NEXT:    por %xmm6, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm7
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm3
+; GFNISSE-NEXT:    por %xmm7, %xmm3
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: splatconstant_fshr_v64i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; GFNIAVX1-NEXT:    vpsllw $6, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vpsllw $6, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm5 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; GFNIAVX1-NEXT:    vpsllw $6, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpsllw $6, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm1, %ymm1
 ; GFNIAVX1-NEXT:    vorps %ymm2, %ymm1, %ymm1
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_fshr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
-; GFNIAVX2-NEXT:    vpandn %ymm2, %ymm4, %ymm2
-; GFNIAVX2-NEXT:    vpsllw $6, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [258,258,258,258]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $2, %ymm3, %ymm2
-; GFNIAVX2-NEXT:    vpandn %ymm2, %ymm4, %ymm2
-; GFNIAVX2-NEXT:    vpsllw $6, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512VL-LABEL: splatconstant_fshr_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vpsllw $6, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsllw $6, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm1, %ymm0
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; GFNIAVX512VL-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; GFNIAVX512VL-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: splatconstant_fshr_v64i8:
diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll
index 96aff5b2af31..9ddadca380fe 100644
--- a/llvm/test/CodeGen/X86/gfni-rotates.ll
+++ b/llvm/test/CodeGen/X86/gfni-rotates.ll
@@ -14,28 +14,23 @@ define <16 x i8> @var_rotl_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm1
-; GFNISSE-NEXT:    psrlw $4, %xmm0
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm3
-; GFNISSE-NEXT:    psllw $4, %xmm3
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; GFNISSE-NEXT:    por %xmm0, %xmm3
 ; GFNISSE-NEXT:    psllw $5, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    psrlw $6, %xmm0
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm3
-; GFNISSE-NEXT:    psllw $2, %xmm3
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; GFNISSE-NEXT:    por %xmm0, %xmm3
 ; GFNISSE-NEXT:    paddb %xmm2, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    psrlw $7, %xmm0
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm3
 ; GFNISSE-NEXT:    paddb %xmm1, %xmm3
 ; GFNISSE-NEXT:    por %xmm0, %xmm3
@@ -47,22 +42,17 @@ define <16 x i8> @var_rotl_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind {
 ;
 ; GFNIAVX1OR2-LABEL: var_rotl_v16i8:
 ; GFNIAVX1OR2:       # %bb.0:
-; GFNIAVX1OR2-NEXT:    vpsrlw $4, %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; GFNIAVX1OR2-NEXT:    vpsllw $4, %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
 ; GFNIAVX1OR2-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpsllw $5, %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpsrlw $6, %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; GFNIAVX1OR2-NEXT:    vpsllw $2, %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
 ; GFNIAVX1OR2-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpsrlw $7, %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm0, %xmm0, %xmm3
 ; GFNIAVX1OR2-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
@@ -103,28 +93,23 @@ define <16 x i8> @var_rotr_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind {
 ; GFNISSE-LABEL: var_rotr_v16i8:
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
-; GFNISSE-NEXT:    psrlw $4, %xmm0
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
-; GFNISSE-NEXT:    psllw $4, %xmm3
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; GFNISSE-NEXT:    por %xmm0, %xmm3
 ; GFNISSE-NEXT:    pxor %xmm0, %xmm0
 ; GFNISSE-NEXT:    psubb %xmm1, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm1
-; GFNISSE-NEXT:    psrlw $6, %xmm1
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
-; GFNISSE-NEXT:    psllw $2, %xmm3
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; GFNISSE-NEXT:    por %xmm1, %xmm3
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm1
-; GFNISSE-NEXT:    psrlw $7, %xmm1
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
 ; GFNISSE-NEXT:    paddb %xmm2, %xmm3
 ; GFNISSE-NEXT:    por %xmm1, %xmm3
@@ -135,24 +120,19 @@ define <16 x i8> @var_rotr_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind {
 ;
 ; GFNIAVX1OR2-LABEL: var_rotr_v16i8:
 ; GFNIAVX1OR2:       # %bb.0:
-; GFNIAVX1OR2-NEXT:    vpsrlw $4, %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; GFNIAVX1OR2-NEXT:    vpsllw $4, %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
 ; GFNIAVX1OR2-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; GFNIAVX1OR2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpsllw $5, %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpsrlw $6, %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; GFNIAVX1OR2-NEXT:    vpsllw $2, %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
 ; GFNIAVX1OR2-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpsrlw $7, %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm0, %xmm0, %xmm3
 ; GFNIAVX1OR2-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
@@ -389,28 +369,17 @@ define <16 x i8> @constant_rotr_v16i8(<16 x i8> %a) nounwind {
 define <16 x i8> @splatconstant_rotl_v16i8(<16 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_rotl_v16i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa %xmm0, %xmm1
-; GFNISSE-NEXT:    psrlw $5, %xmm1
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; GFNISSE-NEXT:    psllw $3, %xmm0
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT:    por %xmm1, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1OR2-LABEL: splatconstant_rotl_v16i8:
 ; GFNIAVX1OR2:       # %bb.0:
-; GFNIAVX1OR2-NEXT:    vpsrlw $5, %xmm0, %xmm1
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT:    vpsllw $3, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_rotl_v16i8:
 ; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vpsllw $3, %xmm0, %xmm1
-; GFNIAVX512-NEXT:    vpsrlw $5, %xmm0, %xmm0
-; GFNIAVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
 ; GFNIAVX512-NEXT:    retq
   %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
   ret <16 x i8> %res
@@ -420,26 +389,17 @@ declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
 define <16 x i8> @splatconstant_rotr_v16i8(<16 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_rotr_v16i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa %xmm0, %xmm1
-; GFNISSE-NEXT:    paddb %xmm0, %xmm1
-; GFNISSE-NEXT:    psrlw $7, %xmm0
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT:    por %xmm1, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1OR2-LABEL: splatconstant_rotr_v16i8:
 ; GFNIAVX1OR2:       # %bb.0:
-; GFNIAVX1OR2-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
-; GFNIAVX1OR2-NEXT:    vpsrlw $7, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_rotr_v16i8:
 ; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vpsrlw $7, %xmm0, %xmm1
-; GFNIAVX512-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX512-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
 ; GFNIAVX512-NEXT:    retq
   %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>)
   ret <16 x i8> %res
@@ -455,62 +415,52 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
-; GFNISSE-NEXT:    psrlw $4, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNISSE-NEXT:    movdqa %xmm5, %xmm6
-; GFNISSE-NEXT:    pandn %xmm0, %xmm6
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm6 = [16909320,16909320]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm7
-; GFNISSE-NEXT:    psllw $4, %xmm7
-; GFNISSE-NEXT:    pand %xmm5, %xmm7
-; GFNISSE-NEXT:    por %xmm6, %xmm7
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm7
+; GFNISSE-NEXT:    por %xmm0, %xmm7
 ; GFNISSE-NEXT:    psllw $5, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm7, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [4647714815446351872,4647714815446351872]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    psrlw $6, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNISSE-NEXT:    movdqa %xmm6, %xmm7
-; GFNISSE-NEXT:    pandn %xmm0, %xmm7
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
-; GFNISSE-NEXT:    psllw $2, %xmm8
-; GFNISSE-NEXT:    pand %xmm6, %xmm8
-; GFNISSE-NEXT:    por %xmm7, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm0
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm9
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm9
+; GFNISSE-NEXT:    por %xmm0, %xmm9
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm2
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm9, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    psrlw $7, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNISSE-NEXT:    pand %xmm7, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
-; GFNISSE-NEXT:    paddb %xmm2, %xmm8
-; GFNISSE-NEXT:    por %xmm0, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm10
+; GFNISSE-NEXT:    paddb %xmm2, %xmm10
+; GFNISSE-NEXT:    por %xmm0, %xmm10
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm2
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    psrlw $4, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
-; GFNISSE-NEXT:    psllw $4, %xmm4
-; GFNISSE-NEXT:    pand %xmm5, %xmm4
-; GFNISSE-NEXT:    pandn %xmm0, %xmm5
-; GFNISSE-NEXT:    por %xmm4, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm4
+; GFNISSE-NEXT:    por %xmm0, %xmm4
 ; GFNISSE-NEXT:    psllw $5, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm1
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    psrlw $6, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
-; GFNISSE-NEXT:    psllw $2, %xmm4
-; GFNISSE-NEXT:    pand %xmm6, %xmm4
-; GFNISSE-NEXT:    pandn %xmm0, %xmm6
-; GFNISSE-NEXT:    por %xmm4, %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm4
+; GFNISSE-NEXT:    por %xmm0, %xmm4
 ; GFNISSE-NEXT:    paddb %xmm3, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    psrlw $7, %xmm0
-; GFNISSE-NEXT:    pand %xmm7, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
 ; GFNISSE-NEXT:    paddb %xmm1, %xmm4
 ; GFNISSE-NEXT:    por %xmm0, %xmm4
@@ -523,46 +473,43 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ; GFNIAVX1-LABEL: var_rotl_v32i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX1-NEXT:    vpandn %xmm3, %xmm4, %xmm3
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm2, %xmm5
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    # xmm3 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [16909320,16909320]
+; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm6
+; GFNIAVX1-NEXT:    vpor %xmm4, %xmm6, %xmm4
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [4647714815446351872,4647714815446351872]
+; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm7
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    # xmm8 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm2, %xmm9
+; GFNIAVX1-NEXT:    vpor %xmm7, %xmm9, %xmm7
+; GFNIAVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; GFNIAVX1-NEXT:    # xmm7 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm2, %xmm9
+; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm10
+; GFNIAVX1-NEXT:    vpor %xmm9, %xmm10, %xmm9
+; GFNIAVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm9, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm3
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm0, %xmm5
 ; GFNIAVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX1-NEXT:    vpandn %xmm3, %xmm6, %xmm3
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm2, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm7, %xmm7
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm7, %xmm3
-; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm8
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm8, %xmm3
-; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpandn %xmm3, %xmm4, %xmm3
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm0, %xmm5
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpandn %xmm3, %xmm6, %xmm3
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm0, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm3
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm0, %xmm4
 ; GFNIAVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm0, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm4
 ; GFNIAVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
@@ -572,22 +519,22 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_rotl_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpsllw $4, %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $6, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpsllw $2, %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $7, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
@@ -596,21 +543,21 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ;
 ; GFNIAVX512VL-LABEL: var_rotl_v32i8:
 ; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
+; GFNIAVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
+; GFNIAVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
+; GFNIAVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: var_rotl_v32i8:
@@ -634,63 +581,53 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ; GFNISSE-LABEL: var_rotr_v32i8:
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm5
-; GFNISSE-NEXT:    psrlw $4, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNISSE-NEXT:    movdqa %xmm6, %xmm4
-; GFNISSE-NEXT:    pandn %xmm0, %xmm4
-; GFNISSE-NEXT:    movdqa %xmm5, %xmm7
-; GFNISSE-NEXT:    psllw $4, %xmm7
-; GFNISSE-NEXT:    pand %xmm6, %xmm7
-; GFNISSE-NEXT:    por %xmm4, %xmm7
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm0
+; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm7 = [16909320,16909320]
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm8
+; GFNISSE-NEXT:    por %xmm0, %xmm8
 ; GFNISSE-NEXT:    pxor %xmm4, %xmm4
 ; GFNISSE-NEXT:    pxor %xmm0, %xmm0
 ; GFNISSE-NEXT:    psubb %xmm2, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm7, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm5, %xmm7
-; GFNISSE-NEXT:    psrlw $6, %xmm7
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
-; GFNISSE-NEXT:    pandn %xmm7, %xmm8
-; GFNISSE-NEXT:    movdqa %xmm5, %xmm7
-; GFNISSE-NEXT:    psllw $2, %xmm7
-; GFNISSE-NEXT:    pand %xmm2, %xmm7
-; GFNISSE-NEXT:    por %xmm8, %xmm7
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm7, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm5, %xmm8
-; GFNISSE-NEXT:    psrlw $7, %xmm8
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNISSE-NEXT:    pand %xmm7, %xmm8
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm5
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [4647714815446351872,4647714815446351872]
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm9
-; GFNISSE-NEXT:    paddb %xmm5, %xmm9
-; GFNISSE-NEXT:    por %xmm8, %xmm9
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm9
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm10
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm10
+; GFNISSE-NEXT:    por %xmm9, %xmm10
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm9, %xmm5
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm5
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm10
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm10
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm11
+; GFNISSE-NEXT:    paddb %xmm5, %xmm11
+; GFNISSE-NEXT:    por %xmm10, %xmm11
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    psrlw $4, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
-; GFNISSE-NEXT:    psllw $4, %xmm8
-; GFNISSE-NEXT:    pand %xmm6, %xmm8
-; GFNISSE-NEXT:    pandn %xmm0, %xmm6
-; GFNISSE-NEXT:    por %xmm8, %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm6
+; GFNISSE-NEXT:    por %xmm0, %xmm6
 ; GFNISSE-NEXT:    psubb %xmm3, %xmm4
 ; GFNISSE-NEXT:    psllw $5, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    psrlw $6, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm3
-; GFNISSE-NEXT:    psllw $2, %xmm3
-; GFNISSE-NEXT:    pand %xmm2, %xmm3
-; GFNISSE-NEXT:    pandn %xmm0, %xmm2
-; GFNISSE-NEXT:    por %xmm3, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm2
+; GFNISSE-NEXT:    por %xmm0, %xmm2
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    psrlw $7, %xmm0
-; GFNISSE-NEXT:    pand %xmm7, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
 ; GFNISSE-NEXT:    paddb %xmm1, %xmm2
 ; GFNISSE-NEXT:    por %xmm0, %xmm2
@@ -703,49 +640,46 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ; GFNIAVX1-LABEL: var_rotr_v32i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX1-NEXT:    vpandn %xmm3, %xmm4, %xmm3
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm2, %xmm5
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    # xmm3 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [16909320,16909320]
+; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm6
+; GFNIAVX1-NEXT:    vpor %xmm4, %xmm6, %xmm4
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; GFNIAVX1-NEXT:    vpxor %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpsubb %xmm6, %xmm7, %xmm6
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [4647714815446351872,4647714815446351872]
+; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm8
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm9 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    # xmm9 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm2, %xmm10
+; GFNIAVX1-NEXT:    vpor %xmm8, %xmm10, %xmm8
+; GFNIAVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm8, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; GFNIAVX1-NEXT:    # xmm8 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm2, %xmm10
+; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm11
+; GFNIAVX1-NEXT:    vpor %xmm10, %xmm11, %xmm10
+; GFNIAVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm10, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm3
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm0, %xmm5
 ; GFNIAVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT:    vpsubb %xmm5, %xmm6, %xmm5
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX1-NEXT:    vpandn %xmm3, %xmm7, %xmm3
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm2, %xmm8
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm8, %xmm3
-; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm8, %xmm3
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm9
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm9, %xmm3
-; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpandn %xmm3, %xmm4, %xmm3
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm0, %xmm5
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
-; GFNIAVX1-NEXT:    vpsubb %xmm1, %xmm6, %xmm1
+; GFNIAVX1-NEXT:    vpsubb %xmm1, %xmm7, %xmm1
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpandn %xmm3, %xmm7, %xmm3
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm0, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm3
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm0, %xmm4
 ; GFNIAVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm8, %xmm3
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm0, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm4
 ; GFNIAVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
@@ -755,24 +689,24 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_rotr_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpsllw $4, %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; GFNIAVX2-NEXT:    vpsubb %ymm1, %ymm3, %ymm1
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $6, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpsllw $2, %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $7, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
@@ -781,21 +715,21 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ;
 ; GFNIAVX512VL-LABEL: var_rotr_v32i8:
 ; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
+; GFNIAVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpsllw $6, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
+; GFNIAVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
+; GFNIAVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: var_rotr_v32i8:
@@ -1141,53 +1075,25 @@ define <32 x i8> @constant_rotr_v32i8(<32 x i8> %a) nounwind {
 define <32 x i8> @splatconstant_rotl_v32i8(<32 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_rotl_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
-; GFNISSE-NEXT:    psrlw $4, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm4
-; GFNISSE-NEXT:    pandn %xmm2, %xmm4
-; GFNISSE-NEXT:    psllw $4, %xmm0
-; GFNISSE-NEXT:    pand %xmm3, %xmm0
-; GFNISSE-NEXT:    por %xmm4, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
-; GFNISSE-NEXT:    psrlw $4, %xmm2
-; GFNISSE-NEXT:    psllw $4, %xmm1
-; GFNISSE-NEXT:    pand %xmm3, %xmm1
-; GFNISSE-NEXT:    pandn %xmm2, %xmm3
-; GFNISSE-NEXT:    por %xmm3, %xmm1
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [1161999622378488840,1161999622378488840]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: splatconstant_rotl_v32i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
-; GFNIAVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_rotl_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $4, %ymm0, %ymm1
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1161999622378488840,1161999622378488840,1161999622378488840,1161999622378488840]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_rotl_v32i8:
 ; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vpsllw $4, %ymm0, %ymm1
-; GFNIAVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; GFNIAVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
 ; GFNIAVX512-NEXT:    retq
   %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
   ret <32 x i8> %res
@@ -1197,53 +1103,25 @@ declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
 define <32 x i8> @splatconstant_rotr_v32i8(<32 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_rotr_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
-; GFNISSE-NEXT:    psrlw $6, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm4
-; GFNISSE-NEXT:    pandn %xmm2, %xmm4
-; GFNISSE-NEXT:    psllw $2, %xmm0
-; GFNISSE-NEXT:    pand %xmm3, %xmm0
-; GFNISSE-NEXT:    por %xmm4, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
-; GFNISSE-NEXT:    psrlw $6, %xmm2
-; GFNISSE-NEXT:    psllw $2, %xmm1
-; GFNISSE-NEXT:    pand %xmm3, %xmm1
-; GFNISSE-NEXT:    pandn %xmm2, %xmm3
-; GFNISSE-NEXT:    por %xmm3, %xmm1
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [4647715923615551520,4647715923615551520]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: splatconstant_rotr_v32i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm1, %xmm2
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm0, %xmm2
-; GFNIAVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_rotr_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $6, %ymm0, %ymm1
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsllw $2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4647715923615551520,4647715923615551520,4647715923615551520,4647715923615551520]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_rotr_v32i8:
 ; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vpsllw $2, %ymm0, %ymm1
-; GFNIAVX512-NEXT:    vpsrlw $6, %ymm0, %ymm0
-; GFNIAVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
 ; GFNIAVX512-NEXT:    retq
   %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>)
   ret <32 x i8> %res
@@ -1259,64 +1137,52 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm4
-; GFNISSE-NEXT:    psrlw $4, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNISSE-NEXT:    movdqa %xmm9, %xmm10
-; GFNISSE-NEXT:    pandn %xmm0, %xmm10
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
+; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm10 = [16909320,16909320]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm11
-; GFNISSE-NEXT:    psllw $4, %xmm11
-; GFNISSE-NEXT:    pand %xmm9, %xmm11
-; GFNISSE-NEXT:    por %xmm10, %xmm11
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm11
+; GFNISSE-NEXT:    por %xmm0, %xmm11
 ; GFNISSE-NEXT:    psllw $5, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm4
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [4647714815446351872,4647714815446351872]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    psrlw $6, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNISSE-NEXT:    movdqa %xmm10, %xmm11
-; GFNISSE-NEXT:    pandn %xmm0, %xmm11
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm12
-; GFNISSE-NEXT:    psllw $2, %xmm12
-; GFNISSE-NEXT:    pand %xmm10, %xmm12
-; GFNISSE-NEXT:    por %xmm11, %xmm12
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm0
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm12 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm13
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm13
+; GFNISSE-NEXT:    por %xmm0, %xmm13
 ; GFNISSE-NEXT:    paddb %xmm8, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm4
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm13, %xmm4
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm13 = [9223372036854775808,9223372036854775808]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    psrlw $7, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNISSE-NEXT:    pand %xmm11, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm12
-; GFNISSE-NEXT:    paddb %xmm4, %xmm12
-; GFNISSE-NEXT:    por %xmm0, %xmm12
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm14
+; GFNISSE-NEXT:    paddb %xmm4, %xmm14
+; GFNISSE-NEXT:    por %xmm0, %xmm14
 ; GFNISSE-NEXT:    paddb %xmm8, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm4
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm14, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    psrlw $4, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm9, %xmm8
-; GFNISSE-NEXT:    pandn %xmm0, %xmm8
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm12
-; GFNISSE-NEXT:    psllw $4, %xmm12
-; GFNISSE-NEXT:    pand %xmm9, %xmm12
-; GFNISSE-NEXT:    por %xmm8, %xmm12
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm8
+; GFNISSE-NEXT:    por %xmm0, %xmm8
 ; GFNISSE-NEXT:    psllw $5, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm1
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    psrlw $6, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm10, %xmm8
-; GFNISSE-NEXT:    pandn %xmm0, %xmm8
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm12
-; GFNISSE-NEXT:    psllw $2, %xmm12
-; GFNISSE-NEXT:    pand %xmm10, %xmm12
-; GFNISSE-NEXT:    por %xmm8, %xmm12
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm8
+; GFNISSE-NEXT:    por %xmm0, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm5, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm1
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    psrlw $7, %xmm0
-; GFNISSE-NEXT:    pand %xmm11, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm1, %xmm8
 ; GFNISSE-NEXT:    por %xmm0, %xmm8
@@ -1324,30 +1190,23 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    psrlw $4, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm9, %xmm5
-; GFNISSE-NEXT:    pandn %xmm0, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
-; GFNISSE-NEXT:    psllw $4, %xmm8
-; GFNISSE-NEXT:    pand %xmm9, %xmm8
-; GFNISSE-NEXT:    por %xmm5, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm5
+; GFNISSE-NEXT:    por %xmm0, %xmm5
 ; GFNISSE-NEXT:    psllw $5, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm2
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    psrlw $6, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm10, %xmm5
-; GFNISSE-NEXT:    pandn %xmm0, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
-; GFNISSE-NEXT:    psllw $2, %xmm8
-; GFNISSE-NEXT:    pand %xmm10, %xmm8
-; GFNISSE-NEXT:    por %xmm5, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm5
+; GFNISSE-NEXT:    por %xmm0, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm6, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm2
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    psrlw $7, %xmm0
-; GFNISSE-NEXT:    pand %xmm11, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm2, %xmm5
 ; GFNISSE-NEXT:    por %xmm0, %xmm5
@@ -1355,28 +1214,23 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
-; GFNISSE-NEXT:    psrlw $4, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
-; GFNISSE-NEXT:    psllw $4, %xmm5
-; GFNISSE-NEXT:    pand %xmm9, %xmm5
-; GFNISSE-NEXT:    pandn %xmm0, %xmm9
-; GFNISSE-NEXT:    por %xmm5, %xmm9
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm5
+; GFNISSE-NEXT:    por %xmm0, %xmm5
 ; GFNISSE-NEXT:    psllw $5, %xmm7
 ; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm9, %xmm3
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
-; GFNISSE-NEXT:    psrlw $6, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
-; GFNISSE-NEXT:    psllw $2, %xmm5
-; GFNISSE-NEXT:    pand %xmm10, %xmm5
-; GFNISSE-NEXT:    pandn %xmm0, %xmm10
-; GFNISSE-NEXT:    por %xmm5, %xmm10
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm5
+; GFNISSE-NEXT:    por %xmm0, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm7, %xmm7
 ; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm3
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
-; GFNISSE-NEXT:    psrlw $7, %xmm0
-; GFNISSE-NEXT:    pand %xmm11, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm3, %xmm5
 ; GFNISSE-NEXT:    por %xmm0, %xmm5
@@ -1388,90 +1242,77 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ;
 ; GFNIAVX1-LABEL: var_rotl_v64i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm5, %xmm6
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX1-NEXT:    vpandn %xmm6, %xmm4, %xmm6
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm5, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
-; GFNIAVX1-NEXT:    vpor %xmm6, %xmm7, %xmm6
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm7, %xmm7
-; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm6, %xmm5, %xmm6
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm6, %xmm8
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX1-NEXT:    vpandn %xmm8, %xmm5, %xmm8
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm6, %xmm9
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vpor %xmm8, %xmm9, %xmm8
-; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
-; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm8, %xmm6, %xmm8
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm8, %xmm9
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm10
-; GFNIAVX1-NEXT:    vpor %xmm9, %xmm10, %xmm9
-; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
-; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm9, %xmm8, %xmm7
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm0, %xmm8
-; GFNIAVX1-NEXT:    vpandn %xmm8, %xmm4, %xmm8
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm0, %xmm9
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vpor %xmm8, %xmm9, %xmm8
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm6, %xmm7
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [16909320,16909320]
+; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm6, %xmm8
+; GFNIAVX1-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm8
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm6, %xmm9
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [4647714815446351872,4647714815446351872]
+; GFNIAVX1-NEXT:    # xmm6 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm9, %xmm10
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    # xmm7 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm9, %xmm11
+; GFNIAVX1-NEXT:    vpor %xmm10, %xmm11, %xmm10
+; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm11
+; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm10, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
+; GFNIAVX1-NEXT:    # xmm8 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm9, %xmm10
+; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm12
+; GFNIAVX1-NEXT:    vpor %xmm10, %xmm12, %xmm10
+; GFNIAVX1-NEXT:    vpaddb %xmm11, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm10, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm10
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm0, %xmm11
+; GFNIAVX1-NEXT:    vpor %xmm10, %xmm11, %xmm10
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm0, %xmm8
-; GFNIAVX1-NEXT:    vpandn %xmm8, %xmm5, %xmm8
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm0, %xmm9
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vpor %xmm8, %xmm9, %xmm8
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm10, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm0, %xmm10
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm0, %xmm11
+; GFNIAVX1-NEXT:    vpor %xmm10, %xmm11, %xmm10
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm0, %xmm8
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm9
-; GFNIAVX1-NEXT:    vpor %xmm8, %xmm9, %xmm8
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm10, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm0, %xmm10
+; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm11
+; GFNIAVX1-NEXT:    vpor %xmm10, %xmm11, %xmm10
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm10, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm2, %xmm7
-; GFNIAVX1-NEXT:    vpandn %xmm7, %xmm4, %xmm7
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm2, %xmm8
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm8
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm2, %xmm7
-; GFNIAVX1-NEXT:    vpandn %xmm7, %xmm5, %xmm7
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm2, %xmm9
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vpor %xmm7, %xmm9, %xmm7
-; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm2, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm7, %xmm7
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm9
-; GFNIAVX1-NEXT:    vpor %xmm7, %xmm9, %xmm7
-; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm1, %xmm7
-; GFNIAVX1-NEXT:    vpandn %xmm7, %xmm4, %xmm7
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm1, %xmm8
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm8, %xmm4
-; GFNIAVX1-NEXT:    vpor %xmm7, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm9
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm10
+; GFNIAVX1-NEXT:    vpor %xmm9, %xmm10, %xmm9
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm10
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpblendvb %xmm10, %xmm9, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm2, %xmm9
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm2, %xmm11
+; GFNIAVX1-NEXT:    vpor %xmm9, %xmm11, %xmm9
+; GFNIAVX1-NEXT:    vpaddb %xmm10, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpblendvb %xmm10, %xmm9, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm2, %xmm9
+; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm11
+; GFNIAVX1-NEXT:    vpor %xmm9, %xmm11, %xmm9
+; GFNIAVX1-NEXT:    vpaddb %xmm10, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpblendvb %xmm10, %xmm9, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm1, %xmm5
+; GFNIAVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpandn %xmm4, %xmm5, %xmm4
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm1, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm7, %xmm5
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm1, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm1, %xmm5
 ; GFNIAVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm5
 ; GFNIAVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
@@ -1481,45 +1322,37 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_rotl_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX2-NEXT:    vpandn %ymm4, %ymm5, %ymm4
-; GFNIAVX2-NEXT:    vpsllw $4, %ymm0, %ymm6
-; GFNIAVX2-NEXT:    vpand %ymm5, %ymm6, %ymm6
-; GFNIAVX2-NEXT:    vpor %ymm4, %ymm6, %ymm4
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [16909320,16909320,16909320,16909320]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7
+; GFNIAVX2-NEXT:    vpor %ymm5, %ymm7, %ymm5
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $6, %ymm0, %ymm4
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX2-NEXT:    vpandn %ymm4, %ymm6, %ymm4
-; GFNIAVX2-NEXT:    vpsllw $2, %ymm0, %ymm7
-; GFNIAVX2-NEXT:    vpand %ymm6, %ymm7, %ymm7
-; GFNIAVX2-NEXT:    vpor %ymm4, %ymm7, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm7
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm0, %ymm9
+; GFNIAVX2-NEXT:    vpor %ymm7, %ymm9, %ymm7
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $7, %ymm0, %ymm4
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNIAVX2-NEXT:    vpand %ymm7, %ymm4, %ymm4
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm8
-; GFNIAVX2-NEXT:    vpor %ymm4, %ymm8, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm7, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm0, %ymm9
+; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm10
+; GFNIAVX2-NEXT:    vpor %ymm9, %ymm10, %ymm9
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $4, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpandn %ymm2, %ymm5, %ymm2
-; GFNIAVX2-NEXT:    vpsllw $4, %ymm1, %ymm4
-; GFNIAVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm9, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm1, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm1, %ymm4
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsrlw $6, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpandn %ymm2, %ymm6, %ymm2
-; GFNIAVX2-NEXT:    vpsllw $2, %ymm1, %ymm4
-; GFNIAVX2-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm1, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm1, %ymm4
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsrlw $7, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpand %ymm7, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm1, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm4
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
@@ -1529,40 +1362,42 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNIAVX512VL-LABEL: var_rotl_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm2, %ymm4
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160]
-; GFNIAVX512VL-NEXT:    vpternlogd $226, %ymm3, %ymm5, %ymm4
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpsrlw $6, %ymm2, %ymm4
-; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm2, %ymm6
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268]
-; GFNIAVX512VL-NEXT:    vpternlogd $226, %ymm4, %ymm7, %ymm6
-; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpsrlw $7, %ymm2, %ymm4
-; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm6
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNIAVX512VL-NEXT:    vpternlogq $248, %ymm8, %ymm4, %ymm6
-; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpternlogd $226, %ymm3, %ymm5, %ymm4
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [16909320,16909320,16909320,16909320]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm2, %ymm6
+; GFNIAVX512VL-NEXT:    vpor %ymm4, %ymm6, %ymm4
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm7
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm2, %ymm9
+; GFNIAVX512VL-NEXT:    vpor %ymm7, %ymm9, %ymm7
+; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm7, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm2, %ymm9
+; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm10
+; GFNIAVX512VL-NEXT:    vpor %ymm9, %ymm10, %ymm9
+; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm9, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm5
+; GFNIAVX512VL-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpternlogd $226, %ymm3, %ymm7, %ymm4
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT:    vpor %ymm3, %ymm4, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm0, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpternlogq $248, %ymm8, %ymm3, %ymm4
+; GFNIAVX512VL-NEXT:    vpor %ymm3, %ymm4, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; GFNIAVX512VL-NEXT:    retq
 ;
@@ -1587,123 +1422,99 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNISSE-LABEL: var_rotr_v64i8:
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm9
-; GFNISSE-NEXT:    psrlw $4, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNISSE-NEXT:    movdqa %xmm10, %xmm8
-; GFNISSE-NEXT:    pandn %xmm0, %xmm8
-; GFNISSE-NEXT:    movdqa %xmm9, %xmm11
-; GFNISSE-NEXT:    psllw $4, %xmm11
-; GFNISSE-NEXT:    pand %xmm10, %xmm11
-; GFNISSE-NEXT:    por %xmm8, %xmm11
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm0
+; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm11 = [16909320,16909320]
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm12
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm12
+; GFNISSE-NEXT:    por %xmm0, %xmm12
 ; GFNISSE-NEXT:    pxor %xmm8, %xmm8
 ; GFNISSE-NEXT:    pxor %xmm0, %xmm0
 ; GFNISSE-NEXT:    psubb %xmm4, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm9
-; GFNISSE-NEXT:    movdqa %xmm9, %xmm11
-; GFNISSE-NEXT:    psrlw $6, %xmm11
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm12
-; GFNISSE-NEXT:    pandn %xmm11, %xmm12
-; GFNISSE-NEXT:    movdqa %xmm9, %xmm11
-; GFNISSE-NEXT:    psllw $2, %xmm11
-; GFNISSE-NEXT:    pand %xmm4, %xmm11
-; GFNISSE-NEXT:    por %xmm12, %xmm11
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm9
-; GFNISSE-NEXT:    movdqa %xmm9, %xmm12
-; GFNISSE-NEXT:    psrlw $7, %xmm12
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNISSE-NEXT:    pand %xmm11, %xmm12
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm9
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [4647714815446351872,4647714815446351872]
 ; GFNISSE-NEXT:    movdqa %xmm9, %xmm13
-; GFNISSE-NEXT:    paddb %xmm9, %xmm13
-; GFNISSE-NEXT:    por %xmm12, %xmm13
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm13
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm12 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm14
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm14
+; GFNISSE-NEXT:    por %xmm13, %xmm14
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm13, %xmm9
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm14, %xmm9
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm13 = [9223372036854775808,9223372036854775808]
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm14
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm14
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm15
+; GFNISSE-NEXT:    paddb %xmm9, %xmm15
+; GFNISSE-NEXT:    por %xmm14, %xmm15
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm15, %xmm9
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    psrlw $4, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm10, %xmm12
-; GFNISSE-NEXT:    pandn %xmm0, %xmm12
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm13
-; GFNISSE-NEXT:    psllw $4, %xmm13
-; GFNISSE-NEXT:    pand %xmm10, %xmm13
-; GFNISSE-NEXT:    por %xmm12, %xmm13
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm14
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm14
+; GFNISSE-NEXT:    por %xmm0, %xmm14
 ; GFNISSE-NEXT:    pxor %xmm0, %xmm0
 ; GFNISSE-NEXT:    psubb %xmm5, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm13, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm5
-; GFNISSE-NEXT:    psrlw $6, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm12
-; GFNISSE-NEXT:    pandn %xmm5, %xmm12
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm14, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm5
-; GFNISSE-NEXT:    psllw $2, %xmm5
-; GFNISSE-NEXT:    pand %xmm4, %xmm5
-; GFNISSE-NEXT:    por %xmm12, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm14
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm14
+; GFNISSE-NEXT:    por %xmm5, %xmm14
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm1
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm14, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm5
-; GFNISSE-NEXT:    psrlw $7, %xmm5
-; GFNISSE-NEXT:    pand %xmm11, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm12
-; GFNISSE-NEXT:    paddb %xmm1, %xmm12
-; GFNISSE-NEXT:    por %xmm5, %xmm12
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm14
+; GFNISSE-NEXT:    paddb %xmm1, %xmm14
+; GFNISSE-NEXT:    por %xmm5, %xmm14
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm1
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm14, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    psrlw $4, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm10, %xmm5
-; GFNISSE-NEXT:    pandn %xmm0, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm12
-; GFNISSE-NEXT:    psllw $4, %xmm12
-; GFNISSE-NEXT:    pand %xmm10, %xmm12
-; GFNISSE-NEXT:    por %xmm5, %xmm12
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm5
+; GFNISSE-NEXT:    por %xmm0, %xmm5
 ; GFNISSE-NEXT:    pxor %xmm0, %xmm0
 ; GFNISSE-NEXT:    psubb %xmm6, %xmm0
 ; GFNISSE-NEXT:    psllw $5, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    psrlw $6, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm6
-; GFNISSE-NEXT:    pandn %xmm5, %xmm6
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    psllw $2, %xmm5
-; GFNISSE-NEXT:    pand %xmm4, %xmm5
-; GFNISSE-NEXT:    por %xmm6, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm6
+; GFNISSE-NEXT:    por %xmm5, %xmm6
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    psrlw $7, %xmm5
-; GFNISSE-NEXT:    pand %xmm11, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm6
 ; GFNISSE-NEXT:    paddb %xmm2, %xmm6
 ; GFNISSE-NEXT:    por %xmm5, %xmm6
 ; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
-; GFNISSE-NEXT:    psrlw $4, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
-; GFNISSE-NEXT:    psllw $4, %xmm5
-; GFNISSE-NEXT:    pand %xmm10, %xmm5
-; GFNISSE-NEXT:    pandn %xmm0, %xmm10
-; GFNISSE-NEXT:    por %xmm5, %xmm10
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm5
+; GFNISSE-NEXT:    por %xmm0, %xmm5
 ; GFNISSE-NEXT:    psubb %xmm7, %xmm8
 ; GFNISSE-NEXT:    psllw $5, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm3
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
-; GFNISSE-NEXT:    psrlw $6, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
-; GFNISSE-NEXT:    psllw $2, %xmm5
-; GFNISSE-NEXT:    pand %xmm4, %xmm5
-; GFNISSE-NEXT:    pandn %xmm0, %xmm4
-; GFNISSE-NEXT:    por %xmm5, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm4
+; GFNISSE-NEXT:    por %xmm0, %xmm4
 ; GFNISSE-NEXT:    paddb %xmm8, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
-; GFNISSE-NEXT:    psrlw $7, %xmm0
-; GFNISSE-NEXT:    pand %xmm11, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm4
 ; GFNISSE-NEXT:    paddb %xmm3, %xmm4
 ; GFNISSE-NEXT:    por %xmm0, %xmm4
@@ -1715,95 +1526,82 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ;
 ; GFNIAVX1-LABEL: var_rotr_v64i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm5, %xmm6
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX1-NEXT:    vpandn %xmm6, %xmm4, %xmm6
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm5, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
-; GFNIAVX1-NEXT:    vpor %xmm6, %xmm7, %xmm7
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm8
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm7, %xmm6
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [16909320,16909320]
+; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm7, %xmm8
+; GFNIAVX1-NEXT:    vpor %xmm6, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm9
 ; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT:    vpsubb %xmm8, %xmm6, %xmm8
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm5, %xmm7
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm7, %xmm9
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX1-NEXT:    vpandn %xmm9, %xmm5, %xmm9
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm7, %xmm10
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm10, %xmm10
-; GFNIAVX1-NEXT:    vpor %xmm9, %xmm10, %xmm9
-; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm9, %xmm7, %xmm9
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm9, %xmm10
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm10, %xmm10
-; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm11
-; GFNIAVX1-NEXT:    vpor %xmm10, %xmm11, %xmm10
-; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm10, %xmm9, %xmm8
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm0, %xmm9
-; GFNIAVX1-NEXT:    vpandn %xmm9, %xmm4, %xmm9
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm0, %xmm10
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm10, %xmm10
-; GFNIAVX1-NEXT:    vpor %xmm9, %xmm10, %xmm9
+; GFNIAVX1-NEXT:    vpsubb %xmm9, %xmm6, %xmm9
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vpblendvb %xmm9, %xmm8, %xmm7, %xmm10
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [4647714815446351872,4647714815446351872]
+; GFNIAVX1-NEXT:    # xmm7 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm10, %xmm11
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    # xmm8 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm10, %xmm12
+; GFNIAVX1-NEXT:    vpor %xmm11, %xmm12, %xmm11
+; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm12
+; GFNIAVX1-NEXT:    vpblendvb %xmm12, %xmm11, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808]
+; GFNIAVX1-NEXT:    # xmm9 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm10, %xmm11
+; GFNIAVX1-NEXT:    vpaddb %xmm10, %xmm10, %xmm13
+; GFNIAVX1-NEXT:    vpor %xmm11, %xmm13, %xmm11
+; GFNIAVX1-NEXT:    vpaddb %xmm12, %xmm12, %xmm12
+; GFNIAVX1-NEXT:    vpblendvb %xmm12, %xmm11, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm11
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm0, %xmm12
+; GFNIAVX1-NEXT:    vpor %xmm11, %xmm12, %xmm11
 ; GFNIAVX1-NEXT:    vpsubb %xmm2, %xmm6, %xmm2
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm9, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm0, %xmm9
-; GFNIAVX1-NEXT:    vpandn %xmm9, %xmm5, %xmm9
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm0, %xmm10
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm10, %xmm10
-; GFNIAVX1-NEXT:    vpor %xmm9, %xmm10, %xmm9
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm11, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm0, %xmm11
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm0, %xmm12
+; GFNIAVX1-NEXT:    vpor %xmm11, %xmm12, %xmm11
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm9, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm0, %xmm9
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm10
-; GFNIAVX1-NEXT:    vpor %xmm9, %xmm10, %xmm9
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm11, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm0, %xmm11
+; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm12
+; GFNIAVX1-NEXT:    vpor %xmm11, %xmm12, %xmm11
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm9, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm11, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm2, %xmm8
-; GFNIAVX1-NEXT:    vpandn %xmm8, %xmm4, %xmm8
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm2, %xmm9
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vpor %xmm8, %xmm9, %xmm8
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm9
-; GFNIAVX1-NEXT:    vpsubb %xmm9, %xmm6, %xmm9
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vpblendvb %xmm9, %xmm8, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm2, %xmm8
-; GFNIAVX1-NEXT:    vpandn %xmm8, %xmm5, %xmm8
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm2, %xmm10
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm10, %xmm10
-; GFNIAVX1-NEXT:    vpor %xmm8, %xmm10, %xmm8
-; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vpblendvb %xmm9, %xmm8, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm2, %xmm8
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm10
-; GFNIAVX1-NEXT:    vpor %xmm8, %xmm10, %xmm8
-; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vpblendvb %xmm9, %xmm8, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm1, %xmm8
-; GFNIAVX1-NEXT:    vpandn %xmm8, %xmm4, %xmm8
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm1, %xmm9
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm9, %xmm4
-; GFNIAVX1-NEXT:    vpor %xmm4, %xmm8, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm10
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm11
+; GFNIAVX1-NEXT:    vpor %xmm10, %xmm11, %xmm10
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm11
+; GFNIAVX1-NEXT:    vpsubb %xmm11, %xmm6, %xmm11
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm10, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm2, %xmm10
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm2, %xmm12
+; GFNIAVX1-NEXT:    vpor %xmm10, %xmm12, %xmm10
+; GFNIAVX1-NEXT:    vpaddb %xmm11, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm10, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm2, %xmm10
+; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm12
+; GFNIAVX1-NEXT:    vpor %xmm10, %xmm12, %xmm10
+; GFNIAVX1-NEXT:    vpaddb %xmm11, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm10, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm1, %xmm5
+; GFNIAVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
 ; GFNIAVX1-NEXT:    vpsubb %xmm3, %xmm6, %xmm3
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $6, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpandn %xmm4, %xmm5, %xmm4
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm1, %xmm6
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm6, %xmm5
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm1, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm1, %xmm5
 ; GFNIAVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm5
 ; GFNIAVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
@@ -1813,48 +1611,40 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_rotr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX2-NEXT:    vpandn %ymm4, %ymm5, %ymm4
-; GFNIAVX2-NEXT:    vpsllw $4, %ymm0, %ymm6
-; GFNIAVX2-NEXT:    vpand %ymm5, %ymm6, %ymm6
-; GFNIAVX2-NEXT:    vpor %ymm4, %ymm6, %ymm4
-; GFNIAVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; GFNIAVX2-NEXT:    vpsubb %ymm2, %ymm6, %ymm2
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [16909320,16909320,16909320,16909320]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7
+; GFNIAVX2-NEXT:    vpor %ymm5, %ymm7, %ymm5
+; GFNIAVX2-NEXT:    vpxor %xmm7, %xmm7, %xmm7
+; GFNIAVX2-NEXT:    vpsubb %ymm2, %ymm7, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $6, %ymm0, %ymm4
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX2-NEXT:    vpandn %ymm4, %ymm7, %ymm4
-; GFNIAVX2-NEXT:    vpsllw $2, %ymm0, %ymm8
-; GFNIAVX2-NEXT:    vpand %ymm7, %ymm8, %ymm8
-; GFNIAVX2-NEXT:    vpor %ymm4, %ymm8, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm8
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm9, %ymm0, %ymm10
+; GFNIAVX2-NEXT:    vpor %ymm8, %ymm10, %ymm8
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $7, %ymm0, %ymm4
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNIAVX2-NEXT:    vpand %ymm4, %ymm8, %ymm4
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm9
-; GFNIAVX2-NEXT:    vpor %ymm4, %ymm9, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm8, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm0, %ymm10
+; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm11
+; GFNIAVX2-NEXT:    vpor %ymm10, %ymm11, %ymm10
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $4, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpandn %ymm2, %ymm5, %ymm2
-; GFNIAVX2-NEXT:    vpsllw $4, %ymm1, %ymm4
-; GFNIAVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm10, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm1, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm1, %ymm4
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
-; GFNIAVX2-NEXT:    vpsubb %ymm3, %ymm6, %ymm3
+; GFNIAVX2-NEXT:    vpsubb %ymm3, %ymm7, %ymm3
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsrlw $6, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpandn %ymm2, %ymm7, %ymm2
-; GFNIAVX2-NEXT:    vpsllw $2, %ymm1, %ymm4
-; GFNIAVX2-NEXT:    vpand %ymm7, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm1, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm9, %ymm1, %ymm4
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsrlw $7, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpand %ymm2, %ymm8, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm1, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm4
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
@@ -1864,40 +1654,43 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNIAVX512VL-LABEL: var_rotr_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm4
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135]
-; GFNIAVX512VL-NEXT:    vpternlogd $226, %ymm3, %ymm5, %ymm4
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpsllw $6, %ymm2, %ymm4
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm2, %ymm6
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567]
-; GFNIAVX512VL-NEXT:    vpternlogd $226, %ymm4, %ymm7, %ymm6
-; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpsllw $7, %ymm2, %ymm4
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm2, %ymm6
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143]
-; GFNIAVX512VL-NEXT:    vpternlogd $226, %ymm4, %ymm8, %ymm6
-; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpternlogd $226, %ymm3, %ymm5, %ymm4
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm2, %ymm6
+; GFNIAVX512VL-NEXT:    vpor %ymm4, %ymm6, %ymm4
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [258,258,258,258]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm7
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm2, %ymm9
+; GFNIAVX512VL-NEXT:    vpor %ymm7, %ymm9, %ymm7
+; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm7, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1,1,1,1]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm2, %ymm9
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm10, %ymm2, %ymm11
+; GFNIAVX512VL-NEXT:    vpor %ymm9, %ymm11, %ymm9
+; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm9, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm5
+; GFNIAVX512VL-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsllw $6, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpternlogd $226, %ymm3, %ymm7, %ymm4
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT:    vpor %ymm3, %ymm4, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpternlogd $226, %ymm3, %ymm8, %ymm4
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm0, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm10, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT:    vpor %ymm3, %ymm4, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; GFNIAVX512VL-NEXT:    retq
 ;
@@ -2464,85 +2257,31 @@ define <64 x i8> @constant_rotr_v64i8(<64 x i8> %a) nounwind {
 define <64 x i8> @splatconstant_rotl_v64i8(<64 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_rotl_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa %xmm0, %xmm4
-; GFNISSE-NEXT:    psrlw $7, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNISSE-NEXT:    pand %xmm5, %xmm4
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    por %xmm4, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
-; GFNISSE-NEXT:    psrlw $7, %xmm4
-; GFNISSE-NEXT:    pand %xmm5, %xmm4
-; GFNISSE-NEXT:    paddb %xmm1, %xmm1
-; GFNISSE-NEXT:    por %xmm4, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm4
-; GFNISSE-NEXT:    psrlw $7, %xmm4
-; GFNISSE-NEXT:    pand %xmm5, %xmm4
-; GFNISSE-NEXT:    paddb %xmm2, %xmm2
-; GFNISSE-NEXT:    por %xmm4, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm4
-; GFNISSE-NEXT:    psrlw $7, %xmm4
-; GFNISSE-NEXT:    pand %xmm5, %xmm4
-; GFNISSE-NEXT:    paddb %xmm3, %xmm3
-; GFNISSE-NEXT:    por %xmm4, %xmm3
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [9223655728169885760,9223655728169885760]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm3
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: splatconstant_rotl_v64i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm1, %xmm3
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [9223655728169885760,9223655728169885760,9223655728169885760,9223655728169885760]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_rotl_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $7, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNIAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $7, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223655728169885760,9223655728169885760,9223655728169885760,9223655728169885760]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
 ;
-; GFNIAVX512VL-LABEL: splatconstant_rotl_v64i8:
-; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm1
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpsrlw $7, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; GFNIAVX512VL-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
-; GFNIAVX512VL-NEXT:    retq
-;
-; GFNIAVX512BW-LABEL: splatconstant_rotl_v64i8:
-; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vpsrlw $7, %zmm0, %zmm1
-; GFNIAVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
-; GFNIAVX512BW-NEXT:    retq
+; GFNIAVX512-LABEL: splatconstant_rotl_v64i8:
+; GFNIAVX512:       # %bb.0:
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; GFNIAVX512-NEXT:    retq
   %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
   ret <64 x i8> %res
 }
@@ -2551,98 +2290,31 @@ declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
 define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_rotr_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa %xmm0, %xmm5
-; GFNISSE-NEXT:    psrlw $2, %xmm5
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm6
-; GFNISSE-NEXT:    pandn %xmm5, %xmm6
-; GFNISSE-NEXT:    psllw $6, %xmm0
-; GFNISSE-NEXT:    pand %xmm4, %xmm0
-; GFNISSE-NEXT:    por %xmm6, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm5
-; GFNISSE-NEXT:    psrlw $2, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm6
-; GFNISSE-NEXT:    pandn %xmm5, %xmm6
-; GFNISSE-NEXT:    psllw $6, %xmm1
-; GFNISSE-NEXT:    pand %xmm4, %xmm1
-; GFNISSE-NEXT:    por %xmm6, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    psrlw $2, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm6
-; GFNISSE-NEXT:    pandn %xmm5, %xmm6
-; GFNISSE-NEXT:    psllw $6, %xmm2
-; GFNISSE-NEXT:    pand %xmm4, %xmm2
-; GFNISSE-NEXT:    por %xmm6, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
-; GFNISSE-NEXT:    psrlw $2, %xmm5
-; GFNISSE-NEXT:    psllw $6, %xmm3
-; GFNISSE-NEXT:    pand %xmm4, %xmm3
-; GFNISSE-NEXT:    pandn %xmm5, %xmm4
-; GFNISSE-NEXT:    por %xmm4, %xmm3
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [290499906672525570,290499906672525570]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm3
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: splatconstant_rotr_v64i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
-; GFNIAVX1-NEXT:    vpandn %xmm3, %xmm4, %xmm3
-; GFNIAVX1-NEXT:    vpsllw $6, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpandn %xmm3, %xmm4, %xmm3
-; GFNIAVX1-NEXT:    vpsllw $6, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vpandn %xmm3, %xmm4, %xmm3
-; GFNIAVX1-NEXT:    vpsllw $6, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm1, %xmm3
-; GFNIAVX1-NEXT:    vpandn %xmm3, %xmm4, %xmm3
-; GFNIAVX1-NEXT:    vpsllw $6, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [290499906672525570,290499906672525570,290499906672525570,290499906672525570]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_rotr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
-; GFNIAVX2-NEXT:    vpandn %ymm2, %ymm3, %ymm2
-; GFNIAVX2-NEXT:    vpsllw $6, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $2, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpandn %ymm2, %ymm3, %ymm2
-; GFNIAVX2-NEXT:    vpsllw $6, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [290499906672525570,290499906672525570,290499906672525570,290499906672525570]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
 ;
-; GFNIAVX512VL-LABEL: splatconstant_rotr_v64i8:
-; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vpsllw $6, %ymm0, %ymm1
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpsllw $6, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; GFNIAVX512VL-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
-; GFNIAVX512VL-NEXT:    retq
-;
-; GFNIAVX512BW-LABEL: splatconstant_rotr_v64i8:
-; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vpsllw $6, %zmm0, %zmm1
-; GFNIAVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
-; GFNIAVX512BW-NEXT:    retq
+; GFNIAVX512-LABEL: splatconstant_rotr_v64i8:
+; GFNIAVX512:       # %bb.0:
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; GFNIAVX512-NEXT:    retq
   %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>)
   ret <64 x i8> %res
 }
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index f79407d08ab0..6232488bea71 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -15,13 +15,11 @@ define <16 x i8> @var_shl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
 ; GFNISSE-NEXT:    psllw $5, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm3
-; GFNISSE-NEXT:    psllw $4, %xmm3
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
-; GFNISSE-NEXT:    psllw $2, %xmm3
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; GFNISSE-NEXT:    paddb %xmm1, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
@@ -36,11 +34,9 @@ define <16 x i8> @var_shl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; GFNIAVX1OR2-LABEL: var_shl_v16i8:
 ; GFNIAVX1OR2:       # %bb.0:
 ; GFNIAVX1OR2-NEXT:    vpsllw $5, %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT:    vpsllw $4, %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpsllw $2, %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
@@ -75,19 +71,16 @@ define <16 x i8> @var_lshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
 ; GFNISSE-NEXT:    psllw $5, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm3
-; GFNISSE-NEXT:    psrlw $4, %xmm3
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
-; GFNISSE-NEXT:    psrlw $2, %xmm3
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; GFNISSE-NEXT:    paddb %xmm1, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
-; GFNISSE-NEXT:    psrlw $1, %xmm3
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; GFNISSE-NEXT:    paddb %xmm1, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
@@ -97,15 +90,12 @@ define <16 x i8> @var_lshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; GFNIAVX1OR2-LABEL: var_lshr_v16i8:
 ; GFNIAVX1OR2:       # %bb.0:
 ; GFNIAVX1OR2-NEXT:    vpsllw $5, %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT:    vpsrlw $4, %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpsrlw $2, %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpsrlw $1, %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    retq
@@ -562,20 +552,17 @@ define <16 x i8> @constant_ashr_v16i8(<16 x i8> %a) nounwind {
 define <16 x i8> @splatconstant_shl_v16i8(<16 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_shl_v16i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    psllw $3, %xmm0
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1OR2-LABEL: splatconstant_shl_v16i8:
 ; GFNIAVX1OR2:       # %bb.0:
-; GFNIAVX1OR2-NEXT:    vpsllw $3, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_shl_v16i8:
 ; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vpsllw $3, %xmm0, %xmm0
-; GFNIAVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
 ; GFNIAVX512-NEXT:    retq
   %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   ret <16 x i8> %shift
@@ -584,20 +571,17 @@ define <16 x i8> @splatconstant_shl_v16i8(<16 x i8> %a) nounwind {
 define <16 x i8> @splatconstant_lshr_v16i8(<16 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_lshr_v16i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    psrlw $7, %xmm0
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1OR2-LABEL: splatconstant_lshr_v16i8:
 ; GFNIAVX1OR2:       # %bb.0:
-; GFNIAVX1OR2-NEXT:    vpsrlw $7, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_lshr_v16i8:
 ; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vpsrlw $7, %xmm0, %xmm0
-; GFNIAVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
 ; GFNIAVX512-NEXT:    retq
   %shift = lshr <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   ret <16 x i8> %shift
@@ -606,46 +590,18 @@ define <16 x i8> @splatconstant_lshr_v16i8(<16 x i8> %a) nounwind {
 define <16 x i8> @splatconstant_ashr_v16i8(<16 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_ashr_v16i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    psrlw $4, %xmm0
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; GFNISSE-NEXT:    pxor %xmm1, %xmm0
-; GFNISSE-NEXT:    psubb %xmm1, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; GFNISSE-NEXT:    retq
 ;
-; GFNIAVX1-LABEL: splatconstant_ashr_v16i8:
-; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; GFNIAVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    retq
-;
-; GFNIAVX2-LABEL: splatconstant_ashr_v16i8:
-; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; GFNIAVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; GFNIAVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; GFNIAVX2-NEXT:    retq
-;
-; GFNIAVX512VL-LABEL: splatconstant_ashr_v16i8:
-; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; GFNIAVX512VL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
-; GFNIAVX512VL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; GFNIAVX512VL-NEXT:    retq
+; GFNIAVX1OR2-LABEL: splatconstant_ashr_v16i8:
+; GFNIAVX1OR2:       # %bb.0:
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT:    retq
 ;
-; GFNIAVX512BW-LABEL: splatconstant_ashr_v16i8:
-; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; GFNIAVX512BW-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; GFNIAVX512BW-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
-; GFNIAVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; GFNIAVX512BW-NEXT:    retq
+; GFNIAVX512-LABEL: splatconstant_ashr_v16i8:
+; GFNIAVX512:       # %bb.0:
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; GFNIAVX512-NEXT:    retq
   %shift = ashr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
   ret <16 x i8> %shift
 }
@@ -659,34 +615,30 @@ define <32 x i8> @var_shl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm0, %xmm5
-; GFNISSE-NEXT:    psllw $4, %xmm5
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNISSE-NEXT:    pand %xmm6, %xmm5
+; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm5 = [16909320,16909320]
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm6
 ; GFNISSE-NEXT:    psllw $5, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    psllw $2, %xmm5
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNISSE-NEXT:    pand %xmm7, %xmm5
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [1108169199648,1108169199648]
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm7
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm7
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    paddb %xmm2, %xmm5
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm7, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm7
+; GFNISSE-NEXT:    paddb %xmm2, %xmm7
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm7, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
-; GFNISSE-NEXT:    psllw $4, %xmm4
-; GFNISSE-NEXT:    pand %xmm6, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm4
 ; GFNISSE-NEXT:    psllw $5, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
-; GFNISSE-NEXT:    psllw $2, %xmm4
-; GFNISSE-NEXT:    pand %xmm7, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm4
 ; GFNISSE-NEXT:    paddb %xmm3, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
@@ -701,26 +653,24 @@ define <32 x i8> @var_shl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNIAVX1-LABEL: var_shl_v32i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [16909320,16909320]
+; GFNIAVX1-NEXT:    # xmm3 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm6
 ; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm6
 ; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm3
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm3
@@ -731,12 +681,12 @@ define <32 x i8> @var_shl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_shl_v32i8:
 ; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [16909320,16909320,16909320,16909320]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsllw $4, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsllw $2, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
@@ -747,11 +697,9 @@ define <32 x i8> @var_shl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNIAVX512VL-LABEL: var_shl_v32i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
@@ -775,42 +723,36 @@ define <32 x i8> @var_lshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm0, %xmm5
-; GFNISSE-NEXT:    psrlw $4, %xmm5
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNISSE-NEXT:    pand %xmm6, %xmm5
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [1161999622361579520,1161999622361579520]
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm6
 ; GFNISSE-NEXT:    psllw $5, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    psrlw $2, %xmm5
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNISSE-NEXT:    pand %xmm7, %xmm5
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [290499906672525312,290499906672525312]
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm7
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm7
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    psrlw $1, %xmm5
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNISSE-NEXT:    pand %xmm8, %xmm5
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm7, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [145249953336295424,145249953336295424]
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
-; GFNISSE-NEXT:    psrlw $4, %xmm4
-; GFNISSE-NEXT:    pand %xmm6, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm4
 ; GFNISSE-NEXT:    psllw $5, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
-; GFNISSE-NEXT:    psrlw $2, %xmm4
-; GFNISSE-NEXT:    pand %xmm7, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm4
 ; GFNISSE-NEXT:    paddb %xmm3, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
-; GFNISSE-NEXT:    psrlw $1, %xmm4
-; GFNISSE-NEXT:    pand %xmm8, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm4
 ; GFNISSE-NEXT:    paddb %xmm3, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
@@ -820,32 +762,29 @@ define <32 x i8> @var_lshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNIAVX1-LABEL: var_lshr_v32i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    # xmm3 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [290499906672525312,290499906672525312]
+; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm6
 ; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [145249953336295424,145249953336295424]
+; GFNIAVX1-NEXT:    # xmm6 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm2, %xmm7
 ; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm3
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm0, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -853,16 +792,16 @@ define <32 x i8> @var_lshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_lshr_v32i8:
 ; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
@@ -870,15 +809,12 @@ define <32 x i8> @var_lshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNIAVX512VL-LABEL: var_lshr_v32i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    retq
@@ -1539,34 +1475,25 @@ define <32 x i8> @constant_ashr_v32i8(<32 x i8> %a) nounwind {
 define <32 x i8> @splatconstant_shl_v32i8(<32 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_shl_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    psllw $6, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
-; GFNISSE-NEXT:    pand %xmm2, %xmm0
-; GFNISSE-NEXT:    psllw $6, %xmm1
-; GFNISSE-NEXT:    pand %xmm2, %xmm1
+; GFNISSE-NEXT:    pmovsxwq {{.*#+}} xmm2 = [258,258]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: splatconstant_shl_v32i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; GFNIAVX1-NEXT:    vpsllw $6, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
-; GFNIAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsllw $6, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_shl_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsllw $6, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [258,258,258,258]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_shl_v32i8:
 ; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vpsllw $6, %ymm0, %ymm0
-; GFNIAVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
 ; GFNIAVX512-NEXT:    retq
   %shift = shl <32 x i8> %a, <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>
   ret <32 x i8> %shift
@@ -1575,34 +1502,25 @@ define <32 x i8> @splatconstant_shl_v32i8(<32 x i8> %a) nounwind {
 define <32 x i8> @splatconstant_lshr_v32i8(<32 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_lshr_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    psrlw $1, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNISSE-NEXT:    pand %xmm2, %xmm0
-; GFNISSE-NEXT:    psrlw $1, %xmm1
-; GFNISSE-NEXT:    pand %xmm2, %xmm1
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [145249953336295424,145249953336295424]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: splatconstant_lshr_v32i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_lshr_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_lshr_v32i8:
 ; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
-; GFNIAVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
 ; GFNIAVX512-NEXT:    retq
   %shift = lshr <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   ret <32 x i8> %shift
@@ -1611,58 +1529,26 @@ define <32 x i8> @splatconstant_lshr_v32i8(<32 x i8> %a) nounwind {
 define <32 x i8> @splatconstant_ashr_v32i8(<32 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_ashr_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    psrlw $2, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNISSE-NEXT:    pand %xmm2, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; GFNISSE-NEXT:    pxor %xmm3, %xmm0
-; GFNISSE-NEXT:    psubb %xmm3, %xmm0
-; GFNISSE-NEXT:    psrlw $2, %xmm1
-; GFNISSE-NEXT:    pand %xmm2, %xmm1
-; GFNISSE-NEXT:    pxor %xmm3, %xmm1
-; GFNISSE-NEXT:    psubb %xmm3, %xmm1
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [290499906672558208,290499906672558208]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: splatconstant_ashr_v32i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNIAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; GFNIAVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_ashr_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; GFNIAVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [290499906672558208,290499906672558208,290499906672558208,290499906672558208]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
-; GFNIAVX512VL-LABEL: splatconstant_ashr_v32i8:
-; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; GFNIAVX512VL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
-; GFNIAVX512VL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    retq
-;
-; GFNIAVX512BW-LABEL: splatconstant_ashr_v32i8:
-; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vpsrlw $2, %ymm0, %ymm0
-; GFNIAVX512BW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; GFNIAVX512BW-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
-; GFNIAVX512BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
-; GFNIAVX512BW-NEXT:    retq
+; GFNIAVX512-LABEL: splatconstant_ashr_v32i8:
+; GFNIAVX512:       # %bb.0:
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; GFNIAVX512-NEXT:    retq
   %shift = ashr <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
   ret <32 x i8> %shift
 }
@@ -1676,17 +1562,15 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm4
+; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm9 = [16909320,16909320]
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm10
-; GFNISSE-NEXT:    psllw $4, %xmm10
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNISSE-NEXT:    pand %xmm9, %xmm10
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm10
 ; GFNISSE-NEXT:    psllw $5, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm4
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [1108169199648,1108169199648]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm11
-; GFNISSE-NEXT:    psllw $2, %xmm11
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNISSE-NEXT:    pand %xmm10, %xmm11
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm11
 ; GFNISSE-NEXT:    paddb %xmm8, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm4
@@ -1696,14 +1580,12 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
-; GFNISSE-NEXT:    psllw $4, %xmm8
-; GFNISSE-NEXT:    pand %xmm9, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm8
 ; GFNISSE-NEXT:    psllw $5, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
-; GFNISSE-NEXT:    psllw $2, %xmm8
-; GFNISSE-NEXT:    pand %xmm10, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm5, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
@@ -1713,14 +1595,12 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    psllw $4, %xmm5
-; GFNISSE-NEXT:    pand %xmm9, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm5
 ; GFNISSE-NEXT:    psllw $5, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    psllw $2, %xmm5
-; GFNISSE-NEXT:    pand %xmm10, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm6, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
@@ -1730,14 +1610,12 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
-; GFNISSE-NEXT:    psllw $4, %xmm5
-; GFNISSE-NEXT:    pand %xmm9, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm5
 ; GFNISSE-NEXT:    psllw $5, %xmm7
 ; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
-; GFNISSE-NEXT:    psllw $2, %xmm5
-; GFNISSE-NEXT:    pand %xmm10, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm7, %xmm7
 ; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
@@ -1752,26 +1630,24 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNIAVX1-LABEL: var_shl_v64i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm5, %xmm6
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [16909320,16909320]
+; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm5, %xmm6
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm6, %xmm5, %xmm6
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm6, %xmm8
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1108169199648,1108169199648]
+; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm6, %xmm8
 ; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm8, %xmm6, %xmm6
 ; GFNIAVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm8
 ; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm8, %xmm6, %xmm6
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm0, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm7
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm7, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm0, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm0, %xmm7
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm7, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm7
@@ -1779,24 +1655,20 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm7, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm2, %xmm6
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm6
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm7
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm6, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm2, %xmm6
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm6
 ; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm6, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm6
 ; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm6, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $4, %xmm1, %xmm6
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm6, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsllw $2, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm4
@@ -1807,25 +1679,21 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_shl_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsllw $4, %ymm0, %ymm4
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [16909320,16909320,16909320,16909320]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsllw $2, %ymm0, %ymm4
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX2-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm6
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm6, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm6
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsllw $4, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm6, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm1, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsllw $2, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm1, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm2
@@ -1836,26 +1704,22 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNIAVX512VL-LABEL: var_shl_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [1108169199648,1108169199648,1108169199648,1108169199648]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm6
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm6, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm6
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm6, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
@@ -1866,16 +1730,12 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; GFNIAVX512BW-LABEL: var_shl_v64i8:
 ; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
-; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
 ; GFNIAVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
 ; GFNIAVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT:    vpsllw $2, %zmm0, %zmm2
-; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
 ; GFNIAVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
 ; GFNIAVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
 ; GFNIAVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
 ; GFNIAVX512BW-NEXT:    vpmovb2m %zmm1, %k1
 ; GFNIAVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
@@ -1889,78 +1749,66 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm4
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [1161999622361579520,1161999622361579520]
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm10
-; GFNISSE-NEXT:    psrlw $4, %xmm10
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNISSE-NEXT:    pand %xmm9, %xmm10
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm10
 ; GFNISSE-NEXT:    psllw $5, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm4
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [290499906672525312,290499906672525312]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm11
-; GFNISSE-NEXT:    psrlw $2, %xmm11
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNISSE-NEXT:    pand %xmm10, %xmm11
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm11
 ; GFNISSE-NEXT:    paddb %xmm8, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm4
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [145249953336295424,145249953336295424]
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm12
-; GFNISSE-NEXT:    psrlw $1, %xmm12
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNISSE-NEXT:    pand %xmm11, %xmm12
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm12
 ; GFNISSE-NEXT:    paddb %xmm8, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
-; GFNISSE-NEXT:    psrlw $4, %xmm8
-; GFNISSE-NEXT:    pand %xmm9, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm8
 ; GFNISSE-NEXT:    psllw $5, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
-; GFNISSE-NEXT:    psrlw $2, %xmm8
-; GFNISSE-NEXT:    pand %xmm10, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm5, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
-; GFNISSE-NEXT:    psrlw $1, %xmm8
-; GFNISSE-NEXT:    pand %xmm11, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm5, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    psrlw $4, %xmm5
-; GFNISSE-NEXT:    pand %xmm9, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm5
 ; GFNISSE-NEXT:    psllw $5, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    psrlw $2, %xmm5
-; GFNISSE-NEXT:    pand %xmm10, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm6, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    psrlw $1, %xmm5
-; GFNISSE-NEXT:    pand %xmm11, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm6, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
-; GFNISSE-NEXT:    psrlw $4, %xmm5
-; GFNISSE-NEXT:    pand %xmm9, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm5
 ; GFNISSE-NEXT:    psllw $5, %xmm7
 ; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
-; GFNISSE-NEXT:    psrlw $2, %xmm5
-; GFNISSE-NEXT:    pand %xmm10, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm7, %xmm7
 ; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
-; GFNISSE-NEXT:    psrlw $1, %xmm5
-; GFNISSE-NEXT:    pand %xmm11, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm7, %xmm7
 ; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
@@ -1970,59 +1818,50 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNIAVX1-LABEL: var_lshr_v64i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm5, %xmm6
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [1161999622361579520,1161999622361579520]
+; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm5, %xmm6
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm6, %xmm5, %xmm6
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm6, %xmm8
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [290499906672525312,290499906672525312]
+; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm6, %xmm8
 ; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm8, %xmm6, %xmm8
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm8, %xmm9
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [145249953336295424,145249953336295424]
+; GFNIAVX1-NEXT:    # xmm6 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm8, %xmm9
 ; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
 ; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm9, %xmm8, %xmm7
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm0, %xmm8
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm8
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm0, %xmm8
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm0, %xmm8
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm0, %xmm8
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm0, %xmm8
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm2, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm7
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm8
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm8, %xmm8
 ; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm2, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm7
 ; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
 ; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm2, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm2, %xmm7
 ; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
 ; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $4, %xmm1, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm7, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $2, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -2030,31 +1869,25 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_lshr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNIAVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $2, %ymm0, %ymm4
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNIAVX2-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm6
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $1, %ymm0, %ymm4
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX2-NEXT:    vpand %ymm7, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm6, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $4, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm7, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm1, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsrlw $2, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm1, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsrlw $1, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpand %ymm7, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm1, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
@@ -2062,32 +1895,26 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNIAVX512VL-LABEL: var_lshr_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm6
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX512VL-NEXT:    vpand %ymm7, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm6, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm2, %ymm7
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpand %ymm7, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm0, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -2095,21 +1922,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; GFNIAVX512BW-LABEL: var_lshr_v64i8:
 ; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm2
-; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
 ; GFNIAVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
 ; GFNIAVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm2
-; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
 ; GFNIAVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
 ; GFNIAVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm2
-; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
 ; GFNIAVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
 ; GFNIAVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
 ; GFNIAVX512BW-NEXT:    retq
   %shift = lshr <64 x i8> %a, %b
   ret <64 x i8> %shift
@@ -3214,57 +3035,31 @@ define <64 x i8> @constant_ashr_v64i8(<64 x i8> %a) nounwind {
 define <64 x i8> @splatconstant_shl_v64i8(<64 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_shl_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    psllw $5, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
-; GFNISSE-NEXT:    pand %xmm4, %xmm0
-; GFNISSE-NEXT:    psllw $5, %xmm1
-; GFNISSE-NEXT:    pand %xmm4, %xmm1
-; GFNISSE-NEXT:    psllw $5, %xmm2
-; GFNISSE-NEXT:    pand %xmm4, %xmm2
-; GFNISSE-NEXT:    psllw $5, %xmm3
-; GFNISSE-NEXT:    pand %xmm4, %xmm3
+; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm4 = [66052,66052]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm3
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: splatconstant_shl_v64i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4,2,1,0,0,0,0,0,4,2,1,0,0,0,0,0,4,2,1,0,0,0,0,0,4,2,1,0,0,0,0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_shl_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsllw $5, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
-; GFNIAVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [66052,66052,66052,66052]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
 ;
-; GFNIAVX512VL-LABEL: splatconstant_shl_v64i8:
-; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm0, %ymm1
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; GFNIAVX512VL-NEXT:    retq
-;
-; GFNIAVX512BW-LABEL: splatconstant_shl_v64i8:
-; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vpsllw $5, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT:    retq
+; GFNIAVX512-LABEL: splatconstant_shl_v64i8:
+; GFNIAVX512:       # %bb.0:
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; GFNIAVX512-NEXT:    retq
   %shift = shl <64 x i8> %a, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
   ret <64 x i8> %shift
 }
@@ -3272,57 +3067,31 @@ define <64 x i8> @splatconstant_shl_v64i8(<64 x i8> %a) nounwind {
 define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_lshr_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    psrlw $7, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNISSE-NEXT:    pand %xmm4, %xmm0
-; GFNISSE-NEXT:    psrlw $7, %xmm1
-; GFNISSE-NEXT:    pand %xmm4, %xmm1
-; GFNISSE-NEXT:    psrlw $7, %xmm2
-; GFNISSE-NEXT:    pand %xmm4, %xmm2
-; GFNISSE-NEXT:    psrlw $7, %xmm3
-; GFNISSE-NEXT:    pand %xmm4, %xmm3
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm3
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: splatconstant_lshr_v64i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_lshr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $7, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; GFNIAVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $7, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
 ;
-; GFNIAVX512VL-LABEL: splatconstant_lshr_v64i8:
-; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm1
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; GFNIAVX512VL-NEXT:    retq
-;
-; GFNIAVX512BW-LABEL: splatconstant_lshr_v64i8:
-; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vpsrlw $7, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT:    retq
+; GFNIAVX512-LABEL: splatconstant_lshr_v64i8:
+; GFNIAVX512:       # %bb.0:
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; GFNIAVX512-NEXT:    retq
   %shift = lshr <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   ret <64 x i8> %shift
 }
@@ -3330,87 +3099,31 @@ define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) nounwind {
 define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_ashr_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    psrlw $1, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNISSE-NEXT:    pand %xmm4, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; GFNISSE-NEXT:    pxor %xmm5, %xmm0
-; GFNISSE-NEXT:    psubb %xmm5, %xmm0
-; GFNISSE-NEXT:    psrlw $1, %xmm1
-; GFNISSE-NEXT:    pand %xmm4, %xmm1
-; GFNISSE-NEXT:    pxor %xmm5, %xmm1
-; GFNISSE-NEXT:    psubb %xmm5, %xmm1
-; GFNISSE-NEXT:    psrlw $1, %xmm2
-; GFNISSE-NEXT:    pand %xmm4, %xmm2
-; GFNISSE-NEXT:    pxor %xmm5, %xmm2
-; GFNISSE-NEXT:    psubb %xmm5, %xmm2
-; GFNISSE-NEXT:    psrlw $1, %xmm3
-; GFNISSE-NEXT:    pand %xmm4, %xmm3
-; GFNISSE-NEXT:    pxor %xmm5, %xmm3
-; GFNISSE-NEXT:    psubb %xmm5, %xmm3
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [145249953336295552,145249953336295552]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm3
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: splatconstant_ashr_v64i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; GFNIAVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpsubb %xmm4, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpsubb %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [128,128,64,32,16,8,4,2,128,128,64,32,16,8,4,2,128,128,64,32,16,8,4,2,128,128,64,32,16,8,4,2]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: splatconstant_ashr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; GFNIAVX2-NEXT:    vpxor %ymm3, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpsubb %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [145249953336295552,145249953336295552,145249953336295552,145249953336295552]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
 ;
-; GFNIAVX512VL-LABEL: splatconstant_ashr_v64i8:
-; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; GFNIAVX512VL-NEXT:    vpternlogq $108, %ymm2, %ymm3, %ymm1
-; GFNIAVX512VL-NEXT:    vpsubb %ymm3, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpternlogq $108, %ymm2, %ymm3, %ymm0
-; GFNIAVX512VL-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; GFNIAVX512VL-NEXT:    retq
-;
-; GFNIAVX512BW-LABEL: splatconstant_ashr_v64i8:
-; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; GFNIAVX512BW-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
-; GFNIAVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT:    retq
+; GFNIAVX512-LABEL: splatconstant_ashr_v64i8:
+; GFNIAVX512:       # %bb.0:
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; GFNIAVX512-NEXT:    retq
   %shift = ashr <64 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   ret <64 x i8> %shift
 }
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index a026757a0264..5fe1e2996ee9 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -679,9 +679,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX1-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm4
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; AVX1-SLOW-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
 ; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
@@ -704,9 +703,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm4
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; AVX1-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; AVX1-FAST-NEXT:    vphaddd %xmm2, %xmm2, %xmm1
 ; AVX1-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
@@ -727,9 +725,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm4
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; AVX2-SLOW-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
 ; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
@@ -752,9 +749,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX2-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm4
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; AVX2-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; AVX2-FAST-NEXT:    vphaddd %xmm2, %xmm2, %xmm1
 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index 2f780e3c6fe1..f0504e7dbdb6 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -1612,7 +1612,7 @@ define i32 @sext_known_nonzero(i16 %xx) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    cwtl
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
 ;
@@ -1622,7 +1622,7 @@ define i32 @sext_known_nonzero(i16 %xx) {
 ; X64-NEXT:    movl $256, %eax # imm = 0x100
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shll %cl, %eax
-; X64-NEXT:    cwtl
+; X64-NEXT:    movzwl %ax, %eax
 ; X64-NEXT:    rep bsfl %eax, %eax
 ; X64-NEXT:    retq
   %x = shl nuw nsw i16 256, %xx
diff --git a/llvm/test/CodeGen/X86/memcpy-scoped-aa.ll b/llvm/test/CodeGen/X86/memcpy-scoped-aa.ll
index 7765297dc673..d3b86786a630 100644
--- a/llvm/test/CodeGen/X86/memcpy-scoped-aa.ll
+++ b/llvm/test/CodeGen/X86/memcpy-scoped-aa.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=x86_64-linux-gnu -stop-after=finalize-isel -o - %s | FileCheck --check-prefix=MIR %s
 
 ; Ensure that the scoped AA is attached on loads/stores lowered from mem ops.
@@ -10,12 +11,21 @@
 ; MIR-DAG: ![[SET0:[0-9]+]] = !{![[SCOPE0]]}
 ; MIR-DAG: ![[SET1:[0-9]+]] = !{![[SCOPE1]]}
 
-; MIR-LABEL: name: test_memcpy
-; MIR:      %2:gr64 = MOV64rm %0, 1, $noreg, 16, $noreg :: (load (s64) from %ir.p1, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR-NEXT: %3:gr64 = MOV64rm %0, 1, $noreg, 24, $noreg :: (load (s64) from %ir.p1 + 8, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR-NEXT: MOV64mr %0, 1, $noreg, 8, $noreg, killed %3 :: (store (s64) into %ir.p0 + 8, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR-NEXT: MOV64mr %0, 1, $noreg, 0, $noreg, killed %2 :: (store (s64) into %ir.p0, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
 define i32 @test_memcpy(ptr nocapture %p, ptr nocapture readonly %q) {
+  ; MIR-LABEL: name: test_memcpy
+  ; MIR: bb.0 (%ir-block.0):
+  ; MIR-NEXT:   liveins: $rdi, $rsi
+  ; MIR-NEXT: {{  $}}
+  ; MIR-NEXT:   [[COPY:%[0-9]+]]:gr64 = COPY $rsi
+  ; MIR-NEXT:   [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
+  ; MIR-NEXT:   [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 16, $noreg :: (load (s64) from %ir.p1, align 4, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 24, $noreg :: (load (s64) from %ir.p1 + 8, align 4, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   MOV64mr [[COPY1]], 1, $noreg, 8, $noreg, killed [[MOV64rm1]] :: (store (s64) into %ir.p0 + 8, align 4, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   MOV64mr [[COPY1]], 1, $noreg, 0, $noreg, killed [[MOV64rm]] :: (store (s64) into %ir.p0, align 4, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY]], 1, $noreg, 0, $noreg :: (load (s32) from %ir.q, !alias.scope !3, !noalias !0)
+  ; MIR-NEXT:   [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY]], 1, $noreg, 4, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.q1, !alias.scope !3, !noalias !0)
+  ; MIR-NEXT:   $eax = COPY [[ADD32rm]]
+  ; MIR-NEXT:   RET 0, $eax
   %p0 = bitcast ptr %p to ptr
   %add.ptr = getelementptr inbounds i32, ptr %p, i64 4
   %p1 = bitcast ptr %add.ptr to ptr
@@ -27,12 +37,21 @@ define i32 @test_memcpy(ptr nocapture %p, ptr nocapture readonly %q) {
   ret i32 %add
 }
 
-; MIR-LABEL: name: test_memcpy_inline
-; MIR:      %2:gr64 = MOV64rm %0, 1, $noreg, 16, $noreg :: (load (s64) from %ir.p1, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR-NEXT: %3:gr64 = MOV64rm %0, 1, $noreg, 24, $noreg :: (load (s64) from %ir.p1 + 8, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR-NEXT: MOV64mr %0, 1, $noreg, 8, $noreg, killed %3 :: (store (s64) into %ir.p0 + 8, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR-NEXT: MOV64mr %0, 1, $noreg, 0, $noreg, killed %2 :: (store (s64) into %ir.p0, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
 define i32 @test_memcpy_inline(ptr nocapture %p, ptr nocapture readonly %q) {
+  ; MIR-LABEL: name: test_memcpy_inline
+  ; MIR: bb.0 (%ir-block.0):
+  ; MIR-NEXT:   liveins: $rdi, $rsi
+  ; MIR-NEXT: {{  $}}
+  ; MIR-NEXT:   [[COPY:%[0-9]+]]:gr64 = COPY $rsi
+  ; MIR-NEXT:   [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
+  ; MIR-NEXT:   [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 16, $noreg :: (load (s64) from %ir.p1, align 4, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 24, $noreg :: (load (s64) from %ir.p1 + 8, align 4, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   MOV64mr [[COPY1]], 1, $noreg, 8, $noreg, killed [[MOV64rm1]] :: (store (s64) into %ir.p0 + 8, align 4, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   MOV64mr [[COPY1]], 1, $noreg, 0, $noreg, killed [[MOV64rm]] :: (store (s64) into %ir.p0, align 4, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY]], 1, $noreg, 0, $noreg :: (load (s32) from %ir.q, !alias.scope !3, !noalias !0)
+  ; MIR-NEXT:   [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY]], 1, $noreg, 4, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.q1, !alias.scope !3, !noalias !0)
+  ; MIR-NEXT:   $eax = COPY [[ADD32rm]]
+  ; MIR-NEXT:   RET 0, $eax
   %p0 = bitcast ptr %p to ptr
   %add.ptr = getelementptr inbounds i32, ptr %p, i64 4
   %p1 = bitcast ptr %add.ptr to ptr
@@ -44,12 +63,21 @@ define i32 @test_memcpy_inline(ptr nocapture %p, ptr nocapture readonly %q) {
   ret i32 %add
 }
 
-; MIR-LABEL: name: test_memmove
-; MIR:      %2:gr64 = MOV64rm %0, 1, $noreg, 16, $noreg :: (load (s64) from %ir.p1, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR-NEXT: %3:gr64 = MOV64rm %0, 1, $noreg, 24, $noreg :: (load (s64) from %ir.p1 + 8, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR-NEXT: MOV64mr %0, 1, $noreg, 0, $noreg, killed %2 :: (store (s64) into %ir.p0, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR-NEXT: MOV64mr %0, 1, $noreg, 8, $noreg, killed %3 :: (store (s64) into %ir.p0 + 8, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
 define i32 @test_memmove(ptr nocapture %p, ptr nocapture readonly %q) {
+  ; MIR-LABEL: name: test_memmove
+  ; MIR: bb.0 (%ir-block.0):
+  ; MIR-NEXT:   liveins: $rdi, $rsi
+  ; MIR-NEXT: {{  $}}
+  ; MIR-NEXT:   [[COPY:%[0-9]+]]:gr64 = COPY $rsi
+  ; MIR-NEXT:   [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
+  ; MIR-NEXT:   [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 16, $noreg :: (load (s64) from %ir.p1, align 4, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 24, $noreg :: (load (s64) from %ir.p1 + 8, align 4, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   MOV64mr [[COPY1]], 1, $noreg, 0, $noreg, killed [[MOV64rm]] :: (store (s64) into %ir.p0, align 4, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   MOV64mr [[COPY1]], 1, $noreg, 8, $noreg, killed [[MOV64rm1]] :: (store (s64) into %ir.p0 + 8, align 4, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY]], 1, $noreg, 0, $noreg :: (load (s32) from %ir.q, !alias.scope !3, !noalias !0)
+  ; MIR-NEXT:   [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY]], 1, $noreg, 4, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.q1, !alias.scope !3, !noalias !0)
+  ; MIR-NEXT:   $eax = COPY [[ADD32rm]]
+  ; MIR-NEXT:   RET 0, $eax
   %p0 = bitcast ptr %p to ptr
   %add.ptr = getelementptr inbounds i32, ptr %p, i64 4
   %p1 = bitcast ptr %add.ptr to ptr
@@ -61,11 +89,20 @@ define i32 @test_memmove(ptr nocapture %p, ptr nocapture readonly %q) {
   ret i32 %add
 }
 
-; MIR-LABEL: name: test_memset
-; MIR:      %2:gr64 = MOV64ri -6148914691236517206
-; MIR-NEXT: MOV64mr %0, 1, $noreg, 8, $noreg, %2 :: (store (s64) into %ir.p0 + 8, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR-NEXT: MOV64mr %0, 1, $noreg, 0, $noreg, %2 :: (store (s64) into %ir.p0, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
 define i32 @test_memset(ptr nocapture %p, ptr nocapture readonly %q) {
+  ; MIR-LABEL: name: test_memset
+  ; MIR: bb.0 (%ir-block.0):
+  ; MIR-NEXT:   liveins: $rdi, $rsi
+  ; MIR-NEXT: {{  $}}
+  ; MIR-NEXT:   [[COPY:%[0-9]+]]:gr64 = COPY $rsi
+  ; MIR-NEXT:   [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
+  ; MIR-NEXT:   [[MOV64ri:%[0-9]+]]:gr64 = MOV64ri -6148914691236517206
+  ; MIR-NEXT:   MOV64mr [[COPY1]], 1, $noreg, 8, $noreg, [[MOV64ri]] :: (store (s64) into %ir.p0 + 8, align 4, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   MOV64mr [[COPY1]], 1, $noreg, 0, $noreg, [[MOV64ri]] :: (store (s64) into %ir.p0, align 4, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY]], 1, $noreg, 0, $noreg :: (load (s32) from %ir.q, !alias.scope !3, !noalias !0)
+  ; MIR-NEXT:   [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY]], 1, $noreg, 4, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.q1, !alias.scope !3, !noalias !0)
+  ; MIR-NEXT:   $eax = COPY [[ADD32rm]]
+  ; MIR-NEXT:   RET 0, $eax
   %p0 = bitcast ptr %p to ptr
   tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) %p0, i8 170, i64 16, i1 false), !alias.scope !2, !noalias !4
   %v0 = load i32, ptr %q, align 4, !alias.scope !4, !noalias !2
@@ -75,12 +112,21 @@ define i32 @test_memset(ptr nocapture %p, ptr nocapture readonly %q) {
   ret i32 %add
 }
 
-; MIR-LABEL: name: test_mempcpy
-; MIR:      %2:gr64 = MOV64rm %0, 1, $noreg, 16, $noreg :: (load (s64) from %ir.p1, align 1, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR-NEXT: %3:gr64 = MOV64rm %0, 1, $noreg, 24, $noreg :: (load (s64) from %ir.p1 + 8, align 1, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR-NEXT: MOV64mr %0, 1, $noreg, 8, $noreg, killed %3 :: (store (s64) into %ir.p0 + 8, align 1, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR-NEXT: MOV64mr %0, 1, $noreg, 0, $noreg, killed %2 :: (store (s64) into %ir.p0, align 1, !alias.scope ![[SET0]], !noalias ![[SET1]])
 define i32 @test_mempcpy(ptr nocapture %p, ptr nocapture readonly %q) {
+  ; MIR-LABEL: name: test_mempcpy
+  ; MIR: bb.0 (%ir-block.0):
+  ; MIR-NEXT:   liveins: $rdi, $rsi
+  ; MIR-NEXT: {{  $}}
+  ; MIR-NEXT:   [[COPY:%[0-9]+]]:gr64 = COPY $rsi
+  ; MIR-NEXT:   [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
+  ; MIR-NEXT:   [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 16, $noreg :: (load (s64) from %ir.p1, align 1, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 24, $noreg :: (load (s64) from %ir.p1 + 8, align 1, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   MOV64mr [[COPY1]], 1, $noreg, 8, $noreg, killed [[MOV64rm1]] :: (store (s64) into %ir.p0 + 8, align 1, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   MOV64mr [[COPY1]], 1, $noreg, 0, $noreg, killed [[MOV64rm]] :: (store (s64) into %ir.p0, align 1, !alias.scope !0, !noalias !3)
+  ; MIR-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY]], 1, $noreg, 0, $noreg :: (load (s32) from %ir.q, !alias.scope !3, !noalias !0)
+  ; MIR-NEXT:   [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY]], 1, $noreg, 4, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.q1, !alias.scope !3, !noalias !0)
+  ; MIR-NEXT:   $eax = COPY [[ADD32rm]]
+  ; MIR-NEXT:   RET 0, $eax
   %p0 = bitcast ptr %p to ptr
   %add.ptr = getelementptr inbounds i32, ptr %p, i64 4
   %p1 = bitcast ptr %add.ptr to ptr
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index a953c505cd8e..f3a8ca4de997 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -5,10 +5,10 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cooperlake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=cannonlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-client | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=cannonlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-VBMI1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-client | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-GFNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-GFNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-GFNI
 
 ; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled.
 
@@ -2006,12 +2006,31 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector
 }
 
 define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: splatconstant_rotate_v32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsllw $4, %ymm0, %ymm1
-; CHECK-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: splatconstant_rotate_v32i8:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    vpsllw $4, %ymm0, %ymm1
+; CHECK-SKX-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; CHECK-SKX-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: splatconstant_rotate_v32i8:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    vpsllw $4, %ymm0, %ymm1
+; CHECK-AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; CHECK-AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-VBMI1-LABEL: splatconstant_rotate_v32i8:
+; CHECK-VBMI1:       # %bb.0:
+; CHECK-VBMI1-NEXT:    vpsllw $4, %ymm0, %ymm1
+; CHECK-VBMI1-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; CHECK-VBMI1-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; CHECK-VBMI1-NEXT:    retq
+;
+; CHECK-GFNI-LABEL: splatconstant_rotate_v32i8:
+; CHECK-GFNI:       # %bb.0:
+; CHECK-GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; CHECK-GFNI-NEXT:    retq
   %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
   %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
   %or = or <32 x i8> %shl, %lshr
@@ -2019,13 +2038,35 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-v
 }
 
 define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: splatconstant_rotate_mask_v32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsllw $4, %ymm0, %ymm1
-; CHECK-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
-; CHECK-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; CHECK-SKX-LABEL: splatconstant_rotate_mask_v32i8:
+; CHECK-SKX:       # %bb.0:
+; CHECK-SKX-NEXT:    vpsllw $4, %ymm0, %ymm1
+; CHECK-SKX-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; CHECK-SKX-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; CHECK-SKX-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: splatconstant_rotate_mask_v32i8:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    vpsllw $4, %ymm0, %ymm1
+; CHECK-AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; CHECK-AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; CHECK-AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-VBMI1-LABEL: splatconstant_rotate_mask_v32i8:
+; CHECK-VBMI1:       # %bb.0:
+; CHECK-VBMI1-NEXT:    vpsllw $4, %ymm0, %ymm1
+; CHECK-VBMI1-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; CHECK-VBMI1-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; CHECK-VBMI1-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK-VBMI1-NEXT:    retq
+;
+; CHECK-GFNI-LABEL: splatconstant_rotate_mask_v32i8:
+; CHECK-GFNI:       # %bb.0:
+; CHECK-GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; CHECK-GFNI-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK-GFNI-NEXT:    retq
   %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
   %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
   %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
diff --git a/llvm/test/CodeGen/X86/non-value-mem-operand.mir b/llvm/test/CodeGen/X86/non-value-mem-operand.mir
index f188e821c2da..db8f0617a873 100644
--- a/llvm/test/CodeGen/X86/non-value-mem-operand.mir
+++ b/llvm/test/CodeGen/X86/non-value-mem-operand.mir
@@ -1,293 +1,293 @@
-# RUN: llc  -run-pass implicit-null-checks -mtriple=x86_64-apple-macosx -o - %s | FileCheck %s
-
-# CHECK-NOT: FAULTING_OP
-
---- |
-  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-  target triple = "x86_64-unknown-linux-gnu"
-  
-  @global = external global i8*
-  @global.1 = external global i8*
-  
-  declare i8* @ham(i8*, i8**)
-  
-  define void @eggs(i8* %arg) gc "statepoint-example" {
-  bb:
-    %tmp = call i8* undef(i8* undef, i8** undef)
-    %tmp1 = icmp eq i8* %tmp, null
-    br i1 %tmp1, label %bb2, label %bb3, !make.implicit !0
-  
-  bb2:                                              ; preds = %bb
-    br i1 undef, label %bb51, label %bb59
-  
-  bb3:                                              ; preds = %bb
-    %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 16
-    %tmp5 = bitcast i8* %tmp4 to i64*
-    br label %bb7
-  
-  bb7:                                              ; preds = %bb37, %bb3
-    %tmp8 = phi i64* [ %tmp5, %bb3 ], [ %tmp18, %bb37 ]
-    %tmp10 = phi i32 [ undef, %bb3 ], [ %tmp48, %bb37 ]
-    %tmp12 = phi i32 [ 0, %bb3 ], [ 6, %bb37 ]
-    %tmp13 = phi double [ 0.000000e+00, %bb3 ], [ 2.000000e+00, %bb37 ]
-    %tmp14 = zext i32 %tmp10 to i64
-    br i1 undef, label %bb26, label %bb15
-  
-  bb15:                                             ; preds = %bb7
-    %tmp16 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* nonnull elementtype(void ()) @wibble, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 1, i32 0, i32 99, i32 0, i32 12, i32 0, i32 10, i32 %tmp10, i32 10, i32 0, i32 10, i32 %tmp12, i32 10, i32 undef, i32 6, float undef, i32 7, double %tmp13, i32 99, i8* null, i32 7, double undef, i32 99, i8* null, i32 13, i8* %tmp, i32 7, double undef, i32 99, i8* null, i8* undef)]
-    br label %bb26
-  
-  bb26:                                             ; preds = %bb15, %bb7
-    %tmp18 = phi i64* [ %tmp8, %bb7 ], [ undef, %bb15 ]
-    %tmp20 = sub i32 0, 0
-    %tmp21 = select i1 undef, i32 0, i32 %tmp20
-    %tmp22 = sext i32 %tmp21 to i64
-    %tmp23 = load i8*, i8** @global.1, align 8
-    %tmp24 = icmp eq i8* %tmp23, null
-    %tmp25 = select i1 %tmp24, i8* null, i8* undef
-    %tmp27 = load i32, i32* undef, align 4
-    %sunkaddr = mul i64 %tmp14, 8
-    %tmp2 = bitcast i64* %tmp18 to i8*
-    %sunkaddr1 = getelementptr i8, i8* %tmp2, i64 %sunkaddr
-    %tmp3 = bitcast i8* %sunkaddr1 to i64*
-    %tmp28 = load i64, i64* %tmp3, align 8
-    %tmp29 = add i64 %tmp28, 1
-    store i64 %tmp29, i64* %tmp3, align 8
-    %tmp30 = trunc i64 %tmp28 to i32
-    %tmp31 = sub i32 %tmp27, %tmp30
-    store i32 %tmp31, i32* undef, align 4
-    %tmp32 = getelementptr inbounds i8, i8* %tmp25, i64 768
-    %tmp33 = bitcast i8* %tmp32 to i64*
-    %tmp34 = load i64, i64* %tmp33, align 8
-    br i1 undef, label %bb37, label %bb35
-  
-  bb35:                                             ; preds = %bb26
-    %tmp36 = call i8* @ham(i8* undef, i8** nonnull @global)
-    br label %bb37
-  
-  bb37:                                             ; preds = %bb35, %bb26
-    %tmp38 = phi i8* [ %tmp36, %bb35 ], [ undef, %bb26 ]
-    %tmp39 = getelementptr inbounds i8, i8* %tmp38, i64 760
-    %tmp40 = bitcast i8* %tmp39 to i64*
-    %tmp41 = load i64, i64* %tmp40, align 8
-    %tmp42 = icmp slt i64 %tmp34, %tmp41
-    %tmp43 = select i1 %tmp42, i64 %tmp41, i64 %tmp34
-    %tmp44 = and i64 %tmp43, 63
-    %tmp45 = ashr i64 %tmp29, %tmp44
-    %sunkaddr2 = mul i64 %tmp14, 8
-    %tmp6 = bitcast i64* %tmp18 to i8*
-    %sunkaddr3 = getelementptr i8, i8* %tmp6, i64 %sunkaddr2
-    %tmp7 = bitcast i8* %sunkaddr3 to i64*
-    store i64 %tmp45, i64* %tmp7, align 8
-    %tmp46 = sub i64 0, %tmp22
-    store i64 %tmp46, i64* undef, align 8
-    %tmp47 = add nsw i32 %tmp12, 1
-    %tmp48 = add i32 %tmp10, 1
-    %tmp49 = icmp sgt i32 %tmp48, 15140
-    br i1 %tmp49, label %bb51.loopexit, label %bb7
-  
-  bb51.loopexit:                                    ; preds = %bb37
-    %tmp9 = add i32 %tmp10, 1
-    br label %bb51
-  
-  bb51:                                             ; preds = %bb51.loopexit, %bb2
-    %tmp52 = phi i32 [ %tmp47, %bb51.loopexit ], [ 0, %bb2 ]
-    %tmp53 = phi double [ 2.000000e+00, %bb51.loopexit ], [ 0.000000e+00, %bb2 ]
-    %tmp54 = phi i32 [ %tmp9, %bb51.loopexit ], [ undef, %bb2 ]
-    %tmp56 = add i32 %tmp54, 0
-    %tmp57 = call token (i64, i32, void (i32)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64 2882400000, i32 0, void (i32)* nonnull elementtype(void (i32)) @wobble, i32 1, i32 0, i32 -121, i32 0, i32 0) ["deopt" (i32 1, i32 0, i32 270, i32 4, i32 12, i32 0, i32 11, i64 undef, i32 99, i8* null, i32 10, i32 %tmp56, i32 6, float undef, i32 99, i8* null, i32 99, i8* null, i32 10, i32 %tmp52, i32 10, i32 undef, i32 99, i8* null, i32 7, double %tmp53, i32 99, i8* null, i32 7, double undef, i32 99, i8* null, i32 13, i8* undef, i32 99, i8* null, i32 99, i8* null, i8* undef)]
-    unreachable
-  
-  bb59:                                             ; preds = %bb2
-    %tmp61 = call token (i64, i32, void (i32)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64 2882400000, i32 0, void (i32)* nonnull elementtype(void (i32)) @wobble, i32 1, i32 0, i32 8, i32 0, i32 0) ["deopt" (i32 1, i32 0, i32 123, i32 4, i32 12, i32 0, i32 13, i8* null, i32 99, i32 undef, i32 13, i8* null, i32 10, i32 undef, i32 99, i32 undef, i32 99, i32 undef, i32 99, i32 undef, i32 99, i8* null, i32 99, float undef, i32 99, double undef, i32 99, i8* null, i32 99, double undef, i32 99, i8* null, i32 13, i8* null, i32 99, double undef, i32 99, i8* null)]
-    unreachable
-  }
-  
-  declare void @wibble()
-  
-  declare void @wobble(i32)
-  
-  declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...)
-  
-  declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
-  
-  ; Function Attrs: nounwind
-  declare void @llvm.stackprotector(i8*, i8**) #0
-  
-  attributes #0 = { nounwind }
-  
-  !0 = !{}
-...
----
-name:            eggs
-alignment:       16
-tracksRegLiveness: true
-fixedStack:      
-  - { id: 0, type: spill-slot, offset: -56, size: 8, alignment: 8, callee-saved-register: '$rbx' }
-  - { id: 1, type: spill-slot, offset: -48, size: 8, alignment: 16, callee-saved-register: '$r12' }
-  - { id: 2, type: spill-slot, offset: -40, size: 8, alignment: 8, callee-saved-register: '$r13' }
-  - { id: 3, type: spill-slot, offset: -32, size: 8, alignment: 16, callee-saved-register: '$r14' }
-  - { id: 4, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '$r15' }
-  - { id: 5, type: spill-slot, offset: -16, size: 8, alignment: 16, callee-saved-register: '$rbp' }
-stack:           
-  - { id: 0, offset: -88, size: 8, alignment: 8 }
-  - { id: 1, offset: -96, size: 8, alignment: 8 }
-  - { id: 2, offset: -104, size: 8, alignment: 8 }
-  - { id: 3, offset: -64, size: 8, alignment: 8 }
-  - { id: 4, type: spill-slot, offset: -72, size: 8, alignment: 8 }
-  - { id: 5, type: spill-slot, offset: -80, size: 8, alignment: 8 }
-constants:       
-  - id:              0
-    value:           'double 2.000000e+00'
-    alignment:       8
-body:             |
-  bb.0.bb:
-    successors: %bb.1.bb2(0x00000800), %bb.3.bb3(0x7ffff800)
-    liveins: $rbp, $r15, $r14, $r13, $r12, $rbx
-  
-    frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp
-    frame-setup PUSH64r killed $r15, implicit-def $rsp, implicit $rsp
-    frame-setup PUSH64r killed $r14, implicit-def $rsp, implicit $rsp
-    frame-setup PUSH64r killed $r13, implicit-def $rsp, implicit $rsp
-    frame-setup PUSH64r killed $r12, implicit-def $rsp, implicit $rsp
-    frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp
-    $rsp = frame-setup SUB64ri8 $rsp, 56, implicit-def dead $eflags
-    CALL64r undef $rax, csr_64, implicit $rsp, implicit undef $rdi, implicit undef $rsi, implicit-def $rsp, implicit-def $rax
-    TEST64rr $rax, $rax, implicit-def $eflags
-    JCC_1 %bb.3.bb3, 5, implicit killed $eflags
-  
-  bb.1.bb2:
-    successors: %bb.2(0x40000000), %bb.13.bb59(0x40000000)
-  
-    $ebp = XOR32rr undef $ebp, undef $ebp, implicit-def dead $eflags
-    TEST8rr $bpl, $bpl, implicit-def $eflags
-    JCC_1 %bb.13.bb59, 4, implicit killed $eflags
-  
-  bb.2:
-    successors: %bb.12.bb51(0x80000000)
-    liveins: $ebp
-  
-    $xmm0 = XORPSrr undef $xmm0, undef $xmm0
-    $ebx = IMPLICIT_DEF implicit-def $rbx
-    JMP_1 %bb.12.bb51
-  
-  bb.3.bb3:
-    successors: %bb.4.bb7(0x80000000)
-    liveins: $rax
-  
-    MOV64mr $rsp, 1, $noreg, 32, $noreg, $rax :: (store (s64) into %stack.5)
-    $r12 = MOV64rr killed $rax
-    $r12 = ADD64ri8 killed $r12, 16, implicit-def dead $eflags
-    $xmm0 = XORPSrr undef $xmm0, undef $xmm0
-    $esi = XOR32rr undef $esi, undef $esi, implicit-def dead $eflags
-    $rax = MOV64ri %const.0
-    $xmm1 = MOVSDrm_alt killed $rax, 1, $noreg, 0, $noreg :: (load (s64) from constant-pool)
-    MOVSDmr $rsp, 1, $noreg, 40, $noreg, killed $xmm1 :: (store (s64) into %stack.4)
-    $eax = IMPLICIT_DEF
-    $ecx = XOR32rr undef $ecx, undef $ecx, implicit-def dead $eflags
-  
-  bb.4.bb7:
-    successors: %bb.6.bb26(0x40000000), %bb.5.bb15(0x40000000)
-    liveins: $eax, $ecx, $esi, $r12, $xmm0
-  
-    $ebp = MOV32rr killed $ecx
-    $ebx = MOV32rr killed $eax, implicit-def $rbx
-    $r14d = MOV32rr $ebx, implicit-def $r14
-    TEST8rr $sil, $sil, implicit-def $eflags
-    JCC_1 %bb.6.bb26, 5, implicit $eflags
-  
-  bb.5.bb15:
-    successors: %bb.6.bb26(0x80000000)
-    liveins: $ebp, $rbx, $r14, $xmm0
-  
-    MOV32mr $rsp, 1, $noreg, 24, $noreg, $ebx :: (store (s32) into %stack.0, align 8)
-    MOV32mr $rsp, 1, $noreg, 16, $noreg, $ebp :: (store (s32) into %stack.1, align 8)
-    MOVSDmr $rsp, 1, $noreg, 8, $noreg, killed $xmm0 :: (store (s64) into %stack.2)
-    $rax = MOV64rm $rsp, 1, $noreg, 32, $noreg :: (load (s64) from %stack.5)
-    MOV64mr $rsp, 1, $noreg, 48, $noreg, killed $rax :: (store (s64) into %stack.3)
-    $rax = MOV64ri @wibble
-    STATEPOINT 2882400000, 0, 0, killed $rax, 2, 0, 2, 0, 2, 30, 2, 1, 2, 0, 2, 99, 2, 0, 2, 12, 2, 0, 2, 10, 1, 8, $rsp, 24, 2, 10, 2, 0, 2, 10, 1, 8, $rsp, 16, 2, 10, 2, 4278124286, 2, 6, 2, 4278124286, 2, 7, 1, 8, $rsp, 8, 2, 99, 2, 0, 2, 7, 2, 4278124286, 2, 99, 2, 0, 2, 13, 1, 8, $rsp, 48, 2, 7, 2, 4278124286, 2, 99, 2, 0, 2, 0, 2, 0, 2, 0, csr_64, implicit-def $rsp :: (volatile load (s64) from %stack.0), (volatile load (s64) from %stack.1), (volatile load (s64) from %stack.2), (volatile load (s64) from %stack.3)
-    $esi = XOR32rr undef $esi, undef $esi, implicit-def dead $eflags
-    $r12 = IMPLICIT_DEF
-  
-  bb.6.bb26:
-    successors: %bb.8.bb37(0x40000000), %bb.7.bb35(0x40000000)
-    liveins: $ebp, $esi, $rbx, $r12, $r14
-  
-    $rax = MOV64ri @global.1
-    $rax = MOV64rm killed $rax, 1, $noreg, 0, $noreg :: (dereferenceable load (s64) from @global.1)
-    TEST64rr $rax, $rax, implicit-def $eflags
-    $rax = CMOV64rr undef $rax, killed $rax, 4, implicit killed $eflags
-    $ecx = MOV32rm undef $rax, 1, $noreg, 0, $noreg :: (load (s32) from `i32* undef`)
-    $rdx = MOV64rm $r12, 8, $r14, 0, $noreg :: (load (s64) from %ir.tmp3)
-    $r15 = LEA64r $rdx, 1, $noreg, 1, _
-    MOV64mr $r12, 8, $r14, 0, $noreg, $r15 :: (store (s64) into %ir.tmp3)
-    $ecx = SUB32rr killed $ecx, $edx, implicit-def dead $eflags, implicit killed $rdx
-    MOV32mr undef $rax, 1, $noreg, 0, $noreg, killed $ecx :: (store (s32) into `i32* undef`)
-    $r13 = MOV64rm killed $rax, 1, $noreg, 768, $noreg :: (load (s64) from %ir.tmp33)
-    TEST8rr $sil, $sil, implicit-def $eflags
-    $rax = IMPLICIT_DEF
-    JCC_1 %bb.8.bb37, 5, implicit $eflags
-  
-  bb.7.bb35:
-    successors: %bb.8.bb37(0x80000000)
-    liveins: $ebp, $rbx, $r12, $r13, $r14, $r15
-  
-    $rsi = MOV64ri @global
-    $rax = MOV64ri @ham
-    CALL64r killed $rax, csr_64, implicit $rsp, implicit undef $rdi, implicit $rsi, implicit-def $rsp, implicit-def $rax
-    $esi = XOR32rr undef $esi, undef $esi, implicit-def dead $eflags
-  
-  bb.8.bb37:
-    successors: %bb.9.bb37(0x40000000), %bb.10.bb37(0x40000000)
-    liveins: $ebp, $esi, $rax, $rbx, $r12, $r13, $r14, $r15
-  
-    $rcx = MOV64rm killed $rax, 1, $noreg, 760, $noreg :: (load (s64) from %ir.tmp40)
-    CMP64rr $r13, $rcx, implicit-def $eflags
-    JCC_1 %bb.10.bb37, 12, implicit $eflags
-  
-  bb.9.bb37:
-    successors: %bb.10.bb37(0x80000000)
-    liveins: $ebp, $esi, $rbx, $r12, $r13, $r14, $r15
-  
-    $cl = MOV8rr $r13b, implicit killed $r13, implicit-def $rcx
-  
-  bb.10.bb37:
-    successors: %bb.11.bb51.loopexit(0x00000800), %bb.4.bb7(0x7ffff800)
-    liveins: $ebp, $esi, $rbx, $rcx, $r12, $r14, $r15
-  
-    $cl = KILL $cl, implicit killed $rcx
-    $r15 = SAR64rCL killed $r15, implicit-def dead $eflags, implicit $cl
-    MOV64mr $r12, 8, killed $r14, 0, $noreg, killed $r15 :: (store (s64) into %ir.tmp7)
-    MOV64mi32 undef $rax, 1, $noreg, 0, $noreg, 0 :: (store (s64) into `i64* undef`)
-    $eax = LEA64_32r $rbx, 1, $noreg, 1, _
-    $ecx = MOV32ri 6
-    CMP32ri $eax, 15141, implicit-def $eflags
-    $xmm0 = MOVSDrm_alt $rsp, 1, $noreg, 40, $noreg :: (load (s64) from %stack.4)
-    JCC_1 %bb.4.bb7, 12, implicit $eflags
-  
-  bb.11.bb51.loopexit:
-    successors: %bb.12.bb51(0x80000000)
-    liveins: $ebp, $rbx
-  
-    $ebp = INC32r killed $ebp, implicit-def dead $eflags
-    $ebx = INC32r $ebx, implicit-def dead $eflags, implicit killed $rbx, implicit-def $rbx
-    $rax = MOV64ri %const.0
-    $xmm0 = MOVSDrm_alt killed $rax, 1, $noreg, 0, $noreg :: (load (s64) from constant-pool)
-  
-  bb.12.bb51:
-    liveins: $ebp, $rbx, $xmm0
-  
-    MOV32mr $rsp, 1, $noreg, 24, $noreg, $ebx, implicit killed $rbx :: (store (s32) into %stack.0, align 8)
-    MOV32mr $rsp, 1, $noreg, 16, $noreg, killed $ebp :: (store (s32) into %stack.1, align 8)
-    MOVSDmr $rsp, 1, $noreg, 8, $noreg, killed $xmm0 :: (store (s64) into %stack.2)
-    $rax = MOV64ri @wobble
-    $edi = MOV32ri -121
-    STATEPOINT 2882400000, 0, 1, killed $rax, $edi, 2, 0, 2, 0, 2, 38, 2, 1, 2, 0, 2, 270, 2, 4, 2, 12, 2, 0, 2, 11, 2, 4278124286, 2, 99, 2, 0, 2, 10, 1, 8, $rsp, 24, 2, 6, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 0, 2, 10, 1, 8, $rsp, 16, 2, 10, 2, 4278124286, 2, 99, 2, 0, 2, 7, 1, 8, $rsp, 8, 2, 99, 2, 0, 2, 7, 2, 4278124286, 2, 99, 2, 0, 2, 13, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 0, 2, 0, 2, 0, 2, 0, csr_64, implicit-def $rsp :: (volatile load (s64) from %stack.0), (volatile load (s64) from %stack.1), (volatile load (s64) from %stack.2)
-  
-  bb.13.bb59:
-    $rax = MOV64ri @wobble
-    $edi = MOV32ri 8
-    STATEPOINT 2882400000, 0, 1, killed $rax, $edi, 2, 0, 2, 0, 2, 38, 2, 1, 2, 0, 2, 123, 2, 4, 2, 12, 2, 0, 2, 13, 2, 0, 2, 99, 2, 4278124286, 2, 13, 2, 0, 2, 10, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 4278124286, 2, 99, 2, 0, 2, 13, 2, 0, 2, 99, 2, 4278124286, 2, 99, 2, 0, 2, 0, 2, 0, 2, 0, csr_64, implicit-def $rsp
-
-...
+# RUN: llc  -run-pass implicit-null-checks -mtriple=x86_64-apple-macosx -o - %s | FileCheck %s
+
+# CHECK-NOT: FAULTING_OP
+
+--- |
+  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-unknown-linux-gnu"
+
+  @global = external global i8*
+  @global.1 = external global i8*
+
+  declare i8* @ham(i8*, i8**)
+
+  define void @eggs(i8* %arg) gc "statepoint-example" {
+  bb:
+    %tmp = call i8* undef(i8* undef, i8** undef)
+    %tmp1 = icmp eq i8* %tmp, null
+    br i1 %tmp1, label %bb2, label %bb3, !make.implicit !0
+
+  bb2:                                              ; preds = %bb
+    br i1 undef, label %bb51, label %bb59
+
+  bb3:                                              ; preds = %bb
+    %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 16
+    %tmp5 = bitcast i8* %tmp4 to i64*
+    br label %bb7
+
+  bb7:                                              ; preds = %bb37, %bb3
+    %tmp8 = phi i64* [ %tmp5, %bb3 ], [ %tmp18, %bb37 ]
+    %tmp10 = phi i32 [ undef, %bb3 ], [ %tmp48, %bb37 ]
+    %tmp12 = phi i32 [ 0, %bb3 ], [ 6, %bb37 ]
+    %tmp13 = phi double [ 0.000000e+00, %bb3 ], [ 2.000000e+00, %bb37 ]
+    %tmp14 = zext i32 %tmp10 to i64
+    br i1 undef, label %bb26, label %bb15
+
+  bb15:                                             ; preds = %bb7
+    %tmp16 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* nonnull elementtype(void ()) @wibble, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 1, i32 0, i32 99, i32 0, i32 12, i32 0, i32 10, i32 %tmp10, i32 10, i32 0, i32 10, i32 %tmp12, i32 10, i32 undef, i32 6, float undef, i32 7, double %tmp13, i32 99, i8* null, i32 7, double undef, i32 99, i8* null, i32 13, i8* %tmp, i32 7, double undef, i32 99, i8* null, i8* undef)]
+    br label %bb26
+
+  bb26:                                             ; preds = %bb15, %bb7
+    %tmp18 = phi i64* [ %tmp8, %bb7 ], [ undef, %bb15 ]
+    %tmp20 = sub i32 0, 0
+    %tmp21 = select i1 undef, i32 0, i32 %tmp20
+    %tmp22 = sext i32 %tmp21 to i64
+    %tmp23 = load i8*, i8** @global.1, align 8
+    %tmp24 = icmp eq i8* %tmp23, null
+    %tmp25 = select i1 %tmp24, i8* null, i8* undef
+    %tmp27 = load i32, i32* undef, align 4
+    %sunkaddr = mul i64 %tmp14, 8
+    %tmp2 = bitcast i64* %tmp18 to i8*
+    %sunkaddr1 = getelementptr i8, i8* %tmp2, i64 %sunkaddr
+    %tmp3 = bitcast i8* %sunkaddr1 to i64*
+    %tmp28 = load i64, i64* %tmp3, align 8
+    %tmp29 = add i64 %tmp28, 1
+    store i64 %tmp29, i64* %tmp3, align 8
+    %tmp30 = trunc i64 %tmp28 to i32
+    %tmp31 = sub i32 %tmp27, %tmp30
+    store i32 %tmp31, i32* undef, align 4
+    %tmp32 = getelementptr inbounds i8, i8* %tmp25, i64 768
+    %tmp33 = bitcast i8* %tmp32 to i64*
+    %tmp34 = load i64, i64* %tmp33, align 8
+    br i1 undef, label %bb37, label %bb35
+
+  bb35:                                             ; preds = %bb26
+    %tmp36 = call i8* @ham(i8* undef, i8** nonnull @global)
+    br label %bb37
+
+  bb37:                                             ; preds = %bb35, %bb26
+    %tmp38 = phi i8* [ %tmp36, %bb35 ], [ undef, %bb26 ]
+    %tmp39 = getelementptr inbounds i8, i8* %tmp38, i64 760
+    %tmp40 = bitcast i8* %tmp39 to i64*
+    %tmp41 = load i64, i64* %tmp40, align 8
+    %tmp42 = icmp slt i64 %tmp34, %tmp41
+    %tmp43 = select i1 %tmp42, i64 %tmp41, i64 %tmp34
+    %tmp44 = and i64 %tmp43, 63
+    %tmp45 = ashr i64 %tmp29, %tmp44
+    %sunkaddr2 = mul i64 %tmp14, 8
+    %tmp6 = bitcast i64* %tmp18 to i8*
+    %sunkaddr3 = getelementptr i8, i8* %tmp6, i64 %sunkaddr2
+    %tmp7 = bitcast i8* %sunkaddr3 to i64*
+    store i64 %tmp45, i64* %tmp7, align 8
+    %tmp46 = sub i64 0, %tmp22
+    store i64 %tmp46, i64* undef, align 8
+    %tmp47 = add nsw i32 %tmp12, 1
+    %tmp48 = add i32 %tmp10, 1
+    %tmp49 = icmp sgt i32 %tmp48, 15140
+    br i1 %tmp49, label %bb51.loopexit, label %bb7
+
+  bb51.loopexit:                                    ; preds = %bb37
+    %tmp9 = add i32 %tmp10, 1
+    br label %bb51
+
+  bb51:                                             ; preds = %bb51.loopexit, %bb2
+    %tmp52 = phi i32 [ %tmp47, %bb51.loopexit ], [ 0, %bb2 ]
+    %tmp53 = phi double [ 2.000000e+00, %bb51.loopexit ], [ 0.000000e+00, %bb2 ]
+    %tmp54 = phi i32 [ %tmp9, %bb51.loopexit ], [ undef, %bb2 ]
+    %tmp56 = add i32 %tmp54, 0
+    %tmp57 = call token (i64, i32, void (i32)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64 2882400000, i32 0, void (i32)* nonnull elementtype(void (i32)) @wobble, i32 1, i32 0, i32 -121, i32 0, i32 0) ["deopt" (i32 1, i32 0, i32 270, i32 4, i32 12, i32 0, i32 11, i64 undef, i32 99, i8* null, i32 10, i32 %tmp56, i32 6, float undef, i32 99, i8* null, i32 99, i8* null, i32 10, i32 %tmp52, i32 10, i32 undef, i32 99, i8* null, i32 7, double %tmp53, i32 99, i8* null, i32 7, double undef, i32 99, i8* null, i32 13, i8* undef, i32 99, i8* null, i32 99, i8* null, i8* undef)]
+    unreachable
+
+  bb59:                                             ; preds = %bb2
+    %tmp61 = call token (i64, i32, void (i32)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64 2882400000, i32 0, void (i32)* nonnull elementtype(void (i32)) @wobble, i32 1, i32 0, i32 8, i32 0, i32 0) ["deopt" (i32 1, i32 0, i32 123, i32 4, i32 12, i32 0, i32 13, i8* null, i32 99, i32 undef, i32 13, i8* null, i32 10, i32 undef, i32 99, i32 undef, i32 99, i32 undef, i32 99, i32 undef, i32 99, i8* null, i32 99, float undef, i32 99, double undef, i32 99, i8* null, i32 99, double undef, i32 99, i8* null, i32 13, i8* null, i32 99, double undef, i32 99, i8* null)]
+    unreachable
+  }
+
+  declare void @wibble()
+
+  declare void @wobble(i32)
+
+  declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...)
+
+  declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #0
+
+  attributes #0 = { nounwind }
+
+  !0 = !{}
+...
+---
+name:            eggs
+alignment:       16
+tracksRegLiveness: true
+fixedStack:
+  - { id: 0, type: spill-slot, offset: -56, size: 8, alignment: 8, callee-saved-register: '$rbx' }
+  - { id: 1, type: spill-slot, offset: -48, size: 8, alignment: 16, callee-saved-register: '$r12' }
+  - { id: 2, type: spill-slot, offset: -40, size: 8, alignment: 8, callee-saved-register: '$r13' }
+  - { id: 3, type: spill-slot, offset: -32, size: 8, alignment: 16, callee-saved-register: '$r14' }
+  - { id: 4, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '$r15' }
+  - { id: 5, type: spill-slot, offset: -16, size: 8, alignment: 16, callee-saved-register: '$rbp' }
+stack:
+  - { id: 0, offset: -88, size: 8, alignment: 8 }
+  - { id: 1, offset: -96, size: 8, alignment: 8 }
+  - { id: 2, offset: -104, size: 8, alignment: 8 }
+  - { id: 3, offset: -64, size: 8, alignment: 8 }
+  - { id: 4, type: spill-slot, offset: -72, size: 8, alignment: 8 }
+  - { id: 5, type: spill-slot, offset: -80, size: 8, alignment: 8 }
+constants:
+  - id:              0
+    value:           'double 2.000000e+00'
+    alignment:       8
+body:             |
+  bb.0.bb:
+    successors: %bb.1.bb2(0x00000800), %bb.3.bb3(0x7ffff800)
+    liveins: $rbp, $r15, $r14, $r13, $r12, $rbx
+
+    frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp
+    frame-setup PUSH64r killed $r15, implicit-def $rsp, implicit $rsp
+    frame-setup PUSH64r killed $r14, implicit-def $rsp, implicit $rsp
+    frame-setup PUSH64r killed $r13, implicit-def $rsp, implicit $rsp
+    frame-setup PUSH64r killed $r12, implicit-def $rsp, implicit $rsp
+    frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp
+    $rsp = frame-setup SUB64ri8 $rsp, 56, implicit-def dead $eflags
+    CALL64r undef $rax, csr_64, implicit $rsp, implicit undef $rdi, implicit undef $rsi, implicit-def $rsp, implicit-def $rax
+    TEST64rr $rax, $rax, implicit-def $eflags
+    JCC_1 %bb.3.bb3, 5, implicit killed $eflags
+
+  bb.1.bb2:
+    successors: %bb.2(0x40000000), %bb.13.bb59(0x40000000)
+
+    $ebp = XOR32rr undef $ebp, undef $ebp, implicit-def dead $eflags
+    TEST8rr $bpl, $bpl, implicit-def $eflags
+    JCC_1 %bb.13.bb59, 4, implicit killed $eflags
+
+  bb.2:
+    successors: %bb.12.bb51(0x80000000)
+    liveins: $ebp
+
+    $xmm0 = XORPSrr undef $xmm0, undef $xmm0
+    $ebx = IMPLICIT_DEF implicit-def $rbx
+    JMP_1 %bb.12.bb51
+
+  bb.3.bb3:
+    successors: %bb.4.bb7(0x80000000)
+    liveins: $rax
+
+    MOV64mr $rsp, 1, $noreg, 32, $noreg, $rax :: (store (s64) into %stack.5)
+    $r12 = MOV64rr killed $rax
+    $r12 = ADD64ri8 killed $r12, 16, implicit-def dead $eflags
+    $xmm0 = XORPSrr undef $xmm0, undef $xmm0
+    $esi = XOR32rr undef $esi, undef $esi, implicit-def dead $eflags
+    $rax = MOV64ri %const.0
+    $xmm1 = MOVSDrm_alt killed $rax, 1, $noreg, 0, $noreg :: (load (s64) from constant-pool)
+    MOVSDmr $rsp, 1, $noreg, 40, $noreg, killed $xmm1 :: (store (s64) into %stack.4)
+    $eax = IMPLICIT_DEF
+    $ecx = XOR32rr undef $ecx, undef $ecx, implicit-def dead $eflags
+
+  bb.4.bb7:
+    successors: %bb.6.bb26(0x40000000), %bb.5.bb15(0x40000000)
+    liveins: $eax, $ecx, $esi, $r12, $xmm0
+
+    $ebp = MOV32rr killed $ecx
+    $ebx = MOV32rr killed $eax, implicit-def $rbx
+    $r14d = MOV32rr $ebx, implicit-def $r14
+    TEST8rr $sil, $sil, implicit-def $eflags
+    JCC_1 %bb.6.bb26, 5, implicit $eflags
+
+  bb.5.bb15:
+    successors: %bb.6.bb26(0x80000000)
+    liveins: $ebp, $rbx, $r14, $xmm0
+
+    MOV32mr $rsp, 1, $noreg, 24, $noreg, $ebx :: (store (s32) into %stack.0, align 8)
+    MOV32mr $rsp, 1, $noreg, 16, $noreg, $ebp :: (store (s32) into %stack.1, align 8)
+    MOVSDmr $rsp, 1, $noreg, 8, $noreg, killed $xmm0 :: (store (s64) into %stack.2)
+    $rax = MOV64rm $rsp, 1, $noreg, 32, $noreg :: (load (s64) from %stack.5)
+    MOV64mr $rsp, 1, $noreg, 48, $noreg, killed $rax :: (store (s64) into %stack.3)
+    $rax = MOV64ri @wibble
+    STATEPOINT 2882400000, 0, 0, killed $rax, 2, 0, 2, 0, 2, 30, 2, 1, 2, 0, 2, 99, 2, 0, 2, 12, 2, 0, 2, 10, 1, 8, $rsp, 24, 2, 10, 2, 0, 2, 10, 1, 8, $rsp, 16, 2, 10, 2, 4278124286, 2, 6, 2, 4278124286, 2, 7, 1, 8, $rsp, 8, 2, 99, 2, 0, 2, 7, 2, 4278124286, 2, 99, 2, 0, 2, 13, 1, 8, $rsp, 48, 2, 7, 2, 4278124286, 2, 99, 2, 0, 2, 0, 2, 0, 2, 0, csr_64, implicit-def $rsp :: (volatile load (s64) from %stack.0), (volatile load (s64) from %stack.1), (volatile load (s64) from %stack.2), (volatile load (s64) from %stack.3)
+    $esi = XOR32rr undef $esi, undef $esi, implicit-def dead $eflags
+    $r12 = IMPLICIT_DEF
+
+  bb.6.bb26:
+    successors: %bb.8.bb37(0x40000000), %bb.7.bb35(0x40000000)
+    liveins: $ebp, $esi, $rbx, $r12, $r14
+
+    $rax = MOV64ri @global.1
+    $rax = MOV64rm killed $rax, 1, $noreg, 0, $noreg :: (dereferenceable load (s64) from @global.1)
+    TEST64rr $rax, $rax, implicit-def $eflags
+    $rax = CMOV64rr undef $rax, killed $rax, 4, implicit killed $eflags
+    $ecx = MOV32rm undef $rax, 1, $noreg, 0, $noreg :: (load (s32) from `i32* undef`)
+    $rdx = MOV64rm $r12, 8, $r14, 0, $noreg :: (load (s64) from %ir.tmp3)
+    $r15 = LEA64r $rdx, 1, $noreg, 1, _
+    MOV64mr $r12, 8, $r14, 0, $noreg, $r15 :: (store (s64) into %ir.tmp3)
+    $ecx = SUB32rr killed $ecx, $edx, implicit-def dead $eflags, implicit killed $rdx
+    MOV32mr undef $rax, 1, $noreg, 0, $noreg, killed $ecx :: (store (s32) into `i32* undef`)
+    $r13 = MOV64rm killed $rax, 1, $noreg, 768, $noreg :: (load (s64) from %ir.tmp33)
+    TEST8rr $sil, $sil, implicit-def $eflags
+    $rax = IMPLICIT_DEF
+    JCC_1 %bb.8.bb37, 5, implicit $eflags
+
+  bb.7.bb35:
+    successors: %bb.8.bb37(0x80000000)
+    liveins: $ebp, $rbx, $r12, $r13, $r14, $r15
+
+    $rsi = MOV64ri @global
+    $rax = MOV64ri @ham
+    CALL64r killed $rax, csr_64, implicit $rsp, implicit undef $rdi, implicit $rsi, implicit-def $rsp, implicit-def $rax
+    $esi = XOR32rr undef $esi, undef $esi, implicit-def dead $eflags
+
+  bb.8.bb37:
+    successors: %bb.9.bb37(0x40000000), %bb.10.bb37(0x40000000)
+    liveins: $ebp, $esi, $rax, $rbx, $r12, $r13, $r14, $r15
+
+    $rcx = MOV64rm killed $rax, 1, $noreg, 760, $noreg :: (load (s64) from %ir.tmp40)
+    CMP64rr $r13, $rcx, implicit-def $eflags
+    JCC_1 %bb.10.bb37, 12, implicit $eflags
+
+  bb.9.bb37:
+    successors: %bb.10.bb37(0x80000000)
+    liveins: $ebp, $esi, $rbx, $r12, $r13, $r14, $r15
+
+    $cl = MOV8rr $r13b, implicit killed $r13, implicit-def $rcx
+
+  bb.10.bb37:
+    successors: %bb.11.bb51.loopexit(0x00000800), %bb.4.bb7(0x7ffff800)
+    liveins: $ebp, $esi, $rbx, $rcx, $r12, $r14, $r15
+
+    $cl = KILL $cl, implicit killed $rcx
+    $r15 = SAR64rCL killed $r15, implicit-def dead $eflags, implicit $cl
+    MOV64mr $r12, 8, killed $r14, 0, $noreg, killed $r15 :: (store (s64) into %ir.tmp7)
+    MOV64mi32 undef $rax, 1, $noreg, 0, $noreg, 0 :: (store (s64) into `i64* undef`)
+    $eax = LEA64_32r $rbx, 1, $noreg, 1, _
+    $ecx = MOV32ri 6
+    CMP32ri $eax, 15141, implicit-def $eflags
+    $xmm0 = MOVSDrm_alt $rsp, 1, $noreg, 40, $noreg :: (load (s64) from %stack.4)
+    JCC_1 %bb.4.bb7, 12, implicit $eflags
+
+  bb.11.bb51.loopexit:
+    successors: %bb.12.bb51(0x80000000)
+    liveins: $ebp, $rbx
+
+    $ebp = INC32r killed $ebp, implicit-def dead $eflags
+    $ebx = INC32r $ebx, implicit-def dead $eflags, implicit killed $rbx, implicit-def $rbx
+    $rax = MOV64ri %const.0
+    $xmm0 = MOVSDrm_alt killed $rax, 1, $noreg, 0, $noreg :: (load (s64) from constant-pool)
+
+  bb.12.bb51:
+    liveins: $ebp, $rbx, $xmm0
+
+    MOV32mr $rsp, 1, $noreg, 24, $noreg, $ebx, implicit killed $rbx :: (store (s32) into %stack.0, align 8)
+    MOV32mr $rsp, 1, $noreg, 16, $noreg, killed $ebp :: (store (s32) into %stack.1, align 8)
+    MOVSDmr $rsp, 1, $noreg, 8, $noreg, killed $xmm0 :: (store (s64) into %stack.2)
+    $rax = MOV64ri @wobble
+    $edi = MOV32ri -121
+    STATEPOINT 2882400000, 0, 1, killed $rax, $edi, 2, 0, 2, 0, 2, 38, 2, 1, 2, 0, 2, 270, 2, 4, 2, 12, 2, 0, 2, 11, 2, 4278124286, 2, 99, 2, 0, 2, 10, 1, 8, $rsp, 24, 2, 6, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 0, 2, 10, 1, 8, $rsp, 16, 2, 10, 2, 4278124286, 2, 99, 2, 0, 2, 7, 1, 8, $rsp, 8, 2, 99, 2, 0, 2, 7, 2, 4278124286, 2, 99, 2, 0, 2, 13, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 0, 2, 0, 2, 0, 2, 0, csr_64, implicit-def $rsp :: (volatile load (s64) from %stack.0), (volatile load (s64) from %stack.1), (volatile load (s64) from %stack.2)
+
+  bb.13.bb59:
+    $rax = MOV64ri @wobble
+    $edi = MOV32ri 8
+    STATEPOINT 2882400000, 0, 1, killed $rax, $edi, 2, 0, 2, 0, 2, 38, 2, 1, 2, 0, 2, 123, 2, 4, 2, 12, 2, 0, 2, 13, 2, 0, 2, 99, 2, 4278124286, 2, 13, 2, 0, 2, 10, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 4278124286, 2, 99, 2, 0, 2, 13, 2, 0, 2, 99, 2, 4278124286, 2, 99, 2, 0, 2, 0, 2, 0, 2, 0, csr_64, implicit-def $rsp
+
+...
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 01056a8b2c24..d3a3b1e980db 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -975,18 +975,15 @@ define void @interleave_24i16_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7]
 ; SSE42-NEXT:    pshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm3[6,7]
-; SSE42-NEXT:    movdqa %xmm2, %xmm3
-; SSE42-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13]
-; SSE42-NEXT:    movdqa %xmm0, %xmm5
-; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
-; SSE42-NEXT:    pshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7]
-; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,2,3,8,9,14,15]
+; SSE42-NEXT:    movdqa %xmm0, %xmm3
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
+; SSE42-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
-; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
+; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
 ; SSE42-NEXT:    movdqu %xmm4, (%rsi)
-; SSE42-NEXT:    movdqu %xmm5, (%rdx)
+; SSE42-NEXT:    movdqu %xmm3, (%rdx)
 ; SSE42-NEXT:    movdqu %xmm1, (%rcx)
 ; SSE42-NEXT:    retq
 ;
@@ -1000,14 +997,12 @@ define void @interleave_24i16_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,2,3,8,9,14,15]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
 ; AVX1-NEXT:    vmovdqu %xmm3, (%rsi)
 ; AVX1-NEXT:    vmovdqu %xmm4, (%rdx)
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rcx)
@@ -1125,18 +1120,15 @@ define void @interleave_24i16_out_reverse(ptr %p, ptr %q1, ptr %q2, ptr %q3) nou
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7]
 ; SSE42-NEXT:    pshufb {{.*#+}} xmm4 = xmm4[14,15,8,9,2,3,12,13,6,7,0,1,u,u,u,u]
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm3[6,7]
-; SSE42-NEXT:    movdqa %xmm0, %xmm3
-; SSE42-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,14,15,8,9,2,3]
-; SSE42-NEXT:    movdqa %xmm2, %xmm5
-; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
-; SSE42-NEXT:    pshufb {{.*#+}} xmm5 = xmm5[12,13,6,7,0,1,10,11,4,5,u,u,u,u,u,u]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7]
-; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,12,13,6,7,0,1]
+; SSE42-NEXT:    movdqa %xmm2, %xmm3
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
+; SSE42-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[12,13,6,7,0,1,10,11,4,5,14,15,8,9,2,3]
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[10,11,4,5,14,15,8,9,2,3,u,u,u,u,u,u]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
+; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[10,11,4,5,14,15,8,9,2,3,12,13,6,7,0,1]
 ; SSE42-NEXT:    movdqu %xmm4, (%rsi)
-; SSE42-NEXT:    movdqu %xmm5, (%rdx)
+; SSE42-NEXT:    movdqu %xmm3, (%rdx)
 ; SSE42-NEXT:    movdqu %xmm1, (%rcx)
 ; SSE42-NEXT:    retq
 ;
@@ -1145,14 +1137,12 @@ define void @interleave_24i16_out_reverse(ptr %p, ptr %q1, ptr %q2, ptr %q3) nou
 ; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
 ; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
 ; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[14,15,8,9,2,3,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,12,13,6,7,0,1,10,11,4,5]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5,6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[12,13,6,7,0,1,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,10,11,4,5,14,15,8,9,2,3]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3,4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[14,15,8,9,2,3,12,13,6,7,0,1,10,11,4,5]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[12,13,6,7,0,1,10,11,4,5,14,15,8,9,2,3]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
@@ -1450,19 +1440,17 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; SSE42-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3]
 ; SSE42-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm1[2,3]
 ; SSE42-NEXT:    insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[1]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[2,2,2,2]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm10[6,7]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,0,3,3]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[2,2,2,2]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5],xmm10[6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,0,3,2]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm0[4,5],xmm8[6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,0,3,2]
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7]
 ; SSE42-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[0,3]
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3],xmm9[4,5,6,7]
 ; SSE42-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,3]
 ; SSE42-NEXT:    movups %xmm5, 16(%rsi)
 ; SSE42-NEXT:    movups %xmm4, (%rsi)
-; SSE42-NEXT:    movdqu %xmm10, 16(%rdx)
+; SSE42-NEXT:    movdqu %xmm8, 16(%rdx)
 ; SSE42-NEXT:    movdqu %xmm6, (%rdx)
 ; SSE42-NEXT:    movups %xmm9, 16(%rcx)
 ; SSE42-NEXT:    movups %xmm7, (%rcx)
@@ -1504,19 +1492,14 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; AVX2-SLOW-NEXT:    vmovups (%rdi), %ymm0
 ; AVX2-SLOW-NEXT:    vmovups 32(%rdi), %ymm1
 ; AVX2-SLOW-NEXT:    vmovups 64(%rdi), %ymm2
-; AVX2-SLOW-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-SLOW-NEXT:    vpermps %ymm2, %ymm3, %ymm3
-; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-SLOW-NEXT:    vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-SLOW-NEXT:    vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-SLOW-NEXT:    vpermps %ymm3, %ymm4, %ymm3
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-SLOW-NEXT:    vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
 ; AVX2-SLOW-NEXT:    vpermps %ymm4, %ymm5, %ymm4
-; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-SLOW-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX2-SLOW-NEXT:    vpermps %ymm2, %ymm4, %ymm4
-; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-SLOW-NEXT:    vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-SLOW-NEXT:    vpermps %ymm5, %ymm6, %ymm5
-; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX2-SLOW-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
 ; AVX2-SLOW-NEXT:    vpermps %ymm0, %ymm1, %ymm0
@@ -1534,26 +1517,18 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; AVX2-FAST-ALL-NEXT:    vmovups (%rdi), %ymm0
 ; AVX2-FAST-ALL-NEXT:    vmovups 32(%rdi), %ymm1
 ; AVX2-FAST-ALL-NEXT:    vmovups 64(%rdi), %ymm2
-; AVX2-FAST-ALL-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-FAST-ALL-NEXT:    vpermps %ymm2, %ymm3, %ymm3
-; AVX2-FAST-ALL-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-FAST-ALL-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FAST-ALL-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-FAST-ALL-NEXT:    vpermps %ymm3, %ymm4, %ymm3
+; AVX2-FAST-ALL-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-ALL-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
 ; AVX2-FAST-ALL-NEXT:    vpermps %ymm4, %ymm5, %ymm4
-; AVX2-FAST-ALL-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-ALL-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-FAST-ALL-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT:    vpermps %ymm2, %ymm4, %ymm4
-; AVX2-FAST-ALL-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-FAST-ALL-NEXT:    vpermps %ymm5, %ymm6, %ymm5
-; AVX2-FAST-ALL-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-ALL-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7]
-; AVX2-FAST-ALL-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT:    vpermps %ymm2, %ymm5, %ymm2
 ; AVX2-FAST-ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
+; AVX2-FAST-ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
+; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,1,4,7]
 ; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FAST-ALL-NEXT:    vmovups %ymm3, (%rsi)
 ; AVX2-FAST-ALL-NEXT:    vmovups %ymm4, (%rdx)
 ; AVX2-FAST-ALL-NEXT:    vmovups %ymm0, (%rcx)
@@ -1565,19 +1540,14 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; AVX2-FAST-PERLANE-NEXT:    vmovups (%rdi), %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovups 32(%rdi), %ymm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovups 64(%rdi), %ymm2
-; AVX2-FAST-PERLANE-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-FAST-PERLANE-NEXT:    vpermps %ymm2, %ymm3, %ymm3
-; AVX2-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-PERLANE-NEXT:    vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FAST-PERLANE-NEXT:    vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-FAST-PERLANE-NEXT:    vpermps %ymm3, %ymm4, %ymm3
+; AVX2-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-FAST-PERLANE-NEXT:    vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
 ; AVX2-FAST-PERLANE-NEXT:    vpermps %ymm4, %ymm5, %ymm4
-; AVX2-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-PERLANE-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-FAST-PERLANE-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT:    vpermps %ymm2, %ymm4, %ymm4
-; AVX2-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT:    vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-FAST-PERLANE-NEXT:    vpermps %ymm5, %ymm6, %ymm5
-; AVX2-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT:    vpermps %ymm0, %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/opt-shuff-tstore.ll b/llvm/test/CodeGen/X86/opt-shuff-tstore.ll
index 0a2d4e9ba9fe..c331f8ffb369 100644
--- a/llvm/test/CodeGen/X86/opt-shuff-tstore.ll
+++ b/llvm/test/CodeGen/X86/opt-shuff-tstore.ll
@@ -1,37 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s  -mattr=+sse2,+sse4.1 | FileCheck %s
 
-; CHECK: func_4_8
 ; A single memory write
-; CHECK: movd
-; CHECK-NEXT: ret
 define void @func_4_8(<4 x i8> %param, ptr %p) {
+; CHECK-LABEL: func_4_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movd %xmm0, (%rdi)
+; CHECK-NEXT:    retq
   %r = add <4 x i8> %param, <i8 1, i8 2, i8 3, i8 4>
   store <4 x i8> %r, ptr %p
   ret void
 }
 
-; CHECK: func_4_16
-; CHECK: movq
-; CHECK-NEXT: ret
 define void @func_4_16(<4 x i16> %param, ptr %p) {
+; CHECK-LABEL: func_4_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movq %xmm0, (%rdi)
+; CHECK-NEXT:    retq
   %r = add <4 x i16> %param, <i16 1, i16 2, i16 3, i16 4>
   store <4 x i16> %r, ptr %p
   ret void
 }
 
-; CHECK: func_8_8
-; CHECK: movq
-; CHECK-NEXT: ret
 define void @func_8_8(<8 x i8> %param, ptr %p) {
+; CHECK-LABEL: func_8_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movq %xmm0, (%rdi)
+; CHECK-NEXT:    retq
   %r = add <8 x i8> %param, <i8 1, i8 2, i8 3, i8 4, i8 1, i8 2, i8 3, i8 4>
   store <8 x i8> %r, ptr %p
   ret void
 }
 
-; CHECK: func_2_32
-; CHECK: movq
-; CHECK-NEXT: ret
 define void @func_2_32(<2 x i32> %param, ptr %p) {
+; CHECK-LABEL: func_2_32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movq %xmm0, (%rdi)
+; CHECK-NEXT:    retq
   %r = add <2 x i32> %param, <i32 1, i32 2>
   store <2 x i32> %r, ptr %p
   ret void
diff --git a/llvm/test/CodeGen/X86/patchable-prologue-debuginfo.ll b/llvm/test/CodeGen/X86/patchable-prologue-debuginfo.ll
index e713418c8d8e..8802f97d958f 100644
--- a/llvm/test/CodeGen/X86/patchable-prologue-debuginfo.ll
+++ b/llvm/test/CodeGen/X86/patchable-prologue-debuginfo.ll
@@ -1,56 +1,56 @@
-; RUN: llc -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
-
-; Regression test for function patching asserting in some cases when debug info activated.
-; The code below reproduces this crash.
-
-; Compilation flag:  clang -target x86_64-none-linux-gnu -c -O2 -g -fms-hotpatch patchable-prologue-debuginfo.c
-; int func( int val ) {
-;   int neg = -val;
-;   return neg + 1;
-; }
-
-; CHECK: # -- Begin function func
-
-; ModuleID = 'patchable-prologue-debuginfo.c'
-source_filename = "patchable-prologue-debuginfo.c"
-target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-none-linux-gnu"
-
-; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn uwtable
-define dso_local i32 @func(i32 noundef %val) local_unnamed_addr #0 !dbg !9 {
-entry:
-  call void @llvm.dbg.value(metadata i32 %val, metadata !14, metadata !DIExpression()), !dbg !16
-  call void @llvm.dbg.value(metadata !DIArgList(i32 0, i32 %val), metadata !15, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_minus, DW_OP_stack_value)), !dbg !16
-  %add = sub i32 1, %val, !dbg !17
-  ret i32 %add, !dbg !18
-}
-
-; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { mustprogress nofree norecurse nosync nounwind readnone willreturn uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "patchable-function"="prologue-short-redirect" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
-attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
-!llvm.ident = !{!8}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 15.0.4 (git@gitlab-ncsa.ubisoft.org:LLVM/llvm-project.git 17850fb41c5bddcd80a9c2714f7e293f49fa8bb2)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
-!1 = !DIFile(filename: "patchable-prologue-debuginfo.c", directory: "D:\\saudi\\bugrepro-llvm-hotpatch-crash")
-!2 = !{i32 7, !"Dwarf Version", i32 4}
-!3 = !{i32 2, !"Debug Info Version", i32 3}
-!4 = !{i32 1, !"wchar_size", i32 4}
-!5 = !{i32 7, !"PIC Level", i32 2}
-!6 = !{i32 7, !"PIE Level", i32 2}
-!7 = !{i32 7, !"uwtable", i32 2}
-!8 = !{!"clang version 15.0.4 (git@gitlab-ncsa.ubisoft.org:LLVM/llvm-project.git 17850fb41c5bddcd80a9c2714f7e293f49fa8bb2)"}
-!9 = distinct !DISubprogram(name: "func", scope: !1, file: !1, line: 1, type: !10, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !13)
-!10 = !DISubroutineType(types: !11)
-!11 = !{!12, !12}
-!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-!13 = !{!14, !15}
-!14 = !DILocalVariable(name: "val", arg: 1, scope: !9, file: !1, line: 1, type: !12)
-!15 = !DILocalVariable(name: "neg", scope: !9, file: !1, line: 3, type: !12)
-!16 = !DILocation(line: 0, scope: !9)
-!17 = !DILocation(line: 4, column: 16, scope: !9)
-!18 = !DILocation(line: 4, column: 5, scope: !9)
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
+
+; Regression test for function patching asserting in some cases when debug info activated.
+; The code below reproduces this crash.
+
+; Compilation flag:  clang -target x86_64-none-linux-gnu -c -O2 -g -fms-hotpatch patchable-prologue-debuginfo.c
+; int func( int val ) {
+;   int neg = -val;
+;   return neg + 1;
+; }
+
+; CHECK: # -- Begin function func
+
+; ModuleID = 'patchable-prologue-debuginfo.c'
+source_filename = "patchable-prologue-debuginfo.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-none-linux-gnu"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn uwtable
+define dso_local i32 @func(i32 noundef %val) local_unnamed_addr #0 !dbg !9 {
+entry:
+  call void @llvm.dbg.value(metadata i32 %val, metadata !14, metadata !DIExpression()), !dbg !16
+  call void @llvm.dbg.value(metadata !DIArgList(i32 0, i32 %val), metadata !15, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_minus, DW_OP_stack_value)), !dbg !16
+  %add = sub i32 1, %val, !dbg !17
+  ret i32 %add, !dbg !18
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind readnone willreturn uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "patchable-function"="prologue-short-redirect" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
+!llvm.ident = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 15.0.4 (git@gitlab-ncsa.ubisoft.org:LLVM/llvm-project.git 17850fb41c5bddcd80a9c2714f7e293f49fa8bb2)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "patchable-prologue-debuginfo.c", directory: "D:\\saudi\\bugrepro-llvm-hotpatch-crash")
+!2 = !{i32 7, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 7, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{!"clang version 15.0.4 (git@gitlab-ncsa.ubisoft.org:LLVM/llvm-project.git 17850fb41c5bddcd80a9c2714f7e293f49fa8bb2)"}
+!9 = distinct !DISubprogram(name: "func", scope: !1, file: !1, line: 1, type: !10, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !13)
+!10 = !DISubroutineType(types: !11)
+!11 = !{!12, !12}
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !{!14, !15}
+!14 = !DILocalVariable(name: "val", arg: 1, scope: !9, file: !1, line: 1, type: !12)
+!15 = !DILocalVariable(name: "neg", scope: !9, file: !1, line: 3, type: !12)
+!16 = !DILocation(line: 0, scope: !9)
+!17 = !DILocation(line: 4, column: 16, scope: !9)
+!18 = !DILocation(line: 4, column: 5, scope: !9)
diff --git a/llvm/test/CodeGen/X86/post-ra-sched-with-debug.mir b/llvm/test/CodeGen/X86/post-ra-sched-with-debug.mir
index 523940e2d675..65675ced011f 100644
--- a/llvm/test/CodeGen/X86/post-ra-sched-with-debug.mir
+++ b/llvm/test/CodeGen/X86/post-ra-sched-with-debug.mir
@@ -298,8 +298,8 @@ body:             |
     $rcx = CMOV64rr killed $rcx, killed $rdx, 5, implicit killed $eflags
     $rcx = OR64rr killed $rcx, killed $rsi, implicit-def dead $eflags
     $rdx = MOVSX64rm32 $rbx, 1, $noreg, 0, $noreg :: (load (s32), align 8)
-    DBG_INSTR_REF !46, !17, dbg-instr-ref(1, 0), debug-location !48
-    DBG_INSTR_REF !39, !17, dbg-instr-ref(2, 0), debug-location !44
+    DBG_INSTR_REF !46, !17, dbg-instr-ref(1, 0), debug-location !48
+    DBG_INSTR_REF !39, !17, dbg-instr-ref(2, 0), debug-location !44
     TEST32mr killed $rcx, 4, killed $rdx, 0, $noreg, killed $eax, implicit-def $eflags :: (load (s32))
     JCC_1 %bb.2, 5, implicit $eflags
     JMP_1 %bb.3
diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll
index 23de746ecb35..aed5ea3ed217 100644
--- a/llvm/test/CodeGen/X86/pr34592.ll
+++ b/llvm/test/CodeGen/X86/pr34592.ll
@@ -8,38 +8,40 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
 ; CHECK-O0-NEXT:    pushq %rbp
 ; CHECK-O0-NEXT:    movq %rsp, %rbp
 ; CHECK-O0-NEXT:    andq $-32, %rsp
-; CHECK-O0-NEXT:    subq $32, %rsp
+; CHECK-O0-NEXT:    subq $64, %rsp
 ; CHECK-O0-NEXT:    vmovaps %ymm4, %ymm10
 ; CHECK-O0-NEXT:    vmovaps %ymm3, %ymm9
+; CHECK-O0-NEXT:    vmovaps %ymm2, (%rsp) # 32-byte Spill
 ; CHECK-O0-NEXT:    vmovaps %ymm1, %ymm8
+; CHECK-O0-NEXT:    vmovaps %ymm0, %ymm3
+; CHECK-O0-NEXT:    vmovaps (%rsp), %ymm0 # 32-byte Reload
 ; CHECK-O0-NEXT:    vmovaps 240(%rbp), %ymm4
-; CHECK-O0-NEXT:    vmovaps 208(%rbp), %ymm3
-; CHECK-O0-NEXT:    vmovaps 176(%rbp), %ymm1
-; CHECK-O0-NEXT:    vmovaps 144(%rbp), %ymm1
+; CHECK-O0-NEXT:    vmovaps 208(%rbp), %ymm1
+; CHECK-O0-NEXT:    vmovaps 176(%rbp), %ymm2
+; CHECK-O0-NEXT:    vmovaps 144(%rbp), %ymm2
 ; CHECK-O0-NEXT:    vmovaps 112(%rbp), %ymm11
 ; CHECK-O0-NEXT:    vmovaps 80(%rbp), %ymm11
 ; CHECK-O0-NEXT:    vmovaps 48(%rbp), %ymm11
 ; CHECK-O0-NEXT:    vmovaps 16(%rbp), %ymm11
-; CHECK-O0-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-O0-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
-; CHECK-O0-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; CHECK-O0-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-O0-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; CHECK-O0-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
 ; CHECK-O0-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,1,2,1]
-; CHECK-O0-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
-; CHECK-O0-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm6[0,1]
-; CHECK-O0-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-O0-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; CHECK-O0-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5],ymm0[6,7]
+; CHECK-O0-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm6[0,1]
+; CHECK-O0-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-O0-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; CHECK-O0-NEXT:    vmovaps %xmm1, %xmm3
+; CHECK-O0-NEXT:    vmovaps %xmm7, %xmm1
+; CHECK-O0-NEXT:    vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
+; CHECK-O0-NEXT:    # implicit-def: $ymm1
+; CHECK-O0-NEXT:    vmovaps %xmm3, %xmm1
+; CHECK-O0-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[0,0,1,3]
+; CHECK-O0-NEXT:    vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23]
+; CHECK-O0-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3,4,5],ymm1[6,7]
 ; CHECK-O0-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
-; CHECK-O0-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; CHECK-O0-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1]
-; CHECK-O0-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5],ymm1[6,7]
-; CHECK-O0-NEXT:    vmovaps %xmm3, %xmm4
-; CHECK-O0-NEXT:    vmovaps %xmm7, %xmm3
-; CHECK-O0-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3]
-; CHECK-O0-NEXT:    # implicit-def: $ymm3
-; CHECK-O0-NEXT:    vmovaps %xmm4, %xmm3
-; CHECK-O0-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[0,0,1,3]
-; CHECK-O0-NEXT:    vpslldq {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23]
-; CHECK-O0-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5],ymm3[6,7]
+; CHECK-O0-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-O0-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
 ; CHECK-O0-NEXT:    movq %rbp, %rsp
 ; CHECK-O0-NEXT:    popq %rbp
 ; CHECK-O0-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/pr89877.ll b/llvm/test/CodeGen/X86/pr89877.ll
new file mode 100644
index 000000000000..9820ec42f5b8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr89877.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64
+
+define i32 @sext_known_nonzero(i16 %xx) {
+; X86-LABEL: sext_known_nonzero:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $256, %eax # imm = 0x100
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    cwtl
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    je .LBB0_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_1:
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: sext_known_nonzero:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    movl $256, %eax # imm = 0x100
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    cwtl
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    je .LBB0_1
+; X64-NEXT:  # %bb.2: # %cond.false
+; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB0_1:
+; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    retq
+  %x = shl i16 256, %xx
+  %z = sext i16 %x to i32
+  %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
+  ret i32 %r
+}
+
+define i32 @sext_known_nonzero_nuw(i16 %xx) {
+; X86-LABEL: sext_known_nonzero_nuw:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $256, %eax # imm = 0x100
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    cwtl
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: sext_known_nonzero_nuw:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    movl $256, %eax # imm = 0x100
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    cwtl
+; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    retq
+  %x = shl nuw i16 256, %xx
+  %z = sext i16 %x to i32
+  %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
+  ret i32 %r
+}
+
+define i32 @sext_known_nonzero_nsw(i16 %xx) {
+; X86-LABEL: sext_known_nonzero_nsw:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $256, %eax # imm = 0x100
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: sext_known_nonzero_nsw:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    movl $256, %eax # imm = 0x100
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    retq
+  %x = shl nsw i16 256, %xx
+  %z = sext i16 %x to i32
+  %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
+  ret i32 %r
+}
+
+define i32 @sext_known_nonzero_nuw_nsw(i16 %xx) {
+; X86-LABEL: sext_known_nonzero_nuw_nsw:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $256, %eax # imm = 0x100
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: sext_known_nonzero_nuw_nsw:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    movl $256, %eax # imm = 0x100
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    retq
+  %x = shl nuw nsw i16 256, %xx
+  %z = sext i16 %x to i32
+  %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
+  ret i32 %r
+}
diff --git a/llvm/test/CodeGen/X86/pr90844.ll b/llvm/test/CodeGen/X86/pr90844.ll
new file mode 100644
index 000000000000..6feece7f66d8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr90844.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-evex512 < %s | FileCheck %s
+
+define void @PR90844() {
+; CHECK-LABEL: PR90844:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, (%rax)
+; CHECK-NEXT:    retq
+entry:
+  %0 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> poison, <2 x i32> poison, <2 x i32> <i32 8, i32 24>)
+  %1 = and <2 x i32> %0, <i32 16711935, i32 -134152448>
+  %2 = or disjoint <2 x i32> zeroinitializer, %1
+  %3 = zext <2 x i32> %2 to <2 x i64>
+  %4 = shl nuw <2 x i64> %3, <i64 32, i64 32>
+  %5 = or disjoint <2 x i64> %4, zeroinitializer
+  store <2 x i64> %5, ptr poison, align 16
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/pr90847.ll b/llvm/test/CodeGen/X86/pr90847.ll
new file mode 100644
index 000000000000..7aa0ceb26e1a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr90847.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+
+; PR90847 - failure to peek through FREEZE(SETCC()) results in VPMOVSMSKB(TRUNC()) instead of VMOVMSKPS
+
+define i32 @PR90847(<8 x float> %x) nounwind {
+; AVX1-LABEL: PR90847:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm0[1,0,3,2,5,4,7,6]
+; AVX1-NEXT:    vminps %ymm1, %ymm0, %ymm1
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm2 = ymm1[1,0,3,2]
+; AVX1-NEXT:    vminps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vminps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT:    vcmpeqps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovmskps %ymm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    je .LBB0_1
+; AVX1-NEXT:  # %bb.2: # %cond.false
+; AVX1-NEXT:    rep bsfl %eax, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB0_1:
+; AVX1-NEXT:    movl $32, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: PR90847:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm0[1,0,3,2,5,4,7,6]
+; AVX2-NEXT:    vminps %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vshufpd {{.*#+}} ymm2 = ymm1[1,0,3,2]
+; AVX2-NEXT:    vminps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-NEXT:    vminps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vcmpeqps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovmskps %ymm0, %eax
+; AVX2-NEXT:    testl %eax, %eax
+; AVX2-NEXT:    je .LBB0_1
+; AVX2-NEXT:  # %bb.2: # %cond.false
+; AVX2-NEXT:    rep bsfl %eax, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+; AVX2-NEXT:  .LBB0_1:
+; AVX2-NEXT:    movl $32, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+entry:
+  %shuf1 = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %min1 = tail call noundef <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %x, <8 x float> %shuf1)
+  %shuf2 = shufflevector <8 x float> %min1, <8 x float> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+  %min2 = tail call noundef <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %min1, <8 x float> %shuf2)
+  %shuf3 = shufflevector <8 x float> %min2, <8 x float> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  %min3 = tail call noundef <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %min2, <8 x float> %shuf3)
+  %fcmp = fcmp oeq <8 x float> %min3, %x
+  %mask = bitcast <8 x i1> %fcmp to i8
+  %zext = zext i8 %mask to i32
+  %cmp = icmp eq i8 %mask, 0
+  %tz = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %zext, i1 false)
+  %conv = select i1 %cmp, i32 undef, i32 %tz
+  ret i32 %conv
+}
diff --git a/llvm/test/CodeGen/X86/pr91005.ll b/llvm/test/CodeGen/X86/pr91005.ll
new file mode 100644
index 000000000000..97fd1ce45688
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr91005.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s
+
+define void @PR91005(ptr %0) minsize {
+; CHECK-LABEL: PR91005:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    je .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [31744,31744,31744,31744]
+; CHECK-NEXT:    vpcmpeqw %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-NEXT:    vmovd %xmm0, %eax
+; CHECK-NEXT:    movw %ax, (%rdi)
+; CHECK-NEXT:  .LBB0_2: # %common.ret
+; CHECK-NEXT:    retq
+  %2 = bitcast <2 x half> poison to <2 x i16>
+  %3 = icmp eq <2 x i16> %2, <i16 31744, i16 31744>
+  br i1 poison, label %4, label %common.ret
+
+common.ret:                                       ; preds = %4, %1
+  ret void
+
+4:                                                ; preds = %1
+  %5 = select <2 x i1> %3, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> zeroinitializer
+  %6 = fmul <2 x half> %5, zeroinitializer
+  %7 = fsub <2 x half> %6, zeroinitializer
+  %8 = extractelement <2 x half> %7, i64 0
+  store half %8, ptr %0, align 2
+  br label %common.ret
+}
+
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
diff --git a/llvm/test/CodeGen/X86/preserve_nonecc_call.ll b/llvm/test/CodeGen/X86/preserve_nonecc_call.ll
index e4ad056913c5..500ebb139811 100644
--- a/llvm/test/CodeGen/X86/preserve_nonecc_call.ll
+++ b/llvm/test/CodeGen/X86/preserve_nonecc_call.ll
@@ -27,6 +27,7 @@ define void @caller1(ptr %a) {
 ; CHECK-NEXT:    .cfi_offset %r13, -32
 ; CHECK-NEXT:    .cfi_offset %r14, -24
 ; CHECK-NEXT:    .cfi_offset %r15, -16
+; CHECK-NEXT:    movq %rdi, %r12
 ; CHECK-NEXT:    callq callee@PLT
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 40
@@ -61,17 +62,17 @@ define preserve_nonecc i64 @callee_with_many_param(i64 %a1, i64 %a2, i64 %a3, i6
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movq %r13, %r12
+; CHECK-NEXT:    movq %r14, %r13
+; CHECK-NEXT:    movq %r15, %r14
+; CHECK-NEXT:    movq %rdi, %r15
 ; CHECK-NEXT:    movq %rsi, %rdi
 ; CHECK-NEXT:    movq %rdx, %rsi
 ; CHECK-NEXT:    movq %rcx, %rdx
 ; CHECK-NEXT:    movq %r8, %rcx
 ; CHECK-NEXT:    movq %r9, %r8
 ; CHECK-NEXT:    movq %r11, %r9
-; CHECK-NEXT:    movq %r12, %r11
-; CHECK-NEXT:    movq %r13, %r12
-; CHECK-NEXT:    movq %r14, %r13
-; CHECK-NEXT:    movq %r15, %r14
-; CHECK-NEXT:    movq %rax, %r15
+; CHECK-NEXT:    movq %rax, %r11
 ; CHECK-NEXT:    callq callee_with_many_param2@PLT
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
@@ -98,17 +99,17 @@ define i64 @caller3() {
 ; CHECK-NEXT:    .cfi_offset %r13, -32
 ; CHECK-NEXT:    .cfi_offset %r14, -24
 ; CHECK-NEXT:    .cfi_offset %r15, -16
-; CHECK-NEXT:    movl $1, %edi
-; CHECK-NEXT:    movl $2, %esi
-; CHECK-NEXT:    movl $3, %edx
-; CHECK-NEXT:    movl $4, %ecx
-; CHECK-NEXT:    movl $5, %r8d
-; CHECK-NEXT:    movl $6, %r9d
-; CHECK-NEXT:    movl $7, %r11d
-; CHECK-NEXT:    movl $8, %r12d
-; CHECK-NEXT:    movl $9, %r13d
-; CHECK-NEXT:    movl $10, %r14d
-; CHECK-NEXT:    movl $11, %r15d
+; CHECK-NEXT:    movl $1, %r12d
+; CHECK-NEXT:    movl $2, %r13d
+; CHECK-NEXT:    movl $3, %r14d
+; CHECK-NEXT:    movl $4, %r15d
+; CHECK-NEXT:    movl $5, %edi
+; CHECK-NEXT:    movl $6, %esi
+; CHECK-NEXT:    movl $7, %edx
+; CHECK-NEXT:    movl $8, %ecx
+; CHECK-NEXT:    movl $9, %r8d
+; CHECK-NEXT:    movl $10, %r9d
+; CHECK-NEXT:    movl $11, %r11d
 ; CHECK-NEXT:    movl $12, %eax
 ; CHECK-NEXT:    callq callee_with_many_param@PLT
 ; CHECK-NEXT:    popq %rbx
@@ -125,3 +126,20 @@ define i64 @caller3() {
   %ret = call preserve_nonecc i64 @callee_with_many_param(i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12)
   ret i64 %ret
 }
+
+; Non-volatile registers are used to pass the first few parameters.
+declare void @boring()
+declare preserve_nonecc void @continuation(ptr, ptr, ptr, ptr)
+define preserve_nonecc void @entry(ptr %r12, ptr %r13, ptr %r14, ptr %r15) {
+; CHECK-LABEL: entry:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq boring@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    jmp continuation@PLT # TAILCALL
+  call void @boring()
+  musttail call preserve_nonecc void @continuation(ptr %r12, ptr %r13, ptr %r14, ptr %r15)
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/preserve_nonecc_call_win.ll b/llvm/test/CodeGen/X86/preserve_nonecc_call_win.ll
new file mode 100644
index 000000000000..232ac3450578
--- /dev/null
+++ b/llvm/test/CodeGen/X86/preserve_nonecc_call_win.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -mcpu=corei7 < %s | FileCheck %s
+
+; Non-volatile registers are used to pass the first few parameters.
+declare void @boring()
+declare preserve_nonecc void @continuation(ptr, ptr, ptr, ptr, ptr, ptr)
+define preserve_nonecc void @entry(ptr %r12, ptr %r13, ptr %r14, ptr %r15, ptr %rdi, ptr %rsi) {
+; CHECK-LABEL: entry:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .seh_stackalloc 40
+; CHECK-NEXT:    .seh_endprologue
+; CHECK-NEXT:    callq boring
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    jmp continuation # TAILCALL
+; CHECK-NEXT:    .seh_endproc
+  call void @boring()
+  musttail call preserve_nonecc void @continuation(ptr %r12, ptr %r13, ptr %r14, ptr %r15, ptr %rdi, ptr %rsi)
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index a1cabb433d87..e7727a0ab617 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -563,20 +563,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    subq $120, %rsp
 ; X64-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT:    pxor %xmm3, %xmm3
-; X64-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; X64-NEXT:    pxor %xmm2, %xmm2
+; X64-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X64-NEXT:    psrlq $31, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X64-NEXT:    psrad $31, %xmm2
-; X64-NEXT:    psrlq $31, %xmm3
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT:    movq %xmm0, %rbp
-; X64-NEXT:    movq %rbp, %r14
-; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    shldq $31, %rbp, %r14
-; X64-NEXT:    movq %rbp, %r15
-; X64-NEXT:    shlq $31, %r15
+; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT:    movq %xmm3, %rbx
+; X64-NEXT:    movq %rbx, %r13
+; X64-NEXT:    sarq $63, %r13
+; X64-NEXT:    shldq $31, %rbx, %r13
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X64-NEXT:    pxor %xmm0, %xmm0
 ; X64-NEXT:    pcmpgtd %xmm1, %xmm0
@@ -584,113 +582,112 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    movq %xmm1, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    sarq $63, %rbx
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    sarq $63, %r15
+; X64-NEXT:    movq %rbx, %r12
+; X64-NEXT:    shlq $31, %r12
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __divti3@PLT
-; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    subq $1, %r13
-; X64-NEXT:    sbbq $0, %r12
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    subq $1, %rbp
+; X64-NEXT:    sbbq $0, %r14
+; X64-NEXT:    shrq $63, %rbx
+; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __modti3@PLT
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    shrq $63, %rbp
-; X64-NEXT:    xorl %ebp, %ebx
 ; X64-NEXT:    testb %bl, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    movl $4294967295, %edx # imm = 0xFFFFFFFF
-; X64-NEXT:    cmpq %rdx, %r13
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    cmpq %rdx, %rbp
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    sbbq $0, %rax
-; X64-NEXT:    cmovgeq %rdx, %r13
-; X64-NEXT:    cmovgeq %rcx, %r12
+; X64-NEXT:    cmovgeq %rcx, %r14
+; X64-NEXT:    cmovgeq %rdx, %rbp
 ; X64-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT:    cmpq %r13, %rcx
+; X64-NEXT:    cmpq %rbp, %rcx
 ; X64-NEXT:    movq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    sbbq %r12, %rax
-; X64-NEXT:    cmovgeq %rcx, %r13
-; X64-NEXT:    movq %r13, %xmm0
+; X64-NEXT:    sbbq %r14, %rax
+; X64-NEXT:    cmovgeq %rcx, %rbp
+; X64-NEXT:    movq %rbp, %xmm0
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,2,3]
-; X64-NEXT:    movq %xmm0, %rbp
-; X64-NEXT:    movq %rbp, %r14
-; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    shldq $31, %rbp, %r14
-; X64-NEXT:    movq %rbp, %r15
-; X64-NEXT:    shlq $31, %r15
+; X64-NEXT:    movq %xmm0, %rbx
+; X64-NEXT:    movq %rbx, %r13
+; X64-NEXT:    sarq $63, %r13
+; X64-NEXT:    shldq $31, %rbx, %r13
 ; X64-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,2,3]
 ; X64-NEXT:    movq %xmm0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    sarq $63, %rbx
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    sarq $63, %r15
+; X64-NEXT:    movq %rbx, %r12
+; X64-NEXT:    shlq $31, %r12
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __divti3@PLT
-; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    subq $1, %r13
-; X64-NEXT:    sbbq $0, %r12
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    subq $1, %rbp
+; X64-NEXT:    sbbq $0, %r14
+; X64-NEXT:    shrq $63, %rbx
+; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __modti3@PLT
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    shrq $63, %rbp
-; X64-NEXT:    xorl %ebp, %ebx
 ; X64-NEXT:    testb %bl, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; X64-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT:    cmpq %rcx, %r13
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    cmpq %rcx, %rbp
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    sbbq $0, %rax
-; X64-NEXT:    cmovgeq %rcx, %r13
 ; X64-NEXT:    movl $0, %eax
-; X64-NEXT:    cmovgeq %rax, %r12
+; X64-NEXT:    cmovgeq %rax, %r14
+; X64-NEXT:    cmovgeq %rcx, %rbp
 ; X64-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT:    cmpq %r13, %rcx
+; X64-NEXT:    cmpq %rbp, %rcx
 ; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    sbbq %r12, %rax
-; X64-NEXT:    cmovgeq %rcx, %r13
-; X64-NEXT:    movq %r13, %xmm0
+; X64-NEXT:    sbbq %r14, %rax
+; X64-NEXT:    cmovgeq %rcx, %rbp
+; X64-NEXT:    movq %rbp, %xmm0
 ; X64-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; X64-NEXT:    psrlq $1, %xmm1
 ; X64-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    pxor %xmm0, %xmm0
-; X64-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; X64-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X64-NEXT:    psrad $31, %xmm1
+; X64-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-NEXT:    psrlq $31, %xmm0
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT:    psrad $31, %xmm1
 ; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT:    movq %xmm0, %rbp
-; X64-NEXT:    movq %rbp, %r14
-; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    shldq $31, %rbp, %r14
-; X64-NEXT:    movq %rbp, %r15
-; X64-NEXT:    shlq $31, %r15
+; X64-NEXT:    movq %xmm0, %rbx
+; X64-NEXT:    movq %rbx, %r13
+; X64-NEXT:    sarq $63, %r13
+; X64-NEXT:    shldq $31, %rbx, %r13
 ; X64-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X64-NEXT:    pxor %xmm1, %xmm1
 ; X64-NEXT:    pcmpgtd %xmm0, %xmm1
@@ -698,92 +695,94 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    movq %xmm0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    sarq $63, %rbx
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    sarq $63, %r15
+; X64-NEXT:    movq %rbx, %r12
+; X64-NEXT:    shlq $31, %r12
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __divti3@PLT
-; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    subq $1, %r13
-; X64-NEXT:    sbbq $0, %r12
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    subq $1, %rbp
+; X64-NEXT:    sbbq $0, %r14
+; X64-NEXT:    shrq $63, %rbx
+; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __modti3@PLT
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    shrq $63, %rbp
-; X64-NEXT:    xorl %ebp, %ebx
 ; X64-NEXT:    testb %bl, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; X64-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT:    cmpq %rcx, %r13
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    cmpq %rcx, %rbp
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    sbbq $0, %rax
-; X64-NEXT:    cmovgeq %rcx, %r13
 ; X64-NEXT:    movl $0, %eax
-; X64-NEXT:    cmovgeq %rax, %r12
+; X64-NEXT:    cmovgeq %rax, %r14
+; X64-NEXT:    cmovgeq %rcx, %rbp
 ; X64-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT:    cmpq %r13, %rcx
+; X64-NEXT:    cmpq %rbp, %rcx
 ; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    sbbq %r12, %rax
-; X64-NEXT:    cmovgeq %rcx, %r13
-; X64-NEXT:    movq %r13, %xmm0
+; X64-NEXT:    sbbq %r14, %rax
+; X64-NEXT:    cmovgeq %rcx, %rbp
+; X64-NEXT:    movq %rbp, %xmm0
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,2,3]
-; X64-NEXT:    movq %xmm0, %rbp
-; X64-NEXT:    movq %rbp, %r14
-; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    shldq $31, %rbp, %r14
-; X64-NEXT:    movq %rbp, %r15
-; X64-NEXT:    shlq $31, %r15
+; X64-NEXT:    movq %xmm0, %rbx
+; X64-NEXT:    movq %rbx, %r13
+; X64-NEXT:    sarq $63, %r13
+; X64-NEXT:    shldq $31, %rbx, %r13
 ; X64-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,2,3]
 ; X64-NEXT:    movq %xmm0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    sarq $63, %rbx
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    sarq $63, %r15
+; X64-NEXT:    movq %rbx, %r12
+; X64-NEXT:    shlq $31, %r12
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __divti3@PLT
-; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    subq $1, %r13
-; X64-NEXT:    sbbq $0, %r12
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    subq $1, %rbp
+; X64-NEXT:    sbbq $0, %r14
+; X64-NEXT:    shrq $63, %rbx
+; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __modti3@PLT
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    shrq $63, %rbp
-; X64-NEXT:    xorl %ebp, %ebx
 ; X64-NEXT:    testb %bl, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; X64-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT:    cmpq %rcx, %r13
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    cmpq %rcx, %rbp
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    sbbq $0, %rax
-; X64-NEXT:    cmovgeq %rcx, %r13
 ; X64-NEXT:    movl $0, %eax
-; X64-NEXT:    cmovgeq %rax, %r12
+; X64-NEXT:    cmovgeq %rax, %r14
+; X64-NEXT:    cmovgeq %rcx, %rbp
 ; X64-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; X64-NEXT:    cmpq %r13, %rax
-; X64-NEXT:    sbbq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT:    cmovgeq %rax, %r13
-; X64-NEXT:    movq %r13, %xmm1
+; X64-NEXT:    cmpq %rbp, %rax
+; X64-NEXT:    sbbq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT:    cmovgeq %rax, %rbp
+; X64-NEXT:    movq %rbp, %xmm1
 ; X64-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X64-NEXT:    psrlq $1, %xmm0
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index a80d8d8cd01b..76cf2423a254 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -121,12 +121,10 @@ define void @failing(ptr %0, ptr %1) nounwind {
 ; CHECK-AVX2-NEXT:    # => This Inner Loop Header: Depth=2
 ; CHECK-AVX2-NEXT:    vmovdqu 1024(%rdx,%rsi), %xmm5
 ; CHECK-AVX2-NEXT:    vmovdqu 1040(%rdx,%rsi), %xmm6
-; CHECK-AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm5[0],xmm6[0]
-; CHECK-AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; CHECK-AVX2-NEXT:    vmovq %xmm5, %rdi
-; CHECK-AVX2-NEXT:    vpextrq $1, %xmm5, %r8
-; CHECK-AVX2-NEXT:    vmovq %xmm7, %r9
-; CHECK-AVX2-NEXT:    vpextrq $1, %xmm7, %r10
+; CHECK-AVX2-NEXT:    vpextrq $1, %xmm5, %rdi
+; CHECK-AVX2-NEXT:    vpextrq $1, %xmm6, %r8
+; CHECK-AVX2-NEXT:    vmovq %xmm5, %r9
+; CHECK-AVX2-NEXT:    vmovq %xmm6, %r10
 ; CHECK-AVX2-NEXT:    negq %r10
 ; CHECK-AVX2-NEXT:    movq %rcx, %r10
 ; CHECK-AVX2-NEXT:    sbbq %r8, %r10
diff --git a/llvm/test/CodeGen/X86/sext-subreg.ll b/llvm/test/CodeGen/X86/sext-subreg.ll
index 3e54f24d13af..20451ff208cc 100644
--- a/llvm/test/CodeGen/X86/sext-subreg.ll
+++ b/llvm/test/CodeGen/X86/sext-subreg.ll
@@ -1,16 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
 ; rdar://7529457
 
 define i64 @t(i64 %A, i64 %B, ptr %P, ptr%P2) nounwind {
 ; CHECK-LABEL: t:
-; CHECK: movslq %e{{.*}}, %rax
-; CHECK: movq %rax
-; CHECK: movl %eax
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addq %rsi, %rdi
+; CHECK-NEXT:    movl %edi, (%rdx)
+; CHECK-NEXT:    movslq %edi, %rax
+; CHECK-NEXT:    movq %rax, (%rcx)
+; CHECK-NEXT:    movl %eax, (%rdx)
+; CHECK-NEXT:    retq
   %C = add i64 %A, %B
   %D = trunc i64 %C to i32
   store volatile i32 %D, ptr %P
   %E = shl i64 %C, 32
-  %F = ashr i64 %E, 32  
+  %F = ashr i64 %E, 32
   store volatile i64 %F, ptr%P2
   store volatile i32 %D, ptr %P
   ret i64 undef
diff --git a/llvm/test/CodeGen/X86/stackmap-liveness.ll b/llvm/test/CodeGen/X86/stackmap-liveness.ll
index 798eab9249df..10a8f950baeb 100644
--- a/llvm/test/CodeGen/X86/stackmap-liveness.ll
+++ b/llvm/test/CodeGen/X86/stackmap-liveness.ll
@@ -46,9 +46,29 @@ entry:
 ; Padding
 ; PATCH-NEXT:   .p2align  3
 ; PATCH-NEXT:   .short  0
-; Num LiveOut Entries: 1
-; PATCH-NEXT:   .short  1
-; LiveOut Entry 1: %ymm2 (16 bytes) --> %xmm2
+; Num LiveOut Entries: 6
+; PATCH-NEXT:   .short  6
+; LiveOut Entry 1:
+; PATCH-NEXT:   .short 3
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 2:
+; PATCH-NEXT:   .short 12
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 3:
+; PATCH-NEXT:   .short 13
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 4:
+; PATCH-NEXT:   .short 14
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 5:
+; PATCH-NEXT:   .short 15
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 6: %ymm2 (16 bytes) --> %xmm2
 ; PATCH-NEXT:   .short  19
 ; PATCH-NEXT:   .byte 0
 ; PATCH-NEXT:   .byte 16
@@ -79,25 +99,46 @@ entry:
 ; Padding
 ; PATCH-NEXT:   .p2align  3
 ; PATCH-NEXT:   .short  0
-; Num LiveOut Entries: 5
-; PATCH-NEXT:   .short  5
+; Num LiveOut Entries: 10
+; PATCH-NEXT:   .short 10
+
 ; LiveOut Entry 1: %rax (1 bytes) --> %al or %ah
 ; PATCH-NEXT:   .short  0
 ; PATCH-NEXT:   .byte 0
 ; PATCH-NEXT:   .byte 1
-; LiveOut Entry 2: %r8 (8 bytes)
+; LiveOut Entry 2:
+; PATCH-NEXT:   .short 3
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 3: %r8 (8 bytes)
 ; PATCH-NEXT:   .short  8
 ; PATCH-NEXT:   .byte 0
 ; PATCH-NEXT:   .byte 8
-; LiveOut Entry 3: %ymm0 (32 bytes)
+; LiveOut Entry 4:
+; PATCH-NEXT:   .short 12
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 5:
+; PATCH-NEXT:   .short 13
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 6:
+; PATCH-NEXT:   .short 14
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 7:
+; PATCH-NEXT:   .short 15
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 8: %ymm0 (32 bytes)
 ; PATCH-NEXT:   .short  17
 ; PATCH-NEXT:   .byte 0
 ; PATCH-NEXT:   .byte 32
-; LiveOut Entry 4: %ymm1 (32 bytes)
+; LiveOut Entry 9: %ymm1 (32 bytes)
 ; PATCH-NEXT:   .short  18
 ; PATCH-NEXT:   .byte 0
 ; PATCH-NEXT:   .byte 32
-; LiveOut Entry 5: %ymm2 (16 bytes) --> %xmm2
+; LiveOut Entry 10: %ymm2 (16 bytes) --> %xmm2
 ; PATCH-NEXT:   .short  19
 ; PATCH-NEXT:   .byte 0
 ; PATCH-NEXT:   .byte 16
@@ -125,13 +166,33 @@ entry:
 ; Padding
 ; PATCH-NEXT:   .p2align  3
 ; PATCH-NEXT:   .short  0
-; Num LiveOut Entries: 2
-; PATCH-NEXT:   .short  2
-; LiveOut Entry 1: %rsp (8 bytes)
+; Num LiveOut Entries: 7
+; PATCH-NEXT:   .short 7
+; LiveOut Entry 1:
+; PATCH-NEXT:   .short 3
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 2: %rsp (8 bytes)
 ; PATCH-NEXT:   .short  7
 ; PATCH-NEXT:   .byte 0
 ; PATCH-NEXT:   .byte 8
-; LiveOut Entry 2: %ymm2 (16 bytes) --> %xmm2
+; LiveOut Entry 3:
+; PATCH-NEXT:   .short 12
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 4:
+; PATCH-NEXT:   .short 13
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 5:
+; PATCH-NEXT:   .short 14
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 6:
+; PATCH-NEXT:   .short 15
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 7: %ymm2 (16 bytes) --> %xmm2
 ; PATCH-NEXT:   .short  19
 ; PATCH-NEXT:   .byte 0
 ; PATCH-NEXT:   .byte 16
@@ -164,13 +225,33 @@ entry:
 ; Padding
 ; PATCH-NEXT:   .p2align  3
 ; PATCH-NEXT:   .short  0
-; Num LiveOut Entries: 2
-; PATCH-NEXT:   .short  2
-; LiveOut Entry 1: %rsp (8 bytes)
+; Num LiveOut Entries: 7
+; PATCH-NEXT:   .short 7
+; LiveOut Entry 1:
+; PATCH-NEXT:   .short 3
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 2: %rsp (8 bytes)
 ; PATCH-NEXT:   .short  7
 ; PATCH-NEXT:   .byte 0
 ; PATCH-NEXT:   .byte 8
-; LiveOut Entry 2: %ymm2 (16 bytes) --> %xmm2
+; LiveOut Entry 3:
+; PATCH-NEXT:   .short 12
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 4:
+; PATCH-NEXT:   .short 13
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 5:
+; PATCH-NEXT:   .short 14
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 6:
+; PATCH-NEXT:   .short 15
+; PATCH-NEXT:   .byte 0
+; PATCH-NEXT:   .byte 8
+; LiveOut Entry 7: %ymm2 (16 bytes) --> %xmm2
 ; PATCH-NEXT:   .short  19
 ; PATCH-NEXT:   .byte 0
 ; PATCH-NEXT:   .byte 16
diff --git a/llvm/test/CodeGen/X86/tailcall-range.ll b/llvm/test/CodeGen/X86/tailcall-range.ll
new file mode 100644
index 000000000000..6ae7405ebc4a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcall-range.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
+
+define range(i32 0, 2) i32 @foo(ptr %this) {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    retq
+entry:
+  %call = load volatile i1, ptr %this, align 1
+  %spec.select = zext i1 %call to i32
+  ret i32 %spec.select
+}
+
+define range(i32 0, 2) i32 @bar(ptr %this) {
+; CHECK-LABEL: bar:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    jmp foo@PLT # TAILCALL
+entry:
+  %ret = musttail call i32 @foo(ptr null)
+  ret i32 %ret
+}
+
+declare i64 @llvm.llround.f32(float) nounwind readnone
+define range(i64 0, 8) i64 @testmsxs(float %x) {
+; CHECK-LABEL: testmsxs:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    jmp llroundf@PLT # TAILCALL
+entry:
+  %ret = tail call i64 @llvm.llround.f32(float %x)
+  ret i64 %ret
+}
+
+declare i32 @callee()
+
+define range(i32 0, 2) i32 @func_with_range_attr() {
+; CHECK-LABEL: func_with_range_attr:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    jmp callee@PLT # TAILCALL
+entry:
+  %ret = musttail call i32 @callee()
+  ret i32 %ret
+}
+
+define i32 @call_with_range_attr() {
+; CHECK-LABEL: call_with_range_attr:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    jmp callee@PLT # TAILCALL
+entry:
+  %ret = musttail call range(i32 0, 2) i32 @callee()
+  ret i32 %ret
+}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index 6d5fc9ed0ab5..f105e065866a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -482,14 +482,12 @@ define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,2,3,8,9,14,15]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
 ; AVX-NEXT:    vmovdqa %xmm3, (%rsi)
 ; AVX-NEXT:    vmovdqa %xmm4, (%rdx)
 ; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
@@ -827,27 +825,25 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5],xmm7[6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm7, %ymm2
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm6[2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5,6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3,4],xmm8[5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm5[0],xmm9[1,2],xmm5[3],xmm9[4,5],xmm5[6],xmm9[7]
+; AVX-NEXT:    vpshufb %xmm8, %xmm9, %xmm8
+; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm8, %ymm7
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[2,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,2,3,8,9,14,15]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6],xmm5[7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX-NEXT:    vmovaps %ymm2, (%rsi)
-; AVX-NEXT:    vmovdqa %xmm8, (%rdx)
-; AVX-NEXT:    vmovdqa %xmm7, 16(%rdx)
-; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT:    vmovaps %ymm7, (%rdx)
+; AVX-NEXT:    vmovaps %ymm0, (%rcx)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -1472,105 +1468,87 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX-LABEL: load_i16_stride3_vf32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa 80(%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 80(%rdi), %xmm2
+; AVX-NEXT:    vmovdqa 64(%rdi), %xmm5
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm3
-; AVX-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm8
-; AVX-NEXT:    vmovdqa 32(%rdi), %xmm6
-; AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[0,3,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm8[1],xmm5[2,3],xmm8[4],xmm5[5,6],xmm8[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
-; AVX-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
-; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm7[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 176(%rdi), %xmm4
-; AVX-NEXT:    vmovdqa 160(%rdi), %xmm7
-; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
-; AVX-NEXT:    vpshufb %xmm9, %xmm11, %xmm11
+; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm3
+; AVX-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX-NEXT:    vmovdqa 48(%rdi), %xmm6
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm6[0,3,2,3,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
+; AVX-NEXT:    vpshufb %xmm12, %xmm7, %xmm7
+; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 176(%rdi), %xmm7
+; AVX-NEXT:    vmovdqa 160(%rdi), %xmm8
+; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
+; AVX-NEXT:    vpshufb %xmm9, %xmm10, %xmm10
 ; AVX-NEXT:    vmovdqa 144(%rdi), %xmm9
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm9[0,3,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4,5,6,7]
-; AVX-NEXT:    vmovdqa 112(%rdi), %xmm12
-; AVX-NEXT:    vmovdqa 96(%rdi), %xmm13
-; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7]
-; AVX-NEXT:    vpshufb %xmm10, %xmm14, %xmm10
-; AVX-NEXT:    vmovdqa 128(%rdi), %xmm14
-; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm14[0,1,2,1]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm9[0,3,2,3,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm11[0,1,2],xmm10[3,4,5,6,7]
+; AVX-NEXT:    vmovdqa 112(%rdi), %xmm10
+; AVX-NEXT:    vmovdqa 96(%rdi), %xmm11
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6],xmm10[7]
+; AVX-NEXT:    vpshufb %xmm12, %xmm14, %xmm14
+; AVX-NEXT:    vmovdqa 128(%rdi), %xmm12
+; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm12[0,1,2,1]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm15[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm10, %ymm3
-; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX-NEXT:    vpshufb %xmm10, %xmm11, %xmm11
-; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm15
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm11[3,4,5,6,7]
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm5[0,1],xmm8[2],xmm5[3,4],xmm8[5],xmm5[6,7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
-; AVX-NEXT:    vpshufb %xmm1, %xmm15, %xmm15
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13]
-; AVX-NEXT:    # xmm0 = mem[0,0]
-; AVX-NEXT:    vpshufb %xmm0, %xmm6, %xmm11
-; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm11[5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
-; AVX-NEXT:    vpshufb %xmm10, %xmm11, %xmm10
-; AVX-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7]
-; AVX-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
-; AVX-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
-; AVX-NEXT:    vmovddup {{.*#+}} xmm5 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15]
-; AVX-NEXT:    # xmm5 = mem[0,0]
-; AVX-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
-; AVX-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
-; AVX-NEXT:    vpshufb %xmm5, %xmm14, %xmm5
-; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
-; AVX-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = xmm6[0],mem[1],xmm6[2,3],mem[4],xmm6[5,6],mem[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7]
-; AVX-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
-; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm9[2,1,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm6[1],xmm14[2,3],xmm6[4],xmm14[5,6],xmm6[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT:    vpshufb %xmm15, %xmm14, %xmm14
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7]
+; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6],xmm9[7]
+; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm11[0,1],xmm10[2],xmm11[3,4],xmm10[5],xmm11[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm12[0],xmm13[1,2],xmm12[3],xmm13[4,5],xmm12[6],xmm13[7]
+; AVX-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm13, %ymm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5,6],xmm2[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, 32(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm6, (%rsi)
-; AVX-NEXT:    vmovdqa %xmm0, 32(%rdx)
-; AVX-NEXT:    vmovdqa %xmm3, 48(%rdx)
-; AVX-NEXT:    vmovdqa %xmm15, (%rdx)
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vmovaps %xmm0, 16(%rdx)
-; AVX-NEXT:    vmovdqa %xmm5, 32(%rcx)
-; AVX-NEXT:    vmovdqa %xmm4, 48(%rcx)
-; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6],xmm4[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
+; AVX-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm9[2,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2,3],xmm12[4],xmm4[5,6],xmm12[7]
+; AVX-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, 32(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm3, (%rsi)
+; AVX-NEXT:    vmovaps %ymm0, 32(%rdx)
+; AVX-NEXT:    vmovaps %ymm14, (%rdx)
+; AVX-NEXT:    vmovaps %ymm2, 32(%rcx)
+; AVX-NEXT:    vmovaps %ymm1, (%rcx)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -2724,249 +2702,216 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX-LABEL: load_i16_stride3_vf64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    subq $456, %rsp # imm = 0x1C8
-; AVX-NEXT:    vmovdqa 272(%rdi), %xmm8
-; AVX-NEXT:    vmovdqa 256(%rdi), %xmm2
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7]
-; AVX-NEXT:    vmovdqa %xmm2, %xmm13
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    subq $408, %rsp # imm = 0x198
+; AVX-NEXT:    vmovdqa 176(%rdi), %xmm6
+; AVX-NEXT:    vmovdqa 160(%rdi), %xmm5
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa 240(%rdi), %xmm2
+; AVX-NEXT:    vmovdqa 144(%rdi), %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
-; AVX-NEXT:    vmovdqa 208(%rdi), %xmm10
-; AVX-NEXT:    vmovdqa 192(%rdi), %xmm9
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6],xmm10[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX-NEXT:    vmovdqa 224(%rdi), %xmm4
-; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
+; AVX-NEXT:    vmovdqa 112(%rdi), %xmm10
+; AVX-NEXT:    vmovdqa 96(%rdi), %xmm7
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6],xmm10[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
+; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX-NEXT:    vmovdqa 128(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 80(%rdi), %xmm15
-; AVX-NEXT:    vmovdqa 64(%rdi), %xmm11
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm11[0,1],xmm15[2],xmm11[3,4],xmm15[5],xmm11[6,7]
-; AVX-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 368(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 352(%rdi), %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
+; AVX-NEXT:    vmovdqa %xmm0, %xmm14
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm7
-; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 48(%rdi), %xmm6
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm6[0,3,2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 336(%rdi), %xmm9
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm9[0,3,2,3,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5,6],xmm7[7]
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX-NEXT:    vmovdqa 32(%rdi), %xmm5
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm5[0,1,2,1]
+; AVX-NEXT:    vmovdqa 304(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 288(%rdi), %xmm3
+; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
+; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX-NEXT:    vmovdqa 320(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 176(%rdi), %xmm3
-; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 160(%rdi), %xmm2
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 272(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 256(%rdi), %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
 ; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX-NEXT:    vmovdqa 144(%rdi), %xmm3
-; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa 240(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7]
-; AVX-NEXT:    vmovdqa 112(%rdi), %xmm3
+; AVX-NEXT:    vmovdqa 208(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovdqa 192(%rdi), %xmm3
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 96(%rdi), %xmm12
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm12[0],xmm3[1],xmm12[2,3],xmm3[4],xmm12[5,6],xmm3[7]
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX-NEXT:    vmovdqa 128(%rdi), %xmm4
-; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
+; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX-NEXT:    vmovdqa 224(%rdi), %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovdqa 368(%rdi), %xmm3
-; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 352(%rdi), %xmm2
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa 80(%rdi), %xmm13
+; AVX-NEXT:    vmovdqa 64(%rdi), %xmm11
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
+; AVX-NEXT:    vmovdqa %xmm11, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vmovdqa 336(%rdi), %xmm2
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,3,2,3,4,5,6,7]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm1[3,4,5,6,7]
-; AVX-NEXT:    vmovdqa 304(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa 288(%rdi), %xmm2
-; AVX-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX-NEXT:    vmovdqa 320(%rdi), %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[0,1,2,1]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm14[6,7]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm13[2],xmm8[3,4],xmm13[5],xmm8[6,7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm14
-; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm13
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm14[3,4,5,6,7]
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm11[2],xmm15[3,4],xmm11[5],xmm15[6,7]
-; AVX-NEXT:    vpshufb %xmm1, %xmm13, %xmm13
-; AVX-NEXT:    vmovdqa %xmm1, %xmm8
-; AVX-NEXT:    vpshufb %xmm3, %xmm6, %xmm14
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm13[3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vmovdqa 16(%rdi), %xmm4
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7]
+; AVX-NEXT:    vmovdqa %xmm0, %xmm15
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
-; AVX-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13]
-; AVX-NEXT:    # xmm0 = mem[0,0]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm0, %xmm11, %xmm15
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4],xmm15[5,6,7]
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm13 = xmm1[0,1],mem[2],xmm1[3,4],mem[5],xmm1[6,7]
-; AVX-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
-; AVX-NEXT:    vmovdqa %xmm5, %xmm4
-; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm15
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm13[0,1,2,3,4],xmm15[5,6,7]
-; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm13 = xmm5[0,1],mem[2],xmm5[3,4],mem[5],xmm5[6,7]
-; AVX-NEXT:    vpshufb %xmm8, %xmm13, %xmm13
-; AVX-NEXT:    vmovdqa %xmm8, %xmm6
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm3, %xmm5, %xmm15
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm15[0,1,2],xmm13[3,4,5,6,7]
+; AVX-NEXT:    vpshufb %xmm8, %xmm1, %xmm0
+; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
+; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm1[0,1,2,1]
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6,5]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm12[6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm12[0,1],xmm8[2],xmm12[3,4],xmm8[5],xmm12[6,7]
-; AVX-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm0, %xmm7, %xmm15
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4],xmm15[5,6,7]
-; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6],xmm8[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm10[2],xmm7[3,4],xmm10[5],xmm7[6,7]
+; AVX-NEXT:    vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm12 = mem[0],xmm12[1,2],mem[3],xmm12[4,5],mem[6],xmm12[7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm14[0,1],mem[2],xmm14[3,4],mem[5],xmm14[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6],xmm9[7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT:    vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm12 = xmm12[0,1],mem[2],xmm12[3,4],mem[5],xmm12[6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4],xmm0[5,6,7]
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1,2],xmm14[3],xmm12[4,5],xmm14[6],xmm12[7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm1[0],xmm12[1,2],xmm1[3],xmm12[4,5],xmm1[6],xmm12[7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm12, %xmm12
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
-; AVX-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4,5,6,7]
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7]
-; AVX-NEXT:    vmovddup {{.*#+}} xmm13 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15]
-; AVX-NEXT:    # xmm13 = mem[0,0]
-; AVX-NEXT:    vpshufb %xmm13, %xmm11, %xmm3
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
-; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3,4],xmm3[5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3],xmm15[4],xmm0[5,6],xmm15[7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT:    vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm12 = mem[0,1],xmm12[2],mem[3,4],xmm12[5],mem[6,7]
+; AVX-NEXT:    vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm12 = mem[0],xmm12[1,2],mem[3],xmm12[4,5],mem[6],xmm12[7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm8[2,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm10[0,1],xmm7[2],xmm10[3,4],xmm7[5],xmm10[6,7]
+; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm12 = xmm12[0],mem[1],xmm12[2,3],mem[4],xmm12[5,6],mem[7]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX-NEXT:    vpshufb %xmm1, %xmm12, %xmm12
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm8
 ; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm0 = mem[0,1],xmm0[2],mem[3,4],xmm0[5],mem[6,7]
-; AVX-NEXT:    vpshufb %xmm13, %xmm4, %xmm4
-; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3,4],xmm4[5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX-NEXT:    vpshufb %xmm13, %xmm14, %xmm1
-; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
-; AVX-NEXT:    vpshufb %xmm13, %xmm7, %xmm1
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm12[2],xmm8[3,4],xmm12[5],xmm8[6,7]
-; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm15[0],xmm5[1],xmm15[2,3],xmm5[4],xmm15[5,6],xmm5[7]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
+; AVX-NEXT:    vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = mem[0],xmm0[1],mem[2,3],xmm0[4],mem[5,6],xmm0[7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm9[2,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT:    vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm10 = mem[0,1],xmm5[2],mem[3,4],xmm5[5],mem[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6],xmm14[7]
+; AVX-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm9, %ymm0
+; AVX-NEXT:    vmovdqa (%rsp), %xmm5 # 16-byte Reload
+; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm7 = xmm5[0],mem[1],xmm5[2,3],mem[4],xmm5[5,6],mem[7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
+; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[2,1,2,3]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm6 = xmm6[0],mem[1],xmm6[2,3],mem[4],xmm6[5,6],mem[7]
-; AVX-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
-; AVX-NEXT:    vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm7 = mem[2,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm7 = xmm7[0],mem[1],xmm7[2,3],mem[4],xmm7[5,6],mem[7]
-; AVX-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
-; AVX-NEXT:    vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = mem[2,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3,4,5,6,7]
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = mem[0],xmm8[1],mem[2,3],xmm8[4],mem[5,6],xmm8[7]
-; AVX-NEXT:    vpshufb %xmm5, %xmm8, %xmm5
-; AVX-NEXT:    vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; AVX-NEXT:    # xmm8 = mem[2,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3,4,5,6,7]
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm8, 96(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm8, 32(%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm8, (%rsi)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm8, 64(%rsi)
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vmovaps %xmm8, 112(%rdx)
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vmovaps %xmm8, 96(%rdx)
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vmovaps %xmm8, 32(%rdx)
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vmovaps %xmm8, 48(%rdx)
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vmovaps %xmm8, (%rdx)
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vmovaps %xmm8, 64(%rdx)
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vmovaps %xmm8, 16(%rdx)
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT:    vmovaps %xmm8, 80(%rdx)
-; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
-; AVX-NEXT:    vmovdqa %xmm5, 48(%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, 96(%rcx)
-; AVX-NEXT:    vmovdqa %xmm4, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm7, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm3, 64(%rcx)
-; AVX-NEXT:    vmovdqa %xmm6, 80(%rcx)
-; AVX-NEXT:    vmovdqa %xmm1, 112(%rcx)
-; AVX-NEXT:    addq $456, %rsp # imm = 0x1C8
+; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = xmm4[0,1],mem[2],xmm4[3,4],mem[5],xmm4[6,7]
+; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm3 = xmm4[0],mem[1],xmm4[2,3],mem[4],xmm4[5,6],mem[7]
+; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm3, %ymm3
+; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6],xmm11[7]
+; AVX-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
+; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm15[2,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT:    vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = xmm4[0,1],mem[2],xmm4[3,4],mem[5],xmm4[6,7]
+; AVX-NEXT:    vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3],mem[4],xmm4[5,6],mem[7]
+; AVX-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, (%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, 64(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, 96(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, 32(%rsi)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, 64(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, (%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, 96(%rdx)
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm2, 32(%rdx)
+; AVX-NEXT:    vmovaps %ymm1, 64(%rcx)
+; AVX-NEXT:    vmovaps %ymm3, (%rcx)
+; AVX-NEXT:    vmovaps %ymm0, 96(%rcx)
+; AVX-NEXT:    vmovaps %ymm8, 32(%rcx)
+; AVX-NEXT:    addq $408, %rsp # imm = 0x198
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index 6c978da50d53..1ddd8166c998 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -456,9 +456,8 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm6[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[2,2,3,3]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,2,3,14,15,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
 ; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
@@ -1198,22 +1197,26 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX512-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm3[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm7
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm7
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
 ; AVX512-NEXT:    vpbroadcastw 74(%rdi), %xmm6
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2,3,14,15,u,u,6,7,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3],xmm7[4,5],xmm3[6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm6
@@ -1230,22 +1233,22 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm5[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm4[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
 ; AVX512-NEXT:    vmovdqa %xmm2, (%rsi)
-; AVX512-NEXT:    vmovdqa %xmm3, (%rdx)
+; AVX512-NEXT:    vmovdqa %xmm5, (%rdx)
 ; AVX512-NEXT:    vmovdqa %xmm8, (%rcx)
 ; AVX512-NEXT:    vmovdqa %xmm6, (%r8)
 ; AVX512-NEXT:    vmovdqa %xmm1, (%r9)
@@ -1272,9 +1275,10 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
 ; AVX512-FCP-NEXT:    vpbroadcastw 74(%rdi), %xmm6
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
@@ -1321,22 +1325,26 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm3[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm7
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm7
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
 ; AVX512DQ-NEXT:    vpbroadcastw 74(%rdi), %xmm6
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2,3,14,15,u,u,6,7,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3],xmm7[4,5],xmm3[6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm6
@@ -1353,22 +1361,22 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm5[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm4[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa %xmm3, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa %xmm5, (%rdx)
 ; AVX512DQ-NEXT:    vmovdqa %xmm8, (%rcx)
 ; AVX512DQ-NEXT:    vmovdqa %xmm6, (%r8)
 ; AVX512DQ-NEXT:    vmovdqa %xmm1, (%r9)
@@ -1395,9 +1403,10 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
 ; AVX512DQ-FCP-NEXT:    vpbroadcastw 74(%rdi), %xmm6
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
@@ -2052,11 +2061,10 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm8[2,2,2,2,4,5,6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3,4],xmm9[5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5]
 ; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm10
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
@@ -2071,13 +2079,17 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm12[2,1,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3],xmm9[4,5],xmm10[6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm10[1,1,1,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm11[3,1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6],xmm9[7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,0,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15]
@@ -2161,10 +2173,10 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-FP-LABEL: load_i16_stride6_vf16:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
@@ -2173,106 +2185,110 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm10
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm7
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm7[2,1,0,3]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7]
-; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3]
-; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm5[2,3]
+; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm5[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
+; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm11, %xmm10
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm10
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,6,5,6,4]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm8[2,1,0,3]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm11[2,1,2,3]
 ; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm12, %xmm12
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6],xmm0[7]
+; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm15, %ymm9
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm13[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2,3],xmm4[4],xmm10[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm10, %ymm7, %ymm7
+; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0]
+; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm4, %ymm7, %ymm4
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm5[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm2, %ymm5, %ymm2
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm3, %ymm0
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rsi)
-; AVX2-FP-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm1, (%rsi)
+; AVX2-FP-NEXT:    vmovdqa %ymm5, (%rdx)
 ; AVX2-FP-NEXT:    vmovdqa %ymm8, (%rcx)
 ; AVX2-FP-NEXT:    vmovdqa %ymm9, (%r8)
-; AVX2-FP-NEXT:    vmovdqa %ymm5, (%r9)
+; AVX2-FP-NEXT:    vmovdqa %ymm3, (%r9)
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FP-NEXT:    vmovdqa %ymm1, (%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i16_stride6_vf16:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
@@ -2281,97 +2297,101 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm7
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm7[2,1,0,3]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7]
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3]
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm5[2,3]
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm5[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
+; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm11, %xmm10
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm10
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm8[2,1,0,3]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm11[2,1,2,3]
 ; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6],xmm0[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm15, %ymm9
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm13[3,1,2,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2,3],xmm4[4],xmm10[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm10, %ymm7, %ymm7
+; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0]
+; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm4, %ymm7, %ymm4
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm5[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm2, %ymm5, %ymm2
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm3, %ymm0
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm1, (%rsi)
+; AVX2-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm9, (%r8)
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, (%r9)
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, (%r9)
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT:    vmovdqa %ymm1, (%rax)
+; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
@@ -2383,11 +2403,10 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm8
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm8[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm5[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3,4],xmm8[5,6,7]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5]
 ; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm9
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3]
 ; AVX512-NEXT:    vinserti128 $1, 96(%rdi), %ymm1, %ymm7
@@ -2601,11 +2620,10 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm8
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm8[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm5[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3,4],xmm8[5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm9
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3]
 ; AVX512DQ-NEXT:    vinserti128 $1, 96(%rdi), %ymm1, %ymm7
@@ -4101,50 +4119,50 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-LABEL: load_i16_stride6_vf32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    subq $488, %rsp # imm = 0x1E8
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX2-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm5
 ; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqu %ymm7, (%rsp) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm10
 ; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm11
 ; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm3
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3]
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
-; AVX2-NEXT:    vpshufb %ymm8, %ymm1, %ymm6
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm0
+; AVX2-NEXT:    vpshufb %ymm6, %ymm1, %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm2, %xmm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm3[2,2,2,2,4,5,6,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6,7]
 ; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-NEXT:    vpblendvb %ymm0, %ymm9, %ymm6, %ymm4
+; AVX2-NEXT:    vpblendvb %ymm0, %ymm9, %ymm4, %ymm4
 ; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa %ymm11, %ymm4
+; AVX2-NEXT:    vmovdqa %ymm11, %ymm5
 ; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
-; AVX2-NEXT:    vpshufb %xmm7, %xmm6, %xmm9
-; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm7[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
+; AVX2-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
+; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm9
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm9[2,2,2,2,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7]
-; AVX2-NEXT:    vpshufb %ymm8, %ymm11, %ymm8
-; AVX2-NEXT:    vpblendvb %ymm0, %ymm9, %ymm8, %ymm5
-; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX2-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
+; AVX2-NEXT:    vpshufb %ymm6, %ymm11, %ymm6
+; AVX2-NEXT:    vpblendvb %ymm0, %ymm8, %ymm6, %ymm6
+; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX2-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
@@ -4153,98 +4171,101 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm11, %ymm1
-; AVX2-NEXT:    vpshufb %xmm8, %xmm6, %xmm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6],ymm10[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm7[1,1,2,3]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6],ymm10[7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm9[1,1,2,3]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm5
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[0,2,0,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm9
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm9[0,2,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm2, %xmm3
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0],ymm13[1],ymm14[2,3,4,5],ymm13[6],ymm14[7]
-; AVX2-NEXT:    vpshufb %ymm12, %ymm7, %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm0
+; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7]
+; AVX2-NEXT:    vpshufb %ymm15, %ymm12, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm11
+; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm11[2],ymm4[3,4],ymm11[5],ymm4[6,7]
+; AVX2-NEXT:    vmovdqa %ymm4, %ymm13
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm8
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4],xmm8[5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
+; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm14
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7]
-; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm3[0,1,2,2]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
+; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm14
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm5
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3,4],xmm5[5,6,7]
+; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $146, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
+; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT:    vpshufb %xmm15, %xmm4, %xmm11
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3],xmm11[4,5],xmm9[6],xmm11[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0,1,2],ymm9[3,4,5,6,7],ymm8[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
-; AVX2-NEXT:    vmovdqa %ymm3, %ymm9
-; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm3
-; AVX2-NEXT:    vpshufb %xmm15, %xmm3, %xmm15
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm11[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,2]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm15[0,1,2],xmm10[3],xmm15[4,5],xmm10[6],xmm15[7]
-; AVX2-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm15 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm15, %xmm6
-; AVX2-NEXT:    vextracti128 $1, %ymm15, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm1[0,2,0,3]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm13[2],xmm6[3],xmm13[4,5],xmm6[6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm13 = mem[0],ymm8[1],mem[2,3,4,5],ymm8[6],mem[7]
-; AVX2-NEXT:    vpshufb %ymm12, %ymm13, %ymm12
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm6[0,1,2],ymm10[3,4,5,6,7],ymm6[8,9,10],ymm10[11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm4[0,2,0,3]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm10 = mem[0],ymm10[1],mem[2,3,4,5],ymm10[6],mem[7]
+; AVX2-NEXT:    vpshufb %ymm15, %ymm10, %ymm15
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-NEXT:    vpshufb %ymm5, %ymm7, %ymm7
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm9[2,1,0,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6],xmm6[7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-NEXT:    vpshufb %ymm6, %ymm12, %ymm7
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX2-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3],xmm4[4,5],xmm0[6],xmm4[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufb %xmm7, %xmm3, %xmm0
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,5,5,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm10, %xmm15, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7]
-; AVX2-NEXT:    vpshufb %ymm5, %ymm13, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpshufb %xmm7, %xmm8, %xmm8
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3],xmm8[4,5],xmm1[6],xmm8[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshufb %xmm7, %xmm5, %xmm1
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,5,5,5]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
+; AVX2-NEXT:    vpshufb %ymm6, %ymm10, %ymm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[2,1,0,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6],xmm3[7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm8 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[0,1,2,1]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3]
@@ -4270,21 +4291,20 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa %ymm9, %ymm14
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[0,1,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[0,1,2,1]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm4[0,0,0,0,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,4]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,4]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
@@ -4293,12 +4313,12 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1,2],xmm12[3],xmm15[4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
 ; AVX2-NEXT:    vpshufb %ymm11, %ymm2, %ymm11
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm13 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7]
@@ -4314,10 +4334,10 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
 ; AVX2-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
@@ -4328,7 +4348,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6],ymm9[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7]
@@ -4376,7 +4396,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovaps %ymm6, (%rdx)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm6, 32(%rcx)
-; AVX2-NEXT:    vmovdqa %ymm11, (%rcx)
+; AVX2-NEXT:    vmovdqa %ymm9, (%rcx)
 ; AVX2-NEXT:    vmovdqa %ymm5, 32(%r8)
 ; AVX2-NEXT:    vmovdqa %ymm0, (%r8)
 ; AVX2-NEXT:    vmovdqa %ymm4, 32(%r9)
@@ -4391,139 +4411,140 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-LABEL: load_i16_stride6_vf32:
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    subq $488, %rsp # imm = 0x1E8
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX2-FP-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX2-FP-NEXT:    vmovdqu %ymm5, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm7
+; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm9
-; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm10
 ; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3]
-; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
+; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm3[0,1],ymm2[0,1]
 ; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm1, %ymm6
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm2, %xmm0
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm1, %ymm6
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm2, %xmm0
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7]
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7]
 ; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm10, %ymm6, %ymm4
-; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm8, %ymm6, %ymm5
+; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm6, %xmm10
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm8
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm10, %ymm5
+; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm6, %xmm8
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa %ymm14, %ymm4
-; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm12, %ymm7
-; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm10, %ymm7, %ymm5
-; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7]
+; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm10, %ymm4
+; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm8, %ymm4, %ymm4
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm8 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
 ; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm12, %ymm1
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm8, %xmm3
-; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm10, %ymm1
+; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm7, %xmm3
+; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm6, %xmm4
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm5, %xmm0
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm1
+; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm2, %xmm0
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm1
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3]
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
-; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm11, %xmm1
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0],ymm13[1],ymm4[2,3,4,5],ymm13[6],ymm4[7]
-; AVX2-FP-NEXT:    vpshufb %ymm14, %ymm7, %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7]
+; AVX2-FP-NEXT:    vpshufb %ymm14, %ymm8, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
-; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm1, %xmm6
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm10
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm10, %xmm8
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7]
+; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
+; AVX2-FP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm7, %xmm6
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm4
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm4, %xmm9
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3],xmm9[4,5],xmm6[6],xmm9[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm0, %xmm4
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3,4],ymm0[5],ymm9[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm3, %xmm9
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3],xmm9[4,5],xmm4[6],xmm9[7]
-; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
-; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm9, %xmm15
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm13
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm3, %xmm10
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm5[3],xmm10[4,5],xmm5[6],xmm10[7]
+; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7]
+; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm5, %xmm15
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm13
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3]
-; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm13, %xmm12
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7]
-; AVX2-FP-NEXT:    vpshufb %ymm14, %ymm2, %ymm14
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7]
+; AVX2-FP-NEXT:    vpshufb %ymm14, %ymm1, %ymm14
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7],ymm12[8,9,10],ymm4[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm14 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
-; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3],xmm11[4,5],xmm5[6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2],xmm2[3,4],xmm10[5],xmm2[6],xmm10[7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm7, %ymm7
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm10, %xmm10
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3],xmm10[4,5],xmm1[6],xmm10[7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm3, %xmm1
+; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm8, %ymm8
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm14 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7],ymm2[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm3, %xmm2
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
-; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm9, %xmm1
-; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm13, %xmm3
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
-; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
+; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm5, %xmm2
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm13[1,1,1,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
+; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
@@ -4539,30 +4560,30 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
 ; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm5, %xmm1
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm7
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,1,2,3]
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm11, %xmm7
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm13[0],xmm7[1,2],xmm13[3],xmm7[4,5,6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm8
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[0,3,2,1]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
+; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm11, %xmm12
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm2[2,1,2,0,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm13 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
 ; AVX2-FP-NEXT:    vpshufb %ymm13, %ymm6, %ymm14
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7],ymm14[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0,1,2],ymm3[3,4,5,6,7],ymm14[8,9,10],ymm3[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm14[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm14[2,1,0,3]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm3[0,1],ymm9[2],ymm3[3,4],ymm9[5],ymm3[6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm14[2,1,0,3]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm14, %xmm14
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,6,5,6,4]
@@ -4573,96 +4594,96 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm12
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm12, %xmm9
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm10
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1]
+; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm10, %xmm8
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm0[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7]
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm0[2,1,2,0,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1,2],xmm7[3],xmm8[4,5,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpshufb %ymm13, %ymm15, %ymm10
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm13, %ymm15, %ymm8
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7],ymm8[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
 ; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6],xmm4[7]
 ; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm11, %xmm10
-; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1,2],xmm3[3],xmm10[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm6, %ymm6
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm11, %xmm7
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1,2],xmm2[3],xmm7[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
+; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm6[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
 ; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,5,6,5]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm15, %ymm2
-; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm12, %xmm3
+; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm15, %ymm2
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm10, %xmm5
 ; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2],xmm0[3],xmm5[4,5,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5,6],ymm3[7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm2, %xmm5
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u]
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm8
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7]
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u]
+; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm1, %xmm9
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5],xmm6[6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm9 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6],ymm9[7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm10, %xmm5
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,1]
+; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6],ymm8[7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm10
-; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1]
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u]
 ; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm10, %xmm2
-; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm8, %xmm6
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm10, %xmm2
+; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm9, %xmm7
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX2-FP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, (%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, (%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rcx)
-; AVX2-FP-NEXT:    vmovdqa %ymm9, (%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, (%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm3, (%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rcx)
+; AVX2-FP-NEXT:    vmovdqa %ymm8, (%rcx)
 ; AVX2-FP-NEXT:    vmovdqa %ymm4, 32(%r8)
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, (%r8)
-; AVX2-FP-NEXT:    vmovdqa %ymm3, 32(%r9)
-; AVX2-FP-NEXT:    vmovdqa %ymm5, (%r9)
+; AVX2-FP-NEXT:    vmovdqa %ymm5, 32(%r9)
+; AVX2-FP-NEXT:    vmovdqa %ymm6, (%r9)
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT:    vmovdqa %ymm2, 32(%rax)
 ; AVX2-FP-NEXT:    vmovdqa %ymm1, (%rax)
@@ -4673,139 +4694,140 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-LABEL: load_i16_stride6_vf32:
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    subq $488, %rsp # imm = 0x1E8
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm10
 ; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3]
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm3[0,1],ymm2[0,1]
 ; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm6
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm0
+; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm6
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm0
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7]
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7]
 ; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm10, %ymm6, %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm8, %ymm6, %ymm5
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm10
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm10, %ymm5
+; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm8
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa %ymm14, %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm12, %ymm7
-; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm10, %ymm7, %ymm5
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm10, %ymm4
+; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm8, %ymm4, %ymm4
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
 ; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm12, %ymm1
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm3
-; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm10, %ymm1
+; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm3
+; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm5, %xmm0
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm1
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm0
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm1
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
-; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm1
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm4[0],ymm13[1],ymm4[2,3,4,5],ymm13[6],ymm4[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm8, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
-; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm6
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm10
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm10, %xmm8
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7]
+; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm6
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm4
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm9
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3],xmm9[4,5],xmm6[6],xmm9[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm4
+; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3,4],ymm0[5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm9
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3],xmm9[4,5],xmm4[6],xmm9[7]
-; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
-; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm9, %xmm15
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm13
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm10
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm5[3],xmm10[4,5],xmm5[6],xmm10[7]
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm5, %xmm15
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm13
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3]
-; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm13, %xmm12
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm14
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm14
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7],ymm12[8,9,10],ymm4[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm14 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
-; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3],xmm11[4,5],xmm5[6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2],xmm2[3,4],xmm10[5],xmm2[6],xmm10[7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm10, %xmm10
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3],xmm10[4,5],xmm1[6],xmm10[7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm1
+; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm8, %ymm8
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7],ymm2[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm2
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
-; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm9, %xmm1
-; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm13, %xmm3
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
-; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm2
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm13[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
@@ -4821,30 +4843,30 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
 ; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm1
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm7
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,1,2,3]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm11, %xmm7
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm13[0],xmm7[1,2],xmm13[3],xmm7[4,5,6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm8
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[0,3,2,1]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
+; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm11, %xmm12
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm2[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
 ; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm14
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7],ymm14[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0,1,2],ymm3[3,4,5,6,7],ymm14[8,9,10],ymm3[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm14[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm14[2,1,0,3]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm3[0,1],ymm9[2],ymm3[3,4],ymm9[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm14[2,1,0,3]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,6,5,6,4]
@@ -4855,96 +4877,96 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm12
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm12, %xmm9
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm10
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1]
+; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm8
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm0[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm0[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1,2],xmm7[3],xmm8[4,5,6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm15, %ymm10
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %ymm13, %ymm15, %ymm8
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7],ymm8[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
 ; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6],xmm4[7]
 ; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm10
-; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1,2],xmm3[3],xmm10[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm6, %ymm6
+; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm7
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1,2],xmm2[3],xmm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
+; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm6[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
 ; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,5,6,5]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm15, %ymm2
-; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm12, %xmm3
+; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm15, %ymm2
+; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm10, %xmm5
 ; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2],xmm0[3],xmm5[4,5,6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5,6],ymm3[7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm5
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u]
-; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm8
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
+; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u]
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm9
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5],xmm6[6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm9 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6],ymm9[7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm10, %xmm5
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,1]
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6],ymm8[7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm10
-; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1]
-; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm10, %xmm2
-; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm6
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm10, %xmm2
+; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm9, %xmm7
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rcx)
-; AVX2-FCP-NEXT:    vmovdqa %ymm9, (%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rcx)
+; AVX2-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm4, 32(%r8)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%r8)
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, 32(%r9)
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, (%r9)
+; AVX2-FCP-NEXT:    vmovdqa %ymm5, 32(%r9)
+; AVX2-FCP-NEXT:    vmovdqa %ymm6, (%r9)
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT:    vmovdqa %ymm2, 32(%rax)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm1, (%rax)
@@ -4954,101 +4976,99 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-LABEL: load_i16_stride6_vf32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $136, %rsp
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
-; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm12
+; AVX512-NEXT:    subq $72, %rsp
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7]
-; AVX512-NEXT:    vmovdqa %ymm1, %ymm14
-; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX512-NEXT:    vmovdqa %ymm1, %ymm11
+; AVX512-NEXT:    vmovdqa %ymm0, %ymm14
+; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm1
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,2,0,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
-; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm5
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm15
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
+; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm4
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm13
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm10
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm6
 ; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm7
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm23
-; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm25
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
-; AVX512-NEXT:    vpshufb %xmm8, %xmm5, %xmm7
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm1[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3],xmm7[4,5],xmm9[6],xmm7[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm24
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm26
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm1[2,2,2,2,4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3,4],xmm4[5,6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
+; AVX512-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
 ; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm3, %zmm7, %zmm3
 ; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6],ymm11[7]
-; AVX512-NEXT:    vmovdqa64 %ymm11, %ymm16
-; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm7
-; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm4[0,2,0,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3],xmm9[4,5],xmm7[6,7]
-; AVX512-NEXT:    vinserti128 $1, 96(%rdi), %ymm6, %ymm9
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0],ymm13[1],ymm9[2,3,4,5],ymm13[6],ymm9[7]
-; AVX512-NEXT:    vmovdqa64 %ymm9, %ymm22
-; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm28
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 352(%rdi), %ymm9
+; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm6[2,3],mem[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7]
+; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm16
+; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm7
+; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm5
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm5[0,2,0,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6,7]
+; AVX512-NEXT:    vinserti128 $1, 96(%rdi), %ymm6, %ymm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0],ymm12[1],ymm8[2,3,4,5],ymm12[6],ymm8[7]
+; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm29
+; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm28
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7]
+; AVX512-NEXT:    vmovdqu64 %zmm7, (%rsp) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 352(%rdi), %ymm8
 ; AVX512-NEXT:    vmovdqa 320(%rdi), %ymm10
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm19
-; AVX512-NEXT:    vmovdqa64 %ymm9, %ymm20
-; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm10
-; AVX512-NEXT:    vpshufb %xmm8, %xmm10, %xmm8
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm7[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7]
-; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm9
-; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm8
-; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm8[2,3],mem[2,3]
-; AVX512-NEXT:    vinserti128 $1, 288(%rdi), %ymm8, %ymm13
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0],ymm11[1],ymm13[2,3,4,5],ymm11[6],ymm13[7]
-; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm26
-; AVX512-NEXT:    vmovdqa64 %ymm11, %ymm27
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = ymm8[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,6]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm9, %ymm29
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
-; AVX512-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX512-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm18
+; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm20
+; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm8
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3,4],xmm8[5,6,7]
+; AVX512-NEXT:    vpshufb %xmm9, %xmm10, %xmm9
+; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm10
+; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm9
+; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm9[2,3],mem[2,3]
+; AVX512-NEXT:    vinserti128 $1, 288(%rdi), %ymm9, %ymm15
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm15[0],ymm12[1],ymm15[2,3,4,5],ymm12[6],ymm15[7]
+; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm25
+; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm27
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm12 = ymm9[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm30
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
+; AVX512-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
+; AVX512-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %xmm9, %xmm4, %xmm0
-; AVX512-NEXT:    vpshufb %xmm11, %xmm3, %xmm1
+; AVX512-NEXT:    vpshufb %xmm10, %xmm5, %xmm0
+; AVX512-NEXT:    vpshufb %xmm12, %xmm3, %xmm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm0
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm30
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm22
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm14, %ymm31
+; AVX512-NEXT:    vmovdqa64 %ymm11, %ymm21
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1]
@@ -5056,8 +5076,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3]
@@ -5068,10 +5088,10 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm17
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm24
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm23
 ; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm21
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm19
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1]
@@ -5079,15 +5099,17 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm13
+; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm13
 ; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm12
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm15
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm28
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm29
+; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
@@ -5096,7 +5118,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm14
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm0
 ; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
@@ -5118,8 +5140,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm8, %zmm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm0, %zmm22
-; AVX512-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm18
+; AVX512-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm30
+; AVX512-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm18
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
@@ -5140,15 +5162,15 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm20
 ; AVX512-NEXT:    vpternlogq $184, %zmm5, %zmm17, %zmm20
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm30, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
 ; AVX512-NEXT:    vpshufb %xmm7, %xmm0, %xmm2
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm4
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm3
+; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm4
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1]
@@ -5160,8 +5182,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7]
-; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm8
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm3
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm8
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7]
 ; AVX512-NEXT:    vpshufb %xmm7, %xmm3, %xmm8
 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm7
@@ -5173,23 +5195,24 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    movw $31, %ax
 ; AVX512-NEXT:    kmovw %eax, %k1
 ; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm2 {%k1}
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6],ymm15[7]
+; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm8
+; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm10
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5,6],ymm10[7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm8
-; AVX512-NEXT:    vpshufb %xmm9, %xmm8, %xmm14
+; AVX512-NEXT:    vpshufb %xmm9, %xmm8, %xmm12
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm10[0,3,2,1]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm9[0,1,0,2,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm14[4],xmm10[5],xmm14[6,7]
-; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm14
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm10
-; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm12
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm15 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6,7]
+; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm12
+; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm13
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm10
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm14 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
@@ -5200,7 +5223,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm4, %zmm0
-; AVX512-NEXT:    vpshufb %xmm15, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
@@ -5212,28 +5235,27 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm10[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm12[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
 ; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
 ; AVX512-NEXT:    movw $-2048, %ax # imm = 0xF800
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqa32 %zmm22, %zmm4 {%k1}
+; AVX512-NEXT:    vmovdqa32 %zmm30, %zmm4 {%k1}
 ; AVX512-NEXT:    vmovdqa64 %zmm4, (%rsi)
-; AVX512-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
-; AVX512-NEXT:    vmovdqa32 %zmm18, %zmm4 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm4, (%rdx)
+; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm15 # 64-byte Folded Reload
+; AVX512-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm15, (%rdx)
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vpternlogq $184, %zmm2, %zmm17, %zmm14
+; AVX512-NEXT:    vpternlogq $184, %zmm2, %zmm17, %zmm10
 ; AVX512-NEXT:    vpternlogq $184, %zmm0, %zmm17, %zmm1
 ; AVX512-NEXT:    vmovdqa64 %zmm16, (%rcx)
 ; AVX512-NEXT:    vmovdqa64 %zmm20, (%r8)
-; AVX512-NEXT:    vmovdqa64 %zmm14, (%r9)
+; AVX512-NEXT:    vmovdqa64 %zmm10, (%r9)
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512-NEXT:    addq $136, %rsp
+; AVX512-NEXT:    addq $72, %rsp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
@@ -5516,57 +5538,55 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-LABEL: load_i16_stride6_vf32:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    pushq %rax
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm13
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6],ymm0[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm0
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm14
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm14[0,2,0,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
-; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm4
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm9
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm3
-; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm22
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm23
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm15
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm15, %xmm4
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm16
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm9, %ymm19
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm6[0,2,0,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, 96(%rdi), %ymm3, %ymm12
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm10, %ymm29
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm0
-; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0],ymm1[1],ymm13[2,3],ymm1[4],ymm13[5,6],ymm1[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm18
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm9
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
+; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm2
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm6
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm4
+; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm22
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm24
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3,4],xmm15[5,6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm16
+; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm19
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm21
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm6
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[0,2,0,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3],xmm5[4,5],xmm0[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, 96(%rdi), %ymm4, %ymm12
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm29
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm5[3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm0
+; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm23
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm25
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm2, %xmm7
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm4[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3],xmm7[4,5],xmm11[6],xmm7[7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm9
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm1[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm11, %xmm7
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm8
 ; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm7
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3]
 ; AVX512DQ-NEXT:    vinserti128 $1, 288(%rdi), %ymm7, %ymm11
@@ -5574,9 +5594,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm11, %ymm26
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm27
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm7[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX512DQ-NEXT:    vpternlogq $226, %zmm16, %zmm17, %zmm10
 ; AVX512DQ-NEXT:    movw $-2048, %ax # imm = 0xF800
@@ -5584,32 +5604,32 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1}
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm9
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm14 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm15, %xmm10
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3],xmm10[4,5],xmm5[6],xmm10[7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm9, %xmm8
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3],xmm8[4,5],xmm3[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm15, %xmm10
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm10, %zmm3
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm2, %xmm0
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm5, %xmm0
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm5, %zmm17, %zmm3
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm3, %zmm17, %zmm2
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm20
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm0
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
@@ -5622,7 +5642,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3]
@@ -5637,23 +5657,23 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm0
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm11
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
@@ -5681,74 +5701,74 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5,6],xmm7[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm7, %zmm7
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6],xmm2[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm7, %zmm0, %zmm3
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm7, %zmm0, %zmm4
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm16
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm3, %zmm17, %zmm16
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm4, %zmm17, %zmm16
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm1
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm0, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm0, %xmm2
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm4
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm4[0,3,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm4[0,3,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
 ; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm2, %xmm6
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm2, %xmm5
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm4, %zmm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm6
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm13
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm13[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3],xmm9[4],xmm5[5,6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm4, %zmm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm5, %xmm6
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm13
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm13[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $236, %ymm10, %ymm9, %ymm5
+; AVX512DQ-NEXT:    vpternlogq $236, %ymm10, %ymm8, %ymm6
 ; AVX512DQ-NEXT:    movw $31, %ax
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm5, %zmm0, %zmm3 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm5
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm9
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm5, %xmm14
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm9[0,3,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm7[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm14[4],xmm9[5],xmm14[6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm6, %zmm0, %zmm4 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm6
+; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm6, %xmm14
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm8[0,3,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm14[4],xmm8[5],xmm14[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm14
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm11
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4,5],ymm11[6],ymm14[7]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm14 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4],ymm9[5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm9
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm8
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm14 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
@@ -5756,19 +5776,19 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
 ; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm2[4],xmm9[5],xmm2[6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm0
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm6, %xmm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3],xmm6[4],xmm2[5,6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpternlogq $236, %ymm10, %ymm4, %ymm2
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm5, %xmm2
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpternlogq $236, %ymm10, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7]
@@ -5781,11 +5801,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovaps %zmm2, (%rdx)
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm3, %zmm17, %zmm9
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm4, %zmm17, %zmm8
 ; AVX512DQ-NEXT:    vpternlogq $184, %zmm0, %zmm17, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm18, (%rcx)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm16, (%r8)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, (%r9)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, (%r9)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -8614,18 +8634,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
 ; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm5[2,3],ymm4[2,3]
-; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm5[0,1],ymm4[0,1]
-; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3]
+; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1]
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
 ; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm0[0,1],ymm1[0,1]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3]
 ; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1]
+; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
@@ -8644,11 +8663,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm9, %xmm3
-; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm13
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm13[2,2,2,2,4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm11
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm11[2,2,2,2,4,5,6,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm12, %ymm7
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm10, %ymm7
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm3, %ymm7, %ymm3
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm7
@@ -8658,37 +8677,37 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm8
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm8[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0],ymm15[1],ymm11[2,3,4,5],ymm15[6],ymm11[7]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm7, %ymm11
-; AVX2-NEXT:    vpblendvb %ymm0, %ymm10, %ymm11, %ymm10
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 608(%rdi), %ymm11
-; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm10
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm10, %xmm11
-; AVX2-NEXT:    vextracti128 $1, %ymm10, %xmm6
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm7[0],xmm12[1],xmm7[2,3],xmm12[4],xmm7[5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm7, %ymm13
+; AVX2-NEXT:    vpblendvb %ymm0, %ymm12, %ymm13, %ymm12
+; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 608(%rdi), %ymm13
+; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm12
+; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
+; AVX2-NEXT:    vpshufb %xmm6, %xmm12, %xmm13
+; AVX2-NEXT:    vextracti128 $1, %ymm12, %xmm6
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm6[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm14 = xmm11[0],xmm14[1],xmm11[2,3],xmm14[4],xmm11[5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm11 = mem[0],ymm11[1],mem[2,3,4,5],ymm11[6],mem[7]
-; AVX2-NEXT:    vpshufb %ymm2, %ymm11, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm13, %ymm2
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm14, %ymm2, %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm9, %xmm9
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,3]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,5,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm13 = xmm9[0],xmm13[1],xmm9[2,3],xmm13[4],xmm9[5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
-; AVX2-NEXT:    vpshufb %ymm9, %ymm12, %ymm12
-; AVX2-NEXT:    vpblendvb %ymm0, %ymm13, %ymm12, %ymm12
-; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshufb %ymm9, %ymm10, %ymm10
+; AVX2-NEXT:    vpblendvb %ymm0, %ymm11, %ymm10, %ymm10
+; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
@@ -8703,188 +8722,194 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpshufb %ymm9, %ymm7, %ymm3
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm3, %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufb %ymm9, %ymm11, %ymm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm10, %xmm2
+; AVX2-NEXT:    vpshufb %ymm9, %ymm13, %ymm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
 ; AVX2-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm13 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT:    vextracti128 $1, %ymm13, %xmm14
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3]
+; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm13, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX2-NEXT:    vpshufb %xmm10, %xmm5, %xmm1
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa %ymm2, %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm0
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,2]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
-; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm3
-; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm2
+; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm10 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
-; AVX2-NEXT:    vextracti128 $1, %ymm10, %xmm1
-; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,3]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm10, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm6 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufb %xmm10, %xmm6, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
 ; AVX2-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7]
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa %ymm4, %ymm15
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm3
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2]
-; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm3
 ; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
-; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm1
-; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,3]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm4, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm14
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufb %xmm10, %xmm3, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7]
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa 736(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 704(%rdi), %ymm3
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2]
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
-; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufb %xmm0, %xmm2, %xmm6
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7],ymm1[8,9,10],ymm5[11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX2-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 736(%rdi), %ymm8
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 704(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3,4],ymm8[5],ymm2[6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm2[2,2,2,2,4,5,6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm12
-; AVX2-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[2,2,2,2,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm1, %xmm5
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[0,2,0,3]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm5[0,1],xmm9[2],xmm5[3],xmm9[4,5],xmm5[6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
-; AVX2-NEXT:    vpshufb %ymm15, %ymm5, %ymm15
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3,4],xmm12[5,6,7]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm8, %xmm8
 ; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7],ymm9[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm14, %xmm15
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-NEXT:    vpshufb %xmm8, %xmm13, %xmm13
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm15[2],xmm13[3],xmm15[4,5],xmm13[6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpshufb %ymm13, %ymm0, %ymm14
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm11[3],xmm0[4,5],xmm11[6],xmm0[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7],ymm14[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm8
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb %xmm8, %xmm10, %xmm10
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm0[2],xmm10[3],xmm0[4,5],xmm10[6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT:    vpshufb %ymm13, %ymm10, %ymm10
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX2-NEXT:    vpshufb %xmm15, %xmm10, %xmm10
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[2,2,2,2,4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm11
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3,4],xmm11[5,6,7]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm8, %xmm9
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7]
+; AVX2-NEXT:    vpshufb %xmm10, %xmm1, %xmm8
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm10
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm15 = xmm10[0,2,0,3]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm15 = xmm8[0,1],xmm15[2],xmm8[3],xmm15[4,5],xmm8[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm8 = mem[0],ymm8[1],mem[2,3,4,5],ymm8[6],mem[7]
+; AVX2-NEXT:    vpshufb %ymm13, %ymm8, %ymm13
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm13[0,1,2],ymm9[3,4,5,6,7],ymm13[8,9,10],ymm9[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshufd $198, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm9 = mem[2,1,0,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm9[2],xmm5[3,4],xmm9[5],xmm5[6],xmm9[7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT:    vpshufb %ymm9, %ymm13, %ymm13
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm5[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX2-NEXT:    vpshufb %xmm5, %xmm15, %xmm15
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3],xmm10[4,5],xmm7[6],xmm10[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm15[0,1,2],xmm7[3],xmm15[4,5],xmm7[6],xmm15[7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0,1,2],ymm7[3,4,5,6,7],ymm0[8,9,10],ymm7[11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm13[0,1,2],ymm7[3,4,5,6,7],ymm13[8,9,10],ymm7[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshufd $198, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm7 = mem[2,1,0,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6],xmm7[7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vpshufb %ymm9, %ymm7, %ymm7
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,0,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX2-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm14[2,1,0,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6],xmm4[7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpshufb %ymm13, %ymm4, %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufb %xmm15, %xmm12, %xmm0
+; AVX2-NEXT:    vpshufb %ymm9, %ymm4, %ymm4
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,0,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-NEXT:    vpshufb %xmm5, %xmm12, %xmm4
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
-; AVX2-NEXT:    vpshufb %xmm9, %xmm6, %xmm2
-; AVX2-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
-; AVX2-NEXT:    vpshufb %ymm13, %ymm5, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshufb %xmm5, %xmm11, %xmm2
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
+; AVX2-NEXT:    vpshufb %ymm9, %ymm8, %ymm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm10[2,1,0,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6],xmm3[7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,0,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
@@ -8911,8 +8936,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
@@ -8966,11 +8991,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[0,1,2,1]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[0,1,2,1]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,4]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -8994,26 +9019,26 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm12
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm4[2,1,0,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm13[0,0,0,0,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm14 = xmm4[2,1,0,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm14[0,0,0,0,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,6,5,6,4]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm14[4],xmm4[5,6],xmm14[7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,6,5,6,4]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6],xmm13[7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7]
+; AVX2-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm15 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm15 = xmm4[2,1,2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm4[2,1,2,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm4
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm15[2,1,2,0,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm13[2,1,2,0,4,5,6,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1,2],xmm11[3],xmm10[4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm8, %ymm14, %ymm8
+; AVX2-NEXT:    vpshufb %ymm8, %ymm15, %ymm8
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
@@ -9026,19 +9051,19 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5,6],xmm0[7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vpshufb %ymm5, %ymm8, %ymm8
+; AVX2-NEXT:    vpshufb %ymm7, %ymm8, %ymm8
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,5,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,5,6,5]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6],xmm0[7]
@@ -9046,13 +9071,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX2-NEXT:    vpshufb %ymm5, %ymm9, %ymm2
+; AVX2-NEXT:    vpshufb %ymm7, %ymm9, %ymm2
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = mem[0,1,2,3,7,5,6,5]
 ; AVX2-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -9066,150 +9090,149 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,5]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm13[1,1,1,1,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX2-NEXT:    vpshufb %ymm5, %ymm14, %ymm2
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm15[3,1,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,7,5,6,5]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm14[1,1,1,1,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7]
+; AVX2-NEXT:    vpshufb %ymm7, %ymm15, %ymm1
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm13[3,1,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[0,1,0,2,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
-; AVX2-NEXT:    vpshufb %xmm7, %xmm5, %xmm3
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
+; AVX2-NEXT:    vpshufb %xmm9, %xmm1, %xmm3
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[0,1,2,3,4],ymm0[5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm11
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm12[0,1,0,2,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
-; AVX2-NEXT:    vpshufb %xmm7, %xmm11, %xmm4
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm11
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[0,3,2,1]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm12[0,1,0,2,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vpshufb %xmm9, %xmm11, %xmm4
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm0[5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm13
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm14[0,1,0,2,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
-; AVX2-NEXT:    vpshufb %xmm7, %xmm13, %xmm10
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4],xmm1[5],xmm10[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm13
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[0,3,2,1]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm14[0,1,0,2,4,5,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; AVX2-NEXT:    vpshufb %xmm9, %xmm13, %xmm10
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5],xmm10[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1,2,3,4],ymm0[5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm10, %xmm15
-; AVX2-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
+; AVX2-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm0[0,1,0,2,4,5,6,7]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm10[0,1,2,3],xmm7[4],xmm10[5],xmm7[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm11, %xmm7
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm12[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5],xmm7[6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4],xmm10[5],xmm9[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm10 = mem[0,1,2,3,4],ymm9[5,6,7]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX2-NEXT:    vpshufb %xmm9, %xmm11, %xmm11
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7]
+; AVX2-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4],xmm7[5],xmm1[6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vpshufb %xmm9, %xmm13, %xmm7
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm14[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
 ; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm13, %xmm6
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm14[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3],xmm6[4],xmm11[5],xmm6[6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
-; AVX2-NEXT:    vpshufb %xmm1, %xmm15, %xmm1
+; AVX2-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 96(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 32(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 64(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, (%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 96(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 32(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 64(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, (%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 96(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 64(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, (%rcx)
-; AVX2-NEXT:    vmovdqa %ymm9, 96(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 32(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 64(%r8)
-; AVX2-NEXT:    vmovdqa %ymm8, (%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm9, 96(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm9, 32(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm9, 64(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm9, (%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm9, 96(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm9, 32(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm9, 64(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm9, (%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm9, 32(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm9, 96(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm9, 64(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm9, (%rcx)
+; AVX2-NEXT:    vmovdqa %ymm6, 96(%r8)
+; AVX2-NEXT:    vmovdqa %ymm8, 32(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm6, 64(%r8)
+; AVX2-NEXT:    vmovdqa %ymm5, (%r8)
 ; AVX2-NEXT:    vmovdqa %ymm10, 96(%r9)
 ; AVX2-NEXT:    vmovdqa %ymm2, 32(%r9)
 ; AVX2-NEXT:    vmovdqa %ymm4, (%r9)
 ; AVX2-NEXT:    vmovdqa %ymm3, 64(%r9)
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    vmovdqa %ymm0, 96(%rax)
-; AVX2-NEXT:    vmovdqa %ymm6, 32(%rax)
-; AVX2-NEXT:    vmovdqa %ymm5, 64(%rax)
-; AVX2-NEXT:    vmovdqa %ymm7, (%rax)
+; AVX2-NEXT:    vmovdqa %ymm7, 32(%rax)
+; AVX2-NEXT:    vmovdqa %ymm1, 64(%rax)
+; AVX2-NEXT:    vmovdqa %ymm11, (%rax)
 ; AVX2-NEXT:    addq $1272, %rsp # imm = 0x4F8
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i16_stride6_vf64:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $1256, %rsp # imm = 0x4E8
+; AVX2-FP-NEXT:    subq $1304, %rsp # imm = 0x518
 ; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovaps 672(%rdi), %ymm2
@@ -9223,7 +9246,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm6
 ; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3]
-; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
 ; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
 ; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3]
@@ -9322,15 +9345,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm0
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm10, %xmm0
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
-; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm14, %xmm1
-; AVX2-FP-NEXT:    vmovdqa %xmm2, %xmm4
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
+; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm10, %xmm0
+; AVX2-FP-NEXT:    vmovdqa %xmm2, %xmm6
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -9341,27 +9365,26 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm9, %xmm2
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm3
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm13, %xmm2
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm13, %xmm3
 ; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm8 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm1
+; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm11 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm1
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
 ; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm8, %xmm1
-; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-FP-NEXT:    vmovdqa %xmm4, %xmm6
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm11, %xmm1
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
 ; AVX2-FP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7]
@@ -9373,41 +9396,41 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm5, %xmm2
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm3
 ; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
+; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm1
+; AVX2-FP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm7 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm1
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
 ; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm4, %xmm1
-; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm7, %xmm1
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
+; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 736(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 704(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm6
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm5, %xmm6
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm2
 ; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm2, %xmm7
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7]
+; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm2, %xmm9
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3],xmm9[4,5],xmm6[6],xmm9[7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
@@ -9416,89 +9439,94 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm12
-; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm12, %xmm6
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm6[0,1,2],xmm0[3],xmm6[4,5],xmm0[6],xmm6[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm14
+; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm14, %xmm4
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm0[3],xmm4[4,5],xmm0[6],xmm4[7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm1, %xmm6
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm11
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm11[2,1,0,3]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm11[2],xmm6[3],xmm11[4,5],xmm6[6,7]
+; AVX2-FP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm9
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,1,0,3]
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm9, %xmm12
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm12[2],xmm4[3],xmm12[4,5],xmm4[6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm6 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
-; AVX2-FP-NEXT:    vpshufb %ymm15, %ymm6, %ymm15
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4,5,6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
+; AVX2-FP-NEXT:    vpshufb %ymm15, %ymm4, %ymm15
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm12[0,1,2],ymm6[3,4,5,6,7],ymm12[8,9,10],ymm6[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm11 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm10, %xmm7
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
-; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm14, %xmm15
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm10, %xmm6
+; AVX2-FP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm10 = mem[1,1,1,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6],xmm10[7]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm0, %ymm14
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm15, %xmm15
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7],ymm14[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,5,5]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[3],xmm1[4,5],xmm13[6],xmm1[7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm8, %xmm0
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm8, %xmm8
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm8, %ymm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm8, %xmm8
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm11, %xmm0
+; AVX2-FP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm1 = mem[1,1,1,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3],xmm1[4,5],xmm8[6],xmm1[7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm4, %xmm0
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm7, %xmm0
+; AVX2-FP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm1 = mem[1,1,1,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3],xmm1[4,5],xmm5[6],xmm1[7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm12, %xmm0
-; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
-; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
-; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm13, %xmm2
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
-; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm6, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm14, %xmm0
+; AVX2-FP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
+; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm4, %ymm1
+; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm9[1,1,1,1,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
+; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -9506,7 +9534,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
-; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
 ; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9622,7 +9650,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-FP-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
 ; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
 ; AVX2-FP-NEXT:    # xmm3 = mem[0,1,2,3,7,5,6,5]
@@ -9793,13 +9821,13 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa %ymm7, 32(%rax)
 ; AVX2-FP-NEXT:    vmovdqa %ymm6, 64(%rax)
 ; AVX2-FP-NEXT:    vmovdqa %ymm12, (%rax)
-; AVX2-FP-NEXT:    addq $1256, %rsp # imm = 0x4E8
+; AVX2-FP-NEXT:    addq $1304, %rsp # imm = 0x518
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i16_stride6_vf64:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    subq $1256, %rsp # imm = 0x4E8
+; AVX2-FCP-NEXT:    subq $1304, %rsp # imm = 0x518
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %ymm2
@@ -9813,7 +9841,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm6
 ; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3]
-; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3]
@@ -9912,15 +9940,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm0
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm10, %xmm0
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
-; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm14, %xmm1
-; AVX2-FCP-NEXT:    vmovdqa %xmm2, %xmm4
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
+; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm10, %xmm0
+; AVX2-FCP-NEXT:    vmovdqa %xmm2, %xmm6
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -9931,27 +9960,26 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm2
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm3
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm13, %xmm2
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm3
 ; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
+; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm8 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm1
+; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm1
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
 ; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm1
-; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-FCP-NEXT:    vmovdqa %xmm4, %xmm6
+; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm1
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
 ; AVX2-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7]
@@ -9963,41 +9991,41 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm2
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm3
 ; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
+; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm1
+; AVX2-FCP-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm1
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
 ; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm4, %xmm1
-; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm1
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 736(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm6
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm6
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm13, %xmm2, %xmm7
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm9
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3],xmm9[4,5],xmm6[6],xmm9[7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
@@ -10006,89 +10034,94 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm12
-; AVX2-FCP-NEXT:    vpshufb %xmm13, %xmm12, %xmm6
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm6[0,1,2],xmm0[3],xmm6[4,5],xmm0[6],xmm6[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm14
+; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm14, %xmm4
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm0[3],xmm4[4,5],xmm0[6],xmm4[7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm6
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm11
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm11[2,1,0,3]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm11[2],xmm6[3],xmm11[4,5],xmm6[6,7]
+; AVX2-FCP-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm9
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,1,0,3]
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm12
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm12[2],xmm4[3],xmm12[4,5],xmm4[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
-; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm6, %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm4, %ymm15
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm12[0,1,2],ymm6[3,4,5,6,7],ymm12[8,9,10],ymm6[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm10, %xmm7
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
-; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm14, %xmm15
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm6
+; AVX2-FCP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm10 = mem[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm15 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6],xmm10[7]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm15, %xmm15
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7],ymm14[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,5,5]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[3],xmm1[4,5],xmm13[6],xmm1[7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm0
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm8
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm8, %xmm8
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm0
+; AVX2-FCP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm1 = mem[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3],xmm1[4,5],xmm8[6],xmm1[7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm4, %xmm0
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm7, %xmm0
+; AVX2-FCP-NEXT:    vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm1 = mem[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3],xmm1[4,5],xmm5[6],xmm1[7]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm12, %xmm0
-; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
-; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
-; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm2
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
-; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm6, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm14, %xmm0
+; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
+; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm1
+; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm9[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
+; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10096,7 +10129,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
-; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
 ; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -10212,7 +10245,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-FCP-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
 ; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FCP-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    # xmm3 = mem[0,1,2,3,7,5,6,5]
@@ -10383,38 +10416,37 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa %ymm7, 32(%rax)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm6, 64(%rax)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm12, (%rax)
-; AVX2-FCP-NEXT:    addq $1256, %rsp # imm = 0x4E8
+; AVX2-FCP-NEXT:    addq $1304, %rsp # imm = 0x518
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
 ; AVX512-LABEL: load_i16_stride6_vf64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    subq $1480, %rsp # imm = 0x5C8
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
 ; AVX512-NEXT:    vmovdqa 608(%rdi), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 576(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512-NEXT:    vpshufb %xmm10, %xmm1, %xmm0
+; AVX512-NEXT:    vpshufb %xmm9, %xmm1, %xmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm16
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,2,0,3]
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm16
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm20
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
 ; AVX512-NEXT:    vmovdqa 544(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 512(%rdi), %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm2, %xmm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm2
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm21
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -10423,9 +10455,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 384(%rdi), %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX512-NEXT:    vpshufb %xmm10, %xmm11, %xmm1
-; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
+; AVX512-NEXT:    vpshufb %xmm9, %xmm12, %xmm1
+; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm3
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[0,2,0,3]
 ; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm22
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
@@ -10435,57 +10467,55 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti128 $1, 480(%rdi), %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512-NEXT:    vpshufb %ymm5, %ymm2, %ymm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX512-NEXT:    vpshufb %ymm10, %ymm2, %ymm0
 ; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm23
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 736(%rdi), %ymm0
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 704(%rdi), %ymm1
+; AVX512-NEXT:    vmovdqa 640(%rdi), %ymm0
+; AVX512-NEXT:    vmovdqa 736(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; AVX512-NEXT:    vmovdqa 704(%rdi), %ymm2
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm2
-; AVX512-NEXT:    vpshufb %xmm12, %xmm2, %xmm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm8[2,2,2,2,4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm28
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
-; AVX512-NEXT:    vmovdqa 640(%rdi), %ymm1
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3]
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vinserti128 $1, 672(%rdi), %ymm1, %ymm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
+; AVX512-NEXT:    vinserti128 $1, 672(%rdi), %ymm0, %ymm0
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm0
 ; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm17
 ; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm29
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm15
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm15
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm15[0,2,0,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshufb %xmm10, %xmm13, %xmm1
+; AVX512-NEXT:    vpshufb %xmm9, %xmm14, %xmm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
 ; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm30
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
 ; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm6
-; AVX512-NEXT:    vpshufb %xmm12, %xmm6, %xmm2
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3,4],xmm6[5,6,7]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -10493,31 +10523,30 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX512-NEXT:    vpshufb %xmm10, %xmm3, %xmm1
-; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm10
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm10[0,2,0,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
-; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm2
-; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX512-NEXT:    vpshufb %xmm9, %xmm5, %xmm1
+; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm9
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm9[0,2,0,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
+; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],mem[2,3]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vinserti128 $1, 96(%rdi), %ymm2, %ymm2
+; AVX512-NEXT:    vinserti128 $1, 96(%rdi), %ymm3, %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
-; AVX512-NEXT:    vpshufb %ymm5, %ymm7, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX512-NEXT:    vpshufb %ymm10, %ymm7, %ymm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm3[3,4,5,6,7]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa 352(%rdi), %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 320(%rdi), %ymm1
 ; AVX512-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm5
-; AVX512-NEXT:    vpshufb %xmm12, %xmm5, %xmm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3],xmm1[4,5],xmm12[6],xmm1[7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm3
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
 ; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm1
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10526,64 +10555,64 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm31
 ; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm0
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15]
+; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm15, %xmm0
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm15, %xmm0
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm15 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX512-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512-NEXT:    vpshufb %xmm13, %xmm6, %xmm6
+; AVX512-NEXT:    vpshufb %xmm15, %xmm14, %xmm14
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2],xmm14[3],xmm0[4,5],xmm14[6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
+; AVX512-NEXT:    vpshufb %xmm14, %xmm6, %xmm6
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm4, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %xmm12, %xmm10, %xmm0
-; AVX512-NEXT:    vpshufb %xmm15, %xmm3, %xmm3
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm7, %ymm4
+; AVX512-NEXT:    vpshufb %xmm11, %xmm9, %xmm0
+; AVX512-NEXT:    vpshufb %xmm15, %xmm5, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512-NEXT:    vpshufb %ymm2, %ymm7, %ymm4
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %xmm13, %xmm5, %xmm0
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
-; AVX512-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX512-NEXT:    vpshufb %xmm14, %xmm3, %xmm0
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm27
-; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm0
-; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm1
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm0
+; AVX512-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm1
 ; AVX512-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
 ; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm1
-; AVX512-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,5,5,5,5]
+; AVX512-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm0
-; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm1
-; AVX512-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
-; AVX512-NEXT:    vpshufb %xmm15, %xmm11, %xmm3
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7]
+; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm15, %xmm12, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm0
-; AVX512-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm1
-; AVX512-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -10592,8 +10621,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm26
 ; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX512-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
@@ -10801,8 +10830,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm29, %zmm2
 ; AVX512-NEXT:    vpternlogq $184, %zmm2, %zmm22, %zmm28
 ; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX512-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm4
@@ -10819,7 +10848,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
 ; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX512-NEXT:    vpshufb %xmm13, %xmm5, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm21
+; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm20
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm30
@@ -10832,12 +10861,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm4
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm26
+; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm21
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm5, %ymm4
-; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm27
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm26
 ; AVX512-NEXT:    vpternlogq $236, %ymm29, %ymm4, %ymm3
 ; AVX512-NEXT:    movw $31, %ax
 ; AVX512-NEXT:    kmovw %eax, %k1
@@ -10851,7 +10880,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm6
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm20
+; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm27
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
 ; AVX512-NEXT:    vpshufb %xmm13, %xmm6, %xmm4
 ; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm18
@@ -10918,7 +10947,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
 ; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm0
 ; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm3
 ; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm0
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7]
@@ -10927,17 +10956,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm2
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm26[1,1,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm21[1,1,2,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6,7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm0
 ; AVX512-NEXT:    vpshufb %ymm10, %ymm0, %ymm13
 ; AVX512-NEXT:    vpternlogq $236, %ymm29, %ymm13, %ymm3
 ; AVX512-NEXT:    vmovdqa32 %zmm3, %zmm2 {%k1}
 ; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm0
 ; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm0
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7]
@@ -11659,15 +11688,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 512(%rdi), %ymm2
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm2, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm23
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm21
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm22
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa 448(%rdi), %ymm1
@@ -11677,9 +11705,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm15, %xmm2
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[0,2,0,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm22
+; AVX512DQ-NEXT:    vextracti32x4 $1, %ymm15, %xmm21
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm21[0,2,0,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3]
@@ -11691,31 +11718,30 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm3, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm20
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 736(%rdi), %ymm1
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 704(%rdi), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 640(%rdi), %ymm1
+; AVX512DQ-NEXT:    vmovdqa 736(%rdi), %ymm2
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm4, %xmm2
+; AVX512DQ-NEXT:    vmovdqa 704(%rdi), %ymm4
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm4
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm19
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
-; AVX512DQ-NEXT:    vmovdqa 640(%rdi), %ymm2
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vinserti128 $1, 672(%rdi), %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7]
+; AVX512DQ-NEXT:    vinserti128 $1, 672(%rdi), %ymm1, %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3,4,5],ymm4[6],ymm1[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm4, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm4, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm27
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm18
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
 ; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm16, %zmm3
 ; AVX512DQ-NEXT:    movw $-2048, %ax # imm = 0xF800
@@ -11735,43 +11761,41 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm8
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm8, %xmm2
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3,4],xmm8[5,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm17
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm3, %xmm1
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm9
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm2
-; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3]
-; AVX512DQ-NEXT:    vinserti128 $1, 96(%rdi), %ymm2, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm30
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm31
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm5, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm5, %xmm1
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm9
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm9[0,2,0,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],mem[2,3]
+; AVX512DQ-NEXT:    vinserti128 $1, 96(%rdi), %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0],ymm0[1],ymm3[2,3,4,5],ymm0[6],ymm3[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm31
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm30
+; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm7, %ymm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2],ymm3[3,4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa 352(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 320(%rdi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm7
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm7, %xmm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3],xmm1[4,5],xmm12[6],xmm1[7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm3
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
 ; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm1
 ; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
 ; AVX512DQ-NEXT:    vinserti128 $1, 288(%rdi), %ymm1, %ymm0
@@ -11780,15 +11804,15 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm29
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm0
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
 ; AVX512DQ-NEXT:    vpternlogq $226, %zmm17, %zmm16, %zmm6
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1}
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm14, %xmm0
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm14, %xmm0
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm14 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7]
@@ -11798,13 +11822,13 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm4, %zmm0
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm9, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm9, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm5, %xmm2
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm7, %xmm2
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm7, %ymm5
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm3, %xmm2
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,5,5,5]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
@@ -11817,21 +11841,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm5 {%k1}
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm1
 ; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm1
 ; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm2
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm1
 ; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
 ; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm15, %xmm4
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6,7]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7]
@@ -11839,7 +11863,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm2
 ; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
@@ -11849,20 +11873,20 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1}
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX512DQ-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm21
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm20
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm22
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7]
@@ -11874,7 +11898,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm2
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
 ; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -11886,8 +11910,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm16
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm3[2],ymm0[3],ymm3[4],ymm0[5,6],ymm3[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm3
@@ -11915,8 +11939,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm20, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm21, %zmm1
 ; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
 ; AVX512DQ-NEXT:    vpternlogq $184, %zmm1, %zmm2, %zmm3
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm18
@@ -11979,11 +12003,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm28
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm19, %zmm20, %zmm2
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm19, %zmm21, %zmm2
 ; AVX512DQ-NEXT:    vpternlogq $184, %zmm2, %zmm18, %zmm28
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm0
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm2
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
@@ -11994,7 +12018,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm21
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm20
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm0
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm0
@@ -12017,7 +12041,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm27
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm21, %zmm20, %zmm0
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm20, %zmm21, %zmm0
 ; AVX512DQ-NEXT:    vpternlogq $184, %zmm0, %zmm18, %zmm27
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7]
@@ -12045,178 +12069,178 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm22
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm20, %zmm2
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm2, %zmm18, %zmm22
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm20
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm21, %zmm2
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm2, %zmm18, %zmm20
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm4 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm1
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm14
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
-; AVX512DQ-NEXT:    vmovdqu (%rsp), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
+; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm13
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm13[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm5
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm19
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm19
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm5, %xmm3
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm5, %xmm4
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm18
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm23
-; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm10 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm1
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm3
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm30
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm5, %ymm3
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm23
+; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm30, %ymm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
+; AVX512DQ-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
+; AVX512DQ-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm9 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm4
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm30
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm5, %ymm4
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm31
-; AVX512DQ-NEXT:    vpternlogq $236, %ymm20, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpternlogq $236, %ymm22, %ymm4, %ymm2
 ; AVX512DQ-NEXT:    movw $31, %ax
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm2, %zmm0, %zmm23 {%k1}
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm4[0],ymm2[1],ymm4[2,3,4,5],ymm2[6],ymm4[7]
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX512DQ-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm6
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm29
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm29
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm6, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm6, %xmm4
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm16
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm5, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm5, %ymm4
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm21
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm17
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm24
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm15 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
+; AVX512DQ-NEXT:    vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm14 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm3 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm2
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm13
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm13[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm15, %ymm1
-; AVX512DQ-NEXT:    vpternlogq $236, %ymm20, %ymm1, %ymm2
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm11
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm11[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5,6,7]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6],ymm5[7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm9
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm5[0,3,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm8[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm9, %xmm6
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm25
+; AVX512DQ-NEXT:    vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm15 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm15, %xmm2
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm12
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
+; AVX512DQ-NEXT:    vpternlogq $236, %ymm22, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-NEXT:    vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm6 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm10
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm10[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm4 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm8
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm8, %xmm5
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm25
 ; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm2, %zmm0, %zmm25 {%k1}
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm7 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7]
+; AVX512DQ-NEXT:    vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7]
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm6
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm6, %xmm12
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm7, %ymm12
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm21
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm14[1,1,2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm12 = xmm4[0],xmm12[1],xmm4[2,3],xmm12[4],xmm4[5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX512DQ-NEXT:    vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm4, %xmm11
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm5, %ymm11
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm21
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm13[1,1,2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6,7]
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm14
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[0,1,3,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4],xmm14[5],xmm0[6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm13
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm12, %zmm0, %zmm26
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm10, %xmm10
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm30[1,1,2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm11, %zmm0, %zmm26
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm11 = xmm30[1,1,2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm0, %ymm14
-; AVX512DQ-NEXT:    vpternlogq $236, %ymm20, %ymm14, %ymm10
-; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm10, %zmm0, %zmm26 {%k1}
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm0, %ymm13
+; AVX512DQ-NEXT:    vpternlogq $236, %ymm22, %ymm13, %ymm9
+; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm9, %zmm0, %zmm26 {%k1}
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm0, %xmm10
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm9
 ; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm0
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm0[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[0,1,3,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm14[0,1,2,3],xmm10[4],xmm14[5],xmm10[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4],xmm13[5],xmm9[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7]
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm15, %ymm10
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm13[1,1,2,3]
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm14, %ymm9
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm15, %xmm11
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,3]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm12[1],xmm3[2,3],xmm12[4],xmm3[5,6,7]
-; AVX512DQ-NEXT:    vpternlogq $236, %ymm20, %ymm10, %ymm3
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm11[1,1,2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm9, %xmm2
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6,7]
+; AVX512DQ-NEXT:    vpternlogq $236, %ymm22, %ymm9, %ymm11
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7]
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm8, %xmm6
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm6, %zmm1
+; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm11, %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm5, %ymm5
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512DQ-NEXT:    vinserti32x8 $0, %ymm3, %zmm0, %zmm1 {%k1}
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm7, %ymm2
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm6, %xmm3
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5],xmm3[6,7]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovaps %zmm3, (%rsi)
@@ -12234,7 +12258,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
 ; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-NEXT:    vmovaps %zmm1, (%rcx)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 64(%r8)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm27, (%r8)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm21, 64(%r9)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm24, (%r9)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index ea3bf7b9b720..9134e490535b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -1413,22 +1413,20 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,8,9,6,7,4,5]
+; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm9
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm9[2,3]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm2[3],xmm8[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm11
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,10,11,8,9,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3,4],xmm10[5,6,7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm11
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
+; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4,5],xmm10[6,7]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
 ; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
@@ -1451,7 +1449,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
 ; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
@@ -1466,7 +1464,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
 ; AVX2-FP-NEXT:    vmovdqa %xmm5, (%rsi)
 ; AVX2-FP-NEXT:    vmovdqa %xmm6, (%rdx)
-; AVX2-FP-NEXT:    vmovdqa %xmm9, (%rcx)
+; AVX2-FP-NEXT:    vmovdqa %xmm8, (%rcx)
 ; AVX2-FP-NEXT:    vmovdqa %xmm10, (%r8)
 ; AVX2-FP-NEXT:    vmovdqa %xmm11, (%r9)
 ; AVX2-FP-NEXT:    vmovdqa %xmm7, (%r10)
@@ -1492,22 +1490,20 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,8,9,6,7,4,5]
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm9
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm9[2,3]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm2[3],xmm8[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,10,11,8,9,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3,4],xmm10[5,6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
-; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
+; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4,5],xmm10[6,7]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
 ; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
@@ -1530,7 +1526,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
 ; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
@@ -1545,7 +1541,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
 ; AVX2-FCP-NEXT:    vmovdqa %xmm5, (%rsi)
 ; AVX2-FCP-NEXT:    vmovdqa %xmm6, (%rdx)
-; AVX2-FCP-NEXT:    vmovdqa %xmm9, (%rcx)
+; AVX2-FCP-NEXT:    vmovdqa %xmm8, (%rcx)
 ; AVX2-FCP-NEXT:    vmovdqa %xmm10, (%r8)
 ; AVX2-FCP-NEXT:    vmovdqa %xmm11, (%r9)
 ; AVX2-FCP-NEXT:    vmovdqa %xmm7, (%r10)
@@ -1656,19 +1652,17 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,8,9,6,7,4,5]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3,4],xmm8[5,6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,10,11,8,9,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
 ; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
@@ -1817,19 +1811,17 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,8,9,6,7,4,5]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3,4],xmm8[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,10,11,8,9,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
 ; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index a0ea6ddeca7d..afdeebc45ed0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -519,19 +519,14 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX2-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX2-NEXT:    vmovaps 64(%rdi), %ymm2
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm3
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-NEXT:    vpermps %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
 ; AVX2-NEXT:    vpermps %ymm4, %ymm5, %ymm4
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX2-NEXT:    vpermps %ymm2, %ymm4, %ymm4
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-NEXT:    vpermps %ymm5, %ymm6, %ymm5
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
 ; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
@@ -549,19 +544,14 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm2
-; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm3
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm4, %ymm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
 ; AVX2-FP-NEXT:    vpermps %ymm4, %ymm5, %ymm4
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-FP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm4, %ymm4
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm6, %ymm5
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
 ; AVX2-FP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
@@ -579,26 +569,18 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm3, %ymm3
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm4, %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
 ; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm5, %ymm4
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm4, %ymm4
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm6, %ymm5
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7]
-; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,1,4,7]
 ; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rsi)
 ; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rdx)
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rcx)
@@ -899,33 +881,26 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i32_stride3_vf16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovaps 128(%rdi), %ymm1
 ; AVX2-NEXT:    vmovaps 160(%rdi), %ymm0
+; AVX2-NEXT:    vmovaps 128(%rdi), %ymm1
 ; AVX2-NEXT:    vmovaps (%rdi), %ymm2
 ; AVX2-NEXT:    vmovaps 32(%rdi), %ymm3
 ; AVX2-NEXT:    vmovaps 64(%rdi), %ymm4
 ; AVX2-NEXT:    vmovaps 96(%rdi), %ymm5
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [2,5,2,5,2,5,2,5]
-; AVX2-NEXT:    vpermps %ymm4, %ymm6, %ymm7
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm9 = [0,3,6,1,4,7,u,u]
-; AVX2-NEXT:    vpermps %ymm8, %ymm9, %ymm8
-; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-NEXT:    vpermps %ymm0, %ymm6, %ymm6
+; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm7 = [0,3,6,1,4,7,2,5]
+; AVX2-NEXT:    vpermps %ymm6, %ymm7, %ymm6
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
+; AVX2-NEXT:    vpermps %ymm8, %ymm7, %ymm7
+; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm9 = [1,4,7,2,5,0,3,6]
 ; AVX2-NEXT:    vpermps %ymm8, %ymm9, %ymm8
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm8 = [0,0,3,6,0,0,3,6]
-; AVX2-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX2-NEXT:    vpermps %ymm4, %ymm8, %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm11 = [1,4,7,2,5,u,u,u]
-; AVX2-NEXT:    vpermps %ymm10, %ymm11, %ymm10
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7]
-; AVX2-NEXT:    vpermps %ymm0, %ymm8, %ymm8
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
-; AVX2-NEXT:    vpermps %ymm10, %ymm11, %ymm10
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7]
+; AVX2-NEXT:    vpermps %ymm10, %ymm9, %ymm9
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
 ; AVX2-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,u,u,u]
 ; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm2
@@ -937,10 +912,10 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT:    vmovaps %ymm6, 32(%rsi)
-; AVX2-NEXT:    vmovaps %ymm7, (%rsi)
-; AVX2-NEXT:    vmovaps %ymm8, 32(%rdx)
-; AVX2-NEXT:    vmovaps %ymm9, (%rdx)
+; AVX2-NEXT:    vmovaps %ymm7, 32(%rsi)
+; AVX2-NEXT:    vmovaps %ymm6, (%rsi)
+; AVX2-NEXT:    vmovaps %ymm9, 32(%rdx)
+; AVX2-NEXT:    vmovaps %ymm8, (%rdx)
 ; AVX2-NEXT:    vmovaps %ymm0, 32(%rcx)
 ; AVX2-NEXT:    vmovaps %ymm2, (%rcx)
 ; AVX2-NEXT:    vzeroupper
@@ -948,33 +923,26 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-FP-LABEL: load_i32_stride3_vf16:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm0
+; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm5
-; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [2,5,2,5,2,5,2,5]
-; AVX2-FP-NEXT:    vpermps %ymm4, %ymm6, %ymm7
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm9 = [0,3,6,1,4,7,u,u]
-; AVX2-FP-NEXT:    vpermps %ymm8, %ymm9, %ymm8
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm0, %ymm6, %ymm6
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm7 = [0,3,6,1,4,7,2,5]
+; AVX2-FP-NEXT:    vpermps %ymm6, %ymm7, %ymm6
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm8, %ymm7, %ymm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm9 = [1,4,7,2,5,0,3,6]
 ; AVX2-FP-NEXT:    vpermps %ymm8, %ymm9, %ymm8
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm8 = [0,0,3,6,0,0,3,6]
-; AVX2-FP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpermps %ymm4, %ymm8, %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm11 = [1,4,7,2,5,u,u,u]
-; AVX2-FP-NEXT:    vpermps %ymm10, %ymm11, %ymm10
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm0, %ymm8, %ymm8
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm10, %ymm11, %ymm10
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7]
+; AVX2-FP-NEXT:    vpermps %ymm10, %ymm9, %ymm9
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
 ; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,u,u,u]
 ; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
@@ -986,10 +954,10 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rsi)
-; AVX2-FP-NEXT:    vmovaps %ymm7, (%rsi)
-; AVX2-FP-NEXT:    vmovaps %ymm8, 32(%rdx)
-; AVX2-FP-NEXT:    vmovaps %ymm9, (%rdx)
+; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rsi)
+; AVX2-FP-NEXT:    vmovaps %ymm6, (%rsi)
+; AVX2-FP-NEXT:    vmovaps %ymm9, 32(%rdx)
+; AVX2-FP-NEXT:    vmovaps %ymm8, (%rdx)
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rcx)
 ; AVX2-FP-NEXT:    vmovaps %ymm2, (%rcx)
 ; AVX2-FP-NEXT:    vzeroupper
@@ -997,47 +965,37 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-FCP-LABEL: load_i32_stride3_vf16:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [2,5,2,5,2,5,2,5]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm6, %ymm7
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm9 = [0,3,6,1,4,7,u,u]
-; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm9, %ymm8
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm6, %ymm6
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm7 = [0,3,6,1,4,7,2,5]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm7, %ymm6
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm7, %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm9 = [1,4,7,2,5,0,3,6]
 ; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm9, %ymm8
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm8 = [0,0,3,6,0,0,3,6]
-; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm8, %ymm9
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm11 = [1,4,7,2,5,u,u,u]
-; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm11, %ymm10
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm8, %ymm8
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm11, %ymm10
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm10 = [0,1,0,3,0,1,4,7]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm10, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7]
+; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm9, %ymm9
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,u,u,u]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,1,4,7]
 ; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm10, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
 ; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm3, %ymm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rsi)
-; AVX2-FCP-NEXT:    vmovaps %ymm8, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovaps %ymm9, (%rdx)
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rsi)
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovaps %ymm8, (%rdx)
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rcx)
 ; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rcx)
 ; AVX2-FCP-NEXT:    vzeroupper
@@ -1608,317 +1566,261 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i32_stride3_vf32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $136, %rsp
-; AVX2-NEXT:    vmovaps 224(%rdi), %ymm0
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 320(%rdi), %ymm4
-; AVX2-NEXT:    vmovaps 288(%rdi), %ymm7
-; AVX2-NEXT:    vmovaps 352(%rdi), %ymm5
-; AVX2-NEXT:    vmovaps 128(%rdi), %ymm10
-; AVX2-NEXT:    vmovaps (%rdi), %ymm12
-; AVX2-NEXT:    vmovaps 32(%rdi), %ymm6
-; AVX2-NEXT:    vmovaps 64(%rdi), %ymm3
-; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 96(%rdi), %ymm13
-; AVX2-NEXT:    vmovaps 160(%rdi), %ymm11
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5]
-; AVX2-NEXT:    vpermps %ymm11, %ymm1, %ymm8
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7]
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm14 = [0,3,6,1,4,7,u,u]
-; AVX2-NEXT:    vpermps %ymm9, %ymm14, %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-NEXT:    subq $40, %rsp
+; AVX2-NEXT:    vmovaps 256(%rdi), %ymm15
+; AVX2-NEXT:    vmovaps 224(%rdi), %ymm5
+; AVX2-NEXT:    vmovaps 192(%rdi), %ymm3
+; AVX2-NEXT:    vmovaps 352(%rdi), %ymm4
+; AVX2-NEXT:    vmovaps 320(%rdi), %ymm8
+; AVX2-NEXT:    vmovaps 288(%rdi), %ymm10
+; AVX2-NEXT:    vmovaps 160(%rdi), %ymm13
+; AVX2-NEXT:    vmovaps 128(%rdi), %ymm1
+; AVX2-NEXT:    vmovaps (%rdi), %ymm6
+; AVX2-NEXT:    vmovaps 32(%rdi), %ymm9
+; AVX2-NEXT:    vmovaps 64(%rdi), %ymm7
+; AVX2-NEXT:    vmovaps 96(%rdi), %ymm0
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm12 = [0,3,6,1,4,7,2,5]
+; AVX2-NEXT:    vpermps %ymm11, %ymm12, %ymm2
+; AVX2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6],ymm8[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7]
+; AVX2-NEXT:    vpermps %ymm11, %ymm12, %ymm2
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm5, %ymm1, %ymm8
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7]
-; AVX2-NEXT:    vpermps %ymm9, %ymm14, %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7]
+; AVX2-NEXT:    vpermps %ymm11, %ymm12, %ymm2
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm3, %ymm1, %ymm8
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6],ymm6[7]
-; AVX2-NEXT:    vmovaps %ymm12, %ymm3
-; AVX2-NEXT:    vpermps %ymm9, %ymm14, %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
+; AVX2-NEXT:    vpermps %ymm14, %ymm12, %ymm2
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 192(%rdi), %ymm12
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7]
-; AVX2-NEXT:    vpermps %ymm8, %ymm14, %ymm8
-; AVX2-NEXT:    vmovaps 256(%rdi), %ymm15
-; AVX2-NEXT:    vpermps %ymm15, %ymm1, %ymm1
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7]
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,4,7,2,5,u,u,u]
-; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6]
-; AVX2-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX2-NEXT:    vpermps %ymm11, %ymm0, %ymm8
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7]
-; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpermps %ymm5, %ymm0, %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5,6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7]
-; AVX2-NEXT:    vmovaps %ymm6, %ymm8
-; AVX2-NEXT:    vpermps %ymm9, %ymm2, %ymm9
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vpermps %ymm6, %ymm0, %ymm14
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,4,7,2,5,0,3,6]
+; AVX2-NEXT:    vpermps %ymm12, %ymm2, %ymm11
+; AVX2-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm4[0],ymm11[1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7]
+; AVX2-NEXT:    vpermps %ymm11, %ymm2, %ymm11
+; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm7[0],ymm12[1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7]
+; AVX2-NEXT:    vpermps %ymm12, %ymm2, %ymm12
+; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
 ; AVX2-NEXT:    vpermps %ymm14, %ymm2, %ymm2
-; AVX2-NEXT:    vpermps %ymm15, %ymm0, %ymm0
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm10 = [2,5,0,3,6,u,u,u]
-; AVX2-NEXT:    vpermps %ymm2, %ymm10, %ymm2
-; AVX2-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,0,3,4,5,4,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7]
-; AVX2-NEXT:    vpermps %ymm4, %ymm10, %ymm4
-; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
-; AVX2-NEXT:    vpermps %ymm5, %ymm10, %ymm5
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm6[0,1,0,3,4,5,4,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,1,0,3,4,5,4,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7]
+; AVX2-NEXT:    vpermps %ymm8, %ymm1, %ymm8
+; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7]
+; AVX2-NEXT:    vpermps %ymm6, %ymm1, %ymm6
+; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
+; AVX2-NEXT:    vpermps %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[0,1,0,3,4,5,4,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7]
-; AVX2-NEXT:    vpermps %ymm5, %ymm10, %ymm5
-; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm15[0,1,0,3,4,5,4,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
-; AVX2-NEXT:    vmovups (%rsp), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 64(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, (%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 96(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 32(%rsi)
-; AVX2-NEXT:    vmovaps %ymm0, 64(%rdx)
-; AVX2-NEXT:    vmovaps %ymm9, (%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 96(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 32(%rdx)
-; AVX2-NEXT:    vmovaps %ymm5, 64(%rcx)
-; AVX2-NEXT:    vmovaps %ymm3, (%rcx)
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm3, 64(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm3, (%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm3, 96(%rsi)
+; AVX2-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm3, 32(%rsi)
+; AVX2-NEXT:    vmovaps %ymm2, 64(%rdx)
+; AVX2-NEXT:    vmovaps %ymm12, (%rdx)
+; AVX2-NEXT:    vmovaps %ymm11, 96(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm2, 32(%rdx)
+; AVX2-NEXT:    vmovaps %ymm1, 64(%rcx)
+; AVX2-NEXT:    vmovaps %ymm6, (%rcx)
 ; AVX2-NEXT:    vmovaps %ymm4, 96(%rcx)
-; AVX2-NEXT:    vmovaps %ymm2, 32(%rcx)
-; AVX2-NEXT:    addq $136, %rsp
+; AVX2-NEXT:    vmovaps %ymm0, 32(%rcx)
+; AVX2-NEXT:    addq $40, %rsp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i32_stride3_vf32:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $136, %rsp
-; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm0
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm4
-; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm5
-; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm10
-; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm12
-; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm3
-; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm13
-; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm11
-; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5]
-; AVX2-FP-NEXT:    vpermps %ymm11, %ymm1, %ymm8
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7]
-; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm14 = [0,3,6,1,4,7,u,u]
-; AVX2-FP-NEXT:    vpermps %ymm9, %ymm14, %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-FP-NEXT:    subq $40, %rsp
+; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm15
+; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm5
+; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm4
+; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm10
+; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm13
+; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm9
+; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm7
+; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm0
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm12 = [0,3,6,1,4,7,2,5]
+; AVX2-FP-NEXT:    vpermps %ymm11, %ymm12, %ymm2
+; AVX2-FP-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6],ymm8[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm11, %ymm12, %ymm2
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm1, %ymm8
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7]
-; AVX2-FP-NEXT:    vpermps %ymm9, %ymm14, %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm11, %ymm12, %ymm2
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm3, %ymm1, %ymm8
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6],ymm6[7]
-; AVX2-FP-NEXT:    vmovaps %ymm12, %ymm3
-; AVX2-FP-NEXT:    vpermps %ymm9, %ymm14, %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm14, %ymm12, %ymm2
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm12
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7]
-; AVX2-FP-NEXT:    vpermps %ymm8, %ymm14, %ymm8
-; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm15
-; AVX2-FP-NEXT:    vpermps %ymm15, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7]
-; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,4,7,2,5,u,u,u]
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6]
-; AVX2-FP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpermps %ymm11, %ymm0, %ymm8
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm0, %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7]
-; AVX2-FP-NEXT:    vmovaps %ymm6, %ymm8
-; AVX2-FP-NEXT:    vpermps %ymm9, %ymm2, %ymm9
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermps %ymm6, %ymm0, %ymm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,4,7,2,5,0,3,6]
+; AVX2-FP-NEXT:    vpermps %ymm12, %ymm2, %ymm11
+; AVX2-FP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm4[0],ymm11[1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7]
+; AVX2-FP-NEXT:    vpermps %ymm11, %ymm2, %ymm11
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm7[0],ymm12[1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7]
+; AVX2-FP-NEXT:    vpermps %ymm12, %ymm2, %ymm12
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
 ; AVX2-FP-NEXT:    vpermps %ymm14, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vpermps %ymm15, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
-; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm10 = [2,5,0,3,6,u,u,u]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm10, %ymm2
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,0,3,4,5,4,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm4, %ymm10, %ymm4
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm10, %ymm5
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm6[0,1,0,3,4,5,4,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,1,0,3,4,5,4,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm8, %ymm1, %ymm8
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm6, %ymm1, %ymm6
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm1, %ymm1
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[0,1,0,3,4,5,4,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm10, %ymm5
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm15[0,1,0,3,4,5,4,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
-; AVX2-FP-NEXT:    vmovups (%rsp), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, (%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rsi)
-; AVX2-FP-NEXT:    vmovaps %ymm0, 64(%rdx)
-; AVX2-FP-NEXT:    vmovaps %ymm9, (%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rdx)
-; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%rcx)
-; AVX2-FP-NEXT:    vmovaps %ymm3, (%rcx)
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm3, 64(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm3, (%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm3, 96(%rsi)
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rsi)
+; AVX2-FP-NEXT:    vmovaps %ymm2, 64(%rdx)
+; AVX2-FP-NEXT:    vmovaps %ymm12, (%rdx)
+; AVX2-FP-NEXT:    vmovaps %ymm11, 96(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rdx)
+; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rcx)
+; AVX2-FP-NEXT:    vmovaps %ymm6, (%rcx)
 ; AVX2-FP-NEXT:    vmovaps %ymm4, 96(%rcx)
-; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rcx)
-; AVX2-FP-NEXT:    addq $136, %rsp
+; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rcx)
+; AVX2-FP-NEXT:    addq $40, %rsp
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i32_stride3_vf32:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    subq $104, %rsp
-; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm6
+; AVX2-FCP-NEXT:    subq $72, %rsp
+; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm12
-; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5]
-; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm8, %ymm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm14 = [0,3,6,1,4,7,u,u]
-; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm14, %ymm9
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm8, %ymm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm14, %ymm9
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm8, %ymm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7]
-; AVX2-FCP-NEXT:    vmovaps %ymm2, %ymm7
-; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm14, %ymm9
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm14, %ymm9
-; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm8, %ymm8
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm13
+; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm14
+; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm8
+; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm9
+; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6],ymm14[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm12 = [0,3,6,1,4,7,2,5]
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm12, %ymm3
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm12, %ymm3
+; AVX2-FCP-NEXT:    vmovups %ymm3, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm12, %ymm3
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vmovaps %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm12, %ymm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm1 = [1,4,7,2,5,u,u,u]
-; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm1, %ymm8
-; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6]
-; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm0, %ymm9
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm1, %ymm8
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm0, %ymm9
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vmovaps %ymm7, %ymm9
-; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm1, %ymm8
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm0, %ymm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm10 = [0,1,0,3,0,1,4,7]
-; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm10, %ymm12
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm13 = [2,5,0,3,6,u,u,u]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm13, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm10, %ymm6
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm13, %ymm3
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm10, %ymm5
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm13, %ymm4
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm10, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm13, %ymm5
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rsi)
-; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovaps %ymm0, 64(%rdx)
-; AVX2-FCP-NEXT:    vmovaps %ymm8, (%rdx)
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,4,7,2,5,0,3,6]
+; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm3, %ymm5
+; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm3, %ymm5
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm3, %ymm11
+; AVX2-FCP-NEXT:    vmovaps %ymm1, %ymm0
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm2[0],ymm12[1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7]
+; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm3, %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6],ymm10[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,0,3,6,1,4,7]
+; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm2, %ymm7
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm2, %ymm4
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm2, %ymm6
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm1
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm0, 96(%rdx)
+; AVX2-FCP-NEXT:    vmovaps %ymm0, 64(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rsi)
+; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovaps %ymm3, 64(%rdx)
+; AVX2-FCP-NEXT:    vmovaps %ymm11, (%rdx)
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rdx)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 64(%rcx)
-; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rcx)
-; AVX2-FCP-NEXT:    vmovaps %ymm3, 96(%rcx)
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%rcx)
-; AVX2-FCP-NEXT:    addq $104, %rsp
+; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%rcx)
+; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rcx)
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 96(%rcx)
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rcx)
+; AVX2-FCP-NEXT:    addq $72, %rsp
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
@@ -3014,245 +2916,229 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-LABEL: load_i32_stride3_vf64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX2-NEXT:    vmovaps 736(%rdi), %ymm2
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 704(%rdi), %ymm3
 ; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 512(%rdi), %ymm4
+; AVX2-NEXT:    vmovaps 672(%rdi), %ymm4
 ; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 480(%rdi), %ymm6
-; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 544(%rdi), %ymm5
 ; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 320(%rdi), %ymm7
+; AVX2-NEXT:    vmovaps 512(%rdi), %ymm6
+; AVX2-NEXT:    vmovups %ymm6, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 480(%rdi), %ymm7
 ; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 288(%rdi), %ymm10
+; AVX2-NEXT:    vmovaps 352(%rdi), %ymm8
+; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 320(%rdi), %ymm10
 ; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 352(%rdi), %ymm9
+; AVX2-NEXT:    vmovaps 288(%rdi), %ymm11
+; AVX2-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 160(%rdi), %ymm9
 ; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 128(%rdi), %ymm13
-; AVX2-NEXT:    vmovaps 96(%rdi), %ymm14
-; AVX2-NEXT:    vmovaps 160(%rdi), %ymm1
+; AVX2-NEXT:    vmovaps 128(%rdi), %ymm0
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 96(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,3,6,1,4,7,2,5]
 ; AVX2-NEXT:    vpermps %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7]
-; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm8 = [0,3,6,1,4,7,u,u]
-; AVX2-NEXT:    vpermps %ymm2, %ymm8, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm9, %ymm0, %ymm1
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6],ymm7[7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm8, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm5, %ymm0, %ymm1
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm8, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7]
+; AVX2-NEXT:    vpermps %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 672(%rdi), %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
+; AVX2-NEXT:    vpermps %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
-; AVX2-NEXT:    vpermps %ymm1, %ymm8, %ymm1
-; AVX2-NEXT:    vmovaps 736(%rdi), %ymm2
-; AVX2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX2-NEXT:    vpermps %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps (%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 32(%rdi), %ymm3
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
-; AVX2-NEXT:    vmovaps %ymm3, %ymm5
-; AVX2-NEXT:    vpermps %ymm1, %ymm8, %ymm1
+; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 64(%rdi), %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7]
+; AVX2-NEXT:    vmovaps %ymm2, %ymm8
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm4
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 224(%rdi), %ymm7
-; AVX2-NEXT:    vmovaps 192(%rdi), %ymm15
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7]
-; AVX2-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm1, %ymm8, %ymm1
-; AVX2-NEXT:    vmovaps 256(%rdi), %ymm2
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm6
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 416(%rdi), %ymm1
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 384(%rdi), %ymm3
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7]
-; AVX2-NEXT:    vmovaps %ymm3, %ymm6
-; AVX2-NEXT:    vpermps %ymm1, %ymm8, %ymm1
-; AVX2-NEXT:    vmovaps 448(%rdi), %ymm2
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm12
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7]
+; AVX2-NEXT:    vpermps %ymm4, %ymm0, %ymm4
+; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 256(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 608(%rdi), %ymm9
-; AVX2-NEXT:    vmovaps 576(%rdi), %ymm11
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7]
-; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm1, %ymm8, %ymm1
-; AVX2-NEXT:    vmovaps 640(%rdi), %ymm2
+; AVX2-NEXT:    vmovaps 224(%rdi), %ymm2
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-NEXT:    vmovaps 192(%rdi), %ymm3
+; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
+; AVX2-NEXT:    vpermps %ymm6, %ymm0, %ymm6
+; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 448(%rdi), %ymm13
+; AVX2-NEXT:    vmovaps 416(%rdi), %ymm12
+; AVX2-NEXT:    vmovaps 384(%rdi), %ymm14
+; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7]
+; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
+; AVX2-NEXT:    vpermps %ymm10, %ymm0, %ymm10
+; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 640(%rdi), %ymm6
+; AVX2-NEXT:    vmovaps 608(%rdi), %ymm5
+; AVX2-NEXT:    vmovaps 576(%rdi), %ymm7
+; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7]
+; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm6[2],ymm15[3,4],ymm6[5],ymm15[6,7]
+; AVX2-NEXT:    vpermps %ymm15, %ymm0, %ymm0
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm15 = [1,4,7,2,5,0,3,6]
+; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7]
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,4,7,2,5,u,u,u]
-; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm2
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6]
-; AVX2-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT:    vpermps %ymm10, %ymm0, %ymm14
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT:    vpermps %ymm13, %ymm0, %ymm14
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps %ymm5, %ymm4
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vpermps %ymm5, %ymm0, %ymm14
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT:    vpermps %ymm15, %ymm0, %ymm14
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0],ymm0[1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7]
+; AVX2-NEXT:    vmovaps %ymm13, %ymm14
+; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm13
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7]
 ; AVX2-NEXT:    vmovaps %ymm6, %ymm7
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vpermps %ymm8, %ymm0, %ymm14
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vpermps %ymm9, %ymm0, %ymm0
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm15, %ymm15
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm12 = [2,5,0,3,6,u,u,u]
+; AVX2-NEXT:    vpermps %ymm0, %ymm12, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm9 = ymm11[0,1,0,3,4,5,4,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm12, %ymm0
+; AVX2-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,3,4,5,4,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm14 = [2,5,0,3,6,u,u,u]
-; AVX2-NEXT:    vpermps %ymm1, %ymm14, %ymm1
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm10[0,1,0,3,4,5,4,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX2-NEXT:    vpermps %ymm1, %ymm14, %ymm1
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[0,1,0,3,4,5,4,7]
+; AVX2-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
+; AVX2-NEXT:    vpermps %ymm1, %ymm12, %ymm1
+; AVX2-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1,0,3,4,5,4,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm14, %ymm2
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm13[0,1,0,3,4,5,4,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7]
+; AVX2-NEXT:    vpermps %ymm2, %ymm12, %ymm2
+; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm10[0,1,0,3,4,5,4,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm14, %ymm2
-; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm15[0,1,0,3,4,5,4,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
+; AVX2-NEXT:    vpermps %ymm3, %ymm12, %ymm3
+; AVX2-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = mem[0,1,0,3,4,5,4,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7]
-; AVX2-NEXT:    vpermps %ymm4, %ymm14, %ymm4
-; AVX2-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = mem[0,1,0,3,4,5,4,7]
+; AVX2-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7]
+; AVX2-NEXT:    vpermps %ymm4, %ymm12, %ymm4
+; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[0,1,0,3,4,5,4,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
-; AVX2-NEXT:    vpermps %ymm5, %ymm14, %ymm5
-; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm8[0,1,0,3,4,5,4,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7]
+; AVX2-NEXT:    vpermps %ymm5, %ymm12, %ymm5
+; AVX2-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm6 = mem[0,1,0,3,4,5,4,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
-; AVX2-NEXT:    vpermps %ymm6, %ymm14, %ymm6
-; AVX2-NEXT:    vpermilps $196, (%rsp), %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = mem[0,1,0,3,4,5,4,7]
+; AVX2-NEXT:    vpermps %ymm6, %ymm12, %ymm6
+; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7]
-; AVX2-NEXT:    vpermps %ymm7, %ymm14, %ymm7
-; AVX2-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[0,1,0,3,4,5,4,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm8, 192(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm8, 128(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm8, 64(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm8, (%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm8, 224(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm8, 160(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm8, 96(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm8, 32(%rsi)
-; AVX2-NEXT:    vmovaps %ymm0, 192(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 128(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 64(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, (%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 224(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 160(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 96(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 32(%rdx)
-; AVX2-NEXT:    vmovaps %ymm7, 192(%rcx)
-; AVX2-NEXT:    vmovaps %ymm6, 224(%rcx)
-; AVX2-NEXT:    vmovaps %ymm5, 128(%rcx)
-; AVX2-NEXT:    vmovaps %ymm4, 160(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 192(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 128(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 64(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, (%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 224(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 160(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 96(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 32(%rsi)
+; AVX2-NEXT:    vmovaps %ymm15, 192(%rdx)
+; AVX2-NEXT:    vmovaps %ymm13, 128(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 64(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, (%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 224(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 160(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 96(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 32(%rdx)
+; AVX2-NEXT:    vmovaps %ymm6, 192(%rcx)
+; AVX2-NEXT:    vmovaps %ymm5, 224(%rcx)
+; AVX2-NEXT:    vmovaps %ymm4, 128(%rcx)
+; AVX2-NEXT:    vmovaps %ymm3, 160(%rcx)
 ; AVX2-NEXT:    vmovaps %ymm2, 64(%rcx)
-; AVX2-NEXT:    vmovaps %ymm3, 96(%rcx)
-; AVX2-NEXT:    vmovaps %ymm1, (%rcx)
-; AVX2-NEXT:    vmovaps %ymm10, 32(%rcx)
+; AVX2-NEXT:    vmovaps %ymm1, 96(%rcx)
+; AVX2-NEXT:    vmovaps %ymm0, (%rcx)
+; AVX2-NEXT:    vmovaps %ymm9, 32(%rcx)
 ; AVX2-NEXT:    addq $1032, %rsp # imm = 0x408
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -3260,245 +3146,229 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-LABEL: load_i32_stride3_vf64:
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX2-FP-NEXT:    vmovaps 736(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 704(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 512(%rdi), %ymm4
+; AVX2-FP-NEXT:    vmovaps 672(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 544(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm7
+; AVX2-FP-NEXT:    vmovaps 512(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovups %ymm6, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm10
+; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm10
 ; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm9
+; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm11
+; AVX2-FP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm9
 ; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm13
-; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm14
-; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm0
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,3,6,1,4,7,2,5]
 ; AVX2-FP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7]
-; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm8 = [0,3,6,1,4,7,u,u]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm8, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm9, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6],ymm7[7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm8, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm8, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 672(%rdi), %ymm1
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm8, %ymm1
-; AVX2-FP-NEXT:    vmovaps 736(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm3
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovaps %ymm3, %ymm5
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm8, %ymm1
+; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovaps %ymm2, %ymm8
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm4
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm15
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7]
-; AVX2-FP-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm8, %ymm1
-; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm6
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 416(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 384(%rdi), %ymm3
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7]
-; AVX2-FP-NEXT:    vmovaps %ymm3, %ymm6
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm8, %ymm1
-; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm12
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm4, %ymm0, %ymm4
+; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 608(%rdi), %ymm9
-; AVX2-FP-NEXT:    vmovaps 576(%rdi), %ymm11
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7]
-; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm8, %ymm1
-; AVX2-FP-NEXT:    vmovaps 640(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm6, %ymm0, %ymm6
+; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm13
+; AVX2-FP-NEXT:    vmovaps 416(%rdi), %ymm12
+; AVX2-FP-NEXT:    vmovaps 384(%rdi), %ymm14
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7]
+; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm10, %ymm0, %ymm10
+; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 640(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovaps 608(%rdi), %ymm5
+; AVX2-FP-NEXT:    vmovaps 576(%rdi), %ymm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7]
+; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm6[2],ymm15[3,4],ymm6[5],ymm15[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm15, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7]
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm15 = [1,4,7,2,5,0,3,6]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FP-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FP-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FP-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7]
-; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm1 = [1,4,7,2,5,u,u,u]
-; AVX2-FP-NEXT:    vpermps %ymm0, %ymm1, %ymm2
-; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6]
-; AVX2-FP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermps %ymm10, %ymm0, %ymm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermps %ymm13, %ymm0, %ymm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-FP-NEXT:    vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps %ymm5, %ymm4
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm0, %ymm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermps %ymm15, %ymm0, %ymm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0],ymm0[1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7]
+; AVX2-FP-NEXT:    vmovaps %ymm13, %ymm14
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm13
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7]
 ; AVX2-FP-NEXT:    vmovaps %ymm6, %ymm7
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermps %ymm8, %ymm0, %ymm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermps %ymm9, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm15, %ymm15
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm12 = [2,5,0,3,6,u,u,u]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm12, %ymm0
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm9 = ymm11[0,1,0,3,4,5,4,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm12, %ymm0
+; AVX2-FP-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = mem[0,1,0,3,4,5,4,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm14 = [2,5,0,3,6,u,u,u]
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm14, %ymm1
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm10[0,1,0,3,4,5,4,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm14, %ymm1
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[0,1,0,3,4,5,4,7]
+; AVX2-FP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm1, %ymm12, %ymm1
+; AVX2-FP-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1,0,3,4,5,4,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm14, %ymm2
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm13[0,1,0,3,4,5,4,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm10[0,1,0,3,4,5,4,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm14, %ymm2
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm15[0,1,0,3,4,5,4,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm12, %ymm3
+; AVX2-FP-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = mem[0,1,0,3,4,5,4,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm4, %ymm14, %ymm4
-; AVX2-FP-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = mem[0,1,0,3,4,5,4,7]
+; AVX2-FP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm4, %ymm12, %ymm4
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[0,1,0,3,4,5,4,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm14, %ymm5
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm8[0,1,0,3,4,5,4,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm5, %ymm12, %ymm5
+; AVX2-FP-NEXT:    vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm6 = mem[0,1,0,3,4,5,4,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-FP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm6, %ymm14, %ymm6
-; AVX2-FP-NEXT:    vpermilps $196, (%rsp), %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = mem[0,1,0,3,4,5,4,7]
+; AVX2-FP-NEXT:    vpermps %ymm6, %ymm12, %ymm6
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm7, %ymm14, %ymm7
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm8 = ymm9[0,1,0,3,4,5,4,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm8, 192(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm8, 128(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm8, 64(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm8, (%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm8, 224(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm8, 160(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm8, 96(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm8, 32(%rsi)
-; AVX2-FP-NEXT:    vmovaps %ymm0, 192(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm0, 128(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm0, 64(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm0, (%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm0, 224(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm0, 160(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rdx)
-; AVX2-FP-NEXT:    vmovaps %ymm7, 192(%rcx)
-; AVX2-FP-NEXT:    vmovaps %ymm6, 224(%rcx)
-; AVX2-FP-NEXT:    vmovaps %ymm5, 128(%rcx)
-; AVX2-FP-NEXT:    vmovaps %ymm4, 160(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 192(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 128(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, (%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 224(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 160(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rsi)
+; AVX2-FP-NEXT:    vmovaps %ymm15, 192(%rdx)
+; AVX2-FP-NEXT:    vmovaps %ymm13, 128(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, (%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 224(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 160(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rdx)
+; AVX2-FP-NEXT:    vmovaps %ymm6, 192(%rcx)
+; AVX2-FP-NEXT:    vmovaps %ymm5, 224(%rcx)
+; AVX2-FP-NEXT:    vmovaps %ymm4, 128(%rcx)
+; AVX2-FP-NEXT:    vmovaps %ymm3, 160(%rcx)
 ; AVX2-FP-NEXT:    vmovaps %ymm2, 64(%rcx)
-; AVX2-FP-NEXT:    vmovaps %ymm3, 96(%rcx)
-; AVX2-FP-NEXT:    vmovaps %ymm1, (%rcx)
-; AVX2-FP-NEXT:    vmovaps %ymm10, 32(%rcx)
+; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rcx)
+; AVX2-FP-NEXT:    vmovaps %ymm0, (%rcx)
+; AVX2-FP-NEXT:    vmovaps %ymm9, 32(%rcx)
 ; AVX2-FP-NEXT:    addq $1032, %rsp # imm = 0x408
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
@@ -3506,233 +3376,214 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-LABEL: load_i32_stride3_vf64:
 ; AVX2-FCP:       # %bb.0:
 ; AVX2-FCP-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 704(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovups %ymm5, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm8
 ; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm9
+; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm10
 ; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm14
-; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7]
-; AVX2-FCP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm11
 ; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm6 = [0,3,6,1,4,7,u,u]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm6, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0],ymm1[1],ymm13[2,3],ymm1[4],ymm13[5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovaps %ymm1, %ymm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,3,6,1,4,7,2,5]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm6, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm6, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm6, %ymm1
-; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm6, %ymm1
-; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovaps %ymm2, %ymm5
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7]
-; AVX2-FCP-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm6, %ymm1
 ; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm7
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
+; AVX2-FCP-NEXT:    vmovaps %ymm3, %ymm4
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm6, %ymm1
-; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm0, %ymm12
-; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7]
+; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 608(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5,6],ymm13[7]
-; AVX2-FCP-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm6, %ymm1
-; AVX2-FCP-NEXT:    vmovaps 640(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 640(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vmovaps 608(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7]
+; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm1 = [1,4,7,2,5,u,u,u]
-; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm2
-; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6]
-; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm0, %ymm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps %ymm14, %ymm10
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm0, %ymm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $219, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm0, %ymm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm0, %ymm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm0, %ymm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm13[2],ymm9[3,4],ymm13[5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vmovaps %ymm9, %ymm13
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm15 = [1,4,7,2,5,0,3,6]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps %ymm9, %ymm14
+; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7]
+; AVX2-FCP-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm14 = [0,1,0,3,0,1,4,7]
-; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm14, %ymm0
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm14, %ymm2
-; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm14, %ymm2
-; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm14, %ymm2
-; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FCP-NEXT:    vblendps $73, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm7 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm1, %ymm7
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm14, %ymm6
-; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm8 = mem[0,1],ymm13[2],mem[3,4],ymm13[5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 192(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 128(%rsi)
+; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FCP-NEXT:    vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 64(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 224(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 160(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 96(%rsi)
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 192(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 128(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 64(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 224(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 160(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 96(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 192(%rcx)
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 224(%rcx)
-; AVX2-FCP-NEXT:    vmovaps %ymm0, 128(%rcx)
-; AVX2-FCP-NEXT:    vmovaps %ymm3, 160(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0],ymm0[1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7]
+; AVX2-FCP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7]
+; AVX2-FCP-NEXT:    vblendps $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6],mem[7]
+; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vblendps $146, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5,6],ymm9[7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vblendps $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7]
+; AVX2-FCP-NEXT:    vblendps $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm5 = [2,5,0,3,6,1,4,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm5, %ymm4
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm5, %ymm8
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
+; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm5, %ymm7
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm5, %ymm1
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm5, %ymm6
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm5, %ymm3
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 192(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 128(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 224(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 160(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovaps %ymm11, 192(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 128(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 224(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 160(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovaps %ymm3, 192(%rcx)
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 224(%rcx)
+; AVX2-FCP-NEXT:    vmovaps %ymm1, 128(%rcx)
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 160(%rcx)
 ; AVX2-FCP-NEXT:    vmovaps %ymm2, 64(%rcx)
-; AVX2-FCP-NEXT:    vmovaps %ymm11, 96(%rcx)
-; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rcx)
-; AVX2-FCP-NEXT:    vmovaps %ymm10, 32(%rcx)
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 96(%rcx)
+; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rcx)
+; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rcx)
 ; AVX2-FCP-NEXT:    addq $1032, %rsp # imm = 0x408
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
index f01aa90e3efc..dd94dffa8593 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
@@ -851,14 +851,11 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, 128(%rdi), %ymm6, %ymm6
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6],ymm6[7]
-; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [1,6,3,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm7, %ymm6, %ymm6
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [5,2,7,0,5,2,7,0]
-; AVX2-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX2-NEXT:    vpermd %ymm7, %ymm8, %ymm7
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,6,3,0,5,2,7,0]
+; AVX2-NEXT:    vpermd %ymm6, %ymm7, %ymm6
 ; AVX2-NEXT:    vpbroadcastd 144(%rdi), %ymm7
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0]
@@ -874,10 +871,10 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [1,6,1,6,1,6,1,6]
 ; AVX2-NEXT:    vpermd %ymm0, %ymm8, %ymm8
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[3,0,2,2,7,4,6,6]
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm10 = ymm1[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,1,6,2,7,4,0,0]
+; AVX2-NEXT:    vpermd %ymm9, %ymm10, %ymm9
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm4[0,1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
@@ -911,14 +908,11 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, 128(%rdi), %ymm6, %ymm6
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6],ymm6[7]
-; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [1,6,3,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm7, %ymm6, %ymm6
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [5,2,7,0,5,2,7,0]
-; AVX2-FP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpermd %ymm7, %ymm8, %ymm7
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,6,3,0,5,2,7,0]
+; AVX2-FP-NEXT:    vpermd %ymm6, %ymm7, %ymm6
 ; AVX2-FP-NEXT:    vpbroadcastd 144(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0]
@@ -934,10 +928,10 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [1,6,1,6,1,6,1,6]
 ; AVX2-FP-NEXT:    vpermd %ymm0, %ymm8, %ymm8
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[3,0,2,2,7,4,6,6]
 ; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm10 = ymm1[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7]
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,1,6,2,7,4,0,0]
+; AVX2-FP-NEXT:    vpermd %ymm9, %ymm10, %ymm9
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
 ; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm4[0,1]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
@@ -971,14 +965,11 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, 128(%rdi), %ymm6, %ymm6
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6],ymm6[7]
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [1,6,3,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm6, %ymm6
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [5,2,7,0,5,2,7,0]
-; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm8, %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,6,3,0,5,2,7,0]
+; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm7, %ymm6
 ; AVX2-FCP-NEXT:    vpbroadcastd 144(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0]
@@ -994,10 +985,10 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [1,6,1,6,1,6,1,6]
 ; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm8, %ymm8
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[3,0,2,2,7,4,6,6]
 ; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm10 = ymm1[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,1,6,2,7,4,0,0]
+; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm10, %ymm9
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
 ; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm4[0,1]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
@@ -1672,349 +1663,355 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i32_stride5_vf16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm0
+; AVX2-NEXT:    subq $72, %rsp
+; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm6
 ; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm8
 ; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm9
-; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm3
 ; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm5
-; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [0,5,2,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-NEXT:    vpermd %ymm10, %ymm7, %ymm10
-; AVX2-NEXT:    vinserti128 $1, 288(%rdi), %ymm10, %ymm11
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [0,5,2,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7]
+; AVX2-NEXT:    vpermd %ymm7, %ymm10, %ymm7
+; AVX2-NEXT:    vinserti128 $1, 288(%rdi), %ymm7, %ymm11
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm2[0,1,0,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4],ymm12[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6],ymm11[7]
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[0,1,0,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4],ymm10[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
-; AVX2-NEXT:    vpermd %ymm11, %ymm7, %ymm7
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, 128(%rdi), %ymm7, %ymm7
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4],ymm12[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6],ymm11[7]
 ; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [1,6,3,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm11, %ymm7, %ymm11
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,2,7,0,5,2,7,0]
-; AVX2-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX2-NEXT:    vpermd %ymm12, %ymm13, %ymm12
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 144(%rdi), %ymm12
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm12[7]
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm12, %ymm7, %ymm7
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
-; AVX2-NEXT:    vpermd %ymm12, %ymm13, %ymm12
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 304(%rdi), %ymm12
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7]
-; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
+; AVX2-NEXT:    vpermd %ymm12, %ymm10, %ymm10
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, 128(%rdi), %ymm10, %ymm10
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7]
+; AVX2-NEXT:    vmovdqu %ymm7, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX2-NEXT:    vmovdqa %ymm0, %ymm7
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,6,3,0,5,2,7,0]
+; AVX2-NEXT:    vpermd %ymm11, %ymm12, %ymm11
+; AVX2-NEXT:    vpbroadcastd 304(%rdi), %ymm13
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7]
+; AVX2-NEXT:    vpermd %ymm13, %ymm12, %ymm12
+; AVX2-NEXT:    vpbroadcastd 144(%rdi), %ymm13
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [2,7,4,0]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
-; AVX2-NEXT:    vpermd %ymm13, %ymm7, %ymm13
-; AVX2-NEXT:    vinserti128 $1, 96(%rdi), %ymm0, %ymm14
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3,4,5,6],ymm14[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [0,5,0,5,0,5,0,5]
-; AVX2-NEXT:    vpermd %ymm0, %ymm14, %ymm15
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-NEXT:    vpermd %ymm13, %ymm14, %ymm13
+; AVX2-NEXT:    vinserti128 $1, 96(%rdi), %ymm0, %ymm15
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4,5,6],ymm15[7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
+; AVX2-NEXT:    vmovdqa %ymm1, %ymm12
+; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm15
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7]
-; AVX2-NEXT:    vpermd %ymm15, %ymm7, %ymm7
+; AVX2-NEXT:    vpermd %ymm15, %ymm14, %ymm14
 ; AVX2-NEXT:    vinserti128 $1, 256(%rdi), %ymm0, %ymm15
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5,6],ymm15[7]
+; AVX2-NEXT:    vmovdqa %ymm7, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3,4,5,6],ymm15[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm15
-; AVX2-NEXT:    vpermd %ymm15, %ymm14, %ymm14
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[3,0,2,2,7,4,6,6]
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm10 = ymm4[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6]
-; AVX2-NEXT:    vpermd %ymm0, %ymm10, %ymm11
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[3,0,2,2,7,4,6,6]
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm12 = ymm5[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm15, %ymm10, %ymm10
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
+; AVX2-NEXT:    vpermd %ymm15, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm7 = ymm4[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3],ymm0[4],ymm7[5,6],ymm0[7]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,6,2,7,4,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm7, %ymm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6]
+; AVX2-NEXT:    vpermd %ymm12, %ymm0, %ymm10
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm11 = ymm5[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7]
+; AVX2-NEXT:    vpermd %ymm10, %ymm7, %ymm7
+; AVX2-NEXT:    vpermd %ymm15, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[0,1],ymm9[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7]
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,1,6,0]
 ; AVX2-NEXT:    vpermd %ymm4, %ymm6, %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [2,7,2,7,2,7,2,7]
-; AVX2-NEXT:    vpermd %ymm0, %ymm8, %ymm0
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX2-NEXT:    vpermd %ymm12, %ymm7, %ymm8
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
 ; AVX2-NEXT:    vpermd %ymm3, %ymm6, %ymm3
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm2[0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm15, %ymm8, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm2, 32(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm2, (%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm2, 32(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm2, (%rdx)
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm2[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vpermd %ymm15, %ymm7, %ymm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm3, 32(%rsi)
+; AVX2-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm3, (%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm3, 32(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, (%rdx)
 ; AVX2-NEXT:    vmovdqa %ymm14, 32(%rcx)
-; AVX2-NEXT:    vmovdqa %ymm13, (%rcx)
-; AVX2-NEXT:    vmovdqa %ymm10, 32(%r8)
-; AVX2-NEXT:    vmovdqa %ymm7, (%r8)
-; AVX2-NEXT:    vmovdqa %ymm1, 32(%r9)
-; AVX2-NEXT:    vmovdqa %ymm0, (%r9)
-; AVX2-NEXT:    popq %rax
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm1, (%rcx)
+; AVX2-NEXT:    vmovdqa %ymm0, 32(%r8)
+; AVX2-NEXT:    vmovdqa %ymm13, (%r8)
+; AVX2-NEXT:    vmovdqa %ymm2, 32(%r9)
+; AVX2-NEXT:    vmovdqa %ymm4, (%r9)
+; AVX2-NEXT:    addq $72, %rsp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i32_stride5_vf16:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    pushq %rax
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm0
+; AVX2-FP-NEXT:    subq $72, %rsp
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm6
 ; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm8
 ; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm9
-; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm5
-; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [0,5,2,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm10, %ymm7, %ymm10
-; AVX2-FP-NEXT:    vinserti128 $1, 288(%rdi), %ymm10, %ymm11
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [0,5,2,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm7, %ymm10, %ymm7
+; AVX2-FP-NEXT:    vinserti128 $1, 288(%rdi), %ymm7, %ymm11
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm2[0,1,0,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4],ymm12[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6],ymm11[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[0,1,0,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4],ymm10[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm11, %ymm7, %ymm7
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-FP-NEXT:    vinserti128 $1, 128(%rdi), %ymm7, %ymm7
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [1,6,3,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm11, %ymm7, %ymm11
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,2,7,0,5,2,7,0]
-; AVX2-FP-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpermd %ymm12, %ymm13, %ymm12
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 144(%rdi), %ymm12
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm12[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm12, %ymm7, %ymm7
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm12, %ymm13, %ymm12
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 304(%rdi), %ymm12
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4],ymm12[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6],ymm11[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm12, %ymm10, %ymm10
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, 128(%rdi), %ymm10, %ymm10
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm7, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm0, %ymm7
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,6,3,0,5,2,7,0]
+; AVX2-FP-NEXT:    vpermd %ymm11, %ymm12, %ymm11
+; AVX2-FP-NEXT:    vpbroadcastd 304(%rdi), %ymm13
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7]
+; AVX2-FP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
+; AVX2-FP-NEXT:    vpbroadcastd 144(%rdi), %ymm13
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [2,7,4,0]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm13, %ymm7, %ymm13
-; AVX2-FP-NEXT:    vinserti128 $1, 96(%rdi), %ymm0, %ymm14
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3,4,5,6],ymm14[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [0,5,0,5,0,5,0,5]
-; AVX2-FP-NEXT:    vpermd %ymm0, %ymm14, %ymm15
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm13, %ymm14, %ymm13
+; AVX2-FP-NEXT:    vinserti128 $1, 96(%rdi), %ymm0, %ymm15
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4,5,6],ymm15[7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
+; AVX2-FP-NEXT:    vmovdqa %ymm1, %ymm12
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm15
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm15, %ymm7, %ymm7
+; AVX2-FP-NEXT:    vpermd %ymm15, %ymm14, %ymm14
 ; AVX2-FP-NEXT:    vinserti128 $1, 256(%rdi), %ymm0, %ymm15
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5,6],ymm15[7]
+; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3,4,5,6],ymm15[7]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm15
-; AVX2-FP-NEXT:    vpermd %ymm15, %ymm14, %ymm14
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[3,0,2,2,7,4,6,6]
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm10 = ymm4[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6]
-; AVX2-FP-NEXT:    vpermd %ymm0, %ymm10, %ymm11
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[3,0,2,2,7,4,6,6]
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm12 = ymm5[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm15, %ymm10, %ymm10
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm15, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm7 = ymm4[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3],ymm0[4],ymm7[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,6,2,7,4,0,0]
+; AVX2-FP-NEXT:    vpermd %ymm0, %ymm7, %ymm2
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6]
+; AVX2-FP-NEXT:    vpermd %ymm12, %ymm0, %ymm10
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm5[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7]
+; AVX2-FP-NEXT:    vpermd %ymm10, %ymm7, %ymm7
+; AVX2-FP-NEXT:    vpermd %ymm15, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[0,1],ymm9[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7]
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,1,6,0]
 ; AVX2-FP-NEXT:    vpermd %ymm4, %ymm6, %ymm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [2,7,2,7,2,7,2,7]
-; AVX2-FP-NEXT:    vpermd %ymm0, %ymm8, %ymm0
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX2-FP-NEXT:    vpermd %ymm12, %ymm7, %ymm8
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
 ; AVX2-FP-NEXT:    vpermd %ymm3, %ymm6, %ymm3
-; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm2[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm15, %ymm8, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm2, (%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm2, (%rdx)
+; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm2[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermd %ymm15, %ymm7, %ymm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rsi)
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm3, (%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm1, (%rdx)
 ; AVX2-FP-NEXT:    vmovdqa %ymm14, 32(%rcx)
-; AVX2-FP-NEXT:    vmovdqa %ymm13, (%rcx)
-; AVX2-FP-NEXT:    vmovdqa %ymm10, 32(%r8)
-; AVX2-FP-NEXT:    vmovdqa %ymm7, (%r8)
-; AVX2-FP-NEXT:    vmovdqa %ymm1, 32(%r9)
-; AVX2-FP-NEXT:    vmovdqa %ymm0, (%r9)
-; AVX2-FP-NEXT:    popq %rax
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm1, (%rcx)
+; AVX2-FP-NEXT:    vmovdqa %ymm0, 32(%r8)
+; AVX2-FP-NEXT:    vmovdqa %ymm13, (%r8)
+; AVX2-FP-NEXT:    vmovdqa %ymm2, 32(%r9)
+; AVX2-FP-NEXT:    vmovdqa %ymm4, (%r9)
+; AVX2-FP-NEXT:    addq $72, %rsp
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i32_stride5_vf16:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    pushq %rax
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm0
+; AVX2-FCP-NEXT:    subq $72, %rsp
+; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [0,5,2,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm7, %ymm10
-; AVX2-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm10, %ymm11
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [0,5,2,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm10, %ymm7
+; AVX2-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm7, %ymm11
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm2[0,1,0,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4],ymm12[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6],ymm11[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[0,1,0,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4],ymm10[5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm7, %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-FCP-NEXT:    vinserti128 $1, 128(%rdi), %ymm7, %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [1,6,3,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm7, %ymm11
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,2,7,0,5,2,7,0]
-; AVX2-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm13, %ymm12
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 144(%rdi), %ymm12
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm12[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm7, %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm13, %ymm12
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 304(%rdi), %ymm12
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4],ymm12[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6],ymm11[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm10, %ymm10
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vinserti128 $1, 128(%rdi), %ymm10, %ymm10
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm7, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm0, %ymm7
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,6,3,0,5,2,7,0]
+; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm12, %ymm11
+; AVX2-FCP-NEXT:    vpbroadcastd 304(%rdi), %ymm13
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7]
+; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
+; AVX2-FCP-NEXT:    vpbroadcastd 144(%rdi), %ymm13
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm14 = [2,7,4,0]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm7, %ymm13
-; AVX2-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm0, %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3,4,5,6],ymm14[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [0,5,0,5,0,5,0,5]
-; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm14, %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm14, %ymm13
+; AVX2-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm0, %ymm15
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4,5,6],ymm15[7]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
+; AVX2-FCP-NEXT:    vmovdqa %ymm1, %ymm12
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm15
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm7, %ymm7
+; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm14, %ymm14
 ; AVX2-FCP-NEXT:    vinserti128 $1, 256(%rdi), %ymm0, %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5,6],ymm15[7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm7, %ymm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3,4,5,6],ymm15[7]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm14, %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[3,0,2,2,7,4,6,6]
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm10 = ymm4[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6]
-; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm10, %ymm11
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[3,0,2,2,7,4,6,6]
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm12 = ymm5[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm10, %ymm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm7 = ymm4[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3],ymm0[4],ymm7[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,6,2,7,4,0,0]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm7, %ymm2
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6]
+; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm0, %ymm10
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm5[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7]
+; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm7, %ymm7
+; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[0,1],ymm9[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7]
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm6 = [4,1,6,0]
 ; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm6, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [2,7,2,7,2,7,2,7]
-; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm8, %ymm0
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm7, %ymm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
 ; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm6, %ymm3
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm2[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm8, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rdx)
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm2[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm7, %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm1, (%rdx)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm14, 32(%rcx)
-; AVX2-FCP-NEXT:    vmovdqa %ymm13, (%rcx)
-; AVX2-FCP-NEXT:    vmovdqa %ymm10, 32(%r8)
-; AVX2-FCP-NEXT:    vmovdqa %ymm7, (%r8)
-; AVX2-FCP-NEXT:    vmovdqa %ymm1, 32(%r9)
-; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%r9)
-; AVX2-FCP-NEXT:    popq %rax
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm1, (%rcx)
+; AVX2-FCP-NEXT:    vmovdqa %ymm0, 32(%r8)
+; AVX2-FCP-NEXT:    vmovdqa %ymm13, (%r8)
+; AVX2-FCP-NEXT:    vmovdqa %ymm2, 32(%r9)
+; AVX2-FCP-NEXT:    vmovdqa %ymm4, (%r9)
+; AVX2-FCP-NEXT:    addq $72, %rsp
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
@@ -3354,33 +3351,31 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i32_stride5_vf32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $1000, %rsp # imm = 0x3E8
-; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm14
+; AVX2-NEXT:    subq $968, %rsp # imm = 0x3C8
+; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm4
 ; AVX2-NEXT:    vmovdqa 416(%rdi), %ymm5
-; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm6
-; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm7
-; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm9
 ; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm15
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm11
+; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm14
 ; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm11
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vinserti128 $1, 288(%rdi), %ymm1, %ymm2
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm11[0,1,0,3]
-; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3]
+; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7]
-; AVX2-NEXT:    vmovdqa %ymm13, %ymm12
+; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3388,105 +3383,99 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3]
-; AVX2-NEXT:    vmovdqa %ymm9, %ymm13
 ; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovdqa %ymm8, %ymm10
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, 608(%rdi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
+; AVX2-NEXT:    vmovdqa %ymm7, %ymm10
+; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, 448(%rdi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm7
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7]
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
+; AVX2-NEXT:    vmovdqa %ymm3, %ymm8
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm7
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm6
-; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm9
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm9[0,1,0,3]
-; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6,7]
+; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm12
+; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [1,6,3,0]
-; AVX2-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [5,2,7,0,5,2,7,0]
-; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX2-NEXT:    vpermd %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,6,3,0,5,2,7,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpbroadcastd 304(%rdi), %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm5[2,3],ymm15[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7]
-; AVX2-NEXT:    vmovdqa %ymm10, %ymm11
-; AVX2-NEXT:    vpermd %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu (%rsp), %ymm14 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm9[2,3],ymm15[4,5],ymm9[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpbroadcastd 624(%rdi), %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm6[2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpbroadcastd 464(%rdi), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7]
-; AVX2-NEXT:    vpermd %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 144(%rdi), %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = ymm14[0,1],mem[2,3],ymm14[4,5],mem[6,7]
-; AVX2-NEXT:    vpermd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 464(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vmovdqa %ymm12, %ymm9
+; AVX2-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpbroadcastd 144(%rdi), %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [2,7,4,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7]
-; AVX2-NEXT:    vmovdqa %ymm8, %ymm14
-; AVX2-NEXT:    vpermd %ymm1, %ymm9, %ymm1
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [2,7,4,0]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7]
+; AVX2-NEXT:    vmovdqa %ymm3, %ymm12
+; AVX2-NEXT:    vmovdqa %ymm11, %ymm13
+; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX2-NEXT:    vinserti128 $1, 256(%rdi), %ymm0, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-NEXT:    vmovdqa %ymm12, %ymm8
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm3
+; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
-; AVX2-NEXT:    vpermd %ymm3, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa %ymm3, %ymm12
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-NEXT:    vmovdqa %ymm5, %ymm13
-; AVX2-NEXT:    vpermd %ymm1, %ymm9, %ymm1
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm14[4,5],ymm4[6,7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX2-NEXT:    vinserti128 $1, 576(%rdi), %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa %ymm11, %ymm4
-; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vmovdqa %ymm15, %ymm14
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 608(%rdi), %ymm2
@@ -3494,112 +3483,110 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7]
-; AVX2-NEXT:    vpermd %ymm1, %ymm9, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX2-NEXT:    vinserti128 $1, 96(%rdi), %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa %ymm6, %ymm7
-; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX2-NEXT:    vpermd %ymm6, %ymm0, %ymm10
-; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm7
+; AVX2-NEXT:    vpermd %ymm7, %ymm0, %ymm10
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7]
-; AVX2-NEXT:    vpermd %ymm1, %ymm9, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm5, %ymm8
 ; AVX2-NEXT:    vinserti128 $1, 416(%rdi), %ymm0, %ymm5
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm5[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa 448(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 448(%rdi), %ymm8
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermd %ymm8, %ymm0, %ymm0
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm5 = ymm11[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [1,6,1,6,1,6,1,6]
-; AVX2-NEXT:    vpermd %ymm12, %ymm5, %ymm10
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
+; AVX2-NEXT:    vmovdqa %ymm12, %ymm6
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm5 = ymm12[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,1,6,2,7,4,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm10 = ymm3[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vpermd %ymm3, %ymm5, %ymm10
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-NEXT:    vpermd %ymm12, %ymm10, %ymm15
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm10 = ymm14[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm6, %ymm5, %ymm10
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7]
+; AVX2-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm15 = ymm4[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm0 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm15 = mem[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpermd %ymm7, %ymm10, %ymm15
+; AVX2-NEXT:    vmovdqa %ymm7, %ymm14
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm10 = ymm2[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vpermd %ymm9, %ymm5, %ymm5
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm15 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-NEXT:    vpermd %ymm15, %ymm10, %ymm5
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd $207, (%rsp), %ymm11, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm11[4,5],mem[6,7]
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm8[0,1],ymm1[0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7]
+; AVX2-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm6[4,5],mem[6,7]
+; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[0,1],ymm11[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm11[5],ymm5[6,7]
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [4,1,6,0]
 ; AVX2-NEXT:    vpermd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7]
-; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    vpermd %ymm12, %ymm5, %ymm4
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,1,2,3],ymm1[4,5],mem[6,7]
-; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm11 = mem[0,1],ymm12[0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
+; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7]
+; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm11 = mem[0,1],ymm13[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
 ; AVX2-NEXT:    vpermd %ymm4, %ymm10, %ymm4
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,1],ymm13[0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm9[0,1],ymm1[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm6 = mem[0,1,2,3],ymm1[4,5],mem[6,7]
 ; AVX2-NEXT:    vpermd %ymm6, %ymm10, %ymm6
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    vpermd %ymm14, %ymm5, %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = mem[0,1,2,3],ymm1[4,5],mem[6,7]
 ; AVX2-NEXT:    vpermd %ymm4, %ymm10, %ymm4
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm6 = mem[0,1],ymm7[0,1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm9, %ymm5, %ymm1
+; AVX2-NEXT:    vpermd %ymm15, %ymm5, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm4, 64(%rsi)
@@ -3637,39 +3624,37 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa %ymm2, (%r9)
 ; AVX2-NEXT:    vmovdqa %ymm3, 96(%r9)
 ; AVX2-NEXT:    vmovdqa %ymm0, 32(%r9)
-; AVX2-NEXT:    addq $1000, %rsp # imm = 0x3E8
+; AVX2-NEXT:    addq $968, %rsp # imm = 0x3C8
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i32_stride5_vf32:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $1000, %rsp # imm = 0x3E8
-; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm14
+; AVX2-FP-NEXT:    subq $968, %rsp # imm = 0x3C8
+; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovdqa 416(%rdi), %ymm5
-; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm9
 ; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm15
 ; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm11
+; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm14
 ; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm11
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vinserti128 $1, 288(%rdi), %ymm1, %ymm2
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm11[0,1,0,3]
-; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3]
+; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm13, %ymm12
+; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3677,105 +3662,99 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3]
-; AVX2-FP-NEXT:    vmovdqa %ymm9, %ymm13
 ; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm8, %ymm10
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, 608(%rdi), %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm10
+; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, 448(%rdi), %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm8
+; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm7
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm9
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm9[0,1,0,3]
-; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm12
+; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [1,6,3,0]
-; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
-; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [5,2,7,0,5,2,7,0]
-; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,6,3,0,5,2,7,0]
+; AVX2-FP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FP-NEXT:    vpbroadcastd 304(%rdi), %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm5[2,3],ymm15[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm10, %ymm11
-; AVX2-FP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm14 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm9[2,3],ymm15[4,5],ymm9[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX2-FP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FP-NEXT:    vpbroadcastd 624(%rdi), %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm6[2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX2-FP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FP-NEXT:    vpbroadcastd 464(%rdi), %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 144(%rdi), %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = ymm14[0,1],mem[2,3],ymm14[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 464(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm12, %ymm9
+; AVX2-FP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX2-FP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FP-NEXT:    vpbroadcastd 144(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [2,7,4,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm8, %ymm14
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm9, %ymm1
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [2,7,4,0]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm12
+; AVX2-FP-NEXT:    vmovdqa %ymm11, %ymm13
+; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX2-FP-NEXT:    vinserti128 $1, 256(%rdi), %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vmovdqa %ymm12, %ymm8
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm12
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm5, %ymm13
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm9, %ymm1
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm14[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX2-FP-NEXT:    vinserti128 $1, 576(%rdi), %ymm0, %ymm2
-; AVX2-FP-NEXT:    vmovdqa %ymm11, %ymm4
-; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqa %ymm15, %ymm14
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %ymm2
@@ -3783,112 +3762,110 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm9, %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX2-FP-NEXT:    vinserti128 $1, 96(%rdi), %ymm0, %ymm2
-; AVX2-FP-NEXT:    vmovdqa %ymm6, %ymm7
-; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX2-FP-NEXT:    vpermd %ymm6, %ymm0, %ymm10
-; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm7
+; AVX2-FP-NEXT:    vpermd %ymm7, %ymm0, %ymm10
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm9, %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm5, %ymm8
 ; AVX2-FP-NEXT:    vinserti128 $1, 416(%rdi), %ymm0, %ymm5
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm5[7]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermd %ymm8, %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm11[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [1,6,1,6,1,6,1,6]
-; AVX2-FP-NEXT:    vpermd %ymm12, %ymm5, %ymm10
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm12, %ymm6
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm12[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,1,6,2,7,4,0,0]
+; AVX2-FP-NEXT:    vpermd %ymm0, %ymm5, %ymm0
+; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm10 = ymm3[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm5, %ymm10
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm12, %ymm10, %ymm15
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm10 = ymm14[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm6, %ymm5, %ymm10
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7]
+; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm4[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vpermd %ymm0, %ymm5, %ymm0
+; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm0 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm15 = mem[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vpermd %ymm0, %ymm5, %ymm0
+; AVX2-FP-NEXT:    vpermd %ymm7, %ymm10, %ymm15
+; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm14
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm10 = ymm2[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermd %ymm9, %ymm5, %ymm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vpermd %ymm0, %ymm5, %ymm0
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FP-NEXT:    vpermd %ymm15, %ymm10, %ymm5
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd $207, (%rsp), %ymm11, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm11[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm8[0,1],ymm1[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm6[4,5],mem[6,7]
+; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = mem[0,1],ymm11[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm11[5],ymm5[6,7]
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [4,1,6,0]
 ; AVX2-FP-NEXT:    vpermd %ymm0, %ymm10, %ymm0
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7]
-; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpermd %ymm12, %ymm5, %ymm4
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = mem[0,1,2,3],ymm1[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm11 = mem[0,1],ymm12[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
+; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7]
+; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm11 = mem[0,1],ymm13[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
 ; AVX2-FP-NEXT:    vpermd %ymm4, %ymm10, %ymm4
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm5, %ymm3
+; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = mem[0,1],ymm13[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm9[0,1],ymm1[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm6 = mem[0,1,2,3],ymm1[4,5],mem[6,7]
 ; AVX2-FP-NEXT:    vpermd %ymm6, %ymm10, %ymm6
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpermd %ymm14, %ymm5, %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = mem[0,1,2,3],ymm1[4,5],mem[6,7]
 ; AVX2-FP-NEXT:    vpermd %ymm4, %ymm10, %ymm4
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm6 = mem[0,1],ymm7[0,1]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm9, %ymm5, %ymm1
+; AVX2-FP-NEXT:    vpermd %ymm15, %ymm5, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm4, 64(%rsi)
@@ -3926,39 +3903,37 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa %ymm2, (%r9)
 ; AVX2-FP-NEXT:    vmovdqa %ymm3, 96(%r9)
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, 32(%r9)
-; AVX2-FP-NEXT:    addq $1000, %rsp # imm = 0x3E8
+; AVX2-FP-NEXT:    addq $968, %rsp # imm = 0x3C8
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i32_stride5_vf32:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    subq $1000, %rsp # imm = 0x3E8
-; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm14
+; AVX2-FCP-NEXT:    subq $968, %rsp # imm = 0x3C8
+; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 416(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm9
 ; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm15
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm14
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm11
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm1, %ymm2
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm11[0,1,0,3]
-; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3]
+; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm13, %ymm12
+; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3966,105 +3941,99 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3]
-; AVX2-FCP-NEXT:    vmovdqa %ymm9, %ymm13
 ; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm10
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, 608(%rdi), %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm7, %ymm10
+; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, 448(%rdi), %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm7
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm8
+; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm7
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm9[0,1,0,3]
-; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm12
+; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [1,6,3,0]
-; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [5,2,7,0,5,2,7,0]
-; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,6,3,0,5,2,7,0]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FCP-NEXT:    vpbroadcastd 304(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm5[2,3],ymm15[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm10, %ymm11
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm14 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm9[2,3],ymm15[4,5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FCP-NEXT:    vpbroadcastd 624(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm6[2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FCP-NEXT:    vpbroadcastd 464(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 144(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = ymm14[0,1],mem[2,3],ymm14[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 464(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm12, %ymm9
+; AVX2-FCP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FCP-NEXT:    vpbroadcastd 144(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [2,7,4,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm14
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm9, %ymm1
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [2,7,4,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm12
+; AVX2-FCP-NEXT:    vmovdqa %ymm11, %ymm13
+; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX2-FCP-NEXT:    vinserti128 $1, 256(%rdi), %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm12, %ymm8
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm12
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm13
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm9, %ymm1
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm14[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX2-FCP-NEXT:    vinserti128 $1, 576(%rdi), %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa %ymm11, %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm15, %ymm14
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %ymm2
@@ -4072,112 +4041,110 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm9, %ymm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX2-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm7
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm0, %ymm10
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm0, %ymm10
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm9, %ymm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm5, %ymm8
 ; AVX2-FCP-NEXT:    vinserti128 $1, 416(%rdi), %ymm0, %ymm5
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm5[7]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm11[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [1,6,1,6,1,6,1,6]
-; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm5, %ymm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm12, %ymm6
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm5 = ymm12[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,1,6,2,7,4,0,0]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm5, %ymm0
+; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm10 = ymm3[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm5, %ymm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm10, %ymm15
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm10 = ymm14[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm5, %ymm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7]
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm4[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm5, %ymm0
+; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm0 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm15 = mem[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm5, %ymm0
+; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm10, %ymm15
+; AVX2-FCP-NEXT:    vmovdqa %ymm7, %ymm14
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm10 = ymm2[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm5, %ymm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm5, %ymm0
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm10, %ymm5
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd $207, (%rsp), %ymm11, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3],ymm11[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm8[0,1],ymm1[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3],ymm6[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = mem[0,1],ymm11[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm11[5],ymm5[6,7]
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm10 = [4,1,6,0]
 ; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm10, %ymm0
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7]
-; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm5, %ymm4
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,2,3],ymm1[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = mem[0,1],ymm12[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = mem[0,1],ymm13[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
 ; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm10, %ymm4
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm5, %ymm3
+; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0,1],ymm13[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm9[0,1],ymm1[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,2,3],ymm1[4,5],mem[6,7]
 ; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm10, %ymm6
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm5, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,2,3],ymm1[4,5],mem[6,7]
 ; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm10, %ymm4
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm6 = mem[0,1],ymm7[0,1]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm5, %ymm1
+; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm5, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm4, 64(%rsi)
@@ -4215,7 +4182,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%r9)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm3, 96(%r9)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm0, 32(%r9)
-; AVX2-FCP-NEXT:    addq $1000, %rsp # imm = 0x3E8
+; AVX2-FCP-NEXT:    addq $968, %rsp # imm = 0x3C8
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
@@ -6777,7 +6744,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 896(%rdi), %ymm5
 ; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 832(%rdi), %ymm6
+; AVX2-NEXT:    vmovdqa 832(%rdi), %ymm15
 ; AVX2-NEXT:    vmovdqa 800(%rdi), %ymm7
 ; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm8
 ; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm9
@@ -6787,7 +6754,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm11
 ; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm12
-; AVX2-NEXT:    vmovdqu %ymm12, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm13
 ; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm2
@@ -6807,16 +6774,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovdqa %ymm8, %ymm13
-; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa %ymm8, %ymm9
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, 608(%rdi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
-; AVX2-NEXT:    vmovdqa %ymm7, %ymm12
-; AVX2-NEXT:    vmovdqa %ymm6, %ymm14
-; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm15[2,3],ymm7[4,5],ymm15[6,7]
+; AVX2-NEXT:    vmovdqa %ymm7, %ymm10
+; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7]
@@ -6824,11 +6789,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vinserti128 $1, 928(%rdi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 1152(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqa 1152(%rdi), %ymm14
+; AVX2-NEXT:    vmovdqa 1120(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 1120(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7]
+; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vmovdqa 1184(%rdi), %ymm3
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -6840,11 +6805,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vinserti128 $1, 1248(%rdi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm3
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -6872,137 +6837,127 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vinserti128 $1, 768(%rdi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 992(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 960(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqa 992(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-NEXT:    vmovdqa 960(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vmovdqa 1024(%rdi), %ymm3
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 1024(%rdi), %ymm13
 ; AVX2-NEXT:    vmovdqa 1056(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4],ymm2[5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, 1088(%rdi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm7
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm10
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7]
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm15
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm15[0,1,0,3]
-; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7]
+; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm11
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm11[0,1,0,3]
+; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [1,6,3,0]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqu (%rsp), %ymm11 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7]
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,2,7,0,5,2,7,0]
-; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX2-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 304(%rdi), %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7]
-; AVX2-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 624(%rdi), %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7]
-; AVX2-NEXT:    vmovdqa %ymm12, %ymm14
-; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7]
-; AVX2-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 944(%rdi), %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7]
-; AVX2-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 1264(%rdi), %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm15[2,3],ymm4[4,5],ymm15[6,7]
-; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 144(%rdi), %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,6,3,0,5,2,7,0]
+; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpbroadcastd 304(%rdi), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm7[0,1],mem[2,3],ymm7[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpbroadcastd 624(%rdi), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
-; AVX2-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 464(%rdi), %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpbroadcastd 944(%rdi), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1],ymm10[2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7]
-; AVX2-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 784(%rdi), %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpbroadcastd 1264(%rdi), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
-; AVX2-NEXT:    vpermd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 1104(%rdi), %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpbroadcastd 464(%rdi), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpbroadcastd 784(%rdi), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpbroadcastd 1104(%rdi), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm11[2,3],ymm4[4,5],ymm11[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd 144(%rdi), %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,7,4,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7]
-; AVX2-NEXT:    vpermd %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm13 = [2,7,4,0]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm5[4,5],ymm12[6,7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm13, %ymm0
 ; AVX2-NEXT:    vinserti128 $1, 256(%rdi), %ymm0, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm2
@@ -7011,11 +6966,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7]
-; AVX2-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-NEXT:    vinserti128 $1, 576(%rdi), %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 608(%rdi), %ymm2
@@ -7023,11 +6977,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm11[4,5],ymm14[6,7]
-; AVX2-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-NEXT:    vinserti128 $1, 896(%rdi), %ymm0, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 928(%rdi), %ymm2
@@ -7035,312 +6990,305 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7]
-; AVX2-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-NEXT:    vinserti128 $1, 1216(%rdi), %ymm0, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-NEXT:    vmovdqa %ymm13, %ymm9
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa 1248(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa 1248(%rdi), %ymm9
+; AVX2-NEXT:    vpermd %ymm9, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7]
-; AVX2-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-NEXT:    vinserti128 $1, 96(%rdi), %ymm0, %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7]
-; AVX2-NEXT:    vpermd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = mem[0,1,2,3],ymm14[4,5],mem[6,7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-NEXT:    vinserti128 $1, 416(%rdi), %ymm0, %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-NEXT:    vpblendd $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = mem[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa 448(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm15
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,1,2,3],ymm10[4,5],mem[6,7]
-; AVX2-NEXT:    vmovdqa %ymm10, %ymm14
-; AVX2-NEXT:    vpermd %ymm4, %ymm3, %ymm4
-; AVX2-NEXT:    vinserti128 $1, 736(%rdi), %ymm0, %ymm15
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5,6],ymm15[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm15 = ymm4[0,1,2],ymm15[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa 768(%rdi), %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 448(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm13
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm13, %ymm1
+; AVX2-NEXT:    vinserti128 $1, 736(%rdi), %ymm0, %ymm5
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 768(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm15
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu (%rsp), %ymm7 # 32-byte Reload
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
-; AVX2-NEXT:    vpermd %ymm13, %ymm3, %ymm3
-; AVX2-NEXT:    vinserti128 $1, 1056(%rdi), %ymm0, %ymm13
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[2,3,0,1,6,7,4,5]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm3[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm13, %ymm1
+; AVX2-NEXT:    vinserti128 $1, 1056(%rdi), %ymm0, %ymm3
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 1088(%rdi), %ymm3
 ; AVX2-NEXT:    vpermd %ymm3, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa %ymm3, %ymm4
+; AVX2-NEXT:    vmovdqa %ymm3, %ymm14
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $207, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm13 = mem[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = mem[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,6,2,7,4,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm15
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6]
-; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm15 = mem[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm13 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm15 = mem[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm15 = mem[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-NEXT:    vpermd %ymm13, %ymm1, %ymm13
 ; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm13 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-NEXT:    vpermd %ymm13, %ymm1, %ymm13
 ; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm13 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm15 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm4, %ymm0, %ymm15
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm15 = ymm6[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-NEXT:    vpermd %ymm13, %ymm1, %ymm13
+; AVX2-NEXT:    vpermd %ymm9, %ymm0, %ymm15
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm13 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm15 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-NEXT:    vpermd %ymm13, %ymm1, %ymm13
+; AVX2-NEXT:    vpermd %ymm14, %ymm0, %ymm15
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm15 = ymm10[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm10[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-NEXT:    vpermd %ymm13, %ymm1, %ymm13
 ; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm14[4,5],ymm5[6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-NEXT:    vpermd %ymm13, %ymm1, %ymm13
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm4[4,5],ymm10[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
+; AVX2-NEXT:    vpermd %ymm10, %ymm0, %ymm15
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vpermd %ymm3, %ymm0, %ymm15
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm15 = ymm5[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT:    vpermd %ymm11, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm7[0,1],ymm6[0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm15 = ymm6[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-NEXT:    vpermd %ymm13, %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-NEXT:    vpermd %ymm15, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm8[0,1],ymm7[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7]
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [4,1,6,0]
-; AVX2-NEXT:    vpermd %ymm5, %ymm8, %ymm5
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vpermd %ymm1, %ymm8, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
-; AVX2-NEXT:    vpermd %ymm11, %ymm7, %ymm1
+; AVX2-NEXT:    vpermd %ymm15, %ymm7, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT:    vperm2i128 $2, (%rsp), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = mem[0,1],ymm11[0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm11[5],ymm5[6,7]
+; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1],ymm3[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-NEXT:    vpermd %ymm1, %ymm8, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7]
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm10[0,1],ymm4[0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm9[4,5],ymm4[6,7]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm5[0,1],ymm14[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5],ymm2[6,7]
 ; AVX2-NEXT:    vpermd %ymm1, %ymm8, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm3, %ymm7, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vpermd %ymm10, %ymm7, %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5],mem[6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = mem[0,1],ymm9[0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm9[5],ymm5[6,7]
+; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = mem[0,1],ymm9[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm9[5],ymm4[6,7]
 ; AVX2-NEXT:    vpermd %ymm2, %ymm8, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm12[4,5],ymm14[6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = mem[0,1],ymm10[0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7]
+; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = mem[0,1],ymm12[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7]
 ; AVX2-NEXT:    vpermd %ymm2, %ymm8, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
 ; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7]
+; AVX2-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = mem[0,1],ymm10[0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
+; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[0,1],ymm10[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5],ymm5[6,7]
 ; AVX2-NEXT:    vpermd %ymm4, %ymm8, %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm10 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm10 = mem[0,1],ymm12[0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
-; AVX2-NEXT:    vpermd %ymm9, %ymm8, %ymm9
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT:    vmovdqu (%rsp), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm10 = mem[0,1],ymm11[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
+; AVX2-NEXT:    vpermd %ymm5, %ymm8, %ymm5
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3,4,5,6,7]
 ; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7]
-; AVX2-NEXT:    vpermd %ymm9, %ymm8, %ymm8
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
+; AVX2-NEXT:    vpermd %ymm5, %ymm8, %ymm5
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = mem[0,1],ymm10[0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm8 = mem[0,1],ymm10[0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7]
 ; AVX2-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 192(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 128(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm8, 64(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, (%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm8, 224(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm8, 160(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm8, 96(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm8, 32(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 192(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 128(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 64(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, (%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 224(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 160(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 96(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 32(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 192(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 128(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 64(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, (%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 224(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 160(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 96(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 32(%rcx)
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 192(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 128(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 64(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, (%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 224(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 160(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 96(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 32(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 192(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 128(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 64(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, (%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 224(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 160(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 96(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 32(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 192(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 128(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 64(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, (%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 224(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 160(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 96(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 32(%rcx)
 ; AVX2-NEXT:    vmovdqa %ymm13, (%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 64(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 128(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 192(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 224(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 160(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 96(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 32(%r8)
-; AVX2-NEXT:    vmovdqa %ymm7, 224(%r9)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 64(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 128(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 192(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 224(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 160(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 96(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm7, 32(%r8)
+; AVX2-NEXT:    vmovdqa %ymm5, 224(%r9)
 ; AVX2-NEXT:    vmovdqa %ymm3, 192(%r9)
 ; AVX2-NEXT:    vmovdqa %ymm4, 160(%r9)
 ; AVX2-NEXT:    vmovdqa %ymm2, 128(%r9)
-; AVX2-NEXT:    vmovdqa %ymm5, 96(%r9)
+; AVX2-NEXT:    vmovdqa %ymm9, 96(%r9)
 ; AVX2-NEXT:    vmovdqa %ymm1, 64(%r9)
-; AVX2-NEXT:    vmovdqa %ymm11, 32(%r9)
+; AVX2-NEXT:    vmovdqa %ymm6, 32(%r9)
 ; AVX2-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX2-NEXT:    addq $2152, %rsp # imm = 0x868
 ; AVX2-NEXT:    vzeroupper
@@ -7353,7 +7301,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 896(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 832(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovdqa 832(%rdi), %ymm15
 ; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm8
 ; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm9
@@ -7363,7 +7311,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm11
 ; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm12
-; AVX2-FP-NEXT:    vmovdqu %ymm12, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm13
 ; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm2
@@ -7383,16 +7331,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm8, %ymm13
-; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa %ymm8, %ymm9
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, 608(%rdi), %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm12
-; AVX2-FP-NEXT:    vmovdqa %ymm6, %ymm14
-; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm15[2,3],ymm7[4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm10
+; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7]
@@ -7400,11 +7346,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vinserti128 $1, 928(%rdi), %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 1152(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqa 1152(%rdi), %ymm14
+; AVX2-FP-NEXT:    vmovdqa 1120(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 1120(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vmovdqa 1184(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -7416,11 +7362,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vinserti128 $1, 1248(%rdi), %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -7448,137 +7394,127 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vinserti128 $1, 768(%rdi), %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 992(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 960(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqa 992(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovdqa 960(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vmovdqa 1024(%rdi), %ymm3
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 1024(%rdi), %ymm13
 ; AVX2-FP-NEXT:    vmovdqa 1056(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm10
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm15
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm15[0,1,0,3]
-; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm11
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm11[0,1,0,3]
+; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [1,6,3,0]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,2,7,0,5,2,7,0]
-; AVX2-FP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 304(%rdi), %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 624(%rdi), %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm12, %ymm14
-; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 944(%rdi), %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 1264(%rdi), %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm15[2,3],ymm4[4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 144(%rdi), %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,6,3,0,5,2,7,0]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpbroadcastd 304(%rdi), %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm7[0,1],mem[2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpbroadcastd 624(%rdi), %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 464(%rdi), %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpbroadcastd 944(%rdi), %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm10[2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 784(%rdi), %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpbroadcastd 1264(%rdi), %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 1104(%rdi), %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpbroadcastd 464(%rdi), %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpbroadcastd 784(%rdi), %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vpbroadcastd 1104(%rdi), %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm11[2,3],ymm4[4,5],ymm11[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpbroadcastd 144(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,7,4,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm0, %ymm3, %ymm0
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm13 = [2,7,4,0]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm5[4,5],ymm12[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm0, %ymm13, %ymm0
 ; AVX2-FP-NEXT:    vinserti128 $1, 256(%rdi), %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm2
@@ -7587,11 +7523,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-FP-NEXT:    vinserti128 $1, 576(%rdi), %ymm0, %ymm2
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %ymm2
@@ -7599,11 +7534,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm11[4,5],ymm14[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-FP-NEXT:    vinserti128 $1, 896(%rdi), %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 928(%rdi), %ymm2
@@ -7611,312 +7547,305 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-FP-NEXT:    vinserti128 $1, 1216(%rdi), %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vmovdqa %ymm13, %ymm9
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa 1248(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vmovdqa 1248(%rdi), %ymm9
+; AVX2-FP-NEXT:    vpermd %ymm9, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX2-FP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-FP-NEXT:    vinserti128 $1, 96(%rdi), %ymm0, %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm4
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
+; AVX2-FP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = mem[0,1,2,3],ymm14[4,5],mem[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-FP-NEXT:    vinserti128 $1, 416(%rdi), %ymm0, %ymm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-FP-NEXT:    vpblendd $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = mem[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm15
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = mem[0,1,2,3],ymm10[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm10, %ymm14
-; AVX2-FP-NEXT:    vpermd %ymm4, %ymm3, %ymm4
-; AVX2-FP-NEXT:    vinserti128 $1, 736(%rdi), %ymm0, %ymm15
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5,6],ymm15[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm4[0,1,2],ymm15[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa 768(%rdi), %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm13
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
+; AVX2-FP-NEXT:    vinserti128 $1, 736(%rdi), %ymm0, %ymm5
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa 768(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm15
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm7 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm13, %ymm3, %ymm3
-; AVX2-FP-NEXT:    vinserti128 $1, 1056(%rdi), %ymm0, %ymm13
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[2,3,0,1,6,7,4,5]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm3[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
+; AVX2-FP-NEXT:    vinserti128 $1, 1056(%rdi), %ymm0, %ymm3
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 1088(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm4
+; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm14
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $207, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm13 = mem[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = mem[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,6,2,7,4,0,0]
+; AVX2-FP-NEXT:    vpermd %ymm0, %ymm1, %ymm15
 ; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6]
-; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm15 = mem[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm13 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-FP-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm15 = mem[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm15 = mem[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-FP-NEXT:    vpermd %ymm13, %ymm1, %ymm13
 ; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm13 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-FP-NEXT:    vpermd %ymm13, %ymm1, %ymm13
 ; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm13 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm4, %ymm0, %ymm15
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm6[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-FP-NEXT:    vpermd %ymm13, %ymm1, %ymm13
+; AVX2-FP-NEXT:    vpermd %ymm9, %ymm0, %ymm15
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm13 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-FP-NEXT:    vpermd %ymm13, %ymm1, %ymm13
+; AVX2-FP-NEXT:    vpermd %ymm14, %ymm0, %ymm15
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm10[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm10[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-FP-NEXT:    vpermd %ymm13, %ymm1, %ymm13
 ; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm14[4,5],ymm5[6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-FP-NEXT:    vpermd %ymm13, %ymm1, %ymm13
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm4[4,5],ymm10[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
+; AVX2-FP-NEXT:    vpermd %ymm10, %ymm0, %ymm15
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm0, %ymm15
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm5[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermd %ymm11, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm7[0,1],ymm6[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm6[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-FP-NEXT:    vpermd %ymm13, %ymm1, %ymm1
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FP-NEXT:    vpermd %ymm15, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm8[0,1],ymm7[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [4,1,6,0]
-; AVX2-FP-NEXT:    vpermd %ymm5, %ymm8, %ymm5
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm8, %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
-; AVX2-FP-NEXT:    vpermd %ymm11, %ymm7, %ymm1
+; AVX2-FP-NEXT:    vpermd %ymm15, %ymm7, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT:    vperm2i128 $2, (%rsp), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = mem[0,1],ymm11[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm11[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm3[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm8, %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm10[0,1],ymm4[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm9[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm5[0,1],ymm14[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm8, %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm7, %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermd %ymm10, %ymm7, %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5],mem[6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = mem[0,1],ymm9[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm9[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = mem[0,1],ymm9[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm9[5],ymm4[6,7]
 ; AVX2-FP-NEXT:    vpermd %ymm2, %ymm8, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm12[4,5],ymm14[6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = mem[0,1],ymm10[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7]
+; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = mem[0,1],ymm12[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7]
 ; AVX2-FP-NEXT:    vpermd %ymm2, %ymm8, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7]
+; AVX2-FP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = mem[0,1],ymm10[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
+; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = mem[0,1],ymm10[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5],ymm5[6,7]
 ; AVX2-FP-NEXT:    vpermd %ymm4, %ymm8, %ymm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm10 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm10 = mem[0,1],ymm12[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm9, %ymm8, %ymm9
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm10 = mem[0,1],ymm11[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm5, %ymm8, %ymm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7]
-; AVX2-FP-NEXT:    vpermd %ymm9, %ymm8, %ymm8
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vpermd %ymm5, %ymm8, %ymm5
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = mem[0,1],ymm10[0,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm8 = mem[0,1],ymm10[0,1]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 192(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 128(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm8, 64(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, (%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm8, 224(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm8, 160(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm8, 96(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm8, 32(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 192(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 128(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, (%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 224(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 160(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 192(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 128(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, (%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 224(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 160(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rcx)
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 192(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 128(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, (%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 224(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 160(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 192(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 128(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, (%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 224(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 160(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 192(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 128(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, (%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 224(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 160(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%rcx)
 ; AVX2-FP-NEXT:    vmovdqa %ymm13, (%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 128(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 192(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 224(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 160(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%r8)
-; AVX2-FP-NEXT:    vmovdqa %ymm7, 224(%r9)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 64(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 128(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 192(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 224(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 160(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%r8)
+; AVX2-FP-NEXT:    vmovdqa %ymm5, 224(%r9)
 ; AVX2-FP-NEXT:    vmovdqa %ymm3, 192(%r9)
 ; AVX2-FP-NEXT:    vmovdqa %ymm4, 160(%r9)
 ; AVX2-FP-NEXT:    vmovdqa %ymm2, 128(%r9)
-; AVX2-FP-NEXT:    vmovdqa %ymm5, 96(%r9)
+; AVX2-FP-NEXT:    vmovdqa %ymm9, 96(%r9)
 ; AVX2-FP-NEXT:    vmovdqa %ymm1, 64(%r9)
-; AVX2-FP-NEXT:    vmovdqa %ymm11, 32(%r9)
+; AVX2-FP-NEXT:    vmovdqa %ymm6, 32(%r9)
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX2-FP-NEXT:    addq $2152, %rsp # imm = 0x868
 ; AVX2-FP-NEXT:    vzeroupper
@@ -7929,7 +7858,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 896(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 832(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vmovdqa 832(%rdi), %ymm15
 ; AVX2-FCP-NEXT:    vmovdqa 800(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm8
 ; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm9
@@ -7939,7 +7868,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm11
 ; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm12
-; AVX2-FCP-NEXT:    vmovdqu %ymm12, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm13
 ; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm2
@@ -7959,16 +7888,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm13
-; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm9
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, 608(%rdi), %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm7, %ymm12
-; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm14
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm15[2,3],ymm7[4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm7, %ymm10
+; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7]
@@ -7976,11 +7903,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vinserti128 $1, 928(%rdi), %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 1152(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 1152(%rdi), %ymm14
+; AVX2-FCP-NEXT:    vmovdqa 1120(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 1120(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa 1184(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -7992,11 +7919,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vinserti128 $1, 1248(%rdi), %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8024,137 +7951,127 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vinserti128 $1, 768(%rdi), %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 992(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 960(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 992(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovdqa 960(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm13
 ; AVX2-FCP-NEXT:    vmovdqa 1056(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, 1088(%rdi), %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm7
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm15[0,1,0,3]
-; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm11[0,1,0,3]
+; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [1,6,3,0]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,2,7,0,5,2,7,0]
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 304(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 624(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm12, %ymm14
-; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 944(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 1264(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm15[2,3],ymm4[4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 144(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,6,3,0,5,2,7,0]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vpbroadcastd 304(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = ymm7[0,1],mem[2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vpbroadcastd 624(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 464(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vpbroadcastd 944(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm10[2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 784(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vpbroadcastd 1264(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 1104(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vpbroadcastd 464(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vpbroadcastd 784(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vpbroadcastd 1104(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm11[2,3],ymm4[4,5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpbroadcastd 144(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,7,4,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm0
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm13 = [2,7,4,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm5[4,5],ymm12[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm13, %ymm0
 ; AVX2-FCP-NEXT:    vinserti128 $1, 256(%rdi), %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm2
@@ -8163,11 +8080,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-FCP-NEXT:    vinserti128 $1, 576(%rdi), %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %ymm2
@@ -8175,11 +8091,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm11[4,5],ymm14[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-FCP-NEXT:    vinserti128 $1, 896(%rdi), %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 928(%rdi), %ymm2
@@ -8187,312 +8104,305 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-FCP-NEXT:    vinserti128 $1, 1216(%rdi), %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm13, %ymm9
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 1248(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vmovdqa 1248(%rdi), %ymm9
+; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX2-FCP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
+; AVX2-FCP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3],ymm14[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
 ; AVX2-FCP-NEXT:    vinserti128 $1, 416(%rdi), %ymm0, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-FCP-NEXT:    vpblendd $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,2,3],ymm10[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm10, %ymm14
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm3, %ymm4
-; AVX2-FCP-NEXT:    vinserti128 $1, 736(%rdi), %ymm0, %ymm15
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5,6],ymm15[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm4[0,1,2],ymm15[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
+; AVX2-FCP-NEXT:    vinserti128 $1, 736(%rdi), %ymm0, %ymm5
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm15
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm7 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm3, %ymm3
-; AVX2-FCP-NEXT:    vinserti128 $1, 1056(%rdi), %ymm0, %ymm13
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[2,3,0,1,6,7,4,5]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm3[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm13, %ymm1
+; AVX2-FCP-NEXT:    vinserti128 $1, 1056(%rdi), %ymm0, %ymm3
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 1088(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm4
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm14
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $207, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm13 = mem[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = mem[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,6,2,7,4,0,0]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm15
 ; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6]
-; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm15 = mem[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm13 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-FCP-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm15 = mem[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm15 = mem[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm13
 ; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm13 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm13
 ; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm13 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm0, %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm6[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm13
+; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm0, %ymm15
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm13 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm13
+; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm0, %ymm15
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm10[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm10[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm13
 ; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm14[4,5],ymm5[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm13
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm4[4,5],ymm10[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
+; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm0, %ymm15
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm5[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm7[0,1],ymm6[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm15 = ymm6[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm8[0,1],ymm7[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [4,1,6,0]
-; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm8, %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm8, %ymm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
-; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm7, %ymm1
+; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm7, %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vperm2i128 $2, (%rsp), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = mem[0,1],ymm11[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm11[5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm3[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm8, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm10[0,1],ymm4[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm9[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm5[0,1],ymm14[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm8, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm7, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm7, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5],mem[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = mem[0,1],ymm9[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm9[5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[0,1],ymm9[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm9[5],ymm4[6,7]
 ; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm8, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm12[4,5],ymm14[6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = mem[0,1],ymm10[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[0,1],ymm12[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7]
 ; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm8, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = mem[0,1],ymm10[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = mem[0,1],ymm10[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5],ymm5[6,7]
 ; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm8, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm10 = mem[0,1],ymm12[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm8, %ymm9
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,2,3],ymm3[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm10 = mem[0,1],ymm11[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm8, %ymm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm8, %ymm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm8, %ymm5
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = mem[0,1],ymm10[0,1]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = mem[0,1],ymm10[0,1]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 192(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 128(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm8, 64(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm8, 224(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm8, 160(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm8, 96(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm8, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 192(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 128(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 64(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 224(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 160(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 96(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 192(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 128(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 64(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 224(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 160(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 96(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rcx)
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 192(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 128(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 64(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 224(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 160(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 96(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 192(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 128(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 64(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 224(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 160(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 96(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 192(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 128(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 64(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 224(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 160(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 96(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rcx)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm13, (%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 64(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 128(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 192(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 224(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 160(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 96(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%r8)
-; AVX2-FCP-NEXT:    vmovdqa %ymm7, 224(%r9)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 64(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 128(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 192(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 224(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 160(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 96(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%r8)
+; AVX2-FCP-NEXT:    vmovdqa %ymm5, 224(%r9)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm3, 192(%r9)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm4, 160(%r9)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm2, 128(%r9)
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, 96(%r9)
+; AVX2-FCP-NEXT:    vmovdqa %ymm9, 96(%r9)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm1, 64(%r9)
-; AVX2-FCP-NEXT:    vmovdqa %ymm11, 32(%r9)
+; AVX2-FCP-NEXT:    vmovdqa %ymm6, 32(%r9)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%r9)
 ; AVX2-FCP-NEXT:    addq $2152, %rsp # imm = 0x868
 ; AVX2-FCP-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index aae4d9fa15e2..8820dccc40bf 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -524,35 +524,31 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0]
 ; AVX2-NEXT:    vpermd %ymm3, %ymm5, %ymm3
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,3,2,3]
-; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm7
-; AVX2-NEXT:    vpbroadcastd %xmm7, %xmm8
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm2[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4],ymm8[5,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,3,2,3]
+; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm5
+; AVX2-NEXT:    vpbroadcastd %xmm5, %xmm6
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm7[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,3,2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
 ; AVX2-NEXT:    vpbroadcastd 84(%rdi), %xmm8
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[3,1,3,3,7,5,7,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,3,2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm5[2,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vpermd %ymm1, %ymm9, %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
-; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,3,0,0]
-; AVX2-NEXT:    vpermd %ymm1, %ymm7, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
+; AVX2-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
 ; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
 ; AVX2-NEXT:    vmovdqa %xmm3, (%rdx)
 ; AVX2-NEXT:    vmovdqa %xmm6, (%rcx)
-; AVX2-NEXT:    vmovdqa %xmm5, (%r8)
+; AVX2-NEXT:    vmovdqa %xmm7, (%r8)
 ; AVX2-NEXT:    vmovdqa %xmm2, (%r9)
 ; AVX2-NEXT:    vmovdqa %xmm1, (%rax)
 ; AVX2-NEXT:    vzeroupper
@@ -572,35 +568,31 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0]
 ; AVX2-FP-NEXT:    vpermd %ymm3, %ymm5, %ymm3
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,3,2,3]
-; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm7
-; AVX2-FP-NEXT:    vpbroadcastd %xmm7, %xmm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm2[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4],ymm8[5,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,3,2,3]
+; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm5
+; AVX2-FP-NEXT:    vpbroadcastd %xmm5, %xmm6
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm7[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,3,2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
 ; AVX2-FP-NEXT:    vpbroadcastd 84(%rdi), %xmm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[3,1,3,3,7,5,7,7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,3,2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm5[2,3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm9, %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
 ; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
-; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [5,3,0,0]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm7, %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3]
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
+; AVX2-FP-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
 ; AVX2-FP-NEXT:    vmovdqa %xmm0, (%rsi)
 ; AVX2-FP-NEXT:    vmovdqa %xmm3, (%rdx)
 ; AVX2-FP-NEXT:    vmovdqa %xmm6, (%rcx)
-; AVX2-FP-NEXT:    vmovdqa %xmm5, (%r8)
+; AVX2-FP-NEXT:    vmovdqa %xmm7, (%r8)
 ; AVX2-FP-NEXT:    vmovdqa %xmm2, (%r9)
 ; AVX2-FP-NEXT:    vmovdqa %xmm1, (%rax)
 ; AVX2-FP-NEXT:    vzeroupper
@@ -620,20 +612,16 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0]
 ; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm5, %ymm3
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,3,2,3]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [2,0,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm5, %ymm5
 ; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm7
 ; AVX2-FCP-NEXT:    vpbroadcastd %xmm7, %xmm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm2[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4],ymm8[5,6,7]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,3,2,3]
-; AVX2-FCP-NEXT:    vpbroadcastd 84(%rdi), %xmm8
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm8 = [3,1,7,7]
+; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm8, %ymm6
+; AVX2-FCP-NEXT:    vpbroadcastd 84(%rdi), %xmm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0]
@@ -647,8 +635,8 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
 ; AVX2-FCP-NEXT:    vmovdqa %xmm0, (%rsi)
 ; AVX2-FCP-NEXT:    vmovdqa %xmm3, (%rdx)
-; AVX2-FCP-NEXT:    vmovdqa %xmm6, (%rcx)
-; AVX2-FCP-NEXT:    vmovdqa %xmm5, (%r8)
+; AVX2-FCP-NEXT:    vmovdqa %xmm5, (%rcx)
+; AVX2-FCP-NEXT:    vmovdqa %xmm6, (%r8)
 ; AVX2-FCP-NEXT:    vmovdqa %xmm2, (%r9)
 ; AVX2-FCP-NEXT:    vmovdqa %xmm1, (%rax)
 ; AVX2-FCP-NEXT:    vzeroupper
@@ -1103,28 +1091,21 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3]
-; AVX2-NEXT:    vmovaps (%rdi), %xmm11
-; AVX2-NEXT:    vshufps {{.*#+}} xmm12 = xmm11[2,3,2,3]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1,2,3],ymm12[4],ymm13[5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm12 = ymm11[2,0,2,3,6,4,6,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,3,2,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm12 = ymm1[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6],ymm12[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0],ymm6[1],ymm12[2,3,4],ymm6[5],ymm12[6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,2,0,3]
-; AVX2-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[3,3,3,3]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1,2,3],ymm11[4],ymm13[5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm12[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm5[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,2,0,3]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[3,1,3,3,7,5,7,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm12 = ymm1[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6],ymm12[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,7,5]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5,6,7]
 ; AVX2-NEXT:    vmovaps 80(%rdi), %xmm12
@@ -1186,28 +1167,21 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3]
-; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm11
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm12 = xmm11[2,3,2,3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1,2,3],ymm12[4],ymm13[5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm12 = ymm11[2,0,2,3,6,4,6,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,3,2,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm12 = ymm1[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6],ymm12[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0],ymm6[1],ymm12[2,3,4],ymm6[5],ymm12[6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,2,0,3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[3,3,3,3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1,2,3],ymm11[4],ymm13[5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm12[0,0,2,0,4,4,6,4]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm5[3,3,3,3,7,7,7,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,2,0,3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[3,1,3,3,7,5,7,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm12 = ymm1[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6],ymm12[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,7,5]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5,6,7]
 ; AVX2-FP-NEXT:    vmovaps 80(%rdi), %xmm12
@@ -1269,30 +1243,24 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
 ; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm11 = [2,4,2,4,2,4,2,4]
 ; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm11, %ymm10
-; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm11
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm12 = xmm11[2,3,2,3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1,2,3],ymm12[4],ymm13[5,6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm12 = ymm1[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6],ymm12[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0],ymm6[1],ymm12[2,3,4],ymm6[5],ymm12[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,2,0,3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[3,3,3,3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1,2,3],ymm11[4],ymm13[5,6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm12 = ymm1[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6],ymm12[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5,6,7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm11 = [2,0,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm11, %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm13 = [0,0,6,4,0,0,6,4]
+; AVX2-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm13, %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm13 = [0,1,7,5,0,1,7,5]
+; AVX2-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm13, %ymm11
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm13 = mem[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm13 = [3,1,7,5,0,u,u,u]
+; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm13, %ymm12
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7]
 ; AVX2-FCP-NEXT:    vmovaps 80(%rdi), %xmm12
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7]
@@ -2171,557 +2139,524 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i32_stride6_vf16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $360, %rsp # imm = 0x168
-; AVX2-NEXT:    vmovaps 288(%rdi), %ymm8
-; AVX2-NEXT:    vmovaps 224(%rdi), %ymm11
-; AVX2-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 192(%rdi), %ymm1
+; AVX2-NEXT:    subq $392, %rsp # imm = 0x188
+; AVX2-NEXT:    vmovaps 288(%rdi), %ymm10
+; AVX2-NEXT:    vmovaps 224(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 128(%rdi), %ymm0
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 160(%rdi), %ymm2
+; AVX2-NEXT:    vmovaps 192(%rdi), %ymm2
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 96(%rdi), %ymm15
-; AVX2-NEXT:    vmovaps (%rdi), %ymm3
+; AVX2-NEXT:    vmovaps 128(%rdi), %ymm0
+; AVX2-NEXT:    vmovaps 160(%rdi), %ymm3
 ; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 32(%rdi), %ymm13
-; AVX2-NEXT:    vmovaps 64(%rdi), %ymm6
-; AVX2-NEXT:    vmovaps {{.*#+}} xmm14 = [0,6,4,u]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7]
-; AVX2-NEXT:    vpermps %ymm4, %ymm14, %ymm3
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm6[0,1],ymm15[0,1]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm9[0,2,2,2,4,6,6,6]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
-; AVX2-NEXT:    vpermps %ymm10, %ymm0, %ymm12
-; AVX2-NEXT:    vmovaps %ymm0, %ymm5
+; AVX2-NEXT:    vmovaps 96(%rdi), %ymm15
+; AVX2-NEXT:    vmovaps (%rdi), %ymm4
+; AVX2-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 32(%rdi), %ymm5
+; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 64(%rdi), %ymm13
+; AVX2-NEXT:    vmovaps {{.*#+}} xmm6 = [0,6,4,u]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
+; AVX2-NEXT:    vpermps %ymm8, %ymm6, %ymm7
+; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm13[0,1],ymm15[0,1]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovaps %ymm0, %ymm7
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm12[6,7]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm12 = [4,2,4,2,4,2,4,2]
+; AVX2-NEXT:    vpermps %ymm4, %ymm12, %ymm14
+; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm14[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm11[4,5],ymm1[6,7]
-; AVX2-NEXT:    vpermps %ymm3, %ymm14, %ymm12
-; AVX2-NEXT:    vmovaps 256(%rdi), %ymm7
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm7[0,1],ymm8[0,1]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpermps %ymm3, %ymm6, %ymm0
+; AVX2-NEXT:    vmovaps 256(%rdi), %ymm11
+; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm11[0,1],ymm10[0,1]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm10[6,7]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm14 = ymm1[0,2,2,2,4,6,6,6]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm12[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-NEXT:    vmovaps 320(%rdi), %ymm12
-; AVX2-NEXT:    vmovaps 352(%rdi), %ymm14
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm11
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm14[3,4,5,6,7]
+; AVX2-NEXT:    vmovaps 320(%rdi), %ymm10
+; AVX2-NEXT:    vmovaps 352(%rdi), %ymm6
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps %ymm10, %ymm5
+; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermps %ymm0, %ymm12, %ymm10
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,7,5,u]
-; AVX2-NEXT:    vpermps %ymm4, %ymm2, %ymm4
+; AVX2-NEXT:    vpermps %ymm8, %ymm2, %ymm8
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3]
-; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm10, %ymm5, %ymm10
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm14 = [5,3,5,3,5,3,5,3]
+; AVX2-NEXT:    vpermps %ymm4, %ymm14, %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermps %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpermps %ymm0, %ymm14, %ymm0
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps (%rdi), %xmm5
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[2,3,2,3]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vmovups (%rsp), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 192(%rdi), %xmm0
-; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm9[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm12[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm5[3,3,3,3]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm13[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm3[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm8 = ymm4[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm15[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,1,3,3,7,5,7,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vmovaps %ymm4, %ymm5
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1,3,1,4,5,7,5]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm15[4,5,6,7]
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm13[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vmovaps 80(%rdi), %xmm3
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vmovaps %ymm7, %ymm6
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[3,1,3,3,7,5,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,7,5]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm15[4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT:    vmovaps 80(%rdi), %xmm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm6
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm5[2,3],ymm10[4,5,6,7]
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm11 = [0,2,0,6,0,2,0,6]
-; AVX2-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX2-NEXT:    vpermps %ymm6, %ymm11, %ymm13
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vpermps %ymm3, %ymm7, %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm12[0,1],mem[2,3],ymm12[4,5,6,7]
+; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6]
+; AVX2-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX2-NEXT:    vpermps %ymm5, %ymm10, %ymm13
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = ymm9[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm11 = ymm9[0,1,2,3],mem[4,5,6,7]
 ; AVX2-NEXT:    vmovaps 272(%rdi), %xmm13
-; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,1,0,2,4,5,4,6]
-; AVX2-NEXT:    vpermps %ymm7, %ymm0, %ymm8
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm15[2,3,4,5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm12, %ymm11, %ymm11
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm11, %ymm7, %ymm7
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm12 = mem[0,1],ymm9[2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm12, %ymm10, %ymm10
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm3, %ymm14, %ymm3
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX2-NEXT:    vpermps %ymm5, %ymm3, %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm11, %ymm14, %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm12, %ymm3, %ymm3
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpermps %ymm2, %ymm4, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7]
-; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-NEXT:    vpermps %ymm6, %ymm2, %ymm3
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm7, %ymm4, %ymm5
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm12, %ymm2, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, 32(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, (%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, 32(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, (%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, 32(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, (%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, 32(%r8)
-; AVX2-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, (%r8)
-; AVX2-NEXT:    vmovaps %ymm8, 32(%r9)
+; AVX2-NEXT:    vmovaps %ymm4, 32(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, (%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 32(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, (%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 32(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, (%rcx)
+; AVX2-NEXT:    vmovaps %ymm8, 32(%r8)
+; AVX2-NEXT:    vmovaps %ymm0, (%r8)
+; AVX2-NEXT:    vmovaps %ymm7, 32(%r9)
 ; AVX2-NEXT:    vmovaps %ymm1, (%r9)
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    vmovaps %ymm2, 32(%rax)
-; AVX2-NEXT:    vmovaps %ymm0, (%rax)
-; AVX2-NEXT:    addq $360, %rsp # imm = 0x168
+; AVX2-NEXT:    vmovaps %ymm3, 32(%rax)
+; AVX2-NEXT:    vmovaps %ymm2, (%rax)
+; AVX2-NEXT:    addq $392, %rsp # imm = 0x188
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i32_stride6_vf16:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $360, %rsp # imm = 0x168
-; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm8
-; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm11
-; AVX2-FP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm1
+; AVX2-FP-NEXT:    subq $392, %rsp # imm = 0x188
+; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm10
+; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm0
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm15
-; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm0
+; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm13
-; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm14 = [0,6,4,u]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm4, %ymm14, %ymm3
-; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm6[0,1],ymm15[0,1]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm9[0,2,2,2,4,6,6,6]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
-; AVX2-FP-NEXT:    vpermps %ymm10, %ymm0, %ymm12
-; AVX2-FP-NEXT:    vmovaps %ymm0, %ymm5
+; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm15
+; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm4
+; AVX2-FP-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm5
+; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm13
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm6 = [0,6,4,u]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm8, %ymm6, %ymm7
+; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm13[0,1],ymm15[0,1]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps %ymm0, %ymm7
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm12[6,7]
+; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm12 = [4,2,4,2,4,2,4,2]
+; AVX2-FP-NEXT:    vpermps %ymm4, %ymm12, %ymm14
+; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm14[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm11[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm3, %ymm14, %ymm12
-; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm7
-; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm7[0,1],ymm8[0,1]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm6, %ymm0
+; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm11
+; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm11[0,1],ymm10[0,1]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm10[6,7]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm14 = ymm1[0,2,2,2,4,6,6,6]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm12[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm12
-; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm0, %ymm5, %ymm11
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm14[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm10
+; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm6
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps %ymm10, %ymm5
+; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm12, %ymm10
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm2 = [1,7,5,u]
-; AVX2-FP-NEXT:    vpermps %ymm4, %ymm2, %ymm4
+; AVX2-FP-NEXT:    vpermps %ymm8, %ymm2, %ymm8
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3]
-; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm10, %ymm5, %ymm10
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm14 = [5,3,5,3,5,3,5,3]
+; AVX2-FP-NEXT:    vpermps %ymm4, %ymm14, %ymm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermps %ymm3, %ymm2, %ymm2
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm14, %ymm0
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm5
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[2,3,2,3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[0,0,2,0,4,4,6,4]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 192(%rdi), %xmm0
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm9[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm12[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm5[3,3,3,3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm13[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm3[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm8 = ymm4[0,0,2,0,4,4,6,4]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm15[3,3,3,3,7,7,7,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,1,3,3,7,5,7,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vmovaps %ymm4, %ymm5
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1,3,1,4,5,7,5]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm15[4,5,6,7]
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm13[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps 80(%rdi), %xmm3
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7]
+; AVX2-FP-NEXT:    vmovaps %ymm7, %ymm6
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[3,1,3,3,7,5,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,7,5]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm15[4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 80(%rdi), %xmm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm6
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm5[2,3],ymm10[4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm11 = [0,2,0,6,0,2,0,6]
-; AVX2-FP-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpermps %ymm6, %ymm11, %ymm13
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm7, %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm12[0,1],mem[2,3],ymm12[4,5,6,7]
+; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6]
+; AVX2-FP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX2-FP-NEXT:    vpermps %ymm5, %ymm10, %ymm13
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = ymm9[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm11 = ymm9[0,1,2,3],mem[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovaps 272(%rdi), %xmm13
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,1,0,2,4,5,4,6]
-; AVX2-FP-NEXT:    vpermps %ymm7, %ymm0, %ymm8
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm15[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm12, %ymm11, %ymm11
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm11, %ymm7, %ymm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm12 = mem[0,1],ymm9[2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm12, %ymm10, %ymm10
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm14, %ymm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
+; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX2-FP-NEXT:    vpermps %ymm5, %ymm3, %ymm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm11, %ymm14, %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm12, %ymm3, %ymm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm4, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7]
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpermps %ymm6, %ymm2, %ymm3
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm7, %ymm4, %ymm5
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm12, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, (%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, (%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, (%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%r8)
-; AVX2-FP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, (%r8)
-; AVX2-FP-NEXT:    vmovaps %ymm8, 32(%r9)
+; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, (%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, (%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, (%rcx)
+; AVX2-FP-NEXT:    vmovaps %ymm8, 32(%r8)
+; AVX2-FP-NEXT:    vmovaps %ymm0, (%r8)
+; AVX2-FP-NEXT:    vmovaps %ymm7, 32(%r9)
 ; AVX2-FP-NEXT:    vmovaps %ymm1, (%r9)
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
-; AVX2-FP-NEXT:    addq $360, %rsp # imm = 0x168
+; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm2, (%rax)
+; AVX2-FP-NEXT:    addq $392, %rsp # imm = 0x188
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i32_stride6_vf16:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    subq $328, %rsp # imm = 0x148
+; AVX2-FCP-NEXT:    subq $360, %rsp # imm = 0x168
 ; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm14
-; AVX2-FCP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm0 = [0,6,4,u]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm0, %ymm7
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm6[0,1],ymm15[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm7[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [4,2,4,2,4,2,4,2]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm1, %ymm12
+; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm13
+; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovups %ymm4, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm15
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm12 = [0,6,4,u]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm12, %ymm7
 ; AVX2-FCP-NEXT:    vmovaps %ymm1, %ymm5
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm15[0,1],ymm1[0,1]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm12[6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [4,2,4,2,4,2,4,2]
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm1, %ymm14
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm9
-; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm7[0,1],ymm10[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm12 = ymm1[0,2,2,2,4,6,6,6]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm12
-; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm5, %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm11 = [1,7,5,u]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm11, %ymm4
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm9, %ymm3
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm11, %ymm2
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm12, %ymm12
+; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm11[0,1],ymm10[0,1]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm14 = ymm6[0,2,2,2,4,6,6,6]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm12[0,1,2],ymm14[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm12
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm10
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm2 = [1,7,5,u]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm2, %ymm8
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm14 = [5,3,5,3,5,3,5,3]
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm14, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[1,3,2,3,5,7,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm9, %ymm0
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm14, %ymm0
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm13[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm3 = [2,0,6,4,2,0,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[0,0,0,0,4,4,4,4]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm5[2,3],ymm15[4,5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm1 = [2,0,6,4,2,0,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm2 = [2,0,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm5[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm3, %ymm1
-; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm11[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm14[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm12[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm5[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vmovaps %ymm11, %ymm8
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm9[2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm2, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,2,3],ymm13[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm8 = [0,0,6,4,0,0,6,4]
+; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm8, %ymm10
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm15[4,5,6,7]
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = ymm13[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 80(%rdi), %xmm3
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm8, %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3,4],ymm0[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,1,7,5,0,1,7,5]
+; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm3, %ymm4
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm8 = [3,1,7,5,0,u,u,u]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm8, %ymm0
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm4 = mem[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4],ymm4[5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm8, %ymm1
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm15[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 80(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm6
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = mem[0,1],ymm5[2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm11 = [0,2,0,6,0,2,0,6]
-; AVX2-FCP-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm11, %ymm13
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm7, %ymm5
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = ymm13[0,1],mem[2,3],ymm13[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6]
+; AVX2-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm10, %ymm13
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm7 = ymm8[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = ymm9[0,1,2,3],mem[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovaps 272(%rdi), %xmm13
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,1,0,2,4,5,4,6]
-; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm0, %ymm10
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm15[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm9, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7]
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm2, %ymm3
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm9, %ymm5
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm2, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%r8)
-; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, (%r8)
-; AVX2-FCP-NEXT:    vmovaps %ymm10, 32(%r9)
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm7, %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm12 = ymm12[0,1],mem[2,3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm10, %ymm10
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm14, %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
+; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm3, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm14, %ymm5
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm3, %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rcx)
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 32(%r8)
+; AVX2-FCP-NEXT:    vmovaps %ymm0, (%r8)
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%r9)
 ; AVX2-FCP-NEXT:    vmovaps %ymm1, (%r9)
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
-; AVX2-FCP-NEXT:    addq $328, %rsp # imm = 0x148
+; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rax)
+; AVX2-FCP-NEXT:    addq $360, %rsp # imm = 0x168
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
@@ -4501,14 +4436,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i32_stride6_vf32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $1256, %rsp # imm = 0x4E8
+; AVX2-NEXT:    subq $1224, %rsp # imm = 0x4C8
 ; AVX2-NEXT:    vmovaps 480(%rdi), %ymm9
-; AVX2-NEXT:    vmovaps 448(%rdi), %ymm10
-; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 416(%rdi), %ymm7
-; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 448(%rdi), %ymm11
+; AVX2-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 416(%rdi), %ymm8
+; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 128(%rdi), %ymm2
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 160(%rdi), %ymm3
 ; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 96(%rdi), %ymm4
@@ -4519,35 +4454,36 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 64(%rdi), %ymm5
 ; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{.*#+}} xmm8 = [0,6,4,u]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT:    vpermps %ymm13, %ymm8, %ymm0
+; AVX2-NEXT:    vmovaps {{.*#+}} xmm10 = [0,6,4,u]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermps %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2]
-; AVX2-NEXT:    vpermps %ymm6, %ymm2, %ymm1
-; AVX2-NEXT:    vmovaps %ymm2, %ymm11
+; AVX2-NEXT:    vpermps %ymm7, %ymm2, %ymm1
+; AVX2-NEXT:    vmovaps %ymm2, %ymm6
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm10[0,1],ymm9[0,1]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[0,1],ymm9[0,1]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm9[6,7]
 ; AVX2-NEXT:    vmovaps 384(%rdi), %ymm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7]
-; AVX2-NEXT:    vpermps %ymm4, %ymm8, %ymm0
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2,2,2,4,6,6,6]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
+; AVX2-NEXT:    vpermps %ymm3, %ymm10, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-NEXT:    vmovaps 512(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 544(%rdi), %ymm2
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm11, %ymm1
-; AVX2-NEXT:    vmovaps %ymm11, %ymm12
+; AVX2-NEXT:    vpermps %ymm2, %ymm6, %ymm1
+; AVX2-NEXT:    vmovaps %ymm6, %ymm9
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 288(%rdi), %ymm1
@@ -4555,376 +4491,342 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovaps 256(%rdi), %ymm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT:    vmovaps 224(%rdi), %ymm1
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 192(%rdi), %ymm0
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT:    vpermps %ymm1, %ymm8, %ymm14
-; AVX2-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[0,2,2,2,4,6,6,6]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-NEXT:    vmovaps 320(%rdi), %ymm0
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 352(%rdi), %ymm7
-; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm14, %ymm11, %ymm7
-; AVX2-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 608(%rdi), %ymm0
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 576(%rdi), %ymm7
-; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5],ymm7[6,7]
-; AVX2-NEXT:    vpermps %ymm7, %ymm8, %ymm8
-; AVX2-NEXT:    vmovaps 672(%rdi), %ymm0
+; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 192(%rdi), %ymm6
+; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
+; AVX2-NEXT:    vpermps %ymm1, %ymm10, %ymm8
+; AVX2-NEXT:    vshufps {{.*#+}} ymm11 = ymm13[0,2,2,2,4,6,6,6]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-NEXT:    vmovaps 320(%rdi), %ymm6
+; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 352(%rdi), %ymm8
+; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm8, %ymm9, %ymm14
+; AVX2-NEXT:    vmovaps %ymm9, %ymm0
+; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 608(%rdi), %ymm6
+; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 576(%rdi), %ymm9
+; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7]
+; AVX2-NEXT:    vpermps %ymm14, %ymm10, %ymm10
+; AVX2-NEXT:    vmovaps 672(%rdi), %ymm6
+; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 640(%rdi), %ymm9
 ; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[0,1],ymm0[0,1]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-NEXT:    vmovaps 704(%rdi), %ymm11
+; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm6[0,1]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm15 = ymm12[0,2,2,2,4,6,6,6]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-NEXT:    vmovaps 704(%rdi), %ymm6
+; AVX2-NEXT:    vmovaps 736(%rdi), %ymm11
+; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 736(%rdi), %ymm0
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm11, %ymm12, %ymm15
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{.*#+}} xmm8 = [1,7,5,u]
-; AVX2-NEXT:    vpermps %ymm13, %ymm8, %ymm13
+; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermps %ymm15, %ymm0, %ymm9
+; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps {{.*#+}} xmm9 = [1,7,5,u]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm12 = [5,3,5,3,5,3,5,3]
-; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm6, %ymm12, %ymm6
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm10 = [5,3,5,3,5,3,5,3]
+; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermps %ymm7, %ymm10, %ymm7
+; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
 ; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm4, %ymm8, %ymm4
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm12, %ymm2
+; AVX2-NEXT:    vpermps %ymm3, %ymm9, %ymm3
+; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,3,2,3,5,7,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm2, %ymm10, %ymm2
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm1, %ymm8, %ymm0
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[1,3,2,3,5,7,6,7]
+; AVX2-NEXT:    vpermps %ymm1, %ymm9, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[1,3,2,3,5,7,6,7]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm14, %ymm12, %ymm1
+; AVX2-NEXT:    vpermps %ymm8, %ymm10, %ymm1
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm7, %ymm8, %ymm0
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[1,3,2,3,5,7,6,7]
+; AVX2-NEXT:    vpermps %ymm14, %ymm9, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[1,3,2,3,5,7,6,7]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm11, %ymm12, %ymm1
+; AVX2-NEXT:    vpermps %ymm15, %ymm10, %ymm1
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps (%rdi), %xmm3
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,0,2,0,4,4,6,4]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 384(%rdi), %xmm1
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 576(%rdi), %xmm0
-; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm15 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 192(%rdi), %xmm0
-; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm13[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1,2,3],ymm2[4],ymm5[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm8 = ymm1[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm11[2,3],ymm6[4,5],ymm11[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm8 = mem[0,1],ymm8[2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm9 = ymm8[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm9 = mem[0,1,2,3],ymm9[4,5],mem[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm9[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm10[5,6,7]
+; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm10[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm4[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2,3,4],ymm12[5],ymm3[6,7]
-; AVX2-NEXT:    vmovaps %ymm12, %ymm11
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[3,1,3,3,7,5,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,7,5]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm5[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm9[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm8[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm15[3,1,3,3,7,5,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,1,3,1,4,5,7,5]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7]
+; AVX2-NEXT:    vmovaps %ymm6, %ymm4
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm8[3,1,3,3,7,5,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm9[0,1,3,1,4,5,7,5]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-NEXT:    vmovaps %ymm13, %ymm3
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7]
-; AVX2-NEXT:    vmovaps %ymm15, %ymm14
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,3,3,7,5,7,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,7,5]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
-; AVX2-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm5[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm12[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 464(%rdi), %xmm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vpermps %ymm2, %ymm12, %ymm1
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vpermps %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm9 = [0,2,0,6,0,2,0,6]
-; AVX2-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX2-NEXT:    vpermps %ymm13, %ymm9, %ymm1
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm7 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX2-NEXT:    vpermps %ymm7, %ymm3, %ymm1
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = ymm11[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vmovaps 80(%rdi), %xmm6
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm14 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovaps 80(%rdi), %xmm5
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-NEXT:    vpermps %ymm11, %ymm12, %ymm1
-; AVX2-NEXT:    vmovaps %ymm12, %ymm15
+; AVX2-NEXT:    vpermps %ymm14, %ymm2, %ymm1
+; AVX2-NEXT:    vmovaps %ymm2, %ymm8
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm4, %ymm9, %ymm1
+; AVX2-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm1
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm10 = ymm14[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vmovaps 272(%rdi), %xmm3
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm12 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovaps 272(%rdi), %xmm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6,7]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-NEXT:    vpermps %ymm5, %ymm12, %ymm14
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm12, %ymm8, %ymm11
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm9, %ymm12
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm12[5,6,7]
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm12 = ymm8[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm1, %ymm3, %ymm15
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm15[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovaps 656(%rdi), %xmm0
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm0[2,3],ymm12[4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6]
-; AVX2-NEXT:    vpermps %ymm1, %ymm15, %ymm15
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm15 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm15, %ymm9, %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4],ymm9[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-NEXT:    vpermps %ymm11, %ymm14, %ymm7
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7]
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm7 = [0,3,1,7,0,3,1,7]
-; AVX2-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX2-NEXT:    vpermps %ymm4, %ymm7, %ymm4
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm6 = mem[1,1,1,1,5,5,5,5]
-; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm13, %ymm7, %ymm8
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm8 = ymm10[1,1,1,1,5,5,5,5]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3],ymm8[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm5, %ymm14, %ymm5
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm7, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm12[1,1,1,1,5,5,5,5]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm1, %ymm14, %ymm1
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm15, %ymm7, %ymm1
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 96(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 32(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 64(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, (%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 96(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 32(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 64(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, (%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 96(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 64(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, (%rcx)
-; AVX2-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 96(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 32(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 64(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, (%r8)
-; AVX2-NEXT:    vmovaps %ymm9, 96(%r9)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 32(%r9)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, (%r9)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm1, 64(%r9)
+; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6]
+; AVX2-NEXT:    vpermps %ymm15, %ymm8, %ymm8
+; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm10 = ymm10[0,1],mem[2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm10, %ymm3, %ymm3
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT:    vpermps %ymm14, %ymm13, %ymm6
+; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7]
+; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7]
+; AVX2-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX2-NEXT:    vpermps %ymm2, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm7, %ymm6, %ymm7
+; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm12, %ymm13, %ymm7
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm1, %ymm6, %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm11[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm15, %ymm13, %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm10, %ymm6, %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 96(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 32(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 64(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, (%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 96(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 32(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 64(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, (%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 32(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 96(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 64(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, (%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 96(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 32(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 64(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, (%r8)
+; AVX2-NEXT:    vmovaps %ymm3, 96(%r9)
+; AVX2-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm3, 32(%r9)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm3, (%r9)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm3, 64(%r9)
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    vmovaps %ymm0, 96(%rax)
-; AVX2-NEXT:    vmovaps %ymm2, 32(%rax)
-; AVX2-NEXT:    vmovaps %ymm6, 64(%rax)
-; AVX2-NEXT:    vmovaps %ymm4, (%rax)
-; AVX2-NEXT:    addq $1256, %rsp # imm = 0x4E8
+; AVX2-NEXT:    vmovaps %ymm1, 32(%rax)
+; AVX2-NEXT:    vmovaps %ymm5, 64(%rax)
+; AVX2-NEXT:    vmovaps %ymm2, (%rax)
+; AVX2-NEXT:    addq $1224, %rsp # imm = 0x4C8
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i32_stride6_vf32:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $1256, %rsp # imm = 0x4E8
+; AVX2-FP-NEXT:    subq $1224, %rsp # imm = 0x4C8
 ; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm9
-; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm10
-; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 416(%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm11
+; AVX2-FP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 416(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm4
@@ -4935,35 +4837,36 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm8 = [0,6,4,u]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm13, %ymm8, %ymm0
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm10 = [0,6,4,u]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm10, %ymm0
 ; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2]
-; AVX2-FP-NEXT:    vpermps %ymm6, %ymm2, %ymm1
-; AVX2-FP-NEXT:    vmovaps %ymm2, %ymm11
+; AVX2-FP-NEXT:    vpermps %ymm7, %ymm2, %ymm1
+; AVX2-FP-NEXT:    vmovaps %ymm2, %ymm6
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm10[0,1],ymm9[0,1]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[0,1],ymm9[0,1]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm9[6,7]
 ; AVX2-FP-NEXT:    vmovaps 384(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm4, %ymm8, %ymm0
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2,2,2,4,6,6,6]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm10, %ymm0
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovaps 512(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 544(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm11, %ymm1
-; AVX2-FP-NEXT:    vmovaps %ymm11, %ymm12
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm6, %ymm1
+; AVX2-FP-NEXT:    vmovaps %ymm6, %ymm9
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm1
@@ -4971,374 +4874,340 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm0
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm8, %ymm14
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[0,2,2,2,4,6,6,6]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm0
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm14, %ymm11, %ymm7
-; AVX2-FP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 608(%rdi), %ymm0
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 576(%rdi), %ymm7
-; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm7, %ymm8, %ymm8
-; AVX2-FP-NEXT:    vmovaps 672(%rdi), %ymm0
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm1, %ymm10, %ymm8
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm11 = ymm13[0,2,2,2,4,6,6,6]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm8, %ymm9, %ymm14
+; AVX2-FP-NEXT:    vmovaps %ymm9, %ymm0
+; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 608(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 576(%rdi), %ymm9
+; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm14, %ymm10, %ymm10
+; AVX2-FP-NEXT:    vmovaps 672(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 640(%rdi), %ymm9
 ; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[0,1],ymm0[0,1]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps 704(%rdi), %ymm11
+; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm6[0,1]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm15 = ymm12[0,2,2,2,4,6,6,6]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 704(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovaps 736(%rdi), %ymm11
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 736(%rdi), %ymm0
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm11, %ymm12, %ymm15
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm8 = [1,7,5,u]
-; AVX2-FP-NEXT:    vpermps %ymm13, %ymm8, %ymm13
+; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermps %ymm15, %ymm0, %ymm9
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm9 = [1,7,5,u]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm12 = [5,3,5,3,5,3,5,3]
-; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm6, %ymm12, %ymm6
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm10 = [5,3,5,3,5,3,5,3]
+; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermps %ymm7, %ymm10, %ymm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm4, %ymm8, %ymm4
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm9, %ymm3
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,3,2,3,5,7,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm10, %ymm2
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm8, %ymm0
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[1,3,2,3,5,7,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm1, %ymm9, %ymm0
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[1,3,2,3,5,7,6,7]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm14, %ymm12, %ymm1
+; AVX2-FP-NEXT:    vpermps %ymm8, %ymm10, %ymm1
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm7, %ymm8, %ymm0
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[1,3,2,3,5,7,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm14, %ymm9, %ymm0
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[1,3,2,3,5,7,6,7]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm11, %ymm12, %ymm1
+; AVX2-FP-NEXT:    vpermps %ymm15, %ymm10, %ymm1
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm3
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,0,2,0,4,4,6,4]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 384(%rdi), %xmm1
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 576(%rdi), %xmm0
-; AVX2-FP-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm15 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0,2,0,4,4,6,4]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 192(%rdi), %xmm0
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm13[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1,2,3],ymm2[4],ymm5[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm8 = ymm1[0,0,2,0,4,4,6,4]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm11[2,3],ymm6[4,5],ymm11[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm8 = mem[0,1],ymm8[2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm9 = ymm8[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm9 = mem[0,1,2,3],ymm9[4,5],mem[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm9[0,0,2,0,4,4,6,4]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm10[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm10[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm4[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2,3,4],ymm12[5],ymm3[6,7]
-; AVX2-FP-NEXT:    vmovaps %ymm12, %ymm11
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[3,1,3,3,7,5,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,7,5]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm5[3,3,3,3,7,7,7,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm9[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm8[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm15[3,1,3,3,7,5,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,1,3,1,4,5,7,5]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovaps %ymm6, %ymm4
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm8[3,1,3,3,7,5,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm9[0,1,3,1,4,5,7,5]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FP-NEXT:    vmovaps %ymm13, %ymm3
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vmovaps %ymm15, %ymm14
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,3,3,7,5,7,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,7,5]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm0 = mem[3,3,3,3]
-; AVX2-FP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm5[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm12[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 464(%rdi), %xmm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm12, %ymm1
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm2, %ymm1
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm9 = [0,2,0,6,0,2,0,6]
-; AVX2-FP-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpermps %ymm13, %ymm9, %ymm1
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm7 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6]
+; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX2-FP-NEXT:    vpermps %ymm7, %ymm3, %ymm1
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = ymm11[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps 80(%rdi), %xmm6
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm14 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 80(%rdi), %xmm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FP-NEXT:    vpermps %ymm11, %ymm12, %ymm1
-; AVX2-FP-NEXT:    vmovaps %ymm12, %ymm15
+; AVX2-FP-NEXT:    vpermps %ymm14, %ymm2, %ymm1
+; AVX2-FP-NEXT:    vmovaps %ymm2, %ymm8
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm4, %ymm9, %ymm1
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm1
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm10 = ymm14[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps 272(%rdi), %xmm3
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm12 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 272(%rdi), %xmm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6,7]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm12, %ymm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm12, %ymm8, %ymm11
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm9, %ymm12
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm12[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm12 = ymm8[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm1, %ymm3, %ymm15
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm15[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovaps 656(%rdi), %xmm0
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm0[2,3],ymm12[4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6]
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm15, %ymm15
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm15 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm15, %ymm9, %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4],ymm9[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermps %ymm11, %ymm14, %ymm7
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm7 = [0,3,1,7,0,3,1,7]
-; AVX2-FP-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpermps %ymm4, %ymm7, %ymm4
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm6 = mem[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm13, %ymm7, %ymm8
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm8 = ymm10[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3],ymm8[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm14, %ymm5
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm7, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm12[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm14, %ymm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm15, %ymm7, %ymm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, (%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, (%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, (%rcx)
-; AVX2-FP-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 96(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, (%r8)
-; AVX2-FP-NEXT:    vmovaps %ymm9, 96(%r9)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%r9)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, (%r9)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%r9)
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6]
+; AVX2-FP-NEXT:    vpermps %ymm15, %ymm8, %ymm8
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm10 = ymm10[0,1],mem[2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm10, %ymm3, %ymm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FP-NEXT:    vpermps %ymm14, %ymm13, %ymm6
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7]
+; AVX2-FP-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm7, %ymm6, %ymm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm12, %ymm13, %ymm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm1, %ymm6, %ymm1
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm11[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm15, %ymm13, %ymm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm10, %ymm6, %ymm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 96(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 64(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, (%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 96(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 64(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, (%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 96(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 64(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, (%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 96(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 64(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, (%r8)
+; AVX2-FP-NEXT:    vmovaps %ymm3, 96(%r9)
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%r9)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm3, (%r9)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm3, 64(%r9)
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm2, 32(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm4, (%rax)
-; AVX2-FP-NEXT:    addq $1256, %rsp # imm = 0x4E8
+; AVX2-FP-NEXT:    vmovaps %ymm1, 32(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm2, (%rax)
+; AVX2-FP-NEXT:    addq $1224, %rsp # imm = 0x4C8
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i32_stride6_vf32:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    subq $1256, %rsp # imm = 0x4E8
-; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm10
+; AVX2-FCP-NEXT:    subq $1192, %rsp # imm = 0x4A8
+; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %ymm10
 ; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm3
@@ -5346,38 +5215,40 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm8 = [0,6,4,u]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm8, %ymm0
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm9 = [0,6,4,u]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm9, %ymm0
 ; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm12 = [4,2,4,2,4,2,4,2]
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm12, %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2]
+; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm2, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm10[0,1],ymm9[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[0,1],ymm6[0,1]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
 ; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm8, %ymm0
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm10[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm9, %ymm0
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm1
+; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm2, %ymm1
+; AVX2-FCP-NEXT:    vmovaps %ymm2, %ymm5
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm1
@@ -5385,359 +5256,313 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm8, %ymm14
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[0,2,2,2,4,6,6,6]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm12, %ymm7
-; AVX2-FCP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 608(%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5],ymm7[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm8, %ymm8
-; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm9, %ymm0
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm14 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm5, %ymm15
+; AVX2-FCP-NEXT:    vmovaps %ymm5, %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 608(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3],ymm10[4,5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm9, %ymm15
+; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 640(%rdi), %ymm9
 ; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[0,1],ymm0[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 704(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm12, %ymm15
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm8 = [1,7,5,u]
-; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm8, %ymm13
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm5[0,1]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm11 = ymm12[0,2,2,2,4,6,6,6]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 704(%rdi), %ymm9
+; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm0, %ymm10
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm10 = [1,7,5,u]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3]
+; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm5, %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm10, %ymm4
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm13[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm13 = [5,3,5,3,5,3,5,3]
-; AVX2-FCP-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm13, %ymm5
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm8, %ymm3
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,3,2,3,5,7,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm13, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm8, %ymm0
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[1,3,2,3,5,7,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm13, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm5, %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm10, %ymm1
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm5, %ymm0
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm8, %ymm0
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[1,3,2,3,5,7,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm10, %ymm0
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[1,3,2,3,5,7,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm13, %ymm1
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm5, %ymm1
+; AVX2-FCP-NEXT:    vmovaps %ymm5, %ymm13
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm7[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm10 = [2,0,6,4,2,0,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm10, %ymm3
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm8[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm10, %ymm4
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %xmm3
-; AVX2-FCP-NEXT:    vmovaps %xmm3, (%rsp) # 16-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm13[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm10, %ymm5
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm10, %ymm4
-; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %xmm12
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm5 = xmm12[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm14[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm10[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm3[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm7[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm11[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vmovaps %ymm9, %ymm11
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm2[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm8[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm7[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm7 = [2,0,6,4,2,0,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm7, %ymm0
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm8 = [2,0,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $12, (%rsp), %ymm1, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm8, %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = [0,0,6,4,0,0,6,4]
+; AVX2-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm10, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm12[3,3,3,3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm7, %ymm0
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm14 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm8, %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm10, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm0 = mem[3,3,3,3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm5[0,1],mem[2,3],ymm5[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm7, %ymm1
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm8, %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm10, %ymm12
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm12[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm7, %ymm7
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm8, %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm10, %ymm10
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm7 = mem[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2,3,4],ymm15[5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4],ymm7[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm7 = [0,1,7,5,0,1,7,5]
+; AVX2-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm7, %ymm2
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm10 = [3,1,7,5,0,u,u,u]
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm10, %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm2 = mem[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1,2,3,4],ymm2[5],ymm14[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm10, %ymm2
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm7, %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm2 = mem[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1,2,3,4],ymm2[5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm10, %ymm2
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm7, %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm2 = mem[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm10, %ymm0
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm7, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = ymm4[0,1,2,3],mem[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = ymm8[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 464(%rdi), %xmm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm9 = [0,2,0,6,0,2,0,6]
-; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm9, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm7 = ymm11[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 80(%rdi), %xmm6
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm12, %ymm1
-; AVX2-FCP-NEXT:    vmovaps %ymm12, %ymm15
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm2, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm9, %ymm1
+; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [0,2,0,6,0,2,0,6]
+; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm4, %ymm1
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm10 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = ymm15[0,1,2,3],mem[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 272(%rdi), %xmm3
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 80(%rdi), %xmm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm12, %ymm14
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm2, %ymm1
+; AVX2-FCP-NEXT:    vmovaps %ymm2, %ymm5
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm9, %ymm12
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm12[5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm4, %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm10 = ymm9[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 272(%rdi), %xmm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm1[2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6]
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm5, %ymm9
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm8[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm4, %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm15[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm12 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm12 = ymm13[0,1,2,3],mem[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovaps 656(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm0[2,3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm15, %ymm15
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm15 = ymm13[0,1],mem[2,3],ymm13[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm9, %ymm9
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4],ymm9[5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm14, %ymm7
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm7 = [0,3,1,7,0,3,1,7]
-; AVX2-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm7, %ymm4
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = mem[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm7, %ymm8
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm8 = ymm10[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm14, %ymm5
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm0[2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6]
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm5, %ymm5
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm4, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm13, %ymm5
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm7, %ymm2
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [0,3,1,7,0,3,1,7]
+; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm12[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm14, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm7, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 96(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, (%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 96(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, (%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 96(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, (%rcx)
-; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 96(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, (%r8)
-; AVX2-FCP-NEXT:    vmovaps %ymm9, 96(%r9)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%r9)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, (%r9)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%r9)
+; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm12 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm12[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm5, %ymm6
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm10[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm13, %ymm6
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm5, %ymm6
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm13, %ymm6
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm5, %ymm5
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, (%r8)
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 96(%r9)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%r9)
+; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, (%r9)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 64(%r9)
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 96(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 64(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rax)
-; AVX2-FCP-NEXT:    addq $1256, %rsp # imm = 0x4E8
+; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm3, 64(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rax)
+; AVX2-FCP-NEXT:    addq $1192, %rsp # imm = 0x4A8
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
@@ -9159,59 +8984,56 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i32_stride6_vf64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $2536, %rsp # imm = 0x9E8
+; AVX2-NEXT:    subq $2568, %rsp # imm = 0xA08
 ; AVX2-NEXT:    vmovaps 672(%rdi), %ymm4
 ; AVX2-NEXT:    vmovaps 640(%rdi), %ymm5
 ; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 608(%rdi), %ymm3
 ; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 320(%rdi), %ymm2
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 352(%rdi), %ymm6
+; AVX2-NEXT:    vmovaps 320(%rdi), %ymm6
 ; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 288(%rdi), %ymm7
+; AVX2-NEXT:    vmovaps 352(%rdi), %ymm7
 ; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 256(%rdi), %ymm9
-; AVX2-NEXT:    vmovups %ymm9, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 288(%rdi), %ymm2
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 256(%rdi), %ymm8
+; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 192(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{.*#+}} xmm8 = [0,6,4,u]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm0, %ymm8, %ymm0
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm7[0,1]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovaps {{.*#+}} xmm9 = [0,6,4,u]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpermps %ymm14, %ymm9, %ymm0
+; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm2[0,1]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[0,2,2,2,4,6,6,6]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2]
-; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vmovaps %ymm2, %ymm6
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2]
+; AVX2-NEXT:    vpermps %ymm1, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[0,1],ymm4[0,1]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 576(%rdi), %ymm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm0, %ymm8, %ymm0
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm9, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovaps 704(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 736(%rdi), %ymm2
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm1, %ymm6, %ymm1
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vpermps %ymm1, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 1056(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -9226,17 +9048,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm0, %ymm8, %ymm0
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm9, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovaps 1088(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 1120(%rdi), %ymm2
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm1, %ymm6, %ymm1
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vpermps %ymm1, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 1440(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -9251,686 +9073,629 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm0, %ymm8, %ymm0
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm9, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovaps 1472(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 1504(%rdi), %ymm2
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm1, %ymm6, %ymm1
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vpermps %ymm1, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 96(%rdi), %ymm1
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 64(%rdi), %ymm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT:    vpermps %ymm11, %ymm8, %ymm0
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    vpermps %ymm13, %ymm9, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovaps 128(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 160(%rdi), %ymm2
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm10, %ymm6, %ymm1
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm12, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 480(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 448(%rdi), %ymm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovaps 416(%rdi), %ymm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 384(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT:    vpermps %ymm5, %ymm8, %ymm0
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,2,2,2,4,6,6,6]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpermps %ymm10, %ymm9, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[0,2,2,2,4,6,6,6]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovaps 512(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 544(%rdi), %ymm2
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm4, %ymm6, %ymm1
-; AVX2-NEXT:    vmovaps %ymm6, %ymm12
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm8, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 864(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 832(%rdi), %ymm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovaps 800(%rdi), %ymm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 768(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm8, %ymm0
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2,2,2,4,6,6,6]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpermps %ymm4, %ymm9, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm7[0,2,2,2,4,6,6,6]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovaps 896(%rdi), %ymm1
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 928(%rdi), %ymm6
-; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm1, %ymm12, %ymm6
-; AVX2-NEXT:    vmovaps %ymm12, %ymm14
-; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-NEXT:    vmovaps 928(%rdi), %ymm2
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm3, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 1184(%rdi), %ymm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 1152(%rdi), %ymm6
-; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
-; AVX2-NEXT:    vpermps %ymm6, %ymm8, %ymm0
-; AVX2-NEXT:    vmovaps 1248(%rdi), %ymm9
-; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 1216(%rdi), %ymm8
-; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-NEXT:    vmovaps 1280(%rdi), %ymm9
-; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 1312(%rdi), %ymm12
-; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm15, %ymm14, %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-NEXT:    vmovaps 1152(%rdi), %ymm1
+; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpermps %ymm2, %ymm9, %ymm0
+; AVX2-NEXT:    vmovaps 1248(%rdi), %ymm1
+; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 1216(%rdi), %ymm5
+; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[0,1],ymm1[0,1]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm9[0,2,2,2,4,6,6,6]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-NEXT:    vmovaps 1280(%rdi), %ymm1
+; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 1312(%rdi), %ymm5
+; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm5, %ymm6, %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [1,7,5,u]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = mem[1,3,2,3,5,7,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm9[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3]
-; AVX2-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX2-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = mem[1,3,2,3,5,7,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX2-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = mem[1,3,2,3,5,7,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX2-NEXT:    vpermps %ymm14, %ymm0, %ymm1
+; AVX2-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[1,3,2,3,5,7,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [5,3,5,3,5,3,5,3]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermps %ymm13, %ymm0, %ymm13
 ; AVX2-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm14 = mem[1,3,2,3,5,7,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm12, %ymm1, %ymm12
+; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
 ; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm11, %ymm0, %ymm11
-; AVX2-NEXT:    vshufps {{.*#+}} ymm12 = ymm13[1,3,2,3,5,7,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm10, %ymm9, %ymm10
-; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm5, %ymm0, %ymm5
+; AVX2-NEXT:    vpermps %ymm10, %ymm0, %ymm10
+; AVX2-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm8, %ymm1, %ymm8
+; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermps %ymm4, %ymm0, %ymm4
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm4, %ymm9, %ymm4
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm1, %ymm9, %ymm1
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm6, %ymm0, %ymm0
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[1,3,2,3,5,7,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm15, %ymm9, %ymm1
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm9[1,3,2,3,5,7,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm5, %ymm1, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 192(%rdi), %xmm5
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[2,3,2,3]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 576(%rdi), %xmm0
-; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 960(%rdi), %xmm2
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 1344(%rdi), %xmm1
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm15[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,1],ymm13[2,3],mem[4,5],ymm13[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm14[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm6 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 384(%rdi), %xmm0
-; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm6 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2,3],ymm0[4],ymm6[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm6 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm6 = mem[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm8 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 768(%rdi), %xmm0
-; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm8 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm8 = mem[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm10 = mem[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 1152(%rdi), %xmm12
-; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm12[2,3,2,3]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm6[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4],ymm10[5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm4[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm11 = ymm7[0,0,2,3,4,4,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[3,3,3,3]
-; AVX2-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm8 = mem[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4],ymm8[5,6,7]
-; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm8 = mem[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vblendps $34, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3]
+; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm14 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[2,0,2,3,6,4,6,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm8 = ymm3[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm10 = mem[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm3 = mem[3,3,3,3]
-; AVX2-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = mem[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4],ymm5[5,6,7]
-; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = mem[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm15[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm2[0,2,0,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[2,0,2,3,6,4,6,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = mem[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm8 = mem[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5,6],ymm5[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX2-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = mem[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7]
-; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = mem[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = mem[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = mem[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm14[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,1,3,3,7,5,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,7,5]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm12[3,3,3,3]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vmovaps %ymm13, %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[3,1,3,3,7,5,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[0,1,3,1,4,5,7,5]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
-; AVX2-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm13[3,3,3,3,7,7,7,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[3,1,3,3,7,5,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1,3,1,4,5,7,5]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT:    vmovups (%rsp), %ymm14 # 32-byte Reload
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm14[3,3,3,3,7,7,7,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[3,1,3,3,7,5,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1,3,1,4,5,7,5]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm6[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm14[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps 80(%rdi), %xmm0
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-NEXT:    vpermps %ymm2, %ymm14, %ymm1
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6]
-; AVX2-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm3, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 272(%rdi), %xmm1
-; AVX2-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-NEXT:    vpermps %ymm3, %ymm14, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 272(%rdi), %xmm0
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-NEXT:    vpermps %ymm14, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm11[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 464(%rdi), %xmm1
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-NEXT:    vpermps %ymm3, %ymm14, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 464(%rdi), %xmm0
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-NEXT:    vpermps %ymm12, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 656(%rdi), %xmm1
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-NEXT:    vpermps %ymm3, %ymm14, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 656(%rdi), %xmm0
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-NEXT:    vpermps %ymm12, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 848(%rdi), %xmm1
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-NEXT:    vpermps %ymm3, %ymm14, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 848(%rdi), %xmm0
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-NEXT:    vpermps %ymm5, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm12 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vmovaps 1040(%rdi), %xmm11
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm11[2,3],ymm13[4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-NEXT:    vpermps %ymm12, %ymm14, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm15 = ymm9[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vmovaps 1040(%rdi), %xmm13
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-NEXT:    vpermps %ymm14, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm10 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm10, %ymm0, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vmovaps 1232(%rdi), %xmm6
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-NEXT:    vpermps %ymm7, %ymm14, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm12, %ymm3, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm10 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vmovaps 1232(%rdi), %xmm9
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-NEXT:    vpermps %ymm10, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm5, %ymm0, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vmovaps 1424(%rdi), %xmm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-NEXT:    vpermps %ymm3, %ymm14, %ymm15
-; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm8 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm8, %ymm3, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vmovaps 1424(%rdi), %xmm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-NEXT:    vpermps %ymm5, %ymm6, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm2, %ymm3, %ymm3
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm0 = mem[1,1,1,1,5,5,5,5]
 ; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7]
 ; AVX2-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = mem[1,1,1,1,5,5,5,5]
-; AVX2-NEXT:    vblendps $8, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
-; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = mem[1,1,1,1,5,5,5,5]
-; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
-; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = mem[1,1,1,1,5,5,5,5]
-; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
-; AVX2-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = mem[1,1,1,1,5,5,5,5]
-; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[1,1,1,1,5,5,5,5]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3],ymm13[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm12, %ymm9, %ymm12
-; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm10, %ymm0, %ymm10
-; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[1,1,1,1,5,5,5,5]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm7, %ymm9, %ymm7
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm5, %ymm0, %ymm5
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,1,1,1,5,5,5,5]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm3, %ymm9, %ymm3
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermilps $85, (%rsp), %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm6 = mem[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm14, %ymm1, %ymm13
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm12, %ymm0, %ymm12
+; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4],ymm12[5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm11[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm10, %ymm1, %ymm9
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm8, %ymm0, %ymm8
+; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[1,1,1,1,5,5,5,5]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm1, 192(%rsi)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -10013,9 +9778,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovaps %ymm1, (%r9)
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    vmovaps %ymm0, 224(%rax)
-; AVX2-NEXT:    vmovaps %ymm5, 192(%rax)
-; AVX2-NEXT:    vmovaps %ymm10, 160(%rax)
-; AVX2-NEXT:    vmovaps %ymm14, 128(%rax)
+; AVX2-NEXT:    vmovaps %ymm8, 192(%rax)
+; AVX2-NEXT:    vmovaps %ymm12, 160(%rax)
+; AVX2-NEXT:    vmovaps %ymm6, 128(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 96(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -10024,65 +9789,62 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, (%rax)
-; AVX2-NEXT:    addq $2536, %rsp # imm = 0x9E8
+; AVX2-NEXT:    addq $2568, %rsp # imm = 0xA08
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i32_stride6_vf64:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $2536, %rsp # imm = 0x9E8
+; AVX2-FP-NEXT:    subq $2568, %rsp # imm = 0xA08
 ; AVX2-FP-NEXT:    vmovaps 672(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovaps 640(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 608(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovaps 320(%rdi), %ymm6
 ; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm7
+; AVX2-FP-NEXT:    vmovaps 352(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm9
-; AVX2-FP-NEXT:    vmovups %ymm9, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 288(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 256(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 192(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm8 = [0,6,4,u]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm0, %ymm8, %ymm0
-; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm7[0,1]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm9 = [0,6,4,u]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm14, %ymm9, %ymm0
+; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm2[0,1]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[0,2,2,2,4,6,6,6]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2]
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FP-NEXT:    vmovaps %ymm2, %ymm6
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2]
+; AVX2-FP-NEXT:    vpermps %ymm1, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[0,1],ymm4[0,1]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 576(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm0, %ymm8, %ymm0
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm9, %ymm0
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovaps 704(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 736(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm6, %ymm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm1, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 1056(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10097,17 +9859,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm0, %ymm8, %ymm0
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm9, %ymm0
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovaps 1088(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 1120(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm6, %ymm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm1, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 1440(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10122,686 +9884,629 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm0, %ymm8, %ymm0
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm9, %ymm0
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovaps 1472(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 1504(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm6, %ymm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm1, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 96(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 64(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm11, %ymm8, %ymm0
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm13, %ymm9, %ymm0
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovaps 128(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 160(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm10, %ymm6, %ymm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm12, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 480(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 448(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovaps 416(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 384(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm8, %ymm0
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,2,2,2,4,6,6,6]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm10, %ymm9, %ymm0
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[0,2,2,2,4,6,6,6]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovaps 512(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 544(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm4, %ymm6, %ymm1
-; AVX2-FP-NEXT:    vmovaps %ymm6, %ymm12
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm8, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 864(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 832(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovaps 800(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 768(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm8, %ymm0
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2,2,2,4,6,6,6]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm4, %ymm9, %ymm0
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm7[0,2,2,2,4,6,6,6]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovaps 896(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 928(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm12, %ymm6
-; AVX2-FP-NEXT:    vmovaps %ymm12, %ymm14
-; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FP-NEXT:    vmovaps 928(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 1184(%rdi), %ymm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 1152(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
-; AVX2-FP-NEXT:    vpermps %ymm6, %ymm8, %ymm0
-; AVX2-FP-NEXT:    vmovaps 1248(%rdi), %ymm9
-; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 1216(%rdi), %ymm8
-; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps 1280(%rdi), %ymm9
-; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 1312(%rdi), %ymm12
-; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm15, %ymm14, %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-FP-NEXT:    vmovaps 1152(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm9, %ymm0
+; AVX2-FP-NEXT:    vmovaps 1248(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 1216(%rdi), %ymm5
+; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[0,1],ymm1[0,1]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm9[0,2,2,2,4,6,6,6]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 1280(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 1312(%rdi), %ymm5
+; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm5, %ymm6, %ymm1
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps {{.*#+}} xmm0 = [1,7,5,u]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = mem[1,3,2,3,5,7,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm9[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3]
-; AVX2-FP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = mem[1,3,2,3,5,7,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = mem[1,3,2,3,5,7,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpermps %ymm14, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[1,3,2,3,5,7,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [5,3,5,3,5,3,5,3]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermps %ymm13, %ymm0, %ymm13
 ; AVX2-FP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm14 = mem[1,3,2,3,5,7,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm12, %ymm1, %ymm12
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm11, %ymm0, %ymm11
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm12 = ymm13[1,3,2,3,5,7,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm10, %ymm9, %ymm10
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm0, %ymm5
+; AVX2-FP-NEXT:    vpermps %ymm10, %ymm0, %ymm10
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm8, %ymm1, %ymm8
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermps %ymm4, %ymm0, %ymm4
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm4, %ymm9, %ymm4
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm9, %ymm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm6, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[1,3,2,3,5,7,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm15, %ymm9, %ymm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm1, %ymm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm9[1,3,2,3,5,7,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm5, %ymm1, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 192(%rdi), %xmm5
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm5[2,3,2,3]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 576(%rdi), %xmm0
-; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 960(%rdi), %xmm2
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 1344(%rdi), %xmm1
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm15[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = mem[0,1],ymm13[2,3],mem[4,5],ymm13[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm14[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm6 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 384(%rdi), %xmm0
-; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm6 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2,3],ymm0[4],ymm6[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm6 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm6 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm8 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 768(%rdi), %xmm0
-; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm8 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm8 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm10 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 1152(%rdi), %xmm12
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm8 = xmm12[2,3,2,3]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm6[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4],ymm10[5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm0[0,0,2,0,4,4,6,4]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm4[0,0,0,0,4,4,4,4]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm11 = ymm7[0,0,2,3,4,4,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[3,3,3,3]
-; AVX2-FP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm8 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4],ymm8[5,6,7]
-; AVX2-FP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm8 = mem[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vblendps $34, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3]
+; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm14 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[2,0,2,3,6,4,6,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm8 = ymm3[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm10 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm3 = mem[3,3,3,3]
-; AVX2-FP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4],ymm5[5,6,7]
-; AVX2-FP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = mem[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm15[0,0,2,0,4,4,6,4]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm2[0,2,0,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm2[2,0,2,3,6,4,6,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm8 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5,6],ymm5[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX2-FP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7]
-; AVX2-FP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = mem[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[0,0,2,0,4,4,6,4]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm14[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[3,1,3,3,7,5,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[0,1,3,1,4,5,7,5]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,1,3,3,7,5,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,7,5]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm12[3,3,3,3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
+; AVX2-FP-NEXT:    vmovaps %ymm13, %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[3,1,3,3,7,5,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[0,1,3,1,4,5,7,5]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm0 = mem[3,3,3,3]
-; AVX2-FP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm13[3,3,3,3,7,7,7,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm0 = mem[3,3,3,3]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[3,1,3,3,7,5,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1,3,1,4,5,7,5]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm0 = mem[3,3,3,3]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm14 # 32-byte Reload
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm14[3,3,3,3,7,7,7,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,1,0,1,4,5,4,5]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3]
+; AVX2-FP-NEXT:    vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[3,1,3,3,7,5,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1,3,1,4,5,7,5]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm6[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm14[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovaps 80(%rdi), %xmm0
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm14, %ymm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6]
-; AVX2-FP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 272(%rdi), %xmm1
-; AVX2-FP-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-FP-NEXT:    vpermps %ymm3, %ymm14, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6]
+; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 272(%rdi), %xmm0
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-FP-NEXT:    vpermps %ymm14, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm11[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 464(%rdi), %xmm1
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-FP-NEXT:    vpermps %ymm3, %ymm14, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 464(%rdi), %xmm0
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-FP-NEXT:    vpermps %ymm12, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 656(%rdi), %xmm1
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-FP-NEXT:    vpermps %ymm3, %ymm14, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 656(%rdi), %xmm0
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-FP-NEXT:    vpermps %ymm12, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 848(%rdi), %xmm1
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-FP-NEXT:    vpermps %ymm3, %ymm14, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 848(%rdi), %xmm0
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-FP-NEXT:    vpermps %ymm5, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm12 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps 1040(%rdi), %xmm11
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm11[2,3],ymm13[4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-FP-NEXT:    vpermps %ymm12, %ymm14, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm15 = ymm9[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 1040(%rdi), %xmm13
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-FP-NEXT:    vpermps %ymm14, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm10 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm10, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps 1232(%rdi), %xmm6
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-FP-NEXT:    vpermps %ymm7, %ymm14, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm12, %ymm3, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm10 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 1232(%rdi), %xmm9
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-FP-NEXT:    vpermps %ymm10, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps 1424(%rdi), %xmm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
-; AVX2-FP-NEXT:    vpermps %ymm3, %ymm14, %ymm15
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm8 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm8, %ymm3, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 1424(%rdi), %xmm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-FP-NEXT:    vpermps %ymm5, %ymm6, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm3, %ymm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm0 = mem[1,1,1,1,5,5,5,5]
 ; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7]
 ; AVX2-FP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = mem[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT:    vblendps $8, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = mem[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = mem[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = mem[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3],ymm13[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm12, %ymm9, %ymm12
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm10, %ymm0, %ymm10
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm7, %ymm9, %ymm7
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm0, %ymm5
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm3, %ymm9, %ymm3
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermilps $85, (%rsp), %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm6 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm14, %ymm1, %ymm13
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm12, %ymm0, %ymm12
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4],ymm12[5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm11[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm10, %ymm1, %ymm9
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm8, %ymm0, %ymm8
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm5, %ymm1, %ymm1
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm1, 192(%rsi)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -10884,9 +10589,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovaps %ymm1, (%r9)
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 224(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm5, 192(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm10, 160(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm14, 128(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm8, 192(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm12, 160(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm6, 128(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -10895,773 +10600,677 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
-; AVX2-FP-NEXT:    addq $2536, %rsp # imm = 0x9E8
+; AVX2-FP-NEXT:    addq $2568, %rsp # imm = 0xA08
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i32_stride6_vf64:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    subq $2504, %rsp # imm = 0x9C8
-; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovaps 640(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    subq $2536, %rsp # imm = 0x9E8
+; AVX2-FCP-NEXT:    vmovaps 672(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vmovaps 640(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 608(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 352(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm8
 ; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 256(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovups %ymm9, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovaps 224(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm0 = [0,6,4,u]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm9[0,1],ymm8[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm15[0,2,2,2,4,6,6,6]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm13 = [4,2,4,2,4,2,4,2]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm13, %ymm2
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm12 = [0,6,4,u]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm12, %ymm0
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm2[0,1]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[0,2,2,2,4,6,6,6]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 704(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm13, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [4,2,4,2,4,2,4,2]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
+; AVX2-FCP-NEXT:    vmovaps %ymm1, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm6[0,1],ymm5[0,1]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm12, %ymm0
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 704(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 1056(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 1024(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm2[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 992(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 1056(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 960(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 1024(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovaps 992(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 960(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[0,2,2,2,4,6,6,6]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 1088(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 1120(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm13, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm12, %ymm0
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 1088(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 1440(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 1120(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 1408(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm2[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 1376(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 1440(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 1344(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 1408(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovaps 1376(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 1344(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[0,2,2,2,4,6,6,6]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 1472(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 1504(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm13, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm12, %ymm0
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 1472(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 1504(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm2[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm1
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[0,2,2,2,4,6,6,6]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm13, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 32(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm12, %ymm0
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 128(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 160(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 480(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm2[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovaps 448(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vmovaps 416(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm12, %ymm0
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm10[0,2,2,2,4,6,6,6]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm13, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 864(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 832(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 864(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 832(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vmovaps 800(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 768(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm2[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vmovaps 800(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm12, %ymm0
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm7[0,2,2,2,4,6,6,6]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 896(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 768(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[0,2,2,2,4,6,6,6]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 896(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 928(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 928(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm13, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 1184(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 1152(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 1184(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm12, %ymm0
+; AVX2-FCP-NEXT:    vmovaps 1248(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 1152(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovaps 1216(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vmovaps 1248(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 1216(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm2[0,1]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm1[0,1]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[0,2,2,2,4,6,6,6]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovaps 1280(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 1312(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm13, %ymm7
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vmovaps 1312(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm4, %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm0 = [1,7,5,u]
-; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm0, %ymm7
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm14 = ymm15[1,3,2,3,5,7,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm7[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm7 = [5,3,5,3,5,3,5,3]
-; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [5,3,5,3,5,3,5,3]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm15 = mem[1,3,2,3,5,7,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm0, %ymm12
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm0, %ymm13
 ; AVX2-FCP-NEXT:    vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm14 = mem[1,3,2,3,5,7,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm7, %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm1, %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm0, %ymm9
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm7, %ymm8
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm0, %ymm5
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm7, %ymm4
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm1, %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm0, %ymm6
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm1, %ymm5
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm5[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm7, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm12[1,3,2,3,5,7,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm1, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 192(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
-; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm0 = [2,0,6,4,2,0,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm13 = [2,0,6,4,2,0,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} xmm12 = [2,0,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 576(%rdi), %xmm3
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm0, %ymm4
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 960(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm5[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm0, %ymm4
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm15 = [0,0,6,4,0,0,6,4]
+; AVX2-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm15, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 1344(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm15, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm15, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm6 = ymm4[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1,2,3],ymm2[4],ymm6[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm0, %ymm6
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm8 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm14 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm12, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm15, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $243, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm2
-; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1,2,3],ymm2[4],ymm6[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm0, %ymm6
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 384(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1,2,3],ymm2[4],ymm9[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm0, %ymm9
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm10 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm9[5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm15, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 768(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm9 = xmm2[2,3,2,3]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm10 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm10 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm0, %ymm10
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm10 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm10[5,6,7]
+; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm0, %ymm9
-; AVX2-FCP-NEXT:    vmovaps 1152(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm10 = xmm0[2,3,2,3]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm15, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm11 = ymm2[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3],ymm10[4],ymm11[5,6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm10 = ymm6[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm11 = ymm7[0,0,2,3,4,4,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-FCP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2,3],ymm1[4],ymm9[5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = mem[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vblendps $34, (%rsp), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,2,0,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm10 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
-; AVX2-FCP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = mem[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3],ymm9[4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm1 = mem[3,3,3,3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm5[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = mem[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm1 = mem[3,3,3,3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm14[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4],ymm15[5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm9 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm9, %ymm12, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm15, %ymm5
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm0 = mem[3,3,3,3]
-; AVX2-FCP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm13, %ymm0
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm13 = ymm5[0,1],mem[2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm12, %ymm5
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm15, %ymm12
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm5 = mem[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm12 = ymm5[0],mem[1,2,3,4],ymm5[5],mem[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm5 = [3,1,7,5,0,u,u,u]
+; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm5, %ymm15
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm12 = [0,1,7,5,0,1,7,5]
+; AVX2-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4],ymm7[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm7 = mem[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = ymm7[0],mem[1,2,3,4],ymm7[5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm5, %ymm7
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm15 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm15[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm7 = mem[3,3,3,3]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2,3,4],ymm15[5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = ymm7[0],mem[1,2,3,4],ymm7[5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm5, %ymm7
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm12, %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm7 = mem[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm14[1,2,3,4],ymm7[5],ymm14[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm5, %ymm7
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm12, %ymm6
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm6 = mem[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm13[1,2,3,4],ymm6[5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm5, %ymm6
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm12, %ymm0
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm0 = mem[3,3,3,3]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm12[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1,2,3,4],ymm0[5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm0 = mem[3,3,3,3]
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[1,2,3,4],ymm0[5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vpermilps {{.*#+}} xmm2 = mem[3,3,3,3]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[3,3,3,3,7,7,7,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm2[0],mem[1,2,3,4],ymm2[5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm4[0,1,2,3],mem[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $15, (%rsp), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 80(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm13, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
-; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6]
+; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 272(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm13, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm12[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 464(%rdi), %xmm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm13, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 656(%rdi), %xmm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm13, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm15[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovaps 848(%rdi), %xmm0
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm13, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 1040(%rdi), %xmm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm11[2,3],ymm14[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 1040(%rdi), %xmm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm13, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm10 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm2, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm5, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 1232(%rdi), %xmm6
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm6[2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm10 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 1232(%rdi), %xmm9
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm13, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm2, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm5, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 1424(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 1424(%rdi), %xmm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm3[2,3],ymm7[4,5,6,7]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm13, %ymm13
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm13 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm2, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm4, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm0 = mem[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps $8, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7]
 ; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps $8, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = mem[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm15 = mem[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm15 = ymm15[0,1,2],mem[3],ymm15[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm15[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4],ymm15[5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm7, %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm0, %ymm10
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm7, %ymm6
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm0, %ymm5
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm4[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm7, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm15[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm1, %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm0, %ymm12
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3,4],ymm12[5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm11[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm10, %ymm1, %ymm9
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm0, %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm1, 192(%rsi)
@@ -11745,9 +11354,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovaps %ymm1, (%r9)
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 224(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 192(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm10, 160(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm15, 128(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 192(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm12, 160(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 128(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 96(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -11756,7 +11365,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
-; AVX2-FCP-NEXT:    addq $2504, %rsp # imm = 0x9C8
+; AVX2-FCP-NEXT:    addq $2536, %rsp # imm = 0x9E8
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index b49c35e08129..ed316990e486 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -556,24 +556,22 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm4
 ; AVX-NEXT:    vmovaps 64(%rdi), %xmm5
 ; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3]
 ; AVX-NEXT:    vmovaps 80(%rdi), %xmm6
-; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[1,0],mem[3,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[2]
-; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm5[0,1,0,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,3,2,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[1,0],mem[3,3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[2]
+; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm5[0,1,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm3[2,3,2,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
 ; AVX-NEXT:    vmovaps 96(%rdi), %xmm9
 ; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm9[0],xmm5[1],xmm9[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[0,1,1,0]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,2,1,0]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm9[0,1,0,1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
@@ -584,14 +582,15 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
 ; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
 ; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm10[0,1],xmm5[3,2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm9[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
 ; AVX-NEXT:    vmovaps %xmm2, (%rsi)
-; AVX-NEXT:    vmovaps %xmm8, (%rdx)
-; AVX-NEXT:    vmovaps %xmm6, (%rcx)
+; AVX-NEXT:    vmovaps %xmm7, (%rdx)
+; AVX-NEXT:    vmovaps %xmm8, (%rcx)
 ; AVX-NEXT:    vmovaps %xmm3, (%r8)
 ; AVX-NEXT:    vmovaps %xmm4, (%r9)
 ; AVX-NEXT:    vmovaps %xmm5, (%r10)
@@ -623,33 +622,32 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT:    vbroadcastss %xmm8, %xmm9
 ; AVX2-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
 ; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX2-NEXT:    vmovaps 96(%rdi), %xmm6
-; AVX2-NEXT:    vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3]
-; AVX2-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0]
-; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3]
-; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm7[0,1,2],mem[3]
+; AVX2-NEXT:    vmovaps 96(%rdi), %xmm7
+; AVX2-NEXT:    vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
+; AVX2-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
 ; AVX2-NEXT:    vbroadcastss 100(%rdi), %xmm9
 ; AVX2-NEXT:    vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
 ; AVX2-NEXT:    vmovsd {{.*#+}} xmm10 = [4,3,0,0]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vpermps %ymm11, %ymm10, %ymm10
 ; AVX2-NEXT:    vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3]
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7]
 ; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
 ; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3]
 ; AVX2-NEXT:    vbroadcastss 80(%rdi), %ymm8
-; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
 ; AVX2-NEXT:    vmovaps %xmm2, (%rsi)
 ; AVX2-NEXT:    vmovaps %xmm3, (%rdx)
 ; AVX2-NEXT:    vmovaps %xmm4, (%rcx)
-; AVX2-NEXT:    vmovaps %xmm7, (%r8)
+; AVX2-NEXT:    vmovaps %xmm6, (%r8)
 ; AVX2-NEXT:    vmovaps %xmm9, (%r9)
 ; AVX2-NEXT:    vmovaps %xmm5, (%r10)
 ; AVX2-NEXT:    vmovaps %xmm0, (%rax)
@@ -680,33 +678,32 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vbroadcastss %xmm8, %xmm9
 ; AVX2-FP-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX2-FP-NEXT:    vmovaps 96(%rdi), %xmm6
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm7[0,1,2],mem[3]
+; AVX2-FP-NEXT:    vmovaps 96(%rdi), %xmm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
 ; AVX2-FP-NEXT:    vbroadcastss 100(%rdi), %xmm9
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
 ; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm10 = [4,3,0,0]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FP-NEXT:    vpermps %ymm11, %ymm10, %ymm10
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7]
 ; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3]
 ; AVX2-FP-NEXT:    vbroadcastss 80(%rdi), %ymm8
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
 ; AVX2-FP-NEXT:    vmovaps %xmm2, (%rsi)
 ; AVX2-FP-NEXT:    vmovaps %xmm3, (%rdx)
 ; AVX2-FP-NEXT:    vmovaps %xmm4, (%rcx)
-; AVX2-FP-NEXT:    vmovaps %xmm7, (%r8)
+; AVX2-FP-NEXT:    vmovaps %xmm6, (%r8)
 ; AVX2-FP-NEXT:    vmovaps %xmm9, (%r9)
 ; AVX2-FP-NEXT:    vmovaps %xmm5, (%r10)
 ; AVX2-FP-NEXT:    vmovaps %xmm0, (%rax)
@@ -738,33 +735,32 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vbroadcastss %xmm8, %xmm9
 ; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm5[2],xmm9[3],xmm5[3]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %xmm6
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm7[0,1,2],mem[3]
+; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %xmm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
 ; AVX2-FCP-NEXT:    vbroadcastss 100(%rdi), %xmm9
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
 ; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm10 = [4,3,0,0]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm10, %ymm10
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,0,2,3,5,4,6,7]
 ; AVX2-FCP-NEXT:    vextractf128 $1, %ymm4, %xmm4
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3]
 ; AVX2-FCP-NEXT:    vbroadcastss 80(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
 ; AVX2-FCP-NEXT:    vmovaps %xmm2, (%rsi)
 ; AVX2-FCP-NEXT:    vmovaps %xmm3, (%rdx)
 ; AVX2-FCP-NEXT:    vmovaps %xmm5, (%rcx)
-; AVX2-FCP-NEXT:    vmovaps %xmm7, (%r8)
+; AVX2-FCP-NEXT:    vmovaps %xmm6, (%r8)
 ; AVX2-FCP-NEXT:    vmovaps %xmm9, (%r9)
 ; AVX2-FCP-NEXT:    vmovaps %xmm4, (%r10)
 ; AVX2-FCP-NEXT:    vmovaps %xmm0, (%rax)
@@ -1303,12 +1299,10 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm11 = ymm8[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6],ymm8[7]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,0,7,6,5,0,0,0]
+; AVX2-NEXT:    vpermd %ymm7, %ymm11, %ymm7
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
 ; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm7
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm11 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
@@ -1406,12 +1400,10 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm8[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6],ymm8[7]
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,0,7,6,5,0,0,0]
+; AVX2-FP-NEXT:    vpermd %ymm7, %ymm11, %ymm7
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm7
 ; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
@@ -1509,12 +1501,10 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [5,6,5,6,5,6,5,6]
-; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm8, %ymm7
-; AVX2-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm11 = [1,7,0,0]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6],ymm8[7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,0,7,6,5,0,0,0]
+; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm11, %ymm7
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm7
 ; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
@@ -2605,225 +2595,219 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-LABEL: load_i32_stride7_vf16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    subq $264, %rsp # imm = 0x108
-; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm15
-; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm8
-; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm6
-; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm5
+; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm5
+; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm9
+; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm4
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm0
-; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm10
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm9
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm4
-; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm12
+; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm3
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm10
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm6
+; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm15
 ; AVX2-NEXT:    vpbroadcastq 80(%rdi), %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7]
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,7,6,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm4[6],ymm9[7]
-; AVX2-NEXT:    vmovdqa %ymm4, %ymm7
-; AVX2-NEXT:    vpermd %ymm3, %ymm2, %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm3
-; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm4
-; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT:    vpbroadcastd 196(%rdi), %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm6[6],ymm10[7]
+; AVX2-NEXT:    vpermd %ymm8, %ymm2, %ymm8
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm8
+; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm11
+; AVX2-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm11[1]
+; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-NEXT:    vpbroadcastd 196(%rdi), %ymm11
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm11[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm0[6],ymm10[7]
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7]
 ; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpbroadcastq 304(%rdi), %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 352(%rdi), %xmm2
-; AVX2-NEXT:    vmovdqa 384(%rdi), %xmm3
-; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX2-NEXT:    vmovdqa 384(%rdi), %xmm8
+; AVX2-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpbroadcastd 420(%rdi), %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-NEXT:    vpbroadcastd 420(%rdi), %ymm8
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6]
+; AVX2-NEXT:    vpermd %ymm2, %ymm12, %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm1
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm4 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm10
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm4[2,2,2,2]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm14 = ymm4[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm11 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm11
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6],ymm8[7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm12, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm0
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 8(%rdi), %xmm3
-; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm14
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm14[1],xmm3[2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; AVX2-NEXT:    vpbroadcastd 204(%rdi), %ymm13
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpbroadcastd 8(%rdi), %xmm1
+; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm12
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-NEXT:    vpbroadcastd 204(%rdi), %ymm14
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 304(%rdi), %xmm0
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 232(%rdi), %xmm3
-; AVX2-NEXT:    vmovdqa 256(%rdi), %xmm13
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm13[1],xmm3[2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
-; AVX2-NEXT:    vpbroadcastd 428(%rdi), %ymm11
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpbroadcastd 232(%rdi), %xmm1
+; AVX2-NEXT:    vmovdqa 256(%rdi), %xmm14
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm9[0],ymm7[2],ymm9[2]
+; AVX2-NEXT:    vpbroadcastd 428(%rdi), %ymm13
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0],ymm10[1],ymm12[2,3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm14[0,1,2],mem[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7]
-; AVX2-NEXT:    vbroadcastss 208(%rdi), %ymm10
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm10[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm3[1,3],ymm2[4,6],ymm3[5,7]
+; AVX2-NEXT:    vbroadcastss 208(%rdi), %ymm11
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0],ymm15[1],ymm5[2,3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm13[0,1,2],mem[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],mem[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7]
-; AVX2-NEXT:    vbroadcastss 432(%rdi), %ymm5
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7]
+; AVX2-NEXT:    vbroadcastss 432(%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpbroadcastd 100(%rdi), %xmm0
-; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm3
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [4,3,0,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vmovdqa %ymm7, %ymm12
-; AVX2-NEXT:    vpermd %ymm10, %ymm5, %ymm10
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3]
+; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,3,0,0]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vmovdqa %ymm6, %ymm15
+; AVX2-NEXT:    vpermd %ymm5, %ymm4, %ymm5
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7]
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm11
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT:    vpbroadcastd 212(%rdi), %ymm13
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm11
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-NEXT:    vpbroadcastd 212(%rdi), %ymm12
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm10, %ymm5, %ymm5
-; AVX2-NEXT:    vpbroadcastd 324(%rdi), %xmm10
-; AVX2-NEXT:    vmovdqa 288(%rdi), %xmm11
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3]
-; AVX2-NEXT:    vpermd %ymm6, %ymm0, %ymm10
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-NEXT:    vpbroadcastd 436(%rdi), %ymm13
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-NEXT:    vpbroadcastd 216(%rdi), %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm5
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm3[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[1,0,2,3,5,4,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vmovdqa 320(%rdi), %xmm3
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm11[3]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT:    vpermd %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpbroadcastd 324(%rdi), %xmm5
+; AVX2-NEXT:    vmovdqa 288(%rdi), %xmm13
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
+; AVX2-NEXT:    vpermd %ymm7, %ymm0, %ymm5
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-NEXT:    vpbroadcastd 436(%rdi), %ymm11
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm11[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-NEXT:    vpbroadcastd 216(%rdi), %ymm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm3
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[1,0,2,3,5,4,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovdqa 320(%rdi), %xmm8
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm8[0,1,2],xmm13[3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
 ; AVX2-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm4
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-NEXT:    vpbroadcastd 440(%rdi), %ymm6
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7]
+; AVX2-NEXT:    vpbroadcastd 440(%rdi), %ymm5
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastd 136(%rdi), %xmm4
 ; AVX2-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT:    vpermd 192(%rdi), %ymm0, %ymm6
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-NEXT:    vpbroadcastd 80(%rdi), %ymm6
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm9[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 360(%rdi), %xmm5
-; AVX2-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm5 = xmm5[0],mem[1],xmm5[2,3]
-; AVX2-NEXT:    vpermd 416(%rdi), %ymm0, %ymm0
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-NEXT:    vpbroadcastd 304(%rdi), %ymm5
+; AVX2-NEXT:    vpermd 192(%rdi), %ymm0, %ymm5
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT:    vpbroadcastd 80(%rdi), %ymm5
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm10[2,3,2,3,6,7,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, 32(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, (%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, 32(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, (%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, 32(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, (%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, 32(%r8)
-; AVX2-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm3, (%r8)
-; AVX2-NEXT:    vmovdqa %ymm10, 32(%r9)
-; AVX2-NEXT:    vmovdqa %ymm15, (%r9)
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vpbroadcastd 360(%rdi), %xmm4
+; AVX2-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
+; AVX2-NEXT:    vpermd 416(%rdi), %ymm0, %ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-NEXT:    vpbroadcastd 304(%rdi), %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 32(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, (%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 32(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, (%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 32(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, (%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, 32(%r8)
+; AVX2-NEXT:    vmovups (%rsp), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm4, (%r8)
+; AVX2-NEXT:    vmovdqa %ymm11, 32(%r9)
+; AVX2-NEXT:    vmovdqa %ymm12, (%r9)
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    vmovdqa %ymm2, 32(%rax)
 ; AVX2-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    vmovdqa %ymm0, 32(%rax)
-; AVX2-NEXT:    vmovdqa %ymm4, (%rax)
+; AVX2-NEXT:    vmovdqa %ymm3, (%rax)
 ; AVX2-NEXT:    addq $264, %rsp # imm = 0x108
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -2831,452 +2815,439 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-LABEL: load_i32_stride7_vf16:
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    subq $264, %rsp # imm = 0x108
-; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm15
-; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm8
-; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm5
+; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm5
+; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm9
+; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm7
+; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm0
-; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm10
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm9
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm4
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm12
+; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm10
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm6
+; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm15
 ; AVX2-FP-NEXT:    vpbroadcastq 80(%rdi), %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7]
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,7,6,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm4[6],ymm9[7]
-; AVX2-FP-NEXT:    vmovdqa %ymm4, %ymm7
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm2, %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm3
-; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %xmm4
-; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vpbroadcastd 196(%rdi), %ymm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm6[6],ymm10[7]
+; AVX2-FP-NEXT:    vpermd %ymm8, %ymm2, %ymm8
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm8
+; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %xmm11
+; AVX2-FP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm11[1]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FP-NEXT:    vpbroadcastd 196(%rdi), %ymm11
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm11[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm0[6],ymm10[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7]
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-FP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %xmm2
-; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %xmm3
-; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %xmm8
+; AVX2-FP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpbroadcastd 420(%rdi), %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-FP-NEXT:    vpbroadcastd 420(%rdi), %ymm8
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7]
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6]
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm12, %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm1
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm10
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm4[2,2,2,2]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm4[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm11
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6],ymm8[7]
+; AVX2-FP-NEXT:    vpermd %ymm0, %ymm12, %ymm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm0
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 8(%rdi), %xmm3
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm14
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm14[1],xmm3[2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; AVX2-FP-NEXT:    vpbroadcastd 204(%rdi), %ymm13
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vpbroadcastd 8(%rdi), %xmm1
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm12
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-FP-NEXT:    vpbroadcastd 204(%rdi), %ymm14
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 304(%rdi), %xmm0
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 232(%rdi), %xmm3
-; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %xmm13
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm13[1],xmm3[2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
-; AVX2-FP-NEXT:    vpbroadcastd 428(%rdi), %ymm11
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vpbroadcastd 232(%rdi), %xmm1
+; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %xmm14
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm9[0],ymm7[2],ymm9[2]
+; AVX2-FP-NEXT:    vpbroadcastd 428(%rdi), %ymm13
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0],ymm10[1],ymm12[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm14[0,1,2],mem[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7]
-; AVX2-FP-NEXT:    vbroadcastss 208(%rdi), %ymm10
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm10[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm3[1,3],ymm2[4,6],ymm3[5,7]
+; AVX2-FP-NEXT:    vbroadcastss 208(%rdi), %ymm11
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0],ymm15[1],ymm5[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm13[0,1,2],mem[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],mem[3]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7]
-; AVX2-FP-NEXT:    vbroadcastss 432(%rdi), %ymm5
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7]
+; AVX2-FP-NEXT:    vbroadcastss 432(%rdi), %ymm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpbroadcastd 100(%rdi), %xmm0
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [4,3,0,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm12
-; AVX2-FP-NEXT:    vpermd %ymm10, %ymm5, %ymm10
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3]
+; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,3,0,0]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm6, %ymm15
+; AVX2-FP-NEXT:    vpermd %ymm5, %ymm4, %ymm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3]
 ; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7]
-; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm11
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 212(%rdi), %ymm13
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm11
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-FP-NEXT:    vpbroadcastd 212(%rdi), %ymm12
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm10, %ymm5, %ymm5
-; AVX2-FP-NEXT:    vpbroadcastd 324(%rdi), %xmm10
-; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %xmm11
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3]
-; AVX2-FP-NEXT:    vpermd %ymm6, %ymm0, %ymm10
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 436(%rdi), %ymm13
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FP-NEXT:    vpbroadcastd 216(%rdi), %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %xmm5
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm3[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[1,0,2,3,5,4,6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %xmm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm11[3]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FP-NEXT:    vpermd %ymm5, %ymm4, %ymm4
+; AVX2-FP-NEXT:    vpbroadcastd 324(%rdi), %xmm5
+; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %xmm13
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
+; AVX2-FP-NEXT:    vpermd %ymm7, %ymm0, %ymm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-FP-NEXT:    vpbroadcastd 436(%rdi), %ymm11
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm11[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-FP-NEXT:    vpbroadcastd 216(%rdi), %ymm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %xmm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[1,0,2,3,5,4,6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %xmm8
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm8[0,1,2],xmm13[3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
 ; AVX2-FP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm4
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-FP-NEXT:    vpbroadcastd 440(%rdi), %ymm6
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7]
+; AVX2-FP-NEXT:    vpbroadcastd 440(%rdi), %ymm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastd 136(%rdi), %xmm4
 ; AVX2-FP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
 ; AVX2-FP-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT:    vpermd 192(%rdi), %ymm0, %ymm6
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 80(%rdi), %ymm6
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm9[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 360(%rdi), %xmm5
-; AVX2-FP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm5 = xmm5[0],mem[1],xmm5[2,3]
-; AVX2-FP-NEXT:    vpermd 416(%rdi), %ymm0, %ymm0
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 304(%rdi), %ymm5
+; AVX2-FP-NEXT:    vpermd 192(%rdi), %ymm0, %ymm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vpbroadcastd 80(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm10[2,3,2,3,6,7,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm5
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, (%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, (%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, (%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%r8)
-; AVX2-FP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm3, (%r8)
-; AVX2-FP-NEXT:    vmovdqa %ymm10, 32(%r9)
-; AVX2-FP-NEXT:    vmovdqa %ymm15, (%r9)
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vpbroadcastd 360(%rdi), %xmm4
+; AVX2-FP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
+; AVX2-FP-NEXT:    vpermd 416(%rdi), %ymm0, %ymm0
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpbroadcastd 304(%rdi), %ymm4
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, (%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, (%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, (%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%r8)
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm4, (%r8)
+; AVX2-FP-NEXT:    vmovdqa %ymm11, 32(%r9)
+; AVX2-FP-NEXT:    vmovdqa %ymm12, (%r9)
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT:    vmovdqa %ymm2, 32(%rax)
 ; AVX2-FP-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, 32(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm4, (%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm3, (%rax)
 ; AVX2-FP-NEXT:    addq $264, %rsp # imm = 0x108
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i32_stride7_vf16:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    subq $296, %rsp # imm = 0x128
-; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm3
+; AVX2-FCP-NEXT:    subq $264, %rsp # imm = 0x108
+; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm9
+; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm10
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vpbroadcastq 80(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm7 = [0,7,6,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7]
-; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm7, %ymm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm15
+; AVX2-FCP-NEXT:    vpbroadcastq 80(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,7,6,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm6[6],ymm10[7]
+; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm2, %ymm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %xmm8
-; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %xmm12
-; AVX2-FCP-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm12[1]
+; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %xmm11
+; AVX2-FCP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm11[1]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FCP-NEXT:    vpbroadcastd 196(%rdi), %ymm12
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6],ymm4[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa %ymm0, %ymm13
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm7, %ymm2
-; AVX2-FCP-NEXT:    vpbroadcastq 304(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %xmm7
-; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm0[1]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FCP-NEXT:    vpbroadcastd 420(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm7[5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 196(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm11[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,2,2,2]
+; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7]
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FCP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %xmm2
+; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %xmm8
+; AVX2-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm7 = ymm9[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm11[2,3],ymm1[4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [5,6,5,6,5,6,5,6]
-; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm12, %ymm8
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,0,7,7,5,4,7,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm7, %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 420(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm12, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,2,2,2]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm14
-; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm8 = ymm14[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,2,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm8[2,3],ymm3[4,5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm12, %ymm12
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0],ymm4[1],ymm13[2,3,4],ymm4[5],ymm13[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm7, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6],ymm8[7]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm12, %ymm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
-; AVX2-FCP-NEXT:    vpbroadcastd 204(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-FCP-NEXT:    vpbroadcastd 204(%rdi), %ymm14
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 304(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 232(%rdi), %xmm4
-; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %xmm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 232(%rdi), %xmm1
+; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %xmm14
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm9[0],ymm7[2],ymm9[2]
 ; AVX2-FCP-NEXT:    vpbroadcastd 428(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm11[1],ymm1[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7]
-; AVX2-FCP-NEXT:    vbroadcastss 208(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm3[1,3],ymm2[4,6],ymm3[5,7]
+; AVX2-FCP-NEXT:    vbroadcastss 208(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],mem[3]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],mem[3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[0,2],ymm14[1,3],ymm15[4,6],ymm14[5,7]
-; AVX2-FCP-NEXT:    vbroadcastss 432(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7]
+; AVX2-FCP-NEXT:    vbroadcastss 432(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpbroadcastd 100(%rdi), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [4,3,0,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm11
-; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm12
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm3, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [4,3,0,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm15
+; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3]
 ; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7]
-; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm0, %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 212(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm11
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 212(%rdi), %ymm12
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm3, %ymm3
-; AVX2-FCP-NEXT:    vpbroadcastd 324(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm4
+; AVX2-FCP-NEXT:    vpbroadcastd 324(%rdi), %xmm5
 ; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %xmm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm0, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 436(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,0,3,3,1,0,7,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
-; AVX2-FCP-NEXT:    vpbroadcastd 216(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7]
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
+; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm0, %ymm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 436(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm11[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,0,3,3,1,0,7,7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
+; AVX2-FCP-NEXT:    vpbroadcastd 216(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
-; AVX2-FCP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %xmm9
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm13[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,3,5,4,6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FCP-NEXT:    vpbroadcastd 440(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 136(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[1],xmm1[2,3]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpermd 192(%rdi), %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 80(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 360(%rdi), %xmm3
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[1,0,2,3,5,4,6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %xmm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm8[0,1,2],xmm13[3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
+; AVX2-FCP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm3, %ymm3
+; AVX2-FCP-NEXT:    vpbroadcastd 440(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 136(%rdi), %xmm3
 ; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
-; AVX2-FCP-NEXT:    vpermd 416(%rdi), %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 304(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm5[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rcx)
-; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, (%r8)
-; AVX2-FCP-NEXT:    vmovdqa %ymm8, 32(%r9)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm3, (%r9)
+; AVX2-FCP-NEXT:    vpermd 192(%rdi), %ymm0, %ymm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 80(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm10[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 360(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
+; AVX2-FCP-NEXT:    vpermd 416(%rdi), %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 304(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, (%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%r8)
+; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm4, (%r8)
+; AVX2-FCP-NEXT:    vmovdqa %ymm11, 32(%r9)
+; AVX2-FCP-NEXT:    vmovdqa %ymm12, (%r9)
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT:    vmovdqa %ymm2, 32(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm7, (%rax)
+; AVX2-FCP-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT:    vmovdqa %ymm0, 32(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm1, (%rax)
-; AVX2-FCP-NEXT:    addq $296, %rsp # imm = 0x128
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, (%rax)
+; AVX2-FCP-NEXT:    addq $264, %rsp # imm = 0x108
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
@@ -5516,7 +5487,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i32_stride7_vf32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $1224, %rsp # imm = 0x4C8
+; AVX2-NEXT:    subq $1192, %rsp # imm = 0x4A8
 ; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm9
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm4
 ; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm5
@@ -5525,13 +5496,12 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa 448(%rdi), %ymm8
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm3
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm6
-; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm11
 ; AVX2-NEXT:    vpbroadcastq 80(%rdi), %ymm0
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7]
-; AVX2-NEXT:    vmovdqa %ymm6, %ymm12
+; AVX2-NEXT:    vmovdqa %ymm6, %ymm13
 ; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa %ymm3, %ymm14
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -5552,7 +5522,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpbroadcastq 528(%rdi), %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa %ymm10, %ymm12
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 576(%rdi), %xmm2
 ; AVX2-NEXT:    vmovdqa 608(%rdi), %xmm3
@@ -5569,7 +5539,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpbroadcastq 304(%rdi), %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vmovdqa %ymm9, %ymm13
+; AVX2-NEXT:    vmovdqa %ymm9, %ymm10
 ; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 352(%rdi), %xmm2
@@ -5589,9 +5559,10 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqa %ymm2, %ymm3
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 768(%rdi), %ymm11
+; AVX2-NEXT:    vmovdqa 768(%rdi), %ymm2
 ; AVX2-NEXT:    vpbroadcastq 752(%rdi), %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm15
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 800(%rdi), %xmm1
 ; AVX2-NEXT:    vmovdqa 832(%rdi), %xmm2
@@ -5610,371 +5581,355 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
 ; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovdqa %ymm2, %ymm9
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6]
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 832(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqa 800(%rdi), %ymm4
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vmovdqa %ymm4, %ymm13
-; AVX2-NEXT:    vmovdqa %ymm2, %ymm15
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqa 736(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7]
-; AVX2-NEXT:    vmovdqa %ymm2, %ymm10
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm3
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7]
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vmovdqa 832(%rdi), %ymm8
+; AVX2-NEXT:    vmovdqa 800(%rdi), %ymm10
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT:    vmovdqa 736(%rdi), %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7]
+; AVX2-NEXT:    vmovdqa %ymm4, %ymm7
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa %ymm15, %ymm5
+; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7]
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7]
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vmovdqa %ymm4, %ymm15
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa %ymm3, %ymm6
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa %ymm2, %ymm5
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7]
+; AVX2-NEXT:    vpermd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm0
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastd 8(%rdi), %xmm2
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm3
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm15[0],ymm6[0],ymm15[2],ymm6[2]
+; AVX2-NEXT:    vmovdqa %ymm6, %ymm11
 ; AVX2-NEXT:    vpbroadcastd 204(%rdi), %ymm4
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 528(%rdi), %xmm0
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastd 456(%rdi), %xmm4
 ; AVX2-NEXT:    vmovdqa 480(%rdi), %xmm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm9[0],ymm7[2],ymm9[2]
-; AVX2-NEXT:    vpbroadcastd 652(%rdi), %ymm12
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm12[7]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2]
+; AVX2-NEXT:    vpbroadcastd 652(%rdi), %ymm15
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm15[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 752(%rdi), %xmm0
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7]
-; AVX2-NEXT:    vpbroadcastd 680(%rdi), %xmm12
-; AVX2-NEXT:    vmovdqa 704(%rdi), %xmm4
-; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm12 = xmm12[0],xmm4[1],xmm12[2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm15[0],ymm13[2],ymm15[2]
-; AVX2-NEXT:    vmovdqa %ymm13, %ymm10
-; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpbroadcastd 876(%rdi), %ymm15
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7]
+; AVX2-NEXT:    vpbroadcastd 680(%rdi), %xmm15
+; AVX2-NEXT:    vmovdqa 704(%rdi), %xmm7
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
+; AVX2-NEXT:    vpbroadcastd 876(%rdi), %ymm13
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 304(%rdi), %xmm0
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm12 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7]
+; AVX2-NEXT:    vmovdqu (%rsp), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm13 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastd 232(%rdi), %xmm15
 ; AVX2-NEXT:    vmovdqa 256(%rdi), %xmm0
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
 ; AVX2-NEXT:    vpbroadcastd 428(%rdi), %ymm14
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm14[0,2],ymm15[1,3],ymm14[4,6],ymm15[5,7]
+; AVX2-NEXT:    vmovdqa %ymm11, %ymm13
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm10[0,2],ymm11[1,3],ymm10[4,6],ymm11[5,7]
 ; AVX2-NEXT:    vbroadcastss 208(%rdi), %ymm11
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovdqa %ymm9, %ymm12
-; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[0,2],ymm9[1,3],ymm12[4,6],ymm9[5,7]
 ; AVX2-NEXT:    vbroadcastss 656(%rdi), %ymm3
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[0,2],ymm4[1,3],ymm13[4,6],ymm4[5,7]
-; AVX2-NEXT:    vmovdqa %ymm4, %ymm8
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7]
+; AVX2-NEXT:    vmovaps %ymm6, %ymm15
 ; AVX2-NEXT:    vbroadcastss 432(%rdi), %ymm2
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vblendps $2, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
-; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7]
+; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],mem[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm6[1,3],ymm5[4,6],ymm6[5,7]
 ; AVX2-NEXT:    vbroadcastss 880(%rdi), %ymm2
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,3,0,0]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = [4,3,0,0]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpbroadcastd 548(%rdi), %xmm2
-; AVX2-NEXT:    vmovdqa 512(%rdi), %xmm6
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7]
-; AVX2-NEXT:    vpermd %ymm7, %ymm11, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-NEXT:    vpbroadcastd 660(%rdi), %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vpbroadcastd 100(%rdi), %xmm2
-; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm0
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vbroadcastss 548(%rdi), %xmm2
+; AVX2-NEXT:    vmovaps 512(%rdi), %xmm7
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7]
+; AVX2-NEXT:    vpermps %ymm12, %ymm11, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-NEXT:    vmovaps %ymm9, %ymm12
+; AVX2-NEXT:    vbroadcastss 660(%rdi), %ymm3
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vbroadcastss 100(%rdi), %xmm2
+; AVX2-NEXT:    vmovaps 64(%rdi), %xmm0
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX2-NEXT:    vmovaps %ymm14, %ymm10
-; AVX2-NEXT:    vpermd %ymm14, %ymm11, %ymm3
-; AVX2-NEXT:    vmovaps %ymm15, %ymm7
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-NEXT:    vpbroadcastd 212(%rdi), %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    vpermps %ymm3, %ymm1, %ymm3
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX2-NEXT:    vpermps %ymm10, %ymm11, %ymm3
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-NEXT:    vbroadcastss 212(%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm2, %ymm1, %ymm3
-; AVX2-NEXT:    vpbroadcastd 324(%rdi), %xmm4
-; AVX2-NEXT:    vmovdqa 288(%rdi), %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX2-NEXT:    vpermd %ymm13, %ymm11, %ymm4
-; AVX2-NEXT:    vmovdqa %ymm13, %ymm14
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-NEXT:    vmovdqa %ymm8, %ymm13
-; AVX2-NEXT:    vpbroadcastd 436(%rdi), %ymm8
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    vpermps %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vbroadcastss 324(%rdi), %xmm4
+; AVX2-NEXT:    vmovaps 288(%rdi), %xmm2
+; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
+; AVX2-NEXT:    vpermps %ymm15, %ymm11, %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-NEXT:    vmovaps %ymm8, %ymm9
+; AVX2-NEXT:    vbroadcastss 436(%rdi), %ymm8
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT:    vpermd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpbroadcastd 772(%rdi), %xmm4
-; AVX2-NEXT:    vmovdqa 736(%rdi), %xmm3
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT:    vpermd %ymm15, %ymm11, %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-NEXT:    vpbroadcastd 884(%rdi), %ymm8
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7]
+; AVX2-NEXT:    vpermps %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vbroadcastss 772(%rdi), %xmm4
+; AVX2-NEXT:    vmovaps 736(%rdi), %xmm3
+; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
+; AVX2-NEXT:    vpermps %ymm5, %ymm11, %ymm4
+; AVX2-NEXT:    vmovaps %ymm5, %ymm14
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-NEXT:    vbroadcastss 884(%rdi), %ymm8
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
 ; AVX2-NEXT:    vbroadcastss 216(%rdi), %ymm4
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
-; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm10
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX2-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    vmovaps 96(%rdi), %xmm10
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
+; AVX2-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vmovdqa 544(%rdi), %xmm4
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm6[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
-; AVX2-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    vmovaps 544(%rdi), %xmm4
+; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm7[3]
+; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
-; AVX2-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3]
-; AVX2-NEXT:    vpbroadcastd 664(%rdi), %ymm7
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-NEXT:    vmovdqa 320(%rdi), %xmm12
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
+; AVX2-NEXT:    vbroadcastss 664(%rdi), %ymm7
+; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT:    vmovaps 320(%rdi), %xmm12
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3]
+; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
+; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3]
-; AVX2-NEXT:    vpbroadcastd 440(%rdi), %ymm7
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-NEXT:    vmovdqa 768(%rdi), %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2]
-; AVX2-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
+; AVX2-NEXT:    vbroadcastss 440(%rdi), %ymm7
+; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT:    vmovaps 768(%rdi), %xmm2
+; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
+; AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
+; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3]
-; AVX2-NEXT:    vpbroadcastd 888(%rdi), %ymm8
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm7, %xmm7
+; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
+; AVX2-NEXT:    vbroadcastss 888(%rdi), %ymm8
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7]
 ; AVX2-NEXT:    vbroadcastss 584(%rdi), %xmm3
 ; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
 ; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT:    vpermd 640(%rdi), %ymm11, %ymm8
+; AVX2-NEXT:    vpermps 640(%rdi), %ymm11, %ymm8
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-NEXT:    vpbroadcastd 528(%rdi), %ymm8
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3]
-; AVX2-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    vbroadcastss 528(%rdi), %ymm8
+; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3]
+; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm8 = mem[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3]
+; AVX2-NEXT:    vextractf128 $1, %ymm8, %xmm8
+; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-NEXT:    vbroadcastss 808(%rdi), %xmm3
 ; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
 ; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT:    vpermd 864(%rdi), %ymm11, %ymm4
+; AVX2-NEXT:    vpermps 864(%rdi), %ymm11, %ymm4
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT:    vpbroadcastd 752(%rdi), %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
-; AVX2-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    vbroadcastss 752(%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
+; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm4 = mem[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
+; AVX2-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-NEXT:    vbroadcastss 136(%rdi), %xmm3
 ; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
 ; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT:    vpermd 192(%rdi), %ymm11, %ymm4
+; AVX2-NEXT:    vpermps 192(%rdi), %ymm11, %ymm4
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT:    vpbroadcastd 80(%rdi), %ymm4
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3]
-; AVX2-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    vbroadcastss 80(%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3]
+; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm6 = mem[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm6
+; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-NEXT:    vbroadcastss 360(%rdi), %xmm4
 ; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
-; AVX2-NEXT:    vpermd 416(%rdi), %ymm11, %ymm6
+; AVX2-NEXT:    vpermps 416(%rdi), %ymm11, %ymm6
 ; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-NEXT:    vpbroadcastd 304(%rdi), %ymm6
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3]
-; AVX2-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = mem[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
+; AVX2-NEXT:    vbroadcastss 304(%rdi), %ymm6
+; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3]
+; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm10 = mem[2,3,2,3,6,7,6,7]
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm10, %xmm10
+; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm6, 96(%rsi)
@@ -6002,7 +5957,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovaps %ymm6, (%rcx)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm6, 96(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT:    vmovups (%rsp), %ymm6 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm6, 32(%r8)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm6, 64(%r8)
@@ -6014,25 +5969,25 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovaps %ymm6, 32(%r9)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm6, (%r9)
-; AVX2-NEXT:    vmovups (%rsp), %ymm6 # 32-byte Reload
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm6, 64(%r9)
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    vmovdqa %ymm7, 96(%rax)
-; AVX2-NEXT:    vmovdqa %ymm5, 32(%rax)
-; AVX2-NEXT:    vmovdqa %ymm1, 64(%rax)
+; AVX2-NEXT:    vmovaps %ymm7, 96(%rax)
+; AVX2-NEXT:    vmovaps %ymm5, 32(%rax)
+; AVX2-NEXT:    vmovaps %ymm1, 64(%rax)
 ; AVX2-NEXT:    vmovaps %ymm0, (%rax)
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    vmovaps %ymm4, 32(%rax)
 ; AVX2-NEXT:    vmovaps %ymm3, (%rax)
 ; AVX2-NEXT:    vmovaps %ymm2, 96(%rax)
 ; AVX2-NEXT:    vmovaps %ymm8, 64(%rax)
-; AVX2-NEXT:    addq $1224, %rsp # imm = 0x4C8
+; AVX2-NEXT:    addq $1192, %rsp # imm = 0x4A8
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i32_stride7_vf32:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $1224, %rsp # imm = 0x4C8
+; AVX2-FP-NEXT:    subq $1192, %rsp # imm = 0x4A8
 ; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm9
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm5
@@ -6041,13 +5996,12 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %ymm8
 ; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm11
 ; AVX2-FP-NEXT:    vpbroadcastq 80(%rdi), %ymm0
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqa %ymm6, %ymm12
+; AVX2-FP-NEXT:    vmovdqa %ymm6, %ymm13
 ; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm14
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -6068,7 +6022,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpbroadcastq 528(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa %ymm10, %ymm12
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %xmm2
 ; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %xmm3
@@ -6085,7 +6039,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm9, %ymm13
+; AVX2-FP-NEXT:    vmovdqa %ymm9, %ymm10
 ; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %xmm2
@@ -6105,9 +6059,10 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm3
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa 768(%rdi), %ymm11
+; AVX2-FP-NEXT:    vmovdqa 768(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vpbroadcastq 752(%rdi), %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm15
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %xmm1
 ; AVX2-FP-NEXT:    vmovdqa 832(%rdi), %xmm2
@@ -6126,371 +6081,355 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
 ; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm9
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6]
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa 832(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %ymm4
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vmovdqa %ymm4, %ymm13
-; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm15
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm7
+; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqa 736(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm10
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm3
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7]
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vmovdqa 832(%rdi), %ymm8
+; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %ymm10
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FP-NEXT:    vmovdqa 736(%rdi), %ymm4
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm4, %ymm7
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa %ymm15, %ymm5
+; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7]
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vmovdqa %ymm4, %ymm15
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm6
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm5
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7]
+; AVX2-FP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm0
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastd 8(%rdi), %xmm2
 ; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm3
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm15[0],ymm6[0],ymm15[2],ymm6[2]
+; AVX2-FP-NEXT:    vmovdqa %ymm6, %ymm11
 ; AVX2-FP-NEXT:    vpbroadcastd 204(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 528(%rdi), %xmm0
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastd 456(%rdi), %xmm4
 ; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %xmm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm9[0],ymm7[2],ymm9[2]
-; AVX2-FP-NEXT:    vpbroadcastd 652(%rdi), %ymm12
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm12[7]
+; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2]
+; AVX2-FP-NEXT:    vpbroadcastd 652(%rdi), %ymm15
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm15[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 752(%rdi), %xmm0
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 680(%rdi), %xmm12
-; AVX2-FP-NEXT:    vmovdqa 704(%rdi), %xmm4
-; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm12 = xmm12[0],xmm4[1],xmm12[2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm15[0],ymm13[2],ymm15[2]
-; AVX2-FP-NEXT:    vmovdqa %ymm13, %ymm10
-; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpbroadcastd 876(%rdi), %ymm15
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7]
+; AVX2-FP-NEXT:    vpbroadcastd 680(%rdi), %xmm15
+; AVX2-FP-NEXT:    vmovdqa 704(%rdi), %xmm7
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
+; AVX2-FP-NEXT:    vpbroadcastd 876(%rdi), %ymm13
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 304(%rdi), %xmm0
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm12 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm13 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastd 232(%rdi), %xmm15
 ; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %xmm0
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
 ; AVX2-FP-NEXT:    vpbroadcastd 428(%rdi), %ymm14
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm14[0,2],ymm15[1,3],ymm14[4,6],ymm15[5,7]
+; AVX2-FP-NEXT:    vmovdqa %ymm11, %ymm13
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm10[0,2],ymm11[1,3],ymm10[4,6],ymm11[5,7]
 ; AVX2-FP-NEXT:    vbroadcastss 208(%rdi), %ymm11
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm9, %ymm12
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[0,2],ymm9[1,3],ymm12[4,6],ymm9[5,7]
 ; AVX2-FP-NEXT:    vbroadcastss 656(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[0,2],ymm4[1,3],ymm13[4,6],ymm4[5,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm4, %ymm8
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7]
+; AVX2-FP-NEXT:    vmovaps %ymm6, %ymm15
 ; AVX2-FP-NEXT:    vbroadcastss 432(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $2, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],mem[3]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm6[1,3],ymm5[4,6],ymm6[5,7]
 ; AVX2-FP-NEXT:    vbroadcastss 880(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,3,0,0]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm1 = [4,3,0,0]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FP-NEXT:    vpbroadcastd 548(%rdi), %xmm2
-; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %xmm6
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7]
-; AVX2-FP-NEXT:    vpermd %ymm7, %ymm11, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 660(%rdi), %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vpbroadcastd 100(%rdi), %xmm2
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm0
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FP-NEXT:    vbroadcastss 548(%rdi), %xmm2
+; AVX2-FP-NEXT:    vmovaps 512(%rdi), %xmm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7]
+; AVX2-FP-NEXT:    vpermps %ymm12, %ymm11, %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-FP-NEXT:    vmovaps %ymm9, %ymm12
+; AVX2-FP-NEXT:    vbroadcastss 660(%rdi), %ymm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vbroadcastss 100(%rdi), %xmm2
+; AVX2-FP-NEXT:    vmovaps 64(%rdi), %xmm0
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX2-FP-NEXT:    vmovaps %ymm14, %ymm10
-; AVX2-FP-NEXT:    vpermd %ymm14, %ymm11, %ymm3
-; AVX2-FP-NEXT:    vmovaps %ymm15, %ymm7
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 212(%rdi), %ymm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm1, %ymm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX2-FP-NEXT:    vpermps %ymm10, %ymm11, %ymm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 212(%rdi), %ymm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm2, %ymm1, %ymm3
-; AVX2-FP-NEXT:    vpbroadcastd 324(%rdi), %xmm4
-; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %xmm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX2-FP-NEXT:    vpermd %ymm13, %ymm11, %ymm4
-; AVX2-FP-NEXT:    vmovdqa %ymm13, %ymm14
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-FP-NEXT:    vmovdqa %ymm8, %ymm13
-; AVX2-FP-NEXT:    vpbroadcastd 436(%rdi), %ymm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vpermps %ymm2, %ymm1, %ymm3
+; AVX2-FP-NEXT:    vbroadcastss 324(%rdi), %xmm4
+; AVX2-FP-NEXT:    vmovaps 288(%rdi), %xmm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
+; AVX2-FP-NEXT:    vpermps %ymm15, %ymm11, %ymm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-FP-NEXT:    vmovaps %ymm8, %ymm9
+; AVX2-FP-NEXT:    vbroadcastss 436(%rdi), %ymm8
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT:    vpermd %ymm3, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vpbroadcastd 772(%rdi), %xmm4
-; AVX2-FP-NEXT:    vmovdqa 736(%rdi), %xmm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FP-NEXT:    vpermd %ymm15, %ymm11, %ymm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 884(%rdi), %ymm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm1, %ymm1
+; AVX2-FP-NEXT:    vbroadcastss 772(%rdi), %xmm4
+; AVX2-FP-NEXT:    vmovaps 736(%rdi), %xmm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
+; AVX2-FP-NEXT:    vpermps %ymm5, %ymm11, %ymm4
+; AVX2-FP-NEXT:    vmovaps %ymm5, %ymm14
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 884(%rdi), %ymm8
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
 ; AVX2-FP-NEXT:    vbroadcastss 216(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %xmm10
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX2-FP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vmovaps 96(%rdi), %xmm10
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %xmm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm6[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
-; AVX2-FP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vmovaps 544(%rdi), %xmm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm7[3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
-; AVX2-FP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3]
-; AVX2-FP-NEXT:    vpbroadcastd 664(%rdi), %ymm7
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %xmm12
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-FP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
+; AVX2-FP-NEXT:    vbroadcastss 664(%rdi), %ymm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 320(%rdi), %xmm12
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
+; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3]
-; AVX2-FP-NEXT:    vpbroadcastd 440(%rdi), %ymm7
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa 768(%rdi), %xmm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2]
-; AVX2-FP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
+; AVX2-FP-NEXT:    vbroadcastss 440(%rdi), %ymm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 768(%rdi), %xmm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
+; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3]
-; AVX2-FP-NEXT:    vpbroadcastd 888(%rdi), %ymm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm7, %xmm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
+; AVX2-FP-NEXT:    vbroadcastss 888(%rdi), %ymm8
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7]
 ; AVX2-FP-NEXT:    vbroadcastss 584(%rdi), %xmm3
 ; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
 ; AVX2-FP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
 ; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vpermd 640(%rdi), %ymm11, %ymm8
+; AVX2-FP-NEXT:    vpermps 640(%rdi), %ymm11, %ymm8
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 528(%rdi), %ymm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3]
-; AVX2-FP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vbroadcastss 528(%rdi), %ymm8
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3]
+; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm8 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm8, %xmm8
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FP-NEXT:    vbroadcastss 808(%rdi), %xmm3
 ; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
 ; AVX2-FP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
 ; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vpermd 864(%rdi), %ymm11, %ymm4
+; AVX2-FP-NEXT:    vpermps 864(%rdi), %ymm11, %ymm4
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 752(%rdi), %ymm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
-; AVX2-FP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vbroadcastss 752(%rdi), %ymm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
+; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm4 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FP-NEXT:    vbroadcastss 136(%rdi), %xmm3
 ; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
 ; AVX2-FP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
 ; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vpermd 192(%rdi), %ymm11, %ymm4
+; AVX2-FP-NEXT:    vpermps 192(%rdi), %ymm11, %ymm4
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 80(%rdi), %ymm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3]
-; AVX2-FP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vbroadcastss 80(%rdi), %ymm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3]
+; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm6 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm6
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FP-NEXT:    vbroadcastss 360(%rdi), %xmm4
 ; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
 ; AVX2-FP-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
-; AVX2-FP-NEXT:    vpermd 416(%rdi), %ymm11, %ymm6
+; AVX2-FP-NEXT:    vpermps 416(%rdi), %ymm11, %ymm6
 ; AVX2-FP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-FP-NEXT:    vpbroadcastd 304(%rdi), %ymm6
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3]
-; AVX2-FP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7]
-; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
+; AVX2-FP-NEXT:    vbroadcastss 304(%rdi), %ymm6
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3]
+; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm10 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm10, %xmm10
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%rsi)
@@ -6518,7 +6457,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovaps %ymm6, (%rcx)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm6 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%r8)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%r8)
@@ -6530,44 +6469,41 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%r9)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm6, (%r9)
-; AVX2-FP-NEXT:    vmovups (%rsp), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%r9)
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FP-NEXT:    vmovdqa %ymm7, 96(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm5, 32(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm1, 64(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm7, 96(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rax)
 ; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rax)
 ; AVX2-FP-NEXT:    vmovaps %ymm3, (%rax)
 ; AVX2-FP-NEXT:    vmovaps %ymm2, 96(%rax)
 ; AVX2-FP-NEXT:    vmovaps %ymm8, 64(%rax)
-; AVX2-FP-NEXT:    addq $1224, %rsp # imm = 0x4C8
+; AVX2-FP-NEXT:    addq $1192, %rsp # imm = 0x4A8
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i32_stride7_vf32:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    subq $1192, %rsp # imm = 0x4A8
-; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    subq $1224, %rsp # imm = 0x4C8
+; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm9
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm12
 ; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm13
+; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm8
 ; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpbroadcastq 80(%rdi), %ymm0
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm12
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm9[6],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm6
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm14
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
@@ -6580,13 +6516,12 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm7[6],ymm13[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa %ymm7, %ymm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpbroadcastq 528(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %xmm3
@@ -6598,11 +6533,12 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm11
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm9, (%rsp) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %xmm3
@@ -6613,16 +6549,17 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %ymm14
-; AVX2-FCP-NEXT:    vmovdqa 672(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6],ymm1[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa 672(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm6
+; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %ymm11
 ; AVX2-FCP-NEXT:    vpbroadcastq 752(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 800(%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vmovdqa 832(%rdi), %xmm2
@@ -6635,179 +6572,170 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX2-FCP-NEXT:    vmovdqa %ymm12, %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm0, %ymm12
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm13
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,2,2,2]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm15[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0],ymm13[1],ymm8[2,3,4],ymm13[5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,2,2,2]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,2,2,2]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 832(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vmovdqa 800(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm6[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vmovdqa %ymm11, %ymm13
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 832(%rdi), %ymm9
+; AVX2-FCP-NEXT:    vmovdqa 800(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm15
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovdqa 736(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa 736(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm5[2,3],ymm11[4,5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm8
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm7
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT:    vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[0],ymm14[1],mem[2,3,4],ymm14[5],mem[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7]
+; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2]
-; AVX2-FCP-NEXT:    vpbroadcastd 204(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm2
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[2],ymm7[2]
+; AVX2-FCP-NEXT:    vpbroadcastd 204(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 528(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 456(%rdi), %xmm3
-; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 456(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %xmm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm15[0],ymm12[2],ymm15[2]
-; AVX2-FCP-NEXT:    vpbroadcastd 652(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm8[0],ymm12[0],ymm8[2],ymm12[2]
+; AVX2-FCP-NEXT:    vpbroadcastd 652(%rdi), %ymm13
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 752(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 680(%rdi), %xmm11
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 680(%rdi), %xmm13
 ; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %xmm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm13[0],ymm6[0],ymm13[2],ymm6[2]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0],xmm10[1],xmm13[2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm9[0],ymm15[2],ymm9[2]
 ; AVX2-FCP-NEXT:    vpbroadcastd 876(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm15[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 304(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm11 = ymm9[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm13 = ymm6[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 232(%rdi), %xmm15
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %xmm0
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm3[0],ymm8[0],ymm3[2],ymm8[2]
-; AVX2-FCP-NEXT:    vpbroadcastd 428(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm13[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm7 = ymm7[0],mem[1],ymm7[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-FCP-NEXT:    vpbroadcastd 428(%rdi), %ymm14
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm15[0,2],ymm9[1,3],ymm15[4,6],ymm9[5,7]
+; AVX2-FCP-NEXT:    vbroadcastss 208(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[3,1,1,0,7,5,5,4]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm14, %ymm5
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm7 = ymm14[0,2],ymm5[1,3],ymm14[4,6],ymm5[5,7]
-; AVX2-FCP-NEXT:    vbroadcastss 208(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm12, %ymm9
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm12[0,2],ymm13[1,3],ymm12[4,6],ymm13[5,7]
-; AVX2-FCP-NEXT:    vbroadcastss 656(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm13
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm8[0,2],ymm12[1,3],ymm8[4,6],ymm12[5,7]
+; AVX2-FCP-NEXT:    vbroadcastss 656(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0],ymm4[1],mem[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm7
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm8[1,3],ymm3[4,6],ymm8[5,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm4[1,3],ymm5[4,6],ymm4[5,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm8
 ; AVX2-FCP-NEXT:    vbroadcastss 432(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
@@ -6819,235 +6747,239 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm12[1,3],ymm10[4,6],ymm12[5,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm5[1,3],ymm6[4,6],ymm5[5,7]
 ; AVX2-FCP-NEXT:    vbroadcastss 880(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [4,3,0,0]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,3,0,0]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FCP-NEXT:    vpbroadcastd 548(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %xmm15
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %xmm14
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7]
-; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm11, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm13, %ymm9
+; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm11, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7]
+; AVX2-FCP-NEXT:    vmovaps %ymm12, %ymm13
 ; AVX2-FCP-NEXT:    vpbroadcastd 660(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpbroadcastd 100(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpbroadcastd 100(%rdi), %xmm2
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX2-FCP-NEXT:    vmovaps %ymm15, %ymm10
+; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm11, %ymm3
+; AVX2-FCP-NEXT:    vmovaps %ymm9, %ymm7
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 212(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm11, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm4
-; AVX2-FCP-NEXT:    vpbroadcastd 212(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpbroadcastd 324(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %xmm6
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm11, %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm13
-; AVX2-FCP-NEXT:    vpbroadcastd 436(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm3
+; AVX2-FCP-NEXT:    vpbroadcastd 324(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %xmm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm11, %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm15
+; AVX2-FCP-NEXT:    vpbroadcastd 436(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vpbroadcastd 772(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vmovdqa 736(%rdi), %xmm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
+; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm11, %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FCP-NEXT:    vmovaps %ymm5, %ymm12
+; AVX2-FCP-NEXT:    vpbroadcastd 884(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpbroadcastd 772(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vmovdqa 736(%rdi), %xmm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm11, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm12, %ymm2
-; AVX2-FCP-NEXT:    vpbroadcastd 884(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm14[1],ymm4[2,3,4],ymm14[5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [1,0,3,3,1,0,7,7]
-; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm12, %ymm0
-; AVX2-FCP-NEXT:    vpbroadcastd 216(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7]
-; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm4 = [1,0,3,3,1,0,7,7]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm1
+; AVX2-FCP-NEXT:    vbroadcastss 216(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
+; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm6
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-FCP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %xmm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm8[0,1,2],xmm14[3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
 ; AVX2-FCP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm0[0,1,2],xmm15[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm4, %ymm5
+; AVX2-FCP-NEXT:    vbroadcastss 664(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %xmm13
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
 ; AVX2-FCP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm12, %ymm5
-; AVX2-FCP-NEXT:    vpbroadcastd 664(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %xmm9
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm9[0,1,2],xmm6[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
-; AVX2-FCP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm4, %ymm5
+; AVX2-FCP-NEXT:    vbroadcastss 440(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %xmm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2]
+; AVX2-FCP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
+; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm4, %ymm4
+; AVX2-FCP-NEXT:    vbroadcastss 888(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 584(%rdi), %xmm3
+; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FCP-NEXT:    vpermd 640(%rdi), %ymm11, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 528(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3]
+; AVX2-FCP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 808(%rdi), %xmm3
+; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FCP-NEXT:    vpermd 864(%rdi), %ymm11, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 752(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
+; AVX2-FCP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 136(%rdi), %xmm3
+; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FCP-NEXT:    vpermd 192(%rdi), %ymm11, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 80(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3]
+; AVX2-FCP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm6 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm12, %ymm6
-; AVX2-FCP-NEXT:    vpbroadcastd 440(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %xmm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm14[0,1,2],xmm8[3]
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2]
-; AVX2-FCP-NEXT:    vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm8 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm12, %ymm8
-; AVX2-FCP-NEXT:    vpbroadcastd 888(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 584(%rdi), %xmm8
-; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FCP-NEXT:    vpermd 640(%rdi), %ymm11, %ymm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 528(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 360(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
+; AVX2-FCP-NEXT:    vpermd 416(%rdi), %ymm11, %ymm6
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FCP-NEXT:    vpbroadcastd 304(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3]
 ; AVX2-FCP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm10 = mem[2,3,2,3,6,7,6,7]
 ; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7]
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 808(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpermd 864(%rdi), %ymm11, %ymm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 752(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3]
-; AVX2-FCP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm12 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 136(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpermd 192(%rdi), %ymm11, %ymm12
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 80(%rdi), %ymm12
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3]
-; AVX2-FCP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm12 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 360(%rdi), %xmm7
-; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm7 = xmm7[0],mem[1],xmm7[2,3]
-; AVX2-FCP-NEXT:    vpermd 416(%rdi), %ymm11, %ymm11
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastd 304(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX2-FCP-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm9, 96(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm9, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm9, 64(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm9, (%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 64(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm9, (%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 64(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 32(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 64(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm2, (%r8)
-; AVX2-FCP-NEXT:    vmovdqa %ymm1, 96(%r9)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 32(%r9)
-; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, (%r9)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%r9)
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 96(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 64(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 96(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 64(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 96(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 64(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, (%rcx)
+; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 96(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 64(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, (%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 96(%r9)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 32(%r9)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, (%r9)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm6, 64(%r9)
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT:    vmovdqa %ymm6, 96(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm4, 32(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, 64(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, (%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm7, 96(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT:    vmovdqa %ymm7, 32(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm10, 96(%rax)
-; AVX2-FCP-NEXT:    vmovdqa %ymm8, 64(%rax)
-; AVX2-FCP-NEXT:    addq $1192, %rsp # imm = 0x4A8
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm2, 96(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 64(%rax)
+; AVX2-FCP-NEXT:    addq $1224, %rsp # imm = 0x4C8
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
@@ -11264,13 +11196,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-LABEL: load_i32_stride7_vf64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $2680, %rsp # imm = 0xA78
-; AVX2-NEXT:    vmovdqa 1216(%rdi), %ymm12
-; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    subq $2648, %rsp # imm = 0xA58
+; AVX2-NEXT:    vmovdqa 1216(%rdi), %ymm9
+; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 1152(%rdi), %ymm4
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 1120(%rdi), %ymm5
-; AVX2-NEXT:    vmovdqa 768(%rdi), %ymm13
-; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 768(%rdi), %ymm12
+; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 704(%rdi), %ymm6
 ; AVX2-NEXT:    vmovdqa 672(%rdi), %ymm7
 ; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm8
@@ -11297,13 +11230,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7]
-; AVX2-NEXT:    vmovdqa %ymm7, %ymm9
 ; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa %ymm6, %ymm8
 ; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpbroadcastq 752(%rdi), %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 800(%rdi), %xmm2
 ; AVX2-NEXT:    vmovdqa 832(%rdi), %xmm3
@@ -11317,11 +11249,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7]
 ; AVX2-NEXT:    vmovdqa %ymm5, %ymm6
 ; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa %ymm4, %ymm5
-; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpbroadcastq 1200(%rdi), %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 1248(%rdi), %xmm2
 ; AVX2-NEXT:    vmovdqa 1280(%rdi), %xmm3
@@ -11332,12 +11262,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 1600(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqa 1568(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
-; AVX2-NEXT:    vmovdqa %ymm2, %ymm14
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 1600(%rdi), %ymm13
+; AVX2-NEXT:    vmovdqa 1568(%rdi), %ymm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm13[6],ymm3[7]
+; AVX2-NEXT:    vmovdqa %ymm3, %ymm5
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vmovdqa 1664(%rdi), %ymm3
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -11357,11 +11287,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpbroadcastq 80(%rdi), %ymm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7]
 ; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm2
@@ -11374,11 +11304,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 480(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqa 448(%rdi), %ymm13
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm2[6],ymm13[7]
-; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa %ymm2, %ymm12
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 448(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vmovdqa 544(%rdi), %ymm3
 ; AVX2-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
@@ -11394,12 +11323,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 928(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqa 928(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 896(%rdi), %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 896(%rdi), %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7]
-; AVX2-NEXT:    vmovdqa %ymm3, %ymm15
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vmovdqa 992(%rdi), %ymm3
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -11415,16 +11343,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 1376(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 1344(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
+; AVX2-NEXT:    vmovdqa 1376(%rdi), %ymm14
+; AVX2-NEXT:    vmovdqa 1344(%rdi), %ymm15
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7]
+; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 1440(%rdi), %ymm7
+; AVX2-NEXT:    vmovdqa 1440(%rdi), %ymm4
 ; AVX2-NEXT:    vpbroadcastq 1424(%rdi), %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa 1472(%rdi), %xmm1
 ; AVX2-NEXT:    vmovdqa 1504(%rdi), %xmm2
@@ -11443,215 +11371,192 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm12
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7]
+; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 832(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 800(%rdi), %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6]
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqa 736(%rdi), %ymm10
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vmovdqa 832(%rdi), %ymm3
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 800(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 1280(%rdi), %ymm2
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT:    vmovdqa 736(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 1248(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqa 1184(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7]
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vmovdqa 1280(%rdi), %ymm3
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 1248(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 1728(%rdi), %ymm2
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT:    vmovdqa 1184(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 1696(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm7 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7]
+; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6],ymm7[7]
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqa 1632(%rdi), %ymm6
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7]
-; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vmovdqa 1728(%rdi), %ymm3
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 1696(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 608(%rdi), %ymm2
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT:    vmovdqa 1632(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7]
+; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7]
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu (%rsp), %ymm14 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vmovdqa 608(%rdi), %ymm3
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 576(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 1056(%rdi), %ymm2
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT:    vmovdqa 512(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 1024(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqa 960(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-NEXT:    vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vmovdqa 1056(%rdi), %ymm3
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 1024(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 1504(%rdi), %ymm2
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT:    vmovdqa 960(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 1472(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqa 1408(%rdi), %ymm1
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm2
-; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm3
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-NEXT:    vmovdqa %ymm3, %ymm13
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vmovdqa 1504(%rdi), %ymm3
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa %ymm2, %ymm12
+; AVX2-NEXT:    vmovdqa 1472(%rdi), %ymm2
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT:    vmovdqa 1408(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7]
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-NEXT:    vmovdqa %ymm5, %ymm14
+; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa %ymm3, %ymm15
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm4
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7]
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3,4],ymm2[5],mem[6,7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
+; AVX2-NEXT:    vpermd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 304(%rdi), %xmm0
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastd 232(%rdi), %xmm1
-; AVX2-NEXT:    vmovdqa 256(%rdi), %xmm2
-; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX2-NEXT:    vmovdqa 256(%rdi), %xmm5
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = ymm4[0],mem[0],ymm4[2],mem[2]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
 ; AVX2-NEXT:    vpbroadcastd 428(%rdi), %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 752(%rdi), %xmm0
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastd 680(%rdi), %xmm1
 ; AVX2-NEXT:    vmovdqa 704(%rdi), %xmm2
 ; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm1 = ymm11[0],mem[0],ymm11[2],mem[2]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
 ; AVX2-NEXT:    vpbroadcastd 876(%rdi), %ymm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 1200(%rdi), %xmm0
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastd 1128(%rdi), %xmm1
 ; AVX2-NEXT:    vmovdqa 1152(%rdi), %xmm2
@@ -11666,7 +11571,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 1648(%rdi), %xmm0
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastd 1576(%rdi), %xmm1
 ; AVX2-NEXT:    vmovdqa 1600(%rdi), %xmm2
@@ -11680,26 +11586,26 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm0
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastd 8(%rdi), %xmm1
-; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
+; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
 ; AVX2-NEXT:    vpbroadcastd 204(%rdi), %ymm6
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 528(%rdi), %xmm0
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastd 456(%rdi), %xmm1
-; AVX2-NEXT:    vmovdqa 480(%rdi), %xmm3
-; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
+; AVX2-NEXT:    vmovdqa 480(%rdi), %xmm4
+; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -11710,23 +11616,23 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 976(%rdi), %xmm0
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastd 904(%rdi), %xmm15
-; AVX2-NEXT:    vmovdqa 928(%rdi), %xmm12
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3]
+; AVX2-NEXT:    vmovdqa 928(%rdi), %xmm11
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2]
 ; AVX2-NEXT:    vpbroadcastd 1100(%rdi), %ymm14
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 1424(%rdi), %xmm0
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm14 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT:    vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7]
 ; AVX2-NEXT:    vpbroadcastd 1352(%rdi), %xmm15
 ; AVX2-NEXT:    vmovdqa 1376(%rdi), %xmm0
@@ -11739,30 +11645,30 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm13 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1,2],mem[3]
-; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm15[1,3],ymm4[4,6],ymm15[5,7]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm12[1,3],ymm4[4,6],ymm12[5,7]
+; AVX2-NEXT:    vmovaps %ymm4, %ymm12
 ; AVX2-NEXT:    vbroadcastss 432(%rdi), %ymm14
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm13[5,6,7]
-; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7]
+; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
+; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3]
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
-; AVX2-NEXT:    vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = ymm11[0,2],mem[1,3],ymm11[4,6],mem[5,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[0,2],ymm3[1,3],ymm14[4,6],ymm3[5,7]
 ; AVX2-NEXT:    vbroadcastss 880(%rdi), %ymm13
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7]
@@ -11775,9 +11681,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[0,2],ymm14[1,3],ymm13[4,6],ymm14[5,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = ymm4[0,2],mem[1,3],ymm4[4,6],mem[5,7]
 ; AVX2-NEXT:    vbroadcastss 1328(%rdi), %ymm5
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
@@ -11789,19 +11695,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT:    vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = ymm13[0,2],mem[1,3],ymm13[4,6],mem[5,7]
 ; AVX2-NEXT:    vbroadcastss 1776(%rdi), %ymm4
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7]
+; AVX2-NEXT:    vmovaps %ymm1, %ymm9
 ; AVX2-NEXT:    vbroadcastss 1552(%rdi), %ymm3
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
@@ -11809,13 +11716,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],mem[3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7]
-; AVX2-NEXT:    vmovaps %ymm7, %ymm9
-; AVX2-NEXT:    vmovaps %ymm8, %ymm11
+; AVX2-NEXT:    vmovdqa %ymm10, %ymm8
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7]
+; AVX2-NEXT:    vmovaps %ymm7, %ymm11
 ; AVX2-NEXT:    vbroadcastss 1104(%rdi), %ymm2
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
@@ -11828,9 +11735,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm7[1,3],ymm4[4,6],ymm7[5,7]
 ; AVX2-NEXT:    vbroadcastss 656(%rdi), %ymm2
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
@@ -11844,125 +11751,128 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm10[1,3],ymm3[4,6],ymm10[5,7]
 ; AVX2-NEXT:    vbroadcastss 208(%rdi), %ymm2
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vbroadcastss 100(%rdi), %xmm0
-; AVX2-NEXT:    vmovaps 64(%rdi), %xmm1
-; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm7 = [4,3,0,0]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm7, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7]
-; AVX2-NEXT:    vpermps %ymm3, %ymm0, %ymm3
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT:    vbroadcastss 212(%rdi), %ymm4
+; AVX2-NEXT:    vmovaps 64(%rdi), %xmm6
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm5 = [4,3,0,0]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm15 = [0,7,0,7,0,7,0,7]
+; AVX2-NEXT:    vpermps %ymm3, %ymm15, %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-NEXT:    vbroadcastss 212(%rdi), %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vbroadcastss 324(%rdi), %xmm2
+; AVX2-NEXT:    vmovaps 288(%rdi), %xmm1
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX2-NEXT:    vpermps %ymm12, %ymm15, %ymm2
+; AVX2-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
+; AVX2-NEXT:    vbroadcastss 436(%rdi), %ymm3
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vbroadcastss 548(%rdi), %xmm3
+; AVX2-NEXT:    vmovaps 512(%rdi), %xmm2
+; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX2-NEXT:    vpermps %ymm4, %ymm15, %ymm3
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-NEXT:    vbroadcastss 660(%rdi), %ymm4
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm2, %ymm7, %ymm3
-; AVX2-NEXT:    vbroadcastss 324(%rdi), %xmm4
-; AVX2-NEXT:    vmovaps 288(%rdi), %xmm2
-; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
-; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-NEXT:    vbroadcastss 436(%rdi), %ymm5
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm3, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm3, %ymm7, %ymm4
-; AVX2-NEXT:    vbroadcastss 548(%rdi), %xmm5
-; AVX2-NEXT:    vmovaps 512(%rdi), %xmm3
-; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3]
-; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
-; AVX2-NEXT:    vpermps %ymm6, %ymm0, %ymm5
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-NEXT:    vbroadcastss 660(%rdi), %ymm6
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm4, %ymm7, %ymm5
-; AVX2-NEXT:    vbroadcastss 772(%rdi), %xmm6
-; AVX2-NEXT:    vmovaps 736(%rdi), %xmm4
-; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3]
-; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vbroadcastss 772(%rdi), %xmm4
+; AVX2-NEXT:    vmovaps 736(%rdi), %xmm3
+; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
+; AVX2-NEXT:    vpermps %ymm14, %ymm15, %ymm4
+; AVX2-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
+; AVX2-NEXT:    vbroadcastss 884(%rdi), %ymm7
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vbroadcastss 996(%rdi), %xmm7
+; AVX2-NEXT:    vmovaps 960(%rdi), %xmm4
+; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
+; AVX2-NEXT:    vpermps %ymm8, %ymm15, %ymm7
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-NEXT:    vbroadcastss 1108(%rdi), %ymm8
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vbroadcastss 1220(%rdi), %xmm7
+; AVX2-NEXT:    vmovaps 1184(%rdi), %xmm14
+; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-NEXT:    vbroadcastss 884(%rdi), %ymm8
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm5, %ymm7, %ymm6
-; AVX2-NEXT:    vbroadcastss 996(%rdi), %xmm8
-; AVX2-NEXT:    vmovaps 960(%rdi), %xmm5
-; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3]
-; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3]
-; AVX2-NEXT:    vpermps %ymm11, %ymm0, %ymm8
+; AVX2-NEXT:    vpermps %ymm10, %ymm15, %ymm7
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7]
+; AVX2-NEXT:    vbroadcastss 1332(%rdi), %ymm8
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm7
+; AVX2-NEXT:    vbroadcastss 1444(%rdi), %xmm8
+; AVX2-NEXT:    vmovaps 1408(%rdi), %xmm0
+; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
+; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    vmovaps %ymm9, %ymm11
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-NEXT:    vbroadcastss 1108(%rdi), %ymm9
+; AVX2-NEXT:    vbroadcastss 1556(%rdi), %ymm9
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm6, %ymm7, %ymm8
-; AVX2-NEXT:    vbroadcastss 1220(%rdi), %xmm9
-; AVX2-NEXT:    vmovaps 1184(%rdi), %xmm6
-; AVX2-NEXT:    vblendps {{.*#+}} xmm9 = xmm6[0,1,2],xmm9[3]
-; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
-; AVX2-NEXT:    vpermps %ymm13, %ymm0, %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-NEXT:    vbroadcastss 1332(%rdi), %ymm12
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm8, %ymm7, %ymm8
-; AVX2-NEXT:    vbroadcastss 1444(%rdi), %xmm9
-; AVX2-NEXT:    vmovaps 1408(%rdi), %xmm13
-; AVX2-NEXT:    vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3]
-; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
-; AVX2-NEXT:    vbroadcastss 1556(%rdi), %ymm12
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-NEXT:    vpermps %ymm8, %ymm7, %ymm7
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vpermps %ymm7, %ymm5, %ymm7
 ; AVX2-NEXT:    vbroadcastss 1668(%rdi), %xmm8
-; AVX2-NEXT:    vmovaps 1632(%rdi), %xmm14
-; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX2-NEXT:    vmovaps 1632(%rdi), %xmm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3]
 ; AVX2-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
-; AVX2-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-NEXT:    vpermps %ymm13, %ymm15, %ymm8
+; AVX2-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX2-NEXT:    vbroadcastss 1780(%rdi), %ymm9
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
@@ -11974,355 +11884,356 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
 ; AVX2-NEXT:    vbroadcastss 216(%rdi), %ymm8
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-NEXT:    vmovaps 96(%rdi), %xmm12
-; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3]
-; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-NEXT:    vmovaps 96(%rdi), %xmm9
+; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
+; AVX2-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2]
 ; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm8 = mem[1,0,2,3,5,4,6,7]
 ; AVX2-NEXT:    vextractf128 $1, %ymm8, %xmm8
-; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 320(%rdi), %xmm13
+; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3]
+; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm6 = mem[1,0,2,3,5,4,6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm6
+; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT:    vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm6 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
+; AVX2-NEXT:    vbroadcastss 440(%rdi), %ymm7
+; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 320(%rdi), %xmm1
-; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
+; AVX2-NEXT:    vmovaps 544(%rdi), %xmm8
+; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm2[3]
+; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = mem[1,0,2,3,5,4,6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-NEXT:    vbroadcastss 664(%rdi), %ymm6
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovaps 768(%rdi), %xmm1
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3]
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
-; AVX2-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3]
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
-; AVX2-NEXT:    vbroadcastss 440(%rdi), %ymm8
-; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[1,0,2,3,5,4,6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT:    vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
+; AVX2-NEXT:    vbroadcastss 888(%rdi), %ymm6
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 544(%rdi), %xmm2
-; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
+; AVX2-NEXT:    vmovaps 992(%rdi), %xmm2
+; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3]
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
-; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
-; AVX2-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
-; AVX2-NEXT:    vbroadcastss 664(%rdi), %ymm8
-; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 768(%rdi), %xmm3
-; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
+; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
+; AVX2-NEXT:    vbroadcastss 1112(%rdi), %ymm6
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vmovaps 1216(%rdi), %xmm3
+; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm14[3]
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2]
-; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
-; AVX2-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
-; AVX2-NEXT:    vbroadcastss 888(%rdi), %ymm8
-; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 992(%rdi), %xmm4
-; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm4[0,1,2],xmm5[3]
-; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
-; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
-; AVX2-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
-; AVX2-NEXT:    vbroadcastss 1112(%rdi), %ymm10
-; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-NEXT:    vmovaps 1216(%rdi), %xmm15
-; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm6[3]
-; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
 ; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
 ; AVX2-NEXT:    # ymm6 = mem[1,0,2,3,5,4,6,7]
 ; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm6
-; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
-; AVX2-NEXT:    vbroadcastss 1336(%rdi), %ymm7
-; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT:    vmovaps 1440(%rdi), %xmm5
-; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm13[3]
-; AVX2-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2]
-; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
-; AVX2-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT:    vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
-; AVX2-NEXT:    vbroadcastss 1560(%rdi), %ymm13
-; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-NEXT:    vmovaps 1664(%rdi), %xmm6
-; AVX2-NEXT:    vblendps {{.*#+}} xmm13 = xmm6[0,1,2],xmm14[3]
-; AVX2-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2]
-; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = mem[1,0,2,3,5,4,6,7]
-; AVX2-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX2-NEXT:    vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
-; AVX2-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[1,0,3,3,5,4,7,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[0,1,0,3]
-; AVX2-NEXT:    vbroadcastss 1784(%rdi), %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vbroadcastss 136(%rdi), %xmm9
-; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm9 = xmm9[0],mem[1],xmm9[2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-NEXT:    vpermps 192(%rdi), %ymm0, %ymm14
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-NEXT:    vbroadcastss 80(%rdi), %ymm14
-; AVX2-NEXT:    vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = mem[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7]
-; AVX2-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX2-NEXT:    vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vbroadcastss 360(%rdi), %xmm9
-; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm9 = xmm9[0],mem[1],xmm9[2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-NEXT:    vpermps 416(%rdi), %ymm0, %ymm14
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-NEXT:    vbroadcastss 304(%rdi), %ymm14
-; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3]
-; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = mem[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7]
-; AVX2-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vbroadcastss 584(%rdi), %xmm1
+; AVX2-NEXT:    vbroadcastss 1336(%rdi), %ymm10
+; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT:    vmovaps 1440(%rdi), %xmm4
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm10 = mem[1,0,2,3,5,4,6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm10, %xmm10
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm10 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[1,0,3,3,5,4,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
+; AVX2-NEXT:    vbroadcastss 1560(%rdi), %ymm12
+; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT:    vmovaps 1664(%rdi), %xmm14
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm5[3]
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
+; AVX2-NEXT:    vbroadcastss 1784(%rdi), %ymm12
+; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT:    vbroadcastss 136(%rdi), %xmm0
+; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpermps 192(%rdi), %ymm15, %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT:    vbroadcastss 80(%rdi), %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3]
+; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm11, %xmm11
+; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vbroadcastss 360(%rdi), %xmm0
+; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpermps 416(%rdi), %ymm15, %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT:    vbroadcastss 304(%rdi), %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm13[3]
+; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm13 = mem[2,3,2,3,6,7,6,7]
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm13, %xmm13
+; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vbroadcastss 584(%rdi), %xmm0
+; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpermps 640(%rdi), %ymm15, %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT:    vbroadcastss 528(%rdi), %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
+; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm8 = mem[2,3,2,3,6,7,6,7]
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm8, %xmm8
+; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vbroadcastss 808(%rdi), %xmm0
+; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpermps 864(%rdi), %ymm15, %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT:    vbroadcastss 752(%rdi), %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3]
+; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vbroadcastss 1032(%rdi), %xmm1
 ; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm1 = xmm1[0],mem[1],xmm1[2,3]
 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpermps 640(%rdi), %ymm0, %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-NEXT:    vbroadcastss 528(%rdi), %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3]
-; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = mem[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7]
-; AVX2-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
+; AVX2-NEXT:    vpermps 1088(%rdi), %ymm15, %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT:    vbroadcastss 976(%rdi), %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
+; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT:    vbroadcastss 808(%rdi), %xmm2
+; AVX2-NEXT:    vbroadcastss 1256(%rdi), %xmm2
 ; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
 ; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpermps 864(%rdi), %ymm0, %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-NEXT:    vbroadcastss 752(%rdi), %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3]
-; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = mem[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7]
-; AVX2-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3]
+; AVX2-NEXT:    vpermps 1312(%rdi), %ymm15, %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT:    vbroadcastss 1200(%rdi), %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
+; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT:    vbroadcastss 1032(%rdi), %xmm3
+; AVX2-NEXT:    vbroadcastss 1480(%rdi), %xmm3
 ; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
 ; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT:    vpermps 1088(%rdi), %ymm0, %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-NEXT:    vbroadcastss 976(%rdi), %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3]
-; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = mem[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7]
-; AVX2-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
+; AVX2-NEXT:    vpermps 1536(%rdi), %ymm15, %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT:    vbroadcastss 1424(%rdi), %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
+; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT:    vbroadcastss 1256(%rdi), %xmm4
+; AVX2-NEXT:    vbroadcastss 1704(%rdi), %xmm4
 ; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
+; AVX2-NEXT:    vpermps 1760(%rdi), %ymm15, %ymm5
 ; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT:    vpermps 1312(%rdi), %ymm0, %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-NEXT:    vbroadcastss 1200(%rdi), %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3]
-; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm15 = mem[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7]
-; AVX2-NEXT:    vextractf128 $1, %ymm15, %xmm15
-; AVX2-NEXT:    vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-NEXT:    vbroadcastss 1480(%rdi), %xmm9
-; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm9 = xmm9[0],mem[1],xmm9[2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-NEXT:    vpermps 1536(%rdi), %ymm0, %ymm15
-; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-NEXT:    vbroadcastss 1424(%rdi), %ymm15
-; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3]
-; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm15 = mem[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7]
-; AVX2-NEXT:    vextractf128 $1, %ymm15, %xmm15
-; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vbroadcastss 1704(%rdi), %xmm9
-; AVX2-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm9 = xmm9[0],mem[1],xmm9[2,3]
-; AVX2-NEXT:    vpermps 1760(%rdi), %ymm0, %ymm0
-; AVX2-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-NEXT:    vbroadcastss 1648(%rdi), %ymm9
-; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
-; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = mem[2,3,2,3,6,7,6,7]
-; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT:    # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7]
-; AVX2-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; AVX2-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 192(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 128(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 64(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, (%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 224(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 160(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 96(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 32(%rsi)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 192(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 128(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 64(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, (%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 224(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 160(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 96(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 32(%rdx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 192(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 128(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 64(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, (%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 224(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 160(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 96(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 32(%rcx)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, (%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 64(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 128(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 192(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 224(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 160(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 96(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 32(%r8)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 224(%r9)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 192(%r9)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 160(%r9)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 128(%r9)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 96(%r9)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 64(%r9)
-; AVX2-NEXT:    vmovups (%rsp), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 32(%r9)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, (%r9)
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT:    vbroadcastss 1648(%rdi), %ymm5
+; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3]
+; AVX2-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm14 = mem[2,3,2,3,6,7,6,7]
+; AVX2-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7]
+; AVX2-NEXT:    vextractf128 $1, %ymm14, %xmm14
+; AVX2-NEXT:    vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 192(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 128(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 64(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, (%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 224(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 160(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 96(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 32(%rsi)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 192(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 128(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 64(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, (%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 224(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 160(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 96(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 32(%rdx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 192(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 128(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 64(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, (%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 224(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 160(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 96(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 32(%rcx)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, (%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 64(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 128(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 192(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 224(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 160(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 96(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 32(%r8)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 224(%r9)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 192(%r9)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 160(%r9)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 128(%r9)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 96(%r9)
+; AVX2-NEXT:    vmovups (%rsp), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 64(%r9)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 32(%r9)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, (%r9)
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    vmovaps %ymm13, 224(%rax)
-; AVX2-NEXT:    vmovaps %ymm7, 192(%rax)
-; AVX2-NEXT:    vmovaps %ymm8, 160(%rax)
-; AVX2-NEXT:    vmovaps %ymm10, 128(%rax)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 96(%rax)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 64(%rax)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, 32(%rax)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm6, (%rax)
+; AVX2-NEXT:    vmovaps %ymm12, 224(%rax)
+; AVX2-NEXT:    vmovaps %ymm10, 192(%rax)
+; AVX2-NEXT:    vmovaps %ymm6, 160(%rax)
+; AVX2-NEXT:    vmovaps %ymm7, 128(%rax)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 96(%rax)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 64(%rax)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, 32(%rax)
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm5, (%rax)
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    vmovaps %ymm0, 224(%rax)
-; AVX2-NEXT:    vmovaps %ymm5, 192(%rax)
-; AVX2-NEXT:    vmovaps %ymm4, 160(%rax)
-; AVX2-NEXT:    vmovaps %ymm3, 128(%rax)
-; AVX2-NEXT:    vmovaps %ymm2, 96(%rax)
-; AVX2-NEXT:    vmovaps %ymm1, 64(%rax)
-; AVX2-NEXT:    vmovaps %ymm14, 32(%rax)
-; AVX2-NEXT:    vmovaps %ymm12, (%rax)
-; AVX2-NEXT:    addq $2680, %rsp # imm = 0xA78
+; AVX2-NEXT:    vmovaps %ymm4, 224(%rax)
+; AVX2-NEXT:    vmovaps %ymm3, 192(%rax)
+; AVX2-NEXT:    vmovaps %ymm2, 160(%rax)
+; AVX2-NEXT:    vmovaps %ymm1, 128(%rax)
+; AVX2-NEXT:    vmovaps %ymm0, 96(%rax)
+; AVX2-NEXT:    vmovaps %ymm8, 64(%rax)
+; AVX2-NEXT:    vmovaps %ymm13, 32(%rax)
+; AVX2-NEXT:    vmovaps %ymm11, (%rax)
+; AVX2-NEXT:    addq $2648, %rsp # imm = 0xA58
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: load_i32_stride7_vf64:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $2680, %rsp # imm = 0xA78
-; AVX2-FP-NEXT:    vmovdqa 1216(%rdi), %ymm12
-; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    subq $2648, %rsp # imm = 0xA58
+; AVX2-FP-NEXT:    vmovdqa 1216(%rdi), %ymm9
+; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 1152(%rdi), %ymm4
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 1120(%rdi), %ymm5
-; AVX2-FP-NEXT:    vmovdqa 768(%rdi), %ymm13
-; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 768(%rdi), %ymm12
+; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 704(%rdi), %ymm6
 ; AVX2-FP-NEXT:    vmovdqa 672(%rdi), %ymm7
 ; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm8
@@ -12349,13 +12260,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7]
-; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm9
 ; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa %ymm6, %ymm8
 ; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpbroadcastq 752(%rdi), %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %xmm2
 ; AVX2-FP-NEXT:    vmovdqa 832(%rdi), %xmm3
@@ -12369,11 +12279,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7]
 ; AVX2-FP-NEXT:    vmovdqa %ymm5, %ymm6
 ; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa %ymm4, %ymm5
-; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vpbroadcastq 1200(%rdi), %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 1248(%rdi), %xmm2
 ; AVX2-FP-NEXT:    vmovdqa 1280(%rdi), %xmm3
@@ -12384,12 +12292,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 1600(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqa 1568(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
-; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm14
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 1600(%rdi), %ymm13
+; AVX2-FP-NEXT:    vmovdqa 1568(%rdi), %ymm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm13[6],ymm3[7]
+; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm5
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vmovdqa 1664(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -12409,11 +12317,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpbroadcastq 80(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7]
 ; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm2
@@ -12426,11 +12334,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %ymm13
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm2[6],ymm13[7]
-; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm12
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 448(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vmovdqa 544(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
@@ -12446,12 +12353,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 928(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqa 928(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 896(%rdi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 896(%rdi), %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm15
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FP-NEXT:    vmovdqa 992(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -12467,16 +12373,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 1376(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 1344(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
+; AVX2-FP-NEXT:    vmovdqa 1376(%rdi), %ymm14
+; AVX2-FP-NEXT:    vmovdqa 1344(%rdi), %ymm15
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7]
+; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa 1440(%rdi), %ymm7
+; AVX2-FP-NEXT:    vmovdqa 1440(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vpbroadcastq 1424(%rdi), %ymm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqa 1472(%rdi), %xmm1
 ; AVX2-FP-NEXT:    vmovdqa 1504(%rdi), %xmm2
@@ -12495,215 +12401,192 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm12
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa 832(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
+; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6]
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqa 736(%rdi), %ymm10
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vmovdqa 832(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 800(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa 1280(%rdi), %ymm2
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FP-NEXT:    vmovdqa 736(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 1248(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqa 1184(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7]
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vmovdqa 1280(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 1248(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa 1728(%rdi), %ymm2
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FP-NEXT:    vmovdqa 1184(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 1696(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm7 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6],ymm7[7]
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqa 1632(%rdi), %ymm6
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-FP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vmovdqa 1728(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 1696(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %ymm2
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FP-NEXT:    vmovdqa 1632(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7]
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm14 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vmovdqa 608(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 576(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa 1056(%rdi), %ymm2
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FP-NEXT:    vmovdqa 512(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 1024(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqa 960(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-FP-NEXT:    vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vmovdqa 1056(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 1024(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa 1504(%rdi), %ymm2
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FP-NEXT:    vmovdqa 960(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 1472(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqa 1408(%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm2
-; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm3
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm13
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vmovdqa 1504(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm12
+; AVX2-FP-NEXT:    vmovdqa 1472(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FP-NEXT:    vmovdqa 1408(%rdi), %ymm2
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FP-NEXT:    vmovdqa %ymm5, %ymm14
+; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm15
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0],ymm2[1],mem[2,3,4],ymm2[5],mem[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
+; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 304(%rdi), %xmm0
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastd 232(%rdi), %xmm1
-; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %xmm2
-; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %xmm5
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = ymm4[0],mem[0],ymm4[2],mem[2]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
 ; AVX2-FP-NEXT:    vpbroadcastd 428(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 752(%rdi), %xmm0
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastd 680(%rdi), %xmm1
 ; AVX2-FP-NEXT:    vmovdqa 704(%rdi), %xmm2
 ; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm1 = ymm11[0],mem[0],ymm11[2],mem[2]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
 ; AVX2-FP-NEXT:    vpbroadcastd 876(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 1200(%rdi), %xmm0
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastd 1128(%rdi), %xmm1
 ; AVX2-FP-NEXT:    vmovdqa 1152(%rdi), %xmm2
@@ -12718,7 +12601,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 1648(%rdi), %xmm0
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastd 1576(%rdi), %xmm1
 ; AVX2-FP-NEXT:    vmovdqa 1600(%rdi), %xmm2
@@ -12732,26 +12616,26 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm0
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastd 8(%rdi), %xmm1
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
 ; AVX2-FP-NEXT:    vpbroadcastd 204(%rdi), %ymm6
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 528(%rdi), %xmm0
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastd 456(%rdi), %xmm1
-; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %xmm3
-; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
+; AVX2-FP-NEXT:    vmovdqa 480(%rdi), %xmm4
+; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -12762,23 +12646,23 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 976(%rdi), %xmm0
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastd 904(%rdi), %xmm15
-; AVX2-FP-NEXT:    vmovdqa 928(%rdi), %xmm12
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3]
+; AVX2-FP-NEXT:    vmovdqa 928(%rdi), %xmm11
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2]
 ; AVX2-FP-NEXT:    vpbroadcastd 1100(%rdi), %ymm14
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 1424(%rdi), %xmm0
-; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm14 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT:    vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7]
 ; AVX2-FP-NEXT:    vpbroadcastd 1352(%rdi), %xmm15
 ; AVX2-FP-NEXT:    vmovdqa 1376(%rdi), %xmm0
@@ -12791,30 +12675,30 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm13 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1,2],mem[3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm15[1,3],ymm4[4,6],ymm15[5,7]
+; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3]
+; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm12[1,3],ymm4[4,6],ymm12[5,7]
+; AVX2-FP-NEXT:    vmovaps %ymm4, %ymm12
 ; AVX2-FP-NEXT:    vbroadcastss 432(%rdi), %ymm14
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm13[5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = ymm11[0,2],mem[1,3],ymm11[4,6],mem[5,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm14[0,2],ymm3[1,3],ymm14[4,6],ymm3[5,7]
 ; AVX2-FP-NEXT:    vbroadcastss 880(%rdi), %ymm13
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7]
@@ -12827,9 +12711,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm13[0,2],ymm14[1,3],ymm13[4,6],ymm14[5,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = ymm4[0,2],mem[1,3],ymm4[4,6],mem[5,7]
 ; AVX2-FP-NEXT:    vbroadcastss 1328(%rdi), %ymm5
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
@@ -12841,19 +12725,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FP-NEXT:    vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = ymm13[0,2],mem[1,3],ymm13[4,6],mem[5,7]
 ; AVX2-FP-NEXT:    vbroadcastss 1776(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7]
+; AVX2-FP-NEXT:    vmovaps %ymm1, %ymm9
 ; AVX2-FP-NEXT:    vbroadcastss 1552(%rdi), %ymm3
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
@@ -12861,13 +12746,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],mem[3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7]
-; AVX2-FP-NEXT:    vmovaps %ymm7, %ymm9
-; AVX2-FP-NEXT:    vmovaps %ymm8, %ymm11
+; AVX2-FP-NEXT:    vmovdqa %ymm10, %ymm8
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7]
+; AVX2-FP-NEXT:    vmovaps %ymm7, %ymm11
 ; AVX2-FP-NEXT:    vbroadcastss 1104(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
@@ -12880,9 +12765,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm7[1,3],ymm4[4,6],ymm7[5,7]
 ; AVX2-FP-NEXT:    vbroadcastss 656(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
@@ -12896,125 +12781,128 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm10[1,3],ymm3[4,6],ymm10[5,7]
 ; AVX2-FP-NEXT:    vbroadcastss 208(%rdi), %ymm2
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vbroadcastss 100(%rdi), %xmm0
-; AVX2-FP-NEXT:    vmovaps 64(%rdi), %xmm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm7 = [4,3,0,0]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm7, %ymm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
-; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7]
-; AVX2-FP-NEXT:    vpermps %ymm3, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 212(%rdi), %ymm4
+; AVX2-FP-NEXT:    vmovaps 64(%rdi), %xmm6
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
+; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm5 = [4,3,0,0]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm1, %ymm5, %ymm1
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm15 = [0,7,0,7,0,7,0,7]
+; AVX2-FP-NEXT:    vpermps %ymm3, %ymm15, %ymm1
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 212(%rdi), %ymm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-FP-NEXT:    vbroadcastss 324(%rdi), %xmm2
+; AVX2-FP-NEXT:    vmovaps 288(%rdi), %xmm1
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX2-FP-NEXT:    vpermps %ymm12, %ymm15, %ymm2
+; AVX2-FP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 436(%rdi), %ymm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-FP-NEXT:    vbroadcastss 548(%rdi), %xmm3
+; AVX2-FP-NEXT:    vmovaps 512(%rdi), %xmm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX2-FP-NEXT:    vpermps %ymm4, %ymm15, %ymm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 660(%rdi), %ymm4
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm2, %ymm7, %ymm3
-; AVX2-FP-NEXT:    vbroadcastss 324(%rdi), %xmm4
-; AVX2-FP-NEXT:    vmovaps 288(%rdi), %xmm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 436(%rdi), %ymm5
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm3, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm3, %ymm7, %ymm4
-; AVX2-FP-NEXT:    vbroadcastss 548(%rdi), %xmm5
-; AVX2-FP-NEXT:    vmovaps 512(%rdi), %xmm3
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
-; AVX2-FP-NEXT:    vpermps %ymm6, %ymm0, %ymm5
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 660(%rdi), %ymm6
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm4, %ymm7, %ymm5
-; AVX2-FP-NEXT:    vbroadcastss 772(%rdi), %xmm6
-; AVX2-FP-NEXT:    vmovaps 736(%rdi), %xmm4
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-FP-NEXT:    vbroadcastss 772(%rdi), %xmm4
+; AVX2-FP-NEXT:    vmovaps 736(%rdi), %xmm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
+; AVX2-FP-NEXT:    vpermps %ymm14, %ymm15, %ymm4
+; AVX2-FP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 884(%rdi), %ymm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-FP-NEXT:    vbroadcastss 996(%rdi), %xmm7
+; AVX2-FP-NEXT:    vmovaps 960(%rdi), %xmm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
+; AVX2-FP-NEXT:    vpermps %ymm8, %ymm15, %ymm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 1108(%rdi), %ymm8
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm5, %ymm0
+; AVX2-FP-NEXT:    vbroadcastss 1220(%rdi), %xmm7
+; AVX2-FP-NEXT:    vmovaps 1184(%rdi), %xmm14
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 884(%rdi), %ymm8
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm5, %ymm7, %ymm6
-; AVX2-FP-NEXT:    vbroadcastss 996(%rdi), %xmm8
-; AVX2-FP-NEXT:    vmovaps 960(%rdi), %xmm5
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3]
-; AVX2-FP-NEXT:    vpermps %ymm11, %ymm0, %ymm8
+; AVX2-FP-NEXT:    vpermps %ymm10, %ymm15, %ymm7
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 1332(%rdi), %ymm8
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm0, %ymm5, %ymm7
+; AVX2-FP-NEXT:    vbroadcastss 1444(%rdi), %xmm8
+; AVX2-FP-NEXT:    vmovaps 1408(%rdi), %xmm0
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
+; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    vmovaps %ymm9, %ymm11
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 1108(%rdi), %ymm9
+; AVX2-FP-NEXT:    vbroadcastss 1556(%rdi), %ymm9
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm6, %ymm7, %ymm8
-; AVX2-FP-NEXT:    vbroadcastss 1220(%rdi), %xmm9
-; AVX2-FP-NEXT:    vmovaps 1184(%rdi), %xmm6
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm9 = xmm6[0,1,2],xmm9[3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
-; AVX2-FP-NEXT:    vpermps %ymm13, %ymm0, %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 1332(%rdi), %ymm12
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm8, %ymm7, %ymm8
-; AVX2-FP-NEXT:    vbroadcastss 1444(%rdi), %xmm9
-; AVX2-FP-NEXT:    vmovaps 1408(%rdi), %xmm13
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 1556(%rdi), %ymm12
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FP-NEXT:    vpermps %ymm8, %ymm7, %ymm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vpermps %ymm7, %ymm5, %ymm7
 ; AVX2-FP-NEXT:    vbroadcastss 1668(%rdi), %xmm8
-; AVX2-FP-NEXT:    vmovaps 1632(%rdi), %xmm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX2-FP-NEXT:    vmovaps 1632(%rdi), %xmm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
-; AVX2-FP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FP-NEXT:    vpermps %ymm13, %ymm15, %ymm8
+; AVX2-FP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX2-FP-NEXT:    vbroadcastss 1780(%rdi), %ymm9
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
@@ -13026,367 +12914,370 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
 ; AVX2-FP-NEXT:    vbroadcastss 216(%rdi), %ymm8
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-FP-NEXT:    vmovaps 96(%rdi), %xmm12
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-FP-NEXT:    vmovaps 96(%rdi), %xmm9
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2]
 ; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm8 = mem[1,0,2,3,5,4,6,7]
 ; AVX2-FP-NEXT:    vextractf128 $1, %ymm8, %xmm8
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 320(%rdi), %xmm13
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm6 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm6
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm6 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
+; AVX2-FP-NEXT:    vbroadcastss 440(%rdi), %ymm7
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 320(%rdi), %xmm1
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
+; AVX2-FP-NEXT:    vmovaps 544(%rdi), %xmm8
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm2[3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-FP-NEXT:    vbroadcastss 664(%rdi), %ymm6
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovaps 768(%rdi), %xmm1
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FP-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3]
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
-; AVX2-FP-NEXT:    vbroadcastss 440(%rdi), %ymm8
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
+; AVX2-FP-NEXT:    vbroadcastss 888(%rdi), %ymm6
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 544(%rdi), %xmm2
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
+; AVX2-FP-NEXT:    vmovaps 992(%rdi), %xmm2
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
-; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FP-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
-; AVX2-FP-NEXT:    vbroadcastss 664(%rdi), %ymm8
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 768(%rdi), %xmm3
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
+; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm4, %xmm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
+; AVX2-FP-NEXT:    vbroadcastss 1112(%rdi), %ymm6
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 1216(%rdi), %xmm3
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm14[3]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2]
-; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FP-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
-; AVX2-FP-NEXT:    vbroadcastss 888(%rdi), %ymm8
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovaps 992(%rdi), %xmm4
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm4[0,1,2],xmm5[3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
-; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FP-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
-; AVX2-FP-NEXT:    vbroadcastss 1112(%rdi), %ymm10
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps 1216(%rdi), %xmm15
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm6[3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
 ; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
 ; AVX2-FP-NEXT:    # ymm6 = mem[1,0,2,3,5,4,6,7]
 ; AVX2-FP-NEXT:    vextractf128 $1, %ymm6, %xmm6
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7]
 ; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
-; AVX2-FP-NEXT:    vbroadcastss 1336(%rdi), %ymm7
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps 1440(%rdi), %xmm5
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm13[3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2]
-; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FP-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT:    vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
-; AVX2-FP-NEXT:    vbroadcastss 1560(%rdi), %ymm13
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT:    vmovaps 1664(%rdi), %xmm6
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm13 = xmm6[0,1,2],xmm14[3]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2]
-; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FP-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[1,0,3,3,5,4,7,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[0,1,0,3]
-; AVX2-FP-NEXT:    vbroadcastss 1784(%rdi), %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastss 136(%rdi), %xmm9
-; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm9 = xmm9[0],mem[1],xmm9[2,3]
-; AVX2-FP-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT:    vpermps 192(%rdi), %ymm0, %ymm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 80(%rdi), %ymm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7]
-; AVX2-FP-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastss 360(%rdi), %xmm9
-; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm9 = xmm9[0],mem[1],xmm9[2,3]
-; AVX2-FP-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT:    vpermps 416(%rdi), %ymm0, %ymm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 304(%rdi), %ymm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3]
-; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7]
-; AVX2-FP-NEXT:    vextractf128 $1, %ymm14, %xmm14
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastss 584(%rdi), %xmm1
+; AVX2-FP-NEXT:    vbroadcastss 1336(%rdi), %ymm10
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 1440(%rdi), %xmm4
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm10 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm10, %xmm10
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm10 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[1,0,3,3,5,4,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
+; AVX2-FP-NEXT:    vbroadcastss 1560(%rdi), %ymm12
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT:    vmovaps 1664(%rdi), %xmm14
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm5[3]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-FP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
+; AVX2-FP-NEXT:    vbroadcastss 1784(%rdi), %ymm12
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT:    vbroadcastss 136(%rdi), %xmm0
+; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpermps 192(%rdi), %ymm15, %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 80(%rdi), %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3]
+; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm11, %xmm11
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vbroadcastss 360(%rdi), %xmm0
+; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpermps 416(%rdi), %ymm15, %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 304(%rdi), %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm13[3]
+; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm13 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm13, %xmm13
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vbroadcastss 584(%rdi), %xmm0
+; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpermps 640(%rdi), %ymm15, %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 528(%rdi), %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
+; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm8 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm8, %xmm8
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vbroadcastss 808(%rdi), %xmm0
+; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpermps 864(%rdi), %ymm15, %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 752(%rdi), %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3]
+; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT:    vbroadcastss 1032(%rdi), %xmm1
 ; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX2-FP-NEXT:    # xmm1 = xmm1[0],mem[1],xmm1[2,3]
 ; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT:    vpermps 640(%rdi), %ymm0, %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 528(%rdi), %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3]
-; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7]
-; AVX2-FP-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
+; AVX2-FP-NEXT:    vpermps 1088(%rdi), %ymm15, %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 976(%rdi), %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
+; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastss 808(%rdi), %xmm2
+; AVX2-FP-NEXT:    vbroadcastss 1256(%rdi), %xmm2
 ; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
 ; AVX2-FP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
 ; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpermps 864(%rdi), %ymm0, %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 752(%rdi), %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3]
-; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7]
-; AVX2-FP-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3]
+; AVX2-FP-NEXT:    vpermps 1312(%rdi), %ymm15, %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 1200(%rdi), %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
+; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastss 1032(%rdi), %xmm3
+; AVX2-FP-NEXT:    vbroadcastss 1480(%rdi), %xmm3
 ; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
 ; AVX2-FP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
 ; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT:    vpermps 1088(%rdi), %ymm0, %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 976(%rdi), %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3]
-; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7]
-; AVX2-FP-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
+; AVX2-FP-NEXT:    vpermps 1536(%rdi), %ymm15, %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 1424(%rdi), %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
+; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm5, %xmm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
 ; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastss 1256(%rdi), %xmm4
+; AVX2-FP-NEXT:    vbroadcastss 1704(%rdi), %xmm4
 ; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
 ; AVX2-FP-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
+; AVX2-FP-NEXT:    vpermps 1760(%rdi), %ymm15, %ymm5
 ; AVX2-FP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT:    vpermps 1312(%rdi), %ymm0, %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 1200(%rdi), %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3]
-; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm15 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7]
-; AVX2-FP-NEXT:    vextractf128 $1, %ymm15, %xmm15
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastss 1480(%rdi), %xmm9
-; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm9 = xmm9[0],mem[1],xmm9[2,3]
-; AVX2-FP-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT:    vpermps 1536(%rdi), %ymm0, %ymm15
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 1424(%rdi), %ymm15
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3]
-; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm15 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7]
-; AVX2-FP-NEXT:    vextractf128 $1, %ymm15, %xmm15
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT:    vbroadcastss 1704(%rdi), %xmm9
-; AVX2-FP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm9 = xmm9[0],mem[1],xmm9[2,3]
-; AVX2-FP-NEXT:    vpermps 1760(%rdi), %ymm0, %ymm0
-; AVX2-FP-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FP-NEXT:    vbroadcastss 1648(%rdi), %ymm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
-; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT:    # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7]
-; AVX2-FP-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 192(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 128(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, (%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 224(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 160(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rsi)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 192(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 128(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, (%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 224(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 160(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rdx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 192(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 128(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, (%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 224(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 160(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rcx)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, (%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 128(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 192(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 224(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 160(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%r8)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 224(%r9)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 192(%r9)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 160(%r9)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 128(%r9)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%r9)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%r9)
-; AVX2-FP-NEXT:    vmovups (%rsp), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%r9)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, (%r9)
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vbroadcastss 1648(%rdi), %ymm5
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3]
+; AVX2-FP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm14 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX2-FP-NEXT:    # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7]
+; AVX2-FP-NEXT:    vextractf128 $1, %ymm14, %xmm14
+; AVX2-FP-NEXT:    vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 192(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 128(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, (%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 224(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 160(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 96(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%rsi)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 192(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 128(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, (%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 224(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 160(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 96(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%rdx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 192(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 128(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, (%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 224(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 160(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 96(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%rcx)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, (%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 128(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 192(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 224(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 160(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 96(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%r8)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 224(%r9)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 192(%r9)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 160(%r9)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 128(%r9)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 96(%r9)
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%r9)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%r9)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, (%r9)
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FP-NEXT:    vmovaps %ymm13, 224(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm7, 192(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm8, 160(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm10, 128(%rax)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 96(%rax)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 64(%rax)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, 32(%rax)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm6, (%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm12, 224(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm10, 192(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm6, 160(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm7, 128(%rax)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 96(%rax)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 64(%rax)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, 32(%rax)
+; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm5, (%rax)
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FP-NEXT:    vmovaps %ymm0, 224(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm5, 192(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm4, 160(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm3, 128(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm2, 96(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm14, 32(%rax)
-; AVX2-FP-NEXT:    vmovaps %ymm12, (%rax)
-; AVX2-FP-NEXT:    addq $2680, %rsp # imm = 0xA78
+; AVX2-FP-NEXT:    vmovaps %ymm4, 224(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm3, 192(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm2, 160(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm1, 128(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm8, 64(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm13, 32(%rax)
+; AVX2-FP-NEXT:    vmovaps %ymm11, (%rax)
+; AVX2-FP-NEXT:    addq $2648, %rsp # imm = 0xA58
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
 ; AVX2-FCP-LABEL: load_i32_stride7_vf64:
 ; AVX2-FCP:       # %bb.0:
-; AVX2-FCP-NEXT:    subq $2680, %rsp # imm = 0xA78
-; AVX2-FCP-NEXT:    vmovdqa 1216(%rdi), %ymm13
+; AVX2-FCP-NEXT:    subq $2648, %rsp # imm = 0xA58
+; AVX2-FCP-NEXT:    vmovdqa 1216(%rdi), %ymm9
+; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 1152(%rdi), %ymm4
 ; AVX2-FCP-NEXT:    vmovdqa 1120(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %ymm8
-; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 768(%rdi), %ymm13
 ; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %ymm6
 ; AVX2-FCP-NEXT:    vmovdqa 672(%rdi), %ymm7
-; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm12
+; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm2[6],ymm10[7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm11
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm11
+; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm10
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpbroadcastq 304(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %xmm3
@@ -13399,11 +13290,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm9
+; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm8
 ; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpbroadcastq 752(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm13, %ymm6
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 800(%rdi), %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa 832(%rdi), %xmm3
@@ -13415,14 +13307,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm6
-; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm8
 ; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vpbroadcastq 1200(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm13, %ymm15
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 1248(%rdi), %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa 1280(%rdi), %xmm3
@@ -13433,13 +13321,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 1600(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa 1600(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 1568(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm13
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7]
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm5
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm5
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa 1664(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13456,14 +13343,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpbroadcastq 80(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm14
-; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7]
 ; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %xmm2
@@ -13475,11 +13362,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7]
+; AVX2-FCP-NEXT:    vmovdqa 448(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
 ; AVX2-FCP-NEXT:    vmovdqa 544(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13501,10 +13388,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vmovdqa 992(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 992(%rdi), %ymm12
 ; AVX2-FCP-NEXT:    vpbroadcastq 976(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm12, (%rsp) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 1024(%rdi), %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa 1056(%rdi), %xmm3
@@ -13515,16 +13402,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 1376(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 1344(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovdqa 1376(%rdi), %ymm15
+; AVX2-FCP-NEXT:    vmovdqa 1344(%rdi), %ymm14
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 1440(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa 1440(%rdi), %ymm9
 ; AVX2-FCP-NEXT:    vpbroadcastq 1424(%rdi), %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqa 1472(%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vmovdqa 1504(%rdi), %xmm2
@@ -13536,197 +13423,184 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,2,2,2]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm2
-; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm1
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
+; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm1
 ; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm0
+; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27]
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm0
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6]
-; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 832(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,2,2,2]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 832(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 800(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa 736(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovdqa 736(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,2,2,2]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 1280(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 1248(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa 1184(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 1280(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa %ymm15, %ymm10
-; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 1248(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,2,2,2]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 1728(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 1696(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovdqa 1184(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6],ymm7[7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 1728(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa 1632(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm13[1],ymm5[2,3,4],ymm13[5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 1696(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,2,2,2]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovdqa 1632(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqa %ymm14, %ymm8
-; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm14
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,2,2,2]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 608(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 576(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovdqa 512(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,2,2,2]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 1056(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa 960(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 1056(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqa 1024(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,2,2,2]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vmovdqa 1504(%rdi), %ymm4
-; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovdqa 960(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 1504(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vmovdqa 1472(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovdqa 1408(%rdi), %ymm2
+; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,2,2,2]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27]
+; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm14
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm15
+; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7]
+; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 304(%rdi), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 232(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %xmm5
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm4[0],ymm12[2],ymm4[2]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
 ; AVX2-FCP-NEXT:    vpbroadcastd 428(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 752(%rdi), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 680(%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vmovdqa 704(%rdi), %xmm2
@@ -13734,30 +13608,31 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
+; AVX2-FCP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = ymm3[0],mem[0],ymm3[2],mem[2]
 ; AVX2-FCP-NEXT:    vpbroadcastd 876(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 1200(%rdi), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 1128(%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vmovdqa 1152(%rdi), %xmm2
 ; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2]
 ; AVX2-FCP-NEXT:    vpbroadcastd 1324(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 1648(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 1576(%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vmovdqa 1600(%rdi), %xmm2
@@ -13771,29 +13646,28 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
-; AVX2-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3]
+; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
 ; AVX2-FCP-NEXT:    vpbroadcastd 204(%rdi), %ymm6
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 528(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 456(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %xmm6
-; AVX2-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3]
+; AVX2-FCP-NEXT:    vmovdqa 480(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
 ; AVX2-FCP-NEXT:    vpbroadcastd 652(%rdi), %ymm15
@@ -13802,48 +13676,48 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 976(%rdi), %xmm0
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-FCP-NEXT:    vpalignr $8, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 904(%rdi), %xmm15
-; AVX2-FCP-NEXT:    vmovdqa 928(%rdi), %xmm11
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3]
+; AVX2-FCP-NEXT:    vmovdqa 928(%rdi), %xmm12
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2]
-; AVX2-FCP-NEXT:    vpbroadcastd 1100(%rdi), %ymm13
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm15 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-FCP-NEXT:    vpbroadcastd 1100(%rdi), %ymm14
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovdqa 1424(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm9
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm13 = ymm5[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7]
 ; AVX2-FCP-NEXT:    vpbroadcastd 1352(%rdi), %xmm15
 ; AVX2-FCP-NEXT:    vmovdqa 1376(%rdi), %xmm0
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
-; AVX2-FCP-NEXT:    vpbroadcastd 1548(%rdi), %ymm14
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
+; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm1[0],ymm11[2],ymm1[2]
+; AVX2-FCP-NEXT:    vpbroadcastd 1548(%rdi), %ymm13
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm13 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm13 = ymm12[0,2],ymm4[1,3],ymm12[4,6],ymm4[5,7]
+; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3]
+; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7]
 ; AVX2-FCP-NEXT:    vbroadcastss 432(%rdi), %ymm14
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7]
+; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
@@ -13852,11 +13726,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm3[1,3],ymm15[4,6],ymm3[5,7]
-; AVX2-FCP-NEXT:    vmovaps %ymm3, %ymm13
-; AVX2-FCP-NEXT:    vbroadcastss 880(%rdi), %ymm12
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm5 = ymm3[0,2],ymm10[1,3],ymm3[4,6],ymm10[5,7]
+; AVX2-FCP-NEXT:    vbroadcastss 880(%rdi), %ymm13
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -13868,8 +13741,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm12[1,3],ymm14[4,6],ymm12[5,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm8[1,3],ymm14[4,6],ymm8[5,7]
+; AVX2-FCP-NEXT:    vmovaps %ymm8, %ymm13
 ; AVX2-FCP-NEXT:    vbroadcastss 1328(%rdi), %ymm5
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
@@ -13888,26 +13761,26 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0],ymm6[1],ymm7[2,3,4,5,6,7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm2 = ymm11[0,2],ymm1[1,3],ymm11[4,6],ymm1[5,7]
 ; AVX2-FCP-NEXT:    vbroadcastss 1552(%rdi), %ymm3
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],mem[3]
+; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
 ; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovdqa %ymm10, %ymm8
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7]
-; AVX2-FCP-NEXT:    vmovaps %ymm7, %ymm10
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm8[1,3],ymm11[4,6],ymm8[5,7]
 ; AVX2-FCP-NEXT:    vbroadcastss 1104(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
@@ -13920,7 +13793,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7]
 ; AVX2-FCP-NEXT:    vbroadcastss 656(%rdi), %ymm2
@@ -13935,285 +13808,327 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm12[1,3],ymm4[4,6],ymm12[5,7]
 ; AVX2-FCP-NEXT:    vbroadcastss 208(%rdi), %ymm2
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vbroadcastss 100(%rdi), %xmm0
-; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %xmm9
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm4 = [4,3,0,0]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm1
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7]
-; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 212(%rdi), %ymm3
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-FCP-NEXT:    vmovaps 64(%rdi), %xmm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm0[3]
+; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = [4,3,0,0]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm15 = [0,7,0,7,0,7,0,7]
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm15, %ymm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 212(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm1
-; AVX2-FCP-NEXT:    vbroadcastss 324(%rdi), %xmm3
-; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm0, %ymm2
+; AVX2-FCP-NEXT:    vbroadcastss 324(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vmovaps 288(%rdi), %xmm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7]
 ; AVX2-FCP-NEXT:    vbroadcastss 436(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm0, %ymm4
 ; AVX2-FCP-NEXT:    vbroadcastss 548(%rdi), %xmm5
-; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %xmm3
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
-; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm0, %ymm5
+; AVX2-FCP-NEXT:    vmovaps 512(%rdi), %xmm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm15, %ymm5
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
 ; AVX2-FCP-NEXT:    vbroadcastss 660(%rdi), %ymm6
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm0, %ymm5
 ; AVX2-FCP-NEXT:    vbroadcastss 772(%rdi), %xmm6
-; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %xmm5
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3]
-; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm0, %ymm6
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vmovaps 736(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
 ; AVX2-FCP-NEXT:    vbroadcastss 884(%rdi), %ymm7
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm5, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm0, %ymm6
 ; AVX2-FCP-NEXT:    vbroadcastss 996(%rdi), %xmm7
-; AVX2-FCP-NEXT:    vmovaps 960(%rdi), %xmm6
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
-; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm0, %ymm7
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-FCP-NEXT:    vmovaps 960(%rdi), %xmm5
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm7[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm15, %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
 ; AVX2-FCP-NEXT:    vbroadcastss 1108(%rdi), %ymm8
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm1
-; AVX2-FCP-NEXT:    vbroadcastss 1220(%rdi), %xmm8
-; AVX2-FCP-NEXT:    vmovaps 1184(%rdi), %xmm7
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
-; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm0, %ymm8
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FCP-NEXT:    vmovaps %ymm12, %ymm15
-; AVX2-FCP-NEXT:    vbroadcastss 1332(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm1
-; AVX2-FCP-NEXT:    vbroadcastss 1444(%rdi), %xmm10
-; AVX2-FCP-NEXT:    vmovaps 1408(%rdi), %xmm8
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm0, %ymm10
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm0, %ymm6
+; AVX2-FCP-NEXT:    vbroadcastss 1220(%rdi), %xmm7
+; AVX2-FCP-NEXT:    vmovaps 1184(%rdi), %xmm10
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm15, %ymm7
+; AVX2-FCP-NEXT:    vmovaps %ymm13, %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 1332(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm0, %ymm6
+; AVX2-FCP-NEXT:    vbroadcastss 1444(%rdi), %xmm7
+; AVX2-FCP-NEXT:    vmovaps 1408(%rdi), %xmm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 1556(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm13, %ymm15, %ymm7
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 1556(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm0, %ymm6
+; AVX2-FCP-NEXT:    vbroadcastss 1668(%rdi), %xmm7
+; AVX2-FCP-NEXT:    vmovaps 1632(%rdi), %xmm0
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 1780(%rdi), %ymm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm6 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
+; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm12 = [1,0,3,3,1,0,7,7]
+; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm12, %ymm6
+; AVX2-FCP-NEXT:    vbroadcastss 216(%rdi), %ymm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %xmm7
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
+; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm8 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FCP-NEXT:    vextractf128 $1, %ymm8, %xmm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %xmm8
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FCP-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm3, %ymm12, %ymm3
+; AVX2-FCP-NEXT:    vbroadcastss 440(%rdi), %ymm6
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm1, %ymm4, %ymm1
-; AVX2-FCP-NEXT:    vbroadcastss 1668(%rdi), %xmm4
-; AVX2-FCP-NEXT:    vmovaps 1632(%rdi), %xmm12
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
-; AVX2-FCP-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 1780(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm10[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %xmm6
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm2[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
+; AVX2-FCP-NEXT:    vbroadcastss 664(%rdi), %ymm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7]
-; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm1 = [1,0,3,3,1,0,7,7]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vbroadcastss 216(%rdi), %ymm10
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm10[7]
-; AVX2-FCP-NEXT:    vmovaps 96(%rdi), %xmm10
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2]
-; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 320(%rdi), %xmm4
-; AVX2-FCP-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
+; AVX2-FCP-NEXT:    vmovaps 768(%rdi), %xmm3
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm4[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FCP-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm12, %ymm2
+; AVX2-FCP-NEXT:    vbroadcastss 888(%rdi), %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT:    vmovaps 992(%rdi), %xmm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm5[3]
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
 ; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
 ; AVX2-FCP-NEXT:    vextractf128 $1, %ymm4, %xmm4
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vbroadcastss 440(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7]
+; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm12, %ymm4
+; AVX2-FCP-NEXT:    vbroadcastss 1112(%rdi), %ymm5
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-FCP-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 544(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
-; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FCP-NEXT:    vextractf128 $1, %ymm4, %xmm4
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $34, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT:    vbroadcastss 664(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm3, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 768(%rdi), %xmm4
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm5[3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
+; AVX2-FCP-NEXT:    vmovaps 1216(%rdi), %xmm2
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm10[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2]
 ; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
 ; AVX2-FCP-NEXT:    vextractf128 $1, %ymm5, %xmm5
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm1, %ymm5
-; AVX2-FCP-NEXT:    vbroadcastss 888(%rdi), %ymm9
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT:    vmovaps 992(%rdi), %xmm9
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm6[3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
-; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FCP-NEXT:    vextractf128 $1, %ymm5, %xmm5
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm1, %ymm5
-; AVX2-FCP-NEXT:    vbroadcastss 1112(%rdi), %ymm6
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 1216(%rdi), %xmm3
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm7[3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
-; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm7 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FCP-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
-; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm1, %ymm7
-; AVX2-FCP-NEXT:    vbroadcastss 1336(%rdi), %ymm15
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm15[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 1440(%rdi), %xmm5
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
-; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm15 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FCP-NEXT:    vextractf128 $1, %ymm15, %xmm15
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm15 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm15, %ymm1, %ymm15
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm5 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm5, %ymm12, %ymm5
+; AVX2-FCP-NEXT:    vbroadcastss 1336(%rdi), %ymm10
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 1440(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm10 = xmm4[0,1,2],xmm14[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2]
+; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm14 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FCP-NEXT:    vextractf128 $1, %ymm14, %xmm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm14, %ymm12, %ymm14
 ; AVX2-FCP-NEXT:    vbroadcastss 1560(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5,6],ymm11[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovaps 1664(%rdi), %xmm15
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm11 = xmm15[0,1,2],xmm12[3]
-; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2]
-; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm12 = mem[1,0,2,3,5,4,6,7]
-; AVX2-FCP-NEXT:    vextractf128 $1, %ymm12, %xmm12
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
-; AVX2-FCP-NEXT:    vpermps %ymm12, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovaps 1664(%rdi), %xmm14
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3]
+; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-FCP-NEXT:    vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = mem[1,0,2,3,5,4,6,7]
+; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm11[0,1],xmm0[2,3]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7]
+; AVX2-FCP-NEXT:    vpermps %ymm11, %ymm12, %ymm11
 ; AVX2-FCP-NEXT:    vbroadcastss 1784(%rdi), %ymm12
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm12[7]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 136(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[1],xmm1[2,3]
-; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpermps 192(%rdi), %ymm0, %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 136(%rdi), %xmm0
+; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpermps 192(%rdi), %ymm15, %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
 ; AVX2-FCP-NEXT:    vbroadcastss 80(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3]
 ; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
 ; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
 ; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 360(%rdi), %xmm1
-; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[1],xmm1[2,3]
-; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpermps 416(%rdi), %ymm0, %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 360(%rdi), %xmm0
+; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpermps 416(%rdi), %ymm15, %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
 ; AVX2-FCP-NEXT:    vbroadcastss 304(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm11 = xmm11[0,1,2],mem[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm8[3]
 ; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm13 = mem[2,3,2,3,6,7,6,7]
 ; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7]
 ; AVX2-FCP-NEXT:    vextractf128 $1, %ymm13, %xmm13
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 584(%rdi), %xmm1
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 584(%rdi), %xmm0
+; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpermps 640(%rdi), %ymm15, %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 528(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm11[0,1,2],xmm6[3]
+; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 808(%rdi), %xmm0
+; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpermps 864(%rdi), %ymm15, %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 752(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3]
+; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 1032(%rdi), %xmm0
+; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vpermps 1088(%rdi), %ymm15, %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 976(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 1256(%rdi), %xmm1
 ; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[1],xmm1[2,3]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT:    vpermps 640(%rdi), %ymm0, %ymm11
+; AVX2-FCP-NEXT:    vpermps 1312(%rdi), %ymm15, %ymm11
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 528(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vbroadcastss 1200(%rdi), %ymm11
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3]
 ; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
@@ -14222,13 +14137,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3]
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 808(%rdi), %xmm2
+; AVX2-FCP-NEXT:    vbroadcastss 1480(%rdi), %xmm2
 ; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
 ; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
 ; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpermps 864(%rdi), %ymm0, %ymm11
+; AVX2-FCP-NEXT:    vpermps 1536(%rdi), %ymm15, %ymm11
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 752(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vbroadcastss 1424(%rdi), %ymm11
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3]
 ; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
@@ -14236,170 +14151,126 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
 ; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 1032(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
-; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpermps 1088(%rdi), %ymm0, %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 976(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm9[3]
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 1704(%rdi), %xmm4
+; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT:    # xmm4 = xmm4[0],mem[1],xmm4[2,3]
+; AVX2-FCP-NEXT:    vpermps 1760(%rdi), %ymm15, %ymm11
+; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FCP-NEXT:    vbroadcastss 1648(%rdi), %ymm11
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm14[3]
 ; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm14 = mem[2,3,2,3,6,7,6,7]
 ; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT:    # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7]
 ; AVX2-FCP-NEXT:    vextractf128 $1, %ymm14, %xmm14
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 1256(%rdi), %xmm2
-; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[1],xmm2[2,3]
-; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT:    vpermps 1312(%rdi), %ymm0, %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 1200(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3]
-; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 1480(%rdi), %xmm3
-; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm3 = xmm3[0],mem[1],xmm3[2,3]
-; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT:    vpermps 1536(%rdi), %ymm0, %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 1424(%rdi), %ymm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3]
-; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 1704(%rdi), %xmm5
-; AVX2-FCP-NEXT:    vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX2-FCP-NEXT:    # xmm5 = xmm5[0],mem[1],xmm5[2,3]
-; AVX2-FCP-NEXT:    vpermps 1760(%rdi), %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT:    vbroadcastss 1648(%rdi), %ymm5
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3]
-; AVX2-FCP-NEXT:    vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT:    vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
-; AVX2-FCP-NEXT:    vextractf128 $1, %ymm11, %xmm11
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 192(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 128(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 224(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 160(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rsi)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 192(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 128(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 224(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 160(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rdx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 192(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 128(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 224(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 160(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rcx)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, (%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 128(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 192(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 224(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 160(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%r8)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 224(%r9)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 192(%r9)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 160(%r9)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 128(%r9)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%r9)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%r9)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%r9)
-; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT:    vmovaps %ymm5, (%r9)
+; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm11, 192(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm11, 128(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm11, 64(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm11, (%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm11, 224(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm11, 160(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm11, 96(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm11, 32(%rsi)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 192(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 128(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 64(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, (%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 224(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm11, 160(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm11, 96(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm11, 32(%rdx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 192(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 128(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 64(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, (%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 224(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 160(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 96(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 32(%rcx)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, (%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 64(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 128(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 192(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 224(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 160(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 96(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 32(%r8)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 224(%r9)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 192(%r9)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 160(%r9)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 128(%r9)
+; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 96(%r9)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 64(%r9)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, 32(%r9)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm9, (%r9)
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT:    vmovaps %ymm12, 224(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm8, 192(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm7, 160(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm6, 128(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm10, 192(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 160(%rax)
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovaps %ymm5, 128(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm5, 96(%rax)
-; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm5, 64(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm5, 32(%rax)
 ; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rax)
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT:    vmovaps %ymm0, 224(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm3, 192(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm2, 160(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm14, 128(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm4, 96(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm1, 64(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm4, 224(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm2, 192(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm1, 160(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm0, 128(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm3, 96(%rax)
+; AVX2-FCP-NEXT:    vmovaps %ymm8, 64(%rax)
 ; AVX2-FCP-NEXT:    vmovaps %ymm13, 32(%rax)
-; AVX2-FCP-NEXT:    vmovaps %ymm10, (%rax)
-; AVX2-FCP-NEXT:    addq $2680, %rsp # imm = 0xA78
+; AVX2-FCP-NEXT:    vmovaps %ymm7, (%rax)
+; AVX2-FCP-NEXT:    addq $2648, %rsp # imm = 0xA58
 ; AVX2-FCP-NEXT:    vzeroupper
 ; AVX2-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 9e70aef86885..e2a33019fffe 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -471,9 +471,8 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vpackusdw %xmm8, %xmm7, %xmm7
 ; AVX-NEXT:    vpackusdw %xmm7, %xmm7, %xmm7
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6],xmm5[7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,11,u,u,u,u,u,u,u,u,u,u,4,5,12,13]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,11,2,3,6,7,u,u,u,u,u,u,4,5,12,13]
 ; AVX-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
 ; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm6[5],xmm0[6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
index 32825f291e98..2b268af107f6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
@@ -1939,137 +1939,87 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
 ; AVX2-NEXT:    vmovdqa (%rax), %xmm5
-; AVX2-NEXT:    vmovdqa (%r10), %xmm7
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
+; AVX2-NEXT:    vmovdqa (%r10), %xmm8
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
 ; AVX2-NEXT:    vmovdqa (%r9), %xmm10
 ; AVX2-NEXT:    vmovdqa (%r8), %xmm11
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
 ; AVX2-NEXT:    vmovdqa (%rcx), %xmm12
 ; AVX2-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm8[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[0,1,1,3]
-; AVX2-NEXT:    vmovdqa (%rsi), %xmm15
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm9[0],zero,xmm9[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7]
-; AVX2-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7]
-; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa (%rcx), %ymm4
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7]
-; AVX2-NEXT:    vmovdqa (%r8), %ymm6
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
-; AVX2-NEXT:    vmovdqa (%r9), %ymm8
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX2-NEXT:    vmovdqa (%rsi), %xmm14
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm15
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa (%r10), %ymm9
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3],ymm11[4,5,6],ymm5[7]
-; AVX2-NEXT:    vmovdqa (%rax), %ymm14
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm11[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7]
+; AVX2-NEXT:    vmovdqa (%rdx), %ymm4
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX2-NEXT:    vmovdqa (%rcx), %ymm6
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3]
+; AVX2-NEXT:    vmovdqa (%r8), %ymm7
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm11[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[8],ymm14[8],ymm9[9],ymm14[9],ymm9[10],ymm14[10],ymm9[11],ymm14[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm10[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm11 = ymm12[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm13[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3]
-; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vmovdqa (%r9), %ymm9
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; AVX2-NEXT:    vmovdqa (%r10), %ymm5
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX2-NEXT:    vmovdqa (%rax), %ymm10
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm11[2],xmm8[2],xmm11[3],xmm8[3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[8],ymm9[8],ymm7[9],ymm9[9],ymm7[10],ymm9[10],ymm7[11],ymm9[11]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm14 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm15[2],ymm3[3],ymm15[3],ymm3[6],ymm15[6],ymm3[7],ymm15[7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm6[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm0[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5],ymm14[6,7]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[12],ymm9[12],ymm7[13],ymm9[13],ymm7[14],ymm9[14],ymm7[15],ymm9[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm6 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm12[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm9 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm5[0,2,2,3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm4 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm3[0],ymm15[0],ymm3[1],ymm15[1],ymm3[4],ymm15[4],ymm3[5],ymm15[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    vmovdqa %ymm1, 128(%rax)
-; AVX2-NEXT:    vmovdqa %ymm0, 192(%rax)
-; AVX2-NEXT:    vmovdqa %ymm4, 224(%rax)
-; AVX2-NEXT:    vmovdqa %ymm11, 160(%rax)
-; AVX2-NEXT:    vmovdqa %ymm7, 64(%rax)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 96(%rax)
+; AVX2-NEXT:    vmovdqa %ymm0, 128(%rax)
+; AVX2-NEXT:    vmovdqa %ymm1, 192(%rax)
+; AVX2-NEXT:    vmovdqa %ymm6, 224(%rax)
+; AVX2-NEXT:    vmovdqa %ymm2, 160(%rax)
+; AVX2-NEXT:    vmovdqa %ymm8, 64(%rax)
+; AVX2-NEXT:    vmovdqa %ymm12, 96(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -2081,137 +2031,87 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FP:       # %bb.0:
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm1
 ; AVX2-FP-NEXT:    vmovdqa (%rax), %xmm5
-; AVX2-FP-NEXT:    vmovdqa (%r10), %xmm7
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
+; AVX2-FP-NEXT:    vmovdqa (%r10), %xmm8
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
 ; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm10
 ; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm11
 ; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
 ; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm12
 ; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm8[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[0,1,1,3]
-; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm15
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm9[0],zero,xmm9[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7]
-; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm4
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm6
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
-; AVX2-FP-NEXT:    vmovdqa (%r9), %ymm8
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm14
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm15
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa (%r10), %ymm9
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3],ymm11[4,5,6],ymm5[7]
-; AVX2-FP-NEXT:    vmovdqa (%rax), %ymm14
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm11[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm4
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm6
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3]
+; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm7
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm11[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[8],ymm14[8],ymm9[9],ymm14[9],ymm9[10],ymm14[10],ymm9[11],ymm14[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm10[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm12[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm13[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3]
-; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7]
+; AVX2-FP-NEXT:    vmovdqa (%r9), %ymm9
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; AVX2-FP-NEXT:    vmovdqa (%r10), %ymm5
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX2-FP-NEXT:    vmovdqa (%rax), %ymm10
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm11[2],xmm8[2],xmm11[3],xmm8[3]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[8],ymm9[8],ymm7[9],ymm9[9],ymm7[10],ymm9[10],ymm7[11],ymm9[11]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm14 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm15[2],ymm3[3],ymm15[3],ymm3[6],ymm15[6],ymm3[7],ymm15[7]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm6[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm0[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5],ymm14[6,7]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[12],ymm9[12],ymm7[13],ymm9[13],ymm7[14],ymm9[14],ymm7[15],ymm9[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm6 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm12[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm9 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm5[0,2,2,3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm4 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm3[0],ymm15[0],ymm3[1],ymm15[1],ymm3[4],ymm15[4],ymm3[5],ymm15[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FP-NEXT:    vmovdqa %ymm1, 128(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm0, 192(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm4, 224(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm11, 160(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm7, 64(%rax)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm0, 128(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm1, 192(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm6, 224(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm2, 160(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm8, 64(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm12, 96(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -3558,289 +3458,189 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-LABEL: store_i16_stride8_vf32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $264, %rsp # imm = 0x108
+; AVX2-NEXT:    subq $232, %rsp
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    vmovdqa (%rax), %xmm1
+; AVX2-NEXT:    vmovdqa (%rax), %xmm0
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovdqa 32(%rax), %xmm10
+; AVX2-NEXT:    vmovdqa (%r10), %xmm1
 ; AVX2-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa 32(%rax), %xmm11
-; AVX2-NEXT:    vmovdqa (%r10), %xmm0
+; AVX2-NEXT:    vmovdqa 32(%r10), %xmm9
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vmovdqa (%r9), %xmm0
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-NEXT:    vmovdqa (%r9), %xmm3
-; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vmovdqa (%r8), %xmm2
 ; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
-; AVX2-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
+; AVX2-NEXT:    vmovdqa (%rcx), %xmm4
+; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vmovdqa (%rdx), %xmm3
 ; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm15[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
-; AVX2-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX2-NEXT:    vmovdqa (%rsi), %xmm12
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm13
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm14 = xmm1[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7]
-; AVX2-NEXT:    vmovdqa 32(%r10), %xmm9
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 32(%r9), %xmm10
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqa 32(%r8), %xmm14
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm15[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm3[2],xmm14[3],xmm3[3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1]
+; AVX2-NEXT:    vmovdqa 32(%r9), %xmm8
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-NEXT:    vmovdqa 32(%r8), %xmm15
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
-; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm15
-; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm8
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
-; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm7
-; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm6
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2,3,4],ymm3[5],ymm12[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm2[0,0,2,1]
+; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm6
+; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm5
+; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7]
+; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm14[2],xmm3[2],xmm14[3],xmm3[3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm3[0,0,2,1]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm14
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
-; AVX2-NEXT:    vmovdqa 32(%r8), %ymm14
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 32(%r9), %ymm15
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX2-NEXT:    vmovdqa 32(%r8), %ymm15
+; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqa 32(%r10), %ymm5
+; AVX2-NEXT:    vmovdqa 32(%r9), %ymm4
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX2-NEXT:    vmovdqa 32(%r10), %ymm6
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm7[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3]
-; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm8 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm8 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7]
+; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 32(%rax), %ymm10
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm7[2],xmm9[3],xmm7[3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm3[0,1,1,3,4,5,5,7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[12],ymm10[12],ymm6[13],ymm10[13],ymm6[14],ymm10[14],ymm6[15],ymm10[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm4[4],ymm15[5],ymm4[5],ymm15[6],ymm4[6],ymm15[7],ymm4[7],ymm15[12],ymm4[12],ymm15[13],ymm4[13],ymm15[14],ymm4[14],ymm15[15],ymm4[15]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7]
-; AVX2-NEXT:    vmovdqa 32(%rdx), %ymm11
-; AVX2-NEXT:    vmovdqa 32(%rcx), %ymm12
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3]
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm13
-; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm2
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
-; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm11
+; AVX2-NEXT:    vmovdqa 32(%rdx), %ymm13
+; AVX2-NEXT:    vmovdqa 32(%rcx), %ymm9
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm9[4],ymm13[5],ymm9[5],ymm13[6],ymm9[6],ymm13[7],ymm9[7],ymm13[12],ymm9[12],ymm13[13],ymm9[13],ymm13[14],ymm9[14],ymm13[15],ymm9[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm14[4],ymm11[4],ymm14[5],ymm11[5],ymm14[6],ymm11[6],ymm14[7],ymm11[7],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7]
+; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,5,7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm4[0],ymm15[1],ymm4[1],ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[8],ymm4[8],ymm15[9],ymm4[9],ymm15[10],ymm4[10],ymm15[11],ymm4[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm13[0],ymm9[0],ymm13[1],ymm9[1],ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[8],ymm9[8],ymm13[9],ymm9[9],ymm13[10],ymm9[10],ymm13[11],ymm9[11]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa (%r10), %ymm12
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqa (%rax), %ymm13
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm1[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
 ; AVX2-NEXT:    vmovdqa (%r8), %ymm8
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7]
 ; AVX2-NEXT:    vmovdqa (%r9), %ymm7
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7]
-; AVX2-NEXT:    vmovdqa (%rdx), %ymm9
-; AVX2-NEXT:    vmovdqa (%rcx), %ymm6
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm14 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3]
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX2-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm3 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7]
+; AVX2-NEXT:    vmovdqa (%r10), %ymm5
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
+; AVX2-NEXT:    vmovdqa (%rax), %ymm4
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm9 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[4],ymm3[4],ymm6[5],ymm3[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm13
+; AVX2-NEXT:    vmovdqa (%rsi), %ymm14
+; AVX2-NEXT:    vmovdqa (%rdx), %ymm15
+; AVX2-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[12],ymm2[12],ymm15[13],ymm2[13],ymm15[14],ymm2[14],ymm15[15],ymm2[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    vmovdqa %ymm0, 160(%rax)
-; AVX2-NEXT:    vmovdqa %ymm3, 128(%rax)
-; AVX2-NEXT:    vmovdqa %ymm1, 224(%rax)
-; AVX2-NEXT:    vmovdqa %ymm15, 192(%rax)
+; AVX2-NEXT:    vmovdqa %ymm1, 160(%rax)
+; AVX2-NEXT:    vmovdqa %ymm5, 128(%rax)
+; AVX2-NEXT:    vmovdqa %ymm0, 224(%rax)
+; AVX2-NEXT:    vmovdqa %ymm9, 192(%rax)
 ; AVX2-NEXT:    vmovdqa %ymm11, 416(%rax)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 384(%rax)
+; AVX2-NEXT:    vmovdqa %ymm10, 384(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 480(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 448(%rax)
-; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 96(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 96(%rax)
+; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 64(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 288(%rax)
@@ -3854,295 +3654,195 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vmovaps %ymm0, (%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
-; AVX2-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX2-NEXT:    addq $232, %rsp
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: store_i16_stride8_vf32:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $264, %rsp # imm = 0x108
+; AVX2-FP-NEXT:    subq $232, %rsp
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-FP-NEXT:    vmovdqa (%rax), %xmm1
+; AVX2-FP-NEXT:    vmovdqa (%rax), %xmm0
+; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 32(%rax), %xmm10
+; AVX2-FP-NEXT:    vmovdqa (%r10), %xmm1
 ; AVX2-FP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 32(%rax), %xmm11
-; AVX2-FP-NEXT:    vmovdqa (%r10), %xmm0
+; AVX2-FP-NEXT:    vmovdqa 32(%r10), %xmm9
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm0
 ; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm3
-; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm2
 ; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
+; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm4
+; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm3
 ; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm15[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
-; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm12
 ; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm13
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm1[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7]
-; AVX2-FP-NEXT:    vmovdqa 32(%r10), %xmm9
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 32(%r9), %xmm10
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqa 32(%r8), %xmm14
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm15[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm3[2],xmm14[3],xmm3[3]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1]
+; AVX2-FP-NEXT:    vmovdqa 32(%r9), %xmm8
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-FP-NEXT:    vmovdqa 32(%r8), %xmm15
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %xmm15
-; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %xmm8
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
-; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %xmm7
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm6
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2,3,4],ymm3[5],ymm12[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm2[0,0,2,1]
+; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %xmm6
+; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %xmm5
+; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm14[2],xmm3[2],xmm14[3],xmm3[3]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm3[0,0,2,1]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm14
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
-; AVX2-FP-NEXT:    vmovdqa 32(%r8), %ymm14
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 32(%r9), %ymm15
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FP-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    vmovdqa 32(%r8), %ymm15
+; AVX2-FP-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
 ; AVX2-FP-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqa 32(%r10), %ymm5
+; AVX2-FP-NEXT:    vmovdqa 32(%r9), %ymm4
+; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX2-FP-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX2-FP-NEXT:    vmovdqa 32(%r10), %ymm6
 ; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
 ; AVX2-FP-NEXT:    # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm7[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3]
-; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm8 # 16-byte Folded Reload
-; AVX2-FP-NEXT:    # xmm8 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 32(%rax), %ymm10
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm7[2],xmm9[3],xmm7[3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm3[0,1,1,3,4,5,5,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[12],ymm10[12],ymm6[13],ymm10[13],ymm6[14],ymm10[14],ymm6[15],ymm10[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm4[4],ymm15[5],ymm4[5],ymm15[6],ymm4[6],ymm15[7],ymm4[7],ymm15[12],ymm4[12],ymm15[13],ymm4[13],ymm15[14],ymm4[14],ymm15[15],ymm4[15]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm11
-; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %ymm12
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3]
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm13
-; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %ymm2
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %ymm11
+; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm13
+; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %ymm9
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm9[4],ymm13[5],ymm9[5],ymm13[6],ymm9[6],ymm13[7],ymm9[7],ymm13[12],ymm9[12],ymm13[13],ymm9[13],ymm13[14],ymm9[14],ymm13[15],ymm9[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm14[4],ymm11[4],ymm14[5],ymm11[5],ymm14[6],ymm11[6],ymm14[7],ymm11[7],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[0,2,2,3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,5,7]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm4[0],ymm15[1],ymm4[1],ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[8],ymm4[8],ymm15[9],ymm4[9],ymm15[10],ymm4[10],ymm15[11],ymm4[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm13[0],ymm9[0],ymm13[1],ymm9[1],ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[8],ymm9[8],ymm13[9],ymm9[9],ymm13[10],ymm9[10],ymm13[11],ymm9[11]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa (%r10), %ymm12
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqa (%rax), %ymm13
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm1[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
 ; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm8
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7]
 ; AVX2-FP-NEXT:    vmovdqa (%r9), %ymm7
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm9
-; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm6
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm14 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3]
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm3 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7]
+; AVX2-FP-NEXT:    vmovdqa (%r10), %ymm5
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vmovdqa (%rax), %ymm4
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm9 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[4],ymm3[4],ymm6[5],ymm3[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm13
+; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm14
+; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm15
+; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[12],ymm2[12],ymm15[13],ymm2[13],ymm15[14],ymm2[14],ymm15[15],ymm2[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm3[0,2,2,3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FP-NEXT:    vmovdqa %ymm0, 160(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm3, 128(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm1, 224(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm15, 192(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm1, 160(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm5, 128(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm0, 224(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm9, 192(%rax)
 ; AVX2-FP-NEXT:    vmovdqa %ymm11, 416(%rax)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm0, 384(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm10, 384(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 480(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 448(%rax)
-; AVX2-FP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rax)
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 64(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 288(%rax)
@@ -4156,7 +3856,7 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rax)
-; AVX2-FP-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX2-FP-NEXT:    addq $232, %rsp
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
@@ -6964,533 +6664,339 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-LABEL: store_i16_stride8_vf64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $744, %rsp # imm = 0x2E8
+; AVX2-NEXT:    subq $712, %rsp # imm = 0x2C8
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    vmovdqa (%rax), %xmm6
+; AVX2-NEXT:    vmovdqa (%rax), %xmm5
 ; AVX2-NEXT:    vmovdqa 32(%rax), %xmm0
-; AVX2-NEXT:    vmovdqa (%r10), %xmm7
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX2-NEXT:    vmovdqa (%r10), %xmm6
+; AVX2-NEXT:    vmovdqa 32(%r10), %xmm1
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
 ; AVX2-NEXT:    vmovdqa (%r9), %xmm8
 ; AVX2-NEXT:    vmovdqa (%r8), %xmm9
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-NEXT:    vmovdqa (%rcx), %xmm10
 ; AVX2-NEXT:    vmovdqa (%rdx), %xmm11
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
 ; AVX2-NEXT:    vmovdqa (%rsi), %xmm12
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm13
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm7[0],xmm14[1],xmm7[1]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7]
-; AVX2-NEXT:    vmovdqa 32(%r10), %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 32(%r9), %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm15 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqa 32(%r8), %xmm3
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm5[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm14[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
-; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm4
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7]
-; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm5
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7]
-; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm7
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3],ymm15[4,5],ymm2[6,7]
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 32(%r9), %xmm2
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX2-NEXT:    vmovdqa 32(%r8), %xmm4
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm14[2],xmm7[2],xmm14[3],xmm7[3]
+; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm15[0,0,2,1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3],ymm7[4,5],ymm14[6,7]
+; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm7
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm5
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm8
 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7]
-; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3],ymm13[4,5],ymm11[6,7]
+; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7]
+; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX2-NEXT:    vmovdqa 64(%rax), %xmm6
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm11[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3]
-; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm9
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7]
-; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm11[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7]
-; AVX2-NEXT:    vmovdqa 64(%rax), %xmm6
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7]
-; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 64(%r10), %xmm8
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
+; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovdqa 64(%r10), %xmm9
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX2-NEXT:    vmovdqa 64(%r9), %xmm0
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
 ; AVX2-NEXT:    vmovdqa 64(%r8), %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
-; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm10[0],xmm4[1],xmm10[1]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm5[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm9[0],zero,xmm9[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
 ; AVX2-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX2-NEXT:    vmovdqa 64(%rdx), %xmm3
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm11[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm4[0,1,1,3]
-; AVX2-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7]
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm11[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
-; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
+; AVX2-NEXT:    vmovdqa 64(%rdx), %xmm7
+; AVX2-NEXT:    vmovdqa 64(%rsi), %xmm8
+; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm10
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7]
+; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
+; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7]
-; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
+; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa 96(%rax), %xmm0
 ; AVX2-NEXT:    vmovdqa 96(%r10), %xmm1
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm8[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[0,0,2,1]
-; AVX2-NEXT:    vmovdqa 96(%r9), %xmm2
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vmovdqa 96(%r9), %xmm3
 ; AVX2-NEXT:    vmovdqa 96(%r8), %xmm4
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqa 96(%rcx), %xmm3
-; AVX2-NEXT:    vmovdqa 96(%rdx), %xmm5
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm11[0,0,1,1]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm6[0,1,1,3]
-; AVX2-NEXT:    vmovdqa 96(%rsi), %xmm6
-; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7]
-; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm11[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
-; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
+; AVX2-NEXT:    vmovdqa 96(%rcx), %xmm7
+; AVX2-NEXT:    vmovdqa 96(%rdx), %xmm8
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX2-NEXT:    vmovdqa 96(%rsi), %xmm10
+; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm11
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7]
+; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm13[2],xmm9[2],xmm13[3],xmm9[3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa (%r10), %ymm0
-; AVX2-NEXT:    vmovdqa (%rax), %ymm1
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT:    vmovdqa (%r8), %ymm4
-; AVX2-NEXT:    vmovdqa (%r9), %ymm5
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqa (%rdx), %ymm7
-; AVX2-NEXT:    vmovdqa (%rcx), %ymm8
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm10 = ymm9[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3]
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX2-NEXT:    vmovdqa (%rsi), %ymm12
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm14 = ymm13[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm13[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
+; AVX2-NEXT:    vmovdqa (%r8), %ymm11
+; AVX2-NEXT:    vmovdqa (%r9), %ymm1
+; AVX2-NEXT:    vmovdqa (%r10), %ymm2
+; AVX2-NEXT:    vmovdqa (%rax), %ymm3
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[8],ymm1[8],ymm11[9],ymm1[9],ymm11[10],ymm1[10],ymm11[11],ymm1[11]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm6[0,2,2,3]
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm7
+; AVX2-NEXT:    vmovdqa (%rsi), %ymm8
+; AVX2-NEXT:    vmovdqa (%rdx), %ymm9
+; AVX2-NEXT:    vmovdqa (%rcx), %ymm6
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm13 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[4],ymm10[4],ymm0[5],ymm10[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7]
+; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm1[4],ymm11[5],ymm1[5],ymm11[6],ymm1[6],ymm11[7],ymm1[7],ymm11[12],ymm1[12],ymm11[13],ymm1[13],ymm11[14],ymm1[14],ymm11[15],ymm1[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15]
 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
-; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 32(%r10), %ymm13
-; AVX2-NEXT:    vmovdqa 32(%rax), %ymm11
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT:    vmovdqa 32(%r8), %ymm4
-; AVX2-NEXT:    vmovdqa 32(%r9), %ymm5
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqa 32(%rdx), %ymm7
-; AVX2-NEXT:    vmovdqa 32(%rcx), %ymm8
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm10 = ymm9[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3]
-; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm3
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm0[0,1,1,3,4,5,5,7]
+; AVX2-NEXT:    vmovdqa 32(%r8), %ymm9
+; AVX2-NEXT:    vmovdqa 32(%r9), %ymm8
+; AVX2-NEXT:    vmovdqa 32(%r10), %ymm7
+; AVX2-NEXT:    vmovdqa 32(%rax), %ymm3
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm6[0,2,2,3]
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm11
+; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm13
+; AVX2-NEXT:    vmovdqa 32(%rdx), %ymm2
+; AVX2-NEXT:    vmovdqa 32(%rcx), %ymm1
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm15 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5],ymm12[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm9[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
+; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm5[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 64(%r10), %ymm10
-; AVX2-NEXT:    vmovdqa 64(%rax), %ymm9
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT:    vmovdqa 64(%r8), %ymm8
-; AVX2-NEXT:    vmovdqa 64(%r9), %ymm5
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqa 64(%rdx), %ymm11
-; AVX2-NEXT:    vmovdqa 64(%rcx), %ymm13
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm15[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3]
-; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm3
-; AVX2-NEXT:    vmovdqa 64(%rsi), %ymm1
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm14 = ymm0[0,1,1,3,4,5,5,7]
+; AVX2-NEXT:    vmovdqa 64(%r8), %ymm5
+; AVX2-NEXT:    vmovdqa 64(%r9), %ymm4
+; AVX2-NEXT:    vmovdqa 64(%r10), %ymm7
+; AVX2-NEXT:    vmovdqa 64(%rax), %ymm8
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[4],ymm2[4],ymm9[5],ymm2[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[0,2,2,3]
+; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm11
+; AVX2-NEXT:    vmovdqa 64(%rsi), %ymm13
+; AVX2-NEXT:    vmovdqa 64(%rdx), %ymm15
+; AVX2-NEXT:    vmovdqa 64(%rcx), %ymm3
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3,4],ymm7[5],ymm14[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7]
-; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm6[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm15[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3],ymm14[4,5],ymm6[6,7]
+; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm5[4],ymm8[5],ymm5[5],ymm8[6],ymm5[6],ymm8[7],ymm5[7],ymm8[12],ymm5[12],ymm8[13],ymm5[13],ymm8[14],ymm5[14],ymm8[15],ymm5[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
-; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm3[4],ymm15[5],ymm3[5],ymm15[6],ymm3[6],ymm15[7],ymm3[7],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm3 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT:    vmovdqa 96(%r10), %ymm6
-; AVX2-NEXT:    vmovdqa 96(%rax), %ymm5
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT:    vmovdqa 96(%r8), %ymm8
-; AVX2-NEXT:    vmovdqa 96(%r9), %ymm9
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm11 = ymm10[0,1,1,3,4,5,5,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
+; AVX2-NEXT:    vmovdqa 96(%r8), %ymm3
+; AVX2-NEXT:    vmovdqa 96(%r9), %ymm4
+; AVX2-NEXT:    vmovdqa 96(%r10), %ymm5
+; AVX2-NEXT:    vmovdqa 96(%rax), %ymm7
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7]
-; AVX2-NEXT:    vmovdqa 96(%rdx), %ymm11
-; AVX2-NEXT:    vmovdqa 96(%rcx), %ymm13
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3]
-; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm3
-; AVX2-NEXT:    vmovdqa 96(%rsi), %ymm1
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm12 = ymm0[0,1,1,3,4,5,5,7]
+; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm13
+; AVX2-NEXT:    vmovdqa 96(%rsi), %ymm14
+; AVX2-NEXT:    vmovdqa 96(%rdx), %ymm15
+; AVX2-NEXT:    vmovdqa 96(%rcx), %ymm2
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm10[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm8 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm8[0,2,2,3]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[12],ymm2[12],ymm15[13],ymm2[13],ymm15[14],ymm2[14],ymm15[15],ymm2[15]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3,4],ymm3[5],ymm8[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    vmovdqa %ymm0, 992(%rax)
-; AVX2-NEXT:    vmovdqa %ymm3, 960(%rax)
-; AVX2-NEXT:    vmovdqa %ymm2, 928(%rax)
-; AVX2-NEXT:    vmovdqa %ymm15, 896(%rax)
-; AVX2-NEXT:    vmovdqa %ymm7, 736(%rax)
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 704(%rax)
+; AVX2-NEXT:    vmovdqa %ymm1, 992(%rax)
+; AVX2-NEXT:    vmovdqa %ymm5, 960(%rax)
+; AVX2-NEXT:    vmovdqa %ymm0, 928(%rax)
+; AVX2-NEXT:    vmovdqa %ymm11, 896(%rax)
+; AVX2-NEXT:    vmovdqa %ymm6, 736(%rax)
+; AVX2-NEXT:    vmovdqa %ymm10, 704(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 672(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 640(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 480(%rax)
-; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, 448(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vmovaps %ymm0, 448(%rax)
+; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 416(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 384(%rax)
@@ -7534,539 +7040,345 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, (%rax)
-; AVX2-NEXT:    addq $744, %rsp # imm = 0x2E8
+; AVX2-NEXT:    addq $712, %rsp # imm = 0x2C8
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX2-FP-LABEL: store_i16_stride8_vf64:
 ; AVX2-FP:       # %bb.0:
-; AVX2-FP-NEXT:    subq $744, %rsp # imm = 0x2E8
+; AVX2-FP-NEXT:    subq $712, %rsp # imm = 0x2C8
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-FP-NEXT:    vmovdqa (%rax), %xmm6
+; AVX2-FP-NEXT:    vmovdqa (%rax), %xmm5
 ; AVX2-FP-NEXT:    vmovdqa 32(%rax), %xmm0
-; AVX2-FP-NEXT:    vmovdqa (%r10), %xmm7
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX2-FP-NEXT:    vmovdqa (%r10), %xmm6
+; AVX2-FP-NEXT:    vmovdqa 32(%r10), %xmm1
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
 ; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm8
 ; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm9
 ; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm10
 ; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm11
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
 ; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm12
 ; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm13
 ; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm7[0],xmm14[1],xmm7[1]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7]
-; AVX2-FP-NEXT:    vmovdqa 32(%r10), %xmm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 32(%r9), %xmm1
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqa 32(%r8), %xmm3
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm5[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm14[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
-; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %xmm4
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %xmm5
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7]
-; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %xmm7
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3],ymm15[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 32(%r9), %xmm2
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX2-FP-NEXT:    vmovdqa 32(%r8), %xmm4
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm14[2],xmm7[2],xmm14[3],xmm7[3]
+; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm15[0,0,2,1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3],ymm7[4,5],ymm14[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %xmm7
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %xmm5
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm8
 ; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
 ; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3],ymm13[4,5],ymm11[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX2-FP-NEXT:    vmovdqa 64(%rax), %xmm6
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm11[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3]
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm9
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm11[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7]
-; AVX2-FP-NEXT:    vmovdqa 64(%rax), %xmm6
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 64(%r10), %xmm8
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vmovdqa 64(%r10), %xmm9
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX2-FP-NEXT:    vmovdqa 64(%r9), %xmm0
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
 ; AVX2-FP-NEXT:    vmovdqa 64(%r8), %xmm1
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm10[0],xmm4[1],xmm10[1]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm5[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm9[0],zero,xmm9[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
 ; AVX2-FP-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX2-FP-NEXT:    vmovdqa 64(%rdx), %xmm3
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm11[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm4[0,1,1,3]
-; AVX2-FP-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm11[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
+; AVX2-FP-NEXT:    vmovdqa 64(%rdx), %xmm7
+; AVX2-FP-NEXT:    vmovdqa 64(%rsi), %xmm8
+; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm10
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT:    vmovdqa 96(%rax), %xmm0
 ; AVX2-FP-NEXT:    vmovdqa 96(%r10), %xmm1
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm8[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[0,0,2,1]
-; AVX2-FP-NEXT:    vmovdqa 96(%r9), %xmm2
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-FP-NEXT:    vmovdqa 96(%r9), %xmm3
 ; AVX2-FP-NEXT:    vmovdqa 96(%r8), %xmm4
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqa 96(%rcx), %xmm3
-; AVX2-FP-NEXT:    vmovdqa 96(%rdx), %xmm5
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm11[0,0,1,1]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm6[0,1,1,3]
-; AVX2-FP-NEXT:    vmovdqa 96(%rsi), %xmm6
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %xmm7
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm11[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
+; AVX2-FP-NEXT:    vmovdqa 96(%rcx), %xmm7
+; AVX2-FP-NEXT:    vmovdqa 96(%rdx), %xmm8
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX2-FP-NEXT:    vmovdqa 96(%rsi), %xmm10
+; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %xmm11
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm13[2],xmm9[2],xmm13[3],xmm9[3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
-; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa (%r10), %ymm0
-; AVX2-FP-NEXT:    vmovdqa (%rax), %ymm1
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm4
-; AVX2-FP-NEXT:    vmovdqa (%r9), %ymm5
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm7
-; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm8
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm9[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3]
-; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm12
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm13[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm13[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
+; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm11
+; AVX2-FP-NEXT:    vmovdqa (%r9), %ymm1
+; AVX2-FP-NEXT:    vmovdqa (%r10), %ymm2
+; AVX2-FP-NEXT:    vmovdqa (%rax), %ymm3
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[8],ymm1[8],ymm11[9],ymm1[9],ymm11[10],ymm1[10],ymm11[11],ymm1[11]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm6[0,2,2,3]
+; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm7
+; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm8
+; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm9
+; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm6
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm13 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[4],ymm10[4],ymm0[5],ymm10[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm1[4],ymm11[5],ymm1[5],ymm11[6],ymm1[6],ymm11[7],ymm1[7],ymm11[12],ymm1[12],ymm11[13],ymm1[13],ymm11[14],ymm1[14],ymm11[15],ymm1[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15]
 ; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 32(%r10), %ymm13
-; AVX2-FP-NEXT:    vmovdqa 32(%rax), %ymm11
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-FP-NEXT:    vmovdqa 32(%r8), %ymm4
-; AVX2-FP-NEXT:    vmovdqa 32(%r9), %ymm5
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm7
-; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %ymm8
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm9[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3]
-; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %ymm3
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm0[0,1,1,3,4,5,5,7]
+; AVX2-FP-NEXT:    vmovdqa 32(%r8), %ymm9
+; AVX2-FP-NEXT:    vmovdqa 32(%r9), %ymm8
+; AVX2-FP-NEXT:    vmovdqa 32(%r10), %ymm7
+; AVX2-FP-NEXT:    vmovdqa 32(%rax), %ymm3
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm6[0,2,2,3]
+; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm11
+; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %ymm13
+; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm2
+; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %ymm1
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm15 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5],ymm12[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm9[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm5[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vmovdqa 64(%r10), %ymm10
-; AVX2-FP-NEXT:    vmovdqa 64(%rax), %ymm9
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-FP-NEXT:    vmovdqa 64(%r8), %ymm8
-; AVX2-FP-NEXT:    vmovdqa 64(%r9), %ymm5
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqa 64(%rdx), %ymm11
-; AVX2-FP-NEXT:    vmovdqa 64(%rcx), %ymm13
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm15[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3]
-; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm3
-; AVX2-FP-NEXT:    vmovdqa 64(%rsi), %ymm1
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm0[0,1,1,3,4,5,5,7]
+; AVX2-FP-NEXT:    vmovdqa 64(%r8), %ymm5
+; AVX2-FP-NEXT:    vmovdqa 64(%r9), %ymm4
+; AVX2-FP-NEXT:    vmovdqa 64(%r10), %ymm7
+; AVX2-FP-NEXT:    vmovdqa 64(%rax), %ymm8
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[4],ymm2[4],ymm9[5],ymm2[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[0,2,2,3]
+; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm11
+; AVX2-FP-NEXT:    vmovdqa 64(%rsi), %ymm13
+; AVX2-FP-NEXT:    vmovdqa 64(%rdx), %ymm15
+; AVX2-FP-NEXT:    vmovdqa 64(%rcx), %ymm3
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3,4],ymm7[5],ymm14[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm6[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm15[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3],ymm14[4,5],ymm6[6,7]
+; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[0,2,2,3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm5[4],ymm8[5],ymm5[5],ymm8[6],ymm5[6],ymm8[7],ymm5[7],ymm8[12],ymm5[12],ymm8[13],ymm5[13],ymm8[14],ymm5[14],ymm8[15],ymm5[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm3[4],ymm15[5],ymm3[5],ymm15[6],ymm3[6],ymm15[7],ymm3[7],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm3 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FP-NEXT:    vmovdqa 96(%r10), %ymm6
-; AVX2-FP-NEXT:    vmovdqa 96(%rax), %ymm5
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-FP-NEXT:    vmovdqa 96(%r8), %ymm8
-; AVX2-FP-NEXT:    vmovdqa 96(%r9), %ymm9
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm10[0,1,1,3,4,5,5,7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vmovdqa 96(%r8), %ymm3
+; AVX2-FP-NEXT:    vmovdqa 96(%r9), %ymm4
+; AVX2-FP-NEXT:    vmovdqa 96(%r10), %ymm5
+; AVX2-FP-NEXT:    vmovdqa 96(%rax), %ymm7
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqa 96(%rdx), %ymm11
-; AVX2-FP-NEXT:    vmovdqa 96(%rcx), %ymm13
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3]
-; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm3
-; AVX2-FP-NEXT:    vmovdqa 96(%rsi), %ymm1
-; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm0[0,1,1,3,4,5,5,7]
+; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm13
+; AVX2-FP-NEXT:    vmovdqa 96(%rsi), %ymm14
+; AVX2-FP-NEXT:    vmovdqa 96(%rdx), %ymm15
+; AVX2-FP-NEXT:    vmovdqa 96(%rcx), %ymm2
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11]
+; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm10[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm8 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm8[0,2,2,3]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[12],ymm2[12],ymm15[13],ymm2[13],ymm15[14],ymm2[14],ymm15[15],ymm2[15]
+; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15]
-; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3,4],ymm3[5],ymm8[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
-; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7]
+; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FP-NEXT:    vmovdqa %ymm0, 992(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm3, 960(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm2, 928(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm15, 896(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm7, 736(%rax)
-; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm0, 704(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm1, 992(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm5, 960(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm0, 928(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm11, 896(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm6, 736(%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm10, 704(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 672(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 640(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 480(%rax)
-; AVX2-FP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT:    vmovaps %ymm0, 448(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT:    vmovaps %ymm0, 448(%rax)
+; AVX2-FP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 416(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 384(%rax)
@@ -8110,7 +7422,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rax)
 ; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
-; AVX2-FP-NEXT:    addq $744, %rsp # imm = 0x2E8
+; AVX2-FP-NEXT:    addq $712, %rsp # imm = 0x2C8
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
index 0cd72be39557..e4f616ed730e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
@@ -388,12 +388,11 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7]
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
-; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5]
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm2[3,3],ymm0[7,7],ymm2[7,7]
-; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7]
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6],ymm5[7]
+; AVX-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,7,5]
 ; AVX-NEXT:    vmovaps %ymm0, 64(%rax)
 ; AVX-NEXT:    vmovaps %ymm4, 32(%rax)
 ; AVX-NEXT:    vmovaps %ymm10, (%rax)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index 45a76599d3e9..13930bc2c674 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -521,12 +521,10 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vinsertf128 $1, (%rsi), %ymm4, %ymm4
 ; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm5, %ymm6
 ; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm7
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm8 = [2,6,0,3,2,6,0,3]
-; AVX2-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX2-NEXT:    vpermps %ymm6, %ymm8, %ymm8
-; AVX2-NEXT:    vshufps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,2]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm8 = ymm4[3,3,3,3,7,7,7,7]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm8 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm9 = [2,6,u,u,u,1,4,3]
+; AVX2-NEXT:    vpermps %ymm8, %ymm9, %ymm8
 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm9
 ; AVX2-NEXT:    vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7]
 ; AVX2-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5]
@@ -573,12 +571,10 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT:    vinsertf128 $1, (%rsi), %ymm4, %ymm4
 ; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm5, %ymm6
 ; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm7
-; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm8 = [2,6,0,3,2,6,0,3]
-; AVX2-FP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX2-FP-NEXT:    vpermps %ymm6, %ymm8, %ymm8
-; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7]
-; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,2]
-; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7]
+; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm8 = ymm4[3,3,3,3,7,7,7,7]
+; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm8 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7]
+; AVX2-FP-NEXT:    vmovaps {{.*#+}} ymm9 = [2,6,u,u,u,1,4,3]
+; AVX2-FP-NEXT:    vpermps %ymm8, %ymm9, %ymm8
 ; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm9
 ; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7]
 ; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index a343df8428fb..2b4d0b1409a7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -3001,214 +3001,206 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    subq $88, %rsp
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    vmovdqa (%r10), %xmm2
-; AVX2-NEXT:    vmovdqa (%rax), %xmm3
+; AVX2-NEXT:    vmovdqa (%rsi), %xmm2
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm3
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vmovdqa (%r9), %xmm4
-; AVX2-NEXT:    vmovdqa (%r8), %xmm5
+; AVX2-NEXT:    vmovdqa (%rcx), %xmm4
+; AVX2-NEXT:    vmovdqa (%rdx), %xmm5
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm12[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm12[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm12[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm6, %ymm6
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7],ymm6[8,9,10],ymm0[11],ymm6[12,13,14],ymm0[15]
-; AVX2-NEXT:    vmovdqa (%rsi), %xmm6
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm7
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15]
+; AVX2-NEXT:    vmovdqa (%r10), %xmm6
+; AVX2-NEXT:    vmovdqa (%rax), %xmm7
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm13[1,1,1,1]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm13[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm13[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm8, %ymm9
-; AVX2-NEXT:    vmovdqa (%rcx), %xmm10
-; AVX2-NEXT:    vmovdqa (%rdx), %xmm11
+; AVX2-NEXT:    vmovdqa (%r9), %xmm10
+; AVX2-NEXT:    vmovdqa (%r8), %xmm11
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm14[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm14[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm14[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm14[2,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm15, %ymm8, %ymm15
-; AVX2-NEXT:    vmovaps 16(%r10), %xmm8
+; AVX2-NEXT:    vmovaps 16(%rsi), %xmm8
 ; AVX2-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm15 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7,8],ymm15[9],ymm9[10,11,12],ymm15[13],ymm9[14,15]
-; AVX2-NEXT:    vmovdqa 16(%rax), %xmm9
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4],ymm0[5],ymm15[6],ymm0[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5,6,7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13,14,15]
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm9
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4],ymm15[5],ymm0[6],ymm15[7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm12, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa 16(%r9), %xmm8
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm13[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[3,3,3,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero
+; AVX2-NEXT:    vmovdqa 16(%rcx), %xmm8
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm13, %ymm1, %ymm1
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,5,7,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm14, %ymm13, %ymm13
-; AVX2-NEXT:    vmovdqa 16(%r8), %xmm15
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7,8],ymm13[9],ymm1[10,11,12],ymm13[13],ymm1[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-NEXT:    vmovdqa 16(%rdx), %xmm15
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5],ymm13[6],ymm1[7],ymm13[8,9,10,11,12],ymm1[13],ymm13[14],ymm1[15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[3,3,3,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm2, %ymm2
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,6,5,7,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm7, %ymm7
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5],ymm7[6],ymm2[7],ymm7[8,9,10,11,12],ymm2[13],ymm7[14],ymm2[15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX2-NEXT:    vmovdqa %xmm9, %xmm5
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
 ; AVX2-NEXT:    vmovdqa %xmm8, %xmm9
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-NEXT:    vmovdqa 16(%rsi), %xmm8
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm13[0,1,2],ymm7[3],ymm13[4,5,6],ymm7[7],ymm13[8,9,10],ymm7[11],ymm13[12,13,14],ymm7[15]
-; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm4
+; AVX2-NEXT:    vmovdqa 16(%r10), %xmm8
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7,8],ymm13[9],ymm7[10,11,12],ymm13[13],ymm7[14,15]
+; AVX2-NEXT:    vmovdqa 16(%rax), %xmm4
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm13[3,3,3,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm14, %ymm2
-; AVX2-NEXT:    vmovdqa 16(%rcx), %xmm3
-; AVX2-NEXT:    vmovdqa 16(%rdx), %xmm1
+; AVX2-NEXT:    vmovdqa 16(%r9), %xmm3
+; AVX2-NEXT:    vmovdqa 16(%r8), %xmm1
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm14[0,1,2,3,6,5,7,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7],ymm0[8,9,10,11,12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[3,3,3,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm0
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm15, %ymm7
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7,8],ymm7[9],ymm0[10,11,12],ymm7[13],ymm0[14,15]
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,6,5,7,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6],ymm5[7],ymm3[8,9,10,11,12],ymm5[13],ymm3[14],ymm5[15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm12[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm12[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm3, %ymm3
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm1[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm4, %ymm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5,6,7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm10[1,1,1,1]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm13[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm13[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm14[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm14[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5,6,7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13,14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm13[1,1,1,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1]
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm14[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7]
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm14[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15]
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5,6,7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    vmovdqa %ymm3, 64(%rax)
 ; AVX2-NEXT:    vmovdqa %ymm2, 128(%rax)
@@ -6002,423 +5994,407 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    subq $328, %rsp # imm = 0x148
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    vmovdqa (%r10), %xmm0
+; AVX2-NEXT:    vmovdqa (%rsi), %xmm0
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa (%rax), %xmm1
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vmovdqa (%r9), %xmm1
+; AVX2-NEXT:    vmovdqa (%rcx), %xmm1
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa (%r8), %xmm3
+; AVX2-NEXT:    vmovdqa (%rdx), %xmm3
 ; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
-; AVX2-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
+; AVX2-NEXT:    vmovdqa (%r10), %xmm0
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX2-NEXT:    vmovdqa (%rax), %xmm1
 ; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vmovdqa (%rcx), %xmm0
+; AVX2-NEXT:    vmovdqa (%r9), %xmm0
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX2-NEXT:    vmovdqa (%r8), %xmm6
 ; AVX2-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,5,7,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-NEXT:    vmovdqa 48(%r10), %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7,8],ymm7[9],ymm1[10,11,12],ymm7[13],ymm1[14,15]
-; AVX2-NEXT:    vmovdqa 48(%rax), %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7]
+; AVX2-NEXT:    vmovdqa 48(%rsi), %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm1[5],ymm7[6],ymm1[7],ymm7[8,9,10,11,12],ymm1[13],ymm7[14],ymm1[15]
+; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
 ; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm5, %ymm5
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm3
-; AVX2-NEXT:    vmovdqa 48(%r9), %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6],ymm5[7],ymm3[8,9,10],ymm5[11],ymm3[12,13,14],ymm5[15]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX2-NEXT:    vmovdqa 48(%rcx), %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5,6,7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
 ; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm8[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm8[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm8[1,1,1,1]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm4
-; AVX2-NEXT:    vmovdqa 48(%r8), %xmm3
+; AVX2-NEXT:    vmovdqa 48(%rdx), %xmm3
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm9[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm9[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15]
-; AVX2-NEXT:    vmovdqa 48(%rsi), %xmm4
-; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm5
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm7, %ymm12
-; AVX2-NEXT:    vmovdqa 48(%rcx), %xmm6
-; AVX2-NEXT:    vmovdqa 48(%rdx), %xmm7
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm13[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero
-; AVX2-NEXT:    vinserti128 $1, %xmm15, %ymm14, %ymm14
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7,8],ymm14[9],ymm12[10,11,12],ymm14[13],ymm12[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
-; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm11, %ymm9
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15]
+; AVX2-NEXT:    vmovdqa 48(%r10), %xmm4
+; AVX2-NEXT:    vmovdqa 48(%rax), %xmm5
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm6, %ymm12
+; AVX2-NEXT:    vmovdqa 48(%r9), %xmm6
+; AVX2-NEXT:    vmovdqa 48(%r8), %xmm7
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm14[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm14[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm13, %ymm15, %ymm13
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5,6,7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
+; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3]
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[3,3,3,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
+; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm10, %ymm8
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm10, %ymm9
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7,8],ymm9[9],ymm8[10,11,12],ymm9[13],ymm8[14,15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm9, %ymm9
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm14[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,6,5,7,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6],ymm9[7],ymm10[8,9,10,11,12],ymm9[13],ymm10[14],ymm9[15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7]
 ; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm8, %ymm1
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm3
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm6, %ymm6
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5,6,7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[3,3,3,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
-; AVX2-NEXT:    vmovdqa 32(%r10), %xmm0
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6],ymm0[7],ymm2[8,9,10,11,12],ymm0[13],ymm2[14],ymm0[15]
+; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 32(%rax), %xmm1
+; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm1
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm4
-; AVX2-NEXT:    vmovdqa 32(%r9), %xmm2
-; AVX2-NEXT:    vmovdqa 32(%r8), %xmm3
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm8[1,1,1,1]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm4
+; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm2
+; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm3
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm9[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm9[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15]
-; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm4
-; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm5
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm7, %ymm12
-; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm6
-; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm7
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15]
+; AVX2-NEXT:    vmovdqa 32(%r10), %xmm4
+; AVX2-NEXT:    vmovdqa 32(%rax), %xmm5
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm6, %ymm12
+; AVX2-NEXT:    vmovdqa 32(%r9), %xmm6
+; AVX2-NEXT:    vmovdqa 32(%r8), %xmm7
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm13[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm13[2,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm15, %ymm14, %ymm14
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7,8],ymm14[9],ymm12[10,11,12],ymm14[13],ymm12[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7]
-; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm11, %ymm9
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5,6,7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7]
+; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3]
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[3,3,3,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
+; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm10, %ymm8
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm10, %ymm9
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7,8],ymm9[9],ymm8[10,11,12],ymm9[13],ymm8[14,15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm9, %ymm9
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,6,5,7,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6],ymm9[7],ymm10[8,9,10,11,12],ymm9[13],ymm10[14],ymm9[15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7]
 ; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm8, %ymm1
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm3
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm6, %ymm6
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5,6,7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7]
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[3,3,3,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6],ymm1[7],ymm2[8,9,10,11,12],ymm1[13],ymm2[14],ymm1[15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa 16(%r10), %xmm13
-; AVX2-NEXT:    vmovdqa 16(%rax), %xmm12
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa 16(%r9), %xmm10
-; AVX2-NEXT:    vmovdqa 16(%r8), %xmm9
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa 16(%rsi), %xmm14
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm12
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovdqa 16(%rcx), %xmm11
+; AVX2-NEXT:    vmovdqa 16(%rdx), %xmm9
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
+; AVX2-NEXT:    vmovdqa 16(%r10), %xmm7
+; AVX2-NEXT:    vmovdqa 16(%rax), %xmm6
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm13[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm3
+; AVX2-NEXT:    vmovdqa 16(%r9), %xmm5
+; AVX2-NEXT:    vmovdqa 16(%r8), %xmm4
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm0[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm15, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5,6,7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13,14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15]
-; AVX2-NEXT:    vmovdqa 16(%rsi), %xmm7
-; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm6
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2],ymm2[3],ymm10[4],ymm2[5],ymm10[6],ymm2[7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm8[2,3,2,3]
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX2-NEXT:    vmovdqa 16(%rcx), %xmm5
-; AVX2-NEXT:    vmovdqa 16(%rdx), %xmm3
-; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero
-; AVX2-NEXT:    vinserti128 $1, %xmm14, %ymm15, %ymm14
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7,8],ymm14[9],ymm2[10,11,12],ymm14[13],ymm2[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm2[0],ymm11[1],ymm2[2],ymm11[3],ymm2[4],ymm11[5],ymm2[6],ymm11[7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm4, %ymm1
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm2[2,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm4
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15]
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[1,1,1,1]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm7, %ymm6
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[3,3,3,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7],ymm0[8,9,10,11,12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero
+; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm3, %ymm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15]
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7]
 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero
+; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm6, %ymm6
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm4[2,1,3,3,4,5,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5,6,7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm5, %ymm1
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5]
 ; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6],ymm2[7],ymm3[8,9,10,11,12],ymm2[13],ymm3[14],ymm2[15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm4[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15]
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm7, %ymm6
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm6, %ymm6
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX2-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm7[0,0,2,1,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm7[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero
-; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm9, %ymm9
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3,4],ymm9[5],ymm6[6,7,8],ymm9[9],ymm6[10,11,12],ymm9[13],ymm6[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm7[0,1,1,3,4,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm7[2,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm9, %ymm9
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5,6,7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm6, %ymm2
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,4,6,5]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,4,6,5]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,7,7]
 ; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6],ymm4[7],ymm5[8,9,10,11,12],ymm4[13],ymm5[14],ymm4[15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7]
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    vmovdqa %ymm2, 96(%rax)
 ; AVX2-NEXT:    vmovdqa %ymm3, 64(%rax)
 ; AVX2-NEXT:    vmovdqa %ymm1, 160(%rax)
 ; AVX2-NEXT:    vmovdqa %ymm0, 128(%rax)
 ; AVX2-NEXT:    vmovdqa %ymm8, 224(%rax)
-; AVX2-NEXT:    vmovdqa %ymm11, 192(%rax)
+; AVX2-NEXT:    vmovdqa %ymm10, 192(%rax)
 ; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, 288(%rax)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-llrint.ll b/llvm/test/CodeGen/X86/vector-llrint.ll
index 46904f82fd5d..7017eb60df41 100644
--- a/llvm/test/CodeGen/X86/vector-llrint.ll
+++ b/llvm/test/CodeGen/X86/vector-llrint.ll
@@ -1,289 +1,674 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefix=X64-AVX
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefix=X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX512DQ
 
 define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
-; X64-SSE-LABEL: llrint_v1i64_v1f32:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rax
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v1i64_v1f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtss2si %xmm0, %rax
+; SSE-NEXT:    retq
 ;
-; X64-AVX-LABEL: llrint_v1i64_v1f32:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX-NEXT:    retq
+; AVX-LABEL: llrint_v1i64_v1f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtss2si %xmm0, %rax
+; AVX-NEXT:    retq
+;
+; AVX512DQ-LABEL: llrint_v1i64_v1f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512DQ-NEXT:    retq
   %a = call <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float> %x)
   ret <1 x i64> %a
 }
 declare <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float>)
 
 define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) {
-; X64-SSE-LABEL: llrint_v2i64_v2f32:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm1
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v2i64_v2f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE-NEXT:    cvtss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: llrint_v2i64_v2f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtss2si %xmm0, %rax
+; AVX-NEXT:    vmovq %rax, %xmm1
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT:    vcvtss2si %xmm0, %rax
+; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
 ;
-; X64-AVX-LABEL: llrint_v2i64_v2f32:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX-NEXT:    vmovq %rax, %xmm1
-; X64-AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-AVX-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX-NEXT:    vmovq %rax, %xmm0
-; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-AVX-NEXT:    retq
+; AVX512DQ-LABEL: llrint_v2i64_v2f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtps2qq %xmm0, %xmm0
+; AVX512DQ-NEXT:    retq
   %a = call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> %x)
   ret <2 x i64> %a
 }
 declare <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float>)
 
 define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) {
-; X64-SSE-LABEL: llrint_v4i64_v4f32:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm2
-; X64-SSE-NEXT:    movaps %xmm0, %xmm1
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm1
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; X64-SSE-NEXT:    movaps %xmm0, %xmm1
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
-; X64-SSE-NEXT:    cvtss2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm3
-; X64-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm1
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v4i64_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm2
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; SSE-NEXT:    cvtss2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
+; SSE-NEXT:    cvtss2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvtss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: llrint_v4i64_v4f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-NEXT:    vcvtss2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vcvtss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: llrint_v4i64_v4f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; AVX512DQ-LABEL: llrint_v4i64_v4f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtps2qq %xmm0, %ymm0
+; AVX512DQ-NEXT:    retq
   %a = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> %x)
   ret <4 x i64> %a
 }
 declare <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float>)
 
 define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) {
-; X64-SSE-LABEL: llrint_v8i64_v8f32:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movaps %xmm0, %xmm2
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    movaps %xmm2, %xmm3
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm3, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm3
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; X64-SSE-NEXT:    movaps %xmm2, %xmm3
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
-; X64-SSE-NEXT:    cvtss2si %xmm3, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm3
-; X64-SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm2, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm4
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; X64-SSE-NEXT:    cvtss2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm2
-; X64-SSE-NEXT:    movaps %xmm1, %xmm3
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm3, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm3
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X64-SSE-NEXT:    movaps %xmm1, %xmm3
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3]
-; X64-SSE-NEXT:    cvtss2si %xmm3, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm5
-; X64-SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm3
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; X64-SSE-NEXT:    movdqa %xmm4, %xmm1
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v8i64_v8f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    cvtss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    movaps %xmm2, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1]
+; SSE-NEXT:    cvtss2si %xmm3, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE-NEXT:    movaps %xmm2, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
+; SSE-NEXT:    cvtss2si %xmm3, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT:    cvtss2si %xmm2, %rax
+; SSE-NEXT:    movq %rax, %xmm4
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
+; SSE-NEXT:    cvtss2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm2
+; SSE-NEXT:    movaps %xmm1, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
+; SSE-NEXT:    cvtss2si %xmm3, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT:    movaps %xmm1, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3]
+; SSE-NEXT:    cvtss2si %xmm3, %rax
+; SSE-NEXT:    movq %rax, %xmm5
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    cvtss2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE-NEXT:    movdqa %xmm4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: llrint_v8i64_v8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-NEXT:    vcvtss2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vcvtss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm3, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX1-NEXT:    vcvtss2si %xmm3, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT:    vcvtss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: llrint_v8i64_v8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX512-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512DQ-LABEL: llrint_v8i64_v8f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT:    retq
   %a = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> %x)
   ret <8 x i64> %a
 }
 declare <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float>)
 
 define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
-; X64-SSE-LABEL: llrint_v16i64_v16f32:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movq %rdi, %rax
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm4
-; X64-SSE-NEXT:    movaps %xmm0, %xmm5
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm5, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm5
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; X64-SSE-NEXT:    movaps %xmm0, %xmm5
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3],xmm0[3,3]
-; X64-SSE-NEXT:    cvtss2si %xmm5, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm5
-; X64-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; X64-SSE-NEXT:    cvtss2si %xmm1, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm5
-; X64-SSE-NEXT:    movaps %xmm1, %xmm6
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm6, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm6
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
-; X64-SSE-NEXT:    movaps %xmm1, %xmm6
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,3],xmm1[3,3]
-; X64-SSE-NEXT:    cvtss2si %xmm6, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm6
-; X64-SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm1, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm1
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0]
-; X64-SSE-NEXT:    cvtss2si %xmm2, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm6
-; X64-SSE-NEXT:    movaps %xmm2, %xmm7
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm7, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm7
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; X64-SSE-NEXT:    movaps %xmm2, %xmm7
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[3,3],xmm2[3,3]
-; X64-SSE-NEXT:    cvtss2si %xmm7, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm7
-; X64-SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm2, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm2
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0]
-; X64-SSE-NEXT:    cvtss2si %xmm3, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm7
-; X64-SSE-NEXT:    movaps %xmm3, %xmm8
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm8, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm8
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0]
-; X64-SSE-NEXT:    movaps %xmm3, %xmm8
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,3],xmm3[3,3]
-; X64-SSE-NEXT:    cvtss2si %xmm8, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm8
-; X64-SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm3, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm3
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0]
-; X64-SSE-NEXT:    movdqa %xmm3, 112(%rdi)
-; X64-SSE-NEXT:    movdqa %xmm7, 96(%rdi)
-; X64-SSE-NEXT:    movdqa %xmm2, 80(%rdi)
-; X64-SSE-NEXT:    movdqa %xmm6, 64(%rdi)
-; X64-SSE-NEXT:    movdqa %xmm1, 48(%rdi)
-; X64-SSE-NEXT:    movdqa %xmm5, 32(%rdi)
-; X64-SSE-NEXT:    movdqa %xmm0, 16(%rdi)
-; X64-SSE-NEXT:    movdqa %xmm4, (%rdi)
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v16i64_v16f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    cvtss2si %xmm0, %rcx
+; SSE-NEXT:    movq %rcx, %xmm4
+; SSE-NEXT:    movaps %xmm0, %xmm5
+; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1]
+; SSE-NEXT:    cvtss2si %xmm5, %rcx
+; SSE-NEXT:    movq %rcx, %xmm5
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-NEXT:    movaps %xmm0, %xmm5
+; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3],xmm0[3,3]
+; SSE-NEXT:    cvtss2si %xmm5, %rcx
+; SSE-NEXT:    movq %rcx, %xmm5
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvtss2si %xmm0, %rcx
+; SSE-NEXT:    movq %rcx, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; SSE-NEXT:    cvtss2si %xmm1, %rcx
+; SSE-NEXT:    movq %rcx, %xmm5
+; SSE-NEXT:    movaps %xmm1, %xmm6
+; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[1,1]
+; SSE-NEXT:    cvtss2si %xmm6, %rcx
+; SSE-NEXT:    movq %rcx, %xmm6
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
+; SSE-NEXT:    movaps %xmm1, %xmm6
+; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,3],xmm1[3,3]
+; SSE-NEXT:    cvtss2si %xmm6, %rcx
+; SSE-NEXT:    movq %rcx, %xmm6
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    cvtss2si %xmm1, %rcx
+; SSE-NEXT:    movq %rcx, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0]
+; SSE-NEXT:    cvtss2si %xmm2, %rcx
+; SSE-NEXT:    movq %rcx, %xmm6
+; SSE-NEXT:    movaps %xmm2, %xmm7
+; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[1,1]
+; SSE-NEXT:    cvtss2si %xmm7, %rcx
+; SSE-NEXT:    movq %rcx, %xmm7
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; SSE-NEXT:    movaps %xmm2, %xmm7
+; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[3,3],xmm2[3,3]
+; SSE-NEXT:    cvtss2si %xmm7, %rcx
+; SSE-NEXT:    movq %rcx, %xmm7
+; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT:    cvtss2si %xmm2, %rcx
+; SSE-NEXT:    movq %rcx, %xmm2
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0]
+; SSE-NEXT:    cvtss2si %xmm3, %rcx
+; SSE-NEXT:    movq %rcx, %xmm7
+; SSE-NEXT:    movaps %xmm3, %xmm8
+; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[1,1]
+; SSE-NEXT:    cvtss2si %xmm8, %rcx
+; SSE-NEXT:    movq %rcx, %xmm8
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0]
+; SSE-NEXT:    movaps %xmm3, %xmm8
+; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,3],xmm3[3,3]
+; SSE-NEXT:    cvtss2si %xmm8, %rcx
+; SSE-NEXT:    movq %rcx, %xmm8
+; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-NEXT:    cvtss2si %xmm3, %rcx
+; SSE-NEXT:    movq %rcx, %xmm3
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0]
+; SSE-NEXT:    movdqa %xmm3, 112(%rdi)
+; SSE-NEXT:    movdqa %xmm7, 96(%rdi)
+; SSE-NEXT:    movdqa %xmm2, 80(%rdi)
+; SSE-NEXT:    movdqa %xmm6, 64(%rdi)
+; SSE-NEXT:    movdqa %xmm1, 48(%rdi)
+; SSE-NEXT:    movdqa %xmm5, 32(%rdi)
+; SSE-NEXT:    movdqa %xmm0, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm4, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: llrint_v16i64_v16f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps %ymm0, %ymm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX1-NEXT:    vcvtss2si %xmm3, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT:    vcvtss2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm4, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm4
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[3,3,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm3, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm4 = xmm2[1,0]
+; AVX1-NEXT:    vcvtss2si %xmm4, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm4
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX1-NEXT:    vcvtss2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm4
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm4
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX1-NEXT:    vcvtss2si %xmm3, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vcvtss2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm5, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm5
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm3, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX1-NEXT:    vcvtss2si %xmm5, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm5
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm3[0]
+; AVX1-NEXT:    vcvtss2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm5
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm3
+; AVX1-NEXT:    vmovaps %ymm4, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: llrint_v16i64_v16f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX512-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm4, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm4
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm2
+; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX512-NEXT:    vcvtss2si %xmm4, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm4
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm4
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX512-NEXT:    vcvtss2si %xmm4, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm4
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm4
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512DQ-LABEL: llrint_v16i64_v16f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtps2qq %ymm0, %zmm2
+; AVX512DQ-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; AVX512DQ-NEXT:    vcvtps2qq %ymm0, %zmm1
+; AVX512DQ-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512DQ-NEXT:    retq
   %a = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> %x)
   ret <16 x i64> %a
 }
 declare <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float>)
 
 define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) {
-; X64-SSE-LABEL: llrint_v1i64_v1f64:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v1i64_v1f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtsd2si %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: llrint_v1i64_v1f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX-NEXT:    retq
 ;
-; X64-AVX-LABEL: llrint_v1i64_v1f64:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX-NEXT:    retq
+; AVX512DQ-LABEL: llrint_v1i64_v1f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512DQ-NEXT:    retq
   %a = call <1 x i64> @llvm.llrint.v1i64.v1f64(<1 x double> %x)
   ret <1 x i64> %a
 }
 declare <1 x i64> @llvm.llrint.v1i64.v1f64(<1 x double>)
 
 define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) {
-; X64-SSE-LABEL: llrint_v2i64_v2f64:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm1
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v2i64_v2f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvtsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: llrint_v2i64_v2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX-NEXT:    vmovq %rax, %xmm1
+; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
 ;
-; X64-AVX-LABEL: llrint_v2i64_v2f64:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX-NEXT:    vmovq %rax, %xmm1
-; X64-AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X64-AVX-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX-NEXT:    vmovq %rax, %xmm0
-; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-AVX-NEXT:    retq
+; AVX512DQ-LABEL: llrint_v2i64_v2f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtpd2qq %xmm0, %xmm0
+; AVX512DQ-NEXT:    retq
   %a = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> %x)
   ret <2 x i64> %a
 }
 declare <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double>)
 
 define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
-; X64-SSE-LABEL: llrint_v4i64_v4f64:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm2
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; X64-SSE-NEXT:    cvtsd2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm3
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; X64-SSE-NEXT:    cvtsd2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
-; X64-SSE-NEXT:    movdqa %xmm3, %xmm1
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v4i64_v4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm2
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvtsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT:    cvtsd2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    cvtsd2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: llrint_v4i64_v4f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: llrint_v4i64_v4f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; AVX512DQ-LABEL: llrint_v4i64_v4f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtpd2qq %ymm0, %ymm0
+; AVX512DQ-NEXT:    retq
   %a = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> %x)
   ret <4 x i64> %a
 }
 declare <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double>)
 
 define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
-; X64-SSE-LABEL: llrint_v8i64_v8f64:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm4
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
-; X64-SSE-NEXT:    cvtsd2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm5
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; X64-SSE-NEXT:    cvtsd2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
-; X64-SSE-NEXT:    cvtsd2si %xmm2, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm6
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; X64-SSE-NEXT:    cvtsd2si %xmm2, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0]
-; X64-SSE-NEXT:    cvtsd2si %xmm3, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm7
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; X64-SSE-NEXT:    cvtsd2si %xmm3, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0]
-; X64-SSE-NEXT:    movdqa %xmm4, %xmm0
-; X64-SSE-NEXT:    movdqa %xmm5, %xmm1
-; X64-SSE-NEXT:    movdqa %xmm6, %xmm2
-; X64-SSE-NEXT:    movdqa %xmm7, %xmm3
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v8i64_v8f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm4
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvtsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
+; SSE-NEXT:    cvtsd2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm5
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    cvtsd2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
+; SSE-NEXT:    cvtsd2si %xmm2, %rax
+; SSE-NEXT:    movq %rax, %xmm6
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT:    cvtsd2si %xmm2, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0]
+; SSE-NEXT:    cvtsd2si %xmm3, %rax
+; SSE-NEXT:    movq %rax, %xmm7
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-NEXT:    cvtsd2si %xmm3, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm4, %xmm0
+; SSE-NEXT:    movdqa %xmm5, %xmm1
+; SSE-NEXT:    movdqa %xmm6, %xmm2
+; SSE-NEXT:    movdqa %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: llrint_v8i64_v8f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX1-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX1-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: llrint_v8i64_v8f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
+; AVX512-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
+; AVX512-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512DQ-LABEL: llrint_v8i64_v8f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtpd2qq %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
   %a = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> %x)
   ret <8 x i64> %a
 }
diff --git a/llvm/test/CodeGen/X86/vector-lrint.ll b/llvm/test/CodeGen/X86/vector-lrint.ll
index f527a3584f44..3612205bf1bf 100644
--- a/llvm/test/CodeGen/X86/vector-lrint.ll
+++ b/llvm/test/CodeGen/X86/vector-lrint.ll
@@ -1,11 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefix=X86-SSE2
 ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=avx | FileCheck %s --check-prefixes=X86-AVX,X86-AVX1
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X86-AVX,X86-AVX512
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X86-AVX,AVX512-i32
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=X86-AVX,AVX512-i32
 ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=X64-AVX-i32,X64-AVX1-i32
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i32,X64-AVX512-i32
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i32,AVX512-i32
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=X64-AVX-i32,AVX512-i32
 ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=X64-AVX-i64,X64-AVX1-i64
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i64,X64-AVX512-i64
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i64,AVX512-i64
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=X64-AVX-i64,AVX512DQ-i64
 
 define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
 ; X86-SSE2-LABEL: lrint_v1f32:
@@ -35,64 +38,43 @@ declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float>)
 define <2 x iXLen> @lrint_v2f32(<2 x float> %x) {
 ; X86-SSE2-LABEL: lrint_v2f32:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movaps %xmm0, %xmm1
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
-; X86-SSE2-NEXT:    cvtss2si %xmm1, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm1
-; X86-SSE2-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm2, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm2
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm0, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm1
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE2-NEXT:    cvtss2si %xmm0, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm0
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    cvtps2dq %xmm0, %xmm0
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-AVX-LABEL: lrint_v2f32:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-AVX-NEXT:    vcvtss2si %xmm1, %eax
-; X86-AVX-NEXT:    vcvtss2si %xmm0, %ecx
-; X86-AVX-NEXT:    vmovd %ecx, %xmm1
-; X86-AVX-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X86-AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X86-AVX-NEXT:    vcvtss2si %xmm2, %eax
-; X86-AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X86-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X86-AVX-NEXT:    vcvtss2si %xmm0, %eax
-; X86-AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; X86-AVX-NEXT:    vcvtps2dq %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-AVX-i32-LABEL: lrint_v2f32:
 ; X64-AVX-i32:       # %bb.0:
-; X64-AVX-i32-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm1, %eax
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm0, %ecx
-; X64-AVX-i32-NEXT:    vmovd %ecx, %xmm1
-; X64-AVX-i32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X64-AVX-i32-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm2, %eax
-; X64-AVX-i32-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X64-AVX-i32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm0, %eax
-; X64-AVX-i32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; X64-AVX-i32-NEXT:    vcvtps2dq %xmm0, %xmm0
 ; X64-AVX-i32-NEXT:    retq
 ;
-; X64-AVX-i64-LABEL: lrint_v2f32:
-; X64-AVX-i64:       # %bb.0:
-; X64-AVX-i64-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX-i64-NEXT:    vmovq %rax, %xmm1
-; X64-AVX-i64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-AVX-i64-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX-i64-NEXT:    vmovq %rax, %xmm0
-; X64-AVX-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-AVX-i64-NEXT:    retq
+; X64-AVX1-i64-LABEL: lrint_v2f32:
+; X64-AVX1-i64:       # %bb.0:
+; X64-AVX1-i64-NEXT:    vcvtss2si %xmm0, %rax
+; X64-AVX1-i64-NEXT:    vmovq %rax, %xmm1
+; X64-AVX1-i64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX1-i64-NEXT:    vcvtss2si %xmm0, %rax
+; X64-AVX1-i64-NEXT:    vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-AVX1-i64-NEXT:    retq
+;
+; AVX512-i64-LABEL: lrint_v2f32:
+; AVX512-i64:       # %bb.0:
+; AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm1
+; AVX512-i64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm0
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-i64-NEXT:    retq
+;
+; AVX512DQ-i64-LABEL: lrint_v2f32:
+; AVX512DQ-i64:       # %bb.0:
+; AVX512DQ-i64-NEXT:    vcvtps2qq %xmm0, %xmm0
+; AVX512DQ-i64-NEXT:    retq
   %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float> %x)
   ret <2 x iXLen> %a
 }
@@ -101,53 +83,17 @@ declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float>)
 define <4 x iXLen> @lrint_v4f32(<4 x float> %x) {
 ; X86-SSE2-LABEL: lrint_v4f32:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movaps %xmm0, %xmm1
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
-; X86-SSE2-NEXT:    cvtss2si %xmm1, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm1
-; X86-SSE2-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm2, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm2
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm0, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm1
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE2-NEXT:    cvtss2si %xmm0, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm0
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    cvtps2dq %xmm0, %xmm0
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-AVX-LABEL: lrint_v4f32:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-AVX-NEXT:    vcvtss2si %xmm1, %eax
-; X86-AVX-NEXT:    vcvtss2si %xmm0, %ecx
-; X86-AVX-NEXT:    vmovd %ecx, %xmm1
-; X86-AVX-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X86-AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X86-AVX-NEXT:    vcvtss2si %xmm2, %eax
-; X86-AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X86-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X86-AVX-NEXT:    vcvtss2si %xmm0, %eax
-; X86-AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; X86-AVX-NEXT:    vcvtps2dq %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-AVX-i32-LABEL: lrint_v4f32:
 ; X64-AVX-i32:       # %bb.0:
-; X64-AVX-i32-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm1, %eax
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm0, %ecx
-; X64-AVX-i32-NEXT:    vmovd %ecx, %xmm1
-; X64-AVX-i32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X64-AVX-i32-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm2, %eax
-; X64-AVX-i32-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X64-AVX-i32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm0, %eax
-; X64-AVX-i32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; X64-AVX-i32-NEXT:    vcvtps2dq %xmm0, %xmm0
 ; X64-AVX-i32-NEXT:    retq
 ;
 ; X64-AVX1-i64-LABEL: lrint_v4f32:
@@ -168,23 +114,28 @@ define <4 x iXLen> @lrint_v4f32(<4 x float> %x) {
 ; X64-AVX1-i64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X64-AVX1-i64-NEXT:    retq
 ;
-; X64-AVX512-i64-LABEL: lrint_v4f32:
-; X64-AVX512-i64:       # %bb.0:
-; X64-AVX512-i64-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm1, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm1
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm2, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm0
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; X64-AVX512-i64-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX512-i64-NEXT:    retq
+; AVX512-i64-LABEL: lrint_v4f32:
+; AVX512-i64:       # %bb.0:
+; AVX512-i64-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512-i64-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm1
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512-i64-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm0
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512-i64-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-i64-NEXT:    retq
+;
+; AVX512DQ-i64-LABEL: lrint_v4f32:
+; AVX512DQ-i64:       # %bb.0:
+; AVX512DQ-i64-NEXT:    vcvtps2qq %xmm0, %ymm0
+; AVX512DQ-i64-NEXT:    retq
   %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float> %x)
   ret <4 x iXLen> %a
 }
@@ -193,152 +144,19 @@ declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float>)
 define <8 x iXLen> @lrint_v8f32(<8 x float> %x) {
 ; X86-SSE2-LABEL: lrint_v8f32:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X86-SSE2-NEXT:    cvtss2si %xmm0, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm0
-; X86-SSE2-NEXT:    movaps %xmm2, %xmm3
-; X86-SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm3, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm3
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm2, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm0
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; X86-SSE2-NEXT:    cvtss2si %xmm2, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm2
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; X86-SSE2-NEXT:    movaps %xmm1, %xmm2
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3]
-; X86-SSE2-NEXT:    cvtss2si %xmm2, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm2
-; X86-SSE2-NEXT:    movaps %xmm1, %xmm3
-; X86-SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm3, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm3
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm1, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm2
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT:    cvtss2si %xmm1, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm1
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    cvtps2dq %xmm0, %xmm0
+; X86-SSE2-NEXT:    cvtps2dq %xmm1, %xmm1
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-AVX1-LABEL: lrint_v8f32:
-; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X86-AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X86-AVX1-NEXT:    vcvtss2si %xmm2, %eax
-; X86-AVX1-NEXT:    vcvtss2si %xmm1, %ecx
-; X86-AVX1-NEXT:    vmovd %ecx, %xmm2
-; X86-AVX1-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
-; X86-AVX1-NEXT:    vcvtss2si %xmm3, %eax
-; X86-AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X86-AVX1-NEXT:    vcvtss2si %xmm1, %eax
-; X86-AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; X86-AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-AVX1-NEXT:    vcvtss2si %xmm2, %eax
-; X86-AVX1-NEXT:    vcvtss2si %xmm0, %ecx
-; X86-AVX1-NEXT:    vmovd %ecx, %xmm2
-; X86-AVX1-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; X86-AVX1-NEXT:    vcvtss2si %xmm3, %eax
-; X86-AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X86-AVX1-NEXT:    vcvtss2si %xmm0, %eax
-; X86-AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X86-AVX1-NEXT:    retl
-;
-; X86-AVX512-LABEL: lrint_v8f32:
-; X86-AVX512:       # %bb.0:
-; X86-AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X86-AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X86-AVX512-NEXT:    vcvtss2si %xmm2, %eax
-; X86-AVX512-NEXT:    vcvtss2si %xmm1, %ecx
-; X86-AVX512-NEXT:    vmovd %ecx, %xmm2
-; X86-AVX512-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
-; X86-AVX512-NEXT:    vcvtss2si %xmm3, %eax
-; X86-AVX512-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X86-AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X86-AVX512-NEXT:    vcvtss2si %xmm1, %eax
-; X86-AVX512-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; X86-AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-AVX512-NEXT:    vcvtss2si %xmm2, %eax
-; X86-AVX512-NEXT:    vcvtss2si %xmm0, %ecx
-; X86-AVX512-NEXT:    vmovd %ecx, %xmm2
-; X86-AVX512-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; X86-AVX512-NEXT:    vcvtss2si %xmm3, %eax
-; X86-AVX512-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X86-AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X86-AVX512-NEXT:    vcvtss2si %xmm0, %eax
-; X86-AVX512-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; X86-AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X86-AVX512-NEXT:    retl
-;
-; X64-AVX1-i32-LABEL: lrint_v8f32:
-; X64-AVX1-i32:       # %bb.0:
-; X64-AVX1-i32-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X64-AVX1-i32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm2, %eax
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm1, %ecx
-; X64-AVX1-i32-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX1-i32-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm3, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm1, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; X64-AVX1-i32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm2, %eax
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm0, %ecx
-; X64-AVX1-i32-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX1-i32-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm3, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm0, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; X64-AVX1-i32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX1-i32-NEXT:    retq
+; X86-AVX-LABEL: lrint_v8f32:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    vcvtps2dq %ymm0, %ymm0
+; X86-AVX-NEXT:    retl
 ;
-; X64-AVX512-i32-LABEL: lrint_v8f32:
-; X64-AVX512-i32:       # %bb.0:
-; X64-AVX512-i32-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X64-AVX512-i32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm2, %eax
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm1, %ecx
-; X64-AVX512-i32-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX512-i32-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X64-AVX512-i32-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm3, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X64-AVX512-i32-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm1, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; X64-AVX512-i32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm2, %eax
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm0, %ecx
-; X64-AVX512-i32-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX512-i32-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X64-AVX512-i32-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm3, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X64-AVX512-i32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm0, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; X64-AVX512-i32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX512-i32-NEXT:    retq
+; X64-AVX-i32-LABEL: lrint_v8f32:
+; X64-AVX-i32:       # %bb.0:
+; X64-AVX-i32-NEXT:    vcvtps2dq %ymm0, %ymm0
+; X64-AVX-i32-NEXT:    retq
 ;
 ; X64-AVX1-i64-LABEL: lrint_v8f32:
 ; X64-AVX1-i64:       # %bb.0:
@@ -374,39 +192,44 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) {
 ; X64-AVX1-i64-NEXT:    vmovaps %ymm2, %ymm0
 ; X64-AVX1-i64-NEXT:    retq
 ;
-; X64-AVX512-i64-LABEL: lrint_v8f32:
-; X64-AVX512-i64:       # %bb.0:
-; X64-AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X64-AVX512-i64-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm2, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm3, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm1, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; X64-AVX512-i64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm1, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm1
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
-; X64-AVX512-i64-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; X64-AVX512-i64-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm2, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm3, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; X64-AVX512-i64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm0
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; X64-AVX512-i64-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; X64-AVX512-i64-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X64-AVX512-i64-NEXT:    retq
+; AVX512-i64-LABEL: lrint_v8f32:
+; AVX512-i64:       # %bb.0:
+; AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512-i64-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX512-i64-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX512-i64-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-i64-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
+; AVX512-i64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX512-i64-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm1
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512-i64-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-i64-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512-i64-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512-i64-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
+; AVX512-i64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm0
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX512-i64-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-i64-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-i64-NEXT:    retq
+;
+; AVX512DQ-i64-LABEL: lrint_v8f32:
+; AVX512DQ-i64:       # %bb.0:
+; AVX512DQ-i64-NEXT:    vcvtps2qq %ymm0, %zmm0
+; AVX512DQ-i64-NEXT:    retq
   %a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f32(<8 x float> %x)
   ret <8 x iXLen> %a
 }
@@ -473,15 +296,30 @@ define <2 x iXLen> @lrint_v2f64(<2 x double> %x) {
 ; X64-AVX-i32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
 ; X64-AVX-i32-NEXT:    retq
 ;
-; X64-AVX-i64-LABEL: lrint_v2f64:
-; X64-AVX-i64:       # %bb.0:
-; X64-AVX-i64-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX-i64-NEXT:    vmovq %rax, %xmm1
-; X64-AVX-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X64-AVX-i64-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX-i64-NEXT:    vmovq %rax, %xmm0
-; X64-AVX-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-AVX-i64-NEXT:    retq
+; X64-AVX1-i64-LABEL: lrint_v2f64:
+; X64-AVX1-i64:       # %bb.0:
+; X64-AVX1-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; X64-AVX1-i64-NEXT:    vmovq %rax, %xmm1
+; X64-AVX1-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-AVX1-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; X64-AVX1-i64-NEXT:    vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-AVX1-i64-NEXT:    retq
+;
+; AVX512-i64-LABEL: lrint_v2f64:
+; AVX512-i64:       # %bb.0:
+; AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm1
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm0
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-i64-NEXT:    retq
+;
+; AVX512DQ-i64-LABEL: lrint_v2f64:
+; AVX512DQ-i64:       # %bb.0:
+; AVX512DQ-i64-NEXT:    vcvtpd2qq %xmm0, %xmm0
+; AVX512DQ-i64-NEXT:    retq
   %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f64(<2 x double> %x)
   ret <2 x iXLen> %a
 }
@@ -508,33 +346,13 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
 ;
 ; X86-AVX-LABEL: lrint_v4f64:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; X86-AVX-NEXT:    vcvtsd2si %xmm1, %eax
-; X86-AVX-NEXT:    vcvtsd2si %xmm0, %ecx
-; X86-AVX-NEXT:    vmovd %ecx, %xmm1
-; X86-AVX-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X86-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-AVX-NEXT:    vcvtsd2si %xmm0, %eax
-; X86-AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X86-AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X86-AVX-NEXT:    vcvtsd2si %xmm0, %eax
-; X86-AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; X86-AVX-NEXT:    vcvtpd2dq %ymm0, %xmm0
 ; X86-AVX-NEXT:    vzeroupper
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-AVX-i32-LABEL: lrint_v4f64:
 ; X64-AVX-i32:       # %bb.0:
-; X64-AVX-i32-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; X64-AVX-i32-NEXT:    vcvtsd2si %xmm1, %eax
-; X64-AVX-i32-NEXT:    vcvtsd2si %xmm0, %ecx
-; X64-AVX-i32-NEXT:    vmovd %ecx, %xmm1
-; X64-AVX-i32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X64-AVX-i32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X64-AVX-i32-NEXT:    vcvtsd2si %xmm0, %eax
-; X64-AVX-i32-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X64-AVX-i32-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X64-AVX-i32-NEXT:    vcvtsd2si %xmm0, %eax
-; X64-AVX-i32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; X64-AVX-i32-NEXT:    vcvtpd2dq %ymm0, %xmm0
 ; X64-AVX-i32-NEXT:    vzeroupper
 ; X64-AVX-i32-NEXT:    retq
 ;
@@ -556,23 +374,28 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
 ; X64-AVX1-i64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X64-AVX1-i64-NEXT:    retq
 ;
-; X64-AVX512-i64-LABEL: lrint_v4f64:
-; X64-AVX512-i64:       # %bb.0:
-; X64-AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm1
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm0
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; X64-AVX512-i64-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX512-i64-NEXT:    retq
+; AVX512-i64-LABEL: lrint_v4f64:
+; AVX512-i64:       # %bb.0:
+; AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm1
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm0
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512-i64-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-i64-NEXT:    retq
+;
+; AVX512DQ-i64-LABEL: lrint_v4f64:
+; AVX512DQ-i64:       # %bb.0:
+; AVX512DQ-i64-NEXT:    vcvtpd2qq %ymm0, %ymm0
+; AVX512DQ-i64-NEXT:    retq
   %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f64(<4 x double> %x)
   ret <4 x iXLen> %a
 }
@@ -623,114 +446,23 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ;
 ; X86-AVX1-LABEL: lrint_v8f64:
 ; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; X86-AVX1-NEXT:    vcvtsd2si %xmm2, %eax
-; X86-AVX1-NEXT:    vcvtsd2si %xmm1, %ecx
-; X86-AVX1-NEXT:    vmovd %ecx, %xmm2
-; X86-AVX1-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; X86-AVX1-NEXT:    vcvtsd2si %xmm1, %eax
-; X86-AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; X86-AVX1-NEXT:    vcvtsd2si %xmm1, %eax
-; X86-AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X86-AVX1-NEXT:    vcvtsd2si %xmm2, %eax
-; X86-AVX1-NEXT:    vcvtsd2si %xmm0, %ecx
-; X86-AVX1-NEXT:    vmovd %ecx, %xmm2
-; X86-AVX1-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-AVX1-NEXT:    vcvtsd2si %xmm0, %eax
-; X86-AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X86-AVX1-NEXT:    vcvtsd2si %xmm0, %eax
-; X86-AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
+; X86-AVX1-NEXT:    vcvtpd2dq %ymm0, %xmm0
+; X86-AVX1-NEXT:    vcvtpd2dq %ymm1, %xmm1
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-AVX1-NEXT:    retl
 ;
-; X86-AVX512-LABEL: lrint_v8f64:
-; X86-AVX512:       # %bb.0:
-; X86-AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm1
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; X86-AVX512-NEXT:    vcvtsd2si %xmm2, %eax
-; X86-AVX512-NEXT:    vcvtsd2si %xmm1, %ecx
-; X86-AVX512-NEXT:    vmovd %ecx, %xmm1
-; X86-AVX512-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
-; X86-AVX512-NEXT:    vcvtsd2si %xmm2, %eax
-; X86-AVX512-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
-; X86-AVX512-NEXT:    vcvtsd2si %xmm2, %eax
-; X86-AVX512-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X86-AVX512-NEXT:    vcvtsd2si %xmm2, %eax
-; X86-AVX512-NEXT:    vcvtsd2si %xmm0, %ecx
-; X86-AVX512-NEXT:    vmovd %ecx, %xmm2
-; X86-AVX512-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-AVX512-NEXT:    vcvtsd2si %xmm0, %eax
-; X86-AVX512-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X86-AVX512-NEXT:    vcvtsd2si %xmm0, %eax
-; X86-AVX512-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; X86-AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X86-AVX512-NEXT:    retl
+; AVX512-i32-LABEL: lrint_v8f64:
+; AVX512-i32:       # %bb.0:
+; AVX512-i32-NEXT:    vcvtpd2dq %zmm0, %ymm0
+; AVX512-i32-NEXT:    ret{{[l|q]}}
 ;
 ; X64-AVX1-i32-LABEL: lrint_v8f64:
 ; X64-AVX1-i32:       # %bb.0:
-; X64-AVX1-i32-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm2, %eax
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm1, %ecx
-; X64-AVX1-i32-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX1-i32-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm1, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm1, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; X64-AVX1-i32-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm2, %eax
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm0, %ecx
-; X64-AVX1-i32-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX1-i32-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm0, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm0, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
+; X64-AVX1-i32-NEXT:    vcvtpd2dq %ymm0, %xmm0
+; X64-AVX1-i32-NEXT:    vcvtpd2dq %ymm1, %xmm1
 ; X64-AVX1-i32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X64-AVX1-i32-NEXT:    retq
 ;
-; X64-AVX512-i32-LABEL: lrint_v8f64:
-; X64-AVX512-i32:       # %bb.0:
-; X64-AVX512-i32-NEXT:    vextractf32x4 $2, %zmm0, %xmm1
-; X64-AVX512-i32-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm2, %eax
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm1, %ecx
-; X64-AVX512-i32-NEXT:    vmovd %ecx, %xmm1
-; X64-AVX512-i32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X64-AVX512-i32-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm2, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X64-AVX512-i32-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm2, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
-; X64-AVX512-i32-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm2, %eax
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm0, %ecx
-; X64-AVX512-i32-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX512-i32-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X64-AVX512-i32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm0, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X64-AVX512-i32-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm0, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; X64-AVX512-i32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX512-i32-NEXT:    retq
-;
 ; X64-AVX1-i64-LABEL: lrint_v8f64:
 ; X64-AVX1-i64:       # %bb.0:
 ; X64-AVX1-i64-NEXT:    vextractf128 $1, %ymm0, %xmm2
@@ -763,39 +495,44 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; X64-AVX1-i64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; X64-AVX1-i64-NEXT:    retq
 ;
-; X64-AVX512-i64-LABEL: lrint_v8f64:
-; X64-AVX512-i64:       # %bb.0:
-; X64-AVX512-i64-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm1
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; X64-AVX512-i64-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; X64-AVX512-i64-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; X64-AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm0
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; X64-AVX512-i64-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; X64-AVX512-i64-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X64-AVX512-i64-NEXT:    retq
+; AVX512-i64-LABEL: lrint_v8f64:
+; AVX512-i64:       # %bb.0:
+; AVX512-i64-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
+; AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm1
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-i64-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
+; AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-i64-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm0
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX512-i64-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-i64-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-i64-NEXT:    retq
+;
+; AVX512DQ-i64-LABEL: lrint_v8f64:
+; AVX512DQ-i64:       # %bb.0:
+; AVX512DQ-i64-NEXT:    vcvtpd2qq %zmm0, %zmm0
+; AVX512DQ-i64-NEXT:    retq
   %a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f64(<8 x double> %x)
   ret <8 x iXLen> %a
 }
diff --git a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll
index fd29d09d9196..e33c99be0ed0 100644
--- a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll
@@ -115,7 +115,7 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r
 ; SSE-NEXT:    cmpq %rcx, %rdx
 ; SSE-NEXT:    jne .LBB0_4
 ; SSE-NEXT:  # %bb.5: # %middle.block
-; SSE-NEXT:    cmpq %r9, %rdx
+; SSE-NEXT:    cmpl %r9d, %edx
 ; SSE-NEXT:    jne .LBB0_6
 ; SSE-NEXT:  .LBB0_9: # %for.cond.cleanup
 ; SSE-NEXT:    retq
@@ -239,7 +239,7 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r
 ; AVX1-NEXT:    cmpq %rcx, %rdx
 ; AVX1-NEXT:    jne .LBB0_4
 ; AVX1-NEXT:  # %bb.5: # %middle.block
-; AVX1-NEXT:    cmpq %r9, %rdx
+; AVX1-NEXT:    cmpl %r9d, %edx
 ; AVX1-NEXT:    jne .LBB0_6
 ; AVX1-NEXT:  .LBB0_9: # %for.cond.cleanup
 ; AVX1-NEXT:    vzeroupper
@@ -314,7 +314,7 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r
 ; AVX2-NEXT:    cmpq %rcx, %rdx
 ; AVX2-NEXT:    jne .LBB0_4
 ; AVX2-NEXT:  # %bb.5: # %middle.block
-; AVX2-NEXT:    cmpq %r9, %rdx
+; AVX2-NEXT:    cmpl %r9d, %edx
 ; AVX2-NEXT:    jne .LBB0_6
 ; AVX2-NEXT:  .LBB0_9: # %for.cond.cleanup
 ; AVX2-NEXT:    vzeroupper
@@ -413,7 +413,7 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r
 ; XOP-NEXT:    cmpq %rcx, %rdx
 ; XOP-NEXT:    jne .LBB0_4
 ; XOP-NEXT:  # %bb.5: # %middle.block
-; XOP-NEXT:    cmpq %r9, %rdx
+; XOP-NEXT:    cmpl %r9d, %edx
 ; XOP-NEXT:    jne .LBB0_6
 ; XOP-NEXT:  .LBB0_9: # %for.cond.cleanup
 ; XOP-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index bc95fd42e6b8..ced9304f4c59 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -699,6 +699,28 @@ define <4 x double> @shuffle_v4f64_0437(<4 x double> %a, <4 x double> %b) {
   ret <4 x double> %shuffle
 }
 
+; PR91433
+define <4 x double> @shuffle_v4f64_2303(<4 x double> %a) {
+; AVX1-LABEL: shuffle_v4f64_2303:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_2303:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,3]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v4f64_2303:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,3]
+; AVX512VL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 3>
+  ret <4 x double> %shuffle
+}
+
 define <4 x double> @shuffle_v4f64_0z3z(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_0z3z:
 ; ALL:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 4859a8e0eaaa..81ce14132c87 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -308,16 +308,14 @@ define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
 define <8 x i32> @combine_blend_of_permutes_v8i32(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX1-LABEL: combine_blend_of_permutes_v8i32:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6],ymm1[7]
 ; AVX1-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: combine_blend_of_permutes_v8i32:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6],ymm1[7]
 ; AVX2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX512-LABEL: combine_blend_of_permutes_v8i32:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index 8d213d257743..50be3c5bc6c0 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -25,23 +25,20 @@ define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) {
 define <4 x i32> @combine_blend_of_permutes_v4i32(<2 x i64> %a0, <2 x i64> %a1) {
 ; SSE-LABEL: combine_blend_of_permutes_v4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_blend_of_permutes_v4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_blend_of_permutes_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: combine_blend_of_permutes_v4i32:
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index f7a27a5b9144..9ae1f270e883 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -65,7 +65,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    movzwl (%rdi), %eax
-; X64-NO-BMI2-NEXT:    movzwl %ax, %eax
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrl %cl, %eax
@@ -75,7 +74,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    movzwl (%rdi), %eax
-; X64-BMI2-NEXT:    movzwl %ax, %eax
 ; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
 ; X64-BMI2-NEXT:    movb %al, (%rdx)
@@ -83,15 +81,14 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2:       # %bb.0:
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-NO-BMI2-NEXT:    movzwl %dx, %edx
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT:    movzwl (%eax), %eax
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NEXT:    movb %dl, (%eax)
+; X86-NO-BMI2-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NEXT:    movb %al, (%edx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
@@ -100,7 +97,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-BMI2-NEXT:    movzwl %dx, %edx
 ; X86-BMI2-NEXT:    shll $3, %ecx
 ; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
 ; X86-BMI2-NEXT:    movb %cl, (%eax)
@@ -123,7 +119,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    movzwl (%rdi), %eax
-; X64-NO-BMI2-NEXT:    movzwl %ax, %eax
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrl %cl, %eax
@@ -133,7 +128,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    movzwl (%rdi), %eax
-; X64-BMI2-NEXT:    movzwl %ax, %eax
 ; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
 ; X64-BMI2-NEXT:    movw %ax, (%rdx)
@@ -145,7 +139,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-NO-BMI2-NEXT:    movzwl %dx, %edx
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NEXT:    shrl %cl, %edx
@@ -158,7 +151,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-BMI2-NEXT:    movzwl %dx, %edx
 ; X86-BMI2-NEXT:    shll $3, %ecx
 ; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
 ; X86-BMI2-NEXT:    movw %cx, (%eax)
@@ -179,9 +171,8 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
@@ -189,9 +180,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movq %xmm0, %rax
+; X64-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movb %al, (%rdx)
 ; X64-BMI2-NEXT:    retq
@@ -199,99 +189,49 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
+; X86-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl (%edx), %edx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorl %esi, %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    testb $32, %cl
+; X86-SHLD-NEXT:    cmovnel %esi, %edx
+; X86-SHLD-NEXT:    movb %dl, (%eax)
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %bl, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <4 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -308,9 +248,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
@@ -318,107 +257,58 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movq %xmm0, %rax
+; X64-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movw %ax, (%rdx)
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
+; X86-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl (%edx), %edx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorl %esi, %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    testb $32, %cl
+; X86-SHLD-NEXT:    cmovnel %esi, %edx
+; X86-SHLD-NEXT:    movw %dx, (%eax)
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %si, (%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <4 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -434,9 +324,8 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
@@ -444,107 +333,58 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movq %xmm0, %rax
+; X64-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movl %eax, (%rdx)
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
+; X86-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl (%edx), %edx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorl %esi, %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    testb $32, %cl
+; X86-SHLD-NEXT:    cmovnel %esi, %edx
+; X86-SHLD-NEXT:    movl %edx, (%eax)
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <4 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -560,88 +400,51 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb %sil, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+; X64-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movq %rsi, %rcx
+; X64-SHLD-NEXT:    movq (%rdi), %rax
+; X64-SHLD-NEXT:    shll $3, %ecx
+; X64-SHLD-NEXT:    xorl %esi, %esi
+; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT:    testb $64, %cl
+; X64-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-SHLD-NEXT:    movb %al, (%rdx)
+; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
 ; X86-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -670,88 +473,51 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+; X64-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movq %rsi, %rcx
+; X64-SHLD-NEXT:    movq (%rdi), %rax
+; X64-SHLD-NEXT:    shll $3, %ecx
+; X64-SHLD-NEXT:    xorl %esi, %esi
+; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT:    testb $64, %cl
+; X64-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
 ; X86-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -779,88 +545,51 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+; X64-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movq %rsi, %rcx
+; X64-SHLD-NEXT:    movq (%rdi), %rax
+; X64-SHLD-NEXT:    shll $3, %ecx
+; X64-SHLD-NEXT:    xorl %esi, %esi
+; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT:    testb $64, %cl
+; X64-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
 ; X86-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -888,88 +617,51 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+; X64-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movq %rsi, %rcx
+; X64-SHLD-NEXT:    movq (%rdi), %rax
+; X64-SHLD-NEXT:    shll $3, %ecx
+; X64-SHLD-NEXT:    xorl %esi, %esi
+; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT:    testb $64, %cl
+; X64-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
 ; X86-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -1941,7 +1633,9 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
+; X64-HAVE-BMI2-HAVE-SHLD: {{.*}}
+; X64-NO-BMI2-HAVE-SHLD: {{.*}}
 ; X64-NO-SHLD: {{.*}}
-; X64-SHLD: {{.*}}
+; X86-HAVE-BMI2-HAVE-SHLD: {{.*}}
+; X86-NO-BMI2-HAVE-SHLD: {{.*}}
 ; X86-NO-SHLD: {{.*}}
-; X86-SHLD: {{.*}}
diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll
index 25d92033cc6b..944ffab24a5d 100644
--- a/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll
+++ b/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll
@@ -1,290 +1,290 @@
-; RUN: llc -verify-machineinstrs < %s | FileCheck %s
-
-; CHECK-LABEL: "$cppxdata$?crash@@YAXH@Z":
-; CHECK:	.long	("$stateUnwindMap$?crash@@YAXH@Z")
-; CHECK:        .long   ("$tryMap$?crash@@YAXH@Z")@IMGREL # TryBlockMap
-; CHECK-NEXT:   .long   6                       # IPMapEntries
-; CHECK-NEXT:	.long	("$ip2state$?crash@@YAXH@Z")
-
-; CHECK-LABEL: "$stateUnwindMap$?crash@@YAXH@Z":
-; CHECK-NEXT:        .long   -1                
-; CHECK-NEXT:        .long   0                 
-; CHECK-NEXT:        .long   0                 
-; CHECK-NEXT:        .long   "?dtor$
-; CHECK-NEXT:        .long   -1                
-; CHECK-NEXT:        .long   0                 
-
-; CHECK-LABEL: "$tryMap$?crash@@YAXH@Z":
-; CHECK-NEXT:        .long   0             
-; CHECK-NEXT:        .long   1             
-; CHECK-NEXT:        .long   2             
-; CHECK-NEXT:        .long   1             
-; CHECK-NEXT:        .long   ("$handlerMap$
-
-; CHECK:       "$handlerMap$0$?crash@@YAXH@Z"
-; CHECK-NEXT:        .long   0             
-; CHECK-NEXT:        .long   0             
-; CHECK-NEXT:        .long   0             
-; CHECK-NEXT:        .long   "?catch$ 
-
-; CHECK-LABEL: "$ip2state$?crash@@YAXH@Z":
-; CHECK-NEXT:	.long	.Lfunc_begin0@IMGREL
-; CHECK-NEXT:	.long	-1                  
-; CHECK-NEXT:	.long	.Ltmp     
-; CHECK-NEXT:	.long	0                   
-; CHECK-NEXT:	.long	.Ltmp     
-; CHECK-NEXT:	.long	1                   
-; CHECK-NEXT:	.long	.Ltmp
-; CHECK-NEXT:	.long	0                  
-; CHECK-NEXT:	.long	.Ltmp
-; CHECK-NEXT:	.long	-1                                  
-; CHECK-NEXT:	.long	"?catch$
-; CHECK-NEXT:	.long	2                  
-
-; ModuleID = 'windows-seh-EHa-CppCatchDotDotDot.cpp'
-source_filename = "windows-seh-EHa-CppCatchDotDotDot.cpp"
-target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-windows-msvc"
-
-%rtti.TypeDescriptor2 = type { ptr, ptr, [3 x i8] }
-%eh.CatchableType = type { i32, i32, i32, i32, i32, i32, i32 }
-%eh.CatchableTypeArray.1 = type { i32, [1 x i32] }
-%eh.ThrowInfo = type { i32, i32, i32, i32 }
-%struct.A = type { i8 }
-
-$"??_C@_0BJ@EIKFKKLB@?5in?5catch?$CI?4?4?4?$CJ?5funclet?5?6?$AA@" = comdat any
-
-$"??_R0H@8" = comdat any
-
-$"_CT??_R0H@84" = comdat any
-
-$_CTA1H = comdat any
-
-$_TI1H = comdat any
-
-$"??_C@_0CN@MKCAOFNA@?5Test?5CPP?5unwind?3?5in?5except?5hand@" = comdat any
-
-$"??_C@_0N@LJHFFAKD@?5in?5A?5ctor?5?6?$AA@" = comdat any
-
-$"??_C@_0N@HMNCGOCN@?5in?5A?5dtor?5?6?$AA@" = comdat any
-
-@"?pt1@@3PEAHEA" = dso_local global ptr null, align 8
-@"?pt2@@3PEAHEA" = dso_local global ptr null, align 8
-@"?pt3@@3PEAHEA" = dso_local global ptr null, align 8
-@"?g@@3HA" = dso_local global i32 0, align 4
-@"??_C@_0BJ@EIKFKKLB@?5in?5catch?$CI?4?4?4?$CJ?5funclet?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [25 x i8] c" in catch(...) funclet \0A\00", comdat, align 1
-@"??_7type_info@@6B@" = external constant ptr
-@"??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { ptr @"??_7type_info@@6B@", ptr null, [3 x i8] c".H\00" }, comdat
-@__ImageBase = external dso_local constant i8
-@"_CT??_R0H@84" = linkonce_odr unnamed_addr constant %eh.CatchableType { i32 1, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (ptr @"??_R0H@8" to i64), i64 ptrtoint (ptr @__ImageBase to i64)) to i32), i32 0, i32 -1, i32 0, i32 4, i32 0 }, section ".xdata", comdat
-@_CTA1H = linkonce_odr unnamed_addr constant %eh.CatchableTypeArray.1 { i32 1, [1 x i32] [i32 trunc (i64 sub nuw nsw (i64 ptrtoint (ptr @"_CT??_R0H@84" to i64), i64 ptrtoint (ptr @__ImageBase to i64)) to i32)] }, section ".xdata", comdat
-@_TI1H = linkonce_odr unnamed_addr constant %eh.ThrowInfo { i32 0, i32 0, i32 0, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (ptr @_CTA1H to i64), i64 ptrtoint (ptr @__ImageBase to i64)) to i32) }, section ".xdata", comdat
-@"??_C@_0CN@MKCAOFNA@?5Test?5CPP?5unwind?3?5in?5except?5hand@" = linkonce_odr dso_local unnamed_addr constant [45 x i8] c" Test CPP unwind: in except handler i = %d \0A\00", comdat, align 1
-@"??_C@_0N@LJHFFAKD@?5in?5A?5ctor?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [13 x i8] c" in A ctor \0A\00", comdat, align 1
-@"??_C@_0N@HMNCGOCN@?5in?5A?5dtor?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [13 x i8] c" in A dtor \0A\00", comdat, align 1
-
-; Function Attrs: noinline nounwind optnone
-define dso_local void @"?foo@@YAXXZ"() #0 {
-entry:
-  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
-  ret void
-}
-
-; Function Attrs: noinline optnone
-define dso_local void @"?crash@@YAXH@Z"(i32 %i) #1 personality ptr @__CxxFrameHandler3 {
-entry:
-  %i.addr = alloca i32, align 4
-  %ObjA = alloca %struct.A, align 1
-  %tmp = alloca i32, align 4
-  store i32 %i, ptr %i.addr, align 4
-  %0 = load i32, ptr %i.addr, align 4
-  store i32 %0, ptr @"?g@@3HA", align 4
-  invoke void @llvm.seh.try.begin()
-          to label %invoke.cont unwind label %catch.dispatch
-
-invoke.cont:                                      ; preds = %entry
-  %call = invoke ptr @"??0A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjA)
-          to label %invoke.cont1 unwind label %catch.dispatch
-
-invoke.cont1:                                     ; preds = %invoke.cont
-  invoke void @llvm.seh.scope.begin()
-          to label %invoke.cont2 unwind label %ehcleanup
-
-invoke.cont2:                                     ; preds = %invoke.cont1
-  %1 = load i32, ptr %i.addr, align 4
-  %cmp = icmp eq i32 %1, 1
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %invoke.cont2
-  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %invoke.cont2
-  invoke void @llvm.seh.scope.end()
-          to label %invoke.cont3 unwind label %ehcleanup
-
-invoke.cont3:                                     ; preds = %if.end
-  call void @"??1A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjA) #6
-  br label %try.cont
-
-ehcleanup:                                        ; preds = %if.end, %invoke.cont1
-  %2 = cleanuppad within none []
-  call void @"??1A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjA) #6 [ "funclet"(token %2) ]
-  cleanupret from %2 unwind label %catch.dispatch
-
-catch.dispatch:                                   ; preds = %ehcleanup, %invoke.cont, %entry
-  %3 = catchswitch within none [label %catch] unwind to caller
-
-catch:                                            ; preds = %catch.dispatch
-  %4 = catchpad within %3 [ptr null, i32 0, ptr null]
-  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0BJ@EIKFKKLB@?5in?5catch?$CI?4?4?4?$CJ?5funclet?5?6?$AA@") [ "funclet"(token %4) ]
-  %5 = load i32, ptr %i.addr, align 4
-  %cmp4 = icmp eq i32 %5, 1
-  br i1 %cmp4, label %if.then5, label %if.end6
-
-if.then5:                                         ; preds = %catch
-  %6 = load i32, ptr %i.addr, align 4
-  store i32 %6, ptr %tmp, align 4
-  %7 = bitcast ptr %tmp to ptr
-  call void @_CxxThrowException(ptr %7, ptr @_TI1H) #7 [ "funclet"(token %4) ]
-  unreachable
-
-if.end6:                                          ; preds = %catch
-  catchret from %4 to label %catchret.dest
-
-catchret.dest:                                    ; preds = %if.end6
-  br label %try.cont
-
-try.cont:                                         ; preds = %catchret.dest, %invoke.cont3
-  ret void
-}
-
-; Function Attrs: nounwind willreturn
-declare dso_local void @llvm.seh.try.begin() #2
-
-declare dso_local i32 @__CxxFrameHandler3(...)
-
-; Function Attrs: noinline optnone
-define internal ptr @"??0A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr returned %this) unnamed_addr #1 align 2 {
-entry:
-  %retval = alloca ptr, align 8
-  %this.addr = alloca ptr, align 8
-  store ptr %this, ptr %this.addr, align 8
-  %this1 = load ptr, ptr %this.addr, align 8
-  store ptr %this1, ptr %retval, align 8
-  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0N@LJHFFAKD@?5in?5A?5ctor?5?6?$AA@")
-  %0 = load i32, ptr @"?g@@3HA", align 4
-  %cmp = icmp eq i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
-  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %entry
-  %1 = load ptr, ptr %retval, align 8
-  ret ptr %1
-}
-
-; Function Attrs: nounwind readnone
-declare dso_local void @llvm.seh.scope.begin() #3
-
-; Function Attrs: nounwind readnone
-declare dso_local void @llvm.seh.scope.end() #3
-
-; Function Attrs: noinline nounwind optnone
-define internal void @"??1A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %this) unnamed_addr #0 align 2 {
-entry:
-  %this.addr = alloca ptr, align 8
-  store ptr %this, ptr %this.addr, align 8
-  %this1 = load ptr, ptr %this.addr, align 8
-  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0N@HMNCGOCN@?5in?5A?5dtor?5?6?$AA@")
-  ret void
-}
-
-declare dso_local void @"?printf@@YAXZZ"(...) #4
-
-declare dso_local void @_CxxThrowException(ptr, ptr)
-
-; Function Attrs: noinline norecurse optnone
-define dso_local i32 @main() #5 personality ptr @__C_specific_handler {
-entry:
-  %retval = alloca i32, align 4
-  %i = alloca i32, align 4
-  %__exception_code = alloca i32, align 4
-  store i32 0, ptr %retval, align 4
-  store i32 0, ptr %i, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32, ptr %i, align 4
-  %cmp = icmp slt i32 %0, 2
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  invoke void @llvm.seh.try.begin()
-          to label %invoke.cont unwind label %catch.dispatch
-
-invoke.cont:                                      ; preds = %for.body
-  %1 = load volatile i32, ptr %i, align 4
-  invoke void @"?crash@@YAXH@Z"(i32 %1) #8
-          to label %invoke.cont1 unwind label %catch.dispatch
-
-invoke.cont1:                                     ; preds = %invoke.cont
-  invoke void @llvm.seh.try.end()
-          to label %invoke.cont2 unwind label %catch.dispatch
-
-catch.dispatch:                                   ; preds = %invoke.cont1, %invoke.cont, %for.body
-  %2 = catchswitch within none [label %__except] unwind to caller
-
-__except:                                         ; preds = %catch.dispatch
-  %3 = catchpad within %2 [ptr null]
-  catchret from %3 to label %__except3
-
-__except3:                                        ; preds = %__except
-  %4 = call i32 @llvm.eh.exceptioncode(token %3)
-  store i32 %4, ptr %__exception_code, align 4
-  %5 = load i32, ptr %i, align 4
-  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0CN@MKCAOFNA@?5Test?5CPP?5unwind?3?5in?5except?5hand@", i32 %5)
-  br label %__try.cont
-
-__try.cont:                                       ; preds = %__except3, %invoke.cont2
-  br label %for.inc
-
-for.inc:                                          ; preds = %__try.cont
-  %6 = load i32, ptr %i, align 4
-  %inc = add nsw i32 %6, 1
-  store i32 %inc, ptr %i, align 4
-  br label %for.cond
-
-invoke.cont2:                                     ; preds = %invoke.cont1
-  br label %__try.cont
-
-for.end:                                          ; preds = %for.cond
-  ret i32 0
-}
-
-declare dso_local i32 @__C_specific_handler(...)
-
-; Function Attrs: nounwind willreturn
-declare dso_local void @llvm.seh.try.end() #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.eh.exceptioncode(token) #3
-
-attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind willreturn }
-attributes #3 = { nounwind readnone }
-attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #6 = { nounwind }
-attributes #7 = { noreturn }
-attributes #8 = { noinline }
-
-!llvm.module.flags = !{!0, !1}
-
-!0 = !{i32 1, !"wchar_size", i32 2}
-!1 = !{i32 2, !"eh-asynch", i32 1}
-
-
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: "$cppxdata$?crash@@YAXH@Z":
+; CHECK:	.long	("$stateUnwindMap$?crash@@YAXH@Z")
+; CHECK:        .long   ("$tryMap$?crash@@YAXH@Z")@IMGREL # TryBlockMap
+; CHECK-NEXT:   .long   6                       # IPMapEntries
+; CHECK-NEXT:	.long	("$ip2state$?crash@@YAXH@Z")
+
+; CHECK-LABEL: "$stateUnwindMap$?crash@@YAXH@Z":
+; CHECK-NEXT:        .long   -1
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   "?dtor$
+; CHECK-NEXT:        .long   -1
+; CHECK-NEXT:        .long   0
+
+; CHECK-LABEL: "$tryMap$?crash@@YAXH@Z":
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   2
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   ("$handlerMap$
+
+; CHECK:       "$handlerMap$0$?crash@@YAXH@Z"
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   "?catch$
+
+; CHECK-LABEL: "$ip2state$?crash@@YAXH@Z":
+; CHECK-NEXT:	.long	.Lfunc_begin0@IMGREL
+; CHECK-NEXT:	.long	-1
+; CHECK-NEXT:	.long	.Ltmp
+; CHECK-NEXT:	.long	0
+; CHECK-NEXT:	.long	.Ltmp
+; CHECK-NEXT:	.long	1
+; CHECK-NEXT:	.long	.Ltmp
+; CHECK-NEXT:	.long	0
+; CHECK-NEXT:	.long	.Ltmp
+; CHECK-NEXT:	.long	-1
+; CHECK-NEXT:	.long	"?catch$
+; CHECK-NEXT:	.long	2
+
+; ModuleID = 'windows-seh-EHa-CppCatchDotDotDot.cpp'
+source_filename = "windows-seh-EHa-CppCatchDotDotDot.cpp"
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-windows-msvc"
+
+%rtti.TypeDescriptor2 = type { ptr, ptr, [3 x i8] }
+%eh.CatchableType = type { i32, i32, i32, i32, i32, i32, i32 }
+%eh.CatchableTypeArray.1 = type { i32, [1 x i32] }
+%eh.ThrowInfo = type { i32, i32, i32, i32 }
+%struct.A = type { i8 }
+
+$"??_C@_0BJ@EIKFKKLB@?5in?5catch?$CI?4?4?4?$CJ?5funclet?5?6?$AA@" = comdat any
+
+$"??_R0H@8" = comdat any
+
+$"_CT??_R0H@84" = comdat any
+
+$_CTA1H = comdat any
+
+$_TI1H = comdat any
+
+$"??_C@_0CN@MKCAOFNA@?5Test?5CPP?5unwind?3?5in?5except?5hand@" = comdat any
+
+$"??_C@_0N@LJHFFAKD@?5in?5A?5ctor?5?6?$AA@" = comdat any
+
+$"??_C@_0N@HMNCGOCN@?5in?5A?5dtor?5?6?$AA@" = comdat any
+
+@"?pt1@@3PEAHEA" = dso_local global ptr null, align 8
+@"?pt2@@3PEAHEA" = dso_local global ptr null, align 8
+@"?pt3@@3PEAHEA" = dso_local global ptr null, align 8
+@"?g@@3HA" = dso_local global i32 0, align 4
+@"??_C@_0BJ@EIKFKKLB@?5in?5catch?$CI?4?4?4?$CJ?5funclet?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [25 x i8] c" in catch(...) funclet \0A\00", comdat, align 1
+@"??_7type_info@@6B@" = external constant ptr
+@"??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { ptr @"??_7type_info@@6B@", ptr null, [3 x i8] c".H\00" }, comdat
+@__ImageBase = external dso_local constant i8
+@"_CT??_R0H@84" = linkonce_odr unnamed_addr constant %eh.CatchableType { i32 1, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (ptr @"??_R0H@8" to i64), i64 ptrtoint (ptr @__ImageBase to i64)) to i32), i32 0, i32 -1, i32 0, i32 4, i32 0 }, section ".xdata", comdat
+@_CTA1H = linkonce_odr unnamed_addr constant %eh.CatchableTypeArray.1 { i32 1, [1 x i32] [i32 trunc (i64 sub nuw nsw (i64 ptrtoint (ptr @"_CT??_R0H@84" to i64), i64 ptrtoint (ptr @__ImageBase to i64)) to i32)] }, section ".xdata", comdat
+@_TI1H = linkonce_odr unnamed_addr constant %eh.ThrowInfo { i32 0, i32 0, i32 0, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (ptr @_CTA1H to i64), i64 ptrtoint (ptr @__ImageBase to i64)) to i32) }, section ".xdata", comdat
+@"??_C@_0CN@MKCAOFNA@?5Test?5CPP?5unwind?3?5in?5except?5hand@" = linkonce_odr dso_local unnamed_addr constant [45 x i8] c" Test CPP unwind: in except handler i = %d \0A\00", comdat, align 1
+@"??_C@_0N@LJHFFAKD@?5in?5A?5ctor?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [13 x i8] c" in A ctor \0A\00", comdat, align 1
+@"??_C@_0N@HMNCGOCN@?5in?5A?5dtor?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [13 x i8] c" in A dtor \0A\00", comdat, align 1
+
+; Function Attrs: noinline nounwind optnone
+define dso_local void @"?foo@@YAXXZ"() #0 {
+entry:
+  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
+  ret void
+}
+
+; Function Attrs: noinline optnone
+define dso_local void @"?crash@@YAXH@Z"(i32 %i) #1 personality ptr @__CxxFrameHandler3 {
+entry:
+  %i.addr = alloca i32, align 4
+  %ObjA = alloca %struct.A, align 1
+  %tmp = alloca i32, align 4
+  store i32 %i, ptr %i.addr, align 4
+  %0 = load i32, ptr %i.addr, align 4
+  store i32 %0, ptr @"?g@@3HA", align 4
+  invoke void @llvm.seh.try.begin()
+          to label %invoke.cont unwind label %catch.dispatch
+
+invoke.cont:                                      ; preds = %entry
+  %call = invoke ptr @"??0A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjA)
+          to label %invoke.cont1 unwind label %catch.dispatch
+
+invoke.cont1:                                     ; preds = %invoke.cont
+  invoke void @llvm.seh.scope.begin()
+          to label %invoke.cont2 unwind label %ehcleanup
+
+invoke.cont2:                                     ; preds = %invoke.cont1
+  %1 = load i32, ptr %i.addr, align 4
+  %cmp = icmp eq i32 %1, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %invoke.cont2
+  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %invoke.cont2
+  invoke void @llvm.seh.scope.end()
+          to label %invoke.cont3 unwind label %ehcleanup
+
+invoke.cont3:                                     ; preds = %if.end
+  call void @"??1A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjA) #6
+  br label %try.cont
+
+ehcleanup:                                        ; preds = %if.end, %invoke.cont1
+  %2 = cleanuppad within none []
+  call void @"??1A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjA) #6 [ "funclet"(token %2) ]
+  cleanupret from %2 unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %ehcleanup, %invoke.cont, %entry
+  %3 = catchswitch within none [label %catch] unwind to caller
+
+catch:                                            ; preds = %catch.dispatch
+  %4 = catchpad within %3 [ptr null, i32 0, ptr null]
+  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0BJ@EIKFKKLB@?5in?5catch?$CI?4?4?4?$CJ?5funclet?5?6?$AA@") [ "funclet"(token %4) ]
+  %5 = load i32, ptr %i.addr, align 4
+  %cmp4 = icmp eq i32 %5, 1
+  br i1 %cmp4, label %if.then5, label %if.end6
+
+if.then5:                                         ; preds = %catch
+  %6 = load i32, ptr %i.addr, align 4
+  store i32 %6, ptr %tmp, align 4
+  %7 = bitcast ptr %tmp to ptr
+  call void @_CxxThrowException(ptr %7, ptr @_TI1H) #7 [ "funclet"(token %4) ]
+  unreachable
+
+if.end6:                                          ; preds = %catch
+  catchret from %4 to label %catchret.dest
+
+catchret.dest:                                    ; preds = %if.end6
+  br label %try.cont
+
+try.cont:                                         ; preds = %catchret.dest, %invoke.cont3
+  ret void
+}
+
+; Function Attrs: nounwind willreturn
+declare dso_local void @llvm.seh.try.begin() #2
+
+declare dso_local i32 @__CxxFrameHandler3(...)
+
+; Function Attrs: noinline optnone
+define internal ptr @"??0A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr returned %this) unnamed_addr #1 align 2 {
+entry:
+  %retval = alloca ptr, align 8
+  %this.addr = alloca ptr, align 8
+  store ptr %this, ptr %this.addr, align 8
+  %this1 = load ptr, ptr %this.addr, align 8
+  store ptr %this1, ptr %retval, align 8
+  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0N@LJHFFAKD@?5in?5A?5ctor?5?6?$AA@")
+  %0 = load i32, ptr @"?g@@3HA", align 4
+  %cmp = icmp eq i32 %0, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %1 = load ptr, ptr %retval, align 8
+  ret ptr %1
+}
+
+; Function Attrs: nounwind readnone
+declare dso_local void @llvm.seh.scope.begin() #3
+
+; Function Attrs: nounwind readnone
+declare dso_local void @llvm.seh.scope.end() #3
+
+; Function Attrs: noinline nounwind optnone
+define internal void @"??1A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %this) unnamed_addr #0 align 2 {
+entry:
+  %this.addr = alloca ptr, align 8
+  store ptr %this, ptr %this.addr, align 8
+  %this1 = load ptr, ptr %this.addr, align 8
+  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0N@HMNCGOCN@?5in?5A?5dtor?5?6?$AA@")
+  ret void
+}
+
+declare dso_local void @"?printf@@YAXZZ"(...) #4
+
+declare dso_local void @_CxxThrowException(ptr, ptr)
+
+; Function Attrs: noinline norecurse optnone
+define dso_local i32 @main() #5 personality ptr @__C_specific_handler {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  %__exception_code = alloca i32, align 4
+  store i32 0, ptr %retval, align 4
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4
+  %cmp = icmp slt i32 %0, 2
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  invoke void @llvm.seh.try.begin()
+          to label %invoke.cont unwind label %catch.dispatch
+
+invoke.cont:                                      ; preds = %for.body
+  %1 = load volatile i32, ptr %i, align 4
+  invoke void @"?crash@@YAXH@Z"(i32 %1) #8
+          to label %invoke.cont1 unwind label %catch.dispatch
+
+invoke.cont1:                                     ; preds = %invoke.cont
+  invoke void @llvm.seh.try.end()
+          to label %invoke.cont2 unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %invoke.cont1, %invoke.cont, %for.body
+  %2 = catchswitch within none [label %__except] unwind to caller
+
+__except:                                         ; preds = %catch.dispatch
+  %3 = catchpad within %2 [ptr null]
+  catchret from %3 to label %__except3
+
+__except3:                                        ; preds = %__except
+  %4 = call i32 @llvm.eh.exceptioncode(token %3)
+  store i32 %4, ptr %__exception_code, align 4
+  %5 = load i32, ptr %i, align 4
+  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0CN@MKCAOFNA@?5Test?5CPP?5unwind?3?5in?5except?5hand@", i32 %5)
+  br label %__try.cont
+
+__try.cont:                                       ; preds = %__except3, %invoke.cont2
+  br label %for.inc
+
+for.inc:                                          ; preds = %__try.cont
+  %6 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %6, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+
+invoke.cont2:                                     ; preds = %invoke.cont1
+  br label %__try.cont
+
+for.end:                                          ; preds = %for.cond
+  ret i32 0
+}
+
+declare dso_local i32 @__C_specific_handler(...)
+
+; Function Attrs: nounwind willreturn
+declare dso_local void @llvm.seh.try.end() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.exceptioncode(token) #3
+
+attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind willreturn }
+attributes #3 = { nounwind readnone }
+attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+attributes #7 = { noreturn }
+attributes #8 = { noinline }
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 1, !"wchar_size", i32 2}
+!1 = !{i32 2, !"eh-asynch", i32 1}
+
+
diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll
index 96b31316f39a..54c1d838a30f 100644
--- a/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll
+++ b/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll
@@ -1,255 +1,255 @@
-; RUN: llc -verify-machineinstrs < %s | FileCheck %s
-
-; CHECK-LABEL: "$cppxdata$?crash@@YAXH@Z":
-; CHECK:	.long	("$stateUnwindMap$?crash@@YAXH@Z")
-; CHECK:	.long	("$ip2state$?crash@@YAXH@Z")
-
-; CHECK-LABEL: "$stateUnwindMap$?crash@@YAXH@Z":
-; CHECK:	.long	-1 
-; CHECK:	.long	"?dtor$
-; CHECK:	.long	0 
-; CHECK:	.long	"?dtor$
-; CHECK:	.long	1
-; CHECK:	.long	"?dtor$
-
-; CHECK-LABEL: "$ip2state$?crash@@YAXH@Z":
-; CHECK-NEXT:	.long	.Lfunc_begin0@IMGREL
-; CHECK-NEXT:	.long	-1                  
-; CHECK-NEXT:	.long	.Ltmp     
-; CHECK-NEXT:	.long	0                   
-; CHECK-NEXT:	.long	.Ltmp     
-; CHECK-NEXT:	.long	1                   
-; CHECK-NEXT:	.long	.Ltmp
-; CHECK-NEXT:	.long	2                   
-; CHECK-NEXT:	.long	.Ltmp
-; CHECK-NEXT:	.long	1                   
-; CHECK-NEXT:	.long	.Ltmp
-; CHECK-NEXT:	.long	0                   
-; CHECK-NEXT:	.long	.Ltmp
-; CHECK-NEXT:	.long	-1                  
-
-; ModuleID = 'windows-seh-EHa-CppDtors01.cpp'
-source_filename = "windows-seh-EHa-CppDtors01.cpp"
-target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-windows-msvc"
-
-%struct.A = type { i8 }
-%struct.B = type { i8 }
-%struct.C = type { i8 }
-
-$"??_C@_0CM@KAOHJHDK@?5Test?5CPP?5unwind?3?5in?5catch?5handl@" = comdat any
-
-$"??_C@_0N@FCCEEGKL@?5in?5C?5dtor?5?6?$AA@" = comdat any
-
-$"??_C@_0N@EFFPFCOI@?5in?5B?5dtor?5?6?$AA@" = comdat any
-
-$"??_C@_0N@HMNCGOCN@?5in?5A?5dtor?5?6?$AA@" = comdat any
-
-@"?g@@3HA" = dso_local global i32 0, align 4
-@"??_C@_0CM@KAOHJHDK@?5Test?5CPP?5unwind?3?5in?5catch?5handl@" = linkonce_odr dso_local unnamed_addr constant [44 x i8] c" Test CPP unwind: in catch handler i = %d \0A\00", comdat, align 1
-@"??_C@_0N@FCCEEGKL@?5in?5C?5dtor?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [13 x i8] c" in C dtor \0A\00", comdat, align 1
-@"??_C@_0N@EFFPFCOI@?5in?5B?5dtor?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [13 x i8] c" in B dtor \0A\00", comdat, align 1
-@"??_C@_0N@HMNCGOCN@?5in?5A?5dtor?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [13 x i8] c" in A dtor \0A\00", comdat, align 1
-
-; Function Attrs: noinline optnone
-define dso_local void @"?crash@@YAXH@Z"(i32 %i) #0 personality ptr @__CxxFrameHandler3 {
-entry:
-  %i.addr = alloca i32, align 4
-  %ObjA = alloca %struct.A, align 1
-  %ObjB = alloca %struct.B, align 1
-  %ObjC = alloca %struct.C, align 1
-  store i32 %i, ptr %i.addr, align 4
-  invoke void @llvm.seh.scope.begin()
-          to label %invoke.cont unwind label %ehcleanup13
-
-invoke.cont:                                      ; preds = %entry
-  %0 = load i32, ptr %i.addr, align 4
-  %cmp = icmp eq i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %invoke.cont
-  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %invoke.cont
-  invoke void @llvm.seh.scope.begin()
-          to label %invoke.cont1 unwind label %ehcleanup11
-
-invoke.cont1:                                     ; preds = %if.end
-  %1 = load i32, ptr %i.addr, align 4
-  %cmp2 = icmp eq i32 %1, 1
-  br i1 %cmp2, label %if.then3, label %if.end4
-
-if.then3:                                         ; preds = %invoke.cont1
-  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
-  br label %if.end4
-
-if.end4:                                          ; preds = %if.then3, %invoke.cont1
-  invoke void @llvm.seh.scope.begin()
-          to label %invoke.cont5 unwind label %ehcleanup
-
-invoke.cont5:                                     ; preds = %if.end4
-  %2 = load i32, ptr %i.addr, align 4
-  %cmp6 = icmp eq i32 %2, 2
-  br i1 %cmp6, label %if.then7, label %if.end8
-
-if.then7:                                         ; preds = %invoke.cont5
-  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
-  br label %if.end8
-
-if.end8:                                          ; preds = %if.then7, %invoke.cont5
-  invoke void @llvm.seh.scope.end()
-          to label %invoke.cont9 unwind label %ehcleanup
-
-invoke.cont9:                                     ; preds = %if.end8
-  call void @"??1C@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjC) #6
-  invoke void @llvm.seh.scope.end()
-          to label %invoke.cont10 unwind label %ehcleanup11
-
-invoke.cont10:                                    ; preds = %invoke.cont9
-  call void @"??1B@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjB) #6
-  invoke void @llvm.seh.scope.end()
-          to label %invoke.cont12 unwind label %ehcleanup13
-
-invoke.cont12:                                    ; preds = %invoke.cont10
-  call void @"??1A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjA) #6
-  ret void
-
-ehcleanup:                                        ; preds = %if.end8, %if.end4
-  %3 = cleanuppad within none []
-  call void @"??1C@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjC) #6 [ "funclet"(token %3) ]
-  cleanupret from %3 unwind label %ehcleanup11
-
-ehcleanup11:                                      ; preds = %invoke.cont9, %ehcleanup, %if.end
-  %4 = cleanuppad within none []
-  call void @"??1B@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjB) #6 [ "funclet"(token %4) ]
-  cleanupret from %4 unwind label %ehcleanup13
-
-ehcleanup13:                                      ; preds = %invoke.cont10, %ehcleanup11, %entry
-  %5 = cleanuppad within none []
-  call void @"??1A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjA) #6 [ "funclet"(token %5) ]
-  cleanupret from %5 unwind to caller
-}
-
-; Function Attrs: nounwind readnone
-declare dso_local void @llvm.seh.scope.begin() #1
-
-declare dso_local i32 @__CxxFrameHandler3(...)
-
-; Function Attrs: nounwind readnone
-declare dso_local void @llvm.seh.scope.end() #1
-
-; Function Attrs: noinline nounwind optnone
-define internal void @"??1C@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %this) unnamed_addr #2 align 2 {
-entry:
-  %this.addr = alloca ptr, align 8
-  store ptr %this, ptr %this.addr, align 8
-  %this1 = load ptr, ptr %this.addr, align 8
-  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0N@FCCEEGKL@?5in?5C?5dtor?5?6?$AA@")
-  ret void
-}
-
-; Function Attrs: noinline nounwind optnone
-define internal void @"??1B@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %this) unnamed_addr #2 align 2 {
-entry:
-  %this.addr = alloca ptr, align 8
-  store ptr %this, ptr %this.addr, align 8
-  %this1 = load ptr, ptr %this.addr, align 8
-  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0N@EFFPFCOI@?5in?5B?5dtor?5?6?$AA@")
-  ret void
-}
-
-; Function Attrs: noinline nounwind optnone
-define internal void @"??1A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %this) unnamed_addr #2 align 2 {
-entry:
-  %this.addr = alloca ptr, align 8
-  store ptr %this, ptr %this.addr, align 8
-  %this1 = load ptr, ptr %this.addr, align 8
-  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0N@HMNCGOCN@?5in?5A?5dtor?5?6?$AA@")
-  ret void
-}
-
-; Function Attrs: noinline norecurse optnone
-define dso_local i32 @main() #3 personality ptr @__C_specific_handler {
-entry:
-  %retval = alloca i32, align 4
-  %i = alloca i32, align 4
-  %__exception_code = alloca i32, align 4
-  store i32 0, ptr %retval, align 4
-  store i32 0, ptr %i, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32, ptr %i, align 4
-  %cmp = icmp slt i32 %0, 3
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  invoke void @llvm.seh.try.begin()
-          to label %invoke.cont unwind label %catch.dispatch
-
-invoke.cont:                                      ; preds = %for.body
-  %1 = load volatile i32, ptr %i, align 4
-  invoke void @"?crash@@YAXH@Z"(i32 %1) #7
-          to label %invoke.cont1 unwind label %catch.dispatch
-
-invoke.cont1:                                     ; preds = %invoke.cont
-  invoke void @llvm.seh.try.end()
-          to label %invoke.cont2 unwind label %catch.dispatch
-
-catch.dispatch:                                   ; preds = %invoke.cont1, %invoke.cont, %for.body
-  %2 = catchswitch within none [label %__except] unwind to caller
-
-__except:                                         ; preds = %catch.dispatch
-  %3 = catchpad within %2 [ptr null]
-  catchret from %3 to label %__except3
-
-__except3:                                        ; preds = %__except
-  %4 = call i32 @llvm.eh.exceptioncode(token %3)
-  store i32 %4, ptr %__exception_code, align 4
-  %5 = load i32, ptr %i, align 4
-  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0CM@KAOHJHDK@?5Test?5CPP?5unwind?3?5in?5catch?5handl@", i32 %5)
-  br label %__try.cont
-
-__try.cont:                                       ; preds = %__except3, %invoke.cont2
-  br label %for.inc
-
-for.inc:                                          ; preds = %__try.cont
-  %6 = load i32, ptr %i, align 4
-  %inc = add nsw i32 %6, 1
-  store i32 %inc, ptr %i, align 4
-  br label %for.cond
-
-invoke.cont2:                                     ; preds = %invoke.cont1
-  br label %__try.cont
-
-for.end:                                          ; preds = %for.cond
-  ret i32 0
-}
-
-; Function Attrs: nounwind willreturn
-declare dso_local void @llvm.seh.try.begin() #4
-
-declare dso_local i32 @__C_specific_handler(...)
-
-; Function Attrs: nounwind willreturn
-declare dso_local void @llvm.seh.try.end() #4
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.eh.exceptioncode(token) #1
-
-declare dso_local void @"?printf@@YAXZZ"(...) #5
-
-attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind willreturn }
-attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #6 = { nounwind }
-attributes #7 = { noinline }
-
-!llvm.module.flags = !{!0, !1}
-
-!0 = !{i32 1, !"wchar_size", i32 2}
-!1 = !{i32 2, !"eh-asynch", i32 1}
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: "$cppxdata$?crash@@YAXH@Z":
+; CHECK:	.long	("$stateUnwindMap$?crash@@YAXH@Z")
+; CHECK:	.long	("$ip2state$?crash@@YAXH@Z")
+
+; CHECK-LABEL: "$stateUnwindMap$?crash@@YAXH@Z":
+; CHECK:	.long	-1
+; CHECK:	.long	"?dtor$
+; CHECK:	.long	0
+; CHECK:	.long	"?dtor$
+; CHECK:	.long	1
+; CHECK:	.long	"?dtor$
+
+; CHECK-LABEL: "$ip2state$?crash@@YAXH@Z":
+; CHECK-NEXT:	.long	.Lfunc_begin0@IMGREL
+; CHECK-NEXT:	.long	-1
+; CHECK-NEXT:	.long	.Ltmp
+; CHECK-NEXT:	.long	0
+; CHECK-NEXT:	.long	.Ltmp
+; CHECK-NEXT:	.long	1
+; CHECK-NEXT:	.long	.Ltmp
+; CHECK-NEXT:	.long	2
+; CHECK-NEXT:	.long	.Ltmp
+; CHECK-NEXT:	.long	1
+; CHECK-NEXT:	.long	.Ltmp
+; CHECK-NEXT:	.long	0
+; CHECK-NEXT:	.long	.Ltmp
+; CHECK-NEXT:	.long	-1
+
+; ModuleID = 'windows-seh-EHa-CppDtors01.cpp'
+source_filename = "windows-seh-EHa-CppDtors01.cpp"
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-windows-msvc"
+
+%struct.A = type { i8 }
+%struct.B = type { i8 }
+%struct.C = type { i8 }
+
+$"??_C@_0CM@KAOHJHDK@?5Test?5CPP?5unwind?3?5in?5catch?5handl@" = comdat any
+
+$"??_C@_0N@FCCEEGKL@?5in?5C?5dtor?5?6?$AA@" = comdat any
+
+$"??_C@_0N@EFFPFCOI@?5in?5B?5dtor?5?6?$AA@" = comdat any
+
+$"??_C@_0N@HMNCGOCN@?5in?5A?5dtor?5?6?$AA@" = comdat any
+
+@"?g@@3HA" = dso_local global i32 0, align 4
+@"??_C@_0CM@KAOHJHDK@?5Test?5CPP?5unwind?3?5in?5catch?5handl@" = linkonce_odr dso_local unnamed_addr constant [44 x i8] c" Test CPP unwind: in catch handler i = %d \0A\00", comdat, align 1
+@"??_C@_0N@FCCEEGKL@?5in?5C?5dtor?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [13 x i8] c" in C dtor \0A\00", comdat, align 1
+@"??_C@_0N@EFFPFCOI@?5in?5B?5dtor?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [13 x i8] c" in B dtor \0A\00", comdat, align 1
+@"??_C@_0N@HMNCGOCN@?5in?5A?5dtor?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [13 x i8] c" in A dtor \0A\00", comdat, align 1
+
+; Function Attrs: noinline optnone
+define dso_local void @"?crash@@YAXH@Z"(i32 %i) #0 personality ptr @__CxxFrameHandler3 {
+entry:
+  %i.addr = alloca i32, align 4
+  %ObjA = alloca %struct.A, align 1
+  %ObjB = alloca %struct.B, align 1
+  %ObjC = alloca %struct.C, align 1
+  store i32 %i, ptr %i.addr, align 4
+  invoke void @llvm.seh.scope.begin()
+          to label %invoke.cont unwind label %ehcleanup13
+
+invoke.cont:                                      ; preds = %entry
+  %0 = load i32, ptr %i.addr, align 4
+  %cmp = icmp eq i32 %0, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %invoke.cont
+  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %invoke.cont
+  invoke void @llvm.seh.scope.begin()
+          to label %invoke.cont1 unwind label %ehcleanup11
+
+invoke.cont1:                                     ; preds = %if.end
+  %1 = load i32, ptr %i.addr, align 4
+  %cmp2 = icmp eq i32 %1, 1
+  br i1 %cmp2, label %if.then3, label %if.end4
+
+if.then3:                                         ; preds = %invoke.cont1
+  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
+  br label %if.end4
+
+if.end4:                                          ; preds = %if.then3, %invoke.cont1
+  invoke void @llvm.seh.scope.begin()
+          to label %invoke.cont5 unwind label %ehcleanup
+
+invoke.cont5:                                     ; preds = %if.end4
+  %2 = load i32, ptr %i.addr, align 4
+  %cmp6 = icmp eq i32 %2, 2
+  br i1 %cmp6, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %invoke.cont5
+  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.then7, %invoke.cont5
+  invoke void @llvm.seh.scope.end()
+          to label %invoke.cont9 unwind label %ehcleanup
+
+invoke.cont9:                                     ; preds = %if.end8
+  call void @"??1C@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjC) #6
+  invoke void @llvm.seh.scope.end()
+          to label %invoke.cont10 unwind label %ehcleanup11
+
+invoke.cont10:                                    ; preds = %invoke.cont9
+  call void @"??1B@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjB) #6
+  invoke void @llvm.seh.scope.end()
+          to label %invoke.cont12 unwind label %ehcleanup13
+
+invoke.cont12:                                    ; preds = %invoke.cont10
+  call void @"??1A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjA) #6
+  ret void
+
+ehcleanup:                                        ; preds = %if.end8, %if.end4
+  %3 = cleanuppad within none []
+  call void @"??1C@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjC) #6 [ "funclet"(token %3) ]
+  cleanupret from %3 unwind label %ehcleanup11
+
+ehcleanup11:                                      ; preds = %invoke.cont9, %ehcleanup, %if.end
+  %4 = cleanuppad within none []
+  call void @"??1B@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjB) #6 [ "funclet"(token %4) ]
+  cleanupret from %4 unwind label %ehcleanup13
+
+ehcleanup13:                                      ; preds = %invoke.cont10, %ehcleanup11, %entry
+  %5 = cleanuppad within none []
+  call void @"??1A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %ObjA) #6 [ "funclet"(token %5) ]
+  cleanupret from %5 unwind to caller
+}
+
+; Function Attrs: nounwind readnone
+declare dso_local void @llvm.seh.scope.begin() #1
+
+declare dso_local i32 @__CxxFrameHandler3(...)
+
+; Function Attrs: nounwind readnone
+declare dso_local void @llvm.seh.scope.end() #1
+
+; Function Attrs: noinline nounwind optnone
+define internal void @"??1C@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %this) unnamed_addr #2 align 2 {
+entry:
+  %this.addr = alloca ptr, align 8
+  store ptr %this, ptr %this.addr, align 8
+  %this1 = load ptr, ptr %this.addr, align 8
+  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0N@FCCEEGKL@?5in?5C?5dtor?5?6?$AA@")
+  ret void
+}
+
+; Function Attrs: noinline nounwind optnone
+define internal void @"??1B@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %this) unnamed_addr #2 align 2 {
+entry:
+  %this.addr = alloca ptr, align 8
+  store ptr %this, ptr %this.addr, align 8
+  %this1 = load ptr, ptr %this.addr, align 8
+  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0N@EFFPFCOI@?5in?5B?5dtor?5?6?$AA@")
+  ret void
+}
+
+; Function Attrs: noinline nounwind optnone
+define internal void @"??1A@?1??crash@@YAXH@Z@QEAA@XZ"(ptr %this) unnamed_addr #2 align 2 {
+entry:
+  %this.addr = alloca ptr, align 8
+  store ptr %this, ptr %this.addr, align 8
+  %this1 = load ptr, ptr %this.addr, align 8
+  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0N@HMNCGOCN@?5in?5A?5dtor?5?6?$AA@")
+  ret void
+}
+
+; Function Attrs: noinline norecurse optnone
+define dso_local i32 @main() #3 personality ptr @__C_specific_handler {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  %__exception_code = alloca i32, align 4
+  store i32 0, ptr %retval, align 4
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4
+  %cmp = icmp slt i32 %0, 3
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  invoke void @llvm.seh.try.begin()
+          to label %invoke.cont unwind label %catch.dispatch
+
+invoke.cont:                                      ; preds = %for.body
+  %1 = load volatile i32, ptr %i, align 4
+  invoke void @"?crash@@YAXH@Z"(i32 %1) #7
+          to label %invoke.cont1 unwind label %catch.dispatch
+
+invoke.cont1:                                     ; preds = %invoke.cont
+  invoke void @llvm.seh.try.end()
+          to label %invoke.cont2 unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %invoke.cont1, %invoke.cont, %for.body
+  %2 = catchswitch within none [label %__except] unwind to caller
+
+__except:                                         ; preds = %catch.dispatch
+  %3 = catchpad within %2 [ptr null]
+  catchret from %3 to label %__except3
+
+__except3:                                        ; preds = %__except
+  %4 = call i32 @llvm.eh.exceptioncode(token %3)
+  store i32 %4, ptr %__exception_code, align 4
+  %5 = load i32, ptr %i, align 4
+  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0CM@KAOHJHDK@?5Test?5CPP?5unwind?3?5in?5catch?5handl@", i32 %5)
+  br label %__try.cont
+
+__try.cont:                                       ; preds = %__except3, %invoke.cont2
+  br label %for.inc
+
+for.inc:                                          ; preds = %__try.cont
+  %6 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %6, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+
+invoke.cont2:                                     ; preds = %invoke.cont1
+  br label %__try.cont
+
+for.end:                                          ; preds = %for.cond
+  ret i32 0
+}
+
+; Function Attrs: nounwind willreturn
+declare dso_local void @llvm.seh.try.begin() #4
+
+declare dso_local i32 @__C_specific_handler(...)
+
+; Function Attrs: nounwind willreturn
+declare dso_local void @llvm.seh.try.end() #4
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.exceptioncode(token) #1
+
+declare dso_local void @"?printf@@YAXZZ"(...) #5
+
+attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind willreturn }
+attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+attributes #7 = { noinline }
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 1, !"wchar_size", i32 2}
+!1 = !{i32 2, !"eh-asynch", i32 1}
diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
index 340a9afe4a3d..16322cbe9980 100644
--- a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
+++ b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
@@ -1,224 +1,224 @@
-; RUN: llc -verify-machineinstrs < %s | FileCheck %s
-
-; CHECK-LABEL: "?fin$0@0@main@@"
-; CHECK:      .seh_handlerdata
-; CHECK:      .set ".L?fin$0@0@main@@$parent_frame_offset", 48
-; CHECK-NEXT:        .long   (.Llsda_end1-.Llsda_begin1)/16 
-; CHECK-NEXT: .Llsda_begin1:
-; CHECK-NEXT:        .long   .Ltmp
-; CHECK-NEXT:        .long   .Ltmp
-; CHECK-NEXT:        .long   "?dtor$
-; CHECK-NEXT:        .long   0
-; CHECK-NEXT: .Llsda_end1:
-
-; ModuleID = 'windows-seh-EHa-TryInFinally.cpp'
-source_filename = "windows-seh-EHa-TryInFinally.cpp"
-target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-windows-msvc"
-
-$"??_C@_0CI@MDFPIOJJ@?5?9?9?9?5Test?5_Try?5in?5_finally?5?9?9?9?5i@" = comdat any
-
-$"??_C@_0BN@HHKJHLBE@?5?5In?5Inner?5_finally?5i?5?$DN?5?$CFd?5?6?$AA@" = comdat any
-
-$"??_C@_0BN@HAIIIOKI@?5?5In?5outer?5_finally?5i?5?$DN?5?$CFd?5?6?$AA@" = comdat any
-
-$"??_C@_0BJ@OJMMAGCD@?5?5In?5outer?5_try?5i?5?$DN?5?$CFd?5?6?$AA@" = comdat any
-
-$"??_C@_0CG@ENDJHCGA@?5?9?9?9?5In?5outer?5except?5handler?5i?5?$DN@" = comdat any
-
-@"??_C@_0CI@MDFPIOJJ@?5?9?9?9?5Test?5_Try?5in?5_finally?5?9?9?9?5i@" = linkonce_odr dso_local unnamed_addr constant [40 x i8] c" --- Test _Try in _finally --- i = %d \0A\00", comdat, align 1
-@"??_C@_0BN@HHKJHLBE@?5?5In?5Inner?5_finally?5i?5?$DN?5?$CFd?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [29 x i8] c"  In Inner _finally i = %d \0A\00", comdat, align 1
-@"??_C@_0BN@HAIIIOKI@?5?5In?5outer?5_finally?5i?5?$DN?5?$CFd?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [29 x i8] c"  In outer _finally i = %d \0A\00", comdat, align 1
-@"??_C@_0BJ@OJMMAGCD@?5?5In?5outer?5_try?5i?5?$DN?5?$CFd?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [25 x i8] c"  In outer _try i = %d \0A\00", comdat, align 1
-@"??_C@_0CG@ENDJHCGA@?5?9?9?9?5In?5outer?5except?5handler?5i?5?$DN@" = linkonce_odr dso_local unnamed_addr constant [38 x i8] c" --- In outer except handler i = %d \0A\00", comdat, align 1
-
-; Function Attrs: noinline norecurse optnone
-define dso_local i32 @main() #0 personality ptr @__C_specific_handler {
-entry:
-  %retval = alloca i32, align 4
-  %i = alloca i32, align 4
-  %__exception_code = alloca i32, align 4
-  call void (...) @llvm.localescape(ptr %i)
-  store i32 0, ptr %retval, align 4
-  store i32 0, ptr %i, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32, ptr %i, align 4
-  %cmp = icmp slt i32 %0, 3
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %1 = load i32, ptr %i, align 4
-  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0CI@MDFPIOJJ@?5?9?9?9?5Test?5_Try?5in?5_finally?5?9?9?9?5i@", i32 %1)
-  invoke void @llvm.seh.try.begin()
-          to label %invoke.cont unwind label %catch.dispatch
-
-invoke.cont:                                      ; preds = %for.body
-  invoke void @llvm.seh.try.begin()
-          to label %invoke.cont1 unwind label %ehcleanup
-
-invoke.cont1:                                     ; preds = %invoke.cont
-  %2 = load volatile i32, ptr %i, align 4
-  invoke void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0BJ@OJMMAGCD@?5?5In?5outer?5_try?5i?5?$DN?5?$CFd?5?6?$AA@", i32 %2) #6
-          to label %invoke.cont2 unwind label %ehcleanup
-
-invoke.cont2:                                     ; preds = %invoke.cont1
-  %3 = load volatile i32, ptr %i, align 4
-  %cmp3 = icmp eq i32 %3, 0
-  br i1 %cmp3, label %if.then, label %if.end
-
-if.then:                                          ; preds = %invoke.cont2
-  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %invoke.cont2
-  invoke void @llvm.seh.try.end()
-          to label %invoke.cont4 unwind label %ehcleanup
-
-invoke.cont4:                                     ; preds = %if.end
-  %4 = call ptr @llvm.localaddress()
-  invoke void @"?fin$0@0@main@@"(i8 0, ptr %4) #6
-          to label %invoke.cont5 unwind label %catch.dispatch
-
-invoke.cont5:                                     ; preds = %invoke.cont4
-  invoke void @llvm.seh.try.end()
-          to label %invoke.cont7 unwind label %catch.dispatch
-
-catch.dispatch:                                   ; preds = %invoke.cont5, %invoke.cont6, %ehcleanup, %invoke.cont4, %for.body
-  %5 = catchswitch within none [label %__except] unwind to caller
-
-__except:                                         ; preds = %catch.dispatch
-  %6 = catchpad within %5 [ptr null]
-  catchret from %6 to label %__except8
-
-__except8:                                        ; preds = %__except
-  %7 = call i32 @llvm.eh.exceptioncode(token %6)
-  store i32 %7, ptr %__exception_code, align 4
-  %8 = load i32, ptr %i, align 4
-  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0CG@ENDJHCGA@?5?9?9?9?5In?5outer?5except?5handler?5i?5?$DN@", i32 %8)
-  br label %__try.cont
-
-__try.cont:                                       ; preds = %__except8, %invoke.cont7
-  br label %for.inc
-
-for.inc:                                          ; preds = %__try.cont
-  %9 = load i32, ptr %i, align 4
-  %inc = add nsw i32 %9, 1
-  store i32 %inc, ptr %i, align 4
-  br label %for.cond
-
-invoke.cont7:                                     ; preds = %invoke.cont5
-  br label %__try.cont
-
-ehcleanup:                                        ; preds = %if.end, %invoke.cont1, %invoke.cont
-  %10 = cleanuppad within none []
-  %11 = call ptr @llvm.localaddress()
-  invoke void @"?fin$0@0@main@@"(i8 1, ptr %11) #6 [ "funclet"(token %10) ]
-          to label %invoke.cont6 unwind label %catch.dispatch
-
-invoke.cont6:                                     ; preds = %ehcleanup
-  cleanupret from %10 unwind label %catch.dispatch
-
-for.end:                                          ; preds = %for.cond
-  ret i32 0
-}
-
-declare dso_local void @"?printf@@YAXZZ"(...) #1
-
-; Function Attrs: nounwind willreturn
-declare dso_local void @llvm.seh.try.begin() #2
-
-declare dso_local i32 @__C_specific_handler(...)
-
-; Function Attrs: noinline
-define internal void @"?fin$0@0@main@@"(i8 %abnormal_termination, ptr %frame_pointer) #3 personality ptr @__C_specific_handler {
-entry:
-  %frame_pointer.addr = alloca ptr, align 8
-  %abnormal_termination.addr = alloca i8, align 1
-  %0 = call ptr @llvm.localrecover(ptr @main, ptr %frame_pointer, i32 0)
-  %i = bitcast ptr %0 to ptr
-  store ptr %frame_pointer, ptr %frame_pointer.addr, align 8
-  store i8 %abnormal_termination, ptr %abnormal_termination.addr, align 1
-  invoke void @llvm.seh.try.begin()
-          to label %invoke.cont unwind label %ehcleanup
-
-invoke.cont:                                      ; preds = %entry
-  %1 = load volatile i32, ptr %i, align 4
-  invoke void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0BN@HAIIIOKI@?5?5In?5outer?5_finally?5i?5?$DN?5?$CFd?5?6?$AA@", i32 %1) #6
-          to label %invoke.cont1 unwind label %ehcleanup
-
-invoke.cont1:                                     ; preds = %invoke.cont
-  %2 = load volatile i32, ptr %i, align 4
-  %cmp = icmp eq i32 %2, 1
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %invoke.cont1
-  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %invoke.cont1
-  invoke void @llvm.seh.try.end()
-          to label %invoke.cont2 unwind label %ehcleanup
-
-invoke.cont2:                                     ; preds = %if.end
-  call void @"?fin$1@0@main@@"(i8 0, ptr %frame_pointer)
-  ret void
-
-ehcleanup:                                        ; preds = %if.end, %invoke.cont, %entry
-  %3 = cleanuppad within none []
-  call void @"?fin$1@0@main@@"(i8 1, ptr %frame_pointer) [ "funclet"(token %3) ]
-  cleanupret from %3 unwind to caller
-}
-
-; Function Attrs: nounwind readnone
-declare ptr @llvm.localrecover(ptr, ptr, i32 immarg) #4
-
-; Function Attrs: noinline
-define internal void @"?fin$1@0@main@@"(i8 %abnormal_termination, ptr %frame_pointer) #3 {
-entry:
-  %frame_pointer.addr = alloca ptr, align 8
-  %abnormal_termination.addr = alloca i8, align 1
-  %0 = call ptr @llvm.localrecover(ptr @main, ptr %frame_pointer, i32 0)
-  %i = bitcast ptr %0 to ptr
-  store ptr %frame_pointer, ptr %frame_pointer.addr, align 8
-  store i8 %abnormal_termination, ptr %abnormal_termination.addr, align 1
-  %1 = load i32, ptr %i, align 4
-  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0BN@HHKJHLBE@?5?5In?5Inner?5_finally?5i?5?$DN?5?$CFd?5?6?$AA@", i32 %1)
-  %2 = load i32, ptr %i, align 4
-  %cmp = icmp eq i32 %2, 2
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
-  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %entry
-  ret void
-}
-
-; Function Attrs: nounwind willreturn
-declare dso_local void @llvm.seh.try.end() #2
-
-; Function Attrs: nounwind readnone
-declare ptr @llvm.localaddress() #4
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.eh.exceptioncode(token) #4
-
-; Function Attrs: nounwind
-declare void @llvm.localescape(...) #5
-
-attributes #0 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind willreturn }
-attributes #3 = { noinline "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind readnone }
-attributes #5 = { nounwind }
-attributes #6 = { noinline }
-
-!llvm.module.flags = !{!0, !1}
-
-!0 = !{i32 1, !"wchar_size", i32 2}
-!1 = !{i32 2, !"eh-asynch", i32 1}
-
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: "?fin$0@0@main@@"
+; CHECK:      .seh_handlerdata
+; CHECK:      .set ".L?fin$0@0@main@@$parent_frame_offset", 48
+; CHECK-NEXT:        .long   (.Llsda_end1-.Llsda_begin1)/16
+; CHECK-NEXT: .Llsda_begin1:
+; CHECK-NEXT:        .long   .Ltmp
+; CHECK-NEXT:        .long   .Ltmp
+; CHECK-NEXT:        .long   "?dtor$
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT: .Llsda_end1:
+
+; ModuleID = 'windows-seh-EHa-TryInFinally.cpp'
+source_filename = "windows-seh-EHa-TryInFinally.cpp"
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-windows-msvc"
+
+$"??_C@_0CI@MDFPIOJJ@?5?9?9?9?5Test?5_Try?5in?5_finally?5?9?9?9?5i@" = comdat any
+
+$"??_C@_0BN@HHKJHLBE@?5?5In?5Inner?5_finally?5i?5?$DN?5?$CFd?5?6?$AA@" = comdat any
+
+$"??_C@_0BN@HAIIIOKI@?5?5In?5outer?5_finally?5i?5?$DN?5?$CFd?5?6?$AA@" = comdat any
+
+$"??_C@_0BJ@OJMMAGCD@?5?5In?5outer?5_try?5i?5?$DN?5?$CFd?5?6?$AA@" = comdat any
+
+$"??_C@_0CG@ENDJHCGA@?5?9?9?9?5In?5outer?5except?5handler?5i?5?$DN@" = comdat any
+
+@"??_C@_0CI@MDFPIOJJ@?5?9?9?9?5Test?5_Try?5in?5_finally?5?9?9?9?5i@" = linkonce_odr dso_local unnamed_addr constant [40 x i8] c" --- Test _Try in _finally --- i = %d \0A\00", comdat, align 1
+@"??_C@_0BN@HHKJHLBE@?5?5In?5Inner?5_finally?5i?5?$DN?5?$CFd?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [29 x i8] c"  In Inner _finally i = %d \0A\00", comdat, align 1
+@"??_C@_0BN@HAIIIOKI@?5?5In?5outer?5_finally?5i?5?$DN?5?$CFd?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [29 x i8] c"  In outer _finally i = %d \0A\00", comdat, align 1
+@"??_C@_0BJ@OJMMAGCD@?5?5In?5outer?5_try?5i?5?$DN?5?$CFd?5?6?$AA@" = linkonce_odr dso_local unnamed_addr constant [25 x i8] c"  In outer _try i = %d \0A\00", comdat, align 1
+@"??_C@_0CG@ENDJHCGA@?5?9?9?9?5In?5outer?5except?5handler?5i?5?$DN@" = linkonce_odr dso_local unnamed_addr constant [38 x i8] c" --- In outer except handler i = %d \0A\00", comdat, align 1
+
+; Function Attrs: noinline norecurse optnone
+define dso_local i32 @main() #0 personality ptr @__C_specific_handler {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  %__exception_code = alloca i32, align 4
+  call void (...) @llvm.localescape(ptr %i)
+  store i32 0, ptr %retval, align 4
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4
+  %cmp = icmp slt i32 %0, 3
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, ptr %i, align 4
+  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0CI@MDFPIOJJ@?5?9?9?9?5Test?5_Try?5in?5_finally?5?9?9?9?5i@", i32 %1)
+  invoke void @llvm.seh.try.begin()
+          to label %invoke.cont unwind label %catch.dispatch
+
+invoke.cont:                                      ; preds = %for.body
+  invoke void @llvm.seh.try.begin()
+          to label %invoke.cont1 unwind label %ehcleanup
+
+invoke.cont1:                                     ; preds = %invoke.cont
+  %2 = load volatile i32, ptr %i, align 4
+  invoke void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0BJ@OJMMAGCD@?5?5In?5outer?5_try?5i?5?$DN?5?$CFd?5?6?$AA@", i32 %2) #6
+          to label %invoke.cont2 unwind label %ehcleanup
+
+invoke.cont2:                                     ; preds = %invoke.cont1
+  %3 = load volatile i32, ptr %i, align 4
+  %cmp3 = icmp eq i32 %3, 0
+  br i1 %cmp3, label %if.then, label %if.end
+
+if.then:                                          ; preds = %invoke.cont2
+  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %invoke.cont2
+  invoke void @llvm.seh.try.end()
+          to label %invoke.cont4 unwind label %ehcleanup
+
+invoke.cont4:                                     ; preds = %if.end
+  %4 = call ptr @llvm.localaddress()
+  invoke void @"?fin$0@0@main@@"(i8 0, ptr %4) #6
+          to label %invoke.cont5 unwind label %catch.dispatch
+
+invoke.cont5:                                     ; preds = %invoke.cont4
+  invoke void @llvm.seh.try.end()
+          to label %invoke.cont7 unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %invoke.cont5, %invoke.cont6, %ehcleanup, %invoke.cont4, %for.body
+  %5 = catchswitch within none [label %__except] unwind to caller
+
+__except:                                         ; preds = %catch.dispatch
+  %6 = catchpad within %5 [ptr null]
+  catchret from %6 to label %__except8
+
+__except8:                                        ; preds = %__except
+  %7 = call i32 @llvm.eh.exceptioncode(token %6)
+  store i32 %7, ptr %__exception_code, align 4
+  %8 = load i32, ptr %i, align 4
+  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0CG@ENDJHCGA@?5?9?9?9?5In?5outer?5except?5handler?5i?5?$DN@", i32 %8)
+  br label %__try.cont
+
+__try.cont:                                       ; preds = %__except8, %invoke.cont7
+  br label %for.inc
+
+for.inc:                                          ; preds = %__try.cont
+  %9 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %9, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+
+invoke.cont7:                                     ; preds = %invoke.cont5
+  br label %__try.cont
+
+ehcleanup:                                        ; preds = %if.end, %invoke.cont1, %invoke.cont
+  %10 = cleanuppad within none []
+  %11 = call ptr @llvm.localaddress()
+  invoke void @"?fin$0@0@main@@"(i8 1, ptr %11) #6 [ "funclet"(token %10) ]
+          to label %invoke.cont6 unwind label %catch.dispatch
+
+invoke.cont6:                                     ; preds = %ehcleanup
+  cleanupret from %10 unwind label %catch.dispatch
+
+for.end:                                          ; preds = %for.cond
+  ret i32 0
+}
+
+declare dso_local void @"?printf@@YAXZZ"(...) #1
+
+; Function Attrs: nounwind willreturn
+declare dso_local void @llvm.seh.try.begin() #2
+
+declare dso_local i32 @__C_specific_handler(...)
+
+; Function Attrs: noinline
+define internal void @"?fin$0@0@main@@"(i8 %abnormal_termination, ptr %frame_pointer) #3 personality ptr @__C_specific_handler {
+entry:
+  %frame_pointer.addr = alloca ptr, align 8
+  %abnormal_termination.addr = alloca i8, align 1
+  %0 = call ptr @llvm.localrecover(ptr @main, ptr %frame_pointer, i32 0)
+  %i = bitcast ptr %0 to ptr
+  store ptr %frame_pointer, ptr %frame_pointer.addr, align 8
+  store i8 %abnormal_termination, ptr %abnormal_termination.addr, align 1
+  invoke void @llvm.seh.try.begin()
+          to label %invoke.cont unwind label %ehcleanup
+
+invoke.cont:                                      ; preds = %entry
+  %1 = load volatile i32, ptr %i, align 4
+  invoke void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0BN@HAIIIOKI@?5?5In?5outer?5_finally?5i?5?$DN?5?$CFd?5?6?$AA@", i32 %1) #6
+          to label %invoke.cont1 unwind label %ehcleanup
+
+invoke.cont1:                                     ; preds = %invoke.cont
+  %2 = load volatile i32, ptr %i, align 4
+  %cmp = icmp eq i32 %2, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %invoke.cont1
+  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %invoke.cont1
+  invoke void @llvm.seh.try.end()
+          to label %invoke.cont2 unwind label %ehcleanup
+
+invoke.cont2:                                     ; preds = %if.end
+  call void @"?fin$1@0@main@@"(i8 0, ptr %frame_pointer)
+  ret void
+
+ehcleanup:                                        ; preds = %if.end, %invoke.cont, %entry
+  %3 = cleanuppad within none []
+  call void @"?fin$1@0@main@@"(i8 1, ptr %frame_pointer) [ "funclet"(token %3) ]
+  cleanupret from %3 unwind to caller
+}
+
+; Function Attrs: nounwind readnone
+declare ptr @llvm.localrecover(ptr, ptr, i32 immarg) #4
+
+; Function Attrs: noinline
+define internal void @"?fin$1@0@main@@"(i8 %abnormal_termination, ptr %frame_pointer) #3 {
+entry:
+  %frame_pointer.addr = alloca ptr, align 8
+  %abnormal_termination.addr = alloca i8, align 1
+  %0 = call ptr @llvm.localrecover(ptr @main, ptr %frame_pointer, i32 0)
+  %i = bitcast ptr %0 to ptr
+  store ptr %frame_pointer, ptr %frame_pointer.addr, align 8
+  store i8 %abnormal_termination, ptr %abnormal_termination.addr, align 1
+  %1 = load i32, ptr %i, align 4
+  call void (...) @"?printf@@YAXZZ"(ptr @"??_C@_0BN@HHKJHLBE@?5?5In?5Inner?5_finally?5i?5?$DN?5?$CFd?5?6?$AA@", i32 %1)
+  %2 = load i32, ptr %i, align 4
+  %cmp = icmp eq i32 %2, 2
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store volatile i32 0, ptr inttoptr (i64 17 to ptr), align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+; Function Attrs: nounwind willreturn
+declare dso_local void @llvm.seh.try.end() #2
+
+; Function Attrs: nounwind readnone
+declare ptr @llvm.localaddress() #4
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.exceptioncode(token) #4
+
+; Function Attrs: nounwind
+declare void @llvm.localescape(...) #5
+
+attributes #0 = { noinline norecurse optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind willreturn }
+attributes #3 = { noinline "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind readnone }
+attributes #5 = { nounwind }
+attributes #6 = { noinline }
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 1, !"wchar_size", i32 2}
+!1 = !{i32 2, !"eh-asynch", i32 1}
+
diff --git a/llvm/test/CodeGen/X86/x86-64-extend-shift.ll b/llvm/test/CodeGen/X86/x86-64-extend-shift.ll
index 6ebaeee36697..b73da1625969 100644
--- a/llvm/test/CodeGen/X86/x86-64-extend-shift.ll
+++ b/llvm/test/CodeGen/X86/x86-64-extend-shift.ll
@@ -1,10 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
 ; Formerly there were two shifts.
 
 define i64 @baz(i32 %A) nounwind {
-; CHECK:  shlq  $49, %r
-        %tmp1 = shl i32 %A, 17
-        %tmp2 = zext i32 %tmp1 to i64
-        %tmp3 = shl i64 %tmp2, 32
-        ret i64 %tmp3
+; CHECK-LABEL: baz:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shlq $49, %rax
+; CHECK-NEXT:    retq
+  %tmp1 = shl i32 %A, 17
+  %tmp2 = zext i32 %tmp1 to i64
+  %tmp3 = shl i64 %tmp2, 32
+  ret i64 %tmp3
 }
diff --git a/llvm/test/DebugInfo/AArch64/instr-ref-const-physreg.ll b/llvm/test/DebugInfo/AArch64/instr-ref-const-physreg.ll
index ffe209c3aff6..f16abb4406f1 100644
--- a/llvm/test/DebugInfo/AArch64/instr-ref-const-physreg.ll
+++ b/llvm/test/DebugInfo/AArch64/instr-ref-const-physreg.ll
@@ -7,7 +7,7 @@
 ; crash, and we don't just drop the information.
 
 ; CHECK: DBG_PHI $xzr, 1
-; CHECK: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+; CHECK: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
 
 define i64 @test() !dbg !7 {
   %foo = add i64 0, 0
diff --git a/llvm/test/DebugInfo/ARM/bitfield.ll b/llvm/test/DebugInfo/ARM/bitfield.ll
index 5bd06b785b15..672c61db6f49 100644
--- a/llvm/test/DebugInfo/ARM/bitfield.ll
+++ b/llvm/test/DebugInfo/ARM/bitfield.ll
@@ -12,7 +12,7 @@
 ; CHECK:          DW_AT_name {{.*}} "reserved"
 ; CHECK:          DW_AT_byte_size  {{.*}} (0x04)
 ; CHECK:          DW_AT_bit_size   {{.*}} (0x1c)
-; CHECK:          DW_AT_bit_offset {{.*}} (0xfffffffffffffff8)
+; CHECK:          DW_AT_bit_offset {{.*}} (-8)
 ; CHECK:          DW_AT_data_member_location {{.*}} (DW_OP_plus_uconst 0x0)
 
 %struct.anon = type { i8, [5 x i8] }
diff --git a/llvm/test/DebugInfo/ARM/instr-ref-tcreturn.ll b/llvm/test/DebugInfo/ARM/instr-ref-tcreturn.ll
index ee49a764acc6..53202f19f760 100644
--- a/llvm/test/DebugInfo/ARM/instr-ref-tcreturn.ll
+++ b/llvm/test/DebugInfo/ARM/instr-ref-tcreturn.ll
@@ -25,7 +25,7 @@ target triple = "thumbv7-apple-ios7.0.0"
 ; CHECK-LABEL:  bb.1.entry:
 ; CHECK:          $r0 = COPY %0
 ; CHECK-NEXT:     $r1 = COPY %1
-; CHECK-NEXT:     DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+; CHECK-NEXT:     DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
 ; CHECK-NEXT:     TCRETURNdi &__divsi3, 0, csr_ios, implicit $sp, implicit $r0, implicit $r1
 
 declare i1 @ext()
diff --git a/llvm/test/DebugInfo/COFF/AArch64/lit.local.cfg b/llvm/test/DebugInfo/COFF/AArch64/lit.local.cfg
index 338348261354..10d4a0e953ed 100644
--- a/llvm/test/DebugInfo/COFF/AArch64/lit.local.cfg
+++ b/llvm/test/DebugInfo/COFF/AArch64/lit.local.cfg
@@ -1,2 +1,2 @@
-if not "AArch64" in config.root.targets:
-    config.unsupported = True
+if not "AArch64" in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/DebugInfo/COFF/jump-table-with-indirect-ptr-null.ll b/llvm/test/DebugInfo/COFF/jump-table-with-indirect-ptr-null.ll
index 0995db038345..d4a86e25e232 100644
--- a/llvm/test/DebugInfo/COFF/jump-table-with-indirect-ptr-null.ll
+++ b/llvm/test/DebugInfo/COFF/jump-table-with-indirect-ptr-null.ll
@@ -1,73 +1,73 @@
-; REQUIRES: x86-registered-target
-; RUN: llc < %s | FileCheck %s
-
-; Repro for issue https://reviews.llvm.org/D149367#4619121
-; Validates that `indirect ptr null` and a jump table can be used in the same function.
-
-; Verify branch labels match what's in the CodeView
-; CHECK:            .Ltmp2:
-; CHECK-NEXT:       jmpq    *%{{.*}}
-
-; Verify jump table have the same entry size, base offset and shift as what's in the CodeView
-; CHECK:          {{\.?}}LJTI0_0:
-; CHECK-NEXT:     .long   .LBB0_[[#]]-.LJTI0_0
-
-; Verify CodeView
-; CHECK:          .short	4441          # Record kind: S_ARMSWITCHTABLE
-; CHECK-NEXT:     .secrel32	.LJTI0_0    # Base offset
-; CHECK-NEXT:     .secidx	.LJTI0_0      # Base section index
-; CHECK-NEXT:     .short	4             # Switch type
-; CHECK-NEXT:     .secrel32	.Ltmp2      # Branch offset
-; CHECK-NEXT:     .secrel32	.LJTI0_0    # Table offset
-; CHECK-NEXT:     .secidx	.Ltmp2        # Branch section index
-; CHECK-NEXT:     .secidx	.LJTI0_0      # Table section index
-; CHECK-NEXT:     .long	4               # Entries count
-; CHECK-NOT:      .short	4441          # Record kind: S_ARMSWITCHTABLE
-
-target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-windows-msvc19.34.0"
-
-define i32 @f() !dbg !5 {
-entry:
-  indirectbr ptr null, [label %BC_SUCCEED], !dbg !11
-
-BC_SUCCEED:                                       ; preds = %entry
-  %0 = lshr i64 0, 0
-  switch i64 %0, label %sw.default.i.i2445 [
-    i64 3, label %sw.bb15.i.i
-    i64 1, label %sw.bb7.i.i
-    i64 2, label %sw.bb11.i.i2444
-    i64 0, label %sw.bb3.i.i
-  ]
-
-sw.bb3.i.i:                                       ; preds = %BC_SUCCEED
-  ret i32 0
-
-sw.bb7.i.i:                                       ; preds = %BC_SUCCEED
-  ret i32 0
-
-sw.bb11.i.i2444:                                  ; preds = %BC_SUCCEED
-  ret i32 0
-
-sw.bb15.i.i:                                      ; preds = %BC_SUCCEED
-  ret i32 0
-
-sw.default.i.i2445:                               ; preds = %BC_SUCCEED
-  ret i32 0
-}
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2, splitDebugInlining: false, nameTableKind: None)
-!1 = !DIFile(filename: "../../v8/src/regexp\\regexp-interpreter.cc", directory: ".", checksumkind: CSK_MD5, checksum: "ddba353f72137fb1d64b5fc8ee071a9c")
-!2 = !{}
-!3 = !{i32 2, !"CodeView", i32 1}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = distinct !DISubprogram(name: "f", linkageName: "f", scope: !7, file: !6, line: 386, type: !10, scopeLine: 391, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, templateParams: !2, retainedNodes: !2)
-!6 = !DIFile(filename: "../../v8/src/regexp/regexp-interpreter.cc", directory: ".", checksumkind: CSK_MD5, checksum: "ddba353f72137fb1d64b5fc8ee071a9c")
-!7 = !DINamespace(scope: !8)
-!8 = !DINamespace(name: "internal", scope: !9)
-!9 = !DINamespace(name: "v8", scope: null)
-!10 = distinct !DISubroutineType(types: !2)
+; REQUIRES: x86-registered-target
+; RUN: llc < %s | FileCheck %s
+
+; Repro for issue https://reviews.llvm.org/D149367#4619121
+; Validates that `indirect ptr null` and a jump table can be used in the same function.
+
+; Verify branch labels match what's in the CodeView
+; CHECK:            .Ltmp2:
+; CHECK-NEXT:       jmpq    *%{{.*}}
+
+; Verify jump table have the same entry size, base offset and shift as what's in the CodeView
+; CHECK:          {{\.?}}LJTI0_0:
+; CHECK-NEXT:     .long   .LBB0_[[#]]-.LJTI0_0
+
+; Verify CodeView
+; CHECK:          .short	4441          # Record kind: S_ARMSWITCHTABLE
+; CHECK-NEXT:     .secrel32	.LJTI0_0    # Base offset
+; CHECK-NEXT:     .secidx	.LJTI0_0      # Base section index
+; CHECK-NEXT:     .short	4             # Switch type
+; CHECK-NEXT:     .secrel32	.Ltmp2      # Branch offset
+; CHECK-NEXT:     .secrel32	.LJTI0_0    # Table offset
+; CHECK-NEXT:     .secidx	.Ltmp2        # Branch section index
+; CHECK-NEXT:     .secidx	.LJTI0_0      # Table section index
+; CHECK-NEXT:     .long	4               # Entries count
+; CHECK-NOT:      .short	4441          # Record kind: S_ARMSWITCHTABLE
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.34.0"
+
+define i32 @f() !dbg !5 {
+entry:
+  indirectbr ptr null, [label %BC_SUCCEED], !dbg !11
+
+BC_SUCCEED:                                       ; preds = %entry
+  %0 = lshr i64 0, 0
+  switch i64 %0, label %sw.default.i.i2445 [
+    i64 3, label %sw.bb15.i.i
+    i64 1, label %sw.bb7.i.i
+    i64 2, label %sw.bb11.i.i2444
+    i64 0, label %sw.bb3.i.i
+  ]
+
+sw.bb3.i.i:                                       ; preds = %BC_SUCCEED
+  ret i32 0
+
+sw.bb7.i.i:                                       ; preds = %BC_SUCCEED
+  ret i32 0
+
+sw.bb11.i.i2444:                                  ; preds = %BC_SUCCEED
+  ret i32 0
+
+sw.bb15.i.i:                                      ; preds = %BC_SUCCEED
+  ret i32 0
+
+sw.default.i.i2445:                               ; preds = %BC_SUCCEED
+  ret i32 0
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "../../v8/src/regexp\\regexp-interpreter.cc", directory: ".", checksumkind: CSK_MD5, checksum: "ddba353f72137fb1d64b5fc8ee071a9c")
+!2 = !{}
+!3 = !{i32 2, !"CodeView", i32 1}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "f", linkageName: "f", scope: !7, file: !6, line: 386, type: !10, scopeLine: 391, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, templateParams: !2, retainedNodes: !2)
+!6 = !DIFile(filename: "../../v8/src/regexp/regexp-interpreter.cc", directory: ".", checksumkind: CSK_MD5, checksum: "ddba353f72137fb1d64b5fc8ee071a9c")
+!7 = !DINamespace(scope: !8)
+!8 = !DINamespace(name: "internal", scope: !9)
+!9 = !DINamespace(name: "v8", scope: null)
+!10 = distinct !DISubroutineType(types: !2)
 !11 = !DILocation(line: 1, scope: !5)
 \ No newline at end of file
diff --git a/llvm/test/DebugInfo/COFF/jump-table.ll b/llvm/test/DebugInfo/COFF/jump-table.ll
index 4d16c78c9788..a8039809c8b7 100644
--- a/llvm/test/DebugInfo/COFF/jump-table.ll
+++ b/llvm/test/DebugInfo/COFF/jump-table.ll
@@ -1,262 +1,262 @@
-; REQUIRES: arm-registered-target
-; REQUIRES: aarch64-registered-target
-; REQUIRES: x86-registered-target
-; RUN: llc -mtriple=i686-windows < %s | FileCheck %s --check-prefixes=CHECK,I686,NOTA32
-; RUN: llc -mtriple=x86_64-windows < %s | FileCheck %s --check-prefixes=CHECK,X64,NOTA32
-; RUN: llc -mtriple=aarch64-windows -aarch64-min-jump-table-entries=4 < %s | FileCheck %s --check-prefixes=CHECK,A64,NOTA32
-; RUN: llc -mtriple=thumbv7a-windows < %s | FileCheck %s --check-prefixes=CHECK,A32
-; RUN: llc -mtriple=x86_64-windows -filetype=obj < %s | llvm-readobj - --codeview | FileCheck %s --check-prefixes=CV
-
-; Generated by clang++ -S -c -std=c++11 -emit-llvm -g from the following C++11 source:
-; extern "C" void f1();
-; extern "C" void f2();
-; extern "C" void f3();
-; extern "C" void f4();
-; extern "C" void f5();
-; extern "C" void func(int i){
-;     switch (i) {
-;         case 0: f1(); break;
-;         case 1: f2(); break;
-;         case 2: f3(); break;
-;         case 3: f4(); break;
-;     }
-;     switch (i) {
-;         case 1: f2(); break;
-;         case 2: f3(); break;
-;         case 3: f4(); break;
-;         case 4: f5(); break;
-;         case 5: f1(); break;
-;     }
-; }
-
-; i686 entries are absolute addresses (Base = 0, SwitchType = Pointer).
-; x86_64 entries are fixed-size and relative to the jump table (Base = Table,
-;   SwitchType = Int32).
-; aarch64 entries are variable-sized and relative to the first entry's BB if
-;   compressed (Base = Branch+0x4, SwitchType = UInt8ShiftLeft/UInt16ShiftLeft)
-;   otherwise relative to the ADR instruction (Base = Branch-0xc, SwitchType =
-;   Int32).
-; thumbv7a entries are either absolute addresses (Base = 0, SwitchType =
-;   Pointer) OR variable-sized and relative to *after* the branch instruction
-;   (Base = Branch+0x4, SwitchType = UInt8ShiftLeft/UInt16ShiftLeft/UInt32) but
-;   there appears to be a bug where the offsets are always 0.
-
-; Verify branch labels match what's in the CodeView
-; X64:            .Ltmp1:
-; X64-NEXT:       jmpq    *%{{.*}}
-; X64:            .Ltmp4:
-; X64-NEXT:       jmpq    *%{{.*}}
-; A32:            .LCPI0_0:
-; A32-NEXT        add     pc, r{{.*}}
-; NOTE: thumbv7a places the jump tables just after the branch, so verify the other branch below
-; A64:            .Ltmp1:
-; A64-NEXT:       br      x{{.*}}
-; A64:            .Ltmp4:
-; A64-NEXT:       br      x{{.*}}
-
-; Verify jump table have the same entry size, base offset and shift as what's in the CodeView
-; CHECK:          {{\.?}}LJTI0_0:
-; I686-NEXT:      .long   LBB0_[[#]]
-; X64-NEXT:       .long   .LBB0_[[#]]-.LJTI0_0
-; A32-NEXT:       .byte   (($MBB0_[[#]])-(.LCPI0_0+4))/2
-; A64-NEXT:       .byte   (.LBB0_[[FIRSTBLOCK:[0-9]+]]-.LBB0_[[FIRSTBLOCK]])>>2
-; NOTE: thumbv7a places the jump tables just after the branch, so check for the other branch now
-; A32:            .LCPI0_1:
-; A32-NEXT        add     pc, r{{.*}}
-; CHECK:          {{\.?}}LJTI0_1:
-; I686-NEXT:      .long   LBB0_[[#]]
-; X64-NEXT:       .long   .LBB0_[[#]]-.LJTI0_1
-; A32-NEXT:       .byte   (($MBB0_[[#]])-(.LCPI0_1+4))/2
-; A64-NEXT:       .byte   (.LBB0_[[SECONDBLOCK:[0-9]+]]-.LBB0_[[SECONDBLOCK]])>>2
-
-; Verify CodeView
-; CHECK:          [[INT16:\.short|\.hword]]	4441        [[COMMENT:#|//|@]] Record kind: S_ARMSWITCHTABLE
-; I686-NEXT:      .long 0                               [[COMMENT]] Base offset
-; I686-NEXT:      .short 0                              [[COMMENT]] Base section index
-; X64-NEXT:       .secrel32	.LJTI0_0                    [[COMMENT]] Base offset
-; X64-NEXT:       .secidx	.LJTI0_0                      [[COMMENT]] Base section index
-; A32-NEXT:       .secrel32	.LCPI0_0+4                  [[COMMENT]] Base offset
-; A32-NEXT:       .secidx	.LCPI0_0                      [[COMMENT]] Base section index
-; A64-NEXT:       .secrel32	.LBB0_[[FIRSTBLOCK]]        [[COMMENT]] Base offset
-; A64-NEXT:       .secidx	.LBB0_[[FIRSTBLOCK]]          [[COMMENT]] Base section index
-; I686-NEXT:      .short	6                             [[COMMENT]] Switch type
-; X64-NEXT:       .short	4                             [[COMMENT]] Switch type
-; A32-NEXT:       .short	7                             [[COMMENT]] Switch type
-; A64-NEXT:       .hword	7                             [[COMMENT]] Switch type
-; NOTA32-NEXT:    .secrel32	{{\.?}}Ltmp1                [[COMMENT]] Branch offset
-; A32-NEXT:       .secrel32	.LCPI0_0                    [[COMMENT]] Branch offset
-; CHECK-NEXT:     .secrel32	{{\.?}}LJTI0_0              [[COMMENT]] Table offset
-; NOTA32-NEXT:    .secidx	{{\.?}}Ltmp1                  [[COMMENT]] Branch section index
-; A32-NEXT:       .secidx	.LCPI0_0                      [[COMMENT]] Branch section index
-; CHECK-NEXT:     .secidx	{{\.?}}LJTI0_0                [[COMMENT]] Table section index
-; CHECK-NEXT:     [[INT32:\.long|\.word]]	4             [[COMMENT]] Entries count
-; CHECK:          [[INT16]]	4441                        [[COMMENT]] Record kind: S_ARMSWITCHTABLE
-; I686-NEXT:      .long 0                               [[COMMENT]] Base offset
-; I686-NEXT:      .short 0                              [[COMMENT]] Base section index
-; X64-NEXT:       .secrel32	.LJTI0_1                    [[COMMENT]] Base offset
-; X64-NEXT:       .secidx	.LJTI0_1                      [[COMMENT]] Base section index
-; A32-NEXT:       .secrel32	.LCPI0_1+4                  [[COMMENT]] Base offset
-; A32-NEXT:       .secidx	.LCPI0_1                      [[COMMENT]] Base section index
-; A64-NEXT:       .secrel32	.LBB0_[[SECONDBLOCK]]       [[COMMENT]] Base offset
-; A64-NEXT:       .secidx	.LBB0_[[SECONDBLOCK]]         [[COMMENT]] Base section index
-; I686-NEXT:      .short	6                             [[COMMENT]] Switch type
-; X64-NEXT:       .short	4                             [[COMMENT]] Switch type
-; A32-NEXT:       .short	7                             [[COMMENT]] Switch type
-; A64-NEXT:       .hword	7                             [[COMMENT]] Switch type
-; NOTA32-NEXT:    .secrel32	{{\.?}}Ltmp4                [[COMMENT]] Branch offset
-; A32-NEXT:       .secrel32	.LCPI0_1                    [[COMMENT]] Branch offset
-; CHECK-NEXT:     .secrel32	{{\.?}}LJTI0_1              [[COMMENT]] Table offset
-; NOTA32-NEXT:    .secidx	{{\.?}}Ltmp4                  [[COMMENT]] Branch section index
-; A32-NEXT:       .secidx	.LCPI0_1                      [[COMMENT]] Branch section index
-; CHECK-NEXT:     .secidx	{{\.?}}LJTI0_1                [[COMMENT]] Table section index
-; CHECK-NEXT:     [[INT32]]	5                           [[COMMENT]] Entries count
-; CHECK-NOT:      [[INT16]]	4441                        [[COMMENT]] Record kind: S_ARMSWITCHTABLE
-
-; Verify CodeView as dumped by llvm-readobj
-; CV:      Subsection [
-; CV:         SubSectionType: Symbols (0xF1)
-; CV:         GlobalProcIdSym {
-; CV:           DisplayName: func
-; CV-NOT:     GlobalProcIdSym
-; CV:           JumpTableSym {
-; CV-NEXT:        Kind: S_ARMSWITCHTABLE (0x1159)
-; CV-NEXT:        BaseOffset: 0x0
-; CV-NEXT:        BaseSegment: 0
-; CV-NEXT:        SwitchType: Int32 (0x4)
-; CV-NEXT:        BranchOffset: 0x23
-; CV-NEXT:        TableOffset: 0x0
-; CV-NEXT:        BranchSegment: 0
-; CV-NEXT:        TableSegment: 0
-; CV-NEXT:        EntriesCount: 4
-; CV-NEXT:      }
-; CV-NEXT:      JumpTableSym {
-; CV-NEXT:        Kind: S_ARMSWITCHTABLE (0x1159)
-; CV-NEXT:        BaseOffset: 0x10
-; CV-NEXT:        BaseSegment: 0
-; CV-NEXT:        SwitchType: Int32 (0x4)
-; CV-NEXT:        BranchOffset: 0x5A
-; CV-NEXT:        TableOffset: 0x10
-; CV-NEXT:        BranchSegment: 0
-; CV-NEXT:        TableSegment: 0
-; CV-NEXT:        EntriesCount: 5
-; CV-NEXT:      }
-; CV-NOT:       JumpTableSym {
-
-source_filename = ".\\jump-table.cpp"
-target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-windows-msvc19.35.32216"
-
-; Function Attrs: mustprogress noinline optnone uwtable
-define dso_local void @func(i32 noundef %0) #0 !dbg !8 {
-  %2 = alloca i32, align 4
-  store i32 %0, ptr %2, align 4
-  call void @llvm.dbg.declare(metadata ptr %2, metadata !14, metadata !DIExpression()), !dbg !15
-  %3 = load i32, ptr %2, align 4, !dbg !16
-  switch i32 %3, label %8 [
-    i32 0, label %4
-    i32 1, label %5
-    i32 2, label %6
-    i32 3, label %7
-  ], !dbg !16
-
-4:                                                ; preds = %1
-  call void @f1(), !dbg !17
-  br label %8, !dbg !17
-
-5:                                                ; preds = %1
-  call void @f2(), !dbg !19
-  br label %8, !dbg !19
-
-6:                                                ; preds = %1
-  call void @f3(), !dbg !20
-  br label %8, !dbg !20
-
-7:                                                ; preds = %1
-  call void @f4(), !dbg !21
-  br label %8, !dbg !21
-
-8:                                                ; preds = %1, %7, %6, %5, %4
-  %9 = load i32, ptr %2, align 4, !dbg !22
-  switch i32 %9, label %15 [
-    i32 1, label %10
-    i32 2, label %11
-    i32 3, label %12
-    i32 4, label %13
-    i32 5, label %14
-  ], !dbg !22
-
-10:                                               ; preds = %8
-  call void @f2(), !dbg !23
-  br label %15, !dbg !23
-
-11:                                               ; preds = %8
-  call void @f3(), !dbg !25
-  br label %15, !dbg !25
-
-12:                                               ; preds = %8
-  call void @f4(), !dbg !26
-  br label %15, !dbg !26
-
-13:                                               ; preds = %8
-  call void @f5(), !dbg !27
-  br label %15, !dbg !27
-
-14:                                               ; preds = %8
-  call void @f1(), !dbg !28
-  br label %15, !dbg !28
-
-15:                                               ; preds = %8, %14, %13, %12, %11, %10
-  ret void, !dbg !29
-}
-
-; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-declare dso_local void @f1() #2
-
-declare dso_local void @f2() #2
-
-declare dso_local void @f3() #2
-
-declare dso_local void @f4() #2
-
-declare dso_local void @f5() #2
-
-attributes #0 = { mustprogress noinline optnone uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
-attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
-attributes #2 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!2, !3, !4, !5, !6}
-!llvm.ident = !{!7}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_11, file: !1, producer: "clang version 15.0.1", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
-!1 = !DIFile(filename: "jump-table.cpp", directory: "C:\\llvm", checksumkind: CSK_MD5, checksum: "35610c7104c8080f83e2bf6a02dabfc9")
-!2 = !{i32 2, !"CodeView", i32 1}
-!3 = !{i32 2, !"Debug Info Version", i32 3}
-!4 = !{i32 1, !"wchar_size", i32 2}
-!5 = !{i32 7, !"PIC Level", i32 2}
-!6 = !{i32 7, !"uwtable", i32 2}
-!7 = !{!"clang version 15.0.1"}
-!8 = distinct !DISubprogram(name: "func", scope: !9, file: !9, line: 6, type: !10, scopeLine: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !13)
-!9 = !DIFile(filename: ".\\jump-table.cpp", directory: "C:\\llvm", checksumkind: CSK_MD5, checksum: "35610c7104c8080f83e2bf6a02dabfc9")
-!10 = !DISubroutineType(types: !11)
-!11 = !{null, !12}
-!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-!13 = !{}
-!14 = !DILocalVariable(name: "i", arg: 1, scope: !8, file: !9, line: 6, type: !12)
-!15 = !DILocation(line: 6, scope: !8)
-!16 = !DILocation(line: 7, scope: !8)
-!17 = !DILocation(line: 8, scope: !18)
-!18 = distinct !DILexicalBlock(scope: !8, file: !9, line: 7)
-!19 = !DILocation(line: 9, scope: !18)
-!20 = !DILocation(line: 10, scope: !18)
-!21 = !DILocation(line: 11, scope: !18)
-!22 = !DILocation(line: 13, scope: !8)
-!23 = !DILocation(line: 14, scope: !24)
-!24 = distinct !DILexicalBlock(scope: !8, file: !9, line: 13)
-!25 = !DILocation(line: 15, scope: !24)
-!26 = !DILocation(line: 16, scope: !24)
-!27 = !DILocation(line: 17, scope: !24)
-!28 = !DILocation(line: 18, scope: !24)
-!29 = !DILocation(line: 20, scope: !8)
+; REQUIRES: arm-registered-target
+; REQUIRES: aarch64-registered-target
+; REQUIRES: x86-registered-target
+; RUN: llc -mtriple=i686-windows < %s | FileCheck %s --check-prefixes=CHECK,I686,NOTA32
+; RUN: llc -mtriple=x86_64-windows < %s | FileCheck %s --check-prefixes=CHECK,X64,NOTA32
+; RUN: llc -mtriple=aarch64-windows -aarch64-min-jump-table-entries=4 < %s | FileCheck %s --check-prefixes=CHECK,A64,NOTA32
+; RUN: llc -mtriple=thumbv7a-windows < %s | FileCheck %s --check-prefixes=CHECK,A32
+; RUN: llc -mtriple=x86_64-windows -filetype=obj < %s | llvm-readobj - --codeview | FileCheck %s --check-prefixes=CV
+
+; Generated by clang++ -S -c -std=c++11 -emit-llvm -g from the following C++11 source:
+; extern "C" void f1();
+; extern "C" void f2();
+; extern "C" void f3();
+; extern "C" void f4();
+; extern "C" void f5();
+; extern "C" void func(int i){
+;     switch (i) {
+;         case 0: f1(); break;
+;         case 1: f2(); break;
+;         case 2: f3(); break;
+;         case 3: f4(); break;
+;     }
+;     switch (i) {
+;         case 1: f2(); break;
+;         case 2: f3(); break;
+;         case 3: f4(); break;
+;         case 4: f5(); break;
+;         case 5: f1(); break;
+;     }
+; }
+
+; i686 entries are absolute addresses (Base = 0, SwitchType = Pointer).
+; x86_64 entries are fixed-size and relative to the jump table (Base = Table,
+;   SwitchType = Int32).
+; aarch64 entries are variable-sized and relative to the first entry's BB if
+;   compressed (Base = Branch+0x4, SwitchType = UInt8ShiftLeft/UInt16ShiftLeft)
+;   otherwise relative to the ADR instruction (Base = Branch-0xc, SwitchType =
+;   Int32).
+; thumbv7a entries are either absolute addresses (Base = 0, SwitchType =
+;   Pointer) OR variable-sized and relative to *after* the branch instruction
+;   (Base = Branch+0x4, SwitchType = UInt8ShiftLeft/UInt16ShiftLeft/UInt32) but
+;   there appears to be a bug where the offsets are always 0.
+
+; Verify branch labels match what's in the CodeView
+; X64:            .Ltmp1:
+; X64-NEXT:       jmpq    *%{{.*}}
+; X64:            .Ltmp4:
+; X64-NEXT:       jmpq    *%{{.*}}
+; A32:            .LCPI0_0:
+; A32-NEXT        add     pc, r{{.*}}
+; NOTE: thumbv7a places the jump tables just after the branch, so verify the other branch below
+; A64:            .Ltmp1:
+; A64-NEXT:       br      x{{.*}}
+; A64:            .Ltmp4:
+; A64-NEXT:       br      x{{.*}}
+
+; Verify jump table have the same entry size, base offset and shift as what's in the CodeView
+; CHECK:          {{\.?}}LJTI0_0:
+; I686-NEXT:      .long   LBB0_[[#]]
+; X64-NEXT:       .long   .LBB0_[[#]]-.LJTI0_0
+; A32-NEXT:       .byte   (($MBB0_[[#]])-(.LCPI0_0+4))/2
+; A64-NEXT:       .byte   (.LBB0_[[FIRSTBLOCK:[0-9]+]]-.LBB0_[[FIRSTBLOCK]])>>2
+; NOTE: thumbv7a places the jump tables just after the branch, so check for the other branch now
+; A32:            .LCPI0_1:
+; A32-NEXT        add     pc, r{{.*}}
+; CHECK:          {{\.?}}LJTI0_1:
+; I686-NEXT:      .long   LBB0_[[#]]
+; X64-NEXT:       .long   .LBB0_[[#]]-.LJTI0_1
+; A32-NEXT:       .byte   (($MBB0_[[#]])-(.LCPI0_1+4))/2
+; A64-NEXT:       .byte   (.LBB0_[[SECONDBLOCK:[0-9]+]]-.LBB0_[[SECONDBLOCK]])>>2
+
+; Verify CodeView
+; CHECK:          [[INT16:\.short|\.hword]]	4441        [[COMMENT:#|//|@]] Record kind: S_ARMSWITCHTABLE
+; I686-NEXT:      .long 0                               [[COMMENT]] Base offset
+; I686-NEXT:      .short 0                              [[COMMENT]] Base section index
+; X64-NEXT:       .secrel32	.LJTI0_0                    [[COMMENT]] Base offset
+; X64-NEXT:       .secidx	.LJTI0_0                      [[COMMENT]] Base section index
+; A32-NEXT:       .secrel32	.LCPI0_0+4                  [[COMMENT]] Base offset
+; A32-NEXT:       .secidx	.LCPI0_0                      [[COMMENT]] Base section index
+; A64-NEXT:       .secrel32	.LBB0_[[FIRSTBLOCK]]        [[COMMENT]] Base offset
+; A64-NEXT:       .secidx	.LBB0_[[FIRSTBLOCK]]          [[COMMENT]] Base section index
+; I686-NEXT:      .short	6                             [[COMMENT]] Switch type
+; X64-NEXT:       .short	4                             [[COMMENT]] Switch type
+; A32-NEXT:       .short	7                             [[COMMENT]] Switch type
+; A64-NEXT:       .hword	7                             [[COMMENT]] Switch type
+; NOTA32-NEXT:    .secrel32	{{\.?}}Ltmp1                [[COMMENT]] Branch offset
+; A32-NEXT:       .secrel32	.LCPI0_0                    [[COMMENT]] Branch offset
+; CHECK-NEXT:     .secrel32	{{\.?}}LJTI0_0              [[COMMENT]] Table offset
+; NOTA32-NEXT:    .secidx	{{\.?}}Ltmp1                  [[COMMENT]] Branch section index
+; A32-NEXT:       .secidx	.LCPI0_0                      [[COMMENT]] Branch section index
+; CHECK-NEXT:     .secidx	{{\.?}}LJTI0_0                [[COMMENT]] Table section index
+; CHECK-NEXT:     [[INT32:\.long|\.word]]	4             [[COMMENT]] Entries count
+; CHECK:          [[INT16]]	4441                        [[COMMENT]] Record kind: S_ARMSWITCHTABLE
+; I686-NEXT:      .long 0                               [[COMMENT]] Base offset
+; I686-NEXT:      .short 0                              [[COMMENT]] Base section index
+; X64-NEXT:       .secrel32	.LJTI0_1                    [[COMMENT]] Base offset
+; X64-NEXT:       .secidx	.LJTI0_1                      [[COMMENT]] Base section index
+; A32-NEXT:       .secrel32	.LCPI0_1+4                  [[COMMENT]] Base offset
+; A32-NEXT:       .secidx	.LCPI0_1                      [[COMMENT]] Base section index
+; A64-NEXT:       .secrel32	.LBB0_[[SECONDBLOCK]]       [[COMMENT]] Base offset
+; A64-NEXT:       .secidx	.LBB0_[[SECONDBLOCK]]         [[COMMENT]] Base section index
+; I686-NEXT:      .short	6                             [[COMMENT]] Switch type
+; X64-NEXT:       .short	4                             [[COMMENT]] Switch type
+; A32-NEXT:       .short	7                             [[COMMENT]] Switch type
+; A64-NEXT:       .hword	7                             [[COMMENT]] Switch type
+; NOTA32-NEXT:    .secrel32	{{\.?}}Ltmp4                [[COMMENT]] Branch offset
+; A32-NEXT:       .secrel32	.LCPI0_1                    [[COMMENT]] Branch offset
+; CHECK-NEXT:     .secrel32	{{\.?}}LJTI0_1              [[COMMENT]] Table offset
+; NOTA32-NEXT:    .secidx	{{\.?}}Ltmp4                  [[COMMENT]] Branch section index
+; A32-NEXT:       .secidx	.LCPI0_1                      [[COMMENT]] Branch section index
+; CHECK-NEXT:     .secidx	{{\.?}}LJTI0_1                [[COMMENT]] Table section index
+; CHECK-NEXT:     [[INT32]]	5                           [[COMMENT]] Entries count
+; CHECK-NOT:      [[INT16]]	4441                        [[COMMENT]] Record kind: S_ARMSWITCHTABLE
+
+; Verify CodeView as dumped by llvm-readobj
+; CV:      Subsection [
+; CV:         SubSectionType: Symbols (0xF1)
+; CV:         GlobalProcIdSym {
+; CV:           DisplayName: func
+; CV-NOT:     GlobalProcIdSym
+; CV:           JumpTableSym {
+; CV-NEXT:        Kind: S_ARMSWITCHTABLE (0x1159)
+; CV-NEXT:        BaseOffset: 0x0
+; CV-NEXT:        BaseSegment: 0
+; CV-NEXT:        SwitchType: Int32 (0x4)
+; CV-NEXT:        BranchOffset: 0x23
+; CV-NEXT:        TableOffset: 0x0
+; CV-NEXT:        BranchSegment: 0
+; CV-NEXT:        TableSegment: 0
+; CV-NEXT:        EntriesCount: 4
+; CV-NEXT:      }
+; CV-NEXT:      JumpTableSym {
+; CV-NEXT:        Kind: S_ARMSWITCHTABLE (0x1159)
+; CV-NEXT:        BaseOffset: 0x10
+; CV-NEXT:        BaseSegment: 0
+; CV-NEXT:        SwitchType: Int32 (0x4)
+; CV-NEXT:        BranchOffset: 0x5A
+; CV-NEXT:        TableOffset: 0x10
+; CV-NEXT:        BranchSegment: 0
+; CV-NEXT:        TableSegment: 0
+; CV-NEXT:        EntriesCount: 5
+; CV-NEXT:      }
+; CV-NOT:       JumpTableSym {
+
+source_filename = ".\\jump-table.cpp"
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.35.32216"
+
+; Function Attrs: mustprogress noinline optnone uwtable
+define dso_local void @func(i32 noundef %0) #0 !dbg !8 {
+  %2 = alloca i32, align 4
+  store i32 %0, ptr %2, align 4
+  call void @llvm.dbg.declare(metadata ptr %2, metadata !14, metadata !DIExpression()), !dbg !15
+  %3 = load i32, ptr %2, align 4, !dbg !16
+  switch i32 %3, label %8 [
+    i32 0, label %4
+    i32 1, label %5
+    i32 2, label %6
+    i32 3, label %7
+  ], !dbg !16
+
+4:                                                ; preds = %1
+  call void @f1(), !dbg !17
+  br label %8, !dbg !17
+
+5:                                                ; preds = %1
+  call void @f2(), !dbg !19
+  br label %8, !dbg !19
+
+6:                                                ; preds = %1
+  call void @f3(), !dbg !20
+  br label %8, !dbg !20
+
+7:                                                ; preds = %1
+  call void @f4(), !dbg !21
+  br label %8, !dbg !21
+
+8:                                                ; preds = %1, %7, %6, %5, %4
+  %9 = load i32, ptr %2, align 4, !dbg !22
+  switch i32 %9, label %15 [
+    i32 1, label %10
+    i32 2, label %11
+    i32 3, label %12
+    i32 4, label %13
+    i32 5, label %14
+  ], !dbg !22
+
+10:                                               ; preds = %8
+  call void @f2(), !dbg !23
+  br label %15, !dbg !23
+
+11:                                               ; preds = %8
+  call void @f3(), !dbg !25
+  br label %15, !dbg !25
+
+12:                                               ; preds = %8
+  call void @f4(), !dbg !26
+  br label %15, !dbg !26
+
+13:                                               ; preds = %8
+  call void @f5(), !dbg !27
+  br label %15, !dbg !27
+
+14:                                               ; preds = %8
+  call void @f1(), !dbg !28
+  br label %15, !dbg !28
+
+15:                                               ; preds = %8, %14, %13, %12, %11, %10
+  ret void, !dbg !29
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare dso_local void @f1() #2
+
+declare dso_local void @f2() #2
+
+declare dso_local void @f3() #2
+
+declare dso_local void @f4() #2
+
+declare dso_local void @f5() #2
+
+attributes #0 = { mustprogress noinline optnone uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
+attributes #2 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_11, file: !1, producer: "clang version 15.0.1", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "jump-table.cpp", directory: "C:\\llvm", checksumkind: CSK_MD5, checksum: "35610c7104c8080f83e2bf6a02dabfc9")
+!2 = !{i32 2, !"CodeView", i32 1}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 2}
+!5 = !{i32 7, !"PIC Level", i32 2}
+!6 = !{i32 7, !"uwtable", i32 2}
+!7 = !{!"clang version 15.0.1"}
+!8 = distinct !DISubprogram(name: "func", scope: !9, file: !9, line: 6, type: !10, scopeLine: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !13)
+!9 = !DIFile(filename: ".\\jump-table.cpp", directory: "C:\\llvm", checksumkind: CSK_MD5, checksum: "35610c7104c8080f83e2bf6a02dabfc9")
+!10 = !DISubroutineType(types: !11)
+!11 = !{null, !12}
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !{}
+!14 = !DILocalVariable(name: "i", arg: 1, scope: !8, file: !9, line: 6, type: !12)
+!15 = !DILocation(line: 6, scope: !8)
+!16 = !DILocation(line: 7, scope: !8)
+!17 = !DILocation(line: 8, scope: !18)
+!18 = distinct !DILexicalBlock(scope: !8, file: !9, line: 7)
+!19 = !DILocation(line: 9, scope: !18)
+!20 = !DILocation(line: 10, scope: !18)
+!21 = !DILocation(line: 11, scope: !18)
+!22 = !DILocation(line: 13, scope: !8)
+!23 = !DILocation(line: 14, scope: !24)
+!24 = distinct !DILexicalBlock(scope: !8, file: !9, line: 13)
+!25 = !DILocation(line: 15, scope: !24)
+!26 = !DILocation(line: 16, scope: !24)
+!27 = !DILocation(line: 17, scope: !24)
+!28 = !DILocation(line: 18, scope: !24)
+!29 = !DILocation(line: 20, scope: !8)
diff --git a/llvm/test/DebugInfo/COFF/pieces.ll b/llvm/test/DebugInfo/COFF/pieces.ll
index 2d20b50751e7..8f5b6b2d7c1a 100644
--- a/llvm/test/DebugInfo/COFF/pieces.ll
+++ b/llvm/test/DebugInfo/COFF/pieces.ll
@@ -101,7 +101,7 @@
 ; ASM:         callq   g
 ; ASM:         movl    %eax, [[offset_o_x:[0-9]+]](%rsp)          # 4-byte Spill
 ; ASM: [[spill_o_x_start:\.Ltmp[0-9]+]]:
-; ASM:         #DEBUG_VALUE: bitpiece_spill:o <- [DW_OP_plus_uconst [[offset_o_x]], DW_OP_deref, DW_OP_LLVM_fragment 32 32] $rsp
+; ASM:         #DEBUG_VALUE: bitpiece_spill:o <- [DW_OP_plus_uconst [[offset_o_x]], DW_OP_deref, DW_OP_LLVM_fragment 32 32] $rsp
 ; ASM:         #APP
 ; ASM:         #NO_APP
 ; ASM:         movl    [[offset_o_x]](%rsp), %eax          # 4-byte Reload
diff --git a/llvm/test/DebugInfo/Generic/debug_value_list.ll b/llvm/test/DebugInfo/Generic/debug_value_list.ll
index d5ec03e0b358..10c8ae2ef080 100644
--- a/llvm/test/DebugInfo/Generic/debug_value_list.ll
+++ b/llvm/test/DebugInfo/Generic/debug_value_list.ll
@@ -1,50 +1,50 @@
-; RUN: opt -passes=verify < %s | opt -passes=verify -S | FileCheck %s
-
-; Simple IR-BC-IR round-trip test for a @llvm.dbg.value that uses !DIArgList
-; and DW_OP_LLVM_arg.
-
-source_filename = ".\\debug_value_list.cpp"
-target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-windows-msvc19.16.27034"
-
-; CHECK-COUNT-3: llvm.dbg.value(
-; CHECK-SAME: metadata !DIArgList(i32 %a, i32 %b, i32 5)
-; CHECK-SAME: metadata !16,
-; CHECK-SAME: metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_LLVM_arg, 2, DW_OP_plus)
-define dso_local i32 @"?foo@@YAHHH@Z"(i32 %a, i32 %b) local_unnamed_addr !dbg !8 {
-entry:
-  call void @llvm.dbg.value(metadata !DIArgList(i32 %b), metadata !14, metadata !DIExpression(DW_OP_LLVM_arg, 0)), !dbg !17
-  call void @llvm.dbg.value(metadata !DIArgList(i32 %a), metadata !15, metadata !DIExpression(DW_OP_LLVM_arg, 0)), !dbg !17
-  call void @llvm.dbg.value(
-    metadata !DIArgList(i32 %a, i32 %b, i32 5),
-    metadata !16,
-    metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_LLVM_arg, 2, DW_OP_plus)), !dbg !17
-  %mul = mul nsw i32 %b, %a, !dbg !18
-  ret i32 %mul, !dbg !18
-}
-
-declare void @llvm.dbg.value(metadata, metadata, metadata)
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5, !6}
-!llvm.ident = !{!7}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
-!1 = !DIFile(filename: "debug_value_list.cpp", directory: "/tmp")
-!2 = !{}
-!3 = !{i32 2, !"CodeView", i32 1}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 2}
-!6 = !{i32 7, !"PIC Level", i32 2}
-!7 = !{!"clang version 11.0.0"}
-!8 = distinct !DISubprogram(name: "foo", linkageName: "?foo@@YAHHH@Z", scope: !9, file: !9, line: 1, type: !10, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !13)
-!9 = !DIFile(filename: ".\\debug_value_list.cpp", directory: "/tmp")
-!10 = !DISubroutineType(types: !11)
-!11 = !{!12, !12, !12}
-!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-!13 = !{!14, !15, !16}
-!14 = !DILocalVariable(name: "b", arg: 2, scope: !8, file: !9, line: 1, type: !12)
-!15 = !DILocalVariable(name: "a", arg: 1, scope: !8, file: !9, line: 1, type: !12)
-!16 = !DILocalVariable(name: "c", scope: !8, file: !9, line: 2, type: !12)
-!17 = !DILocation(line: 0, scope: !8)
-!18 = !DILocation(line: 3, scope: !8)
+; RUN: opt -passes=verify < %s | opt -passes=verify -S | FileCheck %s
+
+; Simple IR-BC-IR round-trip test for a @llvm.dbg.value that uses !DIArgList
+; and DW_OP_LLVM_arg.
+
+source_filename = ".\\debug_value_list.cpp"
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.16.27034"
+
+; CHECK-COUNT-3: llvm.dbg.value(
+; CHECK-SAME: metadata !DIArgList(i32 %a, i32 %b, i32 5)
+; CHECK-SAME: metadata !16,
+; CHECK-SAME: metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_LLVM_arg, 2, DW_OP_plus)
+define dso_local i32 @"?foo@@YAHHH@Z"(i32 %a, i32 %b) local_unnamed_addr !dbg !8 {
+entry:
+  call void @llvm.dbg.value(metadata !DIArgList(i32 %b), metadata !14, metadata !DIExpression(DW_OP_LLVM_arg, 0)), !dbg !17
+  call void @llvm.dbg.value(metadata !DIArgList(i32 %a), metadata !15, metadata !DIExpression(DW_OP_LLVM_arg, 0)), !dbg !17
+  call void @llvm.dbg.value(
+    metadata !DIArgList(i32 %a, i32 %b, i32 5),
+    metadata !16,
+    metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_LLVM_arg, 2, DW_OP_plus)), !dbg !17
+  %mul = mul nsw i32 %b, %a, !dbg !18
+  ret i32 %mul, !dbg !18
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "debug_value_list.cpp", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"CodeView", i32 1}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!7 = !{!"clang version 11.0.0"}
+!8 = distinct !DISubprogram(name: "foo", linkageName: "?foo@@YAHHH@Z", scope: !9, file: !9, line: 1, type: !10, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !13)
+!9 = !DIFile(filename: ".\\debug_value_list.cpp", directory: "/tmp")
+!10 = !DISubroutineType(types: !11)
+!11 = !{!12, !12, !12}
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !{!14, !15, !16}
+!14 = !DILocalVariable(name: "b", arg: 2, scope: !8, file: !9, line: 1, type: !12)
+!15 = !DILocalVariable(name: "a", arg: 1, scope: !8, file: !9, line: 1, type: !12)
+!16 = !DILocalVariable(name: "c", scope: !8, file: !9, line: 2, type: !12)
+!17 = !DILocation(line: 0, scope: !8)
+!18 = !DILocation(line: 3, scope: !8)
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/accept-nonlive-reg-phis.mir b/llvm/test/DebugInfo/MIR/InstrRef/accept-nonlive-reg-phis.mir
index 563c7258543c..f7054b9129bb 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/accept-nonlive-reg-phis.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/accept-nonlive-reg-phis.mir
@@ -12,23 +12,23 @@
 --- |
   target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
   target triple = "x86_64-unknown-linux"
-  
+
   @b = dso_local local_unnamed_addr global i32 0, align 4, !dbg !0
   @a = dso_local local_unnamed_addr global i32 0, align 4, !dbg !5
-  
+
   define dso_local i32 @c() local_unnamed_addr !dbg !13 {
   entry:
     ret i32 0, !dbg !36
   }
-  
+
   declare void @llvm.dbg.declare(metadata, metadata, metadata)
-  
+
   declare void @llvm.dbg.value(metadata, metadata, metadata)
-  
+
   !llvm.dbg.cu = !{!2}
   !llvm.module.flags = !{!8, !9, !10, !11}
   !llvm.ident = !{!12}
-  
+
   !0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
   !1 = distinct !DIGlobalVariable(name: "b", scope: !2, file: !3, line: 1, type: !7, isLocal: false, isDefinition: true)
   !2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 14.0.0 (git@github.com:llvm/llvm-project 1b09d0c42b42be219dd0984e0714d68b4a36cd3e)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
@@ -79,9 +79,9 @@ frameInfo:
 machineFunctionInfo: {}
 body:             |
   bb.0.entry:
-  
+
     DBG_PHI $fp0, 3
-    DBG_INSTR_REF !17, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 0), debug-location !30
+    DBG_INSTR_REF !17, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 0), debug-location !30
     $eax = MOV32ri 0
     RET 0, debug-location !36
 
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/dbg-phi-subregister-location.mir b/llvm/test/DebugInfo/MIR/InstrRef/dbg-phi-subregister-location.mir
index 9eb7345f4417..041fface0437 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/dbg-phi-subregister-location.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/dbg-phi-subregister-location.mir
@@ -9,8 +9,8 @@
 #
 # CHECK-LABEL: name: foo
 # CHECK:       DBG_PHI $edi
-# CHECK-NEXT:  DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
-# CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $dil
+# CHECK-NEXT:  DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
+# CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $dil
 --- |
   ; ModuleID = 'out.ll'
   source_filename = "out.ll"
@@ -64,7 +64,7 @@ body:             |
     liveins: $edi
 
     DBG_PHI $edi, 1
-    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !13
+    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !13
     renamable $rax = MOV64rm $rip, 1, $noreg, target-flags(x86-gotpcrel) @someglobal, $noreg, debug-location !13 :: (load (s64) from got)
     MOV8mr killed renamable $rax, 1, $noreg, 0, $noreg, renamable $dil, debug-location !13 :: (store (s8) into @someglobal)
     RET64 debug-location !13
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-in-ldv.mir b/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-in-ldv.mir
index 43603b88c4e5..665ca0dbaace 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-in-ldv.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-in-ldv.mir
@@ -5,13 +5,13 @@
 # Test that a DBG_INSTR_REF that refers to a DBG_PHI, will be translated into a
 # DBG_VALUE of the value read at that DBG_PHI. Same original code as
 # phi-coalescing.mir.
-# 
+#
 --- |
   ; ModuleID = 'phi-coalescing.mir'
   source_filename = "test.c"
   target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
   target triple = "x86_64-unknown-linux-gnu"
-  
+
   define dso_local i32 @foo(i64 %bar, i64 %baz) !dbg !7 {
   entry:
     call void @llvm.dbg.value(metadata i64 %bar, metadata !12, metadata !DIExpression()), !dbg !13
@@ -23,12 +23,12 @@
     %call = call i64 @getlong(), !dbg !18
     %tobool = icmp ne i64 %call, 0, !dbg !18
     br i1 %tobool, label %if.then, label %if.end, !dbg !20
-  
+
   if.then:                                          ; preds = %entry
     %add1 = add nsw i64 %add, 1, !dbg !21
     call void @llvm.dbg.value(metadata i64 %add1, metadata !12, metadata !DIExpression()), !dbg !13
     br label %if.end, !dbg !22
-  
+
   if.end:                                           ; preds = %if.then, %entry
     %bar.addr.0 = phi i64 [ %add1, %if.then ], [ %add, %entry ], !dbg !13
     call void @llvm.dbg.value(metadata i64 %bar.addr.0, metadata !12, metadata !DIExpression()), !dbg !13
@@ -38,21 +38,21 @@
     %conv = trunc i64 %add2 to i32, !dbg !25
     ret i32 %conv, !dbg !26
   }
-  
+
   ; Function Attrs: nounwind readnone speculatable willreturn
   declare void @llvm.dbg.declare(metadata, metadata, metadata)
-  
+
   declare dso_local void @ext(i64)
-  
+
   declare dso_local i64 @getlong()
-  
+
   ; Function Attrs: nounwind readnone speculatable willreturn
   declare void @llvm.dbg.value(metadata, metadata, metadata)
-  
+
   !llvm.dbg.cu = !{!0}
   !llvm.module.flags = !{!3, !4, !5}
   !llvm.ident = !{!6}
-  
+
   !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
   !1 = !DIFile(filename: "test.c", directory: "/tmp/out.c")
   !2 = !{}
@@ -105,7 +105,7 @@ machineFunctionInfo: {}
 body:             |
   bb.0.entry:
     liveins: $rdi, $rsi, $r14, $rbx
-  
+
     frame-setup PUSH64r killed $r14, implicit-def $rsp, implicit $rsp
     CFI_INSTRUCTION def_cfa_offset 16
     frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp
@@ -123,19 +123,19 @@ body:             |
     CALL64pcrel32 @getlong, csr_64, implicit $rsp, implicit $ssp, implicit-def $rax, debug-location !18
     CMP64ri8 killed renamable $rax, 0, implicit-def $eflags, debug-location !18
     JCC_1 %bb.2, 4, implicit $eflags, debug-location !20
-  
+
   bb.1.if.then:
     liveins: $rbx, $r14
-  
+
     renamable $rbx = ADD64ri32 killed renamable $rbx, 1, implicit-def $eflags, debug-location !21
-  
+
   bb.2.if.end:
     liveins: $rbx, $r14
-  
+
     DBG_PHI $rbx, 1
     $rax = COPY $rbx
     $rbx = MOV64ri 0
-    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13
+    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13
 
     ; This sequence should mark the contents of rbx on block entry as being the
     ; value for the variable at this DBG_INSTR_REF. We've force it to be in
@@ -143,8 +143,8 @@ body:             |
     ; CHECK:      DBG_PHI $rbx, 1
     ; CHECK-NEXT: $rax = COPY $rbx
     ; CHECK-NEXT: $rbx = MOV64ri 0
-    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}} $rax
+    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}} $rax
 
     $rbx = COPY $rax
     renamable $rbx = ADD64rr killed renamable $rbx, killed renamable $r14, implicit-def $eflags, debug-location !23
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-in-ldv2.mir b/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-in-ldv2.mir
index 945574cb3cd3..b52247f5b43d 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-in-ldv2.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-in-ldv2.mir
@@ -5,23 +5,23 @@
 # Test that a DBG_INSTR_REF that refers to a DBG_PHI, will be translated into a
 # DBG_VALUE of the value read at that DBG_PHI -- in this test, when the value
 # is on the stack.
-# 
+#
 --- |
   target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
   target triple = "x86_64-unknown-linux-gnu"
-  
+
   define dso_local i32 @foo(i64 %bar, i64 %baz) !dbg !7 {
     ret i32 0
   }
-  
+
   declare dso_local void @ext(i64)
-  
+
   declare dso_local i64 @getlong()
-  
+
   !llvm.dbg.cu = !{!0}
   !llvm.module.flags = !{!3, !4, !5}
   !llvm.ident = !{!6}
-  
+
   !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
   !1 = !DIFile(filename: "test.c", directory: "/tmp/out.c")
   !2 = !{}
@@ -70,7 +70,7 @@ body:             |
   bb.0:
     liveins: $rdi, $rsi, $r14, $rbx
     ; CHECK-LABEL: bb.0:
-  
+
     $r14 = MOV64rr $rsi
     $rbx = MOV64rr $rdi
     $rax = MOV64ri 0
@@ -85,9 +85,9 @@ body:             |
     MOV64mr $rsp, 1, $noreg, 16, $noreg, $rax :: (store 8 into %stack.0)
 
     ;; This should resolve to the loaded register.
-    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}} $rcx
+    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}} $rcx
 
     ;; And if we say it's a smaller size, we should be able to pick out smaller
     ;; subregisters within the stack slot.
@@ -97,9 +97,9 @@ body:             |
     MOV64mr $rsp, 1, $noreg, 16, $noreg, $rax :: (store 8 into %stack.0)
 
     ;; This should pick out the 32 bit value.
-    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !13
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}} $ecx
+    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !13
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}} $ecx
 
     ;; Try all the other subregs.
     DBG_PHI %stack.0, 3, 16
@@ -107,18 +107,18 @@ body:             |
     $rcx = MOV64rm $rsp, 1, $noreg, 8, $noreg :: (load 8 from %stack.0)
     MOV64mr $rsp, 1, $noreg, 16, $noreg, $rax :: (store 8 into %stack.0)
 
-    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 0), debug-location !13
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}} $cx
+    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 0), debug-location !13
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}} $cx
 
     DBG_PHI %stack.0, 4, 8
     $rax = MOV64ri 0
     $rcx = MOV64rm $rsp, 1, $noreg, 8, $noreg :: (load 8 from %stack.0)
     MOV64mr $rsp, 1, $noreg, 16, $noreg, $rax :: (store 8 into %stack.0)
 
-    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(4, 0), debug-location !13
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(4, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}} $cl
+    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(4, 0), debug-location !13
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(4, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}} $cl
 
     ;; We can't, at this time, describe subregister fields with nonzero offset.
     ;; It's easily achieved by attaching more data to stack DBG_PHIs, but it's
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-merging-in-ldv.mir b/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-merging-in-ldv.mir
index 9601aad77d93..12f9361bebea 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-merging-in-ldv.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-merging-in-ldv.mir
@@ -11,7 +11,7 @@
   source_filename = "test.c"
   target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
   target triple = "x86_64-unknown-linux-gnu"
-  
+
   define dso_local i32 @foo(i64 %bar, i64 %baz) !dbg !7 {
   entry:
     call void @llvm.dbg.value(metadata i64 %bar, metadata !12, metadata !DIExpression()), !dbg !13
@@ -25,17 +25,17 @@
     %call = call i64 @getlong(), !dbg !19
     %tobool = icmp ne i64 %call, 0, !dbg !19
     br i1 %tobool, label %if.then, label %if.else, !dbg !21
-  
+
   if.then:                                          ; preds = %entry
     %add2 = add nsw i64 %add, 1, !dbg !22
     call void @llvm.dbg.value(metadata i64 %add2, metadata !12, metadata !DIExpression()), !dbg !13
     br label %if.end, !dbg !24
-  
+
   if.else:                                          ; preds = %entry
     %add3 = add nsw i64 %add, 2, !dbg !25
     call void @llvm.dbg.value(metadata i64 %add3, metadata !12, metadata !DIExpression()), !dbg !13
     br label %if.end
-  
+
   if.end:                                           ; preds = %if.else, %if.then
     %bar.addr.0 = phi i64 [ %add2, %if.then ], [ %add3, %if.else ], !dbg !27
     call void @llvm.dbg.value(metadata i64 %bar.addr.0, metadata !12, metadata !DIExpression()), !dbg !13
@@ -45,21 +45,21 @@
     %conv = trunc i64 %add4 to i32, !dbg !30
     ret i32 %conv, !dbg !31
   }
-  
+
   ; Function Attrs: nounwind readnone speculatable willreturn
   declare void @llvm.dbg.declare(metadata, metadata, metadata)
-  
+
   declare dso_local void @ext(i64)
-  
+
   declare dso_local i64 @getlong()
-  
+
   ; Function Attrs: nounwind readnone speculatable willreturn
   declare void @llvm.dbg.value(metadata, metadata, metadata)
-  
+
   !llvm.dbg.cu = !{!0}
   !llvm.module.flags = !{!3, !4, !5}
   !llvm.ident = !{!6}
-  
+
   !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
   !1 = !DIFile(filename: "test.c", directory: ".")
   !2 = !{}
@@ -118,7 +118,7 @@ body:             |
   bb.0.entry:
     successors: %bb.2, %bb.1
     liveins: $rdi, $rsi, $r14, $rbx
-  
+
     frame-setup PUSH64r killed $r14, implicit-def $rsp, implicit $rsp
     CFI_INSTRUCTION def_cfa_offset 16
     frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp
@@ -137,20 +137,20 @@ body:             |
     CALL64pcrel32 @getlong, csr_64, implicit $rsp, implicit $ssp, implicit-def $rax, debug-location !19
     CMP64ri8 killed renamable $rax, 0, implicit-def $eflags, debug-location !19
     JCC_1 %bb.1, 5, implicit $eflags, debug-location !21
-  
+
   bb.2.if.else:
     liveins: $rbx, $r14, $rax
-  
+
     renamable $rbx = ADD64ri32 killed renamable $rbx, 2, implicit-def $eflags, debug-location !25
     DBG_PHI $r14, 1
     DBG_PHI $rbx, 2
     DBG_PHI $rax, 3
     $rax = MOV64ri 0
     JMP_1 %bb.3
-  
+
   bb.1.if.then:
     liveins: $rbx, $r14, $rax
-  
+
     renamable $rbx = ADD64ri32 killed renamable $rbx, 1, implicit-def $eflags, debug-location !22
     DBG_PHI $r14, 1
     DBG_PHI $rbx, 2
@@ -158,33 +158,33 @@ body:             |
 
   bb.3.if.end:
     liveins: $rbx, $r14
-  
-    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13
-    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !13
-    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 0), debug-location !13
+
+    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13
+    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !13
+    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 0), debug-location !13
 
     ; Value number 1 is live-through the above control flow from the two
     ; DBG_PHIs:
-    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
-    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $r14
+    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $r14
     ;
     ; While value number 2 has different defs that merge on entry to bb.3.
     ; These are both in $rbx though, and we should find its location:
-    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
-    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $rbx
+    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
+    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $rbx
     ;
     ; Value number 3 cannot be resolved because $rax is clobbered in bb.2,
     ; meaning the merged value in bb.3 is incorrect. It should produce a
     ; DBG_VALUE $noreg.
-    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 0)
-    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $noreg
+    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 0)
+    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $noreg
 
     renamable $rbx = ADD64rr killed renamable $rbx, killed renamable $r14, implicit-def $eflags, debug-location !28
-    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !13
+    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !13
 
     ; After clobbering rbx, the variable location should not be available.
-    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
-    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $noreg
+    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
+    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $noreg
 
     $rdi = MOV64rr $rbx, debug-location !29
     CALL64pcrel32 @ext, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, debug-location !29
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-with-loops.mir b/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-with-loops.mir
index 8d7a74795ec2..5521203ea6d5 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-with-loops.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/dbg-phis-with-loops.mir
@@ -10,7 +10,7 @@
   source_filename = "test.c"
   target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
   target triple = "x86_64-unknown-linux-gnu"
-  
+
   define dso_local i32 @foo(i64 %bar, i64 %baz) !dbg !7 {
   entry:
     call void @llvm.dbg.value(metadata i64 %bar, metadata !12, metadata !DIExpression()), !dbg !13
@@ -24,17 +24,17 @@
     %call = call i64 @getlong(), !dbg !19
     %tobool = icmp ne i64 %call, 0, !dbg !19
     br i1 %tobool, label %if.then, label %if.else, !dbg !21
-  
+
   if.then:                                          ; preds = %entry
     %add2 = add nsw i64 %add, 1, !dbg !22
     call void @llvm.dbg.value(metadata i64 %add2, metadata !12, metadata !DIExpression()), !dbg !13
     br label %if.end, !dbg !24
-  
+
   if.else:                                          ; preds = %entry
     %add3 = add nsw i64 %add, 2, !dbg !25
     call void @llvm.dbg.value(metadata i64 %add3, metadata !12, metadata !DIExpression()), !dbg !13
     br label %if.end
-  
+
   if.end:                                           ; preds = %if.else, %if.then
     %bar.addr.0 = phi i64 [ %add2, %if.then ], [ %add3, %if.else ], !dbg !27
     call void @llvm.dbg.value(metadata i64 %bar.addr.0, metadata !12, metadata !DIExpression()), !dbg !13
@@ -44,23 +44,23 @@
     %conv = trunc i64 %add4 to i32, !dbg !30
     ret i32 %conv, !dbg !31
   }
-  
+
   ; Function Attrs: nounwind readnone speculatable willreturn
   declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
-  
+
   declare dso_local void @ext(i64)
-  
+
   declare dso_local i64 @getlong()
-  
+
   ; Function Attrs: nounwind readnone speculatable willreturn
   declare void @llvm.dbg.value(metadata, metadata, metadata) #0
-  
+
   attributes #0 = { nounwind readnone speculatable willreturn }
-  
+
   !llvm.dbg.cu = !{!0}
   !llvm.module.flags = !{!3, !4, !5}
   !llvm.ident = !{!6}
-  
+
   !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
   !1 = !DIFile(filename: "test.c", directory: ".")
   !2 = !{}
@@ -119,7 +119,7 @@ body:             |
   bb.0.entry:
     successors: %bb.2, %bb.1
     liveins: $rdi, $rsi, $r14, $rbx
-  
+
     frame-setup PUSH64r killed $r14, implicit-def $rsp, implicit $rsp
     CFI_INSTRUCTION def_cfa_offset 16
     frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp
@@ -138,20 +138,20 @@ body:             |
     CALL64pcrel32 @getlong, csr_64, implicit $rsp, implicit $ssp, implicit-def $rax, debug-location !19
     CMP64ri8 killed renamable $rax, 0, implicit-def $eflags, debug-location !19
     JCC_1 %bb.1, 5, implicit $eflags, debug-location !21
-  
+
   bb.2.if.else:
     liveins: $rbx, $r14, $rax
-  
+
     renamable $rbx = ADD64ri32 killed renamable $rbx, 2, implicit-def $eflags, debug-location !25
     DBG_PHI $r14, 1
     DBG_PHI $rbx, 2
     DBG_PHI $rax, 3
     $rax = MOV64ri 0
     JMP_1 %bb.3
-  
+
   bb.1.if.then:
     liveins: $rbx, $r14, $rax
-  
+
     renamable $rbx = ADD64ri32 killed renamable $rbx, 1, implicit-def $eflags, debug-location !22
     DBG_PHI $r14, 1
     DBG_PHI $rbx, 2
@@ -164,33 +164,33 @@ body:             |
 
   bb.4:
     liveins: $rbx, $r14
-  
-    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13
-    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !13
-    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 0), debug-location !13
+
+    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13
+    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !13
+    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 0), debug-location !13
 
     ; Value number 1 is live-through the above control flow from the two
     ; DBG_PHIs:
-    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
-    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $r14
+    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $r14
     ;
     ; While value number 2 has different defs that merge on entry to bb.3.
     ; These are both in $rbx though, and we should find its location:
-    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
-    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $rbx
+    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
+    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $rbx
     ;
     ; Value number 3 cannot be resolved because $rax is clobbered in bb.2,
     ; meaning the merged value in bb.3 is incorrect. It should produce a
     ; DBG_VALUE $noreg.
-    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 0)
-    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $noreg
+    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 0)
+    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $noreg
 
     renamable $rbx = ADD64rr killed renamable $rbx, killed renamable $r14, implicit-def $eflags, debug-location !28
-    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !13
+    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !13
 
     ; After clobbering rbx, the variable location should not be available.
-    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
-    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $noreg
+    ; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
+    ; CHECK-NEXT:  DBG_VALUE_LIST {{.+}} $noreg
 
     $rdi = MOV64rr $rbx, debug-location !29
     CALL64pcrel32 @ext, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, debug-location !29
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/follow-spill-of-live-value.mir b/llvm/test/DebugInfo/MIR/InstrRef/follow-spill-of-live-value.mir
index a3332785e50f..e625493f1a30 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/follow-spill-of-live-value.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/follow-spill-of-live-value.mir
@@ -20,22 +20,22 @@
 # CHECK: ![[VARNUM:[0-9]+]] = !DILocalVariable
 #
 # CHECK-LABEL: bb.8:
-# CHECK:       DBG_VALUE_LIST ![[VARNUM]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_LLVM_fragment, 64, 64), $rsp,
+# CHECK:       DBG_VALUE_LIST ![[VARNUM]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_LLVM_fragment, 64, 64), $rsp,
 # CHECK-LABEL:  bb.9:
-# CHECK:       DBG_VALUE_LIST ![[VARNUM]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_LLVM_fragment, 64, 64), $rsp,
+# CHECK:       DBG_VALUE_LIST ![[VARNUM]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_LLVM_fragment, 64, 64), $rsp,
 
 --- |
   ; ModuleID = 'missingvar.ll'
   source_filename = "/fast/fs/llvm34/lib/Analysis/LoopPass.cpp"
   target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
   target triple = "x86_64-unknown-linux-gnu"
-  
+
   %"class.std::deque" = type { %"class.std::_Deque_base" }
   %"class.std::_Deque_base" = type { %"struct.std::_Deque_base<llvm::Loop *, std::allocator<llvm::Loop *>>::_Deque_impl" }
   %"struct.std::_Deque_base<llvm::Loop *, std::allocator<llvm::Loop *>>::_Deque_impl" = type { ptr, i64, %"struct.std::_Deque_iterator", %"struct.std::_Deque_iterator" }
   %"class.llvm::Loop" = type opaque
   %"struct.std::_Deque_iterator" = type { ptr, ptr, ptr, ptr }
-  
+
   define linkonce_odr void @_ZNSt5dequeIPN4llvm4LoopESaIS2_EE13_M_insert_auxESt15_Deque_iteratorIS2_RS2_PS2_EmRKS2_(ptr %this, ptr %__pos, i64 %__n) local_unnamed_addr align 2 !dbg !3 {
   entry:
     %0 = load ptr, ptr undef, align 8, !dbg !7
@@ -43,7 +43,7 @@
     %1 = load ptr, ptr %_M_cur6.i, align 8, !dbg !7
     %2 = load ptr, ptr undef, align 8, !dbg !7
     br i1 undef, label %if.then.i851, label %if.end.i856, !dbg !7
-  
+
   if.then.i851:                                     ; preds = %entry
     %.pre1038 = load ptr, ptr undef, align 8, !dbg !7
     %3 = bitcast ptr %this to ptr, !dbg !7
@@ -51,7 +51,7 @@
     %4 = bitcast ptr %sunkaddr to ptr, !dbg !7
     %.pre1039 = load ptr, ptr %4, align 8, !dbg !7
     br label %if.end.i856, !dbg !7
-  
+
   if.end.i856:                                      ; preds = %if.then.i851, %entry
     %5 = phi ptr [ %.pre1039, %if.then.i851 ], [ undef, %entry ], !dbg !7
     %6 = phi ptr [ %.pre1038, %if.then.i851 ], [ %0, %entry ], !dbg !7
@@ -72,16 +72,16 @@
     %14 = bitcast ptr %sunkaddr3 to ptr, !dbg !7
     %15 = load ptr, ptr %14, align 8, !dbg !7
     br i1 undef, label %if.then.i.i775, label %cond.true.i.i777, !dbg !7
-  
+
   if.then.i.i775:                                   ; preds = %if.end.i856
     %add.ptr.i.i774 = getelementptr inbounds ptr, ptr %11, i64 %__n, !dbg !7
     br label %_ZNKSt15_Deque_iteratorIPN4llvm4LoopERS2_PS2_EplEl.exit796, !dbg !7
-  
+
   cond.true.i.i777:                                 ; preds = %if.end.i856
     %16 = load ptr, ptr undef, align 8, !dbg !7
     %.pre1043 = ptrtoint ptr %16 to i64, !dbg !7
     br label %_ZNKSt15_Deque_iteratorIPN4llvm4LoopERS2_PS2_EplEl.exit796
-  
+
   _ZNKSt15_Deque_iteratorIPN4llvm4LoopERS2_PS2_EplEl.exit796: ; preds = %cond.true.i.i777, %if.then.i.i775
     %sub.ptr.rhs.cast3.i.i.i.i.i.i.i.i.i690.pre-phi = phi i64 [ undef, %if.then.i.i775 ], [ %.pre1043, %cond.true.i.i777 ], !dbg !7
     %__tmp.sroa.13.0.i788 = phi ptr [ %15, %if.then.i.i775 ], [ undef, %cond.true.i.i777 ], !dbg !7
@@ -103,12 +103,12 @@
     %add11.i.i.i.i.i.i.i.i.i699 = add i64 %add.i.i.i.i.i.i.i.i.i698, %sub.ptr.div5.i.i.i.i.i.i.i.i.i692, !dbg !7
     %cmp27.i.i.i.i.i.i.i.i700 = icmp sgt i64 %add11.i.i.i.i.i.i.i.i.i699, 0, !dbg !7
     br i1 %cmp27.i.i.i.i.i.i.i.i700, label %for.body.i.i.i.i.i.i.i.i711.preheader, label %_ZSt22__uninitialized_move_aISt15_Deque_iteratorIPN4llvm4LoopERS3_PS3_ES6_SaIS3_EET0_T_S9_S8_RT1_.exit737, !dbg !7
-  
+
   for.body.i.i.i.i.i.i.i.i711.preheader:            ; preds = %_ZNKSt15_Deque_iteratorIPN4llvm4LoopERS2_PS2_EplEl.exit796
     %18 = load ptr, ptr %11, align 8, !dbg !7
     store ptr %18, ptr %add.ptr.i.i.i.i859, align 8, !dbg !7
     ret void
-  
+
   _ZSt22__uninitialized_move_aISt15_Deque_iteratorIPN4llvm4LoopERS3_PS3_ES6_SaIS3_EET0_T_S9_S8_RT1_.exit737: ; preds = %_ZNKSt15_Deque_iteratorIPN4llvm4LoopERS2_PS2_EplEl.exit796
     %19 = ptrtoint ptr %storemerge.i.i791 to i64, !dbg !7
     %20 = ptrtoint ptr %__tmp.sroa.13.0.i788 to i64, !dbg !7
@@ -139,30 +139,30 @@
     %add11.i.i.i611 = add i64 %add.i.i.i610, %sub.i.i.i600, !dbg !7
     %cmp68.i.i = icmp sgt i64 %add11.i.i.i611, 0, !dbg !7
     br i1 %cmp68.i.i, label %while.body.i.i625, label %_ZSt4moveIPN4llvm4LoopEESt15_Deque_iteratorIT_RS4_PS4_ES7_S7_S7_.exit, !dbg !7
-  
+
   while.body.i.i625:                                ; preds = %_ZSt22__uninitialized_move_aISt15_Deque_iteratorIPN4llvm4LoopERS3_PS3_ES6_SaIS3_EET0_T_S9_S8_RT1_.exit737
     ret void
-  
+
   _ZSt4moveIPN4llvm4LoopEESt15_Deque_iteratorIT_RS4_PS4_ES7_S7_S7_.exit: ; preds = %_ZSt22__uninitialized_move_aISt15_Deque_iteratorIPN4llvm4LoopERS3_PS3_ES6_SaIS3_EET0_T_S9_S8_RT1_.exit737
     %add.i.i.i562 = sub i64 %sub.ptr.div5.i.i.i604, %__n, !dbg !7
     %cmp.i.i.i563 = icmp sgt i64 %add.i.i.i562, -1, !dbg !7
     br i1 %cmp.i.i.i563, label %land.lhs.true.i.i.i565, label %cond.false.i.i.i572, !dbg !7
-  
+
   land.lhs.true.i.i.i565:                           ; preds = %_ZSt4moveIPN4llvm4LoopEESt15_Deque_iteratorIT_RS4_PS4_ES7_S7_S7_.exit
     ret void
-  
+
   cond.false.i.i.i572:                              ; preds = %_ZSt4moveIPN4llvm4LoopEESt15_Deque_iteratorIT_RS4_PS4_ES7_S7_S7_.exit
     ret void
   }
-  
+
   ; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
   declare void @llvm.dbg.value(metadata, metadata, metadata) #0
-  
+
   attributes #0 = { nofree nosync nounwind readnone speculatable willreturn }
-  
+
   !llvm.module.flags = !{!0}
   !llvm.dbg.cu = !{!1}
-  
+
   !0 = !{i32 2, !"Debug Info Version", i32 3}
   !1 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "beards", isOptimized: true, runtimeVersion: 4, emissionKind: FullDebug)
   !2 = !DIFile(filename: "bees.cpp", directory: "")
@@ -203,7 +203,7 @@ body:             |
   bb.0.entry:
     successors: %bb.2, %bb.1
     liveins: $rdi, $rdx, $rsi, $rbp, $r15, $r14, $r13, $r12, $rbx
-  
+
     frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp, debug-location !7
     CFI_INSTRUCTION def_cfa_offset 16
     frame-setup PUSH64r killed $r15, implicit-def $rsp, implicit $rsp, debug-location !7
@@ -227,23 +227,23 @@ body:             |
     TEST8rr renamable $al, renamable $al, implicit-def $eflags, implicit killed $eax, debug-location !7
     MOV64mr $rsp, 1, $noreg, -8, $noreg, renamable $r10 :: (store 8 into %stack.0)
     JCC_1 %bb.1, 5, implicit $eflags, debug-location !7
-  
+
   bb.2.if.then.i851:
     liveins: $rdi, $rdx, $rsi
-  
+
     renamable $r10 = MOV64rm undef renamable $rax, 1, $noreg, 0, $noreg, debug-location !7 :: (load 8 from `ptr undef`)
     renamable $r9 = MOV64rm renamable $rdi, 1, $noreg, 40, $noreg, debug-location !7 :: (load 8 from %ir.4)
     JMP_1 %bb.3
-  
+
   bb.1:
     liveins: $rdi, $rdx, $rsi, $r10
-  
+
     renamable $r9 = IMPLICIT_DEF debug-location !7
-  
+
   bb.3.if.end.i856:
     successors: %bb.4, %bb.5
     liveins: $rdi, $rdx, $rsi, $r9, $r10
-  
+
     renamable $rax = MOV64rm renamable $rdi, 1, $noreg, 16, $noreg, debug-location !7 :: (load 8 from %ir._M_cur6.i)
     renamable $r15 = LEA64r $noreg, 8, renamable $rdx, 0, $noreg, debug-location !7
     MOV64mr undef renamable $rax, 1, $noreg, 0, $noreg, renamable $r10, debug-location !7 :: (store 8 into `ptr undef`)
@@ -254,24 +254,24 @@ body:             |
     renamable $ebp = XOR32rr undef $ebp, undef $ebp, implicit-def dead $eflags
     TEST8rr renamable $bpl, renamable $bpl, implicit-def $eflags, implicit killed $ebp, debug-location !7
     JCC_1 %bb.5, 5, implicit killed $eflags, debug-location !7
-  
+
   bb.4.if.then.i.i775:
     liveins: $rax, $rdi, $rdx, $rsi, $r8, $r9, $r10, $r11, $r13, $r15
-  
+
     renamable $r14 = LEA64r renamable $r13, 8, renamable $rdx, 0, $noreg, debug-location !7
     renamable $r12 = IMPLICIT_DEF debug-location !7
     JMP_1 %bb.6
-  
+
   bb.5.cond.true.i.i777:
     liveins: $rax, $rdi, $rdx, $rsi, $r8, $r9, $r10, $r11, $r13, $r15
-  
+
     renamable $r12 = MOV64rm undef renamable $rax, 1, $noreg, 0, $noreg, debug-location !7 :: (load 8 from `ptr undef`)
     renamable $r14 = IMPLICIT_DEF debug-location !7
-  
+
   bb.6._ZNKSt15_Deque_iteratorIPN4llvm4LoopERS2_PS2_EplEl.exit796:
     successors: %bb.7(0x50000000), %bb.8(0x30000000)
     liveins: $rax, $rdi, $rdx, $rsi, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15
-  
+
     renamable $rax = SUB64rr killed renamable $rax, killed renamable $r15, implicit-def dead $eflags, debug-location !7
     $rbp = MOV64rr $r11, debug-location !7
     renamable $rbp = SUB64rr killed renamable $rbp, renamable $r11, implicit-def dead $eflags, debug-location !7
@@ -285,37 +285,37 @@ body:             |
     renamable $rcx = LEA64r killed renamable $rbx, 1, killed renamable $rcx, -64, $noreg, debug-location !7
     TEST64rr killed renamable $rcx, renamable $rcx, implicit-def $eflags, debug-location !7
     JCC_1 %bb.8, 14, implicit killed $eflags, debug-location !7
-  
+
   bb.7.for.body.i.i.i.i.i.i.i.i711.preheader:
     liveins: $rax, $r13
-  
+
     renamable $rcx = MOV64rm killed renamable $r13, 1, $noreg, 0, $noreg, debug-location !7 :: (load 8 from %ir.11)
     MOV64mr killed renamable $rax, 1, $noreg, 0, $noreg, killed renamable $rcx, debug-location !7 :: (store 8 into %ir.add.ptr.i.i.i.i859)
     JMP_1 %bb.10
-  
+
   bb.8:
     successors: %bb.10(0x50000000), %bb.9(0x30000000)
     liveins: $rax, $rdi, $rdx, $rsi, $r8, $r9, $r10, $r11, $r14
-  
+
     MOV64mr renamable $rdi, 1, $noreg, 16, $noreg, killed renamable $rax, debug-location !7 :: (store 8 into %ir.22)
     renamable $rax = MOV64rm $rsp, 1, $noreg, -8, $noreg :: (load 8 from %stack.0)
     MOV64mr undef renamable $rax, 1, $noreg, 0, $noreg, killed renamable $rax, debug-location !7 :: (store 8 into `ptr undef`)
     MOV64mr undef renamable $rax, 1, $noreg, 0, $noreg, killed renamable $r10, debug-location !7 :: (store 8 into `ptr undef`)
     MOV64mr killed renamable $rdi, 1, $noreg, 40, $noreg, killed renamable $r9, debug-location !7 :: (store 8 into %ir.24)
     renamable $rax = MOV64rm killed renamable $rsi, 1, $noreg, 24, $noreg, debug-location !7 :: (load 8 from %ir.26)
-    DBG_INSTR_REF !8, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), dbg-instr-ref(1, 0), debug-location !7
+    DBG_INSTR_REF !8, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), dbg-instr-ref(1, 0), debug-location !7
     renamable $rax = SUB64rr killed renamable $rax, killed renamable $r11, implicit-def dead $eflags, debug-location !7
     renamable $r8 = SUB64rr killed renamable $r8, killed renamable $r14, implicit-def dead $eflags, debug-location !7
     renamable $r8 = exact SAR64ri killed renamable $r8, 3, implicit-def dead $eflags, debug-location !7
     renamable $rax = LEA64r killed renamable $r8, 8, killed renamable $rax, -64, $noreg, debug-location !7
     TEST64rr killed renamable $rax, renamable $rax, implicit-def $eflags, debug-location !7
     JCC_1 %bb.10, 15, implicit $eflags, debug-location !7
-  
+
   bb.9:
     liveins: $rdx
-  
+
     dead renamable $rdx = NEG64r killed renamable $rdx, implicit-def $eflags, debug-location !7
-  
+
   bb.10.while.body.i.i625:
     $rbx = frame-destroy POP64r implicit-def $rsp, implicit $rsp
     CFI_INSTRUCTION def_cfa_offset 48
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_illegal_locs.mir b/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_illegal_locs.mir
index 1a48ab5ed230..d4ed0fba2d7c 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_illegal_locs.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_illegal_locs.mir
@@ -46,37 +46,37 @@ body:  |
     ; CHECK-LABE: bb.0.entry:
 
     $rax = MOV64ri 1, debug-instr-number 1, debug-location !17
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !17
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !17
     ;; First check that picking out location works as usual.
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $rax
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $rax
 
     $rax = MOV64ri 1, debug-instr-number 2, debug-location !17
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 999), debug-location !17
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 999), debug-location !17
     ;; Test out of bounds operand number.
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 999)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 999)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
 
     $rax = MOV64ri 1, debug-instr-number 3, debug-location !17
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 1), debug-location !17
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 1), debug-location !17
     ;; Test non-register operand
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 1)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 1)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
 
     ;; FIXME: We should test what happens when this meta-instruction is seen
     ;; by livedbugvalues with an instruction number. However, right now it's
     ;; impossible to turn the machine-code verifier off when loading MIR?
     ;KILL implicit killed $eflags, debug-instr-number 4, debug-location !17
-    ;DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(4, 0), debug-location !17
+    ;DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(4, 0), debug-location !17
     ;;; Test non-def operand
-    ;; check:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(4, 0)
-    ;; check-next: DBG_VALUE_LIST {{.+}}, $noreg
+    ;; check:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(4, 0)
+    ;; check-next: DBG_VALUE_LIST {{.+}}, $noreg
 
     $noreg = MOV32ri 1, debug-instr-number 5, debug-location !17
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(5, 0), debug-location !17
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(5, 0), debug-location !17
     ;; Def of $noreg?
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(5, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(5, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
 
     JCC_1 %bb.1, 1, implicit $eflags
     JMP_1 %bb.2
@@ -86,10 +86,10 @@ body:  |
     ; CHECK-LABEL: bb.1:
 
     DBG_PHI $rax, 6
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(6, 1), debug-location !17
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(6, 1), debug-location !17
     ;; Test out-of-bounds reference to a DBG_PHI.
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(6, 1)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(6, 1)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
 
     DBG_PHI $noreg, 7
     JMP_1 %bb.3
@@ -98,22 +98,22 @@ body:  |
     successors: %bb.3
     ; CHECK-LABEL: bb.2:
     DBG_PHI 1, 6
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(6, 0), debug-location !17
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(6, 0), debug-location !17
     ;; Test non-reg operand to DBG_PHI. It's not clear if this can ever happen
     ;; as the result of an optimisation, but lets test for it anyway.
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(6, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(6, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
 
     DBG_PHI 1, 7
     JMP_1 %bb.3
 
   bb.3:
     ; CHECK-LABEL: bb.3:
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(7, 0), debug-location !17
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(7, 0), debug-location !17
     ;; PHI resolution of illegal inputs shouldn't crash either. It should also
     ;; come out as a $noreg location.
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(7, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(7, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
 
     RET 0, debug-location !17
 
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_instrref_tolocs.mir b/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_instrref_tolocs.mir
index 12ff645467e8..e8391538d3fe 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_instrref_tolocs.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_instrref_tolocs.mir
@@ -38,17 +38,17 @@ body:  |
   bb.0.entry:
     $rax = MOV64ri 1, debug-instr-number 1, debug-location !17
     ; This debug instruction should identify the value as being in $rax.
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !17
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $rax
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !17
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $rax
 
     $rbx = COPY killed $rax, debug-location !17
     $rax = MOV64ri 1, debug-location !17
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $rbx
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $rbx
 
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !17
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !17
     ; No instruction is labelled with the number "2". This should produce an
     ; empty variable location.
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $noreg
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $noreg
 
     $rbx = MOV64ri 1, debug-instr-number 3, debug-location !17
     JMP_1 %bb.1
@@ -57,40 +57,40 @@ body:  |
     ; CHECK-LABEL: bb.1:
   bb.1:
 
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 0), debug-location !17
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 0), debug-location !17
     ; This refers to a value def'd in a parent block -- but it should be
     ; tracked into this block.
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $rbx
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $rbx
     JMP_1 %bb.2
 
     ; CHECK-LABEL: bb.2:
   bb.2:
     ; Just like any other variable location, live-ins should be created for
     ; any successor blocks.
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $rbx
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $rbx
 
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(5, 0), debug-location !17
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(5, 0), debug-location !17
     ; This is a debug use-before-def: the value appears a few instructions
     ; later. Any earlier value should be terminated here, _and_ we should
     ; emit a DBG_VALUE when the value becomes available.
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $noreg
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $noreg
 
     $rax = MOV64ri 1, debug-location !17
     $rax = MOV64ri 1, debug-location !17
     $rcx = MOV64ri 1, debug-instr-number 5, debug-location !17
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $rcx
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $rcx
     $rax = MOV64ri 1, debug-location !17
 
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(6, 0), debug-location !17
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(6, 0), debug-location !17
     ; Another debug use-before-def, but across block boundaries.
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $noreg
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $noreg
     JMP_1 %bb.3
 
     ; CHECK-LABEL: bb.3:
   bb.3:
     $rax = MOV64ri 1, debug-location !17
     $rdx = MOV64ri 1, debug-instr-number 6, debug-location !17
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $rdx
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $rdx
 
     ; Terminate variable location for next few blocks,
     DBG_VALUE $noreg, $noreg, !16, !DIExpression(), debug-location !17
@@ -105,8 +105,8 @@ body:  |
     $rdx = MOV64ri 1, implicit-def $eflags, debug-location !17
     JCC_1 %bb.6, 4, implicit $eflags, debug-location !17
   bb.5:
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(7, 0), debug-location !17
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $noreg
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(7, 0), debug-location !17
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $noreg
     JMP_1 %bb.6, debug-location !17
   bb.6:
     $rsi = MOV64ri 1, debug-instr-number 7, debug-location !17
@@ -115,8 +115,8 @@ body:  |
   ; A use-before-def shouldn't pass another definition of the variable location
   ; or value.
   bb.7:
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(8, 0), debug-location !17
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $noreg
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(8, 0), debug-location !17
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $noreg
     DBG_VALUE $rax, $noreg, !16, !DIExpression(), debug-location !17
     ; CHECK: DBG_VALUE $rax, $noreg,
     $rdi = MOV64ri 1, debug-instr-number 8, debug-location !17
@@ -124,33 +124,33 @@ body:  |
   ; Loops: use-before-defs should be live-through loops, assuming that nothing
   ; in that loop modifies the variable location.
   bb.8:
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(9, 0), debug-location !17
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $noreg
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(9, 0), debug-location !17
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $noreg
     JCC_1 %bb.8, 4, implicit $eflags
   bb.9:
     $rax = MOV64ri 11, debug-instr-number 9, debug-location !17
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $rax
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $rax
 
   ; Likewise, use-before-defs where anything changes the variable location
   ; or value in the loop, should be discarded.
   bb.10:
     ; live-in,
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $rax
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(10, 0), debug-location !17
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $noreg
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $rax
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(10, 0), debug-location !17
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $noreg
 
   bb.11:
     $rbx = MOV64ri 1, debug-location !17
 
   bb.12:
-    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(9, 0), debug-location !17
+    DBG_INSTR_REF !16, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(9, 0), debug-location !17
     ; This still has a value in $rax,
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $rax
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $rax
     JCC_1 %bb.11, 4, implicit $eflags
 
   bb.13:
     ; Live in,
-    ; CHECK: DBG_VALUE_LIST {{.+}}, $rax
+    ; CHECK: DBG_VALUE_LIST {{.+}}, $rax
     $rbx = MOV64ri 11, debug-instr-number 10, debug-location !17
     ; This is instruction 10 referred to in bb.10. However, as the variable
     ; location/value has been modified in the meantime, no DBG_VALUE should be
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_stackslot_subregs.mir b/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_stackslot_subregs.mir
index d02a4062d9b7..ff5d3c5e1cba 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_stackslot_subregs.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_stackslot_subregs.mir
@@ -50,8 +50,8 @@ body:  |
     $rax = MOV64ri 0
     $rdi = MOV64ri 0
 
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
     ; CHECK:      DBG_INSTR_REF
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $esi
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $esi
     RET64 $rsi, debug-location !12
 ...
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_subreg_substitutions.mir b/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_subreg_substitutions.mir
index 7c2111977fe7..779c06fe378d 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_subreg_substitutions.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/livedebugvalues_subreg_substitutions.mir
@@ -72,39 +72,39 @@ body:  |
   liveins: $rdi, $rax
     CALL64pcrel32 @ext, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit-def $rax, debug-instr-number 4, debug-location !12
     ; CHECK:      CALL64pcrel32
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
-    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $al
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(5, 0), debug-location !12
-    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(5, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $ah
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(8, 0), debug-location !12
-    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(8, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $ah
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(13, 0), debug-location !12
-    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(13, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
+    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $al
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(5, 0), debug-location !12
+    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(5, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $ah
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(8, 0), debug-location !12
+    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(8, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $ah
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(13, 0), debug-location !12
+    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(13, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
     MOV64mr $rsp, 1, $noreg, 16, $noreg, $rax :: (store 8 into %stack.0)
     $rax = MOV64ri 0, debug-location !12
     ; CHECK:      $rax = MOV64ri 0
     ; The value is now located in a spill slot, as a subregister within the
     ; slot, which InstrRefBasedLDV should be able to find.
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
-    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]*}}, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_constu, 8, DW_OP_minus, DW_OP_deref), $rsp
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(5, 0), debug-location !12
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
+    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]*}}, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_constu, 8, DW_OP_minus, DW_OP_deref), $rsp
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(5, 0), debug-location !12
     ; This and the next DBG_INSTR_REF refer to a value that is on the stack, but
     ; is located at a non-zero offset from the start of the slot -- $ah within
     ; $rax is 8 bits in. Today, InstrRefBasedLDV can't express this. It also
     ; doesn't seem likely to be profitable.
-    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(5, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(8, 0), debug-location !12
-    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(8, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(13, 0), debug-location !12
-    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(13, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
+    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(5, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(8, 0), debug-location !12
+    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(8, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(13, 0), debug-location !12
+    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(13, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
     $rax = MOV64rm $rsp, 1, $noreg, 8, $noreg :: (load 8 from %stack.0)
     RET64 $rax, debug-location !12
 ...
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding-tieddef.mir b/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding-tieddef.mir
index 5ebd1a89ae92..c0dd82545358 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding-tieddef.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding-tieddef.mir
@@ -23,10 +23,10 @@
   ; ModuleID = 'reduced.ll'
   source_filename = "reduced.ll"
   target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-  
+
   %"class.llvm::APInt" = type { i32, %union.anon }
   %union.anon = type { i64 }
-  
+
   define void @_ZNK4llvm5APInt5magicEv() local_unnamed_addr align 2 !dbg !7 {
     ret void
   }
@@ -34,7 +34,7 @@
   !llvm.dbg.cu = !{!0}
   !llvm.module.flags = !{!3, !4, !5}
   !llvm.ident = !{!6}
-  
+
   !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
   !1 = !DIFile(filename: "test.c", directory: ".")
   !2 = !{}
@@ -124,7 +124,7 @@ body:             |
     %36:gr64 = IMPLICIT_DEF
     %37:gr64 = IMPLICIT_DEF
     %21:gr32 = IMPLICIT_DEF
-  
+
   bb.1:
     %0:gr64 = PHI %14, %bb.0, %12, %bb.5
     %1:gr32 = PHI %15, %bb.0, %11, %bb.5
@@ -140,12 +140,12 @@ body:             |
     TEST8rr %20, %20, implicit-def $eflags, debug-location !13
     JCC_1 %bb.3, 5, implicit $eflags, debug-location !13
     JMP_1 %bb.2, debug-location !13
-  
+
   bb.2:
-  
+
   bb.3:
     successors: %bb.4, %bb.5
-  
+
     %7:gr32 = PHI %5, %bb.1, %21, %bb.2, debug-location !13
     MOV32mr %22, 1, $noreg, 0, $noreg, %7, debug-location !13 :: (store (s32) into `ptr undef`, align 8)
     %8:gr64 = MOV64rm %23, 1, $noreg, 0, $noreg, debug-location !13 :: (load (s64) from `ptr undef`)
@@ -155,11 +155,11 @@ body:             |
     TEST8rr %28, %28, implicit-def $eflags, debug-location !13
     JCC_1 %bb.5, 5, implicit $eflags, debug-location !13
     JMP_1 %bb.4, debug-location !13
-  
+
   bb.4:
     %29:gr64 = ADD64rr %2, %2, implicit-def dead $eflags, debug-location !13
     MOV64mr %30, 1, $noreg, 0, $noreg, killed %29, debug-location !13 :: (store (s64) into `ptr undef`)
-  
+
   bb.5:
     %9:gr32 = MOV32rm %26, 1, $noreg, 0, $noreg, debug-location !13 :: (load (s32) from `ptr undef`, align 8)
     %10:gr64 = MOV64rm %31, 1, $noreg, 0, $noreg, debug-location !13 :: (load (s64) from `ptr undef`)
@@ -173,6 +173,6 @@ body:             |
     CALL64r %37, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit-def $rsp, implicit-def $ssp, implicit-def $al, debug-location !13
     ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp, debug-location !13
     %13:gr32 = INC32r %6, implicit-def dead $eflags, debug-instr-number 1, debug-location !13
-    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13
+    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13
     JMP_1 %bb.1, debug-location !13
 ...
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-load-folding.mir b/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-load-folding.mir
index b0bff30c4c82..d112a0b9e614 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-load-folding.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-load-folding.mir
@@ -16,35 +16,35 @@
   source_filename = "reduced.ll"
   target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
   target triple = "x86_64-unknown-linux-gnu"
-  
+
   define i32 @foo(i32 %i, float %f) local_unnamed_addr {
   if.then:
     %call = tail call i32 (i32, ...) undef(i32 %i)
     %cond = icmp eq i32 %call, 1
     br i1 %cond, label %sw.bb, label %sw.epilog
-  
+
   sw.bb:                                            ; preds = %if.then
     %conv = fptosi float %f to i8
     call void @llvm.dbg.value(metadata i8 %conv, metadata !6, metadata !DIExpression()), !dbg !17
     %tobool.not = icmp eq i8 %conv, 0
     br i1 %tobool.not, label %if.end, label %sw.epilog
-  
+
   if.end:                                           ; preds = %sw.bb
     tail call void (...) undef()
     br label %sw.epilog
-  
+
   sw.epilog:                                        ; preds = %if.then, %if.end, %sw.bb
     ret i32 undef
   }
-  
+
   ; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
   declare void @llvm.dbg.value(metadata, metadata, metadata) #0
-  
+
   attributes #0 = { nofree nosync nounwind readnone speculatable willreturn }
-  
+
   !llvm.dbg.cu = !{!0}
   !llvm.module.flags = !{!2, !3, !4, !5}
-  
+
   !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
   !1 = !DIFile(filename: "/tmp/test.c", directory: ".")
   !2 = !{i32 7, !"Dwarf Version", i32 4}
@@ -93,7 +93,7 @@ body:             |
   bb.0.if.then:
     successors: %bb.1(0x40000000), %bb.3(0x40000000)
     liveins: $edi, $xmm0
-  
+
     %1:fr32 = COPY killed $xmm0
     %0:gr32 = COPY killed $edi
     ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
@@ -107,27 +107,27 @@ body:             |
     CMP32ri8 killed %5, 1, implicit-def $eflags
     JCC_1 %bb.3, 5, implicit killed $eflags
     JMP_1 %bb.1
-  
+
   bb.1.sw.bb:
     successors: %bb.2(0x30000000), %bb.3(0x50000000)
-  
+
     %7:gr32 = nofpexcept CVTTSS2SIrr killed %1, implicit $mxcsr, debug-instr-number 1
     %8:gr8 = COPY killed %7.sub_8bit
-    DBG_INSTR_REF !6, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !17
+    DBG_INSTR_REF !6, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !17
     TEST8rr killed %8, %8, implicit-def $eflags
     JCC_1 %bb.3, 5, implicit killed $eflags
     JMP_1 %bb.2
-  
+
   bb.2.if.end:
     successors: %bb.3(0x80000000)
-  
+
     ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
     %9:gr32 = MOV32r0 implicit-def dead $eflags
     %10:gr8 = COPY killed %9.sub_8bit
     $al = COPY killed %10
     CALL64r undef %11:gr64, csr_64, implicit $rsp, implicit $ssp, implicit killed $al, implicit-def $rsp, implicit-def $ssp
     ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-  
+
   bb.3.sw.epilog:
     RET 0, undef $eax
 
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-tracking.mir b/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-tracking.mir
index c05161eda807..7d83ccb52cec 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-tracking.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/memory-operand-tracking.mir
@@ -53,33 +53,33 @@ body:  |
     MOV64mr $rsp, 1, $noreg, 16, $noreg, $rdi :: (store 8 into %stack.0)
     $rax = MOV64ri 0, debug-location !12
     INC32m $rsp, 1, $noreg, 4, $noreg, implicit-def dead $eflags, debug-instr-number 3, debug-location !DILocation(line: 0, scope: !7) :: (store (s32) into %stack.0)
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !12
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $rsp
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !12
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $rsp
     ;; Test that the old value (from the DBG_PHI) is not tracked anywhere. It
     ;; should not be considered as being on the stack any more.
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
     INC32m $rsp, 1, $noreg, 4, $noreg, implicit-def dead $eflags, debug-location !12 :: (store (s32) into %stack.0)
     ;; The above INC32m should be detected as clobbering the stack location,
     ;; even though it isn't debug labelled.
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !12
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !12
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
 
     ;; Store another debug-labelled value to the stack,
     INC32m $rsp, 1, $noreg, 4, $noreg, implicit-def dead $eflags, debug-instr-number 5, debug-location !DILocation(line: 0, scope: !7) :: (store (s32) into %stack.0)
     ;; Point the variable at that value.
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(4, 0), debug-location !12
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(4, 0),
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $rsp
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(4, 0), debug-location !12
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(4, 0),
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $rsp
     ;; Overwrite the stack: LiveDebugValues should explicitly undef the stack
-    ;; location with DBG_VALUE_LIST $noreg, as DbgEntityHistoryCalculator doesn't
+    ;; location with DBG_VALUE_LIST $noreg, as DbgEntityHistoryCalculator doesn't
     ;; look at the stack.
     INC32m $rsp, 1, $noreg, 4, $noreg, implicit-def dead $eflags, debug-location !DILocation(line: 0, scope: !7) :: (store (s32) into %stack.0)
     ; CHECK:      INC32m $rsp
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
 
     $rax = MOV64rm $rsp, 1, $noreg, 8, $noreg :: (load 8 from %stack.0)
     RET64 $rax, debug-location !12
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/out-of-scope-blocks.mir b/llvm/test/DebugInfo/MIR/InstrRef/out-of-scope-blocks.mir
index e6bb87b9e2a4..c25c467c10f2 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/out-of-scope-blocks.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/out-of-scope-blocks.mir
@@ -11,32 +11,32 @@
 # CHECK:       ![[FIRSTVAR:[0-9]+]] = !DILocalVariable(name: "_First",
 #
 # CHECK-LABEL: bb.0.entry:
-# CHECK:       DBG_VALUE_LIST ![[FIRSTVAR]],
-# CHECK-SAME:        !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), $rbx
+# CHECK:       DBG_VALUE_LIST ![[FIRSTVAR]],
+# CHECK-SAME:        !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), $rbx
 #
 # CHECK-LABEL: bb.1.if.then.i.i.i.i.i:
-# CHECK:       DBG_VALUE_LIST ![[FIRSTVAR]],
-# CHECK-SAME:        !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), $rbx
-# CHECK:       DBG_INSTR_REF ![[FIRSTVAR]],
-# CHECK:       DBG_VALUE_LIST ![[FIRSTVAR]],
-# CHECK-SAME:        !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), $rbx
+# CHECK:       DBG_VALUE_LIST ![[FIRSTVAR]],
+# CHECK-SAME:        !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), $rbx
+# CHECK:       DBG_INSTR_REF ![[FIRSTVAR]],
+# CHECK:       DBG_VALUE_LIST ![[FIRSTVAR]],
+# CHECK-SAME:        !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), $rbx
 
 # CHECK-LABEL: bb.2._Z17do_insert_cv_testI5_TreeEvv.exit:
-# CHECK:       DBG_VALUE_LIST ![[FIRSTVAR]],
-# CHECK-SAME:        !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), $rbx
-# CHECK:       DBG_INSTR_REF ![[FIRSTVAR]],
-# CHECK:       DBG_VALUE_LIST ![[FIRSTVAR]],
-# CHECK-SAME:        !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), $rbx
+# CHECK:       DBG_VALUE_LIST ![[FIRSTVAR]],
+# CHECK-SAME:        !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), $rbx
+# CHECK:       DBG_INSTR_REF ![[FIRSTVAR]],
+# CHECK:       DBG_VALUE_LIST ![[FIRSTVAR]],
+# CHECK-SAME:        !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), $rbx
 
 --- |
   source_filename = "reduced.ll"
   target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-  
+
   %class._Tree = type { i8 }
   %class._Tree_const_iterator = type { %class._Tree_unchecked_const_iterator }
   %class._Tree_unchecked_const_iterator = type { %struct._Iterator_base0, ptr }
   %struct._Iterator_base0 = type { i32 }
-  
+
   define i32 @main({ i32, ptr } %call.i) !dbg !6 {
   entry:
     call void @llvm.dbg.value(metadata i32 2, metadata !10, metadata !DIExpression()), !dbg !12
@@ -45,24 +45,24 @@
     call void @llvm.dbg.value(metadata ptr %0, metadata !13, metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg !15
     %call.i.i.i.i.i = call i8 undef(ptr null), !dbg !15
     br i1 undef, label %_Z17do_insert_cv_testI5_TreeEvv.exit, label %if.then.i.i.i.i.i
-  
+
   if.then.i.i.i.i.i:
     %call3.i.i.i.i.i = call ptr undef(ptr null)
     call void @llvm.dbg.value(metadata ptr %call3.i.i.i.i.i, metadata !13, metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg !15
     br label %_Z17do_insert_cv_testI5_TreeEvv.exit
-  
+
   _Z17do_insert_cv_testI5_TreeEvv.exit:
     %_First.sroa.2.0.i.i = phi ptr [ %0, %entry ], [ %call3.i.i.i.i.i, %if.then.i.i.i.i.i ]
     call void @llvm.dbg.value(metadata ptr %_First.sroa.2.0.i.i, metadata !13, metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg !15
     call void undef(ptr null, i32 0, ptr %_First.sroa.2.0.i.i), !dbg !16
     ret i32 0
   }
-  
+
   declare void @llvm.dbg.value(metadata, metadata, metadata)
-  
+
   !llvm.dbg.cu = !{!0}
   !llvm.module.flags = !{!2, !3, !4, !5}
-  
+
   !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 14.0.0 (git@github.com:llvm/llvm-project ffb249520766d4e2ca120c09dae7afa3d64ef81d)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
   !1 = !DIFile(filename: "toolchain10279.cpp", directory: "/home/jmorse")
   !2 = !{i32 7, !"Dwarf Version", i32 4}
@@ -104,7 +104,7 @@ machineFunctionInfo: {}
 body:             |
   bb.0.entry:
     liveins: $rsi, $r14, $rbx
-  
+
     frame-setup PUSH64r killed $r14, implicit-def $rsp, implicit $rsp
     CFI_INSTRUCTION def_cfa_offset 16
     frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp
@@ -119,23 +119,23 @@ body:             |
     renamable $r14d = XOR32rr undef $r14d, undef $r14d, implicit-def dead $eflags, implicit-def $r14
     dead $edi = XOR32rr undef $edi, undef $edi, implicit-def dead $eflags, implicit-def $rdi
     CALL64r undef renamable $rax, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $eax, implicit-def dead $rdx
-    DBG_INSTR_REF !13, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), dbg-instr-ref(2, 0), debug-location !15
+    DBG_INSTR_REF !13, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), dbg-instr-ref(2, 0), debug-location !15
     dead $edi = XOR32rr undef $edi, undef $edi, implicit-def dead $eflags, implicit-def $rdi, debug-location !15
     CALL64r undef renamable $rax, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $al, debug-location !15
     TEST8rr renamable $r14b, renamable $r14b, implicit-def $eflags, implicit killed $r14
     JCC_1 %bb.2, 5, implicit $eflags
-  
+
   bb.1.if.then.i.i.i.i.i:
     dead $edi = XOR32rr undef $edi, undef $edi, implicit-def dead $eflags, implicit-def $rdi
     CALL64r undef renamable $rax, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $rax, debug-instr-number 3
     $rbx = MOV64rr killed $rax
-    DBG_INSTR_REF !13, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), dbg-instr-ref(3, 7), debug-location !15
-  
+    DBG_INSTR_REF !13, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), dbg-instr-ref(3, 7), debug-location !15
+
   bb.2._Z17do_insert_cv_testI5_TreeEvv.exit:
     liveins: $rbx
-  
+
     DBG_PHI $rbx, 1
-    DBG_INSTR_REF !13, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), dbg-instr-ref(1, 0), debug-location !15
+    DBG_INSTR_REF !13, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), dbg-instr-ref(1, 0), debug-location !15
     dead $edi = XOR32rr undef $edi, undef $edi, implicit-def dead $eflags, implicit-def $rdi, debug-location !16
     $esi = XOR32rr undef $esi, undef $esi, implicit-def dead $eflags, debug-location !16
     $rdx = MOV64rr killed $rbx, debug-location !16
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/phi-coalesce-subreg.mir b/llvm/test/DebugInfo/MIR/InstrRef/phi-coalesce-subreg.mir
index d73d8a375906..fdcfb6c49669 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/phi-coalesce-subreg.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/phi-coalesce-subreg.mir
@@ -19,7 +19,7 @@
   source_filename = "test.c"
   target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
   target triple = "x86_64-unknown-linux-gnu"
-  
+
   ; Function Attrs: noinline nounwind uwtable
   define dso_local i32 @foo(i16 signext %bar, i16 signext %baz) !dbg !7 {
   entry:
@@ -34,14 +34,14 @@
     %call = call signext i16 @getlong(), !dbg !13
     %tobool = icmp ne i16 %call, 0, !dbg !13
     br i1 %tobool, label %if.then, label %if.end, !dbg !13
-  
+
   if.then:                                          ; preds = %entry
     %conv2 = sext i16 %conv1 to i32, !dbg !13
     %add3 = add nsw i32 %conv2, 1, !dbg !13
     %conv4 = trunc i32 %add3 to i16, !dbg !13
     call void @llvm.dbg.value(metadata i16 %conv4, metadata !12, metadata !DIExpression()), !dbg !13
     br label %if.end, !dbg !13
-  
+
   if.end:                                           ; preds = %if.then, %entry
     %bar.addr.0 = phi i16 [ %conv4, %if.then ], [ %conv1, %entry ], !dbg !13
     call void @llvm.dbg.value(metadata i16 %bar.addr.0, metadata !12, metadata !DIExpression()), !dbg !13
@@ -54,21 +54,21 @@
     %conv9 = sext i16 %conv8 to i32, !dbg !13
     ret i32 %conv9, !dbg !13
   }
-  
+
   ; Function Attrs: nounwind readnone speculatable willreturn
   declare void @llvm.dbg.declare(metadata, metadata, metadata)
-  
+
   declare dso_local void @ext(i16 signext)
-  
+
   declare dso_local signext i16 @getlong()
-  
+
   ; Function Attrs: nounwind readnone speculatable willreturn
   declare void @llvm.dbg.value(metadata, metadata, metadata)
-  
+
   !llvm.dbg.cu = !{!0}
   !llvm.module.flags = !{!3, !4, !5}
   !llvm.ident = !{!6}
-  
+
   !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0 (git@github.com:llvm/llvm-project 79a35789efdf2378f97642ae4a5f3099b9087a11)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
   !1 = !DIFile(filename: "test.c", directory: "/fast/fs/llvm3/llvm/test/DebugInfo/MIR/InstrRef")
   !2 = !{}
@@ -103,7 +103,7 @@ machineFunctionInfo: {}
 body:             |
   bb.0.entry:
     liveins: $edi, $esi
-  
+
     %4:gr32 = COPY $esi
     %3:gr32 = COPY $edi
     %6:gr16 = COPY %4.sub_16bit
@@ -127,7 +127,7 @@ body:             |
     %8:gr16 = COPY $ax, debug-location !13
     CMP16ri8 %8, 0, implicit-def $eflags, debug-location !13
     JCC_1 %bb.2, 4, implicit $eflags, debug-location !13
-  
+
   ; DOESCOALESCE-LABEL: bb.1.if.then:
   ; CHECK-LABEL:        bb.1.if.then:
   bb.1.if.then:
@@ -146,7 +146,7 @@ body:             |
   bb.2.if.end:
     %2:gr16 = PHI %11, %bb.0, %17, %bb.1, debug-instr-number 1, debug-location !13
   ; CHECK:              DBG_PHI $bp, 1
-    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13
+    DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13
     %31:gr32 = MOVSX32rr16 %6, debug-location !13
     %30:gr32 = MOVSX32rr16 killed %2, debug-location !13
     %29:gr32 = ADD32rr killed %30, killed %31, implicit-def $eflags, debug-location !13
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/phi-on-stack-coalesced.mir b/llvm/test/DebugInfo/MIR/InstrRef/phi-on-stack-coalesced.mir
index 68c9bf6c89dd..b503e0a23d51 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/phi-on-stack-coalesced.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/phi-on-stack-coalesced.mir
@@ -148,9 +148,9 @@ body:             |
     %64:gr32 = PHI %24, %bb.0, %44, %bb.1, debug-location !18
 
     INLINEASM &"", 1, 12, %50, 12, %51, 12, %52, 12, %53, 12, %54, 12, %55, 12, %56, 12, %57, 12, %58, 12, %59, 12, %60, 12, %61, 12, %62, 12, %63, 12, %64
-    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
+    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
     ; CHECK:      DBG_PHI %stack.0, 1, 16
-    ; CHECK:      DBG_INSTR_REF {{.+}} dbg-instr-ref(1, 0)
+    ; CHECK:      DBG_INSTR_REF {{.+}} dbg-instr-ref(1, 0)
     ; CHECK:      renamable $eax = MOV32rm %stack.0,
     $eax = COPY killed %0, debug-location !19
     RET 0, killed $eax, debug-location !19
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/phi-on-stack-coalesced2.mir b/llvm/test/DebugInfo/MIR/InstrRef/phi-on-stack-coalesced2.mir
index cf17af4ba430..a9437931af1c 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/phi-on-stack-coalesced2.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/phi-on-stack-coalesced2.mir
@@ -147,9 +147,9 @@ body:             |
     %64:gr32 = PHI %24, %bb.0, %44, %bb.1, debug-location !18
 
     INLINEASM &"", 1, 12, %50, 12, %51, 12, %52, 12, %53, 12, %54, 12, %55, 12, %56, 12, %57, 12, %58, 12, %59, 12, %60, 12, %61, 12, %62, 12, %63, 12, %64
-    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
+    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
     ; CHECK-NOT:  DBG_PHI
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
     ; CHECK-NOT:  DBG_PHI
     $eax = COPY killed %0, debug-location !19
     RET 0, killed $eax, debug-location !19
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir b/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir
index cb35bd892eea..ed04647c8406 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir
@@ -141,10 +141,10 @@ body:             |
     %63:gr32 = PHI %23, %bb.0, %43, %bb.1, debug-location !18
     %64:gr32 = PHI %24, %bb.0, %44, %bb.1, debug-location !18
 
-    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
+    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
     ; CHECK:      DBG_PHI %stack.1, 1, 32
     ; CHECK:      renamable $eax = MOV32rm %stack.1,
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
     $eax = COPY killed %0, debug-location !19
     RET 0, killed $eax, debug-location !19
 
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/phi-through-regalloc.mir b/llvm/test/DebugInfo/MIR/InstrRef/phi-through-regalloc.mir
index 61dcec49b74c..9e188fa7635c 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/phi-through-regalloc.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/phi-through-regalloc.mir
@@ -122,9 +122,9 @@ body:             |
     ; CHECK-LABEL: bb.2.if.end:
   bb.2.if.end:
     %0:gr32 = PHI %1, %bb.0, %2, %bb.1, debug-instr-number 1, debug-location !18
-    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
+    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
     ; CHECK:      DBG_PHI $ebp, 1
-    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+    ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
     $eax = COPY killed %0, debug-location !19
     ; Confirm that %0 is allocated in $ebp,
     ; CHECK:      $eax = COPY killed renamable $ebp
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/spill-slot-limits.mir b/llvm/test/DebugInfo/MIR/InstrRef/spill-slot-limits.mir
index 8dddb01f52d3..7c70e7722b9b 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/spill-slot-limits.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/spill-slot-limits.mir
@@ -19,17 +19,17 @@
 # CHECK: ![[VARNUM:[0-9]+]] = !DILocalVariable
 #
 ## There should be no variable location, just a single DBG_VALUE $noreg.
-# CHECK:     DBG_VALUE_LIST {{.+}} $noreg
+# CHECK:     DBG_VALUE_LIST {{.+}} $noreg
 #
 ## And then another.
-# CHECK:     DBG_VALUE_LIST {{.+}} $noreg
+# CHECK:     DBG_VALUE_LIST {{.+}} $noreg
 #
 ## Test that if there's no limit, we _do_ get some locations.
-# NOLIMIT:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
-# NOLIMIT-NEXT: DBG_VALUE_LIST {{.+}} $esi
+# NOLIMIT:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+# NOLIMIT-NEXT: DBG_VALUE_LIST {{.+}} $esi
 #
-# NOLIMIT:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(5,
-# NOLIMIT-NEXT: DBG_VALUE_LIST {{.+}} $rsp
+# NOLIMIT:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(5,
+# NOLIMIT-NEXT: DBG_VALUE_LIST {{.+}} $rsp
 --- |
   define i8 @test(i32 %bar) local_unnamed_addr !dbg !7 {
   entry:
@@ -76,7 +76,7 @@ body:  |
     $rax = MOV64ri 0
     $rdi = MOV64ri 0
 
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
     ; This shouldn't find anything -- we have disabled tracking of spills.
 
     ; In addition to plain spills, spills that are folded into instructions
@@ -84,7 +84,7 @@ body:  |
     INC32m $rsp, 1, $noreg, 4, $noreg, implicit-def dead $eflags, debug-instr-number 5, debug-location !12 :: (store (s32) into %stack.0)
 
 
-    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(5, 1000000), debug-location !12
+    DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(5, 1000000), debug-location !12
     ; Shouldn't be able to find the reference to instr 5's memory operand.
 
     RET64 $rsi, debug-location !12
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/substitusions-roundtrip.mir b/llvm/test/DebugInfo/MIR/InstrRef/substitusions-roundtrip.mir
index 3951929b0eb7..515180388652 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/substitusions-roundtrip.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/substitusions-roundtrip.mir
@@ -7,7 +7,7 @@
 # CHECK-NEXT: - { srcinst: 1, srcop: 0, dstinst: 2, dstop: 0, subreg: 0 }
 #
 # CHECK:      MOV64rr $rdi, debug-instr-number 2
-# CHECK-NEXT: DBG_INSTR_REF dbg-instr-ref(1, 0)
+# CHECK-NEXT: DBG_INSTR_REF dbg-instr-ref(1, 0)
 ---
 name: test
 tracksRegLiveness: true
@@ -20,7 +20,7 @@ body:  |
   bb.0:
   liveins: $rdi, $rax
     $rbp = MOV64rr $rdi, debug-instr-number 2
-    DBG_INSTR_REF dbg-instr-ref(1, 0)
+    DBG_INSTR_REF dbg-instr-ref(1, 0)
     dead $rcx = MOV64ri 0
     CMP64ri8 renamable $rax, 1, implicit-def $eflags
     RET64 $rax
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/twoaddr-to-threeaddr-sub.mir b/llvm/test/DebugInfo/MIR/InstrRef/twoaddr-to-threeaddr-sub.mir
index d40d3e1ff55f..b600070fb5ae 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/twoaddr-to-threeaddr-sub.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/twoaddr-to-threeaddr-sub.mir
@@ -13,7 +13,7 @@
 # CHECK:      LEA64_32r
 # CHECK-SAME: debug-instr-number 2
 #
-# CHECK:      DBG_INSTR_REF dbg-instr-ref(1, 0)
+# CHECK:      DBG_INSTR_REF dbg-instr-ref(1, 0)
 ---
 name:            test1
 alignment:       16
@@ -35,7 +35,7 @@ body:             |
     %0:gr32 = COPY killed $edi
     %1:gr32 = SHL32ri killed %0, 5, implicit-def dead $eflags
     %2:gr32 = ADD32ri_DB killed %1, 3, implicit-def dead $eflags, debug-instr-number 1
-    DBG_INSTR_REF dbg-instr-ref(1, 0)
+    DBG_INSTR_REF dbg-instr-ref(1, 0)
     $eax = COPY killed %2
     RET 0, killed $eax
 
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/win32-chkctk-modifies-esp.mir b/llvm/test/DebugInfo/MIR/InstrRef/win32-chkctk-modifies-esp.mir
index ed1ba590d746..5fc9b9807742 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/win32-chkctk-modifies-esp.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/win32-chkctk-modifies-esp.mir
@@ -14,14 +14,14 @@
 --- |
   target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:32-n8:16:32-a:0:32-S32"
   target triple = "i686-pc-windows-msvc18.0.31101"
-  
+
   %struct.incomplete_struct = type { i32 }
-  
+
   @"\01?multi_dim_arr@@3PAY146DA" = global [2 x [5 x [7 x i8]]] zeroinitializer, align 1, !dbg !0
   @"\01?p_incomplete_struct_arr@@3PAY02Uincomplete_struct@@A" = global ptr null, align 4, !dbg !6
   @"\01?incomplete_struct_arr@@3PAUincomplete_struct@@A" = global [3 x %struct.incomplete_struct] zeroinitializer, align 4, !dbg !16
   @"\01?typedef_arr@@3SDHD" = constant [4 x i32] zeroinitializer, align 4, !dbg !18
-  
+
   define void @"\01?foo@@YAXH@Z"(i32 %x) !dbg !35 {
   entry:
     %x.addr = alloca i32, align 4
@@ -39,20 +39,20 @@
     call void @llvm.stackrestore(ptr %2), !dbg !49
     ret void, !dbg !49
   }
-  
+
   ; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
   declare void @llvm.dbg.declare(metadata, metadata, metadata)
-  
+
   ; Function Attrs: nofree nosync nounwind willreturn
   declare ptr @llvm.stacksave()
-  
+
   ; Function Attrs: nofree nosync nounwind willreturn
   declare void @llvm.stackrestore(ptr)
-  
+
   !llvm.dbg.cu = !{!2}
   !llvm.module.flags = !{!32, !33}
   !llvm.ident = !{!34}
-  
+
   !0 = distinct !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
   !1 = !DIGlobalVariable(name: "multi_dim_arr", linkageName: "\01?multi_dim_arr@@3PAY146DA", scope: !2, file: !3, line: 1, type: !26, isLocal: false, isDefinition: true)
   !2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 3.9.0 (trunk 273874)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
@@ -119,7 +119,7 @@ frameInfo:
   maxCallFrameSize: 0
 fixedStack:
   - { id: 0, type: spill-slot, offset: -8, size: 4, alignment: 4 }
-  - { id: 1, size: 4, alignment: 4, debug-info-variable: '!38', debug-info-expression: '!DIExpression()', 
+  - { id: 1, size: 4, alignment: 4, debug-info-variable: '!38', debug-info-expression: '!DIExpression()',
       debug-info-location: '!39' }
 stack:
   - { id: 1, name: saved_stack, offset: -12, size: 4, alignment: 4 }
@@ -143,9 +143,9 @@ body:             |
     CALLpcrel32 &_chkstk, implicit $esp, implicit $ssp, implicit $eax, implicit $esp, implicit-def dead $eax, implicit-def $esp, implicit-def dead $eflags, debug-instr-number 2, debug-location !41
     $ebx = MOV32rr $esp, debug-location !41
     $eax = MOV32ri 0
-    DBG_INSTR_REF !42, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref), dbg-instr-ref(2, 6), debug-location !46
-    ; CHECK-LABEL: DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 6)
-    ; CHECK:       DBG_VALUE_LIST {{.+}}, $esp
+    DBG_INSTR_REF !42, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref), dbg-instr-ref(2, 6), debug-location !46
+    ; CHECK-LABEL: DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 6)
+    ; CHECK:       DBG_VALUE_LIST {{.+}}, $esp
 
     ;; Variable value is $esp / $ebx, will be based on $esp initially. We'll now
     ;; allocate more stack space, and several things should happen:
@@ -157,18 +157,18 @@ body:             |
     ;;    $ebx, which comes from the first modified $esp.
     CALLpcrel32 &_chkstk, implicit $esp, implicit $ssp, implicit $eax, implicit $esp, implicit-def dead $eax, implicit-def $esp, implicit-def dead $eflags, debug-instr-number 3, debug-location !41
     ; CHECK-NEXT: CALLpcrel32
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $ebx
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $ebx
 
-    DBG_INSTR_REF !42, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref), dbg-instr-ref(3, 6), debug-location !46
-    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 6)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $esp
+    DBG_INSTR_REF !42, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref), dbg-instr-ref(3, 6), debug-location !46
+    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 6)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $esp
 
     $esp = ADD32ri killed $esp, 0, implicit-def dead $eflags
     ; CHECK-NEXT: ADD32ri
 
-    DBG_INSTR_REF !42, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref), dbg-instr-ref(3, 6), debug-location !46
-    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 6)
-    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
+    DBG_INSTR_REF !42, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref), dbg-instr-ref(3, 6), debug-location !46
+    ; CHECK-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 6)
+    ; CHECK-NEXT: DBG_VALUE_LIST {{.+}}, $noreg
 
     $esp = MOV32rr $ebp, debug-location !49
     $ebp = frame-destroy POP32r implicit-def $esp, implicit $esp, debug-location !49
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/x86-drop-compare-inst.mir b/llvm/test/DebugInfo/MIR/InstrRef/x86-drop-compare-inst.mir
index 7a782625188d..17fdccb1b5c9 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/x86-drop-compare-inst.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/x86-drop-compare-inst.mir
@@ -14,14 +14,14 @@
   ; ModuleID = '/fast/fs/build34llvm4/reduced.ll'
   target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
   target triple = "x86_64-unknown-linux-gnu"
-  
+
   %"class.std::vector.534" = type { %"struct.std::_Vector_base.535" }
   %"struct.std::_Vector_base.535" = type { %"struct.std::_Vector_base<unsigned char, std::allocator<unsigned char>>::_Vector_impl" }
   %"struct.std::_Vector_base<unsigned char, std::allocator<unsigned char>>::_Vector_impl" = type { ptr, ptr, ptr }
-  
+
   ; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
   declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
-  
+
   define hidden fastcc void @soup() unnamed_addr !dbg !3 {
   _ZN4llvm11raw_ostreamlsEPKc.exit2752:
     %0 = load ptr, ptr undef, align 8, !dbg !7
@@ -35,22 +35,22 @@
     call void @llvm.dbg.value(metadata i32 %conv373, metadata !8, metadata !DIExpression()), !dbg !7
     %cmp375.not2842 = icmp eq i32 %conv373, 0, !dbg !7
     br i1 %cmp375.not2842, label %for.cond.cleanup376, label %for.body377, !dbg !7
-  
+
   for.cond.cleanup376:                              ; preds = %_ZN4llvm11raw_ostreamlsEPKc.exit2752
     ret void
-  
+
   for.body377:                                      ; preds = %_ZN4llvm11raw_ostreamlsEPKc.exit2752
     ret void
   }
-  
+
   ; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
   declare void @llvm.dbg.value(metadata, metadata, metadata) #0
-  
+
   attributes #0 = { nofree nosync nounwind readnone speculatable willreturn }
-  
+
   !llvm.module.flags = !{!0}
   !llvm.dbg.cu = !{!1}
-  
+
   !0 = !{i32 2, !"Debug Info Version", i32 3}
   !1 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "beards", isOptimized: true, runtimeVersion: 4, emissionKind: FullDebug)
   !2 = !DIFile(filename: "bees.cpp", directory: "")
@@ -78,18 +78,18 @@ machineFunctionInfo: {}
 body:             |
   bb.0._ZN4llvm11raw_ostreamlsEPKc.exit2752:
     successors: %bb.1(0x30000000), %bb.2(0x50000000)
-  
+
     %1:gr64 = IMPLICIT_DEF
     %0:gr64 = MOV64rm killed %1, 1, $noreg, 0, $noreg, debug-location !7 :: (load (s64) from `ptr undef`)
     %2:gr32 = COPY %0.sub_32bit, debug-location !7
     %3:gr32 = SUB32rm %2, %0, 1, $noreg, 0, $noreg, implicit-def $eflags, debug-instr-number 1, debug-location !7 :: (load (s32) from %ir._M_start.i2756, align 8)
-    DBG_INSTR_REF !8, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !7
+    DBG_INSTR_REF !8, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !7
     JCC_1 %bb.2, 5, implicit $eflags, debug-location !7
     JMP_1 %bb.1, debug-location !7
-  
+
   bb.1.for.cond.cleanup376:
     RET 0
-  
+
   bb.2.for.body377:
     RET 0
 
diff --git a/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-clobber.mir b/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-clobber.mir
index 358f27b99764..73217c1d8a1d 100644
--- a/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-clobber.mir
+++ b/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-clobber.mir
@@ -1,106 +1,106 @@
-# RUN: llc -mtriple=x86_64-- -run-pass livedebugvalues -o - %s -experimental-debug-variable-locations=false | FileCheck %s --implicit-check-not=DBG_VALUE_LIST
-#
-# Test that even after a move, clobbering a register terminates a DBG_VALUE_LIST.
-# Check the same for DBG_VALUE $noreg.
-#
-# CHECK: ![[VAR:[0-9]+]] = !DILocalVariable(name: "c"
-# 
-# CHECK-LABEL: bb.0.entry:
-# CHECK:       DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
-# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi
-# CHECK:       $rbx = COPY killed $rdi
-# CHECK-NEXT:  DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
-# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rsi
-# CHECK-LABEL: bb.1:
-# CHECK:       DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
-# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rsi
-# CHECK:       $rsi = MOV64ri 0
-# CHECK-LABEL: bb.2:
-# no live-in!
-# CHECK:       $rsi = MOV64ri 0
-# CHECK-NEXT:  DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
-# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rsi
-# CHECK-LABEL: bb.3:
-# CHECK:       DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
-# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rsi
-# live-in to bb.3, then explicitly undef'd, should be no further locations
-# propagated.
-# CHECK-LABEL: bb.4:
---- |
-  ; ModuleID = 'test.cpp'
-  source_filename = "test.cpp"
-  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-  target triple = "x86_64-unknown-linux-gnu"
-  
-  ; Function Attrs: norecurse nounwind readnone uwtable
-  define dso_local i32 @_Z3fooii(i32 %a, i32 %b) local_unnamed_addr !dbg !7 {
-  entry:
-    ret i32 0, !dbg !17
-  }
-  
-  ; Function Attrs: nounwind readnone speculatable willreturn
-  declare void @llvm.dbg.value(metadata, metadata, metadata)
-  
-  !llvm.dbg.cu = !{!0}
-  !llvm.module.flags = !{!3, !4, !5}
-  !llvm.ident = !{!6}
-  
-  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
-  !1 = !DIFile(filename: "test.cpp", directory: "/")
-  !2 = !{}
-  !3 = !{i32 7, !"Dwarf Version", i32 4}
-  !4 = !{i32 2, !"Debug Info Version", i32 3}
-  !5 = !{i32 1, !"wchar_size", i32 4}
-  !6 = !{!"clang version 11.0.0"}
-  !7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooii", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
-  !8 = !DISubroutineType(types: !9)
-  !9 = !{!10, !10, !10}
-  !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-  !11 = !{!12, !13, !14}
-  !12 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 2, type: !10)
-  !13 = !DILocalVariable(name: "b", arg: 2, scope: !7, file: !1, line: 2, type: !10)
-  !14 = !DILocalVariable(name: "c", scope: !7, file: !1, line: 3, type: !10)
-  !15 = !DILocation(line: 0, scope: !7)
-  !16 = !DILocation(line: 4, column: 12, scope: !7)
-  !17 = !DILocation(line: 4, column: 3, scope: !7)
-
-...
----
-name:            _Z3fooii
-fixedStack:
-  - { id: 0, type: spill-slot, offset: -32, size: 8, alignment: 16, stack-id: default,
-      callee-saved-register: '$rbx', callee-saved-restored: true }
-  - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default,
-      callee-saved-register: '$rbp', callee-saved-restored: true }
-body:             |
-  bb.0.entry:
-    liveins: $rdi, $rsi
-  
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
-    $rbx = COPY killed $rdi
-    $rdi = MOV64ri 0
-    JMP_1 %bb.1
-
-  bb.1:
-    liveins: $rbx, $rsi
-    $rsi = MOV64ri 0
-    JMP_1 %bb.2
-
-  bb.2:
-    liveins: $rbx, $rsi
-
-    $rsi = MOV64ri 0
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rsi, debug-location !15
-    JMP_1 %bb.3
-
-  bb.3:
-    liveins: $rbx, $rsi
-    DBG_VALUE $noreg, $noreg, !14, !DIExpression(), debug-location !15
-    JMP_1 %bb.4
-
-  bb.4:
-    liveins: $rbx, $rsi
-    RET64 $rbx, debug-location !17
-
-...
-
+# RUN: llc -mtriple=x86_64-- -run-pass livedebugvalues -o - %s -experimental-debug-variable-locations=false | FileCheck %s --implicit-check-not=DBG_VALUE_LIST
+#
+# Test that even after a move, clobbering a register terminates a DBG_VALUE_LIST.
+# Check the same for DBG_VALUE $noreg.
+#
+# CHECK: ![[VAR:[0-9]+]] = !DILocalVariable(name: "c"
+#
+# CHECK-LABEL: bb.0.entry:
+# CHECK:       DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
+# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi
+# CHECK:       $rbx = COPY killed $rdi
+# CHECK-NEXT:  DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
+# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rsi
+# CHECK-LABEL: bb.1:
+# CHECK:       DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
+# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rsi
+# CHECK:       $rsi = MOV64ri 0
+# CHECK-LABEL: bb.2:
+# no live-in!
+# CHECK:       $rsi = MOV64ri 0
+# CHECK-NEXT:  DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
+# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rsi
+# CHECK-LABEL: bb.3:
+# CHECK:       DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
+# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rsi
+# live-in to bb.3, then explicitly undef'd, should be no further locations
+# propagated.
+# CHECK-LABEL: bb.4:
+--- |
+  ; ModuleID = 'test.cpp'
+  source_filename = "test.cpp"
+  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-unknown-linux-gnu"
+
+  ; Function Attrs: norecurse nounwind readnone uwtable
+  define dso_local i32 @_Z3fooii(i32 %a, i32 %b) local_unnamed_addr !dbg !7 {
+  entry:
+    ret i32 0, !dbg !17
+  }
+
+  ; Function Attrs: nounwind readnone speculatable willreturn
+  declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4, !5}
+  !llvm.ident = !{!6}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+  !1 = !DIFile(filename: "test.cpp", directory: "/")
+  !2 = !{}
+  !3 = !{i32 7, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = !{i32 1, !"wchar_size", i32 4}
+  !6 = !{!"clang version 11.0.0"}
+  !7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooii", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
+  !8 = !DISubroutineType(types: !9)
+  !9 = !{!10, !10, !10}
+  !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !11 = !{!12, !13, !14}
+  !12 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 2, type: !10)
+  !13 = !DILocalVariable(name: "b", arg: 2, scope: !7, file: !1, line: 2, type: !10)
+  !14 = !DILocalVariable(name: "c", scope: !7, file: !1, line: 3, type: !10)
+  !15 = !DILocation(line: 0, scope: !7)
+  !16 = !DILocation(line: 4, column: 12, scope: !7)
+  !17 = !DILocation(line: 4, column: 3, scope: !7)
+
+...
+---
+name:            _Z3fooii
+fixedStack:
+  - { id: 0, type: spill-slot, offset: -32, size: 8, alignment: 16, stack-id: default,
+      callee-saved-register: '$rbx', callee-saved-restored: true }
+  - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default,
+      callee-saved-register: '$rbp', callee-saved-restored: true }
+body:             |
+  bb.0.entry:
+    liveins: $rdi, $rsi
+
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
+    $rbx = COPY killed $rdi
+    $rdi = MOV64ri 0
+    JMP_1 %bb.1
+
+  bb.1:
+    liveins: $rbx, $rsi
+    $rsi = MOV64ri 0
+    JMP_1 %bb.2
+
+  bb.2:
+    liveins: $rbx, $rsi
+
+    $rsi = MOV64ri 0
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rsi, debug-location !15
+    JMP_1 %bb.3
+
+  bb.3:
+    liveins: $rbx, $rsi
+    DBG_VALUE $noreg, $noreg, !14, !DIExpression(), debug-location !15
+    JMP_1 %bb.4
+
+  bb.4:
+    liveins: $rbx, $rsi
+    RET64 $rbx, debug-location !17
+
+...
+
diff --git a/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-join.mir b/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-join.mir
index ccbb3b18a75d..751751c58e34 100644
--- a/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-join.mir
+++ b/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-join.mir
@@ -1,160 +1,160 @@
-# RUN: llc -mtriple=x86_64-- -run-pass livedebugvalues -o - %s -experimental-debug-variable-locations=false | FileCheck %s --implicit-check-not=DBG_VALUE_LIST
-#
-# Test a series of joins, where:
-#  * The locations agree, although registers have changed,
-#  * A register down one of the predecessors has been def'd,
-#  * The register operands to DBG_VALUE_LIST have been swapped,
-#  * A spurious additional operand has been added to one path,
-#  * The expressions are not the same (plus -> minus).
-#
-# Each join block below checks for one DBG_VALUE_LIST: either we re-state the var
-# location in a block for the next test, or it's created as a live-in and we
-# use that for the next test. Two DBG_VALUE_LISTs in a block would represent
-# a live-in that we didn't expect, and a test failure.
-#
-# Each conditional block should have at least one, possibly two.
-#
-# CHECK: ![[VAR:[0-9]+]] = !DILocalVariable(name: "c"
-# 
-# CHECK-LABEL: bb.0.entry:
-# CHECK:       DBG_VALUE_LIST ![[VAR]],
-# CHECK-LABEL: bb.1:
-# CHECK:       DBG_VALUE_LIST ![[VAR]],
-# CHECK:       DBG_VALUE_LIST ![[VAR]],
-# CHECK-LABEL: bb.2:
-# CHECK:       DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
-# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi
-# CHECK-LABEL: bb.3:
-# CHECK:       DBG_VALUE_LIST ![[VAR]],
-# CHECK-LABEL: bb.4:
-# CHECK:       DBG_VALUE_LIST ![[VAR]],
-# CHECK-SAME:        $rdi, $rsi
-# CHECK-LABEL: bb.5:
-# CHECK:       DBG_VALUE_LIST ![[VAR]],
-# CHECK:       DBG_VALUE_LIST ![[VAR]],
-# CHECK-SAME:        $rsi, $rdi
-# CHECK-LABEL: bb.6:
-# CHECK:       DBG_VALUE_LIST ![[VAR]],
-# CHECK-LABEL: bb.7:
-# CHECK:       DBG_VALUE_LIST ![[VAR]],
-# CHECK:       DBG_VALUE_LIST ![[VAR]],
-# CHECK-SAME:        $rdi, $rsi, $rax
-# CHECK-LABEL: bb.8:
-# CHECK:       DBG_VALUE_LIST ![[VAR]],
-# CHECK-LABEL: bb.9:
-# CHECK:       DBG_VALUE_LIST ![[VAR]],
-# CHECK:       DBG_VALUE_LIST ![[VAR]],
-# CHECK-SAME:        DW_OP_minus
-# CHECK-LABEL: bb.10:
---- |
-  ; ModuleID = 'test.cpp'
-  source_filename = "test.cpp"
-  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-  target triple = "x86_64-unknown-linux-gnu"
-  
-  ; Function Attrs: norecurse nounwind readnone uwtable
-  define dso_local i32 @_Z3fooii(i32 %a, i32 %b) local_unnamed_addr !dbg !7 {
-  entry:
-    ret i32 0, !dbg !17
-  }
-  
-  ; Function Attrs: nounwind readnone speculatable willreturn
-  declare void @llvm.dbg.value(metadata, metadata, metadata)
-  
-  !llvm.dbg.cu = !{!0}
-  !llvm.module.flags = !{!3, !4, !5}
-  !llvm.ident = !{!6}
-  
-  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
-  !1 = !DIFile(filename: "test.cpp", directory: "/")
-  !2 = !{}
-  !3 = !{i32 7, !"Dwarf Version", i32 4}
-  !4 = !{i32 2, !"Debug Info Version", i32 3}
-  !5 = !{i32 1, !"wchar_size", i32 4}
-  !6 = !{!"clang version 11.0.0"}
-  !7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooii", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
-  !8 = !DISubroutineType(types: !9)
-  !9 = !{!10, !10, !10}
-  !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-  !11 = !{!12, !13, !14}
-  !12 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 2, type: !10)
-  !13 = !DILocalVariable(name: "b", arg: 2, scope: !7, file: !1, line: 2, type: !10)
-  !14 = !DILocalVariable(name: "c", scope: !7, file: !1, line: 3, type: !10)
-  !15 = !DILocation(line: 0, scope: !7)
-  !16 = !DILocation(line: 4, column: 12, scope: !7)
-  !17 = !DILocation(line: 4, column: 3, scope: !7)
-
-...
----
-name:            _Z3fooii
-body:             |
-  bb.0.entry:
-    liveins: $rdi, $rsi
-  
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
-    CMP64ri8 $rdi, 0, implicit-def $eflags
-    JCC_1 %bb.2, 4, implicit $eflags
-
-  bb.1:
-    liveins: $rdi, $rsi
-    $rdi = MOV64ri 0
-    $rsi = MOV64ri 0
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
-
-  bb.2:
-    liveins: $rdi, $rsi
-    ; Should be a live-in loc here,
-    CMP64ri8 $rdi, 0, implicit-def $eflags
-    JCC_1 %bb.4, 4, implicit $eflags
-    
-  bb.3:
-    liveins: $rdi, $rsi
-    $rsi = MOV64ri 0
-
-  bb.4:
-    liveins: $rdi, $rsi
-    ; Should _not_ be a live-in loc here.
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
-    CMP64ri8 $rdi, 0, implicit-def $eflags
-    JCC_1 %bb.6, 4, implicit $eflags
-
-  bb.5:
-    liveins: $rdi, $rsi
-    ; Flip some args,
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rsi, $rdi, debug-location !15
-
-  bb.6:
-    liveins: $rdi, $rsi
-    ; Should _not_ be a live-in loc here.
-    $rax = MOV64ri 0
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
-    CMP64ri8 $rdi, 0, implicit-def $eflags
-    JCC_1 %bb.8, 4, implicit $eflags
-
-  bb.7:
-    liveins: $rdi, $rsi
-    ; Add an extra, spurious, unused argument
-    $rax = MOV64ri 1
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, $rax, debug-location !15
-
-  bb.8:
-    liveins: $rdi, $rsi
-    ; Should _not_ be a live-in loc here.
-    $rax = MOV64ri 0
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
-    CMP64ri8 $rdi, 0, implicit-def $eflags
-    JCC_1 %bb.10, 4, implicit $eflags
-
-  bb.9:
-    liveins: $rdi, $rsi
-    ; Replace add with sub in the expr
-    $rax = MOV64ri 1
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_minus), $rdi, $rsi, debug-location !15
-
-  bb.10:
-    liveins: $rdi, $rsi
-    ; Should _not_ be a live-in loc here.
-    RET64
-
-...
-
+# RUN: llc -mtriple=x86_64-- -run-pass livedebugvalues -o - %s -experimental-debug-variable-locations=false | FileCheck %s --implicit-check-not=DBG_VALUE_LIST
+#
+# Test a series of joins, where:
+#  * The locations agree, although registers have changed,
+#  * A register down one of the predecessors has been def'd,
+#  * The register operands to DBG_VALUE_LIST have been swapped,
+#  * A spurious additional operand has been added to one path,
+#  * The expressions are not the same (plus -> minus).
+#
+# Each join block below checks for one DBG_VALUE_LIST: either we re-state the var
+# location in a block for the next test, or it's created as a live-in and we
+# use that for the next test. Two DBG_VALUE_LISTs in a block would represent
+# a live-in that we didn't expect, and a test failure.
+#
+# Each conditional block should have at least one, possibly two.
+#
+# CHECK: ![[VAR:[0-9]+]] = !DILocalVariable(name: "c"
+#
+# CHECK-LABEL: bb.0.entry:
+# CHECK:       DBG_VALUE_LIST ![[VAR]],
+# CHECK-LABEL: bb.1:
+# CHECK:       DBG_VALUE_LIST ![[VAR]],
+# CHECK:       DBG_VALUE_LIST ![[VAR]],
+# CHECK-LABEL: bb.2:
+# CHECK:       DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
+# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi
+# CHECK-LABEL: bb.3:
+# CHECK:       DBG_VALUE_LIST ![[VAR]],
+# CHECK-LABEL: bb.4:
+# CHECK:       DBG_VALUE_LIST ![[VAR]],
+# CHECK-SAME:        $rdi, $rsi
+# CHECK-LABEL: bb.5:
+# CHECK:       DBG_VALUE_LIST ![[VAR]],
+# CHECK:       DBG_VALUE_LIST ![[VAR]],
+# CHECK-SAME:        $rsi, $rdi
+# CHECK-LABEL: bb.6:
+# CHECK:       DBG_VALUE_LIST ![[VAR]],
+# CHECK-LABEL: bb.7:
+# CHECK:       DBG_VALUE_LIST ![[VAR]],
+# CHECK:       DBG_VALUE_LIST ![[VAR]],
+# CHECK-SAME:        $rdi, $rsi, $rax
+# CHECK-LABEL: bb.8:
+# CHECK:       DBG_VALUE_LIST ![[VAR]],
+# CHECK-LABEL: bb.9:
+# CHECK:       DBG_VALUE_LIST ![[VAR]],
+# CHECK:       DBG_VALUE_LIST ![[VAR]],
+# CHECK-SAME:        DW_OP_minus
+# CHECK-LABEL: bb.10:
+--- |
+  ; ModuleID = 'test.cpp'
+  source_filename = "test.cpp"
+  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-unknown-linux-gnu"
+
+  ; Function Attrs: norecurse nounwind readnone uwtable
+  define dso_local i32 @_Z3fooii(i32 %a, i32 %b) local_unnamed_addr !dbg !7 {
+  entry:
+    ret i32 0, !dbg !17
+  }
+
+  ; Function Attrs: nounwind readnone speculatable willreturn
+  declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4, !5}
+  !llvm.ident = !{!6}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+  !1 = !DIFile(filename: "test.cpp", directory: "/")
+  !2 = !{}
+  !3 = !{i32 7, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = !{i32 1, !"wchar_size", i32 4}
+  !6 = !{!"clang version 11.0.0"}
+  !7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooii", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
+  !8 = !DISubroutineType(types: !9)
+  !9 = !{!10, !10, !10}
+  !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !11 = !{!12, !13, !14}
+  !12 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 2, type: !10)
+  !13 = !DILocalVariable(name: "b", arg: 2, scope: !7, file: !1, line: 2, type: !10)
+  !14 = !DILocalVariable(name: "c", scope: !7, file: !1, line: 3, type: !10)
+  !15 = !DILocation(line: 0, scope: !7)
+  !16 = !DILocation(line: 4, column: 12, scope: !7)
+  !17 = !DILocation(line: 4, column: 3, scope: !7)
+
+...
+---
+name:            _Z3fooii
+body:             |
+  bb.0.entry:
+    liveins: $rdi, $rsi
+
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
+    CMP64ri8 $rdi, 0, implicit-def $eflags
+    JCC_1 %bb.2, 4, implicit $eflags
+
+  bb.1:
+    liveins: $rdi, $rsi
+    $rdi = MOV64ri 0
+    $rsi = MOV64ri 0
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
+
+  bb.2:
+    liveins: $rdi, $rsi
+    ; Should be a live-in loc here,
+    CMP64ri8 $rdi, 0, implicit-def $eflags
+    JCC_1 %bb.4, 4, implicit $eflags
+
+  bb.3:
+    liveins: $rdi, $rsi
+    $rsi = MOV64ri 0
+
+  bb.4:
+    liveins: $rdi, $rsi
+    ; Should _not_ be a live-in loc here.
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
+    CMP64ri8 $rdi, 0, implicit-def $eflags
+    JCC_1 %bb.6, 4, implicit $eflags
+
+  bb.5:
+    liveins: $rdi, $rsi
+    ; Flip some args,
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rsi, $rdi, debug-location !15
+
+  bb.6:
+    liveins: $rdi, $rsi
+    ; Should _not_ be a live-in loc here.
+    $rax = MOV64ri 0
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
+    CMP64ri8 $rdi, 0, implicit-def $eflags
+    JCC_1 %bb.8, 4, implicit $eflags
+
+  bb.7:
+    liveins: $rdi, $rsi
+    ; Add an extra, spurious, unused argument
+    $rax = MOV64ri 1
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, $rax, debug-location !15
+
+  bb.8:
+    liveins: $rdi, $rsi
+    ; Should _not_ be a live-in loc here.
+    $rax = MOV64ri 0
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
+    CMP64ri8 $rdi, 0, implicit-def $eflags
+    JCC_1 %bb.10, 4, implicit $eflags
+
+  bb.9:
+    liveins: $rdi, $rsi
+    ; Replace add with sub in the expr
+    $rax = MOV64ri 1
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_minus), $rdi, $rsi, debug-location !15
+
+  bb.10:
+    liveins: $rdi, $rsi
+    ; Should _not_ be a live-in loc here.
+    RET64
+
+...
+
diff --git a/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-movements.mir b/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-movements.mir
index 21de1acd3606..077d113d46c1 100644
--- a/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-movements.mir
+++ b/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-movements.mir
@@ -1,90 +1,90 @@
-# RUN: llc -mtriple=x86_64-- -run-pass livedebugvalues -o - %s -experimental-debug-variable-locations=false | FileCheck %s --implicit-check-not=DBG_VALUE_LIST
-#
-# The MIR below moves values from argument registers to callee saved registers,
-# moves that are followed by DBG_VALUEs and which should also result in
-# DBG_VALUE_LISTs moving their operands.
-#
-# CHECK: ![[VAR:[0-9]+]] = !DILocalVariable(name: "c"
-# 
-# CHECK-LABEL: bb.0.entry:
-# CHECK:       DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
-# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi
-# CHECK:       $rbx = COPY killed $rdi
-# CHECK-NEXT:  DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
-# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rsi
-# CHECK-LABEL: bb.1:
-# CHECK:       DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
-# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rsi
-# CHECK:       $rbp = COPY killed $rsi
-# CHECK-NEXT:  DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
-# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rbp
-# CHECK-LABEL: bb.2:
-# CHECK:       DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
-# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rbp
---- |
-  ; ModuleID = 'test.cpp'
-  source_filename = "test.cpp"
-  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-  target triple = "x86_64-unknown-linux-gnu"
-  
-  ; Function Attrs: norecurse nounwind readnone uwtable
-  define dso_local i32 @_Z3fooii(i32 %a, i32 %b) local_unnamed_addr !dbg !7 {
-  entry:
-    ret i32 0, !dbg !17
-  }
-  
-  ; Function Attrs: nounwind readnone speculatable willreturn
-  declare void @llvm.dbg.value(metadata, metadata, metadata)
-  
-  !llvm.dbg.cu = !{!0}
-  !llvm.module.flags = !{!3, !4, !5}
-  !llvm.ident = !{!6}
-  
-  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
-  !1 = !DIFile(filename: "test.cpp", directory: "/")
-  !2 = !{}
-  !3 = !{i32 7, !"Dwarf Version", i32 4}
-  !4 = !{i32 2, !"Debug Info Version", i32 3}
-  !5 = !{i32 1, !"wchar_size", i32 4}
-  !6 = !{!"clang version 11.0.0"}
-  !7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooii", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
-  !8 = !DISubroutineType(types: !9)
-  !9 = !{!10, !10, !10}
-  !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-  !11 = !{!12, !13, !14}
-  !12 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 2, type: !10)
-  !13 = !DILocalVariable(name: "b", arg: 2, scope: !7, file: !1, line: 2, type: !10)
-  !14 = !DILocalVariable(name: "c", scope: !7, file: !1, line: 3, type: !10)
-  !15 = !DILocation(line: 0, scope: !7)
-  !16 = !DILocation(line: 4, column: 12, scope: !7)
-  !17 = !DILocation(line: 4, column: 3, scope: !7)
-
-...
----
-name:            _Z3fooii
-fixedStack:
-  - { id: 0, type: spill-slot, offset: -32, size: 8, alignment: 16, stack-id: default,
-      callee-saved-register: '$rbx', callee-saved-restored: true }
-  - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default,
-      callee-saved-register: '$rbp', callee-saved-restored: true }
-body:             |
-  bb.0.entry:
-    liveins: $rdi, $rsi
-
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
-    $rbx = COPY killed $rdi
-    $rdi = MOV64ri 0
-    JMP_1 %bb.1
-
-  bb.1:
-    liveins: $rbx, $rsi
-    $rbp = COPY killed $rsi
-    $rsi = MOV64ri 0
-    JMP_1 %bb.2
-
-  bb.2:
-    liveins: $rbx, $rbp
-    RET64 $rbp, debug-location !17
-
-...
-
+# RUN: llc -mtriple=x86_64-- -run-pass livedebugvalues -o - %s -experimental-debug-variable-locations=false | FileCheck %s --implicit-check-not=DBG_VALUE_LIST
+#
+# The MIR below moves values from argument registers to callee saved registers,
+# moves that are followed by DBG_VALUEs and which should also result in
+# DBG_VALUE_LISTs moving their operands.
+#
+# CHECK: ![[VAR:[0-9]+]] = !DILocalVariable(name: "c"
+#
+# CHECK-LABEL: bb.0.entry:
+# CHECK:       DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
+# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi
+# CHECK:       $rbx = COPY killed $rdi
+# CHECK-NEXT:  DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
+# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rsi
+# CHECK-LABEL: bb.1:
+# CHECK:       DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
+# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rsi
+# CHECK:       $rbp = COPY killed $rsi
+# CHECK-NEXT:  DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
+# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rbp
+# CHECK-LABEL: bb.2:
+# CHECK:       DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0,
+# CHECK-SAME:                        DW_OP_LLVM_arg, 1, DW_OP_plus), $rbx, $rbp
+--- |
+  ; ModuleID = 'test.cpp'
+  source_filename = "test.cpp"
+  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-unknown-linux-gnu"
+
+  ; Function Attrs: norecurse nounwind readnone uwtable
+  define dso_local i32 @_Z3fooii(i32 %a, i32 %b) local_unnamed_addr !dbg !7 {
+  entry:
+    ret i32 0, !dbg !17
+  }
+
+  ; Function Attrs: nounwind readnone speculatable willreturn
+  declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4, !5}
+  !llvm.ident = !{!6}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+  !1 = !DIFile(filename: "test.cpp", directory: "/")
+  !2 = !{}
+  !3 = !{i32 7, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = !{i32 1, !"wchar_size", i32 4}
+  !6 = !{!"clang version 11.0.0"}
+  !7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooii", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
+  !8 = !DISubroutineType(types: !9)
+  !9 = !{!10, !10, !10}
+  !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !11 = !{!12, !13, !14}
+  !12 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 2, type: !10)
+  !13 = !DILocalVariable(name: "b", arg: 2, scope: !7, file: !1, line: 2, type: !10)
+  !14 = !DILocalVariable(name: "c", scope: !7, file: !1, line: 3, type: !10)
+  !15 = !DILocation(line: 0, scope: !7)
+  !16 = !DILocation(line: 4, column: 12, scope: !7)
+  !17 = !DILocation(line: 4, column: 3, scope: !7)
+
+...
+---
+name:            _Z3fooii
+fixedStack:
+  - { id: 0, type: spill-slot, offset: -32, size: 8, alignment: 16, stack-id: default,
+      callee-saved-register: '$rbx', callee-saved-restored: true }
+  - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default,
+      callee-saved-register: '$rbp', callee-saved-restored: true }
+body:             |
+  bb.0.entry:
+    liveins: $rdi, $rsi
+
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
+    $rbx = COPY killed $rdi
+    $rdi = MOV64ri 0
+    JMP_1 %bb.1
+
+  bb.1:
+    liveins: $rbx, $rsi
+    $rbp = COPY killed $rsi
+    $rsi = MOV64ri 0
+    JMP_1 %bb.2
+
+  bb.2:
+    liveins: $rbx, $rbp
+    RET64 $rbp, debug-location !17
+
+...
+
diff --git a/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-spillrestore.mir b/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-spillrestore.mir
index c5bd084563cd..8d6b879adbdb 100644
--- a/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-spillrestore.mir
+++ b/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvalues-spillrestore.mir
@@ -1,77 +1,77 @@
-# RUN: llc -mtriple=x86_64-- -run-pass livedebugvalues -o - %s -experimental-debug-variable-locations=false | FileCheck %s --implicit-check-not=DBG_VALUE_LIST
-#
-# A DBG_VALUE_LIST that has a component spilt and restored should had its
-# expression and operands updated to refer to the stack for that period, and
-# then return to normal once the value is restored.
-#
-# CHECK: ![[VAR:[0-9]+]] = !DILocalVariable(name: "c"
-# 
-# CHECK-LABEL: bb.0.entry:
-# CHECK:       DBG_VALUE_LIST ![[VAR]],
-# CHECK-SAME:  !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus),
-# CHECK-SAME:  $rdi, $rsi,
-# CHECK:       MOV64mr $rsp, 1, $noreg, -16, $noreg, $rdi
-# CHECK-NEXT:  DBG_VALUE_LIST ![[VAR]],
-# CHECK-SAME:  !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_LLVM_arg, 1, DW_OP_plus),
-# CHECK-SAME:                $rsp, $rsi,
-# CHECK:       $rdi = MOV64rm $rsp, 1, $noreg, -16,
-# CHECK-NEXT:  DBG_VALUE_LIST ![[VAR]],
-# CHECK-SAME:  !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus),
-# CHECK-SAME:  $rdi, $rsi,
-
---- |
-  ; ModuleID = 'test.cpp'
-  source_filename = "test.cpp"
-  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-  target triple = "x86_64-unknown-linux-gnu"
-  
-  ; Function Attrs: norecurse nounwind readnone uwtable
-  define dso_local i32 @_Z3fooii(i32 %a, i32 %b) local_unnamed_addr !dbg !7 {
-  entry:
-    ret i32 0, !dbg !17
-  }
-  
-  ; Function Attrs: nounwind readnone speculatable willreturn
-  declare void @llvm.dbg.value(metadata, metadata, metadata)
-  
-  !llvm.dbg.cu = !{!0}
-  !llvm.module.flags = !{!3, !4, !5}
-  !llvm.ident = !{!6}
-  
-  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
-  !1 = !DIFile(filename: "test.cpp", directory: "/")
-  !2 = !{}
-  !3 = !{i32 7, !"Dwarf Version", i32 4}
-  !4 = !{i32 2, !"Debug Info Version", i32 3}
-  !5 = !{i32 1, !"wchar_size", i32 4}
-  !6 = !{!"clang version 11.0.0"}
-  !7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooii", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
-  !8 = !DISubroutineType(types: !9)
-  !9 = !{!10, !10, !10}
-  !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-  !11 = !{!12, !13, !14}
-  !12 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 2, type: !10)
-  !13 = !DILocalVariable(name: "b", arg: 2, scope: !7, file: !1, line: 2, type: !10)
-  !14 = !DILocalVariable(name: "c", scope: !7, file: !1, line: 3, type: !10)
-  !15 = !DILocation(line: 0, scope: !7)
-  !16 = !DILocation(line: 4, column: 12, scope: !7)
-  !17 = !DILocation(line: 4, column: 3, scope: !7)
-
-...
----
-name:            _Z3fooii
-stack:
-  - { id: 0, offset: -16, size: 8, alignment: 8, type: spill-slot }
-body:             |
-  bb.0.entry:
-    liveins: $rdi, $rsi
-  
-    $rsp = frame-setup SUB64ri8 $rsp, 24, implicit-def dead $eflags
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
-    MOV64mr $rsp, 1, _, -16, _, $rdi, debug-location !15 :: (store (s64) into %stack.0)
-    $rax = COPY killed $rdi
-    $rdi = MOV64ri 0
-    $rdi = MOV64rm $rsp, 1, $noreg, -16, $noreg, debug-location !15 :: (load (s64) from %stack.0)
-    RET64
-...
-
+# RUN: llc -mtriple=x86_64-- -run-pass livedebugvalues -o - %s -experimental-debug-variable-locations=false | FileCheck %s --implicit-check-not=DBG_VALUE_LIST
+#
+# A DBG_VALUE_LIST that has a component spilt and restored should had its
+# expression and operands updated to refer to the stack for that period, and
+# then return to normal once the value is restored.
+#
+# CHECK: ![[VAR:[0-9]+]] = !DILocalVariable(name: "c"
+#
+# CHECK-LABEL: bb.0.entry:
+# CHECK:       DBG_VALUE_LIST ![[VAR]],
+# CHECK-SAME:  !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus),
+# CHECK-SAME:  $rdi, $rsi,
+# CHECK:       MOV64mr $rsp, 1, $noreg, -16, $noreg, $rdi
+# CHECK-NEXT:  DBG_VALUE_LIST ![[VAR]],
+# CHECK-SAME:  !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_LLVM_arg, 1, DW_OP_plus),
+# CHECK-SAME:                $rsp, $rsi,
+# CHECK:       $rdi = MOV64rm $rsp, 1, $noreg, -16,
+# CHECK-NEXT:  DBG_VALUE_LIST ![[VAR]],
+# CHECK-SAME:  !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus),
+# CHECK-SAME:  $rdi, $rsi,
+
+--- |
+  ; ModuleID = 'test.cpp'
+  source_filename = "test.cpp"
+  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-unknown-linux-gnu"
+
+  ; Function Attrs: norecurse nounwind readnone uwtable
+  define dso_local i32 @_Z3fooii(i32 %a, i32 %b) local_unnamed_addr !dbg !7 {
+  entry:
+    ret i32 0, !dbg !17
+  }
+
+  ; Function Attrs: nounwind readnone speculatable willreturn
+  declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4, !5}
+  !llvm.ident = !{!6}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+  !1 = !DIFile(filename: "test.cpp", directory: "/")
+  !2 = !{}
+  !3 = !{i32 7, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = !{i32 1, !"wchar_size", i32 4}
+  !6 = !{!"clang version 11.0.0"}
+  !7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooii", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
+  !8 = !DISubroutineType(types: !9)
+  !9 = !{!10, !10, !10}
+  !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !11 = !{!12, !13, !14}
+  !12 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 2, type: !10)
+  !13 = !DILocalVariable(name: "b", arg: 2, scope: !7, file: !1, line: 2, type: !10)
+  !14 = !DILocalVariable(name: "c", scope: !7, file: !1, line: 3, type: !10)
+  !15 = !DILocation(line: 0, scope: !7)
+  !16 = !DILocation(line: 4, column: 12, scope: !7)
+  !17 = !DILocation(line: 4, column: 3, scope: !7)
+
+...
+---
+name:            _Z3fooii
+stack:
+  - { id: 0, offset: -16, size: 8, alignment: 8, type: spill-slot }
+body:             |
+  bb.0.entry:
+    liveins: $rdi, $rsi
+
+    $rsp = frame-setup SUB64ri8 $rsp, 24, implicit-def dead $eflags
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rdi, $rsi, debug-location !15
+    MOV64mr $rsp, 1, _, -16, _, $rdi, debug-location !15 :: (store (s64) into %stack.0)
+    $rax = COPY killed $rdi
+    $rdi = MOV64ri 0
+    $rdi = MOV64rm $rsp, 1, $noreg, -16, $noreg, debug-location !15 :: (load (s64) from %stack.0)
+    RET64
+...
+
diff --git a/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvars-movements.mir b/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvars-movements.mir
index 20536b674390..de69a6f034f1 100644
--- a/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvars-movements.mir
+++ b/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvars-movements.mir
@@ -1,109 +1,109 @@
-# RUN: llc -start-after=phi-node-elimination -stop-after=virtregrewriter %s -mtriple=x86_64-unknown-unknown -o - | FileCheck %s
-#
-# Test that when a livedebugvars interval is split, DBG_VALUE_LISTs are created
-# with the correct operands and exprs. Force values to be moved around between
-# registers and stack through inline asm blocks that clobber things.
-#
-# CHECK-LABEL: bb.0.entry:
-# CHECK:       $rbx = COPY $rsi
-# CHECK-NEXT:  MOV64mr %stack.0, 1, $noreg, 0, $noreg, $rdi
-# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref, DW_OP_LLVM_arg, 1, DW_OP_plus), %stack.0, $rbx,
-# CHECK-NEXT:  INLINEASM
-# CHECK-NEXT:  $rax = MOV64rm %stack.0,
-# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rax, $rbx,
-# CHECK-NEXT:  CALL64pcrel32 @foo
-# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref, DW_OP_LLVM_arg, 1, DW_OP_plus), %stack.0, $rbx,
-# CHECK-NEXT:  $rcx = COPY killed renamable $rbx
-# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref, DW_OP_LLVM_arg, 1, DW_OP_plus), %stack.0, $rcx,
-# CHECK-NEXT:  INLINEASM
-# CHECK-NEXT:  $rax = MOV64rm %stack.0
-# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rax, $rcx,
-
---- |
-  ; ModuleID = 'tmp.ll'
-  source_filename = "tmp.ll"
-  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-  
-  %struct.a = type { i32 }
-  
-  ; Function Attrs: nounwind ssp
-  define i32 @bar(ptr nocapture %b, i32 %shoes) !dbg !4 {
-  entry:
-    tail call void @llvm.dbg.value(metadata i32 %shoes, metadata !9, metadata !DIExpression()), !dbg !16
-    %tmp1 = getelementptr inbounds %struct.a, ptr %b, i64 0, i32 0, !dbg !17
-    br label %bb3
-  
-  bb1:                                              ; preds = %bb2
-    tail call void @llvm.dbg.value(metadata i32 %shoes, metadata !9, metadata !DIExpression()), !dbg !16
-    %add = add nsw i32 %tmp2, 1, !dbg !18
-    br label %exit
-  
-  bb2:                                              ; preds = %bb3
-    tail call void @llvm.dbg.value(metadata i32 %tmp2, metadata !14, metadata !DIExpression()), !dbg !17
-    %call = tail call i32 (...) @foo(i32 %tmp2), !dbg !19
-    br label %bb1
-  
-  bb3:                                              ; preds = %entry
-    %tmp2 = load i32, ptr %tmp1, align 4, !dbg !17
-    br label %bb2
-  
-  exit:                                             ; preds = %bb1
-    ret i32 %shoes, !dbg !18
-  }
-  
-  declare i32 @foo(...)
-  
-  ; Function Attrs: nounwind readnone speculatable willreturn
-  declare void @llvm.dbg.value(metadata, metadata, metadata)
-  
-  !llvm.dbg.cu = !{!0}
-  !llvm.module.flags = !{!3}
-  
-  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 2.9 (trunk 122997)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2)
-  !1 = !DIFile(filename: "bar.c", directory: "/private/tmp")
-  !2 = !{}
-  !3 = !{i32 1, !"Debug Info Version", i32 3}
-  !4 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !5, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
-  !5 = !DISubroutineType(types: !6)
-  !6 = !{!7}
-  !7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-  !8 = !{!9, !14}
-  !9 = !DILocalVariable(name: "b", arg: 1, scope: !4, file: !1, line: 5, type: !10)
-  !10 = !DIDerivedType(tag: DW_TAG_pointer_type, scope: !0, baseType: !11, size: 64, align: 64)
-  !11 = !DICompositeType(tag: DW_TAG_structure_type, name: "a", scope: !0, file: !1, line: 1, size: 32, align: 32, elements: !12)
-  !12 = !{!13}
-  !13 = !DIDerivedType(tag: DW_TAG_member, name: "c", scope: !1, file: !1, line: 2, baseType: !7, size: 32, align: 32)
-  !14 = !DILocalVariable(name: "x", scope: !15, file: !1, line: 6, type: !7)
-  !15 = distinct !DILexicalBlock(scope: !4, file: !1, line: 5, column: 22)
-  !16 = !DILocation(line: 5, column: 19, scope: !4)
-  !17 = !DILocation(line: 6, column: 14, scope: !15)
-  !18 = !DILocation(line: 8, column: 2, scope: !15)
-  !19 = !DILocation(line: 7, column: 2, scope: !15)
-
-...
----
-name:            bar
-tracksRegLiveness: true
-body:             |
-  bb.0.entry:
-    liveins: $rdi, $rsi
-  
-    %4:gr64= COPY $rsi
-    %2:gr64 = COPY $rdi
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), %2, %4, debug-location !17
-    %3:gr64 = COPY killed %2
-    %5:gr64 = COPY killed %4
-
-    ; Force allocation into $rax and $rbx
-    INLINEASM &"", 1, 12, implicit-def dead $rcx, 12, implicit-def dead $rdx, 12, implicit-def dead $rsi, 12, implicit-def dead $rdi, 12, implicit-def $rbp, 12, implicit-def dead $r8, 12, implicit-def dead $r9, 12, implicit-def dead $r10, 12, implicit-def dead $r11, 12, implicit-def dead $r12, 12, implicit-def dead $r13, 12, implicit-def dead $r14, 12, implicit-def dead $r15, 12, !18, debug-location !17
-
-    ; Force a use of these two registers.
-    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit %3, implicit %5
-
-    ; Now make the register allocator move them to rcx and rdx!
-    INLINEASM &"", 1, 12, implicit-def dead $rax, 12, implicit-def dead $rbx, 12, implicit-def dead $rsi, 12, implicit-def dead $rdi, 12, implicit-def $rbp, 12, implicit-def dead $r8, 12, implicit-def dead $r9, 12, implicit-def dead $r10, 12, implicit-def dead $r11, 12, implicit-def dead $r12, 12, implicit-def dead $r13, 12, implicit-def dead $r14, 12, implicit-def dead $r15, 12, !18, debug-location !17
-
-    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit %3, implicit %5
-
-    RET64
-...
+# RUN: llc -start-after=phi-node-elimination -stop-after=virtregrewriter %s -mtriple=x86_64-unknown-unknown -o - | FileCheck %s
+#
+# Test that when a livedebugvars interval is split, DBG_VALUE_LISTs are created
+# with the correct operands and exprs. Force values to be moved around between
+# registers and stack through inline asm blocks that clobber things.
+#
+# CHECK-LABEL: bb.0.entry:
+# CHECK:       $rbx = COPY $rsi
+# CHECK-NEXT:  MOV64mr %stack.0, 1, $noreg, 0, $noreg, $rdi
+# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref, DW_OP_LLVM_arg, 1, DW_OP_plus), %stack.0, $rbx,
+# CHECK-NEXT:  INLINEASM
+# CHECK-NEXT:  $rax = MOV64rm %stack.0,
+# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rax, $rbx,
+# CHECK-NEXT:  CALL64pcrel32 @foo
+# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref, DW_OP_LLVM_arg, 1, DW_OP_plus), %stack.0, $rbx,
+# CHECK-NEXT:  $rcx = COPY killed renamable $rbx
+# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref, DW_OP_LLVM_arg, 1, DW_OP_plus), %stack.0, $rcx,
+# CHECK-NEXT:  INLINEASM
+# CHECK-NEXT:  $rax = MOV64rm %stack.0
+# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rax, $rcx,
+
+--- |
+  ; ModuleID = 'tmp.ll'
+  source_filename = "tmp.ll"
+  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+  %struct.a = type { i32 }
+
+  ; Function Attrs: nounwind ssp
+  define i32 @bar(ptr nocapture %b, i32 %shoes) !dbg !4 {
+  entry:
+    tail call void @llvm.dbg.value(metadata i32 %shoes, metadata !9, metadata !DIExpression()), !dbg !16
+    %tmp1 = getelementptr inbounds %struct.a, ptr %b, i64 0, i32 0, !dbg !17
+    br label %bb3
+
+  bb1:                                              ; preds = %bb2
+    tail call void @llvm.dbg.value(metadata i32 %shoes, metadata !9, metadata !DIExpression()), !dbg !16
+    %add = add nsw i32 %tmp2, 1, !dbg !18
+    br label %exit
+
+  bb2:                                              ; preds = %bb3
+    tail call void @llvm.dbg.value(metadata i32 %tmp2, metadata !14, metadata !DIExpression()), !dbg !17
+    %call = tail call i32 (...) @foo(i32 %tmp2), !dbg !19
+    br label %bb1
+
+  bb3:                                              ; preds = %entry
+    %tmp2 = load i32, ptr %tmp1, align 4, !dbg !17
+    br label %bb2
+
+  exit:                                             ; preds = %bb1
+    ret i32 %shoes, !dbg !18
+  }
+
+  declare i32 @foo(...)
+
+  ; Function Attrs: nounwind readnone speculatable willreturn
+  declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 2.9 (trunk 122997)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2)
+  !1 = !DIFile(filename: "bar.c", directory: "/private/tmp")
+  !2 = !{}
+  !3 = !{i32 1, !"Debug Info Version", i32 3}
+  !4 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !5, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+  !5 = !DISubroutineType(types: !6)
+  !6 = !{!7}
+  !7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+  !8 = !{!9, !14}
+  !9 = !DILocalVariable(name: "b", arg: 1, scope: !4, file: !1, line: 5, type: !10)
+  !10 = !DIDerivedType(tag: DW_TAG_pointer_type, scope: !0, baseType: !11, size: 64, align: 64)
+  !11 = !DICompositeType(tag: DW_TAG_structure_type, name: "a", scope: !0, file: !1, line: 1, size: 32, align: 32, elements: !12)
+  !12 = !{!13}
+  !13 = !DIDerivedType(tag: DW_TAG_member, name: "c", scope: !1, file: !1, line: 2, baseType: !7, size: 32, align: 32)
+  !14 = !DILocalVariable(name: "x", scope: !15, file: !1, line: 6, type: !7)
+  !15 = distinct !DILexicalBlock(scope: !4, file: !1, line: 5, column: 22)
+  !16 = !DILocation(line: 5, column: 19, scope: !4)
+  !17 = !DILocation(line: 6, column: 14, scope: !15)
+  !18 = !DILocation(line: 8, column: 2, scope: !15)
+  !19 = !DILocation(line: 7, column: 2, scope: !15)
+
+...
+---
+name:            bar
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $rdi, $rsi
+
+    %4:gr64= COPY $rsi
+    %2:gr64 = COPY $rdi
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), %2, %4, debug-location !17
+    %3:gr64 = COPY killed %2
+    %5:gr64 = COPY killed %4
+
+    ; Force allocation into $rax and $rbx
+    INLINEASM &"", 1, 12, implicit-def dead $rcx, 12, implicit-def dead $rdx, 12, implicit-def dead $rsi, 12, implicit-def dead $rdi, 12, implicit-def $rbp, 12, implicit-def dead $r8, 12, implicit-def dead $r9, 12, implicit-def dead $r10, 12, implicit-def dead $r11, 12, implicit-def dead $r12, 12, implicit-def dead $r13, 12, implicit-def dead $r14, 12, implicit-def dead $r15, 12, !18, debug-location !17
+
+    ; Force a use of these two registers.
+    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit %3, implicit %5
+
+    ; Now make the register allocator move them to rcx and rdx!
+    INLINEASM &"", 1, 12, implicit-def dead $rax, 12, implicit-def dead $rbx, 12, implicit-def dead $rsi, 12, implicit-def dead $rdi, 12, implicit-def $rbp, 12, implicit-def dead $r8, 12, implicit-def dead $r9, 12, implicit-def dead $r10, 12, implicit-def dead $r11, 12, implicit-def dead $r12, 12, implicit-def dead $r13, 12, implicit-def dead $r14, 12, implicit-def dead $r15, 12, !18, debug-location !17
+
+    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit %3, implicit %5
+
+    RET64
+...
diff --git a/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvars-stackptr.mir b/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvars-stackptr.mir
index 363ef4395eaf..ef495e9bac82 100644
--- a/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvars-stackptr.mir
+++ b/llvm/test/DebugInfo/MIR/X86/dvl-livedebugvars-stackptr.mir
@@ -1,113 +1,113 @@
-# RUN: llc -start-after=phi-node-elimination -stop-after=virtregrewriter %s -mtriple=x86_64-unknown-unknown -o - | FileCheck %s
-#
-# This is a copy of the adjacent "-movements.mir" file, but where one of the
-# operands to DBG_VALUE_LIST is a stack _pointer_ rather than a spilt value.
-# The expression should grow no additional derefs for the stack pointer.
-#
-# CHECK-LABEL: bb.0.entry:
-# CHECK:       $rbx = COPY $rsi
-# CHECK-NEXT:  MOV64mr %stack.1, 1, $noreg, 0, $noreg, $rdi
-# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref, DW_OP_LLVM_arg, 1, DW_OP_plus), %stack.1, %stack.0.local1
-# CHECK-NEXT:  INLINEASM
-# CHECK-NEXT:  $rax = MOV64rm %stack.1,
-# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rax, %stack.0.local1,
-# CHECK-NEXT:  CALL64pcrel32 @foo
-# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref, DW_OP_LLVM_arg, 1, DW_OP_plus), %stack.1, %stack.0.local1,
-# CHECK-NEXT:  $rcx = COPY killed renamable $rbx
-# CHECK-NEXT:  INLINEASM
-# CHECK-NEXT:  $rax = MOV64rm %stack.1
-# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rax, %stack.0.local1,
-
---- |
-  ; ModuleID = 'tmp.ll'
-  source_filename = "tmp.ll"
-  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-  
-  %struct.a = type { i32 }
-  
-  ; Function Attrs: nounwind ssp
-  define i32 @bar(ptr nocapture %b, i32 %shoes) !dbg !4 {
-  entry:
-    %local1 = alloca i64
-    tail call void @llvm.dbg.value(metadata i32 %shoes, metadata !9, metadata !DIExpression()), !dbg !16
-    %tmp1 = getelementptr inbounds %struct.a, ptr %b, i64 0, i32 0, !dbg !17
-    br label %bb3
-  
-  bb1:                                              ; preds = %bb2
-    tail call void @llvm.dbg.value(metadata i32 %shoes, metadata !9, metadata !DIExpression()), !dbg !16
-    %add = add nsw i32 %tmp2, 1, !dbg !18
-    br label %exit
-  
-  bb2:                                              ; preds = %bb3
-    tail call void @llvm.dbg.value(metadata i32 %tmp2, metadata !14, metadata !DIExpression()), !dbg !17
-    %call = tail call i32 (...) @foo(i32 %tmp2), !dbg !19
-    br label %bb1
-  
-  bb3:                                              ; preds = %entry
-    %tmp2 = load i32, ptr %tmp1, align 4, !dbg !17
-    br label %bb2
-  
-  exit:                                             ; preds = %bb1
-    ret i32 %shoes, !dbg !18
-  }
-  
-  declare i32 @foo(...)
-  
-  ; Function Attrs: nounwind readnone speculatable willreturn
-  declare void @llvm.dbg.value(metadata, metadata, metadata)
-  
-  !llvm.dbg.cu = !{!0}
-  !llvm.module.flags = !{!3}
-  
-  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 2.9 (trunk 122997)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2)
-  !1 = !DIFile(filename: "bar.c", directory: "/private/tmp")
-  !2 = !{}
-  !3 = !{i32 1, !"Debug Info Version", i32 3}
-  !4 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !5, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
-  !5 = !DISubroutineType(types: !6)
-  !6 = !{!7}
-  !7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-  !8 = !{!9, !14}
-  !9 = !DILocalVariable(name: "b", arg: 1, scope: !4, file: !1, line: 5, type: !10)
-  !10 = !DIDerivedType(tag: DW_TAG_pointer_type, scope: !0, baseType: !11, size: 64, align: 64)
-  !11 = !DICompositeType(tag: DW_TAG_structure_type, name: "a", scope: !0, file: !1, line: 1, size: 32, align: 32, elements: !12)
-  !12 = !{!13}
-  !13 = !DIDerivedType(tag: DW_TAG_member, name: "c", scope: !1, file: !1, line: 2, baseType: !7, size: 32, align: 32)
-  !14 = !DILocalVariable(name: "x", scope: !15, file: !1, line: 6, type: !7)
-  !15 = distinct !DILexicalBlock(scope: !4, file: !1, line: 5, column: 22)
-  !16 = !DILocation(line: 5, column: 19, scope: !4)
-  !17 = !DILocation(line: 6, column: 14, scope: !15)
-  !18 = !DILocation(line: 8, column: 2, scope: !15)
-  !19 = !DILocation(line: 7, column: 2, scope: !15)
-
-...
----
-name:            bar
-tracksRegLiveness: true
-stack:
-  - { id: 0, name: local1, type: default, offset: -24, size: 8, alignment: 8,
-      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-body:             |
-  bb.0.entry:
-    liveins: $rdi, $rsi
-  
-    %4:gr64= COPY $rsi
-    %2:gr64 = COPY $rdi
-    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), %2, %stack.0, debug-location !17
-    %3:gr64 = COPY killed %2
-    %5:gr64 = COPY killed %4
-
-    ; Force allocation into $rax and $rbx
-    INLINEASM &"", 1, 12, implicit-def dead $rcx, 12, implicit-def dead $rdx, 12, implicit-def dead $rsi, 12, implicit-def dead $rdi, 12, implicit-def $rbp, 12, implicit-def dead $r8, 12, implicit-def dead $r9, 12, implicit-def dead $r10, 12, implicit-def dead $r11, 12, implicit-def dead $r12, 12, implicit-def dead $r13, 12, implicit-def dead $r14, 12, implicit-def dead $r15, 12, !18, debug-location !17
-
-    ; Force a use of these two registers.
-    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit %3, implicit %5
-
-    ; Now make the register allocator move them to rcx and rdx!
-    INLINEASM &"", 1, 12, implicit-def dead $rax, 12, implicit-def dead $rbx, 12, implicit-def dead $rsi, 12, implicit-def dead $rdi, 12, implicit-def $rbp, 12, implicit-def dead $r8, 12, implicit-def dead $r9, 12, implicit-def dead $r10, 12, implicit-def dead $r11, 12, implicit-def dead $r12, 12, implicit-def dead $r13, 12, implicit-def dead $r14, 12, implicit-def dead $r15, 12, !18, debug-location !17
-
-    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit %3, implicit %5
-
-    RET64
-...
+# RUN: llc -start-after=phi-node-elimination -stop-after=virtregrewriter %s -mtriple=x86_64-unknown-unknown -o - | FileCheck %s
+#
+# This is a copy of the adjacent "-movements.mir" file, but where one of the
+# operands to DBG_VALUE_LIST is a stack _pointer_ rather than a spilt value.
+# The expression should grow no additional derefs for the stack pointer.
+#
+# CHECK-LABEL: bb.0.entry:
+# CHECK:       $rbx = COPY $rsi
+# CHECK-NEXT:  MOV64mr %stack.1, 1, $noreg, 0, $noreg, $rdi
+# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref, DW_OP_LLVM_arg, 1, DW_OP_plus), %stack.1, %stack.0.local1
+# CHECK-NEXT:  INLINEASM
+# CHECK-NEXT:  $rax = MOV64rm %stack.1,
+# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rax, %stack.0.local1,
+# CHECK-NEXT:  CALL64pcrel32 @foo
+# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref, DW_OP_LLVM_arg, 1, DW_OP_plus), %stack.1, %stack.0.local1,
+# CHECK-NEXT:  $rcx = COPY killed renamable $rbx
+# CHECK-NEXT:  INLINEASM
+# CHECK-NEXT:  $rax = MOV64rm %stack.1
+# CHECK-NEXT:  DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), $rax, %stack.0.local1,
+
+--- |
+  ; ModuleID = 'tmp.ll'
+  source_filename = "tmp.ll"
+  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+  %struct.a = type { i32 }
+
+  ; Function Attrs: nounwind ssp
+  define i32 @bar(ptr nocapture %b, i32 %shoes) !dbg !4 {
+  entry:
+    %local1 = alloca i64
+    tail call void @llvm.dbg.value(metadata i32 %shoes, metadata !9, metadata !DIExpression()), !dbg !16
+    %tmp1 = getelementptr inbounds %struct.a, ptr %b, i64 0, i32 0, !dbg !17
+    br label %bb3
+
+  bb1:                                              ; preds = %bb2
+    tail call void @llvm.dbg.value(metadata i32 %shoes, metadata !9, metadata !DIExpression()), !dbg !16
+    %add = add nsw i32 %tmp2, 1, !dbg !18
+    br label %exit
+
+  bb2:                                              ; preds = %bb3
+    tail call void @llvm.dbg.value(metadata i32 %tmp2, metadata !14, metadata !DIExpression()), !dbg !17
+    %call = tail call i32 (...) @foo(i32 %tmp2), !dbg !19
+    br label %bb1
+
+  bb3:                                              ; preds = %entry
+    %tmp2 = load i32, ptr %tmp1, align 4, !dbg !17
+    br label %bb2
+
+  exit:                                             ; preds = %bb1
+    ret i32 %shoes, !dbg !18
+  }
+
+  declare i32 @foo(...)
+
+  ; Function Attrs: nounwind readnone speculatable willreturn
+  declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 2.9 (trunk 122997)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2)
+  !1 = !DIFile(filename: "bar.c", directory: "/private/tmp")
+  !2 = !{}
+  !3 = !{i32 1, !"Debug Info Version", i32 3}
+  !4 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !5, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+  !5 = !DISubroutineType(types: !6)
+  !6 = !{!7}
+  !7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+  !8 = !{!9, !14}
+  !9 = !DILocalVariable(name: "b", arg: 1, scope: !4, file: !1, line: 5, type: !10)
+  !10 = !DIDerivedType(tag: DW_TAG_pointer_type, scope: !0, baseType: !11, size: 64, align: 64)
+  !11 = !DICompositeType(tag: DW_TAG_structure_type, name: "a", scope: !0, file: !1, line: 1, size: 32, align: 32, elements: !12)
+  !12 = !{!13}
+  !13 = !DIDerivedType(tag: DW_TAG_member, name: "c", scope: !1, file: !1, line: 2, baseType: !7, size: 32, align: 32)
+  !14 = !DILocalVariable(name: "x", scope: !15, file: !1, line: 6, type: !7)
+  !15 = distinct !DILexicalBlock(scope: !4, file: !1, line: 5, column: 22)
+  !16 = !DILocation(line: 5, column: 19, scope: !4)
+  !17 = !DILocation(line: 6, column: 14, scope: !15)
+  !18 = !DILocation(line: 8, column: 2, scope: !15)
+  !19 = !DILocation(line: 7, column: 2, scope: !15)
+
+...
+---
+name:            bar
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: local1, type: default, offset: -24, size: 8, alignment: 8,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+body:             |
+  bb.0.entry:
+    liveins: $rdi, $rsi
+
+    %4:gr64= COPY $rsi
+    %2:gr64 = COPY $rdi
+    DBG_VALUE_LIST !14, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), %2, %stack.0, debug-location !17
+    %3:gr64 = COPY killed %2
+    %5:gr64 = COPY killed %4
+
+    ; Force allocation into $rax and $rbx
+    INLINEASM &"", 1, 12, implicit-def dead $rcx, 12, implicit-def dead $rdx, 12, implicit-def dead $rsi, 12, implicit-def dead $rdi, 12, implicit-def $rbp, 12, implicit-def dead $r8, 12, implicit-def dead $r9, 12, implicit-def dead $r10, 12, implicit-def dead $r11, 12, implicit-def dead $r12, 12, implicit-def dead $r13, 12, implicit-def dead $r14, 12, implicit-def dead $r15, 12, !18, debug-location !17
+
+    ; Force a use of these two registers.
+    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit %3, implicit %5
+
+    ; Now make the register allocator move them to rcx and rdx!
+    INLINEASM &"", 1, 12, implicit-def dead $rax, 12, implicit-def dead $rbx, 12, implicit-def dead $rsi, 12, implicit-def dead $rdi, 12, implicit-def $rbp, 12, implicit-def dead $r8, 12, implicit-def dead $r9, 12, implicit-def dead $r10, 12, implicit-def dead $r11, 12, implicit-def dead $r12, 12, implicit-def dead $r13, 12, implicit-def dead $r14, 12, implicit-def dead $r15, 12, !18, debug-location !17
+
+    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit %3, implicit %5
+
+    RET64
+...
diff --git a/llvm/test/DebugInfo/MIR/X86/instr-ref-join-def-vphi.mir b/llvm/test/DebugInfo/MIR/X86/instr-ref-join-def-vphi.mir
index 35bd7cddca67..a23d80ea64b3 100644
--- a/llvm/test/DebugInfo/MIR/X86/instr-ref-join-def-vphi.mir
+++ b/llvm/test/DebugInfo/MIR/X86/instr-ref-join-def-vphi.mir
@@ -12,7 +12,7 @@
 # CHECK: ![[VAR:[0-9]+]] = !DILocalVariable(name: "a"
 
 # CHECK-LABEL: bb.6
-# CHECK: DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0), $esi
+# CHECK: DBG_VALUE_LIST ![[VAR]], !DIExpression(DW_OP_LLVM_arg, 0), $esi
 
 --- |
   target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -198,7 +198,7 @@ body:             |
 
     DBG_PHI $esi, 3
     DBG_PHI $edi, 2
-    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 0), debug-location !16
+    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(3, 0), debug-location !16
     CALL64pcrel32 @"?bar@@YAHXZ", csr_win64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $eax, debug-location !19
     renamable $edi = nsw SUB32rr killed renamable $edi, killed renamable $eax, implicit-def dead $eflags, debug-instr-number 1, debug-location !19
     renamable $eax = IMUL32rri renamable $edi, -1431655765, implicit-def dead $eflags, debug-location !21
@@ -212,7 +212,7 @@ body:             |
 
     CALL64pcrel32 @"?bar@@YAHXZ", csr_win64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $eax, debug-location !22
     renamable $esi = nsw ADD32rr killed renamable $esi, killed renamable $eax, implicit-def dead $eflags, debug-instr-number 4, debug-location !22
-    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(4, 0), debug-location !16
+    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(4, 0), debug-location !16
     JMP_1 %bb.3
 
   bb.4 (%ir-block.16):
@@ -227,7 +227,7 @@ body:             |
     liveins: $esi
 
     CALL64pcrel32 @"?bar@@YAHXZ", csr_win64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $eax, debug-location !31
-    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(5, 0), debug-location !16
+    DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(5, 0), debug-location !16
 
   bb.6 (%ir-block.22):
     liveins: $esi
diff --git a/llvm/test/DebugInfo/Mips/eh_frame.ll b/llvm/test/DebugInfo/Mips/eh_frame.ll
index 60d4dc76777e..d53bc156ef29 100644
--- a/llvm/test/DebugInfo/Mips/eh_frame.ll
+++ b/llvm/test/DebugInfo/Mips/eh_frame.ll
@@ -17,9 +17,9 @@
 ; STATIC-DAG: R_MIPS_32 00000000 .gcc_except_table
 
 ; PIC-LABEL: Relocation section '.rel.eh_frame'
-; PIC-DAG: R_MIPS_32   00000000 DW.ref.__gxx_personality_v0
-; PIC-DAG: R_MIPS_PC32
-; PIC-DAG: R_MIPS_32   00000000 .gcc_except_table
+; PIC-DAG: R_MIPS_PC32   00000000 DW.ref.__gxx_personality_v0
+; PIC-DAG: R_MIPS_PC32   00000000 .L0
+; PIC-DAG: R_MIPS_PC32   00000000 .L0
 
 ; CHECK-READELF: DW.ref.__gxx_personality_v0
 ; CHECK-READELF-STATIC-NEXT: R_MIPS_32 00000000 .text
diff --git a/llvm/test/DebugInfo/NVPTX/packed_bitfields.ll b/llvm/test/DebugInfo/NVPTX/packed_bitfields.ll
index e2097d7f49b4..62ffa0a4001f 100644
--- a/llvm/test/DebugInfo/NVPTX/packed_bitfields.ll
+++ b/llvm/test/DebugInfo/NVPTX/packed_bitfields.ll
@@ -16,7 +16,7 @@
 ; CHECK-NEXT: .b8 1    // DW_AT_byte_size
 ; CHECK-NEXT: .b8 6    // DW_AT_bit_size
 ; Negative offset must be encoded as an unsigned integer.
-; CHECK-NEXT: .b64 0xffffffffffffffff // DW_AT_bit_offset
+; CHECK-NEXT: .b8 127  // DW_AT_bit_offset
 ; CHECK-NEXT: .b8 2    // DW_AT_data_member_location
 
 %struct.anon = type { i16 }
diff --git a/llvm/test/DebugInfo/PDB/Inputs/longname-truncation.yaml b/llvm/test/DebugInfo/PDB/Inputs/longname-truncation.yaml
index 3d6639edc581..f20aca8a1a31 100644
--- a/llvm/test/DebugInfo/PDB/Inputs/longname-truncation.yaml
+++ b/llvm/test/DebugInfo/PDB/Inputs/longname-truncation.yaml
@@ -1,26 +1,26 @@
----
-TpiStream:       
-  Version:         VC80
-  Records:         
-    - Kind:            LF_STRUCTURE
-      Class:           
-        MemberCount:     0
-        Options:         [ None, HasUniqueName ]
-        FieldList:       0
-        Name:            'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
-        UniqueName:      'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'
-        DerivationList:  0
-        VTableShape:     0
-        Size:            1
-
-    - Kind:            LF_STRUCTURE
-      Class:           
-        MemberCount:     0
-        Options:         [ None ]
-        FieldList:       0
-        Name:            'fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff'
-        UniqueName:      ''
-        DerivationList:  0
-        VTableShape:     0
-        Size:            8
-...
+---
+TpiStream:
+  Version:         VC80
+  Records:
+    - Kind:            LF_STRUCTURE
+      Class:
+        MemberCount:     0
+        Options:         [ None, HasUniqueName ]
+        FieldList:       0
+        Name:            'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
+        UniqueName:      'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'
+        DerivationList:  0
+        VTableShape:     0
+        Size:            1
+
+    - Kind:            LF_STRUCTURE
+      Class:
+        MemberCount:     0
+        Options:         [ None ]
+        FieldList:       0
+        Name:            'fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff'
+        UniqueName:      ''
+        DerivationList:  0
+        VTableShape:     0
+        Size:            8
+...
diff --git a/llvm/test/DebugInfo/PDB/Inputs/merge-types-1.yaml b/llvm/test/DebugInfo/PDB/Inputs/merge-types-1.yaml
index 89d471e3343d..06f3feb31318 100644
--- a/llvm/test/DebugInfo/PDB/Inputs/merge-types-1.yaml
+++ b/llvm/test/DebugInfo/PDB/Inputs/merge-types-1.yaml
@@ -1,52 +1,52 @@
----
-TpiStream:
-  Records:
-    # uint32_t* [Index: 0x1000]
-    - Kind:            LF_POINTER
-      Pointer:         
-        ReferentType:    117
-        Attrs:           32778
-    # int64_t* [Index: 0x1001]
-    - Kind:            LF_POINTER
-      Pointer:         
-        ReferentType:    118
-        Attrs:           32778
-    # struct OnlyInMerge1 [Index: 0x1002]
-    - Kind:            LF_STRUCTURE
-      Class:           
-        MemberCount:     0
-        Options:         [ None, ForwardReference, HasUniqueName ]
-        FieldList:       0
-        Name:            'OnlyInMerge1'
-        UniqueName:      'OnlyInMerge1'
-        DerivationList:  0
-        VTableShape:     0
-        Size:            0
-    # uint32_t** [Index: 0x1003]
-    - Kind:            LF_POINTER
-      Pointer:         
-        ReferentType:    4096
-        Attrs:           32778
-    # uint32_t*** [Index: 0x1004]
-    - Kind:            LF_POINTER
-      Pointer:         
-        ReferentType:    4099
-        Attrs:           32778
-    # int64_t* [Index: 0x1005]
-    - Kind:            LF_POINTER
-      Pointer:         
-        ReferentType:    4097
-        Attrs:           32778
-    # [uint32_t, uint32_t*, uint32_t**] [Index: 0x1006]
-    - Kind:            LF_ARGLIST
-      ArgList:         
-        ArgIndices:      [ 117, 4096, 4099 ]
-    # uint32_t (uint32_t, uint32_t*, uint32_t**) [Index: 0x1007]
-    - Kind:            LF_PROCEDURE
-      Procedure:       
-        ReturnType:      117
-        CallConv:        NearC
-        Options:         [ None ]
-        ParameterCount:  0
-        ArgumentList:    4102
-...
+---
+TpiStream:
+  Records:
+    # uint32_t* [Index: 0x1000]
+    - Kind:            LF_POINTER
+      Pointer:
+        ReferentType:    117
+        Attrs:           32778
+    # int64_t* [Index: 0x1001]
+    - Kind:            LF_POINTER
+      Pointer:
+        ReferentType:    118
+        Attrs:           32778
+    # struct OnlyInMerge1 [Index: 0x1002]
+    - Kind:            LF_STRUCTURE
+      Class:
+        MemberCount:     0
+        Options:         [ None, ForwardReference, HasUniqueName ]
+        FieldList:       0
+        Name:            'OnlyInMerge1'
+        UniqueName:      'OnlyInMerge1'
+        DerivationList:  0
+        VTableShape:     0
+        Size:            0
+    # uint32_t** [Index: 0x1003]
+    - Kind:            LF_POINTER
+      Pointer:
+        ReferentType:    4096
+        Attrs:           32778
+    # uint32_t*** [Index: 0x1004]
+    - Kind:            LF_POINTER
+      Pointer:
+        ReferentType:    4099
+        Attrs:           32778
+    # int64_t* [Index: 0x1005]
+    - Kind:            LF_POINTER
+      Pointer:
+        ReferentType:    4097
+        Attrs:           32778
+    # [uint32_t, uint32_t*, uint32_t**] [Index: 0x1006]
+    - Kind:            LF_ARGLIST
+      ArgList:
+        ArgIndices:      [ 117, 4096, 4099 ]
+    # uint32_t (uint32_t, uint32_t*, uint32_t**) [Index: 0x1007]
+    - Kind:            LF_PROCEDURE
+      Procedure:
+        ReturnType:      117
+        CallConv:        NearC
+        Options:         [ None ]
+        ParameterCount:  0
+        ArgumentList:    4102
+...
diff --git a/llvm/test/DebugInfo/PDB/Inputs/merge-types-2.yaml b/llvm/test/DebugInfo/PDB/Inputs/merge-types-2.yaml
index b6cbdb98f0ca..957d2286e7a9 100644
--- a/llvm/test/DebugInfo/PDB/Inputs/merge-types-2.yaml
+++ b/llvm/test/DebugInfo/PDB/Inputs/merge-types-2.yaml
@@ -1,52 +1,52 @@
----
-TpiStream:
-  Records:         
-    # uint32_t* [Index: 0x1000]
-    - Kind:            LF_POINTER
-      Pointer:         
-        ReferentType:    117     
-        Attrs:           32778
-    # uint32_t** [Index: 0x1001]
-    - Kind:            LF_POINTER
-      Pointer:         
-        ReferentType:    4096    
-        Attrs:           32778
-    # uint32_t*** [Index: 0x1002]
-    - Kind:            LF_POINTER
-      Pointer:         
-        ReferentType:    4097    
-        Attrs:           32778
-    # [uint32_t, uint32_t*, uint32_t**] [Index: 0x1003]
-    - Kind:            LF_ARGLIST
-      ArgList:         
-        ArgIndices:      [ 117, 4096, 4097 ]
-    # uint32_t (uint32_t, uint32_t*, uint32_t**) [Index: 0x1004]
-    - Kind:            LF_PROCEDURE
-      Procedure:       
-        ReturnType:      117
-        CallConv:        NearC
-        Options:         [ None ]
-        ParameterCount:  0
-        ArgumentList:    4099
-    # int64_t* [Index: 0x1005]
-    - Kind:            LF_POINTER
-      Pointer:         
-        ReferentType:    118     
-        Attrs:           32778
-    # int64_t** [Index: 0x1006]
-    - Kind:            LF_POINTER
-      Pointer:         
-        ReferentType:    4101
-        Attrs:           32778
-    # struct OnlyInMerge2 [Index: 0x1007]
-    - Kind:            LF_STRUCTURE
-      Class:           
-        MemberCount:     0
-        Options:         [ None, ForwardReference, HasUniqueName ]
-        FieldList:       0
-        Name:            'OnlyInMerge2'
-        UniqueName:      'OnlyInMerge2'
-        DerivationList:  0
-        VTableShape:     0
-        Size:            0
-...
+---
+TpiStream:
+  Records:
+    # uint32_t* [Index: 0x1000]
+    - Kind:            LF_POINTER
+      Pointer:
+        ReferentType:    117
+        Attrs:           32778
+    # uint32_t** [Index: 0x1001]
+    - Kind:            LF_POINTER
+      Pointer:
+        ReferentType:    4096
+        Attrs:           32778
+    # uint32_t*** [Index: 0x1002]
+    - Kind:            LF_POINTER
+      Pointer:
+        ReferentType:    4097
+        Attrs:           32778
+    # [uint32_t, uint32_t*, uint32_t**] [Index: 0x1003]
+    - Kind:            LF_ARGLIST
+      ArgList:
+        ArgIndices:      [ 117, 4096, 4097 ]
+    # uint32_t (uint32_t, uint32_t*, uint32_t**) [Index: 0x1004]
+    - Kind:            LF_PROCEDURE
+      Procedure:
+        ReturnType:      117
+        CallConv:        NearC
+        Options:         [ None ]
+        ParameterCount:  0
+        ArgumentList:    4099
+    # int64_t* [Index: 0x1005]
+    - Kind:            LF_POINTER
+      Pointer:
+        ReferentType:    118
+        Attrs:           32778
+    # int64_t** [Index: 0x1006]
+    - Kind:            LF_POINTER
+      Pointer:
+        ReferentType:    4101
+        Attrs:           32778
+    # struct OnlyInMerge2 [Index: 0x1007]
+    - Kind:            LF_STRUCTURE
+      Class:
+        MemberCount:     0
+        Options:         [ None, ForwardReference, HasUniqueName ]
+        FieldList:       0
+        Name:            'OnlyInMerge2'
+        UniqueName:      'OnlyInMerge2'
+        DerivationList:  0
+        VTableShape:     0
+        Size:            0
+...
diff --git a/llvm/test/DebugInfo/PDB/Inputs/one-symbol.yaml b/llvm/test/DebugInfo/PDB/Inputs/one-symbol.yaml
index 5728f05d490c..4807ccc83643 100644
--- a/llvm/test/DebugInfo/PDB/Inputs/one-symbol.yaml
+++ b/llvm/test/DebugInfo/PDB/Inputs/one-symbol.yaml
@@ -1,11 +1,11 @@
----
-DbiStream:
-  Modules:
-    - Module:          one-symbol.yaml
-      Modi:
-        Records:
-          - Kind:            S_OBJNAME
-            ObjNameSym:
-              Signature:       0
-              ObjectName:      'c:\foo\one-symbol.yaml'
-...
+---
+DbiStream:
+  Modules:
+    - Module:          one-symbol.yaml
+      Modi:
+        Records:
+          - Kind:            S_OBJNAME
+            ObjNameSym:
+              Signature:       0
+              ObjectName:      'c:\foo\one-symbol.yaml'
+...
diff --git a/llvm/test/DebugInfo/PDB/pdb-longname-truncation.test b/llvm/test/DebugInfo/PDB/pdb-longname-truncation.test
index 06eae8ea226d..3d08a2b2dc0d 100644
--- a/llvm/test/DebugInfo/PDB/pdb-longname-truncation.test
+++ b/llvm/test/DebugInfo/PDB/pdb-longname-truncation.test
@@ -1,3 +1,3 @@
-; For now just verify that this doesn't cause an error.  Later we pdbdump can
-; do type lookup, we can verify that the name matches what we expect.
-; RUN: llvm-pdbutil yaml2pdb -pdb=%t.pdb %p/Inputs/longname-truncation.yaml
+; For now just verify that this doesn't cause an error.  Later we pdbdump can
+; do type lookup, we can verify that the name matches what we expect.
+; RUN: llvm-pdbutil yaml2pdb -pdb=%t.pdb %p/Inputs/longname-truncation.yaml
diff --git a/llvm/test/DebugInfo/PDB/pdbdump-raw-bytes.test b/llvm/test/DebugInfo/PDB/pdbdump-raw-bytes.test
index 1087dfb65953..2cf6850221d5 100644
--- a/llvm/test/DebugInfo/PDB/pdbdump-raw-bytes.test
+++ b/llvm/test/DebugInfo/PDB/pdbdump-raw-bytes.test
@@ -1,25 +1,25 @@
-; RUN: llvm-pdbutil bytes -byte-range=20-60 %p/Inputs/empty.pdb | FileCheck --check-prefix=VALID %s
-; RUN: not llvm-pdbutil bytes -byte-range=100-20 %p/Inputs/empty.pdb 2>&1 | FileCheck --check-prefix=INVALID %s
-; RUN: not llvm-pdbutil bytes -byte-range=100000-200000 %p/Inputs/empty.pdb 2>&1 | FileCheck --check-prefix=INVALID-RANGE %s
-
-; RUN: llvm-pdbutil bytes -name-map %p/Inputs/empty.pdb | FileCheck --check-prefix=NAME-MAP %s
-
-
-VALID:                               MSF Bytes
-VALID-NEXT: ============================================================
-VALID-NEXT:  Bytes (
-VALID-NEXT:    0014: 372E3030 0D0A1A44 53000000 00100000 02000000 19000000 88000000 00000000  |7.00...DS.......................|
-VALID-NEXT:    0034: 18000000 00000000 00                                                     |.........|
-VALID-NEXT:  )
-
-INVALID: llvm-pdbutil: Invalid byte range specified.  Max < Min
-
-INVALID-RANGE: llvm-pdbutil: Invalid byte range specified.  Requested byte larger than file size
-
-NAME-MAP:                            Named Stream Map
-NAME-MAP-NEXT: ============================================================
-NAME-MAP-NEXT:   Named Stream Map (
-NAME-MAP-NEXT:     1301C: 22000000 2F4C696E 6B496E66 6F002F6E 616D6573 002F7372 632F6865 61646572  |".../LinkInfo./names./src/header|
-NAME-MAP-NEXT:     1303C: 626C6F63 6B000300 00000600 00000100 00001A00 00000000 00001100 00000900  |block...........................|
-NAME-MAP-NEXT:     1305C: 00000A00 00000D00 00000000 00000500 0000                                 |..................|
-NAME-MAP-NEXT:   )
+; RUN: llvm-pdbutil bytes -byte-range=20-60 %p/Inputs/empty.pdb | FileCheck --check-prefix=VALID %s
+; RUN: not llvm-pdbutil bytes -byte-range=100-20 %p/Inputs/empty.pdb 2>&1 | FileCheck --check-prefix=INVALID %s
+; RUN: not llvm-pdbutil bytes -byte-range=100000-200000 %p/Inputs/empty.pdb 2>&1 | FileCheck --check-prefix=INVALID-RANGE %s
+
+; RUN: llvm-pdbutil bytes -name-map %p/Inputs/empty.pdb | FileCheck --check-prefix=NAME-MAP %s
+
+
+VALID:                               MSF Bytes
+VALID-NEXT: ============================================================
+VALID-NEXT:  Bytes (
+VALID-NEXT:    0014: 372E3030 0D0A1A44 53000000 00100000 02000000 19000000 88000000 00000000  |7.00...DS.......................|
+VALID-NEXT:    0034: 18000000 00000000 00                                                     |.........|
+VALID-NEXT:  )
+
+INVALID: llvm-pdbutil: Invalid byte range specified.  Max < Min
+
+INVALID-RANGE: llvm-pdbutil: Invalid byte range specified.  Requested byte larger than file size
+
+NAME-MAP:                            Named Stream Map
+NAME-MAP-NEXT: ============================================================
+NAME-MAP-NEXT:   Named Stream Map (
+NAME-MAP-NEXT:     1301C: 22000000 2F4C696E 6B496E66 6F002F6E 616D6573 002F7372 632F6865 61646572  |".../LinkInfo./names./src/header|
+NAME-MAP-NEXT:     1303C: 626C6F63 6B000300 00000600 00000100 00001A00 00000000 00001100 00000900  |block...........................|
+NAME-MAP-NEXT:     1305C: 00000A00 00000D00 00000000 00000500 0000                                 |..................|
+NAME-MAP-NEXT:   )
diff --git a/llvm/test/DebugInfo/X86/dbg-val-list-dangling.ll b/llvm/test/DebugInfo/X86/dbg-val-list-dangling.ll
index d902b1325227..8522b40992c8 100644
--- a/llvm/test/DebugInfo/X86/dbg-val-list-dangling.ll
+++ b/llvm/test/DebugInfo/X86/dbg-val-list-dangling.ll
@@ -1,69 +1,69 @@
-;; At the moment we emit an undef as soon as we encounter "dangling" variadic
-;; dbg_value nodes. This does not reduce correctness but does reduce coverage.
-;; We should make variadic dbg_values work in the same way as their
-;; non-variadic counterparts here.
-;; FIXME: When dangling nodes for a variadic dbg_value are found, we should be
-;; able to recover the value in some cases.
-
-; RUN: llc %s -start-after=codegenprepare -stop-before=finalize-isel -o - -experimental-debug-variable-locations=false | FileCheck %s
-; RUN: llc %s -start-after=codegenprepare -stop-before=finalize-isel -o - -experimental-debug-variable-locations=false --try-experimental-debuginfo-iterators | FileCheck %s
-
-;; Check that dangling debug info in the SelectionDAG build phase is handled
-;; in the same way for variadic dbg_value ndoes as non-variadics.
-
-;; Generated from the following source with -g -O2. Second dbg.value modified
-;; to be variadic by hand:
-;; void a(char*);
-;; void b() {
-;;   char *c = "abc";
-;;   char *d = "abc";
-;;   a("abc");
-;; }
-
-; CHECK: ![[C:[0-9]+]] = !DILocalVariable(name: "c",
-; CHECK: ![[D:[0-9]+]] = !DILocalVariable(name: "d",
-
-; CHECK-DAG: DBG_VALUE %[[VREG:[0-9]]], $noreg, ![[C]], !DIExpression(), debug-location
-; CHECK-DAG: DBG_VALUE_LIST ![[D]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value), $noreg, debug-location
-
-target triple = "x86_64-unknown-linux-gnu"
-
-@.str = private unnamed_addr constant [4 x i8] c"abc\00", align 1
-
-define dso_local void @b() local_unnamed_addr !dbg !7 {
-entry:
-  call void @llvm.dbg.value(metadata ptr @.str, metadata !11, metadata !DIExpression()), !dbg !15
-  call void @llvm.dbg.value(metadata !DIArgList(ptr @.str), metadata !14, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value)), !dbg !15
-  tail call void @a(ptr @.str) #3, !dbg !16
-  ret void, !dbg !17
-}
-
-declare !dbg !18 dso_local void @a(ptr) local_unnamed_addr
-declare void @llvm.dbg.value(metadata, metadata, metadata)
-
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5}
-!llvm.ident = !{!6}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
-!1 = !DIFile(filename: "test.c", directory: "/")
-!2 = !{}
-!3 = !{i32 7, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 4}
-!6 = !{!"clang version 11.0.0"}
-!7 = distinct !DISubprogram(name: "b", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !10)
-!8 = !DISubroutineType(types: !9)
-!9 = !{null}
-!10 = !{!11, !14}
-!11 = !DILocalVariable(name: "c", scope: !7, file: !1, line: 3, type: !12)
-!12 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !13, size: 64)
-!13 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
-!14 = !DILocalVariable(name: "d", scope: !7, file: !1, line: 4, type: !12)
-!15 = !DILocation(line: 0, scope: !7)
-!16 = !DILocation(line: 5, column: 3, scope: !7)
-!17 = !DILocation(line: 6, column: 1, scope: !7)
-!18 = !DISubprogram(name: "a", scope: !1, file: !1, line: 1, type: !19, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2)
-!19 = !DISubroutineType(types: !20)
-!20 = !{null, !12}
+;; At the moment we emit an undef as soon as we encounter "dangling" variadic
+;; dbg_value nodes. This does not reduce correctness but does reduce coverage.
+;; We should make variadic dbg_values work in the same way as their
+;; non-variadic counterparts here.
+;; FIXME: When dangling nodes for a variadic dbg_value are found, we should be
+;; able to recover the value in some cases.
+
+; RUN: llc %s -start-after=codegenprepare -stop-before=finalize-isel -o - -experimental-debug-variable-locations=false | FileCheck %s
+; RUN: llc %s -start-after=codegenprepare -stop-before=finalize-isel -o - -experimental-debug-variable-locations=false --try-experimental-debuginfo-iterators | FileCheck %s
+
+;; Check that dangling debug info in the SelectionDAG build phase is handled
+;; in the same way for variadic dbg_value ndoes as non-variadics.
+
+;; Generated from the following source with -g -O2. Second dbg.value modified
+;; to be variadic by hand:
+;; void a(char*);
+;; void b() {
+;;   char *c = "abc";
+;;   char *d = "abc";
+;;   a("abc");
+;; }
+
+; CHECK: ![[C:[0-9]+]] = !DILocalVariable(name: "c",
+; CHECK: ![[D:[0-9]+]] = !DILocalVariable(name: "d",
+
+; CHECK-DAG: DBG_VALUE %[[VREG:[0-9]]], $noreg, ![[C]], !DIExpression(), debug-location
+; CHECK-DAG: DBG_VALUE_LIST ![[D]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value), $noreg, debug-location
+
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [4 x i8] c"abc\00", align 1
+
+define dso_local void @b() local_unnamed_addr !dbg !7 {
+entry:
+  call void @llvm.dbg.value(metadata ptr @.str, metadata !11, metadata !DIExpression()), !dbg !15
+  call void @llvm.dbg.value(metadata !DIArgList(ptr @.str), metadata !14, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value)), !dbg !15
+  tail call void @a(ptr @.str) #3, !dbg !16
+  ret void, !dbg !17
+}
+
+declare !dbg !18 dso_local void @a(ptr) local_unnamed_addr
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/")
+!2 = !{}
+!3 = !{i32 7, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 11.0.0"}
+!7 = distinct !DISubprogram(name: "b", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !10)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null}
+!10 = !{!11, !14}
+!11 = !DILocalVariable(name: "c", scope: !7, file: !1, line: 3, type: !12)
+!12 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !13, size: 64)
+!13 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!14 = !DILocalVariable(name: "d", scope: !7, file: !1, line: 4, type: !12)
+!15 = !DILocation(line: 0, scope: !7)
+!16 = !DILocation(line: 5, column: 3, scope: !7)
+!17 = !DILocation(line: 6, column: 1, scope: !7)
+!18 = !DISubprogram(name: "a", scope: !1, file: !1, line: 1, type: !19, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2)
+!19 = !DISubroutineType(types: !20)
+!20 = !{null, !12}
diff --git a/llvm/test/DebugInfo/X86/dbg-value-arg-movement.ll b/llvm/test/DebugInfo/X86/dbg-value-arg-movement.ll
index 4c47cb046deb..7d8e9c9c819c 100644
--- a/llvm/test/DebugInfo/X86/dbg-value-arg-movement.ll
+++ b/llvm/test/DebugInfo/X86/dbg-value-arg-movement.ll
@@ -64,7 +64,7 @@
 ; INSTRREF: DBG_PHI $edi, 1
 ; INSTRREF: DBG_VALUE $edi, $noreg, [[BAZVAR]]
 ; INSTRREF-LABEL: bb.1.next
-; INSTRREF: DBG_INSTR_REF [[XYZVAR]], {{.+}}, dbg-instr-ref(1, 0)
+; INSTRREF: DBG_INSTR_REF [[XYZVAR]], {{.+}}, dbg-instr-ref(1, 0)
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/DebugInfo/X86/dbg-value-funcarg.ll b/llvm/test/DebugInfo/X86/dbg-value-funcarg.ll
index fed8a6e40992..29ec7179aa38 100644
--- a/llvm/test/DebugInfo/X86/dbg-value-funcarg.ll
+++ b/llvm/test/DebugInfo/X86/dbg-value-funcarg.ll
@@ -61,7 +61,7 @@ define dso_local void @foo_local(i32 %t1a) local_unnamed_addr #0 !dbg !7 {
 ; INSTRREF-NEXT: DBG_VALUE 123, $noreg, ![[LOCAL]], !DIExpression(),
 ; INSTRREF:      CALL64pcrel32 @bar,
 ; INSTRREF-NEXT: ADJCALLSTACKUP64
-; INSTRREF:      DBG_INSTR_REF ![[LOCAL]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0),
+; INSTRREF:      DBG_INSTR_REF ![[LOCAL]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0),
 ; INSTRREF-NOT: DBG_
 ; INSTRREF:    TCRETURNdi64 @bar,
 
@@ -96,7 +96,7 @@ define dso_local void @foo_other_param(i32 %t2a, i32 %t2b) local_unnamed_addr #0
 ; INSTRREF: CALL64pcrel32 @bar,
 ; INSTRREF: DBG_VALUE 123, $noreg, ![[T2B]], !DIExpression(),
 ; INSTRREF: CALL64pcrel32 @bar,
-; INSTRREF: DBG_INSTR_REF ![[T2B]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0),
+; INSTRREF: DBG_INSTR_REF ![[T2B]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0),
 ; INSTRREF: TCRETURNdi64 @bar,
 
 entry:
@@ -125,10 +125,10 @@ define dso_local void @foo_same_param(i32 %t3a) local_unnamed_addr #0 !dbg !31 {
 ; INSTRREF: DBG_PHI $edi, 1
 ; INSTRREF: DBG_VALUE $edi, $noreg, ![[T3A]], !DIExpression(),
 ; INSTRREF: CALL64pcrel32 @bar,
-; INSTRREF: DBG_INSTR_REF ![[TMP]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0),
+; INSTRREF: DBG_INSTR_REF ![[TMP]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0),
 ; INSTRREF: DBG_VALUE 123, $noreg, ![[T3A]], !DIExpression(),
 ; INSTRREF: CALL64pcrel32 @bar,
-; INSTRREF: DBG_INSTR_REF ![[T3A]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0),
+; INSTRREF: DBG_INSTR_REF ![[T3A]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0),
 ; INSTRREF: TCRETURNdi64 @bar,
 entry:
   call void @llvm.dbg.value(metadata i32 %t3a, metadata !33, metadata !DIExpression()), !dbg !35
diff --git a/llvm/test/DebugInfo/X86/dbg-value-funcarg2.ll b/llvm/test/DebugInfo/X86/dbg-value-funcarg2.ll
index 3d383bf9dfb3..d96657ca302e 100644
--- a/llvm/test/DebugInfo/X86/dbg-value-funcarg2.ll
+++ b/llvm/test/DebugInfo/X86/dbg-value-funcarg2.ll
@@ -60,8 +60,8 @@ define dso_local i32 @f(i64 %s1.coerce0, i64 %s1.coerce1, i64 %s2.coerce0, i64 %
 ;; of the earlier DBG_PHIs.
 ; INSTRREF:     ADJCALLSTACKUP
 ; INSTRREF-NOT: DBG_
-; INSTRREF-DAG: DBG_INSTR_REF ![[S1]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 0, 64), dbg-instr-ref(1, 0)
-; INSTRREF-DAG: DBG_INSTR_REF ![[S1]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), dbg-instr-ref(2, 0)
+; INSTRREF-DAG: DBG_INSTR_REF ![[S1]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 0, 64), dbg-instr-ref(1, 0)
+; INSTRREF-DAG: DBG_INSTR_REF ![[S1]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 64), dbg-instr-ref(2, 0)
 
 ; And then no more DBG_ instructions before the add.
 ; COMMON-NOT: DBG_
diff --git a/llvm/test/DebugInfo/X86/dbg-value-funcarg4.ll b/llvm/test/DebugInfo/X86/dbg-value-funcarg4.ll
index b63269a51d09..8a49256e9d67 100644
--- a/llvm/test/DebugInfo/X86/dbg-value-funcarg4.ll
+++ b/llvm/test/DebugInfo/X86/dbg-value-funcarg4.ll
@@ -9,8 +9,8 @@
 
 ; CHECK: DBG_PHI $edi, 1
 
-; CHECK: DBG_INSTR_REF ![[LOCAL]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
-; CHECK: DBG_INSTR_REF ![[LOCAL2]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
+; CHECK: DBG_INSTR_REF ![[LOCAL]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
+; CHECK: DBG_INSTR_REF ![[LOCAL2]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
 
 declare void @bar(i32)
 declare void @llvm.dbg.value(metadata, metadata, metadata)
diff --git a/llvm/test/DebugInfo/X86/dbg-value-list-dag-combine.ll b/llvm/test/DebugInfo/X86/dbg-value-list-dag-combine.ll
index a09372b42131..53a79ecc2e49 100644
--- a/llvm/test/DebugInfo/X86/dbg-value-list-dag-combine.ll
+++ b/llvm/test/DebugInfo/X86/dbg-value-list-dag-combine.ll
@@ -1,61 +1,61 @@
-; RUN: llc < %s -start-after=codegenprepare -stop-before=finalize-isel | FileCheck %s --implicit-check-not=DBG_VALUE
-
-;; Test for PR 9817 adapted for variadic dbg.values (those using !DIArgList) by
-;; hand.  The debug nodes for idx and gid are transferred to new nodes via
-;; TransferDbgValue. There should be a DEBUG_VALUE_LIST for each call to
-;; llvm.dbg.value.
-
-; CHECK-DAG: ![[ip:[0-9]+]]  = !DILocalVariable(name: "ip",
-; CHECK-DAG: ![[gid:[0-9]+]] = !DILocalVariable(name: "gid",
-; CHECK-DAG: ![[xxx:[0-9]+]] = !DILocalVariable(name: "xxx",
-; CHECK-DAG: ![[idx:[0-9]+]] = !DILocalVariable(name: "idx",
-
-; CHECK: DBG_VALUE_LIST ![[ip]],  !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value),
-; CHECK: DBG_VALUE_LIST ![[gid]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value),
-;; Constant value dbg.value should keep its value.
-; CHECK: DBG_VALUE_LIST ![[xxx]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value), 0
-; CHECK: DBG_VALUE_LIST ![[idx]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value),
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32"
-target triple = "i686-apple-darwin"
-
-declare <4 x i32> @__amdil_get_global_id_int()
-declare void @llvm.dbg.value(metadata, metadata, metadata)
-define void @__OpenCL_test_kernel(ptr addrspace(1) %ip) nounwind !dbg !0 {
-entry:
-  call void @llvm.dbg.value(metadata !DIArgList(ptr addrspace(1) %ip), metadata !7, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value)), !dbg !8
-  %0 = call <4 x i32> @__amdil_get_global_id_int() nounwind
-  %1 = extractelement <4 x i32> %0, i32 0
-  call void @llvm.dbg.value(metadata !DIArgList(i32 %1), metadata !9, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value)), !dbg !11
-  call void @llvm.dbg.value(metadata !DIArgList(i32 0), metadata !21, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value)), !dbg !14
-  %tmp2 = load i32, ptr addrspace(1) %ip, align 4, !dbg !15
-  %tmp3 = add i32 0, %tmp2, !dbg !15
-  call void @llvm.dbg.value(metadata !DIArgList(i32 %tmp3), metadata !13, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value)), !dbg !15
-  %arrayidx = getelementptr i32, ptr addrspace(1) %ip, i32 %1, !dbg !16
-  store i32 %tmp3, ptr addrspace(1) %arrayidx, align 4, !dbg !16
-  ret void, !dbg !17
-}
-!llvm.dbg.cu = !{!2}
-!llvm.module.flags = !{!20}
-
-!0 = distinct !DISubprogram(name: "__OpenCL_test_kernel", linkageName: "__OpenCL_test_kernel", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2, file: !19, scope: !1, type: !3)
-!1 = !DIFile(filename: "OCL6368.tmp.cl", directory: "E:CUsersCmvillmow.AMDCAppDataCLocalCTemp")
-!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "clc", isOptimized: false, emissionKind: FullDebug, file: !19, enums: !12, retainedTypes: !12, imports:  null)
-!3 = !DISubroutineType(types: !4)
-!4 = !{null, !5}
-!5 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, scope: !2, baseType: !6)
-!6 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned)
-!7 = !DILocalVariable(name: "ip", line: 1, arg: 1, scope: !0, file: !1, type: !5)
-!8 = !DILocation(line: 1, column: 42, scope: !0)
-!9 = !DILocalVariable(name: "gid", line: 3, scope: !10, file: !1, type: !6)
-!10 = distinct !DILexicalBlock(line: 2, column: 1, file: !19, scope: !0)
-!11 = !DILocation(line: 3, column: 41, scope: !10)
-!12 = !{}
-!13 = !DILocalVariable(name: "idx", line: 4, scope: !10, file: !1, type: !6)
-!14 = !DILocation(line: 4, column: 20, scope: !10)
-!15 = !DILocation(line: 5, column: 15, scope: !10)
-!16 = !DILocation(line: 6, column: 18, scope: !10)
-!17 = !DILocation(line: 7, column: 1, scope: !0)
-!19 = !DIFile(filename: "OCL6368.tmp.cl", directory: "E:\5CUsers\5Cmvillmow.AMD\5CAppData\5CLocal\5CTemp")
-!20 = !{i32 1, !"Debug Info Version", i32 3}
-!21 = !DILocalVariable(name: "xxx", line: 4, scope: !10, file: !1, type: !6)
+; RUN: llc < %s -start-after=codegenprepare -stop-before=finalize-isel | FileCheck %s --implicit-check-not=DBG_VALUE
+
+;; Test for PR 9817 adapted for variadic dbg.values (those using !DIArgList) by
+;; hand.  The debug nodes for idx and gid are transferred to new nodes via
+;; TransferDbgValue. There should be a DEBUG_VALUE_LIST for each call to
+;; llvm.dbg.value.
+
+; CHECK-DAG: ![[ip:[0-9]+]]  = !DILocalVariable(name: "ip",
+; CHECK-DAG: ![[gid:[0-9]+]] = !DILocalVariable(name: "gid",
+; CHECK-DAG: ![[xxx:[0-9]+]] = !DILocalVariable(name: "xxx",
+; CHECK-DAG: ![[idx:[0-9]+]] = !DILocalVariable(name: "idx",
+
+; CHECK: DBG_VALUE_LIST ![[ip]],  !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value),
+; CHECK: DBG_VALUE_LIST ![[gid]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value),
+;; Constant value dbg.value should keep its value.
+; CHECK: DBG_VALUE_LIST ![[xxx]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value), 0
+; CHECK: DBG_VALUE_LIST ![[idx]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value),
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32"
+target triple = "i686-apple-darwin"
+
+declare <4 x i32> @__amdil_get_global_id_int()
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+define void @__OpenCL_test_kernel(ptr addrspace(1) %ip) nounwind !dbg !0 {
+entry:
+  call void @llvm.dbg.value(metadata !DIArgList(ptr addrspace(1) %ip), metadata !7, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value)), !dbg !8
+  %0 = call <4 x i32> @__amdil_get_global_id_int() nounwind
+  %1 = extractelement <4 x i32> %0, i32 0
+  call void @llvm.dbg.value(metadata !DIArgList(i32 %1), metadata !9, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value)), !dbg !11
+  call void @llvm.dbg.value(metadata !DIArgList(i32 0), metadata !21, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value)), !dbg !14
+  %tmp2 = load i32, ptr addrspace(1) %ip, align 4, !dbg !15
+  %tmp3 = add i32 0, %tmp2, !dbg !15
+  call void @llvm.dbg.value(metadata !DIArgList(i32 %tmp3), metadata !13, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value)), !dbg !15
+  %arrayidx = getelementptr i32, ptr addrspace(1) %ip, i32 %1, !dbg !16
+  store i32 %tmp3, ptr addrspace(1) %arrayidx, align 4, !dbg !16
+  ret void, !dbg !17
+}
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!20}
+
+!0 = distinct !DISubprogram(name: "__OpenCL_test_kernel", linkageName: "__OpenCL_test_kernel", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2, file: !19, scope: !1, type: !3)
+!1 = !DIFile(filename: "OCL6368.tmp.cl", directory: "E:CUsersCmvillmow.AMDCAppDataCLocalCTemp")
+!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "clc", isOptimized: false, emissionKind: FullDebug, file: !19, enums: !12, retainedTypes: !12, imports:  null)
+!3 = !DISubroutineType(types: !4)
+!4 = !{null, !5}
+!5 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, scope: !2, baseType: !6)
+!6 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned)
+!7 = !DILocalVariable(name: "ip", line: 1, arg: 1, scope: !0, file: !1, type: !5)
+!8 = !DILocation(line: 1, column: 42, scope: !0)
+!9 = !DILocalVariable(name: "gid", line: 3, scope: !10, file: !1, type: !6)
+!10 = distinct !DILexicalBlock(line: 2, column: 1, file: !19, scope: !0)
+!11 = !DILocation(line: 3, column: 41, scope: !10)
+!12 = !{}
+!13 = !DILocalVariable(name: "idx", line: 4, scope: !10, file: !1, type: !6)
+!14 = !DILocation(line: 4, column: 20, scope: !10)
+!15 = !DILocation(line: 5, column: 15, scope: !10)
+!16 = !DILocation(line: 6, column: 18, scope: !10)
+!17 = !DILocation(line: 7, column: 1, scope: !0)
+!19 = !DIFile(filename: "OCL6368.tmp.cl", directory: "E:\5CUsers\5Cmvillmow.AMD\5CAppData\5CLocal\5CTemp")
+!20 = !{i32 1, !"Debug Info Version", i32 3}
+!21 = !DILocalVariable(name: "xxx", line: 4, scope: !10, file: !1, type: !6)
diff --git a/llvm/test/DebugInfo/X86/dbg_value_list_clobbers.mir b/llvm/test/DebugInfo/X86/dbg_value_list_clobbers.mir
index 4114efc3f858..8190ead7da02 100644
--- a/llvm/test/DebugInfo/X86/dbg_value_list_clobbers.mir
+++ b/llvm/test/DebugInfo/X86/dbg_value_list_clobbers.mir
@@ -1,84 +1,84 @@
-# RUN: llc %s --start-after=livedebugvalues -filetype=obj -o - \
-# RUN:     | llvm-dwarfdump - -name locala -o - | FileCheck %s
-#
-# Test that clobbers between DBG_VALUE_LIST and DBG_VALUE instructions work as
-# expected. Comments and test directives inline.
-
---- |
-  target triple = "x86_64-unknown-linux-gnu"
-  define dso_local i32 @fun() local_unnamed_addr !dbg !7 {
-  entry:
-    ret i32 0
-  }
-
-  !llvm.dbg.cu = !{!0}
-  !llvm.module.flags = !{!3, !4, !5}
-  !llvm.ident = !{!6}
-
-  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
-  !1 = !DIFile(filename: "example.c", directory: "/")
-  !2 = !{}
-  !3 = !{i32 7, !"Dwarf Version", i32 4}
-  !4 = !{i32 2, !"Debug Info Version", i32 3}
-  !5 = !{i32 1, !"wchar_size", i32 4}
-  !6 = !{!"clang version 11.0.0"}
-  !8 = !DISubroutineType(types: !9)
-  !9 = !{!10}
-  !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-  !11 = !{!12}
-  !22 = !DISubroutineType(types: !23)
-  !23 = !{!10, !10}
-  ; --- Important metadata ---
-  !7 = distinct !DISubprogram(name: "fun", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
-  !15 = !DILocation(line: 1, column: 1, scope: !7)
-  !12 = !DILocalVariable(name: "locala", scope: !7, file: !1, line: 1, type: !10)
-
-...
----
-name:            fun
-body:             |
-  bb.0.entry:
-    ; This test checks that we see expected location ranges for a single variable.
-    ; CHECK: {{.*}} DW_TAG_variable
-    ; CHECK-NEXT: DW_AT_location {{.*}}
-
-    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value), $eax, debug-location !15
-    ; CHECK-NEXT: [{{.*}}): DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value
-
-    $edi = MOV32ri 1
-    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value), $esi, debug-location !15
-    ; CHECK-NEXT: [{{.*}}): DW_OP_breg4 RSI+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value
-
-    $eax = MOV32ri 2
-    DBG_VALUE $eax, $noreg, !12, !DIExpression(), debug-location !15
-    ; CHECK-NEXT: [{{.*}}): DW_OP_reg0 RAX
-
-    $ecx = MOV32ri 3
-    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value), $eax, $ecx, debug-location !15
-    ; CHECK-NEXT: [{{.*}}): DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_breg2 RCX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_plus, DW_OP_stack_value
-
-    ; Check that a reg clobber prevents identical locations merging.
-    $ecx = MOV32ri 4
-    $ecx = MOV32ri 5
-    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value), $eax, $ecx, debug-location !15
-    ; CHECK-NEXT: [{{.*}}): DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_breg2 RCX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_plus, DW_OP_stack_value
-
-    ; Check that fragments are composed correctly.
-    $ecx = MOV32ri 6
-    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 16), $eax, debug-location !15
-    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value, DW_OP_LLVM_fragment, 16, 16), $ecx, debug-location !15
-    ; CHECK-NEXT: [{{.*}}): DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value, DW_OP_piece 0x2, DW_OP_breg2 RCX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value, DW_OP_piece 0x2
-
-    ; Check that fragments clobber preceeding overlap.
-    $edi = MOV32ri 7
-    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value, DW_OP_LLVM_fragment, 16, 16), $edi, debug-location !15
-    ; CHECK-NEXT: [{{.*}}): DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value, DW_OP_piece 0x2, DW_OP_breg5 RDI+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value, DW_OP_piece 0x2
-
-    ; Check that a (non-zero-offset) fragment works.
-    $ecx = MOV32ri 8
-    $ecx = MOV32ri 9
-    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value, DW_OP_LLVM_fragment, 16, 16), $eax, $ecx, debug-location !15
-    ; CHECK-NEXT: [{{.*}}): DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value, DW_OP_piece 0x2, DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_breg2 RCX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_plus, DW_OP_stack_value, DW_OP_piece 0x2
-
-    RET64 debug-location !15
-...
+# RUN: llc %s --start-after=livedebugvalues -filetype=obj -o - \
+# RUN:     | llvm-dwarfdump - -name locala -o - | FileCheck %s
+#
+# Test that clobbers between DBG_VALUE_LIST and DBG_VALUE instructions work as
+# expected. Comments and test directives inline.
+
+--- |
+  target triple = "x86_64-unknown-linux-gnu"
+  define dso_local i32 @fun() local_unnamed_addr !dbg !7 {
+  entry:
+    ret i32 0
+  }
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4, !5}
+  !llvm.ident = !{!6}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+  !1 = !DIFile(filename: "example.c", directory: "/")
+  !2 = !{}
+  !3 = !{i32 7, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = !{i32 1, !"wchar_size", i32 4}
+  !6 = !{!"clang version 11.0.0"}
+  !8 = !DISubroutineType(types: !9)
+  !9 = !{!10}
+  !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !11 = !{!12}
+  !22 = !DISubroutineType(types: !23)
+  !23 = !{!10, !10}
+  ; --- Important metadata ---
+  !7 = distinct !DISubprogram(name: "fun", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
+  !15 = !DILocation(line: 1, column: 1, scope: !7)
+  !12 = !DILocalVariable(name: "locala", scope: !7, file: !1, line: 1, type: !10)
+
+...
+---
+name:            fun
+body:             |
+  bb.0.entry:
+    ; This test checks that we see expected location ranges for a single variable.
+    ; CHECK: {{.*}} DW_TAG_variable
+    ; CHECK-NEXT: DW_AT_location {{.*}}
+
+    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value), $eax, debug-location !15
+    ; CHECK-NEXT: [{{.*}}): DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value
+
+    $edi = MOV32ri 1
+    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value), $esi, debug-location !15
+    ; CHECK-NEXT: [{{.*}}): DW_OP_breg4 RSI+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value
+
+    $eax = MOV32ri 2
+    DBG_VALUE $eax, $noreg, !12, !DIExpression(), debug-location !15
+    ; CHECK-NEXT: [{{.*}}): DW_OP_reg0 RAX
+
+    $ecx = MOV32ri 3
+    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value), $eax, $ecx, debug-location !15
+    ; CHECK-NEXT: [{{.*}}): DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_breg2 RCX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_plus, DW_OP_stack_value
+
+    ; Check that a reg clobber prevents identical locations merging.
+    $ecx = MOV32ri 4
+    $ecx = MOV32ri 5
+    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value), $eax, $ecx, debug-location !15
+    ; CHECK-NEXT: [{{.*}}): DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_breg2 RCX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_plus, DW_OP_stack_value
+
+    ; Check that fragments are composed correctly.
+    $ecx = MOV32ri 6
+    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 16), $eax, debug-location !15
+    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value, DW_OP_LLVM_fragment, 16, 16), $ecx, debug-location !15
+    ; CHECK-NEXT: [{{.*}}): DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value, DW_OP_piece 0x2, DW_OP_breg2 RCX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value, DW_OP_piece 0x2
+
+    ; Check that fragments clobber preceeding overlap.
+    $edi = MOV32ri 7
+    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value, DW_OP_LLVM_fragment, 16, 16), $edi, debug-location !15
+    ; CHECK-NEXT: [{{.*}}): DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value, DW_OP_piece 0x2, DW_OP_breg5 RDI+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value, DW_OP_piece 0x2
+
+    ; Check that a (non-zero-offset) fragment works.
+    $ecx = MOV32ri 8
+    $ecx = MOV32ri 9
+    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value, DW_OP_LLVM_fragment, 16, 16), $eax, $ecx, debug-location !15
+    ; CHECK-NEXT: [{{.*}}): DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value, DW_OP_piece 0x2, DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_breg2 RCX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_plus, DW_OP_stack_value, DW_OP_piece 0x2
+
+    RET64 debug-location !15
+...
diff --git a/llvm/test/DebugInfo/X86/dbg_value_list_emission.mir b/llvm/test/DebugInfo/X86/dbg_value_list_emission.mir
index bc748e0009c4..de2bed1093f3 100644
--- a/llvm/test/DebugInfo/X86/dbg_value_list_emission.mir
+++ b/llvm/test/DebugInfo/X86/dbg_value_list_emission.mir
@@ -1,107 +1,107 @@
-# RUN: llc %s --start-after=livedebugvalues -filetype=obj -o - \
-# RUN:     | llvm-dwarfdump - -name local* -regex \
-# RUN:     | FileCheck %s
-#
-# Test that we produce correct DWARF from DBG_VALUE_LIST instructions.
-# Comments and test directives inline.
-
---- |
-  target triple = "x86_64-unknown-linux-gnu"
-  define dso_local i32 @fun() local_unnamed_addr !dbg !7 {
-  entry:
-    ret i32 0
-  }
-
-  !llvm.dbg.cu = !{!0}
-  !llvm.module.flags = !{!3, !4, !5}
-  !llvm.ident = !{!6}
-
-  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
-  !1 = !DIFile(filename: "example.c", directory: "/")
-  !2 = !{}
-  !3 = !{i32 7, !"Dwarf Version", i32 4}
-  !4 = !{i32 2, !"Debug Info Version", i32 3}
-  !5 = !{i32 1, !"wchar_size", i32 4}
-  !6 = !{!"clang version 11.0.0"}
-  !8 = !DISubroutineType(types: !9)
-  !9 = !{!10}
-  !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-  !11 = !{!12, !13, !25}
-  !22 = !DISubroutineType(types: !23)
-  !23 = !{!10, !10}
-  ; --- Important metadata ---
-  !7 = distinct !DISubprogram(name: "fun", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
-  !15 = !DILocation(line: 1, column: 1, scope: !7)
-  !12 = !DILocalVariable(name: "locala", scope: !7, file: !1, line: 1, type: !10)
-  !13 = !DILocalVariable(name: "localb", scope: !7, file: !1, line: 2, type: !10)
-  !25 = !DILocalVariable(name: "localc", scope: !7, file: !1, line: 3, type: !10)
-  !26 = !DILocalVariable(name: "locald", scope: !7, file: !1, line: 4, type: !10)
-  !27 = !DILocalVariable(name: "locale", scope: !7, file: !1, line: 5, type: !10)
-  !28 = !DILocalVariable(name: "localf", scope: !7, file: !1, line: 6, type: !10)
-  !29 = !DILocalVariable(name: "localg", scope: !7, file: !1, line: 6, type: !10)
-  !30 = !DILocalVariable(name: "localh", scope: !7, file: !1, line: 6, type: !10)
-  !31 = !DILocalVariable(name: "locali", scope: !7, file: !1, line: 6, type: !10)
-
-...
----
-name:            fun
-body:             |
-  bb.0.entry:
-    ; NOTE: By design, all DBG_VALUE_LIST instructions describe stack_value
-    ;       locations, so they are always created with a DW_OP_stack_value op.
-    ;
-    ; (1) Check a single reg arg works.
-    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value), $eax, debug-location !15
-    ; CHECK:      DW_TAG_variable
-    ;  CHECK-NEXT:   (DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value)
-    ;  CHECK-NEXT:   DW_AT_name ("locala")
-
-    ; (2) Check multiple reg args work.
-    DBG_VALUE_LIST !13, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value), $eax, $edi, debug-location !15
-    ; CHECK:      DW_TAG_variable
-    ; CHECK-NEXT:   (DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_breg5 RDI+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_plus, DW_OP_stack_value)
-    ; CHECK-NEXT:   DW_AT_name ("localb")
-
-    ; (3) Check that multiple references to one reg arg works.
-    DBG_VALUE_LIST !25, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 0, DW_OP_minus, DW_OP_stack_value), $eax, debug-location !15
-    ; CHECK:      DW_TAG_variable
-    ; CHECK-NEXT:   (DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_minus, DW_OP_stack_value)
-    ; CHECK-NEXT:   DW_AT_name ("localc")
-
-    ; (4) Check constant and reg args work together.
-    DBG_VALUE_LIST !26, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_mul, DW_OP_stack_value), $eax, 5, debug-location !15
-    ; CHECK:      DW_TAG_variable
-    ; CHECK-NEXT:   (DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_lit5, DW_OP_mul, DW_OP_stack_value)
-    ; CHECK-NEXT:   DW_AT_name ("locald")
-
-    ; (5) Check that arg deref works.
-    DBG_VALUE_LIST !27, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref, DW_OP_stack_value), $eax, debug-location !15
-    ; CHECK:      DW_TAG_variable
-    ; CHECK-NEXT:   (DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_deref, DW_OP_stack_value)
-    ; CHECK-NEXT:   DW_AT_name ("locale")
-
-    ; (6) Check that fragments work.
-    DBG_VALUE_LIST !28, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 16), $eax, debug-location !15
-    ; CHECK:      DW_TAG_variable
-    ; CHECK-NEXT:   (DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value, DW_OP_piece 0x2)
-    ; CHECK-NEXT:   DW_AT_name ("localf")
-
-    ; (7) Check that constant register offsets are correctly folded.
-    DBG_VALUE_LIST !29, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 5, DW_OP_LLVM_arg, 1, DW_OP_plus_uconst, 17, DW_OP_plus, DW_OP_stack_value), $eax, $edi, debug-location !15
-    ; CHECK:      DW_TAG_variable
-    ; CHECK-NEXT:   (DW_OP_breg0 RAX+5, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_breg5 RDI+17, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_plus, DW_OP_stack_value)
-    ; CHECK-NEXT:   DW_AT_name ("localg")
-    
-    ; (8) Check that a single $noreg location invalidates the entire entry.
-    DBG_VALUE_LIST !30, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value), $eax, $noreg, debug-location !15
-    ; CHECK-NOT:   DW_AT_name ("localh")
-
-    ; (9) Check that relational operators work
-    DBG_VALUE_LIST !31, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_eq, DW_OP_LLVM_arg, 0, DW_OP_ne, DW_OP_LLVM_arg, 1, DW_OP_gt, DW_OP_LLVM_arg, 0, DW_OP_lt, DW_OP_LLVM_arg, 1, DW_OP_le, DW_OP_stack_value), $eax, $edi, debug-location !15
-    ; CHECK:      DW_TAG_variable
-    ; CHECK-NEXT:   (DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_breg5 RDI+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_eq, DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_ne, DW_OP_breg5 RDI+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_gt, DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_lt, DW_OP_breg5 RDI+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_le, DW_OP_stack_value)
-    ; CHECK-NEXT:   DW_AT_name ("locali")
-
-
-    RET64 debug-location !15
-...
+# RUN: llc %s --start-after=livedebugvalues -filetype=obj -o - \
+# RUN:     | llvm-dwarfdump - -name local* -regex \
+# RUN:     | FileCheck %s
+#
+# Test that we produce correct DWARF from DBG_VALUE_LIST instructions.
+# Comments and test directives inline.
+
+--- |
+  target triple = "x86_64-unknown-linux-gnu"
+  define dso_local i32 @fun() local_unnamed_addr !dbg !7 {
+  entry:
+    ret i32 0
+  }
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4, !5}
+  !llvm.ident = !{!6}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+  !1 = !DIFile(filename: "example.c", directory: "/")
+  !2 = !{}
+  !3 = !{i32 7, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = !{i32 1, !"wchar_size", i32 4}
+  !6 = !{!"clang version 11.0.0"}
+  !8 = !DISubroutineType(types: !9)
+  !9 = !{!10}
+  !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !11 = !{!12, !13, !25}
+  !22 = !DISubroutineType(types: !23)
+  !23 = !{!10, !10}
+  ; --- Important metadata ---
+  !7 = distinct !DISubprogram(name: "fun", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
+  !15 = !DILocation(line: 1, column: 1, scope: !7)
+  !12 = !DILocalVariable(name: "locala", scope: !7, file: !1, line: 1, type: !10)
+  !13 = !DILocalVariable(name: "localb", scope: !7, file: !1, line: 2, type: !10)
+  !25 = !DILocalVariable(name: "localc", scope: !7, file: !1, line: 3, type: !10)
+  !26 = !DILocalVariable(name: "locald", scope: !7, file: !1, line: 4, type: !10)
+  !27 = !DILocalVariable(name: "locale", scope: !7, file: !1, line: 5, type: !10)
+  !28 = !DILocalVariable(name: "localf", scope: !7, file: !1, line: 6, type: !10)
+  !29 = !DILocalVariable(name: "localg", scope: !7, file: !1, line: 6, type: !10)
+  !30 = !DILocalVariable(name: "localh", scope: !7, file: !1, line: 6, type: !10)
+  !31 = !DILocalVariable(name: "locali", scope: !7, file: !1, line: 6, type: !10)
+
+...
+---
+name:            fun
+body:             |
+  bb.0.entry:
+    ; NOTE: By design, all DBG_VALUE_LIST instructions describe stack_value
+    ;       locations, so they are always created with a DW_OP_stack_value op.
+    ;
+    ; (1) Check a single reg arg works.
+    DBG_VALUE_LIST !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value), $eax, debug-location !15
+    ; CHECK:      DW_TAG_variable
+    ;  CHECK-NEXT:   (DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value)
+    ;  CHECK-NEXT:   DW_AT_name ("locala")
+
+    ; (2) Check multiple reg args work.
+    DBG_VALUE_LIST !13, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value), $eax, $edi, debug-location !15
+    ; CHECK:      DW_TAG_variable
+    ; CHECK-NEXT:   (DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_breg5 RDI+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_plus, DW_OP_stack_value)
+    ; CHECK-NEXT:   DW_AT_name ("localb")
+
+    ; (3) Check that multiple references to one reg arg works.
+    DBG_VALUE_LIST !25, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 0, DW_OP_minus, DW_OP_stack_value), $eax, debug-location !15
+    ; CHECK:      DW_TAG_variable
+    ; CHECK-NEXT:   (DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_minus, DW_OP_stack_value)
+    ; CHECK-NEXT:   DW_AT_name ("localc")
+
+    ; (4) Check constant and reg args work together.
+    DBG_VALUE_LIST !26, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_mul, DW_OP_stack_value), $eax, 5, debug-location !15
+    ; CHECK:      DW_TAG_variable
+    ; CHECK-NEXT:   (DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_lit5, DW_OP_mul, DW_OP_stack_value)
+    ; CHECK-NEXT:   DW_AT_name ("locald")
+
+    ; (5) Check that arg deref works.
+    DBG_VALUE_LIST !27, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref, DW_OP_stack_value), $eax, debug-location !15
+    ; CHECK:      DW_TAG_variable
+    ; CHECK-NEXT:   (DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_deref, DW_OP_stack_value)
+    ; CHECK-NEXT:   DW_AT_name ("locale")
+
+    ; (6) Check that fragments work.
+    DBG_VALUE_LIST !28, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 16), $eax, debug-location !15
+    ; CHECK:      DW_TAG_variable
+    ; CHECK-NEXT:   (DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_stack_value, DW_OP_piece 0x2)
+    ; CHECK-NEXT:   DW_AT_name ("localf")
+
+    ; (7) Check that constant register offsets are correctly folded.
+    DBG_VALUE_LIST !29, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 5, DW_OP_LLVM_arg, 1, DW_OP_plus_uconst, 17, DW_OP_plus, DW_OP_stack_value), $eax, $edi, debug-location !15
+    ; CHECK:      DW_TAG_variable
+    ; CHECK-NEXT:   (DW_OP_breg0 RAX+5, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_breg5 RDI+17, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_plus, DW_OP_stack_value)
+    ; CHECK-NEXT:   DW_AT_name ("localg")
+
+    ; (8) Check that a single $noreg location invalidates the entire entry.
+    DBG_VALUE_LIST !30, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value), $eax, $noreg, debug-location !15
+    ; CHECK-NOT:   DW_AT_name ("localh")
+
+    ; (9) Check that relational operators work
+    DBG_VALUE_LIST !31, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_eq, DW_OP_LLVM_arg, 0, DW_OP_ne, DW_OP_LLVM_arg, 1, DW_OP_gt, DW_OP_LLVM_arg, 0, DW_OP_lt, DW_OP_LLVM_arg, 1, DW_OP_le, DW_OP_stack_value), $eax, $edi, debug-location !15
+    ; CHECK:      DW_TAG_variable
+    ; CHECK-NEXT:   (DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_breg5 RDI+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_eq, DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_ne, DW_OP_breg5 RDI+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_gt, DW_OP_breg0 RAX+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_lt, DW_OP_breg5 RDI+0, DW_OP_constu 0xffffffff, DW_OP_and, DW_OP_le, DW_OP_stack_value)
+    ; CHECK-NEXT:   DW_AT_name ("locali")
+
+
+    RET64 debug-location !15
+...
diff --git a/llvm/test/DebugInfo/X86/instr-ref-dbg-declare.ll b/llvm/test/DebugInfo/X86/instr-ref-dbg-declare.ll
index 40ac544e0d24..5e900ac4db07 100644
--- a/llvm/test/DebugInfo/X86/instr-ref-dbg-declare.ll
+++ b/llvm/test/DebugInfo/X86/instr-ref-dbg-declare.ll
@@ -2,7 +2,7 @@
 ; RUN:     -experimental-debug-variable-locations=true \
 ; RUN:  | FileCheck %s
 
-;; Copy of DebugInfo/COFF/types-array-advanced.ll. This features a dbg.declare 
+;; Copy of DebugInfo/COFF/types-array-advanced.ll. This features a dbg.declare
 ;; of something (dynamic alloca) that isn't an argument, causing a SDDbgValue
 ;; with the indirect flag set to be emitted. Test that it's preserved in
 ;; instruction referencing mode -- we don't have an IsIndirect flag on
@@ -11,7 +11,7 @@
 ;; NB: the original test has an additional spurious DW_OP_deref in the
 ;; dbg.declare's arguments, which is preserved here, translating to two derefs.
 
-; CHECK: DBG_INSTR_REF !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref, DW_OP_deref), dbg-instr-ref(1, 2)
+; CHECK: DBG_INSTR_REF !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_deref, DW_OP_deref), dbg-instr-ref(1, 2)
 
 source_filename = "test/DebugInfo/COFF/types-array-advanced.ll"
 target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
diff --git a/llvm/test/DebugInfo/X86/instr-ref-dyn-alloca-win32.ll b/llvm/test/DebugInfo/X86/instr-ref-dyn-alloca-win32.ll
index e1f9e7ebf56a..d88bf365e275 100644
--- a/llvm/test/DebugInfo/X86/instr-ref-dyn-alloca-win32.ll
+++ b/llvm/test/DebugInfo/X86/instr-ref-dyn-alloca-win32.ll
@@ -13,7 +13,7 @@
 ;; The alloca instruction should be labelled, and we should refer to operand 2,
 ;; which happens to be a def of $esp
 ; DYN_LABEL: DYN_ALLOCA_32 {{.*}} debug-instr-number 1,
-; DYN_LABEL: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 2)
+; DYN_LABEL: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 2)
 
 ;; Once lowered, on win32 _chkstk alters the stack pointer. We should label the
 ;; call and it's SP operand, plus check for a value substitution.
diff --git a/llvm/test/DebugInfo/X86/instr-ref-ir-reg-read.ll b/llvm/test/DebugInfo/X86/instr-ref-ir-reg-read.ll
index 958fd0e7adc8..ef2a07ac991e 100644
--- a/llvm/test/DebugInfo/X86/instr-ref-ir-reg-read.ll
+++ b/llvm/test/DebugInfo/X86/instr-ref-ir-reg-read.ll
@@ -9,7 +9,7 @@
 ; Just examine to see that we read something from $rsp.
 ; CHECK-LABEL: bb.1.if.then:
 ; CHECK:       DBG_PHI $rsp, 1
-; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+; CHECK:       DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
 
 source_filename = "tlb-9e7172.c"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/DebugInfo/X86/live-debug-values-expr-conflict.ll b/llvm/test/DebugInfo/X86/live-debug-values-expr-conflict.ll
index afce27ee4abb..8e84240ceaa9 100644
--- a/llvm/test/DebugInfo/X86/live-debug-values-expr-conflict.ll
+++ b/llvm/test/DebugInfo/X86/live-debug-values-expr-conflict.ll
@@ -7,7 +7,7 @@
 ;
 ; void escape1(int bees);
 ; void escape2(int bees);
-; 
+;
 ; int foo(int bar) {
 ;   int baz = bar;
 ;   if (baz == 12) {
@@ -16,7 +16,7 @@
 ;     baz += 1;
 ;     escape2(bar);
 ;   }
-; 
+;
 ;   return bar;
 ; }
 ;
@@ -24,18 +24,18 @@
 ; one in the block two, and none in block three.
 ; CHECK:       ![[BAZVAR:[0-9]+]] = !DILocalVariable(name: "baz",
 ; CHECK-LABEL: bb.0.entry:
-; CHECK:       DBG_VALUE {{[0-9a-zA-Z$%_]*}}, $noreg, ![[BAZVAR]], 
+; CHECK:       DBG_VALUE {{[0-9a-zA-Z$%_]*}}, $noreg, ![[BAZVAR]],
 ; CHECK-SAME:     !DIExpression()
 ; CHECK-LABEL: bb.1.if.then:
 ; CHECK-LABEL: bb.2.if.else:
-; CHECK:       DBG_VALUE {{[0-9a-zA-Z$%_]*}}, $noreg, ![[BAZVAR]], 
+; CHECK:       DBG_VALUE {{[0-9a-zA-Z$%_]*}}, $noreg, ![[BAZVAR]],
 ; CHECK-SAME:     !DIExpression()
-; CHECK:       DBG_VALUE_LIST ![[BAZVAR]], 
-; CHECK-SAME:     !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 1, DW_OP_stack_value)
-; CHECK-SAME:     {{[0-9a-zA-Z$%_]*}}
+; CHECK:       DBG_VALUE_LIST ![[BAZVAR]],
+; CHECK-SAME:     !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 1, DW_OP_stack_value)
+; CHECK-SAME:     {{[0-9a-zA-Z$%_]*}}
 ; CHECK-LABEL: bb.3.if.end:
 ; CHECK-NOT:   DBG_VALUE
-; CHECK-NOT:   DBG_VALUE_LIST
+; CHECK-NOT:   DBG_VALUE_LIST
 
 declare void @escape1(i32)
 declare void @escape2(i32)
diff --git a/llvm/test/DebugInfo/X86/packed_bitfields.ll b/llvm/test/DebugInfo/X86/packed_bitfields.ll
index 0e541f09d227..614fa59c3678 100644
--- a/llvm/test/DebugInfo/X86/packed_bitfields.ll
+++ b/llvm/test/DebugInfo/X86/packed_bitfields.ll
@@ -15,7 +15,7 @@
 ; CHECK-NOT: DW_TAG_member
 ; CHECK:      DW_AT_byte_size  {{.*}} (0x01)
 ; CHECK-NEXT: DW_AT_bit_size   {{.*}} (0x06)
-; CHECK-NEXT: DW_AT_bit_offset {{.*}} (0xffffffffffffffff)
+; CHECK-NEXT: DW_AT_bit_offset {{.*}} (-1)
 ; CHECK-NEXT: DW_AT_data_member_location {{.*}} ({{.*}}0x0{{0*}})
 
 ; ModuleID = 'repro.c'
diff --git a/llvm/test/DebugInfo/X86/pieces-4.ll b/llvm/test/DebugInfo/X86/pieces-4.ll
index aa93bd6a7f5f..d171931c0b4a 100644
--- a/llvm/test/DebugInfo/X86/pieces-4.ll
+++ b/llvm/test/DebugInfo/X86/pieces-4.ll
@@ -18,7 +18,7 @@
 ; CHECK-LABEL: bitpiece_spill:                         # @bitpiece_spill
 ; CHECK:               callq   g
 ; CHECK:               movl    %eax, [[offs:[0-9]+]](%rsp)          # 4-byte Spill
-; CHECK:               #DEBUG_VALUE: bitpiece_spill:o <- [DW_OP_plus_uconst [[offs]], DW_OP_deref, DW_OP_LLVM_fragment 0 32] $rsp
+; CHECK:               #DEBUG_VALUE: bitpiece_spill:o <- [DW_OP_plus_uconst [[offs]], DW_OP_deref, DW_OP_LLVM_fragment 0 32] $rsp
 ; CHECK:               #DEBUG_VALUE: bitpiece_spill:o <- [DW_OP_LLVM_fragment 32 32] 0
 ; CHECK:               #APP
 ; CHECK:               #NO_APP
diff --git a/llvm/test/DebugInfo/X86/pr34545.ll b/llvm/test/DebugInfo/X86/pr34545.ll
index 013428f4b558..27a784cbcc4b 100644
--- a/llvm/test/DebugInfo/X86/pr34545.ll
+++ b/llvm/test/DebugInfo/X86/pr34545.ll
@@ -9,22 +9,22 @@
 
 ; CHECK:         $eax = MOV32rm
 ; INSTRREF-SAME:      debug-instr-number 1
-; INSTRREF:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
-; VARLOCS:         DBG_VALUE $eax
-; INSTRREF:        DBG_VALUE_LIST {{.+}} $eax
+; INSTRREF:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+; VARLOCS:         DBG_VALUE $eax
+; INSTRREF:        DBG_VALUE_LIST {{.+}} $eax
 ; CHECK:         $eax = SHL32rCL killed renamable $eax,
 ; INSTRREF-SAME:      debug-instr-number 2
-; INSTRREF:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
-; VARLOCS:       DBG_VALUE $eax
-; INSTRREF:      DBG_VALUE_LIST {{.+}} $eax
-; VARLOCS:       DBG_VALUE $rsp, 0, !{{[0-9]+}}, !DIExpression(DW_OP_constu, 4, DW_OP_minus)
-; INSTRREF:      DBG_VALUE_LIST !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_constu, 4, DW_OP_minus, DW_OP_deref), $rsp
+; INSTRREF:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(2, 0)
+; VARLOCS:       DBG_VALUE $eax
+; INSTRREF:      DBG_VALUE_LIST {{.+}} $eax
+; VARLOCS:       DBG_VALUE $rsp, 0, !{{[0-9]+}}, !DIExpression(DW_OP_constu, 4, DW_OP_minus)
+; INSTRREF:      DBG_VALUE_LIST !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_constu, 4, DW_OP_minus, DW_OP_deref), $rsp
 ; VARLOCS:       DBG_VALUE $eax
 ; CHECK:         $eax = SHL32rCL killed renamable $eax,
 ; INSTRREF-SAME:      debug-instr-number 3
-; INSTRREF:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 0)
-; VARLOCS:       DBG_VALUE $eax
-; INSTRREF:      DBG_VALUE_LIST {{.+}} $eax
+; INSTRREF:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 0)
+; VARLOCS:       DBG_VALUE $eax
+; INSTRREF:      DBG_VALUE_LIST {{.+}} $eax
 ; CHECK:         RET64 $eax
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/DebugInfo/X86/pr40427.ll b/llvm/test/DebugInfo/X86/pr40427.ll
index fe5bc4a5470f..264076b2569d 100644
--- a/llvm/test/DebugInfo/X86/pr40427.ll
+++ b/llvm/test/DebugInfo/X86/pr40427.ll
@@ -30,7 +30,7 @@ block:
 ; CHECK-NEXT: [[LOADR:%[0-9]+]]:gr16 = MOV16rm %0,
 ; INSTRREF-SAME: debug-instr-number 1
 ; DBGVALUE-NEXT: DBG_VALUE [[LOADR]], $noreg, ![[DBGVAR]]
-; INSTRREF-NEXT: DBG_INSTR_REF ![[DBGVAR]], {{.+}}, dbg-instr-ref(1, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF ![[DBGVAR]], {{.+}}, dbg-instr-ref(1, 0)
 ; CHECK-NEXT: %{{[0-9]+}}:gr32 = IMPLICIT_DEF
   %foo = phi ptr[%bees, %trueb], [%more, %falseb]
   %ret = load i32, ptr %foo, !dbg !6
diff --git a/llvm/test/DebugInfo/X86/sdag-combine.ll b/llvm/test/DebugInfo/X86/sdag-combine.ll
index 4b8708830fdd..84ef27be6dbd 100644
--- a/llvm/test/DebugInfo/X86/sdag-combine.ll
+++ b/llvm/test/DebugInfo/X86/sdag-combine.ll
@@ -17,7 +17,7 @@ entry:
   %0 = alloca %TSb, align 1
   %1 = call swiftcc i1 @f(), !dbg !7
   ; CHECK: DBG_VALUE $rcx, $noreg, !8, !DIExpression(),
-  ; INSTRREF: DBG_VALUE_LIST !8, !DIExpression(DW_OP_LLVM_arg, 0), $ecx
+  ; INSTRREF: DBG_VALUE_LIST !8, !DIExpression(DW_OP_LLVM_arg, 0), $ecx
   call void @llvm.dbg.value(metadata i1 %1, metadata !8, metadata !DIExpression()), !dbg !7
   store i1 %1, ptr %0, align 1, !dbg !7
   %2 = zext i1 %1 to i64, !dbg !7
diff --git a/llvm/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll b/llvm/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll
index 600d6d837964..2f9af6054039 100644
--- a/llvm/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll
+++ b/llvm/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll
@@ -81,9 +81,9 @@ target triple = "x86_64-apple-macosx10.4.0"
 define i32 @test1() local_unnamed_addr #0 !dbg !17 {
 ; CHECK-LABEL: bb.0.entry1
 ; CHECK-NEXT:    DBG_VALUE 0, $noreg, ![[BAR1]], !DIExpression()
-; CHECK-NEXT:    [[REG1:%[0-9]+]]:gr64 = LEA64r 
+; CHECK-NEXT:    [[REG1:%[0-9]+]]:gr64 = LEA64r
 ; INSTRREF-SAME:    debug-instr-number 1
-; INSTRREF-NEXT:  DBG_INSTR_REF ![[FOO1]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
+; INSTRREF-NEXT:  DBG_INSTR_REF ![[FOO1]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
 ; DBGVALUE-NEXT:  DBG_VALUE [[REG1]], $noreg, ![[FOO1]], !DIExpression()
 entry1:
   call void @llvm.dbg.value(metadata ptr @S, metadata !20, metadata !DIExpression()), !dbg !23
@@ -96,8 +96,8 @@ define i32 @test2() local_unnamed_addr #0 !dbg !26 {
 ; CHECK-LABEL: bb.0.entry2
 ; CHECK-NEXT:    [[REG2:%[0-9]+]]:gr64 = LEA64r
 ; INSTRREF-SAME:    debug-instr-number 1
-; INSTRREF-NEXT: DBG_INSTR_REF ![[FOO2]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
-; INSTRREF-NEXT: DBG_INSTR_REF ![[BAR2]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF ![[FOO2]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF ![[BAR2]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG2]], $noreg, ![[FOO2]], !DIExpression
 ; DBGVALUE-NEXT: DBG_VALUE [[REG2]], $noreg, ![[BAR2]], !DIExpression
 entry2:
@@ -111,8 +111,8 @@ define i32 @test3() local_unnamed_addr #0 !dbg !33 {
 ; CHECK-LABEL: bb.0.entry3
 ; CHECK-NEXT:    [[REG3:%[0-9]+]]:gr64 = LEA64r
 ; INSTRREF-SAME:    debug-instr-number 1
-; INSTRREF-NEXT: DBG_INSTR_REF ![[BAR3]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
-; INSTRREF-NEXT: DBG_INSTR_REF ![[FOO3]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF ![[BAR3]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF ![[FOO3]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG3]], $noreg, ![[BAR3]], !DIExpression()
 ; DBGVALUE-NEXT: DBG_VALUE [[REG3]], $noreg, ![[FOO3]], !DIExpression()
 entry3:
@@ -128,7 +128,7 @@ define i32 @test4() local_unnamed_addr #0 !dbg !40 {
 ; CHECK-NEXT:    DBG_VALUE 0, $noreg, ![[FOO4]], !DIExpression()
 ; CHECK-NEXT:    [[REG4:%[0-9]+]]:gr64 = LEA64r
 ; INSTRREF-SAME:    debug-instr-number 1
-; INSTRREF-NEXT: DBG_INSTR_REF ![[BAR4]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF ![[BAR4]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG4]], $noreg, ![[BAR4]], !DIExpression()
 entry4:
   call void @llvm.dbg.value(metadata ptr @S, metadata !42, metadata !DIExpression()), !dbg !44
@@ -144,7 +144,7 @@ define i32 @test5() local_unnamed_addr #0 !dbg !47 {
 ; CHECK-NEXT:    DBG_VALUE 0, $noreg, ![[FOO5]], !DIExpression()
 ; CHECK-NEXT:    [[REG5:%[0-9]+]]:gr64 = LEA64r
 ; INSTRREF-SAME:    debug-instr-number 1
-; INSTRREF-NEXT: DBG_INSTR_REF ![[BAR5]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF ![[BAR5]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG5]], $noreg, ![[BAR5]], !DIExpression()
 ; CHECK-NOT:     DBG_{{.*}} ![[FOO5]], !DIExpression()
 ; CHECK:         RET
diff --git a/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-1.ll b/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-1.ll
index 97b156773b91..3bece362d50a 100644
--- a/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-1.ll
+++ b/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-1.ll
@@ -59,7 +59,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
 ; CHECK-LABEL: bb.{{.*}}.for.cond.cleanup:
 ; CHECK:         [[REG1:%[0-9]+]]:gr32 = PHI
 ; INSTRREF-SAME:    debug-instr-number 7
-; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(7, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(7, 0)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG1]]
   %x.0.lcssa = phi i32 [ 9, %entry ], [ %add, %for.body ]
   call void @llvm.dbg.value(metadata i32 %x.0.lcssa, metadata !15, metadata !DIExpression()), !dbg !26
@@ -76,9 +76,9 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
 ; INSTRREF-SAME:   debug-instr-number 4
 ; CHECK-NEXT:   [[REG4:%[0-9]+]]:gr32 = PHI
 ; INSTRREF-SAME:   debug-instr-number 5
-; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 0)
-; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(4, 0)
-; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(5, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(3, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(4, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(5, 0)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG2]]
 ; DBGVALUE-NEXT: DBG_VALUE [[REG3]]
 ; DBGVALUE-NEXT: DBG_VALUE [[REG4]]
diff --git a/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-2.ll b/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-2.ll
index 6ff5f3f82a33..d94c394cec5f 100644
--- a/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-2.ll
+++ b/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-2.ll
@@ -38,7 +38,7 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
 ; CHECK-LABEL: bb.{{.*}}.for.cond.cleanup:
 ; CHECK:         [[REG1:%[0-9]+]]:gr32 = PHI
 ; INSTRREF-SAME:      debug-instr-number 7
-; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(7, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(7, 0)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG1]]
   %x.0.lcssa = phi i32 [ 9, %entry ], [ %add, %for.body ]
   call void @llvm.dbg.value(metadata i32 %x.0.lcssa, metadata !15, metadata !DIExpression()), !dbg !26
@@ -55,27 +55,27 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
 ; INSTRREF-SAME:    debug-instr-number 3
 ; CHECK-NEXT:    [[REG4:%[0-9]+]]:gr32 = PHI
 ; INSTRREF-SAME:    debug-instr-number 6
-; INSTRREF-NEXT: DBG_INSTR_REF !16, {{.+}}, dbg-instr-ref(3, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !16, {{.+}}, dbg-instr-ref(3, 0)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG3]], $noreg, !16
 ; CHECK-NEXT:    DBG_VALUE 555, $noreg, !17
 ; CHECK-NEXT:    [[ADDREG:%[0-9]+]]:gr32 = nuw nsw ADD32rr
 ; INSTRREF-SAME:    debug-instr-number 5
-; INSTRREF-NEXT: DBG_INSTR_REF !17, {{.+}}, dbg-instr-ref(4, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !17, {{.+}}, dbg-instr-ref(4, 0)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG2]], $noreg, !17
 ; CHECK:         [[MULREG:%[0-9]+]]:gr32 = LEA64_32r
 ; INSTRREF-SAME:    debug-instr-number 1
 ; CHECK-NEXT:    DBG_VALUE 777, $noreg, !17
 ;;; XXX: The following DBG_INSTR_REF should have stayed below the INC32r
-; INSTRREF-NEXT: DBG_INSTR_REF !16, {{.+}}, dbg-instr-ref(1, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !16, {{.+}}, dbg-instr-ref(1, 0)
 ; DBGVALUE-NEXT: DBG_VALUE [[MULREG]], $noreg, !16
 ; CHECK-NEXT:    [[INCREG:%[0-9]+]]:gr32 = nuw nsw INC32r
 ; INSTRREF-SAME:    debug-instr-number 2
-; INSTRREF-NEXT: DBG_INSTR_REF !17, {{.+}}, dbg-instr-ref(2, 0)
-; INSTRREF-NEXT: DBG_INSTR_REF !15, {{.+}}, dbg-instr-ref(5, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !17, {{.+}}, dbg-instr-ref(2, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !15, {{.+}}, dbg-instr-ref(5, 0)
 ; DBGVALUE-NEXT: DBG_VALUE [[INCREG]], $noreg, !17
 ; DBGVALUE-NEXT: DBG_VALUE [[ADDREG]], $noreg, !15
 ; CHECK-NEXT:    implicit-def $eflags,
-; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(6, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(6, 0)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG4]]
   %u.023 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
   %y.022 = phi i32 [ 13, %for.body.lr.ph ], [ %mul, %for.body ]
diff --git a/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-3.ll b/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-3.ll
index fd0cfc1e9612..c2cdca3bb28c 100644
--- a/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-3.ll
+++ b/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-3.ll
@@ -79,12 +79,12 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
 ; INSTRREF-SAME:    debug-instr-number 9
 ; CHECK-NEXT:    [[REG7:%[0-9]+]]:gr32 = PHI
 ; INSTRREF-SAME:    debug-instr-number 10
-; INSTRREF-NEXT: DBG_INSTR_REF !19, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 0, 32), dbg-instr-ref(5, 0)
-; INSTRREF-NEXT: DBG_INSTR_REF !19, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 32, 32), dbg-instr-ref(6, 0)
-; INSTRREF-NEXT: DBG_INSTR_REF !18, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 0, 32), dbg-instr-ref(7, 0)
-; INSTRREF-NEXT: DBG_INSTR_REF !18, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 32, 32), dbg-instr-ref(8, 0)
-; INSTRREF-NEXT: DBG_INSTR_REF !17, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 0, 32), dbg-instr-ref(9, 0)
-; INSTRREF-NEXT: DBG_INSTR_REF !17, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 32, 32), dbg-instr-ref(10, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !19, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 0, 32), dbg-instr-ref(5, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !19, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 32, 32), dbg-instr-ref(6, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !18, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 0, 32), dbg-instr-ref(7, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !18, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 32, 32), dbg-instr-ref(8, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !17, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 0, 32), dbg-instr-ref(9, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !17, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 32, 32), dbg-instr-ref(10, 0)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG2]], $noreg, !19, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG3]], $noreg, !19, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG4]], $noreg, !18, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
diff --git a/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-4.ll b/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-4.ll
index 47c8515a537c..1c04e77c7d28 100644
--- a/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-4.ll
+++ b/llvm/test/DebugInfo/X86/sdag-dbgvalue-phi-use-4.ll
@@ -23,11 +23,11 @@
 ; INSTRREF-SAME:    debug-instr-number 2
 ; CHECK-NEXT:    [[REG3:%[0-9]+]]:gr32 = PHI
 ; INSTRREF-SAME:    debug-instr-number 3
-; INSTRREF-NEXT: DBG_INSTR_REF !13, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 0, 32), dbg-instr-ref(1, 0)
-; INSTRREF-NEXT: DBG_INSTR_REF !13, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 32, 32), dbg-instr-ref(2, 0)
-; INSTRREF-NEXT: DBG_INSTR_REF !13, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 16), dbg-instr-ref(3, 0)
-; INSTRREF-NEXT: DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 10, 32), dbg-instr-ref(1, 0)
-; INSTRREF-NEXT: DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 42, 13), dbg-instr-ref(2, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !13, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 0, 32), dbg-instr-ref(1, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !13, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 32, 32), dbg-instr-ref(2, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !13, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 64, 16), dbg-instr-ref(3, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 10, 32), dbg-instr-ref(1, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF !12, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 42, 13), dbg-instr-ref(2, 0)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG1]], $noreg,  !13, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG2]], $noreg,  !13, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
 ; DBGVALUE-NEXT: DBG_VALUE [[REG3]], $noreg,  !13, !DIExpression(DW_OP_LLVM_fragment, 64, 16)
diff --git a/llvm/test/DebugInfo/X86/sdag-dbgvalue-ssareg.ll b/llvm/test/DebugInfo/X86/sdag-dbgvalue-ssareg.ll
index c712d8410b7b..ca23d10531cf 100644
--- a/llvm/test/DebugInfo/X86/sdag-dbgvalue-ssareg.ll
+++ b/llvm/test/DebugInfo/X86/sdag-dbgvalue-ssareg.ll
@@ -34,7 +34,7 @@ nextbb:
   %2 = mul i32 %0, %arg1, !dbg !26
 ; CHECK: IMUL32rr
   call void @llvm.dbg.value(metadata i32 %1, metadata !16, metadata !DIExpression()), !dbg !27
-; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
 ; DBGVALUE-NEXT: DBG_VALUE
   br label %exit, !dbg !26
 
diff --git a/llvm/test/DebugInfo/X86/sdag-ir-salvage.ll b/llvm/test/DebugInfo/X86/sdag-ir-salvage.ll
index a620b239f14f..c9a16e326491 100644
--- a/llvm/test/DebugInfo/X86/sdag-ir-salvage.ll
+++ b/llvm/test/DebugInfo/X86/sdag-ir-salvage.ll
@@ -19,7 +19,7 @@
 ; CHECK-LABEL: bb.0.entry:
 ; INSTRREF:    DBG_PHI $rdi, 1
 ; CHECK-LABEL: bb.1.next:
-; INSTRREF:    DBG_INSTR_REF ![[AAAVAR]], {{.+}}, dbg-instr-ref(1, 0)
+; INSTRREF:    DBG_INSTR_REF ![[AAAVAR]], {{.+}}, dbg-instr-ref(1, 0)
 ; DBGVALUE:    DBG_VALUE %{{[0-9]+}}, $noreg, ![[AAAVAR]]
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/DebugInfo/X86/sdag-salvage-add.ll b/llvm/test/DebugInfo/X86/sdag-salvage-add.ll
index ecd2c0dad3f1..5a77b9504206 100644
--- a/llvm/test/DebugInfo/X86/sdag-salvage-add.ll
+++ b/llvm/test/DebugInfo/X86/sdag-salvage-add.ll
@@ -30,21 +30,21 @@
 ; NB: instruction referencing and DBG_VALUE modes produce debug insts in a
 ; different order.
 ;
-; CHECK:         ![[S4:.*]] = !DILocalVariable(name: "s4", 
-; CHECK:         ![[MYVAR:.*]] = !DILocalVariable(name: "myVar", 
+; CHECK:         ![[S4:.*]] = !DILocalVariable(name: "s4",
+; CHECK:         ![[MYVAR:.*]] = !DILocalVariable(name: "myVar",
 ; CHECK:         $rax = MOV64rm
 ; INSTRREF-SAME: debug-instr-number 2,
-; INSTRREF-NEXT: DBG_INSTR_REF ![[S4]],
+; INSTRREF-NEXT: DBG_INSTR_REF ![[S4]],
 ; DBGVALUE-NEXT: DBG_VALUE $rax, $noreg, ![[MYVAR]],
-; DBGVALUE-SAME:       !DIExpression(DW_OP_plus_uconst, 4096, DW_OP_stack_value)
-; INSTRREF-SAME:       !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 4096, DW_OP_stack_value)
-; INSTRREF-SAME: dbg-instr-ref(2, 0)
+; DBGVALUE-SAME:       !DIExpression(DW_OP_plus_uconst, 4096, DW_OP_stack_value)
+; INSTRREF-SAME:       !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 4096, DW_OP_stack_value)
+; INSTRREF-SAME: dbg-instr-ref(2, 0)
 
-; INSTRREF:      DBG_INSTR_REF ![[MYVAR]],
+; INSTRREF:      DBG_INSTR_REF ![[MYVAR]],
 ; DBGVALUE:      DBG_VALUE $rax, $noreg, ![[S4]],
-; DBGVALUE-SAME:           !DIExpression(DW_OP_plus_uconst, 4096, DW_OP_stack_value)
-; INSTRREF-SAME:           !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 4096, DW_OP_stack_value)
-; INSTRREF-SAME: dbg-instr-ref(2, 0)
+; DBGVALUE-SAME:           !DIExpression(DW_OP_plus_uconst, 4096, DW_OP_stack_value)
+; INSTRREF-SAME:           !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 4096, DW_OP_stack_value)
+; INSTRREF-SAME: dbg-instr-ref(2, 0)
 ; CHECK-NEXT: $rdi = MOV64rm killed renamable $rax, 1, $noreg, 4096, $noreg,
 
 source_filename = "test.c"
diff --git a/llvm/test/DebugInfo/X86/sdag-transfer-dbgvalue.ll b/llvm/test/DebugInfo/X86/sdag-transfer-dbgvalue.ll
index a0f839eadf53..b4bf3a802684 100644
--- a/llvm/test/DebugInfo/X86/sdag-transfer-dbgvalue.ll
+++ b/llvm/test/DebugInfo/X86/sdag-transfer-dbgvalue.ll
@@ -24,7 +24,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK-LABEL: bb.0.entry:
 ; CHECK:         %[[REG:[0-9]+]]:gr32 = ADD32ri %1, 512,
 ; INSTRREF-SAME:   debug-instr-number 1
-; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
+; INSTRREF-NEXT: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
 ; DBGVALUE-NEXT: DBG_VALUE %[[REG]]
 
 ; Function Attrs: nofree norecurse nounwind uwtable writeonly
diff --git a/llvm/test/DebugInfo/X86/sdagsplit-1.ll b/llvm/test/DebugInfo/X86/sdagsplit-1.ll
index 29c5f6967156..40a6be850c38 100644
--- a/llvm/test/DebugInfo/X86/sdagsplit-1.ll
+++ b/llvm/test/DebugInfo/X86/sdagsplit-1.ll
@@ -13,8 +13,8 @@
 ;      return 0;
 ;    }
 ;
-; CHECK-DAG: DBG_VALUE_LIST !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 0, 32), ${{[a-z]+}}, debug-location
-; CHECK-DAG: DBG_VALUE_LIST !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 32, 32), ${{[a-z]+}}, debug-location
+; CHECK-DAG: DBG_VALUE_LIST !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 0, 32), ${{[a-z]+}}, debug-location
+; CHECK-DAG: DBG_VALUE_LIST !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_fragment, 32, 32), ${{[a-z]+}}, debug-location
 
 ; ModuleID = 'sdagsplit-1.c'
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
diff --git a/llvm/test/DebugInfo/X86/spill-nospill.ll b/llvm/test/DebugInfo/X86/spill-nospill.ll
index e6d34df836f8..94e47aea41bd 100644
--- a/llvm/test/DebugInfo/X86/spill-nospill.ll
+++ b/llvm/test/DebugInfo/X86/spill-nospill.ll
@@ -24,7 +24,7 @@
 ; CHECK-LABEL: f: # @f
 ; CHECK: callq   g
 ; CHECK: movl    %eax, [[X_OFFS:[0-9]+]](%rsp)          # 4-byte Spill
-; CHECK: #DEBUG_VALUE: f:x <- [DW_OP_plus_uconst [[X_OFFS]], DW_OP_deref] $rsp
+; CHECK: #DEBUG_VALUE: f:x <- [DW_OP_plus_uconst [[X_OFFS]], DW_OP_deref] $rsp
 ; CHECK: #APP
 ; CHECK: #NO_APP
 ; CHECK: callq   g
diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/hotcoldsplit.ll b/llvm/test/DebugInfo/assignment-tracking/X86/hotcoldsplit.ll
new file mode 100644
index 000000000000..f3faba7122af
--- /dev/null
+++ b/llvm/test/DebugInfo/assignment-tracking/X86/hotcoldsplit.ll
@@ -0,0 +1,50 @@
+; RUN: opt %s -passes=hotcoldsplit -S | FileCheck %s
+
+;; Check the extracted DIAssignID gets remapped.
+
+; CHECK-LABEL: define void @_foo()
+; CHECK: common.ret:
+; CHECK-NEXT: dbg.assign(metadata i64 0, metadata ![[#]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64), metadata ![[ID1:[0-9]+]], {{.*}}, metadata !DIExpression())
+
+; CHECK-LABEL: define internal void @_foo.cold.1()
+; CHECK: store i64 0, ptr null, align 8, !DIAssignID ![[ID2:[0-9]+]]
+
+; CHECK-DAG: ![[ID1]] = distinct !DIAssignID()
+; CHECK-DAG: ![[ID2]] = distinct !DIAssignID()
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @_foo() !dbg !4 {
+entry:
+  br i1 false, label %if.then7, label %common.ret
+
+common.ret:                                       ; preds = %entry
+  call void @llvm.dbg.assign(metadata i64 0, metadata !7, metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64), metadata !12, metadata ptr null, metadata !DIExpression()), !dbg !13
+  ret void
+
+if.then7:                                         ; preds = %entry
+  %call21 = load i1, ptr null, align 4294967296
+  store i64 0, ptr null, align 8, !DIAssignID !12
+  unreachable
+}
+
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "file.cpp", directory: "foo")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", linkageName: "_foo", scope: !5, file: !1, line: 425, type: !6, scopeLine: 425, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!5 = !DINamespace(name: "llvm", scope: null)
+!6 = distinct !DISubroutineType(types: !2)
+!7 = !DILocalVariable(name: "Path", scope: !4, file: !1, line: 436, type: !8)
+!8 = !DIDerivedType(tag: DW_TAG_typedef, name: "string", scope: !9, file: !1, line: 79, baseType: !10)
+!9 = !DINamespace(name: "std", scope: null)
+!10 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "basic_string<char, std::char_traits<char>, std::allocator<char> >", scope: !11, file: !1, line: 85, size: 256, flags: DIFlagTypePassByReference | DIFlagNonTrivial, elements: !2, templateParams: !2, identifier: "_ZTSNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE")
+!11 = !DINamespace(name: "__cxx11", scope: !9, exportSymbols: true)
+!12 = distinct !DIAssignID()
+!13 = !DILocation(line: 0, scope: !4)
diff --git a/llvm/test/DebugInfo/precomp.test b/llvm/test/DebugInfo/precomp.test
index 0734f185a6f3..2c3c8db6a86b 100644
--- a/llvm/test/DebugInfo/precomp.test
+++ b/llvm/test/DebugInfo/precomp.test
@@ -1,57 +1,57 @@
-
-RUN: rm -rf %t1/
-RUN: mkdir %t1
-RUN: obj2yaml %S/Inputs/precomp-a.obj > %t1/precomp-a.yaml
-RUN: obj2yaml %S/Inputs/precomp.obj > %t1/precomp.yaml
-RUN: yaml2obj %t1/precomp-a.yaml -o %t1/a.obj
-RUN: yaml2obj %t1/precomp.yaml -o %t1/precomp.obj
-RUN: llvm-readobj --codeview %t1/a.obj | FileCheck %s -check-prefix PRECOMP
-RUN: llvm-readobj --codeview %t1/precomp.obj | FileCheck %s -check-prefix ENDPRECOMP
-RUN: llvm-pdbutil dump -types %t1/a.obj | FileCheck %s -check-prefix PDB-PRECOMP
-RUN: llvm-pdbutil dump -types %t1/precomp.obj | FileCheck %s -check-prefix PDB-ENDPRECOMP
-
-ENDPRECOMP:      CodeViewTypes [
-ENDPRECOMP-NEXT:   Section: .debug$P (3)
-ENDPRECOMP:        EndPrecomp (0x1407) {
-ENDPRECOMP-NEXT:     TypeLeafKind: LF_ENDPRECOMP (0x14)
-ENDPRECOMP-NEXT:     Signature: 0x1116980E
-ENDPRECOMP-NEXT:   }
-
-PRECOMP:      CodeViewTypes [
-PRECOMP-NEXT:   Section: .debug$T (3)
-PRECOMP-NEXT:   Magic: 0x4
-PRECOMP-NEXT:   Precomp (0x1000) {
-PRECOMP-NEXT:     TypeLeafKind: LF_PRECOMP (0x1509)
-PRECOMP-NEXT:     StartIndex: 0x1000
-PRECOMP-NEXT:     Count: 0x407
-PRECOMP-NEXT:     Signature: 0x1116980E
-
-PDB-PRECOMP:                              Types (.debug$T)
-PDB-PRECOMP-NEXT: ============================================================
-PDB-PRECOMP-NEXT: Showing 0 records
-PDB-PRECOMP-NEXT: 0x1000 | LF_PRECOMP [size = 60] start index = 0x1000, types count = 0x407, signature = 0x1116980E, precomp path = f:\svn\lld\test\coff\precomp\precomp.obj
-
-PDB-ENDPRECOMP:                     Precompiled Types (.debug$P)
-PDB-ENDPRECOMP-NEXT: ============================================================
-PDB-ENDPRECOMP-NEXT: Showing 0 records
-PDB-ENDPRECOMP:      0x1407 | LF_ENDPRECOMP [size = 8] signature = 0x1116980E
-
-# // precomp.h
-# #pragma once
-# int Function(char A);
-#
-# // precomp.cpp
-# // cl.exe precomp.cpp /Z7 /Ycprecomp.h /c
-# #include "precomp.h"
-#
-# // a.cpp
-# #include "precomp.h"
-# int main(void) {
-#   Function('a');
-#   return 0;
-# }
-#
-# // cl.exe a.cpp /Z7 /Yuprecomp.h /c
-#
-# // obj2yaml precomp.obj >precomp-precomp.yaml
-# // obj2yaml a.obj >precomp-a.yaml
+
+RUN: rm -rf %t1/
+RUN: mkdir %t1
+RUN: obj2yaml %S/Inputs/precomp-a.obj > %t1/precomp-a.yaml
+RUN: obj2yaml %S/Inputs/precomp.obj > %t1/precomp.yaml
+RUN: yaml2obj %t1/precomp-a.yaml -o %t1/a.obj
+RUN: yaml2obj %t1/precomp.yaml -o %t1/precomp.obj
+RUN: llvm-readobj --codeview %t1/a.obj | FileCheck %s -check-prefix PRECOMP
+RUN: llvm-readobj --codeview %t1/precomp.obj | FileCheck %s -check-prefix ENDPRECOMP
+RUN: llvm-pdbutil dump -types %t1/a.obj | FileCheck %s -check-prefix PDB-PRECOMP
+RUN: llvm-pdbutil dump -types %t1/precomp.obj | FileCheck %s -check-prefix PDB-ENDPRECOMP
+
+ENDPRECOMP:      CodeViewTypes [
+ENDPRECOMP-NEXT:   Section: .debug$P (3)
+ENDPRECOMP:        EndPrecomp (0x1407) {
+ENDPRECOMP-NEXT:     TypeLeafKind: LF_ENDPRECOMP (0x14)
+ENDPRECOMP-NEXT:     Signature: 0x1116980E
+ENDPRECOMP-NEXT:   }
+
+PRECOMP:      CodeViewTypes [
+PRECOMP-NEXT:   Section: .debug$T (3)
+PRECOMP-NEXT:   Magic: 0x4
+PRECOMP-NEXT:   Precomp (0x1000) {
+PRECOMP-NEXT:     TypeLeafKind: LF_PRECOMP (0x1509)
+PRECOMP-NEXT:     StartIndex: 0x1000
+PRECOMP-NEXT:     Count: 0x407
+PRECOMP-NEXT:     Signature: 0x1116980E
+
+PDB-PRECOMP:                              Types (.debug$T)
+PDB-PRECOMP-NEXT: ============================================================
+PDB-PRECOMP-NEXT: Showing 0 records
+PDB-PRECOMP-NEXT: 0x1000 | LF_PRECOMP [size = 60] start index = 0x1000, types count = 0x407, signature = 0x1116980E, precomp path = f:\svn\lld\test\coff\precomp\precomp.obj
+
+PDB-ENDPRECOMP:                     Precompiled Types (.debug$P)
+PDB-ENDPRECOMP-NEXT: ============================================================
+PDB-ENDPRECOMP-NEXT: Showing 0 records
+PDB-ENDPRECOMP:      0x1407 | LF_ENDPRECOMP [size = 8] signature = 0x1116980E
+
+# // precomp.h
+# #pragma once
+# int Function(char A);
+#
+# // precomp.cpp
+# // cl.exe precomp.cpp /Z7 /Ycprecomp.h /c
+# #include "precomp.h"
+#
+# // a.cpp
+# #include "precomp.h"
+# int main(void) {
+#   Function('a');
+#   return 0;
+# }
+#
+# // cl.exe a.cpp /Z7 /Yuprecomp.h /c
+#
+# // obj2yaml precomp.obj >precomp-precomp.yaml
+# // obj2yaml a.obj >precomp-a.yaml
diff --git a/llvm/test/DebugInfo/salvage-nonconst-binop.ll b/llvm/test/DebugInfo/salvage-nonconst-binop.ll
index da67fac8d5b9..7b0f9562e5b2 100644
--- a/llvm/test/DebugInfo/salvage-nonconst-binop.ll
+++ b/llvm/test/DebugInfo/salvage-nonconst-binop.ll
@@ -1,45 +1,45 @@
-; RUN: opt %s -passes=dce -S | FileCheck %s
-
-; Tests the salvaging of binary operators that use more than one non-constant
-; SSA value.
-
-; CHECK: call void @llvm.dbg.value(metadata !DIArgList(i32 %a, i32 %b),
-; CHECK-SAME: ![[VAR_C:[0-9]+]],
-; CHECK-SAME: !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value))
-
-; CHECK: ![[VAR_C]] = !DILocalVariable(name: "c"
-
-define i32 @"?multiply@@YAHHH@Z"(i32 %a, i32 %b) !dbg !8 {
-entry:
-  call void @llvm.dbg.value(metadata i32 %b, metadata !12, metadata !DIExpression()), !dbg !13
-  call void @llvm.dbg.value(metadata i32 %a, metadata !14, metadata !DIExpression()), !dbg !13
-  %add = add nsw i32 %a, %b, !dbg !15
-  call void @llvm.dbg.value(metadata i32 %add, metadata !16, metadata !DIExpression()), !dbg !13
-  %mul = mul nsw i32 %a, %b, !dbg !17
-  ret i32 %mul, !dbg !17
-}
-
-declare void @llvm.dbg.value(metadata, metadata, metadata)
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5, !6}
-!llvm.ident = !{!7}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
-!1 = !DIFile(filename: "test.cpp", directory: "/")
-!2 = !{}
-!3 = !{i32 2, !"CodeView", i32 1}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 2}
-!6 = !{i32 7, !"PIC Level", i32 2}
-!7 = !{!"clang version 11.0.0"}
-!8 = distinct !DISubprogram(name: "multiply", linkageName: "?multiply@@YAHHH@Z", scope: !1, file: !1, line: 1, type: !9, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
-!9 = !DISubroutineType(types: !10)
-!10 = !{!11, !11, !11}
-!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-!12 = !DILocalVariable(name: "b", arg: 2, scope: !8, file: !1, line: 1, type: !11)
-!13 = !DILocation(line: 0, scope: !8)
-!14 = !DILocalVariable(name: "a", arg: 1, scope: !8, file: !1, line: 1, type: !11)
-!15 = !DILocation(line: 2, scope: !8)
-!16 = !DILocalVariable(name: "c", scope: !8, file: !1, line: 2, type: !11)
-!17 = !DILocation(line: 3, scope: !8)
+; RUN: opt %s -passes=dce -S | FileCheck %s
+
+; Tests the salvaging of binary operators that use more than one non-constant
+; SSA value.
+
+; CHECK: call void @llvm.dbg.value(metadata !DIArgList(i32 %a, i32 %b),
+; CHECK-SAME: ![[VAR_C:[0-9]+]],
+; CHECK-SAME: !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_stack_value))
+
+; CHECK: ![[VAR_C]] = !DILocalVariable(name: "c"
+
+define i32 @"?multiply@@YAHHH@Z"(i32 %a, i32 %b) !dbg !8 {
+entry:
+  call void @llvm.dbg.value(metadata i32 %b, metadata !12, metadata !DIExpression()), !dbg !13
+  call void @llvm.dbg.value(metadata i32 %a, metadata !14, metadata !DIExpression()), !dbg !13
+  %add = add nsw i32 %a, %b, !dbg !15
+  call void @llvm.dbg.value(metadata i32 %add, metadata !16, metadata !DIExpression()), !dbg !13
+  %mul = mul nsw i32 %a, %b, !dbg !17
+  ret i32 %mul, !dbg !17
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "test.cpp", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"CodeView", i32 1}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!7 = !{!"clang version 11.0.0"}
+!8 = distinct !DISubprogram(name: "multiply", linkageName: "?multiply@@YAHHH@Z", scope: !1, file: !1, line: 1, type: !9, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!9 = !DISubroutineType(types: !10)
+!10 = !{!11, !11, !11}
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !DILocalVariable(name: "b", arg: 2, scope: !8, file: !1, line: 1, type: !11)
+!13 = !DILocation(line: 0, scope: !8)
+!14 = !DILocalVariable(name: "a", arg: 1, scope: !8, file: !1, line: 1, type: !11)
+!15 = !DILocation(line: 2, scope: !8)
+!16 = !DILocalVariable(name: "c", scope: !8, file: !1, line: 2, type: !11)
+!17 = !DILocation(line: 3, scope: !8)
diff --git a/llvm/test/Demangle/ms-options.test b/llvm/test/Demangle/ms-options.test
index 94663a5f9fc4..b0d49d202989 100644
--- a/llvm/test/Demangle/ms-options.test
+++ b/llvm/test/Demangle/ms-options.test
@@ -1,43 +1,43 @@
-; RUN: llvm-undname < %s | FileCheck %s
-; RUN: llvm-undname --no-calling-convention < %s | FileCheck %s --check-prefix=CHECK-NO-CALLING-CONV
-; RUN: llvm-undname --no-return-type < %s | FileCheck %s --check-prefix=CHECK-NO-RETURN
-; RUN: llvm-undname --no-access-specifier < %s | FileCheck %s --check-prefix=CHECK-NO-ACCESS
-; RUN: llvm-undname --no-member-type < %s | FileCheck %s --check-prefix=CHECK-NO-MEMBER-TYPE
-; RUN: llvm-undname --no-variable-type < %s | FileCheck %s --check-prefix=CHECK-NO-VARIABLE-TYPE
-; RUN: llvm-undname --no-calling-convention --no-return-type --no-access-specifier --no-member-type --no-variable-type < %s | FileCheck %s --check-prefix=CHECK-NO-ALL
-
-?func@MyClass@@UEAAHHH@Z
-; CHECK: public: virtual int __cdecl MyClass::func(int, int)
-; CHECK-NO-CALLING-CONV: public: virtual int MyClass::func(int, int)
-; CHECK-NO-RETURN: public: virtual __cdecl MyClass::func(int, int)
-; CHECK-NO-ACCESS: {{^}}virtual int __cdecl MyClass::func(int, int)
-; CHECK-NO-MEMBER-TYPE: public: int __cdecl MyClass::func(int, int)
-; CHECK-NO-VARIABLE-TYPE: public: virtual int __cdecl MyClass::func(int, int)
-; CHECK-NO-ALL: {{^}}MyClass::func(int, int)
-
-?array2d@@3PAY09HA
-; CHECK: int (*array2d)[10]
-; CHECK-NO-CALLING-CONV: int (*array2d)[10]
-; CHECK-NO-RETURN: int (*array2d)[10]
-; CHECK-NO-ACCESS: int (*array2d)[10]
-; CHECK-NO-MEMBER-TYPE: int (*array2d)[10]
-; CHECK-NO-VARIABLE-TYPE: array2d
-; CHECK-NO-ALL: array2d
-
-?a@abc@@3PAY09HA
-; CHECK: int (*abc::a)[10]
-; CHECK-NO-CALLING-CONV: int (*abc::a)[10]
-; CHECK-NO-RETURN: int (*abc::a)[10]
-; CHECK-NO-ACCESS: int (*abc::a)[10]
-; CHECK-NO-MEMBER-TYPE: int (*abc::a)[10]
-; CHECK-NO-VARIABLE-TYPE: abc::a
-; CHECK-NO-ALL: abc::a
-
-?x@@3PEAEEA
-; CHECK: unsigned char *x
-; CHECK-NO-CALLING-CONV: unsigned char *x
-; CHECK-NO-RETURN: unsigned char *x
-; CHECK-NO-ACCESS: unsigned char *x
-; CHECK-NO-MEMBER-TYPE: unsigned char *x
-; CHECK-NO-VARIABLE-TYPE: x
-; CHECK-NO-ALL: x
+; RUN: llvm-undname < %s | FileCheck %s
+; RUN: llvm-undname --no-calling-convention < %s | FileCheck %s --check-prefix=CHECK-NO-CALLING-CONV
+; RUN: llvm-undname --no-return-type < %s | FileCheck %s --check-prefix=CHECK-NO-RETURN
+; RUN: llvm-undname --no-access-specifier < %s | FileCheck %s --check-prefix=CHECK-NO-ACCESS
+; RUN: llvm-undname --no-member-type < %s | FileCheck %s --check-prefix=CHECK-NO-MEMBER-TYPE
+; RUN: llvm-undname --no-variable-type < %s | FileCheck %s --check-prefix=CHECK-NO-VARIABLE-TYPE
+; RUN: llvm-undname --no-calling-convention --no-return-type --no-access-specifier --no-member-type --no-variable-type < %s | FileCheck %s --check-prefix=CHECK-NO-ALL
+
+?func@MyClass@@UEAAHHH@Z
+; CHECK: public: virtual int __cdecl MyClass::func(int, int)
+; CHECK-NO-CALLING-CONV: public: virtual int MyClass::func(int, int)
+; CHECK-NO-RETURN: public: virtual __cdecl MyClass::func(int, int)
+; CHECK-NO-ACCESS: {{^}}virtual int __cdecl MyClass::func(int, int)
+; CHECK-NO-MEMBER-TYPE: public: int __cdecl MyClass::func(int, int)
+; CHECK-NO-VARIABLE-TYPE: public: virtual int __cdecl MyClass::func(int, int)
+; CHECK-NO-ALL: {{^}}MyClass::func(int, int)
+
+?array2d@@3PAY09HA
+; CHECK: int (*array2d)[10]
+; CHECK-NO-CALLING-CONV: int (*array2d)[10]
+; CHECK-NO-RETURN: int (*array2d)[10]
+; CHECK-NO-ACCESS: int (*array2d)[10]
+; CHECK-NO-MEMBER-TYPE: int (*array2d)[10]
+; CHECK-NO-VARIABLE-TYPE: array2d
+; CHECK-NO-ALL: array2d
+
+?a@abc@@3PAY09HA
+; CHECK: int (*abc::a)[10]
+; CHECK-NO-CALLING-CONV: int (*abc::a)[10]
+; CHECK-NO-RETURN: int (*abc::a)[10]
+; CHECK-NO-ACCESS: int (*abc::a)[10]
+; CHECK-NO-MEMBER-TYPE: int (*abc::a)[10]
+; CHECK-NO-VARIABLE-TYPE: abc::a
+; CHECK-NO-ALL: abc::a
+
+?x@@3PEAEEA
+; CHECK: unsigned char *x
+; CHECK-NO-CALLING-CONV: unsigned char *x
+; CHECK-NO-RETURN: unsigned char *x
+; CHECK-NO-ACCESS: unsigned char *x
+; CHECK-NO-MEMBER-TYPE: unsigned char *x
+; CHECK-NO-VARIABLE-TYPE: x
+; CHECK-NO-ALL: x
diff --git a/llvm/test/Instrumentation/BoundsChecking/simple.ll b/llvm/test/Instrumentation/BoundsChecking/simple.ll
index 60d124f0b897..914cafdc57f9 100644
--- a/llvm/test/Instrumentation/BoundsChecking/simple.ll
+++ b/llvm/test/Instrumentation/BoundsChecking/simple.ll
@@ -488,3 +488,69 @@ define <vscale x 1 x i32> @load_scalable_vector(i64 %y) nounwind {
   %3 = load <vscale x 1 x i32>, ptr %2, align 8
   ret <vscale x 1 x i32> %3
 }
+
+define void @scalable_alloca(i64 %y) nounwind {
+; CHECK-LABEL: @scalable_alloca(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 5
+; CHECK-NEXT:    [[TMP4:%.*]] = alloca <vscale x 4 x i16>, i32 5, align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[Y:%.*]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 0, [[DOTIDX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <vscale x 4 x i16>, ptr [[TMP4]], i64 [[Y]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP3]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ult i64 [[TMP3]], [[TMP7]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp slt i64 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = or i1 [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[TRAP:%.*]], label [[TMP17:%.*]]
+; CHECK:       17:
+; CHECK-NEXT:    [[TMP18:%.*]] = load <vscale x 4 x i16>, ptr [[TMP8]], align 4
+; CHECK-NEXT:    ret void
+; CHECK:       trap:
+; CHECK-NEXT:    call void @llvm.trap() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+;
+  %1 = alloca <vscale x 4 x i16>, i32 5
+  %2 = getelementptr inbounds <vscale x 4 x i16>, ptr %1, i64 %y
+  %3 = load <vscale x 4 x i16>, ptr %2, align 4
+  ret void
+}
+
+define void @scalable_alloca2(i64 %y) nounwind {
+; CHECK-LABEL: @scalable_alloca2(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 32
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = alloca <vscale x 4 x i64>, align 32
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 32
+; CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[Y:%.*]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 0, [[DOTIDX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <vscale x 4 x i64>, ptr [[TMP4]], i64 [[Y]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 32
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP3]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ult i64 [[TMP3]], [[TMP7]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp slt i64 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = or i1 [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[TRAP:%.*]], label [[TMP17:%.*]]
+; CHECK:       17:
+; CHECK-NEXT:    [[TMP18:%.*]] = load <vscale x 4 x i64>, ptr [[TMP8]], align 4
+; CHECK-NEXT:    ret void
+; CHECK:       trap:
+; CHECK-NEXT:    call void @llvm.trap() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+;
+  %1 = alloca <vscale x 4 x i64>
+  %2 = getelementptr inbounds <vscale x 4 x i64>, ptr %1, i64 %y
+  %3 = load <vscale x 4 x i64>, ptr %2, align 4
+  ret void
+}
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll
index 8db14222652f..9b64c06ddd55 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
 ; Test alloca instrumentation. Command line includes check-globals so that
 ; changes to debug-info are detectable.
 ;
@@ -14,27 +14,27 @@ target triple = "aarch64--linux-android10000"
 declare void @use32(ptr)
 
 ;.
-; DYNAMIC-SHADOW: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x ptr] [ptr @hwasan.module_ctor], section "llvm.metadata"
-; DYNAMIC-SHADOW: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @hwasan.module_ctor, ptr @hwasan.module_ctor }]
-; DYNAMIC-SHADOW: @[[__START_HWASAN_GLOBALS:[a-zA-Z0-9_$"\\.-]+]] = external hidden constant [0 x i8]
-; DYNAMIC-SHADOW: @[[__STOP_HWASAN_GLOBALS:[a-zA-Z0-9_$"\\.-]+]] = external hidden constant [0 x i8]
-; DYNAMIC-SHADOW: @[[HWASAN_NOTE:[a-zA-Z0-9_$"\\.-]+]] = private constant { i32, i32, i32, [8 x i8], i32, i32 } { i32 8, i32 8, i32 3, [8 x i8] c"LLVM\00\00\00\00", i32 trunc (i64 sub (i64 ptrtoint (ptr @__start_hwasan_globals to i64), i64 ptrtoint (ptr @hwasan.note to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @__stop_hwasan_globals to i64), i64 ptrtoint (ptr @hwasan.note to i64)) to i32) }, section ".note.hwasan.globals", comdat($hwasan.module_ctor), align 4
-; DYNAMIC-SHADOW: @[[HWASAN_DUMMY_GLOBAL:[a-zA-Z0-9_$"\\.-]+]] = private constant [0 x i8] zeroinitializer, section "hwasan_globals", comdat($hwasan.module_ctor), !associated !0
-; DYNAMIC-SHADOW: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [2 x ptr] [ptr @hwasan.note, ptr @hwasan.dummy.global], section "llvm.metadata"
-; DYNAMIC-SHADOW: @[[__HWASAN_SHADOW:[a-zA-Z0-9_$"\\.-]+]] = external global [0 x i8]
+; DYNAMIC-SHADOW: @llvm.used = appending global [1 x ptr] [ptr @hwasan.module_ctor], section "llvm.metadata"
+; DYNAMIC-SHADOW: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @hwasan.module_ctor, ptr @hwasan.module_ctor }]
+; DYNAMIC-SHADOW: @__start_hwasan_globals = external hidden constant [0 x i8]
+; DYNAMIC-SHADOW: @__stop_hwasan_globals = external hidden constant [0 x i8]
+; DYNAMIC-SHADOW: @hwasan.note = private constant { i32, i32, i32, [8 x i8], i32, i32 } { i32 8, i32 8, i32 3, [8 x i8] c"LLVM\00\00\00\00", i32 trunc (i64 sub (i64 ptrtoint (ptr @__start_hwasan_globals to i64), i64 ptrtoint (ptr @hwasan.note to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @__stop_hwasan_globals to i64), i64 ptrtoint (ptr @hwasan.note to i64)) to i32) }, section ".note.hwasan.globals", comdat($hwasan.module_ctor), align 4
+; DYNAMIC-SHADOW: @hwasan.dummy.global = private constant [0 x i8] zeroinitializer, section "hwasan_globals", comdat($hwasan.module_ctor), !associated [[META0:![0-9]+]]
+; DYNAMIC-SHADOW: @llvm.compiler.used = appending global [2 x ptr] [ptr @hwasan.note, ptr @hwasan.dummy.global], section "llvm.metadata"
+; DYNAMIC-SHADOW: @__hwasan_shadow = external global [0 x i8]
 ;.
-; ZERO-BASED-SHADOW: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x ptr] [ptr @hwasan.module_ctor], section "llvm.metadata"
-; ZERO-BASED-SHADOW: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @hwasan.module_ctor, ptr @hwasan.module_ctor }]
-; ZERO-BASED-SHADOW: @[[__START_HWASAN_GLOBALS:[a-zA-Z0-9_$"\\.-]+]] = external hidden constant [0 x i8]
-; ZERO-BASED-SHADOW: @[[__STOP_HWASAN_GLOBALS:[a-zA-Z0-9_$"\\.-]+]] = external hidden constant [0 x i8]
-; ZERO-BASED-SHADOW: @[[HWASAN_NOTE:[a-zA-Z0-9_$"\\.-]+]] = private constant { i32, i32, i32, [8 x i8], i32, i32 } { i32 8, i32 8, i32 3, [8 x i8] c"LLVM\00\00\00\00", i32 trunc (i64 sub (i64 ptrtoint (ptr @__start_hwasan_globals to i64), i64 ptrtoint (ptr @hwasan.note to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @__stop_hwasan_globals to i64), i64 ptrtoint (ptr @hwasan.note to i64)) to i32) }, section ".note.hwasan.globals", comdat($hwasan.module_ctor), align 4
-; ZERO-BASED-SHADOW: @[[HWASAN_DUMMY_GLOBAL:[a-zA-Z0-9_$"\\.-]+]] = private constant [0 x i8] zeroinitializer, section "hwasan_globals", comdat($hwasan.module_ctor), !associated !0
-; ZERO-BASED-SHADOW: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [2 x ptr] [ptr @hwasan.note, ptr @hwasan.dummy.global], section "llvm.metadata"
-; ZERO-BASED-SHADOW: @[[__HWASAN_SHADOW:[a-zA-Z0-9_$"\\.-]+]] = external global [0 x i8]
+; ZERO-BASED-SHADOW: @llvm.used = appending global [1 x ptr] [ptr @hwasan.module_ctor], section "llvm.metadata"
+; ZERO-BASED-SHADOW: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @hwasan.module_ctor, ptr @hwasan.module_ctor }]
+; ZERO-BASED-SHADOW: @__start_hwasan_globals = external hidden constant [0 x i8]
+; ZERO-BASED-SHADOW: @__stop_hwasan_globals = external hidden constant [0 x i8]
+; ZERO-BASED-SHADOW: @hwasan.note = private constant { i32, i32, i32, [8 x i8], i32, i32 } { i32 8, i32 8, i32 3, [8 x i8] c"LLVM\00\00\00\00", i32 trunc (i64 sub (i64 ptrtoint (ptr @__start_hwasan_globals to i64), i64 ptrtoint (ptr @hwasan.note to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @__stop_hwasan_globals to i64), i64 ptrtoint (ptr @hwasan.note to i64)) to i32) }, section ".note.hwasan.globals", comdat($hwasan.module_ctor), align 4
+; ZERO-BASED-SHADOW: @hwasan.dummy.global = private constant [0 x i8] zeroinitializer, section "hwasan_globals", comdat($hwasan.module_ctor), !associated [[META0:![0-9]+]]
+; ZERO-BASED-SHADOW: @llvm.compiler.used = appending global [2 x ptr] [ptr @hwasan.note, ptr @hwasan.dummy.global], section "llvm.metadata"
+; ZERO-BASED-SHADOW: @__hwasan_shadow = external global [0 x i8]
 ;.
 define void @test_alloca() sanitize_hwaddress !dbg !15 {
-; DYNAMIC-SHADOW-LABEL: define void @test_alloca
-; DYNAMIC-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] {
+; DYNAMIC-SHADOW-LABEL: define void @test_alloca(
+; DYNAMIC-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] {
 ; DYNAMIC-SHADOW-NEXT:  entry:
 ; DYNAMIC-SHADOW-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
 ; DYNAMIC-SHADOW-NEXT:    [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -43,23 +43,23 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 {
 ; DYNAMIC-SHADOW-NEXT:    [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]]
 ; DYNAMIC-SHADOW-NEXT:    [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56
 ; DYNAMIC-SHADOW-NEXT:    [[X:%.*]] = alloca { i32, [12 x i8] }, align 16
-; DYNAMIC-SHADOW-NEXT:    call void @llvm.dbg.value(metadata !DIArgList(ptr [[X]], ptr [[X]]), metadata [[META11:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref)), !dbg [[DBG13:![0-9]+]]
-; DYNAMIC-SHADOW-NEXT:    [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG10:![0-9]+]]
-; DYNAMIC-SHADOW-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG10]]
-; DYNAMIC-SHADOW-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG10]]
-; DYNAMIC-SHADOW-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG10]]
-; DYNAMIC-SHADOW-NEXT:    [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG10]]
-; DYNAMIC-SHADOW-NEXT:    [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG10]]
-; DYNAMIC-SHADOW-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG10]]
-; DYNAMIC-SHADOW-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG10]]
-; DYNAMIC-SHADOW-NEXT:    [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG10]]
-; DYNAMIC-SHADOW-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG10]]
-; DYNAMIC-SHADOW-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG10]]
-; DYNAMIC-SHADOW-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG10]]
-; DYNAMIC-SHADOW-NEXT:    store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG10]]
-; DYNAMIC-SHADOW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG10]]
-; DYNAMIC-SHADOW-NEXT:    store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG10]]
-; DYNAMIC-SHADOW-NEXT:    call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG10]]
+; DYNAMIC-SHADOW-NEXT:    tail call void @llvm.dbg.value(metadata !DIArgList(ptr [[X]], ptr [[X]]), metadata [[META10:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref)), !dbg [[DBG12:![0-9]+]]
+; DYNAMIC-SHADOW-NEXT:    [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]]
+; DYNAMIC-SHADOW-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT:    [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT:    [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT:    [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT:    store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT:    store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]]
+; DYNAMIC-SHADOW-NEXT:    call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]]
 ; DYNAMIC-SHADOW-NEXT:    [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]]
 ; DYNAMIC-SHADOW-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]]
 ; DYNAMIC-SHADOW-NEXT:    [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]]
@@ -68,8 +68,8 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 {
 ; DYNAMIC-SHADOW-NEXT:    call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]]
 ; DYNAMIC-SHADOW-NEXT:    ret void, !dbg [[DBG14]]
 ;
-; ZERO-BASED-SHADOW-LABEL: define void @test_alloca
-; ZERO-BASED-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] {
+; ZERO-BASED-SHADOW-LABEL: define void @test_alloca(
+; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] {
 ; ZERO-BASED-SHADOW-NEXT:  entry:
 ; ZERO-BASED-SHADOW-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null)
 ; ZERO-BASED-SHADOW-NEXT:    [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -78,23 +78,23 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 {
 ; ZERO-BASED-SHADOW-NEXT:    [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]]
 ; ZERO-BASED-SHADOW-NEXT:    [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56
 ; ZERO-BASED-SHADOW-NEXT:    [[X:%.*]] = alloca { i32, [12 x i8] }, align 16
-; ZERO-BASED-SHADOW-NEXT:    call void @llvm.dbg.value(metadata !DIArgList(ptr [[X]], ptr [[X]]), metadata [[META11:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref)), !dbg [[DBG13:![0-9]+]]
-; ZERO-BASED-SHADOW-NEXT:    [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG10:![0-9]+]]
-; ZERO-BASED-SHADOW-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG10]]
-; ZERO-BASED-SHADOW-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG10]]
-; ZERO-BASED-SHADOW-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG10]]
-; ZERO-BASED-SHADOW-NEXT:    [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG10]]
-; ZERO-BASED-SHADOW-NEXT:    [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG10]]
-; ZERO-BASED-SHADOW-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG10]]
-; ZERO-BASED-SHADOW-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG10]]
-; ZERO-BASED-SHADOW-NEXT:    [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG10]]
-; ZERO-BASED-SHADOW-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG10]]
-; ZERO-BASED-SHADOW-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG10]]
-; ZERO-BASED-SHADOW-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG10]]
-; ZERO-BASED-SHADOW-NEXT:    store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG10]]
-; ZERO-BASED-SHADOW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG10]]
-; ZERO-BASED-SHADOW-NEXT:    store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG10]]
-; ZERO-BASED-SHADOW-NEXT:    call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG10]]
+; ZERO-BASED-SHADOW-NEXT:    tail call void @llvm.dbg.value(metadata !DIArgList(ptr [[X]], ptr [[X]]), metadata [[META10:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref)), !dbg [[DBG12:![0-9]+]]
+; ZERO-BASED-SHADOW-NEXT:    [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]]
+; ZERO-BASED-SHADOW-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT:    [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT:    [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT:    [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT:    store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT:    store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]]
+; ZERO-BASED-SHADOW-NEXT:    call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]]
 ; ZERO-BASED-SHADOW-NEXT:    [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]]
 ; ZERO-BASED-SHADOW-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]]
 ; ZERO-BASED-SHADOW-NEXT:    [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]]
@@ -110,6 +110,24 @@ entry:
   ret void, !dbg !24
 }
 
+define void @test_vscale_alloca() sanitize_hwaddress {
+; DYNAMIC-SHADOW-LABEL: define void @test_vscale_alloca(
+; DYNAMIC-SHADOW-SAME: ) #[[ATTR0]] {
+; DYNAMIC-SHADOW-NEXT:    [[X:%.*]] = alloca <vscale x 4 x i64>, align 32
+; DYNAMIC-SHADOW-NEXT:    call void @use32(ptr nonnull [[X]])
+; DYNAMIC-SHADOW-NEXT:    ret void
+;
+; ZERO-BASED-SHADOW-LABEL: define void @test_vscale_alloca(
+; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0]] {
+; ZERO-BASED-SHADOW-NEXT:    [[X:%.*]] = alloca <vscale x 4 x i64>, align 32
+; ZERO-BASED-SHADOW-NEXT:    call void @use32(ptr nonnull [[X]])
+; ZERO-BASED-SHADOW-NEXT:    ret void
+;
+  %x = alloca <vscale x 4 x i64>
+  call void @use32(ptr nonnull %x)
+  ret void
+}
+
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
@@ -144,35 +162,35 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 ; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
 ; ZERO-BASED-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
 ;.
-; DYNAMIC-SHADOW: [[META0:![0-9]+]] = !{ptr @hwasan.note}
-; DYNAMIC-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !2, producer: "clang version 13.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !3, splitDebugInlining: false, nameTableKind: None)
-; DYNAMIC-SHADOW: [[META2:![0-9]+]] = !DIFile(filename: "alloca.cpp", directory: "/")
-; DYNAMIC-SHADOW: [[META3:![0-9]+]] = !{}
+; DYNAMIC-SHADOW: [[META0]] = !{ptr @hwasan.note}
+; DYNAMIC-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
+; DYNAMIC-SHADOW: [[META2]] = !DIFile(filename: "alloca.cpp", directory: {{.*}})
+; DYNAMIC-SHADOW: [[META3]] = !{}
 ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4}
 ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
-; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"clang version 13.0.0"}
-; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: !2, file: !2, line: 4, type: !8, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !3)
-; DYNAMIC-SHADOW: [[META8:![0-9]+]] = !DISubroutineType(types: !9)
-; DYNAMIC-SHADOW: [[META9:![0-9]+]] = !{null}
-; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: !7, file: !2, line: 5, type: ![[META12:[0-9]+]])
-; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 0, scope: !7)
-; DYNAMIC-SHADOW: [[DBG10]] = !DILocation(line: 7, column: 5, scope: !7)
-; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: !7)
+; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]])
+; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]])
+; DYNAMIC-SHADOW: [[META9]] = !{null}
+; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]])
+; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+; DYNAMIC-SHADOW: [[DBG12]] = !DILocation(line: 0, scope: [[DBG7]])
+; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]])
+; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]])
 ;.
-; ZERO-BASED-SHADOW: [[META0:![0-9]+]] = !{ptr @hwasan.note}
-; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !2, producer: "clang version 13.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !3, splitDebugInlining: false, nameTableKind: None)
-; ZERO-BASED-SHADOW: [[META2:![0-9]+]] = !DIFile(filename: "alloca.cpp", directory: "/")
-; ZERO-BASED-SHADOW: [[META3:![0-9]+]] = !{}
+; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note}
+; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
+; ZERO-BASED-SHADOW: [[META2]] = !DIFile(filename: "alloca.cpp", directory: {{.*}})
+; ZERO-BASED-SHADOW: [[META3]] = !{}
 ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4}
 ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
-; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"clang version 13.0.0"}
-; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: !2, file: !2, line: 4, type: !8, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !3)
-; ZERO-BASED-SHADOW: [[META8:![0-9]+]] = !DISubroutineType(types: !9)
-; ZERO-BASED-SHADOW: [[META9:![0-9]+]] = !{null}
-; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: !7, file: !2, line: 5, type: ![[META12:[0-9]+]])
-; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 0, scope: !7)
-; ZERO-BASED-SHADOW: [[DBG10]] = !DILocation(line: 7, column: 5, scope: !7)
-; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: !7)
+; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]])
+; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]])
+; ZERO-BASED-SHADOW: [[META9]] = !{null}
+; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]])
+; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+; ZERO-BASED-SHADOW: [[DBG12]] = !DILocation(line: 0, scope: [[DBG7]])
+; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]])
+; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]])
 ;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll b/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll
index feb8a27fd541..2e4bfb76cc9d 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll
@@ -671,17 +671,17 @@ define void @store.nxv1i32(ptr %p) sanitize_memory {
 ; ORIGINS-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP6]], 0
 ; ORIGINS-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF0:![0-9]+]]
 ; ORIGINS:       7:
-; ORIGINS-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
-; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], 4
-; ORIGINS-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], 3
-; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i32 [[TMP10]], 4
+; ORIGINS-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; ORIGINS-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], 3
+; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i64 [[TMP10]], 4
 ; ORIGINS-NEXT:    br label [[DOTSPLIT:%.*]]
 ; ORIGINS:       .split:
-; ORIGINS-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
-; ORIGINS-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i32 [[IV]]
+; ORIGINS-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
+; ORIGINS-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i64 [[IV]]
 ; ORIGINS-NEXT:    store i32 0, ptr [[TMP12]], align 4
-; ORIGINS-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; ORIGINS-NEXT:    [[IV_CHECK:%.*]] = icmp eq i32 [[IV_NEXT]], [[TMP11]]
+; ORIGINS-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; ORIGINS-NEXT:    [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP11]]
 ; ORIGINS-NEXT:    br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]]
 ; ORIGINS:       .split.split:
 ; ORIGINS-NEXT:    br label [[TMP13]]
@@ -731,17 +731,17 @@ define void @store.nxv2i32(ptr %p) sanitize_memory {
 ; ORIGINS-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP6]], 0
 ; ORIGINS-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF0]]
 ; ORIGINS:       7:
-; ORIGINS-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
-; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], 8
-; ORIGINS-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], 3
-; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i32 [[TMP10]], 4
+; ORIGINS-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; ORIGINS-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], 3
+; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i64 [[TMP10]], 4
 ; ORIGINS-NEXT:    br label [[DOTSPLIT:%.*]]
 ; ORIGINS:       .split:
-; ORIGINS-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
-; ORIGINS-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i32 [[IV]]
+; ORIGINS-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
+; ORIGINS-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i64 [[IV]]
 ; ORIGINS-NEXT:    store i32 0, ptr [[TMP12]], align 4
-; ORIGINS-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; ORIGINS-NEXT:    [[IV_CHECK:%.*]] = icmp eq i32 [[IV_NEXT]], [[TMP11]]
+; ORIGINS-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; ORIGINS-NEXT:    [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP11]]
 ; ORIGINS-NEXT:    br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]]
 ; ORIGINS:       .split.split:
 ; ORIGINS-NEXT:    br label [[TMP13]]
@@ -791,17 +791,17 @@ define void @store.nxv4i32(ptr %p) sanitize_memory {
 ; ORIGINS-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP6]], 0
 ; ORIGINS-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF0]]
 ; ORIGINS:       7:
-; ORIGINS-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
-; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], 16
-; ORIGINS-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], 3
-; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i32 [[TMP10]], 4
+; ORIGINS-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; ORIGINS-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], 3
+; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i64 [[TMP10]], 4
 ; ORIGINS-NEXT:    br label [[DOTSPLIT:%.*]]
 ; ORIGINS:       .split:
-; ORIGINS-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
-; ORIGINS-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i32 [[IV]]
+; ORIGINS-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
+; ORIGINS-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i64 [[IV]]
 ; ORIGINS-NEXT:    store i32 0, ptr [[TMP12]], align 4
-; ORIGINS-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; ORIGINS-NEXT:    [[IV_CHECK:%.*]] = icmp eq i32 [[IV_NEXT]], [[TMP11]]
+; ORIGINS-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; ORIGINS-NEXT:    [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP11]]
 ; ORIGINS-NEXT:    br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]]
 ; ORIGINS:       .split.split:
 ; ORIGINS-NEXT:    br label [[TMP13]]
@@ -851,17 +851,17 @@ define void @store.nxv8i32(ptr %p) sanitize_memory {
 ; ORIGINS-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP6]], 0
 ; ORIGINS-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF0]]
 ; ORIGINS:       7:
-; ORIGINS-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
-; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], 32
-; ORIGINS-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], 3
-; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i32 [[TMP10]], 4
+; ORIGINS-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 32
+; ORIGINS-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], 3
+; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i64 [[TMP10]], 4
 ; ORIGINS-NEXT:    br label [[DOTSPLIT:%.*]]
 ; ORIGINS:       .split:
-; ORIGINS-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
-; ORIGINS-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i32 [[IV]]
+; ORIGINS-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
+; ORIGINS-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i64 [[IV]]
 ; ORIGINS-NEXT:    store i32 0, ptr [[TMP12]], align 4
-; ORIGINS-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; ORIGINS-NEXT:    [[IV_CHECK:%.*]] = icmp eq i32 [[IV_NEXT]], [[TMP11]]
+; ORIGINS-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; ORIGINS-NEXT:    [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP11]]
 ; ORIGINS-NEXT:    br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]]
 ; ORIGINS:       .split.split:
 ; ORIGINS-NEXT:    br label [[TMP13]]
@@ -911,17 +911,17 @@ define void @store.nxv16i32(ptr %p) sanitize_memory {
 ; ORIGINS-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP6]], 0
 ; ORIGINS-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF0]]
 ; ORIGINS:       7:
-; ORIGINS-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
-; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], 64
-; ORIGINS-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], 3
-; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i32 [[TMP10]], 4
+; ORIGINS-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 64
+; ORIGINS-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], 3
+; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i64 [[TMP10]], 4
 ; ORIGINS-NEXT:    br label [[DOTSPLIT:%.*]]
 ; ORIGINS:       .split:
-; ORIGINS-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
-; ORIGINS-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i32 [[IV]]
+; ORIGINS-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
+; ORIGINS-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i64 [[IV]]
 ; ORIGINS-NEXT:    store i32 0, ptr [[TMP12]], align 4
-; ORIGINS-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; ORIGINS-NEXT:    [[IV_CHECK:%.*]] = icmp eq i32 [[IV_NEXT]], [[TMP11]]
+; ORIGINS-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; ORIGINS-NEXT:    [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP11]]
 ; ORIGINS-NEXT:    br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]]
 ; ORIGINS:       .split.split:
 ; ORIGINS-NEXT:    br label [[TMP13]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vscale.ll b/llvm/test/Instrumentation/MemorySanitizer/vscale.ll
index e1a4a9b7aa68..36ca9e516f2d 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vscale.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vscale.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -S -msan-check-access-address=0 -passes="msan" 2>&1 | FileCheck %s
+; RUN: opt < %s -S -msan-check-access-address=0 -passes="msan" -msan-track-origins=2 2>&1 | FileCheck %s --check-prefixes=ORIGIN
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -20,6 +21,46 @@ define void @test_load_store_i32(ptr %a, ptr %b) sanitize_memory {
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP1]], ptr [[B]], align 16
 ; CHECK-NEXT:    ret void
 ;
+; ORIGIN-LABEL: define void @test_load_store_i32(
+; ORIGIN-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; ORIGIN-NEXT:    call void @llvm.donothing()
+; ORIGIN-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[A]], align 16
+; ORIGIN-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; ORIGIN-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; ORIGIN-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; ORIGIN-NEXT:    [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416
+; ORIGIN-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; ORIGIN-NEXT:    [[_MSLD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP4]], align 16
+; ORIGIN-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 16
+; ORIGIN-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[B]] to i64
+; ORIGIN-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
+; ORIGIN-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
+; ORIGIN-NEXT:    [[TMP11:%.*]] = add i64 [[TMP9]], 17592186044416
+; ORIGIN-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; ORIGIN-NEXT:    store <vscale x 4 x i32> [[_MSLD]], ptr [[TMP10]], align 16
+; ORIGIN-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[_MSLD]])
+; ORIGIN-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP13]], 0
+; ORIGIN-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP21:%.*]], !prof [[PROF0:![0-9]+]]
+; ORIGIN:       14:
+; ORIGIN-NEXT:    [[TMP15:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP7]])
+; ORIGIN-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; ORIGIN-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 16
+; ORIGIN-NEXT:    [[TMP18:%.*]] = add i64 [[TMP17]], 3
+; ORIGIN-NEXT:    [[TMP19:%.*]] = udiv i64 [[TMP18]], 4
+; ORIGIN-NEXT:    br label [[DOTSPLIT:%.*]]
+; ORIGIN:       .split:
+; ORIGIN-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP14]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
+; ORIGIN-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP12]], i64 [[IV]]
+; ORIGIN-NEXT:    store i32 [[TMP15]], ptr [[TMP20]], align 4
+; ORIGIN-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; ORIGIN-NEXT:    [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP19]]
+; ORIGIN-NEXT:    br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]]
+; ORIGIN:       .split.split:
+; ORIGIN-NEXT:    br label [[TMP21]]
+; ORIGIN:       21:
+; ORIGIN-NEXT:    store <vscale x 4 x i32> [[TMP1]], ptr [[B]], align 16
+; ORIGIN-NEXT:    ret void
+;
   %1 = load <vscale x 4 x i32>, ptr %a
   store <vscale x 4 x i32> %1, ptr %b
   ret void
@@ -48,6 +89,59 @@ define void @test_load_store_add_int(ptr %a, ptr %b) sanitize_memory {
 ; CHECK-NEXT:    store <vscale x 8 x i64> [[TMP5]], ptr [[B]], align 64
 ; CHECK-NEXT:    ret void
 ;
+; ORIGIN-LABEL: define void @test_load_store_add_int(
+; ORIGIN-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; ORIGIN-NEXT:    call void @llvm.donothing()
+; ORIGIN-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x i64>, ptr [[A]], align 64
+; ORIGIN-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; ORIGIN-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; ORIGIN-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; ORIGIN-NEXT:    [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416
+; ORIGIN-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; ORIGIN-NEXT:    [[_MSLD:%.*]] = load <vscale x 8 x i64>, ptr [[TMP4]], align 64
+; ORIGIN-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 64
+; ORIGIN-NEXT:    [[TMP8:%.*]] = load <vscale x 8 x i64>, ptr [[B]], align 64
+; ORIGIN-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B]] to i64
+; ORIGIN-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 87960930222080
+; ORIGIN-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+; ORIGIN-NEXT:    [[TMP12:%.*]] = add i64 [[TMP10]], 17592186044416
+; ORIGIN-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; ORIGIN-NEXT:    [[_MSLD1:%.*]] = load <vscale x 8 x i64>, ptr [[TMP11]], align 64
+; ORIGIN-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 64
+; ORIGIN-NEXT:    [[_MSPROP:%.*]] = or <vscale x 8 x i64> [[_MSLD]], [[_MSLD1]]
+; ORIGIN-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vector.reduce.or.nxv8i64(<vscale x 8 x i64> [[_MSLD1]])
+; ORIGIN-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP15]], 0
+; ORIGIN-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP7]]
+; ORIGIN-NEXT:    [[TMP18:%.*]] = add <vscale x 8 x i64> [[TMP1]], [[TMP8]]
+; ORIGIN-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[B]] to i64
+; ORIGIN-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 87960930222080
+; ORIGIN-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+; ORIGIN-NEXT:    [[TMP22:%.*]] = add i64 [[TMP20]], 17592186044416
+; ORIGIN-NEXT:    [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr
+; ORIGIN-NEXT:    store <vscale x 8 x i64> [[_MSLD1]], ptr [[TMP21]], align 64
+; ORIGIN-NEXT:    [[TMP24:%.*]] = call i64 @llvm.vector.reduce.or.nxv8i64(<vscale x 8 x i64> [[_MSLD1]])
+; ORIGIN-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP24]], 0
+; ORIGIN-NEXT:    br i1 [[_MSCMP]], label [[TMP25:%.*]], label [[TMP32:%.*]], !prof [[PROF0]]
+; ORIGIN:       25:
+; ORIGIN-NEXT:    [[TMP26:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP14]])
+; ORIGIN-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; ORIGIN-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 64
+; ORIGIN-NEXT:    [[TMP29:%.*]] = add i64 [[TMP28]], 3
+; ORIGIN-NEXT:    [[TMP30:%.*]] = udiv i64 [[TMP29]], 4
+; ORIGIN-NEXT:    br label [[DOTSPLIT:%.*]]
+; ORIGIN:       .split:
+; ORIGIN-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP25]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
+; ORIGIN-NEXT:    [[TMP31:%.*]] = getelementptr i32, ptr [[TMP23]], i64 [[IV]]
+; ORIGIN-NEXT:    store i32 [[TMP26]], ptr [[TMP31]], align 4
+; ORIGIN-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; ORIGIN-NEXT:    [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP30]]
+; ORIGIN-NEXT:    br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]]
+; ORIGIN:       .split.split:
+; ORIGIN-NEXT:    br label [[TMP32]]
+; ORIGIN:       32:
+; ORIGIN-NEXT:    store <vscale x 8 x i64> [[TMP8]], ptr [[B]], align 64
+; ORIGIN-NEXT:    ret void
+;
   %1 = load <vscale x 8 x i64>, ptr %a
   %2 = load <vscale x 8 x i64>, ptr %b
   %3 = add <vscale x 8 x i64> %1, %2
@@ -71,6 +165,46 @@ define void @test_load_store_float(ptr %a, ptr %b) sanitize_memory {
 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP1]], ptr [[B]], align 16
 ; CHECK-NEXT:    ret void
 ;
+; ORIGIN-LABEL: define void @test_load_store_float(
+; ORIGIN-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; ORIGIN-NEXT:    call void @llvm.donothing()
+; ORIGIN-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x float>, ptr [[A]], align 16
+; ORIGIN-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; ORIGIN-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; ORIGIN-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; ORIGIN-NEXT:    [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416
+; ORIGIN-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; ORIGIN-NEXT:    [[_MSLD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP4]], align 16
+; ORIGIN-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 16
+; ORIGIN-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[B]] to i64
+; ORIGIN-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
+; ORIGIN-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
+; ORIGIN-NEXT:    [[TMP11:%.*]] = add i64 [[TMP9]], 17592186044416
+; ORIGIN-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; ORIGIN-NEXT:    store <vscale x 4 x i32> [[_MSLD]], ptr [[TMP10]], align 16
+; ORIGIN-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[_MSLD]])
+; ORIGIN-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP13]], 0
+; ORIGIN-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP21:%.*]], !prof [[PROF0]]
+; ORIGIN:       14:
+; ORIGIN-NEXT:    [[TMP15:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP7]])
+; ORIGIN-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; ORIGIN-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 16
+; ORIGIN-NEXT:    [[TMP18:%.*]] = add i64 [[TMP17]], 3
+; ORIGIN-NEXT:    [[TMP19:%.*]] = udiv i64 [[TMP18]], 4
+; ORIGIN-NEXT:    br label [[DOTSPLIT:%.*]]
+; ORIGIN:       .split:
+; ORIGIN-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP14]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
+; ORIGIN-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP12]], i64 [[IV]]
+; ORIGIN-NEXT:    store i32 [[TMP15]], ptr [[TMP20]], align 4
+; ORIGIN-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; ORIGIN-NEXT:    [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP19]]
+; ORIGIN-NEXT:    br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]]
+; ORIGIN:       .split.split:
+; ORIGIN-NEXT:    br label [[TMP21]]
+; ORIGIN:       21:
+; ORIGIN-NEXT:    store <vscale x 4 x float> [[TMP1]], ptr [[B]], align 16
+; ORIGIN-NEXT:    ret void
+;
   %1 = load <vscale x 4 x float>, ptr %a
   store <vscale x 4 x float> %1, ptr %b
   ret void
@@ -99,6 +233,59 @@ define void @test_load_store_add_float(ptr %a, ptr %b) sanitize_memory {
 ; CHECK-NEXT:    store <vscale x 2 x float> [[TMP5]], ptr [[B]], align 8
 ; CHECK-NEXT:    ret void
 ;
+; ORIGIN-LABEL: define void @test_load_store_add_float(
+; ORIGIN-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; ORIGIN-NEXT:    call void @llvm.donothing()
+; ORIGIN-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x float>, ptr [[A]], align 8
+; ORIGIN-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; ORIGIN-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; ORIGIN-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; ORIGIN-NEXT:    [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416
+; ORIGIN-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; ORIGIN-NEXT:    [[_MSLD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP4]], align 8
+; ORIGIN-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 8
+; ORIGIN-NEXT:    [[TMP8:%.*]] = load <vscale x 2 x float>, ptr [[B]], align 8
+; ORIGIN-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[B]] to i64
+; ORIGIN-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 87960930222080
+; ORIGIN-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+; ORIGIN-NEXT:    [[TMP12:%.*]] = add i64 [[TMP10]], 17592186044416
+; ORIGIN-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; ORIGIN-NEXT:    [[_MSLD1:%.*]] = load <vscale x 2 x i32>, ptr [[TMP11]], align 8
+; ORIGIN-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 8
+; ORIGIN-NEXT:    [[_MSPROP:%.*]] = or <vscale x 2 x i32> [[_MSLD]], [[_MSLD1]]
+; ORIGIN-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> [[_MSLD1]])
+; ORIGIN-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; ORIGIN-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP7]]
+; ORIGIN-NEXT:    [[TMP18:%.*]] = fadd <vscale x 2 x float> [[TMP1]], [[TMP8]]
+; ORIGIN-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[B]] to i64
+; ORIGIN-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 87960930222080
+; ORIGIN-NEXT:    [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
+; ORIGIN-NEXT:    [[TMP22:%.*]] = add i64 [[TMP20]], 17592186044416
+; ORIGIN-NEXT:    [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr
+; ORIGIN-NEXT:    store <vscale x 2 x i32> [[_MSLD1]], ptr [[TMP21]], align 8
+; ORIGIN-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> [[_MSLD1]])
+; ORIGIN-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP24]], 0
+; ORIGIN-NEXT:    br i1 [[_MSCMP]], label [[TMP25:%.*]], label [[TMP32:%.*]], !prof [[PROF0]]
+; ORIGIN:       25:
+; ORIGIN-NEXT:    [[TMP26:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP14]])
+; ORIGIN-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; ORIGIN-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; ORIGIN-NEXT:    [[TMP29:%.*]] = add i64 [[TMP28]], 3
+; ORIGIN-NEXT:    [[TMP30:%.*]] = udiv i64 [[TMP29]], 4
+; ORIGIN-NEXT:    br label [[DOTSPLIT:%.*]]
+; ORIGIN:       .split:
+; ORIGIN-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP25]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
+; ORIGIN-NEXT:    [[TMP31:%.*]] = getelementptr i32, ptr [[TMP23]], i64 [[IV]]
+; ORIGIN-NEXT:    store i32 [[TMP26]], ptr [[TMP31]], align 4
+; ORIGIN-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; ORIGIN-NEXT:    [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP30]]
+; ORIGIN-NEXT:    br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]]
+; ORIGIN:       .split.split:
+; ORIGIN-NEXT:    br label [[TMP32]]
+; ORIGIN:       32:
+; ORIGIN-NEXT:    store <vscale x 2 x float> [[TMP8]], ptr [[B]], align 8
+; ORIGIN-NEXT:    ret void
+;
   %1 = load <vscale x 2 x float>, ptr %a
   %2 = load <vscale x 2 x float>, ptr %b
   %3 = fadd <vscale x 2 x float> %1, %2
@@ -118,6 +305,21 @@ define <vscale x 2 x float> @fn_ret(ptr %a) sanitize_memory {
 ; CHECK-NEXT:    store <vscale x 2 x i32> [[_MSLD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <vscale x 2 x float> [[TMP1]]
 ;
+; ORIGIN-LABEL: define <vscale x 2 x float> @fn_ret(
+; ORIGIN-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; ORIGIN-NEXT:    call void @llvm.donothing()
+; ORIGIN-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x float>, ptr [[A]], align 8
+; ORIGIN-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; ORIGIN-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; ORIGIN-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; ORIGIN-NEXT:    [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416
+; ORIGIN-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; ORIGIN-NEXT:    [[_MSLD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP4]], align 8
+; ORIGIN-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 8
+; ORIGIN-NEXT:    store <vscale x 2 x i32> [[_MSLD]], ptr @__msan_retval_tls, align 8
+; ORIGIN-NEXT:    store i32 [[TMP7]], ptr @__msan_retval_origin_tls, align 4
+; ORIGIN-NEXT:    ret <vscale x 2 x float> [[TMP1]]
+;
   %1 = load <vscale x 2 x float>, ptr %a
   ret <vscale x 2 x float> %1
 }
@@ -138,6 +340,46 @@ define void @test_ret(ptr %a, ptr %b) sanitize_memory {
 ; CHECK-NEXT:    store <vscale x 2 x float> [[TMP5]], ptr [[B]], align 8
 ; CHECK-NEXT:    ret void
 ;
+; ORIGIN-LABEL: define void @test_ret(
+; ORIGIN-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; ORIGIN-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; ORIGIN-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; ORIGIN-NEXT:    call void @llvm.donothing()
+; ORIGIN-NEXT:    store i64 [[TMP1]], ptr @__msan_param_tls, align 8
+; ORIGIN-NEXT:    store i32 [[TMP2]], ptr @__msan_param_origin_tls, align 4
+; ORIGIN-NEXT:    store <vscale x 2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; ORIGIN-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x float> @fn_ret(ptr [[A]])
+; ORIGIN-NEXT:    [[_MSRET:%.*]] = load <vscale x 2 x i32>, ptr @__msan_retval_tls, align 8
+; ORIGIN-NEXT:    [[TMP4:%.*]] = load i32, ptr @__msan_retval_origin_tls, align 4
+; ORIGIN-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[B]] to i64
+; ORIGIN-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; ORIGIN-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; ORIGIN-NEXT:    [[TMP8:%.*]] = add i64 [[TMP6]], 17592186044416
+; ORIGIN-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; ORIGIN-NEXT:    store <vscale x 2 x i32> [[_MSRET]], ptr [[TMP7]], align 8
+; ORIGIN-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> [[_MSRET]])
+; ORIGIN-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP10]], 0
+; ORIGIN-NEXT:    br i1 [[_MSCMP]], label [[TMP11:%.*]], label [[TMP18:%.*]], !prof [[PROF0]]
+; ORIGIN:       11:
+; ORIGIN-NEXT:    [[TMP12:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP4]])
+; ORIGIN-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; ORIGIN-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 8
+; ORIGIN-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], 3
+; ORIGIN-NEXT:    [[TMP16:%.*]] = udiv i64 [[TMP15]], 4
+; ORIGIN-NEXT:    br label [[DOTSPLIT:%.*]]
+; ORIGIN:       .split:
+; ORIGIN-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP11]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
+; ORIGIN-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP9]], i64 [[IV]]
+; ORIGIN-NEXT:    store i32 [[TMP12]], ptr [[TMP17]], align 4
+; ORIGIN-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; ORIGIN-NEXT:    [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP16]]
+; ORIGIN-NEXT:    br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]]
+; ORIGIN:       .split.split:
+; ORIGIN-NEXT:    br label [[TMP18]]
+; ORIGIN:       18:
+; ORIGIN-NEXT:    store <vscale x 2 x float> [[TMP3]], ptr [[B]], align 8
+; ORIGIN-NEXT:    ret void
+;
   %1 = call <vscale x 2 x float> @fn_ret(ptr %a)
   store <vscale x 2 x float> %1, ptr %b
   ret void
@@ -154,6 +396,38 @@ define void @fn_param(<vscale x 2 x float> %a, ptr %b) sanitize_memory {
 ; CHECK-NEXT:    store <vscale x 2 x float> [[A]], ptr [[B]], align 8
 ; CHECK-NEXT:    ret void
 ;
+; ORIGIN-LABEL: define void @fn_param(
+; ORIGIN-SAME: <vscale x 2 x float> [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; ORIGIN-NEXT:    call void @llvm.donothing()
+; ORIGIN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+; ORIGIN-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
+; ORIGIN-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; ORIGIN-NEXT:    [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416
+; ORIGIN-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; ORIGIN-NEXT:    store <vscale x 2 x i32> zeroinitializer, ptr [[TMP3]], align 8
+; ORIGIN-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> zeroinitializer)
+; ORIGIN-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP6]], 0
+; ORIGIN-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP14:%.*]], !prof [[PROF0]]
+; ORIGIN:       7:
+; ORIGIN-NEXT:    [[TMP8:%.*]] = call i32 @__msan_chain_origin(i32 0)
+; ORIGIN-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; ORIGIN-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; ORIGIN-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 3
+; ORIGIN-NEXT:    [[TMP12:%.*]] = udiv i64 [[TMP11]], 4
+; ORIGIN-NEXT:    br label [[DOTSPLIT:%.*]]
+; ORIGIN:       .split:
+; ORIGIN-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[DOTSPLIT]] ]
+; ORIGIN-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP5]], i64 [[IV]]
+; ORIGIN-NEXT:    store i32 [[TMP8]], ptr [[TMP13]], align 4
+; ORIGIN-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; ORIGIN-NEXT:    [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP12]]
+; ORIGIN-NEXT:    br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]]
+; ORIGIN:       .split.split:
+; ORIGIN-NEXT:    br label [[TMP14]]
+; ORIGIN:       14:
+; ORIGIN-NEXT:    store <vscale x 2 x float> [[A]], ptr [[B]], align 8
+; ORIGIN-NEXT:    ret void
+;
   store <vscale x 2 x float> %a, ptr %b
   ret void
 }
@@ -173,16 +447,150 @@ define void @test_param(ptr %a, ptr %b) sanitize_memory {
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP6]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]]
 ; CHECK:       7:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5:[0-9]+]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       8:
 ; CHECK-NEXT:    call void @fn_param(<vscale x 2 x float> [[TMP2]], ptr [[B]])
 ; CHECK-NEXT:    ret void
 ;
+; ORIGIN-LABEL: define void @test_param(
+; ORIGIN-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; ORIGIN-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; ORIGIN-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; ORIGIN-NEXT:    call void @llvm.donothing()
+; ORIGIN-NEXT:    [[TMP3:%.*]] = load <vscale x 2 x float>, ptr [[A]], align 8
+; ORIGIN-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
+; ORIGIN-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080
+; ORIGIN-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; ORIGIN-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], 17592186044416
+; ORIGIN-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; ORIGIN-NEXT:    [[_MSLD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP6]], align 8
+; ORIGIN-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 8
+; ORIGIN-NEXT:    store i64 [[TMP1]], ptr @__msan_param_tls, align 8
+; ORIGIN-NEXT:    store i32 [[TMP2]], ptr @__msan_param_origin_tls, align 4
+; ORIGIN-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> [[_MSLD]])
+; ORIGIN-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP10]], 0
+; ORIGIN-NEXT:    br i1 [[_MSCMP]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; ORIGIN:       11:
+; ORIGIN-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP9]]) #[[ATTR5:[0-9]+]]
+; ORIGIN-NEXT:    unreachable
+; ORIGIN:       12:
+; ORIGIN-NEXT:    call void @fn_param(<vscale x 2 x float> [[TMP3]], ptr [[B]])
+; ORIGIN-NEXT:    ret void
+;
   %1 = load <vscale x 2 x float>, ptr %a
   call void @fn_param(<vscale x 2 x float> %1, ptr %b)
   ret void
 }
+
+define void @test_alloca1() sanitize_memory {
+; CHECK-LABEL: define void @test_alloca1(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[X:%.*]] = alloca <vscale x 64 x i1>, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP4]], i8 -1, i64 [[TMP1]], i1 false)
+; CHECK-NEXT:    ret void
+;
+; ORIGIN-LABEL: define void @test_alloca1(
+; ORIGIN-SAME: ) #[[ATTR0]] {
+; ORIGIN-NEXT:  entry:
+; ORIGIN-NEXT:    call void @llvm.donothing()
+; ORIGIN-NEXT:    [[X:%.*]] = alloca <vscale x 64 x i1>, align 4
+; ORIGIN-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; ORIGIN-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; ORIGIN-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[X]] to i64
+; ORIGIN-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; ORIGIN-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; ORIGIN-NEXT:    [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416
+; ORIGIN-NEXT:    [[TMP6:%.*]] = and i64 [[TMP5]], -4
+; ORIGIN-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; ORIGIN-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP4]], i8 -1, i64 [[TMP1]], i1 false)
+; ORIGIN-NEXT:    call void @__msan_set_alloca_origin_with_descr(ptr [[X]], i64 [[TMP1]], ptr @[[GLOB0:[0-9]+]], ptr @[[GLOB1:[0-9]+]])
+; ORIGIN-NEXT:    ret void
+;
+entry:
+  %x = alloca <vscale x 64 x i1>, align 4
+  ret void
+}
+
+define void @test_alloca2() sanitize_memory {
+; CHECK-LABEL: define void @test_alloca2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[X:%.*]] = alloca <vscale x 64 x double>, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 512
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP4]], i8 -1, i64 [[TMP1]], i1 false)
+; CHECK-NEXT:    ret void
+;
+; ORIGIN-LABEL: define void @test_alloca2(
+; ORIGIN-SAME: ) #[[ATTR0]] {
+; ORIGIN-NEXT:  entry:
+; ORIGIN-NEXT:    call void @llvm.donothing()
+; ORIGIN-NEXT:    [[X:%.*]] = alloca <vscale x 64 x double>, align 4
+; ORIGIN-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; ORIGIN-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 512
+; ORIGIN-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[X]] to i64
+; ORIGIN-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; ORIGIN-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; ORIGIN-NEXT:    [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416
+; ORIGIN-NEXT:    [[TMP6:%.*]] = and i64 [[TMP5]], -4
+; ORIGIN-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; ORIGIN-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP4]], i8 -1, i64 [[TMP1]], i1 false)
+; ORIGIN-NEXT:    call void @__msan_set_alloca_origin_with_descr(ptr [[X]], i64 [[TMP1]], ptr @[[GLOB2:[0-9]+]], ptr @[[GLOB3:[0-9]+]])
+; ORIGIN-NEXT:    ret void
+;
+entry:
+  %x = alloca <vscale x 64 x double>, align 4
+  ret void
+}
+
+define void @test_alloca3() sanitize_memory {
+; CHECK-LABEL: define void @test_alloca3(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[X:%.*]] = alloca <vscale x 1 x i1>, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
+; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP3]], i8 -1, i64 [[TMP0]], i1 false)
+; CHECK-NEXT:    ret void
+;
+; ORIGIN-LABEL: define void @test_alloca3(
+; ORIGIN-SAME: ) #[[ATTR0]] {
+; ORIGIN-NEXT:  entry:
+; ORIGIN-NEXT:    call void @llvm.donothing()
+; ORIGIN-NEXT:    [[X:%.*]] = alloca <vscale x 1 x i1>, align 4
+; ORIGIN-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; ORIGIN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[X]] to i64
+; ORIGIN-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
+; ORIGIN-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; ORIGIN-NEXT:    [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416
+; ORIGIN-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], -4
+; ORIGIN-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; ORIGIN-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[TMP3]], i8 -1, i64 [[TMP0]], i1 false)
+; ORIGIN-NEXT:    call void @__msan_set_alloca_origin_with_descr(ptr [[X]], i64 [[TMP0]], ptr @[[GLOB4:[0-9]+]], ptr @[[GLOB5:[0-9]+]])
+; ORIGIN-NEXT:    ret void
+;
+entry:
+  %x = alloca <vscale x 1 x i1>, align 4
+  ret void
+}
+
 ;.
 ; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1048575}
 ;.
+; ORIGIN: [[PROF0]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/ThreadSanitizer/tsan_basic.ll b/llvm/test/Instrumentation/ThreadSanitizer/tsan_basic.ll
index 60a423835f21..3aef34317b0b 100644
--- a/llvm/test/Instrumentation/ThreadSanitizer/tsan_basic.ll
+++ b/llvm/test/Instrumentation/ThreadSanitizer/tsan_basic.ll
@@ -108,6 +108,18 @@ define i32 @NakedTest(ptr %a) naked sanitize_thread {
   ret i32 %tmp1
 }
 
+; vscale is unsupported, just don't crash here.
+define void @test_load_store_i32(ptr %a, ptr %b) sanitize_thread {
+; CHECK-LABEL: define void @test_load_store_i32(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP1]], ptr [[B]], align 16
+; CHECK-NEXT:    ret void
+  %1 = load <vscale x 4 x i32>, ptr %a
+  store <vscale x 4 x i32> %1, ptr %b
+  ret void
+}
+
 declare void @foo() nounwind
 
 ; CHECK: define internal void @tsan.module_ctor()
diff --git a/llvm/test/MC/AMDGPU/alignto_mcexpr.s b/llvm/test/MC/AMDGPU/alignto_mcexpr.s
new file mode 100644
index 000000000000..e864f3736828
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/alignto_mcexpr.s
@@ -0,0 +1,15 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+
+// ASM: .set alignto_zero_eight, 0
+// ASM: .set alignto_one_eight, 8
+// ASM: .set alignto_five_eight, 8
+// ASM: .set alignto_seven_eight, 8
+// ASM: .set alignto_eight_eight, 8
+// ASM: .set alignto_ten_eight, 16
+
+.set alignto_zero_eight, alignto(0, 8)
+.set alignto_one_eight, alignto(1, 8)
+.set alignto_five_eight, alignto(5, 8)
+.set alignto_seven_eight, alignto(7, 8)
+.set alignto_eight_eight, alignto(8, 8)
+.set alignto_ten_eight, alignto(10, 8)
diff --git a/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s b/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s
new file mode 100644
index 000000000000..e88b23bb34d4
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s
@@ -0,0 +1,31 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck --check-prefix=GFX7 %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=GFX90A %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefix=GFX940 %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+
+// gfx940 has architected flat scratch enabled.
+
+// GFX7: .set extrasgpr_none, 0
+// GFX7: .set extrasgpr_vcc, 2
+// GFX7: .set extrasgpr_flatscr, 4
+// GFX7: .set extrasgpr_xnack, 0
+
+// GFX90A: .set extrasgpr_none, 0
+// GFX90A: .set extrasgpr_vcc, 2
+// GFX90A: .set extrasgpr_flatscr, 6
+// GFX90A: .set extrasgpr_xnack, 4
+
+// GFX940: .set extrasgpr_none, 6
+// GFX940: .set extrasgpr_vcc, 6
+// GFX940: .set extrasgpr_flatscr, 6
+// GFX940: .set extrasgpr_xnack, 6
+
+// GFX10: .set extrasgpr_none, 0
+// GFX10: .set extrasgpr_vcc, 2
+// GFX10: .set extrasgpr_flatscr, 0
+// GFX10: .set extrasgpr_xnack, 0
+
+.set extrasgpr_none, extrasgprs(0, 0, 0)
+.set extrasgpr_vcc, extrasgprs(1, 0, 0)
+.set extrasgpr_flatscr, extrasgprs(0, 1, 0)
+.set extrasgpr_xnack, extrasgprs(0, 0, 1)
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
index 662905809ad9..103fa67064ef 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
@@ -399,73 +399,73 @@ v_ctz_i32_b32_e64 v255, 0xaf123456
 v_cvt_f32_bf8_e64 v1, s3
 // GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00]
 
-v_cvt_f32_bf8_e64 v1, s3 op_sel:[0,1]
+v_cvt_f32_bf8_e64 v1, s3 byte_sel:1
 // GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00]
 
-v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,0]
+v_cvt_f32_bf8_e64 v1, s3 byte_sel:2
 // GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00]
 
-v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,1]
+v_cvt_f32_bf8_e64 v1, s3 byte_sel:3
 // GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00]
 
 v_cvt_f32_bf8_e64 v1, 3
 // GFX12: encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00]
 
-v_cvt_f32_bf8_e64 v1, 3 op_sel:[0,1]
+v_cvt_f32_bf8_e64 v1, 3 byte_sel:1
 // GFX12: encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00]
 
-v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,0]
+v_cvt_f32_bf8_e64 v1, 3 byte_sel:2
 // GFX12: encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00]
 
-v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,1]
+v_cvt_f32_bf8_e64 v1, 3 byte_sel:3
 // GFX12: encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00]
 
 v_cvt_f32_bf8_e64 v1, v3
 // GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00]
 
-v_cvt_f32_bf8_e64 v1, v3 op_sel:[0,1]
+v_cvt_f32_bf8_e64 v1, v3 byte_sel:1
 // GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00]
 
-v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,0]
+v_cvt_f32_bf8_e64 v1, v3 byte_sel:2
 // GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00]
 
-v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,1]
+v_cvt_f32_bf8_e64 v1, v3 byte_sel:3
 // GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00]
 
 v_cvt_f32_fp8_e64 v1, s3
 // GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00]
 
-v_cvt_f32_fp8_e64 v1, s3 op_sel:[0,1]
+v_cvt_f32_fp8_e64 v1, s3 byte_sel:1
 // GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00]
 
-v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,0]
+v_cvt_f32_fp8_e64 v1, s3 byte_sel:2
 // GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00]
 
-v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,1]
+v_cvt_f32_fp8_e64 v1, s3 byte_sel:3
 // GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00]
 
 v_cvt_f32_fp8_e64 v1, 3
 // GFX12: encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00]
 
-v_cvt_f32_fp8_e64 v1, 3 op_sel:[0,1]
+v_cvt_f32_fp8_e64 v1, 3 byte_sel:1
 // GFX12: encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00]
 
-v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,0]
+v_cvt_f32_fp8_e64 v1, 3 byte_sel:2
 // GFX12: encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00]
 
-v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,1]
+v_cvt_f32_fp8_e64 v1, 3 byte_sel:3
 // GFX12: encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00]
 
 v_cvt_f32_fp8_e64 v1, v3
 // GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00]
 
-v_cvt_f32_fp8_e64 v1, v3 op_sel:[0,1]
+v_cvt_f32_fp8_e64 v1, v3 byte_sel:1
 // GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00]
 
-v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,0]
+v_cvt_f32_fp8_e64 v1, v3 byte_sel:2
 // GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00]
 
-v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,1]
+v_cvt_f32_fp8_e64 v1, v3 byte_sel:3
 // GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00]
 
 v_cvt_pk_f32_bf8_e64 v[2:3], s3
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
index 706e9e3717dc..ae1381a5a729 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
@@ -516,6 +516,36 @@ v_cvt_f32_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_cvt_f32_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: [0xff,0x81,0x8b,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
+v_cvt_f32_fp8 v1, v2 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_fp8_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd8,0x02,0x7e,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:0 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:1 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x10,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:2 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:3 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x18,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_bf8 v1, v2 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_bf8_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xda,0x02,0x7e,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:0 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:1 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x10,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:2 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:3 quad_perm:[0,1,2,3]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x18,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+
 v_cvt_f32_i32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: [0x05,0x00,0x85,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
index 002cd778a997..d88922c111f6 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
@@ -144,6 +144,36 @@ v_cvt_f32_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_f32_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: [0xff,0x81,0x8b,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
+v_cvt_f32_fp8 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_fp8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd8,0x02,0x7e,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x10,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_fp8 v1, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x18,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_bf8 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_bf8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xda,0x02,0x7e,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x10,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_f32_bf8 v1, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x18,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
 v_cvt_f32_i32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: [0x05,0x00,0x85,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
diff --git a/llvm/test/MC/AMDGPU/gfx9-asm-err.s b/llvm/test/MC/AMDGPU/gfx9-asm-err.s
index 93138a829185..31e0d953b5bd 100644
--- a/llvm/test/MC/AMDGPU/gfx9-asm-err.s
+++ b/llvm/test/MC/AMDGPU/gfx9-asm-err.s
@@ -41,3 +41,6 @@ global_load_dword v[2:3], off
 
 scratch_load_dword v2, off, offset:256
 // GFX9ERR: :[[@LINE-1]]:{{[0-9]+}}: error: too few operands for instruction
+
+s_sendmsg sendmsg(MSG_SYSMSG, SYSMSG_OP_HOST_TRAP_ACK)
+// GFX9ERR: :[[@LINE-1]]:{{[0-9]+}}: error: specified operation id is not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/occupancy_mcexpr.s b/llvm/test/MC/AMDGPU/occupancy_mcexpr.s
new file mode 100644
index 000000000000..06bec8c538da
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/occupancy_mcexpr.s
@@ -0,0 +1,61 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+
+// ASM: .set occupancy_init_one, 1
+// ASM: .set occupancy_init_seven, 7
+// ASM: .set occupancy_init_eight, 8
+
+.set occupancy_init_one, occupancy(0, 0, 0, 0, 1, 0, 0)
+.set occupancy_init_seven, occupancy(0, 0, 0, 0, 7, 0, 0)
+.set occupancy_init_eight, occupancy(0, 0, 0, 0, 8, 0, 0)
+
+// ASM: .set occupancy_numsgpr_seaisle_ten, 10
+// ASM: .set occupancy_numsgpr_seaisle_nine, 9
+// ASM: .set occupancy_numsgpr_seaisle_eight, 8
+// ASM: .set occupancy_numsgpr_seaisle_seven, 7
+// ASM: .set occupancy_numsgpr_seaisle_six, 6
+// ASM: .set occupancy_numsgpr_seaisle_five, 5
+
+.set occupancy_numsgpr_seaisle_ten, occupancy(0, 0, 0, 6, 11, 1, 0)
+.set occupancy_numsgpr_seaisle_nine, occupancy(0, 0, 0, 6, 11, 49, 0)
+.set occupancy_numsgpr_seaisle_eight, occupancy(0, 0, 0, 6, 11, 57, 0)
+.set occupancy_numsgpr_seaisle_seven, occupancy(0, 0, 0, 6, 11, 65, 0)
+.set occupancy_numsgpr_seaisle_six, occupancy(0, 0, 0, 6, 11, 73, 0)
+.set occupancy_numsgpr_seaisle_five, occupancy(0, 0, 0, 6, 11, 81, 0)
+
+// ASM: .set occupancy_numsgpr_gfx9_ten, 10
+// ASM: .set occupancy_numsgpr_gfx9_nine, 9
+// ASM: .set occupancy_numsgpr_gfx9_eight, 8
+// ASM: .set occupancy_numsgpr_gfx9_seven, 7
+
+.set occupancy_numsgpr_gfx9_ten, occupancy(0, 0, 0, 8, 11, 1, 0)
+.set occupancy_numsgpr_gfx9_nine, occupancy(0, 0, 0, 8, 11, 81, 0)
+.set occupancy_numsgpr_gfx9_eight, occupancy(0, 0, 0, 8, 11, 89, 0)
+.set occupancy_numsgpr_gfx9_seven, occupancy(0, 0, 0, 8, 11, 101, 0)
+
+// ASM: .set occupancy_numsgpr_gfx10_one, 1
+// ASM: .set occupancy_numsgpr_gfx10_seven, 7
+// ASM: .set occupancy_numsgpr_gfx10_eight, 8
+
+.set occupancy_numsgpr_gfx10_one, occupancy(1, 0, 0, 9, 11, 1, 0)
+.set occupancy_numsgpr_gfx10_seven, occupancy(7, 0, 0, 9, 11, 1, 0)
+.set occupancy_numsgpr_gfx10_eight, occupancy(8, 0, 0, 9, 11, 1, 0)
+
+// ASM: .set occupancy_numvgpr_high_granule_one, 1
+// ASM: .set occupancy_numvgpr_high_granule_seven, 7
+// ASM: .set occupancy_numvgpr_high_granule_eight, 8
+
+.set occupancy_numvgpr_high_granule_one, occupancy(1, 2, 0, 0, 11, 0, 1)
+.set occupancy_numvgpr_high_granule_seven, occupancy(7, 2, 0, 0, 11, 0, 1)
+.set occupancy_numvgpr_high_granule_eight, occupancy(8, 2, 0, 0, 11, 0, 1)
+
+// ASM: .set occupancy_numvgpr_low_total_one, 1
+// ASM: .set occupancy_numvgpr_one, 1
+// ASM: .set occupancy_numvgpr_seven, 7
+// ASM: .set occupancy_numvgpr_eight, 8
+// ASM: .set occupancy_numvgpr_ten, 10
+
+.set occupancy_numvgpr_low_total_one, occupancy(11, 4, 2, 0, 11, 0, 4)
+.set occupancy_numvgpr_one, occupancy(11, 4, 4, 0, 11, 0, 4)
+.set occupancy_numvgpr_seven, occupancy(11, 4, 28, 0, 11, 0, 4)
+.set occupancy_numvgpr_eight, occupancy(11, 4, 32, 0, 11, 0, 4)
+.set occupancy_numvgpr_ten, occupancy(11, 4, 40, 0, 11, 0, 4)
diff --git a/llvm/test/MC/AMDGPU/sopp-err.s b/llvm/test/MC/AMDGPU/sopp-err.s
index bd044cb74340..8b7ff74b2105 100644
--- a/llvm/test/MC/AMDGPU/sopp-err.s
+++ b/llvm/test/MC/AMDGPU/sopp-err.s
@@ -199,6 +199,10 @@ s_sendmsg sendmsg(MSG_SYSMSG, 0)
 s_sendmsg sendmsg(MSG_SYSMSG, 5)
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operation id
 
+s_sendmsg sendmsg(MSG_SYSMSG, SYSMSG_OP_HOST_TRAP_ACK)
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: specified operation id is not supported on this GPU
+// GFX11PLUS: :[[@LINE-2]]:{{[0-9]+}}: error: specified operation id is not supported on this GPU
+
 //===----------------------------------------------------------------------===//
 // waitcnt
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/MC/AMDGPU/sopp-gfx9.s b/llvm/test/MC/AMDGPU/sopp-gfx9.s
index e760d497896f..2ba6d0043f35 100644
--- a/llvm/test/MC/AMDGPU/sopp-gfx9.s
+++ b/llvm/test/MC/AMDGPU/sopp-gfx9.s
@@ -109,3 +109,6 @@ s_sendmsg 10
 
 s_sendmsg sendmsg(MSG_GET_DOORBELL)
 // GFX9: s_sendmsg sendmsg(MSG_GET_DOORBELL) ; encoding: [0x0a,0x00,0x90,0xbf]
+
+s_sendmsg sendmsg(15, 3, 0)
+// GFX9: s_sendmsg sendmsg(15, 3, 0) ; encoding: [0x3f,0x00,0x90,0xbf]
diff --git a/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s b/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s
new file mode 100644
index 000000000000..29bb885b2080
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s
@@ -0,0 +1,26 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=GFX90A %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+
+// GFX10: .set totalvgpr_none, 0
+// GFX10: .set totalvgpr_one, 1
+// GFX10: .set totalvgpr_two, 2
+
+.set totalvgpr_none, totalnumvgprs(0, 0)
+.set totalvgpr_one, totalnumvgprs(1, 0)
+.set totalvgpr_two, totalnumvgprs(1, 2)
+
+// GFX90A: .set totalvgpr90a_none, 0
+// GFX90A: .set totalvgpr90a_one, 1
+// GFX90A: .set totalvgpr90a_two, 2
+
+.set totalvgpr90a_none, totalnumvgprs(0, 0)
+.set totalvgpr90a_one, totalnumvgprs(0, 1)
+.set totalvgpr90a_two, totalnumvgprs(0, 2)
+
+// GFX90A: .set totalvgpr90a_agpr_minimal, 1
+// GFX90A: .set totalvgpr90a_agpr_rounded_eight, 8
+// GFX90A: .set totalvgpr90a_agpr_exact_eight, 8
+
+.set totalvgpr90a_agpr_minimal, totalnumvgprs(1, 0)
+.set totalvgpr90a_agpr_rounded_eight, totalnumvgprs(4, 2)
+.set totalvgpr90a_agpr_exact_eight, totalnumvgprs(4, 4)
diff --git a/llvm/test/MC/AMDGPU/v_illegal-atomics.s b/llvm/test/MC/AMDGPU/v_illegal-atomics.s
index 175f017ad9ca..cedb89ce9cd1 100644
--- a/llvm/test/MC/AMDGPU/v_illegal-atomics.s
+++ b/llvm/test/MC/AMDGPU/v_illegal-atomics.s
@@ -1,6 +1,6 @@
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1030 -show-encoding %s | FileCheck --check-prefix=GFX1030 %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefix=GFX1100 %s
-
-v_illegal
-// GFX1030: encoding: [0x00,0x00,0x00,0x00]
-// GFX1100: encoding: [0x00,0x00,0x00,0x00]
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1030 -show-encoding %s | FileCheck --check-prefix=GFX1030 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefix=GFX1100 %s
+
+v_illegal
+// GFX1030: encoding: [0x00,0x00,0x00,0x00]
+// GFX1100: encoding: [0x00,0x00,0x00,0x00]
diff --git a/llvm/test/MC/AsmParser/directive_file-g.s b/llvm/test/MC/AsmParser/directive_file-g.s
index bbf6fbd8a4c4..48f083dc7aa8 100644
--- a/llvm/test/MC/AsmParser/directive_file-g.s
+++ b/llvm/test/MC/AsmParser/directive_file-g.s
@@ -1,24 +1,24 @@
-## Make sure that using -g (or equivalent) on an asm file that already has
-## debug-info directives in it will correctly ignore the -g and produce
-## debug info corresponding to the directives in the source.
-## Note gcc accepts ".file 1" after a label, although not after an opcode.
-## If no other directives appear, gcc emits no debug info at all.
-
-# RUN: llvm-mc -g -triple i386-unknown-unknown -filetype=obj %s -o %t
-# RUN: llvm-dwarfdump -debug-info -debug-line %t | FileCheck %s
-
-foo:
-        .file 1 "a.c"
-        .loc 1 1 1
-        nop
-
-# CHECK: .debug_info
-## gcc does generate a DW_TAG_compile_unit in this case, with or without
-## -g on the command line, but we do not.
-# CHECK-EMPTY:
-# CHECK-NEXT: .debug_line
-# CHECK: file_names[ 1]:
-# CHECK-NEXT: name: "a.c"
-# CHECK-NEXT: dir_index: 0
-# CHECK: 0x{{0+}}0 1 1 1 0 0 0 is_stmt
-# CHECK: 0x{{0+}}1 1 1 1 0 0 0 is_stmt end_sequence
+## Make sure that using -g (or equivalent) on an asm file that already has
+## debug-info directives in it will correctly ignore the -g and produce
+## debug info corresponding to the directives in the source.
+## Note gcc accepts ".file 1" after a label, although not after an opcode.
+## If no other directives appear, gcc emits no debug info at all.
+
+# RUN: llvm-mc -g -triple i386-unknown-unknown -filetype=obj %s -o %t
+# RUN: llvm-dwarfdump -debug-info -debug-line %t | FileCheck %s
+
+foo:
+        .file 1 "a.c"
+        .loc 1 1 1
+        nop
+
+# CHECK: .debug_info
+## gcc does generate a DW_TAG_compile_unit in this case, with or without
+## -g on the command line, but we do not.
+# CHECK-EMPTY:
+# CHECK-NEXT: .debug_line
+# CHECK: file_names[ 1]:
+# CHECK-NEXT: name: "a.c"
+# CHECK-NEXT: dir_index: 0
+# CHECK: 0x{{0+}}0 1 1 1 0 0 0 is_stmt
+# CHECK: 0x{{0+}}1 1 1 1 0 0 0 is_stmt end_sequence
diff --git a/llvm/test/MC/AsmParser/layout-interdependency.s b/llvm/test/MC/AsmParser/layout-interdependency.s
index f26149ced766..d275614e87e7 100644
--- a/llvm/test/MC/AsmParser/layout-interdependency.s
+++ b/llvm/test/MC/AsmParser/layout-interdependency.s
@@ -1,5 +1,6 @@
 # RUN: not llvm-mc --filetype=obj %s -o /dev/null 2>&1 | FileCheck %s
 # REQUIRES: object-emission
+# UNSUPPORTED: target={{.*}}-zos{{.*}}
 
 fct_end:
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
index ac745c543324..f9c768e3e026 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
@@ -342,12 +342,42 @@
 # GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e]
 0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e
 
+# GFX12: v_cvt_f32_fp8_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd8,0x02,0x7e,0x02,0xe4,0x00,0xff]
+0xfa,0xd8,0x02,0x7e,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x10,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x10,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x08,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x18,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x18,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
 # GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x2 bank_mask:0xd ; encoding: [0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d]
 0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d
 
 # GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e]
 0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e
 
+# GFX12: v_cvt_f32_bf8_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xda,0x02,0x7e,0x02,0xe4,0x00,0xff]
+0xfa,0xda,0x02,0x7e,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x10,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x10,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x08,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x18,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x18,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff]
+0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x00,0xff
+
 # GFX12: v_cvt_f16_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
index fdeda3bb272d..eccd69185577 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
@@ -78,12 +78,42 @@
 # GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05]
 0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05
 
+# GFX12: v_cvt_f32_fp8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd8,0x02,0x7e,0x02,0x77,0x39,0x05]
+0xe9,0xd8,0x02,0x7e,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x10,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x10,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x08,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x18,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x18,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_fp8_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
 # GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa]
 0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa
 
 # GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05]
 0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05
 
+# GFX12: v_cvt_f32_bf8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xda,0x02,0x7e,0x02,0x77,0x39,0x05]
+0xe9,0xda,0x02,0x7e,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x10,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x10,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x08,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x18,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x18,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+
 # GFX12: v_cvt_f16_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 
diff --git a/llvm/test/MC/Mips/eh-frame.s b/llvm/test/MC/Mips/eh-frame.s
index fd145317bf4d..dac142325f9f 100644
--- a/llvm/test/MC/Mips/eh-frame.s
+++ b/llvm/test/MC/Mips/eh-frame.s
@@ -33,14 +33,13 @@
 // RUN: llvm-readobj -r %t.o | FileCheck --check-prefixes=RELOCS,PIC64 %s
 // RUN: llvm-dwarfdump -eh-frame %t.o | FileCheck --check-prefixes=DWARF64,DWARF64_PIC %s
 
-/// However using the large code model forces R_MIPS_64 since there is no R_MIPS_PC64 relocation:
 // RUN: llvm-mc -filetype=obj %s -o %t.o -triple mips64-unknown-linux-gnu --position-independent --large-code-model
-// RUN: llvm-readobj -r %t.o | FileCheck --check-prefixes=RELOCS,ABS64 %s
-// RUN: llvm-dwarfdump -eh-frame %t.o | FileCheck --check-prefixes=DWARF64,DWARF64_ABS %s
+// RUN: llvm-readobj -r %t.o | FileCheck --check-prefixes=RELOCS,PIC64 %s
+// RUN: llvm-dwarfdump -eh-frame %t.o | FileCheck --check-prefixes=DWARF64,DWARF64_PIC %s
 
 // RUN: llvm-mc -filetype=obj %s -o %t.o -triple mips64el-unknown-linux-gnu --position-independent  --large-code-model
-// RUN: llvm-readobj -r %t.o | FileCheck --check-prefixes=RELOCS,ABS64 %s
-// RUN: llvm-dwarfdump -eh-frame %t.o | FileCheck --check-prefixes=DWARF64,DWARF64_ABS %s
+// RUN: llvm-readobj -r %t.o | FileCheck --check-prefixes=RELOCS,PIC64 %s
+// RUN: llvm-dwarfdump -eh-frame %t.o | FileCheck --check-prefixes=DWARF64,DWARF64_PIC %s
 
 func:
 	.cfi_startproc
diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s
index aea27146e370..a028d4025ec1 100644
--- a/llvm/test/MC/RISCV/attribute-arch.s
+++ b/llvm/test/MC/RISCV/attribute-arch.s
@@ -330,6 +330,9 @@
 .attribute arch, "rv32i_ssqosid1p0"
 # CHECK: attribute      5, "rv32i2p1_ssqosid1p0"
 
+.attribute arch, "rv32i_smstateen1p0"
+# CHECK: attribute      5, "rv32i2p1_smstateen1p0"
+
 .attribute arch, "rv32i_ssstateen1p0"
 # CHECK: attribute      5, "rv32i2p1_ssstateen1p0"
 
diff --git a/llvm/test/MC/RISCV/option-arch.s b/llvm/test/MC/RISCV/option-arch.s
index 6ee133c7159a..7826252f66e6 100644
--- a/llvm/test/MC/RISCV/option-arch.s
+++ b/llvm/test/MC/RISCV/option-arch.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc -triple riscv32 -show-encoding < %s \
+# RUN: llvm-mc -triple riscv32 -mattr=+experimental -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK %s
-# RUN: llvm-mc -triple riscv32 -filetype=obj < %s \
-# RUN:   | llvm-objdump  --triple=riscv32 --mattr=+c,+m,+a,+f,+zba -d -M no-aliases - \
+# RUN: llvm-mc -triple riscv32 -mattr=+experimental -filetype=obj < %s \
+# RUN:   | llvm-objdump  --triple=riscv32 --mattr=+c,+m,+a,+f,+zba,+experimental-zicfiss -d -M no-aliases - \
 # RUN:   | FileCheck -check-prefixes=CHECK-INST %s
 
 # Test '.option arch, +' and '.option arch, -' directive
@@ -78,6 +78,13 @@ lr.w t0, (t1)
 # CHECK: encoding: [0xb3,0x22,0x73,0x20]
 sh1add t0, t1, t2
 
+# Test experimental extension
+# CHECK: .option arch, +zicfiss
+.option arch, +zicfiss
+# CHECK-INST: sspopchk ra
+# CHECK: encoding: [0x73,0xc0,0xc0,0xcd]
+sspopchk ra
+
 # Test '.option arch, <arch-string>' directive
 # CHECK: .option arch, rv32i2p1_m2p0_a2p1_c2p0
 .option arch, rv32i2p1_m2p0_a2p1_c2p0
diff --git a/llvm/test/MC/RISCV/tail-call.s b/llvm/test/MC/RISCV/tail-call.s
index c94af672edda..7c9f28bdfacd 100644
--- a/llvm/test/MC/RISCV/tail-call.s
+++ b/llvm/test/MC/RISCV/tail-call.s
@@ -12,17 +12,36 @@
 # RUN: llvm-mc -triple riscv64 < %s -show-encoding \
 # RUN:   | FileCheck -check-prefix=FIXUP %s
 
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-zicfilp < %s \
+# RUN:   | llvm-objdump -d - | FileCheck --check-prefix=INSTR-ZICFILP %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-zicfilp < %s \
+# RUN:   | llvm-readobj -r - | FileCheck -check-prefix=RELOC %s
+# RUN: llvm-mc -triple riscv32 -mattr=+experimental-zicfilp < %s -show-encoding \
+# RUN:   | FileCheck -check-prefix=FIXUP %s
+
+# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+experimental-zicfilp < %s \
+# RUN:   | llvm-objdump -d - | FileCheck --check-prefix=INSTR-ZICFILP %s
+# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+experimental-zicfilp < %s \
+# RUN:   | llvm-readobj -r - | FileCheck -check-prefix=RELOC %s
+# RUN: llvm-mc -triple riscv64 -mattr=+experimental-zicfilp < %s -show-encoding \
+# RUN:   | FileCheck -check-prefix=FIXUP %s
+
 .long foo
 
 tail foo
 # RELOC: R_RISCV_CALL_PLT foo 0x0
 # INSTR: auipc t1, 0
 # INSTR: jr  t1
+# INSTR-ZICFILP: auipc t2, 0
+# INSTR-ZICFILP: jr  t2
 # FIXUP: fixup A - offset: 0, value: foo, kind:
+
 tail bar
 # RELOC: R_RISCV_CALL_PLT bar 0x0
 # INSTR: auipc t1, 0
 # INSTR: jr  t1
+# INSTR-ZICFILP: auipc t2, 0
+# INSTR-ZICFILP: jr  t2
 # FIXUP: fixup A - offset: 0, value: bar, kind:
 
 # Ensure that tail calls to functions whose names coincide with register names
@@ -32,22 +51,30 @@ tail zero
 # RELOC: R_RISCV_CALL_PLT zero 0x0
 # INSTR: auipc t1, 0
 # INSTR: jr  t1
+# INSTR-ZICFILP: auipc t2, 0
+# INSTR-ZICFILP: jr  t2
 # FIXUP: fixup A - offset: 0, value: zero, kind:
 
 tail f1
 # RELOC: R_RISCV_CALL_PLT f1 0x0
 # INSTR: auipc t1, 0
 # INSTR: jr  t1
+# INSTR-ZICFILP: auipc t2, 0
+# INSTR-ZICFILP: jr  t2
 # FIXUP: fixup A - offset: 0, value: f1, kind:
 
 tail ra
 # RELOC: R_RISCV_CALL_PLT ra 0x0
 # INSTR: auipc t1, 0
 # INSTR: jr  t1
+# INSTR-ZICFILP: auipc t2, 0
+# INSTR-ZICFILP: jr  t2
 # FIXUP: fixup A - offset: 0, value: ra, kind:
 
 tail foo@plt
 # RELOC: R_RISCV_CALL_PLT foo 0x0
 # INSTR: auipc t1, 0
 # INSTR: jr  t1
+# INSTR-ZICFILP: auipc t2, 0
+# INSTR-ZICFILP: jr  t2
 # FIXUP: fixup A - offset: 0, value: foo, kind: fixup_riscv_call_plt
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index c6c554990c2c..57fa71e74b8d 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -no-type-check -show-encoding -triple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd < %s | FileCheck %s
+# RUN: llvm-mc -no-type-check -show-encoding -triple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd,+half-precision < %s | FileCheck %s
 
 main:
     .functype main () -> ()
@@ -839,4 +839,10 @@ main:
     # CHECK: i32x4.relaxed_dot_i8x16_i7x16_add_s # encoding: [0xfd,0x93,0x02]
     i32x4.relaxed_dot_i8x16_i7x16_add_s
 
+    # CHECK: f32.load_f16 48 # encoding: [0xfc,0x30,0x01,0x30]
+    f32.load_f16 48
+
+    # CHECK: f32.store_f16 32 # encoding: [0xfc,0x31,0x01,0x20]
+    f32.store_f16 32
+
     end_function
diff --git a/llvm/test/Object/Inputs/MachO/bind-negative-skip.yaml b/llvm/test/Object/Inputs/MachO/bind-negative-skip.yaml
new file mode 100644
index 000000000000..aef5664a798f
--- /dev/null
+++ b/llvm/test/Object/Inputs/MachO/bind-negative-skip.yaml
@@ -0,0 +1,499 @@
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x100000C
+  cpusubtype:      0x0
+  filetype:        0x2
+  ncmds:           17
+  sizeofcmds:      1384
+  flags:           0x200085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __PAGEZERO
+    vmaddr:          0
+    vmsize:          4294967296
+    fileoff:         0
+    filesize:        0
+    maxprot:         0
+    initprot:        0
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         472
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          16384
+    fileoff:         0
+    filesize:        16384
+    maxprot:         5
+    initprot:        5
+    nsects:          5
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x100003E58
+        size:            228
+        offset:          0x3E58
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         FF8300D1FD7B01A9FD430091E9030091080000B0080540F9280100F90000009000B03D9130000094E9030091080000B0080140F9280100F90000009000E03D9129000094280000B0080940F9E9030091280100F90000009000083E9122000094280000B0081140F9E9030091280100F90000009000243E911B000094280000B0081940F9E9030091280100F90000009000403E9114000094280000B0E80700F9082140F9E9030091280100F900000090005C3E910C000094E80740F9082140F9E9030091280100F90000009000783E910500009400008052FD7B41A9FF830091C0035FD6
+      - sectname:        __stubs
+        segname:         __TEXT
+        addr:            0x100003F3C
+        size:            12
+        offset:          0x3F3C
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000408
+        reserved1:       0x0
+        reserved2:       0xC
+        reserved3:       0x0
+        content:         300000B0100240F900021FD6
+      - sectname:        __stub_helper
+        segname:         __TEXT
+        addr:            0x100003F48
+        size:            36
+        offset:          0x3F48
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         310000B031220091F047BFA9100000B0100A40F900021FD650000018F9FFFF1700000000
+      - sectname:        __cstring
+        segname:         __TEXT
+        addr:            0x100003F6C
+        size:            57
+        offset:          0x3F6C
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x2
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         6D616C6C6F633A2025700A00667265653A2025700A00613A2025700A00623A2025700A00633A2025700A00643A2025700A00653A2025700A00
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0x100003FA8
+        size:            88
+        offset:          0x3FA8
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         010000001C000000000000001C000000000000001C00000002000000583E000040000000400000003C3F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __DATA_CONST
+    vmaddr:          4294983680
+    vmsize:          16384
+    fileoff:         16384
+    filesize:        16384
+    maxprot:         3
+    initprot:        3
+    nsects:          1
+    flags:           16
+    Sections:
+      - sectname:        __got
+        segname:         __DATA_CONST
+        addr:            0x100004000
+        size:            24
+        offset:          0x4000
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x6
+        reserved1:       0x1
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '000000000000000000000000000000000000000000000000'
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __DATA
+    vmaddr:          4295000064
+    vmsize:          16384
+    fileoff:         32768
+    filesize:        16384
+    maxprot:         3
+    initprot:        3
+    nsects:          2
+    flags:           0
+    Sections:
+      - sectname:        __la_symbol_ptr
+        segname:         __DATA
+        addr:            0x100008000
+        size:            8
+        offset:          0x8000
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x7
+        reserved1:       0x4
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         603F000001000000
+      - sectname:        __data
+        segname:         __DATA
+        addr:            0x100008008
+        size:            88
+        offset:          0x8008
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4295016448
+    vmsize:          32768
+    fileoff:         49152
+    filesize:        19184
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_DYLD_INFO_ONLY
+    cmdsize:         48
+    rebase_off:      49152
+    rebase_size:     8
+    bind_off:        49160
+    bind_size:       72
+    weak_bind_off:   0
+    weak_bind_size:  0
+    lazy_bind_off:   49232
+    lazy_bind_size:  16
+    export_off:      49248
+    export_size:     96
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          49352
+    nsyms:           13
+    stroff:          49584
+    strsize:         104
+  - cmd:             LC_DYSYMTAB
+    cmdsize:         80
+    ilocalsym:       0
+    nlocalsym:       1
+    iextdefsym:      1
+    nextdefsym:      7
+    iundefsym:       8
+    nundefsym:       5
+    tocoff:          0
+    ntoc:            0
+    modtaboff:       0
+    nmodtab:         0
+    extrefsymoff:    0
+    nextrefsyms:     0
+    indirectsymoff:  49560
+    nindirectsyms:   5
+    extreloff:       0
+    nextrel:         0
+    locreloff:       0
+    nlocrel:         0
+  - cmd:             LC_LOAD_DYLINKER
+    cmdsize:         32
+    name:            12
+    Content:         '/usr/lib/dyld'
+    ZeroPadBytes:    7
+  - cmd:             LC_UUID
+    cmdsize:         24
+    uuid:            2018719F-D4DC-3EE9-B8C3-3B790A01EAF7
+  - cmd:             LC_BUILD_VERSION
+    cmdsize:         32
+    platform:        1
+    minos:           917504
+    sdk:             918784
+    ntools:          1
+    Tools:
+      - tool:            3
+        version:         0
+  - cmd:             LC_SOURCE_VERSION
+    cmdsize:         16
+    version:         0
+  - cmd:             LC_MAIN
+    cmdsize:         24
+    entryoff:        15960
+    stacksize:       0
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       2
+      current_version: 88176642
+      compatibility_version: 65536
+    Content:         '/usr/lib/libSystem.B.dylib'
+    ZeroPadBytes:    6
+  - cmd:             LC_FUNCTION_STARTS
+    cmdsize:         16
+    dataoff:         49344
+    datasize:        8
+  - cmd:             LC_DATA_IN_CODE
+    cmdsize:         16
+    dataoff:         49352
+    datasize:        0
+  - cmd:             LC_CODE_SIGNATURE
+    cmdsize:         16
+    dataoff:         49696
+    datasize:        18640
+LinkEditData:
+  RebaseOpcodes:
+    - Opcode:          REBASE_OPCODE_SET_TYPE_IMM
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             3
+      ExtraData:       [ 0x0 ]
+    - Opcode:          REBASE_OPCODE_DO_REBASE_IMM_TIMES
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_DONE
+      Imm:             0
+  BindOpcodes:
+    - Opcode:          BIND_OPCODE_SET_DYLIB_ORDINAL_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          _free
+    - Opcode:          BIND_OPCODE_SET_TYPE_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             2
+      ULEBExtraData:   [ 0x0 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             3
+      ULEBExtraData:   [ 0x40 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          _malloc
+    - Opcode:          BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             2
+      ULEBExtraData:   [ 0x8 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             3
+      ULEBExtraData:   [ 0x30 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB
+      Imm:             0
+      ULEBExtraData:   [ 0x2, 0xFFFFFFFFFFFFFFF0 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          dyld_stub_binder
+    - Opcode:          BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             2
+      ULEBExtraData:   [ 0x10 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DONE
+      Imm:             0
+      Symbol:          ''
+  LazyBindOpcodes:
+    - Opcode:          BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             3
+      ULEBExtraData:   [ 0x0 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_DYLIB_ORDINAL_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          _printf
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DONE
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DONE
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DONE
+      Imm:             0
+      Symbol:          ''
+  ExportTrie:
+    TerminalSize:    0
+    NodeOffset:      0
+    Name:            ''
+    Flags:           0x0
+    Address:         0x0
+    Other:           0x0
+    ImportName:      ''
+    Children:
+      - TerminalSize:    0
+        NodeOffset:      48
+        Name:            _
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    2
+            NodeOffset:      9
+            Name:            _mh_execute_header
+            Flags:           0x0
+            Address:         0x0
+            Other:           0x0
+            ImportName:      ''
+          - TerminalSize:    4
+            NodeOffset:      13
+            Name:            a
+            Flags:           0x0
+            Address:         0x8010
+            Other:           0x0
+            ImportName:      ''
+          - TerminalSize:    4
+            NodeOffset:      19
+            Name:            b
+            Flags:           0x0
+            Address:         0x8020
+            Other:           0x0
+            ImportName:      ''
+          - TerminalSize:    4
+            NodeOffset:      25
+            Name:            c
+            Flags:           0x0
+            Address:         0x8030
+            Other:           0x0
+            ImportName:      ''
+          - TerminalSize:    4
+            NodeOffset:      31
+            Name:            d
+            Flags:           0x0
+            Address:         0x8040
+            Other:           0x0
+            ImportName:      ''
+          - TerminalSize:    4
+            NodeOffset:      37
+            Name:            e
+            Flags:           0x0
+            Address:         0x8050
+            Other:           0x0
+            ImportName:      ''
+          - TerminalSize:    3
+            NodeOffset:      43
+            Name:            main
+            Flags:           0x0
+            Address:         0x3E58
+            Other:           0x0
+            ImportName:      ''
+  NameList:
+    - n_strx:          88
+      n_type:          0xE
+      n_sect:          8
+      n_desc:          0
+      n_value:         4295000072
+    - n_strx:          2
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          16
+      n_value:         4294967296
+    - n_strx:          22
+      n_type:          0xF
+      n_sect:          8
+      n_desc:          0
+      n_value:         4295000080
+    - n_strx:          25
+      n_type:          0xF
+      n_sect:          8
+      n_desc:          0
+      n_value:         4295000096
+    - n_strx:          28
+      n_type:          0xF
+      n_sect:          8
+      n_desc:          0
+      n_value:         4295000112
+    - n_strx:          31
+      n_type:          0xF
+      n_sect:          8
+      n_desc:          0
+      n_value:         4295000128
+    - n_strx:          34
+      n_type:          0xF
+      n_sect:          8
+      n_desc:          0
+      n_value:         4295000144
+    - n_strx:          37
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294983256
+    - n_strx:          43
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+    - n_strx:          49
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+    - n_strx:          57
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+    - n_strx:          65
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+    - n_strx:          71
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+  StringTable:
+    - ' '
+    - __mh_execute_header
+    - _a
+    - _b
+    - _c
+    - _d
+    - _e
+    - _main
+    - _free
+    - _malloc
+    - _printf
+    - _read
+    - dyld_stub_binder
+    - __dyld_private
+    - ''
+  IndirectSymbols: [ 0xA, 0x8, 0x9, 0xC, 0xA ]
+  FunctionStarts:  [ 0x3E58 ]
+...
diff --git a/llvm/test/Object/archive-big-extract.test b/llvm/test/Object/archive-big-extract.test
index a1d7f0c731c0..3de09d8fb106 100644
--- a/llvm/test/Object/archive-big-extract.test
+++ b/llvm/test/Object/archive-big-extract.test
@@ -1,4 +1,5 @@
 ## Test extract xcoff object file from AIX big archive.
+# UNSUPPORTED: target={{.*}}-zos{{.*}}
 # RUN: rm -rf %t && mkdir -p %t/extracted/ && cd %t/extracted/
 # RUN: llvm-ar x %p/Inputs/aix-big-archive.a
 # RUN: echo "content_of_evenlen" > evenlen_1
diff --git a/llvm/test/Object/archive-extract.test b/llvm/test/Object/archive-extract.test
index 57b3c8f6795a..d4edece8fc45 100644
--- a/llvm/test/Object/archive-extract.test
+++ b/llvm/test/Object/archive-extract.test
@@ -1,6 +1,7 @@
 ; This test just makes sure that llvm-ar can extract bytecode members
 ; from various style archives.
 
+; UNSUPPORTED: target={{.*}}-zos{{.*}}
 ; RUN: rm -rf %t && mkdir -p %t && cd %t
 
 ; RUN: rm -f very_long_bytecode_file_name.bc
diff --git a/llvm/test/Object/macho-bind-negative-skip.test b/llvm/test/Object/macho-bind-negative-skip.test
new file mode 100644
index 000000000000..26884a28ea46
--- /dev/null
+++ b/llvm/test/Object/macho-bind-negative-skip.test
@@ -0,0 +1,17 @@
+// A valid MachO object with a bind table containing an opcode
+// `BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB` with negative skip value
+// (0xFFFFFFFFFFFFFFF0).
+
+RUN: yaml2obj %p/Inputs/MachO/bind-negative-skip.yaml | \
+RUN: llvm-objdump --bind --macho - | \
+RUN: FileCheck %s
+
+CHECK:      Bind table:
+CHECK-NEXT: segment      section            address     type       addend dylib            symbol
+CHECK-NEXT: __DATA_CONST __got              0x100004000 pointer         0 libSystem        _free
+CHECK-NEXT: __DATA       __data             0x100008040 pointer         0 libSystem        _free
+CHECK-NEXT: __DATA_CONST __got              0x100004008 pointer         0 libSystem        _malloc
+CHECK-NEXT: __DATA       __data             0x100008030 pointer         0 libSystem        _malloc
+CHECK-NEXT: __DATA       __data             0x100008028 pointer         0 libSystem        _malloc
+CHECK-NEXT: __DATA       __data             0x100008020 pointer         0 libSystem        _malloc
+CHECK-NEXT: __DATA_CONST __got              0x100004010 pointer         0 libSystem        dyld_stub_binder
diff --git a/llvm/test/Other/dump-before-after-invalidated.ll b/llvm/test/Other/dump-before-after-invalidated.ll
index 00b1a599b67f..e640b2d908b9 100644
--- a/llvm/test/Other/dump-before-after-invalidated.ll
+++ b/llvm/test/Other/dump-before-after-invalidated.ll
@@ -7,7 +7,7 @@
 ; RUN: ls %t/logs | count 1
 ; RUN: cat %t/logs/* | FileCheck %s --check-prefix=CHECK-CONTENTS
 
-; CHECK-CONTENTS: ; *** IR Dump After LoopDeletionPass on bb1 (invalidated) ***
+; CHECK-CONTENTS: ; *** IR Dump After LoopDeletionPass on loop %bb1 in function foo (invalidated) ***
 ; CHECK-CONTENTS: define void @foo() {
 ; CHECK-CONTENTS:   br label %bb2
 ; CHECK-CONTENTS: bb2:                                              ; preds = %0
diff --git a/llvm/test/Other/loop-pass-ordering.ll b/llvm/test/Other/loop-pass-ordering.ll
index fde49a7b73a5..c7e24fe9a3c0 100644
--- a/llvm/test/Other/loop-pass-ordering.ll
+++ b/llvm/test/Other/loop-pass-ordering.ll
@@ -8,11 +8,11 @@
 ;      /      \        \
 ; loop.0.0  loop.0.1  loop.1.0
 ;
-; CHECK: Running pass: NoOpLoopPass on loop.0.0
-; CHECK: Running pass: NoOpLoopPass on loop.0.1
-; CHECK: Running pass: NoOpLoopPass on loop.0
-; CHECK: Running pass: NoOpLoopPass on loop.1.0
-; CHECK: Running pass: NoOpLoopPass on loop.1
+; CHECK: Running pass: NoOpLoopPass on loop %loop.0.0 in function f
+; CHECK: Running pass: NoOpLoopPass on loop %loop.0.1 in function f
+; CHECK: Running pass: NoOpLoopPass on loop %loop.0 in function f
+; CHECK: Running pass: NoOpLoopPass on loop %loop.1.0 in function f
+; CHECK: Running pass: NoOpLoopPass on loop %loop.1 in function f
 
 define void @f() {
 entry:
diff --git a/llvm/test/Other/loop-print-after-pass-invalidated.ll b/llvm/test/Other/loop-print-after-pass-invalidated.ll
index 63106f62ae13..79244c11cbed 100644
--- a/llvm/test/Other/loop-print-after-pass-invalidated.ll
+++ b/llvm/test/Other/loop-print-after-pass-invalidated.ll
@@ -3,8 +3,8 @@
 ; RUN:     -print-after=simple-loop-unswitch \
 ; RUN:	   | FileCheck %s
 
-; CHECK: *** IR Dump After SimpleLoopUnswitchPass on for.cond ***
-; CHECK: *** IR Dump After SimpleLoopUnswitchPass on for.cond.us ***
+; CHECK: *** IR Dump After SimpleLoopUnswitchPass on loop %for.cond in function loop ***
+; CHECK: *** IR Dump After SimpleLoopUnswitchPass on loop %for.cond.us in function loop ***
 
 define void @loop(i1 %w)  {
 entry:
diff --git a/llvm/test/Other/loopnest-pass-ordering.ll b/llvm/test/Other/loopnest-pass-ordering.ll
index 963653280d10..e3d58e8e1c76 100644
--- a/llvm/test/Other/loopnest-pass-ordering.ll
+++ b/llvm/test/Other/loopnest-pass-ordering.ll
@@ -8,8 +8,8 @@
 ;      /      \        \
 ; loop.0.0  loop.0.1  loop.1.0
 ;
-; CHECK: Running pass: NoOpLoopNestPass on loop.0
-; CHECK: Running pass: NoOpLoopNestPass on loop.1
+; CHECK: Running pass: NoOpLoopNestPass on loop %loop.0 in function f
+; CHECK: Running pass: NoOpLoopNestPass on loop %loop.1 in function f
 ; CHECK-NOT: Running pass: NoOpLoopNestPass on {{loop\..*\..*}}
 
 define void @f() {
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index e2fd74306f80..6486639e07b4 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -183,10 +183,12 @@
 ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
+; CHECK-O-NEXT: Running pass: CoroSplitPass
 ; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
 ; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
+; CHECK-O-NEXT: Running pass: CoroCleanupPass
 ; CHECK-O-NEXT: Running pass: GlobalOptPass
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
 ; CHECK-EXT: Running pass: {{.*}}::Bye
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index 13a63bbe4d9c..09f9f0f48bad 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -182,10 +182,12 @@
 ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
+; CHECK-O-NEXT: Running pass: CoroSplitPass
 ; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
 ; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
+; CHECK-O-NEXT: Running pass: CoroCleanupPass
 ; CHECK-O-NEXT: Running pass: GlobalOptPass
 ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis on bar
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 3130da86fa99..47bdbfd2d357 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -147,10 +147,12 @@
 ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
+; CHECK-O-NEXT: Running pass: CoroSplitPass
 ; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
 ; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
+; CHECK-O-NEXT: Running pass: CoroCleanupPass
 ; CHECK-O-NEXT: Running pass: GlobalOptPass
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
 ; CHECK-O-NEXT: Running pass: AnnotationRemarksPass on foo
diff --git a/llvm/test/Other/print-at-pass-number.ll b/llvm/test/Other/print-at-pass-number.ll
index 770b3d703c09..4930eaa5a865 100644
--- a/llvm/test/Other/print-at-pass-number.ll
+++ b/llvm/test/Other/print-at-pass-number.ll
@@ -4,7 +4,7 @@
 ; RUN: opt -passes="loop(indvars,loop-deletion,loop-unroll-full)" -print-module-scope -print-after-pass-number=2 -S -o /dev/null %s 2>&1 | FileCheck %s --check-prefix=AFTER
 
 define i32 @bar(i32 %arg) {
-; BEFORE: *** IR Dump Before 3-IndVarSimplifyPass on bb1 ***
+; BEFORE: *** IR Dump Before 3-IndVarSimplifyPass on loop %bb1 in function bar ***
 ; BEFORE: define i32 @bar(i32 %arg) {
 ; AFTER:  *** IR Dump After 2-LCSSAPass on bar ***
 ; AFTER:  define i32 @bar(i32 %arg) {
@@ -30,8 +30,8 @@ define i32 @baz(i32 %arg) {
 
 ; NUMBER:  Running pass 1 LoopSimplifyPass on bar
 ; NUMBER-NEXT: Running pass 2 LCSSAPass on bar
-; NUMBER-NEXT: Running pass 3 IndVarSimplifyPass on bb1
-; NUMBER-NEXT: Running pass 4 LoopDeletionPass on bb1
+; NUMBER-NEXT: Running pass 3 IndVarSimplifyPass on loop %bb1 in function bar
+; NUMBER-NEXT: Running pass 4 LoopDeletionPass on loop %bb1 in function bar
 ; NUMBER-NEXT: Running pass 5 LoopSimplifyPass on baz
 ; NUMBER-NEXT: Running pass 6 LCSSAPass on baz
 ; NUMBER-NOT: Running pass
diff --git a/llvm/test/TableGen/HwModeBitSet.td b/llvm/test/TableGen/HwModeBitSet.td
new file mode 100644
index 000000000000..b2de6e8e012c
--- /dev/null
+++ b/llvm/test/TableGen/HwModeBitSet.td
@@ -0,0 +1,162 @@
+// This is to test the scenario where different HwMode attributes coexist.
+// RUN: llvm-tblgen -gen-register-info -register-info-debug -I %p/../../include %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-REG
+// RUN: llvm-tblgen -gen-subtarget -I %p/../../include %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-SUBTARGET
+
+
+include "llvm/Target/Target.td"
+
+def TestTargetInstrInfo : InstrInfo;
+
+def TestTarget : Target {
+  let InstructionSet = TestTargetInstrInfo;
+}
+
+def TestMode : HwMode<"+feat", []>;
+def TestMode1 : HwMode<"+feat1", []>;
+def TestMode2 : HwMode<"+feat2", []>;
+
+class MyReg<string n>
+  : Register<n> {
+  let Namespace = "Test";
+}
+
+class MyClass<int size, list<ValueType> types, dag registers>
+  : RegisterClass<"Test", types, size, registers> {
+  let Size = size;
+}
+
+def X0 : MyReg<"x0">;
+def X1 : MyReg<"x1">;
+def X2 : MyReg<"x2">;
+def X3 : MyReg<"x3">;
+def X4 : MyReg<"x4">;
+def X5 : MyReg<"x5">;
+def X6 : MyReg<"x6">;
+def X7 : MyReg<"x7">;
+def X8 : MyReg<"x8">;
+def X9 : MyReg<"x9">;
+def X10 : MyReg<"x10">;
+def X11 : MyReg<"x11">;
+def X12 : MyReg<"x12">;
+def X13 : MyReg<"x13">;
+def X14 : MyReg<"x14">;
+def X15 : MyReg<"x15">;
+
+def ValueModeVT : ValueTypeByHwMode<[DefaultMode, TestMode, TestMode1],
+                                    [i32,  i64, f32]>;
+
+let RegInfos = RegInfoByHwMode<[DefaultMode, TestMode],
+                               [RegInfo<32,32,32>, RegInfo<64,64,64>]> in
+def XRegs : MyClass<32, [ValueModeVT], (sequence "X%u", 0, 15)>;
+
+def sub_even : SubRegIndex<32> {
+  let SubRegRanges = SubRegRangeByHwMode<[DefaultMode, TestMode],
+                                         [SubRegRange<32>, SubRegRange<64>]>;
+}
+def sub_odd  : SubRegIndex<32, 32> {
+  let SubRegRanges = SubRegRangeByHwMode<[DefaultMode, TestMode],
+                                         [SubRegRange<32, 32>, SubRegRange<64, 64>]>;
+}
+
+def XPairs : RegisterTuples<[sub_even, sub_odd],
+                            [(decimate (rotl XRegs, 0), 2),
+                             (decimate (rotl XRegs, 1), 2)]>;
+
+let RegInfos = RegInfoByHwMode<[DefaultMode, TestMode],
+                               [RegInfo<64,64,32>, RegInfo<128,128,64>]> in
+def XPairsClass : MyClass<64, [untyped], (add XPairs)>;
+
+// Modes who are not controlling Register related features will be manipulated
+// the same as DefaultMode.
+// CHECK-REG-LABEL: RegisterClass XRegs:
+// CHECK-REG: SpillSize: { Default:32 TestMode:64 TestMode1:32 TestMode2:32 }
+// CHECK-REG: SpillAlignment: { Default:32 TestMode:64 TestMode1:32 TestMode2:32 }
+// CHECK-REG: Regs: X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+
+// CHECK-REG-LABEL: RegisterClass XPairsClass:
+// CHECK-REG: SpillSize: { Default:64 TestMode:128 TestMode1:64 TestMode2:64 }
+// CHECK-REG: SpillAlignment: { Default:32 TestMode:64 TestMode1:32 TestMode2:32 }
+// CHECK-REG: CoveredBySubRegs: 1
+// CHECK-REG: Regs: X0_X1 X2_X3 X4_X5 X6_X7 X8_X9 X10_X11 X12_X13 X14_X15
+
+// CHECK-REG-LABEL: SubRegIndex sub_even:
+// CHECK-REG: Offset: { Default:0 TestMode:0 TestMode1:0 TestMode2:0 }
+// CHECK-REG: Size: { Default:32 TestMode:64 TestMode1:32 TestMode2:32 }
+// CHECK-REG-LABEL: SubRegIndex sub_odd:
+// CHECK-REG: Offset: { Default:32 TestMode:64 TestMode1:32 TestMode2:32 }
+// CHECK-REG: Size: { Default:32 TestMode:64 TestMode1:32 TestMode2:32 }
+
+//============================================================================//
+//--------------------- Encoding/Decoding parts ------------------------------//
+//============================================================================//
+def fooTypeEncDefault : InstructionEncoding {
+  let Size = 8;
+  field bits<64> SoftFail = 0;
+  bits<64> Inst;
+  bits<8> factor;
+  let Inst{7...0} = factor;
+  let Inst{3...2} = 0b10;
+  let Inst{1...0} = 0b00;
+}
+
+def fooTypeEncA : InstructionEncoding {
+  let Size = 4;
+  field bits<32> SoftFail = 0;
+  bits<32> Inst;
+  bits<8> factor;
+  let Inst{7...0} = factor;
+  let Inst{3...2} = 0b11;
+  let Inst{1...0} = 0b00;
+}
+
+
+def foo : Instruction {
+  bits<32> Inst;
+  let OutOperandList = (outs);
+  let InOperandList = (ins i32imm:$factor);
+  let EncodingInfos = EncodingByHwMode<
+    [TestMode2, DefaultMode], [fooTypeEncA, fooTypeEncDefault]
+  >;
+  let AsmString = "foo  $factor";
+}
+
+// CHECK-SUBTARGET-LABEL: unsigned TestTargetGenSubtargetInfo::getHwModeSet() const {
+// CHECK-SUBTARGET:         unsigned Modes = 0;
+// CHECK-SUBTARGET:         if (checkFeatures("+feat")) Modes |= (1 << 0);
+// CHECK-SUBTARGET:         if (checkFeatures("+feat1")) Modes |= (1 << 1);
+// CHECK-SUBTARGET:         if (checkFeatures("+feat2")) Modes |= (1 << 2);
+// CHECK-SUBTARGET:         return Modes;
+// CHECK-SUBTARGET:       }
+// CHECK-SUBTARGET-LABEL: unsigned TestTargetGenSubtargetInfo::getHwMode(enum HwModeType type) const {
+// CHECK-SUBTARGET:         unsigned Modes = getHwModeSet();
+// CHECK-SUBTARGET:         if (!Modes)
+// CHECK-SUBTARGET:           return Modes;
+// CHECK-SUBTARGET:         switch (type) {
+// CHECK-SUBTARGET:         case HwMode_Default:
+// CHECK-SUBTARGET:           return llvm::countr_zero(Modes) + 1;
+// CHECK-SUBTARGET:         case HwMode_ValueType:
+// CHECK-SUBTARGET:           Modes &= 3;
+// CHECK-SUBTARGET:           if (!Modes)
+// CHECK-SUBTARGET:             return Modes;
+// CHECK-SUBTARGET:           if (!llvm::has_single_bit<unsigned>(Modes))
+// CHECK-SUBTARGET:             llvm_unreachable("Two or more HwModes for ValueType were found!");
+// CHECK-SUBTARGET:           return llvm::countr_zero(Modes) + 1;
+// CHECK-SUBTARGET:         case HwMode_RegInfo:
+// CHECK-SUBTARGET:           Modes &= 1;
+// CHECK-SUBTARGET:           if (!Modes)
+// CHECK-SUBTARGET:             return Modes;
+// CHECK-SUBTARGET:           if (!llvm::has_single_bit<unsigned>(Modes))
+// CHECK-SUBTARGET:             llvm_unreachable("Two or more HwModes for RegInfo were found!");
+// CHECK-SUBTARGET:           return llvm::countr_zero(Modes) + 1;
+// CHECK-SUBTARGET:         case HwMode_EncodingInfo:
+// CHECK-SUBTARGET:           Modes &= 4;
+// CHECK-SUBTARGET:           if (!Modes)
+// CHECK-SUBTARGET:             return Modes;
+// CHECK-SUBTARGET:           if (!llvm::has_single_bit<unsigned>(Modes))
+// CHECK-SUBTARGET:             llvm_unreachable("Two or more HwModes for EncodingInfo were found!");
+// CHECK-SUBTARGET:           return llvm::countr_zero(Modes) + 1;
+// CHECK-SUBTARGET:         }
+// CHECK-SUBTARGET:         llvm_unreachable("unexpected HwModeType");
+// CHECK-SUBTARGET:         return 0; // should not get here
+// CHECK-SUBTARGET:       }
+
diff --git a/llvm/test/TableGen/HwModeEncodeAPInt.td b/llvm/test/TableGen/HwModeEncodeAPInt.td
new file mode 100644
index 000000000000..43ca5edd952a
--- /dev/null
+++ b/llvm/test/TableGen/HwModeEncodeAPInt.td
@@ -0,0 +1,241 @@
+// This testcase is to test the correctness of HwMode encoding under the 'APInt' Mode.
+// RUN: llvm-tblgen -gen-emitter -I %p/../../include %s | \
+// RUN:     FileCheck %s --check-prefix=ENCODER
+
+include "llvm/Target/Target.td"
+
+def archInstrInfo : InstrInfo { }
+
+def arch : Target {
+  let InstructionSet = archInstrInfo;
+}
+
+def Myi32 : Operand<i32> {
+  let DecoderMethod = "DecodeMyi32";
+}
+
+def HasA : Predicate<"Subtarget->hasA()">;
+def HasB : Predicate<"Subtarget->hasB()">;
+
+def ModeA : HwMode<"+a", [HasA]>; // Mode 1
+def ModeB : HwMode<"+b", [HasB]>; // Mode 2
+def ModeC : HwMode<"+c", []>;     // Mode 3
+
+
+def fooTypeEncDefault : InstructionEncoding {
+  let Size = 16;
+  field bits<128> SoftFail = 0;
+  bits<128> Inst;
+  bits<8> factor;
+  let Inst{127...120} = factor;
+  let Inst{3...2} = 0b10;
+  let Inst{1...0} = 0b00;
+}
+
+def fooTypeEncA : InstructionEncoding {
+  let Size = 16;
+  field bits<128> SoftFail = 0;
+  bits<128> Inst;
+  bits<8> factor;
+  let Inst{119...112} = factor;
+  let Inst{3...2} = 0b11;
+  let Inst{1...0} = 0b00;
+}
+
+def fooTypeEncB : InstructionEncoding {
+  let Size = 16;
+  field bits<128> SoftFail = 0;
+  bits<128> Inst;
+  bits<8> factor;
+  let Inst{119...112} = factor;
+  let Inst{111...110} = 0b11;
+}
+
+def fooTypeEncC : InstructionEncoding {
+  let Size = 16;
+  field bits<128> SoftFail = 0;
+  bits<128> Inst;
+  bits<8> factor;
+  let Inst{31...24} = factor;
+  let Inst{23...21} = 0b110;
+  let Inst{1...0} = 0b11;
+}
+
+// Test for DefaultMode as a selector.
+def foo : Instruction {
+  bits<128> Inst;
+  let OutOperandList = (outs);
+  let InOperandList = (ins i32imm:$factor);
+  let EncodingInfos = EncodingByHwMode<
+  [ModeC, ModeA, ModeB, DefaultMode],
+  [fooTypeEncC, fooTypeEncA, fooTypeEncB, fooTypeEncDefault]>;
+  let AsmString = "foo  $factor";
+}
+
+def bar: Instruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins i32imm:$factor);
+  let Size = 4;
+  bits<32> Inst;
+  bits<32> SoftFail;
+  bits<8> factor;
+  let Inst{31...24} = factor;
+  let Inst{1...0} = 0b10;
+  let AsmString = "bar  $factor";
+}
+
+def baz : Instruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins i32imm:$factor);
+  bits<32> Inst;
+  let EncodingInfos = EncodingByHwMode<
+    [ModeB], [fooTypeEncA]
+  >;
+  let AsmString = "foo  $factor";
+}
+
+def unrelated: Instruction {
+  let OutOperandList = (outs);
+  let DecoderNamespace = "Alt";
+  let InOperandList = (ins i32imm:$factor);
+  let Size = 4;
+  bits<32> Inst;
+  bits<32> SoftFail;
+  bits<8> factor;
+  let Inst{31...24} = factor;
+  let Inst{1...0} = 0b10;
+  let AsmString = "unrelated  $factor";
+}
+
+// For 'bar' and 'unrelated', we didn't assign any HwModes for them,
+// they should keep the same in the following four tables.
+// For 'foo' we assigned four HwModes( includes 'DefaultMode' ),
+// it's encodings should be different in the following four tables.
+// For 'baz' we only assigned ModeB for it, so it will be presented
+// as '0' in the tables of ModeA, ModeC and Default Mode.
+// ENCODER-LABEL:   static const uint64_t InstBits[] = {
+// ENCODER:         UINT64_C(2), UINT64_C(0),       // bar
+// ENCODER:         UINT64_C(0), UINT64_C(0),       // baz
+// ENCODER:         UINT64_C(8), UINT64_C(0),       // foo
+// ENCODER:         UINT64_C(2), UINT64_C(0),       // unrelated
+// ENCODER-LABEL:   static const uint64_t InstBits_ModeA[] = {
+// ENCODER:         UINT64_C(2), UINT64_C(0),       // bar
+// ENCODER:         UINT64_C(0), UINT64_C(0),       // baz
+// ENCODER:         UINT64_C(12), UINT64_C(0),      // foo
+// ENCODER:         UINT64_C(2), UINT64_C(0),       // unrelated
+// ENCODER-LABEL:   static const uint64_t InstBits_ModeB[] = {
+// ENCODER:         UINT64_C(2), UINT64_C(0),       // bar
+// ENCODER:         UINT64_C(12), UINT64_C(0),      // baz
+// ENCODER:         UINT64_C(0), UINT64_C(211106232532992),  // foo
+// ENCODER:         UINT64_C(2), UINT64_C(0),       // unrelated
+// ENCODER-LABEL:   static const uint64_t InstBits_ModeC[] = {
+// ENCODER:         UINT64_C(2), UINT64_C(0),      // bar
+// ENCODER:         UINT64_C(0), UINT64_C(0),      // baz
+// ENCODER:         UINT64_C(12582915),  UINT64_C(0),  // foo
+// ENCODER:         UINT64_C(2),  UINT64_C(0),     // unrelated
+
+
+// ENCODER: const uint64_t *InstBitsByHw;
+// ENCODER: const unsigned opcode = MI.getOpcode();
+// ENCODER: if (Scratch.getBitWidth() != 128)
+// ENCODER:   Scratch = Scratch.zext(128);
+// ENCODER: Inst = APInt(128, ArrayRef(InstBits + opcode * 2, 2));
+// ENCODER: APInt &Value = Inst;
+// ENCODER: APInt &op = Scratch;
+// ENCODER: switch (opcode) {
+// ENCODER-LABEL: case ::bar:
+// ENCODER-LABEL: case ::unrelated:
+// ENCODER-NOT: getHwMode
+// ENCODER-LABEL: case ::foo: {
+// ENCODER: unsigned HwMode = STI.getHwMode(MCSubtargetInfo::HwMode_EncodingInfo);
+// ENCODER: switch (HwMode) {
+// ENCODER: default: llvm_unreachable("Unknown hardware mode!"); break;
+// ENCODER: case 0: InstBitsByHw = InstBits; break;
+// ENCODER: case 1: InstBitsByHw = InstBits_ModeA; break;
+// ENCODER: case 2: InstBitsByHw = InstBits_ModeB; break;
+// ENCODER: case 3: InstBitsByHw = InstBits_ModeC; break;
+// ENCODER: };
+// ENCODER: Inst = APInt(128, ArrayRef(InstBitsByHw + opcode * 2, 2));
+// ENCODER: Value = Inst;
+// ENCODER: switch (HwMode) {
+// ENCODER: default: llvm_unreachable("Unhandled HwMode");
+// ENCODER: case 0: {
+// ENCODER: op.clearAllBits();
+// ENCODER: getMachineOpValue(MI, MI.getOperand(0), op, Fixups, STI);
+// ENCODER: Value.insertBits(op.extractBitsAsZExtValue(8, 0), 120, 8);
+// ENCODER: break;
+// ENCODER: }
+// ENCODER: case 1: {
+// ENCODER: op.clearAllBits();
+// ENCODER: getMachineOpValue(MI, MI.getOperand(0), op, Fixups, STI);
+// ENCODER: Value.insertBits(op.extractBitsAsZExtValue(8, 0), 112, 8);
+// ENCODER: break;
+// ENCODER: }
+// ENCODER: case 2: {
+// ENCODER: op.clearAllBits();
+// ENCODER: getMachineOpValue(MI, MI.getOperand(0), op, Fixups, STI);
+// ENCODER: Value.insertBits(op.extractBitsAsZExtValue(8, 0), 112, 8);
+// ENCODER: break;
+// ENCODER: }
+// ENCODER: case 3: {
+// ENCODER: op.clearAllBits();
+// ENCODER: getMachineOpValue(MI, MI.getOperand(0), op, Fixups, STI);
+// ENCODER: Value.insertBits(op.extractBitsAsZExtValue(8, 0), 24, 8);
+// ENCODER: break;
+// ENCODER: }
+// ENCODER-LABEL: case ::baz: {
+// ENCODER: unsigned HwMode = STI.getHwMode(MCSubtargetInfo::HwMode_EncodingInfo);
+// ENCODER: switch (HwMode) {
+// ENCODER: default: llvm_unreachable("Unknown hardware mode!"); break;
+// ENCODER: case 2: InstBitsByHw = InstBits_ModeB; break;
+// ENCODER: };
+// ENCODER: Inst = APInt(128, ArrayRef(InstBitsByHw + opcode * 2, 2));
+// ENCODER: Value = Inst;
+// ENCODER: switch (HwMode) {
+// ENCODER: default: llvm_unreachable("Unhandled HwMode");
+// ENCODER: case 2: {
+// ENCODER: getMachineOpValue(MI, MI.getOperand(0), op, Fixups, STI);
+// ENCODER: Value.insertBits(op.extractBitsAsZExtValue(8, 0), 112, 8);
+// ENCODER: break;
+// ENCODER: }
+
+// ENCODER-LABEL: uint32_t archMCCodeEmitter::getOperandBitOffset
+// ENCODER: switch (MI.getOpcode()) {
+// ENCODER-LABEL: case ::bar:
+// ENCODER-LABEL: case ::unrelated: {
+// ENCODER-NOT: getHwMode
+// ENCODER-LABEL: case ::foo: {
+// ENCODER:   unsigned HwMode = STI.getHwMode(MCSubtargetInfo::HwMode_EncodingInfo);
+// ENCODER:   switch (HwMode) {
+// ENCODER:   default: llvm_unreachable("Unhandled HwMode");
+// ENCODER:   case 0: {
+// ENCODER:   switch (OpNum) {
+// ENCODER:   case 0:
+// ENCODER:     return 120;
+// ENCODER:   }
+// ENCODER:   break;
+// ENCODER:   }
+// ENCODER:   case 1: {
+// ENCODER:   switch (OpNum) {
+// ENCODER:   case 0:
+// ENCODER:     return 112;
+// ENCODER:   }
+// ENCODER:   break;
+// ENCODER:   }
+// ENCODER:   case 2: {
+// ENCODER:   switch (OpNum) {
+// ENCODER:   case 0:
+// ENCODER:     return 112;
+// ENCODER:   }
+// ENCODER:   break;
+// ENCODER:   }
+// ENCODER:   case 3: {
+// ENCODER:   switch (OpNum) {
+// ENCODER:   case 0:
+// ENCODER:     return 24;
+// ENCODER:   }
+// ENCODER:   break;
+// ENCODER:   }
+// ENCODER:   }
+// ENCODER:   break;
+// ENCODER: }
diff --git a/llvm/test/TableGen/HwModeEncodeDecode3.td b/llvm/test/TableGen/HwModeEncodeDecode3.td
index 8e0266b2c55a..c4d488d9d5f8 100644
--- a/llvm/test/TableGen/HwModeEncodeDecode3.td
+++ b/llvm/test/TableGen/HwModeEncodeDecode3.td
@@ -22,8 +22,9 @@ def Myi32 : Operand<i32> {
 def HasA : Predicate<"Subtarget->hasA()">;
 def HasB : Predicate<"Subtarget->hasB()">;
 
-def ModeA : HwMode<"+a", [HasA]>;
-def ModeB : HwMode<"+b", [HasB]>;
+def ModeA : HwMode<"+a", [HasA]>; // Mode 1
+def ModeB : HwMode<"+b", [HasB]>; // Mode 2
+def ModeC : HwMode<"+c", []>;     // Mode 3
 
 
 def fooTypeEncDefault : InstructionEncoding {
@@ -55,13 +56,23 @@ def fooTypeEncB : InstructionEncoding {
   let Inst{1...0} = 0b11;
 }
 
+def fooTypeEncC : InstructionEncoding {
+  let Size = 4;
+  field bits<32> SoftFail = 0;
+  bits<32> Inst;
+  bits<8> factor;
+  let Inst{31...24} = factor;
+  let Inst{23...21} = 0b110;
+  let Inst{1...0} = 0b11;
+}
+
 // Test for DefaultMode as a selector.
 def foo : Instruction {
   let OutOperandList = (outs);
   let InOperandList = (ins i32imm:$factor);
   let EncodingInfos = EncodingByHwMode<
-    [ModeA, ModeB, DefaultMode], [fooTypeEncA, fooTypeEncB, fooTypeEncDefault]
-  >;
+  [ModeC, ModeA, ModeB, DefaultMode],
+  [fooTypeEncC, fooTypeEncA, fooTypeEncB, fooTypeEncDefault]>;
   let AsmString = "foo  $factor";
 }
 
@@ -102,9 +113,9 @@ def unrelated: Instruction {
 
 
 // Under default settings, using 'HwMode' to dictate instruction encodings results in
-// significant duplication of DecoderTables. The three tables ‘DecoderTableAlt32’,
-// ‘DecoderTableAlt_ModeA32’, and ‘DecoderTableAlt_ModeB32’ are exact duplicates and
-// could effectively be merged into one.
+// significant duplication of DecoderTables. The four tables ‘DecoderTableAlt32’,
+// ‘DecoderTableAlt_ModeA32’, ‘DecoderTableAlt_ModeB32’ and 'DecoderTable_ModeC32' are
+// exact duplicates and could effectively be merged into one.
 // DECODER-LABEL: DecoderTable32[] =
 // DECODER-DAG: Opcode: bar
 // DECODER-LABEL: DecoderTable64[] =
@@ -115,6 +126,8 @@ def unrelated: Instruction {
 // DECODER-DAG: Opcode: unrelated
 // DECODER-LABEL: DecoderTableAlt_ModeB32[] =
 // DECODER-DAG: Opcode: unrelated
+// DECODER-LABEL: DecoderTableAlt_ModeC32[] =
+// DECODER-DAG: Opcode: unrelated
 // DECODER-LABEL: DecoderTable_ModeA32[] =
 // DECODER-DAG: Opcode: fooTypeEncA:foo
 // DECODER-DAG: Opcode: bar
@@ -122,9 +135,12 @@ def unrelated: Instruction {
 // DECODER-DAG: Opcode: fooTypeEncB:foo
 // DECODER-DAG: Opcode: fooTypeEncA:baz
 // DECODER-DAG: Opcode: bar
+// DECODER-LABEL: DecoderTable_ModeC32[] =
+// DECODER-DAG: Opcode: fooTypeEncC:foo
+// DECODER-DAG: Opcode: bar
 
 // Under the 'O1' optimization level, unnecessary duplicate tables will be eliminated,
-// reducing the three ‘Alt’ tables down to just one.
+// reducing the four ‘Alt’ tables down to just one.
 // DECODER-SUPPRESS-O1-LABEL: DecoderTable32[] =
 // DECODER-SUPPRESS-O1-DAG: Opcode: bar
 // DECODER-SUPPRESS-O1-LABEL: DecoderTable64[] =
@@ -138,6 +154,9 @@ def unrelated: Instruction {
 // DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncB:foo
 // DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncA:baz
 // DECODER-SUPPRESS-O1-DAG: Opcode: bar
+// DECODER-SUPPRESS-O1-LABEL: DecoderTable_ModeC32[] =
+// DECODER-SUPPRESS-O1-DAG: Opcode: fooTypeEncC:foo
+// DECODER-SUPPRESS-O1-DAG: Opcode: bar
 
 // Under the 'O2' optimization condition, instructions possessing the 'EncodingByHwMode'
 // attribute will be extracted from their original DecoderNamespace and placed into their
@@ -159,37 +178,90 @@ def unrelated: Instruction {
 // DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncB:foo
 // DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncA:baz
 // DECODER-SUPPRESS-O2-NOT: Opcode: bar
+// DECODER-SUPPRESS-O2-LABEL: DecoderTable_ModeC32[] =
+// DECODER-SUPPRESS-O2-DAG: Opcode: fooTypeEncC:foo
+// DECODER-SUPPRESS-O2-NOT: Opcode: bar
 
-// ENCODER-LABEL:   static const uint64_t InstBits_DefaultMode[] = {
+// For 'bar' and 'unrelated', we didn't assign any HwModes for them,
+// they should keep the same in the following four tables.
+// For 'foo' we assigned four HwModes( includes 'DefaultMode' ),
+// it's encodings should be different in the following four tables.
+// For 'baz' we only assigned ModeB for it, so it will be presented
+// as '0' in the tables of ModeA, ModeC and Default Mode.
+// ENCODER-LABEL:   static const uint64_t InstBits[] = {
 // ENCODER:         UINT64_C(2),        // bar
 // ENCODER:         UINT64_C(0),        // baz
 // ENCODER:         UINT64_C(8),        // foo
 // ENCODER:         UINT64_C(2),        // unrelated
-
 // ENCODER-LABEL:   static const uint64_t InstBits_ModeA[] = {
 // ENCODER:         UINT64_C(2),        // bar
 // ENCODER:         UINT64_C(0),        // baz
 // ENCODER:         UINT64_C(12),       // foo
 // ENCODER:         UINT64_C(2),        // unrelated
-
 // ENCODER-LABEL:   static const uint64_t InstBits_ModeB[] = {
 // ENCODER:         UINT64_C(2),        // bar
 // ENCODER:         UINT64_C(12),       // baz
 // ENCODER:         UINT64_C(3),        // foo
 // ENCODER:         UINT64_C(2),        // unrelated
+// ENCODER-LABEL:   static const uint64_t InstBits_ModeC[] = {
+// ENCODER:         UINT64_C(2),        // bar
+// ENCODER:         UINT64_C(0),        // baz
+// ENCODER:         UINT64_C(12582915), // foo
+// ENCODER:         UINT64_C(2),        // unrelated
 
-// ENCODER:  unsigned HwMode = STI.getHwMode();
-// ENCODER:  switch (HwMode) {
-// ENCODER:  default: llvm_unreachable("Unknown hardware mode!"); break;
-// ENCODER:  case 0: InstBits = InstBits_DefaultMode; break;
-// ENCODER:  case 1: InstBits = InstBits_ModeA; break;
-// ENCODER:  case 2: InstBits = InstBits_ModeB; break;
-// ENCODER:  };
-
-// ENCODER:     case ::foo: {
-// ENCODER:      switch (HwMode) {
-// ENCODER:      default: llvm_unreachable("Unhandled HwMode");
-// ENCODER:      case 0: {
-// ENCODER:      case 1: {
-// ENCODER:      case 2: {
-
+// ENCODER-LABEL: case ::bar:
+// ENCODER-LABEL: case ::unrelated:
+// ENCODER-NOT: getHwMode
+// ENCODER-LABEL: case ::foo: {
+// ENCODER: unsigned HwMode = STI.getHwMode(MCSubtargetInfo::HwMode_EncodingInfo);
+// ENCODER: switch (HwMode) {
+// ENCODER: default: llvm_unreachable("Unknown hardware mode!"); break;
+// ENCODER: case 0: InstBitsByHw = InstBits; break;
+// ENCODER: case 1: InstBitsByHw = InstBits_ModeA; break;
+// ENCODER: case 2: InstBitsByHw = InstBits_ModeB; break;
+// ENCODER: case 3: InstBitsByHw = InstBits_ModeC; break;
+// ENCODER: };
+// ENCODER: Value = InstBitsByHw[opcode];
+// ENCODER: switch (HwMode) {
+// ENCODER: default: llvm_unreachable("Unhandled HwMode");
+// ENCODER: case 0: {
+// ENCODER: op = getMachineOpValue(MI, MI.getOperand(0), Fixups, STI);
+// ENCODER: op &= UINT64_C(240);
+// ENCODER: Value |= op;
+// ENCODER: break;
+// ENCODER: }
+// ENCODER: case 1: {
+// ENCODER: op = getMachineOpValue(MI, MI.getOperand(0), Fixups, STI);
+// ENCODER: op &= UINT64_C(240);
+// ENCODER: Value |= op;
+// ENCODER: break;
+// ENCODER: }
+// ENCODER: case 2: {
+// ENCODER: op = getMachineOpValue(MI, MI.getOperand(0), Fixups, STI);
+// ENCODER: op &= UINT64_C(255);
+// ENCODER: op <<= 8;
+// ENCODER: Value |= op;
+// ENCODER: break;
+// ENCODER: }
+// ENCODER: case 3: {
+// ENCODER: op = getMachineOpValue(MI, MI.getOperand(0), Fixups, STI);
+// ENCODER: op &= UINT64_C(255);
+// ENCODER: op <<= 24;
+// ENCODER: Value |= op;
+// ENCODER: break;
+// ENCODER: }
+// ENCODER-LABEL: case ::baz: {
+// ENCODER: unsigned HwMode = STI.getHwMode(MCSubtargetInfo::HwMode_EncodingInfo);
+// ENCODER: switch (HwMode) {
+// ENCODER: default: llvm_unreachable("Unknown hardware mode!"); break;
+// ENCODER: case 2: InstBitsByHw = InstBits_ModeB; break;
+// ENCODER: };
+// ENCODER: Value = InstBitsByHw[opcode];
+// ENCODER: switch (HwMode) {
+// ENCODER: default: llvm_unreachable("Unhandled HwMode");
+// ENCODER: case 2: {
+// ENCODER: op = getMachineOpValue(MI, MI.getOperand(0), Fixups, STI);
+// ENCODER: op &= UINT64_C(240);
+// ENCODER: Value |= op;
+// ENCODER: break;
+// ENCODER: }
diff --git a/llvm/test/ThinLTO/X86/distributed_indexes.ll b/llvm/test/ThinLTO/X86/distributed_indexes.ll
index 4f2662b1b34e..50724e466e30 100644
--- a/llvm/test/ThinLTO/X86/distributed_indexes.ll
+++ b/llvm/test/ThinLTO/X86/distributed_indexes.ll
@@ -16,11 +16,11 @@
 ; BACKEND1-NEXT: </MODULE_STRTAB_BLOCK
 ; BACKEND1-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND1-NEXT: <VERSION
-; BACKEND1-DAG: <VALUE_GUID {{.*}} op0={{.*}}
-; BACKEND1-DAG: <VALUE_GUID {{.*}} op0={{.*}}
-; BACKEND1-DAG: <VALUE_GUID {{.*}} op0={{.*}}
-; BACKEND1-DAG: <VALUE_GUID {{.*}} op0={{.*}}
-; BACKEND1-DAG: <VALUE_GUID {{.*}} op0={{.*}}
+; BACKEND1-DAG: <VALUE_GUID op0={{.*}}
+; BACKEND1-DAG: <VALUE_GUID op0={{.*}}
+; BACKEND1-DAG: <VALUE_GUID op0={{.*}}
+; BACKEND1-DAG: <VALUE_GUID op0={{.*}}
+; BACKEND1-DAG: <VALUE_GUID op0={{.*}}
 ; BACKEND1-NEXT: <COMBINED_PROFILE {{.*}} op1=0
 ; BACKEND1-NEXT: <COMBINED_PROFILE {{.*}} op1=0
 ; BACKEND1-NEXT: <COMBINED_PROFILE {{.*}} op1=1
@@ -34,9 +34,9 @@
 ; BACKEND2-NEXT: </MODULE_STRTAB_BLOCK
 ; BACKEND2-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND2-NEXT: <VERSION
-; BACKEND2-DAG: <VALUE_GUID {{.*}} op0={{.*}}
-; BACKEND2-DAG: <VALUE_GUID {{.*}} op0={{.*}}
-; BACKEND2-DAG: <VALUE_GUID {{.*}} op0={{.*}}
+; BACKEND2-DAG: <VALUE_GUID op0={{.*}}
+; BACKEND2-DAG: <VALUE_GUID op0={{.*}}
+; BACKEND2-DAG: <VALUE_GUID op0={{.*}}
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: <COMBINED_ALIAS
diff --git a/llvm/test/Transforms/AggressiveInstCombine/strcmp.ll b/llvm/test/Transforms/AggressiveInstCombine/strcmp.ll
deleted file mode 100644
index 99dd450e6f44..000000000000
--- a/llvm/test/Transforms/AggressiveInstCombine/strcmp.ll
+++ /dev/null
@@ -1,219 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=aggressive-instcombine -S | FileCheck %s
-
-declare i32 @strcmp(ptr, ptr)
-
-@s0 = constant [1 x i8] c"\00"
-@s1 = constant [2 x i8] c"0\00"
-@s2 = constant [3 x i8] c"01\00"
-@s3 = constant [4 x i8] c"012\00"
-@s4 = constant [5 x i8] c"0123\00"
-
-; Expand strcmp(C, "x"), strcmp(C, "xy").
-
-define i1 @expand_strcmp_s0(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_s0(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s0)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s0)
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_eq_s1(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_eq_s1(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s1)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s1)
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_eq_s1_commuted(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_eq_s1_commuted(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr @s1, ptr [[C:%.*]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr @s1, ptr %C)
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_ne_s1(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_ne_s1(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s1)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s1)
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_sgt_s1(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_sgt_s1(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s1)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s1)
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_sge_s1(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_sge_s1(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s1)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s1)
-  %cmp = icmp sge i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_slt_s1(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_slt_s1(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s1)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s1)
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_sle_s1(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_sle_s1(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s1)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s1)
-  %cmp = icmp sle i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_s1_fail_1(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_s1_fail_1(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s1)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 1
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s1)
-  %cmp = icmp eq i32 %call, 1
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_s1_fail_2(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_s1_fail_2(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr @s1, ptr @s1)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr @s1, ptr @s1)
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i32 @expand_strcmp_s1_fail_3(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_s1_fail_3(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s1)
-; CHECK-NEXT:    ret i32 [[CALL]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s1)
-  ret i32 %call
-}
-
-define i1 @expand_strcmp_eq_s2(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_eq_s2(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s2)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s2)
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_ne_s2(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_ne_s2(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s2)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s2)
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_sgt_s2(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_sgt_s2(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s2)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s2)
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_sge_s2(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_sge_s2(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s2)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s2)
-  %cmp = icmp sge i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_slt_s2(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_slt_s2(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s2)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s2)
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_sle_s2(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_sle_s2(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s2)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s2)
-  %cmp = icmp sle i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_s3(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_s3(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s3)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s3)
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @expand_strcmp_s4(ptr %C) {
-; CHECK-LABEL: @expand_strcmp_s4(
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @strcmp(ptr [[C:%.*]], ptr @s4)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %call = call i32 @strcmp(ptr %C, ptr @s4)
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/strncmp-1.ll b/llvm/test/Transforms/AggressiveInstCombine/strncmp-1.ll
new file mode 100644
index 000000000000..21c82d82451d
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/strncmp-1.ll
@@ -0,0 +1,254 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=aggressive-instcombine < %s | FileCheck %s
+
+; check whether we generate the right IR
+
+declare i32 @strncmp(ptr nocapture, ptr nocapture, i64)
+declare i32 @strcmp(ptr nocapture, ptr nocapture)
+
+@s2 = constant [2 x i8] c"a\00"
+@s3 = constant [3 x i8] c"ab\00"
+@s3ff = constant [3 x i8] c"\FE\FF\00"
+
+define i1 @test_strncmp_1(ptr %s) {
+; CHECK-LABEL: define i1 @test_strncmp_1(
+; CHECK-SAME: ptr [[S:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[SUB:%.*]]
+; CHECK:       sub_0:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[S]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 97, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[NE:%.*]], label [[SUB1:%.*]]
+; CHECK:       sub_1:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[S]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 98, [[TMP6]]
+; CHECK-NEXT:    br label [[NE]]
+; CHECK:       ne:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i32 [ [[TMP2]], [[SUB]] ], [ [[TMP7]], [[SUB1]] ]
+; CHECK-NEXT:    br label [[ENTRY:%.*]]
+; CHECK:       entry.tail:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP8]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %call = tail call i32 @strncmp(ptr nonnull dereferenceable(3) @s3, ptr nonnull dereferenceable(1) %s, i64 2)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @test_strncmp_2(ptr %s) {
+; CHECK-LABEL: define i1 @test_strncmp_2(
+; CHECK-SAME: ptr [[S:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[SUB:%.*]]
+; CHECK:       sub_0:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[S]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 97, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[NE:%.*]], label [[SUB1:%.*]]
+; CHECK:       sub_1:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[S]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 98, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[NE]], label [[SUB2:%.*]]
+; CHECK:       sub_2:
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[S]], i64 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP11]]
+; CHECK-NEXT:    br label [[NE]]
+; CHECK:       ne:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ [[TMP2]], [[SUB]] ], [ [[TMP7]], [[SUB1]] ], [ [[TMP12]], [[SUB2]] ]
+; CHECK-NEXT:    br label [[ENTRY:%.*]]
+; CHECK:       entry.tail:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %call = tail call i32 @strncmp(ptr nonnull dereferenceable(3) @s3, ptr nonnull dereferenceable(1) %s, i64 3)
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @test_strncmp_3(ptr %s) {
+; CHECK-LABEL: define i1 @test_strncmp_3(
+; CHECK-SAME: ptr [[S:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[SUB:%.*]]
+; CHECK:       sub_0:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[S]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 97, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[NE:%.*]], label [[SUB1:%.*]]
+; CHECK:       sub_1:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[S]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 98, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[NE]], label [[SUB2:%.*]]
+; CHECK:       sub_2:
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[S]], i64 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP11]]
+; CHECK-NEXT:    br label [[NE]]
+; CHECK:       ne:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ [[TMP2]], [[SUB]] ], [ [[TMP7]], [[SUB1]] ], [ [[TMP12]], [[SUB2]] ]
+; CHECK-NEXT:    br label [[ENTRY:%.*]]
+; CHECK:       entry.tail:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %call = tail call i32 @strncmp(ptr nonnull dereferenceable(3) @s3, ptr nonnull dereferenceable(1) %s, i64 4)
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @test_strcmp_1(ptr %s) {
+; CHECK-LABEL: define i1 @test_strcmp_1(
+; CHECK-SAME: ptr [[S:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[SUB:%.*]]
+; CHECK:       sub_0:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[S]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 [[TMP1]], 97
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[NE:%.*]], label [[SUB1:%.*]]
+; CHECK:       sub_1:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[S]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT:    br label [[NE]]
+; CHECK:       ne:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP2]], [[SUB]] ], [ [[TMP6]], [[SUB1]] ]
+; CHECK-NEXT:    br label [[ENTRY:%.*]]
+; CHECK:       entry.tail:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[TMP7]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %call = tail call i32 @strcmp(ptr nonnull dereferenceable(1) %s, ptr nonnull dereferenceable(2) @s2)
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @test_strcmp_2(ptr %s) {
+; CHECK-LABEL: define i1 @test_strcmp_2(
+; CHECK-SAME: ptr [[S:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[SUB:%.*]]
+; CHECK:       sub_0:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[S]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 [[TMP1]], 97
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[NE:%.*]], label [[SUB1:%.*]]
+; CHECK:       sub_1:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[S]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP6]], 98
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[NE]], label [[SUB2:%.*]]
+; CHECK:       sub_2:
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[S]], i64 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    br label [[NE]]
+; CHECK:       ne:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP2]], [[SUB]] ], [ [[TMP7]], [[SUB1]] ], [ [[TMP11]], [[SUB2]] ]
+; CHECK-NEXT:    br label [[ENTRY:%.*]]
+; CHECK:       entry.tail:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[TMP12]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %call = tail call i32 @strcmp(ptr nonnull dereferenceable(1) %s, ptr nonnull dereferenceable(3) @s3)
+  %cmp = icmp sge i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @test_strcmp_3(ptr %s) {
+; CHECK-LABEL: define i1 @test_strcmp_3(
+; CHECK-SAME: ptr [[S:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[SUB:%.*]]
+; CHECK:       sub_0:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[S]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 97, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[NE:%.*]], label [[SUB1:%.*]]
+; CHECK:       sub_1:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[S]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 98, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[NE]], label [[SUB2:%.*]]
+; CHECK:       sub_2:
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[S]], i64 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP11]]
+; CHECK-NEXT:    br label [[NE]]
+; CHECK:       ne:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ [[TMP2]], [[SUB]] ], [ [[TMP7]], [[SUB1]] ], [ [[TMP12]], [[SUB2]] ]
+; CHECK-NEXT:    br label [[ENTRY:%.*]]
+; CHECK:       entry.tail:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %call = tail call i32 @strcmp(ptr nonnull dereferenceable(3) @s3, ptr nonnull dereferenceable(1) %s)
+  %cmp = icmp sle i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @test_strcmp_4(ptr %s) {
+; CHECK-LABEL: define i1 @test_strcmp_4(
+; CHECK-SAME: ptr [[S:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[SUB_0:%.*]]
+; CHECK:       sub_0:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[S]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 254, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[NE:%.*]], label [[SUB_1:%.*]]
+; CHECK:       sub_1:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[S]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 255, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[NE]], label [[SUB_2:%.*]]
+; CHECK:       sub_2:
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[S]], i64 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP11]]
+; CHECK-NEXT:    br label [[NE]]
+; CHECK:       ne:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ [[TMP2]], [[SUB_0]] ], [ [[TMP7]], [[SUB_1]] ], [ [[TMP12]], [[SUB_2]] ]
+; CHECK-NEXT:    br label [[ENTRY_TAIL:%.*]]
+; CHECK:       entry.tail:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %call = tail call i32 @strcmp(ptr nonnull dereferenceable(3) @s3ff, ptr nonnull dereferenceable(1) %s)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/strncmp-2.ll b/llvm/test/Transforms/AggressiveInstCombine/strncmp-2.ll
new file mode 100644
index 000000000000..0cc5e3f135b6
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/strncmp-2.ll
@@ -0,0 +1,147 @@
+; RUN: opt -S -passes=aggressive-instcombine -strncmp-inline-threshold=3 < %s | FileCheck --check-prefixes=CHECK,TH-3 %s
+; RUN: opt -S -passes=aggressive-instcombine -strncmp-inline-threshold=2 < %s | FileCheck --check-prefixes=CHECK,TH-2 %s
+; RUN: opt -S -passes=aggressive-instcombine -strncmp-inline-threshold=1 < %s | FileCheck --check-prefixes=CHECK,TH-1 %s
+; RUN: opt -S -passes=aggressive-instcombine -strncmp-inline-threshold=0 < %s | FileCheck --check-prefixes=CHECK,TH-0 %s
+
+declare i32 @strcmp(ptr nocapture, ptr nocapture)
+declare i32 @strncmp(ptr nocapture, ptr nocapture, i64)
+
+@s1 = constant [1 x i8] c"\00", align 1
+@s2n = constant [2 x i8] c"aa", align 1
+@s3 = constant [3 x i8] c"aa\00", align 1
+@s4 = constant [4 x i8] c"aab\00", align 1
+
+; strncmp(s, "aa", 1)
+define i1 @test_strncmp_0(ptr %s) {
+entry:
+  %call = tail call i32 @strncmp(ptr nonnull dereferenceable(1) %s, ptr nonnull dereferenceable(3) @s3, i64 1)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+; CHECK-LABEL: @test_strncmp_0(
+; CHECK: @strncmp
+
+; strncmp(s, "aa", 2)
+define i1 @test_strncmp_1(ptr %s) {
+entry:
+  %call = tail call i32 @strncmp(ptr nonnull dereferenceable(1) %s, ptr nonnull dereferenceable(3) @s3, i64 2)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+; CHECK-LABEL: @test_strncmp_1(
+; TH-3-NOT: @strncmp
+; TH-2-NOT: @strncmp
+; TH-1: @strncmp
+; TH-0: @strncmp
+
+define i1 @test_strncmp_1_dereferenceable(ptr dereferenceable(2) %s) {
+entry:
+  %call = tail call i32 @strncmp(ptr nonnull %s, ptr nonnull dereferenceable(3) @s3, i64 2)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+; CHECK-LABEL: @test_strncmp_1_dereferenceable(
+; CHECK: @strncmp
+
+define i32 @test_strncmp_1_not_comparision(ptr %s) {
+entry:
+  %call = tail call i32 @strncmp(ptr nonnull dereferenceable(1) %s, ptr nonnull dereferenceable(3) @s3, i64 2)
+  ret i32 %call
+}
+; CHECK-LABEL: @test_strncmp_1_not_comparision(
+; CHECK: @strncmp
+
+; strncmp(s, "aa", 3)
+define i1 @test_strncmp_2(ptr %s) {
+entry:
+  %call = tail call i32 @strncmp(ptr nonnull dereferenceable(1) %s, ptr nonnull dereferenceable(3) @s3, i64 3)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+; CHECK-LABEL: @test_strncmp_2(
+; TH-3-NOT: @strncmp
+; TH-2: @strncmp
+; TH-1: @strncmp
+; TH-0: @strncmp
+
+; strncmp(s, "aab", 3)
+define i1 @test_strncmp_3(ptr %s) {
+entry:
+  %call = tail call i32 @strncmp(ptr nonnull dereferenceable(1) %s, ptr nonnull dereferenceable(4) @s4, i64 3)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+; CHECK-LABEL: @test_strncmp_3(
+; TH-3-NOT: @strncmp
+
+; strncmp(s, "aab", 4)
+define i1 @test_strncmp_4(ptr %s) {
+entry:
+  %call = tail call i32 @strncmp(ptr nonnull dereferenceable(1) %s, ptr nonnull dereferenceable(4) @s4, i64 4)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+; CHECK-LABEL: @test_strncmp_4(
+; TH-3: @strncmp
+
+; strncmp(s, "aa", 2)
+define i1 @test_strncmp_5(ptr %s) {
+entry:
+  %call = tail call i32 @strncmp(ptr nonnull dereferenceable(1) %s, ptr nonnull dereferenceable(3) @s3, i64 2)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+; CHECK-LABEL: @test_strncmp_5(
+; TH-3-NOT: @strncmp
+
+; char s2[] = {'a', 'a'}
+; strncmp(s1, s2, 2)
+define i1 @test_strncmp_6(ptr %s1) {
+entry:
+  %call = tail call i32 @strncmp(ptr nonnull dereferenceable(1) %s1, ptr nonnull dereferenceable(2) @s2n, i64 2)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+; CHECK-LABEL: @test_strncmp_6(
+; TH-3-NOT: @strncmp
+
+; char s2[] = {'a', 'a'}
+; strncmp(s, s2, 3)
+define i1 @test_strncmp_7(ptr %s) {
+entry:
+  %call = tail call i32 @strncmp(ptr nonnull dereferenceable(1) %s, ptr nonnull dereferenceable(2) @s2n, i64 3)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+; CHECK-LABEL: @test_strncmp_7(
+; CHECK: @strncmp
+
+; strcmp(s, "")
+define i1 @test_strcmp_0(ptr %s) {
+entry:
+  %call = tail call i32 @strcmp(ptr nonnull dereferenceable(1) %s, ptr nonnull dereferenceable(1) @s1)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+; CHECK-LABEL: @test_strcmp_0(
+; CHECK: @strcmp
+
+; strcmp(s, "aa")
+define i1 @test_strcmp_1(ptr %s) {
+entry:
+  %call = tail call i32 @strcmp(ptr nonnull dereferenceable(1) %s, ptr nonnull dereferenceable(3) @s3)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+; CHECK-LABEL: @test_strcmp_1(
+; TH-3-NOT: @strcmp
+
+; strcmp(s, "aab")
+define i1 @test_strcmp_2(ptr %s) {
+entry:
+  %call = tail call i32 @strcmp(ptr nonnull dereferenceable(1) %s, ptr nonnull dereferenceable(4) @s4)
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+; CHECK-LABEL: @test_strcmp_2(
+; TH-3: @strcmp
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
new file mode 100644
index 000000000000..31da626e01f0
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
@@ -0,0 +1,3723 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s
+
+;---------------------------------------------------------------------
+; atomicrmw xchg
+;---------------------------------------------------------------------
+
+; xchg is supported over PCIe, so no expansion is necessary
+define float @test_atomicrmw_xchg_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
+  ret float %res
+}
+
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %res
+}
+
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret float %res
+}
+
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret float %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fadd
+;---------------------------------------------------------------------
+
+define float @test_atomicrmw_fadd_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP5]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP5]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP5]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP5]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT:    ret float [[RES]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP5]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP5]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
+  ret float %res
+}
+
+define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP5]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP5]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP5]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP5]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT:    ret float [[RES]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP5]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP5]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP5]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP5]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP5]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP5]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret float [[RES]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP5]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP5]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP5]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP5]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP5]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP5]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret float [[RES]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP5]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP5]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP5]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP5]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP5]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP5]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret float [[RES]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP5]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP5]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP5]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP5]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP5]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP5]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret float [[RES]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP5]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP5]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP5]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP5]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP5]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP5]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    ret float [[RES]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP5]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP5]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP5]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP5]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP5]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP5]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    ret float [[RES]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP5]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP5]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP5]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP5]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP5]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP5]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    ret float [[RES]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP5]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP5]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP5]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP5]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP5]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP5]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    ret float [[RES]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP5]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP5]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP5]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP5]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP5]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP5]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    ret float [[RES]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP5]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP5]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP5]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP5]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP5]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP5]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    ret float [[RES]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP5]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP5]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fadd (no return)
+;---------------------------------------------------------------------
+
+define void @test_atomicrmw_fadd_noret_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX940-NEXT:    ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
+  ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT:    ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret void
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fsub
+;---------------------------------------------------------------------
+
+define float @test_atomicrmw_fsub_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
+  ret float %res
+}
+
+define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fmax
+;---------------------------------------------------------------------
+
+define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
+  ret float %res
+}
+
+define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fmin
+;---------------------------------------------------------------------
+
+define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret float [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret float %res
+}
+
+attributes #0 = { "denormal-fp-mode-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { "denormal-fp-mode-f32"="dynamic,dynamic" }
+
+!0 = !{}
+;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
+; GFX90A: [[META0]] = !{}
+;.
+; GFX940: [[META0]] = !{}
+;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
index 8bbbcd16cb1a..35c546322e63 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
@@ -16,9 +16,7 @@
 define float @test_atomicrmw_xchg_f32_global_system(ptr addrspace(1) %ptr, float %value) {
 ; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i32 [[TMP2]] to float
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4
 ; COMMON-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst
@@ -29,9 +27,7 @@ define float @test_atomicrmw_xchg_f32_global_system(ptr addrspace(1) %ptr, float
 define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
 ; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i32 [[TMP2]] to float
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
 ; COMMON-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -39,28 +35,24 @@ define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memo
 }
 
 ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i32 [[TMP2]] to float
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret float [[RES]]
 ;
-  %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
 ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i32 [[TMP2]] to float
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret float [[RES]]
 ;
-  %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
@@ -268,7 +260,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ;
 ; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
 ; GFX940-NEXT:    ret float [[RES]]
 ;
 ; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
@@ -326,8 +318,8 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
   ret float %res
 }
 
-define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -344,7 +336,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_acc
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret float [[TMP5]]
 ;
-; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access(
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -361,7 +353,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_acc
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret float [[TMP5]]
 ;
-; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access(
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -378,7 +370,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_acc
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret float [[TMP5]]
 ;
-; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access(
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -395,12 +387,12 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_acc
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret float [[TMP5]]
 ;
-; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access(
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; GFX940-NEXT:    ret float [[RES]]
 ;
-; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access(
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -417,7 +409,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_acc
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret float [[TMP5]]
 ;
-; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access(
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -434,7 +426,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_acc
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret float [[TMP5]]
 ;
-; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access(
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -451,12 +443,12 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_acc
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret float [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
-define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -473,7 +465,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret float [[TMP5]]
 ;
-; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -490,7 +482,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret float [[TMP5]]
 ;
-; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -507,7 +499,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret float [[TMP5]]
 ;
-; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -524,12 +516,12 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret float [[TMP5]]
 ;
-; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX940-NEXT:    ret float [[RES]]
 ;
-; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -546,7 +538,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret float [[TMP5]]
 ;
-; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -563,7 +555,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret float [[TMP5]]
 ;
-; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -580,12 +572,12 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret float [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
-define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 {
-; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -602,7 +594,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret float [[TMP5]]
 ;
-; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -619,7 +611,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret float [[TMP5]]
 ;
-; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -636,7 +628,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret float [[TMP5]]
 ;
-; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -653,12 +645,12 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret float [[TMP5]]
 ;
-; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX940-NEXT:    ret float [[RES]]
 ;
-; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -675,7 +667,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret float [[TMP5]]
 ;
-; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -692,7 +684,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret float [[TMP5]]
 ;
-; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -709,12 +701,12 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret float [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
-define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
-; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -731,7 +723,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret float [[TMP5]]
 ;
-; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -748,7 +740,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret float [[TMP5]]
 ;
-; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -765,7 +757,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret float [[TMP5]]
 ;
-; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -782,12 +774,12 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret float [[TMP5]]
 ;
-; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX940-NEXT:    ret float [[RES]]
 ;
-; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -804,7 +796,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret float [[TMP5]]
 ;
-; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -821,7 +813,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret float [[TMP5]]
 ;
-; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -838,7 +830,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret float [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
@@ -1100,8 +1092,8 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
   ret float %res
 }
 
-define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1118,7 +1110,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret float [[TMP5]]
 ;
-; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1135,7 +1127,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret float [[TMP5]]
 ;
-; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1152,7 +1144,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret float [[TMP5]]
 ;
-; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1169,12 +1161,12 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret float [[TMP5]]
 ;
-; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
 ; GFX940-NEXT:    ret float [[RES]]
 ;
-; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1191,7 +1183,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret float [[TMP5]]
 ;
-; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1208,7 +1200,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret float [[TMP5]]
 ;
-; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1225,12 +1217,12 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret float [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
-define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1247,7 +1239,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret float [[TMP5]]
 ;
-; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1264,7 +1256,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret float [[TMP5]]
 ;
-; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1281,7 +1273,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret float [[TMP5]]
 ;
-; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1298,12 +1290,12 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret float [[TMP5]]
 ;
-; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
 ; GFX940-NEXT:    ret float [[RES]]
 ;
-; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1320,7 +1312,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret float [[TMP5]]
 ;
-; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1337,7 +1329,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret float [[TMP5]]
 ;
-; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1354,12 +1346,12 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret float [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
-define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 {
-; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1376,7 +1368,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret float [[TMP5]]
 ;
-; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1393,7 +1385,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret float [[TMP5]]
 ;
-; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1410,7 +1402,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret float [[TMP5]]
 ;
-; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1427,12 +1419,12 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret float [[TMP5]]
 ;
-; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
 ; GFX940-NEXT:    ret float [[RES]]
 ;
-; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1449,7 +1441,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret float [[TMP5]]
 ;
-; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1466,7 +1458,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret float [[TMP5]]
 ;
-; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1483,12 +1475,12 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret float [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
-define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
-; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
+; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1505,7 +1497,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret float [[TMP5]]
 ;
-; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1522,7 +1514,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret float [[TMP5]]
 ;
-; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1539,7 +1531,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret float [[TMP5]]
 ;
-; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1556,12 +1548,12 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret float [[TMP5]]
 ;
-; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
 ; GFX940-NEXT:    ret float [[RES]]
 ;
-; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1578,7 +1570,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret float [[TMP5]]
 ;
-; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1595,7 +1587,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret float [[TMP5]]
 ;
-; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1612,7 +1604,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret float [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
@@ -1878,8 +1870,8 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
   ret void
 }
 
-define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access(
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1896,7 +1888,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret void
 ;
-; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access(
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1913,7 +1905,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret void
 ;
-; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access(
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1930,7 +1922,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret void
 ;
-; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access(
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1947,12 +1939,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret void
 ;
-; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access(
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; GFX940-NEXT:    ret void
 ;
-; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access(
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1969,7 +1961,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret void
 ;
-; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access(
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1986,7 +1978,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret void
 ;
-; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access(
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2003,12 +1995,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret void
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2025,7 +2017,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret void
 ;
-; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2042,7 +2034,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret void
 ;
-; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2059,7 +2051,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret void
 ;
-; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2076,12 +2068,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret void
 ;
-; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX940-NEXT:    ret void
 ;
-; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2098,7 +2090,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret void
 ;
-; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2115,7 +2107,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret void
 ;
-; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2132,12 +2124,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret void
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 {
-; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2154,7 +2146,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret void
 ;
-; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2171,7 +2163,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret void
 ;
-; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2188,7 +2180,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret void
 ;
-; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2205,12 +2197,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret void
 ;
-; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX940-NEXT:    ret void
 ;
-; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2227,7 +2219,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret void
 ;
-; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2244,7 +2236,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret void
 ;
-; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2261,12 +2253,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret void
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
-; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2283,7 +2275,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret void
 ;
-; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2300,7 +2292,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret void
 ;
-; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2317,7 +2309,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret void
 ;
-; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2334,12 +2326,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret void
 ;
-; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX940-NEXT:    ret void
 ;
-; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2356,7 +2348,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret void
 ;
-; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2373,7 +2365,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret void
 ;
-; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2390,7 +2382,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret void
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret void
 }
 
@@ -2652,8 +2644,8 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
   ret void
 }
 
-define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2670,7 +2662,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret void
 ;
-; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2687,7 +2679,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret void
 ;
-; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2704,7 +2696,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret void
 ;
-; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2721,12 +2713,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret void
 ;
-; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
 ; GFX940-NEXT:    ret void
 ;
-; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2743,7 +2735,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret void
 ;
-; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2760,7 +2752,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret void
 ;
-; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2777,12 +2769,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret void
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret void
 }
 
-define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2799,7 +2791,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret void
 ;
-; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2816,7 +2808,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret void
 ;
-; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2833,7 +2825,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret void
 ;
-; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2850,12 +2842,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret void
 ;
-; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
 ; GFX940-NEXT:    ret void
 ;
-; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2872,7 +2864,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret void
 ;
-; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2889,7 +2881,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret void
 ;
-; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2906,12 +2898,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret void
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret void
 }
 
-define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 {
-; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2928,7 +2920,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret void
 ;
-; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2945,7 +2937,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret void
 ;
-; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2962,7 +2954,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret void
 ;
-; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -2979,12 +2971,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret void
 ;
-; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
 ; GFX940-NEXT:    ret void
 ;
-; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3001,7 +2993,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret void
 ;
-; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3018,7 +3010,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret void
 ;
-; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3035,12 +3027,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret void
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret void
 }
 
-define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
-; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3057,7 +3049,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret void
 ;
-; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3074,7 +3066,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret void
 ;
-; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3091,7 +3083,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret void
 ;
-; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3108,12 +3100,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret void
 ;
-; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
 ; GFX940-NEXT:    ret void
 ;
-; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3130,7 +3122,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret void
 ;
-; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3147,7 +3139,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret void
 ;
-; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3164,7 +3156,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret void
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret void
 }
 
@@ -3216,8 +3208,8 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memo
   ret float %res
 }
 
-define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3234,12 +3226,12 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory_acc
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret float [[RES]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
-define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3256,7 +3248,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memo
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret float [[RES]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
@@ -3304,8 +3296,8 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
   ret float %res
 }
 
-define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3322,12 +3314,12 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret float [[TMP5]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
-define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3344,7 +3336,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret float [[TMP5]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
@@ -3396,8 +3388,8 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
   ret float %res
 }
 
-define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3414,12 +3406,12 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory_acc
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret float [[RES]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
-define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3436,7 +3428,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret float [[RES]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
@@ -3484,8 +3476,8 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
   ret float %res
 }
 
-define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3502,12 +3494,12 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret float [[TMP6]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
-define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3524,7 +3516,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret float [[TMP6]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
@@ -3576,8 +3568,8 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
   ret float %res
 }
 
-define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3594,12 +3586,12 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory_acc
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret float [[RES]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
-define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3616,7 +3608,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret float [[RES]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
@@ -3664,8 +3656,8 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
   ret float %res
 }
 
-define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3682,12 +3674,12 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret float [[TMP6]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
-define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -3704,7 +3696,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret float [[TMP6]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
@@ -3713,5 +3705,19 @@ attributes #1 = { "denormal-fp-mode-f32"="dynamic,dynamic" }
 
 !0 = !{}
 ;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
+; GFX90A: [[META0]] = !{}
+;.
 ; GFX940: [[META0]] = !{}
 ;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
new file mode 100644
index 000000000000..a5830bd8d7c3
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
@@ -0,0 +1,1691 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s
+
+;---------------------------------------------------------------------
+; atomicrmw xchg
+;---------------------------------------------------------------------
+
+; xchg is supported over PCIe, so no expansion is necessary
+define double @test_atomicrmw_xchg_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
+  ret double %res
+}
+
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %res
+}
+
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret double %res
+}
+
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret double %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fadd
+;---------------------------------------------------------------------
+
+define double @test_atomicrmw_fadd_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP5]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP5]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP5]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[TMP5]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[TMP5]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP5]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
+  ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP5]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP5]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP5]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[TMP5]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[TMP5]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP5]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP5]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP5]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP5]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[TMP5]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[TMP5]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP5]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP5]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP5]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP5]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[TMP5]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[TMP5]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP5]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(ptr addrspace(1) %ptr, double %value) #0 {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP5]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP5]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP5]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[TMP5]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[TMP5]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP5]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(ptr addrspace(1) %ptr, double %value) #1 {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX803-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX803-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP5]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX906-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX906-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP5]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX908-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP5]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[TMP5]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX10-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX10-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[TMP5]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX11-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP5]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX12-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX12-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT:    [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT:    [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[NEWLOADED]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT:    [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT:    [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[NEWLOADED]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT:    [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT:    [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[NEWLOADED]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT:    [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT:    [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[NEWLOADED]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, double %value) #0 {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT:    [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT:    [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[NEWLOADED]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, double %value) #1 {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT:    [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT:    [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[NEWLOADED]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fsub
+;---------------------------------------------------------------------
+
+define double @test_atomicrmw_fsub_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
+  ret double %res
+}
+
+define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT:    [[TMP9:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP9]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT:    [[TMP5]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT:    [[TMP9:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP9]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT:    [[TMP5]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT:    [[TMP9:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP9]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT:    [[TMP5]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT:    [[TMP9:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT:    [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP9]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT:    [[TMP5]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fmax
+;---------------------------------------------------------------------
+
+define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
+  ret double %res
+}
+
+define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP4:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8
+; COMMON-NEXT:    [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0
+; COMMON-NEXT:    [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1
+; COMMON-NEXT:    [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP4:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8
+; COMMON-NEXT:    [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0
+; COMMON-NEXT:    [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1
+; COMMON-NEXT:    [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP4:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8
+; COMMON-NEXT:    [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0
+; COMMON-NEXT:    [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1
+; COMMON-NEXT:    [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP4:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8
+; COMMON-NEXT:    [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0
+; COMMON-NEXT:    [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1
+; COMMON-NEXT:    [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fmin
+;---------------------------------------------------------------------
+
+define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
+  ret double %res
+}
+
+define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP4:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8
+; COMMON-NEXT:    [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0
+; COMMON-NEXT:    [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1
+; COMMON-NEXT:    [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP4:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8
+; COMMON-NEXT:    [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0
+; COMMON-NEXT:    [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1
+; COMMON-NEXT:    [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP4:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8
+; COMMON-NEXT:    [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0
+; COMMON-NEXT:    [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1
+; COMMON-NEXT:    [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP4:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; COMMON-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8
+; COMMON-NEXT:    [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5)
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]])
+; COMMON-NEXT:    [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT:    [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0
+; COMMON-NEXT:    [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1
+; COMMON-NEXT:    [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret double [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret double %res
+}
+
+attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" }
+attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
+
+!0 = !{}
+;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
+; GFX90A: [[META0]] = !{}
+;.
+; GFX940: [[META0]] = !{}
+;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
index e1890da15b0c..4489b639b678 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
@@ -16,9 +16,7 @@
 define double @test_atomicrmw_xchg_f64_global_system(ptr addrspace(1) %ptr, double %value) {
 ; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i64 [[TMP2]] to double
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8
 ; COMMON-NEXT:    ret double [[RES]]
 ;
   %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst
@@ -29,9 +27,7 @@ define double @test_atomicrmw_xchg_f64_global_system(ptr addrspace(1) %ptr, doub
 define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
 ; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i64 [[TMP2]] to double
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
 ; COMMON-NEXT:    ret double [[RES]]
 ;
   %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -39,28 +35,24 @@ define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_mem
 }
 
 ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i64 [[TMP2]] to double
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret double [[RES]]
 ;
-  %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
 ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i64 [[TMP2]] to double
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret double [[RES]]
 ;
-  %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
@@ -268,7 +260,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ;
 ; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
 ; GFX940-NEXT:    ret double [[RES]]
 ;
 ; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
@@ -326,8 +318,8 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
   ret double %res
 }
 
-define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -344,7 +336,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_ac
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret double [[TMP5]]
 ;
-; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -361,7 +353,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_ac
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret double [[TMP5]]
 ;
-; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -378,7 +370,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_ac
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret double [[TMP5]]
 ;
-; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -395,12 +387,12 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_ac
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret double [[TMP5]]
 ;
-; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
 ; GFX940-NEXT:    ret double [[RES]]
 ;
-; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -417,7 +409,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_ac
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret double [[TMP5]]
 ;
-; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -434,7 +426,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_ac
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret double [[TMP5]]
 ;
-; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -451,12 +443,12 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_ac
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret double [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
-define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -473,7 +465,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret double [[TMP5]]
 ;
-; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -490,7 +482,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret double [[TMP5]]
 ;
-; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -507,7 +499,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret double [[TMP5]]
 ;
-; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -524,12 +516,12 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret double [[TMP5]]
 ;
-; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX940-NEXT:    ret double [[RES]]
 ;
-; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -546,7 +538,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret double [[TMP5]]
 ;
-; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -563,7 +555,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret double [[TMP5]]
 ;
-; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -580,12 +572,12 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret double [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
-define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(ptr addrspace(1) %ptr, double %value) #0 {
-; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(ptr addrspace(1) %ptr, double %value) #0 {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -602,7 +594,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret double [[TMP5]]
 ;
-; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -619,7 +611,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret double [[TMP5]]
 ;
-; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -636,7 +628,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret double [[TMP5]]
 ;
-; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -653,12 +645,12 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret double [[TMP5]]
 ;
-; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX940-NEXT:    ret double [[RES]]
 ;
-; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -675,7 +667,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret double [[TMP5]]
 ;
-; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -692,7 +684,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret double [[TMP5]]
 ;
-; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -709,12 +701,12 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret double [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
-define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(ptr addrspace(1) %ptr, double %value) #1 {
-; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(ptr addrspace(1) %ptr, double %value) #1 {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
 ; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -731,7 +723,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX803:       atomicrmw.end:
 ; GFX803-NEXT:    ret double [[TMP5]]
 ;
-; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
 ; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -748,7 +740,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX906:       atomicrmw.end:
 ; GFX906-NEXT:    ret double [[TMP5]]
 ;
-; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
 ; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -765,7 +757,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret double [[TMP5]]
 ;
-; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
 ; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -782,12 +774,12 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret double [[TMP5]]
 ;
-; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX940-NEXT:    ret double [[RES]]
 ;
-; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
 ; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -804,7 +796,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX10:       atomicrmw.end:
 ; GFX10-NEXT:    ret double [[TMP5]]
 ;
-; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
 ; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -821,7 +813,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX11:       atomicrmw.end:
 ; GFX11-NEXT:    ret double [[TMP5]]
 ;
-; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
 ; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -838,7 +830,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ; GFX12:       atomicrmw.end:
 ; GFX12-NEXT:    ret double [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
@@ -904,8 +896,8 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
   ret double %res
 }
 
-define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
 ; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
@@ -931,12 +923,12 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[NEWLOADED]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret double %res
 }
 
-define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
 ; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
@@ -962,12 +954,12 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[NEWLOADED]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret double %res
 }
 
-define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, double %value) #0 {
-; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, double %value) #0 {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
 ; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
@@ -993,12 +985,12 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[NEWLOADED]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret double %res
 }
 
-define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, double %value) #1 {
-; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, double %value) #1 {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
 ; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
@@ -1024,7 +1016,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[NEWLOADED]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret double %res
 }
 
@@ -1076,8 +1068,8 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_mem
   ret double %res
 }
 
-define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1094,12 +1086,12 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory_ac
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[RES]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
-define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1116,7 +1108,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_mem
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[RES]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
@@ -1182,8 +1174,8 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mod
   ret double %res
 }
 
-define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
 ; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
@@ -1209,12 +1201,12 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mod
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[TMP5]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret double %res
 }
 
-define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
 ; COMMON-NEXT:    [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
@@ -1240,7 +1232,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mod
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[TMP5]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret double %res
 }
 
@@ -1292,8 +1284,8 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
   ret double %res
 }
 
-define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1310,12 +1302,12 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory_ac
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[RES]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
-define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1332,7 +1324,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[RES]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
@@ -1398,8 +1390,8 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
   ret double %res
 }
 
-define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
 ; COMMON-NEXT:    [[TMP4:%.*]] = alloca double, align 8, addrspace(5)
@@ -1425,12 +1417,12 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[TMP6]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret double %res
 }
 
-define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
 ; COMMON-NEXT:    [[TMP4:%.*]] = alloca double, align 8, addrspace(5)
@@ -1456,7 +1448,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[TMP6]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret double %res
 }
 
@@ -1508,8 +1500,8 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
   ret double %res
 }
 
-define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1526,12 +1518,12 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory_ac
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[RES]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
-define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1548,7 +1540,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[RES]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
@@ -1614,8 +1606,8 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
   ret double %res
 }
 
-define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
 ; COMMON-NEXT:    [[TMP4:%.*]] = alloca double, align 8, addrspace(5)
@@ -1641,12 +1633,12 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[TMP6]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret double %res
 }
 
-define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
 ; COMMON-NEXT:    [[TMP4:%.*]] = alloca double, align 8, addrspace(5)
@@ -1672,7 +1664,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret double [[TMP6]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret double %res
 }
 
@@ -1681,5 +1673,19 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
 
 !0 = !{}
 ;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
+; GFX90A: [[META0]] = !{}
+;.
 ; GFX940: [[META0]] = !{}
 ;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll
index 78468b933ff5..050c0170270a 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll
@@ -697,15 +697,15 @@ define i16 @test_atomicrmw_dec_i16_flat_system_align4(ptr %ptr, i16 %value) {
 
 define half @test_atomicrmw_xchg_f16_global_system(ptr addrspace(1) %ptr, half %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_f16_global_system(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -752,15 +752,15 @@ define half @test_atomicrmw_xchg_f16_global_system_align4(ptr addrspace(1) %ptr,
 
 define half @test_atomicrmw_xchg_f16_flat_system(ptr %ptr, half %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_f16_flat_system(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -807,15 +807,15 @@ define half @test_atomicrmw_xchg_f16_flat_system_align4(ptr %ptr, half %value) {
 
 define bfloat @test_atomicrmw_xchg_bf16_flat_system(ptr %ptr, bfloat %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_bf16_flat_system(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
index 324b6d2f6596..ce8524c70af6 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand %s | FileCheck %s
-; RUN: opt -mtriple=r600-mesa-mesa3d -S -passes=atomic-expand %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand %s | FileCheck -check-prefixes=CHECK,GCN,BASE %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand %s | FileCheck -check-prefixes=CHECK,GCN,GFX940 %s
+; RUN: opt -mtriple=r600-mesa-mesa3d -S -passes=atomic-expand %s | FileCheck  -check-prefixes=CHECK,R600 %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
@@ -163,6 +164,119 @@ define i16 @test_atomicrmw_and_i16_global_agent(ptr addrspace(1) %ptr, i16 %valu
   ret i16 %res
 }
 
+define i16 @test_atomicrmw_and_i16_global_agent_align4(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
+; CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT:    ret i16 [[EXTRACTED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4
+  ret i16 %res
+}
+
+; Preserve unknown metadata
+define i16 @test_atomicrmw_and_i16_global_agent_preserve_md(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_preserve_md(
+; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]]
+; CHECK-NEXT:    [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT:    ret i16 [[EXTRACTED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, !noundef !0, !some.unknown.md !0
+  ret i16 %res
+}
+
+; Preserve unknown metadata
+define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_md(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_md(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
+; CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT:    ret i16 [[EXTRACTED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !noundef !0, !some.unknown.md !0
+  ret i16 %res
+}
+
+define i16 @test_atomicrmw_and_i16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent__amdgpu_no_remote_memory(
+; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]]
+; CHECK-NEXT:    [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT:    ret i16 [[EXTRACTED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i16 %res
+}
+
+define i16 @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_remote_memory(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
+; CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT:    ret i16 [[EXTRACTED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
+  ret i16 %res
+}
+
+define i16 @test_atomicrmw_and_i16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent__amdgpu_no_fine_grained_memory(
+; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]]
+; CHECK-NEXT:    [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT:    ret i16 [[EXTRACTED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i16 %res
+}
+
+define i16 @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_fine_grained_memory(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
+; CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT:    ret i16 [[EXTRACTED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
+  ret i16 %res
+}
+
 define i16 @test_atomicrmw_nand_i16_global_agent(ptr addrspace(1) %ptr, i16 %value) {
 ; CHECK-LABEL: @test_atomicrmw_nand_i16_global_agent(
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
@@ -902,15 +1016,15 @@ define i16 @test_atomicrmw_dec_i16_flat_agent_align4(ptr %ptr, i16 %value) {
 
 define half @test_atomicrmw_xchg_f16_global_agent(ptr addrspace(1) %ptr, half %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_f16_global_agent(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -957,15 +1071,15 @@ define half @test_atomicrmw_xchg_f16_global_agent_align4(ptr addrspace(1) %ptr,
 
 define half @test_atomicrmw_xchg_f16_flat_agent(ptr %ptr, half %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_f16_flat_agent(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1012,15 +1126,15 @@ define half @test_atomicrmw_xchg_f16_flat_agent_align4(ptr %ptr, half %value) {
 
 define bfloat @test_atomicrmw_xchg_bf16_global_agent(ptr addrspace(1) %ptr, bfloat %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_bf16_global_agent(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1064,3 +1178,10 @@ define bfloat @test_atomicrmw_xchg_bf16_global_agent_align4(ptr addrspace(1) %pt
   %res = atomicrmw xchg ptr addrspace(1) %ptr, bfloat %value syncscope("agent") seq_cst, align 4
   ret bfloat %res
 }
+
+!0 = !{}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; BASE: {{.*}}
+; GCN: {{.*}}
+; GFX940: {{.*}}
+; R600: {{.*}}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll
new file mode 100644
index 000000000000..5dbf2f6e696e
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll
@@ -0,0 +1,668 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s
+
+;---------------------------------------------------------------------
+; atomicrmw xchg
+;---------------------------------------------------------------------
+
+; xchg is supported over PCIe, so no expansion is necessary
+define i32 @test_atomicrmw_xchg_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+  ret i32 %res
+}
+
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i32 @test_atomicrmw_xchg_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; COMMON-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i32 @test_atomicrmw_xchg_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i32 @test_atomicrmw_xchg_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw add
+;---------------------------------------------------------------------
+
+; add is supported over PCIe, so no expansion is necessary
+define i32 @test_atomicrmw_add_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+  ret i32 %res
+}
+
+; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i32 @test_atomicrmw_add_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i32 @test_atomicrmw_add_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i32 @test_atomicrmw_add_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw sub
+;---------------------------------------------------------------------
+
+; expansion is necessary, sub is not supported over PCIe
+define i32 @test_atomicrmw_sub_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_sub_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_sub_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_sub_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw and
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_and_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_and_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_and_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_and_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw nand
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported
+define i32 @test_atomicrmw_nand_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[NEW:%.*]] = xor i32 [[TMP2]], -1
+; COMMON-NEXT:    [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_nand_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[NEW:%.*]] = xor i32 [[TMP2]], -1
+; COMMON-NEXT:    [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_nand_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[NEW:%.*]] = xor i32 [[TMP2]], -1
+; COMMON-NEXT:    [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_nand_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[NEW:%.*]] = xor i32 [[TMP2]], -1
+; COMMON-NEXT:    [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw or
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_or_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_or_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_or_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_or_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw xor
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_xor_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_xor_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_xor_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_xor_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw max
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_max_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_max_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_max_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_max_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw min
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_min_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_min_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_min_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_min_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw umax
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_umax_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_umax_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_umax_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_umax_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw umin
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_umin_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_umin_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_umin_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_umin_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw uinc_wrap
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_uinc_wrap_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_uinc_wrap_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_uinc_wrap_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_uinc_wrap_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw udec_wrap
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_udec_wrap_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_udec_wrap_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_udec_wrap_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_udec_wrap_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i32 [[NEWLOADED]]
+;
+  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+!0 = !{}
+;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
+; GFX90A: [[META0]] = !{}
+;.
+; GFX940: [[META0]] = !{}
+;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX11: {{.*}}
+; GFX12: {{.*}}
+; GFX803: {{.*}}
+; GFX906: {{.*}}
+; GFX908: {{.*}}
+; GFX90A: {{.*}}
+; GFX940: {{.*}}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll
index b711b9fe4edf..175f75634e70 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll
@@ -35,24 +35,24 @@ define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_fine_grained_memory
 }
 
 ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[RES]]
 ;
-  %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
 ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[RES]]
 ;
-  %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
@@ -83,24 +83,24 @@ define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory(
 }
 
 ; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[RES]]
 ;
-  %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
 ; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[RES]]
 ;
-  %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
@@ -129,23 +129,23 @@ define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory(
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
@@ -174,23 +174,23 @@ define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory(
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
@@ -239,8 +239,8 @@ define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -255,12 +255,12 @@ define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_remote_memory_acces
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -275,7 +275,7 @@ define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
@@ -304,23 +304,23 @@ define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory(p
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
@@ -349,23 +349,23 @@ define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory(
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
@@ -414,8 +414,8 @@ define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory(
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -430,12 +430,12 @@ define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_remote_memory_access
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -450,7 +450,7 @@ define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory_
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
@@ -499,8 +499,8 @@ define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory(
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -515,12 +515,12 @@ define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_remote_memory_access
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -535,7 +535,7 @@ define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory_
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
@@ -584,8 +584,8 @@ define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -600,12 +600,12 @@ define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_remote_memory_acces
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -620,7 +620,7 @@ define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
@@ -669,8 +669,8 @@ define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -685,12 +685,12 @@ define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_remote_memory_acces
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -705,7 +705,7 @@ define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
@@ -734,23 +734,23 @@ define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_m
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
@@ -779,23 +779,23 @@ define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_m
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
-define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
-; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i32 [[NEWLOADED]]
 ;
-  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll
new file mode 100644
index 000000000000..bd37f5ba88c6
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll
@@ -0,0 +1,668 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s
+
+;---------------------------------------------------------------------
+; atomicrmw xchg
+;---------------------------------------------------------------------
+
+; xchg is supported over PCIe, so no expansion is necessary
+define i64 @test_atomicrmw_xchg_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8
+; COMMON-NEXT:    ret i64 [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+  ret i64 %res
+}
+
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i64 @test_atomicrmw_xchg_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; COMMON-NEXT:    ret i64 [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i64 %res
+}
+
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i64 @test_atomicrmw_xchg_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i64 @test_atomicrmw_xchg_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[RES]]
+;
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw add
+;---------------------------------------------------------------------
+
+; add is supported over PCIe, so no expansion is necessary
+define i64 @test_atomicrmw_add_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8
+; COMMON-NEXT:    ret i64 [[RES]]
+;
+  %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+  ret i64 %res
+}
+
+; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i64 @test_atomicrmw_add_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[RES]]
+;
+  %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i64 %res
+}
+
+; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i64 @test_atomicrmw_add_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[RES]]
+;
+  %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i64 @test_atomicrmw_add_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[RES]]
+;
+  %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw sub
+;---------------------------------------------------------------------
+
+; expansion is necessary, sub is not supported over PCIe
+define i64 @test_atomicrmw_sub_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_sub_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_sub_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_sub_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw and
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_and_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_and_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_and_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_and_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw nand
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported
+define i64 @test_atomicrmw_nand_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[NEW:%.*]] = xor i64 [[TMP2]], -1
+; COMMON-NEXT:    [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_nand_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[NEW:%.*]] = xor i64 [[TMP2]], -1
+; COMMON-NEXT:    [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_nand_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[NEW:%.*]] = xor i64 [[TMP2]], -1
+; COMMON-NEXT:    [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_nand_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[NEW:%.*]] = xor i64 [[TMP2]], -1
+; COMMON-NEXT:    [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw or
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_or_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_or_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_or_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_or_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw xor
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_xor_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_xor_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_xor_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_xor_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw max
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_max_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_max_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_max_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_max_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw max ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw min
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_min_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_min_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_min_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_min_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw min ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw umax
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_umax_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_umax_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_umax_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_umax_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umax ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw umin
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_umin_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_umin_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_umin_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_umin_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw umin ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw uinc_wrap
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_uinc_wrap_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_uinc_wrap_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_uinc_wrap_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_uinc_wrap_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw udec_wrap
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_udec_wrap_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_udec_wrap_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_udec_wrap_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+define i64 @test_atomicrmw_udec_wrap_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT:    ret i64 [[NEWLOADED]]
+;
+  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i64 %res
+}
+
+!0 = !{}
+;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
+; GFX90A: [[META0]] = !{}
+;.
+; GFX940: [[META0]] = !{}
+;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX11: {{.*}}
+; GFX12: {{.*}}
+; GFX803: {{.*}}
+; GFX906: {{.*}}
+; GFX908: {{.*}}
+; GFX90A: {{.*}}
+; GFX940: {{.*}}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll
index d67bf2e450b8..ecb898d120dd 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll
@@ -35,24 +35,24 @@ define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_fine_grained_memory
 }
 
 ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[RES]]
 ;
-  %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
 ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[RES]]
 ;
-  %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
@@ -83,24 +83,24 @@ define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory(
 }
 
 ; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[RES]]
 ;
-  %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
 ; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[RES]]
 ;
-  %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
@@ -129,23 +129,23 @@ define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory(
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
@@ -174,23 +174,23 @@ define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory(
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
@@ -239,8 +239,8 @@ define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -255,12 +255,12 @@ define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_remote_memory_acces
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -275,7 +275,7 @@ define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
@@ -304,23 +304,23 @@ define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory(p
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
@@ -349,23 +349,23 @@ define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory(
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
@@ -414,8 +414,8 @@ define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory(
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -430,12 +430,12 @@ define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_remote_memory_access
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -450,7 +450,7 @@ define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory_
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
@@ -499,8 +499,8 @@ define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory(
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -515,12 +515,12 @@ define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_remote_memory_access
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -535,7 +535,7 @@ define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory_
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
@@ -584,8 +584,8 @@ define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -600,12 +600,12 @@ define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_remote_memory_acces
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -620,7 +620,7 @@ define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
@@ -669,8 +669,8 @@ define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -685,12 +685,12 @@ define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_remote_memory_acces
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -705,7 +705,7 @@ define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
@@ -734,23 +734,23 @@ define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_m
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
@@ -779,23 +779,23 @@ define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_m
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
-define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
-; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; COMMON-NEXT:    ret i64 [[NEWLOADED]]
 ;
-  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i64 %res
 }
 
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
new file mode 100644
index 000000000000..4f053af7aed9
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
@@ -0,0 +1,859 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s
+
+;---------------------------------------------------------------------
+; TODO: atomicrmw xchg
+;---------------------------------------------------------------------
+
+; ; xchg is supported over PCIe, so no expansion is necessary
+; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+;   ret <2 x bfloat> %res
+; }
+
+; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+;   ret <2 x bfloat> %res
+; }
+
+; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+;   ret <2 x bfloat> %res
+; }
+
+; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+;   ret <2 x bfloat> %res
+; }
+
+;---------------------------------------------------------------------
+; atomicrmw fadd
+;---------------------------------------------------------------------
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fsub
+;---------------------------------------------------------------------
+
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fmax
+;---------------------------------------------------------------------
+
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fmin
+;---------------------------------------------------------------------
+
+define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x bfloat> [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x bfloat> %res
+}
+
+attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" }
+attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
+
+!0 = !{}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX11: {{.*}}
+; GFX12: {{.*}}
+; GFX803: {{.*}}
+; GFX906: {{.*}}
+; GFX908: {{.*}}
+; GFX90A: {{.*}}
+; GFX940: {{.*}}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
index 01a23097008c..0339e885ca77 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
@@ -25,14 +25,14 @@
 ; }
 
 ; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0
+; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory !0
 ;   ret <2 x bfloat> %res
 ; }
 
 ; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
 ;   ret <2 x bfloat> %res
 ; }
 
@@ -84,8 +84,8 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -102,12 +102,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory !0
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -124,12 +124,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2bf16_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2bf16_daz(
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -146,12 +146,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2bf16_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2bf16_dynamic(
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -168,7 +168,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret <2 x bfloat> %res
 }
 
@@ -216,8 +216,8 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -234,12 +234,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -256,12 +256,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -278,12 +278,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -300,7 +300,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x bfloat> %res
 }
 
@@ -352,8 +352,8 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_gr
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -370,12 +370,12 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory !0
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -392,7 +392,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_gr
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret <2 x bfloat> %res
 }
 
@@ -440,8 +440,8 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -458,12 +458,12 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -480,7 +480,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[TMP5]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x bfloat> %res
 }
 
@@ -532,8 +532,8 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_gr
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -550,12 +550,12 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory !0
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -572,7 +572,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_gr
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret <2 x bfloat> %res
 }
 
@@ -620,8 +620,8 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -638,12 +638,12 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[TMP6]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -660,7 +660,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[TMP6]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x bfloat> %res
 }
 
@@ -712,8 +712,8 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_gr
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -730,12 +730,12 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory !0
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -752,7 +752,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_gr
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[RES]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret <2 x bfloat> %res
 }
 
@@ -800,8 +800,8 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -818,12 +818,12 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[TMP6]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x bfloat> %res
 }
 
-define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -840,7 +840,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x bfloat> [[TMP6]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x bfloat> %res
 }
 
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
new file mode 100644
index 000000000000..0fc6f1134ff4
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
@@ -0,0 +1,859 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s
+
+;---------------------------------------------------------------------
+; TODO: atomicrmw xchg
+;---------------------------------------------------------------------
+
+; ; xchg is supported over PCIe, so no expansion is necessary
+; define <2 x half> @test_atomicrmw_xchg_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) {
+;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
+;   ret <2 x half> %res
+; }
+
+; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+; define <2 x half> @test_atomicrmw_xchg_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+;   ret <2 x half> %res
+; }
+
+; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+; define <2 x half> @test_atomicrmw_xchg_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+;   ret <2 x half> %res
+; }
+
+; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+; define <2 x half> @test_atomicrmw_xchg_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+;   ret <2 x half> %res
+; }
+
+;---------------------------------------------------------------------
+; atomicrmw fadd
+;---------------------------------------------------------------------
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fsub
+;---------------------------------------------------------------------
+
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[RES]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; COMMON-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fmax
+;---------------------------------------------------------------------
+
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[RES]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP6]]
+;
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fmin
+;---------------------------------------------------------------------
+
+define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; COMMON:       atomicrmw.start:
+; COMMON-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; COMMON-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; COMMON-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON:       atomicrmw.end:
+; COMMON-NEXT:    ret <2 x half> [[TMP6]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+  ret <2 x half> %res
+}
+
+attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" }
+attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
+
+!0 = !{}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX11: {{.*}}
+; GFX12: {{.*}}
+; GFX803: {{.*}}
+; GFX906: {{.*}}
+; GFX908: {{.*}}
+; GFX90A: {{.*}}
+; GFX940: {{.*}}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
index 2a1824b0ca4a..20f39b5ed0c8 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
@@ -25,14 +25,14 @@
 ; }
 
 ; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0
+; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory !0
 ;   ret <2 x half> %res
 ; }
 
 ; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
-; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+;   %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
 ;   ret <2 x half> %res
 ; }
 
@@ -84,8 +84,8 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -102,12 +102,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory !0
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -124,12 +124,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_daz(
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -146,12 +146,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_dynamic(
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -168,7 +168,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret <2 x half> %res
 }
 
@@ -216,8 +216,8 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -234,12 +234,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -256,12 +256,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -278,12 +278,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -300,7 +300,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[TMP5]]
 ;
-  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x half> %res
 }
 
@@ -352,8 +352,8 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grain
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -370,12 +370,12 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_mem
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[RES]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory !0
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -392,7 +392,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grain
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[RES]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret <2 x half> %res
 }
 
@@ -440,8 +440,8 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -458,12 +458,12 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[TMP5]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -480,7 +480,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[TMP5]]
 ;
-  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x half> %res
 }
 
@@ -532,8 +532,8 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grain
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -550,12 +550,12 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_mem
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[RES]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory !0
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -572,7 +572,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grain
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[RES]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret <2 x half> %res
 }
 
@@ -620,8 +620,8 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -638,12 +638,12 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[TMP6]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -660,7 +660,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[TMP6]]
 ;
-  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x half> %res
 }
 
@@ -712,8 +712,8 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grain
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -730,12 +730,12 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_mem
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[RES]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory !0
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -752,7 +752,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grain
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[RES]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret <2 x half> %res
 }
 
@@ -800,8 +800,8 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -818,12 +818,12 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[TMP6]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x half> %res
 }
 
-define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
 ; COMMON-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
 ; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -840,7 +840,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret <2 x half> [[TMP6]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret <2 x half> %res
 }
 
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll
index 097d7b6ac577..8e5b7806a590 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll
@@ -1,12 +1,18 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=atomic-expand %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=atomic-expand -mcpu=gfx803 %s | FileCheck -check-prefixes=CHECK,GFX803 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=atomic-expand -mcpu=gfx900 %s | FileCheck -check-prefixes=CHECK,GFX900 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=atomic-expand -mcpu=gfx90a %s | FileCheck -check-prefixes=CHECK,GFX90A %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=atomic-expand -mcpu=gfx1030 %s | FileCheck -check-prefixes=CHECK,GFX10 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=atomic-expand -mcpu=gfx1100 %s | FileCheck -check-prefixes=CHECK,GFX11 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=atomic-expand -mcpu=gfx940 %s | FileCheck -check-prefixes=CHECK,GFX940 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=atomic-expand -mcpu=gfx1200 %s | FileCheck -check-prefixes=CHECK,GFX12 %s
 
 ; Test that system scoped atomicrmw or 0 is transformed to add 0.
 
 ; Transform to add
 define i32 @test_atomicrmw_or_0_global_system(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_system(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !foo.md [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -17,7 +23,7 @@ define i32 @test_atomicrmw_or_0_global_system(ptr addrspace(1) %ptr) {
 ; Transform to add
 define i32 @test_atomicrmw_or_0_global_one_as(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_one_as(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 syncscope("one-as") seq_cst, align 4
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -28,7 +34,7 @@ define i32 @test_atomicrmw_or_0_global_one_as(ptr addrspace(1) %ptr) {
 ; Transform to add
 define i32 @test_atomicrmw_or_0_flat_system(ptr %ptr) {
 ; CHECK-LABEL: define i32 @test_atomicrmw_or_0_flat_system(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr [[PTR]], i32 0 seq_cst, align 4
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -39,7 +45,7 @@ define i32 @test_atomicrmw_or_0_flat_system(ptr %ptr) {
 ; Transform to add
 define i32 @test_atomicrmw_or_0_as999_system(ptr addrspace(999) %ptr) {
 ; CHECK-LABEL: define i32 @test_atomicrmw_or_0_as999_system(
-; CHECK-SAME: ptr addrspace(999) [[PTR:%.*]]) {
+; CHECK-SAME: ptr addrspace(999) [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(999) [[PTR]], i32 0 seq_cst, align 4
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -50,7 +56,7 @@ define i32 @test_atomicrmw_or_0_as999_system(ptr addrspace(999) %ptr) {
 ; Leave as-is, only system scope should be changed.
 define i32 @test_atomicrmw_or_0_global_agent(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_agent(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -61,7 +67,7 @@ define i32 @test_atomicrmw_or_0_global_agent(ptr addrspace(1) %ptr) {
 ; Leave as-is, LDS atomics aren't relevant.
 define i32 @test_atomicrmw_or_0_local(ptr addrspace(3) %ptr) {
 ; CHECK-LABEL: define i32 @test_atomicrmw_or_0_local(
-; CHECK-SAME: ptr addrspace(3) [[PTR:%.*]]) {
+; CHECK-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RES:%.*]] = atomicrmw or ptr addrspace(3) [[PTR]], i32 0 seq_cst, align 4
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -72,7 +78,7 @@ define i32 @test_atomicrmw_or_0_local(ptr addrspace(3) %ptr) {
 ; Leave non-0 values alone.
 define i32 @test_atomicrmw_or_1_global_system(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define i32 @test_atomicrmw_or_1_global_system(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 1 seq_cst, align 4
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -82,7 +88,7 @@ define i32 @test_atomicrmw_or_1_global_system(ptr addrspace(1) %ptr) {
 
 define i32 @test_atomicrmw_or_var_global_system(ptr addrspace(1) %ptr, i32 %val) {
 ; CHECK-LABEL: define i32 @test_atomicrmw_or_var_global_system(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VAL:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VAL]] seq_cst, align 4
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -93,7 +99,7 @@ define i32 @test_atomicrmw_or_var_global_system(ptr addrspace(1) %ptr, i32 %val)
 ; Leave as-is
 define i32 @test_atomicrmw_add_0_global_system(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define i32 @test_atomicrmw_add_0_global_system(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -104,7 +110,7 @@ define i32 @test_atomicrmw_add_0_global_system(ptr addrspace(1) %ptr) {
 ; Transform to add
 define i32 @test_atomicrmw_sub_0_global_system(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define i32 @test_atomicrmw_sub_0_global_system(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !foo.md [[META0]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -115,7 +121,7 @@ define i32 @test_atomicrmw_sub_0_global_system(ptr addrspace(1) %ptr) {
 ; Transform to add
 define i32 @test_atomicrmw_xor_0_global_system(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define i32 @test_atomicrmw_xor_0_global_system(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !foo.md [[META0]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -123,18 +129,193 @@ define i32 @test_atomicrmw_xor_0_global_system(ptr addrspace(1) %ptr) {
   ret i32 %res
 }
 
+define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_fine_grained_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw or ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_remote_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw or ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw or ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
 
-define i32 @test_atomicrmw_or_0_global_system__metadata(ptr addrspace(1) %ptr) {
-; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_system__metadata(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+define i32 @test_atomicrmw_xor_0_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_xor_0_global_system__amdgpu_no_fine_grained_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
-  %res = atomicrmw or ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_xor_0_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_xor_0_global_system__amdgpu_no_remote_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_xor_0_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_xor_0_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_sub_0_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_sub_0_global_system__amdgpu_no_fine_grained_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_sub_0_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_sub_0_global_system__amdgpu_no_remote_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_sub_0_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_sub_0_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_or_0_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_agent__amdgpu_no_fine_grained_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw or ptr addrspace(1) %ptr, i32 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_or_0_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_agent__amdgpu_no_remote_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw or ptr addrspace(1) %ptr, i32 0 syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_or_0_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw or ptr addrspace(1) %ptr, i32 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_xor_0_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_xor_0_global_agent__amdgpu_no_fine_grained_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_xor_0_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_xor_0_global_agent__amdgpu_no_remote_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 0 syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_xor_0_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_xor_0_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_sub_0_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_sub_0_global_agent__amdgpu_no_fine_grained_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i32 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_sub_0_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_sub_0_global_agent__amdgpu_no_remote_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i32 0 syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret i32 %res
+}
+
+define i32 @test_atomicrmw_sub_0_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define i32 @test_atomicrmw_sub_0_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 0 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = atomicrmw sub ptr addrspace(1) %ptr, i32 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
 !0 = !{}
-;.
-; CHECK: [[META0]] = !{}
-;.
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX11: {{.*}}
+; GFX12: {{.*}}
+; GFX803: {{.*}}
+; GFX900: {{.*}}
+; GFX90A: {{.*}}
+; GFX940: {{.*}}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-store.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-store.ll
index db0c3a20e62f..9159393ab887 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-store.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-store.ll
@@ -4,8 +4,7 @@
 define void @store_atomic_f32_global_system(float %val, ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define void @store_atomic_f32_global_system(
 ; CHECK-SAME: float [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT:    store atomic i32 [[TMP1]], ptr addrspace(1) [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    store atomic float [[VAL]], ptr addrspace(1) [[PTR]] seq_cst, align 4, !some.unknown.md [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic float %val, ptr addrspace(1) %ptr seq_cst, align 4, !some.unknown.md !0
@@ -15,8 +14,7 @@ define void @store_atomic_f32_global_system(float %val, ptr addrspace(1) %ptr) {
 define void @store_atomic_f32_global_agent(float %val, ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define void @store_atomic_f32_global_agent(
 ; CHECK-SAME: float [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT:    store atomic i32 [[TMP1]], ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    store atomic float [[VAL]], ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic float %val, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 4, !some.unknown.md !0
@@ -26,8 +24,7 @@ define void @store_atomic_f32_global_agent(float %val, ptr addrspace(1) %ptr) {
 define void @store_atomic_f32_local(float %val, ptr addrspace(3) %ptr) {
 ; CHECK-LABEL: define void @store_atomic_f32_local(
 ; CHECK-SAME: float [[VAL:%.*]], ptr addrspace(3) [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT:    store atomic i32 [[TMP1]], ptr addrspace(3) [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    store atomic float [[VAL]], ptr addrspace(3) [[PTR]] seq_cst, align 4, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic float %val, ptr addrspace(3) %ptr seq_cst, align 4, !some.unknown.md !0
@@ -37,8 +34,7 @@ define void @store_atomic_f32_local(float %val, ptr addrspace(3) %ptr) {
 define void @store_atomic_f32_flat(float %val, ptr %ptr) {
 ; CHECK-LABEL: define void @store_atomic_f32_flat(
 ; CHECK-SAME: float [[VAL:%.*]], ptr [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT:    store atomic i32 [[TMP1]], ptr [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    store atomic float [[VAL]], ptr [[PTR]] seq_cst, align 4, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic float %val, ptr %ptr seq_cst, align 4, !some.unknown.md !0
@@ -48,8 +44,7 @@ define void @store_atomic_f32_flat(float %val, ptr %ptr) {
 define void @store_atomic_f16_global_system(half %val, ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define void @store_atomic_f16_global_system(
 ; CHECK-SAME: half [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VAL]] to i16
-; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr addrspace(1) [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    store atomic half [[VAL]], ptr addrspace(1) [[PTR]] seq_cst, align 4, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic half %val, ptr addrspace(1) %ptr seq_cst, align 4, !some.unknown.md !0
@@ -59,8 +54,7 @@ define void @store_atomic_f16_global_system(half %val, ptr addrspace(1) %ptr) {
 define void @store_atomic_f16_global_agent(half %val, ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define void @store_atomic_f16_global_agent(
 ; CHECK-SAME: half [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VAL]] to i16
-; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    store atomic half [[VAL]], ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic half %val, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 4, !some.unknown.md !0
@@ -70,8 +64,7 @@ define void @store_atomic_f16_global_agent(half %val, ptr addrspace(1) %ptr) {
 define void @store_atomic_f16_local(half %val, ptr addrspace(3) %ptr) {
 ; CHECK-LABEL: define void @store_atomic_f16_local(
 ; CHECK-SAME: half [[VAL:%.*]], ptr addrspace(3) [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VAL]] to i16
-; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr addrspace(3) [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    store atomic half [[VAL]], ptr addrspace(3) [[PTR]] seq_cst, align 4, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic half %val, ptr addrspace(3) %ptr seq_cst, align 4, !some.unknown.md !0
@@ -81,8 +74,7 @@ define void @store_atomic_f16_local(half %val, ptr addrspace(3) %ptr) {
 define void @store_atomic_f16_flat(half %val, ptr %ptr) {
 ; CHECK-LABEL: define void @store_atomic_f16_flat(
 ; CHECK-SAME: half [[VAL:%.*]], ptr [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VAL]] to i16
-; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    store atomic half [[VAL]], ptr [[PTR]] seq_cst, align 4, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic half %val, ptr %ptr seq_cst, align 4, !some.unknown.md !0
@@ -92,8 +84,7 @@ define void @store_atomic_f16_flat(half %val, ptr %ptr) {
 define void @store_atomic_bf16_global_system(bfloat %val, ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define void @store_atomic_bf16_global_system(
 ; CHECK-SAME: bfloat [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast bfloat [[VAL]] to i16
-; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr addrspace(1) [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    store atomic bfloat [[VAL]], ptr addrspace(1) [[PTR]] seq_cst, align 4, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic bfloat %val, ptr addrspace(1) %ptr seq_cst, align 4, !some.unknown.md !0
@@ -103,8 +94,7 @@ define void @store_atomic_bf16_global_system(bfloat %val, ptr addrspace(1) %ptr)
 define void @store_atomic_bf16_global_agent(bfloat %val, ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define void @store_atomic_bf16_global_agent(
 ; CHECK-SAME: bfloat [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast bfloat [[VAL]] to i16
-; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    store atomic bfloat [[VAL]], ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic bfloat %val, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 4, !some.unknown.md !0
@@ -114,8 +104,7 @@ define void @store_atomic_bf16_global_agent(bfloat %val, ptr addrspace(1) %ptr)
 define void @store_atomic_bf16_local(bfloat %val, ptr addrspace(3) %ptr) {
 ; CHECK-LABEL: define void @store_atomic_bf16_local(
 ; CHECK-SAME: bfloat [[VAL:%.*]], ptr addrspace(3) [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast bfloat [[VAL]] to i16
-; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr addrspace(3) [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    store atomic bfloat [[VAL]], ptr addrspace(3) [[PTR]] seq_cst, align 4, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic bfloat %val, ptr addrspace(3) %ptr seq_cst, align 4, !some.unknown.md !0
@@ -125,8 +114,7 @@ define void @store_atomic_bf16_local(bfloat %val, ptr addrspace(3) %ptr) {
 define void @store_atomic_bf16_flat(bfloat %val, ptr %ptr) {
 ; CHECK-LABEL: define void @store_atomic_bf16_flat(
 ; CHECK-SAME: bfloat [[VAL:%.*]], ptr [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast bfloat [[VAL]] to i16
-; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    store atomic bfloat [[VAL]], ptr [[PTR]] seq_cst, align 4, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic bfloat %val, ptr %ptr seq_cst, align 4, !some.unknown.md !0
@@ -135,8 +123,7 @@ define void @store_atomic_bf16_flat(bfloat %val, ptr %ptr) {
 define void @store_atomic_f64_global_system(double %val, ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define void @store_atomic_f64_global_system(
 ; CHECK-SAME: double [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double [[VAL]] to i64
-; CHECK-NEXT:    store atomic i64 [[TMP1]], ptr addrspace(1) [[PTR]] seq_cst, align 8
+; CHECK-NEXT:    store atomic double [[VAL]], ptr addrspace(1) [[PTR]] seq_cst, align 8, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic double %val, ptr addrspace(1) %ptr seq_cst, align 8, !some.unknown.md !0
@@ -146,8 +133,7 @@ define void @store_atomic_f64_global_system(double %val, ptr addrspace(1) %ptr)
 define void @store_atomic_f64_global_agent(double %val, ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: define void @store_atomic_f64_global_agent(
 ; CHECK-SAME: double [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double [[VAL]] to i64
-; CHECK-NEXT:    store atomic i64 [[TMP1]], ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 8
+; CHECK-NEXT:    store atomic double [[VAL]], ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 8, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic double %val, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 8, !some.unknown.md !0
@@ -157,8 +143,7 @@ define void @store_atomic_f64_global_agent(double %val, ptr addrspace(1) %ptr) {
 define void @store_atomic_f64_local(double %val, ptr addrspace(3) %ptr) {
 ; CHECK-LABEL: define void @store_atomic_f64_local(
 ; CHECK-SAME: double [[VAL:%.*]], ptr addrspace(3) [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double [[VAL]] to i64
-; CHECK-NEXT:    store atomic i64 [[TMP1]], ptr addrspace(3) [[PTR]] seq_cst, align 8
+; CHECK-NEXT:    store atomic double [[VAL]], ptr addrspace(3) [[PTR]] seq_cst, align 8, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic double %val, ptr addrspace(3) %ptr seq_cst, align 8, !some.unknown.md !0
@@ -168,8 +153,7 @@ define void @store_atomic_f64_local(double %val, ptr addrspace(3) %ptr) {
 define void @store_atomic_f64_flat(double %val, ptr %ptr) {
 ; CHECK-LABEL: define void @store_atomic_f64_flat(
 ; CHECK-SAME: double [[VAL:%.*]], ptr [[PTR:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double [[VAL]] to i64
-; CHECK-NEXT:    store atomic i64 [[TMP1]], ptr [[PTR]] seq_cst, align 8
+; CHECK-NEXT:    store atomic double [[VAL]], ptr [[PTR]] seq_cst, align 8, !some.unknown.md [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   store atomic double %val, ptr %ptr seq_cst, align 8, !some.unknown.md !0
@@ -177,3 +161,6 @@ define void @store_atomic_f64_flat(double %val, ptr %ptr) {
 }
 
 !0 = !{}
+;.
+; CHECK: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-02-01-ReturnAttrs.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-02-01-ReturnAttrs.ll
index 8254ab623044..adac17e62807 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-02-01-ReturnAttrs.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-02-01-ReturnAttrs.ll
@@ -9,8 +9,8 @@ define internal i32 @deref(ptr %x) nounwind {
 ; CGSCC-NEXT:  entry:
 ; CGSCC-NEXT:    [[X_PRIV:%.*]] = alloca i32, align 4
 ; CGSCC-NEXT:    store i32 [[TMP0]], ptr [[X_PRIV]], align 4
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = load i32, ptr [[X_PRIV]], align 4
-; CGSCC-NEXT:    ret i32 [[TRUETMP2]]
+; CGSCC-NEXT:    [[TMP2:%.*]] = load i32, ptr [[X_PRIV]], align 4
+; CGSCC-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
   %tmp2 = load i32, ptr %x, align 4
@@ -32,8 +32,8 @@ define i32 @f(i32 %x) {
 ; CGSCC-NEXT:  entry:
 ; CGSCC-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
 ; CGSCC-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
-; CGSCC-NEXT:    [[TRUETMP1:%.*]] = call i32 @deref(i32 [[X]]) #[[ATTR2:[0-9]+]]
-; CGSCC-NEXT:    ret i32 [[TRUETMP1]]
+; CGSCC-NEXT:    [[TMP1:%.*]] = call i32 @deref(i32 [[X]]) #[[ATTR2:[0-9]+]]
+; CGSCC-NEXT:    ret i32 [[TMP1]]
 ;
 entry:
   %x_addr = alloca i32
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-09-08-CGUpdateSelfEdge.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-09-08-CGUpdateSelfEdge.ll
index 0cc7cbac057f..c5f7aacab57b 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-09-08-CGUpdateSelfEdge.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-09-08-CGUpdateSelfEdge.ll
@@ -46,7 +46,9 @@ bb14:		; preds = %entry
   ret i32 0
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+;.
+; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
 ;.
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; TUNIT: {{.*}}
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll
index 71e8bd11e9c7..23415d3d3262 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll
@@ -27,11 +27,11 @@ define void @no_promote(ptr %arg) #1 {
 ; TUNIT-SAME: (ptr nocapture nofree writeonly [[ARG:%.*]]) #[[ATTR1:[0-9]+]] {
 ; TUNIT-NEXT:  bb:
 ; TUNIT-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
-; TUNIT-NEXT:    [[TRUETMP2:%.*]] = alloca <4 x i64>, align 32
+; TUNIT-NEXT:    [[TMP2:%.*]] = alloca <4 x i64>, align 32
 ; TUNIT-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR3:[0-9]+]]
-; TUNIT-NEXT:    call fastcc void @no_promote_avx2(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TRUETMP2]], ptr noalias nocapture nofree noundef nonnull readonly align 32 dereferenceable(32) [[TMP]]) #[[ATTR4:[0-9]+]]
-; TUNIT-NEXT:    [[TRUETMP4:%.*]] = load <4 x i64>, ptr [[TRUETMP2]], align 32
-; TUNIT-NEXT:    store <4 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; TUNIT-NEXT:    call fastcc void @no_promote_avx2(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP2]], ptr noalias nocapture nofree noundef nonnull readonly align 32 dereferenceable(32) [[TMP]]) #[[ATTR4:[0-9]+]]
+; TUNIT-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
+; TUNIT-NEXT:    store <4 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; TUNIT-NEXT:    ret void
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) uwtable
@@ -39,11 +39,11 @@ define void @no_promote(ptr %arg) #1 {
 ; CGSCC-SAME: (ptr nocapture nofree noundef nonnull writeonly align 2 dereferenceable(32) [[ARG:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CGSCC-NEXT:  bb:
 ; CGSCC-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = alloca <4 x i64>, align 32
+; CGSCC-NEXT:    [[TMP2:%.*]] = alloca <4 x i64>, align 32
 ; CGSCC-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR3:[0-9]+]]
-; CGSCC-NEXT:    call fastcc void @no_promote_avx2(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TRUETMP2]], ptr noalias nocapture nofree noundef nonnull readonly align 32 dereferenceable(32) [[TMP]]) #[[ATTR4:[0-9]+]]
-; CGSCC-NEXT:    [[TRUETMP4:%.*]] = load <4 x i64>, ptr [[TRUETMP2]], align 32
-; CGSCC-NEXT:    store <4 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; CGSCC-NEXT:    call fastcc void @no_promote_avx2(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP2]], ptr noalias nocapture nofree noundef nonnull readonly align 32 dereferenceable(32) [[TMP]]) #[[ATTR4:[0-9]+]]
+; CGSCC-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
+; CGSCC-NEXT:    store <4 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; CGSCC-NEXT:    ret void
 ;
 bb:
@@ -79,12 +79,12 @@ define void @promote(ptr %arg) #0 {
 ; TUNIT-SAME: (ptr nocapture nofree writeonly [[ARG:%.*]]) #[[ATTR0]] {
 ; TUNIT-NEXT:  bb:
 ; TUNIT-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
-; TUNIT-NEXT:    [[TRUETMP2:%.*]] = alloca <4 x i64>, align 32
+; TUNIT-NEXT:    [[TMP2:%.*]] = alloca <4 x i64>, align 32
 ; TUNIT-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR3]]
 ; TUNIT-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
-; TUNIT-NEXT:    call fastcc void @promote_avx2(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TRUETMP2]], <4 x i64> [[TMP0]]) #[[ATTR4]]
-; TUNIT-NEXT:    [[TRUETMP4:%.*]] = load <4 x i64>, ptr [[TRUETMP2]], align 32
-; TUNIT-NEXT:    store <4 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; TUNIT-NEXT:    call fastcc void @promote_avx2(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64> [[TMP0]]) #[[ATTR4]]
+; TUNIT-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
+; TUNIT-NEXT:    store <4 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; TUNIT-NEXT:    ret void
 ;
 ; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
@@ -92,12 +92,12 @@ define void @promote(ptr %arg) #0 {
 ; CGSCC-SAME: (ptr nocapture nofree noundef nonnull writeonly align 2 dereferenceable(32) [[ARG:%.*]]) #[[ATTR0]] {
 ; CGSCC-NEXT:  bb:
 ; CGSCC-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = alloca <4 x i64>, align 32
+; CGSCC-NEXT:    [[TMP2:%.*]] = alloca <4 x i64>, align 32
 ; CGSCC-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR3]]
 ; CGSCC-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
-; CGSCC-NEXT:    call fastcc void @promote_avx2(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TRUETMP2]], <4 x i64> [[TMP0]]) #[[ATTR4]]
-; CGSCC-NEXT:    [[TRUETMP4:%.*]] = load <4 x i64>, ptr [[TRUETMP2]], align 32
-; CGSCC-NEXT:    store <4 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; CGSCC-NEXT:    call fastcc void @promote_avx2(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64> [[TMP0]]) #[[ATTR4]]
+; CGSCC-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
+; CGSCC-NEXT:    store <4 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; CGSCC-NEXT:    ret void
 ;
 bb:
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
index 321714849c85..f0bcf68b6444 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
@@ -32,12 +32,12 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr %arg)
 ; TUNIT-SAME: (ptr nocapture nofree writeonly [[ARG:%.*]]) #[[ATTR0]] {
 ; TUNIT-NEXT:  bb:
 ; TUNIT-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; TUNIT-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; TUNIT-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; TUNIT-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5:[0-9]+]]
 ; TUNIT-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[TMP]], align 64
-; TUNIT-NEXT:    call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], <8 x i64> [[TMP0]]) #[[ATTR6:[0-9]+]]
-; TUNIT-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; TUNIT-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; TUNIT-NEXT:    call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR6:[0-9]+]]
+; TUNIT-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; TUNIT-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; TUNIT-NEXT:    ret void
 ;
 ; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
@@ -45,12 +45,12 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr %arg)
 ; CGSCC-SAME: (ptr nocapture nofree noundef nonnull writeonly align 2 dereferenceable(64) [[ARG:%.*]]) #[[ATTR0]] {
 ; CGSCC-NEXT:  bb:
 ; CGSCC-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; CGSCC-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; CGSCC-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5:[0-9]+]]
 ; CGSCC-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[TMP]], align 64
-; CGSCC-NEXT:    call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], <8 x i64> [[TMP0]]) #[[ATTR6:[0-9]+]]
-; CGSCC-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; CGSCC-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; CGSCC-NEXT:    call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR6:[0-9]+]]
+; CGSCC-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; CGSCC-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; CGSCC-NEXT:    ret void
 ;
 bb:
@@ -89,12 +89,12 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr %arg)
 ; TUNIT-SAME: (ptr nocapture nofree writeonly [[ARG:%.*]]) #[[ATTR1]] {
 ; TUNIT-NEXT:  bb:
 ; TUNIT-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; TUNIT-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; TUNIT-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; TUNIT-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5]]
 ; TUNIT-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[TMP]], align 64
-; TUNIT-NEXT:    call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
-; TUNIT-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; TUNIT-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; TUNIT-NEXT:    call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
+; TUNIT-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; TUNIT-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; TUNIT-NEXT:    ret void
 ;
 ; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
@@ -102,12 +102,12 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr %arg)
 ; CGSCC-SAME: (ptr nocapture nofree noundef nonnull writeonly align 2 dereferenceable(64) [[ARG:%.*]]) #[[ATTR1]] {
 ; CGSCC-NEXT:  bb:
 ; CGSCC-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; CGSCC-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; CGSCC-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5]]
 ; CGSCC-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[TMP]], align 64
-; CGSCC-NEXT:    call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
-; CGSCC-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; CGSCC-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; CGSCC-NEXT:    call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
+; CGSCC-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; CGSCC-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; CGSCC-NEXT:    ret void
 ;
 bb:
@@ -146,12 +146,12 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr %arg)
 ; TUNIT-SAME: (ptr nocapture nofree writeonly [[ARG:%.*]]) #[[ATTR0]] {
 ; TUNIT-NEXT:  bb:
 ; TUNIT-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; TUNIT-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; TUNIT-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; TUNIT-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5]]
 ; TUNIT-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[TMP]], align 64
-; TUNIT-NEXT:    call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
-; TUNIT-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; TUNIT-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; TUNIT-NEXT:    call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
+; TUNIT-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; TUNIT-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; TUNIT-NEXT:    ret void
 ;
 ; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
@@ -159,12 +159,12 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr %arg)
 ; CGSCC-SAME: (ptr nocapture nofree noundef nonnull writeonly align 2 dereferenceable(64) [[ARG:%.*]]) #[[ATTR0]] {
 ; CGSCC-NEXT:  bb:
 ; CGSCC-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; CGSCC-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; CGSCC-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5]]
 ; CGSCC-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[TMP]], align 64
-; CGSCC-NEXT:    call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
-; CGSCC-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; CGSCC-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; CGSCC-NEXT:    call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
+; CGSCC-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; CGSCC-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; CGSCC-NEXT:    ret void
 ;
 bb:
@@ -203,12 +203,12 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr %arg)
 ; TUNIT-SAME: (ptr nocapture nofree writeonly [[ARG:%.*]]) #[[ATTR1]] {
 ; TUNIT-NEXT:  bb:
 ; TUNIT-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; TUNIT-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; TUNIT-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; TUNIT-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5]]
 ; TUNIT-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[TMP]], align 64
-; TUNIT-NEXT:    call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
-; TUNIT-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; TUNIT-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; TUNIT-NEXT:    call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
+; TUNIT-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; TUNIT-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; TUNIT-NEXT:    ret void
 ;
 ; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
@@ -216,12 +216,12 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr %arg)
 ; CGSCC-SAME: (ptr nocapture nofree noundef nonnull writeonly align 2 dereferenceable(64) [[ARG:%.*]]) #[[ATTR1]] {
 ; CGSCC-NEXT:  bb:
 ; CGSCC-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; CGSCC-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; CGSCC-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5]]
 ; CGSCC-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[TMP]], align 64
-; CGSCC-NEXT:    call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
-; CGSCC-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; CGSCC-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; CGSCC-NEXT:    call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
+; CGSCC-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; CGSCC-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; CGSCC-NEXT:    ret void
 ;
 bb:
@@ -258,11 +258,11 @@ define void @avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr %arg)
 ; TUNIT-SAME: (ptr nocapture nofree writeonly [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
 ; TUNIT-NEXT:  bb:
 ; TUNIT-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; TUNIT-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; TUNIT-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; TUNIT-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5]]
-; TUNIT-NEXT:    call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], ptr noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) #[[ATTR6]]
-; TUNIT-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; TUNIT-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; TUNIT-NEXT:    call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], ptr noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) #[[ATTR6]]
+; TUNIT-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; TUNIT-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; TUNIT-NEXT:    ret void
 ;
 ; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
@@ -270,11 +270,11 @@ define void @avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr %arg)
 ; CGSCC-SAME: (ptr nocapture nofree noundef nonnull writeonly align 2 dereferenceable(64) [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
 ; CGSCC-NEXT:  bb:
 ; CGSCC-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; CGSCC-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; CGSCC-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5]]
-; CGSCC-NEXT:    call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], ptr noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) #[[ATTR6]]
-; CGSCC-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; CGSCC-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; CGSCC-NEXT:    call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], ptr noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) #[[ATTR6]]
+; CGSCC-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; CGSCC-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; CGSCC-NEXT:    ret void
 ;
 bb:
@@ -311,11 +311,11 @@ define void @avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr %arg)
 ; TUNIT-SAME: (ptr nocapture nofree writeonly [[ARG:%.*]]) #[[ATTR1]] {
 ; TUNIT-NEXT:  bb:
 ; TUNIT-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; TUNIT-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; TUNIT-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; TUNIT-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5]]
-; TUNIT-NEXT:    call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], ptr noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) #[[ATTR6]]
-; TUNIT-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; TUNIT-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; TUNIT-NEXT:    call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], ptr noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) #[[ATTR6]]
+; TUNIT-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; TUNIT-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; TUNIT-NEXT:    ret void
 ;
 ; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
@@ -323,11 +323,11 @@ define void @avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr %arg)
 ; CGSCC-SAME: (ptr nocapture nofree noundef nonnull writeonly align 2 dereferenceable(64) [[ARG:%.*]]) #[[ATTR1]] {
 ; CGSCC-NEXT:  bb:
 ; CGSCC-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; CGSCC-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; CGSCC-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5]]
-; CGSCC-NEXT:    call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], ptr noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) #[[ATTR6]]
-; CGSCC-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; CGSCC-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; CGSCC-NEXT:    call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], ptr noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]]) #[[ATTR6]]
+; CGSCC-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; CGSCC-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; CGSCC-NEXT:    ret void
 ;
 bb:
@@ -366,12 +366,12 @@ define void @avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr %arg) #4 {
 ; TUNIT-SAME: (ptr nocapture nofree writeonly [[ARG:%.*]]) #[[ATTR3]] {
 ; TUNIT-NEXT:  bb:
 ; TUNIT-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; TUNIT-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; TUNIT-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; TUNIT-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5]]
 ; TUNIT-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[TMP]], align 64
-; TUNIT-NEXT:    call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
-; TUNIT-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; TUNIT-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; TUNIT-NEXT:    call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
+; TUNIT-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; TUNIT-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; TUNIT-NEXT:    ret void
 ;
 ; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
@@ -379,12 +379,12 @@ define void @avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr %arg) #4 {
 ; CGSCC-SAME: (ptr nocapture nofree noundef nonnull writeonly align 2 dereferenceable(64) [[ARG:%.*]]) #[[ATTR3]] {
 ; CGSCC-NEXT:  bb:
 ; CGSCC-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; CGSCC-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; CGSCC-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5]]
 ; CGSCC-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[TMP]], align 64
-; CGSCC-NEXT:    call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
-; CGSCC-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; CGSCC-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; CGSCC-NEXT:    call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
+; CGSCC-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; CGSCC-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; CGSCC-NEXT:    ret void
 ;
 bb:
@@ -423,12 +423,12 @@ define void @avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr %arg) #3 {
 ; TUNIT-SAME: (ptr nocapture nofree writeonly [[ARG:%.*]]) #[[ATTR3]] {
 ; TUNIT-NEXT:  bb:
 ; TUNIT-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; TUNIT-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; TUNIT-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; TUNIT-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5]]
 ; TUNIT-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[TMP]], align 64
-; TUNIT-NEXT:    call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
-; TUNIT-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; TUNIT-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; TUNIT-NEXT:    call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
+; TUNIT-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; TUNIT-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; TUNIT-NEXT:    ret void
 ;
 ; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
@@ -436,12 +436,12 @@ define void @avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr %arg) #3 {
 ; CGSCC-SAME: (ptr nocapture nofree noundef nonnull writeonly align 2 dereferenceable(64) [[ARG:%.*]]) #[[ATTR3]] {
 ; CGSCC-NEXT:  bb:
 ; CGSCC-NEXT:    [[TMP:%.*]] = alloca <8 x i64>, align 32
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = alloca <8 x i64>, align 32
+; CGSCC-NEXT:    [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; CGSCC-NEXT:    call void @llvm.memset.p0.i64(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP]], i8 noundef 0, i64 noundef 32, i1 noundef false) #[[ATTR5]]
 ; CGSCC-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[TMP]], align 64
-; CGSCC-NEXT:    call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TRUETMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
-; CGSCC-NEXT:    [[TRUETMP4:%.*]] = load <8 x i64>, ptr [[TRUETMP2]], align 64
-; CGSCC-NEXT:    store <8 x i64> [[TRUETMP4]], ptr [[ARG]], align 2
+; CGSCC-NEXT:    call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]]) #[[ATTR6]]
+; CGSCC-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 64
+; CGSCC-NEXT:    store <8 x i64> [[TMP4]], ptr [[ARG]], align 2
 ; CGSCC-NEXT:    ret void
 ;
 bb:
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/thiscall.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/thiscall.ll
index 46fdcdd51725..71fd087c8ac1 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/thiscall.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/thiscall.ll
@@ -58,6 +58,9 @@ declare void @ext(ptr inalloca(<{ %struct.a }>))
 declare ptr @llvm.stacksave()
 declare void @llvm.stackrestore(ptr)
 ;.
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nofree willreturn }
+; TUNIT: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn }
+; TUNIT: attributes #[[ATTR1]] = { nofree willreturn }
+;.
+; CGSCC: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn }
+; CGSCC: attributes #[[ATTR1]] = { nofree willreturn }
 ;.
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/aggregate-promote.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/aggregate-promote.ll
index 35144183d7d9..a567af50e260 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/aggregate-promote.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/aggregate-promote.ll
@@ -6,7 +6,7 @@
 @G = constant %T { i32 0, i32 0, i32 17, i32 25 }
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = constant [[T:%.*]] { i32 0, i32 0, i32 17, i32 25 }
+; CHECK: @G = constant %T { i32 0, i32 0, i32 17, i32 25 }
 ;.
 define internal i32 @test(ptr %p) {
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/alloca-as.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/alloca-as.ll
index 88483f0b4689..9e2cd06d26ea 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/alloca-as.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/alloca-as.ll
@@ -51,5 +51,7 @@ entry:
 
 declare void @use(i32)
 ;.
-; CHECK: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) }
+; TUNIT: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) }
+;.
+; CGSCC: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) }
 ;.
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll
index 877071c1a3fe..9ce752aa95ee 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll
@@ -17,23 +17,23 @@ define internal i32 @f(ptr byval(%struct.ss) %b, ptr byval(i32) %X, i32 %i) noun
 ; CHECK-NEXT:    store i32 [[TMP0]], ptr [[B_PRIV]], align 4
 ; CHECK-NEXT:    [[B_PRIV_B4:%.*]] = getelementptr i8, ptr [[B_PRIV]], i64 4
 ; CHECK-NEXT:    store i64 [[TMP1]], ptr [[B_PRIV_B4]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT:    store i32 [[TMP2]], ptr [[B_PRIV]], align 8
+; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr [[B_PRIV]], align 8
+; CHECK-NEXT:    [[VAL2:%.*]] = add i32 [[VAL1]], 1
+; CHECK-NEXT:    store i32 [[VAL2]], ptr [[B_PRIV]], align 8
 ; CHECK-NEXT:    store i32 0, ptr [[X_PRIV]], align 4
 ; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[X_PRIV]], align 4
-; CHECK-NEXT:    [[A:%.*]] = add i32 [[L]], [[TMP2]]
+; CHECK-NEXT:    [[A:%.*]] = add i32 [[L]], [[VAL2]]
 ; CHECK-NEXT:    ret i32 [[A]]
 ;
 entry:
 
-  %tmp1 = load i32, ptr %b, align 4
-  %tmp2 = add i32 %tmp1, 1
-  store i32 %tmp2, ptr %b, align 4
+  %val1 = load i32, ptr %b, align 4
+  %val2 = add i32 %val1, 1
+  store i32 %val2, ptr %b, align 4
 
   store i32 %i, ptr %X
   %l = load i32, ptr %X
-  %a = add i32 %l, %tmp2
+  %a = add i32 %l, %val2
   ret i32 %a
 }
 
@@ -46,7 +46,7 @@ define i32 @test(ptr %X) {
 ; TUNIT-NEXT:  entry:
 ; TUNIT-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; TUNIT-NEXT:    store i32 1, ptr [[S]], align 8
-; TUNIT-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; TUNIT-NEXT:    [[VAL4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
 ; TUNIT-NEXT:    [[TMP0:%.*]] = load i32, ptr [[S]], align 8
 ; TUNIT-NEXT:    [[S_B4:%.*]] = getelementptr i8, ptr [[S]], i64 4
 ; TUNIT-NEXT:    [[TMP1:%.*]] = load i64, ptr [[S_B4]], align 8
@@ -59,7 +59,7 @@ define i32 @test(ptr %X) {
 ; CGSCC-SAME: (ptr nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CGSCC-NEXT:  entry:
 ; CGSCC-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
-; CGSCC-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; CGSCC-NEXT:    [[VAL4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
 ; CGSCC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4
 ; CGSCC-NEXT:    [[C:%.*]] = call i32 @f(i32 noundef 1, i64 noundef 2, i32 [[TMP0]]) #[[ATTR2:[0-9]+]]
 ; CGSCC-NEXT:    ret i32 [[C]]
@@ -67,8 +67,8 @@ define i32 @test(ptr %X) {
 entry:
   %S = alloca %struct.ss
   store i32 1, ptr %S, align 8
-  %tmp4 = getelementptr %struct.ss, ptr %S, i32 0, i32 1
-  store i64 2, ptr %tmp4, align 4
+  %val4 = getelementptr %struct.ss, ptr %S, i32 0, i32 1
+  store i64 2, ptr %val4, align 4
 
   %c = call i32 @f(ptr byval(%struct.ss) %S, ptr byval(i32) %X, i32 zeroext 0)
 
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll
index b76254f66090..9f7acd579b27 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll
@@ -15,16 +15,16 @@ define internal void @f(ptr byval(%struct.ss)  %b, ptr byval(i32) %X) nounwind
 ; CHECK-NEXT:    store i32 [[TMP0]], ptr [[B_PRIV]], align 4
 ; CHECK-NEXT:    [[B_PRIV_B4:%.*]] = getelementptr i8, ptr [[B_PRIV]], i64 4
 ; CHECK-NEXT:    store i64 [[TMP1]], ptr [[B_PRIV_B4]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT:    store i32 [[TMP2]], ptr [[B_PRIV]], align 8
+; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr [[B_PRIV]], align 8
+; CHECK-NEXT:    [[VAL2:%.*]] = add i32 [[VAL1]], 1
+; CHECK-NEXT:    store i32 [[VAL2]], ptr [[B_PRIV]], align 8
 ; CHECK-NEXT:    store i32 0, ptr [[X_PRIV]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %tmp1 = load i32, ptr %b, align 4
-  %tmp2 = add i32 %tmp1, 1
-  store i32 %tmp2, ptr %b, align 4
+  %val1 = load i32, ptr %b, align 4
+  %val2 = add i32 %val1, 1
+  store i32 %val2, ptr %b, align 4
 
   store i32 0, ptr %X
   ret void
@@ -38,7 +38,7 @@ define i32 @test(ptr %X) {
 ; TUNIT-NEXT:  entry:
 ; TUNIT-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; TUNIT-NEXT:    store i32 1, ptr [[S]], align 8
-; TUNIT-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; TUNIT-NEXT:    [[VAL4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
 ; TUNIT-NEXT:    [[TMP0:%.*]] = load i32, ptr [[S]], align 8
 ; TUNIT-NEXT:    [[S_B4:%.*]] = getelementptr i8, ptr [[S]], i64 4
 ; TUNIT-NEXT:    [[TMP1:%.*]] = load i64, ptr [[S_B4]], align 8
@@ -51,14 +51,14 @@ define i32 @test(ptr %X) {
 ; CGSCC-SAME: (ptr nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CGSCC-NEXT:  entry:
 ; CGSCC-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
-; CGSCC-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; CGSCC-NEXT:    [[VAL4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
 ; CGSCC-NEXT:    ret i32 0
 ;
 entry:
   %S = alloca %struct.ss
   store i32 1, ptr %S, align 8
-  %tmp4 = getelementptr %struct.ss, ptr %S, i32 0, i32 1
-  store i64 2, ptr %tmp4, align 4
+  %val4 = getelementptr %struct.ss, ptr %S, i32 0, i32 1
+  store i64 2, ptr %val4, align 4
   call void @f(ptr byval(%struct.ss) %S, ptr byval(i32) %X)
   ret i32 0
 }
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll
index 77667875256f..621c6cf94313 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll
@@ -15,16 +15,16 @@ define internal i32 @f(ptr byval(%struct.ss)  %b) nounwind  {
 ; CHECK-NEXT:    store i32 [[TMP0]], ptr [[B_PRIV]], align 4
 ; CHECK-NEXT:    [[B_PRIV_B4:%.*]] = getelementptr i8, ptr [[B_PRIV]], i64 4
 ; CHECK-NEXT:    store i64 [[TMP1]], ptr [[B_PRIV_B4]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT:    store i32 [[TMP2]], ptr [[B_PRIV]], align 8
-; CHECK-NEXT:    ret i32 [[TMP1]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr [[B_PRIV]], align 8
+; CHECK-NEXT:    [[VAL2:%.*]] = add i32 [[VAL1]], 1
+; CHECK-NEXT:    store i32 [[VAL2]], ptr [[B_PRIV]], align 8
+; CHECK-NEXT:    ret i32 [[VAL1]]
 ;
 entry:
-  %tmp1 = load i32, ptr %b, align 4
-  %tmp2 = add i32 %tmp1, 1
-  store i32 %tmp2, ptr %b, align 4
-  ret i32 %tmp1
+  %val1 = load i32, ptr %b, align 4
+  %val2 = add i32 %val1, 1
+  store i32 %val2, ptr %b, align 4
+  ret i32 %val1
 }
 
 
@@ -37,16 +37,16 @@ define internal i32 @g(ptr byval(%struct.ss) align 32 %b) nounwind {
 ; CHECK-NEXT:    store i32 [[TMP0]], ptr [[B_PRIV]], align 4
 ; CHECK-NEXT:    [[B_PRIV_B4:%.*]] = getelementptr i8, ptr [[B_PRIV]], i64 4
 ; CHECK-NEXT:    store i64 [[TMP1]], ptr [[B_PRIV_B4]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_PRIV]], align 32
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT:    store i32 [[TMP2]], ptr [[B_PRIV]], align 32
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr [[B_PRIV]], align 32
+; CHECK-NEXT:    [[VAL2:%.*]] = add i32 [[VAL1]], 1
+; CHECK-NEXT:    store i32 [[VAL2]], ptr [[B_PRIV]], align 32
+; CHECK-NEXT:    ret i32 [[VAL2]]
 ;
 entry:
-  %tmp1 = load i32, ptr %b, align 4
-  %tmp2 = add i32 %tmp1, 1
-  store i32 %tmp2, ptr %b, align 4
-  ret i32 %tmp2
+  %val1 = load i32, ptr %b, align 4
+  %val2 = add i32 %val1, 1
+  store i32 %val2, ptr %b, align 4
+  ret i32 %val2
 }
 
 
@@ -57,14 +57,14 @@ define i32 @main() nounwind  {
 ; TUNIT-NEXT:  entry:
 ; TUNIT-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
 ; TUNIT-NEXT:    store i32 1, ptr [[S]], align 32
-; TUNIT-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; TUNIT-NEXT:    [[VAL4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
 ; TUNIT-NEXT:    [[TMP0:%.*]] = load i32, ptr [[S]], align 8
-; TUNIT-NEXT:    [[S_B4:%.*]] = getelementptr i8, ptr [[S]], i64 4
-; TUNIT-NEXT:    [[TMP1:%.*]] = load i64, ptr [[S_B4]], align 8
+; TUNIT-NEXT:    [[S_B41:%.*]] = getelementptr i8, ptr [[S]], i64 4
+; TUNIT-NEXT:    [[TMP1:%.*]] = load i64, ptr [[S_B41]], align 8
 ; TUNIT-NEXT:    [[C0:%.*]] = call i32 @f(i32 [[TMP0]], i64 [[TMP1]]) #[[ATTR1:[0-9]+]]
 ; TUNIT-NEXT:    [[TMP2:%.*]] = load i32, ptr [[S]], align 32
-; TUNIT-NEXT:    [[S_B41:%.*]] = getelementptr i8, ptr [[S]], i64 4
-; TUNIT-NEXT:    [[TMP3:%.*]] = load i64, ptr [[S_B41]], align 32
+; TUNIT-NEXT:    [[S_B4:%.*]] = getelementptr i8, ptr [[S]], i64 4
+; TUNIT-NEXT:    [[TMP3:%.*]] = load i64, ptr [[S_B4]], align 32
 ; TUNIT-NEXT:    [[C1:%.*]] = call i32 @g(i32 [[TMP2]], i64 [[TMP3]]) #[[ATTR1]]
 ; TUNIT-NEXT:    [[A:%.*]] = add i32 [[C0]], [[C1]]
 ; TUNIT-NEXT:    ret i32 [[A]]
@@ -74,7 +74,7 @@ define i32 @main() nounwind  {
 ; CGSCC-SAME: () #[[ATTR1:[0-9]+]] {
 ; CGSCC-NEXT:  entry:
 ; CGSCC-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
-; CGSCC-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; CGSCC-NEXT:    [[VAL4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
 ; CGSCC-NEXT:    [[C0:%.*]] = call i32 @f(i32 noundef 1, i64 noundef 2) #[[ATTR2:[0-9]+]]
 ; CGSCC-NEXT:    [[C1:%.*]] = call i32 @g(i32 noundef 1, i64 noundef 2) #[[ATTR2]]
 ; CGSCC-NEXT:    [[A:%.*]] = add i32 [[C0]], [[C1]]
@@ -83,8 +83,8 @@ define i32 @main() nounwind  {
 entry:
   %S = alloca %struct.ss
   store i32 1, ptr %S, align 8
-  %tmp4 = getelementptr %struct.ss, ptr %S, i32 0, i32 1
-  store i64 2, ptr %tmp4, align 4
+  %val4 = getelementptr %struct.ss, ptr %S, i32 0, i32 1
+  store i64 2, ptr %val4, align 4
   %c0 = call i32 @f(ptr byval(%struct.ss) %S) nounwind
   %c1 = call i32 @g(ptr byval(%struct.ss) %S) nounwind
   %a = add i32 %c0, %c1
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/chained.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/chained.ll
index bc125fc6a1c3..2dc778989252 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/chained.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/chained.ll
@@ -6,8 +6,8 @@
 @G2 = constant ptr @G1
 
 ;.
-; CHECK: @[[G1:[a-zA-Z0-9_$"\\.-]+]] = constant i32 0
-; CHECK: @[[G2:[a-zA-Z0-9_$"\\.-]+]] = constant ptr @G1
+; CHECK: @G1 = constant i32 0
+; CHECK: @G2 = constant ptr @G1
 ;.
 define internal i32 @test(ptr %x) {
 ;
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/dbg.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/dbg.ll
index 39eb3e1d7ea3..67d783351aa0 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/dbg.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/dbg.ll
@@ -61,12 +61,21 @@ define void @caller(ptr %Y, ptr %P) {
 !5 = !DIFile(filename: "test.c", directory: "")
 !6 = !DILocation(line: 9, scope: !2)
 ;.
-; CHECK: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) }
+; TUNIT: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) }
 ;.
-; CHECK: [[META0:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
-; CHECK: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "clang version 3.5.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly)
-; CHECK: [[META2:![0-9]+]] = !DIFile(filename: "test.c", directory: "")
-; CHECK: [[DBG3]] = distinct !DISubprogram(name: "test", scope: null, file: !2, line: 3, scopeLine: 3, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1)
-; CHECK: [[META4:![0-9]+]] = !DILocation(line: 8, scope: !3)
-; CHECK: [[META5:![0-9]+]] = !DILocation(line: 9, scope: !3)
+; CGSCC: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) }
+;.
+; TUNIT: [[META0:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
+; TUNIT: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}} ", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly)
+; TUNIT: [[META2]] = !DIFile(filename: "test.c", directory: "")
+; TUNIT: [[DBG3]] = distinct !DISubprogram(name: "test", scope: null, file: [[META2]], line: 3, scopeLine: 3, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: [[META1]])
+; TUNIT: [[DBG4]] = !DILocation(line: 8, scope: [[DBG3]])
+; TUNIT: [[DBG5]] = !DILocation(line: 9, scope: [[DBG3]])
+;.
+; CGSCC: [[META0:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
+; CGSCC: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}} ", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly)
+; CGSCC: [[META2]] = !DIFile(filename: "test.c", directory: "")
+; CGSCC: [[DBG3]] = distinct !DISubprogram(name: "test", scope: null, file: [[META2]], line: 3, scopeLine: 3, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: [[META1]])
+; CGSCC: [[DBG4]] = !DILocation(line: 8, scope: [[DBG3]])
+; CGSCC: [[DBG5]] = !DILocation(line: 9, scope: [[DBG3]])
 ;.
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/invalidation.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/invalidation.ll
index 0a34a7b621a6..eee4e38ad987 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/invalidation.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/invalidation.ll
@@ -12,7 +12,7 @@
 @G = constant i32 0
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = constant i32 0
+; CHECK: @G = constant i32 0
 ;.
 define internal i32 @a(ptr %x) {
 entry:
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/naked_functions.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/naked_functions.ll
index 670303a365fa..1e8cdb2d98ab 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/naked_functions.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/naked_functions.ll
@@ -7,7 +7,7 @@
 @g = common global i32 0, align 4
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = common global i32 0, align 4
+; CHECK: @g = common global i32 0, align 4
 ;.
 define i32 @bar() {
 ; CHECK-LABEL: define {{[^@]+}}@bar() {
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/nonzero-address-spaces.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/nonzero-address-spaces.ll
index 38c1d6099042..b588a399e5bd 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/nonzero-address-spaces.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/nonzero-address-spaces.ll
@@ -10,7 +10,7 @@ target datalayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8"
 @g = common global i32 0, align 4
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = common global i32 0, align 4
+; CHECK: @g = common global i32 0, align 4
 ;.
 define i32 @bar() {
 ; CHECK-LABEL: define {{[^@]+}}@bar() addrspace(1) {
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr27568.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr27568.ll
index 819369bcf647..f905abdbcb81 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr27568.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr27568.ll
@@ -20,7 +20,7 @@ define void @test1() personality ptr @__CxxFrameHandler3 {
 ; CHECK-LABEL: define {{[^@]+}}@test1() personality ptr @__CxxFrameHandler3 {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    invoke void @thunk()
-; CHECK-NEXT:    to label [[OUT:%.*]] unwind label [[CPAD:%.*]]
+; CHECK-NEXT:            to label [[OUT:%.*]] unwind label [[CPAD:%.*]]
 ; CHECK:       out:
 ; CHECK-NEXT:    ret void
 ; CHECK:       cpad:
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr32917.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr32917.ll
index 3416841c191e..a8639be2fa90 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr32917.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr32917.ll
@@ -7,8 +7,8 @@
 @a = common local_unnamed_addr global i32 0, align 4
 
 ;.
-; CHECK: @[[B:[a-zA-Z0-9_$"\\.-]+]] = common local_unnamed_addr global i32 0, align 4
-; CHECK: @[[A:[a-zA-Z0-9_$"\\.-]+]] = common local_unnamed_addr global i32 0, align 4
+; CHECK: @b = common local_unnamed_addr global i32 0, align 4
+; CHECK: @a = common local_unnamed_addr global i32 0, align 4
 ;.
 define i32 @fn2() local_unnamed_addr {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll
index e0777f9ecee8..8c02ca4e8660 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll
@@ -51,6 +51,7 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 !6 = !DILocation(line: 1, column: 1, scope: !3)
 ;.
 ; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+; TUNIT: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 ;.
 ; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
 ; CGSCC: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/profile.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/profile.ll
index b6968897603b..4074fcb74323 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/profile.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/profile.ll
@@ -42,7 +42,11 @@ declare void @use_i32(i32)
 
 !0 = !{!"branch_weights", i32 30}
 ;.
-; CHECK: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) }
+; TUNIT: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) }
 ;.
-; CHECK: [[META0:![0-9]+]] = !{!"branch_weights", i32 30}
+; CGSCC: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) }
+;.
+; TUNIT: [[PROF0]] = !{!"branch_weights", i32 30}
+;.
+; CGSCC: [[PROF0]] = !{!"branch_weights", i32 30}
 ;.
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/reserve-tbaa.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/reserve-tbaa.ll
index 9640f1472a72..bed038968a52 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/reserve-tbaa.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/reserve-tbaa.ll
@@ -15,11 +15,11 @@
 @d = global i8 0, align 1
 
 ;.
-; CHECK: @[[A:[a-zA-Z0-9_$"\\.-]+]] = global ptr null, align 8
-; CHECK: @[[E:[a-zA-Z0-9_$"\\.-]+]] = global ptr @a, align 8
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = global i32 0, align 4
-; CHECK: @[[C:[a-zA-Z0-9_$"\\.-]+]] = global i64 0, align 8
-; CHECK: @[[D:[a-zA-Z0-9_$"\\.-]+]] = global i8 0, align 1
+; CHECK: @a = global ptr null, align 8
+; CHECK: @e = global ptr @a, align 8
+; CHECK: @g = global i32 0, align 4
+; CHECK: @c = global i64 0, align 8
+; CHECK: @d = global i8 0, align 1
 ;.
 define internal fastcc void @fn(ptr nocapture readonly %p1, ptr nocapture readonly %p2) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none)
@@ -92,11 +92,19 @@ entry:
 ; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree nosync nounwind willreturn }
 ; CGSCC: attributes #[[ATTR2]] = { nofree nounwind willreturn }
 ;.
-; CHECK: [[TBAA0]] = !{!1, !1, i64 0}
-; CHECK: [[META1:![0-9]+]] = !{!"int", !2, i64 0}
-; CHECK: [[META2:![0-9]+]] = !{!"omnipotent char", !3, i64 0}
-; CHECK: [[META3:![0-9]+]] = !{!"Simple C/C++ TBAA"}
-; CHECK: [[TBAA4]] = !{!2, !2, i64 0}
-; CHECK: [[META5:![0-9]+]] = !{!6, !6, i64 0}
-; CHECK: [[META6:![0-9]+]] = !{!"any pointer", !2, i64 0}
+; TUNIT: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; TUNIT: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0}
+; TUNIT: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
+; TUNIT: [[META3]] = !{!"Simple C/C++ TBAA"}
+; TUNIT: [[TBAA4]] = !{[[META2]], [[META2]], i64 0}
+; TUNIT: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; TUNIT: [[META6]] = !{!"any pointer", [[META2]], i64 0}
+;.
+; CGSCC: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CGSCC: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0}
+; CGSCC: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
+; CGSCC: [[META3]] = !{!"Simple C/C++ TBAA"}
+; CGSCC: [[TBAA4]] = !{[[META2]], [[META2]], i64 0}
+; CGSCC: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; CGSCC: [[META6]] = !{!"any pointer", [[META2]], i64 0}
 ;.
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/variadic.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/variadic.ll
index 967dbc0ab5dc..f260b0192cf1 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/variadic.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/variadic.ll
@@ -16,7 +16,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: nounwind uwtable
 ;.
-; CHECK: @[[T45:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_TT0:%.*]] { i64 1335139741, i64 438042995 }, align 8
+; CHECK: @t45 = internal global %struct.tt0 { i64 1335139741, i64 438042995 }, align 8
 ;.
 define i32 @main(i32 %argc, ptr nocapture readnone %argv) #0 {
 ; CHECK-LABEL: define {{[^@]+}}@main
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
index 154b093e9dbb..a209ee2ebe06 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
@@ -74,12 +74,12 @@ define i32 @unions() nounwind {
 ; TUNIT-SAME: () #[[ATTR2:[0-9]+]] {
 ; TUNIT-NEXT:  entry:
 ; TUNIT-NEXT:    [[TMP0:%.*]] = load i8, ptr @mystr, align 8
-; TUNIT-NEXT:    [[MYSTR_B4:%.*]] = getelementptr i8, ptr @mystr, i64 4
-; TUNIT-NEXT:    [[TMP1:%.*]] = load i32, ptr [[MYSTR_B4]], align 8
+; TUNIT-NEXT:    [[MYSTR_B41:%.*]] = getelementptr i8, ptr @mystr, i64 4
+; TUNIT-NEXT:    [[TMP1:%.*]] = load i32, ptr [[MYSTR_B41]], align 8
 ; TUNIT-NEXT:    call void @vfu1(i8 [[TMP0]], i32 [[TMP1]]) #[[ATTR2]]
 ; TUNIT-NEXT:    [[TMP2:%.*]] = load i8, ptr @mystr, align 8
-; TUNIT-NEXT:    [[MYSTR_B41:%.*]] = getelementptr i8, ptr @mystr, i64 4
-; TUNIT-NEXT:    [[TMP3:%.*]] = load i32, ptr [[MYSTR_B41]], align 8
+; TUNIT-NEXT:    [[MYSTR_B4:%.*]] = getelementptr i8, ptr @mystr, i64 4
+; TUNIT-NEXT:    [[TMP3:%.*]] = load i32, ptr [[MYSTR_B4]], align 8
 ; TUNIT-NEXT:    [[RESULT:%.*]] = call i32 @vfu2(i8 [[TMP2]], i32 [[TMP3]]) #[[ATTR3:[0-9]+]]
 ; TUNIT-NEXT:    ret i32 [[RESULT]]
 ;
@@ -139,12 +139,12 @@ define i32 @unions_v2() nounwind {
 ; TUNIT-SAME: () #[[ATTR2]] {
 ; TUNIT-NEXT:  entry:
 ; TUNIT-NEXT:    [[TMP0:%.*]] = load i8, ptr @mystr, align 8
-; TUNIT-NEXT:    [[MYSTR_B41:%.*]] = getelementptr i8, ptr @mystr, i64 4
-; TUNIT-NEXT:    [[TMP1:%.*]] = load i32, ptr [[MYSTR_B41]], align 8
+; TUNIT-NEXT:    [[MYSTR_B4:%.*]] = getelementptr i8, ptr @mystr, i64 4
+; TUNIT-NEXT:    [[TMP1:%.*]] = load i32, ptr [[MYSTR_B4]], align 8
 ; TUNIT-NEXT:    call void @vfu1(i8 [[TMP0]], i32 [[TMP1]]) #[[ATTR2]]
 ; TUNIT-NEXT:    [[TMP2:%.*]] = load i8, ptr @mystr, align 8
-; TUNIT-NEXT:    [[MYSTR_B4:%.*]] = getelementptr i8, ptr @mystr, i64 4
-; TUNIT-NEXT:    [[TMP3:%.*]] = load i32, ptr [[MYSTR_B4]], align 8
+; TUNIT-NEXT:    [[MYSTR_B41:%.*]] = getelementptr i8, ptr @mystr, i64 4
+; TUNIT-NEXT:    [[TMP3:%.*]] = load i32, ptr [[MYSTR_B41]], align 8
 ; TUNIT-NEXT:    [[RESULT:%.*]] = call i32 @vfu2_v2(i8 [[TMP2]], i32 [[TMP3]]) #[[ATTR3]]
 ; TUNIT-NEXT:    ret i32 [[RESULT]]
 ;
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/PR16052.ll b/llvm/test/Transforms/Attributor/IPConstantProp/PR16052.ll
index e6fbd31dc3d7..b0446479dac4 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/PR16052.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/PR16052.ll
@@ -91,12 +91,12 @@ entry:
   ret i64 %cond
 }
 ;.
-; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
-;.
 ; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree nosync nounwind willreturn memory(none) }
 ; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
 ; CGSCC: attributes #[[ATTR2]] = { nofree nosync willreturn }
 ;.
+; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+;.
 ; CGSCC: [[RNG0]] = !{i64 -2147483606, i64 2147483690}
 ;.
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/PR43857.ll b/llvm/test/Transforms/Attributor/IPConstantProp/PR43857.ll
index df79cc01b0af..160cd7357181 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/PR43857.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/PR43857.ll
@@ -23,7 +23,7 @@ define void @baz(<8 x i32> %arg) local_unnamed_addr {
 ; TUNIT-LABEL: define {{[^@]+}}@baz
 ; TUNIT-SAME: (<8 x i32> [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; TUNIT-NEXT:  bb:
-; TUNIT-NEXT:    [[TRUETMP1:%.*]] = extractvalue [[STRUCT_ZOT:%.*]] undef, 0, 0
+; TUNIT-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_ZOT:%.*]] undef, 0, 0
 ; TUNIT-NEXT:    ret void
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/comdat-ipo.ll b/llvm/test/Transforms/Attributor/IPConstantProp/comdat-ipo.ll
index 5c81e45094f5..1ae9ceca5b7f 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/comdat-ipo.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/comdat-ipo.ll
@@ -39,5 +39,7 @@ define i32 @bar() {
   ret i32 %val
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+;.
+; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
 ;.
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/dangling-block-address.ll b/llvm/test/Transforms/Attributor/IPConstantProp/dangling-block-address.ll
index 4e54720b657b..d76ea5e9290b 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/dangling-block-address.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/dangling-block-address.ll
@@ -10,11 +10,11 @@
 @bar.l = internal constant [2 x ptr] [ptr blockaddress(@bar, %lab0), ptr blockaddress(@bar, %end)] ; <ptr> [#uses=1]
 
 ;.
-; TUNIT: @[[CODE:[a-zA-Z0-9_$"\\.-]+]] = global [5 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1], align 4
-; TUNIT: @[[BAR_L:[a-zA-Z0-9_$"\\.-]+]] = internal constant [2 x ptr] [ptr inttoptr (i32 1 to ptr), ptr inttoptr (i32 1 to ptr)]
+; TUNIT: @code = global [5 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1], align 4
+; TUNIT: @bar.l = internal constant [2 x ptr] [ptr inttoptr (i32 1 to ptr), ptr inttoptr (i32 1 to ptr)]
 ;.
-; CGSCC: @[[CODE:[a-zA-Z0-9_$"\\.-]+]] = global [5 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1], align 4
-; CGSCC: @[[BAR_L:[a-zA-Z0-9_$"\\.-]+]] = internal constant [2 x ptr] [ptr blockaddress(@bar, [[LAB0:%.*]]), ptr blockaddress(@bar, [[END:%.*]])]
+; CGSCC: @code = global [5 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1], align 4
+; CGSCC: @bar.l = internal constant [2 x ptr] [ptr blockaddress(@bar, %lab0), ptr blockaddress(@bar, %end)]
 ;.
 define internal void @foo(i32 %x) nounwind readnone {
 ; CGSCC: Function Attrs: nounwind memory(none)
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/deadarg.ll b/llvm/test/Transforms/Attributor/IPConstantProp/deadarg.ll
index 7992d06e6d85..8fb6af08e179 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/deadarg.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/deadarg.ll
@@ -20,7 +20,9 @@ define void @bar() {
   ret void
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+;.
+; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
 ;.
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; TUNIT: {{.*}}
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/global.ll b/llvm/test/Transforms/Attributor/IPConstantProp/global.ll
index 67bddf7fb3eb..fdace93a5675 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/global.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/global.ll
@@ -5,7 +5,7 @@
 @_ZL6test1g = internal global i32 42, align 4
 
 ;.
-; CHECK: @[[_ZL6TEST1G:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 42, align 4
+; CHECK: @_ZL6test1g = internal global i32 42, align 4
 ;.
 define void @_Z7test1f1v() nounwind {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/multiple_callbacks.ll b/llvm/test/Transforms/Attributor/IPConstantProp/multiple_callbacks.ll
index db6048b16abf..fb92cd20be19 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/multiple_callbacks.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/multiple_callbacks.ll
@@ -130,8 +130,13 @@ declare !callback !3 void @broker(ptr, ptr, ptr, i32, i32)
 ; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
 ; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree nosync nounwind willreturn memory(none) }
 ;.
-; CHECK: [[META0:![0-9]+]] = !{!1, !2, !3}
-; CHECK: [[META1:![0-9]+]] = !{i64 0, i64 3, i1 false}
-; CHECK: [[META2:![0-9]+]] = !{i64 2, i64 3, i1 false}
-; CHECK: [[META3:![0-9]+]] = !{i64 1, i64 4, i1 false}
+; TUNIT: [[META0:![0-9]+]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; TUNIT: [[META1]] = !{i64 0, i64 3, i1 false}
+; TUNIT: [[META2]] = !{i64 2, i64 3, i1 false}
+; TUNIT: [[META3]] = !{i64 1, i64 4, i1 false}
+;.
+; CGSCC: [[META0:![0-9]+]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CGSCC: [[META1]] = !{i64 0, i64 3, i1 false}
+; CGSCC: [[META2]] = !{i64 2, i64 3, i1 false}
+; CGSCC: [[META3]] = !{i64 1, i64 4, i1 false}
 ;.
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll b/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll
index 932dcb5f2a91..683da42b6481 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll
@@ -26,9 +26,9 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @1 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @.str }, align 8
 
 ;.
-; CHECK: @[[_STR:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr constant [23 x i8] c"
-; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr global [[STRUCT_IDENT_T:%.*]] { i32 0, i32 514, i32 0, i32 0, ptr @.str }, align 8
-; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr global [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @.str }, align 8
+; CHECK: @.str = private unnamed_addr constant [23 x i8] c"
+; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, ptr @.str }, align 8
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @.str }, align 8
 ;.
 define dso_local void @foo(i32 %N) {
 ; TUNIT-LABEL: define {{[^@]+}}@foo
@@ -76,32 +76,32 @@ define internal void @.omp_outlined.(ptr noalias %.global_tid., ptr noalias %.bo
 ; TUNIT-NEXT:    store i32 4, ptr [[DOTOMP_UB]], align 4
 ; TUNIT-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
 ; TUNIT-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
-; TUNIT-NEXT:    [[TRUETMP5:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4
-; TUNIT-NEXT:    call void @__kmpc_for_static_init_4(ptr noundef nonnull align 8 dereferenceable(24) @[[GLOB0]], i32 [[TRUETMP5]], i32 noundef 34, ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_IS_LAST]], ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_LB]], ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_UB]], ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_STRIDE]], i32 noundef 1, i32 noundef 1)
-; TUNIT-NEXT:    [[TRUETMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
-; TUNIT-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[TRUETMP6]], 4
+; TUNIT-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4
+; TUNIT-NEXT:    call void @__kmpc_for_static_init_4(ptr noundef nonnull align 8 dereferenceable(24) @[[GLOB0]], i32 [[TMP5]], i32 noundef 34, ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_IS_LAST]], ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_LB]], ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_UB]], ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_STRIDE]], i32 noundef 1, i32 noundef 1)
+; TUNIT-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+; TUNIT-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[TMP6]], 4
 ; TUNIT-NEXT:    br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 ; TUNIT:       cond.true:
 ; TUNIT-NEXT:    br label [[COND_END:%.*]]
 ; TUNIT:       cond.false:
-; TUNIT-NEXT:    [[TRUETMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+; TUNIT-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
 ; TUNIT-NEXT:    br label [[COND_END]]
 ; TUNIT:       cond.end:
-; TUNIT-NEXT:    [[COND:%.*]] = phi i32 [ 4, [[COND_TRUE]] ], [ [[TRUETMP7]], [[COND_FALSE]] ]
+; TUNIT-NEXT:    [[COND:%.*]] = phi i32 [ 4, [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
 ; TUNIT-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
-; TUNIT-NEXT:    [[TRUETMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+; TUNIT-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
 ; TUNIT-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 ; TUNIT:       omp.inner.for.cond:
-; TUNIT-NEXT:    [[DOTOMP_IV_0:%.*]] = phi i32 [ [[TRUETMP8]], [[COND_END]] ], [ [[ADD11:%.*]], [[OMP_INNER_FOR_INC:%.*]] ]
-; TUNIT-NEXT:    [[TRUETMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
-; TUNIT-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[DOTOMP_IV_0]], [[TRUETMP9]]
+; TUNIT-NEXT:    [[DOTOMP_IV_0:%.*]] = phi i32 [ [[TMP8]], [[COND_END]] ], [ [[ADD11:%.*]], [[OMP_INNER_FOR_INC:%.*]] ]
+; TUNIT-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+; TUNIT-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[DOTOMP_IV_0]], [[TMP9]]
 ; TUNIT-NEXT:    br i1 [[CMP8]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]], label [[OMP_INNER_FOR_BODY:%.*]]
 ; TUNIT:       omp.inner.for.cond.cleanup:
 ; TUNIT-NEXT:    br label [[OMP_INNER_FOR_END:%.*]]
 ; TUNIT:       omp.inner.for.body:
 ; TUNIT-NEXT:    [[ADD10:%.*]] = add nsw i32 [[DOTOMP_IV_0]], 2
-; TUNIT-NEXT:    [[TRUETMP11:%.*]] = load double, ptr [[Q_ADDR]], align 8
-; TUNIT-NEXT:    call void @bar(i32 [[ADD10]], float nofpclass(nan inf zero sub nnorm) 3.000000e+00, double [[TRUETMP11]])
+; TUNIT-NEXT:    [[TMP11:%.*]] = load double, ptr [[Q_ADDR]], align 8
+; TUNIT-NEXT:    call void @bar(i32 [[ADD10]], float nofpclass(nan inf zero sub nnorm) 3.000000e+00, double [[TMP11]])
 ; TUNIT-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
 ; TUNIT:       omp.body.continue:
 ; TUNIT-NEXT:    br label [[OMP_INNER_FOR_INC]]
@@ -111,8 +111,8 @@ define internal void @.omp_outlined.(ptr noalias %.global_tid., ptr noalias %.bo
 ; TUNIT:       omp.inner.for.end:
 ; TUNIT-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
 ; TUNIT:       omp.loop.exit:
-; TUNIT-NEXT:    [[TRUETMP12:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4
-; TUNIT-NEXT:    call void @__kmpc_for_static_fini(ptr noundef nonnull align 8 dereferenceable(24) @[[GLOB0]], i32 [[TRUETMP12]])
+; TUNIT-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4
+; TUNIT-NEXT:    call void @__kmpc_for_static_fini(ptr noundef nonnull align 8 dereferenceable(24) @[[GLOB0]], i32 [[TMP12]])
 ; TUNIT-NEXT:    br label [[OMP_PRECOND_END:%.*]]
 ; TUNIT:       omp.precond.end:
 ; TUNIT-NEXT:    ret void
@@ -135,33 +135,33 @@ define internal void @.omp_outlined.(ptr noalias %.global_tid., ptr noalias %.bo
 ; CGSCC-NEXT:    store i32 [[SUB3]], ptr [[DOTOMP_UB]], align 4
 ; CGSCC-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
 ; CGSCC-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
-; CGSCC-NEXT:    [[TRUETMP5:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4
-; CGSCC-NEXT:    call void @__kmpc_for_static_init_4(ptr noundef nonnull align 8 dereferenceable(24) @[[GLOB0]], i32 [[TRUETMP5]], i32 noundef 34, ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_IS_LAST]], ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_LB]], ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_UB]], ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_STRIDE]], i32 noundef 1, i32 noundef 1)
-; CGSCC-NEXT:    [[TRUETMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
-; CGSCC-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[TRUETMP6]], [[SUB3]]
+; CGSCC-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4
+; CGSCC-NEXT:    call void @__kmpc_for_static_init_4(ptr noundef nonnull align 8 dereferenceable(24) @[[GLOB0]], i32 [[TMP5]], i32 noundef 34, ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_IS_LAST]], ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_LB]], ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_UB]], ptr noundef nonnull align 4 dereferenceable(4) [[DOTOMP_STRIDE]], i32 noundef 1, i32 noundef 1)
+; CGSCC-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+; CGSCC-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[TMP6]], [[SUB3]]
 ; CGSCC-NEXT:    br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 ; CGSCC:       cond.true:
 ; CGSCC-NEXT:    br label [[COND_END:%.*]]
 ; CGSCC:       cond.false:
-; CGSCC-NEXT:    [[TRUETMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+; CGSCC-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
 ; CGSCC-NEXT:    br label [[COND_END]]
 ; CGSCC:       cond.end:
-; CGSCC-NEXT:    [[COND:%.*]] = phi i32 [ [[SUB3]], [[COND_TRUE]] ], [ [[TRUETMP7]], [[COND_FALSE]] ]
+; CGSCC-NEXT:    [[COND:%.*]] = phi i32 [ [[SUB3]], [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
 ; CGSCC-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
-; CGSCC-NEXT:    [[TRUETMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+; CGSCC-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
 ; CGSCC-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 ; CGSCC:       omp.inner.for.cond:
-; CGSCC-NEXT:    [[DOTOMP_IV_0:%.*]] = phi i32 [ [[TRUETMP8]], [[COND_END]] ], [ [[ADD11:%.*]], [[OMP_INNER_FOR_INC:%.*]] ]
-; CGSCC-NEXT:    [[TRUETMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
-; CGSCC-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[DOTOMP_IV_0]], [[TRUETMP9]]
+; CGSCC-NEXT:    [[DOTOMP_IV_0:%.*]] = phi i32 [ [[TMP8]], [[COND_END]] ], [ [[ADD11:%.*]], [[OMP_INNER_FOR_INC:%.*]] ]
+; CGSCC-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+; CGSCC-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[DOTOMP_IV_0]], [[TMP9]]
 ; CGSCC-NEXT:    br i1 [[CMP8]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]], label [[OMP_INNER_FOR_BODY:%.*]]
 ; CGSCC:       omp.inner.for.cond.cleanup:
 ; CGSCC-NEXT:    br label [[OMP_INNER_FOR_END:%.*]]
 ; CGSCC:       omp.inner.for.body:
 ; CGSCC-NEXT:    [[ADD10:%.*]] = add nsw i32 [[DOTOMP_IV_0]], 2
-; CGSCC-NEXT:    [[TRUETMP10:%.*]] = load float, ptr [[P]], align 4
-; CGSCC-NEXT:    [[TRUETMP11:%.*]] = load double, ptr [[Q_ADDR]], align 8
-; CGSCC-NEXT:    call void @bar(i32 [[ADD10]], float [[TRUETMP10]], double [[TRUETMP11]])
+; CGSCC-NEXT:    [[TMP10:%.*]] = load float, ptr [[P]], align 4
+; CGSCC-NEXT:    [[TMP11:%.*]] = load double, ptr [[Q_ADDR]], align 8
+; CGSCC-NEXT:    call void @bar(i32 [[ADD10]], float [[TMP10]], double [[TMP11]])
 ; CGSCC-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
 ; CGSCC:       omp.body.continue:
 ; CGSCC-NEXT:    br label [[OMP_INNER_FOR_INC]]
@@ -171,8 +171,8 @@ define internal void @.omp_outlined.(ptr noalias %.global_tid., ptr noalias %.bo
 ; CGSCC:       omp.inner.for.end:
 ; CGSCC-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
 ; CGSCC:       omp.loop.exit:
-; CGSCC-NEXT:    [[TRUETMP12:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4
-; CGSCC-NEXT:    call void @__kmpc_for_static_fini(ptr noundef nonnull align 8 dereferenceable(24) @[[GLOB0]], i32 [[TRUETMP12]])
+; CGSCC-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4
+; CGSCC-NEXT:    call void @__kmpc_for_static_fini(ptr noundef nonnull align 8 dereferenceable(24) @[[GLOB0]], i32 [[TMP12]])
 ; CGSCC-NEXT:    br label [[OMP_PRECOND_END]]
 ; CGSCC:       omp.precond.end:
 ; CGSCC-NEXT:    ret void
@@ -259,8 +259,11 @@ declare !callback !0 dso_local void @__kmpc_fork_call(ptr, i32, ptr, ...)
 !1 = !{i64 2, i64 -1, i64 -1, i1 true}
 !0 = !{!1}
 ;.
-; CHECK: [[META0:![0-9]+]] = !{!1}
-; CHECK: [[META1:![0-9]+]] = !{i64 2, i64 -1, i64 -1, i1 true}
+; TUNIT: [[META0:![0-9]+]] = !{[[META1:![0-9]+]]}
+; TUNIT: [[META1]] = !{i64 2, i64 -1, i64 -1, i1 true}
+;.
+; CGSCC: [[META0:![0-9]+]] = !{[[META1:![0-9]+]]}
+; CGSCC: [[META1]] = !{i64 2, i64 -1, i64 -1, i1 true}
 ;.
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll b/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll
index ac825468a58c..490894d12902 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll
@@ -29,7 +29,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; FIXME: nocapture & noalias for %alloc2 in %call3
 
 ;.
-; CHECK: @[[GLOBALVPTR:[a-zA-Z0-9_$"\\.-]+]] = common dso_local global ptr null, align 8
+; CHECK: @GlobalVPtr = common dso_local global ptr null, align 8
 ;.
 define dso_local i32 @main() {
 ; TUNIT-LABEL: define {{[^@]+}}@main() {
@@ -114,8 +114,13 @@ entry:
 !1 = !{i64 2, i64 3, i1 false}
 !0 = !{!1}
 ;.
-; CHECK: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
 ;.
-; CHECK: [[META0:![0-9]+]] = !{!1}
-; CHECK: [[META1:![0-9]+]] = !{i64 2, i64 3, i1 false}
+; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+;.
+; TUNIT: [[META0:![0-9]+]] = !{[[META1:![0-9]+]]}
+; TUNIT: [[META1]] = !{i64 2, i64 3, i1 false}
+;.
+; CGSCC: [[META0:![0-9]+]] = !{[[META1:![0-9]+]]}
+; CGSCC: [[META1]] = !{i64 2, i64 3, i1 false}
 ;.
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/remove-call-inst.ll b/llvm/test/Transforms/Attributor/IPConstantProp/remove-call-inst.ll
index 241807a712c8..9591b562e71c 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/remove-call-inst.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/remove-call-inst.ll
@@ -33,8 +33,8 @@ define internal i32 @wwrite(i64 %i) nounwind readnone {
 ; CGSCC-SAME: () #[[ATTR1:[0-9]+]] {
 ; CGSCC-NEXT:  entry:
 ; CGSCC-NEXT:    switch i64 0, label [[SW_DEFAULT:%.*]] [
-; CGSCC-NEXT:    i64 3, label [[RETURN:%.*]]
-; CGSCC-NEXT:    i64 10, label [[RETURN]]
+; CGSCC-NEXT:      i64 3, label [[RETURN:%.*]]
+; CGSCC-NEXT:      i64 10, label [[RETURN]]
 ; CGSCC-NEXT:    ]
 ; CGSCC:       sw.default:
 ; CGSCC-NEXT:    ret i32 123
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll b/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll
index a75bca0707a9..343b6b9dd433 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll
@@ -62,13 +62,13 @@ define %0 @caller(i1 %Q) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define {{[^@]+}}@caller
 ; TUNIT-SAME: (i1 [[Q:%.*]]) #[[ATTR0]] {
-; TUNIT-NEXT:    [[X:%.*]] = call [[TMP0:%.*]] @foo(i1 noundef [[Q]]) #[[ATTR1:[0-9]+]]
+; TUNIT-NEXT:    [[X:%.*]] = call [[TMP0:%.*]] [[FOO:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR1:[0-9]+]]
 ; TUNIT-NEXT:    ret [[TMP0]] [[X]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define {{[^@]+}}@caller
 ; CGSCC-SAME: (i1 noundef [[Q:%.*]]) #[[ATTR1:[0-9]+]] {
-; CGSCC-NEXT:    [[X:%.*]] = call [[TMP0:%.*]] @foo(i1 noundef [[Q]]) #[[ATTR2:[0-9]+]]
+; CGSCC-NEXT:    [[X:%.*]] = call [[TMP0:%.*]] [[FOO:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR2:[0-9]+]]
 ; CGSCC-NEXT:    ret [[TMP0]] [[X]]
 ;
   %X = call %0 @foo(i1 %Q)
@@ -87,10 +87,10 @@ define i32 @caller2(i1 %Q) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; TUNIT-LABEL: define {{[^@]+}}@caller2
 ; TUNIT-SAME: (i1 [[Q:%.*]]) #[[ATTR0]] {
-; TUNIT-NEXT:    [[X:%.*]] = call [[TMP0:%.*]] @foo(i1 noundef [[Q]]) #[[ATTR1]]
+; TUNIT-NEXT:    [[X:%.*]] = call [[TMP0:%.*]] [[FOO:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR1]]
 ; TUNIT-NEXT:    [[A:%.*]] = extractvalue [[TMP0]] [[X]], 0
 ; TUNIT-NEXT:    [[B:%.*]] = extractvalue [[TMP0]] [[X]], 1
-; TUNIT-NEXT:    [[Y:%.*]] = call [[TMP0]] @bar(i1 noundef [[Q]]) #[[ATTR1]]
+; TUNIT-NEXT:    [[Y:%.*]] = call [[TMP0]] [[BAR:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR1]]
 ; TUNIT-NEXT:    [[C:%.*]] = extractvalue [[TMP0]] [[Y]], 0
 ; TUNIT-NEXT:    [[D:%.*]] = extractvalue [[TMP0]] [[Y]], 1
 ; TUNIT-NEXT:    [[M:%.*]] = add i32 [[A]], [[C]]
@@ -101,10 +101,10 @@ define i32 @caller2(i1 %Q) {
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define {{[^@]+}}@caller2
 ; CGSCC-SAME: (i1 noundef [[Q:%.*]]) #[[ATTR1]] {
-; CGSCC-NEXT:    [[X:%.*]] = call [[TMP0:%.*]] @foo(i1 noundef [[Q]]) #[[ATTR2]]
+; CGSCC-NEXT:    [[X:%.*]] = call [[TMP0:%.*]] [[FOO:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR2]]
 ; CGSCC-NEXT:    [[A:%.*]] = extractvalue [[TMP0]] [[X]], 0
 ; CGSCC-NEXT:    [[B:%.*]] = extractvalue [[TMP0]] [[X]], 1
-; CGSCC-NEXT:    [[Y:%.*]] = call [[TMP0]] @bar(i1 noundef [[Q]]) #[[ATTR2]]
+; CGSCC-NEXT:    [[Y:%.*]] = call [[TMP0]] [[BAR:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i1 noundef [[Q]]) #[[ATTR2]]
 ; CGSCC-NEXT:    [[C:%.*]] = extractvalue [[TMP0]] [[Y]], 0
 ; CGSCC-NEXT:    [[D:%.*]] = extractvalue [[TMP0]] [[Y]], 1
 ; CGSCC-NEXT:    [[M:%.*]] = add i32 [[A]], [[C]]
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/thread_local_acs.ll b/llvm/test/Transforms/Attributor/IPConstantProp/thread_local_acs.ll
index ac9d1b63d343..fd787d5b5d27 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/thread_local_acs.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/thread_local_acs.ll
@@ -22,8 +22,8 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @gsh = dso_local global i32 0, align 4
 
 ;.
-; CHECK: @[[GTL:[a-zA-Z0-9_$"\\.-]+]] = dso_local thread_local global i32 0, align 4
-; CHECK: @[[GSH:[a-zA-Z0-9_$"\\.-]+]] = dso_local global i32 0, align 4
+; CHECK: @gtl = dso_local thread_local global i32 0, align 4
+; CHECK: @gsh = dso_local global i32 0, align 4
 ;.
 define internal i32 @callee(ptr %thread_local_ptr, ptr %shared_ptr) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read)
@@ -31,8 +31,8 @@ define internal i32 @callee(ptr %thread_local_ptr, ptr %shared_ptr) {
 ; CHECK-SAME: (ptr nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[THREAD_LOCAL_PTR:%.*]], ptr nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[SHARED_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr @gtl, align 4
-; CHECK-NEXT:    [[TRUETMP1:%.*]] = load i32, ptr @gsh, align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP]], [[TRUETMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr @gsh, align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP]], [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[ADD]]
 ;
 entry:
@@ -63,8 +63,13 @@ declare !callback !0 dso_local void @broker(ptr, ptr, ptr)
 !1 = !{i64 1, i64 0, i64 2, i1 false}
 !0 = !{!1}
 ;.
-; CHECK: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(read) }
+; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(read) }
 ;.
-; CHECK: [[META0:![0-9]+]] = !{!1}
-; CHECK: [[META1:![0-9]+]] = !{i64 1, i64 0, i64 2, i1 false}
+; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(read) }
+;.
+; TUNIT: [[META0:![0-9]+]] = !{[[META1:![0-9]+]]}
+; TUNIT: [[META1]] = !{i64 1, i64 0, i64 2, i1 false}
+;.
+; CGSCC: [[META0:![0-9]+]] = !{[[META1:![0-9]+]]}
+; CGSCC: [[META1]] = !{i64 1, i64 0, i64 2, i1 false}
 ;.
diff --git a/llvm/test/Transforms/Attributor/address_space_info.ll b/llvm/test/Transforms/Attributor/address_space_info.ll
index d865ae1eae39..73dd93c55b81 100644
--- a/llvm/test/Transforms/Attributor/address_space_info.ll
+++ b/llvm/test/Transforms/Attributor/address_space_info.ll
@@ -8,11 +8,11 @@
 @s2 = dso_local addrspace(3) global i32 undef, align 4
 
 ;.
-; CHECK: @[[DST:[a-zA-Z0-9_$"\\.-]+]] = dso_local addrspace(1) externally_initialized global i32 0, align 4
-; CHECK: @[[G1:[a-zA-Z0-9_$"\\.-]+]] = dso_local addrspace(1) externally_initialized global ptr null, align 4
-; CHECK: @[[G2:[a-zA-Z0-9_$"\\.-]+]] = dso_local addrspace(1) externally_initialized global i32 0, align 4
-; CHECK: @[[S1:[a-zA-Z0-9_$"\\.-]+]] = dso_local addrspace(3) global i32 undef, align 4
-; CHECK: @[[S2:[a-zA-Z0-9_$"\\.-]+]] = dso_local addrspace(3) global i32 undef, align 4
+; CHECK: @dst = dso_local addrspace(1) externally_initialized global i32 0, align 4
+; CHECK: @g1 = dso_local addrspace(1) externally_initialized global ptr null, align 4
+; CHECK: @g2 = dso_local addrspace(1) externally_initialized global i32 0, align 4
+; CHECK: @s1 = dso_local addrspace(3) global i32 undef, align 4
+; CHECK: @s2 = dso_local addrspace(3) global i32 undef, align 4
 ;.
 define internal void @_Z12global_writePi(ptr noundef %p) #0 {
 ; CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(write)
diff --git a/llvm/test/Transforms/Attributor/allocator.ll b/llvm/test/Transforms/Attributor/allocator.ll
index 7072fd5b9e78..f2d9ecd1d8fa 100644
--- a/llvm/test/Transforms/Attributor/allocator.ll
+++ b/llvm/test/Transforms/Attributor/allocator.ll
@@ -7,7 +7,7 @@
 @.str = private unnamed_addr constant [17 x i8] c"The value is %d\0A\00", align 1
 
 ;.
-; CHECK: @[[_STR:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr constant [17 x i8] c"The value is %d\0A\00", align 1
+; CHECK: @.str = private unnamed_addr constant [17 x i8] c"The value is %d\0A\00", align 1
 ;.
 define dso_local void @positive_alloca_1(i32 noundef %val) #0 {
 ; CHECK-LABEL: define dso_local void @positive_alloca_1
@@ -511,8 +511,13 @@ declare i32 @printf(ptr noundef, ...) #1
 ; Function Attrs: nounwind allocsize(0)
 declare noalias ptr @malloc(i64 noundef) #1
 ;.
-; CHECK: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
 ;.
-; CHECK: [[META0:![0-9]+]] = !{!1}
-; CHECK: [[META1:![0-9]+]] = !{i64 2, i64 3, i1 false}
+; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+;.
+; TUNIT: [[META0:![0-9]+]] = !{[[META1:![0-9]+]]}
+; TUNIT: [[META1]] = !{i64 2, i64 3, i1 false}
+;.
+; CGSCC: [[META0:![0-9]+]] = !{[[META1:![0-9]+]]}
+; CGSCC: [[META1]] = !{i64 2, i64 3, i1 false}
 ;.
diff --git a/llvm/test/Transforms/Attributor/callbacks.ll b/llvm/test/Transforms/Attributor/callbacks.ll
index 6ace218afcf3..dd5cbbc9e271 100644
--- a/llvm/test/Transforms/Attributor/callbacks.ll
+++ b/llvm/test/Transforms/Attributor/callbacks.ll
@@ -307,10 +307,15 @@ declare !callback !0 void @t3_callback_broker(ptr nocapture , ptr nocapture , pt
 !0 = !{!1}
 !1 = !{i64 2, i64 -1, i64 -1, i1 true}
 ;.
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nosync }
+; TUNIT: attributes #[[ATTR0]] = { nosync }
 ;.
-; CHECK: [[META0:![0-9]+]] = !{!1}
-; CHECK: [[META1:![0-9]+]] = !{i64 2, i64 -1, i64 -1, i1 true}
+; CGSCC: attributes #[[ATTR0]] = { nosync }
+;.
+; TUNIT: [[META0:![0-9]+]] = !{[[META1:![0-9]+]]}
+; TUNIT: [[META1]] = !{i64 2, i64 -1, i64 -1, i1 true}
+;.
+; CGSCC: [[META0:![0-9]+]] = !{[[META1:![0-9]+]]}
+; CGSCC: [[META1]] = !{i64 2, i64 -1, i64 -1, i1 true}
 ;.
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/Attributor/callgraph.ll b/llvm/test/Transforms/Attributor/callgraph.ll
index a85c6a02a99e..929b89105c24 100644
--- a/llvm/test/Transforms/Attributor/callgraph.ll
+++ b/llvm/test/Transforms/Attributor/callgraph.ll
@@ -6,7 +6,7 @@
 ; RUN: opt -passes=attributor --attributor-assume-closed-world -S < %s | FileCheck %s --check-prefixes=CHECK,UPTO2,UNLIM,CWRLD
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = global ptr @usedByGlobal
+; CHECK: @G = global ptr @usedByGlobal
 ;.
 define dso_local void @func1() {
 ; CHECK-LABEL: @func1(
@@ -67,7 +67,7 @@ define dso_local void @func2(i1 %c) {
 ;
 ; LIMI0-LABEL: @func2(
 ; LIMI0-NEXT:    [[F:%.*]] = select i1 [[C:%.*]], ptr @internal_good, ptr @func4
-; LIMI0-NEXT:    call void [[F]](), !callees !0
+; LIMI0-NEXT:    call void [[F]](), !callees [[META0:![0-9]+]]
 ; LIMI0-NEXT:    ret void
 ;
   %f = select i1 %c, ptr @internal_good, ptr @func4
@@ -98,7 +98,7 @@ define void @func5(i32 %0) {
 ; LIMI0-LABEL: @func5(
 ; LIMI0-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP0:%.*]], 0
 ; LIMI0-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], ptr @func4, ptr @func3
-; LIMI0-NEXT:    call void [[TMP3]](), !callees !1
+; LIMI0-NEXT:    call void [[TMP3]](), !callees [[META1:![0-9]+]]
 ; LIMI0-NEXT:    ret void
 ;
   %2 = icmp ne i32 %0, 0
@@ -178,7 +178,7 @@ define i32 @non_matching_fp1(i1 %c1, i1 %c2, i1 %c) {
 ; LIMI2-NEXT:    [[CALL2:%.*]] = call i32 @retI32(i32 42)
 ; LIMI2-NEXT:    br label [[TMP7]]
 ; LIMI2:       6:
-; LIMI2-NEXT:    [[CALL3:%.*]] = call i32 [[FP]](i32 42), !callees !0
+; LIMI2-NEXT:    [[CALL3:%.*]] = call i32 [[FP]](i32 42), !callees [[META0:![0-9]+]]
 ; LIMI2-NEXT:    br label [[TMP7]]
 ; LIMI2:       7:
 ; LIMI2-NEXT:    [[CALL_PHI:%.*]] = phi i32 [ [[CALL1]], [[TMP2]] ], [ [[CALL2]], [[TMP5]] ], [ [[CALL3]], [[TMP6]] ]
@@ -188,7 +188,7 @@ define i32 @non_matching_fp1(i1 %c1, i1 %c2, i1 %c) {
 ; LIMI0-NEXT:    [[FP1:%.*]] = select i1 [[C1:%.*]], ptr @retI32, ptr @takeI32
 ; LIMI0-NEXT:    [[FP2:%.*]] = select i1 [[C2:%.*]], ptr @retFloatTakeFloat, ptr @void
 ; LIMI0-NEXT:    [[FP:%.*]] = select i1 [[C:%.*]], ptr [[FP1]], ptr [[FP2]]
-; LIMI0-NEXT:    [[CALL:%.*]] = call i32 [[FP]](i32 42), !callees !2
+; LIMI0-NEXT:    [[CALL:%.*]] = call i32 [[FP]](i32 42), !callees [[META2:![0-9]+]]
 ; LIMI0-NEXT:    ret i32 [[CALL]]
 ;
   %fp1 = select i1 %c1, ptr @retI32, ptr @takeI32
@@ -241,7 +241,7 @@ define i32 @non_matching_fp1_noundef(i1 %c1, i1 %c2, i1 %c) {
 ; LIMI2-NEXT:    [[CALL2:%.*]] = call i32 @retI32(i32 42)
 ; LIMI2-NEXT:    br label [[TMP7]]
 ; LIMI2:       6:
-; LIMI2-NEXT:    [[CALL3:%.*]] = call i32 [[FP]](i32 42), !callees !1
+; LIMI2-NEXT:    [[CALL3:%.*]] = call i32 [[FP]](i32 42), !callees [[META1:![0-9]+]]
 ; LIMI2-NEXT:    br label [[TMP7]]
 ; LIMI2:       7:
 ; LIMI2-NEXT:    [[CALL_PHI:%.*]] = phi i32 [ [[CALL1]], [[TMP2]] ], [ [[CALL2]], [[TMP5]] ], [ [[CALL3]], [[TMP6]] ]
@@ -251,7 +251,7 @@ define i32 @non_matching_fp1_noundef(i1 %c1, i1 %c2, i1 %c) {
 ; LIMI0-NEXT:    [[FP1:%.*]] = select i1 [[C1:%.*]], ptr @retI32, ptr @takeI32
 ; LIMI0-NEXT:    [[FP2:%.*]] = select i1 [[C2:%.*]], ptr @retFloatTakeFloatFloatNoundef, ptr @void
 ; LIMI0-NEXT:    [[FP:%.*]] = select i1 [[C:%.*]], ptr [[FP1]], ptr [[FP2]]
-; LIMI0-NEXT:    [[CALL:%.*]] = call i32 [[FP]](i32 42), !callees !3
+; LIMI0-NEXT:    [[CALL:%.*]] = call i32 [[FP]](i32 42), !callees [[META3:![0-9]+]]
 ; LIMI0-NEXT:    ret i32 [[CALL]]
 ;
   %fp1 = select i1 %c1, ptr @retI32, ptr @takeI32
@@ -567,7 +567,7 @@ define void @func7(ptr %unknown) {
 ; UPTO2-NEXT:    ret void
 ;
 ; LIMI0-LABEL: @func7(
-; LIMI0-NEXT:    call void [[UNKNOWN:%.*]](), !callees !1
+; LIMI0-NEXT:    call void [[UNKNOWN:%.*]](), !callees [[META1]]
 ; LIMI0-NEXT:    ret void
 ;
   call void %unknown(), !callees !2
@@ -578,17 +578,17 @@ define void @func7(ptr %unknown) {
 define void @undef_in_callees() {
 ; UNLIM-LABEL: @undef_in_callees(
 ; UNLIM-NEXT:  cond.end.i:
-; UNLIM-NEXT:    call void undef(ptr undef, i32 undef, ptr undef), !callees !2
+; UNLIM-NEXT:    call void undef(ptr undef, i32 undef, ptr undef), !callees [[META2:![0-9]+]]
 ; UNLIM-NEXT:    ret void
 ;
 ; LIMI2-LABEL: @undef_in_callees(
 ; LIMI2-NEXT:  cond.end.i:
-; LIMI2-NEXT:    call void undef(ptr undef, i32 undef, ptr undef), !callees !4
+; LIMI2-NEXT:    call void undef(ptr undef, i32 undef, ptr undef), !callees [[META4:![0-9]+]]
 ; LIMI2-NEXT:    ret void
 ;
 ; LIMI0-LABEL: @undef_in_callees(
 ; LIMI0-NEXT:  cond.end.i:
-; LIMI0-NEXT:    call void undef(ptr undef, i32 undef, ptr undef), !callees !6
+; LIMI0-NEXT:    call void undef(ptr undef, i32 undef, ptr undef), !callees [[META6:![0-9]+]]
 ; LIMI0-NEXT:    ret void
 ;
 cond.end.i:
@@ -689,25 +689,35 @@ define void @as_cast(ptr %arg) {
 ; UTC_ARGS: --enable
 
 ;.
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+; OUNLM: attributes #[[ATTR0:[0-9]+]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
 ;.
-; UNLIM: [[META0:![0-9]+]] = !{!1}
-; UNLIM: [[META1:![0-9]+]] = !{i64 0, i1 false}
-; UNLIM: [[META2:![0-9]+]] = distinct !{ptr undef, ptr null}
+; LIMI2: attributes #[[ATTR0:[0-9]+]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
 ;.
-; LIMI2: [[META0:![0-9]+]] = !{ptr @void, ptr @retFloatTakeFloat}
-; LIMI2: [[META1:![0-9]+]] = !{ptr @void}
-; LIMI2: [[META2:![0-9]+]] = !{!3}
-; LIMI2: [[META3:![0-9]+]] = !{i64 0, i1 false}
-; LIMI2: [[META4:![0-9]+]] = distinct !{ptr undef, ptr null}
+; LIMI0: attributes #[[ATTR0:[0-9]+]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
 ;.
-; LIMI0: [[META0:![0-9]+]] = !{ptr @func4, ptr @internal_good}
-; LIMI0: [[META1:![0-9]+]] = !{ptr @func3, ptr @func4}
-; LIMI0: [[META2:![0-9]+]] = !{ptr @takeI32, ptr @retI32, ptr @void, ptr @retFloatTakeFloat}
-; LIMI0: [[META3:![0-9]+]] = !{ptr @takeI32, ptr @retI32, ptr @void}
-; LIMI0: [[META4:![0-9]+]] = !{!5}
-; LIMI0: [[META5:![0-9]+]] = !{i64 0, i1 false}
-; LIMI0: [[META6:![0-9]+]] = distinct !{ptr undef, ptr null}
+; CWRLD: attributes #[[ATTR0:[0-9]+]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+;.
+; OUNLM: [[META0:![0-9]+]] = !{[[META1:![0-9]+]]}
+; OUNLM: [[META1]] = !{i64 0, i1 false}
+; OUNLM: [[META2]] = distinct !{ptr undef, ptr null}
+;.
+; LIMI2: [[META0]] = !{ptr @void, ptr @retFloatTakeFloat}
+; LIMI2: [[META1]] = !{ptr @void}
+; LIMI2: [[META2:![0-9]+]] = !{[[META3:![0-9]+]]}
+; LIMI2: [[META3]] = !{i64 0, i1 false}
+; LIMI2: [[META4]] = distinct !{ptr undef, ptr null}
+;.
+; LIMI0: [[META0]] = !{ptr @func4, ptr @internal_good}
+; LIMI0: [[META1]] = !{ptr @func3, ptr @func4}
+; LIMI0: [[META2]] = !{ptr @takeI32, ptr @retI32, ptr @void, ptr @retFloatTakeFloat}
+; LIMI0: [[META3]] = !{ptr @takeI32, ptr @retI32, ptr @void}
+; LIMI0: [[META4:![0-9]+]] = !{[[META5:![0-9]+]]}
+; LIMI0: [[META5]] = !{i64 0, i1 false}
+; LIMI0: [[META6]] = distinct !{ptr undef, ptr null}
+;.
+; CWRLD: [[META0:![0-9]+]] = !{[[META1:![0-9]+]]}
+; CWRLD: [[META1]] = !{i64 0, i1 false}
+; CWRLD: [[META2]] = distinct !{ptr undef, ptr null}
 ;.
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; DOT: {{.*}}
diff --git a/llvm/test/Transforms/Attributor/convergent.ll b/llvm/test/Transforms/Attributor/convergent.ll
index ccb606a64187..cd81c4d6f27b 100644
--- a/llvm/test/Transforms/Attributor/convergent.ll
+++ b/llvm/test/Transforms/Attributor/convergent.ll
@@ -39,7 +39,6 @@ define void @calls_declared_non_convergent() convergent {
 }
 
 ; CHECK: Function Attrs: convergent
-; CHECK-NEXT: declare i32 @declared_convergent()
 declare i32 @declared_convergent() convergent
 
 define i32 @calls_declared_convergent() convergent {
diff --git a/llvm/test/Transforms/Attributor/depgraph.ll b/llvm/test/Transforms/Attributor/depgraph.ll
index 22186edefaf2..8336ac58eb3b 100644
--- a/llvm/test/Transforms/Attributor/depgraph.ll
+++ b/llvm/test/Transforms/Attributor/depgraph.ll
@@ -51,192 +51,6 @@ define ptr @checkAndAdvance(ptr align 16 %0) {
 ; Check for graph
 ;
 
-; GRAPH:      [AAIsDead] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state Live[#BB 4/4][#TBEP 0][#KDE 1]
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAPotentialValues] for CtxI '  %3 = icmp eq i32 %2, 0' at position {flt: [@-1]} with state set-state(< {  %3 = icmp eq i32 %2, 0[3], } >)
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAPotentialValues] for CtxI '  %2 = load i32, ptr %0, align 4' at position {flt: [@-1]} with state set-state(< {  %2 = load i32, ptr %0, align 4[3], } >)
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAUnderlyingObjects] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state UnderlyingObjects <invalid>
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAPotentialValues] for CtxI <<null inst>> at position {flt: [@-1]} with state set-state(< {i32 0[3], } >)
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoReturn] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state may-return
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoReturn] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state may-return
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAIsDead] for CtxI '  ret ptr %.0' at position {flt: [@-1]} with state assumed-live
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAUndefinedBehavior] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state undefined-behavior
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAPotentialValues] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state set-state(< {ptr %0[3], } >)
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAIsDead] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {flt: [@-1]} with state assumed-live
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoUnwind] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state nounwind
-; GRAPH-NEXT:   updates [AAIsDead] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {flt: [@-1]} with state assumed-live
-; GRAPH-NEXT:   updates [AANoUnwind] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nounwind
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoUnwind] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nounwind
-; GRAPH-NEXT:   updates [AANoUnwind] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state nounwind
-; GRAPH-NEXT:   updates [AANoCapture] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAMemoryBehavior] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state readonly
-; GRAPH-NEXT:   updates [AAIsDead] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {flt: [@-1]} with state assumed-live
-; GRAPH-NEXT:   updates [AAMemoryBehavior] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state readonly
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAMemoryBehavior] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state readonly
-; GRAPH-NEXT:   updates [AAMemoryBehavior] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state readonly
-; GRAPH-NEXT:   updates [AANoCapture] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned
-; GRAPH-NEXT:   updates [AAMemoryBehavior] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state readonly
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAIsDead] for CtxI '  %2 = load i32, ptr %0, align 4' at position {flt: [@-1]} with state assumed-live
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAIsDead] for CtxI '  %3 = icmp eq i32 %2, 0' at position {flt: [@-1]} with state assumed-live
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAIsDead] for CtxI '  br i1 %3, label %4, label %7' at position {flt: [@-1]} with state assumed-live
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAPotentialValues] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_ret: [@-1]} with state set-state(< {  %5 = getelementptr inbounds i32, ptr %0, i64 4[3],   %5 = getelementptr inbounds i32, ptr %0, i64 4[3], } >)
-; GRAPH-NEXT:   updates [AAPotentialValues] for CtxI '  %.0 = phi ptr [ %6, %4 ], [ %0, %7 ]' at position {flt:.0 [.0@-1]} with state set-state(< {ptr %0[3],   %5 = getelementptr inbounds i32, ptr %0, i64 4[3],   %5 = getelementptr inbounds i32, ptr %0, i64 4[3], } >)
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAPotentialValues] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state set-state(< {  %.0 = phi ptr [ %6, %4 ], [ %0, %7 ][3], } >)
-; GRAPH-NEXT:   updates [AAPotentialValues] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_ret: [@-1]} with state set-state(< {  %5 = getelementptr inbounds i32, ptr %0, i64 4[3],   %5 = getelementptr inbounds i32, ptr %0, i64 4[3], } >)
-; GRAPH-NEXT:   updates [AANoUndef] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state may-undef-or-poison
-; GRAPH-NEXT:   updates [AANoCapture] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned
-; GRAPH-NEXT:   updates [AAAlign] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state align<1-16>
-; GRAPH-NEXT:   updates [AANonNull] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state nonnull
-; GRAPH-NEXT:   updates [AADereferenceable] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state unknown-dereferenceable
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAPotentialValues] for CtxI '  %.0 = phi ptr [ %6, %4 ], [ %0, %7 ]' at position {flt:.0 [.0@-1]} with state set-state(< {ptr %0[3],   %5 = getelementptr inbounds i32, ptr %0, i64 4[3],   %5 = getelementptr inbounds i32, ptr %0, i64 4[3], } >)
-; GRAPH-NEXT:   updates [AAPotentialValues] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state set-state(< {  %.0 = phi ptr [ %6, %4 ], [ %0, %7 ][3], } >)
-; GRAPH-NEXT:   updates [AAPotentialValues] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_ret: [@-1]} with state set-state(< {  %5 = getelementptr inbounds i32, ptr %0, i64 4[3],   %5 = getelementptr inbounds i32, ptr %0, i64 4[3], } >)
-; GRAPH-NEXT:   updates [AANoCapture] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned
-; GRAPH-NEXT:   updates [AAAlign] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state align<1-16>
-; GRAPH-NEXT:   updates [AANonNull] for CtxI ' %.0 = phi ptr [ %6, %4 ], [ %0, %7 ]' at position {flt:.0 [.0@-1]} with state nonnull
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAPotentialValues] for CtxI '  %5 = getelementptr inbounds i32, ptr %0, i64 4' at position {flt: [@-1]} with state set-state(< {  %5 = getelementptr inbounds i32, ptr %0, i64 4[3], } >)
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAPotentialValues] for CtxI <<null inst>> at position {flt: [@-1]} with state set-state(< {i64 4[3], } >)
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAPotentialValues] for CtxI '  %2 = load i32, ptr %0, align 4' at position {flt:checkAndAdvance [checkAndAdvance@-1]} with state set-state(< {@checkAndAdvance[3], } >)
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAPotentialValues] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_arg: [@0]} with state set-state(< {  %5 = getelementptr inbounds i32, ptr %0, i64 4[3], } >)
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAInstanceInfo] for CtxI '  %5 = getelementptr inbounds i32, ptr %0, i64 4' at position {flt: [@-1]} with state <unique [fAa]>
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoRecurse] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state may-recurse
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAInterFnReachability] for CtxI ' %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state #queries(1)
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAIntraFnReachability] for CtxI ' %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state #queries(1)
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AACallEdges] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state CallEdges[0,1]
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAIsDead] for CtxI '  br label %8' at position {flt: [@-1]} with state assumed-live
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoUndef] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_arg: [@0]} with state may-undef-or-poison
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoUndef] for CtxI '  %5 = getelementptr inbounds i32, ptr %0, i64 4' at position {flt: [@-1]} with state may-undef-or-poison
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAIsDead] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state assumed-live
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoUndef] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state may-undef-or-poison
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAHeapToStack] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state [H2S] Mallocs Good/Bad: 0/0
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAMustProgress] for CtxI ' %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state may-not-progress
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAWillReturn] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state may-noreturn
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAWillReturn] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state may-noreturn
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoRecurse] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state may-recurse
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoFree] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nofree
-; GRAPH-NEXT:   updates [AANoFree] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state nofree
-; GRAPH-NEXT:   updates [AANoFree] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state nofree
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoFree] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state nofree
-; GRAPH-NEXT:   updates [AANoFree] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nofree
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoSync] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nosync
-; GRAPH-NEXT:   updates [AANoSync] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state nosync
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoSync] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state nosync
-; GRAPH-NEXT:   updates [AANoSync] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state nosync
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAMemoryLocation] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument
-; GRAPH-NEXT:   updates [AAMemoryLocation] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state memory:argument
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAMemoryLocation] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state memory:argument
-; GRAPH-NEXT:   updates [AAMemoryLocation] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAMemoryBehavior] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_arg: [@0]} with state readonly
-; GRAPH-NEXT:   updates [AAMemoryBehavior] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state readonly
-; GRAPH-NEXT:   updates [AAMemoryLocation] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state memory:argument
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAMemoryBehavior] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state readonly
-; GRAPH-NEXT:   updates [AAMemoryBehavior] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_arg: [@0]} with state readonly
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoCapture] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned
-; GRAPH-NEXT:   updates [AANoCapture] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_arg: [@0]} with state assumed not-captured-maybe-returned
-; GRAPH-NEXT:   updates [AAMemoryBehavior] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state readonly
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAIsDead] for CtxI '  %5 = getelementptr inbounds i32, ptr %0, i64 4' at position {flt: [@-1]} with state assumed-live
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAIsDead] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_arg: [@0]} with state assumed-live
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAIsDead] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state assumed-live
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoCapture] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_arg: [@0]} with state assumed not-captured-maybe-returned
-; GRAPH-NEXT:   updates [AANoCapture] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state assumed not-captured-maybe-returned
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAIsDead] for CtxI '  br label %8' at position {flt: [@-1]} with state assumed-live
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAUnderlyingObjects] for CtxI '  %5 = getelementptr inbounds i32, ptr %0, i64 4' at position {flt: [@-1]} with state UnderlyingObjects inter #1 objs, intra #1 objs
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAAssumptionInfo] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn:checkAndAdvance [checkAndAdvance@-1]} with state Known [], Assumed []
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAAlign] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state align<1-16>
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAAlign] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state align<16-16>
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAAlign] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_arg: [@0]} with state align<16-16>
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAAlign] for CtxI '  %5 = getelementptr inbounds i32, ptr %0, i64 4' at position {flt: [@-1]} with state align<16-16>
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANonNull] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state nonnull
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANonNull] for CtxI ' %.0 = phi ptr [ %6, %4 ], [ %0, %7 ]' at position {flt:.0 [.0@-1]} with state nonnull
-; GRAPH-NEXT:   updates [AANonNull] for CtxI ' %2 = load i32, ptr %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state nonnull
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANonNull] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state nonnull
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoAlias] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state may-alias
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AADereferenceable] for CtxI '  %2 = load i32, ptr %0, align 4' at position {fn_ret:checkAndAdvance [checkAndAdvance@-1]} with state unknown-dereferenceable
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AADereferenceable] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state dereferenceable_or_null<4-4> [non-null is unknown]
-; GRAPH-NEXT:   updates [AADereferenceable] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_arg: [@0]} with state unknown-dereferenceable
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AADereferenceable] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_arg: [@0]} with state unknown-dereferenceable
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoFree] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state nofree
-; GRAPH-NEXT:   updates [AANoFree] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_arg: [@0]} with state nofree
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAPrivatizablePtr] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state [no-priv]
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAAssumptionInfo] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs: [@-1]} with state Known [], Assumed []
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoAlias] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_arg: [@0]} with state may-alias
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoAlias] for CtxI '  %5 = getelementptr inbounds i32, ptr %0, i64 4' at position {flt: [@-1]} with state may-alias
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AANoFree] for CtxI '  %6 = call ptr @checkAndAdvance(ptr %5)' at position {cs_arg: [@0]} with state nofree
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AAAddressSpace] for CtxI '  %2 = load i32, ptr %0, align 4' at position {arg: [@0]} with state addrspace(<invalid>)
-; GRAPH-EMPTY:
-; GRAPH-NEXT: [AADereferenceable] for CtxI '  %5 = getelementptr inbounds i32, ptr %0, i64 4' at position {flt: [@-1]} with state unknown-dereferenceable
 
 ; GRAPH-NOT: update
 
diff --git a/llvm/test/Transforms/Attributor/dereferenceable-1.ll b/llvm/test/Transforms/Attributor/dereferenceable-1.ll
index 97f0bf87b28a..f6a2df9eecc5 100644
--- a/llvm/test/Transforms/Attributor/dereferenceable-1.ll
+++ b/llvm/test/Transforms/Attributor/dereferenceable-1.ll
@@ -9,7 +9,7 @@ declare void @deref_phi_user(ptr %a);
 ; take mininimum of return values
 ;
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = global i64 0
+; CHECK: @g = global i64 0
 ;.
 define ptr @test1(ptr dereferenceable(4) %0, ptr dereferenceable(8) %1, i1 zeroext %2) local_unnamed_addr {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
@@ -96,8 +96,8 @@ define void @deref_phi_growing(ptr dereferenceable(4000) %a) {
 ; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
 ; CHECK-NEXT:    [[A_ADDR_0:%.*]] = phi ptr [ [[A]], [[ENTRY]] ], [ [[INCDEC_PTR:%.*]], [[FOR_INC]] ]
 ; CHECK-NEXT:    call void @deref_phi_user(ptr nonnull [[A_ADDR_0]])
-; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr [[A_ADDR_0]], align 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], [[TMP]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[A_ADDR_0]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], [[VAL]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
@@ -117,8 +117,8 @@ for.cond:                                         ; preds = %for.inc, %entry
   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
   %a.addr.0 = phi ptr [ %a, %entry ], [ %incdec.ptr, %for.inc ]
   call void @deref_phi_user(ptr %a.addr.0)
-  %tmp = load i32, ptr %a.addr.0, align 4
-  %cmp = icmp slt i32 %i.0, %tmp
+  %val = load i32, ptr %a.addr.0, align 4
+  %cmp = icmp slt i32 %i.0, %val
   br i1 %cmp, label %for.body, label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %for.cond
@@ -147,8 +147,8 @@ define void @deref_phi_shrinking(ptr dereferenceable(4000) %a) {
 ; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
 ; CHECK-NEXT:    [[A_ADDR_0:%.*]] = phi ptr [ [[A]], [[ENTRY]] ], [ [[INCDEC_PTR:%.*]], [[FOR_INC]] ]
 ; CHECK-NEXT:    call void @deref_phi_user(ptr nonnull [[A_ADDR_0]])
-; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr [[A_ADDR_0]], align 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], [[TMP]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[A_ADDR_0]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], [[VAL]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
@@ -168,8 +168,8 @@ for.cond:                                         ; preds = %for.inc, %entry
   %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
   %a.addr.0 = phi ptr [ %a, %entry ], [ %incdec.ptr, %for.inc ]
   call void @deref_phi_user(ptr %a.addr.0)
-  %tmp = load i32, ptr %a.addr.0, align 4
-  %cmp = icmp slt i32 %i.0, %tmp
+  %val = load i32, ptr %a.addr.0, align 4
+  %cmp = icmp slt i32 %i.0, %val
   br i1 %cmp, label %for.body, label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %for.cond
@@ -202,14 +202,14 @@ define ptr @f7_0(ptr %ptr) {
   ret ptr %ptr
 }
 
-define void @f7_1(ptr %ptr, i1 %c) {
+define void @f7_1(ptr %ptr, i1 %cnd) {
 ; CHECK: Function Attrs: mustprogress nounwind willreturn
 ; CHECK-LABEL: define {{[^@]+}}@f7_1
-; CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[PTR:%.*]], i1 noundef [[C:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[PTR:%.*]], i1 noundef [[CND:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[A:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(4) [[PTR]]) #[[ATTR1]]
 ; CHECK-NEXT:    [[PTR_0:%.*]] = load i32, ptr [[PTR]], align 4
 ; CHECK-NEXT:    [[B:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(4) [[PTR]]) #[[ATTR1]]
-; CHECK-NEXT:    br i1 [[C]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK-NEXT:    br i1 [[CND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK:       if.true:
 ; CHECK-NEXT:    [[C:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(8) [[PTR]]) #[[ATTR1]]
 ; CHECK-NEXT:    [[D:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(8) [[PTR]]) #[[ATTR1]]
@@ -223,7 +223,7 @@ define void @f7_1(ptr %ptr, i1 %c) {
   ; deref 4 hold
 ; FIXME: this should be %B = tail call i32 @unkown_f(ptr nonnull dereferenceable(4) %ptr)
   %B = tail call i32 @unkown_f(ptr dereferenceable(1) %ptr)
-  br i1%c, label %if.true, label %if.false
+  br i1%cnd, label %if.true, label %if.false
 if.true:
   %C = tail call i32 @unkown_f(ptr %ptr)
   %D = tail call i32 @unkown_f(ptr dereferenceable(8) %ptr)
@@ -233,15 +233,15 @@ if.false:
   ret void
 }
 
-define void @f7_2(i1 %c) {
+define void @f7_2(i1 %cnd) {
 ; CHECK: Function Attrs: mustprogress nounwind willreturn
 ; CHECK-LABEL: define {{[^@]+}}@f7_2
-; CHECK-SAME: (i1 noundef [[C:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: (i1 noundef [[CND:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[PTR:%.*]] = tail call nonnull align 4 dereferenceable(4) ptr @unkown_ptr() #[[ATTR1]]
 ; CHECK-NEXT:    [[A:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(4) [[PTR]]) #[[ATTR1]]
 ; CHECK-NEXT:    [[ARG_A_0:%.*]] = load i32, ptr [[PTR]], align 4
 ; CHECK-NEXT:    [[B:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(4) [[PTR]]) #[[ATTR1]]
-; CHECK-NEXT:    br i1 [[C]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK-NEXT:    br i1 [[CND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK:       if.true:
 ; CHECK-NEXT:    [[C:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(8) [[PTR]]) #[[ATTR1]]
 ; CHECK-NEXT:    [[D:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(8) [[PTR]]) #[[ATTR1]]
@@ -255,7 +255,7 @@ define void @f7_2(i1 %c) {
   %arg_a.0 = load i32, ptr %ptr
   ; deref 4 hold
   %B = tail call i32 @unkown_f(ptr dereferenceable(1) %ptr)
-  br i1%c, label %if.true, label %if.false
+  br i1%cnd, label %if.true, label %if.false
 if.true:
   %C = tail call i32 @unkown_f(ptr %ptr)
   %D = tail call i32 @unkown_f(ptr dereferenceable(8) %ptr)
@@ -837,5 +837,7 @@ f:
 ; CGSCC: attributes #[[ATTR9]] = { nofree willreturn memory(write) }
 ; CGSCC: attributes #[[ATTR10]] = { nounwind }
 ;.
-; CHECK: [[META0:![0-9]+]] = !{i64 10, i64 100}
+; TUNIT: [[RNG0]] = !{i64 10, i64 100}
+;.
+; CGSCC: [[RNG0]] = !{i64 10, i64 100}
 ;.
diff --git a/llvm/test/Transforms/Attributor/heap_to_stack.ll b/llvm/test/Transforms/Attributor/heap_to_stack.ll
index 4f267a7abe30..33ac066e43d0 100644
--- a/llvm/test/Transforms/Attributor/heap_to_stack.ll
+++ b/llvm/test/Transforms/Attributor/heap_to_stack.ll
@@ -30,7 +30,7 @@ declare void @free(ptr nocapture) allockind("free")
 declare void @llvm.lifetime.start.p0(i64, ptr nocapture) nounwind
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = internal global ptr undef, align 4
+; CHECK: @G = internal global ptr undef, align 4
 ;.
 define void @h2s_value_simplify_interaction(i1 %c, ptr %A) {
 ; CHECK-LABEL: define {{[^@]+}}@h2s_value_simplify_interaction
diff --git a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
index 476e65b4e465..2a5b3e94291a 100644
--- a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
+++ b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
@@ -35,8 +35,8 @@ declare void @free(ptr nocapture)
 declare void @llvm.lifetime.start.p0(i64, ptr nocapture) nounwind
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = internal global ptr undef, align 4
-; CHECK: @[[GTL:[a-zA-Z0-9_$"\\.-]+]] = internal thread_local global ptr undef, align 4
+; CHECK: @G = internal global ptr undef, align 4
+; CHECK: @Gtl = internal thread_local global ptr undef, align 4
 ;.
 define void @nofree_arg_only(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: define {{[^@]+}}@nofree_arg_only
diff --git a/llvm/test/Transforms/Attributor/liveness.ll b/llvm/test/Transforms/Attributor/liveness.ll
index d6718884ea95..f17bd5795a17 100644
--- a/llvm/test/Transforms/Attributor/liveness.ll
+++ b/llvm/test/Transforms/Attributor/liveness.ll
@@ -24,17 +24,17 @@ declare i32 @bar() nosync readnone
 ; and nothing should be deduced for it.
 
 ;.
-; TUNIT: @[[DEAD_WITH_BLOCKADDRESS_USERS_L:[a-zA-Z0-9_$"\\.-]+]] = constant [2 x ptr] [ptr inttoptr (i32 1 to ptr), ptr inttoptr (i32 1 to ptr)]
-; TUNIT: @[[A1:[a-zA-Z0-9_$"\\.-]+]] = common global i8 0, align 8
-; TUNIT: @[[A2:[a-zA-Z0-9_$"\\.-]+]] = common global i8 0, align 16
-; TUNIT: @[[E:[a-zA-Z0-9_$"\\.-]+]] = global ptr null
-; TUNIT: @[[P:[a-zA-Z0-9_$"\\.-]+]] = global i8 0
+; TUNIT: @dead_with_blockaddress_users.l = constant [2 x ptr] [ptr inttoptr (i32 1 to ptr), ptr inttoptr (i32 1 to ptr)]
+; TUNIT: @a1 = common global i8 0, align 8
+; TUNIT: @a2 = common global i8 0, align 16
+; TUNIT: @e = global ptr null
+; TUNIT: @p = global i8 0
 ;.
-; CGSCC: @[[DEAD_WITH_BLOCKADDRESS_USERS_L:[a-zA-Z0-9_$"\\.-]+]] = constant [2 x ptr] [ptr blockaddress(@dead_with_blockaddress_users, [[LAB0:%.*]]), ptr blockaddress(@dead_with_blockaddress_users, [[END:%.*]])]
-; CGSCC: @[[A1:[a-zA-Z0-9_$"\\.-]+]] = common global i8 0, align 8
-; CGSCC: @[[A2:[a-zA-Z0-9_$"\\.-]+]] = common global i8 0, align 16
-; CGSCC: @[[E:[a-zA-Z0-9_$"\\.-]+]] = global ptr null
-; CGSCC: @[[P:[a-zA-Z0-9_$"\\.-]+]] = global i8 0
+; CGSCC: @dead_with_blockaddress_users.l = constant [2 x ptr] [ptr blockaddress(@dead_with_blockaddress_users, %lab0), ptr blockaddress(@dead_with_blockaddress_users, %end)]
+; CGSCC: @a1 = common global i8 0, align 8
+; CGSCC: @a2 = common global i8 0, align 16
+; CGSCC: @e = global ptr null
+; CGSCC: @p = global i8 0
 ;.
 define internal i32 @dead_internal_func(i32 %0) {
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
@@ -306,7 +306,7 @@ define i32 @invoke_noreturn(i32 %a) personality ptr @__gxx_personality_v0 {
 ; CHECK:       cond.true:
 ; CHECK-NEXT:    call void @normal_call()
 ; CHECK-NEXT:    [[CALL:%.*]] = invoke i32 @foo_noreturn() #[[ATTR4]]
-; CHECK-NEXT:    to label [[CONTINUE:%.*]] unwind label [[CLEANUP:%.*]]
+; CHECK-NEXT:            to label [[CONTINUE:%.*]] unwind label [[CLEANUP:%.*]]
 ; CHECK:       cond.false:
 ; CHECK-NEXT:    call void @normal_call()
 ; CHECK-NEXT:    [[CALL1:%.*]] = call i32 @bar()
@@ -317,7 +317,7 @@ define i32 @invoke_noreturn(i32 %a) personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:    unreachable
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    [[RES:%.*]] = landingpad { ptr, i32 }
-; CHECK-NEXT:    catch ptr null
+; CHECK-NEXT:            catch ptr null
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
@@ -2147,15 +2147,15 @@ define void @live_with_dead_entry_lp() personality ptr @__gxx_personality_v0 {
 ; CHECK-SAME: () #[[ATTR2]] personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    invoke void @blowup() #[[ATTR4]]
-; CHECK-NEXT:    to label [[LIVE_WITH_DEAD_ENTRY_DEAD:%.*]] unwind label [[LP1:%.*]]
+; CHECK-NEXT:            to label [[LIVE_WITH_DEAD_ENTRY_DEAD:%.*]] unwind label [[LP1:%.*]]
 ; CHECK:       lp1:
 ; CHECK-NEXT:    [[LP:%.*]] = landingpad { ptr, i32 }
-; CHECK-NEXT:    catch ptr null
+; CHECK-NEXT:            catch ptr null
 ; CHECK-NEXT:    invoke void @blowup() #[[ATTR4]]
-; CHECK-NEXT:    to label [[LIVE_WITH_DEAD_ENTRY_DEAD1:%.*]] unwind label [[LP2:%.*]]
+; CHECK-NEXT:            to label [[LIVE_WITH_DEAD_ENTRY_DEAD1:%.*]] unwind label [[LP2:%.*]]
 ; CHECK:       lp2:
 ; CHECK-NEXT:    [[TMP0:%.*]] = landingpad { ptr, i32 }
-; CHECK-NEXT:    catch ptr null
+; CHECK-NEXT:            catch ptr null
 ; CHECK-NEXT:    br label [[LIVE_WITH_DEAD_ENTRY:%.*]]
 ; CHECK:       live_with_dead_entry.dead:
 ; CHECK-NEXT:    unreachable
@@ -2249,8 +2249,8 @@ define internal i32 @switch_default(i64 %i) nounwind {
 ; TUNIT-SAME: () #[[ATTR12]] {
 ; TUNIT-NEXT:  entry:
 ; TUNIT-NEXT:    switch i64 0, label [[SW_DEFAULT:%.*]] [
-; TUNIT-NEXT:    i64 3, label [[RETURN:%.*]]
-; TUNIT-NEXT:    i64 10, label [[RETURN]]
+; TUNIT-NEXT:      i64 3, label [[RETURN:%.*]]
+; TUNIT-NEXT:      i64 10, label [[RETURN]]
 ; TUNIT-NEXT:    ]
 ; TUNIT:       sw.default:
 ; TUNIT-NEXT:    call void @sink() #[[ATTR17]]
@@ -2263,8 +2263,8 @@ define internal i32 @switch_default(i64 %i) nounwind {
 ; CGSCC-SAME: () #[[ATTR14]] {
 ; CGSCC-NEXT:  entry:
 ; CGSCC-NEXT:    switch i64 0, label [[SW_DEFAULT:%.*]] [
-; CGSCC-NEXT:    i64 3, label [[RETURN:%.*]]
-; CGSCC-NEXT:    i64 10, label [[RETURN]]
+; CGSCC-NEXT:      i64 3, label [[RETURN:%.*]]
+; CGSCC-NEXT:      i64 10, label [[RETURN]]
 ; CGSCC-NEXT:    ]
 ; CGSCC:       sw.default:
 ; CGSCC-NEXT:    call void @sink() #[[ATTR19]]
@@ -2309,8 +2309,8 @@ define internal i32 @switch_default_dead(i64 %i) nounwind {
 ; CGSCC-SAME: () #[[ATTR6]] {
 ; CGSCC-NEXT:  entry:
 ; CGSCC-NEXT:    switch i64 0, label [[SW_DEFAULT:%.*]] [
-; CGSCC-NEXT:    i64 3, label [[RETURN:%.*]]
-; CGSCC-NEXT:    i64 10, label [[RETURN]]
+; CGSCC-NEXT:      i64 3, label [[RETURN:%.*]]
+; CGSCC-NEXT:      i64 10, label [[RETURN]]
 ; CGSCC-NEXT:    ]
 ; CGSCC:       sw.default:
 ; CGSCC-NEXT:    ret i32 123
diff --git a/llvm/test/Transforms/Attributor/memory_locations.ll b/llvm/test/Transforms/Attributor/memory_locations.ll
index 71d6223a986b..2dbdf9e6048c 100644
--- a/llvm/test/Transforms/Attributor/memory_locations.ll
+++ b/llvm/test/Transforms/Attributor/memory_locations.ll
@@ -8,7 +8,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 declare noalias ptr @malloc(i64) inaccessiblememonly
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external dso_local global i32, align 4
+; CHECK: @G = external dso_local global i32, align 4
 ;.
 define dso_local ptr @internal_only(i32 %arg) {
 ; CHECK: Function Attrs: memory(inaccessiblemem: readwrite)
@@ -235,16 +235,16 @@ define internal ptr @internal_argmem_only_rec_1(ptr %arg) {
 ; CHECK:       if.then:
 ; CHECK-NEXT:    br label [[RETURN:%.*]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[TRUETMP1:%.*]] = load i32, ptr [[ARG]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TRUETMP1]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARG]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP1]], 1
 ; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN2:%.*]], label [[IF_END3:%.*]]
 ; CHECK:       if.then2:
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 -1
 ; CHECK-NEXT:    [[CALL:%.*]] = call noalias ptr @internal_argmem_only_rec_2(ptr nocapture nofree noundef nonnull align 4 dereferenceable(4) [[ADD_PTR]])
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       if.end3:
-; CHECK-NEXT:    [[TRUETMP2:%.*]] = load i32, ptr [[ARG]], align 4
-; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TRUETMP2]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARG]], align 4
+; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP2]] to i64
 ; CHECK-NEXT:    [[CALL4:%.*]] = call noalias ptr @malloc(i64 [[CONV]])
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       return:
diff --git a/llvm/test/Transforms/Attributor/memory_locations_gpu.ll b/llvm/test/Transforms/Attributor/memory_locations_gpu.ll
index 70734aeedec9..2c7a98a41f86 100644
--- a/llvm/test/Transforms/Attributor/memory_locations_gpu.ll
+++ b/llvm/test/Transforms/Attributor/memory_locations_gpu.ll
@@ -12,7 +12,7 @@ declare ptr addrspace(3) @ptr_to_shared() memory(none)
 
 ; Should be memory(none)
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external dso_local addrspace(4) global i32, align 4
+; CHECK: @G = external dso_local addrspace(4) global i32, align 4
 ;.
 define i32 @test_const_as_global1() {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
diff --git a/llvm/test/Transforms/Attributor/misc_crash.ll b/llvm/test/Transforms/Attributor/misc_crash.ll
index e24af3faface..68e0e22f29cf 100644
--- a/llvm/test/Transforms/Attributor/misc_crash.ll
+++ b/llvm/test/Transforms/Attributor/misc_crash.ll
@@ -5,8 +5,8 @@
 @var2 = internal global i32 0
 
 ;.
-; CHECK: @[[VAR1:[a-zA-Z0-9_$"\\.-]+]] = internal global [1 x i32] undef
-; CHECK: @[[VAR2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0
+; CHECK: @var1 = internal global [1 x i32] undef
+; CHECK: @var2 = internal global i32 0
 ;.
 define ptr addrspace(1) @foo(ptr addrspace(4) %arg) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
@@ -124,8 +124,6 @@ define internal i16 @bar3(ptr %p1, i16 %p2) {
   ret i16 %p2
 }
 
-; CHECK-LABEL: declare {{[^@]+}}@func6
-; CHECK-SAME: (ptr)
 declare void @func6(ptr)
 ;.
 ; CHECK: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/Transforms/Attributor/new_attributes.ll b/llvm/test/Transforms/Attributor/new_attributes.ll
index 6b432fcbe63f..67b72e009da1 100644
--- a/llvm/test/Transforms/Attributor/new_attributes.ll
+++ b/llvm/test/Transforms/Attributor/new_attributes.ll
@@ -2,11 +2,6 @@
 ; RUN: opt < %s -passes=attributor -attributor-annotate-decl-cs   -S | FileCheck %s
 
 ; CHECK-NOT: Function
-; CHECK: declare i32 @foo1()
-; CHECK-NOT: Function
-; CHECK: declare i32 @foo2()
-; CHECK-NOT: Function
-; CHECK: declare i32 @foo3()
 declare i32 @foo1()
 declare i32 @foo2()
 declare i32 @foo3()
diff --git a/llvm/test/Transforms/Attributor/noalias.ll b/llvm/test/Transforms/Attributor/noalias.ll
index 00ca83d0c5d6..c63d81878f53 100644
--- a/llvm/test/Transforms/Attributor/noalias.ll
+++ b/llvm/test/Transforms/Attributor/noalias.ll
@@ -14,8 +14,8 @@
 @G = external global ptr
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global ptr
-; CHECK: @[[ALIAS_OF_P:[a-zA-Z0-9_$"\\.-]+]] = external global ptr
+; CHECK: @G = external global ptr
+; CHECK: @alias_of_p = external global ptr
 ;.
 define ptr @foo() {
 ; CHECK-LABEL: define {{[^@]+}}@foo() {
diff --git a/llvm/test/Transforms/Attributor/nocapture-1.ll b/llvm/test/Transforms/Attributor/nocapture-1.ll
index f61388f71c46..3401ddfdd7d7 100644
--- a/llvm/test/Transforms/Attributor/nocapture-1.ll
+++ b/llvm/test/Transforms/Attributor/nocapture-1.ll
@@ -5,10 +5,10 @@
 @g = global ptr null		; <ptr> [#uses=1]
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = global ptr null
-; CHECK: @[[LOOKUP_TABLE:[a-zA-Z0-9_$"\\.-]+]] = global [2 x i1] [i1 false, i1 true]
-; CHECK: @[[G2:[a-zA-Z0-9_$"\\.-]+]] = global ptr null
-; CHECK: @[[G3:[a-zA-Z0-9_$"\\.-]+]] = global ptr null
+; CHECK: @g = global ptr null
+; CHECK: @lookup_table = global [2 x i1] [i1 false, i1 true]
+; CHECK: @g2 = global ptr null
+; CHECK: @g3 = global ptr null
 ;.
 define ptr @c1(ptr %q) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
@@ -53,8 +53,8 @@ define i1 @c4(ptr %q, i32 %bitno) {
 ; CHECK-LABEL: define {{[^@]+}}@c4
 ; CHECK-SAME: (ptr nofree readnone [[Q:%.*]], i32 [[BITNO:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP:%.*]] = ptrtoint ptr [[Q]] to i32
-; CHECK-NEXT:    [[TRUETMP2:%.*]] = lshr i32 [[TMP]], [[BITNO]]
-; CHECK-NEXT:    [[BIT:%.*]] = trunc i32 [[TRUETMP2]] to i1
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP]], [[BITNO]]
+; CHECK-NEXT:    [[BIT:%.*]] = trunc i32 [[TMP2]] to i1
 ; CHECK-NEXT:    br i1 [[BIT]], label [[L1:%.*]], label [[L0:%.*]]
 ; CHECK:       l0:
 ; CHECK-NEXT:    ret i1 false
@@ -77,8 +77,8 @@ define i1 @c4b(ptr %q, i32 %bitno) {
 ; CHECK-LABEL: define {{[^@]+}}@c4b
 ; CHECK-SAME: (ptr nocapture nofree readnone [[Q:%.*]], i32 [[BITNO:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP:%.*]] = ptrtoint ptr [[Q]] to i32
-; CHECK-NEXT:    [[TRUETMP2:%.*]] = lshr i32 [[TMP]], [[BITNO]]
-; CHECK-NEXT:    [[BIT:%.*]] = trunc i32 [[TRUETMP2]] to i1
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP]], [[BITNO]]
+; CHECK-NEXT:    [[BIT:%.*]] = trunc i32 [[TMP2]] to i1
 ; CHECK-NEXT:    br i1 [[BIT]], label [[L1:%.*]], label [[L0:%.*]]
 ; CHECK:       l0:
 ; CHECK-NEXT:    ret i1 false
@@ -102,8 +102,8 @@ define i1 @c5(ptr %q, i32 %bitno) {
 ; TUNIT-LABEL: define {{[^@]+}}@c5
 ; TUNIT-SAME: (ptr nofree readonly [[Q:%.*]], i32 [[BITNO:%.*]]) #[[ATTR2:[0-9]+]] {
 ; TUNIT-NEXT:    [[TMP:%.*]] = ptrtoint ptr [[Q]] to i32
-; TUNIT-NEXT:    [[TRUETMP2:%.*]] = lshr i32 [[TMP]], [[BITNO]]
-; TUNIT-NEXT:    [[BIT:%.*]] = and i32 [[TRUETMP2]], 1
+; TUNIT-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP]], [[BITNO]]
+; TUNIT-NEXT:    [[BIT:%.*]] = and i32 [[TMP2]], 1
 ; TUNIT-NEXT:    [[LOOKUP:%.*]] = getelementptr [2 x i1], ptr @lookup_table, i32 0, i32 [[BIT]]
 ; TUNIT-NEXT:    [[VAL:%.*]] = load i1, ptr [[LOOKUP]], align 1
 ; TUNIT-NEXT:    ret i1 [[VAL]]
@@ -112,8 +112,8 @@ define i1 @c5(ptr %q, i32 %bitno) {
 ; CGSCC-LABEL: define {{[^@]+}}@c5
 ; CGSCC-SAME: (ptr nofree readonly [[Q:%.*]], i32 [[BITNO:%.*]]) #[[ATTR3:[0-9]+]] {
 ; CGSCC-NEXT:    [[TMP:%.*]] = ptrtoint ptr [[Q]] to i32
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = lshr i32 [[TMP]], [[BITNO]]
-; CGSCC-NEXT:    [[BIT:%.*]] = and i32 [[TRUETMP2]], 1
+; CGSCC-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP]], [[BITNO]]
+; CGSCC-NEXT:    [[BIT:%.*]] = and i32 [[TMP2]], 1
 ; CGSCC-NEXT:    [[LOOKUP:%.*]] = getelementptr [2 x i1], ptr @lookup_table, i32 0, i32 [[BIT]]
 ; CGSCC-NEXT:    [[VAL:%.*]] = load i1, ptr [[LOOKUP]], align 1
 ; CGSCC-NEXT:    ret i1 [[VAL]]
@@ -134,24 +134,24 @@ define i1 @c6(ptr %q, i8 %bit) personality ptr @__gxx_personality_v0 {
 ; TUNIT-LABEL: define {{[^@]+}}@c6
 ; TUNIT-SAME: (ptr readonly [[Q:%.*]], i8 [[BIT:%.*]]) #[[ATTR4:[0-9]+]] personality ptr @__gxx_personality_v0 {
 ; TUNIT-NEXT:    invoke void @throw_if_bit_set(ptr readonly [[Q]], i8 [[BIT]]) #[[ATTR4]]
-; TUNIT-NEXT:    to label [[RET0:%.*]] unwind label [[RET1:%.*]]
+; TUNIT-NEXT:            to label [[RET0:%.*]] unwind label [[RET1:%.*]]
 ; TUNIT:       ret0:
 ; TUNIT-NEXT:    ret i1 false
 ; TUNIT:       ret1:
 ; TUNIT-NEXT:    [[EXN:%.*]] = landingpad { ptr, i32 }
-; TUNIT-NEXT:    cleanup
+; TUNIT-NEXT:            cleanup
 ; TUNIT-NEXT:    ret i1 true
 ;
 ; CGSCC: Function Attrs: nosync memory(read)
 ; CGSCC-LABEL: define {{[^@]+}}@c6
 ; CGSCC-SAME: (ptr readonly [[Q:%.*]], i8 [[BIT:%.*]]) #[[ATTR5:[0-9]+]] personality ptr @__gxx_personality_v0 {
 ; CGSCC-NEXT:    invoke void @throw_if_bit_set(ptr readonly [[Q]], i8 [[BIT]]) #[[ATTR5]]
-; CGSCC-NEXT:    to label [[RET0:%.*]] unwind label [[RET1:%.*]]
+; CGSCC-NEXT:            to label [[RET0:%.*]] unwind label [[RET1:%.*]]
 ; CGSCC:       ret0:
 ; CGSCC-NEXT:    ret i1 false
 ; CGSCC:       ret1:
 ; CGSCC-NEXT:    [[EXN:%.*]] = landingpad { ptr, i32 }
-; CGSCC-NEXT:    cleanup
+; CGSCC-NEXT:            cleanup
 ; CGSCC-NEXT:    ret i1 true
 ;
   invoke void @throw_if_bit_set(ptr %q, i8 %bit)
@@ -171,8 +171,8 @@ define ptr @lookup_bit(ptr %q, i32 %bitno) readnone nounwind {
 ; CHECK-LABEL: define {{[^@]+}}@lookup_bit
 ; CHECK-SAME: (ptr nofree readnone [[Q:%.*]], i32 [[BITNO:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP:%.*]] = ptrtoint ptr [[Q]] to i32
-; CHECK-NEXT:    [[TRUETMP2:%.*]] = lshr i32 [[TMP]], [[BITNO]]
-; CHECK-NEXT:    [[BIT:%.*]] = and i32 [[TRUETMP2]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP]], [[BITNO]]
+; CHECK-NEXT:    [[BIT:%.*]] = and i32 [[TMP2]], 1
 ; CHECK-NEXT:    [[LOOKUP:%.*]] = getelementptr [2 x i1], ptr @lookup_table, i32 0, i32 [[BIT]]
 ; CHECK-NEXT:    ret ptr [[LOOKUP]]
 ;
@@ -213,8 +213,8 @@ define i32 @nc1(ptr %q, ptr %p, i1 %b) {
 ; TUNIT:       l:
 ; TUNIT-NEXT:    [[X:%.*]] = phi ptr [ [[P]], [[E:%.*]] ]
 ; TUNIT-NEXT:    [[Y:%.*]] = phi ptr [ [[Q]], [[E]] ]
-; TUNIT-NEXT:    [[TRUETMP2:%.*]] = select i1 [[B]], ptr [[P]], ptr [[Q]]
-; TUNIT-NEXT:    [[VAL:%.*]] = load i32, ptr [[TRUETMP2]], align 4
+; TUNIT-NEXT:    [[TMP2:%.*]] = select i1 [[B]], ptr [[P]], ptr [[Q]]
+; TUNIT-NEXT:    [[VAL:%.*]] = load i32, ptr [[TMP2]], align 4
 ; TUNIT-NEXT:    store i32 0, ptr [[P]], align 4
 ; TUNIT-NEXT:    store ptr [[Q]], ptr @g, align 8
 ; TUNIT-NEXT:    ret i32 [[VAL]]
@@ -227,8 +227,8 @@ define i32 @nc1(ptr %q, ptr %p, i1 %b) {
 ; CGSCC:       l:
 ; CGSCC-NEXT:    [[X:%.*]] = phi ptr [ [[P]], [[E:%.*]] ]
 ; CGSCC-NEXT:    [[Y:%.*]] = phi ptr [ [[Q]], [[E]] ]
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = select i1 [[B]], ptr [[P]], ptr [[Q]]
-; CGSCC-NEXT:    [[VAL:%.*]] = load i32, ptr [[TRUETMP2]], align 4
+; CGSCC-NEXT:    [[TMP2:%.*]] = select i1 [[B]], ptr [[P]], ptr [[Q]]
+; CGSCC-NEXT:    [[VAL:%.*]] = load i32, ptr [[TMP2]], align 4
 ; CGSCC-NEXT:    store i32 0, ptr [[P]], align 4
 ; CGSCC-NEXT:    store ptr [[Q]], ptr @g, align 8
 ; CGSCC-NEXT:    ret i32 [[VAL]]
@@ -255,8 +255,8 @@ define i32 @nc1_addrspace(ptr %q, ptr addrspace(1) %p, i1 %b) {
 ; TUNIT-NEXT:    [[X:%.*]] = phi ptr addrspace(1) [ [[P]], [[E:%.*]] ]
 ; TUNIT-NEXT:    [[Y:%.*]] = phi ptr [ [[Q]], [[E]] ]
 ; TUNIT-NEXT:    [[TMP:%.*]] = addrspacecast ptr addrspace(1) [[P]] to ptr
-; TUNIT-NEXT:    [[TRUETMP2:%.*]] = select i1 [[B]], ptr [[TMP]], ptr [[Q]]
-; TUNIT-NEXT:    [[VAL:%.*]] = load i32, ptr [[TRUETMP2]], align 4
+; TUNIT-NEXT:    [[TMP2:%.*]] = select i1 [[B]], ptr [[TMP]], ptr [[Q]]
+; TUNIT-NEXT:    [[VAL:%.*]] = load i32, ptr [[TMP2]], align 4
 ; TUNIT-NEXT:    store i32 0, ptr addrspace(1) [[P]], align 4
 ; TUNIT-NEXT:    store ptr [[Q]], ptr @g, align 8
 ; TUNIT-NEXT:    ret i32 [[VAL]]
@@ -270,8 +270,8 @@ define i32 @nc1_addrspace(ptr %q, ptr addrspace(1) %p, i1 %b) {
 ; CGSCC-NEXT:    [[X:%.*]] = phi ptr addrspace(1) [ [[P]], [[E:%.*]] ]
 ; CGSCC-NEXT:    [[Y:%.*]] = phi ptr [ [[Q]], [[E]] ]
 ; CGSCC-NEXT:    [[TMP:%.*]] = addrspacecast ptr addrspace(1) [[P]] to ptr
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = select i1 [[B]], ptr [[TMP]], ptr [[Q]]
-; CGSCC-NEXT:    [[VAL:%.*]] = load i32, ptr [[TRUETMP2]], align 4
+; CGSCC-NEXT:    [[TMP2:%.*]] = select i1 [[B]], ptr [[TMP]], ptr [[Q]]
+; CGSCC-NEXT:    [[VAL:%.*]] = load i32, ptr [[TMP2]], align 4
 ; CGSCC-NEXT:    store i32 0, ptr addrspace(1) [[P]], align 4
 ; CGSCC-NEXT:    store ptr [[Q]], ptr @g, align 8
 ; CGSCC-NEXT:    ret i32 [[VAL]]
diff --git a/llvm/test/Transforms/Attributor/nofpclass.ll b/llvm/test/Transforms/Attributor/nofpclass.ll
index d2d11e0276c4..5945fc5e7b0b 100644
--- a/llvm/test/Transforms/Attributor/nofpclass.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass.ll
@@ -1542,9 +1542,9 @@ define <4 x float> @insertelement_non_constant_chain(i32 %idx) {
 ; CHECK-NEXT:    [[INS_0:%.*]] = insertelement <4 x float> poison, float 1.000000e+00, i32 0
 ; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <4 x float> [[INS_0]], float 0.000000e+00, i32 1
 ; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <4 x float> [[INS_1]], float -9.000000e+00, i32 2
-; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <4 x float> [[INS_2]], float 3.000000e+00, i32 3
-; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <4 x float> [[INS_2]], float 4.000000e+00, i32 [[IDX]]
-; CHECK-NEXT:    ret <4 x float> [[INS_3]]
+; CHECK-NEXT:    [[INS_3:%.*]] = insertelement <4 x float> [[INS_2]], float 3.000000e+00, i32 3
+; CHECK-NEXT:    [[INS_4:%.*]] = insertelement <4 x float> [[INS_2]], float 4.000000e+00, i32 [[IDX]]
+; CHECK-NEXT:    ret <4 x float> [[INS_4]]
 ;
   %ins.0 = insertelement <4 x float> poison, float 1.0, i32 0
   %ins.1 = insertelement <4 x float> %ins.0, float 0.0, i32 1
diff --git a/llvm/test/Transforms/Attributor/nofree.ll b/llvm/test/Transforms/Attributor/nofree.ll
index eafb16823fdf..8f1f02ebeba0 100644
--- a/llvm/test/Transforms/Attributor/nofree.ll
+++ b/llvm/test/Transforms/Attributor/nofree.ll
@@ -183,7 +183,6 @@ define noalias ptr @call_realloc(ptr nocapture %0, i64 %1) local_unnamed_addr #0
 
 
 ; CHECK: Function Attrs:  nofree noinline nounwind memory(none) uwtable
-; CHECK-NEXT: declare void @nofree_function()
 declare void @nofree_function() nofree readnone #0
 
 define void @call_nofree_function() #0 {
@@ -206,7 +205,6 @@ define void @call_nofree_function() #0 {
 
 
 ; CHECK: Function Attrs: noinline nounwind uwtable
-; CHECK-NEXT: declare void @maybe_free()
 declare void @maybe_free() #0
 
 
@@ -241,7 +239,6 @@ define void @call_both() #0 {
 ; TEST 10 (positive case)
 ; Call intrinsic function
 ; CHECK: Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-; CHECK-NEXT: declare float @llvm.floor.f32(float)
 declare float @llvm.floor.f32(float)
 
 define void @call_floor(float %a) #0 {
diff --git a/llvm/test/Transforms/Attributor/nonnull.ll b/llvm/test/Transforms/Attributor/nonnull.ll
index 42d81792c2f4..24e60a0ae8cd 100644
--- a/llvm/test/Transforms/Attributor/nonnull.ll
+++ b/llvm/test/Transforms/Attributor/nonnull.ll
@@ -397,20 +397,20 @@ define internal ptr @f1(ptr %arg) {
 ; TUNIT-NEXT:    [[TMP:%.*]] = icmp eq ptr [[ARG]], null
 ; TUNIT-NEXT:    br i1 [[TMP]], label [[BB9:%.*]], label [[BB1:%.*]]
 ; TUNIT:       bb1:
-; TUNIT-NEXT:    [[TRUETMP2:%.*]] = load i32, ptr [[ARG]], align 4
-; TUNIT-NEXT:    [[TRUETMP3:%.*]] = icmp eq i32 [[TRUETMP2]], 0
-; TUNIT-NEXT:    br i1 [[TRUETMP3]], label [[BB6:%.*]], label [[BB4:%.*]]
+; TUNIT-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARG]], align 4
+; TUNIT-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0
+; TUNIT-NEXT:    br i1 [[TMP3]], label [[BB6:%.*]], label [[BB4:%.*]]
 ; TUNIT:       bb4:
-; TUNIT-NEXT:    [[TRUETMP5:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 1
-; TUNIT-NEXT:    [[TMP5B:%.*]] = tail call ptr @f3(ptr nofree nonnull readonly [[TRUETMP5]]) #[[ATTR16:[0-9]+]]
+; TUNIT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 1
+; TUNIT-NEXT:    [[TMP5B:%.*]] = tail call ptr @f3(ptr nofree nonnull readonly [[TMP5]]) #[[ATTR16:[0-9]+]]
 ; TUNIT-NEXT:    [[TMP5C:%.*]] = getelementptr inbounds i32, ptr [[TMP5B]], i64 -1
 ; TUNIT-NEXT:    br label [[BB9]]
 ; TUNIT:       bb6:
-; TUNIT-NEXT:    [[TRUETMP7:%.*]] = tail call ptr @f2(ptr nofree nonnull readonly align 4 dereferenceable(4) [[ARG]]) #[[ATTR16]]
-; TUNIT-NEXT:    ret ptr [[TRUETMP7]]
+; TUNIT-NEXT:    [[TMP7:%.*]] = tail call ptr @f2(ptr nofree nonnull readonly align 4 dereferenceable(4) [[ARG]]) #[[ATTR16]]
+; TUNIT-NEXT:    ret ptr [[TMP7]]
 ; TUNIT:       bb9:
-; TUNIT-NEXT:    [[TRUETMP10:%.*]] = phi ptr [ [[TMP5C]], [[BB4]] ], [ inttoptr (i64 4 to ptr), [[BB:%.*]] ]
-; TUNIT-NEXT:    ret ptr [[TRUETMP10]]
+; TUNIT-NEXT:    [[TMP10:%.*]] = phi ptr [ [[TMP5C]], [[BB4]] ], [ inttoptr (i64 4 to ptr), [[BB:%.*]] ]
+; TUNIT-NEXT:    ret ptr [[TMP10]]
 ;
 ; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: read)
 ; CGSCC-LABEL: define {{[^@]+}}@f1
@@ -419,20 +419,20 @@ define internal ptr @f1(ptr %arg) {
 ; CGSCC-NEXT:    [[TMP:%.*]] = icmp eq ptr [[ARG]], null
 ; CGSCC-NEXT:    br i1 [[TMP]], label [[BB9:%.*]], label [[BB1:%.*]]
 ; CGSCC:       bb1:
-; CGSCC-NEXT:    [[TRUETMP2:%.*]] = load i32, ptr [[ARG]], align 4
-; CGSCC-NEXT:    [[TRUETMP3:%.*]] = icmp eq i32 [[TRUETMP2]], 0
-; CGSCC-NEXT:    br i1 [[TRUETMP3]], label [[BB6:%.*]], label [[BB4:%.*]]
+; CGSCC-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARG]], align 4
+; CGSCC-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0
+; CGSCC-NEXT:    br i1 [[TMP3]], label [[BB6:%.*]], label [[BB4:%.*]]
 ; CGSCC:       bb4:
-; CGSCC-NEXT:    [[TRUETMP5:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 1
-; CGSCC-NEXT:    [[TMP5B:%.*]] = tail call ptr @f3(ptr nofree nonnull readonly [[TRUETMP5]]) #[[ATTR16:[0-9]+]]
+; CGSCC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 1
+; CGSCC-NEXT:    [[TMP5B:%.*]] = tail call ptr @f3(ptr nofree nonnull readonly [[TMP5]]) #[[ATTR16:[0-9]+]]
 ; CGSCC-NEXT:    [[TMP5C:%.*]] = getelementptr inbounds i32, ptr [[TMP5B]], i64 -1
 ; CGSCC-NEXT:    br label [[BB9]]
 ; CGSCC:       bb6:
-; CGSCC-NEXT:    [[TRUETMP7:%.*]] = tail call ptr @f2(ptr nofree nonnull readonly align 4 dereferenceable(4) [[ARG]]) #[[ATTR16]]
-; CGSCC-NEXT:    ret ptr [[TRUETMP7]]
+; CGSCC-NEXT:    [[TMP7:%.*]] = tail call ptr @f2(ptr nofree nonnull readonly align 4 dereferenceable(4) [[ARG]]) #[[ATTR16]]
+; CGSCC-NEXT:    ret ptr [[TMP7]]
 ; CGSCC:       bb9:
-; CGSCC-NEXT:    [[TRUETMP10:%.*]] = phi ptr [ [[TMP5C]], [[BB4]] ], [ inttoptr (i64 4 to ptr), [[BB:%.*]] ]
-; CGSCC-NEXT:    ret ptr [[TRUETMP10]]
+; CGSCC-NEXT:    [[TMP10:%.*]] = phi ptr [ [[TMP5C]], [[BB4]] ], [ inttoptr (i64 4 to ptr), [[BB:%.*]] ]
+; CGSCC-NEXT:    ret ptr [[TMP10]]
 ;
 
 bb:
@@ -877,12 +877,12 @@ define i1 @parent8(ptr %a, ptr %bogus1, ptr %b) personality ptr @esfp{
 ; TUNIT-SAME: (ptr nonnull [[A:%.*]], ptr nocapture nofree readnone [[BOGUS1:%.*]], ptr nonnull [[B:%.*]]) #[[ATTR5]] personality ptr @esfp {
 ; TUNIT-NEXT:  entry:
 ; TUNIT-NEXT:    invoke void @use2nonnull(ptr nonnull [[A]], ptr nonnull [[B]])
-; TUNIT-NEXT:    to label [[CONT:%.*]] unwind label [[EXC:%.*]]
+; TUNIT-NEXT:            to label [[CONT:%.*]] unwind label [[EXC:%.*]]
 ; TUNIT:       cont:
 ; TUNIT-NEXT:    ret i1 false
 ; TUNIT:       exc:
 ; TUNIT-NEXT:    [[LP:%.*]] = landingpad { ptr, i32 }
-; TUNIT-NEXT:    filter [0 x ptr] zeroinitializer
+; TUNIT-NEXT:            filter [0 x ptr] zeroinitializer
 ; TUNIT-NEXT:    unreachable
 ;
 ; CGSCC: Function Attrs: nounwind
@@ -890,12 +890,12 @@ define i1 @parent8(ptr %a, ptr %bogus1, ptr %b) personality ptr @esfp{
 ; CGSCC-SAME: (ptr nonnull [[A:%.*]], ptr nocapture nofree readnone [[BOGUS1:%.*]], ptr nonnull [[B:%.*]]) #[[ATTR4]] personality ptr @esfp {
 ; CGSCC-NEXT:  entry:
 ; CGSCC-NEXT:    invoke void @use2nonnull(ptr nonnull [[A]], ptr nonnull [[B]])
-; CGSCC-NEXT:    to label [[CONT:%.*]] unwind label [[EXC:%.*]]
+; CGSCC-NEXT:            to label [[CONT:%.*]] unwind label [[EXC:%.*]]
 ; CGSCC:       cont:
 ; CGSCC-NEXT:    ret i1 false
 ; CGSCC:       exc:
 ; CGSCC-NEXT:    [[LP:%.*]] = landingpad { ptr, i32 }
-; CGSCC-NEXT:    filter [0 x ptr] zeroinitializer
+; CGSCC-NEXT:            filter [0 x ptr] zeroinitializer
 ; CGSCC-NEXT:    unreachable
 ;
 
@@ -963,7 +963,6 @@ define ptr addrspace(3) @as(ptr addrspace(3) dereferenceable(4) %p) {
   ret ptr addrspace(3) %p
 }
 
-; CHECK-NOT: @g2()
 define internal ptr @g2() {
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
 ; CGSCC-LABEL: define {{[^@]+}}@g2
@@ -1100,33 +1099,33 @@ define i32 @nonnull_exec_ctx_1(ptr %a, i32 %b) {
 ; TUNIT-LABEL: define {{[^@]+}}@nonnull_exec_ctx_1
 ; TUNIT-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] {
 ; TUNIT-NEXT:  en:
-; TUNIT-NEXT:    [[TRUETMP3:%.*]] = icmp eq i32 [[B]], 0
-; TUNIT-NEXT:    br i1 [[TRUETMP3]], label [[EX:%.*]], label [[HD:%.*]]
+; TUNIT-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[B]], 0
+; TUNIT-NEXT:    br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
 ; TUNIT:       ex:
-; TUNIT-NEXT:    [[TRUETMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR7]]
-; TUNIT-NEXT:    ret i32 [[TRUETMP5]]
+; TUNIT-NEXT:    [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR7]]
+; TUNIT-NEXT:    ret i32 [[TMP5]]
 ; TUNIT:       hd:
-; TUNIT-NEXT:    [[TRUETMP7:%.*]] = phi i32 [ [[TRUETMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ]
+; TUNIT-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ]
 ; TUNIT-NEXT:    tail call void @h(ptr [[A]]) #[[ATTR7]]
-; TUNIT-NEXT:    [[TRUETMP8]] = add nuw i32 [[TRUETMP7]], 1
-; TUNIT-NEXT:    [[TRUETMP9:%.*]] = icmp eq i32 [[TRUETMP8]], [[B]]
-; TUNIT-NEXT:    br i1 [[TRUETMP9]], label [[EX]], label [[HD]]
+; TUNIT-NEXT:    [[TMP8]] = add nuw i32 [[TMP7]], 1
+; TUNIT-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
+; TUNIT-NEXT:    br i1 [[TMP9]], label [[EX]], label [[HD]]
 ;
 ; CGSCC: Function Attrs: mustprogress nounwind willreturn
 ; CGSCC-LABEL: define {{[^@]+}}@nonnull_exec_ctx_1
 ; CGSCC-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] {
 ; CGSCC-NEXT:  en:
-; CGSCC-NEXT:    [[TRUETMP3:%.*]] = icmp eq i32 [[B]], 0
-; CGSCC-NEXT:    br i1 [[TRUETMP3]], label [[EX:%.*]], label [[HD:%.*]]
+; CGSCC-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[B]], 0
+; CGSCC-NEXT:    br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
 ; CGSCC:       ex:
-; CGSCC-NEXT:    [[TRUETMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR6]]
-; CGSCC-NEXT:    ret i32 [[TRUETMP5]]
+; CGSCC-NEXT:    [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR6]]
+; CGSCC-NEXT:    ret i32 [[TMP5]]
 ; CGSCC:       hd:
-; CGSCC-NEXT:    [[TRUETMP7:%.*]] = phi i32 [ [[TRUETMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ]
+; CGSCC-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ]
 ; CGSCC-NEXT:    tail call void @h(ptr [[A]]) #[[ATTR6]]
-; CGSCC-NEXT:    [[TRUETMP8]] = add nuw i32 [[TRUETMP7]], 1
-; CGSCC-NEXT:    [[TRUETMP9:%.*]] = icmp eq i32 [[TRUETMP8]], [[B]]
-; CGSCC-NEXT:    br i1 [[TRUETMP9]], label [[EX]], label [[HD]]
+; CGSCC-NEXT:    [[TMP8]] = add nuw i32 [[TMP7]], 1
+; CGSCC-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
+; CGSCC-NEXT:    br i1 [[TMP9]], label [[EX]], label [[HD]]
 ;
 en:
   %tmp3 = icmp eq i32 %b, 0
@@ -1150,37 +1149,37 @@ define i32 @nonnull_exec_ctx_1b(ptr %a, i32 %b) {
 ; TUNIT-LABEL: define {{[^@]+}}@nonnull_exec_ctx_1b
 ; TUNIT-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] {
 ; TUNIT-NEXT:  en:
-; TUNIT-NEXT:    [[TRUETMP3:%.*]] = icmp eq i32 [[B]], 0
-; TUNIT-NEXT:    br i1 [[TRUETMP3]], label [[EX:%.*]], label [[HD:%.*]]
+; TUNIT-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[B]], 0
+; TUNIT-NEXT:    br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
 ; TUNIT:       ex:
-; TUNIT-NEXT:    [[TRUETMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR7]]
-; TUNIT-NEXT:    ret i32 [[TRUETMP5]]
+; TUNIT-NEXT:    [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR7]]
+; TUNIT-NEXT:    ret i32 [[TMP5]]
 ; TUNIT:       hd:
-; TUNIT-NEXT:    [[TRUETMP7:%.*]] = phi i32 [ [[TRUETMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ]
+; TUNIT-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ]
 ; TUNIT-NEXT:    tail call void @h(ptr [[A]]) #[[ATTR7]]
 ; TUNIT-NEXT:    br label [[HD2]]
 ; TUNIT:       hd2:
-; TUNIT-NEXT:    [[TRUETMP8]] = add nuw i32 [[TRUETMP7]], 1
-; TUNIT-NEXT:    [[TRUETMP9:%.*]] = icmp eq i32 [[TRUETMP8]], [[B]]
-; TUNIT-NEXT:    br i1 [[TRUETMP9]], label [[EX]], label [[HD]]
+; TUNIT-NEXT:    [[TMP8]] = add nuw i32 [[TMP7]], 1
+; TUNIT-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
+; TUNIT-NEXT:    br i1 [[TMP9]], label [[EX]], label [[HD]]
 ;
 ; CGSCC: Function Attrs: mustprogress nounwind willreturn
 ; CGSCC-LABEL: define {{[^@]+}}@nonnull_exec_ctx_1b
 ; CGSCC-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] {
 ; CGSCC-NEXT:  en:
-; CGSCC-NEXT:    [[TRUETMP3:%.*]] = icmp eq i32 [[B]], 0
-; CGSCC-NEXT:    br i1 [[TRUETMP3]], label [[EX:%.*]], label [[HD:%.*]]
+; CGSCC-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[B]], 0
+; CGSCC-NEXT:    br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
 ; CGSCC:       ex:
-; CGSCC-NEXT:    [[TRUETMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR6]]
-; CGSCC-NEXT:    ret i32 [[TRUETMP5]]
+; CGSCC-NEXT:    [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR6]]
+; CGSCC-NEXT:    ret i32 [[TMP5]]
 ; CGSCC:       hd:
-; CGSCC-NEXT:    [[TRUETMP7:%.*]] = phi i32 [ [[TRUETMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ]
+; CGSCC-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ]
 ; CGSCC-NEXT:    tail call void @h(ptr [[A]]) #[[ATTR6]]
 ; CGSCC-NEXT:    br label [[HD2]]
 ; CGSCC:       hd2:
-; CGSCC-NEXT:    [[TRUETMP8]] = add nuw i32 [[TRUETMP7]], 1
-; CGSCC-NEXT:    [[TRUETMP9:%.*]] = icmp eq i32 [[TRUETMP8]], [[B]]
-; CGSCC-NEXT:    br i1 [[TRUETMP9]], label [[EX]], label [[HD]]
+; CGSCC-NEXT:    [[TMP8]] = add nuw i32 [[TMP7]], 1
+; CGSCC-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
+; CGSCC-NEXT:    br i1 [[TMP9]], label [[EX]], label [[HD]]
 ;
 en:
   %tmp3 = icmp eq i32 %b, 0
@@ -1207,33 +1206,33 @@ define i32 @nonnull_exec_ctx_2(ptr %a, i32 %b) willreturn nounwind {
 ; TUNIT-LABEL: define {{[^@]+}}@nonnull_exec_ctx_2
 ; TUNIT-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] {
 ; TUNIT-NEXT:  en:
-; TUNIT-NEXT:    [[TRUETMP3:%.*]] = icmp eq i32 [[B]], 0
-; TUNIT-NEXT:    br i1 [[TRUETMP3]], label [[EX:%.*]], label [[HD:%.*]]
+; TUNIT-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[B]], 0
+; TUNIT-NEXT:    br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
 ; TUNIT:       ex:
-; TUNIT-NEXT:    [[TRUETMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR5]]
-; TUNIT-NEXT:    ret i32 [[TRUETMP5]]
+; TUNIT-NEXT:    [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR5]]
+; TUNIT-NEXT:    ret i32 [[TMP5]]
 ; TUNIT:       hd:
-; TUNIT-NEXT:    [[TRUETMP7:%.*]] = phi i32 [ [[TRUETMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ]
+; TUNIT-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ]
 ; TUNIT-NEXT:    tail call void @h(ptr nonnull [[A]]) #[[ATTR5]]
-; TUNIT-NEXT:    [[TRUETMP8]] = add nuw i32 [[TRUETMP7]], 1
-; TUNIT-NEXT:    [[TRUETMP9:%.*]] = icmp eq i32 [[TRUETMP8]], [[B]]
-; TUNIT-NEXT:    br i1 [[TRUETMP9]], label [[EX]], label [[HD]]
+; TUNIT-NEXT:    [[TMP8]] = add nuw i32 [[TMP7]], 1
+; TUNIT-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
+; TUNIT-NEXT:    br i1 [[TMP9]], label [[EX]], label [[HD]]
 ;
 ; CGSCC: Function Attrs: mustprogress nounwind willreturn
 ; CGSCC-LABEL: define {{[^@]+}}@nonnull_exec_ctx_2
 ; CGSCC-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] {
 ; CGSCC-NEXT:  en:
-; CGSCC-NEXT:    [[TRUETMP3:%.*]] = icmp eq i32 [[B]], 0
-; CGSCC-NEXT:    br i1 [[TRUETMP3]], label [[EX:%.*]], label [[HD:%.*]]
+; CGSCC-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[B]], 0
+; CGSCC-NEXT:    br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
 ; CGSCC:       ex:
-; CGSCC-NEXT:    [[TRUETMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR4]]
-; CGSCC-NEXT:    ret i32 [[TRUETMP5]]
+; CGSCC-NEXT:    [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR4]]
+; CGSCC-NEXT:    ret i32 [[TMP5]]
 ; CGSCC:       hd:
-; CGSCC-NEXT:    [[TRUETMP7:%.*]] = phi i32 [ [[TRUETMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ]
+; CGSCC-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ]
 ; CGSCC-NEXT:    tail call void @h(ptr nonnull [[A]]) #[[ATTR4]]
-; CGSCC-NEXT:    [[TRUETMP8]] = add nuw i32 [[TRUETMP7]], 1
-; CGSCC-NEXT:    [[TRUETMP9:%.*]] = icmp eq i32 [[TRUETMP8]], [[B]]
-; CGSCC-NEXT:    br i1 [[TRUETMP9]], label [[EX]], label [[HD]]
+; CGSCC-NEXT:    [[TMP8]] = add nuw i32 [[TMP7]], 1
+; CGSCC-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
+; CGSCC-NEXT:    br i1 [[TMP9]], label [[EX]], label [[HD]]
 ;
 en:
   %tmp3 = icmp eq i32 %b, 0
@@ -1257,37 +1256,37 @@ define i32 @nonnull_exec_ctx_2b(ptr %a, i32 %b) willreturn nounwind {
 ; TUNIT-LABEL: define {{[^@]+}}@nonnull_exec_ctx_2b
 ; TUNIT-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] {
 ; TUNIT-NEXT:  en:
-; TUNIT-NEXT:    [[TRUETMP3:%.*]] = icmp eq i32 [[B]], 0
-; TUNIT-NEXT:    br i1 [[TRUETMP3]], label [[EX:%.*]], label [[HD:%.*]]
+; TUNIT-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[B]], 0
+; TUNIT-NEXT:    br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
 ; TUNIT:       ex:
-; TUNIT-NEXT:    [[TRUETMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR5]]
-; TUNIT-NEXT:    ret i32 [[TRUETMP5]]
+; TUNIT-NEXT:    [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR5]]
+; TUNIT-NEXT:    ret i32 [[TMP5]]
 ; TUNIT:       hd:
-; TUNIT-NEXT:    [[TRUETMP7:%.*]] = phi i32 [ [[TRUETMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ]
+; TUNIT-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ]
 ; TUNIT-NEXT:    tail call void @h(ptr nonnull [[A]]) #[[ATTR5]]
 ; TUNIT-NEXT:    br label [[HD2]]
 ; TUNIT:       hd2:
-; TUNIT-NEXT:    [[TRUETMP8]] = add nuw i32 [[TRUETMP7]], 1
-; TUNIT-NEXT:    [[TRUETMP9:%.*]] = icmp eq i32 [[TRUETMP8]], [[B]]
-; TUNIT-NEXT:    br i1 [[TRUETMP9]], label [[EX]], label [[HD]]
+; TUNIT-NEXT:    [[TMP8]] = add nuw i32 [[TMP7]], 1
+; TUNIT-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
+; TUNIT-NEXT:    br i1 [[TMP9]], label [[EX]], label [[HD]]
 ;
 ; CGSCC: Function Attrs: mustprogress nounwind willreturn
 ; CGSCC-LABEL: define {{[^@]+}}@nonnull_exec_ctx_2b
 ; CGSCC-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] {
 ; CGSCC-NEXT:  en:
-; CGSCC-NEXT:    [[TRUETMP3:%.*]] = icmp eq i32 [[B]], 0
-; CGSCC-NEXT:    br i1 [[TRUETMP3]], label [[EX:%.*]], label [[HD:%.*]]
+; CGSCC-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[B]], 0
+; CGSCC-NEXT:    br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
 ; CGSCC:       ex:
-; CGSCC-NEXT:    [[TRUETMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR4]]
-; CGSCC-NEXT:    ret i32 [[TRUETMP5]]
+; CGSCC-NEXT:    [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR4]]
+; CGSCC-NEXT:    ret i32 [[TMP5]]
 ; CGSCC:       hd:
-; CGSCC-NEXT:    [[TRUETMP7:%.*]] = phi i32 [ [[TRUETMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ]
+; CGSCC-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ]
 ; CGSCC-NEXT:    tail call void @h(ptr nonnull [[A]]) #[[ATTR4]]
 ; CGSCC-NEXT:    br label [[HD2]]
 ; CGSCC:       hd2:
-; CGSCC-NEXT:    [[TRUETMP8]] = add nuw i32 [[TRUETMP7]], 1
-; CGSCC-NEXT:    [[TRUETMP9:%.*]] = icmp eq i32 [[TRUETMP8]], [[B]]
-; CGSCC-NEXT:    br i1 [[TRUETMP9]], label [[EX]], label [[HD]]
+; CGSCC-NEXT:    [[TMP8]] = add nuw i32 [[TMP7]], 1
+; CGSCC-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
+; CGSCC-NEXT:    br i1 [[TMP9]], label [[EX]], label [[HD]]
 ;
 en:
   %tmp3 = icmp eq i32 %b, 0
diff --git a/llvm/test/Transforms/Attributor/norecurse.ll b/llvm/test/Transforms/Attributor/norecurse.ll
index b0034e465018..f139f193f10d 100644
--- a/llvm/test/Transforms/Attributor/norecurse.ll
+++ b/llvm/test/Transforms/Attributor/norecurse.ll
@@ -62,7 +62,6 @@ define i32 @extern() {
 }
 
 ; CHECK: Function Attrs
-; CHECK-NEXT: declare i32 @k()
 declare i32 @k() readnone
 
 define void @intrinsic(ptr %dest, ptr %src, i32 %len) {
@@ -77,7 +76,6 @@ define void @intrinsic(ptr %dest, ptr %src, i32 %len) {
 }
 
 ; CHECK: Function Attrs
-; CHECK-NEXT: declare void @llvm.memcpy.p0.p0.i32
 declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
 
 define internal i32 @called_by_norecurse() {
@@ -307,15 +305,27 @@ f:
 }
 
 ;.
-; CHECK: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
-; CHECK: attributes #[[ATTR1]] = { mustprogress nofree nosync nounwind willreturn memory(none) }
-; CHECK: attributes #[[ATTR2]] = { nosync memory(none) }
-; CHECK: attributes #[[ATTR3:[0-9]+]] = { memory(none) }
-; CHECK: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; CHECK: attributes #[[ATTR6]] = { norecurse nosync memory(none) }
-; CHECK: attributes #[[ATTR7]] = { null_pointer_is_valid }
-; CHECK: attributes #[[ATTR8:[0-9]+]] = { norecurse }
-; CHECK: attributes #[[ATTR9]] = { nosync }
-; CHECK: attributes #[[ATTR10]] = { nofree willreturn }
+; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+; TUNIT: attributes #[[ATTR1]] = { mustprogress nofree nosync nounwind willreturn memory(none) }
+; TUNIT: attributes #[[ATTR2]] = { nosync memory(none) }
+; TUNIT: attributes #[[ATTR3:[0-9]+]] = { memory(none) }
+; TUNIT: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
+; TUNIT: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+; TUNIT: attributes #[[ATTR6]] = { norecurse nosync memory(none) }
+; TUNIT: attributes #[[ATTR7]] = { null_pointer_is_valid }
+; TUNIT: attributes #[[ATTR8]] = { norecurse }
+; TUNIT: attributes #[[ATTR9]] = { nosync }
+; TUNIT: attributes #[[ATTR10]] = { nofree willreturn }
+;.
+; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree nosync nounwind willreturn memory(none) }
+; CGSCC: attributes #[[ATTR2]] = { nosync memory(none) }
+; CGSCC: attributes #[[ATTR3:[0-9]+]] = { memory(none) }
+; CGSCC: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
+; CGSCC: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+; CGSCC: attributes #[[ATTR6]] = { norecurse nosync memory(none) }
+; CGSCC: attributes #[[ATTR7]] = { null_pointer_is_valid }
+; CGSCC: attributes #[[ATTR8]] = { norecurse }
+; CGSCC: attributes #[[ATTR9]] = { nosync }
+; CGSCC: attributes #[[ATTR10]] = { nofree willreturn }
 ;.
diff --git a/llvm/test/Transforms/Attributor/noreturn_async.ll b/llvm/test/Transforms/Attributor/noreturn_async.ll
index c554fe658f49..45681bb1c5ed 100644
--- a/llvm/test/Transforms/Attributor/noreturn_async.ll
+++ b/llvm/test/Transforms/Attributor/noreturn_async.ll
@@ -26,9 +26,6 @@ entry:
 ; CHECK:      Function Attrs: noreturn
 ; CHECK-NEXT: define
 ; CHECK-NEXT:   entry:
-; CHECK-NEXT:   {{.*}}@printf{{.*}}
-; CHECK-NEXT:   call void @"?overflow@@YAXXZ"()
-; CHECK-NEXT:   unreachable
   %call2 = call i32 (ptr, ...) @printf(ptr @"??_C@_0BC@NKPAGFFJ@Exception?5caught?6?$AA@") nofree nosync nounwind
   call void @"?overflow@@YAXXZ"()
   %call3 = call i32 (ptr, ...) @printf(ptr @"??_C@_0BC@NKPAGFFJ@Exception?5caught?6?$AA@")
@@ -41,13 +38,10 @@ b:
 ; CHECK-NOT:    nounwind
 ; CHECK-NOT:    noreturn
 ; CHECK:        define
-; CHECK-SAME:   @"?catchoverflow@@YAHXZ"()
 define dso_local i32 @"?catchoverflow@@YAHXZ"()  personality ptr @__C_specific_handler {
 entry:
   %retval = alloca i32, align 4
   %__exception_code = alloca i32, align 4
-; CHECK: invoke void @"?overflow@@YAXXZ"()
-; CHECK:          to label %invoke.cont unwind label %catch.dispatch
   invoke void @"?overflow@@YAXXZ"()
   to label %invoke.cont unwind label %catch.dispatch
 
@@ -88,9 +82,6 @@ entry:
 ; CHECK-NOT:  nounwind
 ; CHECK-NEXT: define
 ; CHECK-NEXT:   entry:
-; CHECK-NEXT:   %call3 = call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(18) @"??_C@_0BC@NKPAGFFJ@Exception?5caught?6?$AA@")
-; CHECK-NEXT:   call void @"?overflow@@YAXXZ_may_throw"()
-; CHECK-NEXT:   unreachable
   %call3 = call i32 (ptr, ...) @printf(ptr @"??_C@_0BC@NKPAGFFJ@Exception?5caught?6?$AA@")
   call void @"?overflow@@YAXXZ_may_throw"()
   br label %b
@@ -102,13 +93,10 @@ b:
 ; CHECK-NOT:    nounwind
 ; CHECK-NOT:    noreturn
 ; CHECK:        define
-; CHECK-SAME:   @"?catchoverflow@@YAHXZ_may_throw"()
 define dso_local i32 @"?catchoverflow@@YAHXZ_may_throw"()  personality ptr @__C_specific_handler {
 entry:
   %retval = alloca i32, align 4
   %__exception_code = alloca i32, align 4
-; CHECK: invoke void @"?overflow@@YAXXZ_may_throw"()
-; CHECK:          to label %invoke.cont unwind label %catch.dispatch
   invoke void @"?overflow@@YAXXZ_may_throw"()
   to label %invoke.cont unwind label %catch.dispatch
 
diff --git a/llvm/test/Transforms/Attributor/noreturn_sync.ll b/llvm/test/Transforms/Attributor/noreturn_sync.ll
index 5bfa67fd0857..9dcc97d76d5d 100644
--- a/llvm/test/Transforms/Attributor/noreturn_sync.ll
+++ b/llvm/test/Transforms/Attributor/noreturn_sync.ll
@@ -26,10 +26,6 @@ entry:
 ; CHECK-NOT:  Function Attrs:
 ; CHECK:      define
 ; CHECK-NEXT:   entry:
-; CHECK-NEXT:   {{.*}}@printf{{.*}}
-; CHECK-NEXT:   call void @"?overflow@@YAXXZ"()
-; CHECK-NEXT:   {{.*}}@printf{{.*}}
-; CHECK-NEXT:   ret void
   %call2 = call i32 (ptr, ...) @printf(ptr @"??_C@_0BC@NKPAGFFJ@Exception?5caught?6?$AA@") nofree nosync nounwind
   call void @"?overflow@@YAXXZ"()
   %call3 = call i32 (ptr, ...) @printf(ptr @"??_C@_0BC@NKPAGFFJ@Exception?5caught?6?$AA@")
@@ -38,15 +34,12 @@ entry:
 
 
 ; CHECK-NOT:       Function Attrs:
-; CHECK:   @"?catchoverflow@@YAHXZ"()
 define dso_local i32 @"?catchoverflow@@YAHXZ"()  personality ptr @__gcc_personality_v0 {
 entry:
   %retval = alloca i32, align 4
   %__exception_code = alloca i32, align 4
   invoke void @"?overflow@@YAXXZ"()
   to label %invoke.cont unwind label %catch.dispatch
-; CHECK:      invoke void @"?overflow@@YAXXZ"()
-; CHECK-NEXT:        to label %invoke.cont unwind label %catch.dispatch
 
 invoke.cont:                                      ; preds = %entry
   br label %invoke.cont1
@@ -82,9 +75,6 @@ entry:
 ; CHECK-NOT:      Function Attrs:
 ; CHECK:      define
 ; CHECK-NEXT:   entry:
-; CHECK-NEXT:   %call3 = call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(18) @"??_C@_0BC@NKPAGFFJ@Exception?5caught?6?$AA@")
-; CHECK-NEXT:   call void @"?overflow@@YAXXZ_may_throw"()
-; CHECK-NEXT:   ret void
   %call3 = call i32 (ptr, ...) @printf(ptr @"??_C@_0BC@NKPAGFFJ@Exception?5caught?6?$AA@")
   call void @"?overflow@@YAXXZ_may_throw"()
   ret void
@@ -93,13 +83,10 @@ entry:
 
 ; CHECK-NOT:    Function Attrs:
 ; CHECK:        define
-; CHECK-SAME:   @"?catchoverflow@@YAHXZ_may_throw"()
 define dso_local i32 @"?catchoverflow@@YAHXZ_may_throw"()  personality ptr @__gcc_personality_v0 {
 entry:
   %retval = alloca i32, align 4
   %__exception_code = alloca i32, align 4
-; CHECK: invoke void @"?overflow@@YAXXZ_may_throw"()
-; CHECK:          to label %invoke.cont unwind label %catch.dispatch
   invoke void @"?overflow@@YAXXZ_may_throw"()
   to label %invoke.cont unwind label %catch.dispatch
 
diff --git a/llvm/test/Transforms/Attributor/nosync.ll b/llvm/test/Transforms/Attributor/nosync.ll
index 505966694380..63b55d9e220a 100644
--- a/llvm/test/Transforms/Attributor/nosync.ll
+++ b/llvm/test/Transforms/Attributor/nosync.ll
@@ -27,7 +27,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 %struct.ST = type { i32, double, %struct.RT }
 
 ;.
-; CHECK: @[[A:[a-zA-Z0-9_$"\\.-]+]] = common global i32 0, align 4
+; CHECK: @a = common global i32 0, align 4
 ;.
 define ptr @foo(ptr %s) nounwind optsize ssp memory(none) uwtable {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind optsize ssp willreturn memory(none) uwtable
@@ -164,7 +164,6 @@ define i32 @volatile_load(ptr %arg) norecurse nounwind uwtable {
 ; TEST 9
 
 ; CHECK: Function Attrs: noinline nosync nounwind uwtable
-; CHECK-NEXT: declare void @nosync_function()
 declare void @nosync_function() noinline nounwind uwtable nosync
 
 define void @call_nosync_function() noinline nounwind uwtable {
@@ -181,7 +180,6 @@ define void @call_nosync_function() noinline nounwind uwtable {
 ; TEST 10 - negative, should not deduce nosync
 
 ; CHECK: Function Attrs: noinline nounwind uwtable
-; CHECK-NEXT: declare void @might_sync()
 declare void @might_sync() noinline nounwind uwtable
 
 define void @call_might_sync() noinline nounwind uwtable {
@@ -386,7 +384,6 @@ define void @convergent_readnone() {
 }
 
 ; CHECK: Function Attrs: nounwind
-; CHECK-NEXT: declare void @llvm.x86.sse2.clflush(ptr)
 declare void @llvm.x86.sse2.clflush(ptr)
 @a = common global i32 0, align 4
 
diff --git a/llvm/test/Transforms/Attributor/noundef.ll b/llvm/test/Transforms/Attributor/noundef.ll
index 608d9efe7349..67dcf2680c64 100644
--- a/llvm/test/Transforms/Attributor/noundef.ll
+++ b/llvm/test/Transforms/Attributor/noundef.ll
@@ -149,6 +149,9 @@ declare !callback !0 void @callback_broker(ptr, ptr)
 !1 = !{i64 0, i64 1, i1 false}
 !0 = !{!1}
 ;.
-; CHECK: [[META0:![0-9]+]] = !{!1}
-; CHECK: [[META1:![0-9]+]] = !{i64 0, i64 1, i1 false}
+; TUNIT: [[META0:![0-9]+]] = !{[[META1:![0-9]+]]}
+; TUNIT: [[META1]] = !{i64 0, i64 1, i1 false}
+;.
+; CGSCC: [[META0:![0-9]+]] = !{[[META1:![0-9]+]]}
+; CGSCC: [[META1]] = !{i64 0, i64 1, i1 false}
 ;.
diff --git a/llvm/test/Transforms/Attributor/nounwind.ll b/llvm/test/Transforms/Attributor/nounwind.ll
index 147ba3e6a72e..8ea812ce42a8 100644
--- a/llvm/test/Transforms/Attributor/nounwind.ll
+++ b/llvm/test/Transforms/Attributor/nounwind.ll
@@ -111,12 +111,12 @@ declare void @__cxa_rethrow()
 define i32 @catch_thing() personality ptr @__gxx_personality_v0 {
 ; CHECK-LABEL: define {{[^@]+}}@catch_thing() personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:    invoke void @__cxa_rethrow()
-; CHECK-NEXT:    to label [[TMP1:%.*]] unwind label [[TMP2:%.*]]
+; CHECK-NEXT:            to label [[TMP1:%.*]] unwind label [[TMP2:%.*]]
 ; CHECK:       1:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       2:
 ; CHECK-NEXT:    [[TMP3:%.*]] = landingpad { ptr, i32 }
-; CHECK-NEXT:    catch ptr null
+; CHECK-NEXT:            catch ptr null
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP4]])
 ; CHECK-NEXT:    tail call void @__cxa_end_catch()
diff --git a/llvm/test/Transforms/Attributor/openmp_parallel.ll b/llvm/test/Transforms/Attributor/openmp_parallel.ll
index f4867ad60ae5..02636ab926dd 100644
--- a/llvm/test/Transforms/Attributor/openmp_parallel.ll
+++ b/llvm/test/Transforms/Attributor/openmp_parallel.ll
@@ -15,8 +15,8 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 
 ;.
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 514, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
 ;.
 define dso_local void @func(ptr nocapture %a, ptr %b, i32 %N) local_unnamed_addr #0 {
 ; TUNIT: Function Attrs: nounwind uwtable
@@ -250,14 +250,23 @@ attributes #2 = { nounwind }
 !1 = !{!2}
 !2 = !{i64 2, i64 -1, i64 -1, i1 true}
 ;.
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind uwtable }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { alwaysinline nofree norecurse nounwind uwtable }
-; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
-; CHECK: attributes #[[ATTR3:[0-9]+]] = { memory(readwrite) }
+; TUNIT: attributes #[[ATTR0]] = { nounwind uwtable }
+; TUNIT: attributes #[[ATTR1]] = { alwaysinline nofree norecurse nounwind uwtable }
+; TUNIT: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+; TUNIT: attributes #[[ATTR3]] = { memory(readwrite) }
 ;.
-; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
-; CHECK: [[META1:![0-9]+]] = !{!2}
-; CHECK: [[META2:![0-9]+]] = !{i64 2, i64 -1, i64 -1, i1 true}
+; CGSCC: attributes #[[ATTR0]] = { nounwind uwtable }
+; CGSCC: attributes #[[ATTR1]] = { alwaysinline nofree norecurse nounwind uwtable }
+; CGSCC: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+; CGSCC: attributes #[[ATTR3]] = { memory(readwrite) }
+;.
+; TUNIT: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; TUNIT: [[META1:![0-9]+]] = !{[[META2:![0-9]+]]}
+; TUNIT: [[META2]] = !{i64 2, i64 -1, i64 -1, i1 true}
+;.
+; CGSCC: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; CGSCC: [[META1:![0-9]+]] = !{[[META2:![0-9]+]]}
+; CGSCC: [[META2]] = !{i64 2, i64 -1, i64 -1, i1 true}
 ;.
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/Attributor/potential.ll b/llvm/test/Transforms/Attributor/potential.ll
index 02b139e8377a..11652f0f7b71 100644
--- a/llvm/test/Transforms/Attributor/potential.ll
+++ b/llvm/test/Transforms/Attributor/potential.ll
@@ -387,8 +387,8 @@ define internal i32 @may_return_undef(i32 %c) {
 ; CGSCC-LABEL: define {{[^@]+}}@may_return_undef
 ; CGSCC-SAME: (i32 noundef [[C:%.*]]) #[[ATTR0]] {
 ; CGSCC-NEXT:    switch i32 [[C]], label [[OTHERWISE:%.*]] [
-; CGSCC-NEXT:    i32 1, label [[A:%.*]]
-; CGSCC-NEXT:    i32 -1, label [[B:%.*]]
+; CGSCC-NEXT:      i32 1, label [[A:%.*]]
+; CGSCC-NEXT:      i32 -1, label [[B:%.*]]
 ; CGSCC-NEXT:    ]
 ; CGSCC:       a:
 ; CGSCC-NEXT:    ret i32 1
diff --git a/llvm/test/Transforms/Attributor/range.ll b/llvm/test/Transforms/Attributor/range.ll
index 50887ebfb751..9b2f9ed2dde9 100644
--- a/llvm/test/Transforms/Attributor/range.ll
+++ b/llvm/test/Transforms/Attributor/range.ll
@@ -722,8 +722,8 @@ define dso_local zeroext i1 @phi(i32 %arg) {
 ; TUNIT:       bb2:
 ; TUNIT-NEXT:    br label [[BB3]]
 ; TUNIT:       bb3:
-; TUNIT-NEXT:    [[TRUETMP4:%.*]] = icmp sgt i32 [[ARG]], 10
-; TUNIT-NEXT:    br i1 [[TRUETMP4]], label [[BB5:%.*]], label [[BB7:%.*]]
+; TUNIT-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[ARG]], 10
+; TUNIT-NEXT:    br i1 [[TMP4]], label [[BB5:%.*]], label [[BB7:%.*]]
 ; TUNIT:       bb5:
 ; TUNIT-NEXT:    br label [[BB9:%.*]]
 ; TUNIT:       bb7:
@@ -748,8 +748,8 @@ define dso_local zeroext i1 @phi(i32 %arg) {
 ; CGSCC:       bb2:
 ; CGSCC-NEXT:    br label [[BB3]]
 ; CGSCC:       bb3:
-; CGSCC-NEXT:    [[TRUETMP4:%.*]] = icmp sgt i32 [[ARG]], 10
-; CGSCC-NEXT:    br i1 [[TRUETMP4]], label [[BB5:%.*]], label [[BB7:%.*]]
+; CGSCC-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[ARG]], 10
+; CGSCC-NEXT:    br i1 [[TMP4]], label [[BB5:%.*]], label [[BB7:%.*]]
 ; CGSCC:       bb5:
 ; CGSCC-NEXT:    br label [[BB9:%.*]]
 ; CGSCC:       bb7:
@@ -1383,13 +1383,13 @@ define internal i32 @less_than_100_1(i32 %c) {
 ; CGSCC-LABEL: define {{[^@]+}}@less_than_100_1
 ; CGSCC-SAME: (i32 noundef [[C:%.*]]) #[[ATTR2]] {
 ; CGSCC-NEXT:    switch i32 [[C]], label [[OTHERWISE:%.*]] [
-; CGSCC-NEXT:    i32 0, label [[ONZERO:%.*]]
-; CGSCC-NEXT:    i32 1, label [[ONONE:%.*]]
-; CGSCC-NEXT:    i32 2, label [[ONTWO:%.*]]
-; CGSCC-NEXT:    i32 3, label [[ONTHREE:%.*]]
-; CGSCC-NEXT:    i32 4, label [[ONFOUR:%.*]]
-; CGSCC-NEXT:    i32 5, label [[ONFIVE:%.*]]
-; CGSCC-NEXT:    i32 6, label [[ONSIX:%.*]]
+; CGSCC-NEXT:      i32 0, label [[ONZERO:%.*]]
+; CGSCC-NEXT:      i32 1, label [[ONONE:%.*]]
+; CGSCC-NEXT:      i32 2, label [[ONTWO:%.*]]
+; CGSCC-NEXT:      i32 3, label [[ONTHREE:%.*]]
+; CGSCC-NEXT:      i32 4, label [[ONFOUR:%.*]]
+; CGSCC-NEXT:      i32 5, label [[ONFIVE:%.*]]
+; CGSCC-NEXT:      i32 6, label [[ONSIX:%.*]]
 ; CGSCC-NEXT:    ]
 ; CGSCC:       onzero:
 ; CGSCC-NEXT:    ret i32 0
@@ -1468,13 +1468,13 @@ define internal i32 @less_than_100_2(i32 %c) {
 ; TUNIT-LABEL: define {{[^@]+}}@less_than_100_2
 ; TUNIT-SAME: (i32 noundef [[C:%.*]]) #[[ATTR1]] {
 ; TUNIT-NEXT:    switch i32 [[C]], label [[OTHERWISE:%.*]] [
-; TUNIT-NEXT:    i32 0, label [[ONZERO:%.*]]
-; TUNIT-NEXT:    i32 1, label [[ONONE:%.*]]
-; TUNIT-NEXT:    i32 2, label [[ONTWO:%.*]]
-; TUNIT-NEXT:    i32 3, label [[ONTHREE:%.*]]
-; TUNIT-NEXT:    i32 4, label [[ONFOUR:%.*]]
-; TUNIT-NEXT:    i32 5, label [[ONFIVE:%.*]]
-; TUNIT-NEXT:    i32 6, label [[ONSIX:%.*]]
+; TUNIT-NEXT:      i32 0, label [[ONZERO:%.*]]
+; TUNIT-NEXT:      i32 1, label [[ONONE:%.*]]
+; TUNIT-NEXT:      i32 2, label [[ONTWO:%.*]]
+; TUNIT-NEXT:      i32 3, label [[ONTHREE:%.*]]
+; TUNIT-NEXT:      i32 4, label [[ONFOUR:%.*]]
+; TUNIT-NEXT:      i32 5, label [[ONFIVE:%.*]]
+; TUNIT-NEXT:      i32 6, label [[ONSIX:%.*]]
 ; TUNIT-NEXT:    ]
 ; TUNIT:       onzero:
 ; TUNIT-NEXT:    ret i32 0
@@ -1497,13 +1497,13 @@ define internal i32 @less_than_100_2(i32 %c) {
 ; CGSCC-LABEL: define {{[^@]+}}@less_than_100_2
 ; CGSCC-SAME: (i32 noundef [[C:%.*]]) #[[ATTR2]] {
 ; CGSCC-NEXT:    switch i32 [[C]], label [[OTHERWISE:%.*]] [
-; CGSCC-NEXT:    i32 0, label [[ONZERO:%.*]]
-; CGSCC-NEXT:    i32 1, label [[ONONE:%.*]]
-; CGSCC-NEXT:    i32 2, label [[ONTWO:%.*]]
-; CGSCC-NEXT:    i32 3, label [[ONTHREE:%.*]]
-; CGSCC-NEXT:    i32 4, label [[ONFOUR:%.*]]
-; CGSCC-NEXT:    i32 5, label [[ONFIVE:%.*]]
-; CGSCC-NEXT:    i32 6, label [[ONSIX:%.*]]
+; CGSCC-NEXT:      i32 0, label [[ONZERO:%.*]]
+; CGSCC-NEXT:      i32 1, label [[ONONE:%.*]]
+; CGSCC-NEXT:      i32 2, label [[ONTWO:%.*]]
+; CGSCC-NEXT:      i32 3, label [[ONTHREE:%.*]]
+; CGSCC-NEXT:      i32 4, label [[ONFOUR:%.*]]
+; CGSCC-NEXT:      i32 5, label [[ONFIVE:%.*]]
+; CGSCC-NEXT:      i32 6, label [[ONSIX:%.*]]
 ; CGSCC-NEXT:    ]
 ; CGSCC:       onzero:
 ; CGSCC-NEXT:    ret i32 0
@@ -1652,8 +1652,8 @@ define void @spam(ptr %arg, ptr %arg1) {
 ; CHECK-SAME: (ptr nocapture nofree noundef nonnull readonly align 8 dereferenceable(4) [[ARG:%.*]], ptr nocapture nofree readnone [[ARG1:%.*]]) {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr [[ARG]], align 8
-; CHECK-NEXT:    [[TRUETMP2:%.*]] = icmp ult i32 [[TMP]], 4
-; CHECK-NEXT:    br i1 [[TRUETMP2]], label [[BB3:%.*]], label [[BB4:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 [[TMP]], 4
+; CHECK-NEXT:    br i1 [[TMP2]], label [[BB3:%.*]], label [[BB4:%.*]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    call fastcc void @wobble(i32 signext [[TMP]])
 ; CHECK-NEXT:    br label [[BB5:%.*]]
diff --git a/llvm/test/Transforms/Attributor/readattrs.ll b/llvm/test/Transforms/Attributor/readattrs.ll
index 429068b47b02..54afdebb3a89 100644
--- a/llvm/test/Transforms/Attributor/readattrs.ll
+++ b/llvm/test/Transforms/Attributor/readattrs.ll
@@ -10,8 +10,8 @@ declare void @test1_1(ptr %x1_1, ptr readonly %y1_1, ...)
 ; NOTE: readonly for %y1_2 would be OK here but not for the similar situation in test13.
 ;
 ;.
-; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = global i32 0
-; CHECK: @[[CONSTANT_MEM:[a-zA-Z0-9_$"\\.-]+]] = external dso_local constant i32, align 4
+; CHECK: @x = global i32 0
+; CHECK: @constant_mem = external dso_local constant i32, align 4
 ;.
 define void @test1_2(ptr %x1_2, ptr %y1_2, ptr %z1_2) {
 ; CHECK-LABEL: define {{[^@]+}}@test1_2
@@ -129,7 +129,6 @@ entry:
   ret void
 }
 
-; CHECK: declare void @llvm.masked.scatter
 declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>%val, <4 x ptr>, i32, <4 x i1>)
 
 ; CHECK-NOT: readnone
@@ -151,7 +150,6 @@ define void @test9(<4 x ptr> %ptrs, <4 x i32>%val) {
   ret void
 }
 
-; CHECK: declare <4 x i32> @llvm.masked.gather
 declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
 define <4 x i32> @test10(<4 x ptr> %ptrs) {
 ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read)
@@ -170,7 +168,6 @@ define <4 x i32> @test10(<4 x ptr> %ptrs) {
   ret <4 x i32> %res
 }
 
-; CHECK: declare <4 x i32> @test11_1
 declare <4 x i32> @test11_1(<4 x ptr>) argmemonly nounwind readonly
 define <4 x i32> @test11_2(<4 x ptr> %ptrs) {
 ; TUNIT: Function Attrs: nosync nounwind memory(argmem: read)
diff --git a/llvm/test/Transforms/Attributor/reduced/aa_execution_domain_wrong_fn.ll b/llvm/test/Transforms/Attributor/reduced/aa_execution_domain_wrong_fn.ll
index 8baee3f74d30..e8b72b2c218e 100644
--- a/llvm/test/Transforms/Attributor/reduced/aa_execution_domain_wrong_fn.ll
+++ b/llvm/test/Transforms/Attributor/reduced/aa_execution_domain_wrong_fn.ll
@@ -26,7 +26,7 @@ define internal i1 @__kmpc_kernel_parallel() {
 
 !0 = !{i32 7, !"openmp", i32 50}
 ;.
-; CHECK: @[[_ZN4OMPX5STATE9TEAMSTATEE:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global %"struct.ompx::state::TeamStateTy" undef
+; CHECK: @_ZN4ompx5state9TeamStateE = internal addrspace(3) global %"struct.ompx::state::TeamStateTy" undef
 ;.
 ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_16_1d1156__Z38test_target_teams_distribute__parallelv_l16() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @__kmpc_target_init(ptr null)
diff --git a/llvm/test/Transforms/Attributor/reduced/missed_cached_entry_for_intra_reachability.ll b/llvm/test/Transforms/Attributor/reduced/missed_cached_entry_for_intra_reachability.ll
index b730dff03ed1..188939db5074 100644
--- a/llvm/test/Transforms/Attributor/reduced/missed_cached_entry_for_intra_reachability.ll
+++ b/llvm/test/Transforms/Attributor/reduced/missed_cached_entry_for_intra_reachability.ll
@@ -10,7 +10,7 @@
 @random = external global i1, align 4
 
 ;.
-; CHECK: @[[RANDOM:[a-zA-Z0-9_$"\\.-]+]] = external global i1, align 4
+; CHECK: @random = external global i1, align 4
 ;.
 define void @widget(ptr %arg1, float %arg2, i64 %idx1, i64 %idx2, i32 %limit) {
 ; CHECK: Function Attrs: nofree norecurse nounwind
diff --git a/llvm/test/Transforms/Attributor/reduced/openmp_opt_global_read.ll b/llvm/test/Transforms/Attributor/reduced/openmp_opt_global_read.ll
index 565ac31055f1..8e342910139d 100644
--- a/llvm/test/Transforms/Attributor/reduced/openmp_opt_global_read.ll
+++ b/llvm/test/Transforms/Attributor/reduced/openmp_opt_global_read.ll
@@ -21,7 +21,7 @@ attributes #0 = { nocallback nofree nosync nounwind willreturn memory(inaccessib
 !0 = !{i32 7, !"openmp", i32 50}
 !1 = !{i32 7, !"openmp-device", i32 50}
 ;.
-; CHECK: @[[ISSPMDMODE:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef
+; CHECK: @IsSPMDMode = internal addrspace(3) global i32 undef
 ;.
 ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_16_2e1d69__ZN11qmcplusplus7ompBLAS9gemv_implIfEEiRiciiT_PKS3_iS5_iS3_PS3_i_l44() {
 ; CHECK-NEXT:  bb:
diff --git a/llvm/test/Transforms/Attributor/reduced/openmp_opt_global_synced.ll b/llvm/test/Transforms/Attributor/reduced/openmp_opt_global_synced.ll
index 26d05d108f40..f958041fb3e5 100644
--- a/llvm/test/Transforms/Attributor/reduced/openmp_opt_global_synced.ll
+++ b/llvm/test/Transforms/Attributor/reduced/openmp_opt_global_synced.ll
@@ -34,7 +34,7 @@ attributes #1 = { "kernel" }
 !0 = !{i32 7, !"openmp", i32 50}
 !1 = !{i32 7, !"openmp-device", i32 50}
 ;.
-; CHECK: @[[_ZN4OMPX5STATE9TEAMSTATEE:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global ptr undef
+; CHECK: @_ZN4ompx5state9TeamStateE = internal addrspace(3) global ptr undef
 ;.
 ; CHECK: Function Attrs: norecurse nosync nounwind memory(read)
 ; CHECK-LABEL: define {{[^@]+}}@__kmpc_kernel_parallel
diff --git a/llvm/test/Transforms/Attributor/reduced/pred_iterator_crash.ll b/llvm/test/Transforms/Attributor/reduced/pred_iterator_crash.ll
index 04e4f577179b..89fdefa41b5e 100644
--- a/llvm/test/Transforms/Attributor/reduced/pred_iterator_crash.ll
+++ b/llvm/test/Transforms/Attributor/reduced/pred_iterator_crash.ll
@@ -11,7 +11,7 @@
 declare void @llvm.assume(i1 noundef) #0
 
 ;.
-; CHECK: @[[_ZN4OMPX5STATE9TEAMSTATEE:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global %"struct.ompx::state::TeamStateTy" undef
+; CHECK: @_ZN4ompx5state9TeamStateE = internal addrspace(3) global %"struct.ompx::state::TeamStateTy" undef
 ;.
 define weak_odr amdgpu_kernel void @__omp_offloading_16_19bc70bc_main_l44() {
 ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_16_19bc70bc_main_l44() {
diff --git a/llvm/test/Transforms/Attributor/returned.ll b/llvm/test/Transforms/Attributor/returned.ll
index 74064dae5034..e94cb9506969 100644
--- a/llvm/test/Transforms/Attributor/returned.ll
+++ b/llvm/test/Transforms/Attributor/returned.ll
@@ -36,9 +36,9 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8
-; CHECK: @[[_ZTI1X:[a-zA-Z0-9_$"\\.-]+]] = external dso_local constant { ptr, ptr }, align 8
-; CHECK: @[[_ZTI1Y:[a-zA-Z0-9_$"\\.-]+]] = external dso_local constant { ptr, ptr, ptr }, align 8
+; CHECK: @G = external global i8
+; CHECK: @_ZTI1X = external dso_local constant { ptr, ptr }, align 8
+; CHECK: @_ZTI1Y = external dso_local constant { ptr, ptr, ptr }, align 8
 ;.
 define i32 @sink_r0(i32 %r) #0 {
 ; CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable
diff --git a/llvm/test/Transforms/Attributor/undefined_behavior.ll b/llvm/test/Transforms/Attributor/undefined_behavior.ll
index 7ecd4ce33d49..4ca6ab2c7343 100644
--- a/llvm/test/Transforms/Attributor/undefined_behavior.ll
+++ b/llvm/test/Transforms/Attributor/undefined_behavior.ll
@@ -864,8 +864,8 @@ define nonnull ptr @returned_nonnnull(i32 %c) {
 ; CHECK-LABEL: define {{[^@]+}}@returned_nonnnull
 ; CHECK-SAME: (i32 noundef [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    switch i32 [[C]], label [[ONDEFAULT:%.*]] [
-; CHECK-NEXT:    i32 0, label [[ONZERO:%.*]]
-; CHECK-NEXT:    i32 1, label [[ONONE:%.*]]
+; CHECK-NEXT:      i32 0, label [[ONZERO:%.*]]
+; CHECK-NEXT:      i32 1, label [[ONONE:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       onzero:
 ; CHECK-NEXT:    [[PTR:%.*]] = alloca i32, align 4
@@ -891,8 +891,8 @@ define noundef ptr @returned_noundef(i32 %c) {
 ; CHECK-LABEL: define {{[^@]+}}@returned_noundef
 ; CHECK-SAME: (i32 noundef [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    switch i32 [[C]], label [[ONDEFAULT:%.*]] [
-; CHECK-NEXT:    i32 0, label [[ONZERO:%.*]]
-; CHECK-NEXT:    i32 1, label [[ONONE:%.*]]
+; CHECK-NEXT:      i32 0, label [[ONZERO:%.*]]
+; CHECK-NEXT:      i32 1, label [[ONONE:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       onzero:
 ; CHECK-NEXT:    [[PTR:%.*]] = alloca i32, align 4
@@ -918,8 +918,8 @@ define nonnull noundef ptr @returned_nonnnull_noundef(i32 %c) {
 ; CHECK-LABEL: define {{[^@]+}}@returned_nonnnull_noundef
 ; CHECK-SAME: (i32 noundef [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    switch i32 [[C]], label [[ONDEFAULT:%.*]] [
-; CHECK-NEXT:    i32 0, label [[ONZERO:%.*]]
-; CHECK-NEXT:    i32 1, label [[ONONE:%.*]]
+; CHECK-NEXT:      i32 0, label [[ONZERO:%.*]]
+; CHECK-NEXT:      i32 1, label [[ONONE:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       onzero:
 ; CHECK-NEXT:    [[PTR:%.*]] = alloca i32, align 4
diff --git a/llvm/test/Transforms/Attributor/value-simplify-assume.ll b/llvm/test/Transforms/Attributor/value-simplify-assume.ll
index 0ccad5cf21e2..b01a43e3ec75 100644
--- a/llvm/test/Transforms/Attributor/value-simplify-assume.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-assume.ll
@@ -10,8 +10,8 @@ declare void @useI1p(ptr)
 declare void @unknown()
 
 ;.
-; CHECK: @[[GSTATIC_INT1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0, align 4
-; CHECK: @[[GSTATIC_INT2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0, align 4
+; CHECK: @Gstatic_int1 = internal global i32 0, align 4
+; CHECK: @Gstatic_int2 = internal global i32 0, align 4
 ;.
 define i1 @readI1p(ptr %p) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read)
diff --git a/llvm/test/Transforms/Attributor/value-simplify-dbg.ll b/llvm/test/Transforms/Attributor/value-simplify-dbg.ll
index 4f55b055b3ad..5a57ed61ded3 100644
--- a/llvm/test/Transforms/Attributor/value-simplify-dbg.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-dbg.ll
@@ -5,7 +5,7 @@
 @G = internal global i32 undef, align 4, !dbg !0
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4, !dbg [[DBG0:![0-9]+]]
+; CHECK: @G = internal global i32 undef, align 4, !dbg [[META0:![0-9]+]]
 ;.
 define void @dest() !dbg !15 {
 ; CHECK-LABEL: define {{[^@]+}}@dest
@@ -77,13 +77,13 @@ declare i32 @speculatable() speculatable readnone
 ; CHECK: attributes #[[ATTR1:[0-9]+]] = { speculatable memory(none) }
 ; CHECK: attributes #[[ATTR2]] = { nosync }
 ;.
-; CHECK: [[DBG0]] = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
-; CHECK: [[META1:![0-9]+]] = distinct !DIGlobalVariable(name: "G", scope: !2, file: !5, line: 1, type: !6, isLocal: true, isDefinition: true)
-; CHECK: [[META2:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git ef94609d6ebe981767788e6877b0b3b731d425af)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
-; CHECK: [[META3:![0-9]+]] = !DIFile(filename: "/app/example.c", directory: "/app", checksumkind: CSK_MD5, checksum: "b456b90cec5c3705a028b274d88ee970")
-; CHECK: [[META4:![0-9]+]] = !{!0}
-; CHECK: [[META5:![0-9]+]] = !DIFile(filename: "example.c", directory: "/app", checksumkind: CSK_MD5, checksum: "b456b90cec5c3705a028b274d88ee970")
-; CHECK: [[META6:![0-9]+]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+; CHECK: [[META0]] = !DIGlobalVariableExpression(var: [[META1:![0-9]+]], expr: !DIExpression())
+; CHECK: [[META1]] = distinct !DIGlobalVariable(name: "G", scope: [[META2:![0-9]+]], file: [[META5:![0-9]+]], line: 1, type: [[META6:![0-9]+]], isLocal: true, isDefinition: true)
+; CHECK: [[META2]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META3:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: [[META4:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
+; CHECK: [[META3]] = !DIFile(filename: "/app/example.c", directory: {{.*}})
+; CHECK: [[META4]] = !{[[META0]]}
+; CHECK: [[META5]] = !DIFile(filename: "example.c", directory: {{.*}})
+; CHECK: [[META6]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
 ; CHECK: [[META7:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 5}
 ; CHECK: [[META8:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
 ; CHECK: [[META9:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
@@ -91,18 +91,18 @@ declare i32 @speculatable() speculatable readnone
 ; CHECK: [[META11:![0-9]+]] = !{i32 7, !"PIE Level", i32 2}
 ; CHECK: [[META12:![0-9]+]] = !{i32 7, !"uwtable", i32 2}
 ; CHECK: [[META13:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; CHECK: [[META14:![0-9]+]] = !{!"clang version 15.0.0 (https://github.com/llvm/llvm-project.git ef94609d6ebe981767788e6877b0b3b731d425af)"}
-; CHECK: [[DBG15]] = distinct !DISubprogram(name: "dest", scope: !5, file: !5, line: 4, type: !16, scopeLine: 4, spFlags: DISPFlagDefinition, unit: !2, retainedNodes: !18)
-; CHECK: [[META16:![0-9]+]] = !DISubroutineType(types: !17)
-; CHECK: [[META17:![0-9]+]] = !{null}
-; CHECK: [[META18:![0-9]+]] = !{}
-; CHECK: [[DBG19]] = !DILocation(line: 5, column: 9, scope: !15)
-; CHECK: [[DBG20]] = !DILocation(line: 5, column: 5, scope: !15)
-; CHECK: [[DBG21]] = !DILocation(line: 6, column: 1, scope: !15)
-; CHECK: [[DBG22]] = distinct !DISubprogram(name: "src", scope: !5, file: !5, line: 9, type: !16, scopeLine: 9, spFlags: DISPFlagDefinition, unit: !2, retainedNodes: !18)
-; CHECK: [[DBG23]] = !DILocation(line: 10, column: 9, scope: !22)
-; CHECK: [[DBG24]] = !DILocation(line: 10, column: 7, scope: !22)
-; CHECK: [[DBG25]] = !DILocation(line: 11, column: 1, scope: !22)
+; CHECK: [[META14:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; CHECK: [[DBG15]] = distinct !DISubprogram(name: "dest", scope: [[META5]], file: [[META5]], line: 4, type: [[META16:![0-9]+]], scopeLine: 4, spFlags: DISPFlagDefinition, unit: [[META2]], retainedNodes: [[META18:![0-9]+]])
+; CHECK: [[META16]] = !DISubroutineType(types: [[META17:![0-9]+]])
+; CHECK: [[META17]] = !{null}
+; CHECK: [[META18]] = !{}
+; CHECK: [[DBG19]] = !DILocation(line: 5, column: 9, scope: [[DBG15]])
+; CHECK: [[DBG20]] = !DILocation(line: 5, column: 5, scope: [[DBG15]])
+; CHECK: [[DBG21]] = !DILocation(line: 6, column: 1, scope: [[DBG15]])
+; CHECK: [[DBG22]] = distinct !DISubprogram(name: "src", scope: [[META5]], file: [[META5]], line: 9, type: [[META16]], scopeLine: 9, spFlags: DISPFlagDefinition, unit: [[META2]], retainedNodes: [[META18]])
+; CHECK: [[DBG23]] = !DILocation(line: 10, column: 9, scope: [[DBG22]])
+; CHECK: [[DBG24]] = !DILocation(line: 10, column: 7, scope: [[DBG22]])
+; CHECK: [[DBG25]] = !DILocation(line: 11, column: 1, scope: [[DBG22]])
 ;.
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CGSCC: {{.*}}
diff --git a/llvm/test/Transforms/Attributor/value-simplify-dominance.ll b/llvm/test/Transforms/Attributor/value-simplify-dominance.ll
index 7c266ba70a1f..7d95f35b24fd 100644
--- a/llvm/test/Transforms/Attributor/value-simplify-dominance.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-dominance.ll
@@ -179,7 +179,11 @@ define i32 @local_stack_remote_write_and_read() norecurse {
   ret i32 %r
 }
 ;.
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback }
-; CHECK: attributes #[[ATTR1]] = { norecurse nosync }
-; CHECK: attributes #[[ATTR2]] = { norecurse }
+; TUNIT: attributes #[[ATTR0:[0-9]+]] = { nocallback }
+; TUNIT: attributes #[[ATTR1]] = { norecurse nosync }
+; TUNIT: attributes #[[ATTR2]] = { norecurse }
+;.
+; CGSCC: attributes #[[ATTR0:[0-9]+]] = { nocallback }
+; CGSCC: attributes #[[ATTR1]] = { norecurse nosync }
+; CGSCC: attributes #[[ATTR2]] = { norecurse }
 ;.
diff --git a/llvm/test/Transforms/Attributor/value-simplify-gpu.ll b/llvm/test/Transforms/Attributor/value-simplify-gpu.ll
index a7e0a9291209..04ba6e2dc5f9 100644
--- a/llvm/test/Transforms/Attributor/value-simplify-gpu.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-gpu.ll
@@ -11,12 +11,12 @@ target triple = "amdgcn-amd-amdhsa"
 @AS3OneKernelAtATime = internal addrspace(3) global i32 42, align 4
 
 ;.
-; CHECK: @[[REACHABLEKERNEL:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 3, align 4
-; CHECK: @[[UNREACHABLEKERNEL:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 42, align 4
-; CHECK: @[[REACHABLEKERNELAS0:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 7, align 4
-; CHECK: @[[AS3ONEKERNELATATIME:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 42, align 4
-; CHECK: @[[REACHABLENONKERNEL:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 0, align 4
-; CHECK: @[[UNREACHABLENONKERNEL:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 0, align 4
+; CHECK: @ReachableKernel = internal addrspace(3) global i32 3, align 4
+; CHECK: @UnreachableKernel = internal addrspace(3) global i32 42, align 4
+; CHECK: @ReachableKernelAS0 = internal global i32 7, align 4
+; CHECK: @AS3OneKernelAtATime = internal addrspace(3) global i32 42, align 4
+; CHECK: @ReachableNonKernel = internal addrspace(3) global i32 0, align 4
+; CHECK: @UnreachableNonKernel = internal addrspace(3) global i32 0, align 4
 ;.
 define dso_local void @kernel(i32 %C) norecurse "kernel" {
 ; TUNIT: Function Attrs: norecurse nosync nounwind
diff --git a/llvm/test/Transforms/Attributor/value-simplify-instances.ll b/llvm/test/Transforms/Attributor/value-simplify-instances.ll
index b1b907cd6250..d1675fdc9a06 100644
--- a/llvm/test/Transforms/Attributor/value-simplify-instances.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-instances.ll
@@ -8,9 +8,9 @@ declare ptr @geti1Ptr()
 
 ; Make sure we do *not* return true.
 ;.
-; CHECK: @[[G1:[a-zA-Z0-9_$"\\.-]+]] = private global ptr undef
-; CHECK: @[[G2:[a-zA-Z0-9_$"\\.-]+]] = private global ptr undef
-; CHECK: @[[G3:[a-zA-Z0-9_$"\\.-]+]] = private global i1 undef
+; CHECK: @G1 = private global ptr undef
+; CHECK: @G2 = private global ptr undef
+; CHECK: @G3 = private global i1 undef
 ;.
 define internal i1 @recursive_inst_comparator(ptr %a, ptr %b) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info-struct.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info-struct.ll
index 7a427f62f436..b1af38fc8a29 100644
--- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info-struct.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info-struct.ll
@@ -31,7 +31,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 declare void @harmless_use(ptr nocapture readonly) nofree norecurse nosync nounwind readnone willreturn nocallback
 
 ;.
-; CHECK: @[[GLOBALS:[a-zA-Z0-9_$"\\.-]+]] = internal constant [[STRUCT_S:%.*]] { i32 42, double 3.140000e+00, ptr null, i32 0 }, align 8
+; CHECK: @GlobalS = internal constant %struct.S { i32 42, double 3.140000e+00, ptr null, i32 0 }, align 8
 ;.
 define i32 @testOneFieldGlobalS(i32 %cmpx) {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(none)
diff --git a/llvm/test/Transforms/Attributor/value-simplify-reachability.ll b/llvm/test/Transforms/Attributor/value-simplify-reachability.ll
index 18b6601c93f6..2dc2ec203692 100644
--- a/llvm/test/Transforms/Attributor/value-simplify-reachability.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-reachability.ll
@@ -14,12 +14,12 @@ declare void @free(ptr) allockind("free") "alloc-family"="malloc"
 declare noalias ptr @calloc(i64, i64) allockind("alloc,zeroed") allocsize(0, 1) "alloc-family"="malloc"
 
 ;.
-; CHECK: @[[GINT1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4
-; CHECK: @[[GINT2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0, align 4
-; CHECK: @[[GINT3:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4
-; CHECK: @[[GINT4:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0, align 4
-; CHECK: @[[GINT5:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4
-; CHECK: @[[B:[a-zA-Z0-9_$"\\.-]+]] = global i32 0
+; CHECK: @GInt1 = internal global i32 undef, align 4
+; CHECK: @GInt2 = internal global i32 0, align 4
+; CHECK: @GInt3 = internal global i32 undef, align 4
+; CHECK: @GInt4 = internal global i32 0, align 4
+; CHECK: @GInt5 = internal global i32 undef, align 4
+; CHECK: @B = global i32 0
 ;.
 define internal void @write1ToGInt1() {
 ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write)
diff --git a/llvm/test/Transforms/Attributor/value-simplify.ll b/llvm/test/Transforms/Attributor/value-simplify.ll
index 0ee06afe92f2..62d4f63677df 100644
--- a/llvm/test/Transforms/Attributor/value-simplify.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify.ll
@@ -14,14 +14,14 @@ declare ptr @llvm.call.preallocated.arg(token, i32)
 @ConstWeakODRPtr = weak_odr constant i32 0, align 4
 
 ;.
-; CHECK: @[[STR:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1
-; CHECK: @[[CONSTAS3PTR:[a-zA-Z0-9_$"\\.-]+]] = addrspace(3) global i32 0, align 4
-; CHECK: @[[CONSTPTR:[a-zA-Z0-9_$"\\.-]+]] = constant i32 0, align 4
-; CHECK: @[[CONSTWEAKPTR:[a-zA-Z0-9_$"\\.-]+]] = weak constant i32 0, align 4
-; CHECK: @[[CONSTWEAKODRPTR:[a-zA-Z0-9_$"\\.-]+]] = weak_odr constant i32 0, align 4
-; CHECK: @[[S:[a-zA-Z0-9_$"\\.-]+]] = external global [[STRUCT_X:%.*]]
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = internal constant { [2 x ptr] } { [2 x ptr] [ptr @f1, ptr @f2] }
-; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = external global i32
+; CHECK: @str = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1
+; CHECK: @ConstAS3Ptr = addrspace(3) global i32 0, align 4
+; CHECK: @ConstPtr = constant i32 0, align 4
+; CHECK: @ConstWeakPtr = weak constant i32 0, align 4
+; CHECK: @ConstWeakODRPtr = weak_odr constant i32 0, align 4
+; CHECK: @S = external global %struct.X
+; CHECK: @g = internal constant { [2 x ptr] } { [2 x ptr] [ptr @f1, ptr @f2] }
+; CHECK: @x = external global i32
 ;.
 define internal ptr addrspace(3) @const_ptr_return_as3() {
 ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
@@ -654,7 +654,7 @@ define void @fixpoint_changed(ptr %p) {
 ; TUNIT-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
 ; TUNIT:       for.body:
 ; TUNIT-NEXT:    switch i32 [[J_0]], label [[SW_EPILOG]] [
-; TUNIT-NEXT:    i32 1, label [[SW_BB:%.*]]
+; TUNIT-NEXT:      i32 1, label [[SW_BB:%.*]]
 ; TUNIT-NEXT:    ]
 ; TUNIT:       sw.bb:
 ; TUNIT-NEXT:    br label [[SW_EPILOG]]
@@ -677,7 +677,7 @@ define void @fixpoint_changed(ptr %p) {
 ; CGSCC-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
 ; CGSCC:       for.body:
 ; CGSCC-NEXT:    switch i32 [[J_0]], label [[SW_EPILOG]] [
-; CGSCC-NEXT:    i32 1, label [[SW_BB:%.*]]
+; CGSCC-NEXT:      i32 1, label [[SW_BB:%.*]]
 ; CGSCC-NEXT:    ]
 ; CGSCC:       sw.bb:
 ; CGSCC-NEXT:    br label [[SW_EPILOG]]
@@ -1523,19 +1523,19 @@ define i8 @switch(i1 %c1, i1 %c2) {
 ; TUNIT:       m:
 ; TUNIT-NEXT:    [[J:%.*]] = phi i32 [ 0, [[T]] ], [ 4, [[F]] ]
 ; TUNIT-NEXT:    switch i32 [[J]], label [[DEFAULT1:%.*]] [
-; TUNIT-NEXT:    i32 1, label [[DEAD1:%.*]]
-; TUNIT-NEXT:    i32 2, label [[DEAD2:%.*]]
-; TUNIT-NEXT:    i32 3, label [[DEAD3:%.*]]
-; TUNIT-NEXT:    i32 4, label [[ALIVE1:%.*]]
+; TUNIT-NEXT:      i32 1, label [[DEAD1:%.*]]
+; TUNIT-NEXT:      i32 2, label [[DEAD2:%.*]]
+; TUNIT-NEXT:      i32 3, label [[DEAD3:%.*]]
+; TUNIT-NEXT:      i32 4, label [[ALIVE1:%.*]]
 ; TUNIT-NEXT:    ]
 ; TUNIT:       default1:
 ; TUNIT-NEXT:    br label [[ALIVE1]]
 ; TUNIT:       alive1:
 ; TUNIT-NEXT:    [[K:%.*]] = phi i32 [ 1, [[M]] ], [ 4, [[DEFAULT1]] ]
 ; TUNIT-NEXT:    switch i32 [[K]], label [[DEAD4:%.*]] [
-; TUNIT-NEXT:    i32 1, label [[END1:%.*]]
-; TUNIT-NEXT:    i32 2, label [[DEAD5:%.*]]
-; TUNIT-NEXT:    i32 4, label [[END2:%.*]]
+; TUNIT-NEXT:      i32 1, label [[END1:%.*]]
+; TUNIT-NEXT:      i32 2, label [[DEAD5:%.*]]
+; TUNIT-NEXT:      i32 4, label [[END2:%.*]]
 ; TUNIT-NEXT:    ]
 ; TUNIT:       end1:
 ; TUNIT-NEXT:    ret i8 -1
@@ -1564,19 +1564,19 @@ define i8 @switch(i1 %c1, i1 %c2) {
 ; CGSCC:       m:
 ; CGSCC-NEXT:    [[J:%.*]] = phi i32 [ 0, [[T]] ], [ 4, [[F]] ]
 ; CGSCC-NEXT:    switch i32 [[J]], label [[DEFAULT1:%.*]] [
-; CGSCC-NEXT:    i32 1, label [[DEAD1:%.*]]
-; CGSCC-NEXT:    i32 2, label [[DEAD2:%.*]]
-; CGSCC-NEXT:    i32 3, label [[DEAD3:%.*]]
-; CGSCC-NEXT:    i32 4, label [[ALIVE1:%.*]]
+; CGSCC-NEXT:      i32 1, label [[DEAD1:%.*]]
+; CGSCC-NEXT:      i32 2, label [[DEAD2:%.*]]
+; CGSCC-NEXT:      i32 3, label [[DEAD3:%.*]]
+; CGSCC-NEXT:      i32 4, label [[ALIVE1:%.*]]
 ; CGSCC-NEXT:    ]
 ; CGSCC:       default1:
 ; CGSCC-NEXT:    br label [[ALIVE1]]
 ; CGSCC:       alive1:
 ; CGSCC-NEXT:    [[K:%.*]] = phi i32 [ 1, [[M]] ], [ 4, [[DEFAULT1]] ]
 ; CGSCC-NEXT:    switch i32 [[K]], label [[DEAD4:%.*]] [
-; CGSCC-NEXT:    i32 1, label [[END1:%.*]]
-; CGSCC-NEXT:    i32 2, label [[DEAD5:%.*]]
-; CGSCC-NEXT:    i32 4, label [[END2:%.*]]
+; CGSCC-NEXT:      i32 1, label [[END1:%.*]]
+; CGSCC-NEXT:      i32 2, label [[DEAD5:%.*]]
+; CGSCC-NEXT:      i32 4, label [[END2:%.*]]
 ; CGSCC-NEXT:    ]
 ; CGSCC:       end1:
 ; CGSCC-NEXT:    ret i8 -1
@@ -1704,6 +1704,8 @@ define i32 @readWeakOdrConst() {
 ; TUNIT: attributes #[[ATTR15]] = { nosync nounwind memory(read) }
 ; TUNIT: attributes #[[ATTR16]] = { nounwind memory(write) }
 ;.
+; TUNIT: [[RNG0]] = !{i32 0, i32 -2147483648}
+;.
 ; CGSCC: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn }
 ; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
 ; CGSCC: attributes #[[ATTR2]] = { memory(readwrite, argmem: none) }
@@ -1724,5 +1726,3 @@ define i32 @readWeakOdrConst() {
 ; CGSCC: attributes #[[ATTR17]] = { nosync }
 ; CGSCC: attributes #[[ATTR18]] = { nounwind }
 ;.
-; TUNIT: [[RNG0]] = !{i32 0, i32 -2147483648}
-;.
diff --git a/llvm/test/Transforms/Attributor/willreturn.ll b/llvm/test/Transforms/Attributor/willreturn.ll
index 0cd2339de6f8..6588dd066f4c 100644
--- a/llvm/test/Transforms/Attributor/willreturn.ll
+++ b/llvm/test/Transforms/Attributor/willreturn.ll
@@ -212,7 +212,6 @@ define void @mutual_recursion2(i1 %c) #0 {
 ; TEST 5 (negative case)
 ; call exit/abort (has noreturn attribute)
 ; CHECK: Function Attrs: noreturn
-; CHECK-NEXT: declare void @exit(i32) local_unnamed_add
 declare void @exit(i32 %0) local_unnamed_addr noreturn
 
 define void @only_exit() local_unnamed_addr #0 {
@@ -278,7 +277,6 @@ define void @conditional_exit(i32 %0, ptr nocapture readonly %1) local_unnamed_a
 ; TEST 6 (positive case)
 ; Call intrinsic function
 ; CHECK: Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-; CHECK-NEXT: declare float @llvm.floor.f32(float)
 declare float @llvm.floor.f32(float)
 
 define void @call_floor(float %a) #0 {
@@ -308,7 +306,6 @@ define float @call_floor2(float %a) #0 {
 
 ; CHECK: Function Attrs: noinline nounwind uwtable
 ; CHECK-NOT: willreturn
-; CHECK-NEXT: declare void @maybe_noreturn()
 declare void @maybe_noreturn() #0
 
 define void @call_maybe_noreturn() #0 {
@@ -327,7 +324,6 @@ define void @call_maybe_noreturn() #0 {
 ; Check propagation.
 
 ; CHECK: Function Attrs: norecurse willreturn
-; CHECK-NEXT: declare void @will_return()
 declare void @will_return() willreturn norecurse
 
 define void @f1() #0 {
@@ -380,7 +376,6 @@ label2:
 ; invoke a function with willreturn
 
 ; CHECK: Function Attrs: noinline willreturn uwtable
-; CHECK-NEXT: declare i1 @maybe_raise_exception()
 declare i1 @maybe_raise_exception() #1 willreturn
 
 define void @invoke_test() personality ptr @__gxx_personality_v0 {
@@ -388,12 +383,12 @@ define void @invoke_test() personality ptr @__gxx_personality_v0 {
 ; CHECK-LABEL: define {{[^@]+}}@invoke_test
 ; CHECK-SAME: () #[[ATTR12:[0-9]+]] personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = invoke i1 @maybe_raise_exception() #[[ATTR32]]
-; CHECK-NEXT:    to label [[N:%.*]] unwind label [[F:%.*]]
+; CHECK-NEXT:            to label [[N:%.*]] unwind label [[F:%.*]]
 ; CHECK:       N:
 ; CHECK-NEXT:    ret void
 ; CHECK:       F:
 ; CHECK-NEXT:    [[VAL:%.*]] = landingpad { ptr, i32 }
-; CHECK-NEXT:    catch ptr null
+; CHECK-NEXT:            catch ptr null
 ; CHECK-NEXT:    ret void
 ;
   invoke i1 @maybe_raise_exception()
@@ -685,7 +680,6 @@ unreachable_label:
 }
 
 ; CHECK: Function Attrs: noreturn nounwind
-; CHECK-NEXT: declare void @llvm.eh.sjlj.longjmp(ptr)
 declare void @llvm.eh.sjlj.longjmp(ptr)
 
 define void @call_longjmp(ptr nocapture readnone %0) local_unnamed_addr #0 {
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/sitofp.ll b/llvm/test/Transforms/CorrelatedValuePropagation/sitofp.ll
new file mode 100644
index 000000000000..83533290e2f6
--- /dev/null
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/sitofp.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=correlated-propagation -S | FileCheck %s
+
+declare void @use.f32(float)
+
+define void @test1(i32 %n) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N:%.*]], -1
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB:%.*]], label [[EXIT:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[EXT_WIDE:%.*]] = uitofp nneg i32 [[N]] to float
+; CHECK-NEXT:    call void @use.f32(float [[EXT_WIDE]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp sgt i32 %n, -1
+  br i1 %cmp, label %bb, label %exit
+
+bb:
+  %ext.wide = sitofp i32 %n to float
+  call void @use.f32(float %ext.wide)
+  br label %exit
+
+exit:
+  ret void
+}
+
+
+define void @test2_fail(i32 %n) {
+; CHECK-LABEL: @test2_fail(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N:%.*]], -2
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB:%.*]], label [[EXIT:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[EXT_WIDE:%.*]] = sitofp i32 [[N]] to float
+; CHECK-NEXT:    call void @use.f32(float [[EXT_WIDE]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp sgt i32 %n, -2
+  br i1 %cmp, label %bb, label %exit
+
+bb:
+  %ext.wide = sitofp i32 %n to float
+  call void @use.f32(float %ext.wide)
+  br label %exit
+
+exit:
+  ret void
+}
+
+define float @may_including_undef(i1 %c.1, i1 %c.2) {
+; CHECK-LABEL: @may_including_undef(
+; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[TRUE_1:%.*]], label [[FALSE:%.*]]
+; CHECK:       true.1:
+; CHECK-NEXT:    br i1 [[C_2:%.*]], label [[TRUE_2:%.*]], label [[EXIT:%.*]]
+; CHECK:       true.2:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       false:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[P:%.*]] = phi i32 [ 0, [[TRUE_1]] ], [ 1, [[TRUE_2]] ], [ undef, [[FALSE]] ]
+; CHECK-NEXT:    [[EXT:%.*]] = sitofp i32 [[P]] to float
+; CHECK-NEXT:    ret float [[EXT]]
+;
+  br i1 %c.1, label %true.1, label %false
+
+true.1:
+  br i1 %c.2, label %true.2, label %exit
+
+true.2:
+  br label %exit
+
+false:
+  br label %exit
+
+exit:
+  %p = phi i32 [ 0, %true.1 ], [ 1, %true.2], [ undef, %false ]
+  %ext = sitofp i32 %p to float
+  ret float %ext
+}
+
+define double @test_infer_at_use(i32 noundef %n) {
+; CHECK-LABEL: @test_infer_at_use(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N:%.*]], -1
+; CHECK-NEXT:    [[EXT:%.*]] = uitofp nneg i32 [[N]] to double
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[CMP]], double [[EXT]], double 0.000000e+00
+; CHECK-NEXT:    ret double [[SELECT]]
+;
+  %cmp = icmp sgt i32 %n, -1
+  %ext = sitofp i32 %n to double
+  %select = select i1 %cmp, double %ext, double 0.0
+  ret double %select
+}
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/uitofp.ll b/llvm/test/Transforms/CorrelatedValuePropagation/uitofp.ll
new file mode 100644
index 000000000000..32d0f5b4d338
--- /dev/null
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/uitofp.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=correlated-propagation -S | FileCheck %s
+
+declare void @use.f32(float)
+
+define void @test1(i32 %n) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N:%.*]], -1
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB:%.*]], label [[EXIT:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[EXT_WIDE:%.*]] = uitofp nneg i32 [[N]] to float
+; CHECK-NEXT:    call void @use.f32(float [[EXT_WIDE]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp sgt i32 %n, -1
+  br i1 %cmp, label %bb, label %exit
+
+bb:
+  %ext.wide = uitofp i32 %n to float
+  call void @use.f32(float %ext.wide)
+  br label %exit
+
+exit:
+  ret void
+}
+
+define void @test2_fail(i32 %n) {
+; CHECK-LABEL: @test2_fail(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N:%.*]], -2
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB:%.*]], label [[EXIT:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[EXT_WIDE:%.*]] = uitofp i32 [[N]] to float
+; CHECK-NEXT:    call void @use.f32(float [[EXT_WIDE]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp sgt i32 %n, -2
+  br i1 %cmp, label %bb, label %exit
+
+bb:
+  %ext.wide = uitofp i32 %n to float
+  call void @use.f32(float %ext.wide)
+  br label %exit
+
+exit:
+  ret void
+}
+
+define float @may_including_undef(i1 %c.1, i1 %c.2) {
+; CHECK-LABEL: @may_including_undef(
+; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[TRUE_1:%.*]], label [[FALSE:%.*]]
+; CHECK:       true.1:
+; CHECK-NEXT:    br i1 [[C_2:%.*]], label [[TRUE_2:%.*]], label [[EXIT:%.*]]
+; CHECK:       true.2:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       false:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[P:%.*]] = phi i32 [ 0, [[TRUE_1]] ], [ 1, [[TRUE_2]] ], [ undef, [[FALSE]] ]
+; CHECK-NEXT:    [[EXT:%.*]] = uitofp i32 [[P]] to float
+; CHECK-NEXT:    ret float [[EXT]]
+;
+  br i1 %c.1, label %true.1, label %false
+
+true.1:
+  br i1 %c.2, label %true.2, label %exit
+
+true.2:
+  br label %exit
+
+false:
+  br label %exit
+
+exit:
+  %p = phi i32 [ 0, %true.1 ], [ 1, %true.2], [ undef, %false ]
+  %ext = uitofp i32 %p to float
+  ret float %ext
+}
+
+define double @test_infer_at_use(i32 noundef %n) {
+; CHECK-LABEL: @test_infer_at_use(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N:%.*]], -1
+; CHECK-NEXT:    [[EXT:%.*]] = uitofp nneg i32 [[N]] to double
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[CMP]], double [[EXT]], double 0.000000e+00
+; CHECK-NEXT:    ret double [[SELECT]]
+;
+  %cmp = icmp sgt i32 %n, -1
+  %ext = uitofp i32 %n to double
+  %select = select i1 %cmp, double %ext, double 0.0
+  ret double %select
+}
diff --git a/llvm/test/Transforms/FunctionAttrs/nocapture.ll b/llvm/test/Transforms/FunctionAttrs/nocapture.ll
index 3d483f671b1a..8d6f6a7c73f8 100644
--- a/llvm/test/Transforms/FunctionAttrs/nocapture.ll
+++ b/llvm/test/Transforms/FunctionAttrs/nocapture.ll
@@ -197,7 +197,7 @@ declare i32 @__gxx_personality_v0(...)
 
 define ptr @lookup_bit(ptr %q, i32 %bitno) readnone nounwind {
 ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
-; FNATTRS-LABEL: define nonnull ptr @lookup_bit
+; FNATTRS-LABEL: define ptr @lookup_bit
 ; FNATTRS-SAME: (ptr [[Q:%.*]], i32 [[BITNO:%.*]]) #[[ATTR0]] {
 ; FNATTRS-NEXT:    [[TMP:%.*]] = ptrtoint ptr [[Q]] to i32
 ; FNATTRS-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP]], [[BITNO]]
diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll
index d9bdb6298ed0..ec5545b969e5 100644
--- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll
+++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll
@@ -905,26 +905,26 @@ define i1 @parent8(ptr %a, ptr %bogus1, ptr %b) personality ptr @esfp{
 ; FNATTRS-SAME: ptr nonnull [[A:%.*]], ptr nocapture readnone [[BOGUS1:%.*]], ptr nonnull [[B:%.*]]) #[[ATTR7]] personality ptr @esfp {
 ; FNATTRS-NEXT:  entry:
 ; FNATTRS-NEXT:    invoke void @use2nonnull(ptr [[A]], ptr [[B]])
-; FNATTRS-NEXT:    to label [[CONT:%.*]] unwind label [[EXC:%.*]]
+; FNATTRS-NEXT:            to label [[CONT:%.*]] unwind label [[EXC:%.*]]
 ; FNATTRS:       cont:
 ; FNATTRS-NEXT:    [[NULL_CHECK:%.*]] = icmp eq ptr [[B]], null
 ; FNATTRS-NEXT:    ret i1 [[NULL_CHECK]]
 ; FNATTRS:       exc:
 ; FNATTRS-NEXT:    [[LP:%.*]] = landingpad { ptr, i32 }
-; FNATTRS-NEXT:    filter [0 x ptr] zeroinitializer
+; FNATTRS-NEXT:            filter [0 x ptr] zeroinitializer
 ; FNATTRS-NEXT:    unreachable
 ;
 ; ATTRIBUTOR-LABEL: define i1 @parent8(
 ; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], ptr nocapture nofree readnone [[BOGUS1:%.*]], ptr nonnull [[B:%.*]]) #[[ATTR8]] personality ptr @esfp {
 ; ATTRIBUTOR-NEXT:  entry:
 ; ATTRIBUTOR-NEXT:    invoke void @use2nonnull(ptr nonnull [[A]], ptr nonnull [[B]])
-; ATTRIBUTOR-NEXT:    to label [[CONT:%.*]] unwind label [[EXC:%.*]]
+; ATTRIBUTOR-NEXT:            to label [[CONT:%.*]] unwind label [[EXC:%.*]]
 ; ATTRIBUTOR:       cont:
 ; ATTRIBUTOR-NEXT:    [[NULL_CHECK:%.*]] = icmp eq ptr [[B]], null
 ; ATTRIBUTOR-NEXT:    ret i1 [[NULL_CHECK]]
 ; ATTRIBUTOR:       exc:
 ; ATTRIBUTOR-NEXT:    [[LP:%.*]] = landingpad { ptr, i32 }
-; ATTRIBUTOR-NEXT:    filter [0 x ptr] zeroinitializer
+; ATTRIBUTOR-NEXT:            filter [0 x ptr] zeroinitializer
 ; ATTRIBUTOR-NEXT:    unreachable
 ;
 
@@ -1415,5 +1415,20 @@ define void @PR43833_simple(ptr %0, i32 %1) {
   br i1 %11, label %7, label %8
 }
 
+define ptr @pr91177_non_inbounds_gep(ptr nonnull %arg) {
+; FNATTRS-LABEL: define ptr @pr91177_non_inbounds_gep(
+; FNATTRS-SAME: ptr nonnull readnone [[ARG:%.*]]) #[[ATTR0]] {
+; FNATTRS-NEXT:    [[RES:%.*]] = getelementptr i8, ptr [[ARG]], i64 -8
+; FNATTRS-NEXT:    ret ptr [[RES]]
+;
+; ATTRIBUTOR-LABEL: define ptr @pr91177_non_inbounds_gep(
+; ATTRIBUTOR-SAME: ptr nofree nonnull readnone [[ARG:%.*]]) #[[ATTR0]] {
+; ATTRIBUTOR-NEXT:    [[RES:%.*]] = getelementptr i8, ptr [[ARG]], i64 -8
+; ATTRIBUTOR-NEXT:    ret ptr [[RES]]
+;
+  %res = getelementptr i8, ptr %arg, i64 -8
+  ret ptr %res
+}
+
 attributes #0 = { null_pointer_is_valid }
 attributes #1 = { nounwind willreturn}
diff --git a/llvm/test/Transforms/GVNSink/int_sideeffect.ll b/llvm/test/Transforms/GVNSink/int_sideeffect.ll
index 3cc54e84f17c..9a3bc062dd94 100644
--- a/llvm/test/Transforms/GVNSink/int_sideeffect.ll
+++ b/llvm/test/Transforms/GVNSink/int_sideeffect.ll
@@ -28,3 +28,29 @@ if.end:
   ret float %phi
 }
 
+; CHECK-LABEL: scalarsSinkingReverse
+; CHECK-NOT: fmul
+; CHECK: = phi
+; CHECK: = fmul
+define float @scalarsSinkingReverse(float %d, float %m, float %a, i1 %cmp) {
+; This test is just a reverse(graph mirror) of the test
+; above to ensure GVNSink doesn't depend on the order of branches.
+entry:
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %add = fadd float %m, %a
+  %mul1 = fmul float %add, %d
+  br label %if.end
+
+if.else:
+  call void @llvm.sideeffect()
+  %sub = fsub float %m, %a
+  %mul0 = fmul float %sub, %d
+  br label %if.end
+
+if.end:
+  %phi = phi float [ %mul1, %if.then ], [ %mul0, %if.else ]
+  ret float %phi
+}
+
diff --git a/llvm/test/Transforms/HipStdPar/global-var.ll b/llvm/test/Transforms/HipStdPar/global-var.ll
new file mode 100644
index 000000000000..860c30e4a464
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/global-var.ll
@@ -0,0 +1,12 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=hipstdpar-select-accelerator-code \
+; RUN: %s | FileCheck %s
+
+; CHECK: @var = extern_weak addrspace(1) externally_initialized global i32, align 4
+@var = addrspace(1) global i32 0, align 4
+
+define amdgpu_kernel void @kernel() {
+entry:
+  store i32 1, ptr addrspace(1) @var, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/IndVarSimplify/preserving-debugloc.ll b/llvm/test/Transforms/IndVarSimplify/preserving-debugloc.ll
new file mode 100644
index 000000000000..7d23c8697efa
--- /dev/null
+++ b/llvm/test/Transforms/IndVarSimplify/preserving-debugloc.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -passes=indvars -S | FileCheck %s
+
+; This testcase checks the preservation of debug locations of newly created 
+; phi, sitofp, add and icmp instructions in IndVarSimplify Pass.
+
+define void @test1() !dbg !5 {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[BB:%.*]], !dbg
+; CHECK:  bb:
+; CHECK:    [[IV_INT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[DOTINT:%.*]], [[BB]] ], !dbg ![[DBG1:[0-9]+]]
+; CHECK:    [[INDVAR_CONV:%.*]] = sitofp i32 [[IV_INT]] to double, !dbg ![[DBG1]]
+; CHECK:    [[DOTINT]] = add nuw nsw i32 [[IV_INT]], 1, !dbg ![[DBG2:[0-9]+]]
+; CHECK:    [[TMP1:%.*]] = icmp ult i32 [[DOTINT]], 10000, !dbg ![[DBG3:[0-9]+]]
+; CHECK: ![[DBG1]] = !DILocation(line: 2
+; CHECK: ![[DBG2]] = !DILocation(line: 4
+; CHECK: ![[DBG3]] = !DILocation(line: 5
+;
+entry:
+  br label %bb, !dbg !16
+
+bb:                                               ; preds = %bb, %entry
+  %iv = phi double [ 0.000000e+00, %entry ], [ %1, %bb ], !dbg !17
+  %0 = tail call i32 @foo(double %iv), !dbg !18
+  %1 = fadd double %iv, 1.000000e+00, !dbg !19
+  %2 = fcmp olt double %1, 1.000000e+04, !dbg !20
+  br i1 %2, label %bb, label %return, !dbg !21
+
+return:                                           ; preds = %bb
+  ret void, !dbg !22
+}
+
+declare i32 @foo(double)
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "indvars-preserving.ll", directory: "/")
+!2 = !{i32 7}
+!3 = !{i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "test1", linkageName: "test1", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !{!9, !11, !13, !14}
+!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 2, type: !10)
+!10 = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned)
+!11 = !DILocalVariable(name: "2", scope: !5, file: !1, line: 3, type: !12)
+!12 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+!13 = !DILocalVariable(name: "3", scope: !5, file: !1, line: 4, type: !10)
+!14 = !DILocalVariable(name: "4", scope: !5, file: !1, line: 5, type: !15)
+!15 = !DIBasicType(name: "ty8", size: 8, encoding: DW_ATE_unsigned)
+!16 = !DILocation(line: 1, column: 1, scope: !5)
+!17 = !DILocation(line: 2, column: 1, scope: !5)
+!18 = !DILocation(line: 3, column: 1, scope: !5)
+!19 = !DILocation(line: 4, column: 1, scope: !5)
+!20 = !DILocation(line: 5, column: 1, scope: !5)
+!21 = !DILocation(line: 6, column: 1, scope: !5)
+!22 = !DILocation(line: 7, column: 1, scope: !5)
diff --git a/llvm/test/Transforms/IndVarSimplify/trip-count-expansion-loop-guard-preserve-nsw.ll b/llvm/test/Transforms/IndVarSimplify/trip-count-expansion-loop-guard-preserve-nsw.ll
new file mode 100644
index 000000000000..f86639ea4c50
--- /dev/null
+++ b/llvm/test/Transforms/IndVarSimplify/trip-count-expansion-loop-guard-preserve-nsw.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=indvars -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"
+
+define void @rewrite_preserve_add_nsw(i32 %a) {
+; CHECK-LABEL: define void @rewrite_preserve_add_nsw(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[A]], 4
+; CHECK-NEXT:    call void @use(i32 noundef [[ADD]])
+; CHECK-NEXT:    [[PRE:%.*]] = icmp sgt i32 [[A]], -4
+; CHECK-NEXT:    br i1 [[PRE]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       loop.preheader:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[ADD]], i32 0)
+; CHECK-NEXT:    [[TMP0:%.*]] = add nuw i32 [[SMAX]], 1
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    call void @clobber()
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i32 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[EC]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %add = add nsw i32 %a, 4
+  call void @use(i32 noundef %add)
+  %pre = icmp sgt i32 %a, -4
+  br i1 %pre, label %loop, label %exit
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  call void @clobber()
+  %iv.next = add i32 %iv, 1
+  %ec = icmp slt i32 %iv, %add
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+declare void @clobber()
+declare void @use(i32)
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
index 9ea2db86d7f3..e6f26aeb98b1 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
@@ -183,10 +183,10 @@ entry:
 }
 
 ; CHECK-LABEL: @atomicrmw_add_global_to_flat_preserve_amdgpu_md(
-; CHECK-NEXT: %ret = atomicrmw add ptr addrspace(1) %global.ptr, i32 %y seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+; CHECK-NEXT: %ret = atomicrmw add ptr addrspace(1) %global.ptr, i32 %y seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
 define i32 @atomicrmw_add_global_to_flat_preserve_amdgpu_md(ptr addrspace(1) %global.ptr, i32 %y) #0 {
   %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr
-  %ret = atomicrmw add ptr %cast, i32 %y seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %ret = atomicrmw add ptr %cast, i32 %y seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %ret
 }
 
diff --git a/llvm/test/Transforms/Inline/AArch64/binop.ll b/llvm/test/Transforms/Inline/AArch64/binop.ll
index eb882282820b..3dd66689a257 100644
--- a/llvm/test/Transforms/Inline/AArch64/binop.ll
+++ b/llvm/test/Transforms/Inline/AArch64/binop.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=inline -mtriple=aarch64--linux-gnu -S -o - < %s -inline-threshold=0 | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -7,20 +8,35 @@ declare void @pad()
 @glbl = external global i32
 
 define i32 @outer_add1(i32 %a) {
-; CHECK-LABEL: @outer_add1(
-; CHECK-NOT: call i32 @add
+; CHECK-LABEL: define i32 @outer_add1(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 [[A]]
+;
   %C = call i32 @add(i32 %a, i32 0)
   ret i32 %C
 }
 
 define i32 @outer_add2(i32 %a) {
-; CHECK-LABEL: @outer_add2(
-; CHECK-NOT: call i32 @add
+; CHECK-LABEL: define i32 @outer_add2(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 [[A]]
+;
   %C = call i32 @add(i32 0, i32 %a)
   ret i32 %C
 }
 
 define i32 @add(i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @add(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A]], [[B]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
   %add = add i32 %a, %b
   call void @pad()
   store i32 0, ptr @glbl
@@ -30,13 +46,24 @@ define i32 @add(i32 %a, i32 %b) {
 
 
 define i32 @outer_sub1(i32 %a) {
-; CHECK-LABEL: @outer_sub1(
-; CHECK-NOT: call i32 @sub1
+; CHECK-LABEL: define i32 @outer_sub1(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 [[A]]
+;
   %C = call i32 @sub1(i32 %a, i32 0)
   ret i32 %C
 }
 
 define i32 @sub1(i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @sub1(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[A]], [[B]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 [[SUB]]
+;
   %sub = sub i32 %a, %b
   call void @pad()
   store i32 0, ptr @glbl
@@ -45,13 +72,22 @@ define i32 @sub1(i32 %a, i32 %b) {
 
 
 define i32 @outer_sub2(i32 %a) {
-; CHECK-LABEL: @outer_sub2(
-; CHECK-NOT: call i32 @sub2
+; CHECK-LABEL: define i32 @outer_sub2(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    ret i32 0
+;
   %C = call i32 @sub2(i32 %a)
   ret i32 %C
 }
 
 define i32 @sub2(i32 %a) {
+; CHECK-LABEL: define i32 @sub2(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[A]], [[A]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    ret i32 [[SUB]]
+;
   %sub = sub i32 %a, %a
   call void @pad()
   ret i32 %sub
@@ -60,20 +96,35 @@ define i32 @sub2(i32 %a) {
 
 
 define i32 @outer_mul1(i32 %a) {
-; CHECK-LABEL: @outer_mul1(
-; CHECK-NOT: call i32 @mul
+; CHECK-LABEL: define i32 @outer_mul1(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 0
+;
   %C = call i32 @mul(i32 %a, i32 0)
   ret i32 %C
 }
 
 define i32 @outer_mul2(i32 %a) {
-; CHECK-LABEL: @outer_mul2(
-; CHECK-NOT: call i32 @mul
+; CHECK-LABEL: define i32 @outer_mul2(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 [[A]]
+;
   %C = call i32 @mul(i32 %a, i32 1)
   ret i32 %C
 }
 
 define i32 @mul(i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @mul(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[A]], [[B]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
   %mul = mul i32 %a, %b
   call void @pad()
   store i32 0, ptr @glbl
@@ -83,20 +134,35 @@ define i32 @mul(i32 %a, i32 %b) {
 
 
 define i32 @outer_div1(i32 %a) {
-; CHECK-LABEL: @outer_div1(
-; CHECK-NOT: call i32 @div1
+; CHECK-LABEL: define i32 @outer_div1(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 0
+;
   %C = call i32 @div1(i32 0, i32 %a)
   ret i32 %C
 }
 
 define i32 @outer_div2(i32 %a) {
-; CHECK-LABEL: @outer_div2(
-; CHECK-NOT: call i32 @div1
+; CHECK-LABEL: define i32 @outer_div2(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 [[A]]
+;
   %C = call i32 @div1(i32 %a, i32 1)
   ret i32 %C
 }
 
 define i32 @div1(i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @div1(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[A]], [[B]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 [[DIV]]
+;
   %div = sdiv i32 %a, %b
   call void @pad()
   store i32 0, ptr @glbl
@@ -105,13 +171,22 @@ define i32 @div1(i32 %a, i32 %b) {
 
 
 define i32 @outer_div3(i32 %a) {
-; CHECK-LABEL: @outer_div3(
-; CHECK-NOT: call i32 @div
+; CHECK-LABEL: define i32 @outer_div3(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    ret i32 1
+;
   %C = call i32 @div2(i32 %a)
   ret i32 %C
 }
 
 define i32 @div2(i32 %a) {
+; CHECK-LABEL: define i32 @div2(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[A]], [[A]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    ret i32 [[DIV]]
+;
   %div = sdiv i32 %a, %a
   call void @pad()
   ret i32 %div
@@ -120,20 +195,35 @@ define i32 @div2(i32 %a) {
 
 
 define i32 @outer_rem1(i32 %a) {
-; CHECK-LABEL: @outer_rem1(
-; CHECK-NOT: call i32 @rem
+; CHECK-LABEL: define i32 @outer_rem1(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 0
+;
   %C = call i32 @rem1(i32 0, i32 %a)
   ret i32 %C
 }
 
 define i32 @outer_rem2(i32 %a) {
-; CHECK-LABEL: @outer_rem2(
-; CHECK-NOT: call i32 @rem
+; CHECK-LABEL: define i32 @outer_rem2(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 0
+;
   %C = call i32 @rem1(i32 %a, i32 1)
   ret i32 %C
 }
 
 define i32 @rem1(i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @rem1(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[A]], [[B]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 [[REM]]
+;
   %rem = urem i32 %a, %b
   call void @pad()
   store i32 0, ptr @glbl
@@ -142,13 +232,22 @@ define i32 @rem1(i32 %a, i32 %b) {
 
 
 define i32 @outer_rem3(i32 %a) {
-; CHECK-LABEL: @outer_rem3(
-; CHECK-NOT: call i32 @rem
+; CHECK-LABEL: define i32 @outer_rem3(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    ret i32 0
+;
   %C = call i32 @rem2(i32 %a)
   ret i32 %C
 }
 
 define i32 @rem2(i32 %a) {
+; CHECK-LABEL: define i32 @rem2(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[A]], [[A]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    ret i32 [[REM]]
+;
   %rem = urem i32 %a, %a
   call void @pad()
   ret i32 %rem
@@ -157,13 +256,24 @@ define i32 @rem2(i32 %a) {
 
 
 define i32 @outer_shl1(i32 %a) {
-; CHECK-LABEL: @outer_shl1(
-; CHECK-NOT: call i32 @shl
+; CHECK-LABEL: define i32 @outer_shl1(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 [[A]]
+;
   %C = call i32 @shl(i32 %a, i32 0)
   ret i32 %C
 }
 
 define i32 @shl(i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @shl(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A]], [[B]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 [[SHL]]
+;
   %shl = shl i32 %a, %b
   call void @pad()
   store i32 0, ptr @glbl
@@ -173,13 +283,24 @@ define i32 @shl(i32 %a, i32 %b) {
 
 
 define i32 @outer_shr1(i32 %a) {
-; CHECK-LABEL: @outer_shr1(
-; CHECK-NOT: call i32 @shr
+; CHECK-LABEL: define i32 @outer_shr1(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 [[A]]
+;
   %C = call i32 @shr(i32 %a, i32 0)
   ret i32 %C
 }
 
 define i32 @shr(i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @shr(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[A]], [[B]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i32 [[SHR]]
+;
   %shr = ashr i32 %a, %b
   call void @pad()
   store i32 0, ptr @glbl
@@ -189,20 +310,35 @@ define i32 @shr(i32 %a, i32 %b) {
 
 
 define i1 @outer_and1(i1 %a) {
-; check-label: @outer_and1(
-; check-not: call i1 @and1
+; CHECK-LABEL: define i1 @outer_and1(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i1 false
+;
   %c = call i1 @and1(i1 %a, i1 false)
   ret i1 %c
 }
 
 define i1 @outer_and2(i1 %a) {
-; check-label: @outer_and2(
-; check-not: call i1 @and1
+; CHECK-LABEL: define i1 @outer_and2(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i1 [[A]]
+;
   %c = call i1 @and1(i1 %a, i1 true)
   ret i1 %c
 }
 
 define i1 @and1(i1 %a, i1 %b) {
+; CHECK-LABEL: define i1 @and1(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) {
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[A]], [[B]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i1 [[AND]]
+;
   %and = and i1 %a, %b
   call void @pad()
   store i32 0, ptr @glbl
@@ -211,13 +347,22 @@ define i1 @and1(i1 %a, i1 %b) {
 
 
 define i1 @outer_and3(i1 %a) {
-; check-label: @outer_and3(
-; check-not: call i1 @and2
+; CHECK-LABEL: define i1 @outer_and3(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    ret i1 [[A]]
+;
   %c = call i1 @and2(i1 %a)
   ret i1 %c
 }
 
 define i1 @and2(i1 %a) {
+; CHECK-LABEL: define i1 @and2(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[A]], [[A]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    ret i1 [[AND]]
+;
   %and = and i1 %a, %a
   call void @pad()
   ret i1 %and
@@ -226,20 +371,35 @@ define i1 @and2(i1 %a) {
 
 
 define i1 @outer_or1(i1 %a) {
-; check-label: @outer_or1(
-; check-not: call i1 @or1
+; CHECK-LABEL: define i1 @outer_or1(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i1 [[A]]
+;
   %c = call i1 @or1(i1 %a, i1 false)
   ret i1 %c
 }
 
 define i1 @outer_or2(i1 %a) {
-; check-label: @outer_or2(
-; check-not: call i1 @or1
+; CHECK-LABEL: define i1 @outer_or2(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i1 true
+;
   %c = call i1 @or1(i1 %a, i1 true)
   ret i1 %c
 }
 
 define i1 @or1(i1 %a, i1 %b) {
+; CHECK-LABEL: define i1 @or1(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) {
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[A]], [[B]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i1 [[OR]]
+;
   %or = or i1 %a, %b
   call void @pad()
   store i32 0, ptr @glbl
@@ -248,13 +408,22 @@ define i1 @or1(i1 %a, i1 %b) {
 
 
 define i1 @outer_or3(i1 %a) {
-; check-label: @outer_or3(
-; check-not: call i1 @or2
+; CHECK-LABEL: define i1 @outer_or3(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    ret i1 [[A]]
+;
   %c = call i1 @or2(i1 %a)
   ret i1 %c
 }
 
 define i1 @or2(i1 %a) {
+; CHECK-LABEL: define i1 @or2(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[A]], [[A]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    ret i1 [[OR]]
+;
   %or = or i1 %a, %a
   call void @pad()
   ret i1 %or
@@ -263,13 +432,24 @@ define i1 @or2(i1 %a) {
 
 
 define i1 @outer_xor1(i1 %a) {
-; check-label: @outer_xor1(
-; check-not: call i1 @xor
+; CHECK-LABEL: define i1 @outer_xor1(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i1 [[A]]
+;
   %c = call i1 @xor1(i1 %a, i1 false)
   ret i1 %c
 }
 
 define i1 @xor1(i1 %a, i1 %b) {
+; CHECK-LABEL: define i1 @xor1(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) {
+; CHECK-NEXT:    [[XOR:%.*]] = xor i1 [[A]], [[B]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    store i32 0, ptr @glbl, align 4
+; CHECK-NEXT:    ret i1 [[XOR]]
+;
   %xor = xor i1 %a, %b
   call void @pad()
   store i32 0, ptr @glbl
@@ -278,13 +458,22 @@ define i1 @xor1(i1 %a, i1 %b) {
 
 
 define i1 @outer_xor3(i1 %a) {
-; check-label: @outer_xor3(
-; check-not: call i1 @xor
+; CHECK-LABEL: define i1 @outer_xor3(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    ret i1 false
+;
   %c = call i1 @xor2(i1 %a)
   ret i1 %c
 }
 
 define i1 @xor2(i1 %a) {
+; CHECK-LABEL: define i1 @xor2(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT:    [[XOR:%.*]] = xor i1 [[A]], [[A]]
+; CHECK-NEXT:    call void @pad()
+; CHECK-NEXT:    ret i1 [[XOR]]
+;
   %xor = xor i1 %a, %a
   call void @pad()
   ret i1 %xor
diff --git a/llvm/test/Transforms/Inline/AMDGPU/inline-atomicrmw-md-preserve.ll b/llvm/test/Transforms/Inline/AMDGPU/inline-atomicrmw-md-preserve.ll
index ec7edd277dd7..569cc91a867c 100644
--- a/llvm/test/Transforms/Inline/AMDGPU/inline-atomicrmw-md-preserve.ll
+++ b/llvm/test/Transforms/Inline/AMDGPU/inline-atomicrmw-md-preserve.ll
@@ -7,17 +7,17 @@
 define i32 @atomic_xor(ptr addrspace(1) %ptr, i32 %val) {
 ; CHECK-LABEL: define i32 @atomic_xor(
 ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VAL:%.*]]) {
-; CHECK-NEXT:    [[RES:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VAL]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.no.remote.memory.access [[META0]]
+; CHECK-NEXT:    [[RES:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VAL]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.no.remote.memory [[META0]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
-  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %val monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+  %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %val monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret i32 %res
 }
 
 define i32 @caller(ptr addrspace(1) %ptr, i32 %val) {
 ; CHECK-LABEL: define i32 @caller(
 ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VAL:%.*]]) {
-; CHECK-NEXT:    [[RES_I:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VAL]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; CHECK-NEXT:    [[RES_I:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VAL]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; CHECK-NEXT:    ret i32 [[RES_I]]
 ;
   %res = call i32 @atomic_xor(ptr addrspace(1) %ptr, i32 %val)
diff --git a/llvm/test/Transforms/Inline/access-attributes-prop.ll b/llvm/test/Transforms/Inline/access-attributes-prop.ll
index 3b4a59897c56..ffd31fbe8ae1 100644
--- a/llvm/test/Transforms/Inline/access-attributes-prop.ll
+++ b/llvm/test/Transforms/Inline/access-attributes-prop.ll
@@ -5,7 +5,7 @@
 
 declare void @bar1(ptr %p)
 declare void @bar2(ptr %p, ptr %p2)
-
+declare void @bar3(ptr writable %p)
 define dso_local void @foo1_rdonly(ptr readonly %p) {
 ; CHECK-LABEL: define {{[^@]+}}@foo1_rdonly
 ; CHECK-SAME: (ptr readonly [[P:%.*]]) {
@@ -26,6 +26,27 @@ define dso_local void @foo1(ptr %p) {
   ret void
 }
 
+define dso_local void @foo1_writable(ptr %p) {
+; CHECK-LABEL: define {{[^@]+}}@foo1_writable
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    call void @bar1(ptr writable [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @bar1(ptr writable %p)
+  ret void
+}
+
+define dso_local void @foo3_writable(ptr %p) {
+; CHECK-LABEL: define {{[^@]+}}@foo3_writable
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    call void @bar3(ptr [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @bar3(ptr %p)
+  ret void
+}
+
+
 define dso_local void @foo1_bar_aligned64_deref512(ptr %p) {
 ; CHECK-LABEL: define {{[^@]+}}@foo1_bar_aligned64_deref512
 ; CHECK-SAME: (ptr [[P:%.*]]) {
@@ -168,7 +189,7 @@ define dso_local void @foo2_through_obj(ptr %p, ptr %p2) {
 define void @prop_param_func_decl(ptr %p) {
 ; CHECK-LABEL: define {{[^@]+}}@prop_param_func_decl
 ; CHECK-SAME: (ptr [[P:%.*]]) {
-; CHECK-NEXT:    call void @bar1(ptr [[P]])
+; CHECK-NEXT:    call void @bar1(ptr readonly [[P]])
 ; CHECK-NEXT:    ret void
 ;
   call void @foo1_rdonly(ptr %p)
@@ -178,7 +199,7 @@ define void @prop_param_func_decl(ptr %p) {
 define void @prop_param_callbase_def(ptr %p) {
 ; CHECK-LABEL: define {{[^@]+}}@prop_param_callbase_def
 ; CHECK-SAME: (ptr [[P:%.*]]) {
-; CHECK-NEXT:    call void @bar1(ptr [[P]])
+; CHECK-NEXT:    call void @bar1(ptr readonly [[P]])
 ; CHECK-NEXT:    call void @bar1(ptr [[P]])
 ; CHECK-NEXT:    ret void
 ;
@@ -190,7 +211,7 @@ define void @prop_param_callbase_def(ptr %p) {
 define void @prop_param_callbase_def_2x(ptr %p, ptr %p2) {
 ; CHECK-LABEL: define {{[^@]+}}@prop_param_callbase_def_2x
 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[P2:%.*]]) {
-; CHECK-NEXT:    call void @bar2(ptr [[P]], ptr [[P]])
+; CHECK-NEXT:    call void @bar2(ptr readonly [[P]], ptr readonly [[P]])
 ; CHECK-NEXT:    ret void
 ;
   call void @foo2(ptr readonly %p, ptr %p)
@@ -202,7 +223,7 @@ define void @prop_param_callbase_def_2x_2(ptr %p, ptr %p2) {
 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[P2:%.*]]) {
 ; CHECK-NEXT:    [[PP_I:%.*]] = getelementptr i8, ptr [[P]], i64 9
 ; CHECK-NEXT:    [[P2P_I:%.*]] = getelementptr i8, ptr [[P2]], i64 123
-; CHECK-NEXT:    call void @bar2(ptr [[P2P_I]], ptr [[PP_I]])
+; CHECK-NEXT:    call void @bar2(ptr writeonly [[P2P_I]], ptr readonly [[PP_I]])
 ; CHECK-NEXT:    ret void
 ;
   call void @foo2_through_obj(ptr readonly %p, ptr writeonly %p2)
@@ -214,7 +235,7 @@ define void @prop_param_callbase_def_2x_incompat(ptr %p, ptr %p2) {
 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[P2:%.*]]) {
 ; CHECK-NEXT:    [[PP_I:%.*]] = getelementptr i8, ptr [[P]], i64 9
 ; CHECK-NEXT:    [[P2P_I:%.*]] = getelementptr i8, ptr [[P]], i64 123
-; CHECK-NEXT:    call void @bar2(ptr [[P2P_I]], ptr [[PP_I]])
+; CHECK-NEXT:    call void @bar2(ptr readonly [[P2P_I]], ptr readnone [[PP_I]])
 ; CHECK-NEXT:    ret void
 ;
   call void @foo2_through_obj(ptr readnone %p, ptr readonly %p)
@@ -224,7 +245,7 @@ define void @prop_param_callbase_def_2x_incompat(ptr %p, ptr %p2) {
 define void @prop_param_callbase_def_2x_incompat_2(ptr %p, ptr %p2) {
 ; CHECK-LABEL: define {{[^@]+}}@prop_param_callbase_def_2x_incompat_2
 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[P2:%.*]]) {
-; CHECK-NEXT:    call void @bar2(ptr [[P]], ptr [[P]])
+; CHECK-NEXT:    call void @bar2(ptr readonly [[P]], ptr readonly [[P]])
 ; CHECK-NEXT:    ret void
 ;
   call void @foo2(ptr readonly %p, ptr readnone %p)
@@ -234,7 +255,7 @@ define void @prop_param_callbase_def_2x_incompat_2(ptr %p, ptr %p2) {
 define void @prop_param_callbase_def_2x_incompat_3(ptr %p, ptr %p2) {
 ; CHECK-LABEL: define {{[^@]+}}@prop_param_callbase_def_2x_incompat_3
 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[P2:%.*]]) {
-; CHECK-NEXT:    call void @bar2(ptr [[P]], ptr [[P]])
+; CHECK-NEXT:    call void @bar2(ptr readnone [[P]], ptr readnone [[P]])
 ; CHECK-NEXT:    ret void
 ;
   call void @foo2_2(ptr readonly %p, ptr readnone %p)
@@ -244,7 +265,7 @@ define void @prop_param_callbase_def_2x_incompat_3(ptr %p, ptr %p2) {
 define void @prop_param_callbase_def_1x_partial(ptr %p, ptr %p2) {
 ; CHECK-LABEL: define {{[^@]+}}@prop_param_callbase_def_1x_partial
 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[P2:%.*]]) {
-; CHECK-NEXT:    call void @bar2(ptr [[P]], ptr [[P]])
+; CHECK-NEXT:    call void @bar2(ptr readonly [[P]], ptr readonly [[P]])
 ; CHECK-NEXT:    ret void
 ;
   call void @foo2(ptr readonly %p, ptr %p)
@@ -264,7 +285,7 @@ define void @prop_param_callbase_def_1x_partial_2(ptr %p, ptr %p2) {
 define void @prop_param_callbase_def_1x_partial_3(ptr %p, ptr %p2) {
 ; CHECK-LABEL: define {{[^@]+}}@prop_param_callbase_def_1x_partial_3
 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[P2:%.*]]) {
-; CHECK-NEXT:    call void @bar2(ptr [[P]], ptr [[P]])
+; CHECK-NEXT:    call void @bar2(ptr readonly [[P]], ptr readnone [[P]])
 ; CHECK-NEXT:    ret void
 ;
   call void @foo2_3(ptr readonly %p, ptr %p)
@@ -496,3 +517,25 @@ define void @prop_cb_def_mustprogress(ptr %p) {
   call void @foo1(ptr %p) mustprogress
   ret void
 }
+
+define void @prop_no_conflict_writable(ptr %p) {
+; CHECK-LABEL: define {{[^@]+}}@prop_no_conflict_writable
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    call void @bar1(ptr readonly [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @foo1_writable(ptr readonly %p)
+  ret void
+}
+
+
+define void @prop_no_conflict_writable2(ptr %p) {
+; CHECK-LABEL: define {{[^@]+}}@prop_no_conflict_writable2
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    call void @bar3(ptr readnone [[P]])
+; CHECK-NEXT:    ret void
+;
+  call void @foo3_writable(ptr readnone %p)
+  ret void
+}
+
diff --git a/llvm/test/Transforms/Inline/inline-cost-switch-default.ll b/llvm/test/Transforms/Inline/inline-cost-switch-default.ll
new file mode 100644
index 000000000000..e3768ac8233a
--- /dev/null
+++ b/llvm/test/Transforms/Inline/inline-cost-switch-default.ll
@@ -0,0 +1,130 @@
+; RUN: opt -S -passes=inline %s -debug-only=inline-cost -min-jump-table-entries=4 --disable-output 2>&1 | FileCheck %s -check-prefix=LOOKUPTABLE -match-full-lines
+; RUN: opt -S -passes=inline %s -debug-only=inline-cost -min-jump-table-entries=5 --disable-output 2>&1 | FileCheck %s -check-prefix=SWITCH -match-full-lines
+; REQUIRES: x86_64-linux, asserts
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i64 @main(i64 %a) {
+  %b = call i64 @small_switch_default(i64 %a)
+  %c = call i64 @small_switch_no_default(i64 %a)
+  %d = call i64 @lookup_table_default(i64 %a)
+  %e = call i64 @lookup_table_no_default(i64 %a)
+  ret i64 %b
+}
+
+; SWITCH-LABEL: Analyzing call of small_switch_default{{.*}}
+; SWITCH: Cost: 0
+define i64 @small_switch_default(i64 %a) {
+  switch i64 %a, label %default_branch [
+  i64 -1, label %branch_0
+  i64 8, label %branch_1
+  i64 52, label %branch_2
+  ]
+
+branch_0:
+  br label %exit
+
+branch_1:
+  br label %exit
+
+branch_2:
+  br label %exit
+
+default_branch:
+  br label %exit
+
+exit:
+  %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_1 ], [ 2, %branch_2 ], [ 3, %default_branch ]
+  ret i64 %b
+}
+
+; SWITCH-LABEL: Analyzing call of small_switch_no_default{{.*}}
+; SWITCH: Cost: -10
+define i64 @small_switch_no_default(i64 %a) {
+  switch i64 %a, label %unreachabledefault [
+  i64 -1, label %branch_0
+  i64 8, label %branch_1
+  i64 52, label %branch_2
+  ]
+
+branch_0:
+  br label %exit
+
+branch_1:
+  br label %exit
+
+branch_2:
+  br label %exit
+
+unreachabledefault:
+  unreachable
+
+exit:
+  %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_1 ], [ 2, %branch_2 ]
+  ret i64 %b
+}
+
+; LOOKUPTABLE-LABEL: Analyzing call of lookup_table_default{{.*}}
+; LOOKUPTABLE: Cost: 10
+; SWITCH-LABEL: Analyzing call of lookup_table_default{{.*}}
+; SWITCH: Cost: 20
+define i64 @lookup_table_default(i64 %a) {
+  switch i64 %a, label %default_branch [
+  i64 0, label %branch_0
+  i64 1, label %branch_1
+  i64 2, label %branch_2
+  i64 3, label %branch_3
+  ]
+
+branch_0:
+  br label %exit
+
+branch_1:
+  br label %exit
+
+branch_2:
+  br label %exit
+
+branch_3:
+  br label %exit
+
+default_branch:
+  br label %exit
+
+exit:
+  %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_1 ], [ 2, %branch_2 ], [ 7, %branch_3 ], [ 3, %default_branch ]
+  ret i64 %b
+}
+
+; LOOKUPTABLE-LABEL: Analyzing call of lookup_table_no_default{{.*}}
+; LOOKUPTABLE: Cost: 0
+; SWITCH-LABEL: Analyzing call of lookup_table_no_default{{.*}}
+; SWITCH: Cost: 20
+define i64 @lookup_table_no_default(i64 %a) {
+  switch i64 %a, label %unreachabledefault [
+  i64 0, label %branch_0
+  i64 1, label %branch_1
+  i64 2, label %branch_2
+  i64 3, label %branch_3
+  ]
+
+branch_0:
+  br label %exit
+
+branch_1:
+  br label %exit
+
+branch_2:
+  br label %exit
+
+branch_3:
+  br label %exit
+
+unreachabledefault:
+  unreachable
+
+exit:
+  %b = phi i64 [ 5, %branch_0 ], [ 9, %branch_1 ], [ 2, %branch_2 ], [ 7, %branch_3 ]
+  ret i64 %b
+}
diff --git a/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll b/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll
index 4a9c576f0271..f02d03688f03 100644
--- a/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll
+++ b/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll
@@ -38,8 +38,6 @@ store_ptr_in_gvar:                                ; preds = %entry
 
 check_pointers_are_equal:                         ; preds = %store_ptr_in_gvar, %entry
   %phi = phi ptr [ %ptr, %store_ptr_in_gvar ], [ @other_g_var, %entry ]
-; FIXME: While inlining, the following is miscompiled to i1 false,
-; as %ptr in the phi-node is not taken into account.
   %.not1 = icmp eq ptr %phi, %ptr
   br i1 %.not1, label %return, label %abort
 
@@ -64,9 +62,13 @@ define i32 @main() {
 ; CHECK-NEXT:    br label [[CHECK_POINTERS_ARE_EQUAL_I]]
 ; CHECK:       check_pointers_are_equal.i:
 ; CHECK-NEXT:    [[PHI_I:%.*]] = phi ptr [ [[G_VAR]], [[STORE_PTR_IN_GVAR_I]] ], [ @other_g_var, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[DOTNOT1_I:%.*]] = icmp eq ptr [[PHI_I]], [[G_VAR]]
+; CHECK-NEXT:    br i1 [[DOTNOT1_I]], label [[CALLEE_EXIT:%.*]], label [[ABORT_I:%.*]]
+; CHECK:       abort.i:
 ; CHECK-NEXT:    call void @abort()
 ; CHECK-NEXT:    unreachable
 ; CHECK:       callee.exit:
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr [[G_VAR]])
 ; CHECK-NEXT:    ret i32 0
 ;
   call void @callee(ptr noundef byval(%struct.a) align 8 @g_var)
diff --git a/llvm/test/Transforms/Inline/inline-drop-attributes.ll b/llvm/test/Transforms/Inline/inline-drop-attributes.ll
new file mode 100644
index 000000000000..9a451f4b8699
--- /dev/null
+++ b/llvm/test/Transforms/Inline/inline-drop-attributes.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=inline -S | FileCheck %s
+; RUN: opt < %s -passes='cgscc(inline)' -S | FileCheck %s
+
+define void @callee() {
+; CHECK-LABEL: define void @callee() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[VAL_PTR:%.*]] = load ptr, ptr null, align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[VAL_PTR]], null
+; CHECK-NEXT:    [[VAL:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i64 undef, i64 [[VAL]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %val_ptr = load ptr, ptr null, align 8
+  %cmp = icmp eq ptr %val_ptr, null
+  %val = load i64, ptr null, align 8
+  %sel = select i1 %cmp, i64 undef, i64 %val
+  ret void
+}
+
+define noundef i1 @caller() {
+; CHECK-LABEL: define noundef i1 @caller() {
+; CHECK-NEXT:    [[VAL_PTR_I:%.*]] = load ptr, ptr null, align 8
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq ptr [[VAL_PTR_I]], null
+; CHECK-NEXT:    [[VAL_I:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT:    [[SEL_I:%.*]] = select i1 [[CMP_I]], i64 undef, i64 [[VAL_I]]
+; CHECK-NEXT:    ret i1 false
+;
+  call void @callee()
+  ret i1 false
+}
diff --git a/llvm/test/Transforms/Inline/inline-switch-default-2.ll b/llvm/test/Transforms/Inline/inline-switch-default-2.ll
index 82dae1c27648..169cb2cff9b8 100644
--- a/llvm/test/Transforms/Inline/inline-switch-default-2.ll
+++ b/llvm/test/Transforms/Inline/inline-switch-default-2.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt %s -S -passes=inline -inline-threshold=21 | FileCheck %s
+; RUN: opt %s -S -passes=inline -inline-threshold=11 | FileCheck %s
 
 ; Check for scenarios without TTI.
 
@@ -16,24 +16,7 @@ define i64 @foo1(i64 %a) {
 define i64 @foo2(i64 %a) {
 ; CHECK-LABEL: define i64 @foo2(
 ; CHECK-SAME: i64 [[A:%.*]]) {
-; CHECK-NEXT:    switch i64 [[A]], label [[UNREACHABLEDEFAULT_I:%.*]] [
-; CHECK-NEXT:      i64 0, label [[BRANCH_0_I:%.*]]
-; CHECK-NEXT:      i64 2, label [[BRANCH_2_I:%.*]]
-; CHECK-NEXT:      i64 4, label [[BRANCH_4_I:%.*]]
-; CHECK-NEXT:      i64 6, label [[BRANCH_6_I:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       branch_0.i:
-; CHECK-NEXT:    br label [[BAR2_EXIT:%.*]]
-; CHECK:       branch_2.i:
-; CHECK-NEXT:    br label [[BAR2_EXIT]]
-; CHECK:       branch_4.i:
-; CHECK-NEXT:    br label [[BAR2_EXIT]]
-; CHECK:       branch_6.i:
-; CHECK-NEXT:    br label [[BAR2_EXIT]]
-; CHECK:       unreachabledefault.i:
-; CHECK-NEXT:    unreachable
-; CHECK:       bar2.exit:
-; CHECK-NEXT:    [[B_I:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ]
+; CHECK-NEXT:    [[B_I:%.*]] = call i64 @bar2(i64 [[A]])
 ; CHECK-NEXT:    ret i64 [[B_I]]
 ;
   %b = call i64 @bar2(i64 %a)
diff --git a/llvm/test/Transforms/Inline/inline-switch-default.ll b/llvm/test/Transforms/Inline/inline-switch-default.ll
index 44f1304e82df..288d414fe0e0 100644
--- a/llvm/test/Transforms/Inline/inline-switch-default.ll
+++ b/llvm/test/Transforms/Inline/inline-switch-default.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt %s -S -passes=inline -inline-threshold=26 -min-jump-table-entries=4 | FileCheck %s -check-prefix=LOOKUPTABLE
-; RUN: opt %s -S -passes=inline -inline-threshold=21 -min-jump-table-entries=5 | FileCheck %s -check-prefix=SWITCH
+; RUN: opt %s -S -passes=inline -inline-threshold=16 -min-jump-table-entries=4 | FileCheck %s -check-prefix=LOOKUPTABLE
+; RUN: opt %s -S -passes=inline -inline-threshold=11 -min-jump-table-entries=5 | FileCheck %s -check-prefix=SWITCH
+; REQUIRES: x86_64-linux
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -22,6 +23,8 @@ define i64 @foo1(i64 %a) {
   ret i64 %b
 }
 
+; Since the default branch is undefined behavior,
+; we can inline `bar2`: https://github.com/llvm/llvm-project/issues/90929
 define i64 @foo2(i64 %a) {
 ; LOOKUPTABLE-LABEL: define i64 @foo2(
 ; LOOKUPTABLE-SAME: i64 [[A:%.*]]) {
@@ -47,24 +50,7 @@ define i64 @foo2(i64 %a) {
 ;
 ; SWITCH-LABEL: define i64 @foo2(
 ; SWITCH-SAME: i64 [[A:%.*]]) {
-; SWITCH-NEXT:    switch i64 [[A]], label [[UNREACHABLEDEFAULT_I:%.*]] [
-; SWITCH-NEXT:      i64 0, label [[BRANCH_0_I:%.*]]
-; SWITCH-NEXT:      i64 2, label [[BRANCH_2_I:%.*]]
-; SWITCH-NEXT:      i64 4, label [[BRANCH_4_I:%.*]]
-; SWITCH-NEXT:      i64 6, label [[BRANCH_6_I:%.*]]
-; SWITCH-NEXT:    ]
-; SWITCH:       branch_0.i:
-; SWITCH-NEXT:    br label [[BAR2_EXIT:%.*]]
-; SWITCH:       branch_2.i:
-; SWITCH-NEXT:    br label [[BAR2_EXIT]]
-; SWITCH:       branch_4.i:
-; SWITCH-NEXT:    br label [[BAR2_EXIT]]
-; SWITCH:       branch_6.i:
-; SWITCH-NEXT:    br label [[BAR2_EXIT]]
-; SWITCH:       unreachabledefault.i:
-; SWITCH-NEXT:    unreachable
-; SWITCH:       bar2.exit:
-; SWITCH-NEXT:    [[B_I:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ]
+; SWITCH-NEXT:    [[B_I:%.*]] = call i64 @bar2(i64 [[A]])
 ; SWITCH-NEXT:    ret i64 [[B_I]]
 ;
   %b = call i64 @bar2(i64 %a)
diff --git a/llvm/test/Transforms/Inline/noalias-calls-always.ll b/llvm/test/Transforms/Inline/noalias-calls-always.ll
index 9c851b932783..a80cd12b26b6 100644
--- a/llvm/test/Transforms/Inline/noalias-calls-always.ll
+++ b/llvm/test/Transforms/Inline/noalias-calls-always.ll
@@ -34,11 +34,11 @@ define void @foo(ptr nocapture %a, ptr nocapture readonly %c, ptr nocapture %b)
 ; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
 ; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 512, ptr [[L_I]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A:%.*]], ptr align 16 [[B:%.*]], i64 16, i1 false), !noalias !3
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[B]], ptr align 16 [[C:%.*]], i64 16, i1 false), !noalias !0
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A]], ptr align 16 [[C]], i64 16, i1 false), !alias.scope !5
-; CHECK-NEXT:    call void @hey(), !noalias !5
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[L_I]], ptr align 16 [[C]], i64 16, i1 false), !noalias !0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A:%.*]], ptr align 16 [[B:%.*]], i64 16, i1 false), !noalias [[META3]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[B]], ptr readonly align 16 [[C:%.*]], i64 16, i1 false), !noalias [[META0]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A]], ptr readonly align 16 [[C]], i64 16, i1 false), !alias.scope [[META5:![0-9]+]]
+; CHECK-NEXT:    call void @hey(), !noalias [[META5]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[L_I]], ptr readonly align 16 [[C]], i64 16, i1 false), !noalias [[META0]]
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 512, ptr [[L_I]])
 ; CHECK-NEXT:    ret void
 ;
@@ -75,11 +75,11 @@ define void @foo_cs(ptr nocapture %a, ptr nocapture readonly %c, ptr nocapture %
 ; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
 ; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 512, ptr [[L_I]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A:%.*]], ptr align 16 [[B:%.*]], i64 16, i1 false), !noalias !9
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[B]], ptr align 16 [[C:%.*]], i64 16, i1 false), !noalias !6
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A]], ptr align 16 [[C]], i64 16, i1 false), !alias.scope !11
-; CHECK-NEXT:    call void @hey(), !noalias !11
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[L_I]], ptr align 16 [[C]], i64 16, i1 false), !noalias !6
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A:%.*]], ptr align 16 [[B:%.*]], i64 16, i1 false), !noalias [[META9]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[B]], ptr readonly align 16 [[C:%.*]], i64 16, i1 false), !noalias [[META6]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A]], ptr readonly align 16 [[C]], i64 16, i1 false), !alias.scope [[META11:![0-9]+]]
+; CHECK-NEXT:    call void @hey(), !noalias [[META11]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[L_I]], ptr readonly align 16 [[C]], i64 16, i1 false), !noalias [[META6]]
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 512, ptr [[L_I]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/Inline/noalias-calls.ll b/llvm/test/Transforms/Inline/noalias-calls.ll
index e3791da54b23..0dd9ec3498a9 100644
--- a/llvm/test/Transforms/Inline/noalias-calls.ll
+++ b/llvm/test/Transforms/Inline/noalias-calls.ll
@@ -37,11 +37,11 @@ define void @foo(ptr nocapture %a, ptr nocapture readonly %c, ptr nocapture %b)
 ; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
 ; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 512, ptr [[L_I]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A]], ptr align 16 [[B]], i64 16, i1 false), !noalias !3
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[B]], ptr align 16 [[C]], i64 16, i1 false), !noalias !0
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A]], ptr align 16 [[C]], i64 16, i1 false), !alias.scope !5
-; CHECK-NEXT:    call void @hey(), !noalias !5
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[L_I]], ptr align 16 [[C]], i64 16, i1 false), !noalias !0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A]], ptr align 16 [[B]], i64 16, i1 false), !noalias [[META3]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[B]], ptr readonly align 16 [[C]], i64 16, i1 false), !noalias [[META0]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A]], ptr readonly align 16 [[C]], i64 16, i1 false), !alias.scope [[META5:![0-9]+]]
+; CHECK-NEXT:    call void @hey(), !noalias [[META5]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[L_I]], ptr readonly align 16 [[C]], i64 16, i1 false), !noalias [[META0]]
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 512, ptr [[L_I]])
 ; CHECK-NEXT:    ret void
 ;
@@ -80,11 +80,11 @@ define void @foo_cs(ptr nocapture %a, ptr nocapture readonly %c, ptr nocapture %
 ; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
 ; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 512, ptr [[L_I]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A]], ptr align 16 [[B]], i64 16, i1 false), !noalias !9
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[B]], ptr align 16 [[C]], i64 16, i1 false), !noalias !6
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A]], ptr align 16 [[C]], i64 16, i1 false), !alias.scope !11
-; CHECK-NEXT:    call void @hey(), !noalias !11
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[L_I]], ptr align 16 [[C]], i64 16, i1 false), !noalias !6
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A]], ptr align 16 [[B]], i64 16, i1 false), !noalias [[META9]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[B]], ptr readonly align 16 [[C]], i64 16, i1 false), !noalias [[META6]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[A]], ptr readonly align 16 [[C]], i64 16, i1 false), !alias.scope [[META11:![0-9]+]]
+; CHECK-NEXT:    call void @hey(), !noalias [[META11]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[L_I]], ptr readonly align 16 [[C]], i64 16, i1 false), !noalias [[META6]]
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 512, ptr [[L_I]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/Inline/prof-update-sample-alwaysinline.ll b/llvm/test/Transforms/Inline/prof-update-sample-alwaysinline.ll
index d6b771e2629d..8af4d89663a4 100644
--- a/llvm/test/Transforms/Inline/prof-update-sample-alwaysinline.ll
+++ b/llvm/test/Transforms/Inline/prof-update-sample-alwaysinline.ll
@@ -53,7 +53,6 @@ define void @caller() {
 !18 = !{!"VP", i32 0, i64 140, i64 111, i64 80, i64 222, i64 40, i64 333, i64 20}
 attributes #0 = { alwaysinline }
 ; CHECK: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 600}
-; CHECK: ![[COUNT_CALLEE1]] = !{!"branch_weights", i32 2000}
 ; CHECK: ![[COUNT_CALLEE]] = !{!"branch_weights", i32 1200}
 ; CHECK: ![[COUNT_IND_CALLEE]] = !{!"VP", i32 0, i64 84, i64 111, i64 48, i64 222, i64 24, i64 333, i64 12}
 ; CHECK: ![[COUNT_CALLER]] = !{!"branch_weights", i32 800}
diff --git a/llvm/test/Transforms/Inline/prof-update-sample.ll b/llvm/test/Transforms/Inline/prof-update-sample.ll
index 6cdd70e84e0c..e09b859b6981 100644
--- a/llvm/test/Transforms/Inline/prof-update-sample.ll
+++ b/llvm/test/Transforms/Inline/prof-update-sample.ll
@@ -52,7 +52,6 @@ define void @caller() {
 !17 = !{!"branch_weights", i32 400}
 !18 = !{!"VP", i32 0, i64 140, i64 111, i64 80, i64 222, i64 40, i64 333, i64 20}
 ; CHECK: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 600}
-; CHECK: ![[COUNT_CALLEE1]] = !{!"branch_weights", i32 2000}
 ; CHECK: ![[COUNT_CALLEE]] = !{!"branch_weights", i32 1200}
 ; CHECK: ![[COUNT_IND_CALLEE]] = !{!"VP", i32 0, i64 84, i64 111, i64 48, i64 222, i64 24, i64 333, i64 12}
 ; CHECK: ![[COUNT_CALLER]] = !{!"branch_weights", i32 800}
diff --git a/llvm/test/Transforms/Inline/update_invoke_prof.ll b/llvm/test/Transforms/Inline/update_invoke_prof.ll
index 5f09c7cf8fe0..f6b86dfe5bb1 100644
--- a/llvm/test/Transforms/Inline/update_invoke_prof.ll
+++ b/llvm/test/Transforms/Inline/update_invoke_prof.ll
@@ -1,22 +1,31 @@
-; A pre-commit test to show that branch weights and value profiles associated with invoke are not updated.
+; Test that branch weights and value profiles associated with invoke are updated
+; in both caller and callee after inline, but invoke instructions with taken or
+; not taken branch probabilities are not updated.
 ; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -S | FileCheck %s
 
 declare i32 @__gxx_personality_v0(...)
 
 define void @caller(ptr %func) personality ptr @__gxx_personality_v0 !prof !15 {
   call void @callee(ptr %func), !prof !16
+
   ret void
 }
 
-declare void @inner_callee(ptr %func)
+declare void @callee1(ptr %func)
+
+declare void @callee2(ptr %func)
 
 define void @callee(ptr %func) personality ptr @__gxx_personality_v0 !prof !17 {
   invoke void %func()
           to label %next unwind label %lpad, !prof !18
 
 next:
-  invoke void @inner_callee(ptr %func)
-          to label %ret unwind label %lpad, !prof !19
+  invoke void @callee1(ptr %func)
+          to label %cont unwind label %lpad, !prof !19
+
+cont:
+  invoke void @callee2(ptr %func)
+          to label %ret unwind label %lpad, !prof !20
 
 lpad:
   %exn = landingpad {ptr, i32}
@@ -47,18 +56,27 @@ ret:
 !17 = !{!"function_entry_count", i32 1500}
 !18 = !{!"VP", i32 0, i64 1500, i64 123, i64 900, i64 456, i64 600}
 !19 = !{!"branch_weights", i32 1500}
+!20 = !{!"branch_weights", i32 1234, i32 5678}
 
 ; CHECK-LABEL: @caller(
 ; CHECK:  invoke void %func(
 ; CHECK-NEXT: {{.*}} !prof ![[PROF1:[0-9]+]]
-; CHECK:  invoke void @inner_callee(
+; CHECK:  invoke void @callee1(
 ; CHECK-NEXT: {{.*}} !prof ![[PROF2:[0-9]+]]
+; CHECK:  invoke void @callee2(
+; CHECK-NEXT: {{.*}} !prof ![[PROF3:[0-9]+]]
 
 ; CHECK-LABL: @callee(
 ; CHECK:  invoke void %func(
-; CHECK-NEXT: {{.*}} !prof ![[PROF1]] 
-; CHECK:  invoke void @inner_callee(
-; CHECK-NEXT: {{.*}} !prof ![[PROF2]]
+; CHECK-NEXT: {{.*}} !prof ![[PROF4:[0-9]+]]
+; CHECK:  invoke void @callee1(
+; CHECK-NEXT: {{.*}} !prof ![[PROF5:[0-9]+]]
+; CHECK:  invoke void @callee2(
+; CHECK-NEXT: {{.*}} !prof ![[PROF3]]
+
 
-; CHECK: ![[PROF1]] = !{!"VP", i32 0, i64 1500, i64 123, i64 900, i64 456, i64 600}
-; CHECK: ![[PROF2]] = !{!"branch_weights", i32 1500}
+; CHECK: ![[PROF1]] = !{!"VP", i32 0, i64 1000, i64 123, i64 600, i64 456, i64 400}
+; CHECK: ![[PROF2]] = !{!"branch_weights", i32 1000}
+; CHECK: ![[PROF3]] = !{!"branch_weights", i32 1234, i32 5678}
+; CHECK: ![[PROF4]] = !{!"VP", i32 0, i64 500, i64 123, i64 300, i64 456, i64 200}
+; CHECK: ![[PROF5]] = !{!"branch_weights", i32 500}
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll b/llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll
index 2d0e3d2edd90..c14d61b51ad7 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll
@@ -10,7 +10,7 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
 define i8 @memcpy_constant_arg_ptr_to_alloca(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
 ; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
@@ -24,7 +24,7 @@ define i8 @memcpy_constant_arg_ptr_to_alloca(ptr addrspace(4) noalias readonly a
 define i8 @memcpy_constant_arg_ptr_to_alloca_load_metadata(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
 ; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_load_metadata(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1, !noalias [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
@@ -38,7 +38,7 @@ define i8 @memcpy_constant_arg_ptr_to_alloca_load_metadata(ptr addrspace(4) noal
 define i64 @memcpy_constant_arg_ptr_to_alloca_load_alignment(ptr addrspace(4) noalias readonly align 4 dereferenceable(256) %arg, i32 %idx) {
 ; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_load_alignment(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [32 x i64], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i64], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(4) [[GEP]], align 16
 ; CHECK-NEXT:    ret i64 [[LOAD]]
 ;
@@ -68,7 +68,7 @@ define i64 @memcpy_constant_arg_ptr_to_alloca_load_atomic(ptr addrspace(4) noali
 define i8 @memmove_constant_arg_ptr_to_alloca(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
 ; CHECK-LABEL: @memmove_constant_arg_ptr_to_alloca(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
@@ -83,7 +83,7 @@ define i8 @memmove_constant_arg_ptr_to_alloca(ptr addrspace(4) noalias readonly
 define amdgpu_kernel void @memcpy_constant_byref_arg_ptr_to_alloca(ptr addrspace(4) noalias readonly align 4 byref([32 x i8]) %arg, ptr addrspace(1) %out, i32 %idx) {
 ; CHECK-LABEL: @memcpy_constant_byref_arg_ptr_to_alloca(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
 ; CHECK-NEXT:    store i8 [[LOAD]], ptr addrspace(1) [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
@@ -138,7 +138,7 @@ define amdgpu_kernel void @memcpy_constant_intrinsic_ptr_to_alloca(ptr addrspace
 define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
 ; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll
index 56ee54d351e7..25087fef68a1 100644
--- a/llvm/test/Transforms/InstCombine/add.ll
+++ b/llvm/test/Transforms/InstCombine/add.ll
@@ -3284,12 +3284,14 @@ define i32 @add_reduce_sqr_sum_flipped(i32 %a, i32 %b) {
   ret i32 %add
 }
 
-define i32 @add_reduce_sqr_sum_flipped2(i32 %a, i32 %b) {
+define i32 @add_reduce_sqr_sum_flipped2(i32 %a, i32 %bx) {
 ; CHECK-LABEL: @add_reduce_sqr_sum_flipped2(
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = xor i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[B]], [[A:%.*]]
 ; CHECK-NEXT:    [[ADD:%.*]] = mul i32 [[TMP1]], [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[ADD]]
 ;
+  %b = xor i32 %bx, 42 ; thwart complexity-based canonicalization
   %a_sq = mul nsw i32 %a, %a
   %two_a = shl i32 %a, 1
   %two_a_plus_b = add i32 %two_a, %b
@@ -3342,12 +3344,14 @@ define i32 @add_reduce_sqr_sum_order2_flipped(i32 %a, i32 %b) {
   ret i32 %ab2
 }
 
-define i32 @add_reduce_sqr_sum_order2_flipped2(i32 %a, i32 %b) {
+define i32 @add_reduce_sqr_sum_order2_flipped2(i32 %a, i32 %bx) {
 ; CHECK-LABEL: @add_reduce_sqr_sum_order2_flipped2(
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = xor i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[B]], [[A:%.*]]
 ; CHECK-NEXT:    [[AB2:%.*]] = mul i32 [[TMP1]], [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[AB2]]
 ;
+  %b = xor i32 %bx, 42 ; thwart complexity-based canonicalization
   %a_sq = mul nsw i32 %a, %a
   %twoa = mul i32 %a, 2
   %twoab = mul i32 %twoa, %b
@@ -3357,12 +3361,14 @@ define i32 @add_reduce_sqr_sum_order2_flipped2(i32 %a, i32 %b) {
   ret i32 %ab2
 }
 
-define i32 @add_reduce_sqr_sum_order2_flipped3(i32 %a, i32 %b) {
+define i32 @add_reduce_sqr_sum_order2_flipped3(i32 %a, i32 %bx) {
 ; CHECK-LABEL: @add_reduce_sqr_sum_order2_flipped3(
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = xor i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[B]], [[A:%.*]]
 ; CHECK-NEXT:    [[AB2:%.*]] = mul i32 [[TMP1]], [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[AB2]]
 ;
+  %b = xor i32 %bx, 42 ; thwart complexity-based canonicalization
   %a_sq = mul nsw i32 %a, %a
   %twoa = mul i32 %a, 2
   %twoab = mul i32 %b, %twoa
@@ -3552,12 +3558,14 @@ define i32 @add_reduce_sqr_sum_order5_flipped2(i32 %a, i32 %b) {
   ret i32 %ab2
 }
 
-define i32 @add_reduce_sqr_sum_order5_flipped3(i32 %a, i32 %b) {
+define i32 @add_reduce_sqr_sum_order5_flipped3(i32 %ax, i32 %b) {
 ; CHECK-LABEL: @add_reduce_sqr_sum_order5_flipped3(
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[A:%.*]] = xor i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[A]], [[B:%.*]]
 ; CHECK-NEXT:    [[AB2:%.*]] = mul i32 [[TMP1]], [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[AB2]]
 ;
+  %a = xor i32 %ax, 42 ; thwart complexity-based canonicalization
   %a_sq = mul nsw i32 %a, %a
   %twob = mul i32 %b, 2
   %twoab = mul i32 %a, %twob
@@ -4018,8 +4026,8 @@ define i32 @add_reduce_sqr_sum_varC_invalid2(i32 %a, i32 %b) {
 
 define i32 @fold_sext_addition_or_disjoint(i8 %x) {
 ; CHECK-LABEL: @fold_sext_addition_or_disjoint(
-; CHECK-NEXT:    [[SE:%.*]] = sext i8 [[XX:%.*]] to i32
-; CHECK-NEXT:    [[R:%.*]] = add nsw i32 [[SE]], 1246
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nsw i32 [[TMP1]], 1246
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %xx = or disjoint i8 %x, 12
@@ -4043,8 +4051,8 @@ define i32 @fold_sext_addition_fail(i8 %x) {
 
 define i32 @fold_zext_addition_or_disjoint(i8 %x) {
 ; CHECK-LABEL: @fold_zext_addition_or_disjoint(
-; CHECK-NEXT:    [[SE:%.*]] = zext i8 [[XX:%.*]] to i32
-; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SE]], 1246
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[TMP1]], 1246
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %xx = or disjoint i8 %x, 12
@@ -4055,9 +4063,9 @@ define i32 @fold_zext_addition_or_disjoint(i8 %x) {
 
 define i32 @fold_zext_addition_or_disjoint2(i8 %x) {
 ; CHECK-LABEL: @fold_zext_addition_or_disjoint2(
-; CHECK-NEXT:    [[XX:%.*]] = add nuw i8 [[X:%.*]], 4
-; CHECK-NEXT:    [[SE:%.*]] = zext i8 [[XX]] to i32
-; CHECK-NEXT:    ret i32 [[SE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i8 [[X:%.*]], 4
+; CHECK-NEXT:    [[R:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[R]]
 ;
   %xx = or disjoint i8 %x, 18
   %se = zext i8 %xx to i32
diff --git a/llvm/test/Transforms/InstCombine/apint-and-xor-merge.ll b/llvm/test/Transforms/InstCombine/apint-and-xor-merge.ll
index c904035f41ca..9810e5057d8a 100644
--- a/llvm/test/Transforms/InstCombine/apint-and-xor-merge.ll
+++ b/llvm/test/Transforms/InstCombine/apint-and-xor-merge.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; This test case checks that the merge of and/xor can work on arbitrary
 ; precision integers.
 
@@ -7,8 +7,8 @@
 ; (x &z ) ^ (y & z) -> (x ^ y) & z
 define i57 @test1(i57 %x, i57 %y, i57 %z) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[TMP61:%.*]] = xor i57 %x, %y
-; CHECK-NEXT:    [[TMP7:%.*]] = and i57 [[TMP61]], %z
+; CHECK-NEXT:    [[TMP61:%.*]] = xor i57 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP7:%.*]] = and i57 [[TMP61]], [[Z:%.*]]
 ; CHECK-NEXT:    ret i57 [[TMP7]]
 ;
   %tmp3 = and i57 %z, %x
@@ -20,7 +20,7 @@ define i57 @test1(i57 %x, i57 %y, i57 %z) {
 ; (x & y) ^ (x | y) -> x ^ y
 define i23 @test2(i23 %x, i23 %y, i23 %z) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[TMP7:%.*]] = xor i23 %y, %x
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i23 [[Y:%.*]], [[X:%.*]]
 ; CHECK-NEXT:    ret i23 [[TMP7]]
 ;
   %tmp3 = and i23 %y, %x
diff --git a/llvm/test/Transforms/InstCombine/apint-or.ll b/llvm/test/Transforms/InstCombine/apint-or.ll
index 939d151c21d2..38bffdf35a36 100644
--- a/llvm/test/Transforms/InstCombine/apint-or.ll
+++ b/llvm/test/Transforms/InstCombine/apint-or.ll
@@ -1,56 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s
 
 ; These tests are for Integer BitWidth <= 64 && BitWidth % 2 != 0.
+;; A | ~A == -1
 define i23 @test1(i23 %A) {
-    ;; A | ~A == -1
-    %NotA = xor i23 -1, %A
-    %B = or i23 %A, %NotA
-    ret i23 %B
-; CHECK-LABEL: @test1
-; CHECK-NEXT: ret i23 -1
+; CHECK-LABEL: define i23 @test1(
+; CHECK-SAME: i23 [[A:%.*]]) {
+; CHECK-NEXT:    ret i23 -1
+;
+  %NotA = xor i23 -1, %A
+  %B = or i23 %A, %NotA
+  ret i23 %B
 }
 
+;; If we have: ((V + N) & C1) | (V & C2)
+;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
+;; replace with V+N.
 define i39 @test2(i39 %V, i39 %M) {
-    ;; If we have: ((V + N) & C1) | (V & C2)
-    ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
-    ;; replace with V+N.
-    %C1 = xor i39 274877906943, -1 ;; C2 = 274877906943
-    %N = and i39 %M, 274877906944
-    %A = add i39 %V, %N
-    %B = and i39 %A, %C1
-    %D = and i39 %V, 274877906943
-    %R = or i39 %B, %D
-    ret i39 %R
-; CHECK-LABEL: @test2
-; CHECK-NEXT: %N = and i39 %M, -274877906944
-; CHECK-NEXT: %A = add i39 %N, %V
-; CHECK-NEXT: ret i39 %A
+; CHECK-LABEL: define i39 @test2(
+; CHECK-SAME: i39 [[V:%.*]], i39 [[M:%.*]]) {
+; CHECK-NEXT:    [[N:%.*]] = and i39 [[M]], -274877906944
+; CHECK-NEXT:    [[A:%.*]] = add i39 [[N]], [[V]]
+; CHECK-NEXT:    ret i39 [[A]]
+;
+  %C1 = xor i39 274877906943, -1 ;; C2 = 274877906943
+  %N = and i39 %M, 274877906944
+  %A = add i39 %V, %N
+  %B = and i39 %A, %C1
+  %D = and i39 %V, 274877906943
+  %R = or i39 %B, %D
+  ret i39 %R
 }
 
 ; These tests are for Integer BitWidth > 64 && BitWidth <= 1024.
+;; A | ~A == -1
 define i1023 @test4(i1023 %A) {
-    ;; A | ~A == -1
-    %NotA = xor i1023 -1, %A
-    %B = or i1023 %A, %NotA
-    ret i1023 %B
-; CHECK-LABEL: @test4
-; CHECK-NEXT: ret i1023 -1
+; CHECK-LABEL: define i1023 @test4(
+; CHECK-SAME: i1023 [[A:%.*]]) {
+; CHECK-NEXT:    ret i1023 -1
+;
+  %NotA = xor i1023 -1, %A
+  %B = or i1023 %A, %NotA
+  ret i1023 %B
 }
 
+;; If we have: ((V + N) & C1) | (V & C2)
+;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
+;; replace with V+N.
 define i399 @test5(i399 %V, i399 %M) {
-    ;; If we have: ((V + N) & C1) | (V & C2)
-    ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
-    ;; replace with V+N.
-    %C1 = xor i399 274877906943, -1 ;; C2 = 274877906943
-    %N = and i399 %M, 18446742974197923840
-    %A = add i399 %V, %N
-    %B = and i399 %A, %C1
-    %D = and i399 %V, 274877906943
-    %R = or i399 %B, %D
-    ret i399 %R
-; CHECK-LABEL: @test5
-; CHECK-NEXT: %N = and i399 %M, 18446742974197923840
-; CHECK-NEXT: %A = add i399 %N, %V
-; CHECK-NEXT: ret i399 %A
+; CHECK-LABEL: define i399 @test5(
+; CHECK-SAME: i399 [[V:%.*]], i399 [[M:%.*]]) {
+; CHECK-NEXT:    [[N:%.*]] = and i399 [[M]], 18446742974197923840
+; CHECK-NEXT:    [[A:%.*]] = add i399 [[N]], [[V]]
+; CHECK-NEXT:    ret i399 [[A]]
+;
+  %C1 = xor i399 274877906943, -1 ;; C2 = 274877906943
+  %N = and i399 %M, 18446742974197923840
+  %A = add i399 %V, %N
+  %B = and i399 %A, %C1
+  %D = and i399 %V, 274877906943
+  %R = or i399 %B, %D
+  ret i399 %R
 }
-
diff --git a/llvm/test/Transforms/InstCombine/bit_ceil.ll b/llvm/test/Transforms/InstCombine/bit_ceil.ll
index 16631afa4878..79665be01576 100644
--- a/llvm/test/Transforms/InstCombine/bit_ceil.ll
+++ b/llvm/test/Transforms/InstCombine/bit_ceil.ll
@@ -284,6 +284,42 @@ define <4 x i32> @bit_ceil_v4i32(<4 x i32> %x) {
   ret <4 x i32> %sel
 }
 
+define i32 @pr91691(i32 %0) {
+; CHECK-LABEL: @pr91691(
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 -2, [[TMP0:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[TMP2]], i1 false)
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[TMP4]], 31
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i32 1, [[TMP5]]
+; CHECK-NEXT:    ret i32 [[TMP6]]
+;
+  %2 = sub nuw i32 -2, %0
+  %3 = tail call i32 @llvm.ctlz.i32(i32 %2, i1 false)
+  %4 = sub i32 32, %3
+  %5 = shl i32 1, %4
+  %6 = icmp ult i32 %0, -2
+  %7 = select i1 %6, i32 %5, i32 1
+  ret i32 %7
+}
+
+define i32 @pr91691_keep_nsw(i32 %0) {
+; CHECK-LABEL: @pr91691_keep_nsw(
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw i32 -2, [[TMP0:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[TMP2]], i1 false)
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[TMP4]], 31
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i32 1, [[TMP5]]
+; CHECK-NEXT:    ret i32 [[TMP6]]
+;
+  %2 = sub nsw i32 -2, %0
+  %3 = tail call i32 @llvm.ctlz.i32(i32 %2, i1 false)
+  %4 = sub i32 32, %3
+  %5 = shl i32 1, %4
+  %6 = icmp ult i32 %0, -2
+  %7 = select i1 %6, i32 %5, i32 1
+  ret i32 %7
+}
+
 declare i32 @llvm.ctlz.i32(i32, i1 immarg)
 declare i64 @llvm.ctlz.i64(i64, i1 immarg)
 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
diff --git a/llvm/test/Transforms/InstCombine/call-guard.ll b/llvm/test/Transforms/InstCombine/call-guard.ll
index 6d9308bbbd81..358518b9bd1c 100644
--- a/llvm/test/Transforms/InstCombine/call-guard.ll
+++ b/llvm/test/Transforms/InstCombine/call-guard.ll
@@ -114,22 +114,22 @@ define void @negative_div(i32 %V1, i32 %D) {
 ; Highlight the limit of the window in a case which would otherwise be mergable
 define void @negative_window(i32 %V1, i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: @negative_window(
-; CHECK-NEXT:    [[A:%.*]] = icmp slt i32 [[V1:%.*]], 0
-; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[A]], i32 123) [ "deopt"() ]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[V1:%.*]], 0
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[CMP1]], i32 123) [ "deopt"() ]
 ; CHECK-NEXT:    [[V2:%.*]] = add i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[V3:%.*]] = add i32 [[V2]], [[C:%.*]]
 ; CHECK-NEXT:    [[V4:%.*]] = add i32 [[V3]], [[D:%.*]]
-; CHECK-NEXT:    [[B:%.*]] = icmp slt i32 [[V4]], 0
-; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[B]], i32 456) [ "deopt"() ]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[V4]], 0
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[CMP2]], i32 456) [ "deopt"() ]
 ; CHECK-NEXT:    ret void
 ;
-  %A = icmp slt i32 %V1, 0
-  call void(i1, ...) @llvm.experimental.guard( i1 %A, i32 123 )[ "deopt"() ]
+  %cmp1 = icmp slt i32 %V1, 0
+  call void(i1, ...) @llvm.experimental.guard( i1 %cmp1, i32 123 )[ "deopt"() ]
   %V2 = add i32 %a, %b
   %V3 = add i32 %V2, %c
   %V4 = add i32 %V3, %d
-  %B = icmp slt i32 %V4, 0
-  call void(i1, ...) @llvm.experimental.guard( i1 %B, i32 456 )[ "deopt"() ]
+  %cmp2 = icmp slt i32 %V4, 0
+  call void(i1, ...) @llvm.experimental.guard( i1 %cmp2, i32 456 )[ "deopt"() ]
   ret void
 }
 
diff --git a/llvm/test/Transforms/InstCombine/cttz.ll b/llvm/test/Transforms/InstCombine/cttz.ll
index 3595cff5f1ae..66b7a03fe5d7 100644
--- a/llvm/test/Transforms/InstCombine/cttz.ll
+++ b/llvm/test/Transforms/InstCombine/cttz.ll
@@ -215,3 +215,64 @@ define i32 @cttz_of_lowest_set_bit_wrong_intrinsic(i32 %x) {
   %tz = call i32 @llvm.ctlz.i32(i32 %and, i1 false)
   ret i32 %tz
 }
+
+define i32 @cttz_of_power_of_two(i32 %x) {
+; CHECK-LABEL: @cttz_of_power_of_two(
+; CHECK-NEXT:    [[R:%.*]] = sub i32 32, [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %lshr = lshr i32 -1, %x
+  %add = add i32 %lshr, 1
+  %r = call i32 @llvm.cttz.i32(i32 %add, i1 false)
+  ret i32 %r
+}
+
+define i32 @cttz_of_power_of_two_zero_poison(i32 %x) {
+; CHECK-LABEL: @cttz_of_power_of_two_zero_poison(
+; CHECK-NEXT:    [[R:%.*]] = sub i32 32, [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %lshr = lshr i32 -1, %x
+  %add = add i32 %lshr, 1
+  %r = call i32 @llvm.cttz.i32(i32 %add, i1 true)
+  ret i32 %r
+}
+
+define i32 @cttz_of_power_of_two_wrong_intrinsic(i32 %x) {
+; CHECK-LABEL: @cttz_of_power_of_two_wrong_intrinsic(
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 -1, [[X:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LSHR]], 1
+; CHECK-NEXT:    [[R:%.*]] = call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[ADD]], i1 false)
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %lshr = lshr i32 -1, %x
+  %add = add i32 %lshr, 1
+  %r = call i32 @llvm.ctlz.i32(i32 %add, i1 false)
+  ret i32 %r
+}
+
+define i32 @cttz_of_power_of_two_wrong_constant_1(i32 %x) {
+; CHECK-LABEL: @cttz_of_power_of_two_wrong_constant_1(
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 -2, [[X:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i32 [[LSHR]], 1
+; CHECK-NEXT:    [[R:%.*]] = call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[ADD]], i1 true)
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %lshr = lshr i32 -2, %x
+  %add = add i32 %lshr, 1
+  %r = call i32 @llvm.cttz.i32(i32 %add, i1 false)
+  ret i32 %r
+}
+
+define i32 @cttz_of_power_of_two_wrong_constant_2(i32 %x) {
+; CHECK-LABEL: @cttz_of_power_of_two_wrong_constant_2(
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 -1, [[X:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[LSHR]], -1
+; CHECK-NEXT:    [[R:%.*]] = call range(i32 1, 33) i32 @llvm.cttz.i32(i32 [[ADD]], i1 false)
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %lshr = lshr i32 -1, %x
+  %add = add i32 %lshr, -1
+  %r = call i32 @llvm.cttz.i32(i32 %add, i1 false)
+  ret i32 %r
+}
diff --git a/llvm/test/Transforms/InstCombine/debuginfo-sink.ll b/llvm/test/Transforms/InstCombine/debuginfo-sink.ll
index 0c7d30d70c36..311948262fac 100644
--- a/llvm/test/Transforms/InstCombine/debuginfo-sink.ll
+++ b/llvm/test/Transforms/InstCombine/debuginfo-sink.ll
@@ -54,33 +54,33 @@ sink2:
   ret i32 %extract
 }
 
-; This GEP is sunk, and has multiple debug uses in the same block. Check that
-; only the last use is cloned into the sunk block, and that both of the
-; original dbg.values are salvaged.
-;
-; CHECK-LABEL: define i32 @baz(ptr
-; CHECK:       call void @llvm.dbg.value(metadata ptr %a, metadata !{{[0-9]+}},
-; CHECK-SAME:  metadata !DIExpression(DW_OP_plus_uconst, 4, DW_OP_stack_value))
-; CHECK-NEXT:  call void @llvm.dbg.value(metadata ptr %a, metadata !{{[0-9]+}},
-; CHECK-SAME:  metadata !DIExpression(DW_OP_plus_uconst, 4, DW_OP_plus_uconst, 5, DW_OP_stack_value))
-; CHECK-NEXT:  br label %sink1
-
-define i32 @baz(ptr %a) !dbg !80 {
-entry:
-  %gep = getelementptr i32, ptr %a, i32 1
-  call void @llvm.dbg.value(metadata ptr %gep, metadata !83, metadata !12), !dbg !84
-  call void @llvm.dbg.value(metadata ptr %gep, metadata !83, metadata !DIExpression(DW_OP_plus_uconst, 5)), !dbg !85
-  br label %sink1
-
-sink1:
-; CHECK-LABEL: sink1:
-; CHECK:       call void @llvm.dbg.value(metadata ptr %gep,
-; CHECK-SAME:  metadata !{{[0-9]+}}, metadata !DIExpression(DW_OP_plus_uconst, 5))
-; CHECK-NEXT:  load
-  %0 = load i32, ptr %gep, align 4, !dbg !85
-  ret i32 %0, !dbg !85
-}
-
+; This GEP is sunk, and has multiple debug uses in the same block. Check that
+; only the last use is cloned into the sunk block, and that both of the
+; original dbg.values are salvaged.
+;
+; CHECK-LABEL: define i32 @baz(ptr
+; CHECK:       call void @llvm.dbg.value(metadata ptr %a, metadata !{{[0-9]+}},
+; CHECK-SAME:  metadata !DIExpression(DW_OP_plus_uconst, 4, DW_OP_stack_value))
+; CHECK-NEXT:  call void @llvm.dbg.value(metadata ptr %a, metadata !{{[0-9]+}},
+; CHECK-SAME:  metadata !DIExpression(DW_OP_plus_uconst, 4, DW_OP_plus_uconst, 5, DW_OP_stack_value))
+; CHECK-NEXT:  br label %sink1
+
+define i32 @baz(ptr %a) !dbg !80 {
+entry:
+  %gep = getelementptr i32, ptr %a, i32 1
+  call void @llvm.dbg.value(metadata ptr %gep, metadata !83, metadata !12), !dbg !84
+  call void @llvm.dbg.value(metadata ptr %gep, metadata !83, metadata !DIExpression(DW_OP_plus_uconst, 5)), !dbg !85
+  br label %sink1
+
+sink1:
+; CHECK-LABEL: sink1:
+; CHECK:       call void @llvm.dbg.value(metadata ptr %gep,
+; CHECK-SAME:  metadata !{{[0-9]+}}, metadata !DIExpression(DW_OP_plus_uconst, 5))
+; CHECK-NEXT:  load
+  %0 = load i32, ptr %gep, align 4, !dbg !85
+  ret i32 %0, !dbg !85
+}
+
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5}
 !llvm.ident = !{!6}
@@ -105,7 +105,7 @@ sink1:
 !72 = !{!10, !10, !10}
 !73 = !DILocalVariable(name: "k", scope: !70, file: !1, line: 2, type: !10)
 !74 = !DILocation(line: 5, column: 3, scope: !70)
-!80 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
-!83 = !DILocalVariable(name: "l", scope: !80, file: !1, line: 2, type: !10)
-!84 = !DILocation(line: 5, column: 3, scope: !80)
-!85 = !DILocation(line: 6, column: 3, scope: !80)
+!80 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!83 = !DILocalVariable(name: "l", scope: !80, file: !1, line: 2, type: !10)
+!84 = !DILocation(line: 5, column: 3, scope: !80)
+!85 = !DILocation(line: 6, column: 3, scope: !80)
diff --git a/llvm/test/Transforms/InstCombine/fast-math.ll b/llvm/test/Transforms/InstCombine/fast-math.ll
index 83f2091244e5..da403555ebe2 100644
--- a/llvm/test/Transforms/InstCombine/fast-math.ll
+++ b/llvm/test/Transforms/InstCombine/fast-math.ll
@@ -922,8 +922,8 @@ define float @test55(i1 %which, float %a) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = fadd float [[A:%.*]], 1.000000e+00
 ; CHECK-NEXT:    br label [[FINAL]]
 ; CHECK:       final:
-; CHECK-NEXT:    [[A:%.*]] = phi float [ 3.000000e+00, [[ENTRY:%.*]] ], [ [[TMP0]], [[DELAY]] ]
-; CHECK-NEXT:    ret float [[A]]
+; CHECK-NEXT:    [[PHI:%.*]] = phi float [ 3.000000e+00, [[ENTRY:%.*]] ], [ [[TMP0]], [[DELAY]] ]
+; CHECK-NEXT:    ret float [[PHI]]
 ;
 entry:
   br i1 %which, label %final, label %delay
@@ -932,7 +932,7 @@ delay:
   br label %final
 
 final:
-  %A = phi float [ 2.0, %entry ], [ %a, %delay ]
-  %value = fadd float %A, 1.0
+  %phi = phi float [ 2.0, %entry ], [ %a, %delay ]
+  %value = fadd float %phi, 1.0
   ret float %value
 }
diff --git a/llvm/test/Transforms/InstCombine/icmp-div-constant.ll b/llvm/test/Transforms/InstCombine/icmp-div-constant.ll
index b047715432d7..f667e1aa105d 100644
--- a/llvm/test/Transforms/InstCombine/icmp-div-constant.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-div-constant.ll
@@ -118,8 +118,8 @@ define i32 @icmp_div(i16 %a, i16 %c) {
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i16 [[A:%.*]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[C:%.*]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = sext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i16 [[C:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i1 [[CMP_NOT]] to i32
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[TMP0]], [[THEN]] ]
@@ -173,8 +173,8 @@ define i32 @icmp_div3(i16 %a, i16 %c) {
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i16 [[A:%.*]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[C:%.*]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = sext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i16 [[C:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i1 [[CMP_NOT]] to i32
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[TMP0]], [[THEN]] ]
@@ -381,8 +381,8 @@ define i1 @sdiv_eq_smin_use(i32 %x, i32 %y) {
 
 define i1 @sdiv_x_by_const_cmp_x(i32 %x) {
 ; CHECK-LABEL: @sdiv_x_by_const_cmp_x(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    ret i1 [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %v = sdiv i32 %x, 13
   %r = icmp eq i32 %v, %x
@@ -399,12 +399,33 @@ define i1 @udiv_x_by_const_cmp_x(i32 %x) {
   ret i1 %2
 }
 
+define <2 x i1> @udiv_x_by_const_cmp_x_non_splat(<2 x i32> %x) {
+; CHECK-LABEL: @udiv_x_by_const_cmp_x_non_splat(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <2 x i32> [[X:%.*]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[TMP1]]
+;
+  %1 = udiv <2 x i32> %x, <i32 123, i32 -123>
+  %2 = icmp slt <2 x i32> %1, %x
+  ret <2 x i1> %2
+}
+
+
+define <2 x i1> @sdiv_x_by_const_cmp_x_non_splat(<2 x i32> %x) {
+; CHECK-LABEL: @sdiv_x_by_const_cmp_x_non_splat(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i32> [[X:%.*]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[TMP1]]
+;
+  %1 = sdiv <2 x i32> %x, <i32 2, i32 3>
+  %2 = icmp eq <2 x i32> %1, %x
+  ret <2 x i1> %2
+}
+
 ; Same as above but with right shift instead of division (C != 0)
 
 define i1 @lshr_x_by_const_cmp_x(i32 %x) {
 ; CHECK-LABEL: @lshr_x_by_const_cmp_x(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    ret i1 [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %v = lshr i32 %x, 1
   %r = icmp eq i32 %v, %x
@@ -421,6 +442,28 @@ define <4 x i1> @lshr_by_const_cmp_sle_value(<4 x i32> %x) {
   ret <4 x i1> %r
 }
 
+define <4 x i1> @lshr_by_const_cmp_sle_value_non_splat(<4 x i32> %x) {
+; CHECK-LABEL: @lshr_by_const_cmp_sle_value_non_splat(
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt <4 x i32> [[X:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    ret <4 x i1> [[R]]
+;
+  %v = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 5>
+  %r = icmp sle <4 x i32> %v, %x
+  ret <4 x i1> %r
+}
+
+
+define <4 x i1> @ashr_by_const_cmp_sge_value_non_splat(<4 x i32> %x) {
+; CHECK-LABEL: @ashr_by_const_cmp_sge_value_non_splat(
+; CHECK-NEXT:    [[R:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    ret <4 x i1> [[R]]
+;
+  %v = ashr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
+  %r = icmp sge <4 x i32> %v, %x
+  ret <4 x i1> %r
+}
+
+
 define i1 @lshr_by_const_cmp_sge_value(i32 %x) {
 ; CHECK-LABEL: @lshr_by_const_cmp_sge_value(
 ; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[X:%.*]], 1
diff --git a/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll b/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
index a61694919ab0..a1757fbb84b2 100644
--- a/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
@@ -271,9 +271,7 @@ define i1 @icmp_trunc_x_zext_y_fail_multiuse(i32 %x, i8 %y) {
 
 define i1 @trunc_unsigned_nuw(i16 %x, i16 %y) {
 ; CHECK-LABEL: @trunc_unsigned_nuw(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
-; CHECK-NEXT:    [[YT:%.*]] = trunc nuw i16 [[Y:%.*]] to i8
-; CHECK-NEXT:    [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i16 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nuw i16 %x to i8
@@ -284,9 +282,7 @@ define i1 @trunc_unsigned_nuw(i16 %x, i16 %y) {
 
 define i1 @trunc_unsigned_nsw(i16 %x, i16 %y) {
 ; CHECK-LABEL: @trunc_unsigned_nsw(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i16 [[X:%.*]] to i8
-; CHECK-NEXT:    [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
-; CHECK-NEXT:    [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i16 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nsw i16 %x to i8
@@ -297,9 +293,7 @@ define i1 @trunc_unsigned_nsw(i16 %x, i16 %y) {
 
 define i1 @trunc_unsigned_both(i16 %x, i16 %y) {
 ; CHECK-LABEL: @trunc_unsigned_both(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nuw nsw i16 [[X:%.*]] to i8
-; CHECK-NEXT:    [[YT:%.*]] = trunc nuw nsw i16 [[Y:%.*]] to i8
-; CHECK-NEXT:    [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i16 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nuw nsw i16 %x to i8
@@ -336,9 +330,7 @@ define i1 @trunc_signed_nuw(i16 %x, i16 %y) {
 
 define i1 @trunc_signed_nsw(i16 %x, i16 %y) {
 ; CHECK-LABEL: @trunc_signed_nsw(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i16 [[X:%.*]] to i8
-; CHECK-NEXT:    [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
-; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 [[XT]], [[YT]]
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i16 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nsw i16 %x to i8
@@ -349,9 +341,7 @@ define i1 @trunc_signed_nsw(i16 %x, i16 %y) {
 
 define i1 @trunc_signed_both(i16 %x, i16 %y) {
 ; CHECK-LABEL: @trunc_signed_both(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nuw nsw i16 [[X:%.*]] to i8
-; CHECK-NEXT:    [[YT:%.*]] = trunc nuw nsw i16 [[Y:%.*]] to i8
-; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 [[XT]], [[YT]]
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i16 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nuw nsw i16 %x to i8
@@ -375,9 +365,7 @@ define i1 @trunc_signed_either(i16 %x, i16 %y) {
 
 define i1 @trunc_equality_nuw(i16 %x, i16 %y) {
 ; CHECK-LABEL: @trunc_equality_nuw(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
-; CHECK-NEXT:    [[YT:%.*]] = trunc nuw i16 [[Y:%.*]] to i8
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i16 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nuw i16 %x to i8
@@ -388,9 +376,7 @@ define i1 @trunc_equality_nuw(i16 %x, i16 %y) {
 
 define i1 @trunc_equality_nsw(i16 %x, i16 %y) {
 ; CHECK-LABEL: @trunc_equality_nsw(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i16 [[X:%.*]] to i8
-; CHECK-NEXT:    [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i16 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nsw i16 %x to i8
@@ -401,9 +387,7 @@ define i1 @trunc_equality_nsw(i16 %x, i16 %y) {
 
 define i1 @trunc_equality_both(i16 %x, i16 %y) {
 ; CHECK-LABEL: @trunc_equality_both(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nuw nsw i16 [[X:%.*]] to i8
-; CHECK-NEXT:    [[YT:%.*]] = trunc nuw nsw i16 [[Y:%.*]] to i8
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i16 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nuw nsw i16 %x to i8
@@ -427,9 +411,8 @@ define i1 @trunc_equality_either(i16 %x, i16 %y) {
 
 define i1 @trunc_unsigned_nuw_zext(i32 %x, i8 %y) {
 ; CHECK-LABEL: @trunc_unsigned_nuw_zext(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
-; CHECK-NEXT:    [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
-; CHECK-NEXT:    [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp ugt i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nuw i32 %x to i16
@@ -453,9 +436,8 @@ define i1 @trunc_unsigned_nuw_sext(i32 %x, i8 %y) {
 
 define i1 @trunc_unsigned_nsw_zext(i32 %x, i8 %y) {
 ; CHECK-LABEL: @trunc_unsigned_nsw_zext(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
-; CHECK-NEXT:    [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
-; CHECK-NEXT:    [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp ugt i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nsw i32 %x to i16
@@ -466,9 +448,8 @@ define i1 @trunc_unsigned_nsw_zext(i32 %x, i8 %y) {
 
 define i1 @trunc_unsigned_nsw_sext(i32 %x, i8 %y) {
 ; CHECK-LABEL: @trunc_unsigned_nsw_sext(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
-; CHECK-NEXT:    [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
-; CHECK-NEXT:    [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp ugt i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nsw i32 %x to i16
@@ -479,9 +460,8 @@ define i1 @trunc_unsigned_nsw_sext(i32 %x, i8 %y) {
 
 define i1 @trunc_signed_nsw_sext(i32 %x, i8 %y) {
 ; CHECK-LABEL: @trunc_signed_nsw_sext(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
-; CHECK-NEXT:    [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
-; CHECK-NEXT:    [[C:%.*]] = icmp slt i16 [[XT]], [[YE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nsw i32 %x to i16
@@ -492,9 +472,8 @@ define i1 @trunc_signed_nsw_sext(i32 %x, i8 %y) {
 
 define i1 @trunc_signed_nsw_zext(i32 %x, i8 %y) {
 ; CHECK-LABEL: @trunc_signed_nsw_zext(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
-; CHECK-NEXT:    [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
-; CHECK-NEXT:    [[C:%.*]] = icmp slt i16 [[XT]], [[YE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nsw i32 %x to i16
@@ -531,9 +510,8 @@ define i1 @trunc_signed_nuw_zext(i32 %x, i8 %y) {
 
 define i1 @trunc_equality_nuw_zext(i32 %x, i8 %y) {
 ; CHECK-LABEL: @trunc_equality_nuw_zext(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
-; CHECK-NEXT:    [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
-; CHECK-NEXT:    [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nuw i32 %x to i16
@@ -557,9 +535,8 @@ define i1 @trunc_equality_nuw_sext(i32 %x, i8 %y) {
 
 define i1 @trunc_equality_nsw_zext(i32 %x, i8 %y) {
 ; CHECK-LABEL: @trunc_equality_nsw_zext(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
-; CHECK-NEXT:    [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
-; CHECK-NEXT:    [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nsw i32 %x to i16
@@ -570,9 +547,8 @@ define i1 @trunc_equality_nsw_zext(i32 %x, i8 %y) {
 
 define i1 @trunc_equality_nsw_sext(i32 %x, i8 %y) {
 ; CHECK-LABEL: @trunc_equality_nsw_sext(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
-; CHECK-NEXT:    [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
-; CHECK-NEXT:    [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nsw i32 %x to i16
@@ -583,9 +559,8 @@ define i1 @trunc_equality_nsw_sext(i32 %x, i8 %y) {
 
 define i1 @trunc_equality_both_sext(i32 %x, i8 %y) {
 ; CHECK-LABEL: @trunc_equality_both_sext(
-; CHECK-NEXT:    [[XT:%.*]] = trunc nuw nsw i32 [[X:%.*]] to i16
-; CHECK-NEXT:    [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
-; CHECK-NEXT:    [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %xt = trunc nuw nsw i32 %x to i16
@@ -593,3 +568,77 @@ define i1 @trunc_equality_both_sext(i32 %x, i8 %y) {
   %c = icmp ne i16 %xt, %ye
   ret i1 %c
 }
+
+define i1 @test_eq1(i32 %x, i16 %y) {
+; CHECK-LABEL: @test_eq1(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %conv1 = trunc nsw i32 %x to i8
+  %conv2 = trunc nsw i16 %y to i8
+  %cond = icmp eq i8 %conv1, %conv2
+  ret i1 %cond
+}
+
+; FIXME: It is weird that we generate truncs for test_eq2, but not for test_eq1.
+
+define i1 @test_eq2(i32 %x, i16 %y) {
+; CHECK-LABEL: @test_eq2(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i16 [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %conv1 = trunc nsw i32 %x to i8
+  %conv2 = trunc nsw i16 %y to i8
+  %cond = icmp eq i8 %conv2, %conv1
+  ret i1 %cond
+}
+
+define i1 @test_ult(i32 %x, i16 %y) {
+; CHECK-LABEL: @test_ult(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %conv1 = trunc nsw i32 %x to i8
+  %conv2 = trunc nsw i16 %y to i8
+  %cond = icmp ult i8 %conv1, %conv2
+  ret i1 %cond
+}
+
+define i1 @test_slt(i32 %x, i16 %y) {
+; CHECK-LABEL: @test_slt(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %conv1 = trunc nsw i32 %x to i8
+  %conv2 = trunc nsw i16 %y to i8
+  %cond = icmp slt i8 %conv1, %conv2
+  ret i1 %cond
+}
+
+define i1 @test_ult_nuw(i32 %x, i16 %y) {
+; CHECK-LABEL: @test_ult_nuw(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %conv1 = trunc nuw nsw i32 %x to i8
+  %conv2 = trunc nuw nsw i16 %y to i8
+  %cond = icmp ult i8 %conv1, %conv2
+  ret i1 %cond
+}
+
+define i1 @test_slt_nuw(i32 %x, i16 %y) {
+; CHECK-LABEL: @test_slt_nuw(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+  %conv1 = trunc nuw nsw i32 %x to i8
+  %conv2 = trunc nuw nsw i16 %y to i8
+  %cond = icmp slt i8 %conv1, %conv2
+  ret i1 %cond
+}
diff --git a/llvm/test/Transforms/InstCombine/icmp-shr-lt-gt.ll b/llvm/test/Transforms/InstCombine/icmp-shr-lt-gt.ll
index 1b8efe4351c6..5f09964fd93a 100644
--- a/llvm/test/Transforms/InstCombine/icmp-shr-lt-gt.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-shr-lt-gt.ll
@@ -3379,7 +3379,7 @@ define i1 @ashrslt_01_01_exact(i4 %x) {
 
 define i1 @ashrslt_01_02_exact(i4 %x) {
 ; CHECK-LABEL: @ashrslt_01_02_exact(
-; CHECK-NEXT:    [[C:%.*]] = icmp slt i4 [[X:%.*]], 4
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i4 [[X:%.*]], 3
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %s = ashr exact i4 %x, 1
@@ -3389,7 +3389,7 @@ define i1 @ashrslt_01_02_exact(i4 %x) {
 
 define i1 @ashrslt_01_03_exact(i4 %x) {
 ; CHECK-LABEL: @ashrslt_01_03_exact(
-; CHECK-NEXT:    [[C:%.*]] = icmp slt i4 [[X:%.*]], 6
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i4 [[X:%.*]], 5
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %s = ashr exact i4 %x, 1
@@ -3800,3 +3800,62 @@ define i1 @ashrslt_03_15_exact(i4 %x) {
   ret i1 %c
 }
 
+define i1 @ashr_slt_exact_near_pow2_cmpval(i8 %x) {
+; CHECK-LABEL: @ashr_slt_exact_near_pow2_cmpval(
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 [[X:%.*]], 9
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %s = ashr exact i8 %x, 1
+  %c = icmp slt i8 %s, 5
+  ret i1 %c
+}
+
+define i1 @ashr_ult_exact_near_pow2_cmpval(i8 %x) {
+; CHECK-LABEL: @ashr_ult_exact_near_pow2_cmpval(
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i8 [[X:%.*]], 9
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %s = ashr exact i8 %x, 1
+  %c = icmp ult i8 %s, 5
+  ret i1 %c
+}
+
+define i1 @negtest_near_pow2_cmpval_ashr_slt_noexact(i8 %x) {
+; CHECK-LABEL: @negtest_near_pow2_cmpval_ashr_slt_noexact(
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 [[X:%.*]], 10
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %s = ashr i8 %x, 1
+  %c = icmp slt i8 %s, 5
+  ret i1 %c
+}
+
+define i1 @negtest_near_pow2_cmpval_ashr_wrong_cmp_pred(i8 %x) {
+; CHECK-LABEL: @negtest_near_pow2_cmpval_ashr_wrong_cmp_pred(
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[X:%.*]], 10
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %s = ashr exact i8 %x, 1
+  %c = icmp eq i8 %s, 5
+  ret i1 %c
+}
+
+define i1 @negtest_near_pow2_cmpval_isnt_close_to_pow2(i8 %x) {
+; CHECK-LABEL: @negtest_near_pow2_cmpval_isnt_close_to_pow2(
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 [[X:%.*]], 12
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %s = ashr exact i8 %x, 1
+  %c = icmp slt i8 %s, 6
+  ret i1 %c
+}
+
+define i1 @negtest_near_pow2_cmpval_would_overflow_into_signbit(i8 %x) {
+; CHECK-LABEL: @negtest_near_pow2_cmpval_would_overflow_into_signbit(
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i8 [[X:%.*]], -1
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %s = ashr exact i8 %x, 2
+  %c = icmp ult i8 %s, 33
+  ret i1 %c
+}
diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll
index 31093c7ca103..2d786c8f4883 100644
--- a/llvm/test/Transforms/InstCombine/icmp.ll
+++ b/llvm/test/Transforms/InstCombine/icmp.ll
@@ -5183,3 +5183,18 @@ entry:
   %cmp = icmp eq i8 %add2, %add1
   ret i1 %cmp
 }
+
+define i1 @icmp_freeze_sext(i16 %x, i16 %y) {
+; CHECK-LABEL: @icmp_freeze_sext(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i16 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP1_FR:%.*]] = freeze i1 [[CMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i16 [[Y]], 0
+; CHECK-NEXT:    [[CMP2:%.*]] = or i1 [[TMP1]], [[CMP1_FR]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp1 = icmp uge i16 %x, %y
+  %ext = sext i1 %cmp1 to i16
+  %ext.fr = freeze i16 %ext
+  %cmp2 = icmp uge i16 %ext.fr, %y
+  ret i1 %cmp2
+}
diff --git a/llvm/test/Transforms/InstCombine/lshr.ll b/llvm/test/Transforms/InstCombine/lshr.ll
index 7d611ba188d6..fa92c1c4b3be 100644
--- a/llvm/test/Transforms/InstCombine/lshr.ll
+++ b/llvm/test/Transforms/InstCombine/lshr.ll
@@ -163,6 +163,17 @@ define <2 x i8> @lshr_exact_splat_vec(<2 x i8> %x) {
   ret <2 x i8> %lshr
 }
 
+define <2 x i8> @lshr_exact_splat_vec_nuw(<2 x i8> %x) {
+; CHECK-LABEL: @lshr_exact_splat_vec_nuw(
+; CHECK-NEXT:    [[LSHR:%.*]] = add nuw <2 x i8> [[X:%.*]], <i8 1, i8 1>
+; CHECK-NEXT:    ret <2 x i8> [[LSHR]]
+;
+  %shl = shl nuw <2 x i8> %x, <i8 2, i8 2>
+  %add = add nuw <2 x i8> %shl, <i8 4, i8 4>
+  %lshr = lshr <2 x i8> %add, <i8 2, i8 2>
+  ret <2 x i8> %lshr
+}
+
 define i8 @shl_add(i8 %x, i8 %y) {
 ; CHECK-LABEL: @shl_add(
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 [[Y:%.*]], 2
@@ -360,8 +371,224 @@ define <3 x i14> @mul_splat_fold_vec(<3 x i14> %x) {
   ret <3 x i14> %t
 }
 
+define i32 @shl_add_lshr_flag_preservation(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_add_lshr_flag_preservation(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i32 [[Y:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = add nuw nsw i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nuw i32 %x, %c
+  %add = add nuw nsw i32 %shl, %y
+  %lshr = lshr exact i32 %add, %c
+  ret i32 %lshr
+}
+
+define i32 @shl_add_lshr(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_add_lshr(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[Y:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = add nuw i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nuw i32 %x, %c
+  %add = add nuw i32 %shl, %y
+  %lshr = lshr i32 %add, %c
+  ret i32 %lshr
+}
+
+define i32 @shl_add_lshr_comm(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_add_lshr_comm(
+; CHECK-NEXT:    [[Y2:%.*]] = mul i32 [[Y:%.*]], [[Y]]
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[Y2]], [[C:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = add nuw i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nuw i32 %x, %c
+  %y2 = mul i32 %y, %y ; thwart complexity-based canonicalization
+  %add = add nuw i32 %y2, %shl
+  %lshr = lshr i32 %add, %c
+  ret i32 %lshr
+}
+
 ; Negative test
 
+define i32 @shl_add_lshr_no_nuw(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_add_lshr_no_nuw(
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i32 [[X:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SHL]], [[Y:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 [[ADD]], [[C]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nuw i32 %x, %c
+  %add = add i32 %shl, %y
+  %lshr = lshr i32 %add, %c
+  ret i32 %lshr
+}
+
+; Negative test
+
+define i32 @shl_sub_lshr_not_exact(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_sub_lshr_not_exact(
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i32 [[X:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub nuw i32 [[SHL]], [[Y:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 [[SUB]], [[C]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nuw i32 %x, %c
+  %sub = sub nuw i32 %shl, %y
+  %lshr = lshr i32 %sub, %c
+  ret i32 %lshr
+}
+
+; Negative test
+
+define i32 @shl_sub_lshr_no_nuw(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_sub_lshr_no_nuw(
+; CHECK-NEXT:    [[SHL:%.*]] = shl nsw i32 [[X:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[SHL]], [[Y:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr exact i32 [[SUB]], [[C]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nsw i32 %x, %c
+  %sub = sub nsw i32 %shl, %y
+  %lshr = lshr exact i32 %sub, %c
+  ret i32 %lshr
+}
+
+define i32 @shl_sub_lshr(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_sub_lshr(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i32 [[Y:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = sub nuw nsw i32 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nuw i32 %x, %c
+  %sub = sub nuw nsw i32 %shl, %y
+  %lshr = lshr exact i32 %sub, %c
+  ret i32 %lshr
+}
+
+define i32 @shl_or_lshr(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_or_lshr(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[Y:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = or i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nuw i32 %x, %c
+  %or = or i32 %shl, %y
+  %lshr = lshr i32 %or, %c
+  ret i32 %lshr
+}
+
+define i32 @shl_or_disjoint_lshr(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_or_disjoint_lshr(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[Y:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = or disjoint i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nuw i32 %x, %c
+  %or = or disjoint i32 %shl, %y
+  %lshr = lshr i32 %or, %c
+  ret i32 %lshr
+}
+
+define i32 @shl_or_lshr_comm(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_or_lshr_comm(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[Y:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = or i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nuw i32 %x, %c
+  %or = or i32 %y, %shl
+  %lshr = lshr i32 %or, %c
+  ret i32 %lshr
+}
+
+define i32 @shl_or_disjoint_lshr_comm(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_or_disjoint_lshr_comm(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[Y:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = or disjoint i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nuw i32 %x, %c
+  %or = or disjoint i32 %y, %shl
+  %lshr = lshr i32 %or, %c
+  ret i32 %lshr
+}
+
+define i32 @shl_xor_lshr(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_xor_lshr(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[Y:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = xor i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nuw i32 %x, %c
+  %xor = xor i32 %shl, %y
+  %lshr = lshr i32 %xor, %c
+  ret i32 %lshr
+}
+
+define i32 @shl_xor_lshr_comm(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_xor_lshr_comm(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[Y:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = xor i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nuw i32 %x, %c
+  %xor = xor i32 %y, %shl
+  %lshr = lshr i32 %xor, %c
+  ret i32 %lshr
+}
+
+define i32 @shl_and_lshr(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_and_lshr(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[Y:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = and i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nuw i32 %x, %c
+  %and = and i32 %shl, %y
+  %lshr = lshr i32 %and, %c
+  ret i32 %lshr
+}
+
+define i32 @shl_and_lshr_comm(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_and_lshr_comm(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[Y:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = and i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %shl = shl nuw i32 %x, %c
+  %and = and i32 %y, %shl
+  %lshr = lshr i32 %and, %c
+  ret i32 %lshr
+}
+
+define i32 @shl_lshr_and_exact(i32 %x, i32 %c, i32 %y) {
+; CHECK-LABEL: @shl_lshr_and_exact(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[Y:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %2 = shl nuw i32 %x, %c
+  %3 = and i32 %2, %y
+  %4 = lshr exact i32 %3, %c
+  ret i32 %4
+}
+
+; Negative test
+
+define i32 @shl_add_lshr_neg(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @shl_add_lshr_neg(
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[SHL]], [[Z:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = lshr exact i32 [[ADD]], [[Z]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shl = shl nuw i32 %x, %y
+  %add = add nuw nsw i32 %shl, %z
+  %res = lshr exact i32 %add, %z
+  ret i32 %res
+}
+
 define i32 @mul_splat_fold_wrong_mul_const(i32 %x) {
 ; CHECK-LABEL: @mul_splat_fold_wrong_mul_const(
 ; CHECK-NEXT:    [[M:%.*]] = mul nuw i32 [[X:%.*]], 65538
@@ -375,6 +602,21 @@ define i32 @mul_splat_fold_wrong_mul_const(i32 %x) {
 
 ; Negative test
 
+define i32 @shl_add_lshr_multiuse(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @shl_add_lshr_multiuse(
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[SHL]], [[Z:%.*]]
+; CHECK-NEXT:    call void @use(i32 [[ADD]])
+; CHECK-NEXT:    [[RES:%.*]] = lshr exact i32 [[ADD]], [[Z]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shl = shl nuw i32 %x, %y
+  %add = add nuw nsw i32 %shl, %z
+  call void @use (i32 %add)
+  %res = lshr exact i32 %add, %z
+  ret i32 %res
+}
+
 define i32 @mul_splat_fold_wrong_lshr_const(i32 %x) {
 ; CHECK-LABEL: @mul_splat_fold_wrong_lshr_const(
 ; CHECK-NEXT:    [[M:%.*]] = mul nuw i32 [[X:%.*]], 65537
diff --git a/llvm/test/Transforms/InstCombine/memcpy-addrspace.ll b/llvm/test/Transforms/InstCombine/memcpy-addrspace.ll
index d1543696bfc0..2ec3994f30fc 100644
--- a/llvm/test/Transforms/InstCombine/memcpy-addrspace.ll
+++ b/llvm/test/Transforms/InstCombine/memcpy-addrspace.ll
@@ -6,7 +6,7 @@
 define void @test_load(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @test_load(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr [8 x i32], ptr addrspace(2) @test.data, i64 0, i64 [[X:%.*]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x i32], ptr addrspace(2) @test.data, i64 0, i64 [[X:%.*]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(2) [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT:%.*]], i64 [[X]]
 ; CHECK-NEXT:    store i32 [[TMP0]], ptr addrspace(1) [[ARRAYIDX1]], align 4
@@ -25,7 +25,7 @@ entry:
 define void @test_load_bitcast_chain(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @test_load_bitcast_chain(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i32, ptr addrspace(2) @test.data, i64 [[X:%.*]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(2) @test.data, i64 [[X:%.*]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(2) [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT:%.*]], i64 [[X]]
 ; CHECK-NEXT:    store i32 [[TMP0]], ptr addrspace(1) [[ARRAYIDX1]], align 4
diff --git a/llvm/test/Transforms/InstCombine/or-xor.ll b/llvm/test/Transforms/InstCombine/or-xor.ll
index 0a322d6aa023..cf6b9000182d 100644
--- a/llvm/test/Transforms/InstCombine/or-xor.ll
+++ b/llvm/test/Transforms/InstCombine/or-xor.ll
@@ -7,8 +7,8 @@ declare void @use(i8)
 
 define i32 @test1(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[Y_NOT:%.*]] = xor i32 [[Y:%.*]], -1
-; CHECK-NEXT:    [[Z:%.*]] = or i32 [[Y_NOT]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[Y:%.*]], -1
+; CHECK-NEXT:    [[Z:%.*]] = or i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i32 [[Z]]
 ;
   %or = or i32 %x, %y
@@ -22,8 +22,8 @@ define i32 @test1(i32 %x, i32 %y) {
 
 define i32 @test2(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[X_NOT:%.*]] = xor i32 [[X:%.*]], -1
-; CHECK-NEXT:    [[Z:%.*]] = or i32 [[X_NOT]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[X:%.*]], -1
+; CHECK-NEXT:    [[Z:%.*]] = or i32 [[TMP1]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i32 [[Z]]
 ;
   %or = or i32 %x, %y
@@ -36,8 +36,8 @@ define i32 @test2(i32 %x, i32 %y) {
 
 define i32 @test3(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test3(
-; CHECK-NEXT:    [[Y_NOT:%.*]] = xor i32 [[Y:%.*]], -1
-; CHECK-NEXT:    [[Z:%.*]] = or i32 [[Y_NOT]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[Y:%.*]], -1
+; CHECK-NEXT:    [[Z:%.*]] = or i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i32 [[Z]]
 ;
   %xor = xor i32 %x, %y
@@ -51,8 +51,8 @@ define i32 @test3(i32 %x, i32 %y) {
 
 define i32 @test4(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[X_NOT:%.*]] = xor i32 [[X:%.*]], -1
-; CHECK-NEXT:    [[Z:%.*]] = or i32 [[X_NOT]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[X:%.*]], -1
+; CHECK-NEXT:    [[Z:%.*]] = or i32 [[TMP1]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i32 [[Z]]
 ;
   %xor = xor i32 %x, %y
@@ -205,8 +205,8 @@ define i8 @xor_common_op_commute3(i8 %p, i8 %q) {
 
 define i32 @test8(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test8(
-; CHECK-NEXT:    [[X_NOT:%.*]] = xor i32 [[X:%.*]], -1
-; CHECK-NEXT:    [[Z:%.*]] = or i32 [[X_NOT]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[X:%.*]], -1
+; CHECK-NEXT:    [[Z:%.*]] = or i32 [[TMP1]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i32 [[Z]]
 ;
   %not = xor i32 %y, -1
@@ -217,8 +217,8 @@ define i32 @test8(i32 %x, i32 %y) {
 
 define i32 @test9(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test9(
-; CHECK-NEXT:    [[Y_NOT:%.*]] = xor i32 [[Y:%.*]], -1
-; CHECK-NEXT:    [[Z:%.*]] = or i32 [[Y_NOT]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[Y:%.*]], -1
+; CHECK-NEXT:    [[Z:%.*]] = or i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i32 [[Z]]
 ;
   %not = xor i32 %x, -1
@@ -1097,8 +1097,8 @@ define i32 @PR75692_3(i32 %x, i32 %y) {
 
 define i32 @or_xor_not(i32 %x, i32 %y) {
 ; CHECK-LABEL: @or_xor_not(
-; CHECK-NEXT:    [[X_NOT:%.*]] = xor i32 [[X:%.*]], -1
-; CHECK-NEXT:    [[OR1:%.*]] = or i32 [[X_NOT]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[X:%.*]], -1
+; CHECK-NEXT:    [[OR1:%.*]] = or i32 [[TMP1]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i32 [[OR1]]
 ;
   %not = xor i32 %y, -1
@@ -1140,8 +1140,8 @@ define i32 @or_xor_not_uses2(i32 %x, i32 %y) {
 define i32 @or_xor_and_commuted1(i32 %x, i32 %y) {
 ; CHECK-LABEL: @or_xor_and_commuted1(
 ; CHECK-NEXT:    [[YY:%.*]] = mul i32 [[Y:%.*]], [[Y]]
-; CHECK-NEXT:    [[X_NOT:%.*]] = xor i32 [[X:%.*]], -1
-; CHECK-NEXT:    [[OR1:%.*]] = or i32 [[YY]], [[X_NOT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[X:%.*]], -1
+; CHECK-NEXT:    [[OR1:%.*]] = or i32 [[YY]], [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[OR1]]
 ;
   %yy = mul i32 %y, %y ; thwart complexity-based ordering
@@ -1155,8 +1155,8 @@ define i32 @or_xor_and_commuted2(i32 %x, i32 %y) {
 ; CHECK-LABEL: @or_xor_and_commuted2(
 ; CHECK-NEXT:    [[YY:%.*]] = mul i32 [[Y:%.*]], [[Y]]
 ; CHECK-NEXT:    [[XX:%.*]] = mul i32 [[X:%.*]], [[X]]
-; CHECK-NEXT:    [[XX_NOT:%.*]] = xor i32 [[XX]], -1
-; CHECK-NEXT:    [[OR1:%.*]] = or i32 [[YY]], [[XX_NOT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[XX]], -1
+; CHECK-NEXT:    [[OR1:%.*]] = or i32 [[YY]], [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[OR1]]
 ;
   %yy = mul i32 %y, %y ; thwart complexity-based ordering
@@ -1166,3 +1166,309 @@ define i32 @or_xor_and_commuted2(i32 %x, i32 %y) {
   %or1 = or i32 %xor, %yy
   ret i32 %or1
 }
+
+; (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C and commuted variants.
+
+define i32 @or_xor_tree_0000(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_0000(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %a, %b
+  %xor2 = xor i32 %b, %c
+  %xor3 = xor i32 %xor2, %a
+  %or = or i32 %xor1, %xor3
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_0001(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_0001(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[B]], [[A]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %b, %a
+  %xor2 = xor i32 %b, %c
+  %xor3 = xor i32 %xor2, %a
+  %or = or i32 %xor1, %xor3
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_0010(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_0010(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %a, %b
+  %xor2 = xor i32 %c, %b
+  %xor3 = xor i32 %xor2, %a
+  %or = or i32 %xor1, %xor3
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_0011(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_0011(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[B]], [[A]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %b, %a
+  %xor2 = xor i32 %c, %b
+  %xor3 = xor i32 %xor2, %a
+  %or = or i32 %xor1, %xor3
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_0100(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_0100(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %a, %b
+  %xor2 = xor i32 %b, %c
+  %xor3 = xor i32 %a, %xor2
+  %or = or i32 %xor1, %xor3
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_0101(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_0101(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[B]], [[A]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %b, %a
+  %xor2 = xor i32 %b, %c
+  %xor3 = xor i32 %a, %xor2
+  %or = or i32 %xor1, %xor3
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_0110(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_0110(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %a, %b
+  %xor2 = xor i32 %c, %b
+  %xor3 = xor i32 %a, %xor2
+  %or = or i32 %xor1, %xor3
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_0111(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_0111(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[B]], [[A]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %b, %a
+  %xor2 = xor i32 %c, %b
+  %xor3 = xor i32 %a, %xor2
+  %or = or i32 %xor1, %xor3
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_1000(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_1000(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %a, %b
+  %xor2 = xor i32 %b, %c
+  %xor3 = xor i32 %xor2, %a
+  %or = or i32 %xor3, %xor1
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_1001(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_1001(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[B]], [[A]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %b, %a
+  %xor2 = xor i32 %b, %c
+  %xor3 = xor i32 %xor2, %a
+  %or = or i32 %xor3, %xor1
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_1010(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_1010(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %a, %b
+  %xor2 = xor i32 %c, %b
+  %xor3 = xor i32 %xor2, %a
+  %or = or i32 %xor3, %xor1
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_1011(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_1011(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[B]], [[A]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %b, %a
+  %xor2 = xor i32 %c, %b
+  %xor3 = xor i32 %xor2, %a
+  %or = or i32 %xor3, %xor1
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_1100(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_1100(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %a, %b
+  %xor2 = xor i32 %b, %c
+  %xor3 = xor i32 %a, %xor2
+  %or = or i32 %xor3, %xor1
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_1101(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_1101(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[B]], [[A]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %b, %a
+  %xor2 = xor i32 %b, %c
+  %xor3 = xor i32 %a, %xor2
+  %or = or i32 %xor3, %xor1
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_1110(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_1110(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %a, %b
+  %xor2 = xor i32 %c, %b
+  %xor3 = xor i32 %a, %xor2
+  %or = or i32 %xor3, %xor1
+  ret i32 %or
+}
+
+define i32 @or_xor_tree_1111(i32 %ax, i32 %bx, i32 %cx) {
+; CHECK-LABEL: @or_xor_tree_1111(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[AX:%.*]], 42
+; CHECK-NEXT:    [[B:%.*]] = mul i32 [[BX:%.*]], 42
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[CX:%.*]], 42
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i32 [[B]], [[A]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR1]], [[C]]
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %a = mul i32 %ax, 42
+  %b = mul i32 %bx, 42
+  %c = mul i32 %cx, 42
+  %xor1 = xor i32 %b, %a
+  %xor2 = xor i32 %c, %b
+  %xor3 = xor i32 %a, %xor2
+  %or = or i32 %xor3, %xor1
+  ret i32 %or
+}
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index 2efe2742ca49..2ade6faa99be 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -4580,3 +4580,35 @@ define i32 @sequence_select_with_same_cond_extra_use(i1 %c1, i1 %c2){
   %s3 = select i1 %c1, i32 789, i32 %s2
   ret i32 %s3
 }
+
+define i8 @test_replace_freeze_multiuse(i1 %x, i8 %y) {
+; CHECK-LABEL: @test_replace_freeze_multiuse(
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[X:%.*]] to i8
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i8 [[EXT]], [[Y:%.*]]
+; CHECK-NEXT:    [[SHL_FR:%.*]] = freeze i8 [[SHL]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[X]], i8 0, i8 [[SHL_FR]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[SHL_FR]], [[SEL]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %ext = zext i1 %x to i8
+  %shl = shl nuw i8 %ext, %y
+  %shl.fr = freeze i8 %shl
+  %sel = select i1 %x, i8 0, i8 %shl.fr
+  %add = add i8 %shl.fr, %sel
+  ret i8 %add
+}
+
+define i8 @test_replace_freeze_oneuse(i1 %x, i8 %y) {
+; CHECK-LABEL: @test_replace_freeze_oneuse(
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[X:%.*]] to i8
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i8 [[EXT]], [[Y:%.*]]
+; CHECK-NEXT:    [[SHL_FR:%.*]] = freeze i8 [[SHL]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[X]], i8 0, i8 [[SHL_FR]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %ext = zext i1 %x to i8
+  %shl = shl nuw i8 %ext, %y
+  %shl.fr = freeze i8 %shl
+  %sel = select i1 %x, i8 0, i8 %shl.fr
+  ret i8 %sel
+}
diff --git a/llvm/test/Transforms/InstCombine/simplify-libcalls-new.ll b/llvm/test/Transforms/InstCombine/simplify-libcalls-new.ll
index 51debdf6643e..ecfafbc69797 100644
--- a/llvm/test/Transforms/InstCombine/simplify-libcalls-new.ll
+++ b/llvm/test/Transforms/InstCombine/simplify-libcalls-new.ll
@@ -1,13 +1,19 @@
 ;; Test behavior of -optimize-hot-cold-new and related options.
 
 ;; Check that we don't get hot/cold new calls without enabling it explicitly.
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s --implicit-check-not=hot_cold_t
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s --check-prefix=OFF
+; OFF-NOT: hot_cold_t
+; OFF-LABEL: @new_hot_cold()
 
 ;; First check with the default cold and hot hint values (255 = -2).
-; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -S | FileCheck %s --check-prefix=HOTCOLD -DCOLD=1 -DHOT=-2
+; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -S | FileCheck %s --check-prefix=HOTCOLD -DCOLD=1 -DHOT=-2 -DPREVHINTCOLD=7 -DPREVHINTNOTCOLD=7 -DPREVHINTHOT=7
 
 ;; Next check with the non-default cold and hot hint values (200 =-56).
-; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -cold-new-hint-value=5 -hot-new-hint-value=200 -S | FileCheck %s --check-prefix=HOTCOLD -DCOLD=5 -DHOT=-56
+; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -cold-new-hint-value=5 -hot-new-hint-value=200 -S | FileCheck %s --check-prefix=HOTCOLD -DCOLD=5 -DHOT=-56 -DPREVHINTCOLD=7 -DPREVHINTNOTCOLD=7 -DPREVHINTHOT=7
+
+;; Try again with the non-default cold and hot hint values (200 =-56), and this
+;; time specify that existing hints should be updated.
+; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -cold-new-hint-value=5 -notcold-new-hint-value=100 -hot-new-hint-value=200 -optimize-existing-hot-cold-new -S | FileCheck %s --check-prefix=HOTCOLD -DCOLD=5 -DHOT=-56 -DPREVHINTCOLD=5 -DPREVHINTNOTCOLD=100 -DPREVHINTHOT=-56
 
 ;; Make sure that values not in 0..255 are flagged with an error
 ; RUN: not opt < %s -passes=instcombine -optimize-hot-cold-new -cold-new-hint-value=256 -S 2>&1 | FileCheck %s --check-prefix=ERROR
@@ -178,6 +184,162 @@ define void @array_new_align_nothrow() {
   ret void
 }
 
+;; Check that operator new(unsigned long, __hot_cold_t)
+;; optionally has its hint updated.
+; HOTCOLD-LABEL: @new_hot_cold()
+define void @new_hot_cold() {
+  ;; Attribute cold converted to __hot_cold_t cold value.
+  ; HOTCOLD: @_Znwm12__hot_cold_t(i64 10, i8 [[PREVHINTCOLD]])
+  %call = call ptr @_Znwm12__hot_cold_t(i64 10, i8 7) #0
+  call void @dummy(ptr %call)
+  ;; Attribute notcold converted to __hot_cold_t notcold value.
+  ; HOTCOLD: @_Znwm12__hot_cold_t(i64 10, i8 [[PREVHINTNOTCOLD]])
+  %call1 = call ptr @_Znwm12__hot_cold_t(i64 10, i8 7) #1
+  call void @dummy(ptr %call1)
+  ;; Attribute hot converted to __hot_cold_t hot value.
+  ; HOTCOLD: @_Znwm12__hot_cold_t(i64 10, i8 [[PREVHINTHOT]])
+  %call2 = call ptr @_Znwm12__hot_cold_t(i64 10, i8 7) #2
+  call void @dummy(ptr %call2)
+  ret void
+}
+
+;; Check that operator new(unsigned long, std::align_val_t, __hot_cold_t)
+;; optionally has its hint updated.
+; HOTCOLD-LABEL: @new_align_hot_cold()
+define void @new_align_hot_cold() {
+  ;; Attribute cold converted to __hot_cold_t cold value.
+  ; HOTCOLD: @_ZnwmSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 [[PREVHINTCOLD]])
+  %call = call ptr @_ZnwmSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 7) #0
+  call void @dummy(ptr %call)
+  ;; Attribute notcold converted to __hot_cold_t notcold value.
+  ; HOTCOLD: @_ZnwmSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 [[PREVHINTNOTCOLD]])
+  %call1 = call ptr @_ZnwmSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 7) #1
+  call void @dummy(ptr %call1)
+  ;; Attribute hot converted to __hot_cold_t hot value.
+  ; HOTCOLD: @_ZnwmSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 [[PREVHINTHOT]])
+  %call2 = call ptr @_ZnwmSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 7) #2
+  call void @dummy(ptr %call2)
+  ret void
+}
+
+;; Check that operator new(unsigned long, const std::nothrow_t&, __hot_cold_t)
+;; optionally has its hint updated.
+; HOTCOLD-LABEL: @new_nothrow_hot_cold()
+define void @new_nothrow_hot_cold() {
+  %nt = alloca i8
+  ;; Attribute cold converted to __hot_cold_t cold value.
+  ; HOTCOLD: @_ZnwmRKSt9nothrow_t12__hot_cold_t(i64 10, ptr nonnull %nt, i8 [[PREVHINTCOLD]])
+  %call = call ptr @_ZnwmRKSt9nothrow_t12__hot_cold_t(i64 10, ptr %nt, i8 7) #0
+  call void @dummy(ptr %call)
+  ;; Attribute notcold converted to __hot_cold_t notcold value.
+  ; HOTCOLD: @_ZnwmRKSt9nothrow_t12__hot_cold_t(i64 10, ptr nonnull %nt, i8 [[PREVHINTNOTCOLD]])
+  %call1 = call ptr @_ZnwmRKSt9nothrow_t12__hot_cold_t(i64 10, ptr %nt, i8 7) #1
+  call void @dummy(ptr %call1)
+  ;; Attribute hot converted to __hot_cold_t hot value.
+  ; HOTCOLD: @_ZnwmRKSt9nothrow_t12__hot_cold_t(i64 10, ptr nonnull %nt, i8 [[PREVHINTHOT]])
+  %call2 = call ptr @_ZnwmRKSt9nothrow_t12__hot_cold_t(i64 10, ptr %nt, i8 7) #2
+  call void @dummy(ptr %call2)
+  ret void
+}
+
+;; Check that operator new(unsigned long, std::align_val_t, const std::nothrow_t&, __hot_cold_t)
+;; optionally has its hint updated.
+; HOTCOLD-LABEL: @new_align_nothrow_hot_cold()
+define void @new_align_nothrow_hot_cold() {
+  %nt = alloca i8
+  ;; Attribute cold converted to __hot_cold_t cold value.
+  ; HOTCOLD: @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr nonnull %nt, i8 [[PREVHINTCOLD]])
+  %call = call ptr @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr %nt, i8 7) #0
+  call void @dummy(ptr %call)
+  ;; Attribute notcold converted to __hot_cold_t notcold value.
+  ; HOTCOLD: @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr nonnull %nt, i8 [[PREVHINTNOTCOLD]])
+  %call1 = call ptr @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr %nt, i8 7) #1
+  call void @dummy(ptr %call1)
+  ;; Attribute hot converted to __hot_cold_t hot value.
+  ; HOTCOLD: @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr nonnull %nt, i8 [[PREVHINTHOT]])
+  %call2 = call ptr @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr %nt, i8 7) #2
+  call void @dummy(ptr %call2)
+  ret void
+}
+
+;; Check that operator new[](unsigned long, __hot_cold_t)
+;; optionally has its hint updated.
+; HOTCOLD-LABEL: @array_new_hot_cold()
+define void @array_new_hot_cold() {
+  ;; Attribute cold converted to __hot_cold_t cold value.
+  ; HOTCOLD: @_Znam12__hot_cold_t(i64 10, i8 [[PREVHINTCOLD]])
+  %call = call ptr @_Znam12__hot_cold_t(i64 10, i8 7) #0
+  call void @dummy(ptr %call)
+  ;; Attribute notcold converted to __hot_cold_t notcold value.
+  ; HOTCOLD: @_Znam12__hot_cold_t(i64 10, i8 [[PREVHINTNOTCOLD]])
+  %call1 = call ptr @_Znam12__hot_cold_t(i64 10, i8 7) #1
+  call void @dummy(ptr %call1)
+  ;; Attribute hot converted to __hot_cold_t hot value.
+  ; HOTCOLD: @_Znam12__hot_cold_t(i64 10, i8 [[PREVHINTHOT]])
+  %call2 = call ptr @_Znam12__hot_cold_t(i64 10, i8 7) #2
+  call void @dummy(ptr %call2)
+  ret void
+}
+
+;; Check that operator new[](unsigned long, std::align_val_t, __hot_cold_t)
+;; optionally has its hint updated.
+; HOTCOLD-LABEL: @array_new_align_hot_cold()
+define void @array_new_align_hot_cold() {
+  ;; Attribute cold converted to __hot_cold_t cold value.
+  ; HOTCOLD: @_ZnamSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 [[PREVHINTCOLD]])
+  %call = call ptr @_ZnamSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 7) #0
+  call void @dummy(ptr %call)
+  ;; Attribute notcold converted to __hot_cold_t notcold value.
+  ; HOTCOLD: @_ZnamSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 [[PREVHINTNOTCOLD]])
+  %call1 = call ptr @_ZnamSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 7) #1
+  call void @dummy(ptr %call1)
+  ;; Attribute hot converted to __hot_cold_t hot value.
+  ; HOTCOLD: @_ZnamSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 [[PREVHINTHOT]])
+  %call2 = call ptr @_ZnamSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 7) #2
+  call void @dummy(ptr %call2)
+  ret void
+}
+
+;; Check that operator new[](unsigned long, const std::nothrow_t&, __hot_cold_t)
+;; optionally has its hint updated.
+; HOTCOLD-LABEL: @array_new_nothrow_hot_cold()
+define void @array_new_nothrow_hot_cold() {
+  %nt = alloca i8
+  ;; Attribute cold converted to __hot_cold_t cold value.
+  ; HOTCOLD: @_ZnamRKSt9nothrow_t12__hot_cold_t(i64 10, ptr nonnull %nt, i8 [[PREVHINTCOLD]])
+  %call = call ptr @_ZnamRKSt9nothrow_t12__hot_cold_t(i64 10, ptr %nt, i8 7) #0
+  call void @dummy(ptr %call)
+  ;; Attribute notcold converted to __hot_cold_t notcold value.
+  ; HOTCOLD: @_ZnamRKSt9nothrow_t12__hot_cold_t(i64 10, ptr nonnull %nt, i8 [[PREVHINTNOTCOLD]])
+  %call1 = call ptr @_ZnamRKSt9nothrow_t12__hot_cold_t(i64 10, ptr %nt, i8 7) #1
+  call void @dummy(ptr %call1)
+  ;; Attribute hot converted to __hot_cold_t hot value.
+  ; HOTCOLD: @_ZnamRKSt9nothrow_t12__hot_cold_t(i64 10, ptr nonnull %nt, i8 [[PREVHINTHOT]])
+  %call2 = call ptr @_ZnamRKSt9nothrow_t12__hot_cold_t(i64 10, ptr %nt, i8 7) #2
+  call void @dummy(ptr %call2)
+  ret void
+}
+
+;; Check that operator new[](unsigned long, std::align_val_t, const std::nothrow_t&, __hot_cold_t)
+;; optionally has its hint updated.
+; HOTCOLD-LABEL: @array_new_align_nothrow_hot_cold()
+define void @array_new_align_nothrow_hot_cold() {
+  %nt = alloca i8
+  ;; Attribute cold converted to __hot_cold_t cold value.
+  ; HOTCOLD: @_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr nonnull %nt, i8 [[PREVHINTCOLD]])
+  %call = call ptr @_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr %nt, i8 7) #0
+  call void @dummy(ptr %call)
+  ;; Attribute notcold converted to __hot_cold_t notcold value.
+  ; HOTCOLD: @_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr nonnull %nt, i8 [[PREVHINTNOTCOLD]])
+  %call1 = call ptr @_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr %nt, i8 7) #1
+  call void @dummy(ptr %call1)
+  ;; Attribute hot converted to __hot_cold_t hot value.
+  ; HOTCOLD: @_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr nonnull %nt, i8 [[PREVHINTHOT]])
+  %call2 = call ptr @_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr %nt, i8 7) #2
+  call void @dummy(ptr %call2)
+  ret void
+}
+
 ;; So that instcombine doesn't optimize out the call.
 declare void @dummy(ptr)
 
@@ -189,6 +351,14 @@ declare ptr @_Znam(i64)
 declare ptr @_ZnamSt11align_val_t(i64, i64)
 declare ptr @_ZnamRKSt9nothrow_t(i64, ptr)
 declare ptr @_ZnamSt11align_val_tRKSt9nothrow_t(i64, i64, ptr)
+declare ptr @_Znwm12__hot_cold_t(i64, i8)
+declare ptr @_ZnwmSt11align_val_t12__hot_cold_t(i64, i64, i8)
+declare ptr @_ZnwmRKSt9nothrow_t12__hot_cold_t(i64, ptr, i8)
+declare ptr @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64, i64, ptr, i8)
+declare ptr @_Znam12__hot_cold_t(i64, i8)
+declare ptr @_ZnamSt11align_val_t12__hot_cold_t(i64, i64, i8)
+declare ptr @_ZnamRKSt9nothrow_t12__hot_cold_t(i64, ptr, i8)
+declare ptr @_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64, i64, ptr, i8)
 
 attributes #0 = { builtin allocsize(0) "memprof"="cold" }
 attributes #1 = { builtin allocsize(0) "memprof"="notcold" }
diff --git a/llvm/test/Transforms/InstCombine/trunc-binop-ext.ll b/llvm/test/Transforms/InstCombine/trunc-binop-ext.ll
index 787df081eef2..e3103906911a 100644
--- a/llvm/test/Transforms/InstCombine/trunc-binop-ext.ll
+++ b/llvm/test/Transforms/InstCombine/trunc-binop-ext.ll
@@ -1,9 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s
 
 define i16 @narrow_sext_and(i16 %x16, i32 %y32) {
-; CHECK-LABEL: @narrow_sext_and(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %y32 to i16
-; CHECK-NEXT:    [[R:%.*]] = and i16 [[TMP1]], %x16
+; CHECK-LABEL: define i16 @narrow_sext_and(
+; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[Y32]] to i16
+; CHECK-NEXT:    [[R:%.*]] = and i16 [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret i16 [[R]]
 ;
   %x32 = sext i16 %x16 to i32
@@ -13,9 +15,10 @@ define i16 @narrow_sext_and(i16 %x16, i32 %y32) {
 }
 
 define i16 @narrow_zext_and(i16 %x16, i32 %y32) {
-; CHECK-LABEL: @narrow_zext_and(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %y32 to i16
-; CHECK-NEXT:    [[R:%.*]] = and i16 [[TMP1]], %x16
+; CHECK-LABEL: define i16 @narrow_zext_and(
+; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[Y32]] to i16
+; CHECK-NEXT:    [[R:%.*]] = and i16 [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret i16 [[R]]
 ;
   %x32 = zext i16 %x16 to i32
@@ -25,9 +28,10 @@ define i16 @narrow_zext_and(i16 %x16, i32 %y32) {
 }
 
 define i16 @narrow_sext_or(i16 %x16, i32 %y32) {
-; CHECK-LABEL: @narrow_sext_or(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %y32 to i16
-; CHECK-NEXT:    [[R:%.*]] = or i16 [[TMP1]], %x16
+; CHECK-LABEL: define i16 @narrow_sext_or(
+; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[Y32]] to i16
+; CHECK-NEXT:    [[R:%.*]] = or i16 [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret i16 [[R]]
 ;
   %x32 = sext i16 %x16 to i32
@@ -37,9 +41,10 @@ define i16 @narrow_sext_or(i16 %x16, i32 %y32) {
 }
 
 define i16 @narrow_zext_or(i16 %x16, i32 %y32) {
-; CHECK-LABEL: @narrow_zext_or(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %y32 to i16
-; CHECK-NEXT:    [[R:%.*]] = or i16 [[TMP1]], %x16
+; CHECK-LABEL: define i16 @narrow_zext_or(
+; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[Y32]] to i16
+; CHECK-NEXT:    [[R:%.*]] = or i16 [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret i16 [[R]]
 ;
   %x32 = zext i16 %x16 to i32
@@ -49,9 +54,10 @@ define i16 @narrow_zext_or(i16 %x16, i32 %y32) {
 }
 
 define i16 @narrow_sext_xor(i16 %x16, i32 %y32) {
-; CHECK-LABEL: @narrow_sext_xor(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %y32 to i16
-; CHECK-NEXT:    [[R:%.*]] = xor i16 [[TMP1]], %x16
+; CHECK-LABEL: define i16 @narrow_sext_xor(
+; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[Y32]] to i16
+; CHECK-NEXT:    [[R:%.*]] = xor i16 [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret i16 [[R]]
 ;
   %x32 = sext i16 %x16 to i32
@@ -61,9 +67,10 @@ define i16 @narrow_sext_xor(i16 %x16, i32 %y32) {
 }
 
 define i16 @narrow_zext_xor(i16 %x16, i32 %y32) {
-; CHECK-LABEL: @narrow_zext_xor(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %y32 to i16
-; CHECK-NEXT:    [[R:%.*]] = xor i16 [[TMP1]], %x16
+; CHECK-LABEL: define i16 @narrow_zext_xor(
+; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[Y32]] to i16
+; CHECK-NEXT:    [[R:%.*]] = xor i16 [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret i16 [[R]]
 ;
   %x32 = zext i16 %x16 to i32
@@ -73,9 +80,10 @@ define i16 @narrow_zext_xor(i16 %x16, i32 %y32) {
 }
 
 define i16 @narrow_sext_add(i16 %x16, i32 %y32) {
-; CHECK-LABEL: @narrow_sext_add(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %y32 to i16
-; CHECK-NEXT:    [[R:%.*]] = add i16 [[TMP1]], %x16
+; CHECK-LABEL: define i16 @narrow_sext_add(
+; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[Y32]] to i16
+; CHECK-NEXT:    [[R:%.*]] = add i16 [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret i16 [[R]]
 ;
   %x32 = sext i16 %x16 to i32
@@ -85,9 +93,10 @@ define i16 @narrow_sext_add(i16 %x16, i32 %y32) {
 }
 
 define i16 @narrow_zext_add(i16 %x16, i32 %y32) {
-; CHECK-LABEL: @narrow_zext_add(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %y32 to i16
-; CHECK-NEXT:    [[R:%.*]] = add i16 [[TMP1]], %x16
+; CHECK-LABEL: define i16 @narrow_zext_add(
+; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[Y32]] to i16
+; CHECK-NEXT:    [[R:%.*]] = add i16 [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret i16 [[R]]
 ;
   %x32 = zext i16 %x16 to i32
@@ -97,9 +106,10 @@ define i16 @narrow_zext_add(i16 %x16, i32 %y32) {
 }
 
 define i16 @narrow_sext_sub(i16 %x16, i32 %y32) {
-; CHECK-LABEL: @narrow_sext_sub(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %y32 to i16
-; CHECK-NEXT:    [[R:%.*]] = sub i16 %x16, [[TMP1]]
+; CHECK-LABEL: define i16 @narrow_sext_sub(
+; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[Y32]] to i16
+; CHECK-NEXT:    [[R:%.*]] = sub i16 [[X16]], [[TMP1]]
 ; CHECK-NEXT:    ret i16 [[R]]
 ;
   %x32 = sext i16 %x16 to i32
@@ -109,9 +119,10 @@ define i16 @narrow_sext_sub(i16 %x16, i32 %y32) {
 }
 
 define i16 @narrow_zext_sub(i16 %x16, i32 %y32) {
-; CHECK-LABEL: @narrow_zext_sub(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %y32 to i16
-; CHECK-NEXT:    [[R:%.*]] = sub i16 %x16, [[TMP1]]
+; CHECK-LABEL: define i16 @narrow_zext_sub(
+; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[Y32]] to i16
+; CHECK-NEXT:    [[R:%.*]] = sub i16 [[X16]], [[TMP1]]
 ; CHECK-NEXT:    ret i16 [[R]]
 ;
   %x32 = zext i16 %x16 to i32
@@ -121,9 +132,10 @@ define i16 @narrow_zext_sub(i16 %x16, i32 %y32) {
 }
 
 define i16 @narrow_sext_mul(i16 %x16, i32 %y32) {
-; CHECK-LABEL: @narrow_sext_mul(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %y32 to i16
-; CHECK-NEXT:    [[R:%.*]] = mul i16 [[TMP1]], %x16
+; CHECK-LABEL: define i16 @narrow_sext_mul(
+; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[Y32]] to i16
+; CHECK-NEXT:    [[R:%.*]] = mul i16 [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret i16 [[R]]
 ;
   %x32 = sext i16 %x16 to i32
@@ -133,9 +145,10 @@ define i16 @narrow_sext_mul(i16 %x16, i32 %y32) {
 }
 
 define i16 @narrow_zext_mul(i16 %x16, i32 %y32) {
-; CHECK-LABEL: @narrow_zext_mul(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %y32 to i16
-; CHECK-NEXT:    [[R:%.*]] = mul i16 [[TMP1]], %x16
+; CHECK-LABEL: define i16 @narrow_zext_mul(
+; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[Y32]] to i16
+; CHECK-NEXT:    [[R:%.*]] = mul i16 [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret i16 [[R]]
 ;
   %x32 = zext i16 %x16 to i32
@@ -148,10 +161,11 @@ define i16 @narrow_zext_mul(i16 %x16, i32 %y32) {
 ; canonicalization doesn't swap the binop operands. Use vector types to show those work too.
 
 define <2 x i16> @narrow_sext_and_commute(<2 x i16> %x16, <2 x i32> %y32) {
-; CHECK-LABEL: @narrow_sext_and_commute(
-; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> %y32, <i32 7, i32 -17>
+; CHECK-LABEL: define <2 x i16> @narrow_sext_and_commute(
+; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) {
+; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], <i32 7, i32 -17>
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16>
-; CHECK-NEXT:    [[R:%.*]] = and <2 x i16> [[TMP1]], %x16
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i16> [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret <2 x i16> [[R]]
 ;
   %y32op0 = sdiv <2 x i32> %y32, <i32 7, i32 -17>
@@ -162,10 +176,11 @@ define <2 x i16> @narrow_sext_and_commute(<2 x i16> %x16, <2 x i32> %y32) {
 }
 
 define <2 x i16> @narrow_zext_and_commute(<2 x i16> %x16, <2 x i32> %y32) {
-; CHECK-LABEL: @narrow_zext_and_commute(
-; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> %y32, <i32 7, i32 -17>
+; CHECK-LABEL: define <2 x i16> @narrow_zext_and_commute(
+; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) {
+; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], <i32 7, i32 -17>
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16>
-; CHECK-NEXT:    [[R:%.*]] = and <2 x i16> [[TMP1]], %x16
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i16> [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret <2 x i16> [[R]]
 ;
   %y32op0 = sdiv <2 x i32> %y32, <i32 7, i32 -17>
@@ -176,10 +191,11 @@ define <2 x i16> @narrow_zext_and_commute(<2 x i16> %x16, <2 x i32> %y32) {
 }
 
 define <2 x i16> @narrow_sext_or_commute(<2 x i16> %x16, <2 x i32> %y32) {
-; CHECK-LABEL: @narrow_sext_or_commute(
-; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> %y32, <i32 7, i32 -17>
+; CHECK-LABEL: define <2 x i16> @narrow_sext_or_commute(
+; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) {
+; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], <i32 7, i32 -17>
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16>
-; CHECK-NEXT:    [[R:%.*]] = or <2 x i16> [[TMP1]], %x16
+; CHECK-NEXT:    [[R:%.*]] = or <2 x i16> [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret <2 x i16> [[R]]
 ;
   %y32op0 = sdiv <2 x i32> %y32, <i32 7, i32 -17>
@@ -190,10 +206,11 @@ define <2 x i16> @narrow_sext_or_commute(<2 x i16> %x16, <2 x i32> %y32) {
 }
 
 define <2 x i16> @narrow_zext_or_commute(<2 x i16> %x16, <2 x i32> %y32) {
-; CHECK-LABEL: @narrow_zext_or_commute(
-; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> %y32, <i32 7, i32 -17>
+; CHECK-LABEL: define <2 x i16> @narrow_zext_or_commute(
+; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) {
+; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], <i32 7, i32 -17>
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16>
-; CHECK-NEXT:    [[R:%.*]] = or <2 x i16> [[TMP1]], %x16
+; CHECK-NEXT:    [[R:%.*]] = or <2 x i16> [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret <2 x i16> [[R]]
 ;
   %y32op0 = sdiv <2 x i32> %y32, <i32 7, i32 -17>
@@ -204,10 +221,11 @@ define <2 x i16> @narrow_zext_or_commute(<2 x i16> %x16, <2 x i32> %y32) {
 }
 
 define <2 x i16> @narrow_sext_xor_commute(<2 x i16> %x16, <2 x i32> %y32) {
-; CHECK-LABEL: @narrow_sext_xor_commute(
-; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> %y32, <i32 7, i32 -17>
+; CHECK-LABEL: define <2 x i16> @narrow_sext_xor_commute(
+; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) {
+; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], <i32 7, i32 -17>
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16>
-; CHECK-NEXT:    [[R:%.*]] = xor <2 x i16> [[TMP1]], %x16
+; CHECK-NEXT:    [[R:%.*]] = xor <2 x i16> [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret <2 x i16> [[R]]
 ;
   %y32op0 = sdiv <2 x i32> %y32, <i32 7, i32 -17>
@@ -218,10 +236,11 @@ define <2 x i16> @narrow_sext_xor_commute(<2 x i16> %x16, <2 x i32> %y32) {
 }
 
 define <2 x i16> @narrow_zext_xor_commute(<2 x i16> %x16, <2 x i32> %y32) {
-; CHECK-LABEL: @narrow_zext_xor_commute(
-; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> %y32, <i32 7, i32 -17>
+; CHECK-LABEL: define <2 x i16> @narrow_zext_xor_commute(
+; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) {
+; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], <i32 7, i32 -17>
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16>
-; CHECK-NEXT:    [[R:%.*]] = xor <2 x i16> [[TMP1]], %x16
+; CHECK-NEXT:    [[R:%.*]] = xor <2 x i16> [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret <2 x i16> [[R]]
 ;
   %y32op0 = sdiv <2 x i32> %y32, <i32 7, i32 -17>
@@ -232,10 +251,11 @@ define <2 x i16> @narrow_zext_xor_commute(<2 x i16> %x16, <2 x i32> %y32) {
 }
 
 define <2 x i16> @narrow_sext_add_commute(<2 x i16> %x16, <2 x i32> %y32) {
-; CHECK-LABEL: @narrow_sext_add_commute(
-; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> %y32, <i32 7, i32 -17>
+; CHECK-LABEL: define <2 x i16> @narrow_sext_add_commute(
+; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) {
+; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], <i32 7, i32 -17>
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16>
-; CHECK-NEXT:    [[R:%.*]] = add <2 x i16> [[TMP1]], %x16
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i16> [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret <2 x i16> [[R]]
 ;
   %y32op0 = sdiv <2 x i32> %y32, <i32 7, i32 -17>
@@ -246,10 +266,11 @@ define <2 x i16> @narrow_sext_add_commute(<2 x i16> %x16, <2 x i32> %y32) {
 }
 
 define <2 x i16> @narrow_zext_add_commute(<2 x i16> %x16, <2 x i32> %y32) {
-; CHECK-LABEL: @narrow_zext_add_commute(
-; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> %y32, <i32 7, i32 -17>
+; CHECK-LABEL: define <2 x i16> @narrow_zext_add_commute(
+; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) {
+; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], <i32 7, i32 -17>
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16>
-; CHECK-NEXT:    [[R:%.*]] = add <2 x i16> [[TMP1]], %x16
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i16> [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret <2 x i16> [[R]]
 ;
   %y32op0 = sdiv <2 x i32> %y32, <i32 7, i32 -17>
@@ -260,10 +281,11 @@ define <2 x i16> @narrow_zext_add_commute(<2 x i16> %x16, <2 x i32> %y32) {
 }
 
 define <2 x i16> @narrow_sext_sub_commute(<2 x i16> %x16, <2 x i32> %y32) {
-; CHECK-LABEL: @narrow_sext_sub_commute(
-; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> %y32, <i32 7, i32 -17>
+; CHECK-LABEL: define <2 x i16> @narrow_sext_sub_commute(
+; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) {
+; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], <i32 7, i32 -17>
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16>
-; CHECK-NEXT:    [[R:%.*]] = sub <2 x i16> [[TMP1]], %x16
+; CHECK-NEXT:    [[R:%.*]] = sub <2 x i16> [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret <2 x i16> [[R]]
 ;
   %y32op0 = sdiv <2 x i32> %y32, <i32 7, i32 -17>
@@ -274,10 +296,11 @@ define <2 x i16> @narrow_sext_sub_commute(<2 x i16> %x16, <2 x i32> %y32) {
 }
 
 define <2 x i16> @narrow_zext_sub_commute(<2 x i16> %x16, <2 x i32> %y32) {
-; CHECK-LABEL: @narrow_zext_sub_commute(
-; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> %y32, <i32 7, i32 -17>
+; CHECK-LABEL: define <2 x i16> @narrow_zext_sub_commute(
+; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) {
+; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], <i32 7, i32 -17>
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16>
-; CHECK-NEXT:    [[R:%.*]] = sub <2 x i16> [[TMP1]], %x16
+; CHECK-NEXT:    [[R:%.*]] = sub <2 x i16> [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret <2 x i16> [[R]]
 ;
   %y32op0 = sdiv <2 x i32> %y32, <i32 7, i32 -17>
@@ -288,10 +311,11 @@ define <2 x i16> @narrow_zext_sub_commute(<2 x i16> %x16, <2 x i32> %y32) {
 }
 
 define <2 x i16> @narrow_sext_mul_commute(<2 x i16> %x16, <2 x i32> %y32) {
-; CHECK-LABEL: @narrow_sext_mul_commute(
-; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> %y32, <i32 7, i32 -17>
+; CHECK-LABEL: define <2 x i16> @narrow_sext_mul_commute(
+; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) {
+; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], <i32 7, i32 -17>
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16>
-; CHECK-NEXT:    [[R:%.*]] = mul <2 x i16> [[TMP1]], %x16
+; CHECK-NEXT:    [[R:%.*]] = mul <2 x i16> [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret <2 x i16> [[R]]
 ;
   %y32op0 = sdiv <2 x i32> %y32, <i32 7, i32 -17>
@@ -302,10 +326,11 @@ define <2 x i16> @narrow_sext_mul_commute(<2 x i16> %x16, <2 x i32> %y32) {
 }
 
 define <2 x i16> @narrow_zext_mul_commute(<2 x i16> %x16, <2 x i32> %y32) {
-; CHECK-LABEL: @narrow_zext_mul_commute(
-; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> %y32, <i32 7, i32 -17>
+; CHECK-LABEL: define <2 x i16> @narrow_zext_mul_commute(
+; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) {
+; CHECK-NEXT:    [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], <i32 7, i32 -17>
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16>
-; CHECK-NEXT:    [[R:%.*]] = mul <2 x i16> [[TMP1]], %x16
+; CHECK-NEXT:    [[R:%.*]] = mul <2 x i16> [[TMP1]], [[X16]]
 ; CHECK-NEXT:    ret <2 x i16> [[R]]
 ;
   %y32op0 = sdiv <2 x i32> %y32, <i32 7, i32 -17>
@@ -317,12 +342,13 @@ define <2 x i16> @narrow_zext_mul_commute(<2 x i16> %x16, <2 x i32> %y32) {
 
 ; Test cases for PR43580
 define i8 @narrow_zext_ashr_keep_trunc(i8 %i1, i8 %i2) {
-; CHECK-LABEL: @narrow_zext_ashr_keep_trunc(
-; CHECK-NEXT:    [[I1_EXT:%.*]] = sext i8 [[I1:%.*]] to i16
-; CHECK-NEXT:    [[I2_EXT:%.*]] = sext i8 [[I2:%.*]] to i16
+; CHECK-LABEL: define i8 @narrow_zext_ashr_keep_trunc(
+; CHECK-SAME: i8 [[I1:%.*]], i8 [[I2:%.*]]) {
+; CHECK-NEXT:    [[I1_EXT:%.*]] = sext i8 [[I1]] to i16
+; CHECK-NEXT:    [[I2_EXT:%.*]] = sext i8 [[I2]] to i16
 ; CHECK-NEXT:    [[SUB:%.*]] = add nsw i16 [[I1_EXT]], [[I2_EXT]]
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i16 [[SUB]], 1
-; CHECK-NEXT:    [[T:%.*]] = trunc i16 [[TMP1]] to i8
+; CHECK-NEXT:    [[SHIFT:%.*]] = lshr i16 [[SUB]], 1
+; CHECK-NEXT:    [[T:%.*]] = trunc i16 [[SHIFT]] to i8
 ; CHECK-NEXT:    ret i8 [[T]]
 ;
   %i1.ext = sext i8 %i1 to i32
@@ -334,12 +360,13 @@ define i8 @narrow_zext_ashr_keep_trunc(i8 %i1, i8 %i2) {
 }
 
 define i8 @narrow_zext_ashr_keep_trunc2(i9 %i1, i9 %i2) {
-; CHECK-LABEL: @narrow_zext_ashr_keep_trunc2(
-; CHECK-NEXT:    [[I1_EXT1:%.*]] = zext i9 [[I1:%.*]] to i16
-; CHECK-NEXT:    [[I2_EXT2:%.*]] = zext i9 [[I2:%.*]] to i16
-; CHECK-NEXT:    [[SUB:%.*]] = add nuw nsw i16 [[I1_EXT1]], [[I2_EXT2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i16 [[SUB]], 1
-; CHECK-NEXT:    [[T:%.*]] = trunc i16 [[TMP1]] to i8
+; CHECK-LABEL: define i8 @narrow_zext_ashr_keep_trunc2(
+; CHECK-SAME: i9 [[I1:%.*]], i9 [[I2:%.*]]) {
+; CHECK-NEXT:    [[I1_EXT:%.*]] = zext i9 [[I1]] to i16
+; CHECK-NEXT:    [[I2_EXT:%.*]] = zext i9 [[I2]] to i16
+; CHECK-NEXT:    [[SUB:%.*]] = add nuw nsw i16 [[I1_EXT]], [[I2_EXT]]
+; CHECK-NEXT:    [[SHIFT:%.*]] = lshr i16 [[SUB]], 1
+; CHECK-NEXT:    [[T:%.*]] = trunc i16 [[SHIFT]] to i8
 ; CHECK-NEXT:    ret i8 [[T]]
 ;
   %i1.ext = sext i9 %i1 to i64
@@ -351,12 +378,13 @@ define i8 @narrow_zext_ashr_keep_trunc2(i9 %i1, i9 %i2) {
 }
 
 define i7 @narrow_zext_ashr_keep_trunc3(i8 %i1, i8 %i2) {
-; CHECK-LABEL: @narrow_zext_ashr_keep_trunc3(
-; CHECK-NEXT:    [[I1_EXT1:%.*]] = zext i8 [[I1:%.*]] to i14
-; CHECK-NEXT:    [[I2_EXT2:%.*]] = zext i8 [[I2:%.*]] to i14
-; CHECK-NEXT:    [[SUB:%.*]] = add nuw nsw i14 [[I1_EXT1]], [[I2_EXT2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i14 [[SUB]], 1
-; CHECK-NEXT:    [[T:%.*]] = trunc i14 [[TMP1]] to i7
+; CHECK-LABEL: define i7 @narrow_zext_ashr_keep_trunc3(
+; CHECK-SAME: i8 [[I1:%.*]], i8 [[I2:%.*]]) {
+; CHECK-NEXT:    [[I1_EXT:%.*]] = zext i8 [[I1]] to i14
+; CHECK-NEXT:    [[I2_EXT:%.*]] = zext i8 [[I2]] to i14
+; CHECK-NEXT:    [[SUB:%.*]] = add nuw nsw i14 [[I1_EXT]], [[I2_EXT]]
+; CHECK-NEXT:    [[SHIFT:%.*]] = lshr i14 [[SUB]], 1
+; CHECK-NEXT:    [[T:%.*]] = trunc i14 [[SHIFT]] to i7
 ; CHECK-NEXT:    ret i7 [[T]]
 ;
   %i1.ext = sext i8 %i1 to i64
@@ -368,12 +396,13 @@ define i7 @narrow_zext_ashr_keep_trunc3(i8 %i1, i8 %i2) {
 }
 
 define <8 x i8> @narrow_zext_ashr_keep_trunc_vector(<8 x i8> %i1, <8 x i8> %i2) {
-; CHECK-LABEL: @narrow_zext_ashr_keep_trunc_vector(
-; CHECK-NEXT:    [[I1_EXT:%.*]] = sext <8 x i8> [[I1:%.*]] to <8 x i32>
-; CHECK-NEXT:    [[I2_EXT:%.*]] = sext <8 x i8> [[I2:%.*]] to <8 x i32>
+; CHECK-LABEL: define <8 x i8> @narrow_zext_ashr_keep_trunc_vector(
+; CHECK-SAME: <8 x i8> [[I1:%.*]], <8 x i8> [[I2:%.*]]) {
+; CHECK-NEXT:    [[I1_EXT:%.*]] = sext <8 x i8> [[I1]] to <8 x i32>
+; CHECK-NEXT:    [[I2_EXT:%.*]] = sext <8 x i8> [[I2]] to <8 x i32>
 ; CHECK-NEXT:    [[SUB:%.*]] = add nsw <8 x i32> [[I1_EXT]], [[I2_EXT]]
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[SUB]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[T:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i8>
+; CHECK-NEXT:    [[SHIFT:%.*]] = lshr <8 x i32> [[SUB]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[T:%.*]] = trunc <8 x i32> [[SHIFT]] to <8 x i8>
 ; CHECK-NEXT:    ret <8 x i8> [[T]]
 ;
   %i1.ext = sext <8 x i8> %i1 to <8 x i32>
@@ -385,12 +414,13 @@ define <8 x i8> @narrow_zext_ashr_keep_trunc_vector(<8 x i8> %i1, <8 x i8> %i2)
 }
 
 define i8 @dont_narrow_zext_ashr_keep_trunc(i8 %i1, i8 %i2) {
-; CHECK-LABEL: @dont_narrow_zext_ashr_keep_trunc(
-; CHECK-NEXT:    [[I1_EXT:%.*]] = sext i8 [[I1:%.*]] to i16
-; CHECK-NEXT:    [[I2_EXT:%.*]] = sext i8 [[I2:%.*]] to i16
+; CHECK-LABEL: define i8 @dont_narrow_zext_ashr_keep_trunc(
+; CHECK-SAME: i8 [[I1:%.*]], i8 [[I2:%.*]]) {
+; CHECK-NEXT:    [[I1_EXT:%.*]] = sext i8 [[I1]] to i16
+; CHECK-NEXT:    [[I2_EXT:%.*]] = sext i8 [[I2]] to i16
 ; CHECK-NEXT:    [[SUB:%.*]] = add nsw i16 [[I1_EXT]], [[I2_EXT]]
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i16 [[SUB]], 1
-; CHECK-NEXT:    [[T:%.*]] = trunc i16 [[TMP1]] to i8
+; CHECK-NEXT:    [[SHIFT:%.*]] = lshr i16 [[SUB]], 1
+; CHECK-NEXT:    [[T:%.*]] = trunc i16 [[SHIFT]] to i8
 ; CHECK-NEXT:    ret i8 [[T]]
 ;
   %i1.ext = sext i8 %i1 to i16
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/gep-constanfolding-error.ll b/llvm/test/Transforms/InstSimplify/ConstProp/gep-constanfolding-error.ll
index 474af2d42726..bcba5ce3aa7e 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/gep-constanfolding-error.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/gep-constanfolding-error.ll
@@ -1,52 +1,52 @@
-; RUN: opt -passes=gvn -S -o - %s | FileCheck %s
-; RUN: opt -passes=newgvn -S -o - %s | FileCheck %s
-; Test that the constantfolding getelementptr computation results in
-; j[5][4][1] (j+239)
-; and not [1][4][4][1] (#449) which is an incorrect out-of-range error
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "armv7-none-eabi"
-
-@f = local_unnamed_addr global i32 2, align 4
-@t6 = local_unnamed_addr global i32 1, align 4
-@j = local_unnamed_addr global [6 x [6 x [7 x i8]]] [[6 x [7 x i8]] [[7 x i8] c"\06\00\00\00\00\00\00", [7 x i8] zeroinitializer, [7 x i8] zeroinitializer, [7 x i8] zeroinitializer, [7 x i8] zeroinitializer, [7 x i8] zeroinitializer], [6 x [7 x i8]] zeroinitializer, [6 x [7 x i8]] zeroinitializer, [6 x [7 x i8]] zeroinitializer, [6 x [7 x i8]] zeroinitializer, [6 x [7 x i8]] zeroinitializer], align 1
-@p = internal global i64 0, align 8
-@y = local_unnamed_addr global ptr @p, align 4
-@b = internal unnamed_addr global i32 0, align 4
-@h = common local_unnamed_addr global i16 0, align 2
-@a = common local_unnamed_addr global i32 0, align 4
-@k = common local_unnamed_addr global i32 0, align 4
-@t11 = common local_unnamed_addr global i32 0, align 4
-
-; Function Attrs: nounwind
-define i32 @main() local_unnamed_addr {
-entry:
-  %0 = load i32, ptr @t6, align 4
-  %inc = add nsw i32 %0, 1
-  store i32 %inc, ptr @t6, align 4
-  store i16 4, ptr @h, align 2
-  %1 = load i32, ptr @a, align 4
-  %conv = trunc i32 %1 to i8
-  store i32 1, ptr @f, align 4
-  %2 = load i64, ptr @p, align 8
-  %cmp4 = icmp slt i64 %2, 2
-  %conv6 = zext i1 %cmp4 to i8
-  %3 = load i16, ptr @h, align 2
-  %conv7 = sext i16 %3 to i32
-  %add = add nsw i32 %conv7, 1
-  %f.promoted = load i32, ptr @f, align 4
-  %4 = mul i32 %conv7, 7
-  %5 = add i32 %4, 5
-  %6 = sub i32 -1, %f.promoted
-  %7 = icmp sgt i32 %6, -2
-  %smax = select i1 %7, i32 %6, i32 -2
-  %8 = sub i32 6, %smax
-  %scevgep = getelementptr [6 x [6 x [7 x i8]]], ptr @j, i32 0, i32 0, i32 %5, i32 %8
-  %9 = add i32 %f.promoted, %smax
-  %10 = add i32 %9, 2
-  call void @llvm.memset.p0.i32(ptr %scevgep, i8 %conv6, i32 %10, i1 false)
-; CHECK:  call void @llvm.memset.p0.i32(ptr getelementptr inbounds ([6 x [6 x [7 x i8]]], ptr @j, i32 0, i{{32|64}} 5, i{{32|64}} 4, i32 1), i8 %conv6, i32 1, i1 false)
-; CHECK-NOT: call void @llvm.memset.p0.i32(ptr getelementptr ([6 x [6 x [7 x i8]]], ptr @j, i64 1, i64 4, i64 4, i32 1)
-  ret i32 0
-}
-; Function Attrs: argmemonly nounwind
-declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1)
+; RUN: opt -passes=gvn -S -o - %s | FileCheck %s
+; RUN: opt -passes=newgvn -S -o - %s | FileCheck %s
+; Test that the constantfolding getelementptr computation results in
+; j[5][4][1] (j+239)
+; and not [1][4][4][1] (#449) which is an incorrect out-of-range error
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7-none-eabi"
+
+@f = local_unnamed_addr global i32 2, align 4
+@t6 = local_unnamed_addr global i32 1, align 4
+@j = local_unnamed_addr global [6 x [6 x [7 x i8]]] [[6 x [7 x i8]] [[7 x i8] c"\06\00\00\00\00\00\00", [7 x i8] zeroinitializer, [7 x i8] zeroinitializer, [7 x i8] zeroinitializer, [7 x i8] zeroinitializer, [7 x i8] zeroinitializer], [6 x [7 x i8]] zeroinitializer, [6 x [7 x i8]] zeroinitializer, [6 x [7 x i8]] zeroinitializer, [6 x [7 x i8]] zeroinitializer, [6 x [7 x i8]] zeroinitializer], align 1
+@p = internal global i64 0, align 8
+@y = local_unnamed_addr global ptr @p, align 4
+@b = internal unnamed_addr global i32 0, align 4
+@h = common local_unnamed_addr global i16 0, align 2
+@a = common local_unnamed_addr global i32 0, align 4
+@k = common local_unnamed_addr global i32 0, align 4
+@t11 = common local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: nounwind
+define i32 @main() local_unnamed_addr {
+entry:
+  %0 = load i32, ptr @t6, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, ptr @t6, align 4
+  store i16 4, ptr @h, align 2
+  %1 = load i32, ptr @a, align 4
+  %conv = trunc i32 %1 to i8
+  store i32 1, ptr @f, align 4
+  %2 = load i64, ptr @p, align 8
+  %cmp4 = icmp slt i64 %2, 2
+  %conv6 = zext i1 %cmp4 to i8
+  %3 = load i16, ptr @h, align 2
+  %conv7 = sext i16 %3 to i32
+  %add = add nsw i32 %conv7, 1
+  %f.promoted = load i32, ptr @f, align 4
+  %4 = mul i32 %conv7, 7
+  %5 = add i32 %4, 5
+  %6 = sub i32 -1, %f.promoted
+  %7 = icmp sgt i32 %6, -2
+  %smax = select i1 %7, i32 %6, i32 -2
+  %8 = sub i32 6, %smax
+  %scevgep = getelementptr [6 x [6 x [7 x i8]]], ptr @j, i32 0, i32 0, i32 %5, i32 %8
+  %9 = add i32 %f.promoted, %smax
+  %10 = add i32 %9, 2
+  call void @llvm.memset.p0.i32(ptr %scevgep, i8 %conv6, i32 %10, i1 false)
+; CHECK:  call void @llvm.memset.p0.i32(ptr getelementptr inbounds ([6 x [6 x [7 x i8]]], ptr @j, i32 0, i{{32|64}} 5, i{{32|64}} 4, i32 1), i8 %conv6, i32 1, i1 false)
+; CHECK-NOT: call void @llvm.memset.p0.i32(ptr getelementptr ([6 x [6 x [7 x i8]]], ptr @j, i64 1, i64 4, i64 4, i32 1)
+  ret i32 0
+}
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1)
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/timeout.ll b/llvm/test/Transforms/InstSimplify/ConstProp/timeout.ll
index 3417d4a6c1f4..2c5b31f099fb 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/timeout.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/timeout.ll
@@ -1,70 +1,70 @@
-; NOTE: This is a timeout test for some O(something silly) constant folding behaviour. It may not be the best test. Providing it finishes, it passes.
-; RUN: opt < %s -O3 -S | FileCheck %s
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "armv8-none-eabi"
-
-%struct.ST = type { ptr }
-
-@global = internal global [121 x i8] zeroinitializer, align 1
-
-define void @func() #0 {
-;CHECK-LABEL: func
-entry:
-  %s = alloca ptr, align 4
-  %j = alloca i32, align 4
-  store ptr @global, ptr %s, align 4
-  store i32 0, ptr %j, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32, ptr %j, align 4
-  %cmp = icmp slt i32 %0, 30
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %1 = load ptr, ptr %s, align 4
-  %add.ptr = getelementptr inbounds i8, ptr %1, i32 4
-  %2 = ptrtoint ptr %add.ptr to i32
-  %3 = load ptr, ptr %s, align 4
-  %add.ptr1 = getelementptr inbounds i8, ptr %3, i32 4
-  %4 = ptrtoint ptr %add.ptr1 to i32
-  %rem = urem i32 %4, 2
-  %cmp2 = icmp eq i32 %rem, 0
-  br i1 %cmp2, label %cond.true, label %cond.false
-
-cond.true:                                        ; preds = %for.body
-  br label %cond.end
-
-cond.false:                                       ; preds = %for.body
-  %5 = load ptr, ptr %s, align 4
-  %add.ptr3 = getelementptr inbounds i8, ptr %5, i32 4
-  %6 = ptrtoint ptr %add.ptr3 to i32
-  %rem4 = urem i32 %6, 2
-  br label %cond.end
-
-cond.end:                                         ; preds = %cond.false, %cond.true
-  %cond = phi i32 [ 0, %cond.true ], [ %rem4, %cond.false ]
-  %add = add i32 %2, %cond
-  %7 = inttoptr i32 %add to ptr
-  %8 = load ptr, ptr %s, align 4
-  %next = getelementptr inbounds %struct.ST, ptr %8, i32 0, i32 0
-  store ptr %7, ptr %next, align 4
-  %9 = load ptr, ptr %s, align 4
-  %next5 = getelementptr inbounds %struct.ST, ptr %9, i32 0, i32 0
-  %10 = load ptr, ptr %next5, align 4
-  store ptr %10, ptr %s, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %cond.end
-  %11 = load i32, ptr %j, align 4
-  %inc = add nsw i32 %11, 1
-  store i32 %inc, ptr %j, align 4
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  %12 = load ptr, ptr %s, align 4
-  %next6 = getelementptr inbounds %struct.ST, ptr %12, i32 0, i32 0
-  store ptr null, ptr %next6, align 4
-  ret void
-}
-
+; NOTE: This is a timeout test for some O(something silly) constant folding behaviour. It may not be the best test. Providing it finishes, it passes.
+; RUN: opt < %s -O3 -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8-none-eabi"
+
+%struct.ST = type { ptr }
+
+@global = internal global [121 x i8] zeroinitializer, align 1
+
+define void @func() #0 {
+;CHECK-LABEL: func
+entry:
+  %s = alloca ptr, align 4
+  %j = alloca i32, align 4
+  store ptr @global, ptr %s, align 4
+  store i32 0, ptr %j, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %j, align 4
+  %cmp = icmp slt i32 %0, 30
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load ptr, ptr %s, align 4
+  %add.ptr = getelementptr inbounds i8, ptr %1, i32 4
+  %2 = ptrtoint ptr %add.ptr to i32
+  %3 = load ptr, ptr %s, align 4
+  %add.ptr1 = getelementptr inbounds i8, ptr %3, i32 4
+  %4 = ptrtoint ptr %add.ptr1 to i32
+  %rem = urem i32 %4, 2
+  %cmp2 = icmp eq i32 %rem, 0
+  br i1 %cmp2, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %for.body
+  br label %cond.end
+
+cond.false:                                       ; preds = %for.body
+  %5 = load ptr, ptr %s, align 4
+  %add.ptr3 = getelementptr inbounds i8, ptr %5, i32 4
+  %6 = ptrtoint ptr %add.ptr3 to i32
+  %rem4 = urem i32 %6, 2
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 0, %cond.true ], [ %rem4, %cond.false ]
+  %add = add i32 %2, %cond
+  %7 = inttoptr i32 %add to ptr
+  %8 = load ptr, ptr %s, align 4
+  %next = getelementptr inbounds %struct.ST, ptr %8, i32 0, i32 0
+  store ptr %7, ptr %next, align 4
+  %9 = load ptr, ptr %s, align 4
+  %next5 = getelementptr inbounds %struct.ST, ptr %9, i32 0, i32 0
+  %10 = load ptr, ptr %next5, align 4
+  store ptr %10, ptr %s, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %cond.end
+  %11 = load i32, ptr %j, align 4
+  %inc = add nsw i32 %11, 1
+  store i32 %inc, ptr %j, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %12 = load ptr, ptr %s, align 4
+  %next6 = getelementptr inbounds %struct.ST, ptr %12, i32 0, i32 0
+  store ptr null, ptr %next6, align 4
+  ret void
+}
+
diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
index 9ae9245eb15d..66ece62bd74f 100644
--- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
@@ -23,6 +23,55 @@ define void @load_factor2(ptr %ptr) {
   ret void
 }
 
+define void @load_factor2_as(ptr addrspace(1) %ptr) {
+; RV32-LABEL: @load_factor2_as(
+; RV32-NEXT:    [[TMP1:%.*]] = call { <8 x i32>, <8 x i32> } @llvm.riscv.seg2.load.v8i32.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 8)
+; RV32-NEXT:    [[TMP2:%.*]] = extractvalue { <8 x i32>, <8 x i32> } [[TMP1]], 1
+; RV32-NEXT:    [[TMP3:%.*]] = extractvalue { <8 x i32>, <8 x i32> } [[TMP1]], 0
+; RV32-NEXT:    ret void
+;
+; RV64-LABEL: @load_factor2_as(
+; RV64-NEXT:    [[TMP1:%.*]] = call { <8 x i32>, <8 x i32> } @llvm.riscv.seg2.load.v8i32.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 8)
+; RV64-NEXT:    [[TMP2:%.*]] = extractvalue { <8 x i32>, <8 x i32> } [[TMP1]], 1
+; RV64-NEXT:    [[TMP3:%.*]] = extractvalue { <8 x i32>, <8 x i32> } [[TMP1]], 0
+; RV64-NEXT:    ret void
+;
+  %interleaved.vec = load <16 x i32>, ptr addrspace(1) %ptr
+  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret void
+}
+
+define void @load_factor2_vscale(ptr %ptr) {
+; RV32-LABEL: @load_factor2_vscale(
+; RV32-NEXT:    [[TMP1:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.riscv.vlseg2.nxv8i32.i32(<vscale x 8 x i32> poison, <vscale x 8 x i32> poison, ptr [[PTR:%.*]], i32 -1)
+; RV32-NEXT:    ret void
+;
+; RV64-LABEL: @load_factor2_vscale(
+; RV64-NEXT:    [[TMP1:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.riscv.vlseg2.nxv8i32.i64(<vscale x 8 x i32> poison, <vscale x 8 x i32> poison, ptr [[PTR:%.*]], i64 -1)
+; RV64-NEXT:    ret void
+;
+  %interleaved.vec = load <vscale x 16 x i32>, ptr %ptr
+  %v = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %interleaved.vec)
+  ret void
+}
+
+define void @load_factor2_vscale_as(ptr addrspace(1) %ptr) {
+; RV32-LABEL: @load_factor2_vscale_as(
+; RV32-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[PTR:%.*]], align 64
+; RV32-NEXT:    [[V:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[INTERLEAVED_VEC]])
+; RV32-NEXT:    ret void
+;
+; RV64-LABEL: @load_factor2_vscale_as(
+; RV64-NEXT:    [[INTERLEAVED_VEC:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[PTR:%.*]], align 64
+; RV64-NEXT:    [[V:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[INTERLEAVED_VEC]])
+; RV64-NEXT:    ret void
+;
+  %interleaved.vec = load <vscale x 16 x i32>, ptr addrspace(1) %ptr
+  %v = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %interleaved.vec)
+  ret void
+}
+
 define void @load_factor3(ptr %ptr) {
 ; RV32-LABEL: @load_factor3(
 ; RV32-NEXT:    [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg3.load.v4i32.p0.i32(ptr [[PTR:%.*]], i32 4)
@@ -219,6 +268,54 @@ define void @store_factor2(ptr %ptr, <8 x i8> %v0, <8 x i8> %v1) {
   ret void
 }
 
+define void @store_factor2_as(ptr addrspace(1) %ptr, <8 x i8> %v0, <8 x i8> %v1) {
+; RV32-LABEL: @store_factor2_as(
+; RV32-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; RV32-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[V0]], <8 x i8> [[V1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; RV32-NEXT:    call void @llvm.riscv.seg2.store.v8i8.p1.i32(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], ptr addrspace(1) [[PTR:%.*]], i32 8)
+; RV32-NEXT:    ret void
+;
+; RV64-LABEL: @store_factor2_as(
+; RV64-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; RV64-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[V0]], <8 x i8> [[V1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; RV64-NEXT:    call void @llvm.riscv.seg2.store.v8i8.p1.i64(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], ptr addrspace(1) [[PTR:%.*]], i64 8)
+; RV64-NEXT:    ret void
+;
+  %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i8> %interleaved.vec, ptr addrspace(1) %ptr, align 4
+  ret void
+}
+
+define void @store_factor2_vscale(ptr %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1) {
+; RV32-LABEL: @store_factor2_vscale(
+; RV32-NEXT:    call void @llvm.riscv.vsseg2.nxv8i8.i32(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]], ptr [[PTR:%.*]], i32 -1)
+; RV32-NEXT:    ret void
+;
+; RV64-LABEL: @store_factor2_vscale(
+; RV64-NEXT:    call void @llvm.riscv.vsseg2.nxv8i8.i64(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]], ptr [[PTR:%.*]], i64 -1)
+; RV64-NEXT:    ret void
+;
+  %interleaved.vec = call <vscale x 16 x i8> @llvm.vector.interleave2.nxv8i8(<vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1)
+  store <vscale x 16 x i8> %interleaved.vec, ptr %ptr, align 4
+  ret void
+}
+
+define void @store_factor2_vscale_as(ptr addrspace(1) %ptr, <vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1) {
+; RV32-LABEL: @store_factor2_vscale_as(
+; RV32-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]])
+; RV32-NEXT:    store <vscale x 16 x i8> [[INTERLEAVED_VEC]], ptr addrspace(1) [[PTR:%.*]], align 4
+; RV32-NEXT:    ret void
+;
+; RV64-LABEL: @store_factor2_vscale_as(
+; RV64-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8> [[V0:%.*]], <vscale x 8 x i8> [[V1:%.*]])
+; RV64-NEXT:    store <vscale x 16 x i8> [[INTERLEAVED_VEC]], ptr addrspace(1) [[PTR:%.*]], align 4
+; RV64-NEXT:    ret void
+;
+  %interleaved.vec = call <vscale x 16 x i8> @llvm.vector.interleave2.nxv8i8(<vscale x 8 x i8> %v0, <vscale x 8 x i8> %v1)
+  store <vscale x 16 x i8> %interleaved.vec, ptr addrspace(1) %ptr, align 4
+  ret void
+}
+
 define void @store_factor3(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
 ; RV32-LABEL: @store_factor3(
 ; RV32-NEXT:    [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/llvm/test/Transforms/JumpThreading/guard-split-debuginfo.ll b/llvm/test/Transforms/JumpThreading/guard-split-debuginfo.ll
index 05ff74939449..38fbe4de51ad 100644
--- a/llvm/test/Transforms/JumpThreading/guard-split-debuginfo.ll
+++ b/llvm/test/Transforms/JumpThreading/guard-split-debuginfo.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt -S -passes=jump-threading %s -o - -S | FileCheck %s
 ; RUN: opt -S -passes=jump-threading %s -o - -S --try-experimental-debuginfo-iterators | FileCheck %s
 
@@ -7,6 +7,9 @@
 ; parent blocks. And that ino jump-threading, the old dbg.value gets
 ; deleted.
 
+; Test that JumpThreading's threadGuard() propagates the debug location
+; to the `phi` from the instruction it replaces (`%retval`)
+
 declare void @llvm.experimental.guard(i1, ...)
 
 declare i32 @f1()
@@ -20,20 +23,20 @@ define i32 @branch_implies_guard(i32 %a) !dbg !7 {
 ; CHECK-NEXT:    br i1 [[COND]], label [[T1_SPLIT:%.*]], label [[F1_SPLIT:%.*]], !dbg [[DBG12:![0-9]+]]
 ; CHECK:       T1.split:
 ; CHECK-NEXT:    [[V1:%.*]] = call i32 @f1(), !dbg [[DBG12]]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 0, metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG14:![0-9]+]]
+; CHECK-NEXT:    tail call void @llvm.dbg.value(metadata i32 0, metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG14:![0-9]+]]
 ; CHECK-NEXT:    [[RETVAL3:%.*]] = add i32 [[V1]], 10, !dbg [[DBG12]]
 ; CHECK-NEXT:    [[CONDGUARD4:%.*]] = icmp slt i32 [[A]], 20, !dbg [[DBG12]]
 ; CHECK-NEXT:    br label [[MERGE:%.*]]
 ; CHECK:       F1.split:
 ; CHECK-NEXT:    [[V2:%.*]] = call i32 @f2(), !dbg [[DBG12]]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 0, metadata [[META13]], metadata !DIExpression()), !dbg [[DBG14]]
+; CHECK-NEXT:    tail call void @llvm.dbg.value(metadata i32 0, metadata [[META13]], metadata !DIExpression()), !dbg [[DBG14]]
 ; CHECK-NEXT:    [[RETVAL1:%.*]] = add i32 [[V2]], 10, !dbg [[DBG12]]
 ; CHECK-NEXT:    [[CONDGUARD2:%.*]] = icmp slt i32 [[A]], 20, !dbg [[DBG12]]
 ; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[CONDGUARD2]]) [ "deopt"() ]
 ; CHECK-NEXT:    br label [[MERGE]]
 ; CHECK:       Merge:
 ; CHECK-NEXT:    [[RETPHI:%.*]] = phi i32 [ [[V1]], [[T1_SPLIT]] ], [ [[V2]], [[F1_SPLIT]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[RETVAL3]], [[T1_SPLIT]] ], [ [[RETVAL1]], [[F1_SPLIT]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[RETVAL3]], [[T1_SPLIT]] ], [ [[RETVAL1]], [[F1_SPLIT]] ], !dbg [[DBG12]]
 ; CHECK-NEXT:    ret i32 [[TMP1]], !dbg [[DBG12]]
 ;
   %cond = icmp slt i32 %a, 10
@@ -78,3 +81,22 @@ Merge:
 !19 = distinct !DILexicalBlock(scope: !7, file: !1, line: 8, column: 7)
 !26 = !DILocation(line: 13, column: 3, scope: !7)
 
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
+; CHECK: [[META1]] = !DIFile(filename: "test.c", directory: {{.*}})
+; CHECK: [[META2]] = !{}
+; CHECK: [[META3:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4}
+; CHECK: [[META4:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
+; CHECK: [[META5:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+; CHECK: [[META6:![0-9]+]] = !{!""}
+; CHECK: [[META7:![0-9]+]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 3, type: [[META8:![0-9]+]], scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]])
+; CHECK: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]])
+; CHECK: [[META9]] = !{[[META10:![0-9]+]], [[META11:![0-9]+]], [[META11]]}
+; CHECK: [[META10]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+; CHECK: [[META11]] = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed)
+; CHECK: [[DBG12]] = !DILocation(line: 13, column: 3, scope: [[META7]])
+; CHECK: [[META13]] = !DILocalVariable(name: "bar", arg: 1, scope: [[META7]], file: [[META1]], line: 3, type: [[META11]])
+; CHECK: [[DBG14]] = !DILocation(line: 0, scope: [[META7]])
+;.
diff --git a/llvm/test/Transforms/JumpThreading/preserving-debugloc-bitcast.ll b/llvm/test/Transforms/JumpThreading/preserving-debugloc-bitcast.ll
new file mode 100644
index 000000000000..319350e06f47
--- /dev/null
+++ b/llvm/test/Transforms/JumpThreading/preserving-debugloc-bitcast.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -S -passes=jump-threading | FileCheck %s
+
+; Test that JumpThreading's `simplifyPartiallyRedundantLoad` propagates
+; the debug location to the `bitcast` from the LoadInst it replaces (`%b`).
+
+declare void @f1(...)
+
+define void @test8(ptr %0, ptr %1, ptr %2) !dbg !5 {
+; CHECK: @test8
+; CHECK:    [[TMP4:%.*]] = bitcast float [[A:%.*]] to i32, !dbg [[DBG9:![0-9]+]]
+; CHECK: [[DBG9]] = !DILocation(line: 2,
+;
+  %a = load float, ptr %0, align 4, !dbg !8
+  %b = load i32, ptr %0, align 4, !dbg !9
+  store float %a, ptr %1, align 4, !dbg !10
+  %c = icmp eq i32 %b, 8, !dbg !11
+  br i1 %c, label %ret1, label %ret2, !dbg !12
+
+ret1:                                             ; preds = %3
+  ret void, !dbg !13
+
+ret2:                                             ; preds = %3
+  %xxx = tail call i32 (...) @f1() #0, !dbg !14
+  ret void, !dbg !15
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "test.ll", directory: "/")
+!2 = !{i32 8}
+!3 = !{i32 0}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "test8", linkageName: "test8", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !DILocation(line: 1, column: 1, scope: !5)
+!9 = !DILocation(line: 2, column: 1, scope: !5)
+!10 = !DILocation(line: 3, column: 1, scope: !5)
+!11 = !DILocation(line: 4, column: 1, scope: !5)
+!12 = !DILocation(line: 5, column: 1, scope: !5)
+!13 = !DILocation(line: 6, column: 1, scope: !5)
+!14 = !DILocation(line: 7, column: 1, scope: !5)
+!15 = !DILocation(line: 8, column: 1, scope: !5)
diff --git a/llvm/test/Transforms/JumpThreading/preserving-debugloc-fold-select.ll b/llvm/test/Transforms/JumpThreading/preserving-debugloc-fold-select.ll
new file mode 100644
index 000000000000..8fdec7210980
--- /dev/null
+++ b/llvm/test/Transforms/JumpThreading/preserving-debugloc-fold-select.ll
@@ -0,0 +1,76 @@
+; RUN: opt < %s -S -passes=jump-threading | FileCheck %s
+
+; Test the debug location update of the newly created PHINode
+; which replaces the select instruction in .exit block.
+
+define i32 @unfold3(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) !dbg !5 {
+; CHECK:       .exit.thread4:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 {{.*}}, !dbg [[DBG29:![0-9]+]]
+; CHECK-NEXT:    ret i32 [[TMP0]], !dbg [[DBG30:![0-9]+]]
+;
+; CHECK: [[DBG29]] = !DILocation(line: 13,
+;
+entry:
+  %add3 = add nsw i32 %j, 2, !dbg !19
+  %cmp.i = icmp slt i32 %u, %v, !dbg !20
+  br i1 %cmp.i, label %.exit, label %cond.false.i, !dbg !21
+
+cond.false.i:                                     ; preds = %entry
+  %cmp4.i = icmp sgt i32 %u, %v, !dbg !22
+  br i1 %cmp4.i, label %.exit, label %cond.false.6.i, !dbg !23
+
+cond.false.6.i:                                   ; preds = %cond.false.i
+  %cmp8.i = icmp slt i32 %w, %x, !dbg !24
+  br i1 %cmp8.i, label %.exit, label %cond.false.10.i, !dbg !25
+
+cond.false.10.i:                                  ; preds = %cond.false.6.i
+  %cmp13.i = icmp sgt i32 %w, %x, !dbg !26
+  br i1 %cmp13.i, label %.exit, label %cond.false.15.i, !dbg !27
+
+cond.false.15.i:                                  ; preds = %cond.false.10.i
+  %phitmp = icmp sge i32 %y, %z, !dbg !28
+  br label %.exit, !dbg !29
+
+.exit:                                            ; preds = %cond.false.15.i, %cond.false.10.i, %cond.false.6.i, %cond.false.i, %entry
+  %cond23.i = phi i1 [ false, %entry ], [ true, %cond.false.i ], [ false, %cond.false.6.i ], [ %phitmp, %cond.false.15.i ], [ true, %cond.false.10.i ], !dbg !30
+  %j.add3 = select i1 %cond23.i, i32 %j, i32 %add3, !dbg !31
+  ret i32 %j.add3, !dbg !32
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "preserving-debugloc-trytofoldselect.ll", directory: "/")
+!2 = !{i32 14}
+!3 = !{i32 8}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "unfold3", linkageName: "unfold3", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !{!9, !11, !13, !14, !15, !16, !17, !18}
+!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10)
+!10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+!11 = !DILocalVariable(name: "2", scope: !5, file: !1, line: 2, type: !12)
+!12 = !DIBasicType(name: "ty8", size: 8, encoding: DW_ATE_unsigned)
+!13 = !DILocalVariable(name: "3", scope: !5, file: !1, line: 4, type: !12)
+!14 = !DILocalVariable(name: "4", scope: !5, file: !1, line: 6, type: !12)
+!15 = !DILocalVariable(name: "5", scope: !5, file: !1, line: 8, type: !12)
+!16 = !DILocalVariable(name: "6", scope: !5, file: !1, line: 10, type: !12)
+!17 = !DILocalVariable(name: "7", scope: !5, file: !1, line: 12, type: !12)
+!18 = !DILocalVariable(name: "8", scope: !5, file: !1, line: 13, type: !10)
+!19 = !DILocation(line: 1, column: 1, scope: !5)
+!20 = !DILocation(line: 2, column: 1, scope: !5)
+!21 = !DILocation(line: 3, column: 1, scope: !5)
+!22 = !DILocation(line: 4, column: 1, scope: !5)
+!23 = !DILocation(line: 5, column: 1, scope: !5)
+!24 = !DILocation(line: 6, column: 1, scope: !5)
+!25 = !DILocation(line: 7, column: 1, scope: !5)
+!26 = !DILocation(line: 8, column: 1, scope: !5)
+!27 = !DILocation(line: 9, column: 1, scope: !5)
+!28 = !DILocation(line: 10, column: 1, scope: !5)
+!29 = !DILocation(line: 11, column: 1, scope: !5)
+!30 = !DILocation(line: 12, column: 1, scope: !5)
+!31 = !DILocation(line: 13, column: 1, scope: !5)
+!32 = !DILocation(line: 14, column: 1, scope: !5)
diff --git a/llvm/test/Transforms/JumpThreading/thread-two-bbs.ll b/llvm/test/Transforms/JumpThreading/thread-two-bbs.ll
index f7e6b2189dc8..09394a946241 100644
--- a/llvm/test/Transforms/JumpThreading/thread-two-bbs.ll
+++ b/llvm/test/Transforms/JumpThreading/thread-two-bbs.ll
@@ -130,8 +130,8 @@ exit:
 }
 
 
-; Verify that we do *not* thread any edge.  We used to evaluate
-; constant expressions like:
+; Verify that we thread the edge correctly.  We used to evaluate constant
+; expressions like:
 ;
 ;   icmp ugt ptr null, inttoptr (i64 4 to ptr)
 ;
@@ -141,16 +141,17 @@ define void @icmp_ult_null_constexpr(ptr %arg1, ptr %arg2) {
 ; CHECK-LABEL: @icmp_ult_null_constexpr(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq ptr [[ARG1:%.*]], null
-; CHECK-NEXT:    br i1 [[CMP1]], label [[BB_BAR1:%.*]], label [[BB_END:%.*]]
-; CHECK:       bb_bar1:
-; CHECK-NEXT:    call void @bar(i32 1)
-; CHECK-NEXT:    br label [[BB_END]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[BB_END_THREAD:%.*]], label [[BB_END:%.*]]
 ; CHECK:       bb_end:
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne ptr [[ARG2:%.*]], null
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[BB_CONT:%.*]], label [[BB_BAR2:%.*]]
+; CHECK:       bb_end.thread:
+; CHECK-NEXT:    call void @bar(i32 1)
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp ne ptr [[ARG2]], null
+; CHECK-NEXT:    br i1 [[CMP21]], label [[BB_EXIT:%.*]], label [[BB_BAR2]]
 ; CHECK:       bb_bar2:
 ; CHECK-NEXT:    call void @bar(i32 2)
-; CHECK-NEXT:    br label [[BB_EXIT:%.*]]
+; CHECK-NEXT:    br label [[BB_EXIT]]
 ; CHECK:       bb_cont:
 ; CHECK-NEXT:    [[CMP3:%.*]] = icmp ult ptr [[ARG1]], inttoptr (i64 4 to ptr)
 ; CHECK-NEXT:    br i1 [[CMP3]], label [[BB_EXIT]], label [[BB_BAR3:%.*]]
diff --git a/llvm/test/Transforms/LICM/debugloc-preserve-fmul-drop-fdiv.ll b/llvm/test/Transforms/LICM/debugloc-preserve-fmul-drop-fdiv.ll
new file mode 100644
index 000000000000..e6e6f077fd6d
--- /dev/null
+++ b/llvm/test/Transforms/LICM/debugloc-preserve-fmul-drop-fdiv.ll
@@ -0,0 +1,66 @@
+; RUN: opt -passes=licm -verify-memoryssa -S < %s | FileCheck %s
+
+; JumpThreading's hoistRegion() replaces the `fdiv` (%v6), of which the second
+; operand (%v) is a loop invariant, with a loop invariant `fdiv` and a `fmul`.
+; This test checks that the debug location propagates to the new `fmul` from
+; the old `fdiv` it replaces in block `loop` and the debug location drop of new
+; `fdiv`, which is hoisted to block `entry` after being created.
+
+define zeroext i1 @invariant_denom(double %v) !dbg !5 {
+; CHECK:       entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = fdiv fast double 1.000000e+00, [[V:%.*]]{{$}}
+; CHECK:       loop:
+; CHECK:         [[TMP1:%.*]] = fmul fast double {{.*}}, !dbg [[DBG12:![0-9]+]]
+; CHECK:       [[DBG12]] = !DILocation(line: 5,
+;
+entry:
+  br label %loop, !dbg !8
+
+loop:                                             ; preds = %loop, %entry
+  %v3 = phi i32 [ 0, %entry ], [ %v11, %loop ], !dbg !9
+  %v4 = phi i32 [ 0, %entry ], [ %v12, %loop ], !dbg !10
+  %v5 = uitofp i32 %v4 to double, !dbg !11
+  %v6 = fdiv fast double %v5, %v, !dbg !12
+  %v7 = fptoui double %v6 to i64, !dbg !13
+  %v8 = and i64 %v7, 1, !dbg !14
+  %v9 = xor i64 %v8, 1, !dbg !15
+  %v10 = trunc i64 %v9 to i32, !dbg !16
+  %v11 = add i32 %v10, %v3, !dbg !17
+  %v12 = add nuw i32 %v4, 1, !dbg !18
+  %v13 = icmp eq i32 %v12, -1, !dbg !19
+  br i1 %v13, label %end, label %loop, !dbg !20
+
+end:                                              ; preds = %loop
+  %v15 = phi i32 [ %v11, %loop ], !dbg !21
+  %v16 = icmp ne i32 %v15, 0, !dbg !22
+  ret i1 %v16, !dbg !23
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "test.ll", directory: "/")
+!2 = !{i32 16}
+!3 = !{i32 0}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "invariant_denom", linkageName: "invariant_denom", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !DILocation(line: 1, column: 1, scope: !5)
+!9 = !DILocation(line: 2, column: 1, scope: !5)
+!10 = !DILocation(line: 3, column: 1, scope: !5)
+!11 = !DILocation(line: 4, column: 1, scope: !5)
+!12 = !DILocation(line: 5, column: 1, scope: !5)
+!13 = !DILocation(line: 6, column: 1, scope: !5)
+!14 = !DILocation(line: 7, column: 1, scope: !5)
+!15 = !DILocation(line: 8, column: 1, scope: !5)
+!16 = !DILocation(line: 9, column: 1, scope: !5)
+!17 = !DILocation(line: 10, column: 1, scope: !5)
+!18 = !DILocation(line: 11, column: 1, scope: !5)
+!19 = !DILocation(line: 12, column: 1, scope: !5)
+!20 = !DILocation(line: 13, column: 1, scope: !5)
+!21 = !DILocation(line: 14, column: 1, scope: !5)
+!22 = !DILocation(line: 15, column: 1, scope: !5)
+!23 = !DILocation(line: 16, column: 1, scope: !5)
diff --git a/llvm/test/Transforms/LoopPredication/invalidate-analyses.ll b/llvm/test/Transforms/LoopPredication/invalidate-analyses.ll
index 7afacd564939..3fc836961f2f 100644
--- a/llvm/test/Transforms/LoopPredication/invalidate-analyses.ll
+++ b/llvm/test/Transforms/LoopPredication/invalidate-analyses.ll
@@ -5,10 +5,10 @@
 ;       please update this test some other analysis that isn't preserved.
 
 ; CHECK: Running analysis: LazyValueAnalysis on drop_a_wc_and_leave_early
-; CHECK: Running pass: LoopPredicationPass on loop
+; CHECK: Running pass: LoopPredicationPass on loop %loop in function drop_a_wc_and_leave_early
 ; CHECK: Invalidating analysis: LazyValueAnalysis on drop_a_wc_and_leave_early
 ; CHECK: Running analysis: LazyValueAnalysis on drop_a_wc_and_leave
-; CHECK: Running pass: LoopPredicationPass on loop
+; CHECK: Running pass: LoopPredicationPass on loop %loop in function drop_a_wc_and_leave
 ; CHECK: Invalidating analysis: LazyValueAnalysis on drop_a_wc_and_leave
 
 
diff --git a/llvm/test/Transforms/LoopRotate/pr35210.ll b/llvm/test/Transforms/LoopRotate/pr35210.ll
index 12df8f5a33e7..c24f5164e532 100644
--- a/llvm/test/Transforms/LoopRotate/pr35210.ll
+++ b/llvm/test/Transforms/LoopRotate/pr35210.ll
@@ -17,7 +17,7 @@
 ; CHECK-NEXT: Running analysis: TargetLibraryAnalysis on f
 ; CHECK-NEXT: Running analysis: ScalarEvolutionAnalysis on f
 ; CHECK-NEXT: Running analysis: InnerAnalysisManagerProxy{{.*}} on f
-; CHECK-NEXT: Running pass: LoopRotatePass on bb
+; CHECK-NEXT: Running pass: LoopRotatePass on loop %bb in function f
 ; CHECK-NEXT: Folding loop latch bb4 into bb
 ; CHECK-NEXT: Invalidating analysis: PostDominatorTreeAnalysis on f
 ; CHECK-NEXT: Running pass: ADCEPass on f
@@ -36,7 +36,7 @@
 ; MSSA-NEXT: Running analysis: TargetLibraryAnalysis on f
 ; MSSA-NEXT: Running analysis: ScalarEvolutionAnalysis on f
 ; MSSA-NEXT: Running analysis: InnerAnalysisManagerProxy{{.*}} on f
-; MSSA-NEXT: Running pass: LoopRotatePass on bb
+; MSSA-NEXT: Running pass: LoopRotatePass on loop %bb in function f
 ; MSSA-NEXT: Folding loop latch bb4 into bb
 ; MSSA-NEXT: Invalidating analysis: PostDominatorTreeAnalysis on f
 ; MSSA-NEXT: Running pass: ADCEPass on f
diff --git a/llvm/test/Transforms/LoopStrengthReduce/pr51329.ll b/llvm/test/Transforms/LoopStrengthReduce/pr51329.ll
index 52a8f1921dae..e4edab1346d3 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/pr51329.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/pr51329.ll
@@ -1,55 +1,55 @@
-; RUN: opt -S -loop-reduce %s | FileCheck %s
-; RUN: opt --try-experimental-debuginfo-iterators -S -loop-reduce %s | FileCheck %s
-;
-; Test that LSR SCEV-based salvaging does not crash when translating SCEVs
-; that contain integers with binary representations greater than 64-bits. 
-; Also show that no salvaging attempt is made for dbg.value that are undef
-; pre-LSR.
-;
-; CHECK: call void @llvm.dbg.value(metadata i64 undef, metadata !{{[0-9]+}}, metadata !DIExpression(DW_OP_plus_uconst, 228, DW_OP_stack_value))
-; CHECK: call void @llvm.dbg.value(metadata i64 %var2, metadata !{{[0-9]+}}, metadata !DIExpression(DW_OP_plus_uconst, 228, DW_OP_stack_value))
-
-
-target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
-declare void @llvm.dbg.value(metadata, metadata, metadata)
-
-; Function Attrs: nounwind
-define hidden void @reproducer() local_unnamed_addr !dbg !5 {
-init:
-  %0 = lshr i128 undef, 64
-  %var1 = trunc i128 %0 to i64
-  %1 = add nuw i64 undef, %var1
-  %var2 = lshr i64 %1, 12
-  br label %Label_d0
-
-Label_d0:                                         ; preds = %Label_d0, %init
-  %var3 = phi i64 [ %var2, %init ], [ %var4, %Label_d0 ]
-  call void @llvm.dbg.value(metadata i64 undef, metadata !11, metadata !DIExpression(DW_OP_plus_uconst, 228, DW_OP_stack_value)), !dbg !12
-  call void @llvm.dbg.value(metadata i64 %var2, metadata !11, metadata !DIExpression(DW_OP_plus_uconst, 228, DW_OP_stack_value)), !dbg !12
-  %var4 = add i64 %var3, -1
-  %var5 = icmp eq i64 %var4, 0
-  br i1 %var5, label %Label_1bc, label %Label_d0
-
-Label_1bc:                                        ; preds = %Label_d0
-  ret void
-}
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "frontend", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
-!1 = !DIFile(filename: "source", directory: "")
-!2 = !{}
-!3 = !{i32 2, !"Debug Info Version", i32 3}
-!4 = !{i32 1, !"wchar_size", i32 4}
-!5 = distinct !DISubprogram(name: "reproducer", scope: !1, file: !1, line: 904320, type: !6, scopeLine: 904320, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !10)
-!6 = !DISubroutineType(types: !7)
-!7 = !{null, !8, !9, !9, !9, !9, !9, !9}
-!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
-!9 = !DIBasicType(name: "my_type", size: 64, encoding: DW_ATE_unsigned)
-!10 = !{!11}
-!11 = !DILocalVariable(name: "my_var", arg: 1, scope: !5, file: !1, line: 904320, type: !8)
+; RUN: opt -S -loop-reduce %s | FileCheck %s
+; RUN: opt --try-experimental-debuginfo-iterators -S -loop-reduce %s | FileCheck %s
+;
+; Test that LSR SCEV-based salvaging does not crash when translating SCEVs
+; that contain integers with binary representations greater than 64-bits.
+; Also show that no salvaging attempt is made for dbg.value that are undef
+; pre-LSR.
+;
+; CHECK: call void @llvm.dbg.value(metadata i64 undef, metadata !{{[0-9]+}}, metadata !DIExpression(DW_OP_plus_uconst, 228, DW_OP_stack_value))
+; CHECK: call void @llvm.dbg.value(metadata i64 %var2, metadata !{{[0-9]+}}, metadata !DIExpression(DW_OP_plus_uconst, 228, DW_OP_stack_value))
+
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+; Function Attrs: nounwind
+define hidden void @reproducer() local_unnamed_addr !dbg !5 {
+init:
+  %0 = lshr i128 undef, 64
+  %var1 = trunc i128 %0 to i64
+  %1 = add nuw i64 undef, %var1
+  %var2 = lshr i64 %1, 12
+  br label %Label_d0
+
+Label_d0:                                         ; preds = %Label_d0, %init
+  %var3 = phi i64 [ %var2, %init ], [ %var4, %Label_d0 ]
+  call void @llvm.dbg.value(metadata i64 undef, metadata !11, metadata !DIExpression(DW_OP_plus_uconst, 228, DW_OP_stack_value)), !dbg !12
+  call void @llvm.dbg.value(metadata i64 %var2, metadata !11, metadata !DIExpression(DW_OP_plus_uconst, 228, DW_OP_stack_value)), !dbg !12
+  %var4 = add i64 %var3, -1
+  %var5 = icmp eq i64 %var4, 0
+  br i1 %var5, label %Label_1bc, label %Label_d0
+
+Label_1bc:                                        ; preds = %Label_d0
+  ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "frontend", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "source", directory: "")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = distinct !DISubprogram(name: "reproducer", scope: !1, file: !1, line: 904320, type: !6, scopeLine: 904320, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !10)
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8, !9, !9, !9, !9, !9, !9}
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
+!9 = !DIBasicType(name: "my_type", size: 64, encoding: DW_ATE_unsigned)
+!10 = !{!11}
+!11 = !DILocalVariable(name: "my_var", arg: 1, scope: !5, file: !1, line: 904320, type: !8)
 !12 = !DILocation(line: 904544, scope: !5)
 \ No newline at end of file
diff --git a/llvm/test/Transforms/LoopUnroll/revisit.ll b/llvm/test/Transforms/LoopUnroll/revisit.ll
index de1f02ac997d..67cb7e52f4f4 100644
--- a/llvm/test/Transforms/LoopUnroll/revisit.ll
+++ b/llvm/test/Transforms/LoopUnroll/revisit.ll
@@ -33,7 +33,7 @@ l0.0.0.ph:
 l0.0.0:
   %cond.0.0.0 = load volatile i1, ptr %ptr
   br i1 %cond.0.0.0, label %l0.0.0, label %l0.0.1.ph
-; CHECK: LoopFullUnrollPass on l0.0.0
+; CHECK: LoopFullUnrollPass on loop %l0.0.0
 ; CHECK-NOT: LoopFullUnrollPass
 
 l0.0.1.ph:
@@ -42,29 +42,29 @@ l0.0.1.ph:
 l0.0.1:
   %cond.0.0.1 = load volatile i1, ptr %ptr
   br i1 %cond.0.0.1, label %l0.0.1, label %l0.0.latch
-; CHECK: LoopFullUnrollPass on l0.0.1
+; CHECK: LoopFullUnrollPass on loop %l0.0.1 in function full_unroll
 ; CHECK-NOT: LoopFullUnrollPass
 
 l0.0.latch:
   %cmp = icmp slt i32 %iv.next, 2
   br i1 %cmp, label %l0.0, label %l0.latch
-; CHECK: LoopFullUnrollPass on l0.0
+; CHECK: LoopFullUnrollPass on loop %l0.0 in function full_unroll
 ; CHECK-NOT: LoopFullUnrollPass
 ;
 ; Unrolling occurs, so we visit what were the inner loops twice over. First we
 ; visit their clones, and then we visit the original loops re-parented.
-; CHECK: LoopFullUnrollPass on l0.0.1.1
+; CHECK: LoopFullUnrollPass on loop %l0.0.1.1 in function full_unroll 
 ; CHECK-NOT: LoopFullUnrollPass
-; CHECK: LoopFullUnrollPass on l0.0.0.1
+; CHECK: LoopFullUnrollPass on loop %l0.0.0.1 in function full_unroll
 ; CHECK-NOT: LoopFullUnrollPass
-; CHECK: LoopFullUnrollPass on l0.0.1
+; CHECK: LoopFullUnrollPass on loop %l0.0.1 in function full_unroll
 ; CHECK-NOT: LoopFullUnrollPass
-; CHECK: LoopFullUnrollPass on l0.0.0
+; CHECK: LoopFullUnrollPass on loop %l0.0.0 in function full_unroll
 ; CHECK-NOT: LoopFullUnrollPass
 
 l0.latch:
   br label %l0
-; CHECK: LoopFullUnrollPass on l0
+; CHECK: LoopFullUnrollPass on loop %l0 in function full_unroll
 ; CHECK-NOT: LoopFullUnrollPass
 
 exit:
diff --git a/llvm/test/Transforms/LoopUnroll/shifted-tripcount.ll b/llvm/test/Transforms/LoopUnroll/shifted-tripcount.ll
index 01ac4b77f2e4..4f3f861d308e 100644
--- a/llvm/test/Transforms/LoopUnroll/shifted-tripcount.ll
+++ b/llvm/test/Transforms/LoopUnroll/shifted-tripcount.ll
@@ -20,8 +20,7 @@ define void @latch_exit(ptr nocapture %p, i64 %n) nounwind {
 ; CHECK-NEXT:    [[TMP16_1]] = add i64 [[I_013]], 2
 ; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP16_1]]
 ; CHECK-NEXT:    [[TMP4_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 8
-; CHECK-NEXT:    [[TMP8_1:%.*]] = load double, ptr [[ARRAYIDX7_1]], align 8
-; CHECK-NEXT:    [[MUL9_1:%.*]] = fmul double [[TMP8_1]], [[TMP4_1]]
+; CHECK-NEXT:    [[MUL9_1:%.*]] = fmul double [[TMP4]], [[TMP4_1]]
 ; CHECK-NEXT:    store double [[MUL9_1]], ptr [[ARRAYIDX7_1]], align 8
 ; CHECK-NEXT:    [[EXITCOND_1:%.*]] = icmp eq i64 [[TMP16_1]], [[MUL10]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_1]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -70,8 +69,7 @@ define void @non_latch_exit(ptr nocapture %p, i64 %n) nounwind {
 ; CHECK-NEXT:    [[TMP16_1]] = add i64 [[I_013]], 2
 ; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP16_1]]
 ; CHECK-NEXT:    [[TMP4_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 8
-; CHECK-NEXT:    [[TMP8_1:%.*]] = load double, ptr [[ARRAYIDX7_1]], align 8
-; CHECK-NEXT:    [[MUL9_1:%.*]] = fmul double [[TMP8_1]], [[TMP4_1]]
+; CHECK-NEXT:    [[MUL9_1:%.*]] = fmul double [[TMP4]], [[TMP4_1]]
 ; CHECK-NEXT:    store double [[MUL9_1]], ptr [[ARRAYIDX7_1]], align 8
 ; CHECK-NEXT:    [[EXITCOND_1:%.*]] = icmp eq i64 [[TMP16_1]], [[MUL10]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_1]], label [[FOR_END:%.*]], label [[LATCH_1]]
diff --git a/llvm/test/Transforms/LoopUnroll/unroll-loads-cse.ll b/llvm/test/Transforms/LoopUnroll/unroll-loads-cse.ll
index 109a1834c302..d4105254e531 100644
--- a/llvm/test/Transforms/LoopUnroll/unroll-loads-cse.ll
+++ b/llvm/test/Transforms/LoopUnroll/unroll-loads-cse.ll
@@ -29,9 +29,7 @@ define void @cse_matching_load_from_previous_unrolled_iteration(ptr %src, ptr no
 ; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[GEP_SRC_12_1:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV_NEXT]]
 ; CHECK-NEXT:    [[L_12_1:%.*]] = load i64, ptr [[GEP_SRC_12_1]], align 8
-; CHECK-NEXT:    [[GEP_SRC_4_1:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV_NEXT]]
-; CHECK-NEXT:    [[L_4_1:%.*]] = load i64, ptr [[GEP_SRC_4_1]], align 8
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_12_1]], [[L_4_1]]
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_12_1]], [[L_12]]
 ; CHECK-NEXT:    [[GEP_DST_1:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_NEXT]]
 ; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[GEP_DST_1]], align 8
 ; CHECK-NEXT:    [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
@@ -425,8 +423,7 @@ define void @loop_body_with_dead_blocks(ptr %src) {
 ; CHECK:       loop.header.1:
 ; CHECK-NEXT:    br label [[LOOP_BB_1:%.*]]
 ; CHECK:       loop.bb.1:
-; CHECK-NEXT:    [[L_1_1:%.*]] = load i32, ptr [[SRC]], align 8
-; CHECK-NEXT:    [[C_1_1:%.*]] = icmp eq i32 [[L_1_1]], 0
+; CHECK-NEXT:    [[C_1_1:%.*]] = icmp eq i32 [[L_2]], 0
 ; CHECK-NEXT:    br i1 [[C_1_1]], label [[OUTER_HEADER_LOOPEXIT]], label [[LOOP_LATCH_1:%.*]]
 ; CHECK:       loop.latch.1:
 ; CHECK-NEXT:    call void @foo()
diff --git a/llvm/test/Transforms/LoopUnroll/unroll-remove-redundant-dbg.ll b/llvm/test/Transforms/LoopUnroll/unroll-remove-redundant-dbg.ll
index 66cd4d454443..8e348281dc61 100644
--- a/llvm/test/Transforms/LoopUnroll/unroll-remove-redundant-dbg.ll
+++ b/llvm/test/Transforms/LoopUnroll/unroll-remove-redundant-dbg.ll
@@ -1,14 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -S -passes=loop-unroll | FileCheck %s
 
 define i64 @d(i1 %tobool.not, i32 %add, i64 %conv23) !dbg !14{
+; There should be only one "llvm.dbg.vale" after loop unrolling
+; CHECK-LABEL: @d(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    tail call void @llvm.dbg.value(metadata i32 0, metadata [[META16:![0-9]+]], metadata !DIExpression()), !dbg [[DBG17:![0-9]+]]
+; CHECK-NEXT:    ret i64 5
+;
 entry:
   br label %for.body
 
 for.body:                                         ; preds = %for.body, %entry
-  ; There should be only one "llvm.dbg.vale" after loop unrolling
-  ; CHECK: call void @llvm.dbg.value
-  ; CHECK-NOT: call void @llvm.dbg.value
-
   %k.045 = phi i64 [ 0, %entry ], [ %k.046, %for.body ]
   tail call void @llvm.dbg.value(metadata i32 0, metadata !13, metadata !DIExpression()), !dbg !17
   %k.046 = add nuw nsw i64 %k.045, 1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
index 9969f881063c..6b4cfa091c45 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -5,8 +5,8 @@
 target triple = "aarch64-unknown-linux-gnu"
 
 ; CHECK-COST: Checking a loop in 'fixed_width'
-; CHECK-COST: Found an estimated cost of 14 for VF 2 For instruction:   store i32 2, ptr %arrayidx1, align 4
-; CHECK-COST: Found an estimated cost of 28 for VF 4 For instruction:   store i32 2, ptr %arrayidx1, align 4
+; CHECK-COST: Found an estimated cost of 10 for VF 2 For instruction:   store i32 2, ptr %arrayidx1, align 4
+; CHECK-COST: Found an estimated cost of 20 for VF 4 For instruction:   store i32 2, ptr %arrayidx1, align 4
 ; CHECK-COST: Selecting VF: 1.
 
 ; We should decide this loop is not worth vectorising using fixed width vectors
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
index 1c26ee8479e5..2470bca1e17b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
@@ -8,39 +8,41 @@ target triple = "aarch64-linux-gnu"
 define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; CHECK-VF4IC1-LABEL: @select_const_i32_from_icmp
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[NOT:%*]] = xor <vscale x 4 x i1> [[VEC_ICMP]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[VEC_SEL]])
+; CHECK-VF4IC1-NEXT:   [[FR:%.*]] = freeze i1 [[OR_RDX]]
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR]], i32 7, i32 3
 
 ; CHECK-VF4IC4-LABEL: @select_const_i32_from_icmp
 ; CHECK-VF4IC4:      vector.body:
-; CHECK-VF4IC4:        [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
-; CHECK-VF4IC4:        [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
-; CHECK-VF4IC4:        [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
-; CHECK-VF4IC4:        [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI2:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI3:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
 ; CHECK-VF4IC4:        [[VEC_ICMP1:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-VF4IC4-NEXT:   [[VEC_ICMP2:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-VF4IC4-NEXT:   [[VEC_ICMP3:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-VF4IC4-NEXT:   [[VEC_ICMP4:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL1]] = select <vscale x 4 x i1> [[VEC_ICMP1]], <vscale x 4 x i32> [[VEC_PHI1]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL2]] = select <vscale x 4 x i1> [[VEC_ICMP2]], <vscale x 4 x i32> [[VEC_PHI2]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL3]] = select <vscale x 4 x i1> [[VEC_ICMP3]], <vscale x 4 x i32> [[VEC_PHI3]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL4]] = select <vscale x 4 x i1> [[VEC_ICMP4]], <vscale x 4 x i32> [[VEC_PHI4]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:    [[NOT1:%.*]] = xor <vscale x 4 x i1> [[VEC_ICMP1]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:    [[NOT2:%.*]] = xor <vscale x 4 x i1> [[VEC_ICMP2]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:    [[NOT3:%.*]] = xor <vscale x 4 x i1> [[VEC_ICMP3]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:    [[NOT4:%.*]] = xor <vscale x 4 x i1> [[VEC_ICMP4]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL1:%.*]] = or <vscale x 4 x i1> [[VEC_PHI1]], [[NOT1]]
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL2:%.*]] = or <vscale x 4 x i1> [[VEC_PHI2]], [[NOT2]]
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL3:%.*]] = or <vscale x 4 x i1> [[VEC_PHI3]], [[NOT3]]
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL4:%.*]] = or <vscale x 4 x i1> [[VEC_PHI4]], [[NOT4]]
 ; CHECK-VF4IC4:      middle.block:
-; CHECK-VF4IC4-NEXT:   [[VEC_ICMP5:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL1]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL5:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP5]], <vscale x 4 x i32> [[VEC_SEL1]], <vscale x 4 x i32> [[VEC_SEL2]]
-; CHECK-VF4IC4-NEXT:   [[VEC_ICMP6:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL6:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP6]], <vscale x 4 x i32> [[VEC_SEL5]], <vscale x 4 x i32> [[VEC_SEL3]]
-; CHECK-VF4IC4-NEXT:   [[VEC_ICMP7:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL6]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL7:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP7]], <vscale x 4 x i32> [[VEC_SEL6]], <vscale x 4 x i32> [[VEC_SEL4]]
-; CHECK-VF4IC4-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
-; CHECK-VF4IC4-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+; CHECK-VF4IC4-NEXT:   [[OR1:%.*]] = or <vscale x 4 x i1> [[VEC_SEL2]], [[VEC_SEL1]]
+; CHECK-VF4IC4-NEXT:   [[OR2:%.*]] = or <vscale x 4 x i1> [[VEC_SEL3]], [[OR1]]
+; CHECK-VF4IC4-NEXT:   [[OR3:%.*]] = or <vscale x 4 x i1> [[VEC_SEL4]], [[OR2]]
+; CHECK-VF4IC4-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[OR3]])
+; CHECK-VF4IC4-NEXT:   [[FR:%.*]] = freeze i1 [[OR_RDX]]
+; CHECK-VF4IC4-NEXT:   {{.*}} = select i1 [[FR]], i32 7, i32 3
 entry:
   br label %for.body
 
@@ -62,21 +64,18 @@ exit:                                     ; preds = %for.body
 define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 {
 ; CHECK-VF4IC1-LABEL: @select_i32_from_icmp
 ; CHECK-VF4IC1:      vector.ph:
-; CHECK-VF4IC1:        [[TMP1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %a, i64 0
-; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_A:%.*]] = shufflevector <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT:   [[TMP2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %b, i64 0
-; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_B:%.*]] = shufflevector <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4IC1-NOT:    shufflevector <vscale x 4 x i32>
+; CHECK-VF4IC1-NOT:    shufflevector <vscale x 4 x i32>
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[SPLAT_OF_B]]
+; CHECK-VF4IC1-NEXT:   [[NOT:%*]] = xor <vscale x 4 x i1> [[VEC_ICMP]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[FIN_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %a, i64 0
-; CHECK-VF4IC1-NEXT:   [[FIN_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[FIN_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT:   [[FIN_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_CMP]])
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[VEC_SEL]])
+; CHECK-VF4IC1-NEXT:   [[FR:%.*]] = freeze i1 [[OR_RDX]]
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR]], i32 %b, i32 %a
 
 ; CHECK-VF4IC4-LABEL: @select_i32_from_icmp
 ; CHECK-VF4IC4:      vector.body:
@@ -101,14 +100,15 @@ exit:                                     ; preds = %for.body
 define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; CHECK-VF4IC1-LABEL: @select_const_i32_from_fcmp
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x float>
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = fcmp fast ueq <vscale x 4 x float> [[VEC_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[NOT:%*]] = xor <vscale x 4 x i1> [[VEC_ICMP]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[VEC_SEL]])
+; CHECK-VF4IC1-NEXT:   [[FR:%.*]] = freeze i1 [[OR_RDX]]
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR]], i32 1, i32 2
 
 ; CHECK-VF4IC4-LABEL: @select_const_i32_from_fcmp
 ; CHECK-VF4IC4:      vector.body:
@@ -156,17 +156,17 @@ exit:                                     ; preds = %for.body
 define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) #0 {
 ; CHECK-VF4IC1-LABEL: @pred_select_const_i32_from_icmp
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
 ; CHECK-VF4IC1:        [[MASK:%.*]] = icmp sgt <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-VF4IC1:        [[MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr {{%.*}}, i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> poison)
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL_TMP:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-VF4IC1:        [[VEC_SEL:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> [[VEC_SEL_TMP]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL_TMP:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[VEC_ICMP]]
+; CHECK-VF4IC1:        [[VEC_SEL:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i1> [[VEC_SEL_TMP]], <vscale x 4 x i1> [[VEC_PHI]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], zeroinitializer
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 1, i32 0
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[VEC_SEL]])
+; CHECK-VF4IC1-NEXT:   [[FR:%.*]] = freeze i1 [[OR_RDX]]
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR]], i32 1, i32 0
 
 ; CHECK-VF4IC4-LABEL: @pred_select_const_i32_from_icmp
 ; CHECK-VF4IC4:      vector.body:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index 6fa197591ab3..576dc0833fa3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -1,36 +1,48 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=off -mtriple=riscv64 -mattr=+v -S | FileCheck %s --check-prefix=FIXED
+; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -mtriple=riscv64 -mattr=+v -S | FileCheck %s --check-prefix=SCALABLE
 
 define void @load_store_factor2_i32(ptr %p) {
 ; CHECK-LABEL: @load_store_factor2_i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP6]], i32 -1
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; CHECK-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 -1
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP15]])
+; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -50,6 +62,112 @@ define void @load_store_factor2_i32(ptr %p) {
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
+; FIXED-LABEL: @load_store_factor2_i32(
+; FIXED-NEXT:  entry:
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED:       vector.ph:
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED:       vector.body:
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4
+; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; FIXED-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; FIXED-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
+; FIXED-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP5]]
+; FIXED-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; FIXED-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP6]], i32 -1
+; FIXED-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; FIXED-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FIXED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; FIXED:       middle.block:
+; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED:       scalar.ph:
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED:       loop:
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; FIXED-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
+; FIXED-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
+; FIXED-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
+; FIXED-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
+; FIXED-NEXT:    store i32 [[Y1]], ptr [[Q1]], align 4
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; FIXED:       exit:
+; FIXED-NEXT:    ret void
+;
+; SCALABLE-LABEL: @load_store_factor2_i32(
+; SCALABLE-NEXT:  entry:
+; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SCALABLE:       vector.body:
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = add i64 [[TMP7]], 1
+; SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP13]]
+; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 -1
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP15]])
+; SCALABLE-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 4
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SCALABLE:       middle.block:
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE:       scalar.ph:
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE:       loop:
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
+; SCALABLE-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
+; SCALABLE-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
+; SCALABLE-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
+; SCALABLE-NEXT:    store i32 [[Y1]], ptr [[Q1]], align 4
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; SCALABLE:       exit:
+; SCALABLE-NEXT:    ret void
+;
 entry:
   br label %loop
 loop:
@@ -77,33 +195,43 @@ exit:
 define void @load_store_factor2_i64(ptr %p) {
 ; CHECK-LABEL: @load_store_factor2_i64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -1
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i64, ptr [[TMP14]], i32 -1
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP15]])
+; CHECK-NEXT:    store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -123,6 +251,112 @@ define void @load_store_factor2_i64(ptr %p) {
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
+; FIXED-LABEL: @load_store_factor2_i64(
+; FIXED-NEXT:  entry:
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED:       vector.ph:
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED:       vector.body:
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 8
+; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; FIXED-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
+; FIXED-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]]
+; FIXED-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
+; FIXED-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -1
+; FIXED-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; FIXED-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; FIXED-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; FIXED:       middle.block:
+; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED:       scalar.ph:
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED:       loop:
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; FIXED-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
+; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
+; FIXED-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
+; FIXED-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
+; FIXED-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; FIXED:       exit:
+; FIXED-NEXT:    ret void
+;
+; SCALABLE-LABEL: @load_store_factor2_i64(
+; SCALABLE-NEXT:  entry:
+; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SCALABLE:       vector.body:
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[TMP9]], align 8
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = add i64 [[TMP7]], 1
+; SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP13]]
+; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP16:%.*]] = getelementptr i64, ptr [[TMP14]], i32 -1
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP15]])
+; SCALABLE-NEXT:    store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; SCALABLE:       middle.block:
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE:       scalar.ph:
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE:       loop:
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
+; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
+; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
+; SCALABLE-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
+; SCALABLE-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; SCALABLE:       exit:
+; SCALABLE-NEXT:    ret void
+;
 entry:
   br label %loop
 loop:
@@ -206,6 +440,122 @@ define void @load_store_factor3_i32(ptr %p) {
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
+; FIXED-LABEL: @load_store_factor3_i32(
+; FIXED-NEXT:  entry:
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED:       vector.ph:
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED:       vector.body:
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <6 x i32>, ptr [[TMP3]], align 4
+; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <6 x i32> [[WIDE_VEC]], <6 x i32> poison, <2 x i32> <i32 0, i32 3>
+; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <6 x i32> [[WIDE_VEC]], <6 x i32> poison, <2 x i32> <i32 1, i32 4>
+; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <6 x i32> [[WIDE_VEC]], <6 x i32> poison, <2 x i32> <i32 2, i32 5>
+; FIXED-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[STRIDED_VEC]], <i32 1, i32 1>
+; FIXED-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
+; FIXED-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[STRIDED_VEC1]], <i32 2, i32 2>
+; FIXED-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], 1
+; FIXED-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP7]]
+; FIXED-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[STRIDED_VEC2]], <i32 3, i32 3>
+; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 -2
+; FIXED-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; FIXED-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; FIXED-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; FIXED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <6 x i32> [[TMP13]], <6 x i32> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; FIXED-NEXT:    store <6 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; FIXED:       middle.block:
+; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED:       scalar.ph:
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED:       loop:
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; FIXED-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
+; FIXED-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
+; FIXED-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
+; FIXED-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
+; FIXED-NEXT:    store i32 [[Y1]], ptr [[Q1]], align 4
+; FIXED-NEXT:    [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
+; FIXED-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
+; FIXED-NEXT:    [[X2:%.*]] = load i32, ptr [[Q2]], align 4
+; FIXED-NEXT:    [[Y2:%.*]] = add i32 [[X2]], 3
+; FIXED-NEXT:    store i32 [[Y2]], ptr [[Q2]], align 4
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; FIXED:       exit:
+; FIXED-NEXT:    ret void
+;
+; SCALABLE-LABEL: @load_store_factor3_i32(
+; SCALABLE-NEXT:  entry:
+; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SCALABLE:       vector.body:
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; SCALABLE-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
+; SCALABLE-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <6 x i32>, ptr [[TMP3]], align 4
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <6 x i32> [[WIDE_VEC]], <6 x i32> poison, <2 x i32> <i32 0, i32 3>
+; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <6 x i32> [[WIDE_VEC]], <6 x i32> poison, <2 x i32> <i32 1, i32 4>
+; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <6 x i32> [[WIDE_VEC]], <6 x i32> poison, <2 x i32> <i32 2, i32 5>
+; SCALABLE-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[STRIDED_VEC]], <i32 1, i32 1>
+; SCALABLE-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[STRIDED_VEC1]], <i32 2, i32 2>
+; SCALABLE-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], 1
+; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[STRIDED_VEC2]], <i32 3, i32 3>
+; SCALABLE-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 -2
+; SCALABLE-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SCALABLE-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SCALABLE-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <6 x i32> [[TMP13]], <6 x i32> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; SCALABLE-NEXT:    store <6 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; SCALABLE:       middle.block:
+; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE:       scalar.ph:
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE:       loop:
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[Y0:%.*]] = add i32 [[X0]], 1
+; SCALABLE-NEXT:    store i32 [[Y0]], ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
+; SCALABLE-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
+; SCALABLE-NEXT:    [[Y1:%.*]] = add i32 [[X1]], 2
+; SCALABLE-NEXT:    store i32 [[Y1]], ptr [[Q1]], align 4
+; SCALABLE-NEXT:    [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
+; SCALABLE-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
+; SCALABLE-NEXT:    [[X2:%.*]] = load i32, ptr [[Q2]], align 4
+; SCALABLE-NEXT:    [[Y2:%.*]] = add i32 [[X2]], 3
+; SCALABLE-NEXT:    store i32 [[Y2]], ptr [[Q2]], align 4
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; SCALABLE:       exit:
+; SCALABLE-NEXT:    ret void
+;
 entry:
   br label %loop
 loop:
@@ -248,37 +598,37 @@ define void @load_store_factor3_i64(ptr %p) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = add <vscale x 2 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP8]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[P:%.*]], <vscale x 2 x i64> [[TMP10]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP11]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP12]], <vscale x 2 x ptr> [[TMP11]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP13]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP14]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER1]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP15]], <vscale x 2 x ptr> [[TMP14]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP13]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP16]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP17]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; CHECK-NEXT:    [[TMP18:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER2]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP18]], <vscale x 2 x ptr> [[TMP17]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]]
+; CHECK-NEXT:    [[TMP12:%.*]] = mul <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[P:%.*]], <vscale x 2 x i64> [[TMP12]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP13]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x ptr> [[TMP13]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP12]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP15]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP16]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP17:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER1]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP17]], <vscale x 2 x ptr> [[TMP16]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP18:%.*]] = add <vscale x 2 x i64> [[TMP15]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP18]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP19]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER2]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP20]], <vscale x 2 x ptr> [[TMP19]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -311,6 +661,135 @@ define void @load_store_factor3_i64(ptr %p) {
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
+; FIXED-LABEL: @load_store_factor3_i64(
+; FIXED-NEXT:  entry:
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED:       vector.ph:
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED:       vector.body:
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = mul <4 x i64> [[VEC_IND]], <i64 3, i64 3, i64 3, i64 3>
+; FIXED-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], <4 x i64> [[TMP0]]
+; FIXED-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP1]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
+; FIXED-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[WIDE_MASKED_GATHER]], <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT:    call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP2]], <4 x ptr> [[TMP1]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; FIXED-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP0]], <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P]], <4 x i64> [[TMP3]]
+; FIXED-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP4]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
+; FIXED-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[WIDE_MASKED_GATHER1]], <i64 2, i64 2, i64 2, i64 2>
+; FIXED-NEXT:    call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP5]], <4 x ptr> [[TMP4]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; FIXED-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[P]], <4 x i64> [[TMP6]]
+; FIXED-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP7]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
+; FIXED-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[WIDE_MASKED_GATHER2]], <i64 3, i64 3, i64 3, i64 3>
+; FIXED-NEXT:    call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP8]], <4 x ptr> [[TMP7]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; FIXED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; FIXED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; FIXED:       middle.block:
+; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED:       scalar.ph:
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED:       loop:
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; FIXED-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
+; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
+; FIXED-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
+; FIXED-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
+; FIXED-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
+; FIXED-NEXT:    [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
+; FIXED-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
+; FIXED-NEXT:    [[X2:%.*]] = load i64, ptr [[Q2]], align 8
+; FIXED-NEXT:    [[Y2:%.*]] = add i64 [[X2]], 3
+; FIXED-NEXT:    store i64 [[Y2]], ptr [[Q2]], align 8
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; FIXED:       exit:
+; FIXED-NEXT:    ret void
+;
+; SCALABLE-LABEL: @load_store_factor3_i64(
+; SCALABLE-NEXT:  entry:
+; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; SCALABLE-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; SCALABLE-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; SCALABLE-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SCALABLE:       vector.body:
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP12:%.*]] = mul <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[P:%.*]], <vscale x 2 x i64> [[TMP12]]
+; SCALABLE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP13]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; SCALABLE-NEXT:    [[TMP14:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x ptr> [[TMP13]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP12]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP16:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP15]]
+; SCALABLE-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP16]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; SCALABLE-NEXT:    [[TMP17:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER1]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP17]], <vscale x 2 x ptr> [[TMP16]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SCALABLE-NEXT:    [[TMP18:%.*]] = add <vscale x 2 x i64> [[TMP15]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP19:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP18]]
+; SCALABLE-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP19]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; SCALABLE-NEXT:    [[TMP20:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER2]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP20]], <vscale x 2 x ptr> [[TMP19]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; SCALABLE-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALABLE:       middle.block:
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE:       scalar.ph:
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE:       loop:
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
+; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
+; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
+; SCALABLE-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
+; SCALABLE-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
+; SCALABLE-NEXT:    [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
+; SCALABLE-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
+; SCALABLE-NEXT:    [[X2:%.*]] = load i64, ptr [[Q2]], align 8
+; SCALABLE-NEXT:    [[Y2:%.*]] = add i64 [[X2]], 3
+; SCALABLE-NEXT:    store i64 [[Y2]], ptr [[Q2]], align 8
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; SCALABLE:       exit:
+; SCALABLE-NEXT:    ret void
+;
 entry:
   br label %loop
 loop:
@@ -353,62 +832,62 @@ define void @load_store_factor8(ptr %p) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP35:%.*]] = mul i64 [[TMP34]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = add <vscale x 2 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP8]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[P:%.*]], <vscale x 2 x i64> [[TMP10]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP11]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP12]], <vscale x 2 x ptr> [[TMP11]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP13]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP14]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER1]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP15]], <vscale x 2 x ptr> [[TMP14]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP13]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP16]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP17]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; CHECK-NEXT:    [[TMP18:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER2]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP18]], <vscale x 2 x ptr> [[TMP17]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP19:%.*]] = add <vscale x 2 x i64> [[TMP16]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP19]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP20]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; CHECK-NEXT:    [[TMP21:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER3]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 4, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP21]], <vscale x 2 x ptr> [[TMP20]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP22:%.*]] = add <vscale x 2 x i64> [[TMP19]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP22]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP23]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; CHECK-NEXT:    [[TMP24:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER4]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 5, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP24]], <vscale x 2 x ptr> [[TMP23]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP25:%.*]] = add <vscale x 2 x i64> [[TMP22]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP25]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP26]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; CHECK-NEXT:    [[TMP27:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 6, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP27]], <vscale x 2 x ptr> [[TMP26]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP28:%.*]] = add <vscale x 2 x i64> [[TMP25]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP28]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP29]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; CHECK-NEXT:    [[TMP30:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER6]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 7, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP30]], <vscale x 2 x ptr> [[TMP29]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP31:%.*]] = add <vscale x 2 x i64> [[TMP28]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP31]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP32]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; CHECK-NEXT:    [[TMP33:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 8, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP33]], <vscale x 2 x ptr> [[TMP32]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP35]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shl <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[P:%.*]], <vscale x 2 x i64> [[TMP12]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP13]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x ptr> [[TMP13]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP12]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP15]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP16]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP17:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER1]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP17]], <vscale x 2 x ptr> [[TMP16]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP18:%.*]] = add <vscale x 2 x i64> [[TMP15]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP18]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP19]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER2]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP20]], <vscale x 2 x ptr> [[TMP19]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP21:%.*]] = add <vscale x 2 x i64> [[TMP18]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP21]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP22]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP23:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER3]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 4, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP23]], <vscale x 2 x ptr> [[TMP22]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP24:%.*]] = add <vscale x 2 x i64> [[TMP21]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP24]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP25]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP26:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER4]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 5, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP26]], <vscale x 2 x ptr> [[TMP25]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP27:%.*]] = add <vscale x 2 x i64> [[TMP24]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP27]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP29:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 6, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP29]], <vscale x 2 x ptr> [[TMP28]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP30:%.*]] = add <vscale x 2 x i64> [[TMP27]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP30]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP31]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP32:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER6]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 7, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP32]], <vscale x 2 x ptr> [[TMP31]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP33:%.*]] = add <vscale x 2 x i64> [[TMP30]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP33]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP34]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP35:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 8, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP35]], <vscale x 2 x ptr> [[TMP34]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -466,6 +945,235 @@ define void @load_store_factor8(ptr %p) {
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
+; FIXED-LABEL: @load_store_factor8(
+; FIXED-NEXT:  entry:
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED:       vector.ph:
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED:       vector.body:
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = shl <4 x i64> [[VEC_IND]], <i64 3, i64 3, i64 3, i64 3>
+; FIXED-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], <4 x i64> [[TMP0]]
+; FIXED-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP1]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
+; FIXED-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[WIDE_MASKED_GATHER]], <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT:    call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP2]], <4 x ptr> [[TMP1]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; FIXED-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP0]], <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P]], <4 x i64> [[TMP3]]
+; FIXED-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP4]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
+; FIXED-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[WIDE_MASKED_GATHER1]], <i64 2, i64 2, i64 2, i64 2>
+; FIXED-NEXT:    call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP5]], <4 x ptr> [[TMP4]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; FIXED-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP3]], <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[P]], <4 x i64> [[TMP6]]
+; FIXED-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP7]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
+; FIXED-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[WIDE_MASKED_GATHER2]], <i64 3, i64 3, i64 3, i64 3>
+; FIXED-NEXT:    call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP8]], <4 x ptr> [[TMP7]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; FIXED-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[TMP6]], <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[P]], <4 x i64> [[TMP9]]
+; FIXED-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP10]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
+; FIXED-NEXT:    [[TMP11:%.*]] = add <4 x i64> [[WIDE_MASKED_GATHER3]], <i64 4, i64 4, i64 4, i64 4>
+; FIXED-NEXT:    call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP11]], <4 x ptr> [[TMP10]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; FIXED-NEXT:    [[TMP12:%.*]] = add <4 x i64> [[TMP9]], <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[P]], <4 x i64> [[TMP12]]
+; FIXED-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP13]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
+; FIXED-NEXT:    [[TMP14:%.*]] = add <4 x i64> [[WIDE_MASKED_GATHER4]], <i64 5, i64 5, i64 5, i64 5>
+; FIXED-NEXT:    call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP14]], <4 x ptr> [[TMP13]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; FIXED-NEXT:    [[TMP15:%.*]] = add <4 x i64> [[TMP12]], <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT:    [[TMP16:%.*]] = getelementptr i64, ptr [[P]], <4 x i64> [[TMP15]]
+; FIXED-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP16]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
+; FIXED-NEXT:    [[TMP17:%.*]] = add <4 x i64> [[WIDE_MASKED_GATHER5]], <i64 6, i64 6, i64 6, i64 6>
+; FIXED-NEXT:    call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP17]], <4 x ptr> [[TMP16]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; FIXED-NEXT:    [[TMP18:%.*]] = add <4 x i64> [[TMP15]], <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT:    [[TMP19:%.*]] = getelementptr i64, ptr [[P]], <4 x i64> [[TMP18]]
+; FIXED-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP19]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
+; FIXED-NEXT:    [[TMP20:%.*]] = add <4 x i64> [[WIDE_MASKED_GATHER6]], <i64 7, i64 7, i64 7, i64 7>
+; FIXED-NEXT:    call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP20]], <4 x ptr> [[TMP19]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; FIXED-NEXT:    [[TMP21:%.*]] = add <4 x i64> [[TMP18]], <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT:    [[TMP22:%.*]] = getelementptr i64, ptr [[P]], <4 x i64> [[TMP21]]
+; FIXED-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP22]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
+; FIXED-NEXT:    [[TMP23:%.*]] = add <4 x i64> [[WIDE_MASKED_GATHER7]], <i64 8, i64 8, i64 8, i64 8>
+; FIXED-NEXT:    call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP23]], <4 x ptr> [[TMP22]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; FIXED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; FIXED-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; FIXED:       middle.block:
+; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED:       scalar.ph:
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED:       loop:
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 3
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; FIXED-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
+; FIXED-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
+; FIXED-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
+; FIXED-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
+; FIXED-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
+; FIXED-NEXT:    [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
+; FIXED-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
+; FIXED-NEXT:    [[X2:%.*]] = load i64, ptr [[Q2]], align 8
+; FIXED-NEXT:    [[Y2:%.*]] = add i64 [[X2]], 3
+; FIXED-NEXT:    store i64 [[Y2]], ptr [[Q2]], align 8
+; FIXED-NEXT:    [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
+; FIXED-NEXT:    [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
+; FIXED-NEXT:    [[X3:%.*]] = load i64, ptr [[Q3]], align 8
+; FIXED-NEXT:    [[Y3:%.*]] = add i64 [[X3]], 4
+; FIXED-NEXT:    store i64 [[Y3]], ptr [[Q3]], align 8
+; FIXED-NEXT:    [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1
+; FIXED-NEXT:    [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]]
+; FIXED-NEXT:    [[X4:%.*]] = load i64, ptr [[Q4]], align 8
+; FIXED-NEXT:    [[Y4:%.*]] = add i64 [[X4]], 5
+; FIXED-NEXT:    store i64 [[Y4]], ptr [[Q4]], align 8
+; FIXED-NEXT:    [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1
+; FIXED-NEXT:    [[Q5:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET5]]
+; FIXED-NEXT:    [[X5:%.*]] = load i64, ptr [[Q5]], align 8
+; FIXED-NEXT:    [[Y5:%.*]] = add i64 [[X5]], 6
+; FIXED-NEXT:    store i64 [[Y5]], ptr [[Q5]], align 8
+; FIXED-NEXT:    [[OFFSET6:%.*]] = add i64 [[OFFSET5]], 1
+; FIXED-NEXT:    [[Q6:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET6]]
+; FIXED-NEXT:    [[X6:%.*]] = load i64, ptr [[Q6]], align 8
+; FIXED-NEXT:    [[Y6:%.*]] = add i64 [[X6]], 7
+; FIXED-NEXT:    store i64 [[Y6]], ptr [[Q6]], align 8
+; FIXED-NEXT:    [[OFFSET7:%.*]] = add i64 [[OFFSET6]], 1
+; FIXED-NEXT:    [[Q7:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET7]]
+; FIXED-NEXT:    [[X7:%.*]] = load i64, ptr [[Q7]], align 8
+; FIXED-NEXT:    [[Y7:%.*]] = add i64 [[X7]], 8
+; FIXED-NEXT:    store i64 [[Y7]], ptr [[Q7]], align 8
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; FIXED:       exit:
+; FIXED-NEXT:    ret void
+;
+; SCALABLE-LABEL: @load_store_factor8(
+; SCALABLE-NEXT:  entry:
+; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; SCALABLE-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; SCALABLE-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; SCALABLE-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SCALABLE:       vector.body:
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP12:%.*]] = shl <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[P:%.*]], <vscale x 2 x i64> [[TMP12]]
+; SCALABLE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP13]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; SCALABLE-NEXT:    [[TMP14:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x ptr> [[TMP13]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP12]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP16:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP15]]
+; SCALABLE-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP16]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; SCALABLE-NEXT:    [[TMP17:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER1]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP17]], <vscale x 2 x ptr> [[TMP16]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SCALABLE-NEXT:    [[TMP18:%.*]] = add <vscale x 2 x i64> [[TMP15]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP19:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP18]]
+; SCALABLE-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP19]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; SCALABLE-NEXT:    [[TMP20:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER2]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP20]], <vscale x 2 x ptr> [[TMP19]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SCALABLE-NEXT:    [[TMP21:%.*]] = add <vscale x 2 x i64> [[TMP18]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP22:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP21]]
+; SCALABLE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP22]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; SCALABLE-NEXT:    [[TMP23:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER3]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 4, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP23]], <vscale x 2 x ptr> [[TMP22]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SCALABLE-NEXT:    [[TMP24:%.*]] = add <vscale x 2 x i64> [[TMP21]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP25:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP24]]
+; SCALABLE-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP25]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; SCALABLE-NEXT:    [[TMP26:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER4]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 5, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP26]], <vscale x 2 x ptr> [[TMP25]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SCALABLE-NEXT:    [[TMP27:%.*]] = add <vscale x 2 x i64> [[TMP24]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP28:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP27]]
+; SCALABLE-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; SCALABLE-NEXT:    [[TMP29:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 6, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP29]], <vscale x 2 x ptr> [[TMP28]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SCALABLE-NEXT:    [[TMP30:%.*]] = add <vscale x 2 x i64> [[TMP27]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP31:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP30]]
+; SCALABLE-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP31]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; SCALABLE-NEXT:    [[TMP32:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER6]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 7, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP32]], <vscale x 2 x ptr> [[TMP31]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SCALABLE-NEXT:    [[TMP33:%.*]] = add <vscale x 2 x i64> [[TMP30]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP34:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP33]]
+; SCALABLE-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP34]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; SCALABLE-NEXT:    [[TMP35:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 8, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP35]], <vscale x 2 x ptr> [[TMP34]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; SCALABLE-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SCALABLE:       middle.block:
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE:       scalar.ph:
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE:       loop:
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 3
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
+; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
+; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
+; SCALABLE-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
+; SCALABLE-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
+; SCALABLE-NEXT:    [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
+; SCALABLE-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
+; SCALABLE-NEXT:    [[X2:%.*]] = load i64, ptr [[Q2]], align 8
+; SCALABLE-NEXT:    [[Y2:%.*]] = add i64 [[X2]], 3
+; SCALABLE-NEXT:    store i64 [[Y2]], ptr [[Q2]], align 8
+; SCALABLE-NEXT:    [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
+; SCALABLE-NEXT:    [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
+; SCALABLE-NEXT:    [[X3:%.*]] = load i64, ptr [[Q3]], align 8
+; SCALABLE-NEXT:    [[Y3:%.*]] = add i64 [[X3]], 4
+; SCALABLE-NEXT:    store i64 [[Y3]], ptr [[Q3]], align 8
+; SCALABLE-NEXT:    [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1
+; SCALABLE-NEXT:    [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]]
+; SCALABLE-NEXT:    [[X4:%.*]] = load i64, ptr [[Q4]], align 8
+; SCALABLE-NEXT:    [[Y4:%.*]] = add i64 [[X4]], 5
+; SCALABLE-NEXT:    store i64 [[Y4]], ptr [[Q4]], align 8
+; SCALABLE-NEXT:    [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1
+; SCALABLE-NEXT:    [[Q5:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET5]]
+; SCALABLE-NEXT:    [[X5:%.*]] = load i64, ptr [[Q5]], align 8
+; SCALABLE-NEXT:    [[Y5:%.*]] = add i64 [[X5]], 6
+; SCALABLE-NEXT:    store i64 [[Y5]], ptr [[Q5]], align 8
+; SCALABLE-NEXT:    [[OFFSET6:%.*]] = add i64 [[OFFSET5]], 1
+; SCALABLE-NEXT:    [[Q6:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET6]]
+; SCALABLE-NEXT:    [[X6:%.*]] = load i64, ptr [[Q6]], align 8
+; SCALABLE-NEXT:    [[Y6:%.*]] = add i64 [[X6]], 7
+; SCALABLE-NEXT:    store i64 [[Y6]], ptr [[Q6]], align 8
+; SCALABLE-NEXT:    [[OFFSET7:%.*]] = add i64 [[OFFSET6]], 1
+; SCALABLE-NEXT:    [[Q7:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET7]]
+; SCALABLE-NEXT:    [[X7:%.*]] = load i64, ptr [[Q7]], align 8
+; SCALABLE-NEXT:    [[Y7:%.*]] = add i64 [[X7]], 8
+; SCALABLE-NEXT:    store i64 [[Y7]], ptr [[Q7]], align 8
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; SCALABLE:       exit:
+; SCALABLE-NEXT:    ret void
+;
 entry:
   br label %loop
 loop:
@@ -529,40 +1237,40 @@ exit:
 define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; CHECK-LABEL: @combine_load_factor2_i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[Q]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8
-; CHECK-NEXT:    store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4
-; CHECK-NEXT:    store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP12]], ptr [[TMP14]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -581,6 +1289,114 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
+; FIXED-LABEL: @combine_load_factor2_i32(
+; FIXED-NEXT:  entry:
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED:       vector.ph:
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED:       vector.body:
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
+; FIXED-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
+; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]]
+; FIXED-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]]
+; FIXED-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
+; FIXED-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4
+; FIXED-NEXT:    [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
+; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; FIXED-NEXT:    [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; FIXED-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
+; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]]
+; FIXED-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[Q]], i64 [[TMP1]]
+; FIXED-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; FIXED-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8
+; FIXED-NEXT:    store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4
+; FIXED-NEXT:    store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; FIXED:       middle.block:
+; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED:       scalar.ph:
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED:       loop:
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
+; FIXED-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
+; FIXED-NEXT:    [[RES:%.*]] = add i32 [[X0]], [[X1]]
+; FIXED-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]]
+; FIXED-NEXT:    store i32 [[RES]], ptr [[DST]], align 4
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; FIXED:       exit:
+; FIXED-NEXT:    ret void
+;
+; SCALABLE-LABEL: @combine_load_factor2_i32(
+; SCALABLE-NEXT:  entry:
+; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SCALABLE:       vector.body:
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[TMP11]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]]
+; SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
+; SCALABLE-NEXT:    store <vscale x 4 x i32> [[TMP12]], ptr [[TMP14]], align 4
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SCALABLE:       middle.block:
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE:       scalar.ph:
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE:       loop:
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
+; SCALABLE-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
+; SCALABLE-NEXT:    [[RES:%.*]] = add i32 [[X0]], [[X1]]
+; SCALABLE-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]]
+; SCALABLE-NEXT:    store i32 [[RES]], ptr [[DST]], align 4
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; SCALABLE:       exit:
+; SCALABLE-NEXT:    ret void
+;
 entry:
   br label %loop
 loop:
@@ -609,40 +1425,40 @@ exit:
 define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; CHECK-LABEL: @combine_load_factor2_i64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP6]], align 8
-; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[Q]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4
-; CHECK-NEXT:    store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8
-; CHECK-NEXT:    store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0
+; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP12]], ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -661,6 +1477,114 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
+; FIXED-LABEL: @combine_load_factor2_i64(
+; FIXED-NEXT:  entry:
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED:       vector.ph:
+; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED:       vector.body:
+; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; FIXED-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
+; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]]
+; FIXED-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
+; FIXED-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; FIXED-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP6]], align 8
+; FIXED-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; FIXED-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; FIXED-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
+; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
+; FIXED-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[Q]], i64 [[TMP1]]
+; FIXED-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0
+; FIXED-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4
+; FIXED-NEXT:    store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8
+; FIXED-NEXT:    store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8
+; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; FIXED:       middle.block:
+; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED:       scalar.ph:
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED:       loop:
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
+; FIXED-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
+; FIXED-NEXT:    [[RES:%.*]] = add i64 [[X0]], [[X1]]
+; FIXED-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
+; FIXED-NEXT:    store i64 [[RES]], ptr [[DST]], align 8
+; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; FIXED:       exit:
+; FIXED-NEXT:    ret void
+;
+; SCALABLE-LABEL: @combine_load_factor2_i64(
+; SCALABLE-NEXT:  entry:
+; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SCALABLE:       vector.body:
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[TMP9]], align 8
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], [[TMP11]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]]
+; SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0
+; SCALABLE-NEXT:    store <vscale x 2 x i64> [[TMP12]], ptr [[TMP14]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; SCALABLE:       middle.block:
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE:       scalar.ph:
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE:       loop:
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
+; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
+; SCALABLE-NEXT:    [[RES:%.*]] = add i64 [[X0]], [[X1]]
+; SCALABLE-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
+; SCALABLE-NEXT:    store i64 [[RES]], ptr [[DST]], align 8
+; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; SCALABLE:       exit:
+; SCALABLE-NEXT:    ret void
+;
 entry:
   br label %loop
 loop:
@@ -685,4 +1609,3 @@ loop:
 exit:
   ret void
 }
-
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
index 8a2dc0abb0de..2b58acbfe9cc 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S \
 ; RUN:   < %s | FileCheck %s
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 \
@@ -7,109 +6,59 @@
 target triple = "riscv64"
 
 define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 {
-; CHECK-LABEL: define i32 @select_icmp(
-; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-LABEL: @select_icmp
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[NOT:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i32 [[Y]], i32 0
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP7]], [[X]]
-; CHECK-NEXT:    [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[COND_LCSSA]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[FR:%.*]] = freeze i1 [[TMP7]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0
 ;
-; SCALABLE-LABEL: define i32 @select_icmp(
-; SCALABLE-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-LABEL: @select_icmp
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X:%.*]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Y]], i64 0
-; SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP6]]
-; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; SCALABLE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; SCALABLE-NEXT:    [[TMP9:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; SCALABLE-NEXT:    [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[BROADCAST_SPLAT2]]
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; SCALABLE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP4]]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; SCALABLE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; SCALABLE-NEXT:    [[TMP8:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; SCALABLE-NEXT:    [[NOT:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP10]], zeroinitializer
-; SCALABLE-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
-; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP12]], i32 [[Y]], i32 0
-; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SCALABLE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
-; SCALABLE:       for.body:
-; SCALABLE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]]
-; SCALABLE-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; SCALABLE-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP13]], [[X]]
-; SCALABLE-NEXT:    [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]]
-; SCALABLE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; SCALABLE:       for.end:
-; SCALABLE-NEXT:    [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; SCALABLE-NEXT:    ret i32 [[COND_LCSSA]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; SCALABLE-NEXT:    [[FR:%.*]] = freeze i1 [[TMP13]]
+; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0
 ;
 entry:
   br label %for.body
@@ -130,109 +79,59 @@ for.end:
 }
 
 define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 {
-; CHECK-LABEL: define i32 @select_fcmp(
-; CHECK-SAME: float [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-LABEL: @select_fcmp
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[X]], i64 0
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[NOT:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i32 [[Y]], i32 0
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp fast olt float [[TMP7]], [[X]]
-; CHECK-NEXT:    [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[COND_LCSSA]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[FR:%.*]] = freeze i1 [[TMP7]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0
 ;
-; SCALABLE-LABEL: define i32 @select_fcmp(
-; SCALABLE-SAME: float [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-LABEL: @select_fcmp
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[X]], i64 0
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[X:%.*]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
-; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Y]], i64 0
-; SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP6]]
-; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; SCALABLE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; SCALABLE-NEXT:    [[TMP9:%.*]] = fcmp fast olt <vscale x 4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; SCALABLE-NEXT:    [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[BROADCAST_SPLAT2]]
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; SCALABLE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[TMP4]]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; SCALABLE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; SCALABLE-NEXT:    [[TMP8:%.*]] = fcmp fast olt <vscale x 4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; SCALABLE-NEXT:    [[NOT:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP10]], zeroinitializer
-; SCALABLE-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
-; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP12]], i32 [[Y]], i32 0
-; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SCALABLE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
-; SCALABLE:       for.body:
-; SCALABLE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDVARS_IV]]
-; SCALABLE-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; SCALABLE-NEXT:    [[CMP1:%.*]] = fcmp fast olt float [[TMP13]], [[X]]
-; SCALABLE-NEXT:    [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]]
-; SCALABLE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; SCALABLE:       for.end:
-; SCALABLE-NEXT:    [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; SCALABLE-NEXT:    ret i32 [[COND_LCSSA]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; SCALABLE-NEXT:    [[FR:%.*]] = freeze i1 [[TMP13]]
+; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0
 ;
 entry:
   br label %for.body
@@ -253,101 +152,55 @@ for.end:
 }
 
 define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
-; CHECK-LABEL: define i32 @select_const_i32_from_icmp(
-; CHECK-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-LABEL: @select_const_i32_from_icmp
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[NOT:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 7, i32 3
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 3, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 3
-; CHECK-NEXT:    [[TMP12]] = select i1 [[TMP11]], i32 [[TMP8]], i32 7
-; CHECK-NEXT:    [[TMP13]] = add nuw nsw i64 [[TMP15]], 1
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP13]], [[N]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP12]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[DOTLCSSA]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[FR:%.*]] = freeze i1 [[TMP7]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 7, i32 3
 ;
-; SCALABLE-LABEL: define i32 @select_const_i32_from_icmp(
-; SCALABLE-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-LABEL: @select_const_i32_from_icmp
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP4]]
+; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP4]]
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
 ; SCALABLE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
 ; SCALABLE-NEXT:    [[TMP8:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[NOT:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
-; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 7, i32 3
-; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SCALABLE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 3, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
-; SCALABLE:       for.body:
-; SCALABLE-NEXT:    [[TMP21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP14:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP21]]
-; SCALABLE-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
-; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 3
-; SCALABLE-NEXT:    [[TMP18]] = select i1 [[TMP17]], i32 [[TMP14]], i32 7
-; SCALABLE-NEXT:    [[TMP19]] = add nuw nsw i64 [[TMP21]], 1
-; SCALABLE-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[N]]
-; SCALABLE-NEXT:    br i1 [[TMP20]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; SCALABLE:       exit:
-; SCALABLE-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; SCALABLE-NEXT:    ret i32 [[DOTLCSSA]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; SCALABLE-NEXT:    [[FR:%.*]] = freeze i1 [[TMP13]]
+; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 7, i32 3
 ;
 entry:
   br label %for.body
@@ -368,113 +221,55 @@ exit:                                     ; preds = %for.body
 }
 
 define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 {
-; CHECK-LABEL: define i32 @select_i32_from_icmp(
-; CHECK-SAME: ptr nocapture readonly [[V:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-LABEL: @select_i32_from_icmp
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
-; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[NOT:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[B]], i32 [[A]]
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 3
-; CHECK-NEXT:    [[TMP12]] = select i1 [[TMP11]], i32 [[TMP8]], i32 [[B]]
-; CHECK-NEXT:    [[TMP13]] = add nuw nsw i64 [[TMP15]], 1
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP13]], [[N]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP12]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[DOTLCSSA]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[FR:%.*]] = freeze i1 [[TMP7]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %b, i32 %a
 ;
-; SCALABLE-LABEL: define i32 @select_i32_from_icmp(
-; SCALABLE-SAME: ptr nocapture readonly [[V:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-LABEL: @select_i32_from_icmp
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; SCALABLE-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
-; SCALABLE-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B]], i64 0
-; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP4]]
+; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP4]]
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
 ; SCALABLE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
 ; SCALABLE-NEXT:    [[TMP8:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[BROADCAST_SPLAT]]
+; SCALABLE-NEXT:    [[NOT:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
-; SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; SCALABLE-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], [[DOTSPLAT]]
-; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
-; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[B]], i32 [[A]]
-; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SCALABLE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
-; SCALABLE:       for.body:
-; SCALABLE-NEXT:    [[TMP21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP14:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP21]]
-; SCALABLE-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
-; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 3
-; SCALABLE-NEXT:    [[TMP18]] = select i1 [[TMP17]], i32 [[TMP14]], i32 [[B]]
-; SCALABLE-NEXT:    [[TMP19]] = add nuw nsw i64 [[TMP21]], 1
-; SCALABLE-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[N]]
-; SCALABLE-NEXT:    br i1 [[TMP20]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; SCALABLE:       exit:
-; SCALABLE-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; SCALABLE-NEXT:    ret i32 [[DOTLCSSA]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; SCALABLE-NEXT:    [[FR:%.*]] = freeze i1 [[TMP13]]
+; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %b, i32 %a
 ;
 entry:
   br label %for.body
@@ -495,101 +290,55 @@ exit:                                     ; preds = %for.body
 }
 
 define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 {
-; CHECK-LABEL: define i32 @select_const_i32_from_fcmp(
-; CHECK-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-LABEL: @select_const_i32_from_fcmp
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 2, i32 2, i32 2>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[NOT:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 1, i32 2
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00
-; CHECK-NEXT:    [[TMP12]] = select i1 [[TMP11]], i32 [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP13]] = add nuw nsw i64 [[TMP15]], 1
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP13]], [[N]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP12]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[DOTLCSSA]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[FR:%.*]] = freeze i1 [[TMP7]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 2
 ;
-; SCALABLE-LABEL: define i32 @select_const_i32_from_fcmp(
-; SCALABLE-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-LABEL: @select_const_i32_from_fcmp
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP4]]
+; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[V:%.*]], i64 [[TMP4]]
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
 ; SCALABLE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
 ; SCALABLE-NEXT:    [[TMP8:%.*]] = fcmp fast ueq <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[NOT:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
-; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 1, i32 2
-; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SCALABLE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
-; SCALABLE:       for.body:
-; SCALABLE-NEXT:    [[TMP21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP14:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP21]]
-; SCALABLE-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4
-; SCALABLE-NEXT:    [[TMP17:%.*]] = fcmp fast ueq float [[TMP16]], 3.000000e+00
-; SCALABLE-NEXT:    [[TMP18]] = select i1 [[TMP17]], i32 [[TMP14]], i32 1
-; SCALABLE-NEXT:    [[TMP19]] = add nuw nsw i64 [[TMP21]], 1
-; SCALABLE-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[N]]
-; SCALABLE-NEXT:    br i1 [[TMP20]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; SCALABLE:       exit:
-; SCALABLE-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; SCALABLE-NEXT:    ret i32 [[DOTLCSSA]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; SCALABLE-NEXT:    [[FR:%.*]] = freeze i1 [[TMP13]]
+; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 2
 ;
 entry:
   br label %for.body
@@ -610,41 +359,11 @@ exit:                                     ; preds = %for.body
 }
 
 define float @select_const_f32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
-; CHECK-LABEL: define float @select_const_f32_from_icmp(
-; CHECK-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi fast float [ 3.000000e+00, [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 3
-; CHECK-NEXT:    [[TMP5]] = select fast i1 [[TMP4]], float [[TMP1]], float 7.000000e+00
-; CHECK-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP6]], [[N]]
-; CHECK-NEXT:    br i1 [[TMP7]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK:       exit:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ]
-; CHECK-NEXT:    ret float [[DOTLCSSA]]
+; CHECK-LABEL: @select_const_f32_from_icmp
+; CHECK-NOT: vector.body
 ;
-; SCALABLE-LABEL: define float @select_const_f32_from_icmp(
-; SCALABLE-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
-; SCALABLE:       for.body:
-; SCALABLE-NEXT:    [[TMP0:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP1:%.*]] = phi fast float [ 3.000000e+00, [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
-; SCALABLE-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
-; SCALABLE-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 3
-; SCALABLE-NEXT:    [[TMP5]] = select fast i1 [[TMP4]], float [[TMP1]], float 7.000000e+00
-; SCALABLE-NEXT:    [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
-; SCALABLE-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP6]], [[N]]
-; SCALABLE-NEXT:    br i1 [[TMP7]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; SCALABLE:       exit:
-; SCALABLE-NEXT:    [[DOTLCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    ret float [[DOTLCSSA]]
+; SCALABLE-LABEL: @select_const_f32_from_icmp
+; SCALABLE-NOT: vector.body
 ;
 entry:
   br label %for.body
@@ -665,127 +384,63 @@ exit:                                     ; preds = %for.body
 }
 
 define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) #0 {
-; CHECK-LABEL: define i32 @pred_select_const_i32_from_icmp(
-; CHECK-SAME: ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-LABEL: @pred_select_const_i32_from_icmp
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], <i32 35, i32 35, i32 35, i32 35>
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP5]], i32 4, <4 x i1> [[TMP3]], <4 x i32> poison)
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT:    [[PREDPHI]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP7]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], <i32 35, i32 35, i32 35, i32 35>
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[SRC2:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[TMP4]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i1> [[VEC_PHI]], [[TMP8]]
+; CHECK-NEXT:    [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP9]], <4 x i1> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[PREDPHI]], zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP9]], i32 1, i32 0
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP10]], 35
-; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[TMP11]], 2
-; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_013]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i32 [[R_1_LCSSA]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[PREDPHI]])
+; CHECK-NEXT:    [[FR:%.*]] = freeze i1 [[TMP12]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 0
 ;
-; SCALABLE-LABEL: define i32 @pred_select_const_i32_from_icmp(
-; SCALABLE-SAME: ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-LABEL: @pred_select_const_i32_from_icmp
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP6]]
-; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; SCALABLE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; SCALABLE-NEXT:    [[TMP9:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[TMP6]]
-; SCALABLE-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
-; SCALABLE-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> poison)
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 [[TMP4]]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; SCALABLE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; SCALABLE-NEXT:    [[TMP8:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[SRC2:%.*]], i64 [[TMP4]]
+; SCALABLE-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0
+; SCALABLE-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI]]
-; SCALABLE-NEXT:    [[PREDPHI]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]]
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP12]]
+; SCALABLE-NEXT:    [[PREDPHI]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[VEC_PHI]]
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[PREDPHI]], zeroinitializer
-; SCALABLE-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
-; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP15]], i32 1, i32 0
-; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SCALABLE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
-; SCALABLE:       for.body:
-; SCALABLE-NEXT:    [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; SCALABLE-NEXT:    [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]]
-; SCALABLE-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; SCALABLE-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP16]], 35
-; SCALABLE-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; SCALABLE:       if.then:
-; SCALABLE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]]
-; SCALABLE-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; SCALABLE-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[TMP17]], 2
-; SCALABLE-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
-; SCALABLE-NEXT:    br label [[FOR_INC]]
-; SCALABLE:       for.inc:
-; SCALABLE-NEXT:    [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
-; SCALABLE-NEXT:    [[INC]] = add nuw nsw i64 [[I_013]], 1
-; SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; SCALABLE:       for.end.loopexit:
-; SCALABLE-NEXT:    [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; SCALABLE-NEXT:    ret i32 [[R_1_LCSSA]]
+; SCALABLE-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[PREDPHI]])
+; SCALABLE-NEXT:    [[FR:%.*]] = freeze i1 [[TMP18]]
+; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 0
 ;
 entry:
   br label %for.body
@@ -817,34 +472,3 @@ for.end.loopexit:                                 ; preds = %for.inc
 }
 
 attributes #0 = { "target-features"="+f,+v" }
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-;.
-; SCALABLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; SCALABLE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; SCALABLE: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; SCALABLE: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; SCALABLE: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; SCALABLE: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; SCALABLE: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; SCALABLE: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; SCALABLE: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; SCALABLE: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; SCALABLE: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; SCALABLE: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; SCALABLE: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; SCALABLE: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 12fdf2149daf..6936887cd166 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -17,27 +17,27 @@ define void @single_constant_stride_int_scaled(ptr %p) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[TMP5]]
-; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i64> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP9]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 1, [[TMP12]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 8, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 4 x i64> [[TMP12]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 8, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 4 x i64> [[TMP14]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP15]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> [[TMP15]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -90,26 +90,26 @@ define void @single_constant_stride_int_iv(ptr %p) {
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 64
-; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = add <vscale x 4 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 64, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 64, [[TMP8]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 64, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 64, [[TMP10]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP10]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x ptr> [[TMP10]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP13]], <vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -157,35 +157,51 @@ exit:
 define void @single_constant_stride_ptr_iv(ptr %p) {
 ; CHECK-LABEL: @single_constant_stride_ptr_iv(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 1024, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 8064
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[N_VEC]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <8 x i64> <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <8 x i64> <i64 64, i64 72, i64 80, i64 88, i64 96, i64 104, i64 112, i64 120>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x ptr> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC3]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP6]], <8 x ptr> [[TMP0]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP7]], <8 x ptr> [[TMP1]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 128
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 8, [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP10]], 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT]], [[TMP14]]
+; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = mul <vscale x 4 x i64> [[TMP15]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 8, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[VECTOR_GEP]]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 4 x ptr> [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP18]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 4 x i32> [[TMP19]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> [[TMP16]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1008, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
@@ -236,18 +252,18 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; NOSTRIDED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NOSTRIDED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NOSTRIDED:       vector.body:
 ; NOSTRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NOSTRIDED-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; NOSTRIDED-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP5]]
-; NOSTRIDED-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
-; NOSTRIDED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
-; NOSTRIDED-NEXT:    [[TMP8:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; NOSTRIDED-NEXT:    store <vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], align 4
-; NOSTRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; NOSTRIDED-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NOSTRIDED-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; NOSTRIDED-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
+; NOSTRIDED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NOSTRIDED-NEXT:    [[TMP10:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NOSTRIDED-NEXT:    store <vscale x 4 x i32> [[TMP10]], ptr [[TMP9]], align 4
+; NOSTRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; NOSTRIDED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; NOSTRIDED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; NOSTRIDED:       middle.block:
@@ -320,18 +336,18 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; NOSTRIDED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
-; NOSTRIDED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NOSTRIDED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NOSTRIDED:       vector.body:
 ; NOSTRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NOSTRIDED-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; NOSTRIDED-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP5]]
-; NOSTRIDED-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
-; NOSTRIDED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
-; NOSTRIDED-NEXT:    [[TMP8:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; NOSTRIDED-NEXT:    store <vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], align 4
-; NOSTRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; NOSTRIDED-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NOSTRIDED-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; NOSTRIDED-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
+; NOSTRIDED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NOSTRIDED-NEXT:    [[TMP10:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NOSTRIDED-NEXT:    store <vscale x 4 x i32> [[TMP10]], ptr [[TMP9]], align 4
+; NOSTRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; NOSTRIDED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; NOSTRIDED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; NOSTRIDED:       middle.block:
@@ -452,20 +468,20 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP8]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; NOSTRIDED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; NOSTRIDED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NOSTRIDED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NOSTRIDED:       vector.body:
 ; NOSTRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NOSTRIDED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; NOSTRIDED-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP9]]
-; NOSTRIDED-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
-; NOSTRIDED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP11]], align 4
-; NOSTRIDED-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; NOSTRIDED-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[P2]], i64 [[TMP9]]
-; NOSTRIDED-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
-; NOSTRIDED-NEXT:    store <vscale x 4 x i32> [[TMP12]], ptr [[TMP14]], align 4
-; NOSTRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; NOSTRIDED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; NOSTRIDED-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP11]]
+; NOSTRIDED-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
+; NOSTRIDED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
+; NOSTRIDED-NEXT:    [[TMP14:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NOSTRIDED-NEXT:    [[TMP15:%.*]] = getelementptr i32, ptr [[P2]], i64 [[TMP11]]
+; NOSTRIDED-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP15]], i32 0
+; NOSTRIDED-NEXT:    store <vscale x 4 x i32> [[TMP14]], ptr [[TMP16]], align 4
+; NOSTRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
 ; NOSTRIDED-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; NOSTRIDED-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; NOSTRIDED:       middle.block:
@@ -518,16 +534,16 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; STRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP9]]
 ; STRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; STRIDED-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
-; STRIDED-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; STRIDED-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i64> [[TMP10]], zeroinitializer
-; STRIDED-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i64> [[TMP11]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; STRIDED-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
-; STRIDED-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; STRIDED-NEXT:    [[TMP15:%.*]] = mul i64 1, [[TMP14]]
-; STRIDED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP15]], i64 0
+; STRIDED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; STRIDED-NEXT:    [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; STRIDED-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i64> [[TMP12]], zeroinitializer
+; STRIDED-NEXT:    [[TMP14:%.*]] = mul <vscale x 4 x i64> [[TMP13]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; STRIDED-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; STRIDED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; STRIDED-NEXT:    [[TMP17:%.*]] = mul i64 1, [[TMP16]]
+; STRIDED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP17]], i64 0
 ; STRIDED-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[STRIDE]], i64 0
 ; STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
@@ -535,13 +551,13 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED:       vector.body:
 ; STRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; STRIDED-NEXT:    [[TMP16:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; STRIDED-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP16]]
-; STRIDED-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP17]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison), !alias.scope !8
-; STRIDED-NEXT:    [[TMP18:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; STRIDED-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP16]]
-; STRIDED-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP18]], <vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)), !alias.scope !11, !noalias !8
-; STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
+; STRIDED-NEXT:    [[TMP18:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; STRIDED-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP18]]
+; STRIDED-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison), !alias.scope [[META8:![0-9]+]]
+; STRIDED-NEXT:    [[TMP20:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; STRIDED-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP18]]
+; STRIDED-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> [[TMP21]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)), !alias.scope [[META11:![0-9]+]], !noalias [[META8]]
+; STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; STRIDED-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; STRIDED-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -601,18 +617,18 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; NOSTRIDED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
-; NOSTRIDED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NOSTRIDED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NOSTRIDED:       vector.body:
 ; NOSTRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NOSTRIDED-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; NOSTRIDED-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP5]]
-; NOSTRIDED-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
-; NOSTRIDED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
-; NOSTRIDED-NEXT:    [[TMP8:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; NOSTRIDED-NEXT:    store <vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], align 4
-; NOSTRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; NOSTRIDED-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NOSTRIDED-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; NOSTRIDED-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
+; NOSTRIDED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NOSTRIDED-NEXT:    [[TMP10:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NOSTRIDED-NEXT:    store <vscale x 4 x i32> [[TMP10]], ptr [[TMP9]], align 4
+; NOSTRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; NOSTRIDED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; NOSTRIDED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; NOSTRIDED:       middle.block:
@@ -729,43 +745,43 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP10]]
 ; STRIDED-NEXT:    [[TMP11:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; STRIDED-NEXT:    [[IND_END7:%.*]] = getelementptr i8, ptr [[P2]], i64 [[TMP11]]
-; STRIDED-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 4
+; STRIDED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; STRIDED:       vector.body:
 ; STRIDED-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; STRIDED-NEXT:    [[POINTER_PHI11:%.*]] = phi ptr [ [[P2]], [[VECTOR_PH]] ], [ [[PTR_IND12:%.*]], [[VECTOR_BODY]] ]
 ; STRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; STRIDED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; STRIDED-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 1
-; STRIDED-NEXT:    [[TMP15:%.*]] = mul i64 [[STRIDE]], [[TMP14]]
-; STRIDED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP13]], 0
-; STRIDED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP16]], i64 0
+; STRIDED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; STRIDED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 1
+; STRIDED-NEXT:    [[TMP17:%.*]] = mul i64 [[STRIDE]], [[TMP16]]
+; STRIDED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP15]], 0
+; STRIDED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP18]], i64 0
 ; STRIDED-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; STRIDED-NEXT:    [[TMP17:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; STRIDED-NEXT:    [[TMP18:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT]], [[TMP17]]
+; STRIDED-NEXT:    [[TMP19:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; STRIDED-NEXT:    [[TMP20:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT]], [[TMP19]]
 ; STRIDED-NEXT:    [[DOTSPLATINSERT9:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[STRIDE]], i64 0
 ; STRIDED-NEXT:    [[DOTSPLAT10:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT9]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; STRIDED-NEXT:    [[VECTOR_GEP:%.*]] = mul <vscale x 4 x i64> [[TMP18]], [[DOTSPLAT10]]
-; STRIDED-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[VECTOR_GEP]]
-; STRIDED-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
-; STRIDED-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 1
-; STRIDED-NEXT:    [[TMP23:%.*]] = mul i64 [[STRIDE]], [[TMP22]]
-; STRIDED-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP21]], 0
-; STRIDED-NEXT:    [[DOTSPLATINSERT13:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP24]], i64 0
+; STRIDED-NEXT:    [[VECTOR_GEP:%.*]] = mul <vscale x 4 x i64> [[TMP20]], [[DOTSPLAT10]]
+; STRIDED-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[VECTOR_GEP]]
+; STRIDED-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 4
+; STRIDED-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP23]], 1
+; STRIDED-NEXT:    [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP24]]
+; STRIDED-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP23]], 0
+; STRIDED-NEXT:    [[DOTSPLATINSERT13:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP26]], i64 0
 ; STRIDED-NEXT:    [[DOTSPLAT14:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT13]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; STRIDED-NEXT:    [[TMP25:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; STRIDED-NEXT:    [[TMP26:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT14]], [[TMP25]]
-; STRIDED-NEXT:    [[VECTOR_GEP17:%.*]] = mul <vscale x 4 x i64> [[TMP26]], [[DOTSPLAT10]]
-; STRIDED-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[POINTER_PHI11]], <vscale x 4 x i64> [[VECTOR_GEP17]]
-; STRIDED-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison), !alias.scope !15
-; STRIDED-NEXT:    [[TMP28:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; STRIDED-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP28]], <vscale x 4 x ptr> [[TMP27]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)), !alias.scope !18, !noalias !15
-; STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP30]]
-; STRIDED-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP15]]
-; STRIDED-NEXT:    [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP23]]
+; STRIDED-NEXT:    [[TMP27:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; STRIDED-NEXT:    [[TMP28:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT14]], [[TMP27]]
+; STRIDED-NEXT:    [[VECTOR_GEP17:%.*]] = mul <vscale x 4 x i64> [[TMP28]], [[DOTSPLAT10]]
+; STRIDED-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[POINTER_PHI11]], <vscale x 4 x i64> [[VECTOR_GEP17]]
+; STRIDED-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP21]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison), !alias.scope [[META15:![0-9]+]]
+; STRIDED-NEXT:    [[TMP30:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; STRIDED-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP30]], <vscale x 4 x ptr> [[TMP29]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)), !alias.scope [[META18:![0-9]+]], !noalias [[META15]]
+; STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; STRIDED-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP17]]
+; STRIDED-NEXT:    [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP25]]
 ; STRIDED-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; STRIDED-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; STRIDED:       middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 0dee4a9b8585..c9b57361c0b9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -9,7 +9,6 @@
 ; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
 ; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
 
-; FIXME: reversed loads/stores are not supported yet with predicated vectorization.
 define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %ptr2) {
 ; IF-EVL-LABEL: @reverse_load_store(
 ; IF-EVL-NEXT:  entry:
@@ -30,35 +29,32 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[INDEX]]
-; IF-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i64 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; IF-EVL-NEXT:    [[TMP10:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1023, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; IF-EVL-NEXT:    [[TMP11:%.*]] = add i64 [[TMP7]], -1
-; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; IF-EVL-NEXT:    [[TMP15:%.*]] = mul i64 0, [[TMP14]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = sub i64 1, [[TMP14]]
-; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP15]]
-; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
-; IF-EVL-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; IF-EVL-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[REVERSE]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
-; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
-; IF-EVL-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP21]]
-; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP21]]
-; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP22]]
-; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; IF-EVL-NEXT:    [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE3]])
-; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE5]], ptr [[TMP25]], i32 4, <vscale x 4 x i1> [[REVERSE4]])
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP7]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], -1
+; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 0, [[TMP13]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = sub i64 1, [[TMP13]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP14]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]]
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 4
+; IF-EVL-NEXT:    [[TMP21:%.*]] = mul i64 0, [[TMP20]]
+; IF-EVL-NEXT:    [[TMP22:%.*]] = sub i64 1, [[TMP20]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP21]]
+; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]]
+; IF-EVL-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP24]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    [[TMP25:%.*]] = zext i32 [[TMP8]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; IF-EVL-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -117,3 +113,147 @@ for.body:
 loopend:
   ret void
 }
+
+define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noalias %ptr1, ptr noalias %ptr2) {
+; IF-EVL-LABEL: @reverse_load_store_masked(
+; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; IF-EVL-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
+; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP7]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT:    [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
+; IF-EVL-NEXT:    [[TMP10:%.*]] = add i32 [[OFFSET_IDX3]], 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[TMP11:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP11]]
+; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP12]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1023, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-NEXT:    [[TMP14:%.*]] = add i64 [[TMP9]], -1
+; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    [[TMP17:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 100, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP14]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; IF-EVL-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP21]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP21]]
+; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP22]]
+; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
+; IF-EVL-NEXT:    [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP8]])
+; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    [[TMP26:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP14]]
+; IF-EVL-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 4
+; IF-EVL-NEXT:    [[TMP29:%.*]] = mul i64 0, [[TMP28]]
+; IF-EVL-NEXT:    [[TMP30:%.*]] = sub i64 1, [[TMP28]]
+; IF-EVL-NEXT:    [[TMP31:%.*]] = getelementptr i32, ptr [[TMP26]], i64 [[TMP29]]
+; IF-EVL-NEXT:    [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i64 [[TMP30]]
+; IF-EVL-NEXT:    [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP32]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP8]])
+; IF-EVL-NEXT:    [[TMP33:%.*]] = zext i32 [[TMP8]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP33]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; IF-EVL-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
+; IF-EVL:       for.body:
+; IF-EVL-NEXT:    [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_INC:%.*]] ]
+; IF-EVL-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC]] ]
+; IF-EVL-NEXT:    [[ADD]] = add i64 [[ADD_PHI]], -1
+; IF-EVL-NEXT:    [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i32 [[I]]
+; IF-EVL-NEXT:    [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
+; IF-EVL-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP]], 100
+; IF-EVL-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; IF-EVL:       if.then:
+; IF-EVL-NEXT:    [[GEPL1:%.*]] = getelementptr inbounds i32, ptr [[PTR1]], i64 [[ADD]]
+; IF-EVL-NEXT:    [[V:%.*]] = load i32, ptr [[GEPL1]], align 4
+; IF-EVL-NEXT:    [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]]
+; IF-EVL-NEXT:    store i32 [[V]], ptr [[GEPS]], align 4
+; IF-EVL-NEXT:    br label [[FOR_INC]]
+; IF-EVL:       for.inc:
+; IF-EVL-NEXT:    [[INC]] = add i32 [[I]], 1
+; IF-EVL-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
+; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL:       loopend:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: @reverse_load_store_masked(
+; NO-VP-NEXT:  entry:
+; NO-VP-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-VP:       for.body:
+; NO-VP-NEXT:    [[ADD_PHI:%.*]] = phi i64 [ [[STARTVAL:%.*]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_INC:%.*]] ]
+; NO-VP-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_INC]] ]
+; NO-VP-NEXT:    [[ADD]] = add i64 [[ADD_PHI]], -1
+; NO-VP-NEXT:    [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[I]]
+; NO-VP-NEXT:    [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
+; NO-VP-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP]], 100
+; NO-VP-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; NO-VP:       if.then:
+; NO-VP-NEXT:    [[GEPL1:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[ADD]]
+; NO-VP-NEXT:    [[V:%.*]] = load i32, ptr [[GEPL1]], align 4
+; NO-VP-NEXT:    [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[ADD]]
+; NO-VP-NEXT:    store i32 [[V]], ptr [[GEPS]], align 4
+; NO-VP-NEXT:    br label [[FOR_INC]]
+; NO-VP:       for.inc:
+; NO-VP-NEXT:    [[INC]] = add i32 [[I]], 1
+; NO-VP-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
+; NO-VP-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND:%.*]]
+; NO-VP:       loopend:
+; NO-VP-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %add.phi = phi i64 [ %startval, %entry ], [ %add, %for.inc ]
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %add = add i64 %add.phi, -1
+  %gepl = getelementptr inbounds i32, ptr %ptr, i32 %i
+  %tmp = load i32, ptr %gepl, align 4
+  %cmp1 = icmp slt i32 %tmp, 100
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %gepl1 = getelementptr inbounds i32, ptr %ptr1, i64 %add
+  %v = load i32, ptr %gepl1, align 4
+  %geps = getelementptr inbounds i32, ptr %ptr2, i64 %add
+  store i32 %v, ptr %geps, align 4
+  br label %for.inc
+
+for.inc:
+  %inc = add i32 %i, 1
+  %exitcond = icmp ne i32 %inc, 1024
+  br i1 %exitcond, label %for.body, label %loopend
+
+loopend:
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
index 0b872709ec6c..9335396d01c9 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
@@ -19,20 +19,20 @@ define i32 @any_of_reduction_epilog(ptr %src, i64 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP8]] = select <4 x i1> [[TMP4]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP8]], zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 1, i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = freeze i1 [[TMP7]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP8]], i32 1, i32 0
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
@@ -42,27 +42,28 @@ define i32 @any_of_reduction_epilog(ptr %src, i64 %N) {
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[BC_MERGE_RDX]], 0
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
-; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_MERGE_RDX]], i64 0
-; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP9]], i64 0
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX5]], 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD7]], zeroinitializer
-; CHECK-NEXT:    [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> [[VEC_PHI6]]
+; CHECK-NEXT:    [[TMP14]] = or <4 x i1> [[VEC_PHI6]], [[TMP13]]
 ; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <4 x i32> [[TMP17]], zeroinitializer
 ; CHECK-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
-; CHECK-NEXT:    [[RDX_SELECT9:%.*]] = select i1 [[TMP16]], i32 1, i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = freeze i1 [[TMP16]]
+; CHECK-NEXT:    [[RDX_SELECT9:%.*]] = select i1 [[TMP17]], i32 1, i32 0
 ; CHECK-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
@@ -101,6 +102,104 @@ exit:
   ret i32 %select
 }
 
+define i32 @any_of_reduction_epilog_arg_as_start_value(ptr %src, i64 %N, i32 %start) {
+; CHECK-LABEL: define i32 @any_of_reduction_epilog_arg_as_start_value(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) {
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[TMP8:%.*]] = freeze i1 [[TMP7]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP8]], i32 1, i32 [[START]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[BC_MERGE_RDX]], [[START]]
+; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP9]], i64 0
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK:       vec.epilog.vector.body:
+; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD7]], zeroinitializer
+; CHECK-NEXT:    [[TMP14]] = or <4 x i1> [[VEC_PHI6]], [[TMP13]]
+; CHECK-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       vec.epilog.middle.block:
+; CHECK-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
+; CHECK-NEXT:    [[TMP17:%.*]] = freeze i1 [[TMP16]]
+; CHECK-NEXT:    [[RDX_SELECT9:%.*]] = select i1 [[TMP17]], i32 1, i32 [[START]]
+; CHECK-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
+; CHECK-NEXT:    br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       vec.epilog.scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX10:%.*]] = phi i32 [ [[START]], [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX10]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp eq i8 [[LOAD]], 0
+; CHECK-NEXT:    [[SELECT]] = select i1 [[ICMP]], i32 1, i32 [[RED]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[ICMP3:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT:    br i1 [[ICMP3]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SELECT_LCSSA:%.*]] = phi i32 [ [[SELECT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[SELECT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi i32 [ %start, %entry ], [ %select, %loop ]
+  %gep = getelementptr inbounds i8, ptr %src, i64 %iv
+  %load = load i8, ptr %gep, align 1
+  %icmp = icmp eq i8 %load, 0
+  %select = select i1 %icmp, i32 1, i32 %red
+  %iv.next = add i64 %iv, 1
+  %icmp3 = icmp eq i64 %iv, %N
+  br i1 %icmp3, label %exit, label %loop
+
+exit:
+  ret i32 %select
+}
 
 define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-LABEL: define i1 @any_of_reduction_i1_epilog(
@@ -124,14 +223,15 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i1> [[VEC_PHI]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP3]] = or <4 x i1> [[VEC_PHI]], [[TMP2]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i1> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = freeze i1 [[TMP5]]
 ; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i1 false, i1 false
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -144,10 +244,11 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ false, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i1 [[BC_MERGE_RDX]], false
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
 ; CHECK-NEXT:    [[IND_END5:%.*]] = trunc i64 [[N_VEC3]] to i32
-; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[BC_MERGE_RDX]], i64 0
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0
 ; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
@@ -160,14 +261,15 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT:    [[VEC_PHI10:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND11:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i32> [[VEC_IND11]], [[BROADCAST_SPLAT14]]
-; CHECK-NEXT:    [[TMP10]] = select <4 x i1> [[TMP8]], <4 x i1> [[VEC_PHI10]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP10]] = or <4 x i1> [[VEC_PHI10]], [[TMP9]]
 ; CHECK-NEXT:    [[INDEX_NEXT15]] = add nuw i64 [[INDEX9]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT12]] = add <4 x i32> [[VEC_IND11]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[RDX_SELECT_CMP16:%.*]] = icmp ne <4 x i1> [[TMP10]], zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP16]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]])
+; CHECK-NEXT:    [[TMP13:%.*]] = freeze i1 [[TMP12]]
 ; CHECK-NEXT:    [[RDX_SELECT16:%.*]] = select i1 [[TMP13]], i1 false, i1 false
 ; CHECK-NEXT:    [[CMP_N8:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -185,7 +287,7 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[IV_2_NEXT]] = add i32 [[IV_2]], 1
 ; CHECK-NEXT:    [[CMP_2:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP_2]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP_2]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[SEL_LCSSA:%.*]] = phi i1 [ [[SEL]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i1 [[SEL_LCSSA]]
@@ -210,6 +312,155 @@ exit:
 ; uselistorder directives
   uselistorder i1 %sel, { 1, 0 }
 }
+
+define i1 @any_of_reduction_i1_epilog2(ptr %start, ptr %end, i64 %x) {
+; CHECK-LABEL: define i1 @any_of_reduction_i1_epilog2(
+; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) {
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; CHECK-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], -16
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[TMP3]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[RDX_SELECT_CMP:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 48
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]]
+; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]]
+; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP4]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP5]], i64 8
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP6]], i64 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i64> [[TMP19]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP21:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[RDX_SELECT_CMP]] = or <4 x i1> [[VEC_PHI]], [[TMP21]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT:    [[TMP47:%.*]] = freeze i1 [[TMP23]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP47]], i1 false, i1 true
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP24]]
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ true, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[TMP48:%.*]] = icmp ne i1 [[BC_MERGE_RDX]], true
+; CHECK-NEXT:    [[N_MOD_VF7:%.*]] = urem i64 [[TMP3]], 4
+; CHECK-NEXT:    [[N_VEC8:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF7]]
+; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[N_VEC8]], 16
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP25]]
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i64 0
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT19:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT18]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK:       vec.epilog.vector.body:
+; CHECK-NEXT:    [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT20:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI12:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP43:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX13:%.*]] = mul i64 [[INDEX11]], 16
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX13]], 0
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX13]], 16
+; CHECK-NEXT:    [[TMP28:%.*]] = add i64 [[OFFSET_IDX13]], 32
+; CHECK-NEXT:    [[TMP29:%.*]] = add i64 [[OFFSET_IDX13]], 48
+; CHECK-NEXT:    [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP26]]
+; CHECK-NEXT:    [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP27]]
+; CHECK-NEXT:    [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP28]]
+; CHECK-NEXT:    [[NEXT_GEP17:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP14]], i64 8
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP15]], i64 8
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP16]], i64 8
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP17]], i64 8
+; CHECK-NEXT:    [[TMP34:%.*]] = load i64, ptr [[TMP30]], align 8
+; CHECK-NEXT:    [[TMP35:%.*]] = load i64, ptr [[TMP31]], align 8
+; CHECK-NEXT:    [[TMP36:%.*]] = load i64, ptr [[TMP32]], align 8
+; CHECK-NEXT:    [[TMP37:%.*]] = load i64, ptr [[TMP33]], align 8
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i64> poison, i64 [[TMP34]], i32 0
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i64> [[TMP38]], i64 [[TMP35]], i32 1
+; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i64> [[TMP39]], i64 [[TMP36]], i32 2
+; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i64> [[TMP40]], i64 [[TMP37]], i32 3
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq <4 x i64> [[TMP41]], [[BROADCAST_SPLAT19]]
+; CHECK-NEXT:    [[TMP46:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP43]] = or <4 x i1> [[VEC_PHI12]], [[TMP46]]
+; CHECK-NEXT:    [[INDEX_NEXT20]] = add nuw i64 [[INDEX11]], 4
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT20]], [[N_VEC8]]
+; CHECK-NEXT:    br i1 [[TMP44]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       vec.epilog.middle.block:
+; CHECK-NEXT:    [[TMP49:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP43]])
+; CHECK-NEXT:    [[TMP45:%.*]] = freeze i1 [[TMP49]]
+; CHECK-NEXT:    [[RDX_SELECT22:%.*]] = select i1 [[TMP45]], i1 false, i1 true
+; CHECK-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC8]]
+; CHECK-NEXT:    br i1 [[CMP_N10]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       vec.epilog.scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX23:%.*]] = phi i1 [ true, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[RED:%.*]] = phi i1 [ [[BC_MERGE_RDX23]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GETELEMENTPTR5:%.*]] = getelementptr inbounds i8, ptr [[IV]], i64 8
+; CHECK-NEXT:    [[LOAD6:%.*]] = load i64, ptr [[GETELEMENTPTR5]], align 8
+; CHECK-NEXT:    [[ICMP7:%.*]] = icmp eq i64 [[LOAD6]], [[X]]
+; CHECK-NEXT:    [[SELECT]] = select i1 [[ICMP7]], i1 [[RED]], i1 false
+; CHECK-NEXT:    [[IV_NEXT]] = getelementptr inbounds i8, ptr [[IV]], i64 16
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq ptr [[IV_NEXT]], [[END]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SELECT_LCSSA:%.*]] = phi i1 [ [[SELECT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[RDX_SELECT22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i1 [[SELECT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:                                              ; preds = %bb3, %bb2
+  %red = phi i1 [ true, %entry ], [ %select, %loop ]
+  %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ]
+  %getelementptr5 = getelementptr inbounds i8, ptr %iv, i64 8
+  %load6 = load i64, ptr %getelementptr5, align 8
+  %icmp7 = icmp eq i64 %load6, %x
+  %select = select i1 %icmp7, i1 %red, i1 false
+  %iv.next = getelementptr inbounds i8, ptr %iv, i64 16
+  %ec = icmp eq ptr %iv.next, %end
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i1 %select
+}
+
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -219,4 +470,10 @@ exit:
 ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
index 5584aa969367..8cf4e77a0d49 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
@@ -17,7 +17,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope !0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
@@ -118,6 +118,40 @@ exit:
   ret void
 }
 
+; Check that if we have a read from an invariant address, we do not vectorize,
+; even if we vectorize with runtime checks. The test below is a variant of
+; @reduc_store_load with a non-constant dependence distance, resulting in
+; vectorization with runtime checks.
+;
+; CHECK-LABEL: @reduc_store_load_with_non_constant_distance_dependence
+; CHECK-NOT: vector.body:
+define void @reduc_store_load_with_non_constant_distance_dependence(ptr %dst, ptr noalias %dst.2, i64 %off) {
+entry:
+  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
+  %dst.2.off = getelementptr inbounds i32, ptr %dst.2, i64 %off
+  store i32 0, ptr %gep.dst, align 4
+  br label %for.body
+
+for.body:
+  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep.src = getelementptr inbounds i32, ptr %dst.2, i64 %iv
+  %0 = load i32, ptr %gep.src, align 4
+  %iv.off = mul i64 %iv, 2
+  %add = add nsw i32 %sum, %0
+  %lv = load i32, ptr %gep.dst
+  store i32 %add, ptr %gep.dst, align 4
+  %gep.src.2 = getelementptr inbounds i32, ptr %dst.2.off, i64 %iv
+  store i32 %lv, ptr %gep.src.2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+
 ; Final value is not guaranteed to be stored in an invariant address.
 ; We don't vectorize in that case.
 ;
@@ -186,10 +220,10 @@ for.end:
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4, !alias.scope !12
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4, !alias.scope !12
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4, !alias.scope !12
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4, !alias.scope !12
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 2
@@ -204,10 +238,10 @@ for.end:
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP24]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP19]], align 4, !alias.scope !12
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP21]], align 4, !alias.scope !12
-; CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP23]], align 4, !alias.scope !12
-; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[TMP25]], align 4, !alias.scope !12
+; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP19]], align 4
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[TMP25]], align 4
 ; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0
 ; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP27]], i32 1
 ; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP28]], i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
index 1b4bcf6a3739..6a9f83a9e0aa 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
@@ -1,155 +1,114 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s --check-prefix=CHECK-VF2IC1
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC2
 
 define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) {
-; CHECK-VF2IC1-LABEL: define i32 @pred_select_const_i32_from_icmp(
-; CHECK-VF2IC1-SAME: ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) {
-; CHECK-VF2IC1-NEXT:  entry:
-; CHECK-VF2IC1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
-; CHECK-VF2IC1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF2IC1:       vector.ph:
-; CHECK-VF2IC1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-VF2IC1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF2IC1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-VF2IC1-LABEL: @pred_select_const_i32_from_icmp(
 ; CHECK-VF2IC1:       vector.body:
-; CHECK-VF2IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-VF2IC1-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE2]] ]
-; CHECK-VF2IC1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF2IC1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP0]]
-; CHECK-VF2IC1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-VF2IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
-; CHECK-VF2IC1-NEXT:    [[TMP3:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], <i32 35, i32 35>
-; CHECK-VF2IC1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
-; CHECK-VF2IC1-NEXT:    br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK-VF2IC1:         [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue2 ]
+; CHECK-VF2IC1:         [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr {{%.*}}, align 4
+; CHECK-VF2IC1-NEXT:    [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], <i32 35, i32 35>
+; CHECK-VF2IC1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; CHECK-VF2IC1-NEXT:    br i1 [[TMP5]], label %pred.load.if, label %pred.load.continue
 ; CHECK-VF2IC1:       pred.load.if:
-; CHECK-VF2IC1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP0]]
-; CHECK-VF2IC1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
-; CHECK-VF2IC1-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0
-; CHECK-VF2IC1-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK-VF2IC1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC2:%.*]], i64 {{%.*}}
+; CHECK-VF2IC1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+; CHECK-VF2IC1-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
+; CHECK-VF2IC1-NEXT:    br label %pred.load.continue
 ; CHECK-VF2IC1:       pred.load.continue:
-; CHECK-VF2IC1-NEXT:    [[TMP8:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
-; CHECK-VF2IC1-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
-; CHECK-VF2IC1-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
+; CHECK-VF2IC1-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ poison, %vector.body ], [ [[TMP8]], %pred.load.if ]
+; CHECK-VF2IC1-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; CHECK-VF2IC1-NEXT:    br i1 [[TMP10]], label %pred.load.if1, label %pred.load.continue2
 ; CHECK-VF2IC1:       pred.load.if1:
-; CHECK-VF2IC1-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 1
-; CHECK-VF2IC1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP10]]
-; CHECK-VF2IC1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
-; CHECK-VF2IC1-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP12]], i32 1
-; CHECK-VF2IC1-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK-VF2IC1:         [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 {{%.*}}
+; CHECK-VF2IC1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
+; CHECK-VF2IC1-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP13]], i32 1
+; CHECK-VF2IC1-NEXT:    br label %pred.load.continue2
 ; CHECK-VF2IC1:       pred.load.continue2:
-; CHECK-VF2IC1-NEXT:    [[TMP14:%.*]] = phi <2 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ]
-; CHECK-VF2IC1-NEXT:    [[TMP15:%.*]] = icmp eq <2 x i32> [[TMP14]], <i32 2, i32 2>
-; CHECK-VF2IC1-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> <i32 1, i32 1>, <2 x i32> [[VEC_PHI]]
-; CHECK-VF2IC1-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP3]], <2 x i32> [[TMP16]], <2 x i32> [[VEC_PHI]]
-; CHECK-VF2IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-VF2IC1-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF2IC1-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF2IC1-NEXT:    [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %pred.load.continue ], [ [[TMP14]], %pred.load.if1 ]
+; CHECK-VF2IC1-NEXT:    [[TMP16:%.*]] = icmp eq <2 x i32> [[TMP15]], <i32 2, i32 2>
+; CHECK-VF2IC1-NEXT:    [[TMP17:%.*]] = or <2 x i1> [[VEC_PHI]], [[TMP16]]
+; CHECK-VF2IC1-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP17]], <2 x i1> [[VEC_PHI]]
+; CHECK-VF2IC1:         br i1 {{%.*}}, label %middle.block, label %vector.body
 ; CHECK-VF2IC1:       middle.block:
-; CHECK-VF2IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i32> [[PREDPHI]], zeroinitializer
-; CHECK-VF2IC1-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
-; CHECK-VF2IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP18]], i32 1, i32 0
-; CHECK-VF2IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF2IC1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-VF2IC1-NEXT:    [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[PREDPHI]])
+; CHECK-VF2IC1-NEXT:    [[FR_TMP20:%.*]] = freeze i1 [[TMP20]]
+; CHECK-VF2IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR_TMP20]], i32 1, i32 0
 ; CHECK-VF2IC1:       scalar.ph:
-; CHECK-VF2IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF2IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF2IC1-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-VF2IC1:         [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ]
+; CHECK-VF2IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ]
+; CHECK-VF2IC1-NEXT:    br label %for.body
 ; CHECK-VF2IC1:       for.body:
-; CHECK-VF2IC1-NEXT:    [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-VF2IC1-NEXT:    [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-VF2IC1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]]
-; CHECK-VF2IC1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-VF2IC1-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP19]], 35
-; CHECK-VF2IC1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK-VF2IC1:         [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ]
+; CHECK-VF2IC1:         [[TMP21:%.*]] = load i32, ptr {{%.*}}, align 4
+; CHECK-VF2IC1-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP21]], 35
+; CHECK-VF2IC1-NEXT:    br i1 [[CMP1]], label %if.then, label %for.inc
 ; CHECK-VF2IC1:       if.then:
-; CHECK-VF2IC1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]]
-; CHECK-VF2IC1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-VF2IC1-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[TMP20]], 2
+; CHECK-VF2IC1:         [[TMP22:%.*]] = load i32, ptr {{%.*}}, align 4
+; CHECK-VF2IC1-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[TMP22]], 2
 ; CHECK-VF2IC1-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
-; CHECK-VF2IC1-NEXT:    br label [[FOR_INC]]
+; CHECK-VF2IC1-NEXT:    br label %for.inc
 ; CHECK-VF2IC1:       for.inc:
-; CHECK-VF2IC1-NEXT:    [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
-; CHECK-VF2IC1-NEXT:    [[INC]] = add nuw nsw i64 [[I_013]], 1
-; CHECK-VF2IC1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF2IC1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-VF2IC1-NEXT:    [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ]
 ; CHECK-VF2IC1:       for.end.loopexit:
-; CHECK-VF2IC1-NEXT:    [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-VF2IC1-NEXT:    [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ]
 ; CHECK-VF2IC1-NEXT:    ret i32 [[R_1_LCSSA]]
 ;
-; CHECK-VF1IC2-LABEL: define i32 @pred_select_const_i32_from_icmp(
-; CHECK-VF1IC2-SAME: ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) {
-; CHECK-VF1IC2-NEXT:  entry:
-; CHECK-VF1IC2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
-; CHECK-VF1IC2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF1IC2:       vector.ph:
-; CHECK-VF1IC2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-VF1IC2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF1IC2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-VF1IC2-LABEL: @pred_select_const_i32_from_icmp(
 ; CHECK-VF1IC2:       vector.body:
-; CHECK-VF1IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE3:%.*]] ]
-; CHECK-VF1IC2-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE3]] ]
-; CHECK-VF1IC2-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI4:%.*]], [[PRED_LOAD_CONTINUE3]] ]
-; CHECK-VF1IC2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF1IC2-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-VF1IC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP0]]
-; CHECK-VF1IC2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP1]]
-; CHECK-VF1IC2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
-; CHECK-VF1IC2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
-; CHECK-VF1IC2-NEXT:    [[TMP6:%.*]] = icmp sgt i32 [[TMP4]], 35
-; CHECK-VF1IC2-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], 35
-; CHECK-VF1IC2-NEXT:    br i1 [[TMP6]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK-VF1IC2:         [[VEC_PHI:%.*]] = phi i1 [ false, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue3 ]
+; CHECK-VF1IC2-NEXT:    [[VEC_PHI2:%.*]] = phi i1 [ false, %vector.ph ], [ [[PREDPHI5:%.*]], %pred.load.continue3 ]
+; CHECK-VF1IC2:         [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 {{%.*}}
+; CHECK-VF1IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 {{%.*}}
+; CHECK-VF1IC2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
+; CHECK-VF1IC2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+; CHECK-VF1IC2-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 35
+; CHECK-VF1IC2-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 35
+; CHECK-VF1IC2-NEXT:    br i1 [[TMP4]], label %pred.load.if, label %pred.load.continue
 ; CHECK-VF1IC2:       pred.load.if:
-; CHECK-VF1IC2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP0]]
-; CHECK-VF1IC2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-VF1IC2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK-VF1IC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC2:%.*]], i64 {{%.*}}
+; CHECK-VF1IC2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+; CHECK-VF1IC2-NEXT:    br label %pred.load.continue
 ; CHECK-VF1IC2:       pred.load.continue:
-; CHECK-VF1IC2-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_LOAD_IF]] ]
-; CHECK-VF1IC2-NEXT:    br i1 [[TMP7]], label [[PRED_LOAD_IF2:%.*]], label [[PRED_LOAD_CONTINUE3]]
+; CHECK-VF1IC2-NEXT:    [[TMP8:%.*]] = phi i32 [ poison, %vector.body ], [ [[TMP7]], %pred.load.if ]
+; CHECK-VF1IC2-NEXT:    br i1 [[TMP5]], label %pred.load.if2, label %pred.load.continue3
 ; CHECK-VF1IC2:       pred.load.if2:
-; CHECK-VF1IC2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP1]]
-; CHECK-VF1IC2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
-; CHECK-VF1IC2-NEXT:    br label [[PRED_LOAD_CONTINUE3]]
+; CHECK-VF1IC2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 {{%.*}}
+; CHECK-VF1IC2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-VF1IC2-NEXT:    br label %pred.load.continue3
 ; CHECK-VF1IC2:       pred.load.continue3:
-; CHECK-VF1IC2-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF2]] ]
-; CHECK-VF1IC2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 2
-; CHECK-VF1IC2-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 2
-; CHECK-VF1IC2-NEXT:    [[TMP16:%.*]] = select i1 [[TMP14]], i32 1, i32 [[VEC_PHI]]
-; CHECK-VF1IC2-NEXT:    [[TMP17:%.*]] = select i1 [[TMP15]], i32 1, i32 [[VEC_PHI1]]
-; CHECK-VF1IC2-NEXT:    [[PREDPHI]] = select i1 [[TMP6]], i32 [[TMP16]], i32 [[VEC_PHI]]
-; CHECK-VF1IC2-NEXT:    [[PREDPHI4]] = select i1 [[TMP7]], i32 [[TMP17]], i32 [[VEC_PHI1]]
-; CHECK-VF1IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-VF1IC2-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF1IC2-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF1IC2-NEXT:    [[TMP11:%.*]] = phi i32 [ poison, %pred.load.continue ], [ [[TMP10]], %pred.load.if2 ]
+; CHECK-VF1IC2-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 2
+; CHECK-VF1IC2-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 2
+; CHECK-VF1IC2-NEXT:    [[TMP14:%.*]] = or i1 [[VEC_PHI]], [[TMP12]]
+; CHECK-VF1IC2-NEXT:    [[TMP15:%.*]] = or i1 [[VEC_PHI2]], [[TMP13]]
+; CHECK-VF1IC2-NEXT:    [[PREDPHI]] = select i1 [[TMP4]], i1 [[TMP14]], i1 [[VEC_PHI]]
+; CHECK-VF1IC2-NEXT:    [[PREDPHI5]] = select i1 [[TMP5]], i1 [[TMP15]], i1 [[VEC_PHI2]]
+; CHECK-VF1IC2:         br i1 {{%.*}}, label %middle.block, label %vector.body
 ; CHECK-VF1IC2:       middle.block:
-; CHECK-VF1IC2-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[PREDPHI]], 0
-; CHECK-VF1IC2-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[PREDPHI]], i32 [[PREDPHI4]]
-; CHECK-VF1IC2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF1IC2-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-VF1IC2-NEXT:    [[OR:%.*]] = or i1 [[PREDPHI5]], [[PREDPHI]]
+; CHECK-VF1IC2-NEXT:    [[FR_OR:%.*]] = freeze i1 [[OR]]
+; CHECK-VF1IC2-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR_OR]], i32 1, i32 0
+; CHECK-VF1IC2:         br i1 {{%.*}}, label %for.end.loopexit, label %scalar.ph
 ; CHECK-VF1IC2:       scalar.ph:
-; CHECK-VF1IC2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF1IC2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF1IC2-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-VF1IC2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ]
+; CHECK-VF1IC2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ]
+; CHECK-VF1IC2-NEXT:    br label %for.body
 ; CHECK-VF1IC2:       for.body:
-; CHECK-VF1IC2-NEXT:    [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-VF1IC2-NEXT:    [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-VF1IC2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]]
-; CHECK-VF1IC2-NEXT:    [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF1IC2-NEXT:    [[I_013:%.*]] = phi i64 [ [[INC:%.*]], %for.inc ], [ [[BC_RESUME_VAL]], %scalar.ph ]
+; CHECK-VF1IC2-NEXT:    [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ]
+; CHECK-VF1IC2:         [[TMP19:%.*]] = load i32, ptr {{%.*}}, align 4
 ; CHECK-VF1IC2-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP19]], 35
-; CHECK-VF1IC2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK-VF1IC2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label %for.inc
 ; CHECK-VF1IC2:       if.then:
-; CHECK-VF1IC2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]]
-; CHECK-VF1IC2-NEXT:    [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-VF1IC2:         [[TMP20:%.*]] = load i32, ptr {{%.*}}, align 4
 ; CHECK-VF1IC2-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[TMP20]], 2
 ; CHECK-VF1IC2-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
-; CHECK-VF1IC2-NEXT:    br label [[FOR_INC]]
+; CHECK-VF1IC2-NEXT:    br label %for.inc
 ; CHECK-VF1IC2:       for.inc:
-; CHECK-VF1IC2-NEXT:    [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
-; CHECK-VF1IC2-NEXT:    [[INC]] = add nuw nsw i64 [[I_013]], 1
-; CHECK-VF1IC2-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF1IC2-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-VF1IC2-NEXT:    [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ]
+; CHECK-VF1IC2:         br i1 {{%.*}}, label %for.end.loopexit, label %for.body
 ; CHECK-VF1IC2:       for.end.loopexit:
-; CHECK-VF1IC2-NEXT:    [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-VF1IC2-NEXT:    [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ]
 ; CHECK-VF1IC2-NEXT:    ret i32 [[R_1_LCSSA]]
 ;
 entry:
@@ -180,14 +139,3 @@ for.end.loopexit:                                 ; preds = %for.inc
   %r.1.lcssa = phi i32 [ %r.1, %for.inc ]
   ret i32 %r.1.lcssa
 }
-;.
-; CHECK-VF2IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-VF2IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-VF2IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-VF2IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-;.
-; CHECK-VF1IC2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-VF1IC2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-VF1IC2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-VF1IC2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
-;.
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index c9f2aaef6d5c..993b56a05207 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -5,45 +5,47 @@
 define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_icmp
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x i32>
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC1-NEXT:   [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
+; CHECK-VF4IC1-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
 
 ; CHECK-VF4IC4:      vector.body:
-; CHECK-VF4IC4:        [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
-; CHECK-VF4IC4-NEXT:   [[VEC_PHI2:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
-; CHECK-VF4IC4-NEXT:   [[VEC_PHI3:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
-; CHECK-VF4IC4-NEXT:   [[VEC_PHI4:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT:   [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT:   [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT:   [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
 ; CHECK-VF4IC4:        [[VEC_ICMP1:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-VF4IC4-NEXT:   [[VEC_ICMP2:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-VF4IC4-NEXT:   [[VEC_ICMP3:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-VF4IC4-NEXT:   [[VEC_ICMP4:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL1:%.*]] = select <4 x i1> [[VEC_ICMP1]], <4 x i32> [[VEC_PHI1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL2:%.*]] = select <4 x i1> [[VEC_ICMP2]], <4 x i32> [[VEC_PHI2]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL3:%.*]] = select <4 x i1> [[VEC_ICMP3]], <4 x i32> [[VEC_PHI3]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL4:%.*]] = select <4 x i1> [[VEC_ICMP4]], <4 x i32> [[VEC_PHI4]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT:   [[NOT1:%.*]] = xor <4 x i1> [[VEC_ICMP1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-VF4IC4-NEXT:   [[NOT2:%.*]] = xor <4 x i1> [[VEC_ICMP2]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-VF4IC4-NEXT:   [[NOT3:%.*]] = xor <4 x i1> [[VEC_ICMP3]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-VF4IC4-NEXT:   [[NOT4:%.*]] = xor <4 x i1> [[VEC_ICMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL1:%.*]] = or <4 x i1> [[VEC_PHI1]], [[NOT1]]
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL2:%.*]] = or <4 x i1> [[VEC_PHI2]], [[NOT2]]
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL3:%.*]] = or <4 x i1> [[VEC_PHI3]], [[NOT3]]
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL4:%.*]] = or <4 x i1> [[VEC_PHI4]], [[NOT4]]
 ; CHECK-VF4IC4:      middle.block:
-; CHECK-VF4IC4-NEXT:   [[VEC_ICMP5:%.*]] = icmp ne <4 x i32> [[VEC_SEL1]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL5:%.*]] = select <4 x i1> [[VEC_ICMP5]], <4 x i32> [[VEC_SEL1]], <4 x i32> [[VEC_SEL2]]
-; CHECK-VF4IC4-NEXT:   [[VEC_ICMP6:%.*]] = icmp ne <4 x i32> [[VEC_SEL5]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL6:%.*]] = select <4 x i1> [[VEC_ICMP6]], <4 x i32> [[VEC_SEL5]], <4 x i32> [[VEC_SEL3]]
-; CHECK-VF4IC4-NEXT:   [[VEC_ICMP7:%.*]] = icmp ne <4 x i32> [[VEC_SEL6]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL_FIN:%.*]] = select <4 x i1> [[VEC_ICMP7]], <4 x i32> [[VEC_SEL6]], <4 x i32> [[VEC_SEL4]]
-; CHECK-VF4IC4-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL_FIN]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC4-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
-; CHECK-VF4IC4-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL5:%.*]] = or <4 x i1>  [[VEC_SEL2]], [[VEC_SEL1]]
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL6:%.*]] = or <4 x i1> [[VEC_SEL3]], [[VEC_SEL5]]
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL7:%.*]] = or <4 x i1> [[VEC_SEL4]], [[VEC_SEL6]]
+; CHECK-VF4IC4-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL7]])
+; CHECK-VF4IC4-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
+; CHECK-VF4IC4-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
 
 
 ; CHECK-VF1IC4:      vector.body:
-; CHECK-VF1IC4:        [[VEC_PHI1:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
-; CHECK-VF1IC4-NEXT:   [[VEC_PHI2:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
-; CHECK-VF1IC4-NEXT:   [[VEC_PHI3:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
-; CHECK-VF1IC4-NEXT:   [[VEC_PHI4:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF1IC4:        [[VEC_PHI1:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT:   [[VEC_PHI2:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT:   [[VEC_PHI3:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT:   [[VEC_PHI4:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
 ; CHECK-VF1IC4:        [[VEC_LOAD1:%.*]] = load i32
 ; CHECK-VF1IC4-NEXT:   [[VEC_LOAD2:%.*]] = load i32
 ; CHECK-VF1IC4-NEXT:   [[VEC_LOAD3:%.*]] = load i32
@@ -52,17 +54,20 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-VF1IC4-NEXT:   [[VEC_ICMP2:%.*]] = icmp eq i32 [[VEC_LOAD2]], 3
 ; CHECK-VF1IC4-NEXT:   [[VEC_ICMP3:%.*]] = icmp eq i32 [[VEC_LOAD3]], 3
 ; CHECK-VF1IC4-NEXT:   [[VEC_ICMP4:%.*]] = icmp eq i32 [[VEC_LOAD4]], 3
-; CHECK-VF1IC4-NEXT:   [[VEC_SEL1]] = select i1 [[VEC_ICMP1]], i32 [[VEC_PHI1]], i32 7
-; CHECK-VF1IC4-NEXT:   [[VEC_SEL2]] = select i1 [[VEC_ICMP2]], i32 [[VEC_PHI2]], i32 7
-; CHECK-VF1IC4-NEXT:   [[VEC_SEL3]] = select i1 [[VEC_ICMP3]], i32 [[VEC_PHI3]], i32 7
-; CHECK-VF1IC4-NEXT:   [[VEC_SEL4]] = select i1 [[VEC_ICMP4]], i32 [[VEC_PHI4]], i32 7
+; CHECK-VF1IC4-NEXT:   [[NOT1:%.*]] = xor i1 [[VEC_ICMP1]], true
+; CHECK-VF1IC4-NEXT:   [[NOT2:%.*]] = xor i1 [[VEC_ICMP2]], true
+; CHECK-VF1IC4-NEXT:   [[NOT3:%.*]] = xor i1 [[VEC_ICMP3]], true
+; CHECK-VF1IC4-NEXT:   [[NOT4:%.*]] = xor i1 [[VEC_ICMP4]], true
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL1:%.*]] = or i1 [[VEC_PHI1]], [[NOT1]]
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL2:%.*]] = or i1 [[VEC_PHI2]], [[NOT2]]
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL3:%.*]] = or i1 [[VEC_PHI3]], [[NOT3]]
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL4:%.*]] = or i1 [[VEC_PHI4]], [[NOT4]]
 ; CHECK-VF1IC4:      middle.block:
-; CHECK-VF1IC4-NEXT:   [[VEC_ICMP4:%.*]] = icmp ne i32 [[VEC_SEL1]], 3
-; CHECK-VF1IC4-NEXT:   [[VEC_SEL5:%.*]] = select i1 [[VEC_ICMP4]], i32 [[VEC_SEL1]], i32 [[VEC_SEL2]]
-; CHECK-VF1IC4-NEXT:   [[VEC_ICMP5:%.*]] = icmp ne i32 [[VEC_SEL5]], 3
-; CHECK-VF1IC4-NEXT:   [[VEC_SEL6:%.*]] = select i1 [[VEC_ICMP5]], i32 [[VEC_SEL5]], i32 [[VEC_SEL3]]
-; CHECK-VF1IC4-NEXT:   [[VEC_ICMP6:%.*]] = icmp ne i32 [[VEC_SEL6]], 3
-; CHECK-VF1IC4-NEXT:   {{.*}} = select i1 [[VEC_ICMP6]], i32 [[VEC_SEL6]], i32 [[VEC_SEL4]]
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL5:%.*]] = or i1 [[VEC_SEL2]], [[VEC_SEL1]]
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL6:%.*]] = or i1 [[VEC_SEL3]], [[VEC_SEL5]]
+; CHECK-VF1IC4-NEXT:   [[OR_RDX:%.*]] = or i1  [[VEC_SEL4]], [[VEC_SEL6]]
+; CHECK-VF1IC4-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
+; CHECK-VF1IC4-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
 
 entry:
   br label %for.body
@@ -86,14 +91,14 @@ exit:                                     ; preds = %for.body
 define i32 @select_const_i32_from_icmp2(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_icmp2
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x i32>
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[VEC_ICMP]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
+; CHECK-VF4IC1-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
 
 entry:
   br label %for.body
@@ -117,21 +122,18 @@ exit:                                     ; preds = %for.body
 define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) {
 ; CHECK-LABEL: @select_i32_from_icmp
 ; CHECK-VF4IC1:      vector.ph:
-; CHECK-VF4IC1:        [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT:   [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0
-; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_B:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NOT:    shufflevector <4 x i32>
+; CHECK-VF4IC1-NOT:    shufflevector <4 x i32>
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x i32>
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> [[SPLAT_OF_B]]
+; CHECK-VF4IC1-NEXT:   [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-; CHECK-VF4IC1-NEXT:   [[FIN_SPLAT:%.*]] = shufflevector <4 x i32> [[FIN_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT:   [[FIN_CMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_CMP]])
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
+; CHECK-VF4IC1-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 %b, i32 %a
 entry:
   br label %for.body
 
@@ -154,14 +156,15 @@ exit:                                     ; preds = %for.body
 define i32 @select_const_i32_from_fcmp_fast(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_fcmp_fast
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 2, i32 2, i32 2>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x float>
 ; CHECK-VF4IC1-NEXT:   [[VEC_FCMP:%.*]] = fcmp fast ueq <4 x float> [[VEC_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_FCMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-VF4IC1-NEXT:   [[NOT:%.*]] = xor <4 x i1> [[VEC_FCMP]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
+; CHECK-VF4IC1-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 1, i32 2
 entry:
   br label %for.body
 
@@ -184,14 +187,15 @@ exit:                                     ; preds = %for.body
 define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_fcmp
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 2, i32 2, i32 2>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x float>
 ; CHECK-VF4IC1-NEXT:   [[VEC_FCMP:%.*]] = fcmp ueq <4 x float> [[VEC_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_FCMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-VF4IC1-NEXT:   [[NOT:%.*]] = xor <4 x i1> [[VEC_FCMP]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
+; CHECK-VF4IC1-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 1, i32 2
 entry:
   br label %for.body
 
@@ -216,18 +220,16 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC1:      vector.ph:
 ; CHECK-VF4IC1:        [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
 ; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT:   [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0
-; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_B:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NOT:   [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
-; CHECK-VF4IC1:        [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_PHI]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> [[SPLAT_OF_B]]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[SPLAT_OF_A]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT:   [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-; CHECK-VF4IC1-NEXT:   [[FIN_SPLAT:%.*]] = shufflevector <4 x i32> [[FIN_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT:   [[FIN_CMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_CMP]])
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
+; CHECK-VF4IC1-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 %b, i32 %a
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
index 16ab45415b5c..55e61158a79c 100644
--- a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
@@ -8,26 +8,25 @@ define i64 @pr62565_incoming_value_known_undef(i64 %a, ptr %src) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ undef, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[NOT:%*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP4]] = or <2 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], undef
-; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 undef
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]])
+; CHECK-NEXT:    [[FR_TMP6:%.*]] = freeze i1 [[TMP6]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR_TMP6]], i64 [[A]], i64 undef
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
@@ -72,26 +71,25 @@ define i64 @pr62565_incoming_value_known_poison(i64 %a, ptr %src) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ poison, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[NOT:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP4]] = or <2 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], poison
-; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 poison
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]])
+; CHECK-NEXT:    [[FR_TMP6:%.*]] = freeze i1 [[TMP6]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR_TMP6]], i64 [[A]], i64 poison
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
@@ -136,30 +134,25 @@ define i64 @pr62565_incoming_value_may_be_poison(i64 %a, ptr %src, i64 %start) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[START]], i64 0
-; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <2 x i64> [[MINMAX_IDENT_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[NOT:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP4]] = or <2 x i1> [[VEC_PHI]], [[NOT]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[START]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 [[START]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]])
+; CHECK-NEXT:    [[FR_TMP6:%.*]] = freeze i1 [[TMP6]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR_TMP6]], i64 [[A]], i64 [[START]]
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index d09066fa2d70..45596169da3c 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -412,6 +412,138 @@ loop:
 exit:
   ret void
 }
+
+; Test case to make sure that uses of versioned strides of type i1 are properly
+; extended. From https://github.com/llvm/llvm-project/issues/91369.
+define void @zext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
+; CHECK-LABEL: define void @zext_of_i1_stride(
+; CHECK-SAME: i1 [[G:%.*]], ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[G_16:%.*]] = zext i1 [[G]] to i16
+; CHECK-NEXT:    [[G_64:%.*]] = zext i1 [[G]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = udiv i64 15, [[G_64]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i1 [[G]], true
+; CHECK-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[G_64]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 0, [[G_64]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    store <4 x i16> <i16 1, i16 1, i16 1, i16 1>, ptr [[TMP5]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[G_16]], ptr [[GEP]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[G_64]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 16
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %g.16 = zext i1 %g to i16
+  %g.64 = zext i1 %g to i64
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %g.16, ptr %gep, align 2
+  %iv.next = add nuw nsw i64 %iv, %g.64
+  %cmp = icmp ult i64 %iv.next, 16
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Test case to make sure that uses of versioned strides of type i1 are properly
+; extended.
+define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
+; CHECK-LABEL: define void @sext_of_i1_stride(
+; CHECK-SAME: i1 [[G:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[G_16:%.*]] = sext i1 [[G]] to i16
+; CHECK-NEXT:    [[G_64:%.*]] = sext i1 [[G]] to i64
+; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[G_64]], i64 16)
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[UMAX]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = udiv i64 [[TMP0]], [[G_64]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i1 [[G]], true
+; CHECK-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[G_64]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 0, [[G_64]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 -3
+; CHECK-NEXT:    store <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, ptr [[TMP7]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[G_16]], ptr [[GEP]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[G_64]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 16
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %g.16 = sext i1 %g to i16
+  %g.64 = sext i1 %g to i64
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %g.16, ptr %gep, align 2
+  %iv.next = add nuw nsw i64 %iv, %g.64
+  %cmp = icmp ult i64 %iv.next, 16
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -425,4 +557,8 @@ exit:
 ; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]}
 ; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/OpenMP/always_inline_device.ll b/llvm/test/Transforms/OpenMP/always_inline_device.ll
index 4641aff1ea21..6028ff527803 100644
--- a/llvm/test/Transforms/OpenMP/always_inline_device.ll
+++ b/llvm/test/Transforms/OpenMP/always_inline_device.ll
@@ -13,9 +13,9 @@
 ; Function Attrs: convergent norecurse nounwind
 ;.
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8
-; CHECK: @[[KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK: @G = external global i8
+; CHECK: @kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
 define weak void @__omp_offloading_fd02_c0934fc2_foo_l4(ptr %dyn) #0 {
 ; CHECK: Function Attrs: norecurse nounwind
@@ -103,5 +103,5 @@ attributes #2 = { convergent }
 ; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; CHECK: [[META7:![0-9]+]] = !{!"clang version 14.0.0"}
+; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
 ;.
diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll
index c8e9ac05091c..5bfdb6fe284a 100644
--- a/llvm/test/Transforms/OpenMP/barrier_removal.ll
+++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll
@@ -16,17 +16,17 @@ declare void @llvm.amdgcn.s.barrier()
 declare void @llvm.assume(i1)
 
 ;.
-; CHECK: @[[GC1:[a-zA-Z0-9_$"\\.-]+]] = constant i32 42
-; CHECK: @[[GC2:[a-zA-Z0-9_$"\\.-]+]] = addrspace(4) global i32 0
-; CHECK: @[[GPTR4:[a-zA-Z0-9_$"\\.-]+]] = addrspace(4) global ptr addrspace(4) null
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = global i32 42
-; CHECK: @[[GS:[a-zA-Z0-9_$"\\.-]+]] = addrspace(3) global i32 0
-; CHECK: @[[GPTR:[a-zA-Z0-9_$"\\.-]+]] = global ptr null
-; CHECK: @[[PG1:[a-zA-Z0-9_$"\\.-]+]] = thread_local global i32 42
-; CHECK: @[[PG2:[a-zA-Z0-9_$"\\.-]+]] = addrspace(5) global i32 0
-; CHECK: @[[GPTR5:[a-zA-Z0-9_$"\\.-]+]] = global ptr addrspace(5) null
-; CHECK: @[[G1:[a-zA-Z0-9_$"\\.-]+]] = global i32 42
-; CHECK: @[[G2:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) global i32 0
+; CHECK: @GC1 = constant i32 42
+; CHECK: @GC2 = addrspace(4) global i32 0
+; CHECK: @GPtr4 = addrspace(4) global ptr addrspace(4) null
+; CHECK: @G = global i32 42
+; CHECK: @GS = addrspace(3) global i32 0
+; CHECK: @GPtr = global ptr null
+; CHECK: @PG1 = thread_local global i32 42
+; CHECK: @PG2 = addrspace(5) global i32 0
+; CHECK: @GPtr5 = global ptr addrspace(5) null
+; CHECK: @G1 = global i32 42
+; CHECK: @G2 = addrspace(1) global i32 0
 ;.
 define void @pos_empty_1(i1 %c) "kernel" {
 ; MODULE-LABEL: define {{[^@]+}}@pos_empty_1
@@ -1268,42 +1268,81 @@ exit:
 !15 = !{i32 7, !"openmp", i32 50}
 !16 = !{i32 7, !"openmp-device", i32 50}
 ;.
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { "llvm.assume"="ompx_aligned_barrier" }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nounwind }
-; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
-; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
-; CHECK: attributes #[[ATTR4]] = { "kernel" }
-; CHECK: attributes #[[ATTR5]] = { nosync memory(none) }
+; MODULE: attributes #[[ATTR0:[0-9]+]] = { "llvm.assume"="ompx_aligned_barrier" }
+; MODULE: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nounwind }
+; MODULE: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; MODULE: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+; MODULE: attributes #[[ATTR4]] = { "kernel" }
+; MODULE: attributes #[[ATTR5]] = { nosync memory(none) }
 ;.
-; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
-; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50}
-; CHECK: [[META2:![0-9]+]] = !{ptr @pos_empty_1, !"kernel", i32 1}
-; CHECK: [[META3:![0-9]+]] = !{ptr @pos_empty_2, !"kernel", i32 1}
-; CHECK: [[META4:![0-9]+]] = !{ptr @pos_empty_3, !"kernel", i32 1}
-; CHECK: [[META5:![0-9]+]] = !{ptr @pos_empty_4, !"kernel", i32 1}
-; CHECK: [[META6:![0-9]+]] = !{ptr @pos_empty_5, !"kernel", i32 1}
-; CHECK: [[META7:![0-9]+]] = !{ptr @pos_empty_6, !"kernel", i32 1}
-; CHECK: [[META8:![0-9]+]] = !{ptr @neg_empty_8, !"kernel", i32 1}
-; CHECK: [[META9:![0-9]+]] = !{ptr @pos_constant_loads, !"kernel", i32 1}
-; CHECK: [[META10:![0-9]+]] = !{ptr @neg_loads, !"kernel", i32 1}
-; CHECK: [[META11:![0-9]+]] = !{ptr @pos_priv_mem, !"kernel", i32 1}
-; CHECK: [[META12:![0-9]+]] = !{ptr @neg_mem, !"kernel", i32 1}
-; CHECK: [[META13:![0-9]+]] = !{ptr @pos_multiple, !"kernel", i32 1}
-; CHECK: [[META14:![0-9]+]] = !{ptr @multiple_blocks_kernel_1, !"kernel", i32 1}
-; CHECK: [[META15:![0-9]+]] = !{ptr @multiple_blocks_kernel_2, !"kernel", i32 1}
-; CHECK: [[META16:![0-9]+]] = !{ptr @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1}
-; CHECK: [[META17:![0-9]+]] = !{ptr @pos_empty_7a, !"kernel", i32 1}
-; CHECK: [[META18:![0-9]+]] = !{ptr @pos_empty_7b, !"kernel", i32 1}
-; CHECK: [[META19:![0-9]+]] = !{ptr @neg_empty_9, !"kernel", i32 1}
-; CHECK: [[META20:![0-9]+]] = !{ptr @pos_empty_10, !"kernel", i32 1}
-; CHECK: [[META21:![0-9]+]] = !{ptr @pos_empty_11, !"kernel", i32 1}
-; CHECK: [[META22:![0-9]+]] = !{ptr @neg_empty_12, !"kernel", i32 1}
-; CHECK: [[META23:![0-9]+]] = !{ptr @pos_empty_8, !"kernel", i32 1}
-; CHECK: [[META24:![0-9]+]] = !{ptr @caller_barrier1, !"kernel", i32 1}
-; CHECK: [[META25:![0-9]+]] = !{ptr @caller_barrier2, !"kernel", i32 1}
-; CHECK: [[META26:![0-9]+]] = !{ptr @loop_barrier, !"kernel", i32 1}
-; CHECK: [[META27:![0-9]+]] = !{ptr @loop_barrier_end_barriers, !"kernel", i32 1}
-; CHECK: [[META28:![0-9]+]] = !{ptr @loop_barrier_end_barriers_unknown, !"kernel", i32 1}
-; CHECK: [[META29:![0-9]+]] = !{ptr @loop_barrier_store, !"kernel", i32 1}
-; CHECK: [[META30:![0-9]+]] = !{ptr @loop_barrier_end_barriers_store, !"kernel", i32 1}
+; CGSCC: attributes #[[ATTR0]] = { "llvm.assume"="ompx_aligned_barrier" }
+; CGSCC: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nounwind }
+; CGSCC: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; CGSCC: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+; CGSCC: attributes #[[ATTR4]] = { "kernel" }
+; CGSCC: attributes #[[ATTR5]] = { nosync memory(none) }
+;.
+; MODULE: [[META0:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; MODULE: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; MODULE: [[META2:![0-9]+]] = !{ptr @pos_empty_1, !"kernel", i32 1}
+; MODULE: [[META3:![0-9]+]] = !{ptr @pos_empty_2, !"kernel", i32 1}
+; MODULE: [[META4:![0-9]+]] = !{ptr @pos_empty_3, !"kernel", i32 1}
+; MODULE: [[META5:![0-9]+]] = !{ptr @pos_empty_4, !"kernel", i32 1}
+; MODULE: [[META6:![0-9]+]] = !{ptr @pos_empty_5, !"kernel", i32 1}
+; MODULE: [[META7:![0-9]+]] = !{ptr @pos_empty_6, !"kernel", i32 1}
+; MODULE: [[META8:![0-9]+]] = !{ptr @neg_empty_8, !"kernel", i32 1}
+; MODULE: [[META9:![0-9]+]] = !{ptr @pos_constant_loads, !"kernel", i32 1}
+; MODULE: [[META10:![0-9]+]] = !{ptr @neg_loads, !"kernel", i32 1}
+; MODULE: [[META11:![0-9]+]] = !{ptr @pos_priv_mem, !"kernel", i32 1}
+; MODULE: [[META12:![0-9]+]] = !{ptr @neg_mem, !"kernel", i32 1}
+; MODULE: [[META13:![0-9]+]] = !{ptr @pos_multiple, !"kernel", i32 1}
+; MODULE: [[META14:![0-9]+]] = !{ptr @multiple_blocks_kernel_1, !"kernel", i32 1}
+; MODULE: [[META15:![0-9]+]] = !{ptr @multiple_blocks_kernel_2, !"kernel", i32 1}
+; MODULE: [[META16:![0-9]+]] = !{ptr @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1}
+; MODULE: [[META17:![0-9]+]] = !{ptr @pos_empty_7a, !"kernel", i32 1}
+; MODULE: [[META18:![0-9]+]] = !{ptr @pos_empty_7b, !"kernel", i32 1}
+; MODULE: [[META19:![0-9]+]] = !{ptr @neg_empty_9, !"kernel", i32 1}
+; MODULE: [[META20:![0-9]+]] = !{ptr @pos_empty_10, !"kernel", i32 1}
+; MODULE: [[META21:![0-9]+]] = !{ptr @pos_empty_11, !"kernel", i32 1}
+; MODULE: [[META22:![0-9]+]] = !{ptr @neg_empty_12, !"kernel", i32 1}
+; MODULE: [[META23:![0-9]+]] = !{ptr @pos_empty_8, !"kernel", i32 1}
+; MODULE: [[META24:![0-9]+]] = !{ptr @caller_barrier1, !"kernel", i32 1}
+; MODULE: [[META25:![0-9]+]] = !{ptr @caller_barrier2, !"kernel", i32 1}
+; MODULE: [[META26:![0-9]+]] = !{ptr @loop_barrier, !"kernel", i32 1}
+; MODULE: [[META27:![0-9]+]] = !{ptr @loop_barrier_end_barriers, !"kernel", i32 1}
+; MODULE: [[META28:![0-9]+]] = !{ptr @loop_barrier_end_barriers_unknown, !"kernel", i32 1}
+; MODULE: [[META29:![0-9]+]] = !{ptr @loop_barrier_store, !"kernel", i32 1}
+; MODULE: [[META30:![0-9]+]] = !{ptr @loop_barrier_end_barriers_store, !"kernel", i32 1}
+;.
+; CGSCC: [[META0:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; CGSCC: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; CGSCC: [[META2:![0-9]+]] = !{ptr @pos_empty_1, !"kernel", i32 1}
+; CGSCC: [[META3:![0-9]+]] = !{ptr @pos_empty_2, !"kernel", i32 1}
+; CGSCC: [[META4:![0-9]+]] = !{ptr @pos_empty_3, !"kernel", i32 1}
+; CGSCC: [[META5:![0-9]+]] = !{ptr @pos_empty_4, !"kernel", i32 1}
+; CGSCC: [[META6:![0-9]+]] = !{ptr @pos_empty_5, !"kernel", i32 1}
+; CGSCC: [[META7:![0-9]+]] = !{ptr @pos_empty_6, !"kernel", i32 1}
+; CGSCC: [[META8:![0-9]+]] = !{ptr @neg_empty_8, !"kernel", i32 1}
+; CGSCC: [[META9:![0-9]+]] = !{ptr @pos_constant_loads, !"kernel", i32 1}
+; CGSCC: [[META10:![0-9]+]] = !{ptr @neg_loads, !"kernel", i32 1}
+; CGSCC: [[META11:![0-9]+]] = !{ptr @pos_priv_mem, !"kernel", i32 1}
+; CGSCC: [[META12:![0-9]+]] = !{ptr @neg_mem, !"kernel", i32 1}
+; CGSCC: [[META13:![0-9]+]] = !{ptr @pos_multiple, !"kernel", i32 1}
+; CGSCC: [[META14:![0-9]+]] = !{ptr @multiple_blocks_kernel_1, !"kernel", i32 1}
+; CGSCC: [[META15:![0-9]+]] = !{ptr @multiple_blocks_kernel_2, !"kernel", i32 1}
+; CGSCC: [[META16:![0-9]+]] = !{ptr @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1}
+; CGSCC: [[META17:![0-9]+]] = !{ptr @pos_empty_7a, !"kernel", i32 1}
+; CGSCC: [[META18:![0-9]+]] = !{ptr @pos_empty_7b, !"kernel", i32 1}
+; CGSCC: [[META19:![0-9]+]] = !{ptr @neg_empty_9, !"kernel", i32 1}
+; CGSCC: [[META20:![0-9]+]] = !{ptr @pos_empty_10, !"kernel", i32 1}
+; CGSCC: [[META21:![0-9]+]] = !{ptr @pos_empty_11, !"kernel", i32 1}
+; CGSCC: [[META22:![0-9]+]] = !{ptr @neg_empty_12, !"kernel", i32 1}
+; CGSCC: [[META23:![0-9]+]] = !{ptr @pos_empty_8, !"kernel", i32 1}
+; CGSCC: [[META24:![0-9]+]] = !{ptr @caller_barrier1, !"kernel", i32 1}
+; CGSCC: [[META25:![0-9]+]] = !{ptr @caller_barrier2, !"kernel", i32 1}
+; CGSCC: [[META26:![0-9]+]] = !{ptr @loop_barrier, !"kernel", i32 1}
+; CGSCC: [[META27:![0-9]+]] = !{ptr @loop_barrier_end_barriers, !"kernel", i32 1}
+; CGSCC: [[META28:![0-9]+]] = !{ptr @loop_barrier_end_barriers_unknown, !"kernel", i32 1}
+; CGSCC: [[META29:![0-9]+]] = !{ptr @loop_barrier_store, !"kernel", i32 1}
+; CGSCC: [[META30:![0-9]+]] = !{ptr @loop_barrier_end_barriers_store, !"kernel", i32 1}
 ;.
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
index b0d1842ca2e8..34a68a3020e5 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
@@ -836,78 +836,78 @@ attributes #9 = { convergent nounwind readonly willreturn }
 !18 = !{i32 7, !"openmp-device", i32 50}
 ;.
 ; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32, align 4
-; AMDGPU: @[[GLOB3:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_NEEDED_L14_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_L22_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_L39_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_WITH_FALLBACK_L55_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_NO_OPENMP_ATTR_L66_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_PURE_L77_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_NESTED_RECURSIVE_L92_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_WEAK_CALLEE_L112_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[__OMP_OUTLINED__2_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU: @[[__OMP_OUTLINED__8_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU: @[[__OMP_OUTLINED__10_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU: @[[__OMP_OUTLINED__11_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU: @[[__OMP_OUTLINED__13_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU: @[[__OMP_OUTLINED__14_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
+; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU: @G = external global i32, align 4
+; AMDGPU: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU: @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @__omp_outlined__2_wrapper.ID = private constant i8 undef
+; AMDGPU: @__omp_outlined__3_wrapper.ID = private constant i8 undef
+; AMDGPU: @__omp_outlined__5_wrapper.ID = private constant i8 undef
+; AMDGPU: @__omp_outlined__7_wrapper.ID = private constant i8 undef
+; AMDGPU: @__omp_outlined__8_wrapper.ID = private constant i8 undef
+; AMDGPU: @__omp_outlined__10_wrapper.ID = private constant i8 undef
+; AMDGPU: @__omp_outlined__11_wrapper.ID = private constant i8 undef
+; AMDGPU: @__omp_outlined__13_wrapper.ID = private constant i8 undef
+; AMDGPU: @__omp_outlined__14_wrapper.ID = private constant i8 undef
 ;.
 ; NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32, align 4
-; NVPTX: @[[GLOB3:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_NEEDED_L14_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_L22_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_L39_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_WITH_FALLBACK_L55_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_NO_OPENMP_ATTR_L66_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_PURE_L77_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_NESTED_RECURSIVE_L92_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_WEAK_CALLEE_L112_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[__OMP_OUTLINED__2_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX: @[[__OMP_OUTLINED__8_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX: @[[__OMP_OUTLINED__10_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX: @[[__OMP_OUTLINED__11_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX: @[[__OMP_OUTLINED__13_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX: @[[__OMP_OUTLINED__14_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
+; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX: @G = external global i32, align 4
+; NVPTX: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX: @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @__omp_outlined__2_wrapper.ID = private constant i8 undef
+; NVPTX: @__omp_outlined__3_wrapper.ID = private constant i8 undef
+; NVPTX: @__omp_outlined__5_wrapper.ID = private constant i8 undef
+; NVPTX: @__omp_outlined__7_wrapper.ID = private constant i8 undef
+; NVPTX: @__omp_outlined__8_wrapper.ID = private constant i8 undef
+; NVPTX: @__omp_outlined__10_wrapper.ID = private constant i8 undef
+; NVPTX: @__omp_outlined__11_wrapper.ID = private constant i8 undef
+; NVPTX: @__omp_outlined__13_wrapper.ID = private constant i8 undef
+; NVPTX: @__omp_outlined__14_wrapper.ID = private constant i8 undef
 ;.
 ; AMDGPU-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; AMDGPU-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU-DISABLED: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU-DISABLED: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32, align 4
-; AMDGPU-DISABLED: @[[GLOB3:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_NEEDED_L14_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_L22_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_L39_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_WITH_FALLBACK_L55_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_NO_OPENMP_ATTR_L66_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_PURE_L77_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_NESTED_RECURSIVE_L92_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_WEAK_CALLEE_L112_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU-DISABLED: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU-DISABLED: @G = external global i32, align 4
+; AMDGPU-DISABLED: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU-DISABLED: @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED: @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED: @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED: @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED: @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED: @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
 ; NVPTX-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; NVPTX-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX-DISABLED: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX-DISABLED: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32, align 4
-; NVPTX-DISABLED: @[[GLOB3:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_NEEDED_L14_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_L22_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_L39_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_WITH_FALLBACK_L55_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_NO_OPENMP_ATTR_L66_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_PURE_L77_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_NESTED_RECURSIVE_L92_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_WEAK_CALLEE_L112_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX-DISABLED: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX-DISABLED: @G = external global i32, align 4
+; NVPTX-DISABLED: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX-DISABLED: @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED: @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED: @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED: @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED: @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED: @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
 ; AMDGPU: Function Attrs: convergent noinline norecurse nounwind
 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_needed_l14
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll
index fe134ce350dc..85d495f45039 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll
@@ -835,88 +835,88 @@ attributes #9 = { convergent nounwind readonly willreturn }
 !18 = !{i32 7, !"openmp-device", i32 50}
 ;.
 ; AMDGPU1: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; AMDGPU1: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU1: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU1: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32, align 4
-; AMDGPU1: @[[GLOB3:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU1: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_NEEDED_L14_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU1: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_L22_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU1: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_L39_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU1: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_WITH_FALLBACK_L55_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU1: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_NO_OPENMP_ATTR_L66_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU1: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_PURE_L77_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU1: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_NESTED_RECURSIVE_L92_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU1: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_WEAK_CALLEE_L112_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU1: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU1: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU1: @G = external global i32, align 4
+; AMDGPU1: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU1: @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU1: @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU1: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU1: @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU1: @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU1: @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU1: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU1: @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
 ; NVPTX1: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; NVPTX1: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX1: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX1: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32, align 4
-; NVPTX1: @[[GLOB3:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX1: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_NEEDED_L14_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX1: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_L22_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX1: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_L39_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX1: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_WITH_FALLBACK_L55_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX1: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_NO_OPENMP_ATTR_L66_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX1: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_PURE_L77_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX1: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_NESTED_RECURSIVE_L92_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX1: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_WEAK_CALLEE_L112_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX1: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX1: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX1: @G = external global i32, align 4
+; NVPTX1: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX1: @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX1: @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX1: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX1: @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX1: @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX1: @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX1: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX1: @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
 ; AMDGPU2: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; AMDGPU2: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU2: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU2: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32, align 4
-; AMDGPU2: @[[GLOB3:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU2: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_NEEDED_L14_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU2: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_L22_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU2: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_L39_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU2: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_WITH_FALLBACK_L55_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU2: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_NO_OPENMP_ATTR_L66_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU2: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_PURE_L77_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU2: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_NESTED_RECURSIVE_L92_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU2: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_WEAK_CALLEE_L112_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU2: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU2: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU2: @G = external global i32, align 4
+; AMDGPU2: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU2: @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU2: @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU2: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU2: @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU2: @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU2: @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU2: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU2: @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
 ; AMDGPU3: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; AMDGPU3: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU3: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU3: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32, align 4
-; AMDGPU3: @[[GLOB3:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU3: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_NEEDED_L14_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU3: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_L22_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU3: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_L39_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU3: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_WITH_FALLBACK_L55_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU3: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_NO_OPENMP_ATTR_L66_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU3: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_PURE_L77_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU3: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_NESTED_RECURSIVE_L92_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU3: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_WEAK_CALLEE_L112_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU3: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU3: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU3: @G = external global i32, align 4
+; AMDGPU3: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU3: @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU3: @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU3: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU3: @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU3: @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU3: @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU3: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU3: @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
 ; NVPTX2: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; NVPTX2: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX2: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX2: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32, align 4
-; NVPTX2: @[[GLOB3:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX2: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_NEEDED_L14_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX2: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_L22_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX2: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_L39_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX2: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_WITH_FALLBACK_L55_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX2: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_NO_OPENMP_ATTR_L66_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX2: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_PURE_L77_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX2: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_NESTED_RECURSIVE_L92_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX2: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_WEAK_CALLEE_L112_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX2: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX2: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX2: @G = external global i32, align 4
+; NVPTX2: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX2: @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX2: @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX2: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX2: @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX2: @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX2: @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX2: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX2: @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
 ; NVPTX3: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; NVPTX3: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX3: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX3: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32, align 4
-; NVPTX3: @[[GLOB3:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX3: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_NEEDED_L14_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX3: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_L22_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX3: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_L39_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX3: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_WITH_FALLBACK_L55_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX3: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_NO_OPENMP_ATTR_L66_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX3: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_PURE_L77_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX3: @[[__OMP_OFFLOADING_14_A36502B_SIMPLE_STATE_MACHINE_INTERPROCEDURAL_NESTED_RECURSIVE_L92_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX3: @[[__OMP_OFFLOADING_14_A36502B_NO_STATE_MACHINE_WEAK_CALLEE_L112_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX3: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX3: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX3: @G = external global i32, align 4
+; NVPTX3: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX3: @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX3: @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX3: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX3: @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX3: @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX3: @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX3: @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX3: @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
 ; AMDGPU1: Function Attrs: convergent noinline norecurse nounwind
 ; AMDGPU1-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_needed_l14
diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
index 078ec22e6110..6102201ad4ba 100644
--- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
+++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
@@ -12,12 +12,12 @@ target triple = "nvptx64"
 @kernel2_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32
-; CHECK: @[[KERNEL0_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3, i32 0, i32 666, i32 0, i32 777, i32 0, i32 0 }, ptr null, ptr null }
-; CHECK: @[[KERNEL1_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3, i32 0, i32 666, i32 0, i32 777, i32 0, i32 0 }, ptr null, ptr null }
-; CHECK: @[[KERNEL2_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3, i32 0, i32 666, i32 0, i32 777, i32 0, i32 0 }, ptr null, ptr null }
+; CHECK: @G = external global i32
+; CHECK: @kernel0_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 666, i32 0, i32 777, i32 0, i32 0 }, ptr null, ptr null }
+; CHECK: @kernel1_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 666, i32 0, i32 777, i32 0, i32 0 }, ptr null, ptr null }
+; CHECK: @kernel2_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 666, i32 0, i32 777, i32 0, i32 0 }, ptr null, ptr null }
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
 ;.
 define weak void @kernel0(ptr %dyn) "kernel" #0 {
 ; CHECK-LABEL: define {{[^@]+}}@kernel0
diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold_optnone.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold_optnone.ll
index 2d22ac52275c..0cf6e7488b4d 100644
--- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold_optnone.ll
+++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold_optnone.ll
@@ -5,7 +5,7 @@ target triple = "nvptx64"
 @G = external global i32
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32
+; CHECK: @G = external global i32
 ;.
 define weak void @kernel0() #0 {
 ; CHECK-LABEL: define {{[^@]+}}@kernel0
diff --git a/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll b/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
index 678acd57062b..e0b4229371ee 100644
--- a/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
+++ b/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
@@ -436,8 +436,6 @@ declare void @__tgt_target_data_end_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, pt
 
 declare dso_local i32 @rand(...)
 
-; CHECK: declare void @__tgt_target_data_begin_mapper_issue(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr)
-; CHECK: declare void @__tgt_target_data_begin_mapper_wait(i64, ptr)
 
 !llvm.module.flags = !{!0}
 
diff --git a/llvm/test/Transforms/OpenMP/icv_tracking.ll b/llvm/test/Transforms/OpenMP/icv_tracking.ll
index 9f7714f73850..7adab7a19122 100644
--- a/llvm/test/Transforms/OpenMP/icv_tracking.ll
+++ b/llvm/test/Transforms/OpenMP/icv_tracking.ll
@@ -505,13 +505,13 @@ define void @test4_invoke(i1 %0) personality ptr @__gxx_personality_v0 {
 ; CHECK-SAME: (i1 [[TMP0:%.*]]) personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:    call void @known_unique_icv(i1 [[TMP0]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = invoke i32 @maybe_throw(i1 zeroext [[TMP0]])
-; CHECK-NEXT:    to label [[CONT:%.*]] unwind label [[EXC:%.*]]
+; CHECK-NEXT:            to label [[CONT:%.*]] unwind label [[EXC:%.*]]
 ; CHECK:       cont:
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i1 [[TMP0]], false
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP5:%.*]], label [[TMP4:%.*]]
 ; CHECK:       exc:
 ; CHECK-NEXT:    [[LP:%.*]] = landingpad { ptr, i32 }
-; CHECK-NEXT:    filter [0 x ptr] zeroinitializer
+; CHECK-NEXT:            filter [0 x ptr] zeroinitializer
 ; CHECK-NEXT:    unreachable
 ; CHECK:       4:
 ; CHECK-NEXT:    [[VAL:%.*]] = call i32 @icv_free_use(i32 10)
diff --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
index acaacf8af0ac..310ac0a8296c 100644
--- a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
+++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
@@ -12,11 +12,11 @@ target triple = "nvptx64"
 @will_not_be_spmd_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8
-; CHECK: @[[IS_SPMD_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
-; CHECK: @[[WILL_BE_SPMD_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
-; CHECK: @[[NONE_SPMD_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
-; CHECK: @[[WILL_NOT_BE_SPMD_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
+; CHECK: @G = external global i8
+; CHECK: @is_spmd_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
+; CHECK: @will_be_spmd_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
+; CHECK: @none_spmd_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
+; CHECK: @will_not_be_spmd_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
 ;.
 define weak void @is_spmd() "kernel" {
 ; CHECK-LABEL: define {{[^@]+}}@is_spmd
diff --git a/llvm/test/Transforms/OpenMP/keep_rpc_client.ll b/llvm/test/Transforms/OpenMP/keep_rpc_client.ll
index 8ee7bddd4dc0..7bac905e1793 100644
--- a/llvm/test/Transforms/OpenMP/keep_rpc_client.ll
+++ b/llvm/test/Transforms/OpenMP/keep_rpc_client.ll
@@ -6,11 +6,11 @@
 @__llvm_libc_rpc_client = protected local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(1) @client to ptr), align 8
 
 ;.
-; POSTLINK: @[[CLIENT:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(1) global i64 0, align 8
-; POSTLINK: @[[__LLVM_LIBC_RPC_CLIENT:[a-zA-Z0-9_$"\\.-]+]] = protected local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(1) @client to ptr), align 8
+; POSTLINK: @client = internal addrspace(1) global i64 0, align 8
+; POSTLINK: @__llvm_libc_rpc_client = protected local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(1) @client to ptr), align 8
 ;.
-; PRELINK: @[[CLIENT:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(1) global i64 0, align 8
-; PRELINK: @[[__LLVM_LIBC_RPC_CLIENT:[a-zA-Z0-9_$"\\.-]+]] = protected local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(1) @client to ptr), align 8
+; PRELINK: @client = internal addrspace(1) global i64 0, align 8
+; PRELINK: @__llvm_libc_rpc_client = protected local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(1) @client to ptr), align 8
 ;.
 define i64 @a() {
 ; POSTLINK-LABEL: define {{[^@]+}}@a
@@ -32,3 +32,16 @@ define i64 @a() {
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{i32 7, !"openmp", i32 50}
 !2 = !{i32 7, !"openmp-device", i32 50}
+;.
+; POSTLINK: attributes #[[ATTR0]] = { norecurse nosync }
+;.
+; PRELINK: attributes #[[ATTR0]] = { norecurse nosync }
+;.
+; POSTLINK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+; POSTLINK: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; POSTLINK: [[META2:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+;.
+; PRELINK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+; PRELINK: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; PRELINK: [[META2:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+;.
diff --git a/llvm/test/Transforms/OpenMP/nested_parallelism.ll b/llvm/test/Transforms/OpenMP/nested_parallelism.ll
index adc9c3fd311d..4f4a87cbddfe 100644
--- a/llvm/test/Transforms/OpenMP/nested_parallelism.ll
+++ b/llvm/test/Transforms/OpenMP/nested_parallelism.ll
@@ -37,11 +37,11 @@ target triple = "nvptx64"
 
 ;.
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
-; CHECK: @[[I_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 16
-; CHECK: @[[I_I_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 16
-; CHECK: @[[__OMP_OFFLOADING_10302_BD7E0_MAIN_L13_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; CHECK: @[[__OMP_OFFLOADING_10302_BD7E0_MAIN_L16_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+; CHECK: @i_shared = internal addrspace(3) global [4 x i8] undef, align 16
+; CHECK: @i.i_shared = internal addrspace(3) global [4 x i8] undef, align 16
+; CHECK: @__omp_offloading_10302_bd7e0_main_l13_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK: @__omp_offloading_10302_bd7e0_main_l16_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
 define weak_odr protected void @__omp_offloading_10302_bd7e0_main_l13(ptr %dyn, i64 noundef %i) local_unnamed_addr "kernel" {
 ; CHECK-LABEL: @__omp_offloading_10302_bd7e0_main_l13(
diff --git a/llvm/test/Transforms/OpenMP/parallel_deletion.ll b/llvm/test/Transforms/OpenMP/parallel_deletion.ll
index 6319875fd905..4619da120609 100644
--- a/llvm/test/Transforms/OpenMP/parallel_deletion.ll
+++ b/llvm/test/Transforms/OpenMP/parallel_deletion.ll
@@ -537,8 +537,8 @@ define internal void @.omp_outlined..6(ptr noalias %.global_tid., ptr noalias %.
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_reduce_nowait(ptr noundef nonnull @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 noundef 1, i64 noundef 8, ptr noundef nonnull align 8 [[DOTOMP_REDUCTION_RED_LIST]], ptr noundef nonnull @.omp.reduction.reduction_func, ptr noundef nonnull @.gomp_critical_user_.reduction.var)
 ; CHECK-NEXT:    switch i32 [[TMP4]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
-; CHECK-NEXT:    i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+; CHECK-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+; CHECK-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       .omp.reduction.case1:
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A]], align 4
diff --git a/llvm/test/Transforms/OpenMP/parallel_level_fold.ll b/llvm/test/Transforms/OpenMP/parallel_level_fold.ll
index d26dd74f0e7f..fd6e7683af8e 100644
--- a/llvm/test/Transforms/OpenMP/parallel_level_fold.ll
+++ b/llvm/test/Transforms/OpenMP/parallel_level_fold.ll
@@ -11,10 +11,10 @@ target triple = "nvptx64"
 @parallel_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
 
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i16
-; CHECK: @[[NONE_SPMD_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
-; CHECK: @[[SPMD_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
-; CHECK: @[[PARALLEL_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
+; CHECK: @G = external global i16
+; CHECK: @none_spmd_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
+; CHECK: @spmd_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
+; CHECK: @parallel_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
 ;.
 define weak void @none_spmd() "kernel" {
 ; CHECK-LABEL: define {{[^@]+}}@none_spmd
diff --git a/llvm/test/Transforms/OpenMP/remove_globalization.ll b/llvm/test/Transforms/OpenMP/remove_globalization.ll
index 40d9df835b4f..31e3ef2b9079 100644
--- a/llvm/test/Transforms/OpenMP/remove_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/remove_globalization.ll
@@ -21,11 +21,11 @@ target triple = "nvptx64"
 
 ; Make it a weak definition so we will apply custom state machine rewriting but can't use the body in the reasoning.
 ;.
-; CHECK: @[[S:[a-zA-Z0-9_$"\\.-]+]] = external local_unnamed_addr global ptr
-; CHECK: @[[KERNEL_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
+; CHECK: @S = external local_unnamed_addr global ptr
+; CHECK: @kernel_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
 ;.
-; CHECK-DISABLED: @[[S:[a-zA-Z0-9_$"\\.-]+]] = external local_unnamed_addr global ptr
-; CHECK-DISABLED: @[[KERNEL_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
+; CHECK-DISABLED: @S = external local_unnamed_addr global ptr
+; CHECK-DISABLED: @kernel_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
 ;.
 define weak i32 @__kmpc_target_init(ptr %0, ptr) {
 ; CHECK-LABEL: define {{[^@]+}}@__kmpc_target_init
@@ -226,10 +226,8 @@ exit:
   ret void
 }
 
-; CHECK: declare noalias ptr @__kmpc_alloc_shared(i64)
 declare ptr @__kmpc_alloc_shared(i64)
 
-; CHECK: declare void @__kmpc_free_shared(ptr allocptr nocapture, i64)
 declare void @__kmpc_free_shared(ptr, i64)
 
 declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
@@ -271,30 +269,30 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
 ; CHECK-DISABLED: attributes #[[ATTR5]] = { nosync nounwind memory(write) }
 ; CHECK-DISABLED: attributes #[[ATTR6]] = { nounwind }
 ;.
-; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
-; CHECK: [[META1:![0-9]+]] = !DIFile(filename: "remove_globalization.c", directory: "/tmp/remove_globalization.c")
-; CHECK: [[META2:![0-9]+]] = !{}
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
+; CHECK: [[META1]] = !DIFile(filename: "remove_globalization.c", directory: {{.*}})
+; CHECK: [[META2]] = !{}
 ; CHECK: [[META3:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
 ; CHECK: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
 ; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50}
 ; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; CHECK: [[META7:![0-9]+]] = !{ptr @kernel, !"kernel", i32 1}
-; CHECK: [[DBG8]] = !DILocation(line: 4, column: 2, scope: !9)
-; CHECK: [[META9:![0-9]+]] = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !10, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
-; CHECK: [[META10:![0-9]+]] = !DISubroutineType(types: !2)
+; CHECK: [[DBG8]] = !DILocation(line: 4, column: 2, scope: [[META9:![0-9]+]])
+; CHECK: [[META9]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META10:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]])
+; CHECK: [[META10]] = !DISubroutineType(types: [[META2]])
 ;.
-; CHECK-DISABLED: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
-; CHECK-DISABLED: [[META1:![0-9]+]] = !DIFile(filename: "remove_globalization.c", directory: "/tmp/remove_globalization.c")
-; CHECK-DISABLED: [[META2:![0-9]+]] = !{}
+; CHECK-DISABLED: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
+; CHECK-DISABLED: [[META1]] = !DIFile(filename: "remove_globalization.c", directory: {{.*}})
+; CHECK-DISABLED: [[META2]] = !{}
 ; CHECK-DISABLED: [[META3:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
 ; CHECK-DISABLED: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
 ; CHECK-DISABLED: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50}
 ; CHECK-DISABLED: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; CHECK-DISABLED: [[META7:![0-9]+]] = !{ptr @kernel, !"kernel", i32 1}
-; CHECK-DISABLED: [[DBG8]] = !DILocation(line: 4, column: 2, scope: !9)
-; CHECK-DISABLED: [[META9:![0-9]+]] = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !10, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
-; CHECK-DISABLED: [[META10:![0-9]+]] = !DISubroutineType(types: !2)
-; CHECK-DISABLED: [[DBG11]] = !DILocation(line: 6, column: 2, scope: !9)
+; CHECK-DISABLED: [[DBG8]] = !DILocation(line: 4, column: 2, scope: [[META9:![0-9]+]])
+; CHECK-DISABLED: [[META9]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META10:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]])
+; CHECK-DISABLED: [[META10]] = !DISubroutineType(types: [[META2]])
+; CHECK-DISABLED: [[DBG11]] = !DILocation(line: 6, column: 2, scope: [[META9]])
 ;.
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK-REMARKS: {{.*}}
diff --git a/llvm/test/Transforms/OpenMP/remove_rpc_client.ll b/llvm/test/Transforms/OpenMP/remove_rpc_client.ll
index 18538cbb72cb..e6c7f704d0af 100644
--- a/llvm/test/Transforms/OpenMP/remove_rpc_client.ll
+++ b/llvm/test/Transforms/OpenMP/remove_rpc_client.ll
@@ -6,11 +6,8 @@
 @__llvm_libc_rpc_client = protected local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(1) @client to ptr), align 8
 
 ;.
-; POSTLINK-NOT: @[[CLIENT:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(1) global i32 0, align 8
-; POSTLINK-NOT: @[[__LLVM_LIBC_RPC_CLIENT:[a-zA-Z0-9_$"\\.-]+]] = protected local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(1) @client to ptr), align 8
-;.
-; PRELINK: @[[CLIENT:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(1) global i32 0, align 8
-; PRELINK: @[[__LLVM_LIBC_RPC_CLIENT:[a-zA-Z0-9_$"\\.-]+]] = protected local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(1) @client to ptr), align 8
+; PRELINK: @client = internal addrspace(1) global i32 0, align 8
+; PRELINK: @__llvm_libc_rpc_client = protected local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(1) @client to ptr), align 8
 ;.
 define void @a() {
 ; POSTLINK-LABEL: define {{[^@]+}}@a() {
@@ -27,3 +24,12 @@ define void @a() {
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{i32 7, !"openmp", i32 50}
 !2 = !{i32 7, !"openmp-device", i32 50}
+;.
+; POSTLINK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+; POSTLINK: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; POSTLINK: [[META2:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+;.
+; PRELINK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+; PRELINK: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; PRELINK: [[META2:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+;.
diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll
index 0b88249f2097..0f89b428de7b 100644
--- a/llvm/test/Transforms/OpenMP/replace_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll
@@ -126,16 +126,16 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
 !11 = !DILocation(line: 5, column: 7, scope: !9)
 !12 = !DILocation(line: 5, column: 14, scope: !9)
 ;.
-; CHECK: @[[S:[a-zA-Z0-9_$"\\.-]+]] = external local_unnamed_addr global ptr
+; CHECK: @S = external local_unnamed_addr global ptr
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [113 x i8] c"
-; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; CHECK: @[[FOO_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; CHECK: @[[BAR_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; CHECK: @[[BAZ_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; CHECK: @[[OFFSET:[a-zA-Z0-9_$"\\.-]+]] = global i32 undef
-; CHECK: @[[STACK:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [1024 x i8] undef
-; CHECK: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [16 x i8] poison, align 4
-; CHECK: @[[Y_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK: @foo_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK: @bar_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK: @baz_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK: @offset = global i32 undef
+; CHECK: @stack = internal addrspace(3) global [1024 x i8] undef
+; CHECK: @x_shared = internal addrspace(3) global [16 x i8] poison, align 4
+; CHECK: @y_shared = internal addrspace(3) global [4 x i8] poison, align 4
 ;.
 ; CHECK-LABEL: define {{[^@]+}}@foo
 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -224,9 +224,9 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
 ; CHECK: attributes #[[ATTR7]] = { nosync nounwind memory(write) }
 ; CHECK: attributes #[[ATTR8]] = { nounwind }
 ;.
-; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
-; CHECK: [[META1:![0-9]+]] = !DIFile(filename: "replace_globalization.c", directory: "/tmp/replace_globalization.c")
-; CHECK: [[META2:![0-9]+]] = !{}
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
+; CHECK: [[META1]] = !DIFile(filename: "replace_globalization.c", directory: {{.*}})
+; CHECK: [[META2]] = !{}
 ; CHECK: [[META3:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
 ; CHECK: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
 ; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50}
@@ -234,9 +234,9 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
 ; CHECK: [[META7:![0-9]+]] = !{ptr @foo, !"kernel", i32 1}
 ; CHECK: [[META8:![0-9]+]] = !{ptr @bar, !"kernel", i32 1}
 ; CHECK: [[META9:![0-9]+]] = !{ptr @baz_spmd, !"kernel", i32 1}
-; CHECK: [[DBG10]] = !DILocation(line: 5, column: 14, scope: !11)
-; CHECK: [[META11:![0-9]+]] = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !12, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
-; CHECK: [[META12:![0-9]+]] = !DISubroutineType(types: !2)
+; CHECK: [[DBG10]] = !DILocation(line: 5, column: 14, scope: [[META11:![0-9]+]])
+; CHECK: [[META11]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META12:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]])
+; CHECK: [[META12]] = !DISubroutineType(types: [[META2]])
 ;.
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK-LIMIT: {{.*}}
diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll
index ef95aed24ded..159280ae62a0 100644
--- a/llvm/test/Transforms/OpenMP/spmdization.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization.ll
@@ -137,94 +137,94 @@
 ; NVPTX-DISABLED: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
 ;.
 ; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
-; AMDGPU: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
-; AMDGPU: @[[X_SHARED_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
-; AMDGPU: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
+; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU: @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+; AMDGPU: @x_shared = internal addrspace(3) global [4 x i8] poison, align 4
+; AMDGPU: @x_shared.1 = internal addrspace(3) global [4 x i8] poison, align 4
+; AMDGPU: @__omp_outlined__9_wrapper.ID = private constant i8 undef
 ;.
 ; NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
-; NVPTX: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
-; NVPTX: @[[X_SHARED1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
-; NVPTX: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
+; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX: @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+; NVPTX: @x_shared = internal addrspace(3) global [4 x i8] poison, align 4
+; NVPTX: @x_shared1 = internal addrspace(3) global [4 x i8] poison, align 4
+; NVPTX: @__omp_outlined__9_wrapper.ID = private constant i8 undef
 ;.
 ; AMDGPU-DISABLED1: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; AMDGPU-DISABLED1: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU-DISABLED1: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED1: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED1: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED1: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED1: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED1: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED1: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
-; AMDGPU-DISABLED1: @[[X_SHARED_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
-; AMDGPU-DISABLED1: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU-DISABLED1: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU-DISABLED1: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU-DISABLED1: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU-DISABLED1: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
+; AMDGPU-DISABLED1: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED1: @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED1: @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED1: @x_shared = internal addrspace(3) global [4 x i8] poison, align 4
+; AMDGPU-DISABLED1: @x_shared.1 = internal addrspace(3) global [4 x i8] poison, align 4
+; AMDGPU-DISABLED1: @__omp_outlined__1_wrapper.ID = private constant i8 undef
+; AMDGPU-DISABLED1: @__omp_outlined__3_wrapper.ID = private constant i8 undef
+; AMDGPU-DISABLED1: @__omp_outlined__5_wrapper.ID = private constant i8 undef
+; AMDGPU-DISABLED1: @__omp_outlined__7_wrapper.ID = private constant i8 undef
+; AMDGPU-DISABLED1: @__omp_outlined__9_wrapper.ID = private constant i8 undef
 ;.
 ; AMDGPU-DISABLED2: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; AMDGPU-DISABLED2: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU-DISABLED2: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED2: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED2: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED2: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED2: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED2: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU-DISABLED2: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
-; AMDGPU-DISABLED2: @[[X_SHARED_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
-; AMDGPU-DISABLED2: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU-DISABLED2: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU-DISABLED2: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU-DISABLED2: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; AMDGPU-DISABLED2: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
+; AMDGPU-DISABLED2: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED2: @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED2: @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU-DISABLED2: @x_shared = internal addrspace(3) global [4 x i8] poison, align 4
+; AMDGPU-DISABLED2: @x_shared.1 = internal addrspace(3) global [4 x i8] poison, align 4
+; AMDGPU-DISABLED2: @__omp_outlined__1_wrapper.ID = private constant i8 undef
+; AMDGPU-DISABLED2: @__omp_outlined__3_wrapper.ID = private constant i8 undef
+; AMDGPU-DISABLED2: @__omp_outlined__5_wrapper.ID = private constant i8 undef
+; AMDGPU-DISABLED2: @__omp_outlined__7_wrapper.ID = private constant i8 undef
+; AMDGPU-DISABLED2: @__omp_outlined__9_wrapper.ID = private constant i8 undef
 ;.
 ; NVPTX-DISABLED1: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; NVPTX-DISABLED1: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX-DISABLED1: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED1: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED1: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED1: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED1: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED1: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED1: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
-; NVPTX-DISABLED1: @[[X_SHARED1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
-; NVPTX-DISABLED1: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX-DISABLED1: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX-DISABLED1: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX-DISABLED1: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX-DISABLED1: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
+; NVPTX-DISABLED1: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED1: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED1: @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED1: @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED1: @x_shared = internal addrspace(3) global [4 x i8] poison, align 4
+; NVPTX-DISABLED1: @x_shared1 = internal addrspace(3) global [4 x i8] poison, align 4
+; NVPTX-DISABLED1: @__omp_outlined__1_wrapper.ID = private constant i8 undef
+; NVPTX-DISABLED1: @__omp_outlined__3_wrapper.ID = private constant i8 undef
+; NVPTX-DISABLED1: @__omp_outlined__5_wrapper.ID = private constant i8 undef
+; NVPTX-DISABLED1: @__omp_outlined__7_wrapper.ID = private constant i8 undef
+; NVPTX-DISABLED1: @__omp_outlined__9_wrapper.ID = private constant i8 undef
 ;.
 ; NVPTX-DISABLED2: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; NVPTX-DISABLED2: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX-DISABLED2: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED2: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED2: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED2: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED2: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED2: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX-DISABLED2: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
-; NVPTX-DISABLED2: @[[X_SHARED1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4
-; NVPTX-DISABLED2: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX-DISABLED2: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX-DISABLED2: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX-DISABLED2: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
-; NVPTX-DISABLED2: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
+; NVPTX-DISABLED2: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED2: @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED2: @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED2: @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX-DISABLED2: @x_shared = internal addrspace(3) global [4 x i8] poison, align 4
+; NVPTX-DISABLED2: @x_shared1 = internal addrspace(3) global [4 x i8] poison, align 4
+; NVPTX-DISABLED2: @__omp_outlined__1_wrapper.ID = private constant i8 undef
+; NVPTX-DISABLED2: @__omp_outlined__3_wrapper.ID = private constant i8 undef
+; NVPTX-DISABLED2: @__omp_outlined__5_wrapper.ID = private constant i8 undef
+; NVPTX-DISABLED2: @__omp_outlined__7_wrapper.ID = private constant i8 undef
+; NVPTX-DISABLED2: @__omp_outlined__9_wrapper.ID = private constant i8 undef
 ;.
 define weak void @__omp_offloading_fd02_2044372e_sequential_loop_l5() #0 {
 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5
@@ -4499,19 +4499,19 @@ attributes #11 = { convergent }
 ; AMDGPU: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; AMDGPU: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; AMDGPU: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; AMDGPU: [[META17:![0-9]+]] = !{!"clang version 14.0.0"}
-; AMDGPU: [[TBAA18]] = !{!19, !19, i64 0}
-; AMDGPU: [[META19:![0-9]+]] = !{!"int", !20, i64 0}
-; AMDGPU: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0}
-; AMDGPU: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"}
-; AMDGPU: [[LOOP22]] = distinct !{!22, !23, !24}
-; AMDGPU: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"}
-; AMDGPU: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
-; AMDGPU: [[LOOP25]] = distinct !{!25, !23, !24}
-; AMDGPU: [[TBAA26]] = !{!27, !27, i64 0}
-; AMDGPU: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
-; AMDGPU: [[LOOP28]] = distinct !{!28, !23, !24}
-; AMDGPU: [[LOOP29]] = distinct !{!29, !23, !24}
+; AMDGPU: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; AMDGPU: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
+; AMDGPU: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0}
+; AMDGPU: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0}
+; AMDGPU: [[META21]] = !{!"Simple C/C++ TBAA"}
+; AMDGPU: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]}
+; AMDGPU: [[META23]] = !{!"llvm.loop.mustprogress"}
+; AMDGPU: [[META24]] = !{!"llvm.loop.unroll.disable"}
+; AMDGPU: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]}
+; AMDGPU: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0}
+; AMDGPU: [[META27]] = !{!"any pointer", [[META20]], i64 0}
+; AMDGPU: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]}
+; AMDGPU: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]}
 ;.
 ; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
 ; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
@@ -4530,19 +4530,19 @@ attributes #11 = { convergent }
 ; NVPTX: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; NVPTX: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; NVPTX: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; NVPTX: [[META17:![0-9]+]] = !{!"clang version 14.0.0"}
-; NVPTX: [[TBAA18]] = !{!19, !19, i64 0}
-; NVPTX: [[META19:![0-9]+]] = !{!"int", !20, i64 0}
-; NVPTX: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0}
-; NVPTX: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"}
-; NVPTX: [[LOOP22]] = distinct !{!22, !23, !24}
-; NVPTX: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"}
-; NVPTX: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
-; NVPTX: [[LOOP25]] = distinct !{!25, !23, !24}
-; NVPTX: [[TBAA26]] = !{!27, !27, i64 0}
-; NVPTX: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
-; NVPTX: [[LOOP28]] = distinct !{!28, !23, !24}
-; NVPTX: [[LOOP29]] = distinct !{!29, !23, !24}
+; NVPTX: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; NVPTX: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
+; NVPTX: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0}
+; NVPTX: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0}
+; NVPTX: [[META21]] = !{!"Simple C/C++ TBAA"}
+; NVPTX: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]}
+; NVPTX: [[META23]] = !{!"llvm.loop.mustprogress"}
+; NVPTX: [[META24]] = !{!"llvm.loop.unroll.disable"}
+; NVPTX: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]}
+; NVPTX: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0}
+; NVPTX: [[META27]] = !{!"any pointer", [[META20]], i64 0}
+; NVPTX: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]}
+; NVPTX: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]}
 ;.
 ; AMDGPU-DISABLED1: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
 ; AMDGPU-DISABLED1: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
@@ -4561,19 +4561,19 @@ attributes #11 = { convergent }
 ; AMDGPU-DISABLED1: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; AMDGPU-DISABLED1: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; AMDGPU-DISABLED1: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; AMDGPU-DISABLED1: [[META17:![0-9]+]] = !{!"clang version 14.0.0"}
-; AMDGPU-DISABLED1: [[TBAA18]] = !{!19, !19, i64 0}
-; AMDGPU-DISABLED1: [[META19:![0-9]+]] = !{!"int", !20, i64 0}
-; AMDGPU-DISABLED1: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0}
-; AMDGPU-DISABLED1: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"}
-; AMDGPU-DISABLED1: [[LOOP22]] = distinct !{!22, !23, !24}
-; AMDGPU-DISABLED1: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"}
-; AMDGPU-DISABLED1: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
-; AMDGPU-DISABLED1: [[LOOP25]] = distinct !{!25, !23, !24}
-; AMDGPU-DISABLED1: [[TBAA26]] = !{!27, !27, i64 0}
-; AMDGPU-DISABLED1: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
-; AMDGPU-DISABLED1: [[LOOP28]] = distinct !{!28, !23, !24}
-; AMDGPU-DISABLED1: [[LOOP29]] = distinct !{!29, !23, !24}
+; AMDGPU-DISABLED1: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; AMDGPU-DISABLED1: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
+; AMDGPU-DISABLED1: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0}
+; AMDGPU-DISABLED1: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0}
+; AMDGPU-DISABLED1: [[META21]] = !{!"Simple C/C++ TBAA"}
+; AMDGPU-DISABLED1: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]}
+; AMDGPU-DISABLED1: [[META23]] = !{!"llvm.loop.mustprogress"}
+; AMDGPU-DISABLED1: [[META24]] = !{!"llvm.loop.unroll.disable"}
+; AMDGPU-DISABLED1: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]}
+; AMDGPU-DISABLED1: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0}
+; AMDGPU-DISABLED1: [[META27]] = !{!"any pointer", [[META20]], i64 0}
+; AMDGPU-DISABLED1: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]}
+; AMDGPU-DISABLED1: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]}
 ;.
 ; AMDGPU-DISABLED2: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
 ; AMDGPU-DISABLED2: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
@@ -4592,19 +4592,19 @@ attributes #11 = { convergent }
 ; AMDGPU-DISABLED2: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; AMDGPU-DISABLED2: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; AMDGPU-DISABLED2: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; AMDGPU-DISABLED2: [[META17:![0-9]+]] = !{!"clang version 14.0.0"}
-; AMDGPU-DISABLED2: [[TBAA18]] = !{!19, !19, i64 0}
-; AMDGPU-DISABLED2: [[META19:![0-9]+]] = !{!"int", !20, i64 0}
-; AMDGPU-DISABLED2: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0}
-; AMDGPU-DISABLED2: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"}
-; AMDGPU-DISABLED2: [[LOOP22]] = distinct !{!22, !23, !24}
-; AMDGPU-DISABLED2: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"}
-; AMDGPU-DISABLED2: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
-; AMDGPU-DISABLED2: [[LOOP25]] = distinct !{!25, !23, !24}
-; AMDGPU-DISABLED2: [[TBAA26]] = !{!27, !27, i64 0}
-; AMDGPU-DISABLED2: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
-; AMDGPU-DISABLED2: [[LOOP28]] = distinct !{!28, !23, !24}
-; AMDGPU-DISABLED2: [[LOOP29]] = distinct !{!29, !23, !24}
+; AMDGPU-DISABLED2: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; AMDGPU-DISABLED2: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
+; AMDGPU-DISABLED2: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0}
+; AMDGPU-DISABLED2: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0}
+; AMDGPU-DISABLED2: [[META21]] = !{!"Simple C/C++ TBAA"}
+; AMDGPU-DISABLED2: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]}
+; AMDGPU-DISABLED2: [[META23]] = !{!"llvm.loop.mustprogress"}
+; AMDGPU-DISABLED2: [[META24]] = !{!"llvm.loop.unroll.disable"}
+; AMDGPU-DISABLED2: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]}
+; AMDGPU-DISABLED2: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0}
+; AMDGPU-DISABLED2: [[META27]] = !{!"any pointer", [[META20]], i64 0}
+; AMDGPU-DISABLED2: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]}
+; AMDGPU-DISABLED2: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]}
 ;.
 ; NVPTX-DISABLED1: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
 ; NVPTX-DISABLED1: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
@@ -4623,19 +4623,19 @@ attributes #11 = { convergent }
 ; NVPTX-DISABLED1: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; NVPTX-DISABLED1: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; NVPTX-DISABLED1: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; NVPTX-DISABLED1: [[META17:![0-9]+]] = !{!"clang version 14.0.0"}
-; NVPTX-DISABLED1: [[TBAA18]] = !{!19, !19, i64 0}
-; NVPTX-DISABLED1: [[META19:![0-9]+]] = !{!"int", !20, i64 0}
-; NVPTX-DISABLED1: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0}
-; NVPTX-DISABLED1: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"}
-; NVPTX-DISABLED1: [[LOOP22]] = distinct !{!22, !23, !24}
-; NVPTX-DISABLED1: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"}
-; NVPTX-DISABLED1: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
-; NVPTX-DISABLED1: [[LOOP25]] = distinct !{!25, !23, !24}
-; NVPTX-DISABLED1: [[TBAA26]] = !{!27, !27, i64 0}
-; NVPTX-DISABLED1: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
-; NVPTX-DISABLED1: [[LOOP28]] = distinct !{!28, !23, !24}
-; NVPTX-DISABLED1: [[LOOP29]] = distinct !{!29, !23, !24}
+; NVPTX-DISABLED1: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; NVPTX-DISABLED1: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
+; NVPTX-DISABLED1: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0}
+; NVPTX-DISABLED1: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0}
+; NVPTX-DISABLED1: [[META21]] = !{!"Simple C/C++ TBAA"}
+; NVPTX-DISABLED1: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]}
+; NVPTX-DISABLED1: [[META23]] = !{!"llvm.loop.mustprogress"}
+; NVPTX-DISABLED1: [[META24]] = !{!"llvm.loop.unroll.disable"}
+; NVPTX-DISABLED1: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]}
+; NVPTX-DISABLED1: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0}
+; NVPTX-DISABLED1: [[META27]] = !{!"any pointer", [[META20]], i64 0}
+; NVPTX-DISABLED1: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]}
+; NVPTX-DISABLED1: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]}
 ;.
 ; NVPTX-DISABLED2: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
 ; NVPTX-DISABLED2: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
@@ -4654,17 +4654,17 @@ attributes #11 = { convergent }
 ; NVPTX-DISABLED2: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; NVPTX-DISABLED2: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; NVPTX-DISABLED2: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; NVPTX-DISABLED2: [[META17:![0-9]+]] = !{!"clang version 14.0.0"}
-; NVPTX-DISABLED2: [[TBAA18]] = !{!19, !19, i64 0}
-; NVPTX-DISABLED2: [[META19:![0-9]+]] = !{!"int", !20, i64 0}
-; NVPTX-DISABLED2: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0}
-; NVPTX-DISABLED2: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"}
-; NVPTX-DISABLED2: [[LOOP22]] = distinct !{!22, !23, !24}
-; NVPTX-DISABLED2: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"}
-; NVPTX-DISABLED2: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
-; NVPTX-DISABLED2: [[LOOP25]] = distinct !{!25, !23, !24}
-; NVPTX-DISABLED2: [[TBAA26]] = !{!27, !27, i64 0}
-; NVPTX-DISABLED2: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
-; NVPTX-DISABLED2: [[LOOP28]] = distinct !{!28, !23, !24}
-; NVPTX-DISABLED2: [[LOOP29]] = distinct !{!29, !23, !24}
+; NVPTX-DISABLED2: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; NVPTX-DISABLED2: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
+; NVPTX-DISABLED2: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0}
+; NVPTX-DISABLED2: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0}
+; NVPTX-DISABLED2: [[META21]] = !{!"Simple C/C++ TBAA"}
+; NVPTX-DISABLED2: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]}
+; NVPTX-DISABLED2: [[META23]] = !{!"llvm.loop.mustprogress"}
+; NVPTX-DISABLED2: [[META24]] = !{!"llvm.loop.unroll.disable"}
+; NVPTX-DISABLED2: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]}
+; NVPTX-DISABLED2: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0}
+; NVPTX-DISABLED2: [[META27]] = !{!"any pointer", [[META20]], i64 0}
+; NVPTX-DISABLED2: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]}
+; NVPTX-DISABLED2: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]}
 ;.
diff --git a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll
index 205669572e41..2f43a4e4286a 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll
@@ -24,9 +24,9 @@ target triple = "nvptx64"
 ; Function Attrs: alwaysinline convergent norecurse nounwind
 ;.
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; CHECK: @[[__OMP_OFFLOADING_FD02_404433C2_MAIN_L5_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK: @__omp_offloading_fd02_404433c2_main_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
 ;.
 define weak void @__omp_offloading_fd02_404433c2_main_l5(ptr %dyn, ptr nonnull align 8 dereferenceable(8) %x) local_unnamed_addr #0 {
 ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_fd02_404433c2_main_l5
@@ -160,9 +160,9 @@ attributes #6 = { convergent nounwind "llvm.assume"="ompx_spmd_amenable" }
 ; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; CHECK: [[META7:![0-9]+]] = !{!"clang version 14.0.0"}
-; CHECK: [[TBAA8]] = !{!9, !9, i64 0}
-; CHECK: [[META9:![0-9]+]] = !{!"double", !10, i64 0}
-; CHECK: [[META10:![0-9]+]] = !{!"omnipotent char", !11, i64 0}
-; CHECK: [[META11:![0-9]+]] = !{!"Simple C/C++ TBAA"}
+; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; CHECK: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0}
+; CHECK: [[META9]] = !{!"double", [[META10:![0-9]+]], i64 0}
+; CHECK: [[META10]] = !{!"omnipotent char", [[META11:![0-9]+]], i64 0}
+; CHECK: [[META11]] = !{!"Simple C/C++ TBAA"}
 ;.
diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
index 45e86e4ce2ef..b2e14dce94d5 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
@@ -47,16 +47,16 @@ target triple = "nvptx64"
 ; Function Attrs: convergent norecurse nounwind
 ;.
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; CHECK: @[[LOCGLOB:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr addrspace(5) global i32 43
-; CHECK: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK: @LocGlob = private unnamed_addr addrspace(5) global i32 43
+; CHECK: @__omp_offloading_2a_fbfa7a_sequential_loop_l6_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
 ;.
 ; CHECK-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; CHECK-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; CHECK-DISABLED: @[[LOCGLOB:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr addrspace(5) global i32 43
-; CHECK-DISABLED: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; CHECK-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
+; CHECK-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK-DISABLED: @LocGlob = private unnamed_addr addrspace(5) global i32 43
+; CHECK-DISABLED: @__omp_offloading_2a_fbfa7a_sequential_loop_l6_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK-DISABLED: @__omp_outlined__1_wrapper.ID = private constant i8 undef
 ;.
 define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x, i64 %N) #0 {
 ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2a_fbfa7a_sequential_loop_l6
@@ -84,9 +84,9 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
 ; CHECK:       region.guarded:
-; CHECK-NEXT:    store i32 0, ptr [[X]], align 4, !noalias !8
-; CHECK-NEXT:    store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias !8
-; CHECK-NEXT:    store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias !8
+; CHECK-NEXT:    store i32 0, ptr [[X]], align 4, !noalias [[META8:![0-9]+]]
+; CHECK-NEXT:    store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META8]]
+; CHECK-NEXT:    store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META8]]
 ; CHECK-NEXT:    br label [[REGION_GUARDED_END:%.*]]
 ; CHECK:       region.guarded.end:
 ; CHECK-NEXT:    br label [[REGION_BARRIER]]
@@ -111,7 +111,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[REGION_GUARDED4:%.*]], label [[REGION_BARRIER2:%.*]]
 ; CHECK:       region.guarded4:
-; CHECK-NEXT:    store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias !8
+; CHECK-NEXT:    store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META8]]
 ; CHECK-NEXT:    br label [[REGION_GUARDED_END1:%.*]]
 ; CHECK:       region.guarded.end1:
 ; CHECK-NEXT:    br label [[REGION_BARRIER2]]
@@ -123,7 +123,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x
 ; CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       __omp_outlined__.exit:
 ; CHECK-NEXT:    call void @__kmpc_parallel_51(ptr null, i32 0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr null, i64 0)
-; CHECK-NEXT:    [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias !8
+; CHECK-NEXT:    [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias [[META8]]
 ; CHECK-NEXT:    [[IDXPROM6_I:%.*]] = sext i32 [[CALL_I]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM6_I]]
 ; CHECK-NEXT:    br label [[REGION_CHECK_TID10:%.*]]
@@ -132,7 +132,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[REGION_GUARDED9:%.*]], label [[REGION_BARRIER7:%.*]]
 ; CHECK:       region.guarded9:
-; CHECK-NEXT:    store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias !8
+; CHECK-NEXT:    store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META8]]
 ; CHECK-NEXT:    br label [[REGION_GUARDED_END6:%.*]]
 ; CHECK:       region.guarded.end6:
 ; CHECK-NEXT:    br label [[REGION_BARRIER7]]
@@ -140,7 +140,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x
 ; CHECK-NEXT:    call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP6]])
 ; CHECK-NEXT:    br label [[REGION_EXIT8:%.*]]
 ; CHECK:       region.exit8:
-; CHECK-NEXT:    [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias !8
+; CHECK-NEXT:    [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]]
 ; CHECK-NEXT:    [[IDXPROM9_I:%.*]] = sext i32 [[CALL8_I]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]]
 ; CHECK-NEXT:    br label [[REGION_CHECK_TID15:%.*]]
@@ -149,7 +149,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[REGION_GUARDED14:%.*]], label [[REGION_BARRIER12:%.*]]
 ; CHECK:       region.guarded14:
-; CHECK-NEXT:    store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias !8
+; CHECK-NEXT:    store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META8]]
 ; CHECK-NEXT:    br label [[REGION_GUARDED_END11:%.*]]
 ; CHECK:       region.guarded.end11:
 ; CHECK-NEXT:    br label [[REGION_BARRIER12]]
@@ -157,7 +157,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x
 ; CHECK-NEXT:    call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP8]])
 ; CHECK-NEXT:    br label [[REGION_EXIT13:%.*]]
 ; CHECK:       region.exit13:
-; CHECK-NEXT:    [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias !8
+; CHECK-NEXT:    [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]]
 ; CHECK-NEXT:    [[IDXPROM12_I:%.*]] = sext i32 [[CALL11_I]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]]
 ; CHECK-NEXT:    br label [[REGION_CHECK_TID20:%.*]]
@@ -166,7 +166,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[REGION_GUARDED19:%.*]], label [[REGION_BARRIER17:%.*]]
 ; CHECK:       region.guarded19:
-; CHECK-NEXT:    store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias !8
+; CHECK-NEXT:    store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META8]]
 ; CHECK-NEXT:    br label [[REGION_GUARDED_END16:%.*]]
 ; CHECK:       region.guarded.end16:
 ; CHECK-NEXT:    br label [[REGION_BARRIER17]]
@@ -174,9 +174,9 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x
 ; CHECK-NEXT:    call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP10]])
 ; CHECK-NEXT:    br label [[REGION_EXIT18:%.*]]
 ; CHECK:       region.exit18:
-; CHECK-NEXT:    [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias !8
-; CHECK-NEXT:    [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias !8
-; CHECK-NEXT:    [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias !8
+; CHECK-NEXT:    [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]]
+; CHECK-NEXT:    [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]]
+; CHECK-NEXT:    [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]]
 ; CHECK-NEXT:    call void @__kmpc_target_deinit() #[[ATTR6]]
 ; CHECK-NEXT:    ret void
 ; CHECK:       worker.exit:
@@ -230,13 +230,13 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x
 ; CHECK-DISABLED-NEXT:    [[SELECT:%.*]] = select i1 [[C]], ptr [[AL32]], ptr addrspacecast (ptr addrspace(5) @LocGlob to ptr)
 ; CHECK-DISABLED-NEXT:    store ptr [[SELECT]], ptr [[LOC]], align 8
 ; CHECK-DISABLED-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]]
-; CHECK-DISABLED-NEXT:    store i32 0, ptr [[X]], align 4, !noalias !8
+; CHECK-DISABLED-NEXT:    store i32 0, ptr [[X]], align 4, !noalias [[META8:![0-9]+]]
 ; CHECK-DISABLED-NEXT:    [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1
-; CHECK-DISABLED-NEXT:    store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias !8
+; CHECK-DISABLED-NEXT:    store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META8]]
 ; CHECK-DISABLED-NEXT:    [[SEXT:%.*]] = shl i64 [[N]], 32
 ; CHECK-DISABLED-NEXT:    [[IDXPROM_I:%.*]] = ashr exact i64 [[SEXT]], 32
 ; CHECK-DISABLED-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM_I]]
-; CHECK-DISABLED-NEXT:    store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias !8
+; CHECK-DISABLED-NEXT:    store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META8]]
 ; CHECK-DISABLED-NEXT:    call void @usei8ptr(ptr nocapture [[HEAP2STACK_H2S]]) #[[ATTR9:[0-9]+]]
 ; CHECK-DISABLED-NEXT:    br label [[FOR_COND_I:%.*]]
 ; CHECK-DISABLED:       for.cond.i:
@@ -248,26 +248,26 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x
 ; CHECK-DISABLED-NEXT:    [[SUB3_I:%.*]] = add nsw i32 [[I_0_I]], -1
 ; CHECK-DISABLED-NEXT:    [[IDXPROM4_I:%.*]] = zext i32 [[I_0_I]] to i64
 ; CHECK-DISABLED-NEXT:    [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM4_I]]
-; CHECK-DISABLED-NEXT:    store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias !8
+; CHECK-DISABLED-NEXT:    store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META8]]
 ; CHECK-DISABLED-NEXT:    [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1
 ; CHECK-DISABLED-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK-DISABLED:       __omp_outlined__.exit:
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_parallel_51(ptr null, i32 0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr null, i64 0)
-; CHECK-DISABLED-NEXT:    [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias !8
+; CHECK-DISABLED-NEXT:    [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias [[META8]]
 ; CHECK-DISABLED-NEXT:    [[IDXPROM6_I:%.*]] = sext i32 [[CALL_I]] to i64
 ; CHECK-DISABLED-NEXT:    [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM6_I]]
-; CHECK-DISABLED-NEXT:    store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias !8
-; CHECK-DISABLED-NEXT:    [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias !8
+; CHECK-DISABLED-NEXT:    store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META8]]
+; CHECK-DISABLED-NEXT:    [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]]
 ; CHECK-DISABLED-NEXT:    [[IDXPROM9_I:%.*]] = sext i32 [[CALL8_I]] to i64
 ; CHECK-DISABLED-NEXT:    [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]]
-; CHECK-DISABLED-NEXT:    store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias !8
-; CHECK-DISABLED-NEXT:    [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias !8
+; CHECK-DISABLED-NEXT:    store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META8]]
+; CHECK-DISABLED-NEXT:    [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]]
 ; CHECK-DISABLED-NEXT:    [[IDXPROM12_I:%.*]] = sext i32 [[CALL11_I]] to i64
 ; CHECK-DISABLED-NEXT:    [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]]
-; CHECK-DISABLED-NEXT:    store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias !8
-; CHECK-DISABLED-NEXT:    [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias !8
-; CHECK-DISABLED-NEXT:    [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias !8
-; CHECK-DISABLED-NEXT:    [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias !8
+; CHECK-DISABLED-NEXT:    store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META8]]
+; CHECK-DISABLED-NEXT:    [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]]
+; CHECK-DISABLED-NEXT:    [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]]
+; CHECK-DISABLED-NEXT:    [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]]
 ; CHECK-DISABLED-NEXT:    call void @__kmpc_target_deinit() #[[ATTR6]]
 ; CHECK-DISABLED-NEXT:    ret void
 ; CHECK-DISABLED:       worker.exit:
@@ -453,12 +453,12 @@ attributes #5 = { convergent nounwind "llvm.assume"="omp_no_openmp,ompx_spmd_ame
 ; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; CHECK: [[META7:![0-9]+]] = !{!"clang version 14.0.0"}
-; CHECK: [[META8:![0-9]+]] = !{!9}
-; CHECK: [[META9:![0-9]+]] = distinct !{!9, !10, !"__omp_outlined__: %__context"}
-; CHECK: [[META10:![0-9]+]] = distinct !{!10, !"__omp_outlined__"}
-; CHECK: [[LOOP11]] = distinct !{!11, !12}
-; CHECK: [[META12:![0-9]+]] = !{!"llvm.loop.mustprogress"}
+; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; CHECK: [[META8]] = !{[[META9:![0-9]+]]}
+; CHECK: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"__omp_outlined__: %__context"}
+; CHECK: [[META10]] = distinct !{[[META10]], !"__omp_outlined__"}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META12:![0-9]+]]}
+; CHECK: [[META12]] = !{!"llvm.loop.mustprogress"}
 ;.
 ; CHECK-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 42, i32 16513658, !"sequential_loop", i32 6, i32 0}
 ; CHECK-DISABLED: [[META1:![0-9]+]] = !{ptr @__omp_offloading_2a_fbfa7a_sequential_loop_l6, !"kernel", i32 1}
@@ -467,10 +467,10 @@ attributes #5 = { convergent nounwind "llvm.assume"="omp_no_openmp,ompx_spmd_ame
 ; CHECK-DISABLED: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; CHECK-DISABLED: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; CHECK-DISABLED: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; CHECK-DISABLED: [[META7:![0-9]+]] = !{!"clang version 14.0.0"}
-; CHECK-DISABLED: [[META8:![0-9]+]] = !{!9}
-; CHECK-DISABLED: [[META9:![0-9]+]] = distinct !{!9, !10, !"__omp_outlined__: %__context"}
-; CHECK-DISABLED: [[META10:![0-9]+]] = distinct !{!10, !"__omp_outlined__"}
-; CHECK-DISABLED: [[LOOP11]] = distinct !{!11, !12}
-; CHECK-DISABLED: [[META12:![0-9]+]] = !{!"llvm.loop.mustprogress"}
+; CHECK-DISABLED: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; CHECK-DISABLED: [[META8]] = !{[[META9:![0-9]+]]}
+; CHECK-DISABLED: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"__omp_outlined__: %__context"}
+; CHECK-DISABLED: [[META10]] = distinct !{[[META10]], !"__omp_outlined__"}
+; CHECK-DISABLED: [[LOOP11]] = distinct !{[[LOOP11]], [[META12:![0-9]+]]}
+; CHECK-DISABLED: [[META12]] = !{!"llvm.loop.mustprogress"}
 ;.
diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
index c15bae1d4481..11405b7eb447 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
@@ -42,19 +42,19 @@ target triple = "nvptx64"
 
 ;.
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32, align 4
-; CHECK: @[[__OMP_OFFLOADING_2B_10393B5_SPMD_L12_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; CHECK: @[[__OMP_OFFLOADING_2B_10393B5_GENERIC_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK: @G = external global i32, align 4
+; CHECK: @__omp_offloading_2b_10393b5_spmd_l12_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK: @__omp_offloading_2b_10393b5_generic_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
 ;.
 ; CHECK-DISABLE-SPMDIZATION: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; CHECK-DISABLE-SPMDIZATION: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; CHECK-DISABLE-SPMDIZATION: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; CHECK-DISABLE-SPMDIZATION: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32, align 4
-; CHECK-DISABLE-SPMDIZATION: @[[__OMP_OFFLOADING_2B_10393B5_SPMD_L12_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; CHECK-DISABLE-SPMDIZATION: @[[__OMP_OFFLOADING_2B_10393B5_GENERIC_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK-DISABLE-SPMDIZATION: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK-DISABLE-SPMDIZATION: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK-DISABLE-SPMDIZATION: @G = external global i32, align 4
+; CHECK-DISABLE-SPMDIZATION: @__omp_offloading_2b_10393b5_spmd_l12_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK-DISABLE-SPMDIZATION: @__omp_offloading_2b_10393b5_generic_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
 define weak void @__omp_offloading_2b_10393b5_spmd_l12(ptr %dyn) "kernel" #0 {
 ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2b_10393b5_spmd_l12
@@ -365,7 +365,7 @@ attributes #5 = { convergent }
 ; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; CHECK: [[META7:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; CHECK: [[META8:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; CHECK: [[META9:![0-9]+]] = !{!"clang version 14.0.0"}
+; CHECK: [[META9:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
 ;.
 ; CHECK-DISABLE-SPMDIZATION: [[META0:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0}
 ; CHECK-DISABLE-SPMDIZATION: [[META1:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1}
@@ -376,5 +376,5 @@ attributes #5 = { convergent }
 ; CHECK-DISABLE-SPMDIZATION: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; CHECK-DISABLE-SPMDIZATION: [[META7:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; CHECK-DISABLE-SPMDIZATION: [[META8:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; CHECK-DISABLE-SPMDIZATION: [[META9:![0-9]+]] = !{!"clang version 14.0.0"}
+; CHECK-DISABLE-SPMDIZATION: [[META9:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
 ;.
diff --git a/llvm/test/Transforms/OpenMP/spmdization_indirect.ll b/llvm/test/Transforms/OpenMP/spmdization_indirect.ll
index 33885af2f992..f348825446c6 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_indirect.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_indirect.ll
@@ -15,18 +15,18 @@
 
 ;.
 ; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; AMDGPU: @[[SPMD_CALLEES_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[SPMD_CALLEES_METADATA_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[SPMD_AND_NON_SPMD_CALLEES_METADATA_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; AMDGPU: @[[SPMD_AND_NON_SPMD_CALLEE_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; AMDGPU: @spmd_callees_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @spmd_and_non_spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; AMDGPU: @spmd_and_non_spmd_callee_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
 ; NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; NVPTX: @[[SPMD_CALLEES_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[SPMD_CALLEES_METADATA_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[SPMD_AND_NON_SPMD_CALLEES_METADATA_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; NVPTX: @[[SPMD_AND_NON_SPMD_CALLEE_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; NVPTX: @spmd_callees_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @spmd_and_non_spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; NVPTX: @spmd_and_non_spmd_callee_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
 define weak void @spmd_callees(i1 %c) #0 {
 ; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees
@@ -1149,19 +1149,19 @@ attributes #11 = { convergent }
 ; AMDGPU: [[META13:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; AMDGPU: [[META14:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; AMDGPU: [[META15:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; AMDGPU: [[META16:![0-9]+]] = !{!"clang version 14.0.0"}
-; AMDGPU: [[TBAA17]] = !{!18, !18, i64 0}
-; AMDGPU: [[META18:![0-9]+]] = !{!"int", !19, i64 0}
-; AMDGPU: [[META19:![0-9]+]] = !{!"omnipotent char", !20, i64 0}
-; AMDGPU: [[META20:![0-9]+]] = !{!"Simple C/C++ TBAA"}
-; AMDGPU: [[LOOP21]] = distinct !{!21, !22, !23}
-; AMDGPU: [[META22:![0-9]+]] = !{!"llvm.loop.mustprogress"}
-; AMDGPU: [[META23:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
-; AMDGPU: [[LOOP24]] = distinct !{!24, !22, !23}
-; AMDGPU: [[TBAA25]] = !{!26, !26, i64 0}
-; AMDGPU: [[META26:![0-9]+]] = !{!"any pointer", !19, i64 0}
-; AMDGPU: [[LOOP27]] = distinct !{!27, !22, !23}
-; AMDGPU: [[LOOP28]] = distinct !{!28, !22, !23}
+; AMDGPU: [[META16:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; AMDGPU: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0}
+; AMDGPU: [[META18]] = !{!"int", [[META19:![0-9]+]], i64 0}
+; AMDGPU: [[META19]] = !{!"omnipotent char", [[META20:![0-9]+]], i64 0}
+; AMDGPU: [[META20]] = !{!"Simple C/C++ TBAA"}
+; AMDGPU: [[LOOP21]] = distinct !{[[LOOP21]], [[META22:![0-9]+]], [[META23:![0-9]+]]}
+; AMDGPU: [[META22]] = !{!"llvm.loop.mustprogress"}
+; AMDGPU: [[META23]] = !{!"llvm.loop.unroll.disable"}
+; AMDGPU: [[LOOP24]] = distinct !{[[LOOP24]], [[META22]], [[META23]]}
+; AMDGPU: [[TBAA25]] = !{[[META26:![0-9]+]], [[META26]], i64 0}
+; AMDGPU: [[META26]] = !{!"any pointer", [[META19]], i64 0}
+; AMDGPU: [[LOOP27]] = distinct !{[[LOOP27]], [[META22]], [[META23]]}
+; AMDGPU: [[LOOP28]] = distinct !{[[LOOP28]], [[META22]], [[META23]]}
 ;.
 ; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"", i32 74, i32 5}
 ; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
@@ -1179,17 +1179,17 @@ attributes #11 = { convergent }
 ; NVPTX: [[META13:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; NVPTX: [[META14:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; NVPTX: [[META15:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; NVPTX: [[META16:![0-9]+]] = !{!"clang version 14.0.0"}
-; NVPTX: [[TBAA17]] = !{!18, !18, i64 0}
-; NVPTX: [[META18:![0-9]+]] = !{!"int", !19, i64 0}
-; NVPTX: [[META19:![0-9]+]] = !{!"omnipotent char", !20, i64 0}
-; NVPTX: [[META20:![0-9]+]] = !{!"Simple C/C++ TBAA"}
-; NVPTX: [[LOOP21]] = distinct !{!21, !22, !23}
-; NVPTX: [[META22:![0-9]+]] = !{!"llvm.loop.mustprogress"}
-; NVPTX: [[META23:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
-; NVPTX: [[LOOP24]] = distinct !{!24, !22, !23}
-; NVPTX: [[TBAA25]] = !{!26, !26, i64 0}
-; NVPTX: [[META26:![0-9]+]] = !{!"any pointer", !19, i64 0}
-; NVPTX: [[LOOP27]] = distinct !{!27, !22, !23}
-; NVPTX: [[LOOP28]] = distinct !{!28, !22, !23}
+; NVPTX: [[META16:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; NVPTX: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0}
+; NVPTX: [[META18]] = !{!"int", [[META19:![0-9]+]], i64 0}
+; NVPTX: [[META19]] = !{!"omnipotent char", [[META20:![0-9]+]], i64 0}
+; NVPTX: [[META20]] = !{!"Simple C/C++ TBAA"}
+; NVPTX: [[LOOP21]] = distinct !{[[LOOP21]], [[META22:![0-9]+]], [[META23:![0-9]+]]}
+; NVPTX: [[META22]] = !{!"llvm.loop.mustprogress"}
+; NVPTX: [[META23]] = !{!"llvm.loop.unroll.disable"}
+; NVPTX: [[LOOP24]] = distinct !{[[LOOP24]], [[META22]], [[META23]]}
+; NVPTX: [[TBAA25]] = !{[[META26:![0-9]+]], [[META26]], i64 0}
+; NVPTX: [[META26]] = !{!"any pointer", [[META19]], i64 0}
+; NVPTX: [[LOOP27]] = distinct !{[[LOOP27]], [[META22]], [[META23]]}
+; NVPTX: [[LOOP28]] = distinct !{[[LOOP28]], [[META22]], [[META23]]}
 ;.
diff --git a/llvm/test/Transforms/OpenMP/spmdization_kernel_env_dep.ll b/llvm/test/Transforms/OpenMP/spmdization_kernel_env_dep.ll
index 7a632dc0a968..ce7b4f89b893 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_kernel_env_dep.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_kernel_env_dep.ll
@@ -11,8 +11,8 @@ target triple = "amdgcn-amd-amdhsa"
 @__omp_offloading_10302_b20a40e_main_l4_kernel_environment = addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy.8 { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) null to ptr), ptr addrspacecast (ptr addrspace(1) null to ptr) }
 
 ;.
-; AMDGPU: @[[ISSPMDMODE:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef
-; AMDGPU: @[[__OMP_OFFLOADING_10302_B20A40E_MAIN_L4_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY_8:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) null to ptr), ptr addrspacecast (ptr addrspace(1) null to ptr) }
+; AMDGPU: @IsSPMDMode = internal addrspace(3) global i32 undef
+; AMDGPU: @__omp_offloading_10302_b20a40e_main_l4_kernel_environment = addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy.8 { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) null to ptr), ptr addrspacecast (ptr addrspace(1) null to ptr) }
 ;.
 define i32 @fputs() {
 ; AMDGPU-LABEL: define {{[^@]+}}@fputs
diff --git a/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll b/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll
index aad9446a0e28..f28f61e05327 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll
@@ -43,19 +43,19 @@ target triple = "nvptx64"
 
 ;.
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(5) global i32, align 4
-; CHECK: @[[__OMP_OFFLOADING_2B_10393B5_SPMD_L12_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; CHECK: @[[__OMP_OFFLOADING_2B_10393B5_GENERIC_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK: @G = external addrspace(5) global i32, align 4
+; CHECK: @__omp_offloading_2b_10393b5_spmd_l12_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK: @__omp_offloading_2b_10393b5_generic_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
 ; CHECK-DISABLE-SPMDIZATION: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
-; CHECK-DISABLE-SPMDIZATION: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
-; CHECK-DISABLE-SPMDIZATION: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
-; CHECK-DISABLE-SPMDIZATION: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(5) global i32, align 4
-; CHECK-DISABLE-SPMDIZATION: @[[__OMP_OFFLOADING_2B_10393B5_SPMD_L12_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; CHECK-DISABLE-SPMDIZATION: @[[__OMP_OFFLOADING_2B_10393B5_GENERIC_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
-; CHECK-DISABLE-SPMDIZATION: @[[__OMP_OUTLINED___WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
+; CHECK-DISABLE-SPMDIZATION: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK-DISABLE-SPMDIZATION: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @[[GLOB0]] }, align 8
+; CHECK-DISABLE-SPMDIZATION: @G = external addrspace(5) global i32, align 4
+; CHECK-DISABLE-SPMDIZATION: @__omp_offloading_2b_10393b5_spmd_l12_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK-DISABLE-SPMDIZATION: @__omp_offloading_2b_10393b5_generic_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
+; CHECK-DISABLE-SPMDIZATION: @__omp_outlined___wrapper.ID = private constant i8 undef
 ;.
 define weak void @__omp_offloading_2b_10393b5_spmd_l12(ptr %dyn) #0 {
 ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2b_10393b5_spmd_l12
@@ -441,7 +441,7 @@ attributes #5 = { convergent }
 ; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; CHECK: [[META7:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; CHECK: [[META8:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; CHECK: [[META9:![0-9]+]] = !{!"clang version 14.0.0"}
+; CHECK: [[META9:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
 ;.
 ; CHECK-DISABLE-SPMDIZATION: [[META0:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0}
 ; CHECK-DISABLE-SPMDIZATION: [[META1:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1}
@@ -452,5 +452,5 @@ attributes #5 = { convergent }
 ; CHECK-DISABLE-SPMDIZATION: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; CHECK-DISABLE-SPMDIZATION: [[META7:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
 ; CHECK-DISABLE-SPMDIZATION: [[META8:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; CHECK-DISABLE-SPMDIZATION: [[META9:![0-9]+]] = !{!"clang version 14.0.0"}
+; CHECK-DISABLE-SPMDIZATION: [[META9:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
 ;.
diff --git a/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll b/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll
index 733d2906fbee..604894c5a77f 100644
--- a/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll
+++ b/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll
@@ -31,27 +31,27 @@ target triple = "amdgcn-amd-amdhsa"
 
 ; Make sure we do not delete the stores to @G without also replacing the load with `1`.
 ;.
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[H:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[QA1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[QB1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[QC1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[QD1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[QA2:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[QB2:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[QC2:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[QD2:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[QA3:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[QB3:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[QC3:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[QD3:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[UAA1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[UAA2:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[UAA3:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[UANA1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
-; CHECK: @[[STR:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1
-; CHECK: @[[KERNEL_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
+; CHECK: @G = internal addrspace(3) global i32 undef, align 4
+; CHECK: @H = internal addrspace(3) global i32 undef, align 4
+; CHECK: @X = internal addrspace(3) global i32 undef, align 4
+; CHECK: @QA1 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @QB1 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @QC1 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @QD1 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @QA2 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @QB2 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @QC2 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @QD2 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @QA3 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @QB3 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @QC3 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @QD3 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @UAA1 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @UAA2 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @UAA3 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @UANA1 = internal addrspace(3) global i32 undef, align 4
+; CHECK: @str = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1
+; CHECK: @kernel_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null }
 ;.
 define void @kernel(ptr %dyn) "kernel" {
 ;
@@ -868,25 +868,47 @@ declare void @llvm.assume(i1)
 ; CGSCC: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
 ; CGSCC: attributes #[[ATTR6]] = { nounwind }
 ;.
-; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
-; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
-; CHECK: [[META2:![0-9]+]] = !{ptr @kernel, !"kernel", i32 1}
-; CHECK: [[META3:![0-9]+]] = !{ptr @kernel2, !"kernel", i32 1}
-; CHECK: [[META4:![0-9]+]] = !{ptr @kernel3, !"kernel", i32 1}
-; CHECK: [[META5:![0-9]+]] = !{ptr @kernel4a1, !"kernel", i32 1}
-; CHECK: [[META6:![0-9]+]] = !{ptr @kernel4b1, !"kernel", i32 1}
-; CHECK: [[META7:![0-9]+]] = !{ptr @kernel4a2, !"kernel", i32 1}
-; CHECK: [[META8:![0-9]+]] = !{ptr @kernel4b2, !"kernel", i32 1}
-; CHECK: [[META9:![0-9]+]] = !{ptr @kernel4a3, !"kernel", i32 1}
-; CHECK: [[META10:![0-9]+]] = !{ptr @kernel4b3, !"kernel", i32 1}
-; CHECK: [[META11:![0-9]+]] = !{ptr @kernel4c1, !"kernel", i32 1}
-; CHECK: [[META12:![0-9]+]] = !{ptr @kernel4d1, !"kernel", i32 1}
-; CHECK: [[META13:![0-9]+]] = !{ptr @kernel4c2, !"kernel", i32 1}
-; CHECK: [[META14:![0-9]+]] = !{ptr @kernel4d2, !"kernel", i32 1}
-; CHECK: [[META15:![0-9]+]] = !{ptr @kernel4c3, !"kernel", i32 1}
-; CHECK: [[META16:![0-9]+]] = !{ptr @kernel4d3, !"kernel", i32 1}
-; CHECK: [[META17:![0-9]+]] = !{ptr @kernel_unknown_and_aligned1, !"kernel", i32 1}
-; CHECK: [[META18:![0-9]+]] = !{ptr @kernel_unknown_and_aligned2, !"kernel", i32 1}
-; CHECK: [[META19:![0-9]+]] = !{ptr @kernel_unknown_and_aligned3, !"kernel", i32 1}
-; CHECK: [[META20:![0-9]+]] = !{ptr @kernel_unknown_and_not_aligned1, !"kernel", i32 1}
+; TUNIT: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; TUNIT: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; TUNIT: [[META2:![0-9]+]] = !{ptr @kernel, !"kernel", i32 1}
+; TUNIT: [[META3:![0-9]+]] = !{ptr @kernel2, !"kernel", i32 1}
+; TUNIT: [[META4:![0-9]+]] = !{ptr @kernel3, !"kernel", i32 1}
+; TUNIT: [[META5:![0-9]+]] = !{ptr @kernel4a1, !"kernel", i32 1}
+; TUNIT: [[META6:![0-9]+]] = !{ptr @kernel4b1, !"kernel", i32 1}
+; TUNIT: [[META7:![0-9]+]] = !{ptr @kernel4a2, !"kernel", i32 1}
+; TUNIT: [[META8:![0-9]+]] = !{ptr @kernel4b2, !"kernel", i32 1}
+; TUNIT: [[META9:![0-9]+]] = !{ptr @kernel4a3, !"kernel", i32 1}
+; TUNIT: [[META10:![0-9]+]] = !{ptr @kernel4b3, !"kernel", i32 1}
+; TUNIT: [[META11:![0-9]+]] = !{ptr @kernel4c1, !"kernel", i32 1}
+; TUNIT: [[META12:![0-9]+]] = !{ptr @kernel4d1, !"kernel", i32 1}
+; TUNIT: [[META13:![0-9]+]] = !{ptr @kernel4c2, !"kernel", i32 1}
+; TUNIT: [[META14:![0-9]+]] = !{ptr @kernel4d2, !"kernel", i32 1}
+; TUNIT: [[META15:![0-9]+]] = !{ptr @kernel4c3, !"kernel", i32 1}
+; TUNIT: [[META16:![0-9]+]] = !{ptr @kernel4d3, !"kernel", i32 1}
+; TUNIT: [[META17:![0-9]+]] = !{ptr @kernel_unknown_and_aligned1, !"kernel", i32 1}
+; TUNIT: [[META18:![0-9]+]] = !{ptr @kernel_unknown_and_aligned2, !"kernel", i32 1}
+; TUNIT: [[META19:![0-9]+]] = !{ptr @kernel_unknown_and_aligned3, !"kernel", i32 1}
+; TUNIT: [[META20:![0-9]+]] = !{ptr @kernel_unknown_and_not_aligned1, !"kernel", i32 1}
+;.
+; CGSCC: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; CGSCC: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; CGSCC: [[META2:![0-9]+]] = !{ptr @kernel, !"kernel", i32 1}
+; CGSCC: [[META3:![0-9]+]] = !{ptr @kernel2, !"kernel", i32 1}
+; CGSCC: [[META4:![0-9]+]] = !{ptr @kernel3, !"kernel", i32 1}
+; CGSCC: [[META5:![0-9]+]] = !{ptr @kernel4a1, !"kernel", i32 1}
+; CGSCC: [[META6:![0-9]+]] = !{ptr @kernel4b1, !"kernel", i32 1}
+; CGSCC: [[META7:![0-9]+]] = !{ptr @kernel4a2, !"kernel", i32 1}
+; CGSCC: [[META8:![0-9]+]] = !{ptr @kernel4b2, !"kernel", i32 1}
+; CGSCC: [[META9:![0-9]+]] = !{ptr @kernel4a3, !"kernel", i32 1}
+; CGSCC: [[META10:![0-9]+]] = !{ptr @kernel4b3, !"kernel", i32 1}
+; CGSCC: [[META11:![0-9]+]] = !{ptr @kernel4c1, !"kernel", i32 1}
+; CGSCC: [[META12:![0-9]+]] = !{ptr @kernel4d1, !"kernel", i32 1}
+; CGSCC: [[META13:![0-9]+]] = !{ptr @kernel4c2, !"kernel", i32 1}
+; CGSCC: [[META14:![0-9]+]] = !{ptr @kernel4d2, !"kernel", i32 1}
+; CGSCC: [[META15:![0-9]+]] = !{ptr @kernel4c3, !"kernel", i32 1}
+; CGSCC: [[META16:![0-9]+]] = !{ptr @kernel4d3, !"kernel", i32 1}
+; CGSCC: [[META17:![0-9]+]] = !{ptr @kernel_unknown_and_aligned1, !"kernel", i32 1}
+; CGSCC: [[META18:![0-9]+]] = !{ptr @kernel_unknown_and_aligned2, !"kernel", i32 1}
+; CGSCC: [[META19:![0-9]+]] = !{ptr @kernel_unknown_and_aligned3, !"kernel", i32 1}
+; CGSCC: [[META20:![0-9]+]] = !{ptr @kernel_unknown_and_not_aligned1, !"kernel", i32 1}
 ;.
diff --git a/llvm/test/Transforms/PGOProfile/chr.ll b/llvm/test/Transforms/PGOProfile/chr.ll
index 0551a171091c..38e8f8536a19 100644
--- a/llvm/test/Transforms/PGOProfile/chr.ll
+++ b/llvm/test/Transforms/PGOProfile/chr.ll
@@ -1298,11 +1298,12 @@ define i32 @test_chr_14(ptr %i, ptr %j, i32 %sum0, i1 %pred, i32 %z) !prof !14 {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[Z_FR:%.*]] = freeze i32 [[Z:%.*]]
 ; CHECK-NEXT:    [[I0:%.*]] = load i32, ptr [[I:%.*]], align 4
-; CHECK-NEXT:    [[V1:%.*]] = icmp eq i32 [[Z_FR]], 1
-; CHECK-NEXT:    br i1 [[V1]], label [[BB1:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF15]]
+; CHECK-NEXT:    [[V1_NOT:%.*]] = icmp eq i32 [[Z_FR]], 1
+; CHECK-NEXT:    br i1 [[V1_NOT]], label [[BB1:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF15]]
 ; CHECK:       entry.split.nonchr:
+; CHECK-NEXT:    [[PRED_FR:%.*]] = freeze i1 [[PRED:%.*]]
 ; CHECK-NEXT:    [[V0:%.*]] = icmp eq i32 [[Z_FR]], 0
-; CHECK-NEXT:    [[V3_NONCHR:%.*]] = and i1 [[V0]], [[PRED:%.*]]
+; CHECK-NEXT:    [[V3_NONCHR:%.*]] = and i1 [[V0]], [[PRED_FR]]
 ; CHECK-NEXT:    br i1 [[V3_NONCHR]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]]
 ; CHECK:       bb0.nonchr:
 ; CHECK-NEXT:    call void @foo()
diff --git a/llvm/test/Transforms/PGOProfile/ctx-instrumentation-invalid-roots.ll b/llvm/test/Transforms/PGOProfile/ctx-instrumentation-invalid-roots.ll
new file mode 100644
index 000000000000..99c7762a67df
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/ctx-instrumentation-invalid-roots.ll
@@ -0,0 +1,17 @@
+; RUN: not opt -passes=pgo-instr-gen,ctx-instr-lower -profile-context-root=good \
+; RUN:   -profile-context-root=bad \
+; RUN:   -S < %s 2>&1 | FileCheck %s
+
+declare void @foo()
+
+define void @good() {
+  call void @foo()
+  ret void
+}
+
+define void @bad() {
+  musttail call void @foo()
+  ret void
+}
+
+; CHECK: error: The function bad was indicated as a context root, but it features musttail calls, which is not supported.
diff --git a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
index 2ad95ab51cc6..56c7c7519f69 100644
--- a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
+++ b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
@@ -1,11 +1,31 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
 ; RUN: opt -passes=pgo-instr-gen -profile-context-root=an_entrypoint \
 ; RUN:   -S < %s | FileCheck --check-prefix=INSTRUMENT %s
+; RUN: opt -passes=pgo-instr-gen,ctx-instr-lower -profile-context-root=an_entrypoint \
+; RUN:   -profile-context-root=another_entrypoint_no_callees \
+; RUN:   -S < %s | FileCheck --check-prefix=LOWERING %s
+
 
 declare void @bar()
 
 ;.
 ; INSTRUMENT: @__profn_foo = private constant [3 x i8] c"foo"
+; INSTRUMENT: @__profn_an_entrypoint = private constant [13 x i8] c"an_entrypoint"
+; INSTRUMENT: @__profn_another_entrypoint_no_callees = private constant [29 x i8] c"another_entrypoint_no_callees"
+; INSTRUMENT: @__profn_simple = private constant [6 x i8] c"simple"
+; INSTRUMENT: @__profn_no_callsites = private constant [12 x i8] c"no_callsites"
+; INSTRUMENT: @__profn_no_counters = private constant [11 x i8] c"no_counters"
+;.
+; LOWERING: @__profn_foo = private constant [3 x i8] c"foo"
+; LOWERING: @__profn_an_entrypoint = private constant [13 x i8] c"an_entrypoint"
+; LOWERING: @__profn_another_entrypoint_no_callees = private constant [29 x i8] c"another_entrypoint_no_callees"
+; LOWERING: @__profn_simple = private constant [6 x i8] c"simple"
+; LOWERING: @__profn_no_callsites = private constant [12 x i8] c"no_callsites"
+; LOWERING: @__profn_no_counters = private constant [11 x i8] c"no_counters"
+; LOWERING: @an_entrypoint_ctx_root = global { ptr, ptr, ptr, i8 } zeroinitializer
+; LOWERING: @another_entrypoint_no_callees_ctx_root = global { ptr, ptr, ptr, i8 } zeroinitializer
+; LOWERING: @__llvm_ctx_profile_callsite = external hidden thread_local global ptr
+; LOWERING: @__llvm_ctx_profile_expected_callee = external hidden thread_local global ptr
 ;.
 define void @foo(i32 %a, ptr %fct) {
 ; INSTRUMENT-LABEL: define void @foo(
@@ -25,6 +45,38 @@ define void @foo(i32 %a, ptr %fct) {
 ; INSTRUMENT:       exit:
 ; INSTRUMENT-NEXT:    ret void
 ;
+; LOWERING-LABEL: define void @foo(
+; LOWERING-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) {
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @foo, i64 6699318081062747564, i32 2, i32 2)
+; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], 1
+; LOWERING-NEXT:    [[TMP4:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_expected_callee)
+; LOWERING-NEXT:    [[TMP5:%.*]] = getelementptr ptr, ptr [[TMP4]], i64 [[TMP3]]
+; LOWERING-NEXT:    [[TMP6:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_callsite)
+; LOWERING-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i64 [[TMP3]]
+; LOWERING-NEXT:    [[TMP8:%.*]] = and i64 [[TMP2]], -2
+; LOWERING-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; LOWERING-NEXT:    [[T:%.*]] = icmp eq i32 [[A]], 0
+; LOWERING-NEXT:    br i1 [[T]], label [[YES:%.*]], label [[NO:%.*]]
+; LOWERING:       yes:
+; LOWERING-NEXT:    [[TMP10:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [2 x i64], [2 x ptr] }, ptr [[TMP9]], i32 0, i32 1, i32 1
+; LOWERING-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP10]], align 4
+; LOWERING-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 1
+; LOWERING-NEXT:    store i64 [[TMP12]], ptr [[TMP10]], align 4
+; LOWERING-NEXT:    store volatile ptr [[FCT]], ptr [[TMP5]], align 8
+; LOWERING-NEXT:    [[TMP13:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [2 x i64], [2 x ptr] }, ptr [[TMP1]], i32 0, i32 2, i32 0
+; LOWERING-NEXT:    store volatile ptr [[TMP13]], ptr [[TMP7]], align 8
+; LOWERING-NEXT:    call void [[FCT]](i32 [[A]])
+; LOWERING-NEXT:    br label [[EXIT:%.*]]
+; LOWERING:       no:
+; LOWERING-NEXT:    store volatile ptr @bar, ptr [[TMP5]], align 8
+; LOWERING-NEXT:    [[TMP14:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [2 x i64], [2 x ptr] }, ptr [[TMP1]], i32 0, i32 2, i32 1
+; LOWERING-NEXT:    store volatile ptr [[TMP14]], ptr [[TMP7]], align 8
+; LOWERING-NEXT:    call void @bar()
+; LOWERING-NEXT:    br label [[EXIT]]
+; LOWERING:       exit:
+; LOWERING-NEXT:    ret void
+;
   %t = icmp eq i32 %a, 0
   br i1 %t, label %yes, label %no
 yes:
@@ -36,6 +88,183 @@ no:
 exit:
   ret void
 }
+
+define void @an_entrypoint(i32 %a) {
+; INSTRUMENT-LABEL: define void @an_entrypoint(
+; INSTRUMENT-SAME: i32 [[A:%.*]]) {
+; INSTRUMENT-NEXT:    call void @llvm.instrprof.increment(ptr @__profn_an_entrypoint, i64 784007058953177093, i32 2, i32 0)
+; INSTRUMENT-NEXT:    [[T:%.*]] = icmp eq i32 [[A]], 0
+; INSTRUMENT-NEXT:    br i1 [[T]], label [[YES:%.*]], label [[NO:%.*]]
+; INSTRUMENT:       yes:
+; INSTRUMENT-NEXT:    call void @llvm.instrprof.increment(ptr @__profn_an_entrypoint, i64 784007058953177093, i32 2, i32 1)
+; INSTRUMENT-NEXT:    call void @llvm.instrprof.callsite(ptr @__profn_an_entrypoint, i64 784007058953177093, i32 1, i32 0, ptr @foo)
+; INSTRUMENT-NEXT:    call void @foo(i32 1, ptr null)
+; INSTRUMENT-NEXT:    ret void
+; INSTRUMENT:       no:
+; INSTRUMENT-NEXT:    ret void
+;
+; LOWERING-LABEL: define void @an_entrypoint(
+; LOWERING-SAME: i32 [[A:%.*]]) {
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_start_context(ptr @an_entrypoint_ctx_root, i64 4909520559318251808, i32 2, i32 1)
+; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], 1
+; LOWERING-NEXT:    [[TMP4:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_expected_callee)
+; LOWERING-NEXT:    [[TMP5:%.*]] = getelementptr ptr, ptr [[TMP4]], i64 [[TMP3]]
+; LOWERING-NEXT:    [[TMP6:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_callsite)
+; LOWERING-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i64 [[TMP3]]
+; LOWERING-NEXT:    [[TMP8:%.*]] = and i64 [[TMP2]], -2
+; LOWERING-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; LOWERING-NEXT:    [[T:%.*]] = icmp eq i32 [[A]], 0
+; LOWERING-NEXT:    br i1 [[T]], label [[YES:%.*]], label [[NO:%.*]]
+; LOWERING:       yes:
+; LOWERING-NEXT:    [[TMP10:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [2 x i64], [1 x ptr] }, ptr [[TMP9]], i32 0, i32 1, i32 1
+; LOWERING-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP10]], align 4
+; LOWERING-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 1
+; LOWERING-NEXT:    store i64 [[TMP12]], ptr [[TMP10]], align 4
+; LOWERING-NEXT:    store volatile ptr @foo, ptr [[TMP5]], align 8
+; LOWERING-NEXT:    [[TMP13:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [2 x i64], [1 x ptr] }, ptr [[TMP1]], i32 0, i32 2, i32 0
+; LOWERING-NEXT:    store volatile ptr [[TMP13]], ptr [[TMP7]], align 8
+; LOWERING-NEXT:    call void @foo(i32 1, ptr null)
+; LOWERING-NEXT:    call void @__llvm_ctx_profile_release_context(ptr @an_entrypoint_ctx_root)
+; LOWERING-NEXT:    ret void
+; LOWERING:       no:
+; LOWERING-NEXT:    call void @__llvm_ctx_profile_release_context(ptr @an_entrypoint_ctx_root)
+; LOWERING-NEXT:    ret void
+;
+  %t = icmp eq i32 %a, 0
+  br i1 %t, label %yes, label %no
+
+yes:
+  call void @foo(i32 1, ptr null)
+  ret void
+no:
+  ret void
+}
+
+define void @another_entrypoint_no_callees(i32 %a) {
+; INSTRUMENT-LABEL: define void @another_entrypoint_no_callees(
+; INSTRUMENT-SAME: i32 [[A:%.*]]) {
+; INSTRUMENT-NEXT:    call void @llvm.instrprof.increment(ptr @__profn_another_entrypoint_no_callees, i64 784007058953177093, i32 2, i32 0)
+; INSTRUMENT-NEXT:    [[T:%.*]] = icmp eq i32 [[A]], 0
+; INSTRUMENT-NEXT:    br i1 [[T]], label [[YES:%.*]], label [[NO:%.*]]
+; INSTRUMENT:       yes:
+; INSTRUMENT-NEXT:    call void @llvm.instrprof.increment(ptr @__profn_another_entrypoint_no_callees, i64 784007058953177093, i32 2, i32 1)
+; INSTRUMENT-NEXT:    ret void
+; INSTRUMENT:       no:
+; INSTRUMENT-NEXT:    ret void
+;
+; LOWERING-LABEL: define void @another_entrypoint_no_callees(
+; LOWERING-SAME: i32 [[A:%.*]]) {
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_start_context(ptr @another_entrypoint_no_callees_ctx_root, i64 -6371873725078000974, i32 2, i32 0)
+; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], -2
+; LOWERING-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; LOWERING-NEXT:    [[T:%.*]] = icmp eq i32 [[A]], 0
+; LOWERING-NEXT:    br i1 [[T]], label [[YES:%.*]], label [[NO:%.*]]
+; LOWERING:       yes:
+; LOWERING-NEXT:    [[TMP5:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [2 x i64], [0 x ptr] }, ptr [[TMP4]], i32 0, i32 1, i32 1
+; LOWERING-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 4
+; LOWERING-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 1
+; LOWERING-NEXT:    store i64 [[TMP7]], ptr [[TMP5]], align 4
+; LOWERING-NEXT:    call void @__llvm_ctx_profile_release_context(ptr @another_entrypoint_no_callees_ctx_root)
+; LOWERING-NEXT:    ret void
+; LOWERING:       no:
+; LOWERING-NEXT:    call void @__llvm_ctx_profile_release_context(ptr @another_entrypoint_no_callees_ctx_root)
+; LOWERING-NEXT:    ret void
+;
+  %t = icmp eq i32 %a, 0
+  br i1 %t, label %yes, label %no
+
+yes:
+  ret void
+no:
+  ret void
+}
+
+define void @simple(i32 %a) {
+; INSTRUMENT-LABEL: define void @simple(
+; INSTRUMENT-SAME: i32 [[A:%.*]]) {
+; INSTRUMENT-NEXT:    call void @llvm.instrprof.increment(ptr @__profn_simple, i64 742261418966908927, i32 1, i32 0)
+; INSTRUMENT-NEXT:    ret void
+;
+; LOWERING-LABEL: define void @simple(
+; LOWERING-SAME: i32 [[A:%.*]]) {
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @simple, i64 -3006003237940970099, i32 1, i32 0)
+; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], -2
+; LOWERING-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; LOWERING-NEXT:    ret void
+;
+  ret void
+}
+
+
+define i32 @no_callsites(i32 %a) {
+; INSTRUMENT-LABEL: define i32 @no_callsites(
+; INSTRUMENT-SAME: i32 [[A:%.*]]) {
+; INSTRUMENT-NEXT:    call void @llvm.instrprof.increment(ptr @__profn_no_callsites, i64 784007058953177093, i32 2, i32 0)
+; INSTRUMENT-NEXT:    [[C:%.*]] = icmp eq i32 [[A]], 0
+; INSTRUMENT-NEXT:    br i1 [[C]], label [[YES:%.*]], label [[NO:%.*]]
+; INSTRUMENT:       yes:
+; INSTRUMENT-NEXT:    call void @llvm.instrprof.increment(ptr @__profn_no_callsites, i64 784007058953177093, i32 2, i32 1)
+; INSTRUMENT-NEXT:    ret i32 1
+; INSTRUMENT:       no:
+; INSTRUMENT-NEXT:    ret i32 0
+;
+; LOWERING-LABEL: define i32 @no_callsites(
+; LOWERING-SAME: i32 [[A:%.*]]) {
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @no_callsites, i64 5679753335911435902, i32 2, i32 0)
+; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], -2
+; LOWERING-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; LOWERING-NEXT:    [[C:%.*]] = icmp eq i32 [[A]], 0
+; LOWERING-NEXT:    br i1 [[C]], label [[YES:%.*]], label [[NO:%.*]]
+; LOWERING:       yes:
+; LOWERING-NEXT:    [[TMP5:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [2 x i64], [0 x ptr] }, ptr [[TMP4]], i32 0, i32 1, i32 1
+; LOWERING-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 4
+; LOWERING-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 1
+; LOWERING-NEXT:    store i64 [[TMP7]], ptr [[TMP5]], align 4
+; LOWERING-NEXT:    ret i32 1
+; LOWERING:       no:
+; LOWERING-NEXT:    ret i32 0
+;
+  %c = icmp eq i32 %a, 0
+  br i1 %c, label %yes, label %no
+yes:
+  ret i32 1
+no:
+  ret i32 0
+}
+
+define void @no_counters() {
+; INSTRUMENT-LABEL: define void @no_counters() {
+; INSTRUMENT-NEXT:    call void @llvm.instrprof.increment(ptr @__profn_no_counters, i64 742261418966908927, i32 1, i32 0)
+; INSTRUMENT-NEXT:    call void @llvm.instrprof.callsite(ptr @__profn_no_counters, i64 742261418966908927, i32 1, i32 0, ptr @bar)
+; INSTRUMENT-NEXT:    call void @bar()
+; INSTRUMENT-NEXT:    ret void
+;
+; LOWERING-LABEL: define void @no_counters() {
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @no_counters, i64 5458232184388660970, i32 1, i32 1)
+; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], 1
+; LOWERING-NEXT:    [[TMP4:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_expected_callee)
+; LOWERING-NEXT:    [[TMP5:%.*]] = getelementptr ptr, ptr [[TMP4]], i64 [[TMP3]]
+; LOWERING-NEXT:    [[TMP6:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_callsite)
+; LOWERING-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i64 [[TMP3]]
+; LOWERING-NEXT:    [[TMP8:%.*]] = and i64 [[TMP2]], -2
+; LOWERING-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; LOWERING-NEXT:    store volatile ptr @bar, ptr [[TMP5]], align 8
+; LOWERING-NEXT:    [[TMP10:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [1 x i64], [1 x ptr] }, ptr [[TMP1]], i32 0, i32 2, i32 0
+; LOWERING-NEXT:    store volatile ptr [[TMP10]], ptr [[TMP7]], align 8
+; LOWERING-NEXT:    call void @bar()
+; LOWERING-NEXT:    ret void
+;
+  call void @bar()
+  ret void
+}
 ;.
 ; INSTRUMENT: attributes #[[ATTR0:[0-9]+]] = { nounwind }
 ;.
+; LOWERING: attributes #[[ATTR0:[0-9]+]] = { nounwind }
+; LOWERING: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
diff --git a/llvm/test/Transforms/PGOProfile/memprof_match_hot_cold_new_calls.ll b/llvm/test/Transforms/PGOProfile/memprof_match_hot_cold_new_calls.ll
new file mode 100644
index 000000000000..2fac931e1568
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/memprof_match_hot_cold_new_calls.ll
@@ -0,0 +1,63 @@
+;; Tests optional matching of memprof profile on call to operator new
+;; with manual hot/cold hint.
+
+;; Avoid failures on big-endian systems that can't read the profile properly
+; REQUIRES: x86_64-linux
+
+;; TODO: Use text profile inputs once that is available for memprof.
+;; This test uses the same raw profile used for memprof.ll, see instructions
+;; in that file for updating.
+
+;; Generate indexed profile
+; RUN: llvm-profdata merge %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdata
+
+;; By default we should not match profile on to manually hinted operator
+;; new calls, because we don't currently override the manual hints anyway.
+; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -S 2>&1 | FileCheck %s --implicit-check-not !memprof --implicit-check-not !callsite
+
+;; Check that we match profiles onto these manually hinted new calls
+;; under the -memprof-match-hot-cold-new=true option.
+; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -S -memprof-match-hot-cold-new=true 2>&1 | FileCheck %s --check-prefixes=MEMPROF
+
+; ModuleID = 'memprof_match_hot_cold_new_calls.cc'
+source_filename = "memprof_match_hot_cold_new_calls.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%"struct.std::nothrow_t" = type { i8 }
+
+@_ZSt7nothrow = external global %"struct.std::nothrow_t", align 1
+
+define dso_local noundef ptr @_Z3foov() !dbg !10 {
+entry:
+  ; MEMPROF: call {{.*}} @_Znwm{{.*}} !memprof ![[M1:[0-9]+]], !callsite ![[C1:[0-9]+]]
+  %call = call noalias noundef align 32 ptr @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 noundef 32, i64 noundef 32, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow, i8 noundef zeroext 0), !dbg !13
+  ret ptr %call
+}
+
+declare noundef ptr @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 noundef, i64 noundef, ptr noundef nonnull align 1 dereferenceable(1), i8 noundef zeroext)
+
+; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]], ![[MIB3:[0-9]+]], ![[MIB4:[0-9]+]], ![[MIB5:[0-9]+]]}
+; MEMPROF: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"cold"}
+; MEMPROF: ![[STACK1]] = !{i64 2732490490862098848, i64 748269490701775343}
+; MEMPROF: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"cold"}
+; MEMPROF: ![[STACK2]] = !{i64 2732490490862098848, i64 2104812325165620841, i64 6281715513834610934, i64 6281715513834610934, i64 6281715513834610934, i64 1544787832369987002}
+; MEMPROF: ![[MIB3]] = !{![[STACK3:[0-9]+]], !"notcold"}
+; MEMPROF: ![[STACK3]] = !{i64 2732490490862098848, i64 2104812325165620841, i64 6281715513834610934, i64 6281715513834610934, i64 6281715513834610934, i64 6281715513834610934}
+; MEMPROF: ![[MIB4]] = !{![[STACK4:[0-9]+]], !"cold"}
+; MEMPROF: ![[STACK4]] = !{i64 2732490490862098848, i64 8467819354083268568}
+; MEMPROF: ![[MIB5]] = !{![[STACK5:[0-9]+]], !"notcold"}
+; MEMPROF: ![[STACK5]] = !{i64 2732490490862098848, i64 8690657650969109624}
+; MEMPROF: ![[C1]] = !{i64 2732490490862098848}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git 6cbe6284d1f0a088b5c6482ae27b738f03d82fe7)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "memprof.cc", directory: "/usr/local/google/home/tejohnson/llvm/tmp", checksumkind: CSK_MD5, checksum: "e8c40ebe4b21776b4d60e9632cbc13c2")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !11, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12)
+!11 = !DISubroutineType(types: !12)
+!12 = !{}
+!13 = !DILocation(line: 5, column: 10, scope: !10)
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
index b32f4e2a258c..6c45442bdcd3 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
@@ -101,9 +101,7 @@ define void @cse_matching_load_from_previous_unrolled_iteration(i32 %N, ptr %src
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or disjoint i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[GEP_SRC_12_1:%.*]] = getelementptr <2 x i32>, ptr [[SRC_12]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    [[L_12_1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_12_1]], align 8
-; CHECK-NEXT:    [[GEP_SRC_4_1:%.*]] = getelementptr <2 x i32>, ptr [[SRC_4]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    [[L_4_1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_4_1]], align 8
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul <2 x i32> [[L_4_1]], [[L_12_1]]
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul <2 x i32> [[L_12]], [[L_12_1]]
 ; CHECK-NEXT:    [[GEP_DST_1:%.*]] = getelementptr <2 x i32>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    store <2 x i32> [[MUL_1]], ptr [[GEP_DST_1]], align 8
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
index f1d7c0e0c412..c085e10c049a 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll
@@ -22,19 +22,13 @@ define void @add4(ptr noalias noundef %x, ptr noalias noundef %y, i32 noundef %n
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC24:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
-; CHECK-NEXT:    [[TMP2:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP4:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
-; CHECK-NEXT:    [[TMP6:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> [[TMP3]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> [[TMP8]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
+; CHECK-NEXT:    [[TMP2:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP2]]
 ; CHECK-NEXT:    store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP9]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -412,22 +406,13 @@ define void @addmul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z, i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP3]], align 2
-; CHECK-NEXT:    [[TMP4:%.*]] = add <32 x i16> [[TMP2]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP6:%.*]] = add <32 x i16> [[TMP5]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP7:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <32 x i16> [[TMP7]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
-; CHECK-NEXT:    [[TMP10:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[TMP11:%.*]] = add <32 x i16> [[TMP10]], [[WIDE_VEC36]]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i16> [[TMP8]], <32 x i16> [[TMP11]], <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP12]], <16 x i16> [[TMP13]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = add <32 x i16> [[TMP2]], [[WIDE_VEC36]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP4]]
 ; CHECK-NEXT:    store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP14]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/pr32544.ll b/llvm/test/Transforms/PhaseOrdering/pr32544.ll
index 186954fd22aa..421260b10231 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr32544.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr32544.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
 
 define void @foo(i1 %which, i32 %a, i32 %b, ptr %result) {
 ; CHECK-LABEL: @foo(
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/load-relative.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/load-relative.ll
index 805d61607a7e..c157cb56ff0e 100644
--- a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/load-relative.ll
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/load-relative.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=x86_64-pc-linux-gnu -pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-pc-linux-gnu -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
 
 ; CHECK: define ptr @foo32(ptr [[P:%.*]], i32 [[O:%.*]])
 define ptr @foo32(ptr %p, i32 %o) {
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/objc-arc.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/objc-arc.ll
index adeaef5adccf..37b07bb99ec2 100644
--- a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/objc-arc.ll
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/objc-arc.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=x86_64-pc-linux-gnu -pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-pc-linux-gnu -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
 
 ; Make sure calls to the objc intrinsics are translated to calls in to the
 ; runtime
diff --git a/llvm/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll b/llvm/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll
index faabd8d7815b..6dc7b89a9b18 100644
--- a/llvm/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll
+++ b/llvm/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll
@@ -1,25 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=reassociate,instcombine -S | FileCheck %s
 
 ; Not marked as fast, so must not change.
 define float @test1(float %a0, float %a1, float %a2, float %a3, float %a4) {
-; CHECK-LABEL: test1
-; CHECK-NEXT: %tmp.2 = fadd float %a3, %a4
-; CHECK-NEXT: %tmp.4 = fadd float %tmp.2, %a2
-; CHECK-NEXT: %tmp.6 = fadd float %tmp.4, %a1
-; CHECK-NEXT: %tmp.8 = fadd float %tmp.6, %a0
-; CHECK-NEXT: %tmp.11 = fadd float %a2, %a3
-; CHECK-NEXT: %tmp.13 = fadd float %tmp.11, %a1
-; CHECK-NEXT: %tmp.15 = fadd float %tmp.13, %a0
-; CHECK-NEXT: %tmp.18 = fadd float %a1, %a2
-; CHECK-NEXT: %tmp.20 = fadd float %tmp.18, %a0
-; CHECK-NEXT: %tmp.23 = fadd float %a0, %a1
-; CHECK-NEXT: %tmp.26 = fsub float %tmp.8, %tmp.15
-; CHECK-NEXT: %tmp.28 = fadd float %tmp.20, %tmp.26
-; CHECK-NEXT: %tmp.30 = fsub float %tmp.28, %tmp.23
-; CHECK-NEXT: %tmp.32 = fsub float %tmp.30, %a4
-; CHECK-NEXT: %tmp.34 = fsub float %tmp.32, %a2
-; CHECK-NEXT: %T = fmul float %tmp.34, %tmp.34
-; CHECK-NEXT: ret float %T
+; CHECK-LABEL: define float @test1(
+; CHECK-SAME: float [[A0:%.*]], float [[A1:%.*]], float [[A2:%.*]], float [[A3:%.*]], float [[A4:%.*]]) {
+; CHECK-NEXT:    [[TMP_2:%.*]] = fadd float [[A3]], [[A4]]
+; CHECK-NEXT:    [[TMP_4:%.*]] = fadd float [[TMP_2]], [[A2]]
+; CHECK-NEXT:    [[TMP_6:%.*]] = fadd float [[TMP_4]], [[A1]]
+; CHECK-NEXT:    [[TMP_8:%.*]] = fadd float [[TMP_6]], [[A0]]
+; CHECK-NEXT:    [[TMP_11:%.*]] = fadd float [[A2]], [[A3]]
+; CHECK-NEXT:    [[TMP_13:%.*]] = fadd float [[TMP_11]], [[A1]]
+; CHECK-NEXT:    [[TMP_15:%.*]] = fadd float [[TMP_13]], [[A0]]
+; CHECK-NEXT:    [[TMP_18:%.*]] = fadd float [[A1]], [[A2]]
+; CHECK-NEXT:    [[TMP_20:%.*]] = fadd float [[TMP_18]], [[A0]]
+; CHECK-NEXT:    [[TMP_23:%.*]] = fadd float [[A0]], [[A1]]
+; CHECK-NEXT:    [[TMP_26:%.*]] = fsub float [[TMP_8]], [[TMP_15]]
+; CHECK-NEXT:    [[TMP_28:%.*]] = fadd float [[TMP_20]], [[TMP_26]]
+; CHECK-NEXT:    [[TMP_30:%.*]] = fsub float [[TMP_28]], [[TMP_23]]
+; CHECK-NEXT:    [[TMP_32:%.*]] = fsub float [[TMP_30]], [[A4]]
+; CHECK-NEXT:    [[TMP_34:%.*]] = fsub float [[TMP_32]], [[A2]]
+; CHECK-NEXT:    [[T:%.*]] = fmul float [[TMP_34]], [[TMP_34]]
+; CHECK-NEXT:    ret float [[T]]
+;
 
   %tmp.2 = fadd float %a4, %a3
   %tmp.4 = fadd float %tmp.2, %a2
@@ -42,8 +45,10 @@ define float @test1(float %a0, float %a1, float %a2, float %a3, float %a4) {
 
 ; Should be able to eliminate everything.
 define float @test2(float %a0, float %a1, float %a2, float %a3, float %a4) {
-; CHECK-LABEL: test2
-; CHECK: ret float 0.000000e+00
+; CHECK-LABEL: define float @test2(
+; CHECK-SAME: float [[A0:%.*]], float [[A1:%.*]], float [[A2:%.*]], float [[A3:%.*]], float [[A4:%.*]]) {
+; CHECK-NEXT:    ret float 0.000000e+00
+;
 
   %tmp.2 = fadd fast float %a4, %a3
   %tmp.4 = fadd fast float %tmp.2, %a2
diff --git a/llvm/test/Transforms/Reassociate/reassoc_bool.ll b/llvm/test/Transforms/Reassociate/reassoc_bool.ll
new file mode 100644
index 000000000000..935a1e8c31a0
--- /dev/null
+++ b/llvm/test/Transforms/Reassociate/reassoc_bool.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=reassociate -S | FileCheck %s
+
+define i1 @scalar(i1 %b0, i1 %b1, i1 %b2, i1 %b3, i1 %b4, i1 %b5, i1 %b6, i1 %b7) {
+; CHECK-LABEL: define i1 @scalar(
+; CHECK-SAME: i1 [[B0:%.*]], i1 [[B1:%.*]], i1 [[B2:%.*]], i1 [[B3:%.*]], i1 [[B4:%.*]], i1 [[B5:%.*]], i1 [[B6:%.*]], i1 [[B7:%.*]]) {
+; CHECK-NEXT:    [[OR01:%.*]] = or i1 [[B0]], [[B1]]
+; CHECK-NEXT:    [[OR23:%.*]] = or i1 [[B2]], [[B3]]
+; CHECK-NEXT:    [[OR45:%.*]] = or i1 [[B4]], [[B5]]
+; CHECK-NEXT:    [[OR67:%.*]] = or i1 [[B6]], [[B7]]
+; CHECK-NEXT:    [[OR0123:%.*]] = or i1 [[OR01]], [[OR23]]
+; CHECK-NEXT:    [[OR4567:%.*]] = or i1 [[OR45]], [[OR67]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = or i1 [[OR0123]], [[OR4567]]
+; CHECK-NEXT:    ret i1 [[OR01234567]]
+;
+  %or01 = or i1 %b0, %b1
+  %or23 = or i1 %b2, %b3
+  %or45 = or i1 %b4, %b5
+  %or67 = or i1 %b6, %b7
+  %or0123 = or i1 %or01, %or23
+  %or4567 = or i1 %or45, %or67
+  %or01234567 = or i1 %or0123, %or4567
+  ret i1 %or01234567
+}
+
+
+define i1 @scalar1(i1 %a, i1 %b0, i1 %b1, i1 %b2, i1 %b3, i1 %b4, i1 %b5, i1 %b6, i1 %b7) {
+; CHECK-LABEL: define i1 @scalar1(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B0:%.*]], i1 [[B1:%.*]], i1 [[B2:%.*]], i1 [[B3:%.*]], i1 [[B4:%.*]], i1 [[B5:%.*]], i1 [[B6:%.*]], i1 [[B7:%.*]]) {
+; CHECK-NEXT:    [[OR0:%.*]] = or i1 [[A]], [[B0]]
+; CHECK-NEXT:    [[OR1:%.*]] = or i1 [[A]], [[B1]]
+; CHECK-NEXT:    [[OR2:%.*]] = or i1 [[A]], [[B2]]
+; CHECK-NEXT:    [[OR3:%.*]] = or i1 [[A]], [[B3]]
+; CHECK-NEXT:    [[OR4:%.*]] = or i1 [[A]], [[B4]]
+; CHECK-NEXT:    [[OR5:%.*]] = or i1 [[A]], [[B5]]
+; CHECK-NEXT:    [[OR6:%.*]] = or i1 [[A]], [[B6]]
+; CHECK-NEXT:    [[OR7:%.*]] = or i1 [[A]], [[B7]]
+; CHECK-NEXT:    [[XOR0:%.*]] = xor i1 [[OR0]], [[OR1]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i1 [[XOR0]], [[OR2]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i1 [[XOR1]], [[OR3]]
+; CHECK-NEXT:    [[XOR3:%.*]] = xor i1 [[XOR2]], [[OR4]]
+; CHECK-NEXT:    [[XOR4:%.*]] = xor i1 [[XOR3]], [[OR5]]
+; CHECK-NEXT:    [[XOR5:%.*]] = xor i1 [[XOR4]], [[OR6]]
+; CHECK-NEXT:    [[XOR6:%.*]] = xor i1 [[XOR5]], [[OR7]]
+; CHECK-NEXT:    [[OR001:%.*]] = or i1 [[XOR0]], [[XOR1]]
+; CHECK-NEXT:    [[OR023:%.*]] = or i1 [[XOR2]], [[XOR3]]
+; CHECK-NEXT:    [[OR045:%.*]] = or i1 [[XOR4]], [[XOR5]]
+; CHECK-NEXT:    [[OR060:%.*]] = or i1 [[XOR0]], [[XOR6]]
+; CHECK-NEXT:    [[OR0123:%.*]] = or i1 [[OR001]], [[OR023]]
+; CHECK-NEXT:    [[OR4560:%.*]] = or i1 [[OR045]], [[OR060]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = or i1 [[OR0123]], [[OR4560]]
+; CHECK-NEXT:    ret i1 [[OR01234567]]
+;
+  %or0 = or i1 %b0, %a
+  %or1 = or i1 %b1, %a
+  %or2 = or i1 %b2, %a
+  %or3 = or i1 %b3, %a
+  %or4 = or i1 %b4, %a
+  %or5 = or i1 %b5, %a
+  %or6 = or i1 %b6, %a
+  %or7 = or i1 %b7, %a
+  %xor0 = xor i1 %or0, %or1
+  %xor1 = xor i1 %xor0, %or2
+  %xor2 = xor i1 %xor1, %or3
+  %xor3 = xor i1 %xor2, %or4
+  %xor4 = xor i1 %xor3, %or5
+  %xor5 = xor i1 %xor4, %or6
+  %xor6 = xor i1 %xor5, %or7
+  %or001 = or i1 %xor0, %xor1
+  %or023 = or i1 %xor2, %xor3
+  %or045 = or i1 %xor4, %xor5
+  %or060 = or i1 %xor6, %xor0
+  %or0123 = or i1 %or001, %or023
+  %or4560 = or i1 %or045, %or060
+  %or01234567 = or i1 %or0123, %or4560
+  ret i1 %or01234567
+}
+
+define i1 @scalar2(i1 %a, i1 %b0, i1 %b1, i1 %b2, i1 %b3, i1 %b4, i1 %b5, i1 %b6, i1 %b7) {
+; CHECK-LABEL: define i1 @scalar2(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B0:%.*]], i1 [[B1:%.*]], i1 [[B2:%.*]], i1 [[B3:%.*]], i1 [[B4:%.*]], i1 [[B5:%.*]], i1 [[B6:%.*]], i1 [[B7:%.*]]) {
+; CHECK-NEXT:    [[OR0:%.*]] = or i1 [[A]], [[B0]]
+; CHECK-NEXT:    [[OR1:%.*]] = or i1 [[A]], [[B1]]
+; CHECK-NEXT:    [[OR2:%.*]] = or i1 [[A]], [[B2]]
+; CHECK-NEXT:    [[OR3:%.*]] = or i1 [[A]], [[B3]]
+; CHECK-NEXT:    [[OR4:%.*]] = or i1 [[A]], [[B4]]
+; CHECK-NEXT:    [[OR5:%.*]] = or i1 [[A]], [[B5]]
+; CHECK-NEXT:    [[OR6:%.*]] = or i1 [[A]], [[B6]]
+; CHECK-NEXT:    [[OR7:%.*]] = or i1 [[A]], [[B7]]
+; CHECK-NEXT:    [[XOR0:%.*]] = xor i1 [[OR0]], [[OR1]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i1 [[OR2]], [[OR3]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i1 [[OR4]], [[OR5]]
+; CHECK-NEXT:    [[XOR3:%.*]] = xor i1 [[OR6]], [[OR7]]
+; CHECK-NEXT:    [[OR01:%.*]] = xor i1 [[XOR0]], [[XOR1]]
+; CHECK-NEXT:    [[OR23:%.*]] = xor i1 [[XOR2]], [[XOR3]]
+; CHECK-NEXT:    [[OR0123:%.*]] = xor i1 [[OR01]], [[OR23]]
+; CHECK-NEXT:    ret i1 [[OR0123]]
+;
+  %or0 = or i1 %b0, %a
+  %or1 = or i1 %b1, %a
+  %or2 = or i1 %b2, %a
+  %or3 = or i1 %b3, %a
+  %or4 = or i1 %b4, %a
+  %or5 = or i1 %b5, %a
+  %or6 = or i1 %b6, %a
+  %or7 = or i1 %b7, %a
+  %xor0 = xor i1 %or0, %or1
+  %xor1 = xor i1 %or2, %or3
+  %xor2 = xor i1 %or4, %or5
+  %xor3 = xor i1 %or6, %or7
+  %or01 = xor i1 %xor0, %xor1
+  %or23 = xor i1 %xor2, %xor3
+  %or0123 = xor i1 %or01, %or23
+  ret i1 %or0123
+}
+
+define i1 @scalar3(i1 %a, i1 %b0, i1 %b1, i1 %b2, i1 %b3, i1 %b4, i1 %b5, i1 %b6, i1 %b7) {
+; CHECK-LABEL: define i1 @scalar3(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B0:%.*]], i1 [[B1:%.*]], i1 [[B2:%.*]], i1 [[B3:%.*]], i1 [[B4:%.*]], i1 [[B5:%.*]], i1 [[B6:%.*]], i1 [[B7:%.*]]) {
+; CHECK-NEXT:    [[XOR0:%.*]] = xor i1 [[A]], [[B0]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i1 [[A]], [[B1]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i1 [[A]], [[B2]]
+; CHECK-NEXT:    [[XOR3:%.*]] = xor i1 [[A]], [[B3]]
+; CHECK-NEXT:    [[XOR4:%.*]] = xor i1 [[A]], [[B4]]
+; CHECK-NEXT:    [[XOR5:%.*]] = xor i1 [[A]], [[B5]]
+; CHECK-NEXT:    [[XOR6:%.*]] = xor i1 [[A]], [[B6]]
+; CHECK-NEXT:    [[XOR7:%.*]] = xor i1 [[A]], [[B7]]
+; CHECK-NEXT:    [[AND0:%.*]] = and i1 [[XOR0]], [[XOR1]]
+; CHECK-NEXT:    [[AND1:%.*]] = and i1 [[XOR2]], [[XOR3]]
+; CHECK-NEXT:    [[AND2:%.*]] = and i1 [[XOR4]], [[XOR5]]
+; CHECK-NEXT:    [[AND3:%.*]] = and i1 [[XOR6]], [[XOR7]]
+; CHECK-NEXT:    [[OR01:%.*]] = and i1 [[AND0]], [[AND1]]
+; CHECK-NEXT:    [[OR23:%.*]] = and i1 [[AND2]], [[AND3]]
+; CHECK-NEXT:    [[OR0123:%.*]] = and i1 [[OR01]], [[OR23]]
+; CHECK-NEXT:    ret i1 [[OR0123]]
+;
+  %xor0 = xor i1 %b0, %a
+  %xor1 = xor i1 %b1, %a
+  %xor2 = xor i1 %b2, %a
+  %xor3 = xor i1 %b3, %a
+  %xor4 = xor i1 %b4, %a
+  %xor5 = xor i1 %b5, %a
+  %xor6 = xor i1 %b6, %a
+  %xor7 = xor i1 %b7, %a
+  %and0 = and i1 %xor0, %xor1
+  %and1 = and i1 %xor2, %xor3
+  %and2 = and i1 %xor4, %xor5
+  %and3 = and i1 %xor6, %xor7
+  %or01 = and i1 %and0, %and1
+  %or23 = and i1 %and2, %and3
+  %or0123 = and i1 %or01, %or23
+  ret i1 %or0123
+}
+
+define i1 @scalar4(i1 %a, i1 %b0, i1 %b1, i1 %b2, i1 %b3, i1 %b4, i1 %b5, i1 %b6, i1 %b7) {
+; CHECK-LABEL: define i1 @scalar4(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B0:%.*]], i1 [[B1:%.*]], i1 [[B2:%.*]], i1 [[B3:%.*]], i1 [[B4:%.*]], i1 [[B5:%.*]], i1 [[B6:%.*]], i1 [[B7:%.*]]) {
+; CHECK-NEXT:    [[XOR0:%.*]] = xor i1 [[A]], [[B0]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i1 [[A]], [[B1]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i1 [[A]], [[B2]]
+; CHECK-NEXT:    [[XOR3:%.*]] = xor i1 [[A]], [[B3]]
+; CHECK-NEXT:    [[XOR4:%.*]] = xor i1 [[A]], [[B4]]
+; CHECK-NEXT:    [[XOR5:%.*]] = xor i1 [[A]], [[B5]]
+; CHECK-NEXT:    [[XOR6:%.*]] = xor i1 [[A]], [[B6]]
+; CHECK-NEXT:    [[XOR7:%.*]] = xor i1 [[A]], [[B7]]
+; CHECK-NEXT:    [[OR0:%.*]] = or i1 [[XOR0]], [[XOR1]]
+; CHECK-NEXT:    [[OR1:%.*]] = or i1 [[XOR2]], [[XOR3]]
+; CHECK-NEXT:    [[OR2:%.*]] = or i1 [[XOR4]], [[XOR5]]
+; CHECK-NEXT:    [[OR3:%.*]] = or i1 [[XOR6]], [[XOR7]]
+; CHECK-NEXT:    [[OR4:%.*]] = or i1 [[B0]], [[B1]]
+; CHECK-NEXT:    [[OR5:%.*]] = or i1 [[B2]], [[B3]]
+; CHECK-NEXT:    [[OR6:%.*]] = or i1 [[B4]], [[B5]]
+; CHECK-NEXT:    [[OR7:%.*]] = or i1 [[B6]], [[B7]]
+; CHECK-NEXT:    [[OR01:%.*]] = or i1 [[OR0]], [[OR1]]
+; CHECK-NEXT:    [[OR23:%.*]] = or i1 [[OR2]], [[OR3]]
+; CHECK-NEXT:    [[OR45:%.*]] = or i1 [[OR4]], [[OR5]]
+; CHECK-NEXT:    [[OR67:%.*]] = or i1 [[OR6]], [[OR7]]
+; CHECK-NEXT:    [[OR0123:%.*]] = or i1 [[OR01]], [[OR23]]
+; CHECK-NEXT:    [[OR4567:%.*]] = or i1 [[OR45]], [[OR67]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = or i1 [[OR4567]], [[OR0123]]
+; CHECK-NEXT:    ret i1 [[OR01234567]]
+;
+  %xor0 = xor i1 %b0, %a
+  %xor1 = xor i1 %b1, %a
+  %xor2 = xor i1 %b2, %a
+  %xor3 = xor i1 %b3, %a
+  %xor4 = xor i1 %b4, %a
+  %xor5 = xor i1 %b5, %a
+  %xor6 = xor i1 %b6, %a
+  %xor7 = xor i1 %b7, %a
+  %or0 = or i1 %xor0, %xor1
+  %or1 = or i1 %xor2, %xor3
+  %or2 = or i1 %xor4, %xor5
+  %or3 = or i1 %xor6, %xor7
+  %or4 = or i1 %b0, %b1
+  %or5 = or i1 %b2, %b3
+  %or6 = or i1 %b4, %b5
+  %or7 = or i1 %b6, %b7
+  %or01 = or i1 %or0, %or1
+  %or23 = or i1 %or2, %or3
+  %or45 = or i1 %or4, %or5
+  %or67 = or i1 %or6, %or7
+  %or0123 = or i1 %or01, %or23
+  %or4567 = or i1 %or45, %or67
+  %or01234567 = or i1 %or0123, %or4567
+  ret i1 %or01234567
+}
diff --git a/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll b/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll
new file mode 100644
index 000000000000..fcedde23ecc7
--- /dev/null
+++ b/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll
@@ -0,0 +1,227 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=reassociate -S | FileCheck %s
+
+define <8 x i1> @vector0(<8 x i1> %b0, <8 x i1> %b1, <8 x i1> %b2, <8 x i1> %b3, <8 x i1> %b4, <8 x i1> %b5, <8 x i1> %b6, <8 x i1> %b7) {
+; CHECK-LABEL: define <8 x i1> @vector0(
+; CHECK-SAME: <8 x i1> [[B0:%.*]], <8 x i1> [[B1:%.*]], <8 x i1> [[B2:%.*]], <8 x i1> [[B3:%.*]], <8 x i1> [[B4:%.*]], <8 x i1> [[B5:%.*]], <8 x i1> [[B6:%.*]], <8 x i1> [[B7:%.*]]) {
+; CHECK-NEXT:    [[OR67:%.*]] = or <8 x i1> [[B1]], [[B0]]
+; CHECK-NEXT:    [[OR45:%.*]] = or <8 x i1> [[OR67]], [[B2]]
+; CHECK-NEXT:    [[OR4567:%.*]] = or <8 x i1> [[OR45]], [[B3]]
+; CHECK-NEXT:    [[OR23:%.*]] = or <8 x i1> [[OR4567]], [[B4]]
+; CHECK-NEXT:    [[OR01:%.*]] = or <8 x i1> [[OR23]], [[B5]]
+; CHECK-NEXT:    [[OR0123:%.*]] = or <8 x i1> [[OR01]], [[B6]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = or <8 x i1> [[OR0123]], [[B7]]
+; CHECK-NEXT:    ret <8 x i1> [[OR01234567]]
+;
+  %or01 = or <8 x i1> %b0, %b1
+  %or23 = or <8 x i1> %b2, %b3
+  %or45 = or <8 x i1> %b4, %b5
+  %or67 = or <8 x i1> %b6, %b7
+  %or0123 = or <8 x i1> %or01, %or23
+  %or4567 = or <8 x i1> %or45, %or67
+  %or01234567 = or <8 x i1> %or0123, %or4567
+  ret <8 x i1> %or01234567
+}
+
+define <8 x i1> @vector1(<8 x i1> %b0, <8 x i1> %b1, <8 x i1> %b2, <8 x i1> %b3, <8 x i1> %b4, <8 x i1> %b5, <8 x i1> %b6, <8 x i1> %b7) {
+; CHECK-LABEL: define <8 x i1> @vector1(
+; CHECK-SAME: <8 x i1> [[B0:%.*]], <8 x i1> [[B1:%.*]], <8 x i1> [[B2:%.*]], <8 x i1> [[B3:%.*]], <8 x i1> [[B4:%.*]], <8 x i1> [[B5:%.*]], <8 x i1> [[B6:%.*]], <8 x i1> [[B7:%.*]]) {
+; CHECK-NEXT:    [[OR67:%.*]] = and <8 x i1> [[B1]], [[B0]]
+; CHECK-NEXT:    [[OR45:%.*]] = and <8 x i1> [[OR67]], [[B2]]
+; CHECK-NEXT:    [[OR4567:%.*]] = and <8 x i1> [[OR45]], [[B3]]
+; CHECK-NEXT:    [[OR23:%.*]] = and <8 x i1> [[OR4567]], [[B4]]
+; CHECK-NEXT:    [[OR01:%.*]] = and <8 x i1> [[OR23]], [[B5]]
+; CHECK-NEXT:    [[OR0123:%.*]] = and <8 x i1> [[OR01]], [[B6]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = and <8 x i1> [[OR0123]], [[B7]]
+; CHECK-NEXT:    ret <8 x i1> [[OR01234567]]
+;
+  %or01 = and <8 x i1> %b0, %b1
+  %or23 = and <8 x i1> %b2, %b3
+  %or45 = and <8 x i1> %b4, %b5
+  %or67 = and <8 x i1> %b6, %b7
+  %or0123 = and <8 x i1> %or01, %or23
+  %or4567 = and <8 x i1> %or45, %or67
+  %or01234567 = and <8 x i1> %or0123, %or4567
+  ret <8 x i1> %or01234567
+}
+
+define <8 x i1> @vector2(<8 x i1> %a, <8 x i1> %b0, <8 x i1> %b1, <8 x i1> %b2, <8 x i1> %b3, <8 x i1> %b4, <8 x i1> %b5, <8 x i1> %b6, <8 x i1> %b7) {
+; CHECK-LABEL: define <8 x i1> @vector2(
+; CHECK-SAME: <8 x i1> [[A:%.*]], <8 x i1> [[B0:%.*]], <8 x i1> [[B1:%.*]], <8 x i1> [[B2:%.*]], <8 x i1> [[B3:%.*]], <8 x i1> [[B4:%.*]], <8 x i1> [[B5:%.*]], <8 x i1> [[B6:%.*]], <8 x i1> [[B7:%.*]]) {
+; CHECK-NEXT:    [[OR0:%.*]] = or <8 x i1> [[B0]], [[A]]
+; CHECK-NEXT:    [[OR1:%.*]] = or <8 x i1> [[B1]], [[A]]
+; CHECK-NEXT:    [[OR2:%.*]] = or <8 x i1> [[B2]], [[A]]
+; CHECK-NEXT:    [[OR3:%.*]] = or <8 x i1> [[B3]], [[A]]
+; CHECK-NEXT:    [[OR4:%.*]] = or <8 x i1> [[B4]], [[A]]
+; CHECK-NEXT:    [[OR5:%.*]] = or <8 x i1> [[B5]], [[A]]
+; CHECK-NEXT:    [[OR6:%.*]] = or <8 x i1> [[B6]], [[A]]
+; CHECK-NEXT:    [[OR7:%.*]] = or <8 x i1> [[B7]], [[A]]
+; CHECK-NEXT:    [[XOR0:%.*]] = xor <8 x i1> [[OR1]], [[OR0]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor <8 x i1> [[XOR0]], [[OR2]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor <8 x i1> [[XOR1]], [[OR3]]
+; CHECK-NEXT:    [[XOR3:%.*]] = xor <8 x i1> [[XOR2]], [[OR4]]
+; CHECK-NEXT:    [[XOR4:%.*]] = xor <8 x i1> [[XOR3]], [[OR5]]
+; CHECK-NEXT:    [[XOR5:%.*]] = xor <8 x i1> [[XOR4]], [[OR6]]
+; CHECK-NEXT:    [[XOR6:%.*]] = xor <8 x i1> [[XOR5]], [[OR7]]
+; CHECK-NEXT:    [[OR045:%.*]] = or <8 x i1> [[XOR1]], [[XOR0]]
+; CHECK-NEXT:    [[OR4560:%.*]] = or <8 x i1> [[OR045]], [[XOR2]]
+; CHECK-NEXT:    [[OR023:%.*]] = or <8 x i1> [[OR4560]], [[XOR3]]
+; CHECK-NEXT:    [[OR001:%.*]] = or <8 x i1> [[OR023]], [[XOR4]]
+; CHECK-NEXT:    [[OR0123:%.*]] = or <8 x i1> [[OR001]], [[XOR5]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = or <8 x i1> [[OR0123]], [[XOR6]]
+; CHECK-NEXT:    ret <8 x i1> [[OR01234567]]
+;
+  %or0 = or <8 x i1> %b0, %a
+  %or1 = or <8 x i1> %b1, %a
+  %or2 = or <8 x i1> %b2, %a
+  %or3 = or <8 x i1> %b3, %a
+  %or4 = or <8 x i1> %b4, %a
+  %or5 = or <8 x i1> %b5, %a
+  %or6 = or <8 x i1> %b6, %a
+  %or7 = or <8 x i1> %b7, %a
+  %xor0 = xor <8 x i1> %or0, %or1
+  %xor1 = xor <8 x i1> %xor0, %or2
+  %xor2 = xor <8 x i1> %xor1, %or3
+  %xor3 = xor <8 x i1> %xor2, %or4
+  %xor4 = xor <8 x i1> %xor3, %or5
+  %xor5 = xor <8 x i1> %xor4, %or6
+  %xor6 = xor <8 x i1> %xor5, %or7
+  %or001 = or <8 x i1> %xor0, %xor1
+  %or023 = or <8 x i1> %xor2, %xor3
+  %or045 = or <8 x i1> %xor4, %xor5
+  %or060 = or <8 x i1> %xor6, %xor0
+  %or0123 = or <8 x i1> %or001, %or023
+  %or4560 = or <8 x i1> %or045, %or060
+  %or01234567 = or <8 x i1> %or0123, %or4560
+  ret <8 x i1> %or01234567
+}
+
+define <8 x i1> @vector3(<8 x i1> %a, <8 x i1> %b0, <8 x i1> %b1, <8 x i1> %b2, <8 x i1> %b3, <8 x i1> %b4, <8 x i1> %b5, <8 x i1> %b6, <8 x i1> %b7) {
+; CHECK-LABEL: define <8 x i1> @vector3(
+; CHECK-SAME: <8 x i1> [[A:%.*]], <8 x i1> [[B0:%.*]], <8 x i1> [[B1:%.*]], <8 x i1> [[B2:%.*]], <8 x i1> [[B3:%.*]], <8 x i1> [[B4:%.*]], <8 x i1> [[B5:%.*]], <8 x i1> [[B6:%.*]], <8 x i1> [[B7:%.*]]) {
+; CHECK-NEXT:    [[OR0:%.*]] = or <8 x i1> [[B0]], [[A]]
+; CHECK-NEXT:    [[OR1:%.*]] = or <8 x i1> [[B1]], [[A]]
+; CHECK-NEXT:    [[OR2:%.*]] = or <8 x i1> [[B2]], [[A]]
+; CHECK-NEXT:    [[OR3:%.*]] = or <8 x i1> [[B3]], [[A]]
+; CHECK-NEXT:    [[OR4:%.*]] = or <8 x i1> [[B4]], [[A]]
+; CHECK-NEXT:    [[OR5:%.*]] = or <8 x i1> [[B5]], [[A]]
+; CHECK-NEXT:    [[OR6:%.*]] = or <8 x i1> [[B6]], [[A]]
+; CHECK-NEXT:    [[OR7:%.*]] = or <8 x i1> [[B7]], [[A]]
+; CHECK-NEXT:    [[XOR3:%.*]] = xor <8 x i1> [[OR1]], [[OR0]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor <8 x i1> [[XOR3]], [[OR2]]
+; CHECK-NEXT:    [[XOR7:%.*]] = xor <8 x i1> [[XOR2]], [[OR3]]
+; CHECK-NEXT:    [[XOR0:%.*]] = xor <8 x i1> [[XOR7]], [[OR4]]
+; CHECK-NEXT:    [[XOR4:%.*]] = xor <8 x i1> [[XOR0]], [[OR5]]
+; CHECK-NEXT:    [[XOR5:%.*]] = xor <8 x i1> [[XOR4]], [[OR6]]
+; CHECK-NEXT:    [[OR4560:%.*]] = xor <8 x i1> [[XOR5]], [[OR7]]
+; CHECK-NEXT:    ret <8 x i1> [[OR4560]]
+;
+  %or0 = or <8 x i1> %b0, %a
+  %or1 = or <8 x i1> %b1, %a
+  %or2 = or <8 x i1> %b2, %a
+  %or3 = or <8 x i1> %b3, %a
+  %or4 = or <8 x i1> %b4, %a
+  %or5 = or <8 x i1> %b5, %a
+  %or6 = or <8 x i1> %b6, %a
+  %or7 = or <8 x i1> %b7, %a
+  %xor0 = xor <8 x i1> %or0, %or1
+  %xor1 = xor <8 x i1> %or2, %or3
+  %xor2 = xor <8 x i1> %or4, %or5
+  %xor3 = xor <8 x i1> %or6, %or7
+  %or01 = xor <8 x i1> %xor0, %xor1
+  %or23 = xor <8 x i1> %xor2, %xor3
+  %or0123 = xor <8 x i1> %or01, %or23
+  ret <8 x i1> %or0123
+}
+
+define <8 x i1> @vector4(<8 x i1> %a, <8 x i1> %b0, <8 x i1> %b1, <8 x i1> %b2, <8 x i1> %b3, <8 x i1> %b4, <8 x i1> %b5, <8 x i1> %b6, <8 x i1> %b7) {
+; CHECK-LABEL: define <8 x i1> @vector4(
+; CHECK-SAME: <8 x i1> [[A:%.*]], <8 x i1> [[B0:%.*]], <8 x i1> [[B1:%.*]], <8 x i1> [[B2:%.*]], <8 x i1> [[B3:%.*]], <8 x i1> [[B4:%.*]], <8 x i1> [[B5:%.*]], <8 x i1> [[B6:%.*]], <8 x i1> [[B7:%.*]]) {
+; CHECK-NEXT:    [[XOR0:%.*]] = xor <8 x i1> [[B0]], [[A]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor <8 x i1> [[B1]], [[A]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor <8 x i1> [[B2]], [[A]]
+; CHECK-NEXT:    [[XOR3:%.*]] = xor <8 x i1> [[B3]], [[A]]
+; CHECK-NEXT:    [[XOR4:%.*]] = xor <8 x i1> [[B4]], [[A]]
+; CHECK-NEXT:    [[XOR5:%.*]] = xor <8 x i1> [[B5]], [[A]]
+; CHECK-NEXT:    [[XOR6:%.*]] = xor <8 x i1> [[B6]], [[A]]
+; CHECK-NEXT:    [[XOR7:%.*]] = xor <8 x i1> [[B7]], [[A]]
+; CHECK-NEXT:    [[AND3:%.*]] = and <8 x i1> [[XOR1]], [[XOR0]]
+; CHECK-NEXT:    [[AND2:%.*]] = and <8 x i1> [[AND3]], [[XOR2]]
+; CHECK-NEXT:    [[OR23:%.*]] = and <8 x i1> [[AND2]], [[XOR3]]
+; CHECK-NEXT:    [[AND1:%.*]] = and <8 x i1> [[OR23]], [[XOR4]]
+; CHECK-NEXT:    [[AND0:%.*]] = and <8 x i1> [[AND1]], [[XOR5]]
+; CHECK-NEXT:    [[OR01:%.*]] = and <8 x i1> [[AND0]], [[XOR6]]
+; CHECK-NEXT:    [[OR0123:%.*]] = and <8 x i1> [[OR01]], [[XOR7]]
+; CHECK-NEXT:    ret <8 x i1> [[OR0123]]
+;
+  %xor0 = xor <8 x i1> %b0, %a
+  %xor1 = xor <8 x i1> %b1, %a
+  %xor2 = xor <8 x i1> %b2, %a
+  %xor3 = xor <8 x i1> %b3, %a
+  %xor4 = xor <8 x i1> %b4, %a
+  %xor5 = xor <8 x i1> %b5, %a
+  %xor6 = xor <8 x i1> %b6, %a
+  %xor7 = xor <8 x i1> %b7, %a
+  %and0 = and <8 x i1> %xor0, %xor1
+  %and1 = and <8 x i1> %xor2, %xor3
+  %and2 = and <8 x i1> %xor4, %xor5
+  %and3 = and <8 x i1> %xor6, %xor7
+  %or01 = and <8 x i1> %and0, %and1
+  %or23 = and <8 x i1> %and2, %and3
+  %or0123 = and <8 x i1> %or01, %or23
+  ret <8 x i1> %or0123
+}
+
+define <8 x i1> @vector5(<8 x i1> %a, <8 x i1> %b0, <8 x i1> %b1, <8 x i1> %b2, <8 x i1> %b3, <8 x i1> %b4, <8 x i1> %b5, <8 x i1> %b6, <8 x i1> %b7) {
+; CHECK-LABEL: define <8 x i1> @vector5(
+; CHECK-SAME: <8 x i1> [[A:%.*]], <8 x i1> [[B0:%.*]], <8 x i1> [[B1:%.*]], <8 x i1> [[B2:%.*]], <8 x i1> [[B3:%.*]], <8 x i1> [[B4:%.*]], <8 x i1> [[B5:%.*]], <8 x i1> [[B6:%.*]], <8 x i1> [[B7:%.*]]) {
+; CHECK-NEXT:    [[XOR0:%.*]] = xor <8 x i1> [[B0]], [[A]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor <8 x i1> [[B1]], [[A]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor <8 x i1> [[B2]], [[A]]
+; CHECK-NEXT:    [[XOR3:%.*]] = xor <8 x i1> [[B3]], [[A]]
+; CHECK-NEXT:    [[XOR4:%.*]] = xor <8 x i1> [[B4]], [[A]]
+; CHECK-NEXT:    [[XOR5:%.*]] = xor <8 x i1> [[B5]], [[A]]
+; CHECK-NEXT:    [[XOR6:%.*]] = xor <8 x i1> [[B6]], [[A]]
+; CHECK-NEXT:    [[XOR7:%.*]] = xor <8 x i1> [[B7]], [[A]]
+; CHECK-NEXT:    [[OR3:%.*]] = or <8 x i1> [[B1]], [[B0]]
+; CHECK-NEXT:    [[OR2:%.*]] = or <8 x i1> [[OR3]], [[XOR0]]
+; CHECK-NEXT:    [[OR23:%.*]] = or <8 x i1> [[OR2]], [[B2]]
+; CHECK-NEXT:    [[OR1:%.*]] = or <8 x i1> [[OR23]], [[XOR1]]
+; CHECK-NEXT:    [[OR0:%.*]] = or <8 x i1> [[OR1]], [[B3]]
+; CHECK-NEXT:    [[OR01:%.*]] = or <8 x i1> [[OR0]], [[XOR2]]
+; CHECK-NEXT:    [[OR0123:%.*]] = or <8 x i1> [[OR01]], [[B4]]
+; CHECK-NEXT:    [[OR7:%.*]] = or <8 x i1> [[OR0123]], [[XOR3]]
+; CHECK-NEXT:    [[OR6:%.*]] = or <8 x i1> [[OR7]], [[B5]]
+; CHECK-NEXT:    [[OR67:%.*]] = or <8 x i1> [[OR6]], [[XOR4]]
+; CHECK-NEXT:    [[OR5:%.*]] = or <8 x i1> [[OR67]], [[B6]]
+; CHECK-NEXT:    [[OR4:%.*]] = or <8 x i1> [[OR5]], [[XOR5]]
+; CHECK-NEXT:    [[OR45:%.*]] = or <8 x i1> [[OR4]], [[B7]]
+; CHECK-NEXT:    [[OR4567:%.*]] = or <8 x i1> [[OR45]], [[XOR6]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = or <8 x i1> [[OR4567]], [[XOR7]]
+; CHECK-NEXT:    ret <8 x i1> [[OR01234567]]
+;
+  %xor0 = xor <8 x i1> %b0, %a
+  %xor1 = xor <8 x i1> %b1, %a
+  %xor2 = xor <8 x i1> %b2, %a
+  %xor3 = xor <8 x i1> %b3, %a
+  %xor4 = xor <8 x i1> %b4, %a
+  %xor5 = xor <8 x i1> %b5, %a
+  %xor6 = xor <8 x i1> %b6, %a
+  %xor7 = xor <8 x i1> %b7, %a
+  %or0 = or <8 x i1> %xor0, %xor1
+  %or1 = or <8 x i1> %xor2, %xor3
+  %or2 = or <8 x i1> %xor4, %xor5
+  %or3 = or <8 x i1> %xor6, %xor7
+  %or4 = or <8 x i1> %b0, %b1
+  %or5 = or <8 x i1> %b2, %b3
+  %or6 = or <8 x i1> %b4, %b5
+  %or7 = or <8 x i1> %b6, %b7
+  %or01 = or <8 x i1> %or0, %or1
+  %or23 = or <8 x i1> %or2, %or3
+  %or45 = or <8 x i1> %or4, %or5
+  %or67 = or <8 x i1> %or6, %or7
+  %or0123 = or <8 x i1> %or01, %or23
+  %or4567 = or <8 x i1> %or45, %or67
+  %or01234567 = or <8 x i1> %or0123, %or4567
+  ret <8 x i1> %or01234567
+}
diff --git a/llvm/test/Transforms/Reassociate/undef_intrinsics_when_deleting_instructions.ll b/llvm/test/Transforms/Reassociate/undef_intrinsics_when_deleting_instructions.ll
index ef9b86db52f2..a98bdb5a88e4 100644
--- a/llvm/test/Transforms/Reassociate/undef_intrinsics_when_deleting_instructions.ll
+++ b/llvm/test/Transforms/Reassociate/undef_intrinsics_when_deleting_instructions.ll
@@ -1,74 +1,74 @@
-; RUN: opt < %s -passes=reassociate -S | FileCheck %s
-; RUN: opt < %s -passes=reassociate -S --try-experimental-debuginfo-iterators | FileCheck %s
-
-; Check that reassociate pass now undefs debug intrinsics that reference a value
-; that gets dropped and cannot be salvaged.
-
-; CHECK-NOT: %add = fadd fast float %a, %b
-; CHECK: call void @llvm.dbg.value(metadata float poison, metadata [[VAR_X:![0-9]+]], metadata !DIExpression())
-
-; CHECK-LABEL: if.then:
-; CHECK-NOT: %add1 = fadd fast float %add, %c
-; CHECK: call void @llvm.dbg.value(metadata float poison, metadata [[VAR_Y:![0-9]+]], metadata !DIExpression())
-; CHECK-LABEL: !0 =
-; CHECK-DAG: [[VAR_Y]] = !DILocalVariable(name: "y"
-; CHECK-DAG: [[VAR_X]] = !DILocalVariable(name: "x"
-
-define float @"?foo@@YAMMMMM@Z"(float %a, float %b, float %c, float %d) !dbg !8 {
-entry:
-  call void @llvm.dbg.value(metadata float %d, metadata !12, metadata !DIExpression()), !dbg !13
-  call void @llvm.dbg.value(metadata float %c, metadata !14, metadata !DIExpression()), !dbg !13
-  call void @llvm.dbg.value(metadata float %b, metadata !15, metadata !DIExpression()), !dbg !13
-  call void @llvm.dbg.value(metadata float %a, metadata !16, metadata !DIExpression()), !dbg !13
-  %add = fadd fast float %a, %b, !dbg !17
-  call void @llvm.dbg.value(metadata float %add, metadata !18, metadata !DIExpression()), !dbg !13
-  %cmp = fcmp fast oeq float %d, 4.000000e+00, !dbg !19
-  br i1 %cmp, label %if.then, label %return, !dbg !19
+; RUN: opt < %s -passes=reassociate -S | FileCheck %s
+; RUN: opt < %s -passes=reassociate -S --try-experimental-debuginfo-iterators | FileCheck %s
 
-if.then:                                          ; preds = %entry
-  %add1 = fadd fast float %add, %c, !dbg !20
-  call void @llvm.dbg.value(metadata float %add1, metadata !23, metadata !DIExpression()), !dbg !24
-  %sub = fsub fast float %add, 1.200000e+01, !dbg !25
-  %sub2 = fsub fast float %add1, %sub, !dbg !25
-  %mul = fmul fast float %sub2, 2.000000e+01, !dbg !25
-  %div = fdiv fast float %mul, 3.000000e+00, !dbg !25
-  br label %return, !dbg !25
+; Check that reassociate pass now undefs debug intrinsics that reference a value
+; that gets dropped and cannot be salvaged.
 
-return:                                           ; preds = %entry, %if.then
-  %retval.0 = phi float [ %div, %if.then ], [ 0.000000e+00, %entry ], !dbg !13
-  ret float %retval.0, !dbg !26
+; CHECK-NOT: %add = fadd fast float %a, %b
+; CHECK: call void @llvm.dbg.value(metadata float poison, metadata [[VAR_X:![0-9]+]], metadata !DIExpression())
+
+; CHECK-LABEL: if.then:
+; CHECK-NOT: %add1 = fadd fast float %add, %c
+; CHECK: call void @llvm.dbg.value(metadata float poison, metadata [[VAR_Y:![0-9]+]], metadata !DIExpression())
+; CHECK-LABEL: !0 =
+; CHECK-DAG: [[VAR_Y]] = !DILocalVariable(name: "y"
+; CHECK-DAG: [[VAR_X]] = !DILocalVariable(name: "x"
+
+define float @"?foo@@YAMMMMM@Z"(float %a, float %b, float %c, float %d) !dbg !8 {
+entry:
+  call void @llvm.dbg.value(metadata float %d, metadata !12, metadata !DIExpression()), !dbg !13
+  call void @llvm.dbg.value(metadata float %c, metadata !14, metadata !DIExpression()), !dbg !13
+  call void @llvm.dbg.value(metadata float %b, metadata !15, metadata !DIExpression()), !dbg !13
+  call void @llvm.dbg.value(metadata float %a, metadata !16, metadata !DIExpression()), !dbg !13
+  %add = fadd fast float %a, %b, !dbg !17
+  call void @llvm.dbg.value(metadata float %add, metadata !18, metadata !DIExpression()), !dbg !13
+  %cmp = fcmp fast oeq float %d, 4.000000e+00, !dbg !19
+  br i1 %cmp, label %if.then, label %return, !dbg !19
+
+if.then:                                          ; preds = %entry
+  %add1 = fadd fast float %add, %c, !dbg !20
+  call void @llvm.dbg.value(metadata float %add1, metadata !23, metadata !DIExpression()), !dbg !24
+  %sub = fsub fast float %add, 1.200000e+01, !dbg !25
+  %sub2 = fsub fast float %add1, %sub, !dbg !25
+  %mul = fmul fast float %sub2, 2.000000e+01, !dbg !25
+  %div = fdiv fast float %mul, 3.000000e+00, !dbg !25
+  br label %return, !dbg !25
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi float [ %div, %if.then ], [ 0.000000e+00, %entry ], !dbg !13
+  ret float %retval.0, !dbg !26
 }
 
-declare void @llvm.dbg.value(metadata, metadata, metadata)
+declare void @llvm.dbg.value(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5, !6}
 !llvm.ident = !{!7}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
-!1 = !DIFile(filename: "undef_intrinsics_when_deleting_instructions.cpp", directory: "/")
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "undef_intrinsics_when_deleting_instructions.cpp", directory: "/")
 !2 = !{}
-!3 = !{i32 2, !"CodeView", i32 1}
+!3 = !{i32 2, !"CodeView", i32 1}
 !4 = !{i32 2, !"Debug Info Version", i32 3}
 !5 = !{i32 1, !"wchar_size", i32 2}
 !6 = !{i32 7, !"PIC Level", i32 2}
-!7 = !{!"clang version 11.0.0"}
-!8 = distinct !DISubprogram(name: "foo", linkageName: "?foo@@YAMMMMM@Z", scope: !1, file: !1, line: 1, type: !9, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
-!9 = !DISubroutineType(types: !10)
-!10 = !{!11, !11, !11, !11, !11}
-!11 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
-!12 = !DILocalVariable(name: "d", arg: 4, scope: !8, file: !1, line: 1, type: !11)
-!13 = !DILocation(line: 0, scope: !8)
-!14 = !DILocalVariable(name: "c", arg: 3, scope: !8, file: !1, line: 1, type: !11)
-!15 = !DILocalVariable(name: "b", arg: 2, scope: !8, file: !1, line: 1, type: !11)
-!16 = !DILocalVariable(name: "a", arg: 1, scope: !8, file: !1, line: 1, type: !11)
-!17 = !DILocation(line: 2, scope: !8)
-!18 = !DILocalVariable(name: "x", scope: !8, file: !1, line: 2, type: !11)
-!19 = !DILocation(line: 3, scope: !8)
-!20 = !DILocation(line: 4, scope: !21)
-!21 = distinct !DILexicalBlock(scope: !22, file: !1, line: 3)
-!22 = distinct !DILexicalBlock(scope: !8, file: !1, line: 3)
-!23 = !DILocalVariable(name: "y", scope: !21, file: !1, line: 4, type: !11)
-!24 = !DILocation(line: 0, scope: !21)
-!25 = !DILocation(line: 5, scope: !21)
-!26 = !DILocation(line: 8, scope: !8)
+!7 = !{!"clang version 11.0.0"}
+!8 = distinct !DISubprogram(name: "foo", linkageName: "?foo@@YAMMMMM@Z", scope: !1, file: !1, line: 1, type: !9, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!9 = !DISubroutineType(types: !10)
+!10 = !{!11, !11, !11, !11, !11}
+!11 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
+!12 = !DILocalVariable(name: "d", arg: 4, scope: !8, file: !1, line: 1, type: !11)
+!13 = !DILocation(line: 0, scope: !8)
+!14 = !DILocalVariable(name: "c", arg: 3, scope: !8, file: !1, line: 1, type: !11)
+!15 = !DILocalVariable(name: "b", arg: 2, scope: !8, file: !1, line: 1, type: !11)
+!16 = !DILocalVariable(name: "a", arg: 1, scope: !8, file: !1, line: 1, type: !11)
+!17 = !DILocation(line: 2, scope: !8)
+!18 = !DILocalVariable(name: "x", scope: !8, file: !1, line: 2, type: !11)
+!19 = !DILocation(line: 3, scope: !8)
+!20 = !DILocation(line: 4, scope: !21)
+!21 = distinct !DILexicalBlock(scope: !22, file: !1, line: 3)
+!22 = distinct !DILexicalBlock(scope: !8, file: !1, line: 3)
+!23 = !DILocalVariable(name: "y", scope: !21, file: !1, line: 4, type: !11)
+!24 = !DILocation(line: 0, scope: !21)
+!25 = !DILocation(line: 5, scope: !21)
+!26 = !DILocation(line: 8, scope: !8)
diff --git a/llvm/test/Transforms/SCCP/ip-ranges-casts.ll b/llvm/test/Transforms/SCCP/ip-ranges-casts.ll
index 05fa04a9fbe0..e8d417546def 100644
--- a/llvm/test/Transforms/SCCP/ip-ranges-casts.ll
+++ b/llvm/test/Transforms/SCCP/ip-ranges-casts.ll
@@ -167,7 +167,7 @@ define i1 @caller.sext() {
 define internal i1 @f.fptosi(i32 %x) {
 ; CHECK-LABEL: define internal i1 @f.fptosi(
 ; CHECK-SAME: i32 [[X:%.*]]) {
-; CHECK-NEXT:    [[TO_DOUBLE:%.*]] = sitofp i32 [[X]] to double
+; CHECK-NEXT:    [[TO_DOUBLE:%.*]] = uitofp nneg i32 [[X]] to double
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd double 0.000000e+00, [[TO_DOUBLE]]
 ; CHECK-NEXT:    [[TO_I32:%.*]] = fptosi double [[ADD]] to i32
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp sgt i32 [[TO_I32]], 300
@@ -209,7 +209,7 @@ define i1 @caller.fptosi() {
 define internal i1 @f.fpext(i16 %x) {
 ; CHECK-LABEL: define internal i1 @f.fpext(
 ; CHECK-SAME: i16 [[X:%.*]]) {
-; CHECK-NEXT:    [[TO_FLOAT:%.*]] = sitofp i16 [[X]] to float
+; CHECK-NEXT:    [[TO_FLOAT:%.*]] = uitofp nneg i16 [[X]] to float
 ; CHECK-NEXT:    [[TO_DOUBLE:%.*]] = fpext float [[TO_FLOAT]] to double
 ; CHECK-NEXT:    [[TO_I64:%.*]] = fptoui float [[TO_FLOAT]] to i64
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp sgt i64 [[TO_I64]], 300
@@ -293,7 +293,7 @@ define i1 @int_range_to_double_cast(i32 %a) {
 ; CHECK-LABEL: define i1 @int_range_to_double_cast(
 ; CHECK-SAME: i32 [[A:%.*]]) {
 ; CHECK-NEXT:    [[R:%.*]] = and i32 [[A]], 255
-; CHECK-NEXT:    [[T4:%.*]] = sitofp i32 [[R]] to double
+; CHECK-NEXT:    [[T4:%.*]] = uitofp nneg i32 [[R]] to double
 ; CHECK-NEXT:    [[T10:%.*]] = fadd double 0.000000e+00, [[T4]]
 ; CHECK-NEXT:    [[T11:%.*]] = fcmp olt double [[T4]], [[T10]]
 ; CHECK-NEXT:    ret i1 [[T11]]
diff --git a/llvm/test/Transforms/SCCP/sitofp.ll b/llvm/test/Transforms/SCCP/sitofp.ll
index b635263a5726..24f04ae1fccb 100644
--- a/llvm/test/Transforms/SCCP/sitofp.ll
+++ b/llvm/test/Transforms/SCCP/sitofp.ll
@@ -4,7 +4,7 @@
 define float @sitofp_and(i8 %x) {
 ; CHECK-LABEL: @sitofp_and(
 ; CHECK-NEXT:    [[PX:%.*]] = and i8 [[X:%.*]], 127
-; CHECK-NEXT:    [[R:%.*]] = sitofp i8 [[PX]] to float
+; CHECK-NEXT:    [[R:%.*]] = uitofp nneg i8 [[PX]] to float
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %px = and i8 %x, 127
@@ -23,7 +23,7 @@ define half @sitofp_const(i8 %x) {
 define double @sitofp_zext(i7 %x) {
 ; CHECK-LABEL: @sitofp_zext(
 ; CHECK-NEXT:    [[PX:%.*]] = zext i7 [[X:%.*]] to i8
-; CHECK-NEXT:    [[R:%.*]] = sitofp i8 [[PX]] to double
+; CHECK-NEXT:    [[R:%.*]] = uitofp nneg i8 [[PX]] to double
 ; CHECK-NEXT:    ret double [[R]]
 ;
   %px = zext i7 %x to i8
@@ -52,7 +52,7 @@ define float @dominating_condition(i32 %x) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[T:%.*]], label [[F:%.*]]
 ; CHECK:       t:
-; CHECK-NEXT:    [[A:%.*]] = sitofp i32 [[X]] to float
+; CHECK-NEXT:    [[A:%.*]] = uitofp nneg i32 [[X]] to float
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       f:
 ; CHECK-NEXT:    br label [[EXIT]]
@@ -86,7 +86,7 @@ define float @dominating_condition_alt(i32 %x) {
 ; CHECK:       t:
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       f:
-; CHECK-NEXT:    [[A:%.*]] = sitofp i32 [[X]] to float
+; CHECK-NEXT:    [[A:%.*]] = uitofp nneg i32 [[X]] to float
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[COND:%.*]] = phi float [ -4.200000e+01, [[T]] ], [ [[A]], [[F]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
index 76bb882171b1..3ebe920d1734 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
@@ -5,14 +5,7 @@ define void @h() {
 ; CHECK-LABEL: define void @h() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc i32 0 to i1
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 poison, i1 false, i1 false, i1 false>, i1 [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP1:%.*]] = sub <8 x i1> [[TMP0]], zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i1> [[TMP0]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i1> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <8 x i1> [[TMP5]] to <8 x i16>
-; CHECK-NEXT:    store <8 x i16> [[TMP4]], ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/unsigned-after-sext-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/unsigned-after-sext-node.ll
new file mode 100644
index 000000000000..96ed3e77d987
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/unsigned-after-sext-node.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mtriple=aarch64 -passes=slp-vectorizer -S -slp-threshold=-100 < %s | FileCheck %s
+
+define i16 @test()  {
+; CHECK-LABEL: define i16 @test() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LNOT:%.*]] = xor i1 true, true
+; CHECK-NEXT:    [[LNOT_EXT:%.*]] = zext i1 [[LNOT]] to i16
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i16 0, [[LNOT_EXT]]
+; CHECK-NEXT:    [[LNOT5:%.*]] = xor i1 true, true
+; CHECK-NEXT:    [[LNOT_EXT6:%.*]] = zext i1 [[LNOT5]] to i16
+; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i16 [[ADD]], [[LNOT_EXT6]]
+; CHECK-NEXT:    ret i16 [[ADD7]]
+;
+entry:
+  %conv = sext i16 1 to i32
+  %cmp = icmp eq i32 %conv, 1
+  %lnot = xor i1 %cmp, true
+  %lnot.ext = zext i1 %lnot to i16
+  %add = add nsw i16 0, %lnot.ext
+  %conv2 = sext i16 1 to i32
+  %cmp3 = icmp eq i32 %conv2, 1
+  %lnot5 = xor i1 %cmp3, true
+  %lnot.ext6 = zext i1 %lnot5 to i16
+  %add7 = add nsw i16 %add, %lnot.ext6
+  ret i16 %add7
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
index 2ab6e919c23b..6404cf4a2cd1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
@@ -5,12 +5,7 @@ define void @h() {
 ; CHECK-LABEL: define void @h() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 0 to i1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 poison, i1 false, i1 false, i1 false>, i1 [[TMP0]], i32 4
-; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i1> zeroinitializer, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <8 x i1> [[TMP3]] to <8 x i16>
-; CHECK-NEXT:    store <8 x i16> [[TMP4]], ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/shuffled-gather-casted.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/shuffled-gather-casted.ll
new file mode 100644
index 000000000000..c83d9363c180
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/shuffled-gather-casted.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s
+
+define i32 @test(ptr %p) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[D_0:%.*]] = load i16, ptr [[P]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> <i16 0, i16 poison, i16 0, i16 0>, i16 [[D_0]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = or <4 x i16> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 poison, i32 0, i32 0>, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i16> [[TMP2]], <4 x i16> <i16 0, i16 2, i16 0, i16 0>
+; CHECK-NEXT:    [[TMP7:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP8]], i32 1)
+; CHECK-NEXT:    ret i32 [[TMP9]]
+;
+entry:
+  %d.0 = load i16, ptr %p, align 4
+  %zext.d.0 = zext i16 %d.0 to i32
+  %zero.0 = zext i16 0 to i32
+  %zero.1 = zext i16 0 to i32
+  %zero.2 = zext i16 0 to i32
+
+  %or.d.0 = or i32 %zext.d.0, 0
+  %or.zero.0 = or i32 %zero.0, 0
+  %or.zero.1 = or i32 %zero.1, 0
+  %or.zero.2 = or i32 %zero.2, 0
+
+  %zero.d.0 = and i32 %or.d.0, 0
+  %and.zero.0 = and i32 %or.zero.0, 0
+  %and.zero.1 = and i32 %or.zero.1, 0
+  %and.zero.2 = and i32 %or.zero.2, 0
+
+  %d.0.gt.0 = icmp sgt i32 %zext.d.0, 0
+  %false.0 = icmp sgt i32 0, 0
+  %false.1 = icmp sgt i32 0, 0
+  %false.2 = icmp sgt i32 0, 0
+
+  %select.0.2 = select i1 %d.0.gt.0, i32 %zero.d.0, i32 2
+  %select.1.0 = select i1 %false.0, i32 %and.zero.0, i32 0
+  %select.2.0 = select i1 %false.1, i32 %and.zero.1, i32 0
+  %select.3.0 = select i1 %false.2, i32 %and.zero.2, i32 0
+
+  %max.0 = call i32 @llvm.umax.i32(i32 %select.0.2, i32 %select.1.0)
+  %max.1 = call i32 @llvm.umax.i32(i32 %max.0, i32 %select.2.0)
+  %max.2 = call i32 @llvm.umax.i32(i32 %max.1, i32 %select.3.0)
+  %max.3 = call i32 @llvm.umax.i32(i32 %max.2, i32 1)
+
+  ret i32 %max.3
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/smin-signed-zextended.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/smin-signed-zextended.ll
new file mode 100644
index 000000000000..54b7b1192ec9
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/smin-signed-zextended.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s
+
+define <4 x i32> @test(i16 %0, i16 %1) {
+; CHECK-LABEL: define <4 x i32> @test(
+; CHECK-SAME: i16 [[TMP0:%.*]], i16 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> <i16 poison, i16 0>, i16 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[CONV15_I:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, i32 [[CONV15_I]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i32> [[TMP8]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
+;
+entry:
+  %conv13.1.i = zext i16 %1 to i32
+  %not.i = xor i32 %conv13.1.i, -1
+  %cond19.i = tail call i32 @llvm.smax.i32(i32 %not.i, i32 0)
+  %conv21.i = and i32 %cond19.i, 65535
+  %not.1.i = xor i32 %conv13.1.i, -1
+  %conv15.i = sext i16 %0 to i32
+  %cond19.1.i = tail call i32 @llvm.smax.i32(i32 %not.1.i, i32 %conv15.i)
+  %conv21.1.i = and i32 %cond19.1.i, 65535
+  %not.2.i = xor i32 %conv13.1.i, -1
+  %cond19.2.i = tail call i32 @llvm.smax.i32(i32 %not.2.i, i32 %conv15.i)
+  %conv21.2.i = and i32 %cond19.2.i, 65535
+  %conv13.3.i = zext i16 0 to i32
+  %not.3.i = xor i32 %conv13.3.i, -1
+  %cond19.3.i = tail call i32 @llvm.smax.i32(i32 %not.3.i, i32 %conv15.i)
+  %conv21.3.i = and i32 %cond19.3.i, 65535
+  %ins1 = insertelement <4 x i32> poison, i32 %conv21.i, i32 0
+  %ins2 = insertelement <4 x i32> %ins1, i32 %conv21.1.i, i32 1
+  %ins3 = insertelement <4 x i32> %ins2, i32 %conv21.2.i, i32 2
+  %ins4 = insertelement <4 x i32> %ins3, i32 %conv21.3.i, i32 3
+  ret <4 x i32> %ins4
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
index 56e8829b0ec6..d4dca87bfd8e 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
@@ -5,11 +5,11 @@ define void @store_reverse(ptr %p3) {
 ; CHECK-LABEL: @store_reverse(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[P3:%.*]], i64 8
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P3]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[ARRAYIDX1]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i64> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v4i64.p0.i64(<4 x i64> [[TMP2]], ptr align 8 [[ARRAYIDX14]], i64 -8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v4i64.p0.i64(<4 x i64> [[TMP2]], ptr align 8 [[ARRAYIDX2]], i64 -8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll
index 1bb87bf6205f..3c8e98485ffc 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll
@@ -4,10 +4,9 @@
 define void @test(ptr %a, i8 %0, i16 %b.promoted.i) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ptr [[A:%.*]], i8 [[TMP0:%.*]], i16 [[B_PROMOTED_I:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP0]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[B_PROMOTED_I]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i128 [[TMP2]] to i16
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i8 [[TMP0]] to i16
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i16> poison, i16 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i16> [[TMP4]], [[TMP7]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-call.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-call.ll
new file mode 100644
index 000000000000..0bd152e18fb4
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-call.ll
@@ -0,0 +1,540 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SCALAR
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v2 -mattr=+prefer-128-bit | FileCheck %s --check-prefixes=CHECK,VEC128
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v2 -mattr=-prefer-128-bit | FileCheck %s --check-prefixes=CHECK,VEC128
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v3 -mattr=+prefer-128-bit | FileCheck %s --check-prefixes=CHECK,VEC128
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v3 -mattr=-prefer-128-bit | FileCheck %s --check-prefixes=CHECK,VEC256,VEC256-AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v4 -mattr=+prefer-256-bit | FileCheck %s --check-prefixes=CHECK,VEC256,VEC256-AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v4 -mattr=-prefer-256-bit | FileCheck %s --check-prefixes=CHECK,VEC512
+
+@f64 = common global [16 x double] zeroinitializer, align 64
+@f32 = common global [16 x float] zeroinitializer, align 64
+@r64 = common global [16 x i64] zeroinitializer, align 64
+@r32 = common global [16 x i32] zeroinitializer, align 64
+
+define void @rint_v8f32_v8f32() {
+; SCALAR-LABEL: @rint_v8f32_v8f32(
+; SCALAR-NEXT:    [[A0:%.*]] = load float, ptr @f32, align 8
+; SCALAR-NEXT:    [[A1:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
+; SCALAR-NEXT:    [[A2:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 2), align 8
+; SCALAR-NEXT:    [[A3:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 3), align 8
+; SCALAR-NEXT:    [[A4:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; SCALAR-NEXT:    [[A5:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 5), align 8
+; SCALAR-NEXT:    [[A6:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 6), align 8
+; SCALAR-NEXT:    [[A7:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 7), align 8
+; SCALAR-NEXT:    [[R0:%.*]] = call float @llvm.rint.f32(float [[A0]])
+; SCALAR-NEXT:    [[R1:%.*]] = call float @llvm.rint.f32(float [[A1]])
+; SCALAR-NEXT:    [[R2:%.*]] = call float @llvm.rint.f32(float [[A2]])
+; SCALAR-NEXT:    [[R3:%.*]] = call float @llvm.rint.f32(float [[A3]])
+; SCALAR-NEXT:    [[R4:%.*]] = call float @llvm.rint.f32(float [[A4]])
+; SCALAR-NEXT:    [[R5:%.*]] = call float @llvm.rint.f32(float [[A5]])
+; SCALAR-NEXT:    [[R6:%.*]] = call float @llvm.rint.f32(float [[A6]])
+; SCALAR-NEXT:    [[R7:%.*]] = call float @llvm.rint.f32(float [[A7]])
+; SCALAR-NEXT:    store float [[R0]], ptr @f32, align 8
+; SCALAR-NEXT:    store float [[R1]], ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
+; SCALAR-NEXT:    store float [[R2]], ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 2), align 8
+; SCALAR-NEXT:    store float [[R3]], ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 3), align 8
+; SCALAR-NEXT:    store float [[R4]], ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; SCALAR-NEXT:    store float [[R5]], ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 5), align 8
+; SCALAR-NEXT:    store float [[R6]], ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 6), align 8
+; SCALAR-NEXT:    store float [[R7]], ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 7), align 8
+; SCALAR-NEXT:    ret void
+;
+; VEC128-LABEL: @rint_v8f32_v8f32(
+; VEC128-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @f32, align 8
+; VEC128-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]])
+; VEC128-NEXT:    store <4 x float> [[TMP2]], ptr @f32, align 8
+; VEC128-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; VEC128-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]])
+; VEC128-NEXT:    store <4 x float> [[TMP4]], ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; VEC128-NEXT:    ret void
+;
+; VEC256-LABEL: @rint_v8f32_v8f32(
+; VEC256-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @f32, align 8
+; VEC256-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])
+; VEC256-NEXT:    store <8 x float> [[TMP2]], ptr @f32, align 8
+; VEC256-NEXT:    ret void
+;
+; VEC512-LABEL: @rint_v8f32_v8f32(
+; VEC512-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @f32, align 8
+; VEC512-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])
+; VEC512-NEXT:    store <8 x float> [[TMP2]], ptr @f32, align 8
+; VEC512-NEXT:    ret void
+;
+  %a0 = load float, ptr @f32, align 8
+  %a1 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
+  %a2 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 2), align 8
+  %a3 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 3), align 8
+  %a4 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+  %a5 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 5), align 8
+  %a6 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 6), align 8
+  %a7 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 7), align 8
+  %r0 = call float @llvm.rint.i32.f32(float %a0)
+  %r1 = call float @llvm.rint.i32.f32(float %a1)
+  %r2 = call float @llvm.rint.i32.f32(float %a2)
+  %r3 = call float @llvm.rint.i32.f32(float %a3)
+  %r4 = call float @llvm.rint.i32.f32(float %a4)
+  %r5 = call float @llvm.rint.i32.f32(float %a5)
+  %r6 = call float @llvm.rint.i32.f32(float %a6)
+  %r7 = call float @llvm.rint.i32.f32(float %a7)
+  store float %r0, ptr @f32, align 8
+  store float %r1, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
+  store float %r2, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 2), align 8
+  store float %r3, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 3), align 8
+  store float %r4, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+  store float %r5, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 5), align 8
+  store float %r6, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 6), align 8
+  store float %r7, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 7), align 8
+  ret void
+}
+
+define void @rint_v8f64_v8f64() {
+; SCALAR-LABEL: @rint_v8f64_v8f64(
+; SCALAR-NEXT:    [[A0:%.*]] = load double, ptr @f64, align 8
+; SCALAR-NEXT:    [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
+; SCALAR-NEXT:    [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
+; SCALAR-NEXT:    [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 3), align 8
+; SCALAR-NEXT:    [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; SCALAR-NEXT:    [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 5), align 8
+; SCALAR-NEXT:    [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
+; SCALAR-NEXT:    [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 7), align 8
+; SCALAR-NEXT:    [[R0:%.*]] = call double @llvm.rint.f64(double [[A0]])
+; SCALAR-NEXT:    [[R1:%.*]] = call double @llvm.rint.f64(double [[A1]])
+; SCALAR-NEXT:    [[R2:%.*]] = call double @llvm.rint.f64(double [[A2]])
+; SCALAR-NEXT:    [[R3:%.*]] = call double @llvm.rint.f64(double [[A3]])
+; SCALAR-NEXT:    [[R4:%.*]] = call double @llvm.rint.f64(double [[A4]])
+; SCALAR-NEXT:    [[R5:%.*]] = call double @llvm.rint.f64(double [[A5]])
+; SCALAR-NEXT:    [[R6:%.*]] = call double @llvm.rint.f64(double [[A6]])
+; SCALAR-NEXT:    [[R7:%.*]] = call double @llvm.rint.f64(double [[A7]])
+; SCALAR-NEXT:    store double [[R0]], ptr @f64, align 8
+; SCALAR-NEXT:    store double [[R1]], ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
+; SCALAR-NEXT:    store double [[R2]], ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
+; SCALAR-NEXT:    store double [[R3]], ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 3), align 8
+; SCALAR-NEXT:    store double [[R4]], ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; SCALAR-NEXT:    store double [[R5]], ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 5), align 8
+; SCALAR-NEXT:    store double [[R6]], ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
+; SCALAR-NEXT:    store double [[R7]], ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 7), align 8
+; SCALAR-NEXT:    ret void
+;
+; VEC128-LABEL: @rint_v8f64_v8f64(
+; VEC128-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr @f64, align 8
+; VEC128-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])
+; VEC128-NEXT:    store <2 x double> [[TMP2]], ptr @f64, align 8
+; VEC128-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
+; VEC128-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]])
+; VEC128-NEXT:    store <2 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
+; VEC128-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; VEC128-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP5]])
+; VEC128-NEXT:    store <2 x double> [[TMP6]], ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; VEC128-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
+; VEC128-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP7]])
+; VEC128-NEXT:    store <2 x double> [[TMP8]], ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
+; VEC128-NEXT:    ret void
+;
+; VEC256-LABEL: @rint_v8f64_v8f64(
+; VEC256-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @f64, align 8
+; VEC256-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
+; VEC256-NEXT:    store <4 x double> [[TMP2]], ptr @f64, align 8
+; VEC256-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; VEC256-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP3]])
+; VEC256-NEXT:    store <4 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; VEC256-NEXT:    ret void
+;
+; VEC512-LABEL: @rint_v8f64_v8f64(
+; VEC512-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @f64, align 8
+; VEC512-NEXT:    [[TMP2:%.*]] = call <8 x double> @llvm.rint.v8f64(<8 x double> [[TMP1]])
+; VEC512-NEXT:    store <8 x double> [[TMP2]], ptr @f64, align 8
+; VEC512-NEXT:    ret void
+;
+  %a0 = load double, ptr @f64, align 8
+  %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
+  %a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
+  %a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 3), align 8
+  %a4 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+  %a5 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 5), align 8
+  %a6 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
+  %a7 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 7), align 8
+  %r0 = call double @llvm.rint.f64.f64(double %a0)
+  %r1 = call double @llvm.rint.f64.f64(double %a1)
+  %r2 = call double @llvm.rint.f64.f64(double %a2)
+  %r3 = call double @llvm.rint.f64.f64(double %a3)
+  %r4 = call double @llvm.rint.f64.f64(double %a4)
+  %r5 = call double @llvm.rint.f64.f64(double %a5)
+  %r6 = call double @llvm.rint.f64.f64(double %a6)
+  %r7 = call double @llvm.rint.f64.f64(double %a7)
+  store double %r0, ptr @f64, align 8
+  store double %r1, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
+  store double %r2, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
+  store double %r3, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 3), align 8
+  store double %r4, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+  store double %r5, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 5), align 8
+  store double %r6, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
+  store double %r7, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 7), align 8
+  ret void
+}
+
+define void @lrint_v8f32_v8i32() {
+; SCALAR-LABEL: @lrint_v8f32_v8i32(
+; SCALAR-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @f32, align 8
+; SCALAR-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> [[TMP1]])
+; SCALAR-NEXT:    store <4 x i32> [[TMP2]], ptr @r32, align 8
+; SCALAR-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; SCALAR-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> [[TMP3]])
+; SCALAR-NEXT:    store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 4), align 8
+; SCALAR-NEXT:    ret void
+;
+; VEC128-LABEL: @lrint_v8f32_v8i32(
+; VEC128-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @f32, align 8
+; VEC128-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> [[TMP1]])
+; VEC128-NEXT:    store <4 x i32> [[TMP2]], ptr @r32, align 8
+; VEC128-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; VEC128-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f32(<4 x float> [[TMP3]])
+; VEC128-NEXT:    store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 4), align 8
+; VEC128-NEXT:    ret void
+;
+; VEC256-LABEL: @lrint_v8f32_v8i32(
+; VEC256-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @f32, align 8
+; VEC256-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> [[TMP1]])
+; VEC256-NEXT:    store <8 x i32> [[TMP2]], ptr @r32, align 8
+; VEC256-NEXT:    ret void
+;
+; VEC512-LABEL: @lrint_v8f32_v8i32(
+; VEC512-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @f32, align 8
+; VEC512-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.lrint.v8i32.v8f32(<8 x float> [[TMP1]])
+; VEC512-NEXT:    store <8 x i32> [[TMP2]], ptr @r32, align 8
+; VEC512-NEXT:    ret void
+;
+  %a0 = load float, ptr @f32, align 8
+  %a1 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
+  %a2 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 2), align 8
+  %a3 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 3), align 8
+  %a4 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+  %a5 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 5), align 8
+  %a6 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 6), align 8
+  %a7 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 7), align 8
+  %r0 = call i32 @llvm.lrint.i32.f32(float %a0)
+  %r1 = call i32 @llvm.lrint.i32.f32(float %a1)
+  %r2 = call i32 @llvm.lrint.i32.f32(float %a2)
+  %r3 = call i32 @llvm.lrint.i32.f32(float %a3)
+  %r4 = call i32 @llvm.lrint.i32.f32(float %a4)
+  %r5 = call i32 @llvm.lrint.i32.f32(float %a5)
+  %r6 = call i32 @llvm.lrint.i32.f32(float %a6)
+  %r7 = call i32 @llvm.lrint.i32.f32(float %a7)
+  store i32 %r0, ptr @r32, align 8
+  store i32 %r1, ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 1), align 8
+  store i32 %r2, ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 2), align 8
+  store i32 %r3, ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 3), align 8
+  store i32 %r4, ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 4), align 8
+  store i32 %r5, ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 5), align 8
+  store i32 %r6, ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 6), align 8
+  store i32 %r7, ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 7), align 8
+  ret void
+}
+
+define void @lrint_v8f64_v8i32() {
+; SCALAR-LABEL: @lrint_v8f64_v8i32(
+; SCALAR-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @f64, align 8
+; SCALAR-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> [[TMP1]])
+; SCALAR-NEXT:    store <4 x i32> [[TMP2]], ptr @r32, align 8
+; SCALAR-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; SCALAR-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> [[TMP3]])
+; SCALAR-NEXT:    store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 4), align 8
+; SCALAR-NEXT:    ret void
+;
+; VEC128-LABEL: @lrint_v8f64_v8i32(
+; VEC128-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @f64, align 8
+; VEC128-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> [[TMP1]])
+; VEC128-NEXT:    store <4 x i32> [[TMP2]], ptr @r32, align 8
+; VEC128-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; VEC128-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f64(<4 x double> [[TMP3]])
+; VEC128-NEXT:    store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 4), align 8
+; VEC128-NEXT:    ret void
+;
+; VEC256-LABEL: @lrint_v8f64_v8i32(
+; VEC256-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @f64, align 8
+; VEC256-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> [[TMP1]])
+; VEC256-NEXT:    store <8 x i32> [[TMP2]], ptr @r32, align 8
+; VEC256-NEXT:    ret void
+;
+; VEC512-LABEL: @lrint_v8f64_v8i32(
+; VEC512-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @f64, align 8
+; VEC512-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.lrint.v8i32.v8f64(<8 x double> [[TMP1]])
+; VEC512-NEXT:    store <8 x i32> [[TMP2]], ptr @r32, align 8
+; VEC512-NEXT:    ret void
+;
+  %a0 = load double, ptr @f64, align 8
+  %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
+  %a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
+  %a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 3), align 8
+  %a4 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+  %a5 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 5), align 8
+  %a6 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
+  %a7 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 7), align 8
+  %r0 = call i32 @llvm.lrint.i32.f64(double %a0)
+  %r1 = call i32 @llvm.lrint.i32.f64(double %a1)
+  %r2 = call i32 @llvm.lrint.i32.f64(double %a2)
+  %r3 = call i32 @llvm.lrint.i32.f64(double %a3)
+  %r4 = call i32 @llvm.lrint.i32.f64(double %a4)
+  %r5 = call i32 @llvm.lrint.i32.f64(double %a5)
+  %r6 = call i32 @llvm.lrint.i32.f64(double %a6)
+  %r7 = call i32 @llvm.lrint.i32.f64(double %a7)
+  store i32 %r0, ptr @r32, align 8
+  store i32 %r1, ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 1), align 8
+  store i32 %r2, ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 2), align 8
+  store i32 %r3, ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 3), align 8
+  store i32 %r4, ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 4), align 8
+  store i32 %r5, ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 5), align 8
+  store i32 %r6, ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 6), align 8
+  store i32 %r7, ptr getelementptr inbounds ([8 x i32], ptr @r32, i32 0, i32 7), align 8
+  ret void
+}
+
+define void @llrint_v8f32_v8i64() {
+; SCALAR-LABEL: @llrint_v8f32_v8i64(
+; SCALAR-NEXT:    [[A0:%.*]] = load float, ptr @f32, align 8
+; SCALAR-NEXT:    [[A1:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
+; SCALAR-NEXT:    [[A2:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 2), align 8
+; SCALAR-NEXT:    [[A3:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 3), align 8
+; SCALAR-NEXT:    [[A4:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; SCALAR-NEXT:    [[A5:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 5), align 8
+; SCALAR-NEXT:    [[A6:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 6), align 8
+; SCALAR-NEXT:    [[A7:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 7), align 8
+; SCALAR-NEXT:    [[R0:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A0]])
+; SCALAR-NEXT:    [[R1:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A1]])
+; SCALAR-NEXT:    [[R2:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A2]])
+; SCALAR-NEXT:    [[R3:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A3]])
+; SCALAR-NEXT:    [[R4:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A4]])
+; SCALAR-NEXT:    [[R5:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A5]])
+; SCALAR-NEXT:    [[R6:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A6]])
+; SCALAR-NEXT:    [[R7:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A7]])
+; SCALAR-NEXT:    store i64 [[R0]], ptr @r64, align 8
+; SCALAR-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
+; SCALAR-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
+; SCALAR-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
+; SCALAR-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; SCALAR-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
+; SCALAR-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
+; SCALAR-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
+; SCALAR-NEXT:    ret void
+;
+; VEC128-LABEL: @llrint_v8f32_v8i64(
+; VEC128-NEXT:    [[A0:%.*]] = load float, ptr @f32, align 8
+; VEC128-NEXT:    [[A1:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
+; VEC128-NEXT:    [[A2:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 2), align 8
+; VEC128-NEXT:    [[A3:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 3), align 8
+; VEC128-NEXT:    [[A4:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; VEC128-NEXT:    [[A5:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 5), align 8
+; VEC128-NEXT:    [[A6:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 6), align 8
+; VEC128-NEXT:    [[A7:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 7), align 8
+; VEC128-NEXT:    [[R0:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A0]])
+; VEC128-NEXT:    [[R1:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A1]])
+; VEC128-NEXT:    [[R2:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A2]])
+; VEC128-NEXT:    [[R3:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A3]])
+; VEC128-NEXT:    [[R4:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A4]])
+; VEC128-NEXT:    [[R5:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A5]])
+; VEC128-NEXT:    [[R6:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A6]])
+; VEC128-NEXT:    [[R7:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A7]])
+; VEC128-NEXT:    store i64 [[R0]], ptr @r64, align 8
+; VEC128-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
+; VEC128-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
+; VEC128-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
+; VEC128-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; VEC128-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
+; VEC128-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
+; VEC128-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
+; VEC128-NEXT:    ret void
+;
+; VEC256-AVX2-LABEL: @llrint_v8f32_v8i64(
+; VEC256-AVX2-NEXT:    [[A0:%.*]] = load float, ptr @f32, align 8
+; VEC256-AVX2-NEXT:    [[A1:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
+; VEC256-AVX2-NEXT:    [[A2:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 2), align 8
+; VEC256-AVX2-NEXT:    [[A3:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 3), align 8
+; VEC256-AVX2-NEXT:    [[A4:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; VEC256-AVX2-NEXT:    [[A5:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 5), align 8
+; VEC256-AVX2-NEXT:    [[A6:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 6), align 8
+; VEC256-AVX2-NEXT:    [[A7:%.*]] = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 7), align 8
+; VEC256-AVX2-NEXT:    [[R0:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A0]])
+; VEC256-AVX2-NEXT:    [[R1:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A1]])
+; VEC256-AVX2-NEXT:    [[R2:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A2]])
+; VEC256-AVX2-NEXT:    [[R3:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A3]])
+; VEC256-AVX2-NEXT:    [[R4:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A4]])
+; VEC256-AVX2-NEXT:    [[R5:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A5]])
+; VEC256-AVX2-NEXT:    [[R6:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A6]])
+; VEC256-AVX2-NEXT:    [[R7:%.*]] = call i64 @llvm.llrint.i64.f32(float [[A7]])
+; VEC256-AVX2-NEXT:    store i64 [[R0]], ptr @r64, align 8
+; VEC256-AVX2-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
+; VEC256-AVX2-NEXT:    ret void
+;
+; VEC256-AVX512-LABEL: @llrint_v8f32_v8i64(
+; VEC256-AVX512-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @f32, align 8
+; VEC256-AVX512-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> [[TMP1]])
+; VEC256-AVX512-NEXT:    store <4 x i64> [[TMP2]], ptr @r64, align 8
+; VEC256-AVX512-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+; VEC256-AVX512-NEXT:    [[TMP4:%.*]] = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> [[TMP3]])
+; VEC256-AVX512-NEXT:    store <4 x i64> [[TMP4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; VEC256-AVX512-NEXT:    ret void
+;
+; VEC512-LABEL: @llrint_v8f32_v8i64(
+; VEC512-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @f32, align 8
+; VEC512-NEXT:    [[TMP2:%.*]] = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> [[TMP1]])
+; VEC512-NEXT:    store <8 x i64> [[TMP2]], ptr @r64, align 8
+; VEC512-NEXT:    ret void
+;
+  %a0 = load float, ptr @f32, align 8
+  %a1 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 1), align 8
+  %a2 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 2), align 8
+  %a3 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 3), align 8
+  %a4 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 4), align 8
+  %a5 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 5), align 8
+  %a6 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 6), align 8
+  %a7 = load float, ptr getelementptr inbounds ([8 x float], ptr @f32, i32 0, i32 7), align 8
+  %r0 = call i64 @llvm.llrint.i64.f32(float %a0)
+  %r1 = call i64 @llvm.llrint.i64.f32(float %a1)
+  %r2 = call i64 @llvm.llrint.i64.f32(float %a2)
+  %r3 = call i64 @llvm.llrint.i64.f32(float %a3)
+  %r4 = call i64 @llvm.llrint.i64.f32(float %a4)
+  %r5 = call i64 @llvm.llrint.i64.f32(float %a5)
+  %r6 = call i64 @llvm.llrint.i64.f32(float %a6)
+  %r7 = call i64 @llvm.llrint.i64.f32(float %a7)
+  store i64 %r0, ptr @r64, align 8
+  store i64 %r1, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
+  store i64 %r2, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
+  store i64 %r3, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
+  store i64 %r4, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+  store i64 %r5, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
+  store i64 %r6, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
+  store i64 %r7, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
+  ret void
+}
+
+define void @llrint_v8f64_v8i64() {
+; SCALAR-LABEL: @llrint_v8f64_v8i64(
+; SCALAR-NEXT:    [[A0:%.*]] = load double, ptr @f64, align 8
+; SCALAR-NEXT:    [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
+; SCALAR-NEXT:    [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
+; SCALAR-NEXT:    [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 3), align 8
+; SCALAR-NEXT:    [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; SCALAR-NEXT:    [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 5), align 8
+; SCALAR-NEXT:    [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
+; SCALAR-NEXT:    [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 7), align 8
+; SCALAR-NEXT:    [[R0:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A0]])
+; SCALAR-NEXT:    [[R1:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A1]])
+; SCALAR-NEXT:    [[R2:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A2]])
+; SCALAR-NEXT:    [[R3:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A3]])
+; SCALAR-NEXT:    [[R4:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A4]])
+; SCALAR-NEXT:    [[R5:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A5]])
+; SCALAR-NEXT:    [[R6:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A6]])
+; SCALAR-NEXT:    [[R7:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A7]])
+; SCALAR-NEXT:    store i64 [[R0]], ptr @r64, align 8
+; SCALAR-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
+; SCALAR-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
+; SCALAR-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
+; SCALAR-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; SCALAR-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
+; SCALAR-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
+; SCALAR-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
+; SCALAR-NEXT:    ret void
+;
+; VEC128-LABEL: @llrint_v8f64_v8i64(
+; VEC128-NEXT:    [[A0:%.*]] = load double, ptr @f64, align 8
+; VEC128-NEXT:    [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
+; VEC128-NEXT:    [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
+; VEC128-NEXT:    [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 3), align 8
+; VEC128-NEXT:    [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; VEC128-NEXT:    [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 5), align 8
+; VEC128-NEXT:    [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
+; VEC128-NEXT:    [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 7), align 8
+; VEC128-NEXT:    [[R0:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A0]])
+; VEC128-NEXT:    [[R1:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A1]])
+; VEC128-NEXT:    [[R2:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A2]])
+; VEC128-NEXT:    [[R3:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A3]])
+; VEC128-NEXT:    [[R4:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A4]])
+; VEC128-NEXT:    [[R5:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A5]])
+; VEC128-NEXT:    [[R6:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A6]])
+; VEC128-NEXT:    [[R7:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A7]])
+; VEC128-NEXT:    store i64 [[R0]], ptr @r64, align 8
+; VEC128-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
+; VEC128-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
+; VEC128-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
+; VEC128-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; VEC128-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
+; VEC128-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
+; VEC128-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
+; VEC128-NEXT:    ret void
+;
+; VEC256-AVX2-LABEL: @llrint_v8f64_v8i64(
+; VEC256-AVX2-NEXT:    [[A0:%.*]] = load double, ptr @f64, align 8
+; VEC256-AVX2-NEXT:    [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
+; VEC256-AVX2-NEXT:    [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
+; VEC256-AVX2-NEXT:    [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 3), align 8
+; VEC256-AVX2-NEXT:    [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; VEC256-AVX2-NEXT:    [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 5), align 8
+; VEC256-AVX2-NEXT:    [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
+; VEC256-AVX2-NEXT:    [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 7), align 8
+; VEC256-AVX2-NEXT:    [[R0:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A0]])
+; VEC256-AVX2-NEXT:    [[R1:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A1]])
+; VEC256-AVX2-NEXT:    [[R2:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A2]])
+; VEC256-AVX2-NEXT:    [[R3:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A3]])
+; VEC256-AVX2-NEXT:    [[R4:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A4]])
+; VEC256-AVX2-NEXT:    [[R5:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A5]])
+; VEC256-AVX2-NEXT:    [[R6:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A6]])
+; VEC256-AVX2-NEXT:    [[R7:%.*]] = call i64 @llvm.llrint.i64.f64(double [[A7]])
+; VEC256-AVX2-NEXT:    store i64 [[R0]], ptr @r64, align 8
+; VEC256-AVX2-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
+; VEC256-AVX2-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
+; VEC256-AVX2-NEXT:    ret void
+;
+; VEC256-AVX512-LABEL: @llrint_v8f64_v8i64(
+; VEC256-AVX512-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @f64, align 8
+; VEC256-AVX512-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> [[TMP1]])
+; VEC256-AVX512-NEXT:    store <4 x i64> [[TMP2]], ptr @r64, align 8
+; VEC256-AVX512-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+; VEC256-AVX512-NEXT:    [[TMP4:%.*]] = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> [[TMP3]])
+; VEC256-AVX512-NEXT:    store <4 x i64> [[TMP4]], ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+; VEC256-AVX512-NEXT:    ret void
+;
+; VEC512-LABEL: @llrint_v8f64_v8i64(
+; VEC512-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @f64, align 8
+; VEC512-NEXT:    [[TMP2:%.*]] = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> [[TMP1]])
+; VEC512-NEXT:    store <8 x i64> [[TMP2]], ptr @r64, align 8
+; VEC512-NEXT:    ret void
+;
+  %a0 = load double, ptr @f64, align 8
+  %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 1), align 8
+  %a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 2), align 8
+  %a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 3), align 8
+  %a4 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 4), align 8
+  %a5 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 5), align 8
+  %a6 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 6), align 8
+  %a7 = load double, ptr getelementptr inbounds ([8 x double], ptr @f64, i32 0, i32 7), align 8
+  %r0 = call i64 @llvm.llrint.i64.f64(double %a0)
+  %r1 = call i64 @llvm.llrint.i64.f64(double %a1)
+  %r2 = call i64 @llvm.llrint.i64.f64(double %a2)
+  %r3 = call i64 @llvm.llrint.i64.f64(double %a3)
+  %r4 = call i64 @llvm.llrint.i64.f64(double %a4)
+  %r5 = call i64 @llvm.llrint.i64.f64(double %a5)
+  %r6 = call i64 @llvm.llrint.i64.f64(double %a6)
+  %r7 = call i64 @llvm.llrint.i64.f64(double %a7)
+  store i64 %r0, ptr @r64, align 8
+  store i64 %r1, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 1), align 8
+  store i64 %r2, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 2), align 8
+  store i64 %r3, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 3), align 8
+  store i64 %r4, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 4), align 8
+  store i64 %r5, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 5), align 8
+  store i64 %r6, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 6), align 8
+  store i64 %r7, ptr getelementptr inbounds ([8 x i64], ptr @r64, i32 0, i32 7), align 8
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extended-vectorized-gathered-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/extended-vectorized-gathered-inst.ll
new file mode 100644
index 000000000000..2d028060f491
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extended-vectorized-gathered-inst.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s | FileCheck %s
+
+define void @test(ptr %top) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[TOP:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[TOP]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i8> <i8 0, i8 0, i8 0, i8 poison>, i8 [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i8> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = lshr <4 x i8> [[TMP7]], <i8 2, i8 2, i8 2, i8 2>
+; CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+; CHECK:       for.cond.i:
+; CHECK-NEXT:    store <4 x i8> [[TMP8]], ptr null, align 1
+; CHECK-NEXT:    br label [[FOR_COND_I]]
+;
+entry:
+  %0 = load i8, ptr %top, align 1
+  %conv2.i = zext i8 %0 to i32
+  %mul.i = mul i32 %conv2.i, 0
+  %add.i = or i32 %mul.i, 0
+  %arrayidx3.i = getelementptr i8, ptr %top, i64 1
+  %1 = load i8, ptr %arrayidx3.i, align 1
+  %conv4.i = zext i8 %1 to i32
+  %add5.i = or i32 %add.i, 0
+  %shr.i = lshr i32 %add5.i, 2
+  %conv7.i = trunc i32 %shr.i to i8
+  %mul12.i = mul i32 %conv4.i, 0
+  %arrayidx14.i = getelementptr i8, ptr %top, i64 2
+  %2 = load i8, ptr %arrayidx14.i, align 1
+  %conv15.i = zext i8 %2 to i32
+  %add16.i = or i32 %mul12.i, 0
+  %add17.i = or i32 %add16.i, 0
+  %shr18.i = lshr i32 %add17.i, 2
+  %conv19.i = trunc i32 %shr18.i to i8
+  %mul25.i = mul i32 %conv15.i, 0
+  %arrayidx27.i = getelementptr i8, ptr %top, i64 3
+  %3 = load i8, ptr %arrayidx27.i, align 1
+  %conv28.i = zext i8 %3 to i32
+  %add29.i = or i32 %mul25.i, 0
+  %add30.i = or i32 %add29.i, 0
+  %shr31.i = lshr i32 %add30.i, 2
+  %conv32.i = trunc i32 %shr31.i to i8
+  %mul38.i = mul i32 %conv28.i, 0
+  %add39.i = or i32 %mul38.i, %conv15.i
+  %add42.i = or i32 %add39.i, 0
+  %shr44.i = lshr i32 %add42.i, 2
+  %conv45.i = trunc i32 %shr44.i to i8
+  br label %for.cond.i
+
+for.cond.i:
+  store i8 %conv7.i, ptr null, align 1
+  %vals.sroa.5.0.add.ptr.sroa_idx.i = getelementptr i8, ptr null, i64 1
+  store i8 %conv19.i, ptr %vals.sroa.5.0.add.ptr.sroa_idx.i, align 1
+  %vals.sroa.7.0.add.ptr.sroa_idx.i = getelementptr i8, ptr null, i64 2
+  store i8 %conv32.i, ptr %vals.sroa.7.0.add.ptr.sroa_idx.i, align 1
+  %vals.sroa.9.0.add.ptr.sroa_idx.i = getelementptr i8, ptr null, i64 3
+  store i8 %conv45.i, ptr %vals.sroa.9.0.add.ptr.sroa_idx.i, align 1
+  br label %for.cond.i
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/ext-after-phi-node.ll b/llvm/test/Transforms/SLPVectorizer/ext-after-phi-node.ll
new file mode 100644
index 000000000000..0abfdc91e729
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/ext-after-phi-node.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+;RUN: opt -S -S --passes=slp-vectorizer -slp-threshold=-99999 < %s | FileCheck %s
+
+define double @test() {
+; CHECK-LABEL: define double @test() {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[BB7:%.*]]
+; CHECK:       bb7:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ poison, [[BB9:%.*]] ], [ zeroinitializer, [[BB:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB9]] ], [ zeroinitializer, [[BB]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i32> [[TMP0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[TMP6]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> zeroinitializer, <2 x i64> [[TMP2]], <2 x i32> <i32 3, i32 1>
+; CHECK-NEXT:    ret double 0.000000e+00
+; CHECK:       bb9:
+; CHECK-NEXT:    br label [[BB7]]
+;
+bb:
+  br label %bb7
+
+bb7:
+  %0 = phi <2 x i32> [ poison, %bb9 ], [ zeroinitializer, %bb ]
+  %1 = phi <2 x i32> [ zeroinitializer, %bb9 ], [ zeroinitializer, %bb ]
+  %2 = extractelement <2 x i32> %0, i32 1
+  %3 = extractelement <2 x i32> %0, i32 0
+  %zext8 = zext i32 %3 to i64
+  %zext = zext i32 %2 to i64
+  %icmp = icmp ult i64 %zext, %zext8
+  %4 = insertelement <2 x i64> zeroinitializer, i64 %zext, i32 0
+  ret double 0.000000e+00
+
+bb9:
+  br label %bb7
+}
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof b/llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof
index 8e988515be8e..418f2c4af264 100644
--- a/llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof
+++ b/llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof
@@ -10,12 +10,12 @@ main:9229397:0
  7: 0
  2: foo:1479916
   1: 47663
-  1.1: 46683 bar:43238
+  1.15: 46683 bar:43238
   2: 4519 bar:4932
   3: 48723
  4: foo:1505537
   1: 48604
-  1.1: 46965 bar:44479
+  1.15: 46965 bar:44479
   2: 4613 bar:4967
   3: 49087
 bar:2333388:196222
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/overflow.proftext b/llvm/test/Transforms/SampleProfile/Inputs/overflow.proftext
new file mode 100644
index 000000000000..753294a49e99
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/overflow.proftext
@@ -0,0 +1,2 @@
+_Z3testi:29600000000:29600000000
+ 5: 29600000000
diff --git a/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll b/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll
index eb69c18add01..5394a00ced86 100644
--- a/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll
+++ b/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: x86_64-linux
 ; REQUIRES: asserts
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/non-probe-stale-profile-matching.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/non-probe-stale-profile-matching.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl -profile-isfs 2>&1 | FileCheck %s
 
 ; The profiled source code:
 
@@ -51,7 +51,7 @@
 ; CHECK: Run stale profile matching for bar
 
 ; CHECK: Run stale profile matching for foo
-; CHECK: Callsite with callee:bar is matched from 1.1 to 1.1
+; CHECK: Callsite with callee:bar is matched from 1.15 to 1.15
 ; CHECK: Callsite with callee:bar is matched from 2 to 2
 
 ; CHECK: Run stale profile matching for main
@@ -183,7 +183,7 @@ attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memo
 !15 = !DILocation(line: 7, column: 9, scope: !14)
 !16 = !DILocation(line: 7, column: 7, scope: !14)
 !17 = !DILocation(line: 7, column: 23, scope: !18)
-!18 = !DILexicalBlockFile(scope: !14, file: !10, discriminator: 2)
+!18 = !DILexicalBlockFile(scope: !14, file: !10, discriminator: 15)
 !19 = !DILocation(line: 7, column: 15, scope: !18)
 !20 = !DILocation(line: 8, column: 21, scope: !14)
 !21 = !DILocation(line: 8, column: 15, scope: !14)
@@ -201,7 +201,7 @@ attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memo
 !33 = !DILocation(line: 14, column: 8, scope: !25)
 !34 = !DILocation(line: 14, scope: !25)
 !35 = !DILocation(line: 14, column: 21, scope: !36)
-!36 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 2)
+!36 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 15)
 !37 = !DILocation(line: 14, column: 3, scope: !36)
 !38 = !DILocation(line: 14, column: 3, scope: !39)
 !39 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 4)
diff --git a/llvm/test/Transforms/SampleProfile/overflow.ll b/llvm/test/Transforms/SampleProfile/overflow.ll
new file mode 100644
index 000000000000..06be3ce50023
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/overflow.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+
+; Checks that we are able to handle overflowing counters correctly.
+
+; RUN: opt < %s -passes='sample-profile,print<branch-prob>' -sample-profile-file=%S/Inputs/overflow.proftext -disable-output 2>&1 | FileCheck %s
+
+; Original Source:
+; int sqrt(int);
+; int test(int i) {
+;    if (i == 5) {
+;        return 42;
+;    }
+;    else {
+;        return sqrt(i);
+;    }
+;}
+
+define dso_local noundef i32 @_Z3testi(i32 noundef %i) local_unnamed_addr #0 !dbg !10 {
+; CHECK-LABEL: '_Z3testi'
+; CHECK-NEXT:  ---- Branch Probabilities ----
+; CHECK-NEXT:    edge %entry -> %return probability is 0x00000000 / 0x80000000 = 0.00%
+; CHECK-NEXT:    edge %entry -> %if.else probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+; CHECK-NEXT:    edge %if.else -> %return probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+;
+entry:
+  tail call void @llvm.dbg.value(metadata i32 %i, metadata !16, metadata !DIExpression()), !dbg !17
+  %cmp = icmp eq i32 %i, 5, !dbg !18
+  br i1 %cmp, label %return, label %if.else, !dbg !20
+
+if.else:                                          ; preds = %entry
+  %call = tail call noundef i32 @_Z4sqrti(i32 noundef %i), !dbg !21
+  br label %return, !dbg !23
+
+return:                                           ; preds = %entry, %if.else
+  %retval.0 = phi i32 [ %call, %if.else ], [ 42, %entry ], !dbg !24
+  ret i32 %retval.0, !dbg !25
+}
+
+declare !dbg !26 noundef i32 @_Z4sqrti(i32 noundef)
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+attributes #0 = { "use-sample-profile" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.cpp", directory: "/", checksumkind: CSK_MD5, checksum: "cb38d90153a7ebdd6ecf3058eb0524c7")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+!9 = !{!"clang"}
+!10 = distinct !DISubprogram(name: "test", linkageName: "_Z3loli", scope: !11, file: !11, line: 3, type: !12, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !15)
+!11 = !DIFile(filename: "./test.cpp", directory: "/", checksumkind: CSK_MD5, checksum: "cb38d90153a7ebdd6ecf3058eb0524c7")
+!12 = !DISubroutineType(types: !13)
+!13 = !{!14, !14}
+!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!15 = !{!16}
+!16 = !DILocalVariable(name: "i", arg: 1, scope: !10, file: !11, line: 3, type: !14)
+!17 = !DILocation(line: 0, scope: !10)
+!18 = !DILocation(line: 4, column: 11, scope: !19)
+!19 = distinct !DILexicalBlock(scope: !10, file: !11, line: 4, column: 9)
+!20 = !DILocation(line: 4, column: 9, scope: !10)
+!21 = !DILocation(line: 8, column: 16, scope: !22)
+!22 = distinct !DILexicalBlock(scope: !19, file: !11, line: 7, column: 10)
+!23 = !DILocation(line: 8, column: 9, scope: !22)
+!24 = !DILocation(line: 0, scope: !19)
+!25 = !DILocation(line: 10, column: 1, scope: !10)
+!26 = !DISubprogram(name: "sqrt", linkageName: "_Z4sqrti", scope: !11, file: !11, line: 1, type: !12, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
index c24bbd5f658f..16e47f057bab 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
@@ -1,28 +1,27 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -S -passes=separate-const-offset-from-gep < %s | FileCheck %s
 
-define void @inboundsPossiblyNegative(ptr %in.ptr, i32 %in.idx1) {
+define void @inboundsPossiblyNegative(ptr %in.ptr, i64 %in.idx1) {
 ; CHECK-LABEL: define void @inboundsPossiblyNegative(
-; CHECK-SAME: ptr [[IN_PTR:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IN_IDX1]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <2 x i8>, ptr [[IN_PTR]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <2 x i8>, ptr [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <2 x i8>, ptr [[IN_PTR]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <2 x i8>, ptr [[TMP0]], i64 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %const1 = getelementptr inbounds <2 x i8>, ptr %in.ptr, i32 1
-  %idx1 = getelementptr inbounds <2 x i8>, ptr %const1, i32 %in.idx1
+  %const1 = getelementptr inbounds <2 x i8>, ptr %in.ptr, i64 1
+  %idx1 = getelementptr inbounds <2 x i8>, ptr %const1, i64 %in.idx1
   ret void
 }
 
-define void @inboundsNonNegative(ptr %in.ptr, i32 %in.idx1) {
-; CHECK-LABEL: define void @inboundsNonNegative(
+define void @inboundsNonNegative_nonCanonical(ptr %in.ptr, i32 %in.idx1) {
+; CHECK-LABEL: define void @inboundsNonNegative_nonCanonical(
 ; CHECK-SAME: ptr [[IN_PTR:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i32 [[IN_IDX1]], 2147483647
-; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IN_IDX1_NNEG]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <2 x i8>, ptr [[IN_PTR]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[IN_IDX1_NNEG1:%.*]] = and i32 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = sext i32 [[IN_IDX1_NNEG1]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <2 x i8>, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i8>, ptr [[TMP0]], i32 1
 ; CHECK-NEXT:    ret void
 ;
@@ -33,19 +32,277 @@ entry:
   ret void
 }
 
-define void @inboundsNonchained(ptr %in.ptr, i32 %in.idx1) {
+define void @inboundsNonNegative(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @inboundsNonNegative(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <2 x i8>, ptr [[IN_PTR]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i8>, ptr [[TMP0]], i64 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds <2 x i8>, ptr %in.ptr, i64 1
+  %idx1 = getelementptr inbounds <2 x i8>, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @inboundsNonchained(ptr %in.ptr, i64 %in.idx1) {
 ; CHECK-LABEL: define void @inboundsNonchained(
-; CHECK-SAME: ptr [[IN_PTR:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i32 [[IN_IDX1]], 2147483647
-; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IN_IDX1_NNEG]] to i64
+; CHECK-NEXT:    [[IDXPROM:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <2 x i8>, ptr [[IN_PTR]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <2 x i8>, ptr [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <2 x i8>, ptr [[TMP0]], i64 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %in.idx1.nneg = and i32 %in.idx1, 2147483647
-  %const1 = getelementptr inbounds <2 x i8>, ptr %in.ptr, i32 1
-  %idx1 = getelementptr <2 x i8>, ptr %const1, i32 %in.idx1.nneg
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds <2 x i8>, ptr %in.ptr, i64 1
+  %idx1 = getelementptr <2 x i8>, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @inboundsNonNegativeType_i16i8(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @inboundsNonNegativeType_i16i8(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i64 1024
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds i16, ptr %in.ptr, i64 1024
+  %idx1 = getelementptr inbounds i8, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @inboundsNonNegative_i8i16(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @inboundsNonNegative_i8i16(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[IN_PTR]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 1024
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds i8, ptr %in.ptr, i64 1024
+  %idx1 = getelementptr inbounds i16, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @inboundsNonchained_first(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @inboundsNonchained_first(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1024
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds i8, ptr %in.ptr, i64 1024
+  %idx1 = getelementptr i32, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @inboundsNonchained_second(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @inboundsNonchained_second(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i64, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1024
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr i8, ptr %in.ptr, i64 1024
+  %idx1 = getelementptr inbounds i64, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @notInbounds(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @notInbounds(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i128, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1024
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr i8, ptr %in.ptr, i64 1024
+  %idx1 = getelementptr i128, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @vectorType1(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @vectorType1(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <2 x i8>, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i8>, ptr [[TMP0]], i32 3
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds <4 x i8>, ptr %in.ptr, i32 3
+  %idx1 = getelementptr inbounds <2 x i8>, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @vectorType2(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @vectorType2(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <4 x half>, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i8>, ptr [[TMP0]], i32 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds <4 x i8>, ptr %in.ptr, i32 1
+  %idx1 = getelementptr inbounds <4 x half>, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @vectorType3(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @vectorType3(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x ptr>, ptr [[TMP0]], i32 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds <4 x ptr>, ptr %in.ptr, i32 1
+  %idx1 = getelementptr inbounds ptr, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @vectorType4(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @vectorType4(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <8 x ptr addrspace(1)>, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x ptr>, ptr [[TMP0]], i32 3
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds <4 x ptr>, ptr %in.ptr, i32 3
+  %idx1 = getelementptr inbounds <8 x ptr addrspace(1)>, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+
+define void @ptrType(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @ptrType(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds ptr addrspace(2), ptr [[TMP0]], i32 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds ptr addrspace(2), ptr %in.ptr, i32 1
+  %idx1 = getelementptr inbounds ptr, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @ptrType2(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @ptrType2(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds ptr addrspace(3), ptr [[TMP0]], i32 3
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds ptr addrspace(3), ptr %in.ptr, i32 3
+  %idx1 = getelementptr inbounds i64, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @ptrType3(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @ptrType3(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds ptr addrspace(7), ptr [[TMP0]], i32 3
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds ptr addrspace(7), ptr %in.ptr, i32 3
+  %idx1 = getelementptr inbounds i16, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @addrspace1(ptr addrspace(1) %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @addrspace1(
+; CHECK-SAME: ptr addrspace(1) [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i128, ptr addrspace(1) [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 1024
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds i8, ptr addrspace(1) %in.ptr, i64 1024
+  %idx1 = getelementptr inbounds i128, ptr addrspace(1) %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @addrspace3(ptr addrspace(3) %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @addrspace3(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i64 [[IN_IDX1_NNEG]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i128, ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i64 1024
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds i8, ptr addrspace(3) %in.ptr, i64 1024
+  %idx1 = getelementptr inbounds i128, ptr addrspace(3) %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @addrspace7(ptr addrspace(7) %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @addrspace7(
+; CHECK-SAME: ptr addrspace(7) [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i64 [[IN_IDX1_NNEG]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i128, ptr addrspace(7) [[IN_PTR]], i32 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(7) [[TMP0]], i64 1024
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds i8, ptr addrspace(7) %in.ptr, i64 1024
+  %idx1 = getelementptr inbounds i128, ptr addrspace(7) %const1, i64 %in.idx1.nneg
   ret void
 }
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll
index 7137f0fb66fd..b4119f0b50b4 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll
@@ -1,175 +1,286 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --start-before=separate-const-offset-from-gep < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --passes=separate-const-offset-from-gep < %s | FileCheck %s
 
-define protected amdgpu_kernel void @sink_addr(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
-; CHECK-LABEL: sink_addr:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_lshl_b32 s3, s1, 1
-; CHECK-NEXT:    s_add_i32 s0, s0, s3
-; CHECK-NEXT:    s_lshl_b32 s2, s2, 1
-; CHECK-NEXT:    s_add_i32 s0, s0, s2
-; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB0_2
-; CHECK-NEXT:  ; %bb.1: ; %bb.1
-; CHECK-NEXT:    v_mov_b32_e32 v12, s0
-; CHECK-NEXT:    ds_read_b128 v[0:3], v12
-; CHECK-NEXT:    ds_read_b128 v[4:7], v12 offset:512
-; CHECK-NEXT:    ds_read_b128 v[8:11], v12 offset:1024
-; CHECK-NEXT:    ds_read_b128 v[12:15], v12 offset:1536
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[4:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[8:11]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[12:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:  .LBB0_2: ; %end
-; CHECK-NEXT:    s_add_i32 s1, s0, 0x200
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    s_add_i32 s2, s0, 0x400
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s1
-; CHECK-NEXT:    s_add_i32 s3, s0, 0x600
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s3
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_endpgm
+define void @sink_addr(ptr addrspace(3) %in.ptr, i64 %in.idx0, i64 %in.idx1) {
+; CHECK-LABEL: define void @sink_addr(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i64 [[IN_IDX0]] to i32
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr half, ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM]]
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM1]]
+; CHECK-NEXT:    [[IDXPROM2:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr half, ptr addrspace(3) [[TMP0]], i64 256
+; CHECK-NEXT:    [[IDXPROM3:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr half, ptr addrspace(3) [[TMP2]], i64 512
+; CHECK-NEXT:    [[IDXPROM4:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP4]], i64 768
+; CHECK-NEXT:    ret void
+;
 entry:
-  %base = getelementptr half, ptr addrspace(3) %in.ptr, i32 %in.idx0
-  %idx0 = getelementptr half, ptr addrspace(3) %base, i32 %in.idx1
-  %const1 = getelementptr half, ptr addrspace(3) %base, i32 256
-  %idx1 = getelementptr half, ptr addrspace(3) %const1, i32 %in.idx1
-  %const2 = getelementptr half, ptr addrspace(3) %base, i32 512
-  %idx2 = getelementptr half, ptr addrspace(3) %const2, i32 %in.idx1
-  %const3 = getelementptr half, ptr addrspace(3) %base, i32 768
-  %idx3 = getelementptr half, ptr addrspace(3) %const3, i32 %in.idx1
-  %cmp0 = icmp eq i32 %in.idx0, 0
-  br i1 %cmp0, label %bb.1, label %end
+  %base = getelementptr half, ptr addrspace(3) %in.ptr, i64 %in.idx0
+  %idx0 = getelementptr half, ptr addrspace(3) %base, i64 %in.idx1
+  %const1 = getelementptr half, ptr addrspace(3) %base, i64 256
+  %idx1 = getelementptr half, ptr addrspace(3) %const1, i64 %in.idx1
+  %const2 = getelementptr half, ptr addrspace(3) %base, i64 512
+  %idx2 = getelementptr half, ptr addrspace(3) %const2, i64 %in.idx1
+  %const3 = getelementptr half, ptr addrspace(3) %base, i64 768
+  %idx3 = getelementptr half, ptr addrspace(3) %const3, i64 %in.idx1
+  ret void
+}
+
+define void @illegal_addr_mode(ptr addrspace(3) %in.ptr, i64 %in.idx0, i64 %in.idx1) {
+; CHECK-LABEL: define void @illegal_addr_mode(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i64 [[IN_IDX0]] to i32
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr half, ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM]]
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM1]]
+; CHECK-NEXT:    [[CONST1:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i64 38192
+; CHECK-NEXT:    [[IDXPROM2:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr half, ptr addrspace(3) [[CONST1]], i32 [[IDXPROM2]]
+; CHECK-NEXT:    [[CONST2:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i64 38448
+; CHECK-NEXT:    [[IDXPROM3:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr half, ptr addrspace(3) [[CONST2]], i32 [[IDXPROM3]]
+; CHECK-NEXT:    [[CONST3:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i64 38764
+; CHECK-NEXT:    [[IDXPROM4:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr half, ptr addrspace(3) [[CONST3]], i32 [[IDXPROM4]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %base = getelementptr half, ptr addrspace(3) %in.ptr, i64 %in.idx0
+  %idx0 = getelementptr half, ptr addrspace(3) %base, i64 %in.idx1
+  %const1 = getelementptr half, ptr addrspace(3) %base, i64 38192
+  %idx1 = getelementptr half, ptr addrspace(3) %const1, i64 %in.idx1
+  %const2 = getelementptr half, ptr addrspace(3) %base, i64 38448
+  %idx2 = getelementptr half, ptr addrspace(3) %const2, i64 %in.idx1
+  %const3 = getelementptr half, ptr addrspace(3) %base, i64 38764
+  %idx3 = getelementptr half, ptr addrspace(3) %const3, i64 %in.idx1
+  ret void
+}
+
+
+define void @reorder_i8half(ptr addrspace(3) %in.ptr, i64 %in.idx0, i64 %in.idx1) {
+; CHECK-LABEL: define void @reorder_i8half(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i64 [[IN_IDX0]] to i32
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr i8, ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM]]
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM1]]
+; CHECK-NEXT:    [[IDXPROM2:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i64 256
+; CHECK-NEXT:    [[IDXPROM3:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 512
+; CHECK-NEXT:    [[IDXPROM4:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i64 768
+; CHECK-NEXT:    ret void
+;
+entry:
+  %base = getelementptr i8, ptr addrspace(3) %in.ptr, i64 %in.idx0
+  %idx0 = getelementptr half, ptr addrspace(3) %base, i64 %in.idx1
+  %const1 = getelementptr i8, ptr addrspace(3) %base, i64 256
+  %idx1 = getelementptr half, ptr addrspace(3) %const1, i64 %in.idx1
+  %const2 = getelementptr i8, ptr addrspace(3) %base, i64 512
+  %idx2 = getelementptr half, ptr addrspace(3) %const2, i64 %in.idx1
+  %const3 = getelementptr i8, ptr addrspace(3) %base, i64 768
+  %idx3 = getelementptr half, ptr addrspace(3) %const3, i64 %in.idx1
+  ret void
+}
+
+define void @reorder_i64half(ptr addrspace(3) %in.ptr, i64 %in.idx0, i64 %in.idx1) {
+; CHECK-LABEL: define void @reorder_i64half(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i64 [[IN_IDX0]] to i32
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr i64, ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM]]
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM1]]
+; CHECK-NEXT:    [[IDXPROM2:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr addrspace(3) [[TMP0]], i64 256
+; CHECK-NEXT:    [[IDXPROM3:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr addrspace(3) [[TMP2]], i64 512
+; CHECK-NEXT:    [[IDXPROM4:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr addrspace(3) [[TMP4]], i64 768
+; CHECK-NEXT:    ret void
+;
+entry:
+  %base = getelementptr i64, ptr addrspace(3) %in.ptr, i64 %in.idx0
+  %idx0 = getelementptr half, ptr addrspace(3) %base, i64 %in.idx1
+  %const1 = getelementptr i64, ptr addrspace(3) %base, i64 256
+  %idx1 = getelementptr half, ptr addrspace(3) %const1, i64 %in.idx1
+  %const2 = getelementptr i64, ptr addrspace(3) %base, i64 512
+  %idx2 = getelementptr half, ptr addrspace(3) %const2, i64 %in.idx1
+  %const3 = getelementptr i64, ptr addrspace(3) %base, i64 768
+  %idx3 = getelementptr half, ptr addrspace(3) %const3, i64 %in.idx1
+  ret void
+}
 
-bb.1:
-  %val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
-  %val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
-  %val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
-  %val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
-  br label %end
+define void @reorder_halfi8(ptr addrspace(3) %in.ptr, i64 %in.idx0, i64 %in.idx1) {
+; CHECK-LABEL: define void @reorder_halfi8(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i64 [[IN_IDX0]] to i32
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr half, ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM]]
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IDXPROM1]]
+; CHECK-NEXT:    [[IDXPROM2:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IDXPROM2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr half, ptr addrspace(3) [[TMP0]], i64 256
+; CHECK-NEXT:    [[IDXPROM3:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IDXPROM3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr half, ptr addrspace(3) [[TMP2]], i64 512
+; CHECK-NEXT:    [[IDXPROM4:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IDXPROM4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP4]], i64 768
+; CHECK-NEXT:    ret void
+;
+entry:
+  %base = getelementptr half, ptr addrspace(3) %in.ptr, i64 %in.idx0
+  %idx0 = getelementptr i8, ptr addrspace(3) %base, i64 %in.idx1
+  %const1 = getelementptr half, ptr addrspace(3) %base, i64 256
+  %idx1 = getelementptr i8, ptr addrspace(3) %const1, i64 %in.idx1
+  %const2 = getelementptr half, ptr addrspace(3) %base, i64 512
+  %idx2 = getelementptr i8, ptr addrspace(3) %const2, i64 %in.idx1
+  %const3 = getelementptr half, ptr addrspace(3) %base, i64 768
+  %idx3 = getelementptr i8, ptr addrspace(3) %const3, i64 %in.idx1
+  ret void
+}
 
-end:
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
+
+
+define void @bad_index(ptr addrspace(3) %in.ptr, i64 %in.idx0, i64 %in.idx1) {
+; CHECK-LABEL: define void @bad_index(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i64 [[IN_IDX0]] to i32
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr half, ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM]]
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM1]]
+; CHECK-NEXT:    [[IDXPROM2:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i64 1
+; CHECK-NEXT:    [[IDXPROM3:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 2
+; CHECK-NEXT:    [[IDXPROM4:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IDXPROM4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i64 3
+; CHECK-NEXT:    ret void
+;
+entry:
+  %base = getelementptr half, ptr addrspace(3) %in.ptr, i64 %in.idx0
+  %idx0 = getelementptr half, ptr addrspace(3) %base, i64 %in.idx1
+  %const1 = getelementptr i8, ptr addrspace(3) %base, i64 1
+  %idx1 = getelementptr half, ptr addrspace(3) %const1, i64 %in.idx1
+  %const2 = getelementptr i8, ptr addrspace(3) %base, i64 2
+  %idx2 = getelementptr half, ptr addrspace(3) %const2, i64 %in.idx1
+  %const3 = getelementptr i8, ptr addrspace(3) %base, i64 3
+  %idx3 = getelementptr half, ptr addrspace(3) %const3, i64 %in.idx1
   ret void
 }
 
-define protected amdgpu_kernel void @illegal_addr_mode(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
-; CHECK-LABEL: illegal_addr_mode:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_lshl_b32 s0, s5, 1
-; CHECK-NEXT:    s_lshl_b32 s1, s6, 1
-; CHECK-NEXT:    s_add_i32 s3, s4, s0
-; CHECK-NEXT:    s_add_i32 s3, s3, s1
-; CHECK-NEXT:    s_add_i32 s2, s3, 0x12a60
-; CHECK-NEXT:    s_add_i32 s1, s3, 0x12c60
-; CHECK-NEXT:    s_add_i32 s0, s3, 0x12ed8
-; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB1_2
-; CHECK-NEXT:  ; %bb.1: ; %bb.1
-; CHECK-NEXT:    v_mov_b32_e32 v0, s3
-; CHECK-NEXT:    v_mov_b32_e32 v4, s2
-; CHECK-NEXT:    v_mov_b32_e32 v8, s1
-; CHECK-NEXT:    v_mov_b32_e32 v12, s0
-; CHECK-NEXT:    ds_read_b128 v[0:3], v0
-; CHECK-NEXT:    ds_read_b128 v[4:7], v4
-; CHECK-NEXT:    ds_read_b128 v[8:11], v8
-; CHECK-NEXT:    ds_read_b128 v[12:15], v12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[4:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[8:11]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[12:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:  .LBB1_2: ; %end
-; CHECK-NEXT:    v_mov_b32_e32 v0, s3
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s1
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_endpgm
+
+%struct.vec = type { [8 x i8], [4 x half] }
+define void @vector_struct_type(ptr addrspace(3) %in.ptr, i64 %in.idx0, i64 %in.idx1) {
+; CHECK-LABEL: define void @vector_struct_type(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i64 [[IN_IDX0]] to i32
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr [1024 x %struct.vec], ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM]]
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IDXPROM1]]
+; CHECK-NEXT:    [[CONST1:%.*]] = getelementptr [1024 x %struct.vec], ptr addrspace(3) [[BASE]], i64 256
+; CHECK-NEXT:    [[IDXPROM2:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr i8, ptr addrspace(3) [[CONST1]], i32 [[IDXPROM2]]
+; CHECK-NEXT:    [[CONST2:%.*]] = getelementptr [1024 x %struct.vec], ptr addrspace(3) [[BASE]], i64 512
+; CHECK-NEXT:    [[IDXPROM3:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr i8, ptr addrspace(3) [[CONST2]], i32 [[IDXPROM3]]
+; CHECK-NEXT:    [[CONST3:%.*]] = getelementptr [1024 x %struct.vec], ptr addrspace(3) [[BASE]], i64 768
+; CHECK-NEXT:    [[IDXPROM4:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr i8, ptr addrspace(3) [[CONST3]], i32 [[IDXPROM4]]
+; CHECK-NEXT:    ret void
+;
 entry:
-  %base = getelementptr half, ptr addrspace(3) %in.ptr, i32 %in.idx0
-  %idx0 = getelementptr half, ptr addrspace(3) %base, i32 %in.idx1
-  %const1 = getelementptr half, ptr addrspace(3) %base, i32 38192
-  %idx1 = getelementptr half, ptr addrspace(3) %const1, i32 %in.idx1
-  %const2 = getelementptr half, ptr addrspace(3) %base, i32 38448
-  %idx2 = getelementptr half, ptr addrspace(3) %const2, i32 %in.idx1
-  %const3 = getelementptr half, ptr addrspace(3) %base, i32 38764
-  %idx3 = getelementptr half, ptr addrspace(3) %const3, i32 %in.idx1
-  %cmp0 = icmp eq i32 %in.idx0, 0
-  br i1 %cmp0, label %bb.1, label %end
+  %base = getelementptr [1024 x %struct.vec], ptr addrspace(3) %in.ptr, i64 %in.idx0
+  %idx0 = getelementptr i8, ptr addrspace(3) %base, i64 %in.idx1
+  %const1 = getelementptr [1024 x %struct.vec], ptr addrspace(3) %base, i64 256
+  %idx1 = getelementptr i8, ptr addrspace(3) %const1, i64 %in.idx1
+  %const2 = getelementptr [1024 x %struct.vec], ptr addrspace(3) %base, i64 512
+  %idx2 = getelementptr i8, ptr addrspace(3) %const2, i64 %in.idx1
+  %const3 = getelementptr [1024 x %struct.vec], ptr addrspace(3) %base, i64 768
+  %idx3 = getelementptr i8, ptr addrspace(3) %const3, i64 %in.idx1
+  ret void
+}
 
-bb.1:
-  %val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
-  %val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
-  %val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
-  %val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
-  br label %end
+define void @struct_type(ptr addrspace(3) %in.ptr, i64 %in.idx0, i64 %in.idx1) {
+; CHECK-LABEL: define void @struct_type(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i64 [[IN_IDX0]] to i32
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr [[STRUCT_VEC:%.*]], ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM]]
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IDXPROM1]]
+; CHECK-NEXT:    [[IDXPROM2:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IDXPROM2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_VEC]], ptr addrspace(3) [[TMP0]], i64 256
+; CHECK-NEXT:    [[IDXPROM3:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IDXPROM3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr [[STRUCT_VEC]], ptr addrspace(3) [[TMP2]], i64 512
+; CHECK-NEXT:    [[IDXPROM4:%.*]] = trunc i64 [[IN_IDX1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IDXPROM4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr [[STRUCT_VEC]], ptr addrspace(3) [[TMP4]], i64 768
+; CHECK-NEXT:    ret void
+;
+entry:
+  %base = getelementptr %struct.vec, ptr addrspace(3) %in.ptr, i64 %in.idx0
+  %idx0 = getelementptr i8, ptr addrspace(3) %base, i64 %in.idx1
+  %const1 = getelementptr %struct.vec, ptr addrspace(3) %base, i64 256
+  %idx1 = getelementptr i8, ptr addrspace(3) %const1, i64 %in.idx1
+  %const2 = getelementptr %struct.vec, ptr addrspace(3) %base, i64 512
+  %idx2 = getelementptr i8, ptr addrspace(3) %const2, i64 %in.idx1
+  %const3 = getelementptr %struct.vec, ptr addrspace(3) %base, i64 768
+  %idx3 = getelementptr i8, ptr addrspace(3) %const3, i64 %in.idx1
+  ret void
+}
 
-end:
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
+define void @struct_type_multiindex(ptr addrspace(3) %in.ptr, i64 %in.idx0, i32 %in.idx1, i64 %in.idx2) {
+; CHECK-LABEL: define void @struct_type_multiindex(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i32 [[IN_IDX1:%.*]], i64 [[IN_IDX2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i64 [[IN_IDX0]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [[STRUCT_VEC:%.*]], ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM]], i32 0, i32 0
+; CHECK-NEXT:    [[IDXPROM2:%.*]] = trunc i64 [[IN_IDX2]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 [[IDXPROM2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP1]], i32 2
+; CHECK-NEXT:    [[IDXPROM3:%.*]] = trunc i64 [[IN_IDX0]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr [[STRUCT_VEC]], ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM3]], i32 0, i32 0
+; CHECK-NEXT:    [[IDXPROM5:%.*]] = trunc i64 [[IN_IDX2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP3]], i32 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 4
+; CHECK-NEXT:    [[IDXPROM6:%.*]] = trunc i64 [[IN_IDX0]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr [[STRUCT_VEC]], ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM6]], i32 0, i32 0
+; CHECK-NEXT:    [[IDXPROM8:%.*]] = trunc i64 [[IN_IDX2]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 [[IDXPROM8]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 6
+; CHECK-NEXT:    ret void
+;
+entry:
+  %const1 = getelementptr %struct.vec, ptr addrspace(3) %in.ptr, i64 %in.idx0, i32 0, i32 2
+  %idx1 = getelementptr i8, ptr addrspace(3) %const1, i64 %in.idx2
+  %const2 = getelementptr %struct.vec, ptr addrspace(3) %in.ptr, i64 %in.idx0, i32 0, i32 4
+  %idx2 = getelementptr i8, ptr addrspace(3) %const2, i64 %in.idx2
+  %const3 = getelementptr %struct.vec, ptr addrspace(3) %in.ptr, i64 %in.idx0, i32 0, i32 6
+  %idx3 = getelementptr i8, ptr addrspace(3) %const3, i64 %in.idx2
   ret void
 }
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lower-gep-reorder.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lower-gep-reorder.ll
index a91c8172177f..43dda1ae1517 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lower-gep-reorder.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lower-gep-reorder.ll
@@ -7,14 +7,14 @@ define protected amdgpu_kernel void @sink_addr(ptr %in.ptr, i64 %in.idx0, i64 %i
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 [[IN_IDX1]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
-; CHECK-NEXT:    [[CONST11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2048
-; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr i64, ptr [[CONST11]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP0]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr i8, ptr [[TMP3]], i64 2048
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
-; CHECK-NEXT:    [[CONST22:%.*]] = getelementptr i8, ptr [[TMP1]], i64 4096
-; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr i64, ptr [[CONST22]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[TMP1]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr i8, ptr [[TMP4]], i64 4096
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
-; CHECK-NEXT:    [[CONST33:%.*]] = getelementptr i8, ptr [[TMP2]], i64 6144
-; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr i64, ptr [[CONST33]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP2]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr i8, ptr [[TMP7]], i64 6144
 ; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i64 [[IN_IDX0]], 0
 ; CHECK-NEXT:    br i1 [[CMP0]], label [[BB_1:%.*]], label [[END:%.*]]
 ; CHECK:       bb.1:
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/reorder-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/reorder-gep.ll
index a15f11a634db..2e3b6ca3653f 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/reorder-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/reorder-gep.ll
@@ -186,3 +186,66 @@ end:
   call void asm sideeffect "; use $0", "v"(ptr %idx3)
   ret void
 }
+
+
+define void @different_type_reorder2(ptr %in.ptr, i64 %in.idx0, i64 %in.idx1) {
+; CHECK-LABEL: define void @different_type_reorder2(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i64 [[IN_IDX1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr i8, ptr [[IN_PTR]], i64 [[IN_IDX0]]
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[CONST1:%.*]] = getelementptr i64, ptr [[BASE]], i64 256
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr i8, ptr [[CONST1]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[CONST2:%.*]] = getelementptr i64, ptr [[BASE]], i64 512
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr i8, ptr [[CONST2]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[CONST3:%.*]] = getelementptr i64, ptr [[BASE]], i64 768
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr i8, ptr [[CONST3]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i64 [[IN_IDX0]], 0
+; CHECK-NEXT:    br i1 [[CMP0]], label [[BB_1:%.*]], label [[END:%.*]]
+; CHECK:       bb.1:
+; CHECK-NEXT:    [[VAL0:%.*]] = load <8 x i64>, ptr [[IDX0]], align 16
+; CHECK-NEXT:    [[VAL1:%.*]] = load <8 x i64>, ptr [[IDX1]], align 16
+; CHECK-NEXT:    [[VAL2:%.*]] = load <8 x i64>, ptr [[IDX2]], align 16
+; CHECK-NEXT:    [[VAL3:%.*]] = load <8 x i64>, ptr [[IDX3]], align 16
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+entry:
+  %base = getelementptr i8, ptr %in.ptr, i64 %in.idx0
+  %idx0 = getelementptr i8, ptr %base, i64 %in.idx1
+  %const1 = getelementptr i64, ptr %base, i64 256
+  %idx1 = getelementptr i8, ptr %const1, i64 %in.idx1
+  %const2 = getelementptr i64, ptr %base, i64 512
+  %idx2 = getelementptr i8, ptr %const2, i64 %in.idx1
+  %const3 = getelementptr i64, ptr %base, i64 768
+  %idx3 = getelementptr i8, ptr %const3, i64 %in.idx1
+  %cmp0 = icmp eq i64 %in.idx0, 0
+  br i1 %cmp0, label %bb.1, label %end
+
+bb.1:
+  %val0 = load <8 x i64>, ptr %idx0, align 16
+  %val1 = load <8 x i64>, ptr %idx1, align 16
+  %val2 = load <8 x i64>, ptr %idx2, align 16
+  %val3 = load <8 x i64>, ptr %idx3, align 16
+  call void asm sideeffect "; use $0", "v"(<8 x i64> %val0)
+  call void asm sideeffect "; use $0", "v"(<8 x i64> %val1)
+  call void asm sideeffect "; use $0", "v"(<8 x i64> %val2)
+  call void asm sideeffect "; use $0", "v"(<8 x i64> %val3)
+  br label %end
+
+end:
+  call void asm sideeffect "; use $0", "v"(ptr %idx0)
+  call void asm sideeffect "; use $0", "v"(ptr %idx1)
+  call void asm sideeffect "; use $0", "v"(ptr %idx2)
+  call void asm sideeffect "; use $0", "v"(ptr %idx3)
+  ret void
+}
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll
index c8e1291b9cd5..9ab713cc8a4f 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-markloopasdeleted.ll
@@ -17,7 +17,7 @@
 ; SimpleLoopUnswitch not marking the Loop as removed, so we missed clearing
 ; the analysis caches.
 ;
-; CHECK: Running pass: SimpleLoopUnswitchPass on loop_begin
+; CHECK: Running pass: SimpleLoopUnswitchPass on loop %loop_begin in function test6
 ; CHECK-NEXT: Running analysis: OuterAnalysisManagerProxy
 ; CHECK-NEXT: Clearing all analysis results for: loop_a_inner
 
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
index 3873f0c0ae0b..9d6502072c16 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
@@ -2068,3 +2068,37 @@ cond.end:                                         ; preds = %entry, %cond.false
   %conv = sext i3 %cond to i8
   ret i8 %conv
 }
+
+; Don't create a table with an unknown type
+define { i8, i8 } @test_unknown_result_type(i8 %n) {
+; CHECK-LABEL: @test_unknown_result_type(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i8 [[N:%.*]], label [[SW_DEFAULT:%.*]] [
+; CHECK-NEXT:      i8 0, label [[RETURN:%.*]]
+; CHECK-NEXT:      i8 1, label [[RETURN]]
+; CHECK-NEXT:      i8 2, label [[RETURN]]
+; CHECK-NEXT:    ]
+; CHECK:       sw.default:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertvalue { i8, i8 } undef, i8 0, 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { i8, i8 } [[TMP0]], i8 1, 1
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi { i8, i8 } [ undef, [[ENTRY:%.*]] ], [ undef, [[ENTRY]] ], [ undef, [[ENTRY]] ], [ [[TMP1]], [[SW_DEFAULT]] ]
+; CHECK-NEXT:    ret { i8, i8 } [[RETVAL_0]]
+;
+entry:
+  switch i8 %n, label %sw.default [
+  i8 0, label %return
+  i8 1, label %return
+  i8 2, label %return
+  ]
+
+sw.default:                                       ; preds = %entry
+  %0 = insertvalue { i8, i8 } undef, i8 0, 0
+  %1 = insertvalue { i8, i8 } %0, i8 1, 1
+  br label %return
+
+return:                                           ; preds = %sw.default, %entry, %entry, %entry
+  %retval.0 = phi { i8, i8 } [ undef, %entry ], [ undef, %entry ], [ undef, %entry ], [ %1, %sw.default ]
+  ret { i8, i8 } %retval.0
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue-inlined.ll b/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue-inlined.ll
index 5af73e789f11..e00d1daf71de 100644
--- a/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue-inlined.ll
+++ b/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue-inlined.ll
@@ -9,6 +9,7 @@ init:
 
 ; CHECK:  %vala = load i64, ptr %ptr
 ; CHECK-NEXT:  call void @llvm.dbg.value(metadata i64 %vala, metadata [[MD:![0-9]*]]
+; CHECK-NEXT:  call void @llvm.dbg.value(metadata i64 %vala, metadata [[MD]]
 ; CHECK-NEXT:  %valbmasked = and i64 %vala, 1
 
 a:                                              ; preds = %init
diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll b/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
index c5d723c4e3dd..af7da45ec089 100644
--- a/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
+++ b/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
@@ -47,6 +47,7 @@ define i1 @hoist_with_debug2(i32 %x) !dbg !22 {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ugt i32 [[X:%.*]], 2
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[X]], metadata [[META21:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[X]], metadata [[META21]], metadata !DIExpression()), !dbg [[DBG23]]
 ; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[TOBOOL_NOT]], i1 false, i1 true
 ; CHECK-NEXT:    ret i1 [[DOT]]
 ;
diff --git a/llvm/test/Transforms/ThinLTOBitcodeWriter/split-typed-decl.ll b/llvm/test/Transforms/ThinLTOBitcodeWriter/split-typed-decl.ll
index ec96c1632586..ddc8604fe41c 100644
--- a/llvm/test/Transforms/ThinLTOBitcodeWriter/split-typed-decl.ll
+++ b/llvm/test/Transforms/ThinLTOBitcodeWriter/split-typed-decl.ll
@@ -1,12 +1,12 @@
-;; Generating bitcode files with split LTO modules should not crash if there are
-;; typed declarations in sources.
-
-; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o - %s
-
-@_ZTV3Foo = external constant { [3 x ptr] }, !type !0
-
-define void @Bar() {
-  ret void
-}
-
-!0 = !{i64 16, !"_ZTS3Foo"}
+;; Generating bitcode files with split LTO modules should not crash if there are
+;; typed declarations in sources.
+
+; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o - %s
+
+@_ZTV3Foo = external constant { [3 x ptr] }, !type !0
+
+define void @Bar() {
+  ret void
+}
+
+!0 = !{i64 16, !"_ZTS3Foo"}
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index 6a81964b917e..b58f92d70936 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -15,13 +15,7 @@ define <8 x i8> @trivial(<8 x i8> %a) {
 
 define <8 x i8> @add(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: @add(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[ABT:%.*]] = add <4 x i8> [[AT]], [[BT]]
-; CHECK-NEXT:    [[ABB:%.*]] = add <4 x i8> [[AB]], [[BB]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = add <8 x i8> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <8 x i8> [[R]]
 ;
   %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -80,13 +74,7 @@ define <8 x i8> @wrong_lanes(<8 x i8> %a, <8 x i8> %b) {
 
 define <8 x half> @fadd(<8 x half> %a, <8 x half> %b) {
 ; CHECK-LABEL: @fadd(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[ABT:%.*]] = fadd <4 x half> [[AT]], [[BT]]
-; CHECK-NEXT:    [[ABB:%.*]] = fadd <4 x half> [[AB]], [[BB]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = fadd <8 x half> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <8 x half> [[R]]
 ;
   %ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -101,11 +89,7 @@ define <8 x half> @fadd(<8 x half> %a, <8 x half> %b) {
 
 define <8 x half> @fneg(<8 x half> %a, <8 x half> %b) {
 ; CHECK-LABEL: @fneg(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[ABT:%.*]] = fneg <4 x half> [[AT]]
-; CHECK-NEXT:    [[ABB:%.*]] = fneg <4 x half> [[AB]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = fneg <8 x half> [[A:%.*]]
 ; CHECK-NEXT:    ret <8 x half> [[R]]
 ;
   %ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -118,6 +102,80 @@ define <8 x half> @fneg(<8 x half> %a, <8 x half> %b) {
 
 define <8 x i8> @abs(<8 x i8> %a) {
 ; CHECK-LABEL: @abs(
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[A:%.*]], i1 false)
+; CHECK-NEXT:    ret <8 x i8> [[R]]
+;
+  %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %at = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+  %abt = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %at, i1 false)
+  %abb = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %ab, i1 false)
+  %r = shufflevector <4 x i8> %abt, <4 x i8> %abb, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x i8> %r
+}
+
+define <8 x half> @powi(<8 x half> %a) {
+; CHECK-LABEL: @powi(
+; CHECK-NEXT:    [[R:%.*]] = call <8 x half> @llvm.powi.v8f16.i32(<8 x half> [[A:%.*]], i32 10)
+; CHECK-NEXT:    ret <8 x half> [[R]]
+;
+  %ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %at = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+  %abt = call <4 x half> @llvm.powi.v4f16.i32(<4 x half> %at, i32 10)
+  %abb = call <4 x half> @llvm.powi.v4f16.i32(<4 x half> %ab, i32 10)
+  %r = shufflevector <4 x half> %abt, <4 x half> %abb, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x half> %r
+}
+
+; Check that call instructions are treated separately from intrinsics.
+define <8 x half> @callinst(<8 x half> %a) {
+; CHECK-LABEL: @callinst(
+; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT:    [[ABT:%.*]] = call <4 x half> @othercall(<4 x half> [[AT]])
+; CHECK-NEXT:    [[ABB:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> [[AB]])
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x half> [[R]]
+;
+  %ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %at = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+  %abt = call <4 x half> @othercall(<4 x half> %at)
+  %abb = call <4 x half> @llvm.fabs.v4f16(<4 x half> %ab)
+  %r = shufflevector <4 x half> %abt, <4 x half> %abb, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x half> %r
+}
+define <8 x half> @callinst2(<8 x half> %a) {
+; CHECK-LABEL: @callinst2(
+; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT:    [[ABT:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> [[AT]])
+; CHECK-NEXT:    [[ABB:%.*]] = call <4 x half> @othercall(<4 x half> [[AB]])
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x half> [[R]]
+;
+  %ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %at = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+  %abt = call <4 x half> @llvm.fabs.v4f16(<4 x half> %at)
+  %abb = call <4 x half> @othercall(<4 x half> %ab)
+  %r = shufflevector <4 x half> %abt, <4 x half> %abb, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x half> %r
+}
+declare <4 x half> @othercall(<4 x half>)
+
+define <8 x i32> @lrint(<8 x half> %a) {
+; CHECK-LABEL: @lrint(
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.lrint.v8i32.v8f16(<8 x half> [[A:%.*]])
+; CHECK-NEXT:    ret <8 x i32> [[R]]
+;
+  %ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %at = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+  %abt = call <4 x i32> @llvm.lrint.v4i32.v4f16(<4 x half> %at)
+  %abb = call <4 x i32> @llvm.lrint.v4i32.v4f16(<4 x half> %ab)
+  %r = shufflevector <4 x i32> %abt, <4 x i32> %abb, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x i32> %r
+}
+
+define <8 x i8> @abs_different(<8 x i8> %a) {
+; CHECK-LABEL: @abs_different(
 ; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
 ; CHECK-NEXT:    [[ABT:%.*]] = call <4 x i8> @llvm.abs.v4i8(<4 x i8> [[AT]], i1 true)
@@ -135,12 +193,8 @@ define <8 x i8> @abs(<8 x i8> %a) {
 
 define <8 x half> @splat0(<8 x half> %a, <8 x half> %b) {
 ; CHECK-LABEL: @splat0(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[BS:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[ABT:%.*]] = fadd <4 x half> [[AT]], [[BS]]
-; CHECK-NEXT:    [[ABB:%.*]] = fadd <4 x half> [[AB]], [[BS]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = fadd <8 x half> [[A:%.*]], [[TMP1]]
 ; CHECK-NEXT:    ret <8 x half> [[R]]
 ;
   %ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -154,12 +208,8 @@ define <8 x half> @splat0(<8 x half> %a, <8 x half> %b) {
 
 define <8 x half> @splat2(<8 x half> %a, <8 x half> %b) {
 ; CHECK-LABEL: @splat2(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[BS:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[ABT:%.*]] = fadd <4 x half> [[AT]], [[BS]]
-; CHECK-NEXT:    [[ABB:%.*]] = fadd <4 x half> [[AB]], [[BS]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[R:%.*]] = fadd <8 x half> [[A:%.*]], [[TMP1]]
 ; CHECK-NEXT:    ret <8 x half> [[R]]
 ;
   %ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -173,12 +223,8 @@ define <8 x half> @splat2(<8 x half> %a, <8 x half> %b) {
 
 define <8 x half> @splatandidentity(<8 x half> %a, <8 x half> %b) {
 ; CHECK-LABEL: @splatandidentity(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[BS:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[ABT:%.*]] = fadd <4 x half> [[AT]], [[BS]]
-; CHECK-NEXT:    [[ABB:%.*]] = fadd <4 x half> [[AB]], [[BS]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = fadd <8 x half> [[A]], [[TMP1]]
 ; CHECK-NEXT:    ret <8 x half> [[R]]
 ;
   %ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -192,11 +238,9 @@ define <8 x half> @splatandidentity(<8 x half> %a, <8 x half> %b) {
 
 define <8 x half> @splattwice(<8 x half> %a, <8 x half> %b) {
 ; CHECK-LABEL: @splattwice(
-; CHECK-NEXT:    [[AS:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BS:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[AB1:%.*]] = fadd <4 x half> [[AS]], [[BS]]
-; CHECK-NEXT:    [[AB2:%.*]] = fadd <4 x half> [[AS]], [[BS]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x half> [[AB1]], <4 x half> [[AB2]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = fadd <8 x half> [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    ret <8 x half> [[R]]
 ;
   %as = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> zeroinitializer
@@ -209,13 +253,7 @@ define <8 x half> @splattwice(<8 x half> %a, <8 x half> %b) {
 
 define <8 x i8> @undeflane(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: @undeflane(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[ABT:%.*]] = add <4 x i8> [[AT]], [[BT]]
-; CHECK-NEXT:    [[ABB:%.*]] = add <4 x i8> [[AB]], [[BB]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 poison, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = add <8 x i8> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <8 x i8> [[R]]
 ;
   %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -281,18 +319,9 @@ define <8 x i8> @constantdiff2(<8 x i8> %a) {
 
 define <8 x i8> @inner_shuffle(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
 ; CHECK-LABEL: @inner_shuffle(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[CS:%.*]] = shufflevector <8 x i8> [[C:%.*]], <8 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[ABT:%.*]] = mul <4 x i8> [[AT]], [[BT]]
-; CHECK-NEXT:    [[ABB:%.*]] = mul <4 x i8> [[AB]], [[BB]]
-; CHECK-NEXT:    [[ABT2:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[ABB2:%.*]] = shufflevector <4 x i8> [[ABB]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[ABT3:%.*]] = add <4 x i8> [[ABT2]], [[CS]]
-; CHECK-NEXT:    [[ABB3:%.*]] = add <4 x i8> [[ABB2]], [[CS]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[ABT3]], <4 x i8> [[ABB3]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[C:%.*]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i8> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = add <8 x i8> [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    ret <8 x i8> [[R]]
 ;
   %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -335,14 +364,9 @@ define <8 x i8> @extrause_add(<8 x i8> %a, <8 x i8> %b) {
 
 define <8 x i8> @extrause_shuffle(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: @extrause_shuffle(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
 ; CHECK-NEXT:    call void @use(<4 x i8> [[BT]])
-; CHECK-NEXT:    [[ABT:%.*]] = add <4 x i8> [[AT]], [[BT]]
-; CHECK-NEXT:    [[ABB:%.*]] = add <4 x i8> [[AB]], [[BB]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = add <8 x i8> [[A:%.*]], [[B]]
 ; CHECK-NEXT:    ret <8 x i8> [[R]]
 ;
   %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -391,15 +415,7 @@ define <8 x i8> @icmpsel(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
 
 define <8 x half> @fma(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; CHECK-LABEL: @fma(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[CB:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[CT:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[ABB:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[AB]], <4 x half> [[BB]], <4 x half> [[CB]])
-; CHECK-NEXT:    [[ABT:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[AT]], <4 x half> [[BT]], <4 x half> [[CT]])
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]])
 ; CHECK-NEXT:    ret <8 x half> [[R]]
 ;
   %ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -451,19 +467,10 @@ define void @exttrunc(<8 x i32> %a, <8 x i32> %b, ptr %p) {
 
 define <8 x i8> @intrinsics_minmax(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: @intrinsics_minmax(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[ABT:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[AT]], <4 x i8> [[BT]])
-; CHECK-NEXT:    [[ABB:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[AB]], <4 x i8> [[BB]])
-; CHECK-NEXT:    [[ABT1:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[ABT]], <4 x i8> [[BT]])
-; CHECK-NEXT:    [[ABB1:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[ABB]], <4 x i8> [[BB]])
-; CHECK-NEXT:    [[ABT2:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[ABT1]], <4 x i8> [[BT]])
-; CHECK-NEXT:    [[ABB2:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[ABB1]], <4 x i8> [[BB]])
-; CHECK-NEXT:    [[ABT3:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[ABT2]], <4 x i8> [[BT]])
-; CHECK-NEXT:    [[ABB3:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[ABB2]], <4 x i8> [[BB]])
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[ABT3]], <4 x i8> [[ABB3]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.smin.v8i8(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[B]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[TMP2]], <8 x i8> [[B]])
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i8> @llvm.umax.v8i8(<8 x i8> [[TMP3]], <8 x i8> [[B]])
 ; CHECK-NEXT:    ret <8 x i8> [[R]]
 ;
   %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -472,7 +479,7 @@ define <8 x i8> @intrinsics_minmax(<8 x i8> %a, <8 x i8> %b) {
   %bt = shufflevector <8 x i8> %b, <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
   %abt = call <4 x i8> @llvm.smin.v4i8(<4 x i8> %at, <4 x i8> %bt)
   %abb = call <4 x i8> @llvm.smin.v4i8(<4 x i8> %ab, <4 x i8> %bb)
-  %abt1 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> %abt, <4 x i8> %bt)
+  %abt1 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> %abt, <4 x i8> %bt)
   %abb1 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> %abb, <4 x i8> %bb)
   %abt2 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> %abt1, <4 x i8> %bt)
   %abb2 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> %abb1, <4 x i8> %bb)
@@ -484,19 +491,10 @@ define <8 x i8> @intrinsics_minmax(<8 x i8> %a, <8 x i8> %b) {
 
 define <8 x i8> @intrinsics_addsat(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: @intrinsics_addsat(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[ABT:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[AT]], <4 x i8> [[BT]])
-; CHECK-NEXT:    [[ABB:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[AB]], <4 x i8> [[BB]])
-; CHECK-NEXT:    [[ABT1:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[ABT]], <4 x i8> [[BT]])
-; CHECK-NEXT:    [[ABB1:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[ABB]], <4 x i8> [[BB]])
-; CHECK-NEXT:    [[ABT2:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[ABT1]], <4 x i8> [[BT]])
-; CHECK-NEXT:    [[ABB2:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[ABB1]], <4 x i8> [[BB]])
-; CHECK-NEXT:    [[ABT3:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[ABT2]], <4 x i8> [[BT]])
-; CHECK-NEXT:    [[ABB3:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[ABB2]], <4 x i8> [[BB]])
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[ABT3]], <4 x i8> [[ABB3]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[B]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[TMP2]], <8 x i8> [[B]])
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[TMP3]], <8 x i8> [[B]])
 ; CHECK-NEXT:    ret <8 x i8> [[R]]
 ;
   %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -536,57 +534,42 @@ define <8 x i8> @intrinsics_different(<8 x i8> %a, <8 x i8> %b) {
   ret <8 x i8> %r
 }
 
+; div and rem are currently excluded.
+define <8 x i8> @div(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: @div(
+; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT:    [[ABT:%.*]] = udiv <4 x i8> [[AT]], [[BT]]
+; CHECK-NEXT:    [[ABB:%.*]] = udiv <4 x i8> [[AB]], [[BB]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> [[ABB]], <8 x i32> <i32 7, i32 poison, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i8> [[R]]
+;
+  %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %at = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+  %bb = shufflevector <8 x i8> %b, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bt = shufflevector <8 x i8> %b, <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+  %abt = udiv <4 x i8> %at, %bt
+  %abb = udiv <4 x i8> %ab, %bb
+  %r = shufflevector <4 x i8> %abt, <4 x i8> %abb, <8 x i32> <i32 7, i32 poison, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x i8> %r
+}
+
 define void @v8f64interleave(i64 %0, ptr %1, ptr %x, double %z) {
 ; CHECK-LABEL: @v8f64interleave(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[Z:%.*]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP1:%.*]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 0, i32 8>
-; CHECK-NEXT:    [[STRIDED_VEC27:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 1, i32 9>
-; CHECK-NEXT:    [[STRIDED_VEC28:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 2, i32 10>
-; CHECK-NEXT:    [[STRIDED_VEC29:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 3, i32 11>
-; CHECK-NEXT:    [[STRIDED_VEC30:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 4, i32 12>
-; CHECK-NEXT:    [[STRIDED_VEC31:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 5, i32 13>
-; CHECK-NEXT:    [[STRIDED_VEC32:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 6, i32 14>
-; CHECK-NEXT:    [[STRIDED_VEC33:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 7, i32 15>
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, ptr [[X:%.*]], i64 [[TMP0:%.*]]
-; CHECK-NEXT:    [[WIDE_VEC34:%.*]] = load <16 x double>, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC35:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 0, i32 8>
-; CHECK-NEXT:    [[STRIDED_VEC36:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 1, i32 9>
-; CHECK-NEXT:    [[STRIDED_VEC37:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 2, i32 10>
-; CHECK-NEXT:    [[STRIDED_VEC38:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 3, i32 11>
-; CHECK-NEXT:    [[STRIDED_VEC39:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 4, i32 12>
-; CHECK-NEXT:    [[STRIDED_VEC40:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 5, i32 13>
-; CHECK-NEXT:    [[STRIDED_VEC41:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 6, i32 14>
-; CHECK-NEXT:    [[STRIDED_VEC42:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <2 x i32> <i32 7, i32 15>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[STRIDED_VEC35]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <2 x double> [[STRIDED_VEC27]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[STRIDED_VEC36]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <2 x double> [[STRIDED_VEC28]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[STRIDED_VEC37]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast <2 x double> [[STRIDED_VEC29]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[STRIDED_VEC38]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast <2 x double> [[STRIDED_VEC30]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast <2 x double> [[STRIDED_VEC39]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul fast <2 x double> [[STRIDED_VEC31]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast <2 x double> [[STRIDED_VEC40]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <2 x double> [[STRIDED_VEC32]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP16:%.*]] = fadd fast <2 x double> [[STRIDED_VEC41]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = or disjoint i64 [[TMP0]], 7
-; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast <2 x double> [[STRIDED_VEC33]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP20:%.*]] = fadd fast <2 x double> [[STRIDED_VEC42]], [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i64 -56
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x double> [[TMP22]], <4 x double> [[TMP23]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x double> [[TMP24]], <4 x double> [[TMP25]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> [[TMP27]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:    store <16 x double> [[INTERLEAVED_VEC]], ptr [[TMP21]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <16 x double> [[WIDE_VEC]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, ptr [[X:%.*]], i64 [[TMP0:%.*]]
+; CHECK-NEXT:    [[WIDE_VEC34:%.*]] = load <16 x double>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd <16 x double> [[WIDE_VEC34]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[TMP0]], 7
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 -56
+; CHECK-NEXT:    store <16 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/VectorCombine/X86/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/X86/select-shuffle.ll
index 685d661ea6bc..60a6c4b1d9b9 100644
--- a/llvm/test/Transforms/VectorCombine/X86/select-shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/select-shuffle.ll
@@ -12,12 +12,11 @@ define <4 x double> @PR60649() {
 ; CHECK:       unreachable:
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[T0:%.*]] = phi <4 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ zeroinitializer, [[UNREACHABLE:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ zeroinitializer, [[UNREACHABLE:%.*]] ]
 ; CHECK-NEXT:    [[T1:%.*]] = phi <4 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[UNREACHABLE]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x double> [[T0]], <4 x double> [[T0]], <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[T0]], <4 x double> [[T0]], <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = fdiv <4 x double> [[TMP1]], <double 0.000000e+00, double 0.000000e+00, double undef, double undef>
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP0]], <double 0.000000e+00, double 0.000000e+00, double undef, double undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP0]], <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fdiv <4 x double> [[TMP0]], <double 0.000000e+00, double 0.000000e+00, double undef, double undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], <double 0.000000e+00, double 0.000000e+00, double undef, double undef>
 ; CHECK-NEXT:    [[T5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x double> [[T5]]
 ;
diff --git a/llvm/test/Transforms/VectorCombine/pr88796.ll b/llvm/test/Transforms/VectorCombine/pr88796.ll
new file mode 100644
index 000000000000..4f26f5dcbb92
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/pr88796.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=vector-combine -S %s | FileCheck %s
+
+define i32 @test() {
+; CHECK-LABEL: define i32 @test() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i16 @llvm.vector.reduce.and.nxv8i16(<vscale x 8 x i16> trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 268435456, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>))
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %0 = tail call i16 @llvm.vector.reduce.and.nxv8i16(<vscale x 8 x i16> trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 268435456, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>))
+  ret i32 0
+}
+
+declare i16 @llvm.vector.reduce.and.nxv8i16(<vscale x 8 x i16>)
+
diff --git a/llvm/test/Verifier/vp-intrinsics.ll b/llvm/test/Verifier/vp-intrinsics.ll
index 765d67356c2b..9ed8279f94d7 100644
--- a/llvm/test/Verifier/vp-intrinsics.ll
+++ b/llvm/test/Verifier/vp-intrinsics.ll
@@ -43,8 +43,10 @@ define void @test_vp_reduction(i32 %x, <8 x i32> %vi, <8 x float> %vf, float %f,
   %r8 = call i32 @llvm.vp.reduce.umin.v8i32(i32 %x, <8 x i32> %vi, <8 x i1> %m, i32 %n)
   %r9 = call float @llvm.vp.reduce.fmin.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n)
   %rA = call float @llvm.vp.reduce.fmax.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n)
-  %rB = call float @llvm.vp.reduce.fadd.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n)
-  %rC = call float @llvm.vp.reduce.fmul.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n)
+  %rB = call float @llvm.vp.reduce.fminimum.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n)
+  %rC = call float @llvm.vp.reduce.fmaximum.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n)
+  %rD = call float @llvm.vp.reduce.fadd.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n)
+  %rE = call float @llvm.vp.reduce.fmul.v8f32(float %f, <8 x float> %vf, <8 x i1> %m, i32 %n)
   ret void
 }
 
@@ -113,6 +115,8 @@ declare i32 @llvm.vp.reduce.umax.v8i32(i32, <8 x i32>, <8 x i1>, i32)
 declare i32 @llvm.vp.reduce.umin.v8i32(i32, <8 x i32>, <8 x i1>, i32)
 declare float @llvm.vp.reduce.fmin.v8f32(float, <8 x float>, <8 x i1>, i32)
 declare float @llvm.vp.reduce.fmax.v8f32(float, <8 x float>, <8 x i1>, i32)
+declare float @llvm.vp.reduce.fminimum.v8f32(float, <8 x float>, <8 x i1>, i32)
+declare float @llvm.vp.reduce.fmaximum.v8f32(float, <8 x float>, <8 x i1>, i32)
 declare float @llvm.vp.reduce.fadd.v8f32(float, <8 x float>, <8 x i1>, i32)
 declare float @llvm.vp.reduce.fmul.v8f32(float, <8 x float>, <8 x i1>, i32)
 ; casts
diff --git a/llvm/test/tools/UpdateTestChecks/lit.local.cfg b/llvm/test/tools/UpdateTestChecks/lit.local.cfg
index f8ab6b82cde7..2e695490b005 100644
--- a/llvm/test/tools/UpdateTestChecks/lit.local.cfg
+++ b/llvm/test/tools/UpdateTestChecks/lit.local.cfg
@@ -19,7 +19,8 @@ def add_update_script_substition(
     # Specify an explicit default version in UTC tests, so that the --version
     # embedded in UTC_ARGS does not change in all test expectations every time
     # the default is bumped.
-    extra_args += " --version=1"
+    if name != "%update_test_body":
+        extra_args += " --version=1"
     config.substitutions.append(
         (name, "'%s' %s %s" % (python_exe, script_path, extra_args))
     )
@@ -47,3 +48,7 @@ if os.path.isfile(llvm_mca_path):
     config.available_features.add("llvm-mca-binary")
     mca_arg = "--llvm-mca-binary " + shell_quote(llvm_mca_path)
     add_update_script_substition("%update_test_checks", extra_args=mca_arg)
+
+split_file_path = os.path.join(config.llvm_tools_dir, "split-file")
+if os.path.isfile(split_file_path):
+    add_update_script_substition("%update_test_body")
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_body/Inputs/basic-asm.test.expected b/llvm/test/tools/UpdateTestChecks/update_test_body/Inputs/basic-asm.test.expected
new file mode 100644
index 000000000000..05024d8799cd
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_body/Inputs/basic-asm.test.expected
@@ -0,0 +1,13 @@
+# RUN: cp %s %t && %update_test_body %t 2>&1 | count 0
+# RUN: diff -u %S/Inputs/basic-asm.test.expected %t
+
+.ifdef GEN
+#--- a.txt
+.long 0
+#--- b.txt
+.long 1
+#--- gen
+cat a.txt b.txt
+.endif
+.long 0
+.long 1
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_body/Inputs/basic.test.expected b/llvm/test/tools/UpdateTestChecks/update_test_body/Inputs/basic.test.expected
new file mode 100644
index 000000000000..80a2676d0a75
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_body/Inputs/basic.test.expected
@@ -0,0 +1,16 @@
+; RUN: cp %s %t && %update_test_body %t 2>&1 | count 0
+; RUN: diff -u %S/Inputs/basic.test.expected %t
+
+;--- a.txt
+@a = global i32 0
+;--- b.txt
+@b = global i32 0
+;--- gen
+cat a.txt
+echo ';--- b.ll'
+cat b.txt
+
+;--- a.ll
+@a = global i32 0
+;--- b.ll
+@b = global i32 0
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_body/basic-asm.test b/llvm/test/tools/UpdateTestChecks/update_test_body/basic-asm.test
new file mode 100644
index 000000000000..3e82a3ffab9a
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_body/basic-asm.test
@@ -0,0 +1,11 @@
+# RUN: cp %s %t && %update_test_body %t 2>&1 | count 0
+# RUN: diff -u %S/Inputs/basic-asm.test.expected %t
+
+.ifdef GEN
+#--- a.txt
+.long 0
+#--- b.txt
+.long 1
+#--- gen
+cat a.txt b.txt
+.endif
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_body/basic.test b/llvm/test/tools/UpdateTestChecks/update_test_body/basic.test
new file mode 100644
index 000000000000..d99946e2bd92
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_body/basic.test
@@ -0,0 +1,13 @@
+; RUN: cp %s %t && %update_test_body %t 2>&1 | count 0
+; RUN: diff -u %S/Inputs/basic.test.expected %t
+
+;--- a.txt
+@a = global i32 0
+;--- b.txt
+@b = global i32 0
+;--- gen
+cat a.txt
+echo ';--- b.ll'
+cat b.txt
+
+;--- a.ll
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_body/empty-stdout.test b/llvm/test/tools/UpdateTestChecks/update_test_body/empty-stdout.test
new file mode 100644
index 000000000000..9ea9c7bc7ac9
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_body/empty-stdout.test
@@ -0,0 +1,13 @@
+# RUN: cp %s %t && not %update_test_body %t 2>&1 | FileCheck %s
+# RUN: diff -u %t %s
+
+# CHECK: stdout is empty; forgot -o - ?
+
+.ifdef GEN
+#--- a.txt
+.long 0
+#--- b.txt
+.long 1
+#--- gen
+true
+.endif
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_body/gen-absent.test b/llvm/test/tools/UpdateTestChecks/update_test_body/gen-absent.test
new file mode 100644
index 000000000000..c12f22adceb2
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_body/gen-absent.test
@@ -0,0 +1,7 @@
+# RUN: cp %s %t && not %update_test_body %t 2>&1 | FileCheck %s
+
+# CHECK: 'gen' does not exist
+
+.ifdef GEN
+#--- a.txt
+.endif
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_body/gen-fail.test b/llvm/test/tools/UpdateTestChecks/update_test_body/gen-fail.test
new file mode 100644
index 000000000000..7e1a9365df14
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_body/gen-fail.test
@@ -0,0 +1,11 @@
+# RUN: cp %s %t && not %update_test_body %t 2>&1 | FileCheck %s
+
+# CHECK:      log
+# CHECK-NEXT: 'gen' failed
+
+.ifdef GEN
+#--- gen
+echo log >&2
+false  # gen fails due to sh -e
+true
+.endif
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_body/gen-unterminated.test b/llvm/test/tools/UpdateTestChecks/update_test_body/gen-unterminated.test
new file mode 100644
index 000000000000..c0026939e414
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_body/gen-unterminated.test
@@ -0,0 +1,8 @@
+# RUN: cp %s %t && not %update_test_body %t 2>&1 | FileCheck %s
+
+# CHECK: 'gen' should be followed by another part (---) or .endif
+
+#--- a.txt
+.long 0
+#--- gen
+cat a.txt
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_body/lit.local.cfg b/llvm/test/tools/UpdateTestChecks/update_test_body/lit.local.cfg
new file mode 100644
index 000000000000..1bb2464ad957
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_body/lit.local.cfg
@@ -0,0 +1,4 @@
+import platform
+
+if platform.system() == "Windows":
+    config.unsupported = True
diff --git a/llvm/test/tools/dsymutil/ARM/empty-map.test b/llvm/test/tools/dsymutil/ARM/empty-map.test
index 40ffa8b1cc51..eeca28273a3f 100644
--- a/llvm/test/tools/dsymutil/ARM/empty-map.test
+++ b/llvm/test/tools/dsymutil/ARM/empty-map.test
@@ -1,4 +1,5 @@
 # RUN: dsymutil -f -oso-prepend-path=%p/../Inputs -y %s -o - 2>&1 | FileCheck %s
+# RUN: dsymutil -q -f -oso-prepend-path=%p/../Inputs -y %s -o - 2>&1 | FileCheck %s --check-prefix QUIET
 
 # RUN: dsymutil --linker parallel -f -oso-prepend-path=%p/../Inputs -y %s -o - 2>&1 | FileCheck %s
 
@@ -7,3 +8,4 @@ triple:          'thumbv7-apple-darwin'
 ...
 
 # CHECK: warning: no debug symbols in executable (-arch armv7)
+# QUIET-NOT: no debug symbols in executable
diff --git a/llvm/test/tools/dsymutil/cmdline.test b/llvm/test/tools/dsymutil/cmdline.test
index 814252b6e230..6c67ac7cd723 100644
--- a/llvm/test/tools/dsymutil/cmdline.test
+++ b/llvm/test/tools/dsymutil/cmdline.test
@@ -23,6 +23,7 @@ CHECK: -object-prefix-map <prefix=remapped>
 CHECK: -oso-prepend-path <path>
 CHECK: -out <filename>
 CHECK: {{-o <filename>}}
+CHECK: -quiet
 CHECK: -remarks-drop-without-debug
 CHECK: -remarks-output-format <format>
 CHECK: -remarks-prepend-path <path>
@@ -46,3 +47,6 @@ NOINPUT: error: no input files specified
 
 RUN: dsymutil -bogus -help 2>&1 | FileCheck --check-prefix=BOGUS %s
 BOGUS: warning: ignoring unknown option: -bogus
+
+RUN: not dsymutil --quiet --verbose 2>&1 | FileCheck --check-prefix=CONFLICT %s
+CONFLICT: error: --quiet and --verbose cannot be specified together
diff --git a/llvm/test/tools/dxil-dis/attribute-filter.ll b/llvm/test/tools/dxil-dis/attribute-filter.ll
index 432a5a1b7101..27590e10d79b 100644
--- a/llvm/test/tools/dxil-dis/attribute-filter.ll
+++ b/llvm/test/tools/dxil-dis/attribute-filter.ll
@@ -19,8 +19,8 @@ define float @fma2(float %0, float %1, float %2) #1 {
   ret float %5
 }
 
-; CHECK: attributes #0 = { nounwind readnone "disable-tail-calls"="false" }
-attributes #0 = { norecurse nounwind readnone willreturn "disable-tail-calls"="false" }
+; CHECK: attributes #0 = { nounwind readnone "fp32-denorm-mode"="any" "waveops-include-helper-lanes" }
+attributes #0 = { norecurse nounwind readnone willreturn "disable-tail-calls"="false" "waveops-include-helper-lanes" "fp32-denorm-mode"="any" }
 
-; CHECK: attributes #1 = { readnone "disable-tail-calls"="false" }
-attributes #1 = { norecurse memory(none) willreturn "disable-tail-calls"="false" }
+; CHECK: attributes #1 = { readnone "fp32-denorm-mode"="ftz" "waveops-include-helper-lanes" }
+attributes #1 = { norecurse memory(none) willreturn "disable-tail-calls"="false" "waveops-include-helper-lanes" "fp32-denorm-mode"="ftz" }
diff --git a/llvm/test/tools/dxil-dis/shuffle.ll b/llvm/test/tools/dxil-dis/shuffle.ll
index 6e45adc9d3ac..62d0172f90d1 100644
--- a/llvm/test/tools/dxil-dis/shuffle.ll
+++ b/llvm/test/tools/dxil-dis/shuffle.ll
@@ -1,27 +1,27 @@
-; RUN: llc --filetype=obj %s -o - 2>&1 | dxil-dis -o - | FileCheck %s
-target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
-target triple = "dxil-unknown-shadermodel6.7-library"
-
-; Make sure alloca is the same.
-; CHECK:alloca <2 x float>, align 8
-; Make sure shufflevector works for DXIL bitcode writer.
-; CHECK:shufflevector <2 x float> %{{.*}}, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-
-; Function Attrs: noinline nounwind optnone
-define noundef <2 x float> @foo(<2 x float> noundef %a) #0 {
-entry:
-  %a.addr = alloca <2 x float>, align 8
-  store <2 x float> %a, ptr %a.addr, align 8
-  %0 = load <2 x float>, ptr %a.addr, align 8
-  %1 = shufflevector <2 x float> %0, <2 x float> poison, <2 x i32> <i32 1, i32 0>
-  ret <2 x float> %1
-}
-
-attributes #0 = { noinline nounwind optnone "frame-pointer"="all" "min-legal-vector-width"="64" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-
-!llvm.module.flags = !{!0, !1, !3}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 6, !"dx.valver", !2}
-!2 = !{i32 1, i32 7}
-!3 = !{i32 7, !"frame-pointer", i32 2}
+; RUN: llc --filetype=obj %s -o - 2>&1 | dxil-dis -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-unknown-shadermodel6.7-library"
+
+; Make sure alloca is the same.
+; CHECK:alloca <2 x float>, align 8
+; Make sure shufflevector works for DXIL bitcode writer.
+; CHECK:shufflevector <2 x float> %{{.*}}, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+
+; Function Attrs: noinline nounwind optnone
+define noundef <2 x float> @foo(<2 x float> noundef %a) #0 {
+entry:
+  %a.addr = alloca <2 x float>, align 8
+  store <2 x float> %a, ptr %a.addr, align 8
+  %0 = load <2 x float>, ptr %a.addr, align 8
+  %1 = shufflevector <2 x float> %0, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  ret <2 x float> %1
+}
+
+attributes #0 = { noinline nounwind optnone "frame-pointer"="all" "min-legal-vector-width"="64" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+
+!llvm.module.flags = !{!0, !1, !3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 6, !"dx.valver", !2}
+!2 = !{i32 1, i32 7}
+!3 = !{i32 7, !"frame-pointer", i32 2}
diff --git a/llvm/test/tools/gold/X86/thinlto.ll b/llvm/test/tools/gold/X86/thinlto.ll
index fb2183450e4c..9d56afe9d58d 100644
--- a/llvm/test/tools/gold/X86/thinlto.ll
+++ b/llvm/test/tools/gold/X86/thinlto.ll
@@ -103,8 +103,8 @@
 ; BACKEND1-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND1-NEXT: <VERSION
 ; BACKEND1-NEXT: <FLAGS
-; BACKEND1-NEXT: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
-; BACKEND1-NEXT: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
+; BACKEND1-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
+; BACKEND1-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
 ; BACKEND1-NEXT: <COMBINED
 ; BACKEND1-NEXT: <COMBINED
 ; BACKEND1-NEXT: </GLOBALVAL_SUMMARY_BLOCK
@@ -117,7 +117,7 @@
 ; BACKEND2-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND2-NEXT: <VERSION
 ; BACKEND2-NEXT: <FLAGS
-; BACKEND2-NEXT: <VALUE_GUID {{.*}} op0=1 op1=3060885059 op2=1207956914
+; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
@@ -136,8 +136,8 @@
 ; COMBINED-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; COMBINED-NEXT: <VERSION
 ; COMBINED-NEXT: <FLAGS
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
+; COMBINED-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
+; COMBINED-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
 ; COMBINED-NEXT: <COMBINED
 ; COMBINED-NEXT: <COMBINED
 ; COMBINED-NEXT: </GLOBALVAL_SUMMARY_BLOCK
diff --git a/llvm/test/tools/llc/new-pm/verify.mir b/llvm/test/tools/llc/new-pm/verify.mir
new file mode 100644
index 000000000000..0cc7fc837e5b
--- /dev/null
+++ b/llvm/test/tools/llc/new-pm/verify.mir
@@ -0,0 +1,10 @@
+# RUN: not --crash llc -mtriple=x86_64-pc-linux-gnu -debug-pass-manager -passes='module(function(machine-function(trigger-verifier-error)))' -filetype=null %s 2>&1 | FileCheck %s
+
+# CHECK: Verifying machine function f
+# CHECK: Broken machine function found after pass "TriggerVerifierErrorPass"
+---
+name: f
+body: |
+  bb.0:
+    RET 0
+...
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/formclass4.s b/llvm/test/tools/llvm-dwarfdump/X86/formclass4.s
index d0f8857c638f..5b3cdfc97790 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/formclass4.s
+++ b/llvm/test/tools/llvm-dwarfdump/X86/formclass4.s
@@ -1,15 +1,3 @@
-# Source:
-#   struct e {
-#     enum {} f[16384];
-#     short g;
-#   };
-#   e foo() {
-#     auto E = new e;
-#     return *E;
-#   }
-# Compile with:
-#   clang -O2 -gdwarf-4 -S a.cpp -o a4.s
-
 # RUN: llvm-mc %s -filetype obj -triple x86_64-apple-darwin -o %t.o
 # RUN: llvm-dwarfdump -debug-info -name g %t.o | FileCheck %s
 
@@ -17,6 +5,20 @@
 # CHECK: DW_AT_name ("g")
 # CHECK: DW_AT_data_member_location    (0x4000)
 
+.ifdef GEN
+#--- a.cpp
+struct e {
+  enum {} f[16384];
+  short g;
+};
+e foo() {
+  auto E = new e;
+  return *E;
+}
+#--- gen
+clang --target=x86_64-apple-macosx -O2 -gdwarf-4 -S a.cpp -o -
+.endif
+
 	.section	__TEXT,__text,regular,pure_instructions
 	.macosx_version_min 10, 14
 	.globl	__Z3foov                ## -- Begin function _Z3foov
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/parent_recurse_depth.s b/llvm/test/tools/llvm-dwarfdump/X86/parent_recurse_depth.s
index a63a918a865d..d14b5b1ca91a 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/parent_recurse_depth.s
+++ b/llvm/test/tools/llvm-dwarfdump/X86/parent_recurse_depth.s
@@ -1,67 +1,67 @@
-# RUN: yaml2obj %s -o %t.o
-# RUN: llvm-dwarfdump --debug-info=0x00000020 -p -parent-recurse-depth 0 %t.o | FileCheck %s --check-prefixes=COMMON,ALL
-# RUN: llvm-dwarfdump --debug-info=0x00000020 -p -parent-recurse-depth 1 %t.o | FileCheck %s --check-prefixes=COMMON,ONE
-# RUN: llvm-dwarfdump --debug-info=0x00000020 -p -parent-recurse-depth 2 %t.o | FileCheck %s --check-prefixes=COMMON,TWO
-# RUN: llvm-dwarfdump --debug-info=0x00000020 -p -parent-recurse-depth 3 %t.o | FileCheck %s --check-prefixes=COMMON,ALL
-
-# COMMON: .o: file format
-
-# ALL: by_hand
-# ALL: main
-# ALL: test
-# ALL: int
-
-# ONE-NOT: by_hand
-# ONE-NOT: main
-# ONE: test
-# ONE: int
-
-# TWO-NOT: by_hand
-# TWO: main
-# TWO: test
-# TWO: int
-
---- !ELF
-FileHeader:
-  Class:   ELFCLASS64
-  Data:    ELFDATA2LSB
-  Type:    ET_EXEC
-  Machine: EM_X86_64
-DWARF:
-  debug_abbrev:
-    - Table:
-      - Tag:      DW_TAG_compile_unit
-        Children: DW_CHILDREN_yes
-        Attributes:
-          - Attribute: DW_AT_producer
-            Form:      DW_FORM_string
-      - Tag:      DW_TAG_subprogram
-        Children: DW_CHILDREN_yes
-        Attributes:
-          - Attribute: DW_AT_name
-            Form:      DW_FORM_string
-      - Tag:      DW_TAG_namespace
-        Children: DW_CHILDREN_yes
-        Attributes:
-          - Attribute: DW_AT_name
-            Form:      DW_FORM_string
-      - Tag:      DW_TAG_base_type
-        Children: DW_CHILDREN_no
-        Attributes:
-          - Attribute: DW_AT_name
-            Form:      DW_FORM_string
-  debug_info:
-    - Version: 4
-      Entries:
-        - AbbrCode: 1
-          Values:
-            - CStr: by_hand
-        - AbbrCode: 2
-          Values:
-            - CStr: main
-        - AbbrCode: 3
-          Values:
-            - CStr: test
-        - AbbrCode: 4
-          Values:
-            - CStr: int
+# RUN: yaml2obj %s -o %t.o
+# RUN: llvm-dwarfdump --debug-info=0x00000020 -p -parent-recurse-depth 0 %t.o | FileCheck %s --check-prefixes=COMMON,ALL
+# RUN: llvm-dwarfdump --debug-info=0x00000020 -p -parent-recurse-depth 1 %t.o | FileCheck %s --check-prefixes=COMMON,ONE
+# RUN: llvm-dwarfdump --debug-info=0x00000020 -p -parent-recurse-depth 2 %t.o | FileCheck %s --check-prefixes=COMMON,TWO
+# RUN: llvm-dwarfdump --debug-info=0x00000020 -p -parent-recurse-depth 3 %t.o | FileCheck %s --check-prefixes=COMMON,ALL
+
+# COMMON: .o: file format
+
+# ALL: by_hand
+# ALL: main
+# ALL: test
+# ALL: int
+
+# ONE-NOT: by_hand
+# ONE-NOT: main
+# ONE: test
+# ONE: int
+
+# TWO-NOT: by_hand
+# TWO: main
+# TWO: test
+# TWO: int
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+DWARF:
+  debug_abbrev:
+    - Table:
+      - Tag:      DW_TAG_compile_unit
+        Children: DW_CHILDREN_yes
+        Attributes:
+          - Attribute: DW_AT_producer
+            Form:      DW_FORM_string
+      - Tag:      DW_TAG_subprogram
+        Children: DW_CHILDREN_yes
+        Attributes:
+          - Attribute: DW_AT_name
+            Form:      DW_FORM_string
+      - Tag:      DW_TAG_namespace
+        Children: DW_CHILDREN_yes
+        Attributes:
+          - Attribute: DW_AT_name
+            Form:      DW_FORM_string
+      - Tag:      DW_TAG_base_type
+        Children: DW_CHILDREN_no
+        Attributes:
+          - Attribute: DW_AT_name
+            Form:      DW_FORM_string
+  debug_info:
+    - Version: 4
+      Entries:
+        - AbbrCode: 1
+          Values:
+            - CStr: by_hand
+        - AbbrCode: 2
+          Values:
+            - CStr: main
+        - AbbrCode: 3
+          Values:
+            - CStr: test
+        - AbbrCode: 4
+          Values:
+            - CStr: int
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units_split_v5.s b/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units_split_v5.s
index e8bb95175087..81d15cd2be22 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units_split_v5.s
+++ b/llvm/test/tools/llvm-dwarfdump/X86/prettyprint_type_units_split_v5.s
@@ -1,16 +1,16 @@
 # RUN: llvm-mc < %s -filetype obj -triple x86_64 -o - \
 # RUN:   | llvm-dwarfdump - | FileCheck %s
 
-# Generated from:
-#
-#   struct t1 { };
-#   t1 v1;
-#
-# $ clang++ -S -g -fdebug-types-section -gsplit-dwarf -o test.5.split.s -gdwarf-5 -g
-
 # CHECK: DW_TAG_variable
 # CHECK:   DW_AT_type ({{.*}} "t1")
 
+.ifdef GEN
+#--- test.cpp
+struct t1 { };
+t1 v1;
+#--- gen
+clang++ --target=x86_64-linux -S -g -fdebug-types-section -gsplit-dwarf -gdwarf-5 test.cpp -o -
+.endif
 	.text
 	.file	"test.cpp"
 	.section	.debug_types.dwo,"e",@progbits
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/quiet.s b/llvm/test/tools/llvm-dwarfdump/X86/quiet.s
index 72ac60a6d794..2bc644b75b20 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/quiet.s
+++ b/llvm/test/tools/llvm-dwarfdump/X86/quiet.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %S/brief.s -filetype obj -triple x86_64-apple-darwin -o %t.o
-# RUN: llvm-dwarfdump --verify %t.o | FileCheck %s
-# RUN: llvm-dwarfdump --verify --quiet %t.o | FileCheck %s --check-prefix=QUIET --allow-empty
-
-# RUN: llvm-mc %S/empty-CU.s -filetype obj -triple x86_64-apple-darwin -o %t-error.o
-# RUN: not llvm-dwarfdump --verify %t-error.o | FileCheck %s
-# RUN: not llvm-dwarfdump --verify --quiet %t-error.o | FileCheck %s --check-prefix=QUIET --allow-empty
-
-# CHECK: {{.}}
-# QUIET-NOT: {{.}}
+# RUN: llvm-mc %S/brief.s -filetype obj -triple x86_64-apple-darwin -o %t.o
+# RUN: llvm-dwarfdump --verify %t.o | FileCheck %s
+# RUN: llvm-dwarfdump --verify --quiet %t.o | FileCheck %s --check-prefix=QUIET --allow-empty
+
+# RUN: llvm-mc %S/empty-CU.s -filetype obj -triple x86_64-apple-darwin -o %t-error.o
+# RUN: not llvm-dwarfdump --verify %t-error.o | FileCheck %s
+# RUN: not llvm-dwarfdump --verify --quiet %t-error.o | FileCheck %s --check-prefix=QUIET --allow-empty
+
+# CHECK: {{.}}
+# QUIET-NOT: {{.}}
diff --git a/llvm/test/tools/llvm-lto/thinlto.ll b/llvm/test/tools/llvm-lto/thinlto.ll
index 8eb7e7286e6c..23843e07d6a5 100644
--- a/llvm/test/tools/llvm-lto/thinlto.ll
+++ b/llvm/test/tools/llvm-lto/thinlto.ll
@@ -12,8 +12,8 @@
 ; COMBINED-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; COMBINED-NEXT: <VERSION
 ; COMBINED-NEXT: <FLAGS
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
-; COMBINED-NEXT: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
+; COMBINED-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
+; COMBINED-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
 ; COMBINED-NEXT: <COMBINED
 ; COMBINED-NEXT: <COMBINED
 ; COMBINED-NEXT: </GLOBALVAL_SUMMARY_BLOCK
diff --git a/llvm/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s b/llvm/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s
index ecfd019452af..271bd836eb24 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM3
+# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false -skip-unsupported-instructions=parse-failure < %s | FileCheck %s -check-prefixes=ALL,EM3
 # RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM4
 # RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM5
 
diff --git a/llvm/test/tools/llvm-mca/AArch64/Exynos/float-integer.s b/llvm/test/tools/llvm-mca/AArch64/Exynos/float-integer.s
index 16c710553f75..f95e530a41fe 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Exynos/float-integer.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Exynos/float-integer.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM3
+# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false -skip-unsupported-instructions=parse-failure < %s | FileCheck %s -check-prefixes=ALL,EM3
 # RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM4
 # RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM5
 
diff --git a/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s
index 207822b61839..b29697ea7972 100644
--- a/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s
@@ -52,22 +52,22 @@ madd x0, x0, x0, x0
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      4
-# CHECK-NEXT: Total Cycles:      13
+# CHECK-NEXT: Total Cycles:      12
 # CHECK-NEXT: Total uOps:        4
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.31
-# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   . .   mul	x0, x1, x2
-# CHECK-NEXT: [0,1]     D=eeeeER  . .   madd	x0, x1, x2, x0
-# CHECK-NEXT: [0,2]     D==eeeeER . .   madd	x0, x1, x2, x0
-# CHECK-NEXT: [0,3]     D======eeeeER   madd	x0, x0, x0, x0
+# CHECK:      [0,0]     DeeeeER   ..   mul	x0, x1, x2
+# CHECK-NEXT: [0,1]     D==eeeeER ..   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,2]     D=eeeeE-R ..   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,3]     D=====eeeeER   madd	x0, x0, x0, x0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -77,7 +77,7 @@ madd x0, x0, x0, x0
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       mul	x0, x1, x2
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       madd	x0, x1, x2, x0
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       madd	x0, x1, x2, x0
-# CHECK-NEXT: 3.     1     7.0    0.0    0.0       madd	x0, x0, x0, x0
-# CHECK-NEXT:        1     3.3    0.3    0.0       <total>
+# CHECK-NEXT: 1.     1     3.0    3.0    0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 2.     1     2.0    2.0    1.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 3.     1     6.0    0.0    0.0       madd	x0, x0, x0, x0
+# CHECK-NEXT:        1     3.0    1.5    0.3       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-zero-dependency.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-zero-dependency.s
new file mode 100644
index 000000000000..071329fd00cd
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-zero-dependency.s
@@ -0,0 +1,76 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+mov x0, x1
+cmp x0, #4
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      54
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    3.70
+# CHECK-NEXT: IPC:               3.70
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.25                        mov	x0, x1
+# CHECK-NEXT:  1      1     0.33                        cmp	x0, #4
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -      -      -      -     0.50   0.50   0.50   0.50    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -     0.48   0.50   0.01   0.01    -      -      -      -     mov	x0, x1
+# CHECK-NEXT:  -      -      -      -      -      -      -     0.02    -     0.49   0.49    -      -      -      -     cmp	x0, #4
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345
+
+# CHECK:      [0,0]     DeER .   mov	x0, x1
+# CHECK-NEXT: [0,1]     D=eER.   cmp	x0, #4
+# CHECK-NEXT: [1,0]     DeE-R.   mov	x0, x1
+# CHECK-NEXT: [1,1]     D=eER.   cmp	x0, #4
+# CHECK-NEXT: [2,0]     DeE-R.   mov	x0, x1
+# CHECK-NEXT: [2,1]     D=eER.   cmp	x0, #4
+# CHECK-NEXT: [3,0]     DeE-R.   mov	x0, x1
+# CHECK-NEXT: [3,1]     D==eER   cmp	x0, #4
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.0    1.0    0.8       mov	x0, x1
+# CHECK-NEXT: 1.     4     2.3    0.3    0.0       cmp	x0, #4
+# CHECK-NEXT:        4     1.6    0.6    0.4       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/bad-input.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/bad-input.s
new file mode 100644
index 000000000000..41891adc6c0d
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/bad-input.s
@@ -0,0 +1,16 @@
+# This test is generic but not all builders have an llvm-mca which can run natively.
+
+# RUN: not llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK %s
+# RUN: not llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 -skip-unsupported-instructions=none %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK %s
+# RUN: not llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 -skip-unsupported-instructions=lack-sched %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK %s
+# RUN: not llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 -skip-unsupported-instructions=parse-failure %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK-SKIP %s
+# RUN: not llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 -skip-unsupported-instructions=any %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK-SKIP %s
+
+# Test checks that MCA does not produce a total cycles estimate if it encounters parse errors.
+
+# CHECK-ALL-NOT: Total Cycles:
+
+# CHECK: error: Assembly input parsing had errors, use -skip-unsupported-instructions=parse-failure to drop failing lines from the input.
+# CHECK-SKIP: error: no assembly instructions found.
+
+This is not a valid assembly file for any architecture (by virtue of this text.)
diff --git a/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s b/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s
index d686293c9b43..9c2ae8fb2aa5 100644
--- a/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s
@@ -33,7 +33,6 @@
   adc	pc, r5, r6, ror #2
   adc	r4, r5, r6, ror #31
   adc	r6, r7, r8, lsl r9
-  adc	pc, r7, r8, lsl r9
   adc	r6, r7, r8, lsr r9
   adc	r6, r7, r8, asr r9
   adc	r6, r7, r8, ror r9
diff --git a/llvm/test/tools/llvm-mca/ARM/cortex-a57-thumb.s b/llvm/test/tools/llvm-mca/ARM/cortex-a57-thumb.s
index 21accd7e2e18..6c56e1dbf024 100644
--- a/llvm/test/tools/llvm-mca/ARM/cortex-a57-thumb.s
+++ b/llvm/test/tools/llvm-mca/ARM/cortex-a57-thumb.s
@@ -95,12 +95,13 @@
   itett	ne
   cmpne	r7, #243
   addeq	r7, r1, r2
+  addne	r7, r1, r2
+  uxthne	r7, r7
   itttt	lt
   cmplt	r7, #243
   uxthlt	r7, r1
   strhlt	r2, [r7, #22]
   lsrlt	r1, r6, #3
-  uxthne	r7, r7
   strh	r2, [r7, #22]
   asrs	r1, r6, #7
   lsrs	r1, r6, #31
@@ -253,7 +254,7 @@
   ldrd	r0, r1, [r2, #-0]!
   ldrd	r0, r1, [r2, #0]!
   ldrd	r0, r1, [r2, #-0]
-  ldrd	r1, r1, [r0], #0
+  ldrd	r1, r2, [r0], #0
   ldrex	r1, [r4]
   ldrex	r8, [r4]
   ldrex	r2, [sp, #128]
@@ -648,7 +649,7 @@
   str	r10, [r11], #0
   strd	r1, r1, [r0], #0
   strd	r6, r3, [r5], #-8
-  strd	r8, r5, [r5], #-0
+  strd	r8, r5, [r6], #-0
   strd	r7, r4, [r5], #-4
   strd	r0, r1, [r2, #-0]!
   strd	r0, r1, [r2, #0]!
@@ -1010,6 +1011,13 @@
 # CHECK-NEXT:  0      0     0.00                  U     itett	ne
 # CHECK-NEXT:  1      1     0.50                        cmpne	r7, #243
 # CHECK-NEXT:  1      1     0.50                        addeq	r7, r1, r2
+# CHECK-NEXT:  1      1     0.50                        addne	r7, r1, r2
+# CHECK-NEXT:  1      1     0.50                        uxthne	r7, r7
+# CHECK-NEXT:  0      0     0.00                  U     itttt	lt
+# CHECK-NEXT:  1      1     0.50                        cmplt	r7, #243
+# CHECK-NEXT:  1      1     0.50                        uxthlt	r7, r1
+# CHECK-NEXT:  1      1     1.00           *            strhlt	r2, [r7, #22]
+# CHECK-NEXT:  1      1     0.50                        lsrlt	r1, r6, #3
 # CHECK-NEXT:  1      1     1.00           *            strh	r2, [r7, #22]
 # CHECK-NEXT:  1      2     1.00                        asrs	r1, r6, #7
 # CHECK-NEXT:  1      2     1.00                        lsrs	r1, r6, #31
@@ -1162,6 +1170,7 @@
 # CHECK-NEXT:  4      4     2.00    *                   ldrd	r0, r1, [r2, #-0]!
 # CHECK-NEXT:  4      4     2.00    *                   ldrd	r0, r1, [r2, #0]!
 # CHECK-NEXT:  2      4     2.00    *                   ldrd	r0, r1, [r2, #-0]
+# CHECK-NEXT:  4      4     2.00    *                   ldrd	r1, r2, [r0], #0
 # CHECK-NEXT:  0      0     0.00    *      *      U     ldrex	r1, [r4]
 # CHECK-NEXT:  0      0     0.00    *      *      U     ldrex	r8, [r4]
 # CHECK-NEXT:  0      0     0.00    *      *      U     ldrex	r2, [sp, #128]
@@ -1556,6 +1565,7 @@
 # CHECK-NEXT:  2      1     1.00           *            str	r10, [r11], #0
 # CHECK-NEXT:  2      1     1.00           *            strd	r1, r1, [r0], #0
 # CHECK-NEXT:  2      1     1.00           *            strd	r6, r3, [r5], #-8
+# CHECK-NEXT:  2      1     1.00           *            strd	r8, r5, [r6], #-0
 # CHECK-NEXT:  2      1     1.00           *            strd	r7, r4, [r5], #-4
 # CHECK-NEXT:  2      1     1.00           *            strd	r0, r1, [r2, #-0]!
 # CHECK-NEXT:  2      1     1.00           *            strd	r0, r1, [r2, #0]!
@@ -1827,7 +1837,7 @@
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1.0]  [1.1]  [2]    [3]    [4]    [5]    [6]
-# CHECK-NEXT: 12.00  164.00 164.00 221.00 313.00 44.00   -      -
+# CHECK-NEXT: 12.00  168.00 168.00 223.00 313.00 46.00   -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1.0]  [1.1]  [2]    [3]    [4]    [5]    [6]    Instructions:
@@ -1924,6 +1934,13 @@
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     itett	ne
 # CHECK-NEXT:  -     0.50   0.50    -      -      -      -      -     cmpne	r7, #243
 # CHECK-NEXT:  -     0.50   0.50    -      -      -      -      -     addeq	r7, r1, r2
+# CHECK-NEXT:  -     0.50   0.50    -      -      -      -      -     addne	r7, r1, r2
+# CHECK-NEXT:  -     0.50   0.50    -      -      -      -      -     uxthne	r7, r7
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     itttt	lt
+# CHECK-NEXT:  -     0.50   0.50    -      -      -      -      -     cmplt	r7, #243
+# CHECK-NEXT:  -     0.50   0.50    -      -      -      -      -     uxthlt	r7, r1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     strhlt	r2, [r7, #22]
+# CHECK-NEXT:  -     0.50   0.50    -      -      -      -      -     lsrlt	r1, r6, #3
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -     strh	r2, [r7, #22]
 # CHECK-NEXT:  -      -      -      -     1.00    -      -      -     asrs	r1, r6, #7
 # CHECK-NEXT:  -      -      -      -     1.00    -      -      -     lsrs	r1, r6, #31
@@ -2076,6 +2093,7 @@
 # CHECK-NEXT:  -     1.00   1.00   2.00    -      -      -      -     ldrd	r0, r1, [r2, #-0]!
 # CHECK-NEXT:  -     1.00   1.00   2.00    -      -      -      -     ldrd	r0, r1, [r2, #0]!
 # CHECK-NEXT:  -      -      -     2.00    -      -      -      -     ldrd	r0, r1, [r2, #-0]
+# CHECK-NEXT:  -     1.00   1.00   2.00    -      -      -      -     ldrd	r1, r2, [r0], #0
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     ldrex	r1, [r4]
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     ldrex	r8, [r4]
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     ldrex	r2, [sp, #128]
@@ -2470,6 +2488,7 @@
 # CHECK-NEXT:  -     0.50   0.50    -      -     1.00    -      -     str	r10, [r11], #0
 # CHECK-NEXT:  -     0.50   0.50    -      -     1.00    -      -     strd	r1, r1, [r0], #0
 # CHECK-NEXT:  -     0.50   0.50    -      -     1.00    -      -     strd	r6, r3, [r5], #-8
+# CHECK-NEXT:  -     0.50   0.50    -      -     1.00    -      -     strd	r8, r5, [r6], #-0
 # CHECK-NEXT:  -     0.50   0.50    -      -     1.00    -      -     strd	r7, r4, [r5], #-4
 # CHECK-NEXT:  -     0.50   0.50    -      -     1.00    -      -     strd	r0, r1, [r2, #-0]!
 # CHECK-NEXT:  -     0.50   0.50    -      -     1.00    -      -     strd	r0, r1, [r2, #0]!
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvbb.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvbb.s
new file mode 100644
index 000000000000..61915375dd28
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvbb.s
@@ -0,0 +1,461 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-p670 -iterations=1 < %s | FileCheck %s
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vandn.vv v4, v8, v12
+vandn.vx v4, v8, a0
+vbrev.v v4, v8
+vbrev8.v v4, v8
+vrev8.v v4, v8
+vclz.v v4, v8
+vctz.v v4, v8
+vcpop.v v4, v8
+vrol.vv v4, v8, v12
+vrol.vx v4, v8, a0
+vror.vv v4, v8, v12
+vror.vx v4, v8, a0
+vror.vi v4, v8, 8
+
+vwsll.vv v4, v8, v12
+vwsll.vx v4, v8, a0
+vwsll.vi v4, v8, 8
+
+vsetvli zero, zero, e16, mf4, tu, mu
+vandn.vv v4, v8, v12
+vandn.vx v4, v8, a0
+vbrev.v v4, v8
+vbrev8.v v4, v8
+vrev8.v v4, v8
+vclz.v v4, v8
+vctz.v v4, v8
+vcpop.v v4, v8
+vrol.vv v4, v8, v12
+vrol.vx v4, v8, a0
+vror.vv v4, v8, v12
+vror.vx v4, v8, a0
+vror.vi v4, v8, 8
+
+vwsll.vv v4, v8, v12
+vwsll.vx v4, v8, a0
+vwsll.vi v4, v8, 8
+
+vsetvli zero, zero, e32, mf2, tu, mu
+vandn.vv v4, v8, v12
+vandn.vx v4, v8, a0
+vbrev.v v4, v8
+vbrev8.v v4, v8
+vrev8.v v4, v8
+vclz.v v4, v8
+vctz.v v4, v8
+vcpop.v v4, v8
+vrol.vv v4, v8, v12
+vrol.vx v4, v8, a0
+vror.vv v4, v8, v12
+vror.vx v4, v8, a0
+vror.vi v4, v8, 8
+
+vwsll.vv v4, v8, v12
+vwsll.vx v4, v8, a0
+vwsll.vi v4, v8, 8
+
+vsetvli zero, zero, e32, m1, tu, mu
+vandn.vv v4, v8, v12
+vandn.vx v4, v8, a0
+vbrev.v v4, v8
+vbrev8.v v4, v8
+vrev8.v v4, v8
+vclz.v v4, v8
+vctz.v v4, v8
+vcpop.v v4, v8
+vrol.vv v4, v8, v12
+vrol.vx v4, v8, a0
+vror.vv v4, v8, v12
+vror.vx v4, v8, a0
+vror.vi v4, v8, 8
+
+vwsll.vv v4, v8, v12
+vwsll.vx v4, v8, a0
+vwsll.vi v4, v8, 8
+
+vsetvli zero, zero, e32, m2, tu, mu
+vandn.vv v4, v8, v12
+vandn.vx v4, v8, a0
+vbrev.v v4, v8
+vbrev8.v v4, v8
+vrev8.v v4, v8
+vclz.v v4, v8
+vctz.v v4, v8
+vcpop.v v4, v8
+vrol.vv v4, v8, v12
+vrol.vx v4, v8, a0
+vror.vv v4, v8, v12
+vror.vx v4, v8, a0
+vror.vi v4, v8, 8
+
+vwsll.vv v4, v8, v12
+vwsll.vx v4, v8, a0
+vwsll.vi v4, v8, 8
+
+vsetvli zero, zero, e32, m4, tu, mu
+vandn.vv v4, v8, v12
+vandn.vx v4, v8, a0
+vbrev.v v4, v8
+vbrev8.v v4, v8
+vrev8.v v4, v8
+vclz.v v4, v8
+vctz.v v4, v8
+vcpop.v v4, v8
+vrol.vv v4, v8, v12
+vrol.vx v4, v8, a0
+vror.vv v4, v8, v12
+vror.vx v4, v8, a0
+vror.vi v4, v8, 8
+
+vwsll.vv v8, v4, v12
+vwsll.vx v8, v4, a0
+vwsll.vi v8, v4, 8
+
+vsetvli zero, zero, e32, m8, tu, mu
+vandn.vv v8, v16, v24
+vandn.vx v8, v16, a0
+vbrev.v  v8, v16
+vbrev8.v v8, v16
+vrev8.v  v8, v16
+vclz.v   v8, v16
+vctz.v   v8, v16
+vcpop.v  v8, v16
+vrol.vv  v8, v16, v24
+vrol.vx  v8, v16, a0
+vror.vv  v8, v16, v24
+vror.vx  v8, v16, a0
+vror.vi  v8, v16, 8
+
+# Show SEW does not matter
+vsetvli zero, zero, e16, m4, tu, mu
+vandn.vv v4, v8, v12
+vandn.vx v4, v8, a0
+vbrev.v  v4, v8
+vbrev8.v v4, v8
+vrev8.v  v4, v8
+vclz.v   v4, v8
+vctz.v   v4, v8
+vcpop.v  v4, v8
+vrol.vv  v4, v8, v12
+vrol.vx  v4, v8, a0
+vror.vv  v4, v8, v12
+vror.vx  v4, v8, a0
+vror.vi  v4, v8, 8
+vwsll.vv v8, v4, v12
+vwsll.vx v8, v4, a0
+vwsll.vi v8, v4, 8
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      133
+# CHECK-NEXT: Total Cycles:      166
+# CHECK-NEXT: Total uOps:        133
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.80
+# CHECK-NEXT: IPC:               0.80
+# CHECK-NEXT: Block RThroughput: 164.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      1     0.50                        vandn.vv	v4, v8, v12
+# CHECK-NEXT:  1      1     0.50                        vandn.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vbrev.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vbrev8.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vrev8.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vclz.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vctz.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vcpop.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vrol.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vrol.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vror.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vror.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vror.vi	v4, v8, 8
+# CHECK-NEXT:  1      2     0.50                        vwsll.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vwsll.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vwsll.vi	v4, v8, 8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      1     0.50                        vandn.vv	v4, v8, v12
+# CHECK-NEXT:  1      1     0.50                        vandn.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vbrev.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vbrev8.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vrev8.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vclz.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vctz.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vcpop.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vrol.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vrol.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vror.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vror.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vror.vi	v4, v8, 8
+# CHECK-NEXT:  1      2     0.50                        vwsll.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vwsll.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vwsll.vi	v4, v8, 8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      1     0.50                        vandn.vv	v4, v8, v12
+# CHECK-NEXT:  1      1     0.50                        vandn.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vbrev.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vbrev8.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vrev8.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vclz.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vctz.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vcpop.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vrol.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vrol.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vror.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vror.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vror.vi	v4, v8, 8
+# CHECK-NEXT:  1      2     0.50                        vwsll.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vwsll.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vwsll.vi	v4, v8, 8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      1     0.50                        vandn.vv	v4, v8, v12
+# CHECK-NEXT:  1      1     0.50                        vandn.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vbrev.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vbrev8.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vrev8.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vclz.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vctz.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vcpop.v	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vrol.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vrol.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vror.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vror.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vror.vi	v4, v8, 8
+# CHECK-NEXT:  1      2     0.50                        vwsll.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vwsll.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vwsll.vi	v4, v8, 8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      1     1.00                        vandn.vv	v4, v8, v12
+# CHECK-NEXT:  1      1     1.00                        vandn.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     1.00                        vbrev.v	v4, v8
+# CHECK-NEXT:  1      2     1.00                        vbrev8.v	v4, v8
+# CHECK-NEXT:  1      2     1.00                        vrev8.v	v4, v8
+# CHECK-NEXT:  1      2     1.00                        vclz.v	v4, v8
+# CHECK-NEXT:  1      2     1.00                        vctz.v	v4, v8
+# CHECK-NEXT:  1      2     1.00                        vcpop.v	v4, v8
+# CHECK-NEXT:  1      2     1.00                        vrol.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     1.00                        vrol.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     1.00                        vror.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     1.00                        vror.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     1.00                        vror.vi	v4, v8, 8
+# CHECK-NEXT:  1      2     1.00                        vwsll.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     1.00                        vwsll.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     1.00                        vwsll.vi	v4, v8, 8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      1     2.00                        vandn.vv	v4, v8, v12
+# CHECK-NEXT:  1      1     2.00                        vandn.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     2.00                        vbrev.v	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vbrev8.v	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vrev8.v	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vclz.v	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vctz.v	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vcpop.v	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vrol.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     2.00                        vrol.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     2.00                        vror.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     2.00                        vror.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     2.00                        vror.vi	v4, v8, 8
+# CHECK-NEXT:  1      2     2.00                        vwsll.vv	v8, v4, v12
+# CHECK-NEXT:  1      2     2.00                        vwsll.vx	v8, v4, a0
+# CHECK-NEXT:  1      2     2.00                        vwsll.vi	v8, v4, 8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      1     4.00                        vandn.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     4.00                        vandn.vx	v8, v16, a0
+# CHECK-NEXT:  1      2     4.00                        vbrev.v	v8, v16
+# CHECK-NEXT:  1      2     4.00                        vbrev8.v	v8, v16
+# CHECK-NEXT:  1      2     4.00                        vrev8.v	v8, v16
+# CHECK-NEXT:  1      2     4.00                        vclz.v	v8, v16
+# CHECK-NEXT:  1      2     4.00                        vctz.v	v8, v16
+# CHECK-NEXT:  1      2     4.00                        vcpop.v	v8, v16
+# CHECK-NEXT:  1      2     4.00                        vrol.vv	v8, v16, v24
+# CHECK-NEXT:  1      2     4.00                        vrol.vx	v8, v16, a0
+# CHECK-NEXT:  1      2     4.00                        vror.vv	v8, v16, v24
+# CHECK-NEXT:  1      2     4.00                        vror.vx	v8, v16, a0
+# CHECK-NEXT:  1      2     4.00                        vror.vi	v8, v16, 8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      1     2.00                        vandn.vv	v4, v8, v12
+# CHECK-NEXT:  1      1     2.00                        vandn.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     2.00                        vbrev.v	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vbrev8.v	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vrev8.v	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vclz.v	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vctz.v	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vcpop.v	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vrol.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     2.00                        vrol.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     2.00                        vror.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     2.00                        vror.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     2.00                        vror.vi	v4, v8, 8
+# CHECK-NEXT:  1      2     2.00                        vwsll.vv	v8, v4, v12
+# CHECK-NEXT:  1      2     2.00                        vwsll.vx	v8, v4, a0
+# CHECK-NEXT:  1      2     2.00                        vwsll.vi	v8, v4, 8
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SiFiveP600Div
+# CHECK-NEXT: [1]   - SiFiveP600FEXQ0
+# CHECK-NEXT: [2]   - SiFiveP600FEXQ1
+# CHECK-NEXT: [3]   - SiFiveP600FloatDiv
+# CHECK-NEXT: [4]   - SiFiveP600IEXQ0
+# CHECK-NEXT: [5]   - SiFiveP600IEXQ1
+# CHECK-NEXT: [6]   - SiFiveP600IEXQ2
+# CHECK-NEXT: [7]   - SiFiveP600IEXQ3
+# CHECK-NEXT: [8.0] - SiFiveP600LDST
+# CHECK-NEXT: [8.1] - SiFiveP600LDST
+# CHECK-NEXT: [9]   - SiFiveP600VDiv
+# CHECK-NEXT: [10]  - SiFiveP600VEXQ0
+# CHECK-NEXT: [11]  - SiFiveP600VEXQ1
+# CHECK-NEXT: [12]  - SiFiveP600VFloatDiv
+# CHECK-NEXT: [13]  - SiFiveP600VLD
+# CHECK-NEXT: [14]  - SiFiveP600VST
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]
+# CHECK-NEXT:  -      -      -      -     8.00    -      -      -      -      -      -     164.00 164.00  -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   Instructions:
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vandn.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vandn.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vbrev.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vbrev8.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vrev8.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vclz.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vctz.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vcpop.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vrol.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vrol.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vror.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vror.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vror.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vwsll.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vwsll.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vwsll.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vandn.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vandn.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vbrev.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vbrev8.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vrev8.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vclz.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vctz.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vcpop.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vrol.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vrol.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vror.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vror.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vror.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vwsll.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vwsll.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vwsll.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vandn.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vandn.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vbrev.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vbrev8.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vrev8.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vclz.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vctz.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vcpop.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vrol.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vrol.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vror.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vror.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vror.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vwsll.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vwsll.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vwsll.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vandn.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vandn.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vbrev.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vbrev8.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vrev8.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vclz.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vctz.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vcpop.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vrol.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vrol.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vror.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vror.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vror.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vwsll.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vwsll.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vwsll.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vandn.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vandn.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vbrev.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vbrev8.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vrev8.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vclz.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vctz.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vcpop.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vrol.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vrol.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vror.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vror.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vror.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vwsll.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vwsll.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vwsll.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vandn.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vandn.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vbrev.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vbrev8.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vrev8.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vclz.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vctz.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vcpop.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vrol.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vrol.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vror.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vror.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vror.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vwsll.vv	v8, v4, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vwsll.vx	v8, v4, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vwsll.vi	v8, v4, 8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vandn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vandn.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vbrev.v	v8, v16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vbrev8.v	v8, v16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vrev8.v	v8, v16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vclz.v	v8, v16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vctz.v	v8, v16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vcpop.v	v8, v16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vrol.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vrol.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vror.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vror.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vror.vi	v8, v16, 8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vandn.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vandn.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vbrev.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vbrev8.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vrev8.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vclz.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vctz.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vcpop.v	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vrol.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vrol.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vror.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vror.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vror.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vwsll.vv	v8, v4, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vwsll.vx	v8, v4, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vwsll.vi	v8, v4, 8
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvbc.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvbc.s
new file mode 100644
index 000000000000..faf75234ff3b
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvbc.s
@@ -0,0 +1,113 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-p670 -iterations=1 < %s | FileCheck %s
+
+# These instructions only work with e64
+
+vsetvli zero, zero, e64, m1, tu, mu
+vclmul.vv v4, v8, v12
+vclmul.vx v4, v8, a0
+vclmulh.vv v4, v8, v12
+vclmulh.vx v4, v8, a0
+
+vsetvli zero, zero, e64, m2, tu, mu
+vclmul.vv v4, v8, v12
+vclmul.vx v4, v8, a0
+vclmulh.vv v4, v8, v12
+vclmulh.vx v4, v8, a0
+
+vsetvli zero, zero, e64, m4, tu, mu
+vclmul.vv v4, v8, v12
+vclmul.vx v4, v8, a0
+vclmulh.vv v4, v8, v12
+vclmulh.vx v4, v8, a0
+
+vsetvli zero, zero, e64, m8, tu, mu
+vclmul.vv  v8, v12, v24
+vclmul.vx  v8, v12, a0
+vclmulh.vv v8, v12, v24
+vclmulh.vx v8, v12, a0
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      20
+# CHECK-NEXT: Total Cycles:      28
+# CHECK-NEXT: Total uOps:        20
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.71
+# CHECK-NEXT: IPC:               0.71
+# CHECK-NEXT: Block RThroughput: 30.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      2     0.50                        vclmul.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vclmul.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     0.50                        vclmulh.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vclmulh.vx	v4, v8, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      2     1.00                        vclmul.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     1.00                        vclmul.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     1.00                        vclmulh.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     1.00                        vclmulh.vx	v4, v8, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      2     2.00                        vclmul.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     2.00                        vclmul.vx	v4, v8, a0
+# CHECK-NEXT:  1      2     2.00                        vclmulh.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     2.00                        vclmulh.vx	v4, v8, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      2     4.00                        vclmul.vv	v8, v12, v24
+# CHECK-NEXT:  1      2     4.00                        vclmul.vx	v8, v12, a0
+# CHECK-NEXT:  1      2     4.00                        vclmulh.vv	v8, v12, v24
+# CHECK-NEXT:  1      2     4.00                        vclmulh.vx	v8, v12, a0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SiFiveP600Div
+# CHECK-NEXT: [1]   - SiFiveP600FEXQ0
+# CHECK-NEXT: [2]   - SiFiveP600FEXQ1
+# CHECK-NEXT: [3]   - SiFiveP600FloatDiv
+# CHECK-NEXT: [4]   - SiFiveP600IEXQ0
+# CHECK-NEXT: [5]   - SiFiveP600IEXQ1
+# CHECK-NEXT: [6]   - SiFiveP600IEXQ2
+# CHECK-NEXT: [7]   - SiFiveP600IEXQ3
+# CHECK-NEXT: [8.0] - SiFiveP600LDST
+# CHECK-NEXT: [8.1] - SiFiveP600LDST
+# CHECK-NEXT: [9]   - SiFiveP600VDiv
+# CHECK-NEXT: [10]  - SiFiveP600VEXQ0
+# CHECK-NEXT: [11]  - SiFiveP600VEXQ1
+# CHECK-NEXT: [12]  - SiFiveP600VFloatDiv
+# CHECK-NEXT: [13]  - SiFiveP600VLD
+# CHECK-NEXT: [14]  - SiFiveP600VST
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]
+# CHECK-NEXT:  -      -      -      -     4.00    -      -      -      -      -      -     30.00  30.00   -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   Instructions:
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vclmul.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vclmul.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vclmulh.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vclmulh.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vclmul.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vclmul.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vclmulh.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vclmulh.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vclmul.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vclmul.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vclmulh.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vclmulh.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vclmul.vv	v8, v12, v24
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vclmul.vx	v8, v12, a0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vclmulh.vv	v8, v12, v24
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vclmulh.vx	v8, v12, a0
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvkg.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvkg.s
new file mode 100644
index 000000000000..d974a077ab5a
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvkg.s
@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-p670 -iterations=1 < %s | FileCheck %s
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vghsh.vv v4, v8, v12
+vgmul.vv v4, v8
+
+vsetvli zero, zero, e16, mf4, tu, mu
+vghsh.vv v4, v8, v12
+vgmul.vv v4, v8
+
+vsetvli zero, zero, e32, mf2, tu, mu
+vghsh.vv v4, v8, v12
+vgmul.vv v4, v8
+
+vsetvli zero, zero, e32, m1, tu, mu
+vghsh.vv v4, v8, v12
+vgmul.vv v4, v8
+
+vsetvli zero, zero, e32, m2, tu, mu
+vghsh.vv v4, v8, v12
+vgmul.vv v4, v8
+
+vsetvli zero, zero, e32, m4, tu, mu
+vghsh.vv v4, v8, v12
+vgmul.vv v4, v8
+
+vsetvli zero, zero, e32, m8, tu, mu
+vghsh.vv v8, v16, v24
+vgmul.vv v8, v16
+
+# Show SEW does not matter
+vsetvli zero, zero, e64, m4, tu, mu
+vghsh.vv v4, v8, v12
+vgmul.vv v4, v8
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      24
+# CHECK-NEXT: Total Cycles:      45
+# CHECK-NEXT: Total uOps:        24
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.53
+# CHECK-NEXT: IPC:               0.53
+# CHECK-NEXT: Block RThroughput: 36.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      2     4.00                        vghsh.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     4.00                        vgmul.vv	v4, v8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      2     4.00                        vghsh.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     4.00                        vgmul.vv	v4, v8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      2     0.50                        vghsh.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vgmul.vv	v4, v8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      2     0.50                        vghsh.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     0.50                        vgmul.vv	v4, v8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      2     1.00                        vghsh.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     1.00                        vgmul.vv	v4, v8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      2     2.00                        vghsh.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     2.00                        vgmul.vv	v4, v8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      2     4.00                        vghsh.vv	v8, v16, v24
+# CHECK-NEXT:  1      2     4.00                        vgmul.vv	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      2     2.00                        vghsh.vv	v4, v8, v12
+# CHECK-NEXT:  1      2     2.00                        vgmul.vv	v4, v8
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SiFiveP600Div
+# CHECK-NEXT: [1]   - SiFiveP600FEXQ0
+# CHECK-NEXT: [2]   - SiFiveP600FEXQ1
+# CHECK-NEXT: [3]   - SiFiveP600FloatDiv
+# CHECK-NEXT: [4]   - SiFiveP600IEXQ0
+# CHECK-NEXT: [5]   - SiFiveP600IEXQ1
+# CHECK-NEXT: [6]   - SiFiveP600IEXQ2
+# CHECK-NEXT: [7]   - SiFiveP600IEXQ3
+# CHECK-NEXT: [8.0] - SiFiveP600LDST
+# CHECK-NEXT: [8.1] - SiFiveP600LDST
+# CHECK-NEXT: [9]   - SiFiveP600VDiv
+# CHECK-NEXT: [10]  - SiFiveP600VEXQ0
+# CHECK-NEXT: [11]  - SiFiveP600VEXQ1
+# CHECK-NEXT: [12]  - SiFiveP600VFloatDiv
+# CHECK-NEXT: [13]  - SiFiveP600VLD
+# CHECK-NEXT: [14]  - SiFiveP600VST
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]
+# CHECK-NEXT:  -      -      -      -     8.00    -      -      -      -      -      -     36.00  36.00   -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   Instructions:
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vghsh.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vgmul.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vghsh.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vgmul.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vghsh.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vgmul.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vghsh.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vgmul.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vghsh.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vgmul.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vghsh.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vgmul.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vghsh.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vgmul.vv	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vghsh.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vgmul.vv	v4, v8
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvkned.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvkned.s
new file mode 100644
index 000000000000..a5c226e34452
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvkned.s
@@ -0,0 +1,204 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-p670 -iterations=1 < %s | FileCheck %s
+
+# These instructions only support e32
+
+vsetvli zero, zero, e32, mf2, tu, mu
+vaesef.vv v4, v8
+vaesef.vs v4, v8
+vaesem.vv v4, v8
+vaesem.vs v4, v8
+vaesdm.vv v4, v8
+vaesdm.vs v4, v8
+vaeskf1.vi v4, v8, 8
+vaeskf2.vi v4, v8, 8
+vaesz.vs v4, v8
+
+vsetvli zero, zero, e32, m1, tu, mu
+vaesef.vv v4, v8
+vaesef.vs v4, v8
+vaesem.vv v4, v8
+vaesem.vs v4, v8
+vaesdm.vv v4, v8
+vaesdm.vs v4, v8
+vaeskf1.vi v4, v8, 8
+vaeskf2.vi v4, v8, 8
+vaesz.vs v4, v8
+
+vsetvli zero, zero, e32, m2, tu, mu
+vaesef.vv v4, v8
+vaesef.vs v4, v8
+vaesem.vv v4, v8
+vaesem.vs v4, v8
+vaesdm.vv v4, v8
+vaesdm.vs v4, v8
+vaeskf1.vi v4, v8, 8
+vaeskf2.vi v4, v8, 8
+vaesz.vs v4, v8
+
+vsetvli zero, zero, e32, m4, tu, mu
+vaesef.vv v4, v8
+vaesef.vs v4, v8
+vaesem.vv v4, v8
+vaesem.vs v4, v8
+vaesdm.vv v4, v8
+vaesdm.vs v4, v8
+vaeskf1.vi v4, v8, 8
+vaeskf2.vi v4, v8, 8
+vaesz.vs v4, v8
+
+vsetvli zero, zero, e32, m8, tu, mu
+vaesef.vv  v8, v16
+vaesef.vs  v8, v16
+vaesem.vv  v8, v16
+vaesem.vs  v8, v16
+vaesdm.vv  v8, v16
+vaesdm.vs  v8, v16
+vaeskf1.vi v8, v16, 8
+vaeskf2.vi v8, v16, 8
+vaesz.vs   v8, v16
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      50
+# CHECK-NEXT: Total Cycles:      73
+# CHECK-NEXT: Total uOps:        50
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.68
+# CHECK-NEXT: Block RThroughput: 72.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      2     0.50                        vaesef.vv	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vaesef.vs	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vaesem.vv	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vaesem.vs	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vaesdm.vv	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vaesdm.vs	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vaeskf1.vi	v4, v8, 8
+# CHECK-NEXT:  1      2     0.50                        vaeskf2.vi	v4, v8, 8
+# CHECK-NEXT:  1      1     0.50                        vaesz.vs	v4, v8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      2     0.50                        vaesef.vv	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vaesef.vs	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vaesem.vv	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vaesem.vs	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vaesdm.vv	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vaesdm.vs	v4, v8
+# CHECK-NEXT:  1      2     0.50                        vaeskf1.vi	v4, v8, 8
+# CHECK-NEXT:  1      2     0.50                        vaeskf2.vi	v4, v8, 8
+# CHECK-NEXT:  1      1     0.50                        vaesz.vs	v4, v8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      2     1.00                        vaesef.vv	v4, v8
+# CHECK-NEXT:  1      2     1.00                        vaesef.vs	v4, v8
+# CHECK-NEXT:  1      2     1.00                        vaesem.vv	v4, v8
+# CHECK-NEXT:  1      2     1.00                        vaesem.vs	v4, v8
+# CHECK-NEXT:  1      2     1.00                        vaesdm.vv	v4, v8
+# CHECK-NEXT:  1      2     1.00                        vaesdm.vs	v4, v8
+# CHECK-NEXT:  1      2     1.00                        vaeskf1.vi	v4, v8, 8
+# CHECK-NEXT:  1      2     1.00                        vaeskf2.vi	v4, v8, 8
+# CHECK-NEXT:  1      1     1.00                        vaesz.vs	v4, v8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      2     2.00                        vaesef.vv	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vaesef.vs	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vaesem.vv	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vaesem.vs	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vaesdm.vv	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vaesdm.vs	v4, v8
+# CHECK-NEXT:  1      2     2.00                        vaeskf1.vi	v4, v8, 8
+# CHECK-NEXT:  1      2     2.00                        vaeskf2.vi	v4, v8, 8
+# CHECK-NEXT:  1      1     2.00                        vaesz.vs	v4, v8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      2     4.00                        vaesef.vv	v8, v16
+# CHECK-NEXT:  1      2     4.00                        vaesef.vs	v8, v16
+# CHECK-NEXT:  1      2     4.00                        vaesem.vv	v8, v16
+# CHECK-NEXT:  1      2     4.00                        vaesem.vs	v8, v16
+# CHECK-NEXT:  1      2     4.00                        vaesdm.vv	v8, v16
+# CHECK-NEXT:  1      2     4.00                        vaesdm.vs	v8, v16
+# CHECK-NEXT:  1      2     4.00                        vaeskf1.vi	v8, v16, 8
+# CHECK-NEXT:  1      2     4.00                        vaeskf2.vi	v8, v16, 8
+# CHECK-NEXT:  1      1     4.00                        vaesz.vs	v8, v16
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SiFiveP600Div
+# CHECK-NEXT: [1]   - SiFiveP600FEXQ0
+# CHECK-NEXT: [2]   - SiFiveP600FEXQ1
+# CHECK-NEXT: [3]   - SiFiveP600FloatDiv
+# CHECK-NEXT: [4]   - SiFiveP600IEXQ0
+# CHECK-NEXT: [5]   - SiFiveP600IEXQ1
+# CHECK-NEXT: [6]   - SiFiveP600IEXQ2
+# CHECK-NEXT: [7]   - SiFiveP600IEXQ3
+# CHECK-NEXT: [8.0] - SiFiveP600LDST
+# CHECK-NEXT: [8.1] - SiFiveP600LDST
+# CHECK-NEXT: [9]   - SiFiveP600VDiv
+# CHECK-NEXT: [10]  - SiFiveP600VEXQ0
+# CHECK-NEXT: [11]  - SiFiveP600VEXQ1
+# CHECK-NEXT: [12]  - SiFiveP600VFloatDiv
+# CHECK-NEXT: [13]  - SiFiveP600VLD
+# CHECK-NEXT: [14]  - SiFiveP600VST
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]
+# CHECK-NEXT:  -      -      -      -     5.00    -      -      -      -      -      -     71.00  73.00   -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   Instructions:
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vaesef.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vaesef.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vaesem.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vaesem.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vaesdm.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vaesdm.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vaeskf1.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vaeskf2.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vaesz.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vaesef.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vaesef.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vaesem.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vaesem.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vaesdm.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vaesdm.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vaeskf1.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vaeskf2.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vaesz.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vaesef.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vaesef.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vaesem.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vaesem.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vaesdm.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vaesdm.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vaeskf1.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vaeskf2.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vaesz.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vaesef.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vaesef.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vaesem.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vaesem.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vaesdm.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vaesdm.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vaeskf1.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vaeskf2.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vaesz.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vaesef.vv	v8, v16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vaesef.vs	v8, v16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vaesem.vv	v8, v16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vaesem.vs	v8, v16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vaesdm.vv	v8, v16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vaesdm.vs	v8, v16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vaeskf1.vi	v8, v16, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vaeskf2.vi	v8, v16, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vaesz.vs	v8, v16
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvknhb.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvknhb.s
new file mode 100644
index 000000000000..f1a2a1899f0c
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvknhb.s
@@ -0,0 +1,153 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-p670 -iterations=1 < %s | FileCheck %s
+
+# SEW is only e32 or e64
+
+vsetvli zero, zero, e32, m1, tu, mu
+vsha2ms.vv v4, v8, v12
+vsha2ch.vv v4, v8, v12
+vsha2cl.vv v4, v8, v12
+
+vsetvli zero, zero, e32, m2, tu, mu
+vsha2ms.vv v4, v8, v12
+vsha2ch.vv v4, v8, v12
+vsha2cl.vv v4, v8, v12
+
+vsetvli zero, zero, e32, m4, tu, mu
+vsha2ms.vv v4, v8, v12
+vsha2ch.vv v4, v8, v12
+vsha2cl.vv v4, v8, v12
+
+vsetvli zero, zero, e32, m8, tu, mu
+vsha2ms.vv v8, v16, v24
+vsha2ch.vv v8, v16, v24
+vsha2cl.vv v8, v16, v24
+
+vsetvli zero, zero, e64, m1, tu, mu
+vsha2ms.vv v4, v8, v12
+vsha2ch.vv v4, v8, v12
+vsha2cl.vv v4, v8, v12
+
+vsetvli zero, zero, e64, m2, tu, mu
+vsha2ms.vv v4, v8, v12
+vsha2ch.vv v4, v8, v12
+vsha2cl.vv v4, v8, v12
+
+vsetvli zero, zero, e64, m4, tu, mu
+vsha2ms.vv v4, v8, v12
+vsha2ch.vv v4, v8, v12
+vsha2cl.vv v4, v8, v12
+
+vsetvli zero, zero, e64, m8, tu, mu
+vsha2ms.vv v8, v16, v24
+vsha2ch.vv v8, v16, v24
+vsha2cl.vv v8, v16, v24
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      32
+# CHECK-NEXT: Total Cycles:      68
+# CHECK-NEXT: Total uOps:        32
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.47
+# CHECK-NEXT: IPC:               0.47
+# CHECK-NEXT: Block RThroughput: 45.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      3     0.50                        vsha2ms.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     0.50                        vsha2ch.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     0.50                        vsha2cl.vv	v4, v8, v12
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      3     1.00                        vsha2ms.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                        vsha2ch.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                        vsha2cl.vv	v4, v8, v12
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      3     2.00                        vsha2ms.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     2.00                        vsha2ch.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     2.00                        vsha2cl.vv	v4, v8, v12
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      3     4.00                        vsha2ms.vv	v8, v16, v24
+# CHECK-NEXT:  1      3     4.00                        vsha2ch.vv	v8, v16, v24
+# CHECK-NEXT:  1      3     4.00                        vsha2cl.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      3     0.50                        vsha2ms.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     0.50                        vsha2ch.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     0.50                        vsha2cl.vv	v4, v8, v12
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      3     1.00                        vsha2ms.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                        vsha2ch.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                        vsha2cl.vv	v4, v8, v12
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      3     2.00                        vsha2ms.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     2.00                        vsha2ch.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     2.00                        vsha2cl.vv	v4, v8, v12
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      3     4.00                        vsha2ms.vv	v8, v16, v24
+# CHECK-NEXT:  1      3     4.00                        vsha2ch.vv	v8, v16, v24
+# CHECK-NEXT:  1      3     4.00                        vsha2cl.vv	v8, v16, v24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SiFiveP600Div
+# CHECK-NEXT: [1]   - SiFiveP600FEXQ0
+# CHECK-NEXT: [2]   - SiFiveP600FEXQ1
+# CHECK-NEXT: [3]   - SiFiveP600FloatDiv
+# CHECK-NEXT: [4]   - SiFiveP600IEXQ0
+# CHECK-NEXT: [5]   - SiFiveP600IEXQ1
+# CHECK-NEXT: [6]   - SiFiveP600IEXQ2
+# CHECK-NEXT: [7]   - SiFiveP600IEXQ3
+# CHECK-NEXT: [8.0] - SiFiveP600LDST
+# CHECK-NEXT: [8.1] - SiFiveP600LDST
+# CHECK-NEXT: [9]   - SiFiveP600VDiv
+# CHECK-NEXT: [10]  - SiFiveP600VEXQ0
+# CHECK-NEXT: [11]  - SiFiveP600VEXQ1
+# CHECK-NEXT: [12]  - SiFiveP600VFloatDiv
+# CHECK-NEXT: [13]  - SiFiveP600VLD
+# CHECK-NEXT: [14]  - SiFiveP600VST
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]
+# CHECK-NEXT:  -      -      -      -     8.00    -      -      -      -      -      -     44.00  46.00   -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   Instructions:
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vsha2ms.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsha2ch.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsha2cl.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vsha2ms.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vsha2ch.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vsha2cl.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vsha2ms.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vsha2ch.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vsha2cl.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vsha2ms.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vsha2ch.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vsha2cl.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsha2ms.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     vsha2ch.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsha2cl.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vsha2ms.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vsha2ch.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -     vsha2cl.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vsha2ms.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -     vsha2ch.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vsha2cl.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vsha2ms.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vsha2ch.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -     vsha2cl.vv	v8, v16, v24
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvksed.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvksed.s
new file mode 100644
index 000000000000..26fc8702d75f
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvksed.s
@@ -0,0 +1,114 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-p670 -iterations=1 < %s | FileCheck %s
+
+# These instructions only support e32
+
+vsetvli zero, zero, e32, mf2, tu, mu
+vsm4k.vi v4, v8, 8
+vsm4r.vv v4, v8
+vsm4r.vs v4, v8
+
+vsetvli zero, zero, e32, m1, tu, mu
+vsm4k.vi v4, v8, 8
+vsm4r.vv v4, v8
+vsm4r.vs v4, v8
+
+vsetvli zero, zero, e32, m2, tu, mu
+vsm4k.vi v4, v8, 8
+vsm4r.vv v4, v8
+vsm4r.vs v4, v8
+
+vsetvli zero, zero, e32, m4, tu, mu
+vsm4k.vi v4, v8, 8
+vsm4r.vv v4, v8
+vsm4r.vs v4, v8
+
+vsetvli zero, zero, e32, m8, tu, mu
+vsm4k.vi v8, v16, 8
+vsm4r.vv v8, v16
+vsm4r.vs v8, v16
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      20
+# CHECK-NEXT: Total Cycles:      47
+# CHECK-NEXT: Total uOps:        20
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.43
+# CHECK-NEXT: Block RThroughput: 48.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      3     1.00                        vsm4k.vi	v4, v8, 8
+# CHECK-NEXT:  1      3     1.00                        vsm4r.vv	v4, v8
+# CHECK-NEXT:  1      3     1.00                        vsm4r.vs	v4, v8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      3     1.00                        vsm4k.vi	v4, v8, 8
+# CHECK-NEXT:  1      3     1.00                        vsm4r.vv	v4, v8
+# CHECK-NEXT:  1      3     1.00                        vsm4r.vs	v4, v8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      3     2.00                        vsm4k.vi	v4, v8, 8
+# CHECK-NEXT:  1      3     2.00                        vsm4r.vv	v4, v8
+# CHECK-NEXT:  1      3     2.00                        vsm4r.vs	v4, v8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      3     4.00                        vsm4k.vi	v4, v8, 8
+# CHECK-NEXT:  1      3     4.00                        vsm4r.vv	v4, v8
+# CHECK-NEXT:  1      3     4.00                        vsm4r.vs	v4, v8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      3     8.00                        vsm4k.vi	v8, v16, 8
+# CHECK-NEXT:  1      3     8.00                        vsm4r.vv	v8, v16
+# CHECK-NEXT:  1      3     8.00                        vsm4r.vs	v8, v16
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SiFiveP600Div
+# CHECK-NEXT: [1]   - SiFiveP600FEXQ0
+# CHECK-NEXT: [2]   - SiFiveP600FEXQ1
+# CHECK-NEXT: [3]   - SiFiveP600FloatDiv
+# CHECK-NEXT: [4]   - SiFiveP600IEXQ0
+# CHECK-NEXT: [5]   - SiFiveP600IEXQ1
+# CHECK-NEXT: [6]   - SiFiveP600IEXQ2
+# CHECK-NEXT: [7]   - SiFiveP600IEXQ3
+# CHECK-NEXT: [8.0] - SiFiveP600LDST
+# CHECK-NEXT: [8.1] - SiFiveP600LDST
+# CHECK-NEXT: [9]   - SiFiveP600VDiv
+# CHECK-NEXT: [10]  - SiFiveP600VEXQ0
+# CHECK-NEXT: [11]  - SiFiveP600VEXQ1
+# CHECK-NEXT: [12]  - SiFiveP600VFloatDiv
+# CHECK-NEXT: [13]  - SiFiveP600VLD
+# CHECK-NEXT: [14]  - SiFiveP600VST
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]
+# CHECK-NEXT:  -      -      -      -     5.00    -      -      -      -      -      -     48.00   -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   Instructions:
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsm4k.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsm4r.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsm4r.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsm4k.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsm4r.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsm4r.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vsm4k.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vsm4r.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vsm4r.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vsm4k.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vsm4r.vv	v4, v8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vsm4r.vs	v4, v8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vsm4k.vi	v8, v16, 8
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vsm4r.vv	v8, v16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vsm4r.vs	v8, v16
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvksh.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvksh.s
new file mode 100644
index 000000000000..574bbb36c23f
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvksh.s
@@ -0,0 +1,99 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-p670 -iterations=1 < %s | FileCheck %s
+
+# These instructions only support e32
+
+vsetvli zero, zero, e32, mf2, tu, mu
+vsm3me.vv v4, v8, v12
+vsm3c.vi v4, v8, 8
+
+vsetvli zero, zero, e32, m1, tu, mu
+vsm3me.vv v4, v8, v12
+vsm3c.vi v4, v8, 8
+
+vsetvli zero, zero, e32, m2, tu, mu
+vsm3me.vv v4, v8, v12
+vsm3c.vi v4, v8, 8
+
+vsetvli zero, zero, e32, m4, tu, mu
+vsm3me.vv v4, v8, v12
+vsm3c.vi v4, v8, 8
+
+vsetvli zero, zero, e32, m8, tu, mu
+vsm3me.vv v8, v16, v24
+vsm3c.vi  v8, v16, 8
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      15
+# CHECK-NEXT: Total Cycles:      31
+# CHECK-NEXT: Total uOps:        15
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.48
+# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: Block RThroughput: 32.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      3     1.00                        vsm3me.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                        vsm3c.vi	v4, v8, 8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      3     1.00                        vsm3me.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                        vsm3c.vi	v4, v8, 8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      3     2.00                        vsm3me.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     2.00                        vsm3c.vi	v4, v8, 8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      3     4.00                        vsm3me.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     4.00                        vsm3c.vi	v4, v8, 8
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      3     8.00                        vsm3me.vv	v8, v16, v24
+# CHECK-NEXT:  1      3     8.00                        vsm3c.vi	v8, v16, 8
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SiFiveP600Div
+# CHECK-NEXT: [1]   - SiFiveP600FEXQ0
+# CHECK-NEXT: [2]   - SiFiveP600FEXQ1
+# CHECK-NEXT: [3]   - SiFiveP600FloatDiv
+# CHECK-NEXT: [4]   - SiFiveP600IEXQ0
+# CHECK-NEXT: [5]   - SiFiveP600IEXQ1
+# CHECK-NEXT: [6]   - SiFiveP600IEXQ2
+# CHECK-NEXT: [7]   - SiFiveP600IEXQ3
+# CHECK-NEXT: [8.0] - SiFiveP600LDST
+# CHECK-NEXT: [8.1] - SiFiveP600LDST
+# CHECK-NEXT: [9]   - SiFiveP600VDiv
+# CHECK-NEXT: [10]  - SiFiveP600VEXQ0
+# CHECK-NEXT: [11]  - SiFiveP600VEXQ1
+# CHECK-NEXT: [12]  - SiFiveP600VFloatDiv
+# CHECK-NEXT: [13]  - SiFiveP600VLD
+# CHECK-NEXT: [14]  - SiFiveP600VST
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]
+# CHECK-NEXT:  -      -      -      -     5.00    -      -      -      -      -      -     32.00   -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   Instructions:
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsm3me.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsm3c.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsm3me.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     vsm3c.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vsm3me.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     vsm3c.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vsm3me.vv	v4, v8, v12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     4.00    -      -      -      -     vsm3c.vi	v4, v8, 8
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vsm3me.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     8.00    -      -      -      -     vsm3c.vi	v8, v16, 8
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/simple-test.s b/llvm/test/tools/llvm-mca/X86/BtVer2/simple-test.s
index d1285441de5e..715f3706ef88 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/simple-test.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/simple-test.s
@@ -1,5 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 -skip-unsupported-instructions=lack-sched < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 -skip-unsupported-instructions=parse-failure < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 -skip-unsupported-instructions=any < %s | FileCheck %s
 
 add %edi, %eax
 
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s b/llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s
index 0d67f53e12f1..5bd6910369ee 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -skip-unsupported-instructions %s 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK-SKIP %s
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -skip-unsupported-instructions=lack-sched %s 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK-SKIP %s
 # RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK-ERROR %s
 
 # Test defends that if all instructions are skipped leaving an empty input, an error is printed.
@@ -7,7 +7,7 @@ bzhi %eax, %ebx, %ecx
 
 # CHECK-ALL-NOT: error
 
-# CHECK-ERROR: error: found an unsupported instruction in the input assembly sequence, use -skip-unsupported-instructions to ignore.
+# CHECK-ERROR: error: found an unsupported instruction in the input assembly sequence, use -skip-unsupported-instructions=lack-sched to ignore these on the input.
 
 # CHECK-SKIP: warning: found an unsupported instruction in the input assembly sequence, skipping with -skip-unsupported-instructions, note accuracy will be impacted:
 # CHECK-SKIP: note: instruction:      bzhil   %eax, %ebx, %ecx
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s b/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s
index 3690a1101be9..7d3aee5e3bf9 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s
@@ -1,10 +1,13 @@
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -skip-unsupported-instructions -timeline %s 2>&1 | FileCheck --check-prefix=CHECK-SKIP %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -skip-unsupported-instructions=any -timeline %s 2>&1 | FileCheck --check-prefix=CHECK-SKIP %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -skip-unsupported-instructions=lack-sched -timeline %s 2>&1 | FileCheck --check-prefix=CHECK-SKIP %s
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -skip-unsupported-instructions=parse-failure -timeline %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
 # RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
 
-# Test checks that unsupported instructions exit with an error, unless -skip-unsupported-instructions is passed, in which case the remaining instructions should be analysed.
+# Test checks that unsupported instructions exit with an error, unless -skip-unsupported-instructions=lack-sched is passed, in which case the remaining instructions should be analysed.
+# Additionally check that -skip-unsupported-instructions=parse-failure continues to raise the lack of scheduling information.
 
 # CHECK-SKIP: warning: found an unsupported instruction in the input assembly sequence, skipping with -skip-unsupported-instructions, note accuracy will be impacted:
-# CHECK-ERROR: error: found an unsupported instruction in the input assembly sequence, use -skip-unsupported-instructions to ignore.
+# CHECK-ERROR: error: found an unsupported instruction in the input assembly sequence, use -skip-unsupported-instructions=lack-sched to ignore these on the input.
 
 bzhi %eax, %ebx, %ecx
 
diff --git a/llvm/test/tools/llvm-mca/X86/bad-input.s b/llvm/test/tools/llvm-mca/X86/bad-input.s
new file mode 100644
index 000000000000..b49bccc45b2d
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/bad-input.s
@@ -0,0 +1,16 @@
+# This test is generic but not all builders have an llvm-mca which can run natively.
+
+# RUN: not llvm-mca -mtriple=x86_64 -mcpu=x86-64 %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK %s
+# RUN: not llvm-mca -mtriple=x86_64 -mcpu=x86-64 -skip-unsupported-instructions=none %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK %s
+# RUN: not llvm-mca -mtriple=x86_64 -mcpu=x86-64 -skip-unsupported-instructions=lack-sched %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK %s
+# RUN: not llvm-mca -mtriple=x86_64 -mcpu=x86-64 -skip-unsupported-instructions=parse-failure %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK-SKIP %s
+# RUN: not llvm-mca -mtriple=x86_64 -mcpu=x86-64 -skip-unsupported-instructions=any %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK-SKIP %s
+
+# Test checks that MCA does not produce a total cycles estimate if it encounters parse errors.
+
+# CHECK-ALL-NOT: Total Cycles:
+
+# CHECK: error: Assembly input parsing had errors, use -skip-unsupported-instructions=parse-failure to drop failing lines from the input.
+# CHECK-SKIP: error: no assembly instructions found.
+
+This is not a valid assembly file for any architecture (by virtue of this text.)
diff --git a/llvm/test/tools/llvm-objcopy/ELF/dynrelocsec-remove-shinfo-reference.test b/llvm/test/tools/llvm-objcopy/ELF/dynrelocsec-remove-shinfo-reference.test
index b5fd2ee3bc69..2c138e1158a7 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/dynrelocsec-remove-shinfo-reference.test
+++ b/llvm/test/tools/llvm-objcopy/ELF/dynrelocsec-remove-shinfo-reference.test
@@ -1,30 +1,30 @@
-# RUN: yaml2obj %s -o %t
-# RUN: llvm-objcopy -R .got.plt %t %t2
-
-## .rela.plt is a dynamic relocation section that has a connection
-## via sh_info field with its target section .got.plt.
-## Here we check that if the target section is removed then dynamic
-## relocation section is also removed and we do not end up with a broken
-## sh_info value, for example.
-
-# RUN: llvm-readelf --sections %t2 \
-# RUN:  | FileCheck %s --implicit-check-not=".got.plt" --implicit-check-not=".rela.plt"
-
---- !ELF
-FileHeader:
-  Class:          ELFCLASS64
-  Data:           ELFDATA2LSB
-  Type:           ET_DYN
-  Machine:        EM_X86_64
-Sections:
-  - Name:         .rela.plt
-    Type:         SHT_RELA
-    Flags:        [ SHF_ALLOC ]
-    Link:         .dynsym
-    Info:         .got.plt
-  - Name:         .got.plt
-    Type:         SHT_PROGBITS
-    Flags:        [ SHF_WRITE, SHF_ALLOC ]
-DynamicSymbols:
-  - Name:          bar
-    Binding:       STB_GLOBAL
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy -R .got.plt %t %t2
+
+## .rela.plt is a dynamic relocation section that has a connection
+## via sh_info field with its target section .got.plt.
+## Here we check that if the target section is removed then dynamic
+## relocation section is also removed and we do not end up with a broken
+## sh_info value, for example.
+
+# RUN: llvm-readelf --sections %t2 \
+# RUN:  | FileCheck %s --implicit-check-not=".got.plt" --implicit-check-not=".rela.plt"
+
+--- !ELF
+FileHeader:
+  Class:          ELFCLASS64
+  Data:           ELFDATA2LSB
+  Type:           ET_DYN
+  Machine:        EM_X86_64
+Sections:
+  - Name:         .rela.plt
+    Type:         SHT_RELA
+    Flags:        [ SHF_ALLOC ]
+    Link:         .dynsym
+    Info:         .got.plt
+  - Name:         .got.plt
+    Type:         SHT_PROGBITS
+    Flags:        [ SHF_WRITE, SHF_ALLOC ]
+DynamicSymbols:
+  - Name:          bar
+    Binding:       STB_GLOBAL
diff --git a/llvm/test/tools/llvm-objcopy/ELF/dynrelocsec-remove-shlink-reference.test b/llvm/test/tools/llvm-objcopy/ELF/dynrelocsec-remove-shlink-reference.test
index 3cef37d85c94..d0229364c2f9 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/dynrelocsec-remove-shlink-reference.test
+++ b/llvm/test/tools/llvm-objcopy/ELF/dynrelocsec-remove-shlink-reference.test
@@ -1,34 +1,34 @@
-# RUN: yaml2obj %s -o %t
-
-## Check we cannot remove the .dynsym symbol table because dynamic
-## relocation section .rela.dyn still references it via sh_link field.
-# RUN: not llvm-objcopy -R .dynsym %t %t2 2>&1 >/dev/null | FileCheck %s --check-prefix=ERR -DINPUT=%t
-# ERR: error: '[[INPUT]]': symbol table '.dynsym' cannot be removed because it is referenced by the relocation section '.rela.dyn'
-
-## Check we can remove .dynsym after removing the reference.
-# RUN: llvm-objcopy -R .dynsym -R .rela.dyn %t %t2
-# RUN: llvm-readelf --sections %t2 | FileCheck %s --implicit-check-not=".dynsym"
-
-## Check we zero out sh_link field and allow producing output with the --allow-broken-links switch.
-# RUN: llvm-objcopy -R .dynsym --allow-broken-links %t %t2
-# RUN: llvm-readelf --sections %t2 | FileCheck %s --check-prefix=DROP-LINK
-# DROP-LINK:     [Nr] Name      Type Address          Off    Size   ES   Flg L
-# DROP-LINK:     [ 1] .rela.dyn RELA 0000000000000270 000040 000000 18   A   0
-# DROP-LINK-NOT: .dynsym
-
-!ELF
-FileHeader:
-  Class:           ELFCLASS64
-  Data:            ELFDATA2LSB
-  Type:            ET_DYN
-  Machine:         EM_X86_64
-Sections:
-  - Name:          .rela.dyn
-    Type:          SHT_RELA
-    Flags:         [ SHF_ALLOC ]
-    Address:       0x0000000000000270
-    Link:          .dynsym
-    EntSize:       0x0000000000000018
-DynamicSymbols:
-  - Name:          bar
-    Binding:       STB_GLOBAL
+# RUN: yaml2obj %s -o %t
+
+## Check we cannot remove the .dynsym symbol table because dynamic
+## relocation section .rela.dyn still references it via sh_link field.
+# RUN: not llvm-objcopy -R .dynsym %t %t2 2>&1 >/dev/null | FileCheck %s --check-prefix=ERR -DINPUT=%t
+# ERR: error: '[[INPUT]]': symbol table '.dynsym' cannot be removed because it is referenced by the relocation section '.rela.dyn'
+
+## Check we can remove .dynsym after removing the reference.
+# RUN: llvm-objcopy -R .dynsym -R .rela.dyn %t %t2
+# RUN: llvm-readelf --sections %t2 | FileCheck %s --implicit-check-not=".dynsym"
+
+## Check we zero out sh_link field and allow producing output with the --allow-broken-links switch.
+# RUN: llvm-objcopy -R .dynsym --allow-broken-links %t %t2
+# RUN: llvm-readelf --sections %t2 | FileCheck %s --check-prefix=DROP-LINK
+# DROP-LINK:     [Nr] Name      Type Address          Off    Size   ES   Flg L
+# DROP-LINK:     [ 1] .rela.dyn RELA 0000000000000270 000040 000000 18   A   0
+# DROP-LINK-NOT: .dynsym
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_DYN
+  Machine:         EM_X86_64
+Sections:
+  - Name:          .rela.dyn
+    Type:          SHT_RELA
+    Flags:         [ SHF_ALLOC ]
+    Address:       0x0000000000000270
+    Link:          .dynsym
+    EntSize:       0x0000000000000018
+DynamicSymbols:
+  - Name:          bar
+    Binding:       STB_GLOBAL
diff --git a/llvm/test/tools/llvm-objdump/ELF/ARM/literal-arm.s b/llvm/test/tools/llvm-objdump/ELF/ARM/literal-arm.s
index f10e00fe7520..17620a244338 100644
--- a/llvm/test/tools/llvm-objdump/ELF/ARM/literal-arm.s
+++ b/llvm/test/tools/llvm-objdump/ELF/ARM/literal-arm.s
@@ -1,66 +1,66 @@
-@@ Check that PC-relative memory addressing is annotated
-
-@ RUN: llvm-mc %s -triple=armv7 -filetype=obj | \
-@ RUN:   llvm-objdump --no-print-imm-hex -d --no-show-raw-insn --triple=armv7 - | \
-@ RUN:   FileCheck %s
-
-.text
-foo:
-@ CHECK:      00000000 <foo>:
-  .word 0x01020304
-
-_start:
-@ CHECK:      00000004 <_start>:
-
-@@ Check a special case immediate for AddrMode_i12
-  ldr   r1, [pc, #-0]
-@ CHECK-NEXT:    4: ldr   r1, [pc, #-0]           @ 0xc <_start+0x8>
-
-@@ Check AddrMode_i12 instructions, with positive and negative immediates
-  ldr   r0, foo
-  ldrb  r0, bar
-  pli   _start
-  pld   bar
-@ CHECK-NEXT:    8: ldr   r0, [pc, #-16]          @ 0x0 <foo>
-@ CHECK-NEXT:    c: ldrb  r0, [pc, #48]           @ 0x44 <bar>
-@ CHECK-NEXT:   10: pli   [pc, #-20]              @ 0x4 <_start>
-@ CHECK-NEXT:   14: pld   [pc, #40]               @ 0x44 <bar>
-
-@@ Check that AddrMode_i12 instructions that do not use PC-relative addressing
-@@ are not annotated
-  ldr   r0, [r1, #8]
-@ CHECK-NEXT:   18: ldr   r0, [r1, #8]{{$}}
-
-@@ Check AddrMode3 instructions, with positive and negative immediates
-  ldrd  r0, r1, foo
-  ldrh  r0, bar
-@ CHECK-NEXT:   1c: ldrd  r0, r1, [pc, #-36]      @ 0x0 <foo>
-@ CHECK-NEXT:   20: ldrh  r0, [pc, #28]           @ 0x44 <bar>
-
-@@ Check that AddrMode3 instruction that do not use PC+imm addressing are not
-@@ annotated
-  ldrh  r0, [r1, #8]
-  ldrh  r0, [pc, r2]
-@ CHECK-NEXT:   24: ldrh  r0, [r1, #8]{{$}}
-@ CHECK-NEXT:   28: ldrh  r0, [pc, r2]{{$}}
-
-@@ Check AddrMode5 instructions, with positive and negative immediates
-  ldc   p14, c5, foo
-  ldcl  p6, c4, bar
-  ldc2  p5, c2, foo
-  ldc2l p3, c1, bar
-@ CHECK-NEXT:   2c: ldc   p14, c5, [pc, #-52]     @ 0x0 <foo>
-@ CHECK-NEXT:   30: ldcl  p6, c4, [pc, #12]       @ 0x44 <bar>
-@ CHECK-NEXT:   34: ldc2  p5, c2, [pc, #-60]      @ 0x0 <foo>
-@ CHECK-NEXT:   38: ldc2l p3, c1, [pc, #4]        @ 0x44 <bar>
-
-@@ Check that AddrMode5 instruction that do not use PC+imm addressing are not
-@@ annotated
-  ldc   p14, c5, [r1, #8]
-  ldc   p14, c5, [pc], {16}
-@ CHECK-NEXT:   3c: ldc   p14, c5, [r1, #8]{{$}}
-@ CHECK-NEXT:   40: ldc   p14, c5, [pc], {16}{{$}}
-
-bar:
-@ CHECK:      00000044 <bar>:
-  .word 0x01020304
+@@ Check that PC-relative memory addressing is annotated
+
+@ RUN: llvm-mc %s -triple=armv7 -filetype=obj | \
+@ RUN:   llvm-objdump --no-print-imm-hex -d --no-show-raw-insn --triple=armv7 - | \
+@ RUN:   FileCheck %s
+
+.text
+foo:
+@ CHECK:      00000000 <foo>:
+  .word 0x01020304
+
+_start:
+@ CHECK:      00000004 <_start>:
+
+@@ Check a special case immediate for AddrMode_i12
+  ldr   r1, [pc, #-0]
+@ CHECK-NEXT:    4: ldr   r1, [pc, #-0]           @ 0xc <_start+0x8>
+
+@@ Check AddrMode_i12 instructions, with positive and negative immediates
+  ldr   r0, foo
+  ldrb  r0, bar
+  pli   _start
+  pld   bar
+@ CHECK-NEXT:    8: ldr   r0, [pc, #-16]          @ 0x0 <foo>
+@ CHECK-NEXT:    c: ldrb  r0, [pc, #48]           @ 0x44 <bar>
+@ CHECK-NEXT:   10: pli   [pc, #-20]              @ 0x4 <_start>
+@ CHECK-NEXT:   14: pld   [pc, #40]               @ 0x44 <bar>
+
+@@ Check that AddrMode_i12 instructions that do not use PC-relative addressing
+@@ are not annotated
+  ldr   r0, [r1, #8]
+@ CHECK-NEXT:   18: ldr   r0, [r1, #8]{{$}}
+
+@@ Check AddrMode3 instructions, with positive and negative immediates
+  ldrd  r0, r1, foo
+  ldrh  r0, bar
+@ CHECK-NEXT:   1c: ldrd  r0, r1, [pc, #-36]      @ 0x0 <foo>
+@ CHECK-NEXT:   20: ldrh  r0, [pc, #28]           @ 0x44 <bar>
+
+@@ Check that AddrMode3 instruction that do not use PC+imm addressing are not
+@@ annotated
+  ldrh  r0, [r1, #8]
+  ldrh  r0, [pc, r2]
+@ CHECK-NEXT:   24: ldrh  r0, [r1, #8]{{$}}
+@ CHECK-NEXT:   28: ldrh  r0, [pc, r2]{{$}}
+
+@@ Check AddrMode5 instructions, with positive and negative immediates
+  ldc   p14, c5, foo
+  ldcl  p6, c4, bar
+  ldc2  p5, c2, foo
+  ldc2l p3, c1, bar
+@ CHECK-NEXT:   2c: ldc   p14, c5, [pc, #-52]     @ 0x0 <foo>
+@ CHECK-NEXT:   30: ldcl  p6, c4, [pc, #12]       @ 0x44 <bar>
+@ CHECK-NEXT:   34: ldc2  p5, c2, [pc, #-60]      @ 0x0 <foo>
+@ CHECK-NEXT:   38: ldc2l p3, c1, [pc, #4]        @ 0x44 <bar>
+
+@@ Check that AddrMode5 instruction that do not use PC+imm addressing are not
+@@ annotated
+  ldc   p14, c5, [r1, #8]
+  ldc   p14, c5, [pc], {16}
+@ CHECK-NEXT:   3c: ldc   p14, c5, [r1, #8]{{$}}
+@ CHECK-NEXT:   40: ldc   p14, c5, [pc], {16}{{$}}
+
+bar:
+@ CHECK:      00000044 <bar>:
+  .word 0x01020304
diff --git a/llvm/test/tools/llvm-objdump/ELF/ARM/literal-thumb.s b/llvm/test/tools/llvm-objdump/ELF/ARM/literal-thumb.s
index 9f002c975ea3..767e37b0bed9 100644
--- a/llvm/test/tools/llvm-objdump/ELF/ARM/literal-thumb.s
+++ b/llvm/test/tools/llvm-objdump/ELF/ARM/literal-thumb.s
@@ -1,24 +1,24 @@
-@@ Check that PC-relative memory addressing is annotated
-
-@ RUN: llvm-mc %s -triple=thumbv6m -filetype=obj | \
-@ RUN:   llvm-objdump --no-print-imm-hex -d --no-show-raw-insn --triple=thumbv6m - | \
-@ RUN:   FileCheck %s
-
-.text
-_start:
-@ CHECK:      00000000 <_start>:
-
-@@ Check AddrModeT1_s instruction, with 4-byte and 2-byte alignment
-  ldr r0, bar
-  ldr r1, bar
-  ldr r2, bar
-  ldr r3, bar
-@ CHECK-NEXT:   0: ldr    r0, [pc, #4]            @ 0x8 <bar>
-@ CHECK-NEXT:   2: ldr    r1, [pc, #4]            @ 0x8 <bar>
-@ CHECK-NEXT:   4: ldr    r2, [pc, #0]            @ 0x8 <bar>
-@ CHECK-NEXT:   6: ldr    r3, [pc, #0]            @ 0x8 <bar>
-
-  .balign 4
-bar:
-@ CHECK:      00000008 <bar>:
-  .word 0x01020304
+@@ Check that PC-relative memory addressing is annotated
+
+@ RUN: llvm-mc %s -triple=thumbv6m -filetype=obj | \
+@ RUN:   llvm-objdump --no-print-imm-hex -d --no-show-raw-insn --triple=thumbv6m - | \
+@ RUN:   FileCheck %s
+
+.text
+_start:
+@ CHECK:      00000000 <_start>:
+
+@@ Check AddrModeT1_s instruction, with 4-byte and 2-byte alignment
+  ldr r0, bar
+  ldr r1, bar
+  ldr r2, bar
+  ldr r3, bar
+@ CHECK-NEXT:   0: ldr    r0, [pc, #4]            @ 0x8 <bar>
+@ CHECK-NEXT:   2: ldr    r1, [pc, #4]            @ 0x8 <bar>
+@ CHECK-NEXT:   4: ldr    r2, [pc, #0]            @ 0x8 <bar>
+@ CHECK-NEXT:   6: ldr    r3, [pc, #0]            @ 0x8 <bar>
+
+  .balign 4
+bar:
+@ CHECK:      00000008 <bar>:
+  .word 0x01020304
diff --git a/llvm/test/tools/llvm-objdump/ELF/ARM/literal-thumb2.s b/llvm/test/tools/llvm-objdump/ELF/ARM/literal-thumb2.s
index e8c2844fb0bf..41795cd3fce0 100644
--- a/llvm/test/tools/llvm-objdump/ELF/ARM/literal-thumb2.s
+++ b/llvm/test/tools/llvm-objdump/ELF/ARM/literal-thumb2.s
@@ -1,116 +1,116 @@
-@@ Check that PC-relative memory addressing is annotated
-
-@ RUN: llvm-mc %s -triple=thumbv7 -filetype=obj | \
-@ RUN:   llvm-objdump --no-print-imm-hex -d --no-show-raw-insn --triple=thumbv7 - | \
-@ RUN:   FileCheck %s
-
-.text
-foo:
-@ CHECK:      00000000 <foo>:
-  .word 0x01020304
-
-_start:
-@ CHECK:      00000004 <_start>:
-
-@@ Check a special case immediate for AddrModeT2_pc
-  .balign 4
-  ldr r0, [pc, #-0]
-@ CHECK:         4: ldr.w   r0, [pc, #-0]           @ 0x8 <_start+0x4>
-
-@@ Same instruction, but the address is not 4-byte aligned
-  nop
-  ldr r0, [pc, #-0]
-@ CHECK:         a: ldr.w   r0, [pc, #-0]           @ 0xc <_start+0x8>
-
-@@ Check a special case immediate for AddrModeT2_i8s4
-  .balign 4
-  ldrd r0, r1, [pc, #-0]
-@ CHECK:        10: ldrd    r0, r1, [pc, #-0]       @ 0x14 <_start+0x10>
-
-@@ Same instruction, but the address is not 4-byte aligned
-  nop
-  ldrd r0, r1, [pc, #-0]
-@ CHECK:        16: ldrd    r0, r1, [pc, #-0]       @ 0x18 <_start+0x14>
-
-@@ Check AddrModeT2_pc instructions, with positive and negative immediates
-  .balign 4
-  ldr r0, foo
-  ldrb r0, bar
-  ldrsb r0, foo
-  ldrsh r0, bar
-  pli _start
-  pld bar
-@ CHECK:        1c: ldr.w   r0, [pc, #-32]          @ 0x0 <foo>
-@ CHECK-NEXT:   20: ldrb.w  r0, [pc, #112]          @ 0x94 <bar>
-@ CHECK-NEXT:   24: ldrsb.w r0, [pc, #-40]          @ 0x0 <foo>
-@ CHECK-NEXT:   28: ldrsh.w r0, [pc, #104]          @ 0x94 <bar>
-@ CHECK-NEXT:   2c: pli     [pc, #-44]              @ 0x4 <_start>
-@ CHECK-NEXT:   30: pld     [pc, #96]               @ 0x94 <bar>
-
-@@ Same instructions, but the addresses are not 4-byte aligned
-  nop
-  ldr r0, foo
-  ldrb r0, bar
-  ldrsb r0, foo
-  ldrsh r0, bar
-  pli _start
-  pld bar
-@ CHECK:        36: ldr.w   r0, [pc, #-56]          @ 0x0 <foo>
-@ CHECK-NEXT:   3a: ldrb.w  r0, [pc, #88]           @ 0x94 <bar>
-@ CHECK-NEXT:   3e: ldrsb.w r0, [pc, #-64]          @ 0x0 <foo>
-@ CHECK-NEXT:   42: ldrsh.w r0, [pc, #80]           @ 0x94 <bar>
-@ CHECK-NEXT:   46: pli     [pc, #-68]              @ 0x4 <_start>
-@ CHECK-NEXT:   4a: pld     [pc, #72]               @ 0x94 <bar>
-
-@@ Check AddrModeT2_i8s4 instructions, with positive and negative immediates
-  .balign 4
-  ldrd r0, r1, foo
-  ldrd r0, r1, bar
-@ CHECK:        50: ldrd    r0, r1, [pc, #-84]      @ 0x0 <foo>
-@ CHECK-NEXT:   54: ldrd    r0, r1, [pc, #60]       @ 0x94 <bar>
-
-@@ Same instructions, but the addresses are not 4-byte aligned
-  nop
-  ldrd r0, r1, foo
-  ldrd r0, r1, bar
-@ CHECK:        5a: ldrd    r0, r1, [pc, #-92]      @ 0x0 <foo>
-@ CHECK-NEXT:   5e: ldrd    r0, r1, [pc, #52]       @ 0x94 <bar>
-
-@@ Check that AddrModeT2_i8s4 instructions that do not use PC-relative
-@@ addressingare not annotated
-  ldrd  r0, r1, [r2, #8]
-@ CHECK:        62: ldrd    r0, r1, [r2, #8]{{$}}
-
-@@ Check AddrMode5 instructions, with positive and negative immediates
-  .balign 4
-  ldc   p14, c5, foo
-  ldcl  p6, c4, bar
-  ldc2  p5, c2, foo
-  ldc2l p3, c1, bar
-@ CHECK:        68: ldc     p14, c5, [pc, #-108]    @ 0x0 <foo>
-@ CHECK-NEXT:   6c: ldcl    p6, c4, [pc, #36]       @ 0x94 <bar>
-@ CHECK-NEXT:   70: ldc2    p5, c2, [pc, #-116]     @ 0x0 <foo>
-@ CHECK-NEXT:   74: ldc2l   p3, c1, [pc, #28]       @ 0x94 <bar>
-
-@@ Same instructions, but the addresses are not 4-byte aligned
-  nop
-  ldc   p14, c5, foo
-  ldcl  p6, c4, bar
-  ldc2  p5, c2, foo
-  ldc2l p3, c1, bar
-@ CHECK:        7a: ldc     p14, c5, [pc, #-124]    @ 0x0 <foo>
-@ CHECK-NEXT:   7e: ldcl    p6, c4, [pc, #20]       @ 0x94 <bar>
-@ CHECK-NEXT:   82: ldc2    p5, c2, [pc, #-132]     @ 0x0 <foo>
-@ CHECK-NEXT:   86: ldc2l   p3, c1, [pc, #12]       @ 0x94 <bar>
-
-@@ Check that AddrMode5 instruction that do not use PC+imm addressing are not
-@@ annotated
-  ldc   p14, c5, [r1, #8]
-  ldc   p14, c5, [pc], {16}
-@ CHECK:        8a: ldc     p14, c5, [r1, #8]{{$}}
-@ CHECK-NEXT:   8e: ldc     p14, c5, [pc], {16}{{$}}
-
-  .balign 4
-bar:
-@ CHECK:      00000094 <bar>:
-  .word 0x01020304
+@@ Check that PC-relative memory addressing is annotated
+
+@ RUN: llvm-mc %s -triple=thumbv7 -filetype=obj | \
+@ RUN:   llvm-objdump --no-print-imm-hex -d --no-show-raw-insn --triple=thumbv7 - | \
+@ RUN:   FileCheck %s
+
+.text
+foo:
+@ CHECK:      00000000 <foo>:
+  .word 0x01020304
+
+_start:
+@ CHECK:      00000004 <_start>:
+
+@@ Check a special case immediate for AddrModeT2_pc
+  .balign 4
+  ldr r0, [pc, #-0]
+@ CHECK:         4: ldr.w   r0, [pc, #-0]           @ 0x8 <_start+0x4>
+
+@@ Same instruction, but the address is not 4-byte aligned
+  nop
+  ldr r0, [pc, #-0]
+@ CHECK:         a: ldr.w   r0, [pc, #-0]           @ 0xc <_start+0x8>
+
+@@ Check a special case immediate for AddrModeT2_i8s4
+  .balign 4
+  ldrd r0, r1, [pc, #-0]
+@ CHECK:        10: ldrd    r0, r1, [pc, #-0]       @ 0x14 <_start+0x10>
+
+@@ Same instruction, but the address is not 4-byte aligned
+  nop
+  ldrd r0, r1, [pc, #-0]
+@ CHECK:        16: ldrd    r0, r1, [pc, #-0]       @ 0x18 <_start+0x14>
+
+@@ Check AddrModeT2_pc instructions, with positive and negative immediates
+  .balign 4
+  ldr r0, foo
+  ldrb r0, bar
+  ldrsb r0, foo
+  ldrsh r0, bar
+  pli _start
+  pld bar
+@ CHECK:        1c: ldr.w   r0, [pc, #-32]          @ 0x0 <foo>
+@ CHECK-NEXT:   20: ldrb.w  r0, [pc, #112]          @ 0x94 <bar>
+@ CHECK-NEXT:   24: ldrsb.w r0, [pc, #-40]          @ 0x0 <foo>
+@ CHECK-NEXT:   28: ldrsh.w r0, [pc, #104]          @ 0x94 <bar>
+@ CHECK-NEXT:   2c: pli     [pc, #-44]              @ 0x4 <_start>
+@ CHECK-NEXT:   30: pld     [pc, #96]               @ 0x94 <bar>
+
+@@ Same instructions, but the addresses are not 4-byte aligned
+  nop
+  ldr r0, foo
+  ldrb r0, bar
+  ldrsb r0, foo
+  ldrsh r0, bar
+  pli _start
+  pld bar
+@ CHECK:        36: ldr.w   r0, [pc, #-56]          @ 0x0 <foo>
+@ CHECK-NEXT:   3a: ldrb.w  r0, [pc, #88]           @ 0x94 <bar>
+@ CHECK-NEXT:   3e: ldrsb.w r0, [pc, #-64]          @ 0x0 <foo>
+@ CHECK-NEXT:   42: ldrsh.w r0, [pc, #80]           @ 0x94 <bar>
+@ CHECK-NEXT:   46: pli     [pc, #-68]              @ 0x4 <_start>
+@ CHECK-NEXT:   4a: pld     [pc, #72]               @ 0x94 <bar>
+
+@@ Check AddrModeT2_i8s4 instructions, with positive and negative immediates
+  .balign 4
+  ldrd r0, r1, foo
+  ldrd r0, r1, bar
+@ CHECK:        50: ldrd    r0, r1, [pc, #-84]      @ 0x0 <foo>
+@ CHECK-NEXT:   54: ldrd    r0, r1, [pc, #60]       @ 0x94 <bar>
+
+@@ Same instructions, but the addresses are not 4-byte aligned
+  nop
+  ldrd r0, r1, foo
+  ldrd r0, r1, bar
+@ CHECK:        5a: ldrd    r0, r1, [pc, #-92]      @ 0x0 <foo>
+@ CHECK-NEXT:   5e: ldrd    r0, r1, [pc, #52]       @ 0x94 <bar>
+
+@@ Check that AddrModeT2_i8s4 instructions that do not use PC-relative
+@@ addressingare not annotated
+  ldrd  r0, r1, [r2, #8]
+@ CHECK:        62: ldrd    r0, r1, [r2, #8]{{$}}
+
+@@ Check AddrMode5 instructions, with positive and negative immediates
+  .balign 4
+  ldc   p14, c5, foo
+  ldcl  p6, c4, bar
+  ldc2  p5, c2, foo
+  ldc2l p3, c1, bar
+@ CHECK:        68: ldc     p14, c5, [pc, #-108]    @ 0x0 <foo>
+@ CHECK-NEXT:   6c: ldcl    p6, c4, [pc, #36]       @ 0x94 <bar>
+@ CHECK-NEXT:   70: ldc2    p5, c2, [pc, #-116]     @ 0x0 <foo>
+@ CHECK-NEXT:   74: ldc2l   p3, c1, [pc, #28]       @ 0x94 <bar>
+
+@@ Same instructions, but the addresses are not 4-byte aligned
+  nop
+  ldc   p14, c5, foo
+  ldcl  p6, c4, bar
+  ldc2  p5, c2, foo
+  ldc2l p3, c1, bar
+@ CHECK:        7a: ldc     p14, c5, [pc, #-124]    @ 0x0 <foo>
+@ CHECK-NEXT:   7e: ldcl    p6, c4, [pc, #20]       @ 0x94 <bar>
+@ CHECK-NEXT:   82: ldc2    p5, c2, [pc, #-132]     @ 0x0 <foo>
+@ CHECK-NEXT:   86: ldc2l   p3, c1, [pc, #12]       @ 0x94 <bar>
+
+@@ Check that AddrMode5 instruction that do not use PC+imm addressing are not
+@@ annotated
+  ldc   p14, c5, [r1, #8]
+  ldc   p14, c5, [pc], {16}
+@ CHECK:        8a: ldc     p14, c5, [r1, #8]{{$}}
+@ CHECK-NEXT:   8e: ldc     p14, c5, [pc], {16}{{$}}
+
+  .balign 4
+bar:
+@ CHECK:      00000094 <bar>:
+  .word 0x01020304
diff --git a/llvm/test/tools/llvm-objdump/ELF/ARM/literal-vldr-arm.s b/llvm/test/tools/llvm-objdump/ELF/ARM/literal-vldr-arm.s
index cbc779ab451b..2c2eca95639f 100644
--- a/llvm/test/tools/llvm-objdump/ELF/ARM/literal-vldr-arm.s
+++ b/llvm/test/tools/llvm-objdump/ELF/ARM/literal-vldr-arm.s
@@ -1,48 +1,48 @@
-@@ Check that PC-relative memory addressing is annotated
-
-@ RUN: llvm-mc %s -triple=armv8a --mattr=+fullfp16 -filetype=obj | \
-@ RUN:   llvm-objdump -d --no-show-raw-insn --triple=armv8a --mattr=+fullfp16 - | \
-@ RUN:   FileCheck %s
-
-.text
-foo:
-@ CHECK:      00000000 <foo>:
-  .short 0x0102
-foo2:
-@ CHECK:      00000002 <foo2>:
-  .short 0x0304
-
-_start:
-@ CHECK:      00000004 <_start>:
-@@ Check AddrMode5 instructions, with positive and negative immediates
-  vldr d0, foo
-  vldr s0, bar
-@ CHECK-NEXT:    4: vldr    d0, [pc, #-12]          @ 0x0 <foo>
-@ CHECK-NEXT:    8: vldr    s0, [pc, #20]           @ 0x24 <bar>
-
-@@ Check that AddrMode5 instructions which do not use PC-relative addressing are
-@@ not annotated
-  vldr d0, [r1, #8]
-@ CHECK-NEXT:    c: vldr    d0, [r1, #8]{{$}}
-
-@@ Check AddrMode5FP16 instructions, with positive and negative immediates
-  vldr.16 s0, foo
-  vldr.16 s0, foo2
-  vldr.16 s1, bar
-  vldr.16 s1, bar2
-@ CHECK-NEXT:   10: vldr.16 s0, [pc, #-24]          @ 0x0 <foo>
-@ CHECK-NEXT:   14: vldr.16 s0, [pc, #-26]          @ 0x2 <foo2>
-@ CHECK-NEXT:   18: vldr.16 s1, [pc, #4]            @ 0x24 <bar>
-@ CHECK-NEXT:   1c: vldr.16 s1, [pc, #2]            @ 0x26 <bar2>
-
-@@ Check that AddrMode5FP16 instructions which do not use PC-relative addressing
-@@ are not annotated
-  vldr.16 s0, [r1, #8]
-@ CHECK-NEXT:   20: vldr.16 s0, [r1, #8]{{$}}
-
-bar:
-@ CHECK:      00000024 <bar>:
-  .short 0x0102
-bar2:
-@ CHECK:      00000026 <bar2>:
-  .short 0x0304
+@@ Check that PC-relative memory addressing is annotated
+
+@ RUN: llvm-mc %s -triple=armv8a --mattr=+fullfp16 -filetype=obj | \
+@ RUN:   llvm-objdump -d --no-show-raw-insn --triple=armv8a --mattr=+fullfp16 - | \
+@ RUN:   FileCheck %s
+
+.text
+foo:
+@ CHECK:      00000000 <foo>:
+  .short 0x0102
+foo2:
+@ CHECK:      00000002 <foo2>:
+  .short 0x0304
+
+_start:
+@ CHECK:      00000004 <_start>:
+@@ Check AddrMode5 instructions, with positive and negative immediates
+  vldr d0, foo
+  vldr s0, bar
+@ CHECK-NEXT:    4: vldr    d0, [pc, #-12]          @ 0x0 <foo>
+@ CHECK-NEXT:    8: vldr    s0, [pc, #20]           @ 0x24 <bar>
+
+@@ Check that AddrMode5 instructions which do not use PC-relative addressing are
+@@ not annotated
+  vldr d0, [r1, #8]
+@ CHECK-NEXT:    c: vldr    d0, [r1, #8]{{$}}
+
+@@ Check AddrMode5FP16 instructions, with positive and negative immediates
+  vldr.16 s0, foo
+  vldr.16 s0, foo2
+  vldr.16 s1, bar
+  vldr.16 s1, bar2
+@ CHECK-NEXT:   10: vldr.16 s0, [pc, #-24]          @ 0x0 <foo>
+@ CHECK-NEXT:   14: vldr.16 s0, [pc, #-26]          @ 0x2 <foo2>
+@ CHECK-NEXT:   18: vldr.16 s1, [pc, #4]            @ 0x24 <bar>
+@ CHECK-NEXT:   1c: vldr.16 s1, [pc, #2]            @ 0x26 <bar2>
+
+@@ Check that AddrMode5FP16 instructions which do not use PC-relative addressing
+@@ are not annotated
+  vldr.16 s0, [r1, #8]
+@ CHECK-NEXT:   20: vldr.16 s0, [r1, #8]{{$}}
+
+bar:
+@ CHECK:      00000024 <bar>:
+  .short 0x0102
+bar2:
+@ CHECK:      00000026 <bar2>:
+  .short 0x0304
diff --git a/llvm/test/tools/llvm-objdump/ELF/ARM/literal-vldr-thumb2.s b/llvm/test/tools/llvm-objdump/ELF/ARM/literal-vldr-thumb2.s
index fe699170f76d..1bf53fce98fb 100644
--- a/llvm/test/tools/llvm-objdump/ELF/ARM/literal-vldr-thumb2.s
+++ b/llvm/test/tools/llvm-objdump/ELF/ARM/literal-vldr-thumb2.s
@@ -1,66 +1,66 @@
-@@ Check that PC-relative memory addressing is annotated
-
-@ RUN: llvm-mc %s -triple=thumbv8a --mattr=+fullfp16 -filetype=obj | \
-@ RUN:   llvm-objdump -d --no-show-raw-insn --triple=thumbv8a --mattr=+fullfp16 - | \
-@ RUN:   FileCheck %s
-
-.text
-foo:
-@ CHECK:      00000000 <foo>:
-  .short 0x0102
-foo2:
-@ CHECK:      00000002 <foo2>:
-  .short 0x0304
-
-_start:
-@@ Check AddrMode5 instructions, with positive and negative immediates
-  .balign 4
-  vldr d0, foo
-  vldr s0, bar
-@ CHECK:         4: vldr      d0, [pc, #-8]           @ 0x0 <foo>
-@ CHECK-NEXT:    8: vldr      s0, [pc, #56]           @ 0x44 <bar>
-
-@@ Same instructions, but the addresses are not 4-byte aligned
-  nop
-  vldr d0, foo
-  vldr s0, bar
-@ CHECK:          e: vldr     d0, [pc, #-16]          @ 0x0 <foo>
-@ CHECK-NEXT:    12: vldr     s0, [pc, #48]           @ 0x44 <bar>
-
-@@ Check that AddrMode5 instructions which do not use PC-relative addressing are not annotated
-  vldr d0, [r1, #8]
-@ CHECK:         16: vldr     d0, [r1, #8]{{$}}
-
-@@ Check AddrMode5FP16 instructions, with positive and negative immediates
-  .balign 4
-  vldr.16 s0, foo
-  vldr.16 s0, foo2
-  vldr.16 s1, bar
-  vldr.16 s1, bar2
-@ CHECK:         1c: vldr.16  s0, [pc, #-32]          @ 0x0 <foo>
-@ CHECK-NEXT:    20: vldr.16  s0, [pc, #-34]          @ 0x2 <foo2>
-@ CHECK-NEXT:    24: vldr.16  s1, [pc, #28]           @ 0x44 <bar>
-@ CHECK-NEXT:    28: vldr.16  s1, [pc, #26]           @ 0x46 <bar2>
-
-@@ Same instructions, but the addresses are not 4-byte aligned
-  nop
-  vldr.16 s0, foo
-  vldr.16 s0, foo2
-  vldr.16 s1, bar
-  vldr.16 s1, bar2
-@ CHECK:         2e: vldr.16  s0, [pc, #-48]          @ 0x0 <foo>
-@ CHECK-NEXT:    32: vldr.16  s0, [pc, #-50]          @ 0x2 <foo2>
-@ CHECK-NEXT:    36: vldr.16  s1, [pc, #12]           @ 0x44 <bar>
-@ CHECK-NEXT:    3a: vldr.16  s1, [pc, #10]           @ 0x46 <bar2>
-
-@@ Check that AddrMode5FP16 instructions which do not use PC-relative addressing are not annotated
-  vldr.16 s0, [r1, #8]
-@ CHECK:         3e: vldr.16  s0, [r1, #8]{{$}}
-
-  .balign 4
-bar:
-@ CHECK:      00000044 <bar>:
-  .short 0x0102
-bar2:
-@ CHECK:      00000046 <bar2>:
-  .short 0x0304
+@@ Check that PC-relative memory addressing is annotated
+
+@ RUN: llvm-mc %s -triple=thumbv8a --mattr=+fullfp16 -filetype=obj | \
+@ RUN:   llvm-objdump -d --no-show-raw-insn --triple=thumbv8a --mattr=+fullfp16 - | \
+@ RUN:   FileCheck %s
+
+.text
+foo:
+@ CHECK:      00000000 <foo>:
+  .short 0x0102
+foo2:
+@ CHECK:      00000002 <foo2>:
+  .short 0x0304
+
+_start:
+@@ Check AddrMode5 instructions, with positive and negative immediates
+  .balign 4
+  vldr d0, foo
+  vldr s0, bar
+@ CHECK:         4: vldr      d0, [pc, #-8]           @ 0x0 <foo>
+@ CHECK-NEXT:    8: vldr      s0, [pc, #56]           @ 0x44 <bar>
+
+@@ Same instructions, but the addresses are not 4-byte aligned
+  nop
+  vldr d0, foo
+  vldr s0, bar
+@ CHECK:          e: vldr     d0, [pc, #-16]          @ 0x0 <foo>
+@ CHECK-NEXT:    12: vldr     s0, [pc, #48]           @ 0x44 <bar>
+
+@@ Check that AddrMode5 instructions which do not use PC-relative addressing are not annotated
+  vldr d0, [r1, #8]
+@ CHECK:         16: vldr     d0, [r1, #8]{{$}}
+
+@@ Check AddrMode5FP16 instructions, with positive and negative immediates
+  .balign 4
+  vldr.16 s0, foo
+  vldr.16 s0, foo2
+  vldr.16 s1, bar
+  vldr.16 s1, bar2
+@ CHECK:         1c: vldr.16  s0, [pc, #-32]          @ 0x0 <foo>
+@ CHECK-NEXT:    20: vldr.16  s0, [pc, #-34]          @ 0x2 <foo2>
+@ CHECK-NEXT:    24: vldr.16  s1, [pc, #28]           @ 0x44 <bar>
+@ CHECK-NEXT:    28: vldr.16  s1, [pc, #26]           @ 0x46 <bar2>
+
+@@ Same instructions, but the addresses are not 4-byte aligned
+  nop
+  vldr.16 s0, foo
+  vldr.16 s0, foo2
+  vldr.16 s1, bar
+  vldr.16 s1, bar2
+@ CHECK:         2e: vldr.16  s0, [pc, #-48]          @ 0x0 <foo>
+@ CHECK-NEXT:    32: vldr.16  s0, [pc, #-50]          @ 0x2 <foo2>
+@ CHECK-NEXT:    36: vldr.16  s1, [pc, #12]           @ 0x44 <bar>
+@ CHECK-NEXT:    3a: vldr.16  s1, [pc, #10]           @ 0x46 <bar2>
+
+@@ Check that AddrMode5FP16 instructions which do not use PC-relative addressing are not annotated
+  vldr.16 s0, [r1, #8]
+@ CHECK:         3e: vldr.16  s0, [r1, #8]{{$}}
+
+  .balign 4
+bar:
+@ CHECK:      00000044 <bar>:
+  .short 0x0102
+bar2:
+@ CHECK:      00000046 <bar2>:
+  .short 0x0304
diff --git a/llvm/test/tools/llvm-objdump/ELF/RISCV/unknown-arch-attr.test b/llvm/test/tools/llvm-objdump/ELF/RISCV/unknown-arch-attr.test
index 35c8c6240d84..704c9d4add0d 100644
--- a/llvm/test/tools/llvm-objdump/ELF/RISCV/unknown-arch-attr.test
+++ b/llvm/test/tools/llvm-objdump/ELF/RISCV/unknown-arch-attr.test
@@ -3,7 +3,7 @@
 ## The expected behavior is to ignore the unrecognized arch feature and
 ## continue to process the following arch features.
 ##
-## The object file has the "rv32i2p0_m2p0_x1p0" arch feature. "x1p0" is an
+## The object file has the "rv32i2p0_m2p0_y1p0" arch feature. "y1p0" is an
 ## unrecognized architecture extension. llvm-objdump will ignore it and decode
 ## "mul" instruction correctly according to "m2p0" in the arch feature.
 ##
@@ -34,5 +34,5 @@ Sections:
     Content: 3385C502
   - Name:    .riscv.attributes
     Type:    SHT_RISCV_ATTRIBUTES
-## The content is the encoding of the arch feature "rv32i2p0_m2p0_x1p0"
-    Content: 412300000072697363760001190000000572763332693270305F6D3270305F7831703000
+## The content is the encoding of the arch feature "rv32i2p0_m2p0_y1p0"
+    Content: 412300000072697363760001190000000572763332693270305F6D3270305F7931703000
diff --git a/llvm/test/tools/llvm-objdump/X86/print-imm-hex.s b/llvm/test/tools/llvm-objdump/X86/print-imm-hex.s
index c4ae34639da4..d807e228bdf2 100644
--- a/llvm/test/tools/llvm-objdump/X86/print-imm-hex.s
+++ b/llvm/test/tools/llvm-objdump/X86/print-imm-hex.s
@@ -1,29 +1,29 @@
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t
-
-# RUN: llvm-objdump -d %t | FileCheck %s --check-prefix=PRINT
-# RUN: llvm-objdump -d --print-imm-hex --no-print-imm-hex %t | FileCheck %s --check-prefix=NOPRINT
-# RUN: llvm-objdump -d --no-print-imm-hex --print-imm-hex %t | FileCheck %s --check-prefix=PRINT
-
-.text
-  retq
-  movq 0x123456(%rip),%rax
-  movabs $0x5555555555555554,%rax
-  lwpval $0x0, 0x40(%rdx,%rax), %r15d
-  lwpins $0x0, 0x1cf01cf0, %r15d
-  .word 0xffff
-
-# NOPRINT:      0000000000000000 <.text>:
-# NOPRINT-NEXT:  0: c3                            retq
-# NOPRINT-NEXT:  1: 48 8b 05 56 34 12 00          movq    1193046(%rip), %rax  # 0x12345e <.text+0x12345e>
-# NOPRINT-NEXT:  8: 48 b8 54 55 55 55 55 55 55 55 movabsq $6148914691236517204, %rax # imm = 0x5555555555555554
-# NOPRINT-NEXT: 12: 8f ea 00 12 4c 02 40 00 00 00 00      lwpval  $0, 64(%rdx,%rax), %r15d
-# NOPRINT-NEXT: 1d: 8f ea 00 12 04 25 f0 1c f0 1c 00 00 00 00     lwpins  $0, 485498096, %r15d
-# NOPRINT-NEXT: 2b: ff ff                         <unknown>
-
-# PRINT:       0000000000000000 <.text>:
-# PRINT-NEXT:  0: c3                            retq
-# PRINT-NEXT:  1: 48 8b 05 56 34 12 00          movq    0x123456(%rip), %rax  # 0x12345e <.text+0x12345e>
-# PRINT-NEXT:  8: 48 b8 54 55 55 55 55 55 55 55 movabsq $0x5555555555555554, %rax # imm = 0x5555555555555554
-# PRINT-NEXT: 12: 8f ea 00 12 4c 02 40 00 00 00 00      lwpval  $0x0, 0x40(%rdx,%rax), %r15d
-# PRINT-NEXT: 1d: 8f ea 00 12 04 25 f0 1c f0 1c 00 00 00 00     lwpins  $0x0, 0x1cf01cf0, %r15d
-# PRINT-NEXT: 2b: ff ff                         <unknown>
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t
+
+# RUN: llvm-objdump -d %t | FileCheck %s --check-prefix=PRINT
+# RUN: llvm-objdump -d --print-imm-hex --no-print-imm-hex %t | FileCheck %s --check-prefix=NOPRINT
+# RUN: llvm-objdump -d --no-print-imm-hex --print-imm-hex %t | FileCheck %s --check-prefix=PRINT
+
+.text
+  retq
+  movq 0x123456(%rip),%rax
+  movabs $0x5555555555555554,%rax
+  lwpval $0x0, 0x40(%rdx,%rax), %r15d
+  lwpins $0x0, 0x1cf01cf0, %r15d
+  .word 0xffff
+
+# NOPRINT:      0000000000000000 <.text>:
+# NOPRINT-NEXT:  0: c3                            retq
+# NOPRINT-NEXT:  1: 48 8b 05 56 34 12 00          movq    1193046(%rip), %rax  # 0x12345e <.text+0x12345e>
+# NOPRINT-NEXT:  8: 48 b8 54 55 55 55 55 55 55 55 movabsq $6148914691236517204, %rax # imm = 0x5555555555555554
+# NOPRINT-NEXT: 12: 8f ea 00 12 4c 02 40 00 00 00 00      lwpval  $0, 64(%rdx,%rax), %r15d
+# NOPRINT-NEXT: 1d: 8f ea 00 12 04 25 f0 1c f0 1c 00 00 00 00     lwpins  $0, 485498096, %r15d
+# NOPRINT-NEXT: 2b: ff ff                         <unknown>
+
+# PRINT:       0000000000000000 <.text>:
+# PRINT-NEXT:  0: c3                            retq
+# PRINT-NEXT:  1: 48 8b 05 56 34 12 00          movq    0x123456(%rip), %rax  # 0x12345e <.text+0x12345e>
+# PRINT-NEXT:  8: 48 b8 54 55 55 55 55 55 55 55 movabsq $0x5555555555555554, %rax # imm = 0x5555555555555554
+# PRINT-NEXT: 12: 8f ea 00 12 4c 02 40 00 00 00 00      lwpval  $0x0, 0x40(%rdx,%rax), %r15d
+# PRINT-NEXT: 1d: 8f ea 00 12 04 25 f0 1c f0 1c 00 00 00 00     lwpins  $0x0, 0x1cf01cf0, %r15d
+# PRINT-NEXT: 2b: ff ff                         <unknown>
diff --git a/llvm/test/tools/llvm-pdbutil/Inputs/TypeServerTest.cpp b/llvm/test/tools/llvm-pdbutil/Inputs/TypeServerTest.cpp
index 140881db38cd..6a773a6e6cfc 100644
--- a/llvm/test/tools/llvm-pdbutil/Inputs/TypeServerTest.cpp
+++ b/llvm/test/tools/llvm-pdbutil/Inputs/TypeServerTest.cpp
@@ -1,6 +1,6 @@
-// Compile with "cl /c /Zi TypeServerTest.cpp /FdTypeServerTest.pdb"
-
-int main(void)
-{
-	return 0;
-}
+// Compile with "cl /c /Zi TypeServerTest.cpp /FdTypeServerTest.pdb"
+
+int main(void)
+{
+	return 0;
+}
diff --git a/llvm/test/tools/llvm-pdbutil/complex-padding-graphical.test b/llvm/test/tools/llvm-pdbutil/complex-padding-graphical.test
index 42511db95ffc..89d2da3006b6 100644
--- a/llvm/test/tools/llvm-pdbutil/complex-padding-graphical.test
+++ b/llvm/test/tools/llvm-pdbutil/complex-padding-graphical.test
@@ -1,55 +1,55 @@
-; REQUIRES: diasdk
-
-; RUN: llvm-pdbutil pretty -classes -class-definitions=layout \
-; RUN:     -include-types=Test %p/Inputs/ComplexPaddingTest.pdb > %t
-
-; RUN: FileCheck -input-file=%t %s -check-prefix=DIRECT_VB_ONLY
-; RUN: FileCheck -input-file=%t %s -check-prefix=DIRECT_VB_AND_NON_VB
-; RUN: FileCheck -input-file=%t %s -check-prefix=INDIRECT_VB
-; RUN: FileCheck -input-file=%t %s -check-prefix=INDIRECT_AND_DIRECT_VB
-
-
-; DIRECT_VB_ONLY:      struct TestIVBBase [sizeof = 16]
-; DIRECT_VB_ONLY-NEXT:   : public virtual TestVB {
-; DIRECT_VB_ONLY-NEXT:   vbptr +0x00 [sizeof=4]
-; DIRECT_VB_ONLY-NEXT:   data +0x04 [sizeof=4] int A
-; DIRECT_VB_ONLY-NEXT:   vbase +0x08 [sizeof=8] TestVB
-; DIRECT_VB_ONLY-NEXT:     vfptr +0x08 [sizeof=4]
-; DIRECT_VB_ONLY-NEXT:     data +0x0c [sizeof=4] int X
-; DIRECT_VB_ONLY-NEXT: }
-
-DIRECT_VB_AND_NON_VB:      struct TestVBLayout [sizeof = 24]
-DIRECT_VB_AND_NON_VB-NEXT:   : public TestNVB
-DIRECT_VB_AND_NON_VB-NEXT:   , public virtual TestVB {
-DIRECT_VB_AND_NON_VB-NEXT:   base +0x00 [sizeof=8] TestNVB
-DIRECT_VB_AND_NON_VB-NEXT:     vfptr +0x00 [sizeof=4]
-DIRECT_VB_AND_NON_VB-NEXT:     data +0x04 [sizeof=4] int Y
-DIRECT_VB_AND_NON_VB-NEXT:   vbptr +0x08 [sizeof=4]
-DIRECT_VB_AND_NON_VB-NEXT:   data +0x0c [sizeof=4] int Z
-DIRECT_VB_AND_NON_VB-NEXT:   vbase +0x10 [sizeof=8] TestVB
-DIRECT_VB_AND_NON_VB-NEXT:     vfptr +0x10 [sizeof=4]
-DIRECT_VB_AND_NON_VB-NEXT:     data +0x14 [sizeof=4] int X
-DIRECT_VB_AND_NON_VB-NEXT: }
-
-INDIRECT_VB:      struct TestIVBDerived [sizeof = 20]
-INDIRECT_VB-NEXT:   : public TestIVBBase {
-INDIRECT_VB-NEXT:   base +0x00 [sizeof=8] TestIVBBase
-INDIRECT_VB-NEXT:     vbptr +0x00 [sizeof=4]
-INDIRECT_VB-NEXT:     data +0x04 [sizeof=4] int A
-INDIRECT_VB-NEXT:   data +0x08 [sizeof=4] int B
-INDIRECT_VB-NEXT:   ivbase +0x0c [sizeof=8] TestVB
-INDIRECT_VB-NEXT:     vfptr +0x0c [sizeof=4]
-INDIRECT_VB-NEXT:     data +0x10 [sizeof=4] int X
-INDIRECT_VB-NEXT: }
-
-INDIRECT_AND_DIRECT_VB:      struct TestIVBMergedDerived [sizeof = 20]
-INDIRECT_AND_DIRECT_VB-NEXT:   : public TestIVBBase
-INDIRECT_AND_DIRECT_VB-NEXT:   , public virtual TestVB {
-INDIRECT_AND_DIRECT_VB-NEXT:   base +0x00 [sizeof=8] TestIVBBase
-INDIRECT_AND_DIRECT_VB-NEXT:     vbptr +0x00 [sizeof=4]
-INDIRECT_AND_DIRECT_VB-NEXT:     data +0x04 [sizeof=4] int A
-INDIRECT_AND_DIRECT_VB-NEXT:   data +0x08 [sizeof=4] int B
-INDIRECT_AND_DIRECT_VB-NEXT:   vbase +0x0c [sizeof=8] TestVB
-INDIRECT_AND_DIRECT_VB-NEXT:     vfptr +0x0c [sizeof=4]
-INDIRECT_AND_DIRECT_VB-NEXT:     data +0x10 [sizeof=4] int X
-INDIRECT_AND_DIRECT_VB-NEXT: }
+; REQUIRES: diasdk
+
+; RUN: llvm-pdbutil pretty -classes -class-definitions=layout \
+; RUN:     -include-types=Test %p/Inputs/ComplexPaddingTest.pdb > %t
+
+; RUN: FileCheck -input-file=%t %s -check-prefix=DIRECT_VB_ONLY
+; RUN: FileCheck -input-file=%t %s -check-prefix=DIRECT_VB_AND_NON_VB
+; RUN: FileCheck -input-file=%t %s -check-prefix=INDIRECT_VB
+; RUN: FileCheck -input-file=%t %s -check-prefix=INDIRECT_AND_DIRECT_VB
+
+
+; DIRECT_VB_ONLY:      struct TestIVBBase [sizeof = 16]
+; DIRECT_VB_ONLY-NEXT:   : public virtual TestVB {
+; DIRECT_VB_ONLY-NEXT:   vbptr +0x00 [sizeof=4]
+; DIRECT_VB_ONLY-NEXT:   data +0x04 [sizeof=4] int A
+; DIRECT_VB_ONLY-NEXT:   vbase +0x08 [sizeof=8] TestVB
+; DIRECT_VB_ONLY-NEXT:     vfptr +0x08 [sizeof=4]
+; DIRECT_VB_ONLY-NEXT:     data +0x0c [sizeof=4] int X
+; DIRECT_VB_ONLY-NEXT: }
+
+DIRECT_VB_AND_NON_VB:      struct TestVBLayout [sizeof = 24]
+DIRECT_VB_AND_NON_VB-NEXT:   : public TestNVB
+DIRECT_VB_AND_NON_VB-NEXT:   , public virtual TestVB {
+DIRECT_VB_AND_NON_VB-NEXT:   base +0x00 [sizeof=8] TestNVB
+DIRECT_VB_AND_NON_VB-NEXT:     vfptr +0x00 [sizeof=4]
+DIRECT_VB_AND_NON_VB-NEXT:     data +0x04 [sizeof=4] int Y
+DIRECT_VB_AND_NON_VB-NEXT:   vbptr +0x08 [sizeof=4]
+DIRECT_VB_AND_NON_VB-NEXT:   data +0x0c [sizeof=4] int Z
+DIRECT_VB_AND_NON_VB-NEXT:   vbase +0x10 [sizeof=8] TestVB
+DIRECT_VB_AND_NON_VB-NEXT:     vfptr +0x10 [sizeof=4]
+DIRECT_VB_AND_NON_VB-NEXT:     data +0x14 [sizeof=4] int X
+DIRECT_VB_AND_NON_VB-NEXT: }
+
+INDIRECT_VB:      struct TestIVBDerived [sizeof = 20]
+INDIRECT_VB-NEXT:   : public TestIVBBase {
+INDIRECT_VB-NEXT:   base +0x00 [sizeof=8] TestIVBBase
+INDIRECT_VB-NEXT:     vbptr +0x00 [sizeof=4]
+INDIRECT_VB-NEXT:     data +0x04 [sizeof=4] int A
+INDIRECT_VB-NEXT:   data +0x08 [sizeof=4] int B
+INDIRECT_VB-NEXT:   ivbase +0x0c [sizeof=8] TestVB
+INDIRECT_VB-NEXT:     vfptr +0x0c [sizeof=4]
+INDIRECT_VB-NEXT:     data +0x10 [sizeof=4] int X
+INDIRECT_VB-NEXT: }
+
+INDIRECT_AND_DIRECT_VB:      struct TestIVBMergedDerived [sizeof = 20]
+INDIRECT_AND_DIRECT_VB-NEXT:   : public TestIVBBase
+INDIRECT_AND_DIRECT_VB-NEXT:   , public virtual TestVB {
+INDIRECT_AND_DIRECT_VB-NEXT:   base +0x00 [sizeof=8] TestIVBBase
+INDIRECT_AND_DIRECT_VB-NEXT:     vbptr +0x00 [sizeof=4]
+INDIRECT_AND_DIRECT_VB-NEXT:     data +0x04 [sizeof=4] int A
+INDIRECT_AND_DIRECT_VB-NEXT:   data +0x08 [sizeof=4] int B
+INDIRECT_AND_DIRECT_VB-NEXT:   vbase +0x0c [sizeof=8] TestVB
+INDIRECT_AND_DIRECT_VB-NEXT:     vfptr +0x0c [sizeof=4]
+INDIRECT_AND_DIRECT_VB-NEXT:     data +0x10 [sizeof=4] int X
+INDIRECT_AND_DIRECT_VB-NEXT: }
diff --git a/llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript b/llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript
index 96eb878c3166..ec5c8ffee87c 100644
--- a/llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript
+++ b/llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript
@@ -1,13 +1,13 @@
-PERF_RECORD_MMAP2 5752/0: [0x7ff70a1b0000(0x640000) @ 0x1000 00:00 0 0]: r-xp c:\Users\haohaiwe\Desktop\coff-profile.exe
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
- 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
- 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+PERF_RECORD_MMAP2 5752/0: [0x7ff70a1b0000(0x640000) @ 0x1000 00:00 0 0]: r-xp c:\Users\haohaiwe\Desktop\coff-profile.exe
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
+ 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0
diff --git a/llvm/test/tools/llvm-symbolizer/Inputs/discrim.inp b/llvm/test/tools/llvm-symbolizer/Inputs/discrim.inp
index 2c4d722e3286..c09a0adf9704 100644
--- a/llvm/test/tools/llvm-symbolizer/Inputs/discrim.inp
+++ b/llvm/test/tools/llvm-symbolizer/Inputs/discrim.inp
@@ -1,8 +1,8 @@
-some text
-0x400590
-0x4005a5
-0x4005ad
-0x4005b9
-0x4005ce
-0x4005d4
-another text
+some text
+0x400590
+0x4005a5
+0x4005ad
+0x4005b9
+0x4005ce
+0x4005d4
+another text
diff --git a/llvm/test/tools/obj2yaml/ELF/relr-section.yaml b/llvm/test/tools/obj2yaml/ELF/relr-section.yaml
index 6bc51ca09b3e..82391e52cb07 100644
--- a/llvm/test/tools/obj2yaml/ELF/relr-section.yaml
+++ b/llvm/test/tools/obj2yaml/ELF/relr-section.yaml
@@ -1,66 +1,66 @@
-## Test how we dump SHT_RELR sections for 32 and 64-bit targets.
-
-## Test we use the "Entries" property when it is possible to
-## dump values correctly. Also, check we do not dump sh_entsize when
-## it has the default value.
-
-# RUN: yaml2obj --docnum=1 -D BITS=32 -D ENCODE=LSB %s -o %t.32le
-# RUN: obj2yaml %t.32le | FileCheck %s --check-prefix=ELF32LE
-# RUN: yaml2obj --docnum=1 -D BITS=32 -D ENCODE=MSB %s -o %t.32be
-# RUN: obj2yaml %t.32be | FileCheck %s --check-prefix=ELF32BE
-# RUN: yaml2obj --docnum=1 -D BITS=64 -D ENCODE=LSB %s -o %t.64le
-# RUN: obj2yaml %t.64le | FileCheck %s --check-prefix=ELF64LE
-# RUN: yaml2obj --docnum=1 -D BITS=64 -D ENCODE=MSB %s -o %t.64be
-# RUN: obj2yaml %t.64be | FileCheck %s --check-prefix=ELF64BE
-
-# ELF64LE:      Sections:
-# ELF64LE-NEXT:   - Name:    .relr.dyn
-# ELF64LE-NEXT:     Type:    SHT_RELR
-# ELF64LE-NEXT:     Entries: [ 0x8877665544332211 ]
-
-# ELF32LE:      Sections:
-# ELF32LE-NEXT:   - Name:    .relr.dyn
-# ELF32LE-NEXT:     Type:    SHT_RELR
-# ELF32LE-NEXT:     Entries: [ 0x44332211, 0x88776655 ]
-
-# ELF64BE:      Sections:
-# ELF64BE-NEXT:   - Name:    .relr.dyn
-# ELF64BE-NEXT:     Type:    SHT_RELR
-# ELF64BE-NEXT:     Entries: [ 0x1122334455667788 ]
-
-# ELF32BE:      Sections:
-# ELF32BE-NEXT:  - Name:    .relr.dyn
-# ELF32BE-NEXT:    Type:    SHT_RELR
-# ELF32BE-NEXT:    Entries: [ 0x11223344, 0x55667788 ]
-
---- !ELF
-FileHeader:
-  Class: ELFCLASS[[BITS]]
-  Data:  ELFDATA2[[ENCODE]]
-  Type:  ET_DYN
-Sections:
-  - Name: .relr.dyn
-    Type: SHT_RELR
-    Content: "1122334455667788"
-
-## Test we use the "Content" property when a SHT_RELR section is truncated.
-
-# RUN: yaml2obj --docnum=2 %s -o %t.content
-# RUN: obj2yaml %t.content | FileCheck %s --check-prefix=CONTENT
-
-# CONTENT:      - Name:    .relr.dyn
-# CONTENT-NEXT:   Type:    SHT_RELR
-# CONTENT-NEXT:   Content: '11223344556677'
-
---- !ELF
-FileHeader:
-  Class: ELFCLASS64
-  Data:  ELFDATA2MSB
-  Type:  ET_DYN
-Sections:
-  - Name: .relr.dyn
-    Type: SHT_RELR
-    Content: "11223344556677"
+## Test how we dump SHT_RELR sections for 32 and 64-bit targets.
+
+## Test we use the "Entries" property when it is possible to
+## dump values correctly. Also, check we do not dump sh_entsize when
+## it has the default value.
+
+# RUN: yaml2obj --docnum=1 -D BITS=32 -D ENCODE=LSB %s -o %t.32le
+# RUN: obj2yaml %t.32le | FileCheck %s --check-prefix=ELF32LE
+# RUN: yaml2obj --docnum=1 -D BITS=32 -D ENCODE=MSB %s -o %t.32be
+# RUN: obj2yaml %t.32be | FileCheck %s --check-prefix=ELF32BE
+# RUN: yaml2obj --docnum=1 -D BITS=64 -D ENCODE=LSB %s -o %t.64le
+# RUN: obj2yaml %t.64le | FileCheck %s --check-prefix=ELF64LE
+# RUN: yaml2obj --docnum=1 -D BITS=64 -D ENCODE=MSB %s -o %t.64be
+# RUN: obj2yaml %t.64be | FileCheck %s --check-prefix=ELF64BE
+
+# ELF64LE:      Sections:
+# ELF64LE-NEXT:   - Name:    .relr.dyn
+# ELF64LE-NEXT:     Type:    SHT_RELR
+# ELF64LE-NEXT:     Entries: [ 0x8877665544332211 ]
+
+# ELF32LE:      Sections:
+# ELF32LE-NEXT:   - Name:    .relr.dyn
+# ELF32LE-NEXT:     Type:    SHT_RELR
+# ELF32LE-NEXT:     Entries: [ 0x44332211, 0x88776655 ]
+
+# ELF64BE:      Sections:
+# ELF64BE-NEXT:   - Name:    .relr.dyn
+# ELF64BE-NEXT:     Type:    SHT_RELR
+# ELF64BE-NEXT:     Entries: [ 0x1122334455667788 ]
+
+# ELF32BE:      Sections:
+# ELF32BE-NEXT:  - Name:    .relr.dyn
+# ELF32BE-NEXT:    Type:    SHT_RELR
+# ELF32BE-NEXT:    Entries: [ 0x11223344, 0x55667788 ]
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS[[BITS]]
+  Data:  ELFDATA2[[ENCODE]]
+  Type:  ET_DYN
+Sections:
+  - Name: .relr.dyn
+    Type: SHT_RELR
+    Content: "1122334455667788"
+
+## Test we use the "Content" property when a SHT_RELR section is truncated.
+
+# RUN: yaml2obj --docnum=2 %s -o %t.content
+# RUN: obj2yaml %t.content | FileCheck %s --check-prefix=CONTENT
+
+# CONTENT:      - Name:    .relr.dyn
+# CONTENT-NEXT:   Type:    SHT_RELR
+# CONTENT-NEXT:   Content: '11223344556677'
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2MSB
+  Type:  ET_DYN
+Sections:
+  - Name: .relr.dyn
+    Type: SHT_RELR
+    Content: "11223344556677"
 
 ## Test we are able to dump a SHT_RELR section when sh_entsize is invalid.
 ## Here we use 0xFE as a value instead of expected 0x8.
diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
index 7246ba45d5af..83473704398d 100644
--- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
+++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
@@ -705,7 +705,7 @@ bool DwarfLinkerForBinary::linkImpl(
     } else {
       // Try and emit more helpful warnings by applying some heuristics.
       StringRef ObjFile = ContainerName;
-      bool IsClangModule = sys::path::extension(Path).equals(".pcm");
+      bool IsClangModule = sys::path::extension(Path) == ".pcm";
       bool IsArchive = ObjFile.ends_with(")");
 
       if (IsClangModule) {
diff --git a/llvm/tools/dsymutil/LinkUtils.h b/llvm/tools/dsymutil/LinkUtils.h
index 6aa0b847eebd..ad5515a04333 100644
--- a/llvm/tools/dsymutil/LinkUtils.h
+++ b/llvm/tools/dsymutil/LinkUtils.h
@@ -38,6 +38,9 @@ struct LinkOptions {
   /// Verbosity
   bool Verbose = false;
 
+  /// Quiet
+  bool Quiet = false;
+
   /// Statistics
   bool Statistics = false;
 
diff --git a/llvm/tools/dsymutil/Options.td b/llvm/tools/dsymutil/Options.td
index d8cec0cb2c41..b72ae1909a72 100644
--- a/llvm/tools/dsymutil/Options.td
+++ b/llvm/tools/dsymutil/Options.td
@@ -24,6 +24,14 @@ def verbose: F<"verbose">,
   HelpText<"Enable verbose mode.">,
   Group<grp_general>;
 
+def quiet: F<"quiet">,
+  HelpText<"Enable quiet mode.">,
+  Group<grp_general>;
+def: Flag<["-"], "q">,
+  Alias<quiet>,
+  HelpText<"Alias for --quiet">,
+  Group<grp_general>;
+
 def keep_func_for_static: F<"keep-function-for-static">,
   HelpText<"Make a static variable keep the enclosing function even if it would have been omitted otherwise.">,
   Group<grp_general>;
diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp
index bc968b6387b6..728f2ed3e62a 100644
--- a/llvm/tools/dsymutil/dsymutil.cpp
+++ b/llvm/tools/dsymutil/dsymutil.cpp
@@ -169,6 +169,12 @@ static Expected<std::vector<std::string>> getInputs(opt::InputArgList &Args,
 
 // Verify that the given combination of options makes sense.
 static Error verifyOptions(const DsymutilOptions &Options) {
+  if (Options.LinkOpts.Verbose && Options.LinkOpts.Quiet) {
+    return make_error<StringError>(
+        "--quiet and --verbose cannot be specified together",
+        errc::invalid_argument);
+  }
+
   if (Options.InputFiles.empty()) {
     return make_error<StringError>("no input files specified",
                                    errc::invalid_argument);
@@ -311,6 +317,7 @@ static Expected<DsymutilOptions> getOptions(opt::InputArgList &Args) {
   Options.LinkOpts.NoTimestamp = Args.hasArg(OPT_no_swiftmodule_timestamp);
   Options.LinkOpts.Update = Args.hasArg(OPT_update);
   Options.LinkOpts.Verbose = Args.hasArg(OPT_verbose);
+  Options.LinkOpts.Quiet = Args.hasArg(OPT_quiet);
   Options.LinkOpts.Statistics = Args.hasArg(OPT_statistics);
   Options.LinkOpts.Fat64 = Args.hasArg(OPT_fat64);
   Options.LinkOpts.KeepFunctionForStatic =
@@ -483,16 +490,20 @@ static bool verifyOutput(StringRef OutputFile, StringRef Arch,
                          DsymutilOptions Options, std::mutex &Mutex) {
 
   if (OutputFile == "-") {
-    std::lock_guard<std::mutex> Guard(Mutex);
-    WithColor::warning() << "verification skipped for " << Arch
-                         << " because writing to stdout.\n";
+    if (!Options.LinkOpts.Quiet) {
+      std::lock_guard<std::mutex> Guard(Mutex);
+      WithColor::warning() << "verification skipped for " << Arch
+                           << " because writing to stdout.\n";
+    }
     return true;
   }
 
   if (Options.LinkOpts.NoOutput) {
-    std::lock_guard<std::mutex> Guard(Mutex);
-    WithColor::warning() << "verification skipped for " << Arch
-                         << " because --no-output was passed.\n";
+    if (!Options.LinkOpts.Quiet) {
+      std::lock_guard<std::mutex> Guard(Mutex);
+      WithColor::warning() << "verification skipped for " << Arch
+                           << " because --no-output was passed.\n";
+    }
     return true;
   }
 
@@ -507,10 +518,12 @@ static bool verifyOutput(StringRef OutputFile, StringRef Arch,
   if (auto *Obj = dyn_cast<MachOObjectFile>(&Binary)) {
     std::unique_ptr<DWARFContext> DICtx = DWARFContext::create(*Obj);
     if (DICtx->getMaxVersion() > 5) {
-      std::lock_guard<std::mutex> Guard(Mutex);
-      WithColor::warning()
-          << "verification skipped for " << Arch
-          << " because DWARF standard greater than v5 is not supported yet.\n";
+      if (!Options.LinkOpts.Quiet) {
+        std::lock_guard<std::mutex> Guard(Mutex);
+        WithColor::warning() << "verification skipped for " << Arch
+                             << " because DWARF standard greater than v5 is "
+                                "not supported yet.\n";
+      }
       return true;
     }
 
@@ -751,11 +764,13 @@ int dsymutil_main(int argc, char **argv, const llvm::ToolContext &) {
           continue;
 
         if (Map->begin() == Map->end()) {
-          std::lock_guard<std::mutex> Guard(ErrorHandlerMutex);
-          WithColor::warning()
-              << "no debug symbols in executable (-arch "
-              << MachOUtils::getArchName(Map->getTriple().getArchName())
-              << ")\n";
+          if (!Options.LinkOpts.Quiet) {
+            std::lock_guard<std::mutex> Guard(ErrorHandlerMutex);
+            WithColor::warning()
+                << "no debug symbols in executable (-arch "
+                << MachOUtils::getArchName(Map->getTriple().getArchName())
+                << ")\n";
+          }
         }
 
         // Using a std::shared_ptr rather than std::unique_ptr because move-only
diff --git a/llvm/tools/llc/NewPMDriver.cpp b/llvm/tools/llc/NewPMDriver.cpp
index 6d9956ea07d3..fb1959c6457f 100644
--- a/llvm/tools/llc/NewPMDriver.cpp
+++ b/llvm/tools/llc/NewPMDriver.cpp
@@ -115,7 +115,7 @@ int llvm::compileModuleWithNewPM(
   MachineModuleInfo MMI(&LLVMTM);
 
   PassInstrumentationCallbacks PIC;
-  StandardInstrumentations SI(Context, Opt.DebugPM);
+  StandardInstrumentations SI(Context, Opt.DebugPM, !NoVerify);
   SI.registerCallbacks(PIC);
   registerCodeGenCallback(PIC, LLVMTM);
 
diff --git a/llvm/tools/llvm-as/llvm-as.cpp b/llvm/tools/llvm-as/llvm-as.cpp
index e48e3f4d22c1..0958e16c2197 100644
--- a/llvm/tools/llvm-as/llvm-as.cpp
+++ b/llvm/tools/llvm-as/llvm-as.cpp
@@ -142,11 +142,10 @@ int main(int argc, char **argv) {
   }
 
   // Convert to new debug format if requested.
-  assert(!M->IsNewDbgInfoFormat && "Unexpectedly in new debug mode");
-  if (UseNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode) {
-    M->convertToNewDbgValues();
+  M->setIsNewDbgInfoFormat(UseNewDbgInfoFormat &&
+                           WriteNewDbgInfoFormatToBitcode);
+  if (M->IsNewDbgInfoFormat)
     M->removeDebugIntrinsicDeclarations();
-  }
 
   std::unique_ptr<ModuleSummaryIndex> Index = std::move(ModuleAndIndex.Index);
 
diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp
index 347863638849..518716168c42 100644
--- a/llvm/tools/llvm-c-test/echo.cpp
+++ b/llvm/tools/llvm-c-test/echo.cpp
@@ -570,6 +570,46 @@ struct FunCloner {
           LLVMDisposeOperandBundle(Bundle);
         break;
       }
+      case LLVMCallBr: {
+        LLVMTypeRef FnTy = CloneType(LLVMGetCalledFunctionType(Src));
+        LLVMValueRef Fn = CloneValue(LLVMGetCalledValue(Src));
+
+        LLVMBasicBlockRef DefaultDest =
+            DeclareBB(LLVMGetCallBrDefaultDest(Src));
+
+        // Clone indirect destinations
+        SmallVector<LLVMBasicBlockRef, 8> IndirectDests;
+        unsigned IndirectDestCount = LLVMGetCallBrNumIndirectDests(Src);
+        for (unsigned i = 0; i < IndirectDestCount; ++i)
+          IndirectDests.push_back(DeclareBB(LLVMGetCallBrIndirectDest(Src, i)));
+
+        // Clone input arguments
+        SmallVector<LLVMValueRef, 8> Args;
+        unsigned ArgCount = LLVMGetNumArgOperands(Src);
+        for (unsigned i = 0; i < ArgCount; ++i)
+          Args.push_back(CloneValue(LLVMGetOperand(Src, i)));
+
+        // Clone operand bundles
+        SmallVector<LLVMOperandBundleRef, 8> Bundles;
+        unsigned BundleCount = LLVMGetNumOperandBundles(Src);
+        for (unsigned i = 0; i < BundleCount; ++i) {
+          auto Bundle = LLVMGetOperandBundleAtIndex(Src, i);
+          Bundles.push_back(CloneOB(Bundle));
+          LLVMDisposeOperandBundle(Bundle);
+        }
+
+        Dst = LLVMBuildCallBr(Builder, FnTy, Fn, DefaultDest,
+                              IndirectDests.data(), IndirectDests.size(),
+                              Args.data(), Args.size(), Bundles.data(),
+                              Bundles.size(), Name);
+
+        CloneAttrs(Src, Dst);
+
+        for (auto Bundle : Bundles)
+          LLVMDisposeOperandBundle(Bundle);
+
+        break;
+      }
       case LLVMUnreachable:
         Dst = LLVMBuildUnreachable(Builder);
         break;
diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp
index fbbb5506e43e..d28af85bc739 100644
--- a/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -258,7 +258,7 @@ int main(int argc, char **argv) {
       // All that llvm-dis does is write the assembly to a file.
       if (!DontPrint) {
         if (M) {
-          ScopedDbgInfoFormatSetter FormatSetter(*M, WriteNewDbgInfoFormat);
+          M->setIsNewDbgInfoFormat(WriteNewDbgInfoFormat);
           if (WriteNewDbgInfoFormat)
             M->removeDebugIntrinsicDeclarations();
           M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder);
diff --git a/llvm/tools/llvm-dwarfdump/Statistics.cpp b/llvm/tools/llvm-dwarfdump/Statistics.cpp
index 96841c3c387b..1846f9265c75 100644
--- a/llvm/tools/llvm-dwarfdump/Statistics.cpp
+++ b/llvm/tools/llvm-dwarfdump/Statistics.cpp
@@ -229,7 +229,7 @@ static std::string constructDieID(DWARFDie Die,
      << Die.getName(DINameKind::LinkageName);
 
   // Prefix + Name is enough for local variables and parameters.
-  if (!Prefix.empty() && !Prefix.equals("g"))
+  if (!Prefix.empty() && Prefix != "g")
     return ID.str();
 
   auto DeclFile = Die.findRecursively(dwarf::DW_AT_decl_file);
diff --git a/llvm/tools/llvm-extract/llvm-extract.cpp b/llvm/tools/llvm-extract/llvm-extract.cpp
index a879c203fc37..5915f92ea05c 100644
--- a/llvm/tools/llvm-extract/llvm-extract.cpp
+++ b/llvm/tools/llvm-extract/llvm-extract.cpp
@@ -357,7 +357,7 @@ int main(int argc, char **argv) {
         // The function has been materialized, so add its matching basic blocks
         // to the block extractor list, or fail if a name is not found.
         auto Res = llvm::find_if(*P.first, [&](const BasicBlock &BB) {
-          return BB.getName().equals(BBName);
+          return BB.getName() == BBName;
         });
         if (Res == P.first->end()) {
           errs() << argv[0] << ": function " << P.first->getName()
diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp
index 7794f2d81ed0..b84469d1c757 100644
--- a/llvm/tools/llvm-link/llvm-link.cpp
+++ b/llvm/tools/llvm-link/llvm-link.cpp
@@ -489,12 +489,6 @@ int main(int argc, char **argv) {
   if (LoadBitcodeIntoNewDbgInfoFormat == cl::boolOrDefault::BOU_UNSET)
     LoadBitcodeIntoNewDbgInfoFormat = cl::boolOrDefault::BOU_TRUE;
 
-  // RemoveDIs debug-info transition: tests may request that we /try/ to use the
-  // new debug-info format.
-  if (TryUseNewDbgInfoFormat) {
-    // Turn the new debug-info format on.
-    UseNewDbgInfoFormat = true;
-  }
   // Since llvm-link collects multiple IR modules together, for simplicity's
   // sake we disable the "PreserveInputDbgFormat" flag to enforce a single
   // debug info format.
@@ -556,7 +550,7 @@ int main(int argc, char **argv) {
     SetFormat(WriteNewDbgInfoFormat);
     Composite->print(Out.os(), nullptr, PreserveAssemblyUseListOrder);
   } else if (Force || !CheckBitcodeOutputToConsole(Out.os())) {
-    SetFormat(WriteNewDbgInfoFormatToBitcode);
+    SetFormat(UseNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode);
     WriteBitcodeToFile(*Composite, Out.os(), PreserveBitcodeUseListOrder);
   }
 
diff --git a/llvm/tools/llvm-mca/CodeRegionGenerator.cpp b/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
index 5241b584b746..863766cd777d 100644
--- a/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
+++ b/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
@@ -29,7 +29,7 @@ namespace mca {
 CodeRegionGenerator::~CodeRegionGenerator() {}
 
 Expected<const CodeRegions &> AsmCodeRegionGenerator::parseCodeRegions(
-    const std::unique_ptr<MCInstPrinter> &IP) {
+    const std::unique_ptr<MCInstPrinter> &IP, bool SkipFailures) {
   MCTargetOptions Opts;
   Opts.PreserveAsmComments = false;
   CodeRegions &Regions = getRegions();
@@ -61,7 +61,16 @@ Expected<const CodeRegions &> AsmCodeRegionGenerator::parseCodeRegions(
         "This target does not support assembly parsing.",
         inconvertibleErrorCode());
   Parser->setTargetParser(*TAP);
-  Parser->Run(false);
+  // Parser->Run() confusingly returns true on errors, in which case the errors
+  // were already shown to the user. SkipFailures implies continuing in the
+  // presence of any kind of failure within the parser, in which case failing
+  // input lines are not represented, but the rest of the input remains.
+  if (Parser->Run(false) && !SkipFailures) {
+    const char *Message = "Assembly input parsing had errors, use "
+                          "-skip-unsupported-instructions=parse-failure "
+                          "to drop failing lines from the input.";
+    return make_error<StringError>(Message, inconvertibleErrorCode());
+  }
 
   if (CCP->hadErr())
     return make_error<StringError>("There was an error parsing comments.",
diff --git a/llvm/tools/llvm-mca/CodeRegionGenerator.h b/llvm/tools/llvm-mca/CodeRegionGenerator.h
index 68da567f3e0f..12261e7656a4 100644
--- a/llvm/tools/llvm-mca/CodeRegionGenerator.h
+++ b/llvm/tools/llvm-mca/CodeRegionGenerator.h
@@ -148,7 +148,8 @@ protected:
   CodeRegionGenerator(const CodeRegionGenerator &) = delete;
   CodeRegionGenerator &operator=(const CodeRegionGenerator &) = delete;
   virtual Expected<const CodeRegions &>
-  parseCodeRegions(const std::unique_ptr<MCInstPrinter> &IP) = 0;
+  parseCodeRegions(const std::unique_ptr<MCInstPrinter> &IP,
+                   bool SkipFailures) = 0;
 
 public:
   CodeRegionGenerator() {}
@@ -164,7 +165,8 @@ public:
   AnalysisRegionGenerator(llvm::SourceMgr &SM) : Regions(SM) {}
 
   virtual Expected<const AnalysisRegions &>
-  parseAnalysisRegions(const std::unique_ptr<MCInstPrinter> &IP) = 0;
+  parseAnalysisRegions(const std::unique_ptr<MCInstPrinter> &IP,
+                       bool SkipFailures) = 0;
 };
 
 /// Abstract CodeRegionGenerator with InstrumentRegionsRegions member
@@ -176,7 +178,8 @@ public:
   InstrumentRegionGenerator(llvm::SourceMgr &SM) : Regions(SM) {}
 
   virtual Expected<const InstrumentRegions &>
-  parseInstrumentRegions(const std::unique_ptr<MCInstPrinter> &IP) = 0;
+  parseInstrumentRegions(const std::unique_ptr<MCInstPrinter> &IP,
+                         bool SkipFailures) = 0;
 };
 
 /// This abstract class is responsible for parsing input ASM and
@@ -202,7 +205,8 @@ public:
 
   unsigned getAssemblerDialect() const { return AssemblerDialect; }
   Expected<const CodeRegions &>
-  parseCodeRegions(const std::unique_ptr<MCInstPrinter> &IP) override;
+  parseCodeRegions(const std::unique_ptr<MCInstPrinter> &IP,
+                   bool SkipFailures) override;
 };
 
 class AsmAnalysisRegionGenerator final : public AnalysisRegionGenerator,
@@ -222,8 +226,10 @@ public:
   MCStreamerWrapper *getMCStreamer() override { return &Streamer; }
 
   Expected<const AnalysisRegions &>
-  parseAnalysisRegions(const std::unique_ptr<MCInstPrinter> &IP) override {
-    Expected<const CodeRegions &> RegionsOrErr = parseCodeRegions(IP);
+  parseAnalysisRegions(const std::unique_ptr<MCInstPrinter> &IP,
+                       bool SkipFailures) override {
+    Expected<const CodeRegions &> RegionsOrErr =
+        parseCodeRegions(IP, SkipFailures);
     if (!RegionsOrErr)
       return RegionsOrErr.takeError();
     else
@@ -231,8 +237,9 @@ public:
   }
 
   Expected<const CodeRegions &>
-  parseCodeRegions(const std::unique_ptr<MCInstPrinter> &IP) override {
-    return AsmCodeRegionGenerator::parseCodeRegions(IP);
+  parseCodeRegions(const std::unique_ptr<MCInstPrinter> &IP,
+                   bool SkipFailures) override {
+    return AsmCodeRegionGenerator::parseCodeRegions(IP, SkipFailures);
   }
 };
 
@@ -254,8 +261,10 @@ public:
   MCStreamerWrapper *getMCStreamer() override { return &Streamer; }
 
   Expected<const InstrumentRegions &>
-  parseInstrumentRegions(const std::unique_ptr<MCInstPrinter> &IP) override {
-    Expected<const CodeRegions &> RegionsOrErr = parseCodeRegions(IP);
+  parseInstrumentRegions(const std::unique_ptr<MCInstPrinter> &IP,
+                         bool SkipFailures) override {
+    Expected<const CodeRegions &> RegionsOrErr =
+        parseCodeRegions(IP, SkipFailures);
     if (!RegionsOrErr)
       return RegionsOrErr.takeError();
     else
@@ -263,8 +272,9 @@ public:
   }
 
   Expected<const CodeRegions &>
-  parseCodeRegions(const std::unique_ptr<MCInstPrinter> &IP) override {
-    return AsmCodeRegionGenerator::parseCodeRegions(IP);
+  parseCodeRegions(const std::unique_ptr<MCInstPrinter> &IP,
+                   bool SkipFailures) override {
+    return AsmCodeRegionGenerator::parseCodeRegions(IP, SkipFailures);
   }
 };
 
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index e037c06b12a3..03d7d7944b9c 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -135,6 +135,35 @@ static cl::opt<unsigned>
                                "(instructions per cycle)"),
                       cl::cat(ToolOptions), cl::init(0));
 
+enum class SkipType { NONE, LACK_SCHED, PARSE_FAILURE, ANY_FAILURE };
+
+static cl::opt<enum SkipType> SkipUnsupportedInstructions(
+    "skip-unsupported-instructions",
+    cl::desc("Force analysis to continue in the presence of unsupported "
+             "instructions"),
+    cl::values(
+        clEnumValN(SkipType::NONE, "none",
+                   "Exit with an error when an instruction is unsupported for "
+                   "any reason (default)"),
+        clEnumValN(
+            SkipType::LACK_SCHED, "lack-sched",
+            "Skip instructions on input which lack scheduling information"),
+        clEnumValN(
+            SkipType::PARSE_FAILURE, "parse-failure",
+            "Skip lines on the input which fail to parse for any reason"),
+        clEnumValN(SkipType::ANY_FAILURE, "any",
+                   "Skip instructions or lines on input which are unsupported "
+                   "for any reason")),
+    cl::init(SkipType::NONE), cl::cat(ViewOptions));
+
+bool shouldSkip(enum SkipType skipType) {
+  if (SkipUnsupportedInstructions == SkipType::NONE)
+    return false;
+  if (SkipUnsupportedInstructions == SkipType::ANY_FAILURE)
+    return true;
+  return skipType == SkipUnsupportedInstructions;
+}
+
 static cl::opt<bool>
     PrintRegisterFileStats("register-file-stats",
                            cl::desc("Print register file statistics"),
@@ -237,11 +266,6 @@ static cl::opt<bool> DisableInstrumentManager(
              "ignores instruments.)."),
     cl::cat(ViewOptions), cl::init(false));
 
-static cl::opt<bool> SkipUnsupportedInstructions(
-    "skip-unsupported-instructions",
-    cl::desc("Make unsupported instruction errors into warnings."),
-    cl::cat(ViewOptions), cl::init(false));
-
 namespace {
 
 const Target *getTarget(const char *ProgName) {
@@ -440,7 +464,8 @@ int main(int argc, char **argv) {
   mca::AsmAnalysisRegionGenerator CRG(*TheTarget, SrcMgr, ACtx, *MAI, *STI,
                                       *MCII);
   Expected<const mca::AnalysisRegions &> RegionsOrErr =
-      CRG.parseAnalysisRegions(std::move(IPtemp));
+      CRG.parseAnalysisRegions(std::move(IPtemp),
+                               shouldSkip(SkipType::PARSE_FAILURE));
   if (!RegionsOrErr) {
     if (auto Err =
             handleErrors(RegionsOrErr.takeError(), [](const StringError &E) {
@@ -482,7 +507,8 @@ int main(int argc, char **argv) {
   mca::AsmInstrumentRegionGenerator IRG(*TheTarget, SrcMgr, ICtx, *MAI, *STI,
                                         *MCII, *IM);
   Expected<const mca::InstrumentRegions &> InstrumentRegionsOrErr =
-      IRG.parseInstrumentRegions(std::move(IPtemp));
+      IRG.parseInstrumentRegions(std::move(IPtemp),
+                                 shouldSkip(SkipType::PARSE_FAILURE));
   if (!InstrumentRegionsOrErr) {
     if (auto Err = handleErrors(InstrumentRegionsOrErr.takeError(),
                                 [](const StringError &E) {
@@ -593,7 +619,7 @@ int main(int argc, char **argv) {
                 [&IP, &STI](const mca::InstructionError<MCInst> &IE) {
                   std::string InstructionStr;
                   raw_string_ostream SS(InstructionStr);
-                  if (SkipUnsupportedInstructions)
+                  if (shouldSkip(SkipType::LACK_SCHED))
                     WithColor::warning()
                         << IE.Message
                         << ", skipping with -skip-unsupported-instructions, "
@@ -601,7 +627,8 @@ int main(int argc, char **argv) {
                   else
                     WithColor::error()
                         << IE.Message
-                        << ", use -skip-unsupported-instructions to ignore.\n";
+                        << ", use -skip-unsupported-instructions=lack-sched to "
+                           "ignore these on the input.\n";
                   IP->printInst(&IE.Inst, 0, "", *STI, SS);
                   SS.flush();
                   WithColor::note()
@@ -610,7 +637,7 @@ int main(int argc, char **argv) {
           // Default case.
           WithColor::error() << toString(std::move(NewE));
         }
-        if (SkipUnsupportedInstructions) {
+        if (shouldSkip(SkipType::LACK_SCHED)) {
           DroppedInsts.insert(&MCI);
           continue;
         }
diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp
index 5e0d69a68d69..749f98820175 100644
--- a/llvm/tools/llvm-objdump/MachODump.cpp
+++ b/llvm/tools/llvm-objdump/MachODump.cpp
@@ -2148,7 +2148,7 @@ static void ProcessMachO(StringRef Name, MachOObjectFile *MachOOF,
       else
         consumeError(NameOrErr.takeError());
 
-      if (SectName.equals("__text")) {
+      if (SectName == "__text") {
         DataRefImpl Ref = Section.getRawDataRefImpl();
         StringRef SegName = MachOOF->getSectionFinalSegmentName(Ref);
         DisassembleMachO(FileName, MachOOF, SegName, SectName);
diff --git a/llvm/tools/llvm-xray/xray-graph-diff.cpp b/llvm/tools/llvm-xray/xray-graph-diff.cpp
index 899a6725a5d3..b5c63ab0a918 100644
--- a/llvm/tools/llvm-xray/xray-graph-diff.cpp
+++ b/llvm/tools/llvm-xray/xray-graph-diff.cpp
@@ -381,14 +381,14 @@ void GraphDiffRenderer::exportGraphAsDOT(raw_ostream &OS, StatType EdgeLabel,
                   R"(color="{5}" labelfontcolor="{5}" penwidth={6}])"
                   "\n",
                   VertexNo[HeadId], VertexNo[TailId],
-                  (HeadId.equals("")) ? static_cast<StringRef>("F0") : HeadId,
+                  HeadId.empty() ? static_cast<StringRef>("F0") : HeadId,
                   TailId, getLabel(E, EdgeLabel), getColor(E, G, H, EdgeColor),
                   getLineWidth(E, EdgeColor));
   }
 
   for (const auto &V : G.vertices()) {
     const auto &VertexId = V.first;
-    if (VertexId.equals("")) {
+    if (VertexId.empty()) {
       OS << formatv(R"(F{0} [label="F0"])"
                     "\n",
                     VertexNo[VertexId]);
diff --git a/llvm/tools/llvm-yaml-numeric-parser-fuzzer/yaml-numeric-parser-fuzzer.cpp b/llvm/tools/llvm-yaml-numeric-parser-fuzzer/yaml-numeric-parser-fuzzer.cpp
index c8370289963d..9a572c1e0600 100644
--- a/llvm/tools/llvm-yaml-numeric-parser-fuzzer/yaml-numeric-parser-fuzzer.cpp
+++ b/llvm/tools/llvm-yaml-numeric-parser-fuzzer/yaml-numeric-parser-fuzzer.cpp
@@ -18,7 +18,7 @@ inline bool isNumericRegex(llvm::StringRef S) {
   static llvm::Regex Float(
       "^[-+]?(\\.[0-9]+|[0-9]+(\\.[0-9]*)?)([eE][-+]?[0-9]+)?$");
 
-  if (S.equals(".nan") || S.equals(".NaN") || S.equals(".NAN"))
+  if (S == ".nan" || S == ".NaN" || S == ".NAN")
     return true;
 
   if (Infinity.match(S))
diff --git a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp
index ec4f5c74498f..06966b188358 100644
--- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp
+++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp
@@ -58,8 +58,8 @@ dumpDXContainer(MemoryBufferRef Source) {
       assert(DXIL && "Since we are iterating and found a DXIL part, "
                      "this should never not have a value");
       NewPart.Program = DXContainerYAML::DXILProgram{
-          DXIL->first.MajorVersion,
-          DXIL->first.MinorVersion,
+          DXIL->first.getMajorVersion(),
+          DXIL->first.getMinorVersion(),
           DXIL->first.ShaderKind,
           DXIL->first.Size,
           DXIL->first.Bitcode.MajorVersion,
diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp
index 46aaa47ee645..eb4b847185f5 100644
--- a/llvm/unittests/ADT/APIntTest.cpp
+++ b/llvm/unittests/ADT/APIntTest.cpp
@@ -2534,6 +2534,71 @@ TEST(APIntTest, clearLowBits) {
   EXPECT_EQ(16u, i32hi16.popcount());
 }
 
+TEST(APIntTest, clearHighBits) {
+  APInt i64hi32 = APInt::getAllOnes(64);
+  i64hi32.clearHighBits(32);
+  EXPECT_EQ(32u, i64hi32.countr_one());
+  EXPECT_EQ(0u, i64hi32.countr_zero());
+  EXPECT_EQ(32u, i64hi32.getActiveBits());
+  EXPECT_EQ(32u, i64hi32.countl_zero());
+  EXPECT_EQ(0u, i64hi32.countl_one());
+  EXPECT_EQ(32u, i64hi32.popcount());
+
+  APInt i128hi64 = APInt::getAllOnes(128);
+  i128hi64.clearHighBits(64);
+  EXPECT_EQ(64u, i128hi64.countr_one());
+  EXPECT_EQ(0u, i128hi64.countr_zero());
+  EXPECT_EQ(64u, i128hi64.getActiveBits());
+  EXPECT_EQ(64u, i128hi64.countl_zero());
+  EXPECT_EQ(0u, i128hi64.countl_one());
+  EXPECT_EQ(64u, i128hi64.popcount());
+
+  APInt i128hi24 = APInt::getAllOnes(128);
+  i128hi24.clearHighBits(104);
+  EXPECT_EQ(24u, i128hi24.countr_one());
+  EXPECT_EQ(0u, i128hi24.countr_zero());
+  EXPECT_EQ(24u, i128hi24.getActiveBits());
+  EXPECT_EQ(104u, i128hi24.countl_zero());
+  EXPECT_EQ(0u, i128hi24.countl_one());
+  EXPECT_EQ(24u, i128hi24.popcount());
+
+  APInt i128hi104 = APInt::getAllOnes(128);
+  i128hi104.clearHighBits(24);
+  EXPECT_EQ(104u, i128hi104.countr_one());
+  EXPECT_EQ(0u, i128hi104.countr_zero());
+  EXPECT_EQ(104u, i128hi104.getActiveBits());
+  EXPECT_EQ(24u, i128hi104.countl_zero());
+  EXPECT_EQ(0u, i128hi104.countl_one());
+  EXPECT_EQ(104u, i128hi104.popcount());
+
+  APInt i128hi0 = APInt::getAllOnes(128);
+  i128hi0.clearHighBits(128);
+  EXPECT_EQ(0u, i128hi0.countr_one());
+  EXPECT_EQ(128u, i128hi0.countr_zero());
+  EXPECT_EQ(0u, i128hi0.getActiveBits());
+  EXPECT_EQ(128u, i128hi0.countl_zero());
+  EXPECT_EQ(0u, i128hi0.countl_one());
+  EXPECT_EQ(0u, i128hi0.popcount());
+
+  APInt i80hi1 = APInt::getAllOnes(80);
+  i80hi1.clearHighBits(79);
+  EXPECT_EQ(1u, i80hi1.countr_one());
+  EXPECT_EQ(0u, i80hi1.countr_zero());
+  EXPECT_EQ(1u, i80hi1.getActiveBits());
+  EXPECT_EQ(79u, i80hi1.countl_zero());
+  EXPECT_EQ(0u, i80hi1.countl_one());
+  EXPECT_EQ(1u, i80hi1.popcount());
+
+  APInt i32hi16 = APInt::getAllOnes(32);
+  i32hi16.clearHighBits(16);
+  EXPECT_EQ(16u, i32hi16.countr_one());
+  EXPECT_EQ(0u, i32hi16.countr_zero());
+  EXPECT_EQ(16u, i32hi16.getActiveBits());
+  EXPECT_EQ(16u, i32hi16.countl_zero());
+  EXPECT_EQ(0u, i32hi16.countl_one());
+  EXPECT_EQ(16u, i32hi16.popcount());
+}
+
 TEST(APIntTest, abds) {
   using APIntOps::abds;
 
diff --git a/llvm/unittests/ADT/StringRefTest.cpp b/llvm/unittests/ADT/StringRefTest.cpp
index fa537e816fc8..b3c206a33696 100644
--- a/llvm/unittests/ADT/StringRefTest.cpp
+++ b/llvm/unittests/ADT/StringRefTest.cpp
@@ -998,7 +998,7 @@ TEST(StringRefTest, AllocatorCopy) {
   // allocator.
   StringRef StrEmpty = "";
   StringRef StrEmptyc = StrEmpty.copy(Alloc);
-  EXPECT_TRUE(StrEmpty.equals(StrEmptyc));
+  EXPECT_TRUE(StrEmpty == StrEmptyc);
   EXPECT_EQ(StrEmptyc.data(), nullptr);
   EXPECT_EQ(StrEmptyc.size(), 0u);
   EXPECT_EQ(Alloc.getTotalMemory(), 0u);
@@ -1007,9 +1007,9 @@ TEST(StringRefTest, AllocatorCopy) {
   StringRef Str2 = "bye";
   StringRef Str1c = Str1.copy(Alloc);
   StringRef Str2c = Str2.copy(Alloc);
-  EXPECT_TRUE(Str1.equals(Str1c));
+  EXPECT_TRUE(Str1 == Str1c);
   EXPECT_NE(Str1.data(), Str1c.data());
-  EXPECT_TRUE(Str2.equals(Str2c));
+  EXPECT_TRUE(Str2 == Str2c);
   EXPECT_NE(Str2.data(), Str2c.data());
 }
 
diff --git a/llvm/unittests/ADT/StringSetTest.cpp b/llvm/unittests/ADT/StringSetTest.cpp
index e3703f6f0150..a804c1f17d1c 100644
--- a/llvm/unittests/ADT/StringSetTest.cpp
+++ b/llvm/unittests/ADT/StringSetTest.cpp
@@ -73,4 +73,12 @@ TEST_F(StringSetTest, Contains) {
   EXPECT_FALSE(Set.contains("test"));
 }
 
+TEST_F(StringSetTest, Equal) {
+  StringSet<> A = {"A"};
+  StringSet<> B = {"B"};
+  ASSERT_TRUE(A != B);
+  ASSERT_FALSE(A == B);
+  ASSERT_TRUE(A == A);
+}
+
 } // end anonymous namespace
diff --git a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp
index f6a053792f85..24f4f11db9a8 100644
--- a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp
+++ b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/IRSimilarityIdentifier.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
@@ -22,6 +23,11 @@
 using namespace llvm;
 using namespace IRSimilarity;
 
+extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
+extern cl::opt<cl::boolOrDefault> PreserveInputDbgFormat;
+extern bool WriteNewDbgInfoFormatToBitcode;
+extern cl::opt<bool> WriteNewDbgInfoFormat;
+
 static std::unique_ptr<Module> makeLLVMModule(LLVMContext &Context,
                                               StringRef ModuleStr) {
   SMDiagnostic Err;
@@ -1306,19 +1312,18 @@ TEST(IRInstructionMapper, CallBrInstIllegal) {
   ASSERT_GT(UnsignedVec[0], Mapper.IllegalInstrNumber);
 }
 
-// Checks that an debuginfo intrinsics are mapped to be invisible.  Since they
+// Checks that an debuginfo records are mapped to be invisible. Since they
 // do not semantically change the program, they can be recognized as similar.
 TEST(IRInstructionMapper, DebugInfoInvisible) {
   StringRef ModuleString = R"(
                           define i32 @f(i32 %a, i32 %b) {
                           then:
-                            %0 = add i32 %a, %b                    
-                            call void @llvm.dbg.value(metadata !0)
-                            %1 = add i32 %a, %b     
+                            %0 = add i32 %a, %b
+                              #dbg_value(i32 0, !0, !0, !0)
+                            %1 = add i32 %a, %b
                             ret i32 0
                           }
 
-                          declare void @llvm.dbg.value(metadata)
                           !0 = distinct !{!"test\00", i32 10})";
   LLVMContext Context;
   std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
@@ -1914,19 +1919,19 @@ TEST(IRSimilarityCandidate, CheckRegionsDifferentTypes) {
   ASSERT_FALSE(longSimCandCompare(InstrList));
 }
 
-// Check that debug instructions do not impact similarity. They are marked as
+// Check that debug records do not impact similarity. They are marked as
 // invisible.
 TEST(IRSimilarityCandidate, IdenticalWithDebug) {
   StringRef ModuleString = R"(
                           define i32 @f(i32 %a, i32 %b) {
                           bb0:
                              %0 = add i32 %a, %b
-                             call void @llvm.dbg.value(metadata !0)
+                               #dbg_value(i32 0, !0, !0, !0)
                              %1 = add i32 %b, %a
                              ret i32 0
                           bb1:
                              %2 = add i32 %a, %b
-                             call void @llvm.dbg.value(metadata !1)
+                               #dbg_value(i32 1, !1, !1, !1)
                              %3 = add i32 %b, %a
                              ret i32 0
                           bb2:
@@ -1935,7 +1940,6 @@ TEST(IRSimilarityCandidate, IdenticalWithDebug) {
                              ret i32 0       
                           }
 
-                          declare void @llvm.dbg.value(metadata)
                           !0 = distinct !{!"test\00", i32 10}
                           !1 = distinct !{!"test\00", i32 11})";
   LLVMContext Context;
diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
index d7e4dba4ac17..7148e2902fa7 100644
--- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
+++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
@@ -21,6 +21,8 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
       "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32", "i686-pc-windows-msvc");
   std::string DL3 = UpgradeDataLayoutString(
       "e-m:o-i64:64-f80:128-n8:16:32:64-S128", "x86_64-apple-macosx");
+  std::string DL4 =
+      UpgradeDataLayoutString("e-m:o-i64:64-i128:128-n32:64-S128", "aarch64--");
   EXPECT_EQ(DL1,
             "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128"
             "-f80:128-n8:16:32:64-S128");
@@ -29,6 +31,7 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
             "-f80:128-n8:16:32-S32");
   EXPECT_EQ(DL3, "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:"
                  "128-n8:16:32:64-S128");
+  EXPECT_EQ(DL4, "e-m:o-i64:64-i128:128-n32:64-S128-Fn32");
 
   // Check that AMDGPU targets add -G1 if it's not present.
   EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "r600"), "e-p:32:32-G1");
@@ -78,15 +81,15 @@ TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) {
       "x86_64-unknown-linux-gnu");
   std::string DL2 = UpgradeDataLayoutString("e-m:e-i64:64-n32:64",
                                             "powerpc64le-unknown-linux-gnu");
-  std::string DL3 =
-      UpgradeDataLayoutString("e-m:o-i64:64-i128:128-n32:64-S128", "aarch64--");
+  std::string DL3 = UpgradeDataLayoutString(
+      "e-m:o-i64:64-i128:128-n32:64-S128-Fn32", "aarch64--");
   EXPECT_EQ(
       DL1,
       "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128"
       "-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64"
       "-f80:128:128-n8:16:32:64-S128");
   EXPECT_EQ(DL2, "e-m:e-i64:64-n32:64");
-  EXPECT_EQ(DL3, "e-m:o-i64:64-i128:128-n32:64-S128");
+  EXPECT_EQ(DL3, "e-m:o-i64:64-i128:128-n32:64-S128-Fn32");
 
   // Check that AMDGPU targets don't add -G1 if there is already a -G flag.
   EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "r600"), "e-p:32:32-G2");
diff --git a/llvm/unittests/CodeGen/GlobalISel/CMakeLists.txt b/llvm/unittests/CodeGen/GlobalISel/CMakeLists.txt
index 6ed2409f2ad7..111d7f4d2f62 100644
--- a/llvm/unittests/CodeGen/GlobalISel/CMakeLists.txt
+++ b/llvm/unittests/CodeGen/GlobalISel/CMakeLists.txt
@@ -15,6 +15,7 @@ set(LLVM_LINK_COMPONENTS
 add_llvm_unittest(GlobalISelTests
   ConstantFoldingTest.cpp
   CSETest.cpp
+  GIMatchTableExecutorTest.cpp
   LegalizerTest.cpp
   LegalizerHelperTest.cpp
   LegalizerInfoTest.cpp
diff --git a/llvm/unittests/CodeGen/GlobalISel/GIMatchTableExecutorTest.cpp b/llvm/unittests/CodeGen/GlobalISel/GIMatchTableExecutorTest.cpp
new file mode 100644
index 000000000000..5a811d795928
--- /dev/null
+++ b/llvm/unittests/CodeGen/GlobalISel/GIMatchTableExecutorTest.cpp
@@ -0,0 +1,49 @@
+//===- GIMatchTableExecutorTest.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+TEST(GlobalISelLEB128Test, fastDecodeULEB128) {
+#define EXPECT_DECODE_ULEB128_EQ(EXPECTED, VALUE)                              \
+  do {                                                                         \
+    uint64_t ActualSize = 0;                                                   \
+    uint64_t Actual = GIMatchTableExecutor::fastDecodeULEB128(                 \
+        reinterpret_cast<const uint8_t *>(VALUE), ActualSize);                 \
+    EXPECT_EQ(sizeof(VALUE) - 1, ActualSize);                                  \
+    EXPECT_EQ(EXPECTED, Actual);                                               \
+  } while (0)
+
+  EXPECT_DECODE_ULEB128_EQ(0u, "\x00");
+  EXPECT_DECODE_ULEB128_EQ(1u, "\x01");
+  EXPECT_DECODE_ULEB128_EQ(63u, "\x3f");
+  EXPECT_DECODE_ULEB128_EQ(64u, "\x40");
+  EXPECT_DECODE_ULEB128_EQ(0x7fu, "\x7f");
+  EXPECT_DECODE_ULEB128_EQ(0x80u, "\x80\x01");
+  EXPECT_DECODE_ULEB128_EQ(0x81u, "\x81\x01");
+  EXPECT_DECODE_ULEB128_EQ(0x90u, "\x90\x01");
+  EXPECT_DECODE_ULEB128_EQ(0xffu, "\xff\x01");
+  EXPECT_DECODE_ULEB128_EQ(0x100u, "\x80\x02");
+  EXPECT_DECODE_ULEB128_EQ(0x101u, "\x81\x02");
+  EXPECT_DECODE_ULEB128_EQ(4294975616ULL, "\x80\xc1\x80\x80\x10");
+
+  // Decode ULEB128 with extra padding bytes
+  EXPECT_DECODE_ULEB128_EQ(0u, "\x80\x00");
+  EXPECT_DECODE_ULEB128_EQ(0u, "\x80\x80\x00");
+  EXPECT_DECODE_ULEB128_EQ(0x7fu, "\xff\x00");
+  EXPECT_DECODE_ULEB128_EQ(0x7fu, "\xff\x80\x00");
+  EXPECT_DECODE_ULEB128_EQ(0x80u, "\x80\x81\x00");
+  EXPECT_DECODE_ULEB128_EQ(0x80u, "\x80\x81\x80\x00");
+  EXPECT_DECODE_ULEB128_EQ(0x80u, "\x80\x81\x80\x80\x80\x80\x80\x80\x80\x00");
+  EXPECT_DECODE_ULEB128_EQ(0x80000000'00000000ul,
+                           "\x80\x80\x80\x80\x80\x80\x80\x80\x80\x01");
+
+#undef EXPECT_DECODE_ULEB128_EQ
+}
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index a7112cfac63d..24930b965f1d 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -217,6 +217,7 @@ TEST_F(SelectionDAGPatternMatchTest, matchConstants) {
   SDValue Zero = DAG->getConstant(0, DL, Int32VT);
   SDValue One = DAG->getConstant(1, DL, Int32VT);
   SDValue AllOnes = DAG->getConstant(APInt::getAllOnes(32), DL, Int32VT);
+  SDValue SetCC = DAG->getSetCC(DL, Int32VT, Arg0, Const3, ISD::SETULT);
 
   using namespace SDPatternMatch;
   EXPECT_TRUE(sd_match(Const87, m_ConstInt()));
@@ -233,6 +234,13 @@ TEST_F(SelectionDAGPatternMatchTest, matchConstants) {
   EXPECT_TRUE(sd_match(Zero, DAG.get(), m_False()));
   EXPECT_TRUE(sd_match(One, DAG.get(), m_True()));
   EXPECT_FALSE(sd_match(AllOnes, DAG.get(), m_True()));
+
+  ISD::CondCode CC;
+  EXPECT_TRUE(sd_match(
+      SetCC, m_Node(ISD::SETCC, m_Value(), m_Value(), m_CondCode(CC))));
+  EXPECT_EQ(CC, ISD::SETULT);
+  EXPECT_TRUE(sd_match(SetCC, m_Node(ISD::SETCC, m_Value(), m_Value(),
+                                     m_SpecificCondCode(ISD::SETULT))));
 }
 
 TEST_F(SelectionDAGPatternMatchTest, patternCombinators) {
@@ -249,6 +257,7 @@ TEST_F(SelectionDAGPatternMatchTest, patternCombinators) {
   EXPECT_TRUE(sd_match(
       Sub, m_AnyOf(m_Opc(ISD::ADD), m_Opc(ISD::SUB), m_Opc(ISD::MUL))));
   EXPECT_TRUE(sd_match(Add, m_AllOf(m_Opc(ISD::ADD), m_OneUse())));
+  EXPECT_TRUE(sd_match(Add, m_NoneOf(m_Opc(ISD::SUB), m_Opc(ISD::MUL))));
 }
 
 TEST_F(SelectionDAGPatternMatchTest, optionalResizing) {
@@ -260,6 +269,8 @@ TEST_F(SelectionDAGPatternMatchTest, optionalResizing) {
   SDValue Op64 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int64VT);
   SDValue ZExt = DAG->getNode(ISD::ZERO_EXTEND, DL, Int64VT, Op32);
   SDValue SExt = DAG->getNode(ISD::SIGN_EXTEND, DL, Int64VT, Op32);
+  SDValue SExtInReg = DAG->getNode(ISD::SIGN_EXTEND_INREG, DL, Int64VT, Op64,
+                                   DAG->getValueType(Int32VT));
   SDValue AExt = DAG->getNode(ISD::ANY_EXTEND, DL, Int64VT, Op32);
   SDValue Trunc = DAG->getNode(ISD::TRUNCATE, DL, Int32VT, Op64);
 
@@ -273,6 +284,8 @@ TEST_F(SelectionDAGPatternMatchTest, optionalResizing) {
   EXPECT_TRUE(A == Op64);
   EXPECT_TRUE(sd_match(SExt, m_SExtOrSelf(m_Value(A))));
   EXPECT_TRUE(A == Op32);
+  EXPECT_TRUE(sd_match(SExtInReg, m_SExtOrSelf(m_Value(A))));
+  EXPECT_TRUE(A == Op64);
   EXPECT_TRUE(sd_match(Op32, m_AExtOrSelf(m_Value(A))));
   EXPECT_TRUE(A == Op32);
   EXPECT_TRUE(sd_match(AExt, m_AExtOrSelf(m_Value(A))));
diff --git a/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp b/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
index 3b24e29e1ed3..53a74c833eb3 100644
--- a/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
@@ -1132,6 +1132,71 @@ TEST_F(CoreAPIsStandardTest, SimpleAsynchronousGeneratorTest) {
   EXPECT_TRUE(LookupCompleted);
 }
 
+TEST_F(CoreAPIsStandardTest, ErrorFromSuspendedAsynchronousGeneratorTest) {
+
+  auto &G = JD.addGenerator(std::make_unique<SimpleAsyncGenerator>());
+
+  bool LookupCompleted = false;
+
+  ES.lookup(
+      LookupKind::Static, makeJITDylibSearchOrder(&JD), SymbolLookupSet(Foo),
+      SymbolState::Ready,
+      [&](Expected<SymbolMap> Result) {
+        LookupCompleted = true;
+        EXPECT_THAT_EXPECTED(Result, Failed());
+      },
+      NoDependenciesToRegister);
+
+  EXPECT_FALSE(LookupCompleted);
+
+  G.takeLookup().LS.continueLookup(
+      make_error<StringError>("boom", inconvertibleErrorCode()));
+
+  EXPECT_TRUE(LookupCompleted);
+}
+
+TEST_F(CoreAPIsStandardTest, ErrorFromAutoSuspendedAsynchronousGeneratorTest) {
+
+  auto &G = JD.addGenerator(std::make_unique<SimpleAsyncGenerator>());
+
+  std::atomic_size_t LookupsCompleted = 0;
+
+  ES.lookup(
+      LookupKind::Static, makeJITDylibSearchOrder(&JD), SymbolLookupSet(Foo),
+      SymbolState::Ready,
+      [&](Expected<SymbolMap> Result) {
+        ++LookupsCompleted;
+        EXPECT_THAT_EXPECTED(Result, Failed());
+      },
+      NoDependenciesToRegister);
+
+  EXPECT_EQ(LookupsCompleted, 0U);
+
+  // Suspend the first lookup.
+  auto LS1 = std::move(G.takeLookup().LS);
+
+  // Start a second lookup that should be auto-suspended.
+  ES.lookup(
+      LookupKind::Static, makeJITDylibSearchOrder(&JD), SymbolLookupSet(Foo),
+      SymbolState::Ready,
+      [&](Expected<SymbolMap> Result) {
+        ++LookupsCompleted;
+        EXPECT_THAT_EXPECTED(Result, Failed());
+      },
+      NoDependenciesToRegister);
+
+  EXPECT_EQ(LookupsCompleted, 0U);
+
+  // Unsuspend the first lookup.
+  LS1.continueLookup(make_error<StringError>("boom", inconvertibleErrorCode()));
+
+  // Unsuspend the second.
+  G.takeLookup().LS.continueLookup(
+      make_error<StringError>("boom", inconvertibleErrorCode()));
+
+  EXPECT_EQ(LookupsCompleted, 2U);
+}
+
 TEST_F(CoreAPIsStandardTest, BlockedGeneratorAutoSuspensionTest) {
   // Test that repeated lookups while a generator is in use cause automatic
   // lookup suspension / resumption.
diff --git a/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp
index 7ab3e40df745..70570055fea9 100644
--- a/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp
@@ -178,6 +178,82 @@ TEST_F(ObjectLinkingLayerTest, HandleErrorDuringPostAllocationPass) {
   EXPECT_THAT_EXPECTED(ES.lookup(&JD, "_anchor"), Failed());
 }
 
+TEST_F(ObjectLinkingLayerTest, AddAndRemovePlugins) {
+  class TestPlugin : public ObjectLinkingLayer::Plugin {
+  public:
+    TestPlugin(size_t &ActivationCount, bool &PluginDestroyed)
+        : ActivationCount(ActivationCount), PluginDestroyed(PluginDestroyed) {}
+
+    ~TestPlugin() { PluginDestroyed = true; }
+
+    void modifyPassConfig(MaterializationResponsibility &MR,
+                          jitlink::LinkGraph &G,
+                          jitlink::PassConfiguration &Config) override {
+      ++ActivationCount;
+    }
+
+    Error notifyFailed(MaterializationResponsibility &MR) override {
+      ADD_FAILURE() << "TestPlugin::notifyFailed called unexpectedly";
+      return Error::success();
+    }
+
+    Error notifyRemovingResources(JITDylib &JD, ResourceKey K) override {
+      return Error::success();
+    }
+
+    void notifyTransferringResources(JITDylib &JD, ResourceKey DstKey,
+                                     ResourceKey SrcKey) override {}
+
+  private:
+    size_t &ActivationCount;
+    bool &PluginDestroyed;
+  };
+
+  size_t ActivationCount = 0;
+  bool PluginDestroyed = false;
+
+  auto P = std::make_shared<TestPlugin>(ActivationCount, PluginDestroyed);
+
+  ObjLinkingLayer.addPlugin(P);
+
+  {
+    auto G1 = std::make_unique<LinkGraph>("G1", Triple("x86_64-apple-darwin"),
+                                          8, llvm::endianness::little,
+                                          x86_64::getEdgeKindName);
+
+    auto &DataSec = G1->createSection("__data", MemProt::Read | MemProt::Write);
+    auto &DataBlock = G1->createContentBlock(DataSec, BlockContent,
+                                             orc::ExecutorAddr(0x1000), 8, 0);
+    G1->addDefinedSymbol(DataBlock, 4, "_anchor1", 4, Linkage::Weak,
+                         Scope::Default, false, true);
+
+    EXPECT_THAT_ERROR(ObjLinkingLayer.add(JD, std::move(G1)), Succeeded());
+    EXPECT_THAT_EXPECTED(ES.lookup(&JD, "_anchor1"), Succeeded());
+    EXPECT_EQ(ActivationCount, 1U);
+  }
+
+  ObjLinkingLayer.removePlugin(*P);
+
+  {
+    auto G2 = std::make_unique<LinkGraph>("G2", Triple("x86_64-apple-darwin"),
+                                          8, llvm::endianness::little,
+                                          x86_64::getEdgeKindName);
+
+    auto &DataSec = G2->createSection("__data", MemProt::Read | MemProt::Write);
+    auto &DataBlock = G2->createContentBlock(DataSec, BlockContent,
+                                             orc::ExecutorAddr(0x1000), 8, 0);
+    G2->addDefinedSymbol(DataBlock, 4, "_anchor2", 4, Linkage::Weak,
+                         Scope::Default, false, true);
+
+    EXPECT_THAT_ERROR(ObjLinkingLayer.add(JD, std::move(G2)), Succeeded());
+    EXPECT_THAT_EXPECTED(ES.lookup(&JD, "_anchor2"), Succeeded());
+    EXPECT_EQ(ActivationCount, 1U);
+  }
+
+  P.reset();
+  EXPECT_TRUE(PluginDestroyed);
+}
+
 TEST(ObjectLinkingLayerSearchGeneratorTest, AbsoluteSymbolsObjectLayer) {
   class TestEPC : public UnsupportedExecutorProcessControl {
   public:
diff --git a/llvm/unittests/Frontend/CMakeLists.txt b/llvm/unittests/Frontend/CMakeLists.txt
index 3f290b63ba64..85e113816e3b 100644
--- a/llvm/unittests/Frontend/CMakeLists.txt
+++ b/llvm/unittests/Frontend/CMakeLists.txt
@@ -15,6 +15,7 @@ add_llvm_unittest(LLVMFrontendTests
   OpenMPIRBuilderTest.cpp
   OpenMPParsingTest.cpp
   OpenMPCompositionTest.cpp
+  OpenMPDecompositionTest.cpp
 
   DEPENDS
   acc_gen
diff --git a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
new file mode 100644
index 000000000000..df48e9cc0ff4
--- /dev/null
+++ b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
@@ -0,0 +1,999 @@
+//===- llvm/unittests/Frontend/OpenMPDecompositionTest.cpp ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Frontend/OpenMP/ClauseT.h"
+#include "llvm/Frontend/OpenMP/ConstructDecompositionT.h"
+#include "llvm/Frontend/OpenMP/OMP.h"
+#include "gtest/gtest.h"
+
+#include <iterator>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+// The actual tests start at comment "--- Test" below.
+
+// Create simple instantiations of all clauses to allow manual construction
+// of clauses, and implement emitting of a directive with clauses to a string.
+//
+// The tests then follow the pattern
+// 1. Create a list of clauses.
+// 2. Pass them, together with a construct, to the decomposition class.
+// 3. Extract individual resulting leaf constructs with clauses applied
+//    to them.
+// 4. Convert them to strings and compare with expected outputs.
+
+namespace omp {
+struct TypeTy {}; // placeholder
+struct ExprTy {}; // placeholder
+using IdTy = std::string;
+} // namespace omp
+
+namespace tomp::type {
+template <> struct ObjectT<omp::IdTy, omp::ExprTy> {
+  const omp::IdTy &id() const { return name; }
+  const std::optional<omp::ExprTy> ref() const { return omp::ExprTy{}; }
+
+  omp::IdTy name;
+};
+} // namespace tomp::type
+
+namespace omp {
+template <typename ElemTy> using List = tomp::type::ListT<ElemTy>;
+
+using Object = tomp::ObjectT<IdTy, ExprTy>;
+
+namespace clause {
+using DefinedOperator = tomp::type::DefinedOperatorT<IdTy, ExprTy>;
+using ProcedureDesignator = tomp::type::ProcedureDesignatorT<IdTy, ExprTy>;
+using ReductionOperator = tomp::type::ReductionIdentifierT<IdTy, ExprTy>;
+
+using AcqRel = tomp::clause::AcqRelT<TypeTy, IdTy, ExprTy>;
+using Acquire = tomp::clause::AcquireT<TypeTy, IdTy, ExprTy>;
+using AdjustArgs = tomp::clause::AdjustArgsT<TypeTy, IdTy, ExprTy>;
+using Affinity = tomp::clause::AffinityT<TypeTy, IdTy, ExprTy>;
+using Aligned = tomp::clause::AlignedT<TypeTy, IdTy, ExprTy>;
+using Align = tomp::clause::AlignT<TypeTy, IdTy, ExprTy>;
+using Allocate = tomp::clause::AllocateT<TypeTy, IdTy, ExprTy>;
+using Allocator = tomp::clause::AllocatorT<TypeTy, IdTy, ExprTy>;
+using AppendArgs = tomp::clause::AppendArgsT<TypeTy, IdTy, ExprTy>;
+using AtomicDefaultMemOrder =
+    tomp::clause::AtomicDefaultMemOrderT<TypeTy, IdTy, ExprTy>;
+using At = tomp::clause::AtT<TypeTy, IdTy, ExprTy>;
+using Bind = tomp::clause::BindT<TypeTy, IdTy, ExprTy>;
+using Capture = tomp::clause::CaptureT<TypeTy, IdTy, ExprTy>;
+using Collapse = tomp::clause::CollapseT<TypeTy, IdTy, ExprTy>;
+using Compare = tomp::clause::CompareT<TypeTy, IdTy, ExprTy>;
+using Copyin = tomp::clause::CopyinT<TypeTy, IdTy, ExprTy>;
+using Copyprivate = tomp::clause::CopyprivateT<TypeTy, IdTy, ExprTy>;
+using Defaultmap = tomp::clause::DefaultmapT<TypeTy, IdTy, ExprTy>;
+using Default = tomp::clause::DefaultT<TypeTy, IdTy, ExprTy>;
+using Depend = tomp::clause::DependT<TypeTy, IdTy, ExprTy>;
+using Destroy = tomp::clause::DestroyT<TypeTy, IdTy, ExprTy>;
+using Detach = tomp::clause::DetachT<TypeTy, IdTy, ExprTy>;
+using Device = tomp::clause::DeviceT<TypeTy, IdTy, ExprTy>;
+using DeviceType = tomp::clause::DeviceTypeT<TypeTy, IdTy, ExprTy>;
+using DistSchedule = tomp::clause::DistScheduleT<TypeTy, IdTy, ExprTy>;
+using Doacross = tomp::clause::DoacrossT<TypeTy, IdTy, ExprTy>;
+using DynamicAllocators =
+    tomp::clause::DynamicAllocatorsT<TypeTy, IdTy, ExprTy>;
+using Enter = tomp::clause::EnterT<TypeTy, IdTy, ExprTy>;
+using Exclusive = tomp::clause::ExclusiveT<TypeTy, IdTy, ExprTy>;
+using Fail = tomp::clause::FailT<TypeTy, IdTy, ExprTy>;
+using Filter = tomp::clause::FilterT<TypeTy, IdTy, ExprTy>;
+using Final = tomp::clause::FinalT<TypeTy, IdTy, ExprTy>;
+using Firstprivate = tomp::clause::FirstprivateT<TypeTy, IdTy, ExprTy>;
+using From = tomp::clause::FromT<TypeTy, IdTy, ExprTy>;
+using Full = tomp::clause::FullT<TypeTy, IdTy, ExprTy>;
+using Grainsize = tomp::clause::GrainsizeT<TypeTy, IdTy, ExprTy>;
+using HasDeviceAddr = tomp::clause::HasDeviceAddrT<TypeTy, IdTy, ExprTy>;
+using Hint = tomp::clause::HintT<TypeTy, IdTy, ExprTy>;
+using If = tomp::clause::IfT<TypeTy, IdTy, ExprTy>;
+using Inbranch = tomp::clause::InbranchT<TypeTy, IdTy, ExprTy>;
+using Inclusive = tomp::clause::InclusiveT<TypeTy, IdTy, ExprTy>;
+using Indirect = tomp::clause::IndirectT<TypeTy, IdTy, ExprTy>;
+using Init = tomp::clause::InitT<TypeTy, IdTy, ExprTy>;
+using InReduction = tomp::clause::InReductionT<TypeTy, IdTy, ExprTy>;
+using IsDevicePtr = tomp::clause::IsDevicePtrT<TypeTy, IdTy, ExprTy>;
+using Lastprivate = tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy>;
+using Linear = tomp::clause::LinearT<TypeTy, IdTy, ExprTy>;
+using Link = tomp::clause::LinkT<TypeTy, IdTy, ExprTy>;
+using Map = tomp::clause::MapT<TypeTy, IdTy, ExprTy>;
+using Match = tomp::clause::MatchT<TypeTy, IdTy, ExprTy>;
+using Mergeable = tomp::clause::MergeableT<TypeTy, IdTy, ExprTy>;
+using Message = tomp::clause::MessageT<TypeTy, IdTy, ExprTy>;
+using Nocontext = tomp::clause::NocontextT<TypeTy, IdTy, ExprTy>;
+using Nogroup = tomp::clause::NogroupT<TypeTy, IdTy, ExprTy>;
+using Nontemporal = tomp::clause::NontemporalT<TypeTy, IdTy, ExprTy>;
+using Notinbranch = tomp::clause::NotinbranchT<TypeTy, IdTy, ExprTy>;
+using Novariants = tomp::clause::NovariantsT<TypeTy, IdTy, ExprTy>;
+using Nowait = tomp::clause::NowaitT<TypeTy, IdTy, ExprTy>;
+using NumTasks = tomp::clause::NumTasksT<TypeTy, IdTy, ExprTy>;
+using NumTeams = tomp::clause::NumTeamsT<TypeTy, IdTy, ExprTy>;
+using NumThreads = tomp::clause::NumThreadsT<TypeTy, IdTy, ExprTy>;
+using OmpxAttribute = tomp::clause::OmpxAttributeT<TypeTy, IdTy, ExprTy>;
+using OmpxBare = tomp::clause::OmpxBareT<TypeTy, IdTy, ExprTy>;
+using OmpxDynCgroupMem = tomp::clause::OmpxDynCgroupMemT<TypeTy, IdTy, ExprTy>;
+using Ordered = tomp::clause::OrderedT<TypeTy, IdTy, ExprTy>;
+using Order = tomp::clause::OrderT<TypeTy, IdTy, ExprTy>;
+using Partial = tomp::clause::PartialT<TypeTy, IdTy, ExprTy>;
+using Priority = tomp::clause::PriorityT<TypeTy, IdTy, ExprTy>;
+using Private = tomp::clause::PrivateT<TypeTy, IdTy, ExprTy>;
+using ProcBind = tomp::clause::ProcBindT<TypeTy, IdTy, ExprTy>;
+using Read = tomp::clause::ReadT<TypeTy, IdTy, ExprTy>;
+using Reduction = tomp::clause::ReductionT<TypeTy, IdTy, ExprTy>;
+using Relaxed = tomp::clause::RelaxedT<TypeTy, IdTy, ExprTy>;
+using Release = tomp::clause::ReleaseT<TypeTy, IdTy, ExprTy>;
+using ReverseOffload = tomp::clause::ReverseOffloadT<TypeTy, IdTy, ExprTy>;
+using Safelen = tomp::clause::SafelenT<TypeTy, IdTy, ExprTy>;
+using Schedule = tomp::clause::ScheduleT<TypeTy, IdTy, ExprTy>;
+using SeqCst = tomp::clause::SeqCstT<TypeTy, IdTy, ExprTy>;
+using Severity = tomp::clause::SeverityT<TypeTy, IdTy, ExprTy>;
+using Shared = tomp::clause::SharedT<TypeTy, IdTy, ExprTy>;
+using Simdlen = tomp::clause::SimdlenT<TypeTy, IdTy, ExprTy>;
+using Simd = tomp::clause::SimdT<TypeTy, IdTy, ExprTy>;
+using Sizes = tomp::clause::SizesT<TypeTy, IdTy, ExprTy>;
+using TaskReduction = tomp::clause::TaskReductionT<TypeTy, IdTy, ExprTy>;
+using ThreadLimit = tomp::clause::ThreadLimitT<TypeTy, IdTy, ExprTy>;
+using Threads = tomp::clause::ThreadsT<TypeTy, IdTy, ExprTy>;
+using To = tomp::clause::ToT<TypeTy, IdTy, ExprTy>;
+using UnifiedAddress = tomp::clause::UnifiedAddressT<TypeTy, IdTy, ExprTy>;
+using UnifiedSharedMemory =
+    tomp::clause::UnifiedSharedMemoryT<TypeTy, IdTy, ExprTy>;
+using Uniform = tomp::clause::UniformT<TypeTy, IdTy, ExprTy>;
+using Unknown = tomp::clause::UnknownT<TypeTy, IdTy, ExprTy>;
+using Untied = tomp::clause::UntiedT<TypeTy, IdTy, ExprTy>;
+using Update = tomp::clause::UpdateT<TypeTy, IdTy, ExprTy>;
+using UseDeviceAddr = tomp::clause::UseDeviceAddrT<TypeTy, IdTy, ExprTy>;
+using UseDevicePtr = tomp::clause::UseDevicePtrT<TypeTy, IdTy, ExprTy>;
+using UsesAllocators = tomp::clause::UsesAllocatorsT<TypeTy, IdTy, ExprTy>;
+using Use = tomp::clause::UseT<TypeTy, IdTy, ExprTy>;
+using Weak = tomp::clause::WeakT<TypeTy, IdTy, ExprTy>;
+using When = tomp::clause::WhenT<TypeTy, IdTy, ExprTy>;
+using Write = tomp::clause::WriteT<TypeTy, IdTy, ExprTy>;
+} // namespace clause
+
+struct Helper {
+  std::optional<Object> getBaseObject(const Object &object) {
+    return std::nullopt;
+  }
+  std::optional<Object> getLoopIterVar() { return std::nullopt; }
+};
+
+using Clause = tomp::ClauseT<TypeTy, IdTy, ExprTy>;
+using ConstructDecomposition = tomp::ConstructDecompositionT<Clause, Helper>;
+using DirectiveWithClauses = tomp::DirectiveWithClauses<Clause>;
+} // namespace omp
+
+struct StringifyClause {
+  static std::string join(const omp::List<std::string> &Strings) {
+    std::stringstream Stream;
+    for (const auto &[Index, String] : llvm::enumerate(Strings)) {
+      if (Index != 0)
+        Stream << ", ";
+      Stream << String;
+    }
+    return Stream.str();
+  }
+
+  static std::string to_str(llvm::omp::Directive D) {
+    return getOpenMPDirectiveName(D).str();
+  }
+  static std::string to_str(llvm::omp::Clause C) {
+    return getOpenMPClauseName(C).str();
+  }
+  static std::string to_str(const omp::TypeTy &Type) { return "type"; }
+  static std::string to_str(const omp::ExprTy &Expr) { return "expr"; }
+  static std::string to_str(const omp::Object &Obj) { return Obj.id(); }
+
+  template <typename U>
+  static std::enable_if_t<std::is_enum_v<llvm::remove_cvref_t<U>>, std::string>
+  to_str(U &&Item) {
+    return std::to_string(llvm::to_underlying(Item));
+  }
+
+  template <typename U> static std::string to_str(const omp::List<U> &Items) {
+    omp::List<std::string> Names;
+    llvm::transform(Items, std::back_inserter(Names),
+                    [](auto &&S) { return to_str(S); });
+    return "(" + join(Names) + ")";
+  }
+
+  template <typename U>
+  static std::string to_str(const std::optional<U> &Item) {
+    if (Item)
+      return to_str(*Item);
+    return "";
+  }
+
+  template <typename... Us, size_t... Is>
+  static std::string to_str(const std::tuple<Us...> &Tuple,
+                            std::index_sequence<Is...>) {
+    omp::List<std::string> Strings;
+    (Strings.push_back(to_str(std::get<Is>(Tuple))), ...);
+    return "(" + join(Strings) + ")";
+  }
+
+  template <typename U>
+  static std::enable_if_t<llvm::remove_cvref_t<U>::EmptyTrait::value,
+                          std::string>
+  to_str(U &&Item) {
+    return "";
+  }
+
+  template <typename U>
+  static std::enable_if_t<llvm::remove_cvref_t<U>::IncompleteTrait::value,
+                          std::string>
+  to_str(U &&Item) {
+    return "";
+  }
+
+  template <typename U>
+  static std::enable_if_t<llvm::remove_cvref_t<U>::WrapperTrait::value,
+                          std::string>
+  to_str(U &&Item) {
+    // For a wrapper, stringify the wrappee, and only add parentheses if
+    // there aren't any already.
+    std::string Str = to_str(Item.v);
+    if (!Str.empty()) {
+      if (Str.front() == '(' && Str.back() == ')')
+        return Str;
+    }
+    return "(" + to_str(Item.v) + ")";
+  }
+
+  template <typename U>
+  static std::enable_if_t<llvm::remove_cvref_t<U>::TupleTrait::value,
+                          std::string>
+  to_str(U &&Item) {
+    constexpr size_t TupleSize =
+        std::tuple_size_v<llvm::remove_cvref_t<decltype(Item.t)>>;
+    return to_str(Item.t, std::make_index_sequence<TupleSize>{});
+  }
+
+  template <typename U>
+  static std::enable_if_t<llvm::remove_cvref_t<U>::UnionTrait::value,
+                          std::string>
+  to_str(U &&Item) {
+    return std::visit([](auto &&S) { return to_str(S); }, Item.u);
+  }
+
+  StringifyClause(const omp::Clause &C)
+      // Rely on content stringification to emit enclosing parentheses.
+      : Str(to_str(C.id) + to_str(C)) {}
+
+  std::string Str;
+};
+
+std::string stringify(const omp::DirectiveWithClauses &DWC) {
+  std::stringstream Stream;
+
+  Stream << getOpenMPDirectiveName(DWC.id).str();
+  for (const omp::Clause &C : DWC.clauses)
+    Stream << ' ' << StringifyClause(C).Str;
+
+  return Stream.str();
+}
+
+// --- Tests ----------------------------------------------------------
+
+namespace {
+using namespace llvm::omp;
+
+class OpenMPDecompositionTest : public testing::Test {
+protected:
+  void SetUp() override {}
+  void TearDown() override {}
+
+  omp::Helper Helper;
+  uint32_t AnyVersion = 999;
+};
+
+// PRIVATE
+// [5.2:111:5-7]
+// Directives: distribute, do, for, loop, parallel, scope, sections, simd,
+// single, target, task, taskloop, teams
+//
+// [5.2:340:1-2]
+// (1) The effect of the 1 private clause is as if it is applied only to the
+// innermost leaf construct that permits it.
+TEST_F(OpenMPDecompositionTest, Private1) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_private, omp::clause::Private{{x}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_sections,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "parallel");            // (1)
+  ASSERT_EQ(Dir1, "sections private(x)"); // (1)
+}
+
+TEST_F(OpenMPDecompositionTest, Private2) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_private, omp::clause::Private{{x}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_masked,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "parallel private(x)"); // (1)
+  ASSERT_EQ(Dir1, "masked");              // (1)
+}
+
+// FIRSTPRIVATE
+// [5.2:112:5-7]
+// Directives: distribute, do, for, parallel, scope, sections, single, target,
+// task, taskloop, teams
+//
+// [5.2:340:3-20]
+// (3) The effect of the firstprivate clause is as if it is applied to one or
+// more leaf constructs as follows:
+//  (5) To the distribute construct if it is among the constituent constructs;
+//  (6) To the teams construct if it is among the constituent constructs and the
+//      distribute construct is not;
+//  (8) To a worksharing construct that accepts the clause if one is among the
+//      constituent constructs;
+//  (9) To the taskloop construct if it is among the constituent constructs;
+// (10) To the parallel construct if it is among the constituent constructs and
+//      neither a taskloop construct nor a worksharing construct that accepts
+//      the clause is among them;
+// (12) To the target construct if it is among the constituent constructs and
+//      the same list item neither appears in a lastprivate clause nor is the
+//      base variable or base pointer of a list item that appears in a map
+//      clause.
+//
+// (15) If the parallel construct is among the constituent constructs and the
+// effect is not as if the firstprivate clause is applied to it by the above
+// rules, then the effect is as if the shared clause with the same list item is
+// applied to the parallel construct.
+// (17) If the teams construct is among the constituent constructs and the
+// effect is not as if the firstprivate clause is applied to it by the above
+// rules, then the effect is as if the shared clause with the same list item is
+// applied to the teams construct.
+TEST_F(OpenMPDecompositionTest, Firstprivate1) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_firstprivate, omp::clause::Firstprivate{{x}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_sections,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "parallel shared(x)");       // (10), (15)
+  ASSERT_EQ(Dir1, "sections firstprivate(x)"); // (8)
+}
+
+TEST_F(OpenMPDecompositionTest, Firstprivate2) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_firstprivate, omp::clause::Firstprivate{{x}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper,
+                                  OMPD_target_teams_distribute, Clauses);
+  ASSERT_EQ(Dec.output.size(), 3u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  ASSERT_EQ(Dir0, "target firstprivate(x)");     // (12)
+  ASSERT_EQ(Dir1, "teams shared(x)");            // (6), (17)
+  ASSERT_EQ(Dir2, "distribute firstprivate(x)"); // (5)
+}
+
+TEST_F(OpenMPDecompositionTest, Firstprivate3) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_firstprivate, omp::clause::Firstprivate{{x}}},
+      {OMPC_lastprivate, omp::clause::Lastprivate{{std::nullopt, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper,
+                                  OMPD_target_teams_distribute, Clauses);
+  ASSERT_EQ(Dec.output.size(), 3u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  ASSERT_EQ(Dir0, "target map(2, , , , (x))"); // (12), (27)
+  ASSERT_EQ(Dir1, "teams shared(x)");          // (6), (17)
+  ASSERT_EQ(Dir2, "distribute firstprivate(x) lastprivate(, (x))"); // (5), (21)
+}
+
+TEST_F(OpenMPDecompositionTest, Firstprivate4) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_firstprivate, omp::clause::Firstprivate{{x}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_target_teams,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "target firstprivate(x)"); // (12)
+  ASSERT_EQ(Dir1, "teams firstprivate(x)");  // (6)
+}
+
+TEST_F(OpenMPDecompositionTest, Firstprivate5) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_firstprivate, omp::clause::Firstprivate{{x}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper,
+                                  OMPD_parallel_masked_taskloop, Clauses);
+  ASSERT_EQ(Dec.output.size(), 3u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  ASSERT_EQ(Dir0, "parallel shared(x)"); // (10)
+  ASSERT_EQ(Dir1, "masked");
+  ASSERT_EQ(Dir2, "taskloop firstprivate(x)"); // (9)
+}
+
+TEST_F(OpenMPDecompositionTest, Firstprivate6) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_firstprivate, omp::clause::Firstprivate{{x}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_masked,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "parallel firstprivate(x)"); // (10)
+  ASSERT_EQ(Dir1, "masked");
+}
+
+TEST_F(OpenMPDecompositionTest, Firstprivate7) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_firstprivate, omp::clause::Firstprivate{{x}}},
+  };
+
+  // Composite constructs are still decomposed.
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_teams_distribute,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "teams shared(x)");            // (17)
+  ASSERT_EQ(Dir1, "distribute firstprivate(x)"); // (5)
+}
+
+// LASTPRIVATE
+// [5.2:115:7-8]
+// Directives: distribute, do, for, loop, sections, simd, taskloop
+//
+// [5.2:340:21-30]
+// (21) The effect of the lastprivate clause is as if it is applied to all leaf
+// constructs that permit the clause.
+// (22) If the parallel construct is among the constituent constructs and the
+// list item is not also specified in the firstprivate clause, then the effect
+// of the lastprivate clause is as if the shared clause with the same list item
+// is applied to the parallel construct.
+// (24) If the teams construct is among the constituent constructs and the list
+// item is not also specified in the firstprivate clause, then the effect of the
+// lastprivate clause is as if the shared clause with the same list item is
+// applied to the teams construct.
+// (27) If the target construct is among the constituent constructs and the list
+// item is not the base variable or base pointer of a list item that appears in
+// a map clause, the effect of the lastprivate clause is as if the same list
+// item appears in a map clause with a map-type of tofrom.
+TEST_F(OpenMPDecompositionTest, Lastprivate1) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_lastprivate, omp::clause::Lastprivate{{std::nullopt, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_sections,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "parallel shared(x)");          // (21), (22)
+  ASSERT_EQ(Dir1, "sections lastprivate(, (x))"); // (21)
+}
+
+TEST_F(OpenMPDecompositionTest, Lastprivate2) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_lastprivate, omp::clause::Lastprivate{{std::nullopt, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_teams_distribute,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "teams shared(x)");               // (21), (25)
+  ASSERT_EQ(Dir1, "distribute lastprivate(, (x))"); // (21)
+}
+
+TEST_F(OpenMPDecompositionTest, Lastprivate3) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_lastprivate, omp::clause::Lastprivate{{std::nullopt, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_target_parallel_do,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 3u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  ASSERT_EQ(Dir0, "target map(2, , , , (x))"); // (21), (27)
+  ASSERT_EQ(Dir1, "parallel shared(x)");       // (22)
+  ASSERT_EQ(Dir2, "do lastprivate(, (x))");    // (21)
+}
+
+// SHARED
+// [5.2:110:5-6]
+// Directives: parallel, task, taskloop, teams
+//
+// [5.2:340:31-32]
+// (31) The effect of the shared, default, thread_limit, or order clause is as
+// if it is applied to all leaf constructs that permit the clause.
+TEST_F(OpenMPDecompositionTest, Shared1) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_shared, omp::clause::Shared{{x}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper,
+                                  OMPD_parallel_masked_taskloop, Clauses);
+  ASSERT_EQ(Dec.output.size(), 3u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  ASSERT_EQ(Dir0, "parallel shared(x)"); // (31)
+  ASSERT_EQ(Dir1, "masked");             // (31)
+  ASSERT_EQ(Dir2, "taskloop shared(x)"); // (31)
+}
+
+// DEFAULT
+// [5.2:109:5-6]
+// Directives: parallel, task, taskloop, teams
+//
+// [5.2:340:31-32]
+// (31) The effect of the shared, default, thread_limit, or order clause is as
+// if it is applied to all leaf constructs that permit the clause.
+TEST_F(OpenMPDecompositionTest, Default1) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_default,
+       omp::clause::Default{
+           omp::clause::Default::DataSharingAttribute::Firstprivate}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper,
+                                  OMPD_parallel_masked_taskloop, Clauses);
+  ASSERT_EQ(Dec.output.size(), 3u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  ASSERT_EQ(Dir0, "parallel default(0)"); // (31)
+  ASSERT_EQ(Dir1, "masked");              // (31)
+  ASSERT_EQ(Dir2, "taskloop default(0)"); // (31)
+}
+
+// THREAD_LIMIT
+// [5.2:277:14-15]
+// Directives: target, teams
+//
+// [5.2:340:31-32]
+// (31) The effect of the shared, default, thread_limit, or order clause is as
+// if it is applied to all leaf constructs that permit the clause.
+TEST_F(OpenMPDecompositionTest, ThreadLimit1) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_thread_limit, omp::clause::ThreadLimit{omp::ExprTy{}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper,
+                                  OMPD_target_teams_distribute, Clauses);
+  ASSERT_EQ(Dec.output.size(), 3u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  ASSERT_EQ(Dir0, "target thread_limit(expr)"); // (31)
+  ASSERT_EQ(Dir1, "teams thread_limit(expr)");  // (31)
+  ASSERT_EQ(Dir2, "distribute");                // (31)
+}
+
+// ORDER
+// [5.2:234:3-4]
+// Directives: distribute, do, for, loop, simd
+//
+// [5.2:340:31-32]
+// (31) The effect of the shared, default, thread_limit, or order clause is as
+// if it is applied to all leaf constructs that permit the clause.
+TEST_F(OpenMPDecompositionTest, Order1) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_order,
+       omp::clause::Order{{omp::clause::Order::OrderModifier::Unconstrained,
+                           omp::clause::Order::Ordering::Concurrent}}},
+  };
+
+  omp::ConstructDecomposition Dec(
+      AnyVersion, Helper, OMPD_target_teams_distribute_parallel_for_simd,
+      Clauses);
+  ASSERT_EQ(Dec.output.size(), 6u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  std::string Dir3 = stringify(Dec.output[3]);
+  std::string Dir4 = stringify(Dec.output[4]);
+  std::string Dir5 = stringify(Dec.output[5]);
+  ASSERT_EQ(Dir0, "target"); // (31)
+  ASSERT_EQ(Dir1, "teams");  // (31)
+  // XXX OMP.td doesn't list "order" as allowed for "distribute"
+  ASSERT_EQ(Dir2, "distribute");       // (31)
+  ASSERT_EQ(Dir3, "parallel");         // (31)
+  ASSERT_EQ(Dir4, "for order(1, 0)");  // (31)
+  ASSERT_EQ(Dir5, "simd order(1, 0)"); // (31)
+}
+
+// ALLOCATE
+// [5.2:178:7-9]
+// Directives: allocators, distribute, do, for, parallel, scope, sections,
+// single, target, task, taskgroup, taskloop, teams
+//
+// [5.2:340:33-35]
+// (33) The effect of the allocate clause is as if it is applied to all leaf
+// constructs that permit the clause and to which a data-sharing attribute
+// clause that may create a private copy of the same list item is applied.
+TEST_F(OpenMPDecompositionTest, Allocate1) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_allocate,
+       omp::clause::Allocate{{std::nullopt, std::nullopt, std::nullopt, {x}}}},
+      {OMPC_private, omp::clause::Private{{x}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_sections,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "parallel");                                // (33)
+  ASSERT_EQ(Dir1, "sections private(x) allocate(, , , (x))"); // (33)
+}
+
+// REDUCTION
+// [5.2:134:17-18]
+// Directives: do, for, loop, parallel, scope, sections, simd, taskloop, teams
+//
+// [5.2:340-341:36-13]
+// (36) The effect of the reduction clause is as if it is applied to all leaf
+// constructs that permit the clause, except for the following constructs:
+//  (1) The parallel construct, when combined with the sections,
+//      worksharing-loop, loop, or taskloop construct; and
+//  (3) The teams construct, when combined with the loop construct.
+// (4) For the parallel and teams constructs above, the effect of the reduction
+// clause instead is as if each list item or, for any list item that is an array
+// item, its corresponding base array or base pointer appears in a shared clause
+// for the construct.
+// (6) If the task reduction-modifier is specified, the effect is as if it only
+// modifies the behavior of the reduction clause on the innermost leaf construct
+// that accepts the modifier (see Section 5.5.8).
+// (8) If the inscan reduction-modifier is specified, the effect is as if it
+// modifies the behavior of the reduction clause on all constructs of the
+// combined construct to which the clause is applied and that accept the
+// modifier.
+// (10) If a list item in a reduction clause on a combined target construct does
+// not have the same base variable or base pointer as a list item in a map
+// clause on the construct, then the effect is as if the list item in the
+// reduction clause appears as a list item in a map clause with a map-type of
+// tofrom.
+namespace red {
+// Make is easier to construct reduction operators from built-in intrinsics.
+omp::clause::ReductionOperator
+makeOp(omp::clause::DefinedOperator::IntrinsicOperator Op) {
+  return omp::clause::ReductionOperator{omp::clause::DefinedOperator{Op}};
+}
+} // namespace red
+
+TEST_F(OpenMPDecompositionTest, Reduction1) {
+  omp::Object x{"x"};
+  auto Add = red::makeOp(omp::clause::DefinedOperator::IntrinsicOperator::Add);
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_reduction, omp::clause::Reduction{{std::nullopt, {Add}, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_sections,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "parallel shared(x)");             // (36), (1), (4)
+  ASSERT_EQ(Dir1, "sections reduction(, (3), (x))"); // (36)
+}
+
+TEST_F(OpenMPDecompositionTest, Reduction2) {
+  omp::Object x{"x"};
+  auto Add = red::makeOp(omp::clause::DefinedOperator::IntrinsicOperator::Add);
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_reduction, omp::clause::Reduction{{std::nullopt, {Add}, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_masked,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "parallel reduction(, (3), (x))"); // (36), (1), (4)
+  ASSERT_EQ(Dir1, "masked");                         // (36)
+}
+
+TEST_F(OpenMPDecompositionTest, Reduction3) {
+  omp::Object x{"x"};
+  auto Add = red::makeOp(omp::clause::DefinedOperator::IntrinsicOperator::Add);
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_reduction, omp::clause::Reduction{{std::nullopt, {Add}, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_teams_loop, Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "teams shared(x)");            // (36), (3), (4)
+  ASSERT_EQ(Dir1, "loop reduction(, (3), (x))"); // (36)
+}
+
+TEST_F(OpenMPDecompositionTest, Reduction4) {
+  omp::Object x{"x"};
+  auto Add = red::makeOp(omp::clause::DefinedOperator::IntrinsicOperator::Add);
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_reduction, omp::clause::Reduction{{std::nullopt, {Add}, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper,
+                                  OMPD_teams_distribute_parallel_for, Clauses);
+  ASSERT_EQ(Dec.output.size(), 4u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  std::string Dir3 = stringify(Dec.output[3]);
+  ASSERT_EQ(Dir0, "teams reduction(, (3), (x))"); // (36), (3)
+  ASSERT_EQ(Dir1, "distribute");                  // (36)
+  ASSERT_EQ(Dir2, "parallel shared(x)");          // (36), (1), (4)
+  ASSERT_EQ(Dir3, "for reduction(, (3), (x))");   // (36)
+}
+
+TEST_F(OpenMPDecompositionTest, Reduction5) {
+  omp::Object x{"x"};
+  auto Add = red::makeOp(omp::clause::DefinedOperator::IntrinsicOperator::Add);
+  auto TaskMod = omp::clause::Reduction::ReductionModifier::Task;
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_reduction, omp::clause::Reduction{{TaskMod, {Add}, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper,
+                                  OMPD_teams_distribute_parallel_for, Clauses);
+  ASSERT_EQ(Dec.output.size(), 4u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  std::string Dir3 = stringify(Dec.output[3]);
+  ASSERT_EQ(Dir0, "teams reduction(, (3), (x))"); // (36), (3), (6)
+  ASSERT_EQ(Dir1, "distribute");                  // (36)
+  ASSERT_EQ(Dir2, "parallel shared(x)");          // (36), (1), (4)
+  ASSERT_EQ(Dir3, "for reduction(2, (3), (x))");  // (36), (6)
+}
+
+TEST_F(OpenMPDecompositionTest, Reduction6) {
+  omp::Object x{"x"};
+  auto Add = red::makeOp(omp::clause::DefinedOperator::IntrinsicOperator::Add);
+  auto InscanMod = omp::clause::Reduction::ReductionModifier::Inscan;
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_reduction, omp::clause::Reduction{{InscanMod, {Add}, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper,
+                                  OMPD_teams_distribute_parallel_for, Clauses);
+  ASSERT_EQ(Dec.output.size(), 4u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  std::string Dir3 = stringify(Dec.output[3]);
+  ASSERT_EQ(Dir0, "teams reduction(, (3), (x))"); // (36), (3), (8)
+  ASSERT_EQ(Dir1, "distribute");                  // (36)
+  ASSERT_EQ(Dir2, "parallel shared(x)");          // (36), (1), (4)
+  ASSERT_EQ(Dir3, "for reduction(1, (3), (x))");  // (36), (8)
+}
+
+TEST_F(OpenMPDecompositionTest, Reduction7) {
+  omp::Object x{"x"};
+  auto Add = red::makeOp(omp::clause::DefinedOperator::IntrinsicOperator::Add);
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_reduction, omp::clause::Reduction{{std::nullopt, {Add}, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_target_parallel_do,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 3u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  // XXX Currently OMP.td allows "reduction" on "target".
+  ASSERT_EQ(Dir0,
+            "target reduction(, (3), (x)) map(2, , , , (x))"); // (36), (10)
+  ASSERT_EQ(Dir1, "parallel shared(x)");                       // (36), (1), (4)
+  ASSERT_EQ(Dir2, "do reduction(, (3), (x))");                 // (36)
+}
+
+// IF
+// [5.2:72:7-9]
+// Directives: cancel, parallel, simd, target, target data, target enter data,
+// target exit data, target update, task, taskloop
+//
+// [5.2:72:15-18]
+// (15) For combined or composite constructs, the if clause only applies to the
+// semantics of the construct named in the directive-name-modifier.
+// (16) For a combined or composite construct, if no directive-name-modifier is
+// specified then the if clause applies to all constituent constructs to which
+// an if clause can apply.
+TEST_F(OpenMPDecompositionTest, If1) {
+  omp::List<omp::Clause> Clauses{
+      {OMPC_if,
+       omp::clause::If{{llvm::omp::Directive::OMPD_parallel, omp::ExprTy{}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper,
+                                  OMPD_target_parallel_for_simd, Clauses);
+  ASSERT_EQ(Dec.output.size(), 4u);
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  std::string Dir3 = stringify(Dec.output[3]);
+  ASSERT_EQ(Dir0, "target");              // (15)
+  ASSERT_EQ(Dir1, "parallel if(, expr)"); // (15)
+  ASSERT_EQ(Dir2, "for");                 // (15)
+  ASSERT_EQ(Dir3, "simd");                // (15)
+}
+
+TEST_F(OpenMPDecompositionTest, If2) {
+  omp::List<omp::Clause> Clauses{
+      {OMPC_if, omp::clause::If{{std::nullopt, omp::ExprTy{}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper,
+                                  OMPD_target_parallel_for_simd, Clauses);
+  ASSERT_EQ(Dec.output.size(), 4u);
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  std::string Dir3 = stringify(Dec.output[3]);
+  ASSERT_EQ(Dir0, "target if(, expr)");   // (16)
+  ASSERT_EQ(Dir1, "parallel if(, expr)"); // (16)
+  ASSERT_EQ(Dir2, "for");                 // (16)
+  ASSERT_EQ(Dir3, "simd if(, expr)");     // (16)
+}
+
+// LINEAR
+// [5.2:118:1-2]
+// Directives: declare simd, do, for, simd
+//
+// [5.2:341:15-22]
+// (15.1) The effect of the linear clause is as if it is applied to the
+// innermost leaf construct.
+// (15.2) Additionally, if the list item is not the iteration variable of a simd
+// or worksharing-loop SIMD construct, the effect on the outer leaf constructs
+// is as if the list item was specified in firstprivate and lastprivate clauses
+// on the combined or composite construct, with the rules specified above
+// applied.
+// (19) If a list item of the linear clause is the iteration variable of a simd
+// or worksharing-loop SIMD construct and it is not declared in the construct,
+// the effect on the outer leaf constructs is as if the list item was specified
+// in a lastprivate clause on the combined or composite construct with the rules
+// specified above applied.
+TEST_F(OpenMPDecompositionTest, Linear1) {
+  omp::Object x{"x"};
+
+  omp::List<omp::Clause> Clauses{
+      {OMPC_linear,
+       omp::clause::Linear{{std::nullopt, std::nullopt, std::nullopt, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_for_simd, Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "for firstprivate(x) lastprivate(, (x))"); // (15.1), (15.2)
+  ASSERT_EQ(Dir1, "simd linear(, , , (x)) lastprivate(, (x))"); // (15.1)
+}
+
+// NOWAIT
+// [5.2:308:11-13]
+// Directives: dispatch, do, for, interop, scope, sections, single, target,
+// target enter data, target exit data, target update, taskwait, workshare
+//
+// [5.2:341:23]
+// (23) The effect of the nowait clause is as if it is applied to the outermost
+// leaf construct that permits it.
+TEST_F(OpenMPDecompositionTest, Nowait1) {
+  omp::List<omp::Clause> Clauses{
+      {OMPC_nowait, omp::clause::Nowait{}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_target_parallel_for,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 3u);
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  std::string Dir2 = stringify(Dec.output[2]);
+  ASSERT_EQ(Dir0, "target nowait"); // (23)
+  ASSERT_EQ(Dir1, "parallel");      // (23)
+  ASSERT_EQ(Dir2, "for");           // (23)
+}
+} // namespace
diff --git a/llvm/unittests/IR/AttributesTest.cpp b/llvm/unittests/IR/AttributesTest.cpp
index 1accdfd3f6a4..da72fa14510c 100644
--- a/llvm/unittests/IR/AttributesTest.cpp
+++ b/llvm/unittests/IR/AttributesTest.cpp
@@ -7,8 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/IR/Attributes.h"
+#include "llvm-c/Core.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/AttributeMask.h"
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/LLVMContext.h"
@@ -308,4 +310,80 @@ TEST(Attributes, RemoveParamAttributes) {
   EXPECT_EQ(AL.getNumAttrSets(), 0U);
 }
 
+TEST(Attributes, ConstantRangeAttributeCAPI) {
+  LLVMContext C;
+  {
+    const unsigned NumBits = 8;
+    const uint64_t LowerWords[] = {0};
+    const uint64_t UpperWords[] = {42};
+
+    ConstantRange Range(APInt(NumBits, ArrayRef(LowerWords)),
+                        APInt(NumBits, ArrayRef(UpperWords)));
+
+    Attribute RangeAttr = Attribute::get(C, Attribute::Range, Range);
+    auto OutAttr = unwrap(LLVMCreateConstantRangeAttribute(
+        wrap(&C), Attribute::Range, NumBits, LowerWords, UpperWords));
+    EXPECT_EQ(OutAttr, RangeAttr);
+  }
+  {
+    const unsigned NumBits = 128;
+    const uint64_t LowerWords[] = {1, 1};
+    const uint64_t UpperWords[] = {42, 42};
+
+    ConstantRange Range(APInt(NumBits, ArrayRef(LowerWords)),
+                        APInt(NumBits, ArrayRef(UpperWords)));
+
+    Attribute RangeAttr = Attribute::get(C, Attribute::Range, Range);
+    auto OutAttr = unwrap(LLVMCreateConstantRangeAttribute(
+        wrap(&C), Attribute::Range, NumBits, LowerWords, UpperWords));
+    EXPECT_EQ(OutAttr, RangeAttr);
+  }
+}
+
+TEST(Attributes, CalleeAttributes) {
+  const char *IRString = R"IR(
+    declare void @f1(i32 %i)
+    declare void @f2(i32 range(i32 1, 2) %i)
+
+    define void @g1(i32 %i) {
+      call void @f1(i32 %i)
+      ret void
+    }
+    define void @g2(i32 %i) {
+      call void @f2(i32 %i)
+      ret void
+    }
+    define void @g3(i32 %i) {
+      call void @f1(i32 range(i32 3, 4) %i)
+      ret void
+    }
+    define void @g4(i32 %i) {
+      call void @f2(i32 range(i32 3, 4) %i)
+      ret void
+    }
+  )IR";
+
+  SMDiagnostic Err;
+  LLVMContext Context;
+  std::unique_ptr<Module> M = parseAssemblyString(IRString, Err, Context);
+  ASSERT_TRUE(M);
+
+  {
+    auto *I = cast<CallBase>(&M->getFunction("g1")->getEntryBlock().front());
+    ASSERT_FALSE(I->getParamAttr(0, Attribute::Range).isValid());
+  }
+  {
+    auto *I = cast<CallBase>(&M->getFunction("g2")->getEntryBlock().front());
+    ASSERT_TRUE(I->getParamAttr(0, Attribute::Range).isValid());
+  }
+  {
+    auto *I = cast<CallBase>(&M->getFunction("g3")->getEntryBlock().front());
+    ASSERT_TRUE(I->getParamAttr(0, Attribute::Range).isValid());
+  }
+  {
+    auto *I = cast<CallBase>(&M->getFunction("g4")->getEntryBlock().front());
+    ASSERT_TRUE(I->getParamAttr(0, Attribute::Range).isValid());
+  }
+}
+
 } // end anonymous namespace
diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
index f873bbd4293a..91a0745a0cc7 100644
--- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
+++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
@@ -25,8 +25,6 @@
 
 using namespace llvm;
 
-extern cl::opt<bool> UseNewDbgInfoFormat;
-
 static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
   SMDiagnostic Err;
   std::unique_ptr<Module> Mod = parseAssemblyString(IR, Err, C);
@@ -44,8 +42,6 @@ namespace {
 // by DbgVariableRecords, the dbg.value replacement.
 TEST(BasicBlockDbgInfoTest, InsertAfterSelf) {
   LLVMContext C;
-  UseNewDbgInfoFormat = true;
-
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
       call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
@@ -72,8 +68,6 @@ TEST(BasicBlockDbgInfoTest, InsertAfterSelf) {
     !11 = !DILocation(line: 1, column: 1, scope: !6)
 )");
 
-  // Convert the module to "new" form debug-info.
-  M->convertToNewDbgValues();
   // Fetch the entry block.
   BasicBlock &BB = M->getFunction("f")->getEntryBlock();
 
@@ -103,16 +97,10 @@ TEST(BasicBlockDbgInfoTest, InsertAfterSelf) {
   EXPECT_TRUE(RetInst->hasDbgRecords());
   auto Range2 = RetInst->getDbgRecordRange();
   EXPECT_EQ(std::distance(Range2.begin(), Range2.end()), 1u);
-
-  M->convertFromNewDbgValues();
-
-  UseNewDbgInfoFormat = false;
 }
 
 TEST(BasicBlockDbgInfoTest, SplitBasicBlockBefore) {
   LLVMContext C;
-  UseNewDbgInfoFormat = true;
-
   std::unique_ptr<Module> M = parseIR(C, R"---(
     define dso_local void @func() #0 !dbg !10 {
       %1 = alloca i32, align 4
@@ -150,8 +138,6 @@ TEST(BasicBlockDbgInfoTest, SplitBasicBlockBefore) {
   )---");
   ASSERT_TRUE(M);
 
-  M->convertToNewDbgValues();
-
   Function *F = M->getFunction("func");
 
   BasicBlock &BB = F->getEntryBlock();
@@ -161,14 +147,10 @@ TEST(BasicBlockDbgInfoTest, SplitBasicBlockBefore) {
   BasicBlock &BBBefore = F->getEntryBlock();
   auto I2 = std::prev(BBBefore.end(), 2);
   ASSERT_TRUE(I2->hasDbgRecords());
-
-  UseNewDbgInfoFormat = false;
 }
 
 TEST(BasicBlockDbgInfoTest, MarkerOperations) {
   LLVMContext C;
-  UseNewDbgInfoFormat = true;
-
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
       call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
@@ -196,8 +178,6 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
 
   // Fetch the entry block,
   BasicBlock &BB = M->getFunction("f")->getEntryBlock();
-  // Convert the module to "new" form debug-info.
-  M->convertToNewDbgValues();
   EXPECT_EQ(BB.size(), 2u);
 
   // Fetch out our two markers,
@@ -295,14 +275,10 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
 
   // Teardown,
   Instr1->insertBefore(BB, BB.begin());
-
-  UseNewDbgInfoFormat = false;
 }
 
 TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
   LLVMContext C;
-  UseNewDbgInfoFormat = true;
-
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
       %b = add i16 %a, 1, !dbg !11
@@ -332,8 +308,6 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
   // Test that the movement of debug-data when using moveBefore etc and
   // insertBefore etc are governed by the "head" bit of iterators.
   BasicBlock &BB = M->getFunction("f")->getEntryBlock();
-  // Convert the module to "new" form debug-info.
-  M->convertToNewDbgValues();
 
   // Test that the head bit behaves as expected: it should be set when the
   // code wants the _start_ of the block, but not otherwise.
@@ -404,14 +378,10 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
               DInst->DebugMarker->StoredDbgRecords.empty());
   EXPECT_FALSE(CInst->DebugMarker->StoredDbgRecords.empty());
   EXPECT_EQ(&*BB.begin(), CInst);
-
-  UseNewDbgInfoFormat = false;
 }
 
 TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
   LLVMContext C;
-  UseNewDbgInfoFormat = true;
-
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
       %b = add i16 %a, 1, !dbg !11
@@ -441,8 +411,6 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
   // Check that DbgVariableRecords can be accessed from Instructions without
   // digging into the depths of DbgMarkers.
   BasicBlock &BB = M->getFunction("f")->getEntryBlock();
-  // Convert the module to "new" form debug-info.
-  M->convertToNewDbgValues();
 
   Instruction *BInst = &*BB.begin();
   Instruction *CInst = BInst->getNextNode();
@@ -483,8 +451,6 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
   CInst->dropOneDbgRecord(DVR1);
   EXPECT_FALSE(CInst->hasDbgRecords());
   EXPECT_EQ(CInst->DebugMarker->StoredDbgRecords.size(), 0u);
-
-  UseNewDbgInfoFormat = false;
 }
 
 /* Let's recall the big illustration from BasicBlock::spliceDebugInfo:
@@ -577,9 +543,7 @@ protected:
   DbgVariableRecord *DVRA, *DVRB, *DVRConst;
 
   void SetUp() override {
-    UseNewDbgInfoFormat = true;
     M = parseIR(C, SpliceTestIR.c_str());
-    M->convertToNewDbgValues();
 
     BBEntry = &M->getFunction("f")->getEntryBlock();
     BBExit = BBEntry->getNextNode();
@@ -599,8 +563,6 @@ protected:
         cast<DbgVariableRecord>(&*CInst->DebugMarker->StoredDbgRecords.begin());
   }
 
-  void TearDown() override { UseNewDbgInfoFormat = false; }
-
   bool InstContainsDbgVariableRecord(Instruction *I, DbgVariableRecord *DVR) {
     for (DbgRecord &D : I->getDbgRecordRange()) {
       if (&D == DVR) {
@@ -1187,8 +1149,6 @@ metadata !9, metadata !DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1,
 // then the trailing DbgVariableRecords should get flushed back out.
 TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
   LLVMContext C;
-  UseNewDbgInfoFormat = true;
-
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
     entry:
@@ -1219,7 +1179,6 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
 
   BasicBlock &Entry = M->getFunction("f")->getEntryBlock();
   BasicBlock &Exit = *Entry.getNextNode();
-  M->convertToNewDbgValues();
 
   // Begin by forcing entry block to have dangling DbgVariableRecord.
   Entry.getTerminator()->eraseFromParent();
@@ -1234,8 +1193,6 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
   Instruction *BInst = &*Entry.begin();
   ASSERT_TRUE(BInst->DebugMarker);
   EXPECT_EQ(BInst->DebugMarker->StoredDbgRecords.size(), 1u);
-
-  UseNewDbgInfoFormat = false;
 }
 
 // When we remove instructions from the program, adjacent DbgVariableRecords
@@ -1244,8 +1201,6 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
 // dbg.values. Test that this can be replicated correctly by DbgVariableRecords
 TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
   LLVMContext C;
-  UseNewDbgInfoFormat = true;
-
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
     entry:
@@ -1273,7 +1228,6 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
 )");
 
   BasicBlock &Entry = M->getFunction("f")->getEntryBlock();
-  M->convertToNewDbgValues();
 
   // Fetch the relevant instructions from the converted function.
   Instruction *SubInst = &*Entry.begin();
@@ -1316,16 +1270,12 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
   EXPECT_EQ(std::distance(R4.begin(), R4.end()), 1u);
   auto R5 = RetInst->getDbgRecordRange();
   EXPECT_EQ(std::distance(R5.begin(), R5.end()), 1u);
-
-  UseNewDbgInfoFormat = false;
 }
 
 // Test instruction removal and re-insertion, this time with one
 // DbgVariableRecord that should hop up one instruction.
 TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) {
   LLVMContext C;
-  UseNewDbgInfoFormat = true;
-
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
     entry:
@@ -1352,7 +1302,6 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) {
 )");
 
   BasicBlock &Entry = M->getFunction("f")->getEntryBlock();
-  M->convertToNewDbgValues();
 
   // Fetch the relevant instructions from the converted function.
   Instruction *SubInst = &*Entry.begin();
@@ -1391,8 +1340,6 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) {
   EXPECT_FALSE(RetInst->hasDbgRecords());
   auto R3 = AddInst->getDbgRecordRange();
   EXPECT_EQ(std::distance(R3.begin(), R3.end()), 1u);
-
-  UseNewDbgInfoFormat = false;
 }
 
 // Similar to the above, what if we splice into an empty block with debug-info,
@@ -1401,8 +1348,6 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) {
 // of the i16 0 dbg.value.
 TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
   LLVMContext C;
-  UseNewDbgInfoFormat = true;
-
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
     entry:
@@ -1436,7 +1381,6 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
   Function &F = *M->getFunction("f");
   BasicBlock &Entry = F.getEntryBlock();
   BasicBlock &Exit = *Entry.getNextNode();
-  M->convertToNewDbgValues();
 
   // Begin by forcing entry block to have dangling DbgVariableRecord.
   Entry.getTerminator()->eraseFromParent();
@@ -1463,16 +1407,12 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
 
   // No trailing DbgVariableRecords in the entry block now.
   EXPECT_EQ(Entry.getTrailingDbgRecords(), nullptr);
-
-  UseNewDbgInfoFormat = false;
 }
 
 // Similar test again, but this time: splice the contents of exit into entry,
 // with the intention of leaving the first dbg.value (i16 0) behind.
 TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) {
   LLVMContext C;
-  UseNewDbgInfoFormat = true;
-
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
     entry:
@@ -1506,7 +1446,6 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) {
   Function &F = *M->getFunction("f");
   BasicBlock &Entry = F.getEntryBlock();
   BasicBlock &Exit = *Entry.getNextNode();
-  M->convertToNewDbgValues();
 
   // Begin by forcing entry block to have dangling DbgVariableRecord.
   Entry.getTerminator()->eraseFromParent();
@@ -1537,16 +1476,12 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) {
   EXPECT_FALSE(Exit.getTrailingDbgRecords()->empty());
   Exit.getTrailingDbgRecords()->eraseFromParent();
   Exit.deleteTrailingDbgRecords();
-
-  UseNewDbgInfoFormat = false;
 }
 
 // What if we moveBefore end() -- there might be no debug-info there, in which
 // case we shouldn't crash.
 TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) {
   LLVMContext C;
-  UseNewDbgInfoFormat = true;
-
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
     entry:
@@ -1576,7 +1511,6 @@ TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) {
   Function &F = *M->getFunction("f");
   BasicBlock &Entry = F.getEntryBlock();
   BasicBlock &Exit = *Entry.getNextNode();
-  M->convertToNewDbgValues();
 
   // Move the return to the end of the entry block.
   Instruction *Br = Entry.getTerminator();
@@ -1589,8 +1523,6 @@ TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) {
   EXPECT_EQ(Entry.getTrailingDbgRecords(), nullptr);
   EXPECT_EQ(Exit.getTrailingDbgRecords(), nullptr);
   EXPECT_FALSE(Ret->hasDbgRecords());
-
-  UseNewDbgInfoFormat = false;
 }
 
 } // End anonymous namespace.
diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp
index d06b979bf4a1..cac8acbe15a7 100644
--- a/llvm/unittests/IR/DebugInfoTest.cpp
+++ b/llvm/unittests/IR/DebugInfoTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/IR/DebugInfo.h"
+#include "../lib/IR/LLVMContextImpl.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/DIBuilder.h"
@@ -20,6 +21,7 @@
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Transforms/Utils/Local.h"
+
 #include "gtest/gtest.h"
 
 using namespace llvm;
@@ -154,7 +156,7 @@ TEST(StripTest, LoopMetadata) {
   EXPECT_FALSE(BrokenDebugInfo);
 }
 
-TEST(MetadataTest, DeleteInstUsedByDbgValue) {
+TEST(MetadataTest, DeleteInstUsedByDbgRecord) {
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
@@ -185,12 +187,13 @@ TEST(MetadataTest, DeleteInstUsedByDbgValue) {
 
   // Find the dbg.value using %b.
   SmallVector<DbgValueInst *, 1> DVIs;
-  findDbgValues(DVIs, &I);
+  SmallVector<DbgVariableRecord *, 1> DVRs;
+  findDbgValues(DVIs, &I, &DVRs);
 
   // Delete %b. The dbg.value should now point to undef.
   I.eraseFromParent();
-  EXPECT_EQ(DVIs[0]->getNumVariableLocationOps(), 1u);
-  EXPECT_TRUE(isa<UndefValue>(DVIs[0]->getValue(0)));
+  EXPECT_EQ(DVRs[0]->getNumVariableLocationOps(), 1u);
+  EXPECT_TRUE(isa<UndefValue>(DVRs[0]->getValue(0)));
 }
 
 TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) {
@@ -228,8 +231,8 @@ TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) {
 
   // Get the dbg.declare.
   Function &F = *cast<Function>(M->getNamedValue("fun"));
-  DbgVariableIntrinsic *DbgDeclare =
-      cast<DbgVariableIntrinsic>(&F.front().front());
+  DbgVariableRecord *DbgDeclare =
+      cast<DbgVariableRecord>(&*F.front().front().getDbgRecordRange().begin());
   // Check that this form counts as a "no location" marker.
   EXPECT_TRUE(DbgDeclare->isKillLocation());
 }
@@ -237,6 +240,9 @@ TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) {
 // Duplicate of above test, but in DbgVariableRecord representation.
 TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) {
   LLVMContext C;
+  bool OldDbgValueMode = UseNewDbgInfoFormat;
+  UseNewDbgInfoFormat = true;
+
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
       %b = add i16 %a, 1, !dbg !11
@@ -262,10 +268,7 @@ TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) {
     !11 = !DILocation(line: 1, column: 1, scope: !6)
 )");
 
-  bool OldDbgValueMode = UseNewDbgInfoFormat;
-  UseNewDbgInfoFormat = true;
   Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI();
-  M->convertToNewDbgValues();
 
   // Find the DbgVariableRecords using %b.
   SmallVector<DbgValueInst *, 2> DVIs;
@@ -287,6 +290,8 @@ TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) {
 // Ensure that the order of dbg.value intrinsics returned by findDbgValues, and
 // their corresponding DbgVariableRecord representation, are consistent.
 TEST(MetadataTest, OrderingOfDbgVariableRecords) {
+  bool OldDbgValueMode = UseNewDbgInfoFormat;
+  UseNewDbgInfoFormat = false;
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
@@ -314,8 +319,6 @@ TEST(MetadataTest, OrderingOfDbgVariableRecords) {
     !12 = !DILocalVariable(name: "bar", scope: !6, file: !1, line: 1, type: !10)
 )");
 
-  bool OldDbgValueMode = UseNewDbgInfoFormat;
-  UseNewDbgInfoFormat = true;
   Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI();
 
   SmallVector<DbgValueInst *, 2> DVIs;
@@ -349,7 +352,7 @@ TEST(MetadataTest, OrderingOfDbgVariableRecords) {
   UseNewDbgInfoFormat = OldDbgValueMode;
 }
 
-TEST(DIBuiler, CreateFile) {
+TEST(DIBuilder, CreateFile) {
   LLVMContext Ctx;
   std::unique_ptr<Module> M(new Module("MyModule", Ctx));
   DIBuilder DIB(*M);
@@ -513,14 +516,15 @@ TEST(DbgAssignIntrinsicTest, replaceVariableLocationOp) {
   Value *V1 = Fun.getArg(0);
   Value *P1 = Fun.getArg(1);
   Value *P2 = Fun.getArg(2);
-  DbgAssignIntrinsic *DAI = cast<DbgAssignIntrinsic>(Fun.begin()->begin());
-  ASSERT_TRUE(V1 == DAI->getVariableLocationOp(0));
-  ASSERT_TRUE(P1 == DAI->getAddress());
+  DbgVariableRecord *DbgAssign = cast<DbgVariableRecord>(
+      &*Fun.front().front().getDbgRecordRange().begin());
+  ASSERT_TRUE(V1 == DbgAssign->getVariableLocationOp(0));
+  ASSERT_TRUE(P1 == DbgAssign->getAddress());
 
 #define TEST_REPLACE(Old, New, ExpectedValue, ExpectedAddr)                    \
-  DAI->replaceVariableLocationOp(Old, New);                                    \
-  EXPECT_EQ(DAI->getVariableLocationOp(0), ExpectedValue);                     \
-  EXPECT_EQ(DAI->getAddress(), ExpectedAddr);
+  DbgAssign->replaceVariableLocationOp(Old, New);                              \
+  EXPECT_EQ(DbgAssign->getVariableLocationOp(0), ExpectedValue);               \
+  EXPECT_EQ(DbgAssign->getAddress(), ExpectedAddr);
 
   // Replace address only.
   TEST_REPLACE(/*Old*/ P1, /*New*/ P2, /*Value*/ V1, /*Address*/ P2);
@@ -531,8 +535,8 @@ TEST(DbgAssignIntrinsicTest, replaceVariableLocationOp) {
 
   // Replace address only, value uses a DIArgList.
   // Value = {DIArgList(V1)}, Addr = P1.
-  DAI->setRawLocation(DIArgList::get(C, ValueAsMetadata::get(V1)));
-  DAI->setExpression(DIExpression::get(
+  DbgAssign->setRawLocation(DIArgList::get(C, ValueAsMetadata::get(V1)));
+  DbgAssign->setExpression(DIExpression::get(
       C, {dwarf::DW_OP_LLVM_arg, 0, dwarf::DW_OP_stack_value}));
   TEST_REPLACE(/*Old*/ P1, /*New*/ P2, /*Value*/ V1, /*Address*/ P2);
 #undef TEST_REPLACE
@@ -618,11 +622,11 @@ TEST(AssignmentTrackingTest, Utils) {
   //
   // Check there are two llvm.dbg.assign intrinsics linked to Alloca.
   auto CheckFun1Mapping = [&Alloca]() {
-    auto Markers = at::getAssignmentMarkers(&Alloca);
+    auto Markers = at::getDVRAssignmentMarkers(&Alloca);
     EXPECT_TRUE(std::distance(Markers.begin(), Markers.end()) == 2);
     // Check those two entries are distinct.
-    DbgAssignIntrinsic *First = *Markers.begin();
-    DbgAssignIntrinsic *Second = *std::next(Markers.begin());
+    DbgVariableRecord *First = *Markers.begin();
+    DbgVariableRecord *Second = *std::next(Markers.begin());
     EXPECT_NE(First, Second);
 
     // Check that we can get back to Alloca from each llvm.dbg.assign.
@@ -658,7 +662,7 @@ TEST(AssignmentTrackingTest, Utils) {
   DIAssignID *Fun2ID = cast_or_null<DIAssignID>(
       Fun2Alloca.getMetadata(LLVMContext::MD_DIAssignID));
   EXPECT_NE(New, Fun2ID);
-  auto Fun2Markers = at::getAssignmentMarkers(&Fun2Alloca);
+  auto Fun2Markers = at::getDVRAssignmentMarkers(&Fun2Alloca);
   ASSERT_TRUE(std::distance(Fun2Markers.begin(), Fun2Markers.end()) == 1);
   auto Fun2Insts = at::getAssignmentInsts(*Fun2Markers.begin());
   ASSERT_TRUE(std::distance(Fun2Insts.begin(), Fun2Insts.end()) == 1);
@@ -667,10 +671,10 @@ TEST(AssignmentTrackingTest, Utils) {
   // 3. Check that deleting dbg.assigns from a specific instruction works.
   Instruction &Fun3Alloca =
       *M->getFunction("fun3")->getEntryBlock().getFirstNonPHIOrDbg();
-  auto Fun3Markers = at::getAssignmentMarkers(&Fun3Alloca);
+  auto Fun3Markers = at::getDVRAssignmentMarkers(&Fun3Alloca);
   ASSERT_TRUE(std::distance(Fun3Markers.begin(), Fun3Markers.end()) == 1);
   at::deleteAssignmentMarkers(&Fun3Alloca);
-  Fun3Markers = at::getAssignmentMarkers(&Fun3Alloca);
+  Fun3Markers = at::getDVRAssignmentMarkers(&Fun3Alloca);
   EXPECT_EQ(Fun3Markers.empty(), true);
 
   // 4. Check that deleting works and applies only to the target function.
@@ -681,7 +685,7 @@ TEST(AssignmentTrackingTest, Utils) {
   // llvm.dbg.assign.
   EXPECT_EQ(Fun2ID, cast_or_null<DIAssignID>(
                         Fun2Alloca.getMetadata(LLVMContext::MD_DIAssignID)));
-  EXPECT_FALSE(at::getAssignmentMarkers(&Fun2Alloca).empty());
+  EXPECT_FALSE(at::getDVRAssignmentMarkers(&Fun2Alloca).empty());
 }
 
 TEST(IRBuilder, GetSetInsertionPointWithEmptyBasicBlock) {
@@ -767,12 +771,12 @@ TEST(AssignmentTrackingTest, InstrMethods) {
   // Use SetVectors to check that the attachments and markers are unique
   // (another test requirement).
   SetVector<Metadata *> OrigIDs;
-  SetVector<DbgAssignIntrinsic *> Markers;
+  SetVector<DbgVariableRecord *> Markers;
   for (const Instruction *SI : Stores) {
     Metadata *ID = SI->getMetadata(LLVMContext::MD_DIAssignID);
     ASSERT_TRUE(OrigIDs.insert(ID));
     ASSERT_TRUE(ID != nullptr);
-    auto Range = at::getAssignmentMarkers(SI);
+    auto Range = at::getDVRAssignmentMarkers(SI);
     ASSERT_TRUE(std::distance(Range.begin(), Range.end()) == 1);
     ASSERT_TRUE(Markers.insert(*Range.begin()));
   }
@@ -865,6 +869,8 @@ TEST(AssignmentTrackingTest, InstrMethods) {
 // dbg.values that have been converted to a non-instruction format.
 TEST(MetadataTest, ConvertDbgToDbgVariableRecord) {
   LLVMContext C;
+  bool OldDbgValueMode = UseNewDbgInfoFormat;
+  UseNewDbgInfoFormat = false;
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
       call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
@@ -1039,14 +1045,14 @@ TEST(MetadataTest, ConvertDbgToDbgVariableRecord) {
   // The record of those trailing DbgVariableRecords would dangle and cause an
   // assertion failure if it lived until the end of the LLVMContext.
   ExitBlock->deleteTrailingDbgRecords();
+  UseNewDbgInfoFormat = OldDbgValueMode;
 }
 
 TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
   LLVMContext C;
 
-  // For the purpose of this test, set and un-set the command line option
-  // corresponding to UseNewDbgInfoFormat.
-  UseNewDbgInfoFormat = true;
+  bool OldDbgValueMode = UseNewDbgInfoFormat;
+  UseNewDbgInfoFormat = false;
 
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
@@ -1077,6 +1083,11 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
     !11 = !DILocation(line: 1, column: 1, scope: !6)
 )");
 
+  // For the purpose of this test, set and un-set the command line option
+  // corresponding to UseNewDbgInfoFormat, but only after parsing, to ensure
+  // that the IR starts off in the old format.
+  UseNewDbgInfoFormat = true;
+
   // Check that the conversion routines and utilities between dbg.value
   // debug-info format and DbgVariableRecords works.
   Function *F = M->getFunction("f");
@@ -1181,7 +1192,56 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
   EXPECT_EQ(DVI2->getVariable(), DLV2);
   EXPECT_EQ(DVI2->getExpression(), Expr2);
 
-  UseNewDbgInfoFormat = false;
+  UseNewDbgInfoFormat = OldDbgValueMode;
+}
+
+// Test that the hashing function for DISubprograms representing methods produce
+// the same result after replacing their scope (the type containing the
+// subprogram) from a temporary DIType with the permanent one.
+TEST(DIBuilder, HashingDISubprogram) {
+  LLVMContext Ctx;
+  std::unique_ptr<Module> M = std::make_unique<Module>("MyModule", Ctx);
+  DIBuilder DIB(*M);
+
+  DIFile *F = DIB.createFile("main.c", "/");
+  DICompileUnit *CU =
+      DIB.createCompileUnit(dwarf::DW_LANG_C, F, "Test", false, "", 0);
+
+  llvm::TempDIType ForwardDeclaredType =
+      llvm::TempDIType(DIB.createReplaceableCompositeType(
+          llvm::dwarf::DW_TAG_structure_type, "MyType", CU, F, 0, 0, 8, 8, {},
+          "UniqueIdentifier"));
+
+  // The hashing function is different for declarations and definitions, so
+  // create one of each.
+  DISubprogram *Declaration =
+      DIB.createMethod(ForwardDeclaredType.get(), "MethodName", "LinkageName",
+                       F, 0, DIB.createSubroutineType({}));
+
+  DISubprogram *Definition = DIB.createFunction(
+      ForwardDeclaredType.get(), "MethodName", "LinkageName", F, 0,
+      DIB.createSubroutineType({}), 0, DINode::FlagZero,
+      llvm::DISubprogram::SPFlagDefinition, nullptr, Declaration);
+
+  // Produce the hash with the temporary scope.
+  unsigned HashDeclaration =
+      MDNodeKeyImpl<DISubprogram>(Declaration).getHashValue();
+  unsigned HashDefinition =
+      MDNodeKeyImpl<DISubprogram>(Definition).getHashValue();
+
+  // Instantiate the real scope and replace the temporary one with it.
+  DICompositeType *Type = DIB.createStructType(CU, "MyType", F, 0, 8, 8, {}, {},
+                                               {}, 0, {}, "UniqueIdentifier");
+  DIB.replaceTemporary(std::move(ForwardDeclaredType), Type);
+
+  // Now make sure the hashing is consistent.
+  unsigned HashDeclarationAfter =
+      MDNodeKeyImpl<DISubprogram>(Declaration).getHashValue();
+  unsigned HashDefinitionAfter =
+      MDNodeKeyImpl<DISubprogram>(Definition).getHashValue();
+
+  EXPECT_EQ(HashDeclaration, HashDeclarationAfter);
+  EXPECT_EQ(HashDefinition, HashDefinitionAfter);
 }
 
 } // end namespace
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index 2001df090aed..ff96df858120 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -994,17 +994,17 @@ TEST_F(IRBuilderTest, DIBuilder) {
     EXPECT_TRUE(verifyModule(*M));
   };
 
-  // Test in old-debug mode.
-  EXPECT_FALSE(M->IsNewDbgInfoFormat);
+  // Test in new-debug mode.
+  EXPECT_TRUE(M->IsNewDbgInfoFormat);
   RunTest();
 
-  // Test in new-debug mode.
-  // Reset the test then call convertToNewDbgValues to flip the flag
+  // Test in old-debug mode.
+  // Reset the test then call convertFromNewDbgValues to flip the flag
   // on the test's Module, Function and BasicBlock.
   TearDown();
   SetUp();
-  M->convertToNewDbgValues();
-  EXPECT_TRUE(M->IsNewDbgInfoFormat);
+  M->convertFromNewDbgValues();
+  EXPECT_FALSE(M->IsNewDbgInfoFormat);
   RunTest();
 }
 
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index b47c73f0b329..b6044b286292 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -25,12 +25,15 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/NoFolder.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm-c/Core.h"
 #include "gmock/gmock-matchers.h"
 #include "gtest/gtest.h"
 #include <memory>
 
+extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
+
 namespace llvm {
 namespace {
 
@@ -1460,6 +1463,8 @@ TEST(InstructionsTest, GetSplat) {
 
 TEST(InstructionsTest, SkipDebug) {
   LLVMContext C;
+  bool OldDbgValueMode = UseNewDbgInfoFormat;
+  UseNewDbgInfoFormat = false;
   std::unique_ptr<Module> M = parseIR(C,
                                       R"(
       declare void @llvm.dbg.value(metadata, metadata, metadata)
@@ -1495,6 +1500,7 @@ TEST(InstructionsTest, SkipDebug) {
 
   // After the terminator, there are no non-debug instructions.
   EXPECT_EQ(nullptr, Term->getNextNonDebugInstruction());
+  UseNewDbgInfoFormat = OldDbgValueMode;
 }
 
 TEST(InstructionsTest, PhiMightNotBeFPMathOperator) {
diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp
index a25885faa3a4..9f91b4f3f993 100644
--- a/llvm/unittests/IR/PatternMatch.cpp
+++ b/llvm/unittests/IR/PatternMatch.cpp
@@ -611,6 +611,128 @@ TEST_F(PatternMatchTest, BitCast) {
   EXPECT_FALSE(m_ElementWiseBitCast(m_Value()).match(NXV2I64ToNXV4I32));
 }
 
+TEST_F(PatternMatchTest, CheckedInt) {
+  Type *I8Ty = IRB.getInt8Ty();
+  const Constant * CRes = nullptr;
+  auto CheckUgt1 = [](const APInt &C) { return C.ugt(1); };
+  auto CheckTrue = [](const APInt &) { return true; };
+  auto CheckFalse = [](const APInt &) { return false; };
+  auto CheckNonZero = [](const APInt &C) { return !C.isZero(); };
+  auto CheckPow2 = [](const APInt &C) { return C.isPowerOf2(); };
+
+  auto DoScalarCheck = [&](int8_t Val) {
+    APInt APVal(8, Val);
+    Constant *C = ConstantInt::get(I8Ty, Val);
+
+    CRes = nullptr;
+    EXPECT_TRUE(m_CheckedInt(CheckTrue).match(C));
+    EXPECT_TRUE(m_CheckedInt(CRes, CheckTrue).match(C));
+    EXPECT_EQ(CRes, C);
+
+    CRes = nullptr;
+    EXPECT_FALSE(m_CheckedInt(CheckFalse).match(C));
+    EXPECT_FALSE(m_CheckedInt(CRes, CheckFalse).match(C));
+    EXPECT_EQ(CRes, nullptr);
+
+    CRes = nullptr;
+    EXPECT_EQ(CheckUgt1(APVal), m_CheckedInt(CheckUgt1).match(C));
+    EXPECT_EQ(CheckUgt1(APVal), m_CheckedInt(CRes, CheckUgt1).match(C));
+    if (CheckUgt1(APVal))
+      EXPECT_EQ(CRes, C);
+
+    CRes = nullptr;
+    EXPECT_EQ(CheckNonZero(APVal), m_CheckedInt(CheckNonZero).match(C));
+    EXPECT_EQ(CheckNonZero(APVal), m_CheckedInt(CRes, CheckNonZero).match(C));
+    if (CheckNonZero(APVal))
+      EXPECT_EQ(CRes, C);
+
+    CRes = nullptr;
+    EXPECT_EQ(CheckPow2(APVal), m_CheckedInt(CheckPow2).match(C));
+    EXPECT_EQ(CheckPow2(APVal), m_CheckedInt(CRes, CheckPow2).match(C));
+    if (CheckPow2(APVal))
+      EXPECT_EQ(CRes, C);
+
+  };
+
+  DoScalarCheck(0);
+  DoScalarCheck(1);
+  DoScalarCheck(2);
+  DoScalarCheck(3);
+
+  EXPECT_FALSE(m_CheckedInt(CheckTrue).match(UndefValue::get(I8Ty)));
+  EXPECT_FALSE(m_CheckedInt(CRes, CheckTrue).match(UndefValue::get(I8Ty)));
+  EXPECT_EQ(CRes, nullptr);
+
+  EXPECT_FALSE(m_CheckedInt(CheckFalse).match(UndefValue::get(I8Ty)));
+  EXPECT_FALSE(m_CheckedInt(CRes, CheckFalse).match(UndefValue::get(I8Ty)));
+  EXPECT_EQ(CRes, nullptr);
+
+  EXPECT_FALSE(m_CheckedInt(CheckTrue).match(PoisonValue::get(I8Ty)));
+  EXPECT_FALSE(m_CheckedInt(CRes, CheckTrue).match(PoisonValue::get(I8Ty)));
+  EXPECT_EQ(CRes, nullptr);
+
+  EXPECT_FALSE(m_CheckedInt(CheckFalse).match(PoisonValue::get(I8Ty)));
+  EXPECT_FALSE(m_CheckedInt(CRes, CheckFalse).match(PoisonValue::get(I8Ty)));
+  EXPECT_EQ(CRes, nullptr);
+
+  auto DoVecCheckImpl = [&](ArrayRef<std::optional<int8_t>> Vals,
+                            function_ref<bool(const APInt &)> CheckFn,
+                            bool UndefAsPoison) {
+    SmallVector<Constant *> VecElems;
+    std::optional<bool> Okay;
+    bool AllSame = true;
+    bool HasUndef = false;
+    std::optional<APInt> First;
+    for (const std::optional<int8_t> &Val : Vals) {
+      if (!Val.has_value()) {
+        VecElems.push_back(UndefAsPoison ? PoisonValue::get(I8Ty)
+                                         : UndefValue::get(I8Ty));
+        HasUndef = true;
+      } else {
+        if (!Okay.has_value())
+          Okay = true;
+        APInt APVal(8, *Val);
+        if (!First.has_value())
+          First = APVal;
+        else
+          AllSame &= First->eq(APVal);
+        Okay = *Okay && CheckFn(APVal);
+        VecElems.push_back(ConstantInt::get(I8Ty, *Val));
+      }
+    }
+
+    Constant *C = ConstantVector::get(VecElems);
+    EXPECT_EQ(!(HasUndef && !UndefAsPoison) && Okay.value_or(false),
+              m_CheckedInt(CheckFn).match(C));
+
+    CRes = nullptr;
+    bool Expec = !(HasUndef && !UndefAsPoison) && Okay.value_or(false);
+    EXPECT_EQ(Expec, m_CheckedInt(CRes, CheckFn).match(C));
+    if (Expec) {
+      EXPECT_NE(CRes, nullptr);
+      if (AllSame)
+        EXPECT_EQ(CRes, C);
+    }
+  };
+  auto DoVecCheck = [&](ArrayRef<std::optional<int8_t>> Vals) {
+    DoVecCheckImpl(Vals, CheckTrue, /*UndefAsPoison=*/false);
+    DoVecCheckImpl(Vals, CheckFalse, /*UndefAsPoison=*/false);
+    DoVecCheckImpl(Vals, CheckTrue, /*UndefAsPoison=*/true);
+    DoVecCheckImpl(Vals, CheckFalse, /*UndefAsPoison=*/true);
+    DoVecCheckImpl(Vals, CheckUgt1, /*UndefAsPoison=*/false);
+    DoVecCheckImpl(Vals, CheckNonZero, /*UndefAsPoison=*/false);
+    DoVecCheckImpl(Vals, CheckPow2, /*UndefAsPoison=*/false);
+  };
+
+  DoVecCheck({0, 1});
+  DoVecCheck({1, 1});
+  DoVecCheck({1, 2});
+  DoVecCheck({1, std::nullopt});
+  DoVecCheck({1, std::nullopt, 1});
+  DoVecCheck({1, std::nullopt, 2});
+  DoVecCheck({std::nullopt, std::nullopt, std::nullopt});
+}
+
 TEST_F(PatternMatchTest, Power2) {
   Value *C128 = IRB.getInt32(128);
   Value *CNeg128 = ConstantExpr::getNeg(cast<Constant>(C128));
@@ -1397,21 +1519,59 @@ TEST_F(PatternMatchTest, VectorUndefFloat) {
   EXPECT_FALSE(match(VectorInfPoison, m_Finite()));
   EXPECT_FALSE(match(VectorNaNPoison, m_Finite()));
 
+  auto CheckTrue = [](const APFloat &) { return true; };
+  EXPECT_FALSE(match(VectorZeroUndef, m_CheckedFp(CheckTrue)));
+  EXPECT_TRUE(match(VectorZeroPoison, m_CheckedFp(CheckTrue)));
+  EXPECT_TRUE(match(ScalarPosInf, m_CheckedFp(CheckTrue)));
+  EXPECT_TRUE(match(ScalarNegInf, m_CheckedFp(CheckTrue)));
+  EXPECT_TRUE(match(ScalarNaN, m_CheckedFp(CheckTrue)));
+  EXPECT_FALSE(match(VectorInfUndef, m_CheckedFp(CheckTrue)));
+  EXPECT_TRUE(match(VectorInfPoison, m_CheckedFp(CheckTrue)));
+  EXPECT_FALSE(match(VectorNaNUndef, m_CheckedFp(CheckTrue)));
+  EXPECT_TRUE(match(VectorNaNPoison, m_CheckedFp(CheckTrue)));
+
+  auto CheckFalse = [](const APFloat &) { return false; };
+  EXPECT_FALSE(match(VectorZeroUndef, m_CheckedFp(CheckFalse)));
+  EXPECT_FALSE(match(VectorZeroPoison, m_CheckedFp(CheckFalse)));
+  EXPECT_FALSE(match(ScalarPosInf, m_CheckedFp(CheckFalse)));
+  EXPECT_FALSE(match(ScalarNegInf, m_CheckedFp(CheckFalse)));
+  EXPECT_FALSE(match(ScalarNaN, m_CheckedFp(CheckFalse)));
+  EXPECT_FALSE(match(VectorInfUndef, m_CheckedFp(CheckFalse)));
+  EXPECT_FALSE(match(VectorInfPoison, m_CheckedFp(CheckFalse)));
+  EXPECT_FALSE(match(VectorNaNUndef, m_CheckedFp(CheckFalse)));
+  EXPECT_FALSE(match(VectorNaNPoison, m_CheckedFp(CheckFalse)));
+
+  auto CheckNonNaN = [](const APFloat &C) { return !C.isNaN(); };
+  EXPECT_FALSE(match(VectorZeroUndef, m_CheckedFp(CheckNonNaN)));
+  EXPECT_TRUE(match(VectorZeroPoison, m_CheckedFp(CheckNonNaN)));
+  EXPECT_TRUE(match(ScalarPosInf, m_CheckedFp(CheckNonNaN)));
+  EXPECT_TRUE(match(ScalarNegInf, m_CheckedFp(CheckNonNaN)));
+  EXPECT_FALSE(match(ScalarNaN, m_CheckedFp(CheckNonNaN)));
+  EXPECT_FALSE(match(VectorInfUndef, m_CheckedFp(CheckNonNaN)));
+  EXPECT_TRUE(match(VectorInfPoison, m_CheckedFp(CheckNonNaN)));
+  EXPECT_FALSE(match(VectorNaNUndef, m_CheckedFp(CheckNonNaN)));
+  EXPECT_FALSE(match(VectorNaNPoison, m_CheckedFp(CheckNonNaN)));
+
   const APFloat *C;
+  const Constant *CC;
   // Regardless of whether poison is allowed,
   // a fully undef/poison constant does not match.
   EXPECT_FALSE(match(ScalarUndef, m_APFloat(C)));
   EXPECT_FALSE(match(ScalarUndef, m_APFloatForbidPoison(C)));
   EXPECT_FALSE(match(ScalarUndef, m_APFloatAllowPoison(C)));
+  EXPECT_FALSE(match(ScalarUndef, m_CheckedFp(CC, CheckTrue)));
   EXPECT_FALSE(match(VectorUndef, m_APFloat(C)));
   EXPECT_FALSE(match(VectorUndef, m_APFloatForbidPoison(C)));
   EXPECT_FALSE(match(VectorUndef, m_APFloatAllowPoison(C)));
+  EXPECT_FALSE(match(VectorUndef, m_CheckedFp(CC, CheckTrue)));
   EXPECT_FALSE(match(ScalarPoison, m_APFloat(C)));
   EXPECT_FALSE(match(ScalarPoison, m_APFloatForbidPoison(C)));
   EXPECT_FALSE(match(ScalarPoison, m_APFloatAllowPoison(C)));
+  EXPECT_FALSE(match(ScalarPoison, m_CheckedFp(CC, CheckTrue)));
   EXPECT_FALSE(match(VectorPoison, m_APFloat(C)));
   EXPECT_FALSE(match(VectorPoison, m_APFloatForbidPoison(C)));
   EXPECT_FALSE(match(VectorPoison, m_APFloatAllowPoison(C)));
+  EXPECT_FALSE(match(VectorPoison, m_CheckedFp(CC, CheckTrue)));
 
   // We can always match simple constants and simple splats.
   C = nullptr;
@@ -1433,6 +1593,13 @@ TEST_F(PatternMatchTest, VectorUndefFloat) {
   EXPECT_TRUE(match(VectorZero, m_APFloatAllowPoison(C)));
   EXPECT_TRUE(C->isZero());
 
+  CC = nullptr;
+  EXPECT_TRUE(match(VectorZero, m_CheckedFp(CC, CheckTrue)));
+  EXPECT_TRUE(CC->isNullValue());
+  CC = nullptr;
+  EXPECT_TRUE(match(VectorZero, m_CheckedFp(CC, CheckNonNaN)));
+  EXPECT_TRUE(CC->isNullValue());
+
   // Splats with undef are never allowed.
   // Whether splats with poison can be matched depends on the matcher.
   EXPECT_FALSE(match(VectorZeroUndef, m_APFloat(C)));
@@ -1456,6 +1623,18 @@ TEST_F(PatternMatchTest, VectorUndefFloat) {
   C = nullptr;
   EXPECT_TRUE(match(VectorZeroPoison, m_Finite(C)));
   EXPECT_TRUE(C->isZero());
+  CC = nullptr;
+  C = nullptr;
+  EXPECT_TRUE(match(VectorZeroPoison, m_CheckedFp(CC, CheckTrue)));
+  EXPECT_NE(CC, nullptr);
+  EXPECT_TRUE(match(CC, m_APFloatAllowPoison(C)));
+  EXPECT_TRUE(C->isZero());
+  CC = nullptr;
+  C = nullptr;
+  EXPECT_TRUE(match(VectorZeroPoison, m_CheckedFp(CC, CheckNonNaN)));
+  EXPECT_NE(CC, nullptr);
+  EXPECT_TRUE(match(CC, m_APFloatAllowPoison(C)));
+  EXPECT_TRUE(C->isZero());
 }
 
 TEST_F(PatternMatchTest, FloatingPointFNeg) {
diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp
index 626ab2e9a9c5..d6508abd5197 100644
--- a/llvm/unittests/IR/VPIntrinsicTest.cpp
+++ b/llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -27,7 +27,8 @@ namespace {
 static const char *ReductionIntOpcodes[] = {
     "add", "mul", "and", "or", "xor", "smin", "smax", "umin", "umax"};
 
-static const char *ReductionFPOpcodes[] = {"fadd", "fmul", "fmin", "fmax"};
+static const char *ReductionFPOpcodes[] = {"fadd", "fmul",     "fmin",
+                                           "fmax", "fminimum", "fmaximum"};
 
 class VPIntrinsicTest : public testing::Test {
 protected:
diff --git a/llvm/unittests/IR/ValueTest.cpp b/llvm/unittests/IR/ValueTest.cpp
index 246c2fc7fe40..33a86d510d45 100644
--- a/llvm/unittests/IR/ValueTest.cpp
+++ b/llvm/unittests/IR/ValueTest.cpp
@@ -13,6 +13,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
 using namespace llvm;
@@ -255,6 +256,8 @@ TEST(ValueTest, getLocalSlotDeath) {
 TEST(ValueTest, replaceUsesOutsideBlock) {
   // Check that Value::replaceUsesOutsideBlock(New, BB) replaces uses outside
   // BB, including dbg.* uses of MetadataAsValue(ValueAsMetadata(this)).
+  bool OldDbgValueMode = UseNewDbgInfoFormat;
+  UseNewDbgInfoFormat = false;
   const auto *IR = R"(
     define i32 @f() !dbg !6 {
     entry:
@@ -315,6 +318,7 @@ TEST(ValueTest, replaceUsesOutsideBlock) {
   // These users are outside Entry so should be changed.
   ASSERT_TRUE(ExitDbg->getValue(0) == cast<Value>(B));
   ASSERT_TRUE(Ret->getOperand(0) == cast<Value>(B));
+  UseNewDbgInfoFormat = OldDbgValueMode;
 }
 
 TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) {
@@ -359,10 +363,6 @@ TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) {
   if (!M)
     Err.print("ValueTest", errs());
 
-  bool OldDbgValueMode = UseNewDbgInfoFormat;
-  UseNewDbgInfoFormat = true;
-  M->convertToNewDbgValues();
-
   auto GetNext = [](auto *I) { return &*++I->getIterator(); };
 
   Function *F = M->getFunction("f");
@@ -389,7 +389,6 @@ TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) {
   EXPECT_TRUE(DVR1->getVariableLocationOp(0) == cast<Value>(A));
   // These users are outside Entry so should be changed.
   EXPECT_TRUE(DVR2->getVariableLocationOp(0) == cast<Value>(B));
-  UseNewDbgInfoFormat = OldDbgValueMode;
 }
 
 } // end anonymous namespace
diff --git a/llvm/unittests/IR/VerifierTest.cpp b/llvm/unittests/IR/VerifierTest.cpp
index c8db7fb7ab84..d79b4f3d8a44 100644
--- a/llvm/unittests/IR/VerifierTest.cpp
+++ b/llvm/unittests/IR/VerifierTest.cpp
@@ -173,27 +173,27 @@ TEST(VerifierTest, CrossModuleRef) {
   std::string Error;
   raw_string_ostream ErrorOS(Error);
   EXPECT_TRUE(verifyModule(M2, &ErrorOS));
-  EXPECT_TRUE(StringRef(ErrorOS.str())
-                  .equals("Global is referenced in a different module!\n"
-                          "ptr @foo2\n"
-                          "; ModuleID = 'M2'\n"
-                          "  %call = call i32 @foo2()\n"
-                          "ptr @foo1\n"
-                          "; ModuleID = 'M1'\n"
-                          "Global is used by function in a different module\n"
-                          "ptr @foo2\n"
-                          "; ModuleID = 'M2'\n"
-                          "ptr @foo3\n"
-                          "; ModuleID = 'M3'\n"));
+  EXPECT_TRUE(StringRef(ErrorOS.str()) ==
+              "Global is referenced in a different module!\n"
+              "ptr @foo2\n"
+              "; ModuleID = 'M2'\n"
+              "  %call = call i32 @foo2()\n"
+              "ptr @foo1\n"
+              "; ModuleID = 'M1'\n"
+              "Global is used by function in a different module\n"
+              "ptr @foo2\n"
+              "; ModuleID = 'M2'\n"
+              "ptr @foo3\n"
+              "; ModuleID = 'M3'\n");
 
   Error.clear();
   EXPECT_TRUE(verifyModule(M1, &ErrorOS));
-  EXPECT_TRUE(StringRef(ErrorOS.str()).equals(
-      "Referencing function in another module!\n"
-      "  %call = call i32 @foo2()\n"
-      "; ModuleID = 'M1'\n"
-      "ptr @foo2\n"
-      "; ModuleID = 'M2'\n"));
+  EXPECT_TRUE(StringRef(ErrorOS.str()) ==
+              "Referencing function in another module!\n"
+              "  %call = call i32 @foo2()\n"
+              "; ModuleID = 'M1'\n"
+              "ptr @foo2\n"
+              "; ModuleID = 'M2'\n");
 
   Error.clear();
   EXPECT_TRUE(verifyModule(M3, &ErrorOS));
diff --git a/llvm/unittests/MC/AMDGPU/CMakeLists.txt b/llvm/unittests/MC/AMDGPU/CMakeLists.txt
index 06ca89a72a7c..be8ff572e6f7 100644
--- a/llvm/unittests/MC/AMDGPU/CMakeLists.txt
+++ b/llvm/unittests/MC/AMDGPU/CMakeLists.txt
@@ -1,12 +1,20 @@
+include_directories(
+  ${PROJECT_SOURCE_DIR}/lib/Target/AMDGPU
+  ${PROJECT_BINARY_DIR}/lib/Target/AMDGPU
+  )
+
 set(LLVM_LINK_COMPONENTS
   AMDGPUCodeGen
   AMDGPUDesc
   AMDGPUInfo
+  CodeGen
+  Core
   MC
   Support
   TargetParser
   )
 
-add_llvm_unittest(AMDGPUDwarfTests
+add_llvm_unittest(AMDGPUMCTests
   DwarfRegMappings.cpp
+  SIProgramInfoMCExprs.cpp
   )
diff --git a/llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp b/llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp
new file mode 100644
index 000000000000..57828a728931
--- /dev/null
+++ b/llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp
@@ -0,0 +1,83 @@
+//===- llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUHSAMetadataStreamer.h"
+#include "SIProgramInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+class SIProgramInfoMCExprsTest : public testing::Test {
+protected:
+  std::unique_ptr<LLVMTargetMachine> TM;
+  std::unique_ptr<LLVMContext> Ctx;
+  std::unique_ptr<MachineModuleInfo> MMI;
+  std::unique_ptr<MachineFunction> MF;
+  std::unique_ptr<Module> M;
+
+  SIProgramInfo PI;
+
+  static void SetUpTestSuite() {
+    LLVMInitializeAMDGPUTargetInfo();
+    LLVMInitializeAMDGPUTarget();
+    LLVMInitializeAMDGPUTargetMC();
+  }
+
+  SIProgramInfoMCExprsTest() {
+    std::string Triple = "amdgcn-amd-amdhsa";
+    std::string CPU = "gfx1010";
+    std::string FS = "";
+
+    std::string Error;
+    const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
+    TargetOptions Options;
+
+    TM.reset(static_cast<LLVMTargetMachine *>(TheTarget->createTargetMachine(
+        Triple, CPU, FS, Options, std::nullopt, std::nullopt)));
+
+    Ctx = std::make_unique<LLVMContext>();
+    M = std::make_unique<Module>("Module", *Ctx);
+    M->setDataLayout(TM->createDataLayout());
+    auto *FType = FunctionType::get(Type::getVoidTy(*Ctx), false);
+    auto *F = Function::Create(FType, GlobalValue::ExternalLinkage, "Test", *M);
+    MMI = std::make_unique<MachineModuleInfo>(TM.get());
+
+    auto *ST = TM->getSubtargetImpl(*F);
+
+    MF = std::make_unique<MachineFunction>(*F, *TM, *ST, 1, *MMI);
+    MF->initTargetMachineFunctionInfo(*ST);
+    PI.reset(*MF.get());
+  }
+};
+
+TEST_F(SIProgramInfoMCExprsTest, TestDeathHSAKernelEmit) {
+  MCContext &Ctx = MF->getContext();
+  MCSymbol *Sym = Ctx.getOrCreateSymbol("Unknown");
+  PI.ScratchSize = MCSymbolRefExpr::create(Sym, Ctx);
+
+  auto &Func = MF->getFunction();
+  Func.setCallingConv(CallingConv::AMDGPU_KERNEL);
+  AMDGPU::HSAMD::MetadataStreamerMsgPackV4 MD;
+
+  testing::internal::CaptureStderr();
+  MD.emitKernel(*MF, PI);
+  std::string err = testing::internal::GetCapturedStderr();
+  EXPECT_EQ(
+      err, "<unknown>:0: error: could not resolve expression when required.\n");
+  EXPECT_TRUE(Ctx.hadError());
+}
diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp
index da640225617d..115de47a3cef 100644
--- a/llvm/unittests/Object/DXContainerTest.cpp
+++ b/llvm/unittests/Object/DXContainerTest.cpp
@@ -174,6 +174,49 @@ TEST(DXCFile, ParseEmptyParts) {
   }
 }
 
+// This test verify DXIL part are correctly parsed.
+// This test is based on the binary output constructed from this yaml.
+// --- !dxcontainer
+// Header:
+//   Hash:            [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+//                      0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
+//   Version:
+//     Major:           1
+//     Minor:           0
+//   PartCount:       1
+// Parts:
+//   - Name:            DXIL
+//     Size:            28
+//     Program:
+//       MajorVersion:    6
+//       MinorVersion:    5
+//       ShaderKind:      5
+//       Size:            8
+//       DXILMajorVersion: 1
+//       DXILMinorVersion: 5
+//       DXILSize:        4
+//       DXIL:            [ 0x42, 0x43, 0xC0, 0xDE, ]
+// ...
+TEST(DXCFile, ParseDXILPart) {
+  uint8_t Buffer[] = {
+      0x44, 0x58, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x48, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+      0x44, 0x58, 0x49, 0x4c, 0x1c, 0x00, 0x00, 0x00, 0x65, 0x00, 0x05, 0x00,
+      0x08, 0x00, 0x00, 0x00, 0x44, 0x58, 0x49, 0x4c, 0x05, 0x01, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x42, 0x43, 0xc0, 0xde};
+  DXContainer C =
+      llvm::cantFail(DXContainer::create(getMemoryBuffer<116>(Buffer)));
+  EXPECT_EQ(C.getHeader().PartCount, 1u);
+  const std::optional<object::DXContainer::DXILData> &DXIL = C.getDXIL();
+  EXPECT_TRUE(DXIL.has_value());
+  dxbc::ProgramHeader Header = DXIL->first;
+  EXPECT_EQ(Header.getMajorVersion(), 6u);
+  EXPECT_EQ(Header.getMinorVersion(), 5u);
+  EXPECT_EQ(Header.ShaderKind, 5u);
+  EXPECT_EQ(Header.Size, 8u);
+}
+
 static Expected<DXContainer>
 generateDXContainer(StringRef Yaml, SmallVectorImpl<char> &BinaryData) {
   DXContainerYAML::Object Obj;
diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp
index 402de64fe99b..924d848176e7 100644
--- a/llvm/unittests/ProfileData/InstrProfTest.cpp
+++ b/llvm/unittests/ProfileData/InstrProfTest.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
@@ -1730,6 +1732,34 @@ TEST(SymtabTest, instr_prof_symtab_module_test) {
   Function::Create(FTy, Function::WeakODRLinkage, "Wblah", M.get());
   Function::Create(FTy, Function::WeakODRLinkage, "Wbar", M.get());
 
+  // [ptr, ptr, ptr]
+  ArrayType *VTableArrayType = ArrayType::get(
+      PointerType::get(Ctx, M->getDataLayout().getDefaultGlobalsAddressSpace()),
+      3);
+  Constant *Int32TyNull =
+      llvm::ConstantExpr::getNullValue(PointerType::getUnqual(Ctx));
+  SmallVector<llvm::Type *, 1> tys = {VTableArrayType};
+  StructType *VTableType = llvm::StructType::get(Ctx, tys);
+
+  // Create two vtables in the module, one with external linkage and the other
+  // with local linkage.
+  for (auto [Name, Linkage] :
+       {std::pair{"ExternalGV", GlobalValue::ExternalLinkage},
+        {"LocalGV", GlobalValue::InternalLinkage}}) {
+    llvm::Twine FuncName(Name, StringRef("VFunc"));
+    Function *VFunc = Function::Create(FTy, Linkage, FuncName, M.get());
+    GlobalVariable *GV = new llvm::GlobalVariable(
+        *M, VTableType, /* isConstant= */ true, Linkage,
+        llvm::ConstantStruct::get(
+            VTableType,
+            {llvm::ConstantArray::get(VTableArrayType,
+                                      {Int32TyNull, Int32TyNull, VFunc})}),
+        Name);
+    // Add type metadata for the test data, since vtables with type metadata
+    // are added to symtab.
+    GV->addTypeMetadata(16, MDString::get(Ctx, Name));
+  }
+
   InstrProfSymtab ProfSymtab;
   EXPECT_THAT_ERROR(ProfSymtab.create(*M), Succeeded());
 
@@ -1751,6 +1781,22 @@ TEST(SymtabTest, instr_prof_symtab_module_test) {
     EXPECT_EQ(PGOName, PGOFuncName);
     EXPECT_THAT(PGOFuncName.str(), EndsWith(Funcs[I].str()));
   }
+
+  for (auto [VTableName, PGOName] : {std::pair{"ExternalGV", "ExternalGV"},
+                                     {"LocalGV", "MyModule.cpp;LocalGV"}}) {
+    GlobalVariable *GV =
+        M->getGlobalVariable(VTableName, /* AllowInternal=*/true);
+
+    // Test that ProfSymtab returns the expected name given a hash.
+    std::string IRPGOName = getPGOName(*GV);
+    EXPECT_STREQ(IRPGOName.c_str(), PGOName);
+    uint64_t GUID = IndexedInstrProf::ComputeHash(IRPGOName);
+    EXPECT_EQ(IRPGOName, ProfSymtab.getFuncOrVarName(GUID));
+    EXPECT_EQ(VTableName, getParsedIRPGOName(IRPGOName).second);
+
+    // Test that ProfSymtab returns the expected global variable
+    EXPECT_EQ(GV, ProfSymtab.getGlobalVariable(GUID));
+  }
 }
 
 // Testing symtab serialization and creator/deserialization interface
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index 40335d191ba7..8b97866e403f 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -1,3 +1,11 @@
+//===- unittests/Support/MemProfTest.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include "llvm/ProfileData/MemProf.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
diff --git a/llvm/unittests/Support/LEB128Test.cpp b/llvm/unittests/Support/LEB128Test.cpp
index 08b8c5573ce6..60f5ddd568ca 100644
--- a/llvm/unittests/Support/LEB128Test.cpp
+++ b/llvm/unittests/Support/LEB128Test.cpp
@@ -155,6 +155,12 @@ TEST(LEB128Test, DecodeInvalidULEB128) {
     EXPECT_NE(Error, nullptr);                                                 \
     EXPECT_EQ(0ul, Actual);                                                    \
     EXPECT_EQ(ERROR_OFFSET, ErrorOffset);                                      \
+    Value = reinterpret_cast<const uint8_t *>(VALUE);                          \
+    Error = nullptr;                                                           \
+    Actual = decodeULEB128AndInc(Value, Value + strlen(VALUE), &Error);        \
+    EXPECT_NE(Error, nullptr);                                                 \
+    EXPECT_EQ(0ul, Actual);                                                    \
+    EXPECT_EQ(ERROR_OFFSET, Value - reinterpret_cast<const uint8_t *>(VALUE)); \
   } while (0)
 
   // Buffer overflow.
@@ -224,6 +230,12 @@ TEST(LEB128Test, DecodeInvalidSLEB128) {
     EXPECT_NE(Error, nullptr);                                                 \
     EXPECT_EQ(0ul, Actual);                                                    \
     EXPECT_EQ(ERROR_OFFSET, ErrorOffset);                                      \
+    Value = reinterpret_cast<const uint8_t *>(VALUE);                          \
+    Error = nullptr;                                                           \
+    Actual = decodeSLEB128AndInc(Value, Value + strlen(VALUE), &Error);        \
+    EXPECT_NE(Error, nullptr);                                                 \
+    EXPECT_EQ(0ul, Actual);                                                    \
+    EXPECT_EQ(ERROR_OFFSET, Value - reinterpret_cast<const uint8_t *>(VALUE)); \
   } while (0)
 
   // Buffer overflow.
@@ -246,7 +258,7 @@ TEST(LEB128Test, DecodeAndInc) {
 #define EXPECT_LEB128(FUN, VALUE, SIZE)                                        \
   do {                                                                         \
     const uint8_t *V = reinterpret_cast<const uint8_t *>(VALUE), *P = V;       \
-    auto Expected = FUN(P), Actual = FUN##AndInc(P);                           \
+    auto Expected = FUN(P), Actual = FUN##AndInc(P, P + strlen(VALUE));        \
     EXPECT_EQ(Actual, Expected);                                               \
     EXPECT_EQ(P - V, SIZE);                                                    \
   } while (0)
@@ -255,6 +267,17 @@ TEST(LEB128Test, DecodeAndInc) {
   EXPECT_LEB128(decodeSLEB128, "\x7f", 1);
   EXPECT_LEB128(decodeSLEB128, "\x80\x01", 2);
 #undef EXPECT_LEB128
+
+#define EXPECT_LEB128(FUN, VALUE, SIZE)                                        \
+  do {                                                                         \
+    const uint8_t *V = reinterpret_cast<const uint8_t *>(VALUE), *P = V;       \
+    auto Expected = FUN(P), Actual = FUN##AndIncUnsafe(P);                     \
+    EXPECT_EQ(Actual, Expected);                                               \
+    EXPECT_EQ(P - V, SIZE);                                                    \
+  } while (0)
+  EXPECT_LEB128(decodeULEB128, "\x7f", 1);
+  EXPECT_LEB128(decodeULEB128, "\x80\x01", 2);
+#undef EXPECT_LEB128
 }
 
 TEST(LEB128Test, SLEB128Size) {
diff --git a/llvm/unittests/Support/MemoryBufferTest.cpp b/llvm/unittests/Support/MemoryBufferTest.cpp
index cfee3e477d2e..4815e65c968d 100644
--- a/llvm/unittests/Support/MemoryBufferTest.cpp
+++ b/llvm/unittests/Support/MemoryBufferTest.cpp
@@ -317,13 +317,13 @@ TEST_F(MemoryBufferTest, slice) {
   EXPECT_EQ(0x4000UL, MB.get()->getBufferSize());
  
   StringRef BufData = MB.get()->getBuffer();
-  EXPECT_TRUE(BufData.substr(0x0000,8).equals("12345678"));
-  EXPECT_TRUE(BufData.substr(0x0FF8,8).equals("12345678"));
-  EXPECT_TRUE(BufData.substr(0x1000,8).equals("abcdefgh"));
-  EXPECT_TRUE(BufData.substr(0x2FF8,8).equals("abcdefgh"));
-  EXPECT_TRUE(BufData.substr(0x3000,8).equals("ABCDEFGH"));
-  EXPECT_TRUE(BufData.substr(0x3FF8,8).equals("ABCDEFGH"));
-   
+  EXPECT_TRUE(BufData.substr(0x0000, 8) == "12345678");
+  EXPECT_TRUE(BufData.substr(0x0FF8, 8) == "12345678");
+  EXPECT_TRUE(BufData.substr(0x1000, 8) == "abcdefgh");
+  EXPECT_TRUE(BufData.substr(0x2FF8, 8) == "abcdefgh");
+  EXPECT_TRUE(BufData.substr(0x3000, 8) == "ABCDEFGH");
+  EXPECT_TRUE(BufData.substr(0x3FF8, 8) == "ABCDEFGH");
+
   // Try non-page aligned.
   ErrorOr<OwningBuffer> MB2 = MemoryBuffer::getFileSlice(TestPath.str(),
                                                          0x3000, 0x0800);
@@ -332,10 +332,10 @@ TEST_F(MemoryBufferTest, slice) {
   EXPECT_EQ(0x3000UL, MB2.get()->getBufferSize());
   
   StringRef BufData2 = MB2.get()->getBuffer();
-  EXPECT_TRUE(BufData2.substr(0x0000,8).equals("12345678"));
-  EXPECT_TRUE(BufData2.substr(0x17F8,8).equals("12345678"));
-  EXPECT_TRUE(BufData2.substr(0x1800,8).equals("abcdefgh"));
-  EXPECT_TRUE(BufData2.substr(0x2FF8,8).equals("abcdefgh"));
+  EXPECT_TRUE(BufData2.substr(0x0000, 8) == "12345678");
+  EXPECT_TRUE(BufData2.substr(0x17F8, 8) == "12345678");
+  EXPECT_TRUE(BufData2.substr(0x1800, 8) == "abcdefgh");
+  EXPECT_TRUE(BufData2.substr(0x2FF8, 8) == "abcdefgh");
 }
 
 TEST_F(MemoryBufferTest, writableSlice) {
diff --git a/llvm/unittests/Support/YAMLIOTest.cpp b/llvm/unittests/Support/YAMLIOTest.cpp
index 6ac0d1b412f0..9d40b62115a6 100644
--- a/llvm/unittests/Support/YAMLIOTest.cpp
+++ b/llvm/unittests/Support/YAMLIOTest.cpp
@@ -1389,10 +1389,10 @@ TEST(YAMLIO, TestReadWriteMyFlowSequence) {
     yin >> map2;
 
     EXPECT_FALSE(yin.error());
-    EXPECT_TRUE(map2.name.equals("hello"));
+    EXPECT_TRUE(map2.name == "hello");
     EXPECT_EQ(map2.strings.size(), 2UL);
-    EXPECT_TRUE(map2.strings[0].value.equals("one"));
-    EXPECT_TRUE(map2.strings[1].value.equals("two"));
+    EXPECT_TRUE(map2.strings[0].value == "one");
+    EXPECT_TRUE(map2.strings[1].value == "two");
     EXPECT_EQ(map2.single.size(), 1UL);
     EXPECT_EQ(1,       map2.single[0]);
     EXPECT_EQ(map2.numbers.size(), 3UL);
@@ -1436,7 +1436,7 @@ TEST(YAMLIO, TestReadWriteSequenceOfMyFlowSequence) {
     yin >> map2;
 
     EXPECT_FALSE(yin.error());
-    EXPECT_TRUE(map2.name.equals("hello"));
+    EXPECT_TRUE(map2.name == "hello");
     EXPECT_EQ(map2.sequenceOfNumbers.size(), 3UL);
     EXPECT_EQ(map2.sequenceOfNumbers[0].size(), 1UL);
     EXPECT_EQ(0,    map2.sequenceOfNumbers[0][0]);
diff --git a/llvm/unittests/TargetParser/CSKYTargetParserTest.cpp b/llvm/unittests/TargetParser/CSKYTargetParserTest.cpp
index f28a2a33eb90..50e825c5f99f 100644
--- a/llvm/unittests/TargetParser/CSKYTargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/CSKYTargetParserTest.cpp
@@ -1020,7 +1020,7 @@ TEST(TargetParserTest, testInvalidCSKYArch) {
 bool testCSKYArch(StringRef Arch, StringRef DefaultCPU) {
   CSKY::ArchKind AK = CSKY::parseArch(Arch);
   bool Result = (AK != CSKY::ArchKind::INVALID);
-  Result &= CSKY::getDefaultCPU(Arch).equals(DefaultCPU);
+  Result &= CSKY::getDefaultCPU(Arch) == DefaultCPU;
   return Result;
 }
 
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index 3aa0178100ab..7f2d1eb8c017 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -20,11 +20,12 @@ bool operator==(const RISCVISAUtils::ExtensionVersion &A,
   return A.Major == B.Major && A.Minor == B.Minor;
 }
 
-TEST(ParseNormalizedArchString, RejectsUpperCase) {
-  for (StringRef Input : {"RV32", "rV64", "rv32i2P0", "rv64i2p0_A2p0"}) {
+TEST(ParseNormalizedArchString, RejectsInvalidChars) {
+  for (StringRef Input :
+       {"RV32", "rV64", "rv32i2P0", "rv64i2p0_A2p0", "rv32e2.0"}) {
     EXPECT_EQ(
         toString(RISCVISAInfo::parseNormalizedArchString(Input).takeError()),
-        "string must be lowercase");
+        "string may only contain [a-z0-9_]");
   }
 }
 
@@ -37,12 +38,57 @@ TEST(ParseNormalizedArchString, RejectsInvalidBaseISA) {
 }
 
 TEST(ParseNormalizedArchString, RejectsMalformedInputs) {
-  for (StringRef Input : {"rv64i2p0_", "rv32i2p0__a2p0", "rv32e2.0", "rv64e2p",
-                          "rv32i", "rv64ip1"}) {
+  for (StringRef Input : {"rv64e2p", "rv32i", "rv64ip1"}) {
     EXPECT_EQ(
         toString(RISCVISAInfo::parseNormalizedArchString(Input).takeError()),
         "extension lacks version in expected format");
   }
+
+  for (StringRef Input : {"rv64i2p0_", "rv32i2p0__a2p0"}) {
+    EXPECT_EQ(
+        toString(RISCVISAInfo::parseNormalizedArchString(Input).takeError()),
+        "extension name missing after separator '_'");
+  }
+}
+
+TEST(ParseNormalizedArchString, RejectsOnlyVersion) {
+  for (StringRef Input : {"rv64i2p0_1p0", "rv32i2p0_1p0"}) {
+    EXPECT_EQ(
+        toString(RISCVISAInfo::parseNormalizedArchString(Input).takeError()),
+        "missing extension name");
+  }
+}
+
+TEST(ParseNormalizedArchString, RejectsBadZ) {
+  for (StringRef Input : {"rv64i2p0_z1p0", "rv32i2p0_z2a1p0"}) {
+    EXPECT_EQ(
+        toString(RISCVISAInfo::parseNormalizedArchString(Input).takeError()),
+        "'z' must be followed by a letter");
+  }
+}
+
+TEST(ParseNormalizedArchString, RejectsBadS) {
+  for (StringRef Input : {"rv64i2p0_s1p0", "rv32i2p0_s2a1p0"}) {
+    EXPECT_EQ(
+        toString(RISCVISAInfo::parseNormalizedArchString(Input).takeError()),
+        "'s' must be followed by a letter");
+  }
+}
+
+TEST(ParseNormalizedArchString, RejectsBadX) {
+  for (StringRef Input : {"rv64i2p0_x1p0", "rv32i2p0_x2a1p0"}) {
+    EXPECT_EQ(
+        toString(RISCVISAInfo::parseNormalizedArchString(Input).takeError()),
+        "'x' must be followed by a letter");
+  }
+}
+
+TEST(ParseNormalizedArchString, DuplicateExtension) {
+  for (StringRef Input : {"rv64i2p0_a2p0_a1p0"}) {
+    EXPECT_EQ(
+        toString(RISCVISAInfo::parseNormalizedArchString(Input).takeError()),
+        "duplicate extension 'a'");
+  }
 }
 
 TEST(ParseNormalizedArchString, AcceptsValidBaseISAsAndSetsXLen) {
@@ -106,12 +152,21 @@ TEST(ParseNormalizedArchString, UpdatesFLenMinVLenMaxELen) {
   EXPECT_EQ(Info.getFLen(), 64U);
   EXPECT_EQ(Info.getMinVLen(), 64U);
   EXPECT_EQ(Info.getMaxELen(), 64U);
+  EXPECT_EQ(Info.getMaxELenFp(), 64U);
 }
 
-TEST(ParseArchString, RejectsUpperCase) {
+TEST(ParseNormalizedArchString, AcceptsUnknownMultiletter) {
+  auto MaybeISAInfo = RISCVISAInfo::parseNormalizedArchString(
+      "rv64i2p0_f2p0_d2p0_zicsr2p0_ykk1p0");
+  ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
+  RISCVISAInfo &Info = **MaybeISAInfo;
+  EXPECT_EQ(Info.toString(), "rv64i2p0_f2p0_d2p0_zicsr2p0_ykk1p0");
+}
+
+TEST(ParseArchString, RejectsInvalidChars) {
   for (StringRef Input : {"RV32", "rV64", "rv32i2P0", "rv64i2p0_A2p0"}) {
     EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
-              "string must be lowercase");
+              "string may only contain [a-z0-9_]");
   }
 }
 
@@ -148,6 +203,9 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
   EXPECT_TRUE(ExtsRV32I.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
   EXPECT_EQ(InfoRV32I.getXLen(), 32U);
   EXPECT_EQ(InfoRV32I.getFLen(), 0U);
+  EXPECT_EQ(InfoRV32I.getMinVLen(), 0U);
+  EXPECT_EQ(InfoRV32I.getMaxELen(), 0U);
+  EXPECT_EQ(InfoRV32I.getMaxELenFp(), 0U);
 
   auto MaybeRV32E = RISCVISAInfo::parseArchString("rv32e", true);
   ASSERT_THAT_EXPECTED(MaybeRV32E, Succeeded());
@@ -157,6 +215,9 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
   EXPECT_TRUE(ExtsRV32E.at("e") == (RISCVISAUtils::ExtensionVersion{2, 0}));
   EXPECT_EQ(InfoRV32E.getXLen(), 32U);
   EXPECT_EQ(InfoRV32E.getFLen(), 0U);
+  EXPECT_EQ(InfoRV32E.getMinVLen(), 0U);
+  EXPECT_EQ(InfoRV32E.getMaxELen(), 0U);
+  EXPECT_EQ(InfoRV32E.getMaxELenFp(), 0U);
 
   auto MaybeRV32G = RISCVISAInfo::parseArchString("rv32g", true);
   ASSERT_THAT_EXPECTED(MaybeRV32G, Succeeded());
@@ -173,6 +234,9 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
               (RISCVISAUtils::ExtensionVersion{2, 0}));
   EXPECT_EQ(InfoRV32G.getXLen(), 32U);
   EXPECT_EQ(InfoRV32G.getFLen(), 64U);
+  EXPECT_EQ(InfoRV32G.getMinVLen(), 0U);
+  EXPECT_EQ(InfoRV32G.getMaxELen(), 0U);
+  EXPECT_EQ(InfoRV32G.getMaxELenFp(), 0U);
 
   auto MaybeRV64I = RISCVISAInfo::parseArchString("rv64i", true);
   ASSERT_THAT_EXPECTED(MaybeRV64I, Succeeded());
@@ -182,6 +246,9 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
   EXPECT_TRUE(ExtsRV64I.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
   EXPECT_EQ(InfoRV64I.getXLen(), 64U);
   EXPECT_EQ(InfoRV64I.getFLen(), 0U);
+  EXPECT_EQ(InfoRV64I.getMinVLen(), 0U);
+  EXPECT_EQ(InfoRV64I.getMaxELen(), 0U);
+  EXPECT_EQ(InfoRV64I.getMaxELenFp(), 0U);
 
   auto MaybeRV64E = RISCVISAInfo::parseArchString("rv64e", true);
   ASSERT_THAT_EXPECTED(MaybeRV64E, Succeeded());
@@ -191,6 +258,9 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
   EXPECT_TRUE(ExtsRV64E.at("e") == (RISCVISAUtils::ExtensionVersion{2, 0}));
   EXPECT_EQ(InfoRV64E.getXLen(), 64U);
   EXPECT_EQ(InfoRV64E.getFLen(), 0U);
+  EXPECT_EQ(InfoRV64E.getMinVLen(), 0U);
+  EXPECT_EQ(InfoRV64E.getMaxELen(), 0U);
+  EXPECT_EQ(InfoRV64E.getMaxELenFp(), 0U);
 
   auto MaybeRV64G = RISCVISAInfo::parseArchString("rv64g", true);
   ASSERT_THAT_EXPECTED(MaybeRV64G, Succeeded());
@@ -207,6 +277,38 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
               (RISCVISAUtils::ExtensionVersion{2, 0}));
   EXPECT_EQ(InfoRV64G.getXLen(), 64U);
   EXPECT_EQ(InfoRV64G.getFLen(), 64U);
+  EXPECT_EQ(InfoRV64G.getMinVLen(), 0U);
+  EXPECT_EQ(InfoRV64G.getMaxELen(), 0U);
+  EXPECT_EQ(InfoRV64G.getMaxELenFp(), 0U);
+
+  auto MaybeRV64GCV = RISCVISAInfo::parseArchString("rv64gcv", true);
+  ASSERT_THAT_EXPECTED(MaybeRV64GCV, Succeeded());
+  RISCVISAInfo &InfoRV64GCV = **MaybeRV64GCV;
+  const auto &ExtsRV64GCV = InfoRV64GCV.getExtensions();
+  EXPECT_EQ(ExtsRV64GCV.size(), 17UL);
+  EXPECT_TRUE(ExtsRV64GCV.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
+  EXPECT_TRUE(ExtsRV64GCV.at("m") == (RISCVISAUtils::ExtensionVersion{2, 0}));
+  EXPECT_TRUE(ExtsRV64GCV.at("a") == (RISCVISAUtils::ExtensionVersion{2, 1}));
+  EXPECT_TRUE(ExtsRV64GCV.at("f") == (RISCVISAUtils::ExtensionVersion{2, 2}));
+  EXPECT_TRUE(ExtsRV64GCV.at("d") == (RISCVISAUtils::ExtensionVersion{2, 2}));
+  EXPECT_TRUE(ExtsRV64GCV.at("c") == (RISCVISAUtils::ExtensionVersion{2, 0}));
+  EXPECT_TRUE(ExtsRV64GCV.at("zicsr") == (RISCVISAUtils::ExtensionVersion{2, 0}));
+  EXPECT_TRUE(ExtsRV64GCV.at("zifencei") ==
+              (RISCVISAUtils::ExtensionVersion{2, 0}));
+  EXPECT_TRUE(ExtsRV64GCV.at("v") == (RISCVISAUtils::ExtensionVersion{1, 0}));
+  EXPECT_TRUE(ExtsRV64GCV.at("zve32x") == (RISCVISAUtils::ExtensionVersion{1, 0}));
+  EXPECT_TRUE(ExtsRV64GCV.at("zve32f") == (RISCVISAUtils::ExtensionVersion{1, 0}));
+  EXPECT_TRUE(ExtsRV64GCV.at("zve64x") == (RISCVISAUtils::ExtensionVersion{1, 0}));
+  EXPECT_TRUE(ExtsRV64GCV.at("zve64f") == (RISCVISAUtils::ExtensionVersion{1, 0}));
+  EXPECT_TRUE(ExtsRV64GCV.at("zve64d") == (RISCVISAUtils::ExtensionVersion{1, 0}));
+  EXPECT_TRUE(ExtsRV64GCV.at("zvl32b") == (RISCVISAUtils::ExtensionVersion{1, 0}));
+  EXPECT_TRUE(ExtsRV64GCV.at("zvl64b") == (RISCVISAUtils::ExtensionVersion{1, 0}));
+  EXPECT_TRUE(ExtsRV64GCV.at("zvl128b") == (RISCVISAUtils::ExtensionVersion{1, 0}));
+  EXPECT_EQ(InfoRV64GCV.getXLen(), 64U);
+  EXPECT_EQ(InfoRV64GCV.getFLen(), 64U);
+  EXPECT_EQ(InfoRV64GCV.getMinVLen(), 128U);
+  EXPECT_EQ(InfoRV64GCV.getMaxELen(), 64U);
+  EXPECT_EQ(InfoRV64GCV.getMaxELenFp(), 64U);
 }
 
 TEST(ParseArchString, RejectsUnrecognizedExtensionNamesByDefault) {
@@ -844,6 +946,7 @@ R"(All available -march extensions for RISC-V
     shvstvecd            1.0
     smaia                1.0
     smepmp               1.0
+    smstateen            1.0
     ssaia                1.0
     ssccptr              1.0
     sscofpmf             1.0
@@ -906,6 +1009,19 @@ Experimental extensions
     ssqosid              1.0
     supm                 0.8
 
+Supported Profiles
+    rva20s64
+    rva20u64
+    rva22s64
+    rva22u64
+    rva23s64
+    rva23u64
+    rvb23s64
+    rvb23u64
+    rvi20u32
+    rvi20u64
+    rvm23u32
+
 Use -march to specify the target's extension.
 For example, clang -march=rv32i_v1p0)";
   // clang-format on
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index 816aea44a9bc..0455e061f0bf 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -572,8 +572,8 @@ bool testARMArch(StringRef Arch, StringRef DefaultCPU, StringRef SubArch,
                  unsigned ArchAttr) {
   ARM::ArchKind AK = ARM::parseArch(Arch);
   bool Result = (AK != ARM::ArchKind::INVALID);
-  Result &= ARM::getDefaultCPU(Arch).equals(DefaultCPU);
-  Result &= ARM::getSubArch(AK).equals(SubArch);
+  Result &= ARM::getDefaultCPU(Arch) == DefaultCPU;
+  Result &= ARM::getSubArch(AK) == SubArch;
   Result &= (ARM::getArchAttr(AK) == ArchAttr);
   return Result;
 }
@@ -641,8 +641,8 @@ TEST(TargetParserTest, testARMArch) {
                           ARMBuildAttrs::CPUArch::v9_A));
   EXPECT_TRUE(testARMArch("armv9.5-a", "generic", "v9.5a",
                           ARMBuildAttrs::CPUArch::v9_A));
-  EXPECT_TRUE(testARMArch("armv8-r", "cortex-r52", "v8r",
-                          ARMBuildAttrs::CPUArch::v8_R));
+  EXPECT_TRUE(
+      testARMArch("armv8-r", "generic", "v8r", ARMBuildAttrs::CPUArch::v8_R));
   EXPECT_TRUE(testARMArch("armv8-m.base", "generic", "v8m.base",
                           ARMBuildAttrs::CPUArch::v8_M_Base));
   EXPECT_TRUE(testARMArch("armv8-m.main", "generic", "v8m.main",
@@ -1989,19 +1989,19 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
       AArch64::AEK_SME,          AArch64::AEK_SMEF64F64,
       AArch64::AEK_SMEI16I64,    AArch64::AEK_SME2,
       AArch64::AEK_HBC,          AArch64::AEK_MOPS,
-      AArch64::AEK_PERFMON,      AArch64::AEK_SVE2p1,
-      AArch64::AEK_SME2p1,       AArch64::AEK_B16B16,
+      AArch64::AEK_PERFMON,      AArch64::AEK_SVE2P1,
+      AArch64::AEK_SME2P1,       AArch64::AEK_B16B16,
       AArch64::AEK_SMEF16F16,    AArch64::AEK_CSSC,
       AArch64::AEK_RCPC3,        AArch64::AEK_THE,
       AArch64::AEK_D128,         AArch64::AEK_LSE128,
-      AArch64::AEK_SPECRES2,     AArch64::AEK_RASv2,
+      AArch64::AEK_SPECRES2,     AArch64::AEK_RASV2,
       AArch64::AEK_ITE,          AArch64::AEK_GCS,
       AArch64::AEK_FPMR,         AArch64::AEK_FP8,
       AArch64::AEK_FAMINMAX,     AArch64::AEK_FP8FMA,
       AArch64::AEK_SSVE_FP8FMA,  AArch64::AEK_FP8DOT2,
       AArch64::AEK_SSVE_FP8DOT2, AArch64::AEK_FP8DOT4,
       AArch64::AEK_SSVE_FP8DOT4, AArch64::AEK_LUT,
-      AArch64::AEK_SME_LUTv2,    AArch64::AEK_SMEF8F16,
+      AArch64::AEK_SME_LUTV2,    AArch64::AEK_SMEF8F16,
       AArch64::AEK_SMEF8F32,     AArch64::AEK_SMEFA64,
       AArch64::AEK_CPA,          AArch64::AEK_PAUTHLR,
       AArch64::AEK_TLBIW,        AArch64::AEK_JSCVT,
diff --git a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp
index b8f5fbd87407..f93dc3671197 100644
--- a/llvm/unittests/TargetParser/TripleTest.cpp
+++ b/llvm/unittests/TargetParser/TripleTest.cpp
@@ -437,6 +437,85 @@ TEST(TripleTest, ParsedIDs) {
   EXPECT_EQ(VersionTuple(1, 3), T.getVulkanVersion());
   EXPECT_EQ(Triple::Compute, T.getEnvironment());
 
+  T = Triple("dxilv1.0--shadermodel6.0-pixel");
+  EXPECT_EQ(Triple::dxil, T.getArch());
+  EXPECT_EQ(Triple::DXILSubArch_v1_0, T.getSubArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::ShaderModel, T.getOS());
+  EXPECT_EQ(VersionTuple(1, 0), T.getDXILVersion());
+  EXPECT_EQ(Triple::Pixel, T.getEnvironment());
+
+  T = Triple("dxilv1.1--shadermodel6.1-vertex");
+  EXPECT_EQ(Triple::dxil, T.getArch());
+  EXPECT_EQ(Triple::DXILSubArch_v1_1, T.getSubArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::ShaderModel, T.getOS());
+  EXPECT_EQ(VersionTuple(1, 1), T.getDXILVersion());
+  EXPECT_EQ(Triple::Vertex, T.getEnvironment());
+
+  T = Triple("dxilv1.2--shadermodel6.2-geometry");
+  EXPECT_EQ(Triple::dxil, T.getArch());
+  EXPECT_EQ(Triple::DXILSubArch_v1_2, T.getSubArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::ShaderModel, T.getOS());
+  EXPECT_EQ(VersionTuple(1, 2), T.getDXILVersion());
+  EXPECT_EQ(Triple::Geometry, T.getEnvironment());
+
+  T = Triple("dxilv1.3--shadermodel6.3-library");
+  EXPECT_EQ(Triple::dxil, T.getArch());
+  EXPECT_EQ(Triple::DXILSubArch_v1_3, T.getSubArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::ShaderModel, T.getOS());
+  EXPECT_EQ(VersionTuple(1, 3), T.getDXILVersion());
+  EXPECT_EQ(Triple::Library, T.getEnvironment());
+
+  T = Triple("dxilv1.4--shadermodel6.4-hull");
+  EXPECT_EQ(Triple::dxil, T.getArch());
+  EXPECT_EQ(Triple::DXILSubArch_v1_4, T.getSubArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::ShaderModel, T.getOS());
+  EXPECT_EQ(VersionTuple(1, 4), T.getDXILVersion());
+  EXPECT_EQ(Triple::Hull, T.getEnvironment());
+
+  T = Triple("dxilv1.5--shadermodel6.5-domain");
+  EXPECT_EQ(Triple::dxil, T.getArch());
+  EXPECT_EQ(Triple::DXILSubArch_v1_5, T.getSubArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::ShaderModel, T.getOS());
+  EXPECT_EQ(VersionTuple(1, 5), T.getDXILVersion());
+  EXPECT_EQ(Triple::Domain, T.getEnvironment());
+
+  T = Triple("dxilv1.6--shadermodel6.6-compute");
+  EXPECT_EQ(Triple::dxil, T.getArch());
+  EXPECT_EQ(Triple::DXILSubArch_v1_6, T.getSubArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::ShaderModel, T.getOS());
+  EXPECT_EQ(VersionTuple(1, 6), T.getDXILVersion());
+  EXPECT_EQ(Triple::Compute, T.getEnvironment());
+
+  T = Triple("dxilv1.7-unknown-shadermodel6.7-mesh");
+  EXPECT_EQ(Triple::dxil, T.getArch());
+  EXPECT_EQ(Triple::DXILSubArch_v1_7, T.getSubArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::ShaderModel, T.getOS());
+  EXPECT_EQ(VersionTuple(1, 7), T.getDXILVersion());
+  EXPECT_EQ(Triple::Mesh, T.getEnvironment());
+
+  T = Triple("dxilv1.8-unknown-shadermodel6.8-amplification");
+  EXPECT_EQ(Triple::dxil, T.getArch());
+  EXPECT_EQ(Triple::DXILSubArch_v1_8, T.getSubArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::ShaderModel, T.getOS());
+  EXPECT_EQ(VersionTuple(1, 8), T.getDXILVersion());
+  EXPECT_EQ(Triple::Amplification, T.getEnvironment());
+
+  T = Triple("dxilv1.8-unknown-shadermodel6.15-library");
+  EXPECT_EQ(Triple::dxil, T.getArch());
+  EXPECT_EQ(Triple::DXILSubArch_v1_8, T.getSubArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::ShaderModel, T.getOS());
+  EXPECT_EQ(VersionTuple(1, 8), T.getDXILVersion());
+
   T = Triple("x86_64-unknown-fuchsia");
   EXPECT_EQ(Triple::x86_64, T.getArch());
   EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
@@ -2454,4 +2533,20 @@ TEST(TripleTest, isArmMClass) {
     EXPECT_TRUE(T.isArmMClass());
   }
 }
+
+TEST(TripleTest, DXILNormaizeWithVersion) {
+  EXPECT_EQ("dxilv1.0-unknown-shadermodel6.0",
+            Triple::normalize("dxilv1.0--shadermodel6.0"));
+  EXPECT_EQ("dxilv1.0-unknown-shadermodel6.0",
+            Triple::normalize("dxil--shadermodel6.0"));
+  EXPECT_EQ("dxilv1.1-unknown-shadermodel6.1-library",
+            Triple::normalize("dxil-shadermodel6.1-unknown-library"));
+  EXPECT_EQ("dxilv1.8-unknown-shadermodel6.x-unknown",
+            Triple::normalize("dxil-unknown-shadermodel6.x-unknown"));
+  EXPECT_EQ("dxilv1.8-unknown-shadermodel6.x-unknown",
+            Triple::normalize("dxil-unknown-shadermodel6.x-unknown"));
+  EXPECT_EQ("dxil-unknown-unknown-unknown", Triple::normalize("dxil---"));
+  EXPECT_EQ("dxilv1.0-pc-shadermodel5.0-compute",
+            Triple::normalize("dxil-shadermodel5.0-pc-compute"));
+}
 } // end anonymous namespace
diff --git a/llvm/unittests/Transforms/Utils/CloningTest.cpp b/llvm/unittests/Transforms/Utils/CloningTest.cpp
index 025771f07ce5..1d0d56a2099c 100644
--- a/llvm/unittests/Transforms/Utils/CloningTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CloningTest.cpp
@@ -844,8 +844,9 @@ TEST(CloneFunction, CloneFunctionWithInlinedSubprograms) {
   EXPECT_FALSE(verifyModule(*ImplModule, &errs()));
 
   // Check that DILexicalBlock of inlined function was not cloned.
-  auto DbgDeclareI = Func->begin()->begin();
-  auto ClonedDbgDeclareI = ClonedFunc->begin()->begin();
+  auto DbgDeclareI = Func->begin()->begin()->getDbgRecordRange().begin();
+  auto ClonedDbgDeclareI =
+      ClonedFunc->begin()->begin()->getDbgRecordRange().begin();
   const DebugLoc &DbgLoc = DbgDeclareI->getDebugLoc();
   const DebugLoc &ClonedDbgLoc = ClonedDbgDeclareI->getDebugLoc();
   EXPECT_NE(DbgLoc.get(), ClonedDbgLoc.get());
@@ -1121,4 +1122,41 @@ TEST_F(CloneModule, IFunc) {
   EXPECT_EQ("resolver", Resolver->getName());
   EXPECT_EQ(GlobalValue::PrivateLinkage, Resolver->getLinkage());
 }
+
+TEST_F(CloneModule, CloneDbgLabel) {
+  LLVMContext Context;
+
+  std::unique_ptr<Module> M = parseIR(Context,
+                                      R"M(
+define void @noop(ptr nocapture noundef writeonly align 4 %dst) local_unnamed_addr !dbg !3 {
+entry:
+  %call = tail call spir_func i64 @foo(i32 noundef 0)
+    #dbg_label(!11, !12)
+  store i64 %call, ptr %dst, align 4
+  ret void
+}
+
+declare i64 @foo(i32 noundef) local_unnamed_addr
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "<stdin>", directory: "foo")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "noop", scope: !4, file: !4, line: 17, type: !5, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !9)
+!4 = !DIFile(filename: "file", directory: "foo")
+!5 = !DISubroutineType(types: !6)
+!6 = !{null, !7}
+!7 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64)
+!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!9 = !{}
+!11 = !DILabel(scope: !3, name: "foo", file: !4, line: 23)
+!12 = !DILocation(line: 23, scope: !3)
+)M");
+
+  ASSERT_FALSE(verifyModule(*M, &errs()));
+  auto NewM = llvm::CloneModule(*M);
+  EXPECT_FALSE(verifyModule(*NewM, &errs()));
 }
+} // namespace
diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp
index a0119ed5159d..6052e58b697d 100644
--- a/llvm/unittests/Transforms/Utils/LocalTest.cpp
+++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/PostDominators.h"
@@ -26,6 +27,27 @@
 
 using namespace llvm;
 
+extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
+extern cl::opt<cl::boolOrDefault> PreserveInputDbgFormat;
+extern bool WriteNewDbgInfoFormatToBitcode;
+extern cl::opt<bool> WriteNewDbgInfoFormat;
+
+// Backup all of the existing settings that may be modified when
+// PreserveInputDbgFormat=true, so that when the test is finished we return them
+// (and the "preserve" setting) to their original values.
+static auto SaveDbgInfoFormat() {
+  return make_scope_exit(
+      [OldPreserveInputDbgFormat = PreserveInputDbgFormat.getValue(),
+       OldUseNewDbgInfoFormat = UseNewDbgInfoFormat.getValue(),
+       OldWriteNewDbgInfoFormatToBitcode = WriteNewDbgInfoFormatToBitcode,
+       OldWriteNewDbgInfoFormat = WriteNewDbgInfoFormat.getValue()] {
+        PreserveInputDbgFormat = OldPreserveInputDbgFormat;
+        UseNewDbgInfoFormat = OldUseNewDbgInfoFormat;
+        WriteNewDbgInfoFormatToBitcode = OldWriteNewDbgInfoFormatToBitcode;
+        WriteNewDbgInfoFormat = OldWriteNewDbgInfoFormat;
+      });
+}
+
 TEST(Local, RecursivelyDeleteDeadPHINodes) {
   LLVMContext C;
 
@@ -116,7 +138,6 @@ static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
 
 TEST(Local, ReplaceDbgDeclare) {
   LLVMContext C;
-
   // Original C source to get debug info for a local variable:
   // void f() { int x; }
   std::unique_ptr<Module> M = parseIR(C,
@@ -124,11 +145,11 @@ TEST(Local, ReplaceDbgDeclare) {
       define void @f() !dbg !8 {
       entry:
         %x = alloca i32, align 4
-        call void @llvm.dbg.declare(metadata i32* %x, metadata !11, metadata !DIExpression()), !dbg !13
-        call void @llvm.dbg.declare(metadata i32* %x, metadata !11, metadata !DIExpression()), !dbg !13
+          #dbg_declare(ptr %x, !11, !DIExpression(), !13)
+          #dbg_declare(ptr %x, !11, !DIExpression(), !13)
         ret void, !dbg !14
       }
-      declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
       !llvm.dbg.cu = !{!0}
       !llvm.module.flags = !{!3, !4}
       !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
@@ -151,20 +172,18 @@ TEST(Local, ReplaceDbgDeclare) {
   Instruction *Inst = &F->front().front();
   auto *AI = dyn_cast<AllocaInst>(Inst);
   ASSERT_TRUE(AI);
-  Inst = Inst->getNextNode()->getNextNode();
-  ASSERT_TRUE(Inst);
-  auto *DII = dyn_cast<DbgDeclareInst>(Inst);
-  ASSERT_TRUE(DII);
+
   Value *NewBase = Constant::getNullValue(PointerType::getUnqual(C));
   DIBuilder DIB(*M);
   replaceDbgDeclare(AI, NewBase, DIB, DIExpression::ApplyOffset, 0);
 
-  // There should be exactly two dbg.declares.
-  int Declares = 0;
-  for (const Instruction &I : F->front())
-    if (isa<DbgDeclareInst>(I))
-      Declares++;
-  EXPECT_EQ(2, Declares);
+  // There should be exactly two dbg.declares, attached to the terminator.
+  Inst = F->front().getTerminator();
+  ASSERT_TRUE(Inst);
+  EXPECT_TRUE(Inst->hasDbgRecords());
+  EXPECT_EQ(range_size(Inst->getDbgRecordRange()), 2u);
+  for (DbgVariableRecord &DVR : filterDbgVars(Inst->getDbgRecordRange()))
+    EXPECT_EQ(DVR.getAddress(), NewBase);
 }
 
 /// Build the dominator tree for the function and run the Test.
@@ -499,11 +518,10 @@ struct SalvageDebugInfoTest : ::testing::Test {
       entry:
         %x = add i32 0, 1
         %y = add i32 %x, 2
-        call void @llvm.dbg.value(metadata i32 %x, metadata !11, metadata !DIExpression()), !dbg !13
-        call void @llvm.dbg.value(metadata i32 %y, metadata !11, metadata !DIExpression()), !dbg !13
+          #dbg_value(i32 %x, !11, !DIExpression(), !13)
+          #dbg_value(i32 %y, !11, !DIExpression(), !13)
         ret void, !dbg !14
       }
-      declare void @llvm.dbg.value(metadata, metadata, metadata)
       !llvm.dbg.cu = !{!0}
       !llvm.module.flags = !{!3, !4}
       !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
@@ -526,49 +544,48 @@ struct SalvageDebugInfoTest : ::testing::Test {
     ASSERT_TRUE(F);
   }
 
-  bool doesDebugValueDescribeX(const DbgValueInst &DI) {
-    if (DI.getNumVariableLocationOps() != 1u)
+  bool doesDebugValueDescribeX(const DbgVariableRecord &DVR) {
+    if (DVR.getNumVariableLocationOps() != 1u)
       return false;
-    const auto &CI = *cast<ConstantInt>(DI.getValue(0));
+    const auto &CI = *cast<ConstantInt>(DVR.getValue(0));
     if (CI.isZero())
-      return DI.getExpression()->getElements().equals(
+      return DVR.getExpression()->getElements().equals(
           {dwarf::DW_OP_plus_uconst, 1, dwarf::DW_OP_stack_value});
     else if (CI.isOneValue())
-      return DI.getExpression()->getElements().empty();
+      return DVR.getExpression()->getElements().empty();
     return false;
   }
 
-  bool doesDebugValueDescribeY(const DbgValueInst &DI) {
-    if (DI.getNumVariableLocationOps() != 1u)
+  bool doesDebugValueDescribeY(const DbgVariableRecord &DVR) {
+    if (DVR.getNumVariableLocationOps() != 1u)
       return false;
-    const auto &CI = *cast<ConstantInt>(DI.getVariableLocationOp(0));
+    const auto &CI = *cast<ConstantInt>(DVR.getVariableLocationOp(0));
     if (CI.isZero())
-      return DI.getExpression()->getElements().equals(
+      return DVR.getExpression()->getElements().equals(
           {dwarf::DW_OP_plus_uconst, 1, dwarf::DW_OP_plus_uconst, 2,
            dwarf::DW_OP_stack_value});
     else if (CI.isOneValue())
-      return DI.getExpression()->getElements().equals(
+      return DVR.getExpression()->getElements().equals(
           {dwarf::DW_OP_plus_uconst, 2, dwarf::DW_OP_stack_value});
     return false;
   }
 
   void verifyDebugValuesAreSalvaged() {
+    // The function should only contain debug values and a terminator.
+    EXPECT_EQ(F->size(), 1u);
+    EXPECT_TRUE(F->begin()->begin()->isTerminator());
+
     // Check that the debug values for %x and %y are preserved.
     bool FoundX = false;
     bool FoundY = false;
-    for (const Instruction &I : F->front()) {
-      auto DI = dyn_cast<DbgValueInst>(&I);
-      if (!DI) {
-        // The function should only contain debug values and a terminator.
-        ASSERT_TRUE(I.isTerminator());
-        continue;
-      }
-      EXPECT_EQ(DI->getVariable()->getName(), "x");
-      FoundX |= doesDebugValueDescribeX(*DI);
-      FoundY |= doesDebugValueDescribeY(*DI);
+    for (DbgVariableRecord &DVR :
+         filterDbgVars(F->begin()->begin()->getDbgRecordRange())) {
+      EXPECT_EQ(DVR.getVariable()->getName(), "x");
+      FoundX |= doesDebugValueDescribeX(DVR);
+      FoundY |= doesDebugValueDescribeY(DVR);
     }
-    ASSERT_TRUE(FoundX);
-    ASSERT_TRUE(FoundY);
+    EXPECT_TRUE(FoundX);
+    EXPECT_TRUE(FoundY);
   }
 };
 
@@ -591,6 +608,12 @@ TEST_F(SalvageDebugInfoTest, RecursiveBlockSimplification) {
 
 TEST(Local, wouldInstructionBeTriviallyDead) {
   LLVMContext Ctx;
+  // FIXME: PreserveInputDbgFormat is set to true because this test has
+  // been written to expect debug intrinsics rather than debug records.
+  // TODO: This test doesn't have a DbgRecord equivalent form so delete
+  // it when debug intrinsics are removed.
+  auto SettingGuard = SaveDbgInfoFormat();
+  PreserveInputDbgFormat = cl::boolOrDefault::BOU_TRUE;
   std::unique_ptr<Module> M = parseIR(Ctx,
                                       R"(
     define dso_local void @fun() local_unnamed_addr #0 !dbg !9 {
@@ -684,12 +707,10 @@ TEST(Local, FindDbgUsers) {
                                       R"(
   define dso_local void @fun(ptr %a) #0 !dbg !11 {
   entry:
-    call void @llvm.dbg.assign(metadata ptr %a, metadata !16, metadata !DIExpression(), metadata !15, metadata ptr %a, metadata !DIExpression()), !dbg !19
+      #dbg_assign(ptr %a, !16, !DIExpression(), !15, ptr %a, !DIExpression(), !19)
     ret void
   }
 
-  declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
-
   !llvm.dbg.cu = !{!0}
   !llvm.module.flags = !{!2, !3, !9}
   !llvm.ident = !{!10}
@@ -716,9 +737,13 @@ TEST(Local, FindDbgUsers) {
   verifyModule(*M, &errs(), &BrokenDebugInfo);
   ASSERT_FALSE(BrokenDebugInfo);
 
+  // Convert to debug intrinsics as we want to test findDbgUsers and
+  // findDbgValue's debug-intrinsic-finding code here.
+  // TODO: Remove this test when debug intrinsics are removed.
+  M->convertFromNewDbgValues();
+
   Function &Fun = *cast<Function>(M->getNamedValue("fun"));
   Value *Arg = Fun.getArg(0);
-
   SmallVector<DbgVariableIntrinsic *> Users;
   // Arg (%a) is used twice by a single dbg.assign. Check findDbgUsers returns
   // only 1 pointer to it rather than 2.
@@ -739,7 +764,7 @@ TEST(Local, FindDbgRecords) {
                                       R"(
   define dso_local void @fun(ptr %a) #0 !dbg !11 {
   entry:
-    call void @llvm.dbg.assign(metadata ptr %a, metadata !16, metadata !DIExpression(), metadata !15, metadata ptr %a, metadata !DIExpression()), !dbg !19
+      #dbg_assign(ptr %a, !16, !DIExpression(), !15, ptr %a, !DIExpression(), !19)
     ret void
   }
 
@@ -768,9 +793,6 @@ TEST(Local, FindDbgRecords) {
   bool BrokenDebugInfo = true;
   verifyModule(*M, &errs(), &BrokenDebugInfo);
   ASSERT_FALSE(BrokenDebugInfo);
-  bool NewDbgInfoFormat = UseNewDbgInfoFormat;
-  UseNewDbgInfoFormat = true;
-  M->convertToNewDbgValues();
 
   Function &Fun = *cast<Function>(M->getNamedValue("fun"));
   Value *Arg = Fun.getArg(0);
@@ -790,12 +812,10 @@ TEST(Local, FindDbgRecords) {
   findDbgValues(Vals, Arg, &Records);
   EXPECT_EQ(Vals.size(), 0u);
   EXPECT_EQ(Records.size(), 1u);
-  UseNewDbgInfoFormat = NewDbgInfoFormat;
 }
 
 TEST(Local, ReplaceAllDbgUsesWith) {
   using namespace llvm::dwarf;
-
   LLVMContext Ctx;
 
   // Note: The datalayout simulates Darwin/x86_64.
@@ -808,39 +828,36 @@ TEST(Local, ReplaceAllDbgUsesWith) {
     define void @f() !dbg !6 {
     entry:
       %a = add i32 0, 1, !dbg !15
-      call void @llvm.dbg.value(metadata i32 %a, metadata !9, metadata !DIExpression()), !dbg !15
 
+        #dbg_value(i32 %a, !9, !DIExpression(), !15)
       %b = add i64 0, 1, !dbg !16
-      call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression()), !dbg !16
-      call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul)), !dbg !16
-      call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value)), !dbg !16
-      call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_LLVM_fragment, 0, 8)), !dbg !16
-      call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_LLVM_fragment, 0, 8)), !dbg !16
-      call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8)), !dbg !16
 
-      %c = inttoptr i64 0 to i64*, !dbg !17
-      call void @llvm.dbg.declare(metadata i64* %c, metadata !13, metadata !DIExpression()), !dbg !17
+        #dbg_value(i64 %b, !11, !DIExpression(), !16)
+        #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul), !16)
+        #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value), !16)
+        #dbg_value(i64 %b, !11, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !16)
+        #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_LLVM_fragment, 0, 8), !16)
+        #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8), !16)
+      %c = inttoptr i64 0 to ptr, !dbg !17
 
-      %d = inttoptr i64 0 to i32*, !dbg !18
-      call void @llvm.dbg.declare(metadata i32* %d, metadata !20, metadata !DIExpression()), !dbg !18
+        #dbg_declare(ptr %c, !13, !DIExpression(), !17)
+      %d = inttoptr i64 0 to ptr, !dbg !18
 
+        #dbg_declare(ptr %d,  !20,  !DIExpression(), !18)
       %e = add <2 x i16> zeroinitializer, zeroinitializer
-      call void @llvm.dbg.value(metadata <2 x i16> %e, metadata !14, metadata !DIExpression()), !dbg !18
 
+        #dbg_value(<2 x i16> %e, !14, !DIExpression(), !18)
       %f = call i32 @escape(i32 0)
-      call void @llvm.dbg.value(metadata i32 %f, metadata !9, metadata !DIExpression()), !dbg !15
 
+        #dbg_value(i32 %f, !9, !DIExpression(), !15)
       %barrier = call i32 @escape(i32 0)
 
       %g = call i32 @escape(i32 %f)
-      call void @llvm.dbg.value(metadata i32 %g, metadata !9, metadata !DIExpression()), !dbg !15
 
+        #dbg_value(i32 %g, !9, !DIExpression(), !15)
       ret void, !dbg !19
     }
 
-    declare void @llvm.dbg.declare(metadata, metadata, metadata)
-    declare void @llvm.dbg.value(metadata, metadata, metadata)
-
     !llvm.dbg.cu = !{!0}
     !llvm.module.flags = !{!5}
 
@@ -895,38 +912,47 @@ TEST(Local, ReplaceAllDbgUsesWith) {
   EXPECT_TRUE(replaceAllDbgUsesWith(D, C, C, DT));
 
   SmallVector<DbgVariableIntrinsic *, 2> CDbgVals;
-  findDbgUsers(CDbgVals, &C);
-  EXPECT_EQ(2U, CDbgVals.size());
-  EXPECT_TRUE(all_of(CDbgVals, [](DbgVariableIntrinsic *DII) {
-    return isa<DbgDeclareInst>(DII);
-  }));
+  SmallVector<DbgVariableRecord *, 2> CDbgRecords;
+  findDbgUsers(CDbgVals, &C, &CDbgRecords);
+  EXPECT_EQ(0U, CDbgVals.size());
+  EXPECT_EQ(2U, CDbgRecords.size());
+  EXPECT_TRUE(all_of(
+      CDbgRecords, [](DbgVariableRecord *DVR) { return DVR->isDbgDeclare(); }));
 
   EXPECT_TRUE(replaceAllDbgUsesWith(C, D, D, DT));
 
   SmallVector<DbgVariableIntrinsic *, 2> DDbgVals;
-  findDbgUsers(DDbgVals, &D);
-  EXPECT_EQ(2U, DDbgVals.size());
-  EXPECT_TRUE(all_of(DDbgVals, [](DbgVariableIntrinsic *DII) {
-    return isa<DbgDeclareInst>(DII);
-  }));
+  SmallVector<DbgVariableRecord *, 2> DDbgRecords;
+  findDbgUsers(DDbgVals, &D, &DDbgRecords);
+  EXPECT_EQ(0U, DDbgVals.size());
+  EXPECT_EQ(2U, DDbgRecords.size());
+  EXPECT_TRUE(all_of(
+      DDbgRecords, [](DbgVariableRecord *DVR) { return DVR->isDbgDeclare(); }));
 
   // Introduce a use-before-def. Check that the dbg.value for %a is salvaged.
   EXPECT_TRUE(replaceAllDbgUsesWith(A, F_, F_, DT));
 
-  auto *ADbgVal = cast<DbgValueInst>(A.getNextNode());
-  EXPECT_EQ(ADbgVal->getNumVariableLocationOps(), 1u);
-  EXPECT_EQ(ConstantInt::get(A.getType(), 0), ADbgVal->getVariableLocationOp(0));
+  EXPECT_FALSE(A.hasDbgRecords());
+  EXPECT_TRUE(B.hasDbgRecords());
+  DbgVariableRecord *BDbgVal =
+      cast<DbgVariableRecord>(&*B.getDbgRecordRange().begin());
+  EXPECT_EQ(BDbgVal->getNumVariableLocationOps(), 1u);
+  EXPECT_EQ(ConstantInt::get(A.getType(), 0),
+            BDbgVal->getVariableLocationOp(0));
 
   // Introduce a use-before-def. Check that the dbg.values for %f become undef.
   EXPECT_TRUE(replaceAllDbgUsesWith(F_, G, G, DT));
 
-  auto *FDbgVal = cast<DbgValueInst>(F_.getNextNode());
-  EXPECT_EQ(FDbgVal->getNumVariableLocationOps(), 1u);
-  EXPECT_TRUE(FDbgVal->isKillLocation());
+  DbgVariableRecord *BarrierDbgVal =
+      cast<DbgVariableRecord>(&*Barrier.getDbgRecordRange().begin());
+  EXPECT_EQ(BarrierDbgVal->getNumVariableLocationOps(), 1u);
+  EXPECT_TRUE(BarrierDbgVal->isKillLocation());
 
-  SmallVector<DbgValueInst *, 1> FDbgVals;
-  findDbgValues(FDbgVals, &F_);
-  EXPECT_EQ(0U, FDbgVals.size());
+  SmallVector<DbgValueInst *, 1> BarrierDbgVals;
+  SmallVector<DbgVariableRecord *, 8> BarrierDbgRecs;
+  findDbgValues(BarrierDbgVals, &F_, &BarrierDbgRecs);
+  EXPECT_EQ(0U, BarrierDbgVals.size());
+  EXPECT_EQ(0U, BarrierDbgRecs.size());
 
   // Simulate i32 -> i64 conversion to test sign-extension. Here are some
   // interesting cases to handle:
@@ -936,13 +962,15 @@ TEST(Local, ReplaceAllDbgUsesWith) {
   //  4-6) like (1-3), but with a fragment
   EXPECT_TRUE(replaceAllDbgUsesWith(B, A, A, DT));
 
-  SmallVector<DbgValueInst *, 8> ADbgVals;
-  findDbgValues(ADbgVals, &A);
-  EXPECT_EQ(6U, ADbgVals.size());
+  SmallVector<DbgValueInst *, 8> BDbgVals;
+  SmallVector<DbgVariableRecord *, 8> BDbgRecs;
+  findDbgValues(BDbgVals, &A, &BDbgRecs);
+  EXPECT_EQ(0U, BDbgVals.size());
+  EXPECT_EQ(6U, BDbgRecs.size());
 
   // Check that %a has a dbg.value with a DIExpression matching \p Ops.
   auto hasADbgVal = [&](ArrayRef<uint64_t> Ops) {
-    return any_of(ADbgVals, [&](DbgValueInst *DVI) {
+    return any_of(BDbgRecs, [&](DbgVariableRecord *DVI) {
       assert(DVI->getVariable()->getName() == "2");
       return DVI->getExpression()->getElements() == Ops;
     });
@@ -1149,7 +1177,7 @@ TEST(Local, SimplifyCFGWithNullAC) {
   // Obtain BasicBlock of interest to this test, %test.bb.
   BasicBlock *TestBB = nullptr;
   for (BasicBlock &BB : F) {
-    if (BB.getName().equals("test.bb")) {
+    if (BB.getName() == "test.bb") {
       TestBB = &BB;
       break;
     }
@@ -1345,6 +1373,11 @@ TEST(Local, ExpressionForConstant) {
 
 TEST(Local, ReplaceDbgVariableRecord) {
   LLVMContext C;
+  // FIXME: PreserveInputDbgFormat is set to true because this test has
+  // been written to expect debug intrinsics rather than debug records; use the
+  // intrinsic format until we update the test checks.
+  auto SettingGuard = SaveDbgInfoFormat();
+  PreserveInputDbgFormat = cl::boolOrDefault::BOU_TRUE;
 
   // Test that RAUW also replaces the operands of DbgVariableRecord objects,
   // i.e. non-instruction stored debugging information.
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
index 396919763c93..910fc24455a6 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
@@ -44,7 +44,7 @@ protected:
     AARes.reset(new AAResults(TLI));
     AARes->addAAResult(*BasicAA);
     PSE.reset(new PredicatedScalarEvolution(*SE, *L));
-    LAI.reset(new LoopAccessInfo(L, &*SE, &TLI, &*AARes, &*DT, &*LI));
+    LAI.reset(new LoopAccessInfo(L, &*SE, nullptr, &TLI, &*AARes, &*DT, &*LI));
     IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI));
     IAI->analyzeInterleaving(false);
     return {Plan, *IAI};
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index eda4723f67b2..5c45d86130bd 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1530,5 +1530,13 @@ TEST(VPDoubleValueDefTest, traverseUseLists) {
   EXPECT_EQ(&DoubleValueDef, I3.getOperand(0)->getDefiningRecipe());
 }
 
+TEST(VPRecipeTest, CastToVPSingleDefRecipe) {
+  VPValue Start;
+  VPEVLBasedIVPHIRecipe R(&Start, {});
+  VPRecipeBase *B = &R;
+  EXPECT_TRUE(isa<VPSingleDefRecipe>(B));
+  // TODO: check other VPSingleDefRecipes.
+}
+
 } // namespace
 } // namespace llvm
diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
index 05aa146b5715..4a46f2ea9586 100644
--- a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/StringSet.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/TableGenBackend.h"
+#include <cstdint>
 
 using namespace llvm;
 
@@ -33,6 +34,16 @@ static void EmitARMTargetDef(RecordKeeper &RK, raw_ostream &OS) {
     return Set;
   };
 
+  // Sort the extensions alphabetically, so they don't appear in tablegen order.
+  std::vector<Record *> SortedExtensions =
+      RK.getAllDerivedDefinitions("Extension");
+  auto Alphabetical = [](Record *A, Record *B) -> bool {
+    const auto MarchA = A->getValueAsString("MArchName");
+    const auto MarchB = B->getValueAsString("MArchName");
+    return MarchA.compare(MarchB) < 0; // A lexographically less than B
+  };
+  std::sort(SortedExtensions.begin(), SortedExtensions.end(), Alphabetical);
+
   // The ARMProcFamilyEnum values are initialised by SubtargetFeature defs
   // which set the ARMProcFamily field. We can generate the enum from these defs
   // which look like this:
@@ -57,16 +68,46 @@ static void EmitARMTargetDef(RecordKeeper &RK, raw_ostream &OS) {
     OS << "ARM_ARCHITECTURE(" << Arch << ")\n";
   OS << "\n#undef ARM_ARCHITECTURE\n\n";
 
-  // Emit information for each defined Extension; used to build ArmExtKind.
-  OS << "#ifndef ARM_EXTENSION\n"
-     << "#define ARM_EXTENSION(NAME, ENUM)\n"
-     << "#endif\n\n";
-  for (const Record *Rec : RK.getAllDerivedDefinitions("Extension")) {
-    StringRef Name = Rec->getValueAsString("Name");
-    std::string Enum = Rec->getValueAsString("ArchExtKindSpelling").upper();
-    OS << "ARM_EXTENSION(" << Name << ", " << Enum << ")\n";
+  // Emit the ArchExtKind enum
+  OS << "#ifdef EMIT_ARCHEXTKIND_ENUM\n"
+     << "enum ArchExtKind : unsigned {\n"
+     << "  AEK_NONE = 1,\n";
+  for (const Record *Rec : SortedExtensions) {
+    auto AEK = Rec->getValueAsString("ArchExtKindSpelling").upper();
+    if (AEK != "AEK_NONE")
+      OS << "  " << AEK << ",\n";
   }
-  OS << "\n#undef ARM_EXTENSION\n\n";
+  OS << "  AEK_NUM_EXTENSIONS\n"
+     << "};\n"
+     << "#undef EMIT_ARCHEXTKIND_ENUM\n"
+     << "#endif // EMIT_ARCHEXTKIND_ENUM\n";
+
+  // Emit information for each defined Extension; used to build ArmExtKind.
+  OS << "#ifdef EMIT_EXTENSIONS\n"
+     << "inline constexpr ExtensionInfo Extensions[] = {\n";
+  for (const Record *Rec : SortedExtensions) {
+    auto AEK = Rec->getValueAsString("ArchExtKindSpelling").upper();
+    OS << "  ";
+    OS << "{\"" << Rec->getValueAsString("MArchName") << "\"";
+    OS << ", AArch64::" << AEK;
+    if (AEK == "AEK_NONE") {
+      // HACK: don't emit posfeat/negfeat strings for FMVOnlyExtensions.
+      OS << ", {}, {}";
+    } else {
+      OS << ", \"+" << Rec->getValueAsString("Name") << "\""; // posfeature
+      OS << ", \"-" << Rec->getValueAsString("Name") << "\""; // negfeature
+    }
+    OS << ", " << Rec->getValueAsString("FMVBit");
+    OS << ", \"" << Rec->getValueAsString("FMVDependencies") << "\"";
+    OS << ", " << (uint64_t)Rec->getValueAsInt("FMVPriority");
+    OS << "},\n";
+  };
+  OS << "  {\"none\", AArch64::AEK_NONE, {}, {}, FEAT_INIT, \"\", "
+        "ExtensionInfo::MaxFMVPriority},\n";
+  OS << "};\n"
+     << "#undef EMIT_EXTENSIONS\n"
+     << "#endif // EMIT_EXTENSIONS\n"
+     << "\n";
 }
 
 static TableGen::Emitter::Opt
diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index 53d49a2900a1..8e475f9153b0 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -3121,7 +3121,7 @@ static void emitMnemonicSpellChecker(raw_ostream &OS, CodeGenTarget &Target,
     OS << "\n";
     OS << "    StringRef T = I->getMnemonic();\n";
     OS << "    // Avoid recomputing the edit distance for the same string.\n";
-    OS << "    if (T.equals(Prev))\n";
+    OS << "    if (T == Prev)\n";
     OS << "      continue;\n";
     OS << "\n";
     OS << "    Prev = T;\n";
diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp
index a57885f22d7e..755b819e748f 100644
--- a/llvm/utils/TableGen/CodeEmitterGen.cpp
+++ b/llvm/utils/TableGen/CodeEmitterGen.cpp
@@ -68,7 +68,7 @@ private:
 
   void emitInstructionBaseValues(
       raw_ostream &o, ArrayRef<const CodeGenInstruction *> NumberedInstructions,
-      CodeGenTarget &Target, int HwMode = -1);
+      CodeGenTarget &Target, unsigned HwMode = DefaultMode);
   void
   emitCaseMap(raw_ostream &o,
               const std::map<std::string, std::vector<std::string>> &CaseMap);
@@ -281,7 +281,7 @@ std::pair<std::string, std::string>
 CodeEmitterGen::getInstructionCases(Record *R, CodeGenTarget &Target) {
   std::string Case, BitOffsetCase;
 
-  auto append = [&](const char *S) {
+  auto append = [&](const std::string &S) {
     Case += S;
     BitOffsetCase += S;
   };
@@ -290,11 +290,45 @@ CodeEmitterGen::getInstructionCases(Record *R, CodeGenTarget &Target) {
     if (auto *DI = dyn_cast_or_null<DefInit>(RV->getValue())) {
       const CodeGenHwModes &HWM = Target.getHwModes();
       EncodingInfoByHwMode EBM(DI->getDef(), HWM);
+
+      // Invoke the interface to obtain the HwMode ID controlling the
+      // EncodingInfo for the current subtarget. This interface will
+      // mask off irrelevant HwMode IDs.
+      append("      unsigned HwMode = "
+             "STI.getHwMode(MCSubtargetInfo::HwMode_EncodingInfo);\n");
+      Case += "      switch (HwMode) {\n";
+      Case += "      default: llvm_unreachable(\"Unknown hardware mode!\"); "
+              "break;\n";
+      for (auto &[ModeId, Encoding] : EBM) {
+        if (ModeId == DefaultMode) {
+          Case +=
+              "      case " + itostr(DefaultMode) + ": InstBitsByHw = InstBits";
+        } else {
+          Case += "      case " + itostr(ModeId) +
+                  ": InstBitsByHw = InstBits_" +
+                  std::string(HWM.getMode(ModeId).Name);
+        }
+        Case += "; break;\n";
+      }
+      Case += "      };\n";
+
+      // We need to remodify the 'Inst' value from the table we found above.
+      if (UseAPInt) {
+        int NumWords = APInt::getNumWords(BitWidth);
+        Case += "      Inst = APInt(" + itostr(BitWidth);
+        Case += ", ArrayRef(InstBitsByHw + opcode * " + itostr(NumWords) +
+                ", " + itostr(NumWords);
+        Case += "));\n";
+        Case += "      Value = Inst;\n";
+      } else {
+        Case += "      Value = InstBitsByHw[opcode];\n";
+      }
+
       append("      switch (HwMode) {\n");
       append("      default: llvm_unreachable(\"Unhandled HwMode\");\n");
-      for (auto &KV : EBM) {
-        append(("      case " + itostr(KV.first) + ": {\n").c_str());
-        addInstructionCasesForEncoding(R, KV.second, Target, Case,
+      for (auto &[ModeId, Encoding] : EBM) {
+        append("      case " + itostr(ModeId) + ": {\n");
+        addInstructionCasesForEncoding(R, Encoding, Target, Case,
                                        BitOffsetCase);
         append("      break;\n");
         append("      }\n");
@@ -360,9 +394,9 @@ static void emitInstBits(raw_ostream &OS, const APInt &Bits) {
 
 void CodeEmitterGen::emitInstructionBaseValues(
     raw_ostream &o, ArrayRef<const CodeGenInstruction *> NumberedInstructions,
-    CodeGenTarget &Target, int HwMode) {
+    CodeGenTarget &Target, unsigned HwMode) {
   const CodeGenHwModes &HWM = Target.getHwModes();
-  if (HwMode == -1)
+  if (HwMode == DefaultMode)
     o << "  static const uint64_t InstBits[] = {\n";
   else
     o << "  static const uint64_t InstBits_"
@@ -383,8 +417,17 @@ void CodeEmitterGen::emitInstructionBaseValues(
     if (const RecordVal *RV = R->getValue("EncodingInfos")) {
       if (auto *DI = dyn_cast_or_null<DefInit>(RV->getValue())) {
         EncodingInfoByHwMode EBM(DI->getDef(), HWM);
-        if (EBM.hasMode(HwMode))
+        if (EBM.hasMode(HwMode)) {
           EncodingDef = EBM.get(HwMode);
+        } else {
+          // If the HwMode does not match, then Encoding '0'
+          // should be generated.
+          APInt Value(BitWidth, 0);
+          o << "    ";
+          emitInstBits(o, Value);
+          o << "," << '\t' << "// " << R->getName() << "\n";
+          continue;
+        }
       }
     }
     BitsInit *BI = EncodingDef->getValueAsBitsInit("Inst");
@@ -479,23 +522,17 @@ void CodeEmitterGen::run(raw_ostream &o) {
     }
 
     // Emit instruction base values
-    if (HwModes.empty()) {
-      emitInstructionBaseValues(o, NumberedInstructions, Target, -1);
-    } else {
-      for (unsigned HwMode : HwModes)
-        emitInstructionBaseValues(o, NumberedInstructions, Target, (int)HwMode);
-    }
-
+    emitInstructionBaseValues(o, NumberedInstructions, Target, DefaultMode);
     if (!HwModes.empty()) {
-      o << "  const uint64_t *InstBits;\n";
-      o << "  unsigned HwMode = STI.getHwMode();\n";
-      o << "  switch (HwMode) {\n";
-      o << "  default: llvm_unreachable(\"Unknown hardware mode!\"); break;\n";
-      for (unsigned I : HwModes) {
-        o << "  case " << I << ": InstBits = InstBits_"
-          << HWM.getModeName(I, /*IncludeDefault=*/true) << "; break;\n";
+      // Emit table for instrs whose encodings are controlled by HwModes.
+      for (unsigned HwMode : HwModes) {
+        if (HwMode == DefaultMode)
+          continue;
+        emitInstructionBaseValues(o, NumberedInstructions, Target, HwMode);
       }
-      o << "  };\n";
+
+      // This pointer will be assigned to the HwMode table later.
+      o << "  const uint64_t *InstBitsByHw;\n";
     }
 
     // Map to accumulate all the cases.
diff --git a/llvm/utils/TableGen/Common/CodeGenHwModes.cpp b/llvm/utils/TableGen/Common/CodeGenHwModes.cpp
index fec74d29c8bb..124cfbaf4fb7 100644
--- a/llvm/utils/TableGen/Common/CodeGenHwModes.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenHwModes.cpp
@@ -74,6 +74,8 @@ CodeGenHwModes::CodeGenHwModes(RecordKeeper &RK) : Records(RK) {
     ModeIds.insert(std::pair(R, Modes.size()));
   }
 
+  assert(Modes.size() <= 32 && "number of HwModes exceeds maximum of 32");
+
   for (Record *R : Records.getAllDerivedDefinitions("HwModeSelect")) {
     auto P = ModeSelects.emplace(std::pair(R, HwModeSelect(R, *this)));
     assert(P.second);
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
index 8af219f34e18..6d03eecae672 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
@@ -1565,13 +1565,14 @@ bool MemoryAddressSpacePredicateMatcher::isIdentical(
 
 void MemoryAddressSpacePredicateMatcher::emitPredicateOpcodes(
     MatchTable &Table, RuleMatcher &Rule) const {
+  assert(AddrSpaces.size() < 256);
   Table << MatchTable::Opcode("GIM_CheckMemoryAddressSpace")
         << MatchTable::Comment("MI") << MatchTable::ULEB128Value(InsnVarID)
         << MatchTable::Comment("MMO")
         << MatchTable::ULEB128Value(MMOIdx)
         // Encode number of address spaces to expect.
         << MatchTable::Comment("NumAddrSpace")
-        << MatchTable::ULEB128Value(AddrSpaces.size());
+        << MatchTable::IntValue(1, AddrSpaces.size());
   for (unsigned AS : AddrSpaces)
     Table << MatchTable::Comment("AddrSpace") << MatchTable::ULEB128Value(AS);
 
@@ -1590,10 +1591,13 @@ bool MemoryAlignmentPredicateMatcher::isIdentical(
 
 void MemoryAlignmentPredicateMatcher::emitPredicateOpcodes(
     MatchTable &Table, RuleMatcher &Rule) const {
+  // TODO: we could support more, just need to emit the right opcode or switch
+  // to log alignment.
+  assert(MinAlign < 256);
   Table << MatchTable::Opcode("GIM_CheckMemoryAlignment")
         << MatchTable::Comment("MI") << MatchTable::ULEB128Value(InsnVarID)
         << MatchTable::Comment("MMO") << MatchTable::ULEB128Value(MMOIdx)
-        << MatchTable::Comment("MinAlign") << MatchTable::ULEB128Value(MinAlign)
+        << MatchTable::Comment("MinAlign") << MatchTable::IntValue(1, MinAlign)
         << MatchTable::LineBreak;
 }
 
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 3bd7f432ff9a..c303322e63b4 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -2301,7 +2301,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
     }
     case MCD::OPC_CheckField: {
       // Decode the start value.
-      unsigned Start = decodeULEB128AndInc(++Ptr);
+      unsigned Start = decodeULEB128AndIncUnsafe(++Ptr);
       unsigned Len = *Ptr;)";
   if (IsVarLenInst)
     OS << "\n      makeUp(insn, Start + Len);";
@@ -2328,7 +2328,7 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
     }
     case MCD::OPC_CheckPredicate: {
       // Decode the Predicate Index value.
-      unsigned PIdx = decodeULEB128AndInc(++Ptr);
+      unsigned PIdx = decodeULEB128AndIncUnsafe(++Ptr);
       // NumToSkip is a plain 24-bit integer.
       unsigned NumToSkip = *Ptr++;
       NumToSkip |= (*Ptr++) << 8;
@@ -2345,8 +2345,8 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
     }
     case MCD::OPC_Decode: {
       // Decode the Opcode value.
-      unsigned Opc = decodeULEB128AndInc(++Ptr);
-      unsigned DecodeIdx = decodeULEB128AndInc(Ptr);
+      unsigned Opc = decodeULEB128AndIncUnsafe(++Ptr);
+      unsigned DecodeIdx = decodeULEB128AndIncUnsafe(Ptr);
 
       MI.clear();
       MI.setOpcode(Opc);
@@ -2366,8 +2366,8 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
     }
     case MCD::OPC_TryDecode: {
       // Decode the Opcode value.
-      unsigned Opc = decodeULEB128AndInc(++Ptr);
-      unsigned DecodeIdx = decodeULEB128AndInc(Ptr);
+      unsigned Opc = decodeULEB128AndIncUnsafe(++Ptr);
+      unsigned DecodeIdx = decodeULEB128AndIncUnsafe(Ptr);
       // NumToSkip is a plain 24-bit integer.
       unsigned NumToSkip = *Ptr++;
       NumToSkip |= (*Ptr++) << 8;
@@ -2399,8 +2399,8 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
     }
     case MCD::OPC_SoftFail: {
       // Decode the mask values.
-      uint64_t PositiveMask = decodeULEB128AndInc(++Ptr);
-      uint64_t NegativeMask = decodeULEB128AndInc(Ptr);
+      uint64_t PositiveMask = decodeULEB128AndIncUnsafe(++Ptr);
+      uint64_t NegativeMask = decodeULEB128AndIncUnsafe(Ptr);
       bool Fail = (insn & PositiveMask) != 0 || (~insn & NegativeMask) != 0;
       if (Fail)
         S = MCDisassembler::SoftFail;
diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
index 6784514032eb..bb409ea6ea69 100644
--- a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
@@ -48,37 +48,41 @@ static void emitRISCVExtensions(RecordKeeper &Records, raw_ostream &OS) {
   OS << "#undef GET_SUPPORTED_EXTENSIONS\n\n";
 
   std::vector<Record *> Extensions =
-      Records.getAllDerivedDefinitions("RISCVExtension");
+      Records.getAllDerivedDefinitionsIfDefined("RISCVExtension");
   llvm::sort(Extensions, [](const Record *Rec1, const Record *Rec2) {
     return getExtensionName(Rec1) < getExtensionName(Rec2);
   });
 
-  printExtensionTable(OS, Extensions, /*Experimental=*/false);
-  printExtensionTable(OS, Extensions, /*Experimental=*/true);
+  if (!Extensions.empty()) {
+    printExtensionTable(OS, Extensions, /*Experimental=*/false);
+    printExtensionTable(OS, Extensions, /*Experimental=*/true);
+  }
 
   OS << "#endif // GET_SUPPORTED_EXTENSIONS\n\n";
 
   OS << "#ifdef GET_IMPLIED_EXTENSIONS\n";
   OS << "#undef GET_IMPLIED_EXTENSIONS\n\n";
 
-  OS << "\nstatic constexpr ImpliedExtsEntry ImpliedExts[] = {\n";
-  for (Record *Ext : Extensions) {
-    auto ImpliesList = Ext->getValueAsListOfDefs("Implies");
-    if (ImpliesList.empty())
-      continue;
+  if (!Extensions.empty()) {
+    OS << "\nstatic constexpr ImpliedExtsEntry ImpliedExts[] = {\n";
+    for (Record *Ext : Extensions) {
+      auto ImpliesList = Ext->getValueAsListOfDefs("Implies");
+      if (ImpliesList.empty())
+        continue;
 
-    StringRef Name = getExtensionName(Ext);
+      StringRef Name = getExtensionName(Ext);
 
-    for (auto *ImpliedExt : ImpliesList) {
-      if (!ImpliedExt->isSubClassOf("RISCVExtension"))
-        continue;
+      for (auto *ImpliedExt : ImpliesList) {
+        if (!ImpliedExt->isSubClassOf("RISCVExtension"))
+          continue;
 
-      OS << "    { {\"" << Name << "\"}, \"" << getExtensionName(ImpliedExt)
-         << "\"},\n";
+        OS << "    { {\"" << Name << "\"}, \"" << getExtensionName(ImpliedExt)
+           << "\"},\n";
+      }
     }
-  }
 
-  OS << "};\n\n";
+    OS << "};\n\n";
+  }
 
   OS << "#endif // GET_IMPLIED_EXTENSIONS\n\n";
 }
@@ -122,19 +126,20 @@ static void emitRISCVProfiles(RecordKeeper &Records, raw_ostream &OS) {
   OS << "#ifdef GET_SUPPORTED_PROFILES\n";
   OS << "#undef GET_SUPPORTED_PROFILES\n\n";
 
-  OS << "static constexpr RISCVProfile SupportedProfiles[] = {\n";
+  auto Profiles = Records.getAllDerivedDefinitionsIfDefined("RISCVProfile");
 
-  auto Profiles = Records.getAllDerivedDefinitions("RISCVProfile");
-  llvm::sort(Profiles, LessRecordFieldName());
+  if (!Profiles.empty()) {
+    llvm::sort(Profiles, LessRecordFieldName());
+    OS << "static constexpr RISCVProfile SupportedProfiles[] = {\n";
+    for (const Record *Rec : Profiles) {
+      OS.indent(4) << "{\"" << Rec->getValueAsString("Name") << "\",\"";
+      printMArch(OS, Rec->getValueAsListOfDefs("Implies"));
+      OS << "\"},\n";
+    }
 
-  for (const Record *Rec : Profiles) {
-    OS.indent(4) << "{\"" << Rec->getValueAsString("Name") << "\",\"";
-    printMArch(OS, Rec->getValueAsListOfDefs("Implies"));
-    OS << "\"},\n";
+    OS << "};\n\n";
   }
 
-  OS << "};\n\n";
-
   OS << "#endif // GET_SUPPORTED_PROFILES\n\n";
 }
 
@@ -144,7 +149,8 @@ static void emitRISCVProcs(RecordKeeper &RK, raw_ostream &OS) {
      << "#endif\n\n";
 
   // Iterate on all definition records.
-  for (const Record *Rec : RK.getAllDerivedDefinitions("RISCVProcessorModel")) {
+  for (const Record *Rec :
+       RK.getAllDerivedDefinitionsIfDefined("RISCVProcessorModel")) {
     const std::vector<Record *> &Features =
         Rec->getValueAsListOfDefs("Features");
     bool FastScalarUnalignedAccess = any_of(Features, [&](auto &Feature) {
@@ -177,7 +183,7 @@ static void emitRISCVProcs(RecordKeeper &RK, raw_ostream &OS) {
      << "#endif\n\n";
 
   for (const Record *Rec :
-       RK.getAllDerivedDefinitions("RISCVTuneProcessorModel")) {
+       RK.getAllDerivedDefinitionsIfDefined("RISCVTuneProcessorModel")) {
     OS << "TUNE_PROC(" << Rec->getName() << ", "
        << "\"" << Rec->getValueAsString("Name") << "\")\n";
   }
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index ee8830edeedb..980d9a39636e 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -962,7 +962,7 @@ void RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
 
   OS << "extern const MCRegisterDesc " << TargetName
      << "RegDesc[] = { // Descriptors\n";
-  OS << "  { " << RegStrings.get("") << ", 0, 0, 0, 0, 0 },\n";
+  OS << "  { " << RegStrings.get("") << ", 0, 0, 0, 0, 0, 0 },\n";
 
   // Emit the register descriptors now.
   i = 0;
@@ -977,7 +977,8 @@ void RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
        << DiffSeqs.get(SubRegLists[i]) << ", " << DiffSeqs.get(SuperRegLists[i])
        << ", " << SubRegIdxSeqs.get(SubRegIdxLists[i]) << ", "
        << (Offset << RegUnitBits | FirstRU) << ", "
-       << LaneMaskSeqs.get(RegUnitLaneMasks[i]) << " },\n";
+       << LaneMaskSeqs.get(RegUnitLaneMasks[i]) << ", " << Reg.Constant
+       << " },\n";
     ++i;
   }
   OS << "};\n\n"; // End of register descriptors...
diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp
index 2e2c57b802ee..b6b7641cfb92 100644
--- a/llvm/utils/TableGen/SubtargetEmitter.cpp
+++ b/llvm/utils/TableGen/SubtargetEmitter.cpp
@@ -1781,13 +1781,62 @@ void SubtargetEmitter::EmitHwModeCheck(const std::string &ClassName,
   if (CGH.getNumModeIds() == 1)
     return;
 
-  OS << "unsigned " << ClassName << "::getHwMode() const {\n";
+  // Collect all HwModes and related features defined in the TD files,
+  // and store them as a bit set.
+  unsigned ValueTypeModes = 0;
+  unsigned RegInfoModes = 0;
+  unsigned EncodingInfoModes = 0;
+  for (const auto &MS : CGH.getHwModeSelects()) {
+    for (const HwModeSelect::PairType &P : MS.second.Items) {
+      if (P.first == DefaultMode)
+        continue;
+      if (P.second->isSubClassOf("ValueType")) {
+        ValueTypeModes |= (1 << (P.first - 1));
+      } else if (P.second->isSubClassOf("RegInfo") ||
+                 P.second->isSubClassOf("SubRegRange")) {
+        RegInfoModes |= (1 << (P.first - 1));
+      } else if (P.second->isSubClassOf("InstructionEncoding")) {
+        EncodingInfoModes |= (1 << (P.first - 1));
+      }
+    }
+  }
+
+  // Start emitting for getHwModeSet().
+  OS << "unsigned " << ClassName << "::getHwModeSet() const {\n";
+  OS << "  // Collect HwModes and store them as a bit set.\n";
+  OS << "  unsigned Modes = 0;\n";
   for (unsigned M = 1, NumModes = CGH.getNumModeIds(); M != NumModes; ++M) {
     const HwMode &HM = CGH.getMode(M);
-    OS << "  if (checkFeatures(\"" << HM.Features << "\")) return " << M
-       << ";\n";
+    OS << "  if (checkFeatures(\"" << HM.Features << "\")) Modes |= (1 << "
+       << (M - 1) << ");\n";
   }
-  OS << "  return 0;\n}\n";
+  OS << "  return Modes;\n}\n";
+  // End emitting for getHwModeSet().
+
+  auto handlePerMode = [&](std::string ModeType, unsigned ModeInBitSet) {
+    OS << "  case HwMode_" << ModeType << ":\n"
+       << "    Modes &= " << ModeInBitSet << ";\n"
+       << "    if (!Modes)\n      return Modes;\n"
+       << "    if (!llvm::has_single_bit<unsigned>(Modes))\n"
+       << "      llvm_unreachable(\"Two or more HwModes for " << ModeType
+       << " were found!\");\n"
+       << "    return llvm::countr_zero(Modes) + 1;\n";
+  };
+
+  // Start emitting for getHwMode().
+  OS << "unsigned " << ClassName
+     << "::getHwMode(enum HwModeType type) const {\n";
+  OS << "  unsigned Modes = getHwModeSet();\n\n";
+  OS << "  if (!Modes)\n    return Modes;\n\n";
+  OS << "  switch (type) {\n";
+  OS << "  case HwMode_Default:\n    return llvm::countr_zero(Modes) + 1;\n";
+  handlePerMode("ValueType", ValueTypeModes);
+  handlePerMode("RegInfo", RegInfoModes);
+  handlePerMode("EncodingInfo", EncodingInfoModes);
+  OS << "  }\n";
+  OS << "  llvm_unreachable(\"unexpected HwModeType\");\n"
+     << "  return 0; // should not get here\n}\n";
+  // End emitting for getHwMode().
 }
 
 void SubtargetEmitter::emitGetMacroFusions(const std::string &ClassName,
@@ -1876,8 +1925,11 @@ void SubtargetEmitter::emitGenMCSubtargetInfo(raw_ostream &OS) {
      << "    return " << Target << "_MC"
      << "::resolveVariantSchedClassImpl(SchedClass, MI, MCII, CPUID);\n";
   OS << "  }\n";
-  if (TGT.getHwModes().getNumModeIds() > 1)
-    OS << "  unsigned getHwMode() const override;\n";
+  if (TGT.getHwModes().getNumModeIds() > 1) {
+    OS << "  unsigned getHwModeSet() const override;\n";
+    OS << "  unsigned getHwMode(enum HwModeType type = HwMode_Default) const "
+          "override;\n";
+  }
   OS << "};\n";
   EmitHwModeCheck(Target + "GenMCSubtargetInfo", OS);
 }
@@ -2004,8 +2056,11 @@ void SubtargetEmitter::run(raw_ostream &OS) {
      << " unsigned CPUID) const override;\n"
      << "  DFAPacketizer *createDFAPacketizer(const InstrItineraryData *IID)"
      << " const;\n";
-  if (TGT.getHwModes().getNumModeIds() > 1)
-    OS << "  unsigned getHwMode() const override;\n";
+  if (TGT.getHwModes().getNumModeIds() > 1) {
+    OS << "  unsigned getHwModeSet() const override;\n";
+    OS << "  unsigned getHwMode(enum HwModeType type = HwMode_Default) const "
+          "override;\n";
+  }
   if (TGT.hasMacroFusion())
     OS << "  std::vector<MacroFusionPredTy> getMacroFusions() const "
           "override;\n";
diff --git a/llvm/utils/emacs/llvm-mir-mode.el b/llvm/utils/emacs/llvm-mir-mode.el
index 6f1de4252445..5ded9cce50bb 100644
--- a/llvm/utils/emacs/llvm-mir-mode.el
+++ b/llvm/utils/emacs/llvm-mir-mode.el
@@ -1,70 +1,70 @@
-;;; llvm-mir-mode.el --- Major mode for LLVM Machine IR
-
-;; Maintainer:  The LLVM team, http://llvm.org/
-;; Version: 1.0
-
-;;; Commentary:
-
-;; Major mode for editing LLVM MIR files.
-
-;;; Code:
-
-(require 'llvm-mode)
-
-(defvar llvm-mir-mode-map
-  (let ((map (make-sparse-keymap)))
-    map)
-  "Keymap for `llvm-mir-mode'.")
-
-(defvar llvm-mir-mode-syntax-table
-  (let ((st (make-syntax-table)))
-    (modify-syntax-entry ?% "_" st)
-    (modify-syntax-entry ?$ "_" st)
-    (modify-syntax-entry ?. "_" st)
-    (modify-syntax-entry ?# "< " st)
-    (modify-syntax-entry ?\; "< " st)
-    (modify-syntax-entry ?\n "> " st)
-    st)
-  "Syntax table for `llvm-mir-mode'.")
-
-(defvar llvm-mir-font-lock-keywords
-  (append
-   (list
-    ; YAML Attributes
-    '("^name: +\\([a-zA-Z._][-a-zA-Z._0-9]*\\)"
-      1 font-lock-function-name-face)
-    '("^body: +|" . font-lock-keyword-face)
-    '("^[a-zA-Z_.][-a-zA-Z._0-9]*:" . font-lock-keyword-face)
-    `(,(regexp-opt '("true" "false")) . font-lock-constant-face)
-    ; YAML separators
-    '("^\\(---\\( |\\)?\\|\\.\\.\\.\\)$" . font-lock-comment-face)
-    ; Registers
-    '("%[a-zA-Z_.][-a-zA-Z._0-9]*" . font-lock-variable-name-face)
-    '("%[0-9]+\\(\\.[a-zA-Z._0-9]+\\)?" . font-lock-variable-name-face)
-    '("$[a-zA-Z_.][-a-zA-Z._0-9]*" . font-lock-constant-face)
-    ; Register classes
-    `(,(concat
-        "%\\([a-zA-Z_.][-a-zA-Z._0-9]*\\|[0-9]+\\(\\.[a-zA-Z._0-9]+\\)?\\)"
-        "\\(:[a-zA-Z_.][-a-zA-Z._0-9]*\\)")
-      3 font-lock-type-face)
-    '("class: \\([a-zA-Z_.][-a-zA-Z._0-9]*\\)" 1 font-lock-type-face)
-    ; MO Register flags
-    `(,(regexp-opt '("dead" "debug-use" "def" "early-clobber" "implicit"
-                     "implicit-def" "internal" "killed" "renamable" "undef")
-                   'symbols)
-      . font-lock-keyword-face))
-   llvm-font-lock-keywords)
-  "Keyword highlighting specification for `llvm-mir-mode'.")
-
- ;;;###autoload
-(define-derived-mode llvm-mir-mode prog-mode "LLVM MIR"
-  "A major mode for editing LLVM MIR files."
-  (setq-local comment-start "; ")
-  (setq-local font-lock-defaults `(llvm-mir-font-lock-keywords)))
-
-;;;###autoload
-(add-to-list 'auto-mode-alist (cons "\\.mir\\'" 'llvm-mir-mode))
-
-(provide 'llvm-mir-mode)
-
-;;; llvm-mir-mode.el ends here
+;;; llvm-mir-mode.el --- Major mode for LLVM Machine IR
+
+;; Maintainer:  The LLVM team, http://llvm.org/
+;; Version: 1.0
+
+;;; Commentary:
+
+;; Major mode for editing LLVM MIR files.
+
+;;; Code:
+
+(require 'llvm-mode)
+
+(defvar llvm-mir-mode-map
+  (let ((map (make-sparse-keymap)))
+    map)
+  "Keymap for `llvm-mir-mode'.")
+
+(defvar llvm-mir-mode-syntax-table
+  (let ((st (make-syntax-table)))
+    (modify-syntax-entry ?% "_" st)
+    (modify-syntax-entry ?$ "_" st)
+    (modify-syntax-entry ?. "_" st)
+    (modify-syntax-entry ?# "< " st)
+    (modify-syntax-entry ?\; "< " st)
+    (modify-syntax-entry ?\n "> " st)
+    st)
+  "Syntax table for `llvm-mir-mode'.")
+
+(defvar llvm-mir-font-lock-keywords
+  (append
+   (list
+    ; YAML Attributes
+    '("^name: +\\([a-zA-Z._][-a-zA-Z._0-9]*\\)"
+      1 font-lock-function-name-face)
+    '("^body: +|" . font-lock-keyword-face)
+    '("^[a-zA-Z_.][-a-zA-Z._0-9]*:" . font-lock-keyword-face)
+    `(,(regexp-opt '("true" "false")) . font-lock-constant-face)
+    ; YAML separators
+    '("^\\(---\\( |\\)?\\|\\.\\.\\.\\)$" . font-lock-comment-face)
+    ; Registers
+    '("%[a-zA-Z_.][-a-zA-Z._0-9]*" . font-lock-variable-name-face)
+    '("%[0-9]+\\(\\.[a-zA-Z._0-9]+\\)?" . font-lock-variable-name-face)
+    '("$[a-zA-Z_.][-a-zA-Z._0-9]*" . font-lock-constant-face)
+    ; Register classes
+    `(,(concat
+        "%\\([a-zA-Z_.][-a-zA-Z._0-9]*\\|[0-9]+\\(\\.[a-zA-Z._0-9]+\\)?\\)"
+        "\\(:[a-zA-Z_.][-a-zA-Z._0-9]*\\)")
+      3 font-lock-type-face)
+    '("class: \\([a-zA-Z_.][-a-zA-Z._0-9]*\\)" 1 font-lock-type-face)
+    ; MO Register flags
+    `(,(regexp-opt '("dead" "debug-use" "def" "early-clobber" "implicit"
+                     "implicit-def" "internal" "killed" "renamable" "undef")
+                   'symbols)
+      . font-lock-keyword-face))
+   llvm-font-lock-keywords)
+  "Keyword highlighting specification for `llvm-mir-mode'.")
+
+ ;;;###autoload
+(define-derived-mode llvm-mir-mode prog-mode "LLVM MIR"
+  "A major mode for editing LLVM MIR files."
+  (setq-local comment-start "; ")
+  (setq-local font-lock-defaults `(llvm-mir-font-lock-keywords)))
+
+;;;###autoload
+(add-to-list 'auto-mode-alist (cons "\\.mir\\'" 'llvm-mir-mode))
+
+(provide 'llvm-mir-mode)
+
+;;; llvm-mir-mode.el ends here
diff --git a/llvm/utils/git/requirements.txt b/llvm/utils/git/requirements.txt
index bed449e6bf9f..0ff62a8ea5be 100644
--- a/llvm/utils/git/requirements.txt
+++ b/llvm/utils/git/requirements.txt
@@ -4,41 +4,39 @@
 #
 #    pip-compile --output-file=requirements.txt requirements.txt.in
 #
-certifi==2023.7.22
+certifi==2024.2.2
     # via
     #   -r requirements.txt.in
     #   requests
-cffi==1.15.1
+cffi==1.16.0
     # via
     #   cryptography
     #   pynacl
-charset-normalizer==2.1.1
+charset-normalizer==3.3.2
     # via requests
-cryptography==41.0.3
+cryptography==42.0.5
     # via pyjwt
-deprecated==1.2.13
+deprecated==1.2.14
     # via pygithub
-gitdb==4.0.9
+gitdb==4.0.11
     # via gitpython
-gitpython==3.1.32
+gitpython==3.1.43
     # via -r requirements.txt.in
-idna==3.4
+idna==3.7
     # via requests
-pycparser==2.21
+pycparser==2.22
     # via cffi
 pygithub==1.59.1
     # via -r requirements.txt.in
-pyjwt[crypto]==2.5.0
+pyjwt[crypto]==2.8.0
     # via pygithub
 pynacl==1.5.0
     # via pygithub
-requests==2.28.1
+requests==2.31.0
     # via pygithub
-smmap==5.0.0
+smmap==5.0.1
     # via gitdb
-types-cryptography==3.3.23.2
-    # via pyjwt
-urllib3==1.26.12
+urllib3==2.2.1
     # via requests
-wrapt==1.14.1
+wrapt==1.16.0
     # via deprecated
diff --git a/llvm/utils/git/requirements_formatting.txt b/llvm/utils/git/requirements_formatting.txt
index ff744f0d4225..4c9dbd8755ab 100644
--- a/llvm/utils/git/requirements_formatting.txt
+++ b/llvm/utils/git/requirements_formatting.txt
@@ -4,37 +4,37 @@
 #
 #    pip-compile --output-file=llvm/utils/git/requirements_formatting.txt llvm/utils/git/requirements_formatting.txt.in
 #
-black==23.9.1
+black==23.12.1
     # via
     #   -r llvm/utils/git/requirements_formatting.txt.in
     #   darker
-certifi==2023.7.22
+certifi==2024.2.2
     # via requests
-cffi==1.15.1
+cffi==1.16.0
     # via
     #   cryptography
     #   pynacl
-charset-normalizer==3.2.0
+charset-normalizer==3.3.2
     # via requests
 click==8.1.7
     # via black
-cryptography==41.0.3
+cryptography==42.0.5
     # via pyjwt
 darker==1.7.2
     # via -r llvm/utils/git/requirements_formatting.txt.in
 deprecated==1.2.14
     # via pygithub
-idna==3.4
+idna==3.7
     # via requests
 mypy-extensions==1.0.0
     # via black
-packaging==23.1
+packaging==24.0
     # via black
-pathspec==0.11.2
+pathspec==0.12.1
     # via black
-platformdirs==3.10.0
+platformdirs==4.2.1
     # via black
-pycparser==2.21
+pycparser==2.22
     # via cffi
 pygithub==1.59.1
     # via -r llvm/utils/git/requirements_formatting.txt.in
@@ -46,7 +46,7 @@ requests==2.31.0
     # via pygithub
 toml==0.10.2
     # via darker
-urllib3==2.0.4
+urllib3==2.2.1
     # via requests
-wrapt==1.15.0
+wrapt==1.16.0
     # via deprecated
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/test/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/test/BUILD.gn
index 3f541f04de43..2227ad42cf40 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/test/BUILD.gn
@@ -35,7 +35,6 @@ write_lit_config("lit_site_cfg") {
     "CURRENT_TOOLS_DIR=" + rebase_path("$root_out_dir/bin"),
     "CLANG_PLUGIN_SUPPORT=0",
     "LLVM_HOST_TRIPLE=$llvm_current_triple",
-    "LLVM_INSTALL_TOOLCHAIN_ONLY=0",
     "LLVM_LIT_TOOLS_DIR=",  # Intentionally empty, matches cmake build.
     "LLVM_TOOLS_DIR=" + rebase_path("$root_out_dir/bin"),
     "Python3_EXECUTABLE=$python_path",
diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
index 971ceb3185ff..0a7cc3854056 100644
--- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
@@ -105,9 +105,11 @@ copy("Headers") {
     "__clang_hip_stdlib.h",
     "__stdarg___gnuc_va_list.h",
     "__stdarg___va_copy.h",
+    "__stdarg_header_macro.h",
     "__stdarg_va_arg.h",
     "__stdarg_va_copy.h",
     "__stdarg_va_list.h",
+    "__stddef_header_macro.h",
     "__stddef_max_align_t.h",
     "__stddef_null.h",
     "__stddef_nullptr_t.h",
diff --git a/llvm/utils/gn/secondary/clang/test/BUILD.gn b/llvm/utils/gn/secondary/clang/test/BUILD.gn
index 4ed9352da9c9..11454e68ec91 100644
--- a/llvm/utils/gn/secondary/clang/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/test/BUILD.gn
@@ -116,11 +116,13 @@ write_lit_config("lit_site_cfg") {
       "CMAKE_LIBRARY_OUTPUT_DIRECTORY=" + rebase_path("$root_out_dir/bin", dir),
       "LLVM_LIT_ERRC_MESSAGES=no such file or directory;is a directory;" +
           "invalid argument;permission denied",
+      "PERL_EXECUTABLE="
     ]
   } else {
     extra_values += [
       "CMAKE_LIBRARY_OUTPUT_DIRECTORY=" + rebase_path("$root_out_dir/lib", dir),
       "LLVM_LIT_ERRC_MESSAGES=",
+      "PERL_EXECUTABLE=/usr/bin/perl"
     ]
   }
 
diff --git a/llvm/utils/gn/secondary/clang/unittests/AST/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/AST/BUILD.gn
index 1fb3d621275e..f66e86ad938b 100644
--- a/llvm/utils/gn/secondary/clang/unittests/AST/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/AST/BUILD.gn
@@ -32,6 +32,7 @@ unittest("ASTTests") {
     "CommentLexer.cpp",
     "CommentParser.cpp",
     "CommentTextTest.cpp",
+    "ConceptPrinterTest.cpp",
     "DataCollectionTest.cpp",
     "DeclPrinterTest.cpp",
     "DeclTest.cpp",
diff --git a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
index df5b4587bf1c..e16ca31b81a8 100644
--- a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
@@ -33,7 +33,6 @@ unittest("ClangAnalysisFlowSensitiveTests") {
     "SignAnalysisTest.cpp",
     "SimplifyConstraintsTest.cpp",
     "SingleVarConstantPropagationTest.cpp",
-    "SolverTest.cpp",
     "TestingSupport.cpp",
     "TestingSupportTest.cpp",
     "TransferBranchTest.cpp",
@@ -41,5 +40,6 @@ unittest("ClangAnalysisFlowSensitiveTests") {
     "TypeErasedDataflowAnalysisTest.cpp",
     "UncheckedOptionalAccessModelTest.cpp",
     "ValueTest.cpp",
+    "WatchedLiteralsSolverTest.cpp",
   ]
 }
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index cbeb8a26a62d..210b26e8f166 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -516,6 +516,7 @@ if (current_toolchain == default_toolchain) {
       "__ios/fpos.h",
       "__iterator/access.h",
       "__iterator/advance.h",
+      "__iterator/aliasing_iterator.h",
       "__iterator/back_insert_iterator.h",
       "__iterator/bounded_iter.h",
       "__iterator/common_iterator.h",
@@ -651,6 +652,8 @@ if (current_toolchain == default_toolchain) {
       "__numeric/transform_exclusive_scan.h",
       "__numeric/transform_inclusive_scan.h",
       "__numeric/transform_reduce.h",
+      "__ostream/basic_ostream.h",
+      "__ostream/print.h",
       "__pstl/backends/libdispatch.h",
       "__pstl/backends/serial.h",
       "__pstl/backends/std_thread.h",
@@ -800,7 +803,6 @@ if (current_toolchain == default_toolchain) {
       "__type_traits/aligned_storage.h",
       "__type_traits/aligned_union.h",
       "__type_traits/alignment_of.h",
-      "__type_traits/apply_cv.h",
       "__type_traits/can_extract_key.h",
       "__type_traits/common_reference.h",
       "__type_traits/common_type.h",
diff --git a/llvm/utils/gn/secondary/lldb/include/lldb/API/BUILD.gn b/llvm/utils/gn/secondary/lldb/include/lldb/API/BUILD.gn
index 18726255e6a4..2fe295d13656 100644
--- a/llvm/utils/gn/secondary/lldb/include/lldb/API/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/include/lldb/API/BUILD.gn
@@ -1,11 +1,10 @@
-import("//lldb/utils/TableGen/lldb_tablegen.gni")
-
-lldb_tablegen("SBLanguages") {
-  args = [ "-gen-lldb-sbapi-dwarf-enum" ]
-
-  # See discussion on https://github.com/llvm/llvm-project/pull/89981 for
-  # why this runs tblgen on a .def file.
-  td_file = "//llvm/include/llvm/BinaryFormat/Dwarf.def"
-  output_name = "SBLanguages.h"
+action("SBLanguages") {
+  script = "//lldb/scripts/generate-sbapi-dwarf-enum.py"
+  outputs = [ "$target_gen_dir/SBLanguages.h" ]
+  sources = [ "//llvm/include/llvm/BinaryFormat/Dwarf.def" ]
+  args = [
+    rebase_path(sources[0], root_build_dir),
+    "-o",
+    rebase_path(outputs[0], root_build_dir),
+  ]
 }
-
diff --git a/llvm/utils/gn/secondary/lldb/utils/TableGen/BUILD.gn b/llvm/utils/gn/secondary/lldb/utils/TableGen/BUILD.gn
index bc9f6903a288..37305d1de66c 100644
--- a/llvm/utils/gn/secondary/lldb/utils/TableGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/utils/TableGen/BUILD.gn
@@ -6,7 +6,6 @@ executable("lldb-tblgen") {
   sources = [
     "LLDBOptionDefEmitter.cpp",
     "LLDBPropertyDefEmitter.cpp",
-    "LLDBSBAPIDWARFEnum.cpp",
     "LLDBTableGen.cpp",
     "LLDBTableGenUtils.cpp",
   ]
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index e1b867bf70ba..dad4f028236d 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -60,7 +60,9 @@ tablegen("AMDGPUGenMCPseudoLowering") {
 tablegen("AMDGPUGenRegisterBank") {
   visibility = [
     ":LLVMAMDGPUCodeGen",
+    "MCTargetDesc",
     "Utils",
+    "//llvm/unittests/MC/AMDGPU:AMDGPUMCTests",
     "//llvm/unittests/Target/AMDGPU:AMDGPUTests",
   ]
   args = [ "-gen-register-bank" ]
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn
index 5ba91fcec83a..0df55cbc0826 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn
@@ -94,6 +94,10 @@ static_library("MCTargetDesc") {
     "//llvm/lib/Target/AMDGPU/TargetInfo",
     "//llvm/lib/Target/AMDGPU/Utils",
     "//llvm/lib/TargetParser",
+
+    # AMDGPUMCExpr.cpp includes GCNSubtarget.h which after 490e348e679
+    # includes the generated AMDGPUGenRegisterBank.inc file :/
+    "//llvm/lib/Target/AMDGPU/:AMDGPUGenRegisterBank",
   ]
   include_dirs = [ ".." ]
   sources = [
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Utils/BUILD.gn
index 631d1ef5c7b0..ec0d5fc767f7 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Utils/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Utils/BUILD.gn
@@ -1,7 +1,10 @@
 import("//llvm/utils/TableGen/tablegen.gni")
 
 tablegen("AMDGPUGenSearchableTables") {
-  visibility = [ ":Utils" ]
+  visibility = [
+    ":Utils",
+    "//llvm/unittests/MC/AMDGPU:AMDGPUMCTests",
+  ]
   args = [ "-gen-searchable-tables" ]
   td_file = "../AMDGPU.td"
 }
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/LoongArch/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/LoongArch/BUILD.gn
index 6e0efc548e33..822c2ec8afc6 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/LoongArch/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/LoongArch/BUILD.gn
@@ -41,6 +41,7 @@ static_library("LLVMLoongArchCodeGen") {
     "LoongArchISelLowering.cpp",
     "LoongArchInstrInfo.cpp",
     "LoongArchMCInstLower.cpp",
+    "LoongArchOptWInstrs.cpp",
     "LoongArchRegisterInfo.cpp",
     "LoongArchSubtarget.cpp",
     "LoongArchTargetMachine.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn
index d79b5efe69eb..50309d8ee024 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn
@@ -26,6 +26,7 @@ static_library("Instrumentation") {
     "LowerAllowCheckPass.cpp",
     "MemProfiler.cpp",
     "MemorySanitizer.cpp",
+    "PGOCtxProfLowering.cpp",
     "PGOForceFunctionAttrs.cpp",
     "PGOInstrumentation.cpp",
     "PGOMemOPSizeOpt.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn
index 6cc3848c1114..2db5b9603f21 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn
@@ -77,7 +77,7 @@ group("unittests") {
   }
   if (llvm_build_AMDGPU) {
     deps += [
-      "MC/AMDGPU:AMDGPUDwarfTests",
+      "MC/AMDGPU:AMDGPUMCTests",
       "Target/AMDGPU:AMDGPUTests",
     ]
   }
diff --git a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/GlobalISel/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/GlobalISel/BUILD.gn
index 003e69d8f85e..a91709357d88 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/GlobalISel/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/GlobalISel/BUILD.gn
@@ -16,6 +16,7 @@ unittest("GlobalISelTests") {
   sources = [
     "CSETest.cpp",
     "ConstantFoldingTest.cpp",
+    "GIMatchTableExecutorTest.cpp",
     "GISelAliasTest.cpp",
     "GISelMITest.cpp",
     "GISelUtilsTest.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn
index 80ac77feec9a..6070a6f00419 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn
@@ -15,6 +15,7 @@ unittest("LLVMFrontendTests") {
     "OpenACCTest.cpp",
     "OpenMPCompositionTest.cpp",
     "OpenMPContextTest.cpp",
+    "OpenMPDecompositionTest.cpp",
     "OpenMPIRBuilderTest.cpp",
     "OpenMPParsingTest.cpp",
   ]
diff --git a/llvm/utils/gn/secondary/llvm/unittests/MC/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/MC/AMDGPU/BUILD.gn
index 603753abec35..4a7f829d6d8e 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/MC/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/MC/AMDGPU/BUILD.gn
@@ -1,13 +1,29 @@
 import("//third-party/unittest/unittest.gni")
 
-unittest("AMDGPUDwarfTests") {
+unittest("AMDGPUMCTests") {
   deps = [
+    "//llvm/lib/CodeGen",
+    "//llvm/lib/IR",
     "//llvm/lib/MC",
     "//llvm/lib/Support",
     "//llvm/lib/Target/AMDGPU:LLVMAMDGPUCodeGen",
     "//llvm/lib/Target/AMDGPU/MCTargetDesc",
     "//llvm/lib/Target/AMDGPU/TargetInfo",
     "//llvm/lib/TargetParser",
+
+    # SIProgramInfoMCExprs.cpp includes AMDGPUTargetMachine.h, which includes
+    # the generated AMDGPUGenRegisterBank.inc file :/
+    "//llvm/lib/Target/AMDGPU:AMDGPUGenRegisterBank",
+
+    # SIProgramInfoMCExprs.cpp includes AMDGPUTargetMachine.h, which includes
+    # the generated AMDGPUGenSearchableTables.inc file :/
+    "//llvm/lib/Target/AMDGPU/Utils:AMDGPUGenSearchableTables",
+  ]
+
+  # AMDGPUMCTests heavily reaches into lib/Target/AMDGPU internals.
+  include_dirs = [ "//llvm/lib/Target/AMDGPU" ]
+  sources = [
+    "DwarfRegMappings.cpp",
+    "SIProgramInfoMCExprs.cpp",
   ]
-  sources = [ "DwarfRegMappings.cpp" ]
 }
diff --git a/llvm/utils/update_test_body.py b/llvm/utils/update_test_body.py
new file mode 100755
index 000000000000..661b0270d783
--- /dev/null
+++ b/llvm/utils/update_test_body.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+"""Generate test body using split-file and a custom script.
+
+The script will prepare extra files with `split-file`, invoke `gen`, and then
+rewrite the part after `gen` with its stdout.
+
+https://llvm.org/docs/TestingGuide.html#elaborated-tests
+
+Example:
+PATH=/path/to/clang_build/bin:$PATH llvm/utils/update_test_body.py path/to/test.s
+"""
+import argparse
+import contextlib
+import os
+import re
+import subprocess
+import sys
+import tempfile
+
+
+@contextlib.contextmanager
+def cd(directory):
+    cwd = os.getcwd()
+    os.chdir(directory)
+    try:
+        yield
+    finally:
+        os.chdir(cwd)
+
+
+def process(args, path):
+    prolog = []
+    seen_gen = False
+    with open(path) as f:
+        for line in f.readlines():
+            line = line.rstrip()
+            prolog.append(line)
+            if (seen_gen and re.match(r"(.|//)---", line)) or line.startswith(".endif"):
+                break
+            if re.match(r"(.|//)--- gen", line):
+                seen_gen = True
+        else:
+            print(
+                "'gen' should be followed by another part (---) or .endif",
+                file=sys.stderr,
+            )
+            return 1
+
+    if not seen_gen:
+        print("'gen' does not exist", file=sys.stderr)
+        return 1
+    with tempfile.TemporaryDirectory(prefix="update_test_body_") as dir:
+        try:
+            # If the last line starts with ".endif", remove it.
+            sub = subprocess.run(
+                ["split-file", "-", dir],
+                input="\n".join(
+                    prolog[:-1] if prolog[-1].startswith(".endif") else prolog
+                ).encode(),
+                capture_output=True,
+                check=True,
+            )
+        except subprocess.CalledProcessError as ex:
+            sys.stderr.write(ex.stderr.decode())
+            return 1
+        with cd(dir):
+            if args.shell:
+                print(f"invoke shell in the temporary directory '{dir}'")
+                subprocess.run([os.environ.get("SHELL", "sh")])
+                return 0
+
+            sub = subprocess.run(
+                ["sh", "-eu", "gen"],
+                capture_output=True,
+                # Don't encode the directory information to the Clang output.
+                # Remove unneeded details (.ident) as well.
+                env=dict(
+                    os.environ,
+                    CCC_OVERRIDE_OPTIONS="#^-fno-ident",
+                    PWD="/proc/self/cwd",
+                ),
+            )
+            sys.stderr.write(sub.stderr.decode())
+            if sub.returncode != 0:
+                print("'gen' failed", file=sys.stderr)
+                return sub.returncode
+            if not sub.stdout:
+                print("stdout is empty; forgot -o - ?", file=sys.stderr)
+                return 1
+            content = sub.stdout.decode()
+
+    with open(path, "w") as f:
+        # Print lines up to '.endif'.
+        print("\n".join(prolog), file=f)
+        # Then print the stdout of 'gen'.
+        f.write(content)
+
+
+parser = argparse.ArgumentParser(
+    description="Generate test body using split-file and a custom script"
+)
+parser.add_argument("files", nargs="+")
+parser.add_argument(
+    "--shell", action="store_true", help="invoke shell instead of 'gen'"
+)
+args = parser.parse_args()
+for path in args.files:
+    retcode = process(args, path)
+    if retcode != 0:
+        sys.exit(retcode)
diff --git a/mlir/docs/Bufferization.md b/mlir/docs/Bufferization.md
index 808535822212..6a49bea9c68c 100644
--- a/mlir/docs/Bufferization.md
+++ b/mlir/docs/Bufferization.md
@@ -5,35 +5,45 @@
 ## Overview
 
 Bufferization in MLIR is the process of converting ops with `tensor` semantics
-to ops with `memref` semantics. MLIR provides an infrastructure that bufferizes
-an entire program in a single pass (*One-Shot Bufferize*). This infrastructure
-bufferizes all ops that implement the
-[`BufferizableOpInterface`](https://github.com/llvm/llvm-project/blob/17a68065c378da74805e4e1b9a5b78cc9f83e580/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td)
-can be bufferized.
-
-MLIR has an older bufferization infrastructure built around
-[dialect conversion](DialectConversion.md). Most dialect conversion
-bufferization patterns have been migrated to One-Shot Bufferize, but some
-functionality such as function boundary bufferization still depends on dialect
-conversion and its type converter. New projects should use One-Shot Bufferize,
-as the dialect conversion-based bufferization will eventually be deprecated.
-Moreover, One-Shot Bufferize results in better bufferization with fewer memory
-allocations and buffer copies. This documentation is mostly about One-Shot
-Bufferize, but also describes how to gradually migrate a project from dialect
-conversion-based bufferization to One-Shot Bufferize.
+to ops with `memref` semantics. There are multiple MLIR passes that are related
+to bufferization. These passes typically run as one of the last steps in a
+pass pipeline, right before lowering to `memref` ops to LLVM. That is because
+many transformations are easier or only supported in tensor land; e.g.,
+[tile/fuse/… on tensors first](https://llvm.discourse.group/t/rfc-linalg-on-tensors-update-and-comprehensive-bufferization-rfc/3373),
+then bufferize the remaining IR.
+
+![bufferization passes](/includes/img/bufferization_passes.svg)
+
+The most important bufferization pass is *One-Shot Bufferize*: This pass
+rewrites `tensor` IR to `memref` IR. There are additional helper passes that
+preprocess IR (e.g., so that IR can be bufferized more efficiently), perform
+buffer-level optimizations such as allocation hoisting, and
+[insert buffer deallocation ops](OwnershipBasedBufferDeallocation.md) so that
+the resulting `memref` IR has no memory leaks.
+
+## Deprecated Passes
+
+The old dialect conversion-based bufferization passes have been deprecated and
+should not be used anymore. Most of those passes have already been removed from
+MLIR. One-Shot Bufferize produces in better bufferization results with fewer
+memory allocations and buffer copies.
+
+The buffer deallocation pass has been deprecated in favor of the ownership-based
+buffer deallocation pipeline. The deprecated pass has some limitations that may
+cause memory leaks in the resulting IR.
 
 ## What is One-Shot Bufferize?
 
-One-Shot Bufferize is a new tensor bufferization pass designed for IR in
+One-Shot Bufferize is a tensor bufferization pass designed for IR in
 [destination-passing style](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/11/dps-fhpc17.pdf),
 and with aggressive in-place bufferization.
 
 One-Shot Bufferize is:
 
-*   **Monolithic**: A single MLIR pass does the entire work, whereas the
-    previous bufferization in MLIR was split across multiple passes residing in
-    different dialects. In One-Shot Bufferize, `BufferizableOpInterface`
-    implementations are spread across different dialects.
+*   **Monolithic**: A single MLIR pass does the entire work.
+
+*   **Extensible** via an op interface: All ops that implement
+    `BufferizableOpInterface` can be bufferized.
 
 *   A **whole-function at a time analysis**. In-place bufferization decisions
     are made by analyzing SSA use-def chains on tensors. Op interface
@@ -41,10 +51,7 @@ One-Shot Bufferize is:
     ops, but also helper methods for One-Shot Bufferize's analysis to query
     information about an op's bufferization/memory semantics.
 
-*   **Extensible** via an op interface: All ops that implement
-    `BufferizableOpInterface` can be bufferized.
-
-*   **2-Pass**: Bufferization is internally broken down into 2 steps: First,
+*   **2-Phase**: Bufferization is internally broken down into 2 steps: First,
     analyze the entire IR and make bufferization decisions. Then, bufferize
     (rewrite) the IR. The analysis has access to exact SSA use-def information.
     It incrementally builds alias and equivalence sets and does not rely on a
@@ -60,27 +67,17 @@ One-Shot Bufferize is:
     of `AnalysisState` that implements a small number virtual functions can
     serve as a custom analysis. It is even possible to run One-Shot Bufferize
     without any analysis (`AlwaysCopyAnalysisState`), in which case One-Shot
-    Bufferize behaves exactly like the old dialect conversion-based
-    bufferization (i.e., copy every buffer before writing to it).
+    Bufferize copies every buffer before writing to it.
 
-To reduce complexity, One-Shot Bufferize should be
-[run after other transformations](https://llvm.discourse.group/t/rfc-linalg-on-tensors-update-and-comprehensive-bufferization-rfc/3373),
-typically as one of the last steps right before lowering memref ops. Many
-transformations are easier in tensor land; e.g., tile/fuse/… on tensors first,
-then bufferize the remaining IR.
-
-From an architecture perspective, One-Shot Bufferize consists of
-[BufferizableOpInterface](https://github.com/llvm/llvm-project/blob/17a68065c378da74805e4e1b9a5b78cc9f83e580/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td)
-(and its implementations) and an
-[analysis](https://github.com/llvm/llvm-project/blob/ae2764e835a26bad9774803eca0a6530df2a3e2d/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h#L164)
-of tensor SSA values that decides if a buffer can be used directly or must be
-copied. The [bufferize] method of the op interface inspects analysis results and
-rewrites tensor ops into memref ops.
+Note that One-Shot Bufferize does not deallocate buffers. That is done by the
+[Ownership-based Buffer Deallocation passes](OwnershipBasedBufferDeallocation.md).
 
 ## Goals of Bufferization
 
-The high-level goal of every bufferization technique is to: 1. Use as little
-memory as possible. 2. Copy as little memory as possible.
+The high-level goal of every bufferization technique is to:
+
+1. Use as little memory as possible.
+2. Copy as little memory as possible.
 
 This implies reusing already allocated buffers when possible, turning
 bufferization into an algorithmically complex problem with similarities to
@@ -102,40 +99,46 @@ choosing an already existing buffer, we must be careful not to accidentally
 overwrite data that is still needed later in the program.
 
 To simplify this problem, One-Shot Bufferize was designed to take advantage of
-*destination-passing style*. This form exists in itself independently of
-bufferization and is tied to SSA semantics: many ops are “updating” part of
-their input SSA variable. For example the LLVM instruction
+*destination-passing style* (DPS). In MLIR, DPS op should implement the
+[`DestinationStyleOpInterface`](https://github.com/llvm/llvm-project/blob/792d437b56adfb3416daf8105942d4899fb82763/mlir/include/mlir/Interfaces/DestinationStyleOpInterface.td).
+DPS exists in itself independently of bufferization and is tied to SSA
+semantics: many ops are "updating" a part of their input SSA variables. For
+example the LLVM instruction
 [`insertelement`](https://llvm.org/docs/LangRef.html#insertelement-instruction)
 is inserting an element inside a vector. Since SSA values are immutable, the
 operation returns a copy of the input vector with the element inserted.
-Another example in MLIR is `linalg.generic`, which always has an extra `outs`
-operand which provides the initial values to update (for example when the
-operation is doing a reduction).
+Another example in MLIR is `linalg.generic` on tensors, which always has an
+extra `outs` operand for each result, which provides the initial values to
+update (for example when the operation is doing a reduction).
 
-This input is referred to as "destination" in the following (quotes are
+`outs` operands are referred to as "destinations" in the following (quotes are
 important as this operand isn't modified in place but copied) and comes into
 place in the context of bufferization as a possible "anchor" for the
 bufferization algorithm. This allows the user to shape the input in a form that
 guarantees close to optimal bufferization result when carefully choosing the
 SSA value used as "destination".
 
-For every tensor result, a "destination-passing" style op has a corresponding
-tensor operand. If there aren't any other uses of this tensor, the bufferization
-can alias it with the op result and perform the operation "in-place" by reusing
-the buffer allocated for this "destination" input.
+For every tensor result, a DPS op has a corresponding tensor operand. If there
+aren't any other conflicting uses of this tensor, the bufferization can alias
+it with the op result and perform the operation "in-place" by reusing the buffer
+allocated for this "destination" input.
 
-As an example, consider the following op: `%0 = tensor.insert %cst into
-%t[%idx] : tensor<?xf32>`
+As an example, consider the following op: `%r = tensor.insert %f into
+%t[%idx] : tensor<5xf32>`
+
+![tensor.insert example](/includes/img/bufferization_tensor_insert_dst.svg)
 
 `%t` is the "destination" in this example. When choosing a buffer for the result
-`%0`, denoted as `buffer(%0)`, One-Shot Bufferize considers only two options:
+`%r`, denoted as `buffer(%r)`, One-Shot Bufferize considers only two options:
 
-1.  `buffer(%0) = buffer(%t)` : alias the "destination" tensor with the
-    result and perform the operation in-place.
-2.  `buffer(%0)` is a newly allocated buffer.
+1.  `buffer(%r) = buffer(%t)`: store the result in the existing `buffer(%t)`.
+    Note that this is not always possible. E.g., if the old contents of
+    `buffer(%t)` are still needed. One-Shot Bufferize's main task is to detect
+    such cases and fall back to the second option when necessary.
+2.  `buffer(%r)` is a newly allocated buffer.
 
 There may be other buffers in the same function that could potentially be used
-for `buffer(%0)`, but those are not considered by One-Shot Bufferize to keep the
+for `buffer(%r)`, but those are not considered by One-Shot Bufferize to keep the
 bufferization simple. One-Shot Bufferize could be extended to consider such
 buffers in the future to achieve a better quality of bufferization.
 
@@ -151,7 +154,7 @@ memory allocation. E.g.:
 ```
 
 The result of `tensor.generate` does not have a "destination" operand, so
-bufferization allocates a new buffer. This could be avoided by choosing an
+bufferization allocates a new buffer. This could be avoided by instead using an
 op such as `linalg.generic`, which can express the same computation with a
 "destination" operand, as specified behind outputs (`outs`):
 
@@ -198,12 +201,61 @@ e.g.:
 ```mlir
 %0 = "my_dialect.some_op"(%t) : (tensor<?xf32>) -> (tensor<?xf32>)
 %1 = "my_dialect.another_op"(%0) : (tensor<?xf32>) -> (tensor<?xf32>)
+
+// "yet_another_op" likely needs to read the data of %0, so "another_op" cannot
+// in-place write to buffer(%0).
 %2 = "my_dialect.yet_another_op"(%0) : (tensor<?xf32>) -> (tensor<?xf32>)
 ```
 
-One-Shot Bufferize has debug flags (`test-analysis-only print-conflicts`) that
-print the results of the analysis and explain to the user why buffer copies were
-inserted.
+## Tensor / MemRef Boundary
+
+The bufferization dialect provides a few helper ops to connect tensor IR (that
+should be bufferized) with existing buffers (that may be allocated/provided by
+a different runtime/library/etc.).
+
+`bufferization.to_memref %t` returns the future buffer of a tensor SSA value.
+`bufferization.to_tensor %m` returns a tensor SSA value for a given MemRef
+buffer. `bufferization.materialize_in_destination` indicates that a tensor value
+should materialize in a certain buffer.
+
+Consider the following example, where a TOSA matmul result should materialize in
+an existing buffer `%C`:
+
+```mlir
+// Batched TOSA matrix multiplication. %A and %B are the
+// inputs, %C is the output.
+func.func @test_matmul(%A: memref<1x17x19xf32>,
+                       %B: memref<1x19x29xf32>,
+                       %C: memref<1x17x29xf32>) {
+
+  %A_tensor = bufferization.to_tensor %A restrict : memref<1x17x19xf32>
+  %B_tensor = bufferization.to_tensor %B restrict : memref<1x19x29xf32>
+
+  %0 = tosa.matmul %A_tensor, %B_tensor
+      : (tensor<1x17x19xf32>, tensor<1x19x29xf32>) ->
+         tensor<1x17x29xf32>
+
+  bufferization.materialize_in_destination
+    %0 in restrict writable %C
+      : (tensor<1x17x29xf32>, memref<1x17x29xf32>) -> ()
+
+  return
+}
+```
+
+Note that all bufferization ops in this example have the `restrict` unit
+attribute set. This attribute is similar to the C restrict keyword and indicates
+that there is no other `to_tensor` or `materialize_in_destination` op with
+the same or an aliasing MemRef operand. Only such
+`to_tensor`/`materialize_in_destination` ops are supported. The `restrict`
+attribute gives strong aliasing guarantees to the bufferization analysis and
+allows us to look only at the tensor IR in a program. (Ops that do not operate
+on tensors are ignored by the One-Shot Bufferize.)
+
+Also note that `tosa.matmul` cannot be bufferized as is: there is no
+`BufferizableOpInterface` implementation for that op. However, the op can be
+lowered to a combination of `tensor.empty` and `linalg.matmul`, which can be
+bufferized.
 
 ## Using One-Shot Bufferize
 
@@ -221,17 +273,14 @@ By default, One-Shot Bufferize fails when it encounters an op with tensor
 semantics (i.e., tensor result or tensor operand) that is not bufferizable
 (i.e., does not implement `BufferizableOpInterface`). This can be avoided with
 `allow-unknown-ops`. In that case, One-Shot Bufferize inserts
-`to_memref`/`to_tensor` ops around the bufferization boundary. These ops are
-named versions of `unrealized_conversion_cast`. Note that One-Shot Bufferize's
-analysis can currently not analyze these ops, so input IR with such ops may fail
-bufferization. Therefore, running One-Shot Bufferize multiple times in a
-sequence is also not supported at the moment.
+`to_memref`/`to_tensor` ops around the bufferization boundary.
 
 One-Shot Bufferize can be configured to bufferize only ops from a set of
 dialects with `dialect-filter`. This can be useful for gradually migrating from
 dialect conversion-based bufferization to One-Shot Bufferize. One-Shot Bufferize
 must run first in such a case, because dialect conversion-based bufferization
-generates `to_tensor`/`to_memref` ops which One-Shot Bufferize cannot analyze.
+generates `to_tensor` ops without the `restrict` unit attribute, which One-Shot
+Bufferize cannot analyze.
 
 One-Shot Bufferize can also be called programmatically with
 [`bufferization::runOneShotBufferize`](https://github.com/llvm/llvm-project/blob/ae2764e835a26bad9774803eca0a6530df2a3e2d/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h#L167).
@@ -240,6 +289,14 @@ Alternatively,
 skips the analysis and inserts a copy on every buffer write, just like the
 dialect conversion-based bufferization.
 
+By default, function boundaries are not bufferized. This is because there are
+currently limitations around function graph bufferization: recursive
+calls are not supported. As long as there are no recursive calls, function
+boundary bufferization can be enabled with `bufferize-function-boundaries`. Each
+tensor function argument and tensor function result is then turned into a
+memref. The layout map of the memref type can be controlled with
+`function-boundary-type-conversion`.
+
 ## Memory Layouts
 
 One-Shot Bufferize bufferizes ops from top to bottom. This works well when all
@@ -319,6 +376,11 @@ To get a better intuition of the interface methods, we invite users to take a
 look at existing implementations in MLIR, e.g., the implementation of
 `tensor.insert` or `tensor.extract`.
 
+Interface implementations of DPS ops (that implement
+`DestinationStyleOpInterface`) can derive from
+`DstBufferizableOpInterfaceExternalModel`, which provides all necessary
+method implementations except for `bufferize`.
+
 ## Debugging Buffer Copies
 
 To get a better understanding of why One-Shot Bufferize introduced a buffer
@@ -338,14 +400,90 @@ There are two reasons why a buffer copy may be inserted.
 In the first case, `print-conflicts` illustrates the conflict in the form of a
 ("read", "conflicting write", "last write") tuple.
 
-## Understanding the SSA Use-Def Chain Analysis
+A RaW conflict consists of three parts, in the following order according to
+op dominance:
+
+1. **Definition:** A tensor `%t` is defined.
+2. **Conflicting Write:** An operation writes to `buffer(%t)`.
+3. **Read:** An operation reads `%t`.
+
+When such a RaW conflict is detected during the analysis phase, One-Shot
+Bufferize will insert a buffer copy for the conflicting write.
+
+**Example**
+
+```mlir
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only print-conflicts"
+func.func @test(%arg0: f32, %arg1: f32, %arg2: index, %arg3: index) -> (f32, tensor<3xf32>) {
+  // Create a new tensor with [%arg0, %arg0, %arg0].
+  %0 = tensor.from_elements %arg0, %arg0, %arg0 : tensor<3xf32>
+
+  // Insert something into the new tensor.
+  %1 = tensor.insert %arg1 into %0[%arg2] : tensor<3xf32>
+
+  // Read from the old tensor.
+  %r = tensor.extract %0[%arg3] : tensor<3xf32>
+
+  // Return the extracted value and the result of the insertion.
+  func.return %r, %1 : f32, tensor<3xf32>
+}
+```
+
+The output IR is as follows:
+
+```mlir
+func.func @test(%arg0: f32, %arg1: f32, %arg2: index, %arg3: index) -> (f32, tensor<3xf32>) {
+  %from_elements = tensor.from_elements %arg0, %arg0, %arg0 {"C_0[DEF: result 0]"} : tensor<3xf32>
+  %inserted = tensor.insert %arg1 into %from_elements[%arg2] {"C_0[CONFL-WRITE: 1]", __inplace_operands_attr__ = ["none", "false", "none"]} : tensor<3xf32>
+  %extracted = tensor.extract %from_elements[%arg3] {"C_0[READ: 0]", __inplace_operands_attr__ = ["true", "none"]} : tensor<3xf32>
+  return {__inplace_operands_attr__ = ["none", "true"]} %extracted, %inserted : f32, tensor<3xf32>
+}
+```
+
+Note that the IR was not bufferized. It was merely annotated with the results
+of the bufferization analysis. Every operation with tensor semantics has a
+`__inplace_operands_attr__` attribute with one value per operand. If an operand
+is not a tensor, the respective value is `none`. Otherwise, if the operand was
+decided to be bufferized in-place, the value is `true`. A value of `false`
+indicates a buffer copy. In the above example, a buffer copy would be inserted
+for `tensor.insert`, so that it does not overwrite `buffer(%from_elements)`,
+which is still needed for `tensor.extract`.
+
+For each RaW (there is only one in the example), three `C_i` attributes were
+added:
+
+* `C_0[DEF: result 0]`: A tensor is defined: 0-th result of
+  `tensor.from_elements`.
+* `C_0[CONFL-WRITE: 1]`: An operation (if bufferized in-place) would write into
+  the future buffer of the defined tensor: 1-st operand of `tensor.insert`.
+* `C_0[READ: 0]`: An operation reads the tensor definition: 0-th operand of
+  `tensor.extract`.
+
+The fully bufferized IR (with the inserted buffer copy) is as follows:
+
+```mlir
+func.func @test(%arg0: f32, %arg1: f32, %arg2: index, %arg3: index) -> (f32, memref<3xf32>) {
+  %c2 = arith.constant 2 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %alloc = memref.alloc() {alignment = 64 : i64} : memref<3xf32>
+  memref.store %arg0, %alloc[%c0] : memref<3xf32>
+  memref.store %arg0, %alloc[%c1] : memref<3xf32>
+  memref.store %arg0, %alloc[%c2] : memref<3xf32>
+  %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<3xf32>
+  memref.copy %alloc, %alloc_0 : memref<3xf32> to memref<3xf32>
+  memref.store %arg1, %alloc_0[%arg2] : memref<3xf32>
+  %0 = memref.load %alloc[%arg3] : memref<3xf32>
+  return %0, %alloc_0 : f32, memref<3xf32>
+}
+```
 
 To get a better understanding of the SSA Use-Def Chain Analysis and the RaW
-conflict detection algorithm, we invite interested users to read the
-[design document](https://discourse.llvm.org/uploads/short-url/5kckJ3DftYwQokG252teFgw3sYa.pdf)
-and watch the corresponding [ODM talk](https://youtu.be/TXEo59CYS9A)
-([slides](https://mlir.llvm.org/OpenMeetings/2022-01-13-One-Shot-Bufferization.pdf)).
-can be used to bufferize a program in a single pass, as long as each op
+conflict detection algorithm, interested users may want to refer to:
+
+* [Original design document](https://discourse.llvm.org/uploads/short-url/5kckJ3DftYwQokG252teFgw3sYa.pdf)
+* [ODM talk](https://youtu.be/TXEo59CYS9A), ([slides](https://mlir.llvm.org/OpenMeetings/2022-01-13-One-Shot-Bufferization.pdf)).
+* [LLVM Dev Meeting 2023 tutorial slides](https://m-sp.org/downloads/llvm_dev_2023.pdf)
 
 ## Migrating from Dialect Conversion-based Bufferization
 
@@ -356,20 +494,6 @@ One-Shot Bufferize must run first because it cannot analyze those boundary ops.
 To update existing code step-by-step, it may be useful to specify a dialect
 filter for One-Shot Bufferize, so that dialects can be switched over one-by-one.
 
-## Bufferization Function Graphs
-
-One-Shot Bufferize does currently not support function graph bufferization.
-I.e., `CallOp`, `ReturnOp` and function bbArgs are not bufferizable. Users can
-run the existing `--func-bufferize` bufferization pass after One-Shot Bufferize.
-
-Alternatively, users can try
-[`ModuleBufferization`](https://github.com/llvm/llvm-project/blob/ae2764e835a26bad9774803eca0a6530df2a3e2d/mlir/include/mlir/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.h#L31),
-which is an extension of One-Shot Bufferize. This bufferization is still under
-development and does not support arbitrary IR. In essence, returning a tensor
-from a function is not supported, unless it is equivalent to a function bbArg.
-In that case, the corresponding return value can simply be dropped during
-bufferization.
-
 ## Dialect Conversion-based Bufferization
 
 Disclaimer: Most dialect conversion-based bufferization has been migrated to
diff --git a/mlir/docs/DefiningDialects/Operations.md b/mlir/docs/DefiningDialects/Operations.md
index 79a0cc55f138..706abce55ce9 100644
--- a/mlir/docs/DefiningDialects/Operations.md
+++ b/mlir/docs/DefiningDialects/Operations.md
@@ -640,10 +640,11 @@ The available directives are as follows:
 
 *   `attr-dict`
 
-    -   Represents the attribute dictionary of the operation. Any inherent 
-    -   attributes that are not used elsewhere in the format are printed as
-    -   part of the attribute dictionary unless a `prop-dict` is present.
-    -   Discardable attributes are always part of the `attr-dict`.  
+    -   Represents the attribute dictionary of the operation.
+    -   Any inherent attributes that are not used elsewhere in the format are
+        printed as part of the attribute dictionary unless a `prop-dict` is
+        present.
+    -   Discardable attributes are always part of the `attr-dict`.
 
 *   `attr-dict-with-keyword`
 
@@ -654,23 +655,23 @@ The available directives are as follows:
 
     -   Represents the properties of the operation converted to a dictionary.
     -   Any property or inherent attribute that are not used elsewhere in the
-    -   format are parsed and printed as part of this dictionary.
+        format are parsed and printed as part of this dictionary.
     -   If present, the `attr-dict` will not contain any inherent attributes.
 
-*   `custom` < UserDirective > ( Params )
+*   `custom < UserDirective > ( Params )`
 
     -   Represents a custom directive implemented by the user in C++.
     -   See the [Custom Directives](#custom-directives) section below for more
         details.
 
-*   `functional-type` ( inputs , outputs )
+*   `functional-type ( inputs , outputs )`
 
     -   Formats the `inputs` and `outputs` arguments as a
         [function type](../Dialects/Builtin.md/#functiontype).
     -   The constraints on `inputs` and `outputs` are the same as the `input` of
         the `type` directive.
 
-*   `oilist` ( \`keyword\` elements | \`otherKeyword\` elements ...)
+*   ``oilist ( `keyword` elements | `otherKeyword` elements ...)``
 
     -   Represents an optional order-independent list of clauses. Each clause
         has a keyword and corresponding assembly format.
@@ -682,7 +683,7 @@ The available directives are as follows:
 
     -   Represents all of the operands of an operation.
 
-*   `ref` ( input )
+*   `ref ( input )`
 
     -   Represents a reference to the a variable or directive, that must have
         already been resolved, that may be used as a parameter to a `custom`
@@ -703,13 +704,13 @@ The available directives are as follows:
 
     -   Represents all of the successors of an operation.
 
-*   `type` ( input )
+*   `type ( input )`
 
     -   Represents the type of the given input.
     -   `input` must be either an operand or result [variable](#variables), the
         `operands` directive, or the `results` directive.
 
-*   `qualified` ( type_or_attribute )
+*   `qualified ( type_or_attribute )`
 
     -   Wraps a `type` directive or an attribute parameter.
     -   Used to force printing the type or attribute prefixed with its dialect
@@ -1700,11 +1701,11 @@ To allow more convenient syntax, helper classes exist for TableGen classes
 which are commonly used as anonymous definitions. These currently include:
 
 * `DeprecatedOpBuilder`: Can be used in place of `OpBuilder` with the same
-  arguments except taking the reason as first argument, e.g. 
+  arguments except taking the reason as first argument, e.g.
   `DeprecatedOpBuilder<"use 'build' with foo instead", (ins "int":$bar)>`
 
-Note: Support for the `CppDeprecated` mechanism has to be implemented by 
-every code generator separately. 
+Note: Support for the `CppDeprecated` mechanism has to be implemented by
+every code generator separately.
 
 ### Requirements and existing mechanisms analysis
 
diff --git a/mlir/docs/includes/img/bufferization_passes.svg b/mlir/docs/includes/img/bufferization_passes.svg
new file mode 100644
index 000000000000..835726569227
--- /dev/null
+++ b/mlir/docs/includes/img/bufferization_passes.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 480.0 540.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l480.0 0l0 540.0l-480.0 0l0 -540.0z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#ffffff" d="m0 0l480.0 0l0 540.0l-480.0 0z" fill-rule="evenodd"/><path fill="#4285f4" d="m163.16655 122.94077l315.59058 0l0 29.039375l-315.59058 0z" fill-rule="evenodd"/><path stroke="#4285f4" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m163.16655 122.94077l315.59058 0l0 29.039375l-315.59058 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m244.41011 131.65561l153.10344 0l0 17.16098l-153.10344 0l0 -17.16098z" fill-rule="nonzero"/><path fill="#ffffff" d="m246.20699 139.41545l0 -1.546875l4.453125 0l0 1.546875l-4.453125 0zm13.80809 -0.5625q0 0.859375 -0.25 1.578125q-0.234375 0.703125 -0.703125 1.21875q-0.453125 0.5 -1.125 0.78125q-0.65625 0.28125 -1.515625 0.28125q-0.79689026 0 -1.4375153 -0.234375q-0.640625 -0.25 -1.09375 -0.71875q-0.4375 -0.46875 -0.671875 -1.171875q-0.234375 -0.703125 -0.234375 -1.640625q0 -0.859375 0.25 -1.5625q0.25 -0.71875 0.703125 -1.21875q0.46875 -0.5 1.125 -0.765625q0.671875 -0.28125 1.5000153 -0.28125q0.828125 0 1.453125 0.234375q0.640625 0.234375 1.078125 0.71875q0.453125 0.46875 0.6875 1.171875q0.234375 0.6875 0.234375 1.609375zm-1.828125 0.03125q0 -1.125 -0.421875 -1.6875q-0.421875 -0.578125 -1.25 -0.578125q-0.46875 0 -0.79689026 0.1875q-0.3125 0.171875 -0.515625 0.484375q-0.203125 0.3125 -0.3125 0.734375q-0.09375 0.40625 -0.09375 0.875q0 1.140625 0.453125 1.71875q0.46875 0.578125 1.2656403 0.578125q0.4375 0 0.75 -0.171875q0.328125 -0.1875 0.53125 -0.484375q0.203125 -0.3125 0.296875 -0.734375q0.09375 -0.421875 0.09375 -0.921875zm7.66745 3.65625l0 -4.734375q0 -1.203125 -0.890625 -1.203125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 4.59375l-1.734375 0l0 -7.265625l1.5 0l0.046875 1.0625q0.21875 -0.265625 0.453125 -0.5q0.25 -0.234375 0.53125 -0.390625q0.28125 -0.15625 0.59375 -0.234375q0.328125 -0.09375 0.734375 -0.09375q0.546875 0 0.953125 0.1875q0.421875 0.171875 0.703125 0.515625q0.28125 0.328125 0.421875 0.796875q0.140625 0.46875 0.140625 1.046875l0 4.875l-1.75 0zm10.058075 -4.1875q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0zm4.308075 1.3125l0 -1.546875l4.453125 0l0 1.546875l-4.453125 0zm13.245575 0.984375q0 0.625 -0.28125 1.0625q-0.265625 0.4375 -0.71875 0.71875q-0.453125 0.28125 -1.03125 0.40625q-0.578125 0.125 -1.171875 0.125q-0.796875 0 -1.453125 -0.078125q-0.640625 -0.078125 -1.21875 -0.21875l0 -1.578125q0.6875 0.28125 1.34375 0.40625q0.671875 0.125 1.265625 0.125q0.6875 0 1.03125 -0.21875q0.34375 -0.21875 0.34375 -0.5625q0 -0.171875 -0.078125 -0.296875q-0.078125 -0.140625 -0.28125 -0.265625q-0.1875 -0.125 -0.546875 -0.25q-0.359375 -0.140625 -0.9375 -0.3125q-0.53125 -0.15625 -0.9375 -0.34375q-0.40625 -0.1875 -0.671875 -0.4375q-0.265625 -0.265625 -0.40625 -0.59375q-0.125 -0.34375 -0.125 -0.8125q0 -0.4375 0.203125 -0.828125q0.203125 -0.390625 0.59375 -0.6875q0.40625 -0.296875 1.0 -0.46875q0.59375 -0.171875 1.390625 -0.171875q0.6875 0 1.21875 0.078125q0.53125 0.0625 0.9375 0.140625l0 1.421875q-0.625 -0.1875 -1.171875 -0.265625q-0.546875 -0.09375 -1.09375 -0.09375q-0.53125 0 -0.859375 0.203125q-0.328125 0.1875 -0.328125 0.53125q0 0.15625 0.0625 0.296875q0.078125 0.125 0.265625 0.25q0.1875 0.109375 0.515625 0.25q0.34375 0.125 0.90625 0.28125q0.625 0.1875 1.046875 0.390625q0.4375 0.203125 0.703125 0.46875q0.265625 0.25 0.375 0.578125q0.109375 0.328125 0.109375 0.75zm6.401825 2.140625l0 -4.734375q0 -1.203125 -0.890625 -1.203125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 4.59375l-1.734375 0l0 -10.109375l1.734375 0l0 2.484375l-0.078125 1.3125q0.21875 -0.25 0.4375 -0.453125q0.234375 -0.21875 0.5 -0.359375q0.265625 -0.15625 0.578125 -0.21875q0.3125 -0.078125 0.6875 -0.078125q0.546875 0 0.953125 0.1875q0.421875 0.171875 0.703125 0.515625q0.28125 0.328125 0.421875 0.796875q0.140625 0.46875 0.140625 1.046875l0 4.875l-1.75 0zm10.276825 -3.6875q0 0.859375 -0.25 1.578125q-0.234375 0.703125 -0.703125 1.21875q-0.453125 0.5 -1.125 0.78125q-0.65625 0.28125 -1.515625 0.28125q-0.796875 0 -1.4375 -0.234375q-0.640625 -0.25 -1.09375 -0.71875q-0.4375 -0.46875 -0.671875 -1.171875q-0.234375 -0.703125 -0.234375 -1.640625q0 -0.859375 0.25 -1.5625q0.25 -0.71875 0.703125 -1.21875q0.46875 -0.5 1.125 -0.765625q0.671875 -0.28125 1.5 -0.28125q0.828125 0 1.453125 0.234375q0.640625 0.234375 1.078125 0.71875q0.453125 0.46875 0.6875 1.171875q0.234375 0.6875 0.234375 1.609375zm-1.828125 0.03125q0 -1.125 -0.421875 -1.6875q-0.421875 -0.578125 -1.25 -0.578125q-0.46875 0 -0.796875 0.1875q-0.3125 0.171875 -0.515625 0.484375q-0.203125 0.3125 -0.3125 0.734375q-0.09375 0.40625 -0.09375 0.875q0 1.140625 0.453125 1.71875q0.46875 0.578125 1.265625 0.578125q0.4375 0 0.75 -0.171875q0.328125 -0.1875 0.53125 -0.484375q0.203125 -0.3125 0.296875 -0.734375q0.09375 -0.421875 0.09375 -0.921875zm9.3237 3.578125q-0.46875 0.109375 -0.953125 0.171875q-0.46875 0.078125 -0.90625 0.078125q-0.703125 0 -1.234375 -0.15625q-0.515625 -0.15625 -0.859375 -0.453125q-0.328125 -0.3125 -0.5 -0.78125q-0.15625 -0.484375 -0.15625 -1.140625l0 -3.546875l-1.953125 0l0 -1.359375l1.953125 0l0 -1.859375l1.796875 -0.46875l0 2.328125l2.8125 0l0 1.359375l-2.8125 0l0 3.421875q0 0.609375 0.28125 0.9375q0.28125 0.3125 0.953125 0.3125q0.4375 0 0.84375 -0.0625q0.40625 -0.078125 0.734375 -0.171875l0 1.390625zm2.870575 -3.046875l0 -1.546875l4.453125 0l0 1.546875l-4.453125 0zm13.79245 -0.640625q0 1.0 -0.28125 1.734375q-0.28125 0.734375 -0.78125 1.21875q-0.5 0.46875 -1.1875 0.703125q-0.6875 0.234375 -1.5 0.234375q-0.796875 0 -1.484375 -0.125q-0.671875 -0.109375 -1.296875 -0.3125l0 -9.796875l1.734375 0l0 2.375l-0.0625 1.421875q0.390625 -0.5 0.90625 -0.796875q0.53125 -0.3125 1.28125 -0.3125q0.640625 0 1.140625 0.265625q0.5 0.25 0.828125 0.71875q0.34375 0.46875 0.515625 1.15625q0.1875 0.671875 0.1875 1.515625zm-1.828125 0.078125q0 -0.609375 -0.09375 -1.03125q-0.078125 -0.421875 -0.25 -0.6875q-0.15625 -0.28125 -0.390625 -0.40625q-0.21875 -0.125 -0.53125 -0.125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 3.171875q0.21875 0.078125 0.515625 0.125q0.3125 0.046875 0.625 0.046875q0.40625 0 0.75 -0.171875q0.34375 -0.171875 0.578125 -0.484375q0.25 -0.3125 0.375 -0.765625q0.125 -0.453125 0.125 -1.015625zm5.089325 -3.578125l0 4.734375q0 0.59375 0.21875 0.90625q0.21875 0.296875 0.671875 0.296875q0.4375 0 0.828125 -0.359375q0.40625 -0.359375 0.859375 -0.984375l0 -4.59375l1.75 0l0 7.265625l-1.515625 0l-0.03125 -1.078125q-0.234375 0.28125 -0.46875 0.515625q-0.234375 0.21875 -0.515625 0.390625q-0.265625 0.15625 -0.59375 0.25q-0.328125 0.09375 -0.71875 0.09375q-0.5625 0 -0.984375 -0.1875q-0.421875 -0.1875 -0.703125 -0.515625q-0.265625 -0.34375 -0.40625 -0.8125q-0.140625 -0.46875 -0.140625 -1.046875l0 -4.875l1.75 0zm12.839325 -1.359375q-0.3125 -0.109375 -0.75 -0.171875q-0.421875 -0.0625 -0.875 -0.0625q-0.3125 0 -0.578125 0.078125q-0.265625 0.078125 -0.453125 0.265625q-0.1875 0.171875 -0.3125 0.453125q-0.109375 0.265625 -0.109375 0.65625l0 1.1875l2.859375 0l0 1.359375l-2.859375 0l0 4.859375l-1.75 0l0 -4.859375l-2.109375 0l0 -1.359375l2.109375 0l0 -1.109375q0 -0.765625 0.21875 -1.3125q0.234375 -0.5625 0.640625 -0.921875q0.40625 -0.359375 0.984375 -0.515625q0.578125 -0.171875 1.296875 -0.171875q0.484375 0 0.90625 0.0625q0.421875 0.0625 0.78125 0.140625l0 1.421875zm-7.515625 1.359375l0 0zm15.5737 -1.359375q-0.3125 -0.109375 -0.75 -0.171875q-0.421875 -0.0625 -0.875 -0.0625q-0.3125 0 -0.578125 0.078125q-0.265625 0.078125 -0.453125 0.265625q-0.1875 0.171875 -0.3125 0.453125q-0.109375 0.265625 -0.109375 0.65625l0 1.1875l2.859375 0l0 1.359375l-2.859375 0l0 4.859375l-1.75 0l0 -4.859375l-2.109375 0l0 -1.359375l2.109375 0l0 -1.109375q0 -0.765625 0.21875 -1.3125q0.234375 -0.5625 0.640625 -0.921875q0.40625 -0.359375 0.984375 -0.515625q0.578125 -0.171875 1.296875 -0.171875q0.484375 0 0.90625 0.0625q0.421875 0.0625 0.78125 0.140625l0 1.421875zm-7.515625 1.359375l0 0zm15.3862 3.078125q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0zm8.183075 0q0.015625 -0.421875 -0.046875 -0.703125q-0.046875 -0.28125 -0.171875 -0.46875q-0.109375 -0.1875 -0.28125 -0.265625q-0.171875 -0.09375 -0.390625 -0.09375q-0.390625 0 -0.796875 0.328125q-0.40625 0.3125 -0.90625 1.046875l0 4.59375l-1.796875 0l0 -7.265625l1.59375 0l0.0625 1.046875q0.171875 -0.265625 0.390625 -0.484375q0.234375 -0.234375 0.5 -0.390625q0.28125 -0.15625 0.609375 -0.234375q0.34375 -0.09375 0.75 -0.09375q0.546875 0 0.96875 0.1875q0.4375 0.1875 0.71875 0.5625q0.296875 0.359375 0.4375 0.921875q0.140625 0.5625 0.109375 1.3125l-1.75 0zm7.66745 -4.828125q0 0.234375 -0.09375 0.453125q-0.078125 0.203125 -0.234375 0.359375q-0.15625 0.15625 -0.375 0.25q-0.203125 0.078125 -0.4375 0.078125q-0.25 0 -0.46875 -0.078125q-0.203125 -0.09375 -0.359375 -0.25q-0.15625 -0.15625 -0.25 -0.359375q-0.078125 -0.21875 -0.078125 -0.453125q0 -0.234375 0.078125 -0.4375q0.09375 -0.203125 0.25 -0.359375q0.15625 -0.15625 0.359375 -0.25q0.21875 -0.09375 0.46875 -0.09375q0.234375 0 0.4375 0.09375q0.21875 0.09375 0.375 0.25q0.15625 0.15625 0.234375 0.359375q0.09375 0.203125 0.09375 0.4375zm-1.9375 3.359375l-2.0625 0l0 -1.359375l3.84375 0l0 5.90625l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -4.546875zm5.79245 5.90625l0 -1.21875l3.546875 -4.5625l-3.46875 0l0 -1.484375l5.78125 0l0 1.21875l-3.484375 4.515625l3.53125 0l0 1.53125l-5.90625 0zm14.308075 -4.1875q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m397.51355 131.65561l0 0l0 17.16098l0 0l0 -17.16098z" fill-rule="nonzero"/><path fill="#ffffff" d="m244.41011 143.49379l153.10344 0l0 1.3200073l-153.10344 0l0 -1.3200073z" fill-rule="nonzero"/><path fill="#34a853" d="m163.16655 174.66765l315.59058 0l0 29.039368l-315.59058 0z" fill-rule="evenodd"/><path stroke="#34a853" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m163.16655 174.66765l315.59058 0l0 29.039368l-315.59058 0z" fill-rule="evenodd"/><path fill="#ffffff" d="m262.6691 190.95483l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm9.16745 -0.40625q0 0.890625 -0.25 1.59375q-0.234375 0.703125 -0.703125 1.203125q-0.453125 0.484375 -1.109375 0.75q-0.640625 0.25 -1.453125 0.25q-0.640625 0 -1.3125 -0.109375q-0.65625 -0.125 -1.3125 -0.390625l0 -9.6875l1.25 0l0 2.78125l-0.0625 1.328125q0.546875 -0.71875 1.15625 -1.015625q0.609375 -0.296875 1.3125 -0.296875q0.609375 0 1.078125 0.265625q0.46875 0.25 0.78125 0.71875q0.3125 0.46875 0.46875 1.140625q0.15625 0.65625 0.15625 1.46875zm-1.265625 0.046875q0 -0.5625 -0.09375 -1.03125q-0.078125 -0.46875 -0.265625 -0.796875q-0.171875 -0.34375 -0.453125 -0.53125q-0.265625 -0.1875 -0.65625 -0.1875q-0.234375 0 -0.484375 0.078125q-0.234375 0.0625 -0.5 0.25q-0.265625 0.171875 -0.5625 0.46875q-0.28125 0.28125 -0.609375 0.703125l0 3.484375q0.359375 0.140625 0.734375 0.234375q0.390625 0.078125 0.734375 0.078125q0.421875 0 0.8125 -0.125q0.390625 -0.140625 0.6875 -0.453125q0.296875 -0.328125 0.46875 -0.859375q0.1875 -0.53125 0.1875 -1.3125zm8.964325 3.671875l-1.125 0l-0.03125 -1.15625q-0.328125 0.375 -0.625 0.625q-0.28125 0.234375 -0.578125 0.390625q-0.28125 0.140625 -0.578125 0.203125q-0.28125 0.0625 -0.59375 0.0625q-1.109375 0 -1.6875 -0.640625q-0.5625 -0.65625 -0.5625 -1.96875l0 -4.703125l1.25 0l0 4.59375q0 1.65625 1.25 1.65625q0.21875 0 0.4375 -0.0625q0.21875 -0.078125 0.453125 -0.234375q0.25 -0.171875 0.515625 -0.453125q0.28125 -0.296875 0.625 -0.734375l0 -4.765625l1.25 0l0 7.1875zm8.651825 -8.953125q-0.96875 -0.203125 -1.6875 -0.203125q-1.671875 0 -1.671875 1.75l0 1.265625l3.140625 0l0 1.03125l-3.140625 0l0 5.109375l-1.265625 0l0 -5.109375l-2.3125 0l0 -1.03125l2.3125 0l0 -1.1875q0 -2.875 2.984375 -2.875q0.75 0 1.640625 0.171875l0 1.078125zm-7.515625 1.765625l0 0zm15.5737 -1.765625q-0.96875 -0.203125 -1.6875 -0.203125q-1.671875 0 -1.671875 1.75l0 1.265625l3.140625 0l0 1.03125l-3.140625 0l0 5.109375l-1.265625 0l0 -5.109375l-2.3125 0l0 -1.03125l2.3125 0l0 -1.1875q0 -2.875 2.984375 -2.875q0.75 0 1.640625 0.171875l0 1.078125zm-7.515625 1.765625l0 0zm15.276825 3.21875q0 0.265625 -0.015625 0.453125q0 0.171875 -0.015625 0.328125l-5.046875 0q0 1.09375 0.609375 1.6875q0.625 0.59375 1.78125 0.59375q0.3125 0 0.625 -0.015625q0.3125 -0.03125 0.609375 -0.078125q0.296875 -0.046875 0.5625 -0.09375q0.265625 -0.0625 0.5 -0.125l0 1.03125q-0.515625 0.140625 -1.15625 0.21875q-0.640625 0.09375 -1.328125 0.09375q-0.921875 0 -1.59375 -0.25q-0.65625 -0.25 -1.078125 -0.71875q-0.421875 -0.484375 -0.625 -1.171875q-0.203125 -0.6875 -0.203125 -1.5625q0 -0.765625 0.21875 -1.4375q0.21875 -0.671875 0.625 -1.1875q0.421875 -0.515625 1.03125 -0.8125q0.609375 -0.296875 1.375 -0.296875q0.765625 0 1.34375 0.234375q0.578125 0.234375 0.96875 0.671875q0.40625 0.4375 0.609375 1.0625q0.203125 0.609375 0.203125 1.375zm-1.296875 -0.1875q0.015625 -0.46875 -0.109375 -0.859375q-0.109375 -0.40625 -0.34375 -0.6875q-0.234375 -0.296875 -0.59375 -0.453125q-0.359375 -0.15625 -0.828125 -0.15625q-0.40625 0 -0.75 0.15625q-0.328125 0.15625 -0.578125 0.4375q-0.25 0.28125 -0.40625 0.6875q-0.140625 0.40625 -0.171875 0.875l3.78125 0zm3.5737 -3.03125l1.140625 0l0.03125 1.328125q0.640625 -0.765625 1.25 -1.109375q0.625 -0.34375 1.25 -0.34375q1.125 0 1.703125 0.71875q0.578125 0.71875 0.53125 2.15625l-1.265625 0q0.03125 -0.953125 -0.265625 -1.375q-0.296875 -0.421875 -0.875 -0.421875q-0.25 0 -0.515625 0.09375q-0.25 0.078125 -0.515625 0.28125q-0.265625 0.1875 -0.5625 0.5q-0.296875 0.3125 -0.640625 0.75l0 4.609375l-1.265625 0l0 -7.1875zm12.79245 3.875l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm8.808075 3.3125l-1.25 0l0 -4.59375q0 -0.828125 -0.3125 -1.234375q-0.3125 -0.40625 -0.890625 -0.40625q-0.25 0 -0.46875 0.0625q-0.21875 0.0625 -0.453125 0.234375q-0.234375 0.171875 -0.515625 0.453125q-0.28125 0.28125 -0.640625 0.71875l0 4.765625l-1.25 0l0 -10.109375l1.25 0l0 2.921875l-0.046875 1.140625q0.296875 -0.359375 0.578125 -0.59375q0.28125 -0.25 0.5625 -0.390625q0.28125 -0.15625 0.578125 -0.21875q0.296875 -0.0625 0.609375 -0.0625q1.078125 0 1.65625 0.65625q0.59375 0.65625 0.59375 1.96875l0 4.6875zm8.54245 -3.65625q0 0.84375 -0.25 1.546875q-0.234375 0.6875 -0.671875 1.1875q-0.4375 0.5 -1.078125 0.78125q-0.640625 0.265625 -1.453125 0.265625q-0.765625 0 -1.390625 -0.234375q-0.609375 -0.234375 -1.03125 -0.703125q-0.421875 -0.46875 -0.65625 -1.15625q-0.21875 -0.6875 -0.21875 -1.578125q0 -0.84375 0.234375 -1.53125q0.234375 -0.6875 0.671875 -1.1875q0.453125 -0.5 1.09375 -0.765625q0.640625 -0.28125 1.4375 -0.28125q0.78125 0 1.390625 0.25q0.609375 0.234375 1.03125 0.703125q0.4375 0.453125 0.65625 1.140625q0.234375 0.6875 0.234375 1.5625zm-1.28125 0.0625q0 -0.671875 -0.15625 -1.15625q-0.140625 -0.5 -0.421875 -0.828125q-0.265625 -0.34375 -0.65625 -0.5q-0.375 -0.171875 -0.859375 -0.171875q-0.5625 0 -0.96875 0.21875q-0.390625 0.21875 -0.640625 0.578125q-0.25 0.359375 -0.375 0.84375q-0.109375 0.484375 -0.109375 1.015625q0 0.671875 0.140625 1.171875q0.140625 0.5 0.40625 0.828125q0.28125 0.328125 0.671875 0.5q0.390625 0.171875 0.875 0.171875q0.5625 0 0.953125 -0.21875q0.390625 -0.21875 0.640625 -0.578125q0.265625 -0.375 0.375 -0.859375q0.125 -0.484375 0.125 -1.015625zm5.526825 -2.5625l-2.125 0l0 -1.03125l3.390625 0l0 6.15625l2.125 0l0 1.03125l-5.75 0l0 -1.03125l2.359375 0l0 -5.125zm0.4375 -4.046875q0.203125 0 0.375 0.078125q0.1875 0.078125 0.3125 0.21875q0.140625 0.125 0.21875 0.3125q0.078125 0.171875 0.078125 0.375q0 0.203125 -0.078125 0.390625q-0.078125 0.171875 -0.21875 0.3125q-0.125 0.125 -0.3125 0.203125q-0.171875 0.078125 -0.375 0.078125q-0.203125 0 -0.390625 -0.078125q-0.171875 -0.078125 -0.3125 -0.203125q-0.125 -0.140625 -0.203125 -0.3125q-0.078125 -0.1875 -0.078125 -0.390625q0 -0.203125 0.078125 -0.375q0.078125 -0.1875 0.203125 -0.3125q0.140625 -0.140625 0.3125 -0.21875q0.1875 -0.078125 0.390625 -0.078125zm10.85495 8.25q0 0.375 -0.125 0.671875q-0.125 0.296875 -0.34375 0.53125q-0.21875 0.234375 -0.515625 0.40625q-0.296875 0.15625 -0.640625 0.265625q-0.328125 0.109375 -0.6875 0.15625q-0.34375 0.046875 -0.671875 0.046875q-0.734375 0 -1.34375 -0.0625q-0.609375 -0.0625 -1.203125 -0.203125l0 -1.140625q0.640625 0.171875 1.25 0.265625q0.625 0.09375 1.25 0.09375q0.890625 0 1.3125 -0.234375q0.4375 -0.25 0.4375 -0.703125q0 -0.1875 -0.078125 -0.34375q-0.0625 -0.15625 -0.25 -0.296875q-0.171875 -0.140625 -0.546875 -0.28125q-0.375 -0.15625 -1.015625 -0.359375q-0.5 -0.140625 -0.90625 -0.3125q-0.40625 -0.1875 -0.71875 -0.4375q-0.296875 -0.25 -0.46875 -0.578125q-0.171875 -0.34375 -0.171875 -0.8125q0 -0.296875 0.140625 -0.65625q0.140625 -0.359375 0.46875 -0.65625q0.34375 -0.3125 0.921875 -0.515625q0.578125 -0.203125 1.421875 -0.203125q0.421875 0 0.9375 0.046875q0.515625 0.046875 1.078125 0.15625l0 1.109375q-0.578125 -0.140625 -1.109375 -0.203125q-0.53125 -0.078125 -0.90625 -0.078125q-0.46875 0 -0.796875 0.078125q-0.3125 0.0625 -0.515625 0.203125q-0.1875 0.125 -0.28125 0.296875q-0.078125 0.15625 -0.078125 0.34375q0 0.203125 0.0625 0.359375q0.078125 0.15625 0.28125 0.3125q0.21875 0.140625 0.578125 0.28125q0.359375 0.140625 0.953125 0.3125q0.640625 0.1875 1.0625 0.390625q0.4375 0.203125 0.703125 0.46875q0.28125 0.25 0.390625 0.5625q0.125 0.3125 0.125 0.71875zm8.16745 1.859375q-0.421875 0.09375 -0.875 0.140625q-0.453125 0.046875 -0.921875 0.046875q-1.34375 0 -2.015625 -0.609375q-0.65625 -0.609375 -0.65625 -1.875l0 -3.75l-2.015625 0l0 -1.046875l2.015625 0l0 -1.96875l1.234375 -0.328125l0 2.296875l3.234375 0l0 1.046875l-3.234375 0l0 3.65625q0 0.765625 0.40625 1.15625q0.421875 0.375 1.21875 0.375q0.34375 0 0.75 -0.046875q0.40625 -0.0625 0.859375 -0.171875l0 1.078125zm4.714325 -6.0625l-2.125 0l0 -1.03125l3.390625 0l0 6.15625l2.125 0l0 1.03125l-5.75 0l0 -1.03125l2.359375 0l0 -5.125zm0.4375 -4.046875q0.203125 0 0.375 0.078125q0.1875 0.078125 0.3125 0.21875q0.140625 0.125 0.21875 0.3125q0.078125 0.171875 0.078125 0.375q0 0.203125 -0.078125 0.390625q-0.078125 0.171875 -0.21875 0.3125q-0.125 0.125 -0.3125 0.203125q-0.171875 0.078125 -0.375 0.078125q-0.203125 0 -0.390625 -0.078125q-0.171875 -0.078125 -0.3125 -0.203125q-0.125 -0.140625 -0.203125 -0.3125q-0.078125 -0.1875 -0.078125 -0.390625q0 -0.203125 0.078125 -0.375q0.078125 -0.1875 0.203125 -0.3125q0.140625 -0.140625 0.3125 -0.21875q0.1875 -0.078125 0.390625 -0.078125zm5.16745 3.015625l1.109375 0l0.046875 1.15625q0.328125 -0.359375 0.609375 -0.609375q0.296875 -0.25 0.578125 -0.390625q0.28125 -0.15625 0.578125 -0.21875q0.296875 -0.0625 0.609375 -0.0625q1.109375 0 1.671875 0.65625q0.578125 0.65625 0.578125 1.96875l0 4.6875l-1.25 0l0 -4.59375q0 -0.84375 -0.3125 -1.234375q-0.3125 -0.40625 -0.9375 -0.40625q-0.234375 0 -0.453125 0.0625q-0.21875 0.0625 -0.453125 0.234375q-0.234375 0.171875 -0.515625 0.453125q-0.265625 0.28125 -0.609375 0.71875l0 4.765625l-1.25 0l0 -7.1875zm13.22995 1.015625q0.203125 0.25 0.3125 0.59375q0.109375 0.328125 0.109375 0.71875q0 0.546875 -0.203125 1.015625q-0.203125 0.453125 -0.578125 0.78125q-0.359375 0.328125 -0.890625 0.515625q-0.515625 0.1875 -1.140625 0.1875q-0.4375 0 -0.84375 -0.09375q-0.390625 -0.109375 -0.609375 -0.25q-0.140625 0.203125 -0.234375 0.390625q-0.09375 0.171875 -0.09375 0.390625q0 0.28125 0.25 0.46875q0.265625 0.171875 0.703125 0.171875l1.890625 0.078125q0.53125 0.015625 0.984375 0.140625q0.453125 0.125 0.78125 0.359375q0.328125 0.21875 0.5 0.5625q0.1875 0.328125 0.1875 0.765625q0 0.484375 -0.203125 0.90625q-0.203125 0.4375 -0.640625 0.765625q-0.4375 0.328125 -1.109375 0.515625q-0.65625 0.1875 -1.5625 0.1875q-0.875 0 -1.5 -0.140625q-0.609375 -0.125 -1.0 -0.375q-0.390625 -0.25 -0.578125 -0.59375q-0.171875 -0.34375 -0.171875 -0.75q0 -0.515625 0.234375 -0.921875q0.25 -0.390625 0.765625 -0.75q-0.1875 -0.078125 -0.328125 -0.203125q-0.140625 -0.140625 -0.234375 -0.296875q-0.078125 -0.15625 -0.125 -0.328125q-0.03125 -0.171875 -0.03125 -0.328125q0 -0.46875 0.21875 -0.84375q0.21875 -0.390625 0.515625 -0.734375q-0.140625 -0.171875 -0.25 -0.328125q-0.09375 -0.15625 -0.171875 -0.328125q-0.078125 -0.1875 -0.125 -0.390625q-0.03125 -0.21875 -0.03125 -0.5q0 -0.5625 0.203125 -1.015625q0.21875 -0.46875 0.578125 -0.796875q0.375 -0.328125 0.890625 -0.5q0.515625 -0.1875 1.140625 -0.1875q0.265625 0 0.515625 0.03125q0.25 0.03125 0.421875 0.09375l2.609375 0l0 1.015625l-1.15625 0zm-4.3125 7.203125q0 0.515625 0.53125 0.75q0.53125 0.234375 1.46875 0.234375q0.59375 0 1.0 -0.109375q0.40625 -0.109375 0.640625 -0.28125q0.25 -0.171875 0.359375 -0.40625q0.109375 -0.21875 0.109375 -0.453125q0 -0.4375 -0.359375 -0.65625q-0.359375 -0.203125 -1.09375 -0.234375l-1.875 -0.0625q-0.234375 0.15625 -0.390625 0.296875q-0.15625 0.15625 -0.25 0.3125q-0.078125 0.15625 -0.109375 0.296875q-0.03125 0.15625 -0.03125 0.3125zm0.375 -5.875q0 0.34375 0.109375 0.640625q0.125 0.28125 0.328125 0.484375q0.203125 0.203125 0.484375 0.3125q0.28125 0.109375 0.625 0.109375q0.375 0 0.65625 -0.125q0.296875 -0.125 0.5 -0.34375q0.203125 -0.21875 0.296875 -0.484375q0.109375 -0.28125 0.109375 -0.59375q0 -0.34375 -0.125 -0.625q-0.109375 -0.28125 -0.3125 -0.484375q-0.203125 -0.203125 -0.5 -0.3125q-0.28125 -0.109375 -0.625 -0.109375q-0.359375 0 -0.65625 0.125q-0.28125 0.125 -0.484375 0.34375q-0.203125 0.203125 -0.3125 0.484375q-0.09375 0.28125 -0.09375 0.578125z" fill-rule="nonzero"/><path fill="#34a853" d="m163.16655 224.35252l315.59058 0l0 29.039368l-315.59058 0z" fill-rule="evenodd"/><path stroke="#34a853" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m163.16655 224.35252l315.59058 0l0 29.039368l-315.59058 0z" fill-rule="evenodd"/><path fill="#ffffff" d="m242.52391 240.63971l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm9.16745 -0.40625q0 0.890625 -0.25 1.59375q-0.234375 0.703125 -0.703125 1.203125q-0.453125 0.484375 -1.109375 0.75q-0.640625 0.25 -1.453125 0.25q-0.640625 0 -1.3125 -0.109375q-0.65625 -0.125 -1.3125 -0.390625l0 -9.6875l1.25 0l0 2.78125l-0.0625 1.328125q0.546875 -0.71875 1.15625 -1.015625q0.609375 -0.296875 1.3125 -0.296875q0.609375 0 1.078125 0.265625q0.46875 0.25 0.78125 0.71875q0.3125 0.46875 0.46875 1.140625q0.15625 0.65625 0.15625 1.46875zm-1.265625 0.046875q0 -0.5625 -0.09375 -1.03125q-0.078125 -0.46875 -0.265625 -0.796875q-0.171875 -0.34375 -0.453125 -0.53125q-0.265625 -0.1875 -0.65625 -0.1875q-0.234375 0 -0.484375 0.078125q-0.234375 0.0625 -0.5 0.25q-0.265625 0.171875 -0.5625 0.46875q-0.28125 0.28125 -0.609375 0.703125l0 3.484375q0.359375 0.140625 0.734375 0.234375q0.390625 0.078125 0.734375 0.078125q0.421875 0 0.8125 -0.125q0.390625 -0.140625 0.6875 -0.453125q0.296875 -0.328125 0.46875 -0.859375q0.1875 -0.53125 0.1875 -1.3125zm8.96434 3.671875l-1.125 0l-0.03125 -1.15625q-0.328125 0.375 -0.625 0.625q-0.28125 0.234375 -0.578125 0.390625q-0.28125 0.140625 -0.578125 0.203125q-0.28125 0.0625 -0.59376526 0.0625q-1.109375 0 -1.6875 -0.640625q-0.5625 -0.65625 -0.5625 -1.96875l0 -4.703125l1.25 0l0 4.59375q0 1.65625 1.2500153 1.65625q0.21875 0 0.4375 -0.0625q0.21875 -0.078125 0.453125 -0.234375q0.25 -0.171875 0.515625 -0.453125q0.28125 -0.296875 0.625 -0.734375l0 -4.765625l1.25 0l0 7.1875zm8.651825 -8.953125q-0.96875 -0.203125 -1.6875 -0.203125q-1.671875 0 -1.671875 1.75l0 1.265625l3.140625 0l0 1.03125l-3.140625 0l0 5.109375l-1.265625 0l0 -5.109375l-2.3125 0l0 -1.03125l2.3125 0l0 -1.1875q0 -2.875 2.984375 -2.875q0.75 0 1.640625 0.171875l0 1.078125zm-7.515625 1.765625l0 0zm15.5737 -1.765625q-0.96875 -0.203125 -1.6875 -0.203125q-1.671875 0 -1.671875 1.75l0 1.265625l3.140625 0l0 1.03125l-3.140625 0l0 5.109375l-1.265625 0l0 -5.109375l-2.3125 0l0 -1.03125l2.3125 0l0 -1.1875q0 -2.875 2.984375 -2.875q0.75 0 1.640625 0.171875l0 1.078125zm-7.515625 1.765625l0 0zm15.276825 3.21875q0 0.265625 -0.015625 0.453125q0 0.171875 -0.015625 0.328125l-5.046875 0q0 1.09375 0.609375 1.6875q0.625 0.59375 1.78125 0.59375q0.3125 0 0.625 -0.015625q0.3125 -0.03125 0.609375 -0.078125q0.296875 -0.046875 0.5625 -0.09375q0.265625 -0.0625 0.5 -0.125l0 1.03125q-0.515625 0.140625 -1.15625 0.21875q-0.640625 0.09375 -1.328125 0.09375q-0.921875 0 -1.59375 -0.25q-0.65625 -0.25 -1.078125 -0.71875q-0.421875 -0.484375 -0.625 -1.171875q-0.203125 -0.6875 -0.203125 -1.5625q0 -0.765625 0.21875 -1.4375q0.21875 -0.671875 0.625 -1.1875q0.421875 -0.515625 1.03125 -0.8125q0.609375 -0.296875 1.375 -0.296875q0.765625 0 1.34375 0.234375q0.578125 0.234375 0.96875 0.671875q0.40625 0.4375 0.609375 1.0625q0.203125 0.609375 0.203125 1.375zm-1.296875 -0.1875q0.015625 -0.46875 -0.109375 -0.859375q-0.109375 -0.40625 -0.34375 -0.6875q-0.234375 -0.296875 -0.59375 -0.453125q-0.359375 -0.15625 -0.828125 -0.15625q-0.40625 0 -0.75 0.15625q-0.328125 0.15625 -0.578125 0.4375q-0.25 0.28125 -0.40625 0.6875q-0.140625 0.40625 -0.171875 0.875l3.78125 0zm3.5737 -3.03125l1.140625 0l0.03125 1.328125q0.640625 -0.765625 1.25 -1.109375q0.625 -0.34375 1.25 -0.34375q1.125 0 1.703125 0.71875q0.578125 0.71875 0.53125 2.15625l-1.265625 0q0.03125 -0.953125 -0.265625 -1.375q-0.296875 -0.421875 -0.875 -0.421875q-0.25 0 -0.515625 0.09375q-0.25 0.078125 -0.515625 0.28125q-0.265625 0.1875 -0.5625 0.5q-0.296875 0.3125 -0.640625 0.75l0 4.609375l-1.265625 0l0 -7.1875zm12.79245 3.875l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm5.47995 -5.78125l-2.125 0l0 -1.015625l3.390625 0l0 9.078125l2.125 0l0 1.03125l-5.75 0l0 -1.03125l2.359375 0l0 -8.0625zm11.870575 5.4375q0 0.84375 -0.25 1.546875q-0.234375 0.6875 -0.671875 1.1875q-0.4375 0.5 -1.078125 0.78125q-0.640625 0.265625 -1.453125 0.265625q-0.765625 0 -1.390625 -0.234375q-0.609375 -0.234375 -1.03125 -0.703125q-0.421875 -0.46875 -0.65625 -1.15625q-0.21875 -0.6875 -0.21875 -1.578125q0 -0.84375 0.234375 -1.53125q0.234375 -0.6875 0.671875 -1.1875q0.453125 -0.5 1.09375 -0.765625q0.640625 -0.28125 1.4375 -0.28125q0.78125 0 1.390625 0.25q0.609375 0.234375 1.03125 0.703125q0.4375 0.453125 0.65625 1.140625q0.234375 0.6875 0.234375 1.5625zm-1.28125 0.0625q0 -0.671875 -0.15625 -1.15625q-0.140625 -0.5 -0.421875 -0.828125q-0.265625 -0.34375 -0.65625 -0.5q-0.375 -0.171875 -0.859375 -0.171875q-0.5625 0 -0.96875 0.21875q-0.390625 0.21875 -0.640625 0.578125q-0.25 0.359375 -0.375 0.84375q-0.109375 0.484375 -0.109375 1.015625q0 0.671875 0.140625 1.171875q0.140625 0.5 0.40625 0.828125q0.28125 0.328125 0.671875 0.5q0.390625 0.171875 0.875 0.171875q0.5625 0 0.953125 -0.21875q0.390625 -0.21875 0.640625 -0.578125q0.265625 -0.375 0.375 -0.859375q0.125 -0.484375 0.125 -1.015625zm9.339325 -0.0625q0 0.84375 -0.25 1.546875q-0.234375 0.6875 -0.671875 1.1875q-0.4375 0.5 -1.078125 0.78125q-0.640625 0.265625 -1.453125 0.265625q-0.765625 0 -1.390625 -0.234375q-0.609375 -0.234375 -1.03125 -0.703125q-0.421875 -0.46875 -0.65625 -1.15625q-0.21875 -0.6875 -0.21875 -1.578125q0 -0.84375 0.234375 -1.53125q0.234375 -0.6875 0.671875 -1.1875q0.453125 -0.5 1.09375 -0.765625q0.640625 -0.28125 1.4375 -0.28125q0.78125 0 1.390625 0.25q0.609375 0.234375 1.03125 0.703125q0.4375 0.453125 0.65625 1.140625q0.234375 0.6875 0.234375 1.5625zm-1.28125 0.0625q0 -0.671875 -0.15625 -1.15625q-0.140625 -0.5 -0.421875 -0.828125q-0.265625 -0.34375 -0.65625 -0.5q-0.375 -0.171875 -0.859375 -0.171875q-0.5625 0 -0.96875 0.21875q-0.390625 0.21875 -0.640625 0.578125q-0.25 0.359375 -0.375 0.84375q-0.109375 0.484375 -0.109375 1.015625q0 0.671875 0.140625 1.171875q0.140625 0.5 0.40625 0.828125q0.28125 0.328125 0.671875 0.5q0.390625 0.171875 0.875 0.171875q0.5625 0 0.953125 -0.21875q0.390625 -0.21875 0.640625 -0.578125q0.265625 -0.375 0.375 -0.859375q0.125 -0.484375 0.125 -1.015625zm9.214325 -0.125q0 0.953125 -0.265625 1.671875q-0.265625 0.71875 -0.75 1.1875q-0.46875 0.46875 -1.109375 0.703125q-0.640625 0.234375 -1.390625 0.234375q-0.34375 0 -0.6875 -0.03125q-0.34375 -0.03125 -0.6875 -0.125l0 3.015625l-1.25 0l0 -10.125l1.109375 0l0.078125 1.203125q0.546875 -0.734375 1.15625 -1.03125q0.609375 -0.296875 1.3125 -0.296875q0.609375 0 1.078125 0.265625q0.46875 0.25 0.78125 0.71875q0.3125 0.46875 0.46875 1.140625q0.15625 0.65625 0.15625 1.46875zm-1.265625 0.046875q0 -0.5625 -0.09375 -1.03125q-0.078125 -0.46875 -0.265625 -0.796875q-0.171875 -0.34375 -0.453125 -0.53125q-0.265625 -0.1875 -0.65625 -0.1875q-0.234375 0 -0.484375 0.078125q-0.234375 0.0625 -0.5 0.25q-0.265625 0.171875 -0.5625 0.46875q-0.28125 0.28125 -0.609375 0.703125l0 3.484375q0.34375 0.140625 0.71875 0.234375q0.390625 0.078125 0.75 0.078125q1.0 0 1.578125 -0.6875q0.578125 -0.6875 0.578125 -2.0625zm8.214325 0.359375l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm8.808075 3.3125l-1.25 0l0 -4.59375q0 -0.828125 -0.3125 -1.234375q-0.3125 -0.40625 -0.890625 -0.40625q-0.25 0 -0.46875 0.0625q-0.21875 0.0625 -0.453125 0.234375q-0.234375 0.171875 -0.515625 0.453125q-0.28125 0.28125 -0.640625 0.71875l0 4.765625l-1.25 0l0 -10.109375l1.25 0l0 2.921875l-0.046875 1.140625q0.296875 -0.359375 0.578125 -0.59375q0.28125 -0.25 0.5625 -0.390625q0.28125 -0.15625 0.578125 -0.21875q0.296875 -0.0625 0.609375 -0.0625q1.078125 0 1.65625 0.65625q0.59375 0.65625 0.59375 1.96875l0 4.6875zm8.54245 -3.65625q0 0.84375 -0.25 1.546875q-0.234375 0.6875 -0.671875 1.1875q-0.4375 0.5 -1.078125 0.78125q-0.640625 0.265625 -1.453125 0.265625q-0.765625 0 -1.390625 -0.234375q-0.609375 -0.234375 -1.03125 -0.703125q-0.421875 -0.46875 -0.65625 -1.15625q-0.21875 -0.6875 -0.21875 -1.578125q0 -0.84375 0.234375 -1.53125q0.234375 -0.6875 0.671875 -1.1875q0.453125 -0.5 1.09375 -0.765625q0.640625 -0.28125 1.4375 -0.28125q0.78125 0 1.390625 0.25q0.609375 0.234375 1.03125 0.703125q0.4375 0.453125 0.65625 1.140625q0.234375 0.6875 0.234375 1.5625zm-1.28125 0.0625q0 -0.671875 -0.15625 -1.15625q-0.140625 -0.5 -0.421875 -0.828125q-0.265625 -0.34375 -0.65625 -0.5q-0.375 -0.171875 -0.859375 -0.171875q-0.5625 0 -0.96875 0.21875q-0.390625 0.21875 -0.640625 0.578125q-0.25 0.359375 -0.375 0.84375q-0.109375 0.484375 -0.109375 1.015625q0 0.671875 0.140625 1.171875q0.140625 0.5 0.40625 0.828125q0.28125 0.328125 0.671875 0.5q0.390625 0.171875 0.875 0.171875q0.5625 0 0.953125 -0.21875q0.390625 -0.21875 0.640625 -0.578125q0.265625 -0.375 0.375 -0.859375q0.125 -0.484375 0.125 -1.015625zm5.526825 -2.5625l-2.125 0l0 -1.03125l3.390625 0l0 6.15625l2.125 0l0 1.03125l-5.75 0l0 -1.03125l2.359375 0l0 -5.125zm0.4375 -4.046875q0.203125 0 0.375 0.078125q0.1875 0.078125 0.3125 0.21875q0.140625 0.125 0.21875 0.3125q0.078125 0.171875 0.078125 0.375q0 0.203125 -0.078125 0.390625q-0.078125 0.171875 -0.21875 0.3125q-0.125 0.125 -0.3125 0.203125q-0.171875 0.078125 -0.375 0.078125q-0.203125 0 -0.390625 -0.078125q-0.171875 -0.078125 -0.3125 -0.203125q-0.125 -0.140625 -0.203125 -0.3125q-0.078125 -0.1875 -0.078125 -0.390625q0 -0.203125 0.078125 -0.375q0.078125 -0.1875 0.203125 -0.3125q0.140625 -0.140625 0.3125 -0.21875q0.1875 -0.078125 0.390625 -0.078125zm10.85495 8.25q0 0.375 -0.125 0.671875q-0.125 0.296875 -0.34375 0.53125q-0.21875 0.234375 -0.515625 0.40625q-0.296875 0.15625 -0.640625 0.265625q-0.328125 0.109375 -0.6875 0.15625q-0.34375 0.046875 -0.671875 0.046875q-0.734375 0 -1.34375 -0.0625q-0.609375 -0.0625 -1.203125 -0.203125l0 -1.140625q0.640625 0.171875 1.25 0.265625q0.625 0.09375 1.25 0.09375q0.890625 0 1.3125 -0.234375q0.4375 -0.25 0.4375 -0.703125q0 -0.1875 -0.078125 -0.34375q-0.0625 -0.15625 -0.25 -0.296875q-0.171875 -0.140625 -0.546875 -0.28125q-0.375 -0.15625 -1.015625 -0.359375q-0.5 -0.140625 -0.90625 -0.3125q-0.40625 -0.1875 -0.71875 -0.4375q-0.296875 -0.25 -0.46875 -0.578125q-0.171875 -0.34375 -0.171875 -0.8125q0 -0.296875 0.140625 -0.65625q0.140625 -0.359375 0.46875 -0.65625q0.34375 -0.3125 0.921875 -0.515625q0.578125 -0.203125 1.421875 -0.203125q0.421875 0 0.9375 0.046875q0.515625 0.046875 1.078125 0.15625l0 1.109375q-0.578125 -0.140625 -1.109375 -0.203125q-0.53125 -0.078125 -0.90625 -0.078125q-0.46875 0 -0.796875 0.078125q-0.3125 0.0625 -0.515625 0.203125q-0.1875 0.125 -0.28125 0.296875q-0.078125 0.15625 -0.078125 0.34375q0 0.203125 0.0625 0.359375q0.078125 0.15625 0.28125 0.3125q0.21875 0.140625 0.578125 0.28125q0.359375 0.140625 0.953125 0.3125q0.640625 0.1875 1.0625 0.390625q0.4375 0.203125 0.703125 0.46875q0.28125 0.25 0.390625 0.5625q0.125 0.3125 0.125 0.71875zm8.16745 1.859375q-0.421875 0.09375 -0.875 0.140625q-0.453125 0.046875 -0.921875 0.046875q-1.34375 0 -2.015625 -0.609375q-0.65625 -0.609375 -0.65625 -1.875l0 -3.75l-2.015625 0l0 -1.046875l2.015625 0l0 -1.96875l1.234375 -0.328125l0 2.296875l3.234375 0l0 1.046875l-3.234375 0l0 3.65625q0 0.765625 0.40625 1.15625q0.421875 0.375 1.21875 0.375q0.34375 0 0.75 -0.046875q0.40625 -0.0625 0.859375 -0.171875l0 1.078125zm4.714325 -6.0625l-2.125 0l0 -1.03125l3.390625 0l0 6.15625l2.125 0l0 1.03125l-5.75 0l0 -1.03125l2.359375 0l0 -5.125zm0.4375 -4.046875q0.203125 0 0.375 0.078125q0.1875 0.078125 0.3125 0.21875q0.140625 0.125 0.21875 0.3125q0.078125 0.171875 0.078125 0.375q0 0.203125 -0.078125 0.390625q-0.078125 0.171875 -0.21875 0.3125q-0.125 0.125 -0.3125 0.203125q-0.171875 0.078125 -0.375 0.078125q-0.203125 0 -0.390625 -0.078125q-0.171875 -0.078125 -0.3125 -0.203125q-0.125 -0.140625 -0.203125 -0.3125q-0.078125 -0.1875 -0.078125 -0.390625q0 -0.203125 0.078125 -0.375q0.078125 -0.1875 0.203125 -0.3125q0.140625 -0.140625 0.3125 -0.21875q0.1875 -0.078125 0.390625 -0.078125zm5.16745 3.015625l1.109375 0l0.046875 1.15625q0.328125 -0.359375 0.609375 -0.609375q0.296875 -0.25 0.578125 -0.390625q0.28125 -0.15625 0.578125 -0.21875q0.296875 -0.0625 0.609375 -0.0625q1.109375 0 1.671875 0.65625q0.578125 0.65625 0.578125 1.96875l0 4.6875l-1.25 0l0 -4.59375q0 -0.84375 -0.3125 -1.234375q-0.3125 -0.40625 -0.9375 -0.40625q-0.234375 0 -0.453125 0.0625q-0.21875 0.0625 -0.453125 0.234375q-0.234375 0.171875 -0.515625 0.453125q-0.265625 0.28125 -0.609375 0.71875l0 4.765625l-1.25 0l0 -7.1875zm13.22995 1.015625q0.203125 0.25 0.3125 0.59375q0.109375 0.328125 0.109375 0.71875q0 0.546875 -0.203125 1.015625q-0.203125 0.453125 -0.578125 0.78125q-0.359375 0.328125 -0.890625 0.515625q-0.515625 0.1875 -1.140625 0.1875q-0.4375 0 -0.84375 -0.09375q-0.390625 -0.109375 -0.609375 -0.25q-0.140625 0.203125 -0.234375 0.390625q-0.09375 0.171875 -0.09375 0.390625q0 0.28125 0.25 0.46875q0.265625 0.171875 0.703125 0.171875l1.890625 0.078125q0.53125 0.015625 0.984375 0.140625q0.453125 0.125 0.78125 0.359375q0.328125 0.21875 0.5 0.5625q0.1875 0.328125 0.1875 0.765625q0 0.484375 -0.203125 0.90625q-0.203125 0.4375 -0.640625 0.765625q-0.4375 0.328125 -1.109375 0.515625q-0.65625 0.1875 -1.5625 0.1875q-0.875 0 -1.5 -0.140625q-0.609375 -0.125 -1.0 -0.375q-0.390625 -0.25 -0.578125 -0.59375q-0.171875 -0.34375 -0.171875 -0.75q0 -0.515625 0.234375 -0.921875q0.25 -0.390625 0.765625 -0.75q-0.1875 -0.078125 -0.328125 -0.203125q-0.140625 -0.140625 -0.234375 -0.296875q-0.078125 -0.15625 -0.125 -0.328125q-0.03125 -0.171875 -0.03125 -0.328125q0 -0.46875 0.21875 -0.84375q0.21875 -0.390625 0.515625 -0.734375q-0.140625 -0.171875 -0.25 -0.328125q-0.09375 -0.15625 -0.171875 -0.328125q-0.078125 -0.1875 -0.125 -0.390625q-0.03125 -0.21875 -0.03125 -0.5q0 -0.5625 0.203125 -1.015625q0.21875 -0.46875 0.578125 -0.796875q0.375 -0.328125 0.890625 -0.5q0.515625 -0.1875 1.140625 -0.1875q0.265625 0 0.515625 0.03125q0.25 0.03125 0.421875 0.09375l2.609375 0l0 1.015625l-1.15625 0zm-4.3125 7.203125q0 0.515625 0.53125 0.75q0.53125 0.234375 1.46875 0.234375q0.59375 0 1.0 -0.109375q0.40625 -0.109375 0.640625 -0.28125q0.25 -0.171875 0.359375 -0.40625q0.109375 -0.21875 0.109375 -0.453125q0 -0.4375 -0.359375 -0.65625q-0.359375 -0.203125 -1.09375 -0.234375l-1.875 -0.0625q-0.234375 0.15625 -0.390625 0.296875q-0.15625 0.15625 -0.25 0.3125q-0.078125 0.15625 -0.109375 0.296875q-0.03125 0.15625 -0.03125 0.3125zm0.375 -5.875q0 0.34375 0.109375 0.640625q0.125 0.28125 0.328125 0.484375q0.203125 0.203125 0.484375 0.3125q0.28125 0.109375 0.625 0.109375q0.375 0 0.65625 -0.125q0.296875 -0.125 0.5 -0.34375q0.203125 -0.21875 0.296875 -0.484375q0.109375 -0.28125 0.109375 -0.59375q0 -0.34375 -0.125 -0.625q-0.109375 -0.28125 -0.3125 -0.484375q-0.203125 -0.203125 -0.5 -0.3125q-0.28125 -0.109375 -0.625 -0.109375q-0.359375 0 -0.65625 0.125q-0.28125 0.125 -0.484375 0.34375q-0.203125 0.203125 -0.3125 0.484375q-0.09375 0.28125 -0.09375 0.578125z" fill-rule="nonzero"/><path fill="#34a853" d="m163.16655 274.0374l315.59058 0l0 29.039368l-315.59058 0z" fill-rule="evenodd"/><path stroke="#34a853" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m163.16655 274.0374l315.59058 0l0 29.039368l-315.59058 0z" fill-rule="evenodd"/><path fill="#ffffff" d="m210.29161 290.3246l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm9.16745 -0.40625q0 0.890625 -0.25 1.59375q-0.234375 0.703125 -0.703125 1.203125q-0.453125 0.484375 -1.109375 0.75q-0.640625 0.25 -1.453125 0.25q-0.640625 0 -1.3125 -0.109375q-0.65625 -0.125 -1.3125 -0.390625l0 -9.6875l1.25 0l0 2.78125l-0.0625 1.328125q0.546875 -0.71875 1.15625 -1.015625q0.609375 -0.296875 1.3125 -0.296875q0.609375 0 1.078125 0.265625q0.46875 0.25 0.78125 0.71875q0.3125 0.46875 0.46875 1.140625q0.15625 0.65625 0.15625 1.46875zm-1.265625 0.046875q0 -0.5625 -0.09375 -1.03125q-0.078125 -0.46875 -0.265625 -0.796875q-0.171875 -0.34375 -0.453125 -0.53125q-0.265625 -0.1875 -0.65625 -0.1875q-0.234375 0 -0.484375 0.078125q-0.234375 0.0625 -0.5 0.25q-0.265625 0.171875 -0.5625 0.46875q-0.28125 0.28125 -0.609375 0.703125l0 3.484375q0.359375 0.140625 0.734375 0.234375q0.390625 0.078125 0.734375 0.078125q0.421875 0 0.8125 -0.125q0.390625 -0.140625 0.6875 -0.453125q0.296875 -0.328125 0.46875 -0.859375q0.1875 -0.53125 0.1875 -1.3125zm8.964325 3.671875l-1.125 0l-0.03125 -1.15625q-0.328125 0.375 -0.625 0.625q-0.28125 0.234375 -0.578125 0.390625q-0.28125 0.140625 -0.578125 0.203125q-0.28125 0.0625 -0.59375 0.0625q-1.109375 0 -1.6875 -0.640625q-0.5625 -0.65625 -0.5625 -1.96875l0 -4.703125l1.25 0l0 4.59375q0 1.65625 1.25 1.65625q0.21875 0 0.4375 -0.0625q0.21875 -0.078125 0.453125 -0.234375q0.25 -0.171875 0.515625 -0.453125q0.28125 -0.296875 0.625 -0.734375l0 -4.765625l1.25 0l0 7.1875zm8.651825 -8.953125q-0.96875 -0.203125 -1.6875 -0.203125q-1.671875 0 -1.671875 1.75l0 1.265625l3.140625 0l0 1.03125l-3.140625 0l0 5.109375l-1.265625 0l0 -5.109375l-2.3125 0l0 -1.03125l2.3125 0l0 -1.1875q0 -2.875 2.984375 -2.875q0.75 0 1.640625 0.171875l0 1.078125zm-7.515625 1.765625l0 0zm15.5737 -1.765625q-0.96875 -0.203125 -1.6875 -0.203125q-1.671875 0 -1.671875 1.75l0 1.265625l3.140625 0l0 1.03125l-3.140625 0l0 5.109375l-1.265625 0l0 -5.109375l-2.3125 0l0 -1.03125l2.3125 0l0 -1.1875q0 -2.875 2.984375 -2.875q0.75 0 1.640625 0.171875l0 1.078125zm-7.515625 1.765625l0 0zm15.276825 3.21875q0 0.265625 -0.015625 0.453125q0 0.171875 -0.015625 0.328125l-5.046875 0q0 1.09375 0.609375 1.6875q0.625 0.59375 1.78125 0.59375q0.3125 0 0.625 -0.015625q0.3125 -0.03125 0.609375 -0.078125q0.296875 -0.046875 0.5625 -0.09375q0.265625 -0.0625 0.5 -0.125l0 1.03125q-0.515625 0.140625 -1.15625 0.21875q-0.640625 0.09375 -1.328125 0.09375q-0.921875 0 -1.59375 -0.25q-0.65625 -0.25 -1.078125 -0.71875q-0.421875 -0.484375 -0.625 -1.171875q-0.203125 -0.6875 -0.203125 -1.5625q0 -0.765625 0.21875 -1.4375q0.21875 -0.671875 0.625 -1.1875q0.421875 -0.515625 1.03125 -0.8125q0.609375 -0.296875 1.375 -0.296875q0.765625 0 1.34375 0.234375q0.578125 0.234375 0.96875 0.671875q0.40625 0.4375 0.609375 1.0625q0.203125 0.609375 0.203125 1.375zm-1.296875 -0.1875q0.015625 -0.46875 -0.109375 -0.859375q-0.109375 -0.40625 -0.34375 -0.6875q-0.234375 -0.296875 -0.59375 -0.453125q-0.359375 -0.15625 -0.828125 -0.15625q-0.40625 0 -0.75 0.15625q-0.328125 0.15625 -0.578125 0.4375q-0.25 0.28125 -0.40625 0.6875q-0.140625 0.40625 -0.171875 0.875l3.78125 0zm3.5737 -3.03125l1.140625 0l0.03125 1.328125q0.640625 -0.765625 1.2500153 -1.109375q0.625 -0.34375 1.25 -0.34375q1.125 0 1.703125 0.71875q0.578125 0.71875 0.53125 2.15625l-1.265625 0q0.03125 -0.953125 -0.265625 -1.375q-0.296875 -0.421875 -0.875 -0.421875q-0.25 0 -0.515625 0.09375q-0.25 0.078125 -0.515625 0.28125q-0.265625 0.1875 -0.56251526 0.5q-0.296875 0.3125 -0.640625 0.75l0 4.609375l-1.265625 0l0 -7.1875zm12.792465 3.875l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm3.3237 -3.875l1.140625 0l0.03125 1.328125q0.640625 -0.765625 1.25 -1.109375q0.625 -0.34375 1.25 -0.34375q1.125 0 1.703125 0.71875q0.578125 0.71875 0.53125 2.15625l-1.265625 0q0.03125 -0.953125 -0.265625 -1.375q-0.296875 -0.421875 -0.875 -0.421875q-0.25 0 -0.515625 0.09375q-0.25 0.078125 -0.515625 0.28125q-0.265625 0.1875 -0.5625 0.5q-0.296875 0.3125 -0.640625 0.75l0 4.609375l-1.265625 0l0 -7.1875zm13.839325 3.21875q0 0.265625 -0.015625 0.453125q0 0.171875 -0.015625 0.328125l-5.046875 0q0 1.09375 0.609375 1.6875q0.625 0.59375 1.78125 0.59375q0.3125 0 0.625 -0.015625q0.3125 -0.03125 0.609375 -0.078125q0.296875 -0.046875 0.5625 -0.09375q0.265625 -0.0625 0.5 -0.125l0 1.03125q-0.515625 0.140625 -1.15625 0.21875q-0.640625 0.09375 -1.328125 0.09375q-0.921875 0 -1.59375 -0.25q-0.65625 -0.25 -1.078125 -0.71875q-0.421875 -0.484375 -0.625 -1.171875q-0.203125 -0.6875 -0.203125 -1.5625q0 -0.765625 0.21875 -1.4375q0.21875 -0.671875 0.625 -1.1875q0.421875 -0.515625 1.03125 -0.8125q0.609375 -0.296875 1.375 -0.296875q0.765625 0 1.34375 0.234375q0.578125 0.234375 0.96875 0.671875q0.40625 0.4375 0.609375 1.0625q0.203125 0.609375 0.203125 1.375zm-1.296875 -0.1875q0.015625 -0.46875 -0.109375 -0.859375q-0.109375 -0.40625 -0.34375 -0.6875q-0.234375 -0.296875 -0.59375 -0.453125q-0.359375 -0.15625 -0.828125 -0.15625q-0.40625 0 -0.75 0.15625q-0.328125 0.15625 -0.578125 0.4375q-0.25 0.28125 -0.40625 0.6875q-0.140625 0.40625 -0.171875 0.875l3.78125 0zm8.964325 2.203125q0 0.375 -0.125 0.671875q-0.125 0.296875 -0.34375 0.53125q-0.21875 0.234375 -0.515625 0.40625q-0.296875 0.15625 -0.640625 0.265625q-0.328125 0.109375 -0.6875 0.15625q-0.34375 0.046875 -0.671875 0.046875q-0.734375 0 -1.34375 -0.0625q-0.609375 -0.0625 -1.203125 -0.203125l0 -1.140625q0.640625 0.171875 1.25 0.265625q0.625 0.09375 1.25 0.09375q0.890625 0 1.3125 -0.234375q0.4375 -0.25 0.4375 -0.703125q0 -0.1875 -0.078125 -0.34375q-0.0625 -0.15625 -0.25 -0.296875q-0.171875 -0.140625 -0.546875 -0.28125q-0.375 -0.15625 -1.015625 -0.359375q-0.5 -0.140625 -0.90625 -0.3125q-0.40625 -0.1875 -0.71875 -0.4375q-0.296875 -0.25 -0.46875 -0.578125q-0.171875 -0.34375 -0.171875 -0.8125q0 -0.296875 0.140625 -0.65625q0.140625 -0.359375 0.46875 -0.65625q0.34375 -0.3125 0.921875 -0.515625q0.578125 -0.203125 1.421875 -0.203125q0.421875 0 0.9375 0.046875q0.515625 0.046875 1.078125 0.15625l0 1.109375q-0.578125 -0.140625 -1.109375 -0.203125q-0.53125 -0.078125 -0.90625 -0.078125q-0.46875 0 -0.796875 0.078125q-0.3125 0.0625 -0.515625 0.203125q-0.1875 0.125 -0.28125 0.296875q-0.078125 0.15625 -0.078125 0.34375q0 0.203125 0.0625 0.359375q0.078125 0.15625 0.28125 0.3125q0.21875 0.140625 0.578125 0.28125q0.359375 0.140625 0.953125 0.3125q0.640625 0.1875 1.0625 0.390625q0.4375 0.203125 0.703125 0.46875q0.28125 0.25 0.390625 0.5625q0.125 0.3125 0.125 0.71875zm8.151825 1.953125l-1.125 0l-0.03125 -1.15625q-0.328125 0.375 -0.625 0.625q-0.28125 0.234375 -0.578125 0.390625q-0.28125 0.140625 -0.578125 0.203125q-0.28125 0.0625 -0.59375 0.0625q-1.109375 0 -1.6875 -0.640625q-0.5625 -0.65625 -0.5625 -1.96875l0 -4.703125l1.25 0l0 4.59375q0 1.65625 1.25 1.65625q0.21875 0 0.4375 -0.0625q0.21875 -0.078125 0.453125 -0.234375q0.25 -0.171875 0.515625 -0.453125q0.28125 -0.296875 0.625 -0.734375l0 -4.765625l1.25 0l0 7.1875zm4.72995 -9.09375l-2.125 0l0 -1.015625l3.390625 0l0 9.078125l2.125 0l0 1.03125l-5.75 0l0 -1.03125l2.359375 0l0 -8.0625zm11.401825 9.0q-0.421875 0.09375 -0.875 0.140625q-0.453125 0.046875 -0.921875 0.046875q-1.34375 0 -2.015625 -0.609375q-0.65625 -0.609375 -0.65625 -1.875l0 -3.75l-2.015625 0l0 -1.046875l2.015625 0l0 -1.96875l1.234375 -0.328125l0 2.296875l3.234375 0l0 1.046875l-3.234375 0l0 3.65625q0 0.765625 0.40625 1.15625q0.421875 0.375 1.21875 0.375q0.34375 0 0.75 -0.046875q0.40625 -0.0625 0.859375 -0.171875l0 1.078125zm7.9487 -1.859375q0 0.375 -0.125 0.671875q-0.125 0.296875 -0.34375 0.53125q-0.21875 0.234375 -0.515625 0.40625q-0.296875 0.15625 -0.640625 0.265625q-0.328125 0.109375 -0.6875 0.15625q-0.34375 0.046875 -0.671875 0.046875q-0.734375 0 -1.34375 -0.0625q-0.609375 -0.0625 -1.203125 -0.203125l0 -1.140625q0.640625 0.171875 1.25 0.265625q0.625 0.09375 1.25 0.09375q0.890625 0 1.3125 -0.234375q0.4375 -0.25 0.4375 -0.703125q0 -0.1875 -0.078125 -0.34375q-0.0625 -0.15625 -0.25 -0.296875q-0.171875 -0.140625 -0.546875 -0.28125q-0.375 -0.15625 -1.015625 -0.359375q-0.5 -0.140625 -0.90625 -0.3125q-0.40625 -0.1875 -0.71875 -0.4375q-0.296875 -0.25 -0.46875 -0.578125q-0.171875 -0.34375 -0.171875 -0.8125q0 -0.296875 0.140625 -0.65625q0.140625 -0.359375 0.46875 -0.65625q0.34375 -0.3125 0.921875 -0.515625q0.578125 -0.203125 1.421875 -0.203125q0.421875 0 0.9375 0.046875q0.515625 0.046875 1.078125 0.15625l0 1.109375q-0.578125 -0.140625 -1.109375 -0.203125q-0.53125 -0.078125 -0.90625 -0.078125q-0.46875 0 -0.796875 0.078125q-0.3125 0.0625 -0.515625 0.203125q-0.1875 0.125 -0.28125 0.296875q-0.078125 0.15625 -0.078125 0.34375q0 0.203125 0.0625 0.359375q0.078125 0.15625 0.28125 0.3125q0.21875 0.140625 0.578125 0.28125q0.359375 0.140625 0.953125 0.3125q0.640625 0.1875 1.0625 0.390625q0.4375 0.203125 0.703125 0.46875q0.28125 0.25 0.390625 0.5625q0.125 0.3125 0.125 0.71875zm7.401825 -1.359375l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm8.8237 3.21875q-0.421875 0.09375 -0.875 0.140625q-0.453125 0.046875 -0.921875 0.046875q-1.34375 0 -2.015625 -0.609375q-0.65625 -0.609375 -0.65625 -1.875l0 -3.75l-2.015625 0l0 -1.046875l2.015625 0l0 -1.96875l1.234375 -0.328125l0 2.296875l3.234375 0l0 1.046875l-3.234375 0l0 3.65625q0 0.765625 0.40625 1.15625q0.421875 0.375 1.21875 0.375q0.34375 0 0.75 -0.046875q0.40625 -0.0625 0.859375 -0.171875l0 1.078125zm8.526825 -3.5625q0 0.84375 -0.25 1.546875q-0.234375 0.6875 -0.671875 1.1875q-0.4375 0.5 -1.078125 0.78125q-0.640625 0.265625 -1.453125 0.265625q-0.765625 0 -1.390625 -0.234375q-0.609375 -0.234375 -1.03125 -0.703125q-0.421875 -0.46875 -0.65625 -1.15625q-0.21875 -0.6875 -0.21875 -1.578125q0 -0.84375 0.234375 -1.53125q0.234375 -0.6875 0.671875 -1.1875q0.453125 -0.5 1.09375 -0.765625q0.640625 -0.28125 1.4375 -0.28125q0.78125 0 1.390625 0.25q0.609375 0.234375 1.03125 0.703125q0.4375 0.453125 0.65625 1.140625q0.234375 0.6875 0.234375 1.5625zm-1.28125 0.0625q0 -0.671875 -0.15625 -1.15625q-0.140625 -0.5 -0.421875 -0.828125q-0.265625 -0.34375 -0.65625 -0.5q-0.375 -0.171875 -0.859375 -0.171875q-0.5625 0 -0.96875 0.21875q-0.390625 0.21875 -0.640625 0.578125q-0.25 0.359375 -0.375 0.84375q-0.109375 0.484375 -0.109375 1.015625q0 0.671875 0.140625 1.171875q0.140625 0.5 0.40625 0.828125q0.28125 0.328125 0.671875 0.5q0.390625 0.171875 0.875 0.171875q0.5625 0 0.953125 -0.21875q0.390625 -0.21875 0.640625 -0.578125q0.265625 -0.375 0.375 -0.859375q0.125 -0.484375 0.125 -1.015625zm8.10495 0.28125l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm9.29245 -0.34375q0 0.84375 -0.25 1.546875q-0.234375 0.6875 -0.671875 1.1875q-0.4375 0.5 -1.078125 0.78125q-0.640625 0.265625 -1.453125 0.265625q-0.765625 0 -1.390625 -0.234375q-0.609375 -0.234375 -1.03125 -0.703125q-0.421875 -0.46875 -0.65625 -1.15625q-0.21875 -0.6875 -0.21875 -1.578125q0 -0.84375 0.234375 -1.53125q0.234375 -0.6875 0.671875 -1.1875q0.453125 -0.5 1.09375 -0.765625q0.640625 -0.28125 1.4375 -0.28125q0.78125 0 1.390625 0.25q0.609375 0.234375 1.03125 0.703125q0.4375 0.453125 0.65625 1.140625q0.234375 0.6875 0.234375 1.5625zm-1.28125 0.0625q0 -0.671875 -0.15625 -1.15625q-0.140625 -0.5 -0.421875 -0.828125q-0.265625 -0.34375 -0.65625 -0.5q-0.375 -0.171875 -0.859375 -0.171875q-0.5625 0 -0.96875 0.21875q-0.390625 0.21875 -0.640625 0.578125q-0.25 0.359375 -0.375 0.84375q-0.109375 0.484375 -0.109375 1.015625q0 0.671875 0.140625 1.171875q0.140625 0.5 0.40625 0.828125q0.28125 0.328125 0.671875 0.5q0.390625 0.171875 0.875 0.171875q0.5625 0 0.953125 -0.21875q0.390625 -0.21875 0.640625 -0.578125q0.265625 -0.375 0.375 -0.859375q0.125 -0.484375 0.125 -1.015625zm8.85495 3.59375l-1.125 0l-0.03125 -1.15625q-0.328125 0.375 -0.625 0.625q-0.28125 0.234375 -0.578125 0.390625q-0.28125 0.140625 -0.578125 0.203125q-0.28125 0.0625 -0.59375 0.0625q-1.109375 0 -1.6875 -0.640625q-0.5625 -0.65625 -0.5625 -1.96875l0 -4.703125l1.25 0l0 4.59375q0 1.65625 1.25 1.65625q0.21875 0 0.4375 -0.0625q0.21875 -0.078125 0.453125 -0.234375q0.25 -0.171875 0.515625 -0.453125q0.28125 -0.296875 0.625 -0.734375l0 -4.765625l1.25 0l0 7.1875zm8.0737 -0.09375q-0.421875 0.09375 -0.875 0.140625q-0.453125 0.046875 -0.921875 0.046875q-1.34375 0 -2.015625 -0.609375q-0.65625 -0.609375 -0.65625 -1.875l0 -3.75l-2.015625 0l0 -1.046875l2.015625 0l0 -1.96875l1.234375 -0.328125l0 2.296875l3.234375 0l0 1.046875l-3.234375 0l0 3.65625q0 0.765625 0.40625 1.15625q0.421875 0.375 1.21875 0.375q0.34375 0 0.75 -0.046875q0.40625 -0.0625 0.859375 -0.171875l0 1.078125zm7.29245 -3.21875l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm9.16745 -0.40625q0 0.953125 -0.265625 1.671875q-0.265625 0.71875 -0.75 1.1875q-0.46875 0.46875 -1.109375 0.703125q-0.640625 0.234375 -1.390625 0.234375q-0.34375 0 -0.6875 -0.03125q-0.34375 -0.03125 -0.6875 -0.125l0 3.015625l-1.25 0l0 -10.125l1.109375 0l0.078125 1.203125q0.546875 -0.734375 1.15625 -1.03125q0.609375 -0.296875 1.3125 -0.296875q0.609375 0 1.078125 0.265625q0.46875 0.25 0.78125 0.71875q0.3125 0.46875 0.46875 1.140625q0.15625 0.65625 0.15625 1.46875zm-1.265625 0.046875q0 -0.5625 -0.09375 -1.03125q-0.078125 -0.46875 -0.265625 -0.796875q-0.171875 -0.34375 -0.453125 -0.53125q-0.265625 -0.1875 -0.65625 -0.1875q-0.234375 0 -0.484375 0.078125q-0.234375 0.0625 -0.5 0.25q-0.265625 0.171875 -0.5625 0.46875q-0.28125 0.28125 -0.609375 0.703125l0 3.484375q0.34375 0.140625 0.71875 0.234375q0.390625 0.078125 0.75 0.078125q1.0 0 1.578125 -0.6875q0.578125 -0.6875 0.578125 -2.0625zm7.808075 3.671875l-0.03125 -0.96875q-0.59375 0.578125 -1.203125 0.84375q-0.59375 0.25 -1.265625 0.25q-0.609375 0 -1.046875 -0.15625q-0.4375 -0.15625 -0.71875 -0.421875q-0.28125 -0.28125 -0.421875 -0.65625q-0.125 -0.375 -0.125 -0.8125q0 -1.078125 0.796875 -1.6875q0.8125 -0.609375 2.390625 -0.609375l1.484375 0l0 -0.640625q0 -0.625 -0.40625 -1.0q-0.40625 -0.390625 -1.25 -0.390625q-0.609375 0 -1.203125 0.140625q-0.578125 0.125 -1.21875 0.375l0 -1.125q0.25 -0.078125 0.53125 -0.15625q0.296875 -0.09375 0.609375 -0.15625q0.328125 -0.0625 0.671875 -0.09375q0.359375 -0.046875 0.71875 -0.046875q0.640625 0 1.15625 0.140625q0.53125 0.140625 0.890625 0.4375q0.359375 0.296875 0.546875 0.75q0.203125 0.4375 0.203125 1.03125l0 4.953125l-1.109375 0zm-0.140625 -3.265625l-1.578125 0q-0.46875 0 -0.8125 0.09375q-0.328125 0.09375 -0.546875 0.265625q-0.21875 0.171875 -0.328125 0.40625q-0.09375 0.234375 -0.09375 0.546875q0 0.203125 0.0625 0.390625q0.0625 0.1875 0.203125 0.34375q0.15625 0.140625 0.375 0.234375q0.234375 0.078125 0.5625 0.078125q0.4375 0 0.984375 -0.265625q0.5625 -0.265625 1.171875 -0.828125l0 -1.265625zm3.870575 -3.921875l1.140625 0l0.03125 1.328125q0.640625 -0.765625 1.25 -1.109375q0.625 -0.34375 1.25 -0.34375q1.125 0 1.703125 0.71875q0.578125 0.71875 0.53125 2.15625l-1.265625 0q0.03125 -0.953125 -0.265625 -1.375q-0.296875 -0.421875 -0.875 -0.421875q-0.25 0 -0.515625 0.09375q-0.25 0.078125 -0.515625 0.28125q-0.265625 0.1875 -0.5625 0.5q-0.296875 0.3125 -0.640625 0.75l0 4.609375l-1.265625 0l0 -7.1875zm12.3862 7.1875l-0.03125 -0.96875q-0.59375 0.578125 -1.203125 0.84375q-0.59375 0.25 -1.265625 0.25q-0.609375 0 -1.046875 -0.15625q-0.4375 -0.15625 -0.71875 -0.421875q-0.28125 -0.28125 -0.421875 -0.65625q-0.125 -0.375 -0.125 -0.8125q0 -1.078125 0.796875 -1.6875q0.8125 -0.609375 2.390625 -0.609375l1.484375 0l0 -0.640625q0 -0.625 -0.40625 -1.0q-0.40625 -0.390625 -1.25 -0.390625q-0.609375 0 -1.203125 0.140625q-0.578125 0.125 -1.21875 0.375l0 -1.125q0.25 -0.078125 0.53125 -0.15625q0.296875 -0.09375 0.609375 -0.15625q0.328125 -0.0625 0.671875 -0.09375q0.359375 -0.046875 0.71875 -0.046875q0.640625 0 1.15625 0.140625q0.53125 0.140625 0.890625 0.4375q0.359375 0.296875 0.546875 0.75q0.203125 0.4375 0.203125 1.03125l0 4.953125l-1.109375 0zm-0.140625 -3.265625l-1.578125 0q-0.46875 0 -0.8125 0.09375q-0.328125 0.09375 -0.546875 0.265625q-0.21875 0.171875 -0.328125 0.40625q-0.09375 0.234375 -0.09375 0.546875q0 0.203125 0.0625 0.390625q0.0625 0.1875 0.203125 0.34375q0.15625 0.140625 0.375 0.234375q0.234375 0.078125 0.5625 0.078125q0.4375 0 0.984375 -0.265625q0.5625 -0.265625 1.171875 -0.828125l0 -1.265625zm8.6987 3.265625l0 -5.15625q0 -0.34375 -0.03125 -0.546875q-0.015625 -0.21875 -0.078125 -0.34375q-0.046875 -0.125 -0.125 -0.171875q-0.078125 -0.0625 -0.203125 -0.0625q-0.140625 0 -0.265625 0.09375q-0.125 0.078125 -0.265625 0.28125q-0.140625 0.1875 -0.3125 0.515625q-0.15625 0.3125 -0.390625 0.78125l0 4.609375l-1.140625 0l0 -5.03125q0 -0.390625 -0.03125 -0.625q-0.015625 -0.25 -0.078125 -0.375q-0.046875 -0.140625 -0.140625 -0.1875q-0.078125 -0.0625 -0.203125 -0.0625q-0.125 0 -0.25 0.078125q-0.109375 0.0625 -0.25 0.25q-0.125 0.1875 -0.296875 0.515625q-0.171875 0.328125 -0.40625 0.828125l0 4.609375l-1.15625 0l0 -7.1875l0.953125 0l0.0625 1.375q0.1875 -0.40625 0.359375 -0.6875q0.171875 -0.296875 0.359375 -0.46875q0.1875 -0.1875 0.390625 -0.265625q0.21875 -0.078125 0.46875 -0.078125q0.59375 0 0.890625 0.375q0.296875 0.375 0.296875 1.171875q0.171875 -0.375 0.328125 -0.65625q0.171875 -0.296875 0.359375 -0.484375q0.1875 -0.203125 0.421875 -0.296875q0.234375 -0.109375 0.53125 -0.109375q1.34375 0 1.34375 2.078125l0 5.234375l-1.140625 0zm8.620575 -1.953125q0 0.375 -0.125 0.671875q-0.125 0.296875 -0.34375 0.53125q-0.21875 0.234375 -0.515625 0.40625q-0.296875 0.15625 -0.640625 0.265625q-0.328125 0.109375 -0.6875 0.15625q-0.34375 0.046875 -0.671875 0.046875q-0.734375 0 -1.34375 -0.0625q-0.609375 -0.0625 -1.203125 -0.203125l0 -1.140625q0.640625 0.171875 1.25 0.265625q0.625 0.09375 1.25 0.09375q0.890625 0 1.3125 -0.234375q0.4375 -0.25 0.4375 -0.703125q0 -0.1875 -0.078125 -0.34375q-0.0625 -0.15625 -0.25 -0.296875q-0.171875 -0.140625 -0.546875 -0.28125q-0.375 -0.15625 -1.015625 -0.359375q-0.5 -0.140625 -0.90625 -0.3125q-0.40625 -0.1875 -0.71875 -0.4375q-0.296875 -0.25 -0.46875 -0.578125q-0.171875 -0.34375 -0.171875 -0.8125q0 -0.296875 0.140625 -0.65625q0.140625 -0.359375 0.46875 -0.65625q0.34375 -0.3125 0.921875 -0.515625q0.578125 -0.203125 1.421875 -0.203125q0.421875 0 0.9375 0.046875q0.515625 0.046875 1.078125 0.15625l0 1.109375q-0.578125 -0.140625 -1.109375 -0.203125q-0.53125 -0.078125 -0.90625 -0.078125q-0.46875 0 -0.796875 0.078125q-0.3125 0.0625 -0.515625 0.203125q-0.1875 0.125 -0.28125 0.296875q-0.078125 0.15625 -0.078125 0.34375q0 0.203125 0.0625 0.359375q0.078125 0.15625 0.28125 0.3125q0.21875 0.140625 0.578125 0.28125q0.359375 0.140625 0.953125 0.3125q0.640625 0.1875 1.0625 0.390625q0.4375 0.203125 0.703125 0.46875q0.28125 0.25 0.390625 0.5625q0.125 0.3125 0.125 0.71875z" fill-rule="nonzero"/><path fill="#34a853" d="m163.16655 323.7223l315.59058 0l0 29.039368l-315.59058 0z" fill-rule="evenodd"/><path stroke="#34a853" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m163.16655 323.7223l315.59058 0l0 29.039368l-315.59058 0z" fill-rule="evenodd"/><path fill="#ffffff" d="m202.23354 340.0095l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm2.66745 -0.125q0 -0.921875 0.25 -1.625q0.25 -0.71875 0.703125 -1.203125q0.46875 -0.5 1.109375 -0.75q0.65625 -0.25 1.453125 -0.25q0.34375 0 0.671875 0.046875q0.328125 0.03125 0.65625 0.125l0 -3.015625l1.25 0l0 10.109375l-1.109375 0l-0.046875 -1.359375q-0.515625 0.765625 -1.125 1.125q-0.609375 0.359375 -1.328125 0.359375q-0.609375 0 -1.078125 -0.25q-0.46875 -0.265625 -0.78125 -0.734375q-0.3125 -0.46875 -0.46875 -1.125q-0.15625 -0.65625 -0.15625 -1.453125zm1.265625 -0.078125q0 1.296875 0.375 1.9375q0.390625 0.640625 1.09375 0.640625q0.484375 0 1.0 -0.421875q0.53125 -0.421875 1.109375 -1.25l0 -3.34375q-0.3125 -0.140625 -0.6875 -0.203125q-0.359375 -0.078125 -0.734375 -0.078125q-1.015625 0 -1.59375 0.65625q-0.5625 0.65625 -0.5625 2.0625zm7.4487 -3.671875l1.140625 0l0.03125 1.328125q0.640625 -0.765625 1.25 -1.109375q0.625 -0.34375 1.25 -0.34375q1.125 0 1.703125 0.71875q0.578125 0.71875 0.53125 2.15625l-1.265625 0q0.03125 -0.953125 -0.265625 -1.375q-0.296875 -0.421875 -0.875 -0.421875q-0.25 0 -0.515625 0.09375q-0.25 0.078125 -0.515625 0.28125q-0.265625 0.1875 -0.5625 0.5q-0.296875 0.3125 -0.640625 0.75l0 4.609375l-1.265625 0l0 -7.1875zm14.026825 3.53125q0 0.84375 -0.25 1.546875q-0.234375 0.6875 -0.671875 1.1875q-0.4375 0.5 -1.078125 0.78125q-0.640625 0.265625 -1.453125 0.265625q-0.765625 0 -1.390625 -0.234375q-0.609375 -0.234375 -1.03125 -0.703125q-0.421875 -0.46875 -0.65625 -1.15625q-0.21875 -0.6875 -0.21875 -1.578125q0 -0.84375 0.234375 -1.53125q0.234375 -0.6875 0.671875 -1.1875q0.453125 -0.5 1.09375 -0.765625q0.640625 -0.28125 1.4375 -0.28125q0.78125 0 1.390625 0.25q0.609375 0.234375 1.03125 0.703125q0.4375 0.453125 0.65625 1.140625q0.234375 0.6875 0.234375 1.5625zm-1.28125 0.0625q0 -0.671875 -0.15625 -1.15625q-0.140625 -0.5 -0.421875 -0.828125q-0.265625 -0.34375 -0.65625 -0.5q-0.375 -0.171875 -0.859375 -0.171875q-0.5625 0 -0.96875 0.21875q-0.390625 0.21875 -0.640625 0.578125q-0.25 0.359375 -0.375 0.84375q-0.109375 0.484375 -0.109375 1.015625q0 0.671875 0.140625 1.171875q0.140625 0.5 0.40625 0.828125q0.28125 0.328125 0.671875 0.5q0.390625 0.171875 0.875 0.171875q0.5625 0 0.953125 -0.21875q0.390625 -0.21875 0.640625 -0.578125q0.265625 -0.375 0.375 -0.859375q0.125 -0.484375 0.125 -1.015625zm9.214325 -0.125q0 0.953125 -0.265625 1.671875q-0.265625 0.71875 -0.75 1.1875q-0.46875 0.46875 -1.109375 0.703125q-0.640625 0.234375 -1.390625 0.234375q-0.34375 0 -0.6875 -0.03125q-0.34375 -0.03125 -0.6875 -0.125l0 3.015625l-1.25 0l0 -10.125l1.109375 0l0.078125 1.203125q0.546875 -0.734375 1.15625 -1.03125q0.609375 -0.296875 1.3125 -0.296875q0.609375 0 1.078125 0.265625q0.46875 0.25 0.78125 0.71875q0.3125 0.46875 0.46875 1.140625q0.15625 0.65625 0.15625 1.46875zm-1.265625 0.046875q0 -0.5625 -0.09375 -1.03125q-0.078125 -0.46875 -0.265625 -0.796875q-0.171875 -0.34375 -0.453125 -0.53125q-0.265625 -0.1875 -0.65625 -0.1875q-0.234375 0 -0.484375 0.078125q-0.234375 0.0625 -0.5 0.25q-0.265625 0.171875 -0.5625 0.46875q-0.28125 0.28125 -0.609375 0.703125l0 3.484375q0.34375 0.140625 0.71875 0.234375q0.390625 0.078125 0.75 0.078125q1.0 0 1.578125 -0.6875q0.578125 -0.6875 0.578125 -2.0625zm8.214325 0.359375l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm9.10495 -0.65625q0 0.265625 -0.015625 0.453125q0 0.171875 -0.015625 0.328125l-5.046875 0q0 1.09375 0.609375 1.6875q0.625 0.59375 1.78125 0.59375q0.3125 0 0.625 -0.015625q0.3125 -0.03125 0.609375 -0.078125q0.296875 -0.046875 0.5625 -0.09375q0.265625 -0.0625 0.5 -0.125l0 1.03125q-0.515625 0.140625 -1.15625 0.21875q-0.640625 0.09375 -1.328125 0.09375q-0.921875 0 -1.59375 -0.25q-0.65625 -0.25 -1.078125 -0.71875q-0.421875 -0.484375 -0.625 -1.171875q-0.203125 -0.6875 -0.203125 -1.5625q0 -0.765625 0.21875 -1.4375q0.21875 -0.671875 0.625 -1.1875q0.421875 -0.515625 1.03125 -0.8125q0.609375 -0.296875 1.375 -0.296875q0.765625 0 1.34375 0.234375q0.578125 0.234375 0.96875 0.671875q0.40625 0.4375 0.609375 1.0625q0.203125 0.609375 0.203125 1.375zm-1.296875 -0.1875q0.015625 -0.46875 -0.109375 -0.859375q-0.109375 -0.40625 -0.34375 -0.6875q-0.234375 -0.296875 -0.59375 -0.453125q-0.359375 -0.15625 -0.828125 -0.15625q-0.40625 0 -0.75 0.15625q-0.328125 0.15625 -0.578125 0.4375q-0.25 0.28125 -0.40625 0.6875q-0.140625 0.40625 -0.171875 0.875l3.78125 0zm2.91745 0.71875q0 -0.78125 0.203125 -1.46875q0.203125 -0.703125 0.625 -1.21875q0.4375 -0.53125 1.09375 -0.828125q0.671875 -0.3125 1.5937653 -0.3125q0.359375 0 0.71875 0.0625q0.359375 0.046875 0.765625 0.171875l1.09375 -0.28125l0 10.25l-1.25 0l0 -2.71875l0.0625 -1.53125q-1.015625 1.4375 -2.4218903 1.4375q-0.625 0 -1.09375 -0.25q-0.46875 -0.265625 -0.78125 -0.734375q-0.296875 -0.46875 -0.453125 -1.125q-0.15625 -0.65625 -0.15625 -1.453125zm1.265625 -0.078125q0 0.59375 0.09375 1.078125q0.09375 0.484375 0.265625 0.8125q0.1875 0.328125 0.46875 0.515625q0.28125 0.171875 0.640625 0.171875q0.48439026 0 1.0000153 -0.421875q0.53125 -0.421875 1.109375 -1.25l0 -3.34375q-0.28125 -0.125 -0.65625 -0.203125q-0.359375 -0.09375 -0.765625 -0.09375q-1.0468903 0 -1.6093903 0.703125q-0.546875 0.6875 -0.546875 2.03125zm12.93309 3.515625l-1.125 0l-0.03125 -1.15625q-0.328125 0.375 -0.625 0.625q-0.28125 0.234375 -0.578125 0.390625q-0.28125 0.140625 -0.578125 0.203125q-0.28125 0.0625 -0.59375 0.0625q-1.109375 0 -1.6875 -0.640625q-0.5625 -0.65625 -0.5625 -1.96875l0 -4.703125l1.25 0l0 4.59375q0 1.65625 1.25 1.65625q0.21875 0 0.4375 -0.0625q0.21875 -0.078125 0.453125 -0.234375q0.25 -0.171875 0.515625 -0.453125q0.28125 -0.296875 0.625 -0.734375l0 -4.765625l1.25 0l0 7.1875zm4.72995 -6.15625l-2.125 0l0 -1.03125l3.390625 0l0 6.15625l2.125 0l0 1.03125l-5.75 0l0 -1.03125l2.359375 0l0 -5.125zm0.4375 -4.046875q0.203125 0 0.375 0.078125q0.1875 0.078125 0.3125 0.21875q0.140625 0.125 0.21875 0.3125q0.078125 0.171875 0.078125 0.375q0 0.203125 -0.078125 0.390625q-0.078125 0.171875 -0.21875 0.3125q-0.125 0.125 -0.3125 0.203125q-0.171875 0.078125 -0.375 0.078125q-0.203125 0 -0.390625 -0.078125q-0.171875 -0.078125 -0.3125 -0.203125q-0.125 -0.140625 -0.203125 -0.3125q-0.078125 -0.1875 -0.078125 -0.390625q0 -0.203125 0.078125 -0.375q0.078125 -0.1875 0.203125 -0.3125q0.140625 -0.140625 0.3125 -0.21875q0.1875 -0.078125 0.390625 -0.078125zm4.495575 3.015625l1.421875 0l1.765625 4.75l0.375 1.15625l0.390625 -1.1875l1.75 -4.71875l1.359375 0l-2.8125 7.1875l-1.4375 0l-2.8125 -7.1875zm13.35495 7.1875l-0.03125 -0.96875q-0.59375 0.578125 -1.203125 0.84375q-0.59375 0.25 -1.265625 0.25q-0.609375 0 -1.046875 -0.15625q-0.4375 -0.15625 -0.71875 -0.421875q-0.28125 -0.28125 -0.421875 -0.65625q-0.125 -0.375 -0.125 -0.8125q0 -1.078125 0.796875 -1.6875q0.8125 -0.609375 2.390625 -0.609375l1.484375 0l0 -0.640625q0 -0.625 -0.40625 -1.0q-0.40625 -0.390625 -1.25 -0.390625q-0.609375 0 -1.203125 0.140625q-0.578125 0.125 -1.21875 0.375l0 -1.125q0.25 -0.078125 0.53125 -0.15625q0.296875 -0.09375 0.609375 -0.15625q0.328125 -0.0625 0.671875 -0.09375q0.359375 -0.046875 0.71875 -0.046875q0.640625 0 1.15625 0.140625q0.53125 0.140625 0.890625 0.4375q0.359375 0.296875 0.546875 0.75q0.203125 0.4375 0.203125 1.03125l0 4.953125l-1.109375 0zm-0.140625 -3.265625l-1.578125 0q-0.46875 0 -0.8125 0.09375q-0.328125 0.09375 -0.546875 0.265625q-0.21875 0.171875 -0.328125 0.40625q-0.09375 0.234375 -0.09375 0.546875q0 0.203125 0.0625 0.390625q0.0625 0.1875 0.203125 0.34375q0.15625 0.140625 0.375 0.234375q0.234375 0.078125 0.5625 0.078125q0.4375 0 0.984375 -0.265625q0.5625 -0.265625 1.171875 -0.828125l0 -1.265625zm6.026825 -5.828125l-2.125 0l0 -1.015625l3.390625 0l0 9.078125l2.125 0l0 1.03125l-5.75 0l0 -1.03125l2.359375 0l0 -8.0625zm11.683075 5.125q0 0.265625 -0.015625 0.453125q0 0.171875 -0.015625 0.328125l-5.046875 0q0 1.09375 0.609375 1.6875q0.625 0.59375 1.78125 0.59375q0.3125 0 0.625 -0.015625q0.3125 -0.03125 0.609375 -0.078125q0.296875 -0.046875 0.5625 -0.09375q0.265625 -0.0625 0.5 -0.125l0 1.03125q-0.515625 0.140625 -1.15625 0.21875q-0.640625 0.09375 -1.328125 0.09375q-0.921875 0 -1.59375 -0.25q-0.65625 -0.25 -1.078125 -0.71875q-0.421875 -0.484375 -0.625 -1.171875q-0.203125 -0.6875 -0.203125 -1.5625q0 -0.765625 0.21875 -1.4375q0.21875 -0.671875 0.625 -1.1875q0.421875 -0.515625 1.03125 -0.8125q0.609375 -0.296875 1.375 -0.296875q0.765625 0 1.34375 0.234375q0.578125 0.234375 0.96875 0.671875q0.40625 0.4375 0.609375 1.0625q0.203125 0.609375 0.203125 1.375zm-1.296875 -0.1875q0.015625 -0.46875 -0.109375 -0.859375q-0.109375 -0.40625 -0.34375 -0.6875q-0.234375 -0.296875 -0.59375 -0.453125q-0.359375 -0.15625 -0.828125 -0.15625q-0.40625 0 -0.75 0.15625q-0.328125 0.15625 -0.578125 0.4375q-0.25 0.28125 -0.40625 0.6875q-0.140625 0.40625 -0.171875 0.875l3.78125 0zm3.276825 -3.03125l1.109375 0l0.046875 1.15625q0.328125 -0.359375 0.609375 -0.609375q0.296875 -0.25 0.578125 -0.390625q0.28125 -0.15625 0.578125 -0.21875q0.296875 -0.0625 0.609375 -0.0625q1.109375 0 1.671875 0.65625q0.578125 0.65625 0.578125 1.96875l0 4.6875l-1.25 0l0 -4.59375q0 -0.84375 -0.3125 -1.234375q-0.3125 -0.40625 -0.9375 -0.40625q-0.234375 0 -0.453125 0.0625q-0.21875 0.0625 -0.453125 0.234375q-0.234375 0.171875 -0.515625 0.453125q-0.265625 0.28125 -0.609375 0.71875l0 4.765625l-1.25 0l0 -7.1875zm13.85495 7.09375q-0.421875 0.09375 -0.875 0.140625q-0.453125 0.046875 -0.921875 0.046875q-1.34375 0 -2.015625 -0.609375q-0.65625 -0.609375 -0.65625 -1.875l0 -3.75l-2.015625 0l0 -1.046875l2.015625 0l0 -1.96875l1.234375 -0.328125l0 2.296875l3.234375 0l0 1.046875l-3.234375 0l0 3.65625q0 0.765625 0.40625 1.15625q0.421875 0.375 1.21875 0.375q0.34375 0 0.75 -0.046875q0.40625 -0.0625 0.859375 -0.171875l0 1.078125zm7.29245 -3.21875l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm9.16745 -0.40625q0 0.890625 -0.25 1.59375q-0.234375 0.703125 -0.703125 1.203125q-0.453125 0.484375 -1.109375 0.75q-0.640625 0.25 -1.453125 0.25q-0.640625 0 -1.3125 -0.109375q-0.65625 -0.125 -1.3125 -0.390625l0 -9.6875l1.25 0l0 2.78125l-0.0625 1.328125q0.546875 -0.71875 1.15625 -1.015625q0.609375 -0.296875 1.3125 -0.296875q0.609375 0 1.078125 0.265625q0.46875 0.25 0.78125 0.71875q0.3125 0.46875 0.46875 1.140625q0.15625 0.65625 0.15625 1.46875zm-1.265625 0.046875q0 -0.5625 -0.09375 -1.03125q-0.078125 -0.46875 -0.265625 -0.796875q-0.171875 -0.34375 -0.453125 -0.53125q-0.265625 -0.1875 -0.65625 -0.1875q-0.234375 0 -0.484375 0.078125q-0.234375 0.0625 -0.5 0.25q-0.265625 0.171875 -0.5625 0.46875q-0.28125 0.28125 -0.609375 0.703125l0 3.484375q0.359375 0.140625 0.734375 0.234375q0.390625 0.078125 0.734375 0.078125q0.421875 0 0.8125 -0.125q0.390625 -0.140625 0.6875 -0.453125q0.296875 -0.328125 0.46875 -0.859375q0.1875 -0.53125 0.1875 -1.3125zm8.964325 3.671875l-1.125 0l-0.03125 -1.15625q-0.328125 0.375 -0.625 0.625q-0.28125 0.234375 -0.578125 0.390625q-0.28125 0.140625 -0.578125 0.203125q-0.28125 0.0625 -0.59375 0.0625q-1.109375 0 -1.6875 -0.640625q-0.5625 -0.65625 -0.5625 -1.96875l0 -4.703125l1.25 0l0 4.59375q0 1.65625 1.25 1.65625q0.21875 0 0.4375 -0.0625q0.21875 -0.078125 0.453125 -0.234375q0.25 -0.171875 0.515625 -0.453125q0.28125 -0.296875 0.625 -0.734375l0 -4.765625l1.25 0l0 7.1875zm8.651825 -8.953125q-0.96875 -0.203125 -1.6875 -0.203125q-1.671875 0 -1.671875 1.75l0 1.265625l3.140625 0l0 1.03125l-3.140625 0l0 5.109375l-1.265625 0l0 -5.109375l-2.3125 0l0 -1.03125l2.3125 0l0 -1.1875q0 -2.875 2.984375 -2.875q0.75 0 1.640625 0.171875l0 1.078125zm-7.515625 1.765625l0 0zm15.5737 -1.765625q-0.96875 -0.203125 -1.6875 -0.203125q-1.671875 0 -1.671875 1.75l0 1.265625l3.140625 0l0 1.03125l-3.140625 0l0 5.109375l-1.265625 0l0 -5.109375l-2.3125 0l0 -1.03125l2.3125 0l0 -1.1875q0 -2.875 2.984375 -2.875q0.75 0 1.640625 0.171875l0 1.078125zm-7.515625 1.765625l0 0zm15.276825 3.21875q0 0.265625 -0.015625 0.453125q0 0.171875 -0.015625 0.328125l-5.046875 0q0 1.09375 0.609375 1.6875q0.625 0.59375 1.78125 0.59375q0.3125 0 0.625 -0.015625q0.3125 -0.03125 0.609375 -0.078125q0.296875 -0.046875 0.5625 -0.09375q0.265625 -0.0625 0.5 -0.125l0 1.03125q-0.515625 0.140625 -1.15625 0.21875q-0.640625 0.09375 -1.328125 0.09375q-0.921875 0 -1.59375 -0.25q-0.65625 -0.25 -1.078125 -0.71875q-0.421875 -0.484375 -0.625 -1.171875q-0.203125 -0.6875 -0.203125 -1.5625q0 -0.765625 0.21875 -1.4375q0.21875 -0.671875 0.625 -1.1875q0.421875 -0.515625 1.03125 -0.8125q0.609375 -0.296875 1.375 -0.296875q0.765625 0 1.34375 0.234375q0.578125 0.234375 0.96875 0.671875q0.40625 0.4375 0.609375 1.0625q0.203125 0.609375 0.203125 1.375zm-1.296875 -0.1875q0.015625 -0.46875 -0.109375 -0.859375q-0.109375 -0.40625 -0.34375 -0.6875q-0.234375 -0.296875 -0.59375 -0.453125q-0.359375 -0.15625 -0.828125 -0.15625q-0.40625 0 -0.75 0.15625q-0.328125 0.15625 -0.578125 0.4375q-0.25 0.28125 -0.40625 0.6875q-0.140625 0.40625 -0.171875 0.875l3.78125 0zm3.5737 -3.03125l1.140625 0l0.03125 1.328125q0.640625 -0.765625 1.25 -1.109375q0.625 -0.34375 1.25 -0.34375q1.125 0 1.703125 0.71875q0.578125 0.71875 0.53125 2.15625l-1.265625 0q0.03125 -0.953125 -0.265625 -1.375q-0.296875 -0.421875 -0.875 -0.421875q-0.25 0 -0.515625 0.09375q-0.25 0.078125 -0.515625 0.28125q-0.265625 0.1875 -0.5625 0.5q-0.296875 0.3125 -0.640625 0.75l0 4.609375l-1.265625 0l0 -7.1875zm12.79245 3.875l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm3.3237 -3.875l1.140625 0l0.03125 1.328125q0.640625 -0.765625 1.25 -1.109375q0.625 -0.34375 1.25 -0.34375q1.125 0 1.703125 0.71875q0.578125 0.71875 0.53125 2.15625l-1.265625 0q0.03125 -0.953125 -0.265625 -1.375q-0.296875 -0.421875 -0.875 -0.421875q-0.25 0 -0.515625 0.09375q-0.25 0.078125 -0.515625 0.28125q-0.265625 0.1875 -0.5625 0.5q-0.296875 0.3125 -0.640625 0.75l0 4.609375l-1.265625 0l0 -7.1875zm13.839325 3.21875q0 0.265625 -0.015625 0.453125q0 0.171875 -0.015625 0.328125l-5.046875 0q0 1.09375 0.609375 1.6875q0.625 0.59375 1.78125 0.59375q0.3125 0 0.625 -0.015625q0.3125 -0.03125 0.609375 -0.078125q0.296875 -0.046875 0.5625 -0.09375q0.265625 -0.0625 0.5 -0.125l0 1.03125q-0.515625 0.140625 -1.15625 0.21875q-0.640625 0.09375 -1.328125 0.09375q-0.921875 0 -1.59375 -0.25q-0.65625 -0.25 -1.078125 -0.71875q-0.421875 -0.484375 -0.625 -1.171875q-0.203125 -0.6875 -0.203125 -1.5625q0 -0.765625 0.21875 -1.4375q0.21875 -0.671875 0.625 -1.1875q0.421875 -0.515625 1.03125 -0.8125q0.609375 -0.296875 1.375 -0.296875q0.765625 0 1.34375 0.234375q0.578125 0.234375 0.96875 0.671875q0.40625 0.4375 0.609375 1.0625q0.203125 0.609375 0.203125 1.375zm-1.296875 -0.1875q0.015625 -0.46875 -0.109375 -0.859375q-0.109375 -0.40625 -0.34375 -0.6875q-0.234375 -0.296875 -0.59375 -0.453125q-0.359375 -0.15625 -0.828125 -0.15625q-0.40625 0 -0.75 0.15625q-0.328125 0.15625 -0.578125 0.4375q-0.25 0.28125 -0.40625 0.6875q-0.140625 0.40625 -0.171875 0.875l3.78125 0zm8.964325 2.203125q0 0.375 -0.125 0.671875q-0.125 0.296875 -0.34375 0.53125q-0.21875 0.234375 -0.515625 0.40625q-0.296875 0.15625 -0.640625 0.265625q-0.328125 0.109375 -0.6875 0.15625q-0.34375 0.046875 -0.671875 0.046875q-0.734375 0 -1.34375 -0.0625q-0.609375 -0.0625 -1.203125 -0.203125l0 -1.140625q0.640625 0.171875 1.25 0.265625q0.625 0.09375 1.25 0.09375q0.890625 0 1.3125 -0.234375q0.4375 -0.25 0.4375 -0.703125q0 -0.1875 -0.078125 -0.34375q-0.0625 -0.15625 -0.25 -0.296875q-0.171875 -0.140625 -0.546875 -0.28125q-0.375 -0.15625 -1.015625 -0.359375q-0.5 -0.140625 -0.90625 -0.3125q-0.40625 -0.1875 -0.71875 -0.4375q-0.296875 -0.25 -0.46875 -0.578125q-0.171875 -0.34375 -0.171875 -0.8125q0 -0.296875 0.140625 -0.65625q0.140625 -0.359375 0.46875 -0.65625q0.34375 -0.3125 0.921875 -0.515625q0.578125 -0.203125 1.421875 -0.203125q0.421875 0 0.9375 0.046875q0.515625 0.046875 1.078125 0.15625l0 1.109375q-0.578125 -0.140625 -1.109375 -0.203125q-0.53125 -0.078125 -0.90625 -0.078125q-0.46875 0 -0.796875 0.078125q-0.3125 0.0625 -0.515625 0.203125q-0.1875 0.125 -0.28125 0.296875q-0.078125 0.15625 -0.078125 0.34375q0 0.203125 0.0625 0.359375q0.078125 0.15625 0.28125 0.3125q0.21875 0.140625 0.578125 0.28125q0.359375 0.140625 0.953125 0.3125q0.640625 0.1875 1.0625 0.390625q0.4375 0.203125 0.703125 0.46875q0.28125 0.25 0.390625 0.5625q0.125 0.3125 0.125 0.71875zm8.151825 1.953125l-1.125 0l-0.03125 -1.15625q-0.328125 0.375 -0.625 0.625q-0.28125 0.234375 -0.578125 0.390625q-0.28125 0.140625 -0.578125 0.203125q-0.28125 0.0625 -0.59375 0.0625q-1.109375 0 -1.6875 -0.640625q-0.5625 -0.65625 -0.5625 -1.96875l0 -4.703125l1.25 0l0 4.59375q0 1.65625 1.25 1.65625q0.21875 0 0.4375 -0.0625q0.21875 -0.078125 0.453125 -0.234375q0.25 -0.171875 0.515625 -0.453125q0.28125 -0.296875 0.625 -0.734375l0 -4.765625l1.25 0l0 7.1875zm4.72995 -9.09375l-2.125 0l0 -1.015625l3.390625 0l0 9.078125l2.125 0l0 1.03125l-5.75 0l0 -1.03125l2.359375 0l0 -8.0625zm11.401825 9.0q-0.421875 0.09375 -0.875 0.140625q-0.453125 0.046875 -0.921875 0.046875q-1.34375 0 -2.015625 -0.609375q-0.65625 -0.609375 -0.65625 -1.875l0 -3.75l-2.015625 0l0 -1.046875l2.015625 0l0 -1.96875l1.234375 -0.328125l0 2.296875l3.234375 0l0 1.046875l-3.234375 0l0 3.65625q0 0.765625 0.40625 1.15625q0.421875 0.375 1.21875 0.375q0.34375 0 0.75 -0.046875q0.40625 -0.0625 0.859375 -0.171875l0 1.078125zm7.9487 -1.859375q0 0.375 -0.125 0.671875q-0.125 0.296875 -0.34375 0.53125q-0.21875 0.234375 -0.515625 0.40625q-0.296875 0.15625 -0.640625 0.265625q-0.328125 0.109375 -0.6875 0.15625q-0.34375 0.046875 -0.671875 0.046875q-0.734375 0 -1.34375 -0.0625q-0.609375 -0.0625 -1.203125 -0.203125l0 -1.140625q0.640625 0.171875 1.25 0.265625q0.625 0.09375 1.25 0.09375q0.890625 0 1.3125 -0.234375q0.4375 -0.25 0.4375 -0.703125q0 -0.1875 -0.078125 -0.34375q-0.0625 -0.15625 -0.25 -0.296875q-0.171875 -0.140625 -0.546875 -0.28125q-0.375 -0.15625 -1.015625 -0.359375q-0.5 -0.140625 -0.90625 -0.3125q-0.40625 -0.1875 -0.71875 -0.4375q-0.296875 -0.25 -0.46875 -0.578125q-0.171875 -0.34375 -0.171875 -0.8125q0 -0.296875 0.140625 -0.65625q0.140625 -0.359375 0.46875 -0.65625q0.34375 -0.3125 0.921875 -0.515625q0.578125 -0.203125 1.421875 -0.203125q0.421875 0 0.9375 0.046875q0.515625 0.046875 1.078125 0.15625l0 1.109375q-0.578125 -0.140625 -1.109375 -0.203125q-0.53125 -0.078125 -0.90625 -0.078125q-0.46875 0 -0.796875 0.078125q-0.3125 0.0625 -0.515625 0.203125q-0.1875 0.125 -0.28125 0.296875q-0.078125 0.15625 -0.078125 0.34375q0 0.203125 0.0625 0.359375q0.078125 0.15625 0.28125 0.3125q0.21875 0.140625 0.578125 0.28125q0.359375 0.140625 0.953125 0.3125q0.640625 0.1875 1.0625 0.390625q0.4375 0.203125 0.703125 0.46875q0.28125 0.25 0.390625 0.5625q0.125 0.3125 0.125 0.71875z" fill-rule="nonzero"/><path fill="#ea4335" d="m163.2269 19.56168l316.7874 0l0 29.039371l-316.7874 0z" fill-rule="evenodd"/><path stroke="#ea4335" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m163.2269 19.56168l316.7874 0l0 29.039371l-316.7874 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m176.57526 28.276524l290.0907 0l0 17.16098l-290.0907 0l0 -17.16098z" fill-rule="nonzero"/><path fill="#ffffff" d="m182.24713 34.723866q0.015625 -0.421875 -0.046875 -0.703125q-0.046875 -0.28125 -0.171875 -0.46875q-0.109375 -0.1875 -0.28125 -0.265625q-0.171875 -0.09375 -0.390625 -0.09375q-0.390625 0 -0.796875 0.328125q-0.40625 0.3125 -0.90625 1.046875l0 4.59375l-1.796875 0l0 -7.265625l1.59375 0l0.0625 1.046875q0.171875 -0.265625 0.390625 -0.484375q0.234375 -0.234375 0.5 -0.390625q0.28125 -0.15625 0.609375 -0.234375q0.34375 -0.09375 0.75 -0.09375q0.546875 0 0.96875 0.1875q0.4375 0.1875 0.71875 0.5625q0.296875 0.359375 0.4375 0.921875q0.140625 0.5625 0.109375 1.3125l-1.75 0zm9.714325 0.25q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0zm9.47995 4.4375l-1.9375 0l-0.78125 -2.296875l-0.234375 -0.8125l-0.234375 0.828125l-0.75 2.28125l-1.9375 0l-1.03125 -7.265625l1.65625 0l0.515625 4.109375l0.125 1.109375l0.296875 -1.0l0.734375 -2.21875l1.328125 0l0.78125 2.203125l0.328125 1.03125l0.140625 -1.140625l0.4375 -4.09375l1.578125 0l-1.015625 7.265625zm6.7612 -4.4375q0.015625 -0.421875 -0.046875 -0.703125q-0.046875 -0.28125 -0.171875 -0.46875q-0.109375 -0.1875 -0.28125 -0.265625q-0.171875 -0.09375 -0.390625 -0.09375q-0.390625 0 -0.796875 0.328125q-0.40625 0.3125 -0.90625 1.046875l0 4.59375l-1.796875 0l0 -7.265625l1.59375 0l0.0625 1.046875q0.171875 -0.265625 0.390625 -0.484375q0.234375 -0.234375 0.5 -0.390625q0.28125 -0.15625 0.609375 -0.234375q0.34375 -0.09375 0.75 -0.09375q0.546875 0 0.96875 0.1875q0.4375 0.1875 0.71875 0.5625q0.296875 0.359375 0.4375 0.921875q0.140625 0.5625 0.109375 1.3125l-1.75 0zm7.66745 -4.828125q0 0.234375 -0.09375 0.453125q-0.078125 0.203125 -0.234375 0.359375q-0.15625 0.15625 -0.375 0.25q-0.203125 0.078125 -0.4375 0.078125q-0.25 0 -0.46875 -0.078125q-0.203125 -0.09375 -0.359375 -0.25q-0.15625 -0.15625 -0.25 -0.359375q-0.078125 -0.21875 -0.078125 -0.453125q0 -0.234375 0.078125 -0.4375q0.09375 -0.203125 0.25 -0.359375q0.15625 -0.15625 0.359375 -0.25q0.21875 -0.09375 0.46875 -0.09375q0.234375 0 0.4375 0.09375q0.21875 0.09375 0.375 0.25q0.15625 0.15625 0.234375 0.359375q0.09375 0.203125 0.09375 0.4375zm-1.9375 3.359375l-2.0625 0l0 -1.359375l3.84375 0l0 5.90625l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -4.546875zm11.6987 5.828125q-0.46875 0.109375 -0.953125 0.171875q-0.46875 0.078125 -0.90625 0.078125q-0.703125 0 -1.234375 -0.15625q-0.515625 -0.15625 -0.859375 -0.453125q-0.328125 -0.3125 -0.5 -0.78125q-0.15625 -0.484375 -0.15625 -1.140625l0 -3.546875l-1.953125 0l0 -1.359375l1.953125 0l0 -1.859375l1.796875 -0.46875l0 2.328125l2.8125 0l0 1.359375l-2.8125 0l0 3.421875q0 0.609375 0.28125 0.9375q0.28125 0.3125 0.953125 0.3125q0.4375 0 0.84375 -0.0625q0.40625 -0.078125 0.734375 -0.171875l0 1.390625zm8.401825 -4.109375q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0zm2.5112 7.375l0 -1.390625l8.0625 0l0 1.390625l-8.0625 0zm13.339325 -12.203125q0 0.234375 -0.09375 0.453125q-0.078125 0.203125 -0.234375 0.359375q-0.15625 0.15625 -0.375 0.25q-0.203125 0.078125 -0.4375 0.078125q-0.25 0 -0.46875 -0.078125q-0.203125 -0.09375 -0.359375 -0.25q-0.15625 -0.15625 -0.25 -0.359375q-0.078125 -0.21875 -0.078125 -0.453125q0 -0.234375 0.078125 -0.4375q0.09375 -0.203125 0.25 -0.359375q0.15625 -0.15625 0.359375 -0.25q0.21875 -0.09375 0.46875 -0.09375q0.234375 0 0.4375 0.09375q0.21875 0.09375 0.375 0.25q0.15625 0.15625 0.234375 0.359375q0.09375 0.203125 0.09375 0.4375zm-1.9375 3.359375l-2.0625 0l0 -1.359375l3.84375 0l0 5.90625l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -4.546875zm10.04245 5.90625l0 -4.734375q0 -1.203125 -0.890625 -1.203125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 4.59375l-1.734375 0l0 -7.265625l1.5 0l0.046875 1.0625q0.21875 -0.265625 0.453125 -0.5q0.25 -0.234375 0.53125 -0.390625q0.28125 -0.15625 0.59375 -0.234375q0.328125 -0.09375 0.734375 -0.09375q0.546875 0 0.953125 0.1875q0.421875 0.171875 0.703125 0.515625q0.28125 0.328125 0.421875 0.796875q0.140625 0.46875 0.140625 1.046875l0 4.875l-1.75 0zm2.72995 2.9375l0 -1.390625l8.0625 0l0 1.390625l-8.0625 0zm13.620575 -2.9375l-0.03125 -1.078125q-0.234375 0.28125 -0.484375 0.515625q-0.234375 0.21875 -0.515625 0.390625q-0.28125 0.15625 -0.609375 0.25q-0.3125 0.09375 -0.6875 0.09375q-0.65625 0 -1.15625 -0.25q-0.5 -0.265625 -0.84375 -0.734375q-0.328125 -0.484375 -0.515625 -1.15625q-0.171875 -0.6875 -0.171875 -1.53125q0 -1.0 0.28125 -1.734375q0.28125 -0.734375 0.78125 -1.21875q0.5 -0.484375 1.1875 -0.71875q0.6875 -0.234375 1.5 -0.234375q0.265625 0 0.53125 0.046875q0.28125 0.03125 0.5 0.09375l0 -2.84375l1.75 0l0 10.109375l-1.515625 0zm-3.1875 -3.578125q0 0.59375 0.078125 1.03125q0.09375 0.421875 0.25 0.703125q0.15625 0.265625 0.390625 0.390625q0.234375 0.125 0.546875 0.125q0.4375 0 0.828125 -0.359375q0.40625 -0.359375 0.859375 -0.984375l0 -3.171875q-0.203125 -0.078125 -0.515625 -0.125q-0.296875 -0.0625 -0.609375 -0.0625q-0.421875 0 -0.765625 0.171875q-0.328125 0.171875 -0.5625 0.5q-0.234375 0.3125 -0.375 0.765625q-0.125 0.453125 -0.125 1.015625zm13.0112 -0.609375q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0zm9.495575 2.296875q0 0.625 -0.28125 1.0625q-0.265625 0.4375 -0.71875 0.71875q-0.453125 0.28125 -1.03125 0.40625q-0.578125 0.125 -1.171875 0.125q-0.796875 0 -1.453125 -0.078125q-0.640625 -0.078125 -1.21875 -0.21875l0 -1.578125q0.6875 0.28125 1.34375 0.40625q0.671875 0.125 1.265625 0.125q0.6875 0 1.03125 -0.21875q0.34375 -0.21875 0.34375 -0.5625q0 -0.171875 -0.078125 -0.296875q-0.078125 -0.140625 -0.28125 -0.265625q-0.1875 -0.125 -0.546875 -0.25q-0.359375 -0.140625 -0.9375 -0.3125q-0.53125 -0.15625 -0.9375 -0.34375q-0.40625 -0.1875 -0.671875 -0.4375q-0.265625 -0.265625 -0.40625 -0.59375q-0.125 -0.34375 -0.125 -0.8125q0 -0.4375 0.203125 -0.828125q0.203125 -0.390625 0.59375 -0.6875q0.40625 -0.296875 1.0 -0.46875q0.59375 -0.171875 1.390625 -0.171875q0.6875 0 1.21875 0.078125q0.53125 0.0625 0.9375 0.140625l0 1.421875q-0.625 -0.1875 -1.171875 -0.265625q-0.546875 -0.09375 -1.09375 -0.09375q-0.53125 0 -0.859375 0.203125q-0.328125 0.1875 -0.328125 0.53125q0 0.15625 0.0625 0.296875q0.078125 0.125 0.265625 0.25q0.1875 0.109375 0.515625 0.25q0.34375 0.125 0.90625 0.28125q0.625 0.1875 1.046875 0.390625q0.4375 0.203125 0.703125 0.46875q0.265625 0.25 0.375 0.578125q0.109375 0.328125 0.109375 0.75zm8.058075 2.0625q-0.46875 0.109375 -0.953125 0.171875q-0.46875 0.078125 -0.90625 0.078125q-0.703125 0 -1.234375 -0.15625q-0.515625 -0.15625 -0.859375 -0.453125q-0.328125 -0.3125 -0.5 -0.78125q-0.15625 -0.484375 -0.15625 -1.140625l0 -3.546875l-1.953125 0l0 -1.359375l1.953125 0l0 -1.859375l1.796875 -0.46875l0 2.328125l2.8125 0l0 1.359375l-2.8125 0l0 3.421875q0 0.609375 0.28125 0.9375q0.28125 0.3125 0.953125 0.3125q0.4375 0 0.84375 -0.0625q0.40625 -0.078125 0.734375 -0.171875l0 1.390625zm6.35495 -9.1875q0 0.234375 -0.09375 0.453125q-0.078125 0.203125 -0.234375 0.359375q-0.15625 0.15625 -0.375 0.25q-0.203125 0.078125 -0.4375 0.078125q-0.25 0 -0.46875 -0.078125q-0.203125 -0.09375 -0.359375 -0.25q-0.15625 -0.15625 -0.25 -0.359375q-0.078125 -0.21875 -0.078125 -0.453125q0 -0.234375 0.078125 -0.4375q0.09375 -0.203125 0.25 -0.359375q0.15625 -0.15625 0.359375 -0.25q0.21875 -0.09375 0.46875 -0.09375q0.234375 0 0.4375 0.09375q0.21875 0.09375 0.375 0.25q0.15625 0.15625 0.234375 0.359375q0.09375 0.203125 0.09375 0.4375zm-1.9375 3.359375l-2.0625 0l0 -1.359375l3.84375 0l0 5.90625l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -4.546875zm10.04245 5.90625l0 -4.734375q0 -1.203125 -0.890625 -1.203125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 4.59375l-1.734375 0l0 -7.265625l1.5 0l0.046875 1.0625q0.21875 -0.265625 0.453125 -0.5q0.25 -0.234375 0.53125 -0.390625q0.28125 -0.15625 0.59375 -0.234375q0.328125 -0.09375 0.734375 -0.09375q0.546875 0 0.953125 0.1875q0.421875 0.171875 0.703125 0.515625q0.28125 0.328125 0.421875 0.796875q0.140625 0.46875 0.140625 1.046875l0 4.875l-1.75 0zm8.276825 0l-0.03125 -0.953125q-0.234375 0.25 -0.484375 0.453125q-0.25 0.203125 -0.5625 0.359375q-0.296875 0.140625 -0.65625 0.21875q-0.34375 0.09375 -0.765625 0.09375q-0.5625 0 -0.984375 -0.171875q-0.421875 -0.171875 -0.703125 -0.453125q-0.28125 -0.296875 -0.4375 -0.703125q-0.140625 -0.421875 -0.140625 -0.921875q0 -0.515625 0.21875 -0.953125q0.21875 -0.4375 0.65625 -0.75q0.453125 -0.3125 1.125 -0.484375q0.671875 -0.1875 1.5625 -0.1875l0.953125 0l0 -0.4375q0 -0.28125 -0.078125 -0.5q-0.078125 -0.234375 -0.25 -0.390625q-0.171875 -0.15625 -0.453125 -0.234375q-0.28125 -0.09375 -0.6875 -0.09375q-0.640625 0 -1.265625 0.15625q-0.625 0.140625 -1.21875 0.40625l0 -1.40625q0.53125 -0.203125 1.203125 -0.328125q0.6875 -0.140625 1.421875 -0.140625q0.8125 0 1.390625 0.15625q0.578125 0.140625 0.953125 0.453125q0.375 0.3125 0.546875 0.78125q0.1875 0.453125 0.1875 1.0625l0 4.96875l-1.5 0zm-0.25 -3.21875l-1.0625 0q-0.4375 0 -0.75 0.09375q-0.3125 0.078125 -0.5 0.234375q-0.1875 0.15625 -0.28125 0.359375q-0.09375 0.1875 -0.09375 0.40625q0 0.4375 0.28125 0.671875q0.296875 0.234375 0.78125 0.234375q0.375 0 0.765625 -0.265625q0.390625 -0.265625 0.859375 -0.75l0 -0.984375zm9.745575 3.140625q-0.46875 0.109375 -0.953125 0.171875q-0.46875 0.078125 -0.90625 0.078125q-0.703125 0 -1.234375 -0.15625q-0.515625 -0.15625 -0.859375 -0.453125q-0.328125 -0.3125 -0.5 -0.78125q-0.15625 -0.484375 -0.15625 -1.140625l0 -3.546875l-1.953125 0l0 -1.359375l1.953125 0l0 -1.859375l1.796875 -0.46875l0 2.328125l2.8125 0l0 1.359375l-2.8125 0l0 3.421875q0 0.609375 0.28125 0.9375q0.28125 0.3125 0.953125 0.3125q0.4375 0 0.84375 -0.0625q0.40625 -0.078125 0.734375 -0.171875l0 1.390625zm6.35495 -9.1875q0 0.234375 -0.09375 0.453125q-0.078125 0.203125 -0.234375 0.359375q-0.15625 0.15625 -0.375 0.25q-0.203125 0.078125 -0.4375 0.078125q-0.25 0 -0.46875 -0.078125q-0.203125 -0.09375 -0.359375 -0.25q-0.15625 -0.15625 -0.25 -0.359375q-0.078125 -0.21875 -0.078125 -0.453125q0 -0.234375 0.078125 -0.4375q0.09375 -0.203125 0.25 -0.359375q0.15625 -0.15625 0.359375 -0.25q0.21875 -0.09375 0.46875 -0.09375q0.234375 0 0.4375 0.09375q0.21875 0.09375 0.375 0.25q0.15625 0.15625 0.234375 0.359375q0.09375 0.203125 0.09375 0.4375zm-1.9375 3.359375l-2.0625 0l0 -1.359375l3.84375 0l0 5.90625l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -4.546875zm12.2612 2.21875q0 0.859375 -0.25 1.578125q-0.234375 0.703125 -0.703125 1.21875q-0.453125 0.5 -1.125 0.78125q-0.65625 0.28125 -1.515625 0.28125q-0.796875 0 -1.4375 -0.234375q-0.640625 -0.25 -1.09375 -0.71875q-0.4375 -0.46875 -0.671875 -1.171875q-0.234375 -0.703125 -0.234375 -1.640625q0 -0.859375 0.25 -1.5625q0.25 -0.71875 0.703125 -1.21875q0.46875 -0.5 1.125 -0.765625q0.671875 -0.28125 1.5 -0.28125q0.828125 0 1.453125 0.234375q0.640625 0.234375 1.078125 0.71875q0.453125 0.46875 0.6875 1.171875q0.234375 0.6875 0.234375 1.609375zm-1.828125 0.03125q0 -1.125 -0.421875 -1.6875q-0.421875 -0.578125 -1.25 -0.578125q-0.46875 0 -0.796875 0.1875q-0.3125 0.171875 -0.515625 0.484375q-0.203125 0.3125 -0.3125 0.734375q-0.09375 0.40625 -0.09375 0.875q0 1.140625 0.453125 1.71875q0.46875 0.578125 1.265625 0.578125q0.4375 0 0.75 -0.171875q0.328125 -0.1875 0.53125 -0.484375q0.203125 -0.3125 0.296875 -0.734375q0.09375 -0.421875 0.09375 -0.921875zm7.66745 3.65625l0 -4.734375q0 -1.203125 -0.890625 -1.203125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 4.59375l-1.734375 0l0 -7.265625l1.5 0l0.046875 1.0625q0.21875 -0.265625 0.453125 -0.5q0.25 -0.234375 0.53125 -0.390625q0.28125 -0.15625 0.59375 -0.234375q0.328125 -0.09375 0.734375 -0.09375q0.546875 0 0.953125 0.1875q0.421875 0.171875 0.703125 0.515625q0.28125 0.328125 0.421875 0.796875q0.140625 0.46875 0.140625 1.046875l0 4.875l-1.75 0zm2.72995 2.9375l0 -1.390625l8.0625 0l0 1.390625l-8.0625 0zm15.589325 -6.703125q0 1.0 -0.28125 1.734375q-0.28125 0.734375 -0.78125 1.21875q-0.5 0.46875 -1.1875 0.703125q-0.6875 0.234375 -1.5 0.234375q-0.28125 0 -0.546875 -0.03125q-0.265625 -0.03125 -0.5 -0.09375l0 2.84375l-1.734375 0l0 -10.109375l1.5 0l0.046875 1.0625q0.21875 -0.265625 0.453125 -0.5q0.25 -0.234375 0.53125 -0.390625q0.28125 -0.15625 0.59375 -0.234375q0.328125 -0.09375 0.734375 -0.09375q0.640625 0 1.140625 0.265625q0.5 0.25 0.828125 0.71875q0.34375 0.46875 0.515625 1.15625q0.1875 0.671875 0.1875 1.515625zm-1.828125 0.078125q0 -0.609375 -0.09375 -1.03125q-0.078125 -0.421875 -0.25 -0.6875q-0.15625 -0.28125 -0.390625 -0.40625q-0.21875 -0.125 -0.53125 -0.125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 3.171875q0.21875 0.078125 0.515625 0.125q0.3125 0.046875 0.625 0.046875q0.40625 0 0.75 -0.171875q0.34375 -0.171875 0.578125 -0.484375q0.25 -0.3125 0.375 -0.765625q0.125 -0.453125 0.125 -1.015625zm7.901825 3.6875l-0.03125 -0.953125q-0.234375 0.25 -0.484375 0.453125q-0.25 0.203125 -0.5625 0.359375q-0.296875 0.140625 -0.65625 0.21875q-0.34375 0.09375 -0.765625 0.09375q-0.5625 0 -0.984375 -0.171875q-0.421875 -0.171875 -0.703125 -0.453125q-0.28125 -0.296875 -0.4375 -0.703125q-0.140625 -0.421875 -0.140625 -0.921875q0 -0.515625 0.21875 -0.953125q0.21875 -0.4375 0.65625 -0.75q0.453125 -0.3125 1.125 -0.484375q0.671875 -0.1875 1.5625 -0.1875l0.953125 0l0 -0.4375q0 -0.28125 -0.078125 -0.5q-0.078125 -0.234375 -0.25 -0.390625q-0.171875 -0.15625 -0.453125 -0.234375q-0.28125 -0.09375 -0.6875 -0.09375q-0.640625 0 -1.265625 0.15625q-0.625 0.140625 -1.21875 0.40625l0 -1.40625q0.53125 -0.203125 1.203125 -0.328125q0.6875 -0.140625 1.421875 -0.140625q0.8125 0 1.390625 0.15625q0.578125 0.140625 0.953125 0.453125q0.375 0.3125 0.546875 0.78125q0.1875 0.453125 0.1875 1.0625l0 4.96875l-1.5 0zm-0.25 -3.21875l-1.0625 0q-0.4375 0 -0.75 0.09375q-0.3125 0.078125 -0.5 0.234375q-0.1875 0.15625 -0.28125 0.359375q-0.09375 0.1875 -0.09375 0.40625q0 0.4375 0.28125 0.671875q0.296875 0.234375 0.78125 0.234375q0.375 0 0.765625 -0.265625q0.390625 -0.265625 0.859375 -0.75l0 -0.984375zm9.745575 1.078125q0 0.625 -0.28125 1.0625q-0.265625 0.4375 -0.71875 0.71875q-0.453125 0.28125 -1.03125 0.40625q-0.578125 0.125 -1.171875 0.125q-0.796875 0 -1.453125 -0.078125q-0.640625 -0.078125 -1.21875 -0.21875l0 -1.578125q0.6875 0.28125 1.34375 0.40625q0.671875 0.125 1.265625 0.125q0.6875 0 1.03125 -0.21875q0.34375 -0.21875 0.34375 -0.5625q0 -0.171875 -0.078125 -0.296875q-0.078125 -0.140625 -0.28125 -0.265625q-0.1875 -0.125 -0.546875 -0.25q-0.359375 -0.140625 -0.9375 -0.3125q-0.53125 -0.15625 -0.9375 -0.34375q-0.40625 -0.1875 -0.671875 -0.4375q-0.265625 -0.265625 -0.40625 -0.59375q-0.125 -0.34375 -0.125 -0.8125q0 -0.4375 0.203125 -0.828125q0.203125 -0.390625 0.59375 -0.6875q0.40625 -0.296875 1.0 -0.46875q0.59375 -0.171875 1.390625 -0.171875q0.6875 0 1.21875 0.078125q0.53125 0.0625 0.9375 0.140625l0 1.421875q-0.625 -0.1875 -1.171875 -0.265625q-0.546875 -0.09375 -1.09375 -0.09375q-0.53125 0 -0.859375 0.203125q-0.328125 0.1875 -0.328125 0.53125q0 0.15625 0.0625 0.296875q0.078125 0.125 0.265625 0.25q0.1875 0.109375 0.515625 0.25q0.34375 0.125 0.90625 0.28125q0.625 0.1875 1.046875 0.390625q0.4375 0.203125 0.703125 0.46875q0.265625 0.25 0.375 0.578125q0.109375 0.328125 0.109375 0.75zm8.058075 0q0 0.625 -0.28125 1.0625q-0.265625 0.4375 -0.71875 0.71875q-0.453125 0.28125 -1.03125 0.40625q-0.578125 0.125 -1.171875 0.125q-0.796875 0 -1.453125 -0.078125q-0.640625 -0.078125 -1.21875 -0.21875l0 -1.578125q0.6875 0.28125 1.34375 0.40625q0.671875 0.125 1.265625 0.125q0.6875 0 1.03125 -0.21875q0.34375 -0.21875 0.34375 -0.5625q0 -0.171875 -0.078125 -0.296875q-0.078125 -0.140625 -0.28125 -0.265625q-0.1875 -0.125 -0.546875 -0.25q-0.359375 -0.140625 -0.9375 -0.3125q-0.53125 -0.15625 -0.9375 -0.34375q-0.40625 -0.1875 -0.671875 -0.4375q-0.265625 -0.265625 -0.40625 -0.59375q-0.125 -0.34375 -0.125 -0.8125q0 -0.4375 0.203125 -0.828125q0.203125 -0.390625 0.59375 -0.6875q0.40625 -0.296875 1.0 -0.46875q0.59375 -0.171875 1.390625 -0.171875q0.6875 0 1.21875 0.078125q0.53125 0.0625 0.9375 0.140625l0 1.421875q-0.625 -0.1875 -1.171875 -0.265625q-0.546875 -0.09375 -1.09375 -0.09375q-0.53125 0 -0.859375 0.203125q-0.328125 0.1875 -0.328125 0.53125q0 0.15625 0.0625 0.296875q0.078125 0.125 0.265625 0.25q0.1875 0.109375 0.515625 0.25q0.34375 0.125 0.90625 0.28125q0.625 0.1875 1.046875 0.390625q0.4375 0.203125 0.703125 0.46875q0.265625 0.25 0.375 0.578125q0.109375 0.328125 0.109375 0.75zm6.35495 -7.125q0 0.234375 -0.09375 0.453125q-0.078125 0.203125 -0.234375 0.359375q-0.15625 0.15625 -0.375 0.25q-0.203125 0.078125 -0.4375 0.078125q-0.25 0 -0.46875 -0.078125q-0.203125 -0.09375 -0.359375 -0.25q-0.15625 -0.15625 -0.25 -0.359375q-0.078125 -0.21875 -0.078125 -0.453125q0 -0.234375 0.078125 -0.4375q0.09375 -0.203125 0.25 -0.359375q0.15625 -0.15625 0.359375 -0.25q0.21875 -0.09375 0.46875 -0.09375q0.234375 0 0.4375 0.09375q0.21875 0.09375 0.375 0.25q0.15625 0.15625 0.234375 0.359375q0.09375 0.203125 0.09375 0.4375zm-1.9375 3.359375l-2.0625 0l0 -1.359375l3.84375 0l0 5.90625l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -4.546875zm10.04245 5.90625l0 -4.734375q0 -1.203125 -0.890625 -1.203125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 4.59375l-1.734375 0l0 -7.265625l1.5 0l0.046875 1.0625q0.21875 -0.265625 0.453125 -0.5q0.25 -0.234375 0.53125 -0.390625q0.28125 -0.15625 0.59375 -0.234375q0.328125 -0.09375 0.734375 -0.09375q0.546875 0 0.953125 0.1875q0.421875 0.171875 0.703125 0.515625q0.28125 0.328125 0.421875 0.796875q0.140625 0.46875 0.140625 1.046875l0 4.875l-1.75 0zm9.183075 -6.0q0.1875 0.25 0.265625 0.53125q0.09375 0.265625 0.09375 0.5625q0 0.609375 -0.21875 1.078125q-0.203125 0.453125 -0.59375 0.78125q-0.375 0.3125 -0.90625 0.46875q-0.53125 0.15625 -1.171875 0.15625q-0.390625 0 -0.71875 -0.0625q-0.3125 -0.078125 -0.46875 -0.1875q-0.125 0.125 -0.21875 0.28125q-0.078125 0.140625 -0.078125 0.328125q0 0.109375 0.046875 0.234375q0.0625 0.109375 0.171875 0.203125q0.109375 0.078125 0.265625 0.140625q0.15625 0.046875 0.34375 0.0625l1.734375 0.0625q0.578125 0.015625 1.046875 0.15625q0.46875 0.125 0.796875 0.375q0.34375 0.234375 0.515625 0.578125q0.1875 0.34375 0.1875 0.796875q0 0.5 -0.234375 0.953125q-0.21875 0.453125 -0.6875 0.78125q-0.453125 0.34375 -1.140625 0.53125q-0.6875 0.1875 -1.59375 0.1875q-0.890625 0 -1.53125 -0.140625q-0.625 -0.125 -1.03125 -0.390625q-0.390625 -0.25 -0.59375 -0.59375q-0.1875 -0.34375 -0.1875 -0.765625q0 -0.25 0.0625 -0.46875q0.0625 -0.203125 0.1875 -0.40625q0.140625 -0.1875 0.328125 -0.375q0.1875 -0.1875 0.453125 -0.375q-0.34375 -0.203125 -0.53125 -0.515625q-0.1875 -0.3125 -0.1875 -0.671875q0 -0.25 0.0625 -0.453125q0.078125 -0.21875 0.171875 -0.40625q0.109375 -0.1875 0.234375 -0.359375q0.140625 -0.1875 0.296875 -0.359375q-0.265625 -0.265625 -0.4375 -0.640625q-0.171875 -0.375 -0.171875 -0.9375q0 -0.609375 0.21875 -1.078125q0.21875 -0.484375 0.59375 -0.8125q0.390625 -0.328125 0.921875 -0.484375q0.53125 -0.171875 1.171875 -0.171875q0.3125 0 0.609375 0.03125q0.296875 0.03125 0.546875 0.109375l2.515625 0l0 1.265625l-1.140625 0zm-4.125 6.96875q0 0.421875 0.421875 0.625q0.4375 0.203125 1.203125 0.203125q0.5 0 0.828125 -0.09375q0.34375 -0.09375 0.5625 -0.25q0.21875 -0.15625 0.3125 -0.359375q0.09375 -0.1875 0.09375 -0.40625q0 -0.1875 -0.09375 -0.328125q-0.09375 -0.125 -0.25 -0.21875q-0.15625 -0.09375 -0.375 -0.140625q-0.21875 -0.046875 -0.484375 -0.0625l-1.546875 -0.015625q-0.203125 0.140625 -0.34375 0.265625q-0.125 0.125 -0.203125 0.25q-0.0625 0.125 -0.09375 0.25q-0.03125 0.140625 -0.03125 0.28125zm0.296875 -5.828125q0 0.578125 0.34375 0.921875q0.34375 0.328125 0.953125 0.328125q0.3125 0 0.546875 -0.109375q0.25 -0.109375 0.40625 -0.28125q0.15625 -0.171875 0.234375 -0.40625q0.078125 -0.25 0.078125 -0.5q0 -0.609375 -0.34375 -0.953125q-0.328125 -0.34375 -0.9375 -0.34375q-0.328125 0 -0.578125 0.109375q-0.234375 0.109375 -0.390625 0.296875q-0.15625 0.171875 -0.234375 0.421875q-0.078125 0.25 -0.078125 0.515625zm5.433075 7.796875l0 -1.390625l8.0625 0l0 1.390625l-8.0625 0zm15.04245 -5.078125q0 0.625 -0.28125 1.0625q-0.265625 0.4375 -0.71875 0.71875q-0.453125 0.28125 -1.03125 0.40625q-0.578125 0.125 -1.171875 0.125q-0.796875 0 -1.453125 -0.078125q-0.640625 -0.078125 -1.21875 -0.21875l0 -1.578125q0.6875 0.28125 1.34375 0.40625q0.671875 0.125 1.265625 0.125q0.6875 0 1.03125 -0.21875q0.34375 -0.21875 0.34375 -0.5625q0 -0.171875 -0.078125 -0.296875q-0.078125 -0.140625 -0.28125 -0.265625q-0.1875 -0.125 -0.546875 -0.25q-0.359375 -0.140625 -0.9375 -0.3125q-0.53125 -0.15625 -0.9375 -0.34375q-0.40625 -0.1875 -0.671875 -0.4375q-0.265625 -0.265625 -0.40625 -0.59375q-0.125 -0.34375 -0.125 -0.8125q0 -0.4375 0.203125 -0.828125q0.203125 -0.390625 0.59375 -0.6875q0.40625 -0.296875 1.0 -0.46875q0.59375 -0.171875 1.390625 -0.171875q0.6875 0 1.21875 0.078125q0.53125 0.0625 0.9375 0.140625l0 1.421875q-0.625 -0.1875 -1.171875 -0.265625q-0.546875 -0.09375 -1.09375 -0.09375q-0.53125 0 -0.859375 0.203125q-0.328125 0.1875 -0.328125 0.53125q0 0.15625 0.0625 0.296875q0.078125 0.125 0.265625 0.25q0.1875 0.109375 0.515625 0.25q0.34375 0.125 0.90625 0.28125q0.625 0.1875 1.046875 0.390625q0.4375 0.203125 0.703125 0.46875q0.265625 0.25 0.375 0.578125q0.109375 0.328125 0.109375 0.75zm8.058075 2.0625q-0.46875 0.109375 -0.953125 0.171875q-0.46875 0.078125 -0.90625 0.078125q-0.703125 0 -1.234375 -0.15625q-0.515625 -0.15625 -0.859375 -0.453125q-0.328125 -0.3125 -0.5 -0.78125q-0.15625 -0.484375 -0.15625 -1.140625l0 -3.546875l-1.953125 0l0 -1.359375l1.953125 0l0 -1.859375l1.796875 -0.46875l0 2.328125l2.8125 0l0 1.359375l-2.8125 0l0 3.421875q0 0.609375 0.28125 0.9375q0.28125 0.3125 0.953125 0.3125q0.4375 0 0.84375 -0.0625q0.40625 -0.078125 0.734375 -0.171875l0 1.390625zm6.4487 -0.71875q-0.390625 0.953125 -0.796875 1.65625q-0.40625 0.71875 -0.90625 1.1875q-0.5 0.484375 -1.109375 0.71875q-0.59375 0.234375 -1.34375 0.234375q-0.1875 0 -0.40625 -0.015625q-0.203125 -0.015625 -0.421875 -0.046875l0 -1.484375q0.09375 0.015625 0.203125 0.03125q0.109375 0.015625 0.21875 0.03125q0.125 0.015625 0.25 0.015625q0.125 0 0.234375 0q0.3125 0 0.5625 -0.109375q0.265625 -0.109375 0.484375 -0.3125q0.21875 -0.203125 0.390625 -0.484375q0.171875 -0.28125 0.3125 -0.625l-2.875 -7.265625l1.9375 0l1.515625 4.109375l0.453125 1.34375l0.4375 -1.28125l1.53125 -4.171875l1.875 0l-2.546875 6.46875zm6.026825 -7.953125l-2.0625 0l0 -1.359375l3.84375 0l0 8.75l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -7.390625zm12.04245 4.5625q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m466.66595 28.276524l0 0l0 17.16098l0 0l0 -17.16098z" fill-rule="nonzero"/><path fill="#ffffff" d="m176.57526 40.1147l290.0907 0l0 1.3199997l-290.0907 0l0 -1.3199997z" fill-rule="nonzero"/><path fill="#34a853" d="m163.16655 377.49115l315.59058 0l0 29.039368l-315.59058 0z" fill-rule="evenodd"/><path stroke="#34a853" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m163.16655 377.49115l315.59058 0l0 29.039368l-315.59058 0z" fill-rule="evenodd"/><path fill="#ffffff" d="m226.40776 393.77835l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm9.16745 -0.40625q0 0.953125 -0.265625 1.671875q-0.265625 0.71875 -0.75 1.1875q-0.46875 0.46875 -1.109375 0.703125q-0.640625 0.234375 -1.390625 0.234375q-0.34375 0 -0.6875 -0.03125q-0.34375 -0.03125 -0.6875 -0.125l0 3.015625l-1.25 0l0 -10.125l1.109375 0l0.078125 1.203125q0.546875 -0.734375 1.15625 -1.03125q0.609375 -0.296875 1.3125 -0.296875q0.609375 0 1.078125 0.265625q0.46875 0.25 0.78125 0.71875q0.3125 0.46875 0.46875 1.140625q0.15625 0.65625 0.15625 1.46875zm-1.265625 0.046875q0 -0.5625 -0.09375 -1.03125q-0.078125 -0.46875 -0.265625 -0.796875q-0.171875 -0.34375 -0.453125 -0.53125q-0.265625 -0.1875 -0.65625 -0.1875q-0.234375 0 -0.484375 0.078125q-0.234375 0.0625 -0.5 0.25q-0.265625 0.171875 -0.5625 0.46875q-0.28125 0.28125 -0.609375 0.703125l0 3.484375q0.34375 0.140625 0.71875 0.234375q0.390625 0.078125 0.75 0.078125q1.0 0 1.578125 -0.6875q0.578125 -0.6875 0.578125 -2.0625zm3.47995 -3.515625l1.140625 0l0.03125 1.328125q0.640625 -0.765625 1.25 -1.109375q0.625 -0.34375 1.25 -0.34375q1.125 0 1.703125 0.71875q0.578125 0.71875 0.53125 2.15625l-1.265625 0q0.03125 -0.953125 -0.265625 -1.375q-0.296875 -0.421875 -0.875 -0.421875q-0.25 0 -0.515625 0.09375q-0.25 0.078125 -0.515625 0.28125q-0.265625 0.1875 -0.5625 0.5q-0.296875 0.3125 -0.640625 0.75l0 4.609375l-1.265625 0l0 -7.1875zm14.026825 3.53125q0 0.84375 -0.25 1.546875q-0.234375 0.6875 -0.671875 1.1875q-0.4375 0.5 -1.078125 0.78125q-0.640625 0.265625 -1.453125 0.265625q-0.765625 0 -1.390625 -0.234375q-0.609375 -0.234375 -1.03125 -0.703125q-0.421875 -0.46875 -0.65625 -1.15625q-0.21875 -0.6875 -0.21875 -1.578125q0 -0.84375 0.234375 -1.53125q0.234375 -0.6875 0.671875 -1.1875q0.453125 -0.5 1.09375 -0.765625q0.640625 -0.28125 1.4375 -0.28125q0.78125 0 1.390625 0.25q0.609375 0.234375 1.03125 0.703125q0.4375 0.453125 0.65625 1.140625q0.234375 0.6875 0.234375 1.5625zm-1.28125 0.0625q0 -0.671875 -0.15625 -1.15625q-0.140625 -0.5 -0.421875 -0.828125q-0.265625 -0.34375 -0.65625 -0.5q-0.375 -0.171875 -0.859375 -0.171875q-0.5625 0 -0.96875 0.21875q-0.390625 0.21875 -0.640625 0.578125q-0.25 0.359375 -0.375 0.84375q-0.109375 0.484375 -0.109375 1.015625q0 0.671875 0.140625 1.171875q0.140625 0.5 0.40625 0.828125q0.28125 0.328125 0.671875 0.5q0.390625 0.171875 0.875 0.171875q0.5625 0 0.953125 -0.21875q0.390625 -0.21875 0.640625 -0.578125q0.265625 -0.375 0.375 -0.859375q0.125 -0.484375 0.125 -1.015625zm8.198715 3.59375l0 -5.15625q0 -0.34375 -0.03125 -0.546875q-0.015625 -0.21875 -0.078125 -0.34375q-0.046875 -0.125 -0.125 -0.171875q-0.078125 -0.0625 -0.203125 -0.0625q-0.140625 0 -0.265625 0.09375q-0.125 0.078125 -0.265625 0.28125q-0.140625 0.1875 -0.3125 0.515625q-0.15625 0.3125 -0.390625 0.78125l0 4.609375l-1.1406403 0l0 -5.03125q0 -0.390625 -0.03125 -0.625q-0.015625 -0.25 -0.078125 -0.375q-0.046875 -0.140625 -0.140625 -0.1875q-0.078125 -0.0625 -0.203125 -0.0625q-0.125 0 -0.25 0.078125q-0.109375 0.0625 -0.25 0.25q-0.125 0.1875 -0.296875 0.515625q-0.171875 0.328125 -0.40625 0.828125l0 4.609375l-1.15625 0l0 -7.1875l0.953125 0l0.0625 1.375q0.1875 -0.40625 0.359375 -0.6875q0.171875 -0.296875 0.359375 -0.46875q0.1875 -0.1875 0.390625 -0.265625q0.21875 -0.078125 0.46875 -0.078125q0.59376526 0 0.89064026 0.375q0.296875 0.375 0.296875 1.171875q0.171875 -0.375 0.328125 -0.65625q0.171875 -0.296875 0.359375 -0.484375q0.1875 -0.203125 0.421875 -0.296875q0.234375 -0.109375 0.53125 -0.109375q1.34375 0 1.34375 2.078125l0 5.234375l-1.140625 0zm9.1987 -3.65625q0 0.84375 -0.25 1.546875q-0.234375 0.6875 -0.671875 1.1875q-0.4375 0.5 -1.078125 0.78125q-0.640625 0.265625 -1.453125 0.265625q-0.765625 0 -1.390625 -0.234375q-0.609375 -0.234375 -1.03125 -0.703125q-0.421875 -0.46875 -0.65625 -1.15625q-0.21875 -0.6875 -0.21875 -1.578125q0 -0.84375 0.234375 -1.53125q0.234375 -0.6875 0.671875 -1.1875q0.453125 -0.5 1.09375 -0.765625q0.640625 -0.28125 1.4375 -0.28125q0.78125 0 1.390625 0.25q0.609375 0.234375 1.03125 0.703125q0.4375 0.453125 0.65625 1.140625q0.234375 0.6875 0.234375 1.5625zm-1.28125 0.0625q0 -0.671875 -0.15625 -1.15625q-0.140625 -0.5 -0.421875 -0.828125q-0.265625 -0.34375 -0.65625 -0.5q-0.375 -0.171875 -0.859375 -0.171875q-0.5625 0 -0.96875 0.21875q-0.390625 0.21875 -0.640625 0.578125q-0.25 0.359375 -0.375 0.84375q-0.109375 0.484375 -0.109375 1.015625q0 0.671875 0.140625 1.171875q0.140625 0.5 0.40625 0.828125q0.28125 0.328125 0.671875 0.5q0.390625 0.171875 0.875 0.171875q0.5625 0 0.953125 -0.21875q0.390625 -0.21875 0.640625 -0.578125q0.265625 -0.375 0.375 -0.859375q0.125 -0.484375 0.125 -1.015625zm8.870575 3.5q-0.421875 0.09375 -0.875 0.140625q-0.453125 0.046875 -0.921875 0.046875q-1.34375 0 -2.015625 -0.609375q-0.65625 -0.609375 -0.65625 -1.875l0 -3.75l-2.015625 0l0 -1.046875l2.015625 0l0 -1.96875l1.234375 -0.328125l0 2.296875l3.234375 0l0 1.046875l-3.234375 0l0 3.65625q0 0.765625 0.40625 1.15625q0.421875 0.375 1.21875 0.375q0.34375 0 0.75 -0.046875q0.40625 -0.0625 0.859375 -0.171875l0 1.078125zm8.339325 -3.875q0 0.265625 -0.015625 0.453125q0 0.171875 -0.015625 0.328125l-5.046875 0q0 1.09375 0.609375 1.6875q0.625 0.59375 1.78125 0.59375q0.3125 0 0.625 -0.015625q0.3125 -0.03125 0.609375 -0.078125q0.296875 -0.046875 0.5625 -0.09375q0.265625 -0.0625 0.5 -0.125l0 1.03125q-0.515625 0.140625 -1.15625 0.21875q-0.640625 0.09375 -1.328125 0.09375q-0.921875 0 -1.59375 -0.25q-0.65625 -0.25 -1.078125 -0.71875q-0.421875 -0.484375 -0.625 -1.171875q-0.203125 -0.6875 -0.203125 -1.5625q0 -0.765625 0.21875 -1.4375q0.21875 -0.671875 0.625 -1.1875q0.421875 -0.515625 1.03125 -0.8125q0.609375 -0.296875 1.375 -0.296875q0.765625 0 1.34375 0.234375q0.578125 0.234375 0.96875 0.671875q0.40625 0.4375 0.609375 1.0625q0.203125 0.609375 0.203125 1.375zm-1.296875 -0.1875q0.015625 -0.46875 -0.109375 -0.859375q-0.109375 -0.40625 -0.34375 -0.6875q-0.234375 -0.296875 -0.59375 -0.453125q-0.359375 -0.15625 -0.828125 -0.15625q-0.40625 0 -0.75 0.15625q-0.328125 0.15625 -0.578125 0.4375q-0.25 0.28125 -0.40625 0.6875q-0.140625 0.40625 -0.171875 0.875l3.78125 0zm8.308075 0.84375l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm9.16745 -0.40625q0 0.890625 -0.25 1.59375q-0.234375 0.703125 -0.703125 1.203125q-0.453125 0.484375 -1.109375 0.75q-0.640625 0.25 -1.453125 0.25q-0.640625 0 -1.3125 -0.109375q-0.65625 -0.125 -1.3125 -0.390625l0 -9.6875l1.25 0l0 2.78125l-0.0625 1.328125q0.546875 -0.71875 1.15625 -1.015625q0.609375 -0.296875 1.3125 -0.296875q0.609375 0 1.078125 0.265625q0.46875 0.25 0.78125 0.71875q0.3125 0.46875 0.46875 1.140625q0.15625 0.65625 0.15625 1.46875zm-1.265625 0.046875q0 -0.5625 -0.09375 -1.03125q-0.078125 -0.46875 -0.265625 -0.796875q-0.171875 -0.34375 -0.453125 -0.53125q-0.265625 -0.1875 -0.65625 -0.1875q-0.234375 0 -0.484375 0.078125q-0.234375 0.0625 -0.5 0.25q-0.265625 0.171875 -0.5625 0.46875q-0.28125 0.28125 -0.609375 0.703125l0 3.484375q0.359375 0.140625 0.734375 0.234375q0.390625 0.078125 0.734375 0.078125q0.421875 0 0.8125 -0.125q0.390625 -0.140625 0.6875 -0.453125q0.296875 -0.328125 0.46875 -0.859375q0.1875 -0.53125 0.1875 -1.3125zm8.964325 3.671875l-1.125 0l-0.03125 -1.15625q-0.328125 0.375 -0.625 0.625q-0.28125 0.234375 -0.578125 0.390625q-0.28125 0.140625 -0.578125 0.203125q-0.28125 0.0625 -0.59375 0.0625q-1.109375 0 -1.6875 -0.640625q-0.5625 -0.65625 -0.5625 -1.96875l0 -4.703125l1.25 0l0 4.59375q0 1.65625 1.25 1.65625q0.21875 0 0.4375 -0.0625q0.21875 -0.078125 0.453125 -0.234375q0.25 -0.171875 0.515625 -0.453125q0.28125 -0.296875 0.625 -0.734375l0 -4.765625l1.25 0l0 7.1875zm8.651825 -8.953125q-0.96875 -0.203125 -1.6875 -0.203125q-1.671875 0 -1.671875 1.75l0 1.265625l3.140625 0l0 1.03125l-3.140625 0l0 5.109375l-1.265625 0l0 -5.109375l-2.3125 0l0 -1.03125l2.3125 0l0 -1.1875q0 -2.875 2.984375 -2.875q0.75 0 1.640625 0.171875l0 1.078125zm-7.515625 1.765625l0 0zm15.5737 -1.765625q-0.96875 -0.203125 -1.6875 -0.203125q-1.671875 0 -1.671875 1.75l0 1.265625l3.140625 0l0 1.03125l-3.140625 0l0 5.109375l-1.265625 0l0 -5.109375l-2.3125 0l0 -1.03125l2.3125 0l0 -1.1875q0 -2.875 2.984375 -2.875q0.75 0 1.640625 0.171875l0 1.078125zm-7.515625 1.765625l0 0zm15.276825 3.21875q0 0.265625 -0.015625 0.453125q0 0.171875 -0.015625 0.328125l-5.046875 0q0 1.09375 0.609375 1.6875q0.625 0.59375 1.78125 0.59375q0.3125 0 0.625 -0.015625q0.3125 -0.03125 0.609375 -0.078125q0.296875 -0.046875 0.5625 -0.09375q0.265625 -0.0625 0.5 -0.125l0 1.03125q-0.515625 0.140625 -1.15625 0.21875q-0.640625 0.09375 -1.328125 0.09375q-0.921875 0 -1.59375 -0.25q-0.65625 -0.25 -1.078125 -0.71875q-0.421875 -0.484375 -0.625 -1.171875q-0.203125 -0.6875 -0.203125 -1.5625q0 -0.765625 0.21875 -1.4375q0.21875 -0.671875 0.625 -1.1875q0.421875 -0.515625 1.03125 -0.8125q0.609375 -0.296875 1.375 -0.296875q0.765625 0 1.34375 0.234375q0.578125 0.234375 0.96875 0.671875q0.40625 0.4375 0.609375 1.0625q0.203125 0.609375 0.203125 1.375zm-1.296875 -0.1875q0.015625 -0.46875 -0.109375 -0.859375q-0.109375 -0.40625 -0.34375 -0.6875q-0.234375 -0.296875 -0.59375 -0.453125q-0.359375 -0.15625 -0.828125 -0.15625q-0.40625 0 -0.75 0.15625q-0.328125 0.15625 -0.578125 0.4375q-0.25 0.28125 -0.40625 0.6875q-0.140625 0.40625 -0.171875 0.875l3.78125 0zm3.5737 -3.03125l1.140625 0l0.03125 1.328125q0.640625 -0.765625 1.25 -1.109375q0.625 -0.34375 1.25 -0.34375q1.125 0 1.703125 0.71875q0.578125 0.71875 0.53125 2.15625l-1.265625 0q0.03125 -0.953125 -0.265625 -1.375q-0.296875 -0.421875 -0.875 -0.421875q-0.25 0 -0.515625 0.09375q-0.25 0.078125 -0.515625 0.28125q-0.265625 0.1875 -0.5625 0.5q-0.296875 0.3125 -0.640625 0.75l0 4.609375l-1.265625 0l0 -7.1875zm13.4487 5.234375q0 0.375 -0.125 0.671875q-0.125 0.296875 -0.34375 0.53125q-0.21875 0.234375 -0.515625 0.40625q-0.296875 0.15625 -0.640625 0.265625q-0.328125 0.109375 -0.6875 0.15625q-0.34375 0.046875 -0.671875 0.046875q-0.734375 0 -1.34375 -0.0625q-0.609375 -0.0625 -1.203125 -0.203125l0 -1.140625q0.640625 0.171875 1.25 0.265625q0.625 0.09375 1.25 0.09375q0.890625 0 1.3125 -0.234375q0.4375 -0.25 0.4375 -0.703125q0 -0.1875 -0.078125 -0.34375q-0.0625 -0.15625 -0.25 -0.296875q-0.171875 -0.140625 -0.546875 -0.28125q-0.375 -0.15625 -1.015625 -0.359375q-0.5 -0.140625 -0.90625 -0.3125q-0.40625 -0.1875 -0.71875 -0.4375q-0.296875 -0.25 -0.46875 -0.578125q-0.171875 -0.34375 -0.171875 -0.8125q0 -0.296875 0.140625 -0.65625q0.140625 -0.359375 0.46875 -0.65625q0.34375 -0.3125 0.921875 -0.515625q0.578125 -0.203125 1.421875 -0.203125q0.421875 0 0.9375 0.046875q0.515625 0.046875 1.078125 0.15625l0 1.109375q-0.578125 -0.140625 -1.109375 -0.203125q-0.53125 -0.078125 -0.90625 -0.078125q-0.46875 0 -0.796875 0.078125q-0.3125 0.0625 -0.515625 0.203125q-0.1875 0.125 -0.28125 0.296875q-0.078125 0.15625 -0.078125 0.34375q0 0.203125 0.0625 0.359375q0.078125 0.15625 0.28125 0.3125q0.21875 0.140625 0.578125 0.28125q0.359375 0.140625 0.953125 0.3125q0.640625 0.1875 1.0625 0.390625q0.4375 0.203125 0.703125 0.46875q0.28125 0.25 0.390625 0.5625q0.125 0.3125 0.125 0.71875zm7.401825 -1.359375l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm8.8237 3.21875q-0.421875 0.09375 -0.875 0.140625q-0.453125 0.046875 -0.921875 0.046875q-1.34375 0 -2.015625 -0.609375q-0.65625 -0.609375 -0.65625 -1.875l0 -3.75l-2.015625 0l0 -1.046875l2.015625 0l0 -1.96875l1.234375 -0.328125l0 2.296875l3.234375 0l0 1.046875l-3.234375 0l0 3.65625q0 0.765625 0.40625 1.15625q0.421875 0.375 1.21875 0.375q0.34375 0 0.75 -0.046875q0.40625 -0.0625 0.859375 -0.171875l0 1.078125zm8.526825 -3.5625q0 0.84375 -0.25 1.546875q-0.234375 0.6875 -0.671875 1.1875q-0.4375 0.5 -1.078125 0.78125q-0.640625 0.265625 -1.453125 0.265625q-0.765625 0 -1.390625 -0.234375q-0.609375 -0.234375 -1.03125 -0.703125q-0.421875 -0.46875 -0.65625 -1.15625q-0.21875 -0.6875 -0.21875 -1.578125q0 -0.84375 0.234375 -1.53125q0.234375 -0.6875 0.671875 -1.1875q0.453125 -0.5 1.09375 -0.765625q0.640625 -0.28125 1.4375 -0.28125q0.78125 0 1.390625 0.25q0.609375 0.234375 1.03125 0.703125q0.4375 0.453125 0.65625 1.140625q0.234375 0.6875 0.234375 1.5625zm-1.28125 0.0625q0 -0.671875 -0.15625 -1.15625q-0.140625 -0.5 -0.421875 -0.828125q-0.265625 -0.34375 -0.65625 -0.5q-0.375 -0.171875 -0.859375 -0.171875q-0.5625 0 -0.96875 0.21875q-0.390625 0.21875 -0.640625 0.578125q-0.25 0.359375 -0.375 0.84375q-0.109375 0.484375 -0.109375 1.015625q0 0.671875 0.140625 1.171875q0.140625 0.5 0.40625 0.828125q0.28125 0.328125 0.671875 0.5q0.390625 0.171875 0.875 0.171875q0.5625 0 0.953125 -0.21875q0.390625 -0.21875 0.640625 -0.578125q0.265625 -0.375 0.375 -0.859375q0.125 -0.484375 0.125 -1.015625zm8.10495 0.28125l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm8.714325 1.359375q0 0.375 -0.125 0.671875q-0.125 0.296875 -0.34375 0.53125q-0.21875 0.234375 -0.515625 0.40625q-0.296875 0.15625 -0.640625 0.265625q-0.328125 0.109375 -0.6875 0.15625q-0.34375 0.046875 -0.671875 0.046875q-0.734375 0 -1.34375 -0.0625q-0.609375 -0.0625 -1.203125 -0.203125l0 -1.140625q0.640625 0.171875 1.25 0.265625q0.625 0.09375 1.25 0.09375q0.890625 0 1.3125 -0.234375q0.4375 -0.25 0.4375 -0.703125q0 -0.1875 -0.078125 -0.34375q-0.0625 -0.15625 -0.25 -0.296875q-0.171875 -0.140625 -0.546875 -0.28125q-0.375 -0.15625 -1.015625 -0.359375q-0.5 -0.140625 -0.90625 -0.3125q-0.40625 -0.1875 -0.71875 -0.4375q-0.296875 -0.25 -0.46875 -0.578125q-0.171875 -0.34375 -0.171875 -0.8125q0 -0.296875 0.140625 -0.65625q0.140625 -0.359375 0.46875 -0.65625q0.34375 -0.3125 0.921875 -0.515625q0.578125 -0.203125 1.421875 -0.203125q0.421875 0 0.9375 0.046875q0.515625 0.046875 1.078125 0.15625l0 1.109375q-0.578125 -0.140625 -1.109375 -0.203125q-0.53125 -0.078125 -0.90625 -0.078125q-0.46875 0 -0.796875 0.078125q-0.3125 0.0625 -0.515625 0.203125q-0.1875 0.125 -0.28125 0.296875q-0.078125 0.15625 -0.078125 0.34375q0 0.203125 0.0625 0.359375q0.078125 0.15625 0.28125 0.3125q0.21875 0.140625 0.578125 0.28125q0.359375 0.140625 0.953125 0.3125q0.640625 0.1875 1.0625 0.390625q0.4375 0.203125 0.703125 0.46875q0.28125 0.25 0.390625 0.5625q0.125 0.3125 0.125 0.71875zm8.16745 1.859375q-0.421875 0.09375 -0.875 0.140625q-0.453125 0.046875 -0.921875 0.046875q-1.34375 0 -2.015625 -0.609375q-0.65625 -0.609375 -0.65625 -1.875l0 -3.75l-2.015625 0l0 -1.046875l2.015625 0l0 -1.96875l1.234375 -0.328125l0 2.296875l3.234375 0l0 1.046875l-3.234375 0l0 3.65625q0 0.765625 0.40625 1.15625q0.421875 0.375 1.21875 0.375q0.34375 0 0.75 -0.046875q0.40625 -0.0625 0.859375 -0.171875l0 1.078125zm6.8862 0.09375l-0.03125 -0.96875q-0.59375 0.578125 -1.203125 0.84375q-0.59375 0.25 -1.265625 0.25q-0.609375 0 -1.046875 -0.15625q-0.4375 -0.15625 -0.71875 -0.421875q-0.28125 -0.28125 -0.421875 -0.65625q-0.125 -0.375 -0.125 -0.8125q0 -1.078125 0.796875 -1.6875q0.8125 -0.609375 2.390625 -0.609375l1.484375 0l0 -0.640625q0 -0.625 -0.40625 -1.0q-0.40625 -0.390625 -1.25 -0.390625q-0.609375 0 -1.203125 0.140625q-0.578125 0.125 -1.21875 0.375l0 -1.125q0.25 -0.078125 0.53125 -0.15625q0.296875 -0.09375 0.609375 -0.15625q0.328125 -0.0625 0.671875 -0.09375q0.359375 -0.046875 0.71875 -0.046875q0.640625 0 1.15625 0.140625q0.53125 0.140625 0.890625 0.4375q0.359375 0.296875 0.546875 0.75q0.203125 0.4375 0.203125 1.03125l0 4.953125l-1.109375 0zm-0.140625 -3.265625l-1.578125 0q-0.46875 0 -0.8125 0.09375q-0.328125 0.09375 -0.546875 0.265625q-0.21875 0.171875 -0.328125 0.40625q-0.09375 0.234375 -0.09375 0.546875q0 0.203125 0.0625 0.390625q0.0625 0.1875 0.203125 0.34375q0.15625 0.140625 0.375 0.234375q0.234375 0.078125 0.5625 0.078125q0.4375 0 0.984375 -0.265625q0.5625 -0.265625 1.171875 -0.828125l0 -1.265625zm9.16745 3.0q-0.5 0.1875 -1.015625 0.265625q-0.5 0.09375 -1.046875 0.09375q-1.703125 0 -2.625 -0.921875q-0.921875 -0.921875 -0.921875 -2.6875q0 -0.859375 0.265625 -1.546875q0.265625 -0.703125 0.75 -1.1875q0.484375 -0.5 1.140625 -0.765625q0.671875 -0.265625 1.46875 -0.265625q0.5625 0 1.046875 0.078125q0.484375 0.078125 0.9375 0.25l0 1.1875q-0.46875 -0.234375 -0.953125 -0.34375q-0.484375 -0.109375 -1.0 -0.109375q-0.484375 0 -0.90625 0.1875q-0.421875 0.171875 -0.75 0.515625q-0.3125 0.34375 -0.5 0.84375q-0.1875 0.484375 -0.1875 1.109375q0 1.296875 0.625 1.953125q0.640625 0.640625 1.765625 0.640625q0.5 0 0.96875 -0.109375q0.484375 -0.109375 0.9375 -0.34375l0 1.15625zm8.933075 0.265625l-1.71875 0l-3.34375 -3.84375l0 3.84375l-1.25 0l0 -10.109375l1.25 0l0 6.203125l3.234375 -3.28125l1.640625 0l-3.375 3.3125l3.5625 3.875z" fill-rule="nonzero"/><path fill="#fbbc05" d="m163.16655 431.26l315.59058 0l0 29.039368l-315.59058 0z" fill-rule="evenodd"/><path stroke="#fbbc05" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m163.16655 431.26l315.59058 0l0 29.039368l-315.59058 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m204.11974 439.97485l233.68419 0l0 17.16098l-233.68419 0l0 -17.16098z" fill-rule="nonzero"/><path fill="#000000" d="m205.91661 447.7347l0 -1.546875l4.453125 0l0 1.546875l-4.453125 0zm13.79245 -0.640625q0 1.0 -0.28125 1.734375q-0.28125 0.734375 -0.78125 1.21875q-0.5 0.46875 -1.1875 0.703125q-0.6875 0.234375 -1.5 0.234375q-0.796875 0 -1.484375 -0.125q-0.671875 -0.109375 -1.296875 -0.3125l0 -9.796875l1.734375 0l0 2.375l-0.0625 1.421875q0.390625 -0.5 0.90625 -0.796875q0.53125 -0.3125 1.28125 -0.3125q0.640625 0 1.140625 0.265625q0.5 0.25 0.828125 0.71875q0.34375 0.46875 0.515625 1.15625q0.1875 0.671875 0.1875 1.515625zm-1.828125 0.078125q0 -0.609375 -0.09375 -1.03125q-0.078125 -0.421875 -0.25 -0.6875q-0.15625 -0.28125 -0.390625 -0.40625q-0.21875 -0.125 -0.53125 -0.125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 3.171875q0.21875 0.078125 0.515625 0.125q0.3125 0.046875 0.625 0.046875q0.40625 0 0.75 -0.171875q0.34375 -0.171875 0.578125 -0.484375q0.25 -0.3125 0.375 -0.765625q0.125 -0.453125 0.125 -1.015625zm5.089325 -3.578125l0 4.734375q0 0.59375 0.21875 0.90625q0.21875 0.296875 0.671875 0.296875q0.4375 0 0.828125 -0.359375q0.40625 -0.359375 0.859375 -0.984375l0 -4.59375l1.75 0l0 7.265625l-1.515625 0l-0.03125 -1.078125q-0.234375 0.28125 -0.46875 0.515625q-0.234375 0.21875 -0.515625 0.390625q-0.265625 0.15625 -0.59375 0.25q-0.328125 0.09375 -0.71875 0.09375q-0.5625 0 -0.984375 -0.1875q-0.421875 -0.1875 -0.703125 -0.515625q-0.265625 -0.34375 -0.40625 -0.8125q-0.140625 -0.46875 -0.140625 -1.046875l0 -4.875l1.75 0zm12.839325 -1.359375q-0.3125 -0.109375 -0.75 -0.171875q-0.421875 -0.0625 -0.875 -0.0625q-0.3125 0 -0.578125 0.078125q-0.265625 0.078125 -0.453125 0.265625q-0.1875 0.171875 -0.3125 0.453125q-0.109375 0.265625 -0.109375 0.65625l0 1.1875l2.859375 0l0 1.359375l-2.859375 0l0 4.859375l-1.75 0l0 -4.859375l-2.109375 0l0 -1.359375l2.109375 0l0 -1.109375q0 -0.765625 0.21875 -1.3125q0.234375 -0.5625 0.640625 -0.921875q0.40625 -0.359375 0.984375 -0.515625q0.578125 -0.171875 1.296875 -0.171875q0.484375 0 0.90625 0.0625q0.421875 0.0625 0.78125 0.140625l0 1.421875zm-7.515625 1.359375l0 0zm15.5737 -1.359375q-0.3125 -0.109375 -0.75 -0.171875q-0.421875 -0.0625 -0.875 -0.0625q-0.3125 0 -0.578125 0.078125q-0.265625 0.078125 -0.453125 0.265625q-0.1875 0.171875 -0.3125 0.453125q-0.109375 0.265625 -0.109375 0.65625l0 1.1875l2.859375 0l0 1.359375l-2.859375 0l0 4.859375l-1.75 0l0 -4.859375l-2.109375 0l0 -1.359375l2.109375 0l0 -1.109375q0 -0.765625 0.21875 -1.3125q0.234375 -0.5625 0.640625 -0.921875q0.40625 -0.359375 0.984375 -0.515625q0.578125 -0.171875 1.296875 -0.171875q0.484375 0 0.90625 0.0625q0.421875 0.0625 0.78125 0.140625l0 1.421875zm-7.515625 1.359375l0 0zm15.3862 3.078125q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0zm8.18309 0q0.015625 -0.421875 -0.046875 -0.703125q-0.046875 -0.28125 -0.171875 -0.46875q-0.109375 -0.1875 -0.28125 -0.265625q-0.171875 -0.09375 -0.390625 -0.09375q-0.390625 0 -0.796875 0.328125q-0.40625 0.3125 -0.90626526 1.046875l0 4.59375l-1.796875 0l0 -7.265625l1.59375 0l0.0625 1.046875q0.171875 -0.265625 0.390625 -0.484375q0.23439026 -0.234375 0.50001526 -0.390625q0.28125 -0.15625 0.609375 -0.234375q0.34375 -0.09375 0.75 -0.09375q0.546875 0 0.96875 0.1875q0.4375 0.1875 0.71875 0.5625q0.296875 0.359375 0.4375 0.921875q0.140625 0.5625 0.109375 1.3125l-1.75 0zm4.183075 1.3125l0 -1.546875l4.453125 0l0 1.546875l-4.453125 0zm11.8237 3.125l-0.03125 -1.078125q-0.234375 0.28125 -0.484375 0.515625q-0.234375 0.21875 -0.515625 0.390625q-0.28125 0.15625 -0.609375 0.25q-0.3125 0.09375 -0.6875 0.09375q-0.65625 0 -1.15625 -0.25q-0.5 -0.265625 -0.84375 -0.734375q-0.328125 -0.484375 -0.515625 -1.15625q-0.171875 -0.6875 -0.171875 -1.53125q0 -1.0 0.28125 -1.734375q0.28125 -0.734375 0.78125 -1.21875q0.5 -0.484375 1.1875 -0.71875q0.6875 -0.234375 1.5 -0.234375q0.265625 0 0.53125 0.046875q0.28125 0.03125 0.5 0.09375l0 -2.84375l1.75 0l0 10.109375l-1.515625 0zm-3.1875 -3.578125q0 0.59375 0.078125 1.03125q0.09375 0.421875 0.25 0.703125q0.15625 0.265625 0.390625 0.390625q0.234375 0.125 0.546875 0.125q0.4375 0 0.828125 -0.359375q0.40625 -0.359375 0.859375 -0.984375l0 -3.171875q-0.203125 -0.078125 -0.515625 -0.125q-0.296875 -0.0625 -0.609375 -0.0625q-0.421875 0 -0.765625 0.171875q-0.328125 0.171875 -0.5625 0.5q-0.234375 0.3125 -0.375 0.765625q-0.125 0.453125 -0.125 1.015625zm13.0112 -0.609375q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0zm8.058075 4.4375l-0.03125 -0.953125q-0.234375 0.25 -0.484375 0.453125q-0.25 0.203125 -0.5625 0.359375q-0.296875 0.140625 -0.65625 0.21875q-0.34375 0.09375 -0.765625 0.09375q-0.5625 0 -0.984375 -0.171875q-0.421875 -0.171875 -0.703125 -0.453125q-0.28125 -0.296875 -0.4375 -0.703125q-0.140625 -0.421875 -0.140625 -0.921875q0 -0.515625 0.21875 -0.953125q0.21875 -0.4375 0.65625 -0.75q0.453125 -0.3125 1.125 -0.484375q0.671875 -0.1875 1.5625 -0.1875l0.953125 0l0 -0.4375q0 -0.28125 -0.078125 -0.5q-0.078125 -0.234375 -0.25 -0.390625q-0.171875 -0.15625 -0.453125 -0.234375q-0.28125 -0.09375 -0.6875 -0.09375q-0.640625 0 -1.265625 0.15625q-0.625 0.140625 -1.21875 0.40625l0 -1.40625q0.53125 -0.203125 1.203125 -0.328125q0.6875 -0.140625 1.421875 -0.140625q0.8125 0 1.390625 0.15625q0.578125 0.140625 0.953125 0.453125q0.375 0.3125 0.546875 0.78125q0.1875 0.453125 0.1875 1.0625l0 4.96875l-1.5 0zm-0.25 -3.21875l-1.0625 0q-0.4375 0 -0.75 0.09375q-0.3125 0.078125 -0.5 0.234375q-0.1875 0.15625 -0.28125 0.359375q-0.09375 0.1875 -0.09375 0.40625q0 0.4375 0.28125 0.671875q0.296875 0.234375 0.78125 0.234375q0.375 0 0.765625 -0.265625q0.390625 -0.265625 0.859375 -0.75l0 -0.984375zm6.10495 -5.53125l-2.0625 0l0 -1.359375l3.84375 0l0 8.75l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -7.390625zm8.058075 0l-2.0625 0l0 -1.359375l3.84375 0l0 8.75l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -7.390625zm12.2612 5.0625q0 0.859375 -0.25 1.578125q-0.234375 0.703125 -0.703125 1.21875q-0.453125 0.5 -1.125 0.78125q-0.65625 0.28125 -1.515625 0.28125q-0.796875 0 -1.4375 -0.234375q-0.640625 -0.25 -1.09375 -0.71875q-0.4375 -0.46875 -0.671875 -1.171875q-0.234375 -0.703125 -0.234375 -1.640625q0 -0.859375 0.25 -1.5625q0.25 -0.71875 0.703125 -1.21875q0.46875 -0.5 1.125 -0.765625q0.671875 -0.28125 1.5 -0.28125q0.828125 0 1.453125 0.234375q0.640625 0.234375 1.078125 0.71875q0.453125 0.46875 0.6875 1.171875q0.234375 0.6875 0.234375 1.609375zm-1.828125 0.03125q0 -1.125 -0.421875 -1.6875q-0.421875 -0.578125 -1.25 -0.578125q-0.46875 0 -0.796875 0.1875q-0.3125 0.171875 -0.515625 0.484375q-0.203125 0.3125 -0.3125 0.734375q-0.09375 0.40625 -0.09375 0.875q0 1.140625 0.453125 1.71875q0.46875 0.578125 1.265625 0.578125q0.4375 0 0.75 -0.171875q0.328125 -0.1875 0.53125 -0.484375q0.203125 -0.3125 0.296875 -0.734375q0.09375 -0.421875 0.09375 -0.921875zm9.214325 3.375q-0.515625 0.203125 -1.0625 0.296875q-0.546875 0.109375 -1.15625 0.109375q-0.859375 0 -1.546875 -0.234375q-0.6875 -0.234375 -1.171875 -0.703125q-0.484375 -0.46875 -0.734375 -1.140625q-0.25 -0.6875 -0.25 -1.59375q0 -0.875 0.265625 -1.578125q0.28125 -0.71875 0.78125 -1.21875q0.5 -0.5 1.203125 -0.765625q0.71875 -0.28125 1.578125 -0.28125q0.703125 0 1.1875 0.078125q0.484375 0.078125 0.875 0.1875l0 1.703125q-0.453125 -0.21875 -0.96875 -0.34375q-0.515625 -0.125 -0.984375 -0.125q-0.484375 0 -0.875 0.171875q-0.375 0.15625 -0.640625 0.453125q-0.265625 0.296875 -0.40625 0.71875q-0.140625 0.40625 -0.140625 0.90625q0 0.53125 0.140625 0.953125q0.15625 0.40625 0.421875 0.703125q0.28125 0.28125 0.671875 0.4375q0.390625 0.15625 0.859375 0.15625q0.234375 0 0.484375 -0.03125q0.25 -0.03125 0.5 -0.09375q0.265625 -0.0625 0.5 -0.140625q0.25 -0.078125 0.46875 -0.171875l0 1.546875zm6.72995 0.28125l-0.03125 -0.953125q-0.234375 0.25 -0.484375 0.453125q-0.25 0.203125 -0.5625 0.359375q-0.296875 0.140625 -0.65625 0.21875q-0.34375 0.09375 -0.765625 0.09375q-0.5625 0 -0.984375 -0.171875q-0.421875 -0.171875 -0.703125 -0.453125q-0.28125 -0.296875 -0.4375 -0.703125q-0.140625 -0.421875 -0.140625 -0.921875q0 -0.515625 0.21875 -0.953125q0.21875 -0.4375 0.65625 -0.75q0.453125 -0.3125 1.125 -0.484375q0.671875 -0.1875 1.5625 -0.1875l0.953125 0l0 -0.4375q0 -0.28125 -0.078125 -0.5q-0.078125 -0.234375 -0.25 -0.390625q-0.171875 -0.15625 -0.453125 -0.234375q-0.28125 -0.09375 -0.6875 -0.09375q-0.640625 0 -1.265625 0.15625q-0.625 0.140625 -1.21875 0.40625l0 -1.40625q0.53125 -0.203125 1.203125 -0.328125q0.6875 -0.140625 1.421875 -0.140625q0.8125 0 1.390625 0.15625q0.578125 0.140625 0.953125 0.453125q0.375 0.3125 0.546875 0.78125q0.1875 0.453125 0.1875 1.0625l0 4.96875l-1.5 0zm-0.25 -3.21875l-1.0625 0q-0.4375 0 -0.75 0.09375q-0.3125 0.078125 -0.5 0.234375q-0.1875 0.15625 -0.28125 0.359375q-0.09375 0.1875 -0.09375 0.40625q0 0.4375 0.28125 0.671875q0.296875 0.234375 0.78125 0.234375q0.375 0 0.765625 -0.265625q0.390625 -0.265625 0.859375 -0.75l0 -0.984375zm9.745575 3.140625q-0.46875 0.109375 -0.953125 0.171875q-0.46875 0.078125 -0.90625 0.078125q-0.703125 0 -1.234375 -0.15625q-0.515625 -0.15625 -0.859375 -0.453125q-0.328125 -0.3125 -0.5 -0.78125q-0.15625 -0.484375 -0.15625 -1.140625l0 -3.546875l-1.953125 0l0 -1.359375l1.953125 0l0 -1.859375l1.796875 -0.46875l0 2.328125l2.8125 0l0 1.359375l-2.8125 0l0 3.421875q0 0.609375 0.28125 0.9375q0.28125 0.3125 0.953125 0.3125q0.4375 0 0.84375 -0.0625q0.40625 -0.078125 0.734375 -0.171875l0 1.390625zm6.35495 -9.1875q0 0.234375 -0.09375 0.453125q-0.078125 0.203125 -0.234375 0.359375q-0.15625 0.15625 -0.375 0.25q-0.203125 0.078125 -0.4375 0.078125q-0.25 0 -0.46875 -0.078125q-0.203125 -0.09375 -0.359375 -0.25q-0.15625 -0.15625 -0.25 -0.359375q-0.078125 -0.21875 -0.078125 -0.453125q0 -0.234375 0.078125 -0.4375q0.09375 -0.203125 0.25 -0.359375q0.15625 -0.15625 0.359375 -0.25q0.21875 -0.09375 0.46875 -0.09375q0.234375 0 0.4375 0.09375q0.21875 0.09375 0.375 0.25q0.15625 0.15625 0.234375 0.359375q0.09375 0.203125 0.09375 0.4375zm-1.9375 3.359375l-2.0625 0l0 -1.359375l3.84375 0l0 5.90625l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -4.546875zm12.2612 2.21875q0 0.859375 -0.25 1.578125q-0.234375 0.703125 -0.703125 1.21875q-0.453125 0.5 -1.125 0.78125q-0.65625 0.28125 -1.515625 0.28125q-0.796875 0 -1.4375 -0.234375q-0.640625 -0.25 -1.09375 -0.71875q-0.4375 -0.46875 -0.671875 -1.171875q-0.234375 -0.703125 -0.234375 -1.640625q0 -0.859375 0.25 -1.5625q0.25 -0.71875 0.703125 -1.21875q0.46875 -0.5 1.125 -0.765625q0.671875 -0.28125 1.5 -0.28125q0.828125 0 1.453125 0.234375q0.640625 0.234375 1.078125 0.71875q0.453125 0.46875 0.6875 1.171875q0.234375 0.6875 0.234375 1.609375zm-1.828125 0.03125q0 -1.125 -0.421875 -1.6875q-0.421875 -0.578125 -1.25 -0.578125q-0.46875 0 -0.796875 0.1875q-0.3125 0.171875 -0.515625 0.484375q-0.203125 0.3125 -0.3125 0.734375q-0.09375 0.40625 -0.09375 0.875q0 1.140625 0.453125 1.71875q0.46875 0.578125 1.265625 0.578125q0.4375 0 0.75 -0.171875q0.328125 -0.1875 0.53125 -0.484375q0.203125 -0.3125 0.296875 -0.734375q0.09375 -0.421875 0.09375 -0.921875zm7.66745 3.65625l0 -4.734375q0 -1.203125 -0.890625 -1.203125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 4.59375l-1.734375 0l0 -7.265625l1.5 0l0.046875 1.0625q0.21875 -0.265625 0.453125 -0.5q0.25 -0.234375 0.53125 -0.390625q0.28125 -0.15625 0.59375 -0.234375q0.328125 -0.09375 0.734375 -0.09375q0.546875 0 0.953125 0.1875q0.421875 0.171875 0.703125 0.515625q0.28125 0.328125 0.421875 0.796875q0.140625 0.46875 0.140625 1.046875l0 4.875l-1.75 0zm4.526825 -3.125l0 -1.546875l4.453125 0l0 1.546875l-4.453125 0zm13.79245 -0.640625q0 1.0 -0.28125 1.734375q-0.28125 0.734375 -0.78125 1.21875q-0.5 0.46875 -1.1875 0.703125q-0.6875 0.234375 -1.5 0.234375q-0.28125 0 -0.546875 -0.03125q-0.265625 -0.03125 -0.5 -0.09375l0 2.84375l-1.734375 0l0 -10.109375l1.5 0l0.046875 1.0625q0.21875 -0.265625 0.453125 -0.5q0.25 -0.234375 0.53125 -0.390625q0.28125 -0.15625 0.59375 -0.234375q0.328125 -0.09375 0.734375 -0.09375q0.640625 0 1.140625 0.265625q0.5 0.25 0.828125 0.71875q0.34375 0.46875 0.515625 1.15625q0.1875 0.671875 0.1875 1.515625zm-1.828125 0.078125q0 -0.609375 -0.09375 -1.03125q-0.078125 -0.421875 -0.25 -0.6875q-0.15625 -0.28125 -0.390625 -0.40625q-0.21875 -0.125 -0.53125 -0.125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 3.171875q0.21875 0.078125 0.515625 0.125q0.3125 0.046875 0.625 0.046875q0.40625 0 0.75 -0.171875q0.34375 -0.171875 0.578125 -0.484375q0.25 -0.3125 0.375 -0.765625q0.125 -0.453125 0.125 -1.015625zm7.6362 -5.578125q0 0.234375 -0.09375 0.453125q-0.078125 0.203125 -0.234375 0.359375q-0.15625 0.15625 -0.375 0.25q-0.203125 0.078125 -0.4375 0.078125q-0.25 0 -0.46875 -0.078125q-0.203125 -0.09375 -0.359375 -0.25q-0.15625 -0.15625 -0.25 -0.359375q-0.078125 -0.21875 -0.078125 -0.453125q0 -0.234375 0.078125 -0.4375q0.09375 -0.203125 0.25 -0.359375q0.15625 -0.15625 0.359375 -0.25q0.21875 -0.09375 0.46875 -0.09375q0.234375 0 0.4375 0.09375q0.21875 0.09375 0.375 0.25q0.15625 0.15625 0.234375 0.359375q0.09375 0.203125 0.09375 0.4375zm-1.9375 3.359375l-2.0625 0l0 -1.359375l3.84375 0l0 5.90625l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -4.546875zm12.245575 2.140625q0 1.0 -0.28125 1.734375q-0.28125 0.734375 -0.78125 1.21875q-0.5 0.46875 -1.1875 0.703125q-0.6875 0.234375 -1.5 0.234375q-0.28125 0 -0.546875 -0.03125q-0.265625 -0.03125 -0.5 -0.09375l0 2.84375l-1.734375 0l0 -10.109375l1.5 0l0.046875 1.0625q0.21875 -0.265625 0.453125 -0.5q0.25 -0.234375 0.53125 -0.390625q0.28125 -0.15625 0.59375 -0.234375q0.328125 -0.09375 0.734375 -0.09375q0.640625 0 1.140625 0.265625q0.5 0.25 0.828125 0.71875q0.34375 0.46875 0.515625 1.15625q0.1875 0.671875 0.1875 1.515625zm-1.828125 0.078125q0 -0.609375 -0.09375 -1.03125q-0.078125 -0.421875 -0.25 -0.6875q-0.15625 -0.28125 -0.390625 -0.40625q-0.21875 -0.125 -0.53125 -0.125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 3.171875q0.21875 0.078125 0.515625 0.125q0.3125 0.046875 0.625 0.046875q0.40625 0 0.75 -0.171875q0.34375 -0.171875 0.578125 -0.484375q0.25 -0.3125 0.375 -0.765625q0.125 -0.453125 0.125 -1.015625zm9.683075 -0.5q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0zm5.85495 -4.3125l-2.0625 0l0 -1.359375l3.84375 0l0 8.75l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -7.390625zm9.995575 -0.515625q0 0.234375 -0.09375 0.453125q-0.078125 0.203125 -0.234375 0.359375q-0.15625 0.15625 -0.375 0.25q-0.203125 0.078125 -0.4375 0.078125q-0.25 0 -0.46875 -0.078125q-0.203125 -0.09375 -0.359375 -0.25q-0.15625 -0.15625 -0.25 -0.359375q-0.078125 -0.21875 -0.078125 -0.453125q0 -0.234375 0.078125 -0.4375q0.09375 -0.203125 0.25 -0.359375q0.15625 -0.15625 0.359375 -0.25q0.21875 -0.09375 0.46875 -0.09375q0.234375 0 0.4375 0.09375q0.21875 0.09375 0.375 0.25q0.15625 0.15625 0.234375 0.359375q0.09375 0.203125 0.09375 0.4375zm-1.9375 3.359375l-2.0625 0l0 -1.359375l3.84375 0l0 5.90625l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -4.546875zm10.04245 5.90625l0 -4.734375q0 -1.203125 -0.890625 -1.203125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 4.59375l-1.734375 0l0 -7.265625l1.5 0l0.046875 1.0625q0.21875 -0.265625 0.453125 -0.5q0.25 -0.234375 0.53125 -0.390625q0.28125 -0.15625 0.59375 -0.234375q0.328125 -0.09375 0.734375 -0.09375q0.546875 0 0.953125 0.1875q0.421875 0.171875 0.703125 0.515625q0.28125 0.328125 0.421875 0.796875q0.140625 0.46875 0.140625 1.046875l0 4.875l-1.75 0zm10.058075 -4.1875q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m437.80392 439.97485l0 0l0 17.16098l0 0l0 -17.16098z" fill-rule="nonzero"/><path fill="#000000" d="m204.11974 451.81302l233.68419 0l0 1.3200073l-233.68419 0l0 -1.3200073z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m0 34.706036l161.41733 0l0 47.968502l-161.41733 0z" fill-rule="evenodd"/><path fill="#000000" d="m22.033333 61.626034l0 -13.359375l4.484375 0q1.125 0 2.078125 0.515625q0.953125 0.5 1.515625 1.390625q0.578125 0.875 0.578125 2.046875q0 1.15625 -0.578125 2.0625q-0.5625 0.890625 -1.515625 1.390625q-0.953125 0.5 -2.078125 0.5l-3.65625 0l0 -1.5l3.703125 0q0.78125 0 1.328125 -0.359375q0.5625 -0.375 0.875 -0.9375q0.3125 -0.5625 0.3125 -1.15625q0 -0.59375 -0.3125 -1.15625q-0.3125 -0.5625 -0.875 -0.921875q-0.546875 -0.359375 -1.328125 -0.359375l-2.953125 0l0 11.84375l-1.578125 0zm10.417877 0l0 -9.515625l1.515625 0l0 1.53125l0.078125 0q0.1875 -0.546875 0.625 -0.9375q0.4375 -0.40625 1.0 -0.640625q0.578125 -0.234375 1.125 -0.234375q0.4375 0 0.671875 0.046875q0.25 0.046875 0.453125 0.140625l0 1.71875q-0.296875 -0.15625 -0.640625 -0.21875q-0.34375 -0.078125 -0.703125 -0.078125q-0.6875 0 -1.265625 0.390625q-0.578125 0.390625 -0.921875 1.046875q-0.34375 0.65625 -0.34375 1.4375l0 5.3125l-1.59375 0zm10.670471 0.296875q-1.375 0 -2.453125 -0.640625q-1.0625 -0.65625 -1.671875 -1.796875q-0.609375 -1.140625 -0.609375 -2.59375q0 -1.359375 0.5625 -2.515625q0.578125 -1.15625 1.609375 -1.859375q1.03125 -0.703125 2.4375 -0.703125q1.421875 0 2.4375 0.625q1.015625 0.625 1.5625 1.734375q0.546875 1.09375 0.546875 2.515625q0 0.125 -0.015625 0.265625q0 0.125 -0.015625 0.21875l-8.1875 0l0 -1.3125l6.546875 0q-0.015625 -0.390625 -0.1875 -0.84375q-0.15625 -0.46875 -0.5 -0.859375q-0.34375 -0.40625 -0.875 -0.65625q-0.53125 -0.25 -1.3125 -0.25q-0.9375 0 -1.625 0.484375q-0.671875 0.46875 -1.046875 1.296875q-0.359375 0.8125 -0.359375 1.859375q0 1.203125 0.46875 2.015625q0.46875 0.796875 1.203125 1.1875q0.75 0.390625 1.546875 0.390625q1.046875 0 1.71875 -0.484375q0.6875 -0.5 1.09375 -1.234375l1.34375 0.65625q-0.5625 1.078125 -1.609375 1.796875q-1.03125 0.703125 -2.609375 0.703125zm6.2690887 3.7343712l0 -13.546871l1.515625 0l0 1.421875l0.078125 0q0.25 -0.453125 0.71875 -0.84375q0.484375 -0.390625 1.125 -0.625q0.640625 -0.25 1.390625 -0.25q1.3125 0 2.328125 0.65625q1.03125 0.65625 1.625 1.796875q0.609375 1.125 0.609375 2.609375q0 1.46875 -0.609375 2.609375q-0.59375 1.140625 -1.625 1.796875q-1.015625 0.640625 -2.328125 0.640625q-1.125 0 -1.984375 -0.515625q-0.859375 -0.53125 -1.25 -1.1875l-0.078125 0l0.078125 1.3125l0 4.124996l-1.59375 0zm4.671875 -5.171871q0.8125 0 1.53125 -0.4375q0.71875 -0.4375 1.15625 -1.25q0.4375 -0.828125 0.4375 -1.921875q0 -1.125 -0.4375 -1.9375q-0.4375 -0.8125 -1.15625 -1.25q-0.71875 -0.4375 -1.53125 -0.4375q-0.828125 0 -1.546875 0.4375q-0.71875 0.4375 -1.171875 1.25q-0.4375 0.8125 -0.4375 1.9375q0 1.109375 0.4375 1.921875q0.453125 0.8125 1.171875 1.25q0.71875 0.4375 1.546875 0.4375zm6.465851 1.140625l0 -9.515625l1.515625 0l0 1.53125l0.078125 0q0.1875 -0.546875 0.625 -0.9375q0.4375 -0.40625 1.0 -0.640625q0.578125 -0.234375 1.125 -0.234375q0.4375 0 0.671875 0.046875q0.25 0.046875 0.453125 0.140625l0 1.71875q-0.296875 -0.15625 -0.640625 -0.21875q-0.34375 -0.078125 -0.703125 -0.078125q-0.6875 0 -1.265625 0.390625q-0.578125 0.390625 -0.921875 1.046875q-0.34375 0.65625 -0.34375 1.4375l0 5.3125l-1.59375 0zm10.826721 0.296875q-1.4375 0 -2.546875 -0.671875q-1.09375 -0.671875 -1.71875 -1.8125q-0.625 -1.15625 -0.625 -2.5625q0 -1.421875 0.625 -2.5625q0.625 -1.15625 1.71875 -1.828125q1.109375 -0.671875 2.546875 -0.671875q1.4375 0 2.53125 0.6875q1.109375 0.671875 1.734375 1.828125q0.625 1.140625 0.625 2.546875q0 1.40625 -0.625 2.5625q-0.625 1.140625 -1.734375 1.8125q-1.09375 0.671875 -2.53125 0.671875zm0 -1.4375q0.859375 0 1.609375 -0.421875q0.75 -0.4375 1.21875 -1.25q0.46875 -0.8125 0.46875 -1.9375q0 -1.140625 -0.46875 -1.953125q-0.46875 -0.8125 -1.21875 -1.234375q-0.75 -0.4375 -1.609375 -0.4375q-0.859375 0 -1.625 0.4375q-0.765625 0.421875 -1.234375 1.234375q-0.46875 0.8125 -0.46875 1.953125q0 1.125 0.46875 1.9375q0.46875 0.8125 1.234375 1.25q0.765625 0.421875 1.625 0.421875zm11.041 1.4375q-1.40625 0 -2.5 -0.65625q-1.078125 -0.671875 -1.703125 -1.8125q-0.609375 -1.15625 -0.609375 -2.578125q0 -1.46875 0.609375 -2.59375q0.625 -1.140625 1.703125 -1.796875q1.09375 -0.671875 2.5 -0.671875q1.609375 0 2.640625 0.734375q1.03125 0.734375 1.46875 1.890625l-1.4375 0.609375q-0.359375 -0.890625 -1.0625 -1.34375q-0.6875 -0.453125 -1.6875 -0.453125q-0.828125 0 -1.546875 0.453125q-0.71875 0.4375 -1.171875 1.25q-0.453125 0.8125 -0.453125 1.921875q0 1.078125 0.453125 1.90625q0.453125 0.8125 1.171875 1.265625q0.71875 0.4375 1.546875 0.4375q1.015625 0 1.734375 -0.46875q0.734375 -0.46875 1.09375 -1.3125l1.40625 0.59375q-0.46875 1.09375 -1.515625 1.859375q-1.03125 0.765625 -2.640625 0.765625zm10.014847 0q-1.375 0 -2.453125 -0.640625q-1.0625 -0.65625 -1.671875 -1.796875q-0.609375 -1.140625 -0.609375 -2.59375q0 -1.359375 0.5625 -2.515625q0.578125 -1.15625 1.609375 -1.859375q1.03125 -0.703125 2.4375 -0.703125q1.421875 0 2.4375 0.625q1.015625 0.625 1.5625 1.734375q0.546875 1.09375 0.546875 2.515625q0 0.125 -0.015625 0.265625q0 0.125 -0.015625 0.21875l-8.1875 0l0 -1.3125l6.546875 0q-0.015625 -0.390625 -0.1875 -0.84375q-0.15625 -0.46875 -0.5 -0.859375q-0.34375 -0.40625 -0.875 -0.65625q-0.53125 -0.25 -1.3125 -0.25q-0.9375 0 -1.625 0.484375q-0.671875 0.46875 -1.046875 1.296875q-0.359375 0.8125 -0.359375 1.859375q0 1.203125 0.46875 2.015625q0.46875 0.796875 1.203125 1.1875q0.75 0.390625 1.546875 0.390625q1.046875 0 1.71875 -0.484375q0.6875 -0.5 1.09375 -1.234375l1.34375 0.65625q-0.5625 1.078125 -1.609375 1.796875q-1.03125 0.703125 -2.609375 0.703125zm9.690552 0q-1.0625 0 -1.875 -0.34375q-0.8125 -0.34375 -1.34375 -0.921875q-0.53125 -0.59375 -0.796875 -1.28125l1.421875 -0.640625q0.375 0.859375 1.078125 1.328125q0.703125 0.46875 1.625 0.46875q0.875 0 1.453125 -0.359375q0.59375 -0.359375 0.59375 -1.046875q0 -0.421875 -0.25 -0.703125q-0.234375 -0.296875 -0.703125 -0.5q-0.453125 -0.21875 -1.125 -0.375l-1.15625 -0.3125q-0.671875 -0.1875 -1.28125 -0.515625q-0.59375 -0.34375 -0.953125 -0.875q-0.359375 -0.53125 -0.359375 -1.296875q0 -0.84375 0.5 -1.453125q0.5 -0.625 1.3125 -0.953125q0.828125 -0.328125 1.765625 -0.328125q0.8125 0 1.53125 0.234375q0.71875 0.234375 1.265625 0.6875q0.546875 0.453125 0.8125 1.125l-1.375 0.640625q-0.359375 -0.703125 -0.953125 -0.984375q-0.59375 -0.28125 -1.328125 -0.28125q-0.78125 0 -1.359375 0.34375q-0.578125 0.34375 -0.578125 0.9375q0 0.59375 0.46875 0.90625q0.484375 0.296875 1.171875 0.46875l1.390625 0.359375q1.390625 0.359375 2.09375 1.0625q0.71875 0.6875 0.71875 1.703125q0 0.890625 -0.515625 1.546875q-0.5 0.65625 -1.359375 1.015625q-0.84375 0.34375 -1.890625 0.34375zm8.824402 0q-1.0625 0 -1.875 -0.34375q-0.8125 -0.34375 -1.34375 -0.921875q-0.53125 -0.59375 -0.796875 -1.28125l1.421875 -0.640625q0.375 0.859375 1.078125 1.328125q0.703125 0.46875 1.625 0.46875q0.875 0 1.453125 -0.359375q0.59375 -0.359375 0.59375 -1.046875q0 -0.421875 -0.25 -0.703125q-0.234375 -0.296875 -0.703125 -0.5q-0.453125 -0.21875 -1.125 -0.375l-1.15625 -0.3125q-0.671875 -0.1875 -1.28125 -0.515625q-0.59375 -0.34375 -0.953125 -0.875q-0.359375 -0.53125 -0.359375 -1.296875q0 -0.84375 0.5 -1.453125q0.5 -0.625 1.3125 -0.953125q0.828125 -0.328125 1.765625 -0.328125q0.8125 0 1.53125 0.234375q0.71875 0.234375 1.265625 0.6875q0.546875 0.453125 0.8125 1.125l-1.375 0.640625q-0.359375 -0.703125 -0.953125 -0.984375q-0.59375 -0.28125 -1.328125 -0.28125q-0.78125 0 -1.359375 0.34375q-0.578125 0.34375 -0.578125 0.9375q0 0.59375 0.46875 0.90625q0.484375 0.296875 1.171875 0.46875l1.390625 0.359375q1.390625 0.359375 2.09375 1.0625q0.71875 0.6875 0.71875 1.703125q0 0.890625 -0.515625 1.546875q-0.5 0.65625 -1.359375 1.015625q-0.84375 0.34375 -1.890625 0.34375zm5.559189 -0.296875l0 -9.515625l1.578125 0l0 9.515625l-1.578125 0zm0.78125 -11.265625q-0.46875 0 -0.8125 -0.328125q-0.328125 -0.34375 -0.328125 -0.8125q0 -0.484375 0.328125 -0.8125q0.34375 -0.328125 0.8125 -0.328125q0.484375 0 0.8125 0.328125q0.328125 0.328125 0.328125 0.8125q0 0.46875 -0.328125 0.8125q-0.328125 0.328125 -0.8125 0.328125zm3.2788086 11.265625l0 -9.515625l1.515625 0l0 1.40625l0.078125 0q0.375 -0.703125 1.21875 -1.203125q0.84375 -0.5 1.859375 -0.5q1.75 0 2.625 1.015625q0.8906174 1.015625 0.8906174 2.703125l0 6.09375l-1.5781174 0l0 -5.859375q0 -1.375 -0.671875 -1.9375q-0.65625 -0.578125 -1.703125 -0.578125q-0.78125 0 -1.375 0.4375q-0.59375 0.4375 -0.9375 1.125q-0.328125 0.6875 -0.328125 1.453125l0 5.359375l-1.59375 0zm14.560326 4.328121q-1.21875 0 -2.09375 -0.40625q-0.859375 -0.390625 -1.390625 -1.015625q-0.53125 -0.6093712 -0.75 -1.2499962l1.453125 -0.609375q0.296875 0.78125 1.015625 1.3125q0.71875 0.5312462 1.765625 0.5312462q1.484375 0 2.28125 -0.8749962q0.8125 -0.859375 0.8125 -2.40625l0 -1.0625l-0.078125 0q-0.453125 0.6875 -1.296875 1.171875q-0.828125 0.46875 -1.953125 0.46875q-1.21875 0 -2.25 -0.625q-1.015625 -0.640625 -1.609375 -1.765625q-0.59375 -1.140625 -0.59375 -2.609375q0 -1.46875 0.59375 -2.59375q0.59375 -1.140625 1.609375 -1.765625q1.03125 -0.640625 2.25 -0.640625q1.125 0 1.953125 0.484375q0.84375 0.46875 1.296875 1.171875l0.078125 0l0 -1.359375l1.515625 0l0 9.140625q0 1.59375 -0.609375 2.625q-0.609375 1.0468712 -1.65625 1.5624962q-1.03125 0.515625 -2.34375 0.515625zm0 -5.578121q0.828125 0 1.53125 -0.421875q0.71875 -0.421875 1.140625 -1.21875q0.421875 -0.796875 0.421875 -1.921875q0 -1.15625 -0.421875 -1.953125q-0.421875 -0.796875 -1.140625 -1.203125q-0.703125 -0.40625 -1.53125 -0.40625q-0.84375 0 -1.5625 0.421875q-0.703125 0.421875 -1.140625 1.21875q-0.421875 0.78125 -0.421875 1.921875q0 1.140625 0.421875 1.9375q0.4375 0.78125 1.140625 1.203125q0.71875 0.421875 1.5625 0.421875z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m0 117.82413l161.41733 0l0 47.968506l-161.41733 0z" fill-rule="evenodd"/><path fill="#000000" d="m28.693481 144.74413l0 -13.359375l4.78125 0q1.03125 0 1.890625 0.46875q0.875 0.453125 1.40625 1.265625q0.53125 0.796875 0.53125 1.875q0 0.984375 -0.5 1.71875q-0.5 0.71875 -1.328125 1.09375l0 0.078125q0.984375 0.3125 1.625 1.140625q0.65625 0.8125 0.65625 1.9375q0 1.109375 -0.5625 1.96875q-0.5625 0.84375 -1.484375 1.328125q-0.90625 0.484375 -1.96875 0.484375l-5.046875 0zm0.828125 -6.078125l0 -1.515625l3.953125 0q0.71875 0 1.21875 -0.328125q0.5 -0.328125 0.765625 -0.8125q0.28125 -0.484375 0.28125 -0.984375q0 -0.5 -0.265625 -0.984375q-0.265625 -0.484375 -0.75 -0.8125q-0.484375 -0.328125 -1.171875 -0.328125l-3.28125 0l0 10.328125l3.5625 0q0.703125 0 1.21875 -0.34375q0.53125 -0.34375 0.8125 -0.875q0.28125 -0.53125 0.28125 -1.078125q0 -0.546875 -0.296875 -1.0625q-0.28125 -0.53125 -0.8125 -0.859375q-0.53125 -0.34375 -1.28125 -0.34375l-4.234375 0zm13.60257 6.375q-1.75 0 -2.640625 -1.015625q-0.890625 -1.03125 -0.890625 -2.8125l0 -5.984375l1.59375 0l0 5.75q0 1.421875 0.65625 2.03125q0.65625 0.59375 1.625 0.59375q0.828125 0 1.4375 -0.4375q0.625 -0.4375 0.953125 -1.125q0.34375 -0.6875 0.34375 -1.4375l0 -5.375l1.59375 0l0 9.515625l-1.515625 0l0 -1.375l-0.078125 0q-0.234375 0.4375 -0.71875 0.828125q-0.46875 0.375 -1.078125 0.609375q-0.609375 0.234375 -1.28125 0.234375zm12.356369 -8.375l0 -1.4375l5.796875 0l0 1.4375l-5.796875 0zm1.765625 8.078125l0 -10.546875q0 -0.921875 0.40625 -1.578125q0.40625 -0.671875 1.109375 -1.015625q0.703125 -0.359375 1.515625 -0.359375q0.546875 0 0.859375 0.0625q0.328125 0.0625 0.546875 0.15625l0 1.546875q-0.25 -0.09375 -0.5625 -0.1875q-0.3125 -0.109375 -0.78125 -0.109375q-0.578125 0 -1.046875 0.421875q-0.46875 0.421875 -0.46875 1.234375l0 10.375l-1.578125 0zm-7.671875 -8.078125l0 -1.4375l7.6875 0l0 1.4375l-7.6875 0zm3.34375 -2.296875l0 10.375l-1.578125 0l0 -10.546875q0 -0.921875 0.40625 -1.578125q0.40625 -0.671875 1.109375 -1.015625q0.703125 -0.359375 1.515625 -0.359375q0.546875 0 0.859375 0.0625q0.328125 0.0625 0.546875 0.15625l0 1.546875q-0.25 -0.09375 -0.5625 -0.1875q-0.3125 -0.109375 -0.78125 -0.109375q-0.375 0 -0.734375 0.1875q-0.34375 0.1875 -0.5625 0.546875q-0.21875 0.359375 -0.21875 0.921875zm13.952972 10.671875q-1.375 0 -2.453125 -0.640625q-1.0625 -0.65625 -1.671875 -1.796875q-0.609375 -1.140625 -0.609375 -2.59375q0 -1.359375 0.5625 -2.515625q0.578125 -1.15625 1.609375 -1.859375q1.03125 -0.703125 2.4375 -0.703125q1.421875 0 2.4375 0.625q1.015625 0.625 1.5625 1.734375q0.546875 1.09375 0.546875 2.515625q0 0.125 -0.015625 0.265625q0 0.125 -0.015625 0.21875l-8.1875 0l0 -1.3125l6.546875 0q-0.015625 -0.390625 -0.1875 -0.84375q-0.15625 -0.46875 -0.5 -0.859375q-0.34375 -0.40625 -0.875 -0.65625q-0.53125 -0.25 -1.3125 -0.25q-0.9375 0 -1.625 0.484375q-0.671875 0.46875 -1.046875 1.296875q-0.359375 0.8125 -0.359375 1.859375q0 1.203125 0.46875 2.015625q0.46875 0.796875 1.203125 1.1875q0.75 0.390625 1.546875 0.390625q1.046875 0 1.71875 -0.484375q0.6875 -0.5 1.09375 -1.234375l1.34375 0.65625q-0.5625 1.078125 -1.609375 1.796875q-1.03125 0.703125 -2.609375 0.703125zm6.2690887 -0.296875l0 -9.515625l1.515625 0l0 1.53125l0.078125 0q0.1875 -0.546875 0.625 -0.9375q0.4375 -0.40625 1.0 -0.640625q0.578125 -0.234375 1.125 -0.234375q0.4375 0 0.671875 0.046875q0.25 0.046875 0.453125 0.140625l0 1.71875q-0.296875 -0.15625 -0.640625 -0.21875q-0.34375 -0.078125 -0.703125 -0.078125q-0.6875 0 -1.265625 0.390625q-0.578125 0.390625 -0.921875 1.046875q-0.34375 0.65625 -0.34375 1.4375l0 5.3125l-1.59375 0zm7.1523438 0l0 -9.515625l1.578125 0l0 9.515625l-1.578125 0zm0.78125 -11.265625q-0.46875 0 -0.8125 -0.328125q-0.328125 -0.34375 -0.328125 -0.8125q0 -0.484375 0.328125 -0.8125q0.34375 -0.328125 0.8125 -0.328125q0.484375 0 0.8125 0.328125q0.328125 0.328125 0.328125 0.8125q0 0.46875 -0.328125 0.8125q-0.328125 0.328125 -0.8125 0.328125zm2.7944336 11.265625l0 -1.359375l5.625 -6.71875l-5.453125 0l0 -1.4375l7.390625 0l0 1.359375l-5.625 6.71875l5.765625 0l0 1.4375l-7.703125 0zm12.430191 0.296875q-1.0625 0 -1.875 -0.40625q-0.796875 -0.40625 -1.25 -1.125q-0.453125 -0.71875 -0.453125 -1.640625q0 -1.046875 0.53125 -1.765625q0.546875 -0.71875 1.453125 -1.078125q0.921875 -0.359375 2.015625 -0.359375q0.640625 0 1.171875 0.109375q0.546875 0.09375 0.9375 0.234375q0.40625 0.140625 0.625 0.265625l0 -0.578125q0 -1.078125 -0.765625 -1.703125q-0.765625 -0.640625 -1.875 -0.640625q-0.78125 0 -1.46875 0.34375q-0.671875 0.34375 -1.0625 0.953125l-1.203125 -0.890625q0.375 -0.5625 0.9375 -0.96875q0.5625 -0.40625 1.28125 -0.625q0.71875 -0.234375 1.515625 -0.234375q1.9375 0 3.03125 1.03125q1.109375 1.015625 1.109375 2.75l0 6.03125l-1.5 0l0 -1.359375l-0.078125 0q-0.25 0.40625 -0.703125 0.796875q-0.4375 0.375 -1.046875 0.609375q-0.609375 0.25 -1.328125 0.25zm0.140625 -1.390625q0.828125 0 1.5 -0.40625q0.6875 -0.421875 1.09375 -1.109375q0.421875 -0.6875 0.421875 -1.515625q-0.4375 -0.296875 -1.078125 -0.484375q-0.640625 -0.1875 -1.40625 -0.1875q-1.359375 0 -2.0 0.5625q-0.640625 0.5625 -0.640625 1.375q0 0.78125 0.59375 1.28125q0.609375 0.484375 1.515625 0.484375zm5.876053 -8.421875l5.578125 0l0 1.4375l-5.578125 0l0 -1.4375zm1.671875 7.015625l0 -9.703125l1.578125 0l0 9.3125q0 0.75 0.3125 1.15625q0.3125 0.40625 1.015625 0.40625q0.3125 0 0.578125 -0.09375q0.265625 -0.09375 0.46875 -0.21875l0 1.546875q-0.25 0.109375 -0.546875 0.171875q-0.28125 0.078125 -0.765625 0.078125q-1.1875 0 -1.921875 -0.703125q-0.71875 -0.703125 -0.71875 -1.953125zm6.043503 2.5l0 -9.515625l1.578125 0l0 9.515625l-1.578125 0zm0.78125 -11.265625q-0.46875 0 -0.8125 -0.328125q-0.328125 -0.34375 -0.328125 -0.8125q0 -0.484375 0.328125 -0.8125q0.34375 -0.328125 0.8125 -0.328125q0.484375 0 0.8125 0.328125q0.328125 0.328125 0.328125 0.8125q0 0.46875 -0.328125 0.8125q-0.328125 0.328125 -0.8125 0.328125zm7.5760956 11.5625q-1.4375 0 -2.546875 -0.671875q-1.09375 -0.671875 -1.71875 -1.8125q-0.625 -1.15625 -0.625 -2.5625q0 -1.421875 0.625 -2.5625q0.625 -1.15625 1.71875 -1.828125q1.109375 -0.671875 2.546875 -0.671875q1.4375 0 2.53125 0.6875q1.109375 0.671875 1.734375 1.828125q0.625 1.140625 0.625 2.546875q0 1.40625 -0.625 2.5625q-0.625 1.140625 -1.734375 1.8125q-1.09375 0.671875 -2.53125 0.671875zm0 -1.4375q0.859375 0 1.609375 -0.421875q0.75 -0.4375 1.21875 -1.25q0.46875 -0.8125 0.46875 -1.9375q0 -1.140625 -0.46875 -1.953125q-0.46875 -0.8125 -1.21875 -1.234375q-0.75 -0.4375 -1.609375 -0.4375q-0.859375 0 -1.625 0.4375q-0.765625 0.421875 -1.234375 1.234375q-0.46875 0.8125 -0.46875 1.953125q0 1.125 0.46875 1.9375q0.46875 0.8125 1.234375 1.25q0.765625 0.421875 1.625 0.421875zm6.5418396 1.140625l0 -9.515625l1.515625 0l0 1.40625l0.078125 0q0.375 -0.703125 1.21875 -1.203125q0.84375 -0.5 1.859375 -0.5q1.75 0 2.625 1.015625q0.890625 1.015625 0.890625 2.703125l0 6.09375l-1.578125 0l0 -5.859375q0 -1.375 -0.671875 -1.9375q-0.65625 -0.578125 -1.703125 -0.578125q-0.78125 0 -1.375 0.4375q-0.59375 0.4375 -0.9375 1.125q-0.328125 0.6875 -0.328125 1.453125l0 5.359375l-1.59375 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m0 308.9212l161.41733 0l0 47.968506l-161.41733 0z" fill-rule="evenodd"/><path fill="#000000" d="m30.540504 335.84122l0 -13.359375l4.78125 0q1.03125 0 1.890625 0.46875q0.875 0.453125 1.40625 1.265625q0.53125 0.796875 0.53125 1.875q0 0.984375 -0.5 1.71875q-0.5 0.71875 -1.328125 1.09375l0 0.078125q0.984375 0.3125 1.625 1.140625q0.65625 0.8125 0.65625 1.9375q0 1.109375 -0.5625 1.96875q-0.5625 0.84375 -1.484375 1.328125q-0.90625 0.484375 -1.96875 0.484375l-5.046875 0zm0.828125 -6.078125l0 -1.515625l3.953125 0q0.71875 0 1.21875 -0.328125q0.5 -0.328125 0.765625 -0.8125q0.28125 -0.484375 0.28125 -0.984375q0 -0.5 -0.265625 -0.984375q-0.265625 -0.484375 -0.75 -0.8125q-0.484375 -0.328125 -1.171875 -0.328125l-3.28125 0l0 10.328125l3.5625 0q0.703125 0 1.21875 -0.34375q0.53125 -0.34375 0.8125 -0.875q0.28125 -0.53125 0.28125 -1.078125q0 -0.546875 -0.296875 -1.0625q-0.28125 -0.53125 -0.8125 -0.859375q-0.53125 -0.34375 -1.28125 -0.34375l-4.234375 0zm13.60257 6.375q-1.75 0 -2.640625 -1.015625q-0.890625 -1.03125 -0.890625 -2.8125l0 -5.984375l1.59375 0l0 5.75q0 1.421875 0.65625 2.03125q0.65625 0.59375 1.625 0.59375q0.828125 0 1.4375 -0.4375q0.625 -0.4375 0.953125 -1.125q0.34375 -0.6875 0.34375 -1.4375l0 -5.375l1.59375 0l0 9.515625l-1.515625 0l0 -1.375l-0.078125 0q-0.234375 0.4375 -0.71875 0.828125q-0.46875 0.375 -1.078125 0.609375q-0.609375 0.234375 -1.28125 0.234375zm12.356369 -8.375l0 -1.4375l5.796875 0l0 1.4375l-5.796875 0zm1.765625 8.078125l0 -10.546875q0 -0.921875 0.40625 -1.578125q0.40625 -0.671875 1.109375 -1.015625q0.703125 -0.359375 1.515625 -0.359375q0.546875 0 0.859375 0.0625q0.328125 0.0625 0.546875 0.15625l0 1.546875q-0.25 -0.09375 -0.5625 -0.1875q-0.3125 -0.109375 -0.78125 -0.109375q-0.578125 0 -1.046875 0.421875q-0.46875 0.421875 -0.46875 1.234375l0 10.375l-1.578125 0zm-7.671875 -8.078125l0 -1.4375l7.6875 0l0 1.4375l-7.6875 0zm3.34375 -2.296875l0 10.375l-1.578125 0l0 -10.546875q0 -0.921875 0.40625 -1.578125q0.40625 -0.671875 1.109375 -1.015625q0.703125 -0.359375 1.515625 -0.359375q0.546875 0 0.859375 0.0625q0.328125 0.0625 0.546875 0.15625l0 1.546875q-0.25 -0.09375 -0.5625 -0.1875q-0.3125 -0.109375 -0.78125 -0.109375q-0.375 0 -0.734375 0.1875q-0.34375 0.1875 -0.5625 0.546875q-0.21875 0.359375 -0.21875 0.921875zm13.952972 10.671875q-1.375 0 -2.453125 -0.640625q-1.0625 -0.65625 -1.671875 -1.796875q-0.609375 -1.140625 -0.609375 -2.59375q0 -1.359375 0.5625 -2.515625q0.578125 -1.15625 1.609375 -1.859375q1.03125 -0.703125 2.4375 -0.703125q1.421875 0 2.4375 0.625q1.015625 0.625 1.5625 1.734375q0.546875 1.09375 0.546875 2.515625q0 0.125 -0.015625 0.265625q0 0.125 -0.015625 0.21875l-8.1875 0l0 -1.3125l6.546875 0q-0.015625 -0.390625 -0.1875 -0.84375q-0.15625 -0.46875 -0.5 -0.859375q-0.34375 -0.40625 -0.875 -0.65625q-0.53125 -0.25 -1.3125 -0.25q-0.9375 0 -1.625 0.484375q-0.671875 0.46875 -1.046875 1.296875q-0.359375 0.8125 -0.359375 1.859375q0 1.203125 0.46875 2.015625q0.46875 0.796875 1.203125 1.1875q0.75 0.390625 1.546875 0.390625q1.046875 0 1.71875 -0.484375q0.6875 -0.5 1.09375 -1.234375l1.34375 0.65625q-0.5625 1.078125 -1.609375 1.796875q-1.03125 0.703125 -2.609375 0.703125zm6.2690887 -0.296875l0 -9.515625l1.515625 0l0 1.53125l0.078125 0q0.1875 -0.546875 0.625 -0.9375q0.4375 -0.40625 1.0 -0.640625q0.578125 -0.234375 1.125 -0.234375q0.4375 0 0.671875 0.046875q0.25 0.046875 0.453125 0.140625l0 1.71875q-0.296875 -0.15625 -0.640625 -0.21875q-0.34375 -0.078125 -0.703125 -0.078125q-0.6875 0 -1.265625 0.390625q-0.578125 0.390625 -0.921875 1.046875q-0.34375 0.65625 -0.34375 1.4375l0 5.3125l-1.59375 0zm6.9804688 -5.125l0 -1.3125l5.921875 0l0 1.3125l-5.921875 0zm8.571152 5.125l0 -13.359375l1.578125 0l0 11.84375l5.84375 0l0 1.515625l-7.421875 0zm13.178284 0.296875q-1.375 0 -2.453125 -0.640625q-1.0625 -0.65625 -1.671875 -1.796875q-0.609375 -1.140625 -0.609375 -2.59375q0 -1.359375 0.5625 -2.515625q0.578125 -1.15625 1.609375 -1.859375q1.03125 -0.703125 2.4375 -0.703125q1.421875 0 2.4375 0.625q1.015625 0.625 1.5625 1.734375q0.546875 1.09375 0.546875 2.515625q0 0.125 -0.015625 0.265625q0 0.125 -0.015625 0.21875l-8.1875 0l0 -1.3125l6.546875 0q-0.015625 -0.390625 -0.1875 -0.84375q-0.15625 -0.46875 -0.5 -0.859375q-0.34375 -0.40625 -0.875 -0.65625q-0.53125 -0.25 -1.3125 -0.25q-0.9375 0 -1.625 0.484375q-0.671875 0.46875 -1.046875 1.296875q-0.359375 0.8125 -0.359375 1.859375q0 1.203125 0.46875 2.015625q0.46875 0.796875 1.203125 1.1875q0.75 0.390625 1.546875 0.390625q1.046875 0 1.71875 -0.484375q0.6875 -0.5 1.09375 -1.234375l1.34375 0.65625q-0.5625 1.078125 -1.609375 1.796875q-1.03125 0.703125 -2.609375 0.703125zm8.879303 -0.296875l-3.84375 -9.515625l1.703125 0l2.953125 7.65625l0.03125 0l2.984375 -7.65625l1.65625 0l-3.875 9.515625l-1.609375 0zm10.746811 0.296875q-1.375 0 -2.453125 -0.640625q-1.0625 -0.65625 -1.671875 -1.796875q-0.609375 -1.140625 -0.609375 -2.59375q0 -1.359375 0.5625 -2.515625q0.578125 -1.15625 1.609375 -1.859375q1.03125 -0.703125 2.4375 -0.703125q1.421875 0 2.4375 0.625q1.015625 0.625 1.5625 1.734375q0.546875 1.09375 0.546875 2.515625q0 0.125 -0.015625 0.265625q0 0.125 -0.015625 0.21875l-8.1875 0l0 -1.3125l6.546875 0q-0.015625 -0.390625 -0.1875 -0.84375q-0.15625 -0.46875 -0.5 -0.859375q-0.34375 -0.40625 -0.875 -0.65625q-0.53125 -0.25 -1.3125 -0.25q-0.9375 0 -1.625 0.484375q-0.671875 0.46875 -1.046875 1.296875q-0.359375 0.8125 -0.359375 1.859375q0 1.203125 0.46875 2.015625q0.46875 0.796875 1.203125 1.1875q0.75 0.390625 1.546875 0.390625q1.046875 0 1.71875 -0.484375q0.6875 -0.5 1.09375 -1.234375l1.34375 0.65625q-0.5625 1.078125 -1.609375 1.796875q-1.03125 0.703125 -2.609375 0.703125zm6.2690964 -0.296875l0 -13.359375l1.59375 0l0 13.359375l-1.59375 0z" fill-rule="nonzero"/><path fill="#000000" d="m29.563927 358.5381q-1.46875 0 -2.71875 -0.53125q-1.25 -0.546875 -2.1875 -1.5q-0.921875 -0.96875 -1.4375 -2.21875q-0.5 -1.265625 -0.5 -2.71875q0 -1.46875 0.5 -2.71875q0.515625 -1.265625 1.4375 -2.21875q0.9375 -0.96875 2.1875 -1.5q1.25 -0.546875 2.71875 -0.546875q1.46875 0 2.71875 0.546875q1.25 0.53125 2.171875 1.5q0.921875 0.953125 1.4375 2.21875q0.515625 1.25 0.515625 2.71875q0 1.453125 -0.515625 2.71875q-0.515625 1.25 -1.4375 2.21875q-0.921875 0.953125 -2.171875 1.5q-1.25 0.53125 -2.71875 0.53125zm0 -1.515625q1.453125 0 2.640625 -0.6875q1.1875 -0.6875 1.890625 -1.921875q0.703125 -1.25 0.703125 -2.84375q0 -1.625 -0.703125 -2.859375q-0.703125 -1.234375 -1.890625 -1.921875q-1.1875 -0.6875 -2.640625 -0.6875q-1.4375 0 -2.640625 0.6875q-1.1875 0.6875 -1.90625 1.921875q-0.703125 1.234375 -0.703125 2.859375q0 1.59375 0.703125 2.84375q0.71875 1.234375 1.90625 1.921875q1.203125 0.6875 2.640625 0.6875zm8.857132 5.25l0 -13.546875l1.515625 0l0 1.421875l0.078125 0q0.25 -0.453125 0.71875 -0.84375q0.484375 -0.390625 1.125 -0.625q0.640625 -0.25 1.390625 -0.25q1.3125 0 2.328125 0.65625q1.03125 0.65625 1.625 1.796875q0.609375 1.125 0.609375 2.609375q0 1.46875 -0.609375 2.609375q-0.59375 1.140625 -1.625 1.796875q-1.015625 0.640625 -2.328125 0.640625q-1.125 0 -1.984375 -0.515625q-0.859375 -0.53125 -1.25 -1.1875l-0.078125 0l0.078125 1.3125l0 4.125l-1.59375 0zm4.671875 -5.171875q0.8125 0 1.53125 -0.4375q0.71875 -0.4375 1.15625 -1.25q0.4375 -0.828125 0.4375 -1.921875q0 -1.125 -0.4375 -1.9375q-0.4375 -0.8125 -1.15625 -1.25q-0.71875 -0.4375 -1.53125 -0.4375q-0.828125 0 -1.546875 0.4375q-0.71875 0.4375 -1.171875 1.25q-0.4375 0.8125 -0.4375 1.9375q0 1.109375 0.4375 1.921875q0.453125 0.8125 1.171875 1.25q0.71875 0.4375 1.546875 0.4375zm5.529602 -8.375l5.578125 0l0 1.4375l-5.578125 0l0 -1.4375zm1.671875 7.015625l0 -9.703125l1.578125 0l0 9.3125q0 0.75 0.3125 1.15625q0.3125 0.40625 1.015625 0.40625q0.3125 0 0.578125 -0.09375q0.265625 -0.09375 0.46875 -0.21875l0 1.546875q-0.25 0.109375 -0.546875 0.171875q-0.28125 0.078125 -0.765625 0.078125q-1.1875 0 -1.921875 -0.703125q-0.71875 -0.703125 -0.71875 -1.953125zm6.043503 2.5l0 -9.515625l1.578125 0l0 9.515625l-1.578125 0zm0.78125 -11.265625q-0.46875 0 -0.8125 -0.328125q-0.328125 -0.34375 -0.328125 -0.8125q0 -0.484375 0.328125 -0.8125q0.34375 -0.328125 0.8125 -0.328125q0.484375 0 0.8125 0.328125q0.328125 0.328125 0.328125 0.8125q0 0.46875 -0.328125 0.8125q-0.328125 0.328125 -0.8125 0.328125zm3.2788086 11.265625l0 -9.515625l1.515625 0l0 1.40625l0.078125 0q0.25 -0.46875 0.703125 -0.84375q0.46875 -0.390625 1.046875 -0.625q0.59375 -0.234375 1.21875 -0.234375q1.0625 0 1.828125 0.515625q0.765625 0.515625 1.09375 1.328125q0.484375 -0.796875 1.296875 -1.3125q0.828125 -0.53125 1.96875 -0.53125q1.703125 0 2.5 1.03125q0.796875 1.015625 0.796875 2.6875l0 6.09375l-1.5625 0l0 -5.859375q0 -1.375 -0.5625 -1.9375q-0.5625 -0.578125 -1.578125 -0.578125q-0.734375 0 -1.3125 0.421875q-0.5625 0.421875 -0.890625 1.109375q-0.3125 0.671875 -0.3125 1.46875l0 5.375l-1.59375 0l0 -5.84375q0 -1.375 -0.5625 -1.953125q-0.5625 -0.578125 -1.5625 -0.578125q-0.734375 0 -1.296875 0.4375q-0.5625 0.421875 -0.890625 1.109375q-0.328125 0.6875 -0.328125 1.5l0 5.328125l-1.59375 0zm16.480469 0l0 -9.515625l1.578125 0l0 9.515625l-1.578125 0zm0.78125 -11.265625q-0.46875 0 -0.8125 -0.328125q-0.328125 -0.34375 -0.328125 -0.8125q0 -0.484375 0.328125 -0.8125q0.34375 -0.328125 0.8125 -0.328125q0.484375 0 0.8125 0.328125q0.328125 0.328125 0.328125 0.8125q0 0.46875 -0.328125 0.8125q-0.328125 0.328125 -0.8125 0.328125zm2.7944336 11.265625l0 -1.359375l5.625 -6.71875l-5.453125 0l0 -1.4375l7.390625 0l0 1.359375l-5.625 6.71875l5.765625 0l0 1.4375l-7.703125 0zm12.430191 0.296875q-1.0625 0 -1.875 -0.40625q-0.796875 -0.40625 -1.25 -1.125q-0.453125 -0.71875 -0.453125 -1.640625q0 -1.046875 0.53125 -1.765625q0.546875 -0.71875 1.453125 -1.078125q0.921875 -0.359375 2.015625 -0.359375q0.640625 0 1.171875 0.109375q0.546875 0.09375 0.9375 0.234375q0.40625 0.140625 0.625 0.265625l0 -0.578125q0 -1.078125 -0.765625 -1.703125q-0.765625 -0.640625 -1.875 -0.640625q-0.78125 0 -1.46875 0.34375q-0.671875 0.34375 -1.0625 0.953125l-1.203125 -0.890625q0.375 -0.5625 0.9375 -0.96875q0.5625 -0.40625 1.28125 -0.625q0.71875 -0.234375 1.515625 -0.234375q1.9375 0 3.03125 1.03125q1.109375 1.015625 1.109375 2.75l0 6.03125l-1.5 0l0 -1.359375l-0.078125 0q-0.25 0.40625 -0.703125 0.796875q-0.4375 0.375 -1.046875 0.609375q-0.609375 0.25 -1.328125 0.25zm0.140625 -1.390625q0.828125 0 1.5 -0.40625q0.6875 -0.421875 1.09375 -1.109375q0.421875 -0.6875 0.421875 -1.515625q-0.4375 -0.296875 -1.078125 -0.484375q-0.640625 -0.1875 -1.40625 -0.1875q-1.359375 0 -2.0 0.5625q-0.640625 0.5625 -0.640625 1.375q0 0.78125 0.59375 1.28125q0.609375 0.484375 1.515625 0.484375zm5.876053 -8.421875l5.578125 0l0 1.4375l-5.578125 0l0 -1.4375zm1.671875 7.015625l0 -9.703125l1.578125 0l0 9.3125q0 0.75 0.3125 1.15625q0.3125 0.40625 1.015625 0.40625q0.3125 0 0.578125 -0.09375q0.265625 -0.09375 0.46875 -0.21875l0 1.546875q-0.25 0.109375 -0.546875 0.171875q-0.28125 0.078125 -0.765625 0.078125q-1.1875 0 -1.921875 -0.703125q-0.71875 -0.703125 -0.71875 -1.953125zm6.043503 2.5l0 -9.515625l1.578125 0l0 9.515625l-1.578125 0zm0.78125 -11.265625q-0.46875 0 -0.8125 -0.328125q-0.328125 -0.34375 -0.328125 -0.8125q0 -0.484375 0.328125 -0.8125q0.34375 -0.328125 0.8125 -0.328125q0.484375 0 0.8125 0.328125q0.328125 0.328125 0.328125 0.8125q0 0.46875 -0.328125 0.8125q-0.328125 0.328125 -0.8125 0.328125zm7.5760956 11.5625q-1.4375 0 -2.546875 -0.671875q-1.09375 -0.671875 -1.71875 -1.8125q-0.625 -1.15625 -0.625 -2.5625q0 -1.421875 0.625 -2.5625q0.625 -1.15625 1.71875 -1.828125q1.109375 -0.671875 2.546875 -0.671875q1.4375 0 2.53125 0.6875q1.109375 0.671875 1.734375 1.828125q0.625 1.140625 0.625 2.546875q0 1.40625 -0.625 2.5625q-0.625 1.140625 -1.734375 1.8125q-1.09375 0.671875 -2.53125 0.671875zm0 -1.4375q0.859375 0 1.609375 -0.421875q0.75 -0.4375 1.21875 -1.25q0.46875 -0.8125 0.46875 -1.9375q0 -1.140625 -0.46875 -1.953125q-0.46875 -0.8125 -1.21875 -1.234375q-0.75 -0.4375 -1.609375 -0.4375q-0.859375 0 -1.625 0.4375q-0.765625 0.421875 -1.234375 1.234375q-0.46875 0.8125 -0.46875 1.953125q0 1.125 0.46875 1.9375q0.46875 0.8125 1.234375 1.25q0.765625 0.421875 1.625 0.421875zm6.5418396 1.140625l0 -9.515625l1.515625 0l0 1.40625l0.078125 0q0.375 -0.703125 1.21875 -1.203125q0.84375 -0.5 1.859375 -0.5q1.75 0 2.6250076 1.015625q0.890625 1.015625 0.890625 2.703125l0 6.09375l-1.578125 0l0 -5.859375q0 -1.375 -0.6718826 -1.9375q-0.65625 -0.578125 -1.703125 -0.578125q-0.78125 0 -1.375 0.4375q-0.59375 0.4375 -0.9375 1.125q-0.328125 0.6875 -0.328125 1.453125l0 5.359375l-1.59375 0zm13.700935 0.296875q-1.0625 0 -1.875 -0.34375q-0.8125 -0.34375 -1.34375 -0.921875q-0.53125 -0.59375 -0.796875 -1.28125l1.421875 -0.640625q0.375 0.859375 1.078125 1.328125q0.703125 0.46875 1.625 0.46875q0.875 0 1.453125 -0.359375q0.59375 -0.359375 0.59375 -1.046875q0 -0.421875 -0.25 -0.703125q-0.234375 -0.296875 -0.703125 -0.5q-0.453125 -0.21875 -1.125 -0.375l-1.15625 -0.3125q-0.671875 -0.1875 -1.28125 -0.515625q-0.59375 -0.34375 -0.953125 -0.875q-0.359375 -0.53125 -0.359375 -1.296875q0 -0.84375 0.5 -1.453125q0.5 -0.625 1.3125 -0.953125q0.828125 -0.328125 1.765625 -0.328125q0.8125 0 1.53125 0.234375q0.71875 0.234375 1.265625 0.6875q0.546875 0.453125 0.8125 1.125l-1.375 0.640625q-0.359375 -0.703125 -0.953125 -0.984375q-0.59375 -0.28125 -1.328125 -0.28125q-0.78125 0 -1.359375 0.34375q-0.578125 0.34375 -0.578125 0.9375q0 0.59375 0.46875 0.90625q0.484375 0.296875 1.171875 0.46875l1.390625 0.359375q1.390625 0.359375 2.09375 1.0625q0.71875 0.6875 0.71875 1.703125q0 0.890625 -0.515625 1.546875q-0.5 0.65625 -1.359375 1.015625q-0.84375 0.34375 -1.890625 0.34375z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m0 454.21594l161.41733 0l0 47.968506l-161.41733 0z" fill-rule="evenodd"/><path fill="#000000" d="m29.971382 481.13596l0 -13.359375l4.015625 0q2.09375 0 3.609375 0.859375q1.515625 0.84375 2.328125 2.359375q0.828125 1.5 0.828125 3.46875q0 1.953125 -0.828125 3.453125q-0.8125 1.5 -2.328125 2.359375q-1.515625 0.859375 -3.609375 0.859375l-4.015625 0zm1.578125 -1.515625l2.4375 0q1.578125 0 2.734375 -0.59375q1.15625 -0.609375 1.78125 -1.765625q0.640625 -1.15625 0.640625 -2.796875q0 -1.65625 -0.640625 -2.8125q-0.625 -1.15625 -1.78125 -1.75q-1.15625 -0.609375 -2.734375 -0.609375l-2.4375 0l0 10.328125zm15.163467 1.8125q-1.375 0 -2.453125 -0.640625q-1.0625 -0.65625 -1.671875 -1.796875q-0.609375 -1.140625 -0.609375 -2.59375q0 -1.359375 0.5625 -2.515625q0.578125 -1.15625 1.609375 -1.859375q1.03125 -0.703125 2.4375 -0.703125q1.421875 0 2.4375 0.625q1.015625 0.625 1.5625 1.734375q0.546875 1.09375 0.546875 2.515625q0 0.125 -0.015625 0.265625q0 0.125 -0.015625 0.21875l-8.1875 0l0 -1.3125l6.546875 0q-0.015625 -0.390625 -0.1875 -0.84375q-0.15625 -0.46875 -0.5 -0.859375q-0.34375 -0.40625 -0.875 -0.65625q-0.53125 -0.25 -1.3125 -0.25q-0.9375 0 -1.625 0.484375q-0.671875 0.46875 -1.046875 1.296875q-0.359375 0.8125 -0.359375 1.859375q0 1.203125 0.46875 2.015625q0.46875 0.796875 1.203125 1.1875q0.75 0.390625 1.546875 0.390625q1.046875 0 1.71875 -0.484375q0.6875 -0.5 1.09375 -1.234375l1.34375 0.65625q-0.5625 1.078125 -1.609375 1.796875q-1.03125 0.703125 -2.609375 0.703125zm9.409302 0q-1.0625 0 -1.875 -0.40625q-0.796875 -0.40625 -1.25 -1.125q-0.453125 -0.71875 -0.453125 -1.640625q0 -1.046875 0.53125 -1.765625q0.546875 -0.71875 1.453125 -1.078125q0.921875 -0.359375 2.015625 -0.359375q0.640625 0 1.171875 0.109375q0.546875 0.09375 0.9375 0.234375q0.40625 0.140625 0.625 0.265625l0 -0.578125q0 -1.078125 -0.765625 -1.703125q-0.765625 -0.640625 -1.875 -0.640625q-0.78125 0 -1.46875 0.34375q-0.671875 0.34375 -1.0625 0.953125l-1.203125 -0.890625q0.375 -0.5625 0.9375 -0.96875q0.5625 -0.40625 1.28125 -0.625q0.71875 -0.234375 1.515625 -0.234375q1.9375 0 3.03125 1.03125q1.109375 1.015625 1.109375 2.75l0 6.03125l-1.5 0l0 -1.359375l-0.078125 0q-0.25 0.40625 -0.703125 0.796875q-0.4375 0.375 -1.046875 0.609375q-0.609375 0.25 -1.328125 0.25zm0.140625 -1.390625q0.828125 0 1.5 -0.40625q0.6875 -0.421875 1.09375 -1.109375q0.421875 -0.6875 0.421875 -1.515625q-0.4375 -0.296875 -1.078125 -0.484375q-0.640625 -0.1875 -1.40625 -0.1875q-1.359375 0 -2.0 0.5625q-0.640625 0.5625 -0.640625 1.375q0 0.78125 0.59375 1.28125q0.609375 0.484375 1.515625 0.484375zm6.7189636 1.09375l0 -13.359375l1.59375 0l0 13.359375l-1.59375 0zm3.9364624 0l0 -13.359375l1.59375 0l0 13.359375l-1.59375 0zm8.233749 0.296875q-1.4375 0 -2.546875 -0.671875q-1.09375 -0.671875 -1.71875 -1.8125q-0.625 -1.15625 -0.625 -2.5625q0 -1.421875 0.625 -2.5625q0.625 -1.15625 1.71875 -1.828125q1.109375 -0.671875 2.546875 -0.671875q1.4375 0 2.53125 0.6875q1.109375 0.671875 1.734375 1.828125q0.625 1.140625 0.625 2.546875q0 1.40625 -0.625 2.5625q-0.625 1.140625 -1.734375 1.8125q-1.09375 0.671875 -2.53125 0.671875zm0 -1.4375q0.859375 0 1.609375 -0.421875q0.75 -0.4375 1.21875 -1.25q0.46875 -0.8125 0.46875 -1.9375q0 -1.140625 -0.46875 -1.953125q-0.46875 -0.8125 -1.21875 -1.234375q-0.75 -0.4375 -1.609375 -0.4375q-0.859375 0 -1.625 0.4375q-0.765625 0.421875 -1.234375 1.234375q-0.46875 0.8125 -0.46875 1.953125q0 1.125 0.46875 1.9375q0.46875 0.8125 1.234375 1.25q0.765625 0.421875 1.625 0.421875zm11.041 1.4375q-1.40625 0 -2.5 -0.65625q-1.078125 -0.671875 -1.703125 -1.8125q-0.609375 -1.15625 -0.609375 -2.578125q0 -1.46875 0.609375 -2.59375q0.625 -1.140625 1.703125 -1.796875q1.09375 -0.671875 2.5 -0.671875q1.609375 0 2.640625 0.734375q1.03125 0.734375 1.46875 1.890625l-1.4375 0.609375q-0.359375 -0.890625 -1.0625 -1.34375q-0.6875 -0.453125 -1.6875 -0.453125q-0.828125 0 -1.546875 0.453125q-0.71875 0.4375 -1.171875 1.25q-0.453125 0.8125 -0.453125 1.921875q0 1.078125 0.453125 1.90625q0.453125 0.8125 1.171875 1.265625q0.71875 0.4375 1.546875 0.4375q1.015625 0 1.734375 -0.46875q0.734375 -0.46875 1.09375 -1.3125l1.40625 0.59375q-0.46875 1.09375 -1.515625 1.859375q-1.03125 0.765625 -2.640625 0.765625zm8.827347 0q-1.0625 0 -1.875 -0.40625q-0.796875 -0.40625 -1.25 -1.125q-0.453125 -0.71875 -0.453125 -1.640625q0 -1.046875 0.53125 -1.765625q0.546875 -0.71875 1.453125 -1.078125q0.921875 -0.359375 2.015625 -0.359375q0.640625 0 1.171875 0.109375q0.546875 0.09375 0.9375 0.234375q0.40625 0.140625 0.625 0.265625l0 -0.578125q0 -1.078125 -0.765625 -1.703125q-0.765625 -0.640625 -1.875 -0.640625q-0.78125 0 -1.46875 0.34375q-0.671875 0.34375 -1.0625 0.953125l-1.203125 -0.890625q0.375 -0.5625 0.9375 -0.96875q0.5625 -0.40625 1.28125 -0.625q0.71875 -0.234375 1.515625 -0.234375q1.9375 0 3.03125 1.03125q1.109375 1.015625 1.109375 2.75l0 6.03125l-1.5 0l0 -1.359375l-0.078125 0q-0.25 0.40625 -0.703125 0.796875q-0.4375 0.375 -1.046875 0.609375q-0.609375 0.25 -1.328125 0.25zm0.140625 -1.390625q0.828125 0 1.5 -0.40625q0.6875 -0.421875 1.09375 -1.109375q0.421875 -0.6875 0.421875 -1.515625q-0.4375 -0.296875 -1.078125 -0.484375q-0.640625 -0.1875 -1.40625 -0.1875q-1.359375 0 -2.0 0.5625q-0.640625 0.5625 -0.640625 1.375q0 0.78125 0.59375 1.28125q0.609375 0.484375 1.515625 0.484375zm5.876053 -8.421875l5.578125 0l0 1.4375l-5.578125 0l0 -1.4375zm1.671875 7.015625l0 -9.703125l1.578125 0l0 9.3125q0 0.75 0.3125 1.15625q0.3125 0.40625 1.015625 0.40625q0.3125 0 0.578125 -0.09375q0.265625 -0.09375 0.46875 -0.21875l0 1.546875q-0.25 0.109375 -0.546875 0.171875q-0.28125 0.078125 -0.765625 0.078125q-1.1875 0 -1.921875 -0.703125q-0.71875 -0.703125 -0.71875 -1.953125zm6.043503 2.5l0 -9.515625l1.578125 0l0 9.515625l-1.578125 0zm0.78125 -11.265625q-0.46875 0 -0.8125 -0.328125q-0.328125 -0.34375 -0.328125 -0.8125q0 -0.484375 0.328125 -0.8125q0.34375 -0.328125 0.8125 -0.328125q0.484375 0 0.8125 0.328125q0.328125 0.328125 0.328125 0.8125q0 0.46875 -0.328125 0.8125q-0.328125 0.328125 -0.8125 0.328125zm7.5760956 11.5625q-1.4375 0 -2.546875 -0.671875q-1.09375 -0.671875 -1.71875 -1.8125q-0.625 -1.15625 -0.625 -2.5625q0 -1.421875 0.625 -2.5625q0.625 -1.15625 1.71875 -1.828125q1.109375 -0.671875 2.546875 -0.671875q1.4375 0 2.53125 0.6875q1.109375 0.671875 1.734375 1.828125q0.625 1.140625 0.625 2.546875q0 1.40625 -0.625 2.5625q-0.625 1.140625 -1.734375 1.8125q-1.09375 0.671875 -2.53125 0.671875zm0 -1.4375q0.859375 0 1.609375 -0.421875q0.75 -0.4375 1.21875 -1.25q0.46875 -0.8125 0.46875 -1.9375q0 -1.140625 -0.46875 -1.953125q-0.46875 -0.8125 -1.21875 -1.234375q-0.75 -0.4375 -1.609375 -0.4375q-0.859375 0 -1.625 0.4375q-0.765625 0.421875 -1.234375 1.234375q-0.46875 0.8125 -0.46875 1.953125q0 1.125 0.46875 1.9375q0.46875 0.8125 1.234375 1.25q0.765625 0.421875 1.625 0.421875zm6.5418396 1.140625l0 -9.515625l1.515625 0l0 1.40625l0.078125 0q0.375 -0.703125 1.21875 -1.203125q0.84375 -0.5 1.8593826 -0.5q1.75 0 2.625 1.015625q0.890625 1.015625 0.890625 2.703125l0 6.09375l-1.578125 0l0 -5.859375q0 -1.375 -0.671875 -1.9375q-0.65625 -0.578125 -1.7031326 -0.578125q-0.78125 0 -1.375 0.4375q-0.59375 0.4375 -0.9375 1.125q-0.328125 0.6875 -0.328125 1.453125l0 5.359375l-1.59375 0z" fill-rule="nonzero"/><path fill="#fbbc05" d="m163.16655 485.02887l315.59058 0l0 29.039368l-315.59058 0z" fill-rule="evenodd"/><path stroke="#fbbc05" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m163.16655 485.02887l315.59058 0l0 29.039368l-315.59058 0z" fill-rule="evenodd"/><path fill="#000000" d="m246.55295 501.31607l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm9.16745 -0.40625q0 0.890625 -0.25 1.59375q-0.234375 0.703125 -0.703125 1.203125q-0.453125 0.484375 -1.109375 0.75q-0.640625 0.25 -1.453125 0.25q-0.640625 0 -1.3125 -0.109375q-0.65625 -0.125 -1.3125 -0.390625l0 -9.6875l1.25 0l0 2.78125l-0.0625 1.328125q0.546875 -0.71875 1.15625 -1.015625q0.609375 -0.296875 1.3125 -0.296875q0.609375 0 1.078125 0.265625q0.46875 0.25 0.78125 0.71875q0.3125 0.46875 0.46875 1.140625q0.15625 0.65625 0.15625 1.46875zm-1.265625 0.046875q0 -0.5625 -0.09375 -1.03125q-0.078125 -0.46875 -0.265625 -0.796875q-0.171875 -0.34375 -0.453125 -0.53125q-0.265625 -0.1875 -0.65625 -0.1875q-0.234375 0 -0.484375 0.078125q-0.234375 0.0625 -0.5 0.25q-0.265625 0.171875 -0.5625 0.46875q-0.28125 0.28125 -0.609375 0.703125l0 3.484375q0.359375 0.140625 0.734375 0.234375q0.390625 0.078125 0.734375 0.078125q0.421875 0 0.8125 -0.125q0.390625 -0.140625 0.6875 -0.453125q0.296875 -0.328125 0.46875 -0.859375q0.1875 -0.53125 0.1875 -1.3125zm8.964325 3.671875l-1.125 0l-0.03125 -1.15625q-0.328125 0.375 -0.625 0.625q-0.28125 0.234375 -0.578125 0.390625q-0.28125 0.140625 -0.578125 0.203125q-0.28125 0.0625 -0.59375 0.0625q-1.109375 0 -1.6875 -0.640625q-0.5625 -0.65625 -0.5625 -1.96875l0 -4.703125l1.25 0l0 4.59375q0 1.65625 1.25 1.65625q0.21875 0 0.4375 -0.0625q0.21875 -0.078125 0.453125 -0.234375q0.25 -0.171875 0.515625 -0.453125q0.28125 -0.296875 0.625 -0.734375l0 -4.765625l1.25 0l0 7.1875zm8.651825 -8.953125q-0.96875 -0.203125 -1.6875 -0.203125q-1.671875 0 -1.671875 1.75l0 1.265625l3.140625 0l0 1.03125l-3.140625 0l0 5.109375l-1.265625 0l0 -5.109375l-2.3125 0l0 -1.03125l2.3125 0l0 -1.1875q0 -2.875 2.984375 -2.875q0.75 0 1.640625 0.171875l0 1.078125zm-7.515625 1.765625l0 0zm15.5737 -1.765625q-0.96875 -0.203125 -1.6875 -0.203125q-1.671875 0 -1.671875 1.75l0 1.265625l3.140625 0l0 1.03125l-3.140625 0l0 5.109375l-1.265625 0l0 -5.109375l-2.3125 0l0 -1.03125l2.3125 0l0 -1.1875q0 -2.875 2.984375 -2.875q0.75 0 1.640625 0.171875l0 1.078125zm-7.515625 1.765625l0 0zm15.276825 3.21875q0 0.265625 -0.015625 0.453125q0 0.171875 -0.015625 0.328125l-5.046875 0q0 1.09375 0.609375 1.6875q0.625 0.59375 1.78125 0.59375q0.3125 0 0.625 -0.015625q0.3125 -0.03125 0.609375 -0.078125q0.296875 -0.046875 0.5625 -0.09375q0.265625 -0.0625 0.5 -0.125l0 1.03125q-0.515625 0.140625 -1.15625 0.21875q-0.640625 0.09375 -1.328125 0.09375q-0.921875 0 -1.59375 -0.25q-0.65625 -0.25 -1.078125 -0.71875q-0.421875 -0.484375 -0.625 -1.171875q-0.203125 -0.6875 -0.203125 -1.5625q0 -0.765625 0.21875 -1.4375q0.21875 -0.671875 0.625 -1.1875q0.421875 -0.515625 1.03125 -0.8125q0.609375 -0.296875 1.375 -0.296875q0.765625 0 1.34375 0.234375q0.578125 0.234375 0.96875 0.671875q0.40625 0.4375 0.609375 1.0625q0.203125 0.609375 0.203125 1.375zm-1.296875 -0.1875q0.015625 -0.46875 -0.109375 -0.859375q-0.109375 -0.40625 -0.34375 -0.6875q-0.234375 -0.296875 -0.59375 -0.453125q-0.359375 -0.15625 -0.828125 -0.15625q-0.40625 0 -0.75 0.15625q-0.328125 0.15625 -0.578125 0.4375q-0.25 0.28125 -0.40625 0.6875q-0.140625 0.40625 -0.171875 0.875l3.78125 0zm3.5737 -3.03125l1.140625 0l0.03125 1.328125q0.640625 -0.765625 1.25 -1.109375q0.625 -0.34375 1.25 -0.34375q1.125 0 1.703125 0.71875q0.578125 0.71875 0.53125 2.15625l-1.265625 0q0.03125 -0.953125 -0.265625 -1.375q-0.296875 -0.421875 -0.875 -0.421875q-0.25 0 -0.515625 0.09375q-0.25 0.078125 -0.515625 0.28125q-0.265625 0.1875 -0.5625 0.5q-0.296875 0.3125 -0.640625 0.75l0 4.609375l-1.265625 0l0 -7.1875zm12.79245 3.875l-4.28125 0l0 -1.171875l4.28125 0l0 1.171875zm2.66745 -0.125q0 -0.921875 0.25 -1.625q0.25 -0.71875 0.703125 -1.203125q0.46875 -0.5 1.109375 -0.75q0.65625 -0.25 1.453125 -0.25q0.34375 0 0.671875 0.046875q0.328125 0.03125 0.65625 0.125l0 -3.015625l1.25 0l0 10.109375l-1.109375 0l-0.046875 -1.359375q-0.515625 0.765625 -1.125 1.125q-0.609375 0.359375 -1.328125 0.359375q-0.609375 0 -1.078125 -0.25q-0.46875 -0.265625 -0.78125 -0.734375q-0.3125 -0.46875 -0.46875 -1.125q-0.15625 -0.65625 -0.15625 -1.453125zm1.265625 -0.078125q0 1.296875 0.375 1.9375q0.390625 0.640625 1.09375 0.640625q0.484375 0 1.0 -0.421875q0.53125 -0.421875 1.109375 -1.25l0 -3.34375q-0.3125 -0.140625 -0.6875 -0.203125q-0.359375 -0.078125 -0.734375 -0.078125q-1.015625 0 -1.59375 0.65625q-0.5625 0.65625 -0.5625 2.0625zm13.22995 -0.453125q0 0.265625 -0.015625 0.453125q0 0.171875 -0.015625 0.328125l-5.046875 0q0 1.09375 0.609375 1.6875q0.625 0.59375 1.78125 0.59375q0.3125 0 0.625 -0.015625q0.3125 -0.03125 0.609375 -0.078125q0.296875 -0.046875 0.5625 -0.09375q0.265625 -0.0625 0.5 -0.125l0 1.03125q-0.515625 0.140625 -1.15625 0.21875q-0.640625 0.09375 -1.328125 0.09375q-0.921875 0 -1.59375 -0.25q-0.65625 -0.25 -1.078125 -0.71875q-0.421875 -0.484375 -0.625 -1.171875q-0.203125 -0.6875 -0.203125 -1.5625q0 -0.765625 0.21875 -1.4375q0.21875 -0.671875 0.625 -1.1875q0.421875 -0.515625 1.03125 -0.8125q0.609375 -0.296875 1.375 -0.296875q0.765625 0 1.34375 0.234375q0.578125 0.234375 0.96875 0.671875q0.40625 0.4375 0.609375 1.0625q0.203125 0.609375 0.203125 1.375zm-1.296875 -0.1875q0.015625 -0.46875 -0.109375 -0.859375q-0.109375 -0.40625 -0.34375 -0.6875q-0.234375 -0.296875 -0.59375 -0.453125q-0.359375 -0.15625 -0.828125 -0.15625q-0.40625 0 -0.75 0.15625q-0.328125 0.15625 -0.578125 0.4375q-0.25 0.28125 -0.40625 0.6875q-0.140625 0.40625 -0.171875 0.875l3.78125 0zm7.901825 4.15625l-0.03125 -0.96875q-0.59375 0.578125 -1.203125 0.84375q-0.59375 0.25 -1.265625 0.25q-0.609375 0 -1.046875 -0.15625q-0.4375 -0.15625 -0.71875 -0.421875q-0.28125 -0.28125 -0.421875 -0.65625q-0.125 -0.375 -0.125 -0.8125q0 -1.078125 0.796875 -1.6875q0.8125 -0.609375 2.390625 -0.609375l1.484375 0l0 -0.640625q0 -0.625 -0.40625 -1.0q-0.40625 -0.390625 -1.25 -0.390625q-0.609375 0 -1.203125 0.140625q-0.578125 0.125 -1.21875 0.375l0 -1.125q0.25 -0.078125 0.53125 -0.15625q0.296875 -0.09375 0.609375 -0.15625q0.328125 -0.0625 0.671875 -0.09375q0.359375 -0.046875 0.71875 -0.046875q0.640625 0 1.15625 0.140625q0.53125 0.140625 0.890625 0.4375q0.359375 0.296875 0.546875 0.75q0.203125 0.4375 0.203125 1.03125l0 4.953125l-1.109375 0zm-0.140625 -3.265625l-1.578125 0q-0.46875 0 -0.8125 0.09375q-0.328125 0.09375 -0.546875 0.265625q-0.21875 0.171875 -0.328125 0.40625q-0.09375 0.234375 -0.09375 0.546875q0 0.203125 0.0625 0.390625q0.0625 0.1875 0.203125 0.34375q0.15625 0.140625 0.375 0.234375q0.234375 0.078125 0.5625 0.078125q0.4375 0 0.984375 -0.265625q0.5625 -0.265625 1.171875 -0.828125l0 -1.265625zm6.026825 -5.828125l-2.125 0l0 -1.015625l3.390625 0l0 9.078125l2.125 0l0 1.03125l-5.75 0l0 -1.03125l2.359375 0l0 -8.0625zm8.058075 0l-2.125 0l0 -1.015625l3.390625 0l0 9.078125l2.125 0l0 1.03125l-5.75 0l0 -1.03125l2.359375 0l0 -8.0625zm11.870575 5.4375q0 0.84375 -0.25 1.546875q-0.234375 0.6875 -0.671875 1.1875q-0.4375 0.5 -1.078125 0.78125q-0.640625 0.265625 -1.453125 0.265625q-0.765625 0 -1.390625 -0.234375q-0.609375 -0.234375 -1.03125 -0.703125q-0.421875 -0.46875 -0.65625 -1.15625q-0.21875 -0.6875 -0.21875 -1.578125q0 -0.84375 0.234375 -1.53125q0.234375 -0.6875 0.671875 -1.1875q0.453125 -0.5 1.09375 -0.765625q0.640625 -0.28125 1.4375 -0.28125q0.78125 0 1.390625 0.25q0.609375 0.234375 1.03125 0.703125q0.4375 0.453125 0.65625 1.140625q0.234375 0.6875 0.234375 1.5625zm-1.28125 0.0625q0 -0.671875 -0.15625 -1.15625q-0.140625 -0.5 -0.421875 -0.828125q-0.265625 -0.34375 -0.65625 -0.5q-0.375 -0.171875 -0.859375 -0.171875q-0.5625 0 -0.96875 0.21875q-0.390625 0.21875 -0.640625 0.578125q-0.25 0.359375 -0.375 0.84375q-0.109375 0.484375 -0.109375 1.015625q0 0.671875 0.140625 1.171875q0.140625 0.5 0.40625 0.828125q0.28125 0.328125 0.671875 0.5q0.390625 0.171875 0.875 0.171875q0.5625 0 0.953125 -0.21875q0.390625 -0.21875 0.640625 -0.578125q0.265625 -0.375 0.375 -0.859375q0.125 -0.484375 0.125 -1.015625zm8.66745 3.328125q-0.5 0.1875 -1.015625 0.265625q-0.5 0.09375 -1.046875 0.09375q-1.703125 0 -2.625 -0.921875q-0.921875 -0.921875 -0.921875 -2.6875q0 -0.859375 0.265625 -1.546875q0.265625 -0.703125 0.75 -1.1875q0.484375 -0.5 1.140625 -0.765625q0.671875 -0.265625 1.46875 -0.265625q0.5625 0 1.046875 0.078125q0.484375 0.078125 0.9375 0.25l0 1.1875q-0.46875 -0.234375 -0.953125 -0.34375q-0.484375 -0.109375 -1.0 -0.109375q-0.484375 0 -0.90625 0.1875q-0.421875 0.171875 -0.75 0.515625q-0.3125 0.34375 -0.5 0.84375q-0.1875 0.484375 -0.1875 1.109375q0 1.296875 0.625 1.953125q0.640625 0.640625 1.765625 0.640625q0.5 0 0.96875 -0.109375q0.484375 -0.109375 0.9375 -0.34375l0 1.15625zm7.089325 0.265625l-0.03125 -0.96875q-0.59375 0.578125 -1.203125 0.84375q-0.59375 0.25 -1.265625 0.25q-0.609375 0 -1.046875 -0.15625q-0.4375 -0.15625 -0.71875 -0.421875q-0.28125 -0.28125 -0.421875 -0.65625q-0.125 -0.375 -0.125 -0.8125q0 -1.078125 0.796875 -1.6875q0.8125 -0.609375 2.390625 -0.609375l1.484375 0l0 -0.640625q0 -0.625 -0.40625 -1.0q-0.40625 -0.390625 -1.25 -0.390625q-0.609375 0 -1.203125 0.140625q-0.578125 0.125 -1.21875 0.375l0 -1.125q0.25 -0.078125 0.53125 -0.15625q0.296875 -0.09375 0.609375 -0.15625q0.328125 -0.0625 0.671875 -0.09375q0.359375 -0.046875 0.71875 -0.046875q0.640625 0 1.15625 0.140625q0.53125 0.140625 0.890625 0.4375q0.359375 0.296875 0.546875 0.75q0.203125 0.4375 0.203125 1.03125l0 4.953125l-1.109375 0zm-0.140625 -3.265625l-1.578125 0q-0.46875 0 -0.8125 0.09375q-0.328125 0.09375 -0.546875 0.265625q-0.21875 0.171875 -0.328125 0.40625q-0.09375 0.234375 -0.09375 0.546875q0 0.203125 0.0625 0.390625q0.0625 0.1875 0.203125 0.34375q0.15625 0.140625 0.375 0.234375q0.234375 0.078125 0.5625 0.078125q0.4375 0 0.984375 -0.265625q0.5625 -0.265625 1.171875 -0.828125l0 -1.265625zm9.370575 3.171875q-0.421875 0.09375 -0.875 0.140625q-0.453125 0.046875 -0.921875 0.046875q-1.34375 0 -2.015625 -0.609375q-0.65625 -0.609375 -0.65625 -1.875l0 -3.75l-2.015625 0l0 -1.046875l2.015625 0l0 -1.96875l1.234375 -0.328125l0 2.296875l3.234375 0l0 1.046875l-3.234375 0l0 3.65625q0 0.765625 0.40625 1.15625q0.421875 0.375 1.21875 0.375q0.34375 0 0.75 -0.046875q0.40625 -0.0625 0.859375 -0.171875l0 1.078125zm4.714325 -6.0625l-2.125 0l0 -1.03125l3.390625 0l0 6.15625l2.125 0l0 1.03125l-5.75 0l0 -1.03125l2.359375 0l0 -5.125zm0.4375 -4.046875q0.203125 0 0.375 0.078125q0.1875 0.078125 0.3125 0.21875q0.140625 0.125 0.21875 0.3125q0.078125 0.171875 0.078125 0.375q0 0.203125 -0.078125 0.390625q-0.078125 0.171875 -0.21875 0.3125q-0.125 0.125 -0.3125 0.203125q-0.171875 0.078125 -0.375 0.078125q-0.203125 0 -0.390625 -0.078125q-0.171875 -0.078125 -0.3125 -0.203125q-0.125 -0.140625 -0.203125 -0.3125q-0.078125 -0.1875 -0.078125 -0.390625q0 -0.203125 0.078125 -0.375q0.078125 -0.1875 0.203125 -0.3125q0.140625 -0.140625 0.3125 -0.21875q0.1875 -0.078125 0.390625 -0.078125zm11.433075 6.546875q0 0.84375 -0.25 1.546875q-0.234375 0.6875 -0.671875 1.1875q-0.4375 0.5 -1.078125 0.78125q-0.640625 0.265625 -1.453125 0.265625q-0.765625 0 -1.390625 -0.234375q-0.609375 -0.234375 -1.03125 -0.703125q-0.421875 -0.46875 -0.65625 -1.15625q-0.21875 -0.6875 -0.21875 -1.578125q0 -0.84375 0.234375 -1.53125q0.234375 -0.6875 0.671875 -1.1875q0.453125 -0.5 1.09375 -0.765625q0.640625 -0.28125 1.4375 -0.28125q0.78125 0 1.390625 0.25q0.609375 0.234375 1.03125 0.703125q0.4375 0.453125 0.65625 1.140625q0.234375 0.6875 0.234375 1.5625zm-1.28125 0.0625q0 -0.671875 -0.15625 -1.15625q-0.140625 -0.5 -0.421875 -0.828125q-0.265625 -0.34375 -0.65625 -0.5q-0.375 -0.171875 -0.859375 -0.171875q-0.5625 0 -0.96875 0.21875q-0.390625 0.21875 -0.640625 0.578125q-0.25 0.359375 -0.375 0.84375q-0.109375 0.484375 -0.109375 1.015625q0 0.671875 0.140625 1.171875q0.140625 0.5 0.40625 0.828125q0.28125 0.328125 0.671875 0.5q0.390625 0.171875 0.875 0.171875q0.5625 0 0.953125 -0.21875q0.390625 -0.21875 0.640625 -0.578125q0.265625 -0.375 0.375 -0.859375q0.125 -0.484375 0.125 -1.015625zm3.0737 -3.59375l1.109375 0l0.046875 1.15625q0.328125 -0.359375 0.609375 -0.609375q0.296875 -0.25 0.578125 -0.390625q0.28125 -0.15625 0.578125 -0.21875q0.296875 -0.0625 0.609375 -0.0625q1.109375 0 1.671875 0.65625q0.578125 0.65625 0.578125 1.96875l0 4.6875l-1.25 0l0 -4.59375q0 -0.84375 -0.3125 -1.234375q-0.3125 -0.40625 -0.9375 -0.40625q-0.234375 0 -0.453125 0.0625q-0.21875 0.0625 -0.453125 0.234375q-0.234375 0.171875 -0.515625 0.453125q-0.265625 0.28125 -0.609375 0.71875l0 4.765625l-1.25 0l0 -7.1875z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m162.69989 484.61154l316.7874 30.11026" fill-rule="evenodd"/><path stroke="#595959" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m162.69989 484.61154l316.7874 30.11026" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m163.29161 514.21515l314.39368 -27.559082" fill-rule="evenodd"/><path stroke="#595959" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m163.29161 514.21515l314.39368 -27.559082" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m77.98062 65.06823l0 59.590553" fill-rule="evenodd"/><path stroke="#595959" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m77.98062 65.06823l0 53.590553" fill-rule="evenodd"/><path fill="#595959" stroke="#595959" stroke-width="1.0" stroke-linecap="butt" d="m76.32889 118.65878l1.6517334 4.5380936l1.6517334 -4.5380936z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m78.50435 160.0866l0 156.88188" fill-rule="evenodd"/><path stroke="#595959" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m78.50435 160.08658l0 150.8819" fill-rule="evenodd"/><path fill="#595959" stroke="#595959" stroke-width="1.0" stroke-linecap="butt" d="m76.852615 310.96848l1.6517334 4.5381165l1.6517334 -4.5381165z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m81.09268 363.24664l0 104.787415" fill-rule="evenodd"/><path stroke="#595959" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m81.09268 363.24664l0 98.787384" fill-rule="evenodd"/><path fill="#595959" stroke="#595959" stroke-width="1.0" stroke-linecap="butt" d="m79.44095 462.03403l1.6517334 4.5381165l1.6517334 -4.5381165z" fill-rule="evenodd"/><path fill="#ea4335" d="m163.3038 71.25196l315.59055 0l0 29.039368l-315.59055 0z" fill-rule="evenodd"/><path stroke="#ea4335" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m163.3038 71.25196l315.59055 0l0 29.039368l-315.59055 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m224.40218 79.966805l193.39381 0l0 17.16098l-193.39381 0l0 -17.16098z" fill-rule="nonzero"/><path fill="#ffffff" d="m226.19905 87.72665l0 -1.546875l4.453125 0l0 1.546875l-4.453125 0zm13.589325 -1.0625q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0zm5.85495 -4.3125l-2.0625 0l0 -1.359375l3.84375 0l0 8.75l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -7.390625zm9.995575 -0.515625q0 0.234375 -0.09375 0.453125q-0.078125 0.203125 -0.234375 0.359375q-0.15625 0.15625 -0.375 0.25q-0.203125 0.078125 -0.4375 0.078125q-0.25 0 -0.46875 -0.078125q-0.203125 -0.09375 -0.359375 -0.25q-0.15625 -0.15625 -0.25 -0.359375q-0.078125 -0.21875 -0.078125 -0.453125q0 -0.234375 0.078125 -0.4375q0.09375 -0.203125 0.25 -0.359375q0.15625 -0.15625 0.359375 -0.25q0.21875 -0.09375 0.46875 -0.09375q0.234375 0 0.4375 0.09375q0.21875 0.09375 0.375 0.25q0.15625 0.15625 0.234375 0.359375q0.09375 0.203125 0.09375 0.4375zm-1.9375 3.359375l-2.0625 0l0 -1.359375l3.84375 0l0 5.90625l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -4.546875zm10.83934 5.90625l0 -4.953125q0 -0.265625 -0.015625 -0.46875q-0.015625 -0.203125 -0.0625 -0.34375q-0.03125 -0.140625 -0.125 -0.203125q-0.078125 -0.078125 -0.21875 -0.078125q-0.21875 0 -0.4375 0.3125q-0.21875 0.296875 -0.484375 1.015625l0 4.71875l-1.5 0l0 -4.953125q0 -0.265625 -0.015625 -0.46875q-0.015625 -0.203125 -0.0625 -0.34375q-0.03125 -0.140625 -0.125 -0.203125q-0.078125 -0.078125 -0.21875 -0.078125q-0.1875 0 -0.421875 0.3125q-0.21875 0.296875 -0.5 1.015625l0 4.71875l-1.515625 0l0 -7.265625l1.296875 0l0.03125 1.046875q0.15625 -0.3125 0.3125 -0.53125q0.15625 -0.234375 0.328125 -0.375q0.1875 -0.15625 0.390625 -0.21875q0.203125 -0.0625 0.453125 -0.0625q0.609375 0 0.921875 0.3125q0.328125 0.3125 0.375 0.875q0.171875 -0.3125 0.328125 -0.53125q0.171875 -0.234375 0.359375 -0.375q0.1875 -0.15625 0.390625 -0.21875q0.21875 -0.0625 0.46875 -0.0625q0.796875 0 1.171875 0.546875q0.390625 0.546875 0.390625 1.640625l0 5.21875l-1.515625 0zm7.214325 -9.265625q0 0.234375 -0.09375 0.453125q-0.078125 0.203125 -0.234375 0.359375q-0.15625 0.15625 -0.375 0.25q-0.203125 0.078125 -0.4375 0.078125q-0.25 0 -0.46875 -0.078125q-0.203125 -0.09375 -0.359375 -0.25q-0.15625 -0.15625 -0.25 -0.359375q-0.078125 -0.21875 -0.078125 -0.453125q0 -0.234375 0.078125 -0.4375q0.09375 -0.203125 0.25 -0.359375q0.15625 -0.15625 0.359375 -0.25q0.21875 -0.09375 0.46875 -0.09375q0.234375 0 0.4375 0.09375q0.21875 0.09375 0.375 0.25q0.15625 0.15625 0.234375 0.359375q0.09375 0.203125 0.09375 0.4375zm-1.9375 3.359375l-2.0625 0l0 -1.359375l3.84375 0l0 5.90625l2.109375 0l0 1.359375l-6.203125 0l0 -1.359375l2.3125 0l0 -4.546875zm10.04245 5.90625l0 -4.734375q0 -1.203125 -0.890625 -1.203125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 4.59375l-1.734375 0l0 -7.265625l1.5 0l0.046875 1.0625q0.21875 -0.265625 0.453125 -0.5q0.25 -0.234375 0.53125 -0.390625q0.28125 -0.15625 0.59375 -0.234375q0.328125 -0.09375 0.734375 -0.09375q0.546875 0 0.953125 0.1875q0.421875 0.171875 0.703125 0.515625q0.28125 0.328125 0.421875 0.796875q0.140625 0.46875 0.140625 1.046875l0 4.875l-1.75 0zm8.276825 0l-0.03125 -0.953125q-0.234375 0.25 -0.484375 0.453125q-0.25 0.203125 -0.5625 0.359375q-0.296875 0.140625 -0.65625 0.21875q-0.34375 0.09375 -0.765625 0.09375q-0.5625 0 -0.984375 -0.171875q-0.421875 -0.171875 -0.703125 -0.453125q-0.28125 -0.296875 -0.4375 -0.703125q-0.140625 -0.421875 -0.140625 -0.921875q0 -0.515625 0.21875 -0.953125q0.21875 -0.4375 0.65625 -0.75q0.453125 -0.3125 1.125 -0.484375q0.671875 -0.1875 1.5625 -0.1875l0.953125 0l0 -0.4375q0 -0.28125 -0.078125 -0.5q-0.078125 -0.234375 -0.25 -0.390625q-0.171875 -0.15625 -0.453125 -0.234375q-0.28125 -0.09375 -0.6875 -0.09375q-0.640625 0 -1.265625 0.15625q-0.625 0.140625 -1.21875 0.40625l0 -1.40625q0.53125 -0.203125 1.203125 -0.328125q0.6875 -0.140625 1.421875 -0.140625q0.8125 0 1.390625 0.15625q0.578125 0.140625 0.953125 0.453125q0.375 0.3125 0.546875 0.78125q0.1875 0.453125 0.1875 1.0625l0 4.96875l-1.5 0zm-0.25 -3.21875l-1.0625 0q-0.4375 0 -0.75 0.09375q-0.3125 0.078125 -0.5 0.234375q-0.1875 0.15625 -0.28125 0.359375q-0.09375 0.1875 -0.09375 0.40625q0 0.4375 0.28125 0.671875q0.296875 0.234375 0.78125 0.234375q0.375 0 0.765625 -0.265625q0.390625 -0.265625 0.859375 -0.75l0 -0.984375zm9.745575 3.140625q-0.46875 0.109375 -0.953125 0.171875q-0.46875 0.078125 -0.90625 0.078125q-0.703125 0 -1.234375 -0.15625q-0.515625 -0.15625 -0.859375 -0.453125q-0.328125 -0.3125 -0.5 -0.78125q-0.15625 -0.484375 -0.15625 -1.140625l0 -3.546875l-1.953125 0l0 -1.359375l1.953125 0l0 -1.859375l1.796875 -0.46875l0 2.328125l2.8125 0l0 1.359375l-2.8125 0l0 3.421875q0 0.609375 0.28125 0.9375q0.28125 0.3125 0.953125 0.3125q0.4375 0 0.84375 -0.0625q0.40625 -0.078125 0.734375 -0.171875l0 1.390625zm8.401825 -4.109375q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0zm4.308075 1.3125l0 -1.546875l4.453125 0l0 1.546875l-4.453125 0zm13.589325 -1.0625q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0zm8.6362 4.4375l0 -4.953125q0 -0.265625 -0.015625 -0.46875q-0.015625 -0.203125 -0.0625 -0.34375q-0.03125 -0.140625 -0.125 -0.203125q-0.078125 -0.078125 -0.21875 -0.078125q-0.21875 0 -0.4375 0.3125q-0.21875 0.296875 -0.484375 1.015625l0 4.71875l-1.5 0l0 -4.953125q0 -0.265625 -0.015625 -0.46875q-0.015625 -0.203125 -0.0625 -0.34375q-0.03125 -0.140625 -0.125 -0.203125q-0.078125 -0.078125 -0.21875 -0.078125q-0.1875 0 -0.421875 0.3125q-0.21875 0.296875 -0.5 1.015625l0 4.71875l-1.515625 0l0 -7.265625l1.296875 0l0.03125 1.046875q0.15625 -0.3125 0.3125 -0.53125q0.15625 -0.234375 0.328125 -0.375q0.1875 -0.15625 0.390625 -0.21875q0.203125 -0.0625 0.453125 -0.0625q0.609375 0 0.921875 0.3125q0.328125 0.3125 0.375 0.875q0.171875 -0.3125 0.328125 -0.53125q0.171875 -0.234375 0.359375 -0.375q0.1875 -0.15625 0.390625 -0.21875q0.21875 -0.0625 0.46875 -0.0625q0.796875 0 1.171875 0.546875q0.390625 0.546875 0.390625 1.640625l0 5.21875l-1.515625 0zm9.464325 -3.765625q0 1.0 -0.28125 1.734375q-0.28125 0.734375 -0.78125 1.21875q-0.5 0.46875 -1.1875 0.703125q-0.6875 0.234375 -1.5 0.234375q-0.28125 0 -0.546875 -0.03125q-0.265625 -0.03125 -0.5 -0.09375l0 2.84375l-1.734375 0l0 -10.109375l1.5 0l0.046875 1.0625q0.21875 -0.265625 0.453125 -0.5q0.25 -0.234375 0.53125 -0.390625q0.28125 -0.15625 0.59375 -0.234375q0.328125 -0.09375 0.734375 -0.09375q0.640625 0 1.140625 0.265625q0.5 0.25 0.828125 0.71875q0.34375 0.46875 0.515625 1.15625q0.1875 0.671875 0.1875 1.515625zm-1.828125 0.078125q0 -0.609375 -0.09375 -1.03125q-0.078125 -0.421875 -0.25 -0.6875q-0.15625 -0.28125 -0.390625 -0.40625q-0.21875 -0.125 -0.53125 -0.125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 3.171875q0.21875 0.078125 0.515625 0.125q0.3125 0.046875 0.625 0.046875q0.40625 0 0.75 -0.171875q0.34375 -0.171875 0.578125 -0.484375q0.25 -0.3125 0.375 -0.765625q0.125 -0.453125 0.125 -1.015625zm9.339325 3.609375q-0.46875 0.109375 -0.953125 0.171875q-0.46875 0.078125 -0.90625 0.078125q-0.703125 0 -1.234375 -0.15625q-0.515625 -0.15625 -0.859375 -0.453125q-0.328125 -0.3125 -0.5 -0.78125q-0.15625 -0.484375 -0.15625 -1.140625l0 -3.546875l-1.953125 0l0 -1.359375l1.953125 0l0 -1.859375l1.796875 -0.46875l0 2.328125l2.8125 0l0 1.359375l-2.8125 0l0 3.421875q0 0.609375 0.28125 0.9375q0.28125 0.3125 0.953125 0.3125q0.4375 0 0.84375 -0.0625q0.40625 -0.078125 0.734375 -0.171875l0 1.390625zm6.4487 -0.71875q-0.390625 0.953125 -0.796875 1.65625q-0.40625 0.71875 -0.90625 1.1875q-0.5 0.484375 -1.109375 0.71875q-0.59375 0.234375 -1.34375 0.234375q-0.1875 0 -0.40625 -0.015625q-0.203125 -0.015625 -0.421875 -0.046875l0 -1.484375q0.09375 0.015625 0.203125 0.03125q0.109375 0.015625 0.21875 0.03125q0.125 0.015625 0.25 0.015625q0.125 0 0.234375 0q0.3125 0 0.5625 -0.109375q0.265625 -0.109375 0.484375 -0.3125q0.21875 -0.203125 0.390625 -0.484375q0.171875 -0.28125 0.3125 -0.625l-2.875 -7.265625l1.9375 0l1.515625 4.109375l0.453125 1.34375l0.4375 -1.28125l1.53125 -4.171875l1.875 0l-2.546875 6.46875zm4.47995 -2.328125l0 -1.546875l4.453125 0l0 1.546875l-4.453125 0zm13.245575 3.046875q-0.46875 0.109375 -0.953125 0.171875q-0.46875 0.078125 -0.90625 0.078125q-0.703125 0 -1.234375 -0.15625q-0.515625 -0.15625 -0.859375 -0.453125q-0.328125 -0.3125 -0.5 -0.78125q-0.15625 -0.484375 -0.15625 -1.140625l0 -3.546875l-1.953125 0l0 -1.359375l1.953125 0l0 -1.859375l1.796875 -0.46875l0 2.328125l2.8125 0l0 1.359375l-2.8125 0l0 3.421875q0 0.609375 0.28125 0.9375q0.28125 0.3125 0.953125 0.3125q0.4375 0 0.84375 -0.0625q0.40625 -0.078125 0.734375 -0.171875l0 1.390625zm8.401825 -4.109375q0 0.171875 -0.015625 0.46875q0 0.28125 -0.03125 0.53125l-4.78125 0q0 0.46875 0.140625 0.84375q0.15625 0.359375 0.421875 0.609375q0.265625 0.25 0.640625 0.375q0.390625 0.125 0.84375 0.125q0.53125 0 1.125 -0.078125q0.609375 -0.078125 1.265625 -0.265625l0 1.390625q-0.28125 0.078125 -0.609375 0.140625q-0.328125 0.0625 -0.671875 0.109375q-0.34375 0.046875 -0.703125 0.078125q-0.34375 0.03125 -0.671875 0.03125q-0.828125 0 -1.5 -0.234375q-0.65625 -0.25 -1.109375 -0.703125q-0.453125 -0.46875 -0.703125 -1.140625q-0.234375 -0.6875 -0.234375 -1.578125q0 -0.890625 0.234375 -1.609375q0.25 -0.734375 0.6875 -1.25q0.453125 -0.515625 1.078125 -0.796875q0.640625 -0.28125 1.421875 -0.28125q0.765625 0 1.359375 0.234375q0.59375 0.234375 0.984375 0.671875q0.40625 0.421875 0.609375 1.015625q0.21875 0.59375 0.21875 1.3125zm-1.78125 -0.25q0.015625 -0.4375 -0.109375 -0.75q-0.109375 -0.328125 -0.3125 -0.53125q-0.1875 -0.203125 -0.453125 -0.296875q-0.25 -0.109375 -0.5625 -0.109375q-0.65625 0 -1.078125 0.4375q-0.421875 0.4375 -0.5 1.25l3.015625 0zm7.839325 4.4375l0 -4.734375q0 -1.203125 -0.890625 -1.203125q-0.4375 0 -0.84375 0.359375q-0.40625 0.359375 -0.859375 0.984375l0 4.59375l-1.734375 0l0 -7.265625l1.5 0l0.046875 1.0625q0.21875 -0.265625 0.453125 -0.5q0.25 -0.234375 0.53125 -0.390625q0.28125 -0.15625 0.59375 -0.234375q0.328125 -0.09375 0.734375 -0.09375q0.546875 0 0.953125 0.1875q0.421875 0.171875 0.703125 0.515625q0.28125 0.328125 0.421875 0.796875q0.140625 0.46875 0.140625 1.046875l0 4.875l-1.75 0zm9.714325 -2.140625q0 0.625 -0.28125 1.0625q-0.265625 0.4375 -0.71875 0.71875q-0.453125 0.28125 -1.03125 0.40625q-0.578125 0.125 -1.171875 0.125q-0.796875 0 -1.453125 -0.078125q-0.640625 -0.078125 -1.21875 -0.21875l0 -1.578125q0.6875 0.28125 1.34375 0.40625q0.671875 0.125 1.265625 0.125q0.6875 0 1.03125 -0.21875q0.34375 -0.21875 0.34375 -0.5625q0 -0.171875 -0.078125 -0.296875q-0.078125 -0.140625 -0.28125 -0.265625q-0.1875 -0.125 -0.546875 -0.25q-0.359375 -0.140625 -0.9375 -0.3125q-0.53125 -0.15625 -0.9375 -0.34375q-0.40625 -0.1875 -0.671875 -0.4375q-0.265625 -0.265625 -0.40625 -0.59375q-0.125 -0.34375 -0.125 -0.8125q0 -0.4375 0.203125 -0.828125q0.203125 -0.390625 0.59375 -0.6875q0.40625 -0.296875 1.0 -0.46875q0.59375 -0.171875 1.390625 -0.171875q0.6875 0 1.21875 0.078125q0.53125 0.0625 0.9375 0.140625l0 1.421875q-0.625 -0.1875 -1.171875 -0.265625q-0.546875 -0.09375 -1.09375 -0.09375q-0.53125 0 -0.859375 0.203125q-0.328125 0.1875 -0.328125 0.53125q0 0.15625 0.0625 0.296875q0.078125 0.125 0.265625 0.25q0.1875 0.109375 0.515625 0.25q0.34375 0.125 0.90625 0.28125q0.625 0.1875 1.046875 0.390625q0.4375 0.203125 0.703125 0.46875q0.265625 0.25 0.375 0.578125q0.109375 0.328125 0.109375 0.75zm8.620575 -1.546875q0 0.859375 -0.25 1.578125q-0.234375 0.703125 -0.703125 1.21875q-0.453125 0.5 -1.125 0.78125q-0.65625 0.28125 -1.515625 0.28125q-0.796875 0 -1.4375 -0.234375q-0.640625 -0.25 -1.09375 -0.71875q-0.4375 -0.46875 -0.671875 -1.171875q-0.234375 -0.703125 -0.234375 -1.640625q0 -0.859375 0.25 -1.5625q0.25 -0.71875 0.703125 -1.21875q0.46875 -0.5 1.125 -0.765625q0.671875 -0.28125 1.5 -0.28125q0.828125 0 1.453125 0.234375q0.640625 0.234375 1.078125 0.71875q0.453125 0.46875 0.6875 1.171875q0.234375 0.6875 0.234375 1.609375zm-1.828125 0.03125q0 -1.125 -0.421875 -1.6875q-0.421875 -0.578125 -1.25 -0.578125q-0.46875 0 -0.796875 0.1875q-0.3125 0.171875 -0.515625 0.484375q-0.203125 0.3125 -0.3125 0.734375q-0.09375 0.40625 -0.09375 0.875q0 1.140625 0.453125 1.71875q0.46875 0.578125 1.265625 0.578125q0.4375 0 0.75 -0.171875q0.328125 -0.1875 0.53125 -0.484375q0.203125 -0.3125 0.296875 -0.734375q0.09375 -0.421875 0.09375 -0.921875zm8.0112 -0.78125q0.015625 -0.421875 -0.046875 -0.703125q-0.046875 -0.28125 -0.171875 -0.46875q-0.109375 -0.1875 -0.28125 -0.265625q-0.171875 -0.09375 -0.390625 -0.09375q-0.390625 0 -0.796875 0.328125q-0.40625 0.3125 -0.90625 1.046875l0 4.59375l-1.796875 0l0 -7.265625l1.59375 0l0.0625 1.046875q0.171875 -0.265625 0.390625 -0.484375q0.234375 -0.234375 0.5 -0.390625q0.28125 -0.15625 0.609375 -0.234375q0.34375 -0.09375 0.75 -0.09375q0.546875 0 0.96875 0.1875q0.4375 0.1875 0.71875 0.5625q0.296875 0.359375 0.4375 0.921875q0.140625 0.5625 0.109375 1.3125l-1.75 0zm9.370575 2.296875q0 0.625 -0.28125 1.0625q-0.265625 0.4375 -0.71875 0.71875q-0.453125 0.28125 -1.03125 0.40625q-0.578125 0.125 -1.171875 0.125q-0.796875 0 -1.453125 -0.078125q-0.640625 -0.078125 -1.21875 -0.21875l0 -1.578125q0.6875 0.28125 1.34375 0.40625q0.671875 0.125 1.265625 0.125q0.6875 0 1.03125 -0.21875q0.34375 -0.21875 0.34375 -0.5625q0 -0.171875 -0.078125 -0.296875q-0.078125 -0.140625 -0.28125 -0.265625q-0.1875 -0.125 -0.546875 -0.25q-0.359375 -0.140625 -0.9375 -0.3125q-0.53125 -0.15625 -0.9375 -0.34375q-0.40625 -0.1875 -0.671875 -0.4375q-0.265625 -0.265625 -0.40625 -0.59375q-0.125 -0.34375 -0.125 -0.8125q0 -0.4375 0.203125 -0.828125q0.203125 -0.390625 0.59375 -0.6875q0.40625 -0.296875 1.0 -0.46875q0.59375 -0.171875 1.390625 -0.171875q0.6875 0 1.21875 0.078125q0.53125 0.0625 0.9375 0.140625l0 1.421875q-0.625 -0.1875 -1.171875 -0.265625q-0.546875 -0.09375 -1.09375 -0.09375q-0.53125 0 -0.859375 0.203125q-0.328125 0.1875 -0.328125 0.53125q0 0.15625 0.0625 0.296875q0.078125 0.125 0.265625 0.25q0.1875 0.109375 0.515625 0.25q0.34375 0.125 0.90625 0.28125q0.625 0.1875 1.046875 0.390625q0.4375 0.203125 0.703125 0.46875q0.265625 0.25 0.375 0.578125q0.109375 0.328125 0.109375 0.75z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m417.796 79.966805l0 0l0 17.16098l0 0l0 -17.16098z" fill-rule="nonzero"/><path fill="#ffffff" d="m224.40218 91.80498l193.39381 0l0 1.3199997l-193.39381 0l0 -1.3199997z" fill-rule="nonzero"/></g></svg>
+\ No newline at end of file
diff --git a/mlir/docs/includes/img/bufferization_tensor_insert_dst.svg b/mlir/docs/includes/img/bufferization_tensor_insert_dst.svg
new file mode 100644
index 000000000000..228b19e92299
--- /dev/null
+++ b/mlir/docs/includes/img/bufferization_tensor_insert_dst.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 624.0 192.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l624.0 0l0 192.0l-624.0 0l0 -192.0z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l624.0 0l0 192.0l-624.0 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m81.125984 50.27559l681.22833 0l0 73.732285l-681.22833 0z" fill-rule="evenodd"/><path fill="#616161" d="m100.20411 64.32059l-8.40625 12.875l-1.484375 0l8.375 -12.875l1.515625 0zm-4.859375 2.546875q0 0.59375 -0.171875 1.109375q-0.171875 0.5 -0.5 0.875q-0.3125 0.359375 -0.78125 0.578125q-0.453125 0.21875 -1.03125 0.21875q-0.546875 0 -1.0 -0.171875q-0.4375 -0.171875 -0.75 -0.515625q-0.3125 -0.34375 -0.484375 -0.84375q-0.15625 -0.5 -0.15625 -1.15625q0 -0.59375 0.171875 -1.09375q0.171875 -0.5 0.484375 -0.875q0.328125 -0.375 0.78125 -0.578125q0.46875 -0.21875 1.03125 -0.21875q0.5625 0 1.0 0.171875q0.4375 0.171875 0.75 0.515625q0.3125 0.328125 0.484375 0.828125q0.171875 0.5 0.171875 1.15625zm-1.34375 0.046875q0 -0.796875 -0.296875 -1.171875q-0.28125 -0.390625 -0.796875 -0.390625q-0.265625 0 -0.46875 0.125q-0.203125 0.109375 -0.34375 0.328125q-0.140625 0.203125 -0.21875 0.5q-0.078125 0.28125 -0.078125 0.609375q0 0.8125 0.296875 1.203125q0.296875 0.375 0.8125 0.375q0.265625 0 0.46875 -0.125q0.203125 -0.125 0.34375 -0.328125q0.140625 -0.203125 0.203125 -0.484375q0.078125 -0.296875 0.078125 -0.640625zm6.046875 7.65625q0 0.578125 -0.171875 1.09375q-0.171875 0.5 -0.5 0.875q-0.328125 0.359375 -0.796875 0.578125q-0.453125 0.203125 -1.015625 0.203125q-0.546875 0 -1.0 -0.171875q-0.4375 -0.171875 -0.75 -0.5q-0.3125 -0.34375 -0.484375 -0.84375q-0.171875 -0.515625 -0.171875 -1.171875q0 -0.578125 0.171875 -1.078125q0.1875 -0.515625 0.5 -0.890625q0.328125 -0.375 0.78125 -0.578125q0.46875 -0.203125 1.03125 -0.203125q0.5625 0 1.0 0.171875q0.453125 0.171875 0.765625 0.515625q0.3125 0.328125 0.46875 0.84375q0.171875 0.5 0.171875 1.15625zm-1.328125 0.03125q0 -0.796875 -0.296875 -1.171875q-0.296875 -0.390625 -0.8125 -0.390625q-0.265625 0 -0.46875 0.125q-0.203125 0.109375 -0.359375 0.328125q-0.140625 0.203125 -0.21875 0.484375q-0.0625 0.28125 -0.0625 0.625q0 0.796875 0.28125 1.1875q0.296875 0.375 0.828125 0.375q0.265625 0 0.46875 -0.109375q0.203125 -0.125 0.34375 -0.328125q0.140625 -0.21875 0.21875 -0.5q0.078125 -0.296875 0.078125 -0.625z" fill-rule="nonzero"/><path fill="#c53929" d="m102.2114 68.05496l1.453125 0l0.046875 1.6875q0.8125 -0.984375 1.59375 -1.421875q0.796875 -0.4375 1.59375 -0.4375q1.421875 0 2.15625 0.921875q0.734375 0.921875 0.671875 2.734375l-1.59375 0q0.015625 -1.203125 -0.359375 -1.734375q-0.375 -0.546875 -1.109375 -0.546875q-0.3125 0 -0.640625 0.109375q-0.328125 0.109375 -0.671875 0.359375q-0.328125 0.25 -0.71875 0.640625q-0.375 0.390625 -0.8125 0.953125l0 5.875l-1.609375 0l0 -9.140625z" fill-rule="nonzero"/><path fill="#616161" d="m129.94475 71.30496l-7.8281326 0l0 -1.328125l7.8281326 0l0 1.328125zm0 3.171875l-7.8281326 0l0 -1.328125l7.8281326 0l0 1.328125z" fill-rule="nonzero"/><path fill="#000000" d="m150.24059 77.07059q-0.53125 0.140625 -1.109375 0.1875q-0.578125 0.0625 -1.171875 0.0625q-1.71875 0 -2.5625 -0.78125q-0.84375 -0.78125 -0.84375 -2.390625l0 -4.765625l-2.5625 0l0 -1.328125l2.5625 0l0 -2.515625l1.578125 -0.40625l0 2.921875l4.109375 0l0 1.328125l-4.109375 0l0 4.640625q0 0.984375 0.515625 1.46875q0.53125 0.484375 1.546875 0.484375q0.4375 0 0.953125 -0.0625q0.53125 -0.0625 1.09375 -0.21875l0 1.375zm10.616669 -4.9375q0 0.34375 -0.015625 0.578125q0 0.21875 -0.03125 0.421875l-6.421875 0q0 1.40625 0.78125 2.15625q0.796875 0.75 2.265625 0.75q0.40625 0 0.796875 -0.03125q0.40625 -0.03125 0.78125 -0.078125q0.375 -0.0625 0.71875 -0.125q0.34375 -0.078125 0.625 -0.15625l0 1.296875q-0.640625 0.1875 -1.46875 0.296875q-0.8125 0.109375 -1.6875 0.109375q-1.171875 0 -2.015625 -0.3125q-0.84375 -0.3125 -1.390625 -0.921875q-0.546875 -0.609375 -0.8125 -1.484375q-0.25 -0.875 -0.25 -2.0q0 -0.953125 0.28125 -1.8125q0.28125 -0.859375 0.8125 -1.515625q0.53125 -0.65625 1.296875 -1.03125q0.78125 -0.390625 1.765625 -0.390625q0.953125 0 1.6875 0.3125q0.75 0.296875 1.25 0.84375q0.5 0.546875 0.765625 1.34375q0.265625 0.78125 0.265625 1.75zm-1.65625 -0.21875q0.03125 -0.609375 -0.125 -1.109375q-0.140625 -0.515625 -0.453125 -0.875q-0.296875 -0.375 -0.75 -0.578125q-0.453125 -0.203125 -1.0625 -0.203125q-0.515625 0 -0.953125 0.203125q-0.421875 0.203125 -0.734375 0.5625q-0.296875 0.359375 -0.5 0.875q-0.1875 0.515625 -0.234375 1.125l4.8125 0zm4.1791687 -3.859375l1.421875 0l0.0625 1.46875q0.390625 -0.46875 0.765625 -0.78125q0.375 -0.3125 0.734375 -0.5q0.359375 -0.203125 0.734375 -0.28125q0.375 -0.078125 0.78125 -0.078125q1.40625 0 2.125 0.84375q0.734375 0.828125 0.734375 2.5l0 5.96875l-1.59375 0l0 -5.84375q0 -1.078125 -0.40625 -1.578125q-0.390625 -0.515625 -1.1875 -0.515625q-0.28125 0 -0.5625 0.09375q-0.28125 0.078125 -0.578125 0.296875q-0.296875 0.203125 -0.65625 0.578125q-0.34375 0.359375 -0.78125 0.90625l0 6.0625l-1.59375 0l0 -9.140625zm17.507294 6.640625q0 0.484375 -0.171875 0.875q-0.15625 0.375 -0.4375 0.671875q-0.28125 0.296875 -0.65625 0.515625q-0.375 0.203125 -0.796875 0.34375q-0.421875 0.125 -0.875 0.1875q-0.453125 0.0625 -0.875 0.0625q-0.921875 0 -1.703125 -0.078125q-0.78125 -0.078125 -1.53125 -0.265625l0 -1.453125q0.796875 0.234375 1.59375 0.359375q0.796875 0.109375 1.578125 0.109375q1.140625 0 1.6875 -0.3125q0.546875 -0.3125 0.546875 -0.890625q0 -0.234375 -0.09375 -0.421875q-0.078125 -0.203125 -0.3125 -0.375q-0.21875 -0.1875 -0.703125 -0.375q-0.484375 -0.203125 -1.3125 -0.4375q-0.609375 -0.1875 -1.140625 -0.421875q-0.515625 -0.234375 -0.90625 -0.546875q-0.375 -0.328125 -0.59375 -0.75q-0.21875 -0.421875 -0.21875 -1.015625q0 -0.375 0.171875 -0.828125q0.171875 -0.453125 0.59375 -0.84375q0.4375 -0.40625 1.171875 -0.65625q0.734375 -0.265625 1.8125 -0.265625q0.546875 0 1.203125 0.0625q0.65625 0.0625 1.359375 0.203125l0 1.40625q-0.75 -0.171875 -1.421875 -0.25q-0.65625 -0.09375 -1.15625 -0.09375q-0.59375 0 -1.0 0.09375q-0.390625 0.078125 -0.65625 0.25q-0.25 0.15625 -0.359375 0.375q-0.109375 0.203125 -0.109375 0.453125q0 0.25 0.09375 0.453125q0.109375 0.1875 0.359375 0.375q0.265625 0.1875 0.71875 0.375q0.46875 0.1875 1.21875 0.40625q0.8125 0.234375 1.359375 0.5q0.5625 0.25 0.90625 0.578125q0.359375 0.3125 0.5 0.71875q0.15625 0.40625 0.15625 0.90625zm10.976044 -2.140625q0 1.0625 -0.3125 1.953125q-0.296875 0.890625 -0.859375 1.53125q-0.5625 0.625 -1.375 0.96875q-0.8125 0.34375 -1.84375 0.34375q-0.984375 0 -1.765625 -0.296875q-0.765625 -0.3125 -1.3125 -0.890625q-0.546875 -0.59375 -0.828125 -1.46875q-0.28125 -0.875 -0.28125 -2.015625q0 -1.0625 0.296875 -1.9375q0.296875 -0.890625 0.859375 -1.515625q0.5625 -0.640625 1.375 -0.984375q0.8125 -0.359375 1.84375 -0.359375q0.984375 0 1.765625 0.3125q0.78125 0.296875 1.3125 0.890625q0.546875 0.578125 0.828125 1.453125q0.296875 0.875 0.296875 2.015625zm-1.625 0.0625q0 -0.84375 -0.1875 -1.46875q-0.1875 -0.640625 -0.53125 -1.0625q-0.34375 -0.421875 -0.84375 -0.640625q-0.5 -0.21875 -1.109375 -0.21875q-0.703125 0 -1.21875 0.28125q-0.5 0.28125 -0.828125 0.75q-0.3125 0.453125 -0.46875 1.078125q-0.15625 0.609375 -0.15625 1.28125q0 0.859375 0.1875 1.5q0.1875 0.625 0.53125 1.046875q0.359375 0.421875 0.84375 0.640625q0.5 0.21875 1.109375 0.21875q0.71875 0 1.21875 -0.28125q0.5 -0.28125 0.828125 -0.734375q0.328125 -0.46875 0.46875 -1.078125q0.15625 -0.625 0.15625 -1.3125zm4.2885437 -4.5625l1.453125 0l0.046875 1.6875q0.8125 -0.984375 1.59375 -1.421875q0.796875 -0.4375 1.59375 -0.4375q1.421875 0 2.15625 0.921875q0.734375 0.921875 0.671875 2.734375l-1.59375 0q0.015625 -1.203125 -0.359375 -1.734375q-0.375 -0.546875 -1.109375 -0.546875q-0.3125 0 -0.640625 0.109375q-0.328125 0.109375 -0.671875 0.359375q-0.328125 0.25 -0.71875 0.640625q-0.375 0.390625 -0.8125 0.953125l0 5.875l-1.609375 0l0 -9.140625z" fill-rule="nonzero"/><path fill="#616161" d="m208.01872 74.32059q0.296875 0 0.578125 0.125q0.28125 0.125 0.484375 0.328125q0.203125 0.203125 0.3125 0.484375q0.125 0.28125 0.125 0.59375q0 0.3125 -0.125 0.59375q-0.109375 0.265625 -0.3125 0.46875q-0.203125 0.203125 -0.484375 0.3125q-0.28125 0.125 -0.578125 0.125q-0.328125 0 -0.609375 -0.125q-0.265625 -0.109375 -0.46875 -0.3125q-0.203125 -0.203125 -0.328125 -0.46875q-0.109375 -0.28125 -0.109375 -0.59375q0 -0.3125 0.109375 -0.59375q0.125 -0.28125 0.328125 -0.484375q0.203125 -0.203125 0.46875 -0.328125q0.28125 -0.125 0.609375 -0.125z" fill-rule="nonzero"/><path fill="#000000" d="m217.79164 69.36746l-2.703125 0l0 -1.3125l4.296875 0l0 7.8125l2.734375 0l0 1.328125l-7.34375 0l0 -1.328125l3.015625 0l0 -6.5zm0.546875 -5.15625q0.265625 0 0.484375 0.09375q0.234375 0.09375 0.40625 0.28125q0.171875 0.171875 0.265625 0.390625q0.09375 0.21875 0.09375 0.484375q0 0.25 -0.09375 0.484375q-0.09375 0.234375 -0.265625 0.40625q-0.171875 0.171875 -0.40625 0.265625q-0.21875 0.09375 -0.484375 0.09375q-0.265625 0 -0.5 -0.09375q-0.21875 -0.09375 -0.390625 -0.265625q-0.15625 -0.171875 -0.265625 -0.40625q-0.09375 -0.234375 -0.09375 -0.484375q0 -0.265625 0.09375 -0.484375q0.109375 -0.21875 0.265625 -0.390625q0.171875 -0.1875 0.390625 -0.28125q0.234375 -0.09375 0.5 -0.09375zm6.5854187 3.84375l1.421875 0l0.0625 1.46875q0.390625 -0.46875 0.765625 -0.78125q0.375 -0.3125 0.734375 -0.5q0.359375 -0.203125 0.734375 -0.28125q0.375 -0.078125 0.78125 -0.078125q1.40625 0 2.125 0.84375q0.734375 0.828125 0.734375 2.5l0 5.96875l-1.59375 0l0 -5.84375q0 -1.078125 -0.40625 -1.578125q-0.390625 -0.515625 -1.1875 -0.515625q-0.28125 0 -0.5625 0.09375q-0.28125 0.078125 -0.578125 0.296875q-0.296875 0.203125 -0.65625 0.578125q-0.34375 0.359375 -0.78125 0.90625l0 6.0625l-1.59375 0l0 -9.140625zm17.507294 6.640625q0 0.484375 -0.171875 0.875q-0.15625 0.375 -0.4375 0.671875q-0.28125 0.296875 -0.65625 0.515625q-0.375 0.203125 -0.796875 0.34375q-0.421875 0.125 -0.875 0.1875q-0.453125 0.0625 -0.875 0.0625q-0.921875 0 -1.703125 -0.078125q-0.78125 -0.078125 -1.53125 -0.265625l0 -1.453125q0.796875 0.234375 1.59375 0.359375q0.796875 0.109375 1.578125 0.109375q1.140625 0 1.6875 -0.3125q0.546875 -0.3125 0.546875 -0.890625q0 -0.234375 -0.09375 -0.421875q-0.078125 -0.203125 -0.3125 -0.375q-0.21875 -0.1875 -0.703125 -0.375q-0.484375 -0.203125 -1.3125 -0.4375q-0.609375 -0.1875 -1.140625 -0.421875q-0.515625 -0.234375 -0.90625 -0.546875q-0.375 -0.328125 -0.59375 -0.75q-0.21875 -0.421875 -0.21875 -1.015625q0 -0.375 0.171875 -0.828125q0.171875 -0.453125 0.59375 -0.84375q0.4375 -0.40625 1.171875 -0.65625q0.734375 -0.265625 1.8125 -0.265625q0.546875 0 1.203125 0.0625q0.65625 0.0625 1.359375 0.203125l0 1.40625q-0.75 -0.171875 -1.421875 -0.25q-0.65625 -0.09375 -1.15625 -0.09375q-0.59375 0 -1.0 0.09375q-0.390625 0.078125 -0.65625 0.25q-0.25 0.15625 -0.359375 0.375q-0.109375 0.203125 -0.109375 0.453125q0 0.25 0.09375 0.453125q0.109375 0.1875 0.359375 0.375q0.265625 0.1875 0.71875 0.375q0.46875 0.1875 1.21875 0.40625q0.8125 0.234375 1.359375 0.5q0.5625 0.25 0.90625 0.578125q0.359375 0.3125 0.5 0.71875q0.15625 0.40625 0.15625 0.90625zm10.741669 -2.5625q0 0.34375 -0.015625 0.578125q0 0.21875 -0.03125 0.421875l-6.421875 0q0 1.40625 0.78125 2.15625q0.796875 0.75 2.265625 0.75q0.40625 0 0.796875 -0.03125q0.40625 -0.03125 0.78125 -0.078125q0.375 -0.0625 0.71875 -0.125q0.34375 -0.078125 0.625 -0.15625l0 1.296875q-0.640625 0.1875 -1.46875 0.296875q-0.8125 0.109375 -1.6875 0.109375q-1.171875 0 -2.015625 -0.3125q-0.84375 -0.3125 -1.390625 -0.921875q-0.546875 -0.609375 -0.8125 -1.484375q-0.25 -0.875 -0.25 -2.0q0 -0.953125 0.28125 -1.8125q0.28125 -0.859375 0.8125 -1.515625q0.53125 -0.65625 1.296875 -1.03125q0.78125 -0.390625 1.765625 -0.390625q0.953125 0 1.6875 0.3125q0.75 0.296875 1.25 0.84375q0.5 0.546875 0.765625 1.34375q0.265625 0.78125 0.265625 1.75zm-1.65625 -0.21875q0.03125 -0.609375 -0.125 -1.109375q-0.140625 -0.515625 -0.453125 -0.875q-0.296875 -0.375 -0.75 -0.578125q-0.453125 -0.203125 -1.0625 -0.203125q-0.515625 0 -0.953125 0.203125q-0.421875 0.203125 -0.734375 0.5625q-0.296875 0.359375 -0.5 0.875q-0.1875 0.515625 -0.234375 1.125l4.8125 0zm4.5541534 -3.859375l1.453125 0l0.046875 1.6875q0.8125 -0.984375 1.59375 -1.421875q0.796875 -0.4375 1.59375 -0.4375q1.421875 0 2.15625 0.921875q0.734375 0.921875 0.671875 2.734375l-1.59375 0q0.015625 -1.203125 -0.359375 -1.734375q-0.375 -0.546875 -1.109375 -0.546875q-0.3125 0 -0.640625 0.109375q-0.328125 0.109375 -0.671875 0.359375q-0.328125 0.25 -0.71875 0.640625q-0.375 0.390625 -0.8125 0.953125l0 5.875l-1.609375 0l0 -9.140625zm17.257294 9.015625q-0.53125 0.140625 -1.109375 0.1875q-0.578125 0.0625 -1.171875 0.0625q-1.71875 0 -2.5625 -0.78125q-0.84375 -0.78125 -0.84375 -2.390625l0 -4.765625l-2.5625 0l0 -1.328125l2.5625 0l0 -2.515625l1.578125 -0.40625l0 2.921875l4.109375 0l0 1.328125l-4.109375 0l0 4.640625q0 0.984375 0.515625 1.46875q0.53125 0.484375 1.546875 0.484375q0.4375 0 0.953125 -0.0625q0.53125 -0.0625 1.09375 -0.21875l0 1.375z" fill-rule="nonzero"/><path fill="#616161" d="m295.09268 64.32059l-8.40625 12.875l-1.484375 0l8.375 -12.875l1.515625 0zm-4.859375 2.546875q0 0.59375 -0.171875 1.109375q-0.171875 0.5 -0.5 0.875q-0.3125 0.359375 -0.78125 0.578125q-0.453125 0.21875 -1.03125 0.21875q-0.546875 0 -1.0 -0.171875q-0.4375 -0.171875 -0.75 -0.515625q-0.3125 -0.34375 -0.484375 -0.84375q-0.15625 -0.5 -0.15625 -1.15625q0 -0.59375 0.171875 -1.09375q0.171875 -0.5 0.484375 -0.875q0.328125 -0.375 0.78125 -0.578125q0.46875 -0.21875 1.03125 -0.21875q0.5625 0 1.0 0.171875q0.4375 0.171875 0.75 0.515625q0.3125 0.328125 0.484375 0.828125q0.171875 0.5 0.171875 1.15625zm-1.34375 0.046875q0 -0.796875 -0.296875 -1.171875q-0.28125 -0.390625 -0.796875 -0.390625q-0.265625 0 -0.46875 0.125q-0.203125 0.109375 -0.34375 0.328125q-0.140625 0.203125 -0.21875 0.5q-0.078125 0.28125 -0.078125 0.609375q0 0.8125 0.296875 1.203125q0.296875 0.375 0.8125 0.375q0.265625 0 0.46875 -0.125q0.203125 -0.125 0.34375 -0.328125q0.140625 -0.203125 0.203125 -0.484375q0.078125 -0.296875 0.078125 -0.640625zm6.046875 7.65625q0 0.578125 -0.171875 1.09375q-0.171875 0.5 -0.5 0.875q-0.328125 0.359375 -0.796875 0.578125q-0.453125 0.203125 -1.015625 0.203125q-0.546875 0 -1.0 -0.171875q-0.4375 -0.171875 -0.75 -0.5q-0.3125 -0.34375 -0.484375 -0.84375q-0.171875 -0.515625 -0.171875 -1.171875q0 -0.578125 0.171875 -1.078125q0.1875 -0.515625 0.5 -0.890625q0.328125 -0.375 0.78125 -0.578125q0.46875 -0.203125 1.03125 -0.203125q0.5625 0 1.0 0.171875q0.453125 0.171875 0.765625 0.515625q0.3125 0.328125 0.46875 0.84375q0.171875 0.5 0.171875 1.15625zm-1.328125 0.03125q0 -0.796875 -0.296875 -1.171875q-0.296875 -0.390625 -0.8125 -0.390625q-0.265625 0 -0.46875 0.125q-0.203125 0.109375 -0.359375 0.328125q-0.140625 0.203125 -0.21875 0.484375q-0.0625 0.28125 -0.0625 0.625q0 0.796875 0.28125 1.1875q0.296875 0.375 0.828125 0.375q0.265625 0 0.46875 -0.109375q0.203125 -0.125 0.34375 -0.328125q0.140625 -0.21875 0.21875 -0.5q0.078125 -0.296875 0.078125 -0.625z" fill-rule="nonzero"/><path fill="#c53929" d="m304.84998 65.80496q-1.25 -0.265625 -2.15625 -0.265625q-2.140625 0 -2.140625 2.234375l0 1.609375l4.015625 0l0 1.3125l-4.015625 0l0 6.5l-1.609375 0l0 -6.5l-2.9375 0l0 -1.3125l2.9375 0l0 -1.515625q0 -3.65625 3.8125 -3.65625q0.9375 0 2.09375 0.21875l0 1.375zm-9.578125 2.25l0 0z" fill-rule="nonzero"/><path fill="#9c27b0" d="m320.36456 69.36746l-2.703125 0l0 -1.3125l4.296875 0l0 7.8125l2.734375 0l0 1.328125l-7.34375 0l0 -1.328125l3.015625 0l0 -6.5zm0.546875 -5.15625q0.265625 0 0.484375 0.09375q0.234375 0.09375 0.40625 0.28125q0.171875 0.171875 0.265625 0.390625q0.09375 0.21875 0.09375 0.484375q0 0.25 -0.09375 0.484375q-0.09375 0.234375 -0.265625 0.40625q-0.171875 0.171875 -0.40625 0.265625q-0.21875 0.09375 -0.484375 0.09375q-0.265625 0 -0.5 -0.09375q-0.21875 -0.09375 -0.390625 -0.265625q-0.15625 -0.171875 -0.265625 -0.40625q-0.09375 -0.234375 -0.09375 -0.484375q0 -0.265625 0.09375 -0.484375q0.109375 -0.21875 0.265625 -0.390625q0.171875 -0.1875 0.390625 -0.28125q0.234375 -0.09375 0.5 -0.09375zm6.5854187 3.84375l1.421875 0l0.0625 1.46875q0.390625 -0.46875 0.765625 -0.78125q0.375 -0.3125 0.734375 -0.5q0.359375 -0.203125 0.734375 -0.28125q0.375 -0.078125 0.78125 -0.078125q1.40625 0 2.125 0.84375q0.734375 0.828125 0.734375 2.5l0 5.96875l-1.59375 0l0 -5.84375q0 -1.078125 -0.40625 -1.578125q-0.390625 -0.515625 -1.1875 -0.515625q-0.28125 0 -0.5625 0.09375q-0.28125 0.078125 -0.578125 0.296875q-0.296875 0.203125 -0.65625 0.578125q-0.34375 0.359375 -0.78125 0.90625l0 6.0625l-1.59375 0l0 -9.140625zm17.632294 9.015625q-0.53125 0.140625 -1.109375 0.1875q-0.578125 0.0625 -1.171875 0.0625q-1.71875 0 -2.5625 -0.78125q-0.84375 -0.78125 -0.84375 -2.390625l0 -4.765625l-2.5625 0l0 -1.328125l2.5625 0l0 -2.515625l1.578125 -0.40625l0 2.921875l4.109375 0l0 1.328125l-4.109375 0l0 4.640625q0 0.984375 0.515625 1.46875q0.53125 0.484375 1.546875 0.484375q0.4375 0 0.953125 -0.0625q0.53125 -0.0625 1.09375 -0.21875l0 1.375zm10.851044 -4.515625q0 1.0625 -0.3125 1.953125q-0.296875 0.890625 -0.859375 1.53125q-0.5625 0.625 -1.375 0.96875q-0.8125 0.34375 -1.84375 0.34375q-0.984375 0 -1.765625 -0.296875q-0.765625 -0.3125 -1.3125 -0.890625q-0.546875 -0.59375 -0.828125 -1.46875q-0.28125 -0.875 -0.28125 -2.015625q0 -1.0625 0.296875 -1.9375q0.296875 -0.890625 0.859375 -1.515625q0.5625 -0.640625 1.375 -0.984375q0.8125 -0.359375 1.84375 -0.359375q0.984375 0 1.765625 0.3125q0.78125 0.296875 1.3125 0.890625q0.546875 0.578125 0.828125 1.453125q0.296875 0.875 0.296875 2.015625zm-1.625 0.0625q0 -0.84375 -0.1875 -1.46875q-0.1875 -0.640625 -0.53125 -1.0625q-0.34375 -0.421875 -0.84375 -0.640625q-0.5 -0.21875 -1.109375 -0.21875q-0.703125 0 -1.21875 0.28125q-0.5 0.28125 -0.828125 0.75q-0.3125 0.453125 -0.46875 1.078125q-0.15625 0.609375 -0.15625 1.28125q0 0.859375 0.1875 1.5q0.1875 0.625 0.53125 1.046875q0.359375 0.421875 0.84375 0.640625q0.5 0.21875 1.109375 0.21875q0.71875 0 1.21875 -0.28125q0.5 -0.28125 0.828125 -0.734375q0.328125 -0.46875 0.46875 -1.078125q0.15625 -0.625 0.15625 -1.3125z" fill-rule="nonzero"/><path fill="#616161" d="m377.15103 64.32059l-8.40625 12.875l-1.484375 0l8.375 -12.875l1.515625 0zm-4.859375 2.546875q0 0.59375 -0.171875 1.109375q-0.171875 0.5 -0.5 0.875q-0.3125 0.359375 -0.78125 0.578125q-0.453125 0.21875 -1.03125 0.21875q-0.546875 0 -1.0 -0.171875q-0.4375 -0.171875 -0.75 -0.515625q-0.3125 -0.34375 -0.484375 -0.84375q-0.15625 -0.5 -0.15625 -1.15625q0 -0.59375 0.171875 -1.09375q0.171875 -0.5 0.484375 -0.875q0.328125 -0.375 0.78125 -0.578125q0.46875 -0.21875 1.03125 -0.21875q0.5625 0 1.0 0.171875q0.4375 0.171875 0.75 0.515625q0.3125 0.328125 0.484375 0.828125q0.171875 0.5 0.171875 1.15625zm-1.34375 0.046875q0 -0.796875 -0.296875 -1.171875q-0.28125 -0.390625 -0.796875 -0.390625q-0.265625 0 -0.46875 0.125q-0.203125 0.109375 -0.34375 0.328125q-0.140625 0.203125 -0.21875 0.5q-0.078125 0.28125 -0.078125 0.609375q0 0.8125 0.296875 1.203125q0.296875 0.375 0.8125 0.375q0.265625 0 0.46875 -0.125q0.203125 -0.125 0.34375 -0.328125q0.140625 -0.203125 0.203125 -0.484375q0.078125 -0.296875 0.078125 -0.640625zm6.046875 7.65625q0 0.578125 -0.171875 1.09375q-0.171875 0.5 -0.5 0.875q-0.328125 0.359375 -0.796875 0.578125q-0.453125 0.203125 -1.015625 0.203125q-0.546875 0 -1.0 -0.171875q-0.4375 -0.171875 -0.75 -0.5q-0.3125 -0.34375 -0.484375 -0.84375q-0.171875 -0.515625 -0.171875 -1.171875q0 -0.578125 0.171875 -1.078125q0.1875 -0.515625 0.5 -0.890625q0.328125 -0.375 0.78125 -0.578125q0.46875 -0.203125 1.03125 -0.203125q0.5625 0 1.0 0.171875q0.453125 0.171875 0.765625 0.515625q0.3125 0.328125 0.46875 0.84375q0.171875 0.5 0.171875 1.15625zm-1.328125 0.03125q0 -0.796875 -0.296875 -1.171875q-0.296875 -0.390625 -0.8125 -0.390625q-0.265625 0 -0.46875 0.125q-0.203125 0.109375 -0.359375 0.328125q-0.140625 0.203125 -0.21875 0.484375q-0.0625 0.28125 -0.0625 0.625q0 0.796875 0.28125 1.1875q0.296875 0.375 0.828125 0.375q0.265625 0 0.46875 -0.109375q0.203125 -0.125 0.34375 -0.328125q0.140625 -0.21875 0.21875 -0.5q0.078125 -0.296875 0.078125 -0.625z" fill-rule="nonzero"/><path fill="#c53929" d="m386.15833 77.07059q-0.53125 0.140625 -1.109375 0.1875q-0.578125 0.0625 -1.171875 0.0625q-1.71875 0 -2.5625 -0.78125q-0.84375 -0.78125 -0.84375 -2.390625l0 -4.765625l-2.5625 0l0 -1.328125l2.5625 0l0 -2.515625l1.578125 -0.40625l0 2.921875l4.109375 0l0 1.328125l-4.109375 0l0 4.640625q0 0.984375 0.515625 1.46875q0.53125 0.484375 1.546875 0.484375q0.4375 0 0.953125 -0.0625q0.53125 -0.0625 1.09375 -0.21875l0 1.375z" fill-rule="nonzero"/><path fill="#616161" d="m395.22812 80.92996l-4.5 0l0 -16.921875l4.5 0l0 1.265625l-2.984375 0l0 14.375l2.984375 0l0 1.28125zm12.694794 -16.609375l-8.40625 12.875l-1.484375 0l8.375 -12.875l1.515625 0zm-4.859375 2.546875q0 0.59375 -0.171875 1.109375q-0.171875 0.5 -0.5 0.875q-0.3125 0.359375 -0.78125 0.578125q-0.453125 0.21875 -1.03125 0.21875q-0.546875 0 -1.0 -0.171875q-0.4375 -0.171875 -0.75 -0.515625q-0.3125 -0.34375 -0.484375 -0.84375q-0.15625 -0.5 -0.15625 -1.15625q0 -0.59375 0.171875 -1.09375q0.171875 -0.5 0.484375 -0.875q0.328125 -0.375 0.78125 -0.578125q0.46875 -0.21875 1.03125 -0.21875q0.5625 0 1.0 0.171875q0.4375 0.171875 0.75 0.515625q0.3125 0.328125 0.484375 0.828125q0.171875 0.5 0.171875 1.15625zm-1.34375 0.046875q0 -0.796875 -0.296875 -1.171875q-0.28125 -0.390625 -0.796875 -0.390625q-0.265625 0 -0.46875 0.125q-0.203125 0.109375 -0.34375 0.328125q-0.140625 0.203125 -0.21875 0.5q-0.078125 0.28125 -0.078125 0.609375q0 0.8125 0.296875 1.203125q0.296875 0.375 0.8125 0.375q0.265625 0 0.46875 -0.125q0.203125 -0.125 0.34375 -0.328125q0.140625 -0.203125 0.203125 -0.484375q0.078125 -0.296875 0.078125 -0.640625zm6.046875 7.65625q0 0.578125 -0.171875 1.09375q-0.171875 0.5 -0.5 0.875q-0.328125 0.359375 -0.796875 0.578125q-0.453125 0.203125 -1.015625 0.203125q-0.546875 0 -1.0 -0.171875q-0.4375 -0.171875 -0.75 -0.5q-0.3125 -0.34375 -0.484375 -0.84375q-0.171875 -0.515625 -0.171875 -1.171875q0 -0.578125 0.171875 -1.078125q0.1875 -0.515625 0.5 -0.890625q0.328125 -0.375 0.78125 -0.578125q0.46875 -0.203125 1.03125 -0.203125q0.5625 0 1.0 0.171875q0.453125 0.171875 0.765625 0.515625q0.3125 0.328125 0.46875 0.84375q0.171875 0.5 0.171875 1.15625zm-1.328125 0.03125q0 -0.796875 -0.296875 -1.171875q-0.296875 -0.390625 -0.8125 -0.390625q-0.265625 0 -0.46875 0.125q-0.203125 0.109375 -0.359375 0.328125q-0.140625 0.203125 -0.21875 0.484375q-0.0625 0.28125 -0.0625 0.625q0 0.796875 0.28125 1.1875q0.296875 0.375 0.828125 0.375q0.265625 0 0.46875 -0.109375q0.203125 -0.125 0.34375 -0.328125q0.140625 -0.21875 0.21875 -0.5q0.078125 -0.296875 0.078125 -0.625z" fill-rule="nonzero"/><path fill="#c53929" d="m412.6802 69.36746l-2.703125 0l0 -1.3125l4.296875 0l0 7.8125l2.734375 0l0 1.328125l-7.34375 0l0 -1.328125l3.015625 0l0 -6.5zm0.546875 -5.15625q0.265625 0 0.484375 0.09375q0.234375 0.09375 0.40625 0.28125q0.171875 0.171875 0.265625 0.390625q0.09375 0.21875 0.09375 0.484375q0 0.25 -0.09375 0.484375q-0.09375 0.234375 -0.265625 0.40625q-0.171875 0.171875 -0.40625 0.265625q-0.21875 0.09375 -0.484375 0.09375q-0.265625 0 -0.5 -0.09375q-0.21875 -0.09375 -0.390625 -0.265625q-0.15625 -0.171875 -0.265625 -0.40625q-0.09375 -0.234375 -0.09375 -0.484375q0 -0.265625 0.09375 -0.484375q0.109375 -0.21875 0.265625 -0.390625q0.171875 -0.1875 0.390625 -0.28125q0.234375 -0.09375 0.5 -0.09375zm6.1322937 8.609375q0 -1.171875 0.3125 -2.078125q0.3125 -0.90625 0.90625 -1.53125q0.59375 -0.625 1.40625 -0.9375q0.828125 -0.328125 1.84375 -0.328125q0.4375 0 0.859375 0.0625q0.421875 0.046875 0.828125 0.15625l0 -3.84375l1.59375 0l0 12.875l-1.421875 0l-0.046875 -1.734375q-0.671875 0.96875 -1.453125 1.4375q-0.765625 0.453125 -1.671875 0.453125q-0.78125 0 -1.375 -0.3125q-0.59375 -0.328125 -1.0 -0.921875q-0.390625 -0.609375 -0.59375 -1.4375q-0.1875 -0.84375 -0.1875 -1.859375zm1.609375 -0.109375q0 1.65625 0.484375 2.484375q0.5 0.8125 1.390625 0.8125q0.59375 0 1.265625 -0.53125q0.671875 -0.546875 1.40625 -1.609375l0 -4.234375q-0.390625 -0.1875 -0.859375 -0.28125q-0.46875 -0.09375 -0.9375 -0.09375q-1.296875 0 -2.03125 0.84375q-0.71875 0.828125 -0.71875 2.609375zm17.351044 4.484375l-2.109375 0l-2.46875 -3.484375l-2.4375 3.484375l-2.046875 0l3.546875 -4.59375l-3.390625 -4.546875l2.03125 0l2.40625 3.515625l2.359375 -3.515625l1.96875 0l-3.4375 4.578125l3.578125 4.5625z" fill-rule="nonzero"/><path fill="#616161" d="m445.98334 80.92996l-4.484375 0l0 -1.28125l2.953125 0l0 -14.375l-2.953125 0l0 -1.265625l4.484375 0l0 16.921875z" fill-rule="nonzero"/><path fill="#616161" d="m464.51355 67.88309q0.28125 0 0.53125 0.125q0.265625 0.109375 0.453125 0.296875q0.1875 0.1875 0.296875 0.453125q0.125 0.25 0.125 0.53125q0 0.296875 -0.125 0.546875q-0.109375 0.25 -0.296875 0.4375q-0.1875 0.1875 -0.453125 0.296875q-0.25 0.109375 -0.53125 0.109375q-0.28125 0 -0.53125 -0.109375q-0.25 -0.109375 -0.453125 -0.296875q-0.1875 -0.1875 -0.296875 -0.4375q-0.109375 -0.25 -0.109375 -0.546875q0 -0.28125 0.109375 -0.53125q0.109375 -0.265625 0.296875 -0.453125q0.203125 -0.1875 0.453125 -0.296875q0.25 -0.125 0.53125 -0.125zm0 6.6875q0.28125 0 0.53125 0.109375q0.265625 0.109375 0.453125 0.3125q0.1875 0.1875 0.296875 0.4375q0.125 0.25 0.125 0.53125q0 0.296875 -0.125 0.546875q-0.109375 0.25 -0.296875 0.453125q-0.1875 0.1875 -0.453125 0.28125q-0.25 0.109375 -0.53125 0.109375q-0.28125 0 -0.53125 -0.109375q-0.25 -0.09375 -0.453125 -0.28125q-0.1875 -0.203125 -0.296875 -0.453125q-0.109375 -0.25 -0.109375 -0.546875q0 -0.28125 0.109375 -0.53125q0.109375 -0.25 0.296875 -0.4375q0.203125 -0.203125 0.453125 -0.3125q0.25 -0.109375 0.53125 -0.109375z" fill-rule="nonzero"/><path fill="#000000" d="m488.73126 77.07059q-0.53125 0.140625 -1.109375 0.1875q-0.578125 0.0625 -1.171875 0.0625q-1.71875 0 -2.5625 -0.78125q-0.84375 -0.78125 -0.84375 -2.390625l0 -4.765625l-2.5625 0l0 -1.328125l2.5625 0l0 -2.515625l1.578125 -0.40625l0 2.921875l4.109375 0l0 1.328125l-4.109375 0l0 4.640625q0 0.984375 0.515625 1.46875q0.53125 0.484375 1.546875 0.484375q0.4375 0 0.953125 -0.0625q0.53125 -0.0625 1.09375 -0.21875l0 1.375zm10.616669 -4.9375q0 0.34375 -0.015625 0.578125q0 0.21875 -0.03125 0.421875l-6.421875 0q0 1.40625 0.78125 2.15625q0.796875 0.75 2.265625 0.75q0.40625 0 0.796875 -0.03125q0.40625 -0.03125 0.78125 -0.078125q0.375 -0.0625 0.71875 -0.125q0.34375 -0.078125 0.625 -0.15625l0 1.296875q-0.640625 0.1875 -1.46875 0.296875q-0.8125 0.109375 -1.6875 0.109375q-1.171875 0 -2.015625 -0.3125q-0.84375 -0.3125 -1.390625 -0.921875q-0.546875 -0.609375 -0.8125 -1.484375q-0.25 -0.875 -0.25 -2.0q0 -0.953125 0.28125 -1.8125q0.28125 -0.859375 0.8125 -1.515625q0.53125 -0.65625 1.296875 -1.03125q0.78125 -0.390625 1.765625 -0.390625q0.953125 0 1.6875 0.3125q0.75 0.296875 1.25 0.84375q0.5 0.546875 0.765625 1.34375q0.265625 0.78125 0.265625 1.75zm-1.65625 -0.21875q0.03125 -0.609375 -0.125 -1.109375q-0.140625 -0.515625 -0.453125 -0.875q-0.296875 -0.375 -0.75 -0.578125q-0.453125 -0.203125 -1.0625 -0.203125q-0.515625 0 -0.953125 0.203125q-0.421875 0.203125 -0.734375 0.5625q-0.296875 0.359375 -0.5 0.875q-0.1875 0.515625 -0.234375 1.125l4.8125 0zm4.1791687 -3.859375l1.421875 0l0.0625 1.46875q0.390625 -0.46875 0.765625 -0.78125q0.375 -0.3125 0.734375 -0.5q0.359375 -0.203125 0.734375 -0.28125q0.375 -0.078125 0.78125 -0.078125q1.40625 0 2.125 0.84375q0.734375 0.828125 0.734375 2.5l0 5.96875l-1.59375 0l0 -5.84375q0 -1.078125 -0.40625 -1.578125q-0.390625 -0.515625 -1.1875 -0.515625q-0.28125 0 -0.5625 0.09375q-0.28125 0.078125 -0.578125 0.296875q-0.296875 0.203125 -0.65625 0.578125q-0.34375 0.359375 -0.78125 0.90625l0 6.0625l-1.59375 0l0 -9.140625zm17.507324 6.640625q0 0.484375 -0.171875 0.875q-0.15625 0.375 -0.4375 0.671875q-0.28125 0.296875 -0.65625 0.515625q-0.375 0.203125 -0.796875 0.34375q-0.421875 0.125 -0.875 0.1875q-0.453125 0.0625 -0.875 0.0625q-0.921875 0 -1.703125 -0.078125q-0.78125 -0.078125 -1.53125 -0.265625l0 -1.453125q0.796875 0.234375 1.59375 0.359375q0.796875 0.109375 1.578125 0.109375q1.140625 0 1.6875 -0.3125q0.546875 -0.3125 0.546875 -0.890625q0 -0.234375 -0.09375 -0.421875q-0.078125 -0.203125 -0.3125 -0.375q-0.21875 -0.1875 -0.703125 -0.375q-0.484375 -0.203125 -1.3125 -0.4375q-0.609375 -0.1875 -1.140625 -0.421875q-0.515625 -0.234375 -0.90625 -0.546875q-0.375 -0.328125 -0.59375 -0.75q-0.21875 -0.421875 -0.21875 -1.015625q0 -0.375 0.171875 -0.828125q0.171875 -0.453125 0.59375 -0.84375q0.4375 -0.40625 1.171875 -0.65625q0.734375 -0.265625 1.8125 -0.265625q0.546875 0 1.203125 0.0625q0.65625 0.0625 1.359375 0.203125l0 1.40625q-0.75 -0.171875 -1.421875 -0.25q-0.65625 -0.09375 -1.15625 -0.09375q-0.59375 0 -1.0 0.09375q-0.390625 0.078125 -0.65625 0.25q-0.25 0.15625 -0.359375 0.375q-0.109375 0.203125 -0.109375 0.453125q0 0.25 0.09375 0.453125q0.109375 0.1875 0.359375 0.375q0.265625 0.1875 0.71875 0.375q0.46875 0.1875 1.21875 0.40625q0.8125 0.234375 1.359375 0.5q0.5625 0.25 0.90625 0.578125q0.359375 0.3125 0.5 0.71875q0.15625 0.40625 0.15625 0.90625zm10.976013 -2.140625q0 1.0625 -0.3125 1.953125q-0.296875 0.890625 -0.859375 1.53125q-0.5625 0.625 -1.375 0.96875q-0.8125 0.34375 -1.84375 0.34375q-0.984375 0 -1.765625 -0.296875q-0.765625 -0.3125 -1.3125 -0.890625q-0.546875 -0.59375 -0.828125 -1.46875q-0.28125 -0.875 -0.28125 -2.015625q0 -1.0625 0.296875 -1.9375q0.296875 -0.890625 0.859375 -1.515625q0.5625 -0.640625 1.375 -0.984375q0.8125 -0.359375 1.84375 -0.359375q0.984375 0 1.765625 0.3125q0.78125 0.296875 1.3125 0.890625q0.546875 0.578125 0.828125 1.453125q0.296875 0.875 0.296875 2.015625zm-1.625 0.0625q0 -0.84375 -0.1875 -1.46875q-0.1875 -0.640625 -0.53125 -1.0625q-0.34375 -0.421875 -0.84375 -0.640625q-0.5 -0.21875 -1.109375 -0.21875q-0.703125 0 -1.21875 0.28125q-0.5 0.28125 -0.828125 0.75q-0.3125 0.453125 -0.46875 1.078125q-0.15625 0.609375 -0.15625 1.28125q0 0.859375 0.1875 1.5q0.1875 0.625 0.53125 1.046875q0.359375 0.421875 0.84375 0.640625q0.5 0.21875 1.109375 0.21875q0.71875 0 1.21875 -0.28125q0.5 -0.28125 0.828125 -0.734375q0.328125 -0.46875 0.46875 -1.078125q0.15625 -0.625 0.15625 -1.3125zm4.288574 -4.5625l1.453125 0l0.046875 1.6875q0.8125 -0.984375 1.59375 -1.421875q0.796875 -0.4375 1.59375 -0.4375q1.421875 0 2.15625 0.921875q0.734375 0.921875 0.671875 2.734375l-1.59375 0q0.015625 -1.203125 -0.359375 -1.734375q-0.375 -0.546875 -1.109375 -0.546875q-0.3125 0 -0.640625 0.109375q-0.328125 0.109375 -0.671875 0.359375q-0.328125 0.25 -0.71875 0.640625q-0.375 0.390625 -0.8125 0.953125l0 5.875l-1.609375 0l0 -9.140625z" fill-rule="nonzero"/><path fill="#616161" d="m549.8063 76.33621l-0.984375 1.0l-6.125 -5.109375l6.125 -5.109375l0.984375 1.015625l-4.96875 4.078125l4.96875 4.125z" fill-rule="nonzero"/><path fill="#c53929" d="m560.5792 73.38309q0 0.890625 -0.390625 1.625q-0.375 0.71875 -1.046875 1.25q-0.65625 0.515625 -1.578125 0.8125q-0.90625 0.28125 -1.953125 0.28125q-0.265625 0 -0.578125 -0.015625q-0.296875 0 -0.609375 -0.03125q-0.296875 -0.015625 -0.59375 -0.046875q-0.28125 -0.03125 -0.5 -0.0625l0 -1.4375q0.484375 0.109375 1.109375 0.171875q0.640625 0.046875 1.296875 0.046875q0.71875 0 1.296875 -0.171875q0.59375 -0.171875 1.0 -0.484375q0.40625 -0.328125 0.625 -0.78125q0.21875 -0.46875 0.21875 -1.046875q0 -1.109375 -0.796875 -1.609375q-0.796875 -0.515625 -2.28125 -0.515625l-2.265625 0l0 -6.078125l6.421875 0l0 1.390625l-4.921875 0l0 3.34375l1.03125 0q0.859375 0 1.671875 0.15625q0.8125 0.140625 1.4375 0.53125q0.640625 0.390625 1.015625 1.046875q0.390625 0.640625 0.390625 1.625zm11.085388 3.8125l-2.109375 0l-2.46875 -3.484375l-2.4375 3.484375l-2.046875 0l3.546875 -4.59375l-3.390625 -4.546875l2.03125 0l2.40625 3.515625l2.359375 -3.515625l1.96875 0l-3.4375 4.578125l3.578125 4.5625zm10.132324 -11.390625q-1.25 -0.265625 -2.15625 -0.265625q-2.140625 0 -2.140625 2.234375l0 1.609375l4.015625 0l0 1.3125l-4.015625 0l0 6.5l-1.609375 0l0 -6.5l-2.9375 0l0 -1.3125l2.9375 0l0 -1.515625q0 -3.65625 3.8125 -3.65625q0.9375 0 2.09375 0.21875l0 1.375zm-9.578125 2.25l0 0zm19.210388 5.53125q0 0.78125 -0.328125 1.46875q-0.3125 0.6875 -0.953125 1.203125q-0.625 0.515625 -1.59375 0.8125q-0.96875 0.28125 -2.25 0.28125q-0.71875 0 -1.28125 -0.046875q-0.5625 -0.03125 -1.046875 -0.109375l0 -1.421875q0.5625 0.09375 1.1875 0.15625q0.625 0.046875 1.28125 0.046875q0.890625 0 1.515625 -0.15625q0.640625 -0.15625 1.046875 -0.453125q0.40625 -0.296875 0.578125 -0.71875q0.1875 -0.4375 0.1875 -0.984375q0 -0.484375 -0.21875 -0.84375q-0.21875 -0.375 -0.625 -0.609375q-0.390625 -0.25 -0.9375 -0.375q-0.546875 -0.125 -1.203125 -0.125l-1.359375 0l0 -1.296875l1.375 0q0.53125 0 0.96875 -0.140625q0.453125 -0.140625 0.765625 -0.40625q0.3125 -0.265625 0.484375 -0.65625q0.171875 -0.390625 0.171875 -0.875q0 -0.953125 -0.59375 -1.390625q-0.578125 -0.4375 -1.703125 -0.4375q-0.609375 0 -1.25 0.125q-0.625 0.109375 -1.359375 0.34375l0 -1.390625q0.3125 -0.109375 0.65625 -0.1875q0.359375 -0.09375 0.703125 -0.140625q0.359375 -0.0625 0.703125 -0.09375q0.34375 -0.03125 0.65625 -0.03125q0.953125 0 1.671875 0.203125q0.71875 0.203125 1.203125 0.59375q0.484375 0.375 0.71875 0.921875q0.25 0.546875 0.25 1.21875q0 1.03125 -0.53125 1.71875q-0.515625 0.6875 -1.421875 1.109375q0.46875 0.0625 0.90625 0.28125q0.453125 0.21875 0.8125 0.5625q0.375 0.34375 0.59375 0.8125q0.21875 0.453125 0.21875 1.03125zm10.491699 3.609375l-7.84375 0l0 -1.421875l3.078125 -3.0625q0.75 -0.75 1.21875 -1.296875q0.484375 -0.546875 0.75 -0.984375q0.265625 -0.453125 0.34375 -0.859375q0.09375 -0.40625 0.09375 -0.859375q0 -0.4375 -0.125 -0.828125q-0.109375 -0.40625 -0.359375 -0.703125q-0.234375 -0.3125 -0.640625 -0.484375q-0.390625 -0.171875 -0.9375 -0.171875q-0.75 0 -1.375 0.34375q-0.609375 0.328125 -1.125 0.875l-0.875 -1.046875q0.671875 -0.71875 1.546875 -1.140625q0.875 -0.421875 2.046875 -0.421875q0.796875 0 1.4375 0.234375q0.65625 0.234375 1.125 0.6875q0.484375 0.4375 0.734375 1.09375q0.265625 0.640625 0.265625 1.453125q0 0.671875 -0.1875 1.265625q-0.171875 0.578125 -0.546875 1.15625q-0.375 0.578125 -0.9375 1.203125q-0.546875 0.609375 -1.3125 1.359375l-2.171875 2.09375l5.796875 0l0 1.515625z" fill-rule="nonzero"/><path fill="#616161" d="m604.8969 68.13309l0.984375 -1.015625l6.125 5.109375l-6.125 5.109375l-0.984375 -1.0l4.96875 -4.09375l-4.96875 -4.109375z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m376.13647 58.68504l-12.75589 -23.464565" fill-rule="evenodd"/><path stroke="#ff0000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m376.13647 58.68504l-12.75589 -23.464565" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m297.02362 4.5826774l154.14172 0l0 23.464567l-154.14172 0z" fill-rule="evenodd"/><path fill="#000000" d="m308.58612 17.846426l0.734375 0.484375l-0.90625 1.40625q0.3125 0.046875 0.578125 0.328125q0.28125 0.265625 0.28125 0.734375q0 0.5 -0.34375 0.84375q-0.34375 0.34375 -0.828125 0.34375q-0.25 0 -0.484375 -0.09375q-0.21875 -0.09375 -0.375 -0.25q-0.15625 -0.15625 -0.25 -0.375q-0.09375 -0.234375 -0.09375 -0.46875q0 -0.234375 0.0625 -0.4375q0.0625 -0.21875 0.1875 -0.421875q0.125 -0.203125 0.25 -0.390625l1.1875 -1.703125zm3.15625 0l0.734375 0.484375l-0.90625 1.40625q0.3125 0.046875 0.578125 0.328125q0.28125 0.265625 0.28125 0.734375q0 0.5 -0.34375 0.84375q-0.34375 0.34375 -0.828125 0.34375q-0.25 0 -0.484375 -0.09375q-0.21875 -0.09375 -0.375 -0.25q-0.15625 -0.15625 -0.25 -0.375q-0.09375 -0.234375 -0.09375 -0.46875q0 -0.234375 0.0625 -0.4375q0.0625 -0.21875 0.1875 -0.421875q0.125 -0.203125 0.25 -0.390625l1.1875 -1.703125zm5.711914 13.953125q-1.28125 0 -2.328125 -0.640625q-1.03125 -0.65625 -1.625 -1.796875q-0.59375 -1.140625 -0.59375 -2.609375q0 -1.484375 0.59375 -2.609375q0.59375 -1.140625 1.625 -1.796875q1.046875 -0.65625 2.328125 -0.65625q0.765625 0 1.390625 0.25q0.640625 0.234375 1.109375 0.625q0.484375 0.390625 0.75 0.84375l0.078125 0l-0.078125 -1.328125l0 -3.9375l1.59375 0l0 13.359375l-1.515625 0l0 -1.40625l-0.078125 0q-0.265625 0.4375 -0.75 0.828125q-0.46875 0.390625 -1.109375 0.625q-0.625 0.25 -1.390625 0.25zm0.171875 -1.4375q0.828125 0 1.546875 -0.4375q0.71875 -0.4375 1.15625 -1.25q0.453125 -0.8125 0.453125 -1.921875q0 -1.125 -0.453125 -1.9375q-0.4375 -0.8125 -1.15625 -1.25q-0.71875 -0.4375 -1.546875 -0.4375q-0.8125 0 -1.53125 0.4375q-0.71875 0.4375 -1.171875 1.25q-0.453125 0.8125 -0.453125 1.9375q0 1.09375 0.453125 1.921875q0.453125 0.8125 1.171875 1.25q0.71875 0.4375 1.53125 0.4375zm11.153351 1.4375q-1.375 0 -2.453125 -0.640625q-1.0625 -0.65625 -1.671875 -1.796875q-0.609375 -1.140625 -0.609375 -2.59375q0 -1.359375 0.5625 -2.515625q0.578125 -1.15625 1.609375 -1.859375q1.03125 -0.703125 2.4375 -0.703125q1.421875 0 2.4375 0.625q1.015625 0.625 1.5625 1.734375q0.546875 1.09375 0.546875 2.515625q0 0.125 -0.015625 0.265625q0 0.125 -0.015625 0.21875l-8.1875 0l0 -1.3125l6.546875 0q-0.015625 -0.390625 -0.1875 -0.84375q-0.15625 -0.46875 -0.5 -0.859375q-0.34375 -0.40625 -0.875 -0.65625q-0.53125 -0.25 -1.3125 -0.25q-0.9375 0 -1.625 0.484375q-0.671875 0.46875 -1.046875 1.296875q-0.359375 0.8125 -0.359375 1.859375q0 1.203125 0.46875 2.015625q0.46875 0.796875 1.203125 1.1875q0.75 0.390625 1.546875 0.390625q1.046875 0 1.71875 -0.484375q0.6875 -0.5 1.09375 -1.234375l1.34375 0.65625q-0.5625 1.078125 -1.609375 1.796875q-1.03125 0.703125 -2.609375 0.703125zm9.690552 0q-1.0625 0 -1.875 -0.34375q-0.8125 -0.34375 -1.34375 -0.921875q-0.53125 -0.59375 -0.796875 -1.28125l1.421875 -0.640625q0.375 0.859375 1.078125 1.328125q0.703125 0.46875 1.625 0.46875q0.875 0 1.453125 -0.359375q0.59375 -0.359375 0.59375 -1.046875q0 -0.421875 -0.25 -0.703125q-0.234375 -0.296875 -0.703125 -0.5q-0.453125 -0.21875 -1.125 -0.375l-1.15625 -0.3125q-0.671875 -0.1875 -1.28125 -0.515625q-0.59375 -0.34375 -0.953125 -0.875q-0.359375 -0.53125 -0.359375 -1.296875q0 -0.84375 0.5 -1.453125q0.5 -0.625 1.3125 -0.953125q0.828125 -0.328125 1.765625 -0.328125q0.8125 0 1.53125 0.234375q0.71875 0.234375 1.265625 0.6875q0.546875 0.453125 0.8125 1.125l-1.375 0.640625q-0.359375 -0.703125 -0.953125 -0.984375q-0.59375 -0.28125 -1.328125 -0.28125q-0.78125 0 -1.359375 0.34375q-0.578125 0.34375 -0.578125 0.9375q0 0.59375 0.46875 0.90625q0.484375 0.296875 1.171875 0.46875l1.390625 0.359375q1.390625 0.359375 2.09375 1.0625q0.71875 0.6875 0.71875 1.703125q0 0.890625 -0.515625 1.546875q-0.5 0.65625 -1.359375 1.015625q-0.84375 0.34375 -1.890625 0.34375zm4.746704 -9.8125l5.578125 0l0 1.4375l-5.578125 0l0 -1.4375zm1.671875 7.015625l0 -9.703125l1.578125 0l0 9.3125q0 0.75 0.3125 1.15625q0.3125 0.40625 1.015625 0.40625q0.3125 0 0.578125 -0.09375q0.265625 -0.09375 0.46875 -0.21875l0 1.546875q-0.25 0.109375 -0.546875 0.171875q-0.28125 0.078125 -0.765625 0.078125q-1.1875 0 -1.921875 -0.703125q-0.71875 -0.703125 -0.71875 -1.953125zm6.0434875 2.5l0 -9.515625l1.578125 0l0 9.515625l-1.578125 0zm0.78125 -11.265625q-0.46875 0 -0.8125 -0.328125q-0.328125 -0.34375 -0.328125 -0.8125q0 -0.484375 0.328125 -0.8125q0.34375 -0.328125 0.8125 -0.328125q0.484375 0 0.8125 0.328125q0.328125 0.328125 0.328125 0.8125q0 0.46875 -0.328125 0.8125q-0.328125 0.328125 -0.8125 0.328125zm3.2788086 11.265625l0 -9.515625l1.515625 0l0 1.40625l0.078125 0q0.375 -0.703125 1.21875 -1.203125q0.84375 -0.5 1.859375 -0.5q1.75 0 2.625 1.015625q0.890625 1.015625 0.890625 2.703125l0 6.09375l-1.578125 0l0 -5.859375q0 -1.375 -0.671875 -1.9375q-0.65625 -0.578125 -1.703125 -0.578125q-0.78125 0 -1.375 0.4375q-0.59375 0.4375 -0.9375 1.125q-0.328125 0.6875 -0.328125 1.453125l0 5.359375l-1.59375 0zm13.419708 0.296875q-1.0625 0 -1.875 -0.40625q-0.796875 -0.40625 -1.25 -1.125q-0.453125 -0.71875 -0.453125 -1.640625q0 -1.046875 0.53125 -1.765625q0.546875 -0.71875 1.453125 -1.078125q0.921875 -0.359375 2.015625 -0.359375q0.640625 0 1.171875 0.109375q0.546875 0.09375 0.9375 0.234375q0.40625 0.140625 0.625 0.265625l0 -0.578125q0 -1.078125 -0.765625 -1.703125q-0.765625 -0.640625 -1.875 -0.640625q-0.78125 0 -1.46875 0.34375q-0.671875 0.34375 -1.0625 0.953125l-1.203125 -0.890625q0.375 -0.5625 0.9375 -0.96875q0.5625 -0.40625 1.28125 -0.625q0.71875 -0.234375 1.515625 -0.234375q1.9375 0 3.03125 1.03125q1.109375 1.015625 1.109375 2.75l0 6.03125l-1.5 0l0 -1.359375l-0.078125 0q-0.25 0.40625 -0.703125 0.796875q-0.4375 0.375 -1.046875 0.609375q-0.609375 0.25 -1.328125 0.25zm0.140625 -1.390625q0.828125 0 1.5 -0.40625q0.6875 -0.421875 1.09375 -1.109375q0.421875 -0.6875 0.421875 -1.515625q-0.4375 -0.296875 -1.078125 -0.484375q-0.640625 -0.1875 -1.40625 -0.1875q-1.359375 0 -2.0 0.5625q-0.640625 0.5625 -0.640625 1.375q0 0.78125 0.59375 1.28125q0.609375 0.484375 1.515625 0.484375zm5.8760376 -8.421875l5.578125 0l0 1.4375l-5.578125 0l0 -1.4375zm1.671875 7.015625l0 -9.703125l1.578125 0l0 9.3125q0 0.75 0.3125 1.15625q0.3125 0.40625 1.015625 0.40625q0.3125 0 0.578125 -0.09375q0.265625 -0.09375 0.46875 -0.21875l0 1.546875q-0.25 0.109375 -0.546875 0.171875q-0.28125 0.078125 -0.765625 0.078125q-1.1875 0 -1.921875 -0.703125q-0.71875 -0.703125 -0.71875 -1.953125zm6.043518 2.5l0 -9.515625l1.578125 0l0 9.515625l-1.578125 0zm0.78125 -11.265625q-0.46875 0 -0.8125 -0.328125q-0.328125 -0.34375 -0.328125 -0.8125q0 -0.484375 0.328125 -0.8125q0.34375 -0.328125 0.8125 -0.328125q0.484375 0 0.8125 0.328125q0.328125 0.328125 0.328125 0.8125q0 0.46875 -0.328125 0.8125q-0.328125 0.328125 -0.8125 0.328125zm7.5760803 11.5625q-1.4375 0 -2.546875 -0.671875q-1.09375 -0.671875 -1.71875 -1.8125q-0.625 -1.15625 -0.625 -2.5625q0 -1.421875 0.625 -2.5625q0.625 -1.15625 1.71875 -1.828125q1.109375 -0.671875 2.546875 -0.671875q1.4375 0 2.53125 0.6875q1.109375 0.671875 1.734375 1.828125q0.625 1.140625 0.625 2.546875q0 1.40625 -0.625 2.5625q-0.625 1.140625 -1.734375 1.8125q-1.09375 0.671875 -2.53125 0.671875zm0 -1.4375q0.859375 0 1.609375 -0.421875q0.75 -0.4375 1.21875 -1.25q0.46875 -0.8125 0.46875 -1.9375q0 -1.140625 -0.46875 -1.953125q-0.46875 -0.8125 -1.21875 -1.234375q-0.75 -0.4375 -1.609375 -0.4375q-0.859375 0 -1.625 0.4375q-0.765625 0.421875 -1.234375 1.234375q-0.46875 0.8125 -0.46875 1.953125q0 1.125 0.46875 1.9375q0.46875 0.8125 1.234375 1.25q0.765625 0.421875 1.625 0.421875zm6.5418396 1.140625l0 -9.515625l1.515625 0l0 1.40625l0.078125 0q0.375 -0.703125 1.21875 -1.203125q0.84375 -0.5 1.859375 -0.5q1.75 0 2.625 1.015625q0.890625 1.015625 0.890625 2.703125l0 6.09375l-1.578125 0l0 -5.859375q0 -1.375 -0.671875 -1.9375q-0.65625 -0.578125 -1.703125 -0.578125q-0.78125 0 -1.375 0.4375q-0.59375 0.4375 -0.9375 1.125q-0.328125 0.6875 -0.328125 1.453125l0 5.359375l-1.59375 0zm9.795959 -10.0l0.984375 -1.40625q-0.34375 -0.0625 -0.640625 -0.328125q-0.28125 -0.265625 -0.28125 -0.734375q0 -0.5 0.34375 -0.84375q0.34375 -0.34375 0.828125 -0.34375q0.265625 0 0.46875 0.09375q0.21875 0.09375 0.375 0.25q0.15625 0.15625 0.25 0.375q0.09375 0.21875 0.09375 0.46875q0 0.234375 -0.078125 0.4375q-0.0625 0.203125 -0.1875 0.40625q-0.109375 0.203125 -0.234375 0.40625l-1.171875 1.703125l-0.75 -0.484375zm3.15625 0l0.984375 -1.40625q-0.34375 -0.0625 -0.640625 -0.328125q-0.28125 -0.265625 -0.28125 -0.734375q0 -0.5 0.34375 -0.84375q0.34375 -0.34375 0.828125 -0.34375q0.265625 0 0.46875 0.09375q0.21875 0.09375 0.375 0.25q0.15625 0.15625 0.25 0.375q0.09375 0.21875 0.09375 0.46875q0 0.234375 -0.078125 0.4375q-0.0625 0.203125 -0.1875 0.40625q-0.109375 0.203125 -0.234375 0.40625l-1.171875 1.703125l-0.75 -0.484375z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m100.52231 63.587925l54.61418 -21.952755" fill-rule="evenodd"/><path stroke="#ff0000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m100.52231 63.587925l54.61418 -21.952755" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m366.43832 63.07874l-146.48819 -20.944881" fill-rule="evenodd"/><path stroke="#ff0000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m366.43832 63.07874l-146.48819 -20.944881" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m-3.0367453 9.677165l388.15747 0l0 23.464565l-388.15747 0z" fill-rule="evenodd"/><path fill="#000000" d="m6.4788795 27.08154l5.5781255 0l0 1.4375l-5.5781255 0l0 -1.4375zm1.6718755 7.015625l0 -9.703125l1.578125 0l0 9.3125q0 0.75 0.3125 1.15625q0.3125 0.40625 1.015625 0.40625q0.3125 0 0.578125 -0.09375q0.265625 -0.09375 0.46875 -0.21875l0 1.546875q-0.25 0.109375 -0.546875 0.171875q-0.28125 0.078125 -0.765625 0.078125q-1.1875 0 -1.921875 -0.703125q-0.71875 -0.703125 -0.71875 -1.953125zm6.043503 2.5l0 -9.515625l1.578125 0l0 9.515625l-1.578125 0zm0.78125 -11.265625q-0.46875 0 -0.8125 -0.328125q-0.328125 -0.34375 -0.328125 -0.8125q0 -0.484375 0.328125 -0.8125q0.34375 -0.328125 0.8125 -0.328125q0.484375 0 0.8125 0.328125q0.328125 0.328125 0.328125 0.8125q0 0.46875 -0.328125 0.8125q-0.328125 0.328125 -0.8125 0.328125zm7.4198456 11.5625q-1.375 0 -2.453125 -0.640625q-1.0625 -0.65625 -1.671875 -1.796875q-0.609375 -1.140625 -0.609375 -2.59375q0 -1.359375 0.5625 -2.515625q0.578125 -1.15625 1.609375 -1.859375q1.03125 -0.703125 2.4375 -0.703125q1.421875 0 2.4375 0.625q1.015625 0.625 1.5625 1.734375q0.546875 1.09375 0.546875 2.515625q0 0.125 -0.015625 0.265625q0 0.125 -0.015625 0.21875l-8.1875 0l0 -1.3125l6.546875 0q-0.015625 -0.390625 -0.1875 -0.84375q-0.15625 -0.46875 -0.5 -0.859375q-0.34375 -0.40625 -0.875 -0.65625q-0.53125 -0.25 -1.3125 -0.25q-0.9375 0 -1.625 0.484375q-0.671875 0.46875 -1.046875 1.296875q-0.359375 0.8125 -0.359375 1.859375q0 1.203125 0.46875 2.015625q0.46875 0.796875 1.203125 1.1875q0.75 0.390625 1.546875 0.390625q1.046875 0 1.71875 -0.484375q0.6875 -0.5 1.09375 -1.234375l1.34375 0.65625q-0.5625 1.078125 -1.609375 1.796875q-1.03125 0.703125 -2.609375 0.703125zm10.4093 0q-1.2812481 0 -2.328123 -0.640625q-1.03125 -0.65625 -1.625 -1.796875q-0.59375 -1.140625 -0.59375 -2.609375q0 -1.484375 0.59375 -2.609375q0.59375 -1.140625 1.625 -1.796875q1.046875 -0.65625 2.328123 -0.65625q0.765625 0 1.390625 0.25q0.640625 0.234375 1.109375 0.625q0.484375 0.390625 0.75 0.84375l0.078125 0l-0.078125 -1.328125l0 -3.9375l1.59375 0l0 13.359375l-1.515625 0l0 -1.40625l-0.078125 0q-0.265625 0.4375 -0.75 0.828125q-0.46875 0.390625 -1.109375 0.625q-0.625 0.25 -1.390625 0.25zm0.171875 -1.4375q0.828125 0 1.546875 -0.4375q0.71875 -0.4375 1.15625 -1.25q0.453125 -0.8125 0.453125 -1.921875q0 -1.125 -0.453125 -1.9375q-0.4375 -0.8125 -1.15625 -1.25q-0.71875 -0.4375 -1.546875 -0.4375q-0.8125 0 -1.5312481 0.4375q-0.71875 0.4375 -1.171875 1.25q-0.453125 0.8125 -0.453125 1.9375q0 1.09375 0.453125 1.921875q0.453125 0.8125 1.171875 1.25q0.7187481 0.4375 1.5312481 0.4375zm17.48285 1.4375q-1.46875 0 -2.71875 -0.53125q-1.25 -0.546875 -2.1875 -1.5q-0.921875 -0.96875 -1.4375 -2.21875q-0.5 -1.265625 -0.5 -2.71875q0 -1.46875 0.5 -2.71875q0.515625 -1.265625 1.4375 -2.21875q0.9375 -0.96875 2.1875 -1.5q1.25 -0.546875 2.71875 -0.546875q1.46875 0 2.71875 0.546875q1.25 0.53125 2.171875 1.5q0.921875 0.953125 1.4375 2.21875q0.515625 1.25 0.515625 2.71875q0 1.453125 -0.515625 2.71875q-0.515625 1.25 -1.4375 2.21875q-0.921875 0.953125 -2.171875 1.5q-1.25 0.53125 -2.71875 0.53125zm0 -1.515625q1.453125 0 2.640625 -0.6875q1.1875 -0.6875 1.890625 -1.921875q0.703125 -1.25 0.703125 -2.84375q0 -1.625 -0.703125 -2.859375q-0.703125 -1.234375 -1.890625 -1.921875q-1.1875 -0.6875 -2.640625 -0.6875q-1.4375 0 -2.640625 0.6875q-1.1875 0.6875 -1.90625 1.921875q-0.703125 1.234375 -0.703125 2.859375q0 1.59375 0.703125 2.84375q0.71875 1.234375 1.90625 1.921875q1.203125 0.6875 2.640625 0.6875zm8.857132 5.25l0 -13.546875l1.515625 0l0 1.421875l0.078125 0q0.25 -0.453125 0.71875 -0.84375q0.484375 -0.390625 1.125 -0.625q0.640625 -0.25 1.3906288 -0.25q1.3125 0 2.328125 0.65625q1.03125 0.65625 1.625 1.796875q0.609375 1.125 0.609375 2.609375q0 1.46875 -0.609375 2.609375q-0.59375 1.140625 -1.625 1.796875q-1.015625 0.640625 -2.328125 0.640625q-1.1250038 0 -1.9843788 -0.515625q-0.859375 -0.53125 -1.25 -1.1875l-0.078125 0l0.078125 1.3125l0 4.125l-1.59375 0zm4.671875 -5.171875q0.8125038 0 1.5312538 -0.4375q0.71875 -0.4375 1.15625 -1.25q0.4375 -0.828125 0.4375 -1.921875q0 -1.125 -0.4375 -1.9375q-0.4375 -0.8125 -1.15625 -1.25q-0.71875 -0.4375 -1.5312538 -0.4375q-0.828125 0 -1.546875 0.4375q-0.71875 0.4375 -1.171875 1.25q-0.4375 0.8125 -0.4375 1.9375q0 1.109375 0.4375 1.921875q0.453125 0.8125 1.171875 1.25q0.71875 0.4375 1.546875 0.4375zm13.074818 1.4375q-1.46875 0 -2.71875 -0.53125q-1.25 -0.546875 -2.1875 -1.5q-0.921875 -0.96875 -1.4375 -2.21875q-0.5 -1.265625 -0.5 -2.71875q0 -1.46875 0.5 -2.71875q0.515625 -1.265625 1.4375 -2.21875q0.9375 -0.96875 2.1875 -1.5q1.25 -0.546875 2.71875 -0.546875q1.46875 0 2.71875 0.546875q1.25 0.53125 2.171875 1.5q0.921875 0.953125 1.4375 2.21875q0.515625 1.25 0.515625 2.71875q0 1.453125 -0.515625 2.71875q-0.515625 1.25 -1.4375 2.21875q-0.921875 0.953125 -2.171875 1.5q-1.25 0.53125 -2.71875 0.53125zm0 -1.515625q1.453125 0 2.640625 -0.6875q1.1875 -0.6875 1.890625 -1.921875q0.703125 -1.25 0.703125 -2.84375q0 -1.625 -0.703125 -2.859375q-0.703125 -1.234375 -1.890625 -1.921875q-1.1875 -0.6875 -2.640625 -0.6875q-1.4375 0 -2.640625 0.6875q-1.1875 0.6875 -1.90625 1.921875q-0.703125 1.234375 -0.703125 2.859375q0 1.59375 0.703125 2.84375q0.71875 1.234375 1.90625 1.921875q1.203125 0.6875 2.640625 0.6875zm8.857132 5.25l0 -13.546875l1.515625 0l0 1.421875l0.078125 0q0.25 -0.453125 0.71875 -0.84375q0.484375 -0.390625 1.125 -0.625q0.640625 -0.25 1.390625 -0.25q1.3125 0 2.328125 0.65625q1.03125 0.65625 1.625 1.796875q0.609375 1.125 0.609375 2.609375q0 1.46875 -0.609375 2.609375q-0.59375 1.140625 -1.625 1.796875q-1.015625 0.640625 -2.328125 0.640625q-1.125 0 -1.984375 -0.515625q-0.859375 -0.53125 -1.25 -1.1875l-0.078125 0l0.078125 1.3125l0 4.125l-1.59375 0zm4.671875 -5.171875q0.8125 0 1.53125 -0.4375q0.71875 -0.4375 1.15625 -1.25q0.4375 -0.828125 0.4375 -1.921875q0 -1.125 -0.4375 -1.9375q-0.4375 -0.8125 -1.15625 -1.25q-0.71875 -0.4375 -1.53125 -0.4375q-0.828125 0 -1.546875 0.4375q-0.71875 0.4375 -1.171875 1.25q-0.4375 0.8125 -0.4375 1.9375q0 1.109375 0.4375 1.921875q0.453125 0.8125 1.171875 1.25q0.71875 0.4375 1.546875 0.4375zm10.793564 1.4375q-1.375 0 -2.453125 -0.640625q-1.0625 -0.65625 -1.671875 -1.796875q-0.609375 -1.140625 -0.609375 -2.59375q0 -1.359375 0.5625 -2.515625q0.578125 -1.15625 1.609375 -1.859375q1.03125 -0.703125 2.4375 -0.703125q1.421875 0 2.4375 0.625q1.015625 0.625 1.5625 1.734375q0.546875 1.09375 0.546875 2.515625q0 0.125 -0.015625 0.265625q0 0.125 -0.015625 0.21875l-8.1875 0l0 -1.3125l6.546875 0q-0.015625 -0.390625 -0.1875 -0.84375q-0.15625 -0.46875 -0.5 -0.859375q-0.34375 -0.40625 -0.875 -0.65625q-0.53125 -0.25 -1.3125 -0.25q-0.9375 0 -1.625 0.484375q-0.671875 0.46875 -1.046875 1.296875q-0.359375 0.8125 -0.359375 1.859375q0 1.203125 0.46875 2.015625q0.46875 0.796875 1.203125 1.1875q0.75 0.390625 1.546875 0.390625q1.046875 0 1.71875 -0.484375q0.6875 -0.5 1.09375 -1.234375l1.34375 0.65625q-0.5625 1.078125 -1.609375 1.796875q-1.03125 0.703125 -2.609375 0.703125zm6.2690887 -0.296875l0 -9.515625l1.515625 0l0 1.53125l0.078125 0q0.1875 -0.546875 0.625 -0.9375q0.4375 -0.40625 1.0 -0.640625q0.578125 -0.234375 1.125 -0.234375q0.4375 0 0.671875 0.046875q0.25 0.046875 0.453125 0.140625l0 1.71875q-0.296875 -0.15625 -0.640625 -0.21875q-0.34375 -0.078125 -0.703125 -0.078125q-0.6875 0 -1.265625 0.390625q-0.578125 0.390625 -0.921875 1.046875q-0.34375 0.65625 -0.34375 1.4375l0 5.3125l-1.59375 0zm9.669632 0.296875q-1.0625 0 -1.875 -0.40625q-0.796875 -0.40625 -1.25 -1.125q-0.453125 -0.71875 -0.453125 -1.640625q0 -1.046875 0.53125 -1.765625q0.546875 -0.71875 1.453125 -1.078125q0.921875 -0.359375 2.015625 -0.359375q0.640625 0 1.171875 0.109375q0.546875 0.09375 0.9375 0.234375q0.40625 0.140625 0.625 0.265625l0 -0.578125q0 -1.078125 -0.765625 -1.703125q-0.765625 -0.640625 -1.875 -0.640625q-0.78125 0 -1.46875 0.34375q-0.671875 0.34375 -1.0625 0.953125l-1.203125 -0.890625q0.375 -0.5625 0.9375 -0.96875q0.5625 -0.40625 1.28125 -0.625q0.71875 -0.234375 1.515625 -0.234375q1.9375 0 3.03125 1.03125q1.109375 1.015625 1.109375 2.75l0 6.03125l-1.5 0l0 -1.359375l-0.078125 0q-0.25 0.40625 -0.703125 0.796875q-0.4375 0.375 -1.046875 0.609375q-0.609375 0.25 -1.328125 0.25zm0.140625 -1.390625q0.828125 0 1.5 -0.40625q0.6875 -0.421875 1.09375 -1.109375q0.421875 -0.6875 0.421875 -1.515625q-0.4375 -0.296875 -1.078125 -0.484375q-0.640625 -0.1875 -1.40625 -0.1875q-1.359375 0 -2.0 0.5625q-0.640625 0.5625 -0.640625 1.375q0 0.78125 0.59375 1.28125q0.609375 0.484375 1.515625 0.484375zm6.7189636 1.09375l0 -9.515625l1.515625 0l0 1.40625l0.078125 0q0.375 -0.703125 1.21875 -1.203125q0.84375 -0.5 1.859375 -0.5q1.75 0 2.625 1.015625q0.890625 1.015625 0.890625 2.703125l0 6.09375l-1.578125 0l0 -5.859375q0 -1.375 -0.671875 -1.9375q-0.65625 -0.578125 -1.703125 -0.578125q-0.78125 0 -1.375 0.4375q-0.59375 0.4375 -0.9375 1.125q-0.328125 0.6875 -0.328125 1.453125l0 5.359375l-1.59375 0zm14.513031 0.296875q-1.28125 0 -2.328125 -0.640625q-1.03125 -0.65625 -1.625 -1.796875q-0.59375 -1.140625 -0.59375 -2.609375q0 -1.484375 0.59375 -2.609375q0.59375 -1.140625 1.625 -1.796875q1.046875 -0.65625 2.328125 -0.65625q0.765625 0 1.390625 0.25q0.640625 0.234375 1.109375 0.625q0.484375 0.390625 0.75 0.84375l0.078125 0l-0.078125 -1.328125l0 -3.9375l1.59375 0l0 13.359375l-1.515625 0l0 -1.40625l-0.078125 0q-0.265625 0.4375 -0.75 0.828125q-0.46875 0.390625 -1.109375 0.625q-0.625 0.25 -1.390625 0.25zm0.171875 -1.4375q0.828125 0 1.546875 -0.4375q0.71875 -0.4375 1.15625 -1.25q0.453125 -0.8125 0.453125 -1.921875q0 -1.125 -0.453125 -1.9375q-0.4375 -0.8125 -1.15625 -1.25q-0.71875 -0.4375 -1.546875 -0.4375q-0.8125 0 -1.53125 0.4375q-0.71875 0.4375 -1.171875 1.25q-0.453125 0.8125 -0.453125 1.9375q0 1.09375 0.453125 1.921875q0.453125 0.8125 1.171875 1.25q0.71875 0.4375 1.53125 0.4375zm5.871689 1.140625l4.078125 -13.359375l1.453125 0l-4.0625 13.359375l-1.46875 0zm12.879791 0.296875q-1.46875 0 -2.71875 -0.53125q-1.25 -0.546875 -2.1875 -1.5q-0.921875 -0.96875 -1.4375 -2.21875q-0.5 -1.265625 -0.5 -2.71875q0 -1.46875 0.5 -2.71875q0.515625 -1.265625 1.4375 -2.21875q0.9375 -0.96875 2.1875 -1.5q1.25 -0.546875 2.71875 -0.546875q1.46875 0 2.71875 0.546875q1.25 0.53125 2.171875 1.5q0.921875 0.953125 1.4375 2.21875q0.515625 1.25 0.515625 2.71875q0 1.453125 -0.515625 2.71875q-0.515625 1.25 -1.4375 2.21875q-0.921875 0.953125 -2.171875 1.5q-1.25 0.53125 -2.71875 0.53125zm0 -1.515625q1.453125 0 2.640625 -0.6875q1.1875 -0.6875 1.890625 -1.921875q0.703125 -1.25 0.703125 -2.84375q0 -1.625 -0.703125 -2.859375q-0.703125 -1.234375 -1.890625 -1.921875q-1.1875 -0.6875 -2.640625 -0.6875q-1.4375 0 -2.640625 0.6875q-1.1875 0.6875 -1.90625 1.921875q-0.703125 1.234375 -0.703125 2.859375q0 1.59375 0.703125 2.84375q0.71875 1.234375 1.90625 1.921875q1.203125 0.6875 2.640625 0.6875zm8.857132 5.25l0 -13.546875l1.515625 0l0 1.421875l0.078125 0q0.25 -0.453125 0.71875 -0.84375q0.484375 -0.390625 1.125 -0.625q0.640625 -0.25 1.390625 -0.25q1.3125 0 2.328125 0.65625q1.03125 0.65625 1.625 1.796875q0.609375 1.125 0.609375 2.609375q0 1.46875 -0.609375 2.609375q-0.59375 1.140625 -1.625 1.796875q-1.015625 0.640625 -2.328125 0.640625q-1.125 0 -1.984375 -0.515625q-0.859375 -0.53125 -1.25 -1.1875l-0.078125 0l0.078125 1.3125l0 4.125l-1.59375 0zm4.671875 -5.171875q0.8125 0 1.53125 -0.4375q0.71875 -0.4375 1.15625 -1.25q0.4375 -0.828125 0.4375 -1.921875q0 -1.125 -0.4375 -1.9375q-0.4375 -0.8125 -1.15625 -1.25q-0.71875 -0.4375 -1.53125 -0.4375q-0.828125 0 -1.546875 0.4375q-0.71875 0.4375 -1.171875 1.25q-0.4375 0.8125 -0.4375 1.9375q0 1.109375 0.4375 1.921875q0.453125 0.8125 1.171875 1.25q0.71875 0.4375 1.546875 0.4375zm6.887314 1.140625l0 -13.359375l4.5 0q1.125 0 2.0625 0.515625q0.953125 0.5 1.515625 1.390625q0.578125 0.875 0.578125 2.046875q0 0.71875 -0.265625 1.421875q-0.265625 0.6875 -0.8125 1.265625q-0.546875 0.578125 -1.40625 0.921875q-0.84375 0.34375 -2.015625 0.34375l-3.328125 0l0 -1.46875l3.578125 0q0.703125 0 1.296875 -0.3125q0.59375 -0.328125 0.96875 -0.875q0.375 -0.5625 0.375 -1.296875q0 -0.59375 -0.3125 -1.15625q-0.3125 -0.5625 -0.875 -0.921875q-0.546875 -0.359375 -1.328125 -0.359375l-2.953125 0l0 11.84375l-1.578125 0zm3.21875 -6.125l1.734375 -0.09375l4.21875 6.140625l0 0.078125l-1.859375 0l-4.09375 -6.125zm11.433411 6.421875q-1.375 0 -2.453125 -0.640625q-1.0625 -0.65625 -1.671875 -1.796875q-0.609375 -1.140625 -0.609375 -2.59375q0 -1.359375 0.5625 -2.515625q0.578125 -1.15625 1.609375 -1.859375q1.03125 -0.703125 2.4375 -0.703125q1.421875 0 2.4375 0.625q1.015625 0.625 1.5625 1.734375q0.546875 1.09375 0.546875 2.515625q0 0.125 -0.015625 0.265625q0 0.125 -0.015625 0.21875l-8.1875 0l0 -1.3125l6.546875 0q-0.015625 -0.390625 -0.1875 -0.84375q-0.15625 -0.46875 -0.5 -0.859375q-0.34375 -0.40625 -0.875 -0.65625q-0.53125 -0.25 -1.3125 -0.25q-0.9375 0 -1.625 0.484375q-0.671875 0.46875 -1.046875 1.296875q-0.359375 0.8125 -0.359375 1.859375q0 1.203125 0.46875 2.015625q0.46875 0.796875 1.203125 1.1875q0.75 0.390625 1.546875 0.390625q1.046875 0 1.71875 -0.484375q0.6875 -0.5 1.09375 -1.234375l1.34375 0.65625q-0.5625 1.078125 -1.609375 1.796875q-1.03125 0.703125 -2.609375 0.703125zm9.690552 0q-1.0625 0 -1.875 -0.34375q-0.8125 -0.34375 -1.34375 -0.921875q-0.53125 -0.59375 -0.796875 -1.28125l1.421875 -0.640625q0.375 0.859375 1.078125 1.328125q0.703125 0.46875 1.625 0.46875q0.875 0 1.453125 -0.359375q0.59375 -0.359375 0.59375 -1.046875q0 -0.421875 -0.25 -0.703125q-0.234375 -0.296875 -0.703125 -0.5q-0.453125 -0.21875 -1.125 -0.375l-1.15625 -0.3125q-0.671875 -0.1875 -1.28125 -0.515625q-0.59375 -0.34375 -0.953125 -0.875q-0.359375 -0.53125 -0.359375 -1.296875q0 -0.84375 0.5 -1.453125q0.5 -0.625 1.3125 -0.953125q0.828125 -0.328125 1.765625 -0.328125q0.8125 0 1.53125 0.234375q0.71875 0.234375 1.265625 0.6875q0.546875 0.453125 0.8125 1.125l-1.375 0.640625q-0.359375 -0.703125 -0.953125 -0.984375q-0.59375 -0.28125 -1.328125 -0.28125q-0.78125 0 -1.359375 0.34375q-0.578125 0.34375 -0.578125 0.9375q0 0.59375 0.46875 0.90625q0.484375 0.296875 1.171875 0.46875l1.390625 0.359375q1.390625 0.359375 2.09375 1.0625q0.71875 0.6875 0.71875 1.703125q0 0.890625 -0.515625 1.546875q-0.5 0.65625 -1.359375 1.015625q-0.84375 0.34375 -1.890625 0.34375zm8.949402 0q-1.75 0 -2.640625 -1.015625q-0.890625 -1.03125 -0.890625 -2.8125l0 -5.984375l1.59375 0l0 5.75q0 1.421875 0.65625 2.03125q0.65625 0.59375 1.625 0.59375q0.828125 0 1.4375 -0.4375q0.625 -0.4375 0.953125 -1.125q0.34375 -0.6875 0.34375 -1.4375l0 -5.375l1.59375 0l0 9.515625l-1.515625 0l0 -1.375l-0.078125 0q-0.234375 0.4375 -0.71875 0.828125q-0.46875 0.375 -1.078125 0.609375q-0.609375 0.234375 -1.28125 0.234375zm7.013031 -0.296875l0 -13.359375l1.59375 0l0 13.359375l-1.59375 0zm3.3735504 -9.515625l5.578125 0l0 1.4375l-5.578125 0l0 -1.4375zm1.671875 7.015625l0 -9.703125l1.578125 0l0 9.3125q0 0.75 0.3125 1.15625q0.3125 0.40625 1.015625 0.40625q0.3125 0 0.578125 -0.09375q0.265625 -0.09375 0.46875 -0.21875l0 1.546875q-0.25 0.109375 -0.546875 0.171875q-0.28125 0.078125 -0.765625 0.078125q-1.1875 0 -1.921875 -0.703125q-0.71875 -0.703125 -0.71875 -1.953125zm10.122162 6.53125l0 -13.546875l1.515625 0l0 1.421875l0.078125 0q0.25 -0.453125 0.71875 -0.84375q0.484375 -0.390625 1.125 -0.625q0.640625 -0.25 1.390625 -0.25q1.3125 0 2.328125 0.65625q1.03125 0.65625 1.625 1.796875q0.609375 1.125 0.609375 2.609375q0 1.46875 -0.609375 2.609375q-0.59375 1.140625 -1.625 1.796875q-1.015625 0.640625 -2.328125 0.640625q-1.125 0 -1.984375 -0.515625q-0.859375 -0.53125 -1.25 -1.1875l-0.078125 0l0.078125 1.3125l0 4.125l-1.59375 0zm4.671875 -5.171875q0.8125 0 1.53125 -0.4375q0.71875 -0.4375 1.15625 -1.25q0.4375 -0.828125 0.4375 -1.921875q0 -1.125 -0.4375 -1.9375q-0.4375 -0.8125 -1.15625 -1.25q-0.71875 -0.4375 -1.53125 -0.4375q-0.828125 0 -1.546875 0.4375q-0.71875 0.4375 -1.171875 1.25q-0.4375 0.8125 -0.4375 1.9375q0 1.109375 0.4375 1.921875q0.453125 0.8125 1.171875 1.25q0.71875 0.4375 1.546875 0.4375zm9.606064 1.4375q-1.0625 0 -1.875 -0.40625q-0.796875 -0.40625 -1.25 -1.125q-0.453125 -0.71875 -0.453125 -1.640625q0 -1.046875 0.53125 -1.765625q0.546875 -0.71875 1.453125 -1.078125q0.921875 -0.359375 2.015625 -0.359375q0.640625 0 1.171875 0.109375q0.546875 0.09375 0.9375 0.234375q0.40625 0.140625 0.625 0.265625l0 -0.578125q0 -1.078125 -0.765625 -1.703125q-0.765625 -0.640625 -1.875 -0.640625q-0.78125 0 -1.46875 0.34375q-0.671875 0.34375 -1.0625 0.953125l-1.203125 -0.890625q0.375 -0.5625 0.9375 -0.96875q0.5625 -0.40625 1.28125 -0.625q0.71875 -0.234375 1.515625 -0.234375q1.9375 0 3.03125 1.03125q1.109375 1.015625 1.109375 2.75l0 6.03125l-1.5 0l0 -1.359375l-0.078125 0q-0.25 0.40625 -0.703125 0.796875q-0.4375 0.375 -1.046875 0.609375q-0.609375 0.25 -1.328125 0.25zm0.140625 -1.390625q0.828125 0 1.5 -0.40625q0.6875 -0.421875 1.09375 -1.109375q0.421875 -0.6875 0.421875 -1.515625q-0.4375 -0.296875 -1.078125 -0.484375q-0.640625 -0.1875 -1.40625 -0.1875q-1.359375 0 -2.0 0.5625q-0.640625 0.5625 -0.640625 1.375q0 0.78125 0.59375 1.28125q0.609375 0.484375 1.515625 0.484375zm6.8751984 1.09375l0 -9.515625l1.578125 0l0 9.515625l-1.578125 0zm0.78125 -11.265625q-0.46875 0 -0.8125 -0.328125q-0.328125 -0.34375 -0.328125 -0.8125q0 -0.484375 0.328125 -0.8125q0.34375 -0.328125 0.8125 -0.328125q0.484375 0 0.8125 0.328125q0.328125 0.328125 0.328125 0.8125q0 0.46875 -0.328125 0.8125q-0.328125 0.328125 -0.8125 0.328125zm3.2788086 11.265625l0 -9.515625l1.515625 0l0 1.53125l0.078125 0q0.1875 -0.546875 0.625 -0.9375q0.4375 -0.40625 1.0 -0.640625q0.578125 -0.234375 1.125 -0.234375q0.4375 0 0.671875 0.046875q0.25 0.046875 0.453125 0.140625l0 1.71875q-0.296875 -0.15625 -0.640625 -0.21875q-0.34375 -0.078125 -0.703125 -0.078125q-0.6875 0 -1.265625 0.390625q-0.578125 0.390625 -0.921875 1.046875q-0.34375 0.65625 -0.34375 1.4375l0 5.3125l-1.59375 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m114.61155 124.00787l-11.527557 -43.244095" fill-rule="evenodd"/><path stroke="#ff0000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m114.61155 124.00787l-9.982109 -37.44654" fill-rule="evenodd"/><path fill="#ff0000" stroke="#ff0000" stroke-width="1.0" stroke-linecap="butt" d="m106.22544 86.13589l-2.7649002 -3.9595337l-0.42710114 4.810417z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m51.220474 125.220474l604.50397 0l0 32.72441l-604.50397 0z" fill-rule="evenodd"/><path fill="#000000" d="m60.845474 152.14047l2.78125 -13.359375l1.65625 0l-1.0 4.78125q0.78125 -0.71875 1.421875 -1.015625q0.640625 -0.296875 1.328125 -0.296875q1.359375 0 2.265625 1.015625q0.90625 1.0 0.90625 2.9375q0 1.28125 -0.375 2.359375q-0.359375 1.0625 -0.890625 1.78125q-0.53125 0.71875 -1.109375 1.15625q-0.578125 0.4375 -1.1875 0.640625q-0.59375 0.21875 -1.140625 0.21875q-0.96875 0 -1.703125 -0.5q-0.71875 -0.515625 -1.125 -1.546875l-0.390625 1.828125l-1.4375 0zm2.4375 -3.96875l-0.015625 0.3125q0 1.234375 0.59375 1.890625q0.59375 0.640625 1.484375 0.640625q0.859375 0 1.578125 -0.609375q0.734375 -0.609375 1.1875 -1.890625q0.46875 -1.28125 0.46875 -2.375q0 -1.21875 -0.59375 -1.890625q-0.578125 -0.671875 -1.4375 -0.671875q-0.90625 0 -1.65625 0.6875q-0.734375 0.6875 -1.234375 2.125q-0.375 1.0625 -0.375 1.78125zm14.531967 2.21875q-1.734375 1.96875 -3.5625 1.96875q-1.109375 0 -1.796875 -0.640625q-0.6875 -0.640625 -0.6875 -1.578125q0 -0.609375 0.296875 -2.09375l1.171875 -5.578125l1.65625 0l-1.296875 6.1875q-0.171875 0.765625 -0.171875 1.203125q0 0.546875 0.328125 0.859375q0.34375 0.296875 0.984375 0.296875q0.703125 0 1.359375 -0.328125q0.65625 -0.34375 1.125 -0.921875q0.484375 -0.578125 0.796875 -1.359375q0.1875 -0.5 0.453125 -1.765625l0.875 -4.171875l1.65625 0l-2.03125 9.671875l-1.515625 0l0.359375 -1.75zm4.000717 1.75l1.765625 -8.40625l-1.484375 0l0.265625 -1.265625l1.484375 0l0.28125 -1.375q0.21875 -1.03125 0.4375 -1.484375q0.234375 -0.453125 0.75 -0.75q0.53125 -0.296875 1.4375 -0.296875q0.625 0 1.828125 0.265625l-0.296875 1.4375q-0.84375 -0.21875 -1.40625 -0.21875q-0.484375 0 -0.734375 0.25q-0.25 0.234375 -0.4375 1.125l-0.21875 1.046875l1.84375 0l-0.265625 1.265625l-1.84375 0l-1.75 8.40625l-1.65625 0zm5.183304 0l1.765625 -8.40625l-1.484375 0l0.265625 -1.265625l1.484375 0l0.28125 -1.375q0.21875 -1.03125 0.4375 -1.484375q0.234375 -0.453125 0.75 -0.75q0.53125 -0.296875 1.4375 -0.296875q0.625 0 1.828125 0.265625l-0.296875 1.4375q-0.84375 -0.21875 -1.40625 -0.21875q-0.484375 0 -0.734375 0.25q-0.25 0.234375 -0.4375 1.125l-0.21875 1.046875l1.84375 0l-0.265625 1.265625l-1.84375 0l-1.75 8.40625l-1.65625 0zm12.058304 -3.28125l1.609375 0.15625q-0.34375 1.1875 -1.59375 2.265625q-1.234375 1.078125 -2.96875 1.078125q-1.0625 0 -1.96875 -0.5q-0.890625 -0.5 -1.359375 -1.4375q-0.46875 -0.953125 -0.46875 -2.15625q0 -1.59375 0.734375 -3.078125q0.734375 -1.484375 1.890625 -2.203125q1.171875 -0.734375 2.53125 -0.734375q1.734375 0 2.765625 1.078125q1.03125 1.0625 1.03125 2.921875q0 0.71875 -0.125 1.46875l-7.125 0q-0.046875 0.28125 -0.046875 0.5q0 1.359375 0.625 2.078125q0.625 0.71875 1.53125 0.71875q0.84375 0 1.65625 -0.546875q0.828125 -0.5625 1.28125 -1.609375zm-4.78125 -2.40625l5.421875 0q0.015625 -0.25 0.015625 -0.359375q0 -1.234375 -0.625 -1.890625q-0.625 -0.671875 -1.59375 -0.671875q-1.0625 0 -1.9375 0.734375q-0.859375 0.71875 -1.28125 2.1875zm8.063217 5.6875l2.015625 -9.671875l1.453125 0l-0.40625 1.96875q0.75 -1.109375 1.453125 -1.640625q0.71875 -0.546875 1.46875 -0.546875q0.5 0 1.21875 0.359375l-0.671875 1.53125q-0.4375 -0.3125 -0.9375 -0.3125q-0.875 0 -1.78125 0.96875q-0.90625 0.953125 -1.4375 3.46875l-0.8125 3.875l-1.5625 0z" fill-rule="nonzero"/><path fill="#000000" d="m112.286285 156.06235q-1.359375 -1.703125 -2.296875 -4.0q-0.9375 -2.296875 -0.9375 -4.765625q0 -2.15625 0.703125 -4.140625q0.828125 -2.3125 2.53125 -4.59375l1.171875 0q-1.09375 1.890625 -1.453125 2.703125q-0.546875 1.25 -0.875 2.625q-0.390625 1.703125 -0.390625 3.421875q0 4.375 2.71875 8.75l-1.171875 0z" fill-rule="nonzero"/><path fill="#000000" d="m124.21771 139.26547l-8.40625 12.875l-1.484375 0l8.375 -12.875l1.515625 0zm-4.859375 2.546875q0 0.59375 -0.171875 1.109375q-0.171875 0.5 -0.5 0.875q-0.3125 0.359375 -0.78125 0.578125q-0.453125 0.21875 -1.03125 0.21875q-0.546875 0 -1.0 -0.171875q-0.4375 -0.171875 -0.75 -0.515625q-0.3125 -0.34375 -0.484375 -0.84375q-0.15625 -0.5 -0.15625 -1.15625q0 -0.59375 0.171875 -1.09375q0.171875 -0.5 0.484375 -0.875q0.328125 -0.375 0.78125 -0.578125q0.46875 -0.21875 1.03125 -0.21875q0.5625 0 1.0 0.171875q0.4375 0.171875 0.75 0.515625q0.3125 0.328125 0.484375 0.828125q0.171875 0.5 0.171875 1.15625zm-1.34375 0.046875q0 -0.796875 -0.296875 -1.171875q-0.28125 -0.390625 -0.796875 -0.390625q-0.265625 0 -0.46875 0.125q-0.203125 0.109375 -0.34375 0.328125q-0.140625 0.203125 -0.21875 0.5q-0.078125 0.28125 -0.078125 0.609375q0 0.8125 0.296875 1.203125q0.296875 0.375 0.8125 0.375q0.265625 0 0.46875 -0.125q0.203125 -0.125 0.34375 -0.328125q0.140625 -0.203125 0.203125 -0.484375q0.078125 -0.296875 0.078125 -0.640625zm6.046875 7.65625q0 0.578125 -0.171875 1.09375q-0.171875 0.5 -0.5 0.875q-0.328125 0.359375 -0.796875 0.578125q-0.453125 0.203125 -1.015625 0.203125q-0.546875 0 -1.0 -0.171875q-0.4375 -0.171875 -0.75 -0.5q-0.3125 -0.34375 -0.484375 -0.84375q-0.171875 -0.515625 -0.171875 -1.171875q0 -0.578125 0.171875 -1.078125q0.1875 -0.515625 0.5 -0.890625q0.328125 -0.375 0.78125 -0.578125q0.46875 -0.203125 1.03125 -0.203125q0.5625 0 1.0 0.171875q0.453125 0.171875 0.765625 0.515625q0.3125 0.328125 0.46875 0.84375q0.171875 0.5 0.171875 1.15625zm-1.328125 0.03125q0 -0.796875 -0.296875 -1.171875q-0.296875 -0.390625 -0.8125 -0.390625q-0.265625 0 -0.46875 0.125q-0.203125 0.109375 -0.359375 0.328125q-0.140625 0.203125 -0.21875 0.484375q-0.0625 0.28125 -0.0625 0.625q0 0.796875 0.28125 1.1875q0.296875 0.375 0.828125 0.375q0.265625 0 0.46875 -0.109375q0.203125 -0.125 0.34375 -0.328125q0.140625 -0.21875 0.21875 -0.5q0.078125 -0.296875 0.078125 -0.625zm3.4916687 -6.546875l1.453125 0l0.046875 1.6875q0.8125 -0.984375 1.59375 -1.421875q0.796875 -0.4375 1.59375 -0.4375q1.421875 0 2.15625 0.921875q0.734375 0.921875 0.671875 2.734375l-1.59375 0q0.015625 -1.203125 -0.359375 -1.734375q-0.375 -0.546875 -1.109375 -0.546875q-0.3125 0 -0.640625 0.109375q-0.328125 0.109375 -0.671875 0.359375q-0.328125 0.25 -0.71875 0.640625q-0.375 0.390625 -0.8125 0.953125l0 5.875l-1.609375 0l0 -9.140625z" fill-rule="nonzero"/><path fill="#000000" d="m136.96667 156.06235l-1.1875 0q2.734375 -4.375 2.734375 -8.75q0 -1.71875 -0.390625 -3.390625q-0.3125 -1.375 -0.875 -2.625q-0.359375 -0.828125 -1.46875 -2.734375l1.1875 0q1.703125 2.28125 2.53125 4.59375q0.6875 1.984375 0.6875 4.140625q0 2.46875 -0.9375 4.765625q-0.9375 2.296875 -2.28125 4.0zm10.770981 -11.734375l0 -1.859375l1.859375 0l0 1.859375l-1.859375 0zm0 7.8125l0 -1.875l1.859375 0l0 1.875l-1.859375 0zm13.355179 -7.859375l-8.828125 0l0 -1.515625l8.828125 0l0 1.515625zm0 4.0625l-8.828125 0l0 -1.53125l8.828125 0l0 1.53125z" fill-rule="nonzero"/><path fill="#000000" d="m167.93672 152.14047l2.78125 -13.359375l1.65625 0l-1.0 4.78125q0.78125 -0.71875 1.421875 -1.015625q0.640625 -0.296875 1.328125 -0.296875q1.359375 0 2.265625 1.015625q0.90625 1.0 0.90625 2.9375q0 1.28125 -0.375 2.359375q-0.359375 1.0625 -0.890625 1.78125q-0.53125 0.71875 -1.109375 1.15625q-0.578125 0.4375 -1.1875 0.640625q-0.59375 0.21875 -1.140625 0.21875q-0.96875 0 -1.703125 -0.5q-0.71875 -0.515625 -1.125 -1.546875l-0.390625 1.828125l-1.4375 0zm2.4375 -3.96875l-0.015625 0.3125q0 1.234375 0.59375 1.890625q0.59375 0.640625 1.484375 0.640625q0.859375 0 1.578125 -0.609375q0.734375 -0.609375 1.1875 -1.890625q0.46875 -1.28125 0.46875 -2.375q0 -1.21875 -0.59375 -1.890625q-0.578125 -0.671875 -1.4375 -0.671875q-0.90625 0 -1.65625 0.6875q-0.734375 0.6875 -1.234375 2.125q-0.375 1.0625 -0.375 1.78125zm14.531967 2.21875q-1.734375 1.96875 -3.5625 1.96875q-1.109375 0 -1.796875 -0.640625q-0.6875 -0.640625 -0.6875 -1.578125q0 -0.609375 0.296875 -2.09375l1.171875 -5.578125l1.65625 0l-1.296875 6.1875q-0.171875 0.765625 -0.171875 1.203125q0 0.546875 0.328125 0.859375q0.34375 0.296875 0.984375 0.296875q0.703125 0 1.359375 -0.328125q0.65625 -0.34375 1.125 -0.921875q0.484375 -0.578125 0.796875 -1.359375q0.1875 -0.5 0.453125 -1.765625l0.875 -4.171875l1.65625 0l-2.03125 9.671875l-1.515625 0l0.359375 -1.75zm4.000717 1.75l1.765625 -8.40625l-1.484375 0l0.265625 -1.265625l1.484375 0l0.28125 -1.375q0.21875 -1.03125 0.4375 -1.484375q0.234375 -0.453125 0.75 -0.75q0.53125 -0.296875 1.4375 -0.296875q0.625 0 1.828125 0.265625l-0.296875 1.4375q-0.84375 -0.21875 -1.40625 -0.21875q-0.484375 0 -0.734375 0.25q-0.25 0.234375 -0.4375 1.125l-0.21875 1.046875l1.84375 0l-0.265625 1.265625l-1.84375 0l-1.75 8.40625l-1.65625 0zm5.183304 0l1.765625 -8.40625l-1.484375 0l0.265625 -1.265625l1.484375 0l0.28125 -1.375q0.21875 -1.03125 0.4375 -1.484375q0.234375 -0.453125 0.75 -0.75q0.53125 -0.296875 1.4375 -0.296875q0.625 0 1.828125 0.265625l-0.296875 1.4375q-0.84375 -0.21875 -1.40625 -0.21875q-0.484375 0 -0.734375 0.25q-0.25 0.234375 -0.4375 1.125l-0.21875 1.046875l1.84375 0l-0.265625 1.265625l-1.84375 0l-1.75 8.40625l-1.65625 0zm12.058304 -3.28125l1.609375 0.15625q-0.34375 1.1875 -1.59375 2.265625q-1.234375 1.078125 -2.96875 1.078125q-1.0625 0 -1.96875 -0.5q-0.890625 -0.5 -1.359375 -1.4375q-0.46875 -0.953125 -0.46875 -2.15625q0 -1.59375 0.734375 -3.078125q0.734375 -1.484375 1.890625 -2.203125q1.171875 -0.734375 2.53125 -0.734375q1.734375 0 2.765625 1.078125q1.03125 1.0625 1.03125 2.921875q0 0.71875 -0.125 1.46875l-7.125 0q-0.046875 0.28125 -0.046875 0.5q0 1.359375 0.625 2.078125q0.625 0.71875 1.53125 0.71875q0.84375 0 1.65625 -0.546875q0.828125 -0.5625 1.28125 -1.609375zm-4.78125 -2.40625l5.421875 0q0.015625 -0.25 0.015625 -0.359375q0 -1.234375 -0.625 -1.890625q-0.625 -0.671875 -1.59375 -0.671875q-1.0625 0 -1.9375 0.734375q-0.859375 0.71875 -1.28125 2.1875zm8.063217 5.6875l2.015625 -9.671875l1.453125 0l-0.40625 1.96875q0.75 -1.109375 1.453125 -1.640625q0.71875 -0.546875 1.46875 -0.546875q0.5 0 1.21875 0.359375l-0.671875 1.53125q-0.4375 -0.3125 -0.9375 -0.3125q-0.875 0 -1.78125 0.96875q-0.90625 0.953125 -1.4375 3.46875l-0.8125 3.875l-1.5625 0z" fill-rule="nonzero"/><path fill="#000000" d="m219.37753 156.06235q-1.359375 -1.703125 -2.296875 -4.0q-0.9375 -2.296875 -0.9375 -4.765625q0 -2.15625 0.703125 -4.140625q0.828125 -2.3125 2.53125 -4.59375l1.171875 0q-1.09375 1.890625 -1.453125 2.703125q-0.546875 1.25 -0.875 2.625q-0.390625 1.703125 -0.390625 3.421875q0 4.375 2.71875 8.75l-1.171875 0z" fill-rule="nonzero"/><path fill="#000000" d="m231.30896 139.26547l-8.40625 12.875l-1.484375 0l8.375 -12.875l1.515625 0zm-4.859375 2.546875q0 0.59375 -0.171875 1.109375q-0.171875 0.5 -0.5 0.875q-0.3125 0.359375 -0.78125 0.578125q-0.453125 0.21875 -1.03125 0.21875q-0.546875 0 -1.0 -0.171875q-0.4375 -0.171875 -0.75 -0.515625q-0.3125 -0.34375 -0.484375 -0.84375q-0.15625 -0.5 -0.15625 -1.15625q0 -0.59375 0.171875 -1.09375q0.171875 -0.5 0.484375 -0.875q0.328125 -0.375 0.78125 -0.578125q0.46875 -0.21875 1.03125 -0.21875q0.5625 0 1.0 0.171875q0.4375 0.171875 0.75 0.515625q0.3125 0.328125 0.484375 0.828125q0.171875 0.5 0.171875 1.15625zm-1.34375 0.046875q0 -0.796875 -0.296875 -1.171875q-0.28125 -0.390625 -0.796875 -0.390625q-0.265625 0 -0.46875 0.125q-0.203125 0.109375 -0.34375 0.328125q-0.140625 0.203125 -0.21875 0.5q-0.078125 0.28125 -0.078125 0.609375q0 0.8125 0.296875 1.203125q0.296875 0.375 0.8125 0.375q0.265625 0 0.46875 -0.125q0.203125 -0.125 0.34375 -0.328125q0.140625 -0.203125 0.203125 -0.484375q0.078125 -0.296875 0.078125 -0.640625zm6.046875 7.65625q0 0.578125 -0.171875 1.09375q-0.171875 0.5 -0.5 0.875q-0.328125 0.359375 -0.796875 0.578125q-0.453125 0.203125 -1.015625 0.203125q-0.546875 0 -1.0 -0.171875q-0.4375 -0.171875 -0.75 -0.5q-0.3125 -0.34375 -0.484375 -0.84375q-0.171875 -0.515625 -0.171875 -1.171875q0 -0.578125 0.171875 -1.078125q0.1875 -0.515625 0.5 -0.890625q0.328125 -0.375 0.78125 -0.578125q0.46875 -0.203125 1.03125 -0.203125q0.5625 0 1.0 0.171875q0.453125 0.171875 0.765625 0.515625q0.3125 0.328125 0.46875 0.84375q0.171875 0.5 0.171875 1.15625zm-1.328125 0.03125q0 -0.796875 -0.296875 -1.171875q-0.296875 -0.390625 -0.8125 -0.390625q-0.265625 0 -0.46875 0.125q-0.203125 0.109375 -0.359375 0.328125q-0.140625 0.203125 -0.21875 0.484375q-0.0625 0.28125 -0.0625 0.625q0 0.796875 0.28125 1.1875q0.296875 0.375 0.828125 0.375q0.265625 0 0.46875 -0.109375q0.203125 -0.125 0.34375 -0.328125q0.140625 -0.21875 0.21875 -0.5q0.078125 -0.296875 0.078125 -0.625zm10.491669 2.46875q-0.53125 0.140625 -1.109375 0.1875q-0.578125 0.0625 -1.171875 0.0625q-1.71875 0 -2.5625 -0.78125q-0.84375 -0.78125 -0.84375 -2.390625l0 -4.765625l-2.5625 0l0 -1.328125l2.5625 0l0 -2.515625l1.578125 -0.40625l0 2.921875l4.109375 0l0 1.328125l-4.109375 0l0 4.640625q0 0.984375 0.515625 1.46875q0.53125 0.484375 1.546875 0.484375q0.4375 0 0.953125 -0.0625q0.53125 -0.0625 1.09375 -0.21875l0 1.375z" fill-rule="nonzero"/><path fill="#000000" d="m244.05792 156.06235l-1.1875 0q2.734375 -4.375 2.734375 -8.75q0 -1.71875 -0.390625 -3.390625q-0.3125 -1.375 -0.875 -2.625q-0.359375 -0.828125 -1.46875 -2.734375l1.1875 0q1.703125 2.28125 2.53125 4.59375q0.6875 1.984375 0.6875 4.140625q0 2.46875 -0.9375 4.765625q-0.9375 2.296875 -2.28125 4.0zm9.708481 -8.765625q0 -2.6875 1.484375 -3.96875q1.2499847 -1.078125 3.0468597 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609375q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.2812347 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.79685974 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.796875 -0.8125 -2.71875q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.79685974 0.90625 -0.79685974 2.765625zm9.281967 4.84375l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm11.411591 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm17.000732 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm10.922577 5.765625l-2.96875 -9.671875l1.703125 0l1.53125 5.578125l0.578125 2.078125q0.046875 -0.15625 0.5 -2.0l1.546875 -5.65625l1.6875 0l1.4375 5.609375l0.484375 1.84375l0.5625 -1.859375l1.65625 -5.59375l1.59375 0l-3.03125 9.671875l-1.703125 0l-1.53125 -5.796875l-0.375 -1.640625l-1.953125 7.4375l-1.71875 0zm23.1875 -1.1875q-0.921875 0.765625 -1.765625 1.09375q-0.828125 0.3125 -1.796875 0.3125q-1.59375 0 -2.453125 -0.78125q-0.859375 -0.78125 -0.859375 -1.984375q0 -0.71875 0.328125 -1.296875q0.328125 -0.59375 0.84375 -0.9375q0.53125 -0.359375 1.1875 -0.546875q0.46875 -0.125 1.453125 -0.25q1.984375 -0.234375 2.921875 -0.5625q0.015625 -0.34375 0.015625 -0.421875q0 -1.0 -0.46875 -1.421875q-0.625 -0.546875 -1.875 -0.546875q-1.15625 0 -1.703125 0.40625q-0.546875 0.40625 -0.8125 1.421875l-1.609375 -0.21875q0.21875 -1.015625 0.71875 -1.640625q0.5 -0.640625 1.453125 -0.984375q0.953125 -0.34375 2.1875 -0.34375q1.25 0 2.015625 0.296875q0.78125 0.28125 1.140625 0.734375q0.375 0.4375 0.515625 1.109375q0.078125 0.421875 0.078125 1.515625l0 2.1875q0 2.28125 0.109375 2.890625q0.109375 0.59375 0.40625 1.15625l-1.703125 0q-0.265625 -0.515625 -0.328125 -1.1875zm-0.140625 -3.671875q-0.890625 0.375 -2.671875 0.625q-1.015625 0.140625 -1.4375 0.328125q-0.421875 0.1875 -0.65625 0.53125q-0.21875 0.34375 -0.21875 0.78125q0 0.65625 0.5 1.09375q0.5 0.4375 1.453125 0.4375q0.9375 0 1.671875 -0.40625q0.75 -0.421875 1.09375 -1.140625q0.265625 -0.5625 0.265625 -1.640625l0 -0.609375zm4.156952 4.859375l0 -13.359375l1.640625 0l0 13.359375l-1.640625 0zm4.1448364 0l0 -13.359375l1.640625 0l0 13.359375l-1.640625 0zm3.5823364 -4.84375q0 -2.6875 1.484375 -3.96875q1.25 -1.078125 3.046875 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609375q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.28125 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.796875 -0.8125 -2.71875q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.796875 0.90625 -0.796875 2.765625zm15.610077 1.296875l1.609375 0.21875q-0.265625 1.65625 -1.359375 2.609375q-1.078125 0.9375 -2.671875 0.9375q-1.984375 0 -3.1875 -1.296875q-1.203125 -1.296875 -1.203125 -3.71875q0 -1.578125 0.515625 -2.75q0.515625 -1.171875 1.578125 -1.75q1.0625 -0.59375 2.3125 -0.59375q1.578125 0 2.578125 0.796875q1.0 0.796875 1.28125 2.265625l-1.59375 0.234375q-0.234375 -0.96875 -0.8125 -1.453125q-0.578125 -0.5 -1.390625 -0.5q-1.234375 0 -2.015625 0.890625q-0.78125 0.890625 -0.78125 2.8125q0 1.953125 0.75 2.84375q0.75 0.875 1.953125 0.875q0.96875 0 1.609375 -0.59375q0.65625 -0.59375 0.828125 -1.828125z" fill-rule="nonzero"/></g></svg>
+\ No newline at end of file
diff --git a/mlir/include/mlir-c/Dialect/IRDL.h b/mlir/include/mlir-c/Dialect/IRDL.h
new file mode 100644
index 000000000000..c4d6ffd989af
--- /dev/null
+++ b/mlir/include/mlir-c/Dialect/IRDL.h
@@ -0,0 +1,29 @@
+//===-- mlir-c/Dialect/IRDL.h - C API for IRDL --------------------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_C_DIALECT_IRDL_H
+#define MLIR_C_DIALECT_IRDL_H
+
+#include "mlir-c/IR.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(IRDL, irdl);
+
+/// Loads all IRDL dialects in the provided module, registering the dialects in
+/// the module's associated context.
+MLIR_CAPI_EXPORTED MlirLogicalResult mlirLoadIRDLDialects(MlirModule module);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // MLIR_C_DIALECT_IRDL_H
diff --git a/mlir/include/mlir/Analysis/DataFlowFramework.h b/mlir/include/mlir/Analysis/DataFlowFramework.h
index c76cfac07fc7..2580ec28b519 100644
--- a/mlir/include/mlir/Analysis/DataFlowFramework.h
+++ b/mlir/include/mlir/Analysis/DataFlowFramework.h
@@ -242,6 +242,17 @@ public:
     return static_cast<const StateT *>(it->second.get());
   }
 
+  /// Erase any analysis state associated with the given program point.
+  template <typename PointT>
+  void eraseState(PointT point) {
+    ProgramPoint pp(point);
+
+    for (auto it = analysisStates.begin(); it != analysisStates.end(); ++it) {
+      if (it->first.first == pp)
+        analysisStates.erase(it);
+    }
+  }
+
   /// Get a uniqued program point instance. If one is not present, it is
   /// created with the provided arguments.
   template <typename PointT, typename... Args>
diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
index 67c7a964feef..7f25db029781 100644
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -13,6 +13,7 @@
 #ifndef MLIR_DIALECT_AFFINE_UTILS_H
 #define MLIR_DIALECT_AFFINE_UTILS_H
 
+#include "mlir/Analysis/AliasAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/IR/OpDefinition.h"
@@ -106,7 +107,8 @@ struct VectorizationStrategy {
 /// loads and eliminate invariant affine loads; consequently, eliminate dead
 /// allocs.
 void affineScalarReplace(func::FuncOp f, DominanceInfo &domInfo,
-                         PostDominanceInfo &postDomInfo);
+                         PostDominanceInfo &postDomInfo,
+                         AliasAnalysis &analysis);
 
 /// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
 /// 'vectorSizes'. By default, each vectorization factor is applied
@@ -325,7 +327,8 @@ OpFoldResult linearizeIndex(ArrayRef<OpFoldResult> multiIndex,
 /// will check if there is no write to the memory between `start` and `memOp`
 /// that would change the read within `memOp`.
 template <typename EffectType, typename T>
-bool hasNoInterveningEffect(Operation *start, T memOp);
+bool hasNoInterveningEffect(Operation *start, T memOp,
+                            llvm::function_ref<bool(Value, Value)> mayAlias);
 
 struct AffineValueExpr {
   explicit AffineValueExpr(AffineExpr e) : e(e) {}
diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
index f051e03efbcd..0e38325f9891 100644
--- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
@@ -115,7 +115,7 @@ class ArmSME_IntrLoadStoreOp<string mnemonic>
                     /*immArgPositions=*/[2],
                     /*immArgAttrNames=*/["tile_id"]>;
 
-// Loads
+// Loads (from memory to ZA tile slice)
 class ArmSME_IntrLoadOp<string mnemonic>
     : ArmSME_IntrLoadStoreOp<mnemonic>,
       Arguments<(ins Arg<SVEPredicate, "Vector predicate">:$predicate,
@@ -134,7 +134,7 @@ def LLVM_aarch64_sme_ld1w_vert : ArmSME_IntrLoadOp<"ld1w.vert">;
 def LLVM_aarch64_sme_ld1d_vert : ArmSME_IntrLoadOp<"ld1d.vert">;
 def LLVM_aarch64_sme_ld1q_vert : ArmSME_IntrLoadOp<"ld1q.vert">;
 
-// Stores
+// Stores (ZA tile slice to memory)
 class ArmSME_IntrStoreOp<string mnemonic>
     : ArmSME_IntrLoadStoreOp<mnemonic>,
       Arguments<(ins Arg<SVEPredicate, "Vector predicate">:$predicate,
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
index a729bc99b987..459c252b7071 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
@@ -166,6 +166,10 @@ struct BufferResultsToOutParamsOpts {
   /// If true, the pass adds a "bufferize.result" attribute to each output
   /// parameter.
   bool addResultAttribute = false;
+
+  /// If true, the pass eliminates the memref.alloc and memcpy if the returned
+  /// memref is allocated in the current function.
+  bool hoistStaticAllocs = false;
 };
 
 /// Creates a pass that converts memref function results to out-params.
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
index 1303dc2c9ae1..75ce85c9128c 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -315,11 +315,20 @@ def BufferResultsToOutParams : Pass<"buffer-results-to-out-params", "ModuleOp">
     The main issue with this pass (and the out-param calling convention) is that
     buffers for results need to be allocated in the caller. This currently only
     works for static shaped memrefs.
+
+    If the hoist-static-allocs option is on, the pass tries to eliminate the
+    allocation for the returned memref and avoid the memory-copy if possible.
+    This optimization applies on the returned memref which has static shape and
+    is allocated by memref.alloc in the function. It will use the memref given
+    in function argument to replace the allocated memref.
   }];
   let options = [
     Option<"addResultAttribute", "add-result-attr", "bool",
        /*default=*/"false",
        "Add the attribute 'bufferize.result' to all output parameters.">,
+    Option<"hoistStaticAllocs", "hoist-static-allocs",
+       "bool", /*default=*/"false",
+       "Hoist static allocations to call sites.">,
   ];
   let constructor = "mlir::bufferization::createBufferResultsToOutParamsPass()";
   let dependentDialects = ["memref::MemRefDialect"];
diff --git a/mlir/include/mlir/Dialect/IRDL/IRDLVerifiers.h b/mlir/include/mlir/Dialect/IRDL/IRDLVerifiers.h
index 9ecb7c0107d7..89e99a63a5f1 100644
--- a/mlir/include/mlir/Dialect/IRDL/IRDLVerifiers.h
+++ b/mlir/include/mlir/Dialect/IRDL/IRDLVerifiers.h
@@ -30,7 +30,10 @@ class DynamicTypeDefinition;
 namespace mlir {
 namespace irdl {
 
+class AttributeOp;
 class Constraint;
+class OperationOp;
+class TypeOp;
 
 /// Provides context to the verification of constraints.
 /// It contains the assignment of variables to attributes, and the assignment
@@ -246,6 +249,14 @@ private:
   std::optional<SmallVector<unsigned>> argumentConstraints;
   std::optional<size_t> blockCount;
 };
+
+/// Generate an op verifier function from the given IRDL operation definition.
+llvm::unique_function<LogicalResult(Operation *) const> createVerifier(
+    OperationOp operation,
+    const DenseMap<irdl::TypeOp, std::unique_ptr<DynamicTypeDefinition>>
+        &typeDefs,
+    const DenseMap<irdl::AttributeOp, std::unique_ptr<DynamicAttrDefinition>>
+        &attrDefs);
 } // namespace irdl
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index 6655ce6f123e..4b91708ea1aa 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -1795,7 +1795,7 @@ def LLVM_FenceOp : LLVM_Op<"fence">, LLVM_MemOpPatterns {
   let hasVerifier = 1;
 }
 
-def LLVM_InlineAsmOp : LLVM_Op<"inline_asm", []> {
+def LLVM_InlineAsmOp : LLVM_Op<"inline_asm", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
   let description = [{
     The InlineAsmOp mirrors the underlying LLVM semantics with a notable
     exception: the embedded `asm_string` is not allowed to define or reference
diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
index 85f11c66d29a..0a4ce8953136 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -141,4 +141,63 @@ def LinalgDetensorizePass : InterfacePass<"linalg-detensorize", "FunctionOpInter
   ];
 }
 
+def LinalgBlockPackMatmul : Pass<"linalg-block-pack-matmul"> {
+  let summary = "Convert linalg matmul ops to block layout and back";
+  let description = [{
+    Pack a matmul operation into blocked layout with two levels of subdivision:
+    - major 2D blocks - outer dimensions, consist of minor blocks
+    - minor 2D blocks - inner dimensions, consist of scalar elements
+
+    A 2D matmul MxNxK gets reshaped into blocked 4D representation
+    as: [MB][NB][mb][nb] += [MB][KB][mb][kb] * [NB][KB][nb][kb]
+    where the (MB, NB, KB) dimensions represent the major blocks,
+    and the (mb, nb, kb) are the minor blocks of their respective
+    original 2D dimensions (M, N, K).
+
+    Depending on the initial operands' data layout and the specified
+    packing options, the major blocks dimensions might get transposed
+    e.g., [MB][KB] -> [KB][MB]. The minor blocks can also be transposed
+    e.g., [mb][kb] -> [kb][mb].
+    Any present batch dimensions remain unchanged.
+    The final result is unpacked back to the original shape.
+
+    For example, given a matmul operation:
+    ```mlir
+      %res = linalg.matmul ins(%A, %B) outs(%C)
+    ```
+    the default transformation result can be represented as:
+    ```mlir
+      %A_packed = pack %A : 2D <MxK> -> 4D <MBxKBxmbxkb>
+      %B_packed = pack %B : 2D <KxN> -> 4D <NBxKBxnbxkb>
+      %C_packed = pack %C : 2D <MxN> -> 4D <MBxNBxmbxnb>
+      %res_packed = linalg.mmt4d ins(%A_packed, %B_packed) outs(%C_packed)
+      %res = unpack %res_packed : 4D <MBxNBxmbxnb> -> 2D <MxN>
+    ```
+  }];
+  let dependentDialects = ["linalg::LinalgDialect", "tensor::TensorDialect"];
+  let options = [
+    ListOption<"blockFactors", "block-factors", "int64_t",
+               "Block factors (mb, nb, kb) for relayout">,
+    Option<"allowPadding", "allow-padding", "bool",
+           /*default=*/"true",
+           "Allow packing padding">,
+    ListOption<"mnkPaddedSizesNextMultipleOf", "mnk-padded-multiples", "int64_t",
+               "Next multiples of the packing sizes">,
+    ListOption<"mnkOrder", "mnk-order", "int64_t",
+               "Permutation of matmul (M, N, K) dimensions order">,
+    Option<"lhsTransposeOuterBlocks", "lhs-transpose-outer-blocks", "bool",
+           /*default=*/"false",
+           "Transpose LHS outer block layout [MB][KB] -> [KB][MB]">,
+    Option<"lhsTransposeInnerBlocks", "lhs-transpose-inner-blocks", "bool",
+           /*default=*/"false",
+           "Transpose LHS inner block layout [mb][kb] -> [kb][mb]">,
+    Option<"rhsTransposeOuterBlocks", "rhs-transpose-outer-blocks", "bool",
+           /*default=*/"true",
+           "Transpose RHS outer block layout [KB][NB] -> [NB][KB]">,
+    Option<"rhsTransposeInnerBlocks", "rhs-transpose-inner-blocks", "bool",
+           /*default=*/"true",
+           "Transpose RHS inner block layout [kb][nb] -> [nb][kb]">
+  ];
+}
+
 #endif // MLIR_DIALECT_LINALG_PASSES
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index d0ad4ccdf031..5585ba27fdad 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -783,10 +783,9 @@ def PackOp : Op<Transform_Dialect, "structured.pack", [
   let assemblyFormat = [{
     $target
     `packed_sizes` `=` custom<DynamicIndexList>($packed_sizes,
-                                                $static_packed_sizes,
-                                                type($packed_sizes))
+                                                $static_packed_sizes)
     attr-dict
-    `:` functional-type($target, results)
+    `:` functional-type(operands, results)
   }];
 
   let builders = [
@@ -890,14 +889,13 @@ def PackGreedilyOp : Op<Transform_Dialect, "structured.pack_greedily", [
     $target
     oilist(
       `matmul_packed_sizes` `=` custom<DynamicIndexList>($matmul_packed_sizes,
-                                                         $static_matmul_packed_sizes,
-                                                         type($matmul_packed_sizes))
+                                                         $static_matmul_packed_sizes)
       (`matmul_padded_sizes_next_multiple_of` `=`
         $matmul_padded_sizes_next_multiple_of^)?
       `matmul_inner_dims_order` `=` $matmul_inner_dims_order
     )
     attr-dict
-    `:` functional-type($target, results)
+    `:` functional-type(operands, results)
   }];
   let hasVerifier = 1;
 
@@ -978,8 +976,8 @@ def PackTransposeOp : Op<Transform_Dialect, "structured.pack_transpose", [
 //===----------------------------------------------------------------------===//
 
 def PadOp : Op<Transform_Dialect, "structured.pad",
-    [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
-     DeclareOpInterfaceMethods<TransformOpInterface>,
+    [FunctionalStyleTransformOpTrait, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+     TransformOpInterface,
      ReportTrackingListenerFailuresOpTrait]> {
   let description = [{
     Pads the operations pointed to by the target handle using the options
@@ -1011,7 +1009,9 @@ def PadOp : Op<Transform_Dialect, "structured.pad",
     (ins TransformHandleTypeInterface:$target,
          DefaultValuedAttr<ArrayAttr, "{}">:$padding_values,
          DefaultValuedAttr<I64ArrayAttr, "{}">:$padding_dimensions,
-         OptionalAttr<I64ArrayAttr>:$pad_to_multiple_of,
+         Variadic<TransformAnyParamTypeOrAnyHandle>:$pad_to_multiple_of,
+         DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:
+                          $static_pad_to_multiple_of,
          DefaultValuedAttr<I64ArrayAttr, "{}">:$pack_paddings,
          DefaultValuedAttr<
           TypedArrayAttrBase<I64ArrayAttr, "array of arrays of i64">,
@@ -1021,8 +1021,13 @@ def PadOp : Op<Transform_Dialect, "structured.pad",
                       TransformHandleTypeInterface:$pad,
                       TransformHandleTypeInterface:$copy);
 
-  let assemblyFormat =
-    "$target attr-dict `:` functional-type(operands, results)";
+  let assemblyFormat = [{
+    $target 
+    (`pad_to_multiple_of` custom<DynamicIndexList>($pad_to_multiple_of, $static_pad_to_multiple_of)^)?
+    attr-dict
+    `:` functional-type(operands, results)
+  }];
+
   let hasVerifier = 1;
 
   let builders = [
@@ -1033,7 +1038,13 @@ def PadOp : Op<Transform_Dialect, "structured.pad",
     // TODO: support other operations (e.g. min, max etc).
     OpBuilder<(ins "Value":$target,
                    "ArrayRef<int64_t>":$paddingDimensions,
-                   CArg<"ArrayRef<int64_t>", "{}">:$padToMultipleOf,
+                   CArg<"ArrayRef<int64_t>", "{}">:$staticPadToMultipleOf,
+                   CArg<"ArrayRef<int64_t>", "{}">:$packPaddings,
+                   CArg<"ArrayRef<Attribute>", "{}">:$transposePaddings,
+                   CArg<"StringRef", "::mlir::bufferization::MaterializeInDestinationOp::getOperationName()">:$copyBackOp)>,
+    OpBuilder<(ins "Value":$target,
+                   "ArrayRef<int64_t>":$paddingDimensions,
+                   "ArrayRef<OpFoldResult>":$mixedPadToMultipleOf,
                    CArg<"ArrayRef<int64_t>", "{}">:$packPaddings,
                    CArg<"ArrayRef<Attribute>", "{}">:$transposePaddings,
                    CArg<"StringRef", "::mlir::bufferization::MaterializeInDestinationOp::getOperationName()">:$copyBackOp)>
@@ -1043,11 +1054,13 @@ def PadOp : Op<Transform_Dialect, "structured.pad",
     /// copy_back_op attribute value indicating that no copy back is desired.
     static constexpr StringRef kCopyOpNone = "none";
 
-    ::mlir::DiagnosedSilenceableFailure applyToOne(
-        ::mlir::transform::TransformRewriter &rewriter,
-        ::mlir::linalg::LinalgOp target,
-        ::mlir::transform::ApplyToEachResultList &results,
-        ::mlir::transform::TransformState &state);
+    /// Returns a mix of dynamic `pad_to_multiple_of` and static `static_pad_to_multiple_of`.
+    SmallVector<OpFoldResult> getMixedPadToMultipleOf();
+
+    ::mlir::DiagnosedSilenceableFailure apply(
+      ::mlir::transform::TransformRewriter &rewriter,
+      ::mlir::transform::TransformResults &results,
+      ::mlir::transform::TransformState &state);
   }];
 }
 
@@ -1884,7 +1897,17 @@ def TileUsingForOp : Op<Transform_Dialect, "structured.tile_using_for",
                       $scalableSizes)>,
   ];
 
-  let hasCustomAssemblyFormat = 1;
+  let assemblyFormat = [{
+    $target
+      `tile_sizes` custom<DynamicIndexList>(
+        $dynamic_sizes,
+        $static_sizes,
+        $scalable_sizes)
+      (`interchange` `=` $interchange^)?
+    attr-dict
+    `:` functional-type(operands, results)
+  }];
+
   let hasVerifier = 1;
 
   let extraClassDeclaration = [{
@@ -2002,17 +2025,13 @@ def TileUsingForallOp :
   let assemblyFormat = [{
     $target oilist(
         `num_threads` custom<PackedOrDynamicIndexList>($packed_num_threads,
-                                                       type($packed_num_threads),
                                                        $num_threads,
-                                                       type($num_threads),
                                                        $static_num_threads) |
          `tile_sizes` custom<PackedOrDynamicIndexList>($packed_tile_sizes,
-                                                       type($packed_tile_sizes),
                                                        $tile_sizes,
-                                                       type($tile_sizes),
                                                        $static_tile_sizes))
     (`(` `mapping` `=` $mapping^ `)`)? attr-dict
-    `:` functional-type($target, results)
+    `:` functional-type(operands, results)
   }];
   let hasVerifier = 1;
 
@@ -2147,7 +2166,18 @@ def VectorizeOp : Op<Transform_Dialect, "structured.vectorize",
 
   let results = (outs);
 
-  let hasCustomAssemblyFormat = 1;
+  // We use oilist here to elide the optional `vector_sizes` when empty list
+  // is passed.
+  let assemblyFormat = [{
+    $target oilist(
+      `vector_sizes` custom<DynamicIndexList>(
+        $vector_sizes,
+        $static_vector_sizes,
+        $scalable_sizes))
+    attr-dict
+    `:` type($target)(`,`type($vector_sizes)^)? 
+  }];
+
   let hasVerifier = 1;
 
   let extraClassDeclaration = [{
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 5ecf84fa9c70..f77c19ed0fcc 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -1162,6 +1162,66 @@ packMatmulGreedily(RewriterBase &rewriter, LinalgOp linalgOp,
                    ArrayRef<int64_t> mnkPaddedSizesNextMultipleOf,
                    ArrayRef<int64_t> mnkOrder);
 
+struct BlockPackMatmulOptions {
+  /// Minor block factors (mb, nb, kb) for packing relayout where mb, mn are
+  /// the parallel dimensions and kb is the reduction dimension.
+  SmallVector<int64_t, 3> blockFactors;
+
+  /// If true, allows packing of dimensions that only partially fit into the
+  /// block factors.
+  bool allowPadding = true;
+
+  /// Next multiples of the packing sizes.
+  SmallVector<int64_t, 3> mnkPaddedSizesNextMultipleOf;
+
+  /// Permutation of matmul (M, N, K) dimensions order.
+  SmallVector<int64_t, 3> mnkOrder = {0, 1, 2};
+
+  /// Transpose LHS outer block layout [MB][KB] -> [KB][MB].
+  bool lhsTransposeOuterBlocks = false;
+
+  /// Transpose LHS inner block layout [mb][kb] -> [kb][mb].
+  bool lhsTransposeInnerBlocks = false;
+
+  /// Transpose RHS outer block layout [KB][NB] -> [NB][KB].
+  bool rhsTransposeOuterBlocks = true;
+
+  /// Transpose RHS inner block layout [kb][nb] -> [nb][kb].
+  bool rhsTransposeInnerBlocks = true;
+};
+
+/// Function type which is used to control matmul packing.
+/// It is expected to return valid packing configuration for each operation.
+/// Lack of packing options indicates that no valid configuration could be
+/// assigned and the operation will not be packed.
+using ControlBlockPackMatmulFn =
+    std::function<std::optional<BlockPackMatmulOptions>(linalg::LinalgOp)>;
+
+/// Pack a matmul operation into blocked 4D layout.
+///
+/// Relayout a matmul operation into blocked layout with two levels of
+/// subdivision:
+///   - major 2D blocks - outer dimensions, consist of minor blocks
+///   - minor 2D blocks - inner dimensions, consist of scalar elements
+///
+/// A 2D matmul MxNxK gets reshaped into blocked 4D representation
+/// as: [MB][NB][mb][nb] += [MB][KB][mb][kb] * [NB][KB][nb][kb]
+/// where the (MB, NB, KB) dimensions represent the major blocks,
+/// and the (mb, nb, kb) are the minor blocks of their respective
+/// original 2D dimensions (M, N, K).
+///
+/// Depending on the initial operands' data layout and the specified
+/// packing options, the major blocks dimensions might get transposed
+/// e.g., [MB][KB] -> [KB][MB]. The minor blocks can also be transposed
+/// e.g., [mb][kb] -> [kb][mb].
+/// Any present batch dimensions remain unchanged.
+/// The final result is unpacked back to the original shape.
+///
+/// Return failure if no valid packing options are provided.
+FailureOr<PackResult>
+blockPackMatmul(RewriterBase &rewriter, linalg::LinalgOp linalgOp,
+                const ControlBlockPackMatmulFn &controlPackMatmul);
+
 /// Rewrite tensor.from_elements to linalg.generic.
 FailureOr<Operation *>
 rewriteInDestinationPassingStyle(RewriterBase &rewriter,
@@ -1628,6 +1688,10 @@ void populateSplitReductionPattern(
 void populateTransposeMatmulPatterns(RewritePatternSet &patterns,
                                      bool transposeLHS = true);
 
+/// Patterns to block pack Linalg matmul ops.
+void populateBlockPackMatmulPatterns(RewritePatternSet &patterns,
+                                     const ControlBlockPackMatmulFn &controlFn);
+
 } // namespace linalg
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/Math/Transforms/Passes.h b/mlir/include/mlir/Dialect/Math/Transforms/Passes.h
index e2c513047c77..ba6977251564 100644
--- a/mlir/include/mlir/Dialect/Math/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Math/Transforms/Passes.h
@@ -31,6 +31,9 @@ void populateExpandTanPattern(RewritePatternSet &patterns);
 void populateExpandSinhPattern(RewritePatternSet &patterns);
 void populateExpandCoshPattern(RewritePatternSet &patterns);
 void populateExpandTanhPattern(RewritePatternSet &patterns);
+void populateExpandAsinhPattern(RewritePatternSet &patterns);
+void populateExpandAcoshPattern(RewritePatternSet &patterns);
+void populateExpandAtanhPattern(RewritePatternSet &patterns);
 void populateExpandFmaFPattern(RewritePatternSet &patterns);
 void populateExpandFloorFPattern(RewritePatternSet &patterns);
 void populateExpandCeilFPattern(RewritePatternSet &patterns);
@@ -39,6 +42,7 @@ void populateExpandPowFPattern(RewritePatternSet &patterns);
 void populateExpandFPowIPattern(RewritePatternSet &patterns);
 void populateExpandRoundFPattern(RewritePatternSet &patterns);
 void populateExpandRoundEvenPattern(RewritePatternSet &patterns);
+void populateExpandRsqrtPattern(RewritePatternSet &patterns);
 void populateMathAlgebraicSimplificationPatterns(RewritePatternSet &patterns);
 
 struct MathPolynomialApproximationOptions {
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index 14b8d95ea15b..5738b6ca51c1 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -1578,7 +1578,8 @@ class MemRef_ReassociativeReshapeOp<string mnemonic, list<Trait> traits = []> :
 }
 
 def MemRef_ExpandShapeOp : MemRef_ReassociativeReshapeOp<"expand_shape", [
-    DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>]> {
+    DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>,
+    DeclareOpInterfaceMethods<ReifyRankedShapedTypeOpInterface>]> {
   let summary = "operation to produce a memref with a higher rank.";
   let description = [{
     The `memref.expand_shape` op produces a new view with a higher rank whose
diff --git a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
index 7d9a5e6ca759..46003ed84686 100644
--- a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
+++ b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
@@ -64,6 +64,35 @@ getLinearizedMemRefOffsetAndSize(OpBuilder &builder, Location loc, int srcBits,
 // it means both the allocations and associated stores can be removed.
 void eraseDeadAllocAndStores(RewriterBase &rewriter, Operation *parentOp);
 
+/// Given a set of sizes, return the suffix product.
+///
+/// When applied to slicing, this is the calculation needed to derive the
+/// strides (i.e. the number of linear indices to skip along the (k-1) most
+/// minor dimensions to get the next k-slice).
+///
+/// This is the basis to linearize an n-D offset confined to `[0 ... sizes]`.
+///
+/// Assuming `sizes` is `[s0, .. sn]`, return the vector<Value>
+///   `[s1 * ... * sn, s2 * ... * sn, ..., sn, 1]`.
+///
+/// It is the caller's responsibility to provide valid OpFoldResult type values
+/// and construct valid IR in the end.
+///
+/// `sizes` elements are asserted to be non-negative.
+///
+/// Return an empty vector if `sizes` is empty.
+///
+/// The function emits an IR block which computes suffix product for provided
+/// sizes.
+SmallVector<OpFoldResult>
+computeSuffixProductIRBlock(Location loc, OpBuilder &builder,
+                            ArrayRef<OpFoldResult> sizes);
+inline SmallVector<OpFoldResult>
+computeStridesIRBlock(Location loc, OpBuilder &builder,
+                      ArrayRef<OpFoldResult> sizes) {
+  return computeSuffixProductIRBlock(loc, builder, sizes);
+}
+
 } // namespace memref
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index a40676d071e6..f248be1639fe 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -1423,10 +1423,12 @@ def MapInfoOp : OpenMP_Op<"map.info", [AttrSizedOperandSegments]> {
                        TypeAttr:$var_type,
                        Optional<OpenMP_PointerLikeType>:$var_ptr_ptr,
                        Variadic<OpenMP_PointerLikeType>:$members,
+                       OptionalAttr<AnyIntElementsAttr>:$members_index,
                        Variadic<MapBoundsType>:$bounds, /* rank-0 to rank-{n-1} */
                        OptionalAttr<UI64Attr>:$map_type,
                        OptionalAttr<VariableCaptureKindAttr>:$map_capture_type,
-                       OptionalAttr<StrAttr>:$name);
+                       OptionalAttr<StrAttr>:$name,
+                       DefaultValuedAttr<BoolAttr, "false">:$partial_map);
   let results = (outs OpenMP_PointerLikeType:$omp_ptr);
 
   let description = [{
@@ -1462,10 +1464,14 @@ def MapInfoOp : OpenMP_Op<"map.info", [AttrSizedOperandSegments]> {
     - `var_type`: The type of the variable to copy.
     - `var_ptr_ptr`: Used when the variable copied is a member of a class, structure
       or derived type and refers to the originating struct.
-    - `members`:  Used to indicate mapped child members for the current MapInfoOp,
+    - `members`: Used to indicate mapped child members for the current MapInfoOp,
        represented as other MapInfoOp's, utilised in cases where a parent structure
        type and members of the structure type are being mapped at the same time.
        For example: map(to: parent, parent->member, parent->member2[:10])
+    - `members_index`: Used to indicate the ordering of members within the containing
+       parent (generally a record type such as a structure, class or derived type),
+       e.g. struct {int x, float y, double z}, x would be 0, y would be 1, and z
+       would be 2. This aids the mapping.
     - `bounds`: Used when copying slices of array's, pointers or pointer members of
        objects (e.g. derived types or classes), indicates the bounds to be copied
        of the variable. When it's an array slice it is in rank order where rank 0
@@ -1476,6 +1482,8 @@ def MapInfoOp : OpenMP_Op<"map.info", [AttrSizedOperandSegments]> {
     - 'map_capture_type': Capture type for the variable e.g. this, byref, byvalue, byvla
        this can affect how the variable is lowered.
     - `name`: Holds the name of variable as specified in user clause (including bounds).
+    - `partial_map`: The record type being mapped will not be mapped in its entirety,
+       it may be used however, in a mapping to bind it's mapped components together.
   }];
 
   let assemblyFormat = [{
@@ -1484,7 +1492,7 @@ def MapInfoOp : OpenMP_Op<"map.info", [AttrSizedOperandSegments]> {
         `var_ptr_ptr` `(` $var_ptr_ptr `:` type($var_ptr_ptr) `)`
       | `map_clauses` `(` custom<MapClause>($map_type) `)`
       | `capture` `(` custom<CaptureType>($map_capture_type) `)`
-      | `members` `(` $members `:` type($members) `)`
+      | `members` `(` $members `:` custom<MembersIndex>($members_index) `:` type($members) `)`
       | `bounds` `(` $bounds `)`
     ) `->` type($omp_ptr) attr-dict
   }];
@@ -1787,7 +1795,10 @@ def TargetOp : OpenMP_Op<"target", [IsolatedFromAbove, MapClauseOwningOpInterfac
                        UnitAttr:$nowait,
                        Variadic<OpenMP_PointerLikeType>:$is_device_ptr,
                        Variadic<OpenMP_PointerLikeType>:$has_device_addr,
-                       Variadic<AnyType>:$map_operands);
+                       Variadic<AnyType>:$map_operands,
+                       Variadic<AnyType>:$private_vars,
+                       OptionalAttr<SymbolRefArrayAttr>:$privatizers);
+
   let regions = (region AnyRegion:$region);
 
   let builders = [
@@ -1802,6 +1813,7 @@ def TargetOp : OpenMP_Op<"target", [IsolatedFromAbove, MapClauseOwningOpInterfac
     | `is_device_ptr` `(` $is_device_ptr `:` type($is_device_ptr) `)`
     | `has_device_addr` `(` $has_device_addr `:` type($has_device_addr) `)`
     | `map_entries` `(` custom<MapEntries>($map_operands, type($map_operands)) `)`
+    | `private` `(` custom<PrivateList>($private_vars, type($private_vars), $privatizers) `)`
     | `depend` `(` custom<DependVarList>($depend_vars, type($depend_vars), $depends) `)`
     ) $region attr-dict
   }];
diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
index d3e3ac55677f..ed1f4ce8b7e5 100644
--- a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
+++ b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
@@ -79,7 +79,7 @@ def Polynomial_PolynomialAttr : Polynomial_Attr<"Polynomial", "polynomial"> {
     #poly = #polynomial.polynomial<x**1024 + 1>
     ```
   }];
-  let parameters = (ins "Polynomial":$polynomial);
+  let parameters = (ins "::mlir::polynomial::Polynomial":$polynomial);
   let hasCustomAssemblyFormat = 1;
 }
 
@@ -122,10 +122,19 @@ def Polynomial_RingAttr : Polynomial_Attr<"Ring", "ring"> {
 
   let parameters = (ins
     "Type": $coefficientType,
-    OptionalParameter<"IntegerAttr">: $coefficientModulus,
-    OptionalParameter<"PolynomialAttr">: $polynomialModulus
+    OptionalParameter<"::mlir::IntegerAttr">: $coefficientModulus,
+    OptionalParameter<"::mlir::polynomial::PolynomialAttr">: $polynomialModulus,
+    OptionalParameter<"::mlir::IntegerAttr">: $primitiveRoot
   );
 
+  let builders = [
+    AttrBuilder<
+        (ins "::mlir::Type":$coefficientTy,
+             "::mlir::IntegerAttr":$coefficientModulusAttr,
+             "::mlir::polynomial::PolynomialAttr":$polynomialModulusAttr), [{
+      return $_get($_ctxt, coefficientTy, coefficientModulusAttr, polynomialModulusAttr, nullptr);
+    }]>
+  ];
   let hasCustomAssemblyFormat = 1;
 }
 
@@ -416,4 +425,45 @@ def Polynomial_ConstantOp : Polynomial_Op<"constant", [Pure]> {
   let assemblyFormat = "$input attr-dict `:` type($output)";
 }
 
+def Polynomial_NTTOp : Polynomial_Op<"ntt", [Pure]> {
+  let summary = "Computes point-value tensor representation of a polynomial.";
+  let description = [{
+    `polynomial.ntt` computes the forward integer Number Theoretic Transform
+    (NTT) on the input polynomial. It returns a tensor containing a point-value
+    representation of the input polynomial. The output tensor has shape equal
+    to the degree of the ring's `polynomialModulus`. The polynomial's RingAttr
+    is embedded as the encoding attribute of the output tensor.
+
+    Given an input polynomial `F(x)` over a ring whose `polynomialModulus` has
+    degree `n`, and a primitive `n`-th root of unity `omega_n`, the output is
+    the list of $n$ evaluations
+
+      `f[k] = F(omega[n]^k) ; k = {0, ..., n-1}`
+
+    The choice of primitive root is determined by subsequent lowerings.
+  }];
+  let arguments = (ins Polynomial_PolynomialType:$input);
+  let results = (outs RankedTensorOf<[AnyInteger]>:$output);
+  let assemblyFormat = "$input attr-dict `:` qualified(type($input)) `->` type($output)";
+  let hasVerifier = 1;
+}
+
+def Polynomial_INTTOp : Polynomial_Op<"intt", [Pure]> {
+  let summary = "Computes the reverse integer Number Theoretic Transform (NTT).";
+  let description = [{
+    `polynomial.intt` computes the reverse integer Number Theoretic Transform
+    (INTT) on the input tensor. This is the inverse operation of the
+    `polynomial.ntt` operation.
+
+    The input tensor is interpreted as a point-value representation of the
+    output polynomial at powers of a primitive `n`-th root of unity (see
+    `polynomial.ntt`). The ring of the polynomial is taken from the required
+    encoding attribute of the tensor.
+  }];
+  let arguments = (ins RankedTensorOf<[AnyInteger]>:$input);
+  let results = (outs Polynomial_PolynomialType:$output);
+  let assemblyFormat = "$input attr-dict `:` qualified(type($input)) `->` type($output)";
+  let hasVerifier = 1;
+}
+
 #endif // POLYNOMIAL_OPS
diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
index b3d085bfff1a..0b063aa772ba 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -307,7 +307,8 @@ def ForallOp : SCF_Op<"forall", [
        RecursiveMemoryEffects,
        SingleBlockImplicitTerminator<"scf::InParallelOp">,
        DeclareOpInterfaceMethods<RegionBranchOpInterface>,
-       DestinationStyleOpInterface
+       DestinationStyleOpInterface,
+       HasParallelRegion
      ]> {
   let summary = "evaluate a block multiple times in parallel";
   let description = [{
@@ -608,6 +609,10 @@ def ForallOp : SCF_Op<"forall", [
 
     // Declare the shared_outs as inits/outs to DestinationStyleOpInterface.
     MutableOperandRange getDpsInitsMutable() { return getOutputsMutable(); }
+
+    /// Returns operations within scf.forall.in_parallel whose destination
+    /// operand is the block argument `bbArg`.
+    SmallVector<Operation*> getCombiningOps(BlockArgument bbArg);
   }];
 }
 
@@ -764,7 +769,8 @@ def ParallelOp : SCF_Op<"parallel",
           "getSingleLowerBound", "getSingleUpperBound", "getSingleStep"]>,
      RecursiveMemoryEffects,
      DeclareOpInterfaceMethods<RegionBranchOpInterface>,
-     SingleBlockImplicitTerminator<"scf::ReduceOp">]> {
+     SingleBlockImplicitTerminator<"scf::ReduceOp">,
+     HasParallelRegion]> {
   let summary = "parallel for operation";
   let description = [{
     The "scf.parallel" operation represents a loop nest taking 4 groups of SSA
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
index b182b4c72b95..3cf81d2e58f2 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
@@ -41,6 +41,19 @@ using Level = uint64_t;
 /// including the value `ShapedType::kDynamic` (for shapes).
 using Size = int64_t;
 
+/// A simple structure that encodes a range of levels in the sparse tensors
+/// that forms a COO segment.
+struct COOSegment {
+  std::pair<Level, Level> lvlRange; // [low, high)
+  bool isSoA;
+
+  bool isAoS() const { return !isSoA; }
+  bool isSegmentStart(Level l) const { return l == lvlRange.first; }
+  bool inSegment(Level l) const {
+    return l >= lvlRange.first && l < lvlRange.second;
+  }
+};
+
 } // namespace sparse_tensor
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
index eefa4c71bbd2..53dd8e39438c 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
@@ -502,10 +502,25 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding",
     //
     // Helper function to translate between level/dimension space.
     //
+
     SmallVector<int64_t> translateShape(::mlir::ArrayRef<int64_t> srcShape, ::mlir::sparse_tensor::CrdTransDirectionKind) const;
     ValueRange translateCrds(::mlir::OpBuilder &builder, ::mlir::Location loc, ::mlir::ValueRange crds, ::mlir::sparse_tensor::CrdTransDirectionKind) const;
 
     //
+    // COO methods.
+    //
+
+    /// Returns the starting level of this sparse tensor type for a
+    /// trailing COO region that spans **at least** two levels. If
+    /// no such COO region is found, then returns the level-rank.
+    ///
+    /// DEPRECATED: use getCOOSegment instead;
+    Level getAoSCOOStart() const;
+
+    /// Returns a list of COO segments in the sparse tensor types.
+    SmallVector<COOSegment> getCOOSegments() const;
+
+    //
     // Printing methods.
     //
 
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
index ea3d8013b456..a154d7fa5fb6 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
@@ -18,19 +18,6 @@
 namespace mlir {
 namespace sparse_tensor {
 
-/// A simple structure that encodes a range of levels in the sparse tensors that
-/// forms a COO segment.
-struct COOSegment {
-  std::pair<Level, Level> lvlRange; // [low, high)
-  bool isSoA;
-
-  bool isAoS() const { return !isSoA; }
-  bool isSegmentStart(Level l) const { return l == lvlRange.first; }
-  bool inSegment(Level l) const {
-    return l >= lvlRange.first && l < lvlRange.second;
-  }
-};
-
 //===----------------------------------------------------------------------===//
 /// A wrapper around `RankedTensorType`, which has three goals:
 ///
@@ -73,12 +60,6 @@ public:
       : SparseTensorType(
             RankedTensorType::get(stp.getShape(), stp.getElementType(), enc)) {}
 
-  // TODO: remove?
-  SparseTensorType(SparseTensorEncodingAttr enc)
-      : SparseTensorType(RankedTensorType::get(
-            SmallVector<Size>(enc.getDimRank(), ShapedType::kDynamic),
-            Float32Type::get(enc.getContext()), enc)) {}
-
   SparseTensorType &operator=(const SparseTensorType &) = delete;
   SparseTensorType(const SparseTensorType &) = default;
 
@@ -369,13 +350,15 @@ public:
   /// no such COO region is found, then returns the level-rank.
   ///
   /// DEPRECATED: use getCOOSegment instead;
-  Level getAoSCOOStart() const;
+  Level getAoSCOOStart() const { return getEncoding().getAoSCOOStart(); };
 
   /// Returns [un]ordered COO type for this sparse tensor type.
   RankedTensorType getCOOType(bool ordered) const;
 
   /// Returns a list of COO segments in the sparse tensor types.
-  SmallVector<COOSegment> getCOOSegments() const;
+  SmallVector<COOSegment> getCOOSegments() const {
+    return getEncoding().getCOOSegments();
+  }
 
 private:
   // These two must be const, to ensure coherence of the memoized fields.
diff --git a/mlir/include/mlir/Dialect/Transform/CMakeLists.txt b/mlir/include/mlir/Dialect/Transform/CMakeLists.txt
index 0cd71ec6919d..b6155b5f573f 100644
--- a/mlir/include/mlir/Dialect/Transform/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/Transform/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_subdirectory(DebugExtension)
 add_subdirectory(Interfaces)
 add_subdirectory(IR)
+add_subdirectory(IRDLExtension)
 add_subdirectory(LoopExtension)
 add_subdirectory(PDLExtension)
 add_subdirectory(Transforms)
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
index fbac1ffb621f..77048a28d751 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
@@ -512,7 +512,10 @@ def CollectMatchingOp : TransformDialectOp<"collect_matching", [
 def ForeachMatchOp : TransformDialectOp<"foreach_match", [
     DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
     DeclareOpInterfaceMethods<SymbolUserOpInterface>,
-    DeclareOpInterfaceMethods<TransformOpInterface>]> {
+    DeclareOpInterfaceMethods<TransformOpInterface,
+                              ["allowsRepeatedHandleOperands"]>,
+    DeclareOpInterfaceMethods<OpAsmOpInterface,
+                              ["getAsmResultNames"]>]> {
   let summary = "Applies named sequences when a named matcher succeeds";
   let description = [{
     Given a pair of co-indexed lists of transform dialect symbols (such as
@@ -528,25 +531,31 @@ def ForeachMatchOp : TransformDialectOp<"foreach_match", [
     the following matchers are not applied to the same payload operation. If the
     action succeeds, the next payload operation in walk order is matched. If it
     fails, both silenceable and definite errors are propagated as the result of
-    this op.
-
-    The matcher symbol must take one operand of a type that implements the same
-    transform dialect interface as the `root` operand (a check is performed at
-    application time to see if the associated payload satisfies the constraints
-    of the actual type). It must not consume the operand as multiple matchers
+    this op; propagation of silenceable errors is postponed until the end of the
+    walk.
+
+    The matcher symbol must take at least one operand of a type that implements
+    the same transform dialect interface as the `root` operand (a check is
+    performed at application time to see if the associated payload satisfies the
+    constraints of the actual type), and may take additional operands with a
+    similar type requirement. It must not consume operands as multiple matchers
     may be applied. The matcher may produce any number of results. The action
     symbol paired with the matcher must take the same number of arguments as the
     matcher has results, and these arguments must implement the same transform
     dialect interfaces, but not necessarily have the exact same type (again, a
     check is performed at application time to see if the associated payload
-    satisfies the constraints of actual types on both sides). The action symbol
-    may not have results. The actions are expected to only modify payload
-    operations nested in the `root` payload operations associated with the
-    operand of this transform operation. Furhermore, the actions may not modify
-    operations outside of the currently matched payload operation, e.g., they
-    may not modify sibling or parent operations. If such behavior is desired,
-    the parent must be matched first and the nested operations obtained by
-    traversing the IR from the parent. This is due to the matching being
+    satisfies the constraints of actual types on both sides).
+
+    The action symbol may have results that are accumulated from all actions and
+    returned from the `foreach_match` operation on success. Unless the
+    `flatten_results` attribute is present, each action result must be
+    associated with exactly one payload entity. The actions are expected to only
+    modify payload operations nested in the `root` payload operations associated
+    with the operand of this transform operation. Furthermore, the actions may
+    not modify operations outside of the currently matched payload operation,
+    e.g., they may not modify sibling or parent operations. If such behavior is
+    desired, the parent must be matched first and the nested operations obtained
+    by traversing the IR from the parent. This is due to the matching being
     performed as a post-order IR walk.
 
     This operation consumes the operand and produces a new handle associated
@@ -573,19 +582,26 @@ def ForeachMatchOp : TransformDialectOp<"foreach_match", [
     produced a definite failure.
   }];
 
-  let arguments = (ins TransformHandleTypeInterface:$root,
-                       UnitAttr:$restrict_root,
-                       SymbolRefArrayAttr:$matchers,
-                       SymbolRefArrayAttr:$actions);
-  let results = (outs TransformHandleTypeInterface:$updated);
+  let arguments =
+      (ins TransformHandleTypeInterface:$root,
+           Variadic<Transform_AnyHandleOrParamType>:$forwarded_inputs,
+           UnitAttr:$restrict_root,
+           UnitAttr:$flatten_results,
+           SymbolRefArrayAttr:$matchers,
+           SymbolRefArrayAttr:$actions);
+  let results =
+      (outs TransformHandleTypeInterface:$updated,
+            Variadic<Transform_AnyHandleOrParamType>:$forwarded_outputs);
 
   let assemblyFormat = [{
-    (`restrict_root` $restrict_root^)?
+    oilist( `restrict_root` $restrict_root
+          | `flatten_results` $flatten_results
+          )
     `in`
-    $root
+    $root (`,` $forwarded_inputs^)?
     custom<ForeachMatchSymbols>($matchers, $actions)
     attr-dict
-    `:` functional-type($root, $updated)
+    `:` functional-type(operands, results)
   }];
 
   let hasVerifier = 1;
diff --git a/mlir/include/mlir/Dialect/Transform/IRDLExtension/CMakeLists.txt b/mlir/include/mlir/Dialect/Transform/IRDLExtension/CMakeLists.txt
new file mode 100644
index 000000000000..dfcd906b43af
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Transform/IRDLExtension/CMakeLists.txt
@@ -0,0 +1,6 @@
+set(LLVM_TARGET_DEFINITIONS IRDLExtensionOps.td)
+mlir_tablegen(IRDLExtensionOps.h.inc -gen-op-decls)
+mlir_tablegen(IRDLExtensionOps.cpp.inc -gen-op-defs)
+add_public_tablegen_target(MLIRTransformDialectIRDLExtensionOpsIncGen)
+
+add_mlir_doc(IRDLExtensionOps IRDLExtensionOps Dialects/ -gen-op-doc)
diff --git a/mlir/include/mlir/Dialect/Transform/IRDLExtension/IRDLExtension.h b/mlir/include/mlir/Dialect/Transform/IRDLExtension/IRDLExtension.h
new file mode 100644
index 000000000000..19684e1ed444
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Transform/IRDLExtension/IRDLExtension.h
@@ -0,0 +1,21 @@
+//===- IRDLExtension.h - IRDL extension for Transform dialect ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_TRANSFORM_IRDLEXTENSION_IRDLEXTENSION_H
+#define MLIR_DIALECT_TRANSFORM_IRDLEXTENSION_IRDLEXTENSION_H
+
+namespace mlir {
+class DialectRegistry;
+
+namespace transform {
+/// Registers the IRDL extension of the Transform dialect in the given registry.
+void registerIRDLExtension(DialectRegistry &dialectRegistry);
+} // namespace transform
+} // namespace mlir
+
+#endif // MLIR_DIALECT_TRANSFORM_IRDLEXTENSION_IRDLEXTENSION_H
diff --git a/mlir/include/mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.h b/mlir/include/mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.h
new file mode 100644
index 000000000000..7e1d5cad1fbd
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.h
@@ -0,0 +1,20 @@
+//===- IRDLExtensionOps.h - IRDL Transform dialect extension ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_TRANSFORM_IRDLEXTENSION_IRDLEXTENSIONOPS_H
+#define MLIR_DIALECT_TRANSFORM_IRDLEXTENSION_IRDLEXTENSIONOPS_H
+
+#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
+#include "mlir/IR/OpDefinition.h"
+
+#define GET_OP_CLASSES
+#include "mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.h.inc"
+
+#endif // MLIR_DIALECT_TRANSFORM_IRDLEXTENSION_IRDLEXTENSIONOPS_H
diff --git a/mlir/include/mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.td b/mlir/include/mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.td
new file mode 100644
index 000000000000..6ca624aeda12
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.td
@@ -0,0 +1,36 @@
+//===- IRDLExtensionOps.td - Transform dialect extension ---*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_TRANSFORM_IRDLEXTENSION_IRDLEXTENSIONOPS
+#define MLIR_DIALECT_TRANSFORM_IRDLEXTENSION_IRDLEXTENSIONOPS
+
+include "mlir/Dialect/Transform/IR/TransformDialect.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/IR/SymbolInterfaces.td"
+
+def IRDLCollectMatchingOp : TransformDialectOp<"irdl.collect_matching",
+    [DeclareOpInterfaceMethods<TransformOpInterface>,
+     DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+     SymbolTable,
+     NoTerminator]> {
+  let summary = 
+    "Finds ops that match the IRDL definition without registering them.";
+
+  let arguments = (ins TransformHandleTypeInterface:$root);
+  let regions = (region SizedRegion<1>:$body);
+  let results = (outs TransformHandleTypeInterface:$matched);
+
+  let assemblyFormat =
+    "`in` $root `:` functional-type(operands, results) attr-dict-with-keyword "
+    "regions";
+
+  let hasVerifier = 1;
+}
+
+#endif // MLIR_DIALECT_TRANSFORM_IRDLEXTENSION_IRDLEXTENSIONOPS
diff --git a/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h b/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h
index 59cc2f22c938..21795753ac5f 100644
--- a/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h
+++ b/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h
@@ -52,6 +52,17 @@ void getPotentialTopLevelEffects(
 /// Verification hook for TransformOpInterface.
 LogicalResult verifyTransformOpInterface(Operation *op);
 
+/// Appends the entities associated with the given transform values in `state`
+/// to the pre-existing list of mappings. The array of mappings must have as
+/// many elements as values. If `flatten` is set, multiple values may be
+/// associated with each transform value, and this always succeeds. Otherwise,
+/// checks that each value has exactly one mapping associated and return failure
+/// otherwise.
+LogicalResult appendValueMappings(
+    MutableArrayRef<SmallVector<transform::MappedValue>> mappings,
+    ValueRange values, const transform::TransformState &state,
+    bool flatten = true);
+
 /// Populates `mappings` with mapped values associated with the given transform
 /// IR values in the given `state`.
 void prepareValueMappings(
@@ -317,6 +328,8 @@ public:
   }
   LogicalResult mapBlockArgument(BlockArgument argument,
                                  ArrayRef<MappedValue> values);
+  LogicalResult mapBlockArguments(Block::BlockArgListType arguments,
+                                  ArrayRef<SmallVector<MappedValue>> mapping);
 
   // Forward declarations to support limited visibility.
   class RegionScope;
diff --git a/mlir/include/mlir/Dialect/Transform/PDLExtension/PDLExtension.h b/mlir/include/mlir/Dialect/Transform/PDLExtension/PDLExtension.h
index 08915213cd22..bf5a105bc9f2 100644
--- a/mlir/include/mlir/Dialect/Transform/PDLExtension/PDLExtension.h
+++ b/mlir/include/mlir/Dialect/Transform/PDLExtension/PDLExtension.h
@@ -6,6 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef MLIR_DIALECT_TRANSFORM_PDLEXTENSION_PDLEXTENSION_H
+#define MLIR_DIALECT_TRANSFORM_PDLEXTENSION_PDLEXTENSION_H
+
 namespace mlir {
 class DialectRegistry;
 
@@ -14,3 +17,5 @@ namespace transform {
 void registerPDLExtension(DialectRegistry &dialectRegistry);
 } // namespace transform
 } // namespace mlir
+
+#endif // MLIR_DIALECT_TRANSFORM_PDLEXTENSION_PDLEXTENSION_H
diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h
deleted file mode 100644
index 3a4b391fd7f4..000000000000
--- a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h
+++ /dev/null
@@ -1,216 +0,0 @@
-//===- TransformInterpreterPassBase.h ---------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Base class with shared implementation for transform dialect interpreter
-// passes.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_DIALECT_TRANSFORM_TRANSFORMS_TRANSFORMINTERPRETERPASSBASE_H
-#define MLIR_DIALECT_TRANSFORM_TRANSFORMS_TRANSFORMINTERPRETERPASSBASE_H
-
-#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Support/LLVM.h"
-#include <memory>
-
-namespace mlir {
-struct LogicalResult;
-class MLIRContext;
-class ModuleOp;
-class Operation;
-template <typename>
-class OwningOpRef;
-class Region;
-
-namespace transform {
-namespace detail {
-/// Template-free implementation of TransformInterpreterPassBase::initialize.
-LogicalResult interpreterBaseInitializeImpl(
-    MLIRContext *context, StringRef transformFileName,
-    ArrayRef<std::string> transformLibraryPaths,
-    std::shared_ptr<OwningOpRef<ModuleOp>> &module,
-    std::shared_ptr<OwningOpRef<ModuleOp>> &libraryModule,
-    function_ref<std::optional<LogicalResult>(OpBuilder &, Location)>
-        moduleBuilder = nullptr);
-
-/// Template-free implementation of
-/// TransformInterpreterPassBase::runOnOperation.
-LogicalResult interpreterBaseRunOnOperationImpl(
-    Operation *target, StringRef passName,
-    const std::shared_ptr<OwningOpRef<ModuleOp>> &sharedTransformModule,
-    const std::shared_ptr<OwningOpRef<ModuleOp>> &libraryModule,
-    const RaggedArray<MappedValue> &extraMappings,
-    const TransformOptions &options,
-    const Pass::Option<std::string> &transformFileName,
-    const Pass::ListOption<std::string> &transformLibraryPaths,
-    const Pass::Option<std::string> &debugPayloadRootTag,
-    const Pass::Option<std::string> &debugTransformRootTag,
-    StringRef binaryName);
-} // namespace detail
-
-/// Base class for transform dialect interpreter passes that can consume and
-/// dump transform dialect scripts in separate files. The pass is controlled by
-/// three string options:
-///
-///   - transformFileName: if non-empty, the name of the file containing the
-///     transform script. If empty, `debugTransformRootTag` is considered or the
-///     pass root operation must contain a single top-level transform op that
-///     will be interpreted.
-///   - transformLibraryPaths: if non-empty, the modules in these files will be
-///     merged into the main transform script run by the interpreter before
-///     execution. This allows to provide definitions for external functions
-///     used in the main script. Other public symbols in the library modules may
-///     lead to collisions with public symbols in the main script and among each
-///     other.
-///   - debugPayloadRootTag: if non-empty, the value of the attribute named
-///     `kTransformDialectTagAttrName` indicating the single op that is
-///     considered the payload root of the transform interpreter; otherwise, the
-///     root operation of the pass is used.
-///   - debugTransformRootTag: if non-empty, the value of the attribute named
-///     `kTransformDialectTagAttrName` indicating the single top-level transform
-///     op contained in the payload root to be used as the entry point by the
-///     transform interpreter; mutually exclusive with `transformFileName`.
-///
-/// The pass runs the transform dialect interpreter as directed by the options.
-/// It also provides the mechanism to dump reproducers into stderr
-/// (-debug-only=transform-dialect-dump-repro) or into a temporary file
-/// (-debug-only=transform-dialect-save-repro) that can be used with this
-/// pass in a standalone mode.
-///
-/// Concrete passes must derive from this class instead of their generated base
-/// class (or PassWrapper), and supply themselves and the generated base class
-/// as template arguments. They are *not* expected to to implement `initialize`
-/// or `runOnOperation`. They *are* expected to call the copy constructor of
-/// this class in their copy constructors, short of which the file-based
-/// transform dialect script injection facility will become non-operational.
-///
-/// Concrete passes may implement the `runBeforeInterpreter` and
-/// `runAfterInterpreter` to customize the behavior of the pass.
-template <typename Concrete, template <typename> typename GeneratedBase>
-class TransformInterpreterPassBase : public GeneratedBase<Concrete> {
-public:
-  explicit TransformInterpreterPassBase(
-      const TransformOptions &options = TransformOptions())
-      : options(options) {}
-
-  TransformInterpreterPassBase(const TransformInterpreterPassBase &pass) {
-    sharedTransformModule = pass.sharedTransformModule;
-    transformLibraryModule = pass.transformLibraryModule;
-    options = pass.options;
-  }
-
-  static StringLiteral getBinaryName() { return "mlir-opt"; }
-
-  LogicalResult initialize(MLIRContext *context) override {
-
-#define REQUIRE_PASS_OPTION(NAME)                                              \
-  static_assert(                                                               \
-      std::is_same_v<                                                          \
-          std::remove_reference_t<decltype(std::declval<Concrete &>().NAME)>,  \
-          Pass::Option<std::string>>,                                          \
-      "required " #NAME " string pass option is missing")
-
-    REQUIRE_PASS_OPTION(transformFileName);
-    REQUIRE_PASS_OPTION(debugPayloadRootTag);
-    REQUIRE_PASS_OPTION(debugTransformRootTag);
-
-#undef REQUIRE_PASS_OPTION
-
-#define REQUIRE_PASS_LIST_OPTION(NAME)                                         \
-  static_assert(                                                               \
-      std::is_same_v<                                                          \
-          std::remove_reference_t<decltype(std::declval<Concrete &>().NAME)>,  \
-          Pass::ListOption<std::string>>,                                      \
-      "required " #NAME " string pass option is missing")
-
-    REQUIRE_PASS_LIST_OPTION(transformLibraryPaths);
-
-#undef REQUIRE_PASS_LIST_OPTION
-
-    StringRef transformFileName =
-        static_cast<Concrete *>(this)->transformFileName;
-    ArrayRef<std::string> transformLibraryPaths =
-        static_cast<Concrete *>(this)->transformLibraryPaths;
-    return detail::interpreterBaseInitializeImpl(
-        context, transformFileName, transformLibraryPaths,
-        sharedTransformModule, transformLibraryModule,
-        [this](OpBuilder &builder, Location loc) {
-          return static_cast<Concrete *>(this)->constructTransformModule(
-              builder, loc);
-        });
-  }
-
-  /// Hook for passes to run additional logic in the pass before the
-  /// interpreter. If failure is returned, the pass fails and the interpreter is
-  /// not run.
-  LogicalResult runBeforeInterpreter(Operation *) { return success(); }
-
-  /// Hook for passes to run additional logic in the pass after the interpreter.
-  /// Only runs if everything succeeded before. If failure is returned, the pass
-  /// fails.
-  LogicalResult runAfterInterpreter(Operation *) { return success(); }
-
-  /// Hook for passes to run custom logic to construct the transform module.
-  /// This will run during initialization. If the external script is provided,
-  /// it overrides the construction, which will not be called.
-  std::optional<LogicalResult> constructTransformModule(OpBuilder &builder,
-                                                        Location loc) {
-    return std::nullopt;
-  }
-
-  void runOnOperation() override {
-    auto *pass = static_cast<Concrete *>(this);
-    Operation *op = pass->getOperation();
-    StringRef binaryName = Concrete::getBinaryName();
-    if (failed(pass->runBeforeInterpreter(op)) ||
-        failed(detail::interpreterBaseRunOnOperationImpl(
-            op, pass->getArgument(), sharedTransformModule,
-            transformLibraryModule,
-            /*extraMappings=*/{}, options, pass->transformFileName,
-            pass->transformLibraryPaths, pass->debugPayloadRootTag,
-            pass->debugTransformRootTag, binaryName)) ||
-        failed(pass->runAfterInterpreter(op))) {
-      return pass->signalPassFailure();
-    }
-  }
-
-protected:
-  /// Transform interpreter options.
-  TransformOptions options;
-
-  /// Returns a read-only reference to shared transform module.
-  const std::shared_ptr<OwningOpRef<ModuleOp>> &
-  getSharedTransformModule() const {
-    return sharedTransformModule;
-  }
-
-  /// Returns a read-only reference to the transform library module.
-  const std::shared_ptr<OwningOpRef<ModuleOp>> &
-  getTransformLibraryModule() const {
-    return transformLibraryModule;
-  }
-
-private:
-  /// The separate transform module to be used for transformations, shared
-  /// across multiple instances of the pass if it is applied in parallel to
-  /// avoid potentially expensive cloning. MUST NOT be modified after the pass
-  /// has been initialized.
-  std::shared_ptr<OwningOpRef<ModuleOp>> sharedTransformModule = nullptr;
-
-  /// The transform module containing symbol definitions that become available
-  /// in the transform scripts. Similar to dynamic linking for binaries. This is
-  /// shared across multiple instances of the pass and therefore MUST NOT be
-  /// modified after the pass has been initialized.
-  std::shared_ptr<OwningOpRef<ModuleOp>> transformLibraryModule = nullptr;
-};
-
-} // namespace transform
-} // namespace mlir
-
-#endif // MLIR_DIALECT_TRANSFORM_TRANSFORMS_TRANSFORMINTERPRETERPASSBASE_H
diff --git a/mlir/include/mlir/Dialect/Transform/Utils/Utils.h b/mlir/include/mlir/Dialect/Transform/Utils/Utils.h
index 868054e5e2ae..be31f5beea8c 100644
--- a/mlir/include/mlir/Dialect/Transform/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Transform/Utils/Utils.h
@@ -37,6 +37,12 @@ void printPackedOrDynamicIndexList(OpAsmPrinter &printer, Operation *op,
                                    Value packed, Type packedType,
                                    OperandRange values, TypeRange valueTypes,
                                    DenseI64ArrayAttr integers);
+inline void printPackedOrDynamicIndexList(OpAsmPrinter &printer, Operation *op,
+                                          Value packed, OperandRange values,
+                                          DenseI64ArrayAttr integers) {
+  printPackedOrDynamicIndexList(printer, op, packed, Type(), values,
+                                TypeRange{}, integers);
+}
 
 /// Parser hook for custom directive in assemblyFormat.
 ///
@@ -47,7 +53,15 @@ void printPackedOrDynamicIndexList(OpAsmPrinter &printer, Operation *op,
 ParseResult parsePackedOrDynamicIndexList(
     OpAsmParser &parser, std::optional<OpAsmParser::UnresolvedOperand> &packed,
     Type &packedType, SmallVectorImpl<OpAsmParser::UnresolvedOperand> &values,
-    SmallVectorImpl<Type> &valueTypes, DenseI64ArrayAttr &integers);
+    SmallVectorImpl<Type> *valueTypes, DenseI64ArrayAttr &integers);
+inline ParseResult parsePackedOrDynamicIndexList(
+    OpAsmParser &parser, std::optional<OpAsmParser::UnresolvedOperand> &packed,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &values,
+    DenseI64ArrayAttr &integers) {
+  Type packedType;
+  return parsePackedOrDynamicIndexList(parser, packed, packedType, values,
+                                       nullptr, integers);
+}
 } // namespace transform
 } // namespace mlir
 
diff --git a/mlir/include/mlir/IR/BuiltinTypes.h b/mlir/include/mlir/IR/BuiltinTypes.h
index 2361cf137123..5579b138668d 100644
--- a/mlir/include/mlir/IR/BuiltinTypes.h
+++ b/mlir/include/mlir/IR/BuiltinTypes.h
@@ -360,9 +360,15 @@ private:
 /// which dimensions must be kept when e.g. compute MemRef strides under
 /// rank-reducing operations. Return std::nullopt if reducedShape cannot be
 /// obtained by dropping only `1` entries in `originalShape`.
+/// If `matchDynamic` is true, then dynamic dims in `originalShape` and
+/// `reducedShape` will be considered matching with non-dynamic dims, unless
+/// the non-dynamic dim is from `originalShape` and equal to 1. For example,
+/// in ([1, 3, ?], [?, 5]), the mask would be {1, 0, 0}, since 3 and 5 will
+/// match with the corresponding dynamic dims.
 std::optional<llvm::SmallDenseSet<unsigned>>
 computeRankReductionMask(ArrayRef<int64_t> originalShape,
-                         ArrayRef<int64_t> reducedShape);
+                         ArrayRef<int64_t> reducedShape,
+                         bool matchDynamic = false);
 
 /// Enum that captures information related to verifier error conditions on
 /// slice insert/extract type of ops.
diff --git a/mlir/include/mlir/IR/OpImplementation.h b/mlir/include/mlir/IR/OpImplementation.h
index 5333d7446df5..fa435cb3155e 100644
--- a/mlir/include/mlir/IR/OpImplementation.h
+++ b/mlir/include/mlir/IR/OpImplementation.h
@@ -700,6 +700,10 @@ public:
   /// Parse a floating point value from the stream.
   virtual ParseResult parseFloat(double &result) = 0;
 
+  /// Parse a floating point value into APFloat from the stream.
+  virtual ParseResult parseFloat(const llvm::fltSemantics &semantics,
+                                 APFloat &result) = 0;
+
   /// Parse an integer value from the stream.
   template <typename IntT>
   ParseResult parseInteger(IntT &result) {
diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h
index e661bb87a27e..f8ab5338107f 100644
--- a/mlir/include/mlir/IR/OperationSupport.h
+++ b/mlir/include/mlir/IR/OperationSupport.h
@@ -1219,6 +1219,9 @@ public:
   /// Return if the printer should print users of values.
   bool shouldPrintValueUsers() const;
 
+  /// Return if printer should use unique SSA IDs.
+  bool shouldPrintUniqueSSAIDs() const;
+
 private:
   /// Elide large elements attributes if the number of elements is larger than
   /// the upper limit.
@@ -1249,6 +1252,9 @@ private:
 
   /// Print users of values.
   bool printValueUsersFlag : 1;
+
+  /// Print unique SSA IDs for values, block arguments and naming conflicts
+  bool printUniqueSSAIDsFlag : 1;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/InitAllExtensions.h b/mlir/include/mlir/InitAllExtensions.h
index 7708ca5571de..20a4ab6f18a2 100644
--- a/mlir/include/mlir/InitAllExtensions.h
+++ b/mlir/include/mlir/InitAllExtensions.h
@@ -35,6 +35,7 @@
 #include "mlir/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.h"
 #include "mlir/Dialect/Tensor/TransformOps/TensorTransformOps.h"
 #include "mlir/Dialect/Transform/DebugExtension/DebugExtension.h"
+#include "mlir/Dialect/Transform/IRDLExtension/IRDLExtension.h"
 #include "mlir/Dialect/Transform/LoopExtension/LoopExtension.h"
 #include "mlir/Dialect/Transform/PDLExtension/PDLExtension.h"
 #include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h"
@@ -77,6 +78,7 @@ inline void registerAllExtensions(DialectRegistry &registry) {
   sparse_tensor::registerTransformDialectExtension(registry);
   tensor::registerTransformDialectExtension(registry);
   transform::registerDebugExtension(registry);
+  transform::registerIRDLExtension(registry);
   transform::registerLoopExtension(registry);
   transform::registerPDLExtension(registry);
   vector::registerTransformDialectExtension(registry);
diff --git a/mlir/include/mlir/Interfaces/LoopLikeInterface.h b/mlir/include/mlir/Interfaces/LoopLikeInterface.h
index 7c7d378d0590..42609e824c86 100644
--- a/mlir/include/mlir/Interfaces/LoopLikeInterface.h
+++ b/mlir/include/mlir/Interfaces/LoopLikeInterface.h
@@ -29,8 +29,31 @@ namespace detail {
 /// Verify invariants of the LoopLikeOpInterface.
 LogicalResult verifyLoopLikeOpInterface(Operation *op);
 } // namespace detail
+
+//===----------------------------------------------------------------------===//
+// Traits
+//===----------------------------------------------------------------------===//
+
+namespace OpTrait {
+// A trait indicating that the single region contained in the operation has
+// parallel execution semantics. This may have implications in a certain pass.
+// For example, buffer hoisting is illegal in parallel loops, and local buffers
+// may be accessed by parallel threads simultaneously.
+template <typename ConcreteType>
+class HasParallelRegion : public TraitBase<ConcreteType, HasParallelRegion> {
+public:
+  static LogicalResult verifyTrait(Operation *op) {
+    return impl::verifyOneRegion(op);
+  }
+};
+
+} // namespace OpTrait
 } // namespace mlir
 
+//===----------------------------------------------------------------------===//
+// Interfaces
+//===----------------------------------------------------------------------===//
+
 /// Include the generated interface declarations.
 #include "mlir/Interfaces/LoopLikeInterface.h.inc"
 
diff --git a/mlir/include/mlir/Interfaces/LoopLikeInterface.td b/mlir/include/mlir/Interfaces/LoopLikeInterface.td
index e2ac85a3f772..f0dc6e60eba5 100644
--- a/mlir/include/mlir/Interfaces/LoopLikeInterface.td
+++ b/mlir/include/mlir/Interfaces/LoopLikeInterface.td
@@ -15,6 +15,10 @@
 
 include "mlir/IR/OpBase.td"
 
+//===----------------------------------------------------------------------===//
+// Interfaces
+//===----------------------------------------------------------------------===//
+
 def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
   let description = [{
     Contains helper functions to query properties and perform transformations
@@ -371,4 +375,11 @@ def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// Traits
+//===----------------------------------------------------------------------===//
+
+// Op contains a region with parallel execution semantics
+def HasParallelRegion : NativeOpTrait<"HasParallelRegion">;
+
 #endif // MLIR_INTERFACES_LOOPLIKEINTERFACE
diff --git a/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td b/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td
index 764fa6d547b2..e2409cbec5fd 100644
--- a/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td
+++ b/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td
@@ -40,28 +40,28 @@ def PromotableAllocationOpInterface
         Provides the default Value of this memory slot. The provided Value
         will be used as the reaching definition of loads done before any store.
         This Value must outlive the promotion and dominate all the uses of this
-        slot's pointer. The provided rewriter can be used to create the default
+        slot's pointer. The provided builder can be used to create the default
         value on the fly.
 
-        The rewriter is located at the beginning of the block where the slot
-        pointer is defined. All IR mutations must happen through the rewriter.
+        The builder is located at the beginning of the block where the slot
+        pointer is defined.
       }], "::mlir::Value", "getDefaultValue",
       (ins
         "const ::mlir::MemorySlot &":$slot,
-        "::mlir::RewriterBase &":$rewriter)
+        "::mlir::OpBuilder &":$builder)
     >,
     InterfaceMethod<[{
         Hook triggered for every new block argument added to a block.
         This will only be called for slots declared by this operation.
 
-        The rewriter is located at the beginning of the block on call. All IR
-        mutations must happen through the rewriter.
+        The builder is located at the beginning of the block on call. All IR
+        mutations must happen through the builder.
       }],
       "void", "handleBlockArgument",
       (ins
         "const ::mlir::MemorySlot &":$slot,
         "::mlir::BlockArgument":$argument,
-        "::mlir::RewriterBase &":$rewriter
+        "::mlir::OpBuilder &":$builder
       )
     >,
     InterfaceMethod<[{
@@ -69,13 +69,15 @@ def PromotableAllocationOpInterface
         also clean up the created default value if necessary.
         This will only be called for slots declared by this operation.
 
-        All IR mutations must happen through the rewriter.
+        Must return a new promotable allocation op if this operation produced
+        multiple promotable slots, nullopt otherwise.
       }],
-      "void", "handlePromotionComplete",
+      "::std::optional<::mlir::PromotableAllocationOpInterface>",
+        "handlePromotionComplete",
       (ins
         "const ::mlir::MemorySlot &":$slot, 
         "::mlir::Value":$defaultValue,
-        "::mlir::RewriterBase &":$rewriter)
+        "::mlir::OpBuilder &":$builder)
     >,
   ];
 }
@@ -119,15 +121,14 @@ def PromotableMemOpInterface : OpInterface<"PromotableMemOpInterface"> {
         The returned value must dominate all operations dominated by the storing
         operation.
 
-        If IR must be mutated to extract a concrete value being stored, mutation
-        must happen through the provided rewriter. The rewriter is located
-        immediately after the memory operation on call. No IR deletion is
-        allowed in this method. IR mutations must not introduce new uses of the
-        memory slot. Existing control flow must not be modified.
+        The builder is located immediately after the memory operation on call.
+        No IR deletion is allowed in this method. IR mutations must not
+        introduce new uses of the memory slot. Existing control flow must not
+        be modified.
       }],
       "::mlir::Value", "getStored",
       (ins "const ::mlir::MemorySlot &":$slot,
-           "::mlir::RewriterBase &":$rewriter,
+           "::mlir::OpBuilder &":$builder,
            "::mlir::Value":$reachingDef,
            "const ::mlir::DataLayout &":$dataLayout)
     >,
@@ -166,14 +167,13 @@ def PromotableMemOpInterface : OpInterface<"PromotableMemOpInterface"> {
         have been done at the point of calling this method, but it will be done
         eventually.
 
-        The rewriter is located after the promotable operation on call. All IR
-        mutations must happen through the rewriter.
+        The builder is located after the promotable operation on call.
       }],
       "::mlir::DeletionKind",
       "removeBlockingUses",
       (ins "const ::mlir::MemorySlot &":$slot,
            "const ::llvm::SmallPtrSetImpl<mlir::OpOperand *> &":$blockingUses,
-           "::mlir::RewriterBase &":$rewriter,
+           "::mlir::OpBuilder &":$builder,
            "::mlir::Value":$reachingDefinition,
            "const ::mlir::DataLayout &":$dataLayout)
     >,
@@ -224,13 +224,12 @@ def PromotableOpInterface : OpInterface<"PromotableOpInterface"> {
         have been done at the point of calling this method, but it will be done
         eventually.
 
-        The rewriter is located after the promotable operation on call. All IR
-        mutations must happen through the rewriter.
+        The builder is located after the promotable operation on call.
       }],
       "::mlir::DeletionKind",
       "removeBlockingUses",
       (ins "const ::llvm::SmallPtrSetImpl<mlir::OpOperand *> &":$blockingUses,
-           "::mlir::RewriterBase &":$rewriter)
+           "::mlir::OpBuilder &":$builder)
     >,
     InterfaceMethod<[{
         This method allows the promoted operation to visit the SSA values used
@@ -254,13 +253,12 @@ def PromotableOpInterface : OpInterface<"PromotableOpInterface"> {
         scheduled for removal and if `requiresReplacedValues` returned
         true.
 
-        The rewriter is located after the promotable operation on call. All IR
-        mutations must happen through the rewriter. During the transformation,
-        *no operation should be deleted*.
+        The builder is located after the promotable operation on call. During
+        the transformation, *no operation should be deleted*.
       }],
       "void", "visitReplacedValues",
       (ins "::llvm::ArrayRef<std::pair<::mlir::Operation*, ::mlir::Value>>":$mutatedDefs,
-           "::mlir::RewriterBase &":$rewriter), [{}], [{ return; }]
+           "::mlir::OpBuilder &":$builder), [{}], [{ return; }]
     >,
   ];
 }
@@ -293,25 +291,23 @@ def DestructurableAllocationOpInterface
         at the end of this call. Only generates subslots for the indices found in
         `usedIndices` since all other subslots are unused.
 
-        The rewriter is located at the beginning of the block where the slot
-        pointer is defined. All IR mutations must happen through the rewriter.
+        The builder is located at the beginning of the block where the slot
+        pointer is defined.
       }],
       "::llvm::DenseMap<::mlir::Attribute, ::mlir::MemorySlot>",
       "destructure",
       (ins "const ::mlir::DestructurableMemorySlot &":$slot,
            "const ::llvm::SmallPtrSetImpl<::mlir::Attribute> &":$usedIndices,
-           "::mlir::RewriterBase &":$rewriter)
+           "::mlir::OpBuilder &":$builder)
     >,
     InterfaceMethod<[{
         Hook triggered once the destructuring of a slot is complete, meaning the
         original slot is no longer being refered to and could be deleted.
         This will only be called for slots declared by this operation.
-
-        All IR mutations must happen through the rewriter.
       }],
       "void", "handleDestructuringComplete",
       (ins "const ::mlir::DestructurableMemorySlot &":$slot,
-           "::mlir::RewriterBase &":$rewriter)
+           "::mlir::OpBuilder &":$builder)
     >,
   ];
 }
@@ -376,15 +372,14 @@ def DestructurableAccessorOpInterface
         Rewires the use of a slot to the generated subslots, without deleting
         any operation. Returns whether the accessor should be deleted.
 
-        All IR mutations must happen through the rewriter. Deletion of
-        operations is not allowed, only the accessor can be scheduled for
-        deletion by returning the appropriate value.
+        Deletion of operations is not allowed, only the accessor can be
+        scheduled for deletion by returning the appropriate value.
       }],
       "::mlir::DeletionKind",
       "rewire",
       (ins "const ::mlir::DestructurableMemorySlot &":$slot,
            "::llvm::DenseMap<::mlir::Attribute, ::mlir::MemorySlot> &":$subslots,
-           "::mlir::RewriterBase &":$rewriter,
+           "::mlir::OpBuilder &":$builder,
            "const ::mlir::DataLayout &":$dataLayout)
     >
   ];
diff --git a/mlir/include/mlir/Interfaces/ViewLikeInterface.h b/mlir/include/mlir/Interfaces/ViewLikeInterface.h
index 931309b0c596..d6479143a0a5 100644
--- a/mlir/include/mlir/Interfaces/ViewLikeInterface.h
+++ b/mlir/include/mlir/Interfaces/ViewLikeInterface.h
@@ -106,9 +106,16 @@ public:
 /// empty then assume that all indices are non-scalable.
 void printDynamicIndexList(
     OpAsmPrinter &printer, Operation *op, OperandRange values,
-    ArrayRef<int64_t> integers, TypeRange valueTypes = TypeRange(),
-    ArrayRef<bool> scalables = {},
+    ArrayRef<int64_t> integers, ArrayRef<bool> scalables,
+    TypeRange valueTypes = TypeRange(),
     AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square);
+inline void printDynamicIndexList(
+    OpAsmPrinter &printer, Operation *op, OperandRange values,
+    ArrayRef<int64_t> integers, TypeRange valueTypes = TypeRange(),
+    AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square) {
+  return printDynamicIndexList(printer, op, values, integers, {}, valueTypes,
+                               delimiter);
+}
 
 /// Parser hook for custom directive in assemblyFormat.
 ///
diff --git a/mlir/include/mlir/Tools/lsp-server-support/Protocol.h b/mlir/include/mlir/Tools/lsp-server-support/Protocol.h
index 839d82bb02b8..1d22b8a66774 100644
--- a/mlir/include/mlir/Tools/lsp-server-support/Protocol.h
+++ b/mlir/include/mlir/Tools/lsp-server-support/Protocol.h
@@ -677,6 +677,16 @@ enum class DiagnosticSeverity {
   Hint = 4
 };
 
+enum class DiagnosticTag {
+  Unnecessary = 1,
+  Deprecated = 2,
+};
+
+/// Add support for JSON serialization.
+llvm::json::Value toJSON(DiagnosticTag tag);
+bool fromJSON(const llvm::json::Value &value, DiagnosticTag &result,
+              llvm::json::Path path);
+
 struct Diagnostic {
   /// The source range where the message applies.
   Range range;
@@ -696,6 +706,9 @@ struct Diagnostic {
   /// a scope collide all definitions can be marked via this property.
   std::optional<std::vector<DiagnosticRelatedInformation>> relatedInformation;
 
+  /// Additional metadata about the diagnostic.
+  std::vector<DiagnosticTag> tags;
+
   /// The diagnostic's category. Can be omitted.
   /// An LSP extension that's used to send the name of the category over to the
   /// client. The category typically describes the compilation stage during
diff --git a/mlir/include/mlir/Tools/lsp-server-support/Transport.h b/mlir/include/mlir/Tools/lsp-server-support/Transport.h
index 047d174234df..83fff92e1b4b 100644
--- a/mlir/include/mlir/Tools/lsp-server-support/Transport.h
+++ b/mlir/include/mlir/Tools/lsp-server-support/Transport.h
@@ -109,9 +109,10 @@ using OutgoingRequest =
 
 /// An `OutgoingRequestCallback` is invoked when an outgoing request to the
 /// client receives a response in turn. It is passed the original request's ID,
-/// as well as the result JSON.
+/// as well as the response result.
+template <typename T>
 using OutgoingRequestCallback =
-    std::function<void(llvm::json::Value, llvm::Expected<llvm::json::Value>)>;
+    std::function<void(llvm::json::Value, llvm::Expected<T>)>;
 
 /// A handler used to process the incoming transport messages.
 class MessageHandler {
@@ -185,21 +186,37 @@ public:
 
   /// Create an OutgoingRequest function that, when called, sends a request with
   /// the given method via the transport. Should the outgoing request be
-  /// met with a response, the response callback is invoked to handle that
-  /// response.
-  template <typename T>
-  OutgoingRequest<T> outgoingRequest(llvm::StringLiteral method,
-                                     OutgoingRequestCallback callback) {
-    return [&, method, callback](const T &params, llvm::json::Value id) {
+  /// met with a response, the result JSON is parsed and the response callback
+  /// is invoked.
+  template <typename Param, typename Result>
+  OutgoingRequest<Param>
+  outgoingRequest(llvm::StringLiteral method,
+                  OutgoingRequestCallback<Result> callback) {
+    return [&, method, callback](const Param &param, llvm::json::Value id) {
+      auto callbackWrapper = [method, callback = std::move(callback)](
+                                 llvm::json::Value id,
+                                 llvm::Expected<llvm::json::Value> value) {
+        if (!value)
+          return callback(std::move(id), value.takeError());
+
+        std::string responseName = llvm::formatv("reply:{0}({1})", method, id);
+        llvm::Expected<Result> result =
+            parse<Result>(*value, responseName, "response");
+        if (!result)
+          return callback(std::move(id), result.takeError());
+
+        return callback(std::move(id), *result);
+      };
+
       {
         std::lock_guard<std::mutex> lock(responseHandlersMutex);
         responseHandlers.insert(
-            {debugString(id), std::make_pair(method.str(), callback)});
+            {debugString(id), std::make_pair(method.str(), callbackWrapper)});
       }
 
       std::lock_guard<std::mutex> transportLock(transportOutputMutex);
       Logger::info("--> {0}({1})", method, id);
-      transport.call(method, llvm::json::Value(params), id);
+      transport.call(method, llvm::json::Value(param), id);
     };
   }
 
@@ -213,7 +230,8 @@ private:
 
   /// A pair of (1) the original request's method name, and (2) the callback
   /// function to be invoked for responses.
-  using ResponseHandlerTy = std::pair<std::string, OutgoingRequestCallback>;
+  using ResponseHandlerTy =
+      std::pair<std::string, OutgoingRequestCallback<llvm::json::Value>>;
   /// A mapping from request/response ID to response handler.
   llvm::StringMap<ResponseHandlerTy> responseHandlers;
   /// Mutex to guard insertion into the response handler map.
diff --git a/mlir/include/mlir/Transforms/Mem2Reg.h b/mlir/include/mlir/Transforms/Mem2Reg.h
index ed10644e26a5..6986cad9ae12 100644
--- a/mlir/include/mlir/Transforms/Mem2Reg.h
+++ b/mlir/include/mlir/Transforms/Mem2Reg.h
@@ -9,7 +9,6 @@
 #ifndef MLIR_TRANSFORMS_MEM2REG_H
 #define MLIR_TRANSFORMS_MEM2REG_H
 
-#include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/MemorySlotInterfaces.h"
 #include "llvm/ADT/Statistic.h"
 
@@ -23,11 +22,13 @@ struct Mem2RegStatistics {
   llvm::Statistic *newBlockArgumentAmount = nullptr;
 };
 
-/// Attempts to promote the memory slots of the provided allocators. Succeeds if
-/// at least one memory slot was promoted.
+/// Attempts to promote the memory slots of the provided allocators. Iteratively
+/// retries the promotion of all slots as promoting one slot might enable
+/// subsequent promotions. Succeeds if at least one memory slot was promoted.
 LogicalResult
 tryToPromoteMemorySlots(ArrayRef<PromotableAllocationOpInterface> allocators,
-                        RewriterBase &rewriter, const DataLayout &dataLayout,
+                        OpBuilder &builder, const DataLayout &dataLayout,
+                        DominanceInfo &dominance,
                         Mem2RegStatistics statistics = {});
 
 } // namespace mlir
diff --git a/mlir/include/mlir/Transforms/SROA.h b/mlir/include/mlir/Transforms/SROA.h
index d09a7989edea..fa84fb1eae73 100644
--- a/mlir/include/mlir/Transforms/SROA.h
+++ b/mlir/include/mlir/Transforms/SROA.h
@@ -31,7 +31,7 @@ struct SROAStatistics {
 /// failure if no slot was destructured.
 LogicalResult tryToDestructureMemorySlots(
     ArrayRef<DestructurableAllocationOpInterface> allocators,
-    RewriterBase &rewriter, const DataLayout &dataLayout,
+    OpBuilder &builder, const DataLayout &dataLayout,
     SROAStatistics statistics = {});
 
 } // namespace mlir
diff --git a/mlir/lib/AsmParser/AsmParserImpl.h b/mlir/lib/AsmParser/AsmParserImpl.h
index 30c0079cda08..8f22be80865b 100644
--- a/mlir/lib/AsmParser/AsmParserImpl.h
+++ b/mlir/lib/AsmParser/AsmParserImpl.h
@@ -269,8 +269,12 @@ public:
     return success();
   }
 
-  /// Parse a floating point value from the stream.
-  ParseResult parseFloat(double &result) override {
+  /// Parse a floating point value with given semantics from the stream. Since
+  /// this implementation parses the string as double precision and only
+  /// afterwards converts the value to the requested semantic, precision may be
+  /// lost.
+  ParseResult parseFloat(const llvm::fltSemantics &semantics,
+                         APFloat &result) override {
     bool isNegative = parser.consumeIf(Token::minus);
     Token curTok = parser.getToken();
     SMLoc loc = curTok.getLoc();
@@ -281,7 +285,9 @@ public:
       if (!val)
         return emitError(loc, "floating point value too large");
       parser.consumeToken(Token::floatliteral);
-      result = isNegative ? -*val : *val;
+      result = APFloat(isNegative ? -*val : *val);
+      bool losesInfo;
+      result.convert(semantics, APFloat::rmNearestTiesToEven, &losesInfo);
       return success();
     }
 
@@ -289,18 +295,28 @@ public:
     if (curTok.is(Token::integer)) {
       std::optional<APFloat> apResult;
       if (failed(parser.parseFloatFromIntegerLiteral(
-              apResult, curTok, isNegative, APFloat::IEEEdouble(),
-              /*typeSizeInBits=*/64)))
+              apResult, curTok, isNegative, semantics,
+              APFloat::semanticsSizeInBits(semantics))))
         return failure();
 
+      result = *apResult;
       parser.consumeToken(Token::integer);
-      result = apResult->convertToDouble();
       return success();
     }
 
     return emitError(loc, "expected floating point literal");
   }
 
+  /// Parse a floating point value from the stream.
+  ParseResult parseFloat(double &result) override {
+    llvm::APFloat apResult(0.0);
+    if (parseFloat(APFloat::IEEEdouble(), apResult))
+      return failure();
+
+    result = apResult.convertToDouble();
+    return success();
+  }
+
   /// Parse an optional integer value from the stream.
   OptionalParseResult parseOptionalInteger(APInt &result) override {
     return parser.parseOptionalInteger(result);
diff --git a/mlir/lib/AsmParser/Parser.cpp b/mlir/lib/AsmParser/Parser.cpp
index 00f2b0c0c2f1..1b8b4bac1821 100644
--- a/mlir/lib/AsmParser/Parser.cpp
+++ b/mlir/lib/AsmParser/Parser.cpp
@@ -326,19 +326,15 @@ ParseResult Parser::parseFloatFromIntegerLiteral(
                           "leading minus");
   }
 
-  std::optional<uint64_t> value = tok.getUInt64IntegerValue();
-  if (!value)
+  APInt intValue;
+  tok.getSpelling().getAsInteger(isHex ? 0 : 10, intValue);
+  if (intValue.getActiveBits() > typeSizeInBits)
     return emitError(loc, "hexadecimal float constant out of range for type");
 
-  if (&semantics == &APFloat::IEEEdouble()) {
-    result = APFloat(semantics, APInt(typeSizeInBits, *value));
-    return success();
-  }
+  APInt truncatedValue(typeSizeInBits, intValue.getNumWords(),
+                       intValue.getRawData());
 
-  APInt apInt(typeSizeInBits, *value);
-  if (apInt != *value)
-    return emitError(loc, "hexadecimal float constant out of range for type");
-  result = APFloat(semantics, apInt);
+  result.emplace(semantics, truncatedValue);
 
   return success();
 }
diff --git a/mlir/lib/Bindings/Python/DialectNVGPU.cpp b/mlir/lib/Bindings/Python/DialectNVGPU.cpp
index 341e4d55bcf2..754e0a75b0ab 100644
--- a/mlir/lib/Bindings/Python/DialectNVGPU.cpp
+++ b/mlir/lib/Bindings/Python/DialectNVGPU.cpp
@@ -1,4 +1,4 @@
-//===--- DialectNvgpu.cpp - Pybind module for Nvgpu dialect API support ---===//
+//===--- DialectNVGPU.cpp - Pybind module for NVGPU dialect API support ---===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -17,7 +17,7 @@ using namespace mlir;
 using namespace mlir::python;
 using namespace mlir::python::adaptors;
 
-static void populateDialectNvgpuSubmodule(const pybind11::module &m) {
+static void populateDialectNVGPUSubmodule(const pybind11::module &m) {
   auto nvgpuTensorMapDescriptorType = mlir_type_subclass(
       m, "TensorMapDescriptorType", mlirTypeIsANVGPUTensorMapDescriptorType);
 
@@ -34,8 +34,8 @@ static void populateDialectNvgpuSubmodule(const pybind11::module &m) {
       py::arg("ctx") = py::none());
 }
 
-PYBIND11_MODULE(_mlirDialectsNvgpu, m) {
+PYBIND11_MODULE(_mlirDialectsNVGPU, m) {
   m.doc() = "MLIR NVGPU dialect.";
 
-  populateDialectNvgpuSubmodule(m);
+  populateDialectNVGPUSubmodule(m);
 }
diff --git a/mlir/lib/CAPI/Dialect/CMakeLists.txt b/mlir/lib/CAPI/Dialect/CMakeLists.txt
index 58b8739043f9..4e141b60ff8c 100644
--- a/mlir/lib/CAPI/Dialect/CMakeLists.txt
+++ b/mlir/lib/CAPI/Dialect/CMakeLists.txt
@@ -72,6 +72,15 @@ add_mlir_upstream_c_api_library(MLIRCAPIGPU
   MLIRPass
 )
 
+add_mlir_upstream_c_api_library(MLIRCAPIIRDL
+  IRDL.cpp
+
+  PARTIAL_SOURCES_INTENDED
+  LINK_LIBS PUBLIC
+  MLIRCAPIIR
+  MLIRIRDL
+)
+
 add_mlir_upstream_c_api_library(MLIRCAPILLVM
   LLVM.cpp
 
diff --git a/mlir/lib/CAPI/Dialect/IRDL.cpp b/mlir/lib/CAPI/Dialect/IRDL.cpp
new file mode 100644
index 000000000000..cb9dc8ceb679
--- /dev/null
+++ b/mlir/lib/CAPI/Dialect/IRDL.cpp
@@ -0,0 +1,18 @@
+//===- IRDL.cpp - C Interface for IRDL dialect ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir-c/Dialect/IRDL.h"
+#include "mlir/CAPI/Registration.h"
+#include "mlir/Dialect/IRDL/IR/IRDL.h"
+#include "mlir/Dialect/IRDL/IRDLLoading.h"
+
+MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(IRDL, irdl, mlir::irdl::IRDLDialect)
+
+MlirLogicalResult mlirLoadIRDLDialects(MlirModule module) {
+  return wrap(mlir::irdl::loadDialects(unwrap(module)));
+}
diff --git a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
index 9b2544276ce4..1447b182ccfd 100644
--- a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
+++ b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
@@ -201,6 +201,96 @@ public:
   }
 };
 
+// Floating-point to integer conversions.
+template <typename CastOp>
+class FtoICastOpConversion : public OpConversionPattern<CastOp> {
+public:
+  FtoICastOpConversion(const TypeConverter &typeConverter, MLIRContext *context)
+      : OpConversionPattern<CastOp>(typeConverter, context) {}
+
+  LogicalResult
+  matchAndRewrite(CastOp castOp, typename CastOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+
+    Type operandType = adaptor.getIn().getType();
+    if (!emitc::isSupportedFloatType(operandType))
+      return rewriter.notifyMatchFailure(castOp,
+                                         "unsupported cast source type");
+
+    Type dstType = this->getTypeConverter()->convertType(castOp.getType());
+    if (!dstType)
+      return rewriter.notifyMatchFailure(castOp, "type conversion failed");
+
+    // Float-to-i1 casts are not supported: any value with 0 < value < 1 must be
+    // truncated to 0, whereas a boolean conversion would return true.
+    if (!emitc::isSupportedIntegerType(dstType) || dstType.isInteger(1))
+      return rewriter.notifyMatchFailure(castOp,
+                                         "unsupported cast destination type");
+
+    // Convert to unsigned if it's the "ui" variant
+    // Signless is interpreted as signed, so no need to cast for "si"
+    Type actualResultType = dstType;
+    if (isa<arith::FPToUIOp>(castOp)) {
+      actualResultType =
+          rewriter.getIntegerType(operandType.getIntOrFloatBitWidth(),
+                                  /*isSigned=*/false);
+    }
+
+    Value result = rewriter.create<emitc::CastOp>(
+        castOp.getLoc(), actualResultType, adaptor.getOperands());
+
+    if (isa<arith::FPToUIOp>(castOp)) {
+      result = rewriter.create<emitc::CastOp>(castOp.getLoc(), dstType, result);
+    }
+    rewriter.replaceOp(castOp, result);
+
+    return success();
+  }
+};
+
+// Integer to floating-point conversions.
+template <typename CastOp>
+class ItoFCastOpConversion : public OpConversionPattern<CastOp> {
+public:
+  ItoFCastOpConversion(const TypeConverter &typeConverter, MLIRContext *context)
+      : OpConversionPattern<CastOp>(typeConverter, context) {}
+
+  LogicalResult
+  matchAndRewrite(CastOp castOp, typename CastOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Vectors in particular are not supported
+    Type operandType = adaptor.getIn().getType();
+    if (!emitc::isSupportedIntegerType(operandType))
+      return rewriter.notifyMatchFailure(castOp,
+                                         "unsupported cast source type");
+
+    Type dstType = this->getTypeConverter()->convertType(castOp.getType());
+    if (!dstType)
+      return rewriter.notifyMatchFailure(castOp, "type conversion failed");
+
+    if (!emitc::isSupportedFloatType(dstType))
+      return rewriter.notifyMatchFailure(castOp,
+                                         "unsupported cast destination type");
+
+    // Convert to unsigned if it's the "ui" variant
+    // Signless is interpreted as signed, so no need to cast for "si"
+    Type actualOperandType = operandType;
+    if (isa<arith::UIToFPOp>(castOp)) {
+      actualOperandType =
+          rewriter.getIntegerType(operandType.getIntOrFloatBitWidth(),
+                                  /*isSigned=*/false);
+    }
+    Value fpCastOperand = adaptor.getIn();
+    if (actualOperandType != operandType) {
+      fpCastOperand = rewriter.template create<emitc::CastOp>(
+          castOp.getLoc(), actualOperandType, fpCastOperand);
+    }
+    rewriter.replaceOpWithNewOp<emitc::CastOp>(castOp, dstType, fpCastOperand);
+
+    return success();
+  }
+};
+
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -222,7 +312,11 @@ void mlir::populateArithToEmitCPatterns(TypeConverter &typeConverter,
     IntegerOpConversion<arith::MulIOp, emitc::MulOp>,
     IntegerOpConversion<arith::SubIOp, emitc::SubOp>,
     CmpIOpConversion,
-    SelectOpConversion
+    SelectOpConversion,
+    ItoFCastOpConversion<arith::SIToFPOp>,
+    ItoFCastOpConversion<arith::UIToFPOp>,
+    FtoICastOpConversion<arith::FPToSIOp>,
+    FtoICastOpConversion<arith::FPToUIOp>
   >(typeConverter, ctx);
   // clang-format on
 }
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 3a4fc7d8063f..82bfa9514a88 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -926,7 +926,7 @@ LogicalResult ConvertAsyncYieldToGpuRuntimeCallPattern::matchAndRewrite(
 static bool isDefinedByCallTo(Value value, StringRef functionName) {
   assert(isa<LLVM::LLVMPointerType>(value.getType()));
   if (auto defOp = value.getDefiningOp<LLVM::CallOp>())
-    return defOp.getCallee()->equals(functionName);
+    return *defOp.getCallee() == functionName;
   return false;
 }
 
diff --git a/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp b/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp
index 775dd1e60903..b7fd454c6090 100644
--- a/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp
@@ -42,11 +42,11 @@ static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands,
 static constexpr StringRef kInvalidCaseStr = "Unsupported WMMA variant.";
 
 static NVVM::MMAFrag convertOperand(StringRef operandName) {
-  if (operandName.equals("AOp"))
+  if (operandName == "AOp")
     return NVVM::MMAFrag::a;
-  if (operandName.equals("BOp"))
+  if (operandName == "BOp")
     return NVVM::MMAFrag::b;
-  if (operandName.equals("COp"))
+  if (operandName == "COp")
     return NVVM::MMAFrag::c;
   llvm_unreachable("Unknown operand name");
 }
@@ -55,8 +55,8 @@ static NVVM::MMATypes getElementType(gpu::MMAMatrixType type) {
   if (type.getElementType().isF16())
     return NVVM::MMATypes::f16;
   if (type.getElementType().isF32())
-    return type.getOperand().equals("COp") ? NVVM::MMATypes::f32
-                                           : NVVM::MMATypes::tf32;
+    return type.getOperand() == "COp" ? NVVM::MMATypes::f32
+                                      : NVVM::MMATypes::tf32;
 
   if (type.getElementType().isSignedInteger(8))
     return NVVM::MMATypes::s8;
@@ -99,15 +99,15 @@ struct WmmaLoadOpToNVVMLowering
     NVVM::MMATypes eltype = getElementType(retType);
     // NVVM intrinsics require to give mxnxk dimensions, infer the missing
     // dimension based on the valid intrinsics available.
-    if (retType.getOperand().equals("AOp")) {
+    if (retType.getOperand() == "AOp") {
       m = retTypeShape[0];
       k = retTypeShape[1];
       n = NVVM::WMMALoadOp::inferNDimension(m, k, eltype);
-    } else if (retType.getOperand().equals("BOp")) {
+    } else if (retType.getOperand() == "BOp") {
       k = retTypeShape[0];
       n = retTypeShape[1];
       m = NVVM::WMMALoadOp::inferMDimension(k, n, eltype);
-    } else if (retType.getOperand().equals("COp")) {
+    } else if (retType.getOperand() == "COp") {
       m = retTypeShape[0];
       n = retTypeShape[1];
       k = NVVM::WMMALoadOp::inferKDimension(m, n, eltype);
diff --git a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp
index cd6da3558246..89f956a5e701 100644
--- a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp
+++ b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp
@@ -55,9 +55,8 @@ TensorType inferReshapeExpandedType(TensorType inputType,
   // Check if the input is static, and if so, get its total size
   bool inputIsStatic = inputType.hasStaticShape();
   int64_t totalSize = inputIsStatic ? inputType.getNumElements() : -1;
- 
+
   // Compute result shape
-  bool resultIsStatic = true;
   auto resultShape = llvm::map_to_vector(newShape, [&](int64_t size) -> int64_t {
     // If this is not a placeholder, do not change it
     if (size >= 0)
@@ -65,10 +64,8 @@ TensorType inferReshapeExpandedType(TensorType inputType,
 
     // If we do not know the total size of the tensor, keep this dimension
     // dynamic in the result shape.
-    if (!inputIsStatic) {
-      resultIsStatic = false;
+    if (!inputIsStatic)
       return ShapedType::kDynamic;
-    }
 
     // Calculate the product of all elements in 'newShape' except for the -1
     // placeholder, which we discard by negating the result.
@@ -84,12 +81,14 @@ TensorType inferReshapeExpandedType(TensorType inputType,
     return totalSize / totalSizeNoPlaceholder;
   });
 
+  bool resultIsStatic = !ShapedType::isDynamicShape(resultShape);
+
   // A syntactic restriction in 'tensor.expand_shape' forbids a dynamically
   // shaped input from being reshaped into a statically shaped result. We may
   // simply turn the first result dimension dynamic to address this.
   if (!inputIsStatic && resultIsStatic)
     resultShape[0] = ShapedType::kDynamic;
-  
+
   // The 'tensor.expand_shape' op also forbids a statically shaped input from
   // being reshaped into a dynamically shaped result, but the placeholder
   // inference algorithm above guarantees that this will never be the case.
diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
index 3f92372d7cea..782cc92f83fe 100644
--- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
+++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
@@ -441,7 +441,7 @@ struct PrepareContractToGPUMMA
   }
 };
 
-// Fold transpose op into the transfer read op. Nvgpu mma.sync op only supports
+// Fold transpose op into the transfer read op. NVGPU mma.sync op only supports
 // row-, column-, and row-major layout for matrixA, matrixB, and matrixC,
 // respectively. We can fold the transpose operation when loading the data from
 // Shared Memory to registers.
diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
index f8485e02a220..19f02297bfbb 100644
--- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
+++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
@@ -261,7 +261,7 @@ static void maybeApplyPassLabel(OpBuilder &b, OpTy newXferOp,
 template <typename OpTy>
 static bool isTensorOp(OpTy xferOp) {
   if (isa<RankedTensorType>(xferOp.getShapedType())) {
-    if (xferOp.getOperationName().equals(TransferWriteOp::getOperationName())) {
+    if (xferOp.getOperationName() == TransferWriteOp::getOperationName()) {
       // TransferWriteOps on tensors have a result.
       assert(xferOp->getNumResults() > 0);
     }
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index c9c0a7b4cc68..2e31487bd55a 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -3585,20 +3585,18 @@ ParseResult AffinePrefetchOp::parse(OpAsmParser &parser,
       parser.resolveOperands(mapOperands, indexTy, result.operands))
     return failure();
 
-  if (!readOrWrite.equals("read") && !readOrWrite.equals("write"))
+  if (readOrWrite != "read" && readOrWrite != "write")
     return parser.emitError(parser.getNameLoc(),
                             "rw specifier has to be 'read' or 'write'");
-  result.addAttribute(
-      AffinePrefetchOp::getIsWriteAttrStrName(),
-      parser.getBuilder().getBoolAttr(readOrWrite.equals("write")));
+  result.addAttribute(AffinePrefetchOp::getIsWriteAttrStrName(),
+                      parser.getBuilder().getBoolAttr(readOrWrite == "write"));
 
-  if (!cacheType.equals("data") && !cacheType.equals("instr"))
+  if (cacheType != "data" && cacheType != "instr")
     return parser.emitError(parser.getNameLoc(),
                             "cache type has to be 'data' or 'instr'");
 
-  result.addAttribute(
-      AffinePrefetchOp::getIsDataCacheAttrStrName(),
-      parser.getBuilder().getBoolAttr(cacheType.equals("data")));
+  result.addAttribute(AffinePrefetchOp::getIsDataCacheAttrStrName(),
+                      parser.getBuilder().getBoolAttr(cacheType == "data"));
 
   return success();
 }
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
index ed94fb690af2..707bba2f1e6f 100644
--- a/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
@@ -13,6 +13,7 @@
 
 #include "mlir/Dialect/Affine/Passes.h"
 
+#include "mlir/Analysis/AliasAnalysis.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Dominance.h"
@@ -47,5 +48,6 @@ mlir::affine::createAffineScalarReplacementPass() {
 
 void AffineScalarReplacement::runOnOperation() {
   affineScalarReplace(getOperation(), getAnalysis<DominanceInfo>(),
-                      getAnalysis<PostDominanceInfo>());
+                      getAnalysis<PostDominanceInfo>(),
+                      getAnalysis<AliasAnalysis>());
 }
diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
index 8b8ed2578ca5..f46381403bc5 100644
--- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
@@ -678,12 +678,9 @@ static bool mayHaveEffect(Operation *srcMemOp, Operation *destMemOp,
 }
 
 template <typename EffectType, typename T>
-bool mlir::affine::hasNoInterveningEffect(Operation *start, T memOp) {
-  auto isLocallyAllocated = [](Value memref) {
-    auto *defOp = memref.getDefiningOp();
-    return defOp && hasSingleEffect<MemoryEffects::Allocate>(defOp, memref);
-  };
-
+bool mlir::affine::hasNoInterveningEffect(
+    Operation *start, T memOp,
+    llvm::function_ref<bool(Value, Value)> mayAlias) {
   // A boolean representing whether an intervening operation could have impacted
   // memOp.
   bool hasSideEffect = false;
@@ -704,11 +701,8 @@ bool mlir::affine::hasNoInterveningEffect(Operation *start, T memOp) {
         // If op causes EffectType on a potentially aliasing location for
         // memOp, mark as having the effect.
         if (isa<EffectType>(effect.getEffect())) {
-          // TODO: This should be replaced with a check for no aliasing.
-          // Aliasing information should be passed to this method.
           if (effect.getValue() && effect.getValue() != memref &&
-              isLocallyAllocated(memref) &&
-              isLocallyAllocated(effect.getValue()))
+              !mayAlias(effect.getValue(), memref))
             continue;
           opMayHaveEffect = true;
           break;
@@ -832,10 +826,10 @@ bool mlir::affine::hasNoInterveningEffect(Operation *start, T memOp) {
 /// other operations will overwrite the memory loaded between the given load
 /// and store.  If such a value exists, the replaced `loadOp` will be added to
 /// `loadOpsToErase` and its memref will be added to `memrefsToErase`.
-static void forwardStoreToLoad(AffineReadOpInterface loadOp,
-                               SmallVectorImpl<Operation *> &loadOpsToErase,
-                               SmallPtrSetImpl<Value> &memrefsToErase,
-                               DominanceInfo &domInfo) {
+static void forwardStoreToLoad(
+    AffineReadOpInterface loadOp, SmallVectorImpl<Operation *> &loadOpsToErase,
+    SmallPtrSetImpl<Value> &memrefsToErase, DominanceInfo &domInfo,
+    llvm::function_ref<bool(Value, Value)> mayAlias) {
 
   // The store op candidate for forwarding that satisfies all conditions
   // to replace the load, if any.
@@ -872,7 +866,8 @@ static void forwardStoreToLoad(AffineReadOpInterface loadOp,
 
     // 4. Ensure there is no intermediate operation which could replace the
     // value in memory.
-    if (!affine::hasNoInterveningEffect<MemoryEffects::Write>(storeOp, loadOp))
+    if (!affine::hasNoInterveningEffect<MemoryEffects::Write>(storeOp, loadOp,
+                                                              mayAlias))
       continue;
 
     // We now have a candidate for forwarding.
@@ -901,7 +896,8 @@ static void forwardStoreToLoad(AffineReadOpInterface loadOp,
 template bool
 mlir::affine::hasNoInterveningEffect<mlir::MemoryEffects::Read,
                                      affine::AffineReadOpInterface>(
-    mlir::Operation *, affine::AffineReadOpInterface);
+    mlir::Operation *, affine::AffineReadOpInterface,
+    llvm::function_ref<bool(Value, Value)>);
 
 // This attempts to find stores which have no impact on the final result.
 // A writing op writeA will be eliminated if there exists an op writeB if
@@ -910,7 +906,8 @@ mlir::affine::hasNoInterveningEffect<mlir::MemoryEffects::Read,
 // 3) There is no potential read between writeA and writeB.
 static void findUnusedStore(AffineWriteOpInterface writeA,
                             SmallVectorImpl<Operation *> &opsToErase,
-                            PostDominanceInfo &postDominanceInfo) {
+                            PostDominanceInfo &postDominanceInfo,
+                            llvm::function_ref<bool(Value, Value)> mayAlias) {
 
   for (Operation *user : writeA.getMemRef().getUsers()) {
     // Only consider writing operations.
@@ -939,7 +936,8 @@ static void findUnusedStore(AffineWriteOpInterface writeA,
 
     // There cannot be an operation which reads from memory between
     // the two writes.
-    if (!affine::hasNoInterveningEffect<MemoryEffects::Read>(writeA, writeB))
+    if (!affine::hasNoInterveningEffect<MemoryEffects::Read>(writeA, writeB,
+                                                             mayAlias))
       continue;
 
     opsToErase.push_back(writeA);
@@ -955,7 +953,8 @@ static void findUnusedStore(AffineWriteOpInterface writeA,
 // 3) There is no write between loadA and loadB.
 static void loadCSE(AffineReadOpInterface loadA,
                     SmallVectorImpl<Operation *> &loadOpsToErase,
-                    DominanceInfo &domInfo) {
+                    DominanceInfo &domInfo,
+                    llvm::function_ref<bool(Value, Value)> mayAlias) {
   SmallVector<AffineReadOpInterface, 4> loadCandidates;
   for (auto *user : loadA.getMemRef().getUsers()) {
     auto loadB = dyn_cast<AffineReadOpInterface>(user);
@@ -976,7 +975,7 @@ static void loadCSE(AffineReadOpInterface loadA,
 
     // 3. There should not be a write between loadA and loadB.
     if (!affine::hasNoInterveningEffect<MemoryEffects::Write>(
-            loadB.getOperation(), loadA))
+            loadB.getOperation(), loadA, mayAlias))
       continue;
 
     // Check if two values have the same shape. This is needed for affine vector
@@ -1034,16 +1033,21 @@ static void loadCSE(AffineReadOpInterface loadA,
 // than dealloc) remain.
 //
 void mlir::affine::affineScalarReplace(func::FuncOp f, DominanceInfo &domInfo,
-                                       PostDominanceInfo &postDomInfo) {
+                                       PostDominanceInfo &postDomInfo,
+                                       AliasAnalysis &aliasAnalysis) {
   // Load op's whose results were replaced by those forwarded from stores.
   SmallVector<Operation *, 8> opsToErase;
 
   // A list of memref's that are potentially dead / could be eliminated.
   SmallPtrSet<Value, 4> memrefsToErase;
 
+  auto mayAlias = [&](Value val1, Value val2) -> bool {
+    return !aliasAnalysis.alias(val1, val2).isNo();
+  };
+
   // Walk all load's and perform store to load forwarding.
   f.walk([&](AffineReadOpInterface loadOp) {
-    forwardStoreToLoad(loadOp, opsToErase, memrefsToErase, domInfo);
+    forwardStoreToLoad(loadOp, opsToErase, memrefsToErase, domInfo, mayAlias);
   });
   for (auto *op : opsToErase)
     op->erase();
@@ -1051,7 +1055,7 @@ void mlir::affine::affineScalarReplace(func::FuncOp f, DominanceInfo &domInfo,
 
   // Walk all store's and perform unused store elimination
   f.walk([&](AffineWriteOpInterface storeOp) {
-    findUnusedStore(storeOp, opsToErase, postDomInfo);
+    findUnusedStore(storeOp, opsToErase, postDomInfo, mayAlias);
   });
   for (auto *op : opsToErase)
     op->erase();
@@ -1084,7 +1088,7 @@ void mlir::affine::affineScalarReplace(func::FuncOp f, DominanceInfo &domInfo,
   // stores. Otherwise, some stores are wrongly seen as having an intervening
   // effect.
   f.walk([&](AffineReadOpInterface loadOp) {
-    loadCSE(loadOp, opsToErase, domInfo);
+    loadCSE(loadOp, opsToErase, domInfo, mayAlias);
   });
   for (auto *op : opsToErase)
     op->erase();
diff --git a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
index 02d05780a7ac..6d7ac2be951d 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
+++ b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
@@ -24,10 +24,10 @@ def SubIntAttrs : NativeCodeCall<"subIntegerAttrs($_builder, $0, $1, $2)">;
 // Multiply two integer attributes and create a new one with the result.
 def MulIntAttrs : NativeCodeCall<"mulIntegerAttrs($_builder, $0, $1, $2)">;
 
-// TODO: Canonicalizations currently doesn't take into account integer overflow
-// flags and always reset them to default (wraparound) which is safe but can
-// inhibit later optimizations. Individual patterns must be reviewed for
-// better handling of overflow flags.
+// Merge overflow flags from 2 ops, selecting the most conservative combination.
+def MergeOverflow : NativeCodeCall<"mergeOverflowFlags($0, $1)">;
+
+// Default overflow flag (all wraparounds allowed).
 defvar DefOverflow = ConstantEnumCase<Arith_IntegerOverflowAttr, "none">;
 
 class cast<string type> : NativeCodeCall<"::mlir::cast<" # type # ">($0)">;
@@ -45,7 +45,7 @@ def AddIAddConstant :
           (Arith_AddIOp $x, (ConstantLikeMatcher APIntAttr:$c0), $ovf1),
           (ConstantLikeMatcher APIntAttr:$c1), $ovf2),
         (Arith_AddIOp $x, (Arith_ConstantOp (AddIntAttrs $res, $c0, $c1)),
-            DefOverflow)>;
+            (MergeOverflow $ovf1, $ovf2))>;
 
 // addi(subi(x, c0), c1) -> addi(x, c1 - c0)
 def AddISubConstantRHS :
@@ -53,7 +53,7 @@ def AddISubConstantRHS :
           (Arith_SubIOp $x, (ConstantLikeMatcher APIntAttr:$c0), $ovf1),
           (ConstantLikeMatcher APIntAttr:$c1), $ovf2),
         (Arith_AddIOp $x, (Arith_ConstantOp (SubIntAttrs $res, $c1, $c0)),
-            DefOverflow)>;
+            (MergeOverflow $ovf1, $ovf2))>;
 
 // addi(subi(c0, x), c1) -> subi(c0 + c1, x)
 def AddISubConstantLHS :
@@ -61,7 +61,7 @@ def AddISubConstantLHS :
           (Arith_SubIOp (ConstantLikeMatcher APIntAttr:$c0), $x, $ovf1),
           (ConstantLikeMatcher APIntAttr:$c1), $ovf2),
         (Arith_SubIOp (Arith_ConstantOp (AddIntAttrs $res, $c0, $c1)), $x,
-            DefOverflow)>;
+            (MergeOverflow $ovf1, $ovf2))>;
 
 def IsScalarOrSplatNegativeOne :
     Constraint<And<[
@@ -73,7 +73,7 @@ def AddIMulNegativeOneRhs :
     Pat<(Arith_AddIOp
            $x,
            (Arith_MulIOp $y, (ConstantLikeMatcher AnyAttr:$c0), $ovf1), $ovf2),
-        (Arith_SubIOp $x, $y, DefOverflow),
+        (Arith_SubIOp $x, $y, DefOverflow), // TODO: overflow flags
         [(IsScalarOrSplatNegativeOne $c0)]>;
 
 // addi(muli(x, -1), y) -> subi(y, x)
@@ -81,7 +81,7 @@ def AddIMulNegativeOneLhs :
     Pat<(Arith_AddIOp
            (Arith_MulIOp $x, (ConstantLikeMatcher AnyAttr:$c0), $ovf1),
            $y, $ovf2),
-        (Arith_SubIOp $y, $x, DefOverflow),
+        (Arith_SubIOp $y, $x, DefOverflow), // TODO: overflow flags
         [(IsScalarOrSplatNegativeOne $c0)]>;
 
 // muli(muli(x, c0), c1) -> muli(x, c0 * c1)
@@ -90,7 +90,7 @@ def MulIMulIConstant :
           (Arith_MulIOp $x, (ConstantLikeMatcher APIntAttr:$c0), $ovf1),
           (ConstantLikeMatcher APIntAttr:$c1), $ovf2),
         (Arith_MulIOp $x, (Arith_ConstantOp (MulIntAttrs $res, $c0, $c1)),
-            DefOverflow)>;
+            (MergeOverflow $ovf1, $ovf2))>;
 
 //===----------------------------------------------------------------------===//
 // AddUIExtendedOp
@@ -113,7 +113,7 @@ def SubIRHSAddConstant :
           (Arith_AddIOp $x, (ConstantLikeMatcher APIntAttr:$c0), $ovf1),
           (ConstantLikeMatcher APIntAttr:$c1), $ovf2),
         (Arith_AddIOp $x, (Arith_ConstantOp (SubIntAttrs $res, $c0, $c1)),
-            DefOverflow)>;
+            DefOverflow)>; // TODO: overflow flags
 
 // subi(c1, addi(x, c0)) -> subi(c1 - c0, x)
 def SubILHSAddConstant :
@@ -121,7 +121,7 @@ def SubILHSAddConstant :
           (ConstantLikeMatcher APIntAttr:$c1),
           (Arith_AddIOp $x, (ConstantLikeMatcher APIntAttr:$c0), $ovf1), $ovf2),
         (Arith_SubIOp (Arith_ConstantOp (SubIntAttrs $res, $c1, $c0)), $x,
-            DefOverflow)>;
+            (MergeOverflow $ovf1, $ovf2))>;
 
 // subi(subi(x, c0), c1) -> subi(x, c0 + c1)
 def SubIRHSSubConstantRHS :
@@ -129,7 +129,7 @@ def SubIRHSSubConstantRHS :
           (Arith_SubIOp $x, (ConstantLikeMatcher APIntAttr:$c0), $ovf1),
           (ConstantLikeMatcher APIntAttr:$c1), $ovf2),
         (Arith_SubIOp $x, (Arith_ConstantOp (AddIntAttrs $res, $c0, $c1)),
-            DefOverflow)>;
+            (MergeOverflow $ovf1, $ovf2))>;
 
 // subi(subi(c0, x), c1) -> subi(c0 - c1, x)
 def SubIRHSSubConstantLHS :
@@ -137,7 +137,7 @@ def SubIRHSSubConstantLHS :
           (Arith_SubIOp (ConstantLikeMatcher APIntAttr:$c0), $x, $ovf1),
           (ConstantLikeMatcher APIntAttr:$c1), $ovf2),
         (Arith_SubIOp (Arith_ConstantOp (SubIntAttrs $res, $c0, $c1)), $x,
-            DefOverflow)>;
+            (MergeOverflow $ovf1, $ovf2))>;
 
 // subi(c1, subi(x, c0)) -> subi(c0 + c1, x)
 def SubILHSSubConstantRHS :
@@ -145,7 +145,7 @@ def SubILHSSubConstantRHS :
           (ConstantLikeMatcher APIntAttr:$c1),
           (Arith_SubIOp $x, (ConstantLikeMatcher APIntAttr:$c0), $ovf1), $ovf2),
         (Arith_SubIOp (Arith_ConstantOp (AddIntAttrs $res, $c0, $c1)), $x,
-            DefOverflow)>;
+            (MergeOverflow $ovf1, $ovf2))>;
 
 // subi(c1, subi(c0, x)) -> addi(x, c1 - c0)
 def SubILHSSubConstantLHS :
@@ -153,12 +153,13 @@ def SubILHSSubConstantLHS :
           (ConstantLikeMatcher APIntAttr:$c1),
           (Arith_SubIOp (ConstantLikeMatcher APIntAttr:$c0), $x, $ovf1), $ovf2),
         (Arith_AddIOp $x, (Arith_ConstantOp (SubIntAttrs $res, $c1, $c0)),
-            DefOverflow)>;
+            (MergeOverflow $ovf1, $ovf2))>;
 
 // subi(subi(a, b), a) -> subi(0, b)
 def SubISubILHSRHSLHS :
     Pat<(Arith_SubIOp:$res (Arith_SubIOp $x, $y, $ovf1), $x, $ovf2),
-        (Arith_SubIOp (Arith_ConstantOp (GetZeroAttr $y)), $y, DefOverflow)>;
+        (Arith_SubIOp (Arith_ConstantOp (GetZeroAttr $y)), $y,
+            (MergeOverflow $ovf1, $ovf2))>;
 
 //===----------------------------------------------------------------------===//
 // MulSIExtendedOp
diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index 6f995b93bc3e..a0b50251c6b6 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -64,6 +64,14 @@ static IntegerAttr mulIntegerAttrs(PatternRewriter &builder, Value res,
   return applyToIntegerAttrs(builder, res, lhs, rhs, std::multiplies<APInt>());
 }
 
+// Merge overflow flags from 2 ops, selecting the most conservative combination.
+static IntegerOverflowFlagsAttr
+mergeOverflowFlags(IntegerOverflowFlagsAttr val1,
+                   IntegerOverflowFlagsAttr val2) {
+  return IntegerOverflowFlagsAttr::get(val1.getContext(),
+                                       val1.getValue() & val2.getValue());
+}
+
 /// Invert an integer comparison predicate.
 arith::CmpIPredicate arith::invertPredicate(arith::CmpIPredicate pred) {
   switch (pred) {
@@ -683,6 +691,8 @@ OpFoldResult arith::CeilDivSIOp::fold(FoldAdaptor adaptor) {
     return getLhs();
 
   // Don't fold if it would overflow or if it requires a division by zero.
+  // TODO: This hook won't fold operations where a = MININT, because
+  // negating MININT overflows. This can be improved.
   bool overflowOrDiv0 = false;
   auto result = constFoldBinaryOp<IntegerAttr>(
       adaptor.getOperands(), [&](APInt a, const APInt &b) {
@@ -701,22 +711,36 @@ OpFoldResult arith::CeilDivSIOp::fold(FoldAdaptor adaptor) {
           // Both positive, return ceil(a, b).
           return signedCeilNonnegInputs(a, b, overflowOrDiv0);
         }
+
+        // No folding happens if any of the intermediate arithmetic operations
+        // overflows.
+        bool overflowNegA = false;
+        bool overflowNegB = false;
+        bool overflowDiv = false;
+        bool overflowNegRes = false;
         if (!aGtZero && !bGtZero) {
           // Both negative, return ceil(-a, -b).
-          APInt posA = zero.ssub_ov(a, overflowOrDiv0);
-          APInt posB = zero.ssub_ov(b, overflowOrDiv0);
-          return signedCeilNonnegInputs(posA, posB, overflowOrDiv0);
+          APInt posA = zero.ssub_ov(a, overflowNegA);
+          APInt posB = zero.ssub_ov(b, overflowNegB);
+          APInt res = signedCeilNonnegInputs(posA, posB, overflowDiv);
+          overflowOrDiv0 = (overflowNegA || overflowNegB || overflowDiv);
+          return res;
         }
         if (!aGtZero && bGtZero) {
           // A is negative, b is positive, return - ( -a / b).
-          APInt posA = zero.ssub_ov(a, overflowOrDiv0);
-          APInt div = posA.sdiv_ov(b, overflowOrDiv0);
-          return zero.ssub_ov(div, overflowOrDiv0);
+          APInt posA = zero.ssub_ov(a, overflowNegA);
+          APInt div = posA.sdiv_ov(b, overflowDiv);
+          APInt res = zero.ssub_ov(div, overflowNegRes);
+          overflowOrDiv0 = (overflowNegA || overflowDiv || overflowNegRes);
+          return res;
         }
         // A is positive, b is negative, return - (a / -b).
-        APInt posB = zero.ssub_ov(b, overflowOrDiv0);
-        APInt div = a.sdiv_ov(posB, overflowOrDiv0);
-        return zero.ssub_ov(div, overflowOrDiv0);
+        APInt posB = zero.ssub_ov(b, overflowNegB);
+        APInt div = a.sdiv_ov(posB, overflowDiv);
+        APInt res = zero.ssub_ov(div, overflowNegRes);
+
+        overflowOrDiv0 = (overflowNegB || overflowDiv || overflowNegRes);
+        return res;
       });
 
   return overflowOrDiv0 ? Attribute() : result;
diff --git a/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp b/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp
index 92cad7cd1ef2..2473169962b9 100644
--- a/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp
@@ -102,6 +102,24 @@ static FailureOr<bool> handleUge(ConstantIntRanges lhs, ConstantIntRanges rhs) {
 }
 
 namespace {
+/// This class listens on IR transformations performed during a pass relying on
+/// information from a `DataflowSolver`. It erases state associated with the
+/// erased operation and its results from the `DataFlowSolver` so that Patterns
+/// do not accidentally query old state information for newly created Ops.
+class DataFlowListener : public RewriterBase::Listener {
+public:
+  DataFlowListener(DataFlowSolver &s) : s(s) {}
+
+protected:
+  void notifyOperationErased(Operation *op) override {
+    s.eraseState(op);
+    for (Value res : op->getResults())
+      s.eraseState(res);
+  }
+
+  DataFlowSolver &s;
+};
+
 struct ConvertCmpOp : public OpRewritePattern<arith::CmpIOp> {
 
   ConvertCmpOp(MLIRContext *context, DataFlowSolver &s)
@@ -167,10 +185,15 @@ struct IntRangeOptimizationsPass
     if (failed(solver.initializeAndRun(op)))
       return signalPassFailure();
 
+    DataFlowListener listener(solver);
+
     RewritePatternSet patterns(ctx);
     populateIntRangeOptimizationsPatterns(patterns, solver);
 
-    if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
+    GreedyRewriteConfig config;
+    config.listener = &listener;
+
+    if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns), config)))
       signalPassFailure();
   }
 };
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp
index 9dc2f262a511..d7056f35cbc8 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferOptimizations.cpp
@@ -59,6 +59,12 @@ static bool isLoop(Operation *op) {
   return regionInterface.hasLoop();
 }
 
+/// Return whether the given operation is a loop with sequential execution
+/// semantics.
+static bool isSequentialLoop(Operation *op) {
+  return !op->hasTrait<OpTrait::HasParallelRegion>() && isLoop(op);
+}
+
 /// Returns true if the given operation implements the AllocationOpInterface
 /// and it supports the dominate block hoisting.
 static bool allowAllocDominateBlockHoisting(Operation *op) {
@@ -338,12 +344,13 @@ struct BufferAllocationLoopHoistingState : BufferAllocationHoistingStateBase {
     return dependencyBlock ? dependencyBlock : nullptr;
   }
 
-  /// Returns true if the given operation represents a loop and one of the
-  /// aliases caused the `aliasDominatorBlock` to be "above" the block of the
-  /// given loop operation. If this is the case, it indicates that the
-  /// allocation is passed via a back edge.
+  /// Returns true if the given operation represents a loop with sequential
+  /// execution semantics and one of the aliases caused the
+  /// `aliasDominatorBlock` to be "above" the block of the given loop operation.
+  /// If this is the case, it indicates that the allocation is passed via a back
+  /// edge.
   bool isLegalPlacement(Operation *op) {
-    return isLoop(op) &&
+    return isSequentialLoop(op) &&
            !dominators->dominates(aliasDominatorBlock, op->getBlock());
   }
 
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp
index a2222e169c4d..b19636adaa69 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp
@@ -107,7 +107,8 @@ updateFuncOp(func::FuncOp func,
 // the given out-params.
 static LogicalResult updateReturnOps(func::FuncOp func,
                                      ArrayRef<BlockArgument> appendedEntryArgs,
-                                     MemCpyFn memCpyFn) {
+                                     MemCpyFn memCpyFn,
+                                     bool hoistStaticAllocs) {
   auto res = func.walk([&](func::ReturnOp op) {
     SmallVector<Value, 6> copyIntoOutParams;
     SmallVector<Value, 6> keepAsReturnOperands;
@@ -118,10 +119,15 @@ static LogicalResult updateReturnOps(func::FuncOp func,
         keepAsReturnOperands.push_back(operand);
     }
     OpBuilder builder(op);
-    for (auto t : llvm::zip(copyIntoOutParams, appendedEntryArgs)) {
-      if (failed(
-              memCpyFn(builder, op.getLoc(), std::get<0>(t), std::get<1>(t))))
-        return WalkResult::interrupt();
+    for (auto [orig, arg] : llvm::zip(copyIntoOutParams, appendedEntryArgs)) {
+      if (hoistStaticAllocs && isa<memref::AllocOp>(orig.getDefiningOp()) &&
+          mlir::cast<MemRefType>(orig.getType()).hasStaticShape()) {
+        orig.replaceAllUsesWith(arg);
+        orig.getDefiningOp()->erase();
+      } else {
+        if (failed(memCpyFn(builder, op.getLoc(), orig, arg)))
+          return WalkResult::interrupt();
+      }
     }
     builder.create<func::ReturnOp>(op.getLoc(), keepAsReturnOperands);
     op.erase();
@@ -212,7 +218,8 @@ LogicalResult mlir::bufferization::promoteBufferResultsToOutParams(
       return success();
     };
     if (failed(updateReturnOps(func, appendedEntryArgs,
-                               options.memCpyFn.value_or(defaultMemCpyFn)))) {
+                               options.memCpyFn.value_or(defaultMemCpyFn),
+                               options.hoistStaticAllocs))) {
       return failure();
     }
   }
@@ -233,6 +240,8 @@ struct BufferResultsToOutParamsPass
     // Convert from pass options in tablegen to BufferResultsToOutParamsOpts.
     if (addResultAttribute)
       options.addResultAttribute = true;
+    if (hoistStaticAllocs)
+      options.hoistStaticAllocs = true;
 
     if (failed(bufferization::promoteBufferResultsToOutParams(getOperation(),
                                                               options)))
diff --git a/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp b/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp
index e7c431f39e3f..82bd031430d3 100644
--- a/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp
+++ b/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp
@@ -37,7 +37,8 @@ struct FormExpressionsPass
     OpBuilder builder(context);
     auto matchFun = [&](Operation *op) {
       if (op->hasTrait<OpTrait::emitc::CExpression>() &&
-          !op->getParentOfType<emitc::ExpressionOp>())
+          !op->getParentOfType<emitc::ExpressionOp>() &&
+          op->getNumResults() == 1)
         createExpression(op, builder);
     };
     rootOp->walk(matchFun);
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index f1b9ca5c5002..0c2590d71130 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -152,8 +152,7 @@ LogicalResult
 MMAMatrixType::verify(function_ref<InFlightDiagnostic()> emitError,
                       ArrayRef<int64_t> shape, Type elementType,
                       StringRef operand) {
-  if (!operand.equals("AOp") && !operand.equals("BOp") &&
-      !operand.equals("COp"))
+  if (operand != "AOp" && operand != "BOp" && operand != "COp")
     return emitError() << "operand expected to be one of AOp, BOp or COp";
 
   if (shape.size() != 2)
@@ -1941,8 +1940,7 @@ LogicalResult SubgroupMmaLoadMatrixOp::verify() {
     return emitError(
         "expected source memref most minor dim must have unit stride");
 
-  if (!operand.equals("AOp") && !operand.equals("BOp") &&
-      !operand.equals("COp"))
+  if (operand != "AOp" && operand != "BOp" && operand != "COp")
     return emitError("only AOp, BOp and COp can be loaded");
 
   return success();
@@ -1962,7 +1960,7 @@ LogicalResult SubgroupMmaStoreMatrixOp::verify() {
     return emitError(
         "expected destination memref most minor dim must have unit stride");
 
-  if (!srcMatrixType.getOperand().equals("COp"))
+  if (srcMatrixType.getOperand() != "COp")
     return emitError(
         "expected the operand matrix being stored to have 'COp' operand type");
 
@@ -1980,9 +1978,8 @@ LogicalResult SubgroupMmaComputeOp::verify() {
   opTypes.push_back(llvm::cast<MMAMatrixType>(getOpB().getType()));
   opTypes.push_back(llvm::cast<MMAMatrixType>(getOpC().getType()));
 
-  if (!opTypes[A].getOperand().equals("AOp") ||
-      !opTypes[B].getOperand().equals("BOp") ||
-      !opTypes[C].getOperand().equals("COp"))
+  if (opTypes[A].getOperand() != "AOp" || opTypes[B].getOperand() != "BOp" ||
+      opTypes[C].getOperand() != "COp")
     return emitError("operands must be in the order AOp, BOp, COp");
 
   ArrayRef<int64_t> aShape, bShape, cShape;
diff --git a/mlir/lib/Dialect/IRDL/IRDLLoading.cpp b/mlir/lib/Dialect/IRDL/IRDLLoading.cpp
index cfc8d092c817..5df2b45d8037 100644
--- a/mlir/lib/Dialect/IRDL/IRDLLoading.cpp
+++ b/mlir/lib/Dialect/IRDL/IRDLLoading.cpp
@@ -270,26 +270,30 @@ static LogicalResult irdlRegionVerifier(
   return success();
 }
 
-/// Define and load an operation represented by a `irdl.operation`
-/// operation.
-static WalkResult loadOperation(
-    OperationOp op, ExtensibleDialect *dialect,
-    DenseMap<TypeOp, std::unique_ptr<DynamicTypeDefinition>> &types,
-    DenseMap<AttributeOp, std::unique_ptr<DynamicAttrDefinition>> &attrs) {
+llvm::unique_function<LogicalResult(Operation *) const>
+mlir::irdl::createVerifier(
+    OperationOp op,
+    const DenseMap<irdl::TypeOp, std::unique_ptr<DynamicTypeDefinition>> &types,
+    const DenseMap<irdl::AttributeOp, std::unique_ptr<DynamicAttrDefinition>>
+        &attrs) {
   // Resolve SSA values to verifier constraint slots
   SmallVector<Value> constrToValue;
   SmallVector<Value> regionToValue;
   for (Operation &op : op->getRegion(0).getOps()) {
     if (isa<VerifyConstraintInterface>(op)) {
-      if (op.getNumResults() != 1)
-        return op.emitError()
-               << "IRDL constraint operations must have exactly one result";
+      if (op.getNumResults() != 1) {
+        op.emitError()
+            << "IRDL constraint operations must have exactly one result";
+        return nullptr;
+      }
       constrToValue.push_back(op.getResult(0));
     }
     if (isa<VerifyRegionInterface>(op)) {
-      if (op.getNumResults() != 1)
-        return op.emitError()
-               << "IRDL constraint operations must have exactly one result";
+      if (op.getNumResults() != 1) {
+        op.emitError()
+            << "IRDL constraint operations must have exactly one result";
+        return nullptr;
+      }
       regionToValue.push_back(op.getResult(0));
     }
   }
@@ -302,7 +306,7 @@ static WalkResult loadOperation(
     std::unique_ptr<Constraint> verifier =
         op.getVerifier(constrToValue, types, attrs);
     if (!verifier)
-      return WalkResult::interrupt();
+      return nullptr;
     constraints.push_back(std::move(verifier));
   }
 
@@ -358,7 +362,7 @@ static WalkResult loadOperation(
   }
 
   // Gather which constraint slots correspond to attributes constraints
-  DenseMap<StringAttr, size_t> attributesContraints;
+  DenseMap<StringAttr, size_t> attributeConstraints;
   auto attributesOp = op.getOp<AttributesOp>();
   if (attributesOp.has_value()) {
     const Operation::operand_range values = attributesOp->getAttributeValues();
@@ -367,40 +371,53 @@ static WalkResult loadOperation(
     for (const auto &[name, value] : llvm::zip(names, values)) {
       for (auto [i, constr] : enumerate(constrToValue)) {
         if (constr == value) {
-          attributesContraints[cast<StringAttr>(name)] = i;
+          attributeConstraints[cast<StringAttr>(name)] = i;
           break;
         }
       }
     }
   }
 
-  // IRDL does not support defining custom parsers or printers.
-  auto parser = [](OpAsmParser &parser, OperationState &result) {
-    return failure();
-  };
-  auto printer = [](Operation *op, OpAsmPrinter &printer, StringRef) {
-    printer.printGenericOp(op);
-  };
-
-  auto verifier =
+  return
       [constraints{std::move(constraints)},
        regionConstraints{std::move(regionConstraints)},
        operandConstraints{std::move(operandConstraints)},
        operandVariadicity{std::move(operandVariadicity)},
        resultConstraints{std::move(resultConstraints)},
        resultVariadicity{std::move(resultVariadicity)},
-       attributesContraints{std::move(attributesContraints)}](Operation *op) {
+       attributeConstraints{std::move(attributeConstraints)}](Operation *op) {
         ConstraintVerifier verifier(constraints);
         const LogicalResult opVerifierResult = irdlOpVerifier(
             op, verifier, operandConstraints, operandVariadicity,
-            resultConstraints, resultVariadicity, attributesContraints);
+            resultConstraints, resultVariadicity, attributeConstraints);
         const LogicalResult opRegionVerifierResult =
             irdlRegionVerifier(op, verifier, regionConstraints);
         return LogicalResult::success(opVerifierResult.succeeded() &&
                                       opRegionVerifierResult.succeeded());
       };
+}
+
+/// Define and load an operation represented by a `irdl.operation`
+/// operation.
+static WalkResult loadOperation(
+    OperationOp op, ExtensibleDialect *dialect,
+    const DenseMap<TypeOp, std::unique_ptr<DynamicTypeDefinition>> &types,
+    const DenseMap<AttributeOp, std::unique_ptr<DynamicAttrDefinition>>
+        &attrs) {
+
+  // IRDL does not support defining custom parsers or printers.
+  auto parser = [](OpAsmParser &parser, OperationState &result) {
+    return failure();
+  };
+  auto printer = [](Operation *op, OpAsmPrinter &printer, StringRef) {
+    printer.printGenericOp(op);
+  };
+
+  auto verifier = createVerifier(op, types, attrs);
+  if (!verifier)
+    return WalkResult::interrupt();
 
-  // IRDL supports only checking number of blocks and argument contraints
+  // IRDL supports only checking number of blocks and argument constraints
   // It is done in the main verifier to reuse `ConstraintVerifier` context
   auto regionVerifier = [](Operation *op) { return LogicalResult::success(); };
 
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 7be493d5992c..7d33d05feb65 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -3035,6 +3035,19 @@ LogicalResult LinkerOptionsOp::verify() {
 }
 
 //===----------------------------------------------------------------------===//
+// InlineAsmOp
+//===----------------------------------------------------------------------===//
+
+void InlineAsmOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  if (getHasSideEffects()) {
+    effects.emplace_back(MemoryEffects::Write::get());
+    effects.emplace_back(MemoryEffects::Read::get());
+  }
+}
+
+//===----------------------------------------------------------------------===//
 // LLVMDialect initialization, type parsing, and registration.
 //===----------------------------------------------------------------------===//
 
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
index 6eeb13ebffb5..4fdf847a559c 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
@@ -36,26 +36,28 @@ llvm::SmallVector<MemorySlot> LLVM::AllocaOp::getPromotableSlots() {
 }
 
 Value LLVM::AllocaOp::getDefaultValue(const MemorySlot &slot,
-                                      RewriterBase &rewriter) {
-  return rewriter.create<LLVM::UndefOp>(getLoc(), slot.elemType);
+                                      OpBuilder &builder) {
+  return builder.create<LLVM::UndefOp>(getLoc(), slot.elemType);
 }
 
 void LLVM::AllocaOp::handleBlockArgument(const MemorySlot &slot,
                                          BlockArgument argument,
-                                         RewriterBase &rewriter) {
+                                         OpBuilder &builder) {
   for (Operation *user : getOperation()->getUsers())
     if (auto declareOp = llvm::dyn_cast<LLVM::DbgDeclareOp>(user))
-      rewriter.create<LLVM::DbgValueOp>(declareOp.getLoc(), argument,
-                                        declareOp.getVarInfo(),
-                                        declareOp.getLocationExpr());
+      builder.create<LLVM::DbgValueOp>(declareOp.getLoc(), argument,
+                                       declareOp.getVarInfo(),
+                                       declareOp.getLocationExpr());
 }
 
-void LLVM::AllocaOp::handlePromotionComplete(const MemorySlot &slot,
-                                             Value defaultValue,
-                                             RewriterBase &rewriter) {
+std::optional<PromotableAllocationOpInterface>
+LLVM::AllocaOp::handlePromotionComplete(const MemorySlot &slot,
+                                        Value defaultValue,
+                                        OpBuilder &builder) {
   if (defaultValue && defaultValue.use_empty())
-    rewriter.eraseOp(defaultValue.getDefiningOp());
-  rewriter.eraseOp(*this);
+    defaultValue.getDefiningOp()->erase();
+  this->erase();
+  return std::nullopt;
 }
 
 SmallVector<DestructurableMemorySlot> LLVM::AllocaOp::getDestructurableSlots() {
@@ -78,16 +80,16 @@ SmallVector<DestructurableMemorySlot> LLVM::AllocaOp::getDestructurableSlots() {
 DenseMap<Attribute, MemorySlot>
 LLVM::AllocaOp::destructure(const DestructurableMemorySlot &slot,
                             const SmallPtrSetImpl<Attribute> &usedIndices,
-                            RewriterBase &rewriter) {
+                            OpBuilder &builder) {
   assert(slot.ptr == getResult());
-  rewriter.setInsertionPointAfter(*this);
+  builder.setInsertionPointAfter(*this);
 
   auto destructurableType = cast<DestructurableTypeInterface>(getElemType());
   DenseMap<Attribute, MemorySlot> slotMap;
   for (Attribute index : usedIndices) {
     Type elemType = destructurableType.getTypeAtIndex(index);
     assert(elemType && "used index must exist");
-    auto subAlloca = rewriter.create<LLVM::AllocaOp>(
+    auto subAlloca = builder.create<LLVM::AllocaOp>(
         getLoc(), LLVM::LLVMPointerType::get(getContext()), elemType,
         getArraySize());
     slotMap.try_emplace<MemorySlot>(index, {subAlloca.getResult(), elemType});
@@ -97,9 +99,9 @@ LLVM::AllocaOp::destructure(const DestructurableMemorySlot &slot,
 }
 
 void LLVM::AllocaOp::handleDestructuringComplete(
-    const DestructurableMemorySlot &slot, RewriterBase &rewriter) {
+    const DestructurableMemorySlot &slot, OpBuilder &builder) {
   assert(slot.ptr == getResult());
-  rewriter.eraseOp(*this);
+  this->erase();
 }
 
 //===----------------------------------------------------------------------===//
@@ -112,7 +114,7 @@ bool LLVM::LoadOp::loadsFrom(const MemorySlot &slot) {
 
 bool LLVM::LoadOp::storesTo(const MemorySlot &slot) { return false; }
 
-Value LLVM::LoadOp::getStored(const MemorySlot &slot, RewriterBase &rewriter,
+Value LLVM::LoadOp::getStored(const MemorySlot &slot, OpBuilder &builder,
                               Value reachingDef, const DataLayout &dataLayout) {
   llvm_unreachable("getStored should not be called on LoadOp");
 }
@@ -175,7 +177,7 @@ static bool isBigEndian(const DataLayout &dataLayout) {
 
 /// Converts a value to an integer type of the same size.
 /// Assumes that the type can be converted.
-static Value castToSameSizedInt(RewriterBase &rewriter, Location loc, Value val,
+static Value castToSameSizedInt(OpBuilder &builder, Location loc, Value val,
                                 const DataLayout &dataLayout) {
   Type type = val.getType();
   assert(isSupportedTypeForConversion(type) &&
@@ -185,15 +187,15 @@ static Value castToSameSizedInt(RewriterBase &rewriter, Location loc, Value val,
     return val;
 
   uint64_t typeBitSize = dataLayout.getTypeSizeInBits(type);
-  IntegerType valueSizeInteger = rewriter.getIntegerType(typeBitSize);
+  IntegerType valueSizeInteger = builder.getIntegerType(typeBitSize);
 
   if (isa<LLVM::LLVMPointerType>(type))
-    return rewriter.createOrFold<LLVM::PtrToIntOp>(loc, valueSizeInteger, val);
-  return rewriter.createOrFold<LLVM::BitcastOp>(loc, valueSizeInteger, val);
+    return builder.createOrFold<LLVM::PtrToIntOp>(loc, valueSizeInteger, val);
+  return builder.createOrFold<LLVM::BitcastOp>(loc, valueSizeInteger, val);
 }
 
 /// Converts a value with an integer type to `targetType`.
-static Value castIntValueToSameSizedType(RewriterBase &rewriter, Location loc,
+static Value castIntValueToSameSizedType(OpBuilder &builder, Location loc,
                                          Value val, Type targetType) {
   assert(isa<IntegerType>(val.getType()) &&
          "expected value to have an integer type");
@@ -202,13 +204,13 @@ static Value castIntValueToSameSizedType(RewriterBase &rewriter, Location loc,
   if (val.getType() == targetType)
     return val;
   if (isa<LLVM::LLVMPointerType>(targetType))
-    return rewriter.createOrFold<LLVM::IntToPtrOp>(loc, targetType, val);
-  return rewriter.createOrFold<LLVM::BitcastOp>(loc, targetType, val);
+    return builder.createOrFold<LLVM::IntToPtrOp>(loc, targetType, val);
+  return builder.createOrFold<LLVM::BitcastOp>(loc, targetType, val);
 }
 
 /// Constructs operations that convert `srcValue` into a new value of type
 /// `targetType`. Assumes the types have the same bitsize.
-static Value castSameSizedTypes(RewriterBase &rewriter, Location loc,
+static Value castSameSizedTypes(OpBuilder &builder, Location loc,
                                 Value srcValue, Type targetType,
                                 const DataLayout &dataLayout) {
   Type srcType = srcValue.getType();
@@ -226,18 +228,18 @@ static Value castSameSizedTypes(RewriterBase &rewriter, Location loc,
   // provenance.
   if (isa<LLVM::LLVMPointerType>(targetType) &&
       isa<LLVM::LLVMPointerType>(srcType))
-    return rewriter.createOrFold<LLVM::AddrSpaceCastOp>(loc, targetType,
-                                                        srcValue);
+    return builder.createOrFold<LLVM::AddrSpaceCastOp>(loc, targetType,
+                                                       srcValue);
 
   // For all other castable types, casting through integers is necessary.
-  Value replacement = castToSameSizedInt(rewriter, loc, srcValue, dataLayout);
-  return castIntValueToSameSizedType(rewriter, loc, replacement, targetType);
+  Value replacement = castToSameSizedInt(builder, loc, srcValue, dataLayout);
+  return castIntValueToSameSizedType(builder, loc, replacement, targetType);
 }
 
 /// Constructs operations that convert `srcValue` into a new value of type
 /// `targetType`. Performs bit-level extraction if the source type is larger
 /// than the target type. Assumes that this conversion is possible.
-static Value createExtractAndCast(RewriterBase &rewriter, Location loc,
+static Value createExtractAndCast(OpBuilder &builder, Location loc,
                                   Value srcValue, Type targetType,
                                   const DataLayout &dataLayout) {
   // Get the types of the source and target values.
@@ -249,31 +251,31 @@ static Value createExtractAndCast(RewriterBase &rewriter, Location loc,
   uint64_t srcTypeSize = dataLayout.getTypeSizeInBits(srcType);
   uint64_t targetTypeSize = dataLayout.getTypeSizeInBits(targetType);
   if (srcTypeSize == targetTypeSize)
-    return castSameSizedTypes(rewriter, loc, srcValue, targetType, dataLayout);
+    return castSameSizedTypes(builder, loc, srcValue, targetType, dataLayout);
 
   // First, cast the value to a same-sized integer type.
-  Value replacement = castToSameSizedInt(rewriter, loc, srcValue, dataLayout);
+  Value replacement = castToSameSizedInt(builder, loc, srcValue, dataLayout);
 
   // Truncate the integer if the size of the target is less than the value.
   if (isBigEndian(dataLayout)) {
     uint64_t shiftAmount = srcTypeSize - targetTypeSize;
-    auto shiftConstant = rewriter.create<LLVM::ConstantOp>(
-        loc, rewriter.getIntegerAttr(srcType, shiftAmount));
+    auto shiftConstant = builder.create<LLVM::ConstantOp>(
+        loc, builder.getIntegerAttr(srcType, shiftAmount));
     replacement =
-        rewriter.createOrFold<LLVM::LShrOp>(loc, srcValue, shiftConstant);
+        builder.createOrFold<LLVM::LShrOp>(loc, srcValue, shiftConstant);
   }
 
-  replacement = rewriter.create<LLVM::TruncOp>(
-      loc, rewriter.getIntegerType(targetTypeSize), replacement);
+  replacement = builder.create<LLVM::TruncOp>(
+      loc, builder.getIntegerType(targetTypeSize), replacement);
 
   // Now cast the integer to the actual target type if required.
-  return castIntValueToSameSizedType(rewriter, loc, replacement, targetType);
+  return castIntValueToSameSizedType(builder, loc, replacement, targetType);
 }
 
 /// Constructs operations that insert the bits of `srcValue` into the
 /// "beginning" of `reachingDef` (beginning is endianness dependent).
 /// Assumes that this conversion is possible.
-static Value createInsertAndCast(RewriterBase &rewriter, Location loc,
+static Value createInsertAndCast(OpBuilder &builder, Location loc,
                                  Value srcValue, Value reachingDef,
                                  const DataLayout &dataLayout) {
 
@@ -284,27 +286,27 @@ static Value createInsertAndCast(RewriterBase &rewriter, Location loc,
   uint64_t valueTypeSize = dataLayout.getTypeSizeInBits(srcValue.getType());
   uint64_t slotTypeSize = dataLayout.getTypeSizeInBits(reachingDef.getType());
   if (slotTypeSize == valueTypeSize)
-    return castSameSizedTypes(rewriter, loc, srcValue, reachingDef.getType(),
+    return castSameSizedTypes(builder, loc, srcValue, reachingDef.getType(),
                               dataLayout);
 
   // In the case where the store only overwrites parts of the memory,
   // bit fiddling is required to construct the new value.
 
   // First convert both values to integers of the same size.
-  Value defAsInt = castToSameSizedInt(rewriter, loc, reachingDef, dataLayout);
-  Value valueAsInt = castToSameSizedInt(rewriter, loc, srcValue, dataLayout);
+  Value defAsInt = castToSameSizedInt(builder, loc, reachingDef, dataLayout);
+  Value valueAsInt = castToSameSizedInt(builder, loc, srcValue, dataLayout);
   // Extend the value to the size of the reaching definition.
   valueAsInt =
-      rewriter.createOrFold<LLVM::ZExtOp>(loc, defAsInt.getType(), valueAsInt);
+      builder.createOrFold<LLVM::ZExtOp>(loc, defAsInt.getType(), valueAsInt);
   uint64_t sizeDifference = slotTypeSize - valueTypeSize;
   if (isBigEndian(dataLayout)) {
     // On big endian systems, a store to the base pointer overwrites the most
     // significant bits. To accomodate for this, the stored value needs to be
     // shifted into the according position.
-    Value bigEndianShift = rewriter.create<LLVM::ConstantOp>(
-        loc, rewriter.getIntegerAttr(defAsInt.getType(), sizeDifference));
+    Value bigEndianShift = builder.create<LLVM::ConstantOp>(
+        loc, builder.getIntegerAttr(defAsInt.getType(), sizeDifference));
     valueAsInt =
-        rewriter.createOrFold<LLVM::ShlOp>(loc, valueAsInt, bigEndianShift);
+        builder.createOrFold<LLVM::ShlOp>(loc, valueAsInt, bigEndianShift);
   }
 
   // Construct the mask that is used to erase the bits that are overwritten by
@@ -322,23 +324,23 @@ static Value createInsertAndCast(RewriterBase &rewriter, Location loc,
   }
 
   // Mask out the affected bits ...
-  Value mask = rewriter.create<LLVM::ConstantOp>(
-      loc, rewriter.getIntegerAttr(defAsInt.getType(), maskValue));
-  Value masked = rewriter.createOrFold<LLVM::AndOp>(loc, defAsInt, mask);
+  Value mask = builder.create<LLVM::ConstantOp>(
+      loc, builder.getIntegerAttr(defAsInt.getType(), maskValue));
+  Value masked = builder.createOrFold<LLVM::AndOp>(loc, defAsInt, mask);
 
   // ... and combine the result with the new value.
-  Value combined = rewriter.createOrFold<LLVM::OrOp>(loc, masked, valueAsInt);
+  Value combined = builder.createOrFold<LLVM::OrOp>(loc, masked, valueAsInt);
 
-  return castIntValueToSameSizedType(rewriter, loc, combined,
+  return castIntValueToSameSizedType(builder, loc, combined,
                                      reachingDef.getType());
 }
 
-Value LLVM::StoreOp::getStored(const MemorySlot &slot, RewriterBase &rewriter,
+Value LLVM::StoreOp::getStored(const MemorySlot &slot, OpBuilder &builder,
                                Value reachingDef,
                                const DataLayout &dataLayout) {
   assert(reachingDef && reachingDef.getType() == slot.elemType &&
          "expected the reaching definition's type to match the slot's type");
-  return createInsertAndCast(rewriter, getLoc(), getValue(), reachingDef,
+  return createInsertAndCast(builder, getLoc(), getValue(), reachingDef,
                              dataLayout);
 }
 
@@ -360,13 +362,13 @@ bool LLVM::LoadOp::canUsesBeRemoved(
 
 DeletionKind LLVM::LoadOp::removeBlockingUses(
     const MemorySlot &slot, const SmallPtrSetImpl<OpOperand *> &blockingUses,
-    RewriterBase &rewriter, Value reachingDefinition,
+    OpBuilder &builder, Value reachingDefinition,
     const DataLayout &dataLayout) {
   // `canUsesBeRemoved` checked this blocking use must be the loaded slot
   // pointer.
-  Value newResult = createExtractAndCast(rewriter, getLoc(), reachingDefinition,
+  Value newResult = createExtractAndCast(builder, getLoc(), reachingDefinition,
                                          getResult().getType(), dataLayout);
-  rewriter.replaceAllUsesWith(getResult(), newResult);
+  getResult().replaceAllUsesWith(newResult);
   return DeletionKind::Delete;
 }
 
@@ -390,7 +392,7 @@ bool LLVM::StoreOp::canUsesBeRemoved(
 
 DeletionKind LLVM::StoreOp::removeBlockingUses(
     const MemorySlot &slot, const SmallPtrSetImpl<OpOperand *> &blockingUses,
-    RewriterBase &rewriter, Value reachingDefinition,
+    OpBuilder &builder, Value reachingDefinition,
     const DataLayout &dataLayout) {
   return DeletionKind::Delete;
 }
@@ -452,14 +454,13 @@ bool LLVM::LoadOp::canRewire(const DestructurableMemorySlot &slot,
 
 DeletionKind LLVM::LoadOp::rewire(const DestructurableMemorySlot &slot,
                                   DenseMap<Attribute, MemorySlot> &subslots,
-                                  RewriterBase &rewriter,
+                                  OpBuilder &builder,
                                   const DataLayout &dataLayout) {
   auto index = IntegerAttr::get(IntegerType::get(getContext(), 32), 0);
   auto it = subslots.find(index);
   assert(it != subslots.end());
 
-  rewriter.modifyOpInPlace(
-      *this, [&]() { getAddrMutable().set(it->getSecond().ptr); });
+  getAddrMutable().set(it->getSecond().ptr);
   return DeletionKind::Keep;
 }
 
@@ -491,14 +492,13 @@ bool LLVM::StoreOp::canRewire(const DestructurableMemorySlot &slot,
 
 DeletionKind LLVM::StoreOp::rewire(const DestructurableMemorySlot &slot,
                                    DenseMap<Attribute, MemorySlot> &subslots,
-                                   RewriterBase &rewriter,
+                                   OpBuilder &builder,
                                    const DataLayout &dataLayout) {
   auto index = IntegerAttr::get(IntegerType::get(getContext(), 32), 0);
   auto it = subslots.find(index);
   assert(it != subslots.end());
 
-  rewriter.modifyOpInPlace(
-      *this, [&]() { getAddrMutable().set(it->getSecond().ptr); });
+  getAddrMutable().set(it->getSecond().ptr);
   return DeletionKind::Keep;
 }
 
@@ -523,7 +523,7 @@ bool LLVM::BitcastOp::canUsesBeRemoved(
 }
 
 DeletionKind LLVM::BitcastOp::removeBlockingUses(
-    const SmallPtrSetImpl<OpOperand *> &blockingUses, RewriterBase &rewriter) {
+    const SmallPtrSetImpl<OpOperand *> &blockingUses, OpBuilder &builder) {
   return DeletionKind::Delete;
 }
 
@@ -535,7 +535,7 @@ bool LLVM::AddrSpaceCastOp::canUsesBeRemoved(
 }
 
 DeletionKind LLVM::AddrSpaceCastOp::removeBlockingUses(
-    const SmallPtrSetImpl<OpOperand *> &blockingUses, RewriterBase &rewriter) {
+    const SmallPtrSetImpl<OpOperand *> &blockingUses, OpBuilder &builder) {
   return DeletionKind::Delete;
 }
 
@@ -547,7 +547,7 @@ bool LLVM::LifetimeStartOp::canUsesBeRemoved(
 }
 
 DeletionKind LLVM::LifetimeStartOp::removeBlockingUses(
-    const SmallPtrSetImpl<OpOperand *> &blockingUses, RewriterBase &rewriter) {
+    const SmallPtrSetImpl<OpOperand *> &blockingUses, OpBuilder &builder) {
   return DeletionKind::Delete;
 }
 
@@ -559,7 +559,7 @@ bool LLVM::LifetimeEndOp::canUsesBeRemoved(
 }
 
 DeletionKind LLVM::LifetimeEndOp::removeBlockingUses(
-    const SmallPtrSetImpl<OpOperand *> &blockingUses, RewriterBase &rewriter) {
+    const SmallPtrSetImpl<OpOperand *> &blockingUses, OpBuilder &builder) {
   return DeletionKind::Delete;
 }
 
@@ -571,7 +571,7 @@ bool LLVM::InvariantStartOp::canUsesBeRemoved(
 }
 
 DeletionKind LLVM::InvariantStartOp::removeBlockingUses(
-    const SmallPtrSetImpl<OpOperand *> &blockingUses, RewriterBase &rewriter) {
+    const SmallPtrSetImpl<OpOperand *> &blockingUses, OpBuilder &builder) {
   return DeletionKind::Delete;
 }
 
@@ -583,7 +583,7 @@ bool LLVM::InvariantEndOp::canUsesBeRemoved(
 }
 
 DeletionKind LLVM::InvariantEndOp::removeBlockingUses(
-    const SmallPtrSetImpl<OpOperand *> &blockingUses, RewriterBase &rewriter) {
+    const SmallPtrSetImpl<OpOperand *> &blockingUses, OpBuilder &builder) {
   return DeletionKind::Delete;
 }
 
@@ -595,7 +595,7 @@ bool LLVM::DbgDeclareOp::canUsesBeRemoved(
 }
 
 DeletionKind LLVM::DbgDeclareOp::removeBlockingUses(
-    const SmallPtrSetImpl<OpOperand *> &blockingUses, RewriterBase &rewriter) {
+    const SmallPtrSetImpl<OpOperand *> &blockingUses, OpBuilder &builder) {
   return DeletionKind::Delete;
 }
 
@@ -611,28 +611,27 @@ bool LLVM::DbgValueOp::canUsesBeRemoved(
 }
 
 DeletionKind LLVM::DbgValueOp::removeBlockingUses(
-    const SmallPtrSetImpl<OpOperand *> &blockingUses, RewriterBase &rewriter) {
-  // Rewriter by default is after '*this', but we need it before '*this'.
-  rewriter.setInsertionPoint(*this);
+    const SmallPtrSetImpl<OpOperand *> &blockingUses, OpBuilder &builder) {
+  // builder by default is after '*this', but we need it before '*this'.
+  builder.setInsertionPoint(*this);
 
   // Rather than dropping the debug value, replace it with undef to preserve the
   // debug local variable info. This allows the debugger to inform the user that
   // the variable has been optimized out.
   auto undef =
-      rewriter.create<UndefOp>(getValue().getLoc(), getValue().getType());
-  rewriter.modifyOpInPlace(*this, [&] { getValueMutable().assign(undef); });
+      builder.create<UndefOp>(getValue().getLoc(), getValue().getType());
+  getValueMutable().assign(undef);
   return DeletionKind::Keep;
 }
 
 bool LLVM::DbgDeclareOp::requiresReplacedValues() { return true; }
 
 void LLVM::DbgDeclareOp::visitReplacedValues(
-    ArrayRef<std::pair<Operation *, Value>> definitions,
-    RewriterBase &rewriter) {
+    ArrayRef<std::pair<Operation *, Value>> definitions, OpBuilder &builder) {
   for (auto [op, value] : definitions) {
-    rewriter.setInsertionPointAfter(op);
-    rewriter.create<LLVM::DbgValueOp>(getLoc(), value, getVarInfo(),
-                                      getLocationExpr());
+    builder.setInsertionPointAfter(op);
+    builder.create<LLVM::DbgValueOp>(getLoc(), value, getVarInfo(),
+                                     getLocationExpr());
   }
 }
 
@@ -658,7 +657,7 @@ bool LLVM::GEPOp::canUsesBeRemoved(
 }
 
 DeletionKind LLVM::GEPOp::removeBlockingUses(
-    const SmallPtrSetImpl<OpOperand *> &blockingUses, RewriterBase &rewriter) {
+    const SmallPtrSetImpl<OpOperand *> &blockingUses, OpBuilder &builder) {
   return DeletionKind::Delete;
 }
 
@@ -855,7 +854,7 @@ bool LLVM::GEPOp::canRewire(const DestructurableMemorySlot &slot,
 
 DeletionKind LLVM::GEPOp::rewire(const DestructurableMemorySlot &slot,
                                  DenseMap<Attribute, MemorySlot> &subslots,
-                                 RewriterBase &rewriter,
+                                 OpBuilder &builder,
                                  const DataLayout &dataLayout) {
   std::optional<SubslotAccessInfo> accessInfo =
       getSubslotAccessInfo(slot, dataLayout, *this);
@@ -864,11 +863,11 @@ DeletionKind LLVM::GEPOp::rewire(const DestructurableMemorySlot &slot,
       IntegerAttr::get(IntegerType::get(getContext(), 32), accessInfo->index);
   const MemorySlot &newSlot = subslots.at(indexAttr);
 
-  auto byteType = IntegerType::get(rewriter.getContext(), 8);
-  auto newPtr = rewriter.createOrFold<LLVM::GEPOp>(
+  auto byteType = IntegerType::get(builder.getContext(), 8);
+  auto newPtr = builder.createOrFold<LLVM::GEPOp>(
       getLoc(), getResult().getType(), byteType, newSlot.ptr,
       ArrayRef<GEPArg>(accessInfo->subslotOffset), getInbounds());
-  rewriter.replaceAllUsesWith(getResult(), newPtr);
+  getResult().replaceAllUsesWith(newPtr);
   return DeletionKind::Delete;
 }
 
@@ -938,7 +937,7 @@ bool LLVM::MemsetOp::storesTo(const MemorySlot &slot) {
   return getDst() == slot.ptr;
 }
 
-Value LLVM::MemsetOp::getStored(const MemorySlot &slot, RewriterBase &rewriter,
+Value LLVM::MemsetOp::getStored(const MemorySlot &slot, OpBuilder &builder,
                                 Value reachingDef,
                                 const DataLayout &dataLayout) {
   // TODO: Support non-integer types.
@@ -953,14 +952,14 @@ Value LLVM::MemsetOp::getStored(const MemorySlot &slot, RewriterBase &rewriter,
         // or-ing it with the previous value.
         uint64_t coveredBits = 8;
         Value currentValue =
-            rewriter.create<LLVM::ZExtOp>(getLoc(), intType, getVal());
+            builder.create<LLVM::ZExtOp>(getLoc(), intType, getVal());
         while (coveredBits < intType.getWidth()) {
           Value shiftBy =
-              rewriter.create<LLVM::ConstantOp>(getLoc(), intType, coveredBits);
+              builder.create<LLVM::ConstantOp>(getLoc(), intType, coveredBits);
           Value shifted =
-              rewriter.create<LLVM::ShlOp>(getLoc(), currentValue, shiftBy);
+              builder.create<LLVM::ShlOp>(getLoc(), currentValue, shiftBy);
           currentValue =
-              rewriter.create<LLVM::OrOp>(getLoc(), currentValue, shifted);
+              builder.create<LLVM::OrOp>(getLoc(), currentValue, shifted);
           coveredBits *= 2;
         }
 
@@ -994,7 +993,7 @@ bool LLVM::MemsetOp::canUsesBeRemoved(
 
 DeletionKind LLVM::MemsetOp::removeBlockingUses(
     const MemorySlot &slot, const SmallPtrSetImpl<OpOperand *> &blockingUses,
-    RewriterBase &rewriter, Value reachingDefinition,
+    OpBuilder &builder, Value reachingDefinition,
     const DataLayout &dataLayout) {
   return DeletionKind::Delete;
 }
@@ -1026,7 +1025,7 @@ bool LLVM::MemsetOp::canRewire(const DestructurableMemorySlot &slot,
 
 DeletionKind LLVM::MemsetOp::rewire(const DestructurableMemorySlot &slot,
                                     DenseMap<Attribute, MemorySlot> &subslots,
-                                    RewriterBase &rewriter,
+                                    OpBuilder &builder,
                                     const DataLayout &dataLayout) {
   std::optional<DenseMap<Attribute, Type>> types =
       cast<DestructurableTypeInterface>(slot.elemType).getSubelementIndexMap();
@@ -1063,15 +1062,14 @@ DeletionKind LLVM::MemsetOp::rewire(const DestructurableMemorySlot &slot,
       uint64_t newMemsetSize = std::min(memsetLen - covered, typeSize);
 
       Value newMemsetSizeValue =
-          rewriter
+          builder
               .create<LLVM::ConstantOp>(
                   getLen().getLoc(),
                   IntegerAttr::get(memsetLenAttr.getType(), newMemsetSize))
               .getResult();
 
-      rewriter.create<LLVM::MemsetOp>(getLoc(), subslots.at(index).ptr,
-                                      getVal(), newMemsetSizeValue,
-                                      getIsVolatile());
+      builder.create<LLVM::MemsetOp>(getLoc(), subslots.at(index).ptr, getVal(),
+                                     newMemsetSizeValue, getIsVolatile());
     }
 
     covered += typeSize;
@@ -1096,8 +1094,8 @@ static bool memcpyStoresTo(MemcpyLike op, const MemorySlot &slot) {
 
 template <class MemcpyLike>
 static Value memcpyGetStored(MemcpyLike op, const MemorySlot &slot,
-                             RewriterBase &rewriter) {
-  return rewriter.create<LLVM::LoadOp>(op.getLoc(), slot.elemType, op.getSrc());
+                             OpBuilder &builder) {
+  return builder.create<LLVM::LoadOp>(op.getLoc(), slot.elemType, op.getSrc());
 }
 
 template <class MemcpyLike>
@@ -1122,10 +1120,9 @@ template <class MemcpyLike>
 static DeletionKind
 memcpyRemoveBlockingUses(MemcpyLike op, const MemorySlot &slot,
                          const SmallPtrSetImpl<OpOperand *> &blockingUses,
-                         RewriterBase &rewriter, Value reachingDefinition) {
+                         OpBuilder &builder, Value reachingDefinition) {
   if (op.loadsFrom(slot))
-    rewriter.create<LLVM::StoreOp>(op.getLoc(), reachingDefinition,
-                                   op.getDst());
+    builder.create<LLVM::StoreOp>(op.getLoc(), reachingDefinition, op.getDst());
   return DeletionKind::Delete;
 }
 
@@ -1168,23 +1165,23 @@ static bool memcpyCanRewire(MemcpyLike op, const DestructurableMemorySlot &slot,
 namespace {
 
 template <class MemcpyLike>
-void createMemcpyLikeToReplace(RewriterBase &rewriter, const DataLayout &layout,
+void createMemcpyLikeToReplace(OpBuilder &builder, const DataLayout &layout,
                                MemcpyLike toReplace, Value dst, Value src,
                                Type toCpy, bool isVolatile) {
-  Value memcpySize = rewriter.create<LLVM::ConstantOp>(
+  Value memcpySize = builder.create<LLVM::ConstantOp>(
       toReplace.getLoc(), IntegerAttr::get(toReplace.getLen().getType(),
                                            layout.getTypeSize(toCpy)));
-  rewriter.create<MemcpyLike>(toReplace.getLoc(), dst, src, memcpySize,
-                              isVolatile);
+  builder.create<MemcpyLike>(toReplace.getLoc(), dst, src, memcpySize,
+                             isVolatile);
 }
 
 template <>
-void createMemcpyLikeToReplace(RewriterBase &rewriter, const DataLayout &layout,
+void createMemcpyLikeToReplace(OpBuilder &builder, const DataLayout &layout,
                                LLVM::MemcpyInlineOp toReplace, Value dst,
                                Value src, Type toCpy, bool isVolatile) {
   Type lenType = IntegerType::get(toReplace->getContext(),
                                   toReplace.getLen().getBitWidth());
-  rewriter.create<LLVM::MemcpyInlineOp>(
+  builder.create<LLVM::MemcpyInlineOp>(
       toReplace.getLoc(), dst, src,
       IntegerAttr::get(lenType, layout.getTypeSize(toCpy)), isVolatile);
 }
@@ -1196,7 +1193,7 @@ void createMemcpyLikeToReplace(RewriterBase &rewriter, const DataLayout &layout,
 template <class MemcpyLike>
 static DeletionKind
 memcpyRewire(MemcpyLike op, const DestructurableMemorySlot &slot,
-             DenseMap<Attribute, MemorySlot> &subslots, RewriterBase &rewriter,
+             DenseMap<Attribute, MemorySlot> &subslots, OpBuilder &builder,
              const DataLayout &dataLayout) {
   if (subslots.empty())
     return DeletionKind::Delete;
@@ -1226,12 +1223,12 @@ memcpyRewire(MemcpyLike op, const DestructurableMemorySlot &slot,
     SmallVector<LLVM::GEPArg> gepIndices{
         0, static_cast<int32_t>(
                cast<IntegerAttr>(index).getValue().getZExtValue())};
-    Value subslotPtrInOther = rewriter.create<LLVM::GEPOp>(
+    Value subslotPtrInOther = builder.create<LLVM::GEPOp>(
         op.getLoc(), LLVM::LLVMPointerType::get(op.getContext()), slot.elemType,
         isDst ? op.getSrc() : op.getDst(), gepIndices);
 
     // Then create a new memcpy out of this source pointer.
-    createMemcpyLikeToReplace(rewriter, dataLayout, op,
+    createMemcpyLikeToReplace(builder, dataLayout, op,
                               isDst ? subslot.ptr : subslotPtrInOther,
                               isDst ? subslotPtrInOther : subslot.ptr,
                               subslot.elemType, op.getIsVolatile());
@@ -1250,10 +1247,10 @@ bool LLVM::MemcpyOp::storesTo(const MemorySlot &slot) {
   return memcpyStoresTo(*this, slot);
 }
 
-Value LLVM::MemcpyOp::getStored(const MemorySlot &slot, RewriterBase &rewriter,
+Value LLVM::MemcpyOp::getStored(const MemorySlot &slot, OpBuilder &builder,
                                 Value reachingDef,
                                 const DataLayout &dataLayout) {
-  return memcpyGetStored(*this, slot, rewriter);
+  return memcpyGetStored(*this, slot, builder);
 }
 
 bool LLVM::MemcpyOp::canUsesBeRemoved(
@@ -1266,9 +1263,9 @@ bool LLVM::MemcpyOp::canUsesBeRemoved(
 
 DeletionKind LLVM::MemcpyOp::removeBlockingUses(
     const MemorySlot &slot, const SmallPtrSetImpl<OpOperand *> &blockingUses,
-    RewriterBase &rewriter, Value reachingDefinition,
+    OpBuilder &builder, Value reachingDefinition,
     const DataLayout &dataLayout) {
-  return memcpyRemoveBlockingUses(*this, slot, blockingUses, rewriter,
+  return memcpyRemoveBlockingUses(*this, slot, blockingUses, builder,
                                   reachingDefinition);
 }
 
@@ -1288,9 +1285,9 @@ bool LLVM::MemcpyOp::canRewire(const DestructurableMemorySlot &slot,
 
 DeletionKind LLVM::MemcpyOp::rewire(const DestructurableMemorySlot &slot,
                                     DenseMap<Attribute, MemorySlot> &subslots,
-                                    RewriterBase &rewriter,
+                                    OpBuilder &builder,
                                     const DataLayout &dataLayout) {
-  return memcpyRewire(*this, slot, subslots, rewriter, dataLayout);
+  return memcpyRewire(*this, slot, subslots, builder, dataLayout);
 }
 
 bool LLVM::MemcpyInlineOp::loadsFrom(const MemorySlot &slot) {
@@ -1302,9 +1299,9 @@ bool LLVM::MemcpyInlineOp::storesTo(const MemorySlot &slot) {
 }
 
 Value LLVM::MemcpyInlineOp::getStored(const MemorySlot &slot,
-                                      RewriterBase &rewriter, Value reachingDef,
+                                      OpBuilder &builder, Value reachingDef,
                                       const DataLayout &dataLayout) {
-  return memcpyGetStored(*this, slot, rewriter);
+  return memcpyGetStored(*this, slot, builder);
 }
 
 bool LLVM::MemcpyInlineOp::canUsesBeRemoved(
@@ -1317,9 +1314,9 @@ bool LLVM::MemcpyInlineOp::canUsesBeRemoved(
 
 DeletionKind LLVM::MemcpyInlineOp::removeBlockingUses(
     const MemorySlot &slot, const SmallPtrSetImpl<OpOperand *> &blockingUses,
-    RewriterBase &rewriter, Value reachingDefinition,
+    OpBuilder &builder, Value reachingDefinition,
     const DataLayout &dataLayout) {
-  return memcpyRemoveBlockingUses(*this, slot, blockingUses, rewriter,
+  return memcpyRemoveBlockingUses(*this, slot, blockingUses, builder,
                                   reachingDefinition);
 }
 
@@ -1341,9 +1338,8 @@ bool LLVM::MemcpyInlineOp::canRewire(
 DeletionKind
 LLVM::MemcpyInlineOp::rewire(const DestructurableMemorySlot &slot,
                              DenseMap<Attribute, MemorySlot> &subslots,
-                             RewriterBase &rewriter,
-                             const DataLayout &dataLayout) {
-  return memcpyRewire(*this, slot, subslots, rewriter, dataLayout);
+                             OpBuilder &builder, const DataLayout &dataLayout) {
+  return memcpyRewire(*this, slot, subslots, builder, dataLayout);
 }
 
 bool LLVM::MemmoveOp::loadsFrom(const MemorySlot &slot) {
@@ -1354,10 +1350,10 @@ bool LLVM::MemmoveOp::storesTo(const MemorySlot &slot) {
   return memcpyStoresTo(*this, slot);
 }
 
-Value LLVM::MemmoveOp::getStored(const MemorySlot &slot, RewriterBase &rewriter,
+Value LLVM::MemmoveOp::getStored(const MemorySlot &slot, OpBuilder &builder,
                                  Value reachingDef,
                                  const DataLayout &dataLayout) {
-  return memcpyGetStored(*this, slot, rewriter);
+  return memcpyGetStored(*this, slot, builder);
 }
 
 bool LLVM::MemmoveOp::canUsesBeRemoved(
@@ -1370,9 +1366,9 @@ bool LLVM::MemmoveOp::canUsesBeRemoved(
 
 DeletionKind LLVM::MemmoveOp::removeBlockingUses(
     const MemorySlot &slot, const SmallPtrSetImpl<OpOperand *> &blockingUses,
-    RewriterBase &rewriter, Value reachingDefinition,
+    OpBuilder &builder, Value reachingDefinition,
     const DataLayout &dataLayout) {
-  return memcpyRemoveBlockingUses(*this, slot, blockingUses, rewriter,
+  return memcpyRemoveBlockingUses(*this, slot, blockingUses, builder,
                                   reachingDefinition);
 }
 
@@ -1392,9 +1388,9 @@ bool LLVM::MemmoveOp::canRewire(const DestructurableMemorySlot &slot,
 
 DeletionKind LLVM::MemmoveOp::rewire(const DestructurableMemorySlot &slot,
                                      DenseMap<Attribute, MemorySlot> &subslots,
-                                     RewriterBase &rewriter,
+                                     OpBuilder &builder,
                                      const DataLayout &dataLayout) {
-  return memcpyRewire(*this, slot, subslots, rewriter, dataLayout);
+  return memcpyRewire(*this, slot, subslots, builder, dataLayout);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/TypeDetail.h b/mlir/lib/Dialect/LLVMIR/IR/TypeDetail.h
index 2040d0a06b2e..8767b1c3ffc5 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/TypeDetail.h
+++ b/mlir/lib/Dialect/LLVMIR/IR/TypeDetail.h
@@ -131,8 +131,7 @@ public:
     /// Compares two keys.
     bool operator==(const Key &other) const {
       if (isIdentified())
-        return other.isIdentified() &&
-               other.getIdentifier().equals(getIdentifier());
+        return other.isIdentified() && other.getIdentifier() == getIdentifier();
 
       return !other.isIdentified() && other.isPacked() == isPacked() &&
              other.getTypeList() == getTypeList();
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 156784f0e674..13582a140a96 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -171,6 +171,54 @@ static DiagnosedSilenceableFailure unpackSingleIndexResultPayloadOperations(
   return DiagnosedSilenceableFailure::success();
 }
 
+/// When possible, converts each `OpFoldResult` in `mixedResult` to
+/// an integer if the value can be statically inferred.  If a result
+/// is a `Value` then it must be either a `ParamType` or a handle
+/// to an a constant like op.
+static DiagnosedSilenceableFailure reifyMixedParamAndHandleResults(
+    TransformState &state, TransformOpInterface &transformOp,
+    ArrayRef<OpFoldResult> mixedResults, SmallVectorImpl<int64_t> &reified) {
+  for (OpFoldResult paramOrHandle : mixedResults) {
+    if (isa<Attribute>(paramOrHandle)) {
+      reified.push_back(
+          cast<IntegerAttr>(paramOrHandle.get<Attribute>()).getInt());
+      continue;
+    } else if (isa<ParamType>(paramOrHandle.get<Value>().getType())) {
+      ArrayRef<Attribute> params = state.getParams(paramOrHandle.get<Value>());
+      if (params.size() != 1)
+        return transformOp.emitSilenceableError() << "expected a single param";
+      reified.push_back(
+          cast<IntegerAttr>(params.front()).getValue().getSExtValue());
+      continue;
+    }
+
+    Value handle = paramOrHandle.get<Value>();
+    if (!isa<TransformHandleTypeInterface>(handle.getType()))
+      return transformOp.emitSilenceableError() << "unexpected value handle";
+    auto payload = state.getPayloadOps(handle);
+    if (!llvm::hasSingleElement(payload))
+      return transformOp.emitSilenceableError()
+             << "requires param or handle that is mapped to 1 payload op";
+
+    Operation *paramOrHandlePayloadOp = *payload.begin();
+    if (paramOrHandlePayloadOp->getNumResults() != 1 ||
+        !paramOrHandlePayloadOp->getResult(0).getType().isIndex()) {
+      return transformOp.emitSilenceableError()
+             << "requires param or handle to be result of op with 1 index "
+                "result";
+    }
+
+    IntegerAttr attr;
+    if (!matchPattern(paramOrHandlePayloadOp->getResult(0), m_Constant(&attr)))
+      return transformOp.emitSilenceableError()
+             << "requires param or handle to be the result of a constant like "
+                "op";
+
+    reified.push_back(attr.getInt());
+  }
+  return DiagnosedSilenceableFailure::success();
+}
+
 //===----------------------------------------------------------------------===//
 // Apply...PatternsOp
 //===----------------------------------------------------------------------===//
@@ -1677,18 +1725,60 @@ void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target,
                /*target=*/target,
                /*paddingValues=*/ArrayAttr(), // let inference handle this
                /*paddingDimensions=*/b.getI64ArrayAttr(paddingDimensions),
+               /*padToMultipleOf=*/ValueRange{},
                /*padToMultipleOf=*/
-               (padToMultipleOf.empty() ? ArrayAttr()
-                                        : b.getI64ArrayAttr(padToMultipleOf)),
+               (padToMultipleOf.empty()
+                    ? DenseI64ArrayAttr()
+                    : b.getDenseI64ArrayAttr(padToMultipleOf)),
+               /*packPaddings=*/b.getI64ArrayAttr(packPaddings),
+               /*transposePaddings=*/b.getArrayAttr(transposePaddings),
+               /*copyBackOp=*/b.getStringAttr(copyBackOp));
+}
+
+void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target,
+                             ArrayRef<int64_t> paddingDimensions,
+                             ArrayRef<OpFoldResult> mixedPadToMultipleOf,
+                             ArrayRef<int64_t> packPaddings,
+                             ArrayRef<Attribute> transposePaddings,
+                             StringRef copyBackOp) {
+  auto resultType = transform::AnyOpType::get(b.getContext());
+  SmallVector<int64_t> staticPadToMultipleOf;
+  SmallVector<Value> dynamicPadToMultipleOf;
+  dispatchIndexOpFoldResults(mixedPadToMultipleOf, dynamicPadToMultipleOf,
+                             staticPadToMultipleOf);
+  return build(/*builder=*/b,
+               /*result=*/result,
+               /*types=*/TypeRange{resultType, resultType},
+               /*target=*/target,
+               /*paddingValues=*/ArrayAttr(), // let inference handle this
+               /*paddingDimensions=*/b.getI64ArrayAttr(paddingDimensions),
+               /*padToMultipleOf=*/dynamicPadToMultipleOf,
+               /*padToMultipleOf=*/staticPadToMultipleOf,
                /*packPaddings=*/b.getI64ArrayAttr(packPaddings),
                /*transposePaddings=*/b.getArrayAttr(transposePaddings),
                /*copyBackOp=*/b.getStringAttr(copyBackOp));
 }
 
+void PadOp::getEffects(
+    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  consumesHandle(getTarget(), effects);
+  onlyReadsHandle(getPadToMultipleOf(), effects);
+  producesHandle(getPadded(), effects);
+  producesHandle(getPad(), effects);
+  producesHandle(getCopy(), effects);
+  modifiesPayload(effects);
+}
+
+SmallVector<OpFoldResult> PadOp::getMixedPadToMultipleOf() {
+  Builder b(getContext());
+  return getMixedValues(getStaticPadToMultipleOf(), getPadToMultipleOf(), b);
+}
+
 DiagnosedSilenceableFailure
 transform::PadOp::apply(transform::TransformRewriter &rewriter,
                         transform::TransformResults &results,
                         transform::TransformState &state) {
+  auto transformOp = cast<TransformOpInterface>(getOperation());
   SmallVector<Operation *> paddedOps, padOps, copyBackOps;
 
   for (Operation *target : state.getPayloadOps(getTarget())) {
@@ -1749,10 +1839,16 @@ transform::PadOp::apply(transform::TransformRewriter &rewriter,
     LinalgPaddingOptions options;
     options.paddingDimensions =
         extractFromIntegerArrayAttr<int64_t>(getPaddingDimensions());
-    SmallVector<int64_t> padToMultipleOf(options.paddingDimensions.size(), 1);
-    if (getPadToMultipleOf().has_value())
+
+    SmallVector<int64_t> padToMultipleOf;
+    DiagnosedSilenceableFailure status = reifyMixedParamAndHandleResults(
+        state, transformOp, getMixedPadToMultipleOf(), padToMultipleOf);
+    if (!status.succeeded())
+      return status;
+    if (padToMultipleOf.empty())
       padToMultipleOf =
-          extractFromIntegerArrayAttr<int64_t>(*getPadToMultipleOf());
+          SmallVector<int64_t>(options.paddingDimensions.size(), 1);
+
     options.padToMultipleOf = padToMultipleOf;
     options.paddingValues = paddingValues;
     options.packPaddings = packPaddings;
@@ -1819,8 +1915,8 @@ LogicalResult transform::PadOp::verify() {
                             "integers, found "
                          << getPaddingDimensions();
   }
-  if (getPadToMultipleOf().has_value()) {
-    if (getPadToMultipleOf()->size() != paddingDimensions.size()) {
+  if (!getMixedPadToMultipleOf().empty()) {
+    if (getMixedPadToMultipleOf().size() != paddingDimensions.size()) {
       return emitOpError() << "expects as many multiples as padding_dimensions";
     }
   }
@@ -2727,86 +2823,6 @@ SmallVector<OpFoldResult> transform::TileUsingForOp::getMixedSizes() {
   return results;
 }
 
-// We want to parse `DenseI64ArrayAttr` using the short form without the
-// `array` prefix to be consistent in the IR with `parseDynamicIndexList`.
-ParseResult parseOptionalInterchange(OpAsmParser &parser,
-                                     OperationState &result) {
-  if (failed(parser.parseOptionalKeyword("interchange")))
-    return success();
-  if (failed(parser.parseEqual()))
-    return failure();
-  result.addAttribute(
-      transform::TileUsingForOp::getInterchangeAttrName(result.name),
-      DenseI64ArrayAttr::parse(parser, Type{}));
-  return success();
-}
-
-void printOptionalInterchange(OpAsmPrinter &p,
-                              ArrayRef<int64_t> interchangeVals) {
-  if (!interchangeVals.empty()) {
-    p << " interchange = [";
-    llvm::interleaveComma(interchangeVals, p,
-                          [&](int64_t integer) { p << integer; });
-    p << "]";
-  }
-}
-
-ParseResult transform::TileUsingForOp::parse(OpAsmParser &parser,
-                                             OperationState &result) {
-  OpAsmParser::UnresolvedOperand target;
-  SmallVector<OpAsmParser::UnresolvedOperand> dynamicSizes;
-  DenseI64ArrayAttr staticSizes;
-  FunctionType functionalType;
-  llvm::SMLoc operandLoc;
-  DenseBoolArrayAttr scalableVals;
-
-  if (parser.parseOperand(target) || parser.getCurrentLocation(&operandLoc) ||
-      parseDynamicIndexList(parser, dynamicSizes, staticSizes, scalableVals) ||
-      parseOptionalInterchange(parser, result) ||
-      parser.parseOptionalAttrDict(result.attributes) ||
-      parser.parseColonType(functionalType))
-    return ParseResult::failure();
-
-  size_t numExpectedLoops =
-      staticSizes.size() - llvm::count(staticSizes.asArrayRef(), 0);
-  if (functionalType.getNumResults() != numExpectedLoops + 1) {
-    return parser.emitError(parser.getNameLoc())
-           << "expected " << (numExpectedLoops + 1) << " result type(s)";
-  }
-  if (functionalType.getNumInputs() != dynamicSizes.size() + 1) {
-    return parser.emitError(operandLoc)
-           << "expected " << dynamicSizes.size() + 1 << " operand type(s)";
-  }
-  if (parser.resolveOperand(target, functionalType.getInputs().front(),
-                            result.operands) ||
-      parser.resolveOperands(dynamicSizes,
-                             functionalType.getInputs().drop_front(),
-                             operandLoc, result.operands)) {
-    return failure();
-  }
-
-  result.addAttribute(getScalableSizesAttrName(result.name), scalableVals);
-
-  result.addAttribute(getStaticSizesAttrName(result.name), staticSizes);
-  result.addTypes(functionalType.getResults());
-  return success();
-}
-
-void TileUsingForOp::print(OpAsmPrinter &p) {
-  p << ' ' << getTarget();
-  printDynamicIndexList(p, getOperation(), getDynamicSizes(), getStaticSizes(),
-                        /*valueTypes=*/{}, getScalableSizesAttr(),
-                        OpAsmParser::Delimiter::Square);
-  printOptionalInterchange(p, getInterchange());
-  p.printOptionalAttrDict(
-      (*this)->getAttrs(),
-      /*elidedAttrs=*/{getInterchangeAttrName(getOperation()->getName()),
-                       getScalableSizesAttrName(getOperation()->getName()),
-                       getStaticSizesAttrName(getOperation()->getName())});
-  p << " : ";
-  p.printFunctionalType(getOperands().getTypes(), getResults().getTypes());
-}
-
 void transform::TileUsingForOp::getEffects(
     SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
   consumesHandle(getTarget(), effects);
@@ -3123,80 +3139,6 @@ transform::VectorizeChildrenAndApplyPatternsOp::applyToOne(
 // VectorizeOp
 //===----------------------------------------------------------------------===//
 
-static const StringLiteral kVectorSizesKeyword = "vector_sizes";
-
-ParseResult transform::VectorizeOp::parse(OpAsmParser &parser,
-                                          OperationState &result) {
-  OpAsmParser::UnresolvedOperand target;
-  SmallVector<OpAsmParser::UnresolvedOperand> dynamicSizes;
-  DenseI64ArrayAttr staticSizes;
-  SmallVector<Type> operandTypes;
-  llvm::SMLoc operandLoc;
-  DenseBoolArrayAttr scalableVals;
-
-  if (parser.parseOperand(target) || parser.getCurrentLocation(&operandLoc))
-    return ParseResult::failure();
-
-  if (succeeded(parser.parseOptionalKeyword(kVectorSizesKeyword))) {
-    if (failed(parseDynamicIndexList(parser, dynamicSizes, staticSizes,
-                                     scalableVals)))
-      return ParseResult::failure();
-  }
-
-  if (succeeded(parser.parseOptionalKeyword(
-          getVectorizeNdExtractAttrName(result.name))))
-    result.addAttribute(getVectorizeNdExtractAttrName(result.name),
-                        parser.getBuilder().getUnitAttr());
-
-  if (parser.parseOptionalAttrDict(result.attributes) ||
-      parser.parseColonTypeList(operandTypes))
-    return ParseResult::failure();
-
-  if (operandTypes.size() != dynamicSizes.size() + 1) {
-    return parser.emitError(operandLoc)
-           << "expected " << dynamicSizes.size() + 1 << " operand type(s)";
-  }
-  if (parser.resolveOperand(target, operandTypes.front(), result.operands) ||
-      parser.resolveOperands(dynamicSizes, ArrayRef(operandTypes).drop_front(),
-                             operandLoc, result.operands)) {
-    return failure();
-  }
-
-  if (scalableVals)
-    result.addAttribute(getScalableSizesAttrName(result.name), scalableVals);
-  if (staticSizes)
-    result.addAttribute(getStaticVectorSizesAttrName(result.name), staticSizes);
-
-  return success();
-}
-
-void transform::VectorizeOp::print(OpAsmPrinter &p) {
-  p << ' ' << getTarget() << ' ';
-  if (!getMixedVectorSizes().empty()) {
-    p << kVectorSizesKeyword << ' ';
-    printDynamicIndexList(p, getOperation(), getVectorSizes(),
-                          getStaticVectorSizesAttr(),
-                          /*valueTypes=*/{}, getScalableSizesAttr(),
-                          OpAsmParser::Delimiter::Square);
-  }
-
-  if (getVectorizeNdExtract())
-    p << getVectorizeNdExtractAttrName() << ' ';
-
-  p.printOptionalAttrDict(
-      (*this)->getAttrs(),
-      /*elidedAttrs=*/{
-          getScalableSizesAttrName(getOperation()->getName()),
-          getStaticVectorSizesAttrName(getOperation()->getName())});
-  p << " : ";
-  p << getTarget().getType();
-  if (!getVectorSizes().empty()) {
-    p << ", ";
-    llvm::interleaveComma(getVectorSizes(), p,
-                          [&](Value operand) { p << operand.getType(); });
-  }
-}
-
 DiagnosedSilenceableFailure transform::VectorizeOp::apply(
     transform::TransformRewriter &rewriter,
     mlir::transform::TransformResults &transformResults,
@@ -3204,49 +3146,12 @@ DiagnosedSilenceableFailure transform::VectorizeOp::apply(
   auto targets = state.getPayloadOps(getTarget());
   if (std::empty(targets))
     return DiagnosedSilenceableFailure::success();
-
+  auto transformOp = cast<TransformOpInterface>(getOperation());
   SmallVector<int64_t> vectorSizes;
-  for (OpFoldResult sz : getMixedVectorSizes()) {
-    if (sz.is<Attribute>()) {
-      auto attr = sz.get<Attribute>();
-      vectorSizes.push_back(cast<IntegerAttr>(attr).getInt());
-      continue;
-    } else if (sz.is<Value>() && isa<ParamType>(sz.get<Value>().getType())) {
-      ArrayRef<Attribute> params = state.getParams(sz.get<Value>());
-      if (params.size() != 1)
-        return emitSilenceableFailure(getLoc()) << "expected a single param";
-      vectorSizes.push_back(
-          cast<IntegerAttr>(params.front()).getValue().getSExtValue());
-      continue;
-    }
-
-    auto szPayloads = state.getPayloadOps(sz.get<Value>());
-    if (!llvm::hasSingleElement(szPayloads)) {
-      auto diag = this->emitOpError(
-          "requires vector size handle that is mapped to 1 payload op");
-      diag.attachNote(sz.get<Value>().getLoc())
-          << "mapped to " << llvm::range_size(szPayloads) << " payload ops";
-      return DiagnosedSilenceableFailure::definiteFailure();
-    }
-
-    Operation *szPayloadOp = *szPayloads.begin();
-    if (szPayloadOp->getNumResults() != 1 ||
-        !szPayloadOp->getResult(0).getType().isIndex()) {
-      auto diag = this->emitOpError(
-          "requires vector size payload op with 1 index result");
-      diag.attachNote(szPayloadOp->getLoc()) << "vector size payload op";
-      return DiagnosedSilenceableFailure::definiteFailure();
-    }
-
-    IntegerAttr attr;
-    if (!matchPattern(szPayloadOp->getResult(0), m_Constant(&attr))) {
-      auto diag = this->emitOpError("requires constant vector size");
-      diag.attachNote(szPayloadOp->getLoc()) << "vector size payload op";
-      return DiagnosedSilenceableFailure::definiteFailure();
-    }
-
-    vectorSizes.push_back(attr.getInt());
-  }
+  DiagnosedSilenceableFailure status = reifyMixedParamAndHandleResults(
+      state, transformOp, getMixedVectorSizes(), vectorSizes);
+  if (!status.succeeded())
+    return status;
 
   // TODO: Check that the correct number of vectorSizes was provided.
   for (Operation *target : targets) {
diff --git a/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp b/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp
new file mode 100644
index 000000000000..c07d1387ec75
--- /dev/null
+++ b/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp
@@ -0,0 +1,321 @@
+//===- BlockPackMatmul.cpp - Linalg matmul block packing ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Linalg/Passes.h"
+
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Linalg/Utils/Utils.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+#include <optional>
+
+namespace mlir {
+#define GEN_PASS_DEF_LINALGBLOCKPACKMATMUL
+#include "mlir/Dialect/Linalg/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::linalg;
+
+/// Return constant range span or nullopt, otherwise.
+static std::optional<int64_t> getConstantRange(const Range &range) {
+  std::optional<int64_t> stride = getConstantIntValue(range.stride);
+  if (!stride || *stride != 1)
+    return std::nullopt;
+  std::optional<int64_t> offset = getConstantIntValue(range.offset);
+  if (!offset)
+    return std::nullopt;
+  std::optional<int64_t> size = getConstantIntValue(range.size);
+  if (!size)
+    return std::nullopt;
+  return (*size - *offset);
+}
+
+/// Return true if all dimensions are fully divisible by the respective tiles.
+static bool validateFullTilesOnDims(linalg::LinalgOp linalgOp,
+                                    ArrayRef<OpFoldResult> tiles,
+                                    ArrayRef<int64_t> dims) {
+  if (dims.size() != tiles.size() || tiles.empty())
+    return false;
+
+  FailureOr<ContractionDimensions> contractDims =
+      inferContractionDims(linalgOp);
+  if (failed(contractDims))
+    return false;
+  unsigned batchDimsOffset = contractDims->batch.size();
+
+  // Skip the batch dimension if present.
+  // Offset all dimensions accordingly.
+  SmallVector<int64_t, 3> offsetDims{dims};
+  for (size_t i = 0; i < offsetDims.size(); i++)
+    offsetDims[i] += batchDimsOffset;
+
+  auto tileOp = cast<TilingInterface>(linalgOp.getOperation());
+  OpBuilder builder(tileOp);
+  OpBuilder::InsertionGuard guard(builder);
+  SmallVector<Range> iterationDomain = tileOp.getIterationDomain(builder);
+
+  for (auto dim : llvm::enumerate(offsetDims)) {
+    if (dim.value() >= static_cast<int64_t>(iterationDomain.size()))
+      return false;
+
+    std::optional<int64_t> tileSize = getConstantIntValue(tiles[dim.index()]);
+    std::optional<int64_t> rangeOnDim =
+        getConstantRange(iterationDomain[dim.value()]);
+
+    // If the tile factor or the range are non-constant, the tile size is
+    // considered to be invalid.
+    if (!tileSize || !rangeOnDim)
+      return false;
+
+    // The dimension must be fully divisible by the tile.
+    if (*rangeOnDim % *tileSize != 0)
+      return false;
+  }
+
+  return true;
+}
+
+/// Return failure or packed matmul with one of its operands transposed.
+static FailureOr<PackTransposeResult>
+transposePackedMatmul(RewriterBase &rewriter, linalg::LinalgOp linalgOp,
+                      tensor::PackOp packOp, AffineMap operandMap,
+                      ArrayRef<unsigned> blocksStartDimPos,
+                      bool transposeOuterBlocks, bool transposeInnerBlocks) {
+  assert(operandMap.getNumDims() >= 4 &&
+         "expected at least 4D prepacked matmul");
+  assert(blocksStartDimPos.size() >= 2 &&
+         "expected starting outer and inner block positions");
+
+  // Bias toward innermost dimensions.
+  unsigned outerBlockPos = operandMap.getNumResults() - 4;
+  unsigned innerBlockPos = operandMap.getNumResults() - 2;
+
+  // Transpose control options define the desired block and element layout.
+  // Block transposition (outer dimensions) or element transposition (inner
+  // dimensions) may not be necessary depending on the original matmul data
+  // layout.
+  bool isOuterTransposed =
+      operandMap.getDimPosition(outerBlockPos) != blocksStartDimPos.end()[-2];
+  bool isInnerTransposed =
+      operandMap.getDimPosition(innerBlockPos) != blocksStartDimPos.back();
+
+  // Transpose only the dimensions that need that to conform to the provided
+  // transpotion settings.
+  SmallVector<int64_t> innerPerm{0, 1};
+  if (isInnerTransposed != transposeInnerBlocks)
+    innerPerm = {1, 0};
+  SmallVector<int64_t> outerPerm{0, 1};
+  if (isOuterTransposed != transposeOuterBlocks)
+    outerPerm = {1, 0};
+
+  // Leave the outer dimensions, like batch, unchanged by offsetting all
+  // outer dimensions permutations.
+  SmallVector<int64_t> offsetPerms;
+  for (auto i : llvm::seq(0u, outerBlockPos))
+    offsetPerms.push_back(i);
+  for (auto perm : outerPerm)
+    offsetPerms.push_back(perm + outerBlockPos);
+  outerPerm = offsetPerms;
+
+  FailureOr<PackTransposeResult> packTransposedMatmul =
+      packTranspose(rewriter, packOp, linalgOp,
+                    /*maybeUnPackOp=*/nullptr, outerPerm, innerPerm);
+
+  return packTransposedMatmul;
+}
+
+/// Pack a matmul operation into blocked 4D layout.
+FailureOr<PackResult>
+linalg::blockPackMatmul(RewriterBase &rewriter, linalg::LinalgOp linalgOp,
+                        const ControlBlockPackMatmulFn &controlPackMatmul) {
+  if (linalgOp.hasPureBufferSemantics())
+    return rewriter.notifyMatchFailure(linalgOp, "require tensor semantics");
+
+  std::optional<BlockPackMatmulOptions> options = controlPackMatmul(linalgOp);
+  if (!options)
+    return rewriter.notifyMatchFailure(linalgOp, "invalid packing options");
+
+  if (options->blockFactors.size() != 3)
+    return rewriter.notifyMatchFailure(linalgOp, "require 3 tile factors");
+
+  SmallVector<OpFoldResult> mnkTiles =
+      getAsOpFoldResult(rewriter.getI64ArrayAttr(options->blockFactors));
+
+  // If padding is disabled, make sure that dimensions can be packed cleanly.
+  if (!options->allowPadding &&
+      !validateFullTilesOnDims(linalgOp, mnkTiles, options->mnkOrder)) {
+    return rewriter.notifyMatchFailure(linalgOp,
+                                       "expect packing full tiles only");
+  }
+
+  OpBuilder::InsertionGuard guard(rewriter);
+  // The op is replaced, we need to set the insertion point after it.
+  rewriter.setInsertionPointAfter(linalgOp);
+
+  // Pack the matmul operation into blocked layout with two levels of
+  // subdivision:
+  //   - major 2D blocks - outer dimensions, consist of minor blocks
+  //   - minor 2D blocks - inner dimensions, consist of scalar elements
+  FailureOr<PackResult> packedMatmul = packMatmulGreedily(
+      rewriter, linalgOp, mnkTiles, options->mnkPaddedSizesNextMultipleOf,
+      options->mnkOrder);
+  if (failed(packedMatmul))
+    return failure();
+
+  assert(packedMatmul->packOps.size() == 3 &&
+         "invalid number of pack ops after matmul packing");
+  assert(packedMatmul->unPackOps.size() == 1 &&
+         "invalid number of unpack ops after matmul packing");
+
+  FailureOr<ContractionDimensions> contractDims =
+      inferContractionDims(packedMatmul->packedLinalgOp);
+  if (failed(contractDims))
+    return failure();
+
+  auto genericOp =
+      dyn_cast<linalg::GenericOp>(packedMatmul->packedLinalgOp.getOperation());
+  SmallVector<AffineMap> maps = genericOp.getIndexingMapsArray();
+
+  // Transpose LHS matrix according to the options.
+  FailureOr<PackTransposeResult> packedLhs = transposePackedMatmul(
+      rewriter, packedMatmul->packedLinalgOp, packedMatmul->packOps[0], maps[0],
+      contractDims->m, options->lhsTransposeOuterBlocks,
+      options->lhsTransposeInnerBlocks);
+  if (failed(packedLhs))
+    return failure();
+
+  // Update results.
+  packedMatmul->packOps[0] = packedLhs->transposedPackOp;
+  packedMatmul->packedLinalgOp = packedLhs->transposedLinalgOp;
+
+  // Transpose RHS matrix according to the options.
+  FailureOr<PackTransposeResult> packedRhs = transposePackedMatmul(
+      rewriter, packedMatmul->packedLinalgOp, packedMatmul->packOps[1], maps[1],
+      contractDims->k, options->rhsTransposeOuterBlocks,
+      options->rhsTransposeInnerBlocks);
+  if (failed(packedRhs))
+    return failure();
+
+  // Update results.
+  packedMatmul->packOps[1] = packedRhs->transposedPackOp;
+  packedMatmul->packedLinalgOp = packedRhs->transposedLinalgOp;
+
+  return packedMatmul;
+}
+
+namespace {
+template <typename OpTy>
+struct BlockPackMatmul : public OpRewritePattern<OpTy> {
+  BlockPackMatmul(MLIRContext *context, ControlBlockPackMatmulFn fun,
+                  PatternBenefit benefit = 1)
+      : OpRewritePattern<OpTy>(context, benefit), controlFn(std::move(fun)) {}
+
+  LogicalResult matchAndRewrite(OpTy linalgOp,
+                                PatternRewriter &rewriter) const override {
+    FailureOr<PackResult> packedMatmul =
+        blockPackMatmul(rewriter, linalgOp, controlFn);
+    if (failed(packedMatmul))
+      return failure();
+    return success();
+  }
+
+private:
+  ControlBlockPackMatmulFn controlFn;
+};
+
+template <>
+struct BlockPackMatmul<linalg::GenericOp>
+    : public OpRewritePattern<linalg::GenericOp> {
+  BlockPackMatmul(MLIRContext *context, ControlBlockPackMatmulFn fun,
+                  PatternBenefit benefit = 1)
+      : OpRewritePattern<linalg::GenericOp>(context, benefit),
+        controlFn(std::move(fun)) {}
+
+  LogicalResult matchAndRewrite(linalg::GenericOp linalgOp,
+                                PatternRewriter &rewriter) const override {
+    // Match suitable generics.
+    if (failed(linalg::detail::verifyContractionInterface(
+            linalgOp.getOperation()))) {
+      return rewriter.notifyMatchFailure(linalgOp, "not a contraction");
+    }
+
+    using MapList = ArrayRef<ArrayRef<AffineExpr>>;
+    auto infer = [&](MapList m) {
+      return AffineMap::inferFromExprList(m, linalgOp.getContext());
+    };
+
+    AffineExpr i, j, k;
+    bindDims(linalgOp->getContext(), i, j, k);
+    SmallVector<AffineMap> maps = linalgOp.getIndexingMapsArray();
+
+    // For now, only match simple matmuls.
+    if (!(maps == infer({{i, k}, {k, j}, {i, j}}) ||
+          maps == infer({{k, i}, {k, j}, {i, j}}) ||
+          maps == infer({{i, k}, {j, k}, {i, j}}))) {
+      return rewriter.notifyMatchFailure(linalgOp, "not a suitable matmul");
+    }
+
+    FailureOr<PackResult> packedMatmul =
+        blockPackMatmul(rewriter, linalgOp, controlFn);
+    if (failed(packedMatmul))
+      return failure();
+    return success();
+  }
+
+private:
+  ControlBlockPackMatmulFn controlFn;
+};
+
+/// Convert linalg matmul ops to block layout and back.
+struct LinalgBlockPackMatmul
+    : public impl::LinalgBlockPackMatmulBase<LinalgBlockPackMatmul> {
+  using LinalgBlockPackMatmulBase::LinalgBlockPackMatmulBase;
+
+  void runOnOperation() override {
+    Operation *op = getOperation();
+    RewritePatternSet patterns(&getContext());
+
+    ControlBlockPackMatmulFn controlFn =
+        [&](linalg::LinalgOp op) -> BlockPackMatmulOptions {
+      BlockPackMatmulOptions options;
+      options.blockFactors = SmallVector<int64_t>{*blockFactors};
+      options.allowPadding = allowPadding;
+      options.mnkPaddedSizesNextMultipleOf =
+          SmallVector<int64_t>{*mnkPaddedSizesNextMultipleOf};
+      if (!mnkOrder.empty())
+        options.mnkOrder = SmallVector<int64_t>{*mnkOrder};
+      options.lhsTransposeOuterBlocks = lhsTransposeOuterBlocks;
+      options.lhsTransposeInnerBlocks = lhsTransposeInnerBlocks;
+      options.rhsTransposeOuterBlocks = rhsTransposeOuterBlocks;
+      options.rhsTransposeInnerBlocks = rhsTransposeInnerBlocks;
+      return options;
+    };
+
+    linalg::populateBlockPackMatmulPatterns(patterns, controlFn);
+    if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
+      return signalPassFailure();
+  }
+};
+} // namespace
+
+void linalg::populateBlockPackMatmulPatterns(
+    RewritePatternSet &patterns, const ControlBlockPackMatmulFn &controlFn) {
+  patterns.add<BlockPackMatmul<linalg::GenericOp>,
+               BlockPackMatmul<linalg::MatmulOp>,
+               BlockPackMatmul<linalg::BatchMatmulOp>,
+               BlockPackMatmul<linalg::MatmulTransposeAOp>,
+               BlockPackMatmul<linalg::BatchMatmulTransposeAOp>,
+               BlockPackMatmul<linalg::MatmulTransposeBOp>,
+               BlockPackMatmul<linalg::BatchMatmulTransposeBOp>>(
+      patterns.getContext(), controlFn);
+}
diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
index 3b5282a09569..ed9f40089282 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
@@ -25,6 +25,7 @@ add_mlir_dialect_library(MLIRLinalgTransforms
   TransposeMatmul.cpp
   MeshShardingInterfaceImpl.cpp
   NamedOpConversions.cpp
+  BlockPackMatmul.cpp
   Padding.cpp
   Promotion.cpp
   RuntimeOpVerification.cpp
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
index 89fb4944c0ca..ad313c2d5ce6 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
@@ -432,12 +432,6 @@ public:
 
       Operation *producer = opOperand.get().getDefiningOp();
 
-      // Do not fuse a sparse-in/dense-out operation, as the
-      // result is too often not sparsifiable anymore.
-      if (sparse_tensor::hasAnySparseOperand(producer) &&
-          !sparse_tensor::hasAnySparseResult(producer))
-        return failure();
-
       // Find the producer of the operand.
       FailureOr<ElementwiseOpFusionResult> fusionResult =
           fuseElementwiseOps(rewriter, &opOperand);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index ef9a30be9a01..7b4507c52e02 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1414,27 +1414,39 @@ static SmallVector<int64_t> getTiledPackShape(tensor::PackOp packOp,
 /// create an empty destination tensor and create a TransferWriteOp from the
 /// input to the empty tensor. If the destination shape is not the same as the
 /// inputVectorSizes for the first rank(inputVectorSizes) dims, then create a
-/// mask for the write.
+/// mask for the write. If `useInBoundsInsteadOfMasking` is set, then update the
+/// inBounds attribute of the transfer write op instead of masking.
 static Operation *createWriteOrMaskedWrite(OpBuilder &builder, Location loc,
                                            Value input,
                                            SmallVector<OpFoldResult> destSizes,
-                                           ArrayRef<int64_t> inputVectorSizes) {
+                                           ArrayRef<int64_t> inputVectorSizes,
+                                           bool useInBoundsInsteadOfMasking) {
+
   auto inputType = cast<VectorType>(input.getType());
   Value dest = builder.create<tensor::EmptyOp>(loc, destSizes,
                                                inputType.getElementType());
   int64_t rank = cast<ShapedType>(dest.getType()).getRank();
   auto zero = builder.create<arith::ConstantIndexOp>(loc, 0);
+  auto destShape = cast<ShapedType>(dest.getType()).getShape();
+  SmallVector<bool> inBoundsVal(rank, true);
+  if (useInBoundsInsteadOfMasking) {
+    // Update the inBounds attribute.
+    for (unsigned i = 0; i < rank; i++)
+      inBoundsVal[i] = (destShape[i] == inputVectorSizes[i]) &&
+                       !ShapedType::isDynamic(destShape[i]);
+  }
   Operation *write = builder.create<vector::TransferWriteOp>(
       loc,
       /*vector=*/input,
       /*source=*/dest,
       /*indices=*/SmallVector<Value>(rank, zero),
-      /*inBounds=*/SmallVector<bool>(rank, true));
-  auto destShape = cast<ShapedType>(dest.getType()).getShape();
+      /*inBounds=*/inBoundsVal);
   assert(llvm::none_of(
              destShape.drop_front(inputVectorSizes.size()),
              [](int64_t size) { return size == ShapedType::kDynamic; }) &&
          "Only dims aligned with inputVectorSizes may be dynamic");
+  if (useInBoundsInsteadOfMasking)
+    return write;
   bool needMaskForWrite = !llvm::equal(
       inputVectorSizes, destShape.take_front(inputVectorSizes.size()));
   if (needMaskForWrite) {
@@ -1535,9 +1547,9 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp,
       loc, shapeCastOp.getResult(), destPermutation);
 
   // Create TransferWriteOp.
-  Operation *write =
-      createWriteOrMaskedWrite(rewriter, loc, transposeOp.getResult(),
-                               reifiedReturnShapes[0], inputVectorSizes);
+  Operation *write = createWriteOrMaskedWrite(
+      rewriter, loc, transposeOp.getResult(), reifiedReturnShapes[0],
+      inputVectorSizes, /*useInBoundsInsteadOfMasking=*/false);
   newResults.push_back(write->getResult(0));
   return success();
 }
@@ -1547,7 +1559,10 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp,
 ///   vector::TransposeOp - Transpose the Source tensor
 ///   ShapeCastOp - Reshape the data based on the target.
 ///   vector::TransferWriteOp. - Write the result vector back to the destination
-///   tensor
+///   tensor.
+///   If the vector sizes are not provided:
+///   * the vector sizes are determined by the input operand and attributes,
+///   * update the inBounds attribute instead of masking.
 static LogicalResult
 vectorizeAsTensorUnpackOp(RewriterBase &rewriter, tensor::UnPackOp unpackOp,
                           ArrayRef<int64_t> inputVectorSizes,
@@ -1560,40 +1575,65 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, tensor::UnPackOp unpackOp,
 
   ArrayRef<int64_t> innerDimPos = unpackOp.getInnerDimsPos();
   ArrayRef<int64_t> innerTiles = unpackOp.getStaticInnerTiles();
-
-  SmallVector<int64_t> readMaskShape(inputVectorSizes.begin(),
-                                     inputVectorSizes.end());
-  ArrayRef<int64_t> outerDimsPerm = unpackOp.getOuterDimsPerm();
   ArrayRef<int64_t> sourceShape = unpackTensorType.getShape();
+  bool useInBoundsInsteadOfMasking = false;
+  ArrayRef<int64_t> outerDimsPerm = unpackOp.getOuterDimsPerm();
+
+  auto destSize = unpackOp.getDestRank();
+
+  if (!inputVectorSizes.empty())
+    assert(inputVectorSizes.size() == destSize &&
+           "Incorrect number of input vector sizes");
 
-  // ReadMask is the size of tensor used to read and apply mask. It is
+  // vectorSizes is the shape of the vector that will be used to do final
+  // write on the destination tensor. It is set like this: Let's say the
+  // source tensor is rank 'M' and the dest tensor rank 'N', where N <= M.
+  // Thus:
+  // 1. vectorSizes = sourceShape.take_front(N)
+  // 2. if outer_dims_perms is present: do that permutation on vectorSizes.
+  // 3. multiply all the locations in vectorSize pointed by innerDimPos by the
+  //    innerTiles attribute value.
+  SmallVector<int64_t> vectorSizes(inputVectorSizes);
+  if (vectorSizes.empty()) {
+    llvm::append_range(vectorSizes, sourceShape.take_front(destSize));
+    if (!outerDimsPerm.empty())
+      applyPermutationToVector(vectorSizes, outerDimsPerm);
+    for (auto [i, pos] : llvm::enumerate(innerDimPos))
+      vectorSizes[pos] *= innerTiles[i];
+
+    useInBoundsInsteadOfMasking = true;
+  }
+
+  // readVectorSizes is the size of tensor used to read and apply mask. It is
   // set like this: Let's say the vectorSize (VS) array is size 'N' and
   // the sourceShape(SS) is 'M' where M >= N and InnerTileSizes (IT) of
   // size M-N
   // Thus:
-  // - initially: ReadMaskShape = vectorInputSizes
+  // - initially: readVectorSizes = vectorInputSizes
   // - Divide all the readMaskShape locations pointed by innerDimPos
   //   by the innerTileSize attribute value.
-  // - if outer_dims_perms is present: do that permutation on readMaskShape.
+  // - if outer_dims_perms is present: do that permutation on readVectorSizes.
   // - Append the remaining shape from SS
   // E.g. let's say let's say unpackTensorType.getShape() = <8x8x32x16>
   // inner Dim Pos = [0, 1] and Inner Tiles = [32, 16], vector_sizes are [512,
   // 128] and outer_dims_perm is [1, 0] then read shape is:
-  //   ReadMaskShape(initial): [512, 128]
+  //   ReadVectorSizes(initial): [512, 128]
   //   Final Value(after innerDim Adjustment): [512/32, 128/16]
   //                                           = [16, 8]
   //   After applying outer_dims_perm: [8, 16]
   //   After appending the rest of the sourceShape: [8, 16, 32, 16]
 
+  SmallVector<int64_t> readVectorSizes(vectorSizes.begin(), vectorSizes.end());
+
   for (auto [index, size] : enumerate(innerTiles)) {
-    readMaskShape[innerDimPos[index]] =
-        llvm::divideCeil(readMaskShape[innerDimPos[index]], size);
+    readVectorSizes[innerDimPos[index]] =
+        llvm::divideCeil(readVectorSizes[innerDimPos[index]], size);
   }
   if (!outerDimsPerm.empty()) {
-    applyPermutationToVector(readMaskShape, outerDimsPerm);
+    applyPermutationToVector(readVectorSizes, outerDimsPerm);
   }
-  readMaskShape.append(sourceShape.begin() + inputVectorSizes.size(),
-                       sourceShape.end());
+  readVectorSizes.append(sourceShape.begin() + vectorSizes.size(),
+                         sourceShape.end());
 
   ReifiedRankedShapedTypeDims reifiedRetShapes;
   LogicalResult status =
@@ -1611,8 +1651,7 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, tensor::UnPackOp unpackOp,
   // Read result, mask if necessary. If transferReadOp shape is not equal
   // to shape of source, then a mask is necessary.
   Value readResult = vector::createReadOrMaskedRead(
-      rewriter, loc, unpackOp.getSource(),
-      ArrayRef<int64_t>(readMaskShape.begin(), readMaskShape.end()), padValue,
+      rewriter, loc, unpackOp.getSource(), readVectorSizes, padValue,
       /*useInBoundsInsteadOfMasking=*/false);
 
   PackingMetadata packMetadata;
@@ -1636,15 +1675,15 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, tensor::UnPackOp unpackOp,
   vector::ShapeCastOp shapeCastOp = rewriter.create<vector::ShapeCastOp>(
       loc, vecCollapsedType, transposeOp->getResult(0));
 
-  // WriteMaskShape had to match the shapecast shape for dynamic sizes,
+  // writeVectorSizes had to match the shapecast shape for dynamic sizes,
   // otherwise the validator complains that the mask size is invalid.
-  SmallVector<int64_t> writeMaskShape(
+  SmallVector<int64_t> writeVectorSizes(
       unpackOp.getDestType().hasStaticShape()
-          ? inputVectorSizes
+          ? vectorSizes
           : shapeCastOp.getResultVectorType().getShape());
-  Operation *write =
-      createWriteOrMaskedWrite(rewriter, loc, shapeCastOp.getResult(),
-                               reifiedRetShapes[0], writeMaskShape);
+  Operation *write = createWriteOrMaskedWrite(
+      rewriter, loc, shapeCastOp.getResult(), reifiedRetShapes[0],
+      writeVectorSizes, useInBoundsInsteadOfMasking);
   newResults.push_back(write->getResult(0));
   return success();
 }
@@ -1673,7 +1712,8 @@ vectorizeAsTensorPadOp(RewriterBase &rewriter, tensor::PadOp padOp,
       rewriter, loc, padOp.getSource(), inputVectorSizes, padValue,
       /*useInBoundsInsteadOfMasking=*/false);
   Operation *write = createWriteOrMaskedWrite(
-      rewriter, loc, maskedRead, reifiedReturnShapes[0], inputVectorSizes);
+      rewriter, loc, maskedRead, reifiedReturnShapes[0], inputVectorSizes,
+      /*useInBoundsInsteadOfMasking=*/false);
   newResults.push_back(write->getResult(0));
   return success();
 }
@@ -1755,8 +1795,11 @@ vectorizeUnPackOpPrecondition(tensor::UnPackOp unpackOp,
     LDBG("Inner-tiles must be constant: " << unpackOp << "\n");
     return failure();
   }
-  llvm::ArrayRef<int64_t> resultShape = unpackOp.getDestType().getShape();
-  if (!inputVectorSizes.empty() &&
+  ArrayRef<int64_t> resultShape = unpackOp.getDestType().getShape();
+  bool satisfyEmptyCond = inputVectorSizes.empty() &&
+                          unpackOp.getDestType().hasStaticShape() &&
+                          unpackOp.getSourceType().hasStaticShape();
+  if (!satisfyEmptyCond &&
       failed(vector::isValidMaskedInputVector(resultShape, inputVectorSizes)))
     return failure();
 
diff --git a/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp b/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp
index 42629e149e9f..80569d95137c 100644
--- a/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp
+++ b/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp
@@ -73,14 +73,14 @@ static LogicalResult convertSinhOp(math::SinhOp op, PatternRewriter &rewriter) {
   ImplicitLocOpBuilder b(op->getLoc(), rewriter);
   Value operand = op.getOperand();
   Type opType = operand.getType();
-  Value exp = b.create<math::ExpOp>(operand);
 
-  Value one = createFloatConst(op->getLoc(), opType, 1.0, rewriter);
-  Value nexp = b.create<arith::DivFOp>(one, exp);
+  Value exp = b.create<math::ExpOp>(operand);
+  Value neg = b.create<arith::NegFOp>(operand);
+  Value nexp = b.create<math::ExpOp>(neg);
   Value sub = b.create<arith::SubFOp>(exp, nexp);
-  Value two = createFloatConst(op->getLoc(), opType, 2.0, rewriter);
-  Value div = b.create<arith::DivFOp>(sub, two);
-  rewriter.replaceOp(op, div);
+  Value half = createFloatConst(op->getLoc(), opType, 0.5, rewriter);
+  Value res = b.create<arith::MulFOp>(sub, half);
+  rewriter.replaceOp(op, res);
   return success();
 }
 
@@ -89,14 +89,14 @@ static LogicalResult convertCoshOp(math::CoshOp op, PatternRewriter &rewriter) {
   ImplicitLocOpBuilder b(op->getLoc(), rewriter);
   Value operand = op.getOperand();
   Type opType = operand.getType();
-  Value exp = b.create<math::ExpOp>(operand);
 
-  Value one = createFloatConst(op->getLoc(), opType, 1.0, rewriter);
-  Value nexp = b.create<arith::DivFOp>(one, exp);
+  Value exp = b.create<math::ExpOp>(operand);
+  Value neg = b.create<arith::NegFOp>(operand);
+  Value nexp = b.create<math::ExpOp>(neg);
   Value add = b.create<arith::AddFOp>(exp, nexp);
-  Value two = createFloatConst(op->getLoc(), opType, 2.0, rewriter);
-  Value div = b.create<arith::DivFOp>(add, two);
-  rewriter.replaceOp(op, div);
+  Value half = createFloatConst(op->getLoc(), opType, 0.5, rewriter);
+  Value res = b.create<arith::MulFOp>(add, half);
+  rewriter.replaceOp(op, res);
   return success();
 }
 
@@ -152,6 +152,57 @@ static LogicalResult convertTanOp(math::TanOp op, PatternRewriter &rewriter) {
   return success();
 }
 
+// asinh(float x) -> log(x + sqrt(x**2 + 1))
+static LogicalResult convertAsinhOp(math::AsinhOp op,
+                                    PatternRewriter &rewriter) {
+  ImplicitLocOpBuilder b(op->getLoc(), rewriter);
+  Value operand = op.getOperand();
+  Type opType = operand.getType();
+
+  Value one = createFloatConst(op->getLoc(), opType, 1.0, rewriter);
+  Value fma = b.create<math::FmaOp>(operand, operand, one);
+  Value sqrt = b.create<math::SqrtOp>(fma);
+  Value add = b.create<arith::AddFOp>(operand, sqrt);
+  Value res = b.create<math::LogOp>(add);
+  rewriter.replaceOp(op, res);
+  return success();
+}
+
+// acosh(float x) -> log(x + sqrt(x**2 - 1))
+static LogicalResult convertAcoshOp(math::AcoshOp op,
+                                    PatternRewriter &rewriter) {
+  ImplicitLocOpBuilder b(op->getLoc(), rewriter);
+  Value operand = op.getOperand();
+  Type opType = operand.getType();
+
+  Value negOne = createFloatConst(op->getLoc(), opType, -1.0, rewriter);
+  Value fma = b.create<math::FmaOp>(operand, operand, negOne);
+  Value sqrt = b.create<math::SqrtOp>(fma);
+  Value add = b.create<arith::AddFOp>(operand, sqrt);
+  Value res = b.create<math::LogOp>(add);
+  rewriter.replaceOp(op, res);
+  return success();
+}
+
+// atanh(float x) -> log((1 + x) / (1 - x)) / 2
+static LogicalResult convertAtanhOp(math::AtanhOp op,
+                                    PatternRewriter &rewriter) {
+  ImplicitLocOpBuilder b(op->getLoc(), rewriter);
+  Value operand = op.getOperand();
+  Type opType = operand.getType();
+
+  Value one = createFloatConst(op->getLoc(), opType, 1.0, rewriter);
+  Value add = b.create<arith::AddFOp>(operand, one);
+  Value neg = b.create<arith::NegFOp>(operand);
+  Value sub = b.create<arith::AddFOp>(neg, one);
+  Value div = b.create<arith::DivFOp>(add, sub);
+  Value log = b.create<math::LogOp>(div);
+  Value half = createFloatConst(op->getLoc(), opType, 0.5, rewriter);
+  Value res = b.create<arith::MulFOp>(log, half);
+  rewriter.replaceOp(op, res);
+  return success();
+}
+
 static LogicalResult convertFmaFOp(math::FmaOp op, PatternRewriter &rewriter) {
   ImplicitLocOpBuilder b(op->getLoc(), rewriter);
   Value operandA = op.getOperand(0);
@@ -564,6 +615,23 @@ static LogicalResult convertRoundEvenOp(math::RoundEvenOp op,
   return success();
 }
 
+// Convert `math.rsqrt` into `arith.divf` + `math.sqrt`
+static LogicalResult convertRsqrtOp(math::RsqrtOp op,
+                                    PatternRewriter &rewriter) {
+
+  auto operand = op.getOperand();
+  auto operandTy = operand.getType();
+  auto eTy = getElementTypeOrSelf(operandTy);
+  if (!isa<FloatType>(eTy))
+    return failure();
+
+  Location loc = op->getLoc();
+  auto constOneFloat = createFloatConst(loc, operandTy, 1.0, rewriter);
+  auto sqrtOp = rewriter.create<math::SqrtOp>(loc, operand);
+  rewriter.replaceOpWithNewOp<arith::DivFOp>(op, constOneFloat, sqrtOp);
+  return success();
+}
+
 void mlir::populateExpandCtlzPattern(RewritePatternSet &patterns) {
   patterns.add(convertCtlzOp);
 }
@@ -584,6 +652,18 @@ void mlir::populateExpandTanhPattern(RewritePatternSet &patterns) {
   patterns.add(convertTanhOp);
 }
 
+void mlir::populateExpandAsinhPattern(RewritePatternSet &patterns) {
+  patterns.add(convertAsinhOp);
+}
+
+void mlir::populateExpandAcoshPattern(RewritePatternSet &patterns) {
+  patterns.add(convertAcoshOp);
+}
+
+void mlir::populateExpandAtanhPattern(RewritePatternSet &patterns) {
+  patterns.add(convertAtanhOp);
+}
+
 void mlir::populateExpandFmaFPattern(RewritePatternSet &patterns) {
   patterns.add(convertFmaFOp);
 }
@@ -615,3 +695,7 @@ void mlir::populateExpandFloorFPattern(RewritePatternSet &patterns) {
 void mlir::populateExpandRoundEvenPattern(RewritePatternSet &patterns) {
   patterns.add(convertRoundEvenOp);
 }
+
+void mlir::populateExpandRsqrtPattern(RewritePatternSet &patterns) {
+  patterns.add(convertRsqrtOp);
+}
diff --git a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
index 428c1c37c4e8..f4fae68da63b 100644
--- a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
+++ b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
@@ -822,6 +822,153 @@ Log1pApproximation::matchAndRewrite(math::Log1pOp op,
 }
 
 //----------------------------------------------------------------------------//
+// Asin approximation.
+//----------------------------------------------------------------------------//
+
+// Approximates asin(x).
+// This approximation is based on the following stackoverflow post:
+// https://stackoverflow.com/a/42683455
+namespace {
+struct AsinPolynomialApproximation : public OpRewritePattern<math::AsinOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(math::AsinOp op,
+                                PatternRewriter &rewriter) const final;
+};
+} // namespace
+LogicalResult
+AsinPolynomialApproximation::matchAndRewrite(math::AsinOp op,
+                                             PatternRewriter &rewriter) const {
+  Value operand = op.getOperand();
+  Type elementType = getElementTypeOrSelf(operand);
+
+  if (!(elementType.isF32() || elementType.isF16()))
+    return rewriter.notifyMatchFailure(op,
+                                       "only f32 and f16 type is supported.");
+  VectorShape shape = vectorShape(operand);
+
+  ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
+  auto bcast = [&](Value value) -> Value {
+    return broadcast(builder, value, shape);
+  };
+
+  auto fma = [&](Value a, Value b, Value c) -> Value {
+    return builder.create<math::FmaOp>(a, b, c);
+  };
+
+  auto mul = [&](Value a, Value b) -> Value {
+    return builder.create<arith::MulFOp>(a, b);
+  };
+
+  Value s = mul(operand, operand);
+  Value q = mul(s, s);
+  Value r = bcast(floatCst(builder, 5.5579749017470502e-2, elementType));
+  Value t = bcast(floatCst(builder, -6.2027913464120114e-2, elementType));
+
+  r = fma(r, q, bcast(floatCst(builder, 5.4224464349245036e-2, elementType)));
+  t = fma(t, q, bcast(floatCst(builder, -1.1326992890324464e-2, elementType)));
+  r = fma(r, q, bcast(floatCst(builder, 1.5268872539397656e-2, elementType)));
+  t = fma(t, q, bcast(floatCst(builder, 1.0493798473372081e-2, elementType)));
+  r = fma(r, q, bcast(floatCst(builder, 1.4106045900607047e-2, elementType)));
+  t = fma(t, q, bcast(floatCst(builder, 1.7339776384962050e-2, elementType)));
+  r = fma(r, q, bcast(floatCst(builder, 2.2372961589651054e-2, elementType)));
+  t = fma(t, q, bcast(floatCst(builder, 3.0381912707941005e-2, elementType)));
+  r = fma(r, q, bcast(floatCst(builder, 4.4642857881094775e-2, elementType)));
+  t = fma(t, q, bcast(floatCst(builder, 7.4999999991367292e-2, elementType)));
+  r = fma(r, s, t);
+  r = fma(r, s, bcast(floatCst(builder, 1.6666666666670193e-1, elementType)));
+  t = mul(operand, s);
+  r = fma(r, t, operand);
+
+  rewriter.replaceOp(op, r);
+  return success();
+}
+
+//----------------------------------------------------------------------------//
+// Acos approximation.
+//----------------------------------------------------------------------------//
+
+// Approximates acos(x).
+// This approximation is based on the following stackoverflow post:
+// https://stackoverflow.com/a/42683455
+namespace {
+struct AcosPolynomialApproximation : public OpRewritePattern<math::AcosOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(math::AcosOp op,
+                                PatternRewriter &rewriter) const final;
+};
+} // namespace
+LogicalResult
+AcosPolynomialApproximation::matchAndRewrite(math::AcosOp op,
+                                             PatternRewriter &rewriter) const {
+  Value operand = op.getOperand();
+  Type elementType = getElementTypeOrSelf(operand);
+
+  if (!(elementType.isF32() || elementType.isF16()))
+    return rewriter.notifyMatchFailure(op,
+                                       "only f32 and f16 type is supported.");
+  VectorShape shape = vectorShape(operand);
+
+  ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
+  auto bcast = [&](Value value) -> Value {
+    return broadcast(builder, value, shape);
+  };
+
+  auto fma = [&](Value a, Value b, Value c) -> Value {
+    return builder.create<math::FmaOp>(a, b, c);
+  };
+
+  auto mul = [&](Value a, Value b) -> Value {
+    return builder.create<arith::MulFOp>(a, b);
+  };
+
+  Value negOperand = builder.create<arith::NegFOp>(operand);
+  Value zero = bcast(floatCst(builder, 0.0, elementType));
+  Value half = bcast(floatCst(builder, 0.5, elementType));
+  Value negOne = bcast(floatCst(builder, -1.0, elementType));
+  Value selR =
+      builder.create<arith::CmpFOp>(arith::CmpFPredicate::OGT, operand, zero);
+  Value r = builder.create<arith::SelectOp>(selR, negOperand, operand);
+  Value chkConst = bcast(floatCst(builder, -0.5625, elementType));
+  Value firstPred =
+      builder.create<arith::CmpFOp>(arith::CmpFPredicate::OGT, r, chkConst);
+
+  Value trueVal =
+      fma(bcast(floatCst(builder, 9.3282184640716537e-1, elementType)),
+          bcast(floatCst(builder, 1.6839188885261840e+0, elementType)),
+          builder.create<math::AsinOp>(r));
+
+  Value falseVal = builder.create<math::SqrtOp>(fma(half, r, half));
+  falseVal = builder.create<math::AsinOp>(falseVal);
+  falseVal = mul(bcast(floatCst(builder, 2.0, elementType)), falseVal);
+
+  r = builder.create<arith::SelectOp>(firstPred, trueVal, falseVal);
+
+  // Check whether the operand lies in between [-1.0, 0.0).
+  Value greaterThanNegOne =
+      builder.create<arith::CmpFOp>(arith::CmpFPredicate::OGE, operand, negOne);
+
+  Value lessThanZero =
+      builder.create<arith::CmpFOp>(arith::CmpFPredicate::OLT, operand, zero);
+
+  Value betweenNegOneZero =
+      builder.create<arith::AndIOp>(greaterThanNegOne, lessThanZero);
+
+  trueVal = fma(bcast(floatCst(builder, 1.8656436928143307e+0, elementType)),
+                bcast(floatCst(builder, 1.6839188885261840e+0, elementType)),
+                builder.create<arith::NegFOp>(r));
+
+  Value finalVal =
+      builder.create<arith::SelectOp>(betweenNegOneZero, trueVal, r);
+
+  rewriter.replaceOp(op, finalVal);
+  return success();
+}
+
+//----------------------------------------------------------------------------//
 // Erf approximation.
 //----------------------------------------------------------------------------//
 
@@ -1505,12 +1652,13 @@ void mlir::populateMathPolynomialApproximationPatterns(
            ReuseF32Expansion<math::SinOp>, ReuseF32Expansion<math::CosOp>>(
           patterns.getContext());
 
-  patterns.add<AtanApproximation, Atan2Approximation, TanhApproximation,
-               LogApproximation, Log2Approximation, Log1pApproximation,
-               ErfPolynomialApproximation, ExpApproximation, ExpM1Approximation,
-               CbrtApproximation, SinAndCosApproximation<true, math::SinOp>,
-               SinAndCosApproximation<false, math::CosOp>>(
-      patterns.getContext());
+  patterns
+      .add<AtanApproximation, Atan2Approximation, TanhApproximation,
+           LogApproximation, Log2Approximation, Log1pApproximation,
+           ErfPolynomialApproximation, AsinPolynomialApproximation,
+           AcosPolynomialApproximation, ExpApproximation, ExpM1Approximation,
+           CbrtApproximation, SinAndCosApproximation<true, math::SinOp>,
+           SinAndCosApproximation<false, math::CosOp>>(patterns.getContext());
   if (options.enableAvx2) {
     patterns.add<RsqrtApproximation, ReuseF32Expansion<math::RsqrtOp>>(
         patterns.getContext());
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefMemorySlot.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefMemorySlot.cpp
index 958c5f0c8dbc..e30598e6878f 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefMemorySlot.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefMemorySlot.cpp
@@ -83,30 +83,32 @@ SmallVector<MemorySlot> memref::AllocaOp::getPromotableSlots() {
 }
 
 Value memref::AllocaOp::getDefaultValue(const MemorySlot &slot,
-                                        RewriterBase &rewriter) {
+                                        OpBuilder &builder) {
   assert(isSupportedElementType(slot.elemType));
   // TODO: support more types.
   return TypeSwitch<Type, Value>(slot.elemType)
       .Case([&](MemRefType t) {
-        return rewriter.create<memref::AllocaOp>(getLoc(), t);
+        return builder.create<memref::AllocaOp>(getLoc(), t);
       })
       .Default([&](Type t) {
-        return rewriter.create<arith::ConstantOp>(getLoc(), t,
-                                                  rewriter.getZeroAttr(t));
+        return builder.create<arith::ConstantOp>(getLoc(), t,
+                                                 builder.getZeroAttr(t));
       });
 }
 
-void memref::AllocaOp::handlePromotionComplete(const MemorySlot &slot,
-                                               Value defaultValue,
-                                               RewriterBase &rewriter) {
+std::optional<PromotableAllocationOpInterface>
+memref::AllocaOp::handlePromotionComplete(const MemorySlot &slot,
+                                          Value defaultValue,
+                                          OpBuilder &builder) {
   if (defaultValue.use_empty())
-    rewriter.eraseOp(defaultValue.getDefiningOp());
-  rewriter.eraseOp(*this);
+    defaultValue.getDefiningOp()->erase();
+  this->erase();
+  return std::nullopt;
 }
 
 void memref::AllocaOp::handleBlockArgument(const MemorySlot &slot,
                                            BlockArgument argument,
-                                           RewriterBase &rewriter) {}
+                                           OpBuilder &builder) {}
 
 SmallVector<DestructurableMemorySlot>
 memref::AllocaOp::getDestructurableSlots() {
@@ -127,8 +129,8 @@ memref::AllocaOp::getDestructurableSlots() {
 DenseMap<Attribute, MemorySlot>
 memref::AllocaOp::destructure(const DestructurableMemorySlot &slot,
                               const SmallPtrSetImpl<Attribute> &usedIndices,
-                              RewriterBase &rewriter) {
-  rewriter.setInsertionPointAfter(*this);
+                              OpBuilder &builder) {
+  builder.setInsertionPointAfter(*this);
 
   DenseMap<Attribute, MemorySlot> slotMap;
 
@@ -136,7 +138,7 @@ memref::AllocaOp::destructure(const DestructurableMemorySlot &slot,
   for (Attribute usedIndex : usedIndices) {
     Type elemType = memrefType.getTypeAtIndex(usedIndex);
     MemRefType elemPtr = MemRefType::get({}, elemType);
-    auto subAlloca = rewriter.create<memref::AllocaOp>(getLoc(), elemPtr);
+    auto subAlloca = builder.create<memref::AllocaOp>(getLoc(), elemPtr);
     slotMap.try_emplace<MemorySlot>(usedIndex,
                                     {subAlloca.getResult(), elemType});
   }
@@ -145,9 +147,9 @@ memref::AllocaOp::destructure(const DestructurableMemorySlot &slot,
 }
 
 void memref::AllocaOp::handleDestructuringComplete(
-    const DestructurableMemorySlot &slot, RewriterBase &rewriter) {
+    const DestructurableMemorySlot &slot, OpBuilder &builder) {
   assert(slot.ptr == getResult());
-  rewriter.eraseOp(*this);
+  this->erase();
 }
 
 //===----------------------------------------------------------------------===//
@@ -160,7 +162,7 @@ bool memref::LoadOp::loadsFrom(const MemorySlot &slot) {
 
 bool memref::LoadOp::storesTo(const MemorySlot &slot) { return false; }
 
-Value memref::LoadOp::getStored(const MemorySlot &slot, RewriterBase &rewriter,
+Value memref::LoadOp::getStored(const MemorySlot &slot, OpBuilder &builder,
                                 Value reachingDef,
                                 const DataLayout &dataLayout) {
   llvm_unreachable("getStored should not be called on LoadOp");
@@ -179,11 +181,11 @@ bool memref::LoadOp::canUsesBeRemoved(
 
 DeletionKind memref::LoadOp::removeBlockingUses(
     const MemorySlot &slot, const SmallPtrSetImpl<OpOperand *> &blockingUses,
-    RewriterBase &rewriter, Value reachingDefinition,
+    OpBuilder &builder, Value reachingDefinition,
     const DataLayout &dataLayout) {
   // `canUsesBeRemoved` checked this blocking use must be the loaded slot
   // pointer.
-  rewriter.replaceAllUsesWith(getResult(), reachingDefinition);
+  getResult().replaceAllUsesWith(reachingDefinition);
   return DeletionKind::Delete;
 }
 
@@ -224,15 +226,13 @@ bool memref::LoadOp::canRewire(const DestructurableMemorySlot &slot,
 
 DeletionKind memref::LoadOp::rewire(const DestructurableMemorySlot &slot,
                                     DenseMap<Attribute, MemorySlot> &subslots,
-                                    RewriterBase &rewriter,
+                                    OpBuilder &builder,
                                     const DataLayout &dataLayout) {
   Attribute index = getAttributeIndexFromIndexOperands(
       getContext(), getIndices(), getMemRefType());
   const MemorySlot &memorySlot = subslots.at(index);
-  rewriter.modifyOpInPlace(*this, [&]() {
-    setMemRef(memorySlot.ptr);
-    getIndicesMutable().clear();
-  });
+  setMemRef(memorySlot.ptr);
+  getIndicesMutable().clear();
   return DeletionKind::Keep;
 }
 
@@ -242,7 +242,7 @@ bool memref::StoreOp::storesTo(const MemorySlot &slot) {
   return getMemRef() == slot.ptr;
 }
 
-Value memref::StoreOp::getStored(const MemorySlot &slot, RewriterBase &rewriter,
+Value memref::StoreOp::getStored(const MemorySlot &slot, OpBuilder &builder,
                                  Value reachingDef,
                                  const DataLayout &dataLayout) {
   return getValue();
@@ -261,7 +261,7 @@ bool memref::StoreOp::canUsesBeRemoved(
 
 DeletionKind memref::StoreOp::removeBlockingUses(
     const MemorySlot &slot, const SmallPtrSetImpl<OpOperand *> &blockingUses,
-    RewriterBase &rewriter, Value reachingDefinition,
+    OpBuilder &builder, Value reachingDefinition,
     const DataLayout &dataLayout) {
   return DeletionKind::Delete;
 }
@@ -282,15 +282,13 @@ bool memref::StoreOp::canRewire(const DestructurableMemorySlot &slot,
 
 DeletionKind memref::StoreOp::rewire(const DestructurableMemorySlot &slot,
                                      DenseMap<Attribute, MemorySlot> &subslots,
-                                     RewriterBase &rewriter,
+                                     OpBuilder &builder,
                                      const DataLayout &dataLayout) {
   Attribute index = getAttributeIndexFromIndexOperands(
       getContext(), getIndices(), getMemRefType());
   const MemorySlot &memorySlot = subslots.at(index);
-  rewriter.modifyOpInPlace(*this, [&]() {
-    setMemRef(memorySlot.ptr);
-    getIndicesMutable().clear();
-  });
+  setMemRef(memorySlot.ptr);
+  getIndicesMutable().clear();
   return DeletionKind::Keep;
 }
 
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index b969d41d934d..199e7330a233 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -1742,20 +1742,18 @@ ParseResult PrefetchOp::parse(OpAsmParser &parser, OperationState &result) {
       parser.resolveOperands(indexInfo, indexTy, result.operands))
     return failure();
 
-  if (!readOrWrite.equals("read") && !readOrWrite.equals("write"))
+  if (readOrWrite != "read" && readOrWrite != "write")
     return parser.emitError(parser.getNameLoc(),
                             "rw specifier has to be 'read' or 'write'");
-  result.addAttribute(
-      PrefetchOp::getIsWriteAttrStrName(),
-      parser.getBuilder().getBoolAttr(readOrWrite.equals("write")));
+  result.addAttribute(PrefetchOp::getIsWriteAttrStrName(),
+                      parser.getBuilder().getBoolAttr(readOrWrite == "write"));
 
-  if (!cacheType.equals("data") && !cacheType.equals("instr"))
+  if (cacheType != "data" && cacheType != "instr")
     return parser.emitError(parser.getNameLoc(),
                             "cache type has to be 'data' or 'instr'");
 
-  result.addAttribute(
-      PrefetchOp::getIsDataCacheAttrStrName(),
-      parser.getBuilder().getBoolAttr(cacheType.equals("data")));
+  result.addAttribute(PrefetchOp::getIsDataCacheAttrStrName(),
+                      parser.getBuilder().getBoolAttr(cacheType == "data"));
 
   return success();
 }
@@ -2079,6 +2077,13 @@ void ExpandShapeOp::getAsmResultNames(
   setNameFn(getResult(), "expand_shape");
 }
 
+LogicalResult ExpandShapeOp::reifyResultShapes(
+    OpBuilder &builder, ReifiedRankedShapedTypeDims &reifiedResultShapes) {
+  reifiedResultShapes = {
+      getMixedValues(getStaticOutputShape(), getOutputShape(), builder)};
+  return success();
+}
+
 /// Helper function for verifying the shape of ExpandShapeOp and ResultShapeOp
 /// result and operand. Layout maps are verified separately.
 ///
@@ -2346,6 +2351,15 @@ LogicalResult ExpandShapeOp::verify() {
            << " dynamic dims while output_shape has " << getOutputShape().size()
            << " values";
 
+  // Verify if provided output shapes are in agreement with output type.
+  DenseI64ArrayAttr staticOutputShapes = getStaticOutputShapeAttr();
+  ArrayRef<int64_t> resShape = getResult().getType().getShape();
+  for (auto [pos, shape] : llvm::enumerate(resShape)) {
+    if (!ShapedType::isDynamic(shape) && shape != staticOutputShapes[pos]) {
+      return emitOpError("invalid output shape provided at pos ") << pos;
+    }
+  }
+
   return success();
 }
 
diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
index aa44455ada7f..db085b386483 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
@@ -19,6 +19,7 @@
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
+#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -63,39 +64,85 @@ resolveSourceIndicesExpandShape(Location loc, PatternRewriter &rewriter,
                                 memref::ExpandShapeOp expandShapeOp,
                                 ValueRange indices,
                                 SmallVectorImpl<Value> &sourceIndices) {
-  // The below implementation uses computeSuffixProduct method, which only
-  // allows int64_t values (i.e., static shape). Bail out if it has dynamic
-  // shapes.
-  if (!expandShapeOp.getResultType().hasStaticShape())
+  // Record the rewriter context for constructing ops later.
+  MLIRContext *ctx = rewriter.getContext();
+
+  // Capture expand_shape's input dimensions as `SmallVector<OpFoldResult>`.
+  // This is done for the purpose of inferring the output shape via
+  // `inferExpandOutputShape` which will in turn be used for suffix product
+  // calculation later.
+  SmallVector<OpFoldResult> srcShape;
+  MemRefType srcType = expandShapeOp.getSrcType();
+
+  for (int64_t i = 0, e = srcType.getRank(); i < e; ++i) {
+    if (srcType.isDynamicDim(i)) {
+      srcShape.push_back(
+          rewriter.create<memref::DimOp>(loc, expandShapeOp.getSrc(), i)
+              .getResult());
+    } else {
+      srcShape.push_back(rewriter.getIndexAttr(srcType.getShape()[i]));
+    }
+  }
+
+  auto outputShape = inferExpandShapeOutputShape(
+      rewriter, loc, expandShapeOp.getResultType(),
+      expandShapeOp.getReassociationIndices(), srcShape);
+  if (!outputShape.has_value())
     return failure();
 
-  MLIRContext *ctx = rewriter.getContext();
+  // Traverse all reassociation groups to determine the appropriate indices
+  // corresponding to each one of them post op folding.
   for (ArrayRef<int64_t> groups : expandShapeOp.getReassociationIndices()) {
     assert(!groups.empty() && "association indices groups cannot be empty");
+    // Flag to indicate the presence of dynamic dimensions in current
+    // reassociation group.
     int64_t groupSize = groups.size();
 
-    // Construct the expression for the index value w.r.t to expand shape op
-    // source corresponding the indices wrt to expand shape op result.
-    SmallVector<int64_t> sizes(groupSize);
-    for (int64_t i = 0; i < groupSize; ++i)
-      sizes[i] = expandShapeOp.getResultType().getDimSize(groups[i]);
-    SmallVector<int64_t> suffixProduct = computeSuffixProduct(sizes);
-    SmallVector<AffineExpr> dims(groupSize);
-    bindDimsList(ctx, MutableArrayRef{dims});
-    AffineExpr srcIndexExpr = linearize(ctx, dims, suffixProduct);
+    // Group output dimensions utilized in this reassociation group for suffix
+    // product calculation.
+    SmallVector<OpFoldResult> sizesVal(groupSize);
+    for (int64_t i = 0; i < groupSize; ++i) {
+      sizesVal[i] = (*outputShape)[groups[i]];
+    }
+
+    // Calculate suffix product of relevant output dimension sizes.
+    SmallVector<OpFoldResult> suffixProduct =
+        memref::computeSuffixProductIRBlock(loc, rewriter, sizesVal);
+
+    // Create affine expression variables for dimensions and symbols in the
+    // newly constructed affine map.
+    SmallVector<AffineExpr> dims(groupSize), symbols(groupSize);
+    bindDimsList<AffineExpr>(ctx, dims);
+    bindSymbolsList<AffineExpr>(ctx, symbols);
 
-    /// Apply permutation and create AffineApplyOp.
+    // Linearize binded dimensions and symbols to construct the resultant
+    // affine expression for this indice.
+    AffineExpr srcIndexExpr = linearize(ctx, dims, symbols);
+
+    // Record the load index corresponding to each dimension in the
+    // reassociation group. These are later supplied as operands to the affine
+    // map used for calulating relevant index post op folding.
     SmallVector<OpFoldResult> dynamicIndices(groupSize);
     for (int64_t i = 0; i < groupSize; i++)
       dynamicIndices[i] = indices[groups[i]];
 
-    // Creating maximally folded and composd affine.apply composes better with
-    // other transformations without interleaving canonicalization passes.
+    // Supply suffix product results followed by load op indices as operands
+    // to the map.
+    SmallVector<OpFoldResult> mapOperands;
+    llvm::append_range(mapOperands, suffixProduct);
+    llvm::append_range(mapOperands, dynamicIndices);
+
+    // Creating maximally folded and composed affine.apply composes better
+    // with other transformations without interleaving canonicalization
+    // passes.
     OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
         rewriter, loc,
         AffineMap::get(/*numDims=*/groupSize,
-                       /*numSymbols=*/0, srcIndexExpr),
-        dynamicIndices);
+                       /*numSymbols=*/groupSize, /*expression=*/srcIndexExpr),
+        mapOperands);
+
+    // Push index value in the op post folding corresponding to this
+    // reassociation group.
     sourceIndices.push_back(
         getValueOrCreateConstantIndexOp(rewriter, loc, ofr));
   }
@@ -315,7 +362,7 @@ public:
 
 /// Folds nvgpu.device_async_copy subviews into the copy itself. This pattern
 /// is folds subview on src and dst memref of the copy.
-class NvgpuAsyncCopyOpSubViewOpFolder final
+class NVGPUAsyncCopyOpSubViewOpFolder final
     : public OpRewritePattern<nvgpu::DeviceAsyncCopyOp> {
 public:
   using OpRewritePattern<nvgpu::DeviceAsyncCopyOp>::OpRewritePattern;
@@ -647,7 +694,7 @@ LogicalResult StoreOpOfCollapseShapeOpFolder<OpTy>::matchAndRewrite(
   return success();
 }
 
-LogicalResult NvgpuAsyncCopyOpSubViewOpFolder::matchAndRewrite(
+LogicalResult NVGPUAsyncCopyOpSubViewOpFolder::matchAndRewrite(
     nvgpu::DeviceAsyncCopyOp copyOp, PatternRewriter &rewriter) const {
 
   LLVM_DEBUG(DBGS() << "copyOp       : " << copyOp << "\n");
@@ -722,7 +769,7 @@ void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
                LoadOpOfCollapseShapeOpFolder<memref::LoadOp>,
                StoreOpOfCollapseShapeOpFolder<affine::AffineStoreOp>,
                StoreOpOfCollapseShapeOpFolder<memref::StoreOp>,
-               SubViewOfSubViewFolder, NvgpuAsyncCopyOpSubViewOpFolder>(
+               SubViewOfSubViewFolder, NVGPUAsyncCopyOpSubViewOpFolder>(
       patterns.getContext());
 }
 
diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
index 556a82de2166..c93e5a9dcd39 100644
--- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
+++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
@@ -15,6 +15,7 @@
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "llvm/ADT/STLExtras.h"
 
 namespace mlir {
 namespace memref {
@@ -155,5 +156,27 @@ void eraseDeadAllocAndStores(RewriterBase &rewriter, Operation *parentOp) {
     rewriter.eraseOp(op);
 }
 
+static SmallVector<OpFoldResult>
+computeSuffixProductIRBlockImpl(Location loc, OpBuilder &builder,
+                                ArrayRef<OpFoldResult> sizes,
+                                OpFoldResult unit) {
+  SmallVector<OpFoldResult> strides(sizes.size(), unit);
+  AffineExpr s0, s1;
+  bindSymbols(builder.getContext(), s0, s1);
+
+  for (int64_t r = strides.size() - 1; r > 0; --r) {
+    strides[r - 1] = affine::makeComposedFoldedAffineApply(
+        builder, loc, s0 * s1, {strides[r], sizes[r]});
+  }
+  return strides;
+}
+
+SmallVector<OpFoldResult>
+computeSuffixProductIRBlock(Location loc, OpBuilder &builder,
+                            ArrayRef<OpFoldResult> sizes) {
+  OpFoldResult unit = builder.getIndexAttr(1);
+  return computeSuffixProductIRBlockImpl(loc, builder, sizes, unit);
+}
+
 } // namespace memref
 } // namespace mlir
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 0799090cdea9..61073af2aa4d 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -470,13 +470,17 @@ static void printClauseWithRegionArgs(OpAsmPrinter &p, Operation *op,
                                       ValueRange argsSubrange,
                                       StringRef clauseName, ValueRange operands,
                                       TypeRange types, ArrayAttr symbols) {
-  p << clauseName << "(";
+  if (!clauseName.empty())
+    p << clauseName << "(";
+
   llvm::interleaveComma(
       llvm::zip_equal(symbols, operands, argsSubrange, types), p, [&p](auto t) {
         auto [sym, op, arg, type] = t;
         p << sym << " " << op << " -> " << arg << " : " << type;
       });
-  p << ") ";
+
+  if (!clauseName.empty())
+    p << ") ";
 }
 
 static ParseResult parseParallelRegion(
@@ -990,6 +994,79 @@ static void printMapClause(OpAsmPrinter &p, Operation *op,
   }
 }
 
+static ParseResult parseMembersIndex(OpAsmParser &parser,
+                                     DenseIntElementsAttr &membersIdx) {
+  SmallVector<APInt> values;
+  int64_t value;
+  int64_t shape[2] = {0, 0};
+  unsigned shapeTmp = 0;
+  auto parseIndices = [&]() -> ParseResult {
+    if (parser.parseInteger(value))
+      return failure();
+    shapeTmp++;
+    values.push_back(APInt(32, value));
+    return success();
+  };
+
+  do {
+    if (failed(parser.parseLSquare()))
+      return failure();
+
+    if (parser.parseCommaSeparatedList(parseIndices))
+      return failure();
+
+    if (failed(parser.parseRSquare()))
+      return failure();
+
+    // Only set once, if any indices are not the same size
+    // we error out in the next check as that's unsupported
+    if (shape[1] == 0)
+      shape[1] = shapeTmp;
+
+    // Verify that the recently parsed list is equal to the
+    // first one we parsed, they must be equal lengths to
+    // keep the rectangular shape DenseIntElementsAttr
+    // requires
+    if (shapeTmp != shape[1])
+      return failure();
+
+    shapeTmp = 0;
+    shape[0]++;
+  } while (succeeded(parser.parseOptionalComma()));
+
+  if (!values.empty()) {
+    ShapedType valueType =
+        VectorType::get(shape, IntegerType::get(parser.getContext(), 32));
+    membersIdx = DenseIntElementsAttr::get(valueType, values);
+  }
+
+  return success();
+}
+
+static void printMembersIndex(OpAsmPrinter &p, MapInfoOp op,
+                              DenseIntElementsAttr membersIdx) {
+  llvm::ArrayRef<int64_t> shape = membersIdx.getShapedType().getShape();
+  assert(shape.size() <= 2);
+  
+  if (!membersIdx)
+    return;
+
+  for (int i = 0; i < shape[0]; ++i) {
+    p << "[";
+    int rowOffset = i * shape[1];
+    for (int j = 0; j < shape[1]; ++j) {
+      p << membersIdx.getValues<
+          int32_t>()[rowOffset + j];
+      if ((j + 1) < shape[1])
+        p << ",";
+    }
+    p << "]";
+
+    if ((i + 1) < shape[0])
+      p << ", ";
+  }
+}
+
 static ParseResult
 parseMapEntries(OpAsmParser &parser,
                 SmallVectorImpl<OpAsmParser::UnresolvedOperand> &mapOperands,
@@ -1048,6 +1125,49 @@ static void printMapEntries(OpAsmPrinter &p, Operation *op,
   }
 }
 
+static ParseResult parsePrivateList(
+    OpAsmParser &parser,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &privateOperands,
+    SmallVectorImpl<Type> &privateOperandTypes, ArrayAttr &privatizerSymbols) {
+  SmallVector<SymbolRefAttr> privateSymRefs;
+  SmallVector<OpAsmParser::Argument> regionPrivateArgs;
+
+  if (failed(parser.parseCommaSeparatedList([&]() {
+        if (parser.parseAttribute(privateSymRefs.emplace_back()) ||
+            parser.parseOperand(privateOperands.emplace_back()) ||
+            parser.parseArrow() ||
+            parser.parseArgument(regionPrivateArgs.emplace_back()) ||
+            parser.parseColonType(privateOperandTypes.emplace_back()))
+          return failure();
+        return success();
+      })))
+    return failure();
+
+  SmallVector<Attribute> privateSymAttrs(privateSymRefs.begin(),
+                                         privateSymRefs.end());
+  privatizerSymbols = ArrayAttr::get(parser.getContext(), privateSymAttrs);
+
+  return success();
+}
+
+static void printPrivateList(OpAsmPrinter &p, Operation *op,
+                             ValueRange privateVarOperands,
+                             TypeRange privateVarTypes,
+                             ArrayAttr privatizerSymbols) {
+  // TODO: Remove target-specific logic from this function.
+  auto targetOp = mlir::dyn_cast<mlir::omp::TargetOp>(op);
+  assert(targetOp);
+
+  auto &region = op->getRegion(0);
+  auto *argsBegin = region.front().getArguments().begin();
+  MutableArrayRef argsSubrange(argsBegin + targetOp.getMapOperands().size(),
+                               argsBegin + targetOp.getMapOperands().size() +
+                                   privateVarTypes.size());
+  printClauseWithRegionArgs(
+      p, op, argsSubrange, /*clauseName=*/llvm::StringRef{}, privateVarOperands,
+      privateVarTypes, privatizerSymbols);
+}
+
 static void printCaptureType(OpAsmPrinter &p, Operation *op,
                              VariableCaptureKindAttr mapCaptureType) {
   std::string typeCapStr;
@@ -1256,13 +1376,14 @@ void TargetOp::build(OpBuilder &builder, OperationState &state,
                      const TargetClauseOps &clauses) {
   MLIRContext *ctx = builder.getContext();
   // TODO Store clauses in op: allocateVars, allocatorVars, inReductionVars,
-  // inReductionDeclSymbols, privateVars, privatizers, reductionVars,
-  // reductionByRefAttr, reductionDeclSymbols.
+  // inReductionDeclSymbols, reductionVars, reductionByRefAttr,
+  // reductionDeclSymbols.
   TargetOp::build(
       builder, state, clauses.ifVar, clauses.deviceVar, clauses.threadLimitVar,
       makeArrayAttr(ctx, clauses.dependTypeAttrs), clauses.dependVars,
       clauses.nowaitAttr, clauses.isDevicePtrVars, clauses.hasDeviceAddrVars,
-      clauses.mapVars);
+      clauses.mapVars, clauses.privateVars,
+      makeArrayAttr(ctx, clauses.privatizers));
 }
 
 LogicalResult TargetOp::verify() {
diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
index f1ec2be72a33..236bb7896635 100644
--- a/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
+++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
@@ -202,11 +202,27 @@ Attribute RingAttr::parse(AsmParser &parser, Type type) {
     polyAttr = attr;
   }
 
+  Polynomial poly = polyAttr.getPolynomial();
+  APInt root(coefficientModulusAttr.getValue().getBitWidth(), 0);
+  IntegerAttr rootAttr = nullptr;
+  if (succeeded(parser.parseOptionalComma())) {
+    if (failed(parser.parseKeyword("primitiveRoot")) ||
+        failed(parser.parseEqual()))
+      return {};
+
+    ParseResult result = parser.parseInteger(root);
+    if (failed(result)) {
+      parser.emitError(parser.getCurrentLocation(), "invalid primitiveRoot");
+      return {};
+    }
+    rootAttr = IntegerAttr::get(coefficientModulusAttr.getType(), root);
+  }
+
   if (failed(parser.parseGreater()))
     return {};
 
   return RingAttr::get(parser.getContext(), ty, coefficientModulusAttr,
-                       polyAttr);
+                       polyAttr, rootAttr);
 }
 
 } // namespace polynomial
diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
index 8e2bb5f27dc6..12010de34823 100644
--- a/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
+++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
@@ -104,3 +104,82 @@ LogicalResult MulScalarOp::verify() {
 
   return success();
 }
+
+/// Test if a value is a primitive nth root of unity modulo cmod.
+bool isPrimitiveNthRootOfUnity(const APInt &root, const unsigned n,
+                               const APInt &cmod) {
+  // Root bitwidth may be 1 less then cmod.
+  APInt r = APInt(root).zext(cmod.getBitWidth());
+  assert(r.ule(cmod) && "root must be less than cmod");
+
+  APInt a = r;
+  for (size_t k = 1; k < n; k++) {
+    if (a.isOne())
+      return false;
+    a = (a * r).urem(cmod);
+  }
+  return a.isOne();
+}
+
+/// Verify that the types involved in an NTT or INTT operation are
+/// compatible.
+static LogicalResult verifyNTTOp(Operation *op, RingAttr ring,
+                                 RankedTensorType tensorType) {
+  Attribute encoding = tensorType.getEncoding();
+  if (!encoding) {
+    return op->emitOpError()
+           << "expects a ring encoding to be provided to the tensor";
+  }
+  auto encodedRing = dyn_cast<RingAttr>(encoding);
+  if (!encodedRing) {
+    return op->emitOpError()
+           << "the provided tensor encoding is not a ring attribute";
+  }
+
+  if (encodedRing != ring) {
+    return op->emitOpError()
+           << "encoded ring type " << encodedRing
+           << " is not equivalent to the polynomial ring " << ring;
+  }
+
+  unsigned polyDegree = ring.getPolynomialModulus().getPolynomial().getDegree();
+  ArrayRef<int64_t> tensorShape = tensorType.getShape();
+  bool compatible = tensorShape.size() == 1 && tensorShape[0] == polyDegree;
+  if (!compatible) {
+    InFlightDiagnostic diag = op->emitOpError()
+                              << "tensor type " << tensorType
+                              << " does not match output type " << ring;
+    diag.attachNote() << "the tensor must have shape [d] where d "
+                         "is exactly the degree of the polynomialModulus of "
+                         "the polynomial type's ring attribute";
+    return diag;
+  }
+
+  if (!ring.getPrimitiveRoot()) {
+    return op->emitOpError()
+           << "ring type " << ring << " does not provide a primitive root "
+           << "of unity, which is required to express an NTT";
+  }
+
+  if (!isPrimitiveNthRootOfUnity(ring.getPrimitiveRoot().getValue(), polyDegree,
+                                 ring.getCoefficientModulus().getValue())) {
+    return op->emitOpError()
+           << "ring type " << ring << " has a primitiveRoot attribute '"
+           << ring.getPrimitiveRoot()
+           << "' that is not a primitive root of the coefficient ring";
+  }
+
+  return success();
+}
+
+LogicalResult NTTOp::verify() {
+  auto ring = getInput().getType().getRing();
+  auto tensorType = getOutput().getType();
+  return verifyNTTOp(this->getOperation(), ring, tensorType);
+}
+
+LogicalResult INTTOp::verify() {
+  auto tensorType = getInput().getType();
+  auto ring = getOutput().getType().getRing();
+  return verifyNTTOp(this->getOperation(), ring, tensorType);
+}
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 7a1aafc9f1c2..107fd0690f19 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -1415,6 +1415,19 @@ InParallelOp ForallOp::getTerminator() {
   return cast<InParallelOp>(getBody()->getTerminator());
 }
 
+SmallVector<Operation *> ForallOp::getCombiningOps(BlockArgument bbArg) {
+  SmallVector<Operation *> storeOps;
+  InParallelOp inParallelOp = getTerminator();
+  for (Operation &yieldOp : inParallelOp.getYieldingOps()) {
+    if (auto parallelInsertSliceOp =
+            dyn_cast<tensor::ParallelInsertSliceOp>(yieldOp);
+        parallelInsertSliceOp && parallelInsertSliceOp.getDest() == bbArg) {
+      storeOps.push_back(parallelInsertSliceOp);
+    }
+  }
+  return storeOps;
+}
+
 std::optional<Value> ForallOp::getSingleInductionVar() {
   if (getRank() != 1)
     return std::nullopt;
@@ -1509,6 +1522,179 @@ public:
   }
 };
 
+/// The following canonicalization pattern folds the iter arguments of
+/// scf.forall op if :-
+/// 1. The corresponding result has zero uses.
+/// 2. The iter argument is NOT being modified within the loop body.
+/// uses.
+///
+/// Example of first case :-
+///  INPUT:
+///   %res:3 = scf.forall ... shared_outs(%arg0 = %a, %arg1 = %b, %arg2 = %c)
+///            {
+///                ...
+///                <SOME USE OF %arg0>
+///                <SOME USE OF %arg1>
+///                <SOME USE OF %arg2>
+///                ...
+///                scf.forall.in_parallel {
+///                    <STORE OP WITH DESTINATION %arg1>
+///                    <STORE OP WITH DESTINATION %arg0>
+///                    <STORE OP WITH DESTINATION %arg2>
+///                }
+///             }
+///   return %res#1
+///
+///  OUTPUT:
+///   %res:3 = scf.forall ... shared_outs(%new_arg0 = %b)
+///            {
+///                ...
+///                <SOME USE OF %a>
+///                <SOME USE OF %new_arg0>
+///                <SOME USE OF %c>
+///                ...
+///                scf.forall.in_parallel {
+///                    <STORE OP WITH DESTINATION %new_arg0>
+///                }
+///             }
+///   return %res
+///
+/// NOTE: 1. All uses of the folded shared_outs (iter argument) within the
+///          scf.forall is replaced by their corresponding operands.
+///       2. Even if there are <STORE OP WITH DESTINATION *> ops within the body
+///          of the scf.forall besides within scf.forall.in_parallel terminator,
+///          this canonicalization remains valid. For more details, please refer
+///          to :
+///          https://github.com/llvm/llvm-project/pull/90189#discussion_r1589011124
+///       3. TODO(avarma): Generalize it for other store ops. Currently it
+///          handles tensor.parallel_insert_slice ops only.
+///
+/// Example of second case :-
+///  INPUT:
+///   %res:2 = scf.forall ... shared_outs(%arg0 = %a, %arg1 = %b)
+///            {
+///                ...
+///                <SOME USE OF %arg0>
+///                <SOME USE OF %arg1>
+///                ...
+///                scf.forall.in_parallel {
+///                    <STORE OP WITH DESTINATION %arg1>
+///                }
+///             }
+///   return %res#0, %res#1
+///
+///  OUTPUT:
+///   %res = scf.forall ... shared_outs(%new_arg0 = %b)
+///            {
+///                ...
+///                <SOME USE OF %a>
+///                <SOME USE OF %new_arg0>
+///                ...
+///                scf.forall.in_parallel {
+///                    <STORE OP WITH DESTINATION %new_arg0>
+///                }
+///             }
+///   return %a, %res
+struct ForallOpIterArgsFolder : public OpRewritePattern<ForallOp> {
+  using OpRewritePattern<ForallOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(ForallOp forallOp,
+                                PatternRewriter &rewriter) const final {
+    // Step 1: For a given i-th result of scf.forall, check the following :-
+    //         a. If it has any use.
+    //         b. If the corresponding iter argument is being modified within
+    //            the loop, i.e. has at least one store op with the iter arg as
+    //            its destination operand. For this we use
+    //            ForallOp::getCombiningOps(iter_arg).
+    //
+    //         Based on the check we maintain the following :-
+    //         a. `resultToDelete` - i-th result of scf.forall that'll be
+    //            deleted.
+    //         b. `resultToReplace` - i-th result of the old scf.forall
+    //            whose uses will be replaced by the new scf.forall.
+    //         c. `newOuts` - the shared_outs' operand of the new scf.forall
+    //            corresponding to the i-th result with at least one use.
+    SetVector<OpResult> resultToDelete;
+    SmallVector<Value> resultToReplace;
+    SmallVector<Value> newOuts;
+    for (OpResult result : forallOp.getResults()) {
+      OpOperand *opOperand = forallOp.getTiedOpOperand(result);
+      BlockArgument blockArg = forallOp.getTiedBlockArgument(opOperand);
+      if (result.use_empty() || forallOp.getCombiningOps(blockArg).empty()) {
+        resultToDelete.insert(result);
+      } else {
+        resultToReplace.push_back(result);
+        newOuts.push_back(opOperand->get());
+      }
+    }
+
+    // Return early if all results of scf.forall have at least one use and being
+    // modified within the loop.
+    if (resultToDelete.empty())
+      return failure();
+
+    // Step 2: For the the i-th result, do the following :-
+    //         a. Fetch the corresponding BlockArgument.
+    //         b. Look for store ops (currently tensor.parallel_insert_slice)
+    //            with the BlockArgument as its destination operand.
+    //         c. Remove the operations fetched in b.
+    for (OpResult result : resultToDelete) {
+      OpOperand *opOperand = forallOp.getTiedOpOperand(result);
+      BlockArgument blockArg = forallOp.getTiedBlockArgument(opOperand);
+      SmallVector<Operation *> combiningOps =
+          forallOp.getCombiningOps(blockArg);
+      for (Operation *combiningOp : combiningOps)
+        rewriter.eraseOp(combiningOp);
+    }
+
+    // Step 3. Create a new scf.forall op with the new shared_outs' operands
+    //         fetched earlier
+    auto newForallOp = rewriter.create<scf::ForallOp>(
+        forallOp.getLoc(), forallOp.getMixedLowerBound(),
+        forallOp.getMixedUpperBound(), forallOp.getMixedStep(), newOuts,
+        forallOp.getMapping(),
+        /*bodyBuilderFn =*/[](OpBuilder &, Location, ValueRange) {});
+
+    // Step 4. Merge the block of the old scf.forall into the newly created
+    //         scf.forall using the new set of arguments.
+    Block *loopBody = forallOp.getBody();
+    Block *newLoopBody = newForallOp.getBody();
+    ArrayRef<BlockArgument> newBbArgs = newLoopBody->getArguments();
+    // Form initial new bbArg list with just the control operands of the new
+    // scf.forall op.
+    SmallVector<Value> newBlockArgs =
+        llvm::map_to_vector(newBbArgs.take_front(forallOp.getRank()),
+                            [](BlockArgument b) -> Value { return b; });
+    Block::BlockArgListType newSharedOutsArgs = newForallOp.getRegionOutArgs();
+    unsigned index = 0;
+    // Take the new corresponding bbArg if the old bbArg was used as a
+    // destination in the in_parallel op. For all other bbArgs, use the
+    // corresponding init_arg from the old scf.forall op.
+    for (OpResult result : forallOp.getResults()) {
+      if (resultToDelete.count(result)) {
+        newBlockArgs.push_back(forallOp.getTiedOpOperand(result)->get());
+      } else {
+        newBlockArgs.push_back(newSharedOutsArgs[index++]);
+      }
+    }
+    rewriter.mergeBlocks(loopBody, newLoopBody, newBlockArgs);
+
+    // Step 5. Replace the uses of result of old scf.forall with that of the new
+    //         scf.forall.
+    for (auto &&[oldResult, newResult] :
+         llvm::zip(resultToReplace, newForallOp->getResults()))
+      rewriter.replaceAllUsesWith(oldResult, newResult);
+
+    // Step 6. Replace the uses of those values that either has no use or are
+    //         not being modified within the loop with the corresponding
+    //         OpOperand.
+    for (OpResult oldResult : resultToDelete)
+      rewriter.replaceAllUsesWith(oldResult,
+                                  forallOp.getTiedOpOperand(oldResult)->get());
+    return success();
+  }
+};
+
 struct ForallOpSingleOrZeroIterationDimsFolder
     : public OpRewritePattern<ForallOp> {
   using OpRewritePattern<ForallOp>::OpRewritePattern;
@@ -1667,7 +1853,7 @@ struct FoldTensorCastOfOutputIntoForallOp
 void ForallOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                            MLIRContext *context) {
   results.add<DimOfForallOp, FoldTensorCastOfOutputIntoForallOp,
-              ForallOpControlOperandsFolder,
+              ForallOpControlOperandsFolder, ForallOpIterArgsFolder,
               ForallOpSingleOrZeroIterationDimsFolder>(context);
 }
 
diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
index 2a16b10bbaf8..cf40443ff383 100644
--- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -1267,6 +1267,9 @@ struct ForallOpInterface
         forallOp.getMixedUpperBound(), forallOp.getMixedStep(),
         /*outputs=*/ValueRange(), forallOp.getMapping());
 
+    // Keep discardable attributes from the original op.
+    newForallOp->setDiscardableAttrs(op->getDiscardableAttrDictionary());
+
     rewriter.eraseOp(newForallOp.getBody()->getTerminator());
 
     // Move over block contents of the old op.
diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
index 9279081cfd45..6658cca03eba 100644
--- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
@@ -544,11 +544,24 @@ static void denormalizeInductionVariable(RewriterBase &rewriter, Location loc,
 static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc,
                                        ArrayRef<Value> values) {
   assert(!values.empty() && "unexpected empty list");
-  Value productOf = values.front();
-  for (auto v : values.drop_front()) {
-    productOf = rewriter.create<arith::MulIOp>(loc, productOf, v);
+  std::optional<Value> productOf;
+  for (auto v : values) {
+    auto vOne = getConstantIntValue(v);
+    if (vOne && vOne.value() == 1)
+      continue;
+    if (productOf)
+      productOf =
+          rewriter.create<arith::MulIOp>(loc, productOf.value(), v).getResult();
+    else
+      productOf = v;
   }
-  return productOf;
+  if (!productOf) {
+    productOf = rewriter
+                    .create<arith::ConstantOp>(
+                        loc, rewriter.getOneAttr(values.front().getType()))
+                    .getResult();
+  }
+  return productOf.value();
 }
 
 /// For each original loop, the value of the
@@ -562,19 +575,43 @@ static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc,
 static std::pair<SmallVector<Value>, SmallPtrSet<Operation *, 2>>
 delinearizeInductionVariable(RewriterBase &rewriter, Location loc,
                              Value linearizedIv, ArrayRef<Value> ubs) {
-  Value previous = linearizedIv;
   SmallVector<Value> delinearizedIvs(ubs.size());
   SmallPtrSet<Operation *, 2> preservedUsers;
-  for (unsigned i = 0, e = ubs.size(); i < e; ++i) {
-    unsigned idx = ubs.size() - i - 1;
-    if (i != 0) {
+
+  llvm::BitVector isUbOne(ubs.size());
+  for (auto [index, ub] : llvm::enumerate(ubs)) {
+    auto ubCst = getConstantIntValue(ub);
+    if (ubCst && ubCst.value() == 1)
+      isUbOne.set(index);
+  }
+
+  // Prune the lead ubs that are all ones.
+  unsigned numLeadingOneUbs = 0;
+  for (auto [index, ub] : llvm::enumerate(ubs)) {
+    if (!isUbOne.test(index)) {
+      break;
+    }
+    delinearizedIvs[index] = rewriter.create<arith::ConstantOp>(
+        loc, rewriter.getZeroAttr(ub.getType()));
+    numLeadingOneUbs++;
+  }
+
+  Value previous = linearizedIv;
+  for (unsigned i = numLeadingOneUbs, e = ubs.size(); i < e; ++i) {
+    unsigned idx = ubs.size() - (i - numLeadingOneUbs) - 1;
+    if (i != numLeadingOneUbs && !isUbOne.test(idx + 1)) {
       previous = rewriter.create<arith::DivSIOp>(loc, previous, ubs[idx + 1]);
       preservedUsers.insert(previous.getDefiningOp());
     }
     Value iv = previous;
     if (i != e - 1) {
-      iv = rewriter.create<arith::RemSIOp>(loc, previous, ubs[idx]);
-      preservedUsers.insert(iv.getDefiningOp());
+      if (!isUbOne.test(idx)) {
+        iv = rewriter.create<arith::RemSIOp>(loc, previous, ubs[idx]);
+        preservedUsers.insert(iv.getDefiningOp());
+      } else {
+        iv = rewriter.create<arith::ConstantOp>(
+            loc, rewriter.getZeroAttr(ubs[idx].getType()));
+      }
     }
     delinearizedIvs[idx] = iv;
   }
diff --git a/mlir/lib/Dialect/SparseTensor/IR/CMakeLists.txt b/mlir/lib/Dialect/SparseTensor/IR/CMakeLists.txt
index dd6f1037f71b..6f59b69bddce 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/SparseTensor/IR/CMakeLists.txt
@@ -45,6 +45,7 @@ add_mlir_dialect_library(MLIRSparseTensorDialect
 
   LINK_LIBS PUBLIC
   MLIRArithDialect
+  MLIRComplexDialect
   MLIRDialect
   MLIRDialectUtils
   MLIRIR
diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
index 92e5efaa8104..39f5cf1a7508 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
@@ -89,11 +89,11 @@ ParseResult LvlTypeParser::parseProperty(AsmParser &parser,
   auto loc = parser.getCurrentLocation();
   ERROR_IF(failed(parser.parseOptionalKeyword(&strVal)),
            "expected valid level property (e.g. nonordered, nonunique or high)")
-  if (strVal.equals(toPropString(LevelPropNonDefault::Nonunique))) {
+  if (strVal == toPropString(LevelPropNonDefault::Nonunique)) {
     *properties |= static_cast<uint64_t>(LevelPropNonDefault::Nonunique);
-  } else if (strVal.equals(toPropString(LevelPropNonDefault::Nonordered))) {
+  } else if (strVal == toPropString(LevelPropNonDefault::Nonordered)) {
     *properties |= static_cast<uint64_t>(LevelPropNonDefault::Nonordered);
-  } else if (strVal.equals(toPropString(LevelPropNonDefault::SoA))) {
+  } else if (strVal == toPropString(LevelPropNonDefault::SoA)) {
     *properties |= static_cast<uint64_t>(LevelPropNonDefault::SoA);
   } else {
     parser.emitError(loc, "unknown level property: ") << strVal;
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index 028a69da10c1..4adb1c19096a 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -17,6 +17,7 @@
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
@@ -103,7 +104,7 @@ void StorageLayout::foreachField(
         callback) const {
   const auto lvlTypes = enc.getLvlTypes();
   const Level lvlRank = enc.getLvlRank();
-  SmallVector<COOSegment> cooSegs = SparseTensorType(enc).getCOOSegments();
+  SmallVector<COOSegment> cooSegs = enc.getCOOSegments();
   FieldIndex fieldIdx = kDataFieldStartingIdx;
 
   ArrayRef cooSegsRef = cooSegs;
@@ -210,7 +211,7 @@ StorageLayout::getFieldIndexAndStride(SparseTensorFieldKind kind,
   unsigned stride = 1;
   if (kind == SparseTensorFieldKind::CrdMemRef) {
     assert(lvl.has_value());
-    const Level cooStart = SparseTensorType(enc).getAoSCOOStart();
+    const Level cooStart = enc.getAoSCOOStart();
     const Level lvlRank = enc.getLvlRank();
     if (lvl.value() >= cooStart && lvl.value() < lvlRank) {
       lvl = cooStart;
@@ -663,6 +664,8 @@ Attribute SparseTensorEncodingAttr::parse(AsmParser &parser, Type type) {
         explicitVal = result;
       } else if (auto result = llvm::dyn_cast<IntegerAttr>(attr)) {
         explicitVal = result;
+      } else if (auto result = llvm::dyn_cast<complex::NumberAttr>(attr)) {
+        explicitVal = result;
       } else {
         parser.emitError(parser.getNameLoc(),
                          "expected a numeric value for explicitVal");
@@ -678,6 +681,8 @@ Attribute SparseTensorEncodingAttr::parse(AsmParser &parser, Type type) {
         implicitVal = result;
       } else if (auto result = llvm::dyn_cast<IntegerAttr>(attr)) {
         implicitVal = result;
+      } else if (auto result = llvm::dyn_cast<complex::NumberAttr>(attr)) {
+        implicitVal = result;
       } else {
         parser.emitError(parser.getNameLoc(),
                          "expected a numeric value for implicitVal");
@@ -783,24 +788,29 @@ LogicalResult SparseTensorEncodingAttr::verify(
     return emitError() << "unexpected position bitwidth: " << posWidth;
   if (!acceptBitWidth(crdWidth))
     return emitError() << "unexpected coordinate bitwidth: " << crdWidth;
-  if (auto it = std::find_if(lvlTypes.begin(), lvlTypes.end(), isSingletonLT);
-      it != std::end(lvlTypes)) {
+
+  // Verify every COO segment.
+  auto *it = std::find_if(lvlTypes.begin(), lvlTypes.end(), isSingletonLT);
+  while (it != lvlTypes.end()) {
     if (it == lvlTypes.begin() ||
-        (!isCompressedLT(*(it - 1)) && !isLooseCompressedLT(*(it - 1))))
+        !(it - 1)->isa<LevelFormat::Compressed, LevelFormat::LooseCompressed>())
       return emitError() << "expected compressed or loose_compressed level "
                             "before singleton level";
-    if (!std::all_of(it, lvlTypes.end(),
+
+    auto *curCOOEnd = std::find_if_not(it, lvlTypes.end(), isSingletonLT);
+    if (!std::all_of(it, curCOOEnd,
                      [](LevelType i) { return isSingletonLT(i); }))
       return emitError() << "expected all singleton lvlTypes "
                             "following a singleton level";
     // We can potentially support mixed SoA/AoS singleton levels.
-    if (!std::all_of(it, lvlTypes.end(), [it](LevelType i) {
+    if (!std::all_of(it, curCOOEnd, [it](LevelType i) {
           return it->isa<LevelPropNonDefault::SoA>() ==
                  i.isa<LevelPropNonDefault::SoA>();
         })) {
       return emitError() << "expected all singleton lvlTypes stored in the "
                             "same memory layout (SoA vs AoS).";
     }
+    it = std::find_if(curCOOEnd, lvlTypes.end(), isSingletonLT);
   }
 
   auto lastBatch = std::find_if(lvlTypes.rbegin(), lvlTypes.rend(), isBatchLT);
@@ -907,46 +917,53 @@ LogicalResult SparseTensorEncodingAttr::verifyEncoding(
     return emitError()
            << "dimension-rank mismatch between encoding and tensor shape: "
            << getDimRank() << " != " << dimRank;
+  if (auto expVal = getExplicitVal()) {
+    Type attrType = llvm::dyn_cast<TypedAttr>(expVal).getType();
+    if (attrType != elementType) {
+      return emitError() << "explicit value type mismatch between encoding and "
+                         << "tensor element type: " << attrType
+                         << " != " << elementType;
+    }
+  }
+  if (auto impVal = getImplicitVal()) {
+    Type attrType = llvm::dyn_cast<TypedAttr>(impVal).getType();
+    if (attrType != elementType) {
+      return emitError() << "implicit value type mismatch between encoding and "
+                         << "tensor element type: " << attrType
+                         << " != " << elementType;
+    }
+    // Currently, we only support zero as the implicit value.
+    auto impFVal = llvm::dyn_cast<FloatAttr>(impVal);
+    auto impIntVal = llvm::dyn_cast<IntegerAttr>(impVal);
+    auto impComplexVal = llvm::dyn_cast<complex::NumberAttr>(impVal);
+    if ((impFVal && impFVal.getValue().isNonZero()) ||
+        (impIntVal && !impIntVal.getValue().isZero()) ||
+        (impComplexVal && (impComplexVal.getImag().isNonZero() ||
+                           impComplexVal.getReal().isNonZero()))) {
+      return emitError() << "implicit value must be zero";
+    }
+  }
   return success();
 }
 
-//===----------------------------------------------------------------------===//
-// SparseTensorType Methods.
-//===----------------------------------------------------------------------===//
-
-bool mlir::sparse_tensor::SparseTensorType::isCOOType(Level startLvl,
-                                                      bool isUnique) const {
-  if (!hasEncoding())
-    return false;
-  if (!isCompressedLvl(startLvl) && !isLooseCompressedLvl(startLvl))
-    return false;
-  for (Level l = startLvl + 1; l < lvlRank; ++l)
-    if (!isSingletonLvl(l))
-      return false;
-  // If isUnique is true, then make sure that the last level is unique,
-  // that is, when lvlRank == 1, the only compressed level is unique,
-  // and when lvlRank > 1, the last singleton is unique.
-  return !isUnique || isUniqueLvl(lvlRank - 1);
-}
-
-Level mlir::sparse_tensor::SparseTensorType::getAoSCOOStart() const {
+Level mlir::sparse_tensor::SparseTensorEncodingAttr::getAoSCOOStart() const {
   SmallVector<COOSegment> coo = getCOOSegments();
   assert(coo.size() == 1 || coo.empty());
   if (!coo.empty() && coo.front().isAoS()) {
     return coo.front().lvlRange.first;
   }
-  return lvlRank;
+  return getLvlRank();
 }
 
 SmallVector<COOSegment>
-mlir::sparse_tensor::SparseTensorType::getCOOSegments() const {
+mlir::sparse_tensor::SparseTensorEncodingAttr::getCOOSegments() const {
   SmallVector<COOSegment> ret;
-  if (!hasEncoding() || lvlRank <= 1)
+  if (getLvlRank() <= 1)
     return ret;
 
   ArrayRef<LevelType> lts = getLvlTypes();
   Level l = 0;
-  while (l < lvlRank) {
+  while (l < getLvlRank()) {
     auto lt = lts[l];
     if (lt.isa<LevelFormat::Compressed, LevelFormat::LooseCompressed>()) {
       auto cur = lts.begin() + l;
@@ -970,6 +987,25 @@ mlir::sparse_tensor::SparseTensorType::getCOOSegments() const {
   return ret;
 }
 
+//===----------------------------------------------------------------------===//
+// SparseTensorType Methods.
+//===----------------------------------------------------------------------===//
+
+bool mlir::sparse_tensor::SparseTensorType::isCOOType(Level startLvl,
+                                                      bool isUnique) const {
+  if (!hasEncoding())
+    return false;
+  if (!isCompressedLvl(startLvl) && !isLooseCompressedLvl(startLvl))
+    return false;
+  for (Level l = startLvl + 1; l < lvlRank; ++l)
+    if (!isSingletonLvl(l))
+      return false;
+  // If isUnique is true, then make sure that the last level is unique,
+  // that is, when lvlRank == 1, the only compressed level is unique,
+  // and when lvlRank > 1, the last singleton is unique.
+  return !isUnique || isUniqueLvl(lvlRank - 1);
+}
+
 RankedTensorType
 mlir::sparse_tensor::SparseTensorType::getCOOType(bool ordered) const {
   SmallVector<LevelType> lvlTypes;
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index e58503d508ce..ab4d90ec745d 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -31,8 +31,9 @@
 
 void mlir::sparse_tensor::buildSparsifier(OpPassManager &pm,
                                           const SparsifierOptions &options) {
-  // Rewrite named linalg ops into generic ops.
+  // Rewrite named linalg ops into generic ops and apply fusion.
   pm.addNestedPass<func::FuncOp>(createLinalgGeneralizeNamedOpsPass());
+  pm.addNestedPass<func::FuncOp>(createLinalgElementwiseOpFusionPass());
 
   // Sparsification and bufferization mini-pipeline.
   pm.addPass(createSparsificationAndBufferizationPass(
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
index 5679f277e148..164e722c45db 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
@@ -417,11 +417,17 @@ static void genEndInsert(OpBuilder &builder, Location loc,
 /// Generates a subview into the sizes.
 static Value genSliceToSize(OpBuilder &builder, Location loc, Value mem,
                             Value sz) {
-  auto elemTp = llvm::cast<MemRefType>(mem.getType()).getElementType();
+  auto memTp = llvm::cast<MemRefType>(mem.getType());
+  // For higher-dimensional memrefs, we assume that the innermost
+  // dimension is always of the right size.
+  // TODO: generate complex truncating view here too?
+  if (memTp.getRank() > 1)
+    return mem;
+  // Truncate linear memrefs to given size.
   return builder
       .create<memref::SubViewOp>(
-          loc, MemRefType::get({ShapedType::kDynamic}, elemTp), mem,
-          ValueRange{}, ValueRange{sz}, ValueRange{},
+          loc, MemRefType::get({ShapedType::kDynamic}, memTp.getElementType()),
+          mem, ValueRange{}, ValueRange{sz}, ValueRange{},
           ArrayRef<int64_t>{0},                    // static offset
           ArrayRef<int64_t>{ShapedType::kDynamic}, // dynamic size
           ArrayRef<int64_t>{1})                    // static stride
@@ -1050,10 +1056,14 @@ public:
   matchAndRewrite(ToPositionsOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     // Replace the requested position access with corresponding field.
-    // The cast_op is inserted by type converter to intermix 1:N type
-    // conversion.
+    // The view is restricted to the actual size to ensure clients
+    // of this operation truly observe size, not capacity!
+    Location loc = op.getLoc();
+    Level lvl = op.getLevel();
     auto desc = getDescriptorFromTensorTuple(adaptor.getTensor());
-    rewriter.replaceOp(op, desc.getPosMemRef(op.getLevel()));
+    auto mem = desc.getPosMemRef(lvl);
+    auto size = desc.getPosMemSize(rewriter, loc, lvl);
+    rewriter.replaceOp(op, genSliceToSize(rewriter, loc, mem, size));
     return success();
   }
 };
@@ -1068,12 +1078,17 @@ public:
   matchAndRewrite(ToCoordinatesOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     // Replace the requested coordinates access with corresponding field.
-    // The cast_op is inserted by type converter to intermix 1:N type
-    // conversion.
+    // The view is restricted to the actual size to ensure clients
+    // of this operation truly observe size, not capacity!
+    Location loc = op.getLoc();
+    Level lvl = op.getLevel();
     auto desc = getDescriptorFromTensorTuple(adaptor.getTensor());
-    rewriter.replaceOp(
-        op, desc.getCrdMemRefOrView(rewriter, op.getLoc(), op.getLevel()));
-
+    auto mem = desc.getCrdMemRefOrView(rewriter, loc, lvl);
+    if (lvl < getSparseTensorType(op.getTensor()).getAoSCOOStart()) {
+      auto size = desc.getCrdMemSize(rewriter, loc, lvl);
+      mem = genSliceToSize(rewriter, loc, mem, size);
+    }
+    rewriter.replaceOp(op, mem);
     return success();
   }
 };
@@ -1088,11 +1103,14 @@ public:
   matchAndRewrite(ToCoordinatesBufferOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     // Replace the requested coordinates access with corresponding field.
-    // The cast_op is inserted by type converter to intermix 1:N type
-    // conversion.
+    // The view is restricted to the actual size to ensure clients
+    // of this operation truly observe size, not capacity!
+    Location loc = op.getLoc();
+    Level lvl = getSparseTensorType(op.getTensor()).getAoSCOOStart();
     auto desc = getDescriptorFromTensorTuple(adaptor.getTensor());
-    rewriter.replaceOp(op, desc.getAOSMemRef());
-
+    auto mem = desc.getAOSMemRef();
+    auto size = desc.getCrdMemSize(rewriter, loc, lvl);
+    rewriter.replaceOp(op, genSliceToSize(rewriter, loc, mem, size));
     return success();
   }
 };
@@ -1106,10 +1124,13 @@ public:
   matchAndRewrite(ToValuesOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     // Replace the requested values access with corresponding field.
-    // The cast_op is inserted by type converter to intermix 1:N type
-    // conversion.
+    // The view is restricted to the actual size to ensure clients
+    // of this operation truly observe size, not capacity!
+    Location loc = op.getLoc();
     auto desc = getDescriptorFromTensorTuple(adaptor.getTensor());
-    rewriter.replaceOp(op, desc.getValMemRef());
+    auto mem = desc.getValMemRef();
+    auto size = desc.getValMemSize(rewriter, loc);
+    rewriter.replaceOp(op, genSliceToSize(rewriter, loc, mem, size));
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
index 7d469198a653..da635c257888 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
@@ -785,45 +785,67 @@ public:
   }
 
 private:
-  // Helper to print contents of a single memref. Note that for the "push_back"
-  // vectors, this prints the full capacity, not just the size. This is done
-  // on purpose, so that clients see how much storage has been allocated in
-  // total. Contents of the extra capacity in the buffer may be uninitialized
-  // (unless the flag enable-buffer-initialization is set to true).
+  // Helper to print contents of a single memref. For "push_back" vectors,
+  // we assume that the previous getters for pos/crd/val have added a
+  // slice-to-size view to make sure we just print the size and not the
+  // full capacity.
   //
-  // Generates code to print:
+  // Generates code to print (1-dim or higher):
   //    ( a0, a1, ... )
   static void printContents(PatternRewriter &rewriter, Location loc,
                             Value vec) {
+    auto shape = cast<ShapedType>(vec.getType()).getShape();
+    SmallVector<Value> idxs;
+    printContentsLevel(rewriter, loc, vec, 0, shape, idxs);
+    rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::NewLine);
+  }
+
+  // Helper to the helper.
+  static void printContentsLevel(PatternRewriter &rewriter, Location loc,
+                                 Value vec, unsigned i, ArrayRef<int64_t> shape,
+                                 SmallVectorImpl<Value> &idxs) {
     // Open bracket.
     rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Open);
-    // For loop over elements.
+    // Generate for loop.
     auto zero = constantIndex(rewriter, loc, 0);
-    auto size = rewriter.create<memref::DimOp>(loc, vec, zero);
+    auto index = constantIndex(rewriter, loc, i);
+    auto size = rewriter.create<memref::DimOp>(loc, vec, index);
     auto step = constantIndex(rewriter, loc, 1);
     auto forOp = rewriter.create<scf::ForOp>(loc, zero, size, step);
+    idxs.push_back(forOp.getInductionVar());
     rewriter.setInsertionPointToStart(forOp.getBody());
-    auto idx = forOp.getInductionVar();
-    auto val = rewriter.create<memref::LoadOp>(loc, vec, idx);
-    if (llvm::isa<ComplexType>(val.getType())) {
-      // Since the vector dialect does not support complex types in any op,
-      // we split those into (real, imag) pairs here.
-      Value real = rewriter.create<complex::ReOp>(loc, val);
-      Value imag = rewriter.create<complex::ImOp>(loc, val);
-      rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Open);
-      rewriter.create<vector::PrintOp>(loc, real,
-                                       vector::PrintPunctuation::Comma);
-      rewriter.create<vector::PrintOp>(loc, imag,
-                                       vector::PrintPunctuation::Close);
-      rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Comma);
+    if (i < shape.size() - 1) {
+      // Enter deeper loop nest.
+      printContentsLevel(rewriter, loc, vec, i + 1, shape, idxs);
     } else {
-      rewriter.create<vector::PrintOp>(loc, val,
-                                       vector::PrintPunctuation::Comma);
+      // Actual contents printing.
+      auto val = rewriter.create<memref::LoadOp>(loc, vec, idxs);
+      if (llvm::isa<ComplexType>(val.getType())) {
+        // Since the vector dialect does not support complex types in any op,
+        // we split those into (real, imag) pairs here.
+        Value real = rewriter.create<complex::ReOp>(loc, val);
+        Value imag = rewriter.create<complex::ImOp>(loc, val);
+        rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Open);
+        rewriter.create<vector::PrintOp>(loc, real,
+                                         vector::PrintPunctuation::Comma);
+        rewriter.create<vector::PrintOp>(loc, imag,
+                                         vector::PrintPunctuation::Close);
+      } else {
+        rewriter.create<vector::PrintOp>(
+            loc, val, vector::PrintPunctuation::NoPunctuation);
+      }
+      // Terminating comma (except at end).
+      auto bound = rewriter.create<arith::AddIOp>(loc, idxs.back(), step);
+      Value cond = rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ne,
+                                                  bound, size);
+      scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, cond, /*else*/ false);
+      rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
+      rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Comma);
     }
+    idxs.pop_back();
     rewriter.setInsertionPointAfter(forOp);
-    // Close bracket and end of line.
+    // Close bracket.
     rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Close);
-    rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::NewLine);
   }
 
   // Helper method to print run-time lvl/dim sizes.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.h
index cf3c35f5fa4c..d0ef8a6860bb 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.h
@@ -401,9 +401,12 @@ inline Value constantLevelTypeEncoding(OpBuilder &builder, Location loc,
 
 // Generates a constant from a validated value carrying attribute.
 inline Value genValFromAttr(OpBuilder &builder, Location loc, Attribute attr) {
-  if (auto arrayAttr = dyn_cast<ArrayAttr>(attr)) {
-    Type tp = cast<TypedAttr>(arrayAttr[0]).getType();
-    return builder.create<complex::ConstantOp>(loc, tp, arrayAttr);
+  if (auto complexAttr = dyn_cast<complex::NumberAttr>(attr)) {
+    Type tp = cast<ComplexType>(complexAttr.getType()).getElementType();
+    return builder.create<complex::ConstantOp>(
+        loc, complexAttr.getType(),
+        builder.getArrayAttr({FloatAttr::get(tp, complexAttr.getReal()),
+                              FloatAttr::get(tp, complexAttr.getImag())}));
   }
   return builder.create<arith::ConstantOp>(loc, cast<TypedAttr>(attr));
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
index 9c0aed3c18ef..308fbd965259 100644
--- a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
@@ -1356,50 +1356,54 @@ Merger::buildTensorExp(linalg::GenericOp op, Value v) {
   // See buildLattices() for an explanation of rejecting certain
   // division and shift operations.
   if (def->getNumOperands() == 2) {
-    const auto [x, xDepSp] = buildTensorExp(op, def->getOperand(0));
-    const auto [y, yDepSp] = buildTensorExp(op, def->getOperand(1));
-    bool hasSpDep = xDepSp || yDepSp;
+    const auto [x, xSpVals] = buildTensorExp(op, def->getOperand(0));
+    const auto [y, ySpVals] = buildTensorExp(op, def->getOperand(1));
+    // For a conjunctive operation, it yields a "sparse" result if any operand
+    // is sparse. For a disjunctive operation, it yields a "sparse" result if
+    // all operands are sparse.
+    bool conjSpVals = xSpVals || ySpVals;
+    bool disjSpVals = xSpVals && ySpVals;
     if (x.has_value() && y.has_value()) {
       const ExprId e0 = *x;
       const ExprId e1 = *y;
       if (isa<arith::MulFOp>(def))
-        return {addExp(TensorExp::Kind::kMulF, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kMulF, e0, e1), conjSpVals};
       if (isa<complex::MulOp>(def))
-        return {addExp(TensorExp::Kind::kMulC, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kMulC, e0, e1), conjSpVals};
       if (isa<arith::MulIOp>(def))
-        return {addExp(TensorExp::Kind::kMulI, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kMulI, e0, e1), conjSpVals};
       if (isa<arith::DivFOp>(def) && !maybeZero(e1))
-        return {addExp(TensorExp::Kind::kDivF, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kDivF, e0, e1), conjSpVals};
       if (isa<complex::DivOp>(def) && !maybeZero(e1))
-        return {addExp(TensorExp::Kind::kDivC, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kDivC, e0, e1), conjSpVals};
       if (isa<arith::DivSIOp>(def) && !maybeZero(e1))
-        return {addExp(TensorExp::Kind::kDivS, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kDivS, e0, e1), conjSpVals};
       if (isa<arith::DivUIOp>(def) && !maybeZero(e1))
-        return {addExp(TensorExp::Kind::kDivU, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kDivU, e0, e1), conjSpVals};
       if (isa<arith::AddFOp>(def))
-        return {addExp(TensorExp::Kind::kAddF, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kAddF, e0, e1), disjSpVals};
       if (isa<complex::AddOp>(def))
-        return {addExp(TensorExp::Kind::kAddC, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kAddC, e0, e1), disjSpVals};
       if (isa<arith::AddIOp>(def))
-        return {addExp(TensorExp::Kind::kAddI, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kAddI, e0, e1), disjSpVals};
       if (isa<arith::SubFOp>(def))
-        return {addExp(TensorExp::Kind::kSubF, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kSubF, e0, e1), disjSpVals};
       if (isa<complex::SubOp>(def))
-        return {addExp(TensorExp::Kind::kSubC, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kSubC, e0, e1), disjSpVals};
       if (isa<arith::SubIOp>(def))
-        return {addExp(TensorExp::Kind::kSubI, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kSubI, e0, e1), disjSpVals};
       if (isa<arith::AndIOp>(def))
-        return {addExp(TensorExp::Kind::kAndI, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kAndI, e0, e1), conjSpVals};
       if (isa<arith::OrIOp>(def))
-        return {addExp(TensorExp::Kind::kOrI, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kOrI, e0, e1), disjSpVals};
       if (isa<arith::XOrIOp>(def))
-        return {addExp(TensorExp::Kind::kXorI, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kXorI, e0, e1), disjSpVals};
       if (isa<arith::ShRSIOp>(def) && isInvariant(e1))
-        return {addExp(TensorExp::Kind::kShrS, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kShrS, e0, e1), conjSpVals};
       if (isa<arith::ShRUIOp>(def) && isInvariant(e1))
-        return {addExp(TensorExp::Kind::kShrU, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kShrU, e0, e1), conjSpVals};
       if (isa<arith::ShLIOp>(def) && isInvariant(e1))
-        return {addExp(TensorExp::Kind::kShlI, e0, e1), hasSpDep};
+        return {addExp(TensorExp::Kind::kShlI, e0, e1), conjSpVals};
       if (auto ci = dyn_cast<arith::CmpIOp>(def)) {
         if (ci.getPredicate() == arith::CmpIPredicate::eq &&
             ci.getPredicate() == arith::CmpIPredicate::sle &&
@@ -1413,7 +1417,7 @@ Merger::buildTensorExp(linalg::GenericOp op, Value v) {
 
         auto e = addExp(TensorExp::Kind::kCmpI, e0, e1, nullptr,
                         ci.getPredicateAttr());
-        return {e, hasSpDep};
+        return {e, conjSpVals};
       }
       if (auto cf = dyn_cast<arith::CmpFOp>(def)) {
         if (cf.getPredicate() == arith::CmpFPredicate::OEQ &&
@@ -1431,7 +1435,7 @@ Merger::buildTensorExp(linalg::GenericOp op, Value v) {
         }
         auto e = addExp(TensorExp::Kind::kCmpF, e0, e1, nullptr,
                         cf.getPredicateAttr());
-        return {e, hasSpDep};
+        return {e, conjSpVals};
       }
       if (auto binop = dyn_cast<sparse_tensor::BinaryOp>(def)) {
         if (isAdmissibleBranch(binop, binop.getOverlapRegion()) &&
@@ -1439,7 +1443,7 @@ Merger::buildTensorExp(linalg::GenericOp op, Value v) {
              isAdmissibleBranch(binop, binop.getLeftRegion())) &&
             (binop.getRightIdentity() ||
              isAdmissibleBranch(binop, binop.getRightRegion())))
-          return {addExp(TensorExp::Kind::kBinary, e0, e1, def), hasSpDep};
+          return {addExp(TensorExp::Kind::kBinary, e0, e1, def), conjSpVals};
       }
     }
   }
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index 4c65045084dc..1f94397e823f 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -1676,10 +1676,12 @@ void ExpandShapeOp::build(OpBuilder &builder, OperationState &result,
   auto tensorResultTy = cast<RankedTensorType>(resultType);
   FailureOr<SmallVector<OpFoldResult>> outputShape = inferOutputShape(
       builder, result.location, tensorResultTy, reassociation, inputShape);
-  // Failure of this assertion usually indicates presence of multiple
-  // dynamic dimensions in the same reassociation group.
-  assert(succeeded(outputShape) && "unable to infer output shape");
-  build(builder, result, tensorResultTy, src, reassociation, *outputShape);
+  SmallVector<OpFoldResult> outputShapeOrEmpty;
+  if (succeeded(outputShape)) {
+    outputShapeOrEmpty = *outputShape;
+  }
+  build(builder, result, tensorResultTy, src, reassociation,
+        outputShapeOrEmpty);
 }
 
 SmallVector<AffineMap, 4> CollapseShapeOp::getReassociationMaps() {
@@ -2711,15 +2713,38 @@ struct InsertSliceOpCastFolder final : public OpRewritePattern<InsertOpTy> {
     auto dstType = llvm::dyn_cast<RankedTensorType>(dst.getType());
     if (!srcType || !dstType)
       return failure();
+
+    // The tensor.cast source could have additional static information not seen
+    // in the insert slice op static sizes, so we ignore dynamic dims when
+    // computing the rank reduction mask.
+    SmallVector<int64_t> staticSizes(insertSliceOp.getStaticSizes());
+    auto rankReductionMask = computeRankReductionMask(
+        staticSizes, srcType.getShape(), /*matchDynamic=*/true);
+    if (!rankReductionMask.has_value())
+      return failure();
+    // Replace dimensions in the insert slice op with corresponding static dims
+    // from the cast source type. If the insert slice sizes have static dims
+    // that are not static in the tensor.cast source (i.e., when the cast op
+    // casts a dynamic dim to static), the dim should not be replaced, and the
+    // pattern will fail later in `verifyInsertSliceOp`.
+    SmallVector<OpFoldResult> mixedSizes(insertSliceOp.getMixedSizes());
+    int64_t rankReducedIdx = 0;
+    for (auto [idx, size] : enumerate(staticSizes)) {
+      if (!rankReductionMask.value().contains(idx) &&
+          !srcType.isDynamicDim(rankReducedIdx)) {
+        mixedSizes[idx] = getAsIndexOpFoldResult(
+            rewriter.getContext(), srcType.getDimSize(rankReducedIdx));
+        size = srcType.getDimSize(rankReducedIdx++);
+      }
+    }
     if (verifyInsertSliceOp(srcType, dstType, insertSliceOp.getStaticOffsets(),
-                            insertSliceOp.getStaticSizes(),
-                            insertSliceOp.getStaticStrides()) !=
+                            staticSizes, insertSliceOp.getStaticStrides()) !=
         SliceVerificationResult::Success)
       return failure();
 
     Operation *replacement = rewriter.create<InsertOpTy>(
         insertSliceOp.getLoc(), src, dst, insertSliceOp.getMixedOffsets(),
-        insertSliceOp.getMixedSizes(), insertSliceOp.getMixedStrides());
+        mixedSizes, insertSliceOp.getMixedStrides());
 
     // In the parallel case there is no result and so nothing to cast.
     bool isParallelInsert =
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
index 8d937217d706..a94bb3a920b1 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
@@ -182,9 +182,9 @@ public:
     // Pad the weight so that it is modulo of the striding.
     llvm::SmallVector<int32_t, 8> weightPadding = {0, 0, 0, 0, 0, 0, 0, 0};
     weightPadding[3] =
-        weightHeight % stride[0] ? stride[0] - weightHeight % stride[0] : 0;
+        (weightHeight % stride[0]) ? (stride[0] - weightHeight % stride[0]) : 0;
     weightPadding[5] =
-        weightWidth % stride[1] ? stride[1] - weightWidth % stride[1] : 0;
+        (weightWidth % stride[1]) ? (stride[1] - weightWidth % stride[1]) : 0;
     DenseElementsAttr weightPaddingAttr = DenseIntElementsAttr::get(
         RankedTensorType::get({4, 2}, rewriter.getI32Type()), weightPadding);
     Value weightPaddingVal = createOpAndInfer<tosa::ConstOp>(
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp
index 8614559e2a6f..b1d572054184 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp
@@ -18,6 +18,7 @@
 #include "mlir/Dialect/Tosa/IR/TosaOps.h"
 #include "mlir/Dialect/Tosa/Utils/ShapeUtils.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -39,13 +40,10 @@ namespace {
 // type-inference related interface.
 // When a non-replaceable use is encountered, the value is wrapped in a
 // cast back to the original type after inference.
-bool isReplaceableUser(Operation *user) {
-  // Handle unregistered dialects.
+bool canBeRefined(Operation *user) {
   if (!user->getDialect())
     return false;
-
-  return user->getDialect()->getNamespace() ==
-             TosaDialect::getDialectNamespace() ||
+  return user->getDialect()->getTypeID() == TypeID::get<TosaDialect>() ||
          isa<InferTypeOpInterface, InferShapedTypeOpInterface>(user);
 }
 
@@ -53,16 +51,16 @@ bool isReplaceableUser(Operation *user) {
 // updated. For the tosa.while_loop operation, types are speculatively updated
 // within the body region to determine the output type of the while_loop. This
 // process is performed until a fixed point is reached, then the types are
-// reverted.
+// rolled back.
 //
-// This class encapsulates the state information needed to perform the reversion
+// This class encapsulates the state information needed to perform the roll back
 // process or to commit to the final changes.
 class TypeModificationState {
 public:
   TypeModificationState() = default;
 
   ~TypeModificationState() {
-    // Ensure the recorded modifications are either committed or reverted.
+    // Ensure the recorded modifications are either committed or rolled back.
     assert(oldTypes.empty() && "unhandled type modifications");
   }
 
@@ -74,10 +72,9 @@ public:
     }
   }
 
-  // Revert changes made to the types in the IR by setting all the affected
+  // Roll back changes made to the types in the IR by setting all the affected
   // values to their old types.
-  void revert() {
-    // Otherwise revert the changes.
+  void rollBack() {
     for (auto [value, type] : oldTypes)
       value.setType(type);
 
@@ -91,15 +88,18 @@ public:
     // For each use whose type changed, cast the value with the new type back to
     // the old type.
     for (auto [value, oldType] : oldTypes) {
+      tensor::CastOp castedValue;
       for (auto &use : value.getUses()) {
-        if (isReplaceableUser(use.getOwner()))
+        if (canBeRefined(use.getOwner()))
           continue;
 
-        OpBuilder builder(value.getContext());
-        builder.setInsertionPoint(use.getOwner());
+        // Cache the cast to avoid generating duplicates
+        if (!castedValue) {
+          ImplicitLocOpBuilder builder{value.getLoc(), use.getOwner()};
+          castedValue = builder.create<tensor::CastOp>(oldType, value);
+        }
 
-        Location loc = value.getLoc();
-        use.set(builder.create<tensor::CastOp>(loc, oldType, value));
+        use.set(castedValue);
       }
     }
 
@@ -211,8 +211,8 @@ void propagateShapesToTosaWhile(Operation &op, TypeModificationState &state) {
       argTypes[i] = newType;
     }
 
-    // Revert all changes made during the speculative part of the algorithm.
-    localState.revert();
+    // Roll back all changes made during the speculative part of the algorithm.
+    localState.rollBack();
   }
 
   // We now set the block arguments according to the most recent shape
@@ -228,10 +228,11 @@ void propagateShapesToTosaWhile(Operation &op, TypeModificationState &state) {
 }
 
 void propagateShapesInRegion(Region &region, TypeModificationState &state) {
+  Dialect *tosaDialect = region.getContext()->getLoadedDialect<TosaDialect>();
+
   for (auto &block : region) {
     for (Operation &op : block) {
-      if (!op.getDialect() ||
-          op.getDialect()->getNamespace() != TosaDialect::getDialectNamespace())
+      if (op.getDialect() != tosaDialect)
         continue;
 
       propagateShapesToTosaIf(op, state);
diff --git a/mlir/lib/Dialect/Transform/CMakeLists.txt b/mlir/lib/Dialect/Transform/CMakeLists.txt
index 64115dcc29d6..0c0d5ebe0c21 100644
--- a/mlir/lib/Dialect/Transform/CMakeLists.txt
+++ b/mlir/lib/Dialect/Transform/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_subdirectory(DebugExtension)
 add_subdirectory(Interfaces)
 add_subdirectory(IR)
+add_subdirectory(IRDLExtension)
 add_subdirectory(LoopExtension)
 add_subdirectory(PDLExtension)
 add_subdirectory(Transforms)
diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
index 7a5a69747005..eb09f007fbca 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
@@ -19,6 +19,7 @@
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/Diagnostics.h"
 #include "mlir/IR/Dominance.h"
+#include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/OperationSupport.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/Verifier.h"
@@ -834,19 +835,23 @@ bool transform::CastOp::areCastCompatible(TypeRange inputs, TypeRange outputs) {
 // CollectMatchingOp
 //===----------------------------------------------------------------------===//
 
-/// Applies matcher operations from the given `block` assigning `op` as the
-/// payload of the block's first argument. Updates `state` accordingly. If any
-/// of the matcher produces a silenceable failure, discards it (printing the
-/// content to the debug output stream) and returns failure. If any of the
-/// matchers produces a definite failure, reports it and returns failure. If all
-/// matchers in the block succeed, populates `mappings` with the payload
-/// entities associated with the block terminator operands.
+/// Applies matcher operations from the given `block` using
+/// `blockArgumentMapping` to initialize block arguments. Updates `state`
+/// accordingly. If any of the matcher produces a silenceable failure, discards
+/// it (printing the content to the debug output stream) and returns failure. If
+/// any of the matchers produces a definite failure, reports it and returns
+/// failure. If all matchers in the block succeed, populates `mappings` with the
+/// payload entities associated with the block terminator operands. Note that
+/// `mappings` will be cleared before that.
 static DiagnosedSilenceableFailure
-matchBlock(Block &block, Operation *op, transform::TransformState &state,
+matchBlock(Block &block,
+           ArrayRef<SmallVector<transform::MappedValue>> blockArgumentMapping,
+           transform::TransformState &state,
            SmallVectorImpl<SmallVector<transform::MappedValue>> &mappings) {
   assert(block.getParent() && "cannot match using a detached block");
   auto matchScope = state.make_region_scope(*block.getParent());
-  if (failed(state.mapBlockArgument(block.getArgument(0), {op})))
+  if (failed(
+          state.mapBlockArguments(block.getArguments(), blockArgumentMapping)))
     return DiagnosedSilenceableFailure::definiteFailure();
 
   for (Operation &match : block.without_terminator()) {
@@ -866,6 +871,9 @@ matchBlock(Block &block, Operation *op, transform::TransformState &state,
   // Remember the values mapped to the terminator operands so we can
   // forward them to the action.
   ValueRange yieldedValues = block.getTerminator()->getOperands();
+  // Our contract with the caller is that the mappings will contain only the
+  // newly mapped values, clear the rest.
+  mappings.clear();
   transform::detail::prepareValueMappings(mappings, yieldedValues, state);
   return DiagnosedSilenceableFailure::success();
 }
@@ -915,8 +923,11 @@ transform::CollectMatchingOp::apply(transform::TransformRewriter &rewriter,
 
       // Try matching.
       SmallVector<SmallVector<MappedValue>> mappings;
-      DiagnosedSilenceableFailure diag =
-          matchBlock(matcher.getFunctionBody().front(), op, state, mappings);
+      SmallVector<transform::MappedValue> inputMapping({op});
+      DiagnosedSilenceableFailure diag = matchBlock(
+          matcher.getFunctionBody().front(),
+          ArrayRef<SmallVector<transform::MappedValue>>(inputMapping), state,
+          mappings);
       if (diag.isDefiniteFailure())
         return WalkResult::interrupt();
       if (diag.isSilenceableFailure()) {
@@ -1001,6 +1012,9 @@ LogicalResult transform::CollectMatchingOp::verifySymbolUses(
 // ForeachMatchOp
 //===----------------------------------------------------------------------===//
 
+// This is fine because nothing is actually consumed by this op.
+bool transform::ForeachMatchOp::allowsRepeatedHandleOperands() { return true; }
+
 DiagnosedSilenceableFailure
 transform::ForeachMatchOp::apply(transform::TransformRewriter &rewriter,
                                  transform::TransformResults &results,
@@ -1030,6 +1044,18 @@ transform::ForeachMatchOp::apply(transform::TransformRewriter &rewriter,
 
   DiagnosedSilenceableFailure overallDiag =
       DiagnosedSilenceableFailure::success();
+
+  SmallVector<SmallVector<MappedValue>> matchInputMapping;
+  SmallVector<SmallVector<MappedValue>> matchOutputMapping;
+  SmallVector<SmallVector<MappedValue>> actionResultMapping;
+  // Explicitly add the mapping for the first block argument (the op being
+  // matched).
+  matchInputMapping.emplace_back();
+  transform::detail::prepareValueMappings(matchInputMapping,
+                                          getForwardedInputs(), state);
+  SmallVector<MappedValue> &firstMatchArgument = matchInputMapping.front();
+  actionResultMapping.resize(getForwardedOutputs().size());
+
   for (Operation *root : state.getPayloadOps(getRoot())) {
     WalkResult walkResult = root->walk([&](Operation *op) {
       // If getRestrictRoot is not present, skip over the root op itself so we
@@ -1044,11 +1070,14 @@ transform::ForeachMatchOp::apply(transform::TransformRewriter &rewriter,
         llvm::dbgs() << " @" << op << "\n";
       });
 
+      firstMatchArgument.clear();
+      firstMatchArgument.push_back(op);
+
       // Try all the match/action pairs until the first successful match.
       for (auto [matcher, action] : matchActionPairs) {
-        SmallVector<SmallVector<MappedValue>> mappings;
         DiagnosedSilenceableFailure diag =
-            matchBlock(matcher.getFunctionBody().front(), op, state, mappings);
+            matchBlock(matcher.getFunctionBody().front(), matchInputMapping,
+                       state, matchOutputMapping);
         if (diag.isDefiniteFailure())
           return WalkResult::interrupt();
         if (diag.isSilenceableFailure()) {
@@ -1058,10 +1087,10 @@ transform::ForeachMatchOp::apply(transform::TransformRewriter &rewriter,
         }
 
         auto scope = state.make_region_scope(action.getFunctionBody());
-        for (auto &&[arg, map] : llvm::zip_equal(
-                 action.getFunctionBody().front().getArguments(), mappings)) {
-          if (failed(state.mapBlockArgument(arg, map)))
-            return WalkResult::interrupt();
+        if (failed(state.mapBlockArguments(
+                action.getFunctionBody().front().getArguments(),
+                matchOutputMapping))) {
+          return WalkResult::interrupt();
         }
 
         for (Operation &transform :
@@ -1082,6 +1111,16 @@ transform::ForeachMatchOp::apply(transform::TransformRewriter &rewriter,
             continue;
           }
         }
+        if (failed(detail::appendValueMappings(
+                MutableArrayRef<SmallVector<MappedValue>>(actionResultMapping),
+                action.getFunctionBody().front().getTerminator()->getOperands(),
+                state, getFlattenResults()))) {
+          emitDefiniteFailure()
+              << "action @" << action.getName()
+              << " has results associated with multiple payload entities, "
+                 "but flattening was not requested";
+          return WalkResult::interrupt();
+        }
         break;
       }
       return WalkResult::advance();
@@ -1096,9 +1135,21 @@ transform::ForeachMatchOp::apply(transform::TransformRewriter &rewriter,
   // by actions, are invalidated.
   results.set(llvm::cast<OpResult>(getUpdated()),
               state.getPayloadOps(getRoot()));
+  for (auto &&[result, mapping] :
+       llvm::zip_equal(getForwardedOutputs(), actionResultMapping)) {
+    results.setMappedValues(result, mapping);
+  }
   return overallDiag;
 }
 
+void transform::ForeachMatchOp::getAsmResultNames(
+    OpAsmSetValueNameFn setNameFn) {
+  setNameFn(getUpdated(), "updated_root");
+  for (Value v : getForwardedOutputs()) {
+    setNameFn(v, "yielded");
+  }
+}
+
 void transform::ForeachMatchOp::getEffects(
     SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
   // Bail if invalid.
@@ -1108,7 +1159,8 @@ void transform::ForeachMatchOp::getEffects(
   }
 
   consumesHandle(getRoot(), effects);
-  producesHandle(getUpdated(), effects);
+  onlyReadsHandle(getForwardedInputs(), effects);
+  producesHandle(getResults(), effects);
   modifiesPayload(effects);
 }
 
@@ -1224,6 +1276,7 @@ LogicalResult transform::ForeachMatchOp::verifySymbolUses(
       StringAttr::get(getContext(), TransformDialect::kArgConsumedAttrName);
   for (auto &&[matcher, action] :
        llvm::zip_equal(getMatchers(), getActions())) {
+    // Presence and typing.
     auto matcherSymbol = dyn_cast_or_null<FunctionOpInterface>(
         symbolTable.lookupNearestSymbolFrom(getOperation(),
                                             cast<SymbolRefAttr>(matcher)));
@@ -1250,8 +1303,41 @@ LogicalResult transform::ForeachMatchOp::verifySymbolUses(
       return failure();
     }
 
-    ArrayRef<Type> matcherResults = matcherSymbol.getResultTypes();
-    ArrayRef<Type> actionArguments = actionSymbol.getArgumentTypes();
+    // Input -> matcher forwarding.
+    TypeRange operandTypes = getOperandTypes();
+    TypeRange matcherArguments = matcherSymbol.getArgumentTypes();
+    if (operandTypes.size() != matcherArguments.size()) {
+      InFlightDiagnostic diag =
+          emitError() << "the number of operands (" << operandTypes.size()
+                      << ") doesn't match the number of matcher arguments ("
+                      << matcherArguments.size() << ") for " << matcher;
+      diag.attachNote(matcherSymbol->getLoc()) << "symbol declaration";
+      return diag;
+    }
+    for (auto &&[i, operand, argument] :
+         llvm::enumerate(operandTypes, matcherArguments)) {
+      if (matcherSymbol.getArgAttr(i, consumedAttr)) {
+        InFlightDiagnostic diag =
+            emitOpError()
+            << "does not expect matcher symbol to consume its operand #" << i;
+        diag.attachNote(matcherSymbol->getLoc()) << "symbol declaration";
+        return diag;
+      }
+
+      if (implementSameTransformInterface(operand, argument))
+        continue;
+
+      InFlightDiagnostic diag =
+          emitError()
+          << "mismatching type interfaces for operand and matcher argument #"
+          << i << " of matcher " << matcher;
+      diag.attachNote(matcherSymbol->getLoc()) << "symbol declaration";
+      return diag;
+    }
+
+    // Matcher -> action forwarding.
+    TypeRange matcherResults = matcherSymbol.getResultTypes();
+    TypeRange actionArguments = actionSymbol.getArgumentTypes();
     if (matcherResults.size() != actionArguments.size()) {
       return emitError() << "mismatching number of matcher results and "
                             "action arguments between "
@@ -1265,31 +1351,31 @@ LogicalResult transform::ForeachMatchOp::verifySymbolUses(
 
       return emitError() << "mismatching type interfaces for matcher result "
                             "and action argument #"
-                         << i;
+                         << i << "of matcher " << matcher << " and action "
+                         << action;
     }
 
-    if (!actionSymbol.getResultTypes().empty()) {
+    // Action -> result forwarding.
+    TypeRange actionResults = actionSymbol.getResultTypes();
+    auto resultTypes = TypeRange(getResultTypes()).drop_front();
+    if (actionResults.size() != resultTypes.size()) {
       InFlightDiagnostic diag =
-          emitError() << "action symbol is not expected to have results";
+          emitError() << "the number of action results ("
+                      << actionResults.size() << ") for " << action
+                      << " doesn't match the number of extra op results ("
+                      << resultTypes.size() << ")";
       diag.attachNote(actionSymbol->getLoc()) << "symbol declaration";
       return diag;
     }
+    for (auto &&[i, resultType, actionType] :
+         llvm::enumerate(resultTypes, actionResults)) {
+      if (implementSameTransformInterface(resultType, actionType))
+        continue;
 
-    if (matcherSymbol.getArgumentTypes().size() != 1 ||
-        !implementSameTransformInterface(matcherSymbol.getArgumentTypes()[0],
-                                         getRoot().getType())) {
-      InFlightDiagnostic diag =
-          emitOpError() << "expects matcher symbol to have one argument with "
-                           "the same transform interface as the first operand";
-      diag.attachNote(matcherSymbol->getLoc()) << "symbol declaration";
-      return diag;
-    }
-
-    if (matcherSymbol.getArgAttr(0, consumedAttr)) {
       InFlightDiagnostic diag =
-          emitOpError()
-          << "does not expect matcher symbol to consume its operand";
-      diag.attachNote(matcherSymbol->getLoc()) << "symbol declaration";
+          emitError() << "mismatching type interfaces for action result #" << i
+                      << " of action " << action << " and op result";
+      diag.attachNote(actionSymbol->getLoc()) << "symbol declaration";
       return diag;
     }
   }
diff --git a/mlir/lib/Dialect/Transform/IRDLExtension/CMakeLists.txt b/mlir/lib/Dialect/Transform/IRDLExtension/CMakeLists.txt
new file mode 100644
index 000000000000..9216a3d72202
--- /dev/null
+++ b/mlir/lib/Dialect/Transform/IRDLExtension/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_mlir_dialect_library(MLIRTransformDialectIRDLExtension
+  IRDLExtension.cpp
+  IRDLExtensionOps.cpp
+
+  DEPENDS
+  MLIRTransformDialectIRDLExtensionOpsIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRTransformDialect
+  MLIRIRDL
+)
diff --git a/mlir/lib/Dialect/Transform/IRDLExtension/IRDLExtension.cpp b/mlir/lib/Dialect/Transform/IRDLExtension/IRDLExtension.cpp
new file mode 100644
index 000000000000..94004365b8a1
--- /dev/null
+++ b/mlir/lib/Dialect/Transform/IRDLExtension/IRDLExtension.cpp
@@ -0,0 +1,34 @@
+//===- IRDLExtension.cpp - IRDL extension for the Transform dialect -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Transform/IRDLExtension/IRDLExtension.h"
+#include "mlir/Dialect/IRDL/IR/IRDL.h"
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+#include "mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.h"
+#include "mlir/IR/DialectRegistry.h"
+
+using namespace mlir;
+
+namespace {
+class IRDLExtension
+    : public transform::TransformDialectExtension<IRDLExtension> {
+public:
+  void init() {
+    registerTransformOps<
+#define GET_OP_LIST
+#include "mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.cpp.inc"
+        >();
+
+    declareDependentDialect<irdl::IRDLDialect>();
+  }
+};
+} // namespace
+
+void mlir::transform::registerIRDLExtension(DialectRegistry &dialectRegistry) {
+  dialectRegistry.addExtensions<IRDLExtension>();
+}
diff --git a/mlir/lib/Dialect/Transform/IRDLExtension/IRDLExtensionOps.cpp b/mlir/lib/Dialect/Transform/IRDLExtension/IRDLExtensionOps.cpp
new file mode 100644
index 000000000000..9cc579e65edf
--- /dev/null
+++ b/mlir/lib/Dialect/Transform/IRDLExtension/IRDLExtensionOps.cpp
@@ -0,0 +1,84 @@
+//===- IRDLExtensionOps.cpp - IRDL extension for the Transform dialect ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.h"
+#include "mlir/Dialect/IRDL/IR/IRDL.h"
+#include "mlir/Dialect/IRDL/IRDLVerifiers.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/ExtensibleDialect.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "llvm/ADT/STLExtras.h"
+
+using namespace mlir;
+
+#define GET_OP_CLASSES
+#include "mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.cpp.inc"
+
+namespace mlir::transform {
+
+DiagnosedSilenceableFailure
+IRDLCollectMatchingOp::apply(TransformRewriter &rewriter,
+                             TransformResults &results, TransformState &state) {
+  auto dialect = cast<irdl::DialectOp>(getBody().front().front());
+  Block &body = dialect.getBody().front();
+  irdl::OperationOp operation = *body.getOps<irdl::OperationOp>().begin();
+  auto verifier = irdl::createVerifier(
+      operation,
+      DenseMap<irdl::TypeOp, std::unique_ptr<DynamicTypeDefinition>>(),
+      DenseMap<irdl::AttributeOp, std::unique_ptr<DynamicAttrDefinition>>());
+
+  auto handlerID = getContext()->getDiagEngine().registerHandler(
+      [](Diagnostic &) { return success(); });
+  SmallVector<Operation *> matched;
+  for (Operation *payload : state.getPayloadOps(getRoot())) {
+    payload->walk([&](Operation *target) {
+      if (succeeded(verifier(target))) {
+        matched.push_back(target);
+      }
+    });
+  }
+  getContext()->getDiagEngine().eraseHandler(handlerID);
+  results.set(cast<OpResult>(getMatched()), matched);
+  return DiagnosedSilenceableFailure::success();
+}
+
+void IRDLCollectMatchingOp::getEffects(
+    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  onlyReadsHandle(getRoot(), effects);
+  producesHandle(getMatched(), effects);
+  onlyReadsPayload(effects);
+}
+
+LogicalResult IRDLCollectMatchingOp::verify() {
+  Block &bodyBlock = getBody().front();
+  if (!llvm::hasSingleElement(bodyBlock))
+    return emitOpError() << "expects a single operation in the body";
+
+  auto dialect = dyn_cast<irdl::DialectOp>(bodyBlock.front());
+  if (!dialect) {
+    return emitOpError() << "expects the body operation to be "
+                         << irdl::DialectOp::getOperationName();
+  }
+
+  // TODO: relax this by taking a symbol name of the operation to match, note
+  // that symbol name is also the name of the operation and we may want to
+  // divert from that to have constraints on-the-fly using IRDL.
+  auto irdlOperations = dialect.getOps<irdl::OperationOp>();
+  if (!llvm::hasSingleElement(irdlOperations))
+    return emitOpError() << "expects IRDL to contain exactly one operation";
+
+  if (!dialect.getOps<irdl::TypeOp>().empty() ||
+      !dialect.getOps<irdl::AttributeOp>().empty()) {
+    return emitOpError() << "IRDL types and attributes are not yet supported";
+  }
+
+  return success();
+}
+
+} // namespace mlir::transform
diff --git a/mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp b/mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp
index 48f3954b6cf6..b6a35e23a5d1 100644
--- a/mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp
+++ b/mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp
@@ -206,6 +206,15 @@ transform::TransformState::mapBlockArgument(BlockArgument argument,
       .checkAndReport();
 }
 
+LogicalResult transform::TransformState::mapBlockArguments(
+    Block::BlockArgListType arguments,
+    ArrayRef<SmallVector<MappedValue>> mapping) {
+  for (auto &&[argument, values] : llvm::zip_equal(arguments, mapping))
+    if (failed(mapBlockArgument(argument, values)))
+      return failure();
+  return success();
+}
+
 LogicalResult
 transform::TransformState::setPayloadOps(Value value,
                                          ArrayRef<Operation *> targets) {
@@ -1528,11 +1537,12 @@ void transform::detail::setApplyToOneResults(
 // Utilities for implementing transform ops with regions.
 //===----------------------------------------------------------------------===//
 
-void transform::detail::prepareValueMappings(
-    SmallVectorImpl<SmallVector<transform::MappedValue>> &mappings,
-    ValueRange values, const transform::TransformState &state) {
-  for (Value operand : values) {
-    SmallVector<MappedValue> &mapped = mappings.emplace_back();
+LogicalResult transform::detail::appendValueMappings(
+    MutableArrayRef<SmallVector<transform::MappedValue>> mappings,
+    ValueRange values, const transform::TransformState &state, bool flatten) {
+  assert(mappings.size() == values.size() && "mismatching number of mappings");
+  for (auto &&[operand, mapped] : llvm::zip_equal(values, mappings)) {
+    size_t mappedSize = mapped.size();
     if (llvm::isa<TransformHandleTypeInterface>(operand.getType())) {
       llvm::append_range(mapped, state.getPayloadOps(operand));
     } else if (llvm::isa<TransformValueHandleTypeInterface>(
@@ -1543,7 +1553,21 @@ void transform::detail::prepareValueMappings(
              "unsupported kind of transform dialect value");
       llvm::append_range(mapped, state.getParams(operand));
     }
+
+    if (mapped.size() - mappedSize != 1 && !flatten)
+      return failure();
   }
+  return success();
+}
+
+void transform::detail::prepareValueMappings(
+    SmallVectorImpl<SmallVector<transform::MappedValue>> &mappings,
+    ValueRange values, const transform::TransformState &state) {
+  mappings.resize(mappings.size() + values.size());
+  (void)appendValueMappings(
+      MutableArrayRef<SmallVector<transform::MappedValue>>(mappings).take_back(
+          values.size()),
+      values, state);
 }
 
 void transform::detail::forwardTerminatorOperands(
diff --git a/mlir/lib/Dialect/Transform/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Transform/Transforms/CMakeLists.txt
index f0f57874f5e7..9fed8c6b5caa 100644
--- a/mlir/lib/Dialect/Transform/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Transform/Transforms/CMakeLists.txt
@@ -3,7 +3,6 @@ add_mlir_dialect_library(MLIRTransformDialectTransforms
   InferEffects.cpp
   InterpreterPass.cpp
   PreloadLibraryPass.cpp
-  TransformInterpreterPassBase.cpp
   TransformInterpreterUtils.cpp
 
   DEPENDS
diff --git a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp
deleted file mode 100644
index efb9359e1995..000000000000
--- a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp
+++ /dev/null
@@ -1,457 +0,0 @@
-//===- TransformInterpreterPassBase.cpp -----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Base class with shared implementation for transform dialect interpreter
-// passes.
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h"
-#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformOps.h"
-#include "mlir/Dialect/Transform/IR/Utils.h"
-#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
-#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
-#include "mlir/IR/BuiltinOps.h"
-#include "mlir/IR/Verifier.h"
-#include "mlir/IR/Visitors.h"
-#include "mlir/Interfaces/FunctionInterfaces.h"
-#include "mlir/Parser/Parser.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Support/FileUtilities.h"
-#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/Mutex.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace mlir;
-
-#define DEBUG_TYPE "transform-dialect-interpreter"
-#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")
-#define DEBUG_TYPE_DUMP_STDERR "transform-dialect-dump-repro"
-#define DEBUG_TYPE_DUMP_FILE "transform-dialect-save-repro"
-
-/// Name of the attribute used for targeting the transform dialect interpreter
-/// at specific operations.
-constexpr static llvm::StringLiteral kTransformDialectTagAttrName =
-    "transform.target_tag";
-/// Value of the attribute indicating the root payload operation.
-constexpr static llvm::StringLiteral kTransformDialectTagPayloadRootValue =
-    "payload_root";
-/// Value of the attribute indicating the container of transform operations
-/// (containing the top-level transform operation).
-constexpr static llvm::StringLiteral
-    kTransformDialectTagTransformContainerValue = "transform_container";
-
-/// Finds the single top-level transform operation with `root` as ancestor.
-/// Reports an error if there is more than one such operation and returns the
-/// first one found. Reports an error returns nullptr if no such operation
-/// found.
-static Operation *
-findTopLevelTransform(Operation *root, StringRef filenameOption,
-                      mlir::transform::TransformOptions options) {
-  ::mlir::transform::TransformOpInterface topLevelTransform = nullptr;
-  root->walk<WalkOrder::PreOrder>(
-      [&](::mlir::transform::TransformOpInterface transformOp) {
-        if (!transformOp
-                 ->hasTrait<transform::PossibleTopLevelTransformOpTrait>())
-          return WalkResult::skip();
-        if (!topLevelTransform) {
-          topLevelTransform = transformOp;
-          return WalkResult::skip();
-        }
-        if (options.getEnforceSingleToplevelTransformOp()) {
-          auto diag = transformOp.emitError()
-                      << "more than one top-level transform op";
-          diag.attachNote(topLevelTransform.getLoc())
-              << "previous top-level transform op";
-          return WalkResult::interrupt();
-        }
-        return WalkResult::skip();
-      });
-  if (!topLevelTransform) {
-    auto diag = root->emitError()
-                << "could not find a nested top-level transform op";
-    diag.attachNote() << "use the '" << filenameOption
-                      << "' option to provide transform as external file";
-    return nullptr;
-  }
-  return topLevelTransform;
-}
-
-/// Finds an operation nested in `root` that has the transform dialect tag
-/// attribute with the value specified as `tag`. Assumes only one operation
-/// may have the tag. Returns nullptr if there is no such operation.
-static Operation *findOpWithTag(Operation *root, StringRef tagKey,
-                                StringRef tagValue) {
-  Operation *found = nullptr;
-  WalkResult walkResult = root->walk<WalkOrder::PreOrder>(
-      [tagKey, tagValue, &found, root](Operation *op) {
-        auto attr = op->getAttrOfType<StringAttr>(tagKey);
-        if (!attr || attr.getValue() != tagValue)
-          return WalkResult::advance();
-
-        if (found) {
-          InFlightDiagnostic diag = root->emitError()
-                                    << "more than one operation with " << tagKey
-                                    << "=\"" << tagValue << "\" attribute";
-          diag.attachNote(found->getLoc()) << "first operation";
-          diag.attachNote(op->getLoc()) << "other operation";
-          return WalkResult::interrupt();
-        }
-
-        found = op;
-        return WalkResult::advance();
-      });
-  if (walkResult.wasInterrupted())
-    return nullptr;
-
-  if (!found) {
-    root->emitError() << "could not find the operation with " << tagKey << "=\""
-                      << tagValue << "\" attribute";
-  }
-  return found;
-}
-
-/// Returns the ancestor of `target` that doesn't have a parent.
-static Operation *getRootOperation(Operation *target) {
-  Operation *root = target;
-  while (root->getParentOp())
-    root = root->getParentOp();
-  return root;
-}
-
-/// Prints the CLI command running the repro with the current path.
-// TODO: make binary name optional by querying LLVM command line API for the
-// name of the current binary.
-static llvm::raw_ostream &
-printReproCall(llvm::raw_ostream &os, StringRef rootOpName, StringRef passName,
-               const Pass::Option<std::string> &debugPayloadRootTag,
-               const Pass::Option<std::string> &debugTransformRootTag,
-               StringRef binaryName) {
-  os << llvm::formatv(
-      "{6} --pass-pipeline=\"{0}({1}{{{2}={3} {4}={5}})\"", rootOpName,
-      passName, debugPayloadRootTag.getArgStr(),
-      debugPayloadRootTag.empty()
-          ? StringRef(kTransformDialectTagPayloadRootValue)
-          : debugPayloadRootTag,
-      debugTransformRootTag.getArgStr(),
-      debugTransformRootTag.empty()
-          ? StringRef(kTransformDialectTagTransformContainerValue)
-          : debugTransformRootTag,
-      binaryName);
-  return os;
-}
-
-/// Prints the module rooted at `root` to `os` and appends
-/// `transformContainer` if it is not nested in `root`.
-static llvm::raw_ostream &printModuleForRepro(llvm::raw_ostream &os,
-                                              Operation *root,
-                                              Operation *transform) {
-  root->print(os);
-  if (!root->isAncestor(transform))
-    transform->print(os);
-  return os;
-}
-
-/// Saves the payload and the transform IR into a temporary file and reports
-/// the file name to `os`.
-[[maybe_unused]] static void
-saveReproToTempFile(llvm::raw_ostream &os, Operation *target,
-                    Operation *transform, StringRef passName,
-                    const Pass::Option<std::string> &debugPayloadRootTag,
-                    const Pass::Option<std::string> &debugTransformRootTag,
-                    const Pass::ListOption<std::string> &transformLibraryPaths,
-                    StringRef binaryName) {
-  using llvm::sys::fs::TempFile;
-  Operation *root = getRootOperation(target);
-
-  SmallVector<char, 128> tmpPath;
-  llvm::sys::path::system_temp_directory(/*erasedOnReboot=*/true, tmpPath);
-  llvm::sys::path::append(tmpPath, "transform_dialect_%%%%%%.mlir");
-  llvm::Expected<TempFile> tempFile = TempFile::create(tmpPath);
-  if (!tempFile) {
-    os << "could not open temporary file to save the repro\n";
-    return;
-  }
-
-  llvm::raw_fd_ostream fout(tempFile->FD, /*shouldClose=*/false);
-  printModuleForRepro(fout, root, transform);
-  fout.flush();
-  std::string filename = tempFile->TmpName;
-
-  if (tempFile->keep()) {
-    os << "could not preserve the temporary file with the repro\n";
-    return;
-  }
-
-  os << "=== Transform Interpreter Repro ===\n";
-  printReproCall(os, root->getName().getStringRef(), passName,
-                 debugPayloadRootTag, debugTransformRootTag, binaryName)
-      << " " << filename << "\n";
-  os << "===================================\n";
-}
-
-// Optionally perform debug actions requested by the user to dump IR and a
-// repro to stderr and/or a file.
-static void performOptionalDebugActions(
-    Operation *target, Operation *transform, StringRef passName,
-    const Pass::Option<std::string> &debugPayloadRootTag,
-    const Pass::Option<std::string> &debugTransformRootTag,
-    const Pass::ListOption<std::string> &transformLibraryPaths,
-    StringRef binaryName) {
-  MLIRContext *context = target->getContext();
-
-  // If we are not planning to print, bail early.
-  bool hasDebugFlags = false;
-  DEBUG_WITH_TYPE(DEBUG_TYPE_DUMP_STDERR, { hasDebugFlags = true; });
-  DEBUG_WITH_TYPE(DEBUG_TYPE_DUMP_FILE, { hasDebugFlags = true; });
-  if (!hasDebugFlags)
-    return;
-
-  // We will be mutating the IR to set attributes. If this is running
-  // concurrently on several parts of a container or using a shared transform
-  // script, this would create a race. Bail in multithreaded mode and require
-  // the user to disable threading to dump repros.
-  static llvm::sys::SmartMutex<true> dbgStreamMutex;
-  if (target->getContext()->isMultithreadingEnabled()) {
-    llvm::sys::SmartScopedLock<true> lock(dbgStreamMutex);
-    llvm::dbgs() << "=======================================================\n";
-    llvm::dbgs() << "|      Transform reproducers cannot be produced       |\n";
-    llvm::dbgs() << "|              in multi-threaded mode!                |\n";
-    llvm::dbgs() << "=======================================================\n";
-    return;
-  }
-
-  Operation *root = getRootOperation(target);
-
-  // Add temporary debug / repro attributes, these must never leak out.
-  if (debugPayloadRootTag.empty()) {
-    target->setAttr(
-        kTransformDialectTagAttrName,
-        StringAttr::get(context, kTransformDialectTagPayloadRootValue));
-  }
-  if (debugTransformRootTag.empty()) {
-    transform->setAttr(
-        kTransformDialectTagAttrName,
-        StringAttr::get(context, kTransformDialectTagTransformContainerValue));
-  }
-
-  DEBUG_WITH_TYPE(DEBUG_TYPE_DUMP_STDERR, {
-    llvm::dbgs() << "=== Transform Interpreter Repro ===\n";
-    printReproCall(llvm::dbgs() << "cat <<EOF | ",
-                   root->getName().getStringRef(), passName,
-                   debugPayloadRootTag, debugTransformRootTag, binaryName)
-        << "\n";
-    printModuleForRepro(llvm::dbgs(), root, transform);
-    llvm::dbgs() << "\nEOF\n";
-    llvm::dbgs() << "===================================\n";
-  });
-  (void)root;
-  DEBUG_WITH_TYPE(DEBUG_TYPE_DUMP_FILE, {
-    saveReproToTempFile(llvm::dbgs(), target, transform, passName,
-                        debugPayloadRootTag, debugTransformRootTag,
-                        transformLibraryPaths, binaryName);
-  });
-
-  // Remove temporary attributes if they were set.
-  if (debugPayloadRootTag.empty())
-    target->removeAttr(kTransformDialectTagAttrName);
-  if (debugTransformRootTag.empty())
-    transform->removeAttr(kTransformDialectTagAttrName);
-}
-
-LogicalResult transform::detail::interpreterBaseRunOnOperationImpl(
-    Operation *target, StringRef passName,
-    const std::shared_ptr<OwningOpRef<ModuleOp>> &sharedTransformModule,
-    const std::shared_ptr<OwningOpRef<ModuleOp>> &transformLibraryModule,
-    const RaggedArray<MappedValue> &extraMappings,
-    const TransformOptions &options,
-    const Pass::Option<std::string> &transformFileName,
-    const Pass::ListOption<std::string> &transformLibraryPaths,
-    const Pass::Option<std::string> &debugPayloadRootTag,
-    const Pass::Option<std::string> &debugTransformRootTag,
-    StringRef binaryName) {
-  bool hasSharedTransformModule =
-      sharedTransformModule && *sharedTransformModule;
-  bool hasTransformLibraryModule =
-      transformLibraryModule && *transformLibraryModule;
-  assert((!hasSharedTransformModule || !hasTransformLibraryModule) &&
-         "at most one of shared or library transform module can be set");
-
-  // Step 1
-  // ------
-  // If debugPayloadRootTag was passed, then we are in user-specified selection
-  // of the transformed IR. This corresponds to REPL debug mode. Otherwise, just
-  // apply to `target`.
-  Operation *payloadRoot = target;
-  if (!debugPayloadRootTag.empty()) {
-    payloadRoot = findOpWithTag(target, kTransformDialectTagAttrName,
-                                debugPayloadRootTag);
-    if (!payloadRoot)
-      return failure();
-  }
-
-  // Step 2
-  // ------
-  // If a shared transform was specified separately, use it. Otherwise, the
-  // transform is embedded in the payload IR. If debugTransformRootTag was
-  // passed, then we are in user-specified selection of the transforming IR.
-  // This corresponds to REPL debug mode.
-  Operation *transformContainer =
-      hasSharedTransformModule ? sharedTransformModule->get() : target;
-  Operation *transformRoot =
-      debugTransformRootTag.empty()
-          ? findTopLevelTransform(transformContainer,
-                                  transformFileName.getArgStr(), options)
-          : findOpWithTag(transformContainer, kTransformDialectTagAttrName,
-                          debugTransformRootTag);
-  if (!transformRoot)
-    return failure();
-
-  if (!transformRoot->hasTrait<PossibleTopLevelTransformOpTrait>()) {
-    return emitError(transformRoot->getLoc())
-           << "expected the transform entry point to be a top-level transform "
-              "op";
-  }
-
-  // Step 3
-  // ------
-  // Copy external defintions for symbols if provided. Be aware of potential
-  // concurrent execution (normally, the error shouldn't be triggered unless the
-  // transform IR modifies itself in a pass, which is also forbidden elsewhere).
-  if (hasTransformLibraryModule) {
-    if (!target->isProperAncestor(transformRoot)) {
-      InFlightDiagnostic diag =
-          transformRoot->emitError()
-          << "cannot inject transform definitions next to pass anchor op";
-      diag.attachNote(target->getLoc()) << "pass anchor op";
-      return diag;
-    }
-    InFlightDiagnostic diag = detail::mergeSymbolsInto(
-        SymbolTable::getNearestSymbolTable(transformRoot),
-        transformLibraryModule->get()->clone());
-    if (failed(diag)) {
-      diag.attachNote(transformRoot->getLoc())
-          << "failed to merge library symbols into transform root";
-      return diag;
-    }
-  }
-
-  // Step 4
-  // ------
-  // Optionally perform debug actions requested by the user to dump IR and a
-  // repro to stderr and/or a file.
-  performOptionalDebugActions(target, transformRoot, passName,
-                              debugPayloadRootTag, debugTransformRootTag,
-                              transformLibraryPaths, binaryName);
-
-  // Step 5
-  // ------
-  // Apply the transform to the IR
-  return applyTransforms(payloadRoot, cast<TransformOpInterface>(transformRoot),
-                         extraMappings, options);
-}
-
-LogicalResult transform::detail::interpreterBaseInitializeImpl(
-    MLIRContext *context, StringRef transformFileName,
-    ArrayRef<std::string> transformLibraryPaths,
-    std::shared_ptr<OwningOpRef<ModuleOp>> &sharedTransformModule,
-    std::shared_ptr<OwningOpRef<ModuleOp>> &transformLibraryModule,
-    function_ref<std::optional<LogicalResult>(OpBuilder &, Location)>
-        moduleBuilder) {
-  auto unknownLoc = UnknownLoc::get(context);
-
-  // Parse module from file.
-  OwningOpRef<ModuleOp> moduleFromFile;
-  {
-    auto loc = FileLineColLoc::get(context, transformFileName, 0, 0);
-    if (failed(detail::parseTransformModuleFromFile(context, transformFileName,
-                                                    moduleFromFile)))
-      return emitError(loc) << "failed to parse transform module";
-    if (moduleFromFile && failed(mlir::verify(*moduleFromFile)))
-      return emitError(loc) << "failed to verify transform module";
-  }
-
-  // Assemble list of library files.
-  SmallVector<std::string> libraryFileNames;
-  if (failed(expandPathsToMLIRFiles(transformLibraryPaths, context,
-                                    libraryFileNames)))
-    return failure();
-
-  // Parse modules from library files.
-  SmallVector<OwningOpRef<ModuleOp>> parsedLibraries;
-  for (const std::string &libraryFileName : libraryFileNames) {
-    OwningOpRef<ModuleOp> parsedLibrary;
-    auto loc = FileLineColLoc::get(context, libraryFileName, 0, 0);
-    if (failed(detail::parseTransformModuleFromFile(context, libraryFileName,
-                                                    parsedLibrary)))
-      return emitError(loc) << "failed to parse transform library module";
-    if (parsedLibrary && failed(mlir::verify(*parsedLibrary)))
-      return emitError(loc) << "failed to verify transform library module";
-    parsedLibraries.push_back(std::move(parsedLibrary));
-  }
-
-  // Build shared transform module.
-  if (moduleFromFile) {
-    sharedTransformModule =
-        std::make_shared<OwningOpRef<ModuleOp>>(std::move(moduleFromFile));
-  } else if (moduleBuilder) {
-    auto loc = FileLineColLoc::get(context, "<shared-transform-module>", 0, 0);
-    auto localModule = std::make_shared<OwningOpRef<ModuleOp>>(
-        ModuleOp::create(unknownLoc, "__transform"));
-
-    OpBuilder b(context);
-    b.setInsertionPointToEnd(localModule->get().getBody());
-    if (std::optional<LogicalResult> result = moduleBuilder(b, loc)) {
-      if (failed(*result))
-        return (*localModule)->emitError()
-               << "failed to create shared transform module";
-      sharedTransformModule = std::move(localModule);
-    }
-  }
-
-  if (parsedLibraries.empty())
-    return success();
-
-  // Merge parsed libraries into one module.
-  auto loc = FileLineColLoc::get(context, "<shared-library-module>", 0, 0);
-  OwningOpRef<ModuleOp> mergedParsedLibraries =
-      ModuleOp::create(loc, "__transform");
-  {
-    mergedParsedLibraries.get()->setAttr("transform.with_named_sequence",
-                                         UnitAttr::get(context));
-    IRRewriter rewriter(context);
-    // TODO: extend `mergeSymbolsInto` to support multiple `other` modules.
-    for (OwningOpRef<ModuleOp> &parsedLibrary : parsedLibraries) {
-      if (failed(detail::mergeSymbolsInto(mergedParsedLibraries.get(),
-                                          std::move(parsedLibrary))))
-        return mergedParsedLibraries->emitError()
-               << "failed to verify merged transform module";
-    }
-  }
-
-  // Use parsed libaries to resolve symbols in shared transform module or return
-  // as separate library module.
-  if (sharedTransformModule && *sharedTransformModule) {
-    if (failed(detail::mergeSymbolsInto(sharedTransformModule->get(),
-                                        std::move(mergedParsedLibraries))))
-      return (*sharedTransformModule)->emitError()
-             << "failed to merge symbols from library files "
-                "into shared transform module";
-  } else {
-    transformLibraryModule = std::make_shared<OwningOpRef<ModuleOp>>(
-        std::move(mergedParsedLibraries));
-  }
-  return success();
-}
diff --git a/mlir/lib/Dialect/Transform/Utils/Utils.cpp b/mlir/lib/Dialect/Transform/Utils/Utils.cpp
index 08068d285b4c..2ce21fe8a9c1 100644
--- a/mlir/lib/Dialect/Transform/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Transform/Utils/Utils.cpp
@@ -20,7 +20,11 @@ void mlir::transform::printPackedOrDynamicIndexList(
   if (packed) {
     assert(values.empty() && (!integers || integers.empty()) &&
            "expected no values/integers");
-    printer << "*(" << packed << " : " << packedType << ")";
+    printer << "*(" << packed;
+    if (packedType) {
+      printer << " : " << packedType;
+    }
+    printer << ")";
     return;
   }
   printDynamicIndexList(printer, op, values, integers, valueTypes);
@@ -29,19 +33,20 @@ void mlir::transform::printPackedOrDynamicIndexList(
 ParseResult mlir::transform::parsePackedOrDynamicIndexList(
     OpAsmParser &parser, std::optional<OpAsmParser::UnresolvedOperand> &packed,
     Type &packedType, SmallVectorImpl<OpAsmParser::UnresolvedOperand> &values,
-    SmallVectorImpl<Type> &valueTypes, DenseI64ArrayAttr &integers) {
+    SmallVectorImpl<Type> *valueTypes, DenseI64ArrayAttr &integers) {
   OpAsmParser::UnresolvedOperand packedOperand;
   if (parser.parseOptionalStar().succeeded()) {
     if (parser.parseLParen().failed() ||
-        parser.parseOperand(packedOperand).failed() ||
-        parser.parseColonType(packedType).failed() ||
-        parser.parseRParen().failed()) {
+        parser.parseOperand(packedOperand).failed())
+      return failure();
+    if (packedType && (parser.parseColonType(packedType).failed()))
+      return failure();
+    if (parser.parseRParen().failed())
       return failure();
-    }
     packed.emplace(packedOperand);
     integers = parser.getBuilder().getDenseI64ArrayAttr({});
     return success();
   }
 
-  return parseDynamicIndexList(parser, values, integers, &valueTypes);
+  return parseDynamicIndexList(parser, values, integers, valueTypes);
 }
diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp
index 792550dcfaf2..7011c478fefb 100644
--- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp
@@ -352,6 +352,7 @@ public:
       return success();
     }
 
+    // TODO: Add support for scalable vectors
     if (inputType.isScalable())
       return failure();
 
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index a301b919dc52..6025c4ad7c14 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -1119,8 +1119,9 @@ struct RewriteAlignedSubByteIntExt : OpRewritePattern<ConversionOpType> {
                                 PatternRewriter &rewriter) const override {
     // Verify the preconditions.
     Value srcValue = conversionOp.getIn();
-    auto srcVecType = cast<VectorType>(srcValue.getType());
-    auto dstVecType = cast<VectorType>(conversionOp.getType());
+    auto srcVecType = dyn_cast<VectorType>(srcValue.getType());
+    auto dstVecType = dyn_cast<VectorType>(conversionOp.getType());
+
     if (failed(
             commonConversionPrecondition(rewriter, dstVecType, conversionOp)))
       return failure();
diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp
index e915b97d9ff1..9a5c51ba738f 100644
--- a/mlir/lib/IR/AsmPrinter.cpp
+++ b/mlir/lib/IR/AsmPrinter.cpp
@@ -189,6 +189,11 @@ struct AsmPrinterOptions {
       "mlir-print-value-users", llvm::cl::init(false),
       llvm::cl::desc(
           "Print users of operation results and block arguments as a comment")};
+
+  llvm::cl::opt<bool> printUniqueSSAIDs{
+      "mlir-print-unique-ssa-ids", llvm::cl::init(false),
+      llvm::cl::desc("Print unique SSA ID numbers for values, block arguments "
+                     "and naming conflicts across all regions")};
 };
 } // namespace
 
@@ -206,7 +211,7 @@ OpPrintingFlags::OpPrintingFlags()
     : printDebugInfoFlag(false), printDebugInfoPrettyFormFlag(false),
       printGenericOpFormFlag(false), skipRegionsFlag(false),
       assumeVerifiedFlag(false), printLocalScope(false),
-      printValueUsersFlag(false) {
+      printValueUsersFlag(false), printUniqueSSAIDsFlag(false) {
   // Initialize based upon command line options, if they are available.
   if (!clOptions.isConstructed())
     return;
@@ -224,6 +229,7 @@ OpPrintingFlags::OpPrintingFlags()
   printLocalScope = clOptions->printLocalScopeOpt;
   skipRegionsFlag = clOptions->skipRegionsOpt;
   printValueUsersFlag = clOptions->printValueUsers;
+  printUniqueSSAIDsFlag = clOptions->printUniqueSSAIDs;
 }
 
 /// Enable the elision of large elements attributes, by printing a '...'
@@ -350,6 +356,11 @@ bool OpPrintingFlags::shouldPrintValueUsers() const {
   return printValueUsersFlag;
 }
 
+/// Return if the printer should use unique IDs.
+bool OpPrintingFlags::shouldPrintUniqueSSAIDs() const {
+  return printUniqueSSAIDsFlag || shouldPrintGenericOpForm();
+}
+
 //===----------------------------------------------------------------------===//
 // NewLineCounter
 //===----------------------------------------------------------------------===//
@@ -1369,8 +1380,14 @@ SSANameState::SSANameState(Operation *op, const OpPrintingFlags &printerFlags)
   while (!nameContext.empty()) {
     Region *region;
     UsedNamesScopeTy *parentScope;
-    std::tie(region, nextValueID, nextArgumentID, nextConflictID, parentScope) =
-        nameContext.pop_back_val();
+
+    if (printerFlags.shouldPrintUniqueSSAIDs())
+      // To print unique SSA IDs, ignore saved ID counts from parent regions
+      std::tie(region, std::ignore, std::ignore, std::ignore, parentScope) =
+          nameContext.pop_back_val();
+    else
+      std::tie(region, nextValueID, nextArgumentID, nextConflictID,
+               parentScope) = nameContext.pop_back_val();
 
     // When we switch from one subtree to another, pop the scopes(needless)
     // until the parent scope.
diff --git a/mlir/lib/IR/AttributeDetail.h b/mlir/lib/IR/AttributeDetail.h
index dcd24af0107d..26d40ac3a38f 100644
--- a/mlir/lib/IR/AttributeDetail.h
+++ b/mlir/lib/IR/AttributeDetail.h
@@ -261,7 +261,7 @@ struct DenseStringElementsAttrStorage : public DenseElementsAttributeStorage {
     // Check to see if this storage represents a splat. If it doesn't then
     // combine the hash for the data starting with the first non splat element.
     for (size_t i = 1, e = data.size(); i != e; i++)
-      if (!firstElt.equals(data[i]))
+      if (firstElt != data[i])
         return KeyTy(ty, data, llvm::hash_combine(hashVal, data.drop_front(i)));
 
     // Otherwise, this is a splat so just return the hash of the first element.
diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp
index a2738946de41..179797cb943a 100644
--- a/mlir/lib/IR/BuiltinTypes.cpp
+++ b/mlir/lib/IR/BuiltinTypes.cpp
@@ -408,24 +408,24 @@ unsigned BaseMemRefType::getMemorySpaceAsInt() const {
 // MemRefType
 //===----------------------------------------------------------------------===//
 
-/// Given an `originalShape` and a `reducedShape` assumed to be a subset of
-/// `originalShape` with some `1` entries erased, return the set of indices
-/// that specifies which of the entries of `originalShape` are dropped to obtain
-/// `reducedShape`. The returned mask can be applied as a projection to
-/// `originalShape` to obtain the `reducedShape`. This mask is useful to track
-/// which dimensions must be kept when e.g. compute MemRef strides under
-/// rank-reducing operations. Return std::nullopt if reducedShape cannot be
-/// obtained by dropping only `1` entries in `originalShape`.
 std::optional<llvm::SmallDenseSet<unsigned>>
 mlir::computeRankReductionMask(ArrayRef<int64_t> originalShape,
-                               ArrayRef<int64_t> reducedShape) {
+                               ArrayRef<int64_t> reducedShape,
+                               bool matchDynamic) {
   size_t originalRank = originalShape.size(), reducedRank = reducedShape.size();
   llvm::SmallDenseSet<unsigned> unusedDims;
   unsigned reducedIdx = 0;
   for (unsigned originalIdx = 0; originalIdx < originalRank; ++originalIdx) {
     // Greedily insert `originalIdx` if match.
-    if (reducedIdx < reducedRank &&
-        originalShape[originalIdx] == reducedShape[reducedIdx]) {
+    int64_t origSize = originalShape[originalIdx];
+    // if `matchDynamic`, count dynamic dims as a match, unless `origSize` is 1.
+    if (matchDynamic && reducedIdx < reducedRank && origSize != 1 &&
+        (ShapedType::isDynamic(reducedShape[reducedIdx]) ||
+         ShapedType::isDynamic(origSize))) {
+      reducedIdx++;
+      continue;
+    }
+    if (reducedIdx < reducedRank && origSize == reducedShape[reducedIdx]) {
       reducedIdx++;
       continue;
     }
@@ -433,7 +433,7 @@ mlir::computeRankReductionMask(ArrayRef<int64_t> originalShape,
     unusedDims.insert(originalIdx);
     // If no match on `originalIdx`, the `originalShape` at this dimension
     // must be 1, otherwise we bail.
-    if (originalShape[originalIdx] != 1)
+    if (origSize != 1)
       return std::nullopt;
   }
   // The whole reducedShape must be scanned, otherwise we bail.
diff --git a/mlir/lib/Interfaces/ViewLikeInterface.cpp b/mlir/lib/Interfaces/ViewLikeInterface.cpp
index 6d1ff03756ac..ca33636336bf 100644
--- a/mlir/lib/Interfaces/ViewLikeInterface.cpp
+++ b/mlir/lib/Interfaces/ViewLikeInterface.cpp
@@ -113,7 +113,7 @@ static char getRightDelimiter(AsmParser::Delimiter delimiter) {
 void mlir::printDynamicIndexList(OpAsmPrinter &printer, Operation *op,
                                  OperandRange values,
                                  ArrayRef<int64_t> integers,
-                                 TypeRange valueTypes, ArrayRef<bool> scalables,
+                                 ArrayRef<bool> scalables, TypeRange valueTypes,
                                  AsmParser::Delimiter delimiter) {
   char leftDelimiter = getLeftDelimiter(delimiter);
   char rightDelimiter = getRightDelimiter(delimiter);
diff --git a/mlir/lib/TableGen/Builder.cpp b/mlir/lib/TableGen/Builder.cpp
index 47a2f6cc4456..044765c72601 100644
--- a/mlir/lib/TableGen/Builder.cpp
+++ b/mlir/lib/TableGen/Builder.cpp
@@ -52,7 +52,7 @@ Builder::Builder(const llvm::Record *record, ArrayRef<SMLoc> loc)
   // Initialize the parameters of the builder.
   const llvm::DagInit *dag = def->getValueAsDag("dagParams");
   auto *defInit = dyn_cast<llvm::DefInit>(dag->getOperator());
-  if (!defInit || !defInit->getDef()->getName().equals("ins"))
+  if (!defInit || defInit->getDef()->getName() != "ins")
     PrintFatalError(def->getLoc(), "expected 'ins' in builders");
 
   bool seenDefaultValue = false;
diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
index 1bdb74cd8bf2..7db7163bac4a 100644
--- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp
+++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
@@ -293,9 +293,16 @@ static bool shouldBeInlined(ExpressionOp expressionOp) {
   if (!result.hasOneUse())
     return false;
 
+  Operation *user = *result.getUsers().begin();
+
+  // Do not inline expressions used by subscript operations, since the
+  // way the subscript operation translation is implemented requires that
+  // variables be materialized.
+  if (isa<emitc::SubscriptOp>(user))
+    return false;
+
   // Do not inline expressions used by other expressions, as any desired
   // expression folding was taken care of by transformations.
-  Operation *user = *result.getUsers().begin();
   return !user->getParentOfType<ExpressionOp>();
 }
 
diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
index 2de5e372d88c..2aa1b6b85ac0 100644
--- a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
@@ -406,6 +406,15 @@ llvm::DILocation *DebugTranslation::translateLoc(Location loc,
   if (auto callLoc = dyn_cast<CallSiteLoc>(loc)) {
     // For callsites, the caller is fed as the inlinedAt for the callee.
     auto *callerLoc = translateLoc(callLoc.getCaller(), scope, inlinedAt);
+    // If the caller scope is not translatable, the overall callsite cannot be
+    // represented in LLVM (the callee scope may not match the parent function).
+    if (!callerLoc) {
+      // If there is an inlinedAt scope (an outer caller), skip to that
+      // directly. Otherwise, cannot translate.
+      if (!inlinedAt)
+        return nullptr;
+      callerLoc = inlinedAt;
+    }
     llvmLoc = translateLoc(callLoc.getCallee(), nullptr, callerLoc);
     // Fallback: Ignore callee if it has no debug scope.
     if (!llvmLoc)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
index 40d8253d822f..06673965245c 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
@@ -93,7 +93,7 @@ static LogicalResult setProfilingAttr(OpBuilder &builder, llvm::MDNode *node,
     return failure();
 
   // Handle function entry count metadata.
-  if (name->getString().equals("function_entry_count")) {
+  if (name->getString() == "function_entry_count") {
 
     // TODO support function entry count metadata with GUID fields.
     if (node->getNumOperands() != 2)
@@ -111,7 +111,7 @@ static LogicalResult setProfilingAttr(OpBuilder &builder, llvm::MDNode *node,
            << "expected function_entry_count to be attached to a function";
   }
 
-  if (!name->getString().equals("branch_weights"))
+  if (name->getString() != "branch_weights")
     return failure();
 
   // Handle branch weights metadata.
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 9f87f89d8c63..a7294632d666 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -33,6 +33,9 @@
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 
 #include <any>
+#include <cstdint>
+#include <iterator>
+#include <numeric>
 #include <optional>
 #include <utility>
 
@@ -878,36 +881,40 @@ static void collectReductionInfo(
 }
 
 /// handling of DeclareReductionOp's cleanup region
-static LogicalResult inlineReductionCleanup(
-    llvm::SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
-    llvm::ArrayRef<llvm::Value *> privateReductionVariables,
-    LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase &builder) {
-  for (auto [i, reductionDecl] : llvm::enumerate(reductionDecls)) {
-    Region &cleanupRegion = reductionDecl.getCleanupRegion();
-    if (cleanupRegion.empty())
+static LogicalResult
+inlineOmpRegionCleanup(llvm::SmallVectorImpl<Region *> &cleanupRegions,
+                       llvm::ArrayRef<llvm::Value *> privateVariables,
+                       LLVM::ModuleTranslation &moduleTranslation,
+                       llvm::IRBuilderBase &builder, StringRef regionName,
+                       bool shouldLoadCleanupRegionArg = true) {
+  for (auto [i, cleanupRegion] : llvm::enumerate(cleanupRegions)) {
+    if (cleanupRegion->empty())
       continue;
 
     // map the argument to the cleanup region
-    Block &entry = cleanupRegion.front();
+    Block &entry = cleanupRegion->front();
 
     llvm::Instruction *potentialTerminator =
         builder.GetInsertBlock()->empty() ? nullptr
                                           : &builder.GetInsertBlock()->back();
     if (potentialTerminator && potentialTerminator->isTerminator())
       builder.SetInsertPoint(potentialTerminator);
-    llvm::Value *reductionVar = builder.CreateLoad(
-        moduleTranslation.convertType(entry.getArgument(0).getType()),
-        privateReductionVariables[i]);
+    llvm::Value *prviateVarValue =
+        shouldLoadCleanupRegionArg
+            ? builder.CreateLoad(
+                  moduleTranslation.convertType(entry.getArgument(0).getType()),
+                  privateVariables[i])
+            : privateVariables[i];
 
-    moduleTranslation.mapValue(entry.getArgument(0), reductionVar);
+    moduleTranslation.mapValue(entry.getArgument(0), prviateVarValue);
 
-    if (failed(inlineConvertOmpRegions(cleanupRegion, "omp.reduction.cleanup",
-                                       builder, moduleTranslation)))
+    if (failed(inlineConvertOmpRegions(*cleanupRegion, regionName, builder,
+                                       moduleTranslation)))
       return failure();
 
     // clear block argument mapping in case it needs to be re-created with a
     // different source for another use of the same reduction decl
-    moduleTranslation.forgetMapping(cleanupRegion);
+    moduleTranslation.forgetMapping(*cleanupRegion);
   }
   return success();
 }
@@ -1110,8 +1117,14 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
   builder.restoreIP(nextInsertionPoint);
 
   // after the workshare loop, deallocate private reduction variables
-  return inlineReductionCleanup(reductionDecls, privateReductionVariables,
-                                moduleTranslation, builder);
+  SmallVector<Region *> reductionRegions;
+  llvm::transform(reductionDecls, std::back_inserter(reductionRegions),
+                  [](omp::DeclareReductionOp reductionDecl) {
+                    return &reductionDecl.getCleanupRegion();
+                  });
+  return inlineOmpRegionCleanup(reductionRegions, privateReductionVariables,
+                                moduleTranslation, builder,
+                                "omp.reduction.cleanup");
 }
 
 /// A RAII class that on construction replaces the region arguments of the
@@ -1267,6 +1280,9 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
     }
   };
 
+  SmallVector<omp::PrivateClauseOp> privatizerClones;
+  SmallVector<llvm::Value *> privateVariables;
+
   // TODO: Perform appropriate actions according to the data-sharing
   // attribute (shared, private, firstprivate, ...) of variables.
   // Currently shared and private are supported.
@@ -1356,12 +1372,17 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
         opInst.emitError("failed to inline `alloc` region of an `omp.private` "
                          "op in the parallel region");
         bodyGenStatus = failure();
+        privatizerClone.erase();
       } else {
         assert(yieldedValues.size() == 1);
         replacementValue = yieldedValues.front();
+
+        // Keep the LLVM replacement value and the op clone in case we need to
+        // emit cleanup (i.e. deallocation) logic.
+        privateVariables.push_back(replacementValue);
+        privatizerClones.push_back(privatizerClone);
       }
 
-      privatizerClone.erase();
       builder.restoreIP(oldIP);
     }
 
@@ -1376,8 +1397,25 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
 
     // if the reduction has a cleanup region, inline it here to finalize the
     // reduction variables
-    if (failed(inlineReductionCleanup(reductionDecls, privateReductionVariables,
-                                      moduleTranslation, builder)))
+    SmallVector<Region *> reductionCleanupRegions;
+    llvm::transform(reductionDecls, std::back_inserter(reductionCleanupRegions),
+                    [](omp::DeclareReductionOp reductionDecl) {
+                      return &reductionDecl.getCleanupRegion();
+                    });
+    if (failed(inlineOmpRegionCleanup(
+            reductionCleanupRegions, privateReductionVariables,
+            moduleTranslation, builder, "omp.reduction.cleanup")))
+      bodyGenStatus = failure();
+
+    SmallVector<Region *> privateCleanupRegions;
+    llvm::transform(privatizerClones, std::back_inserter(privateCleanupRegions),
+                    [](omp::PrivateClauseOp privatizer) {
+                      return &privatizer.getDeallocRegion();
+                    });
+
+    if (failed(inlineOmpRegionCleanup(
+            privateCleanupRegions, privateVariables, moduleTranslation, builder,
+            "omp.private.dealloc", /*shouldLoadCleanupRegionArg=*/false)))
       bodyGenStatus = failure();
 
     builder.restoreIP(oldIP);
@@ -1403,6 +1441,9 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       ompBuilder->createParallel(ompLoc, allocaIP, bodyGenCB, privCB, finiCB,
                                  ifCond, numThreads, pbKind, isCancellable));
 
+  for (mlir::omp::PrivateClauseOp privatizerClone : privatizerClones)
+    privatizerClone.erase();
+
   return bodyGenStatus;
 }
 
@@ -1998,7 +2039,7 @@ llvm::Value *getSizeInBytes(DataLayout &dl, const mlir::Type &type,
         if (auto boundOp = mlir::dyn_cast_if_present<mlir::omp::MapBoundsOp>(
                 bounds.getDefiningOp())) {
           // The below calculation for the size to be mapped calculated from the
-          // map_info's bounds is: (elemCount * [UB - LB] + 1), later we
+          // map.info's bounds is: (elemCount * [UB - LB] + 1), later we
           // multiply by the underlying element types byte size to get the full
           // size to be offloaded based on the bounds
           elementCount = builder.CreateMul(
@@ -2050,9 +2091,9 @@ void collectMapDataFromMapOperands(MapInfoData &mapData,
 
       mapData.BaseType.push_back(
           moduleTranslation.convertType(mapOp.getVarType()));
-      mapData.Sizes.push_back(getSizeInBytes(
-          dl, mapOp.getVarType(), mapOp, mapData.BasePointers.back(),
-          mapData.BaseType.back(), builder, moduleTranslation));
+      mapData.Sizes.push_back(
+          getSizeInBytes(dl, mapOp.getVarType(), mapOp, mapData.Pointers.back(),
+                         mapData.BaseType.back(), builder, moduleTranslation));
       mapData.MapClause.push_back(mapOp.getOperation());
       mapData.Types.push_back(
           llvm::omp::OpenMPOffloadMappingFlags(mapOp.getMapType().value()));
@@ -2083,6 +2124,64 @@ void collectMapDataFromMapOperands(MapInfoData &mapData,
   }
 }
 
+static int getMapDataMemberIdx(MapInfoData &mapData,
+                               mlir::omp::MapInfoOp memberOp) {
+  auto *res = llvm::find(mapData.MapClause, memberOp);
+  assert(res != mapData.MapClause.end() &&
+         "MapInfoOp for member not found in MapData, cannot return index");
+  return std::distance(mapData.MapClause.begin(), res);
+}
+
+static mlir::omp::MapInfoOp
+getFirstOrLastMappedMemberPtr(mlir::omp::MapInfoOp mapInfo, bool first) {
+  mlir::DenseIntElementsAttr indexAttr = mapInfo.getMembersIndexAttr();
+
+  // Only 1 member has been mapped, we can return it.
+  if (indexAttr.size() == 1)
+    if (auto mapOp = mlir::dyn_cast<mlir::omp::MapInfoOp>(
+            mapInfo.getMembers()[0].getDefiningOp()))
+      return mapOp;
+
+  llvm::ArrayRef<int64_t> shape = indexAttr.getShapedType().getShape();
+  llvm::SmallVector<size_t> indices(shape[0]);
+  std::iota(indices.begin(), indices.end(), 0);
+
+  llvm::sort(
+      indices.begin(), indices.end(), [&](const size_t a, const size_t b) {
+        auto indexValues = indexAttr.getValues<int32_t>();
+        for (int i = 0;
+             i < shape[1];
+             ++i) {
+          int aIndex = indexValues[a * shape[1] + i];
+          int bIndex = indexValues[b * shape[1] + i];
+
+          if (aIndex == bIndex)
+            continue;
+
+          if (aIndex != -1 && bIndex == -1)
+            return false;
+
+          if (aIndex == -1 && bIndex != -1)
+            return true;
+
+          // A is earlier in the record type layout than B
+          if (aIndex < bIndex)
+            return first;
+
+          if (bIndex < aIndex)
+            return !first;
+        }
+
+        // Iterated the entire list and couldn't make a decision, all elements
+        // were likely the same. Return false, since the sort comparator should
+        // return false for equal elements.
+        return false;
+      });
+
+    return llvm::cast<mlir::omp::MapInfoOp>(
+          mapInfo.getMembers()[indices.front()].getDefiningOp());
+}
+
 /// This function calculates the array/pointer offset for map data provided
 /// with bounds operations, e.g. when provided something like the following:
 ///
@@ -2188,6 +2287,9 @@ calculateBoundsOffset(LLVM::ModuleTranslation &moduleTranslation,
 // which is utilised in subsequent member mappings (by modifying there map type
 // with it) to indicate that a member is part of this parent and should be
 // treated by the runtime as such. Important to achieve the correct mapping.
+//
+// This function borrows a lot from Clang's emitCombinedEntry function
+// inside of CGOpenMPRuntime.cpp
 static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers(
     LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase &builder,
     llvm::OpenMPIRBuilder &ompBuilder, DataLayout &dl,
@@ -2203,7 +2305,6 @@ static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers(
   combinedInfo.Names.emplace_back(LLVM::createMappingInformation(
       mapData.MapClause[mapDataIndex]->getLoc(), ompBuilder));
   combinedInfo.BasePointers.emplace_back(mapData.BasePointers[mapDataIndex]);
-  combinedInfo.Pointers.emplace_back(mapData.Pointers[mapDataIndex]);
 
   // Calculate size of the parent object being mapped based on the
   // addresses at runtime, highAddr - lowAddr = size. This of course
@@ -2212,42 +2313,68 @@ static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers(
   // Fortran pointers and allocatables, the mapping of the pointed to
   // data by the descriptor (which itself, is a structure containing
   // runtime information on the dynamically allocated data).
-  llvm::Value *lowAddr = builder.CreatePointerCast(
-      mapData.Pointers[mapDataIndex], builder.getPtrTy());
-  llvm::Value *highAddr = builder.CreatePointerCast(
-      builder.CreateConstGEP1_32(mapData.BaseType[mapDataIndex],
-                                 mapData.Pointers[mapDataIndex], 1),
-      builder.getPtrTy());
+  auto parentClause =
+      llvm::cast<mlir::omp::MapInfoOp>(mapData.MapClause[mapDataIndex]);
+
+  llvm::Value *lowAddr, *highAddr;
+  if (!parentClause.getPartialMap()) {
+    lowAddr = builder.CreatePointerCast(mapData.Pointers[mapDataIndex],
+                                        builder.getPtrTy());
+    highAddr = builder.CreatePointerCast(
+        builder.CreateConstGEP1_32(mapData.BaseType[mapDataIndex],
+                                   mapData.Pointers[mapDataIndex], 1),
+        builder.getPtrTy());
+    combinedInfo.Pointers.emplace_back(mapData.Pointers[mapDataIndex]);
+  } else {
+    auto mapOp =
+        mlir::dyn_cast<mlir::omp::MapInfoOp>(mapData.MapClause[mapDataIndex]);
+    int firstMemberIdx = getMapDataMemberIdx(
+        mapData, getFirstOrLastMappedMemberPtr(mapOp, true));
+    lowAddr = builder.CreatePointerCast(mapData.Pointers[firstMemberIdx],
+                                        builder.getPtrTy());
+    int lastMemberIdx = getMapDataMemberIdx(
+        mapData, getFirstOrLastMappedMemberPtr(mapOp, false));
+    highAddr = builder.CreatePointerCast(
+        builder.CreateGEP(mapData.BaseType[lastMemberIdx],
+                          mapData.Pointers[lastMemberIdx], builder.getInt64(1)),
+        builder.getPtrTy());
+    combinedInfo.Pointers.emplace_back(mapData.Pointers[firstMemberIdx]);
+  }
+
   llvm::Value *size = builder.CreateIntCast(
       builder.CreatePtrDiff(builder.getInt8Ty(), highAddr, lowAddr),
       builder.getInt64Ty(),
       /*isSigned=*/false);
   combinedInfo.Sizes.push_back(size);
 
-  // This creates the initial MEMBER_OF mapping that consists of
-  // the parent/top level container (same as above effectively, except
-  // with a fixed initial compile time size and seperate maptype which
-  // indicates the true mape type (tofrom etc.) and that it is a part
-  // of a larger mapping and indicating the link between it and it's
-  // members that are also explicitly mapped).
+  // TODO: This will need to be expanded to include the whole host of logic for
+  // the map flags that Clang currently supports (e.g. it should take the map
+  // flag of the parent map flag, remove the OMP_MAP_TARGET_PARAM and do some
+  // further case specific flag modifications). For the moment, it handles what
+  // we support as expected.
   llvm::omp::OpenMPOffloadMappingFlags mapFlag =
       llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO;
-  if (isTargetParams)
-    mapFlag &= ~llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM;
 
   llvm::omp::OpenMPOffloadMappingFlags memberOfFlag =
       ompBuilder.getMemberOfFlag(combinedInfo.BasePointers.size() - 1);
   ompBuilder.setCorrectMemberOfFlag(mapFlag, memberOfFlag);
 
-  combinedInfo.Types.emplace_back(mapFlag);
-  combinedInfo.DevicePointers.emplace_back(
-      llvm::OpenMPIRBuilder::DeviceInfoTy::None);
-  combinedInfo.Names.emplace_back(LLVM::createMappingInformation(
-      mapData.MapClause[mapDataIndex]->getLoc(), ompBuilder));
-  combinedInfo.BasePointers.emplace_back(mapData.BasePointers[mapDataIndex]);
-  combinedInfo.Pointers.emplace_back(mapData.Pointers[mapDataIndex]);
-  combinedInfo.Sizes.emplace_back(mapData.Sizes[mapDataIndex]);
-
+  // This creates the initial MEMBER_OF mapping that consists of
+  // the parent/top level container (same as above effectively, except
+  // with a fixed initial compile time size and seperate maptype which
+  // indicates the true mape type (tofrom etc.). This parent mapping is
+  // only relevant if the structure in its totality is being mapped,
+  // otherwise the above suffices.
+  if (!parentClause.getPartialMap()) {
+    combinedInfo.Types.emplace_back(mapFlag);
+    combinedInfo.DevicePointers.emplace_back(
+        llvm::OpenMPIRBuilder::DeviceInfoTy::None);
+    combinedInfo.Names.emplace_back(LLVM::createMappingInformation(
+        mapData.MapClause[mapDataIndex]->getLoc(), ompBuilder));
+    combinedInfo.BasePointers.emplace_back(mapData.BasePointers[mapDataIndex]);
+    combinedInfo.Pointers.emplace_back(mapData.Pointers[mapDataIndex]);
+    combinedInfo.Sizes.emplace_back(mapData.Sizes[mapDataIndex]);
+  }
   return memberOfFlag;
 }
 
@@ -2280,21 +2407,17 @@ static void processMapMembersWithParent(
     uint64_t mapDataIndex, llvm::omp::OpenMPOffloadMappingFlags memberOfFlag) {
 
   auto parentClause =
-      mlir::dyn_cast<mlir::omp::MapInfoOp>(mapData.MapClause[mapDataIndex]);
+      llvm::cast<mlir::omp::MapInfoOp>(mapData.MapClause[mapDataIndex]);
 
   for (auto mappedMembers : parentClause.getMembers()) {
     auto memberClause =
-        mlir::dyn_cast<mlir::omp::MapInfoOp>(mappedMembers.getDefiningOp());
-    int memberDataIdx = -1;
-    for (size_t i = 0; i < mapData.MapClause.size(); ++i) {
-      if (mapData.MapClause[i] == memberClause)
-        memberDataIdx = i;
-    }
+        llvm::cast<mlir::omp::MapInfoOp>(mappedMembers.getDefiningOp());
+    int memberDataIdx = getMapDataMemberIdx(mapData, memberClause);
 
     assert(memberDataIdx >= 0 && "could not find mapped member of structure");
 
     // Same MemberOfFlag to indicate its link with parent and other members
-    // of, and we flag that it's part of a pointer and object coupling.
+    // of.
     auto mapFlag =
         llvm::omp::OpenMPOffloadMappingFlags(memberClause.getMapType().value());
     mapFlag &= ~llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM;
@@ -2308,18 +2431,81 @@ static void processMapMembersWithParent(
         llvm::OpenMPIRBuilder::DeviceInfoTy::None);
     combinedInfo.Names.emplace_back(
         LLVM::createMappingInformation(memberClause.getLoc(), ompBuilder));
-
-    combinedInfo.BasePointers.emplace_back(mapData.BasePointers[memberDataIdx]);
+    combinedInfo.BasePointers.emplace_back(mapData.BasePointers[mapDataIndex]);
     combinedInfo.Pointers.emplace_back(mapData.Pointers[memberDataIdx]);
     combinedInfo.Sizes.emplace_back(mapData.Sizes[memberDataIdx]);
   }
 }
 
+static void
+processIndividualMap(MapInfoData &mapData, size_t mapDataIdx,
+                     llvm::OpenMPIRBuilder::MapInfosTy &combinedInfo,
+                     bool isTargetParams, int mapDataParentIdx = -1) {
+  // Declare Target Mappings are excluded from being marked as
+  // OMP_MAP_TARGET_PARAM as they are not passed as parameters, they're
+  // marked with OMP_MAP_PTR_AND_OBJ instead.
+  auto mapFlag = mapData.Types[mapDataIdx];
+  auto mapInfoOp =
+      llvm::cast<mlir::omp::MapInfoOp>(mapData.MapClause[mapDataIdx]);
+
+  bool isPtrTy = checkIfPointerMap(mapInfoOp);
+  if (isPtrTy)
+    mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ;
+
+  if (isTargetParams && !mapData.IsDeclareTarget[mapDataIdx])
+    mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM;
+
+  if (mapInfoOp.getMapCaptureType().value() ==
+          mlir::omp::VariableCaptureKind::ByCopy &&
+      !isPtrTy)
+    mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_LITERAL;
+
+  // if we're provided a mapDataParentIdx, then the data being mapped is
+  // part of a larger object (in a parent <-> member mapping) and in this
+  // case our BasePointer should be the parent.
+  if (mapDataParentIdx >= 0)
+    combinedInfo.BasePointers.emplace_back(
+        mapData.BasePointers[mapDataParentIdx]);
+  else
+    combinedInfo.BasePointers.emplace_back(mapData.BasePointers[mapDataIdx]);
+
+  combinedInfo.Pointers.emplace_back(mapData.Pointers[mapDataIdx]);
+  combinedInfo.DevicePointers.emplace_back(mapData.DevicePointers[mapDataIdx]);
+  combinedInfo.Names.emplace_back(mapData.Names[mapDataIdx]);
+  combinedInfo.Types.emplace_back(mapFlag);
+  combinedInfo.Sizes.emplace_back(mapData.Sizes[mapDataIdx]);
+}
+
 static void processMapWithMembersOf(
     LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase &builder,
     llvm::OpenMPIRBuilder &ompBuilder, DataLayout &dl,
     llvm::OpenMPIRBuilder::MapInfosTy &combinedInfo, MapInfoData &mapData,
     uint64_t mapDataIndex, bool isTargetParams) {
+  auto parentClause =
+      llvm::cast<mlir::omp::MapInfoOp>(mapData.MapClause[mapDataIndex]);
+
+  // If we have a partial map (no parent referenced in the map clauses of the
+  // directive, only members) and only a single member, we do not need to bind
+  // the map of the member to the parent, we can pass the member seperately.
+  if (parentClause.getMembers().size() == 1 && parentClause.getPartialMap()) {
+    auto memberClause = llvm::cast<mlir::omp::MapInfoOp>(
+        parentClause.getMembers()[0].getDefiningOp());
+    int memberDataIdx = getMapDataMemberIdx(mapData, memberClause);
+    // Note: Clang treats arrays with explicit bounds that fall into this
+    // category as a parent with map case, however, it seems this isn't a
+    // requirement, and processing them as an individual map is fine. So,
+    // we will handle them as individual maps for the moment, as it's
+    // difficult for us to check this as we always require bounds to be
+    // specified currently and it's also marginally more optimal (single
+    // map rather than two). The difference may come from the fact that
+    // Clang maps array without bounds as pointers (which we do not
+    // currently do), whereas we treat them as arrays in all cases
+    // currently.
+    processIndividualMap(mapData, memberDataIdx, combinedInfo, isTargetParams,
+                         mapDataIndex);
+    return;
+  }
+
   llvm::omp::OpenMPOffloadMappingFlags memberOfParentFlag =
       mapParentWithMembers(moduleTranslation, builder, ompBuilder, dl,
                            combinedInfo, mapData, mapDataIndex, isTargetParams);
@@ -2438,12 +2624,8 @@ static void genMapInfos(llvm::IRBuilderBase &builder,
   // utilise the size from any component of MapInfoData, if we can't
   // something is missing from the initial MapInfoData construction.
   for (size_t i = 0; i < mapData.MapClause.size(); ++i) {
-    // NOTE/TODO: We currently do not handle member mapping seperately from it's
-    // parent or explicit mapping of a parent and member in the same operation,
-    // this will need to change in the near future, for now we primarily handle
-    // descriptor mapping from fortran, generalised as mapping record types
-    // with implicit member maps. This lowering needs further generalisation to
-    // fully support fortran derived types, and C/C++ structures and classes.
+    // NOTE/TODO: We currently do not support arbitrary depth record
+    // type mapping.
     if (mapData.IsAMember[i])
       continue;
 
@@ -2454,28 +2636,7 @@ static void genMapInfos(llvm::IRBuilderBase &builder,
       continue;
     }
 
-    auto mapFlag = mapData.Types[i];
-    bool isPtrTy = checkIfPointerMap(mapInfoOp);
-    if (isPtrTy)
-      mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ;
-
-    // Declare Target Mappings are excluded from being marked as
-    // OMP_MAP_TARGET_PARAM as they are not passed as parameters.
-    if (isTargetParams && !mapData.IsDeclareTarget[i])
-      mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM;
-
-    if (auto mapInfoOp = dyn_cast<mlir::omp::MapInfoOp>(mapData.MapClause[i]))
-      if (mapInfoOp.getMapCaptureType().value() ==
-              mlir::omp::VariableCaptureKind::ByCopy &&
-          !isPtrTy)
-        mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_LITERAL;
-
-    combinedInfo.BasePointers.emplace_back(mapData.BasePointers[i]);
-    combinedInfo.Pointers.emplace_back(mapData.Pointers[i]);
-    combinedInfo.DevicePointers.emplace_back(mapData.DevicePointers[i]);
-    combinedInfo.Names.emplace_back(mapData.Names[i]);
-    combinedInfo.Types.emplace_back(mapFlag);
-    combinedInfo.Sizes.emplace_back(mapData.Sizes[i]);
+    processIndividualMap(mapData, i, combinedInfo, isTargetParams);
   }
 
   auto findMapInfo = [&combinedInfo](llvm::Value *val, unsigned &index) {
diff --git a/mlir/lib/Tools/lsp-server-support/Protocol.cpp b/mlir/lib/Tools/lsp-server-support/Protocol.cpp
index e110fdd97a38..188f5253c95c 100644
--- a/mlir/lib/Tools/lsp-server-support/Protocol.cpp
+++ b/mlir/lib/Tools/lsp-server-support/Protocol.cpp
@@ -646,6 +646,20 @@ llvm::json::Value mlir::lsp::toJSON(const DiagnosticRelatedInformation &info) {
 // Diagnostic
 //===----------------------------------------------------------------------===//
 
+llvm::json::Value mlir::lsp::toJSON(DiagnosticTag tag) {
+  return static_cast<int>(tag);
+}
+
+bool mlir::lsp::fromJSON(const llvm::json::Value &value, DiagnosticTag &result,
+                         llvm::json::Path path) {
+  if (std::optional<int64_t> i = value.getAsInteger()) {
+    result = (DiagnosticTag)*i;
+    return true;
+  }
+
+  return false;
+}
+
 llvm::json::Value mlir::lsp::toJSON(const Diagnostic &diag) {
   llvm::json::Object result{
       {"range", diag.range},
@@ -658,6 +672,8 @@ llvm::json::Value mlir::lsp::toJSON(const Diagnostic &diag) {
     result["source"] = diag.source;
   if (diag.relatedInformation)
     result["relatedInformation"] = *diag.relatedInformation;
+  if (!diag.tags.empty())
+    result["tags"] = diag.tags;
   return std::move(result);
 }
 
@@ -675,7 +691,8 @@ bool mlir::lsp::fromJSON(const llvm::json::Value &value, Diagnostic &result,
          mapOptOrNull(value, "category", result.category, path) &&
          mapOptOrNull(value, "source", result.source, path) &&
          mapOptOrNull(value, "relatedInformation", result.relatedInformation,
-                      path);
+                      path) &&
+         mapOptOrNull(value, "tags", result.tags, path);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Transforms/Mem2Reg.cpp b/mlir/lib/Transforms/Mem2Reg.cpp
index 71ba5bc076f0..e096747741c0 100644
--- a/mlir/lib/Transforms/Mem2Reg.cpp
+++ b/mlir/lib/Transforms/Mem2Reg.cpp
@@ -18,7 +18,6 @@
 #include "mlir/Transforms/Passes.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/Casting.h"
 #include "llvm/Support/GenericIteratedDominanceFrontier.h"
 
 namespace mlir {
@@ -158,20 +157,25 @@ private:
   const DataLayout &dataLayout;
 };
 
+using BlockIndexCache = DenseMap<Region *, DenseMap<Block *, size_t>>;
+
 /// The MemorySlotPromoter handles the state of promoting a memory slot. It
 /// wraps a slot and its associated allocator. This will perform the mutation of
 /// IR.
 class MemorySlotPromoter {
 public:
   MemorySlotPromoter(MemorySlot slot, PromotableAllocationOpInterface allocator,
-                     RewriterBase &rewriter, DominanceInfo &dominance,
+                     OpBuilder &builder, DominanceInfo &dominance,
                      const DataLayout &dataLayout, MemorySlotPromotionInfo info,
-                     const Mem2RegStatistics &statistics);
+                     const Mem2RegStatistics &statistics,
+                     BlockIndexCache &blockIndexCache);
 
   /// Actually promotes the slot by mutating IR. Promoting a slot DOES
   /// invalidate the MemorySlotPromotionInfo of other slots. Preparation of
   /// promotion info should NOT be performed in batches.
-  void promoteSlot();
+  /// Returns a promotable allocation op if a new allocator was created, nullopt
+  /// otherwise.
+  std::optional<PromotableAllocationOpInterface> promoteSlot();
 
 private:
   /// Computes the reaching definition for all the operations that require
@@ -195,7 +199,7 @@ private:
 
   MemorySlot slot;
   PromotableAllocationOpInterface allocator;
-  RewriterBase &rewriter;
+  OpBuilder &builder;
   /// Potentially non-initialized default value. Use `getOrCreateDefaultValue`
   /// to initialize it on demand.
   Value defaultValue;
@@ -207,18 +211,21 @@ private:
   const DataLayout &dataLayout;
   MemorySlotPromotionInfo info;
   const Mem2RegStatistics &statistics;
+
+  /// Shared cache of block indices of specific regions.
+  BlockIndexCache &blockIndexCache;
 };
 
 } // namespace
 
 MemorySlotPromoter::MemorySlotPromoter(
     MemorySlot slot, PromotableAllocationOpInterface allocator,
-    RewriterBase &rewriter, DominanceInfo &dominance,
-    const DataLayout &dataLayout, MemorySlotPromotionInfo info,
-    const Mem2RegStatistics &statistics)
-    : slot(slot), allocator(allocator), rewriter(rewriter),
-      dominance(dominance), dataLayout(dataLayout), info(std::move(info)),
-      statistics(statistics) {
+    OpBuilder &builder, DominanceInfo &dominance, const DataLayout &dataLayout,
+    MemorySlotPromotionInfo info, const Mem2RegStatistics &statistics,
+    BlockIndexCache &blockIndexCache)
+    : slot(slot), allocator(allocator), builder(builder), dominance(dominance),
+      dataLayout(dataLayout), info(std::move(info)), statistics(statistics),
+      blockIndexCache(blockIndexCache) {
 #ifndef NDEBUG
   auto isResultOrNewBlockArgument = [&]() {
     if (BlockArgument arg = dyn_cast<BlockArgument>(slot.ptr))
@@ -236,9 +243,9 @@ Value MemorySlotPromoter::getOrCreateDefaultValue() {
   if (defaultValue)
     return defaultValue;
 
-  RewriterBase::InsertionGuard guard(rewriter);
-  rewriter.setInsertionPointToStart(slot.ptr.getParentBlock());
-  return defaultValue = allocator.getDefaultValue(slot, rewriter);
+  OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointToStart(slot.ptr.getParentBlock());
+  return defaultValue = allocator.getDefaultValue(slot, builder);
 }
 
 LogicalResult MemorySlotPromotionAnalyzer::computeBlockingUses(
@@ -437,8 +444,8 @@ Value MemorySlotPromoter::computeReachingDefInBlock(Block *block,
         reachingDefs.insert({memOp, reachingDef});
 
       if (memOp.storesTo(slot)) {
-        rewriter.setInsertionPointAfter(memOp);
-        Value stored = memOp.getStored(slot, rewriter, reachingDef, dataLayout);
+        builder.setInsertionPointAfter(memOp);
+        Value stored = memOp.getStored(slot, builder, reachingDef, dataLayout);
         assert(stored && "a memory operation storing to a slot must provide a "
                          "new definition of the slot");
         reachingDef = stored;
@@ -475,33 +482,10 @@ void MemorySlotPromoter::computeReachingDefInRegion(Region *region,
     Block *block = job.block->getBlock();
 
     if (info.mergePoints.contains(block)) {
-      // If the block is a merge point, we need to add a block argument to hold
-      // the selected reaching definition. This has to be a bit complicated
-      // because of RewriterBase limitations: we need to create a new block with
-      // the extra block argument, move the content of the block to the new
-      // block, and replace the block with the new block in the merge point set.
-      SmallVector<Type> argTypes;
-      SmallVector<Location> argLocs;
-      for (BlockArgument arg : block->getArguments()) {
-        argTypes.push_back(arg.getType());
-        argLocs.push_back(arg.getLoc());
-      }
-      argTypes.push_back(slot.elemType);
-      argLocs.push_back(slot.ptr.getLoc());
-      Block *newBlock = rewriter.createBlock(block, argTypes, argLocs);
-
-      info.mergePoints.erase(block);
-      info.mergePoints.insert(newBlock);
-
-      rewriter.replaceAllUsesWith(block, newBlock);
-      rewriter.mergeBlocks(block, newBlock,
-                           newBlock->getArguments().drop_back());
-
-      block = newBlock;
-
-      BlockArgument blockArgument = block->getArguments().back();
-      rewriter.setInsertionPointToStart(block);
-      allocator.handleBlockArgument(slot, blockArgument, rewriter);
+      BlockArgument blockArgument =
+          block->addArgument(slot.elemType, slot.ptr.getLoc());
+      builder.setInsertionPointToStart(block);
+      allocator.handleBlockArgument(slot, blockArgument, builder);
       job.reachingDef = blockArgument;
 
       if (statistics.newBlockArgumentAmount)
@@ -514,10 +498,8 @@ void MemorySlotPromoter::computeReachingDefInRegion(Region *region,
     if (auto terminator = dyn_cast<BranchOpInterface>(block->getTerminator())) {
       for (BlockOperand &blockOperand : terminator->getBlockOperands()) {
         if (info.mergePoints.contains(blockOperand.get())) {
-          rewriter.modifyOpInPlace(terminator, [&]() {
-            terminator.getSuccessorOperands(blockOperand.getOperandNumber())
-                .append(job.reachingDef);
-          });
+          terminator.getSuccessorOperands(blockOperand.getOperandNumber())
+              .append(job.reachingDef);
         }
       }
     }
@@ -527,15 +509,29 @@ void MemorySlotPromoter::computeReachingDefInRegion(Region *region,
   }
 }
 
+/// Gets or creates a block index mapping for `region`.
+static const DenseMap<Block *, size_t> &
+getOrCreateBlockIndices(BlockIndexCache &blockIndexCache, Region *region) {
+  auto [it, inserted] = blockIndexCache.try_emplace(region);
+  if (!inserted)
+    return it->second;
+
+  DenseMap<Block *, size_t> &blockIndices = it->second;
+  SetVector<Block *> topologicalOrder = getTopologicallySortedBlocks(*region);
+  for (auto [index, block] : llvm::enumerate(topologicalOrder))
+    blockIndices[block] = index;
+  return blockIndices;
+}
+
 /// Sorts `ops` according to dominance. Relies on the topological order of basic
-/// blocks to get a deterministic ordering.
-static void dominanceSort(SmallVector<Operation *> &ops, Region &region) {
+/// blocks to get a deterministic ordering. Uses `blockIndexCache` to avoid the
+/// potentially expensive recomputation of a block index map.
+static void dominanceSort(SmallVector<Operation *> &ops, Region &region,
+                          BlockIndexCache &blockIndexCache) {
   // Produce a topological block order and construct a map to lookup the indices
   // of blocks.
-  DenseMap<Block *, size_t> topoBlockIndices;
-  SetVector<Block *> topologicalOrder = getTopologicallySortedBlocks(region);
-  for (auto [index, block] : llvm::enumerate(topologicalOrder))
-    topoBlockIndices[block] = index;
+  const DenseMap<Block *, size_t> &topoBlockIndices =
+      getOrCreateBlockIndices(blockIndexCache, &region);
 
   // Combining the topological order of the basic blocks together with block
   // internal operation order guarantees a deterministic, dominance respecting
@@ -554,7 +550,8 @@ void MemorySlotPromoter::removeBlockingUses() {
       llvm::make_first_range(info.userToBlockingUses));
 
   // Sort according to dominance.
-  dominanceSort(usersToRemoveUses, *slot.ptr.getParentBlock()->getParent());
+  dominanceSort(usersToRemoveUses, *slot.ptr.getParentBlock()->getParent(),
+                blockIndexCache);
 
   llvm::SmallVector<Operation *> toErase;
   // List of all replaced values in the slot.
@@ -569,9 +566,9 @@ void MemorySlotPromoter::removeBlockingUses() {
       if (!reachingDef)
         reachingDef = getOrCreateDefaultValue();
 
-      rewriter.setInsertionPointAfter(toPromote);
+      builder.setInsertionPointAfter(toPromote);
       if (toPromoteMemOp.removeBlockingUses(
-              slot, info.userToBlockingUses[toPromote], rewriter, reachingDef,
+              slot, info.userToBlockingUses[toPromote], builder, reachingDef,
               dataLayout) == DeletionKind::Delete)
         toErase.push_back(toPromote);
       if (toPromoteMemOp.storesTo(slot))
@@ -581,26 +578,27 @@ void MemorySlotPromoter::removeBlockingUses() {
     }
 
     auto toPromoteBasic = cast<PromotableOpInterface>(toPromote);
-    rewriter.setInsertionPointAfter(toPromote);
+    builder.setInsertionPointAfter(toPromote);
     if (toPromoteBasic.removeBlockingUses(info.userToBlockingUses[toPromote],
-                                          rewriter) == DeletionKind::Delete)
+                                          builder) == DeletionKind::Delete)
       toErase.push_back(toPromote);
     if (toPromoteBasic.requiresReplacedValues())
       toVisit.push_back(toPromoteBasic);
   }
   for (PromotableOpInterface op : toVisit) {
-    rewriter.setInsertionPointAfter(op);
-    op.visitReplacedValues(replacedValuesList, rewriter);
+    builder.setInsertionPointAfter(op);
+    op.visitReplacedValues(replacedValuesList, builder);
   }
 
   for (Operation *toEraseOp : toErase)
-    rewriter.eraseOp(toEraseOp);
+    toEraseOp->erase();
 
   assert(slot.ptr.use_empty() &&
          "after promotion, the slot pointer should not be used anymore");
 }
 
-void MemorySlotPromoter::promoteSlot() {
+std::optional<PromotableAllocationOpInterface>
+MemorySlotPromoter::promoteSlot() {
   computeReachingDefInRegion(slot.ptr.getParentRegion(),
                              getOrCreateDefaultValue());
 
@@ -617,8 +615,7 @@ void MemorySlotPromoter::promoteSlot() {
       assert(succOperands.size() == mergePoint->getNumArguments() ||
              succOperands.size() + 1 == mergePoint->getNumArguments());
       if (succOperands.size() + 1 == mergePoint->getNumArguments())
-        rewriter.modifyOpInPlace(
-            user, [&]() { succOperands.append(getOrCreateDefaultValue()); });
+        succOperands.append(getOrCreateDefaultValue());
     }
   }
 
@@ -628,30 +625,64 @@ void MemorySlotPromoter::promoteSlot() {
   if (statistics.promotedAmount)
     (*statistics.promotedAmount)++;
 
-  allocator.handlePromotionComplete(slot, defaultValue, rewriter);
+  return allocator.handlePromotionComplete(slot, defaultValue, builder);
 }
 
 LogicalResult mlir::tryToPromoteMemorySlots(
-    ArrayRef<PromotableAllocationOpInterface> allocators,
-    RewriterBase &rewriter, const DataLayout &dataLayout,
+    ArrayRef<PromotableAllocationOpInterface> allocators, OpBuilder &builder,
+    const DataLayout &dataLayout, DominanceInfo &dominance,
     Mem2RegStatistics statistics) {
   bool promotedAny = false;
 
-  for (PromotableAllocationOpInterface allocator : allocators) {
-    for (MemorySlot slot : allocator.getPromotableSlots()) {
-      if (slot.ptr.use_empty())
-        continue;
-
-      DominanceInfo dominance;
-      MemorySlotPromotionAnalyzer analyzer(slot, dominance, dataLayout);
-      std::optional<MemorySlotPromotionInfo> info = analyzer.computeInfo();
-      if (info) {
-        MemorySlotPromoter(slot, allocator, rewriter, dominance, dataLayout,
-                           std::move(*info), statistics)
-            .promoteSlot();
-        promotedAny = true;
+  // A cache that stores deterministic block indices which are used to determine
+  // a valid operation modification order. The block index maps are computed
+  // lazily and cached to avoid expensive recomputation.
+  BlockIndexCache blockIndexCache;
+
+  SmallVector<PromotableAllocationOpInterface> workList(allocators.begin(),
+                                                        allocators.end());
+
+  SmallVector<PromotableAllocationOpInterface> newWorkList;
+  newWorkList.reserve(workList.size());
+  while (true) {
+    bool changesInThisRound = false;
+    for (PromotableAllocationOpInterface allocator : workList) {
+      bool changedAllocator = false;
+      for (MemorySlot slot : allocator.getPromotableSlots()) {
+        if (slot.ptr.use_empty())
+          continue;
+
+        MemorySlotPromotionAnalyzer analyzer(slot, dominance, dataLayout);
+        std::optional<MemorySlotPromotionInfo> info = analyzer.computeInfo();
+        if (info) {
+          std::optional<PromotableAllocationOpInterface> newAllocator =
+              MemorySlotPromoter(slot, allocator, builder, dominance,
+                                 dataLayout, std::move(*info), statistics,
+                                 blockIndexCache)
+                  .promoteSlot();
+          changedAllocator = true;
+          // Add newly created allocators to the worklist for further
+          // processing.
+          if (newAllocator)
+            newWorkList.push_back(*newAllocator);
+
+          // A break is required, since promoting a slot may invalidate the
+          // remaining slots of an allocator.
+          break;
+        }
       }
+      if (!changedAllocator)
+        newWorkList.push_back(allocator);
+      changesInThisRound |= changedAllocator;
     }
+    if (!changesInThisRound)
+      break;
+    promotedAny = true;
+
+    // Swap the vector's backing memory and clear the entries in newWorkList
+    // afterwards. This ensures that additional heap allocations can be avoided.
+    workList.swap(newWorkList);
+    newWorkList.clear();
   }
 
   return success(promotedAny);
@@ -669,33 +700,26 @@ struct Mem2Reg : impl::Mem2RegBase<Mem2Reg> {
 
     bool changed = false;
 
+    auto &dataLayoutAnalysis = getAnalysis<DataLayoutAnalysis>();
+    const DataLayout &dataLayout = dataLayoutAnalysis.getAtOrAbove(scopeOp);
+    auto &dominance = getAnalysis<DominanceInfo>();
+
     for (Region &region : scopeOp->getRegions()) {
       if (region.getBlocks().empty())
         continue;
 
       OpBuilder builder(&region.front(), region.front().begin());
-      IRRewriter rewriter(builder);
-
-      // Promoting a slot can allow for further promotion of other slots,
-      // promotion is tried until no promotion succeeds.
-      while (true) {
-        SmallVector<PromotableAllocationOpInterface> allocators;
-        // Build a list of allocators to attempt to promote the slots of.
-        region.walk([&](PromotableAllocationOpInterface allocator) {
-          allocators.emplace_back(allocator);
-        });
-
-        auto &dataLayoutAnalysis = getAnalysis<DataLayoutAnalysis>();
-        const DataLayout &dataLayout = dataLayoutAnalysis.getAtOrAbove(scopeOp);
-
-        // Attempt promoting until no promotion succeeds.
-        if (failed(tryToPromoteMemorySlots(allocators, rewriter, dataLayout,
-                                           statistics)))
-          break;
 
+      SmallVector<PromotableAllocationOpInterface> allocators;
+      // Build a list of allocators to attempt to promote the slots of.
+      region.walk([&](PromotableAllocationOpInterface allocator) {
+        allocators.emplace_back(allocator);
+      });
+
+      // Attempt promoting as many of the slots as possible.
+      if (succeeded(tryToPromoteMemorySlots(allocators, builder, dataLayout,
+                                            dominance, statistics)))
         changed = true;
-        getAnalysisManager().invalidate({});
-      }
     }
     if (!changed)
       markAllAnalysesPreserved();
diff --git a/mlir/lib/Transforms/SROA.cpp b/mlir/lib/Transforms/SROA.cpp
index f24cbb7b1725..4e28fa687ffd 100644
--- a/mlir/lib/Transforms/SROA.cpp
+++ b/mlir/lib/Transforms/SROA.cpp
@@ -134,15 +134,14 @@ computeDestructuringInfo(DestructurableMemorySlot &slot,
 /// subslots as specified by its allocator.
 static void destructureSlot(DestructurableMemorySlot &slot,
                             DestructurableAllocationOpInterface allocator,
-                            RewriterBase &rewriter,
-                            const DataLayout &dataLayout,
+                            OpBuilder &builder, const DataLayout &dataLayout,
                             MemorySlotDestructuringInfo &info,
                             const SROAStatistics &statistics) {
-  RewriterBase::InsertionGuard guard(rewriter);
+  OpBuilder::InsertionGuard guard(builder);
 
-  rewriter.setInsertionPointToStart(slot.ptr.getParentBlock());
+  builder.setInsertionPointToStart(slot.ptr.getParentBlock());
   DenseMap<Attribute, MemorySlot> subslots =
-      allocator.destructure(slot, info.usedIndices, rewriter);
+      allocator.destructure(slot, info.usedIndices, builder);
 
   if (statistics.slotsWithMemoryBenefit &&
       slot.elementPtrs.size() != info.usedIndices.size())
@@ -160,9 +159,9 @@ static void destructureSlot(DestructurableMemorySlot &slot,
 
   llvm::SmallVector<Operation *> toErase;
   for (Operation *toRewire : llvm::reverse(usersToRewire)) {
-    rewriter.setInsertionPointAfter(toRewire);
+    builder.setInsertionPointAfter(toRewire);
     if (auto accessor = dyn_cast<DestructurableAccessorOpInterface>(toRewire)) {
-      if (accessor.rewire(slot, subslots, rewriter, dataLayout) ==
+      if (accessor.rewire(slot, subslots, builder, dataLayout) ==
           DeletionKind::Delete)
         toErase.push_back(accessor);
       continue;
@@ -170,12 +169,12 @@ static void destructureSlot(DestructurableMemorySlot &slot,
 
     auto promotable = cast<PromotableOpInterface>(toRewire);
     if (promotable.removeBlockingUses(info.userToBlockingUses[promotable],
-                                      rewriter) == DeletionKind::Delete)
+                                      builder) == DeletionKind::Delete)
       toErase.push_back(promotable);
   }
 
   for (Operation *toEraseOp : toErase)
-    rewriter.eraseOp(toEraseOp);
+    toEraseOp->erase();
 
   assert(slot.ptr.use_empty() && "after destructuring, the original slot "
                                  "pointer should no longer be used");
@@ -186,12 +185,12 @@ static void destructureSlot(DestructurableMemorySlot &slot,
   if (statistics.destructuredAmount)
     (*statistics.destructuredAmount)++;
 
-  allocator.handleDestructuringComplete(slot, rewriter);
+  allocator.handleDestructuringComplete(slot, builder);
 }
 
 LogicalResult mlir::tryToDestructureMemorySlots(
     ArrayRef<DestructurableAllocationOpInterface> allocators,
-    RewriterBase &rewriter, const DataLayout &dataLayout,
+    OpBuilder &builder, const DataLayout &dataLayout,
     SROAStatistics statistics) {
   bool destructuredAny = false;
 
@@ -202,7 +201,7 @@ LogicalResult mlir::tryToDestructureMemorySlots(
       if (!info)
         continue;
 
-      destructureSlot(slot, allocator, rewriter, dataLayout, *info, statistics);
+      destructureSlot(slot, allocator, builder, dataLayout, *info, statistics);
       destructuredAny = true;
     }
   }
@@ -230,7 +229,6 @@ struct SROA : public impl::SROABase<SROA> {
         continue;
 
       OpBuilder builder(&region.front(), region.front().begin());
-      IRRewriter rewriter(builder);
 
       // Destructuring a slot can allow for further destructuring of other
       // slots, destructuring is tried until no destructuring succeeds.
@@ -243,7 +241,7 @@ struct SROA : public impl::SROABase<SROA> {
           allocators.emplace_back(allocator);
         });
 
-        if (failed(tryToDestructureMemorySlots(allocators, rewriter, dataLayout,
+        if (failed(tryToDestructureMemorySlots(allocators, builder, dataLayout,
                                                statistics)))
           break;
 
diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt
index a6c78880c8e7..d8f2d1989fde 100644
--- a/mlir/python/CMakeLists.txt
+++ b/mlir/python/CMakeLists.txt
@@ -538,7 +538,7 @@ declare_mlir_python_extension(MLIRPythonExtension.Dialects.Quant.Pybind
 )
 
 declare_mlir_python_extension(MLIRPythonExtension.Dialects.NVGPU.Pybind
-  MODULE_NAME _mlirDialectsNvgpu
+  MODULE_NAME _mlirDialectsNVGPU
   ADD_TO_PARENT MLIRPythonSources.Dialects.nvgpu
   ROOT_DIR "${PYTHON_SOURCE_DIR}"
   SOURCES
diff --git a/mlir/python/mlir/dialects/nvgpu.py b/mlir/python/mlir/dialects/nvgpu.py
index e19bf610ea33..d6a54f2772f4 100644
--- a/mlir/python/mlir/dialects/nvgpu.py
+++ b/mlir/python/mlir/dialects/nvgpu.py
@@ -4,4 +4,4 @@
 
 from ._nvgpu_ops_gen import *
 from ._nvgpu_enum_gen import *
-from .._mlir_libs._mlirDialectsNvgpu import *
+from .._mlir_libs._mlirDialectsNVGPU import *
diff --git a/mlir/python/mlir/dialects/transform/structured.py b/mlir/python/mlir/dialects/transform/structured.py
index d7b41c0bd220..2c49ef0960c7 100644
--- a/mlir/python/mlir/dialects/transform/structured.py
+++ b/mlir/python/mlir/dialects/transform/structured.py
@@ -374,9 +374,9 @@ class PadOp(PadOp):
         self,
         target: Union[Operation, OpView, Value],
         *,
+        pad_to_multiple_of: Optional[Union[DynamicIndexList, ArrayAttr]] = None,
         padding_values: Optional[Union[ArrayAttr, Sequence[Attribute]]] = None,
         padding_dimensions: OptionalIntList = None,
-        pad_to_multiple_of: OptionalIntList = None,
         pack_paddings: OptionalIntList = None,
         transpose_paddings: Optional[
             Union[ArrayAttr, Sequence[Union[ArrayAttr, IntOrAttrList]]]
@@ -385,6 +385,16 @@ class PadOp(PadOp):
         loc=None,
         ip=None,
     ):
+        if pad_to_multiple_of is None:
+            dynamic_pad_to_multiple_of = []
+            static_pad_to_multiple_of = None
+        else:
+            (
+                dynamic_pad_to_multiple_of,
+                static_pad_to_multiple_of,
+                _,
+            ) = _dispatch_dynamic_index_list(pad_to_multiple_of)
+
         transpose_paddings = _get_int_array_array_attr(transpose_paddings)
 
         any_op_type = transform.AnyOpType.get()
@@ -393,9 +403,10 @@ class PadOp(PadOp):
             any_op_type,
             any_op_type,
             target,
+            pad_to_multiple_of=dynamic_pad_to_multiple_of,
             padding_values=padding_values,
             padding_dimensions=padding_dimensions,
-            pad_to_multiple_of=pad_to_multiple_of,
+            static_pad_to_multiple_of=static_pad_to_multiple_of,
             pack_paddings=pack_paddings,
             transpose_paddings=transpose_paddings,
             copy_back_op=copy_back_op,
diff --git a/mlir/python/mlir/ir.py b/mlir/python/mlir/ir.py
index eb7f035fec7c..80c965b2d0eb 100644
--- a/mlir/python/mlir/ir.py
+++ b/mlir/python/mlir/ir.py
@@ -274,7 +274,7 @@ try:
     @register_attribute_builder("F64ElementsAttr")
     def _f64ElementsAttr(x, context):
         return DenseElementsAttr.get(
-            np.array(x, dtype=np.int64),
+            np.array(x, dtype=np.float64),
             type=F64Type.get(context=context),
             context=context,
         )
diff --git a/mlir/test/CAPI/CMakeLists.txt b/mlir/test/CAPI/CMakeLists.txt
index b9cd63ef7c67..57b342a5e26b 100644
--- a/mlir/test/CAPI/CMakeLists.txt
+++ b/mlir/test/CAPI/CMakeLists.txt
@@ -38,6 +38,13 @@ _add_capi_test_executable(mlir-capi-ir-test
     MLIRCAPIRegisterEverything
 )
 
+_add_capi_test_executable(mlir-capi-irdl-test
+  irdl.c
+  LINK_LIBS PRIVATE
+    MLIRCAPIIR
+    MLIRCAPIIRDL
+)
+
 _add_capi_test_executable(mlir-capi-llvm-test
   llvm.c
   LINK_LIBS PRIVATE
diff --git a/mlir/test/CAPI/irdl.c b/mlir/test/CAPI/irdl.c
new file mode 100644
index 000000000000..b35345b664b1
--- /dev/null
+++ b/mlir/test/CAPI/irdl.c
@@ -0,0 +1,58 @@
+//===- irdl.c - Test for the C bindings for IRDL registration -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+/* RUN: mlir-capi-irdl-test 2>&1 | FileCheck %s
+ */
+
+#include "mlir-c/Dialect/IRDL.h"
+#include "mlir-c/IR.h"
+
+const char irdlDialect[] = "\
+  irdl.dialect @foo {\
+    irdl.operation @op {\
+      %i32 = irdl.is i32\
+      irdl.results(%i32)\
+    }\
+  }\
+  irdl.dialect @bar {\
+    irdl.operation @op {\
+      %i32 = irdl.is i32\
+      irdl.operands(%i32)\
+    }\
+  }";
+
+// CHECK:      module {
+// CHECK-NEXT:   %[[RES:.*]] = "foo.op"() : () -> i32
+// CHECK-NEXT:   "bar.op"(%[[RES]]) :  (i32) -> ()
+// CHECK-NEXT: }
+const char newDialectUsage[] = "\
+  module {\
+    %res = \"foo.op\"() : () -> i32\
+    \"bar.op\"(%res) : (i32) -> ()\
+  }";
+
+int main(void) {
+  MlirContext ctx = mlirContextCreate();
+  mlirDialectHandleLoadDialect(mlirGetDialectHandle__irdl__(), ctx);
+
+  MlirModule dialectDecl =
+      mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(irdlDialect));
+
+  mlirLoadIRDLDialects(dialectDecl);
+  mlirModuleDestroy(dialectDecl);
+
+  MlirModule usingModule = mlirModuleCreateParse(
+      ctx, mlirStringRefCreateFromCString(newDialectUsage));
+
+  mlirOperationDump(mlirModuleGetOperation(usingModule));
+
+  mlirModuleDestroy(usingModule);
+  mlirContextDestroy(ctx);
+  return 0;
+}
diff --git a/mlir/test/CAPI/transform_interpreter.c b/mlir/test/CAPI/transform_interpreter.c
index f1ab185e0e21..a849b2f24526 100644
--- a/mlir/test/CAPI/transform_interpreter.c
+++ b/mlir/test/CAPI/transform_interpreter.c
@@ -18,7 +18,7 @@
 #include <stdlib.h>
 
 int testApplyNamedSequence(MlirContext ctx) {
-  fprintf(stderr, "%s\n", __FUNCTION__);
+  fprintf(stderr, "%s\n", __func__);
 
   const char module[] =
       "module attributes {transform.with_named_sequence} {"
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 5319a9cb33e0..8806a1dd9223 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -101,6 +101,7 @@ configure_lit_site_cfg(
 set(MLIR_TEST_DEPENDS
   FileCheck count not split-file
   mlir-capi-ir-test
+  mlir-capi-irdl-test
   mlir-capi-llvm-test
   mlir-capi-pass-test
   mlir-capi-quant-test
diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir
new file mode 100644
index 000000000000..66dfa8fa3e15
--- /dev/null
+++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir
@@ -0,0 +1,65 @@
+// RUN: mlir-opt -split-input-file -convert-arith-to-emitc -verify-diagnostics %s
+
+func.func @arith_cast_tensor(%arg0: tensor<5xf32>) -> tensor<5xi32> {
+  // expected-error @+1 {{failed to legalize operation 'arith.fptosi'}}
+  %t = arith.fptosi %arg0 : tensor<5xf32> to tensor<5xi32>
+  return %t: tensor<5xi32>
+}
+
+// -----
+
+func.func @arith_cast_vector(%arg0: vector<5xf32>) -> vector<5xi32> {
+  // expected-error @+1 {{failed to legalize operation 'arith.fptosi'}}
+  %t = arith.fptosi %arg0 : vector<5xf32> to vector<5xi32>
+  return %t: vector<5xi32>
+}
+
+// -----
+
+func.func @arith_cast_bf16(%arg0: bf16) -> i32 {
+  // expected-error @+1 {{failed to legalize operation 'arith.fptosi'}}
+  %t = arith.fptosi %arg0 : bf16 to i32
+  return %t: i32
+}
+
+// -----
+
+func.func @arith_cast_f16(%arg0: f16) -> i32 {
+  // expected-error @+1 {{failed to legalize operation 'arith.fptosi'}}
+  %t = arith.fptosi %arg0 : f16 to i32
+  return %t: i32
+}
+
+
+// -----
+
+func.func @arith_cast_to_bf16(%arg0: i32) -> bf16 {
+  // expected-error @+1 {{failed to legalize operation 'arith.sitofp'}}
+  %t = arith.sitofp %arg0 : i32 to bf16
+  return %t: bf16
+}
+
+// -----
+
+func.func @arith_cast_to_f16(%arg0: i32) -> f16 {
+  // expected-error @+1 {{failed to legalize operation 'arith.sitofp'}}
+  %t = arith.sitofp %arg0 : i32 to f16
+  return %t: f16
+}
+
+// -----
+
+func.func @arith_cast_fptosi_i1(%arg0: f32) -> i1 {
+  // expected-error @+1 {{failed to legalize operation 'arith.fptosi'}}
+  %t = arith.fptosi %arg0 : f32 to i1
+  return %t: i1
+}
+
+// -----
+
+func.func @arith_cast_fptoui_i1(%arg0: f32) -> i1 {
+  // expected-error @+1 {{failed to legalize operation 'arith.fptoui'}}
+  %t = arith.fptoui %arg0 : f32 to i1
+  return %t: i1
+}
+
diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
index 46b407177b46..79fecd61494d 100644
--- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
+++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
@@ -141,3 +141,39 @@ func.func @arith_cmpi_predicates(%arg0: i32, %arg1: i32) {
   
   return
 }
+
+// -----
+
+func.func @arith_float_to_int_cast_ops(%arg0: f32, %arg1: f64) {
+  // CHECK: emitc.cast %arg0 : f32 to i32
+  %0 = arith.fptosi %arg0 : f32 to i32
+
+  // CHECK: emitc.cast %arg1 : f64 to i32
+  %1 = arith.fptosi %arg1 : f64 to i32
+
+  // CHECK: emitc.cast %arg0 : f32 to i16
+  %2 = arith.fptosi %arg0 : f32 to i16
+
+  // CHECK: emitc.cast %arg1 : f64 to i16
+  %3 = arith.fptosi %arg1 : f64 to i16
+
+  // CHECK: %[[CAST0:.*]] = emitc.cast %arg0 : f32 to ui32
+  // CHECK: emitc.cast %[[CAST0]] : ui32 to i32
+  %4 = arith.fptoui %arg0 : f32 to i32
+
+  return
+}
+
+func.func @arith_int_to_float_cast_ops(%arg0: i8, %arg1: i64) {
+  // CHECK: emitc.cast %arg0 : i8 to f32
+  %0 = arith.sitofp %arg0 : i8 to f32
+
+  // CHECK: emitc.cast %arg1 : i64 to f32
+  %1 = arith.sitofp %arg1 : i64 to f32
+
+  // CHECK: %[[CAST_UNS:.*]] = emitc.cast %arg0 : i8 to ui8
+  // CHECK: emitc.cast %[[CAST_UNS]] : ui8 to f32
+  %2 = arith.uitofp %arg0 : i8 to f32
+
+  return
+}
diff --git a/mlir/test/Conversion/ArmSMEToLLVM/arm-sme-to-llvm.mlir b/mlir/test/Conversion/ArmSMEToLLVM/arm-sme-to-llvm.mlir
index 81087cc02099..f48046a8d799 100644
--- a/mlir/test/Conversion/ArmSMEToLLVM/arm-sme-to-llvm.mlir
+++ b/mlir/test/Conversion/ArmSMEToLLVM/arm-sme-to-llvm.mlir
@@ -25,6 +25,7 @@ func.func @arm_sme_load_tile_slice_hor_i8(%src : memref<?x?xi8>, %mask : vector<
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[16]x[16]xi8>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index : memref<?x?xi8>, vector<[16]xi1>, vector<[16]x[16]xi8>
+  "test.some_use" (%tile_update) : (vector<[16]x[16]xi8>) -> ()
   return
 }
 
@@ -36,6 +37,7 @@ func.func @arm_sme_load_tile_slice_hor_i16(%src : memref<?x?xi16>, %mask : vecto
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[8]x[8]xi16>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index : memref<?x?xi16>, vector<[8]xi1>, vector<[8]x[8]xi16>
+  "test.some_use" (%tile_update) : (vector<[8]x[8]xi16>) -> ()
   return
 }
 
@@ -47,6 +49,7 @@ func.func @arm_sme_load_tile_slice_hor_i32(%src : memref<?x?xi32>, %mask : vecto
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[4]x[4]xi32>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index : memref<?x?xi32>, vector<[4]xi1>, vector<[4]x[4]xi32>
+  "test.some_use" (%tile_update) : (vector<[4]x[4]xi32>) -> ()
   return
 }
 
@@ -58,6 +61,7 @@ func.func @arm_sme_load_tile_slice_hor_i64(%src : memref<?x?xi64>, %mask : vecto
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[2]x[2]xi64>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index : memref<?x?xi64>, vector<[2]xi1>, vector<[2]x[2]xi64>
+  "test.some_use" (%tile_update) : (vector<[2]x[2]xi64>) -> ()
   return
 }
 
@@ -69,6 +73,7 @@ func.func @arm_sme_load_tile_slice_hor_i128(%src : memref<?x?xi128>, %mask : vec
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[1]x[1]xi128>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index : memref<?x?xi128>, vector<[1]xi1>, vector<[1]x[1]xi128>
+  "test.some_use" (%tile_update) : (vector<[1]x[1]xi128>) -> ()
   return
 }
 
@@ -80,6 +85,7 @@ func.func @arm_sme_load_tile_slice_hor_f16(%src : memref<?x?xf16>, %mask : vecto
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[8]x[8]xf16>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index : memref<?x?xf16>, vector<[8]xi1>, vector<[8]x[8]xf16>
+  "test.some_use" (%tile_update) : (vector<[8]x[8]xf16>) -> ()
   return
 }
 
@@ -91,6 +97,7 @@ func.func @arm_sme_load_tile_slice_hor_bf16(%src : memref<?x?xbf16>, %mask : vec
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[8]x[8]xbf16>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index : memref<?x?xbf16>, vector<[8]xi1>, vector<[8]x[8]xbf16>
+  "test.some_use" (%tile_update) : (vector<[8]x[8]xbf16>) -> ()
   return
 }
 
@@ -102,6 +109,7 @@ func.func @arm_sme_load_tile_slice_hor_f32(%src : memref<?x?xf32>, %mask : vecto
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[4]x[4]xf32>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index : memref<?x?xf32>, vector<[4]xi1>, vector<[4]x[4]xf32>
+  "test.some_use" (%tile_update) : (vector<[4]x[4]xf32>) -> ()
   return
 }
 
@@ -113,6 +121,7 @@ func.func @arm_sme_load_tile_slice_hor_f64(%src : memref<?x?xf64>, %mask : vecto
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[2]x[2]xf64>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index : memref<?x?xf64>, vector<[2]xi1>, vector<[2]x[2]xf64>
+  "test.some_use" (%tile_update) : (vector<[2]x[2]xf64>) -> ()
   return
 }
 
@@ -124,6 +133,7 @@ func.func @arm_sme_load_tile_slice_ver_i8(%src : memref<?x?xi8>, %mask : vector<
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[16]x[16]xi8>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index layout<vertical> : memref<?x?xi8>, vector<[16]xi1>, vector<[16]x[16]xi8>
+  "test.some_use" (%tile_update) : (vector<[16]x[16]xi8>) -> ()
   return
 }
 
@@ -135,6 +145,7 @@ func.func @arm_sme_load_tile_slice_ver_i16(%src : memref<?x?xi16>, %mask : vecto
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[8]x[8]xi16>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index layout<vertical> : memref<?x?xi16>, vector<[8]xi1>, vector<[8]x[8]xi16>
+  "test.some_use" (%tile_update) : (vector<[8]x[8]xi16>) -> ()
   return
 }
 
@@ -146,6 +157,7 @@ func.func @arm_sme_load_tile_slice_ver_i32(%src : memref<?x?xi32>, %mask : vecto
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[4]x[4]xi32>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index layout<vertical> : memref<?x?xi32>, vector<[4]xi1>, vector<[4]x[4]xi32>
+  "test.some_use" (%tile_update) : (vector<[4]x[4]xi32>) -> ()
   return
 }
 
@@ -157,6 +169,7 @@ func.func @arm_sme_load_tile_slice_ver_i64(%src : memref<?x?xi64>, %mask : vecto
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[2]x[2]xi64>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index layout<vertical> : memref<?x?xi64>, vector<[2]xi1>, vector<[2]x[2]xi64>
+  "test.some_use" (%tile_update) : (vector<[2]x[2]xi64>) -> ()
   return
 }
 
@@ -168,6 +181,7 @@ func.func @arm_sme_load_tile_slice_ver_i128(%src : memref<?x?xi128>, %mask : vec
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[1]x[1]xi128>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index layout<vertical> : memref<?x?xi128>, vector<[1]xi1>, vector<[1]x[1]xi128>
+  "test.some_use" (%tile_update) : (vector<[1]x[1]xi128>) -> ()
   return
 }
 
@@ -179,6 +193,7 @@ func.func @arm_sme_load_tile_slice_ver_f16(%src : memref<?x?xf16>, %mask : vecto
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[8]x[8]xf16>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index layout<vertical> : memref<?x?xf16>, vector<[8]xi1>, vector<[8]x[8]xf16>
+  "test.some_use" (%tile_update) : (vector<[8]x[8]xf16>) -> ()
   return
 }
 
@@ -190,6 +205,7 @@ func.func @arm_sme_load_tile_slice_ver_bf16(%src : memref<?x?xbf16>, %mask : vec
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[8]x[8]xbf16>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index layout<vertical> : memref<?x?xbf16>, vector<[8]xi1>, vector<[8]x[8]xbf16>
+  "test.some_use" (%tile_update) : (vector<[8]x[8]xbf16>) -> ()
   return
 }
 
@@ -201,6 +217,7 @@ func.func @arm_sme_load_tile_slice_ver_f32(%src : memref<?x?xf32>, %mask : vecto
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[4]x[4]xf32>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index layout<vertical> : memref<?x?xf32>, vector<[4]xi1>, vector<[4]x[4]xf32>
+  "test.some_use" (%tile_update) : (vector<[4]x[4]xf32>) -> ()
   return
 }
 
@@ -212,6 +229,7 @@ func.func @arm_sme_load_tile_slice_ver_f64(%src : memref<?x?xf64>, %mask : vecto
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[2]x[2]xf64>
   %tile_update = arm_sme.load_tile_slice %src[%c0], %mask, %tile, %tile_slice_index layout<vertical> : memref<?x?xf64>, vector<[2]xi1>, vector<[2]x[2]xf64>
+  "test.some_use" (%tile_update) : (vector<[2]x[2]xf64>) -> ()
   return
 }
 
@@ -441,7 +459,8 @@ func.func @arm_sme_store_tile_slice_ver_f64(%tile_slice_index : index, %mask : v
 func.func @arm_sme_move_vector_to_tile_slice_hor_i32(%vector : vector<[4]xi32>, %tile_slice_index : index) -> () {
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[4]x[4]xi32>
-  arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[4]xi32> into vector<[4]x[4]xi32>
+  %tile_update = arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index : vector<[4]xi32> into vector<[4]x[4]xi32>
+  "test.some_use" (%tile_update) : (vector<[4]x[4]xi32>) -> ()
   return
 }
 
@@ -452,7 +471,8 @@ func.func @arm_sme_move_vector_to_tile_slice_hor_i32(%vector : vector<[4]xi32>,
 func.func @arm_sme_move_vector_to_tile_slice_ver_bf16(%vector : vector<[8]xbf16>, %tile_slice_index : index) -> () {
   %c0 = arith.constant 0 : index
   %tile = arm_sme.get_tile : vector<[8]x[8]xbf16>
-  arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index layout<vertical> : vector<[8]xbf16> into vector<[8]x[8]xbf16>
+  %tile_update =  arm_sme.move_vector_to_tile_slice %vector, %tile, %tile_slice_index layout<vertical> : vector<[8]xbf16> into vector<[8]x[8]xbf16>
+  "test.some_use" (%tile_update) : (vector<[8]x[8]xbf16>) -> ()
   return
 }
 
diff --git a/mlir/test/Conversion/ArmSMEToLLVM/tile-spills-and-fills.mlir b/mlir/test/Conversion/ArmSMEToLLVM/tile-spills-and-fills.mlir
index 7a9e6b421575..a9c1a65a296f 100644
--- a/mlir/test/Conversion/ArmSMEToLLVM/tile-spills-and-fills.mlir
+++ b/mlir/test/Conversion/ArmSMEToLLVM/tile-spills-and-fills.mlir
@@ -72,17 +72,32 @@ func.func @use_too_many_tiles() {
 //  AFTER-LLVM-LOWERING-DAG: %[[C8:.*]] = arith.constant 8 : index
 //  AFTER-LLVM-LOWERING-DAG: %[[VSCALE:.*]] = vector.vscale
 //  AFTER-LLVM-LOWERING-DAG: %[[SVL_H:.*]] = arith.muli %[[VSCALE]], %[[C8]] : index
+
+///     0. Create an in-memory-tile
+///        Note: 16 is an in-memory tile ID, that is a tile ID >= 16
+
 //  AFTER-LLVM-LOWERING-DAG: %[[TILE_ALLOCA:.*]] = memref.alloca(%[[SVL_H]], %[[SVL_H]])
 // AFTER-LLVM-LOWERING-SAME:   {arm_sme.in_memory_tile_id = 16 : i32} : memref<?x?xi16>
 //
 //  AFTER-LLVM-LOWERING-NOT: scf.for
-//                           Note: 17 is the mask for the 32-bit tile 0.
+
+///     1. The following instruciton corresponds to %0 after tile allocation
+///        Note: 17 is the mask for the 32-bit tile 0.
+
 //      AFTER-LLVM-LOWERING: "arm_sme.intr.zero"() <{tile_mask = 17 : i32}>
 //
 //  AFTER-LLVM-LOWERING-NOT: scf.for
-//                           Note: 34 is the mask for the 32-bit tile 1.
+
+///     2. The following instruciton corresponds to %1 after tile allocation
+///        Note: 34 is the mask for the 32-bit tile 1.
+
 //      AFTER-LLVM-LOWERING: "arm_sme.intr.zero"() <{tile_mask = 34 : i32}>
-//
+
+///     3. swap(<in-memory-tile>, tile 0).
+///        This can be interpreted as spilling %0 (the 32-bit tile 0), so that
+///        %2 can be allocated a tile (16 bit tile 0). Note that this is
+///        swapping vector<[8]x[8]xi16> rather than vector<[4]x[4]xi32>.
+
 //      AFTER-LLVM-LOWERING: scf.for
 // AFTER-LLVM-LOWERING-SAME: %[[C0]] to %[[SVL_H]] step %[[C1]] {
 //      AFTER-LLVM-LOWERING:   %[[MEM_DESC:.*]] = builtin.unrealized_conversion_cast %[[TILE_ALLOCA]]
@@ -92,8 +107,15 @@ func.func @use_too_many_tiles() {
 // AFTER-LLVM-LOWERING-NEXT:   "arm_sme.intr.ld1h.horiz"({{.*}}, %[[SLICE_PTR]], {{.*}}) <{tile_id = 0 : i32}>
 // AFTER-LLVM-LOWERING-NEXT:   vector.store %[[SLICE]], %[[TILE_ALLOCA]]
 // AFTER-LLVM-LOWERING-NEXT: }
-//                           Note: 85 is the mask for the 16-bit tile 0.
+
+///     4. The following instruciton corresponds to %3 after tile allocation
+///        Note: 85 is the mask for the 16-bit tile 0.
+
 //      AFTER-LLVM-LOWERING: "arm_sme.intr.zero"() <{tile_mask = 85 : i32}>
+
+///     5.  swap(<inMemoryTile>, tile 0)
+///         This can be interpreted as restoring %0.
+
 //      AFTER-LLVM-LOWERING: scf.for
 // AFTER-LLVM-LOWERING-SAME: %[[C0]] to %[[SVL_H]] step %[[C1]] {
 //      AFTER-LLVM-LOWERING:   %[[MEM_DESC:.*]] = builtin.unrealized_conversion_cast %[[TILE_ALLOCA]]
@@ -116,7 +138,7 @@ func.func @very_excessive_spills(%memref : memref<?x?xf32>) -> vector<[4]x[4]xf3
   %tile = arm_sme.get_tile : vector<[4]x[4]xf32>
   %mask = vector.constant_mask [4] : vector<[4]xi1>
   %loadSlice = arm_sme.load_tile_slice %memref[%c0, %c0], %mask, %tile, %c0 : memref<?x?xf32>, vector<[4]xi1>, vector<[4]x[4]xf32>
-  return %loadSlice : vector<[4]x[4]xf32>
+  "test.some_use"(%loadSlice) : (vector<[4]x[4]xf32>) -> ()
 }
 // AFTER-TILE-ALLOC-LABEL: @very_excessive_spills
 //      AFTER-TILE-ALLOC: arm_sme.get_tile
@@ -133,22 +155,38 @@ func.func @very_excessive_spills(%memref : memref<?x?xf32>) -> vector<[4]x[4]xf3
 //  AFTER-LLVM-LOWERING-DAG: %[[TILE_ALLOCA:.*]] = memref.alloca(%[[SVL_S]], %[[SVL_S]])
 // AFTER-LLVM-LOWERING-SAME:   {arm_sme.in_memory_tile_id = 16 : i32} : memref<?x?xf32>
 //
+
+/// 1. Swap %useAllTiles and %tile - note that this will only swap one 32-bit
+///    tile (vector<[4]x[4]xf32>)
+
 //      AFTER-LLVM-LOWERING: scf.for
 // AFTER-LLVM-LOWERING-SAME: %[[C0]] to %[[SVL_S]] step %[[C1]] {
 //      AFTER-LLVM-LOWERING:   %[[MEM_DESC:.*]] = builtin.unrealized_conversion_cast %[[TILE_ALLOCA]]
 //      AFTER-LLVM-LOWERING:   %[[BASE_PTR:.*]] = llvm.extractvalue %[[MEM_DESC]][1]
 //      AFTER-LLVM-LOWERING:   %[[SLICE_PTR:.*]] = llvm.getelementptr %[[BASE_PTR]]
+// Read ZA tile slice -> vector
 //      AFTER-LLVM-LOWERING:   %[[SLICE:.*]] = "arm_sme.intr.read.horiz"{{.*}} <{tile_id = 0 : i32}>
+/// Load vector from memory -> ZA tile
 // AFTER-LLVM-LOWERING-NEXT:   "arm_sme.intr.ld1w.horiz"({{.*}}, %[[SLICE_PTR]], {{.*}}) <{tile_id = 0 : i32}>
+/// Store ZA tile slice in memory
 // AFTER-LLVM-LOWERING-NEXT:   vector.store %[[SLICE]], %[[TILE_ALLOCA]]
 // AFTER-LLVM-LOWERING-NEXT: }
+
+/// 2. Load into %tile
 //      AFTER-LLVM-LOWERING: "arm_sme.intr.ld1w.horiz"{{.*}} <{tile_id = 0 : i32}>
+
+/// 3. Swap %useAllTiles and %tile - note that this will only swap one 32-bit
+///    tile (vector<[4]x[4]xf32>)
+
 //      AFTER-LLVM-LOWERING: scf.for
 // AFTER-LLVM-LOWERING-SAME: %[[C0]] to %[[SVL_S]] step %[[C1]] {
 //      AFTER-LLVM-LOWERING:   %[[MEM_DESC:.*]] = builtin.unrealized_conversion_cast %[[TILE_ALLOCA]]
 //      AFTER-LLVM-LOWERING:   %[[BASE_PTR:.*]] = llvm.extractvalue %[[MEM_DESC]][1]
 //      AFTER-LLVM-LOWERING:   %[[SLICE_PTR:.*]] = llvm.getelementptr %[[BASE_PTR]]
+/// Read ZA tile slice -> vector
 //      AFTER-LLVM-LOWERING:   %[[SLICE:.*]] = "arm_sme.intr.read.horiz"{{.*}} <{tile_id = 0 : i32}>
+/// Load vector from memory -> ZA tile
 // AFTER-LLVM-LOWERING-NEXT:   "arm_sme.intr.ld1w.horiz"({{.*}}, %[[SLICE_PTR]], {{.*}}) <{tile_id = 0 : i32}>
+/// Store ZA tile slice in memory
 // AFTER-LLVM-LOWERING-NEXT:   vector.store %[[SLICE]], %[[TILE_ALLOCA]]
 // AFTER-LLVM-LOWERING-NEXT: }
diff --git a/mlir/test/Conversion/ArmSMEToLLVM/unsupported.mlir b/mlir/test/Conversion/ArmSMEToLLVM/unsupported.mlir
index 59665c471921..15767ff1dec3 100644
--- a/mlir/test/Conversion/ArmSMEToLLVM/unsupported.mlir
+++ b/mlir/test/Conversion/ArmSMEToLLVM/unsupported.mlir
@@ -9,6 +9,6 @@ func.func @arm_sme_outerproduct_unsupported_type(%lhs : vector<[16]xi8>, %rhs :
   // expected-error@+2 {{failed to legalize operation 'arm_sme.outerproduct'}}
   // expected-error@+1 {{unsupported type}}
   %0 = arm_sme.outerproduct %lhs, %rhs  acc(%acc) : vector<[16]xi8>, vector<[16]xi8>
-  "prevent.dce"(%0) : (vector<[16]x[16]xi8>) -> ()
+  "test.some_use"(%0) : (vector<[16]x[16]xi8>) -> ()
 }
 
diff --git a/mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir b/mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir
index 6c393bc38af9..a2f2beff78c4 100644
--- a/mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir
+++ b/mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir
@@ -20,6 +20,7 @@
 func.func @arm_sme_tile_load_hor(%src : memref<?x?xi32>) {
   %c0 = arith.constant 0 : index
   %tile = arm_sme.tile_load %src[%c0, %c0] : memref<?x?xi32>, vector<[4]x[4]xi32>
+  "test.some_use" (%tile) : (vector<[4]x[4]xi32>) -> ()
   return
 }
 
@@ -30,6 +31,7 @@ func.func @arm_sme_tile_load_hor(%src : memref<?x?xi32>) {
 func.func @arm_sme_tile_load_ver(%src : memref<?x?xi32>) {
   %c0 = arith.constant 0 : index
   %tile = arm_sme.tile_load %src[%c0, %c0] layout<vertical> : memref<?x?xi32>, vector<[4]x[4]xi32>
+  "test.some_use" (%tile) : (vector<[4]x[4]xi32>) -> ()
   return
 }
 
@@ -60,6 +62,7 @@ func.func @arm_sme_tile_load_hor_with_mask_and_pad_zero(%src : memref<?x?xi32>)
   %pad = arith.constant 0 : i32
   %mask = vector.create_mask %c3, %c2 : vector<[4]x[4]xi1>
   %tile = arm_sme.tile_load %src[%c0, %c0], %pad, %mask : memref<?x?xi32>, vector<[4]x[4]xi32>
+  "test.some_use" (%tile) : (vector<[4]x[4]xi32>) -> ()
   return
 }
 
@@ -94,6 +97,7 @@ func.func @arm_sme_tile_load_hor_with_mask_and_nonzero_pad(%src : memref<?x?xi32
   %c3 = arith.constant 3 : index
   %mask = vector.create_mask %c3, %c2 : vector<[4]x[4]xi1>
   %tile = arm_sme.tile_load %src[%c0, %c0], %pad, %mask : memref<?x?xi32>, vector<[4]x[4]xi32>
+  "test.some_use" (%tile) : (vector<[4]x[4]xi32>) -> ()
   return
 }
 
@@ -104,6 +108,7 @@ func.func @arm_sme_tile_load_zero_pad__unsupported_mask_op(%src : memref<?x?xi32
   %pad = arith.constant 0 : i32
   // expected-error@+1 {{failed to legalize operation 'arm_sme.tile_load' that was explicitly marked illegal}}
   %tile = arm_sme.tile_load %src[%c0, %c0], %pad, %mask : memref<?x?xi32>, vector<[4]x[4]xi32>
+  "test.some_use" (%tile) : (vector<[4]x[4]xi32>) -> ()
   return
 }
 
@@ -113,6 +118,7 @@ func.func @arm_sme_tile_load_nonzero_pad__unsupported_mask_op(%src : memref<?x?x
   %c0 = arith.constant 0 : index
   // expected-error@+1 {{failed to legalize operation 'arm_sme.tile_load' that was explicitly marked illegal}}
   %tile = arm_sme.tile_load %src[%c0, %c0], %pad, %mask : memref<?x?xi32>, vector<[4]x[4]xi32>
+  "test.some_use" (%tile) : (vector<[4]x[4]xi32>) -> ()
   return
 }
 
diff --git a/mlir/test/Conversion/GPUCommon/transfer_write.mlir b/mlir/test/Conversion/GPUCommon/transfer_write.mlir
index cba85915b49e..cd62b7b13fa9 100644
--- a/mlir/test/Conversion/GPUCommon/transfer_write.mlir
+++ b/mlir/test/Conversion/GPUCommon/transfer_write.mlir
@@ -1,13 +1,13 @@
-// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s
-
-  func.func @warp_extract(%arg0: index, %arg1: memref<1024x1024xf32>, %arg2: index, %arg3: vector<1xf32>) {
-    %c0 = arith.constant 0 : index
-    vector.warp_execute_on_lane_0(%arg0)[32] {
-      // CHECK:%[[val:[0-9]+]] = llvm.extractelement
-      // CHECK:%[[base:[0-9]+]] = llvm.extractvalue
-      // CHECK:%[[ptr:[0-9]+]] = llvm.getelementptr %[[base]]
-      // CHECK:llvm.store %[[val]], %[[ptr]]
-      vector.transfer_write %arg3, %arg1[%c0, %c0] {in_bounds = [true]} : vector<1xf32>, memref<1024x1024xf32>
-    }
-    return
-  }
+// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s
+
+  func.func @warp_extract(%arg0: index, %arg1: memref<1024x1024xf32>, %arg2: index, %arg3: vector<1xf32>) {
+    %c0 = arith.constant 0 : index
+    vector.warp_execute_on_lane_0(%arg0)[32] {
+      // CHECK:%[[val:[0-9]+]] = llvm.extractelement
+      // CHECK:%[[base:[0-9]+]] = llvm.extractvalue
+      // CHECK:%[[ptr:[0-9]+]] = llvm.getelementptr %[[base]]
+      // CHECK:llvm.store %[[val]], %[[ptr]]
+      vector.transfer_write %arg3, %arg1[%c0, %c0] {in_bounds = [true]} : vector<1xf32>, memref<1024x1024xf32>
+    }
+    return
+  }
diff --git a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
index b8c3d56f21f1..72e7e4cc8408 100644
--- a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
+++ b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
@@ -394,6 +394,21 @@ func.func @test_reshape_6d_down_s2s_auto(%arg0: tensor<1x2x3x5x7x11xf32>) -> ten
 
 // -----
 
+// This test would previously fail on GCC with certain compiler flags.
+// The GCC issue would cause invalid IR after tosa-to-tensor, so this test
+// locks down that the code goes through tosa-to-tensor and verifies.
+//
+// See https://github.com/llvm/llvm-project/pull/91521 for a full description.
+
+// CHECK-LABEL: reshape_bug_fix
+// CHECK: tensor.expand_shape
+func.func @reshape_bug_fix(%arg0: tensor<?xf32>) -> tensor<1x1x1x?xf32> {
+  %0 = tosa.reshape %arg0 {new_shape = array<i64: 1, 1, 1, -1>} : (tensor<?xf32>) -> tensor<1x1x1x?xf32>
+  return %0 : tensor<1x1x1x?xf32>
+}
+
+// -----
+
 // CHECK-LABEL: test_reshape_6d_down_s2s_explicit
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<1x2x3x5x7x11xf32>
 // CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2], [3], [4, 5]] : tensor<1x2x3x5x7x11xf32> into tensor<6x5x77xf32>
diff --git a/mlir/test/Dialect/Affine/scalrep.mlir b/mlir/test/Dialect/Affine/scalrep.mlir
index 22d394bfcf09..4a99dee50a28 100644
--- a/mlir/test/Dialect/Affine/scalrep.mlir
+++ b/mlir/test/Dialect/Affine/scalrep.mlir
@@ -682,6 +682,24 @@ func.func @redundant_store_elim(%out : memref<512xf32>) {
 // CHECK-NEXT:   affine.store
 // CHECK-NEXT: }
 
+// CHECK-LABEL: func @redundant_store_elim_nonintervening
+
+func.func @redundant_store_elim_nonintervening(%in : memref<512xf32>) {
+  %cf1 = arith.constant 1.0 : f32
+  %out = memref.alloc() :  memref<512xf32>
+  affine.for %i = 0 to 16 {
+    affine.store %cf1, %out[32*%i] : memref<512xf32>
+    %0 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %0, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
+
+// CHECK: affine.for
+// CHECK-NEXT:   affine.load
+// CHECK-NEXT:   affine.store
+// CHECK-NEXT: }
+
 // CHECK-LABEL: func @redundant_store_elim_fail
 
 func.func @redundant_store_elim_fail(%out : memref<512xf32>) {
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index f7ce2123a93c..e4f95bb0545a 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -833,6 +833,30 @@ func.func @tripleAddAdd(%arg0: index) -> index {
   return %add2 : index
 }
 
+// CHECK-LABEL: @tripleAddAddOvf1
+//       CHECK:   %[[cres:.+]] = arith.constant 59 : index
+//       CHECK:   %[[add:.+]] = arith.addi %arg0, %[[cres]] overflow<nsw, nuw> : index
+//       CHECK:   return %[[add]]
+func.func @tripleAddAddOvf1(%arg0: index) -> index {
+  %c17 = arith.constant 17 : index
+  %c42 = arith.constant 42 : index
+  %add1 = arith.addi %c17, %arg0 overflow<nsw, nuw> : index
+  %add2 = arith.addi %c42, %add1 overflow<nsw, nuw> : index
+  return %add2 : index
+}
+
+// CHECK-LABEL: @tripleAddAddOvf2
+//       CHECK:   %[[cres:.+]] = arith.constant 59 : index
+//       CHECK:   %[[add:.+]] = arith.addi %arg0, %[[cres]] : index
+//       CHECK:   return %[[add]]
+func.func @tripleAddAddOvf2(%arg0: index) -> index {
+  %c17 = arith.constant 17 : index
+  %c42 = arith.constant 42 : index
+  %add1 = arith.addi %c17, %arg0 overflow<nsw> : index
+  %add2 = arith.addi %c42, %add1 overflow<nuw> : index
+  return %add2 : index
+}
+
 // CHECK-LABEL: @tripleAddSub0
 //       CHECK:   %[[cres:.+]] = arith.constant 59 : index
 //       CHECK:   %[[add:.+]] = arith.subi %[[cres]], %arg0 : index
@@ -845,6 +869,18 @@ func.func @tripleAddSub0(%arg0: index) -> index {
   return %add2 : index
 }
 
+// CHECK-LABEL: @tripleAddSub0Ovf
+//       CHECK:   %[[cres:.+]] = arith.constant 59 : index
+//       CHECK:   %[[add:.+]] = arith.subi %[[cres]], %arg0 overflow<nsw, nuw> : index
+//       CHECK:   return %[[add]]
+func.func @tripleAddSub0Ovf(%arg0: index) -> index {
+  %c17 = arith.constant 17 : index
+  %c42 = arith.constant 42 : index
+  %add1 = arith.subi %c17, %arg0 overflow<nsw, nuw> : index
+  %add2 = arith.addi %c42, %add1 overflow<nsw, nuw> : index
+  return %add2 : index
+}
+
 // CHECK-LABEL: @tripleAddSub1
 //       CHECK:   %[[cres:.+]] = arith.constant 25 : index
 //       CHECK:   %[[add:.+]] = arith.addi %arg0, %[[cres]] : index
@@ -857,6 +893,18 @@ func.func @tripleAddSub1(%arg0: index) -> index {
   return %add2 : index
 }
 
+// CHECK-LABEL: @tripleAddSub1Ovf
+//       CHECK:   %[[cres:.+]] = arith.constant 25 : index
+//       CHECK:   %[[add:.+]] = arith.addi %arg0, %[[cres]] overflow<nsw, nuw> : index
+//       CHECK:   return %[[add]]
+func.func @tripleAddSub1Ovf(%arg0: index) -> index {
+  %c17 = arith.constant 17 : index
+  %c42 = arith.constant 42 : index
+  %add1 = arith.subi %arg0, %c17 overflow<nsw, nuw> : index
+  %add2 = arith.addi %c42, %add1 overflow<nsw, nuw> : index
+  return %add2 : index
+}
+
 // CHECK-LABEL: @tripleSubAdd0
 //       CHECK:   %[[cres:.+]] = arith.constant 25 : index
 //       CHECK:   %[[add:.+]] = arith.subi %[[cres]], %arg0 : index
@@ -869,6 +917,18 @@ func.func @tripleSubAdd0(%arg0: index) -> index {
   return %add2 : index
 }
 
+// CHECK-LABEL: @tripleSubAdd0Ovf
+//       CHECK:   %[[cres:.+]] = arith.constant 25 : index
+//       CHECK:   %[[add:.+]] = arith.subi %[[cres]], %arg0 overflow<nsw, nuw> : index
+//       CHECK:   return %[[add]]
+func.func @tripleSubAdd0Ovf(%arg0: index) -> index {
+  %c17 = arith.constant 17 : index
+  %c42 = arith.constant 42 : index
+  %add1 = arith.addi %c17, %arg0 overflow<nsw, nuw> : index
+  %add2 = arith.subi %c42, %add1 overflow<nsw, nuw> : index
+  return %add2 : index
+}
+
 // CHECK-LABEL: @tripleSubAdd1
 //       CHECK:   %[[cres:.+]] = arith.constant -25 : index
 //       CHECK:   %[[add:.+]] = arith.addi %arg0, %[[cres]] : index
@@ -891,6 +951,16 @@ func.func @subSub0(%arg0: index, %arg1: index) -> index {
   return %sub2 : index
 }
 
+// CHECK-LABEL: @subSub0Ovf
+//       CHECK:   %[[c0:.+]] = arith.constant 0 : index
+//       CHECK:   %[[add:.+]] = arith.subi %[[c0]], %arg1 overflow<nsw, nuw> : index
+//       CHECK:   return %[[add]]
+func.func @subSub0Ovf(%arg0: index, %arg1: index) -> index {
+  %sub1 = arith.subi %arg0, %arg1 overflow<nsw, nuw> : index
+  %sub2 = arith.subi %sub1, %arg0 overflow<nsw, nuw> : index
+  return %sub2 : index
+}
+
 // CHECK-LABEL: @tripleSubSub0
 //       CHECK:   %[[cres:.+]] = arith.constant 25 : index
 //       CHECK:   %[[add:.+]] = arith.addi %arg0, %[[cres]] : index
@@ -903,6 +973,19 @@ func.func @tripleSubSub0(%arg0: index) -> index {
   return %add2 : index
 }
 
+// CHECK-LABEL: @tripleSubSub0Ovf
+//       CHECK:   %[[cres:.+]] = arith.constant 25 : index
+//       CHECK:   %[[add:.+]] = arith.addi %arg0, %[[cres]] overflow<nsw, nuw> : index
+//       CHECK:   return %[[add]]
+func.func @tripleSubSub0Ovf(%arg0: index) -> index {
+  %c17 = arith.constant 17 : index
+  %c42 = arith.constant 42 : index
+  %add1 = arith.subi %c17, %arg0 overflow<nsw, nuw> : index
+  %add2 = arith.subi %c42, %add1 overflow<nsw, nuw> : index
+  return %add2 : index
+}
+
+
 // CHECK-LABEL: @tripleSubSub1
 //       CHECK:   %[[cres:.+]] = arith.constant -25 : index
 //       CHECK:   %[[add:.+]] = arith.subi %[[cres]], %arg0 : index
@@ -915,6 +998,18 @@ func.func @tripleSubSub1(%arg0: index) -> index {
   return %add2 : index
 }
 
+// CHECK-LABEL: @tripleSubSub1Ovf
+//       CHECK:   %[[cres:.+]] = arith.constant -25 : index
+//       CHECK:   %[[add:.+]] = arith.subi %[[cres]], %arg0 overflow<nsw, nuw> : index
+//       CHECK:   return %[[add]]
+func.func @tripleSubSub1Ovf(%arg0: index) -> index {
+  %c17 = arith.constant 17 : index
+  %c42 = arith.constant 42 : index
+  %add1 = arith.subi %c17, %arg0 overflow<nsw, nuw> : index
+  %add2 = arith.subi %add1, %c42 overflow<nsw, nuw> : index
+  return %add2 : index
+}
+
 // CHECK-LABEL: @tripleSubSub2
 //       CHECK:   %[[cres:.+]] = arith.constant 59 : index
 //       CHECK:   %[[add:.+]] = arith.subi %[[cres]], %arg0 : index
@@ -927,6 +1022,18 @@ func.func @tripleSubSub2(%arg0: index) -> index {
   return %add2 : index
 }
 
+// CHECK-LABEL: @tripleSubSub2Ovf
+//       CHECK:   %[[cres:.+]] = arith.constant 59 : index
+//       CHECK:   %[[add:.+]] = arith.subi %[[cres]], %arg0 overflow<nsw, nuw> : index
+//       CHECK:   return %[[add]]
+func.func @tripleSubSub2Ovf(%arg0: index) -> index {
+  %c17 = arith.constant 17 : index
+  %c42 = arith.constant 42 : index
+  %add1 = arith.subi %arg0, %c17 overflow<nsw, nuw> : index
+  %add2 = arith.subi %c42, %add1 overflow<nsw, nuw> : index
+  return %add2 : index
+}
+
 // CHECK-LABEL: @tripleSubSub3
 //       CHECK:   %[[cres:.+]] = arith.constant 59 : index
 //       CHECK:   %[[add:.+]] = arith.subi %arg0, %[[cres]] : index
@@ -939,6 +1046,18 @@ func.func @tripleSubSub3(%arg0: index) -> index {
   return %add2 : index
 }
 
+// CHECK-LABEL: @tripleSubSub3Ovf
+//       CHECK:   %[[cres:.+]] = arith.constant 59 : index
+//       CHECK:   %[[add:.+]] = arith.subi %arg0, %[[cres]] overflow<nsw, nuw> : index
+//       CHECK:   return %[[add]]
+func.func @tripleSubSub3Ovf(%arg0: index) -> index {
+  %c17 = arith.constant 17 : index
+  %c42 = arith.constant 42 : index
+  %add1 = arith.subi %arg0, %c17 overflow<nsw, nuw> : index
+  %add2 = arith.subi %add1, %c42 overflow<nsw, nuw> : index
+  return %add2 : index
+}
+
 // CHECK-LABEL: @subAdd1
 //  CHECK-NEXT:   return %arg0
 func.func @subAdd1(%arg0: index, %arg1 : index) -> index {
diff --git a/mlir/test/Dialect/ArmSME/tile-allocation.mlir b/mlir/test/Dialect/ArmSME/basic-tile-allocation.mlir
index 9c368dd4fa23..e144bac970a7 100644
--- a/mlir/test/Dialect/ArmSME/tile-allocation.mlir
+++ b/mlir/test/Dialect/ArmSME/basic-tile-allocation.mlir
@@ -1,9 +1,10 @@
-// RUN: mlir-opt %s -allocate-arm-sme-tiles -split-input-file -verify-diagnostics | FileCheck %s
+// RUN: mlir-opt %s -allocate-arm-sme-tiles -split-input-file | FileCheck %s
 
 // -----
 
+// Note: Tile IDs >= 16 are in-memory tile IDs (i.e. spills).
+
 // CHECK-LABEL: mixed_tiles
-// CHECK-SAME: attributes {arm_sme.tiles_in_use = 65534 : i32}
 func.func @mixed_tiles() {
   // ZA0.Q, ZA2.Q, ZA4.Q, ZA6.Q, ZA8.Q, ZA10.Q, ZA12.Q, ZA14.Q
   // CHECK-NEXT: tile_id = 0
@@ -18,76 +19,61 @@ func.func @mixed_tiles() {
   // CHECK-NEXT: tile_id = 7
   %za7_q = arm_sme.get_tile : vector<[1]x[1]xi128>
   // ZA15.Q is still free.
+  "test.some_use"(%za0_h) : (vector<[8]x[8]xi16>) -> ()
+  "test.some_use"(%za1_s) : (vector<[4]x[4]xi32>) -> ()
+  "test.some_use"(%za3_d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%za7_q) : (vector<[1]x[1]xi128>) -> ()
   return
 }
 
 // -----
 
 // CHECK-LABEL: za_b
-// CHECK-SAME: attributes {arm_sme.tiles_in_use = 65535 : i32}
 func.func @za_b() {
   // CHECK-NEXT: tile_id = 0
   %za0_b = arm_sme.get_tile : vector<[16]x[16]xi8>
-  return
-}
-
-// -----
-
-func.func @za_b__out_of_tiles() {
-  %za0_b = arm_sme.get_tile : vector<[16]x[16]xi8>
-  // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}}
+  // Next tile is in-memory:
+  // CHECK-NEXT: tile_id = 16
   %next_tile = arm_sme.get_tile : vector<[16]x[16]xi8>
+  "test.some_use"(%za0_b) : (vector<[16]x[16]xi8>) -> ()
+  "test.some_use"(%next_tile) : (vector<[16]x[16]xi8>) -> ()
   return
 }
 
 // -----
 
+// CHECK-LABEL: za_b_overlapping_za_q
 func.func @za_b_overlapping_za_q() {
+  // CHECK-NEXT: tile_id = 0
   %za0_b = arm_sme.get_tile : vector<[16]x[16]xi8>
-  // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}}
+  // Next tile is in-memory:
+  // CHECK-NEXT: tile_id = 16
   %next_tile = arm_sme.get_tile : vector<[1]x[1]xi128>
-  return
-}
-
-// -----
-
-// CHECK-LABEL: za0_h
-// CHECK-SAME: attributes {arm_sme.tiles_in_use = 43690 : i32}
-func.func @za0_h() {
-  // CHECK-NEXT: tile_id = 0
-  %za0_h = arm_sme.get_tile : vector<[8]x[8]xi16>
+  "test.some_use"(%za0_b) : (vector<[16]x[16]xi8>) -> ()
+  "test.some_use"(%next_tile) : (vector<[1]x[1]xi128>) -> ()
   return
 }
 
 // -----
 
 // CHECK-LABEL: za_h
-// CHECK-SAME: attributes {arm_sme.tiles_in_use = 65535 : i32}
 func.func @za_h() {
   // CHECK-NEXT: tile_id = 0
   %za0_h = arm_sme.get_tile : vector<[8]x[8]xi16>
   // CHECK-NEXT: tile_id = 1
   %za1_h = arm_sme.get_tile : vector<[8]x[8]xi16>
-  return
-}
-
-// -----
-
-// CHECK-LABEL: za_h__out_of_tiles
-func.func @za_h__out_of_tiles() {
-  // CHECK-NEXT: tile_id = 0
-  %za0_h = arm_sme.get_tile : vector<[8]x[8]xi16>
-  // CHECK-NEXT: tile_id = 1
-  %za1_h = arm_sme.get_tile : vector<[8]x[8]xi16>
-  // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}}
+  // Next tile is in-memory:
+  // CHECK-NEXT: tile_id = 16
   %next_tile = arm_sme.get_tile : vector<[8]x[8]xi16>
+  "test.some_use"(%za0_h) : (vector<[8]x[8]xi16>) -> ()
+  "test.some_use"(%za1_h) : (vector<[8]x[8]xi16>) -> ()
+  "test.some_use"(%next_tile) : (vector<[8]x[8]xi16>) -> ()
   return
 }
 
 // -----
 
 // CHECK-LABEL: za_h_overlapping_za_s
-// CHECK-SAME: attributes {arm_sme.tiles_in_use = 65535 : i32}
 func.func @za_h_overlapping_za_s() {
   // ZA0.Q, ZA2.Q, ZA4.Q, ZA6.Q, ZA8.Q, ZA10.Q, ZA12.Q, ZA14.Q
   // CHECK-NEXT: tile_id = 0
@@ -98,13 +84,15 @@ func.func @za_h_overlapping_za_s() {
   // ZA3.Q, ZA7.Q, ZA11.Q, ZA15.Q
   // CHECK-NEXT: tile_id = 3
   %za3_s = arm_sme.get_tile : vector<[4]x[4]xi32>
+  "test.some_use"(%za0_h) : (vector<[8]x[8]xi16>) -> ()
+  "test.some_use"(%za1_s) : (vector<[4]x[4]xi32>) -> ()
+  "test.some_use"(%za3_s) : (vector<[4]x[4]xi32>) -> ()
   return
 }
 
 // -----
 
 // CHECK-LABEL: za_h_overlapping_za_d
-// CHECK-SAME: attributes {arm_sme.tiles_in_use = 65535 : i32}
 func.func @za_h_overlapping_za_d() {
   // ZA0.Q, ZA2.Q, ZA4.Q, ZA6.Q, ZA8.Q, ZA10.Q, ZA12.Q, ZA14.Q
   // CHECK-NEXT: tile_id = 0
@@ -121,40 +109,55 @@ func.func @za_h_overlapping_za_d() {
   // ZA7.Q, ZA15.Q
   // CHECK-NEXT: tile_id = 7
   %za7_d = arm_sme.get_tile : vector<[2]x[2]xi64>
+  "test.some_use"(%za0_h) : (vector<[8]x[8]xi16>) -> ()
+  "test.some_use"(%za1_d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%za3_d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%za5_d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%za7_d) : (vector<[2]x[2]xi64>) -> ()
   return
 }
 
 // -----
 
+// CHECK-LABEL: za_h_overlapping_za_q
 func.func @za_h_overlapping_za_q() {
+  // CHECK-NEXT: tile_id = 0
   %za0_h = arm_sme.get_tile : vector<[8]x[8]xi16>
-  %za0_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za2_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za4_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za6_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za8_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za10_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za12_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za14_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}}
+  // CHECK-NEXT: tile_id = 1
+  %za1_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 3
+  %za3_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 5
+  %za5_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 7
+  %za7_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 9
+  %za9_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 11
+  %za11_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 13
+  %za13_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 15
+  %za15_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // Next tile is in-memory:
+  // CHECK-NEXT: tile_id = 16
   %next_tile = arm_sme.get_tile : vector<[1]x[1]xi128>
-  return
-}
-
-// -----
-
-// CHECK-LABEL: za0_s
-// CHECK-SAME: attributes {arm_sme.tiles_in_use = 34952 : i32}
-func.func @za0_s() {
-  // CHECK-NEXT: tile_id = 0
-  %za0_s = arm_sme.get_tile : vector<[4]x[4]xi32>
+  "test.some_use"(%za0_h) : (vector<[8]x[8]xi16>) -> ()
+  "test.some_use"(%za1_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za3_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za5_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za7_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za9_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za11_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za13_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za15_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%next_tile) : (vector<[1]x[1]xi128>) -> ()
   return
 }
 
 // -----
 
 // CHECK-LABEL: za_s
-// CHECK-SAME: attributes {arm_sme.tiles_in_use = 65535 : i32}
 func.func @za_s() {
   // CHECK-NEXT: tile_id = 0
   %za0_s = arm_sme.get_tile : vector<[4]x[4]xi32>
@@ -164,25 +167,20 @@ func.func @za_s() {
   %za2_s = arm_sme.get_tile : vector<[4]x[4]xi32>
   // CHECK-NEXT: tile_id = 3
   %za3_s = arm_sme.get_tile : vector<[4]x[4]xi32>
-  return
-}
-
-// -----
-
-func.func @za_s__out_of_tiles() {
-  %za0_s = arm_sme.get_tile : vector<[4]x[4]xi32>
-  %za1_s = arm_sme.get_tile : vector<[4]x[4]xi32>
-  %za2_s = arm_sme.get_tile : vector<[4]x[4]xi32>
-  %za3_s = arm_sme.get_tile : vector<[4]x[4]xi32>
-  // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}}
+  // Next tile is in-memory:
+  // CHECK-NEXT: tile_id = 16
   %next_tile = arm_sme.get_tile : vector<[4]x[4]xi32>
+  "test.some_use"(%za0_s) : (vector<[4]x[4]xi32>) -> ()
+  "test.some_use"(%za1_s) : (vector<[4]x[4]xi32>) -> ()
+  "test.some_use"(%za2_s) : (vector<[4]x[4]xi32>) -> ()
+  "test.some_use"(%za3_s) : (vector<[4]x[4]xi32>) -> ()
+  "test.some_use"(%next_tile) : (vector<[4]x[4]xi32>) -> ()
   return
 }
 
 // -----
 
 // CHECK-LABEL: za_s_overlapping_za_d
-// CHECK-SAME: attributes {arm_sme.tiles_in_use = 65535 : i32}
 func.func @za_s_overlapping_za_d() {
   // ZA0.Q, ZA4.Q, ZA8.Q, ZA12.Q
   // CHECK-NEXT: tile_id = 0
@@ -199,44 +197,67 @@ func.func @za_s_overlapping_za_d() {
   // ZA7.Q, ZA15.Q
   // CHECK-NEXT: tile_id = 7
   %za7_d = arm_sme.get_tile : vector<[2]x[2]xi64>
+  "test.some_use"(%za0_s) : (vector<[4]x[4]xi32>) -> ()
+  "test.some_use"(%za1_s) : (vector<[4]x[4]xi32>) -> ()
+  "test.some_use"(%za2_s) : (vector<[4]x[4]xi32>) -> ()
+  "test.some_use"(%za3_d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%za7_d) : (vector<[2]x[2]xi64>) -> ()
   return
 }
 
 // -----
 
+// CHECK-LABEL: za_s_overlapping_za_q
 func.func @za_s_overlapping_za_q() {
+  // CHECK-NEXT: tile_id = 0
   %za0_s = arm_sme.get_tile : vector<[4]x[4]xi32>
+  // CHECK-NEXT: tile_id = 1
   %za1_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 2
   %za2_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 3
   %za3_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 5
   %za5_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 6
   %za6_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 7
   %za7_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 9
   %za9_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 10
   %za10_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 11
   %za11_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 13
   %za13_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 14
   %za14_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 15
   %za15_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}}
+  // Next tile is in-memory:
+  // CHECK-NEXT: tile_id = 16
   %next_tile = arm_sme.get_tile : vector<[1]x[1]xi128>
-  return
-}
-
-// -----
-
-// CHECK-LABEL: za0_d
-// CHECK-SAME: attributes {arm_sme.tiles_in_use = 32896 : i32}
-func.func @za0_d() {
-  // CHECK-NEXT: tile_id = 0
-  %za0_d = arm_sme.get_tile : vector<[2]x[2]xi64>
+  "test.some_use"(%za0_s) : (vector<[4]x[4]xi32>) -> ()
+  "test.some_use"(%za1_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za2_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za3_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za5_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za6_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za7_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za9_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za10_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za11_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za13_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za14_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za15_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%next_tile) : (vector<[1]x[1]xi128>) -> ()
   return
 }
 
 // -----
 
 // CHECK-LABEL: za_d
-// CHECK-SAME: attributes {arm_sme.tiles_in_use = 65535 : i32}
 func.func @za_d() {
   // CHECK-NEXT: tile_id = 0
   %za0_d = arm_sme.get_tile : vector<[2]x[2]xi64>
@@ -254,62 +275,80 @@ func.func @za_d() {
   %za6_d = arm_sme.get_tile : vector<[2]x[2]xi64>
   // CHECK-NEXT: tile_id = 7
   %za7_d = arm_sme.get_tile : vector<[2]x[2]xi64>
-  return
-}
-
-// -----
-
-func.func @za_d__out_of_tiles() {
-  %za0_d = arm_sme.get_tile : vector<[2]x[2]xi64>
-  %za1_d = arm_sme.get_tile : vector<[2]x[2]xi64>
-  %za2_d = arm_sme.get_tile : vector<[2]x[2]xi64>
-  %za3_d = arm_sme.get_tile : vector<[2]x[2]xi64>
-  %za4_d = arm_sme.get_tile : vector<[2]x[2]xi64>
-  %za5_d = arm_sme.get_tile : vector<[2]x[2]xi64>
-  %za6_d = arm_sme.get_tile : vector<[2]x[2]xi64>
-  %za7_d = arm_sme.get_tile : vector<[2]x[2]xi64>
-  // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}}
+  // Next tile is in-memory:
+  // CHECK-NEXT: tile_id = 16
   %next_tile = arm_sme.get_tile : vector<[2]x[2]xi64>
+  "test.some_use"(%za0_d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%za1_d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%za2_d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%za3_d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%za4_d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%za5_d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%za6_d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%za7_d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%next_tile) : (vector<[2]x[2]xi64>) -> ()
   return
 }
 
 // -----
 
+// CHECK-LABEL: za_d_overlapping_za_q
 func.func @za_d_overlapping_za_q() {
+  // CHECK-NEXT: tile_id = 0
   %za0_d = arm_sme.get_tile : vector<[2]x[2]xi64>
+  // CHECK-NEXT: tile_id = 1
   %za1_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 2
   %za2_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 3
   %za3_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 4
   %za4_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 5
   %za5_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 6
   %za6_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 7
   %za7_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 9
   %za9_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 10
   %za10_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 11
   %za11_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 12
   %za12_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 13
   %za13_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 14
   %za14_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  // CHECK-NEXT: tile_id = 15
   %za15_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}}
+  // Next tile is in-memory:
+  // CHECK-NEXT: tile_id = 16
   %next_tile = arm_sme.get_tile : vector<[1]x[1]xi128>
-  return
-}
-
-// -----
-
-// CHECK-LABEL: za0_q
-// CHECK-SAME: attributes {arm_sme.tiles_in_use = 32768 : i32}
-func.func @za0_q() {
-  // CHECK-NEXT: tile_id = 0
-  %za0_q = arm_sme.get_tile : vector<[1]x[1]xi128>
+  "test.some_use"(%za0_d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%za1_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za2_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za3_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za4_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za5_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za6_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za7_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za9_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za10_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za11_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za12_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za13_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za14_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za15_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%next_tile) : (vector<[1]x[1]xi128>) -> ()
   return
 }
 
 // -----
 
 // CHECK-LABEL: za_q
-// CHECK-SAME: attributes {arm_sme.tiles_in_use = 65535 : i32}
 func.func @za_q() {
   // CHECK-NEXT: tile_id = 0
   %za0_q = arm_sme.get_tile : vector<[1]x[1]xi128>
@@ -343,29 +382,25 @@ func.func @za_q() {
   %za14_q = arm_sme.get_tile : vector<[1]x[1]xi128>
   // CHECK-NEXT: tile_id = 15
   %za15_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  return
-}
-
-// -----
-
-func.func @za_q__out_of_tiles() {
-  %za0_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za1_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za2_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za3_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za4_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za5_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za6_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za7_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za8_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za9_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za10_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za11_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za12_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za13_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za14_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  %za15_q = arm_sme.get_tile : vector<[1]x[1]xi128>
-  // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}}
+  // Next tile is in-memory:
+  // CHECK-NEXT: tile_id = 16
   %next_tile = arm_sme.get_tile : vector<[1]x[1]xi128>
+  "test.some_use"(%za0_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za1_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za2_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za3_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za4_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za5_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za6_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za7_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za8_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za9_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za10_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za11_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za12_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za13_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za14_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%za15_q) : (vector<[1]x[1]xi128>) -> ()
+  "test.some_use"(%next_tile) : (vector<[1]x[1]xi128>) -> ()
   return
 }
diff --git a/mlir/test/Dialect/ArmSME/enable-arm-za.mlir b/mlir/test/Dialect/ArmSME/enable-arm-za.mlir
index a20203d7e557..d3325513a848 100644
--- a/mlir/test/Dialect/ArmSME/enable-arm-za.mlir
+++ b/mlir/test/Dialect/ArmSME/enable-arm-za.mlir
@@ -1,10 +1,9 @@
-// RUN: mlir-opt %s -enable-arm-streaming=za-mode=new-za -convert-arm-sme-to-llvm | FileCheck %s -check-prefix=ENABLE-ZA
-// RUN: mlir-opt %s -enable-arm-streaming -convert-arm-sme-to-llvm | FileCheck %s -check-prefix=DISABLE-ZA
-// RUN: mlir-opt %s -enable-arm-streaming=za-mode=in-za -convert-arm-sme-to-llvm | FileCheck %s -check-prefix=IN-ZA
-// RUN: mlir-opt %s -enable-arm-streaming=za-mode=out-za -convert-arm-sme-to-llvm | FileCheck %s -check-prefix=OUT-ZA
-// RUN: mlir-opt %s -enable-arm-streaming=za-mode=inout-za -convert-arm-sme-to-llvm | FileCheck %s -check-prefix=INOUT-ZA
-// RUN: mlir-opt %s -enable-arm-streaming=za-mode=preserves-za -convert-arm-sme-to-llvm | FileCheck %s -check-prefix=PRESERVES-ZA
-// RUN: mlir-opt %s -convert-arm-sme-to-llvm | FileCheck %s -check-prefix=NO-ARM-STREAMING
+// RUN: mlir-opt %s -enable-arm-streaming=za-mode=new-za | FileCheck %s -check-prefix=ENABLE-ZA
+// RUN: mlir-opt %s -enable-arm-streaming | FileCheck %s -check-prefix=DISABLE-ZA
+// RUN: mlir-opt %s -enable-arm-streaming=za-mode=in-za | FileCheck %s -check-prefix=IN-ZA
+// RUN: mlir-opt %s -enable-arm-streaming=za-mode=out-za | FileCheck %s -check-prefix=OUT-ZA
+// RUN: mlir-opt %s -enable-arm-streaming=za-mode=inout-za | FileCheck %s -check-prefix=INOUT-ZA
+// RUN: mlir-opt %s -enable-arm-streaming=za-mode=preserves-za | FileCheck %s -check-prefix=PRESERVES-ZA
 
 // CHECK-LABEL: @declaration
 func.func private @declaration()
@@ -22,11 +21,4 @@ func.func private @declaration()
 // DISABLE-ZA-LABEL: @arm_new_za
 // DISABLE-ZA-NOT: arm_new_za
 // DISABLE-ZA-SAME: attributes {arm_streaming}
-// NO-ARM-STREAMING-LABEL: @arm_new_za
-// NO-ARM-STREAMING-NOT: arm_new_za
-// NO-ARM-STREAMING-NOT: arm_streaming
-// NO-ARM-STREAMING-NOT: arm_in_za
-// NO-ARM-STREAMING-NOT: arm_out_za
-// NO-ARM-STREAMING-NOT: arm_inout_za
-// NO-ARM-STREAMING-NOT: arm_preserves_za
 func.func @arm_new_za() { return }
diff --git a/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir b/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir
index 01f54a4cf186..4887d611643f 100644
--- a/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir
+++ b/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -arm-sme-outer-product-fusion -cse -split-input-file -allow-unregistered-dialect | FileCheck %s
+// RUN: mlir-opt %s -arm-sme-outer-product-fusion -cse -split-input-file | FileCheck %s
 
 // CHECK-LABEL: @outerproduct_add_widening_2way_f16f16f32
 // CHECK-SAME:    %[[A0:.*]]: vector<[4]xf16>, %[[B0:.*]]: vector<[4]xf16>, %[[A1:.*]]: vector<[4]xf16>, %[[B1:.*]]: vector<[4]xf16>,
@@ -929,6 +929,7 @@ func.func @outerproduct_widening_4way__missing_acc(
   %2 = arm_sme.outerproduct %a2_ext, %b2_ext acc(%1) : vector<[4]xi32>, vector<[4]xi32>
   // Missing accumulator breaks use-def chain.
   %3 = arm_sme.outerproduct %a3_ext, %b3_ext : vector<[4]xi32>, vector<[4]xi32>
+  "test.some_use"(%2) : (vector<[4]x[4]xi32>) -> ()
 
   return %3 : vector<[4]x[4]xi32>
 }
@@ -1014,7 +1015,7 @@ func.func @outerproduct_widening_2way__cant_erase(
 
   %acc = arith.constant dense<1.0> : vector<[4]x[4]xf32>
   %0 = arm_sme.outerproduct %a0_ext, %b0_ext acc(%acc) : vector<[4]xf32>, vector<[4]xf32>
-  "fake.use"(%0) : (vector<[4]x[4]xf32>) -> ()
+  "test.some_use"(%0) : (vector<[4]x[4]xf32>) -> ()
   %1 = arm_sme.outerproduct %a1_ext, %b1_ext acc(%0) : vector<[4]xf32>, vector<[4]xf32>
 
   return %1 : vector<[4]x[4]xf32>
@@ -1048,7 +1049,7 @@ func.func @outerproduct_widening_4way__multi_use_cant_erase(
 
   %0 = arm_sme.outerproduct %a0_ext, %b0_ext : vector<[4]xi32>, vector<[4]xi32>
   %1 = arm_sme.outerproduct %a1_ext, %b1_ext acc(%0) : vector<[4]xi32>, vector<[4]xi32>
-  "fake.use"(%1) : (vector<[4]x[4]xi32>) -> ()
+  "test.some_use"(%1) : (vector<[4]x[4]xi32>) -> ()
   %2 = arm_sme.outerproduct %a2_ext, %b2_ext acc(%1) : vector<[4]xi32>, vector<[4]xi32>
   %3 = arm_sme.outerproduct %a3_ext, %b3_ext acc(%2) : vector<[4]xi32>, vector<[4]xi32>
 
diff --git a/mlir/test/Dialect/ArmSME/tile-zero-masks.mlir b/mlir/test/Dialect/ArmSME/tile-zero-masks.mlir
index 04412e4db1c5..cac2dcc24d10 100644
--- a/mlir/test/Dialect/ArmSME/tile-zero-masks.mlir
+++ b/mlir/test/Dialect/ArmSME/tile-zero-masks.mlir
@@ -9,6 +9,7 @@
 func.func @zero_za_b() {
   // CHECK: "arm_sme.intr.zero"() <{tile_mask = 255 : i32}> : () -> ()
   %zero_za0b = arm_sme.zero : vector<[16]x[16]xi8>
+  "test.some_use"(%zero_za0b) : (vector<[16]x[16]xi8>) -> ()
   return
 }
 
@@ -16,10 +17,12 @@ func.func @zero_za_b() {
 
 // CHECK-LABEL: zero_za_h
 func.func @zero_za_h() {
-  // CHECK:      "arm_sme.intr.zero"() <{tile_mask = 85 : i32}> : () -> ()
+  // CHECK: "arm_sme.intr.zero"() <{tile_mask = 85 : i32}> : () -> ()
   %zero_za0h = arm_sme.zero : vector<[8]x[8]xi16>
-  // CHECK-NEXT: "arm_sme.intr.zero"() <{tile_mask = 170 : i32}> : () -> ()
+  // CHECK: "arm_sme.intr.zero"() <{tile_mask = 170 : i32}> : () -> ()
   %zero_za1h = arm_sme.zero : vector<[8]x[8]xf16>
+  "test.some_use"(%zero_za0h) : (vector<[8]x[8]xi16>) -> ()
+  "test.some_use"(%zero_za1h) : (vector<[8]x[8]xf16>) -> ()
   return
 }
 
@@ -27,14 +30,18 @@ func.func @zero_za_h() {
 
 // CHECK-LABEL: zero_za_s
 func.func @zero_za_s() {
-  // CHECK:      arm_sme.intr.zero"() <{tile_mask = 17 : i32}> : () -> ()
+  // CHECK: arm_sme.intr.zero"() <{tile_mask = 17 : i32}> : () -> ()
   %zero_za0s = arm_sme.zero : vector<[4]x[4]xi32>
-  // CHECK-NEXT: arm_sme.intr.zero"() <{tile_mask = 34 : i32}> : () -> ()
+  // CHECK: arm_sme.intr.zero"() <{tile_mask = 34 : i32}> : () -> ()
   %zero_za1s = arm_sme.zero : vector<[4]x[4]xi32>
-  // CHECK-NEXT: arm_sme.intr.zero"() <{tile_mask = 68 : i32}> : () -> ()
+  // CHECK: arm_sme.intr.zero"() <{tile_mask = 68 : i32}> : () -> ()
   %zero_za2s = arm_sme.zero : vector<[4]x[4]xi32>
-  // CHECK-NEXT: arm_sme.intr.zero"() <{tile_mask = 136 : i32}> : () -> ()
+  // CHECK: arm_sme.intr.zero"() <{tile_mask = 136 : i32}> : () -> ()
   %zero_za3s = arm_sme.zero : vector<[4]x[4]xf32>
+  "test.some_use"(%zero_za0s) : (vector<[4]x[4]xi32>) -> ()
+  "test.some_use"(%zero_za1s) : (vector<[4]x[4]xi32>) -> ()
+  "test.some_use"(%zero_za2s) : (vector<[4]x[4]xi32>) -> ()
+  "test.some_use"(%zero_za3s) : (vector<[4]x[4]xf32>) -> ()
   return
 }
 
@@ -42,21 +49,29 @@ func.func @zero_za_s() {
 
 // CHECK-LABEL: zero_za_d
 func.func @zero_za_d() {
-  // CHECK:      "arm_sme.intr.zero"() <{tile_mask = 1 : i32}> : () -> ()
+  // CHECK: "arm_sme.intr.zero"() <{tile_mask = 1 : i32}> : () -> ()
   %zero_za0d = arm_sme.zero : vector<[2]x[2]xi64>
-  // CHECK-NEXT: "arm_sme.intr.zero"() <{tile_mask = 2 : i32}> : () -> ()
+  // CHECK: "arm_sme.intr.zero"() <{tile_mask = 2 : i32}> : () -> ()
   %zero_za1d = arm_sme.zero : vector<[2]x[2]xi64>
-  // CHECK-NEXT: "arm_sme.intr.zero"() <{tile_mask = 4 : i32}> : () -> ()
+  // CHECK: "arm_sme.intr.zero"() <{tile_mask = 4 : i32}> : () -> ()
   %zero_za2d = arm_sme.zero : vector<[2]x[2]xi64>
-  // CHECK-NEXT: "arm_sme.intr.zero"() <{tile_mask = 8 : i32}> : () -> ()
+  // CHECK: "arm_sme.intr.zero"() <{tile_mask = 8 : i32}> : () -> ()
   %zero_za3d = arm_sme.zero : vector<[2]x[2]xi64>
-  // CHECK-NEXT: "arm_sme.intr.zero"() <{tile_mask = 16 : i32}> : () -> ()
+  // CHECK: "arm_sme.intr.zero"() <{tile_mask = 16 : i32}> : () -> ()
   %zero_za4d = arm_sme.zero : vector<[2]x[2]xi64>
-  // CHECK-NEXT: "arm_sme.intr.zero"() <{tile_mask = 32 : i32}> : () -> ()
+  // CHECK: "arm_sme.intr.zero"() <{tile_mask = 32 : i32}> : () -> ()
   %zero_za5d = arm_sme.zero : vector<[2]x[2]xi64>
-  // CHECK-NEXT: "arm_sme.intr.zero"() <{tile_mask = 64 : i32}> : () -> ()
+  // CHECK: "arm_sme.intr.zero"() <{tile_mask = 64 : i32}> : () -> ()
   %zero_za6d = arm_sme.zero : vector<[2]x[2]xi64>
-  // CHECK-NEXT: "arm_sme.intr.zero"() <{tile_mask = 128 : i32}> : () -> ()
+  // CHECK: "arm_sme.intr.zero"() <{tile_mask = 128 : i32}> : () -> ()
   %zero_za7d = arm_sme.zero : vector<[2]x[2]xf64>
+  "test.some_use"(%zero_za0d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%zero_za1d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%zero_za2d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%zero_za3d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%zero_za4d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%zero_za5d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%zero_za6d) : (vector<[2]x[2]xi64>) -> ()
+  "test.some_use"(%zero_za7d) : (vector<[2]x[2]xf64>) -> ()
   return
 }
diff --git a/mlir/test/Dialect/Bufferization/Transforms/buffer-loop-hoisting.mlir b/mlir/test/Dialect/Bufferization/Transforms/buffer-loop-hoisting.mlir
index 5f8bc18b64da..ee7571b2c846 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/buffer-loop-hoisting.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/buffer-loop-hoisting.mlir
@@ -461,6 +461,38 @@ func.func @partial_hoist_multiple_loop_dependency(
 
 // -----
 
+// CHECK-LABEL: func @no_hoist_parallel
+func.func @no_hoist_parallel(
+    %lb: index,
+    %ub: index,
+    %step: index) {
+  scf.parallel (%i) = (%lb) to (%ub) step (%step) {
+      %0 = memref.alloc() : memref<2xf32>
+      scf.reduce
+  }
+  return
+}
+
+//      CHECK: memref.alloc
+// CHECK-NEXT: scf.reduce
+
+// -----
+
+func.func @no_hoist_forall(
+    %lb: index,
+    %ub: index,
+    %step: index) {
+  scf.forall (%i) = (%lb) to (%ub) step (%step) {
+      %1 = memref.alloc() : memref<2xf32>
+  }
+  return
+}
+
+//      CHECK: scf.forall
+// CHECK-NEXT: memref.alloc
+
+// -----
+
 // Test with allocas to ensure that op is also considered.
 
 // CHECK-LABEL: func @hoist_alloca
diff --git a/mlir/test/Dialect/EmitC/transforms.mlir b/mlir/test/Dialect/EmitC/transforms.mlir
index 8ac606a2c8c0..a5c582be4aa7 100644
--- a/mlir/test/Dialect/EmitC/transforms.mlir
+++ b/mlir/test/Dialect/EmitC/transforms.mlir
@@ -124,3 +124,12 @@ func.func @no_nested_expression(%arg0: i32, %arg1: i32) -> i1 {
   }
   return %a : i1
 }
+
+
+// CHECK-LABEL: func.func @single_result_requirement
+//   CHECK-NOT:  emitc.expression
+
+func.func @single_result_requirement() -> (i32, i32) {
+  %0:2 = emitc.call_opaque "foo" () : () -> (i32, i32)
+  return %0#0, %0#1 : i32, i32
+}
diff --git a/mlir/test/Dialect/LLVM/transform-e2e.mlir b/mlir/test/Dialect/LLVM/transform-e2e.mlir
index adbbbba32a40..c00b47fb936e 100644
--- a/mlir/test/Dialect/LLVM/transform-e2e.mlir
+++ b/mlir/test/Dialect/LLVM/transform-e2e.mlir
@@ -15,7 +15,7 @@ func.func @matmul_tensors(
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.consumed}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %module_op : (!transform.any_op) -> !transform.any_op
-    %1, %loops:3 = transform.structured.tile_using_for %0 [2, 2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [2, 2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     %2 = transform.get_parent_op %1 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
     transform.structured.vectorize_children_and_apply_patterns %2 : (!transform.any_op) -> !transform.any_op
     %b = transform.bufferization.one_shot_bufferize layout{IdentityLayoutMap}
diff --git a/mlir/test/Dialect/LLVMIR/canonicalize.mlir b/mlir/test/Dialect/LLVMIR/canonicalize.mlir
index 6b265bbbdbfb..15f960167cb5 100644
--- a/mlir/test/Dialect/LLVMIR/canonicalize.mlir
+++ b/mlir/test/Dialect/LLVMIR/canonicalize.mlir
@@ -248,3 +248,14 @@ llvm.func @volatile_load(%x : !llvm.ptr) {
   %3 = llvm.load %x  atomic unordered { alignment = 1 } : !llvm.ptr -> i8
   llvm.return
 }
+
+// -----
+
+// CHECK-LABEL: func @inline_asm_side_effects
+llvm.func @inline_asm_side_effects(%x : i32) {
+  // CHECK-NOT: llvm.inline_asm "pure inline asm"
+  llvm.inline_asm "pure inline asm", "r" %x : (i32) -> ()
+  // CHECK: llvm.inline_asm has_side_effects "inline asm with side effects"
+  llvm.inline_asm has_side_effects "inline asm with side effects", "r" %x : (i32) -> ()
+  llvm.return
+}
diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir
new file mode 100644
index 000000000000..01ca4374da04
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir
@@ -0,0 +1,101 @@
+// RUN: mlir-opt %s -linalg-block-pack-matmul="block-factors=32,16,64 \
+// RUN: lhs-transpose-outer-blocks=false lhs-transpose-inner-blocks=false \
+// RUN: rhs-transpose-outer-blocks=true rhs-transpose-inner-blocks=true" \
+// RUN: -canonicalize | FileCheck %s --check-prefix=MMT4D
+
+// RUN: mlir-opt %s -linalg-block-pack-matmul="block-factors=32,16,64 \
+// RUN: lhs-transpose-outer-blocks=false lhs-transpose-inner-blocks=false \
+// RUN: rhs-transpose-outer-blocks=false rhs-transpose-inner-blocks=false" \
+// RUN: -canonicalize | FileCheck %s --check-prefix=MM4D
+
+// RUN: mlir-opt %s -linalg-block-pack-matmul="block-factors=32,16,64 \
+// RUN: lhs-transpose-outer-blocks=true lhs-transpose-inner-blocks=true \
+// RUN: rhs-transpose-outer-blocks=false rhs-transpose-inner-blocks=false" \
+// RUN: -canonicalize | FileCheck %s --check-prefix=MTM4D
+
+func.func @block_matmul(
+    %A: tensor<64x128xf32>, %B: tensor<128x64xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> {
+  %0 = linalg.matmul  ins(%A, %B : tensor<64x128xf32>, tensor<128x64xf32>)
+                      outs(%C : tensor<64x64xf32>) -> tensor<64x64xf32>
+  return %0 : tensor<64x64xf32>
+}
+
+func.func @block_matmul_transpose_a(
+    %A: tensor<128x64xf32>, %B: tensor<128x64xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> {
+  %0 = linalg.matmul_transpose_a ins(%A, %B : tensor<128x64xf32>, tensor<128x64xf32>)
+                                 outs(%C : tensor<64x64xf32>) -> tensor<64x64xf32>
+  return %0 : tensor<64x64xf32>
+}
+
+func.func @block_matmul_transpose_b(
+    %A: tensor<64x128xf32>, %B: tensor<64x128xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> {
+  %0 = linalg.matmul_transpose_b ins(%A, %B : tensor<64x128xf32>, tensor<64x128xf32>)
+                                 outs(%C : tensor<64x64xf32>) -> tensor<64x64xf32>
+  return %0 : tensor<64x64xf32>
+}
+
+// MMT4D-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
+// MMT4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
+// MMT4D-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
+// MMT4D-LABEL: func @block_matmul
+// MMT4D-COUNT-3: tensor.pack
+// MMT4D: linalg.generic
+// MMT4D-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// MMT4D-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// MMT4D-COUNT-1: tensor.unpack
+// MMT4D-LABEL: func @block_matmul_transpose_a
+// MMT4D-COUNT-3: tensor.pack
+// MMT4D: linalg.generic
+// MMT4D-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// MMT4D-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// MMT4D-COUNT-1: tensor.unpack
+// MMT4D-LABEL: func @block_matmul_transpose_b
+// MMT4D-COUNT-3: tensor.pack
+// MMT4D: linalg.generic
+// MMT4D-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// MMT4D-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// MMT4D-COUNT-1: tensor.unpack
+
+// MM4D-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
+// MM4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>
+// MM4D-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
+// MM4D-LABEL: func @block_matmul
+// MM4D-COUNT-3: tensor.pack
+// MM4D: linalg.generic
+// MM4D-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// MM4D-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// MM4D-COUNT-1: tensor.unpack
+// MM4D-LABEL: func @block_matmul_transpose_a
+// MM4D-COUNT-3: tensor.pack
+// MM4D: linalg.generic
+// MM4D-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// MM4D-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// MM4D-COUNT-1: tensor.unpack
+// MM4D-LABEL: func @block_matmul_transpose_b
+// MM4D-COUNT-3: tensor.pack
+// MM4D: linalg.generic
+// MM4D-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// MM4D-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// MM4D-COUNT-1: tensor.unpack
+
+// MTM4D-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d5, d3)>
+// MTM4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>
+// MTM4D-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
+// MTM4D-LABEL: func @block_matmul
+// MTM4D-COUNT-3: tensor.pack
+// MTM4D: linalg.generic
+// MTM4D-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// MTM4D-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// MTM4D-COUNT-1: tensor.unpack
+// MTM4D-LABEL: func @block_matmul_transpose_a
+// MTM4D-COUNT-3: tensor.pack
+// MTM4D: linalg.generic
+// MTM4D-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// MTM4D-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// MTM4D-COUNT-1: tensor.unpack
+// MTM4D-LABEL: func @block_matmul_transpose_b
+// MTM4D-COUNT-3: tensor.pack
+// MTM4D: linalg.generic
+// MTM4D-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// MTM4D-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// MTM4D-COUNT-1: tensor.unpack
diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul-padding.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul-padding.mlir
new file mode 100644
index 000000000000..9e396ba08d24
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/block-pack-matmul-padding.mlir
@@ -0,0 +1,82 @@
+// RUN: mlir-opt %s -linalg-block-pack-matmul="block-factors=32,16,64 allow-padding=1" \
+// RUN: -canonicalize | FileCheck %s
+
+// RUN: mlir-opt %s -linalg-block-pack-matmul="block-factors=32,16,64 allow-padding=0" \
+// RUN: -canonicalize | FileCheck %s --check-prefix=NOPAD
+
+// RUN: mlir-opt %s -linalg-block-pack-matmul="block-factors=32,16,64 allow-padding=1 mnk-padded-multiples=256,512,384" \
+// RUN: -canonicalize | FileCheck %s --check-prefix=PAD-MULT
+
+func.func @block_matmul_padding(
+    %A: tensor<123x125xf32>, %B: tensor<125x124xf32>, %C: tensor<123x124xf32>) -> tensor<123x124xf32> {
+  %0 = linalg.matmul  ins(%A, %B : tensor<123x125xf32>, tensor<125x124xf32>)
+                      outs(%C : tensor<123x124xf32>) -> tensor<123x124xf32>
+  return %0 : tensor<123x124xf32>
+}
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
+// CHECK-LABEL: func @block_matmul_padding(
+// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<123x125xf32>, %[[B:[0-9a-z]+]]: tensor<125x124xf32>, %[[C:[0-9a-z]+]]: tensor<123x124xf32>
+// CHECK-DAG: %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<4x2x32x64xf32>
+// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]]
+// CHECK-SAME:  padding_value(%[[ZERO]] : f32)
+// CHECK-SAME:  outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64]
+// CHECK-SAME:  into %[[PACK_DST_0]] : tensor<123x125xf32> -> tensor<4x2x32x64xf32>
+// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<8x2x16x64xf32>
+// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]]
+// CHECK-SAME:  padding_value(%[[ZERO]] : f32)
+// CHECK-SAME:  outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64]
+// CHECK-SAME:  into %[[PACK_DST_1]] : tensor<125x124xf32> -> tensor<8x2x16x64xf32>
+// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<4x8x32x16xf32>
+// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]]
+// CHECK-SAME:  padding_value(%[[ZERO]] : f32)
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[PACK_DST_2]] : tensor<123x124xf32> -> tensor<4x8x32x16xf32>
+// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// CHECK-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// CHECK-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// CHECK-SAME:  ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<4x8x32x16xf32>)
+// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[C]] : tensor<4x8x32x16xf32> -> tensor<123x124xf32>
+// CHECK: return %[[RES_UNPACKED]] : tensor<123x124xf32>
+
+// NOPAD-LABEL: func @block_matmul_padding(
+// NOPAD-SAME:    %[[A:[0-9a-z]+]]: tensor<123x125xf32>, %[[B:[0-9a-z]+]]: tensor<125x124xf32>, %[[C:[0-9a-z]+]]: tensor<123x124xf32>
+// NOPAD-NOT: tensor.pack
+// NOPAD: linalg.matmul ins(%[[A]], %[[B]] : tensor<123x125xf32>, tensor<125x124xf32>)
+// NOPAD-SAME: outs(%[[C]] : tensor<123x124xf32>) -> tensor<123x124xf32>
+// NOPAD-NOT: tensor.unpack
+
+// PAD-MULT-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
+// PAD-MULT-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
+// PAD-MULT-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
+// PAD-MULT-LABEL: func @block_matmul_padding(
+// PAD-MULT-SAME:    %[[A:[0-9a-z]+]]: tensor<123x125xf32>, %[[B:[0-9a-z]+]]: tensor<125x124xf32>, %[[C:[0-9a-z]+]]: tensor<123x124xf32>
+// PAD-MULT-DAG: %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32
+// PAD-MULT: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<1x1x256x384xf32>
+// PAD-MULT: %[[A_PACKED:.+]] = tensor.pack %[[A]]
+// PAD-MULT-SAME:  padding_value(%[[ZERO]] : f32)
+// PAD-MULT-SAME:  outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [256, 384]
+// PAD-MULT-SAME:  into %[[PACK_DST_0]] : tensor<123x125xf32> -> tensor<1x1x256x384xf32>
+// PAD-MULT: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<1x1x512x384xf32>
+// PAD-MULT: %[[B_PACKED:.+]] = tensor.pack %[[B]]
+// PAD-MULT-SAME:  padding_value(%[[ZERO]] : f32)
+// PAD-MULT-SAME:  outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [512, 384]
+// PAD-MULT-SAME:  into %[[PACK_DST_1]] : tensor<125x124xf32> -> tensor<1x1x512x384xf32>
+// PAD-MULT: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<1x1x256x512xf32>
+// PAD-MULT: %[[C_PACKED:.+]] = tensor.pack %[[C]]
+// PAD-MULT-SAME:  padding_value(%[[ZERO]] : f32)
+// PAD-MULT-SAME:  inner_dims_pos = [0, 1] inner_tiles = [256, 512]
+// PAD-MULT-SAME:  into %[[PACK_DST_2]] : tensor<123x124xf32> -> tensor<1x1x256x512xf32>
+// PAD-MULT: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// PAD-MULT-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// PAD-MULT-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// PAD-MULT-SAME:  ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<1x1x256x384xf32>, tensor<1x1x512x384xf32>) outs(%[[C_PACKED]] : tensor<1x1x256x512xf32>)
+// PAD-MULT: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// PAD-MULT-SAME:  inner_dims_pos = [0, 1] inner_tiles = [256, 512]
+// PAD-MULT-SAME:  into %[[C]] : tensor<1x1x256x512xf32> -> tensor<123x124xf32>
+// PAD-MULT: return %[[RES_UNPACKED]] : tensor<123x124xf32>
diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul.mlir
new file mode 100644
index 000000000000..cc9af913ca15
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/block-pack-matmul.mlir
@@ -0,0 +1,478 @@
+// RUN: mlir-opt %s -linalg-block-pack-matmul=block-factors=32,16,64 -canonicalize -split-input-file | FileCheck %s
+
+func.func @block_matmul(
+    %A: tensor<128x128xf32>, %B: tensor<128x128xf32>, %C: tensor<128x128xf32>) -> tensor<128x128xf32> {
+  %0 = linalg.matmul  ins(%A, %B : tensor<128x128xf32>, tensor<128x128xf32>)
+                      outs(%C : tensor<128x128xf32>) -> tensor<128x128xf32>
+  return %0 : tensor<128x128xf32>
+}
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
+
+// CHECK-LABEL: func @block_matmul(
+// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<128x128xf32>, %[[B:[0-9a-z]+]]: tensor<128x128xf32>, %[[C:[0-9a-z]+]]: tensor<128x128xf32>
+// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<4x2x32x64xf32>
+// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]]
+// CHECK-SAME:  outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64]
+// CHECK-SAME:  into %[[PACK_DST_0]] : tensor<128x128xf32> -> tensor<4x2x32x64xf32>
+// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<8x2x16x64xf32>
+// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]]
+// CHECK-SAME:  outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64]
+// CHECK-SAME:  into %[[PACK_DST_1]] : tensor<128x128xf32> -> tensor<8x2x16x64xf32>
+// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<4x8x32x16xf32>
+// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[PACK_DST_2]] : tensor<128x128xf32> -> tensor<4x8x32x16xf32>
+// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// CHECK-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// CHECK-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// CHECK-SAME:  ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<4x8x32x16xf32>)
+// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[C]] : tensor<4x8x32x16xf32> -> tensor<128x128xf32>
+// CHECK: return %[[RES_UNPACKED]] : tensor<128x128xf32>
+
+// -----
+
+func.func @block_matmul_dynamic(
+    %A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = linalg.matmul  ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+                      outs(%C : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// CHECK-DAG: #[[$MAP_M:.+]] = affine_map<()[s0] -> (s0 ceildiv 32)>
+// CHECK-DAG: #[[$MAP_K:.+]] = affine_map<()[s0] -> (s0 ceildiv 64)>
+// CHECK-DAG: #[[$MAP_N:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)>
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
+
+// CHECK-LABEL: func @block_matmul_dynamic(
+// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<?x?xf32>, %[[B:[0-9a-z]+]]: tensor<?x?xf32>, %[[C:[0-9a-z]+]]: tensor<?x?xf32>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG: %[[A_M:.+]] = tensor.dim %[[A]], %[[C0]] : tensor<?x?xf32>
+// CHECK-DAG: %[[A_K:.+]] = tensor.dim %[[A]], %[[C1]] : tensor<?x?xf32>
+// CHECK-DAG: %[[A_OUTER_TILE_M:.+]] = affine.apply #[[$MAP_M]]()[%[[A_M]]]
+// CHECK-DAG: %[[A_OUTER_TILE_K:.+]] = affine.apply #[[$MAP_K]]()[%[[A_K]]]
+// CHECK: %[[PACK_DST_0:.+]] = tensor.empty(%[[A_OUTER_TILE_M]], %[[A_OUTER_TILE_K]]) : tensor<?x?x32x64xf32>
+// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]]
+// CHECK-SAME:  padding_value(%[[ZERO]] : f32)
+// CHECK-SAME:  outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64]
+// CHECK-SAME:  into %[[PACK_DST_0]] : tensor<?x?xf32> -> tensor<?x?x32x64xf32>
+// CHECK-DAG: %[[B_K:.+]] = tensor.dim %[[B]], %[[C0]] : tensor<?x?xf32>
+// CHECK-DAG: %[[B_N:.+]] = tensor.dim %[[B]], %[[C1]] : tensor<?x?xf32>
+// CHECK-DAG: %[[B_OUTER_TILE_K:.+]] = affine.apply #[[$MAP_K]]()[%[[B_K]]]
+// CHECK-DAG: %[[B_OUTER_TILE_N:.+]] = affine.apply #[[$MAP_N]]()[%[[B_N]]]
+// CHECK: %[[PACK_DST_1:.+]] = tensor.empty(%[[B_OUTER_TILE_N]], %[[B_OUTER_TILE_K]]) : tensor<?x?x16x64xf32>
+// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]]
+// CHECK-SAME:  padding_value(%[[ZERO]] : f32)
+// CHECK-SAME:  outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64]
+// CHECK-SAME:  into %[[PACK_DST_1]] : tensor<?x?xf32> -> tensor<?x?x16x64xf32>
+// CHECK-DAG: %[[C_M:.+]] = tensor.dim %[[C]], %[[C0]] : tensor<?x?xf32>
+// CHECK-DAG: %[[C_N:.+]] = tensor.dim %[[C]], %[[C1]] : tensor<?x?xf32>
+// CHECK-DAG: %[[C_OUTER_TILE_M:.+]] = affine.apply #[[$MAP_M]]()[%[[C_M]]]
+// CHECK-DAG: %[[C_OUTER_TILE_N:.+]] = affine.apply #[[$MAP_N]]()[%[[C_N]]]
+// CHECK: %[[PACK_DST_2:.+]] = tensor.empty(%[[C_OUTER_TILE_M]], %[[C_OUTER_TILE_N]]) : tensor<?x?x32x16xf32>
+// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]]
+// CHECK-SAME:  padding_value(%[[ZERO]] : f32)
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[PACK_DST_2]] : tensor<?x?xf32> -> tensor<?x?x32x16xf32>
+// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// CHECK-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// CHECK-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// CHECK-SAME:  ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<?x?x32x64xf32>, tensor<?x?x16x64xf32>) outs(%[[C_PACKED]] : tensor<?x?x32x16xf32>)
+// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[C]] : tensor<?x?x32x16xf32> -> tensor<?x?xf32>
+// CHECK: return %[[RES_UNPACKED]] : tensor<?x?xf32>
+
+// -----
+
+func.func @block_matmul_with_constant(
+    %A: tensor<128x128xf32>, %B: tensor<128x128xf32>) -> tensor<128x128xf32> {
+  %cst_acc = arith.constant dense<0.0> : tensor<128x128xf32>
+  %0 = linalg.matmul ins(%A, %B : tensor<128x128xf32>, tensor<128x128xf32>)
+                      outs(%cst_acc : tensor<128x128xf32>) -> tensor<128x128xf32>
+  return %0 : tensor<128x128xf32>
+}
+
+// CHECK-LABEL: func @block_matmul_with_constant(
+// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<128x128xf32>, %[[B:[0-9a-z]+]]: tensor<128x128xf32>
+// CHECK-DAG: %[[CST_ACC_PACKED:.+]] = arith.constant dense<0.000000e+00> : tensor<4x8x32x16xf32>
+// CHECK-DAG: %[[RES_DST:.+]] = arith.constant dense<0.000000e+00> : tensor<128x128xf32>
+// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// CHECK-SAME:  ins({{.*}} : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[CST_ACC_PACKED]] : tensor<4x8x32x16xf32>)
+// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[RES_DST]] : tensor<4x8x32x16xf32> -> tensor<128x128xf32>
+// CHECK: return %[[RES_UNPACKED]] : tensor<128x128xf32>
+
+// -----
+
+func.func @block_matmul_with_producer(
+    %A: tensor<128x128xf32>, %B: tensor<128x128xf32>, %C: tensor<128x128xf32>) -> tensor<128x128xf32> {
+  %cst = arith.constant 0.0 : f32
+  %acc = linalg.fill ins(%cst : f32) outs(%C : tensor<128x128xf32>) -> tensor<128x128xf32>
+  %1 = linalg.matmul ins(%A, %B : tensor<128x128xf32>, tensor<128x128xf32>)
+                      outs(%acc : tensor<128x128xf32>) -> tensor<128x128xf32>
+  return %1 : tensor<128x128xf32>
+}
+
+// CHECK-LABEL: func @block_matmul_with_producer(
+// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<128x128xf32>, %[[B:[0-9a-z]+]]: tensor<128x128xf32>, %[[C:[0-9a-z]+]]: tensor<128x128xf32>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[FILL_DST_PACKED:.+]] = tensor.empty() : tensor<4x8x32x16xf32>
+// CHECK: %[[ACC_PACKED:.+]] = linalg.fill ins(%[[C0]] : f32) outs(%[[FILL_DST_PACKED]] : tensor<4x8x32x16xf32>) -> tensor<4x8x32x16xf32>
+// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// CHECK-SAME:  ins({{.*}} : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[ACC_PACKED]] : tensor<4x8x32x16xf32>)
+// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[C]] : tensor<4x8x32x16xf32> -> tensor<128x128xf32>
+// CHECK: return %[[RES_UNPACKED]] : tensor<128x128xf32>
+
+// -----
+
+func.func @block_matmul_with_consumer(
+    %A: tensor<128x128xf32>, %B: tensor<128x128xf32>, %C: tensor<128x128xf32>, %D: tensor<128x128xf32>) -> tensor<128x128xf32> {
+  %0 = tensor.empty() : tensor<128x128xf32>
+  %1 = linalg.matmul ins(%A, %B : tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%C : tensor<128x128xf32>) -> tensor<128x128xf32>
+  %2 = linalg.add ins(%1, %D : tensor<128x128xf32>, tensor<128x128xf32>)
+                  outs(%0 : tensor<128x128xf32>) -> tensor<128x128xf32>
+  return %2 : tensor<128x128xf32>
+}
+
+// CHECK-LABEL: func @block_matmul_with_consumer(
+// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<128x128xf32>, %[[B:[0-9a-z]+]]: tensor<128x128xf32>, %[[C:[0-9a-z]+]]: tensor<128x128xf32>, %[[D:[0-9a-z]+]]: tensor<128x128xf32>
+// CHECK-DAG: %[[RES_DST:.+]] = tensor.empty() : tensor<128x128xf32>
+// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// CHECK-SAME:  outs({{.*}} : tensor<4x8x32x16xf32>)
+// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[C]] : tensor<4x8x32x16xf32> -> tensor<128x128xf32>
+// CHECK: %[[ADD_RES:.+]] = linalg.add
+// CHECK-SAME:  ins(%[[RES_UNPACKED]], %[[D]] : tensor<128x128xf32>, tensor<128x128xf32>) outs(%[[RES_DST]] : tensor<128x128xf32>)
+// CHECK: return %[[ADD_RES]] : tensor<128x128xf32>
+
+// -----
+
+func.func @block_batch_matmul(
+    %A: tensor<512x64x128xf32>, %B: tensor<512x128x64xf32>, %C: tensor<512x64x64xf32>) -> tensor<512x64x64xf32> {
+  %0 = linalg.batch_matmul ins(%A, %B : tensor<512x64x128xf32>, tensor<512x128x64xf32>)
+                           outs(%C : tensor<512x64x64xf32>) -> tensor<512x64x64xf32>
+  return %0 : tensor<512x64x64xf32>
+}
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d3, d4, d6)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d5, d6)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5)>
+
+// CHECK-LABEL: func @block_batch_matmul(
+// CHECK-SAME:   %[[A:.+]]: tensor<512x64x128xf32>, %[[B:.+]]: tensor<512x128x64xf32>, %[[C:.+]]: tensor<512x64x64xf32>
+// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<512x2x2x32x64xf32>
+// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]]
+// CHECK-SAME:  outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [32, 64]
+// CHECK-SAME:  into %[[PACK_DST_0]] : tensor<512x64x128xf32> -> tensor<512x2x2x32x64xf32>
+// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<512x4x2x16x64xf32>
+// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]]
+// CHECK-SAME:  outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [16, 64]
+// CHECK-SAME:  into %[[PACK_DST_1]] : tensor<512x128x64xf32> -> tensor<512x4x2x16x64xf32>
+// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<512x2x4x32x16xf32>
+// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]]
+// CHECK-SAME:  inner_dims_pos = [1, 2] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[PACK_DST_2]] : tensor<512x64x64xf32> -> tensor<512x2x4x32x16xf32>
+// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// CHECK-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// CHECK-SAME:  iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// CHECK-SAME:  ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<512x2x2x32x64xf32>, tensor<512x4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<512x2x4x32x16xf32>)
+// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// CHECK-SAME:  inner_dims_pos = [1, 2] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[C]] : tensor<512x2x4x32x16xf32> -> tensor<512x64x64xf32>
+// CHECK: return %[[RES_UNPACKED]] : tensor<512x64x64xf32>
+
+// -----
+
+func.func @block_matmul_transpose_a(
+    %A: tensor<128x64xf32>, %B: tensor<128x64xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> {
+  %0 = linalg.matmul_transpose_a ins(%A, %B : tensor<128x64xf32>, tensor<128x64xf32>)
+                                 outs(%C : tensor<64x64xf32>) -> tensor<64x64xf32>
+  return %0 : tensor<64x64xf32>
+}
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
+
+// CHECK-LABEL: func @block_matmul_transpose_a(
+// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<128x64xf32>, %[[B:[0-9a-z]+]]: tensor<128x64xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32>
+// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32>
+// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]]
+// CHECK-SAME:  outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [32, 64]
+// CHECK-SAME:  into %[[PACK_DST_0]] : tensor<128x64xf32> -> tensor<2x2x32x64xf32>
+// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32>
+// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]]
+// CHECK-SAME:  outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64]
+// CHECK-SAME:  into %[[PACK_DST_1]] : tensor<128x64xf32> -> tensor<4x2x16x64xf32>
+// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32>
+// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32>
+// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// CHECK-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// CHECK-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// CHECK-SAME:  ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>)
+// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32>
+// CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32>
+
+// -----
+
+func.func @block_batch_matmul_transpose_a(
+    %A: tensor<512x128x64xf32>, %B: tensor<512x128x64xf32>, %C: tensor<512x64x64xf32>) -> tensor<512x64x64xf32> {
+  %0 = linalg.batch_matmul_transpose_a ins(%A, %B : tensor<512x128x64xf32>, tensor<512x128x64xf32>)
+                                       outs(%C : tensor<512x64x64xf32>) -> tensor<512x64x64xf32>
+  return %0 : tensor<512x64x64xf32>
+}
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d3, d4, d6)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d5, d6)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5)>
+
+// CHECK-LABEL: func @block_batch_matmul_transpose_a(
+// CHECK-SAME:   %[[A:.+]]: tensor<512x128x64xf32>, %[[B:.+]]: tensor<512x128x64xf32>, %[[C:.+]]: tensor<512x64x64xf32>
+// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<512x2x2x32x64xf32>
+// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]]
+// CHECK-SAME:  outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [32, 64]
+// CHECK-SAME:  into %[[PACK_DST_0]] : tensor<512x128x64xf32> -> tensor<512x2x2x32x64xf32>
+// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<512x4x2x16x64xf32>
+// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]]
+// CHECK-SAME:  outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [16, 64]
+// CHECK-SAME:  into %[[PACK_DST_1]] : tensor<512x128x64xf32> -> tensor<512x4x2x16x64xf32>
+// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<512x2x4x32x16xf32>
+// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]]
+// CHECK-SAME:  inner_dims_pos = [1, 2] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[PACK_DST_2]] : tensor<512x64x64xf32> -> tensor<512x2x4x32x16xf32>
+// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// CHECK-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// CHECK-SAME:  iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// CHECK-SAME:  ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<512x2x2x32x64xf32>, tensor<512x4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<512x2x4x32x16xf32>)
+// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// CHECK-SAME:  inner_dims_pos = [1, 2] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[C]] : tensor<512x2x4x32x16xf32> -> tensor<512x64x64xf32>
+// CHECK: return %[[RES_UNPACKED]] : tensor<512x64x64xf32>
+
+// -----
+
+func.func @block_matmul_transpose_b(
+    %A: tensor<64x128xf32>, %B: tensor<64x128xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> {
+  %0 = linalg.matmul_transpose_b ins(%A, %B : tensor<64x128xf32>, tensor<64x128xf32>)
+                                 outs(%C : tensor<64x64xf32>) -> tensor<64x64xf32>
+  return %0 : tensor<64x64xf32>
+}
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
+
+// CHECK-LABEL: func @block_matmul_transpose_b(
+// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<64x128xf32>, %[[B:[0-9a-z]+]]: tensor<64x128xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32>
+// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32>
+// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]]
+// CHECK-SAME:  outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64]
+// CHECK-SAME:  into %[[PACK_DST_0]] : tensor<64x128xf32> -> tensor<2x2x32x64xf32>
+// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32>
+// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]]
+// CHECK-SAME:  outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 64]
+// CHECK-SAME:  into %[[PACK_DST_1]] : tensor<64x128xf32> -> tensor<4x2x16x64xf32>
+// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32>
+// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32>
+// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// CHECK-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// CHECK-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// CHECK-SAME:  ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>)
+// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32>
+// CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32>
+
+// -----
+
+func.func @block_batch_matmul_transpose_b(
+    %A: tensor<512x64x128xf32>, %B: tensor<512x64x128xf32>, %C: tensor<512x64x64xf32>) -> tensor<512x64x64xf32> {
+  %0 = linalg.batch_matmul_transpose_b ins(%A, %B : tensor<512x64x128xf32>, tensor<512x64x128xf32>)
+                                       outs(%C : tensor<512x64x64xf32>) -> tensor<512x64x64xf32>
+  return %0 : tensor<512x64x64xf32>
+}
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d3, d4, d6)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d5, d6)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5)>
+
+// CHECK-LABEL: func @block_batch_matmul_transpose_b(
+// CHECK-SAME:   %[[A:.+]]: tensor<512x64x128xf32>, %[[B:.+]]: tensor<512x64x128xf32>, %[[C:.+]]: tensor<512x64x64xf32>
+// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<512x2x2x32x64xf32>
+// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]]
+// CHECK-SAME:  outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [32, 64]
+// CHECK-SAME:  into %[[PACK_DST_0]] : tensor<512x64x128xf32> -> tensor<512x2x2x32x64xf32>
+// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<512x4x2x16x64xf32>
+// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]]
+// CHECK-SAME:  outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 64]
+// CHECK-SAME:  into %[[PACK_DST_1]] : tensor<512x64x128xf32> -> tensor<512x4x2x16x64xf32>
+// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<512x2x4x32x16xf32>
+// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]]
+// CHECK-SAME:  inner_dims_pos = [1, 2] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[PACK_DST_2]] : tensor<512x64x64xf32> -> tensor<512x2x4x32x16xf32>
+// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// CHECK-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// CHECK-SAME:  iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// CHECK-SAME:  ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<512x2x2x32x64xf32>, tensor<512x4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<512x2x4x32x16xf32>)
+// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// CHECK-SAME:  inner_dims_pos = [1, 2] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[C]] : tensor<512x2x4x32x16xf32> -> tensor<512x64x64xf32>
+// CHECK: return %[[RES_UNPACKED]] : tensor<512x64x64xf32>
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+func.func @block_generic_matmul(
+    %A: tensor<128x128xf32>, %B: tensor<128x128xf32>, %C: tensor<128x128xf32>) -> tensor<128x128xf32> {
+  %0 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<128x128xf32>, tensor<128x128xf32>)
+    outs(%C : tensor<128x128xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<128x128xf32>
+  return %0 : tensor<128x128xf32>
+}
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
+
+// CHECK-LABEL: func @block_generic_matmul(
+// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<128x128xf32>, %[[B:[0-9a-z]+]]: tensor<128x128xf32>, %[[C:[0-9a-z]+]]: tensor<128x128xf32>
+// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<4x2x32x64xf32>
+// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]]
+// CHECK-SAME:  outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64]
+// CHECK-SAME:  into %[[PACK_DST_0]] : tensor<128x128xf32> -> tensor<4x2x32x64xf32>
+// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<8x2x16x64xf32>
+// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]]
+// CHECK-SAME:  outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64]
+// CHECK-SAME:  into %[[PACK_DST_1]] : tensor<128x128xf32> -> tensor<8x2x16x64xf32>
+// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<4x8x32x16xf32>
+// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[PACK_DST_2]] : tensor<128x128xf32> -> tensor<4x8x32x16xf32>
+// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// CHECK-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// CHECK-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// CHECK-SAME:  ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<4x8x32x16xf32>)
+// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[C]] : tensor<4x8x32x16xf32> -> tensor<128x128xf32>
+// CHECK: return %[[RES_UNPACKED]] : tensor<128x128xf32>
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d2, d0)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+func.func @block_generic_matmul_transpose_a(
+    %A: tensor<128x64xf32>, %B: tensor<128x64xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> {
+  %0 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<128x64xf32>, tensor<128x64xf32>)
+    outs(%C : tensor<64x64xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<64x64xf32>
+  return %0 : tensor<64x64xf32>
+}
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
+
+// CHECK-LABEL: func @block_generic_matmul_transpose_a(
+// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<128x64xf32>, %[[B:[0-9a-z]+]]: tensor<128x64xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32>
+// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32>
+// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]]
+// CHECK-SAME:  outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [32, 64]
+// CHECK-SAME:  into %[[PACK_DST_0]] : tensor<128x64xf32> -> tensor<2x2x32x64xf32>
+// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32>
+// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]]
+// CHECK-SAME:  outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64]
+// CHECK-SAME:  into %[[PACK_DST_1]] : tensor<128x64xf32> -> tensor<4x2x16x64xf32>
+// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32>
+// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32>
+// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// CHECK-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// CHECK-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// CHECK-SAME:  ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>)
+// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32>
+// CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32>
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+func.func @block_generic_matmul_transpose_b(
+    %A: tensor<64x128xf32>, %B: tensor<64x128xf32>, %C: tensor<64x64xf32>) -> tensor<64x64xf32> {
+  %0 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<64x128xf32>, tensor<64x128xf32>)
+    outs(%C : tensor<64x64xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<64x64xf32>
+  return %0 : tensor<64x64xf32>
+}
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
+
+// CHECK-LABEL: func @block_generic_matmul_transpose_b(
+// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<64x128xf32>, %[[B:[0-9a-z]+]]: tensor<64x128xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32>
+// CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32>
+// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]]
+// CHECK-SAME:  outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64]
+// CHECK-SAME:  into %[[PACK_DST_0]] : tensor<64x128xf32> -> tensor<2x2x32x64xf32>
+// CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32>
+// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]]
+// CHECK-SAME:  outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 64]
+// CHECK-SAME:  into %[[PACK_DST_1]] : tensor<64x128xf32> -> tensor<4x2x16x64xf32>
+// CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32>
+// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32>
+// CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic
+// CHECK-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
+// CHECK-SAME:  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// CHECK-SAME:  ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>)
+// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]]
+// CHECK-SAME:  inner_dims_pos = [0, 1] inner_tiles = [32, 16]
+// CHECK-SAME:  into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32>
+// CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32>
diff --git a/mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir b/mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir
index 0a197a0ee9fa..d0c53ae46800 100644
--- a/mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir
@@ -27,7 +27,7 @@ func.func @KCRS_to_KCRSsr(%arg0: tensor<1x1x128x64xf32>, %arg1: tensor<1x1x4x8x8
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:4 = transform.structured.tile_using_for %0 [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -54,7 +54,7 @@ func.func @pad_and_pack(%arg0: tensor<13x15xf32>, %arg1: tensor<2x8x8x2xf32>, %a
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -85,7 +85,7 @@ func.func @KC_to_CKkc(%arg0: tensor<128x256xf32>, %arg1: tensor<32x4x32x8xf32>)
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir b/mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir
index 7d64331c9878..c15859d898ec 100644
--- a/mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir
@@ -8,7 +8,7 @@ func.func @KCRSsr_to_KCRS(%arg0: tensor<1x1x4x8x8x32xf32>, %arg1: tensor<1x1x128
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:4 = transform.structured.tile_using_for %0 [1, 1, 32, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 32, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
@@ -68,7 +68,7 @@ func.func @unpack_and_extract_slice(%arg0: tensor<2x8x8x2xf32>, %arg1: tensor<13
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:2 = transform.structured.tile_using_for %0 [8, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [8, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
@@ -100,7 +100,7 @@ func.func @CKkc_to_KC(%arg0: tensor<32x4x32x8xf32>, %arg1: tensor<128x256xf32>)
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:2 = transform.structured.tile_using_for %0 [32, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [32, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
index c3ac69f65b7c..3f8d2ea06641 100644
--- a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
+++ b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
@@ -52,7 +52,7 @@ module attributes {transform.with_named_sequence} {
         : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     // Tile linalg.matmul a second time.
-    %tiled_linalg_op, %loops = transform.structured.tile_using_for %tiled_matmul_op[0, 0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %tiled_linalg_op, %loops = transform.structured.tile_using_for %tiled_matmul_op tile_sizes [0, 0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     // Pad linalg.matmul.
     %padded, %pad, %copy_back = transform.structured.pad %tiled_linalg_op
@@ -171,7 +171,7 @@ module attributes {transform.with_named_sequence} {
         : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     // Tile linalg.matmul a second time.
-    %tiled_linalg_op, %loops = transform.structured.tile_using_for %tiled_matmul_op[0, 0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %tiled_linalg_op, %loops = transform.structured.tile_using_for %tiled_matmul_op tile_sizes [0, 0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     // Pad linalg.matmul.
     %padded, %pad, %copy_back = transform.structured.pad %tiled_linalg_op
diff --git a/mlir/test/Dialect/Linalg/multisize-tiling-full.mlir b/mlir/test/Dialect/Linalg/multisize-tiling-full.mlir
index 592eb781cd4f..15b24b56608e 100644
--- a/mlir/test/Dialect/Linalg/multisize-tiling-full.mlir
+++ b/mlir/test/Dialect/Linalg/multisize-tiling-full.mlir
@@ -8,13 +8,13 @@ module attributes {transform.with_named_sequence} {
     %1:3 = transform.structured.multitile_sizes %0 { dimension = 0, target_size = 3} : (!transform.any_op) -> !transform.any_op
     %t:3 = transform.structured.multitile_sizes %0 { dimension = 1, target_size = 10} : (!transform.any_op) -> !transform.any_op
     %2:2 = transform.structured.split %0 after %1#2 { dimension = 0 } : !transform.any_op, !transform.any_op
-    %3:2 = transform.structured.tile_using_for %2#0 [%1#0] : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    %4:2 = transform.structured.tile_using_for %2#1 [%1#1] : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %3:2 = transform.structured.tile_using_for %2#0 tile_sizes [%1#0] : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %4:2 = transform.structured.tile_using_for %2#1 tile_sizes [%1#1] : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
     %5 = transform.merge_handles %3#0, %4#0 : !transform.any_op
     %tt:3 = transform.replicate num(%5) %t#0, %t#1, %t#2 : !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op
     %6:2 = transform.structured.split %5 after %tt#2 { dimension = 1 } : !transform.any_op, !transform.any_op
-    transform.structured.tile_using_for %6#0 [0, %tt#0] : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.tile_using_for %6#1 [0, %tt#1] : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.tile_using_for %6#0 tile_sizes [0, %tt#0] : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.tile_using_for %6#1 tile_sizes [0, %tt#1] : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
 }
@@ -110,13 +110,13 @@ module attributes {transform.with_named_sequence} {
     %1:3 = transform.structured.multitile_sizes %0 { dimension = 0, target_size = 3} : (!transform.any_op) -> !transform.param<i64>
     %t:3 = transform.structured.multitile_sizes %0 { dimension = 1, target_size = 10} : (!transform.any_op) -> !transform.param<i64>
     %2:2 = transform.structured.split %0 after %1#2 { dimension = 0 } : !transform.any_op, !transform.param<i64>
-    %3:2 = transform.structured.tile_using_for %2#0 [%1#0] : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
-    %4:2 = transform.structured.tile_using_for %2#1 [%1#1] : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
+    %3:2 = transform.structured.tile_using_for %2#0 tile_sizes [%1#0] : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
+    %4:2 = transform.structured.tile_using_for %2#1 tile_sizes [%1#1] : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
     %5 = transform.merge_handles %3#0, %4#0 : !transform.any_op
     %tt:3 = transform.replicate num(%5) %t#0, %t#1, %t#2 : !transform.any_op, !transform.param<i64>, !transform.param<i64>, !transform.param<i64>
     %6:2 = transform.structured.split %5 after %tt#2 { dimension = 1 } : !transform.any_op, !transform.param<i64>
-    transform.structured.tile_using_for %6#0 [0, %tt#0] : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
-    transform.structured.tile_using_for %6#1 [0, %tt#1] : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
+    transform.structured.tile_using_for %6#0 tile_sizes [0, %tt#0] : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
+    transform.structured.tile_using_for %6#1 tile_sizes [0, %tt#1] : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Linalg/promote.mlir b/mlir/test/Dialect/Linalg/promote.mlir
index fb5f357f3faa..2d640057df34 100644
--- a/mlir/test/Dialect/Linalg/promote.mlir
+++ b/mlir/test/Dialect/Linalg/promote.mlir
@@ -183,7 +183,7 @@ func.func @gemm_shared(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:3 = transform.structured.tile_using_for %0 [16, 16, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [16, 16, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     %2 = transform.structured.promote %1 { operands_to_promote = [0, 1], mapping = [#gpu.memory_space<workgroup>] } : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
@@ -227,7 +227,7 @@ func.func @gemm_private(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:3 = transform.structured.tile_using_for %0 [16, 16, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [16, 16, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     %2 = transform.structured.promote %1 { operands_to_promote = [0, 1], mapping = [#gpu.memory_space<private>] } : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
diff --git a/mlir/test/Dialect/Linalg/promotion_options.mlir b/mlir/test/Dialect/Linalg/promotion_options.mlir
index 3bf74b708cb8..caa72ba24316 100644
--- a/mlir/test/Dialect/Linalg/promotion_options.mlir
+++ b/mlir/test/Dialect/Linalg/promotion_options.mlir
@@ -37,7 +37,7 @@ func.func @gemm(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:3 = transform.structured.tile_using_for %0 [16, 16, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [16, 16, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     %2 = transform.structured.promote %1 { operands_to_promote = [0, 2], force_full_tiles = [false, false], use_full_tiles_by_default } : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
diff --git a/mlir/test/Dialect/Linalg/tile-conv.mlir b/mlir/test/Dialect/Linalg/tile-conv.mlir
index c42bdbe982c4..f674996e42f3 100644
--- a/mlir/test/Dialect/Linalg/tile-conv.mlir
+++ b/mlir/test/Dialect/Linalg/tile-conv.mlir
@@ -12,7 +12,7 @@ func.func @conv(%arg0 : memref<?x?xf32>, %arg1 : memref<?x?xf32>, %arg2 : memref
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.conv_2d"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loop:2 = transform.structured.tile_using_for %0 [2, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loop:2 = transform.structured.tile_using_for %0 tile_sizes [2, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Linalg/tile-indexed.mlir b/mlir/test/Dialect/Linalg/tile-indexed.mlir
index c176dc19c7e9..b4aa0a33bc59 100644
--- a/mlir/test/Dialect/Linalg/tile-indexed.mlir
+++ b/mlir/test/Dialect/Linalg/tile-indexed.mlir
@@ -14,7 +14,7 @@ func.func @indexed_vector(%arg0: memref<50xindex>) {
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loop = transform.structured.tile_using_for %0 [10] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+      %1, %loop = transform.structured.tile_using_for %0 tile_sizes [10] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -46,7 +46,7 @@ func.func @indexed_matrix(%arg0: memref<50x50xindex>) {
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loop:2 = transform.structured.tile_using_for %0 [10, 25] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loop:2 = transform.structured.tile_using_for %0 tile_sizes [10, 25] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Linalg/tile-softmax.mlir b/mlir/test/Dialect/Linalg/tile-softmax.mlir
index ec848e2deb74..7d201b58a8c3 100644
--- a/mlir/test/Dialect/Linalg/tile-softmax.mlir
+++ b/mlir/test/Dialect/Linalg/tile-softmax.mlir
@@ -39,7 +39,7 @@ func.func @softmax(%arg0: tensor<16x64x256xf32>) -> tensor<16x64x256xf32> {
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.softmax"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loop:2 = transform.structured.tile_using_for %0 [2, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loop:2 = transform.structured.tile_using_for %0 tile_sizes [2, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
@@ -149,7 +149,7 @@ func.func @softmax_memref(%arg0: memref<16x64x256xf32>, %arg1: memref<16x64x256x
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.softmax"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loop:2 = transform.structured.tile_using_for %0 [2, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loop:2 = transform.structured.tile_using_for %0 tile_sizes [2, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Linalg/tile-tensors.mlir b/mlir/test/Dialect/Linalg/tile-tensors.mlir
index cdef71ded8b2..89183813c080 100644
--- a/mlir/test/Dialect/Linalg/tile-tensors.mlir
+++ b/mlir/test/Dialect/Linalg/tile-tensors.mlir
@@ -30,7 +30,7 @@ func.func @matmul_tensors(
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:3 = transform.structured.tile_using_for %0 [2, 3, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [2, 3, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
@@ -57,7 +57,7 @@ func.func @matmul_tensors_with_size_zeros(
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.structured.tile_using_for %0 [0, 0, 0] : (!transform.any_op) -> (!transform.any_op)
+    %1 = transform.structured.tile_using_for %0 tile_sizes [0, 0, 0] : (!transform.any_op) -> (!transform.any_op)
     transform.yield
   }
 }
@@ -90,7 +90,7 @@ func.func @generic_op_tensors(
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:3 = transform.structured.tile_using_for %0 [2, 3, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [2, 3, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
@@ -163,7 +163,7 @@ func.func @fold_extract_slice(
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:3 = transform.structured.tile_using_for %0 [2, 3, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [2, 3, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Linalg/tile-to-forall.mlir b/mlir/test/Dialect/Linalg/tile-to-forall.mlir
index 12e2dea5530b..8545dfd25ecc 100644
--- a/mlir/test/Dialect/Linalg/tile-to-forall.mlir
+++ b/mlir/test/Dialect/Linalg/tile-to-forall.mlir
@@ -130,8 +130,8 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     %sz = transform.structured.match ops{["test.dummy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1:2 = transform.structured.tile_using_forall %0 tile_sizes *(%sz : !transform.any_op)
-           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %1:2 = transform.structured.tile_using_forall %0 tile_sizes *(%sz)
+           : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
 }
@@ -333,8 +333,8 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     %sz = transform.structured.match ops{["test.dummy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [%sz : !transform.any_op, 20]
-           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [%sz, 20]
+           : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
 }
@@ -492,8 +492,8 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     %sz = transform.param.constant 10 : i64 -> !transform.param<i64>
-    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [%sz : !transform.param<i64>, 20]
-           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [%sz, 20]
+           : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
 }
@@ -513,8 +513,8 @@ module attributes {transform.with_named_sequence} {
     %c20 = transform.param.constant 20 : i64 -> !transform.param<i64>
     %sz = transform.merge_handles %c10, %c20 : !transform.param<i64>
     // expected-error @below {{requires exactly one parameter associated}}
-    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [%sz : !transform.param<i64>, 20]
-           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [%sz, 20]
+           : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
 }
@@ -562,8 +562,8 @@ module attributes {transform.with_named_sequence} {
     %c10 = transform.param.constant 10 : i64 -> !transform.any_param
     %c20 = transform.param.constant 20 : i64 -> !transform.any_param
     %sz = transform.merge_handles %c10, %c20 : !transform.any_param
-    %1:2 = transform.structured.tile_using_forall %0 tile_sizes *(%sz : !transform.any_param)
-           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %1:2 = transform.structured.tile_using_forall %0 tile_sizes *(%sz)
+           : (!transform.any_op, !transform.any_param) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
 }
@@ -581,8 +581,8 @@ module attributes {transform.with_named_sequence} {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     %sz = transform.param.constant "[10 : i64, 20 : i64]" -> !transform.any_param
     // expected-error @below {{expected the parameter to be associated with an integer attribute}}
-    %1:2 = transform.structured.tile_using_forall %0 tile_sizes *(%sz : !transform.any_param)
-           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %1:2 = transform.structured.tile_using_forall %0 tile_sizes *(%sz)
+           : (!transform.any_op, !transform.any_param) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Linalg/transform-op-compose-masked-vectorize-and-cleanups.mlir b/mlir/test/Dialect/Linalg/transform-op-compose-masked-vectorize-and-cleanups.mlir
index 477261882421..61fe3da34e1d 100644
--- a/mlir/test/Dialect/Linalg/transform-op-compose-masked-vectorize-and-cleanups.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-compose-masked-vectorize-and-cleanups.mlir
@@ -22,9 +22,9 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %module
       : (!transform.any_op) -> !transform.any_op
-    %tiled_linalg_op, %loops:3 = transform.structured.tile_using_for %0[64, 128, 256]
+    %tiled_linalg_op, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [64, 128, 256]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-    %tiled_linalg_op_0, %loops_1:3 = transform.structured.tile_using_for %tiled_linalg_op[8, 8, 8]
+    %tiled_linalg_op_0, %loops_1:3 = transform.structured.tile_using_for %tiled_linalg_op tile_sizes [8, 8, 8]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.structured.vectorize %tiled_linalg_op_0 vector_sizes [8, 8, 8]
       : !transform.any_op
diff --git a/mlir/test/Dialect/Linalg/transform-op-fuse.mlir b/mlir/test/Dialect/Linalg/transform-op-fuse.mlir
index 69daf8c80a16..3a023deb1132 100644
--- a/mlir/test/Dialect/Linalg/transform-op-fuse.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-fuse.mlir
@@ -95,7 +95,7 @@ module attributes {transform.with_named_sequence} {
     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     %1, %loops:2 = transform.structured.fuse %0 {tile_sizes = [5, 0, 7], tile_interchange = [0, 2, 1]}
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-    %2, %loops_2 = transform.structured.tile_using_for %1 [0, 4]
+    %2, %loops_2 = transform.structured.tile_using_for %1 tile_sizes [0, 4]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
       transform.yield
   }
diff --git a/mlir/test/Dialect/Linalg/transform-op-hoist-pad-build-packing-loop-nest.mlir b/mlir/test/Dialect/Linalg/transform-op-hoist-pad-build-packing-loop-nest.mlir
index 1be5bf098c33..ae63ed5f1a41 100644
--- a/mlir/test/Dialect/Linalg/transform-op-hoist-pad-build-packing-loop-nest.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-hoist-pad-build-packing-loop-nest.mlir
@@ -15,7 +15,7 @@ module attributes {transform.with_named_sequence} {
     %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
 
-    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul tile_sizes [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     %matmul_padded, %0, %copy_back = transform.structured.pad %matmul_l1 {
       padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
@@ -49,7 +49,7 @@ module attributes {transform.with_named_sequence} {
     %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
 
-    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul tile_sizes [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     %matmul_padded, %0, %copy_back = transform.structured.pad %matmul_l1 {
       padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
@@ -89,7 +89,7 @@ module attributes {transform.with_named_sequence} {
     %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
 
-    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul tile_sizes [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     %matmul_padded, %0, %copy_back = transform.structured.pad %matmul_l1 {
       padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
@@ -129,7 +129,7 @@ module attributes {transform.with_named_sequence} {
     %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
 
-    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul tile_sizes [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     %matmul_padded, %0, %copy_back = transform.structured.pad %matmul_l1 {
       padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
@@ -167,7 +167,7 @@ module attributes {transform.with_named_sequence} {
     %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
 
-    %matmul_l1, %loops_l1:2 = transform.structured.tile_using_for %matmul [5, 0, 7] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %matmul_l1, %loops_l1:2 = transform.structured.tile_using_for %matmul tile_sizes [5, 0, 7] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
     %matmul_padded, %0, %copy_back = transform.structured.pad %matmul_l1 {
       padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
diff --git a/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
index 37cb9b2376fb..499d9904c06b 100644
--- a/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-hoist-pad.mlir
@@ -15,7 +15,7 @@ module attributes {transform.with_named_sequence} {
       : (!transform.any_op) -> !transform.any_op
 
 
-    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul tile_sizes [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     %matmul_padded, %0, %copy_back = transform.structured.pad %matmul_l1 {
       padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
@@ -53,7 +53,7 @@ module attributes {transform.with_named_sequence} {
       : (!transform.any_op) -> !transform.any_op
 
 
-    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul tile_sizes [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     %matmul_padded, %0, %copy_back = transform.structured.pad %matmul_l1 {
       padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
@@ -98,7 +98,7 @@ module attributes {transform.with_named_sequence} {
       : (!transform.any_op) -> !transform.any_op
 
 
-    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul tile_sizes [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     %matmul_padded, %0, %copy_back = transform.structured.pad %matmul_l1 {
       padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
@@ -145,7 +145,7 @@ module attributes {transform.with_named_sequence} {
       : (!transform.any_op) -> !transform.any_op
 
 
-    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul tile_sizes [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     %matmul_padded, %0, %copy_back = transform.structured.pad %matmul_l1 {
       padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
@@ -191,7 +191,7 @@ module attributes {transform.with_named_sequence} {
       : (!transform.any_op) -> !transform.any_op
 
 
-    %matmul_l1, %loops_l1:2 = transform.structured.tile_using_for %matmul [5, 0, 7] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %matmul_l1, %loops_l1:2 = transform.structured.tile_using_for %matmul tile_sizes [5, 0, 7] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
     %matmul_padded, %0, %copy_back = transform.structured.pad %matmul_l1 {
       padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
diff --git a/mlir/test/Dialect/Linalg/transform-op-mmt4d-to-fma.mlir b/mlir/test/Dialect/Linalg/transform-op-mmt4d-to-fma.mlir
index 6aba2b3bb368..b5c6e610f58f 100644
--- a/mlir/test/Dialect/Linalg/transform-op-mmt4d-to-fma.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-mmt4d-to-fma.mlir
@@ -20,10 +20,10 @@ module attributes {transform.with_named_sequence} {
 
     // Step 1: Tile
     // Tile parallel dims
-    %tiled_linalg_op_p, %loops:4 = transform.structured.tile_using_for %mmt4d[1, 1, 0, 8, 8, 0]
+    %tiled_linalg_op_p, %loops:4 = transform.structured.tile_using_for %mmt4d tile_sizes [1, 1, 0, 8, 8, 0]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     // Tile reduction dims
-    %tiled_linalg_op_r, %loops2:2 = transform.structured.tile_using_for %tiled_linalg_op_p[0, 0, 1, 0, 0, 1]
+    %tiled_linalg_op_r, %loops2:2 = transform.structured.tile_using_for %tiled_linalg_op_p tile_sizes [0, 0, 1, 0, 0, 1]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
     // Step 2: Vectorize
diff --git a/mlir/test/Dialect/Linalg/transform-op-pack.mlir b/mlir/test/Dialect/Linalg/transform-op-pack.mlir
index cf6339ce3de8..6c26ebd0a5b8 100644
--- a/mlir/test/Dialect/Linalg/transform-op-pack.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-pack.mlir
@@ -372,8 +372,8 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
       %sz = transform.structured.match ops{["some_tile_size"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1 = transform.structured.pack %0 packed_sizes = [0, %sz : !transform.any_op, %sz : !transform.any_op]
-        : (!transform.any_op) -> (!transform.op<"linalg.generic">)
+      %1 = transform.structured.pack %0 packed_sizes = [0, %sz, %sz]
+        : (!transform.any_op, !transform.any_op, !transform.any_op) -> (!transform.op<"linalg.generic">)
         transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Linalg/transform-op-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-pad.mlir
index d27276cda49d..47bb5ddf4afc 100644
--- a/mlir/test/Dialect/Linalg/transform-op-pad.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-pad.mlir
@@ -73,10 +73,9 @@ func.func @pad_to_multiple(%arg0: tensor<24x12xf32>,
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %padded, %pad, %copy_back = transform.structured.pad %0 {
+    %padded, %pad, %copy_back = transform.structured.pad %0 pad_to_multiple_of [2, 2, 1] {
       padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32],
       padding_dimensions=[0, 1, 2],
-      pad_to_multiple_of=[2, 2, 1],
       pack_paddings=[1, 1, 0]
     } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
@@ -87,6 +86,42 @@ module attributes {transform.with_named_sequence} {
 
 #map = affine_map<()[s0] -> (-s0 + 12, 7)>
 
+// CHECK-LABEL: @parametrized_pad_to_multiple
+func.func @parametrized_pad_to_multiple(%arg0: tensor<24x12xf32>,
+                                        %arg1: tensor<12x25xf32>,
+                                        %arg2: tensor<24x25xf32>,
+                                        %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
+  %0 = affine.min #map()[%iv2]
+  %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
+  %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
+  %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>
+
+  //      CHECK: linalg.matmul
+  // CHECK-SAME:     ins(%{{.*}}, %{{.*}} : tensor<4x7xf32>, tensor<7x6xf32>)
+  // CHECK-SAME:     outs(%{{.*}} : tensor<4x6xf32>)
+  %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
+  %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
+  func.return %5 : tensor<24x25xf32>
+}
+
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %c2 = transform.param.constant 2 : i64 -> !transform.param<i64>
+    %padded, %pad, %copy_back = transform.structured.pad %0 pad_to_multiple_of [%c2, 2, 1] {
+      padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32],
+      padding_dimensions=[0, 1, 2],
+      pack_paddings=[1, 1, 0]
+    } : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    transform.yield
+  }
+}
+
+// -----
+
+#map = affine_map<()[s0] -> (-s0 + 12, 7)>
+
 // CHECK-LABEL: @static_sizes_output_divisible_on_empty_op
 func.func @static_sizes_output_divisible_on_empty_op(%arg0: tensor<24x12xf32>,
     %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>, %iv0: index,
diff --git a/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir b/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir
index 7f3997633a30..4bb40bef9fba 100644
--- a/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir
@@ -61,11 +61,11 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%root: !transform.any_op {transform.consume}) {
     // 1. Tile parallel dims
     %1 = transform.structured.match ops{["linalg.depthwise_conv_2d_nhwc_hwc"]} in %root : (!transform.any_op) -> !transform.any_op
-    %tiled_linalg_op_0, %loops_1:4 = transform.structured.tile_using_for %1[1, 1, 4, [4], 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">, !transform.op<"scf.for">, !transform.op<"scf.for">, !transform.op<"scf.for">)
+    %tiled_linalg_op_0, %loops_1:4 = transform.structured.tile_using_for %1 tile_sizes [1, 1, 4, [4], 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">, !transform.op<"scf.for">, !transform.op<"scf.for">, !transform.op<"scf.for">)
 
     // 2. Tile reduction dims
     %2 = transform.structured.match ops{["linalg.depthwise_conv_2d_nhwc_hwc"]} in %loops_1#3 : (!transform.op<"scf.for">) -> !transform.any_op
-    %tiled_linalg_op_1, %loops_2:2 = transform.structured.tile_using_for %2[0, 0, 0, 0, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %tiled_linalg_op_1, %loops_2:2 = transform.structured.tile_using_for %2 tile_sizes [0, 0, 0, 0, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
     // 3. Decompose 2D conv into 2 x 1D conv
     %3 = transform.structured.match ops{["linalg.depthwise_conv_2d_nhwc_hwc"]} in %loops_1#3 : (!transform.op<"scf.for">) -> !transform.any_op
diff --git a/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize.mlir b/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize.mlir
index b7e316f8925d..05a032b1ece0 100644
--- a/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize.mlir
@@ -67,7 +67,7 @@ module attributes {transform.with_named_sequence} {
     %matmul = transform.structured.match ops{["linalg.matmul"]} in %root : (!transform.any_op) -> !transform.any_op
     // 1. Scalable tiling
     %_, %loop_1, %loop_2, %loop_3 =
-      transform.structured.tile_using_for %matmul [8, [16], 1] : (!transform.any_op)
+      transform.structured.tile_using_for %matmul tile_sizes [8, [16], 1] : (!transform.any_op)
       -> (!transform.any_op, !transform.op<"scf.for">, !transform.op<"scf.for">,!transform.op<"scf.for">)
 
     // 2. Loop peeling (only the middle dimension)
diff --git a/mlir/test/Dialect/Linalg/transform-op-scalarize.mlir b/mlir/test/Dialect/Linalg/transform-op-scalarize.mlir
index 7d642c8995f0..91949f58931a 100644
--- a/mlir/test/Dialect/Linalg/transform-op-scalarize.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-scalarize.mlir
@@ -21,7 +21,7 @@ func.func @scalarize(%arg0: tensor<24x12xf32>,
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops = transform.structured.tile_using_for %0 [10, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %1, %loops = transform.structured.tile_using_for %0 tile_sizes [10, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     %2 = transform.structured.scalarize %1 : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
diff --git a/mlir/test/Dialect/Linalg/transform-op-tile.mlir b/mlir/test/Dialect/Linalg/transform-op-tile.mlir
index ea8c5e612479..d244670f7375 100644
--- a/mlir/test/Dialect/Linalg/transform-op-tile.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-tile.mlir
@@ -3,7 +3,7 @@
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:3 = transform.structured.tile_using_for %0 [4, 4, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [4, 4, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
@@ -42,7 +42,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     %1 = transform.structured.match ops{["func.call"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %2, %loops:3 = transform.structured.tile_using_for %0 [%1, %1, 4] : (!transform.any_op, !transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %2, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [%1, %1, 4] : (!transform.any_op, !transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
@@ -86,7 +86,7 @@ module attributes {transform.with_named_sequence} {
     // expected-note @below {{for this parameter}}
     %1 = transform.test_produce_param (0 : i64) : !transform.param<i64>
     // expected-error @below {{expected as many parameter values (0) as target ops (2)}}
-    transform.structured.tile_using_for %0 [%1, %1, %1]
+    transform.structured.tile_using_for %0 tile_sizes [%1, %1, %1]
       : (!transform.any_op, !transform.param<i64>, !transform.param<i64>, !transform.param<i64>)
       -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
@@ -113,7 +113,7 @@ module attributes {transform.with_named_sequence} {
     // expected-note @below {{for this handle}}
     %1 = transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     // expected-error @below {{expected as many dynamic size-producing operations (0) as target ops (2)}}
-    transform.structured.tile_using_for %0 [%1, %1, 1]
+    transform.structured.tile_using_for %0 tile_sizes [%1, %1, 1]
       : (!transform.any_op, !transform.any_op, !transform.any_op)
       -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
@@ -194,7 +194,7 @@ module {
   module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loop = transform.structured.tile_using_for %0 [[4]] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+      %1, %loop = transform.structured.tile_using_for %0 tile_sizes [[4]] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
       transform.yield
   }
   }
@@ -230,7 +230,7 @@ func.func @scalable_and_fixed_length_tile(
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:3 = transform.structured.tile_using_for %0 [4, 4, [4]] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [4, 4, [4]] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
@@ -249,7 +249,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     // expected-error @below {{too many tiles provided, expected at most 3 found 4}}
-    %1, %loops = transform.structured.tile_using_for %0 [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %1, %loops = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir b/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir
index e7d9815ab222..e86d4962530a 100644
--- a/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir
+++ b/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir
@@ -77,7 +77,7 @@ transform.sequence failures(propagate) {
 transform.sequence failures(propagate) {
 ^bb0(%arg0: !transform.any_op):
   %0 = transform.param.constant 2 : i64 -> !transform.param<i64>
-  // expected-error@below {{custom op 'transform.structured.vectorize' expected 2 operand type(s)}}
+  // expected-error@below {{custom op 'transform.structured.vectorize' 1 operands present, but expected 2}}
   transform.structured.vectorize %arg0 vector_sizes [%0, 2] : !transform.any_op, !transform.param<i64>, !transform.param<i64>
 
 }
diff --git a/mlir/test/Dialect/Linalg/transform-ops.mlir b/mlir/test/Dialect/Linalg/transform-ops.mlir
index 8f6274fd22c2..733f305f850c 100644
--- a/mlir/test/Dialect/Linalg/transform-ops.mlir
+++ b/mlir/test/Dialect/Linalg/transform-ops.mlir
@@ -3,7 +3,7 @@
 transform.sequence failures(propagate) {
 ^bb1(%arg0: !transform.any_op):
   // CHECK %{{.*}}, %{{.*}}:2 = transform.structured.tile
-  %0, %1:2 = transform.structured.tile_using_for %arg0 [2, 0, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+  %0, %1:2 = transform.structured.tile_using_for %arg0 tile_sizes [2, 0, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 }
 
 // check that the Attributes of `tile_using_for` are preserved through printing
@@ -11,9 +11,9 @@ transform.sequence failures(propagate) {
 transform.sequence failures(propagate) {
 ^bb1(%arg0: !transform.any_op):
   // CHECK %{{.*}}, %{{.*}}:2 = transform.structured.tile %arg0 [2, 0, 3] interchange = [2, 1] {test_attr1 = 1 : i64, test_attr2}
-  %0, %1:2 = transform.structured.tile_using_for %arg0 [2, 0, 3] interchange = [2, 1] {test_attr1 = 1 : i64, test_attr2}: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+  %0, %1:2 = transform.structured.tile_using_for %arg0 tile_sizes [2, 0, 3] interchange = [2, 1] {test_attr1 = 1 : i64, test_attr2}: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
   // CHECK %{{.*}}, %{{.*}}:2 = transform.structured.tile %arg0 [4, 5, 3] {test_attr3 = 1 : i64, test_attr4}
-  %2, %3:2 = transform.structured.tile_using_for %0 [0, 5, 3] {test_attr3 = 1 : i64, test_attr4}: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+  %2, %3:2 = transform.structured.tile_using_for %0 tile_sizes [0, 5, 3] {test_attr3 = 1 : i64, test_attr4}: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 }
 
 transform.sequence failures(propagate) {
diff --git a/mlir/test/Dialect/Linalg/transform-patterns.mlir b/mlir/test/Dialect/Linalg/transform-patterns.mlir
index 5a9b490c07ff..87b7664198da 100644
--- a/mlir/test/Dialect/Linalg/transform-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/transform-patterns.mlir
@@ -12,7 +12,7 @@ func.func @dot(%x: memref<?xf32, strided<[1], offset: ?>>,
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["linalg.dot"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loop = transform.structured.tile_using_for %0 [8000] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+      %1, %loop = transform.structured.tile_using_for %0 tile_sizes [8000] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -38,7 +38,7 @@ func.func @matvec(%A: memref<?x?xf32, strided<[?, 1], offset: ?>>,
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["linalg.matvec"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [5, 6] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [5, 6] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -67,10 +67,10 @@ func.func @matmul(%A: memref<?x?xf32, strided<[?, 1], offset: ?>>,
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:3 = transform.structured.tile_using_for %0 [2000, 3000, 4000] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-      %2, %loops_2:3 = transform.structured.tile_using_for %1 [200, 300, 400] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-      %3, %loops_3:3 = transform.structured.tile_using_for %2 [20, 30, 40] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-      %4, %loops_4:3 = transform.structured.tile_using_for %3 [2, 3, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [2000, 3000, 4000] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      %2, %loops_2:3 = transform.structured.tile_using_for %1 tile_sizes [200, 300, 400] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      %3, %loops_3:3 = transform.structured.tile_using_for %2 tile_sizes [20, 30, 40] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      %4, %loops_4:3 = transform.structured.tile_using_for %3 tile_sizes [2, 3, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -170,7 +170,7 @@ func.func @matvec_perm(%A: memref<?x?xf32, strided<[?, 1], offset: ?>>,
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["linalg.matvec"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [5, 6] interchange = [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [5, 6] interchange = [1, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -199,9 +199,9 @@ func.func @matmul_perm(%A: memref<?x?xf32, strided<[?, 1], offset: ?>>,
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:3 = transform.structured.tile_using_for %0 [2000, 3000, 4000] interchange = [1, 2, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-      %2, %loops_2:3 = transform.structured.tile_using_for %1 [200, 300, 400] interchange = [1, 0, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-      %3, %loops_3:3 = transform.structured.tile_using_for %2 [20, 30, 40] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [2000, 3000, 4000] interchange = [1, 2, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      %2, %loops_2:3 = transform.structured.tile_using_for %1 tile_sizes [200, 300, 400] interchange = [1, 0, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      %3, %loops_3:3 = transform.structured.tile_using_for %2 tile_sizes [20, 30, 40] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir
index 80a5a4c6702a..bbeccc7fecd6 100644
--- a/mlir/test/Dialect/Linalg/vectorization.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization.mlir
@@ -985,3 +985,73 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+  // -----
+
+func.func @test_vectorize_unpack_no_vector_sizes(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> {
+  // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32>
+  // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [0, 2, 1, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32>
+  // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32>
+  // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32>
+  // CHECK: %[[C00:.*]] = arith.constant 0 : index
+  // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32>
+  // CHECK: return %[[WRIT]] : tensor<256x128xf32>
+   %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32>
+   return %0 : tensor<256x128xf32>
+ }
+ module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+   transform.structured.vectorize %0 : !transform.any_op
+    transform.yield
+  } 
+ }
+
+  // -----
+
+func.func @test_vectorize_unpack_no_vector_sizes_slice_output(%source: tensor<8x4x16x16xf32>, %dest: tensor<64x127xf32>) -> tensor<64x127xf32> {
+  //      CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+  //      CHECK: %[[C0:.*]] = arith.constant 0 : index
+  //      CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x4x16x16xf32>, vector<8x4x16x16xf32>
+  //      CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 2, 0, 3] : vector<8x4x16x16xf32> to vector<4x16x8x16xf32>
+  //      CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<4x16x8x16xf32> to vector<64x128xf32>
+  //      CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<64x127xf32>
+  //      CHECK: %[[C00:.*]] = arith.constant 0 : index
+  //      CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[EMPT]]{{\[}}%[[C00]], %[[C00]]]
+  // CHECK-SAME:  {in_bounds = [true, false]} : vector<64x128xf32>, tensor<64x127xf32>
+  //      CHECK: return %[[WRIT]] : tensor<64x127xf32>
+   %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %dest : tensor<8x4x16x16xf32> -> tensor<64x127xf32>
+   return %0 : tensor<64x127xf32>
+ }
+ module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+   transform.structured.vectorize %0 : !transform.any_op
+    transform.yield
+  } 
+ }
+
+  // -----
+
+func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf32>, %dest: tensor<7x16xf32>) -> tensor<7x16xf32> {
+   %0 = tensor.unpack %source outer_dims_perm=[1, 0] inner_dims_pos = [1] inner_tiles = [4] into %dest : tensor<4x7x4xf32> -> tensor<7x16xf32>
+   return %0 : tensor<7x16xf32>
+ }
+  // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<4x7x4xf32>, vector<4x7x4xf32>
+  // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 0, 2] : vector<4x7x4xf32> to vector<7x4x4xf32>
+  // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<7x4x4xf32> to vector<7x16xf32>
+  // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<7x16xf32>
+  // CHECK: %[[C00:.*]] = arith.constant 0 : index
+  // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<7x16xf32>, tensor<7x16xf32>
+  // CHECK: return %[[WRIT]] : tensor<7x16xf32>
+ module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+   transform.structured.vectorize %0 : !transform.any_op
+    transform.yield
+  } 
+ }
diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract-masked.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract-masked.mlir
index edc38b42f5cd..e68d297dc41f 100644
--- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract-masked.mlir
+++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract-masked.mlir
@@ -28,7 +28,7 @@ func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguou
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     transform.structured.vectorize %0 vector_sizes [1, 4] vectorize_nd_extract : !transform.any_op
+     transform.structured.vectorize %0 vector_sizes [1, 4] {vectorize_nd_extract} : !transform.any_op
      transform.yield
    }
 }
@@ -85,7 +85,7 @@ func.func @masked_dynamic_vectorize_nd_tensor_extract_with_affine_apply_contiguo
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     transform.structured.vectorize %0 vector_sizes [1, 4] vectorize_nd_extract : !transform.any_op
+     transform.structured.vectorize %0 vector_sizes [1, 4] {vectorize_nd_extract} : !transform.any_op
      transform.yield
   }
 }
@@ -125,7 +125,7 @@ func.func @masked_vectorize_nd_tensor_extract_with_affine_apply_gather(%6: tenso
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     transform.structured.vectorize %0 vector_sizes [1, 4] vectorize_nd_extract : !transform.any_op
+     transform.structured.vectorize %0 vector_sizes [1, 4] {vectorize_nd_extract} : !transform.any_op
      transform.yield
    }
 }
@@ -182,7 +182,7 @@ func.func @masked_dynamic_vectorize_nd_tensor_extract_with_affine_apply_gather(%
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     transform.structured.vectorize %0 vector_sizes [1, 4] vectorize_nd_extract : !transform.any_op
+     transform.structured.vectorize %0 vector_sizes [1, 4] {vectorize_nd_extract} : !transform.any_op
      transform.yield
    }
 }
@@ -234,7 +234,7 @@ func.func @extract_masked_vectorize(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf3
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     transform.structured.vectorize %0 vector_sizes [3, 3] vectorize_nd_extract : !transform.any_op
+     transform.structured.vectorize %0 vector_sizes [3, 3] {vectorize_nd_extract} : !transform.any_op
      transform.yield
    }
 }
@@ -279,7 +279,7 @@ func.func @tensor_extract_dynamic_shape(%arg1: tensor<123x321xf32>, %arg2: tenso
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     transform.structured.vectorize %0 vector_sizes [1, 3, 8] vectorize_nd_extract : !transform.any_op
+     transform.structured.vectorize %0 vector_sizes [1, 3, 8] {vectorize_nd_extract} : !transform.any_op
      transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Math/expand-math.mlir b/mlir/test/Dialect/Math/expand-math.mlir
index 3d94b55126d0..016a7bbdeb56 100644
--- a/mlir/test/Dialect/Math/expand-math.mlir
+++ b/mlir/test/Dialect/Math/expand-math.mlir
@@ -658,3 +658,73 @@ func.func @math_fpowi_to_powf_scalar(%0 : f32, %1: i64) -> f32 {
 // CHECK:        %[[AND:.*]] = arith.andi %[[CMPF1]], %[[CMPF]] : i1
 // CHECK:        %[[SEL:.*]] = arith.select %[[AND]], %[[MUL1]], %[[EXP]] : f32
 // CHECK:       return %[[SEL]] : f32
+
+// -----
+
+// CHECK-LABEL:   func.func @rsqrt
+// CHECK-SAME:     (%[[ARG:.*]]: f16)
+// CHECK-SAME:    -> f16
+// CHECK-DAG:     %[[CST:.*]] = arith.constant 1.000000e+00 : f16
+// CHECK-DAG:     %[[SQRT:.*]] = math.sqrt %[[ARG]] : f16
+// CHECK-DAG:     %[[DIV:.*]] = arith.divf %[[CST]], %[[SQRT]] : f16
+// CHECK:         return %[[DIV]] : f16
+func.func @rsqrt16(%float: f16) -> (f16)  {
+  %float_result = math.rsqrt %float : f16
+  return %float_result : f16
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @rsqrt
+// CHECK-SAME:     (%[[ARG:.*]]: f32)
+// CHECK-SAME:    -> f32
+// CHECK-DAG:     %[[CST:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK-DAG:     %[[SQRT:.*]] = math.sqrt %[[ARG]] : f32
+// CHECK-DAG:     %[[DIV:.*]] = arith.divf %[[CST]], %[[SQRT]] : f32
+// CHECK:         return %[[DIV]] : f32
+func.func @rsqrt32(%float: f32) -> (f32)  {
+  %float_result = math.rsqrt %float : f32
+  return %float_result : f32
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @rsqrt
+// CHECK-SAME:     (%[[ARG:.*]]: f64)
+// CHECK-SAME:    -> f64
+// CHECK-DAG:     %[[CST:.*]] = arith.constant 1.000000e+00 : f64
+// CHECK-DAG:     %[[SQRT:.*]] = math.sqrt %[[ARG]] : f64
+// CHECK-DAG:     %[[DIV:.*]] = arith.divf %[[CST]], %[[SQRT]] : f64
+// CHECK:         return %[[DIV]] : f64
+func.func @rsqrt64(%float: f64) -> (f64)  {
+  %float_result = math.rsqrt %float : f64
+  return %float_result : f64
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @rsqrt_vec
+// CHECK-SAME:     (%[[ARG:.*]]: vector<5xf32>)
+// CHECK-SAME:    -> vector<5xf32>
+// CHECK-DAG:     %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<5xf32>
+// CHECK-DAG:     %[[SQRT:.*]] = math.sqrt %[[ARG]] : vector<5xf32>
+// CHECK-DAG:     %[[DIV:.*]] = arith.divf %[[CST]], %[[SQRT]] : vector<5xf32>
+// CHECK:         return %[[DIV]] : vector<5xf32>
+func.func @rsqrt_vec(%float: vector<5xf32>) -> (vector<5xf32>)  {
+  %float_result = math.rsqrt %float : vector<5xf32>
+  return %float_result : vector<5xf32>
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @rsqrt_tns
+// CHECK-SAME:     (%[[ARG:.*]]: tensor<5x8xf32>)
+// CHECK-SAME:    -> tensor<5x8xf32>
+// CHECK-DAG:     %[[CST:.*]] = arith.constant dense<1.000000e+00> : tensor<5x8xf32>
+// CHECK-DAG:     %[[SQRT:.*]] = math.sqrt %[[ARG]] : tensor<5x8xf32>
+// CHECK-DAG:     %[[DIV:.*]] = arith.divf %[[CST]], %[[SQRT]] : tensor<5x8xf32>
+// CHECK:         return %[[DIV]] : tensor<5x8xf32>
+func.func @rsqrt_tns(%float: tensor<5x8xf32>) -> (tensor<5x8xf32>)  {
+  %float_result = math.rsqrt %float : tensor<5x8xf32>
+  return %float_result : tensor<5x8xf32>
+}
diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
index 254cd4015eed..e49dff44ae0d 100644
--- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
+++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
@@ -468,23 +468,66 @@ func.func @fold_static_stride_subview_with_affine_load_store_expand_shape_3d(%ar
 
 // -----
 
-// CHECK-LABEL: fold_dynamic_subview_with_memref_load_store_expand_shape
-// CHECK-SAME: (%[[ARG0:.*]]: memref<16x?xf32, strided<[16, 1]>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index, %[[SZ0:.*]]: index)
-func.func @fold_dynamic_subview_with_memref_load_store_expand_shape(%arg0 : memref<16x?xf32, strided<[16, 1]>>, %arg1 : index, %arg2 : index, %sz0: index) -> f32 {
+// CHECK-LABEL: fold_dynamic_subview_with_memref_load_expand_shape
+// CHECK-SAME: (%[[ARG0:.*]]: memref<16x?xf32, strided<[16, 1]>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index, %[[ARG3:.*]]: index) -> f32
+func.func @fold_dynamic_subview_with_memref_load_expand_shape(%arg0 : memref<16x?xf32, strided<[16, 1]>>, %arg1 : index, %arg2 : index, %sz0: index) -> f32 {
   %c0 = arith.constant 0 : index
   %expand_shape = memref.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [1, 16, %sz0, 1] : memref<16x?xf32, strided<[16, 1]>> into memref<1x16x?x1xf32, strided<[256, 16, 1, 1]>>
   %0 = memref.load %expand_shape[%c0, %arg1, %arg2, %c0] : memref<1x16x?x1xf32, strided<[256, 16, 1, 1]>>
   return %0 : f32
 }
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
-// CHECK: %[[EXPAND_SHAPE:.*]] = memref.expand_shape %[[ARG0]] {{\[\[}}0, 1], [2, 3]] output_shape [1, 16, %[[SZ0]], 1] : memref<16x?xf32, strided<[16, 1]>> into memref<1x16x?x1xf32, strided<[256, 16, 1, 1]>>
-// CHECK: %[[VAL_0:.*]] = memref.load %[[EXPAND_SHAPE]][%[[C0]], %[[ARG1]], %[[ARG2]], %[[C0]]] : memref<1x16x?x1xf32, strided<[256, 16, 1, 1]>>
-// CHECK: return %[[VAL_0]] : f32
+// CHECK-NEXT: %[[VAL1:.*]] = memref.load %[[ARG0]][%[[ARG1]], %[[ARG2]]] : memref<16x?xf32, strided<[16, 1]>>
+// CHECK-NEXT: return %[[VAL1]] : f32
 
 // -----
 
-// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0 * 1024 + d1)>
-// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
+// CHECK-LABEL: fold_dynamic_subview_with_memref_store_expand_shape
+// CHECK-SAME: (%[[ARG0:.*]]: memref<16x?xf32, strided<[16, 1]>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index, %[[ARG3:.*]]: index)
+func.func @fold_dynamic_subview_with_memref_store_expand_shape(%arg0 : memref<16x?xf32, strided<[16, 1]>>, %arg1 : index, %arg2 : index, %sz0 : index) {
+  %c0 = arith.constant 0 : index
+  %c1f32 = arith.constant 1.0 : f32
+  %expand_shape = memref.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [1, 16, %sz0, 1] : memref<16x?xf32, strided<[16, 1]>> into memref<1x16x?x1xf32, strided<[256, 16, 1, 1]>>
+  memref.store %c1f32, %expand_shape[%c0, %arg1, %arg2, %c0] : memref<1x16x?x1xf32, strided<[256, 16, 1, 1]>>
+  return
+}
+// CHECK: %[[C1F32:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK-NEXT: memref.store %[[C1F32]], %[[ARG0]][%[[ARG1]], %[[ARG2]]] : memref<16x?xf32, strided<[16, 1]>>
+// CHECK-NEXT: return
+
+// -----
+
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0, s1] -> (s0 + s1)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 * 3)>
+// CHECK-LABEL: fold_memref_alias_expand_shape_subview_load_store_dynamic_dim
+// CHECK-SAME: (%[[ARG0:.*]]: memref<2048x16xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index, %[[ARG3:.*]]: index, %[[ARG4:.*]]: index)
+func.func @fold_memref_alias_expand_shape_subview_load_store_dynamic_dim(%alloc: memref<2048x16xf32>, %c10: index, %c5: index, %c0: index, %sz0: index) {
+  %subview = memref.subview %alloc[%c5, 0] [%c10, 16] [1, 1] : memref<2048x16xf32> to memref<?x16xf32, strided<[16, 1], offset: ?>>
+  %expand_shape = memref.expand_shape %subview [[0], [1, 2, 3]] output_shape [%sz0, 1, 8, 2] : memref<?x16xf32, strided<[16, 1], offset: ?>> into memref<?x1x8x2xf32, strided<[16, 16, 2, 1], offset: ?>>
+  %dim = memref.dim %expand_shape, %c0 : memref<?x1x8x2xf32, strided<[16, 16, 2, 1], offset: ?>>
+
+  affine.for %arg6 = 0 to %dim step 64 {
+    affine.for %arg7 = 0 to 16 step 16 {
+      %dummy_load = affine.load %expand_shape[%arg6, 0, %arg7, %arg7] : memref<?x1x8x2xf32, strided<[16, 16, 2, 1], offset: ?>>
+      affine.store %dummy_load, %subview[%arg6, %arg7] : memref<?x16xf32, strided<[16, 1], offset: ?>>
+    }
+  }
+  return
+}
+// CHECK-NEXT:   memref.subview
+// CHECK-NEXT:   %[[EXPAND_SHAPE:.*]] = memref.expand_shape
+// CHECK-NEXT:   %[[DIM:.*]] = memref.dim %[[EXPAND_SHAPE]], %[[ARG3]] : memref<?x1x8x2xf32, strided<[16, 16, 2, 1], offset: ?>>
+// CHECK-NEXT:   affine.for %[[ARG4:.*]] = 0 to %[[DIM]] step 64 {
+// CHECK-NEXT:   affine.for %[[ARG5:.*]] = 0 to 16 step 16 {
+// CHECK-NEXT:   %[[VAL0:.*]] = affine.apply #[[$MAP0]]()[%[[ARG2]], %[[ARG4]]]
+// CHECK-NEXT:   %[[VAL1:.*]] = affine.apply #[[$MAP1]]()[%[[ARG5]]]
+// CHECK-NEXT:   %[[VAL2:.*]] = affine.load %[[ARG0]][%[[VAL0]], %[[VAL1]]] : memref<2048x16xf32>
+// CHECK-NEXT:   %[[VAL3:.*]] = affine.apply #[[$MAP0]]()[%[[ARG2]], %[[ARG4]]]
+// CHECK-NEXT:   affine.store %[[VAL2]], %[[ARG0]][%[[VAL3]], %[[ARG5]]] : memref<2048x16xf32>
+
+// -----
+
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0, s1] -> (s0 * 1024 + s1)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0, s1] -> (s0 + s1)>
 // CHECK-LABEL: fold_static_stride_subview_with_affine_load_store_expand_shape
 // CHECK-SAME: (%[[ARG0:.*]]: memref<1024x1024xf32>, %[[ARG1:.*]]: memref<1xf32>, %[[ARG2:.*]]: index)
 func.func @fold_static_stride_subview_with_affine_load_store_expand_shape(%arg0: memref<1024x1024xf32>, %arg1: memref<1xf32>, %arg2: index) -> f32 {
@@ -506,14 +549,14 @@ func.func @fold_static_stride_subview_with_affine_load_store_expand_shape(%arg0:
 // CHECK-NEXT:  affine.for %[[ARG4:.*]] = 0 to 1024 {
 // CHECK-NEXT:   affine.for %[[ARG5:.*]] = 0 to 1020 {
 // CHECK-NEXT:    affine.for %[[ARG6:.*]] = 0 to 1 {
-// CHECK-NEXT:     %[[IDX1:.*]] = affine.apply #[[$MAP0]](%[[ARG3]], %[[ARG4]])
-// CHECK-NEXT:     %[[IDX2:.*]] = affine.apply #[[$MAP1]](%[[ARG5]], %[[ARG6]])
+// CHECK-NEXT:     %[[IDX1:.*]] = affine.apply #[[$MAP0]]()[%[[ARG3]], %[[ARG4]]]
+// CHECK-NEXT:     %[[IDX2:.*]] = affine.apply #[[$MAP1]]()[%[[ARG5]], %[[ARG6]]]
 // CHECK-NEXT:     affine.load %[[ARG0]][%[[IDX1]], %[[IDX2]]] : memref<1024x1024xf32>
 
 // -----
 
-// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0 * 1025 + d1)>
-// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1)[s0] -> (d0 + d1 + s0 * 1024)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0, s1] -> (s0 + s1)>
 // CHECK-LABEL: fold_static_stride_subview_with_affine_load_store_expand_shape_when_access_index_is_an_expression
 // CHECK-SAME: (%[[ARG0:.*]]: memref<1024x1024xf32>, %[[ARG1:.*]]: memref<1xf32>, %[[ARG2:.*]]: index)
 func.func @fold_static_stride_subview_with_affine_load_store_expand_shape_when_access_index_is_an_expression(%arg0: memref<1024x1024xf32>, %arg1: memref<1xf32>, %arg2: index) -> f32 {
@@ -535,14 +578,14 @@ func.func @fold_static_stride_subview_with_affine_load_store_expand_shape_when_a
 // CHECK-NEXT:  affine.for %[[ARG4:.*]] = 0 to 1024 {
 // CHECK-NEXT:   affine.for %[[ARG5:.*]] = 0 to 1020 {
 // CHECK-NEXT:    affine.for %[[ARG6:.*]] = 0 to 1 {
-// CHECK-NEXT:      %[[TMP1:.*]] = affine.apply #[[$MAP0]](%[[ARG3]], %[[ARG4]])
-// CHECK-NEXT:      %[[TMP3:.*]] = affine.apply #[[$MAP1]](%[[ARG5]], %[[ARG6]])
+// CHECK-NEXT:      %[[TMP1:.*]] = affine.apply #[[$MAP0]](%[[ARG3]], %[[ARG4]])[%[[ARG3]]]
+// CHECK-NEXT:      %[[TMP3:.*]] = affine.apply #[[$MAP1]]()[%[[ARG5]], %[[ARG6]]]
 // CHECK-NEXT:      affine.load %[[ARG0]][%[[TMP1]], %[[TMP3]]] : memref<1024x1024xf32>
 
 // -----
 
-// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0) -> (d0 * 1024)>
-// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0] -> (s0 * 1024)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0, s1] -> (s0 + s1)>
 // CHECK-LABEL: fold_static_stride_subview_with_affine_load_store_expand_shape_with_constant_access_index
 // CHECK-SAME: (%[[ARG0:.*]]: memref<1024x1024xf32>, %[[ARG1:.*]]: memref<1xf32>, %[[ARG2:.*]]: index)
 func.func @fold_static_stride_subview_with_affine_load_store_expand_shape_with_constant_access_index(%arg0: memref<1024x1024xf32>, %arg1: memref<1xf32>, %arg2: index) -> f32 {
@@ -565,8 +608,8 @@ func.func @fold_static_stride_subview_with_affine_load_store_expand_shape_with_c
 // CHECK-NEXT:   affine.for %[[ARG4:.*]] = 0 to 1024 {
 // CHECK-NEXT:    affine.for %[[ARG5:.*]] = 0 to 1020 {
 // CHECK-NEXT:     affine.for %[[ARG6:.*]] = 0 to 1 {
-// CHECK-NEXT:      %[[TMP1:.*]] = affine.apply #[[$MAP0]](%[[ARG3]])
-// CHECK-NEXT:      %[[TMP2:.*]] = affine.apply #[[$MAP1]](%[[ARG5]], %[[ARG6]])
+// CHECK-NEXT:      %[[TMP1:.*]] = affine.apply #[[$MAP0]]()[%[[ARG3]]]
+// CHECK-NEXT:      %[[TMP2:.*]] = affine.apply #[[$MAP1]]()[%[[ARG5]], %[[ARG6]]]
 // CHECK-NEXT:      memref.load %[[ARG0]][%[[TMP1]], %[[TMP2]]] : memref<1024x1024xf32>
 
 // -----
diff --git a/mlir/test/Dialect/MemRef/invalid.mlir b/mlir/test/Dialect/MemRef/invalid.mlir
index 70c96aad9555..0f533cb95a0c 100644
--- a/mlir/test/Dialect/MemRef/invalid.mlir
+++ b/mlir/test/Dialect/MemRef/invalid.mlir
@@ -1103,3 +1103,14 @@ func.func @subview_invalid_strides_rank_reduction(%m: memref<7x22x333x4444xi32>)
       : memref<7x22x333x4444xi32> to memref<7x11x4444xi32>
   return
 }
+
+// -----
+
+func.func @expand_shape_invalid_output_shape(
+    %arg0: memref<30x20xf32, strided<[4000, 2], offset: 100>>) {
+  // expected-error @+1 {{invalid output shape provided at pos 2}}
+  %0 = memref.expand_shape %arg0 [[0, 1], [2]] output_shape [2, 15, 21] :
+      memref<30x20xf32, strided<[4000, 2], offset: 100>>
+      into memref<2x15x20xf32, strided<[60000, 4000, 2], offset: 100>>
+  return
+}
diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir
index 60fb0ffeee24..b60894377f22 100644
--- a/mlir/test/Dialect/MemRef/ops.mlir
+++ b/mlir/test/Dialect/MemRef/ops.mlir
@@ -203,7 +203,8 @@ func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>,
          %arg3: memref<?x42xf32, strided<[42, 1], offset: 0>>,
          %arg4: index,
          %arg5: index,
-         %arg6: index) {
+         %arg6: index,
+         %arg7: memref<4x?x4xf32>) {
 //       CHECK:   memref.collapse_shape {{.*}} {{\[}}[0, 1], [2]]
 //  CHECK-SAME:     memref<?x?x?xf32> into memref<?x?xf32>
   %0 = memref.collapse_shape %arg0 [[0, 1], [2]] :
@@ -248,6 +249,10 @@ func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>,
 //  CHECK-SAME:     memref<?xf32, strided<[1]>> into memref<?x42xf32>
   %r3 = memref.expand_shape %3 [[0, 1]] output_shape [%arg6, 42] :
     memref<?xf32, strided<[1]>> into memref<?x42xf32>
+
+//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1], [2], [3, 4]]
+  %4 = memref.expand_shape %arg7 [[0, 1], [2], [3, 4]] output_shape [2, 2, %arg4, 2, 2]
+        : memref<4x?x4xf32> into memref<2x2x?x2x2xf32>
   return
 }
 
diff --git a/mlir/test/Dialect/MemRef/resolve-dim-ops.mlir b/mlir/test/Dialect/MemRef/resolve-dim-ops.mlir
index 40f88de01b8b..85a485397245 100644
--- a/mlir/test/Dialect/MemRef/resolve-dim-ops.mlir
+++ b/mlir/test/Dialect/MemRef/resolve-dim-ops.mlir
@@ -53,3 +53,21 @@ func.func @static_dim_of_transpose_op(%arg0: tensor<1x100x?x8xi8>) -> index {
   %dim = tensor.dim %1, %c2 : tensor<1x8x100x?xi8>
   return %dim : index
 }
+
+// -----
+
+// Test case: Folding of memref.dim(memref.expand_shape)
+// CHECK-LABEL: func @dim_of_memref_expand_shape(
+//  CHECK-SAME:     %[[MEM:[0-9a-z]+]]: memref<?x8xi32>
+//  CHECK-NEXT:   %[[IDX:.*]] = arith.constant 0
+//  CHECK-NEXT:   %[[DIM:.*]] = memref.dim %[[MEM]], %[[IDX]] : memref<?x8xi32>
+//       CHECK:   return %[[DIM]] : index
+func.func @dim_of_memref_expand_shape(%arg0: memref<?x8xi32>)
+    -> index {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %s = memref.dim %arg0, %c0 : memref<?x8xi32>
+  %0 = memref.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [1, %s, 2, 4]: memref<?x8xi32> into memref<1x?x2x4xi32>
+  %1 = memref.dim %0, %c1 : memref<1x?x2x4xi32>
+  return %1 : index
+}
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index 511e7d396c68..138c2c9d418d 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -2087,7 +2087,7 @@ func.func @omp_target_depend(%data_var: memref<i32>) {
   // expected-error @below {{op expected as many depend values as depend variables}}
     "omp.target"(%data_var) ({
       "omp.terminator"() : () -> ()
-    }) {depends = [], operandSegmentSizes = array<i32: 0, 0, 0, 1, 0, 0, 0>} : (memref<i32>) -> ()
+    }) {depends = [], operandSegmentSizes = array<i32: 0, 0, 0, 1, 0, 0, 0, 0>} : (memref<i32>) -> ()
    "func.return"() : () -> ()
 }
 
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index 60fc10f9d64b..420cb226d593 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -737,7 +737,7 @@ func.func @omp_target(%if_cond : i1, %device : si32,  %num_threads : i32, %devic
     "omp.target"(%if_cond, %device, %num_threads) ({
        // CHECK: omp.terminator
        omp.terminator
-    }) {nowait, operandSegmentSizes = array<i32: 1,1,1,0,0,0,0>} : ( i1, si32, i32 ) -> ()
+    }) {nowait, operandSegmentSizes = array<i32: 1,1,1,0,0,0,0,0>} : ( i1, si32, i32 ) -> ()
 
     // Test with optional map clause.
     // CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_1:.*]] : memref<?xi32>, tensor<?xi32>)   map_clauses(tofrom) capture(ByRef) -> memref<?xi32> {name = ""}
@@ -2306,8 +2306,6 @@ func.func @omp_requires_multiple() -> ()
   return
 }
 
-// -----
-
 // CHECK-LABEL: @opaque_pointers_atomic_rwu
 // CHECK-SAME: (%[[v:.*]]: !llvm.ptr, %[[x:.*]]: !llvm.ptr)
 func.func @opaque_pointers_atomic_rwu(%v: !llvm.ptr, %x: !llvm.ptr) {
@@ -2417,8 +2415,8 @@ func.func @omp_target_update_data (%if_cond : i1, %device : si32, %map1: memref<
 func.func @omp_targets_is_allocatable(%arg0: !llvm.ptr, %arg1: !llvm.ptr) -> () {
   // CHECK: %[[MAP0:.*]] = omp.map.info var_ptr(%[[ARG0]] : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}  
   %mapv1 = omp.map.info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
-  // CHECK: %[[MAP1:.*]] = omp.map.info var_ptr(%[[ARG1]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(tofrom) capture(ByRef) members(%[[MAP0]] : !llvm.ptr) -> !llvm.ptr {name = ""}
-  %mapv2 = omp.map.info var_ptr(%arg1 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>)   map_clauses(tofrom) capture(ByRef) members(%mapv1 : !llvm.ptr) -> !llvm.ptr {name = ""}  
+  // CHECK: %[[MAP1:.*]] = omp.map.info var_ptr(%[[ARG1]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(tofrom) capture(ByRef) members(%[[MAP0]] : [0] : !llvm.ptr) -> !llvm.ptr {name = ""}
+  %mapv2 = omp.map.info var_ptr(%arg1 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>)   map_clauses(tofrom) capture(ByRef) members(%mapv1 : [0] : !llvm.ptr) -> !llvm.ptr {name = ""}  
   // CHECK: omp.target map_entries(%[[MAP0]] -> {{.*}}, %[[MAP1]] -> {{.*}} : !llvm.ptr, !llvm.ptr)
   omp.target map_entries(%mapv1 -> %arg2, %mapv2 -> %arg3 : !llvm.ptr, !llvm.ptr) {
     ^bb0(%arg2: !llvm.ptr, %arg3 : !llvm.ptr):
@@ -2473,6 +2471,37 @@ func.func @omp_target_enter_update_exit_data_depend(%a: memref<?xi32>, %b: memre
   }
   // CHECK: omp.target_exit_data map_entries([[MAP2]] : memref<?xi32>) depend(taskdependin -> [[ARG2]] : memref<?xi32>)
   omp.target_exit_data map_entries(%map_c : memref<?xi32>) depend(taskdependin -> %c : memref<?xi32>)
+
+  return
+}
+
+// CHECK-LABEL: omp_map_with_members
+// CHECK-SAME: (%[[ARG0:.*]]: !llvm.ptr, %[[ARG1:.*]]: !llvm.ptr, %[[ARG2:.*]]: !llvm.ptr, %[[ARG3:.*]]: !llvm.ptr, %[[ARG4:.*]]: !llvm.ptr, %[[ARG5:.*]]: !llvm.ptr)
+func.func @omp_map_with_members(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: !llvm.ptr, %arg4: !llvm.ptr, %arg5: !llvm.ptr) -> () {
+  // CHECK: %[[MAP0:.*]] = omp.map.info var_ptr(%[[ARG0]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""}  
+  %mapv1 = omp.map.info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""}
+
+  // CHECK: %[[MAP1:.*]] = omp.map.info var_ptr(%[[ARG1]] : !llvm.ptr, f32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""}  
+  %mapv2 = omp.map.info var_ptr(%arg1 : !llvm.ptr, f32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""}
+  
+  // CHECK: %[[MAP2:.*]] = omp.map.info var_ptr(%[[ARG2]] : !llvm.ptr, !llvm.struct<(i32, f32)>) map_clauses(to) capture(ByRef) members(%[[MAP0]], %[[MAP1]] : [0], [1] : !llvm.ptr, !llvm.ptr) -> !llvm.ptr {name = "", partial_map = true}
+  %mapv3 = omp.map.info var_ptr(%arg2 : !llvm.ptr, !llvm.struct<(i32, f32)>)   map_clauses(to) capture(ByRef) members(%mapv1, %mapv2 : [0], [1] : !llvm.ptr, !llvm.ptr) -> !llvm.ptr {name = "", partial_map = true}  
+  
+  // CHECK: omp.target_enter_data map_entries(%[[MAP0]], %[[MAP1]], %[[MAP2]] : !llvm.ptr, !llvm.ptr, !llvm.ptr)
+  omp.target_enter_data map_entries(%mapv1, %mapv2, %mapv3 : !llvm.ptr, !llvm.ptr, !llvm.ptr){}
+
+  // CHECK: %[[MAP3:.*]] = omp.map.info var_ptr(%[[ARG3]] : !llvm.ptr, i32) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}  
+  %mapv4 = omp.map.info var_ptr(%arg3 : !llvm.ptr, i32) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}
+
+  // CHECK: %[[MAP4:.*]] = omp.map.info var_ptr(%[[ARG4]] : !llvm.ptr, f32) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}  
+  %mapv5 = omp.map.info var_ptr(%arg4 : !llvm.ptr, f32) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}
+  
+  // CHECK: %[[MAP5:.*]] = omp.map.info var_ptr(%[[ARG5]] : !llvm.ptr, !llvm.struct<(i32, struct<(i32, f32)>)>) map_clauses(from) capture(ByRef) members(%[[MAP3]], %[[MAP4]] : [1,0], [1,1] : !llvm.ptr, !llvm.ptr) -> !llvm.ptr {name = "", partial_map = true}
+  %mapv6 = omp.map.info var_ptr(%arg5 : !llvm.ptr, !llvm.struct<(i32, struct<(i32, f32)>)>) map_clauses(from) capture(ByRef) members(%mapv4, %mapv5 : [1,0], [1,1] : !llvm.ptr, !llvm.ptr) -> !llvm.ptr {name = "", partial_map = true}  
+ 
+  // CHECK: omp.target_exit_data map_entries(%[[MAP3]], %[[MAP4]], %[[MAP5]] : !llvm.ptr, !llvm.ptr, !llvm.ptr)
+  omp.target_exit_data map_entries(%mapv4, %mapv5, %mapv6 : !llvm.ptr, !llvm.ptr, !llvm.ptr){}
+
   return
 }
 
@@ -2550,3 +2579,41 @@ func.func @parallel_op_reduction_and_private(%priv_var: !llvm.ptr, %priv_var2: !
   }
   return
 }
+
+// CHECK-LABEL: omp_target_private
+func.func @omp_target_private(%map1: memref<?xi32>, %map2: memref<?xi32>, %priv_var: !llvm.ptr) -> () {
+  %mapv1 = omp.map.info var_ptr(%map1 : memref<?xi32>, tensor<?xi32>) map_clauses(tofrom) capture(ByRef) -> memref<?xi32> {name = ""}
+  %mapv2 = omp.map.info var_ptr(%map2 : memref<?xi32>, tensor<?xi32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32> {name = ""}
+
+  // CHECK: omp.target
+  // CHECK-SAME: private(
+  // CHECK-SAME:   @x.privatizer %{{[^[:space:]]+}} -> %[[PRIV_ARG:[^[:space:]]+]]
+  // CHECK-SAME:   : !llvm.ptr
+  // CHECK-SAME: )
+  omp.target private(@x.privatizer %priv_var -> %priv_arg : !llvm.ptr) {
+  // CHECK: ^bb0(%[[PRIV_ARG]]: !llvm.ptr):
+  ^bb0(%priv_arg: !llvm.ptr):
+    omp.terminator
+  }
+
+  // CHECK: omp.target
+
+  // CHECK-SAME: map_entries(
+  // CHECK-SAME:   %{{[^[:space:]]+}} -> %[[MAP1_ARG:[^[:space:]]+]],
+  // CHECK-SAME:   %{{[^[:space:]]+}} -> %[[MAP2_ARG:[^[:space:]]+]]
+  // CHECK-SAME:   : memref<?xi32>, memref<?xi32>
+  // CHECK-SAME: )
+
+  // CHECK-SAME: private(
+  // CHECK-SAME:   @x.privatizer %{{[^[:space:]]+}} -> %[[PRIV_ARG:[^[:space:]]+]]
+  // CHECK-SAME:   : !llvm.ptr
+  // CHECK-SAME: )
+  omp.target map_entries(%mapv1 -> %arg0, %mapv2 -> %arg1 : memref<?xi32>, memref<?xi32>) private(@x.privatizer %priv_var -> %priv_arg : !llvm.ptr) {
+  // CHECK: ^bb0(%[[MAP1_ARG]]: memref<?xi32>, %[[MAP2_ARG]]: memref<?xi32>
+  // CHECK-SAME: , %[[PRIV_ARG]]: !llvm.ptr):
+  ^bb0(%arg0: memref<?xi32>, %arg1: memref<?xi32>, %priv_arg: !llvm.ptr):
+    omp.terminator
+  }
+
+  return
+}
diff --git a/mlir/test/Dialect/Polynomial/ops.mlir b/mlir/test/Dialect/Polynomial/ops.mlir
index ea1b279fa1ff..a29cfc2e9cc5 100644
--- a/mlir/test/Dialect/Polynomial/ops.mlir
+++ b/mlir/test/Dialect/Polynomial/ops.mlir
@@ -10,9 +10,13 @@
 #one_plus_x_squared = #polynomial.polynomial<1 + x**2>
 
 #ideal = #polynomial.polynomial<-1 + x**1024>
-#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=18, polynomialModulus=#ideal>
+#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=256, polynomialModulus=#ideal, primitiveRoot=193>
 !poly_ty = !polynomial.polynomial<#ring>
 
+#ntt_poly = #polynomial.polynomial<-1 + x**8>
+#ntt_ring = #polynomial.ring<coefficientType=i32, coefficientModulus=256, polynomialModulus=#ntt_poly, primitiveRoot=31>
+!ntt_poly_ty = !polynomial.polynomial<#ntt_ring>
+
 module {
   func.func @test_multiply() -> !polynomial.polynomial<#ring1> {
     %c0 = arith.constant 0 : index
@@ -79,4 +83,14 @@ module {
     %1 = polynomial.constant <1 + x**2> : !polynomial.polynomial<#ring1>
     return
   }
+
+  func.func @test_ntt(%0 : !ntt_poly_ty) {
+    %1 = polynomial.ntt %0 : !ntt_poly_ty -> tensor<8xi32, #ntt_ring>
+    return
+  }
+
+  func.func @test_intt(%0 : tensor<8xi32, #ntt_ring>) {
+    %1 = polynomial.intt %0 : tensor<8xi32, #ntt_ring> -> !ntt_poly_ty
+    return
+  }
 }
diff --git a/mlir/test/Dialect/Polynomial/ops_errors.mlir b/mlir/test/Dialect/Polynomial/ops_errors.mlir
index c34a7de30e5f..2c20e7bcbf1d 100644
--- a/mlir/test/Dialect/Polynomial/ops_errors.mlir
+++ b/mlir/test/Dialect/Polynomial/ops_errors.mlir
@@ -51,3 +51,90 @@ func.func @test_mul_scalar_wrong_type(%arg0: !ty) -> !ty {
   %poly = polynomial.mul_scalar %arg0, %scalar : !ty, i32
   return %poly : !ty
 }
+
+// -----
+
+#my_poly = #polynomial.polynomial<-1 + x**1024>
+#ring = #polynomial.ring<coefficientType=i16, coefficientModulus=256, polynomialModulus=#my_poly, primitiveRoot=31>
+!poly_ty = !polynomial.polynomial<#ring>
+
+// CHECK-NOT: @test_invalid_ntt
+// CHECK-NOT: polynomial.ntt
+func.func @test_invalid_ntt(%0 : !poly_ty) {
+  // expected-error@below {{expects a ring encoding to be provided to the tensor}}
+  %1 = polynomial.ntt %0 : !poly_ty -> tensor<1024xi32>
+  return
+}
+
+// -----
+
+#my_poly = #polynomial.polynomial<-1 + x**1024>
+#ring = #polynomial.ring<coefficientType=i16, coefficientModulus=256, polynomialModulus=#my_poly, primitiveRoot=31>
+!poly_ty = !polynomial.polynomial<#ring>
+
+// CHECK-NOT: @test_invalid_ntt
+// CHECK-NOT: polynomial.ntt
+func.func @test_invalid_ntt(%0 : !poly_ty) {
+  // expected-error@below {{tensor encoding is not a ring attribute}}
+  %1 = polynomial.ntt %0 : !poly_ty -> tensor<1024xi32, #my_poly>
+  return
+}
+
+// -----
+
+#my_poly = #polynomial.polynomial<-1 + x**1024>
+#ring = #polynomial.ring<coefficientType=i16, coefficientModulus=256, polynomialModulus=#my_poly>
+#ring1 = #polynomial.ring<coefficientType=i16, coefficientModulus=257, polynomialModulus=#my_poly, primitiveRoot=31>
+!poly_ty = !polynomial.polynomial<#ring>
+
+// CHECK-NOT: @test_invalid_intt
+// CHECK-NOT: polynomial.intt
+func.func @test_invalid_intt(%0 : tensor<1024xi32, #ring1>) {
+  // expected-error@below {{not equivalent to the polynomial ring}}
+  %1 = polynomial.intt %0 : tensor<1024xi32, #ring1> -> !poly_ty
+  return
+}
+
+// -----
+
+#my_poly = #polynomial.polynomial<-1 + x**1024>
+#ring = #polynomial.ring<coefficientType=i16, coefficientModulus=256, polynomialModulus=#my_poly, primitiveRoot=31>
+!poly_ty = !polynomial.polynomial<#ring>
+
+// CHECK-NOT: @test_invalid_intt
+// CHECK-NOT: polynomial.intt
+func.func @test_invalid_intt(%0 : tensor<1025xi32, #ring>) {
+  // expected-error@below {{does not match output type}}
+  // expected-note@below {{exactly the degree of the polynomialModulus of the polynomial type's ring attribute}}
+  %1 = polynomial.intt %0 : tensor<1025xi32, #ring> -> !poly_ty
+  return
+}
+
+// -----
+
+#my_poly = #polynomial.polynomial<-1 + x**1024>
+#ring = #polynomial.ring<coefficientType=i16, coefficientModulus=256, polynomialModulus=#my_poly>
+!poly_ty = !polynomial.polynomial<#ring>
+
+// CHECK-NOT: @test_invalid_ntt
+// CHECK-NOT: polynomial.ntt
+func.func @test_invalid_ntt(%0 : !poly_ty) {
+  // expected-error@below {{does not provide a primitive root of unity, which is required to express an NTT}}
+  %1 = polynomial.ntt %0 : !poly_ty -> tensor<1024xi32, #ring>
+  return
+}
+
+// -----
+
+#my_poly = #polynomial.polynomial<-1 + x**8>
+// A valid root is 31
+#ring = #polynomial.ring<coefficientType=i16, coefficientModulus=256, polynomialModulus=#my_poly, primitiveRoot=32>
+!poly_ty = !polynomial.polynomial<#ring>
+
+// CHECK-NOT: @test_invalid_intt
+// CHECK-NOT: polynomial.intt
+func.func @test_invalid_intt(%0 : tensor<8xi32, #ring>) {
+  // expected-error@below {{has a primitiveRoot attribute '32 : i16' that is not a primitive root of the coefficient ring}}
+  %1 = polynomial.intt %0 : tensor<8xi32, #ring> -> !poly_ty
+  return
+}
diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir
index b4c9ed4db94e..459ccd73cfe6 100644
--- a/mlir/test/Dialect/SCF/canonicalize.mlir
+++ b/mlir/test/Dialect/SCF/canonicalize.mlir
@@ -1735,6 +1735,87 @@ func.func @do_not_fold_tensor_cast_from_dynamic_to_static_type_into_forall(
 
 // -----
 
+#map = affine_map<()[s0, s1] -> (s0 ceildiv s1)>
+#map1 = affine_map<(d0)[s0] -> (d0 * s0)>
+#map2 = affine_map<(d0)[s0, s1] -> (-(d0 * s1) + s0, s1)>
+module {
+  func.func @fold_iter_args_not_being_modified_within_scfforall(%arg0: index, %arg1: tensor<?xf32>, %arg2: tensor<?xf32>) -> (tensor<?xf32>, tensor<?xf32>) {
+    %c0 = arith.constant 0 : index
+    %cst = arith.constant 4.200000e+01 : f32
+    %0 = linalg.fill ins(%cst : f32) outs(%arg1 : tensor<?xf32>) -> tensor<?xf32>
+    %dim = tensor.dim %arg1, %c0 : tensor<?xf32>
+    %1 = affine.apply #map()[%dim, %arg0]
+    %2:2 = scf.forall (%arg3) in (%1) shared_outs(%arg4 = %arg1, %arg5 = %arg2) -> (tensor<?xf32>, tensor<?xf32>) {
+      %3 = affine.apply #map1(%arg3)[%arg0]
+      %4 = affine.min #map2(%arg3)[%dim, %arg0]
+      %extracted_slice0 = tensor.extract_slice %arg4[%3] [%4] [1] : tensor<?xf32> to tensor<?xf32>
+      %extracted_slice1 = tensor.extract_slice %arg5[%3] [%4] [1] : tensor<?xf32> to tensor<?xf32>
+      %5 = linalg.elemwise_unary ins(%extracted_slice0 : tensor<?xf32>) outs(%extracted_slice1 : tensor<?xf32>) -> tensor<?xf32>
+      scf.forall.in_parallel {
+        tensor.parallel_insert_slice %5 into %arg5[%3] [%4] [1] : tensor<?xf32> into tensor<?xf32>
+      }
+    }
+    return %2#0, %2#1 : tensor<?xf32>, tensor<?xf32>
+  }
+}
+// CHECK-LABEL: @fold_iter_args_not_being_modified_within_scfforall
+//  CHECK-SAME:   (%{{.*}}: index, %[[ARG1:.*]]: tensor<?xf32>, %[[ARG2:.*]]: tensor<?xf32>) -> (tensor<?xf32>, tensor<?xf32>) {
+//       CHECK:    %[[RESULT:.*]] = scf.forall 
+//  CHECK-SAME:                       shared_outs(%[[ITER_ARG_5:.*]] = %[[ARG2]]) -> (tensor<?xf32>) {
+//       CHECK:      %[[OPERAND0:.*]] = tensor.extract_slice %[[ARG1]]
+//       CHECK:      %[[OPERAND1:.*]] = tensor.extract_slice %[[ITER_ARG_5]]
+//       CHECK:      %[[ELEM:.*]] = linalg.elemwise_unary ins(%[[OPERAND0]] : tensor<?xf32>) outs(%[[OPERAND1]] : tensor<?xf32>) -> tensor<?xf32>
+//       CHECK:      scf.forall.in_parallel {
+//  CHECK-NEXT:         tensor.parallel_insert_slice %[[ELEM]] into %[[ITER_ARG_5]]
+//  CHECK-NEXT:      }
+//  CHECK-NEXT:    }
+//  CHECK-NEXT:    return %[[ARG1]], %[[RESULT]]
+
+// -----
+
+#map = affine_map<()[s0, s1] -> (s0 ceildiv s1)>
+#map1 = affine_map<(d0)[s0] -> (d0 * s0)>
+#map2 = affine_map<(d0)[s0, s1] -> (-(d0 * s1) + s0, s1)>
+module {
+  func.func @fold_iter_args_with_no_use_of_result_scfforall(%arg0: index, %arg1: tensor<?xf32>, %arg2: tensor<?xf32>, %arg3: tensor<?xf32>) -> tensor<?xf32> {
+    %cst = arith.constant 4.200000e+01 : f32
+    %c0 = arith.constant 0 : index
+    %0 = linalg.fill ins(%cst : f32) outs(%arg1 : tensor<?xf32>) -> tensor<?xf32>
+    %dim = tensor.dim %arg1, %c0 : tensor<?xf32>
+    %1 = affine.apply #map()[%dim, %arg0]
+    %2:3 = scf.forall (%arg4) in (%1) shared_outs(%arg5 = %arg1, %arg6 = %arg2, %arg7 = %arg3) -> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) {
+      %3 = affine.apply #map1(%arg4)[%arg0]
+      %4 = affine.min #map2(%arg4)[%dim, %arg0]
+      %extracted_slice = tensor.extract_slice %arg5[%3] [%4] [1] : tensor<?xf32> to tensor<?xf32>
+      %extracted_slice_0 = tensor.extract_slice %arg6[%3] [%4] [1] : tensor<?xf32> to tensor<?xf32>
+      %extracted_slice_1 = tensor.extract_slice %arg7[%3] [%4] [1] : tensor<?xf32> to tensor<?xf32>
+      %extracted_slice_2 = tensor.extract_slice %0[%3] [%4] [1] : tensor<?xf32> to tensor<?xf32>
+      %5 = linalg.elemwise_unary ins(%extracted_slice : tensor<?xf32>) outs(%extracted_slice_1 : tensor<?xf32>) -> tensor<?xf32>
+      scf.forall.in_parallel {
+        tensor.parallel_insert_slice %5 into %arg6[%3] [%4] [1] : tensor<?xf32> into tensor<?xf32>
+        tensor.parallel_insert_slice %extracted_slice into %arg5[%3] [%4] [1] : tensor<?xf32> into tensor<?xf32>
+        tensor.parallel_insert_slice %extracted_slice_0 into %arg7[%3] [%4] [1] : tensor<?xf32> into tensor<?xf32>
+        tensor.parallel_insert_slice %5 into %arg7[%4] [%3] [1] : tensor<?xf32> into tensor<?xf32>
+      }
+    }
+    return %2#1 : tensor<?xf32>
+  }
+}
+// CHECK-LABEL: @fold_iter_args_with_no_use_of_result_scfforall
+//  CHECK-SAME:   (%{{.*}}: index, %[[ARG1:.*]]: tensor<?xf32>, %[[ARG2:.*]]: tensor<?xf32>, %[[ARG3:.*]]: tensor<?xf32>) -> tensor<?xf32> {
+//       CHECK:    %[[RESULT:.*]] = scf.forall 
+//  CHECK-SAME:                       shared_outs(%[[ITER_ARG_6:.*]] = %[[ARG2]]) -> (tensor<?xf32>) {
+//       CHECK:      %[[OPERAND0:.*]] = tensor.extract_slice %[[ARG1]]
+//       CHECK:      %[[OPERAND1:.*]] = tensor.extract_slice %[[ARG3]]
+//       CHECK:      %[[ELEM:.*]] = linalg.elemwise_unary ins(%[[OPERAND0]] : tensor<?xf32>) outs(%[[OPERAND1]] : tensor<?xf32>) -> tensor<?xf32>
+//       CHECK:      scf.forall.in_parallel {
+//  CHECK-NEXT:         tensor.parallel_insert_slice %[[ELEM]] into %[[ITER_ARG_6]]
+//  CHECK-NEXT:      }
+//  CHECK-NEXT:    }
+//  CHECK-NEXT:    return %[[RESULT]]
+
+// -----
+
 func.func @index_switch_fold() -> (f32, f32) {
   %switch_cst = arith.constant 1: index
   %0 = scf.index_switch %switch_cst -> f32
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
index 485fdd9b0e59..bb9f7dfdba83 100644
--- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
@@ -499,7 +499,8 @@ func.func @parallel_insert_slice_no_conflict(
         tensor.parallel_insert_slice %8 into %o[5] [%idx] [%c1] :
           tensor<?xf32> into tensor<?xf32>
       }
-  }
+  } {keep_this_attribute}
+  // CHECK: keep_this_attribute
 
   // CHECK: %[[load:.*]] = memref.load %[[arg2]]
   %f = tensor.extract %2[%c0] : tensor<?xf32>
diff --git a/mlir/test/Dialect/SCF/transform-op-coalesce.mlir b/mlir/test/Dialect/SCF/transform-op-coalesce.mlir
index 4dc3e4ea0ef4..6fcd727621ba 100644
--- a/mlir/test/Dialect/SCF/transform-op-coalesce.mlir
+++ b/mlir/test/Dialect/SCF/transform-op-coalesce.mlir
@@ -299,3 +299,80 @@ module attributes {transform.with_named_sequence} {
 //  CHECK-NOT:       scf.for
 //      CHECK:   transform.named_sequence
 
+// -----
+
+// Check avoiding generating unnecessary operations while collapsing trip-1 loops.
+func.func @trip_one_loops(%arg0 : tensor<?x?xf32>, %arg1 : index, %arg2 : index) -> tensor<?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = scf.for %iv0 = %c0 to %c1 step %c1 iter_args(%iter0 = %arg0) -> tensor<?x?xf32> {
+    %1 = scf.for %iv1 = %c0 to %c1 step %c1 iter_args(%iter1 = %iter0) -> tensor<?x?xf32> {
+      %2 = scf.for %iv2 = %c0 to %arg1 step %c1 iter_args(%iter2 = %iter1) -> tensor<?x?xf32> {
+        %3 = scf.for %iv3 = %c0 to %c1 step %c1 iter_args(%iter3 = %iter2) -> tensor<?x?xf32> {
+          %4 = scf.for %iv4 = %c0 to %arg2 step %c1 iter_args(%iter4 = %iter3) -> tensor<?x?xf32> {
+            %5 = "some_use"(%iter4, %iv0, %iv1, %iv2, %iv3, %iv4)
+              : (tensor<?x?xf32>, index, index, index, index, index) -> (tensor<?x?xf32>)
+            scf.yield %5 : tensor<?x?xf32>
+          }
+          scf.yield %4 : tensor<?x?xf32>
+        }
+        scf.yield %3 : tensor<?x?xf32>
+      }
+      scf.yield %2 : tensor<?x?xf32>
+    }
+    scf.yield %1 : tensor<?x?xf32>
+  } {coalesce}
+  return %0 : tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for">
+    %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">)
+    transform.yield
+  }
+}
+// CHECK-LABEL: func @trip_one_loops
+//  CHECK-SAME:     , %[[ARG1:.+]]: index,
+//  CHECK-SAME:     %[[ARG2:.+]]: index)
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//       CHECK:   %[[UB:.+]] = arith.muli %[[ARG1]], %[[ARG2]]
+//       CHECK:   scf.for %[[IV:.+]] = %[[C0]] to %[[UB]] step %[[C1]]
+//       CHECK:     %[[IV1:.+]] = arith.remsi %[[IV]], %[[ARG2]]
+//       CHECK:     %[[IV2:.+]] = arith.divsi %[[IV]], %[[ARG2]]
+//       CHECK:     "some_use"(%{{[a-zA-Z0-9]+}}, %[[C0]], %[[C0]], %[[IV2]], %[[C0]], %[[IV1]])
+
+// -----
+
+// Check generating no instructions when all except one loops is non unit-trip.
+func.func @all_outer_trip_one(%arg0 : tensor<?x?xf32>, %arg1 : index) -> tensor<?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = scf.for %iv0 = %c0 to %c1 step %c1 iter_args(%iter0 = %arg0) -> tensor<?x?xf32> {
+    %1 = scf.for %iv1 = %c0 to %c1 step %c1 iter_args(%iter1 = %iter0) -> tensor<?x?xf32> {
+      %2 = scf.for %iv2 = %c0 to %arg1 step %c1 iter_args(%iter2 = %iter1) -> tensor<?x?xf32> {
+        %3 = "some_use"(%iter2, %iv0, %iv1, %iv2)
+          : (tensor<?x?xf32>, index, index, index) -> (tensor<?x?xf32>)
+        scf.yield %3 : tensor<?x?xf32>
+      }
+      scf.yield %2 : tensor<?x?xf32>
+    }
+    scf.yield %1 : tensor<?x?xf32>
+  } {coalesce}
+  return %0 : tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for">
+    %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">)
+    transform.yield
+  }
+}
+// CHECK-LABEL: func @all_outer_trip_one
+//  CHECK-SAME:     , %[[ARG1:.+]]: index)
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//       CHECK:   scf.for %[[IV:.+]] = %[[C0]] to %[[ARG1]] step %[[C1]]
+//       CHECK:     "some_use"(%{{[a-zA-Z0-9]+}}, %[[C0]], %[[C0]], %[[IV]])
diff --git a/mlir/test/Dialect/Shape/arg_with_shape.mlir b/mlir/test/Dialect/Shape/arg_with_shape.mlir
index 089c5031fa55..fd46461417db 100644
--- a/mlir/test/Dialect/Shape/arg_with_shape.mlir
+++ b/mlir/test/Dialect/Shape/arg_with_shape.mlir
@@ -1,16 +1,16 @@
-// RUN: mlir-opt -outline-shape-computation -split-input-file %s 2>%t | FileCheck %s
-
-func.func @func1(%arg0: !shape.value_shape, %arg1: !shape.value_shape) -> !shape.shape {
-  %0 = shape.shape_of %arg0 : !shape.value_shape -> !shape.shape
-  %1 = shape.shape_of %arg1 : !shape.value_shape -> !shape.shape
-  %2 = shape.meet %0, %1 : !shape.shape, !shape.shape -> !shape.shape
-  return %2 : !shape.shape
-}
-// Make sure with_shape used by call not crash.
-// CHECK-LABEL:func.func @func
-func.func @func(%arg0: !shape.value_shape, %arg1: !shape.value_shape) -> !shape.shape {
-  %0 = shape.shape_of %arg0 : !shape.value_shape -> !shape.shape
-  %1 = shape.with_shape %arg1, %0 : !shape.value_shape, !shape.shape
-  %2 = call @func1(%arg0, %1) : (!shape.value_shape, !shape.value_shape) -> !shape.shape
-  return %2 : !shape.shape
-}
+// RUN: mlir-opt -outline-shape-computation -split-input-file %s 2>%t | FileCheck %s
+
+func.func @func1(%arg0: !shape.value_shape, %arg1: !shape.value_shape) -> !shape.shape {
+  %0 = shape.shape_of %arg0 : !shape.value_shape -> !shape.shape
+  %1 = shape.shape_of %arg1 : !shape.value_shape -> !shape.shape
+  %2 = shape.meet %0, %1 : !shape.shape, !shape.shape -> !shape.shape
+  return %2 : !shape.shape
+}
+// Make sure with_shape used by call not crash.
+// CHECK-LABEL:func.func @func
+func.func @func(%arg0: !shape.value_shape, %arg1: !shape.value_shape) -> !shape.shape {
+  %0 = shape.shape_of %arg0 : !shape.value_shape -> !shape.shape
+  %1 = shape.with_shape %arg1, %0 : !shape.value_shape, !shape.shape
+  %2 = call @func1(%arg0, %1) : (!shape.value_shape, !shape.value_shape) -> !shape.shape
+  return %2 : !shape.shape
+}
diff --git a/mlir/test/Dialect/SparseTensor/binary_valued.mlir b/mlir/test/Dialect/SparseTensor/binary_valued.mlir
new file mode 100755
index 000000000000..dd9b60a6488b
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/binary_valued.mlir
@@ -0,0 +1,150 @@
+// RUN: mlir-opt %s --linalg-fuse-elementwise-ops \
+// RUN:             --sparsification-and-bufferization | FileCheck %s
+
+#Sparse = #sparse_tensor.encoding<{
+  map = (d0, d1, d2) -> (d0 : dense, d1 : dense, d2 : compressed),
+  explicitVal = 1.0 : f32
+}>
+
+#trait3p = {
+  indexing_maps = [
+    affine_map<(i,j,k) -> (i,j,k)>,  // A
+    affine_map<(i,j,k) -> (i,j,k)>,  // B
+    affine_map<(i,j,k) -> (i,j,k)>   // X (out)
+  ],
+  iterator_types = ["parallel", "parallel", "parallel"]
+}
+
+#trait3r = {
+  indexing_maps = [
+    affine_map<(i,j,k) -> (i,j,k)>,  // A
+    affine_map<(i,j,k) -> ()>        // X (out)
+  ],
+  iterator_types = ["reduction", "reduction", "reduction"]
+}
+
+//
+// Make sure X += A * A => X += 1 in single loop.
+//
+// CHECK-LABEL:   func.func @sum_squares(
+// CHECK-SAME:      %[[VAL_0:.*0]]: memref<?xindex>,
+// CHECK-SAME:      %[[VAL_1:.*1]]: memref<?xindex>,
+// CHECK-SAME:      %[[VAL_2:.*2]]: memref<?xf32>,
+// CHECK-SAME:      %[[VAL_3:.*]]: !sparse_tensor.storage_specifier<#{{.*}}>) -> memref<f32> {
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 3 : index
+// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 2 : index
+// CHECK-DAG:       %[[VAL_9:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[VAL_10:.*]] = memref.alloc() {alignment = 64 : i64} : memref<f32>
+// CHECK:           linalg.fill ins(%[[VAL_9]] : f32) outs(%[[VAL_10]] : memref<f32>)
+// CHECK:           %[[VAL_11:.*]] = sparse_tensor.storage_specifier.get %[[VAL_3]]
+// CHECK:           %[[VAL_12:.*]] = memref.subview %[[VAL_0]][0] {{\[}}%[[VAL_11]]] [1] : memref<?xindex> to memref<?xindex>
+// CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_10]][] : memref<f32>
+// CHECK:           %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_6]] to %[[VAL_8]] step %[[VAL_5]] iter_args(%[[VAL_16:.*]] = %[[VAL_13]]) -> (f32) {
+// CHECK:             %[[VAL_17:.*]] = arith.muli %[[VAL_15]], %[[VAL_7]] : index
+// CHECK:             %[[VAL_18:.*]] = scf.for %[[VAL_19:.*]] = %[[VAL_6]] to %[[VAL_7]] step %[[VAL_5]] iter_args(%[[VAL_20:.*]] = %[[VAL_16]]) -> (f32) {
+// CHECK:               %[[VAL_21:.*]] = arith.addi %[[VAL_19]], %[[VAL_17]] : index
+// CHECK:               %[[VAL_22:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_21]]] : memref<?xindex>
+// CHECK:               %[[VAL_23:.*]] = arith.addi %[[VAL_21]], %[[VAL_5]] : index
+// CHECK:               %[[VAL_24:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_23]]] : memref<?xindex>
+// CHECK:               %[[VAL_25:.*]] = scf.for %[[VAL_26:.*]] = %[[VAL_22]] to %[[VAL_24]] step %[[VAL_5]] iter_args(%[[VAL_27:.*]] = %[[VAL_20]]) -> (f32) {
+// CHECK:                 %[[VAL_28:.*]] = arith.addf %[[VAL_27]], %[[VAL_4]] : f32
+// CHECK:                 scf.yield %[[VAL_28]] : f32
+// CHECK:               } {"Emitted from" = "linalg.generic"}
+// CHECK:               scf.yield %[[VAL_25]] : f32
+// CHECK:             } {"Emitted from" = "linalg.generic"}
+// CHECK:             scf.yield %[[VAL_18]] : f32
+// CHECK:           } {"Emitted from" = "linalg.generic"}
+// CHECK:           memref.store %[[VAL_14]], %[[VAL_10]][] : memref<f32>
+// CHECK:           return %[[VAL_10]] : memref<f32>
+// CHECK:         }
+//
+func.func @sum_squares(%a: tensor<2x3x8xf32, #Sparse>) -> tensor<f32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = tensor.empty() : tensor<2x3x8xf32>
+  %1 = linalg.generic #trait3p
+      ins(%a, %a : tensor<2x3x8xf32, #Sparse>, tensor<2x3x8xf32, #Sparse>)
+      outs(%0 : tensor<2x3x8xf32>) {
+        ^bb0(%in1: f32, %in2: f32, %out: f32):
+          %mul = arith.mulf %in1, %in2 : f32
+          linalg.yield %mul : f32
+      } -> tensor<2x3x8xf32>
+  %2 = tensor.empty() : tensor<f32>
+  %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<f32>) -> tensor<f32>
+  %4 = linalg.generic #trait3r
+      ins(%1 : tensor<2x3x8xf32>)
+      outs(%3 : tensor<f32>) {
+        ^bb0(%in: f32, %out: f32):
+          %add = arith.addf %in, %out : f32
+          linalg.yield %add : f32
+      } -> tensor<f32>
+
+  return %4 : tensor<f32>
+}
+
+//
+// Make sure X += A * B => X += B in single loop.
+//
+// CHECK-LABEL:   func.func @sum_products(
+// CHECK-SAME:      %[[VAL_0:.*0]]: memref<?xindex>,
+// CHECK-SAME:      %[[VAL_1:.*1]]: memref<?xindex>,
+// CHECK-SAME:      %[[VAL_2:.*2]]: memref<?xf32>,
+// CHECK-SAME:      %[[VAL_3:.*3]]: !sparse_tensor.storage_specifier<#{{.*}}>,
+// CHECK-SAME:      %[[VAL_4:.*4]]: memref<2x3x8xf32>) -> memref<f32> {
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 3 : index
+// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 2 : index
+// CHECK-DAG:       %[[VAL_9:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[VAL_10:.*]] = memref.alloc() {alignment = 64 : i64} : memref<f32>
+// CHECK:           linalg.fill ins(%[[VAL_9]] : f32) outs(%[[VAL_10]] : memref<f32>)
+// CHECK:           %[[VAL_11:.*]] = sparse_tensor.storage_specifier.get %[[VAL_3]]
+// CHECK:           %[[VAL_12:.*]] = memref.subview %[[VAL_0]][0] {{\[}}%[[VAL_11]]] [1] : memref<?xindex> to memref<?xindex>
+// CHECK:           %[[VAL_13:.*]] = sparse_tensor.storage_specifier.get %[[VAL_3]]
+// CHECK:           %[[VAL_14:.*]] = memref.subview %[[VAL_1]][0] {{\[}}%[[VAL_13]]] [1] : memref<?xindex> to memref<?xindex>
+// CHECK:           %[[VAL_15:.*]] = memref.load %[[VAL_10]][] : memref<f32>
+// CHECK:           %[[VAL_16:.*]] = scf.for %[[VAL_17:.*]] = %[[VAL_6]] to %[[VAL_8]] step %[[VAL_5]] iter_args(%[[VAL_18:.*]] = %[[VAL_15]]) -> (f32) {
+// CHECK:             %[[VAL_19:.*]] = arith.muli %[[VAL_17]], %[[VAL_7]] : index
+// CHECK:             %[[VAL_20:.*]] = scf.for %[[VAL_21:.*]] = %[[VAL_6]] to %[[VAL_7]] step %[[VAL_5]] iter_args(%[[VAL_22:.*]] = %[[VAL_18]]) -> (f32) {
+// CHECK:               %[[VAL_23:.*]] = arith.addi %[[VAL_21]], %[[VAL_19]] : index
+// CHECK:               %[[VAL_24:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_23]]] : memref<?xindex>
+// CHECK:               %[[VAL_25:.*]] = arith.addi %[[VAL_23]], %[[VAL_5]] : index
+// CHECK:               %[[VAL_26:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_25]]] : memref<?xindex>
+// CHECK:               %[[VAL_27:.*]] = scf.for %[[VAL_28:.*]] = %[[VAL_24]] to %[[VAL_26]] step %[[VAL_5]] iter_args(%[[VAL_29:.*]] = %[[VAL_22]]) -> (f32) {
+// CHECK:                 %[[VAL_30:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_28]]] : memref<?xindex>
+// CHECK:                 %[[VAL_31:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_17]], %[[VAL_21]], %[[VAL_30]]] : memref<2x3x8xf32>
+// CHECK:                 %[[VAL_32:.*]] = arith.addf %[[VAL_31]], %[[VAL_29]] : f32
+// CHECK:                 scf.yield %[[VAL_32]] : f32
+// CHECK:               } {"Emitted from" = "linalg.generic"}
+// CHECK:               scf.yield %[[VAL_27]] : f32
+// CHECK:             } {"Emitted from" = "linalg.generic"}
+// CHECK:             scf.yield %[[VAL_20]] : f32
+// CHECK:           } {"Emitted from" = "linalg.generic"}
+// CHECK:           memref.store %[[VAL_16]], %[[VAL_10]][] : memref<f32>
+// CHECK:           return %[[VAL_10]] : memref<f32>
+// CHECK:         }
+//
+func.func @sum_products(%a: tensor<2x3x8xf32, #Sparse>, %b: tensor<2x3x8xf32>) -> tensor<f32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = tensor.empty() : tensor<2x3x8xf32>
+  %1 = linalg.generic #trait3p
+      ins(%a, %b : tensor<2x3x8xf32, #Sparse>, tensor<2x3x8xf32>)
+      outs(%0 : tensor<2x3x8xf32>) {
+        ^bb0(%in1: f32, %in2: f32, %out: f32):
+          %mul = arith.mulf %in1, %in2 : f32
+          linalg.yield %mul : f32
+      } -> tensor<2x3x8xf32>
+  %2 = tensor.empty() : tensor<f32>
+  %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<f32>) -> tensor<f32>
+  %4 = linalg.generic #trait3r
+      ins(%1 : tensor<2x3x8xf32>)
+      outs(%3 : tensor<f32>) {
+        ^bb0(%in: f32, %out: f32):
+          %add = arith.addf %in, %out : f32
+          linalg.yield %add : f32
+      } -> tensor<f32>
+
+  return %4 : tensor<f32>
+}
diff --git a/mlir/test/Dialect/SparseTensor/codegen.mlir b/mlir/test/Dialect/SparseTensor/codegen.mlir
index 40bfa1e4e2a5..af78458f1093 100644
--- a/mlir/test/Dialect/SparseTensor/codegen.mlir
+++ b/mlir/test/Dialect/SparseTensor/codegen.mlir
@@ -266,7 +266,9 @@ func.func @sparse_dense_3d_dyn(%arg0: tensor<?x?x?xf64, #Dense3D>) -> index {
 //  CHECK-SAME: %[[A3:.*3]]: memref<?xi64>,
 //  CHECK-SAME: %[[A4:.*4]]: memref<?xf64>,
 //  CHECK-SAME: %[[A5:.*5]]: !sparse_tensor.storage_specifier
-//       CHECK: return %[[A2]] : memref<?xi32>
+//       CHECK: %[[S:.*]] = sparse_tensor.storage_specifier.get %[[A5]] pos_mem_sz at 1
+//       CHECK: %[[V:.*]] = memref.subview %[[A2]][0] [%[[S]]] [1]
+//       CHECK: return %[[V]] : memref<?xi32>
 func.func @sparse_positions_dcsr(%arg0: tensor<?x?xf64, #DCSR>) -> memref<?xi32> {
   %0 = sparse_tensor.positions %arg0 { level = 1 : index } : tensor<?x?xf64, #DCSR> to memref<?xi32>
   return %0 : memref<?xi32>
@@ -279,7 +281,9 @@ func.func @sparse_positions_dcsr(%arg0: tensor<?x?xf64, #DCSR>) -> memref<?xi32>
 //  CHECK-SAME: %[[A3:.*3]]: memref<?xi64>,
 //  CHECK-SAME: %[[A4:.*4]]: memref<?xf64>,
 //  CHECK-SAME: %[[A5:.*5]]: !sparse_tensor.storage_specifier
-//       CHECK: return %[[A3]] : memref<?xi64>
+//       CHECK: %[[S:.*]] = sparse_tensor.storage_specifier.get %[[A5]] crd_mem_sz at 1
+//       CHECK: %[[V:.*]] = memref.subview %[[A3]][0] [%[[S]]] [1]
+//       CHECK: return %[[V]] : memref<?xi64>
 func.func @sparse_indices_dcsr(%arg0: tensor<?x?xf64, #DCSR>) -> memref<?xi64> {
   %0 = sparse_tensor.coordinates %arg0 { level = 1 : index } : tensor<?x?xf64, #DCSR> to memref<?xi64>
   return %0 : memref<?xi64>
@@ -292,7 +296,9 @@ func.func @sparse_indices_dcsr(%arg0: tensor<?x?xf64, #DCSR>) -> memref<?xi64> {
 //  CHECK-SAME: %[[A3:.*3]]: memref<?xi64>,
 //  CHECK-SAME: %[[A4:.*4]]: memref<?xf64>,
 //  CHECK-SAME: %[[A5:.*5]]: !sparse_tensor.storage_specifier
-//       CHECK: return %[[A4]] : memref<?xf64>
+//       CHECK: %[[S:.*]] = sparse_tensor.storage_specifier.get %[[A5]] val_mem_sz
+//       CHECK: %[[V:.*]] = memref.subview %[[A4]][0] [%[[S]]] [1]
+//       CHECK: return %[[V]] : memref<?xf64>
 func.func @sparse_values_dcsr(%arg0: tensor<?x?xf64, #DCSR>) -> memref<?xf64> {
   %0 = sparse_tensor.values %arg0 : tensor<?x?xf64, #DCSR> to memref<?xf64>
   return %0 : memref<?xf64>
@@ -305,13 +311,14 @@ func.func @sparse_values_dcsr(%arg0: tensor<?x?xf64, #DCSR>) -> memref<?xf64> {
 //  CHECK-SAME: %[[A3:.*3]]: memref<?xindex>,
 //  CHECK-SAME: %[[A4:.*4]]: memref<?xf64>,
 //  CHECK-SAME: %[[A5:.*5]]: !sparse_tensor.storage_specifier
-//       CHECK: return %[[A4]] : memref<?xf64>
+//       CHECK: %[[S:.*]] = sparse_tensor.storage_specifier.get %[[A5]] val_mem_sz
+//       CHECK: %[[V:.*]] = memref.subview %[[A4]][0] [%[[S]]] [1]
+//       CHECK: return %[[V]] : memref<?xf64>
 func.func @sparse_values_coo(%arg0: tensor<?x?x?xf64, #ccoo>) -> memref<?xf64> {
   %0 = sparse_tensor.values %arg0 : tensor<?x?x?xf64, #ccoo> to memref<?xf64>
   return %0 : memref<?xf64>
 }
 
-
 // CHECK-LABEL: func.func @sparse_indices_coo(
 //  CHECK-SAME: %[[A0:.*0]]: memref<?xindex>,
 //  CHECK-SAME: %[[A1:.*1]]: memref<?xindex>,
@@ -320,7 +327,7 @@ func.func @sparse_values_coo(%arg0: tensor<?x?x?xf64, #ccoo>) -> memref<?xf64> {
 //  CHECK-SAME: %[[A4:.*4]]: memref<?xf64>,
 //  CHECK-SAME: %[[A5:.*5]]: !sparse_tensor.storage_specifier
 //       CHECK: %[[C2:.*]] = arith.constant 2 : index
-//       CHECK: %[[S0:.*]] = sparse_tensor.storage_specifier.get %[[A5]]  crd_mem_sz at 1
+//       CHECK: %[[S0:.*]] = sparse_tensor.storage_specifier.get %[[A5]] crd_mem_sz at 1
 //       CHECK: %[[S2:.*]] = arith.divui %[[S0]], %[[C2]] : index
 //       CHECK: %[[R1:.*]] = memref.subview %[[A3]][0] {{\[}}%[[S2]]] [2] : memref<?xindex> to memref<?xindex, strided<[2]>>
 //       CHECK: %[[R2:.*]] = memref.cast %[[R1]] : memref<?xindex, strided<[2]>> to memref<?xindex, strided<[?], offset: ?>>
@@ -337,7 +344,9 @@ func.func @sparse_indices_coo(%arg0: tensor<?x?x?xf64, #ccoo>) -> memref<?xindex
 //  CHECK-SAME: %[[A3:.*3]]: memref<?xindex>,
 //  CHECK-SAME: %[[A4:.*4]]: memref<?xf64>,
 //  CHECK-SAME: %[[A5:.*5]]: !sparse_tensor.storage_specifier
-//       CHECK: return %[[A3]] : memref<?xindex>
+//       CHECK: %[[S:.*]] = sparse_tensor.storage_specifier.get %[[A5]] crd_mem_sz at 1
+//       CHECK: %[[V:.*]] = memref.subview %[[A3]][0] [%[[S]]] [1]
+//       CHECK: return %[[V]] : memref<?xindex>
 func.func @sparse_indices_buffer_coo(%arg0: tensor<?x?x?xf64, #ccoo>) -> memref<?xindex> {
   %0 = sparse_tensor.coordinates_buffer  %arg0 : tensor<?x?x?xf64, #ccoo> to memref<?xindex>
   return %0 : memref<?xindex>
diff --git a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir
index 8096c010ac93..a3f72bd3ae97 100644
--- a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir
+++ b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir
@@ -443,3 +443,88 @@ func.func private @NOutOfM(%arg0: tensor<?x?x?xf64, #NOutOfM>) {
 func.func private @NOutOfM(%arg0: tensor<?x?x?xf64, #NOutOfM>) {
   return
 }
+
+// -----
+
+#CSR_ExpType = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  posWidth = 32,
+  crdWidth = 32,
+  explicitVal = 1 : i32,
+  implicitVal = 0.0 : f32
+}>
+
+// expected-error@+1 {{explicit value type mismatch between encoding and tensor element type: 'i32' != 'f32'}}
+func.func private @sparse_csr(tensor<?x?xf32, #CSR_ExpType>)
+
+// -----
+
+#CSR_ImpType = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  posWidth = 32,
+  crdWidth = 32,
+  explicitVal = 1 : i32,
+  implicitVal = 0.0 : f32
+}>
+
+// expected-error@+1 {{implicit value type mismatch between encoding and tensor element type: 'f32' != 'i32'}}
+func.func private @sparse_csr(tensor<?x?xi32, #CSR_ImpType>)
+
+// -----
+
+// expected-error@+1 {{expected a numeric value for explicitVal}}
+#CSR_ExpType = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  posWidth = 32,
+  crdWidth = 32,
+  explicitVal = "str"
+}>
+func.func private @sparse_csr(tensor<?x?xi32, #CSR_ExpType>)
+
+// -----
+
+// expected-error@+1 {{expected a numeric value for implicitVal}}
+#CSR_ImpType = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  posWidth = 32,
+  crdWidth = 32,
+  implicitVal = "str"
+}>
+func.func private @sparse_csr(tensor<?x?xi32, #CSR_ImpType>)
+
+// -----
+
+#CSR_ImpVal = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  posWidth = 32,
+  crdWidth = 32,
+  implicitVal = 1 : i32
+}>
+
+// expected-error@+1 {{implicit value must be zero}}
+func.func private @sparse_csr(tensor<?x?xi32, #CSR_ImpVal>)
+
+// -----
+
+#CSR_ImpVal = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  posWidth = 32,
+  crdWidth = 32,
+  implicitVal = 1.0 : f32
+}>
+
+// expected-error@+1 {{implicit value must be zero}}
+func.func private @sparse_csr(tensor<?x?xf32, #CSR_ImpVal>)
+
+// -----
+
+#CSR_OnlyOnes = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  posWidth = 64,
+  crdWidth = 64,
+  explicitVal = #complex.number<:f32 1.0, 0.0>,
+  implicitVal = #complex.number<:f32 1.0, 0.0>
+}>
+
+// expected-error@+1 {{implicit value must be zero}}
+func.func private @sparse_csr(tensor<?x?xcomplex<f32>, #CSR_OnlyOnes>)
diff --git a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
index 7eeda9a98802..44710cad246c 100644
--- a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
+++ b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
@@ -80,6 +80,21 @@ func.func private @sparse_csr(tensor<?x?xi64, #CSR_OnlyOnes>)
 
 // -----
 
+#CSR_OnlyOnes = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  posWidth = 64,
+  crdWidth = 64,
+  explicitVal = #complex.number<:f32 1.0, 0.0>,
+  implicitVal = #complex.number<:f32 0.0, 0.0>
+}>
+
+// CHECK: #[[$CSR_OnlyOnes:.*]] = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed), posWidth = 64, crdWidth = 64, explicitVal = #complex.number<:f32 1.000000e+00, 0.000000e+00> : complex<f32>, implicitVal = #complex.number<:f32 0.000000e+00, 0.000000e+00> : complex<f32> }>
+// CHECK-LABEL: func private @sparse_csr(
+// CHECK-SAME: tensor<?x?xcomplex<f32>, #[[$CSR_OnlyOnes]]>)
+func.func private @sparse_csr(tensor<?x?xcomplex<f32>, #CSR_OnlyOnes>)
+
+// -----
+
 #BCSR = #sparse_tensor.encoding<{
   map = (d0, d1, d2) -> (d0 : batch, d1: dense, d2 : compressed),
 }>
@@ -141,6 +156,17 @@ func.func private @sparse_coo(tensor<?x?xf32, #COO>)
 
 // -----
 
+#COO_DENSE = #sparse_tensor.encoding<{
+  map = (d0, d1, d2) -> (d0 : compressed(nonunique), d1 : singleton, d2: dense)
+}>
+
+// CHECK-DAG: #[[$COO:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : compressed(nonunique), d1 : singleton, d2 : dense) }>
+// CHECK-LABEL: func private @sparse_coo_trailing_dense(
+// CHECK-SAME: tensor<?x?x1xf32, #[[$COO]]>)
+func.func private @sparse_coo_trailing_dense(tensor<?x?x1xf32, #COO_DENSE>)
+
+// -----
+
 #BCOO = #sparse_tensor.encoding<{
   map = (d0, d1, d2) -> (d0 : dense, d1 : loose_compressed(nonunique), d2 : singleton)
 }>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_fusion.mlir b/mlir/test/Dialect/SparseTensor/sparse_fusion.mlir
index 8780baac199e..2cc64434a1d8 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_fusion.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_fusion.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --linalg-fuse-elementwise-ops | FileCheck %s
+// RUN: mlir-opt %s --linalg-fuse-elementwise-ops --sparse-reinterpret-map --sparsification | FileCheck %s
 
 #SV = #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>
 
@@ -11,22 +11,59 @@
   doc = "B(i) = OP A(i)"
 }
 
-// CHECK-LABEL: func @sparse_fusion
-// CHECK:     linalg.generic
-// CHECK:       arith.addf
-// CHECK:     linalg.generic
-// CHECK:       math.exp
-// CHECK:       arith.maximumf
-// CHECK-NOT: linalg.generic
-// CHECK:     return
+
+// CHECK-LABEL:   func.func @sparse_fusion(
+// CHECK-SAME:      %[[VAL_0:.*]]: tensor<100xf64, #sparse>) -> tensor<100xf64> {
+// CHECK-DAG:       %[[VAL_1:.*]] = arith.constant true
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 100 : index
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 1.000000e+00 : f64
+// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 1.000000e+02 : f64
+// CHECK-DAG:       %[[VAL_8:.*]] = tensor.empty() : tensor<100xf64>
+// CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<100xf64, #sparse> to memref<?xindex>
+// CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<100xf64, #sparse> to memref<?xindex>
+// CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<100xf64, #sparse> to memref<?xf64>
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_8]] : memref<100xf64>
+// CHECK:           linalg.fill ins(%[[VAL_4]] : f64) outs(%[[VAL_12]] : memref<100xf64>)
+// CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK:           %[[VAL_14:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_2]]] : memref<?xindex>
+// CHECK:           %[[VAL_15:.*]]:2 = scf.while (%[[VAL_16:.*]] = %[[VAL_13]], %[[VAL_17:.*]] = %[[VAL_3]]) : (index, index) -> (index, index) {
+// CHECK:             %[[VAL_18:.*]] = arith.cmpi ult, %[[VAL_16]], %[[VAL_14]] : index
+// CHECK:             scf.condition(%[[VAL_18]]) %[[VAL_16]], %[[VAL_17]] : index, index
+// CHECK:           } do {
+// CHECK:           ^bb0(%[[VAL_19:.*]]: index, %[[VAL_20:.*]]: index):
+// CHECK:             %[[VAL_21:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_19]]] : memref<?xindex>
+// CHECK:             %[[VAL_22:.*]] = arith.cmpi eq, %[[VAL_21]], %[[VAL_20]] : index
+// CHECK:             scf.if %[[VAL_22]] {
+// CHECK:               %[[VAL_23:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_19]]] : memref<?xf64>
+// CHECK:               %[[VAL_24:.*]] = arith.addf %[[VAL_23]], %[[VAL_6]] : f64
+// CHECK:               %[[VAL_25:.*]] = math.exp %[[VAL_24]] : f64
+// CHECK:               %[[VAL_26:.*]] = arith.maximumf %[[VAL_25]], %[[VAL_7]] : f64
+// CHECK:               memref.store %[[VAL_26]], %[[VAL_12]]{{\[}}%[[VAL_20]]] : memref<100xf64>
+// CHECK:             } else {
+// CHECK:               scf.if %[[VAL_1]] {
+// CHECK:                 memref.store %[[VAL_7]], %[[VAL_12]]{{\[}}%[[VAL_20]]] : memref<100xf64>
+// CHECK:               } else {
+// CHECK:               }
+// CHECK:             }
+// CHECK:             %[[VAL_27:.*]] = arith.cmpi eq, %[[VAL_21]], %[[VAL_20]] : index
+// CHECK:             %[[VAL_28:.*]] = arith.addi %[[VAL_19]], %[[VAL_2]] : index
+// CHECK:             %[[VAL_29:.*]] = arith.select %[[VAL_27]], %[[VAL_28]], %[[VAL_19]] : index
+// CHECK:             %[[VAL_30:.*]] = arith.addi %[[VAL_20]], %[[VAL_2]] : index
+// CHECK:             scf.yield %[[VAL_29]], %[[VAL_30]] : index, index
+// CHECK:           }
+// CHECK:           scf.for %[[VAL_31:.*]] = %[[VAL_32:.*]]#1 to %[[VAL_5]] step %[[VAL_2]] {
+// CHECK:             memref.store %[[VAL_7]], %[[VAL_12]]{{\[}}%[[VAL_31]]] : memref<100xf64>
+// CHECK:           }
+// CHECK:           %[[VAL_33:.*]] = bufferization.to_tensor %[[VAL_12]] : memref<100xf64>
+// CHECK:           return %[[VAL_33]] : tensor<100xf64>
+// CHECK:         }
 func.func @sparse_fusion(%argA: tensor<100xf64, #SV>) -> tensor<100xf64> {
   %c1 = arith.constant 1.0 : f64
   %c100 = arith.constant 100.0 : f64
 
-  //
-  // Densifying op.
-  // Should not be fused with subsequent dense ops.
-  //
   %t0 = tensor.empty() : tensor<100xf64>
   %l0 = linalg.generic #trait
       ins(%argA: tensor<100xf64, #SV>) outs(%t0: tensor<100xf64>) {
@@ -34,12 +71,6 @@ func.func @sparse_fusion(%argA: tensor<100xf64, #SV>) -> tensor<100xf64> {
       %b0 = arith.addf %in0, %c1 : f64
       linalg.yield %b0 : f64
   } -> tensor<100xf64>
-
-
-  //
-  // Two following dense ops.
-  // Should be fused, but not with above.
-  //
   %t1 = tensor.empty() : tensor<100xf64>
   %l1 = linalg.generic #trait
       ins(%l0: tensor<100xf64>) outs(%t1: tensor<100xf64>) {
diff --git a/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir b/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir
index 5145d6c1dcfc..ad12b637d0c5 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir
@@ -1,5 +1,3 @@
-// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
 // RUN: mlir-opt %s --linalg-generalize-named-ops \
 // RUN:  --sparse-reinterpret-map --sparsification --sparse-tensor-codegen \
 // RUN:  --canonicalize --cse | FileCheck %s
@@ -11,45 +9,6 @@
 //
 // Computes C = A x B with all matrices sparse (SpMSpM) in CSR.
 //
-// CHECK-LABEL:   func.func private @_insert_dense_compressed_4_4_f64_0_0(
-// CHECK-SAME:      %[[VAL_0:.*0]]: memref<?xindex>,
-// CHECK-SAME:      %[[VAL_1:.*1]]: memref<?xindex>,
-// CHECK-SAME:      %[[VAL_2:.*2]]: memref<?xf64>,
-// CHECK-SAME:      %[[VAL_3:.*3]]: !sparse_tensor.storage_specifier
-// CHECK-SAME:      %[[VAL_4:.*4]]: index,
-// CHECK-SAME:      %[[VAL_5:.*5]]: index,
-// CHECK-SAME:      %[[VAL_6:.*6]]: f64) -> (memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
-// CHECK:           %[[VAL_7:.*]] = arith.constant false
-// CHECK:           %[[VAL_8:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_9:.*]] = arith.addi %[[VAL_4]], %[[VAL_8]] : index
-// CHECK:           %[[VAL_10:.*]] = memref.load %[[VAL_0]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK:           %[[VAL_11:.*]] = memref.load %[[VAL_0]]{{\[}}%[[VAL_9]]] : memref<?xindex>
-// CHECK:           %[[VAL_13:.*]] = sparse_tensor.storage_specifier.get %[[VAL_3]]  crd_mem_sz at 1 : !sparse_tensor.storage_specifier
-// CHECK:           %[[VAL_14:.*]] = arith.subi %[[VAL_11]], %[[VAL_8]] : index
-// CHECK:           %[[VAL_15:.*]] = arith.cmpi ult, %[[VAL_10]], %[[VAL_11]] : index
-// CHECK:           %[[VAL_16:.*]] = scf.if %[[VAL_15]] -> (i1) {
-// CHECK:             %[[VAL_17:.*]] = memref.load %[[VAL_1]]{{\[}}%[[VAL_14]]] : memref<?xindex>
-// CHECK:             %[[VAL_18:.*]] = arith.cmpi eq, %[[VAL_17]], %[[VAL_5]] : index
-// CHECK:             scf.yield %[[VAL_18]] : i1
-// CHECK:           } else {
-// CHECK:             memref.store %[[VAL_13]], %[[VAL_0]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK:             scf.yield %[[VAL_7]] : i1
-// CHECK:           }
-// CHECK:           %[[VAL_19:.*]]:2 = scf.if %[[VAL_20:.*]] -> (memref<?xindex>, !sparse_tensor.storage_specifier
-// CHECK:             scf.yield %[[VAL_1]], %[[VAL_3]] : memref<?xindex>, !sparse_tensor.storage_specifier
-// CHECK:           } else {
-// CHECK:             %[[VAL_21:.*]] = arith.addi %[[VAL_13]], %[[VAL_8]] : index
-// CHECK:             memref.store %[[VAL_21]], %[[VAL_0]]{{\[}}%[[VAL_9]]] : memref<?xindex>
-// CHECK:             %[[VAL_22:.*]], %[[VAL_24:.*]] = sparse_tensor.push_back %[[VAL_13]], %[[VAL_1]], %[[VAL_5]] : index, memref<?xindex>, index
-// CHECK:             %[[VAL_25:.*]] = sparse_tensor.storage_specifier.set %[[VAL_3]]  crd_mem_sz at 1 with %[[VAL_24]] : !sparse_tensor.storage_specifier
-// CHECK:             scf.yield %[[VAL_22]], %[[VAL_25]] : memref<?xindex>, !sparse_tensor.storage_specifier
-// CHECK:           }
-// CHECK:           %[[VAL_28:.*]] = sparse_tensor.storage_specifier.get %[[VAL_27:.*]]#1  val_mem_sz : !sparse_tensor.storage_specifier
-// CHECK:           %[[VAL_29:.*]], %[[VAL_30:.*]] = sparse_tensor.push_back %[[VAL_28]], %[[VAL_2]], %[[VAL_6]] : index, memref<?xf64>, f64
-// CHECK:           %[[VAL_32:.*]] = sparse_tensor.storage_specifier.set %[[VAL_27]]#1  val_mem_sz with %[[VAL_30]] : !sparse_tensor.storage_specifier
-// CHECK:           return %[[VAL_0]], %[[VAL_27]]#0, %[[VAL_29]], %[[VAL_32]] : memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
-// CHECK:         }
-
 // CHECK-LABEL:   func.func @matmul(
 // CHECK-SAME:      %[[VAL_0:.*0]]: memref<?xindex>,
 // CHECK-SAME:      %[[VAL_1:.*1]]: memref<?xindex>,
@@ -59,12 +18,12 @@
 // CHECK-SAME:      %[[VAL_5:.*5]]: memref<?xindex>,
 // CHECK-SAME:      %[[VAL_6:.*6]]: memref<?xf64>,
 // CHECK-SAME:      %[[VAL_7:.*7]]: !sparse_tensor.storage_specifier
-// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 4 : index
-// CHECK-DAG:       %[[VAL_9:.*]] = arith.constant 0.000000e+00 : f64
-// CHECK-DAG:       %[[VAL_10:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG:       %[[VAL_9:.*]] = arith.constant true
+// CHECK-DAG:       %[[VAL_10:.*]] = arith.constant false
 // CHECK-DAG:       %[[VAL_11:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[VAL_12:.*]] = arith.constant false
-// CHECK-DAG:       %[[VAL_13:.*]] = arith.constant true
+// CHECK-DAG:       %[[VAL_12:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_13:.*]] = arith.constant 4 : index
 // CHECK:           %[[VAL_14:.*]] = memref.alloc() : memref<16xindex>
 // CHECK:           %[[VAL_15:.*]] = memref.cast %[[VAL_14]] : memref<16xindex> to memref<?xindex>
 // CHECK:           %[[VAL_16:.*]] = memref.alloc() : memref<16xindex>
@@ -72,76 +31,89 @@
 // CHECK:           %[[VAL_18:.*]] = memref.alloc() : memref<16xf64>
 // CHECK:           %[[VAL_19:.*]] = memref.cast %[[VAL_18]] : memref<16xf64> to memref<?xf64>
 // CHECK:           %[[VAL_20:.*]] = sparse_tensor.storage_specifier.init : !sparse_tensor.storage_specifier
-// CHECK:           %[[VAL_21:.*]] = sparse_tensor.storage_specifier.set %[[VAL_20]]  lvl_sz at 0 with %[[VAL_8]] : !sparse_tensor.storage_specifier
-// CHECK:           %[[VAL_22:.*]] = sparse_tensor.storage_specifier.set %[[VAL_21]]  lvl_sz at 1 with %[[VAL_8]] : !sparse_tensor.storage_specifier
+// CHECK:           %[[VAL_21:.*]] = sparse_tensor.storage_specifier.set %[[VAL_20]]  lvl_sz at 0 with %[[VAL_13]] : !sparse_tensor.storage_specifier
+// CHECK:           %[[VAL_22:.*]] = sparse_tensor.storage_specifier.set %[[VAL_21]]  lvl_sz at 1 with %[[VAL_13]] : !sparse_tensor.storage_specifier
 // CHECK:           %[[VAL_23:.*]] = sparse_tensor.storage_specifier.get %[[VAL_22]]  pos_mem_sz at 1 : !sparse_tensor.storage_specifier
-// CHECK:           %[[VAL_24:.*]], %[[VAL_25:.*]] = sparse_tensor.push_back %[[VAL_23]], %[[VAL_15]], %[[VAL_10]] : index, memref<?xindex>, index
+// CHECK:           %[[VAL_24:.*]], %[[VAL_25:.*]] = sparse_tensor.push_back %[[VAL_23]], %[[VAL_15]], %[[VAL_12]] : index, memref<?xindex>, index
 // CHECK:           %[[VAL_26:.*]] = sparse_tensor.storage_specifier.set %[[VAL_22]]  pos_mem_sz at 1 with %[[VAL_25]] : !sparse_tensor.storage_specifier
-// CHECK:           %[[VAL_27:.*]], %[[VAL_28:.*]] = sparse_tensor.push_back %[[VAL_25]], %[[VAL_24]], %[[VAL_10]], %[[VAL_8]] : index, memref<?xindex>, index, index
+// CHECK:           %[[VAL_27:.*]], %[[VAL_28:.*]] = sparse_tensor.push_back %[[VAL_25]], %[[VAL_24]], %[[VAL_12]], %[[VAL_13]] : index, memref<?xindex>, index, index
 // CHECK:           %[[VAL_29:.*]] = sparse_tensor.storage_specifier.set %[[VAL_26]]  pos_mem_sz at 1 with %[[VAL_28]] : !sparse_tensor.storage_specifier
 // CHECK:           %[[VAL_30:.*]] = memref.alloc() : memref<4xf64>
-// CHECK:           %[[VAL_31:.*]] = m
+// CHECK:           %[[VAL_31:.*]] = memref.alloc() : memref<4xi1>
 // CHECK:           %[[VAL_32:.*]] = memref.alloc() : memref<4xindex>
 // CHECK:           %[[VAL_33:.*]] = memref.cast %[[VAL_32]] : memref<4xindex> to memref<?xindex>
-// CHECK:           linalg.fill ins(%[[VAL_9]] : f64) outs(%[[VAL_30]] : memref<4xf64>)
-// CHECK:           linalg.fill ins(%[[VAL_12]] : i1) outs(%[[VAL_31]] : memref<4xi1>)
-// CHECK:           %[[VAL_34:.*]]:4 = scf.for %[[VAL_35:.*]] = %[[VAL_10]] to %[[VAL_8]] step %[[VAL_11]] iter_args(%[[VAL_36:.*]] = %[[VAL_27]], %[[VAL_37:.*]] = %[[VAL_17]], %[[VAL_38:.*]] = %[[VAL_19]], %[[VAL_39:.*]] = %[[VAL_29]]) -> (memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
-// CHECK:             %[[VAL_40:.*]] = memref.load %[[VAL_0]]{{\[}}%[[VAL_35]]] : memref<?xindex>
-// CHECK:             %[[VAL_41:.*]] = arith.addi %[[VAL_35]], %[[VAL_11]] : index
-// CHECK:             %[[VAL_42:.*]] = memref.load %[[VAL_0]]{{\[}}%[[VAL_41]]] : memref<?xindex>
-// CHECK:             %[[VAL_43:.*]] = scf.for %[[VAL_44:.*]] = %[[VAL_40]] to %[[VAL_42]] step %[[VAL_11]] iter_args(%[[VAL_45:.*]] = %[[VAL_10]]) -> (index) {
-// CHECK:               %[[VAL_46:.*]] = memref.load %[[VAL_1]]{{\[}}%[[VAL_44]]] : memref<?xindex>
-// CHECK:               %[[VAL_47:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_44]]] : memref<?xf64>
-// CHECK:               %[[VAL_48:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_46]]] : memref<?xindex>
-// CHECK:               %[[VAL_49:.*]] = arith.addi %[[VAL_46]], %[[VAL_11]] : index
-// CHECK:               %[[VAL_50:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_49]]] : memref<?xindex>
-// CHECK:               %[[VAL_51:.*]] = scf.for %[[VAL_52:.*]] = %[[VAL_48]] to %[[VAL_50]] step %[[VAL_11]] iter_args(%[[VAL_53:.*]] = %[[VAL_45]]) -> (index) {
-// CHECK:                 %[[VAL_54:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_52]]] : memref<?xindex>
-// CHECK:                 %[[VAL_55:.*]] = memref.load %[[VAL_30]]{{\[}}%[[VAL_54]]] : memref<4xf64>
-// CHECK:                 %[[VAL_56:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_52]]] : memref<?xf64>
-// CHECK:                 %[[VAL_57:.*]] = arith.mulf %[[VAL_47]], %[[VAL_56]] : f64
-// CHECK:                 %[[VAL_58:.*]] = arith.addf %[[VAL_55]], %[[VAL_57]] : f64
-// CHECK:                 %[[VAL_59:.*]] = memref.load %[[VAL_31]]{{\[}}%[[VAL_54]]] : memref<4xi1>
-// CHECK:                 %[[VAL_60:.*]] = arith.cmpi eq, %[[VAL_59]], %[[VAL_12]] : i1
-// CHECK:                 %[[VAL_61:.*]] = scf.if %[[VAL_60]] -> (index) {
-// CHECK:                   memref.store %[[VAL_13]], %[[VAL_31]]{{\[}}%[[VAL_54]]] : memref<4xi1>
-// CHECK:                   memref.store %[[VAL_54]], %[[VAL_32]]{{\[}}%[[VAL_53]]] : memref<4xindex>
-// CHECK:                   %[[VAL_62:.*]] = arith.addi %[[VAL_53]], %[[VAL_11]] : index
-// CHECK:                   scf.yield %[[VAL_62]] : index
+// CHECK:           linalg.fill ins(%[[VAL_8]] : f64) outs(%[[VAL_30]] : memref<4xf64>)
+// CHECK:           linalg.fill ins(%[[VAL_10]] : i1) outs(%[[VAL_31]] : memref<4xi1>)
+// CHECK:           %[[VAL_34:.*]] = sparse_tensor.storage_specifier.get %[[VAL_3]]  pos_mem_sz at 1 : !sparse_tensor.storage_specifier
+// CHECK:           %[[VAL_35:.*]] = memref.subview %[[VAL_0]][0] {{\[}}%[[VAL_34]]] [1] : memref<?xindex> to memref<?xindex>
+// CHECK:           %[[VAL_36:.*]] = sparse_tensor.storage_specifier.get %[[VAL_3]]  crd_mem_sz at 1 : !sparse_tensor.storage_specifier
+// CHECK:           %[[VAL_37:.*]] = memref.subview %[[VAL_1]][0] {{\[}}%[[VAL_36]]] [1] : memref<?xindex> to memref<?xindex>
+// CHECK:           %[[VAL_38:.*]] = sparse_tensor.storage_specifier.get %[[VAL_3]]  val_mem_sz : !sparse_tensor.storage_specifier
+// CHECK:           %[[VAL_39:.*]] = memref.subview %[[VAL_2]][0] {{\[}}%[[VAL_38]]] [1] : memref<?xf64> to memref<?xf64>
+// CHECK:           %[[VAL_40:.*]] = sparse_tensor.storage_specifier.get %[[VAL_7]]  pos_mem_sz at 1 : !sparse_tensor.storage_specifier
+// CHECK:           %[[VAL_41:.*]] = memref.subview %[[VAL_4]][0] {{\[}}%[[VAL_40]]] [1] : memref<?xindex> to memref<?xindex>
+// CHECK:           %[[VAL_42:.*]] = sparse_tensor.storage_specifier.get %[[VAL_7]]  crd_mem_sz at 1 : !sparse_tensor.storage_specifier
+// CHECK:           %[[VAL_43:.*]] = memref.subview %[[VAL_5]][0] {{\[}}%[[VAL_42]]] [1] : memref<?xindex> to memref<?xindex>
+// CHECK:           %[[VAL_44:.*]] = sparse_tensor.storage_specifier.get %[[VAL_7]]  val_mem_sz : !sparse_tensor.storage_specifier
+// CHECK:           %[[VAL_45:.*]] = memref.subview %[[VAL_6]][0] {{\[}}%[[VAL_44]]] [1] : memref<?xf64> to memref<?xf64>
+// CHECK:           %[[VAL_46:.*]]:4 = scf.for %[[VAL_47:.*]] = %[[VAL_12]] to %[[VAL_13]] step %[[VAL_11]] iter_args(%[[VAL_48:.*]] = %[[VAL_27]], %[[VAL_49:.*]] = %[[VAL_17]], %[[VAL_50:.*]] = %[[VAL_19]], %[[VAL_51:.*]] = %[[VAL_29]]) -> (memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
+// CHECK:             %[[VAL_52:.*]] = memref.load %[[VAL_35]]{{\[}}%[[VAL_47]]] : memref<?xindex>
+// CHECK:             %[[VAL_53:.*]] = arith.addi %[[VAL_47]], %[[VAL_11]] : index
+// CHECK:             %[[VAL_54:.*]] = memref.load %[[VAL_35]]{{\[}}%[[VAL_53]]] : memref<?xindex>
+// CHECK:             %[[VAL_55:.*]] = scf.for %[[VAL_56:.*]] = %[[VAL_52]] to %[[VAL_54]] step %[[VAL_11]] iter_args(%[[VAL_57:.*]] = %[[VAL_12]]) -> (index) {
+// CHECK:               %[[VAL_58:.*]] = memref.load %[[VAL_37]]{{\[}}%[[VAL_56]]] : memref<?xindex>
+// CHECK:               %[[VAL_59:.*]] = memref.load %[[VAL_39]]{{\[}}%[[VAL_56]]] : memref<?xf64>
+// CHECK:               %[[VAL_60:.*]] = memref.load %[[VAL_41]]{{\[}}%[[VAL_58]]] : memref<?xindex>
+// CHECK:               %[[VAL_61:.*]] = arith.addi %[[VAL_58]], %[[VAL_11]] : index
+// CHECK:               %[[VAL_62:.*]] = memref.load %[[VAL_41]]{{\[}}%[[VAL_61]]] : memref<?xindex>
+// CHECK:               %[[VAL_63:.*]] = scf.for %[[VAL_64:.*]] = %[[VAL_60]] to %[[VAL_62]] step %[[VAL_11]] iter_args(%[[VAL_65:.*]] = %[[VAL_57]]) -> (index) {
+// CHECK:                 %[[VAL_66:.*]] = memref.load %[[VAL_43]]{{\[}}%[[VAL_64]]] : memref<?xindex>
+// CHECK:                 %[[VAL_67:.*]] = memref.load %[[VAL_30]]{{\[}}%[[VAL_66]]] : memref<4xf64>
+// CHECK:                 %[[VAL_68:.*]] = memref.load %[[VAL_45]]{{\[}}%[[VAL_64]]] : memref<?xf64>
+// CHECK:                 %[[VAL_69:.*]] = arith.mulf %[[VAL_59]], %[[VAL_68]] : f64
+// CHECK:                 %[[VAL_70:.*]] = arith.addf %[[VAL_67]], %[[VAL_69]] : f64
+// CHECK:                 %[[VAL_71:.*]] = memref.load %[[VAL_31]]{{\[}}%[[VAL_66]]] : memref<4xi1>
+// CHECK:                 %[[VAL_72:.*]] = arith.cmpi eq, %[[VAL_71]], %[[VAL_10]] : i1
+// CHECK:                 %[[VAL_73:.*]] = scf.if %[[VAL_72]] -> (index) {
+// CHECK:                   memref.store %[[VAL_9]], %[[VAL_31]]{{\[}}%[[VAL_66]]] : memref<4xi1>
+// CHECK:                   memref.store %[[VAL_66]], %[[VAL_32]]{{\[}}%[[VAL_65]]] : memref<4xindex>
+// CHECK:                   %[[VAL_74:.*]] = arith.addi %[[VAL_65]], %[[VAL_11]] : index
+// CHECK:                   scf.yield %[[VAL_74]] : index
 // CHECK:                 } else {
-// CHECK:                   scf.yield %[[VAL_53]] : index
+// CHECK:                   scf.yield %[[VAL_65]] : index
 // CHECK:                 }
-// CHECK:                 memref.store %[[VAL_58]], %[[VAL_30]]{{\[}}%[[VAL_54]]] : memref<4xf64>
-// CHECK:                 scf.yield %[[VAL_63:.*]] : index
+// CHECK:                 memref.store %[[VAL_70]], %[[VAL_30]]{{\[}}%[[VAL_66]]] : memref<4xf64>
+// CHECK:                 scf.yield %[[VAL_73]] : index
 // CHECK:               } {"Emitted from" = "linalg.generic"}
-// CHECK:               scf.yield %[[VAL_64:.*]] : index
+// CHECK:               scf.yield %[[VAL_63]] : index
 // CHECK:             } {"Emitted from" = "linalg.generic"}
-// CHECK:             sparse_tensor.sort  hybrid_quick_sort %[[VAL_65:.*]], %[[VAL_33]]
-// CHECK:             %[[VAL_66:.*]]:4 = scf.for %[[VAL_67:.*]] = %[[VAL_10]] to %[[VAL_65]] step %[[VAL_11]] iter_args(%[[VAL_68:.*]] = %[[VAL_36]], %[[VAL_69:.*]] = %[[VAL_37]], %[[VAL_70:.*]] = %[[VAL_38]], %[[VAL_71:.*]] = %[[VAL_39]]) -> (memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
-// CHECK:               %[[VAL_72:.*]] = memref.load %[[VAL_32]]{{\[}}%[[VAL_67]]] : memref<4xindex>
-// CHECK:               %[[VAL_73:.*]] = memref.load %[[VAL_30]]{{\[}}%[[VAL_72]]] : memref<4xf64>
-// CHECK:               %[[VAL_74:.*]]:4 = func.call @_insert_dense_compressed_4_4_f64_0_0(%[[VAL_68]], %[[VAL_69]], %[[VAL_70]], %[[VAL_71]], %[[VAL_35]], %[[VAL_72]], %[[VAL_73]]) : (memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifie
-// CHECK:               memref.store %[[VAL_9]], %[[VAL_30]]{{\[}}%[[VAL_72]]] : memref<4xf64>
-// CHECK:               memref.store %[[VAL_12]], %[[VAL_31]]{{\[}}%[[VAL_72]]] : memref<4xi1>
-// CHECK:               scf.yield %[[VAL_74]]#0, %[[VAL_74]]#1, %[[VAL_74]]#2, %[[VAL_74]]#3 : memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
+// CHECK:             sparse_tensor.sort  hybrid_quick_sort %[[VAL_55]], %[[VAL_33]]
+// CHECK:             %[[VAL_75:.*]]:4 = scf.for %[[VAL_76:.*]] = %[[VAL_12]] to %[[VAL_55]] step %[[VAL_11]] iter_args(%[[VAL_77:.*]] = %[[VAL_48]], %[[VAL_78:.*]] = %[[VAL_49]], %[[VAL_79:.*]] = %[[VAL_50]], %[[VAL_80:.*]] = %[[VAL_51]]) -> (memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
+// CHECK:               %[[VAL_81:.*]] = memref.load %[[VAL_32]]{{\[}}%[[VAL_76]]] : memref<4xindex>
+// CHECK:               %[[VAL_82:.*]] = memref.load %[[VAL_30]]{{\[}}%[[VAL_81]]] : memref<4xf64>
+// CHECK:               %[[VAL_83:.*]]:4 = func.call @_insert_dense_compressed_4_4_f64_0_0(%[[VAL_77]], %[[VAL_78]], %[[VAL_79]], %[[VAL_80]], %[[VAL_47]], %[[VAL_81]], %[[VAL_82]]) : (memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
+// CHECK:               memref.store %[[VAL_8]], %[[VAL_30]]{{\[}}%[[VAL_81]]] : memref<4xf64>
+// CHECK:               memref.store %[[VAL_10]], %[[VAL_31]]{{\[}}%[[VAL_81]]] : memref<4xi1>
+// CHECK:               scf.yield %[[VAL_83]]#0, %[[VAL_83]]#1, %[[VAL_83]]#2, %[[VAL_83]]#3 : memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
 // CHECK:             }
-// CHECK:             scf.yield %[[VAL_75:.*]]#0, %[[VAL_75]]#1, %[[VAL_75]]#2, %[[VAL_75]]#3 : memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
+// CHECK:             scf.yield %[[VAL_84:.*]]#0, %[[VAL_84]]#1, %[[VAL_84]]#2, %[[VAL_84]]#3 : memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
 // CHECK:           } {"Emitted from" = "linalg.generic"}
 // CHECK:           memref.dealloc %[[VAL_30]] : memref<4xf64>
 // CHECK:           memref.dealloc %[[VAL_31]] : memref<4xi1>
 // CHECK:           memref.dealloc %[[VAL_32]] : memref<4xindex>
-// CHECK:           %[[VAL_76:.*]] = sparse_tensor.storage_specifier.get %[[VAL_77:.*]]#3  pos_mem_sz at 1 : !sparse_tensor.storage_specifier
-// CHECK:           %[[VAL_78:.*]] = memref.load %[[VAL_77]]#0{{\[}}%[[VAL_10]]] : memref<?xindex>
-// CHECK:           %[[VAL_79:.*]] = scf.for %[[VAL_80:.*]] = %[[VAL_11]] to %[[VAL_76]] step %[[VAL_11]] iter_args(%[[VAL_81:.*]] = %[[VAL_78]]) -> (index) {
-// CHECK:             %[[VAL_82:.*]] = memref.load %[[VAL_77]]#0{{\[}}%[[VAL_80]]] : memref<?xindex>
-// CHECK:             %[[VAL_83:.*]] = arith.cmpi eq, %[[VAL_82]], %[[VAL_10]] : index
-// CHECK:             %[[VAL_84:.*]] = arith.select %[[VAL_83]], %[[VAL_81]], %[[VAL_82]] : index
-// CHECK:             scf.if %[[VAL_83]] {
-// CHECK:               memref.store %[[VAL_81]], %[[VAL_77]]#0{{\[}}%[[VAL_80]]] : memref<?xindex>
+// CHECK:           %[[VAL_85:.*]] = sparse_tensor.storage_specifier.get %[[VAL_86:.*]]#3  pos_mem_sz at 1 : !sparse_tensor.storage_specifier
+// CHECK:           %[[VAL_87:.*]] = memref.load %[[VAL_86]]#0{{\[}}%[[VAL_12]]] : memref<?xindex>
+// CHECK:           %[[VAL_88:.*]] = scf.for %[[VAL_89:.*]] = %[[VAL_11]] to %[[VAL_85]] step %[[VAL_11]] iter_args(%[[VAL_90:.*]] = %[[VAL_87]]) -> (index) {
+// CHECK:             %[[VAL_91:.*]] = memref.load %[[VAL_86]]#0{{\[}}%[[VAL_89]]] : memref<?xindex>
+// CHECK:             %[[VAL_92:.*]] = arith.cmpi eq, %[[VAL_91]], %[[VAL_12]] : index
+// CHECK:             %[[VAL_93:.*]] = arith.select %[[VAL_92]], %[[VAL_90]], %[[VAL_91]] : index
+// CHECK:             scf.if %[[VAL_92]] {
+// CHECK:               memref.store %[[VAL_90]], %[[VAL_86]]#0{{\[}}%[[VAL_89]]] : memref<?xindex>
 // CHECK:             }
-// CHECK:             scf.yield %[[VAL_84]] : index
+// CHECK:             scf.yield %[[VAL_93]] : index
 // CHECK:           }
-// CHECK:           return %[[VAL_77]]#0, %[[VAL_77]]#1, %[[VAL_77]]#2, %[[VAL_77]]#3 : memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
+// CHECK:           return %[[VAL_86]]#0, %[[VAL_86]]#1, %[[VAL_86]]#2, %[[VAL_86]]#3 : memref<?xindex>, memref<?xindex>, memref<?xf64>, !sparse_tensor.storage_specifier
+// CHECK:         }
 func.func @matmul(%A: tensor<4x8xf64, #CSR>,
                   %B: tensor<8x4xf64, #CSR>) -> tensor<4x4xf64, #CSR> {
   %C = tensor.empty() : tensor<4x4xf64, #CSR>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_matmul_one.mlir b/mlir/test/Dialect/SparseTensor/sparse_matmul_one.mlir
index 82f3147d3206..be2172515d08 100755
--- a/mlir/test/Dialect/SparseTensor/sparse_matmul_one.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_matmul_one.mlir
@@ -2,9 +2,9 @@
 // RUN:             --sparsification-and-bufferization | FileCheck %s
 
 #CSR_ones_complex = #sparse_tensor.encoding<{
-  map = (d0, d1) -> (d0 : dense, d1 : compressed)
-// explicitVal = (1.0, 0.0) : complex<f32>,
-// implicitVal = (0.0, 0.0) : complex<f32>
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  explicitVal = #complex.number<:f32 1.0, 0.0>,
+  implicitVal = #complex.number<:f32 0.0, 0.0>
 }>
 
 #CSR_ones_fp = #sparse_tensor.encoding<{
@@ -20,9 +20,17 @@
 }>
 
 // CHECK-LABEL:   func.func @matmul_complex
-//
-// TODO: make this work
-//
+// CHECK:         scf.for
+// CHECK:           scf.for
+// CHECK:             %[[X:.*]] = memref.load
+// CHECK:             scf.for
+// CHECK:               %[[I:.*]] = memref.load
+// CHECK:               %[[Y:.*]] = memref.load
+// CHECK:               %[[M:.*]] = complex.add %[[Y]], %[[X]] : complex<f32>
+// CHECK:               memref.store %[[M]]
+// CHECK:             }
+// CHECK:           }
+// CHECK:         }
 func.func @matmul_complex(%a: tensor<10x20xcomplex<f32>>,
                           %b: tensor<20x30xcomplex<f32>, #CSR_ones_complex>,
                           %c: tensor<10x30xcomplex<f32>>) -> tensor<10x30xcomplex<f32>> {
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index 6177fe3c752c..8036d996d232 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -755,6 +755,34 @@ func.func @fold_dim_of_tensor.cast(%arg0 : tensor<4x?xf32>) -> (index, index) {
 
 // -----
 
+// CHECK-LABEL: func @insert_slice_cast
+func.func @insert_slice_cast(%arg0 : tensor<1x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index, %arg7 : index) -> tensor<?x?xf32> {
+  // CHECK-SAME: %[[ARG0:.*]]: tensor<1x?xf32>
+  %0 = tensor.cast %arg0 : tensor<1x?xf32> to tensor<?x?xf32>
+  // CHECK: %[[RES:.*]] = tensor.insert_slice %[[ARG0]]
+  // CHECK-SAME: [{{.*}}, {{.*}}] [1, {{.*}}] [{{.*}}, {{.*}}]
+  // CHECK-SAME: : tensor<1x?xf32> into tensor<?x?xf32>
+  %1 = tensor.insert_slice %0 into %arg1[%arg2, %arg3] [%arg4, %arg5] [%arg6, %arg7] : tensor<?x?xf32> into tensor<?x?xf32>
+  // CHECK: return %[[RES]] : tensor<?x?xf32>
+  return %1 : tensor<?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @insert_slice_cast_no_fold
+func.func @insert_slice_cast_no_fold(%arg0 : tensor<1x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index, %arg7 : index) -> tensor<?x?xf32> {
+  %0 = tensor.cast %arg0 : tensor<1x?xf32> to tensor<?x5xf32>
+  // CHECK: %[[CAST:.*]] = tensor.cast
+  // CHECK: %[[RES:.*]] = tensor.insert_slice %[[CAST]]
+  // CHECK-SAME: [{{.*}}, {{.*}}] [{{.*}}, 5] [{{.*}}, {{.*}}]
+  // CHECK-SAME: : tensor<?x5xf32> into tensor<?x?xf32>
+  %1 = tensor.insert_slice %0 into %arg1[%arg2, %arg3] [%arg4, 5] [%arg6, %arg7] : tensor<?x5xf32> into tensor<?x?xf32>
+  // CHECK: return %[[RES]] : tensor<?x?xf32>
+  return %1 : tensor<?x?xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @insert_tensor_cast_on_insert_slice_src(
 // CHECK-SAME:      %[[arg0:.*]]: tensor<?x5x?xf32>, %[[arg1:.*]]: tensor<?x?x?xf32>
 //      CHECK:    %[[cast:.*]] = tensor.cast %[[arg0]] : tensor<?x5x?xf32> to tensor<64x5x64xf32>
@@ -1890,21 +1918,6 @@ func.func @splat_dynamic_no_fold(%m: index) -> tensor<4x?xf32> {
 
 // -----
 
-// There was an issue in cast + insert_slice folding generating invalid ir.
-// https://github.com/llvm/llvm-project/issues/53099
-// CHECK-LABEL: func @insert_slice_cast
-func.func @insert_slice_cast(%arg0 : tensor<1x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index, %arg7 : index) -> tensor<?x?xf32> {
-  // CHECK: %[[CAST:.*]] = tensor.cast %{{.*}} : tensor<1x?xf32> to tensor<?x?xf32>
-  %0 = tensor.cast %arg0 : tensor<1x?xf32> to tensor<?x?xf32>
-  // CHECK: %[[RES:.*]] = tensor.insert_slice %[[CAST]]
-  // CHECK-SAME: : tensor<?x?xf32> into tensor<?x?xf32>
-  %1 = tensor.insert_slice %0 into %arg1[%arg2, %arg3] [%arg4, %arg5] [%arg6, %arg7] : tensor<?x?xf32> into tensor<?x?xf32>
-  // CHECK: return %[[RES]] : tensor<?x?xf32>
-  return %1 : tensor<?x?xf32>
-}
-
-// -----
-
 // CHECK-LABEL: func @cast_extract_slice
 func.func @cast_extract_slice(%arg0 : tensor<128x512xf32>, %s : index, %o : index)
     -> tensor<16x512xf32> {
diff --git a/mlir/test/Dialect/Tensor/tiling.mlir b/mlir/test/Dialect/Tensor/tiling.mlir
index 1afbd3d0504f..e02ab06a9d53 100644
--- a/mlir/test/Dialect/Tensor/tiling.mlir
+++ b/mlir/test/Dialect/Tensor/tiling.mlir
@@ -34,7 +34,7 @@ func.func @dynamic_pad_tensor_3_4(%input_tensor: tensor<?x?xf32>,
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [2, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -73,7 +73,7 @@ func.func @dynamic_pad_tensor_0_3(%input_tensor: tensor<?x?xf32>,
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loop = transform.structured.tile_using_for %0 [0, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+      %1, %loop = transform.structured.tile_using_for %0 tile_sizes [0, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -109,7 +109,7 @@ func.func @static_pad_tensor_3_4(%input_tensor: tensor<7x9xf32>,
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [2, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -142,7 +142,7 @@ func.func @static_pad_tensor_0_3(%input_tensor: tensor<7x9xf32>,
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loop = transform.structured.tile_using_for %0 [0, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+      %1, %loop = transform.structured.tile_using_for %0 tile_sizes [0, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -179,7 +179,7 @@ func.func @static_pad_tile_evenly_0_3(%input_tensor: tensor<7x9xf32>,
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loop = transform.structured.tile_using_for %0 [0, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+      %1, %loop = transform.structured.tile_using_for %0 tile_sizes [0, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -217,7 +217,7 @@ func.func @NC_to_NCnc(%arg0: tensor<128x256xf32>, %arg1: tensor<4x8x32x32xf32>)
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -247,7 +247,7 @@ func.func @KC_to_CKkc(%arg0: tensor<128x256xf32>, %arg1: tensor<32x4x32x8xf32>)
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -284,7 +284,7 @@ func.func @pad_and_pack_static(%input: tensor<13x15xf32>, %output: tensor<2x8x8x
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -335,7 +335,7 @@ func.func @pad_and_pack_partially_dynamic(%input: tensor<?x?xf32>, %output: tens
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -391,7 +391,7 @@ func.func @pad_and_pack_fully_dynamic(%source: tensor<?x?xf32>, %dest: tensor<?x
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -440,7 +440,7 @@ func.func @NCnc_to_NC(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -488,7 +488,7 @@ func.func @CKkc_to_KC(%source: tensor<32x4x32x8xf32>, %dest: tensor<128x256xf32>
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -526,7 +526,7 @@ func.func @perfect_CKkc_to_KC(%source: tensor<32x4x2x4xf32>, %dest: tensor<8x128
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -570,7 +570,7 @@ func.func @dynamic_perfect_CKkc_to_KC(%source: tensor<?x?x2x2xf32>, %dest: tenso
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -607,7 +607,7 @@ func.func @perfect_NKPQk_to_NPQK(%source: tensor<1x4x6x6x2xf32>, %dest: tensor<1
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:4 = transform.structured.tile_using_for %0 [1, 1, 1, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -635,7 +635,7 @@ func.func @fully_dynamic_unpack(%source: tensor<?x?x?x?xf32>, %dest: tensor<?x?x
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:2 = transform.structured.tile_using_for %0 [4, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [4, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -671,7 +671,7 @@ func.func @perfect_NPQK_to_NKPQk(%source: tensor<1x6x6x8xf32>, %dest: tensor<1x4
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1, %loops:4 = transform.structured.tile_using_for %0 [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Transform/foreach-match.mlir b/mlir/test/Dialect/Transform/foreach-match.mlir
index 206625ae0746..a7cd8e9ff543 100644
--- a/mlir/test/Dialect/Transform/foreach-match.mlir
+++ b/mlir/test/Dialect/Transform/foreach-match.mlir
@@ -78,3 +78,113 @@ module attributes { transform.with_named_sequence } {
     transform.yield
   }
 }
+
+// -----
+
+// expected-remark @below {{op from within the matcher}}
+module attributes { transform.with_named_sequence } {
+  // expected-remark @below {{returned root}}
+  func.func @foo() {
+    return
+  }
+
+  transform.named_sequence @match_fail(
+      %op: !transform.any_op {transform.readonly},
+      %root: !transform.any_op {transform.readonly},
+      %param: !transform.param<i64> {transform.readonly}) -> (!transform.any_op, !transform.param<i64>) {
+    transform.test_succeed_if_operand_of_op_kind %op, "test.impossible_to_match" : !transform.any_op
+    transform.yield %root, %param : !transform.any_op, !transform.param<i64>
+  }
+
+  transform.named_sequence @match_succeed(
+      %op: !transform.any_op {transform.readonly},
+      %root: !transform.any_op {transform.readonly},
+      %param: !transform.param<i64> {transform.readonly}) -> (!transform.any_op, !transform.param<i64>) {
+    transform.debug.emit_remark_at %root, "op from within the matcher" : !transform.any_op
+    // expected-remark @below {{param from within the matcher 42}}
+    transform.debug.emit_param_as_remark %param, "param from within the matcher" : !transform.param<i64>
+    transform.yield %root, %param : !transform.any_op, !transform.param<i64>
+  }
+
+  transform.named_sequence @return(
+      %root: !transform.any_op {transform.readonly},
+      %param: !transform.param<i64> {transform.readonly}) -> (!transform.param<i64>, !transform.param<i64>, !transform.any_op) {
+    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+    transform.yield %param, %param, %func : !transform.param<i64>, !transform.param<i64>, !transform.any_op
+  }
+
+  transform.named_sequence @__transform_main(%root: !transform.any_op) {
+    %param = transform.param.constant 42 : i64 -> !transform.param<i64>
+    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+    %func2, %yielded:3 = transform.foreach_match restrict_root in %func, %root, %param
+      @match_fail -> @return,
+      @match_succeed -> @return
+      : (!transform.any_op, !transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.param<i64>, !transform.param<i64>, !transform.any_op)
+    transform.debug.emit_remark_at %yielded#2, "returned root" : !transform.any_op
+    // expected-remark @below {{42 : i64, 42 : i64}}
+    transform.debug.emit_param_as_remark %yielded#0: !transform.param<i64>
+    %num_roots = transform.num_associations %yielded#2 : (!transform.any_op) -> !transform.param<i64>
+    // expected-remark @below {{2 : i64}}
+    transform.debug.emit_param_as_remark %num_roots : !transform.param<i64>
+    transform.yield
+  }
+}
+
+// -----
+
+module attributes { transform.with_named_sequence } {
+  func.func private @foo()
+  func.func private @bar()
+
+  transform.named_sequence @match(
+      %op: !transform.any_op {transform.readonly},
+      %func: !transform.any_op {transform.readonly}) -> (!transform.any_op) {
+    transform.yield %func : !transform.any_op
+  }
+
+  transform.named_sequence @return(
+      %func: !transform.any_op {transform.readonly}) -> (!transform.any_op) {
+    transform.yield %func : !transform.any_op
+  }
+
+  transform.named_sequence @__transform_main(%root: !transform.any_op) {
+    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+    %func2, %yielded = transform.foreach_match flatten_results restrict_root in %func, %func
+      @match -> @return
+      : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %num = transform.num_associations %yielded : (!transform.any_op) -> !transform.param<i64>
+    // 2 funcs are yielded for each of the 2 funcs = 4:
+    // expected-remark @below {{4 : i64}}
+    transform.debug.emit_param_as_remark %num : !transform.param<i64>
+    transform.yield
+  }
+}
+
+// -----
+
+
+module attributes { transform.with_named_sequence } {
+  func.func private @foo()
+  func.func private @bar()
+
+  transform.named_sequence @match(
+      %op: !transform.any_op {transform.readonly},
+      %func: !transform.any_op {transform.readonly}) -> (!transform.any_op) {
+    transform.yield %func : !transform.any_op
+  }
+
+  transform.named_sequence @return(
+      %func: !transform.any_op {transform.readonly}) -> (!transform.any_op) {
+    transform.yield %func : !transform.any_op
+  }
+
+  transform.named_sequence @__transform_main(%root: !transform.any_op) {
+    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+    // expected-error @below {{action @return has results associated with multiple payload entities, but flattening was not requested}}
+    %func2, %yielded = transform.foreach_match restrict_root in %func, %func
+      @match -> @return
+      : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %num = transform.num_associations %yielded : (!transform.any_op) -> !transform.param<i64>
+    transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/Transform/irdl.mlir b/mlir/test/Dialect/Transform/irdl.mlir
new file mode 100644
index 000000000000..ae68d96c4314
--- /dev/null
+++ b/mlir/test/Dialect/Transform/irdl.mlir
@@ -0,0 +1,25 @@
+// RUN: mlir-opt --transform-interpreter --split-input-file --verify-diagnostics %s
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+    %0 = transform.irdl.collect_matching in %arg0 : (!transform.any_op) -> (!transform.any_op){
+    ^bb0(%arg1: !transform.any_op):
+      irdl.dialect @test {
+        irdl.operation @whatever {
+          %0 = irdl.is i32
+          %1 = irdl.is i64
+          %2 = irdl.any_of(%0, %1)
+          irdl.results(%2)
+        }
+      }
+    }
+    transform.debug.emit_remark_at %0, "matched" : !transform.any_op
+    transform.yield
+  }
+
+  // expected-remark @below {{matched}}
+  "test.whatever"() : () -> i32
+  "test.whatever"() : () -> f32
+  // expected-remark @below {{matched}}
+  "test.whatever"() : () -> i64
+}
diff --git a/mlir/test/Dialect/Transform/ops-invalid.mlir b/mlir/test/Dialect/Transform/ops-invalid.mlir
index cc04e65420c5..30a68cc5f3c4 100644
--- a/mlir/test/Dialect/Transform/ops-invalid.mlir
+++ b/mlir/test/Dialect/Transform/ops-invalid.mlir
@@ -629,7 +629,84 @@ module attributes { transform.with_named_sequence } {
 // -----
 
 module attributes { transform.with_named_sequence } {
-  transform.named_sequence @match() -> !transform.any_op
+  // expected-note @below {{symbol declaration}}
+  transform.named_sequence @match(!transform.any_op {transform.readonly}, !transform.any_op {transform.readonly}) -> !transform.any_op
+  transform.named_sequence @action(!transform.any_op {transform.readonly})
+
+  transform.sequence failures(propagate) {
+  ^bb0(%root: !transform.any_op):
+    // expected-error @below {{the number of operands (1) doesn't match the number of matcher arguments (2) for @match}}
+    transform.foreach_match in %root
+      @match -> @action : (!transform.any_op) -> !transform.any_op
+  }
+}
+
+// -----
+
+module attributes { transform.with_named_sequence } {
+  // expected-note @below {{symbol declaration}}
+  transform.named_sequence @match(!transform.any_op {transform.readonly}, !transform.any_op {transform.consumed}) -> !transform.any_op
+  transform.named_sequence @action(!transform.any_op {transform.readonly})
+
+  transform.sequence failures(propagate) {
+  ^bb0(%root: !transform.any_op):
+    %r = transform.replicate num(%root) %root : !transform.any_op, !transform.any_op
+    // expected-error @below {{'transform.foreach_match' op does not expect matcher symbol to consume its operand #1}}
+    transform.foreach_match in %root, %r
+      @match -> @action : (!transform.any_op, !transform.any_op) -> !transform.any_op
+  }
+}
+
+// -----
+
+module attributes { transform.with_named_sequence } {
+  // expected-note @below {{symbol declaration}}
+  transform.named_sequence @match(!transform.any_op {transform.readonly}, !transform.any_op {transform.readonly}) -> !transform.any_op
+  transform.named_sequence @action(!transform.any_op {transform.readonly})
+
+  transform.sequence failures(propagate) {
+  ^bb0(%root: !transform.any_op):
+    %r = transform.get_operand %root[0] : (!transform.any_op) -> !transform.any_value
+    // expected-error @below {{mismatching type interfaces for operand and matcher argument #1 of matcher @match}}
+    transform.foreach_match in %root, %r
+      @match -> @action : (!transform.any_op, !transform.any_value) -> !transform.any_op
+  }
+}
+
+// -----
+
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @match(!transform.any_op {transform.readonly}) -> !transform.any_op
+  // expected-note @below {{symbol declaration}}
+  transform.named_sequence @action(!transform.any_op {transform.readonly}) -> !transform.any_op
+
+  transform.sequence failures(propagate) {
+  ^bb0(%root: !transform.any_op):
+    // expected-error @below {{the number of action results (1) for @action doesn't match the number of extra op results (0)}}
+    transform.foreach_match in %root
+      @match -> @action : (!transform.any_op) -> !transform.any_op
+  }
+}
+
+// -----
+
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @match(!transform.any_op {transform.readonly}) -> !transform.any_op
+  // expected-note @below {{symbol declaration}}
+  transform.named_sequence @action(!transform.any_op {transform.readonly}) -> !transform.any_op
+
+  transform.sequence failures(propagate) {
+  ^bb0(%root: !transform.any_op):
+    // expected-error @below {{mismatching type interfaces for action result #0 of action @action and op result}}
+    transform.foreach_match in %root
+      @match -> @action : (!transform.any_op) -> (!transform.any_op, !transform.any_value)
+  }
+}
+
+// -----
+
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @match(!transform.any_op {transform.readonly}) -> !transform.any_op
   transform.named_sequence @action()
 
   transform.sequence failures(propagate) {
@@ -649,7 +726,7 @@ module attributes { transform.with_named_sequence } {
 
   transform.sequence failures(propagate) {
   ^bb0(%root: !transform.any_op):
-    // expected-error @below {{action symbol is not expected to have results}}
+    // expected-error @below {{the number of action results (1) for @action doesn't match the number of extra op results (0)}}
     transform.foreach_match in %root
       @match -> @action : (!transform.any_op) -> !transform.any_op
   }
@@ -664,7 +741,7 @@ module attributes { transform.with_named_sequence } {
 
   transform.sequence failures(propagate) {
   ^bb0(%root: !transform.any_op):
-    // expected-error @below {{expects matcher symbol to have one argument with the same transform interface as the first operand}}
+    // expected-error @below {{the number of operands (1) doesn't match the number of matcher arguments (0) for @match}}
     transform.foreach_match in %root
       @match -> @action : (!transform.any_op) -> !transform.any_op
   }
@@ -679,7 +756,7 @@ module attributes { transform.with_named_sequence } {
 
   transform.sequence failures(propagate) {
   ^bb0(%root: !transform.any_op):
-    // expected-error @below {{'transform.foreach_match' op does not expect matcher symbol to consume its operand}}
+    // expected-error @below {{'transform.foreach_match' op does not expect matcher symbol to consume its operand #0}}
     transform.foreach_match in %root
       @match -> @action : (!transform.any_op) -> !transform.any_op
   }
diff --git a/mlir/test/Dialect/Transform/ops.mlir b/mlir/test/Dialect/Transform/ops.mlir
index ecef7e181e90..b03a9f4d760d 100644
--- a/mlir/test/Dialect/Transform/ops.mlir
+++ b/mlir/test/Dialect/Transform/ops.mlir
@@ -101,19 +101,19 @@ transform.sequence failures(propagate) {
 }
 
 // CHECK: transform.sequence
-// CHECK: transform.structured.tile_using_for %0[4, 4, [4]]
+// CHECK: transform.structured.tile_using_for %0 tile_sizes [4, 4, [4]]
 transform.sequence failures(propagate) {
 ^bb0(%arg1: !transform.any_op):
   %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.structured.tile_using_for %0 [4, 4, [4]] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+  transform.structured.tile_using_for %0 tile_sizes [4, 4, [4]] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
 }
 
 // CHECK: transform.sequence
-// CHECK: transform.structured.tile_using_for %0{{\[}}[2], 4, 8]
+// CHECK: transform.structured.tile_using_for %0 tile_sizes {{\[}}[2], 4, 8]
 transform.sequence failures(propagate) {
 ^bb0(%arg1: !transform.any_op):
   %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.structured.tile_using_for %0 [[2], 4, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+  transform.structured.tile_using_for %0 tile_sizes [[2], 4, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
 }
 
 // CHECK: transform.sequence
diff --git a/mlir/test/Dialect/Transform/selective-targeting.mlir b/mlir/test/Dialect/Transform/selective-targeting.mlir
index e88104315649..69342100935c 100644
--- a/mlir/test/Dialect/Transform/selective-targeting.mlir
+++ b/mlir/test/Dialect/Transform/selective-targeting.mlir
@@ -79,7 +79,7 @@ module attributes {transform.with_named_sequence} {
       transform.sequence %arg0 : !transform.any_op failures(propagate) {
       ^bb1(%arg1: !transform.any_op):
         %0 = pdl_match @pdl_target_attrA in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.structured.tile_using_for %0 [4, 4, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+        transform.structured.tile_using_for %0 tile_sizes [4, 4, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
         %1 = pdl_match @pdl_target_attrC in %arg1 : (!transform.any_op) -> !transform.any_op
         %2 = get_parent_op %1 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
         transform.structured.vectorize_children_and_apply_patterns %2 : (!transform.any_op) -> !transform.any_op
diff --git a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-and-schedule.mlir b/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-and-schedule.mlir
deleted file mode 100644
index 9e50ec1efac9..000000000000
--- a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-and-schedule.mlir
+++ /dev/null
@@ -1,20 +0,0 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-file-name=%p%{fs-sep}test-interpreter-external-symbol-decl.mlir transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-library/definitions-self-contained.mlir})" \
-// RUN:             --verify-diagnostics
-
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-file-name=%p%{fs-sep}test-interpreter-external-symbol-decl.mlir transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-library/definitions-self-contained.mlir}, test-transform-dialect-interpreter{transform-file-name=%p%{fs-sep}test-interpreter-external-symbol-decl.mlir transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-library/definitions-self-contained.mlir})" \
-// RUN:             --verify-diagnostics
-
-// The external transform script has a declaration to the named sequence @foo,
-// the definition of which is provided in another file. Repeated application
-// of the same pass should not be a problem. Note that the same diagnostic
-// produced twice at the same location only needs to be matched once.
-
-// expected-remark @below {{message}}
-// expected-remark @below {{unannotated}}
-// expected-remark @below {{internal colliding (without suffix)}}
-// expected-remark @below {{internal colliding_0}}
-// expected-remark @below {{internal colliding_1}}
-// expected-remark @below {{internal colliding_3}}
-// expected-remark @below {{internal colliding_4}}
-// expected-remark @below {{internal colliding_5}}
-module {}
diff --git a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-dir.mlir b/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-dir.mlir
deleted file mode 100644
index 3681b913dc5b..000000000000
--- a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-dir.mlir
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-library})" \
-// RUN:             --verify-diagnostics --split-input-file | FileCheck %s
-
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-library/definitions-self-contained.mlir,%p%{fs-sep}include%{fs-sep}test-interpreter-library/definitions-with-unresolved.mlir})" \
-// RUN:             --verify-diagnostics --split-input-file | FileCheck %s
-
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-library}, test-transform-dialect-interpreter)" \
-// RUN:             --verify-diagnostics --split-input-file | FileCheck %s
-
-// The definition of the @foo named sequence is provided in another file. It
-// will be included because of the pass option. Repeated application of the
-// same pass, with or without the library option, should not be a problem.
-// Note that the same diagnostic produced twice at the same location only
-// needs to be matched once.
-
-// expected-remark @below {{message}}
-module attributes {transform.with_named_sequence} {
-  // CHECK: transform.named_sequence @print_message
-  transform.named_sequence @print_message(%arg0: !transform.any_op {transform.readonly})
-
-  transform.named_sequence @reference_other_module(!transform.any_op {transform.readonly})
-
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
-    include @print_message failures(propagate) (%arg0) : (!transform.any_op) -> ()
-    include @reference_other_module failures(propagate) (%arg0) : (!transform.any_op) -> ()
-  }
-}
diff --git a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-invalid.mlir b/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-invalid.mlir
deleted file mode 100644
index df6739a2ec6c..000000000000
--- a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl-invalid.mlir
+++ /dev/null
@@ -1,57 +0,0 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-external-symbol-def-invalid.mlir}, test-transform-dialect-interpreter)" \
-// RUN:             --verify-diagnostics --split-input-file
-
-// The definition of the @print_message named sequence is provided in another file. It
-// will be included because of the pass option.
-
-module attributes {transform.with_named_sequence} {
-  // expected-error @below {{external definition has a mismatching signature}}
-  transform.named_sequence private @print_message(!transform.op<"builtin.module"> {transform.readonly})
-
-  // expected-note @below {{failed to merge library symbols into transform root}}
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.op<"builtin.module">):
-    include @print_message failures(propagate) (%arg0) : (!transform.op<"builtin.module">) -> ()
-  }
-}
-
-// -----
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence private @undefined_sequence()
-
-  transform.sequence failures(suppress) {
-  ^bb0(%arg0: !transform.any_op):
-    // expected-error @below {{unresolved external named sequence}}
-    include @undefined_sequence failures(suppress) () : () -> ()
-  }
-}
-
-// -----
-
-module attributes {transform.with_named_sequence} {
-  // expected-error @below {{external definition has mismatching consumption annotations for argument #0}}
-  transform.named_sequence private @consuming(%arg0: !transform.any_op {transform.readonly})
-
-  // expected-note @below {{failed to merge library symbols into transform root}}
-  transform.sequence failures(suppress) {
-  ^bb0(%arg0: !transform.any_op):
-    include @consuming failures(suppress) (%arg0) : (!transform.any_op) -> ()
-  }
-}
-
-// -----
-
-module attributes {transform.with_named_sequence} {
-  // expected-error @below {{doubly defined symbol @print_message}}
-  transform.named_sequence @print_message(%arg0: !transform.any_op {transform.readonly}) {
-    transform.debug.emit_remark_at %arg0, "message" : !transform.any_op
-    transform.yield
-  }
-
-  // expected-note @below {{failed to merge library symbols into transform root}}
-  transform.sequence failures(suppress) {
-  ^bb0(%arg0: !transform.any_op):
-    include @print_message failures(propagate) (%arg0) : (!transform.any_op) -> ()
-  }
-}
diff --git a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl.mlir b/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl.mlir
deleted file mode 100644
index d7b35e462f61..000000000000
--- a/mlir/test/Dialect/Transform/test-interpreter-external-symbol-decl.mlir
+++ /dev/null
@@ -1,71 +0,0 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-library/definitions-self-contained.mlir})" \
-// RUN:             --verify-diagnostics --split-input-file | FileCheck %s
-
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-library/definitions-self-contained.mlir}, test-transform-dialect-interpreter)" \
-// RUN:             --verify-diagnostics --split-input-file | FileCheck %s
-
-// The definition of the @print_message named sequence is provided in another
-// file. It will be included because of the pass option. Subsequent application
-// of the same pass works but only without the library file (since the first
-// application loads external symbols and loading them again woul make them
-// clash).
-// Note that the same diagnostic produced twice at the same location only
-// needs to be matched once.
-
-// expected-remark @below {{message}}
-// expected-remark @below {{unannotated}}
-// expected-remark @below {{internal colliding (without suffix)}}
-// expected-remark @below {{internal colliding_0}}
-// expected-remark @below {{internal colliding_1}}
-// expected-remark @below {{internal colliding_3}}
-// expected-remark @below {{internal colliding_4}}
-// expected-remark @below {{internal colliding_5}}
-module attributes {transform.with_named_sequence} {
-  // CHECK-DAG: transform.named_sequence @print_message(
-  // CHECK-DAG: transform.include @private_helper
-  transform.named_sequence private @print_message(!transform.any_op {transform.readonly})
-
-  // These ops collide with ops from the other module before or after renaming.
-  transform.named_sequence private @colliding(%arg0: !transform.any_op {transform.readonly}) {
-    transform.debug.emit_remark_at %arg0, "internal colliding (without suffix)" : !transform.any_op
-    transform.yield
-  }
-  transform.named_sequence private @colliding_0(%arg0: !transform.any_op {transform.readonly}) {
-    transform.debug.emit_remark_at %arg0, "internal colliding_0" : !transform.any_op
-    transform.yield
-  }
-  transform.named_sequence private @colliding_1(%arg0: !transform.any_op {transform.readonly}) {
-    transform.debug.emit_remark_at %arg0, "internal colliding_1" : !transform.any_op
-    transform.yield
-  }
-  transform.named_sequence private @colliding_3(%arg0: !transform.any_op {transform.readonly}) {
-    transform.debug.emit_remark_at %arg0, "internal colliding_3" : !transform.any_op
-    transform.yield
-  }
-  // This symbol is public and thus can't be renamed.
-  // CHECK-DAG: transform.named_sequence @colliding_4(
-  transform.named_sequence @colliding_4(%arg0: !transform.any_op {transform.readonly}) {
-    transform.debug.emit_remark_at %arg0, "internal colliding_4" : !transform.any_op
-    transform.yield
-  }
-  transform.named_sequence private @colliding_5(%arg0: !transform.any_op {transform.readonly}) {
-    transform.debug.emit_remark_at %arg0, "internal colliding_5" : !transform.any_op
-    transform.yield
-  }
-
-  // CHECK-DAG: transform.named_sequence @unannotated(
-  // CHECK-DAG: transform.debug.emit_remark_at %{{.*}}, "unannotated"
-  transform.named_sequence @unannotated(!transform.any_op {transform.readonly})
-
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
-    include @print_message failures(propagate) (%arg0) : (!transform.any_op) -> ()
-    include @unannotated failures(propagate) (%arg0) : (!transform.any_op) -> ()
-    include @colliding failures(propagate) (%arg0) : (!transform.any_op) -> ()
-    include @colliding_0 failures(propagate) (%arg0) : (!transform.any_op) -> ()
-    include @colliding_1 failures(propagate) (%arg0) : (!transform.any_op) -> ()
-    include @colliding_3 failures(propagate) (%arg0) : (!transform.any_op) -> ()
-    include @colliding_4 failures(propagate) (%arg0) : (!transform.any_op) -> ()
-    include @colliding_5 failures(propagate) (%arg0) : (!transform.any_op) -> ()
-  }
-}
diff --git a/mlir/test/Dialect/Transform/test-interpreter-module-generation.mlir b/mlir/test/Dialect/Transform/test-interpreter-module-generation.mlir
deleted file mode 100644
index 159aed720964..000000000000
--- a/mlir/test/Dialect/Transform/test-interpreter-module-generation.mlir
+++ /dev/null
@@ -1,4 +0,0 @@
-// RUN: mlir-opt %s --test-transform-dialect-interpreter=test-module-generation=1 --verify-diagnostics
-
-// expected-remark @below {{remark from generated}}
-module {}
diff --git a/mlir/test/Dialect/Transform/test-interpreter-multiple-top-level-ops.mlir b/mlir/test/Dialect/Transform/test-interpreter-multiple-top-level-ops.mlir
deleted file mode 100644
index a3e3f057817c..000000000000
--- a/mlir/test/Dialect/Transform/test-interpreter-multiple-top-level-ops.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-// RUN: mlir-opt %s --test-transform-dialect-interpreter='enforce-single-top-level-transform-op=0' -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s
-
-transform.sequence failures(propagate) {
-// CHECK: transform.sequence
-^bb0(%arg0: !transform.any_op):
-}
-
-transform.sequence failures(propagate) {
-// CHECK: transform.sequence
-^bb0(%arg0: !transform.any_op):
-}
-
-// -----
-
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  %match = transform.structured.match ops{["transform.get_parent_op"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-  transform.debug.emit_remark_at %match, "found get_parent_op" : !transform.any_op
-}
-
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  %op = transform.structured.match ops{[]} in %arg0 : (!transform.any_op) -> !transform.any_op
-  // expected-remark @below{{found get_parent_op}}
-  %1 = transform.get_parent_op %op : (!transform.any_op) -> !transform.any_op
-}
diff --git a/mlir/test/Dialect/Transform/test-repro-dump.mlir b/mlir/test/Dialect/Transform/test-repro-dump.mlir
deleted file mode 100644
index 89624da7efd0..000000000000
--- a/mlir/test/Dialect/Transform/test-repro-dump.mlir
+++ /dev/null
@@ -1,32 +0,0 @@
-// REQUIRES: asserts
-// RUN: mlir-opt %s --test-transform-dialect-interpreter \
-// RUN:             --mlir-disable-threading \
-// RUN:             --debug-only=transform-dialect-dump-repro 2>&1 \
-// RUN: | FileCheck %s
-
-module {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
-    transform.debug.emit_remark_at %arg0, "remark" : !transform.any_op
-  }
-}
-
-// Verify that the repro string is dumped.
-
-// CHECK: Transform Interpreter Repro
-// CHECK: cat <<EOF | mlir-opt --pass-pipeline="builtin.module(test-transform-dialect-interpreter{debug-payload-root-tag=payload_root debug-transform-root-tag=transform_container})"
-
-// Verify that the IR is dumped with tags.
-
-// CHECK: module
-// CHECK-SAME: transform.target_tag = "payload_root"
-// CHECK: transform.sequence
-// CHECK-SAME: transform.target_tag = "transform_container"
-// CHECK: EOF
-
-// Verify that the actual IR after the pass doesn't have the tags.
-
-// CHECK: module
-// CHECK-NOT: transform.target_tag = "payload_root"
-// CHECK: transform.sequence
-// CHECK-NOT: transform.target_tag = "transform_container"
diff --git a/mlir/test/Dialect/Vector/CPU/X86/vector-transpose-lowering.mlir b/mlir/test/Dialect/Vector/CPU/X86/vector-transpose-lowering.mlir
new file mode 100644
index 000000000000..ae2b5393ca44
--- /dev/null
+++ b/mlir/test/Dialect/Vector/CPU/X86/vector-transpose-lowering.mlir
@@ -0,0 +1,493 @@
+// RUN: mlir-opt %s --transform-interpreter --split-input-file | FileCheck %s
+
+// NOTE: This file tests lowerings that are implemented in the X86Vector
+// dialect. Since X86 does not support scalable vectors, all examples in this
+// file use fixed-width vectors.
+
+// CHECK-LABEL: func @transpose4x8
+func.func @transpose4x8xf32(%arg0: vector<4x8xf32>) -> vector<8x4xf32> {
+  //      CHECK: vector.extract {{.*}}[0]
+  // CHECK-NEXT: vector.extract {{.*}}[1]
+  // CHECK-NEXT: vector.extract {{.*}}[2]
+  // CHECK-NEXT: vector.extract {{.*}}[3]
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 8, 9, 4, 5, 12, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 3, 10, 11, 6, 7, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 8, 9, 4, 5, 12, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 3, 10, 11, 6, 7, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.insert {{.*}}[0]
+  // CHECK-NEXT: vector.insert {{.*}}[1]
+  // CHECK-NEXT: vector.insert {{.*}}[2]
+  // CHECK-NEXT: vector.insert {{.*}}[3]
+  // CHECK-NEXT: vector.shape_cast {{.*}} vector<4x8xf32> to vector<32xf32>
+  // CHECK-NEXT: vector.shape_cast {{.*}} vector<32xf32> to vector<8x4xf32>
+  %0 = vector.transpose %arg0, [1, 0] : vector<4x8xf32> to vector<8x4xf32>
+  return %0 : vector<8x4xf32>
+}
+
+// CHECK-LABEL: func @transpose021_1x4x8
+func.func @transpose021_1x4x8xf32(%arg0: vector<1x4x8xf32>) -> vector<1x8x4xf32> {
+  //      CHECK: vector.extract {{.*}}[0, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 1]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 2]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 3]
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 8, 9, 4, 5, 12, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 3, 10, 11, 6, 7, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 8, 9, 4, 5, 12, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 3, 10, 11, 6, 7, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.insert {{.*}}[0]
+  // CHECK-NEXT: vector.insert {{.*}}[1]
+  // CHECK-NEXT: vector.insert {{.*}}[2]
+  // CHECK-NEXT: vector.insert {{.*}}[3]
+  // CHECK-NEXT: vector.shape_cast {{.*}} vector<4x8xf32> to vector<32xf32>
+  // CHECK-NEXT: vector.shape_cast {{.*}} vector<32xf32> to vector<1x8x4xf32>
+  %0 = vector.transpose %arg0, [0, 2, 1] : vector<1x4x8xf32> to vector<1x8x4xf32>
+  return %0 : vector<1x8x4xf32>
+}
+
+// CHECK-LABEL: func @transpose8x8
+func.func @transpose8x8xf32(%arg0: vector<8x8xf32>) -> vector<8x8xf32> {
+  //      CHECK: vector.extract {{.*}}[0]
+  // CHECK-NEXT: vector.extract {{.*}}[1]
+  // CHECK-NEXT: vector.extract {{.*}}[2]
+  // CHECK-NEXT: vector.extract {{.*}}[3]
+  // CHECK-NEXT: vector.extract {{.*}}[4]
+  // CHECK-NEXT: vector.extract {{.*}}[5]
+  // CHECK-NEXT: vector.extract {{.*}}[6]
+  // CHECK-NEXT: vector.extract {{.*}}[7]
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.insert {{.*}}[0]
+  // CHECK-NEXT: vector.insert {{.*}}[1]
+  // CHECK-NEXT: vector.insert {{.*}}[2]
+  // CHECK-NEXT: vector.insert {{.*}}[3]
+  // CHECK-NEXT: vector.insert {{.*}}[4]
+  // CHECK-NEXT: vector.insert {{.*}}[5]
+  // CHECK-NEXT: vector.insert {{.*}}[6]
+  // CHECK-NEXT: vector.insert {{.*}}[7]
+  %0 = vector.transpose %arg0, [1, 0] : vector<8x8xf32> to vector<8x8xf32>
+  return %0 : vector<8x8xf32>
+}
+
+// CHECK-LABEL: func @transpose021_1x8x8
+func.func @transpose021_1x8x8xf32(%arg0: vector<1x8x8xf32>) -> vector<1x8x8xf32> {
+  //      CHECK: vector.extract {{.*}}[0, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 1]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 2]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 3]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 4]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 5]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 6]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 7]
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.insert {{.*}}[0]
+  // CHECK-NEXT: vector.insert {{.*}}[1]
+  // CHECK-NEXT: vector.insert {{.*}}[2]
+  // CHECK-NEXT: vector.insert {{.*}}[3]
+  // CHECK-NEXT: vector.insert {{.*}}[4]
+  // CHECK-NEXT: vector.insert {{.*}}[5]
+  // CHECK-NEXT: vector.insert {{.*}}[6]
+  // CHECK-NEXT: vector.insert {{.*}}[7]
+  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<1x8x8xf32>
+  %0 = vector.transpose %arg0, [0, 2, 1] : vector<1x8x8xf32> to vector<1x8x8xf32>
+  return %0 : vector<1x8x8xf32>
+}
+
+// CHECK-LABEL: func @transpose120_8x1x8
+func.func @transpose120_8x1x8xf32(%arg0: vector<8x1x8xf32>) -> vector<1x8x8xf32> {
+  //      CHECK: vector.extract {{.*}}[0, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[1, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[2, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[3, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[4, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[5, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[6, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[7, 0]
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.insert {{.*}}[0]
+  // CHECK-NEXT: vector.insert {{.*}}[1]
+  // CHECK-NEXT: vector.insert {{.*}}[2]
+  // CHECK-NEXT: vector.insert {{.*}}[3]
+  // CHECK-NEXT: vector.insert {{.*}}[4]
+  // CHECK-NEXT: vector.insert {{.*}}[5]
+  // CHECK-NEXT: vector.insert {{.*}}[6]
+  // CHECK-NEXT: vector.insert {{.*}}[7]
+  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<1x8x8xf32>
+  %0 = vector.transpose %arg0, [1, 2, 0] : vector<8x1x8xf32> to vector<1x8x8xf32>
+  return %0 : vector<1x8x8xf32>
+}
+
+// CHECK-LABEL: func @transpose120_8x8x1
+func.func @transpose120_8x8x1xf32(%arg0: vector<8x8x1xf32>) -> vector<8x1x8xf32> {
+  //      CHECK: vector.shape_cast %{{.*}} : vector<8x8x1xf32> to vector<8x8xf32>
+  // CHECK-NEXT: vector.extract {{.*}}[0]
+  // CHECK-NEXT: vector.extract {{.*}}[1]
+  // CHECK-NEXT: vector.extract {{.*}}[2]
+  // CHECK-NEXT: vector.extract {{.*}}[3]
+  // CHECK-NEXT: vector.extract {{.*}}[4]
+  // CHECK-NEXT: vector.extract {{.*}}[5]
+  // CHECK-NEXT: vector.extract {{.*}}[6]
+  // CHECK-NEXT: vector.extract {{.*}}[7]
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.insert {{.*}}[0]
+  // CHECK-NEXT: vector.insert {{.*}}[1]
+  // CHECK-NEXT: vector.insert {{.*}}[2]
+  // CHECK-NEXT: vector.insert {{.*}}[3]
+  // CHECK-NEXT: vector.insert {{.*}}[4]
+  // CHECK-NEXT: vector.insert {{.*}}[5]
+  // CHECK-NEXT: vector.insert {{.*}}[6]
+  // CHECK-NEXT: vector.insert {{.*}}[7]
+  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x1x8xf32>
+  %0 = vector.transpose %arg0, [1, 2, 0] : vector<8x8x1xf32> to vector<8x1x8xf32>
+  return %0 : vector<8x1x8xf32>
+}
+
+// CHECK-LABEL: func @transpose102_8x8x1
+func.func @transpose102_8x8x1xf32(%arg0: vector<8x8x1xf32>) -> vector<8x8x1xf32> {
+  //      CHECK: vector.shape_cast %{{.*}} : vector<8x8x1xf32> to vector<8x8xf32>
+  // CHECK-NEXT: vector.extract {{.*}}[0]
+  // CHECK-NEXT: vector.extract {{.*}}[1]
+  // CHECK-NEXT: vector.extract {{.*}}[2]
+  // CHECK-NEXT: vector.extract {{.*}}[3]
+  // CHECK-NEXT: vector.extract {{.*}}[4]
+  // CHECK-NEXT: vector.extract {{.*}}[5]
+  // CHECK-NEXT: vector.extract {{.*}}[6]
+  // CHECK-NEXT: vector.extract {{.*}}[7]
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.insert {{.*}}[0]
+  // CHECK-NEXT: vector.insert {{.*}}[1]
+  // CHECK-NEXT: vector.insert {{.*}}[2]
+  // CHECK-NEXT: vector.insert {{.*}}[3]
+  // CHECK-NEXT: vector.insert {{.*}}[4]
+  // CHECK-NEXT: vector.insert {{.*}}[5]
+  // CHECK-NEXT: vector.insert {{.*}}[6]
+  // CHECK-NEXT: vector.insert {{.*}}[7]
+  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x8x1xf32>
+  %0 = vector.transpose %arg0, [1, 0, 2] : vector<8x8x1xf32> to vector<8x8x1xf32>
+  return %0 : vector<8x8x1xf32>
+}
+
+// CHECK-LABEL: func @transpose201_8x1x8
+func.func @transpose201_8x1x8xf32(%arg0: vector<8x1x8xf32>) -> vector<8x8x1xf32> {
+  //      CHECK: vector.extract {{.*}}[0, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[1, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[2, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[3, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[4, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[5, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[6, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[7, 0]
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.insert {{.*}}[0]
+  // CHECK-NEXT: vector.insert {{.*}}[1]
+  // CHECK-NEXT: vector.insert {{.*}}[2]
+  // CHECK-NEXT: vector.insert {{.*}}[3]
+  // CHECK-NEXT: vector.insert {{.*}}[4]
+  // CHECK-NEXT: vector.insert {{.*}}[5]
+  // CHECK-NEXT: vector.insert {{.*}}[6]
+  // CHECK-NEXT: vector.insert {{.*}}[7]
+  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x8x1xf32>
+  %0 = vector.transpose %arg0, [2, 0, 1] : vector<8x1x8xf32> to vector<8x8x1xf32>
+  return %0 : vector<8x8x1xf32>
+}
+
+// CHECK-LABEL: func @transpose201_1x8x8
+func.func @transpose201_1x8x8xf32(%arg0: vector<1x8x8xf32>) -> vector<8x1x8xf32> {
+  //      CHECK: vector.extract {{.*}}[0, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 1]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 2]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 3]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 4]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 5]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 6]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 7]
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.insert {{.*}}[0]
+  // CHECK-NEXT: vector.insert {{.*}}[1]
+  // CHECK-NEXT: vector.insert {{.*}}[2]
+  // CHECK-NEXT: vector.insert {{.*}}[3]
+  // CHECK-NEXT: vector.insert {{.*}}[4]
+  // CHECK-NEXT: vector.insert {{.*}}[5]
+  // CHECK-NEXT: vector.insert {{.*}}[6]
+  // CHECK-NEXT: vector.insert {{.*}}[7]
+  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x1x8xf32>
+  %0 = vector.transpose %arg0, [2, 0, 1] : vector<1x8x8xf32> to vector<8x1x8xf32>
+  return %0 : vector<8x1x8xf32>
+}
+
+// CHECK-LABEL: func @transpose210_8x1x8
+func.func @transpose210_8x1x8xf32(%arg0: vector<8x1x8xf32>) -> vector<8x1x8xf32> {
+  //      CHECK: vector.extract {{.*}}[0, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[1, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[2, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[3, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[4, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[5, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[6, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[7, 0]
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.insert {{.*}}[0]
+  // CHECK-NEXT: vector.insert {{.*}}[1]
+  // CHECK-NEXT: vector.insert {{.*}}[2]
+  // CHECK-NEXT: vector.insert {{.*}}[3]
+  // CHECK-NEXT: vector.insert {{.*}}[4]
+  // CHECK-NEXT: vector.insert {{.*}}[5]
+  // CHECK-NEXT: vector.insert {{.*}}[6]
+  // CHECK-NEXT: vector.insert {{.*}}[7]
+  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x1x8xf32>
+  %0 = vector.transpose %arg0, [2, 1, 0] : vector<8x1x8xf32> to vector<8x1x8xf32>
+  return %0 : vector<8x1x8xf32>
+}
+
+// CHECK-LABEL: func @transpose210_8x8x1
+func.func @transpose210_8x8x1xf32(%arg0: vector<8x8x1xf32>) -> vector<1x8x8xf32> {
+  //      CHECK: vector.shape_cast %{{.*}} : vector<8x8x1xf32> to vector<8x8xf32>
+  // CHECK-NEXT: vector.extract {{.*}}[0]
+  // CHECK-NEXT: vector.extract {{.*}}[1]
+  // CHECK-NEXT: vector.extract {{.*}}[2]
+  // CHECK-NEXT: vector.extract {{.*}}[3]
+  // CHECK-NEXT: vector.extract {{.*}}[4]
+  // CHECK-NEXT: vector.extract {{.*}}[5]
+  // CHECK-NEXT: vector.extract {{.*}}[6]
+  // CHECK-NEXT: vector.extract {{.*}}[7]
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.insert {{.*}}[0]
+  // CHECK-NEXT: vector.insert {{.*}}[1]
+  // CHECK-NEXT: vector.insert {{.*}}[2]
+  // CHECK-NEXT: vector.insert {{.*}}[3]
+  // CHECK-NEXT: vector.insert {{.*}}[4]
+  // CHECK-NEXT: vector.insert {{.*}}[5]
+  // CHECK-NEXT: vector.insert {{.*}}[6]
+  // CHECK-NEXT: vector.insert {{.*}}[7]
+  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<1x8x8xf32>
+  %0 = vector.transpose %arg0, [2, 1, 0] : vector<8x8x1xf32> to vector<1x8x8xf32>
+  return %0 : vector<1x8x8xf32>
+}
+
+// CHECK-LABEL: func @transpose210_1x8x8
+func.func @transpose210_1x8x8xf32(%arg0: vector<1x8x8xf32>) -> vector<8x8x1xf32> {
+  //      CHECK: vector.extract {{.*}}[0, 0]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 1]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 2]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 3]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 4]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 5]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 6]
+  // CHECK-NEXT: vector.extract {{.*}}[0, 7]
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // CHECK-NEXT: vector.insert {{.*}}[0]
+  // CHECK-NEXT: vector.insert {{.*}}[1]
+  // CHECK-NEXT: vector.insert {{.*}}[2]
+  // CHECK-NEXT: vector.insert {{.*}}[3]
+  // CHECK-NEXT: vector.insert {{.*}}[4]
+  // CHECK-NEXT: vector.insert {{.*}}[5]
+  // CHECK-NEXT: vector.insert {{.*}}[6]
+  // CHECK-NEXT: vector.insert {{.*}}[7]
+  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x8x1xf32>
+  %0 = vector.transpose %arg0, [2, 1, 0] : vector<1x8x8xf32> to vector<8x8x1xf32>
+  return %0 : vector<8x8x1xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+    %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.vector.lower_transpose avx2_lowering_strategy = true
+    } : !transform.op<"func.func">
+    transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/Vector/transform-vector.mlir b/mlir/test/Dialect/Vector/transform-vector.mlir
index a0ca8c2fa9b6..75b29e22b4d2 100644
--- a/mlir/test/Dialect/Vector/transform-vector.mlir
+++ b/mlir/test/Dialect/Vector/transform-vector.mlir
@@ -16,7 +16,7 @@ func.func @matmul_tensors(
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.consumed}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %module_op : (!transform.any_op) -> !transform.any_op
-    %1, %loops:3 = transform.structured.tile_using_for %0 [8, 4, 2]
+    %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [8, 4, 2]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     %2 = transform.get_parent_op %1 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
     transform.structured.vectorize_children_and_apply_patterns %2 : (!transform.any_op) -> !transform.any_op
diff --git a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir
index 31bd19c0be8e..e48af3cd7aac 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir
@@ -1,23 +1,84 @@
 // RUN: mlir-opt %s --transform-interpreter --split-input-file | FileCheck %s
 
-// CHECK-LABEL: func @lower_permutation_with_mask_fixed_width(
+///----------------------------------------------------------------------------------------
+/// vector.transfer_write
+///----------------------------------------------------------------------------------------
+/// Input: 
+///   * vector.transfer_write op with a map which _is not_ the permutation of a
+///     minor identity
+/// Output:
+///   * vector.broadcast + vector.transfer_write with a map which _is_ the permutation of a
+///     minor identity
+
+// CHECK-LABEL: func @permutation_with_mask_xfer_write_fixed_width(
 //       CHECK:   %[[vec:.*]] = arith.constant dense<-2.000000e+00> : vector<7x1xf32>
 //       CHECK:   %[[mask:.*]] = arith.constant dense<[true, false, true, false, true, true, true]> : vector<7xi1>
 //       CHECK:   %[[b:.*]] = vector.broadcast %[[mask]] : vector<7xi1> to vector<1x7xi1>
 //       CHECK:   %[[tp:.*]] = vector.transpose %[[b]], [1, 0] : vector<1x7xi1> to vector<7x1xi1>
 //       CHECK:   vector.transfer_write %[[vec]], %{{.*}}[%{{.*}}, %{{.*}}], %[[tp]] {in_bounds = [false, true]} : vector<7x1xf32>, memref<?x?xf32>
-func.func @lower_permutation_with_mask_fixed_width(%A : memref<?x?xf32>, %base1 : index,
-                                       %base2 : index) {
+func.func @permutation_with_mask_xfer_write_fixed_width(%mem : memref<?x?xf32>, %base1 : index,
+                                                   %base2 : index) {
+
   %fn1 = arith.constant -2.0 : f32
   %vf0 = vector.splat %fn1 : vector<7xf32>
   %mask = arith.constant dense<[1, 0, 1, 0, 1, 1, 1]> : vector<7xi1>
-  vector.transfer_write %vf0, %A[%base1, %base2], %mask
+  vector.transfer_write %vf0, %mem[%base1, %base2], %mask
     {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [false]}
     : vector<7xf32>, memref<?x?xf32>
   return
 }
 
-// CHECK-LABEL:   func.func @permutation_with_mask_scalable(
+// CHECK:           func.func @permutation_with_mask_xfer_write_scalable(
+// CHECK-SAME:        %[[ARG_0:.*]]: vector<4x[8]xi16>,
+// CHECK-SAME:        %[[ARG_1:.*]]: memref<1x4x?x1xi16>,
+// CHECK-SAME:        %[[MASK:.*]]: vector<4x[8]xi1>) {
+// CHECK:             %[[C0:.*]] = arith.constant 0 : index
+// CHECK:             %[[BCAST_1:.*]] = vector.broadcast %[[ARG_0]] : vector<4x[8]xi16> to vector<1x4x[8]xi16>
+// CHECK:             %[[BCAST_2:.*]] = vector.broadcast %[[MASK]] : vector<4x[8]xi1> to vector<1x4x[8]xi1>
+// CHECK:             %[[TRANSPOSE_1:.*]] =  vector.transpose %[[BCAST_2]], [1, 2, 0] : vector<1x4x[8]xi1> to vector<4x[8]x1xi1>
+// CHECK:             %[[TRANSPOSE_2:.*]] =  vector.transpose %[[BCAST_1]], [1, 2, 0] : vector<1x4x[8]xi16> to vector<4x[8]x1xi16>
+// CHECK:             vector.transfer_write %[[TRANSPOSE_2]], %[[ARG_1]]{{.*}}, %[[TRANSPOSE_1]] {in_bounds = [true, true, true]} : vector<4x[8]x1xi16>, memref<1x4x?x1xi16>
+func.func @permutation_with_mask_xfer_write_scalable(%arg0: vector<4x[8]xi16>, %mem: memref<1x4x?x1xi16>, %mask:  vector<4x[8]xi1>){
+     %c0 = arith.constant 0 : index
+      vector.transfer_write %arg0, %mem[%c0, %c0, %c0, %c0], %mask {in_bounds = [true, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+} : vector<4x[8]xi16>, memref<1x4x?x1xi16>
+
+    return
+}
+
+///----------------------------------------------------------------------------------------
+/// vector.transfer_read
+///----------------------------------------------------------------------------------------
+/// Input: 
+///   * vector.transfer_read op with a permutation map
+/// Output:
+///   * vector.transfer_read with a permutation map composed of leading zeros followed by a minor identiy +
+///     vector.transpose op
+
+// CHECK-LABEL:   func.func @permutation_with_mask_xfer_read_fixed_width(
+// CHECK-SAME:      %[[ARG_0:.*]]: memref<?x?xf32>,
+// CHECK-SAME:      %[[IDX_1:.*]]: index,
+// CHECK-SAME:      %[[IDX_2:.*]]: index) -> vector<8x4x2xf32> {
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[PASS_THROUGH:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[MASK:.*]] = vector.create_mask %[[IDX_2]], %[[IDX_1]] : vector<2x4xi1>
+// CHECK:           %[[T_READ:.*]] = vector.transfer_read %[[ARG_0]]{{\[}}%[[C0]], %[[C0]]], %[[PASS_THROUGH]], %[[MASK]] {in_bounds = [true, true]} : memref<?x?xf32>, vector<2x4xf32>
+// CHECK:           %[[BCAST:.*]] = vector.broadcast %[[T_READ]] : vector<2x4xf32> to vector<8x2x4xf32>
+// CHECK:           %[[TRANSPOSE:.*]] = vector.transpose %[[BCAST]], [0, 2, 1] : vector<8x2x4xf32> to vector<8x4x2xf32>
+// CHECK:           return %[[TRANSPOSE]] : vector<8x4x2xf32>
+func.func @permutation_with_mask_xfer_read_fixed_width(%mem: memref<?x?xf32>, %dim_1: index, %dim_2: index) -> (vector<8x4x2xf32>) {
+
+  %c0 = arith.constant 0 : index
+  %cst_0 = arith.constant 0.000000e+00 : f32
+
+  %mask = vector.create_mask %dim_2, %dim_1 : vector<2x4xi1>
+  %1 = vector.transfer_read %mem[%c0, %c0], %cst_0, %mask
+    {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (0, d1, d0)>}
+    : memref<?x?xf32>, vector<8x4x2xf32>
+  return %1 : vector<8x4x2xf32>
+}
+
+// CHECK-LABEL:   func.func @permutation_with_mask_xfer_read_scalable(
 // CHECK-SAME:      %[[ARG_0:.*]]: memref<?x?xf32>,
 // CHECK-SAME:      %[[IDX_1:.*]]: index,
 // CHECK-SAME:      %[[IDX_2:.*]]: index) -> vector<8x[4]x2xf32> {
@@ -28,37 +89,18 @@ func.func @lower_permutation_with_mask_fixed_width(%A : memref<?x?xf32>, %base1
 // CHECK:           %[[BCAST:.*]] = vector.broadcast %[[T_READ]] : vector<2x[4]xf32> to vector<8x2x[4]xf32>
 // CHECK:           %[[TRANSPOSE:.*]] = vector.transpose %[[BCAST]], [0, 2, 1] : vector<8x2x[4]xf32> to vector<8x[4]x2xf32>
 // CHECK:           return %[[TRANSPOSE]] : vector<8x[4]x2xf32>
-// CHECK:         }
-func.func @permutation_with_mask_scalable(%2: memref<?x?xf32>, %dim_1: index, %dim_2: index) -> (vector<8x[4]x2xf32>) {
+func.func @permutation_with_mask_xfer_read_scalable(%mem: memref<?x?xf32>, %dim_1: index, %dim_2: index) -> (vector<8x[4]x2xf32>) {
 
   %c0 = arith.constant 0 : index
   %cst_0 = arith.constant 0.000000e+00 : f32
 
   %mask = vector.create_mask %dim_2, %dim_1 : vector<2x[4]xi1>
-  %1 = vector.transfer_read %2[%c0, %c0], %cst_0, %mask
+  %1 = vector.transfer_read %mem[%c0, %c0], %cst_0, %mask
     {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (0, d1, d0)>}
     : memref<?x?xf32>, vector<8x[4]x2xf32>
   return %1 : vector<8x[4]x2xf32>
 }
 
-// CHECK:           func.func @permutation_with_mask_transfer_write_scalable(
-// CHECK-SAME:        %[[ARG_0:.*]]: vector<4x[8]xi16>,
-// CHECK-SAME:        %[[ARG_1:.*]]: memref<1x4x?x1x1x1x1xi16>,
-// CHECK-SAME:        %[[MASK:.*]]: vector<4x[8]xi1>) {
-// CHECK:             %[[C0:.*]] = arith.constant 0 : index
-// CHECK:             %[[BCAST_1:.*]] = vector.broadcast %[[ARG_0]] : vector<4x[8]xi16> to vector<1x1x1x1x4x[8]xi16>
-// CHECK:             %[[BCAST_2:.*]] = vector.broadcast %[[MASK]] : vector<4x[8]xi1> to vector<1x1x1x1x4x[8]xi1>
-// CHECK:             %[[TRANSPOSE_1:.*]] = vector.transpose %[[BCAST_2]], [4, 5, 0, 1, 2, 3] : vector<1x1x1x1x4x[8]xi1> to vector<4x[8]x1x1x1x1xi1>
-// CHECK:             %[[TRANSPOSE_2:.*]] = vector.transpose %[[BCAST_1]], [4, 5, 0, 1, 2, 3] : vector<1x1x1x1x4x[8]xi16> to vector<4x[8]x1x1x1x1xi16>
-// CHECK:             vector.transfer_write %[[TRANSPOSE_2]], %[[ARG_1]]{{\[}}%[[C0]], %[[C0]], %[[C0]], %[[C0]], %[[C0]], %[[C0]], %[[C0]]], %[[TRANSPOSE_1]] {in_bounds = [true, true, true, true, true, true]} : vector<4x[8]x1x1x1x1xi16>, memref<1x4x?x1x1x1x1xi16>
-// CHECK:             return
-func.func @permutation_with_mask_transfer_write_scalable(%arg0: vector<4x[8]xi16>, %arg1: memref<1x4x?x1x1x1x1xi16>, %mask:  vector<4x[8]xi1>){
-     %c0 = arith.constant 0 : index
-      vector.transfer_write %arg0, %arg1[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %mask {in_bounds = [true, true], permutation_map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2)>
-} : vector<4x[8]xi16>, memref<1x4x?x1x1x1x1xi16>
-
-    return
-}
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
     %f = transform.structured.match ops{["func.func"]} in %module_op
diff --git a/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir b/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir
index 628a8ce50959..219a72df52a1 100644
--- a/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir
+++ b/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir
@@ -110,6 +110,17 @@ func.func @transpose(%arg0: vector<2x4xf32>) -> vector<4x2xf32> {
   return %0 : vector<4x2xf32>
 }
 
+/// Scalable vectors are not supported
+
+// CHECK-LABEL: func @transpose_scalable
+// CHECK-NOT: vector.shuffle
+// CHECK-NOT: vector.shape_cast
+// CHECK: vector.transpose
+func.func @transpose_scalable(%arg0: vector<2x[4]xf32>) -> vector<[4]x2xf32> {
+  %0 = vector.transpose %arg0, [1, 0] : vector<2x[4]xf32> to vector<[4]x2xf32>
+  return %0 : vector<[4]x2xf32>
+}
+
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
@@ -132,502 +143,22 @@ func.func @transpose(%arg0: vector<2x4xf32>) -> vector<4x2xf32> {
   return %0 : vector<4x2xf32>
 }
 
+/// Scalable vectors are not supported
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
-    %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
-    transform.apply_patterns to %func_op {
-      transform.apply_patterns.vector.lower_transpose lowering_strategy = "flat_transpose"
-    } : !transform.op<"func.func">
-    transform.yield
-  }
-}
-
-// -----
-
-// CHECK-LABEL: func @transpose4x8
-func.func @transpose4x8xf32(%arg0: vector<4x8xf32>) -> vector<8x4xf32> {
-  //      CHECK: vector.extract {{.*}}[0]
-  // CHECK-NEXT: vector.extract {{.*}}[1]
-  // CHECK-NEXT: vector.extract {{.*}}[2]
-  // CHECK-NEXT: vector.extract {{.*}}[3]
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 8, 9, 4, 5, 12, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 3, 10, 11, 6, 7, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 8, 9, 4, 5, 12, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 3, 10, 11, 6, 7, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.insert {{.*}}[0]
-  // CHECK-NEXT: vector.insert {{.*}}[1]
-  // CHECK-NEXT: vector.insert {{.*}}[2]
-  // CHECK-NEXT: vector.insert {{.*}}[3]
-  // CHECK-NEXT: vector.shape_cast {{.*}} vector<4x8xf32> to vector<32xf32>
-  // CHECK-NEXT: vector.shape_cast {{.*}} vector<32xf32> to vector<8x4xf32>
-  %0 = vector.transpose %arg0, [1, 0] : vector<4x8xf32> to vector<8x4xf32>
-  return %0 : vector<8x4xf32>
-}
-
-// CHECK-LABEL: func @transpose021_1x4x8
-func.func @transpose021_1x4x8xf32(%arg0: vector<1x4x8xf32>) -> vector<1x8x4xf32> {
-  //      CHECK: vector.extract {{.*}}[0, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 1]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 2]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 3]
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 8, 9, 4, 5, 12, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 3, 10, 11, 6, 7, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 8, 9, 4, 5, 12, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 3, 10, 11, 6, 7, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.insert {{.*}}[0]
-  // CHECK-NEXT: vector.insert {{.*}}[1]
-  // CHECK-NEXT: vector.insert {{.*}}[2]
-  // CHECK-NEXT: vector.insert {{.*}}[3]
-  // CHECK-NEXT: vector.shape_cast {{.*}} vector<4x8xf32> to vector<32xf32>
-  // CHECK-NEXT: vector.shape_cast {{.*}} vector<32xf32> to vector<1x8x4xf32>
-  %0 = vector.transpose %arg0, [0, 2, 1] : vector<1x4x8xf32> to vector<1x8x4xf32>
-  return %0 : vector<1x8x4xf32>
-}
-
-// CHECK-LABEL: func @transpose8x8
-func.func @transpose8x8xf32(%arg0: vector<8x8xf32>) -> vector<8x8xf32> {
-  //      CHECK: vector.extract {{.*}}[0]
-  // CHECK-NEXT: vector.extract {{.*}}[1]
-  // CHECK-NEXT: vector.extract {{.*}}[2]
-  // CHECK-NEXT: vector.extract {{.*}}[3]
-  // CHECK-NEXT: vector.extract {{.*}}[4]
-  // CHECK-NEXT: vector.extract {{.*}}[5]
-  // CHECK-NEXT: vector.extract {{.*}}[6]
-  // CHECK-NEXT: vector.extract {{.*}}[7]
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.insert {{.*}}[0]
-  // CHECK-NEXT: vector.insert {{.*}}[1]
-  // CHECK-NEXT: vector.insert {{.*}}[2]
-  // CHECK-NEXT: vector.insert {{.*}}[3]
-  // CHECK-NEXT: vector.insert {{.*}}[4]
-  // CHECK-NEXT: vector.insert {{.*}}[5]
-  // CHECK-NEXT: vector.insert {{.*}}[6]
-  // CHECK-NEXT: vector.insert {{.*}}[7]
-  %0 = vector.transpose %arg0, [1, 0] : vector<8x8xf32> to vector<8x8xf32>
-  return %0 : vector<8x8xf32>
-}
-
-// CHECK-LABEL: func @transpose021_1x8x8
-func.func @transpose021_1x8x8xf32(%arg0: vector<1x8x8xf32>) -> vector<1x8x8xf32> {
-  //      CHECK: vector.extract {{.*}}[0, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 1]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 2]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 3]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 4]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 5]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 6]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 7]
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.insert {{.*}}[0]
-  // CHECK-NEXT: vector.insert {{.*}}[1]
-  // CHECK-NEXT: vector.insert {{.*}}[2]
-  // CHECK-NEXT: vector.insert {{.*}}[3]
-  // CHECK-NEXT: vector.insert {{.*}}[4]
-  // CHECK-NEXT: vector.insert {{.*}}[5]
-  // CHECK-NEXT: vector.insert {{.*}}[6]
-  // CHECK-NEXT: vector.insert {{.*}}[7]
-  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<1x8x8xf32>
-  %0 = vector.transpose %arg0, [0, 2, 1] : vector<1x8x8xf32> to vector<1x8x8xf32>
-  return %0 : vector<1x8x8xf32>
-}
-
-// CHECK-LABEL: func @transpose120_8x1x8
-func.func @transpose120_8x1x8xf32(%arg0: vector<8x1x8xf32>) -> vector<1x8x8xf32> {
-  //      CHECK: vector.extract {{.*}}[0, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[1, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[2, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[3, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[4, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[5, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[6, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[7, 0]
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.insert {{.*}}[0]
-  // CHECK-NEXT: vector.insert {{.*}}[1]
-  // CHECK-NEXT: vector.insert {{.*}}[2]
-  // CHECK-NEXT: vector.insert {{.*}}[3]
-  // CHECK-NEXT: vector.insert {{.*}}[4]
-  // CHECK-NEXT: vector.insert {{.*}}[5]
-  // CHECK-NEXT: vector.insert {{.*}}[6]
-  // CHECK-NEXT: vector.insert {{.*}}[7]
-  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<1x8x8xf32>
-  %0 = vector.transpose %arg0, [1, 2, 0] : vector<8x1x8xf32> to vector<1x8x8xf32>
-  return %0 : vector<1x8x8xf32>
-}
-
-// CHECK-LABEL: func @transpose120_8x8x1
-func.func @transpose120_8x8x1xf32(%arg0: vector<8x8x1xf32>) -> vector<8x1x8xf32> {
-  //      CHECK: vector.shape_cast %{{.*}} : vector<8x8x1xf32> to vector<8x8xf32>
-  // CHECK-NEXT: vector.extract {{.*}}[0]
-  // CHECK-NEXT: vector.extract {{.*}}[1]
-  // CHECK-NEXT: vector.extract {{.*}}[2]
-  // CHECK-NEXT: vector.extract {{.*}}[3]
-  // CHECK-NEXT: vector.extract {{.*}}[4]
-  // CHECK-NEXT: vector.extract {{.*}}[5]
-  // CHECK-NEXT: vector.extract {{.*}}[6]
-  // CHECK-NEXT: vector.extract {{.*}}[7]
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.insert {{.*}}[0]
-  // CHECK-NEXT: vector.insert {{.*}}[1]
-  // CHECK-NEXT: vector.insert {{.*}}[2]
-  // CHECK-NEXT: vector.insert {{.*}}[3]
-  // CHECK-NEXT: vector.insert {{.*}}[4]
-  // CHECK-NEXT: vector.insert {{.*}}[5]
-  // CHECK-NEXT: vector.insert {{.*}}[6]
-  // CHECK-NEXT: vector.insert {{.*}}[7]
-  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x1x8xf32>
-  %0 = vector.transpose %arg0, [1, 2, 0] : vector<8x8x1xf32> to vector<8x1x8xf32>
-  return %0 : vector<8x1x8xf32>
-}
-
-// CHECK-LABEL: func @transpose102_8x8x1
-func.func @transpose102_8x8x1xf32(%arg0: vector<8x8x1xf32>) -> vector<8x8x1xf32> {
-  //      CHECK: vector.shape_cast %{{.*}} : vector<8x8x1xf32> to vector<8x8xf32>
-  // CHECK-NEXT: vector.extract {{.*}}[0]
-  // CHECK-NEXT: vector.extract {{.*}}[1]
-  // CHECK-NEXT: vector.extract {{.*}}[2]
-  // CHECK-NEXT: vector.extract {{.*}}[3]
-  // CHECK-NEXT: vector.extract {{.*}}[4]
-  // CHECK-NEXT: vector.extract {{.*}}[5]
-  // CHECK-NEXT: vector.extract {{.*}}[6]
-  // CHECK-NEXT: vector.extract {{.*}}[7]
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.insert {{.*}}[0]
-  // CHECK-NEXT: vector.insert {{.*}}[1]
-  // CHECK-NEXT: vector.insert {{.*}}[2]
-  // CHECK-NEXT: vector.insert {{.*}}[3]
-  // CHECK-NEXT: vector.insert {{.*}}[4]
-  // CHECK-NEXT: vector.insert {{.*}}[5]
-  // CHECK-NEXT: vector.insert {{.*}}[6]
-  // CHECK-NEXT: vector.insert {{.*}}[7]
-  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x8x1xf32>
-  %0 = vector.transpose %arg0, [1, 0, 2] : vector<8x8x1xf32> to vector<8x8x1xf32>
-  return %0 : vector<8x8x1xf32>
-}
-
-// CHECK-LABEL: func @transpose201_8x1x8
-func.func @transpose201_8x1x8xf32(%arg0: vector<8x1x8xf32>) -> vector<8x8x1xf32> {
-  //      CHECK: vector.extract {{.*}}[0, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[1, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[2, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[3, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[4, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[5, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[6, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[7, 0]
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.insert {{.*}}[0]
-  // CHECK-NEXT: vector.insert {{.*}}[1]
-  // CHECK-NEXT: vector.insert {{.*}}[2]
-  // CHECK-NEXT: vector.insert {{.*}}[3]
-  // CHECK-NEXT: vector.insert {{.*}}[4]
-  // CHECK-NEXT: vector.insert {{.*}}[5]
-  // CHECK-NEXT: vector.insert {{.*}}[6]
-  // CHECK-NEXT: vector.insert {{.*}}[7]
-  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x8x1xf32>
-  %0 = vector.transpose %arg0, [2, 0, 1] : vector<8x1x8xf32> to vector<8x8x1xf32>
-  return %0 : vector<8x8x1xf32>
-}
-
-// CHECK-LABEL: func @transpose201_1x8x8
-func.func @transpose201_1x8x8xf32(%arg0: vector<1x8x8xf32>) -> vector<8x1x8xf32> {
-  //      CHECK: vector.extract {{.*}}[0, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 1]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 2]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 3]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 4]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 5]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 6]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 7]
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.insert {{.*}}[0]
-  // CHECK-NEXT: vector.insert {{.*}}[1]
-  // CHECK-NEXT: vector.insert {{.*}}[2]
-  // CHECK-NEXT: vector.insert {{.*}}[3]
-  // CHECK-NEXT: vector.insert {{.*}}[4]
-  // CHECK-NEXT: vector.insert {{.*}}[5]
-  // CHECK-NEXT: vector.insert {{.*}}[6]
-  // CHECK-NEXT: vector.insert {{.*}}[7]
-  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x1x8xf32>
-  %0 = vector.transpose %arg0, [2, 0, 1] : vector<1x8x8xf32> to vector<8x1x8xf32>
-  return %0 : vector<8x1x8xf32>
-}
-
-// CHECK-LABEL: func @transpose210_8x1x8
-func.func @transpose210_8x1x8xf32(%arg0: vector<8x1x8xf32>) -> vector<8x1x8xf32> {
-  //      CHECK: vector.extract {{.*}}[0, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[1, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[2, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[3, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[4, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[5, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[6, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[7, 0]
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.insert {{.*}}[0]
-  // CHECK-NEXT: vector.insert {{.*}}[1]
-  // CHECK-NEXT: vector.insert {{.*}}[2]
-  // CHECK-NEXT: vector.insert {{.*}}[3]
-  // CHECK-NEXT: vector.insert {{.*}}[4]
-  // CHECK-NEXT: vector.insert {{.*}}[5]
-  // CHECK-NEXT: vector.insert {{.*}}[6]
-  // CHECK-NEXT: vector.insert {{.*}}[7]
-  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x1x8xf32>
-  %0 = vector.transpose %arg0, [2, 1, 0] : vector<8x1x8xf32> to vector<8x1x8xf32>
-  return %0 : vector<8x1x8xf32>
-}
-
-// CHECK-LABEL: func @transpose210_8x8x1
-func.func @transpose210_8x8x1xf32(%arg0: vector<8x8x1xf32>) -> vector<1x8x8xf32> {
-  //      CHECK: vector.shape_cast %{{.*}} : vector<8x8x1xf32> to vector<8x8xf32>
-  // CHECK-NEXT: vector.extract {{.*}}[0]
-  // CHECK-NEXT: vector.extract {{.*}}[1]
-  // CHECK-NEXT: vector.extract {{.*}}[2]
-  // CHECK-NEXT: vector.extract {{.*}}[3]
-  // CHECK-NEXT: vector.extract {{.*}}[4]
-  // CHECK-NEXT: vector.extract {{.*}}[5]
-  // CHECK-NEXT: vector.extract {{.*}}[6]
-  // CHECK-NEXT: vector.extract {{.*}}[7]
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.insert {{.*}}[0]
-  // CHECK-NEXT: vector.insert {{.*}}[1]
-  // CHECK-NEXT: vector.insert {{.*}}[2]
-  // CHECK-NEXT: vector.insert {{.*}}[3]
-  // CHECK-NEXT: vector.insert {{.*}}[4]
-  // CHECK-NEXT: vector.insert {{.*}}[5]
-  // CHECK-NEXT: vector.insert {{.*}}[6]
-  // CHECK-NEXT: vector.insert {{.*}}[7]
-  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<1x8x8xf32>
-  %0 = vector.transpose %arg0, [2, 1, 0] : vector<8x8x1xf32> to vector<1x8x8xf32>
-  return %0 : vector<1x8x8xf32>
-}
-
-// CHECK-LABEL: func @transpose210_1x8x8
-func.func @transpose210_1x8x8xf32(%arg0: vector<1x8x8xf32>) -> vector<8x8x1xf32> {
-  //      CHECK: vector.extract {{.*}}[0, 0]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 1]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 2]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 3]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 4]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 5]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 6]
-  // CHECK-NEXT: vector.extract {{.*}}[0, 7]
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
-  // CHECK-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
-  // CHECK-NEXT: vector.insert {{.*}}[0]
-  // CHECK-NEXT: vector.insert {{.*}}[1]
-  // CHECK-NEXT: vector.insert {{.*}}[2]
-  // CHECK-NEXT: vector.insert {{.*}}[3]
-  // CHECK-NEXT: vector.insert {{.*}}[4]
-  // CHECK-NEXT: vector.insert {{.*}}[5]
-  // CHECK-NEXT: vector.insert {{.*}}[6]
-  // CHECK-NEXT: vector.insert {{.*}}[7]
-  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x8x1xf32>
-  %0 = vector.transpose %arg0, [2, 1, 0] : vector<1x8x8xf32> to vector<8x8x1xf32>
-  return %0 : vector<8x8x1xf32>
+// CHECK-LABEL: func @transpose_scalable(
+func.func @transpose_scalable(%arg0: vector<2x[4]xf32>) -> vector<[4]x2xf32> {
+  // CHECK-NOT:       vector.shape_cast
+  // CHECK-NOT:       vector.flat_transpose
+  // CHECK:           vector.transpose
+  %0 = vector.transpose %arg0, [1, 0] : vector<2x[4]xf32> to vector<[4]x2xf32>
+  return %0 : vector<[4]x2xf32>
 }
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
     %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
     transform.apply_patterns to %func_op {
-      transform.apply_patterns.vector.lower_transpose avx2_lowering_strategy = true
+      transform.apply_patterns.vector.lower_transpose lowering_strategy = "flat_transpose"
     } : !transform.op<"func.func">
     transform.yield
   }
@@ -635,6 +166,7 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+// CHECK-LABEL: @transpose_shuffle16x16xf32(
 func.func @transpose_shuffle16x16xf32(%arg0: vector<16x16xf32>) -> vector<16x16xf32> {
   // CHECK: vector.shuffle {{.*}} [0, 16, 1, 17, 4, 20, 5, 21, 8, 24, 9, 25, 12, 28, 13, 29] : vector<16xf32>, vector<16xf32>
   // CHECK: vector.shuffle {{.*}} [2, 18, 3, 19, 6, 22, 7, 23, 10, 26, 11, 27, 14, 30, 15, 31] : vector<16xf32>, vector<16xf32>
@@ -704,6 +236,14 @@ func.func @transpose_shuffle16x16xf32(%arg0: vector<16x16xf32>) -> vector<16x16x
   return %0 : vector<16x16xf32>
 }
 
+// CHECK-LABEL: @transpose_shuffle16x16xf32_scalable(
+func.func @transpose_shuffle16x16xf32_scalable(%arg0: vector<16x[16]xf32>) -> vector<[16]x16xf32> {
+  // CHECK-NOT: vector.shuffle
+  // CHECK: vector.transpose
+  %0 = vector.transpose %arg0, [1, 0] : vector<16x[16]xf32> to vector<[16]x16xf32>
+  return %0 : vector<[16]x16xf32>
+}
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
     %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
@@ -786,6 +326,14 @@ func.func @transpose021_shuffle16x16xf32(%arg0: vector<1x16x16xf32>) -> vector<1
   return %0 : vector<1x16x16xf32>
 }
 
+// CHECK-LABEL: func @transpose021_shuffle16x16xf32_scalable
+func.func @transpose021_shuffle16x16xf32_scalable(%arg0: vector<1x16x[16]xf32>) -> vector<1x[16]x16xf32> {
+  // CHECK-NOT: vector.shuffle
+  // CHECK: vector.transpose
+  %0 = vector.transpose %arg0, [0, 2, 1] : vector<1x16x[16]xf32> to vector<1x[16]x16xf32>
+  return %0 : vector<1x[16]x16xf32>
+}
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
     %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
@@ -830,23 +378,14 @@ func.func @transpose10_1xnx4xf32(%arg0: vector<1x[4]xf32>) -> vector<[4]x1xf32>
 
 /// Scalable unit dim should not be lowered to shape_cast.
 
-// CHECK-LABEL: func @transpose10_4xnx1xf32
-func.func @transpose10_4xnx1xf32(%arg0: vector<4x[1]xf32>) -> vector<[1]x4xf32> {
+// CHECK-LABEL: func @transpose10_4x1xf32_scalable
+func.func @transpose10_4x1xf32_scalable(%arg0: vector<4x[1]xf32>) -> vector<[1]x4xf32> {
   // CHECK-NOT: vector.shape_cast
   // CHECK: vector.transpose %{{.*}} : vector<4x[1]xf32> to vector<[1]x4xf32>
   %0 = vector.transpose %arg0, [1, 0] : vector<4x[1]xf32> to vector<[1]x4xf32>
   return %0 : vector<[1]x4xf32>
 }
 
-// CHECK-LABEL: func @transpose10_nx4xnx1xf32
-func.func @transpose10_nx4xnx1xf32(%arg0: vector<4x[1]xf32>) -> vector<[1]x4xf32> {
-  // CHECK-NOT: vector.shape_cast
-  // CHECK: vector.transpose %{{.*}} : vector<4x[1]xf32> to vector<[1]x4xf32>
-  %0 = vector.transpose %arg0, [1, 0] : vector<4x[1]xf32> to vector<[1]x4xf32>
-
-  return %0 : vector<[1]x4xf32>
-}
-
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
     %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
@@ -856,25 +395,3 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-
-// -----
-
-// Scalable transposes should not be lowered to vector.shuffle.
-
-// CHECK-LABEL: func @transpose_nx8x2xf32
-func.func @transpose_nx8x2xf32(%arg0: vector<[8]x2xf32>) -> vector<2x[8]xf32> {
-  // CHECK-NOT: vector.shuffle
-  // CHECK: vector.transpose %{{.*}} : vector<[8]x2xf32> to vector<2x[8]xf32>
-  %0 = vector.transpose %arg0, [1, 0] : vector<[8]x2xf32> to vector<2x[8]xf32>
-  return %0 : vector<2x[8]xf32>
-}
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
-    %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
-    transform.apply_patterns to %func_op {
-      transform.apply_patterns.vector.lower_transpose lowering_strategy = "shuffle_1d"
-    } : !transform.op<"func.func">
-    transform.yield
-  }
-}
diff --git a/mlir/test/IR/custom-float-attr-roundtrip.mlir b/mlir/test/IR/custom-float-attr-roundtrip.mlir
new file mode 100644
index 000000000000..a8da89ba7372
--- /dev/null
+++ b/mlir/test/IR/custom-float-attr-roundtrip.mlir
@@ -0,0 +1,57 @@
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics| FileCheck %s
+
+// CHECK-LABEL: @test_enum_attr_roundtrip
+func.func @test_enum_attr_roundtrip() -> () {
+  // CHECK: attr = #test.custom_float<"float" : 2.000000e+00>
+  "test.op"() {attr = #test.custom_float<"float" : 2.>} : () -> ()
+  // CHECK: attr = #test.custom_float<"double" : 2.000000e+00>
+  "test.op"() {attr = #test.custom_float<"double" : 2.>} : () -> ()
+   // CHECK: attr = #test.custom_float<"fp80" : 2.000000e+00>
+  "test.op"() {attr = #test.custom_float<"fp80" : 2.>} : () -> ()
+  // CHECK: attr = #test.custom_float<"float" : 0x7FC00000>
+  "test.op"() {attr = #test.custom_float<"float" : 0x7FC00000>} : () -> ()
+  // CHECK: attr = #test.custom_float<"double" : 0x7FF0000001000000>
+  "test.op"() {attr = #test.custom_float<"double" : 0x7FF0000001000000>} : () -> ()
+  // CHECK: attr = #test.custom_float<"fp80" : 0x7FFFC000000000100000>
+  "test.op"() {attr = #test.custom_float<"fp80" : 0x7FFFC000000000100000>} : () -> ()
+  return
+}
+
+// -----
+
+// Verify literal must be hex or float
+
+// expected-error @below {{unexpected decimal integer literal for a floating point value}}
+// expected-note @below {{add a trailing dot to make the literal a float}}
+"test.op"() {attr = #test.custom_float<"float" : 42>} : () -> ()
+
+// -----
+
+// Integer value must be in the width of the floating point type
+
+// expected-error @below {{hexadecimal float constant out of range for type}}
+"test.op"() {attr = #test.custom_float<"float" : 0x7FC000000>} : () -> ()
+
+
+// -----
+
+// Integer value must be in the width of the floating point type
+
+// expected-error @below {{hexadecimal float constant out of range for type}}
+"test.op"() {attr = #test.custom_float<"double" : 0x7FC000007FC0000000>} : () -> ()
+
+
+// -----
+
+// Integer value must be in the width of the floating point type
+
+// expected-error @below {{hexadecimal float constant out of range for type}}
+"test.op"() {attr = #test.custom_float<"fp80" : 0x7FC0000007FC0000007FC000000>} : () -> ()
+
+// -----
+
+// Value must be a floating point literal or integer literal
+
+// expected-error @below {{expected floating point literal}}
+"test.op"() {attr = #test.custom_float<"float" : "blabla">} : () -> ()
+
diff --git a/mlir/test/IR/parser.mlir b/mlir/test/IR/parser.mlir
index bebbb876391d..020942e7f4c1 100644
--- a/mlir/test/IR/parser.mlir
+++ b/mlir/test/IR/parser.mlir
@@ -1105,6 +1105,30 @@ func.func @bfloat16_special_values() {
   return
 }
 
+// CHECK-LABEL: @f80_special_values
+func.func @f80_special_values() {
+  // F80 signaling NaNs.
+  // CHECK: arith.constant 0x7FFFE000000000000001 : f80
+  %0 = arith.constant 0x7FFFE000000000000001 : f80
+  // CHECK: arith.constant 0x7FFFB000000000000011 : f80
+  %1 = arith.constant 0x7FFFB000000000000011 : f80
+
+  // F80 quiet NaNs.
+  // CHECK: arith.constant 0x7FFFC000000000100000 : f80
+  %2 = arith.constant 0x7FFFC000000000100000 : f80
+  // CHECK: arith.constant 0x7FFFE000000001000000 : f80
+  %3 = arith.constant 0x7FFFE000000001000000 : f80
+
+  // F80 positive infinity.
+  // CHECK: arith.constant 0x7FFF8000000000000000 : f80
+  %4 = arith.constant 0x7FFF8000000000000000 : f80
+  // F80 negative infinity.
+  // CHECK: arith.constant 0xFFFF8000000000000000 : f80
+  %5 = arith.constant 0xFFFF8000000000000000 : f80
+
+  return
+}
+
 // We want to print floats in exponential notation with 6 significant digits,
 // but it may lead to precision loss when parsing back, in which case we print
 // the decimal form instead.
diff --git a/mlir/test/IR/print-unique-ssa-ids.mlir b/mlir/test/IR/print-unique-ssa-ids.mlir
new file mode 100644
index 000000000000..a2d2d9bb7907
--- /dev/null
+++ b/mlir/test/IR/print-unique-ssa-ids.mlir
@@ -0,0 +1,30 @@
+// RUN: mlir-opt -mlir-print-unique-ssa-ids %s | FileCheck %s
+// RUN: mlir-opt -mlir-print-op-generic %s | FileCheck %s
+// RUN: mlir-opt %s | FileCheck %s --check-prefix=LOCAL_SCOPE
+
+// CHECK: %arg3
+// CHECK: %7
+// LOCAL_SCOPE-NOT: %arg3
+// LOCAL_SCOPE-NOT: %7
+module {
+  func.func @uniqueSSAIDs(%arg0 : memref<i32>, %arg1 : memref<i32>) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    scf.for %arg2 = %c0 to %c8 step %c1 {
+      %a = memref.load %arg0[] : memref<i32>
+      %b = memref.load %arg1[] : memref<i32>
+      %0 = arith.addi %a, %b : i32
+      %1 = arith.subi %a, %b : i32
+      scf.yield
+    }
+    scf.for %arg2 = %c0 to %c8 step %c1 {
+      %a = memref.load %arg0[] : memref<i32>
+      %b = memref.load %arg1[] : memref<i32>
+      %0 = arith.addi %a, %b : i32
+      %1 = arith.subi %a, %b : i32
+      scf.yield
+    }
+    return
+  }
+}
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir
index 34c5351c8703..a8b6457d64be 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir
@@ -61,7 +61,7 @@ module attributes {transform.with_named_sequence} {
 
     // Step 1: Tile for size [4] x [4], which corresponds to SVLs x SVLs, where
     //         SVLs is the number of 32-bit elements in a vector of SVL bits.
-    %tiled_linalg_op, %loops:3 = transform.structured.tile_using_for %matmul_transpose_a[[4], [4], 1]
+    %tiled_linalg_op, %loops:3 = transform.structured.tile_using_for %matmul_transpose_a tile_sizes [[4], [4], 1]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
 
     // Step 2: Vectorize.
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir
index 2bfdaa8e8a2b..091665223188 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir
@@ -59,7 +59,7 @@ module attributes {transform.with_named_sequence} {
 
     // Step 1: Tile for size [4] x [4], which corresponds to SVLs x SVLs, where
     // SVLs is the number of 32-bit elements in a vector of SVL bits.
-    %tiled_linalg_op, %loops:3 = transform.structured.tile_using_for %matmul[[4], [4], 1]
+    %tiled_linalg_op, %loops:3 = transform.structured.tile_using_for %matmul tile_sizes [[4], [4], 1]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
 
     // Step 2: Vectorize.
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul-mixed-types.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul-mixed-types.mlir
index 9f06226a4f65..10ffed268817 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul-mixed-types.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul-mixed-types.mlir
@@ -84,7 +84,7 @@ module attributes {transform.with_named_sequence} {
     // Step 1: Tile for size [8] x [8] (unrolled by 4), which corresponds to
     // (2 x SVLs) x (2 x SVLs), where SVLs is the number of 32-bit elements in a
     // vector of SVL bits. This uses all four 32-bit SME virtual tiles.
-    %tiled_linalg_op, %loop_i, %loop_j, %loop_k = transform.structured.tile_using_for %matmul[[8], [8], 4]
+    %tiled_linalg_op, %loop_i, %loop_j, %loop_k = transform.structured.tile_using_for %matmul tile_sizes [[8], [8], 4]
       : (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">, !transform.op<"scf.for">, !transform.op<"scf.for">)
 
     // Step 2: Vectorize.
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir
index e376bdde24a1..ada744b322fe 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir
@@ -72,7 +72,7 @@ module attributes {transform.with_named_sequence} {
     // Step 1: Tile for size [8] x [8] (unrolled by 4), which corresponds to
     // (2 x SVLs) x (2 x SVLs), where SVLs is the number of 32-bit elements in a
     // vector of SVL bits. This uses all four 32-bit SME virtual tiles.
-    %tiled_linalg_op, %loop_i, %loop_j, %loop_k = transform.structured.tile_using_for %matmul[[8], [8], 4]
+    %tiled_linalg_op, %loop_i, %loop_j, %loop_k = transform.structured.tile_using_for %matmul tile_sizes [[8], [8], 4]
       : (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">, !transform.op<"scf.for">, !transform.op<"scf.for">)
 
     // Step 2: Vectorize.
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
index 68e474fe5cef..edb9de922808 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
@@ -96,7 +96,7 @@ module attributes {transform.with_named_sequence} {
       : (!transform.op<"func.func">) -> !transform.any_op
 
     // Step 1: Tile
-    %tiled_matmul, %loops:3 = transform.structured.tile_using_for %matmul [2, [4], 1]
+    %tiled_matmul, %loops:3 = transform.structured.tile_using_for %matmul tile_sizes [2, [4], 1]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
 
     // Step 2: Vectorize
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir
index 92c7039c8496..183625f9748c 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir
@@ -70,10 +70,10 @@ module @transforms attributes { transform.with_named_sequence } {
 
    // Step 1: Tile
    // Tile parallel dims
-   %tiled_linalg_op_p, %loops:4 = transform.structured.tile_using_for %mmt4d[1, 1, 0, 3, 3, 0]
+   %tiled_linalg_op_p, %loops:4 = transform.structured.tile_using_for %mmt4d tile_sizes [1, 1, 0, 3, 3, 0]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
    // Tile reduction dims
-   %tiled_linalg_op_r, %loops2:2 = transform.structured.tile_using_for %tiled_linalg_op_p[0, 0, 1, 0, 0, 1]
+   %tiled_linalg_op_r, %loops2:2 = transform.structured.tile_using_for %tiled_linalg_op_p tile_sizes [0, 0, 1, 0, 0, 1]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
    // Step 2: Vectorize
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir
index 5680882dccb1..10b29dd70177 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir
@@ -107,10 +107,10 @@ module @transforms attributes { transform.with_named_sequence } {
 
    // Step 1: Tile
    // Tile parallel dims
-   %tiled_linalg_op_p, %loops:4 = transform.structured.tile_using_for %mmt4d[1, 1, 0, 8, 8, 0]
+   %tiled_linalg_op_p, %loops:4 = transform.structured.tile_using_for %mmt4d tile_sizes [1, 1, 0, 8, 8, 0]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
    // Tile reduction dims
-   %tiled_linalg_op_r, %loops2:2 = transform.structured.tile_using_for %tiled_linalg_op_p[0, 0, 1, 0, 0, 1]
+   %tiled_linalg_op_r, %loops2:2 = transform.structured.tile_using_for %tiled_linalg_op_p tile_sizes [0, 0, 1, 0, 0, 1]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
    // Step 2: Vectorize
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir
index 443963fb8c59..9b46056918b5 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir
@@ -27,7 +27,7 @@ func.func @conv_1d(%arg0: memref<?xf32>, %arg1: memref<?xf32>, %arg2: memref<?xf
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.conv_1d"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loop = transform.structured.tile_using_for %0 [4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %1, %loop = transform.structured.tile_using_for %0 tile_sizes [4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
 }
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-wcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-wcf-call.mlir
index f652d707de05..d6726fe1a6b4 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-wcf-call.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-wcf-call.mlir
@@ -29,7 +29,7 @@ func.func @conv_1d_nwc_wcf(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.conv_1d_nwc_wcf"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:2 = transform.structured.tile_using_for %0 [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir
index 2eaba8233d69..bb77d5eb9b8d 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir
@@ -27,7 +27,7 @@ func.func @conv_2d(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.conv_2d"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:2 = transform.structured.tile_using_for %0 [2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-hwcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-hwcf-call.mlir
index eac8d8a6ea43..39415dff1cbb 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-hwcf-call.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-hwcf-call.mlir
@@ -29,7 +29,7 @@ func.func @conv_2d_nhwc_hwcf(%arg0: memref<?x?x?x?xf32>, %arg1: memref<?x?x?x?xf
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.conv_2d_nhwc_hwcf"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:4 = transform.structured.tile_using_for %0 [2, 3, 3, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [2, 3, 3, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir
index d5584cd67702..ece054ac7176 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir
@@ -27,7 +27,7 @@ func.func @conv_3d(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: me
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.conv_3d"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:3 = transform.structured.tile_using_for %0 [2, 2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [2, 2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-dhwcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-dhwcf-call.mlir
index 7dca79334565..ce169ee470c3 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-dhwcf-call.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-dhwcf-call.mlir
@@ -29,7 +29,7 @@ func.func @conv_3d_ndhwc_dhwcf(%arg0: memref<?x?x?x?x?xf32>, %arg1: memref<?x?x?
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.conv_3d_ndhwc_dhwcf"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:3 = transform.structured.tile_using_for %0 [0, 5, 5, 5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [0, 5, 5, 5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir
index fda7ffb0c753..41296cdfcb2d 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir
@@ -39,7 +39,7 @@ func.func @main() {
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loops:3 = transform.structured.tile_using_for %0 [1, 2, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [1, 2, 3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/block.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/block.mlir
index f79e7e68f382..ab4fd0e30d65 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/block.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/block.mlir
@@ -93,9 +93,9 @@ module {
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 4, 6 )
     // CHECK-NEXT: lvl = ( 2, 3, 2, 2 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 1,
-    // CHECK-NEXT: values : ( 1, 2, 0, 3, 4, 0, 0, 5, 6, 7, 8, 0,
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 1 )
+    // CHECK-NEXT: values : ( 1, 2, 0, 3, 4, 0, 0, 5, 6, 7, 8, 0 )
     // CHECK-NEXT: ----
     sparse_tensor.print %A : tensor<?x?xf64, #BSR>
 
@@ -103,9 +103,9 @@ module {
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 2, 3, 2, 2 )
     // CHECK-NEXT: lvl = ( 2, 3, 2, 2 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 1
-    // CHECK-NEXT: values : ( 1, 2, 0, 3, 4, 0, 0, 5, 6, 7, 8, 0,
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 1 )
+    // CHECK-NEXT: values : ( 1, 2, 0, 3, 4, 0, 0, 5, 6, 7, 8, 0 )
     // CHECK-NEXT: ----
     %t1 = sparse_tensor.reinterpret_map %A : tensor<?x?xf64, #BSR>
                                           to tensor<?x?x2x2xf64, #DSDD>
@@ -115,9 +115,9 @@ module {
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 4, 6 )
     // CHECK-NEXT: lvl = ( 2, 3, 2, 2 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 1,
-    // CHECK-NEXT: values : ( 3, 6, 0, 9, 12, 0, 0, 15, 18, 21, 24, 0,
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 1 )
+    // CHECK-NEXT: values : ( 3, 6, 0, 9, 12, 0, 0, 15, 18, 21, 24, 0 )
     // CHECK-NEXT: ----
     %As = call @scale(%A) : (tensor<?x?xf64, #BSR>) -> (tensor<?x?xf64, #BSR>)
     sparse_tensor.print %As : tensor<?x?xf64, #BSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/block_majors.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/block_majors.mlir
index 3534e7d15207..caa0d6a71ed3 100755
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/block_majors.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/block_majors.mlir
@@ -108,9 +108,9 @@ module {
   // CHECK-NEXT: nse = 24
   // CHECK-NEXT: dim = ( 6, 16 )
   // CHECK-NEXT: lvl = ( 2, 4, 3, 4 )
-  // CHECK-NEXT: pos[1] : ( 0, 1, 2,
-  // CHECK-NEXT: crd[1] : ( 0, 2,
-  // CHECK-NEXT: values : ( 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7,
+  // CHECK-NEXT: pos[1] : ( 0, 1, 2 )
+  // CHECK-NEXT: crd[1] : ( 0, 2 )
+  // CHECK-NEXT: values : ( 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7 )
   // CHECK-NEXT: ----
   //
   func.func @foo1() {
@@ -134,9 +134,9 @@ module {
   // CHECK-NEXT: nse = 24
   // CHECK-NEXT: dim = ( 6, 16 )
   // CHECK-NEXT: lvl = ( 2, 4, 4, 3 )
-  // CHECK-NEXT: pos[1] : ( 0, 1, 2,
-  // CHECK-NEXT: crd[1] : ( 0, 2,
-  // CHECK-NEXT: values : ( 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 5, 0, 0, 0, 0, 6, 0, 0, 7,
+  // CHECK-NEXT: pos[1] : ( 0, 1, 2 )
+  // CHECK-NEXT: crd[1] : ( 0, 2 )
+  // CHECK-NEXT: values : ( 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 5, 0, 0, 0, 0, 6, 0, 0, 7 )
   // CHECK-NEXT: ----
   //
   func.func @foo2() {
@@ -160,9 +160,9 @@ module {
   // CHECK-NEXT: nse = 24
   // CHECK-NEXT: dim = ( 6, 16 )
   // CHECK-NEXT: lvl = ( 4, 2, 3, 4 )
-  // CHECK-NEXT: pos[1] : ( 0, 1, 1, 2, 2,
-  // CHECK-NEXT: crd[1] : ( 0, 1,
-  // CHECK-NEXT: values : ( 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7,
+  // CHECK-NEXT: pos[1] : ( 0, 1, 1, 2, 2 )
+  // CHECK-NEXT: crd[1] : ( 0, 1 )
+  // CHECK-NEXT: values : ( 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7 )
   // CHECK-NEXT: ----
   //
   func.func @foo3() {
@@ -186,9 +186,9 @@ module {
   // CHECK-NEXT: nse = 24
   // CHECK-NEXT: dim = ( 6, 16 )
   // CHECK-NEXT: lvl = ( 4, 2, 4, 3 )
-  // CHECK-NEXT: pos[1] : ( 0, 1, 1, 2, 2,
-  // CHECK-NEXT: crd[1] : ( 0, 1,
-  // CHECK-NEXT: values : ( 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 5, 0, 0, 0, 0, 6, 0, 0, 7,
+  // CHECK-NEXT: pos[1] : ( 0, 1, 1, 2, 2 )
+  // CHECK-NEXT: crd[1] : ( 0, 1 )
+  // CHECK-NEXT: values : ( 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 5, 0, 0, 0, 0, 6, 0, 0, 7 )
   // CHECK-NEXT: ----
   //
   func.func @foo4() {
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0.mlir
index 6a4902057362..7edb76cc8045 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0.mlir
@@ -111,11 +111,11 @@ module {
     // CHECK-NEXT: nse = 18
     // CHECK-NEXT: dim = ( 9, 4 )
     // CHECK-NEXT: lvl = ( 9, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 9,
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8,
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 7, 10, 12, 13, 16, 18,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 1, 0, 2, 3, 1, 0, 1, 2, 2, 3, 1, 0, 1, 2, 0, 1,
-    // CHECK-NEXT: values : ( 1, 3, 2, 1, 1, 1, 0.5, 1, 5, 2, 1.5, 1, 3.5, 1, 5, 2, 1, 0.5,
+    // CHECK-NEXT: pos[0] : ( 0, 9 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 7, 10, 12, 13, 16, 18 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 1, 0, 2, 3, 1, 0, 1, 2, 2, 3, 1, 0, 1, 2, 0, 1 )
+    // CHECK-NEXT: values : ( 1, 3, 2, 1, 1, 1, 0.5, 1, 5, 2, 1.5, 1, 3.5, 1, 5, 2, 1, 0.5 )
     // CHECK-NEXT: ----
     //
     %0 = call @concat_sparse_sparse(%sm24cc, %sm34cd, %sm44dc)
@@ -142,11 +142,11 @@ module {
     // CHECK-NEXT: nse = 18
     // CHECK-NEXT: dim = ( 9, 4 )
     // CHECK-NEXT: lvl = ( 9, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 9,
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8,
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 7, 10, 12, 13, 16, 18,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 1, 0, 2, 3, 1, 0, 1, 2, 2, 3, 1, 0, 1, 2, 0, 1,
-    // CHECK-NEXT: values : ( 1, 3, 2, 1, 1, 1, 0.5, 1, 5, 2, 1.5, 1, 3.5, 1, 5, 2, 1, 0.5,
+    // CHECK-NEXT: pos[0] : ( 0, 9 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 7, 10, 12, 13, 16, 18 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 1, 0, 2, 3, 1, 0, 1, 2, 2, 3, 1, 0, 1, 2, 0, 1 )
+    // CHECK-NEXT: values : ( 1, 3, 2, 1, 1, 1, 0.5, 1, 5, 2, 1.5, 1, 3.5, 1, 5, 2, 1, 0.5 )
     // CHECK-NEXT: ----
     //
     %2 = call @concat_mix_sparse(%m24, %sm34cd, %sm44dc)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir
index 9c9b0e3330c9..d17e110e2c2d 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir
@@ -144,11 +144,11 @@ module {
     // CHECK-NEXT: nse = 18
     // CHECK-NEXT: dim = ( 9, 4 )
     // CHECK-NEXT: lvl = ( 4, 9 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 5, 11, 16, 18
-    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 1, 3, 4, 6, 7, 8, 0, 2, 4, 5, 7, 2, 5
-    // CHECK-NEXT: values : ( 1, 1, 1, 1, 1, 2, 0.5, 5, 3.5, 5, 0.5, 3, 1, 2, 1.5, 2, 1, 1
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 5, 11, 16, 18 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 1, 3, 4, 6, 7, 8, 0, 2, 4, 5, 7, 2, 5 )
+    // CHECK-NEXT: values : ( 1, 1, 1, 1, 1, 2, 0.5, 5, 3.5, 5, 0.5, 3, 1, 2, 1.5, 2, 1, 1 )
     // CHECK-NEXT: ----
     //
     %4 = call @concat_sparse_sparse_perm(%sm24ccp, %sm34cd, %sm44dc)
@@ -173,11 +173,11 @@ module {
     // CHECK-NEXT: nse = 18
     // CHECK-NEXT: dim = ( 9, 4 )
     // CHECK-NEXT: lvl = ( 9, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 9
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 7, 10, 12, 13, 16, 18
-    // CHECK-NEXT: crd[1] : ( 0, 2, 1, 0, 2, 3, 1, 0, 1, 2, 2, 3, 1, 0, 1, 2, 0, 1
-    // CHECK-NEXT: values : ( 1, 3, 2, 1, 1, 1, 0.5, 1, 5, 2, 1.5, 1, 3.5, 1, 5, 2, 1, 0.5
+    // CHECK-NEXT: pos[0] : ( 0, 9 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 7, 10, 12, 13, 16, 18 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 1, 0, 2, 3, 1, 0, 1, 2, 2, 3, 1, 0, 1, 2, 0, 1 )
+    // CHECK-NEXT: values : ( 1, 3, 2, 1, 1, 1, 0.5, 1, 5, 2, 1.5, 1, 3.5, 1, 5, 2, 1, 0.5 )
     // CHECK-NEXT: ----
     //
     %6 = call @concat_mix_sparse_perm(%m24, %sm34cdp, %sm44dc)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir
index ae067bf18527..c2a4e95e7922 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir
@@ -116,11 +116,11 @@ module {
     // CHECK-NEXT: nse = 18
     // CHECK-NEXT: dim = ( 4, 9 )
     // CHECK-NEXT: lvl = ( 4, 9 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 5, 9, 14, 18
-    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 0, 2, 4, 6, 1, 4, 5, 6, 7, 2, 3, 5, 6
-    // CHECK-NEXT: values : ( 1, 1, 1, 1.5, 1, 3.1, 1, 0.5, 3.5, 2, 1, 1, 5, 2, 5, 2, 1, 0.5
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 5, 9, 14, 18 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 0, 2, 4, 6, 1, 4, 5, 6, 7, 2, 3, 5, 6 )
+    // CHECK-NEXT: values : ( 1, 1, 1, 1.5, 1, 3.1, 1, 0.5, 3.5, 2, 1, 1, 5, 2, 5, 2, 1, 0.5 )
     // CHECK-NEXT: ----
     //
     %8 = call @concat_sparse_sparse_dim1(%sm42cc, %sm43cd, %sm44dc)
@@ -140,11 +140,11 @@ module {
     // CHECK-NEXT: nse = 18
     // CHECK-NEXT: dim = ( 4, 9 )
     // CHECK-NEXT: lvl = ( 4, 9 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 5, 9, 14, 18
-    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 0, 2, 4, 6, 1, 4, 5, 6, 7, 2, 3, 5, 6
-    // CHECK-NEXT: values : ( 1, 1, 1, 1.5, 1, 3.1, 1, 0.5, 3.5, 2, 1, 1, 5, 2, 5, 2, 1, 0.5
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 5, 9, 14, 18 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 0, 2, 4, 6, 1, 4, 5, 6, 7, 2, 3, 5, 6 )
+    // CHECK-NEXT: values : ( 1, 1, 1, 1.5, 1, 3.1, 1, 0.5, 3.5, 2, 1, 1, 5, 2, 5, 2, 1, 0.5 )
     // CHECK-NEXT: ----
     //
     %10 = call @concat_mix_sparse_dim1(%m42, %sm43cd, %sm44dc)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir
index ce746f27c4d8..8fe7e08a66d3 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir
@@ -130,11 +130,11 @@ module {
     // CHECK-NEXT: nse = 18
     // CHECK-NEXT: dim = ( 4, 9 )
     // CHECK-NEXT: lvl = ( 9, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 9
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 7, 10, 12, 15, 17, 18
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 3, 3, 0, 1, 2, 2, 3, 1, 2, 3, 0, 2, 0
-    // CHECK-NEXT: values : ( 1, 3.1, 2, 1, 1, 5, 2, 1, 0.5, 1, 1, 1, 3.5, 5, 0.5, 1.5, 2, 1
+    // CHECK-NEXT: pos[0] : ( 0, 9 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 7, 10, 12, 15, 17, 18 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 3, 3, 0, 1, 2, 2, 3, 1, 2, 3, 0, 2, 0 )
+    // CHECK-NEXT: values : ( 1, 3.1, 2, 1, 1, 5, 2, 1, 0.5, 1, 1, 1, 3.5, 5, 0.5, 1.5, 2, 1 )
     // CHECK-NEXT: ----
     //
     %12 = call @concat_sparse_sparse_perm_dim1(%sm42ccp, %sm43cd, %sm44dc)
@@ -154,11 +154,11 @@ module {
     // CHECK-NEXT: nse = 18
     // CHECK-NEXT: dim = ( 4, 9 )
     // CHECK-NEXT: lvl = ( 4, 9 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 5, 9, 14, 18
-    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 0, 2, 4, 6, 1, 4, 5, 6, 7, 2, 3, 5, 6
-    // CHECK-NEXT: values : ( 1, 1, 1, 1.5, 1, 3.1, 1, 0.5, 3.5, 2, 1, 1, 5, 2, 5, 2, 1, 0.5
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 5, 9, 14, 18 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 0, 2, 4, 6, 1, 4, 5, 6, 7, 2, 3, 5, 6 )
+    // CHECK-NEXT: values : ( 1, 1, 1, 1.5, 1, 3.1, 1, 0.5, 3.5, 2, 1, 1, 5, 2, 5, 2, 1, 0.5 )
     // CHECK-NEXT: ----
     //
     %14 = call @concat_mix_sparse_perm_dim1(%m42, %sm43cdp, %sm44dc)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir
index b2bbc64f1688..d00d4c87f9bd 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir
@@ -108,7 +108,7 @@ module {
     // CHECK-NEXT: nse = 25
     // CHECK-NEXT: dim = ( 5, 5 )
     // CHECK-NEXT: lvl = ( 5, 5 )
-    // CHECK-NEXT: values : ( 2, 0, 0, 2.8, 0, 0, 4, 0, 0, 5, 0, 0, 6, 0, 0, 8.2, 0, 0, 8, 0, 0, 10.4, 0, 0, 10,
+    // CHECK-NEXT: values : ( 2, 0, 0, 2.8, 0, 0, 4, 0, 0, 5, 0, 0, 6, 0, 0, 8.2, 0, 0, 8, 0, 0, 10.4, 0, 0, 10 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?x?xf64, #DenseMatrix>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_bf16.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_bf16.mlir
index ca9df03c69ee..49f182ddb1d4 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_bf16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_bf16.mlir
@@ -95,7 +95,7 @@ module {
     // CHECK-NEXT: nse = 32
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: values : ( 1, 11, 0, 2, 13, 0, 0, 0, 0, 0, 14, 3, 0, 0, 0, 0, 15, 4, 16, 0, 5, 6, 0, 0, 0, 0, 0, 0, 7, 8, 0, 9,
+    // CHECK-NEXT: values : ( 1, 11, 0, 2, 13, 0, 0, 0, 0, 0, 14, 3, 0, 0, 0, 0, 15, 4, 16, 0, 5, 6, 0, 0, 0, 0, 0, 0, 7, 8, 0, 9 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?xbf16, #DenseVector>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_f16.mlir
index 4f5e6ddd48d8..cc2a3733c863 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_f16.mlir
@@ -96,7 +96,7 @@ module {
     // CHECK-NEXT: nse = 32
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: values : ( 1, 11, 0, 2, 13, 0, 0, 0, 0, 0, 14, 3, 0, 0, 0, 0, 15, 4, 16, 0, 5, 6, 0, 0, 0, 0, 0, 0, 7, 8, 0, 9,
+    // CHECK-NEXT: values : ( 1, 11, 0, 2, 13, 0, 0, 0, 0, 0, 14, 3, 0, 0, 0, 0, 15, 4, 16, 0, 5, 6, 0, 0, 0, 0, 0, 0, 7, 8, 0, 9 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?xf16, #DenseVector>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
index c645ca656720..f33a3abc7a5f 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
@@ -161,11 +161,11 @@ module {
     // CHECK-NEXT: nse = 36
     // CHECK-NEXT: dim = ( 6, 6 )
     // CHECK-NEXT: lvl = ( 6, 6 )
-    // CHECK-NEXT: pos[0] : ( 0, 6
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
-    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
-    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: pos[0] : ( 0, 6 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5 )
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %2 : tensor<6x6xi32, #DCSR>
@@ -177,9 +177,9 @@ module {
     // CHECK-NEXT: nse = 36
     // CHECK-NEXT: dim = ( 6, 6 )
     // CHECK-NEXT: lvl = ( 6, 6 )
-    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
-    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %3 : tensor<6x6xi32, #CSR>
@@ -191,9 +191,9 @@ module {
     // CHECK-NEXT: nse = 36
     // CHECK-NEXT: dim = ( 6, 6 )
     // CHECK-NEXT: lvl = ( 6, 6 )
-    // CHECK-NEXT: pos[0] : ( 0, 6
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
-    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: pos[0] : ( 0, 6 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5 )
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %4 : tensor<6x6xi32, #CDR>
@@ -205,9 +205,9 @@ module {
     // CHECK-NEXT: nse = 36
     // CHECK-NEXT: dim = ( 6, 6 )
     // CHECK-NEXT: lvl = ( 6, 6 )
-    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
-    // CHECK-NEXT: values : ( 0, -1, 0, -1, 0, 2, 0, 0, -1, 0, 0, -1, -1, 1, 1, 0, 3, 3, -6, 0, 0, 0, 6, 0, -1, 1, 0, 0, -3, -3, 6, 0, 0, 0, -6, 0
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
+    // CHECK-NEXT: values : ( 0, -1, 0, -1, 0, 2, 0, 0, -1, 0, 0, -1, -1, 1, 1, 0, 3, 3, -6, 0, 0, 0, 6, 0, -1, 1, 0, 0, -3, -3, 6, 0, 0, 0, -6, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %5 : tensor<6x6xi32, #CSC>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_abs.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_abs.mlir
index 4228bcdb1c0d..707c6c34d8dc 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_abs.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_abs.mlir
@@ -120,18 +120,18 @@ module {
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 12,
-    // CHECK-NEXT: crd[0] : ( 0, 3, 5, 11, 13, 17, 18, 20, 21, 28, 29, 31,
-    // CHECK-NEXT: values : ( 1.5, 1.5, 10.2, 11.3, 1, 1, nan, nan, inf, inf, 0, 0,
+    // CHECK-NEXT: pos[0] : ( 0, 12 )
+    // CHECK-NEXT: crd[0] : ( 0, 3, 5, 11, 13, 17, 18, 20, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( 1.5, 1.5, 10.2, 11.3, 1, 1, nan, nan, inf, inf, 0, 0 )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 9,
-    // CHECK-NEXT: crd[0] : ( 0, 3, 5, 11, 13, 17, 18, 21, 31,
-    // CHECK-NEXT: values : ( -2147483648, 2147483647, 1000, 1, 0, 1, 1000, 2147483646, 2147483647,
+    // CHECK-NEXT: pos[0] : ( 0, 9 )
+    // CHECK-NEXT: crd[0] : ( 0, 3, 5, 11, 13, 17, 18, 21, 31 )
+    // CHECK-NEXT: values : ( -2147483648, 2147483647, 1000, 1, 0, 1, 1000, 2147483646, 2147483647 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?xf64, #SparseVector>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_binary.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_binary.mlir
index 36701b4385a2..69be2ee75221 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_binary.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_binary.mlir
@@ -453,131 +453,131 @@ module {
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 9,
-    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9,
+    // CHECK-NEXT: pos[0] : ( 0, 9 )
+    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9 )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 10
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 10,
-    // CHECK-NEXT: crd[0] : ( 1, 3, 4, 10, 16, 18, 21, 28, 29, 31,
-    // CHECK-NEXT: values : ( 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+    // CHECK-NEXT: pos[0] : ( 0, 10 )
+    // CHECK-NEXT: crd[0] : ( 1, 3, 4, 10, 16, 18, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 14
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 14,
-    // CHECK-NEXT: crd[0] : ( 0, 1, 3, 4, 10, 11, 16, 17, 18, 20, 21, 28, 29, 31,
-    // CHECK-NEXT: values : ( 1, 11, 2, 13, 14, 3, 15, 4, 16, 5, 6, 7, 8, 9,
+    // CHECK-NEXT: pos[0] : ( 0, 14 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 3, 4, 10, 11, 16, 17, 18, 20, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( 1, 11, 2, 13, 14, 3, 15, 4, 16, 5, 6, 7, 8, 9 )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 9,
-    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31,
-    // CHECK-NEXT: values : ( 0, 6, 3, 28, 0, 6, 56, 72, 9,
+    // CHECK-NEXT: pos[0] : ( 0, 9 )
+    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( 0, 6, 3, 28, 0, 6, 56, 72, 9 )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 4
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 4,
-    // CHECK-NEXT: crd[0] : ( 0, 11, 17, 20,
-    // CHECK-NEXT: values : ( 1, 3, 4, 5,
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 11, 17, 20 )
+    // CHECK-NEXT: values : ( 1, 3, 4, 5 )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 9,
-    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31,
-    // CHECK-NEXT: values : ( 0, 3, 11, 17, 20, 21, 28, 29, 31,
+    // CHECK-NEXT: pos[0] : ( 0, 9 )
+    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( 0, 3, 11, 17, 20, 21, 28, 29, 31 )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 3,
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2,
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6,
-    // CHECK-NEXT: crd[1] : ( 0, 7, 0, 6, 1, 7,
-    // CHECK-NEXT: values : ( 7, -5, -4, -3, -2, 7,
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 7, 0, 6, 1, 7 )
+    // CHECK-NEXT: values : ( 7, -5, -4, -3, -2, 7 )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 10
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 4,
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3,
-    // CHECK-NEXT: pos[1] : ( 0, 3, 4, 8, 10,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 3, 1, 0, 1, 2, 3, 0, 1,
-    // CHECK-NEXT: values : ( 2, 4, 1, 2.5, 1, 5, 2, 4, 5, 4,
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 4, 8, 10 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 3, 1, 0, 1, 2, 3, 0, 1 )
+    // CHECK-NEXT: values : ( 2, 4, 1, 2.5, 1, 5, 2, 4, 5, 4 )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 10
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 4,
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3,
-    // CHECK-NEXT: pos[1] : ( 0, 3, 4, 8, 10,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 3, 1, 0, 1, 2, 3, 0, 1,
-    // CHECK-NEXT: values : ( 2, 4, 1, 2.5, 1, 5, 2, 4, 5, 4,
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 4, 8, 10 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 3, 1, 0, 1, 2, 3, 0, 1 )
+    // CHECK-NEXT: values : ( 2, 4, 1, 2.5, 1, 5, 2, 4, 5, 4 )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 10
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 4,
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3,
-    // CHECK-NEXT: pos[1] : ( 0, 3, 4, 8, 10,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 3, 1, 0, 1, 2, 3, 0, 1,
-    // CHECK-NEXT: values : ( 2, 4, 1, 2.5, -1, -5, 2, 4, 1, 4,
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 4, 8, 10 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 3, 1, 0, 1, 2, 3, 0, 1 )
+    // CHECK-NEXT: values : ( 2, 4, 1, 2.5, -1, -5, 2, 4, 1, 4 )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 10
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 4,
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3,
-    // CHECK-NEXT: pos[1] : ( 0, 3, 4, 8, 10,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 3, 1, 0, 1, 2, 3, 0, 1,
-    // CHECK-NEXT: values : ( 0, 1, -1, 1, -1, -2, -2, 2, 1, 2,
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 4, 8, 10 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 3, 1, 0, 1, 2, 3, 0, 1 )
+    // CHECK-NEXT: values : ( 0, 1, -1, 1, -1, -2, -2, 2, 1, 2 )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 4
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 3,
-    // CHECK-NEXT: crd[0] : ( 0, 1, 3,
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 4,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 1, 0,
-    // CHECK-NEXT: values : ( 1, 0, 0, 0,
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 4 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 1, 0 )
+    // CHECK-NEXT: values : ( 1, 0, 0, 0 )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 3,
-    // CHECK-NEXT: crd[0] : ( 0, 2, 3,
-    // CHECK-NEXT: pos[1] : ( 0, 1, 5, 6,
-    // CHECK-NEXT: crd[1] : ( 3, 0, 1, 2, 3, 1,
-    // CHECK-NEXT: values : ( -1, -1, -5, -2, 4, 4,
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 5, 6 )
+    // CHECK-NEXT: crd[1] : ( 3, 0, 1, 2, 3, 1 )
+    // CHECK-NEXT: values : ( -1, -1, -5, -2, 4, 4 )
     //
     sparse_tensor.print %sv1 : tensor<?xf64, #SparseVector>
     sparse_tensor.print %sv2 : tensor<?xf64, #SparseVector>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir
index 467b671500e1..ac5f773d6718 100755
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir
@@ -98,11 +98,11 @@ module {
     // CHECK-NEXT: nse = 8
     // CHECK-NEXT: dim = ( 4, 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 3
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2
-    // CHECK-NEXT: crd[1] : ( 0, 2
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2 )
+    // CHECK-NEXT: crd[1] : ( 0, 2 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %a : tensor<4x4x4xi32, #Sparse1>
@@ -116,13 +116,13 @@ module {
     // CHECK-NEXT: nse = 32
     // CHECK-NEXT: dim = ( 4, 4, 4 )
     // CHECK-NEXT: lvl = ( 2, 2, 2, 2, 2, 2 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 1
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1
-    // CHECK-NEXT: pos[2] : ( 0, 1, 2, 3, 4
-    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1
-    // CHECK-NEXT: values : ( 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 5, 0, 0, 0, 6, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 7, 0, 0, 0, 8, 0
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 1 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1 )
+    // CHECK-NEXT: pos[2] : ( 0, 1, 2, 3, 4 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1 )
+    // CHECK-NEXT: values : ( 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 5, 0, 0, 0, 6, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 7, 0, 0, 0, 8, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %b : tensor<4x4x4xi32, #Sparse2>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir
index 732bde55be91..edeffea21171 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir
@@ -132,22 +132,22 @@ module {
     // CHECK-NEXT: nse = 16
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 11
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 9, 11
-    // CHECK-NEXT: crd[1] : ( 1, 2, 3, 0, 1, 0, 1, 2, 3, 0, 1
-    // CHECK-NEXT: values : ( 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 9, 11 )
+    // CHECK-NEXT: crd[1] : ( 1, 2, 3, 0, 1, 0, 1, 2, 3, 0, 1 )
+    // CHECK-NEXT: values : ( 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0 )
     // CHECK-NEXT: ----
     //
     %v = vector.transfer_read %all_dn_out[%c0, %c0], %d0
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_collapse_shape.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_collapse_shape.mlir
index cae599fa30ae..12132155e7cb 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_collapse_shape.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_collapse_shape.mlir
@@ -162,18 +162,18 @@ module {
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 12 )
     // CHECK-NEXT: lvl = ( 12 )
-    // CHECK-NEXT: pos[0] : ( 0, 6
-    // CHECK-NEXT: crd[0] : ( 0, 2, 4, 6, 8, 10
-    // CHECK-NEXT: values : ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3
+    // CHECK-NEXT: pos[0] : ( 0, 6 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4, 6, 8, 10 )
+    // CHECK-NEXT: values : ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 12 )
     // CHECK-NEXT: lvl = ( 12 )
-    // CHECK-NEXT: pos[0] : ( 0, 6
-    // CHECK-NEXT: crd[0] : ( 0, 2, 4, 6, 8, 10
-    // CHECK-NEXT: values : ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3
+    // CHECK-NEXT: pos[0] : ( 0, 6 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4, 6, 8, 10 )
+    // CHECK-NEXT: values : ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
@@ -183,22 +183,22 @@ module {
     // CHECK-NEXT: nse = 15
     // CHECK-NEXT: dim = ( 6, 10 )
     // CHECK-NEXT: lvl = ( 6, 10 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 2, 4
-    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15
-    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8
-    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4 )
+    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8 )
+    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 15
     // CHECK-NEXT: dim = ( 6, 10 )
     // CHECK-NEXT: lvl = ( 6, 10 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 2, 4
-    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15
-    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8
-    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4 )
+    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8 )
+    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
@@ -208,22 +208,22 @@ module {
     // CHECK-NEXT: nse = 15
     // CHECK-NEXT: dim = ( 6, 10 )
     // CHECK-NEXT: lvl = ( 6, 10 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 2, 4
-    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15
-    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8
-    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4 )
+    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8 )
+    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 15
     // CHECK-NEXT: dim = ( 6, 10 )
     // CHECK-NEXT: lvl = ( 6, 10 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 2, 4
-    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15
-    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8
-    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4 )
+    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8 )
+    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49 )
     // CHECK-NEXT: ----
     //
     %v0 = vector.transfer_read %collapse0[%c0], %df: tensor<12xf64>, vector<12xf64>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_complex32.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_complex32.mlir
index 9747da27f9e9..087360f7a1ce 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_complex32.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_complex32.mlir
@@ -104,18 +104,18 @@ module {
     // CHECK-NEXT: nse = 4
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 4,
-    // CHECK-NEXT: crd[0] : ( 0, 1, 28, 31,
-    // CHECK-NEXT: values : ( ( 511.13, 2 ), ( 1, 0 ), ( 5, 4 ), ( 8, 6 ),
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 28, 31 )
+    // CHECK-NEXT: values : ( ( 511.13, 2 ), ( 1, 0 ), ( 5, 4 ), ( 8, 6 ) )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 2
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 2,
-    // CHECK-NEXT: crd[0] : ( 28, 31,
-    // CHECK-NEXT: values : ( ( 6, 8 ), ( 15, 18 ),
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 28, 31 )
+    // CHECK-NEXT: values : ( ( 6, 8 ), ( 15, 18 ) )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?xcomplex<f32>, #SparseVector>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_complex64.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_complex64.mlir
index d4b43eb57676..3f748015c958 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_complex64.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_complex64.mlir
@@ -101,18 +101,18 @@ module {
     // CHECK-NEXT: nse = 4
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 4,
-    // CHECK-NEXT: crd[0] : ( 0, 1, 28, 31,
-    // CHECK-NEXT: values : ( ( 511.13, 2 ), ( 1, 0 ), ( 5, 4 ), ( 8, 6 ),
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 28, 31 )
+    // CHECK-NEXT: values : ( ( 511.13, 2 ), ( 1, 0 ), ( 5, 4 ), ( 8, 6 ) )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 2
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 2,
-    // CHECK-NEXT: crd[0] : ( 28, 31,
-    // CHECK-NEXT: values : ( ( 6, 8 ), ( 15, 18 ),
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 28, 31 )
+    // CHECK-NEXT: values : ( ( 6, 8 ), ( 15, 18 ) )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?xcomplex<f64>, #SparseVector>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_complex_ops.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_complex_ops.mlir
index c4fc8b080787..2326234bc06c 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_complex_ops.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_complex_ops.mlir
@@ -198,63 +198,63 @@ module {
     // CHECK-NEXT: nse = 4
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 4,
-    // CHECK-NEXT: crd[0] : ( 0, 1, 28, 31,
-    // CHECK-NEXT: values : ( ( -5.13, 2 ), ( 1, 0 ), ( 1, 4 ), ( 8, 6 ),
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 28, 31 )
+    // CHECK-NEXT: values : ( ( -5.13, 2 ), ( 1, 0 ), ( 1, 4 ), ( 8, 6 ) )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 3
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 3,
-    // CHECK-NEXT: crd[0] : ( 0, 28, 31,
-    // CHECK-NEXT: values : ( ( 3.43887, 1.47097 ), ( 3.85374, -27.0168 ), ( -193.43, 57.2184 ),
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 28, 31 )
+    // CHECK-NEXT: values : ( ( 3.43887, 1.47097 ), ( 3.85374, -27.0168 ), ( -193.43, 57.2184 ) )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 3
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 3,
-    // CHECK-NEXT: crd[0] : ( 0, 28, 31,
-    // CHECK-NEXT: values : ( ( 0.433635, 2.30609 ), ( 2, 1 ), ( 2.53083, 1.18538 ),
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 28, 31 )
+    // CHECK-NEXT: values : ( ( 0.433635, 2.30609 ), ( 2, 1 ), ( 2.53083, 1.18538 ) )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 3
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 3,
-    // CHECK-NEXT: crd[0] : ( 1, 28, 31,
-    // CHECK-NEXT: values : ( ( 0.761594, 0 ), ( -0.964028, 0 ), ( 0.995055, 0 ),
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 1, 28, 31 )
+    // CHECK-NEXT: values : ( ( 0.761594, 0 ), ( -0.964028, 0 ), ( 0.995055, 0 ) )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 3
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 3,
-    // CHECK-NEXT: crd[0] : ( 0, 28, 31,
-    // CHECK-NEXT: values : ( ( -5.13, 2 ), ( 3, 4 ), ( 5, 6 ),
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 28, 31 )
+    // CHECK-NEXT: values : ( ( -5.13, 2 ), ( 3, 4 ), ( 5, 6 ) )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 3
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 3,
-    // CHECK-NEXT: crd[0] : ( 0, 28, 31,
-    // CHECK-NEXT: values : ( ( -2.565, 1 ), ( 1.5, 2 ), ( 2.5, 3 ),
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 28, 31 )
+    // CHECK-NEXT: values : ( ( -2.565, 1 ), ( 1.5, 2 ), ( 2.5, 3 ) )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 3
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 3,
-    // CHECK-NEXT: crd[0] : ( 0, 28, 31,
-    // CHECK-NEXT: values : ( 5.50608, 5, 7.81025,
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 28, 31 )
+    // CHECK-NEXT: values : ( 5.50608, 5, 7.81025 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?xcomplex<f64>, #SparseVector>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_constant_to_sparse_tensor.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_constant_to_sparse_tensor.mlir
index abdbf80d0bc4..51c13085cf3e 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_constant_to_sparse_tensor.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_constant_to_sparse_tensor.mlir
@@ -56,11 +56,11 @@ module {
     // CHECK-NEXT: nse = 8
     // CHECK-NEXT: dim = ( 10, 8 )
     // CHECK-NEXT: lvl = ( 10, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 6
-    // CHECK-NEXT: crd[0] : ( 0, 1, 4, 5, 6, 9
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 4, 5, 7, 8
-    // CHECK-NEXT: crd[1] : ( 0, 7, 2, 2, 3, 4, 6, 7
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: pos[0] : ( 0, 6 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 4, 5, 6, 9 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 4, 5, 7, 8 )
+    // CHECK-NEXT: crd[1] : ( 0, 7, 2, 2, 3, 4, 6, 7 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %ts : tensor<10x8xf64, #Tensor1>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
index 612e62bd34d2..3e46b6d65112 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
@@ -116,13 +116,13 @@ func.func @main() {
   // CHECK-NEXT: nse = 18
   // CHECK-NEXT: dim = ( 3, 6, 1 )
   // CHECK-NEXT: lvl = ( 3, 6, 1 )
-  // CHECK-NEXT: pos[0] : ( 0, 3
-  // CHECK-NEXT: crd[0] : ( 0, 1, 2
-  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18
-  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
-  // CHECK-NEXT: pos[2] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
-  // CHECK-NEXT: crd[2] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-  // CHECK-NEXT: values : ( 12, 28, 28, 28, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+  // CHECK-NEXT: pos[0] : ( 0, 3 )
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18 )
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
+  // CHECK-NEXT: pos[2] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 )
+  // CHECK-NEXT: crd[2] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+  // CHECK-NEXT: values : ( 12, 28, 28, 28, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12 )
   // CHECK-NEXT: ----
   //
   sparse_tensor.print %CCC_ret : tensor<?x?x?xf32, #CCC>
@@ -132,11 +132,11 @@ func.func @main() {
   // CHECK-NEXT: nse = 18
   // CHECK-NEXT: dim = ( 3, 6, 1 )
   // CHECK-NEXT: lvl = ( 3, 6, 1 )
-  // CHECK-NEXT: pos[0] : ( 0, 3
-  // CHECK-NEXT: crd[0] : ( 0, 1, 2
-  // CHECK-NEXT: pos[2] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
-  // CHECK-NEXT: crd[2] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-  // CHECK-NEXT: values : ( 12, 28, 28, 28, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+  // CHECK-NEXT: pos[0] : ( 0, 3 )
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+  // CHECK-NEXT: pos[2] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 )
+  // CHECK-NEXT: crd[2] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+  // CHECK-NEXT: values : ( 12, 28, 28, 28, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12 )
   // CHECK-NEXT: ----
   //
   sparse_tensor.print %CDC_ret : tensor<?x?x?xf32, #CDC>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
index 55d4caeb7eb3..97e9d1783f67 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
@@ -187,11 +187,11 @@ module {
     // CHECK-NEXT: nse = 36
     // CHECK-NEXT: dim = ( 6, 6 )
     // CHECK-NEXT: lvl = ( 6, 6 )
-    // CHECK-NEXT: pos[0] : ( 0, 6
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
-    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
-    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: pos[0] : ( 0, 6 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5 )
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %1 : tensor<6x6xi32, #DCSR>
@@ -203,11 +203,11 @@ module {
     // CHECK-NEXT: nse = 36
     // CHECK-NEXT: dim = ( 6, 6 )
     // CHECK-NEXT: lvl = ( 6, 6 )
-    // CHECK-NEXT: pos[0] : ( 0, 6
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
-    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
-    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: pos[0] : ( 0, 6 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5 )
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %2 : tensor<6x6xi32, #DCSR>
@@ -219,9 +219,9 @@ module {
     // CHECK-NEXT: nse = 36
     // CHECK-NEXT: dim = ( 6, 6 )
     // CHECK-NEXT: lvl = ( 6, 6 )
-    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
-    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %3 : tensor<6x6xi32, #CSR>
@@ -233,9 +233,9 @@ module {
     // CHECK-NEXT: nse = 36
     // CHECK-NEXT: dim = ( 6, 6 )
     // CHECK-NEXT: lvl = ( 6, 6 )
-    // CHECK-NEXT: pos[0] : ( 0, 6
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
-    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: pos[0] : ( 0, 6 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5 )
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %4 : tensor<6x6xi32, #CDR>
@@ -247,9 +247,9 @@ module {
     // CHECK-NEXT: nse = 36
     // CHECK-NEXT: dim = ( 6, 6 )
     // CHECK-NEXT: lvl = ( 6, 6 )
-    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
-    // CHECK-NEXT: values : ( 0, -1, 0, -1, 0, 2, 0, 0, -1, 0, 0, -1, -1, 1, 1, 0, 3, 3, -6, 0, 0, 0, 6, 0, -1, 1, 0, 0, -3, -3, 6, 0, 0, 0, -6, 0
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
+    // CHECK-NEXT: values : ( 0, -1, 0, -1, 0, 2, 0, 0, -1, 0, 0, -1, -1, 1, 1, 0, 3, 3, -6, 0, 0, 0, 6, 0, -1, 1, 0, 0, -3, -3, 6, 0, 0, 0, -6, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %5 : tensor<6x6xi32, #CSC>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
index d04311e59baf..429175c1a164 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
@@ -147,27 +147,27 @@ func.func @main() {
   // CHECK-NEXT: nse = 108
   // CHECK-NEXT: dim = ( 3, 6, 6, 1 )
   // CHECK-NEXT: lvl = ( 3, 6, 6, 1 )
-  // CHECK-NEXT: pos[0] : ( 0, 3
-  // CHECK-NEXT: crd[0] : ( 0, 1, 2
-  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18
-  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
-  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108
+  // CHECK-NEXT: pos[0] : ( 0, 3 )
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18 )
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108 )
   // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0,
   // CHECK-SAME:            1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
   // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
   // CHECK-SAME:            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
-  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5 )
   // CHECK-NEXT: pos[3] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
   // CHECK-SAME:            21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
   // CHECK-SAME:            40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
   // CHECK-SAME:            59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
   // CHECK-SAME:            78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,
-  // CHECK-SAME:            97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108
+  // CHECK-SAME:            97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108 )
   // CHECK-NEXT: crd[3] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0 )
   // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
@@ -175,7 +175,7 @@ func.func @main() {
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
-  // CHECK-SAME:            108, 108, 108
+  // CHECK-SAME:            108, 108, 108 )
   // CHECK-NEXT: ----
   //
   sparse_tensor.print %CCCC_ret : tensor<?x?x?x?xf32, #CCCC>
@@ -185,14 +185,14 @@ func.func @main() {
   // CHECK-NEXT: nse = 108
   // CHECK-NEXT: dim = ( 3, 6, 6, 1 )
   // CHECK-NEXT: lvl = ( 3, 6, 6, 1 )
-  // CHECK-NEXT: pos[0] : ( 0, 3
-  // CHECK-NEXT: crd[0] : ( 0, 1, 2
-  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108
+  // CHECK-NEXT: pos[0] : ( 0, 3 )
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108 )
   // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0,
   // CHECK-SAME:            1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
   // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
   // CHECK-SAME:            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
-  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5 )
   // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
@@ -200,7 +200,7 @@ func.func @main() {
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
-  // CHECK-SAME:            108, 108, 108
+  // CHECK-SAME:            108, 108, 108 )
   // CHECK-NEXT: ----
   //
   sparse_tensor.print %CDCD_ret : tensor<?x?x?x?xf32, #CDCD>
@@ -210,14 +210,14 @@ func.func @main() {
   // CHECK-NEXT: nse = 108
   // CHECK-NEXT: dim = ( 3, 6, 6, 1 )
   // CHECK-NEXT: lvl = ( 3, 6, 6, 1 )
-  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18
-  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
-  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18 )
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108 )
   // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0,
   // CHECK-SAME:            1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
   // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
   // CHECK-SAME:            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
-  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5 )
   // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
@@ -225,7 +225,7 @@ func.func @main() {
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
-  // CHECK-SAME:            108, 108, 108
+  // CHECK-SAME:            108, 108, 108 )
   // CHECK-NEXT: ----
   //
   sparse_tensor.print %DCCD_ret : tensor<?x?x?x?xf32, #DCCD>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
index 5e2d1707a249..b23b2dcc173d 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
@@ -171,14 +171,14 @@ func.func @main() {
   // CHECK-NEXT: nse = 216
   // CHECK-NEXT: dim = ( 6, 6, 6 )
   // CHECK-NEXT: lvl = ( 6, 6, 6 )
-  // CHECK-NEXT: pos[0] : ( 0, 6
-  // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
-  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+  // CHECK-NEXT: pos[0] : ( 0, 6 )
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5 )
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36 )
   // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
-  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
   // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78,
   // CHECK-SAME:            84, 90, 96, 102, 108, 114, 120, 126, 132, 138, 144, 150,
-  // CHECK-SAME:            156, 162, 168, 174, 180, 186, 192, 198, 204, 210, 216
+  // CHECK-SAME:            156, 162, 168, 174, 180, 186, 192, 198, 204, 210, 216 )
   // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0,
   // CHECK-SAME:            1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
   // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
@@ -190,7 +190,7 @@ func.func @main() {
   // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
   // CHECK-SAME:            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
   // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4,
-  // CHECK-SAME:            5, 0, 1, 2, 3, 4, 5
+  // CHECK-SAME:            5, 0, 1, 2, 3, 4, 5 )
   // CHECK-NEXT: values : ( 108, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
   // CHECK-SAME:            124, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
@@ -208,7 +208,7 @@ func.func @main() {
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
-  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108 )
   // CHECK-NEXT: ----
   //
   sparse_tensor.print %CCC_ret : tensor<?x?x?xf32, #CCC>
@@ -218,11 +218,11 @@ func.func @main() {
   // CHECK-NEXT: nse = 216
   // CHECK-NEXT: dim = ( 6, 6, 6 )
   // CHECK-NEXT: lvl = ( 6, 6, 6 )
-  // CHECK-NEXT: pos[0] : ( 0, 6
-  // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[0] : ( 0, 6 )
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5 )
   // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84,
   // CHECK-SAME:            90, 96, 102, 108, 114, 120, 126, 132, 138, 144, 150, 156,
-  // CHECK-SAME:            162, 168, 174, 180, 186, 192, 198, 204, 210, 216
+  // CHECK-SAME:            162, 168, 174, 180, 186, 192, 198, 204, 210, 216 )
   // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
   // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
   // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
@@ -233,7 +233,7 @@ func.func @main() {
   // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
   // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
   // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
-  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
   // CHECK-NEXT: values : ( 108, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
   // CHECK-SAME:            124, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
@@ -251,7 +251,7 @@ func.func @main() {
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
-  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108 )
   // CHECK-NEXT: ----
   //
   sparse_tensor.print %CDC_ret : tensor<?x?x?xf32, #CDC>
@@ -263,7 +263,7 @@ func.func @main() {
   // CHECK-NEXT: lvl = ( 6, 6, 6 )
   // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90,
   // CHECK-SAME:            96, 102, 108, 114, 120, 126, 132, 138, 144, 150, 156, 162,
-  // CHECK-SAME:            168, 174, 180, 186, 192, 198, 204, 210, 216
+  // CHECK-SAME:            168, 174, 180, 186, 192, 198, 204, 210, 216 )
   // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
   // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
   // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
@@ -274,7 +274,7 @@ func.func @main() {
   // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
   // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
   // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
-  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
   // CHECK-NEXT: values : ( 108, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
   // CHECK-SAME:            124, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
@@ -292,7 +292,7 @@ func.func @main() {
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
-  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108 )
   // CHECK-NEXT: ----
   //
   sparse_tensor.print %DDC_ret : tensor<?x?x?xf32, #DDC>
@@ -302,12 +302,12 @@ func.func @main() {
   // CHECK-NEXT: nse = 216
   // CHECK-NEXT: dim = ( 6, 6, 6 )
   // CHECK-NEXT: lvl = ( 6, 6, 6 )
-  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36 )
   // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
-  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
   // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90,
   // CHECK-SAME:            96, 102, 108, 114, 120, 126, 132, 138, 144, 150, 156, 162,
-  // CHECK-SAME:            168, 174, 180, 186, 192, 198, 204, 210, 216
+  // CHECK-SAME:            168, 174, 180, 186, 192, 198, 204, 210, 216 )
   // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
   // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
   // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
@@ -318,7 +318,7 @@ func.func @main() {
   // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
   // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
   // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
-  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
   // CHECK-NEXT: values : ( 108, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
   // CHECK-SAME:            124, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
@@ -336,7 +336,7 @@ func.func @main() {
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
-  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108 )
   // CHECK-NEXT: ----
   //
   sparse_tensor.print %DCC_ret : tensor<?x?x?xf32, #DCC>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
index f68e429a3c82..8fb6704c7f50 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
@@ -155,16 +155,16 @@ func.func @main() {
   // CHECK-NEXT: nse = 216
   // CHECK-NEXT: dim = ( 1, 6, 6, 6, 1 )
   // CHECK-NEXT: lvl = ( 1, 6, 6, 6, 1 )
-  // CHECK-NEXT: pos[0] : ( 0, 1
-  // CHECK-NEXT: crd[0] : ( 0
-  // CHECK-NEXT: pos[1] : ( 0, 6
-  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5
-  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36
+  // CHECK-NEXT: pos[0] : ( 0, 1 )
+  // CHECK-NEXT: crd[0] : ( 0 )
+  // CHECK-NEXT: pos[1] : ( 0, 6 )
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5 )
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36 )
   // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
-  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
   // CHECK-NEXT: pos[3] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96,
   // CHECK-SAME:            102, 108, 114, 120, 126, 132, 138, 144, 150, 156, 162, 168, 174,
-  // CHECK-SAME:            180, 186, 192, 198, 204, 210, 216
+  // CHECK-SAME:            180, 186, 192, 198, 204, 210, 216 )
   // CHECK-NEXT: crd[3] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
   // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
   // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
@@ -174,7 +174,7 @@ func.func @main() {
   // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
   // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
   // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
-  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
   // CHECK-NEXT: pos[4] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
   // CHECK-SAME:            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
   // CHECK-SAME:            36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
@@ -189,7 +189,7 @@ func.func @main() {
   // CHECK-SAME:            173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
   // CHECK-SAME:            187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200,
   // CHECK-SAME:            201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214,
-  // CHECK-SAME:            215, 216
+  // CHECK-SAME:            215, 216 )
   // CHECK-NEXT: crd[4] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -199,7 +199,7 @@ func.func @main() {
   // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0 )
   // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
@@ -215,7 +215,7 @@ func.func @main() {
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
-  // CHECK-SAME:            108, 108, 108, 108, 108, 108
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108 )
   // CHECK-NEXT: ----
   //
   sparse_tensor.print %CCCCC_ret : tensor<?x?x?x?x?xf32, #CCCCC>
@@ -229,11 +229,11 @@ func.func @main() {
   // CHECK-NEXT: nse = 216
   // CHECK-NEXT: dim = ( 1, 6, 6, 6, 1 )
   // CHECK-NEXT: lvl = ( 1, 6, 6, 6, 1 )
-  // CHECK-NEXT: pos[0] : ( 0, 1
-  // CHECK-NEXT: crd[0] : ( 0
-  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36
+  // CHECK-NEXT: pos[0] : ( 0, 1 )
+  // CHECK-NEXT: crd[0] : ( 0 )
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36 )
   // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
-  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
   // CHECK-NEXT: pos[4] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
   // CHECK-SAME:            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
   // CHECK-SAME:            36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
@@ -248,7 +248,7 @@ func.func @main() {
   // CHECK-SAME:            173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
   // CHECK-SAME:            187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200,
   // CHECK-SAME:            201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214,
-  // CHECK-SAME:            215, 216
+  // CHECK-SAME:            215, 216 )
   // CHECK-NEXT: crd[4] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -258,7 +258,7 @@ func.func @main() {
   // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0 )
   // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
@@ -274,7 +274,7 @@ func.func @main() {
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
   // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
-  // CHECK-SAME:            108, 108, 108, 108, 108, 108
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108 )
   // CHECK-NEXT: ----
   //
   sparse_tensor.print %CDCDC_ret : tensor<?x?x?x?x?xf32, #CDCDC>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir
index 8024c1281895..5de3aa0a2e97 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir
@@ -98,156 +98,156 @@ module {
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 2, 3, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 1
-    // CHECK-NEXT: pos[1] : ( 0, 3, 6
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2
-    // CHECK-NEXT: pos[2] : ( 0, 4, 8, 12, 16, 20, 24
-    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 1 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2 )
+    // CHECK-NEXT: pos[2] : ( 0, 4, 8, 12, 16, 20, 24 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 3, 4, 2 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24
-    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: values : ( 1, 13, 2, 14, 3, 15, 4, 16, 5, 17, 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23, 12, 24
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: values : ( 1, 13, 2, 14, 3, 15, 4, 16, 5, 17, 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23, 12, 24 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 4, 2, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6, 8
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: pos[2] : ( 0, 3, 6, 9, 12, 15, 18, 21, 24
-    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2
-    // CHECK-NEXT: values : ( 1, 5, 9, 13, 17, 21, 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23, 4, 8, 12, 16, 20, 24
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6, 8 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: pos[2] : ( 0, 3, 6, 9, 12, 15, 18, 21, 24 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2 )
+    // CHECK-NEXT: values : ( 1, 5, 9, 13, 17, 21, 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23, 4, 8, 12, 16, 20, 24 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 2, 3, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 1
-    // CHECK-NEXT: pos[1] : ( 0, 3, 6
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2
-    // CHECK-NEXT: pos[2] : ( 0, 4, 8, 12, 16, 20, 24
-    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 1 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2 )
+    // CHECK-NEXT: pos[2] : ( 0, 4, 8, 12, 16, 20, 24 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 2, 3, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 1
-    // CHECK-NEXT: pos[1] : ( 0, 3, 6
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2
-    // CHECK-NEXT: pos[2] : ( 0, 4, 8, 12, 16, 20, 24
-    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 1 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2 )
+    // CHECK-NEXT: pos[2] : ( 0, 4, 8, 12, 16, 20, 24 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 2, 3, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 1
-    // CHECK-NEXT: pos[1] : ( 0, 3, 6
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2
-    // CHECK-NEXT: pos[2] : ( 0, 4, 8, 12, 16, 20, 24
-    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 1 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2 )
+    // CHECK-NEXT: pos[2] : ( 0, 4, 8, 12, 16, 20, 24 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 3, 4, 2 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24
-    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: values : ( 1, 13, 2, 14, 3, 15, 4, 16, 5, 17, 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23, 12, 24
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: values : ( 1, 13, 2, 14, 3, 15, 4, 16, 5, 17, 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23, 12, 24 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 3, 4, 2 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24
-    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: values : ( 1, 13, 2, 14, 3, 15, 4, 16, 5, 17, 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23, 12, 24
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: values : ( 1, 13, 2, 14, 3, 15, 4, 16, 5, 17, 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23, 12, 24 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 3, 4, 2 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24
-    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: values : ( 1, 13, 2, 14, 3, 15, 4, 16, 5, 17, 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23, 12, 24
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: values : ( 1, 13, 2, 14, 3, 15, 4, 16, 5, 17, 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23, 12, 24 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 4, 2, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6, 8
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: pos[2] : ( 0, 3, 6, 9, 12, 15, 18, 21, 24
-    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2
-    // CHECK-NEXT: values : ( 1, 5, 9, 13, 17, 21, 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23, 4, 8, 12, 16, 20, 24
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6, 8 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: pos[2] : ( 0, 3, 6, 9, 12, 15, 18, 21, 24 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2 )
+    // CHECK-NEXT: values : ( 1, 5, 9, 13, 17, 21, 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23, 4, 8, 12, 16, 20, 24 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 4, 2, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6, 8
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: pos[2] : ( 0, 3, 6, 9, 12, 15, 18, 21, 24
-    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2
-    // CHECK-NEXT: values : ( 1, 5, 9, 13, 17, 21, 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23, 4, 8, 12, 16, 20, 24
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6, 8 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: pos[2] : ( 0, 3, 6, 9, 12, 15, 18, 21, 24 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2 )
+    // CHECK-NEXT: values : ( 1, 5, 9, 13, 17, 21, 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23, 4, 8, 12, 16, 20, 24 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 4, 2, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6, 8
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: pos[2] : ( 0, 3, 6, 9, 12, 15, 18, 21, 24
-    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2
-    // CHECK-NEXT: values : ( 1, 5, 9, 13, 17, 21, 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23, 4, 8, 12, 16, 20, 24
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6, 8 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: pos[2] : ( 0, 3, 6, 9, 12, 15, 18, 21, 24 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2 )
+    // CHECK-NEXT: values : ( 1, 5, 9, 13, 17, 21, 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23, 4, 8, 12, 16, 20, 24 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %1 : tensor<2x3x4xf64, #Tensor1>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_block.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_block.mlir
index ff22283f43a7..66215a340a0b 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_block.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_block.mlir
@@ -82,36 +82,36 @@ module {
     // CHECK-NEXT: nse = 8
     // CHECK-NEXT: dim = ( 2, 4 )
     // CHECK-NEXT: lvl = ( 1, 2, 2, 2 )
-    // CHECK-NEXT: pos[1] : ( 0, 2
-    // CHECK-NEXT: crd[1] : ( 0, 1
-    // CHECK-NEXT: values : ( 1, 2, 5, 6, 3, 4, 7, 8
+    // CHECK-NEXT: pos[1] : ( 0, 2 )
+    // CHECK-NEXT: crd[1] : ( 0, 1 )
+    // CHECK-NEXT: values : ( 1, 2, 5, 6, 3, 4, 7, 8 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 8
     // CHECK-NEXT: dim = ( 2, 4 )
     // CHECK-NEXT: lvl = ( 1, 2, 2, 2 )
-    // CHECK-NEXT: pos[1] : ( 0, 2
-    // CHECK-NEXT: crd[1] : ( 0, 1
-    // CHECK-NEXT: values : ( 1, 2, 5, 6, 3, 4, 7, 8
+    // CHECK-NEXT: pos[1] : ( 0, 2 )
+    // CHECK-NEXT: crd[1] : ( 0, 1 )
+    // CHECK-NEXT: values : ( 1, 2, 5, 6, 3, 4, 7, 8 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 8
     // CHECK-NEXT: dim = ( 2, 4 )
     // CHECK-NEXT: lvl = ( 2, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 8
     // CHECK-NEXT: dim = ( 2, 4 )
     // CHECK-NEXT: lvl = ( 4, 2 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6, 8
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: values : ( 1, 5, 2, 6, 3, 7, 4, 8
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6, 8 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: values : ( 1, 5, 2, 6, 3, 7, 4, 8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %1 : tensor<2x4xf64, #BSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_dyn.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_dyn.mlir
index 11baf65e6350..0d9722cd37e6 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_dyn.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_dyn.mlir
@@ -67,66 +67,66 @@ module {
     // CHECK-NEXT: nse = 7
     // CHECK-NEXT: dim = ( 32, 64 )
     // CHECK-NEXT: lvl = ( 32, 64 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 31
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7
-    // CHECK-NEXT: crd[1] : ( 0, 1, 63, 0, 1, 0, 63
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 31 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 63, 0, 1, 0, 63 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 7
     // CHECK-NEXT: dim = ( 32, 64 )
     // CHECK-NEXT: lvl = ( 64, 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 63
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7
-    // CHECK-NEXT: crd[1] : ( 0, 1, 31, 0, 1, 0, 31
-    // CHECK-NEXT: values : ( 1, 4, 6, 2, 5, 3, 7
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 63 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 31, 0, 1, 0, 31 )
+    // CHECK-NEXT: values : ( 1, 4, 6, 2, 5, 3, 7 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 7
     // CHECK-NEXT: dim = ( 32, 64 )
     // CHECK-NEXT: lvl = ( 32, 64 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 31
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7
-    // CHECK-NEXT: crd[1] : ( 0, 1, 63, 0, 1, 0, 63
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 31 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 63, 0, 1, 0, 63 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 7
     // CHECK-NEXT: dim = ( 32, 64 )
     // CHECK-NEXT: lvl = ( 64, 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 63
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7
-    // CHECK-NEXT: crd[1] : ( 0, 1, 31, 0, 1, 0, 31
-    // CHECK-NEXT: values : ( 1, 4, 6, 2, 5, 3, 7
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 63 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 31, 0, 1, 0, 31 )
+    // CHECK-NEXT: values : ( 1, 4, 6, 2, 5, 3, 7 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 7
     // CHECK-NEXT: dim = ( 32, 64 )
     // CHECK-NEXT: lvl = ( 64, 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 63
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7
-    // CHECK-NEXT: crd[1] : ( 0, 1, 31, 0, 1, 0, 31
-    // CHECK-NEXT: values : ( 1, 4, 6, 2, 5, 3, 7
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 63 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 31, 0, 1, 0, 31 )
+    // CHECK-NEXT: values : ( 1, 4, 6, 2, 5, 3, 7 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 7
     // CHECK-NEXT: dim = ( 32, 64 )
     // CHECK-NEXT: lvl = ( 32, 64 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 31
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7
-    // CHECK-NEXT: crd[1] : ( 0, 1, 63, 0, 1, 0, 63
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 31 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 63, 0, 1, 0, 63 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %1 : tensor<?x?xf64, #DCSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_ptr.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_ptr.mlir
index 6005aa6cfeae..531efb4f7f37 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_ptr.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_ptr.mlir
@@ -78,64 +78,64 @@ module {
     // CHECK-NEXT: nse = 7
     // CHECK-NEXT: dim = ( 32, 64 )
     // CHECK-NEXT: lvl = ( 32, 64 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 31
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7
-    // CHECK-NEXT: crd[1] : ( 0, 1, 63, 0, 1, 0, 63
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 31 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 63, 0, 1, 0, 63 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 7
     // CHECK-NEXT: dim = ( 32, 64 )
     // CHECK-NEXT: lvl = ( 64, 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 63
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7
-    // CHECK-NEXT: crd[1] : ( 0, 1, 31, 0, 1, 0, 31
-    // CHECK-NEXT: values : ( 1, 4, 6, 2, 5, 3, 7
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 63 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 31, 0, 1, 0, 31 )
+    // CHECK-NEXT: values : ( 1, 4, 6, 2, 5, 3, 7 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 7
     // CHECK-NEXT: dim = ( 32, 64 )
     // CHECK-NEXT: lvl = ( 64, 32 )
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 7
-    // CHECK-NEXT: crd[1] : ( 0, 1, 31, 0, 1, 0, 31
-    // CHECK-NEXT: values : ( 1, 4, 6, 2, 5, 3, 7
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 7 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 31, 0, 1, 0, 31 )
+    // CHECK-NEXT: values : ( 1, 4, 6, 2, 5, 3, 7 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 7
     // CHECK-NEXT: dim = ( 32, 64 )
     // CHECK-NEXT: lvl = ( 64, 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 63
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7
-    // CHECK-NEXT: crd[1] : ( 0, 1, 31, 0, 1, 0, 31
-    // CHECK-NEXT: values : ( 1, 4, 6, 2, 5, 3, 7
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 63 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 31, 0, 1, 0, 31 )
+    // CHECK-NEXT: values : ( 1, 4, 6, 2, 5, 3, 7 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 7
     // CHECK-NEXT: dim = ( 32, 64 )
     // CHECK-NEXT: lvl = ( 32, 64 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 31
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7
-    // CHECK-NEXT: crd[1] : ( 0, 1, 63, 0, 1, 0, 63
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 31 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 63, 0, 1, 0, 63 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 7
     // CHECK-NEXT: dim = ( 32, 64 )
     // CHECK-NEXT: lvl = ( 32, 64 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 31
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7
-    // CHECK-NEXT: crd[1] : ( 0, 1, 63, 0, 1, 0, 63
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 31 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 63, 0, 1, 0, 63 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %1 : tensor<32x64xf64, #DCSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir
index 16813e0aa707..c16ae0de1820 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir
@@ -209,21 +209,21 @@ module {
     // CHECK-NEXT: nse = 64
     // CHECK-NEXT: dim = ( 8, 8 )
     // CHECK-NEXT: lvl = ( 8, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 64
+    // CHECK-NEXT: pos[0] : ( 0, 64 )
     // CHECK-NEXT: crd[0] : ( 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
     // CHECK-SAME:            2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
     // CHECK-SAME:            5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7,
-    // CHECK-SAME:            7, 7, 7, 7
+    // CHECK-SAME:            7, 7, 7, 7 )
     // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3,
     // CHECK-SAME:            4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
     // CHECK-SAME:            0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3,
-    // CHECK-SAME:            4, 5, 6, 7
+    // CHECK-SAME:            4, 5, 6, 7 )
     // CHECK-NEXT: values : ( 8.8, 4.8, 6.8, 4.8, 8.8, 6.1, 14.8, 16.8, 4.4, 4.4, 4.4, 8.4,
     // CHECK-SAME:            8.4, 12.4, 16.4, 16.4, 8.8, 4.8, 6.8, 8.8, 8.8, 12.8, 14.8,
     // CHECK-SAME:            15.8, 4.3, 5.3, 6.3, 8.3, 8.3, 12.3, 14.3, 16.3, 4.5, 4.5,
     // CHECK-SAME:            6.5, 8.5, 8.5, 12.5, 14.5, 16.5, 9.9, 4.9, 6.9, 8.9, 8.9,
     // CHECK-SAME:            12.9, 15.9, 16.9, 12.1, 6.1, 5.1, 9.1, 9.1, 13.1, 15.1, 17.1,
-    // CHECK-SAME:            15.4, 5.4, 7.4, 5.4, 11.4, 10.4, 11.4, 9.4
+    // CHECK-SAME:            15.4, 5.4, 7.4, 5.4, 11.4, 10.4, 11.4, 9.4 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %COO_RET : tensor<8x8xf32, #SortedCOOSoA>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_dot.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_dot.mlir
index 5451f2d957ad..b41fda19459e 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_dot.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_dot.mlir
@@ -67,18 +67,18 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 1024 )
     // CHECK-NEXT: lvl = ( 1024 )
-    // CHECK-NEXT: pos[0] : ( 0, 5
-    // CHECK-NEXT: crd[0] : ( 0, 1, 22, 23, 1022
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5
+    // CHECK-NEXT: pos[0] : ( 0, 5 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 22, 23, 1022 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 3
     // CHECK-NEXT: dim = ( 1024 )
     // CHECK-NEXT: lvl = ( 1024 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 22, 1022, 1023
-    // CHECK-NEXT: values : ( 6, 7, 8
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 22, 1022, 1023 )
+    // CHECK-NEXT: values : ( 6, 7, 8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %s1 : tensor<1024xf32, #SparseVector>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir
index 37d8a42a2990..17fab93b9b21 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir
@@ -79,9 +79,9 @@ module {
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 3, 8 )
     // CHECK-NEXT: lvl = ( 3, 8 )
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12,
-    // CHECK-NEXT: crd[1] : ( 2, 3, 5, 7, 1, 2, 4, 7, 0, 2, 4, 5,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12 )
+    // CHECK-NEXT: crd[1] : ( 2, 3, 5, 7, 1, 2, 4, 7, 0, 2, 4, 5 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %A1 : tensor<?x?xf64, #CSR>
@@ -93,9 +93,9 @@ module {
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 3, 8 )
     // CHECK-NEXT: lvl = ( 3, 8 )
-    // CHECK-NEXT: pos[1] : ( 0, 4, 4, 8, 8, 12,
-    // CHECK-NEXT: crd[1] : ( 2, 3, 5, 7, 1, 2, 4, 7, 0, 2, 4, 5,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+    // CHECK-NEXT: pos[1] : ( 0, 4, 4, 8, 8, 12, {{.*}} )
+    // CHECK-NEXT: crd[1] : ( 2, 3, 5, 7, 1, 2, 4, 7, 0, 2, 4, 5 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %A2 : tensor<?x?xf64, #CSR_hi>
@@ -107,8 +107,8 @@ module {
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 3, 8 )
     // CHECK-NEXT: lvl = ( 3, 2, 4 )
-    // CHECK-NEXT: crd[2] : ( 2, 3, 1, 3, 1, 2, 0, 3, 0, 2, 0, 1,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+    // CHECK-NEXT: crd[2] : ( 2, 3, 1, 3, 1, 2, 0, 3, 0, 2, 0, 1 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 )
     // CHECK-NEXT: ----
     // CHECK-NEXT: ---- Sparse Tensor ----
     //
@@ -120,8 +120,8 @@ module {
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 3, 8 )
     // CHECK-NEXT: lvl = ( 3, 1, 8 )
-    // CHECK-NEXT: crd[2] : ( 2, 3, 5, 7, 1, 2, 4, 7, 0, 2, 4, 5,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+    // CHECK-NEXT: crd[2] : ( 2, 3, 5, 7, 1, 2, 4, 7, 0, 2, 4, 5 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %A4 : tensor<?x?xf64, #NV_58>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_empty.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_empty.mlir
index bcd71f7bd674..7255649ccd42 100755
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_empty.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_empty.mlir
@@ -98,36 +98,36 @@ module {
     // CHECK-NEXT: nse = 0
     // CHECK-NEXT: dim = ( 10 )
     // CHECK-NEXT: lvl = ( 10 )
-    // CHECK-NEXT: pos[0] : ( 0, 0,
-    // CHECK-NEXT: crd[0] : (
-    // CHECK-NEXT: values : (
+    // CHECK-NEXT: pos[0] : ( 0, 0 )
+    // CHECK-NEXT: crd[0] : ( )
+    // CHECK-NEXT: values : ( )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 0
     // CHECK-NEXT: dim = ( 10 )
     // CHECK-NEXT: lvl = ( 10 )
-    // CHECK-NEXT: pos[0] : ( 0, 0,
-    // CHECK-NEXT: crd[0] : (
-    // CHECK-NEXT: values : (
+    // CHECK-NEXT: pos[0] : ( 0, 0 )
+    // CHECK-NEXT: crd[0] : ( )
+    // CHECK-NEXT: values : ( )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 0
     // CHECK-NEXT: dim = ( 10 )
     // CHECK-NEXT: lvl = ( 10 )
-    // CHECK-NEXT: pos[0] : ( 0, 0,
-    // CHECK-NEXT: crd[0] : (
-    // CHECK-NEXT: values : (
+    // CHECK-NEXT: pos[0] : ( 0, 0 )
+    // CHECK-NEXT: crd[0] : ( )
+    // CHECK-NEXT: values : ( )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 10
     // CHECK-NEXT: dim = ( 10 )
     // CHECK-NEXT: lvl = ( 10 )
-    // CHECK-NEXT: pos[0] : ( 0, 10,
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
-    // CHECK-NEXT: values : ( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    // CHECK-NEXT: pos[0] : ( 0, 10 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 )
+    // CHECK-NEXT: values : ( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<10xf32, #SV>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir
index 451195b2185b..6e875de7481e 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir
@@ -86,13 +86,13 @@ module {
     // CHECK-NEXT: nse = 32
     // CHECK-NEXT: dim = ( 8, 4 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[1] : ( 0, 8, 16, 24, 32
+    // CHECK-NEXT: pos[1] : ( 0, 8, 16, 24, 32 )
     // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0,
-    // CHECK-SAME:            1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+    // CHECK-SAME:            1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 )
     // CHECK-NEXT: values : ( 32.53, 34.56, 36.59, 38.62, 40.65, 42.68, 44.71, 46.74,
     // CHECK-SAME:            35.73, 37.96, 40.19, 42.42, 44.65, 46.88, 49.11, 51.34,
     // CHECK-SAME:            38.93, 41.36, 43.79, 46.22, 48.65, 51.08, 53.51, 55.94,
-    // CHECK-SAME:            42.13, 44.76, 47.39, 50.02, 52.65, 55.28, 57.91, 60.54
+    // CHECK-SAME:            42.13, 44.76, 47.39, 50.02, 52.65, 55.28, 57.91, 60.54 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %x3 : tensor<8x4xf64, #CSC>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand_shape.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand_shape.mlir
index 393242484576..5e021596efea 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand_shape.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand_shape.mlir
@@ -200,74 +200,74 @@ module {
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 3, 4 )
     // CHECK-NEXT: lvl = ( 3, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6
-    // CHECK-NEXT: crd[1] : ( 0, 2, 0, 2, 0, 2
-    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 11
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 0, 2, 0, 2 )
+    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 11 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 3, 4 )
     // CHECK-NEXT: lvl = ( 3, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6
-    // CHECK-NEXT: crd[1] : ( 0, 2, 0, 2, 0, 2
-    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 11
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 0, 2, 0, 2 )
+    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 11 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 3, 2, 2 )
     // CHECK-NEXT: lvl = ( 3, 2, 2 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12
-    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: values : ( 1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: values : ( 1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 3, 2, 2 )
     // CHECK-NEXT: lvl = ( 3, 2, 2 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12
-    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: values : ( 1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: values : ( 1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 3, 2, 2 )
     // CHECK-NEXT: lvl = ( 3, 2, 2 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12
-    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: values : ( 1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: values : ( 1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 3, 2, 2 )
     // CHECK-NEXT: lvl = ( 3, 2, 2 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12
-    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-    // CHECK-NEXT: values : ( 1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 6, 8, 10, 12 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 )
+    // CHECK-NEXT: values : ( 1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %expand2 : tensor<3x4xf64, #SparseMatrix>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir
index 37ff2e3ffd3f..93b8eda2c2ae 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir
@@ -106,13 +106,13 @@ module {
     // CHECK-NEXT: nse = 36
     // CHECK-NEXT: dim = ( 6, 6 )
     // CHECK-NEXT: lvl = ( 6, 6 )
-    // CHECK-NEXT: pos[0] : ( 0, 6
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
-    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: pos[0] : ( 0, 6 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5 )
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36 )
     // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4,
-    // CHECK-SAME:            5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-SAME:            5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5 )
     // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1,
-    // CHECK-SAME:            0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-SAME:            0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %1 : tensor<6x6xi32, #DCSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_index.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_index.mlir
index 3ce45e5fd971..005398445828 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_index.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_index.mlir
@@ -212,80 +212,80 @@ module {
     // CHECK-NEXT: nse = 2
     // CHECK-NEXT: dim = ( 8 )
     // CHECK-NEXT: lvl = ( 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 2, 4
-    // CHECK-NEXT: values : ( 20, 80
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 2, 4 )
+    // CHECK-NEXT: values : ( 20, 80 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 8
     // CHECK-NEXT: dim = ( 8 )
     // CHECK-NEXT: lvl = ( 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 8
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7
-    // CHECK-NEXT: values : ( 0, 1, 12, 3, 24, 5, 6, 7
+    // CHECK-NEXT: pos[0] : ( 0, 8 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7 )
+    // CHECK-NEXT: values : ( 0, 1, 12, 3, 24, 5, 6, 7 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 8
     // CHECK-NEXT: dim = ( 8 )
     // CHECK-NEXT: lvl = ( 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 8
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7
-    // CHECK-NEXT: values : ( 0, 2, 8, 24, 64, 160, 384, 896
+    // CHECK-NEXT: pos[0] : ( 0, 8 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7 )
+    // CHECK-NEXT: values : ( 0, 2, 8, 24, 64, 160, 384, 896 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 8
     // CHECK-NEXT: dim = ( 8 )
     // CHECK-NEXT: lvl = ( 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 8
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7
-    // CHECK-NEXT: values : ( 1, 3, 6, 11, 20, 37, 70, 135
+    // CHECK-NEXT: pos[0] : ( 0, 8 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7 )
+    // CHECK-NEXT: values : ( 1, 3, 6, 11, 20, 37, 70, 135 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 2
     // CHECK-NEXT: dim = ( 3, 4 )
     // CHECK-NEXT: lvl = ( 3, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2
-    // CHECK-NEXT: crd[1] : ( 1, 3
-    // CHECK-NEXT: values : ( 10, 120
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2 )
+    // CHECK-NEXT: crd[1] : ( 1, 3 )
+    // CHECK-NEXT: values : ( 10, 120 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 3, 4 )
     // CHECK-NEXT: lvl = ( 3, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 0, 1, 2, 3, 1, 12, 3, 4, 2, 3, 4, 25
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 0, 1, 2, 3, 1, 12, 3, 4, 2, 3, 4, 25 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 3, 4 )
     // CHECK-NEXT: lvl = ( 3, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 0, 0, 0, 0, 0, 2, 2, 3, 0, 2, 12, 24
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 0, 0, 0, 0, 0, 2, 2, 3, 0, 2, 12, 24 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 3, 4 )
     // CHECK-NEXT: lvl = ( 3, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 2, 4, 4, 5, 3, 4, 7, 9
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 2, 4, 4, 5, 3, 4, 7, 9 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<8xi64, #SparseVector>
@@ -304,11 +304,11 @@ module {
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 2, 3 )
     // CHECK-NEXT: lvl = ( 2, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 1
-    // CHECK-NEXT: pos[1] : ( 0, 3, 6
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2
-    // CHECK-NEXT: values : ( 0, 10, 0, 1, 1, 42
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 1 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2 )
+    // CHECK-NEXT: values : ( 0, 10, 0, 1, 1, 42 )
     // CHECK-NEXT: ----
     //
     %100 = call @add_outer_2d(%sf32) : (tensor<2x3xf32, #SparseMatrix>)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_insert_1d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_insert_1d.mlir
index 12e0d2267a26..a81ec172f599 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_insert_1d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_insert_1d.mlir
@@ -65,9 +65,9 @@ module {
     // CHECK-NEXT: nse = 4
     // CHECK-NEXT: dim = ( 1024 )
     // CHECK-NEXT: lvl = ( 1024 )
-    // CHECK-NEXT: pos[0] : ( 0, 4,
-    // CHECK-NEXT: crd[0] : ( 0, 1, 3, 1023,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4,
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 3, 1023 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %5 : tensor<1024xf32, #SparseVector>
@@ -86,9 +86,9 @@ module {
     // CHECK-NEXT: nse = 8
     // CHECK-NEXT: dim = ( 1024 )
     // CHECK-NEXT: lvl = ( 1024 )
-    // CHECK-NEXT: pos[0] : ( 0, 8,
-    // CHECK-NEXT: crd[0] : ( 0, 3, 6, 9, 12, 15, 18, 21,
-    // CHECK-NEXT: values : ( 1, 1, 1, 1, 1, 1, 1, 1,
+    // CHECK-NEXT: pos[0] : ( 0, 8 )
+    // CHECK-NEXT: crd[0] : ( 0, 3, 6, 9, 12, 15, 18, 21 )
+    // CHECK-NEXT: values : ( 1, 1, 1, 1, 1, 1, 1, 1 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %8 : tensor<1024xf32, #SparseVector>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_insert_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_insert_2d.mlir
index 883109150653..baab6e759886 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_insert_2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_insert_2d.mlir
@@ -68,7 +68,7 @@ module {
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 4, 3 )
     // CHECK-NEXT: lvl = ( 4, 3 )
-    // CHECK-NEXT: values : ( 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 4,
+    // CHECK-NEXT: values : ( 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 4 )
     // CHECK-NEXT: ----
     //
     %densea = tensor.empty() : tensor<4x3xf64, #Dense>
@@ -86,10 +86,10 @@ module {
     // CHECK-NEXT: nse = 4
     // CHECK-NEXT: dim = ( 4, 3 )
     // CHECK-NEXT: lvl = ( 4, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 4,
-    // CHECK-NEXT: crd[0] : ( 0, 2, 3, 3,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 0, 2,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4,
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 0, 2 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4 )
     // CHECK-NEXT: ----
     //
     %cooa = tensor.empty() : tensor<4x3xf64, #SortedCOO>
@@ -107,9 +107,9 @@ module {
     // CHECK-NEXT: nse = 4
     // CHECK-NEXT: dim = ( 4, 3 )
     // CHECK-NEXT: lvl = ( 4, 3 )
-    // CHECK-NEXT: pos[1] : ( 0, 1, 1, 2, 4,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 0, 2,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4,
+    // CHECK-NEXT: pos[1] : ( 0, 1, 1, 2, 4 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 0, 2 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4 )
     // CHECK-NEXT: ----
     //
     %csra = tensor.empty() : tensor<4x3xf64, #CSR>
@@ -127,11 +127,11 @@ module {
     // CHECK-NEXT: nse = 4
     // CHECK-NEXT: dim = ( 4, 3 )
     // CHECK-NEXT: lvl = ( 4, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 3,
-    // CHECK-NEXT: crd[0] : ( 0, 2, 3,
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 4,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 0, 2,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4,
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 4 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 0, 2 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4 )
     // CHECK-NEXT: ----
     //
     %dcsra = tensor.empty() : tensor<4x3xf64, #DCSR>
@@ -149,9 +149,9 @@ module {
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 4, 3 )
     // CHECK-NEXT: lvl = ( 4, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 3,
-    // CHECK-NEXT: crd[0] : ( 0, 2, 3,
-    // CHECK-NEXT: values : ( 1, 0, 0, 0, 0, 2, 3, 0, 4,
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3 )
+    // CHECK-NEXT: values : ( 1, 0, 0, 0, 0, 2, 3, 0, 4 )
     // CHECK-NEXT: ----
     //
     %rowa = tensor.empty() : tensor<4x3xf64, #Row>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_insert_3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_insert_3d.mlir
index db6612402357..12ef94fc2baa 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_insert_3d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_insert_3d.mlir
@@ -64,11 +64,11 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 5, 4, 3 )
     // CHECK-NEXT: lvl = ( 5, 4, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 3, 4
-    // CHECK-NEXT: pos[2] : ( 0, 2, 2, 2, 3, 3, 3, 4, 5
-    // CHECK-NEXT: crd[2] : ( 1, 2, 1, 2, 2
-    // CHECK-NEXT: values : ( 1.1, 2.2, 3.3, 4.4, 5.5
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 3, 4 )
+    // CHECK-NEXT: pos[2] : ( 0, 2, 2, 2, 3, 3, 3, 4, 5 )
+    // CHECK-NEXT: crd[2] : ( 1, 2, 1, 2, 2 )
+    // CHECK-NEXT: values : ( 1.1, 2.2, 3.3, 4.4, 5.5 )
     // CHECK-NEXT: ----
     %tensora = tensor.empty() : tensor<5x4x3xf64, #TensorCSR>
     %tensor1 = tensor.insert %f1 into %tensora[%c3, %c0, %c1] : tensor<5x4x3xf64, #TensorCSR>
@@ -83,11 +83,11 @@ module {
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 5, 4, 3 )
     // CHECK-NEXT: lvl = ( 5, 4, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 3, 4
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4
-    // CHECK-NEXT: crd[1] : ( 0, 3, 2, 3
-    // CHECK-NEXT: values : ( 0, 1.1, 2.2, 0, 3.3, 0, 0, 0, 4.4, 0, 0, 5.5
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 3, 4 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4 )
+    // CHECK-NEXT: crd[1] : ( 0, 3, 2, 3 )
+    // CHECK-NEXT: values : ( 0, 1.1, 2.2, 0, 3.3, 0, 0, 0, 4.4, 0, 0, 5.5 )
     // CHECK-NEXT: ----
     %rowa = tensor.empty() : tensor<5x4x3xf64, #TensorRow>
     %row1 = tensor.insert %f1 into %rowa[%c3, %c0, %c1] : tensor<5x4x3xf64, #TensorRow>
@@ -102,11 +102,11 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 5, 4, 3 )
     // CHECK-NEXT: lvl = ( 5, 4, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 3, 4
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 2, 3, 1, 2, 2, 3, 2
-    // CHECK-NEXT: values : ( 1.1, 2.2, 3.3, 4.4, 5.5
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 3, 4 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 2, 3, 1, 2, 2, 3, 2 )
+    // CHECK-NEXT: values : ( 1.1, 2.2, 3.3, 4.4, 5.5 )
     // CHECK-NEXT: ----
     %ccoo = tensor.empty() : tensor<5x4x3xf64, #CCoo>
     %ccoo1 = tensor.insert %f1 into %ccoo[%c3, %c0, %c1] : tensor<5x4x3xf64, #CCoo>
@@ -121,9 +121,9 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 5, 4, 3 )
     // CHECK-NEXT: lvl = ( 5, 4, 3 )
-    // CHECK-NEXT: pos[1] : ( 0, 0, 0, 0, 3, 5
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 2, 3, 1, 2, 2, 3, 2
-    // CHECK-NEXT: values : ( 1.1, 2.2, 3.3, 4.4, 5.5
+    // CHECK-NEXT: pos[1] : ( 0, 0, 0, 0, 3, 5 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 2, 3, 1, 2, 2, 3, 2 )
+    // CHECK-NEXT: values : ( 1.1, 2.2, 3.3, 4.4, 5.5 )
     // CHECK-NEXT: ----
     %dcoo = tensor.empty() : tensor<5x4x3xf64, #DCoo>
     %dcoo1 = tensor.insert %f1 into %dcoo[%c3, %c0, %c1] : tensor<5x4x3xf64, #DCoo>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_loose.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_loose.mlir
index c05a9f574269..416c137a1dc3 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_loose.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_loose.mlir
@@ -39,13 +39,16 @@ module {
     %s = sparse_tensor.convert %d : tensor<5x4xf64> to tensor<5x4xf64, #CSR_hi>
 
     //
+    // Note: position for loose_compressed level can vary in the end,
+    // therefore we loosly check it with {{.*}}.
+    //
     // CHECK:   ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 17
     // CHECK-NEXT: dim = ( 5, 4 )
     // CHECK-NEXT: lvl = ( 5, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 4, 4, 8, 8, 9, 9, 13
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 2, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 5.5, 9, 10, 11, 12, 13, 14, 15, 16
+    // CHECK-NEXT: pos[1] : ( 0, 4, 4, 8, 8, 9, 9, 13, {{.*}} )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 2, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 5.5, 9, 10, 11, 12, 13, 14, 15, 16 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %s : tensor<5x4xf64, #CSR_hi>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
index e505559037a9..14fa22a70134 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
@@ -146,9 +146,9 @@ module {
     // CHECK-NEXT: nse = 32
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[1] : ( 0, 8, 16, 24, 32
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
-    // CHECK-NEXT: values : ( 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3, 8.3, 1.4, 2.4, 3.4, 4.4, 5.4, 6.4, 7.4, 8.4
+    // CHECK-NEXT: pos[1] : ( 0, 8, 16, 24, 32 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 )
+    // CHECK-NEXT: values : ( 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3, 8.3, 1.4, 2.4, 3.4, 4.4, 5.4, 6.4, 7.4, 8.4 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %a1 : tensor<4x8xf64, #CSR>
@@ -158,11 +158,11 @@ module {
     // CHECK-NEXT: nse = 32
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 8, 16, 24, 32
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
-    // CHECK-NEXT: values : ( 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3, 8.3, 1.4, 2.4, 3.4, 4.4, 5.4, 6.4, 7.4, 8.4
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 8, 16, 24, 32 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 )
+    // CHECK-NEXT: values : ( 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3, 8.3, 1.4, 2.4, 3.4, 4.4, 5.4, 6.4, 7.4, 8.4 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %a2 : tensor<4x8xf64, #DCSR>
@@ -172,9 +172,9 @@ module {
     // CHECK-NEXT: nse = 4
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 2, 3, 4
-    // CHECK-NEXT: crd[1] : ( 1, 5, 1, 7
-    // CHECK-NEXT: values : ( 2.1, 6.1, 2.3, 1
+    // CHECK-NEXT: pos[1] : ( 0, 2, 2, 3, 4 )
+    // CHECK-NEXT: crd[1] : ( 1, 5, 1, 7 )
+    // CHECK-NEXT: values : ( 2.1, 6.1, 2.3, 1 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %a3 : tensor<4x8xf64, #CSR>
@@ -184,11 +184,11 @@ module {
     // CHECK-NEXT: nse = 4
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 4
-    // CHECK-NEXT: crd[1] : ( 1, 5, 1, 7
-    // CHECK-NEXT: values : ( 2.1, 6.1, 2.3, 1
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 4 )
+    // CHECK-NEXT: crd[1] : ( 1, 5, 1, 7 )
+    // CHECK-NEXT: values : ( 2.1, 6.1, 2.3, 1 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %a4 : tensor<4x8xf64, #DCSR>
@@ -198,9 +198,9 @@ module {
     // CHECK-NEXT: nse = 32
     // CHECK-NEXT: dim = ( 8, 4 )
     // CHECK-NEXT: lvl = ( 8, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16, 20, 24, 28, 32
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 10.1, 11.1, 12.1, 13.1, 10.2, 11.2, 12.2, 13.2, 10.3, 11.3, 12.3, 13.3, 10.4, 11.4, 12.4, 13.4, 10.5, 11.5, 12.5, 13.5, 10.6, 11.6, 12.6, 13.6, 10.7, 11.7, 12.7, 13.7, 10.8, 11.8, 12.8, 13.8
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16, 20, 24, 28, 32 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 10.1, 11.1, 12.1, 13.1, 10.2, 11.2, 12.2, 13.2, 10.3, 11.3, 12.3, 13.3, 10.4, 11.4, 12.4, 13.4, 10.5, 11.5, 12.5, 13.5, 10.6, 11.6, 12.6, 13.6, 10.7, 11.7, 12.7, 13.7, 10.8, 11.8, 12.8, 13.8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %b1 : tensor<8x4xf64, #CSR>
@@ -210,11 +210,11 @@ module {
     // CHECK-NEXT: nse = 32
     // CHECK-NEXT: dim = ( 8, 4 )
     // CHECK-NEXT: lvl = ( 8, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 8
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16, 20, 24, 28, 32
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 10.1, 11.1, 12.1, 13.1, 10.2, 11.2, 12.2, 13.2, 10.3, 11.3, 12.3, 13.3, 10.4, 11.4, 12.4, 13.4, 10.5, 11.5, 12.5, 13.5, 10.6, 11.6, 12.6, 13.6, 10.7, 11.7, 12.7, 13.7, 10.8, 11.8, 12.8, 13.8
+    // CHECK-NEXT: pos[0] : ( 0, 8 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7 )
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16, 20, 24, 28, 32 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 10.1, 11.1, 12.1, 13.1, 10.2, 11.2, 12.2, 13.2, 10.3, 11.3, 12.3, 13.3, 10.4, 11.4, 12.4, 13.4, 10.5, 11.5, 12.5, 13.5, 10.6, 11.6, 12.6, 13.6, 10.7, 11.7, 12.7, 13.7, 10.8, 11.8, 12.8, 13.8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %b2 : tensor<8x4xf64, #DCSR>
@@ -224,9 +224,9 @@ module {
     // CHECK-NEXT: nse = 8
     // CHECK-NEXT: dim = ( 8, 4 )
     // CHECK-NEXT: lvl = ( 8, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 3, 4, 4, 5, 6, 8
-    // CHECK-NEXT: crd[1] : ( 3, 2, 1, 0, 1, 2, 2, 3
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 3, 4, 4, 5, 6, 8 )
+    // CHECK-NEXT: crd[1] : ( 3, 2, 1, 0, 1, 2, 2, 3 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %b3 : tensor<8x4xf64, #CSR>
@@ -236,11 +236,11 @@ module {
     // CHECK-NEXT: nse = 8
     // CHECK-NEXT: dim = ( 8, 4 )
     // CHECK-NEXT: lvl = ( 8, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 7
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 5, 6, 7
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 3, 4, 5, 6, 8
-    // CHECK-NEXT: crd[1] : ( 3, 2, 1, 0, 1, 2, 2, 3
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: pos[0] : ( 0, 7 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 5, 6, 7 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 3, 4, 5, 6, 8 )
+    // CHECK-NEXT: crd[1] : ( 3, 2, 1, 0, 1, 2, 2, 3 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %b4 : tensor<8x4xf64, #DCSR>
@@ -289,9 +289,9 @@ module {
     // CHECK-NEXT: nse = 16
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 388.76, 425.56, 462.36, 499.16, 397.12, 434.72, 472.32, 509.92, 405.48, 443.88, 482.28, 520.68, 413.84, 453.04, 492.24, 531.44
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 388.76, 425.56, 462.36, 499.16, 397.12, 434.72, 472.32, 509.92, 405.48, 443.88, 482.28, 520.68, 413.84, 453.04, 492.24, 531.44 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %1 : tensor<4x4xf64, #CSR>
@@ -301,11 +301,11 @@ module {
     // CHECK-NEXT: nse = 16
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 388.76, 425.56, 462.36, 499.16, 397.12, 434.72, 472.32, 509.92, 405.48, 443.88, 482.28, 520.68, 413.84, 453.04, 492.24, 531.44
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 388.76, 425.56, 462.36, 499.16, 397.12, 434.72, 472.32, 509.92, 405.48, 443.88, 482.28, 520.68, 413.84, 453.04, 492.24, 531.44 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %2 : tensor<4x4xf64, #DCSR>
@@ -324,9 +324,9 @@ module {
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 4, 4, 8, 12
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 86.08, 94.28, 102.48, 110.68, 23.46, 25.76, 28.06, 30.36, 10.8, 11.8, 12.8, 13.8
+    // CHECK-NEXT: pos[1] : ( 0, 4, 4, 8, 12 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 86.08, 94.28, 102.48, 110.68, 23.46, 25.76, 28.06, 30.36, 10.8, 11.8, 12.8, 13.8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %4 : tensor<4x4xf64, #CSR>
@@ -336,11 +336,11 @@ module {
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 86.08, 94.28, 102.48, 110.68, 23.46, 25.76, 28.06, 30.36, 10.8, 11.8, 12.8, 13.8
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 86.08, 94.28, 102.48, 110.68, 23.46, 25.76, 28.06, 30.36, 10.8, 11.8, 12.8, 13.8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %5 : tensor<4x4xf64, #DCSR>
@@ -359,9 +359,9 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 2, 3, 5
-    // CHECK-NEXT: crd[1] : ( 1, 2, 2, 2, 3
-    // CHECK-NEXT: values : ( 30.5, 4.2, 4.6, 7, 8
+    // CHECK-NEXT: pos[1] : ( 0, 2, 2, 3, 5 )
+    // CHECK-NEXT: crd[1] : ( 1, 2, 2, 2, 3 )
+    // CHECK-NEXT: values : ( 30.5, 4.2, 4.6, 7, 8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %7 : tensor<4x4xf64, #CSR>
@@ -371,11 +371,11 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 5
-    // CHECK-NEXT: crd[1] : ( 1, 2, 2, 2, 3
-    // CHECK-NEXT: values : ( 30.5, 4.2, 4.6, 7, 8
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 5 )
+    // CHECK-NEXT: crd[1] : ( 1, 2, 2, 2, 3 )
+    // CHECK-NEXT: values : ( 30.5, 4.2, 4.6, 7, 8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %8 : tensor<4x4xf64, #DCSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul_slice.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul_slice.mlir
index 58e96d1fa51f..c76bf2ccfe35 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul_slice.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul_slice.mlir
@@ -174,11 +174,11 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 5
-    // CHECK-NEXT: crd[1] : ( 1, 2, 2, 2, 3
-    // CHECK-NEXT: values : ( 30.5, 4.2, 4.6, 7, 8
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 5 )
+    // CHECK-NEXT: crd[1] : ( 1, 2, 2, 2, 3 )
+    // CHECK-NEXT: values : ( 30.5, 4.2, 4.6, 7, 8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %2 : tensor<4x4xf64, #DCSR>
@@ -196,9 +196,9 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 2, 3, 5
-    // CHECK-NEXT: crd[1] : ( 1, 2, 2, 2, 3
-    // CHECK-NEXT: values : ( 30.5, 4.2, 4.6, 7, 8
+    // CHECK-NEXT: pos[1] : ( 0, 2, 2, 3, 5 )
+    // CHECK-NEXT: crd[1] : ( 1, 2, 2, 2, 3 )
+    // CHECK-NEXT: values : ( 30.5, 4.2, 4.6, 7, 8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %3 : tensor<4x4xf64, #CSR>
@@ -210,9 +210,9 @@ module {
     // CHECK-NEXT: nse = 3
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 2, 3
-    // CHECK-NEXT: crd[1] : ( 0, 0, 0
-    // CHECK-NEXT: values : ( 2.3, 6.9, 12.6
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 2, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 0, 0 )
+    // CHECK-NEXT: values : ( 2.3, 6.9, 12.6 )
     // CHECK-NEXT: ----
     //
     %s1 = tensor.extract_slice %tmp[0, 1][4, 4][2, 1] : tensor<8x8xf64, #DCSR> to tensor<4x4xf64, #DCSR_SLICE_1>
@@ -228,9 +228,9 @@ module {
     // CHECK-NEXT: nse = 3
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 0, 1, 0, 3, 0
-    // CHECK-NEXT: values : ( 2.3, 6.9, 12.6
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 0, 1, 0, 3, 0 )
+    // CHECK-NEXT: values : ( 2.3, 6.9, 12.6 )
     // CHECK-NEXT: ----
     //
     %t1_coo = sparse_tensor.convert %sa : tensor<8x8xf64> to tensor<8x8xf64, #COO>
@@ -246,9 +246,9 @@ module {
     // CHECK-NEXT: nse = 3
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 2, 3
-    // CHECK-NEXT: crd[1] : ( 0, 0, 0
-    // CHECK-NEXT: values : ( 2.3, 6.9, 12.6
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 2, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 0, 0 )
+    // CHECK-NEXT: values : ( 2.3, 6.9, 12.6 )
     // CHECK-NEXT: ----
     //
     %s1_dyn = tensor.extract_slice %tmp[%c_0, %c_1][4, 4][%c_2, %c_1] : tensor<8x8xf64, #DCSR> to tensor<4x4xf64, #DCSR_SLICE_dyn>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matrix_ops.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matrix_ops.mlir
index 8ea26fa3efdf..770c4f55a280 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matrix_ops.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matrix_ops.mlir
@@ -163,11 +163,11 @@ module {
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9
-    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 2, 4, 7, 0, 2, 3
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 2, 4, 7, 0, 2, 3 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %sm1 : tensor<?x?xf64, #DCSR>
@@ -177,11 +177,11 @@ module {
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6
-    // CHECK-NEXT: crd[1] : ( 0, 7, 0, 6, 1, 7
-    // CHECK-NEXT: values : ( 6, 5, 4, 3, 2, 1
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 7, 0, 6, 1, 7 )
+    // CHECK-NEXT: values : ( 6, 5, 4, 3, 2, 1 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %sm2 : tensor<?x?xf64, #DCSR>
@@ -191,11 +191,11 @@ module {
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9
-    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 2, 4, 7, 0, 2, 3
-    // CHECK-NEXT: values : ( 2, 4, 6, 8, 10, 12, 14, 16, 18
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 2, 4, 7, 0, 2, 3 )
+    // CHECK-NEXT: values : ( 2, 4, 6, 8, 10, 12, 14, 16, 18 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?x?xf64, #DCSR>
@@ -205,11 +205,11 @@ module {
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9
-    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 2, 4, 7, 0, 2, 3
-    // CHECK-NEXT: values : ( 2, 4, 6, 8, 10, 12, 14, 16, 18
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 2, 4, 7, 0, 2, 3 )
+    // CHECK-NEXT: values : ( 2, 4, 6, 8, 10, 12, 14, 16, 18 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %1 : tensor<?x?xf64, #DCSR>
@@ -219,11 +219,11 @@ module {
     // CHECK-NEXT: nse = 13
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 3, 6, 10, 13
-    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 0, 6, 7, 1, 2, 4, 7, 0, 2, 3
-    // CHECK-NEXT: values : ( 8, 4, 5, 4, 3, 6, 2, 8, 10, 13, 14, 16, 18
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 6, 10, 13 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 0, 6, 7, 1, 2, 4, 7, 0, 2, 3 )
+    // CHECK-NEXT: values : ( 8, 4, 5, 4, 3, 6, 2, 8, 10, 13, 14, 16, 18 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %2 : tensor<?x?xf64, #DCSR>
@@ -233,11 +233,11 @@ module {
     // CHECK-NEXT: nse = 2
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 2
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2
-    // CHECK-NEXT: crd[1] : ( 0, 7
-    // CHECK-NEXT: values : ( 12, 12
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2 )
+    // CHECK-NEXT: crd[1] : ( 0, 7 )
+    // CHECK-NEXT: values : ( 12, 12 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %3 : tensor<?x?xf64, #DCSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_mult_elt.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_mult_elt.mlir
index c30c6b9b5cc2..683be61be2a2 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_mult_elt.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_mult_elt.mlir
@@ -88,11 +88,11 @@ module {
     // CHECK-NEXT: nse = 2
     // CHECK-NEXT: dim = ( 32, 16 )
     // CHECK-NEXT: lvl = ( 32, 16 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 2, 31
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2
-    // CHECK-NEXT: crd[1] : ( 2, 0
-    // CHECK-NEXT: values : ( 14, 20
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 2, 31 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2 )
+    // CHECK-NEXT: crd[1] : ( 2, 0 )
+    // CHECK-NEXT: values : ( 14, 20 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<32x16xf32, #DCSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_reduction.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_reduction.mlir
index 74f0e7698bc1..8eadc348020f 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_reduction.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_reduction.mlir
@@ -95,11 +95,11 @@ module {
     // CHECK-NEXT: nse = 2
     // CHECK-NEXT: dim = ( 3, 3 )
     // CHECK-NEXT: lvl = ( 3, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 1, 2
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2
-    // CHECK-NEXT: crd[1] : ( 1, 2
-    // CHECK-NEXT: values : ( 7, 69
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 1, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2 )
+    // CHECK-NEXT: crd[1] : ( 1, 2 )
+    // CHECK-NEXT: values : ( 7, 69 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?x?xi32, #SparseMatrix>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_simple.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_simple.mlir
index 88513c80219a..334ffe492952 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_simple.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_simple.mlir
@@ -87,11 +87,11 @@ module {
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 5, 5 )
     // CHECK-NEXT: lvl = ( 5, 5 )
-    // CHECK-NEXT: pos[0] : ( 0, 5
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 5, 7, 9
-    // CHECK-NEXT: crd[1] : ( 0, 3, 1, 4, 2, 0, 3, 1, 4
-    // CHECK-NEXT: values : ( 1, 1.96, 4, 6.25, 9, 16.81, 16, 27.04, 25
+    // CHECK-NEXT: pos[0] : ( 0, 5 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 5, 7, 9 )
+    // CHECK-NEXT: crd[1] : ( 0, 3, 1, 4, 2, 0, 3, 1, 4 )
+    // CHECK-NEXT: values : ( 1, 1.96, 4, 6.25, 9, 16.81, 16, 27.04, 25 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?x?xf64, #DCSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir
index 20ae7e86285c..b48ff9c9df74 100755
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir
@@ -29,7 +29,7 @@
   crdWidth = 32
 }>
 
-#BatchedCSR = #sparse_tensor.encoding<{
+#DenseCSR = #sparse_tensor.encoding<{
   map = (d0, d1, d2) -> (d0 : dense, d1 : dense, d2 : compressed),
   posWidth = 64,
   crdWidth = 32
@@ -42,7 +42,7 @@
 }>
 
 //
-// Test assembly operation with CCC, batched-CSR and CSR-dense.
+// Test assembly operation with CCC, dense-CSR and CSR-dense.
 //
 module {
   //
@@ -77,7 +77,7 @@ module {
         tensor<6xi64>, tensor<8xi32>), tensor<8xf32> to tensor<4x3x2xf32, #CCC>
 
     //
-    // Setup BatchedCSR.
+    // Setup DenseCSR.
     //
 
     %data1 = arith.constant dense<
@@ -88,7 +88,7 @@ module {
     %crd1 = arith.constant dense<
        [ 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1]> : tensor<16xi32>
 
-    %s1 = sparse_tensor.assemble (%pos1, %crd1), %data1 : (tensor<13xi64>, tensor<16xi32>), tensor<16xf32> to tensor<4x3x2xf32, #BatchedCSR>
+    %s1 = sparse_tensor.assemble (%pos1, %crd1), %data1 : (tensor<13xi64>, tensor<16xi32>), tensor<16xf32> to tensor<4x3x2xf32, #DenseCSR>
 
     //
     // Setup CSRDense.
@@ -111,33 +111,33 @@ module {
     // CHECK-NEXT: nse = 8
     // CHECK-NEXT: dim = ( 4, 3, 2 )
     // CHECK-NEXT: lvl = ( 4, 3, 2 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 0, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 5
-    // CHECK-NEXT: crd[1] : ( 0, 1, 1, 2, 1
-    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 5, 7, 8
-    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 0, 1, 0
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 5 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 1, 2, 1 )
+    // CHECK-NEXT: pos[2] : ( 0, 2, 4, 5, 7, 8 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1, 0, 0, 1, 0 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 16
     // CHECK-NEXT: dim = ( 4, 3, 2 )
     // CHECK-NEXT: lvl = ( 4, 3, 2 )
-    // CHECK-NEXT: pos[2] : ( 0, 2, 3, 4, 6, 6, 7, 9, 11, 13, 14, 15, 16
-    // CHECK-NEXT: crd[2] : ( 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+    // CHECK-NEXT: pos[2] : ( 0, 2, 3, 4, 6, 6, 7, 9, 11, 13, 14, 15, 16 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 22
     // CHECK-NEXT: dim = ( 4, 3, 2 )
     // CHECK-NEXT: lvl = ( 4, 3, 2 )
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 8, 11
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 2, 0, 1, 2, 0, 1, 2
-    // CHECK-NEXT: values : ( 1, 2, 0, 3, 4, 0, 5, 6, 0, 7, 8, 9, 10, 11, 12, 13, 14, 0, 0, 15, 0, 16
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 8, 11 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 2, 0, 1, 2, 0, 1, 2 )
+    // CHECK-NEXT: values : ( 1, 2, 0, 3, 4, 0, 5, 6, 0, 7, 8, 9, 10, 11, 12, 13, 14, 0, 0, 15, 0, 16 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %s0 : tensor<4x3x2xf32, #CCC>
-    sparse_tensor.print %s1 : tensor<4x3x2xf32, #BatchedCSR>
+    sparse_tensor.print %s1 : tensor<4x3x2xf32, #DenseCSR>
     sparse_tensor.print %s2 : tensor<4x3x2xf32, #CSRDense>
 
     // TODO: This check is no longer needed once the codegen path uses the
@@ -148,7 +148,7 @@ module {
       // sparse_tensor.assemble copies buffers when running with the runtime
       // library. Deallocations are not needed when running in codegen mode.
       bufferization.dealloc_tensor %s0 : tensor<4x3x2xf32, #CCC>
-      bufferization.dealloc_tensor %s1 : tensor<4x3x2xf32, #BatchedCSR>
+      bufferization.dealloc_tensor %s1 : tensor<4x3x2xf32, #DenseCSR>
       bufferization.dealloc_tensor %s2 : tensor<4x3x2xf32, #CSRDense>
     }
 
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pooling_nhwc.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pooling_nhwc.mlir
index 39699fbdb14e..7c78bfc36200 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pooling_nhwc.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pooling_nhwc.mlir
@@ -80,15 +80,15 @@ func.func @main() {
   // CHECK-NEXT: nse = 9
   // CHECK-NEXT: dim = ( 1, 3, 3, 1 )
   // CHECK-NEXT: lvl = ( 1, 3, 3, 1 )
-  // CHECK-NEXT: pos[0] : ( 0, 1
-  // CHECK-NEXT: crd[0] : ( 0
-  // CHECK-NEXT: pos[1] : ( 0, 3
-  // CHECK-NEXT: crd[1] : ( 0, 1, 2
-  // CHECK-NEXT: pos[2] : ( 0, 3, 6, 9
-  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 0, 1, 2, 0, 1, 2
-  // CHECK-NEXT: pos[3] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
-  // CHECK-NEXT: crd[3] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0
-  // CHECK-NEXT: values : ( 6, 6, 6, 6, 6, 6, 6, 6, 6
+  // CHECK-NEXT: pos[0] : ( 0, 1 )
+  // CHECK-NEXT: crd[0] : ( 0 )
+  // CHECK-NEXT: pos[1] : ( 0, 3 )
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2 )
+  // CHECK-NEXT: pos[2] : ( 0, 3, 6, 9 )
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 0, 1, 2, 0, 1, 2 )
+  // CHECK-NEXT: pos[3] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 )
+  // CHECK-NEXT: crd[3] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+  // CHECK-NEXT: values : ( 6, 6, 6, 6, 6, 6, 6, 6, 6 )
   // CHECK-NEXT: ----
   //
   sparse_tensor.print %CCCC_ret : tensor<1x3x3x1xf32, #CCCC>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_print.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_print.mlir
index 7758ca77dce9..f3c721535e75 100755
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_print.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_print.mlir
@@ -147,7 +147,7 @@ module {
     // CHECK-NEXT: nse = 32
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: values : ( 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 5, 0, 0,
+    // CHECK-NEXT: values : ( 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 5, 0, 0 )
     // CHECK-NEXT: ----
     sparse_tensor.print %XO : tensor<4x8xi32, #AllDense>
 
@@ -155,7 +155,7 @@ module {
     // CHECK-NEXT: nse = 32
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 8, 4 )
-    // CHECK-NEXT: values : ( 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0,
+    // CHECK-NEXT: values : ( 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0 )
     // CHECK-NEXT: ----
     sparse_tensor.print %XT : tensor<4x8xi32, #AllDenseT>
 
@@ -176,9 +176,9 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 2, 2, 5,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 2, 3, 5,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5,
+    // CHECK-NEXT: pos[1] : ( 0, 2, 2, 2, 5 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 2, 3, 5 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5 )
     // CHECK-NEXT: ----
     sparse_tensor.print %a : tensor<4x8xi32, #CSR>
 
@@ -186,11 +186,11 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 2,
-    // CHECK-NEXT: crd[0] : ( 0, 3,
-    // CHECK-NEXT: pos[1] : ( 0, 2, 5,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 2, 3, 5,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5,
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 5 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 2, 3, 5 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5 )
     // CHECK-NEXT: ----
     sparse_tensor.print %b : tensor<4x8xi32, #DCSR>
 
@@ -198,9 +198,9 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 8, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 1, 1, 3, 4, 4, 5, 5, 5,
-    // CHECK-NEXT: crd[1] : ( 0, 0, 3, 3, 3,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5,
+    // CHECK-NEXT: pos[1] : ( 0, 1, 1, 3, 4, 4, 5, 5, 5 )
+    // CHECK-NEXT: crd[1] : ( 0, 0, 3, 3, 3 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5 )
     // CHECK-NEXT: ----
     sparse_tensor.print %c : tensor<4x8xi32, #CSC>
 
@@ -208,11 +208,11 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 8, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 4,
-    // CHECK-NEXT: crd[0] : ( 0, 2, 3, 5,
-    // CHECK-NEXT: pos[1] : ( 0, 1, 3, 4, 5,
-    // CHECK-NEXT: crd[1] : ( 0, 0, 3, 3, 3,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5,
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3, 5 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 3, 4, 5 )
+    // CHECK-NEXT: crd[1] : ( 0, 0, 3, 3, 3 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5 )
     // CHECK-NEXT: ----
     sparse_tensor.print %d : tensor<4x8xi32, #DCSC>
 
@@ -220,11 +220,11 @@ module {
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 2, 2, 2, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 2,
-    // CHECK-NEXT: crd[0] : ( 0, 1,
-    // CHECK-NEXT: pos[1] : ( 0, 1, 3,
-    // CHECK-NEXT: crd[1] : ( 0, 0, 1,
-    // CHECK-NEXT: values : ( 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 5, 0, 0,
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 1 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 0, 1 )
+    // CHECK-NEXT: values : ( 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 5, 0, 0 )
     // CHECK-NEXT: ----
     sparse_tensor.print %e : tensor<4x8xi32, #BSR>
 
@@ -232,11 +232,11 @@ module {
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 2, 2, 4, 2 )
-    // CHECK-NEXT: pos[0] : ( 0, 2,
-    // CHECK-NEXT: crd[0] : ( 0, 1,
-    // CHECK-NEXT: pos[1] : ( 0, 1, 3,
-    // CHECK-NEXT: crd[1] : ( 0, 0, 1,
-    // CHECK-NEXT: values : ( 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0,
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 1 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 0, 1 )
+    // CHECK-NEXT: values : ( 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0 )
     // CHECK-NEXT: ----
     sparse_tensor.print %f : tensor<4x8xi32, #BSRC>
 
@@ -244,11 +244,11 @@ module {
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 2, 2, 2, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 2,
-    // CHECK-NEXT: crd[0] : ( 0, 1,
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3,
-    // CHECK-NEXT: crd[1] : ( 0, 1, 1,
-    // CHECK-NEXT: values : ( 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 5, 0, 0,
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 1 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 1 )
+    // CHECK-NEXT: values : ( 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 5, 0, 0 )
     // CHECK-NEXT: ----
     sparse_tensor.print %g : tensor<4x8xi32, #BSC>
 
@@ -256,11 +256,11 @@ module {
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 2, 2, 4, 2 )
-    // CHECK-NEXT: pos[0] : ( 0, 2,
-    // CHECK-NEXT: crd[0] : ( 0, 1,
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3,
-    // CHECK-NEXT: crd[1] : ( 0, 1, 1,
-    // CHECK-NEXT: values : ( 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0,
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 1 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 1 )
+    // CHECK-NEXT: values : ( 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0 )
     // CHECK-NEXT: ----
     sparse_tensor.print %h : tensor<4x8xi32, #BSCC>
 
@@ -268,9 +268,9 @@ module {
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 2, 2, 2, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 1, 3,
-    // CHECK-NEXT: crd[1] : ( 0, 0, 1,
-    // CHECK-NEXT: values : ( 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 5, 0, 0,
+    // CHECK-NEXT: pos[1] : ( 0, 1, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 0, 1 )
+    // CHECK-NEXT: values : ( 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 5, 0, 0 )
     // CHECK-NEXT: ----
     sparse_tensor.print %i : tensor<4x8xi32, #BSR0>
 
@@ -278,9 +278,9 @@ module {
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 2, 2, 2, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3,
-    // CHECK-NEXT: crd[1] : ( 0, 1, 1,
-    // CHECK-NEXT: values : ( 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 5, 0, 0,
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 1 )
+    // CHECK-NEXT: values : ( 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 5, 0, 0 )
     // CHECK-NEXT: ----
     sparse_tensor.print %j : tensor<4x8xi32, #BSC0>
 
@@ -288,9 +288,9 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 5,
-    // CHECK-NEXT: crd[0] : ( 0, 0, 0, 2, 3, 2, 3, 3, 3, 5,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5,
+    // CHECK-NEXT: pos[0] : ( 0, 5 )
+    // CHECK-NEXT: crd[0] : ( 0, 0, 0, 2, 3, 2, 3, 3, 3, 5 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5 )
     // CHECK-NEXT: ----
     sparse_tensor.print %AoS : tensor<4x8xi32, #COOAoS>
 
@@ -298,10 +298,10 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 5,
-    // CHECK-NEXT: crd[0] : ( 0, 0, 3, 3, 3,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 2, 3, 5,
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5,
+    // CHECK-NEXT: pos[0] : ( 0, 5 )
+    // CHECK-NEXT: crd[0] : ( 0, 0, 3, 3, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 2, 3, 5 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5 )
     // CHECK-NEXT: ----
     sparse_tensor.print %SoA : tensor<4x8xi32, #COOSoA>
 
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_print_3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_print_3d.mlir
new file mode 100755
index 000000000000..4f1e4312d7bc
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_print_3d.mlir
@@ -0,0 +1,74 @@
+//--------------------------------------------------------------------------------------------------
+// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
+//
+// Set-up that's shared across all tests in this directory. In principle, this
+// config could be moved to lit.local.cfg. However, there are downstream users that
+//  do not use these LIT config files. Hence why this is kept inline.
+//
+// DEFINE: %{sparsifier_opts} = enable-runtime-library=true
+// DEFINE: %{sparsifier_opts_sve} = enable-arm-sve=true %{sparsifier_opts}
+// DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
+// DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
+// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
+// DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
+// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
+//
+// DEFINE: %{env} =
+//--------------------------------------------------------------------------------------------------
+
+// TODO: make this work with libgen
+
+// Do the same run, but now with direct IR generation.
+// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+
+#BatchedCSR = #sparse_tensor.encoding<{
+  map = (d0, d1, d2) -> (d0 : batch, d1 : dense, d2 : compressed)
+}>
+
+module {
+
+  //
+  // Main driver that tests 3-D sparse tensor printing.
+  //
+  func.func @main() {
+
+    %pos = arith.constant dense<
+      [[ 0, 8, 16, 24, 32],
+       [ 0, 8, 16, 24, 32]]
+    > : tensor<2x5xindex>
+
+    %crd = arith.constant dense<
+      [[0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7],
+       [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]]
+    > : tensor<2x32xindex>
+
+    %val = arith.constant dense<
+      [[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.,
+        12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22.,
+        23., 24., 25., 26., 27., 28., 29., 30., 31., 32.],
+       [33., 34., 35., 36., 37., 38., 39., 40., 41., 42., 43.,
+        44., 45., 46., 47., 48., 49., 50., 51., 52., 53., 54.,
+        55., 56., 57., 58., 59., 60., 61., 62., 63., 64.]]
+    > : tensor<2x32xf64>
+
+    %X = sparse_tensor.assemble (%pos, %crd), %val
+      : (tensor<2x5xindex>, tensor<2x32xindex>), tensor<2x32xf64> to tensor<2x4x8xf64, #BatchedCSR>
+
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 32
+    // CHECK-NEXT: dim = ( 2, 4, 8 )
+    // CHECK-NEXT: lvl = ( 2, 4, 8 )
+    // CHECK-NEXT: pos[2] : ( ( 0, 8, 16, 24, 32 )( 0, 8, 16, 24, 32 ) )
+    // CHECK-NEXT: crd[2] : ( ( 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 )
+    // CHECK-SAME:            ( 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 ) )
+    // CHECK-NEXT: values : ( ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 )
+    // CHECK-SAME:            ( 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64 ) )
+    // CHECK-NEXT: ----
+    sparse_tensor.print %X : tensor<2x4x8xf64, #BatchedCSR>
+
+    return
+  }
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_re_im.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_re_im.mlir
index 7bacbe3b87e4..fc23fe501fcf 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_re_im.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_re_im.mlir
@@ -93,18 +93,18 @@ module {
     // CHECK-NEXT: nse = 3
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 3,
-    // CHECK-NEXT: crd[0] : ( 0, 20, 31,
-    // CHECK-NEXT: values : ( 5.13, 3, 5,
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 20, 31 )
+    // CHECK-NEXT: values : ( 5.13, 3, 5 )
     // CHECK-NEXT: ----
     //
     // CHECK-NEXT: ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 3
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 3,
-    // CHECK-NEXT: crd[0] : ( 0, 20, 31,
-    // CHECK-NEXT: values : ( 2, 4, 6,
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 0, 20, 31 )
+    // CHECK-NEXT: values : ( 2, 4, 6 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?xf32, #SparseVector>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom.mlir
index a927a5dfb94b..5da028c3685c 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom.mlir
@@ -144,33 +144,33 @@ module {
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 4, 5 )
     // CHECK-NEXT: lvl = ( 4, 5 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 2, 3, 4, 0, 2, 3
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 2, 3, 4, 0, 2, 3 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 5, 4 )
     // CHECK-NEXT: lvl = ( 5, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 4, 5, 6
-    // CHECK-NEXT: crd[1] : ( 0, 3, 0, 3, 1, 1
-    // CHECK-NEXT: values : ( 6, 5, 4, 3, 2, 11
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 4, 5, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 3, 0, 3, 1, 1 )
+    // CHECK-NEXT: values : ( 6, 5, 4, 3, 2, 11 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9
-    // CHECK-NEXT: crd[1] : ( 0, 3, 0, 0, 1, 3, 0, 1, 3
-    // CHECK-NEXT: values : ( 7, 7, 9, 8, 7, 7, 12, 11, 11
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9 )
+    // CHECK-NEXT: crd[1] : ( 0, 3, 0, 0, 1, 3, 0, 1, 3 )
+    // CHECK-NEXT: values : ( 7, 7, 9, 8, 7, 7, 12, 11, 11 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 4, 4 )
     // CHECK-NEXT: lvl = ( 4, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9
-    // CHECK-NEXT: crd[1] : ( 0, 3, 0, 0, 1, 3, 0, 1, 3
-    // CHECK-NEXT: values : ( 7, 7, 9, 8, 7, 7, 12, 11, 11
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9 )
+    // CHECK-NEXT: crd[1] : ( 0, 3, 0, 0, 1, 3, 0, 1, 3 )
+    // CHECK-NEXT: values : ( 7, 7, 9, 8, 7, 7, 12, 11, 11 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %sm1 : tensor<?x?xf64, #CSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom_prod.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom_prod.mlir
index 18bf6a71c530..d32a92e337ba 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom_prod.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom_prod.mlir
@@ -118,33 +118,33 @@ module {
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 4, 5 )
     // CHECK-NEXT: lvl = ( 4, 5 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 2, 3, 4, 0, 2, 3
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 2, 3, 4, 0, 2, 3 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 5, 4 )
     // CHECK-NEXT: lvl = ( 5, 4 )
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 4, 5, 6
-    // CHECK-NEXT: crd[1] : ( 0, 3, 0, 3, 1, 1
-    // CHECK-NEXT: values : ( 6, 5, 4, 3, 2, 11
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 4, 5, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 3, 0, 3, 1, 1 )
+    // CHECK-NEXT: values : ( 6, 5, 4, 3, 2, 11 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 4
     // CHECK-NEXT: dim = ( 4 )
     // CHECK-NEXT: lvl = ( 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 2, 3, 120, 504
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 2, 3, 120, 504 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 5 )
     // CHECK-NEXT: lvl = ( 5 )
-    // CHECK-NEXT: pos[0] : ( 0, 5
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4
-    // CHECK-NEXT: values : ( 6, 5, 12, 2, 11
+    // CHECK-NEXT: pos[0] : ( 0, 5 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4 )
+    // CHECK-NEXT: values : ( 6, 5, 12, 2, 11 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %sm1 : tensor<?x?xf64, #CSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reshape.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reshape.mlir
index 4c26ebe6e401..317fe0f225ee 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reshape.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reshape.mlir
@@ -81,31 +81,31 @@ module {
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 2, 6 )
     // CHECK-NEXT: lvl = ( 2, 6 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 1
-    // CHECK-NEXT: pos[1] : ( 0, 3, 6
-    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 0, 2, 4
-    // CHECK-NEXT: values : ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 1 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 0, 2, 4 )
+    // CHECK-NEXT: values : ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 12 )
     // CHECK-NEXT: lvl = ( 12 )
-    // CHECK-NEXT: pos[0] : ( 0, 6
-    // CHECK-NEXT: crd[0] : ( 0, 2, 4, 6, 8, 10
-    // CHECK-NEXT: values : ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3
+    // CHECK-NEXT: pos[0] : ( 0, 6 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4, 6, 8, 10 )
+    // CHECK-NEXT: values : ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 2, 3, 2 )
     // CHECK-NEXT: lvl = ( 2, 3, 2 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 1
-    // CHECK-NEXT: pos[1] : ( 0, 3, 6
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2
-    // CHECK-NEXT: pos[2] : ( 0, 1, 2, 3, 4, 5, 6
-    // CHECK-NEXT: crd[2] : ( 0, 0, 0, 0, 0, 0
-    // CHECK-NEXT: values : ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 1 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2 )
+    // CHECK-NEXT: pos[2] : ( 0, 1, 2, 3, 4, 5, 6 )
+    // CHECK-NEXT: crd[2] : ( 0, 0, 0, 0, 0, 0 )
+    // CHECK-NEXT: values : ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %reshaped0: tensor<2x6xf64, #SparseMatrix>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_mm_fusion.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_mm_fusion.mlir
index 20a8c5f812de..eecd970e01ac 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_mm_fusion.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_mm_fusion.mlir
@@ -211,22 +211,22 @@ module {
     // CHECK-NEXT: nse = 2
     // CHECK-NEXT: dim = ( 8, 8 )
     // CHECK-NEXT: lvl = ( 8, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 7
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2
-    // CHECK-NEXT: crd[1] : ( 0, 7
-    // CHECK-NEXT: values : ( 96, 192
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 7 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2 )
+    // CHECK-NEXT: crd[1] : ( 0, 7 )
+    // CHECK-NEXT: values : ( 96, 192 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 2
     // CHECK-NEXT: dim = ( 8, 8 )
     // CHECK-NEXT: lvl = ( 8, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 7
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2
-    // CHECK-NEXT: crd[1] : ( 0, 7
-    // CHECK-NEXT: values : ( 96, 192
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 7 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2 )
+    // CHECK-NEXT: crd[1] : ( 0, 7 )
+    // CHECK-NEXT: values : ( 96, 192 )
     // CHECK-NEXT: ----
     //
     %v0 = vector.transfer_read %0[%c0, %c0], %d0
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_scale.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_scale.mlir
index 4e9090ae201d..c62cdc900b83 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_scale.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_scale.mlir
@@ -92,9 +92,9 @@ module {
     // CHECK-NEXT: nse = 16
     // CHECK-NEXT: dim = ( 8, 8 )
     // CHECK-NEXT: lvl = ( 8, 8 )
-    // CHECK-NEXT: pos[1] : ( 0, 3, 4, 5, 6, 8, 11, 14, 16
-    // CHECK-NEXT: crd[1] : ( 0, 2, 7, 1, 2, 3, 1, 4, 1, 2, 5, 2, 6, 7, 2, 7
-    // CHECK-NEXT: values : ( 2, 2, 2, 4, 6, 8, 2, 10, 2, 2, 12, 2, 14, 2, 2, 16
+    // CHECK-NEXT: pos[1] : ( 0, 3, 4, 5, 6, 8, 11, 14, 16 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 7, 1, 2, 3, 1, 4, 1, 2, 5, 2, 6, 7, 2, 7 )
+    // CHECK-NEXT: values : ( 2, 2, 2, 4, 6, 8, 2, 10, 2, 2, 12, 2, 14, 2, 2, 16 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %2 : tensor<8x8xf32, #CSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_scf_nested.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_scf_nested.mlir
index dd8396dc23b0..3f0cf70675ba 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_scf_nested.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_scf_nested.mlir
@@ -91,25 +91,25 @@ module @func_sparse.2 {
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 2, 3, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 1
-    // CHECK-NEXT: pos[1] : ( 0, 3, 6
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2
-    // CHECK-NEXT: pos[2] : ( 0, 4, 8, 12, 16, 20, 24
-    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 1 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2 )
+    // CHECK-NEXT: pos[2] : ( 0, 4, 8, 12, 16, 20, 24 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 2, 3, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 1
-    // CHECK-NEXT: pos[1] : ( 0, 3, 6
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2
-    // CHECK-NEXT: pos[2] : ( 0, 4, 8, 12, 16, 20, 24
-    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-    // CHECK-NEXT: values : ( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 1 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 2 )
+    // CHECK-NEXT: pos[2] : ( 0, 4, 8, 12, 16, 20, 24 )
+    // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 )
+    // CHECK-NEXT: values : ( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %sm_t : tensor<2x3x4xf64, #SparseMatrix>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_select.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_select.mlir
index 68bc17175e3b..bd61563b4b2d 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_select.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_select.mlir
@@ -124,33 +124,33 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 10 )
     // CHECK-NEXT: lvl = ( 10 )
-    // CHECK-NEXT: pos[0] : ( 0, 5
-    // CHECK-NEXT: crd[0] : ( 1, 3, 5, 7, 9
-    // CHECK-NEXT: values : ( 1, 2, -4, 0, 5
+    // CHECK-NEXT: pos[0] : ( 0, 5 )
+    // CHECK-NEXT: crd[0] : ( 1, 3, 5, 7, 9 )
+    // CHECK-NEXT: values : ( 1, 2, -4, 0, 5 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 7
     // CHECK-NEXT: dim = ( 5, 5 )
     // CHECK-NEXT: lvl = ( 5, 5 )
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 4, 6, 7
-    // CHECK-NEXT: crd[1] : ( 3, 4, 1, 3, 3, 4, 2
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 4, 6, 7 )
+    // CHECK-NEXT: crd[1] : ( 3, 4, 1, 3, 3, 4, 2 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 3
     // CHECK-NEXT: dim = ( 10 )
     // CHECK-NEXT: lvl = ( 10 )
-    // CHECK-NEXT: pos[0] : ( 0, 3
-    // CHECK-NEXT: crd[0] : ( 1, 3, 9
-    // CHECK-NEXT: values : ( 1, 2, 5
+    // CHECK-NEXT: pos[0] : ( 0, 3 )
+    // CHECK-NEXT: crd[0] : ( 1, 3, 9 )
+    // CHECK-NEXT: values : ( 1, 2, 5 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 4
     // CHECK-NEXT: dim = ( 5, 5 )
     // CHECK-NEXT: lvl = ( 5, 5 )
-    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 3, 4, 4
-    // CHECK-NEXT: crd[1] : ( 3, 4, 3, 4
-    // CHECK-NEXT: values : ( 1, 2, 4, 6
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2, 3, 4, 4 )
+    // CHECK-NEXT: crd[1] : ( 3, 4, 3, 4 )
+    // CHECK-NEXT: values : ( 1, 2, 4, 6 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %sv1 : tensor<?xf64, #SparseVector>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_semiring_select.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_semiring_select.mlir
index f4435c81117b..d96b07a0db33 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_semiring_select.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_semiring_select.mlir
@@ -91,11 +91,11 @@ module {
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 5, 5 )
     // CHECK-NEXT: lvl = ( 5, 5 )
-    // CHECK-NEXT: pos[0] : ( 0, 5
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6, 8, 9
-    // CHECK-NEXT: crd[1] : ( 0, 1, 1, 2, 2, 3, 3, 4, 4
-    // CHECK-NEXT: values : ( 0.1, 1.1, 1.1, 2.2, 2.1, 3.3, 3.1, 4.4, 4.1
+    // CHECK-NEXT: pos[0] : ( 0, 5 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 6, 8, 9 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 1, 2, 2, 3, 3, 4, 4 )
+    // CHECK-NEXT: values : ( 0.1, 1.1, 1.1, 2.2, 2.1, 3.3, 3.1, 4.4, 4.1 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %1 : tensor<5x5xf64, #DCSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sign.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sign.mlir
index c09374918b7d..11d23d681c82 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sign.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sign.mlir
@@ -114,9 +114,9 @@ module {
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 12
-    // CHECK-NEXT: crd[0] : ( 0, 3, 5, 11, 13, 17, 18, 20, 21, 28, 29, 31
-    // CHECK-NEXT: values : ( -1, 1, -1, 1, 1, -1, nan, -nan, 1, -1, -0, 0
+    // CHECK-NEXT: pos[0] : ( 0, 12 )
+    // CHECK-NEXT: crd[0] : ( 0, 3, 5, 11, 13, 17, 18, 20, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( -1, 1, -1, 1, 1, -1, nan, -nan, 1, -1, -0, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?xf64, #SparseVector>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sorted_coo.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sorted_coo.mlir
index 7b3f9a2ce0e0..3117d2539f17 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sorted_coo.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sorted_coo.mlir
@@ -107,10 +107,10 @@ module {
     // CHECK-NEXT: nse = 17
     // CHECK-NEXT: dim = ( 4, 256 )
     // CHECK-NEXT: lvl = ( 4, 256 )
-    // CHECK-NEXT: pos[0] : ( 0, 17
-    // CHECK-NEXT: crd[0] : ( 0, 0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
-    // CHECK-NEXT: crd[1] : ( 0, 126, 127, 254, 1, 253, 2, 0, 1, 3, 98, 126, 127, 128, 249, 253, 255
-    // CHECK-NEXT: values : ( -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15, 16, -17
+    // CHECK-NEXT: pos[0] : ( 0, 17 )
+    // CHECK-NEXT: crd[0] : ( 0, 0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 126, 127, 254, 1, 253, 2, 0, 1, 3, 98, 126, 127, 128, 249, 253, 255 )
+    // CHECK-NEXT: values : ( -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15, 16, -17 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?x?xf64, #SortedCOO>
@@ -120,10 +120,10 @@ module {
     // CHECK-NEXT: nse = 17
     // CHECK-NEXT: dim = ( 4, 256 )
     // CHECK-NEXT: lvl = ( 256, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 17
-    // CHECK-NEXT: crd[0] : ( 0, 0, 1, 1, 2, 3, 98, 126, 126, 127, 127, 128, 249, 253, 253, 254, 255
-    // CHECK-NEXT: crd[1] : ( 0, 3, 1, 3, 2, 3, 3, 0, 3, 0, 3, 3, 3, 1, 3, 0, 3
-    // CHECK-NEXT: values : ( -1, 8, -5, -9, -7, 10, -11, 2, 12, -3, -13, 14, -15, 6, 16, 4, -17
+    // CHECK-NEXT: pos[0] : ( 0, 17 )
+    // CHECK-NEXT: crd[0] : ( 0, 0, 1, 1, 2, 3, 98, 126, 126, 127, 127, 128, 249, 253, 253, 254, 255 )
+    // CHECK-NEXT: crd[1] : ( 0, 3, 1, 3, 2, 3, 3, 0, 3, 0, 3, 3, 3, 1, 3, 0, 3 )
+    // CHECK-NEXT: values : ( -1, 8, -5, -9, -7, 10, -11, 2, 12, -3, -13, 14, -15, 6, 16, 4, -17 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %1 : tensor<?x?xf64, #SortedCOOPermuted>
@@ -133,11 +133,11 @@ module {
     // CHECK-NEXT: nse = 17
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 2, 3, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 17
-    // CHECK-NEXT: crd[0] : ( 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1
-    // CHECK-NEXT: crd[1] : ( 0, 0, 1, 1, 2, 2, 2, 2, 0, 0, 0, 1, 1, 1, 1, 2, 2
-    // CHECK-NEXT: crd[2] : ( 2, 3, 1, 2, 0, 1, 2, 3, 0, 2, 3, 0, 1, 2, 3, 1, 2
-    // CHECK-NEXT: values : ( 3, 63, 11, 100, 66, 61, 13, 43, 77, 10, 46, 61, 53, 3, 75, 22, 18
+    // CHECK-NEXT: pos[0] : ( 0, 17 )
+    // CHECK-NEXT: crd[0] : ( 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1 )
+    // CHECK-NEXT: crd[1] : ( 0, 0, 1, 1, 2, 2, 2, 2, 0, 0, 0, 1, 1, 1, 1, 2, 2 )
+    // CHECK-NEXT: crd[2] : ( 2, 3, 1, 2, 0, 1, 2, 3, 0, 2, 3, 0, 1, 2, 3, 1, 2 )
+    // CHECK-NEXT: values : ( 3, 63, 11, 100, 66, 61, 13, 43, 77, 10, 46, 61, 53, 3, 75, 22, 18 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %2 : tensor<?x?x?xf64, #SortedCOO3D>
@@ -147,11 +147,11 @@ module {
     // CHECK-NEXT: nse = 17
     // CHECK-NEXT: dim = ( 2, 3, 4 )
     // CHECK-NEXT: lvl = ( 4, 2, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 17
-    // CHECK-NEXT: crd[0] : ( 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3
-    // CHECK-NEXT: crd[1] : ( 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1
-    // CHECK-NEXT: crd[2] : ( 2, 0, 1, 1, 2, 1, 2, 0, 1, 2, 0, 1, 2, 0, 2, 0, 1
-    // CHECK-NEXT: values : ( 66, 77, 61, 11, 61, 53, 22, 3, 100, 13, 10, 3, 18, 63, 43, 46, 75
+    // CHECK-NEXT: pos[0] : ( 0, 17 )
+    // CHECK-NEXT: crd[0] : ( 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1 )
+    // CHECK-NEXT: crd[2] : ( 2, 0, 1, 1, 2, 1, 2, 0, 1, 2, 0, 1, 2, 0, 2, 0, 1 )
+    // CHECK-NEXT: values : ( 66, 77, 61, 11, 61, 53, 22, 3, 100, 13, 10, 3, 18, 63, 43, 46, 75 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %3 : tensor<?x?x?xf64, #SortedCOO3DPermuted>
@@ -161,10 +161,10 @@ module {
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 5, 4 )
     // CHECK-NEXT: lvl = ( 5, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 6
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 2, 3, 4
-    // CHECK-NEXT: crd[1] : ( 0, 3, 0, 3, 1, 1
-    // CHECK-NEXT: values : ( 6, 5, 4, 3, 2, 11
+    // CHECK-NEXT: pos[0] : ( 0, 6 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 2, 3, 4 )
+    // CHECK-NEXT: crd[1] : ( 0, 3, 0, 3, 1, 1 )
+    // CHECK-NEXT: values : ( 6, 5, 4, 3, 2, 11 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %4 : tensor<?x?xf64, #SortedCOO>
@@ -178,10 +178,10 @@ module {
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 5, 4 )
     // CHECK-NEXT: lvl = ( 5, 4 )
-    // CHECK-NEXT: pos[0] : ( 0, 6
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 2, 3, 4
-    // CHECK-NEXT: crd[1] : ( 0, 3, 0, 3, 1, 1
-    // CHECK-NEXT: values : ( 12, 10, 8, 6, 4, 22
+    // CHECK-NEXT: pos[0] : ( 0, 6 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 2, 3, 4 )
+    // CHECK-NEXT: crd[1] : ( 0, 3, 0, 3, 1, 1 )
+    // CHECK-NEXT: values : ( 12, 10, 8, 6, 4, 22 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %5 : tensor<?x?xf64, #SortedCOO>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_storage.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_storage.mlir
index 2ee189de7906..da87b5cc3c6d 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_storage.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_storage.mlir
@@ -111,7 +111,7 @@ module {
     // CHECK-NEXT: nse = 80
     // CHECK-NEXT: dim = ( 10, 8 )
     // CHECK-NEXT: lvl = ( 10, 8 )
-    // CHECK-NEXT: values : ( 1, 0, 2, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 9, 0, 0, 10, 0, 0, 0, 11, 12, 0, 13, 14, 0, 0, 0, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0
+    // CHECK-NEXT: values : ( 1, 0, 2, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 9, 0, 0, 10, 0, 0, 0, 11, 12, 0, 13, 14, 0, 0, 0, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<10x8xf64, #Dense>
@@ -124,9 +124,9 @@ module {
     // CHECK-NEXT: nse = 17
     // CHECK-NEXT: dim = ( 10, 8 )
     // CHECK-NEXT: lvl = ( 10, 8 )
-    // CHECK-NEXT: pos[1] : ( 0, 3, 3, 4, 5, 6, 9, 12, 16, 16, 17
-    // CHECK-NEXT: crd[1] : ( 0, 2, 7, 2, 3, 4, 1, 2, 7, 2, 6, 7, 1, 2, 6, 7, 6
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+    // CHECK-NEXT: pos[1] : ( 0, 3, 3, 4, 5, 6, 9, 12, 16, 16, 17 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 7, 2, 3, 4, 1, 2, 7, 2, 6, 7, 1, 2, 6, 7, 6 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %1 : tensor<10x8xf64, #CSR>
@@ -138,11 +138,11 @@ module {
     // CHECK-NEXT: nse = 17
     // CHECK-NEXT: dim = ( 10, 8 )
     // CHECK-NEXT: lvl = ( 10, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 8
-    // CHECK-NEXT: crd[0] : ( 0, 2, 3, 4, 5, 6, 7, 9
-    // CHECK-NEXT: pos[1] : ( 0, 3, 4, 5, 6, 9, 12, 16, 17
-    // CHECK-NEXT: crd[1] : ( 0, 2, 7, 2, 3, 4, 1, 2, 7, 2, 6, 7, 1, 2, 6, 7, 6
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+    // CHECK-NEXT: pos[0] : ( 0, 8 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3, 4, 5, 6, 7, 9 )
+    // CHECK-NEXT: pos[1] : ( 0, 3, 4, 5, 6, 9, 12, 16, 17 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 7, 2, 3, 4, 1, 2, 7, 2, 6, 7, 1, 2, 6, 7, 6 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %2 : tensor<10x8xf64, #DCSR>
@@ -154,9 +154,9 @@ module {
     // CHECK-NEXT: nse = 17
     // CHECK-NEXT: dim = ( 10, 8 )
     // CHECK-NEXT: lvl = ( 8, 10 )
-    // CHECK-NEXT: pos[1] : ( 0, 1, 3, 8, 9, 10, 10, 13, 17
-    // CHECK-NEXT: crd[1] : ( 0, 5, 7, 0, 2, 5, 6, 7, 3, 4, 6, 7, 9, 0, 5, 6, 7
-    // CHECK-NEXT: values : ( 1, 7, 13, 2, 4, 8, 10, 14, 5, 6, 11, 15, 17, 3, 9, 12, 16
+    // CHECK-NEXT: pos[1] : ( 0, 1, 3, 8, 9, 10, 10, 13, 17 )
+    // CHECK-NEXT: crd[1] : ( 0, 5, 7, 0, 2, 5, 6, 7, 3, 4, 6, 7, 9, 0, 5, 6, 7 )
+    // CHECK-NEXT: values : ( 1, 7, 13, 2, 4, 8, 10, 14, 5, 6, 11, 15, 17, 3, 9, 12, 16 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %3 : tensor<10x8xf64, #CSC>
@@ -168,11 +168,11 @@ module {
     // CHECK-NEXT: nse = 17
     // CHECK-NEXT: dim = ( 10, 8 )
     // CHECK-NEXT: lvl = ( 8, 10 )
-    // CHECK-NEXT: pos[0] : ( 0, 7
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 6, 7
-    // CHECK-NEXT: pos[1] : ( 0, 1, 3, 8, 9, 10, 13, 17
-    // CHECK-NEXT: crd[1] : ( 0, 5, 7, 0, 2, 5, 6, 7, 3, 4, 6, 7, 9, 0, 5, 6, 7
-    // CHECK-NEXT: values : ( 1, 7, 13, 2, 4, 8, 10, 14, 5, 6, 11, 15, 17, 3, 9, 12, 16
+    // CHECK-NEXT: pos[0] : ( 0, 7 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 6, 7 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 3, 8, 9, 10, 13, 17 )
+    // CHECK-NEXT: crd[1] : ( 0, 5, 7, 0, 2, 5, 6, 7, 3, 4, 6, 7, 9, 0, 5, 6, 7 )
+    // CHECK-NEXT: values : ( 1, 7, 13, 2, 4, 8, 10, 14, 5, 6, 11, 15, 17, 3, 9, 12, 16 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %4 : tensor<10x8xf64, #DCSC>
@@ -184,9 +184,9 @@ module {
     // CHECK-NEXT: nse = 64
     // CHECK-NEXT: dim = ( 10, 8 )
     // CHECK-NEXT: lvl = ( 10, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 8
-    // CHECK-NEXT: crd[0] : ( 0, 2, 3, 4, 5, 6, 7, 9
-    // CHECK-NEXT: values : ( 1, 0, 2, 0, 0, 0, 0, 3, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 9, 0, 0, 10, 0, 0, 0, 11, 12, 0, 13, 14, 0, 0, 0, 15, 16, 0, 0, 0, 0, 0, 0, 17, 0
+    // CHECK-NEXT: pos[0] : ( 0, 8 )
+    // CHECK-NEXT: crd[0] : ( 0, 2, 3, 4, 5, 6, 7, 9 )
+    // CHECK-NEXT: values : ( 1, 0, 2, 0, 0, 0, 0, 3, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 9, 0, 0, 10, 0, 0, 0, 11, 12, 0, 13, 14, 0, 0, 0, 15, 16, 0, 0, 0, 0, 0, 0, 17, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %x : tensor<10x8xf64, #BlockRow>
@@ -198,9 +198,9 @@ module {
     // CHECK-NEXT: nse = 70
     // CHECK-NEXT: dim = ( 10, 8 )
     // CHECK-NEXT: lvl = ( 8, 10 )
-    // CHECK-NEXT: pos[0] : ( 0, 7
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 6, 7
-    // CHECK-NEXT: values : ( 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 13, 0, 0, 2, 0, 4, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 15, 0, 17, 3, 0, 0, 0, 0, 9, 12, 16, 0, 0
+    // CHECK-NEXT: pos[0] : ( 0, 7 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 6, 7 )
+    // CHECK-NEXT: values : ( 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 13, 0, 0, 2, 0, 4, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 15, 0, 17, 3, 0, 0, 0, 0, 9, 12, 16, 0, 0 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %y : tensor<10x8xf64, #BlockCol>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_tanh.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_tanh.mlir
index 29bc744c9920..748fffc1e637 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_tanh.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_tanh.mlir
@@ -77,9 +77,9 @@ module {
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 9
-    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31
-    // CHECK-NEXT: values : ({{ -0.761[0-9]*, 0.761[0-9]*, 0.96[0-9]*, 0.99[0-9]*, 0.99[0-9]*, 0.99[0-9]*, 0.99[0-9]*, 0.99[0-9]*, 1}}
+    // CHECK-NEXT: pos[0] : ( 0, 9 )
+    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ({{ -0.761[0-9]*, 0.761[0-9]*, 0.96[0-9]*, 0.99[0-9]*, 0.99[0-9]*, 0.99[0-9]*, 0.99[0-9]*, 0.99[0-9]*, 1}} )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?xf64, #SparseVector>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_tensor_mul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_tensor_mul.mlir
index 67155201c584..fe2f2690e860 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_tensor_mul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_tensor_mul.mlir
@@ -110,13 +110,13 @@ module {
     // CHECK-NEXT: nse = 4
     // CHECK-NEXT: dim = ( 3, 3, 5 )
     // CHECK-NEXT: lvl = ( 3, 3, 5 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 2
-    // CHECK-NEXT: pos[1] : ( 0, 1, 3
-    // CHECK-NEXT: crd[1] : ( 2, 0, 2
-    // CHECK-NEXT: pos[2] : ( 0, 2, 3, 4
-    // CHECK-NEXT: crd[2] : ( 0, 2, 0, 2
-    // CHECK-NEXT: values : ( 2.4, 3.5, 2, 8
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 1, 3 )
+    // CHECK-NEXT: crd[1] : ( 2, 0, 2 )
+    // CHECK-NEXT: pos[2] : ( 0, 2, 3, 4 )
+    // CHECK-NEXT: crd[2] : ( 0, 2, 0, 2 )
+    // CHECK-NEXT: values : ( 2.4, 3.5, 2, 8 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?x?x?xf64, #ST>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_tensor_ops.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_tensor_ops.mlir
index 356808ebee3f..a46c3a8d5ef6 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_tensor_ops.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_tensor_ops.mlir
@@ -97,23 +97,23 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 3, 4, 8 )
     // CHECK-NEXT: lvl = ( 3, 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 2
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3
-    // CHECK-NEXT: crd[1] : ( 0, 3, 2
-    // CHECK-NEXT: pos[2] : ( 0, 1, 2, 5
-    // CHECK-NEXT: crd[2] : ( 0, 7, 1, 2, 7
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 3, 2 )
+    // CHECK-NEXT: pos[2] : ( 0, 1, 2, 5 )
+    // CHECK-NEXT: crd[2] : ( 0, 7, 1, 2, 7 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 24
     // CHECK-NEXT: dim = ( 3, 4, 8 )
     // CHECK-NEXT: lvl = ( 3, 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 2
-    // CHECK-NEXT: crd[0] : ( 0, 2
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3
-    // CHECK-NEXT: crd[1] : ( 0, 3, 2
-    // CHECK-NEXT: values : ( 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 6, 8, 0, 0, 0, 0, 10
+    // CHECK-NEXT: pos[0] : ( 0, 2 )
+    // CHECK-NEXT: crd[0] : ( 0, 2 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 3, 2 )
+    // CHECK-NEXT: values : ( 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 6, 8, 0, 0, 0, 0, 10 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %st : tensor<?x?x?xf64, #ST1>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir
index 549c2082fcb3..434cc9509464 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir
@@ -119,21 +119,21 @@ module {
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 4, 3 )
     // CHECK-NEXT: lvl = ( 4, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 4, 6
-    // CHECK-NEXT: crd[1] : ( 0, 2, 0, 2, 0, 2
-    // CHECK-NEXT: values : ( 1.1, 3.1, 1.2, 3.3, 1.4, 3.4
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 4, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 0, 2, 0, 2 )
+    // CHECK-NEXT: values : ( 1.1, 3.1, 1.2, 3.3, 1.4, 3.4 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 6
     // CHECK-NEXT: dim = ( 4, 3 )
     // CHECK-NEXT: lvl = ( 4, 3 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 4, 6
-    // CHECK-NEXT: crd[1] : ( 0, 2, 0, 2, 0, 2
-    // CHECK-NEXT: values : ( 1.1, 3.1, 1.2, 3.3, 1.4, 3.4
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 4, 6 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 0, 2, 0, 2 )
+    // CHECK-NEXT: values : ( 1.1, 3.1, 1.2, 3.3, 1.4, 3.4 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<4x3xf64, #DCSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose_coo.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose_coo.mlir
index cc6f6a068746..3b7760e5052c 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose_coo.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose_coo.mlir
@@ -83,19 +83,19 @@ module {
     // CHECK-NEXT: nse = 50
     // CHECK-NEXT: dim = ( 10, 5 )
     // CHECK-NEXT: lvl = ( 10, 5 )
-    // CHECK-NEXT: pos[0] : ( 0, 50
-    // CHECK-NEXT: crd[0] : ( 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4
-    // CHECK-NEXT: values : ( 10, 20, 30, 40, 50, 11, 21, 31, 41, 51, 12, 22, 32, 42, 52, 13, 23, 33, 43, 53, 14, 24, 34, 44, 54, 15, 25, 35, 45, 55, 16, 26, 36, 46, 56, 17, 27, 37, 47, 57, 18, 28, 38, 48, 58, 19, 29, 39, 49, 59
+    // CHECK-NEXT: pos[0] : ( 0, 50 )
+    // CHECK-NEXT: crd[0] : ( 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4 )
+    // CHECK-NEXT: values : ( 10, 20, 30, 40, 50, 11, 21, 31, 41, 51, 12, 22, 32, 42, 52, 13, 23, 33, 43, 53, 14, 24, 34, 44, 54, 15, 25, 35, 45, 55, 16, 26, 36, 46, 56, 17, 27, 37, 47, 57, 18, 28, 38, 48, 58, 19, 29, 39, 49, 59 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 50
     // CHECK-NEXT: dim = ( 5, 10 )
     // CHECK-NEXT: lvl = ( 5, 10 )
-    // CHECK-NEXT: pos[0] : ( 0, 50
-    // CHECK-NEXT: crd[0] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
-    // CHECK-NEXT: values : ( 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59
+    // CHECK-NEXT: pos[0] : ( 0, 50 )
+    // CHECK-NEXT: crd[0] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 )
+    // CHECK-NEXT: values : ( 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %SA : tensor<10x5xf32, #SortedCOO>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_unary.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_unary.mlir
index 3da1e35818cf..acb7a99a3418 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_unary.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_unary.mlir
@@ -247,53 +247,53 @@ module {
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 9
-    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9
+    // CHECK-NEXT: pos[0] : ( 0, 9 )
+    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 23
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 23
-    // CHECK-NEXT: crd[0] : ( 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 22, 23, 24, 25, 26, 27, 30
-    // CHECK-NEXT: values : ( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+    // CHECK-NEXT: pos[0] : ( 0, 23 )
+    // CHECK-NEXT: crd[0] : ( 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 22, 23, 24, 25, 26, 27, 30 )
+    // CHECK-NEXT: values : ( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 32
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 32
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-    // CHECK-NEXT: values : ( -1, 1, 1, -2, 1, 1, 1, 1, 1, 1, 1, -3, 1, 1, 1, 1, 1, -4, 1, 1, -5, -6, 1, 1, 1, 1, 1, 1, -7, -8, 1, -9
+    // CHECK-NEXT: pos[0] : ( 0, 32 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 )
+    // CHECK-NEXT: values : ( -1, 1, 1, -2, 1, 1, 1, 1, 1, 1, 1, -3, 1, 1, 1, 1, 1, -4, 1, 1, -5, -6, 1, 1, 1, 1, 1, 1, -7, -8, 1, -9 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 9
-    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31
-    // CHECK-NEXT: values : ( 0, 6, 33, 68, 100, 126, 196, 232, 279
+    // CHECK-NEXT: pos[0] : ( 0, 9 )
+    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( 0, 6, 33, 68, 100, 126, 196, 232, 279 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9
-    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 2, 4, 7, 0, 2, 3
-    // CHECK-NEXT: values : ( 3, 3, 3, 4, 5, 6, 7, 7, 7
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 2, 4, 7, 0, 2, 3 )
+    // CHECK-NEXT: values : ( 3, 3, 3, 4, 5, 6, 7, 7, 7 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 4, 8 )
     // CHECK-NEXT: lvl = ( 4, 8 )
-    // CHECK-NEXT: pos[0] : ( 0, 4
-    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9
-    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 2, 4, 7, 0, 2, 3
-    // CHECK-NEXT: values : ( 99, 99, 99, 99, 5, 6, 99, 99, 99
+    // CHECK-NEXT: pos[0] : ( 0, 4 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3 )
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 9 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 7, 2, 4, 7, 0, 2, 3 )
+    // CHECK-NEXT: values : ( 99, 99, 99, 99, 5, 6, 99, 99, 99 )
     // CHECK-NEXT: ----
     // CHECK-NEXT: ( 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0 )
     //
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_vector_ops.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_vector_ops.mlir
index 553323331641..10ccf47c3408 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_vector_ops.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_vector_ops.mlir
@@ -209,55 +209,55 @@ module {
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 9
-    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31
-    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9
+    // CHECK-NEXT: pos[0] : ( 0, 9 )
+    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 9 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 10
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 10
-    // CHECK-NEXT: crd[0] : ( 1, 3, 4, 10, 16, 18, 21, 28, 29, 31
-    // CHECK-NEXT: values : ( 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
+    // CHECK-NEXT: pos[0] : ( 0, 10 )
+    // CHECK-NEXT: crd[0] : ( 1, 3, 4, 10, 16, 18, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 9
-    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31
-    // CHECK-NEXT: values : ( 2, 4, 6, 8, 10, 12, 14, 16, 18
+    // CHECK-NEXT: pos[0] : ( 0, 9 )
+    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( 2, 4, 6, 8, 10, 12, 14, 16, 18 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 9
-    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31
-    // CHECK-NEXT: values : ( 2, 4, 6, 8, 10, 12, 14, 16, 18
+    // CHECK-NEXT: pos[0] : ( 0, 9 )
+    // CHECK-NEXT: crd[0] : ( 0, 3, 11, 17, 20, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( 2, 4, 6, 8, 10, 12, 14, 16, 18 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 14
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 14
-    // CHECK-NEXT: crd[0] : ( 0, 1, 3, 4, 10, 11, 16, 17, 18, 20, 21, 28, 29, 31
-    // CHECK-NEXT: values : ( 2, 11, 16, 13, 14, 6, 15, 8, 16, 10, 29, 32, 35, 38
+    // CHECK-NEXT: pos[0] : ( 0, 14 )
+    // CHECK-NEXT: crd[0] : ( 0, 1, 3, 4, 10, 11, 16, 17, 18, 20, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( 2, 11, 16, 13, 14, 6, 15, 8, 16, 10, 29, 32, 35, 38 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: pos[0] : ( 0, 5
-    // CHECK-NEXT: crd[0] : ( 3, 21, 28, 29, 31
-    // CHECK-NEXT: values : ( 48, 204, 252, 304, 360
+    // CHECK-NEXT: pos[0] : ( 0, 5 )
+    // CHECK-NEXT: crd[0] : ( 3, 21, 28, 29, 31 )
+    // CHECK-NEXT: values : ( 48, 204, 252, 304, 360 )
     // CHECK-NEXT: ----
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 32
     // CHECK-NEXT: dim = ( 32 )
     // CHECK-NEXT: lvl = ( 32 )
-    // CHECK-NEXT: values : ( 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 204, 0, 0, 0, 0, 0, 0, 252, 304, 0, 360
+    // CHECK-NEXT: values : ( 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 204, 0, 0, 0, 0, 0, 0, 252, 304, 0, 360 )
     // CHECK-NEXT: ----
     // CHECK-NEXT: 1169.1
     //
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
index da78452d94fd..bd71409892f4 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
@@ -68,9 +68,9 @@ module {
     // CHECK-NEXT: nse = 20
     // CHECK-NEXT: dim = ( 8, 8 )
     // CHECK-NEXT: lvl = ( 8, 8 )
-    // CHECK-NEXT: pos[1] : ( 0, 5, 5, 6, 7, 8, 12, 16, 20,
-    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 6, 7, 2, 3, 4, 1, 2, 6, 7, 1, 2, 6, 7, 1, 2, 6, 7,
-    // CHECK-NEXT: values : ( 1, 39, 52, 45, 51, 16, 25, 36, 117, 158, 135, 144, 156, 318, 301, 324, 208, 430, 405, 436,
+    // CHECK-NEXT: pos[1] : ( 0, 5, 5, 6, 7, 8, 12, 16, 20 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 6, 7, 2, 3, 4, 1, 2, 6, 7, 1, 2, 6, 7, 1, 2, 6, 7 )
+    // CHECK-NEXT: values : ( 1, 39, 52, 45, 51, 16, 25, 36, 117, 158, 135, 144, 156, 318, 301, 324, 208, 430, 405, 436 )
     // CHECK-NEXT: ----
     sparse_tensor.print %Ccsr : tensor<8x8xf32, #CSR>
 
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
index 3d17b719732f..64f289626c07 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
@@ -117,9 +117,9 @@ module {
     // CHECK-NEXT: nse = 9
     // CHECK-NEXT: dim = ( 5, 5 )
     // CHECK-NEXT: lvl = ( 5, 5 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 5, 7, 9,
-    // CHECK-NEXT: crd[1] : ( 0, 3, 1, 4, 2, 0, 3, 1, 4,
-    // CHECK-NEXT: values : ( 11, 41.4, 42, 102.5, 93, 44.1, 164, 105.2, 255,
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 5, 7, 9 )
+    // CHECK-NEXT: crd[1] : ( 0, 3, 1, 4, 2, 0, 3, 1, 4 )
+    // CHECK-NEXT: values : ( 11, 41.4, 42, 102.5, 93, 44.1, 164, 105.2, 255 )
     // CHECK-NEXT: ----
     sparse_tensor.print %0 : tensor<?x?xf32, #CSR>
 
@@ -145,9 +145,9 @@ module {
     // CHECK-NEXT: nse = 5
     // CHECK-NEXT: dim = ( 8, 8 )
     // CHECK-NEXT: lvl = ( 8, 8 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 3, 4, 4, 4, 4, 5,
-    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 4, 7,
-    // CHECK-NEXT: values : ( 17, 18, 19, 20, 21,
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 3, 4, 4, 4, 4, 5 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 4, 7 )
+    // CHECK-NEXT: values : ( 17, 18, 19, 20, 21 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %1 : tensor<?x?xf32, #CSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
index 68bb32891f34..4b503ae0d110 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
@@ -170,18 +170,18 @@ module {
     // CHECK-NEXT: nse = 8
     // CHECK-NEXT: dim = ( 4, 6 )
     // CHECK-NEXT: lvl = ( 4, 6 )
-    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7, 8,
-    // CHECK-NEXT: crd[1] : ( 0, 1, 4, 1, 5, 2, 3, 2,
-    // CHECK-NEXT: values : ( 5, 10, 24, 19, 53, 42, 55, 56,
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 7, 8 )
+    // CHECK-NEXT: crd[1] : ( 0, 1, 4, 1, 5, 2, 3, 2 )
+    // CHECK-NEXT: values : ( 5, 10, 24, 19, 53, 42, 55, 56 )
     // CHECK-NEXT: ----
     //
     // CHECK:      ---- Sparse Tensor ----
     // CHECK-NEXT: nse = 12
     // CHECK-NEXT: dim = ( 4, 6 )
     // CHECK-NEXT: lvl = ( 2, 3, 2, 2 )
-    // CHECK-NEXT: pos[1] : ( 0, 2, 3,
-    // CHECK-NEXT: crd[1] : ( 0, 2, 1,
-    // CHECK-NEXT: values : ( 5, 10, 8, 19, 24, 24, 40, 53, 42, 55, 56, 64,
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3 )
+    // CHECK-NEXT: crd[1] : ( 0, 2, 1 )
+    // CHECK-NEXT: values : ( 5, 10, 8, 19, 24, 24, 40, 53, 42, 55, 56, 64 )
     // CHECK-NEXT: ----
     //
     sparse_tensor.print %0 : tensor<?x?xf32, #CSR>
diff --git a/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir
index ba56206f03d7..7d247aefcf6b 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir
@@ -14,7 +14,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %pad = transform.structured.match ops{["tensor.pad"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a, %b, %c = transform.structured.tile_using_for %pad [2, 3]
+    %a, %b, %c = transform.structured.tile_using_for %pad tile_sizes [2, 3]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
@@ -57,7 +57,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %pad = transform.structured.match ops{["tensor.pad"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a, %b = transform.structured.tile_using_for %pad [0, 3]
+    %a, %b = transform.structured.tile_using_for %pad tile_sizes [0, 3]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
@@ -97,7 +97,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %pad = transform.structured.match ops{["tensor.pad"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a, %b, %c = transform.structured.tile_using_for %pad [2, 3]
+    %a, %b, %c = transform.structured.tile_using_for %pad tile_sizes [2, 3]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
@@ -134,7 +134,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %pad = transform.structured.match ops{["tensor.pad"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a, %b = transform.structured.tile_using_for %pad [0, 3]
+    %a, %b = transform.structured.tile_using_for %pad tile_sizes [0, 3]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
@@ -170,7 +170,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %pad = transform.structured.match ops{["tensor.pad"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a, %b, %c = transform.structured.tile_using_for %pad [2, 3]
+    %a, %b, %c = transform.structured.tile_using_for %pad tile_sizes [2, 3]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
@@ -192,7 +192,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %pad = transform.structured.match ops{["tensor.pad"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a, %b = transform.structured.tile_using_for %pad [0, 3]
+    %a, %b = transform.structured.tile_using_for %pad tile_sizes [0, 3]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
diff --git a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir
index 607836faafb7..488a52e8e3e9 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir
@@ -11,7 +11,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a, %b, %c = transform.structured.tile_using_for %matmul [10, 20]
+    %a, %b, %c = transform.structured.tile_using_for %matmul tile_sizes [10, 20]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
@@ -63,7 +63,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a, %b, %c, %d = transform.structured.tile_using_for %matmul [10, 20, 30]
+    %a, %b, %c, %d = transform.structured.tile_using_for %matmul tile_sizes [10, 20, 30]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
@@ -122,7 +122,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %generic = transform.structured.match ops{["linalg.generic"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a, %b, %c = transform.structured.tile_using_for %generic [10, 0, 20]
+    %a, %b, %c = transform.structured.tile_using_for %generic tile_sizes [10, 0, 20]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
@@ -175,7 +175,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %conv = transform.structured.match ops{["linalg.conv_2d_nhwc_hwcf"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a, %b, %c, %d = transform.structured.tile_using_for %conv [0, 0, 0, 0, 10, 20, 30]
+    %a, %b, %c, %d = transform.structured.tile_using_for %conv tile_sizes [0, 0, 0, 0, 10, 20, 30]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
@@ -254,7 +254,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %generic = transform.structured.match ops{["linalg.generic"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a, %b, %c = transform.structured.tile_using_for %generic [10, 20]
+    %a, %b, %c = transform.structured.tile_using_for %generic tile_sizes [10, 20]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
@@ -282,7 +282,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a, %b, %c, %d = transform.structured.tile_using_for %matmul [10, 20, 30] interchange = [1, 2, 0]
+    %a, %b, %c, %d = transform.structured.tile_using_for %matmul tile_sizes [10, 20, 30] interchange = [1, 2, 0]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
@@ -338,7 +338,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %copy = transform.structured.match ops{["linalg.copy"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a, %b, %c = transform.structured.tile_using_for %copy [10, 20]
+    %a, %b, %c = transform.structured.tile_using_for %copy tile_sizes [10, 20]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
@@ -369,7 +369,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %generic = transform.structured.match ops{["linalg.generic"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a = transform.structured.tile_using_for %generic []
+    %a = transform.structured.tile_using_for %generic tile_sizes []
       : (!transform.any_op) -> (!transform.any_op)
     transform.yield
   }
@@ -396,7 +396,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
     %generic = transform.structured.match ops{["linalg.generic"]} in %arg1
       : (!transform.any_op) -> !transform.any_op
-    %a = transform.structured.tile_using_for %generic []
+    %a = transform.structured.tile_using_for %generic tile_sizes []
       : (!transform.any_op) -> (!transform.any_op)
     transform.yield
   }
diff --git a/mlir/test/Target/Cpp/expressions.mlir b/mlir/test/Target/Cpp/expressions.mlir
index 9ec9dcc3c6a8..2eda58902cb1 100644
--- a/mlir/test/Target/Cpp/expressions.mlir
+++ b/mlir/test/Target/Cpp/expressions.mlir
@@ -210,3 +210,18 @@ func.func @expression_with_address_taken(%arg0: i32, %arg1: i32, %arg2: !emitc.p
   }
   return %c : i1
 }
+
+// CPP-DEFAULT: int32_t expression_with_subscript_user(void* [[VAL_1:v.+]])
+// CPP-DEFAULT-NEXT:   int64_t [[VAL_2:v.+]] = 0;
+// CPP-DEFAULT-NEXT:   int32_t* [[VAL_3:v.+]] = (int32_t*) [[VAL_1]];
+// CPP-DEFAULT-NEXT:   return [[VAL_3]][[[VAL_2]]];
+
+func.func @expression_with_subscript_user(%arg0: !emitc.ptr<!emitc.opaque<"void">>) -> i32 {
+  %c0 = "emitc.constant"() {value = 0 : i64} : () -> i64
+  %0 = emitc.expression : !emitc.ptr<i32> {
+    %0 = emitc.cast %arg0 : !emitc.ptr<!emitc.opaque<"void">> to !emitc.ptr<i32>
+    emitc.yield %0 : !emitc.ptr<i32>
+  }
+  %1 = emitc.subscript %0[%c0] : (!emitc.ptr<i32>, i64) -> i32
+  return %1 : i32
+}
diff --git a/mlir/test/Target/LLVMIR/Import/metadata-linker-options.ll b/mlir/test/Target/LLVMIR/Import/metadata-linker-options.ll
index 8702415c2988..d936632031a5 100644
--- a/mlir/test/Target/LLVMIR/Import/metadata-linker-options.ll
+++ b/mlir/test/Target/LLVMIR/Import/metadata-linker-options.ll
@@ -1,15 +1,15 @@
-; RUN: mlir-translate -import-llvm -split-input-file %s | FileCheck %s
-
-; CHECK: llvm.linker_options ["DEFAULTLIB:", "libcmt"]
-!llvm.linker.options = !{!0}
-!0 = !{!"DEFAULTLIB:", !"libcmt"}
-
-; // -----
-
-!llvm.linker.options = !{!0, !1, !2}
-; CHECK: llvm.linker_options ["DEFAULTLIB:", "libcmt"]
-!0 = !{!"DEFAULTLIB:", !"libcmt"}
-; CHECK: llvm.linker_options ["DEFAULTLIB:", "libcmtd"]
-!1 = !{!"DEFAULTLIB:", !"libcmtd"}
-; CHECK: llvm.linker_options ["-lm"]
-!2 = !{!"-lm"}
+; RUN: mlir-translate -import-llvm -split-input-file %s | FileCheck %s
+
+; CHECK: llvm.linker_options ["DEFAULTLIB:", "libcmt"]
+!llvm.linker.options = !{!0}
+!0 = !{!"DEFAULTLIB:", !"libcmt"}
+
+; // -----
+
+!llvm.linker.options = !{!0, !1, !2}
+; CHECK: llvm.linker_options ["DEFAULTLIB:", "libcmt"]
+!0 = !{!"DEFAULTLIB:", !"libcmt"}
+; CHECK: llvm.linker_options ["DEFAULTLIB:", "libcmtd"]
+!1 = !{!"DEFAULTLIB:", !"libcmtd"}
+; CHECK: llvm.linker_options ["-lm"]
+!2 = !{!"-lm"}
diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
index 8ab1a1b290da..1f0fc969364a 100644
--- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
@@ -108,16 +108,19 @@ llvm.func @func_with_debug(%arg: i64) {
   // CHECK: call void @func_no_debug(), !dbg ![[NAMED_LOC:[0-9]+]]
   llvm.call @func_no_debug() : () -> () loc("named"("foo.mlir":10:10))
 
-  // CHECK: call void @func_no_debug(), !dbg ![[CALLSITE_LOC:[0-9]+]]
+  // CHECK: call void @func_no_debug(), !dbg ![[MY_SOURCE_LOC:[0-9]+]]
   llvm.call @func_no_debug() : () -> () loc(callsite("nodebug.cc":3:4 at "mysource.cc":5:6))
 
-  // CHECK: call void @func_no_debug(), !dbg ![[CALLSITE_LOC:[0-9]+]]
+  // CHECK: call void @func_no_debug(), !dbg ![[MY_SOURCE_LOC]]
   llvm.call @func_no_debug() : () -> () loc(callsite("nodebug.cc":3:4 at fused<#sp0>["mysource.cc":5:6]))
 
   // CHECK: call void @func_no_debug(), !dbg ![[FUSED_LOC:[0-9]+]]
   llvm.call @func_no_debug() : () -> () loc(fused[callsite(fused<#callee>["mysource.cc":5:6] at "mysource.cc":1:1), "mysource.cc":1:1])
 
-  // CHECK: add i64 %[[ARG]], %[[ARG]], !dbg ![[FUSEDWITH_LOC:[0-9]+]]
+  // CHECK: call void @func_no_debug(), !dbg ![[FUSEDWITH_LOC:[0-9]+]]
+  llvm.call @func_no_debug() : () -> () loc(callsite(callsite(fused<#callee>["foo.mlir":2:4] at "foo.mlir":1:1) at fused<#sp0>["foo.mlir":28:5]))
+
+  // CHECK: add i64 %[[ARG]], %[[ARG]], !dbg ![[FUSEDWITH_LOC]]
   %sum = llvm.add %arg, %arg : i64 loc(callsite(fused<#callee>["foo.mlir":2:4] at fused<#sp0>["foo.mlir":28:5]))
 
   llvm.return
@@ -153,7 +156,7 @@ llvm.func @empty_types() {
 // CHECK: ![[BLOCK_LOC]] = distinct !DILexicalBlock(scope: ![[FUNC_LOC]])
 // CHECK: ![[NO_NAME_VAR]] = !DILocalVariable(scope: ![[BLOCK_LOC]])
 
-// CHECK-DAG: ![[CALLSITE_LOC]] = !DILocation(line: 5, column: 6,
+// CHECK-DAG: ![[MY_SOURCE_LOC]] = !DILocation(line: 5, column: 6
 // CHECK-DAG: ![[FILE_LOC]] = !DILocation(line: 1, column: 2,
 // CHECK-DAG: ![[NAMED_LOC]] = !DILocation(line: 10, column: 10
 // CHECK-DAG: ![[FUSED_LOC]] = !DILocation(line: 1, column: 1
diff --git a/mlir/test/Target/LLVMIR/omptarget-fortran-allocatable-types-host.mlir b/mlir/test/Target/LLVMIR/omptarget-fortran-allocatable-types-host.mlir
index 7cb22dbb10b1..429bb379ee1b 100644
--- a/mlir/test/Target/LLVMIR/omptarget-fortran-allocatable-types-host.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-fortran-allocatable-types-host.mlir
@@ -2,10 +2,9 @@
 
 // This test checks the offload sizes, map types and base pointers and pointers
 // provided to the OpenMP kernel argument structure are correct when lowering 
-// to LLVM-IR from MLIR when the fortran allocatables flag is switched on and 
-// a fortran allocatable descriptor type is provided alongside the omp.map.info,
-// the test utilises mapping of array sections, full arrays and individual 
-// allocated scalars.
+// to LLVM-IR from MLIR when a fortran allocatable descriptor type is provided 
+// alongside the omp.map.info, the test utilises mapping of array sections, 
+// full arrays and individual allocated scalars.
 
 module attributes {omp.is_target_device = false} {
   llvm.func @_QQmain() {
@@ -27,7 +26,7 @@ module attributes {omp.is_target_device = false} {
     %15 = omp.map.bounds lower_bound(%7 : i64) upper_bound(%14 : i64) extent(%11 : i64) stride(%13 : i64) start_idx(%9 : i64) {stride_in_bytes = true}
     %16 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
     %17 = omp.map.info var_ptr(%3 : !llvm.ptr, f32) var_ptr_ptr(%16 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) bounds(%15) -> !llvm.ptr {name = "full_arr"}
-    %18 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(tofrom) capture(ByRef) members(%17 : !llvm.ptr) -> !llvm.ptr {name = "full_arr"}
+    %18 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(tofrom) capture(ByRef) members(%17 : [0] : !llvm.ptr) -> !llvm.ptr {name = "full_arr"}
     %19 = llvm.getelementptr %6[0, 7, %7, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
     %20 = llvm.load %19 : !llvm.ptr -> i64
     %21 = llvm.getelementptr %6[0, 7, %7, 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
@@ -39,10 +38,10 @@ module attributes {omp.is_target_device = false} {
     %27 = omp.map.bounds lower_bound(%25 : i64) upper_bound(%26 : i64) extent(%22 : i64) stride(%24 : i64) start_idx(%20 : i64) {stride_in_bytes = true}
     %28 = llvm.getelementptr %6[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
     %29 = omp.map.info var_ptr(%6 : !llvm.ptr, i32) var_ptr_ptr(%28 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) bounds(%27) -> !llvm.ptr {name = "sect_arr(2:5)"}
-    %30 = omp.map.info var_ptr(%6 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(tofrom) capture(ByRef) members(%29 : !llvm.ptr) -> !llvm.ptr {name = "sect_arr(2:5)"}
+    %30 = omp.map.info var_ptr(%6 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(tofrom) capture(ByRef) members(%29 : [0] : !llvm.ptr) -> !llvm.ptr {name = "sect_arr(2:5)"}
     %31 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
     %32 = omp.map.info var_ptr(%5 : !llvm.ptr, f32) var_ptr_ptr(%31 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "scalar"}
-    %33 = omp.map.info var_ptr(%5 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(tofrom) capture(ByRef) members(%32 : !llvm.ptr) -> !llvm.ptr {name = "scalar"}
+    %33 = omp.map.info var_ptr(%5 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(tofrom) capture(ByRef) members(%32 : [0] : !llvm.ptr) -> !llvm.ptr {name = "scalar"}
     omp.target map_entries(%17 -> %arg0, %18 -> %arg1, %29 -> %arg2, %30 -> %arg3, %32 -> %arg4, %33 -> %arg5 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) {
     ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: !llvm.ptr, %arg4: !llvm.ptr, %arg5: !llvm.ptr):
       omp.terminator
@@ -142,6 +141,6 @@ module attributes {omp.is_target_device = false} {
 // CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [9 x ptr], ptr %.offload_ptrs, i32 0, i32 7
 // CHECK: store ptr %[[SCALAR_ALLOCA]], ptr %[[OFFLOADPTRS]], align 8
 // CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [9 x ptr], ptr %.offload_baseptrs, i32 0, i32 8
-// CHECK: store ptr %[[SCALAR_BASE]], ptr %[[OFFLOADBASEPTRS]], align 8
+// CHECK: store ptr %[[SCALAR_ALLOCA]], ptr %[[OFFLOADBASEPTRS]], align 8
 // CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [9 x ptr], ptr %.offload_ptrs, i32 0, i32 8
 // CHECK: store ptr %[[SCALAR_PTR_LOAD]], ptr %[[OFFLOADPTRS]], align 8
diff --git a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir
index 2f629675442d..18189ea63981 100644
--- a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir
@@ -69,18 +69,18 @@ llvm.func @_QPopenmp_target_data_region(%0 : !llvm.ptr) {
 // CHECK:         %[[ARR_OFFSET:.*]] = getelementptr inbounds [1024 x i32], ptr %[[ARR_DATA:.*]], i64 0, i64 0
 // CHECK:         %[[VAL_5:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_0]], i32 0, i32 0
 // CHECK:         store ptr %[[ARR_DATA]], ptr %[[VAL_5]], align 8
-// CHECK:         %[[VAL_7:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_1]], i32 0, i32 0
-// CHECK:         store ptr %[[ARR_OFFSET]], ptr %[[VAL_7]], align 8
-// CHECK:         %[[VAL_8:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_2]], i64 0, i64 0
-// CHECK:         store ptr null, ptr %[[VAL_8]], align 8
-// CHECK:         %[[VAL_9:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_0]], i32 0, i32 0
-// CHECK:         %[[VAL_10:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_1]], i32 0, i32 0
-// CHECK:         call void @__tgt_target_data_begin_mapper(ptr @2, i64 -1, i32 1, ptr %[[VAL_9]], ptr %[[VAL_10]], ptr @.offload_sizes, ptr @.offload_maptypes, ptr @.offload_mapnames, ptr null)
-// CHECK:         %[[VAL_11:.*]] = getelementptr [1024 x i32], ptr %[[ARR_DATA]], i32 0, i64 0
-// CHECK:         store i32 99, ptr %[[VAL_11]], align 4
-// CHECK:         %[[VAL_12:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_0]], i32 0, i32 0
-// CHECK:         %[[VAL_13:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_1]], i32 0, i32 0
-// CHECK:         call void @__tgt_target_data_end_mapper(ptr @2, i64 -1, i32 1, ptr %[[VAL_12]], ptr %[[VAL_13]], ptr @.offload_sizes, ptr @.offload_maptypes, ptr @.offload_mapnames, ptr null)
+// CHECK:         %[[VAL_6:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_1]], i32 0, i32 0
+// CHECK:         store ptr %[[ARR_OFFSET]], ptr %[[VAL_6]], align 8
+// CHECK:         %[[VAL_7:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_2]], i64 0, i64 0
+// CHECK:         store ptr null, ptr %[[VAL_7]], align 8
+// CHECK:         %[[VAL_8:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_0]], i32 0, i32 0
+// CHECK:         %[[VAL_9:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_1]], i32 0, i32 0
+// CHECK:         call void @__tgt_target_data_begin_mapper(ptr @2, i64 -1, i32 1, ptr %[[VAL_8]], ptr %[[VAL_9]], ptr @.offload_sizes, ptr @.offload_maptypes, ptr @.offload_mapnames, ptr null)
+// CHECK:         %[[VAL_10:.*]] = getelementptr [1024 x i32], ptr %[[ARR_DATA]], i32 0, i64 0
+// CHECK:         store i32 99, ptr %[[VAL_10]], align 4
+// CHECK:         %[[VAL_11:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_0]], i32 0, i32 0
+// CHECK:         %[[VAL_12:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_1]], i32 0, i32 0
+// CHECK:         call void @__tgt_target_data_end_mapper(ptr @2, i64 -1, i32 1, ptr %[[VAL_11]], ptr %[[VAL_12]], ptr @.offload_sizes, ptr @.offload_maptypes, ptr @.offload_mapnames, ptr null)
 // CHECK:         ret void
 
 // -----
@@ -157,13 +157,13 @@ llvm.func @_QPomp_target_enter_exit(%1 : !llvm.ptr, %3 : !llvm.ptr) {
 // CHECK:         %[[ARR_OFFSET1:.*]] = getelementptr inbounds [1024 x i32], ptr %[[VAL_16:.*]], i64 0, i64 0
 // CHECK:         %[[ARR_OFFSET2:.*]] = getelementptr inbounds [512 x i32], ptr %[[VAL_20:.*]], i64 0, i64 0
 // CHECK:         %[[VAL_15:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_3]], i32 0, i32 0
-// CHECK:         store ptr %[[VAL_16:.*]], ptr %[[VAL_15]], align 8
+// CHECK:         store ptr %[[VAL_16]], ptr %[[VAL_15]], align 8
 // CHECK:         %[[VAL_17:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_4]], i32 0, i32 0
 // CHECK:         store ptr %[[ARR_OFFSET1]], ptr %[[VAL_17]], align 8
 // CHECK:         %[[VAL_18:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_5]], i64 0, i64 0
 // CHECK:         store ptr null, ptr %[[VAL_18]], align 8
 // CHECK:         %[[VAL_19:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_3]], i32 0, i32 1
-// CHECK:         store ptr %[[VAL_20:.*]], ptr %[[VAL_19]], align 8
+// CHECK:         store ptr %[[VAL_20]], ptr %[[VAL_19]], align 8
 // CHECK:         %[[VAL_21:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_4]], i32 0, i32 1
 // CHECK:         store ptr %[[ARR_OFFSET2]], ptr %[[VAL_21]], align 8
 // CHECK:         %[[VAL_22:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_5]], i64 0, i64 1
diff --git a/mlir/test/Target/LLVMIR/omptarget-nested-record-type-mapping-host.mlir b/mlir/test/Target/LLVMIR/omptarget-nested-record-type-mapping-host.mlir
new file mode 100644
index 000000000000..e4d82d4a58c8
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-nested-record-type-mapping-host.mlir
@@ -0,0 +1,69 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// This test checks the offload sizes, map types and base pointers and pointers
+// provided to the OpenMP kernel argument structure are correct when lowering
+// to LLVM-IR from MLIR when performing explicit member mapping of a record type
+// that includes another nested record type (C++/C class/structure, Fortran
+// derived type) where members of both the nested and outer record type have
+// members mapped.
+
+module attributes {omp.is_target_device = false} {
+llvm.func @_QQmain() {
+    %0 = llvm.mlir.constant(10 : index) : i64
+    %1 = llvm.mlir.constant(4 : index) : i64
+    %2 = llvm.mlir.constant(1 : index) : i64
+    %3 = llvm.mlir.constant(1 : i64) : i64
+    %4 = llvm.alloca %3 x !llvm.struct<(f32, array<10 x i32>, struct<(f32, i32)>, i32)> : (i64) -> !llvm.ptr
+    %5 = llvm.getelementptr %4[0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(f32, array<10 x i32>, struct<(f32, i32)>, i32)>
+    %6 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr
+    %7 = llvm.getelementptr %4[0, 2, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(f32, array<10 x i32>, struct<(f32, i32)>, i32)>
+    %8 = omp.map.info var_ptr(%7 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr
+    %9 = llvm.getelementptr %4[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(f32, array<10 x i32>, struct<(f32, i32)>, i32)>
+    %10 = omp.map.bounds lower_bound(%2 : i64) upper_bound(%1 : i64) extent(%0 : i64) stride(%2 : i64) start_idx(%2 : i64)
+    %11 = omp.map.info var_ptr(%9 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(tofrom) capture(ByRef) bounds(%10) -> !llvm.ptr
+    %12 = omp.map.info var_ptr(%4 : !llvm.ptr, !llvm.struct<(f32, array<10 x i32>, struct<(f32, i32)>, i32)>) map_clauses(tofrom) capture(ByRef) members(%6, %8, %11 : [3, -1], [2, 1], [1, -1] : !llvm.ptr, !llvm.ptr, !llvm.ptr) -> !llvm.ptr {partial_map = true}
+    omp.target map_entries(%6 -> %arg0, %8 -> %arg1, %11 -> %arg2, %12 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+    ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: !llvm.ptr):
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// CHECK: @.offload_sizes = private unnamed_addr constant [4 x i64] [i64 0, i64 4, i64 4, i64 16]
+// CHECK: @.offload_maptypes = private unnamed_addr constant [4 x i64] [i64 32, i64 281474976710659, i64 281474976710659, i64 281474976710659]
+
+// CHECK: define void @_QQmain()
+// CHECK: %[[ALLOCA:.*]] = alloca { float, [10 x i32], { float, i32 }, i32 }, i64 1, align 8
+// CHECK: %[[MEMBER_ACCESS_1:.*]] = getelementptr { float, [10 x i32], { float, i32 }, i32 }, ptr %[[ALLOCA]], i32 0, i32 3 
+// CHECK: %[[MEMBER_ACCESS_2:.*]] = getelementptr { float, [10 x i32], { float, i32 }, i32 }, ptr %[[ALLOCA]], i32 0, i32 2, i32 1
+// CHECK: %[[MEMBER_ACCESS_3:.*]] = getelementptr { float, [10 x i32], { float, i32 }, i32 }, ptr %[[ALLOCA]], i32 0, i32 1
+
+// CHECK: %[[LAST_MEMBER:.*]] = getelementptr inbounds [10 x i32], ptr %[[MEMBER_ACCESS_3]], i64 0, i64 1
+// CHECK: %[[FIRST_MEMBER:.*]] = getelementptr i32, ptr %[[MEMBER_ACCESS_1]], i64 1
+// CHECK: %[[FIRST_MEMBER_OFF:.*]] = ptrtoint ptr %[[FIRST_MEMBER]] to i64
+// CHECK: %[[SECOND_MEMBER_OFF:.*]] = ptrtoint ptr %[[LAST_MEMBER]] to i64
+// CHECK: %[[MEMBER_DIFF:.*]] = sub i64 %[[FIRST_MEMBER_OFF]], %[[SECOND_MEMBER_OFF]]
+// CHECK: %[[OFFLOAD_SIZE:.*]] = sdiv exact i64 %[[MEMBER_DIFF]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+
+// CHECK: %[[BASE_PTR_ARR:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+// CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR]], align 8
+// CHECK: %[[PTR_ARR:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+// CHECK: store ptr %[[LAST_MEMBER]], ptr %[[PTR_ARR]], align 8
+// CHECK: %[[SIZE_ARR:.*]] = getelementptr inbounds [4 x i64], ptr %.offload_sizes, i32 0, i32 0
+// CHECK: store i64 %[[OFFLOAD_SIZE]], ptr %[[SIZE_ARR]], align 8
+
+// CHECK: %[[BASE_PTR_ARR_2:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_baseptrs, i32 0, i32 1
+// CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR_2]], align 8
+// CHECK: %[[PTR_ARR_2:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_ptrs, i32 0, i32 1
+// CHECK: store ptr %[[MEMBER_ACCESS_1]], ptr %[[PTR_ARR_2]], align 8
+
+// CHECK: %[[BASE_PTR_ARR_3:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_baseptrs, i32 0, i32 2
+// CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR_3]], align 8
+// CHECK: %[[PTR_ARR_3:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_ptrs, i32 0, i32 2
+// CHECK: store ptr %[[MEMBER_ACCESS_2]], ptr %[[PTR_ARR_3]], align 8
+
+// CHECK: %[[BASE_PTR_ARR_4:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_baseptrs, i32 0, i32 3
+// CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR_4]], align 8
+// CHECK: %[[PTR_ARR_4:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_ptrs, i32 0, i32 3
+// CHECK: store ptr %[[LAST_MEMBER]], ptr %[[PTR_ARR_4]], align 8
diff --git a/mlir/test/Target/LLVMIR/omptarget-record-type-mapping-host.mlir b/mlir/test/Target/LLVMIR/omptarget-record-type-mapping-host.mlir
new file mode 100644
index 000000000000..c7a87e44d653
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-record-type-mapping-host.mlir
@@ -0,0 +1,62 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// This test checks the offload sizes, map types and base pointers and pointers
+// provided to the OpenMP kernel argument structure are correct when lowering
+// to LLVM-IR from MLIR when performing explicit member mapping of a record type
+// (C++/C class/structure, Fortran derived type) where only members of the record
+// type are mapped.
+
+module attributes {omp.is_target_device = false} {
+llvm.func @_QQmain() {
+    %0 = llvm.mlir.constant(10 : index) : i64
+    %1 = llvm.mlir.constant(4 : index) : i64
+    %2 = llvm.mlir.constant(1 : index) : i64
+    %3 = llvm.mlir.constant(1 : i64) : i64
+    %4 = llvm.alloca %3 x !llvm.struct<(f32, array<10 x i32>, i32)> : (i64) -> !llvm.ptr
+    %5 = llvm.mlir.constant(2 : i32) : i32
+    %6 = llvm.getelementptr %4[0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(f32, array<10 x i32>, i32)>
+    %7 = omp.map.info var_ptr(%6 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr
+    %8 = llvm.mlir.constant(1 : i32) : i32
+    %9 = llvm.getelementptr %4[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(f32, array<10 x i32>, i32)>
+    %10 = omp.map.bounds lower_bound(%2 : i64) upper_bound(%1 : i64) extent(%0 : i64) stride(%2 : i64) start_idx(%2 : i64)
+    %11 = omp.map.info var_ptr(%9 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(tofrom) capture(ByRef) bounds(%10) -> !llvm.ptr
+    %12 = omp.map.info var_ptr(%4 : !llvm.ptr, !llvm.struct<(f32, array<10 x i32>, i32)>) map_clauses(tofrom) capture(ByRef) members(%7, %11 : [2], [1] : !llvm.ptr, !llvm.ptr) -> !llvm.ptr {partial_map = true}
+    omp.target map_entries(%7 -> %arg0, %11 -> %arg1, %12 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+    ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr):
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// CHECK: @.offload_sizes = private unnamed_addr constant [3 x i64] [i64 0, i64 4, i64 16]
+// CHECK: @.offload_maptypes = private unnamed_addr constant [3 x i64] [i64 32, i64 281474976710659, i64 281474976710659]
+
+// CHECK: define void @_QQmain()
+// CHECK: %[[ALLOCA:.*]] = alloca { float, [10 x i32], i32 }, i64 1, align 8
+// CHECK: %[[MEMBER_ACCESS_1:.*]] = getelementptr { float, [10 x i32], i32 }, ptr %[[ALLOCA]], i32 0, i32 2
+// CHECK: %[[MEMBER_ACCESS_2:.*]] = getelementptr { float, [10 x i32], i32 }, ptr %[[ALLOCA]], i32 0, i32 1
+
+// CHECK: %[[LAST_MEMBER:.*]] = getelementptr inbounds [10 x i32], ptr %[[MEMBER_ACCESS_2]], i64 0, i64 1
+// CHECK: %[[FIRST_MEMBER:.*]] = getelementptr i32, ptr %[[MEMBER_ACCESS_1]], i64 1
+// CHECK: %[[FIRST_MEMBER_OFF:.*]] = ptrtoint ptr %[[FIRST_MEMBER]] to i64
+// CHECK: %[[SECOND_MEMBER_OFF:.*]] = ptrtoint ptr %[[LAST_MEMBER]] to i64
+// CHECK: %[[MEMBER_DIFF:.*]] = sub i64 %[[FIRST_MEMBER_OFF]], %[[SECOND_MEMBER_OFF]]
+// CHECK: %[[OFFLOAD_SIZE:.*]] = sdiv exact i64 %[[MEMBER_DIFF]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+
+// CHECK: %[[BASE_PTR_ARR:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+// CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR]], align 8
+// CHECK: %[[PTR_ARR:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+// CHECK: store ptr %[[LAST_MEMBER]], ptr %[[PTR_ARR]], align 8
+// CHECK: %[[SIZE_ARR:.*]] = getelementptr inbounds [3 x i64], ptr %.offload_sizes, i32 0, i32 0
+// CHECK: store i64 %[[OFFLOAD_SIZE]], ptr %[[SIZE_ARR]], align 8
+
+// CHECK: %[[BASE_PTR_ARR_2:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_baseptrs, i32 0, i32 1
+// CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR_2]], align 8
+// CHECK: %[[PTR_ARR_2:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_ptrs, i32 0, i32 1
+// CHECK: store ptr %[[MEMBER_ACCESS_1]], ptr %[[PTR_ARR_2]], align 8
+
+// CHECK: %[[BASE_PTR_ARR_3:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_baseptrs, i32 0, i32 2
+// CHECK: store ptr %[[ALLOCA]], ptr %[[BASE_PTR_ARR_3]], align 8
+// CHECK: %[[PTR_ARR_3:.*]] = getelementptr inbounds [3 x ptr], ptr %.offload_ptrs, i32 0, i32 2
+// CHECK: store ptr %[[LAST_MEMBER]], ptr %[[PTR_ARR_3]], align 8
diff --git a/mlir/test/Target/LLVMIR/omptask_if_false.mlir b/mlir/test/Target/LLVMIR/omptask_if_false.mlir
new file mode 100644
index 000000000000..c6014a76add6
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptask_if_false.mlir
@@ -0,0 +1,17 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+llvm.func @foo_(%arg0: !llvm.ptr {fir.bindc_name = "n"}, %arg1: !llvm.ptr {fir.bindc_name = "r"}) attributes {fir.internal_name = "_QPfoo"} {
+  %0 = llvm.mlir.constant(false) : i1
+  omp.task if(%0) depend(taskdependin -> %arg0 : !llvm.ptr) {
+    %1 = llvm.load %arg0 : !llvm.ptr -> i32
+    llvm.store %1, %arg1 : i32, !llvm.ptr
+    omp.terminator
+  }
+  llvm.return
+}
+
+// CHECK: call void @__kmpc_omp_wait_deps
+// CHECK-NEXT: call void @__kmpc_omp_task_begin_if0
+// CHECK-NEXT: call void @foo_..omp_par
+// CHECK-NEXT: call void @__kmpc_omp_task_complete_if0
+
diff --git a/mlir/test/Target/LLVMIR/openmp-omp.private-dealloc.mlir b/mlir/test/Target/LLVMIR/openmp-omp.private-dealloc.mlir
new file mode 100644
index 000000000000..835caccb262c
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-omp.private-dealloc.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+llvm.func @free(!llvm.ptr)
+
+llvm.func @parallel_op_dealloc(%arg0: !llvm.ptr) {
+  omp.parallel private(@x.privatizer %arg0 -> %arg2 : !llvm.ptr) {
+    %0 = llvm.load %arg2 : !llvm.ptr -> f32
+    omp.terminator
+  }
+  llvm.return
+}
+
+omp.private {type = firstprivate} @x.privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x f32 : (i32) -> !llvm.ptr
+  omp.yield(%0 : !llvm.ptr)
+} copy {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+  %0 = llvm.load %arg0 : !llvm.ptr -> f32
+  llvm.store %0, %arg1 : f32, !llvm.ptr
+  omp.yield(%arg1 : !llvm.ptr)
+} dealloc {
+^bb0(%arg0: !llvm.ptr):
+  %0 = llvm.ptrtoint %arg0 : !llvm.ptr to i64
+  %c0 = llvm.mlir.constant(0 : i64) : i64
+  %1 = llvm.icmp "ne" %0, %c0 : i64
+  llvm.cond_br %1, ^bb1, ^bb2
+
+^bb1:
+  llvm.call @free(%arg0) : (!llvm.ptr) -> ()
+  llvm.br ^bb2
+
+^bb2:
+  omp.yield
+}
+
+// CHECK-LABEL: define internal void @parallel_op_dealloc..omp_par
+// CHECK:         %[[LOCAL_ALLOC:.*]] = alloca float, align 4
+
+// CHECK:      omp.par.pre_finalize:
+// CHECK:        br label %[[DEALLOC_REG_START:.*]]
+
+// CHECK:      [[DEALLOC_REG_START]]:
+// CHECK:        %[[LOCAL_ALLOC_CONV:.*]] = ptrtoint ptr %[[LOCAL_ALLOC]] to i64
+// CHECK:        %[[COND:.*]] = icmp ne i64 %[[LOCAL_ALLOC_CONV]], 0
+// CHECK:        br i1 %[[COND]], label %[[DEALLOC_REG_BB1:.*]], label %[[DEALLOC_REG_BB2:.*]]
+
+// CHECK:      [[DEALLOC_REG_BB2]]:
+
+// CHECK:      [[DEALLOC_REG_BB1]]:
+// CHECK-NEXT:   call void @free(ptr %[[LOCAL_ALLOC]])
+// CHECK-NEXT:   br label %[[DEALLOC_REG_BB2]]
diff --git a/mlir/test/Transforms/buffer-results-to-out-params-elim.mlir b/mlir/test/Transforms/buffer-results-to-out-params-elim.mlir
new file mode 100644
index 000000000000..f77dbfaa6cb1
--- /dev/null
+++ b/mlir/test/Transforms/buffer-results-to-out-params-elim.mlir
@@ -0,0 +1,37 @@
+// RUN: mlir-opt -allow-unregistered-dialect -p 'builtin.module(buffer-results-to-out-params{hoist-static-allocs})'  %s | FileCheck %s
+
+// CHECK-LABEL:   func @basic(
+// CHECK-SAME:                %[[ARG:.*]]: memref<8x64xf32>) {
+// CHECK-NOT:        memref.alloc()
+// CHECK:           "test.source"(%[[ARG]])  : (memref<8x64xf32>) -> ()
+// CHECK:           return
+// CHECK:         }
+func.func @basic() -> (memref<8x64xf32>) {
+  %b = memref.alloc() : memref<8x64xf32>
+  "test.source"(%b)  : (memref<8x64xf32>) -> ()
+  return %b : memref<8x64xf32>
+}
+
+// CHECK-LABEL:   func @basic_no_change(
+// CHECK-SAME:                %[[ARG:.*]]: memref<f32>) {
+// CHECK:           %[[RESULT:.*]] = "test.source"() : () -> memref<f32>
+// CHECK:           memref.copy %[[RESULT]], %[[ARG]]  : memref<f32> to memref<f32>
+// CHECK:           return
+// CHECK:         }
+func.func @basic_no_change() -> (memref<f32>) {
+  %0 = "test.source"() : () -> (memref<f32>)
+  return %0 : memref<f32>
+}
+
+// CHECK-LABEL:   func @basic_dynamic(
+// CHECK-SAME:                %[[D:.*]]: index, %[[ARG:.*]]: memref<?xf32>) {
+// CHECK:           %[[RESULT:.*]] = memref.alloc(%[[D]]) : memref<?xf32>
+// CHECK:           "test.source"(%[[RESULT]])  : (memref<?xf32>) -> ()
+// CHECK:           memref.copy %[[RESULT]], %[[ARG]]
+// CHECK:           return
+// CHECK:         }
+func.func @basic_dynamic(%d: index) -> (memref<?xf32>) {
+  %b = memref.alloc(%d) : memref<?xf32>
+  "test.source"(%b)  : (memref<?xf32>) -> ()
+  return %b : memref<?xf32>
+}
+\ No newline at end of file
diff --git a/mlir/test/Transforms/constant-fold.mlir b/mlir/test/Transforms/constant-fold.mlir
index 253163f2af91..981757aed9b1 100644
--- a/mlir/test/Transforms/constant-fold.mlir
+++ b/mlir/test/Transforms/constant-fold.mlir
@@ -478,6 +478,44 @@ func.func @simple_arith.ceildivsi() -> (i32, i32, i32, i32, i32) {
 
 // -----
 
+// CHECK-LABEL: func @simple_arith.ceildivsi_overflow
+func.func @simple_arith.ceildivsi_overflow() -> (i8, i16, i32) {
+  // The negative values below are MININTs for the corresponding bit-width. The
+  // folder will try to negate them (so that the division operates on two
+  // positive numbers), but that would cause overflow (negating MININT
+  // overflows). Hence folding should not happen and the original ceildivsi is
+  // preserved.
+
+  // TODO: The folder should be able to fold the following by avoiding
+  // intermediate operations that overflow.
+
+  // CHECK-DAG: %[[C_1:.*]] = arith.constant 7 : i8
+  // CHECK-DAG: %[[MIN_I8:.*]] = arith.constant -128 : i8
+  // CHECK-DAG: %[[C_2:.*]] = arith.constant 7 : i16
+  // CHECK-DAG: %[[MIN_I16:.*]] = arith.constant -32768 : i16
+  // CHECK-DAG: %[[C_3:.*]] = arith.constant 7 : i32
+  // CHECK-DAG: %[[MIN_I32:.*]] = arith.constant -2147483648 : i32
+
+  // CHECK-NEXT: %[[CEILDIV_1:.*]] = arith.ceildivsi %[[MIN_I8]], %[[C_1]]  : i8
+  %0 = arith.constant 7 : i8
+  %min_int_i8 = arith.constant -128 : i8
+  %2 = arith.ceildivsi %min_int_i8, %0 : i8
+
+  // CHECK-NEXT: %[[CEILDIV_2:.*]] = arith.ceildivsi %[[MIN_I16]], %[[C_2]]  : i16
+  %3 = arith.constant 7 : i16
+  %min_int_i16 = arith.constant -32768 : i16
+  %5 = arith.ceildivsi %min_int_i16, %3 : i16
+
+  // CHECK-NEXT: %[[CEILDIV_2:.*]] = arith.ceildivsi %[[MIN_I32]], %[[C_3]]  : i32
+  %6 = arith.constant 7 : i32
+  %min_int_i32 = arith.constant -2147483648 : i32
+  %8 = arith.ceildivsi %min_int_i32, %6 : i32
+
+  return %2, %5, %8 : i8, i16, i32
+}
+
+// -----
+
 // CHECK-LABEL: func @simple_arith.ceildivui
 func.func @simple_arith.ceildivui() -> (i32, i32, i32, i32, i32) {
   // CHECK-DAG: [[C0:%.+]] = arith.constant 0
diff --git a/mlir/test/Transforms/mem2reg.mlir b/mlir/test/Transforms/mem2reg.mlir
new file mode 100644
index 000000000000..daeaa2da0763
--- /dev/null
+++ b/mlir/test/Transforms/mem2reg.mlir
@@ -0,0 +1,28 @@
+// RUN: mlir-opt %s --pass-pipeline='builtin.module(func.func(mem2reg))' --split-input-file | FileCheck %s
+
+// Verifies that allocators with mutliple slots are handled properly.
+
+// CHECK-LABEL: func.func @multi_slot_alloca
+func.func @multi_slot_alloca() -> (i32, i32) {
+  // CHECK-NOT: test.multi_slot_alloca
+  %1, %2 = test.multi_slot_alloca : () -> (memref<i32>, memref<i32>)
+  %3 = memref.load %1[] : memref<i32>
+  %4 = memref.load %2[] : memref<i32>
+  return %3, %4 : i32, i32
+}
+
+// -----
+
+// Verifies that a multi slot allocator can be partially promoted.
+
+func.func private @consumer(memref<i32>)
+
+// CHECK-LABEL: func.func @multi_slot_alloca_only_second
+func.func @multi_slot_alloca_only_second() -> (i32, i32) {
+  // CHECK: %{{[[:alnum:]]+}} = test.multi_slot_alloca
+  %1, %2 = test.multi_slot_alloca : () -> (memref<i32>, memref<i32>)
+  func.call @consumer(%1) : (memref<i32>) -> ()
+  %3 = memref.load %1[] : memref<i32>
+  %4 = memref.load %2[] : memref<i32>
+  return %3, %4 : i32, i32
+}
diff --git a/mlir/test/lib/Dialect/Math/TestExpandMath.cpp b/mlir/test/lib/Dialect/Math/TestExpandMath.cpp
index 97600ad1ebe7..69af2a08b97b 100644
--- a/mlir/test/lib/Dialect/Math/TestExpandMath.cpp
+++ b/mlir/test/lib/Dialect/Math/TestExpandMath.cpp
@@ -42,6 +42,9 @@ void TestExpandMathPass::runOnOperation() {
   populateExpandSinhPattern(patterns);
   populateExpandCoshPattern(patterns);
   populateExpandTanhPattern(patterns);
+  populateExpandAsinhPattern(patterns);
+  populateExpandAcoshPattern(patterns);
+  populateExpandAtanhPattern(patterns);
   populateExpandFmaFPattern(patterns);
   populateExpandFloorFPattern(patterns);
   populateExpandCeilFPattern(patterns);
@@ -49,6 +52,7 @@ void TestExpandMathPass::runOnOperation() {
   populateExpandFPowIPattern(patterns);
   populateExpandRoundFPattern(patterns);
   populateExpandRoundEvenPattern(patterns);
+  populateExpandRsqrtPattern(patterns);
   (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
 }
 
diff --git a/mlir/test/lib/Dialect/NVGPU/TestNVGPUTransforms.cpp b/mlir/test/lib/Dialect/NVGPU/TestNVGPUTransforms.cpp
index 74a15ba273d8..8ca29257b812 100644
--- a/mlir/test/lib/Dialect/NVGPU/TestNVGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/NVGPU/TestNVGPUTransforms.cpp
@@ -68,9 +68,9 @@ struct TestMmaSyncF32ToTF32Patterns
 
 namespace mlir {
 namespace test {
-void registerTestNvgpuLowerings() {
+void registerTestNVGPULowerings() {
   PassRegistration<TestMmaSyncF32ToTF32Patterns>();
 }
 
 } // namespace test
-} // namespace mlir
-\ No newline at end of file
+} // namespace mlir
diff --git a/mlir/test/lib/Dialect/Test/TestAttrDefs.td b/mlir/test/lib/Dialect/Test/TestAttrDefs.td
index 40f035a3e3a4..12635e107bd4 100644
--- a/mlir/test/lib/Dialect/Test/TestAttrDefs.td
+++ b/mlir/test/lib/Dialect/Test/TestAttrDefs.td
@@ -340,4 +340,15 @@ def TestConditionalAliasAttr : Test_Attr<"TestConditionalAlias"> {
   }];
 }
 
+// Test AsmParser::parseFloat(const fltSemnatics&, APFloat&) API through the
+// custom parser and printer.
+def TestCustomFloatAttr : Test_Attr<"TestCustomFloat"> {
+  let mnemonic = "custom_float";
+  let parameters = (ins "mlir::StringAttr":$type_str, APFloatParameter<"">:$value);
+
+  let assemblyFormat = [{
+    `<` custom<CustomFloatAttr>($type_str, $value) `>`
+  }];
+}
+
 #endif // TEST_ATTRDEFS
diff --git a/mlir/test/lib/Dialect/Test/TestAttributes.cpp b/mlir/test/lib/Dialect/Test/TestAttributes.cpp
index 2cc051e664be..d7e40d35238d 100644
--- a/mlir/test/lib/Dialect/Test/TestAttributes.cpp
+++ b/mlir/test/lib/Dialect/Test/TestAttributes.cpp
@@ -18,6 +18,7 @@
 #include "mlir/IR/ExtensibleDialect.h"
 #include "mlir/IR/Types.h"
 #include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/TypeSwitch.h"
@@ -241,6 +242,46 @@ static void printConditionalAlias(AsmPrinter &p, StringAttr value) {
 }
 
 //===----------------------------------------------------------------------===//
+// Custom Float Attribute
+//===----------------------------------------------------------------------===//
+
+static void printCustomFloatAttr(AsmPrinter &p, StringAttr typeStrAttr,
+                                 APFloat value) {
+  p << typeStrAttr << " : " << value;
+}
+
+static ParseResult parseCustomFloatAttr(AsmParser &p, StringAttr &typeStrAttr,
+                                        FailureOr<APFloat> &value) {
+
+  std::string str;
+  if (p.parseString(&str))
+    return failure();
+
+  typeStrAttr = StringAttr::get(p.getContext(), str);
+
+  if (p.parseColon())
+    return failure();
+
+  const llvm::fltSemantics *semantics;
+  if (str == "float")
+    semantics = &llvm::APFloat::IEEEsingle();
+  else if (str == "double")
+    semantics = &llvm::APFloat::IEEEdouble();
+  else if (str == "fp80")
+    semantics = &llvm::APFloat::x87DoubleExtended();
+  else
+    return p.emitError(p.getCurrentLocation(), "unknown float type, expected "
+                                               "'float', 'double' or 'fp80'");
+
+  APFloat parsedValue(0.0);
+  if (p.parseFloat(*semantics, parsedValue))
+    return failure();
+
+  value.emplace(parsedValue);
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
 // Tablegen Generated Definitions
 //===----------------------------------------------------------------------===//
 
diff --git a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
index 08df2e5e1228..d22d48b139a0 100644
--- a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
+++ b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
@@ -11,6 +11,7 @@
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/Verifier.h"
 #include "mlir/Interfaces/FunctionImplementation.h"
+#include "mlir/Interfaces/MemorySlotInterfaces.h"
 
 using namespace mlir;
 using namespace test;
@@ -1172,3 +1173,61 @@ void TestOpWithVersionedProperties::writeToMlirBytecode(
   writer.writeVarInt(prop.value1);
   writer.writeVarInt(prop.value2);
 }
+
+//===----------------------------------------------------------------------===//
+// TestMultiSlotAlloca
+//===----------------------------------------------------------------------===//
+
+llvm::SmallVector<MemorySlot> TestMultiSlotAlloca::getPromotableSlots() {
+  SmallVector<MemorySlot> slots;
+  for (Value result : getResults()) {
+    slots.push_back(MemorySlot{
+        result, cast<MemRefType>(result.getType()).getElementType()});
+  }
+  return slots;
+}
+
+Value TestMultiSlotAlloca::getDefaultValue(const MemorySlot &slot,
+                                           OpBuilder &builder) {
+  return builder.create<TestOpConstant>(getLoc(), slot.elemType,
+                                        builder.getI32IntegerAttr(42));
+}
+
+void TestMultiSlotAlloca::handleBlockArgument(const MemorySlot &slot,
+                                              BlockArgument argument,
+                                              OpBuilder &builder) {
+  // Not relevant for testing.
+}
+
+std::optional<PromotableAllocationOpInterface>
+TestMultiSlotAlloca::handlePromotionComplete(const MemorySlot &slot,
+                                             Value defaultValue,
+                                             OpBuilder &builder) {
+  if (defaultValue && defaultValue.use_empty())
+    defaultValue.getDefiningOp()->erase();
+
+  if (getNumResults() == 1) {
+    erase();
+    return std::nullopt;
+  }
+
+  SmallVector<Type> newTypes;
+  SmallVector<Value> remainingValues;
+
+  for (Value oldResult : getResults()) {
+    if (oldResult == slot.ptr)
+      continue;
+    remainingValues.push_back(oldResult);
+    newTypes.push_back(oldResult.getType());
+  }
+
+  OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPoint(*this);
+  auto replacement = builder.create<TestMultiSlotAlloca>(getLoc(), newTypes);
+  for (auto [oldResult, newResult] :
+       llvm::zip_equal(remainingValues, replacement.getResults()))
+    oldResult.replaceAllUsesWith(newResult);
+
+  erase();
+  return replacement;
+}
diff --git a/mlir/test/lib/Dialect/Test/TestOps.h b/mlir/test/lib/Dialect/Test/TestOps.h
index f9925855bb9d..837ccca56592 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.h
+++ b/mlir/test/lib/Dialect/Test/TestOps.h
@@ -36,6 +36,7 @@
 #include "mlir/Interfaces/InferIntRangeInterface.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Interfaces/MemorySlotInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 5352d574ac39..e16ea2407314 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -28,6 +28,7 @@ include "mlir/Interfaces/DestinationStyleOpInterface.td"
 include "mlir/Interfaces/InferIntRangeInterface.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/LoopLikeInterface.td"
+include "mlir/Interfaces/MemorySlotInterfaces.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
 
@@ -3167,4 +3168,14 @@ def TestOpOptionallyImplementingInterface
   let arguments = (ins BoolAttr:$implementsInterface);
 }
 
+//===----------------------------------------------------------------------===//
+// Test Mem2Reg
+//===----------------------------------------------------------------------===//
+
+def TestMultiSlotAlloca : TEST_Op<"multi_slot_alloca",
+    [DeclareOpInterfaceMethods<PromotableAllocationOpInterface>]> {
+  let results = (outs Variadic<MemRefOf<[I32]>>:$results);
+  let assemblyFormat = "attr-dict `:` functional-type(operands, results)";
+}
+
 #endif // TEST_OPS
diff --git a/mlir/test/lib/Dialect/Test/TestOpsSyntax.cpp b/mlir/test/lib/Dialect/Test/TestOpsSyntax.cpp
index c376d6c73c64..ebaced57a24a 100644
--- a/mlir/test/lib/Dialect/Test/TestOpsSyntax.cpp
+++ b/mlir/test/lib/Dialect/Test/TestOpsSyntax.cpp
@@ -413,7 +413,7 @@ void PrettyPrintedRegionOp::print(OpAsmPrinter &p) {
   // of inner-op), then we can print the entire region in a succinct way.
   // Here we assume that the prototype of "test.special.op" can be trivially
   // derived while parsing it back.
-  if (innerOp.getName().getStringRef().equals("test.special.op")) {
+  if (innerOp.getName().getStringRef() == "test.special.op") {
     p << " start test.special.op end";
   } else {
     p << " (";
diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp b/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp
index e936ac5b852b..1273414cd4df 100644
--- a/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp
+++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp
@@ -11,230 +11,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "TestTransformDialectExtension.h"
-#include "mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.h"
-#include "mlir/Dialect/Transform/IR/TransformOps.h"
 #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
-#include "mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h"
-#include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/Pass/Pass.h"
 
 using namespace mlir;
 
 namespace {
-/// Simple pass that applies transform dialect ops directly contained in a
-/// module.
-
 template <typename Derived>
 class OpPassWrapper : public PassWrapper<Derived, OperationPass<>> {};
 
-class TestTransformDialectInterpreterPass
-    : public transform::TransformInterpreterPassBase<
-          TestTransformDialectInterpreterPass, OpPassWrapper> {
-public:
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(
-      TestTransformDialectInterpreterPass)
-
-  TestTransformDialectInterpreterPass() = default;
-  TestTransformDialectInterpreterPass(
-      const TestTransformDialectInterpreterPass &pass)
-      : TransformInterpreterPassBase(pass) {}
-
-  StringRef getArgument() const override {
-    return "test-transform-dialect-interpreter";
-  }
-
-  StringRef getDescription() const override {
-    return "apply transform dialect operations one by one";
-  }
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<transform::TransformDialect>();
-  }
-
-  void findOperationsByName(Operation *root, StringRef name,
-                            SmallVectorImpl<Operation *> &operations) {
-    root->walk([&](Operation *op) {
-      if (op->getName().getStringRef() == name) {
-        operations.push_back(op);
-      }
-    });
-  }
-
-  void createParameterMapping(MLIRContext &context, ArrayRef<int> values,
-                              RaggedArray<transform::MappedValue> &result) {
-    SmallVector<transform::MappedValue> storage =
-        llvm::to_vector(llvm::map_range(values, [&](int v) {
-          Builder b(&context);
-          return transform::MappedValue(b.getI64IntegerAttr(v));
-        }));
-    result.push_back(std::move(storage));
-  }
-
-  void
-  createOpResultMapping(Operation *root, StringRef name,
-                        RaggedArray<transform::MappedValue> &extraMapping) {
-    SmallVector<Operation *> operations;
-    findOperationsByName(root, name, operations);
-    SmallVector<Value> results;
-    for (Operation *op : operations)
-      llvm::append_range(results, op->getResults());
-    extraMapping.push_back(results);
-  }
-
-  unsigned numberOfSetOptions(const Option<std::string> &ops,
-                              const ListOption<int> &params,
-                              const Option<std::string> &values) {
-    unsigned numSetValues = 0;
-    numSetValues += !ops.empty();
-    numSetValues += !params.empty();
-    numSetValues += !values.empty();
-    return numSetValues;
-  }
-
-  std::optional<LogicalResult> constructTransformModule(OpBuilder &builder,
-                                                        Location loc) {
-    if (!testModuleGeneration)
-      return std::nullopt;
-
-    builder.create<transform::SequenceOp>(
-        loc, TypeRange(), transform::FailurePropagationMode::Propagate,
-        builder.getType<transform::AnyOpType>(),
-        [](OpBuilder &b, Location nested, Value rootH) {
-          b.create<transform::DebugEmitRemarkAtOp>(nested, rootH,
-                                                   "remark from generated");
-          b.create<transform::YieldOp>(nested, ValueRange());
-        });
-    return success();
-  }
-
-  void runOnOperation() override {
-    unsigned firstSetOptions =
-        numberOfSetOptions(bindFirstExtraToOps, bindFirstExtraToParams,
-                           bindFirstExtraToResultsOfOps);
-    unsigned secondSetOptions =
-        numberOfSetOptions(bindSecondExtraToOps, bindSecondExtraToParams,
-                           bindSecondExtraToResultsOfOps);
-    auto loc = UnknownLoc::get(&getContext());
-    if (firstSetOptions > 1) {
-      emitError(loc) << "cannot bind the first extra top-level argument to "
-                        "multiple entities";
-      return signalPassFailure();
-    }
-    if (secondSetOptions > 1) {
-      emitError(loc) << "cannot bind the second extra top-level argument to "
-                        "multiple entities";
-      return signalPassFailure();
-    }
-    if (firstSetOptions == 0 && secondSetOptions != 0) {
-      emitError(loc) << "cannot bind the second extra top-level argument "
-                        "without bindings the first";
-    }
-
-    RaggedArray<transform::MappedValue> extraMapping;
-    if (!bindFirstExtraToOps.empty()) {
-      SmallVector<Operation *> operations;
-      findOperationsByName(getOperation(), bindFirstExtraToOps.getValue(),
-                           operations);
-      extraMapping.push_back(operations);
-    } else if (!bindFirstExtraToParams.empty()) {
-      createParameterMapping(getContext(), bindFirstExtraToParams,
-                             extraMapping);
-    } else if (!bindFirstExtraToResultsOfOps.empty()) {
-      createOpResultMapping(getOperation(), bindFirstExtraToResultsOfOps,
-                            extraMapping);
-    }
-
-    if (!bindSecondExtraToOps.empty()) {
-      SmallVector<Operation *> operations;
-      findOperationsByName(getOperation(), bindSecondExtraToOps, operations);
-      extraMapping.push_back(operations);
-    } else if (!bindSecondExtraToParams.empty()) {
-      createParameterMapping(getContext(), bindSecondExtraToParams,
-                             extraMapping);
-    } else if (!bindSecondExtraToResultsOfOps.empty()) {
-      createOpResultMapping(getOperation(), bindSecondExtraToResultsOfOps,
-                            extraMapping);
-    }
-
-    options = options.enableExpensiveChecks(enableExpensiveChecks);
-    options = options.enableEnforceSingleToplevelTransformOp(
-        enforceSingleToplevelTransformOp);
-    if (failed(transform::detail::interpreterBaseRunOnOperationImpl(
-            getOperation(), getArgument(), getSharedTransformModule(),
-            getTransformLibraryModule(), extraMapping, options,
-            transformFileName, transformLibraryPaths, debugPayloadRootTag,
-            debugTransformRootTag, getBinaryName())))
-      return signalPassFailure();
-  }
-
-  Option<bool> enableExpensiveChecks{
-      *this, "enable-expensive-checks", llvm::cl::init(false),
-      llvm::cl::desc("perform expensive checks to better report errors in the "
-                     "transform IR")};
-  Option<bool> enforceSingleToplevelTransformOp{
-      *this, "enforce-single-top-level-transform-op", llvm::cl::init(true),
-      llvm::cl::desc("Ensure that only a single top-level transform op is "
-                     "present in the IR.")};
-
-  Option<std::string> bindFirstExtraToOps{
-      *this, "bind-first-extra-to-ops",
-      llvm::cl::desc("bind the first extra argument of the top-level op to "
-                     "payload operations of the given kind")};
-  ListOption<int> bindFirstExtraToParams{
-      *this, "bind-first-extra-to-params",
-      llvm::cl::desc("bind the first extra argument of the top-level op to "
-                     "the given integer parameters")};
-  Option<std::string> bindFirstExtraToResultsOfOps{
-      *this, "bind-first-extra-to-results-of-ops",
-      llvm::cl::desc("bind the first extra argument of the top-level op to "
-                     "results of payload operations of the given kind")};
-
-  Option<std::string> bindSecondExtraToOps{
-      *this, "bind-second-extra-to-ops",
-      llvm::cl::desc("bind the second extra argument of the top-level op to "
-                     "payload operations of the given kind")};
-  ListOption<int> bindSecondExtraToParams{
-      *this, "bind-second-extra-to-params",
-      llvm::cl::desc("bind the second extra argument of the top-level op to "
-                     "the given integer parameters")};
-  Option<std::string> bindSecondExtraToResultsOfOps{
-      *this, "bind-second-extra-to-results-of-ops",
-      llvm::cl::desc("bind the second extra argument of the top-level op to "
-                     "results of payload operations of the given kind")};
-
-  Option<std::string> transformFileName{
-      *this, "transform-file-name", llvm::cl::init(""),
-      llvm::cl::desc(
-          "Optional filename containing a transform dialect specification to "
-          "apply. If left empty, the IR is assumed to contain one top-level "
-          "transform dialect operation somewhere in the module.")};
-  Option<std::string> debugPayloadRootTag{
-      *this, "debug-payload-root-tag", llvm::cl::init(""),
-      llvm::cl::desc(
-          "Select the operation with 'transform.target_tag' attribute having "
-          "the given value as payload IR root. If empty select the pass anchor "
-          "operation as the payload IR root.")};
-  Option<std::string> debugTransformRootTag{
-      *this, "debug-transform-root-tag", llvm::cl::init(""),
-      llvm::cl::desc(
-          "Select the operation with 'transform.target_tag' attribute having "
-          "the given value as container IR for top-level transform ops. This "
-          "allows user control on what transformation to apply. If empty, "
-          "select the container of the top-level transform op.")};
-  ListOption<std::string> transformLibraryPaths{
-      *this, "transform-library-paths", llvm::cl::ZeroOrMore,
-      llvm::cl::desc("Optional paths to files with modules that should be "
-                     "merged into the transform module to provide the "
-                     "definitions of external named sequences.")};
-
-  Option<bool> testModuleGeneration{
-      *this, "test-module-generation", llvm::cl::init(false),
-      llvm::cl::desc("test the generation of the transform module during pass "
-                     "initialization, overridden by parsing")};
-};
-
 struct TestTransformDialectEraseSchedulePass
     : public PassWrapper<TestTransformDialectEraseSchedulePass,
                          OperationPass<ModuleOp>> {
@@ -267,9 +53,5 @@ namespace test {
 void registerTestTransformDialectEraseSchedulePass() {
   PassRegistration<TestTransformDialectEraseSchedulePass> reg;
 }
-/// Registers the test pass for applying transform dialect ops.
-void registerTestTransformDialectInterpreterPass() {
-  PassRegistration<TestTransformDialectInterpreterPass> reg;
-}
 } // namespace test
 } // namespace mlir
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 4740e7d137e8..ea6d9ae71b77 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -100,6 +100,7 @@ tools = [
     "mlir-lsp-server",
     "mlir-capi-execution-engine-test",
     "mlir-capi-ir-test",
+    "mlir-capi-irdl-test",
     "mlir-capi-llvm-test",
     "mlir-capi-pass-test",
     "mlir-capi-pdl-test",
diff --git a/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir b/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir
index d3b19be9ecaf..370c5baa0ade 100644
--- a/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir
+++ b/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir
@@ -462,6 +462,84 @@ func.func @cos() {
 }
 
 // -------------------------------------------------------------------------- //
+// Asin.
+// -------------------------------------------------------------------------- //
+func.func @asin_f32(%a : f32) {
+  %r = math.asin %a : f32
+  vector.print %r : f32
+  return
+}
+
+func.func @asin_3xf32(%a : vector<3xf32>) {
+  %r = math.asin %a : vector<3xf32>
+  vector.print %r : vector<3xf32>
+  return
+}
+
+func.func @asin() {
+  // CHECK: 0
+  %zero = arith.constant 0.0 : f32
+  call @asin_f32(%zero) : (f32) -> ()
+
+  // CHECK: -0.597406
+  %cst1 = arith.constant -0.5625 : f32
+  call @asin_f32(%cst1) : (f32) -> ()
+
+  // CHECK: -0.384397
+  %cst2 = arith.constant -0.375 : f32
+  call @asin_f32(%cst2) : (f32) -> ()
+
+  // CHECK: -0.25268
+  %cst3 = arith.constant -0.25 : f32
+  call @asin_f32(%cst3) : (f32) -> ()
+
+  // CHECK: 0.25268, 0.384397, 0.597406
+  %vec_x = arith.constant dense<[0.25, 0.375, 0.5625]> : vector<3xf32>
+  call @asin_3xf32(%vec_x) : (vector<3xf32>) -> ()
+
+  return
+}
+
+// -------------------------------------------------------------------------- //
+// Acos.
+// -------------------------------------------------------------------------- //
+func.func @acos_f32(%a : f32) {
+  %r = math.acos %a : f32
+  vector.print %r : f32
+  return
+}
+
+func.func @acos_3xf32(%a : vector<3xf32>) {
+  %r = math.acos %a : vector<3xf32>
+  vector.print %r : vector<3xf32>
+  return
+}
+
+func.func @acos() {
+  // CHECK: 1.5708
+  %zero = arith.constant 0.0 : f32
+  call @acos_f32(%zero) : (f32) -> ()
+
+  // CHECK: 2.1682
+  %cst1 = arith.constant -0.5625 : f32
+  call @acos_f32(%cst1) : (f32) -> ()
+
+  // CHECK: 1.95519
+  %cst2 = arith.constant -0.375 : f32
+  call @acos_f32(%cst2) : (f32) -> ()
+
+  // CHECK: 1.82348
+  %cst3 = arith.constant -0.25 : f32
+  call @acos_f32(%cst3) : (f32) -> ()
+
+  // CHECK: 1.31812, 1.1864, 0.97339
+  %vec_x = arith.constant dense<[0.25, 0.375, 0.5625]> : vector<3xf32>
+  call @acos_3xf32(%vec_x) : (vector<3xf32>) -> ()
+
+  return
+}
+
+// -------------------------------------------------------------------------- //
 // Atan.
 // -------------------------------------------------------------------------- //
 func.func @atan_f32(%a : f32) {
@@ -694,6 +772,8 @@ func.func @main() {
   call @expm1(): () -> ()
   call @sin(): () -> ()
   call @cos(): () -> ()
+  call @asin(): () -> ()
+  call @acos(): () -> ()
   call @atan() : () -> ()
   call @atan2() : () -> ()
   call @cbrt() : () -> ()
diff --git a/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir b/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
index 340ef30bf59c..80d559cc6f73 100644
--- a/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
+++ b/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
@@ -717,6 +717,162 @@ func.func @tanh() {
  return
 }
 
+// -------------------------------------------------------------------------- //
+// Asinh.
+// -------------------------------------------------------------------------- //
+
+func.func @asinh_f32(%a : f32) {
+  %r = math.asinh %a : f32
+  vector.print %r : f32
+  return
+}
+
+func.func @asinh_3xf32(%a : vector<3xf32>) {
+  %r = math.asinh %a : vector<3xf32>
+  vector.print %r : vector<3xf32>
+  return
+}
+
+func.func @asinh() {
+  // CHECK: 0
+  %zero = arith.constant 0.0 : f32
+  call @asinh_f32(%zero) : (f32) -> ()
+
+  // CHECK: 0.881374
+  %cst1 = arith.constant 1.0 : f32
+  call @asinh_f32(%cst1) : (f32) -> ()
+
+  // CHECK: -0.881374
+  %cst2 = arith.constant -1.0 : f32
+  call @asinh_f32(%cst2) : (f32) -> ()
+
+  // CHECK: 1.81845
+  %cst3 = arith.constant 3.0 : f32
+  call @asinh_f32(%cst3) : (f32) -> ()
+
+  // CHECK: 0.247466, 0.790169, 1.44364
+  %vec_x = arith.constant dense<[0.25, 0.875, 2.0]> : vector<3xf32>
+  call @asinh_3xf32(%vec_x) : (vector<3xf32>) -> ()
+
+  return
+}
+
+// -------------------------------------------------------------------------- //
+// Acosh.
+// -------------------------------------------------------------------------- //
+
+func.func @acosh_f32(%a : f32) {
+  %r = math.acosh %a : f32
+  vector.print %r : f32
+  return
+}
+
+func.func @acosh_3xf32(%a : vector<3xf32>) {
+  %r = math.acosh %a : vector<3xf32>
+  vector.print %r : vector<3xf32>
+  return
+}
+
+func.func @acosh() {
+  // CHECK: 0
+  %zero = arith.constant 1.0 : f32
+  call @acosh_f32(%zero) : (f32) -> ()
+
+  // CHECK: 1.31696
+  %cst1 = arith.constant 2.0 : f32
+  call @acosh_f32(%cst1) : (f32) -> ()
+
+  // CHECK: 2.99322
+  %cst2 = arith.constant 10.0 : f32
+  call @acosh_f32(%cst2) : (f32) -> ()
+
+  // CHECK: 0.962424, 1.76275, 2.47789
+  %vec_x = arith.constant dense<[1.5, 3.0, 6.0]> : vector<3xf32>
+  call @acosh_3xf32(%vec_x) : (vector<3xf32>) -> ()
+
+  return
+}
+
+// -------------------------------------------------------------------------- //
+// Atanh.
+// -------------------------------------------------------------------------- //
+
+func.func @atanh_f32(%a : f32) {
+  %r = math.atanh %a : f32
+  vector.print %r : f32
+  return
+}
+
+func.func @atanh_3xf32(%a : vector<3xf32>) {
+  %r = math.atanh %a : vector<3xf32>
+  vector.print %r : vector<3xf32>
+  return
+}
+
+func.func @atanh() {
+  // CHECK: 0
+  %zero = arith.constant 0.0 : f32
+  call @atanh_f32(%zero) : (f32) -> ()
+
+  // CHECK: 0.549306
+  %cst1 = arith.constant 0.5 : f32
+  call @atanh_f32(%cst1) : (f32) -> ()
+
+  // CHECK: -0.549306
+  %cst2 = arith.constant -0.5 : f32
+  call @atanh_f32(%cst2) : (f32) -> ()
+
+  // CHECK: inf
+  %cst3 = arith.constant 1.0 : f32
+  call @atanh_f32(%cst3) : (f32) -> ()
+
+  // CHECK: 0.255413, 0.394229, 2.99448
+  %vec_x = arith.constant dense<[0.25, 0.375, 0.995]> : vector<3xf32>
+  call @atanh_3xf32(%vec_x) : (vector<3xf32>) -> ()
+
+  return
+}
+
+// -------------------------------------------------------------------------- //
+// Rsqrt.
+// -------------------------------------------------------------------------- //
+
+func.func @rsqrt_f32(%a : f32) {
+  %r = math.rsqrt %a : f32
+  vector.print %r : f32
+  return
+}
+
+func.func @rsqrt_3xf32(%a : vector<3xf32>) {
+  %r = math.rsqrt %a : vector<3xf32>
+  vector.print %r : vector<3xf32>
+  return
+}
+
+func.func @rsqrt() {
+  // CHECK: 1
+  %zero = arith.constant 1.0 : f32
+  call @rsqrt_f32(%zero) : (f32) -> ()
+
+  // CHECK: 0.707107
+  %cst1 = arith.constant 2.0 : f32
+  call @rsqrt_f32(%cst1) : (f32) -> ()
+
+  // CHECK: inf
+  %cst2 = arith.constant 0.0 : f32
+  call @rsqrt_f32(%cst2) : (f32) -> ()
+
+  // CHECK: nan
+  %cst3 = arith.constant -1.0 : f32
+  call @rsqrt_f32(%cst3) : (f32) -> ()
+
+  // CHECK: 0.5, 1.41421, 0.57735
+  %vec_x = arith.constant dense<[4.0, 0.5, 3.0]> : vector<3xf32>
+  call @rsqrt_3xf32(%vec_x) : (vector<3xf32>) -> ()
+
+  return
+}
+
 func.func @main() {
   call @exp2f() : () -> ()
   call @roundf() : () -> ()
@@ -725,5 +881,9 @@ func.func @main() {
   call @sinh() : () -> ()
   call @cosh() : () -> ()
   call @tanh() : () -> ()
+  call @asinh() : () -> ()
+  call @acosh() : () -> ()
+  call @atanh() : () -> ()
+  call @rsqrt() : () -> ()
   return
 }
diff --git a/mlir/test/mlir-tblgen/attr-or-type-format-invalid.td b/mlir/test/mlir-tblgen/attr-or-type-format-invalid.td
index d3be4d8b8022..3a57cbca4d7b 100644
--- a/mlir/test/mlir-tblgen/attr-or-type-format-invalid.td
+++ b/mlir/test/mlir-tblgen/attr-or-type-format-invalid.td
@@ -111,7 +111,7 @@ def InvalidTypeN : InvalidType<"InvalidTypeN", "invalid_n"> {
 
 def InvalidTypeO : InvalidType<"InvalidTypeO", "invalid_o"> {
   let parameters = (ins "int":$a);
-  // CHECK: `ref` is only allowed inside custom directives
+  // CHECK: 'ref' is only valid within a `custom` directive
   let assemblyFormat = "$a ref($a)";
 }
 
diff --git a/mlir/test/mlir-vulkan-runner/addf_if.mlir b/mlir/test/mlir-vulkan-runner/addf_if.mlir
index fbd1fae6d0b5..5638bd44682d 100644
--- a/mlir/test/mlir-vulkan-runner/addf_if.mlir
+++ b/mlir/test/mlir-vulkan-runner/addf_if.mlir
@@ -1,54 +1,54 @@
-// RUN: mlir-vulkan-runner %s --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils --entry-point-result=void | FileCheck %s
-
-// CHECK: [3.3,  3.3,  3.3,  3.3,  0,  0,  0,  0]
-module attributes {
-  gpu.container_module,
-  spirv.target_env = #spirv.target_env<
-    #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>, #spirv.resource_limits<>>
-} {
-  gpu.module @kernels {
-    gpu.func @kernel_add(%arg0 : memref<8xf32>, %arg1 : memref<8xf32>, %arg2 : memref<8xf32>)
-      kernel attributes { spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [1, 1, 1]>} {
-      %0 = gpu.block_id x
-      %limit = arith.constant 4 : index
-      %cond = arith.cmpi slt, %0, %limit : index
-      scf.if %cond {
-        %1 = memref.load %arg0[%0] : memref<8xf32>
-        %2 = memref.load %arg1[%0] : memref<8xf32>
-        %3 = arith.addf %1, %2 : f32
-        memref.store %3, %arg2[%0] : memref<8xf32>
-      }
-      gpu.return
-    }
-  }
-
-  func.func @main() {
-    %arg0 = memref.alloc() : memref<8xf32>
-    %arg1 = memref.alloc() : memref<8xf32>
-    %arg2 = memref.alloc() : memref<8xf32>
-    %0 = arith.constant 0 : i32
-    %1 = arith.constant 1 : i32
-    %2 = arith.constant 2 : i32
-    %value0 = arith.constant 0.0 : f32
-    %value1 = arith.constant 1.1 : f32
-    %value2 = arith.constant 2.2 : f32
-    %arg3 = memref.cast %arg0 : memref<8xf32> to memref<?xf32>
-    %arg4 = memref.cast %arg1 : memref<8xf32> to memref<?xf32>
-    %arg5 = memref.cast %arg2 : memref<8xf32> to memref<?xf32>
-    call @fillResource1DFloat(%arg3, %value1) : (memref<?xf32>, f32) -> ()
-    call @fillResource1DFloat(%arg4, %value2) : (memref<?xf32>, f32) -> ()
-    call @fillResource1DFloat(%arg5, %value0) : (memref<?xf32>, f32) -> ()
-
-    %cst1 = arith.constant 1 : index
-    %cst8 = arith.constant 8 : index
-    gpu.launch_func @kernels::@kernel_add
-        blocks in (%cst8, %cst1, %cst1) threads in (%cst1, %cst1, %cst1)
-        args(%arg0 : memref<8xf32>, %arg1 : memref<8xf32>, %arg2 : memref<8xf32>)
-    %arg6 = memref.cast %arg5 : memref<?xf32> to memref<*xf32>
-    call @printMemrefF32(%arg6) : (memref<*xf32>) -> ()
-    return
-  }
-  func.func private @fillResource1DFloat(%0 : memref<?xf32>, %1 : f32)
-  func.func private @printMemrefF32(%ptr : memref<*xf32>)
-}
-
+// RUN: mlir-vulkan-runner %s --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils --entry-point-result=void | FileCheck %s
+
+// CHECK: [3.3,  3.3,  3.3,  3.3,  0,  0,  0,  0]
+module attributes {
+  gpu.container_module,
+  spirv.target_env = #spirv.target_env<
+    #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>, #spirv.resource_limits<>>
+} {
+  gpu.module @kernels {
+    gpu.func @kernel_add(%arg0 : memref<8xf32>, %arg1 : memref<8xf32>, %arg2 : memref<8xf32>)
+      kernel attributes { spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [1, 1, 1]>} {
+      %0 = gpu.block_id x
+      %limit = arith.constant 4 : index
+      %cond = arith.cmpi slt, %0, %limit : index
+      scf.if %cond {
+        %1 = memref.load %arg0[%0] : memref<8xf32>
+        %2 = memref.load %arg1[%0] : memref<8xf32>
+        %3 = arith.addf %1, %2 : f32
+        memref.store %3, %arg2[%0] : memref<8xf32>
+      }
+      gpu.return
+    }
+  }
+
+  func.func @main() {
+    %arg0 = memref.alloc() : memref<8xf32>
+    %arg1 = memref.alloc() : memref<8xf32>
+    %arg2 = memref.alloc() : memref<8xf32>
+    %0 = arith.constant 0 : i32
+    %1 = arith.constant 1 : i32
+    %2 = arith.constant 2 : i32
+    %value0 = arith.constant 0.0 : f32
+    %value1 = arith.constant 1.1 : f32
+    %value2 = arith.constant 2.2 : f32
+    %arg3 = memref.cast %arg0 : memref<8xf32> to memref<?xf32>
+    %arg4 = memref.cast %arg1 : memref<8xf32> to memref<?xf32>
+    %arg5 = memref.cast %arg2 : memref<8xf32> to memref<?xf32>
+    call @fillResource1DFloat(%arg3, %value1) : (memref<?xf32>, f32) -> ()
+    call @fillResource1DFloat(%arg4, %value2) : (memref<?xf32>, f32) -> ()
+    call @fillResource1DFloat(%arg5, %value0) : (memref<?xf32>, f32) -> ()
+
+    %cst1 = arith.constant 1 : index
+    %cst8 = arith.constant 8 : index
+    gpu.launch_func @kernels::@kernel_add
+        blocks in (%cst8, %cst1, %cst1) threads in (%cst1, %cst1, %cst1)
+        args(%arg0 : memref<8xf32>, %arg1 : memref<8xf32>, %arg2 : memref<8xf32>)
+    %arg6 = memref.cast %arg5 : memref<?xf32> to memref<*xf32>
+    call @printMemrefF32(%arg6) : (memref<*xf32>) -> ()
+    return
+  }
+  func.func private @fillResource1DFloat(%0 : memref<?xf32>, %1 : f32)
+  func.func private @printMemrefF32(%ptr : memref<*xf32>)
+}
+
diff --git a/mlir/test/python/dialects/python_test.py b/mlir/test/python/dialects/python_test.py
index 88761c9d08fe..70927b22d474 100644
--- a/mlir/test/python/dialects/python_test.py
+++ b/mlir/test/python/dialects/python_test.py
@@ -167,7 +167,7 @@ def attrBuilder():
             x_f32arr=[2.0, 3.0],
             x_f64=4.25,  # CHECK-DAG: x_f64 = 4.250000e+00 : f64
             x_f64arr=[4.0, 8.0],  # CHECK-DAG: x_f64arr = [4.000000e+00, 8.000000e+00]
-            # CHECK-DAG: x_f64elems = dense<[3.952530e-323, 7.905050e-323]> : tensor<2xf64>
+            # CHECK-DAG: x_f64elems = dense<[8.000000e+00, 1.600000e+01]> : tensor<2xf64>
             x_f64elems=[8.0, 16.0],
             # CHECK-DAG: x_flatsymrefarr = [@symbol1, @symbol2]
             x_flatsymrefarr=["symbol1", "symbol2"],
diff --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py
index 91ecd0fc38e1..935534edba7a 100644
--- a/mlir/test/python/dialects/transform_structured_ext.py
+++ b/mlir/test/python/dialects/transform_structured_ext.py
@@ -8,6 +8,7 @@ from mlir.dialects import transform
 from mlir.dialects import pdl
 from mlir.dialects.transform import structured
 from mlir.dialects.transform import pdl as transform_pdl
+from mlir.dialects.transform.extras import constant_param
 
 
 def run(f):
@@ -315,9 +316,9 @@ def testPadOpNoArgs(target):
 def testPadOpArgs(target):
     structured.PadOp(
         target,
+        pad_to_multiple_of=[128],
         padding_values=[FloatAttr.get_f32(42.0), StringAttr.get("0")],
         padding_dimensions=Attribute.parse("[1]"),
-        pad_to_multiple_of=[128],
         pack_paddings=[0],
         transpose_paddings=[[1, Attribute.parse("0")], Attribute.parse("[0, 1]")],
         copy_back_op="linalg.copy",
@@ -325,9 +326,9 @@ def testPadOpArgs(target):
     # CHECK-LABEL: TEST: testPadOpArgs
     # CHECK: transform.sequence
     # CHECK: transform.structured.pad
+    # CHECK-DAG: pad_to_multiple_of [128]
     # CHECK-DAG: copy_back_op = "linalg.copy"
     # CHECK-DAG: pack_paddings = [0]
-    # CHECK-DAG: pad_to_multiple_of = [128]
     # CHECK-DAG: padding_dimensions = [1]
     # CHECK-DAG: padding_values = [4.200000e+01 : f32, "0"]
     # CHECK-DAG: transpose_paddings = {{\[}}[1, 0], [0, 1]]
@@ -335,6 +336,22 @@ def testPadOpArgs(target):
 
 @run
 @create_sequence
+def testPadOpArgsParam(target):
+    structured.PadOp(
+        target,
+        pad_to_multiple_of=[constant_param(128), Attribute.parse("2"), 10],
+        padding_dimensions=Attribute.parse("[0, 1, 2]"),
+    )
+    # CHECK-LABEL: TEST: testPadOpArgsParam
+    # CHECK: transform.sequence
+    # CHECK-DAG: %[[P:.*]] = transform.param.constant 128
+    # CHECK: transform.structured.pad
+    # CHECK-DAG: pad_to_multiple_of [%[[P]], 2, 10]
+    # CHECK-DAG: padding_dimensions = [0, 1, 2]
+
+
+@run
+@create_sequence
 def testScalarize(target):
     structured.ScalarizeOp(target)
     # CHECK-LABEL: TEST: testScalarize
@@ -484,7 +501,7 @@ def testTileToForallMixedDynamic(target):
     structured.TileUsingForallOp(target, num_threads=[n, 3, 4])
     # CHECK-LABEL: TEST: testTileToForallMixedDynamic
     # CHECK: = transform.structured.tile_using_forall
-    # CHECK-SAME: num_threads [%{{.*}} : !transform.any_op, 3, 4]
+    # CHECK-SAME: num_threads [%{{.*}}, 3, 4] : (!transform.any_op, !transform.any_op)
 
 
 @run
@@ -494,7 +511,7 @@ def testTileToForallPackedDynamic(target):
     structured.TileUsingForallOp(target, num_threads=n)
     # CHECK-LABEL: TEST: testTileToForallPackedDynamic
     # CHECK: = transform.structured.tile_using_forall
-    # CHECK-SAME: num_threads *(%0 : !transform.any_op)
+    # CHECK-SAME: num_threads *(%0) : (!transform.any_op, !transform.any_op)
 
 
 @run
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 237ebeb166dc..1dfc5d178b61 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -140,11 +140,10 @@ void registerTestTensorCopyInsertionPass();
 void registerTestTensorTransforms();
 void registerTestTopologicalSortAnalysisPass();
 void registerTestTransformDialectEraseSchedulePass();
-void registerTestTransformDialectInterpreterPass();
 void registerTestWrittenToPass();
 void registerTestVectorLowerings();
 void registerTestVectorReductionToSPIRVDotProd();
-void registerTestNvgpuLowerings();
+void registerTestNVGPULowerings();
 #if MLIR_ENABLE_PDL_IN_PATTERNMATCH
 void registerTestDialectConversionPasses();
 void registerTestPDLByteCodePass();
@@ -269,10 +268,9 @@ void registerTestPasses() {
   mlir::test::registerTestTensorTransforms();
   mlir::test::registerTestTopologicalSortAnalysisPass();
   mlir::test::registerTestTransformDialectEraseSchedulePass();
-  mlir::test::registerTestTransformDialectInterpreterPass();
   mlir::test::registerTestVectorLowerings();
   mlir::test::registerTestVectorReductionToSPIRVDotProd();
-  mlir::test::registerTestNvgpuLowerings();
+  mlir::test::registerTestNVGPULowerings();
   mlir::test::registerTestWrittenToPass();
 #if MLIR_ENABLE_PDL_IN_PATTERNMATCH
   mlir::test::registerTestDialectConversionPasses();
diff --git a/mlir/tools/mlir-src-sharder/mlir-src-sharder.cpp b/mlir/tools/mlir-src-sharder/mlir-src-sharder.cpp
index dc1e2939c7d2..5bfc24ef3b47 100644
--- a/mlir/tools/mlir-src-sharder/mlir-src-sharder.cpp
+++ b/mlir/tools/mlir-src-sharder/mlir-src-sharder.cpp
@@ -62,6 +62,16 @@ int main(int argc, char **argv) {
       "write-if-changed",
       llvm::cl::desc("Only write to the output file if it changed"));
 
+  // `ResetCommandLineParser` at the above unregistered the "D" option
+  // of `llvm-tblgen`, which caused `TestOps.cpp` to fail due to
+  // "Unknnown command line argument '-D...`" when a macros name is
+  // present. The following is a workaround to re-register it again.
+  llvm::cl::list<std::string> MacroNames(
+      "D",
+      llvm::cl::desc(
+          "Name of the macro to be defined -- ignored by mlir-src-sharder"),
+      llvm::cl::value_desc("macro name"), llvm::cl::Prefix);
+
   llvm::InitLLVM y(argc, argv);
   llvm::cl::ParseCommandLineOptions(argc, argv);
 
diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
index b9a72119790e..55bc0714c20e 100644
--- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
+++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
@@ -50,7 +50,7 @@ static void collectAllDefs(StringRef selectedDialect,
   } else {
     // Otherwise, generate the defs that belong to the selected dialect.
     auto dialectDefs = llvm::make_filter_range(defs, [&](const auto &def) {
-      return def.getDialect().getName().equals(selectedDialect);
+      return def.getDialect().getName() == selectedDialect;
     });
     resultDefs.assign(dialectDefs.begin(), dialectDefs.end());
   }
diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeFormatGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeFormatGen.cpp
index 6098808c646f..abd1fbdaf8c6 100644
--- a/mlir/tools/mlir-tblgen/AttrOrTypeFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/AttrOrTypeFormatGen.cpp
@@ -940,6 +940,8 @@ protected:
                                             ArrayRef<FormatElement *> elements,
                                             FormatElement *anchor) override;
 
+  LogicalResult markQualified(SMLoc loc, FormatElement *element) override;
+
   /// Parse an attribute or type variable.
   FailureOr<FormatElement *> parseVariableImpl(SMLoc loc, StringRef name,
                                                Context ctx) override;
@@ -950,12 +952,8 @@ protected:
 private:
   /// Parse a `params` directive.
   FailureOr<FormatElement *> parseParamsDirective(SMLoc loc, Context ctx);
-  /// Parse a `qualified` directive.
-  FailureOr<FormatElement *> parseQualifiedDirective(SMLoc loc, Context ctx);
   /// Parse a `struct` directive.
   FailureOr<FormatElement *> parseStructDirective(SMLoc loc, Context ctx);
-  /// Parse a `ref` directive.
-  FailureOr<FormatElement *> parseRefDirective(SMLoc loc, Context ctx);
 
   /// Attribute or type tablegen def.
   const AttrOrTypeDef &def;
@@ -1060,6 +1058,14 @@ DefFormatParser::verifyOptionalGroupElements(llvm::SMLoc loc,
   return success();
 }
 
+LogicalResult DefFormatParser::markQualified(SMLoc loc,
+                                             FormatElement *element) {
+  if (!isa<ParameterElement>(element))
+    return emitError(loc, "`qualified` argument list expected a variable");
+  cast<ParameterElement>(element)->setShouldBeQualified();
+  return success();
+}
+
 FailureOr<DefFormat> DefFormatParser::parse() {
   FailureOr<std::vector<FormatElement *>> elements = FormatParser::parse();
   if (failed(elements))
@@ -1107,33 +1113,11 @@ DefFormatParser::parseDirectiveImpl(SMLoc loc, FormatToken::Kind kind,
     return parseParamsDirective(loc, ctx);
   case FormatToken::kw_struct:
     return parseStructDirective(loc, ctx);
-  case FormatToken::kw_ref:
-    return parseRefDirective(loc, ctx);
-  case FormatToken::kw_custom:
-    return parseCustomDirective(loc, ctx);
-
   default:
     return emitError(loc, "unsupported directive kind");
   }
 }
 
-FailureOr<FormatElement *>
-DefFormatParser::parseQualifiedDirective(SMLoc loc, Context ctx) {
-  if (failed(parseToken(FormatToken::l_paren,
-                        "expected '(' before argument list")))
-    return failure();
-  FailureOr<FormatElement *> var = parseElement(ctx);
-  if (failed(var))
-    return var;
-  if (!isa<ParameterElement>(*var))
-    return emitError(loc, "`qualified` argument list expected a variable");
-  cast<ParameterElement>(*var)->setShouldBeQualified();
-  if (failed(
-          parseToken(FormatToken::r_paren, "expected ')' after argument list")))
-    return failure();
-  return var;
-}
-
 FailureOr<FormatElement *> DefFormatParser::parseParamsDirective(SMLoc loc,
                                                                  Context ctx) {
   // It doesn't make sense to allow references to all parameters in a custom
@@ -1201,22 +1185,6 @@ FailureOr<FormatElement *> DefFormatParser::parseStructDirective(SMLoc loc,
   return create<StructDirective>(std::move(vars));
 }
 
-FailureOr<FormatElement *> DefFormatParser::parseRefDirective(SMLoc loc,
-                                                              Context ctx) {
-  if (ctx != CustomDirectiveContext)
-    return emitError(loc, "`ref` is only allowed inside custom directives");
-
-  // Parse the child parameter element.
-  FailureOr<FormatElement *> child;
-  if (failed(parseToken(FormatToken::l_paren, "expected '('")) ||
-      failed(child = parseElement(RefDirectiveContext)) ||
-      failed(parseToken(FormatToken::r_paren, "expeced ')'")))
-    return failure();
-
-  // Only parameter elements are allowed to be parsed under a `ref` directive.
-  return create<RefDirective>(*child);
-}
-
 //===----------------------------------------------------------------------===//
 // Interface
 //===----------------------------------------------------------------------===//
diff --git a/mlir/tools/mlir-tblgen/FormatGen.cpp b/mlir/tools/mlir-tblgen/FormatGen.cpp
index d402748b96ad..7540e584b8fa 100644
--- a/mlir/tools/mlir-tblgen/FormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/FormatGen.cpp
@@ -308,6 +308,10 @@ FailureOr<FormatElement *> FormatParser::parseDirective(Context ctx) {
 
   if (tok.is(FormatToken::kw_custom))
     return parseCustomDirective(loc, ctx);
+  if (tok.is(FormatToken::kw_ref))
+    return parseRefDirective(loc, ctx);
+  if (tok.is(FormatToken::kw_qualified))
+    return parseQualifiedDirective(loc, ctx);
   return parseDirectiveImpl(loc, tok.getKind(), ctx);
 }
 
@@ -430,6 +434,38 @@ FailureOr<FormatElement *> FormatParser::parseCustomDirective(SMLoc loc,
   return create<CustomDirective>(nameTok->getSpelling(), std::move(arguments));
 }
 
+FailureOr<FormatElement *> FormatParser::parseRefDirective(SMLoc loc,
+                                                           Context context) {
+  if (context != CustomDirectiveContext)
+    return emitError(loc, "'ref' is only valid within a `custom` directive");
+
+  FailureOr<FormatElement *> arg;
+  if (failed(parseToken(FormatToken::l_paren,
+                        "expected '(' before argument list")) ||
+      failed(arg = parseElement(RefDirectiveContext)) ||
+      failed(
+          parseToken(FormatToken::r_paren, "expected ')' after argument list")))
+    return failure();
+
+  return create<RefDirective>(*arg);
+}
+
+FailureOr<FormatElement *> FormatParser::parseQualifiedDirective(SMLoc loc,
+                                                                 Context ctx) {
+  if (failed(parseToken(FormatToken::l_paren,
+                        "expected '(' before argument list")))
+    return failure();
+  FailureOr<FormatElement *> var = parseElement(ctx);
+  if (failed(var))
+    return var;
+  if (failed(markQualified(loc, *var)))
+    return failure();
+  if (failed(
+          parseToken(FormatToken::r_paren, "expected ')' after argument list")))
+    return failure();
+  return var;
+}
+
 //===----------------------------------------------------------------------===//
 // Utility Functions
 //===----------------------------------------------------------------------===//
diff --git a/mlir/tools/mlir-tblgen/FormatGen.h b/mlir/tools/mlir-tblgen/FormatGen.h
index 18a410277fc1..b061d4d8ea7f 100644
--- a/mlir/tools/mlir-tblgen/FormatGen.h
+++ b/mlir/tools/mlir-tblgen/FormatGen.h
@@ -495,9 +495,12 @@ protected:
   FailureOr<FormatElement *> parseDirective(Context ctx);
   /// Parse an optional group.
   FailureOr<FormatElement *> parseOptionalGroup(Context ctx);
-
   /// Parse a custom directive.
   FailureOr<FormatElement *> parseCustomDirective(llvm::SMLoc loc, Context ctx);
+  /// Parse a ref directive.
+  FailureOr<FormatElement *> parseRefDirective(SMLoc loc, Context context);
+  /// Parse a qualified directive.
+  FailureOr<FormatElement *> parseQualifiedDirective(SMLoc loc, Context ctx);
 
   /// Parse a format-specific variable kind.
   virtual FailureOr<FormatElement *>
@@ -522,6 +525,11 @@ protected:
                               ArrayRef<FormatElement *> elements,
                               FormatElement *anchor) = 0;
 
+  /// Mark 'element' as qualified. If 'element' cannot be qualified an error
+  /// should be emitted and failure returned.
+  virtual LogicalResult markQualified(llvm::SMLoc loc,
+                                      FormatElement *element) = 0;
+
   //===--------------------------------------------------------------------===//
   // Lexer Utilities
 
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index 806991035e66..f7cc0a292b8c 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -2547,6 +2547,8 @@ protected:
   LogicalResult verifyOptionalGroupElement(SMLoc loc, FormatElement *element,
                                            bool isAnchor);
 
+  LogicalResult markQualified(SMLoc loc, FormatElement *element) override;
+
   /// Parse an operation variable.
   FailureOr<FormatElement *> parseVariableImpl(SMLoc loc, StringRef name,
                                                Context ctx) override;
@@ -2622,10 +2624,6 @@ private:
   FailureOr<FormatElement *> parseOIListDirective(SMLoc loc, Context context);
   LogicalResult verifyOIListParsingElement(FormatElement *element, SMLoc loc);
   FailureOr<FormatElement *> parseOperandsDirective(SMLoc loc, Context context);
-  FailureOr<FormatElement *> parseQualifiedDirective(SMLoc loc,
-                                                     Context context);
-  FailureOr<FormatElement *> parseReferenceDirective(SMLoc loc,
-                                                     Context context);
   FailureOr<FormatElement *> parseRegionsDirective(SMLoc loc, Context context);
   FailureOr<FormatElement *> parseResultsDirective(SMLoc loc, Context context);
   FailureOr<FormatElement *> parseSuccessorsDirective(SMLoc loc,
@@ -3224,16 +3222,12 @@ OpFormatParser::parseDirectiveImpl(SMLoc loc, FormatToken::Kind kind,
     return parseFunctionalTypeDirective(loc, ctx);
   case FormatToken::kw_operands:
     return parseOperandsDirective(loc, ctx);
-  case FormatToken::kw_qualified:
-    return parseQualifiedDirective(loc, ctx);
   case FormatToken::kw_regions:
     return parseRegionsDirective(loc, ctx);
   case FormatToken::kw_results:
     return parseResultsDirective(loc, ctx);
   case FormatToken::kw_successors:
     return parseSuccessorsDirective(loc, ctx);
-  case FormatToken::kw_ref:
-    return parseReferenceDirective(loc, ctx);
   case FormatToken::kw_type:
     return parseTypeDirective(loc, ctx);
   case FormatToken::kw_oilist:
@@ -3339,22 +3333,6 @@ OpFormatParser::parseOperandsDirective(SMLoc loc, Context context) {
 }
 
 FailureOr<FormatElement *>
-OpFormatParser::parseReferenceDirective(SMLoc loc, Context context) {
-  if (context != CustomDirectiveContext)
-    return emitError(loc, "'ref' is only valid within a `custom` directive");
-
-  FailureOr<FormatElement *> arg;
-  if (failed(parseToken(FormatToken::l_paren,
-                        "expected '(' before argument list")) ||
-      failed(arg = parseElement(RefDirectiveContext)) ||
-      failed(
-          parseToken(FormatToken::r_paren, "expected ')' after argument list")))
-    return failure();
-
-  return create<RefDirective>(*arg);
-}
-
-FailureOr<FormatElement *>
 OpFormatParser::parseRegionsDirective(SMLoc loc, Context context) {
   if (context == TypeDirectiveContext)
     return emitError(loc, "'regions' is only valid as a top-level directive");
@@ -3495,19 +3473,11 @@ FailureOr<FormatElement *> OpFormatParser::parseTypeDirective(SMLoc loc,
   return create<TypeDirective>(*operand);
 }
 
-FailureOr<FormatElement *>
-OpFormatParser::parseQualifiedDirective(SMLoc loc, Context context) {
-  FailureOr<FormatElement *> element;
-  if (failed(parseToken(FormatToken::l_paren,
-                        "expected '(' before argument list")) ||
-      failed(element = parseElement(context)) ||
-      failed(
-          parseToken(FormatToken::r_paren, "expected ')' after argument list")))
-    return failure();
-  return TypeSwitch<FormatElement *, FailureOr<FormatElement *>>(*element)
+LogicalResult OpFormatParser::markQualified(SMLoc loc, FormatElement *element) {
+  return TypeSwitch<FormatElement *, LogicalResult>(element)
       .Case<AttributeVariable, TypeDirective>([](auto *element) {
         element->setShouldBeQualified();
-        return element;
+        return success();
       })
       .Default([&](auto *element) {
         return this->emitError(
diff --git a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp
index 814008c25451..052020acdcb7 100644
--- a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp
@@ -457,7 +457,7 @@ static void emitAttributeAccessors(const Operator &op, raw_ostream &os) {
     std::string sanitizedName = sanitizeName(namedAttr.name);
 
     // Unit attributes are handled specially.
-    if (namedAttr.attr.getStorageType().trim().equals("::mlir::UnitAttr")) {
+    if (namedAttr.attr.getStorageType().trim() == "::mlir::UnitAttr") {
       os << llvm::formatv(unitAttributeGetterTemplate, sanitizedName,
                           namedAttr.name);
       os << llvm::formatv(unitAttributeSetterTemplate, sanitizedName,
@@ -668,7 +668,7 @@ populateBuilderLinesAttr(const Operator &op,
       continue;
 
     // Unit attributes are handled specially.
-    if (attribute->attr.getStorageType().trim().equals("::mlir::UnitAttr")) {
+    if (attribute->attr.getStorageType().trim() == "::mlir::UnitAttr") {
       builderLines.push_back(llvm::formatv(initUnitAttributeTemplate,
                                            attribute->name, argNames[i]));
       continue;
diff --git a/mlir/unittests/Tools/lsp-server-support/CMakeLists.txt b/mlir/unittests/Tools/lsp-server-support/CMakeLists.txt
index 3aa8b9c4bc77..f777873ff7c6 100644
--- a/mlir/unittests/Tools/lsp-server-support/CMakeLists.txt
+++ b/mlir/unittests/Tools/lsp-server-support/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_mlir_unittest(MLIRLspServerSupportTests
+  Protocol.cpp
   Transport.cpp
 )
 target_link_libraries(MLIRLspServerSupportTests
diff --git a/mlir/unittests/Tools/lsp-server-support/Protocol.cpp b/mlir/unittests/Tools/lsp-server-support/Protocol.cpp
new file mode 100644
index 000000000000..04d7b2fbb440
--- /dev/null
+++ b/mlir/unittests/Tools/lsp-server-support/Protocol.cpp
@@ -0,0 +1,51 @@
+//===- Protocol.cpp - LSP JSON protocol unit tests ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Tools/lsp-server-support/Protocol.h"
+
+#include "gtest/gtest.h"
+
+using namespace mlir;
+using namespace mlir::lsp;
+using namespace testing;
+
+namespace {
+
+TEST(ProtocolTest, DiagnosticTagPresent) {
+  Diagnostic diagnostic;
+  diagnostic.tags.push_back(DiagnosticTag::Unnecessary);
+
+  llvm::json::Value json = toJSON(diagnostic);
+  const llvm::json::Object *o = json.getAsObject();
+  const llvm::json::Array *v = o->get("tags")->getAsArray();
+  EXPECT_EQ(*v, llvm::json::Array{1});
+
+  Diagnostic parsed;
+  llvm::json::Path::Root root = llvm::json::Path::Root();
+  bool success = fromJSON(json, parsed, llvm::json::Path(root));
+  EXPECT_TRUE(success);
+  ASSERT_EQ(parsed.tags.size(), (size_t)1);
+  EXPECT_EQ(parsed.tags.at(0), DiagnosticTag::Unnecessary);
+}
+
+TEST(ProtocolTest, DiagnosticTagNotPresent) {
+  Diagnostic diagnostic;
+
+  llvm::json::Value json = toJSON(diagnostic);
+  const llvm::json::Object *o = json.getAsObject();
+  const llvm::json::Value *v = o->get("tags");
+  EXPECT_EQ(v, nullptr);
+
+  Diagnostic parsed;
+  llvm::json::Path::Root root = llvm::json::Path::Root();
+  bool success = fromJSON(json, parsed, llvm::json::Path(root));
+  EXPECT_TRUE(success);
+  EXPECT_TRUE(parsed.tags.empty());
+}
+
+} // namespace
diff --git a/mlir/unittests/Tools/lsp-server-support/Transport.cpp b/mlir/unittests/Tools/lsp-server-support/Transport.cpp
index fee218405952..0303c1cba8bc 100644
--- a/mlir/unittests/Tools/lsp-server-support/Transport.cpp
+++ b/mlir/unittests/Tools/lsp-server-support/Transport.cpp
@@ -144,17 +144,17 @@ TEST_F(TransportInputTest, ResponseHandlerNotFound) {
 TEST_F(TransportInputTest, OutgoingRequest) {
   // Make some outgoing requests.
   int responseCallbackInvoked = 0;
-  auto callFn = getMessageHandler().outgoingRequest<CompletionList>(
-      "outgoing-request",
-      [&responseCallbackInvoked](llvm::json::Value id,
-                                 llvm::Expected<llvm::json::Value> value) {
-        // Make expectations on the expected response.
-        EXPECT_EQ(id, 83);
-        ASSERT_TRUE((bool)value);
-        EXPECT_EQ(debugString(*value), "{\"foo\":6}");
-        responseCallbackInvoked += 1;
-        llvm::outs() << "here!!!\n";
-      });
+  auto callFn =
+      getMessageHandler().outgoingRequest<CompletionList, CompletionContext>(
+          "outgoing-request",
+          [&responseCallbackInvoked](llvm::json::Value id,
+                                     llvm::Expected<CompletionContext> result) {
+            // Make expectations on the expected response.
+            EXPECT_EQ(id, 83);
+            ASSERT_TRUE((bool)result);
+            EXPECT_EQ(result->triggerKind, CompletionTriggerKind::Invoked);
+            responseCallbackInvoked += 1;
+          });
   callFn({}, 82);
   callFn({}, 83);
   callFn({}, 84);
@@ -164,9 +164,41 @@ TEST_F(TransportInputTest, OutgoingRequest) {
   // One of the requests receives a response. The message handler handles this
   // response by invoking the callback from above. Subsequent responses with the
   // same ID are ignored.
-  writeInput("{\"jsonrpc\":\"2.0\",\"id\":83,\"result\":{\"foo\":6}}\n"
+  writeInput(
+      "{\"jsonrpc\":\"2.0\",\"id\":83,\"result\":{\"triggerKind\":1}}\n"
+      "// -----\n"
+      "{\"jsonrpc\":\"2.0\",\"id\":83,\"result\":{\"triggerKind\":3}}\n");
+  runTransport();
+  EXPECT_EQ(responseCallbackInvoked, 1);
+}
+
+TEST_F(TransportInputTest, OutgoingRequestJSONParseFailure) {
+  // Make an outgoing request that expects a failure response.
+  bool responseCallbackInvoked = 0;
+  auto callFn = getMessageHandler().outgoingRequest<CompletionList, Position>(
+      "outgoing-request-json-parse-failure",
+      [&responseCallbackInvoked](llvm::json::Value id,
+                                 llvm::Expected<Position> result) {
+        llvm::Error err = result.takeError();
+        EXPECT_EQ(id, 109);
+        ASSERT_TRUE((bool)err);
+        EXPECT_THAT(debugString(err),
+                    HasSubstr("failed to decode "
+                              "reply:outgoing-request-json-parse-failure(109) "
+                              "response: missing value at (root).character"));
+        llvm::consumeError(std::move(err));
+        responseCallbackInvoked += 1;
+      });
+  callFn({}, 109);
+  EXPECT_EQ(responseCallbackInvoked, 0);
+
+  // The request receives multiple responses, but only the first one triggers
+  // the response callback. The first response has erroneous JSON that causes a
+  // parse failure.
+  writeInput("{\"jsonrpc\":\"2.0\",\"id\":109,\"result\":{\"line\":7}}\n"
              "// -----\n"
-             "{\"jsonrpc\":\"2.0\",\"id\":83,\"result\":{\"bar\":8}}\n");
+             "{\"jsonrpc\":\"2.0\",\"id\":109,\"result\":{\"line\":3,"
+             "\"character\":2}}\n");
   runTransport();
   EXPECT_EQ(responseCallbackInvoked, 1);
 }
diff --git a/mlir/utils/verify-canon/verify_canon.py b/mlir/utils/verify-canon/verify_canon.py
new file mode 100644
index 000000000000..bfddba9577b9
--- /dev/null
+++ b/mlir/utils/verify-canon/verify_canon.py
@@ -0,0 +1,77 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# This script is a helper to verify canonicalization patterns using Alive2
+# https://alive2.llvm.org/ce/.
+# It performs the following steps:
+# - Filters out the provided test functions.
+# - Runs the canonicalization pass on the remaining functions.
+# - Lowers both the original and the canonicalized functions to LLVM IR.
+# - Prints the canonicalized and the original functions side-by-side in a format
+#   that can be copied into Alive2 for verification.
+# Example: `python verify_canon.py canonicalize.mlir -f func1 func2 func3`
+
+import subprocess
+import tempfile
+import sys
+from pathlib import Path
+from argparse import ArgumentParser
+
+
+def filter_funcs(ir, funcs):
+    if not funcs:
+        return ir
+
+    funcs_str = ",".join(funcs)
+    return subprocess.check_output(
+        ["mlir-opt", f"--symbol-privatize=exclude={funcs_str}", "--symbol-dce"],
+        input=ir,
+    )
+
+
+def add_func_prefix(src, prefix):
+    return src.replace("@", "@" + prefix)
+
+
+def merge_ir(chunks):
+    files = []
+    for chunk in chunks:
+        tmp = tempfile.NamedTemporaryFile(suffix=".ll")
+        tmp.write(chunk)
+        tmp.flush()
+        files.append(tmp)
+
+    return subprocess.check_output(["llvm-link", "-S"] + [f.name for f in files])
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("file")
+    parser.add_argument("-f", "--func-names", nargs="+", default=[])
+    args = parser.parse_args()
+
+    file = args.file
+    funcs = args.func_names
+
+    orig_ir = Path(file).read_bytes()
+    orig_ir = filter_funcs(orig_ir, funcs)
+
+    to_llvm_args = ["--convert-to-llvm"]
+    orig_args = ["mlir-opt"] + to_llvm_args
+    canon_args = ["mlir-opt", "-canonicalize"] + to_llvm_args
+    translate_args = ["mlir-translate", "-mlir-to-llvmir"]
+
+    orig = subprocess.check_output(orig_args, input=orig_ir)
+    canonicalized = subprocess.check_output(canon_args, input=orig_ir)
+
+    orig = subprocess.check_output(translate_args, input=orig)
+    canonicalized = subprocess.check_output(translate_args, input=canonicalized)
+
+    enc = "utf-8"
+    orig = bytes(add_func_prefix(orig.decode(enc), "src_"), enc)
+    canonicalized = bytes(add_func_prefix(canonicalized.decode(enc), "tgt_"), enc)
+
+    res = merge_ir([orig, canonicalized])
+
+    print(res.decode(enc))
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 42e0f5740f11..3f77583ffa3b 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -267,11 +267,6 @@ if(OPENMP_STANDALONE_BUILD)
       ${LLVM_LIBRARY_DIRS}
     REQUIRED
   )
-# Check LIBOMP_HAVE_VERSION_SCRIPT_FLAG
-  include(LLVMCheckCompilerLinkerFlag)
-  if(NOT APPLE)
-    llvm_check_compiler_linker_flag(C "-Wl,--version-script=${CMAKE_CURRENT_LIST_DIR}/../openmp/runtime/src/exports_test_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
-  endif()
 
   macro(pythonize_bool var)
   if (${var})
@@ -282,6 +277,14 @@ if(OPENMP_STANDALONE_BUILD)
   endmacro()
 endif()
 
+if(OPENMP_STANDALONE_BUILD OR TARGET omp)
+  # Check LIBOMP_HAVE_VERSION_SCRIPT_FLAG
+  include(LLVMCheckCompilerLinkerFlag)
+  if(NOT APPLE)
+    llvm_check_compiler_linker_flag(C "-Wl,--version-script=${CMAKE_CURRENT_LIST_DIR}/../openmp/runtime/src/exports_test_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
+  endif()
+endif()
+
 # OMPT support for libomptarget
 # Follow host OMPT support and check if host support has been requested.
 # LIBOMP_HAVE_OMPT_SUPPORT indicates whether host OMPT support has been implemented.
diff --git a/offload/include/PluginManager.h b/offload/include/PluginManager.h
index eece7525e25e..1d6804da75d9 100644
--- a/offload/include/PluginManager.h
+++ b/offload/include/PluginManager.h
@@ -13,10 +13,11 @@
 #ifndef OMPTARGET_PLUGIN_MANAGER_H
 #define OMPTARGET_PLUGIN_MANAGER_H
 
+#include "PluginInterface.h"
+
 #include "DeviceImage.h"
 #include "ExclusiveAccess.h"
 #include "Shared/APITypes.h"
-#include "Shared/PluginAPI.h"
 #include "Shared/Requirements.h"
 
 #include "device.h"
@@ -34,38 +35,7 @@
 #include <mutex>
 #include <string>
 
-struct PluginManager;
-
-/// Plugin adaptors should be created via `PluginAdaptorTy::create` which will
-/// invoke the constructor and call `PluginAdaptorTy::init`. Eventual errors are
-/// reported back to the caller, otherwise a valid and initialized adaptor is
-/// returned.
-struct PluginAdaptorTy {
-  /// Try to create a plugin adaptor from a filename.
-  static llvm::Expected<std::unique_ptr<PluginAdaptorTy>>
-  create(const std::string &Name);
-
-  /// Name of the shared object file representing the plugin.
-  std::string Name;
-
-  /// Access to the shared object file representing the plugin.
-  std::unique_ptr<llvm::sys::DynamicLibrary> LibraryHandler;
-
-#define PLUGIN_API_HANDLE(NAME)                                                \
-  using NAME##_ty = decltype(__tgt_rtl_##NAME);                                \
-  NAME##_ty *NAME = nullptr;
-
-#include "Shared/PluginAPI.inc"
-#undef PLUGIN_API_HANDLE
-
-  /// Create a plugin adaptor for filename \p Name with a dynamic library \p DL.
-  PluginAdaptorTy(const std::string &Name,
-                  std::unique_ptr<llvm::sys::DynamicLibrary> DL);
-
-  /// Initialize the plugin adaptor, this can fail in which case the adaptor is
-  /// useless.
-  llvm::Error init();
-};
+using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy;
 
 /// Struct for the data required to handle plugins
 struct PluginManager {
@@ -80,6 +50,8 @@ struct PluginManager {
 
   void init();
 
+  void deinit();
+
   // Register a shared library with all (compatible) RTLs.
   void registerLib(__tgt_bin_desc *Desc);
 
@@ -92,10 +64,9 @@ struct PluginManager {
         std::make_unique<DeviceImageTy>(TgtBinDesc, TgtDeviceImage));
   }
 
-  /// Initialize as many devices as possible for this plugin adaptor. Devices
-  /// that fail to initialize are ignored. Returns the offset the devices were
-  /// registered at.
-  void initDevices(PluginAdaptorTy &RTL);
+  /// Initialize as many devices as possible for this plugin. Devices that fail
+  /// to initialize are ignored.
+  void initDevices(GenericPluginTy &RTL);
 
   /// Return the device presented to the user as device \p DeviceNo if it is
   /// initialized and ready. Otherwise return an error explaining the problem.
@@ -151,8 +122,8 @@ struct PluginManager {
   // Initialize all plugins.
   void initAllPlugins();
 
-  /// Iterator range for all plugin adaptors (in use or not, but always valid).
-  auto pluginAdaptors() { return llvm::make_pointee_range(PluginAdaptors); }
+  /// Iterator range for all plugins (in use or not, but always valid).
+  auto plugins() { return llvm::make_pointee_range(Plugins); }
 
   /// Return the user provided requirements.
   int64_t getRequirements() const { return Requirements.getRequirements(); }
@@ -164,14 +135,14 @@ private:
   bool RTLsLoaded = false;
   llvm::SmallVector<__tgt_bin_desc *> DelayedBinDesc;
 
-  // List of all plugin adaptors, in use or not.
-  llvm::SmallVector<std::unique_ptr<PluginAdaptorTy>> PluginAdaptors;
+  // List of all plugins, in use or not.
+  llvm::SmallVector<std::unique_ptr<GenericPluginTy>> Plugins;
 
-  // Mapping of plugin adaptors to offsets in the device table.
-  llvm::DenseMap<const PluginAdaptorTy *, int32_t> DeviceOffsets;
+  // Mapping of plugins to offsets in the device table.
+  llvm::DenseMap<const GenericPluginTy *, int32_t> DeviceOffsets;
 
-  // Mapping of plugin adaptors to the number of used devices.
-  llvm::DenseMap<const PluginAdaptorTy *, int32_t> DeviceUsed;
+  // Mapping of plugins to the number of used devices.
+  llvm::DenseMap<const GenericPluginTy *, int32_t> DeviceUsed;
 
   // Set of all device images currently in use.
   llvm::DenseSet<const __tgt_device_image *> UsedImages;
diff --git a/offload/include/device.h b/offload/include/device.h
index bd2829722bb3..fd6e5fba5fc5 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -33,17 +33,19 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 
+#include "PluginInterface.h"
+using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy;
+
 // Forward declarations.
-struct PluginAdaptorTy;
 struct __tgt_bin_desc;
 struct __tgt_target_table;
 
 struct DeviceTy {
   int32_t DeviceID;
-  PluginAdaptorTy *RTL;
+  GenericPluginTy *RTL;
   int32_t RTLDeviceID;
 
-  DeviceTy(PluginAdaptorTy *RTL, int32_t DeviceID, int32_t RTLDeviceID);
+  DeviceTy(GenericPluginTy *RTL, int32_t DeviceID, int32_t RTLDeviceID);
   // DeviceTy is not copyable
   DeviceTy(const DeviceTy &D) = delete;
   DeviceTy &operator=(const DeviceTy &D) = delete;
diff --git a/offload/plugins-nextgen/CMakeLists.txt b/offload/plugins-nextgen/CMakeLists.txt
index df625e97c7eb..d1079f8a3e9c 100644
--- a/offload/plugins-nextgen/CMakeLists.txt
+++ b/offload/plugins-nextgen/CMakeLists.txt
@@ -14,7 +14,7 @@
 set(common_dir ${CMAKE_CURRENT_SOURCE_DIR}/common)
 add_subdirectory(common)
 function(add_target_library target_name lib_name)
-  add_llvm_library(${target_name} SHARED
+  add_llvm_library(${target_name} STATIC
     LINK_COMPONENTS
       ${LLVM_TARGETS_TO_BUILD}
       AggressiveInstCombine
@@ -46,27 +46,14 @@ function(add_target_library target_name lib_name)
   )
 
   llvm_update_compile_flags(${target_name})
+  target_include_directories(${target_name} PUBLIC ${common_dir}/include)
   target_link_libraries(${target_name} PRIVATE
                         PluginCommon ${OPENMP_PTHREAD_LIB})
 
   target_compile_definitions(${target_name} PRIVATE TARGET_NAME=${lib_name})
   target_compile_definitions(${target_name} PRIVATE 
                              DEBUG_PREFIX="TARGET ${lib_name} RTL")
-
-  if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
-    # On FreeBSD, the 'environ' symbol is undefined at link time, but resolved by
-    # the dynamic linker at runtime. Therefore, allow the symbol to be undefined
-    # when creating a shared library.
-    target_link_libraries(${target_name} PRIVATE "-Wl,--allow-shlib-undefined")
-  else()
-    target_link_libraries(${target_name} PRIVATE "-Wl,-z,defs")
-  endif()
-
-  if(LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
-    target_link_libraries(${target_name} PRIVATE
-    "-Wl,--version-script=${common_dir}/../exports")
-  endif()
-  set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET protected)
+  set_target_properties(${target_name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endfunction()
 
 foreach(plugin IN LISTS LIBOMPTARGET_PLUGINS_TO_BUILD)
diff --git a/offload/plugins-nextgen/amdgpu/CMakeLists.txt b/offload/plugins-nextgen/amdgpu/CMakeLists.txt
index f5f7096137c2..738183f8945e 100644
--- a/offload/plugins-nextgen/amdgpu/CMakeLists.txt
+++ b/offload/plugins-nextgen/amdgpu/CMakeLists.txt
@@ -57,8 +57,3 @@ else()
   libomptarget_say("Not generating AMDGPU tests, no supported devices detected."
                    " Use 'LIBOMPTARGET_FORCE_AMDGPU_TESTS' to override.")
 endif()
-
-# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.amdgpu PROPERTIES
-  INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..")
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 00650b801b42..295685fceaa4 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3064,10 +3064,6 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     // HSA functions from now on, e.g., hsa_shut_down.
     Initialized = true;
 
-#ifdef OMPT_SUPPORT
-    ompt::connectLibrary();
-#endif
-
     // Register event handler to detect memory errors on the devices.
     Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
     if (auto Err = Plugin::check(
@@ -3155,6 +3151,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
 
   Triple::ArchType getTripleArch() const override { return Triple::amdgcn; }
 
+  const char *getName() const override { return GETNAME(TARGET_NAME); }
+
   /// Get the ELF code for recognizing the compatible image binary.
   uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; }
 
@@ -3387,8 +3385,6 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
   return Plugin::success();
 }
 
-GenericPluginTy *PluginTy::createPlugin() { return new AMDGPUPluginTy(); }
-
 template <typename... ArgsTy>
 static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
   hsa_status_t ResultCode = static_cast<hsa_status_t>(Code);
@@ -3476,3 +3472,9 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
 } // namespace target
 } // namespace omp
 } // namespace llvm
+
+extern "C" {
+llvm::omp::target::plugin::GenericPluginTy *createPlugin_amdgpu() {
+  return new llvm::omp::target::plugin::AMDGPUPluginTy();
+}
+}
diff --git a/offload/plugins-nextgen/common/CMakeLists.txt b/offload/plugins-nextgen/common/CMakeLists.txt
index acf0af63f050..a470dcee6d85 100644
--- a/offload/plugins-nextgen/common/CMakeLists.txt
+++ b/offload/plugins-nextgen/common/CMakeLists.txt
@@ -66,6 +66,4 @@ target_include_directories(PluginCommon PUBLIC
   ${LIBOMPTARGET_INCLUDE_DIR}
 )
 
-set_target_properties(PluginCommon PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-  CXX_VISIBILITY_PRESET protected)
+set_target_properties(PluginCommon PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 79e8464bfda5..e7a008f3a857 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1010,6 +1010,9 @@ struct GenericPluginTy {
   /// Get the target triple of this plugin.
   virtual Triple::ArchType getTripleArch() const = 0;
 
+  /// Get the constant name identifier for this plugin.
+  virtual const char *getName() const = 0;
+
   /// Allocate a structure using the internal allocator.
   template <typename Ty> Ty *allocate() {
     return reinterpret_cast<Ty *>(Allocator.Allocate(sizeof(Ty), alignof(Ty)));
@@ -1226,7 +1229,7 @@ namespace Plugin {
 /// Create a success error. This is the same as calling Error::success(), but
 /// it is recommended to use this one for consistency with Plugin::error() and
 /// Plugin::check().
-static Error success() { return Error::success(); }
+static inline Error success() { return Error::success(); }
 
 /// Create a string error.
 template <typename... ArgsTy>
@@ -1246,95 +1249,6 @@ template <typename... ArgsTy>
 static Error check(int32_t ErrorCode, const char *ErrFmt, ArgsTy... Args);
 } // namespace Plugin
 
-/// Class for simplifying the getter operation of the plugin. Anywhere on the
-/// code, the current plugin can be retrieved by Plugin::get(). The class also
-/// declares functions to create plugin-specific object instances. The check(),
-/// createPlugin(), createDevice() and createGlobalHandler() functions should be
-/// defined by each plugin implementation.
-class PluginTy {
-  // Reference to the plugin instance.
-  static GenericPluginTy *SpecificPlugin;
-
-  PluginTy() {
-    if (auto Err = init())
-      REPORT("Failed to initialize plugin: %s\n",
-             toString(std::move(Err)).data());
-  }
-
-  ~PluginTy() {
-    if (auto Err = deinit())
-      REPORT("Failed to deinitialize plugin: %s\n",
-             toString(std::move(Err)).data());
-  }
-
-  PluginTy(const PluginTy &) = delete;
-  void operator=(const PluginTy &) = delete;
-
-  /// Create and intialize the plugin instance.
-  static Error init() {
-    assert(!SpecificPlugin && "Plugin already created");
-
-    // Create the specific plugin.
-    SpecificPlugin = createPlugin();
-    assert(SpecificPlugin && "Plugin was not created");
-
-    // Initialize the plugin.
-    return SpecificPlugin->init();
-  }
-
-  // Deinitialize and destroy the plugin instance.
-  static Error deinit() {
-    assert(SpecificPlugin && "Plugin no longer valid");
-
-    for (int32_t DevNo = 0, NumDev = SpecificPlugin->getNumDevices();
-         DevNo < NumDev; ++DevNo)
-      if (auto Err = SpecificPlugin->deinitDevice(DevNo))
-        return Err;
-
-    // Deinitialize the plugin.
-    if (auto Err = SpecificPlugin->deinit())
-      return Err;
-
-    // Delete the plugin instance.
-    delete SpecificPlugin;
-
-    // Invalidate the plugin reference.
-    SpecificPlugin = nullptr;
-
-    return Plugin::success();
-  }
-
-public:
-  /// Initialize the plugin if needed. The plugin could have been initialized by
-  /// a previous call to Plugin::get().
-  static Error initIfNeeded() {
-    // Trigger the initialization if needed.
-    get();
-
-    return Error::success();
-  }
-
-  /// Get a reference (or create if it was not created) to the plugin instance.
-  static GenericPluginTy &get() {
-    // This static variable will initialize the underlying plugin instance in
-    // case there was no previous explicit initialization. The initialization is
-    // thread safe.
-    static PluginTy Plugin;
-
-    assert(SpecificPlugin && "Plugin is not active");
-    return *SpecificPlugin;
-  }
-
-  /// Get a reference to the plugin with a specific plugin-specific type.
-  template <typename Ty> static Ty &get() { return static_cast<Ty &>(get()); }
-
-  /// Indicate whether the plugin is active.
-  static bool isActive() { return SpecificPlugin != nullptr; }
-
-  /// Create a plugin instance.
-  static GenericPluginTy *createPlugin();
-};
-
 /// Auxiliary interface class for GenericDeviceResourceManagerTy. This class
 /// acts as a reference to a device resource, such as a stream, and requires
 /// some basic functions to be implemented. The derived class should define an
diff --git a/offload/plugins-nextgen/common/include/Utils/ELF.h b/offload/plugins-nextgen/common/include/Utils/ELF.h
index 88c83d39b68c..dcfdb5bd7b03 100644
--- a/offload/plugins-nextgen/common/include/Utils/ELF.h
+++ b/offload/plugins-nextgen/common/include/Utils/ELF.h
@@ -13,8 +13,6 @@
 #ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_ELF_UTILS_H
 #define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_ELF_UTILS_H
 
-#include "Shared/PluginAPI.h"
-
 #include "llvm/Object/ELF.h"
 #include "llvm/Object/ELFObjectFile.h"
 
@@ -24,6 +22,9 @@ namespace elf {
 /// Returns true or false if the \p Buffer is an ELF file.
 bool isELF(llvm::StringRef Buffer);
 
+/// Returns the ELF e_machine value of the current compilation target.
+uint16_t getTargetMachine();
+
 /// Checks if the given \p Object is a valid ELF matching the e_machine value.
 llvm::Expected<bool> checkMachine(llvm::StringRef Object, uint16_t EMachine);
 
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp
index 9eb610cab4de..9d58e6060646 100644
--- a/offload/plugins-nextgen/common/src/JIT.cpp
+++ b/offload/plugins-nextgen/common/src/JIT.cpp
@@ -56,28 +56,6 @@ bool isImageBitcode(const __tgt_device_image &Image) {
   return identify_magic(Binary) == file_magic::bitcode;
 }
 
-std::once_flag InitFlag;
-
-void init(Triple TT) {
-  codegen::RegisterCodeGenFlags();
-#ifdef LIBOMPTARGET_JIT_NVPTX
-  if (TT.isNVPTX()) {
-    LLVMInitializeNVPTXTargetInfo();
-    LLVMInitializeNVPTXTarget();
-    LLVMInitializeNVPTXTargetMC();
-    LLVMInitializeNVPTXAsmPrinter();
-  }
-#endif
-#ifdef LIBOMPTARGET_JIT_AMDGPU
-  if (TT.isAMDGPU()) {
-    LLVMInitializeAMDGPUTargetInfo();
-    LLVMInitializeAMDGPUTarget();
-    LLVMInitializeAMDGPUTargetMC();
-    LLVMInitializeAMDGPUAsmPrinter();
-  }
-#endif
-}
-
 Expected<std::unique_ptr<Module>>
 createModuleFromMemoryBuffer(std::unique_ptr<MemoryBuffer> &MB,
                              LLVMContext &Context) {
@@ -148,7 +126,23 @@ createTargetMachine(Module &M, std::string CPU, unsigned OptLevel) {
 } // namespace
 
 JITEngine::JITEngine(Triple::ArchType TA) : TT(Triple::getArchTypeName(TA)) {
-  std::call_once(InitFlag, init, TT);
+  codegen::RegisterCodeGenFlags();
+#ifdef LIBOMPTARGET_JIT_NVPTX
+  if (TT.isNVPTX()) {
+    LLVMInitializeNVPTXTargetInfo();
+    LLVMInitializeNVPTXTarget();
+    LLVMInitializeNVPTXTargetMC();
+    LLVMInitializeNVPTXAsmPrinter();
+  }
+#endif
+#ifdef LIBOMPTARGET_JIT_AMDGPU
+  if (TT.isAMDGPU()) {
+    LLVMInitializeAMDGPUTargetInfo();
+    LLVMInitializeAMDGPUTarget();
+    LLVMInitializeAMDGPUTargetMC();
+    LLVMInitializeAMDGPUAsmPrinter();
+  }
+#endif
 }
 
 void JITEngine::opt(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M,
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index b5f3c45c835f..fae197527850 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -13,7 +13,6 @@
 #include "Shared/APITypes.h"
 #include "Shared/Debug.h"
 #include "Shared/Environment.h"
-#include "Shared/PluginAPI.h"
 
 #include "GlobalHandler.h"
 #include "JIT.h"
@@ -39,8 +38,6 @@ using namespace omp;
 using namespace target;
 using namespace plugin;
 
-GenericPluginTy *PluginTy::SpecificPlugin = nullptr;
-
 // TODO: Fix any thread safety issues for multi-threaded kernel recording.
 struct RecordReplayTy {
 
@@ -1348,13 +1345,27 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
     return Plugin::success();
 
   int Res;
-  if (MemoryManager)
-    Res = MemoryManager->free(TgtPtr);
-  else
+  switch (Kind) {
+  case TARGET_ALLOC_DEFAULT:
+  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
+  case TARGET_ALLOC_DEVICE:
+    if (MemoryManager) {
+      Res = MemoryManager->free(TgtPtr);
+      if (Res)
+        return Plugin::error(
+            "Failure to deallocate device pointer %p via memory manager",
+            TgtPtr);
+      break;
+    }
+    [[fallthrough]];
+  case TARGET_ALLOC_HOST:
+  case TARGET_ALLOC_SHARED:
     Res = free(TgtPtr, Kind);
-
-  if (Res)
-    return Plugin::error("Failure to deallocate device pointer %p", TgtPtr);
+    if (Res)
+      return Plugin::error(
+          "Failure to deallocate device pointer %p via device deallocator",
+          TgtPtr);
+  }
 
   // Unregister deallocated pinned memory buffer if the type is host memory.
   if (Kind == TARGET_ALLOC_HOST)
@@ -2021,205 +2032,3 @@ bool llvm::omp::target::plugin::libomptargetSupportsRPC() {
   return false;
 #endif
 }
-
-/// Exposed library API function, basically wrappers around the GenericDeviceTy
-/// functionality with the same name. All non-async functions are redirected
-/// to the async versions right away with a NULL AsyncInfoPtr.
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int32_t __tgt_rtl_init_plugin() {
-  auto Err = PluginTy::initIfNeeded();
-  if (Err) {
-    [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
-    DP("Failed to init plugin: %s", ErrStr.c_str());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
-  if (!PluginTy::isActive())
-    return false;
-
-  return PluginTy::get().is_valid_binary(Image);
-}
-
-int32_t __tgt_rtl_init_device(int32_t DeviceId) {
-  return PluginTy::get().init_device(DeviceId);
-}
-
-int32_t __tgt_rtl_number_of_devices() {
-  return PluginTy::get().number_of_devices();
-}
-
-int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
-  return PluginTy::get().init_requires(RequiresFlags);
-}
-
-int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,
-                                      int32_t DstDeviceId) {
-  return PluginTy::get().is_data_exchangable(SrcDeviceId, DstDeviceId);
-}
-
-int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
-                                           void *VAddr, bool isRecord,
-                                           bool SaveOutput,
-                                           uint64_t &ReqPtrArgOffset) {
-  return PluginTy::get().initialize_record_replay(
-      DeviceId, MemorySize, VAddr, isRecord, SaveOutput, ReqPtrArgOffset);
-}
-
-int32_t __tgt_rtl_load_binary(int32_t DeviceId, __tgt_device_image *TgtImage,
-                              __tgt_device_binary *Binary) {
-  return PluginTy::get().load_binary(DeviceId, TgtImage, Binary);
-}
-
-void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
-                           int32_t Kind) {
-  return PluginTy::get().data_alloc(DeviceId, Size, HostPtr, Kind);
-}
-
-int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) {
-  return PluginTy::get().data_delete(DeviceId, TgtPtr, Kind);
-}
-
-int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
-                            void **LockedPtr) {
-  return PluginTy::get().data_lock(DeviceId, Ptr, Size, LockedPtr);
-}
-
-int32_t __tgt_rtl_data_unlock(int32_t DeviceId, void *Ptr) {
-  return PluginTy::get().data_unlock(DeviceId, Ptr);
-}
-
-int32_t __tgt_rtl_data_notify_mapped(int32_t DeviceId, void *HstPtr,
-                                     int64_t Size) {
-  return PluginTy::get().data_notify_mapped(DeviceId, HstPtr, Size);
-}
-
-int32_t __tgt_rtl_data_notify_unmapped(int32_t DeviceId, void *HstPtr) {
-  return PluginTy::get().data_notify_unmapped(DeviceId, HstPtr);
-}
-
-int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
-                              int64_t Size) {
-  return PluginTy::get().data_submit(DeviceId, TgtPtr, HstPtr, Size);
-}
-
-int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr,
-                                    void *HstPtr, int64_t Size,
-                                    __tgt_async_info *AsyncInfoPtr) {
-  return PluginTy::get().data_submit_async(DeviceId, TgtPtr, HstPtr, Size,
-                                           AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
-                                int64_t Size) {
-  return PluginTy::get().data_retrieve(DeviceId, HstPtr, TgtPtr, Size);
-}
-
-int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
-                                      void *TgtPtr, int64_t Size,
-                                      __tgt_async_info *AsyncInfoPtr) {
-  return PluginTy::get().data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size,
-                                             AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_data_exchange(int32_t SrcDeviceId, void *SrcPtr,
-                                int32_t DstDeviceId, void *DstPtr,
-                                int64_t Size) {
-  return PluginTy::get().data_exchange(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr,
-                                       Size);
-}
-
-int32_t __tgt_rtl_data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
-                                      int DstDeviceId, void *DstPtr,
-                                      int64_t Size,
-                                      __tgt_async_info *AsyncInfo) {
-  return PluginTy::get().data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId,
-                                             DstPtr, Size, AsyncInfo);
-}
-
-int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
-                                void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                KernelArgsTy *KernelArgs,
-                                __tgt_async_info *AsyncInfoPtr) {
-  return PluginTy::get().launch_kernel(DeviceId, TgtEntryPtr, TgtArgs,
-                                       TgtOffsets, KernelArgs, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_synchronize(int32_t DeviceId,
-                              __tgt_async_info *AsyncInfoPtr) {
-  return PluginTy::get().synchronize(DeviceId, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_query_async(int32_t DeviceId,
-                              __tgt_async_info *AsyncInfoPtr) {
-  return PluginTy::get().query_async(DeviceId, AsyncInfoPtr);
-}
-
-void __tgt_rtl_print_device_info(int32_t DeviceId) {
-  PluginTy::get().print_device_info(DeviceId);
-}
-
-int32_t __tgt_rtl_create_event(int32_t DeviceId, void **EventPtr) {
-  return PluginTy::get().create_event(DeviceId, EventPtr);
-}
-
-int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
-                               __tgt_async_info *AsyncInfoPtr) {
-  return PluginTy::get().record_event(DeviceId, EventPtr, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
-                             __tgt_async_info *AsyncInfoPtr) {
-  return PluginTy::get().wait_event(DeviceId, EventPtr, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
-  return PluginTy::get().sync_event(DeviceId, EventPtr);
-}
-
-int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
-  return PluginTy::get().destroy_event(DeviceId, EventPtr);
-}
-
-void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
-  return PluginTy::get().set_info_flag(NewInfoLevel);
-}
-
-int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
-                                  __tgt_async_info **AsyncInfoPtr) {
-  return PluginTy::get().init_async_info(DeviceId, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
-                                   __tgt_device_info *DeviceInfo,
-                                   const char **ErrStr) {
-  return PluginTy::get().init_device_info(DeviceId, DeviceInfo, ErrStr);
-}
-
-int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset) {
-  return PluginTy::get().set_device_offset(DeviceIdOffset);
-}
-
-int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId) {
-  return PluginTy::get().use_auto_zero_copy(DeviceId);
-}
-
-int32_t __tgt_rtl_get_global(__tgt_device_binary Binary, uint64_t Size,
-                             const char *Name, void **DevicePtr) {
-  return PluginTy::get().get_global(Binary, Size, Name, DevicePtr);
-}
-
-int32_t __tgt_rtl_get_function(__tgt_device_binary Binary, const char *Name,
-                               void **KernelPtr) {
-  return PluginTy::get().get_function(Binary, Name, KernelPtr);
-}
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/offload/plugins-nextgen/common/src/Utils/ELF.cpp b/offload/plugins-nextgen/common/src/Utils/ELF.cpp
index 2ae97f0f2589..90d6950b83e5 100644
--- a/offload/plugins-nextgen/common/src/Utils/ELF.cpp
+++ b/offload/plugins-nextgen/common/src/Utils/ELF.cpp
@@ -36,6 +36,21 @@ bool utils::elf::isELF(StringRef Buffer) {
   }
 }
 
+uint16_t utils::elf::getTargetMachine() {
+#if defined(__x86_64__)
+  return EM_X86_64;
+#elif defined(__s390x__)
+  return EM_S390;
+#elif defined(__aarch64__)
+  return EM_AARCH64;
+#elif defined(__powerpc64__)
+  return EM_PPC64;
+#else
+#warning "Unknown ELF compilation target architecture"
+  return EM_NONE;
+#endif
+}
+
 template <class ELFT>
 static Expected<bool>
 checkMachineImpl(const object::ELFObjectFile<ELFT> &ELFObj, uint16_t EMachine) {
diff --git a/offload/plugins-nextgen/cuda/CMakeLists.txt b/offload/plugins-nextgen/cuda/CMakeLists.txt
index 0284bd22d2a4..dd684bb22343 100644
--- a/offload/plugins-nextgen/cuda/CMakeLists.txt
+++ b/offload/plugins-nextgen/cuda/CMakeLists.txt
@@ -51,8 +51,3 @@ else()
   libomptarget_say("Not generating NVIDIA tests, no supported devices detected."
                    " Use 'LIBOMPTARGET_FORCE_NVIDIA_TESTS' to override.")
 endif()
-
-# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.cuda PROPERTIES
-  INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..")
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index fc74c6aa23fd..b260334baa18 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1342,10 +1342,6 @@ struct CUDAPluginTy final : public GenericPluginTy {
       return 0;
     }
 
-#ifdef OMPT_SUPPORT
-    ompt::connectLibrary();
-#endif
-
     if (Res == CUDA_ERROR_NO_DEVICE) {
       // Do not initialize if there are no devices.
       DP("There are no devices supporting CUDA.\n");
@@ -1390,6 +1386,8 @@ struct CUDAPluginTy final : public GenericPluginTy {
     return Triple::nvptx64;
   }
 
+  const char *getName() const override { return GETNAME(TARGET_NAME); }
+
   /// Check whether the image is compatible with the available CUDA devices.
   Expected<bool> isELFCompatible(StringRef Image) const override {
     auto ElfOrErr =
@@ -1495,8 +1493,6 @@ Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
   return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s");
 }
 
-GenericPluginTy *PluginTy::createPlugin() { return new CUDAPluginTy(); }
-
 template <typename... ArgsTy>
 static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
   CUresult ResultCode = static_cast<CUresult>(Code);
@@ -1516,3 +1512,9 @@ static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
 } // namespace target
 } // namespace omp
 } // namespace llvm
+
+extern "C" {
+llvm::omp::target::plugin::GenericPluginTy *createPlugin_cuda() {
+  return new llvm::omp::target::plugin::CUDAPluginTy();
+}
+}
diff --git a/offload/plugins-nextgen/host/CMakeLists.txt b/offload/plugins-nextgen/host/CMakeLists.txt
index 6407f72e8db0..72b5681283fe 100644
--- a/offload/plugins-nextgen/host/CMakeLists.txt
+++ b/offload/plugins-nextgen/host/CMakeLists.txt
@@ -31,14 +31,6 @@ else()
   target_include_directories(omptarget.rtl.host PRIVATE dynamic_ffi)
 endif()
 
-# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.host
-        LIBRARY DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.host PROPERTIES
-  INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
-  POSITION_INDEPENDENT_CODE ON
-  CXX_VISIBILITY_PRESET protected)
-
 target_include_directories(omptarget.rtl.host PRIVATE
                            ${LIBOMPTARGET_INCLUDE_DIR})
 
@@ -52,37 +44,22 @@ endif()
 
 # Define the target specific triples and ELF machine values.
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le$")
-  target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_PPC64)
-  target_compile_definitions(omptarget.rtl.host PRIVATE
-      LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="powerpc64le-ibm-linux-gnu")
   list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
        "powerpc64le-ibm-linux-gnu" "powerpc64le-ibm-linux-gnu-LTO")
   set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64$")
-  target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_PPC64)
-  target_compile_definitions(omptarget.rtl.host PRIVATE
-      LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="powerpc64-ibm-linux-gnu")
   list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
        "powerpc64-ibm-linux-gnu" "powerpc64-ibm-linux-gnu-LTO")
   set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64$")
-  target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_X86_64)
-  target_compile_definitions(omptarget.rtl.host PRIVATE
-      LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="x86_64-pc-linux-gnu")
   list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
        "x86_64-pc-linux-gnu" "x86_64-pc-linux-gnu-LTO")
   set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64$")
-  target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_AARCH64)
-  target_compile_definitions(omptarget.rtl.host PRIVATE
-      LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="aarch64-unknown-linux-gnu")
   list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
        "aarch64-unknown-linux-gnu" "aarch64-unknown-linux-gnu-LTO")
   set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x$")
-  target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_S390)
-  target_compile_definitions(omptarget.rtl.host PRIVATE
-      LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="s390x-ibm-linux-gnu")
   list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
        "s390x-ibm-linux-gnu" "s390x-ibm-linux-gnu-LTO")
   set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index f0ce24249301..ef84cbaf5458 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -18,6 +18,7 @@
 
 #include "Shared/Debug.h"
 #include "Shared/Environment.h"
+#include "Utils/ELF.h"
 
 #include "GlobalHandler.h"
 #include "OpenMP/OMPT/Callback.h"
@@ -30,19 +31,20 @@
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
 #include "llvm/Support/DynamicLibrary.h"
 
-// The number of devices in this plugin.
-#define NUM_DEVICES 4
-
-// The ELF ID should be defined at compile-time by the build system.
-#ifndef TARGET_ELF_ID
-#define TARGET_ELF_ID EM_NONE
+#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||           \
+    !defined(__ORDER_BIG_ENDIAN__)
+#error "Missing preprocessor definitions for endianness detection."
 #endif
 
-// The target triple should be defined at compile-time by the build system.
-#ifndef LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE
-#define LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE ""
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LITTLEENDIAN_CPU
+#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define BIGENDIAN_CPU
 #endif
 
+// The number of devices in this plugin.
+#define NUM_DEVICES 4
+
 namespace llvm {
 namespace omp {
 namespace target {
@@ -383,10 +385,6 @@ struct GenELF64PluginTy final : public GenericPluginTy {
 
   /// Initialize the plugin and return the number of devices.
   Expected<int32_t> initImpl() override {
-#ifdef OMPT_SUPPORT
-    ompt::connectLibrary();
-#endif
-
 #ifdef USES_DYNAMIC_FFI
     if (auto Err = Plugin::check(ffi_init(), "Failed to initialize libffi"))
       return std::move(Err);
@@ -410,7 +408,9 @@ struct GenELF64PluginTy final : public GenericPluginTy {
   }
 
   /// Get the ELF code to recognize the compatible binary images.
-  uint16_t getMagicElfBits() const override { return ELF::TARGET_ELF_ID; }
+  uint16_t getMagicElfBits() const override {
+    return utils::elf::getTargetMachine();
+  }
 
   /// This plugin does not support exchanging data between two devices.
   bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
@@ -421,11 +421,29 @@ struct GenELF64PluginTy final : public GenericPluginTy {
   Expected<bool> isELFCompatible(StringRef) const override { return true; }
 
   Triple::ArchType getTripleArch() const override {
-    return llvm::Triple(LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE).getArch();
+#if defined(__x86_64__)
+    return llvm::Triple::x86_64;
+#elif defined(__s390x__)
+    return llvm::Triple::systemz;
+#elif defined(__aarch64__)
+#ifdef LITTLEENDIAN_CPU
+    return llvm::Triple::aarch64;
+#else
+    return llvm::Triple::aarch64_be;
+#endif
+#elif defined(__powerpc64__)
+#ifdef LITTLEENDIAN_CPU
+    return llvm::Triple::ppc64le;
+#else
+    return llvm::Triple::ppc64;
+#endif
+#else
+    return llvm::Triple::UnknownArch;
+#endif
   }
-};
 
-GenericPluginTy *PluginTy::createPlugin() { return new GenELF64PluginTy(); }
+  const char *getName() const override { return GETNAME(TARGET_NAME); }
+};
 
 template <typename... ArgsTy>
 static Error Plugin::check(int32_t Code, const char *ErrMsg, ArgsTy... Args) {
@@ -440,3 +458,9 @@ static Error Plugin::check(int32_t Code, const char *ErrMsg, ArgsTy... Args) {
 } // namespace target
 } // namespace omp
 } // namespace llvm
+
+extern "C" {
+llvm::omp::target::plugin::GenericPluginTy *createPlugin_host() {
+  return new llvm::omp::target::plugin::GenELF64PluginTy();
+}
+}
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index eda5a85ff1ab..b474f29ea0be 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -65,14 +65,13 @@ target_compile_definitions(omptarget PRIVATE
   DEBUG_PREFIX="omptarget"
 )
 
+foreach(plugin IN LISTS LIBOMPTARGET_PLUGINS_TO_BUILD)
+  target_link_libraries(omptarget PRIVATE omptarget.rtl.${plugin})
+endforeach()
+
 target_compile_options(omptarget PUBLIC ${offload_compile_flags})
 target_link_options(omptarget PUBLIC ${offload_link_flags})
 
-list(TRANSFORM LIBOMPTARGET_PLUGINS_TO_LOAD PREPEND "\"libomptarget.rtl.")
-list(TRANSFORM LIBOMPTARGET_PLUGINS_TO_LOAD APPEND "\"")
-list(JOIN LIBOMPTARGET_PLUGINS_TO_LOAD "," ENABLED_OFFLOAD_PLUGINS)
-target_compile_definitions(omptarget PRIVATE ENABLED_OFFLOAD_PLUGINS=${ENABLED_OFFLOAD_PLUGINS})
-
 # libomptarget.so needs to be aware of where the plugins live as they
 # are now separated in the build directory.
 set_target_properties(omptarget PROPERTIES
diff --git a/offload/src/OffloadRTL.cpp b/offload/src/OffloadRTL.cpp
index dd75b1b18150..29b573a27d08 100644
--- a/offload/src/OffloadRTL.cpp
+++ b/offload/src/OffloadRTL.cpp
@@ -50,6 +50,7 @@ void deinitRuntime() {
 
   if (RefCount == 1) {
     DP("Deinit offload library!\n");
+    PM->deinit();
     delete PM;
     PM = nullptr;
   }
diff --git a/offload/src/OpenMP/InteropAPI.cpp b/offload/src/OpenMP/InteropAPI.cpp
index 1a995cde7816..bdbc440c64a2 100644
--- a/offload/src/OpenMP/InteropAPI.cpp
+++ b/offload/src/OpenMP/InteropAPI.cpp
@@ -230,14 +230,14 @@ void __tgt_interop_init(ident_t *LocRef, int32_t Gtid,
   }
 
   DeviceTy &Device = *DeviceOrErr;
-  if (!Device.RTL || !Device.RTL->init_device_info ||
+  if (!Device.RTL ||
       Device.RTL->init_device_info(DeviceId, &(InteropPtr)->device_info,
                                    &(InteropPtr)->err_str)) {
     delete InteropPtr;
     InteropPtr = omp_interop_none;
   }
   if (InteropType == kmp_interop_type_tasksync) {
-    if (!Device.RTL || !Device.RTL->init_async_info ||
+    if (!Device.RTL ||
         Device.RTL->init_async_info(DeviceId, &(InteropPtr)->async_info)) {
       delete InteropPtr;
       InteropPtr = omp_interop_none;
diff --git a/offload/src/PluginManager.cpp b/offload/src/PluginManager.cpp
index dbb556c179e5..191afa345641 100644
--- a/offload/src/PluginManager.cpp
+++ b/offload/src/PluginManager.cpp
@@ -23,85 +23,25 @@ using namespace llvm::sys;
 
 PluginManager *PM = nullptr;
 
-Expected<std::unique_ptr<PluginAdaptorTy>>
-PluginAdaptorTy::create(const std::string &Name) {
-  DP("Attempting to load library '%s'...\n", Name.c_str());
-  TIMESCOPE_WITH_NAME_AND_IDENT(Name, (const ident_t *)nullptr);
-
-  std::string ErrMsg;
-  auto LibraryHandler = std::make_unique<DynamicLibrary>(
-      DynamicLibrary::getPermanentLibrary(Name.c_str(), &ErrMsg));
-
-  if (!LibraryHandler->isValid()) {
-    // Library does not exist or cannot be found.
-    return createStringError(inconvertibleErrorCode(),
-                             "Unable to load library '%s': %s!\n", Name.c_str(),
-                             ErrMsg.c_str());
-  }
-
-  DP("Successfully loaded library '%s'!\n", Name.c_str());
-  auto PluginAdaptor = std::unique_ptr<PluginAdaptorTy>(
-      new PluginAdaptorTy(Name, std::move(LibraryHandler)));
-  if (auto Err = PluginAdaptor->init())
-    return Err;
-  return std::move(PluginAdaptor);
-}
-
-PluginAdaptorTy::PluginAdaptorTy(const std::string &Name,
-                                 std::unique_ptr<llvm::sys::DynamicLibrary> DL)
-    : Name(Name), LibraryHandler(std::move(DL)) {}
-
-Error PluginAdaptorTy::init() {
-
-#define PLUGIN_API_HANDLE(NAME)                                                \
-  NAME = reinterpret_cast<decltype(NAME)>(                                     \
-      LibraryHandler->getAddressOfSymbol(GETNAME(__tgt_rtl_##NAME)));          \
-  if (!NAME) {                                                                 \
-    return createStringError(inconvertibleErrorCode(),                         \
-                             "Invalid plugin as necessary interface function " \
-                             "(%s) was not found.\n",                          \
-                             std::string(#NAME).c_str());                      \
-  }
-
-#include "Shared/PluginAPI.inc"
-#undef PLUGIN_API_HANDLE
-
-  // Remove plugin on failure to call optional init_plugin
-  int32_t Rc = init_plugin();
-  if (Rc != OFFLOAD_SUCCESS) {
-    return createStringError(inconvertibleErrorCode(),
-                             "Unable to initialize library '%s': %u!\n",
-                             Name.c_str(), Rc);
-  }
-
-  // No devices are supported by this RTL?
-  int32_t NumberOfPluginDevices = number_of_devices();
-  if (!NumberOfPluginDevices) {
-    return createStringError(inconvertibleErrorCode(),
-                             "No devices supported in this RTL\n");
-  }
-
-  DP("Registered '%s' with %d plugin visible devices!\n", Name.c_str(),
-     NumberOfPluginDevices);
-  return Error::success();
-}
+// Every plugin exports this method to create an instance of the plugin type.
+#define PLUGIN_TARGET(Name) extern "C" GenericPluginTy *createPlugin_##Name();
+#include "Shared/Targets.def"
 
 void PluginManager::init() {
   TIMESCOPE();
   DP("Loading RTLs...\n");
 
-  // Attempt to open all the plugins and, if they exist, check if the interface
-  // is correct and if they are supporting any devices.
+  // Attempt to create an instance of each supported plugin.
 #define PLUGIN_TARGET(Name)                                                    \
   do {                                                                         \
-    auto PluginAdaptorOrErr =                                                  \
-        PluginAdaptorTy::create("libomptarget.rtl." #Name ".so");              \
-    if (!PluginAdaptorOrErr) {                                                 \
-      [[maybe_unused]] std::string InfoMsg =                                   \
-          toString(PluginAdaptorOrErr.takeError());                            \
-      DP("%s", InfoMsg.c_str());                                               \
+    auto Plugin = std::unique_ptr<GenericPluginTy>(createPlugin_##Name());     \
+    if (auto Err = Plugin->init()) {                                           \
+      [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));         \
+      DP("Failed to init plugin: %s\n", InfoMsg.c_str());                      \
     } else {                                                                   \
-      PluginAdaptors.push_back(std::move(*PluginAdaptorOrErr));                \
+      DP("Registered plugin %s with %d visible device(s)\n",                   \
+         Plugin->getName(), Plugin->number_of_devices());                      \
+      Plugins.emplace_back(std::move(Plugin));                                 \
     }                                                                          \
   } while (false);
 #include "Shared/Targets.def"
@@ -109,15 +49,29 @@ void PluginManager::init() {
   DP("RTLs loaded!\n");
 }
 
-void PluginManager::initDevices(PluginAdaptorTy &RTL) {
+void PluginManager::deinit() {
+  TIMESCOPE();
+  DP("Unloading RTLs...\n");
+
+  for (auto &Plugin : Plugins) {
+    if (auto Err = Plugin->deinit()) {
+      [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
+      DP("Failed to deinit plugin: %s\n", InfoMsg.c_str());
+    }
+    Plugin.release();
+  }
+
+  DP("RTLs unloaded!\n");
+}
+
+void PluginManager::initDevices(GenericPluginTy &RTL) {
   // If this RTL has already been initialized.
   if (PM->DeviceOffsets.contains(&RTL))
     return;
   TIMESCOPE();
 
   // If this RTL is not already in use, initialize it.
-  assert(RTL.number_of_devices() > 0 &&
-         "Tried to initialize useless plugin adaptor");
+  assert(RTL.number_of_devices() > 0 && "Tried to initialize useless plugin!");
 
   // Initialize the device information for the RTL we are about to use.
   auto ExclusiveDevicesAccessor = getExclusiveDevicesAccessor();
@@ -157,13 +111,12 @@ void PluginManager::initDevices(PluginAdaptorTy &RTL) {
 
   DeviceOffsets[&RTL] = DeviceOffset;
   DeviceUsed[&RTL] = NumberOfUserDevices;
-  DP("Plugin adaptor " DPxMOD " has index %d, exposes %d out of %d devices!\n",
-     DPxPTR(RTL.LibraryHandler.get()), DeviceOffset, NumberOfUserDevices,
-     RTL.number_of_devices());
+  DP("Plugin has index %d, exposes %d out of %d devices!\n", DeviceOffset,
+     NumberOfUserDevices, RTL.number_of_devices());
 }
 
 void PluginManager::initAllPlugins() {
-  for (auto &R : PluginAdaptors)
+  for (auto &R : Plugins)
     initDevices(*R);
 }
 
@@ -216,19 +169,22 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
     // Obtain the image and information that was previously extracted.
     __tgt_device_image *Img = &DI.getExecutableImage();
 
-    PluginAdaptorTy *FoundRTL = nullptr;
+    GenericPluginTy *FoundRTL = nullptr;
 
     // Scan the RTLs that have associated images until we find one that supports
     // the current image.
-    for (auto &R : PM->pluginAdaptors()) {
+    for (auto &R : PM->plugins()) {
+      if (!R.number_of_devices())
+        continue;
+
       if (!R.is_valid_binary(Img)) {
         DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
-           DPxPTR(Img->ImageStart), R.Name.c_str());
+           DPxPTR(Img->ImageStart), R.getName());
         continue;
       }
 
       DP("Image " DPxMOD " is compatible with RTL %s!\n",
-         DPxPTR(Img->ImageStart), R.Name.c_str());
+         DPxPTR(Img->ImageStart), R.getName());
 
       PM->initDevices(R);
 
@@ -247,7 +203,7 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
           (PM->HostEntriesBeginToTransTable)[Desc->HostEntriesBegin];
 
       DP("Registering image " DPxMOD " with RTL %s!\n", DPxPTR(Img->ImageStart),
-         R.Name.c_str());
+         R.getName());
 
       registerImageIntoTranslationTable(TransTable, PM->DeviceOffsets[&R],
                                         PM->DeviceUsed[&R], Img);
@@ -282,11 +238,11 @@ void PluginManager::unregisterLib(__tgt_bin_desc *Desc) {
     // Obtain the image and information that was previously extracted.
     __tgt_device_image *Img = &DI.getExecutableImage();
 
-    PluginAdaptorTy *FoundRTL = NULL;
+    GenericPluginTy *FoundRTL = NULL;
 
     // Scan the RTLs that have associated images until we find one that supports
     // the current image. We only need to scan RTLs that are already being used.
-    for (auto &R : PM->pluginAdaptors()) {
+    for (auto &R : PM->plugins()) {
       if (!DeviceOffsets.contains(&R))
         continue;
 
@@ -296,8 +252,7 @@ void PluginManager::unregisterLib(__tgt_bin_desc *Desc) {
 
       FoundRTL = &R;
 
-      DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n",
-         DPxPTR(Img->ImageStart), DPxPTR(R.LibraryHandler.get()));
+      DP("Unregistered image " DPxMOD " from RTL\n", DPxPTR(Img->ImageStart));
 
       break;
     }
diff --git a/offload/src/device.cpp b/offload/src/device.cpp
index 44a2facc8d3d..749b4c567f8e 100644
--- a/offload/src/device.cpp
+++ b/offload/src/device.cpp
@@ -64,7 +64,7 @@ int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device,
   return OFFLOAD_SUCCESS;
 }
 
-DeviceTy::DeviceTy(PluginAdaptorTy *RTL, int32_t DeviceID, int32_t RTLDeviceID)
+DeviceTy::DeviceTy(GenericPluginTy *RTL, int32_t DeviceID, int32_t RTLDeviceID)
     : DeviceID(DeviceID), RTL(RTL), RTLDeviceID(RTLDeviceID),
       MappingInfo(*this) {}
 
@@ -192,7 +192,6 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
           RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, Size,
           /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
   if (!AsyncInfo) {
-    assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
     return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
                               Size);
   }
diff --git a/offload/src/interface.cpp b/offload/src/interface.cpp
index 557703632c62..763b051cc6d7 100644
--- a/offload/src/interface.cpp
+++ b/offload/src/interface.cpp
@@ -456,8 +456,6 @@ EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) {
   assert(PM && "Runtime not initialized");
   std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
   InfoLevel.store(NewInfoLevel);
-  for (auto &R : PM->pluginAdaptors())
-    R.set_info_flag(NewInfoLevel);
 }
 
 EXTERN int __tgt_print_device_info(int64_t DeviceId) {
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
index 803e941fe838..5d5c6b05051b 100644
--- a/offload/src/omptarget.cpp
+++ b/offload/src/omptarget.cpp
@@ -461,7 +461,9 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
   if (!DeviceOrErr)
     FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
 
-  DeviceOrErr->deleteData(DevicePtr, Kind);
+  if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL)
+    FATAL_MESSAGE(DeviceNum, "%s", "Failed to deallocate device ptr");
+
   DP("omp_target_free deallocated device ptr\n");
 }
 
diff --git a/offload/test/offloading/fortran/target-map-derived-type-full-1.f90 b/offload/test/offloading/fortran/target-map-derived-type-full-1.f90
new file mode 100644
index 000000000000..cb03708554fe
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-derived-type-full-1.f90
@@ -0,0 +1,45 @@
+! Offloading test checking interaction of an
+! explicit derived type mapping when mapped 
+! to target and assinging one derived type
+! to another
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar
+    integer(4) :: ix = 0
+    real(4) :: rx = 0.0
+    complex(4) :: zx = (0,0)
+    end type scalar  
+  
+    type(scalar) :: in
+    type(scalar) :: out
+    in%ix = 10
+    in%rx = 2.0
+    in%zx = (2, 10)
+  
+  !$omp target map(from:out) map(to:in)
+      out = in 
+  !$omp end target
+  
+    print*, in%ix
+    print*, in%rx
+    write (*,*) in%zx
+  
+    print*, out%ix
+    print*, out%rx
+    write (*,*)  out%zx
+end program main
+
+!CHECK: 10
+!CHECK: 2.
+!CHECK: (2.,10.)
+!CHECK: 10
+!CHECK: 2.
+!CHECK: (2.,10.)
diff --git a/offload/test/offloading/fortran/target-map-derived-type-full-2.f90 b/offload/test/offloading/fortran/target-map-derived-type-full-2.f90
new file mode 100644
index 000000000000..0095b0fdf86a
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-derived-type-full-2.f90
@@ -0,0 +1,60 @@
+! Offloading test checking interaction of an
+! explicit derived type mapping when mapped to 
+! target and assigning to individual members
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar
+    integer(4) :: ix = 0
+    real(4) :: rx = 0.0
+    complex(4) :: zx = (0,0)
+    integer(4) :: array(5)
+    end type scalar 
+  
+    type(scalar) :: out
+    type(scalar) :: in
+  
+    in%ix = 10
+    in%rx = 2.0
+    in%zx = (2, 10)
+  
+    do i = 1, 5
+      in%array(i) = i
+    end do 
+  
+  !$omp target map(from:out) map(to:in)
+    out%ix = in%ix
+    out%rx = in%rx
+    out%zx = in%zx
+  
+    do i = 1, 5
+      out%array(i) = in%array(i)
+    end do 
+  !$omp end target
+  
+    print*, in%ix
+    print*, in%rx
+    print*, in%array
+    write (*,*) in%zx
+
+    print*, out%ix
+    print*, out%rx
+    print*, out%array
+    write (*,*)  out%zx
+end program main
+
+!CHECK: 10
+!CHECK: 2.
+!CHECK: 1 2 3 4 5
+!CHECK: (2.,10.)
+!CHECK: 10
+!CHECK: 2.
+!CHECK: 1 2 3 4 5
+!CHECK: (2.,10.)
diff --git a/offload/test/offloading/fortran/target-map-derived-type-full-implicit-1.f90 b/offload/test/offloading/fortran/target-map-derived-type-full-implicit-1.f90
new file mode 100644
index 000000000000..f57e2c70d155
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-derived-type-full-implicit-1.f90
@@ -0,0 +1,46 @@
+! Offloading test checking interaction of an
+! implicit derived type mapping when mapped 
+! to target and assinging one derived type
+! to another
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar
+    integer(4) :: ix = 0
+    real(4) :: rx = 0.0
+    complex(4) :: zx = (0,0)
+    end type scalar  
+  
+    type(scalar) :: in
+    type(scalar) :: out
+    in%ix = 10
+    in%rx = 2.0
+    in%zx = (2, 10)
+  
+  !$omp target map(from:out)
+      out = in 
+  !$omp end target
+  
+    print*, in%ix
+    print*, in%rx
+    write (*,*) in%zx
+
+    print*, out%ix
+    print*, out%rx
+    write (*,*)  out%zx
+  end program main
+
+!CHECK: 10
+!CHECK: 2.
+!CHECK: (2.,10.)
+!CHECK: 10
+!CHECK: 2.
+!CHECK: (2.,10.)
+  
+\ No newline at end of file
diff --git a/offload/test/offloading/fortran/target-map-derived-type-full-implicit-2.f90 b/offload/test/offloading/fortran/target-map-derived-type-full-implicit-2.f90
new file mode 100644
index 000000000000..92d3454d462a
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-derived-type-full-implicit-2.f90
@@ -0,0 +1,61 @@
+! Offloading test checking interaction of an
+! explicit derived type mapping when mapped 
+! to target and assinging one derived type
+! to another
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar
+    integer(4) :: ix = 0
+    real(4) :: rx = 0.0
+    complex(4) :: zx = (0,0)
+    integer(4) :: array(5)
+    end type scalar 
+  
+    type(scalar) :: out
+    type(scalar) :: in
+  
+    in%ix = 10
+    in%rx = 2.0
+    in%zx = (2, 10)
+  
+    do i = 1, 5
+      in%array(i) = i
+    end do 
+  
+  !$omp target
+    out%ix = in%ix
+    out%rx = in%rx
+    out%zx = in%zx
+  
+    do i = 1, 5
+      out%array(i) = in%array(i)
+    end do 
+  !$omp end target
+  
+    print*, in%ix
+    print*, in%rx
+    print*, in%array
+    write (*,*) in%zx
+
+    print*, out%ix
+    print*, out%rx
+    print*, out%array
+    write (*,*)  out%zx
+end program main
+
+!CHECK: 10
+!CHECK: 2.
+!CHECK: 1 2 3 4 5
+!CHECK: (2.,10.)
+!CHECK: 10
+!CHECK: 2.
+!CHECK: 1 2 3 4 5
+!CHECK: (2.,10.)
diff --git a/offload/test/offloading/fortran/target-map-double-large-nested-dtype-multi-member.f90 b/offload/test/offloading/fortran/target-map-double-large-nested-dtype-multi-member.f90
new file mode 100644
index 000000000000..31774be19146
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-double-large-nested-dtype-multi-member.f90
@@ -0,0 +1,101 @@
+! Offloading test checking interaction of an
+! explicit member map from two large nested
+! derived types
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: bottom_layer1
+    real(4) :: i4
+    real(4) :: j4
+    real(4) :: k4
+    end type bottom_layer1
+
+    type :: bottom_layer2
+      integer(4) :: i3
+      integer(4) :: j3
+      integer(4) :: k3
+    end type bottom_layer2 
+
+    type :: middle_layer
+     real(4) :: array_i2(10)
+     real(4) :: i2
+     real(4) :: array_j2(10)
+     type(bottom_layer1) :: nest 
+     type(bottom_layer2) :: nest2 
+    end type middle_layer
+
+    type :: top_layer
+    real(4) :: i
+    integer(4) :: array_i(10)
+    real(4) :: j
+    integer, allocatable :: array_j(:)
+    integer(4) :: k
+    type(middle_layer) :: nested
+    end type top_layer
+    
+    type(top_layer) :: top_dtype
+    type(top_layer) :: top_dtype2
+
+    top_dtype2%nested%nest%i4 = 10
+    top_dtype2%nested%nest%j4 = 12
+    top_dtype2%nested%nest%k4 = 54
+    
+    top_dtype2%nested%nest2%i3 = 20
+    top_dtype2%nested%nest2%j3 = 40
+    top_dtype2%nested%nest2%k3 = 60
+    
+    top_dtype2%nested%i2 = 200
+
+      do i = 1, 10
+        top_dtype2%array_i(i) = i
+      end do
+
+!$omp target map(from: top_dtype%nested%nest%j4, top_dtype%nested%nest%i4, top_dtype%nested%nest%k4) &
+!$omp map(from: top_dtype%array_i, top_dtype%nested%nest2%i3, top_dtype%nested%i2) &
+!$omp map(from: top_dtype%nested%nest2%k3, top_dtype%nested%nest2%j3) &
+!$omp map(to: top_dtype2%nested%nest%j4, top_dtype2%nested%nest%i4, top_dtype2%nested%nest%k4) &
+!$omp map(to: top_dtype2%array_i, top_dtype2%nested%nest2%i3, top_dtype2%nested%i2) &
+!$omp map(to: top_dtype2%nested%nest2%k3, top_dtype2%nested%nest2%j3)
+    top_dtype%nested%nest%i4 = top_dtype2%nested%nest%i4
+    top_dtype%nested%nest%j4 = top_dtype2%nested%nest%j4 
+    top_dtype%nested%nest%k4 = top_dtype2%nested%nest%k4
+    
+    top_dtype%nested%nest2%i3 = top_dtype2%nested%nest2%i3
+    top_dtype%nested%nest2%j3 = top_dtype2%nested%nest2%j3
+    top_dtype%nested%nest2%k3 = top_dtype2%nested%nest2%k3
+    
+    top_dtype%nested%i2 = top_dtype2%nested%i2
+
+    do i = 1, 10
+      top_dtype%array_i(i) = top_dtype2%array_i(i)
+    end do
+!$omp end target
+  
+  print *, top_dtype%nested%nest%i4
+  print *, top_dtype%nested%nest%j4
+  print *, top_dtype%nested%nest%k4
+
+  print *, top_dtype%nested%nest2%i3
+  print *, top_dtype%nested%nest2%j3
+  print *, top_dtype%nested%nest2%k3
+  
+  print *, top_dtype%nested%i2
+
+  print *, top_dtype%array_i 
+end program main
+
+!CHECK: 10.
+!CHECK: 12.
+!CHECK: 54.
+!CHECK: 20
+!CHECK: 40
+!CHECK: 60
+!CHECK: 200.
+!CHECK: 1 2 3 4 5 6 7 8 9 10
diff --git a/offload/test/offloading/fortran/target-map-double-nested-dtype-array-bounds.f90 b/offload/test/offloading/fortran/target-map-double-nested-dtype-array-bounds.f90
new file mode 100644
index 000000000000..cecfb9e84a59
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-double-nested-dtype-array-bounds.f90
@@ -0,0 +1,47 @@
+! Offloading test checking interaction of two
+! explicit arrau member maps with bounds from 
+! two nested derived types 
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: bottom_layer
+      real(8) :: i2
+      real(4) :: array_i2(10)
+      real(4) :: array_j2(10)
+    end type bottom_layer
+
+    type :: top_layer
+      real(4) :: i
+      integer(4) :: array_i(10)
+      real(4) :: j
+      type(bottom_layer) :: nested
+      integer, allocatable :: array_j(:)
+      integer(4) :: k
+    end type top_layer
+    
+    type(top_layer) :: top_dtype
+    type(top_layer) :: top_dtype2
+
+!$omp target map(tofrom: top_dtype%nested%array_i2(4:8), top_dtype2%nested%array_j2(4:8))
+    do i = 4, 8 
+      top_dtype%nested%array_i2(i) = i * 2
+    end do 
+
+    do i = 4, 8 
+      top_dtype2%nested%array_j2(i) = i * 2
+    end do 
+!$omp end target
+  
+  print *, top_dtype%nested%array_i2
+  print *, top_dtype2%nested%array_j2
+end program main
+
+!CHECK: 0. 0. 0. 8. 10. 12. 14. 16. 0. 0.
+!CHECK: 0. 0. 0. 8. 10. 12. 14. 16. 0. 0.
diff --git a/offload/test/offloading/fortran/target-map-double-nested-dtype-double-array-bounds.f90 b/offload/test/offloading/fortran/target-map-double-nested-dtype-double-array-bounds.f90
new file mode 100644
index 000000000000..a8762a0829cc
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-double-nested-dtype-double-array-bounds.f90
@@ -0,0 +1,47 @@
+! Offloading test checking interaction of two
+! explicit array member maps with array bounds 
+! from two nested derived types 
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: bottom_layer
+      real(8) :: i2
+      real(4) :: array_i2(10)
+      real(4) :: array_j2(10)
+    end type bottom_layer
+
+    type :: top_layer
+      real(4) :: i
+      integer(4) :: array_i(10)
+      real(4) :: j
+      type(bottom_layer) :: nested
+      integer, allocatable :: array_j(:)
+      integer(4) :: k
+    end type top_layer
+    
+    type(top_layer) :: top_dtype
+    type(top_layer) :: top_dtype2
+
+!$omp target map(tofrom: top_dtype%nested%array_i2(4:8), top_dtype2%nested%array_j2(4:8))
+    do i = 4, 8 
+      top_dtype%nested%array_i2(i) = i * 2
+    end do 
+
+    do i = 4, 8 
+      top_dtype2%nested%array_j2(i) = i * 2
+    end do 
+!$omp end target
+  
+  print *, top_dtype%nested%array_i2
+  print *, top_dtype2%nested%array_j2
+end program main
+
+!CHECK: 0. 0. 0. 8. 10. 12. 14. 16. 0. 0
+!CHECK: 0. 0. 0. 8. 10. 12. 14. 16. 0. 0
diff --git a/offload/test/offloading/fortran/target-map-double-nested-dtype-single-member.f90 b/offload/test/offloading/fortran/target-map-double-nested-dtype-single-member.f90
new file mode 100644
index 000000000000..9ecb394dbe46
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-double-nested-dtype-single-member.f90
@@ -0,0 +1,47 @@
+! Offloading test checking interaction of an
+! explicit derived type member mapping of two
+! derived types for a single array member each
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: bottom_layer
+      real(8) :: i2
+      real(4) :: array_i2(10)
+      real(4) :: array_j2(10)
+    end type bottom_layer
+
+    type :: top_layer
+      real(4) :: i
+      integer(4) :: array_i(10)
+      real(4) :: j
+      type(bottom_layer) :: nested
+      integer, allocatable :: array_j(:)
+      integer(4) :: k
+    end type top_layer
+    
+    type(top_layer) :: top_dtype
+    type(top_layer) :: top_dtype2
+
+!$omp target map(tofrom: top_dtype%nested%array_i2, top_dtype2%nested%array_j2)
+    do i = 1, 10 
+      top_dtype%nested%array_i2(i) = i * 2
+    end do 
+
+    do i = 1, 10 
+      top_dtype2%nested%array_j2(i) = i * 2
+    end do 
+!$omp end target
+
+  print *, top_dtype%nested%array_i2
+  print *, top_dtype2%nested%array_j2
+end program main
+
+!CHECK: 2. 4. 6. 8. 10. 12. 14. 16. 18. 20.
+!CHECK: 2. 4. 6. 8. 10. 12. 14. 16. 18. 20.
diff --git a/offload/test/offloading/fortran/target-map-dtype-arr-bounds-member-enter-exit-update.f90 b/offload/test/offloading/fortran/target-map-dtype-arr-bounds-member-enter-exit-update.f90
new file mode 100644
index 000000000000..3b3ec96b9bab
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-dtype-arr-bounds-member-enter-exit-update.f90
@@ -0,0 +1,49 @@
+! Offloading test checking interaction of an
+! explicit derived type member mapping of 
+! an array with bounds when mapped to 
+! target using a combination of update,
+! enter and exit directives.
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar_array
+        integer(4) :: array(10)
+    end type scalar_array
+
+    type(scalar_array) :: scalar_arr
+
+    do I = 1, 10
+        scalar_arr%array(I) = I + I
+    end do
+
+  !$omp target enter data map(to: scalar_arr%array(3:6))
+
+    ! overwrite our target data with an update.  
+    do I = 1, 10
+        scalar_arr%array(I) = 10
+    end do
+
+  !$omp target update to(scalar_arr%array(3:6))
+
+  ! The compiler/runtime is less friendly about read/write out of 
+  ! bounds when using enter and exit, we have to specifically loop
+  ! over the correct range
+   !$omp target
+    do i=3,6
+        scalar_arr%array(i) = scalar_arr%array(i) + i
+    end do
+  !$omp end target 
+
+  !$omp target exit data map(from: scalar_arr%array(3:6))
+  
+  print*, scalar_arr%array
+end program
+
+!CHECK: 10 10 13 14 15 16 10 10 10 10
diff --git a/offload/test/offloading/fortran/target-map-dtype-arr-bounds-member-enter-exit.f90 b/offload/test/offloading/fortran/target-map-dtype-arr-bounds-member-enter-exit.f90
new file mode 100644
index 000000000000..5f7e9f946826
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-dtype-arr-bounds-member-enter-exit.f90
@@ -0,0 +1,49 @@
+! Offloading test checking interaction of an
+! explicit derived type member mapping of 
+! an array with bounds when mapped to 
+! target using a combination of enter and 
+! exit directives.
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar_array
+        integer(4) :: array(10)
+    end type scalar_array
+
+    type(scalar_array) :: scalar_arr
+
+    do I = 1, 10
+        scalar_arr%array(I) = I + I
+    end do
+
+    !$omp target enter data map(to: scalar_arr%array(3:6))
+    
+    ! Shouldn't overwrite data already locked in
+    ! on target via enter, which will then be 
+    ! overwritten by our exit
+    do I = 1, 10
+        scalar_arr%array(I) = 10
+    end do
+
+  ! The compiler/runtime is less friendly about read/write out of 
+  ! bounds when using enter and exit, we have to specifically loop
+  ! over the correct range
+   !$omp target
+    do i=3,6
+        scalar_arr%array(i) = scalar_arr%array(i) + i
+    end do
+  !$omp end target 
+
+  !$omp target exit data map(from: scalar_arr%array(3:6))
+  
+  print*, scalar_arr%array
+end program
+
+!CHECK: 10 10 9 12 15 18 10 10 10 10
diff --git a/offload/test/offloading/fortran/target-map-dtype-explicit-individual-array-member.f90 b/offload/test/offloading/fortran/target-map-dtype-explicit-individual-array-member.f90
new file mode 100644
index 000000000000..907b16ffedf5
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-dtype-explicit-individual-array-member.f90
@@ -0,0 +1,33 @@
+! Offloading test checking interaction of an
+! explicit derived type member mapping of 
+! an array when mapped to target
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+type :: scalar_array
+    real(4) :: break_0
+    real(4) :: array_x(10)
+    real(4) :: break_1
+    real(4) :: array_y(10)
+    real(4) :: break_3
+end type scalar_array
+  
+   type(scalar_array) :: scalar_arr
+    
+  !$omp target map(tofrom:scalar_arr%array_y)
+    do i = 1, 10
+      scalar_arr%array_y(i) = i
+    end do
+  !$omp end target
+
+  print *, scalar_arr%array_y
+end program main
+
+!CHECK: 1. 2. 3. 4. 5. 6. 7. 8. 9. 10.
diff --git a/offload/test/offloading/fortran/target-map-dtype-multi-explicit-array-3D-member-bounds.f90 b/offload/test/offloading/fortran/target-map-dtype-multi-explicit-array-3D-member-bounds.f90
new file mode 100644
index 000000000000..110fb648980c
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-dtype-multi-explicit-array-3D-member-bounds.f90
@@ -0,0 +1,45 @@
+! Offloading test checking interaction of an
+! explicit derived type member mapping of 
+! two arrays with explicit bounds when 
+! mapped to target
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar_array
+    real(4) :: break_0
+    integer(4) :: array_x(3,3,3)
+    real(4) :: break_1
+    integer(4) :: array_y(3,3,3)
+    real(4) :: break_3
+    end type scalar_array
+  
+    type(scalar_array) :: scalar_arr
+    
+    do i = 1, 3
+      do j = 1, 3
+        do k = 1, 3
+            scalar_arr%array_x(i, j, k) = 42
+            scalar_arr%array_y(i, j, k) = 0 ! Will get overwritten by garbage values in target
+        end do
+       end do
+    end do
+
+  !$omp target map(tofrom:scalar_arr%array_x(1:3, 1:3, 2:2), scalar_arr%array_y(1:3, 1:3, 1:3))
+    do j = 1, 3
+      do k = 1, 3
+        scalar_arr%array_y(k, j, 2) = scalar_arr%array_x(k, j, 2)
+      end do
+    end do
+  !$omp end target
+
+  print *, scalar_arr%array_y
+end program main
+
+!CHECK: 0 0 0 0 0 0 0 0 0 42 42 42 42 42 42 42 42 42 0 0 0 0 0 0 0 0
diff --git a/offload/test/offloading/fortran/target-map-dtype-multi-explicit-array-member-bounds.f90 b/offload/test/offloading/fortran/target-map-dtype-multi-explicit-array-member-bounds.f90
new file mode 100644
index 000000000000..b7f6e2ddfb3b
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-dtype-multi-explicit-array-member-bounds.f90
@@ -0,0 +1,38 @@
+! Offloading test checking interaction of an
+! explicit derived type member mapping of 
+! two arrays with explicit bounds when 
+! mapped to target
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar_array
+    real(4) :: break_0
+    real(4) :: array_x(10)
+    real(4) :: break_1
+    real(4) :: array_y(10)
+    real(4) :: break_3
+    end type scalar_array
+  
+    type(scalar_array) :: scalar_arr
+    
+  do i = 1, 10
+    scalar_arr%array_x(i) = i
+  end do
+
+  !$omp target map(tofrom:scalar_arr%array_x(3:6), scalar_arr%array_y(3:6))
+    do i = 1, 10
+      scalar_arr%array_y(i) = scalar_arr%array_x(i)
+    end do
+  !$omp end target
+
+  print*, scalar_arr%array_y
+end program main
+
+!CHECK: 0. 0. 3. 4. 5. 6. 0. 0. 0. 0.
diff --git a/offload/test/offloading/fortran/target-map-dtype-multi-explicit-array-member.f90 b/offload/test/offloading/fortran/target-map-dtype-multi-explicit-array-member.f90
new file mode 100644
index 000000000000..c44a58dbebc8
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-dtype-multi-explicit-array-member.f90
@@ -0,0 +1,39 @@
+! Offloading test checking interaction of an
+! derived type mapping of two explicit array
+! members to target
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar_array
+    real(4) :: break_0
+    real(4) :: array_x(10)
+    real(4) :: break_1
+    real(4) :: array_y(10)
+    real(4) :: break_3
+    end type scalar_array
+  
+    type(scalar_array) :: scalar_arr
+    
+  do i = 1, 10
+    scalar_arr%array_x(i) = i
+  end do
+
+  !$omp target map(tofrom:scalar_arr%array_x, scalar_arr%array_y)
+    do i = 1, 10
+      scalar_arr%array_y(i) = scalar_arr%array_x(i)
+    end do
+  !$omp end target
+
+  print*, scalar_arr%array_x
+  print*, scalar_arr%array_y
+end program main
+
+!CHECK: 1. 2. 3. 4. 5. 6. 7. 8. 9. 10.
+!CHECK: 1. 2. 3. 4. 5. 6. 7. 8. 9. 10.
diff --git a/offload/test/offloading/fortran/target-map-dtype-multi-explicit-member.f90 b/offload/test/offloading/fortran/target-map-dtype-multi-explicit-member.f90
new file mode 100644
index 000000000000..a4205bc0179e
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-dtype-multi-explicit-member.f90
@@ -0,0 +1,33 @@
+! Offloading test checking interaction of an
+! derived type mapping of two explicit 
+! members to target
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar   
+      integer(4) :: ix = 0  
+      real(4) :: rx = 0.0
+      complex(4) :: zx = (0,0)
+      real(4) :: ry = 1.0
+    end type scalar  
+    
+      type(scalar) :: scalar_struct
+      
+    !$omp target map(from:scalar_struct%rx, scalar_struct%ry)
+      scalar_struct%rx = 21.0
+      scalar_struct%ry = 27.0
+    !$omp end target
+  
+    print*, scalar_struct%rx
+    print*, scalar_struct%ry
+end program main
+  
+!CHECK: 21.
+!CHECK: 27.
diff --git a/offload/test/offloading/fortran/target-map-enter-exit-array-2.f90 b/offload/test/offloading/fortran/target-map-enter-exit-array-2.f90
index 489c2532a762..8d35a281caf9 100644
--- a/offload/test/offloading/fortran/target-map-enter-exit-array-2.f90
+++ b/offload/test/offloading/fortran/target-map-enter-exit-array-2.f90
@@ -17,7 +17,6 @@ program main
     end do
 
     !$omp target enter data map(to: array)
-
     ! Shouldn't overwrite data already locked in
     ! on target via enter, this will then be 
     ! overwritten by our exit
@@ -32,7 +31,6 @@ program main
   !$omp end target 
 
   !$omp target exit data map(from: array)
-
   print*, array
 end program
 
diff --git a/offload/test/offloading/fortran/target-map-enter-exit-array-bounds.f90 b/offload/test/offloading/fortran/target-map-enter-exit-array-bounds.f90
index 3c8c3507ed72..d842cd15c65a 100644
--- a/offload/test/offloading/fortran/target-map-enter-exit-array-bounds.f90
+++ b/offload/test/offloading/fortran/target-map-enter-exit-array-bounds.f90
@@ -19,7 +19,6 @@ program main
     end do
 
     !$omp target enter data map(to: array(3:6))
-
     ! Shouldn't overwrite data already locked in
     ! on target via enter, which will then be 
     ! overwritten by our exit
@@ -37,7 +36,6 @@ program main
   !$omp end target 
 
   !$omp target exit data map(from: array(3:6))
-
   print *, array
 end program
 
diff --git a/offload/test/offloading/fortran/target-map-enter-exit-scalar.f90 b/offload/test/offloading/fortran/target-map-enter-exit-scalar.f90
index 29a0b5ee3e62..70ed3d747829 100644
--- a/offload/test/offloading/fortran/target-map-enter-exit-scalar.f90
+++ b/offload/test/offloading/fortran/target-map-enter-exit-scalar.f90
@@ -14,7 +14,6 @@ program main
     scalar = 10
 
     !$omp target enter data map(to: scalar)
-
     !ignored, as we've already attached
     scalar = 20
 
diff --git a/offload/test/offloading/fortran/target-map-individual-dtype-member-map.f90 b/offload/test/offloading/fortran/target-map-individual-dtype-member-map.f90
new file mode 100644
index 000000000000..4cdf41db70d4
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-individual-dtype-member-map.f90
@@ -0,0 +1,33 @@
+! Offloading test checking interaction of an
+! single explicit member map from a single
+! derived type.
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    real :: test
+    type :: scalar
+        integer(4) :: ix = 0
+        real(4) :: rx = 0.0
+        complex(4) :: zx = (0,0)
+        real(4) :: ry = 1.0
+    end type scalar  
+  
+    type(scalar) :: scalar_struct
+    scalar_struct%rx = 2.0
+    test = 21.0
+
+  !$omp target map(from:scalar_struct%rx)
+    scalar_struct%rx = test
+  !$omp end target
+
+  print *, scalar_struct%rx
+end program main
+
+!CHECK: 21.
diff --git a/offload/test/offloading/fortran/target-map-large-nested-dtype-multi-member.f90 b/offload/test/offloading/fortran/target-map-large-nested-dtype-multi-member.f90
new file mode 100644
index 000000000000..2412381e62e7
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-large-nested-dtype-multi-member.f90
@@ -0,0 +1,83 @@
+! Offloading test checking interaction of an
+! explicit member map a large nested derived 
+! type
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: bottom_layer1
+    real(4) :: i4
+    real(4) :: j4
+    real(4) :: k4
+    end type bottom_layer1
+
+    type :: bottom_layer2
+      integer(4) :: i3
+      integer(4) :: j3
+      integer(4) :: k3
+    end type bottom_layer2 
+
+    type :: middle_layer
+     real(4) :: array_i2(10)
+     real(4) :: i2
+     real(4) :: array_j2(10)
+     type(bottom_layer1) :: nest 
+     type(bottom_layer2) :: nest2 
+    end type middle_layer
+
+    type :: top_layer
+    real(4) :: i
+    integer(4) :: array_i(10)
+    real(4) :: j
+    integer, allocatable :: array_j(:)
+    integer(4) :: k
+    type(middle_layer) :: nested
+    end type top_layer
+    
+    type(top_layer) :: top_dtype
+
+    top_dtype%nested%nest%j4 = 12
+!$omp target map(tofrom: top_dtype%nested%nest%j4, top_dtype%nested%nest%i4, top_dtype%nested%nest%k4) &
+!$omp map(tofrom: top_dtype%array_i, top_dtype%nested%nest2%i3, top_dtype%nested%i2, top_dtype%nested%nest2%k3, top_dtype%nested%nest2%j3)
+    top_dtype%nested%nest%i4 = 10
+    top_dtype%nested%nest%j4 = 12 + top_dtype%nested%nest%j4
+    top_dtype%nested%nest%k4 = 54
+    
+    top_dtype%nested%nest2%i3 = 20
+    top_dtype%nested%nest2%j3 = 40
+    top_dtype%nested%nest2%k3 = 60
+    
+    top_dtype%nested%i2 = 200
+
+      do i = 1, 10
+        top_dtype%array_i(i) = i
+      end do
+!$omp end target
+  
+  print *, top_dtype%nested%nest%i4
+  print *, top_dtype%nested%nest%j4
+  print *, top_dtype%nested%nest%k4
+
+  print *, top_dtype%nested%nest2%i3
+  print *, top_dtype%nested%nest2%j3
+  print *, top_dtype%nested%nest2%k3
+  
+  print *, top_dtype%nested%i2
+
+  print *, top_dtype%array_i 
+end program main
+
+!CHECK: 10.
+!CHECK: 24.
+!CHECK: 54.
+!CHECK: 20
+!CHECK: 40
+!CHECK: 60
+!CHECK: 200.
+!CHECK: 1 2 3 4 5 6 7 8 9 10
diff --git a/offload/test/offloading/fortran/target-map-nested-dtype-complex-member.f90 b/offload/test/offloading/fortran/target-map-nested-dtype-complex-member.f90
new file mode 100644
index 000000000000..9d29639ca101
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-nested-dtype-complex-member.f90
@@ -0,0 +1,56 @@
+! Offloading test checking interaction of an
+! nested derived type member map of a complex
+! number member
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: bottom_layer
+      real(8) :: i2
+      complex  :: j2
+      real(4) :: array_i2(10)
+      real(4) :: array_j2(10)
+    end type bottom_layer
+
+    type :: top_layer
+      real(4) :: i
+      integer(4) :: array_i(10)
+      real(4) :: j
+      type(bottom_layer) :: nested
+      integer, allocatable :: array_j(:)
+      integer(4) :: k
+      complex :: l
+    end type top_layer
+    
+    type(top_layer) :: top_dtype
+
+!$omp target map(tofrom: top_dtype%nested%i2, top_dtype%k, top_dtype%nested%j2, top_dtype%nested%array_i2, top_dtype%l)
+    do i = 1, 10 
+      top_dtype%nested%array_i2(i) = i * 2
+    end do 
+
+    top_dtype%l = (10,20)
+    top_dtype%nested%j2 = (510,210)
+    
+    top_dtype%nested%i2 = 30.30
+    top_dtype%k = 74
+!$omp end target
+  
+  print *, top_dtype%nested%i2
+  print *, top_dtype%k
+  print *, top_dtype%nested%array_i2
+  print *, top_dtype%l
+  print *, top_dtype%nested%j2
+end program main
+
+!CHECK: 30.299999237060547
+!CHECK: 74
+!CHECK: 2. 4. 6. 8. 10. 12. 14. 16. 18. 20.
+!CHECK: (10.,20.)
+!CHECK: (510.,210.)
diff --git a/offload/test/offloading/fortran/target-map-nested-dtype-derived-member.f90 b/offload/test/offloading/fortran/target-map-nested-dtype-derived-member.f90
new file mode 100644
index 000000000000..1be8cb32c8e6
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-nested-dtype-derived-member.f90
@@ -0,0 +1,52 @@
+! Offloading test checking interaction of an
+! nested derived type member map with the 
+! inclusion of an entire nested derived 
+! type being mapped
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: bottom_layer
+      real(8) :: i2
+      real(4) :: array_i2(10)
+      real(4) :: array_j2(10)
+    end type bottom_layer
+
+    type :: top_layer
+      real(4) :: i
+      integer(4) :: array_i(10)
+      real(4) :: j
+      type(bottom_layer) :: nested
+      integer, allocatable :: array_j(:)
+      integer(4) :: k
+      type(bottom_layer) :: nested2
+    end type top_layer
+    
+    type(top_layer) :: top_dtype
+
+!$omp target map(tofrom: top_dtype%k, top_dtype%nested2%array_i2, top_dtype%nested)
+    do i = 1, 10 
+      top_dtype%nested2%array_i2(i) = i * 2
+      top_dtype%nested%array_i2(i) = i * 2
+    end do 
+
+    top_dtype%nested%i2 = 30.30
+    top_dtype%k = 74
+!$omp end target
+  
+  print *, top_dtype%nested%i2
+  print *, top_dtype%k
+  print *, top_dtype%nested%array_i2
+  print *, top_dtype%nested2%array_i2
+end program main
+
+!CHECK: 30.299999237060547
+!CHECK: 74
+!CHECK: 2. 4. 6. 8. 10. 12. 14. 16. 18. 20.
+!CHECK: 2. 4. 6. 8. 10. 12. 14. 16. 18. 20.
diff --git a/offload/test/offloading/fortran/target-map-nested-dtype-multi-member.f90 b/offload/test/offloading/fortran/target-map-nested-dtype-multi-member.f90
new file mode 100644
index 000000000000..6f4d5ad5c15b
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-nested-dtype-multi-member.f90
@@ -0,0 +1,47 @@
+! Offloading test checking interaction of an
+! explicit member map from a small nested
+! derived type
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: bottom_layer
+      real(8) :: i2
+      real(4) :: array_i2(10)
+      real(4) :: array_j2(10)
+    end type bottom_layer
+
+    type :: top_layer
+      real(4) :: i
+      integer(4) :: array_i(10)
+      real(4) :: j
+      type(bottom_layer) :: nested
+      integer, allocatable :: array_j(:)
+      integer(4) :: k
+    end type top_layer
+    
+    type(top_layer) :: top_dtype
+
+!$omp target map(tofrom: top_dtype%nested%i2, top_dtype%k, top_dtype%nested%array_i2)
+    do i = 1, 10 
+      top_dtype%nested%array_i2(i) = i * 2
+    end do 
+
+    top_dtype%nested%i2 = 30.30
+    top_dtype%k = 74
+!$omp end target
+  
+  print *, top_dtype%nested%i2
+  print *, top_dtype%k
+  print *, top_dtype%nested%array_i2
+end program main
+
+!CHECK: 30.299999237060547
+!CHECK: 74
+!CHECK: 2. 4. 6. 8. 10. 12. 14. 16. 18. 20.
diff --git a/offload/test/offloading/fortran/target-map-nested-dtype-single-member.f90 b/offload/test/offloading/fortran/target-map-nested-dtype-single-member.f90
new file mode 100644
index 000000000000..046fc13eb93c
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-nested-dtype-single-member.f90
@@ -0,0 +1,40 @@
+! Offloading test checking interaction of an
+! single explicit member map from a nested
+! derived type.
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: bottom_layer
+      real(8) :: i2
+      real(4) :: array_i2(10)
+      real(4) :: array_j2(10)
+    end type bottom_layer
+
+    type :: top_layer
+      real(4) :: i
+      integer(4) :: array_i(10)
+      real(4) :: j
+      type(bottom_layer) :: nested
+      integer, allocatable :: array_j(:)
+      integer(4) :: k
+    end type top_layer
+    
+    type(top_layer) :: top_dtype
+
+!$omp target map(tofrom: top_dtype%nested%array_i2)
+    do i = 1, 10 
+      top_dtype%nested%array_i2(i) = i * 2
+    end do 
+!$omp end target
+  
+  print *, top_dtype%nested%array_i2
+end program main
+
+!CHECK: 2. 4. 6. 8. 10. 12. 14. 16. 18. 20.
diff --git a/offload/test/offloading/fortran/target-map-two-dtype-explicit-member.f90 b/offload/test/offloading/fortran/target-map-two-dtype-explicit-member.f90
new file mode 100644
index 000000000000..b080b437e381
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-two-dtype-explicit-member.f90
@@ -0,0 +1,35 @@
+! Offloading test checking interaction of two
+! derived type's with one explicit member
+! each being mapped with bounds to target
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar_array
+        real(4) :: break_0
+        real(4) :: array_x(10)
+        real(4) :: break_1
+        real(4) :: array_y(10)
+        real(4) :: break_3
+    end type scalar_array
+  
+    type(scalar_array) :: scalar_arr1
+    type(scalar_array) :: scalar_arr2
+    
+  !$omp target map(tofrom:scalar_arr1%break_1, scalar_arr2%break_3)
+    scalar_arr2%break_3 = 10
+    scalar_arr1%break_1 = 15
+  !$omp end target
+
+  print*, scalar_arr1%break_1
+  print*, scalar_arr2%break_3
+end program main
+
+!CHECK: 15.
+!CHECK: 10.
diff --git a/offload/test/offloading/fortran/target-map-two-dtype-individual-member-array-1D-bounds.f90 b/offload/test/offloading/fortran/target-map-two-dtype-individual-member-array-1D-bounds.f90
new file mode 100644
index 000000000000..7cb7846ed0b2
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-two-dtype-individual-member-array-1D-bounds.f90
@@ -0,0 +1,39 @@
+! Offloading test checking interaction of two
+! derived type's with a single explicit array
+! member each being mapped with bounds to 
+! target
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar_array
+    real(4) :: break_0
+    real(4) :: array_x(10)
+    real(4) :: break_1
+    real(4) :: array_y(10)
+    real(4) :: break_3
+    end type scalar_array
+  
+    type(scalar_array) :: scalar_arr1
+    type(scalar_array) :: scalar_arr2
+    
+
+  !$omp target map(tofrom:scalar_arr1%array_x(3:6), scalar_arr2%array_x(3:6))
+    do i = 3, 6
+      scalar_arr2%array_x(i) = i
+      scalar_arr1%array_x(i) = i
+    end do
+  !$omp end target
+
+  print*, scalar_arr1%array_x  
+  print*, scalar_arr2%array_x
+end program main
+
+!CHECK: 0. 0. 3. 4. 5. 6. 0. 0. 0. 0.
+!CHECK: 0. 0. 3. 4. 5. 6. 0. 0. 0. 0.
diff --git a/offload/test/offloading/fortran/target-map-two-dtype-mixed-implicit-explicit-capture-1.f90 b/offload/test/offloading/fortran/target-map-two-dtype-mixed-implicit-explicit-capture-1.f90
new file mode 100644
index 000000000000..fbe6b305464a
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-two-dtype-mixed-implicit-explicit-capture-1.f90
@@ -0,0 +1,35 @@
+! Offloading test checking interaction of two
+! derived type's with a mix of explicit and
+! implicit member mapping to target
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar_array
+        real(4) :: break_0
+        real(4) :: array_x(10)
+        real(4) :: break_1
+        real(4) :: array_y(10)
+        real(4) :: break_3
+    end type scalar_array
+  
+    type(scalar_array) :: scalar_arr1
+    type(scalar_array) :: scalar_arr2
+    
+  !$omp target map(tofrom:scalar_arr1%break_1)
+    scalar_arr2%break_3 = 10
+    scalar_arr1%break_1 = 15
+  !$omp end target
+
+  print*, scalar_arr1%break_1
+  print*, scalar_arr2%break_3
+end program main
+
+!CHECK: 15.
+!CHECK: 10.
diff --git a/offload/test/offloading/fortran/target-map-two-dtype-mixed-implicit-explicit-capture-2.f90 b/offload/test/offloading/fortran/target-map-two-dtype-mixed-implicit-explicit-capture-2.f90
new file mode 100644
index 000000000000..503329d90628
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-two-dtype-mixed-implicit-explicit-capture-2.f90
@@ -0,0 +1,41 @@
+! Offloading test checking interaction of two
+! derived type's with a mix of explicit and
+! implicit member mapping of arrays to target
+! one with bounds.
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar_array
+        real(4) :: break_0
+        real(4) :: array_x(10)
+        real(4) :: break_1
+        real(4) :: array_y(10)
+        real(4) :: break_3
+    end type scalar_array
+  
+    type(scalar_array) :: scalar_arr1
+    type(scalar_array) :: scalar_arr2
+    
+  do i = 1, 10
+    scalar_arr1%array_x(i) = i
+  end do 
+
+  !$omp target map(tofrom:scalar_arr2%array_x(3:6))
+    do i = 3, 6
+      scalar_arr2%array_x(i) = scalar_arr1%array_x(i)
+    end do
+  !$omp end target
+
+  print*, scalar_arr1%array_x
+  print*, scalar_arr2%array_x
+end program main
+
+!CHECK: 1. 2. 3. 4. 5. 6. 7. 8. 9. 10.
+!CHECK: 0. 0. 3. 4. 5. 6. 0. 0. 0. 0.
diff --git a/offload/test/offloading/fortran/target-map-two-dtype-multi-member-array-1D-bounds.f90 b/offload/test/offloading/fortran/target-map-two-dtype-multi-member-array-1D-bounds.f90
new file mode 100644
index 000000000000..ed350c54dcb2
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-two-dtype-multi-member-array-1D-bounds.f90
@@ -0,0 +1,51 @@
+! Offloading test checking interaction of two
+! derived type's with two explicit array
+! members each being mapped with bounds to 
+! target
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: scalar_array
+    real(4) :: break_0
+    real(4) :: array_x(10)
+    real(4) :: break_1
+    real(4) :: array_y(10)
+    real(4) :: break_3
+    end type scalar_array
+  
+    type(scalar_array) :: scalar_arr1
+    type(scalar_array) :: scalar_arr2
+    
+  do i = 1, 10
+    scalar_arr1%array_x(i) = i
+    scalar_arr2%array_x(i) = i
+  end do
+
+  !$omp target map(tofrom:scalar_arr1%array_x(3:6), scalar_arr1%array_y(3:6), scalar_arr2%array_x(3:6), scalar_arr2%array_y(3:6))
+    do i = 1, 10
+      scalar_arr2%array_y(i) = scalar_arr1%array_x(i)
+    end do
+    
+    do i = 1, 10
+      scalar_arr1%array_y(i) = scalar_arr2%array_x(i)
+    end do
+  !$omp end target
+
+  print*, scalar_arr1%array_x
+  print*, scalar_arr2%array_y
+
+  print*, scalar_arr2%array_x
+  print*, scalar_arr1%array_y
+end program main
+
+!CHECK: 1. 2. 3. 4. 5. 6. 7. 8. 9. 10.
+!CHECK: 0. 0. 3. 4. 5. 6. 0. 0. 0. 0.
+!CHECK: 1. 2. 3. 4. 5. 6. 7. 8. 9. 10.
+!CHECK: 0. 0. 3. 4. 5. 6. 0. 0. 0. 0.
diff --git a/offload/test/offloading/fortran/target-map-two-nested-dtype-member-array-map.f90 b/offload/test/offloading/fortran/target-map-two-nested-dtype-member-array-map.f90
new file mode 100644
index 000000000000..42d9197f6e1e
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-two-nested-dtype-member-array-map.f90
@@ -0,0 +1,56 @@
+! Offloading test checking interaction of an
+! explicit member map utilising array bounds
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    type :: array
+     real(4) :: array_z(10)
+     real(4) :: break_4
+     real(4) :: array_ix(10)
+    end type array
+
+    type :: scalar_array
+    real(4) :: break_0
+    real(4) :: array_x(10)
+    real(4) :: break_1
+    real(4) :: array_y(10)
+    real(4) :: break_3
+    type(array) :: nested
+    end type scalar_array
+  
+    type(scalar_array) :: scalar_arr1
+    type(scalar_array) :: scalar_arr2
+    
+  do i = 1, 10
+    scalar_arr1%nested%array_z(i) = i
+    scalar_arr2%nested%array_z(i) = i
+  end do
+
+  !$omp target map(tofrom:scalar_arr1%nested%array_z(3:6), scalar_arr1%nested%array_ix(3:6), scalar_arr2%nested%array_z(3:6), scalar_arr2%nested%array_ix(3:6))
+    do i = 3, 6
+      scalar_arr2%nested%array_ix(i) = scalar_arr1%nested%array_z(i)
+    end do
+    
+    do i = 3, 6
+      scalar_arr1%nested%array_ix(i) = scalar_arr2%nested%array_z(i)
+    end do
+  !$omp end target
+
+  print*, scalar_arr1%nested%array_ix
+  print*, scalar_arr2%nested%array_z
+
+  print*, scalar_arr2%nested%array_ix
+  print*, scalar_arr1%nested%array_z
+end program main
+
+!CHECK: 0. 0. 3. 4. 5. 6. 0. 0. 0. 0.
+!CHECK: 1. 2. 3. 4. 5. 6. 7. 8. 9. 10.
+!CHECK: 0. 0. 3. 4. 5. 6. 0. 0. 0. 0.
+!CHECK: 1. 2. 3. 4. 5. 6. 7. 8. 9. 10.
diff --git a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
index 761e04e4c7bb..1e9a6a84d805 100644
--- a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -13,8 +13,6 @@
 
 #include "omptarget.h"
 
-#include "Shared/PluginAPI.h"
-
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/JSON.h"
 #include "llvm/Support/MemoryBuffer.h"
diff --git a/offload/unittests/Plugins/NextgenPluginsTest.cpp b/offload/unittests/Plugins/NextgenPluginsTest.cpp
index 635bd1637c90..479b3f614aed 100644
--- a/offload/unittests/Plugins/NextgenPluginsTest.cpp
+++ b/offload/unittests/Plugins/NextgenPluginsTest.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Shared/PluginAPI.h"
 #include "omptarget.h"
 #include "gtest/gtest.h"
 
diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
index 95f2425db3ee..9097ca562300 100644
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -113,17 +113,7 @@ option(OPENMP_ENABLE_LIBOMP_PROFILING "Enable time profiling for libomp." OFF)
 
 # Header install location
 if(${OPENMP_STANDALONE_BUILD})
-  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    execute_process(
-      OUTPUT_STRIP_TRAILING_WHITESPACE
-      COMMAND ${CMAKE_CXX_COMPILER} --print-resource-dir
-      RESULT_VARIABLE COMMAND_RETURN_CODE
-      OUTPUT_VARIABLE COMPILER_RESOURCE_DIR
-    )
-    set(LIBOMP_HEADERS_INSTALL_PATH "${COMPILER_RESOURCE_DIR}/include")
-  else()
-    set(LIBOMP_HEADERS_INSTALL_PATH "${CMAKE_INSTALL_INCLUDEDIR}")
-  endif()
+  set(LIBOMP_HEADERS_INSTALL_PATH "${CMAKE_INSTALL_INCLUDEDIR}")
 else()
   include(GetClangResourceDir)
   get_clang_resource_dir(LIBOMP_HEADERS_INSTALL_PATH SUBDIR include)
diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt
index 57ed54bcdc7b..bcae02eba6a5 100644
--- a/openmp/runtime/CMakeLists.txt
+++ b/openmp/runtime/CMakeLists.txt
@@ -132,10 +132,13 @@ set(LIBOMP_ASMFLAGS "" CACHE STRING
   "Appended user specified assembler flags.")
 set(LIBOMP_LDFLAGS "" CACHE STRING
   "Appended user specified linker flags.")
-if("${LIBOMP_ARCH}" STREQUAL "ppc" AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
-  # PPC (32-bit) on AIX needs libatomic for __atomic_load_8, etc.
-  set(LIBOMP_LIBFLAGS "-latomic" CACHE STRING
+if(${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+  set(LIBOMP_LIBFLAGS "-lperfstat" CACHE STRING
     "Appended user specified linked libs flags. (e.g., -lm)")
+  if("${LIBOMP_ARCH}" STREQUAL "ppc")
+    # PPC (32-bit) on AIX needs libatomic for __atomic_load_8, etc.
+    set(LIBOMP_LIBFLAGS "${LIBOMP_LIBFLAGS} -latomic")
+  endif()
 else()
   set(LIBOMP_LIBFLAGS "" CACHE STRING
     "Appended user specified linked libs flags. (e.g., -lm)")
diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var
index 5dd8e8355e4c..579d31aa98c5 100644
--- a/openmp/runtime/src/include/ompx.h.var
+++ b/openmp/runtime/src/include/ompx.h.var
@@ -50,9 +50,12 @@ enum {
   ompx_dim_z = 2,
 };
 
+// TODO: The following implementation is for host fallback. We need to disable
+// generation of host fallback in kernel language mode.
+#pragma omp begin declare variant match(device = {kind(cpu)})
+
 /// ompx_{thread,block}_{id,dim}
 ///{
-#pragma omp begin declare variant match(device = {kind(cpu)})
 #define _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(NAME, VALUE)                     \
   static inline int ompx_##NAME(int Dim) { return VALUE; }
 
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 18ccf10fe17d..64a3ea6d5be5 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2871,6 +2871,11 @@ union KMP_ALIGN_CACHE kmp_task_team {
   char tt_pad[KMP_PAD(kmp_base_task_team_t, CACHE_LINE)];
 };
 
+typedef struct kmp_task_team_list_t {
+  kmp_task_team_t *task_team;
+  kmp_task_team_list_t *next;
+} kmp_task_team_list_t;
+
 #if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5)
 // Free lists keep same-size free memory slots for fast memory allocation
 // routines
@@ -3008,10 +3013,6 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
   kmp_task_team_t *th_task_team; // Task team struct
   kmp_taskdata_t *th_current_task; // Innermost Task being executed
   kmp_uint8 th_task_state; // alternating 0/1 for task team identification
-  kmp_uint8 *th_task_state_memo_stack; // Stack holding memos of th_task_state
-  // at nested levels
-  kmp_uint32 th_task_state_top; // Top element of th_task_state_memo_stack
-  kmp_uint32 th_task_state_stack_sz; // Size of th_task_state_memo_stack
   kmp_uint32 th_reap_state; // Non-zero indicates thread is not
   // tasking, thus safe to reap
 
@@ -3133,6 +3134,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
   kmp_disp_t *t_dispatch; // thread's dispatch data
   kmp_task_team_t *t_task_team[2]; // Task team struct; switch between 2
   kmp_proc_bind_t t_proc_bind; // bind type for par region
+  int t_primary_task_state; // primary thread's task state saved
 #if USE_ITT_BUILD
   kmp_uint64 t_region_time; // region begin timestamp
 #endif /* USE_ITT_BUILD */
@@ -3204,6 +3206,12 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
   distributedBarrier *b; // Distributed barrier data associated with team
 } kmp_base_team_t;
 
+// Assert that the list structure fits and aligns within
+// the double task team pointer
+KMP_BUILD_ASSERT(sizeof(kmp_task_team_t *[2]) == sizeof(kmp_task_team_list_t));
+KMP_BUILD_ASSERT(alignof(kmp_task_team_t *[2]) ==
+                 alignof(kmp_task_team_list_t));
+
 union KMP_ALIGN_CACHE kmp_team {
   kmp_base_team_t t;
   double t_align; /* use worst case alignment */
@@ -4114,9 +4122,10 @@ extern void __kmp_fulfill_event(kmp_event_t *event);
 extern void __kmp_free_task_team(kmp_info_t *thread,
                                  kmp_task_team_t *task_team);
 extern void __kmp_reap_task_teams(void);
+extern void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team);
+extern void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team);
 extern void __kmp_wait_to_unref_task_teams(void);
-extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team,
-                                  int always);
+extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team);
 extern void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team);
 extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team
 #if USE_ITT_BUILD
@@ -4127,6 +4136,14 @@ extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team
                                  int wait = 1);
 extern void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread,
                                   int gtid);
+#if KMP_DEBUG
+#define KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, thr)                         \
+  KMP_DEBUG_ASSERT(                                                            \
+      __kmp_tasking_mode != tskm_task_teams || team->t.t_nproc == 1 ||         \
+      thr->th.th_task_team == team->t.t_task_team[thr->th.th_task_state])
+#else
+#define KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, thr) /* Nothing */
+#endif
 
 extern int __kmp_is_address_mapped(void *addr);
 extern kmp_uint64 __kmp_hardware_timestamp(void);
diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp
index e9ab15f1723b..b381694c0953 100644
--- a/openmp/runtime/src/kmp_barrier.cpp
+++ b/openmp/runtime/src/kmp_barrier.cpp
@@ -1858,8 +1858,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
     }
 
     if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
-      // use 0 to only setup the current team if nthreads > 1
-      __kmp_task_team_setup(this_thr, team, 0);
+      __kmp_task_team_setup(this_thr, team);
 
     if (cancellable) {
       cancelled = __kmp_linear_barrier_gather_cancellable(
@@ -2042,7 +2041,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
             this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
                 TRUE);
         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
-        __kmp_task_team_setup(this_thr, team, 0);
+        __kmp_task_team_setup(this_thr, team);
 
 #if USE_ITT_BUILD
         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
@@ -2243,9 +2242,7 @@ void __kmp_join_barrier(int gtid) {
                   __kmp_gtid_from_thread(this_thr), team_id,
                   team->t.t_task_team[this_thr->th.th_task_state],
                   this_thr->th.th_task_team));
-    if (this_thr->th.th_task_team)
-      KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
-                       team->t.t_task_team[this_thr->th.th_task_state]);
+    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
   }
 #endif /* KMP_DEBUG */
 
@@ -2440,10 +2437,8 @@ void __kmp_fork_barrier(int gtid, int tid) {
     }
 #endif
 
-    if (__kmp_tasking_mode != tskm_immediate_exec) {
-      // 0 indicates setup current task team if nthreads > 1
-      __kmp_task_team_setup(this_thr, team, 0);
-    }
+    if (__kmp_tasking_mode != tskm_immediate_exec)
+      __kmp_task_team_setup(this_thr, team);
 
     /* The primary thread may have changed its blocktime between join barrier
        and fork barrier. Copy the blocktime info to the thread, where
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 0268f692ff7f..f45fe646d1d9 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -654,6 +654,12 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
         serial_team->t.t_dispatch->th_disp_buffer->next;
     __kmp_free(disp_buffer);
   }
+
+  /* pop the task team stack */
+  if (serial_team->t.t_serialized > 1) {
+    __kmp_pop_task_team_node(this_thr, serial_team);
+  }
+
   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
 
   --serial_team->t.t_serialized;
@@ -692,6 +698,11 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
     this_thr->th.th_current_task->td_flags.executing = 1;
 
     if (__kmp_tasking_mode != tskm_immediate_exec) {
+      // Restore task state from serial team structure
+      KMP_DEBUG_ASSERT(serial_team->t.t_primary_task_state == 0 ||
+                       serial_team->t.t_primary_task_state == 1);
+      this_thr->th.th_task_state =
+          (kmp_uint8)serial_team->t.t_primary_task_state;
       // Copy the task team from the new child / old parent team to the thread.
       this_thr->th.th_task_team =
           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index 95acf4dff4cb..4be67f3b5987 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -1042,6 +1042,41 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
     }
   }
 
+  // Take care of primary thread's task state
+  if (__kmp_tasking_mode != tskm_immediate_exec) {
+    if (use_hot_team) {
+      KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
+      KA_TRACE(
+          20,
+          ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
+           "%p, new task_team %p / team %p\n",
+           __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
+           team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
+           team));
+
+      // Store primary thread's current task state on new team
+      KMP_CHECK_UPDATE(team->t.t_primary_task_state,
+                       master_th->th.th_task_state);
+
+      // Restore primary thread's task state to hot team's state
+      // by using thread 1's task state
+      if (team->t.t_nproc > 1) {
+        KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
+                         team->t.t_threads[1]->th.th_task_state == 1);
+        KMP_CHECK_UPDATE(master_th->th.th_task_state,
+                         team->t.t_threads[1]->th.th_task_state);
+      } else {
+        master_th->th.th_task_state = 0;
+      }
+    } else {
+      // Store primary thread's current task_state on new team
+      KMP_CHECK_UPDATE(team->t.t_primary_task_state,
+                       master_th->th.th_task_state);
+      // Are not using hot team, so set task state to 0.
+      master_th->th.th_task_state = 0;
+    }
+  }
+
   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
     for (i = 0; i < team->t.t_nproc; i++) {
       kmp_info_t *thr = team->t.t_threads[i];
@@ -1145,18 +1180,6 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
   KMP_DEBUG_ASSERT(serial_team);
   KMP_MB();
 
-  if (__kmp_tasking_mode != tskm_immediate_exec) {
-    KMP_DEBUG_ASSERT(
-        this_thr->th.th_task_team ==
-        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
-    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
-                     NULL);
-    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
-                  "team %p, new task_team = NULL\n",
-                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
-    this_thr->th.th_task_team = NULL;
-  }
-
   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
     proc_bind = proc_bind_false;
@@ -1242,6 +1265,8 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
     serial_team->t.t_serialized = 1;
     serial_team->t.t_nproc = 1;
     serial_team->t.t_parent = this_thr->th.th_team;
+    // Save previous team's task state on serial team structure
+    serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
     this_thr->th.th_team = serial_team;
     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
@@ -1281,6 +1306,8 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
     this_thr->th.th_team_nproc = 1;
     this_thr->th.th_team_master = this_thr;
     this_thr->th.th_team_serialized = 1;
+    this_thr->th.th_task_team = NULL;
+    this_thr->th.th_task_state = 0;
 
     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
@@ -1332,6 +1359,9 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
     }
     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
 
+    /* allocate/push task team stack */
+    __kmp_push_task_team_node(this_thr, serial_team);
+
     KMP_MB();
   }
   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
@@ -1985,17 +2015,12 @@ int __kmp_fork_call(ident_t *loc, int gtid,
                                  ap);
     } // End parallel closely nested in teams construct
 
-#if KMP_DEBUG
-    if (__kmp_tasking_mode != tskm_immediate_exec) {
-      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
-                       parent_team->t.t_task_team[master_th->th.th_task_state]);
-    }
-#endif
-
     // Need this to happen before we determine the number of threads, not while
     // we are allocating the team
     //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
 
+    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
+
     // Determine the number of threads
     int enter_teams =
         __kmp_is_entering_teams(active_level, level, teams_level, ap);
@@ -2186,64 +2211,6 @@ int __kmp_fork_call(ident_t *loc, int gtid,
       ompd_bp_parallel_begin();
 #endif
 
-    if (__kmp_tasking_mode != tskm_immediate_exec) {
-      // Set primary thread's task team to team's task team. Unless this is hot
-      // team, it should be NULL.
-      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
-                       parent_team->t.t_task_team[master_th->th.th_task_state]);
-      KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
-                    "%p, new task_team %p / team %p\n",
-                    __kmp_gtid_from_thread(master_th),
-                    master_th->th.th_task_team, parent_team,
-                    team->t.t_task_team[master_th->th.th_task_state], team));
-
-      if (active_level || master_th->th.th_task_team) {
-        // Take a memo of primary thread's task_state
-        KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
-        if (master_th->th.th_task_state_top >=
-            master_th->th.th_task_state_stack_sz) { // increase size
-          kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
-          kmp_uint8 *old_stack, *new_stack;
-          kmp_uint32 i;
-          new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
-          for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
-            new_stack[i] = master_th->th.th_task_state_memo_stack[i];
-          }
-          for (i = master_th->th.th_task_state_stack_sz; i < new_size;
-               ++i) { // zero-init rest of stack
-            new_stack[i] = 0;
-          }
-          old_stack = master_th->th.th_task_state_memo_stack;
-          master_th->th.th_task_state_memo_stack = new_stack;
-          master_th->th.th_task_state_stack_sz = new_size;
-          __kmp_free(old_stack);
-        }
-        // Store primary thread's task_state on stack
-        master_th->th
-            .th_task_state_memo_stack[master_th->th.th_task_state_top] =
-            master_th->th.th_task_state;
-        master_th->th.th_task_state_top++;
-#if KMP_NESTED_HOT_TEAMS
-        if (master_th->th.th_hot_teams &&
-            active_level < __kmp_hot_teams_max_level &&
-            team == master_th->th.th_hot_teams[active_level].hot_team) {
-          // Restore primary thread's nested state if nested hot team
-          master_th->th.th_task_state =
-              master_th->th
-                  .th_task_state_memo_stack[master_th->th.th_task_state_top];
-        } else {
-#endif
-          master_th->th.th_task_state = 0;
-#if KMP_NESTED_HOT_TEAMS
-        }
-#endif
-      }
-#if !KMP_NESTED_HOT_TEAMS
-      KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
-                       (team == root->r.r_hot_team));
-#endif
-    }
-
     KA_TRACE(
         20,
         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
@@ -2451,8 +2418,7 @@ void __kmp_join_call(ident_t *loc, int gtid
                   __kmp_gtid_from_thread(master_th), team,
                   team->t.t_task_team[master_th->th.th_task_state],
                   master_th->th.th_task_team));
-    KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
-                     team->t.t_task_team[master_th->th.th_task_state]);
+    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
   }
 #endif
 
@@ -2690,24 +2656,11 @@ void __kmp_join_call(ident_t *loc, int gtid
   }
 
   if (__kmp_tasking_mode != tskm_immediate_exec) {
-    if (master_th->th.th_task_state_top >
-        0) { // Restore task state from memo stack
-      KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
-      // Remember primary thread's state if we re-use this nested hot team
-      master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
-          master_th->th.th_task_state;
-      --master_th->th.th_task_state_top; // pop
-      // Now restore state at this level
-      master_th->th.th_task_state =
-          master_th->th
-              .th_task_state_memo_stack[master_th->th.th_task_state_top];
-    } else if (team != root->r.r_hot_team) {
-      // Reset the task state of primary thread if we are not hot team because
-      // in this case all the worker threads will be free, and their task state
-      // will be reset. If not reset the primary's, the task state will be
-      // inconsistent.
-      master_th->th.th_task_state = 0;
-    }
+    // Restore primary thread's task state from team structure
+    KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
+                     team->t.t_primary_task_state == 1);
+    master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
+
     // Copy the task team from the parent team to the primary thread
     master_th->th.th_task_team =
         parent_team->t.t_task_team[master_th->th.th_task_state];
@@ -4396,17 +4349,6 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
 
   this_thr->th.th_next_pool = NULL;
 
-  if (!this_thr->th.th_task_state_memo_stack) {
-    size_t i;
-    this_thr->th.th_task_state_memo_stack =
-        (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
-    this_thr->th.th_task_state_top = 0;
-    this_thr->th.th_task_state_stack_sz = 4;
-    for (i = 0; i < this_thr->th.th_task_state_stack_sz;
-         ++i) // zero init the stack
-      this_thr->th.th_task_state_memo_stack[i] = 0;
-  }
-
   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
 
@@ -4463,8 +4405,6 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
     TCW_4(__kmp_nth, __kmp_nth + 1);
 
     new_thr->th.th_task_state = 0;
-    new_thr->th.th_task_state_top = 0;
-    new_thr->th.th_task_state_stack_sz = 4;
 
     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
       // Make sure pool thread has transitioned to waiting on own thread struct
@@ -5262,6 +5202,15 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
         // Activate team threads via th_used_in_team
         __kmp_add_threads_to_team(team, new_nproc);
       }
+      // When decreasing team size, threads no longer in the team should
+      // unref task team.
+      if (__kmp_tasking_mode != tskm_immediate_exec) {
+        for (f = new_nproc; f < team->t.t_nproc; f++) {
+          kmp_info_t *th = team->t.t_threads[f];
+          KMP_DEBUG_ASSERT(th);
+          th->th.th_task_team = NULL;
+        }
+      }
 #if KMP_NESTED_HOT_TEAMS
       if (__kmp_hot_teams_mode == 0) {
         // AC: saved number of threads should correspond to team's value in this
@@ -5272,11 +5221,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
         /* release the extra threads we don't need any more */
         for (f = new_nproc; f < team->t.t_nproc; f++) {
           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
-          if (__kmp_tasking_mode != tskm_immediate_exec) {
-            // When decreasing team size, threads no longer in the team should
-            // unref task team.
-            team->t.t_threads[f]->th.th_task_team = NULL;
-          }
           __kmp_free_thread(team->t.t_threads[f]);
           team->t.t_threads[f] = NULL;
         }
@@ -6248,11 +6192,6 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
     thread->th.th_pri_common = NULL;
   }
 
-  if (thread->th.th_task_state_memo_stack != NULL) {
-    __kmp_free(thread->th.th_task_state_memo_stack);
-    thread->th.th_task_state_memo_stack = NULL;
-  }
-
 #if KMP_USE_BGET
   if (thread->th.th_local.bget_data != NULL) {
     __kmp_finalize_bget(thread);
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index b9c8289b5c51..8b6092cb1085 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -6420,6 +6420,8 @@ void __kmp_env_initialize(char const *string) {
         }
         if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
             (__kmp_nested_proc_bind.bind_types[0] != proc_bind_default)) {
+          if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)
+            __kmp_affinity.type = affinity_none;
           if (__kmp_affinity.type == affinity_default) {
             __kmp_affinity.type = affinity_compact;
             __kmp_affinity.flags.dups = FALSE;
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 6303bb0d63f0..a78202749449 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -1511,8 +1511,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
       KA_TRACE(30,
                ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                 gtid));
-      // 1 indicates setup the current team regardless of nthreads
-      __kmp_task_team_setup(thread, team, 1);
+      __kmp_task_team_setup(thread, team);
       thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
     }
     kmp_task_team_t *task_team = thread->th.th_task_team;
@@ -3390,8 +3389,6 @@ static inline int __kmp_execute_tasks_template(
 
   nthreads = task_team->tt.tt_nproc;
   unfinished_threads = &(task_team->tt.tt_unfinished_threads);
-  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
-                   task_team->tt.tt_hidden_helper_task_encountered);
   KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
 
   while (1) { // Outer loop keeps trying to find tasks in case of single thread
@@ -3943,6 +3940,20 @@ static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
   __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
 }
 
+static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
+                                        kmp_team_t *team) {
+  int team_nth = team->t.t_nproc;
+  // Only need to init if task team is isn't active or team size changed
+  if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
+    TCW_4(task_team->tt.tt_found_tasks, FALSE);
+    TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
+    TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
+    TCW_4(task_team->tt.tt_nproc, team_nth);
+    KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
+    TCW_4(task_team->tt.tt_active, TRUE);
+  }
+}
+
 // __kmp_allocate_task_team:
 // Allocates a task team associated with a specific team, taking it from
 // the global task team free list if possible.  Also initializes data
@@ -3950,7 +3961,6 @@ static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
                                                  kmp_team_t *team) {
   kmp_task_team_t *task_team = NULL;
-  int nthreads;
 
   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
                 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
@@ -3992,14 +4002,7 @@ static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
     // task_team->tt.tt_next = NULL;
   }
 
-  TCW_4(task_team->tt.tt_found_tasks, FALSE);
-  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
-  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
-  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
-
-  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
-  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
-  TCW_4(task_team->tt.tt_active, TRUE);
+  __kmp_task_team_init(task_team, team);
 
   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
                 "unfinished_threads init'd to %d\n",
@@ -4053,6 +4056,40 @@ void __kmp_reap_task_teams(void) {
   }
 }
 
+// View the array of two task team pointers as a pair of pointers:
+//  1) a single task_team pointer
+//  2) next pointer for stack
+// Serial teams can create a stack of task teams for nested serial teams.
+void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
+  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+  kmp_task_team_list_t *current =
+      (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
+  kmp_task_team_list_t *node =
+      (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));
+  node->task_team = current->task_team;
+  node->next = current->next;
+  thread->th.th_task_team = current->task_team = NULL;
+  current->next = node;
+}
+
+// Serial team pops a task team off the stack
+void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
+  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+  kmp_task_team_list_t *current =
+      (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
+  if (current->task_team) {
+    __kmp_free_task_team(thread, current->task_team);
+  }
+  kmp_task_team_list_t *next = current->next;
+  if (next) {
+    current->task_team = next->task_team;
+    current->next = next->next;
+    KMP_DEBUG_ASSERT(next != current);
+    __kmp_free(next);
+    thread->th.th_task_team = current->task_team;
+  }
+}
+
 // __kmp_wait_to_unref_task_teams:
 // Some threads could still be in the fork barrier release code, possibly
 // trying to steal tasks.  Wait for each thread to unreference its task team.
@@ -4117,55 +4154,34 @@ void __kmp_wait_to_unref_task_teams(void) {
   }
 }
 
-void __kmp_shift_task_state_stack(kmp_info_t *this_thr, kmp_uint8 value) {
-  // Shift values from th_task_state_top+1 to task_state_stack_sz
-  if (this_thr->th.th_task_state_top + 1 >=
-      this_thr->th.th_task_state_stack_sz) { // increase size
-    kmp_uint32 new_size = 2 * this_thr->th.th_task_state_stack_sz;
-    kmp_uint8 *old_stack, *new_stack;
-    kmp_uint32 i;
-    new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
-    for (i = 0; i <= this_thr->th.th_task_state_top; ++i) {
-      new_stack[i] = this_thr->th.th_task_state_memo_stack[i];
-    }
-    // If we need to reallocate do the shift at the same time.
-    for (; i < this_thr->th.th_task_state_stack_sz; ++i) {
-      new_stack[i + 1] = this_thr->th.th_task_state_memo_stack[i];
-    }
-    for (i = this_thr->th.th_task_state_stack_sz; i < new_size;
-         ++i) { // zero-init rest of stack
-      new_stack[i] = 0;
-    }
-    old_stack = this_thr->th.th_task_state_memo_stack;
-    this_thr->th.th_task_state_memo_stack = new_stack;
-    this_thr->th.th_task_state_stack_sz = new_size;
-    __kmp_free(old_stack);
-  } else {
-    kmp_uint8 *end;
-    kmp_uint32 i;
-
-    end = &this_thr->th
-               .th_task_state_memo_stack[this_thr->th.th_task_state_stack_sz];
-
-    for (i = this_thr->th.th_task_state_stack_sz - 1;
-         i > this_thr->th.th_task_state_top; i--, end--)
-      end[0] = end[-1];
-  }
-  this_thr->th.th_task_state_memo_stack[this_thr->th.th_task_state_top + 1] =
-      value;
-}
-
 // __kmp_task_team_setup:  Create a task_team for the current team, but use
 // an already created, unused one if it already exists.
-void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
+void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
 
+  // For the serial and root teams, setup the first task team pointer to point
+  // to task team. The other pointer is a stack of task teams from previous
+  // serial levels.
+  if (team == this_thr->th.th_serial_team ||
+      team == this_thr->th.th_root->r.r_root_team) {
+    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+    if (team->t.t_task_team[0] == NULL) {
+      team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
+      KA_TRACE(
+          20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
+               " for serial/root team %p\n",
+               __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
+
+    } else
+      __kmp_task_team_init(team->t.t_task_team[0], team);
+    return;
+  }
+
   // If this task_team hasn't been created yet, allocate it. It will be used in
   // the region after the next.
   // If it exists, it is the current task team and shouldn't be touched yet as
   // it may still be in use.
-  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
-      (always || team->t.t_nproc > 1)) {
+  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
     team->t.t_task_team[this_thr->th.th_task_state] =
         __kmp_allocate_task_team(this_thr, team);
     KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
@@ -4174,52 +4190,31 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
                   team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
                   this_thr->th.th_task_state));
   }
-  if (this_thr->th.th_task_state == 1 && always && team->t.t_nproc == 1) {
-    // fix task state stack to adjust for proxy and helper tasks
-    KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d needs to shift stack"
-                  " for team %d at parity=%d\n",
-                  __kmp_gtid_from_thread(this_thr), team->t.t_id,
-                  this_thr->th.th_task_state));
-    __kmp_shift_task_state_stack(this_thr, this_thr->th.th_task_state);
-  }
 
   // After threads exit the release, they will call sync, and then point to this
   // other task_team; make sure it is allocated and properly initialized. As
   // threads spin in the barrier release phase, they will continue to use the
   // previous task_team struct(above), until they receive the signal to stop
   // checking for tasks (they can't safely reference the kmp_team_t struct,
-  // which could be reallocated by the primary thread). No task teams are formed
-  // for serialized teams.
-  if (team->t.t_nproc > 1) {
-    int other_team = 1 - this_thr->th.th_task_state;
-    KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
-    if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
-      team->t.t_task_team[other_team] =
-          __kmp_allocate_task_team(this_thr, team);
-      KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
-                    "task_team %p for team %d at parity=%d\n",
-                    __kmp_gtid_from_thread(this_thr),
-                    team->t.t_task_team[other_team], team->t.t_id, other_team));
-    } else { // Leave the old task team struct in place for the upcoming region;
-      // adjust as needed
-      kmp_task_team_t *task_team = team->t.t_task_team[other_team];
-      if (!task_team->tt.tt_active ||
-          team->t.t_nproc != task_team->tt.tt_nproc) {
-        TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
-        TCW_4(task_team->tt.tt_found_tasks, FALSE);
-        TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
-        TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
-        KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
-                          team->t.t_nproc);
-        TCW_4(task_team->tt.tt_active, TRUE);
-      }
-      // if team size has changed, the first thread to enable tasking will
-      // realloc threads_data if necessary
-      KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
-                    "%p for team %d at parity=%d\n",
-                    __kmp_gtid_from_thread(this_thr),
-                    team->t.t_task_team[other_team], team->t.t_id, other_team));
-    }
+  // which could be reallocated by the primary thread).
+  int other_team = 1 - this_thr->th.th_task_state;
+  KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
+  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
+    team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
+    KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
+                  "task_team %p for team %d at parity=%d\n",
+                  __kmp_gtid_from_thread(this_thr),
+                  team->t.t_task_team[other_team], team->t.t_id, other_team));
+  } else { // Leave the old task team struct in place for the upcoming region;
+    // adjust as needed
+    kmp_task_team_t *task_team = team->t.t_task_team[other_team];
+    __kmp_task_team_init(task_team, team);
+    // if team size has changed, the first thread to enable tasking will
+    // realloc threads_data if necessary
+    KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
+                  "%p for team %d at parity=%d\n",
+                  __kmp_gtid_from_thread(this_thr),
+                  team->t.t_task_team[other_team], team->t.t_id, other_team));
   }
 
   // For regular thread, task enabling should be called when the task is going
@@ -4245,9 +4240,11 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
 
 // __kmp_task_team_sync: Propagation of task team data from team to threads
 // which happens just after the release phase of a team barrier.  This may be
-// called by any thread, but only for teams with # threads > 1.
+// called by any thread. This is not called for serial or root teams.
 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+  KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
+  KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
 
   // Toggle the th_task_state field, to switch which task_team this thread
   // refers to
@@ -4265,8 +4262,7 @@ void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
 }
 
 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
-// barrier gather phase. Only called by primary thread if #threads in team > 1
-// or if proxy tasks were created.
+// barrier gather phase. Only called by the primary thread.
 //
 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
 // by passing in 0 optionally as the last argument. When wait is zero, primary
@@ -4300,9 +4296,6 @@ void __kmp_task_team_wait(
         ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
          "setting active to false, setting local and team's pointer to NULL\n",
          __kmp_gtid_from_thread(this_thr), task_team));
-    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
-                     task_team->tt.tt_found_proxy_tasks == TRUE ||
-                     task_team->tt.tt_hidden_helper_task_encountered == TRUE);
     TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
     TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
     KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S
index 201949003c01..5b614e26a833 100644
--- a/openmp/runtime/src/z_Linux_asm.S
+++ b/openmp/runtime/src/z_Linux_asm.S
@@ -1150,6 +1150,9 @@ KMP_LABEL(kmp_invoke_pass_parms):	// put 1st - 6th parms to pkfn in registers.
 	movq	%rdi, %rbx	// pkfn -> %rbx
 	leaq	__gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
 	leaq	__tid(%rbp), %rsi  // &tid -> %rsi (store 2nd parm to pkfn)
+	// Check if argc is 0
+	cmpq $0, %rax
+	je KMP_LABEL(kmp_no_args) // Jump ahead
 
 	movq	%r8, %r11	// p_argv -> %r11
 
@@ -1195,6 +1198,7 @@ KMP_LABEL(kmp_1_exit):
 	cmovnsq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
 #endif // KMP_MIC
 
+KMP_LABEL(kmp_no_args):
 	call	*%rbx		// call (*pkfn)();
 	movq	$1, %rax	// move 1 into return register;
 
diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
index affb577a5393..7c90740ae5bd 100644
--- a/openmp/runtime/src/z_Linux_util.cpp
+++ b/openmp/runtime/src/z_Linux_util.cpp
@@ -31,6 +31,7 @@
 #include <sys/resource.h>
 #if KMP_OS_AIX
 #include <sys/ldr.h>
+#include <libperfstat.h>
 #else
 #include <sys/syscall.h>
 #endif
@@ -2427,6 +2428,79 @@ int __kmp_get_load_balance(int max) {
   return ret_avg;
 }
 
+#elif KMP_OS_AIX
+
+// The function returns number of running (not sleeping) threads, or -1 in case
+// of error.
+int __kmp_get_load_balance(int max) {
+
+  static int glb_running_threads = 0; // Saved count of the running threads for
+                                      // the thread balance algorithm.
+  static double glb_call_time = 0; // Thread balance algorithm call time.
+  int running_threads = 0; // Number of running threads in the system.
+
+  double call_time = 0.0;
+
+  __kmp_elapsed(&call_time);
+
+  if (glb_call_time &&
+      (call_time - glb_call_time < __kmp_load_balance_interval))
+    return glb_running_threads;
+
+  glb_call_time = call_time;
+
+  if (max <= 0) {
+    max = INT_MAX;
+  }
+
+  // Check how many perfstat_cpu_t structures are available.
+  int logical_cpus = perfstat_cpu(NULL, NULL, sizeof(perfstat_cpu_t), 0);
+  if (logical_cpus <= 0) {
+    glb_call_time = -1;
+    return -1;
+  }
+
+  perfstat_cpu_t *cpu_stat = (perfstat_cpu_t *)KMP_INTERNAL_MALLOC(
+      logical_cpus * sizeof(perfstat_cpu_t));
+  if (cpu_stat == NULL) {
+    glb_call_time = -1;
+    return -1;
+  }
+
+  // Set first CPU as the name of the first logical CPU for which the info is
+  // desired.
+  perfstat_id_t first_cpu_name;
+  strcpy(first_cpu_name.name, FIRST_CPU);
+
+  // Get the stat info of logical CPUs.
+  int rc = perfstat_cpu(&first_cpu_name, cpu_stat, sizeof(perfstat_cpu_t),
+                        logical_cpus);
+  KMP_DEBUG_ASSERT(rc == logical_cpus);
+  if (rc <= 0) {
+    KMP_INTERNAL_FREE(cpu_stat);
+    glb_call_time = -1;
+    return -1;
+  }
+  for (int i = 0; i < logical_cpus; ++i) {
+    running_threads += cpu_stat[i].runque;
+    if (running_threads >= max)
+      break;
+  }
+
+  // There _might_ be a timing hole where the thread executing this
+  // code gets skipped in the load balance, and running_threads is 0.
+  // Assert in the debug builds only!!!
+  KMP_DEBUG_ASSERT(running_threads > 0);
+  if (running_threads <= 0)
+    running_threads = 1;
+
+  KMP_INTERNAL_FREE(cpu_stat);
+
+  glb_running_threads = running_threads;
+
+  return running_threads;
+}
+
 #else // Linux* OS
 
 // The function returns number of running (not sleeping) threads, or -1 in case
@@ -2498,14 +2572,9 @@ int __kmp_get_load_balance(int max) {
 
   proc_entry = readdir(proc_dir);
   while (proc_entry != NULL) {
-#if KMP_OS_AIX
-    // Proc entry name starts with a digit. Assume it is a  process' directory.
-    if (isdigit(proc_entry->d_name[0])) {
-#else
     // Proc entry is a directory and name starts with a digit. Assume it is a
     // process' directory.
     if (proc_entry->d_type == DT_DIR && isdigit(proc_entry->d_name[0])) {
-#endif
 
 #ifdef KMP_DEBUG
       ++total_processes;
@@ -2549,11 +2618,7 @@ int __kmp_get_load_balance(int max) {
         task_entry = readdir(task_dir);
         while (task_entry != NULL) {
           // It is a directory and name starts with a digit.
-#if KMP_OS_AIX
-          if (isdigit(task_entry->d_name[0])) {
-#else
           if (proc_entry->d_type == DT_DIR && isdigit(task_entry->d_name[0])) {
-#endif
 
             // Construct complete stat file path. Easiest way would be:
             //  __kmp_str_buf_print( & stat_path, "%s/%s/stat", task_path.str,
diff --git a/openmp/runtime/test/affinity/redetect.c b/openmp/runtime/test/affinity/redetect.c
index dba83b72cc42..4b96d1bd92ee 100644
--- a/openmp/runtime/test/affinity/redetect.c
+++ b/openmp/runtime/test/affinity/redetect.c
@@ -1,4 +1,5 @@
 // RUN: %libomp-compile
+// RUN: %libomp-run
 // RUN: env KMP_AFFINITY=none %libomp-run
 // REQUIRES: linux
 
diff --git a/openmp/runtime/test/misc_bugs/omp__kmpc_fork_call_if.c b/openmp/runtime/test/misc_bugs/omp__kmpc_fork_call_if.c
new file mode 100644
index 000000000000..60d4bff96787
--- /dev/null
+++ b/openmp/runtime/test/misc_bugs/omp__kmpc_fork_call_if.c
@@ -0,0 +1,36 @@
+// RUN: %libomp-compile && %t | FileCheck %s
+
+#include <stdio.h>
+#include <omp.h>
+
+typedef int32_t kmp_int32;
+typedef void *ident_t;
+typedef void *kmpc_micro;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void __kmpc_fork_call_if(ident_t *loc, kmp_int32 argc,
+                                kmpc_micro microtask, kmp_int32 cond,
+                                void *args);
+#ifdef __cplusplus
+}
+#endif
+
+// Microtask function for parallel region
+void microtask(int *global_tid, int *bound_tid) {
+  // CHECK: PASS
+  if (omp_in_parallel()) {
+    printf("FAIL\n");
+  } else {
+    printf("PASS\n");
+  }
+}
+
+int main() {
+  // Condition for parallelization (false in this case)
+  int cond = 0;
+  // Call __kmpc_fork_call_if
+  __kmpc_fork_call_if(NULL, 0, microtask, cond, NULL);
+  return 0;
+}
diff --git a/openmp/runtime/test/target/issue-81488.c b/openmp/runtime/test/target/issue-81488.c
new file mode 100644
index 000000000000..adac7d699446
--- /dev/null
+++ b/openmp/runtime/test/target/issue-81488.c
@@ -0,0 +1,36 @@
+// RUN: %libomp-compile
+// RUN: env OMP_NUM_THREADS=1 LIBOMP_USE_HIDDEN_HELPER_TASK=1 \
+// RUN:     LIBOMP_NUM_HIDDEN_HELPER_THREADS=8 %libomp-run
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+#define Nz 8
+#define DEVICE_ID 0
+
+int a[Nz];
+
+int main(void) {
+  for (int n = 0; n < 10; ++n) {
+    for (int k = 0; k < Nz; ++k) {
+      a[k] = -1;
+    }
+#pragma omp parallel shared(a)
+    {
+#pragma omp single
+      {
+#pragma omp target teams distribute parallel for nowait device(DEVICE_ID)      \
+    map(tofrom : a[0 : 8])
+        for (int i = 0; i < Nz; ++i) {
+          a[i] = i;
+        }
+      }
+#pragma omp barrier
+    }
+    for (int k = 0; k < Nz; ++k) {
+      printf("a[%d] = %d\n", k, a[k]);
+    }
+  }
+  return 0;
+}
diff --git a/openmp/runtime/test/tasking/issue-50602.c b/openmp/runtime/test/tasking/issue-50602.c
new file mode 100644
index 000000000000..b691204c480e
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-50602.c
@@ -0,0 +1,40 @@
+// RUN: %libomp-compile-and-run
+// RUN: env OMP_NUM_THREADS=1 %libomp-run
+// RUN: %libomp-compile -DUSE_HIDDEN_HELPERS=1
+// RUN: %libomp-run
+// RUN: env OMP_NUM_THREADS=1 %libomp-run
+#include <omp.h>
+
+int main(int argc, char *argv[]) {
+  int i;
+
+  omp_set_max_active_levels(1);
+  omp_set_dynamic(0);
+
+  for (i = 0; i < 10; ++i) {
+#pragma omp parallel
+    {
+#ifndef USE_HIDDEN_HELPERS
+      omp_event_handle_t event;
+#endif
+      int a = 0;
+
+#ifdef USE_HIDDEN_HELPERS
+#pragma omp target map(tofrom : a) nowait
+#else
+#pragma omp task shared(a) detach(event)
+#endif
+      { a = 1; }
+
+#pragma omp parallel
+      { a = 2; }
+
+#ifndef USE_HIDDEN_HELPERS
+      omp_fulfill_event(event);
+#endif
+
+#pragma omp taskwait
+    }
+  }
+  return 0;
+}
diff --git a/openmp/runtime/test/tasking/issue-69368.c b/openmp/runtime/test/tasking/issue-69368.c
new file mode 100644
index 000000000000..57bd7412a51e
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-69368.c
@@ -0,0 +1,27 @@
+// RUN: %libomp-compile-and-run
+// RUN: env OMP_NUM_THREADS=1 %libomp-run
+
+int main() {
+  int i;
+  int a[2];
+  volatile int attempt = 0;
+
+  for (i = 0; i < 10; ++i) {
+    a[0] = a[1] = 0;
+#pragma omp parallel for
+    for (int i = 0; i < 2; i++) {
+      a[i] = 2;
+    }
+    if (a[0] != 2 || a[1] != 2)
+      return 1;
+
+#pragma omp teams distribute parallel for if (attempt >= 2)
+    for (int i = 0; i < 2; i++) {
+      a[i] = 1;
+    }
+    if (a[0] != 1 || a[1] != 1)
+      return 1;
+  }
+
+  return 0;
+}
diff --git a/openmp/runtime/test/tasking/issue-69733.c b/openmp/runtime/test/tasking/issue-69733.c
new file mode 100644
index 000000000000..5775b016b7b4
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-69733.c
@@ -0,0 +1,147 @@
+// RUN: %libomp-compile-and-run
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+int a;
+
+void inc_a() {
+#pragma omp atomic
+  a++;
+}
+
+void root_team_detached() {
+  a = 0;
+  omp_event_handle_t ev;
+#pragma omp task detach(ev)
+  inc_a();
+  omp_fulfill_event(ev);
+  if (a != 1) {
+    fprintf(stderr, "error: root_team_detached(): a != 1\n");
+    exit(EXIT_FAILURE);
+  }
+}
+
+void root_team_hidden_helpers() {
+  a = 0;
+#pragma omp target nowait
+  inc_a();
+
+#pragma omp taskwait
+
+  if (a != 1) {
+    fprintf(stderr, "error: root_team_hidden_helpers(): a != 1\n");
+    exit(EXIT_FAILURE);
+  }
+}
+
+void parallel_detached(int nth1) {
+  a = 0;
+  omp_event_handle_t *evs =
+      (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nth1);
+#pragma omp parallel num_threads(nth1)
+  {
+    int tid = omp_get_thread_num();
+    omp_event_handle_t e = evs[tid];
+#pragma omp task detach(e)
+    inc_a();
+    omp_fulfill_event(e);
+  }
+  free(evs);
+  if (a != nth1) {
+    fprintf(stderr, "error: parallel_detached(): a (%d) != %d\n", a, nth1);
+    exit(EXIT_FAILURE);
+  }
+}
+
+void parallel_hidden_helpers(int nth1) {
+  a = 0;
+#pragma omp parallel num_threads(nth1)
+  {
+#pragma omp target nowait
+    inc_a();
+  }
+  if (a != nth1) {
+    fprintf(stderr, "error: parallel_hidden_helpers(): a (%d) != %d\n", a,
+            nth1);
+    exit(EXIT_FAILURE);
+  }
+}
+
+void nested_parallel_detached(int nth1, int nth2) {
+  a = 0;
+  omp_event_handle_t **evs =
+      (omp_event_handle_t **)malloc(sizeof(omp_event_handle_t *) * nth1);
+#pragma omp parallel num_threads(nth1)
+  {
+    int tid = omp_get_thread_num();
+    evs[tid] = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nth2);
+#pragma omp parallel num_threads(nth2) shared(tid)
+    {
+      int tid2 = omp_get_thread_num();
+      omp_event_handle_t e = evs[tid][tid2];
+#pragma omp task detach(e)
+      inc_a();
+      omp_fulfill_event(e);
+    }
+    free(evs[tid]);
+  }
+  free(evs);
+  if (a != nth1 * nth2) {
+    fprintf(stderr, "error: nested_parallel_detached(): a (%d) != %d * %d\n", a,
+            nth1, nth2);
+    exit(EXIT_FAILURE);
+  }
+}
+
+void nested_parallel_hidden_helpers(int nth1, int nth2) {
+  a = 0;
+#pragma omp parallel num_threads(nth1)
+  {
+#pragma omp parallel num_threads(nth2)
+    {
+#pragma omp target nowait
+      inc_a();
+    }
+  }
+  if (a != nth1 * nth2) {
+    fprintf(stderr,
+            "error: nested_parallel_hidden_helpers(): a (%d) != %d * %d\n", a,
+            nth1, nth2);
+    exit(EXIT_FAILURE);
+  }
+}
+
+int main() {
+  int i, nth1, nth2;
+
+  omp_set_max_active_levels(2);
+  omp_set_dynamic(0);
+
+  for (i = 0; i < 10; ++i)
+    root_team_detached();
+
+  for (i = 0; i < 10; ++i)
+    root_team_hidden_helpers();
+
+  for (i = 0; i < 10; ++i)
+    for (nth1 = 1; nth1 <= 4; ++nth1)
+      parallel_detached(nth1);
+
+  for (i = 0; i < 10; ++i)
+    for (nth1 = 1; nth1 <= 4; ++nth1)
+      parallel_hidden_helpers(nth1);
+
+  for (i = 0; i < 10; ++i)
+    for (nth1 = 1; nth1 <= 4; ++nth1)
+      for (nth2 = 1; nth2 <= 4; ++nth2)
+        nested_parallel_detached(nth1, nth2);
+
+  for (i = 0; i < 10; ++i)
+    for (nth1 = 1; nth1 <= 4; ++nth1)
+      for (nth2 = 1; nth2 <= 4; ++nth2)
+        nested_parallel_hidden_helpers(nth1, nth2);
+
+  return 0;
+}
diff --git a/openmp/runtime/test/tasking/issue-79416.c b/openmp/runtime/test/tasking/issue-79416.c
new file mode 100644
index 000000000000..ee96fce80974
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-79416.c
@@ -0,0 +1,33 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+
+int a;
+
+void run(int nteams, int nth) {
+  a = 0;
+#pragma omp teams num_teams(nteams)
+  {
+#pragma omp parallel num_threads(nth)
+    {
+#pragma omp task
+      {
+#pragma omp atomic
+        a++;
+      }
+    }
+  }
+  if (a == 0)
+    exit(EXIT_FAILURE);
+}
+
+int main() {
+  int i, nteams, nth;
+  for (nteams = 1; nteams <= 2; ++nteams)
+    for (nth = 1; nth <= 3; ++nth)
+      for (i = 0; i < 10; ++i) {
+        printf("run(%d, %d)\n", nteams, nth);
+        run(nteams, nth);
+      }
+  return EXIT_SUCCESS;
+}
diff --git a/openmp/runtime/test/tasking/task_teams_stress_test.cpp b/openmp/runtime/test/tasking/task_teams_stress_test.cpp
new file mode 100644
index 000000000000..e781a895d41f
--- /dev/null
+++ b/openmp/runtime/test/tasking/task_teams_stress_test.cpp
@@ -0,0 +1,318 @@
+// RUN: %libomp-cxx-compile
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=0 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=0 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=1 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=3 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=4 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=5 %libomp-run
+//
+// RUN: %libomp-cxx-compile -DUSE_HIDDEN_HELPERS=1
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=0 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=0 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=1 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=3 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=4 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=5 %libomp-run
+
+// This test stresses the task team mechanism by running a simple
+// increment task over and over with varying number of threads and nesting.
+// The test covers nested serial teams and mixing serial teams with
+// normal active teams.
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+// The number of times to run each test
+#define NTIMES 5
+
+// Regular single increment task
+void task_inc_a(int *a) {
+#pragma omp task
+  {
+#pragma omp atomic
+    (*a)++;
+  }
+}
+
+// Splitting increment task that binary splits the incrementing task
+void task_inc_split_a(int *a, int low, int high) {
+#pragma omp task firstprivate(low, high)
+  {
+    if (low == high) {
+#pragma omp atomic
+      (*a)++;
+    } else if (low < high) {
+      int mid = (high - low) / 2 + low;
+      task_inc_split_a(a, low, mid);
+      task_inc_split_a(a, mid + 1, high);
+    }
+  }
+}
+
+#ifdef USE_HIDDEN_HELPERS
+// Hidden helper tasks force serial regions to create task teams
+void task_inc_a_hidden_helper(int *a) {
+#pragma omp target map(tofrom : a[0]) nowait
+  {
+#pragma omp atomic
+    (*a)++;
+  }
+}
+#else
+// Detached tasks force serial regions to create task teams
+void task_inc_a_detached(int *a, omp_event_handle_t handle) {
+#pragma omp task detach(handle)
+  {
+#pragma omp atomic
+    (*a)++;
+    omp_fulfill_event(handle);
+  }
+}
+#endif
+
+void check_a(int *a, int expected) {
+  if (*a != expected) {
+    fprintf(stderr,
+            "FAIL: a = %d instead of expected = %d. Compile with "
+            "-DVERBOSE for more verbose output.\n",
+            *a, expected);
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Every thread creates a single "increment" task
+void test_tasks(omp_event_handle_t *handles, int expected, int *a) {
+  int tid = omp_get_thread_num();
+
+  task_inc_a(a);
+
+#pragma omp barrier
+  check_a(a, expected);
+#pragma omp barrier
+  check_a(a, expected);
+#pragma omp barrier
+
+#ifdef USE_HIDDEN_HELPERS
+  task_inc_a_hidden_helper(a);
+#else
+  task_inc_a_detached(a, handles[tid]);
+#endif
+
+#pragma omp barrier
+  check_a(a, 2 * expected);
+#pragma omp barrier
+  task_inc_a(a);
+#pragma omp barrier
+  check_a(a, 3 * expected);
+}
+
+// Testing single level of parallelism with increment tasks
+void test_base(int nthreads) {
+#ifdef VERBOSE
+#pragma omp master
+  printf("    test_base(%d)\n", nthreads);
+#endif
+  int a = 0;
+  omp_event_handle_t *handles;
+  handles = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nthreads);
+#pragma omp parallel num_threads(nthreads) shared(a)
+  { test_tasks(handles, nthreads, &a); }
+  free(handles);
+}
+
+// Testing nested parallel with increment tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+void test_nest(int first, int second) {
+#ifdef VERBOSE
+#pragma omp master
+  printf("   test_nest(%d, %d)\n", first, second);
+#endif
+#pragma omp parallel num_threads(first)
+  { test_base(second); }
+}
+
+// Testing 2-level nested parallels with increment tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+// third = nthreads of second nested parallel
+void test_nest2(int first, int second, int third) {
+#ifdef VERBOSE
+#pragma omp master
+  printf("  test_nest2(%d, %d, %d)\n", first, second, third);
+#endif
+#pragma omp parallel num_threads(first)
+  { test_nest(second, third); }
+}
+
+// Testing 3-level nested parallels with increment tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+// third = nthreads of second nested parallel
+// fourth = nthreads of third nested parallel
+void test_nest3(int first, int second, int third, int fourth) {
+#ifdef VERBOSE
+#pragma omp master
+  printf(" test_nest3(%d, %d, %d, %d)\n", first, second, third, fourth);
+#endif
+#pragma omp parallel num_threads(first)
+  { test_nest2(second, third, fourth); }
+}
+
+// Testing 4-level nested parallels with increment tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+// third = nthreads of second nested parallel
+// fourth = nthreads of third nested parallel
+// fifth = nthreads of fourth nested parallel
+void test_nest4(int first, int second, int third, int fourth, int fifth) {
+#ifdef VERBOSE
+#pragma omp master
+  printf("test_nest4(%d, %d, %d, %d, %d)\n", first, second, third, fourth,
+         fifth);
+#endif
+#pragma omp parallel num_threads(first)
+  { test_nest3(second, third, fourth, fifth); }
+}
+
+// Single thread starts a binary splitting "increment" task
+// Detached tasks are still single "increment" task
+void test_tasks_split(omp_event_handle_t *handles, int expected, int *a) {
+  int tid = omp_get_thread_num();
+
+#pragma omp single
+  task_inc_split_a(a, 1, expected); // task team A
+
+#pragma omp barrier
+  check_a(a, expected);
+#pragma omp barrier
+  check_a(a, expected);
+#pragma omp barrier
+
+#ifdef USE_HIDDEN_HELPERS
+  task_inc_a_hidden_helper(a);
+#else
+  task_inc_a_detached(a, handles[tid]);
+#endif
+
+#pragma omp barrier
+  check_a(a, 2 * expected);
+#pragma omp barrier
+#pragma omp single
+  task_inc_split_a(a, 1, expected); // task team B
+#pragma omp barrier
+  check_a(a, 3 * expected);
+}
+
+// Testing single level of parallelism with splitting incrementing tasks
+void test_base_split(int nthreads) {
+#ifdef VERBOSE
+#pragma omp master
+  printf("  test_base_split(%d)\n", nthreads);
+#endif
+  int a = 0;
+  omp_event_handle_t *handles;
+  handles = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nthreads);
+#pragma omp parallel num_threads(nthreads) shared(a)
+  { test_tasks_split(handles, nthreads, &a); }
+  free(handles);
+}
+
+// Testing nested parallels with splitting tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+void test_nest_split(int first, int second) {
+#ifdef VERBOSE
+#pragma omp master
+  printf(" test_nest_split(%d, %d)\n", first, second);
+#endif
+#pragma omp parallel num_threads(first)
+  { test_base_split(second); }
+}
+
+// Testing doubly nested parallels with splitting tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+// third = nthreads of second nested parallel
+void test_nest2_split(int first, int second, int third) {
+#ifdef VERBOSE
+#pragma omp master
+  printf("test_nest2_split(%d, %d, %d)\n", first, second, third);
+#endif
+#pragma omp parallel num_threads(first)
+  { test_nest_split(second, third); }
+}
+
+template <typename... Args>
+void run_ntimes(int n, void (*func)(Args...), Args... args) {
+  for (int i = 0; i < n; ++i) {
+    func(args...);
+  }
+}
+
+int main() {
+  omp_set_max_active_levels(5);
+
+  run_ntimes(NTIMES, test_base, 4);
+  run_ntimes(NTIMES, test_base, 1);
+  run_ntimes(NTIMES, test_base, 8);
+  run_ntimes(NTIMES, test_base, 2);
+  run_ntimes(NTIMES, test_base, 6);
+  run_ntimes(NTIMES, test_nest, 1, 1);
+  run_ntimes(NTIMES, test_nest, 1, 5);
+  run_ntimes(NTIMES, test_nest, 2, 6);
+  run_ntimes(NTIMES, test_nest, 1, 1);
+  run_ntimes(NTIMES, test_nest, 4, 3);
+  run_ntimes(NTIMES, test_nest, 3, 2);
+  run_ntimes(NTIMES, test_nest, 1, 1);
+  run_ntimes(NTIMES, test_nest2, 1, 1, 2);
+  run_ntimes(NTIMES, test_nest2, 1, 2, 1);
+  run_ntimes(NTIMES, test_nest2, 2, 2, 1);
+  run_ntimes(NTIMES, test_nest2, 2, 1, 1);
+  run_ntimes(NTIMES, test_nest2, 4, 2, 1);
+  run_ntimes(NTIMES, test_nest2, 4, 2, 2);
+  run_ntimes(NTIMES, test_nest2, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest2, 4, 2, 2);
+  run_ntimes(NTIMES, test_nest3, 1, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest3, 1, 2, 1, 1);
+  run_ntimes(NTIMES, test_nest3, 1, 1, 2, 1);
+  run_ntimes(NTIMES, test_nest3, 1, 1, 1, 2);
+  run_ntimes(NTIMES, test_nest3, 2, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest4, 2, 1, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest4, 1, 2, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest4, 1, 1, 2, 1, 1);
+  run_ntimes(NTIMES, test_nest4, 1, 1, 1, 2, 1);
+  run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 2);
+  run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest4, 1, 2, 1, 2, 1);
+
+  run_ntimes(NTIMES, test_base_split, 4);
+  run_ntimes(NTIMES, test_base_split, 2);
+
+  run_ntimes(NTIMES, test_base_split, 7);
+
+  run_ntimes(NTIMES, test_base_split, 1);
+  run_ntimes(NTIMES, test_nest_split, 4, 2);
+  run_ntimes(NTIMES, test_nest_split, 2, 1);
+
+  run_ntimes(NTIMES, test_nest_split, 7, 2);
+  run_ntimes(NTIMES, test_nest_split, 1, 1);
+  run_ntimes(NTIMES, test_nest_split, 1, 4);
+
+  run_ntimes(NTIMES, test_nest2_split, 1, 1, 2);
+  run_ntimes(NTIMES, test_nest2_split, 1, 2, 1);
+  run_ntimes(NTIMES, test_nest2_split, 2, 2, 1);
+  run_ntimes(NTIMES, test_nest2_split, 2, 1, 1);
+  run_ntimes(NTIMES, test_nest2_split, 4, 2, 1);
+  run_ntimes(NTIMES, test_nest2_split, 4, 2, 2);
+  run_ntimes(NTIMES, test_nest2_split, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest2_split, 4, 2, 2);
+
+  printf("PASS\n");
+  return EXIT_SUCCESS;
+}
diff --git a/openmp/runtime/test/transform/tile/intfor.c b/openmp/runtime/test/transform/tile/intfor.c
new file mode 100644
index 000000000000..4a930eab6730
--- /dev/null
+++ b/openmp/runtime/test/transform/tile/intfor.c
@@ -0,0 +1,191 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+// TODO: The OpenMP specification explicitly does not define when and how often
+// expressions in the clause are evaluated. Currently Clang evaluates it again
+// whenever needed, but function calls in clauses are not common. A better
+// implementation would evaluate it just once and reuse the result.
+static int tilesize(int i) {
+  printf("tilesize(%d)\n", i);
+  return 3;
+}
+
+int main() {
+  printf("do\n");
+#pragma omp tile sizes(tilesize(1), tilesize(2))
+  for (int i = 7; i < 19; i += 3)
+    for (int j = 7; j < 20; j += 3)
+      printf("i=%d j=%d\n", i, j);
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK:      do
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=7 j=7
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=7 j=10
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=7 j=13
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=10 j=7
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=10 j=10
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=10 j=13
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=13 j=7
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=13 j=10
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=13 j=13
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=7 j=16
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=7 j=19
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=10 j=16
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=10 j=19
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=13 j=16
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=13 j=19
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=16 j=7
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=16 j=10
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=16 j=13
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=16 j=16
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: i=16 j=19
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(2)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: tilesize(1)
+// CHECK-NEXT: done
+\ No newline at end of file
diff --git a/openmp/runtime/test/transform/tile/negtile_intfor.c b/openmp/runtime/test/transform/tile/negtile_intfor.c
new file mode 100644
index 000000000000..8784d9e9fa61
--- /dev/null
+++ b/openmp/runtime/test/transform/tile/negtile_intfor.c
@@ -0,0 +1,44 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int tilesize = -2;
+
+int main() {
+  printf("do\n");
+#pragma omp tile sizes(tilesize, tilesize)
+  for (int i = 7; i < 19; i += 3)
+    for (int j = 7; j < 20; j += 3)
+      printf("i=%d j=%d\n", i, j);
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK:      do
+// CHECK-NEXT: i=7 j=7
+// CHECK-NEXT: i=7 j=10
+// CHECK-NEXT: i=7 j=13
+// CHECK-NEXT: i=7 j=16
+// CHECK-NEXT: i=7 j=19
+// CHECK-NEXT: i=10 j=7
+// CHECK-NEXT: i=10 j=10
+// CHECK-NEXT: i=10 j=13
+// CHECK-NEXT: i=10 j=16
+// CHECK-NEXT: i=10 j=19
+// CHECK-NEXT: i=13 j=7
+// CHECK-NEXT: i=13 j=10
+// CHECK-NEXT: i=13 j=13
+// CHECK-NEXT: i=13 j=16
+// CHECK-NEXT: i=13 j=19
+// CHECK-NEXT: i=16 j=7
+// CHECK-NEXT: i=16 j=10
+// CHECK-NEXT: i=16 j=13
+// CHECK-NEXT: i=16 j=16
+// CHECK-NEXT: i=16 j=19
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/tile/parallel-wsloop-collapse-intfor.cpp b/openmp/runtime/test/transform/tile/parallel-wsloop-collapse-intfor.cpp
new file mode 100644
index 000000000000..f4c2af610768
--- /dev/null
+++ b/openmp/runtime/test/transform/tile/parallel-wsloop-collapse-intfor.cpp
@@ -0,0 +1,100 @@
+// RUN: %libomp-cxx-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdio>
+
+int main() {
+  printf("do\n");
+#pragma omp parallel for collapse(3) num_threads(1)
+  for (int i = 0; i < 3; ++i)
+#pragma omp tile sizes(3, 3)
+    for (int j = 0; j < 4; ++j)
+      for (int k = 0; k < 5; ++k)
+        printf("i=%d j=%d k=%d\n", i, j, k);
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK:      do
+
+// Full tile
+// CHECK-NEXT: i=0 j=0 k=0
+// CHECK-NEXT: i=0 j=0 k=1
+// CHECK-NEXT: i=0 j=0 k=2
+// CHECK-NEXT: i=0 j=1 k=0
+// CHECK-NEXT: i=0 j=1 k=1
+// CHECK-NEXT: i=0 j=1 k=2
+// CHECK-NEXT: i=0 j=2 k=0
+// CHECK-NEXT: i=0 j=2 k=1
+// CHECK-NEXT: i=0 j=2 k=2
+
+// Partial tile
+// CHECK-NEXT: i=0 j=0 k=3
+// CHECK-NEXT: i=0 j=0 k=4
+// CHECK-NEXT: i=0 j=1 k=3
+// CHECK-NEXT: i=0 j=1 k=4
+// CHECK-NEXT: i=0 j=2 k=3
+// CHECK-NEXT: i=0 j=2 k=4
+
+// Partial tile
+// CHECK-NEXT: i=0 j=3 k=0
+// CHECK-NEXT: i=0 j=3 k=1
+// CHECK-NEXT: i=0 j=3 k=2
+
+// Partial tile
+// CHECK-NEXT: i=0 j=3 k=3
+// CHECK-NEXT: i=0 j=3 k=4
+
+// Full tile
+// CHECK-NEXT: i=1 j=0 k=0
+// CHECK-NEXT: i=1 j=0 k=1
+// CHECK-NEXT: i=1 j=0 k=2
+// CHECK-NEXT: i=1 j=1 k=0
+// CHECK-NEXT: i=1 j=1 k=1
+// CHECK-NEXT: i=1 j=1 k=2
+// CHECK-NEXT: i=1 j=2 k=0
+// CHECK-NEXT: i=1 j=2 k=1
+// CHECK-NEXT: i=1 j=2 k=2
+
+// Partial tiles
+// CHECK-NEXT: i=1 j=0 k=3
+// CHECK-NEXT: i=1 j=0 k=4
+// CHECK-NEXT: i=1 j=1 k=3
+// CHECK-NEXT: i=1 j=1 k=4
+// CHECK-NEXT: i=1 j=2 k=3
+// CHECK-NEXT: i=1 j=2 k=4
+// CHECK-NEXT: i=1 j=3 k=0
+// CHECK-NEXT: i=1 j=3 k=1
+// CHECK-NEXT: i=1 j=3 k=2
+// CHECK-NEXT: i=1 j=3 k=3
+// CHECK-NEXT: i=1 j=3 k=4
+
+// Full tile
+// CHECK-NEXT: i=2 j=0 k=0
+// CHECK-NEXT: i=2 j=0 k=1
+// CHECK-NEXT: i=2 j=0 k=2
+// CHECK-NEXT: i=2 j=1 k=0
+// CHECK-NEXT: i=2 j=1 k=1
+// CHECK-NEXT: i=2 j=1 k=2
+// CHECK-NEXT: i=2 j=2 k=0
+// CHECK-NEXT: i=2 j=2 k=1
+// CHECK-NEXT: i=2 j=2 k=2
+
+// Partial tiles
+// CHECK-NEXT: i=2 j=0 k=3
+// CHECK-NEXT: i=2 j=0 k=4
+// CHECK-NEXT: i=2 j=1 k=3
+// CHECK-NEXT: i=2 j=1 k=4
+// CHECK-NEXT: i=2 j=2 k=3
+// CHECK-NEXT: i=2 j=2 k=4
+// CHECK-NEXT: i=2 j=3 k=0
+// CHECK-NEXT: i=2 j=3 k=1
+// CHECK-NEXT: i=2 j=3 k=2
+// CHECK-NEXT: i=2 j=3 k=3
+// CHECK-NEXT: i=2 j=3 k=4
+// CHECK-NEXT: done
diff --git a/utils/bazel/WORKSPACE b/utils/bazel/WORKSPACE
index f4ae2c7ce456..298b64fd5629 100644
--- a/utils/bazel/WORKSPACE
+++ b/utils/bazel/WORKSPACE
@@ -56,6 +56,19 @@ maybe(
     name = "vulkan_sdk",
 )
 
+http_archive(
+    name = "build_bazel_apple_support",
+    sha256 = "c4bb2b7367c484382300aee75be598b92f847896fb31bbd22f3a2346adf66a80",
+    url = "https://github.com/bazelbuild/apple_support/releases/download/1.15.1/apple_support.1.15.1.tar.gz",
+)
+
+load(
+    "@build_bazel_apple_support//lib:repositories.bzl",
+    "apple_support_dependencies",
+)
+
+apple_support_dependencies()
+
 # llvm libc math tests reply on `mpfr`.
 # The availability of `mpfr` is controlled by a flag and can be either `disable`, `system` or `external`.
 # Continuous integration uses `system` to speed up the build process (see .bazelrc).
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index aa9f665c350a..6255ac998db1 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -913,17 +913,9 @@ fma_common_hdrs = [
     "src/__support/FPUtil/generic/FMA.h",
 ]
 
-fma_platform_hdrs = [
-    "src/__support/FPUtil/x86_64/FMA.h",
-    "src/__support/FPUtil/aarch64/FMA.h",
-]
-
 libc_support_library(
     name = "__support_fputil_fma",
     hdrs = fma_common_hdrs,
-    # These are conditionally included and will #error out if the platform
-    # doesn't support FMA, so they can't be compiled on their own.
-    textual_hdrs = fma_platform_hdrs,
     deps = [
         ":__support_cpp_bit",
         ":__support_cpp_type_traits",
@@ -1091,6 +1083,38 @@ libc_support_library(
 )
 
 libc_support_library(
+    name = "__support_threads_linux_futex_word_type",
+    hdrs = [
+        "src/__support/threads/linux/futex_word.h",
+    ],
+    target_compatible_with = select({
+        "@platforms//os:linux": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    deps = [
+        ":__support_osutil_syscall",
+    ],
+)
+
+libc_support_library(
+    name = "__support_threads_linux_futex_utils",
+    hdrs = [
+        "src/__support/threads/linux/futex_utils.h",
+    ],
+    target_compatible_with = select({
+        "@platforms//os:linux": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    deps = [
+        ":__support_cpp_atomic",
+        ":__support_cpp_optional",
+        ":__support_osutil_syscall",
+        ":__support_threads_linux_futex_word_type",
+        ":types_struct_timespec",
+    ],
+)
+
+libc_support_library(
     name = "__support_threads_mutex",
     hdrs = [
         "src/__support/threads/mutex.h",
@@ -1102,11 +1126,11 @@ libc_support_library(
     }),
     textual_hdrs = [
         "src/__support/threads/linux/mutex.h",
-        "src/__support/threads/linux/futex_word.h",
     ],
     deps = [
         ":__support_cpp_atomic",
         ":__support_osutil_syscall",
+        ":__support_threads_linux_futex_utils",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
index b7b52f3ef59c..c6fc4e08aa72 100644
--- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
@@ -5,6 +5,7 @@
 load("@bazel_skylib//lib:selects.bzl", "selects")
 load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
 load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+load("@build_bazel_apple_support//rules:apple_genrule.bzl", "apple_genrule")
 load("//:vars.bzl", "LLVM_VERSION_MAJOR", "LLVM_VERSION_MINOR", "LLVM_VERSION_PATCH", "LLVM_VERSION_SUFFIX", "PACKAGE_VERSION")
 load("//lldb/source/Plugins:plugin_config.bzl", "DEFAULT_PLUGINS", "DEFAULT_SCRIPT_PLUGINS", "OBJCPP_COPTS")
 load("//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
@@ -183,18 +184,17 @@ cc_binary(
     ],
 )
 
-gentbl_cc_library(
+py_binary(
+    name = "generate-sbapi-dwarf-enum",
+    srcs = ["scripts/generate-sbapi-dwarf-enum.py"],
+)
+
+genrule(
     name = "lldb-sbapi-dwarf-enums",
-    strip_include_prefix = "include",
-    tbl_outs = [
-        (
-            ["-gen-lldb-sbapi-dwarf-enum"],
-            "include/lldb/API/SBLanguages.h",
-        ),
-    ],
-    tblgen = ":lldb-tblgen",
-    td_file = "//llvm:include/llvm/BinaryFormat/Dwarf.def",
-    deps = [],
+    srcs = ["//llvm:include/llvm/BinaryFormat/Dwarf.def"],
+    outs = ["include/lldb/API/SBLanguages.h"],
+    cmd = "$(location :generate-sbapi-dwarf-enum) $(location //llvm:include/llvm/BinaryFormat/Dwarf.def) --output $@",
+    tools = [":generate-sbapi-dwarf-enum"],
 )
 
 cc_library(
@@ -203,10 +203,9 @@ cc_library(
         "source/API/**/*.cpp",
         "source/API/**/*.h",
     ]),
-    hdrs = glob(["include/lldb/API/**/*.h"]),
+    hdrs = glob(["include/lldb/API/**/*.h"]) + [":lldb-sbapi-dwarf-enums"],
     strip_include_prefix = "include",
     deps = [
-        ":lldb-sbapi-dwarf-enums",
         ":Breakpoint",
         ":Commands",
         ":Core",
@@ -284,10 +283,9 @@ cc_library(
 cc_library(
     name = "Expression",
     srcs = glob(["source/Expression/**/*.cpp"]),
-    hdrs = glob(["include/lldb/Expression/**/*.h"]),
+    hdrs = glob(["include/lldb/Expression/**/*.h"]) + [":lldb-sbapi-dwarf-enums"],
     strip_include_prefix = "include",
     deps = [
-        ":lldb-sbapi-dwarf-enums",
         ":Core",
         ":Headers",
         ":Host",
@@ -361,12 +359,9 @@ cc_library(
 
 cc_library(
     name = "ExpressionHeaders",
-    hdrs = glob(["include/lldb/Expression/**/*.h"]),
+    hdrs = glob(["include/lldb/Expression/**/*.h"]) + [":lldb-sbapi-dwarf-enums"],
     strip_include_prefix = "include",
-    deps = [
-        ":lldb-sbapi-dwarf-enums",
-        "//llvm:ExecutionEngine"
-    ],
+    deps = ["//llvm:ExecutionEngine"],
 )
 
 cc_library(
@@ -500,8 +495,12 @@ objc_library(
         "//conditions:default": ["@platforms//:incompatible"],
     }),
     deps = [
+        ":Headers",
         ":HostMacOSXHeaders",
         ":HostMacOSXPrivateHeaders",
+        ":Utility",
+        "//llvm:Support",
+        "//llvm:TargetParser",
     ],
 )
 
@@ -567,7 +566,10 @@ cc_library(
         "//llvm:TargetParser",
         "//llvm:config",
     ] + select({
-        "@platforms//os:macos": [":HostMacOSXObjCXX"],
+        "@platforms//os:macos": [
+            ":HostMacOSXObjCXX",
+            ":HostMacOSXPrivateHeaders",
+        ],
         "//conditions:default": [],
     }),
 )
@@ -727,37 +729,64 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "liblldb.static",
-    deps = [
-        ":API",
-        ":Interpreter",
-    ],
+genrule(
+    name = "gen_exports_file_linux",
+    srcs = ["//lldb:source/API/liblldb-private.exports"],
+    outs = ["exports_linux.txt"],
+    cmd = """
+cat > $(OUTS) <<EOF
+{
+  global:
+    $$(sed 's/$$/;/g' $(SRCS))
+};
+EOF
+""",
+)
+
+genrule(
+    name = "gen_exports_file_macos",
+    srcs = ["//lldb:source/API/liblldb-private.exports"],
+    outs = ["exports_macos.txt"],
+    cmd = "sed 's/^/_/g' $(SRCS) > $(OUTS)",
 )
 
-cc_shared_library(
-    name = "liblldb",
-    # TODO: Remove once fixed https://github.com/bazelbuild/bazel/issues/21893
+# Create a shared library using linkshared=True for liblldb. This uses
+# cc_binary instead of cc_shared_library since the latter expects you to
+# re-export all transitive dependencies vs them being relinked into other
+# binaries.
+cc_binary(
+    name = "lldb{}".format(PACKAGE_VERSION),
     additional_linker_inputs = select({
+        "@platforms//os:linux": [
+            ":gen_exports_file_linux",
+        ],
         "@platforms//os:macos": [
-            ":HostMacOSXObjCXX",
-            "//lldb/source/Plugins:PluginPlatformMacOSXObjCXX",
+            ":gen_exports_file_macos",
         ],
         "//conditions:default": [],
     }),
-    shared_lib_name = select({
-        "@platforms//os:macos": "liblldb{}.dylib".format(PACKAGE_VERSION),
-        "@platforms//os:linux": "liblldb{}.so".format(PACKAGE_VERSION),
-    }),
-    # TODO: Remove once fixed https://github.com/bazelbuild/bazel/issues/21893
-    user_link_flags = select({
+    linkopts = select({
+        "@platforms//os:linux": [
+            "-Wl,--export-dynamic-symbol-list=$(location :gen_exports_file_linux)",
+        ],
         "@platforms//os:macos": [
-            "$(location :HostMacOSXObjCXX)",
-            "$(location //lldb/source/Plugins:PluginPlatformMacOSXObjCXX)",
+            "-Wl,-exported_symbols_list,$(location :gen_exports_file_macos)",
         ],
         "//conditions:default": [],
     }),
-    deps = [":liblldb.static"],
+    linkshared = True,
+    deps = [
+        ":API",
+        ":Interpreter",
+    ],
+)
+
+# cc_binary targets using linkshared=True to build a shared library cannot be
+# imported directly and instead need to be referenced indirectly through
+# cc_import
+cc_import(
+    name = "liblldb.wrapper",
+    shared_library = "lldb{}".format(PACKAGE_VERSION),
 )
 
 gentbl_cc_library(
@@ -772,28 +801,24 @@ gentbl_cc_library(
     deps = ["//llvm:OptParserTdFiles"],
 )
 
+alias(
+    name = "gdb-server",
+    actual = select({
+        "@platforms//os:macos": ":debugserver",
+        "//conditions:default": ":lldb-server",
+    }),
+)
+
 cc_binary(
     name = "lldb",
     srcs = glob([
         "tools/driver/*.cpp",
         "tools/driver/*.h",
     ]),
-    data = [
-        ":lldb-argdumper",
-    ] + select({
-        "@platforms//os:macos": [
-            ":debugserver",
-            ":lldb-server",
-        ],
-        "@platforms//os:linux": [
-            ":lldb-server",
-        ],
-        "//conditions:default": [],
-    }),
     deps = [
         ":APIHeaders",
         ":Host",
-        ":liblldb.static",
+        ":liblldb.wrapper",
         ":lldb_options_inc_gen",
         "//llvm:Option",
         "//llvm:Support",
@@ -868,7 +893,7 @@ expand_template(
 )
 
 cc_binary(
-    name = "debugserver",
+    name = "debugserver_unsigned",
     srcs = [
         "tools/debugserver/source/debugserver.cpp",
         ":debugserver_version_gen",
@@ -879,7 +904,26 @@ cc_binary(
         "@platforms//os:macos": [],
         "//conditions:default": ["@platforms//:incompatible"],
     }),
-    deps = [":DebugServerCommon"],
+    deps = [
+        ":DebugServerCommon",
+        ":DebugServerCommonHeaders",
+        ":DebugServerCommonMacOSXHeaders",
+    ],
+)
+
+apple_genrule(
+    name = "debugserver_signed",
+    srcs = [":debugserver_unsigned"],
+    outs = ["debugserver"],
+    cmd = "cp $(SRCS) $(OUTS) && xcrun codesign -f -s - --entitlements $(location tools/debugserver/resources/debugserver-macosx-entitlements.plist) $(OUTS)",
+    tags = ["nobuildkite"],
+    target_compatible_with = select({
+        "@platforms//os:macos": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    tools = [
+        "tools/debugserver/resources/debugserver-macosx-entitlements.plist",
+    ],
 )
 
 cc_binary(
diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
index d705af9167d8..6c45cdf25cac 100644
--- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
@@ -250,7 +250,12 @@ objc_library(
         "@platforms//os:macos": [],
         "//conditions:default": ["@platforms//:incompatible"],
     }),
-    deps = [":PluginPlatformMacOSXObjCXXHeaders"],
+    deps = [
+        ":PluginPlatformMacOSXObjCXXHeaders",
+        "//lldb:Host",
+        "//lldb:HostMacOSXPrivateHeaders",
+        "//llvm:Support",
+    ],
 )
 
 cc_library(
@@ -265,6 +270,7 @@ cc_library(
            }),
     hdrs = glob(["Platform/MacOSX/*.h"]),
     include_prefix = "Plugins",
+    tags = ["nobuildkite"],
     deps = [
         ":PlatformMacOSXProperties",
         ":PluginDynamicLoaderDarwinKernelHeaders",
@@ -282,7 +288,11 @@ cc_library(
         "//llvm:Support",
         "//llvm:TargetParser",
     ] + select({
-        "@platforms//os:macos": [":PluginPlatformMacOSXObjCXX"],
+        "@platforms//os:macos": [
+            ":PluginPlatformMacOSXObjCXX",
+            ":PluginPlatformMacOSXObjCXXHeaders",
+            "//lldb:HostMacOSXPrivateHeaders",
+        ],
         "//conditions:default": [],
     }),
 )
@@ -1255,6 +1265,7 @@ cc_library(
     name = "PluginDynamicLoaderDarwinKernel",
     srcs = glob(["DynamicLoader/Darwin-Kernel/*.cpp"]),
     include_prefix = "Plugins",
+    tags = ["nobuildkite"],
     deps = [
         ":DynamicLoaderDarwinKernelProperties",
         ":PluginDynamicLoaderDarwinKernelHeaders",
@@ -1748,6 +1759,10 @@ cc_library(
         "//lldb:Host",
         "//lldb:HostMacOSXPrivateHeaders",
         "//lldb:Symbol",
+        "//lldb:SymbolHeaders",
+        "//lldb:TargetHeaders",
+        "//lldb:Utility",
+        "//llvm:Support",
     ],
 )
 
@@ -2132,6 +2147,7 @@ cc_library(
     srcs = glob(["Process/mach-core/*.cpp"]),
     hdrs = glob(["Process/mach-core/*.h"]),
     include_prefix = "Plugins",
+    tags = ["nobuildkite"],
     deps = [
         ":PluginDynamicLoaderDarwinKernelHeaders",
         ":PluginDynamicLoaderMacOSXDYLD",
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index c159204cede7..df5cd276b12f 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -636,8 +636,8 @@ cc_binary(
 cc_binary(
     name = "llvm-min-tblgen",
     srcs = [
-        "utils/TableGen/Attributes.cpp",
         "utils/TableGen/ARMTargetDefEmitter.cpp",
+        "utils/TableGen/Attributes.cpp",
         "utils/TableGen/Basic/CodeGenIntrinsics.cpp",
         "utils/TableGen/Basic/CodeGenIntrinsics.h",
         "utils/TableGen/Basic/SDNodeProperties.cpp",
@@ -2421,6 +2421,7 @@ gentbl(
         strip_include_prefix = "lib/Target/" + target["name"],
         deps = [
             ":BinaryFormat",
+            ":CodeGen",
             ":CodeGenTypes",
             ":Core",
             ":DebugInfoCodeView",
@@ -4049,7 +4050,7 @@ cc_binary(
 
 cc_binary(
     name = "llvm-mca",
-    srcs =[
+    srcs = [
         "tools/llvm-mca/llvm-mca.cpp",
     ],
     copts = llvm_copts,
diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
index 9be26ab551b0..21f0c7092f32 100644
--- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
@@ -362,6 +362,7 @@ cc_test(
     ) + [
         "Support/KnownBitsTest.h",
     ],
+    features = ["-layering_check"],
     shard_count = 20,
     deps = [
         "//llvm:Analysis",
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index acd2d3a14d74..6304b7b548d8 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -397,6 +397,7 @@ mlir_c_api_cc_library(
     name = "CAPIIR",
     srcs = [
         "lib/CAPI/Dialect/Func.cpp",
+        "lib/CAPI/Dialect/IRDL.cpp",
         "lib/CAPI/IR/AffineExpr.cpp",
         "lib/CAPI/IR/AffineMap.cpp",
         "lib/CAPI/IR/BuiltinAttributes.cpp",
@@ -415,6 +416,7 @@ mlir_c_api_cc_library(
         "include/mlir-c/BuiltinTypes.h",
         "include/mlir-c/Diagnostics.h",
         "include/mlir-c/Dialect/Func.h",
+        "include/mlir-c/Dialect/IRDL.h",
         "include/mlir-c/ExecutionEngine.h",
         "include/mlir-c/IR.h",
         "include/mlir-c/IntegerSet.h",
@@ -446,6 +448,7 @@ mlir_c_api_cc_library(
         ":AsmParser",
         ":ConversionPassIncGen",
         ":FuncDialect",
+        ":IRDLDialect",
         ":InferTypeOpInterface",
         ":Parser",
     ],
@@ -3066,6 +3069,7 @@ cc_library(
         ":ArithDialect",
         ":BufferizationInterfaces",
         ":BytecodeOpInterface",
+        ":ComplexDialect",
         ":DialectUtils",
         ":IR",
         ":InferTypeOpInterface",
@@ -4847,6 +4851,7 @@ cc_library(
         ":SparseTensorTransformOps",
         ":TensorTransformOps",
         ":TransformDebugExtension",
+        ":TransformIRDLExtension",
         ":TransformLoopExtension",
         ":TransformPDLExtension",
         ":UBToLLVM",
@@ -12072,6 +12077,45 @@ cc_library(
     ],
 )
 
+gentbl_cc_library(
+    name = "TransformIRDLExtensionOpsIncGen",
+    tbl_outs = [
+        (
+            [
+                "-gen-op-decls",
+            ],
+            "include/mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.h.inc",
+        ),
+        (
+            [
+                "-gen-op-defs",
+            ],
+            "include/mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.cpp.inc",
+        ),
+    ],
+    tblgen = ":mlir-tblgen",
+    td_file = "include/mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.td",
+    deps = [":TransformPDLExtensionTdFiles"],
+)
+
+cc_library(
+    name = "TransformIRDLExtension",
+    srcs = glob(["lib/Dialect/Transform/IRDLExtension/*.cpp"]),
+    hdrs = glob(["include/mlir/Dialect/Transform/IRDLExtension/*.h"]),
+    deps = [
+        ":IR",
+        ":IRDLDialect",
+        ":IRDLInterfacesIncGen",
+        ":Rewrite",
+        ":SideEffectInterfaces",
+        ":Support",
+        ":TransformDialect",
+        ":TransformDialectInterfaces",
+        ":TransformIRDLExtensionOpsIncGen",
+        "//llvm:Support",
+    ],
+)
+
 td_library(
     name = "TransformDebugExtensionTdFiles",
     srcs = glob(["include/mlir/Dialect/Transform/DebugExtension/*.td"]),
@@ -12904,6 +12948,7 @@ cc_library(
         ":ArithUtils",
         ":MemRefDialect",
         ":VectorDialect",
+        "//llvm:Support",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 0ebfcbe284bd..65b31dc97e2d 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -121,6 +121,7 @@ td_library(
         "//mlir:InferIntRangeInterfaceTdFiles",
         "//mlir:InferTypeOpInterfaceTdFiles",
         "//mlir:LinalgStructuredOpsTdFiles",
+        "//mlir:MemorySlotInterfacesTdFiles",
         "//mlir:OpBaseTdFiles",
         "//mlir:SideEffectInterfacesTdFiles",
     ],
@@ -418,6 +419,7 @@ cc_library(
         "//mlir:LLVMIRToLLVMTranslation",
         "//mlir:LinalgDialect",
         "//mlir:LoopLikeInterface",
+        "//mlir:MemorySlotInterfaces",
         "//mlir:Pass",
         "//mlir:Reducer",
         "//mlir:SideEffectInterfaces",
author	Paul Kirth <paulkirth@google.com>	2024-05-13 20:52:36 +0000
committer	Paul Kirth <paulkirth@google.com>	2024-05-13 20:52:36 +0000
commit	93b0bf6ac0173125725325c778e66c7ba93755c3 (patch)
tree	09324f9214b3a4184beba7c76b0acb3f3ab0317a
parent	964058caecc40acc79b80f3111113bd089a07130 (diff)
parent	0dd2b7cbe5750f5a0ca8285ea8faf42afe3c2484 (diff)